Commit 7bfac9ecf0585962fe13584f5cf526d8c8e76f17

Authored by Miklos Szeredi
Committed by Linus Torvalds
1 parent 612392307c

splice: fix deadlock in splicing to file

There's a possible deadlock in generic_file_splice_write(),
splice_from_pipe() and ocfs2_file_splice_write():

 - task A calls generic_file_splice_write()
 - this calls inode_double_lock(), which locks i_mutex on both
   pipe->inode and target inode
 - ordering depends on inode pointers, can happen that pipe->inode is
   locked first
 - __splice_from_pipe() needs more data, calls pipe_wait()
 - this releases lock on pipe->inode, goes to interruptible sleep
 - task B calls generic_file_splice_write(), similarly to the first
 - this locks pipe->inode, then tries to lock inode, but that is
   already held by task A
 - task A is woken by a signal; it tries to re-acquire pipe->inode, but
   blocks, as that lock is already held by task B
 - ABBA deadlock

Fix this by explicitly ordering locks: the outer lock must be on
target inode and the inner lock (which is later unlocked and relocked)
must be on pipe->inode.  This is OK, pipe inodes and target inodes
form two nonoverlapping sets, generic_file_splice_write() and friends
are not called with a target which is a pipe.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 26 additions and 7 deletions Inline Diff

1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 #include <linux/falloc.h> 37 #include <linux/falloc.h>
38 #include <linux/quotaops.h> 38 #include <linux/quotaops.h>
39 39
40 #define MLOG_MASK_PREFIX ML_INODE 40 #define MLOG_MASK_PREFIX ML_INODE
41 #include <cluster/masklog.h> 41 #include <cluster/masklog.h>
42 42
43 #include "ocfs2.h" 43 #include "ocfs2.h"
44 44
45 #include "alloc.h" 45 #include "alloc.h"
46 #include "aops.h" 46 #include "aops.h"
47 #include "dir.h" 47 #include "dir.h"
48 #include "dlmglue.h" 48 #include "dlmglue.h"
49 #include "extent_map.h" 49 #include "extent_map.h"
50 #include "file.h" 50 #include "file.h"
51 #include "sysfile.h" 51 #include "sysfile.h"
52 #include "inode.h" 52 #include "inode.h"
53 #include "ioctl.h" 53 #include "ioctl.h"
54 #include "journal.h" 54 #include "journal.h"
55 #include "locks.h" 55 #include "locks.h"
56 #include "mmap.h" 56 #include "mmap.h"
57 #include "suballoc.h" 57 #include "suballoc.h"
58 #include "super.h" 58 #include "super.h"
59 #include "xattr.h" 59 #include "xattr.h"
60 #include "acl.h" 60 #include "acl.h"
61 #include "quota.h" 61 #include "quota.h"
62 62
63 #include "buffer_head_io.h" 63 #include "buffer_head_io.h"
64 64
65 static int ocfs2_sync_inode(struct inode *inode) 65 static int ocfs2_sync_inode(struct inode *inode)
66 { 66 {
67 filemap_fdatawrite(inode->i_mapping); 67 filemap_fdatawrite(inode->i_mapping);
68 return sync_mapping_buffers(inode->i_mapping); 68 return sync_mapping_buffers(inode->i_mapping);
69 } 69 }
70 70
71 static int ocfs2_init_file_private(struct inode *inode, struct file *file) 71 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
72 { 72 {
73 struct ocfs2_file_private *fp; 73 struct ocfs2_file_private *fp;
74 74
75 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); 75 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
76 if (!fp) 76 if (!fp)
77 return -ENOMEM; 77 return -ENOMEM;
78 78
79 fp->fp_file = file; 79 fp->fp_file = file;
80 mutex_init(&fp->fp_mutex); 80 mutex_init(&fp->fp_mutex);
81 ocfs2_file_lock_res_init(&fp->fp_flock, fp); 81 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
82 file->private_data = fp; 82 file->private_data = fp;
83 83
84 return 0; 84 return 0;
85 } 85 }
86 86
87 static void ocfs2_free_file_private(struct inode *inode, struct file *file) 87 static void ocfs2_free_file_private(struct inode *inode, struct file *file)
88 { 88 {
89 struct ocfs2_file_private *fp = file->private_data; 89 struct ocfs2_file_private *fp = file->private_data;
90 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 90 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
91 91
92 if (fp) { 92 if (fp) {
93 ocfs2_simple_drop_lockres(osb, &fp->fp_flock); 93 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
94 ocfs2_lock_res_free(&fp->fp_flock); 94 ocfs2_lock_res_free(&fp->fp_flock);
95 kfree(fp); 95 kfree(fp);
96 file->private_data = NULL; 96 file->private_data = NULL;
97 } 97 }
98 } 98 }
99 99
100 static int ocfs2_file_open(struct inode *inode, struct file *file) 100 static int ocfs2_file_open(struct inode *inode, struct file *file)
101 { 101 {
102 int status; 102 int status;
103 int mode = file->f_flags; 103 int mode = file->f_flags;
104 struct ocfs2_inode_info *oi = OCFS2_I(inode); 104 struct ocfs2_inode_info *oi = OCFS2_I(inode);
105 105
106 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 106 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
107 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 107 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
108 108
109 spin_lock(&oi->ip_lock); 109 spin_lock(&oi->ip_lock);
110 110
111 /* Check that the inode hasn't been wiped from disk by another 111 /* Check that the inode hasn't been wiped from disk by another
112 * node. If it hasn't then we're safe as long as we hold the 112 * node. If it hasn't then we're safe as long as we hold the
113 * spin lock until our increment of open count. */ 113 * spin lock until our increment of open count. */
114 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 114 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
115 spin_unlock(&oi->ip_lock); 115 spin_unlock(&oi->ip_lock);
116 116
117 status = -ENOENT; 117 status = -ENOENT;
118 goto leave; 118 goto leave;
119 } 119 }
120 120
121 if (mode & O_DIRECT) 121 if (mode & O_DIRECT)
122 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 122 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
123 123
124 oi->ip_open_count++; 124 oi->ip_open_count++;
125 spin_unlock(&oi->ip_lock); 125 spin_unlock(&oi->ip_lock);
126 126
127 status = ocfs2_init_file_private(inode, file); 127 status = ocfs2_init_file_private(inode, file);
128 if (status) { 128 if (status) {
129 /* 129 /*
130 * We want to set open count back if we're failing the 130 * We want to set open count back if we're failing the
131 * open. 131 * open.
132 */ 132 */
133 spin_lock(&oi->ip_lock); 133 spin_lock(&oi->ip_lock);
134 oi->ip_open_count--; 134 oi->ip_open_count--;
135 spin_unlock(&oi->ip_lock); 135 spin_unlock(&oi->ip_lock);
136 } 136 }
137 137
138 leave: 138 leave:
139 mlog_exit(status); 139 mlog_exit(status);
140 return status; 140 return status;
141 } 141 }
142 142
143 static int ocfs2_file_release(struct inode *inode, struct file *file) 143 static int ocfs2_file_release(struct inode *inode, struct file *file)
144 { 144 {
145 struct ocfs2_inode_info *oi = OCFS2_I(inode); 145 struct ocfs2_inode_info *oi = OCFS2_I(inode);
146 146
147 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 147 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
148 file->f_path.dentry->d_name.len, 148 file->f_path.dentry->d_name.len,
149 file->f_path.dentry->d_name.name); 149 file->f_path.dentry->d_name.name);
150 150
151 spin_lock(&oi->ip_lock); 151 spin_lock(&oi->ip_lock);
152 if (!--oi->ip_open_count) 152 if (!--oi->ip_open_count)
153 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 153 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
154 spin_unlock(&oi->ip_lock); 154 spin_unlock(&oi->ip_lock);
155 155
156 ocfs2_free_file_private(inode, file); 156 ocfs2_free_file_private(inode, file);
157 157
158 mlog_exit(0); 158 mlog_exit(0);
159 159
160 return 0; 160 return 0;
161 } 161 }
162 162
/* ->open() for directories: only needs the per-open private data. */
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	return ocfs2_init_file_private(inode, file);
}
167 167
/* ->release() for directories: free the per-open private data. */
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);
	return 0;
}
173 173
174 static int ocfs2_sync_file(struct file *file, 174 static int ocfs2_sync_file(struct file *file,
175 struct dentry *dentry, 175 struct dentry *dentry,
176 int datasync) 176 int datasync)
177 { 177 {
178 int err = 0; 178 int err = 0;
179 journal_t *journal; 179 journal_t *journal;
180 struct inode *inode = dentry->d_inode; 180 struct inode *inode = dentry->d_inode;
181 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 181 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
182 182
183 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 183 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
184 dentry->d_name.len, dentry->d_name.name); 184 dentry->d_name.len, dentry->d_name.name);
185 185
186 err = ocfs2_sync_inode(dentry->d_inode); 186 err = ocfs2_sync_inode(dentry->d_inode);
187 if (err) 187 if (err)
188 goto bail; 188 goto bail;
189 189
190 journal = osb->journal->j_journal; 190 journal = osb->journal->j_journal;
191 err = jbd2_journal_force_commit(journal); 191 err = jbd2_journal_force_commit(journal);
192 192
193 bail: 193 bail:
194 mlog_exit(err); 194 mlog_exit(err);
195 195
196 return (err < 0) ? -EIO : 0; 196 return (err < 0) ? -EIO : 0;
197 } 197 }
198 198
199 int ocfs2_should_update_atime(struct inode *inode, 199 int ocfs2_should_update_atime(struct inode *inode,
200 struct vfsmount *vfsmnt) 200 struct vfsmount *vfsmnt)
201 { 201 {
202 struct timespec now; 202 struct timespec now;
203 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 203 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
204 204
205 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 205 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
206 return 0; 206 return 0;
207 207
208 if ((inode->i_flags & S_NOATIME) || 208 if ((inode->i_flags & S_NOATIME) ||
209 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 209 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
210 return 0; 210 return 0;
211 211
212 /* 212 /*
213 * We can be called with no vfsmnt structure - NFSD will 213 * We can be called with no vfsmnt structure - NFSD will
214 * sometimes do this. 214 * sometimes do this.
215 * 215 *
216 * Note that our action here is different than touch_atime() - 216 * Note that our action here is different than touch_atime() -
217 * if we can't tell whether this is a noatime mount, then we 217 * if we can't tell whether this is a noatime mount, then we
218 * don't know whether to trust the value of s_atime_quantum. 218 * don't know whether to trust the value of s_atime_quantum.
219 */ 219 */
220 if (vfsmnt == NULL) 220 if (vfsmnt == NULL)
221 return 0; 221 return 0;
222 222
223 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 223 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
224 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 224 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
225 return 0; 225 return 0;
226 226
227 if (vfsmnt->mnt_flags & MNT_RELATIME) { 227 if (vfsmnt->mnt_flags & MNT_RELATIME) {
228 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 228 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
229 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 229 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
230 return 1; 230 return 1;
231 231
232 return 0; 232 return 0;
233 } 233 }
234 234
235 now = CURRENT_TIME; 235 now = CURRENT_TIME;
236 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 236 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
237 return 0; 237 return 0;
238 else 238 else
239 return 1; 239 return 1;
240 } 240 }
241 241
242 int ocfs2_update_inode_atime(struct inode *inode, 242 int ocfs2_update_inode_atime(struct inode *inode,
243 struct buffer_head *bh) 243 struct buffer_head *bh)
244 { 244 {
245 int ret; 245 int ret;
246 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 246 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
247 handle_t *handle; 247 handle_t *handle;
248 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 248 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
249 249
250 mlog_entry_void(); 250 mlog_entry_void();
251 251
252 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 252 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
253 if (IS_ERR(handle)) { 253 if (IS_ERR(handle)) {
254 ret = PTR_ERR(handle); 254 ret = PTR_ERR(handle);
255 mlog_errno(ret); 255 mlog_errno(ret);
256 goto out; 256 goto out;
257 } 257 }
258 258
259 ret = ocfs2_journal_access_di(handle, inode, bh, 259 ret = ocfs2_journal_access_di(handle, inode, bh,
260 OCFS2_JOURNAL_ACCESS_WRITE); 260 OCFS2_JOURNAL_ACCESS_WRITE);
261 if (ret) { 261 if (ret) {
262 mlog_errno(ret); 262 mlog_errno(ret);
263 goto out_commit; 263 goto out_commit;
264 } 264 }
265 265
266 /* 266 /*
267 * Don't use ocfs2_mark_inode_dirty() here as we don't always 267 * Don't use ocfs2_mark_inode_dirty() here as we don't always
268 * have i_mutex to guard against concurrent changes to other 268 * have i_mutex to guard against concurrent changes to other
269 * inode fields. 269 * inode fields.
270 */ 270 */
271 inode->i_atime = CURRENT_TIME; 271 inode->i_atime = CURRENT_TIME;
272 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 272 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
273 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 273 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
274 274
275 ret = ocfs2_journal_dirty(handle, bh); 275 ret = ocfs2_journal_dirty(handle, bh);
276 if (ret < 0) 276 if (ret < 0)
277 mlog_errno(ret); 277 mlog_errno(ret);
278 278
279 out_commit: 279 out_commit:
280 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 280 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
281 out: 281 out:
282 mlog_exit(ret); 282 mlog_exit(ret);
283 return ret; 283 return ret;
284 } 284 }
285 285
286 static int ocfs2_set_inode_size(handle_t *handle, 286 static int ocfs2_set_inode_size(handle_t *handle,
287 struct inode *inode, 287 struct inode *inode,
288 struct buffer_head *fe_bh, 288 struct buffer_head *fe_bh,
289 u64 new_i_size) 289 u64 new_i_size)
290 { 290 {
291 int status; 291 int status;
292 292
293 mlog_entry_void(); 293 mlog_entry_void();
294 i_size_write(inode, new_i_size); 294 i_size_write(inode, new_i_size);
295 inode->i_blocks = ocfs2_inode_sector_count(inode); 295 inode->i_blocks = ocfs2_inode_sector_count(inode);
296 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 296 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
297 297
298 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 298 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
299 if (status < 0) { 299 if (status < 0) {
300 mlog_errno(status); 300 mlog_errno(status);
301 goto bail; 301 goto bail;
302 } 302 }
303 303
304 bail: 304 bail:
305 mlog_exit(status); 305 mlog_exit(status);
306 return status; 306 return status;
307 } 307 }
308 308
309 int ocfs2_simple_size_update(struct inode *inode, 309 int ocfs2_simple_size_update(struct inode *inode,
310 struct buffer_head *di_bh, 310 struct buffer_head *di_bh,
311 u64 new_i_size) 311 u64 new_i_size)
312 { 312 {
313 int ret; 313 int ret;
314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
315 handle_t *handle = NULL; 315 handle_t *handle = NULL;
316 316
317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
318 if (IS_ERR(handle)) { 318 if (IS_ERR(handle)) {
319 ret = PTR_ERR(handle); 319 ret = PTR_ERR(handle);
320 mlog_errno(ret); 320 mlog_errno(ret);
321 goto out; 321 goto out;
322 } 322 }
323 323
324 ret = ocfs2_set_inode_size(handle, inode, di_bh, 324 ret = ocfs2_set_inode_size(handle, inode, di_bh,
325 new_i_size); 325 new_i_size);
326 if (ret < 0) 326 if (ret < 0)
327 mlog_errno(ret); 327 mlog_errno(ret);
328 328
329 ocfs2_commit_trans(osb, handle); 329 ocfs2_commit_trans(osb, handle);
330 out: 330 out:
331 return ret; 331 return ret;
332 } 332 }
333 333
334 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 334 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
335 struct inode *inode, 335 struct inode *inode,
336 struct buffer_head *fe_bh, 336 struct buffer_head *fe_bh,
337 u64 new_i_size) 337 u64 new_i_size)
338 { 338 {
339 int status; 339 int status;
340 handle_t *handle; 340 handle_t *handle;
341 struct ocfs2_dinode *di; 341 struct ocfs2_dinode *di;
342 u64 cluster_bytes; 342 u64 cluster_bytes;
343 343
344 mlog_entry_void(); 344 mlog_entry_void();
345 345
346 /* TODO: This needs to actually orphan the inode in this 346 /* TODO: This needs to actually orphan the inode in this
347 * transaction. */ 347 * transaction. */
348 348
349 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 349 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
350 if (IS_ERR(handle)) { 350 if (IS_ERR(handle)) {
351 status = PTR_ERR(handle); 351 status = PTR_ERR(handle);
352 mlog_errno(status); 352 mlog_errno(status);
353 goto out; 353 goto out;
354 } 354 }
355 355
356 status = ocfs2_journal_access_di(handle, inode, fe_bh, 356 status = ocfs2_journal_access_di(handle, inode, fe_bh,
357 OCFS2_JOURNAL_ACCESS_WRITE); 357 OCFS2_JOURNAL_ACCESS_WRITE);
358 if (status < 0) { 358 if (status < 0) {
359 mlog_errno(status); 359 mlog_errno(status);
360 goto out_commit; 360 goto out_commit;
361 } 361 }
362 362
363 /* 363 /*
364 * Do this before setting i_size. 364 * Do this before setting i_size.
365 */ 365 */
366 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 366 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
367 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 367 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
368 cluster_bytes); 368 cluster_bytes);
369 if (status) { 369 if (status) {
370 mlog_errno(status); 370 mlog_errno(status);
371 goto out_commit; 371 goto out_commit;
372 } 372 }
373 373
374 i_size_write(inode, new_i_size); 374 i_size_write(inode, new_i_size);
375 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 375 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
376 376
377 di = (struct ocfs2_dinode *) fe_bh->b_data; 377 di = (struct ocfs2_dinode *) fe_bh->b_data;
378 di->i_size = cpu_to_le64(new_i_size); 378 di->i_size = cpu_to_le64(new_i_size);
379 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 379 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
380 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 380 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
381 381
382 status = ocfs2_journal_dirty(handle, fe_bh); 382 status = ocfs2_journal_dirty(handle, fe_bh);
383 if (status < 0) 383 if (status < 0)
384 mlog_errno(status); 384 mlog_errno(status);
385 385
386 out_commit: 386 out_commit:
387 ocfs2_commit_trans(osb, handle); 387 ocfs2_commit_trans(osb, handle);
388 out: 388 out:
389 389
390 mlog_exit(status); 390 mlog_exit(status);
391 return status; 391 return status;
392 } 392 }
393 393
394 static int ocfs2_truncate_file(struct inode *inode, 394 static int ocfs2_truncate_file(struct inode *inode,
395 struct buffer_head *di_bh, 395 struct buffer_head *di_bh,
396 u64 new_i_size) 396 u64 new_i_size)
397 { 397 {
398 int status = 0; 398 int status = 0;
399 struct ocfs2_dinode *fe = NULL; 399 struct ocfs2_dinode *fe = NULL;
400 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 400 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
401 struct ocfs2_truncate_context *tc = NULL; 401 struct ocfs2_truncate_context *tc = NULL;
402 402
403 mlog_entry("(inode = %llu, new_i_size = %llu\n", 403 mlog_entry("(inode = %llu, new_i_size = %llu\n",
404 (unsigned long long)OCFS2_I(inode)->ip_blkno, 404 (unsigned long long)OCFS2_I(inode)->ip_blkno,
405 (unsigned long long)new_i_size); 405 (unsigned long long)new_i_size);
406 406
407 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 407 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
408 * already validated it */ 408 * already validated it */
409 fe = (struct ocfs2_dinode *) di_bh->b_data; 409 fe = (struct ocfs2_dinode *) di_bh->b_data;
410 410
411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
412 "Inode %llu, inode i_size = %lld != di " 412 "Inode %llu, inode i_size = %lld != di "
413 "i_size = %llu, i_flags = 0x%x\n", 413 "i_size = %llu, i_flags = 0x%x\n",
414 (unsigned long long)OCFS2_I(inode)->ip_blkno, 414 (unsigned long long)OCFS2_I(inode)->ip_blkno,
415 i_size_read(inode), 415 i_size_read(inode),
416 (unsigned long long)le64_to_cpu(fe->i_size), 416 (unsigned long long)le64_to_cpu(fe->i_size),
417 le32_to_cpu(fe->i_flags)); 417 le32_to_cpu(fe->i_flags));
418 418
419 if (new_i_size > le64_to_cpu(fe->i_size)) { 419 if (new_i_size > le64_to_cpu(fe->i_size)) {
420 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 420 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
421 (unsigned long long)le64_to_cpu(fe->i_size), 421 (unsigned long long)le64_to_cpu(fe->i_size),
422 (unsigned long long)new_i_size); 422 (unsigned long long)new_i_size);
423 status = -EINVAL; 423 status = -EINVAL;
424 mlog_errno(status); 424 mlog_errno(status);
425 goto bail; 425 goto bail;
426 } 426 }
427 427
428 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 428 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
429 (unsigned long long)le64_to_cpu(fe->i_blkno), 429 (unsigned long long)le64_to_cpu(fe->i_blkno),
430 (unsigned long long)le64_to_cpu(fe->i_size), 430 (unsigned long long)le64_to_cpu(fe->i_size),
431 (unsigned long long)new_i_size); 431 (unsigned long long)new_i_size);
432 432
433 /* lets handle the simple truncate cases before doing any more 433 /* lets handle the simple truncate cases before doing any more
434 * cluster locking. */ 434 * cluster locking. */
435 if (new_i_size == le64_to_cpu(fe->i_size)) 435 if (new_i_size == le64_to_cpu(fe->i_size))
436 goto bail; 436 goto bail;
437 437
438 down_write(&OCFS2_I(inode)->ip_alloc_sem); 438 down_write(&OCFS2_I(inode)->ip_alloc_sem);
439 439
440 /* 440 /*
441 * The inode lock forced other nodes to sync and drop their 441 * The inode lock forced other nodes to sync and drop their
442 * pages, which (correctly) happens even if we have a truncate 442 * pages, which (correctly) happens even if we have a truncate
443 * without allocation change - ocfs2 cluster sizes can be much 443 * without allocation change - ocfs2 cluster sizes can be much
444 * greater than page size, so we have to truncate them 444 * greater than page size, so we have to truncate them
445 * anyway. 445 * anyway.
446 */ 446 */
447 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 447 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
448 truncate_inode_pages(inode->i_mapping, new_i_size); 448 truncate_inode_pages(inode->i_mapping, new_i_size);
449 449
450 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 450 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
451 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 451 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
452 i_size_read(inode), 1); 452 i_size_read(inode), 1);
453 if (status) 453 if (status)
454 mlog_errno(status); 454 mlog_errno(status);
455 455
456 goto bail_unlock_sem; 456 goto bail_unlock_sem;
457 } 457 }
458 458
459 /* alright, we're going to need to do a full blown alloc size 459 /* alright, we're going to need to do a full blown alloc size
460 * change. Orphan the inode so that recovery can complete the 460 * change. Orphan the inode so that recovery can complete the
461 * truncate if necessary. This does the task of marking 461 * truncate if necessary. This does the task of marking
462 * i_size. */ 462 * i_size. */
463 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 463 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
464 if (status < 0) { 464 if (status < 0) {
465 mlog_errno(status); 465 mlog_errno(status);
466 goto bail_unlock_sem; 466 goto bail_unlock_sem;
467 } 467 }
468 468
469 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 469 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
470 if (status < 0) { 470 if (status < 0) {
471 mlog_errno(status); 471 mlog_errno(status);
472 goto bail_unlock_sem; 472 goto bail_unlock_sem;
473 } 473 }
474 474
475 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 475 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
476 if (status < 0) { 476 if (status < 0) {
477 mlog_errno(status); 477 mlog_errno(status);
478 goto bail_unlock_sem; 478 goto bail_unlock_sem;
479 } 479 }
480 480
481 /* TODO: orphan dir cleanup here. */ 481 /* TODO: orphan dir cleanup here. */
482 bail_unlock_sem: 482 bail_unlock_sem:
483 up_write(&OCFS2_I(inode)->ip_alloc_sem); 483 up_write(&OCFS2_I(inode)->ip_alloc_sem);
484 484
485 bail: 485 bail:
486 486
487 mlog_exit(status); 487 mlog_exit(status);
488 return status; 488 return status;
489 } 489 }
490 490
491 /* 491 /*
492 * extend file allocation only here. 492 * extend file allocation only here.
493 * we'll update all the disk stuff, and oip->alloc_size 493 * we'll update all the disk stuff, and oip->alloc_size
494 * 494 *
495 * expect stuff to be locked, a transaction started and enough data / 495 * expect stuff to be locked, a transaction started and enough data /
496 * metadata reservations in the contexts. 496 * metadata reservations in the contexts.
497 * 497 *
498 * Will return -EAGAIN, and a reason if a restart is needed. 498 * Will return -EAGAIN, and a reason if a restart is needed.
499 * If passed in, *reason will always be set, even in error. 499 * If passed in, *reason will always be set, even in error.
500 */ 500 */
501 int ocfs2_add_inode_data(struct ocfs2_super *osb, 501 int ocfs2_add_inode_data(struct ocfs2_super *osb,
502 struct inode *inode, 502 struct inode *inode,
503 u32 *logical_offset, 503 u32 *logical_offset,
504 u32 clusters_to_add, 504 u32 clusters_to_add,
505 int mark_unwritten, 505 int mark_unwritten,
506 struct buffer_head *fe_bh, 506 struct buffer_head *fe_bh,
507 handle_t *handle, 507 handle_t *handle,
508 struct ocfs2_alloc_context *data_ac, 508 struct ocfs2_alloc_context *data_ac,
509 struct ocfs2_alloc_context *meta_ac, 509 struct ocfs2_alloc_context *meta_ac,
510 enum ocfs2_alloc_restarted *reason_ret) 510 enum ocfs2_alloc_restarted *reason_ret)
511 { 511 {
512 int ret; 512 int ret;
513 struct ocfs2_extent_tree et; 513 struct ocfs2_extent_tree et;
514 514
515 ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); 515 ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
516 ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, 516 ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
517 clusters_to_add, mark_unwritten, 517 clusters_to_add, mark_unwritten,
518 &et, handle, 518 &et, handle,
519 data_ac, meta_ac, reason_ret); 519 data_ac, meta_ac, reason_ret);
520 520
521 return ret; 521 return ret;
522 } 522 }
523 523
/*
 * Grow @inode's allocation by @clusters_to_add clusters of data,
 * beginning at logical cluster @logical_start.  When @mark_unwritten is
 * nonzero the new extents are flagged unwritten, which is only legal on
 * sparse-file capable volumes (BUG otherwise).
 *
 * The allocation may require multiple passes: the allocator can ask us
 * to restart the whole function (re-reserve metadata) or just extend
 * the running transaction, which is handled via the restart_all /
 * restarted_transaction labels below.
 *
 * Returns 0 on success or a negative error code.
 */
static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				     u32 clusters_to_add, int mark_unwritten)
{
	int status = 0;
	int restart_func = 0;	/* set when the allocator asks for a full restart */
	int credits;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_tree et;
	int did_quota = 0;	/* nonzero while we hold a quota reservation */

	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

	/*
	 * This function only exists for file systems which don't
	 * support holes.
	 */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
	/* in-memory and on-disk cluster counts must agree before we extend */
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
	     "clusters_to_add = %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
	     clusters_to_add);
	/* reserve data clusters and (possibly) metadata blocks up front */
	ocfs2_init_dinode_extent_tree(&et, inode, bh);
	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				       &data_ac, &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
					    clusters_to_add);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	/* charge the whole remaining request to quota before allocating;
	 * any unused part is given back after ocfs2_add_inode_data() */
	if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
	    clusters_to_add))) {
		status = -EDQUOT;
		goto leave;
	}
	did_quota = 1;

	/* reserve a write to the file entry early on - that we if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access_di(handle, inode, bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	/* -EAGAIN here means a partial allocation; @why tells us how to
	 * restart (see the RESTART_* handling below) */
	status = ocfs2_add_inode_data(osb,
				      inode,
				      &logical_start,
				      clusters_to_add,
				      mark_unwritten,
				      bh,
				      handle,
				      data_ac,
				      meta_ac,
				      &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* figure out how much of the request is still outstanding;
	 * ip_lock protects ip_clusters */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	/* Release unused quota reservation */
	vfs_dq_free_space(inode,
			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	did_quota = 0;

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			/* need fresh metadata reservations - commit and
			 * redo everything from restart_all */
			mlog(0, "restarting function.\n");
			restart_func = 1;
		} else {
			BUG_ON(why != RESTART_TRANS);

			/* just ran out of journal credits - extend the
			 * handle and keep allocating */
			mlog(0, "restarting transaction.\n");
			/* TODO: This can be more intelligent. */
			credits = ocfs2_calc_extend_credits(osb->sb,
							    &fe->id2.i_list,
							    clusters_to_add);
			status = ocfs2_extend_trans(handle, credits);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size));
	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));

leave:
	/* error exit with a live quota reservation - give it back */
	if (status < 0 && did_quota)
		vfs_dq_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	/* a RESTART_META pass loops back with everything torn down above */
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	brelse(bh);
	bh = NULL;

	mlog_exit(status);
	return status;
}
688 688
/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
/*
 * Zero the (partial) block containing byte offset @size by running a
 * zero-length prepare/commit write cycle against the page cache page
 * that covers it.  Used by ocfs2_zero_extend() to zero the tail when
 * extending a file.  Does not touch i_size.
 */
static int ocfs2_write_zero_page(struct inode *inode,
				 u64 size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index;
	unsigned int offset;
	handle_t *handle = NULL;
	int ret;

	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
	/* ugh.  in prepare/commit_write, if from==to==start of block, we
	** skip the prepare.  make sure we never send an offset for the start
	** of a block
	*/
	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
		offset++;
	}
	index = size >> PAGE_CACHE_SHIFT;

	/* returns the page locked */
	page = grab_cache_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* zero-length range: this only maps/zeroes the block(s) backing
	 * @offset, no data is actually copied in */
	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* in ordered mode the buffer must be added to a transaction so
	 * the zeroed block hits disk before the size update does */
	if (ocfs2_should_order_data(inode)) {
		handle = ocfs2_start_walk_page_trans(inode, page, offset,
						     offset);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			handle = NULL;
			goto out_unlock;
		}
	}

	/* must not update i_size! */
	ret = block_commit_write(page, offset, offset);
	if (ret < 0)
		mlog_errno(ret);
	else
		ret = 0;

	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}
750 750
751 static int ocfs2_zero_extend(struct inode *inode, 751 static int ocfs2_zero_extend(struct inode *inode,
752 u64 zero_to_size) 752 u64 zero_to_size)
753 { 753 {
754 int ret = 0; 754 int ret = 0;
755 u64 start_off; 755 u64 start_off;
756 struct super_block *sb = inode->i_sb; 756 struct super_block *sb = inode->i_sb;
757 757
758 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 758 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
759 while (start_off < zero_to_size) { 759 while (start_off < zero_to_size) {
760 ret = ocfs2_write_zero_page(inode, start_off); 760 ret = ocfs2_write_zero_page(inode, start_off);
761 if (ret < 0) { 761 if (ret < 0) {
762 mlog_errno(ret); 762 mlog_errno(ret);
763 goto out; 763 goto out;
764 } 764 }
765 765
766 start_off += sb->s_blocksize; 766 start_off += sb->s_blocksize;
767 767
768 /* 768 /*
769 * Very large extends have the potential to lock up 769 * Very large extends have the potential to lock up
770 * the cpu for extended periods of time. 770 * the cpu for extended periods of time.
771 */ 771 */
772 cond_resched(); 772 cond_resched();
773 } 773 }
774 774
775 out: 775 out:
776 return ret; 776 return ret;
777 } 777 }
778 778
779 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 779 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
780 { 780 {
781 int ret; 781 int ret;
782 u32 clusters_to_add; 782 u32 clusters_to_add;
783 struct ocfs2_inode_info *oi = OCFS2_I(inode); 783 struct ocfs2_inode_info *oi = OCFS2_I(inode);
784 784
785 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 785 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
786 if (clusters_to_add < oi->ip_clusters) 786 if (clusters_to_add < oi->ip_clusters)
787 clusters_to_add = 0; 787 clusters_to_add = 0;
788 else 788 else
789 clusters_to_add -= oi->ip_clusters; 789 clusters_to_add -= oi->ip_clusters;
790 790
791 if (clusters_to_add) { 791 if (clusters_to_add) {
792 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, 792 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
793 clusters_to_add, 0); 793 clusters_to_add, 0);
794 if (ret) { 794 if (ret) {
795 mlog_errno(ret); 795 mlog_errno(ret);
796 goto out; 796 goto out;
797 } 797 }
798 } 798 }
799 799
800 /* 800 /*
801 * Call this even if we don't add any clusters to the tree. We 801 * Call this even if we don't add any clusters to the tree. We
802 * still need to zero the area between the old i_size and the 802 * still need to zero the area between the old i_size and the
803 * new i_size. 803 * new i_size.
804 */ 804 */
805 ret = ocfs2_zero_extend(inode, zero_to); 805 ret = ocfs2_zero_extend(inode, zero_to);
806 if (ret < 0) 806 if (ret < 0)
807 mlog_errno(ret); 807 mlog_errno(ret);
808 808
809 out: 809 out:
810 return ret; 810 return ret;
811 } 811 }
812 812
/*
 * Grow @inode to @new_i_size.  @di_bh is the inode's dinode buffer.
 * For inline-data inodes the data may stay inline (small extend) or be
 * converted to extents first; for non-sparse volumes the new range is
 * allocated and zeroed.  In all cases i_size is updated at the end via
 * ocfs2_simple_size_update().
 *
 * Caller must hold i_mutex (see the race comment below) and must not
 * pass a size smaller than the current one (BUG).
 */
static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	BUG_ON(!di_bh);

	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
		goto out;
	BUG_ON(new_i_size < i_size_read(inode));

	/*
	 * Fall through for converting inline data, even if the fs
	 * supports sparse files.
	 *
	 * The check for inline data here is legal - nobody can add
	 * the feature since we have i_mutex. We must check it again
	 * after acquiring ip_alloc_sem though, as paths like mmap
	 * might have raced us to converting the inode to extents.
	 */
	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		goto out_update_size;

	/*
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
	 * i_mutex to block other extend/truncate calls while we're
	 * here.
	 */
	down_write(&oi->ip_alloc_sem);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * We can optimize small extends by keeping the inodes
		 * inline data.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
			up_write(&oi->ip_alloc_sem);
			goto out_update_size;
		}

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			up_write(&oi->ip_alloc_sem);

			mlog_errno(ret);
			goto out;
		}
	}

	/* non-sparse volumes must back and zero the whole new range now */
	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);

	up_write(&oi->ip_alloc_sem);

	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out_update_size:
	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}
888 888
/*
 * ->setattr for ocfs2 inodes.  Handles size changes (extend/truncate
 * under the rw and inode cluster locks), uid/gid changes (with quota
 * transfer under the global quota file locks), and the remaining
 * attribute updates via inode_setattr().  The dinode is marked dirty
 * in a journal transaction before the locks are dropped.
 */
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	int status = 0, size_change;
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
	handle_t *handle = NULL;
	int locked[MAXQUOTAS] = {0, 0};	/* which quota files we locked */
	int credits, qtype;
	struct ocfs2_mem_dqinfo *oinfo;

	mlog_entry("(0x%p, '%.*s')\n", dentry,
	           dentry->d_name.len, dentry->d_name.name);

	/* ensuring we don't even attempt to truncate a symlink */
	if (S_ISLNK(inode->i_mode))
		attr->ia_valid &= ~ATTR_SIZE;

	if (attr->ia_valid & ATTR_MODE)
		mlog(0, "mode change: %d\n", attr->ia_mode);
	if (attr->ia_valid & ATTR_UID)
		mlog(0, "uid change: %d\n", attr->ia_uid);
	if (attr->ia_valid & ATTR_GID)
		mlog(0, "gid change: %d\n", attr->ia_gid);
	if (attr->ia_valid & ATTR_SIZE)
		mlog(0, "size change...\n");
	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
		mlog(0, "time change...\n");

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
		/* nothing we handle is being changed - silently succeed */
		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
		return 0;
	}

	status = inode_change_ok(inode, attr);
	if (status)
		return status;

	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
		/* exclusive rw lock keeps cluster-wide readers/writers out
		 * while the allocation changes */
		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* exclusive inode (metadata) cluster lock; also reads the dinode
	 * block into bh */
	status = ocfs2_inode_lock(inode, &bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail_unlock_rw;
	}

	if (size_change && attr->ia_size != i_size_read(inode)) {
		if (attr->ia_size > sb->s_maxbytes) {
			status = -EFBIG;
			goto bail_unlock;
		}

		if (i_size_read(inode) > attr->ia_size) {
			if (ocfs2_should_order_data(inode)) {
				status = ocfs2_begin_ordered_truncate(inode,
								      attr->ia_size);
				if (status)
					goto bail_unlock;
			}
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
		} else
			status = ocfs2_extend_file(inode, bh, attr->ia_size);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			/* NOTE(review): this collapses every extend/truncate
			 * failure to -ENOSPC, hiding e.g. -EDQUOT or -EIO
			 * from the caller - confirm whether that is intended */
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		/* owner change with quota enabled: take the global quota
		 * file locks and budget journal credits for the transfer */
		credits = OCFS2_INODE_UPDATE_CREDITS;
		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
			oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
			status = ocfs2_lock_global_qf(oinfo, 1);
			if (status < 0)
				goto bail_unlock;
			credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
				ocfs2_calc_qdel_credits(sb, USRQUOTA);
			locked[USRQUOTA] = 1;
		}
		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
			oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
			status = ocfs2_lock_global_qf(oinfo, 1);
			if (status < 0)
				goto bail_unlock;
			credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
				   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
			locked[GRPQUOTA] = 1;
		}
		handle = ocfs2_start_trans(osb, credits);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock;
		}
		status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
		if (status < 0)
			goto bail_commit;
	} else {
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock;
		}
	}

	/*
	 * This will intentionally not wind up calling vmtruncate(),
	 * since all the work for a size change has been done above.
	 * Otherwise, we could get into problems with truncate as
	 * ip_alloc_sem is used there to protect against i_size
	 * changes.
	 */
	status = inode_setattr(inode, attr);
	if (status < 0) {
		mlog_errno(status);
		goto bail_commit;
	}

	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	if (status < 0)
		mlog_errno(status);

bail_commit:
	ocfs2_commit_trans(osb, handle);
bail_unlock:
	/* drop any quota file locks taken above */
	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
		if (!locked[qtype])
			continue;
		oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
		ocfs2_unlock_global_qf(oinfo, 1);
	}
	ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
	if (size_change)
		ocfs2_rw_unlock(inode, 1);
bail:
	brelse(bh);

	/* mode change: refresh the ACL outside the cluster locks */
	if (!status && attr->ia_valid & ATTR_MODE) {
		status = ocfs2_acl_chmod(inode);
		if (status < 0)
			mlog_errno(status);
	}

	mlog_exit(status);
	return status;
}
1055 1055
1056 int ocfs2_getattr(struct vfsmount *mnt, 1056 int ocfs2_getattr(struct vfsmount *mnt,
1057 struct dentry *dentry, 1057 struct dentry *dentry,
1058 struct kstat *stat) 1058 struct kstat *stat)
1059 { 1059 {
1060 struct inode *inode = dentry->d_inode; 1060 struct inode *inode = dentry->d_inode;
1061 struct super_block *sb = dentry->d_inode->i_sb; 1061 struct super_block *sb = dentry->d_inode->i_sb;
1062 struct ocfs2_super *osb = sb->s_fs_info; 1062 struct ocfs2_super *osb = sb->s_fs_info;
1063 int err; 1063 int err;
1064 1064
1065 mlog_entry_void(); 1065 mlog_entry_void();
1066 1066
1067 err = ocfs2_inode_revalidate(dentry); 1067 err = ocfs2_inode_revalidate(dentry);
1068 if (err) { 1068 if (err) {
1069 if (err != -ENOENT) 1069 if (err != -ENOENT)
1070 mlog_errno(err); 1070 mlog_errno(err);
1071 goto bail; 1071 goto bail;
1072 } 1072 }
1073 1073
1074 generic_fillattr(inode, stat); 1074 generic_fillattr(inode, stat);
1075 1075
1076 /* We set the blksize from the cluster size for performance */ 1076 /* We set the blksize from the cluster size for performance */
1077 stat->blksize = osb->s_clustersize; 1077 stat->blksize = osb->s_clustersize;
1078 1078
1079 bail: 1079 bail:
1080 mlog_exit(err); 1080 mlog_exit(err);
1081 1081
1082 return err; 1082 return err;
1083 } 1083 }
1084 1084
1085 int ocfs2_permission(struct inode *inode, int mask) 1085 int ocfs2_permission(struct inode *inode, int mask)
1086 { 1086 {
1087 int ret; 1087 int ret;
1088 1088
1089 mlog_entry_void(); 1089 mlog_entry_void();
1090 1090
1091 ret = ocfs2_inode_lock(inode, NULL, 0); 1091 ret = ocfs2_inode_lock(inode, NULL, 0);
1092 if (ret) { 1092 if (ret) {
1093 if (ret != -ENOENT) 1093 if (ret != -ENOENT)
1094 mlog_errno(ret); 1094 mlog_errno(ret);
1095 goto out; 1095 goto out;
1096 } 1096 }
1097 1097
1098 ret = generic_permission(inode, mask, ocfs2_check_acl); 1098 ret = generic_permission(inode, mask, ocfs2_check_acl);
1099 1099
1100 ocfs2_inode_unlock(inode, 0); 1100 ocfs2_inode_unlock(inode, 0);
1101 out: 1101 out:
1102 mlog_exit(ret); 1102 mlog_exit(ret);
1103 return ret; 1103 return ret;
1104 } 1104 }
1105 1105
/*
 * Clear the setuid bit (and setgid, when group-execute is set) on
 * @inode and write the new mode into the dinode in @bh under a journal
 * transaction.  Caller holds the locks protecting the dinode buffer
 * (this is a static helper; see ocfs2_write_remove_suid()).
 */
static int __ocfs2_write_remove_suid(struct inode *inode,
				     struct buffer_head *bh)
{
	int ret;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di;

	mlog_entry("(Inode %llu, mode 0%o)\n",
		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/* declare the dinode write to the journal before modifying it */
	ret = ocfs2_journal_access_di(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_trans;
	}

	inode->i_mode &= ~S_ISUID;
	/* setgid without group-execute marks mandatory locking - keep it */
	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
		inode->i_mode &= ~S_ISGID;

	/* mirror the in-memory mode into the on-disk inode */
	di = (struct ocfs2_dinode *) bh->b_data;
	di->i_mode = cpu_to_le16(inode->i_mode);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret < 0)
		mlog_errno(ret);

out_trans:
	ocfs2_commit_trans(osb, handle);
out:
	mlog_exit(ret);
	return ret;
}
1148 1148
1149 /* 1149 /*
1150 * Will look for holes and unwritten extents in the range starting at 1150 * Will look for holes and unwritten extents in the range starting at
1151 * pos for count bytes (inclusive). 1151 * pos for count bytes (inclusive).
1152 */ 1152 */
1153 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1153 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1154 size_t count) 1154 size_t count)
1155 { 1155 {
1156 int ret = 0; 1156 int ret = 0;
1157 unsigned int extent_flags; 1157 unsigned int extent_flags;
1158 u32 cpos, clusters, extent_len, phys_cpos; 1158 u32 cpos, clusters, extent_len, phys_cpos;
1159 struct super_block *sb = inode->i_sb; 1159 struct super_block *sb = inode->i_sb;
1160 1160
1161 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1161 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1162 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1162 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1163 1163
1164 while (clusters) { 1164 while (clusters) {
1165 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1165 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1166 &extent_flags); 1166 &extent_flags);
1167 if (ret < 0) { 1167 if (ret < 0) {
1168 mlog_errno(ret); 1168 mlog_errno(ret);
1169 goto out; 1169 goto out;
1170 } 1170 }
1171 1171
1172 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1172 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1173 ret = 1; 1173 ret = 1;
1174 break; 1174 break;
1175 } 1175 }
1176 1176
1177 if (extent_len > clusters) 1177 if (extent_len > clusters)
1178 extent_len = clusters; 1178 extent_len = clusters;
1179 1179
1180 clusters -= extent_len; 1180 clusters -= extent_len;
1181 cpos += extent_len; 1181 cpos += extent_len;
1182 } 1182 }
1183 out: 1183 out:
1184 return ret; 1184 return ret;
1185 } 1185 }
1186 1186
1187 static int ocfs2_write_remove_suid(struct inode *inode) 1187 static int ocfs2_write_remove_suid(struct inode *inode)
1188 { 1188 {
1189 int ret; 1189 int ret;
1190 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
1191 1191
1192 ret = ocfs2_read_inode_block(inode, &bh); 1192 ret = ocfs2_read_inode_block(inode, &bh);
1193 if (ret < 0) { 1193 if (ret < 0) {
1194 mlog_errno(ret); 1194 mlog_errno(ret);
1195 goto out; 1195 goto out;
1196 } 1196 }
1197 1197
1198 ret = __ocfs2_write_remove_suid(inode, bh); 1198 ret = __ocfs2_write_remove_suid(inode, bh);
1199 out: 1199 out:
1200 brelse(bh); 1200 brelse(bh);
1201 return ret; 1201 return ret;
1202 } 1202 }
1203 1203
/*
 * Allocate enough extents to cover the region starting at byte offset
 * start for len bytes. Existing extents are skipped, any extents
 * added are marked as "unwritten".
 *
 * Returns 0 on success (including the no-op case where everything was
 * already allocated) and a negative errno otherwise.  -ENOSPC is
 * returned without logging since the caller reports it to userspace.
 */
static int ocfs2_allocate_unwritten_extents(struct inode *inode,
					    u64 start, u64 len)
{
	int ret;
	u32 cpos, phys_cpos, clusters, alloc_size;
	u64 end = start + len;
	struct buffer_head *di_bh = NULL;

	/*
	 * Inline-data inodes have no extent list.  Either the request
	 * still fits inline (nothing to do) or the inode must first be
	 * converted to extent-based storage.
	 */
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_read_inode_block(inode, &di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Nothing to do if the requested reservation range
		 * fits within the inode.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, end))
			goto out;

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * We consider both start and len to be inclusive.
	 */
	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
	clusters -= cpos;

	/* Walk the range one extent/hole at a time, filling holes. */
	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Hole or existing extent len can be arbitrary, so
		 * cap it to our own allocation request.
		 */
		if (alloc_size > clusters)
			alloc_size = clusters;

		if (phys_cpos) {
			/*
			 * We already have an allocation at this
			 * region so we can safely skip it.
			 */
			goto next;
		}

		/* Hole: allocate alloc_size clusters, marked unwritten. */
		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
		if (ret) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}

next:
		cpos += alloc_size;
		clusters -= alloc_size;
	}

	ret = 0;
out:

	brelse(di_bh);
	return ret;
}
1286 1286
1287 /* 1287 /*
1288 * Truncate a byte range, avoiding pages within partial clusters. This 1288 * Truncate a byte range, avoiding pages within partial clusters. This
1289 * preserves those pages for the zeroing code to write to. 1289 * preserves those pages for the zeroing code to write to.
1290 */ 1290 */
1291 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1291 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1292 u64 byte_len) 1292 u64 byte_len)
1293 { 1293 {
1294 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1294 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1295 loff_t start, end; 1295 loff_t start, end;
1296 struct address_space *mapping = inode->i_mapping; 1296 struct address_space *mapping = inode->i_mapping;
1297 1297
1298 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1298 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1299 end = byte_start + byte_len; 1299 end = byte_start + byte_len;
1300 end = end & ~(osb->s_clustersize - 1); 1300 end = end & ~(osb->s_clustersize - 1);
1301 1301
1302 if (start < end) { 1302 if (start < end) {
1303 unmap_mapping_range(mapping, start, end - start, 0); 1303 unmap_mapping_range(mapping, start, end - start, 0);
1304 truncate_inode_pages_range(mapping, start, end - 1); 1304 truncate_inode_pages_range(mapping, start, end - 1);
1305 } 1305 }
1306 } 1306 }
1307 1307
1308 static int ocfs2_zero_partial_clusters(struct inode *inode, 1308 static int ocfs2_zero_partial_clusters(struct inode *inode,
1309 u64 start, u64 len) 1309 u64 start, u64 len)
1310 { 1310 {
1311 int ret = 0; 1311 int ret = 0;
1312 u64 tmpend, end = start + len; 1312 u64 tmpend, end = start + len;
1313 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1313 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1314 unsigned int csize = osb->s_clustersize; 1314 unsigned int csize = osb->s_clustersize;
1315 handle_t *handle; 1315 handle_t *handle;
1316 1316
1317 /* 1317 /*
1318 * The "start" and "end" values are NOT necessarily part of 1318 * The "start" and "end" values are NOT necessarily part of
1319 * the range whose allocation is being deleted. Rather, this 1319 * the range whose allocation is being deleted. Rather, this
1320 * is what the user passed in with the request. We must zero 1320 * is what the user passed in with the request. We must zero
1321 * partial clusters here. There's no need to worry about 1321 * partial clusters here. There's no need to worry about
1322 * physical allocation - the zeroing code knows to skip holes. 1322 * physical allocation - the zeroing code knows to skip holes.
1323 */ 1323 */
1324 mlog(0, "byte start: %llu, end: %llu\n", 1324 mlog(0, "byte start: %llu, end: %llu\n",
1325 (unsigned long long)start, (unsigned long long)end); 1325 (unsigned long long)start, (unsigned long long)end);
1326 1326
1327 /* 1327 /*
1328 * If both edges are on a cluster boundary then there's no 1328 * If both edges are on a cluster boundary then there's no
1329 * zeroing required as the region is part of the allocation to 1329 * zeroing required as the region is part of the allocation to
1330 * be truncated. 1330 * be truncated.
1331 */ 1331 */
1332 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1332 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1333 goto out; 1333 goto out;
1334 1334
1335 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1335 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1336 if (IS_ERR(handle)) { 1336 if (IS_ERR(handle)) {
1337 ret = PTR_ERR(handle); 1337 ret = PTR_ERR(handle);
1338 mlog_errno(ret); 1338 mlog_errno(ret);
1339 goto out; 1339 goto out;
1340 } 1340 }
1341 1341
1342 /* 1342 /*
1343 * We want to get the byte offset of the end of the 1st cluster. 1343 * We want to get the byte offset of the end of the 1st cluster.
1344 */ 1344 */
1345 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1345 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1346 if (tmpend > end) 1346 if (tmpend > end)
1347 tmpend = end; 1347 tmpend = end;
1348 1348
1349 mlog(0, "1st range: start: %llu, tmpend: %llu\n", 1349 mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1350 (unsigned long long)start, (unsigned long long)tmpend); 1350 (unsigned long long)start, (unsigned long long)tmpend);
1351 1351
1352 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1352 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1353 if (ret) 1353 if (ret)
1354 mlog_errno(ret); 1354 mlog_errno(ret);
1355 1355
1356 if (tmpend < end) { 1356 if (tmpend < end) {
1357 /* 1357 /*
1358 * This may make start and end equal, but the zeroing 1358 * This may make start and end equal, but the zeroing
1359 * code will skip any work in that case so there's no 1359 * code will skip any work in that case so there's no
1360 * need to catch it up here. 1360 * need to catch it up here.
1361 */ 1361 */
1362 start = end & ~(osb->s_clustersize - 1); 1362 start = end & ~(osb->s_clustersize - 1);
1363 1363
1364 mlog(0, "2nd range: start: %llu, end: %llu\n", 1364 mlog(0, "2nd range: start: %llu, end: %llu\n",
1365 (unsigned long long)start, (unsigned long long)end); 1365 (unsigned long long)start, (unsigned long long)end);
1366 1366
1367 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1367 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1368 if (ret) 1368 if (ret)
1369 mlog_errno(ret); 1369 mlog_errno(ret);
1370 } 1370 }
1371 1371
1372 ocfs2_commit_trans(osb, handle); 1372 ocfs2_commit_trans(osb, handle);
1373 out: 1373 out:
1374 return ret; 1374 return ret;
1375 } 1375 }
1376 1376
/*
 * Punch a hole covering [byte_start, byte_start + byte_len) of @inode.
 *
 * Partial clusters at either edge are only zeroed (via
 * ocfs2_zero_partial_clusters()); allocation is removed solely for the
 * whole clusters contained in the range, which is why trunc_start is
 * rounded up and the range end rounded down below.  Called from
 * __ocfs2_change_file_space() with i_mutex, the rw/inode cluster locks
 * and ip_alloc_sem already held.
 *
 * Returns 0 on success or a negative errno.
 */
static int ocfs2_remove_inode_range(struct inode *inode,
				    struct buffer_head *di_bh, u64 byte_start,
				    u64 byte_len)
{
	int ret = 0;
	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct address_space *mapping = inode->i_mapping;
	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	if (byte_len == 0)
		return 0;

	/* Inline data: just shrink the in-dinode payload. */
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
					    byte_start + byte_len, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/*
		 * There's no need to get fancy with the page cache
		 * truncate of an inline-data inode. We're talking
		 * about less than a page here, which will be cached
		 * in the dinode buffer anyway.
		 */
		unmap_mapping_range(mapping, 0, 0, 0);
		truncate_inode_pages(mapping, 0);
		goto out;
	}

	/*
	 * trunc_start: first whole cluster inside the range (round up);
	 * trunc_len: number of whole clusters up to the rounded-down end.
	 */
	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
	if (trunc_len >= trunc_start)
		trunc_len -= trunc_start;
	else
		trunc_len = 0;

	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)byte_start,
	     (unsigned long long)byte_len, trunc_start, trunc_len);

	/* Zero the partial clusters at both edges first. */
	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Remove the btree ranges for all fully-covered, allocated extents. */
	cpos = trunc_start;
	while (trunc_len) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (alloc_size > trunc_len)
			alloc_size = trunc_len;

		/* Only do work for non-holes */
		if (phys_cpos != 0) {
			ret = ocfs2_remove_btree_range(inode, &et, cpos,
						       phys_cpos, alloc_size,
						       &dealloc);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		cpos += alloc_size;
		trunc_len -= alloc_size;
	}

	/* Drop page-cache pages for the whole clusters we removed. */
	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);

out:
	/* Flush/free even on error: dealloc may hold partial progress. */
	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &dealloc);

	return ret;
}
1465 1465
/*
 * Parts of this function taken from xfs_change_file_space()
 *
 * Common implementation behind the RESVSP/UNRESVSP ioctls and
 * ocfs2_fallocate().  @file may be NULL (fallocate path).  @sr is
 * normalized in place to an absolute range (l_whence becomes 0).
 * @change_size selects whether i_size is extended to cover the
 * reserved range.
 *
 * Lock ordering established here:
 *   i_mutex -> rw cluster lock -> inode meta cluster lock
 *           -> ip_alloc_sem
 *
 * Returns 0 on success or a negative errno.
 */
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
				     loff_t f_pos, unsigned int cmd,
				     struct ocfs2_space_resv *sr,
				     int change_size)
{
	int ret;
	s64 llen;
	loff_t size;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *di_bh = NULL;
	handle_t *handle;
	unsigned long long max_off = inode->i_sb->s_maxbytes;

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	mutex_lock(&inode->i_mutex);

	/*
	 * This prevents concurrent writes on other nodes
	 */
	ret = ocfs2_rw_lock(inode, 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Exclusive (write-level) meta lock; also reads the dinode. */
	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_rw_unlock;
	}

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		ret = -EPERM;
		goto out_inode_unlock;
	}

	/* Translate l_start to an absolute byte offset. */
	switch (sr->l_whence) {
	case 0: /*SEEK_SET*/
		break;
	case 1: /*SEEK_CUR*/
		sr->l_start += f_pos;
		break;
	case 2: /*SEEK_END*/
		sr->l_start += i_size_read(inode);
		break;
	default:
		ret = -EINVAL;
		goto out_inode_unlock;
	}
	sr->l_whence = 0;

	/* llen is the offset of the last byte, so the range checks below
	 * stay inclusive; a non-positive l_len is passed through as-is. */
	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;

	if (sr->l_start < 0
	    || sr->l_start > max_off
	    || (sr->l_start + llen) < 0
	    || (sr->l_start + llen) > max_off) {
		ret = -EINVAL;
		goto out_inode_unlock;
	}
	size = sr->l_start + sr->l_len;

	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
		if (sr->l_len <= 0) {
			ret = -EINVAL;
			goto out_inode_unlock;
		}
	}

	/* Clear suid/sgid here, while we already hold the cluster locks. */
	if (file && should_remove_suid(file->f_path.dentry)) {
		ret = __ocfs2_write_remove_suid(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_inode_unlock;
		}
	}

	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	switch (cmd) {
	case OCFS2_IOC_RESVSP:
	case OCFS2_IOC_RESVSP64:
		/*
		 * This takes unsigned offsets, but the signed ones we
		 * pass have been checked against overflow above.
		 */
		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
						       sr->l_len);
		break;
	case OCFS2_IOC_UNRESVSP:
	case OCFS2_IOC_UNRESVSP64:
		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
					       sr->l_len);
		break;
	default:
		ret = -EINVAL;
	}
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	if (ret) {
		mlog_errno(ret);
		goto out_inode_unlock;
	}

	/*
	 * We update c/mtime for these changes
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_inode_unlock;
	}

	if (change_size && i_size_read(inode) < size)
		i_size_write(inode, size);

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);

out_inode_unlock:
	brelse(di_bh);
	ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
	ocfs2_rw_unlock(inode, 1);

out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}
1603 1603
1604 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1604 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1605 struct ocfs2_space_resv *sr) 1605 struct ocfs2_space_resv *sr)
1606 { 1606 {
1607 struct inode *inode = file->f_path.dentry->d_inode; 1607 struct inode *inode = file->f_path.dentry->d_inode;
1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1609 1609
1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1611 !ocfs2_writes_unwritten_extents(osb)) 1611 !ocfs2_writes_unwritten_extents(osb))
1612 return -ENOTTY; 1612 return -ENOTTY;
1613 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1613 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1614 !ocfs2_sparse_alloc(osb)) 1614 !ocfs2_sparse_alloc(osb))
1615 return -ENOTTY; 1615 return -ENOTTY;
1616 1616
1617 if (!S_ISREG(inode->i_mode)) 1617 if (!S_ISREG(inode->i_mode))
1618 return -EINVAL; 1618 return -EINVAL;
1619 1619
1620 if (!(file->f_mode & FMODE_WRITE)) 1620 if (!(file->f_mode & FMODE_WRITE))
1621 return -EBADF; 1621 return -EBADF;
1622 1622
1623 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1623 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1624 } 1624 }
1625 1625
1626 static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1626 static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1627 loff_t len) 1627 loff_t len)
1628 { 1628 {
1629 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1629 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1630 struct ocfs2_space_resv sr; 1630 struct ocfs2_space_resv sr;
1631 int change_size = 1; 1631 int change_size = 1;
1632 1632
1633 if (!ocfs2_writes_unwritten_extents(osb)) 1633 if (!ocfs2_writes_unwritten_extents(osb))
1634 return -EOPNOTSUPP; 1634 return -EOPNOTSUPP;
1635 1635
1636 if (S_ISDIR(inode->i_mode)) 1636 if (S_ISDIR(inode->i_mode))
1637 return -ENODEV; 1637 return -ENODEV;
1638 1638
1639 if (mode & FALLOC_FL_KEEP_SIZE) 1639 if (mode & FALLOC_FL_KEEP_SIZE)
1640 change_size = 0; 1640 change_size = 0;
1641 1641
1642 sr.l_whence = 0; 1642 sr.l_whence = 0;
1643 sr.l_start = (s64)offset; 1643 sr.l_start = (s64)offset;
1644 sr.l_len = (s64)len; 1644 sr.l_len = (s64)len;
1645 1645
1646 return __ocfs2_change_file_space(NULL, inode, offset, 1646 return __ocfs2_change_file_space(NULL, inode, offset,
1647 OCFS2_IOC_RESVSP64, &sr, change_size); 1647 OCFS2_IOC_RESVSP64, &sr, change_size);
1648 } 1648 }
1649 1649
/*
 * Prepare @dentry's inode for a write of @count bytes at *@ppos:
 * take (and release before returning) the inode meta cluster lock,
 * clear suid/sgid if required, resolve the final write offset for
 * O_APPEND, and decide whether an O_DIRECT write is allowed.
 *
 * *direct_io (may be NULL) is cleared when direct io cannot be used:
 * inline-data inodes, size-extending writes, or ranges containing
 * holes/unwritten extents.  When @appending, *ppos is updated to the
 * current i_size on success.
 *
 * Returns 0 on success or a negative errno.
 */
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
					 loff_t *ppos,
					 size_t count,
					 int appending,
					 int *direct_io)
{
	int ret = 0, meta_level = 0;
	struct inode *inode = dentry->d_inode;
	loff_t saved_pos, end;

	/*
	 * We start with a read level meta lock and only jump to an ex
	 * if we need to make modifications here.
	 */
	for(;;) {
		ret = ocfs2_inode_lock(inode, NULL, meta_level);
		if (ret < 0) {
			/* -1 so out_unlock is skipped: nothing is held. */
			meta_level = -1;
			mlog_errno(ret);
			goto out;
		}

		/* Clear suid / sgid if necessary. We do this here
		 * instead of later in the write path because
		 * remove_suid() calls ->setattr without any hint that
		 * we may have already done our cluster locking. Since
		 * ocfs2_setattr() *must* take cluster locks to
		 * proceeed, this will lead us to recursively lock the
		 * inode. There's also the dinode i_size state which
		 * can be lost via setattr during extending writes (we
		 * set inode->i_size at the end of a write. */
		if (should_remove_suid(dentry)) {
			if (meta_level == 0) {
				/* Retry with an exclusive (1) meta lock. */
				ocfs2_inode_unlock(inode, meta_level);
				meta_level = 1;
				continue;
			}

			ret = ocfs2_write_remove_suid(inode);
			if (ret < 0) {
				mlog_errno(ret);
				goto out_unlock;
			}
		}

		/* work on a copy of ppos until we're sure that we won't have
		 * to recalculate it due to relocking. */
		if (appending) {
			saved_pos = i_size_read(inode);
			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
		} else {
			saved_pos = *ppos;
		}

		end = saved_pos + count;

		/*
		 * Skip the O_DIRECT checks if we don't need
		 * them.
		 */
		if (!direct_io || !(*direct_io))
			break;

		/*
		 * There's no sane way to do direct writes to an inode
		 * with inline data.
		 */
		if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
			*direct_io = 0;
			break;
		}

		/*
		 * Allowing concurrent direct writes means
		 * i_size changes wouldn't be synchronized, so
		 * one node could wind up truncating another
		 * nodes writes.
		 */
		if (end > i_size_read(inode)) {
			*direct_io = 0;
			break;
		}

		/*
		 * We don't fill holes during direct io, so
		 * check for them here. If any are found, the
		 * caller will have to retake some cluster
		 * locks and initiate the io as buffered.
		 */
		ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
		if (ret == 1) {
			/* Holes found: fall back to buffered io, not an
			 * error. */
			*direct_io = 0;
			ret = 0;
		} else if (ret < 0)
			mlog_errno(ret);
		break;
	}

	if (appending)
		*ppos = saved_pos;

out_unlock:
	ocfs2_inode_unlock(inode, meta_level);

out:
	return ret;
}
1757 1757
/*
 * Write entry point (->aio_write) for ocfs2 regular files.
 *
 * Lock ordering matches ocfs2_setattr(): i_mutex -> i_alloc_sem (taken
 * for O_DIRECT only) -> cluster rw lock.  If a requested direct write
 * cannot be performed directly (it would extend the file or cross a
 * hole -- see ocfs2_prepare_inode_for_write()), the inner locks are
 * dropped and the whole attempt is retried as buffered I/O via the
 * "relock" label.
 *
 * Returns the number of bytes written, or a negative errno.
 */
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs,
				    loff_t pos)
{
	int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
	int can_do_direct;
	ssize_t written = 0;
	size_t ocount;		/* original count */
	size_t count;		/* after file limit checks */
	loff_t old_size, *ppos = &iocb->ki_pos;
	u32 old_clusters;
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry("(0x%p, %u, '%.*s')\n", file,
		   (unsigned int)nr_segs,
		   file->f_path.dentry->d_name.len,
		   file->f_path.dentry->d_name.name);

	/* Zero-length writes succeed trivially, before any locking. */
	if (iocb->ki_left == 0)
		return 0;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	appending = file->f_flags & O_APPEND ? 1 : 0;
	direct_io = file->f_flags & O_DIRECT ? 1 : 0;

	mutex_lock(&inode->i_mutex);

relock:
	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
	if (direct_io) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;
	}

	/* concurrent O_DIRECT writes are allowed */
	rw_level = !direct_io;
	ret = ocfs2_rw_lock(inode, rw_level);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_sems;
	}

	/*
	 * ocfs2_prepare_inode_for_write() clears can_do_direct when the
	 * direct write must fall back to buffered (file extension or a
	 * hole in the requested range).
	 */
	can_do_direct = direct_io;
	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
					    iocb->ki_left, appending,
					    &can_do_direct);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * We can't complete the direct I/O as requested, fall back to
	 * buffered I/O.
	 */
	if (direct_io && !can_do_direct) {
		ocfs2_rw_unlock(inode, rw_level);
		up_read(&inode->i_alloc_sem);

		have_alloc_sem = 0;
		rw_level = -1;

		direct_io = 0;
		goto relock;
	}

	/*
	 * To later detect whether a journal commit for sync writes is
	 * necessary, we sample i_size, and cluster count here.
	 */
	old_size = i_size_read(inode);
	old_clusters = OCFS2_I(inode)->ip_clusters;

	/* communicate with ocfs2_dio_end_io */
	ocfs2_iocb_set_rw_locked(iocb, rw_level);

	if (direct_io) {
		ret = generic_segment_checks(iov, &nr_segs, &ocount,
					     VERIFY_READ);
		if (ret)
			goto out_dio;

		ret = generic_write_checks(file, ppos, &count,
					   S_ISBLK(inode->i_mode));
		if (ret)
			goto out_dio;

		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
						    ppos, count, ocount);
		if (written < 0) {
			/*
			 * direct write may have instantiated a few
			 * blocks outside i_size. Trim these off again.
			 * Don't need i_size_read because we hold i_mutex.
			 */
			if (*ppos + count > inode->i_size)
				vmtruncate(inode, inode->i_size);
			ret = written;
			goto out_dio;
		}
	} else {
		/* i_mutex is already held, hence the _nolock variant */
		written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
							*ppos);
	}

out_dio:
	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));

	if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
		/*
		 * The generic write paths have handled getting data
		 * to disk, but since we don't make use of the dirty
		 * inode list, a manual journal commit is necessary
		 * here.
		 */
		if (old_size != i_size_read(inode) ||
		    old_clusters != OCFS2_I(inode)->ip_clusters) {
			ret = jbd2_journal_force_commit(osb->journal->j_journal);
			if (ret < 0)
				written = ret;
		}
	}

	/*
	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
	 * function pointer which is called when o_direct io completes so that
	 * it can unlock our rw lock. (it's the clustered equivalent of
	 * i_alloc_sem; protects truncate from racing with pending ios).
	 * Unfortunately there are error cases which call end_io and others
	 * that don't. so we don't have to unlock the rw_lock if either an
	 * async dio is going to do it in the future or an end_io after an
	 * error has already done it.
	 */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

out:
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);

out_sems:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);

	mutex_unlock(&inode->i_mutex);

	mlog_exit(ret);
	/* prefer reporting a partial write over a late error */
	return written ? written : ret;
}
1914 1914
/*
 * ->splice_write for ocfs2 files.
 *
 * Lock ordering is the fix for an ABBA deadlock that the old
 * inode_double_lock() scheme allowed: the OUTER lock is always the
 * target inode's i_mutex (annotated I_MUTEX_PARENT), and the INNER
 * lock -- which the generic splice code may drop and re-take while
 * waiting for pipe data -- is the pipe inode's i_mutex
 * (I_MUTEX_CHILD).  Target inodes and pipe inodes are disjoint sets,
 * so this ordering is always safe.
 *
 * Returns bytes spliced or a negative errno.
 */
static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
				       struct file *out,
				       loff_t *ppos,
				       size_t len,
				       unsigned int flags)
{
	int ret;
	struct inode *inode = out->f_path.dentry->d_inode;

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
		   (unsigned int)len,
		   out->f_path.dentry->d_name.len,
		   out->f_path.dentry->d_name.name);

	/* outer lock: the splice target */
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);

	ret = ocfs2_rw_lock(inode, 1);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
					    NULL);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* inner lock: the pipe; only held around the actual transfer */
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

out_unlock:
	ocfs2_rw_unlock(inode, 1);
out:
	mutex_unlock(&inode->i_mutex);

	mlog_exit(ret);
	return ret;
}
1954 1958
1955 static ssize_t ocfs2_file_splice_read(struct file *in, 1959 static ssize_t ocfs2_file_splice_read(struct file *in,
1956 loff_t *ppos, 1960 loff_t *ppos,
1957 struct pipe_inode_info *pipe, 1961 struct pipe_inode_info *pipe,
1958 size_t len, 1962 size_t len,
1959 unsigned int flags) 1963 unsigned int flags)
1960 { 1964 {
1961 int ret = 0; 1965 int ret = 0;
1962 struct inode *inode = in->f_path.dentry->d_inode; 1966 struct inode *inode = in->f_path.dentry->d_inode;
1963 1967
1964 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 1968 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
1965 (unsigned int)len, 1969 (unsigned int)len,
1966 in->f_path.dentry->d_name.len, 1970 in->f_path.dentry->d_name.len,
1967 in->f_path.dentry->d_name.name); 1971 in->f_path.dentry->d_name.name);
1968 1972
1969 /* 1973 /*
1970 * See the comment in ocfs2_file_aio_read() 1974 * See the comment in ocfs2_file_aio_read()
1971 */ 1975 */
1972 ret = ocfs2_inode_lock(inode, NULL, 0); 1976 ret = ocfs2_inode_lock(inode, NULL, 0);
1973 if (ret < 0) { 1977 if (ret < 0) {
1974 mlog_errno(ret); 1978 mlog_errno(ret);
1975 goto bail; 1979 goto bail;
1976 } 1980 }
1977 ocfs2_inode_unlock(inode, 0); 1981 ocfs2_inode_unlock(inode, 0);
1978 1982
1979 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 1983 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
1980 1984
1981 bail: 1985 bail:
1982 mlog_exit(ret); 1986 mlog_exit(ret);
1983 return ret; 1987 return ret;
1984 } 1988 }
1985 1989
/*
 * Read entry point (->aio_read) for ocfs2 regular files.
 *
 * Buffered reads are protected by ->readpage(); only O_DIRECT reads
 * take i_alloc_sem and the cluster rw lock here, to keep pending I/O
 * from racing with truncate.  The cluster inode lock is cycled once so
 * that cached fields like i_size are current before the generic read
 * path checks them.
 *
 * Returns bytes read or a negative errno.
 */
static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
				   const struct iovec *iov,
				   unsigned long nr_segs,
				   loff_t pos)
{
	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;

	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
		   (unsigned int)nr_segs,
		   filp->f_path.dentry->d_name.len,
		   filp->f_path.dentry->d_name.name);

	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto bail;
	}

	/*
	 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
	 * need locks to protect pending reads from racing with truncate.
	 */
	if (filp->f_flags & O_DIRECT) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;

		ret = ocfs2_rw_lock(inode, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto bail;
		}
		rw_level = 0;
		/* communicate with ocfs2_dio_end_io */
		ocfs2_iocb_set_rw_locked(iocb, rw_level);
	}

	/*
	 * We're fine letting folks race truncates and extending
	 * writes with read across the cluster, just like they can
	 * locally. Hence no rw_lock during read.
	 *
	 * Take and drop the meta data lock to update inode fields
	 * like i_size. This allows the checks down below
	 * generic_file_aio_read() a chance of actually working.
	 */
	ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
	if (ret < 0) {
		mlog_errno(ret);
		goto bail;
	}
	ocfs2_inode_unlock(inode, lock_level);

	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
	if (ret == -EINVAL)
		mlog(0, "generic_file_aio_read returned -EINVAL\n");

	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

	/* see ocfs2_file_aio_write */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

bail:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);
	mlog_exit(ret);

	return ret;
}
2062 2066
/* Inode operations for ocfs2 regular files. */
const struct inode_operations ocfs2_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ocfs2_listxattr,
	.removexattr	= generic_removexattr,
	.fallocate	= ocfs2_fallocate,
	.fiemap		= ocfs2_fiemap,
};
2074 2078
/*
 * Inode operations for special files (devices, fifos, sockets): no
 * xattr, fallocate or fiemap support.
 */
const struct inode_operations ocfs2_special_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
};
2080 2084
2081 /* 2085 /*
2082 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with 2086 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2083 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2087 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2084 */ 2088 */
/* File operations for regular files when the stack supports plocks. */
const struct file_operations ocfs2_fops = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
	.aio_read	= ocfs2_file_aio_read,
	.aio_write	= ocfs2_file_aio_write,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.lock		= ocfs2_lock,
	.flock		= ocfs2_flock,
	.splice_read	= ocfs2_file_splice_read,
	.splice_write	= ocfs2_file_splice_write,
};
2104 2108
/* Directory file operations when the stack supports plocks. */
const struct file_operations ocfs2_dops = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= ocfs2_readdir,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_dir_release,
	.open		= ocfs2_dir_open,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.lock		= ocfs2_lock,
	.flock		= ocfs2_flock,
};
2119 2123
2120 /* 2124 /*
2121 * POSIX-lockless variants of our file_operations. 2125 * POSIX-lockless variants of our file_operations.
2122 * 2126 *
2123 * These will be used if the underlying cluster stack does not support 2127 * These will be used if the underlying cluster stack does not support
2124 * posix file locking, if the user passes the "localflocks" mount 2128 * posix file locking, if the user passes the "localflocks" mount
2125 * option, or if we have a local-only fs. 2129 * option, or if we have a local-only fs.
2126 * 2130 *
2127 * ocfs2_flock is in here because all stacks handle UNIX file locks, 2131 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2128 * so we still want it in the case of no stack support for 2132 * so we still want it in the case of no stack support for
2129 * plocks. Internally, it will do the right thing when asked to ignore 2133 * plocks. Internally, it will do the right thing when asked to ignore
2130 * the cluster. 2134 * the cluster.
2131 */ 2135 */
/* Same as ocfs2_fops, minus ->lock: POSIX-lockless regular files. */
const struct file_operations ocfs2_fops_no_plocks = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
	.aio_read	= ocfs2_file_aio_read,
	.aio_write	= ocfs2_file_aio_write,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.flock		= ocfs2_flock,
	.splice_read	= ocfs2_file_splice_read,
	.splice_write	= ocfs2_file_splice_write,
};
2150 2154
/* Same as ocfs2_dops, minus ->lock: POSIX-lockless directories. */
const struct file_operations ocfs2_dops_no_plocks = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= ocfs2_readdir,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_dir_release,
	.open		= ocfs2_dir_open,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.flock		= ocfs2_flock,
};
2164 2168
1 /* 1 /*
2 * "splice": joining two ropes together by interweaving their strands. 2 * "splice": joining two ropes together by interweaving their strands.
3 * 3 *
4 * This is the "extended pipe" functionality, where a pipe is used as 4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other. 6 * buffer that you can use to transfer data from one end to the other.
7 * 7 *
8 * The traditional unix read/write is extended with a "splice()" operation 8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer. 9 * that transfers data buffers to or from a pipe buffer.
10 * 10 *
11 * Named by Larry McVoy, original implementation from Linus, extended by 11 * Named by Larry McVoy, original implementation from Linus, extended by
12 * Jens to support splicing to files, network, direct splicing, etc and 12 * Jens to support splicing to files, network, direct splicing, etc and
13 * fixing lots of bugs. 13 * fixing lots of bugs.
14 * 14 *
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18 * 18 *
19 */ 19 */
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/file.h> 21 #include <linux/file.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/splice.h> 23 #include <linux/splice.h>
24 #include <linux/memcontrol.h> 24 #include <linux/memcontrol.h>
25 #include <linux/mm_inline.h> 25 #include <linux/mm_inline.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/buffer_head.h> 28 #include <linux/buffer_head.h>
29 #include <linux/module.h> 29 #include <linux/module.h>
30 #include <linux/syscalls.h> 30 #include <linux/syscalls.h>
31 #include <linux/uio.h> 31 #include <linux/uio.h>
32 #include <linux/security.h> 32 #include <linux/security.h>
33 33
/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 *
 * Returns 0 on success (page removed from its mapping, locked, LRU
 * flag set on the buffer) or 1 on failure (page unlocked, unchanged).
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	/* NULL mapping means the page raced with truncate/reclaim */
	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache.  Otherwise truncate wont wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		/* can't steal while the fs still owns private state */
		if (page_has_private(page) &&
		    !try_to_release_page(page, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	unlock_page(page);
	return 1;
}
84 84
85 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, 85 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
86 struct pipe_buffer *buf) 86 struct pipe_buffer *buf)
87 { 87 {
88 page_cache_release(buf->page); 88 page_cache_release(buf->page);
89 buf->flags &= ~PIPE_BUF_FLAG_LRU; 89 buf->flags &= ~PIPE_BUF_FLAG_LRU;
90 } 90 }
91 91
92 /* 92 /*
93 * Check whether the contents of buf is OK to access. Since the content 93 * Check whether the contents of buf is OK to access. Since the content
94 * is a page cache page, IO may be in flight. 94 * is a page cache page, IO may be in flight.
95 */ 95 */
96 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, 96 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
97 struct pipe_buffer *buf) 97 struct pipe_buffer *buf)
98 { 98 {
99 struct page *page = buf->page; 99 struct page *page = buf->page;
100 int err; 100 int err;
101 101
102 if (!PageUptodate(page)) { 102 if (!PageUptodate(page)) {
103 lock_page(page); 103 lock_page(page);
104 104
105 /* 105 /*
106 * Page got truncated/unhashed. This will cause a 0-byte 106 * Page got truncated/unhashed. This will cause a 0-byte
107 * splice, if this is the first page. 107 * splice, if this is the first page.
108 */ 108 */
109 if (!page->mapping) { 109 if (!page->mapping) {
110 err = -ENODATA; 110 err = -ENODATA;
111 goto error; 111 goto error;
112 } 112 }
113 113
114 /* 114 /*
115 * Uh oh, read-error from disk. 115 * Uh oh, read-error from disk.
116 */ 116 */
117 if (!PageUptodate(page)) { 117 if (!PageUptodate(page)) {
118 err = -EIO; 118 err = -EIO;
119 goto error; 119 goto error;
120 } 120 }
121 121
122 /* 122 /*
123 * Page is ok afterall, we are done. 123 * Page is ok afterall, we are done.
124 */ 124 */
125 unlock_page(page); 125 unlock_page(page);
126 } 126 }
127 127
128 return 0; 128 return 0;
129 error: 129 error:
130 unlock_page(page); 130 unlock_page(page);
131 return err; 131 return err;
132 } 132 }
133 133
/* Buffer ops for pipe buffers backed by page-cache pages. */
static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
143 143
144 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, 144 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
145 struct pipe_buffer *buf) 145 struct pipe_buffer *buf)
146 { 146 {
147 if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) 147 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
148 return 1; 148 return 1;
149 149
150 buf->flags |= PIPE_BUF_FLAG_LRU; 150 buf->flags |= PIPE_BUF_FLAG_LRU;
151 return generic_pipe_buf_steal(pipe, buf); 151 return generic_pipe_buf_steal(pipe, buf);
152 } 152 }
153 153
/* Buffer ops for pipe buffers holding user pages (vmsplice). */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
163 163
/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 *    Returns the number of bytes linked into the pipe, or a negative
 *    errno (-EPIPE, -EAGAIN, -ERESTARTSYS) if nothing was transferred.
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	/* Remember the original page count so unused pages can be released. */
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	/*
	 * Serialize against other users of the pipe.  pipe->inode can
	 * apparently be NULL here (checked throughout) — presumably for
	 * internal pipes with no backing inode; TODO confirm.
	 */
	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		/* No readers left: writing is pointless, raise SIGPIPE. */
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			/* Slot index wraps; PIPE_BUFFERS is a power of two. */
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			/* Link the next page of the map into the pipe. */
			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			/* All pages linked? Done. */
			if (!--spd->nr_pages)
				break;
			/* Room for more? Keep filling without sleeping. */
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		/* Pipe is full and we may not block. */
		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* Wake readers before sleeping so they can drain the pipe. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		/* Final wakeup for any data linked since the last one. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	/*
	 * Release any pages we did not consume (early break on full pipe,
	 * error, or signal) via the caller-supplied release hook.
	 */
	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}
264 264
/*
 * Default ->spd_release hook for __generic_file_splice_read: drop the
 * page cache reference taken when page @i was added to the map.
 */
static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
	page_cache_release(spd->pages[i]);
}
269 269
/*
 * Core of generic_file_splice_read(): gather up to PIPE_BUFFERS page
 * cache pages covering [*ppos, *ppos + len), start readahead/IO where
 * needed, then link the uptodate portions into @pipe.
 *
 * Returns the number of bytes spliced, or a negative errno if no page
 * could be set up at all.
 */
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	/* Pages needed to cover the request, rounded up; clamp to the pipe. */
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
	index += spd.nr_pages;

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * readahead/allocate the rest and fill in the holes.
	 */
	if (spd.nr_pages < nr_pages)
		page_cache_sync_readahead(mapping, &in->f_ra, in,
				index, req_pages - spd.nr_pages);

	error = 0;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						mapping_gfp_mask(mapping));
			if (unlikely(error)) {
				page_cache_release(page);
				/* Someone else added it meanwhile: retry. */
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		if (PageReadahead(page))
			page_cache_async_readahead(mapping, &in->f_ra, in,
					page, index, req_pages - page_nr);

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then dont block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (!trylock_page(page)) {
					error = -EAGAIN;
					break;
				}
			} else
				lock_page(page);

			/*
			 * Page was truncated, or invalidated by the
			 * filesystem.  Redo the find/create, but this time the
			 * page is kept locked, so there's no chance of another
			 * race with truncate/invalidate.
			 */
			if (!page->mapping) {
				unlock_page(page);
				page = find_or_create_page(mapping, index,
						mapping_gfp_mask(mapping));

				if (!page) {
					error = -ENOMEM;
					break;
				}
				/* Swap the new page into our local map. */
				page_cache_release(pages[page_nr]);
				pages[page_nr] = page;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * lets just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;	/* only the first page has a nonzero offset */
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);
	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;

	/* Any data at all takes precedence over a pending error. */
	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}
477 477
478 /** 478 /**
479 * generic_file_splice_read - splice data from file to a pipe 479 * generic_file_splice_read - splice data from file to a pipe
480 * @in: file to splice from 480 * @in: file to splice from
481 * @ppos: position in @in 481 * @ppos: position in @in
482 * @pipe: pipe to splice to 482 * @pipe: pipe to splice to
483 * @len: number of bytes to splice 483 * @len: number of bytes to splice
484 * @flags: splice modifier flags 484 * @flags: splice modifier flags
485 * 485 *
486 * Description: 486 * Description:
487 * Will read pages from given file and fill them into a pipe. Can be 487 * Will read pages from given file and fill them into a pipe. Can be
488 * used as long as the address_space operations for the source implements 488 * used as long as the address_space operations for the source implements
489 * a readpage() hook. 489 * a readpage() hook.
490 * 490 *
491 */ 491 */
492 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 492 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
493 struct pipe_inode_info *pipe, size_t len, 493 struct pipe_inode_info *pipe, size_t len,
494 unsigned int flags) 494 unsigned int flags)
495 { 495 {
496 loff_t isize, left; 496 loff_t isize, left;
497 int ret; 497 int ret;
498 498
499 isize = i_size_read(in->f_mapping->host); 499 isize = i_size_read(in->f_mapping->host);
500 if (unlikely(*ppos >= isize)) 500 if (unlikely(*ppos >= isize))
501 return 0; 501 return 0;
502 502
503 left = isize - *ppos; 503 left = isize - *ppos;
504 if (unlikely(left < len)) 504 if (unlikely(left < len))
505 len = left; 505 len = left;
506 506
507 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 507 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
508 if (ret > 0) 508 if (ret > 0)
509 *ppos += ret; 509 *ppos += ret;
510 510
511 return ret; 511 return ret;
512 } 512 }
513 513
514 EXPORT_SYMBOL(generic_file_splice_read); 514 EXPORT_SYMBOL(generic_file_splice_read);
515 515
516 /* 516 /*
517 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 517 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
518 * using sendpage(). Return the number of bytes sent. 518 * using sendpage(). Return the number of bytes sent.
519 */ 519 */
520 static int pipe_to_sendpage(struct pipe_inode_info *pipe, 520 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
521 struct pipe_buffer *buf, struct splice_desc *sd) 521 struct pipe_buffer *buf, struct splice_desc *sd)
522 { 522 {
523 struct file *file = sd->u.file; 523 struct file *file = sd->u.file;
524 loff_t pos = sd->pos; 524 loff_t pos = sd->pos;
525 int ret, more; 525 int ret, more;
526 526
527 ret = buf->ops->confirm(pipe, buf); 527 ret = buf->ops->confirm(pipe, buf);
528 if (!ret) { 528 if (!ret) {
529 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 529 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
530 530
531 ret = file->f_op->sendpage(file, buf->page, buf->offset, 531 ret = file->f_op->sendpage(file, buf->page, buf->offset,
532 sd->len, &pos, more); 532 sd->len, &pos, more);
533 } 533 }
534 534
535 return ret; 535 return ret;
536 } 536 }
537 537
/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option that
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	void *fsdata;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/* Byte offset of the write within its destination page. */
	offset = sd->pos & ~PAGE_CACHE_MASK;

	/* Never cross a page boundary in one go. */
	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (unlikely(ret))
		goto out;

	/*
	 * If write_begin handed back a different page than the pipe buffer's,
	 * the data must be copied in.  If it's the same page (steal
	 * succeeded), no copy is needed.
	 */
	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}
	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
				page, fsdata);
out:
	return ret;
}
603 603
/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 *    The caller is responsible for any locking needed (see
 *    splice_from_pipe() for the locked variant).  Returns the number of
 *    bytes consumed, or a negative errno if nothing was transferred.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret, do_wakeup, err;

	ret = 0;
	do_wakeup = 0;

	for (;;) {
		if (pipe->nrbufs) {
			/* Oldest buffer in the ring. */
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd->len = buf->len;
			if (sd->len > sd->total_len)
				sd->len = sd->total_len;

			err = actor(pipe, buf, sd);
			if (err <= 0) {
				/* -ENODATA means "stop quietly", keep ret. */
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			/* Account for a partial or full consume of this buffer. */
			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd->len -= err;
			sd->pos += err;
			sd->total_len -= err;
			/* Actor took less than offered: retry the same buffer. */
			if (sd->len)
				continue;

			/* Buffer fully drained: release it and advance the ring. */
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd->total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		/* Pipe is empty and has no writers: EOF. */
		if (!pipe->writers)
			break;
		/* No writer about to refill the pipe: return what we have. */
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (sd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* Wake writers before sleeping so they can refill the pipe. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	/* Final wakeup for any buffers freed since the last one. */
	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);
707 707
/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the input and output inodes,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	/*
	 * The actor worker might be calling ->write_begin and
	 * ->write_end. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquiry here.
	 *
	 * Outer lock must be inode->i_mutex, as pipe_wait() will
	 * release and reacquire pipe->inode->i_mutex, AND inode must
	 * never be a pipe.
	 */
	/* The fix above relies on @out not being a pipe: assert it. */
	WARN_ON(S_ISFIFO(inode->i_mode));
	/* Lockdep subclasses keep the two i_mutex acquisitions distinct. */
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
	ret = __splice_from_pipe(pipe, &sd, actor);
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
	mutex_unlock(&inode->i_mutex);

	return ret;
}
747 756
/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file. The caller is responsible
 *    for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;
	int err;

	/* Writing by an unprivileged user clears setuid/setgid bits. */
	err = file_remove_suid(out);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			/* Report the sync failure instead of the byte count. */
			if (err)
				ret = err;
		}
		/* Throttle the writer if we dirtied too many pages. */
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);
806 815
/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;

	/*
	 * Explicit lock ordering replaces the old pointer-ordered
	 * inode_double_lock(): the target inode is always the outer lock
	 * and the pipe inode the inner one.  This is safe because pipes
	 * and splice targets form disjoint sets (asserted below), and it
	 * avoids an ABBA deadlock: __splice_from_pipe() may drop and
	 * re-take the pipe mutex in pipe_wait(), and with pointer
	 * ordering another task could then grab the pipe mutex while
	 * holding our target's i_mutex.
	 */
	WARN_ON(S_ISFIFO(inode->i_mode));
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
	ret = file_remove_suid(out);
	if (likely(!ret)) {
		/*
		 * Inner lock: only taken once suid removal succeeded, and
		 * only if the pipe has a backing inode.
		 */
		if (pipe->inode)
			mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
		ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
		if (pipe->inode)
			mutex_unlock(&pipe->inode->i_mutex);
	}
	mutex_unlock(&inode->i_mutex);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			int err;

			/* Re-take i_mutex only for the duration of the sync. */
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		/* Throttle the writer if we dirtied too many pages. */
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);
867 882
/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	/* Thin wrapper: per-buffer work is done by pipe_to_sendpage(). */
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
888 903
889 /* 904 /*
890 * Attempt to initiate a splice from pipe to file. 905 * Attempt to initiate a splice from pipe to file.
891 */ 906 */
892 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 907 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
893 loff_t *ppos, size_t len, unsigned int flags) 908 loff_t *ppos, size_t len, unsigned int flags)
894 { 909 {
895 int ret; 910 int ret;
896 911
897 if (unlikely(!out->f_op || !out->f_op->splice_write)) 912 if (unlikely(!out->f_op || !out->f_op->splice_write))
898 return -EINVAL; 913 return -EINVAL;
899 914
900 if (unlikely(!(out->f_mode & FMODE_WRITE))) 915 if (unlikely(!(out->f_mode & FMODE_WRITE)))
901 return -EBADF; 916 return -EBADF;
902 917
903 if (unlikely(out->f_flags & O_APPEND)) 918 if (unlikely(out->f_flags & O_APPEND))
904 return -EINVAL; 919 return -EINVAL;
905 920
906 ret = rw_verify_area(WRITE, out, ppos, len); 921 ret = rw_verify_area(WRITE, out, ppos, len);
907 if (unlikely(ret < 0)) 922 if (unlikely(ret < 0))
908 return ret; 923 return ret;
909 924
910 return out->f_op->splice_write(pipe, out, ppos, len, flags); 925 return out->f_op->splice_write(pipe, out, ppos, len, flags);
911 } 926 }
912 927
913 /* 928 /*
914 * Attempt to initiate a splice from a file to a pipe. 929 * Attempt to initiate a splice from a file to a pipe.
915 */ 930 */
916 static long do_splice_to(struct file *in, loff_t *ppos, 931 static long do_splice_to(struct file *in, loff_t *ppos,
917 struct pipe_inode_info *pipe, size_t len, 932 struct pipe_inode_info *pipe, size_t len,
918 unsigned int flags) 933 unsigned int flags)
919 { 934 {
920 int ret; 935 int ret;
921 936
922 if (unlikely(!in->f_op || !in->f_op->splice_read)) 937 if (unlikely(!in->f_op || !in->f_op->splice_read))
923 return -EINVAL; 938 return -EINVAL;
924 939
925 if (unlikely(!(in->f_mode & FMODE_READ))) 940 if (unlikely(!(in->f_mode & FMODE_READ)))
926 return -EBADF; 941 return -EBADF;
927 942
928 ret = rw_verify_area(READ, in, ppos, len); 943 ret = rw_verify_area(READ, in, ppos, len);
929 if (unlikely(ret < 0)) 944 if (unlikely(ret < 0))
930 return ret; 945 return ret;
931 946
932 return in->f_op->splice_read(in, ppos, pipe, len, flags); 947 return in->f_op->splice_read(in, ppos, pipe, len, flags);
933 } 948 }
934 949
/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		/* Cache the pipe on the task for reuse on later calls. */
		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		/* prev_pos lets us rewind sd->pos on a short/failed write. */
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		/* The actor should consume exactly what was just read. */
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Partial write: reposition after what was consumed. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the internal pipe so it is empty for the next caller. */
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}

	/* If nothing was transferred, propagate the error code instead. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
1056 1071
1057 static int direct_splice_actor(struct pipe_inode_info *pipe, 1072 static int direct_splice_actor(struct pipe_inode_info *pipe,
1058 struct splice_desc *sd) 1073 struct splice_desc *sd)
1059 { 1074 {
1060 struct file *file = sd->u.file; 1075 struct file *file = sd->u.file;
1061 1076
1062 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); 1077 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1063 } 1078 }
1064 1079
1065 /** 1080 /**
1066 * do_splice_direct - splices data directly between two files 1081 * do_splice_direct - splices data directly between two files
1067 * @in: file to splice from 1082 * @in: file to splice from
1068 * @ppos: input file offset 1083 * @ppos: input file offset
1069 * @out: file to splice to 1084 * @out: file to splice to
1070 * @len: number of bytes to splice 1085 * @len: number of bytes to splice
1071 * @flags: splice modifier flags 1086 * @flags: splice modifier flags
1072 * 1087 *
1073 * Description: 1088 * Description:
1074 * For use by do_sendfile(). splice can easily emulate sendfile, but 1089 * For use by do_sendfile(). splice can easily emulate sendfile, but
1075 * doing it in the application would incur an extra system call 1090 * doing it in the application would incur an extra system call
1076 * (splice in + splice out, as compared to just sendfile()). So this helper 1091 * (splice in + splice out, as compared to just sendfile()). So this helper
1077 * can splice directly through a process-private pipe. 1092 * can splice directly through a process-private pipe.
1078 * 1093 *
1079 */ 1094 */
1080 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1095 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1081 size_t len, unsigned int flags) 1096 size_t len, unsigned int flags)
1082 { 1097 {
1083 struct splice_desc sd = { 1098 struct splice_desc sd = {
1084 .len = len, 1099 .len = len,
1085 .total_len = len, 1100 .total_len = len,
1086 .flags = flags, 1101 .flags = flags,
1087 .pos = *ppos, 1102 .pos = *ppos,
1088 .u.file = out, 1103 .u.file = out,
1089 }; 1104 };
1090 long ret; 1105 long ret;
1091 1106
1092 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1107 ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1093 if (ret > 0) 1108 if (ret > 0)
1094 *ppos = sd.pos; 1109 *ppos = sd.pos;
1095 1110
1096 return ret; 1111 return ret;
1097 } 1112 }
1098 1113
1099 /* 1114 /*
1100 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same 1115 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1101 * location, so checking ->i_pipe is not enough to verify that this is a 1116 * location, so checking ->i_pipe is not enough to verify that this is a
1102 * pipe. 1117 * pipe.
1103 */ 1118 */
1104 static inline struct pipe_inode_info *pipe_info(struct inode *inode) 1119 static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1105 { 1120 {
1106 if (S_ISFIFO(inode->i_mode)) 1121 if (S_ISFIFO(inode->i_mode))
1107 return inode->i_pipe; 1122 return inode->i_pipe;
1108 1123
1109 return NULL; 1124 return NULL;
1110 } 1125 }
1111 1126
/*
 * Determine where to splice to/from: exactly one of the two files must
 * be a pipe.  Handles the optional user-supplied offsets and copies the
 * updated offset back out.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	/* Case 1: splicing from a pipe into a regular target. */
	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		/* A pipe has no file position; an input offset is invalid. */
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			/* An explicit offset requires a seekable output. */
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		/* Report the updated offset back to user space. */
		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* Case 2: splicing from a regular source into a pipe. */
	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* Neither end is a pipe: not a valid splice operation. */
	return -EINVAL;
}
1167 1182
/*
 * Map an iov into an array of pages and offset/length tupples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our ones pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 *
 * Returns the number of partial-page entries filled (> 0), or a negative
 * errno if nothing could be mapped.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	while (nr_vecs) {
		unsigned long off, npages;
		struct iovec entry;
		void __user *base;
		size_t len;
		int i;

		error = -EFAULT;
		if (copy_from_user(&entry, iov, sizeof(entry)))
			break;

		base = entry.iov_base;
		len = entry.iov_len;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		error = 0;
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (!access_ok(VERIFY_READ, base, len))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		/* Clamp to the room left in the pages[] array. */
		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		/* On success, 'error' holds the number of pages pinned. */
		error = get_user_pages_fast((unsigned long)base, npages,
					0, &pages[buffers]);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			/* Only the first page of a range has an offset. */
			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	/* Partial success wins over a trailing error. */
	if (buffers)
		return buffers;

	return error;
}
1268 1283
/*
 * Per-buffer actor for vmsplice_to_user(): copy one pipe buffer's worth
 * of data to the user address in sd->u.userptr, trying an atomic copy
 * first and falling back to the sleeping copy path.  Returns the number
 * of bytes copied or a negative errno.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	/* Make sure the buffer's data is actually available (stable). */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
							sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			/* Atomic copy succeeded in full. */
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	buf->ops->unmap(pipe, buf, src);
out:
	/* Advance the destination pointer past what was copied. */
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}
1309 1324
/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	/* The source file must actually be a pipe. */
	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	/* Hold the pipe mutex across the whole iov walk. */
	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
			error = -EFAULT;
			break;
		}

		/* Set up the copy destination for pipe_to_user(). */
		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			/* Report the error only if nothing was copied yet. */
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		/* Short copy: the pipe ran dry, stop here. */
		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	/* No bytes transferred: return the first error (or 0). */
	if (!ret)
		ret = error;

	return ret;
}
1391 1406
/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	/* The destination file must actually be a pipe. */
	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	/*
	 * Pin the user pages; SPLICE_F_GIFT demands page-aligned,
	 * page-sized segments (the 'aligned' argument).
	 */
	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}
1422 1437
1423 /* 1438 /*
1424 * Note that vmsplice only really supports true splicing _from_ user memory 1439 * Note that vmsplice only really supports true splicing _from_ user memory
1425 * to a pipe, not the other way around. Splicing from user memory is a simple 1440 * to a pipe, not the other way around. Splicing from user memory is a simple
1426 * operation that can be supported without any funky alignment restrictions 1441 * operation that can be supported without any funky alignment restrictions
1427 * or nasty vm tricks. We simply map in the user memory and fill them into 1442 * or nasty vm tricks. We simply map in the user memory and fill them into
1428 * a pipe. The reverse isn't quite as easy, though. There are two possible 1443 * a pipe. The reverse isn't quite as easy, though. There are two possible
1429 * solutions for that: 1444 * solutions for that:
1430 * 1445 *
1431 * - memcpy() the data internally, at which point we might as well just 1446 * - memcpy() the data internally, at which point we might as well just
1432 * do a regular read() on the buffer anyway. 1447 * do a regular read() on the buffer anyway.
1433 * - Lots of nasty vm tricks, that are neither fast nor flexible (it 1448 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1434 * has restriction limitations on both ends of the pipe). 1449 * has restriction limitations on both ends of the pipe).
1435 * 1450 *
1436 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 1451 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1437 * 1452 *
1438 */ 1453 */
1439 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, 1454 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1440 unsigned long, nr_segs, unsigned int, flags) 1455 unsigned long, nr_segs, unsigned int, flags)
1441 { 1456 {
1442 struct file *file; 1457 struct file *file;
1443 long error; 1458 long error;
1444 int fput; 1459 int fput;
1445 1460
1446 if (unlikely(nr_segs > UIO_MAXIOV)) 1461 if (unlikely(nr_segs > UIO_MAXIOV))
1447 return -EINVAL; 1462 return -EINVAL;
1448 else if (unlikely(!nr_segs)) 1463 else if (unlikely(!nr_segs))
1449 return 0; 1464 return 0;
1450 1465
1451 error = -EBADF; 1466 error = -EBADF;
1452 file = fget_light(fd, &fput); 1467 file = fget_light(fd, &fput);
1453 if (file) { 1468 if (file) {
1454 if (file->f_mode & FMODE_WRITE) 1469 if (file->f_mode & FMODE_WRITE)
1455 error = vmsplice_to_pipe(file, iov, nr_segs, flags); 1470 error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1456 else if (file->f_mode & FMODE_READ) 1471 else if (file->f_mode & FMODE_READ)
1457 error = vmsplice_to_user(file, iov, nr_segs, flags); 1472 error = vmsplice_to_user(file, iov, nr_segs, flags);
1458 1473
1459 fput_light(file, fput); 1474 fput_light(file, fput);
1460 } 1475 }
1461 1476
1462 return error; 1477 return error;
1463 } 1478 }
1464 1479
1465 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, 1480 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1466 int, fd_out, loff_t __user *, off_out, 1481 int, fd_out, loff_t __user *, off_out,
1467 size_t, len, unsigned int, flags) 1482 size_t, len, unsigned int, flags)
1468 { 1483 {
1469 long error; 1484 long error;
1470 struct file *in, *out; 1485 struct file *in, *out;
1471 int fput_in, fput_out; 1486 int fput_in, fput_out;
1472 1487
1473 if (unlikely(!len)) 1488 if (unlikely(!len))
1474 return 0; 1489 return 0;
1475 1490
1476 error = -EBADF; 1491 error = -EBADF;
1477 in = fget_light(fd_in, &fput_in); 1492 in = fget_light(fd_in, &fput_in);
1478 if (in) { 1493 if (in) {
1479 if (in->f_mode & FMODE_READ) { 1494 if (in->f_mode & FMODE_READ) {
1480 out = fget_light(fd_out, &fput_out); 1495 out = fget_light(fd_out, &fput_out);
1481 if (out) { 1496 if (out) {
1482 if (out->f_mode & FMODE_WRITE) 1497 if (out->f_mode & FMODE_WRITE)
1483 error = do_splice(in, off_in, 1498 error = do_splice(in, off_in,
1484 out, off_out, 1499 out, off_out,
1485 len, flags); 1500 len, flags);
1486 fput_light(out, fput_out); 1501 fput_light(out, fput_out);
1487 } 1502 }
1488 } 1503 }
1489 1504
1490 fput_light(in, fput_in); 1505 fput_light(in, fput_in);
1491 } 1506 }
1492 1507
1493 return error; 1508 return error;
1494 } 1509 }
1495 1510
1496 /* 1511 /*
1497 * Make sure there's data to read. Wait for input if we can, otherwise 1512 * Make sure there's data to read. Wait for input if we can, otherwise
1498 * return an appropriate error. 1513 * return an appropriate error.
1499 */ 1514 */
1500 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1515 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1501 { 1516 {
1502 int ret; 1517 int ret;
1503 1518
1504 /* 1519 /*
1505 * Check ->nrbufs without the inode lock first. This function 1520 * Check ->nrbufs without the inode lock first. This function
1506 * is speculative anyways, so missing one is ok. 1521 * is speculative anyways, so missing one is ok.
1507 */ 1522 */
1508 if (pipe->nrbufs) 1523 if (pipe->nrbufs)
1509 return 0; 1524 return 0;
1510 1525
1511 ret = 0; 1526 ret = 0;
1512 mutex_lock(&pipe->inode->i_mutex); 1527 mutex_lock(&pipe->inode->i_mutex);
1513 1528
1514 while (!pipe->nrbufs) { 1529 while (!pipe->nrbufs) {
1515 if (signal_pending(current)) { 1530 if (signal_pending(current)) {
1516 ret = -ERESTARTSYS; 1531 ret = -ERESTARTSYS;
1517 break; 1532 break;
1518 } 1533 }
1519 if (!pipe->writers) 1534 if (!pipe->writers)
1520 break; 1535 break;
1521 if (!pipe->waiting_writers) { 1536 if (!pipe->waiting_writers) {
1522 if (flags & SPLICE_F_NONBLOCK) { 1537 if (flags & SPLICE_F_NONBLOCK) {
1523 ret = -EAGAIN; 1538 ret = -EAGAIN;
1524 break; 1539 break;
1525 } 1540 }
1526 } 1541 }
1527 pipe_wait(pipe); 1542 pipe_wait(pipe);
1528 } 1543 }
1529 1544
1530 mutex_unlock(&pipe->inode->i_mutex); 1545 mutex_unlock(&pipe->inode->i_mutex);
1531 return ret; 1546 return ret;
1532 } 1547 }
1533 1548
1534 /* 1549 /*
1535 * Make sure there's writeable room. Wait for room if we can, otherwise 1550 * Make sure there's writeable room. Wait for room if we can, otherwise
1536 * return an appropriate error. 1551 * return an appropriate error.
1537 */ 1552 */
1538 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1553 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1539 { 1554 {
1540 int ret; 1555 int ret;
1541 1556
1542 /* 1557 /*
1543 * Check ->nrbufs without the inode lock first. This function 1558 * Check ->nrbufs without the inode lock first. This function
1544 * is speculative anyways, so missing one is ok. 1559 * is speculative anyways, so missing one is ok.
1545 */ 1560 */
1546 if (pipe->nrbufs < PIPE_BUFFERS) 1561 if (pipe->nrbufs < PIPE_BUFFERS)
1547 return 0; 1562 return 0;
1548 1563
1549 ret = 0; 1564 ret = 0;
1550 mutex_lock(&pipe->inode->i_mutex); 1565 mutex_lock(&pipe->inode->i_mutex);
1551 1566
1552 while (pipe->nrbufs >= PIPE_BUFFERS) { 1567 while (pipe->nrbufs >= PIPE_BUFFERS) {
1553 if (!pipe->readers) { 1568 if (!pipe->readers) {
1554 send_sig(SIGPIPE, current, 0); 1569 send_sig(SIGPIPE, current, 0);
1555 ret = -EPIPE; 1570 ret = -EPIPE;
1556 break; 1571 break;
1557 } 1572 }
1558 if (flags & SPLICE_F_NONBLOCK) { 1573 if (flags & SPLICE_F_NONBLOCK) {
1559 ret = -EAGAIN; 1574 ret = -EAGAIN;
1560 break; 1575 break;
1561 } 1576 }
1562 if (signal_pending(current)) { 1577 if (signal_pending(current)) {
1563 ret = -ERESTARTSYS; 1578 ret = -ERESTARTSYS;
1564 break; 1579 break;
1565 } 1580 }
1566 pipe->waiting_writers++; 1581 pipe->waiting_writers++;
1567 pipe_wait(pipe); 1582 pipe_wait(pipe);
1568 pipe->waiting_writers--; 1583 pipe->waiting_writers--;
1569 } 1584 }
1570 1585
1571 mutex_unlock(&pipe->inode->i_mutex); 1586 mutex_unlock(&pipe->inode->i_mutex);
1572 return ret; 1587 return ret;
1573 } 1588 }
1574 1589
1575 /* 1590 /*
1576 * Link contents of ipipe to opipe. 1591 * Link contents of ipipe to opipe.
1577 */ 1592 */
1578 static int link_pipe(struct pipe_inode_info *ipipe, 1593 static int link_pipe(struct pipe_inode_info *ipipe,
1579 struct pipe_inode_info *opipe, 1594 struct pipe_inode_info *opipe,
1580 size_t len, unsigned int flags) 1595 size_t len, unsigned int flags)
1581 { 1596 {
1582 struct pipe_buffer *ibuf, *obuf; 1597 struct pipe_buffer *ibuf, *obuf;
1583 int ret = 0, i = 0, nbuf; 1598 int ret = 0, i = 0, nbuf;
1584 1599
1585 /* 1600 /*
1586 * Potential ABBA deadlock, work around it by ordering lock 1601 * Potential ABBA deadlock, work around it by ordering lock
1587 * grabbing by inode address. Otherwise two different processes 1602 * grabbing by inode address. Otherwise two different processes
1588 * could deadlock (one doing tee from A -> B, the other from B -> A). 1603 * could deadlock (one doing tee from A -> B, the other from B -> A).
1589 */ 1604 */
1590 inode_double_lock(ipipe->inode, opipe->inode); 1605 inode_double_lock(ipipe->inode, opipe->inode);
1591 1606
1592 do { 1607 do {
1593 if (!opipe->readers) { 1608 if (!opipe->readers) {
1594 send_sig(SIGPIPE, current, 0); 1609 send_sig(SIGPIPE, current, 0);
1595 if (!ret) 1610 if (!ret)
1596 ret = -EPIPE; 1611 ret = -EPIPE;
1597 break; 1612 break;
1598 } 1613 }
1599 1614
1600 /* 1615 /*
1601 * If we have iterated all input buffers or ran out of 1616 * If we have iterated all input buffers or ran out of
1602 * output room, break. 1617 * output room, break.
1603 */ 1618 */
1604 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1619 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1605 break; 1620 break;
1606 1621
1607 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1622 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1608 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1623 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1609 1624
1610 /* 1625 /*
1611 * Get a reference to this pipe buffer, 1626 * Get a reference to this pipe buffer,
1612 * so we can copy the contents over. 1627 * so we can copy the contents over.
1613 */ 1628 */
1614 ibuf->ops->get(ipipe, ibuf); 1629 ibuf->ops->get(ipipe, ibuf);
1615 1630
1616 obuf = opipe->bufs + nbuf; 1631 obuf = opipe->bufs + nbuf;
1617 *obuf = *ibuf; 1632 *obuf = *ibuf;
1618 1633
1619 /* 1634 /*
1620 * Don't inherit the gift flag, we need to 1635 * Don't inherit the gift flag, we need to
1621 * prevent multiple steals of this page. 1636 * prevent multiple steals of this page.
1622 */ 1637 */
1623 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1638 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1624 1639
1625 if (obuf->len > len) 1640 if (obuf->len > len)
1626 obuf->len = len; 1641 obuf->len = len;
1627 1642
1628 opipe->nrbufs++; 1643 opipe->nrbufs++;
1629 ret += obuf->len; 1644 ret += obuf->len;
1630 len -= obuf->len; 1645 len -= obuf->len;
1631 i++; 1646 i++;
1632 } while (len); 1647 } while (len);
1633 1648
1634 /* 1649 /*
1635 * return EAGAIN if we have the potential of some data in the 1650 * return EAGAIN if we have the potential of some data in the
1636 * future, otherwise just return 0 1651 * future, otherwise just return 0
1637 */ 1652 */
1638 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1653 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1639 ret = -EAGAIN; 1654 ret = -EAGAIN;
1640 1655
1641 inode_double_unlock(ipipe->inode, opipe->inode); 1656 inode_double_unlock(ipipe->inode, opipe->inode);
1642 1657
1643 /* 1658 /*
1644 * If we put data in the output pipe, wakeup any potential readers. 1659 * If we put data in the output pipe, wakeup any potential readers.
1645 */ 1660 */
1646 if (ret > 0) { 1661 if (ret > 0) {
1647 smp_mb(); 1662 smp_mb();
1648 if (waitqueue_active(&opipe->wait)) 1663 if (waitqueue_active(&opipe->wait))
1649 wake_up_interruptible(&opipe->wait); 1664 wake_up_interruptible(&opipe->wait);
1650 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); 1665 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1651 } 1666 }
1652 1667
1653 return ret; 1668 return ret;
1654 } 1669 }
1655 1670
1656 /* 1671 /*
1657 * This is a tee(1) implementation that works on pipes. It doesn't copy 1672 * This is a tee(1) implementation that works on pipes. It doesn't copy
1658 * any data, it simply references the 'in' pages on the 'out' pipe. 1673 * any data, it simply references the 'in' pages on the 'out' pipe.
1659 * The 'flags' used are the SPLICE_F_* variants, currently the only 1674 * The 'flags' used are the SPLICE_F_* variants, currently the only
1660 * applicable one is SPLICE_F_NONBLOCK. 1675 * applicable one is SPLICE_F_NONBLOCK.
1661 */ 1676 */
1662 static long do_tee(struct file *in, struct file *out, size_t len, 1677 static long do_tee(struct file *in, struct file *out, size_t len,
1663 unsigned int flags) 1678 unsigned int flags)
1664 { 1679 {
1665 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 1680 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1666 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 1681 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1667 int ret = -EINVAL; 1682 int ret = -EINVAL;
1668 1683
1669 /* 1684 /*
1670 * Duplicate the contents of ipipe to opipe without actually 1685 * Duplicate the contents of ipipe to opipe without actually
1671 * copying the data. 1686 * copying the data.
1672 */ 1687 */
1673 if (ipipe && opipe && ipipe != opipe) { 1688 if (ipipe && opipe && ipipe != opipe) {
1674 /* 1689 /*
1675 * Keep going, unless we encounter an error. The ipipe/opipe 1690 * Keep going, unless we encounter an error. The ipipe/opipe
1676 * ordering doesn't really matter. 1691 * ordering doesn't really matter.
1677 */ 1692 */
1678 ret = link_ipipe_prep(ipipe, flags); 1693 ret = link_ipipe_prep(ipipe, flags);
1679 if (!ret) { 1694 if (!ret) {
1680 ret = link_opipe_prep(opipe, flags); 1695 ret = link_opipe_prep(opipe, flags);
1681 if (!ret) 1696 if (!ret)
1682 ret = link_pipe(ipipe, opipe, len, flags); 1697 ret = link_pipe(ipipe, opipe, len, flags);
1683 } 1698 }
1684 } 1699 }
1685 1700
1686 return ret; 1701 return ret;
1687 } 1702 }
1688 1703
1689 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 1704 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1690 { 1705 {
1691 struct file *in; 1706 struct file *in;
1692 int error, fput_in; 1707 int error, fput_in;
1693 1708
1694 if (unlikely(!len)) 1709 if (unlikely(!len))
1695 return 0; 1710 return 0;
1696 1711
1697 error = -EBADF; 1712 error = -EBADF;
1698 in = fget_light(fdin, &fput_in); 1713 in = fget_light(fdin, &fput_in);
1699 if (in) { 1714 if (in) {
1700 if (in->f_mode & FMODE_READ) { 1715 if (in->f_mode & FMODE_READ) {
1701 int fput_out; 1716 int fput_out;
1702 struct file *out = fget_light(fdout, &fput_out); 1717 struct file *out = fget_light(fdout, &fput_out);
1703 1718
1704 if (out) { 1719 if (out) {
1705 if (out->f_mode & FMODE_WRITE) 1720 if (out->f_mode & FMODE_WRITE)
1706 error = do_tee(in, out, len, flags); 1721 error = do_tee(in, out, len, flags);
1707 fput_light(out, fput_out); 1722 fput_light(out, fput_out);
1708 } 1723 }
1709 } 1724 }
1710 fput_light(in, fput_in); 1725 fput_light(in, fput_in);
1711 } 1726 }
1712 1727
1713 return error; 1728 return error;
1714 } 1729 }
1715 1730