Commit 965c8e59cfcf845ecde2265a1d1bfee5f011d302

Authored by Andrew Morton
Committed by Linus Torvalds
1 parent c0f041602c

lseek: the "whence" argument is called "whence"

But the kernel decided to call it "origin" instead.  Fix most of the
sites.

Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 26 changed files with 116 additions and 116 deletions
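For reference, the userspace prototype the commit title alludes to (see lseek(2)) has always named the third argument "whence", and the VFS ->llseek hook mirrors it; the renames below bring the kernel's implementations in line:

/* Userspace, per POSIX / <unistd.h>: */
off_t lseek(int fd, off_t offset, int whence);

/* Kernel VFS hook in struct file_operations (signature as of this commit): */
loff_t (*llseek) (struct file *, loff_t, int);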

--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c

/*
 * linux/fs/bad_inode.c
 *
 * Copyright (C) 1997, Stephen Tweedie
 *
 * Provide stub functions for unreadable inodes
 *
 * Fabian Frederick : August 2003 - All file operations assigned to EIO
 */

#include <linux/fs.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/time.h>
#include <linux/namei.h>
#include <linux/poll.h>


-static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
{
	return -EIO;
}

static ssize_t bad_file_read(struct file *filp, char __user *buf,
			size_t size, loff_t *ppos)
{
	return -EIO;
}

static ssize_t bad_file_write(struct file *filp, const char __user *buf,
			size_t siz, loff_t *ppos)
{
	return -EIO;
}

static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
			unsigned long nr_segs, loff_t pos)
{
	return -EIO;
}

static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
			unsigned long nr_segs, loff_t pos)
{
	return -EIO;
}

static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	return -EIO;
}

static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
{
	return POLLERR;
}

static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
			unsigned long arg)
{
	return -EIO;
}

static long bad_file_compat_ioctl(struct file *file, unsigned int cmd,
			unsigned long arg)
{
	return -EIO;
}

static int bad_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -EIO;
}

static int bad_file_open(struct inode *inode, struct file *filp)
{
	return -EIO;
}

static int bad_file_flush(struct file *file, fl_owner_t id)
{
	return -EIO;
}

static int bad_file_release(struct inode *inode, struct file *filp)
{
	return -EIO;
}

static int bad_file_fsync(struct file *file, loff_t start, loff_t end,
			int datasync)
{
	return -EIO;
}

static int bad_file_aio_fsync(struct kiocb *iocb, int datasync)
{
	return -EIO;
}

static int bad_file_fasync(int fd, struct file *filp, int on)
{
	return -EIO;
}

static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl)
{
	return -EIO;
}

static ssize_t bad_file_sendpage(struct file *file, struct page *page,
			int off, size_t len, loff_t *pos, int more)
{
	return -EIO;
}

static unsigned long bad_file_get_unmapped_area(struct file *file,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags)
{
	return -EIO;
}

static int bad_file_check_flags(int flags)
{
	return -EIO;
}

static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
{
	return -EIO;
}

static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe,
			struct file *out, loff_t *ppos, size_t len,
			unsigned int flags)
{
	return -EIO;
}

static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	return -EIO;
}

static const struct file_operations bad_file_ops =
{
	.llseek		= bad_file_llseek,
	.read		= bad_file_read,
	.write		= bad_file_write,
	.aio_read	= bad_file_aio_read,
	.aio_write	= bad_file_aio_write,
	.readdir	= bad_file_readdir,
	.poll		= bad_file_poll,
	.unlocked_ioctl	= bad_file_unlocked_ioctl,
	.compat_ioctl	= bad_file_compat_ioctl,
	.mmap		= bad_file_mmap,
	.open		= bad_file_open,
	.flush		= bad_file_flush,
	.release	= bad_file_release,
	.fsync		= bad_file_fsync,
	.aio_fsync	= bad_file_aio_fsync,
	.fasync		= bad_file_fasync,
	.lock		= bad_file_lock,
	.sendpage	= bad_file_sendpage,
	.get_unmapped_area = bad_file_get_unmapped_area,
	.check_flags	= bad_file_check_flags,
	.flock		= bad_file_flock,
	.splice_write	= bad_file_splice_write,
	.splice_read	= bad_file_splice_read,
};

static int bad_inode_create (struct inode *dir, struct dentry *dentry,
		umode_t mode, bool excl)
{
	return -EIO;
}

static struct dentry *bad_inode_lookup(struct inode *dir,
			struct dentry *dentry, unsigned int flags)
{
	return ERR_PTR(-EIO);
}

static int bad_inode_link (struct dentry *old_dentry, struct inode *dir,
		struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_unlink(struct inode *dir, struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
		const char *symname)
{
	return -EIO;
}

static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
			umode_t mode)
{
	return -EIO;
}

static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
			umode_t mode, dev_t rdev)
{
	return -EIO;
}

static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry,
		struct inode *new_dir, struct dentry *new_dentry)
{
	return -EIO;
}

static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
		int buflen)
{
	return -EIO;
}

static int bad_inode_permission(struct inode *inode, int mask)
{
	return -EIO;
}

static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
			struct kstat *stat)
{
	return -EIO;
}

static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
{
	return -EIO;
}

static int bad_inode_setxattr(struct dentry *dentry, const char *name,
		const void *value, size_t size, int flags)
{
	return -EIO;
}

static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name,
			void *buffer, size_t size)
{
	return -EIO;
}

static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer,
			size_t buffer_size)
{
	return -EIO;
}

static int bad_inode_removexattr(struct dentry *dentry, const char *name)
{
	return -EIO;
}

static const struct inode_operations bad_inode_ops =
{
	.create		= bad_inode_create,
	.lookup		= bad_inode_lookup,
	.link		= bad_inode_link,
	.unlink		= bad_inode_unlink,
	.symlink	= bad_inode_symlink,
	.mkdir		= bad_inode_mkdir,
	.rmdir		= bad_inode_rmdir,
	.mknod		= bad_inode_mknod,
	.rename		= bad_inode_rename,
	.readlink	= bad_inode_readlink,
	/* follow_link must be no-op, otherwise unmounting this inode
	   won't work */
	/* put_link returns void */
	/* truncate returns void */
	.permission	= bad_inode_permission,
	.getattr	= bad_inode_getattr,
	.setattr	= bad_inode_setattr,
	.setxattr	= bad_inode_setxattr,
	.getxattr	= bad_inode_getxattr,
	.listxattr	= bad_inode_listxattr,
	.removexattr	= bad_inode_removexattr,
};


/*
 * When a filesystem is unable to read an inode due to an I/O error in
 * its read_inode() function, it can call make_bad_inode() to return a
 * set of stubs which will return EIO errors as required.
 *
 * We only need to do limited initialisation: all other fields are
 * preinitialised to zero automatically.
 */

/**
 * make_bad_inode - mark an inode bad due to an I/O error
 * @inode: Inode to mark bad
 *
 * When an inode cannot be read due to a media or remote network
 * failure this function makes the inode "bad" and causes I/O operations
 * on it to fail from this point on.
 */

void make_bad_inode(struct inode *inode)
{
	remove_inode_hash(inode);

	inode->i_mode = S_IFREG;
	inode->i_atime = inode->i_mtime = inode->i_ctime =
		current_fs_time(inode->i_sb);
	inode->i_op = &bad_inode_ops;
	inode->i_fop = &bad_file_ops;
}
EXPORT_SYMBOL(make_bad_inode);

/*
 * This tests whether an inode has been flagged as bad. The test uses
 * &bad_inode_ops to cover the case of invalidated inodes as well as
 * those created by make_bad_inode() above.
 */

/**
 * is_bad_inode - is an inode errored
 * @inode: inode to test
 *
 * Returns true if the inode in question has been marked as bad.
 */

int is_bad_inode(struct inode *inode)
{
	return (inode->i_op == &bad_inode_ops);
}

EXPORT_SYMBOL(is_bad_inode);

/**
 * iget_failed - Mark an under-construction inode as dead and release it
 * @inode: The inode to discard
 *
 * Mark an under-construction inode as dead and release it.
 */
void iget_failed(struct inode *inode)
{
	make_bad_inode(inode);
	unlock_new_inode(inode);
	iput(inode);
}
EXPORT_SYMBOL(iget_failed);
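For context, the usual caller of iget_failed() is a filesystem's inode-lookup helper: it reads the on-disk inode and, on I/O failure, hands the half-constructed inode back, which marks it bad, unlocks it, and drops it. A minimal sketch of that pattern; foo_iget() and foo_read_inode() are hypothetical names, not part of this commit:

struct inode *foo_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);		/* cached or new, locked, inode */
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;			/* already read in earlier */

	if (foo_read_inode(inode)) {		/* hypothetical on-disk read */
		iget_failed(inode);		/* make_bad_inode + unlock + iput */
		return ERR_PTR(-EIO);
	}
	unlock_new_inode(inode);
	return inode;
}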
--- a/fs/block_dev.c
+++ b/fs/block_dev.c

/*
 * linux/fs/block_dev.c
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static const struct address_space_operations def_blk_aops;

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

/*
 * Move the inode from its current bdi to a new bdi. If the inode is dirty we
 * need to move it onto the dirty list of @dst so that the inode is always on
 * the right list.
 */
static void bdev_inode_switch_bdi(struct inode *inode,
			struct backing_dev_info *dst)
{
	struct backing_dev_info *old = inode->i_data.backing_dev_info;

	if (unlikely(dst == old))		/* deadlock avoidance */
		return;
	bdi_lock_two(&old->wb, &dst->wb);
	spin_lock(&inode->i_lock);
	inode->i_data.backing_dev_info = dst;
	if (inode->i_state & I_DIRTY)
		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
	spin_unlock(&inode->i_lock);
	spin_unlock(&old->wb.list_lock);
	spin_unlock(&dst->wb.list_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not.. */
void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(kill_bdev);

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	lru_add_drain_all();	/* make sure all lru add caches are flushed */
	invalidate_mapping_pages(mapping, 0, -1);
	/* 99% of the time, we don't need to flush the cleancache on the bdev.
	 * But, for the strange corners, lets be cautious
	 */
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is power of two
	 * and it's value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
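Typical use of sb_min_blocksize() is at mount time: a fill_super implementation asks for its preferred block size, the helper rounds up to the device's logical block size, and 0 signals failure (per sb_set_blocksize() above). A sketch under those assumptions; foo_fill_super() is a hypothetical name:

static int foo_fill_super(struct super_block *sb, void *data, int silent)
{
	int blocksize;

	/* at least 1024 bytes, rounded up to the device's logical block size */
	blocksize = sb_min_blocksize(sb, 1024);
	if (!blocksize)
		return -EINVAL;		/* device rejected the block size */

	/* ... read the on-disk superblock with sb_bread() etc. ... */
	return 0;
}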
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
				    nr_segs, blkdev_get_block, NULL, NULL, 0);
}

int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev -- lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can unfreeze the frozen filesystem actually when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
 * actually.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1) {
		/*
		 * We don't even need to grab a reference - the first call
		 * to freeze_bdev grab an active reference and only the last
		 * thaw_bdev drops it.
		 */
		sb = get_super(bdev);
		drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return sb;
	}

	sb = get_active_super(bdev);
	if (!sb)
		goto out;
	error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return ERR_PTR(error);
	}
	deactivate_super(sb);
 out:
	sync_blockdev(bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return sb;	/* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	if (!sb)
		goto out;

	error = thaw_super(sb);
	if (error) {
		bdev->bd_fsfreeze_count++;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return error;
	}
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return 0;
}
EXPORT_SYMBOL(thaw_bdev);
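A snapshotting caller pairs the two functions: freeze_bdev() returns the superblock (possibly NULL if none was found, or an ERR_PTR on failure) with writes quiesced, and thaw_bdev() releases the freeze. A hedged sketch of the calling pattern, not code from this commit:

struct super_block *sb;

sb = freeze_bdev(bdev);			/* quiesce writes; NULL if no sb found */
if (IS_ERR(sb))
	return PTR_ERR(sb);

/* ... take the snapshot while the filesystem is consistent ... */

thaw_bdev(bdev, sb);			/* matching unfreeze */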
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
	page_cache_release(page);

	return ret;
}

/*
 * private llseek:
 * for a block special file file->f_path.dentry->d_inode->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
-static loff_t block_llseek(struct file *file, loff_t offset, int origin)
+static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *bd_inode = file->f_mapping->host;
	loff_t size;
	loff_t retval;

	mutex_lock(&bd_inode->i_mutex);
	size = i_size_read(bd_inode);

	retval = -EINVAL;
-	switch (origin) {
+	switch (whence) {
		case SEEK_END:
			offset += size;
			break;
		case SEEK_CUR:
			offset += file->f_pos;
		case SEEK_SET:
			break;
		default:
			goto out;
	}
	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
		}
		retval = offset;
	}
out:
	mutex_unlock(&bd_inode->i_mutex);
	return retval;
}
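From userspace, the three whence values behave as usual on a block device, with SEEK_END resolving against the size computed by hand above rather than the (zero) inode size. A small illustrative program; /dev/sda is a placeholder device node:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/sda", O_RDONLY);	/* placeholder device node */
	if (fd < 0)
		return 1;

	off_t end = lseek(fd, 0, SEEK_END);	/* device size, per block_llseek */
	lseek(fd, 512, SEEK_SET);		/* absolute position */
	lseek(fd, -512, SEEK_CUR);		/* relative to current position */

	printf("device size: %lld bytes\n", (long long)end);
	close(fd);
	return 0;
}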
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *bd_inode = filp->f_mapping->host;
	struct block_device *bdev = I_BDEV(bd_inode);
	int error;

	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
	if (error)
		return error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	if (error == -EOPNOTSUPP)
		error = 0;

	return error;
}
EXPORT_SYMBOL(blkdev_fsync);

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	struct bdev_inode *bdi = BDEV_I(inode);

	kmem_cache_free(bdev_cachep, bdi);
}

static void bdev_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, bdev_i_callback);
}

static void init_once(void *foo)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	memset(bdev, 0, sizeof(*bdev));
	mutex_init(&bdev->bd_mutex);
	INIT_LIST_HEAD(&bdev->bd_inodes);
	INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
	INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
	inode_init_once(&ei->vfs_inode);
	/* Initialize mutex for freeze. */
	mutex_init(&bdev->bd_fsfreeze_mutex);
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

static void bdev_evict_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
	truncate_inode_pages(&inode->i_data, 0);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static struct dentry *bd_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.mount		= bd_mount,
	.kill_sb	= kill_anon_super,
};

static struct super_block *blockdev_superblock __read_mostly;

void __init bdev_cache_init(void)
{
	int err;
	static struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody will need really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(blockdev_superblock, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_super = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		inode->i_data.backing_dev_info = &default_backing_dev_info;
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);
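Callers pair bdget() with bdput() below: the former returns the (possibly freshly initialised) block_device for a dev_t and takes an inode reference, the latter drops it. A minimal sketch; foo_probe() is a hypothetical caller:

static int foo_probe(dev_t dev)
{
	struct block_device *bdev;

	bdev = bdget(dev);	/* takes a reference; NULL on allocation failure */
	if (!bdev)
		return -ENOMEM;

	/* ... inspect or open the device ... */

	bdput(bdev);		/* drop the reference taken by bdget() */
	return 0;
}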
/**
 * bdgrab -- Grab a reference to an already referenced block device
 * @bdev:	Block device to grab a reference to.
 */
struct block_device *bdgrab(struct block_device *bdev)
{
	ihold(bdev->bd_inode);
	return bdev;
}

long nr_blockdev_pages(void)
{
	struct block_device *bdev;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each_entry(bdev, &all_bdevs, bd_list) {
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);

static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		ihold(bdev->bd_inode);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional reference to bd_inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			ihold(bdev->bd_inode);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

static inline int sb_is_blkdev_sb(struct super_block *sb)
{
	return sb == blockdev_superblock;
}

/* Call when you free inode */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		if (!sb_is_blkdev_sb(inode->i_sb))
			bdev = inode->i_bdev;
		__bd_forget(inode);
	}
	spin_unlock(&bdev_lock);

	if (bdev)
		iput(bdev->bd_inode);
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
{
	if (bdev->bd_holder == holder)
		return true;	 /* already a holder */
	else if (bdev->bd_holder != NULL)
		return false;	 /* held by someone else */
	else if (bdev->bd_contains == bdev)
		return true;	 /* is a whole device which isn't held */

	else if (whole->bd_holder == bd_may_claim)
		return true;	 /* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	 /* is a partition of a held device */
	else
		return true;	 /* is a partition of an un-held device */
}
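The claim machinery here is internal to block_dev.c; external code gets the same exclusion by opening with FMODE_EXCL and a holder cookie, e.g. via blkdev_get_by_path() in kernels of this era. A hedged sketch of that usage, not part of this commit; the path, foo_open_exclusive(), and my_holder are placeholders:

static int foo_open_exclusive(void *my_holder)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;

	bdev = blkdev_get_by_path("/dev/sdb1", mode, my_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* other FMODE_EXCL opens with a different holder now fail with -EBUSY */

	blkdev_put(bdev, mode);		/* release the claim */
	return 0;
}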
/**
 * bd_prepare_to_claim - prepare to claim a block device
 * @bdev: block device of interest
 * @whole: the whole device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Prepare to claim @bdev.  This function fails if @bdev is already
 * claimed by another holder and waits if another claiming is in
 * progress.  This function doesn't actually claim.  On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
 * it multiple times.
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
static int bd_prepare_to_claim(struct block_device *bdev,
			       struct block_device *whole, void *holder)
{
retry:
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder))
		return -EBUSY;

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		spin_lock(&bdev_lock);
		goto retry;
	}

	/* yay, all mine */
	return 0;
}

/**
 * bd_start_claiming - start claiming a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * @bdev is about to be opened exclusively.  Check @bdev can be opened
 * exclusively and mark that an exclusive open is in progress.  Each
 * successful call to this function must be matched with a call to
 * either bd_finish_claiming() or bd_abort_claiming() (which do not
 * fail).
 *
 * This function is used to gain exclusive access to the block device
 * without actually causing other exclusive open attempts to fail. It
 * should be used when the open sequence itself requires exclusive
 * access but may subsequently fail.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to the block device containing @bdev on success, ERR_PTR()
 * value on failure.
 */
static struct block_device *bd_start_claiming(struct block_device *bdev,
					      void *holder)
{
	struct gendisk *disk;
	struct block_device *whole;
	int partno, err;

	might_sleep();

	/*
	 * @bdev might not have been initialized properly yet, look up
	 * and grab the outer block device the hard way.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		return ERR_PTR(-ENXIO);

	/*
	 * Normally, @bdev should equal what's returned from bdget_disk()
	 * if partno is 0; however, some drivers (floppy) use multiple
	 * bdev's for the same physical device and @bdev may be one of the
	 * aliases.  Keep @bdev if partno is 0.  This means claimer
	 * tracking is broken for those devices but it has always been that
751 * way. 751 * way.
752 */ 752 */
753 if (partno) 753 if (partno)
754 whole = bdget_disk(disk, 0); 754 whole = bdget_disk(disk, 0);
755 else 755 else
756 whole = bdgrab(bdev); 756 whole = bdgrab(bdev);
757 757
758 module_put(disk->fops->owner); 758 module_put(disk->fops->owner);
759 put_disk(disk); 759 put_disk(disk);
760 if (!whole) 760 if (!whole)
761 return ERR_PTR(-ENOMEM); 761 return ERR_PTR(-ENOMEM);
762 762
763 /* prepare to claim, if successful, mark claiming in progress */ 763 /* prepare to claim, if successful, mark claiming in progress */
764 spin_lock(&bdev_lock); 764 spin_lock(&bdev_lock);
765 765
766 err = bd_prepare_to_claim(bdev, whole, holder); 766 err = bd_prepare_to_claim(bdev, whole, holder);
767 if (err == 0) { 767 if (err == 0) {
768 whole->bd_claiming = holder; 768 whole->bd_claiming = holder;
769 spin_unlock(&bdev_lock); 769 spin_unlock(&bdev_lock);
770 return whole; 770 return whole;
771 } else { 771 } else {
772 spin_unlock(&bdev_lock); 772 spin_unlock(&bdev_lock);
773 bdput(whole); 773 bdput(whole);
774 return ERR_PTR(err); 774 return ERR_PTR(err);
775 } 775 }
776 } 776 }
777 777
778 #ifdef CONFIG_SYSFS 778 #ifdef CONFIG_SYSFS
779 struct bd_holder_disk { 779 struct bd_holder_disk {
780 struct list_head list; 780 struct list_head list;
781 struct gendisk *disk; 781 struct gendisk *disk;
782 int refcnt; 782 int refcnt;
783 }; 783 };
784 784
785 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, 785 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
786 struct gendisk *disk) 786 struct gendisk *disk)
787 { 787 {
788 struct bd_holder_disk *holder; 788 struct bd_holder_disk *holder;
789 789
790 list_for_each_entry(holder, &bdev->bd_holder_disks, list) 790 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
791 if (holder->disk == disk) 791 if (holder->disk == disk)
792 return holder; 792 return holder;
793 return NULL; 793 return NULL;
794 } 794 }
795 795
796 static int add_symlink(struct kobject *from, struct kobject *to) 796 static int add_symlink(struct kobject *from, struct kobject *to)
797 { 797 {
798 return sysfs_create_link(from, to, kobject_name(to)); 798 return sysfs_create_link(from, to, kobject_name(to));
799 } 799 }
800 800
801 static void del_symlink(struct kobject *from, struct kobject *to) 801 static void del_symlink(struct kobject *from, struct kobject *to)
802 { 802 {
803 sysfs_remove_link(from, kobject_name(to)); 803 sysfs_remove_link(from, kobject_name(to));
804 } 804 }
805 805
806 /** 806 /**
807 * bd_link_disk_holder - create symlinks between holding disk and slave bdev 807 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
808 * @bdev: the claimed slave bdev 808 * @bdev: the claimed slave bdev
809 * @disk: the holding disk 809 * @disk: the holding disk
810 * 810 *
811 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. 811 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
812 * 812 *
813 * This function creates the following sysfs symlinks. 813 * This function creates the following sysfs symlinks.
814 * 814 *
815 * - from "slaves" directory of the holder @disk to the claimed @bdev 815 * - from "slaves" directory of the holder @disk to the claimed @bdev
816 * - from "holders" directory of the @bdev to the holder @disk 816 * - from "holders" directory of the @bdev to the holder @disk
817 * 817 *
818 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is 818 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
819 * passed to bd_link_disk_holder(), then: 819 * passed to bd_link_disk_holder(), then:
820 * 820 *
821 * /sys/block/dm-0/slaves/sda --> /sys/block/sda 821 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
822 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 822 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
823 * 823 *
824 * The caller must have claimed @bdev before calling this function and 824 * The caller must have claimed @bdev before calling this function and
825 * ensure that both @bdev and @disk are valid during the creation and 825 * ensure that both @bdev and @disk are valid during the creation and
826 * lifetime of these symlinks. 826 * lifetime of these symlinks.
827 * 827 *
828 * CONTEXT: 828 * CONTEXT:
829 * Might sleep. 829 * Might sleep.
830 * 830 *
831 * RETURNS: 831 * RETURNS:
832 * 0 on success, -errno on failure. 832 * 0 on success, -errno on failure.
833 */ 833 */
834 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) 834 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
835 { 835 {
836 struct bd_holder_disk *holder; 836 struct bd_holder_disk *holder;
837 int ret = 0; 837 int ret = 0;
838 838
839 mutex_lock(&bdev->bd_mutex); 839 mutex_lock(&bdev->bd_mutex);
840 840
841 WARN_ON_ONCE(!bdev->bd_holder); 841 WARN_ON_ONCE(!bdev->bd_holder);
842 842
843 /* FIXME: remove the following once add_disk() handles errors */ 843 /* FIXME: remove the following once add_disk() handles errors */
844 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) 844 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
845 goto out_unlock; 845 goto out_unlock;
846 846
847 holder = bd_find_holder_disk(bdev, disk); 847 holder = bd_find_holder_disk(bdev, disk);
848 if (holder) { 848 if (holder) {
849 holder->refcnt++; 849 holder->refcnt++;
850 goto out_unlock; 850 goto out_unlock;
851 } 851 }
852 852
853 holder = kzalloc(sizeof(*holder), GFP_KERNEL); 853 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
854 if (!holder) { 854 if (!holder) {
855 ret = -ENOMEM; 855 ret = -ENOMEM;
856 goto out_unlock; 856 goto out_unlock;
857 } 857 }
858 858
859 INIT_LIST_HEAD(&holder->list); 859 INIT_LIST_HEAD(&holder->list);
860 holder->disk = disk; 860 holder->disk = disk;
861 holder->refcnt = 1; 861 holder->refcnt = 1;
862 862
863 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 863 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
864 if (ret) 864 if (ret)
865 goto out_free; 865 goto out_free;
866 866
867 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 867 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
868 if (ret) 868 if (ret)
869 goto out_del; 869 goto out_del;
870 /* 870 /*
871 * bdev could be deleted beneath us which would implicitly destroy 871 * bdev could be deleted beneath us which would implicitly destroy
872 * the holder directory. Hold on to it. 872 * the holder directory. Hold on to it.
873 */ 873 */
874 kobject_get(bdev->bd_part->holder_dir); 874 kobject_get(bdev->bd_part->holder_dir);
875 875
876 list_add(&holder->list, &bdev->bd_holder_disks); 876 list_add(&holder->list, &bdev->bd_holder_disks);
877 goto out_unlock; 877 goto out_unlock;
878 878
879 out_del: 879 out_del:
880 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 880 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
881 out_free: 881 out_free:
882 kfree(holder); 882 kfree(holder);
883 out_unlock: 883 out_unlock:
884 mutex_unlock(&bdev->bd_mutex); 884 mutex_unlock(&bdev->bd_mutex);
885 return ret; 885 return ret;
886 } 886 }
887 EXPORT_SYMBOL_GPL(bd_link_disk_holder); 887 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
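For context, a minimal sketch of how a stacking driver (md/dm style) might pair an exclusive open with this holder API; my_add_member() and my_gendisk are illustrative names, not kernel interfaces:

	/*
	 * Hedged sketch: claim a member device exclusively, then publish
	 * the holder relationship in sysfs.  Names are invented for
	 * illustration; the calls themselves match this tree's API.
	 */
	static struct block_device *my_add_member(const char *path,
						  struct gendisk *my_gendisk)
	{
		struct block_device *bdev;
		int ret;

		bdev = blkdev_get_by_path(path,
					  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
					  my_gendisk);	/* disk doubles as holder */
		if (IS_ERR(bdev))
			return bdev;

		ret = bd_link_disk_holder(bdev, my_gendisk);
		if (ret) {
			blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
			return ERR_PTR(ret);
		}
		return bdev;
	}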
888 888
889 /** 889 /**
890 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() 890 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
891 * @bdev: the claimed slave bdev 891 * @bdev: the claimed slave bdev
892 * @disk: the holding disk 892 * @disk: the holding disk
893 * 893 *
894 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. 894 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
895 * 895 *
896 * CONTEXT: 896 * CONTEXT:
897 * Might sleep. 897 * Might sleep.
898 */ 898 */
899 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) 899 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
900 { 900 {
901 struct bd_holder_disk *holder; 901 struct bd_holder_disk *holder;
902 902
903 mutex_lock(&bdev->bd_mutex); 903 mutex_lock(&bdev->bd_mutex);
904 904
905 holder = bd_find_holder_disk(bdev, disk); 905 holder = bd_find_holder_disk(bdev, disk);
906 906
907 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { 907 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
908 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 908 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
909 del_symlink(bdev->bd_part->holder_dir, 909 del_symlink(bdev->bd_part->holder_dir,
910 &disk_to_dev(disk)->kobj); 910 &disk_to_dev(disk)->kobj);
911 kobject_put(bdev->bd_part->holder_dir); 911 kobject_put(bdev->bd_part->holder_dir);
912 list_del_init(&holder->list); 912 list_del_init(&holder->list);
913 kfree(holder); 913 kfree(holder);
914 } 914 }
915 915
916 mutex_unlock(&bdev->bd_mutex); 916 mutex_unlock(&bdev->bd_mutex);
917 } 917 }
918 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); 918 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
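The matching teardown for the sketch above, again with invented names, unlinks the sysfs relationship before dropping the exclusive reference:

	static void my_remove_member(struct block_device *bdev,
				     struct gendisk *my_gendisk)
	{
		bd_unlink_disk_holder(bdev, my_gendisk);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}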
919 #endif 919 #endif
920 920
921 /** 921 /**
922 * flush_disk - invalidates all buffer-cache entries on a disk 922 * flush_disk - invalidates all buffer-cache entries on a disk
923 * 923 *
924 * @bdev: struct block device to be flushed 924 * @bdev: struct block device to be flushed
925 * @kill_dirty: flag to guide handling of dirty inodes 925 * @kill_dirty: flag to guide handling of dirty inodes
926 * 926 *
927 * Invalidates all buffer-cache entries on a disk. It should be called 927 * Invalidates all buffer-cache entries on a disk. It should be called
928 * when a disk has been changed -- either by a media change or online 928 * when a disk has been changed -- either by a media change or online
929 * resize. 929 * resize.
930 */ 930 */
931 static void flush_disk(struct block_device *bdev, bool kill_dirty) 931 static void flush_disk(struct block_device *bdev, bool kill_dirty)
932 { 932 {
933 if (__invalidate_device(bdev, kill_dirty)) { 933 if (__invalidate_device(bdev, kill_dirty)) {
934 char name[BDEVNAME_SIZE] = ""; 934 char name[BDEVNAME_SIZE] = "";
935 935
936 if (bdev->bd_disk) 936 if (bdev->bd_disk)
937 disk_name(bdev->bd_disk, 0, name); 937 disk_name(bdev->bd_disk, 0, name);
938 printk(KERN_WARNING "VFS: busy inodes on changed media or " 938 printk(KERN_WARNING "VFS: busy inodes on changed media or "
939 "resized disk %s\n", name); 939 "resized disk %s\n", name);
940 } 940 }
941 941
942 if (!bdev->bd_disk) 942 if (!bdev->bd_disk)
943 return; 943 return;
944 if (disk_part_scan_enabled(bdev->bd_disk)) 944 if (disk_part_scan_enabled(bdev->bd_disk))
945 bdev->bd_invalidated = 1; 945 bdev->bd_invalidated = 1;
946 } 946 }
947 947
948 /** 948 /**
949 * check_disk_size_change - checks for disk size change and adjusts bdev size. 949 * check_disk_size_change - checks for disk size change and adjusts bdev size.
950 * @disk: struct gendisk to check 950 * @disk: struct gendisk to check
951 * @bdev: struct bdev to adjust. 951 * @bdev: struct bdev to adjust.
952 * 952 *
953 * This routine checks whether the bdev size matches the disk size 953 * This routine checks whether the bdev size matches the disk size
954 * and adjusts the bdev size if it differs. 954 * and adjusts the bdev size if it differs.
955 */ 955 */
956 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) 956 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
957 { 957 {
958 loff_t disk_size, bdev_size; 958 loff_t disk_size, bdev_size;
959 959
960 disk_size = (loff_t)get_capacity(disk) << 9; 960 disk_size = (loff_t)get_capacity(disk) << 9;
961 bdev_size = i_size_read(bdev->bd_inode); 961 bdev_size = i_size_read(bdev->bd_inode);
962 if (disk_size != bdev_size) { 962 if (disk_size != bdev_size) {
963 char name[BDEVNAME_SIZE]; 963 char name[BDEVNAME_SIZE];
964 964
965 disk_name(disk, 0, name); 965 disk_name(disk, 0, name);
966 printk(KERN_INFO 966 printk(KERN_INFO
967 "%s: detected capacity change from %lld to %lld\n", 967 "%s: detected capacity change from %lld to %lld\n",
968 name, bdev_size, disk_size); 968 name, bdev_size, disk_size);
969 i_size_write(bdev->bd_inode, disk_size); 969 i_size_write(bdev->bd_inode, disk_size);
970 flush_disk(bdev, false); 970 flush_disk(bdev, false);
971 } 971 }
972 } 972 }
973 EXPORT_SYMBOL(check_disk_size_change); 973 EXPORT_SYMBOL(check_disk_size_change);
974 974
975 /** 975 /**
976 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back 976 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
977 * @disk: struct gendisk to be revalidated 977 * @disk: struct gendisk to be revalidated
978 * 978 *
979 * This routine is a wrapper for the lower-level driver's revalidate_disk 979 * This routine is a wrapper for the lower-level driver's revalidate_disk
980 * call-back. It is used to perform the common pre- and post-operations needed 980 * call-back. It is used to perform the common pre- and post-operations needed
981 * for all revalidate_disk operations. 981 * for all revalidate_disk operations.
982 */ 982 */
983 int revalidate_disk(struct gendisk *disk) 983 int revalidate_disk(struct gendisk *disk)
984 { 984 {
985 struct block_device *bdev; 985 struct block_device *bdev;
986 int ret = 0; 986 int ret = 0;
987 987
988 if (disk->fops->revalidate_disk) 988 if (disk->fops->revalidate_disk)
989 ret = disk->fops->revalidate_disk(disk); 989 ret = disk->fops->revalidate_disk(disk);
990 990
991 bdev = bdget_disk(disk, 0); 991 bdev = bdget_disk(disk, 0);
992 if (!bdev) 992 if (!bdev)
993 return ret; 993 return ret;
994 994
995 mutex_lock(&bdev->bd_mutex); 995 mutex_lock(&bdev->bd_mutex);
996 check_disk_size_change(disk, bdev); 996 check_disk_size_change(disk, bdev);
997 mutex_unlock(&bdev->bd_mutex); 997 mutex_unlock(&bdev->bd_mutex);
998 bdput(bdev); 998 bdput(bdev);
999 return ret; 999 return ret;
1000 } 1000 }
1001 EXPORT_SYMBOL(revalidate_disk); 1001 EXPORT_SYMBOL(revalidate_disk);
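A hedged illustration of how these pieces are typically driven: a driver that learns its backing store grew records the new capacity in 512-byte sectors (hence the << 9 in check_disk_size_change()) and then revalidates. my_disk_resized() is an invented name:

	static void my_disk_resized(struct gendisk *disk, sector_t new_sectors)
	{
		set_capacity(disk, new_sectors);	/* 512-byte units */
		revalidate_disk(disk);	/* ends up in check_disk_size_change() */
	}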
1002 1002
1003 /* 1003 /*
1004 * This routine checks whether a removable media has been changed, 1004 * This routine checks whether a removable media has been changed,
1005 * and invalidates all buffer-cache-entries in that case. This 1005 * and invalidates all buffer-cache-entries in that case. This
1006 * is a relatively slow routine, so we have to try to minimize using 1006 * is a relatively slow routine, so we have to try to minimize using
1007 * it. Thus it is called only upon a 'mount' or 'open'. This 1007 * it. Thus it is called only upon a 'mount' or 'open'. This
1008 * is the best way of combining speed and utility, I think. 1008 * is the best way of combining speed and utility, I think.
1009 * People changing diskettes in the middle of an operation deserve 1009 * People changing diskettes in the middle of an operation deserve
1010 * to lose :-) 1010 * to lose :-)
1011 */ 1011 */
1012 int check_disk_change(struct block_device *bdev) 1012 int check_disk_change(struct block_device *bdev)
1013 { 1013 {
1014 struct gendisk *disk = bdev->bd_disk; 1014 struct gendisk *disk = bdev->bd_disk;
1015 const struct block_device_operations *bdops = disk->fops; 1015 const struct block_device_operations *bdops = disk->fops;
1016 unsigned int events; 1016 unsigned int events;
1017 1017
1018 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | 1018 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1019 DISK_EVENT_EJECT_REQUEST); 1019 DISK_EVENT_EJECT_REQUEST);
1020 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1020 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1021 return 0; 1021 return 0;
1022 1022
1023 flush_disk(bdev, true); 1023 flush_disk(bdev, true);
1024 if (bdops->revalidate_disk) 1024 if (bdops->revalidate_disk)
1025 bdops->revalidate_disk(bdev->bd_disk); 1025 bdops->revalidate_disk(bdev->bd_disk);
1026 return 1; 1026 return 1;
1027 } 1027 }
1028 1028
1029 EXPORT_SYMBOL(check_disk_change); 1029 EXPORT_SYMBOL(check_disk_change);
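A minimal sketch of the usual call site, assuming a removable-media driver; my_open() is illustrative, with the block_device_operations->open signature used in this tree:

	static int my_open(struct block_device *bdev, fmode_t mode)
	{
		check_disk_change(bdev);	/* may flush and set bd_invalidated */
		return 0;
	}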
1030 1030
1031 void bd_set_size(struct block_device *bdev, loff_t size) 1031 void bd_set_size(struct block_device *bdev, loff_t size)
1032 { 1032 {
1033 unsigned bsize = bdev_logical_block_size(bdev); 1033 unsigned bsize = bdev_logical_block_size(bdev);
1034 1034
1035 bdev->bd_inode->i_size = size; 1035 bdev->bd_inode->i_size = size;
1036 while (bsize < PAGE_CACHE_SIZE) { 1036 while (bsize < PAGE_CACHE_SIZE) {
1037 if (size & bsize) 1037 if (size & bsize)
1038 break; 1038 break;
1039 bsize <<= 1; 1039 bsize <<= 1;
1040 } 1040 }
1041 bdev->bd_block_size = bsize; 1041 bdev->bd_block_size = bsize;
1042 bdev->bd_inode->i_blkbits = blksize_bits(bsize); 1042 bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1043 } 1043 }
1044 EXPORT_SYMBOL(bd_set_size); 1044 EXPORT_SYMBOL(bd_set_size);
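The loop above selects the largest power-of-two block size that still divides the device size, capped at the page size. A stand-alone rendering of the same computation (assuming 4096-byte pages), with a worked example:

	#include <stdio.h>

	/* Userspace sketch of bd_set_size()'s block-size selection. */
	static unsigned pick_block_size(unsigned long long size, unsigned bsize)
	{
		while (bsize < 4096) {		/* PAGE_CACHE_SIZE stand-in */
			if (size & bsize)	/* bsize no longer divides size */
				break;
			bsize <<= 1;
		}
		return bsize;
	}

	int main(void)
	{
		/* 1954 sectors * 512 bytes = 1000448 bytes -> prints 1024 */
		printf("%u\n", pick_block_size(1954ULL * 512, 512));
		return 0;
	}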
1045 1045
1046 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); 1046 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1047 1047
1048 /* 1048 /*
1049 * bd_mutex locking: 1049 * bd_mutex locking:
1050 * 1050 *
1051 * mutex_lock(part->bd_mutex) 1051 * mutex_lock(part->bd_mutex)
1052 * mutex_lock_nested(whole->bd_mutex, 1) 1052 * mutex_lock_nested(whole->bd_mutex, 1)
1053 */ 1053 */
1054 1054
1055 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) 1055 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1056 { 1056 {
1057 struct gendisk *disk; 1057 struct gendisk *disk;
1058 struct module *owner; 1058 struct module *owner;
1059 int ret; 1059 int ret;
1060 int partno; 1060 int partno;
1061 int perm = 0; 1061 int perm = 0;
1062 1062
1063 if (mode & FMODE_READ) 1063 if (mode & FMODE_READ)
1064 perm |= MAY_READ; 1064 perm |= MAY_READ;
1065 if (mode & FMODE_WRITE) 1065 if (mode & FMODE_WRITE)
1066 perm |= MAY_WRITE; 1066 perm |= MAY_WRITE;
1067 /* 1067 /*
1068 * hooks: /n/, see "layering violations". 1068 * hooks: /n/, see "layering violations".
1069 */ 1069 */
1070 if (!for_part) { 1070 if (!for_part) {
1071 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1071 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1072 if (ret != 0) { 1072 if (ret != 0) {
1073 bdput(bdev); 1073 bdput(bdev);
1074 return ret; 1074 return ret;
1075 } 1075 }
1076 } 1076 }
1077 1077
1078 restart: 1078 restart:
1079 1079
1080 ret = -ENXIO; 1080 ret = -ENXIO;
1081 disk = get_gendisk(bdev->bd_dev, &partno); 1081 disk = get_gendisk(bdev->bd_dev, &partno);
1082 if (!disk) 1082 if (!disk)
1083 goto out; 1083 goto out;
1084 owner = disk->fops->owner; 1084 owner = disk->fops->owner;
1085 1085
1086 disk_block_events(disk); 1086 disk_block_events(disk);
1087 mutex_lock_nested(&bdev->bd_mutex, for_part); 1087 mutex_lock_nested(&bdev->bd_mutex, for_part);
1088 if (!bdev->bd_openers) { 1088 if (!bdev->bd_openers) {
1089 bdev->bd_disk = disk; 1089 bdev->bd_disk = disk;
1090 bdev->bd_queue = disk->queue; 1090 bdev->bd_queue = disk->queue;
1091 bdev->bd_contains = bdev; 1091 bdev->bd_contains = bdev;
1092 if (!partno) { 1092 if (!partno) {
1093 struct backing_dev_info *bdi; 1093 struct backing_dev_info *bdi;
1094 1094
1095 ret = -ENXIO; 1095 ret = -ENXIO;
1096 bdev->bd_part = disk_get_part(disk, partno); 1096 bdev->bd_part = disk_get_part(disk, partno);
1097 if (!bdev->bd_part) 1097 if (!bdev->bd_part)
1098 goto out_clear; 1098 goto out_clear;
1099 1099
1100 ret = 0; 1100 ret = 0;
1101 if (disk->fops->open) { 1101 if (disk->fops->open) {
1102 ret = disk->fops->open(bdev, mode); 1102 ret = disk->fops->open(bdev, mode);
1103 if (ret == -ERESTARTSYS) { 1103 if (ret == -ERESTARTSYS) {
1104 /* Lost a race with 'disk' being 1104 /* Lost a race with 'disk' being
1105 * deleted, try again. 1105 * deleted, try again.
1106 * See md.c 1106 * See md.c
1107 */ 1107 */
1108 disk_put_part(bdev->bd_part); 1108 disk_put_part(bdev->bd_part);
1109 bdev->bd_part = NULL; 1109 bdev->bd_part = NULL;
1110 bdev->bd_disk = NULL; 1110 bdev->bd_disk = NULL;
1111 bdev->bd_queue = NULL; 1111 bdev->bd_queue = NULL;
1112 mutex_unlock(&bdev->bd_mutex); 1112 mutex_unlock(&bdev->bd_mutex);
1113 disk_unblock_events(disk); 1113 disk_unblock_events(disk);
1114 put_disk(disk); 1114 put_disk(disk);
1115 module_put(owner); 1115 module_put(owner);
1116 goto restart; 1116 goto restart;
1117 } 1117 }
1118 } 1118 }
1119 1119
1120 if (!ret && !bdev->bd_openers) { 1120 if (!ret && !bdev->bd_openers) {
1121 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1121 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1122 bdi = blk_get_backing_dev_info(bdev); 1122 bdi = blk_get_backing_dev_info(bdev);
1123 if (bdi == NULL) 1123 if (bdi == NULL)
1124 bdi = &default_backing_dev_info; 1124 bdi = &default_backing_dev_info;
1125 bdev_inode_switch_bdi(bdev->bd_inode, bdi); 1125 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1126 } 1126 }
1127 1127
1128 /* 1128 /*
1129 * If the device is invalidated, rescan partition 1129 * If the device is invalidated, rescan partition
1130 * if open succeeded or failed with -ENOMEDIUM. 1130 * if open succeeded or failed with -ENOMEDIUM.
1131 * The latter is necessary to prevent ghost 1131 * The latter is necessary to prevent ghost
1132 * partitions on a removed medium. 1132 * partitions on a removed medium.
1133 */ 1133 */
1134 if (bdev->bd_invalidated) { 1134 if (bdev->bd_invalidated) {
1135 if (!ret) 1135 if (!ret)
1136 rescan_partitions(disk, bdev); 1136 rescan_partitions(disk, bdev);
1137 else if (ret == -ENOMEDIUM) 1137 else if (ret == -ENOMEDIUM)
1138 invalidate_partitions(disk, bdev); 1138 invalidate_partitions(disk, bdev);
1139 } 1139 }
1140 if (ret) 1140 if (ret)
1141 goto out_clear; 1141 goto out_clear;
1142 } else { 1142 } else {
1143 struct block_device *whole; 1143 struct block_device *whole;
1144 whole = bdget_disk(disk, 0); 1144 whole = bdget_disk(disk, 0);
1145 ret = -ENOMEM; 1145 ret = -ENOMEM;
1146 if (!whole) 1146 if (!whole)
1147 goto out_clear; 1147 goto out_clear;
1148 BUG_ON(for_part); 1148 BUG_ON(for_part);
1149 ret = __blkdev_get(whole, mode, 1); 1149 ret = __blkdev_get(whole, mode, 1);
1150 if (ret) 1150 if (ret)
1151 goto out_clear; 1151 goto out_clear;
1152 bdev->bd_contains = whole; 1152 bdev->bd_contains = whole;
1153 bdev_inode_switch_bdi(bdev->bd_inode, 1153 bdev_inode_switch_bdi(bdev->bd_inode,
1154 whole->bd_inode->i_data.backing_dev_info); 1154 whole->bd_inode->i_data.backing_dev_info);
1155 bdev->bd_part = disk_get_part(disk, partno); 1155 bdev->bd_part = disk_get_part(disk, partno);
1156 if (!(disk->flags & GENHD_FL_UP) || 1156 if (!(disk->flags & GENHD_FL_UP) ||
1157 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1157 !bdev->bd_part || !bdev->bd_part->nr_sects) {
1158 ret = -ENXIO; 1158 ret = -ENXIO;
1159 goto out_clear; 1159 goto out_clear;
1160 } 1160 }
1161 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1161 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1162 } 1162 }
1163 } else { 1163 } else {
1164 if (bdev->bd_contains == bdev) { 1164 if (bdev->bd_contains == bdev) {
1165 ret = 0; 1165 ret = 0;
1166 if (bdev->bd_disk->fops->open) 1166 if (bdev->bd_disk->fops->open)
1167 ret = bdev->bd_disk->fops->open(bdev, mode); 1167 ret = bdev->bd_disk->fops->open(bdev, mode);
1168 /* the same as first opener case, read comment there */ 1168 /* the same as first opener case, read comment there */
1169 if (bdev->bd_invalidated) { 1169 if (bdev->bd_invalidated) {
1170 if (!ret) 1170 if (!ret)
1171 rescan_partitions(bdev->bd_disk, bdev); 1171 rescan_partitions(bdev->bd_disk, bdev);
1172 else if (ret == -ENOMEDIUM) 1172 else if (ret == -ENOMEDIUM)
1173 invalidate_partitions(bdev->bd_disk, bdev); 1173 invalidate_partitions(bdev->bd_disk, bdev);
1174 } 1174 }
1175 if (ret) 1175 if (ret)
1176 goto out_unlock_bdev; 1176 goto out_unlock_bdev;
1177 } 1177 }
1178 /* only one opener holds refs to the module and disk */ 1178 /* only one opener holds refs to the module and disk */
1179 put_disk(disk); 1179 put_disk(disk);
1180 module_put(owner); 1180 module_put(owner);
1181 } 1181 }
1182 bdev->bd_openers++; 1182 bdev->bd_openers++;
1183 if (for_part) 1183 if (for_part)
1184 bdev->bd_part_count++; 1184 bdev->bd_part_count++;
1185 mutex_unlock(&bdev->bd_mutex); 1185 mutex_unlock(&bdev->bd_mutex);
1186 disk_unblock_events(disk); 1186 disk_unblock_events(disk);
1187 return 0; 1187 return 0;
1188 1188
1189 out_clear: 1189 out_clear:
1190 disk_put_part(bdev->bd_part); 1190 disk_put_part(bdev->bd_part);
1191 bdev->bd_disk = NULL; 1191 bdev->bd_disk = NULL;
1192 bdev->bd_part = NULL; 1192 bdev->bd_part = NULL;
1193 bdev->bd_queue = NULL; 1193 bdev->bd_queue = NULL;
1194 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); 1194 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1195 if (bdev != bdev->bd_contains) 1195 if (bdev != bdev->bd_contains)
1196 __blkdev_put(bdev->bd_contains, mode, 1); 1196 __blkdev_put(bdev->bd_contains, mode, 1);
1197 bdev->bd_contains = NULL; 1197 bdev->bd_contains = NULL;
1198 out_unlock_bdev: 1198 out_unlock_bdev:
1199 mutex_unlock(&bdev->bd_mutex); 1199 mutex_unlock(&bdev->bd_mutex);
1200 disk_unblock_events(disk); 1200 disk_unblock_events(disk);
1201 put_disk(disk); 1201 put_disk(disk);
1202 module_put(owner); 1202 module_put(owner);
1203 out: 1203 out:
1204 bdput(bdev); 1204 bdput(bdev);
1205 1205
1206 return ret; 1206 return ret;
1207 } 1207 }
1208 1208
1209 /** 1209 /**
1210 * blkdev_get - open a block device 1210 * blkdev_get - open a block device
1211 * @bdev: block_device to open 1211 * @bdev: block_device to open
1212 * @mode: FMODE_* mask 1212 * @mode: FMODE_* mask
1213 * @holder: exclusive holder identifier 1213 * @holder: exclusive holder identifier
1214 * 1214 *
1215 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is 1215 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1216 * open with exclusive access. Specifying %FMODE_EXCL with %NULL 1216 * open with exclusive access. Specifying %FMODE_EXCL with %NULL
1217 * @holder is invalid. Exclusive opens may nest for the same @holder. 1217 * @holder is invalid. Exclusive opens may nest for the same @holder.
1218 * 1218 *
1219 * On success, the reference count of @bdev is unchanged. On failure, 1219 * On success, the reference count of @bdev is unchanged. On failure,
1220 * @bdev is put. 1220 * @bdev is put.
1221 * 1221 *
1222 * CONTEXT: 1222 * CONTEXT:
1223 * Might sleep. 1223 * Might sleep.
1224 * 1224 *
1225 * RETURNS: 1225 * RETURNS:
1226 * 0 on success, -errno on failure. 1226 * 0 on success, -errno on failure.
1227 */ 1227 */
1228 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) 1228 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1229 { 1229 {
1230 struct block_device *whole = NULL; 1230 struct block_device *whole = NULL;
1231 int res; 1231 int res;
1232 1232
1233 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); 1233 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1234 1234
1235 if ((mode & FMODE_EXCL) && holder) { 1235 if ((mode & FMODE_EXCL) && holder) {
1236 whole = bd_start_claiming(bdev, holder); 1236 whole = bd_start_claiming(bdev, holder);
1237 if (IS_ERR(whole)) { 1237 if (IS_ERR(whole)) {
1238 bdput(bdev); 1238 bdput(bdev);
1239 return PTR_ERR(whole); 1239 return PTR_ERR(whole);
1240 } 1240 }
1241 } 1241 }
1242 1242
1243 res = __blkdev_get(bdev, mode, 0); 1243 res = __blkdev_get(bdev, mode, 0);
1244 1244
1245 if (whole) { 1245 if (whole) {
1246 struct gendisk *disk = whole->bd_disk; 1246 struct gendisk *disk = whole->bd_disk;
1247 1247
1248 /* finish claiming */ 1248 /* finish claiming */
1249 mutex_lock(&bdev->bd_mutex); 1249 mutex_lock(&bdev->bd_mutex);
1250 spin_lock(&bdev_lock); 1250 spin_lock(&bdev_lock);
1251 1251
1252 if (!res) { 1252 if (!res) {
1253 BUG_ON(!bd_may_claim(bdev, whole, holder)); 1253 BUG_ON(!bd_may_claim(bdev, whole, holder));
1254 /* 1254 /*
1255 * Note that for a whole device bd_holders 1255 * Note that for a whole device bd_holders
1256 * will be incremented twice, and bd_holder 1256 * will be incremented twice, and bd_holder
1257 * will be set to bd_may_claim before being 1257 * will be set to bd_may_claim before being
1258 * set to holder 1258 * set to holder
1259 */ 1259 */
1260 whole->bd_holders++; 1260 whole->bd_holders++;
1261 whole->bd_holder = bd_may_claim; 1261 whole->bd_holder = bd_may_claim;
1262 bdev->bd_holders++; 1262 bdev->bd_holders++;
1263 bdev->bd_holder = holder; 1263 bdev->bd_holder = holder;
1264 } 1264 }
1265 1265
1266 /* tell others that we're done */ 1266 /* tell others that we're done */
1267 BUG_ON(whole->bd_claiming != holder); 1267 BUG_ON(whole->bd_claiming != holder);
1268 whole->bd_claiming = NULL; 1268 whole->bd_claiming = NULL;
1269 wake_up_bit(&whole->bd_claiming, 0); 1269 wake_up_bit(&whole->bd_claiming, 0);
1270 1270
1271 spin_unlock(&bdev_lock); 1271 spin_unlock(&bdev_lock);
1272 1272
1273 /* 1273 /*
1274 * Block event polling for write claims if requested. Any 1274 * Block event polling for write claims if requested. Any
1275 * write holder makes the write_holder state stick until 1275 * write holder makes the write_holder state stick until
1276 * all are released. This is good enough and tracking 1276 * all are released. This is good enough and tracking
1277 * individual writeable references is too fragile given the 1277 * individual writeable references is too fragile given the
1278 * way @mode is used in blkdev_get/put(). 1278 * way @mode is used in blkdev_get/put().
1279 */ 1279 */
1280 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && 1280 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1281 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { 1281 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
1282 bdev->bd_write_holder = true; 1282 bdev->bd_write_holder = true;
1283 disk_block_events(disk); 1283 disk_block_events(disk);
1284 } 1284 }
1285 1285
1286 mutex_unlock(&bdev->bd_mutex); 1286 mutex_unlock(&bdev->bd_mutex);
1287 bdput(whole); 1287 bdput(whole);
1288 } 1288 }
1289 1289
1290 return res; 1290 return res;
1291 } 1291 }
1292 EXPORT_SYMBOL(blkdev_get); 1292 EXPORT_SYMBOL(blkdev_get);
1293 1293
1294 /** 1294 /**
1295 * blkdev_get_by_path - open a block device by name 1295 * blkdev_get_by_path - open a block device by name
1296 * @path: path to the block device to open 1296 * @path: path to the block device to open
1297 * @mode: FMODE_* mask 1297 * @mode: FMODE_* mask
1298 * @holder: exclusive holder identifier 1298 * @holder: exclusive holder identifier
1299 * 1299 *
1300 * Open the blockdevice described by the device file at @path. @mode 1300 * Open the blockdevice described by the device file at @path. @mode
1301 * and @holder are identical to blkdev_get(). 1301 * and @holder are identical to blkdev_get().
1302 * 1302 *
1303 * On success, the returned block_device has reference count of one. 1303 * On success, the returned block_device has reference count of one.
1304 * 1304 *
1305 * CONTEXT: 1305 * CONTEXT:
1306 * Might sleep. 1306 * Might sleep.
1307 * 1307 *
1308 * RETURNS: 1308 * RETURNS:
1309 * Pointer to block_device on success, ERR_PTR(-errno) on failure. 1309 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1310 */ 1310 */
1311 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, 1311 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1312 void *holder) 1312 void *holder)
1313 { 1313 {
1314 struct block_device *bdev; 1314 struct block_device *bdev;
1315 int err; 1315 int err;
1316 1316
1317 bdev = lookup_bdev(path); 1317 bdev = lookup_bdev(path);
1318 if (IS_ERR(bdev)) 1318 if (IS_ERR(bdev))
1319 return bdev; 1319 return bdev;
1320 1320
1321 err = blkdev_get(bdev, mode, holder); 1321 err = blkdev_get(bdev, mode, holder);
1322 if (err) 1322 if (err)
1323 return ERR_PTR(err); 1323 return ERR_PTR(err);
1324 1324
1325 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { 1325 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1326 blkdev_put(bdev, mode); 1326 blkdev_put(bdev, mode);
1327 return ERR_PTR(-EACCES); 1327 return ERR_PTR(-EACCES);
1328 } 1328 }
1329 1329
1330 return bdev; 1330 return bdev;
1331 } 1331 }
1332 EXPORT_SYMBOL(blkdev_get_by_path); 1332 EXPORT_SYMBOL(blkdev_get_by_path);
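A hedged usage sketch, close to what a mount path does: open exclusively by path with a unique holder token, and pair it with a blkdev_put() using the same mode. my_fs_type and my_mount_like_open() are illustrative:

	static int my_mount_like_open(void)
	{
		struct block_device *bdev;

		bdev = blkdev_get_by_path("/dev/sda1",
					  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
					  &my_fs_type);	/* holder token */
		if (IS_ERR(bdev))
			return PTR_ERR(bdev);
		/* ... exclusive access to the device ... */
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
		return 0;
	}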
1333 1333
1334 /** 1334 /**
1335 * blkdev_get_by_dev - open a block device by device number 1335 * blkdev_get_by_dev - open a block device by device number
1336 * @dev: device number of block device to open 1336 * @dev: device number of block device to open
1337 * @mode: FMODE_* mask 1337 * @mode: FMODE_* mask
1338 * @holder: exclusive holder identifier 1338 * @holder: exclusive holder identifier
1339 * 1339 *
1340 * Open the blockdevice described by device number @dev. @mode and 1340 * Open the blockdevice described by device number @dev. @mode and
1341 * @holder are identical to blkdev_get(). 1341 * @holder are identical to blkdev_get().
1342 * 1342 *
1343 * Use it ONLY if you really do not have anything better - i.e. when 1343 * Use it ONLY if you really do not have anything better - i.e. when
1344 * you are behind a truly sucky interface and all you are given is a 1344 * you are behind a truly sucky interface and all you are given is a
1345 * device number. _Never_ to be used for internal purposes. If you 1345 * device number. _Never_ to be used for internal purposes. If you
1346 * ever need it - reconsider your API. 1346 * ever need it - reconsider your API.
1347 * 1347 *
1348 * On success, the returned block_device has reference count of one. 1348 * On success, the returned block_device has reference count of one.
1349 * 1349 *
1350 * CONTEXT: 1350 * CONTEXT:
1351 * Might sleep. 1351 * Might sleep.
1352 * 1352 *
1353 * RETURNS: 1353 * RETURNS:
1354 * Pointer to block_device on success, ERR_PTR(-errno) on failure. 1354 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1355 */ 1355 */
1356 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) 1356 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1357 { 1357 {
1358 struct block_device *bdev; 1358 struct block_device *bdev;
1359 int err; 1359 int err;
1360 1360
1361 bdev = bdget(dev); 1361 bdev = bdget(dev);
1362 if (!bdev) 1362 if (!bdev)
1363 return ERR_PTR(-ENOMEM); 1363 return ERR_PTR(-ENOMEM);
1364 1364
1365 err = blkdev_get(bdev, mode, holder); 1365 err = blkdev_get(bdev, mode, holder);
1366 if (err) 1366 if (err)
1367 return ERR_PTR(err); 1367 return ERR_PTR(err);
1368 1368
1369 return bdev; 1369 return bdev;
1370 } 1370 }
1371 EXPORT_SYMBOL(blkdev_get_by_dev); 1371 EXPORT_SYMBOL(blkdev_get_by_dev);
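And the by-number variant, e.g. when all you were handed is a dev_t from a module parameter; MKDEV(8, 0) (sda) is only an example, and a NULL holder means a non-exclusive open:

	static int my_open_by_devt(void)
	{
		struct block_device *bdev;

		bdev = blkdev_get_by_dev(MKDEV(8, 0), FMODE_READ, NULL);
		if (IS_ERR(bdev))
			return PTR_ERR(bdev);
		/* ... read-only, non-exclusive access ... */
		blkdev_put(bdev, FMODE_READ);
		return 0;
	}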
1372 1372
1373 static int blkdev_open(struct inode * inode, struct file * filp) 1373 static int blkdev_open(struct inode * inode, struct file * filp)
1374 { 1374 {
1375 struct block_device *bdev; 1375 struct block_device *bdev;
1376 1376
1377 /* 1377 /*
1378 * Preserve backwards compatibility and allow large file access 1378 * Preserve backwards compatibility and allow large file access
1379 * even if userspace doesn't ask for it explicitly. Some mkfs 1379 * even if userspace doesn't ask for it explicitly. Some mkfs
1380 * binaries need it. We might want to drop this workaround 1380 * binaries need it. We might want to drop this workaround
1381 * during an unstable branch. 1381 * during an unstable branch.
1382 */ 1382 */
1383 filp->f_flags |= O_LARGEFILE; 1383 filp->f_flags |= O_LARGEFILE;
1384 1384
1385 if (filp->f_flags & O_NDELAY) 1385 if (filp->f_flags & O_NDELAY)
1386 filp->f_mode |= FMODE_NDELAY; 1386 filp->f_mode |= FMODE_NDELAY;
1387 if (filp->f_flags & O_EXCL) 1387 if (filp->f_flags & O_EXCL)
1388 filp->f_mode |= FMODE_EXCL; 1388 filp->f_mode |= FMODE_EXCL;
1389 if ((filp->f_flags & O_ACCMODE) == 3) 1389 if ((filp->f_flags & O_ACCMODE) == 3)
1390 filp->f_mode |= FMODE_WRITE_IOCTL; 1390 filp->f_mode |= FMODE_WRITE_IOCTL;
1391 1391
1392 bdev = bd_acquire(inode); 1392 bdev = bd_acquire(inode);
1393 if (bdev == NULL) 1393 if (bdev == NULL)
1394 return -ENOMEM; 1394 return -ENOMEM;
1395 1395
1396 filp->f_mapping = bdev->bd_inode->i_mapping; 1396 filp->f_mapping = bdev->bd_inode->i_mapping;
1397 1397
1398 return blkdev_get(bdev, filp->f_mode, filp); 1398 return blkdev_get(bdev, filp->f_mode, filp);
1399 } 1399 }
1400 1400
1401 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1401 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1402 { 1402 {
1403 int ret = 0; 1403 int ret = 0;
1404 struct gendisk *disk = bdev->bd_disk; 1404 struct gendisk *disk = bdev->bd_disk;
1405 struct block_device *victim = NULL; 1405 struct block_device *victim = NULL;
1406 1406
1407 mutex_lock_nested(&bdev->bd_mutex, for_part); 1407 mutex_lock_nested(&bdev->bd_mutex, for_part);
1408 if (for_part) 1408 if (for_part)
1409 bdev->bd_part_count--; 1409 bdev->bd_part_count--;
1410 1410
1411 if (!--bdev->bd_openers) { 1411 if (!--bdev->bd_openers) {
1412 WARN_ON_ONCE(bdev->bd_holders); 1412 WARN_ON_ONCE(bdev->bd_holders);
1413 sync_blockdev(bdev); 1413 sync_blockdev(bdev);
1414 kill_bdev(bdev); 1414 kill_bdev(bdev);
1415 /* ->release can cause the old bdi to disappear, 1415 /* ->release can cause the old bdi to disappear,
1416 * so must switch it out first 1416 * so must switch it out first
1417 */ 1417 */
1418 bdev_inode_switch_bdi(bdev->bd_inode, 1418 bdev_inode_switch_bdi(bdev->bd_inode,
1419 &default_backing_dev_info); 1419 &default_backing_dev_info);
1420 } 1420 }
1421 if (bdev->bd_contains == bdev) { 1421 if (bdev->bd_contains == bdev) {
1422 if (disk->fops->release) 1422 if (disk->fops->release)
1423 ret = disk->fops->release(disk, mode); 1423 ret = disk->fops->release(disk, mode);
1424 } 1424 }
1425 if (!bdev->bd_openers) { 1425 if (!bdev->bd_openers) {
1426 struct module *owner = disk->fops->owner; 1426 struct module *owner = disk->fops->owner;
1427 1427
1428 disk_put_part(bdev->bd_part); 1428 disk_put_part(bdev->bd_part);
1429 bdev->bd_part = NULL; 1429 bdev->bd_part = NULL;
1430 bdev->bd_disk = NULL; 1430 bdev->bd_disk = NULL;
1431 if (bdev != bdev->bd_contains) 1431 if (bdev != bdev->bd_contains)
1432 victim = bdev->bd_contains; 1432 victim = bdev->bd_contains;
1433 bdev->bd_contains = NULL; 1433 bdev->bd_contains = NULL;
1434 1434
1435 put_disk(disk); 1435 put_disk(disk);
1436 module_put(owner); 1436 module_put(owner);
1437 } 1437 }
1438 mutex_unlock(&bdev->bd_mutex); 1438 mutex_unlock(&bdev->bd_mutex);
1439 bdput(bdev); 1439 bdput(bdev);
1440 if (victim) 1440 if (victim)
1441 __blkdev_put(victim, mode, 1); 1441 __blkdev_put(victim, mode, 1);
1442 return ret; 1442 return ret;
1443 } 1443 }
1444 1444
1445 int blkdev_put(struct block_device *bdev, fmode_t mode) 1445 int blkdev_put(struct block_device *bdev, fmode_t mode)
1446 { 1446 {
1447 mutex_lock(&bdev->bd_mutex); 1447 mutex_lock(&bdev->bd_mutex);
1448 1448
1449 if (mode & FMODE_EXCL) { 1449 if (mode & FMODE_EXCL) {
1450 bool bdev_free; 1450 bool bdev_free;
1451 1451
1452 /* 1452 /*
1453 * Release a claim on the device. The holder fields 1453 * Release a claim on the device. The holder fields
1454 * are protected with bdev_lock. bd_mutex is to 1454 * are protected with bdev_lock. bd_mutex is to
1455 * synchronize disk_holder unlinking. 1455 * synchronize disk_holder unlinking.
1456 */ 1456 */
1457 spin_lock(&bdev_lock); 1457 spin_lock(&bdev_lock);
1458 1458
1459 WARN_ON_ONCE(--bdev->bd_holders < 0); 1459 WARN_ON_ONCE(--bdev->bd_holders < 0);
1460 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); 1460 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1461 1461
1462 /* bd_contains might point to self, check in a separate step */ 1462 /* bd_contains might point to self, check in a separate step */
1463 if ((bdev_free = !bdev->bd_holders)) 1463 if ((bdev_free = !bdev->bd_holders))
1464 bdev->bd_holder = NULL; 1464 bdev->bd_holder = NULL;
1465 if (!bdev->bd_contains->bd_holders) 1465 if (!bdev->bd_contains->bd_holders)
1466 bdev->bd_contains->bd_holder = NULL; 1466 bdev->bd_contains->bd_holder = NULL;
1467 1467
1468 spin_unlock(&bdev_lock); 1468 spin_unlock(&bdev_lock);
1469 1469
1470 /* 1470 /*
1471 * If this was the last claim, remove holder link and 1471 * If this was the last claim, remove holder link and
1472 * unblock event polling if it was a write holder. 1472 * unblock event polling if it was a write holder.
1473 */ 1473 */
1474 if (bdev_free && bdev->bd_write_holder) { 1474 if (bdev_free && bdev->bd_write_holder) {
1475 disk_unblock_events(bdev->bd_disk); 1475 disk_unblock_events(bdev->bd_disk);
1476 bdev->bd_write_holder = false; 1476 bdev->bd_write_holder = false;
1477 } 1477 }
1478 } 1478 }
1479 1479
1480 /* 1480 /*
1481 * Trigger event checking and tell drivers to flush MEDIA_CHANGE 1481 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
1482 * event. This is to ensure detection of media removal commanded 1482 * event. This is to ensure detection of media removal commanded
1483 * from userland - e.g. eject(1). 1483 * from userland - e.g. eject(1).
1484 */ 1484 */
1485 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); 1485 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
1486 1486
1487 mutex_unlock(&bdev->bd_mutex); 1487 mutex_unlock(&bdev->bd_mutex);
1488 1488
1489 return __blkdev_put(bdev, mode, 0); 1489 return __blkdev_put(bdev, mode, 0);
1490 } 1490 }
1491 EXPORT_SYMBOL(blkdev_put); 1491 EXPORT_SYMBOL(blkdev_put);
1492 1492
1493 static int blkdev_close(struct inode * inode, struct file * filp) 1493 static int blkdev_close(struct inode * inode, struct file * filp)
1494 { 1494 {
1495 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1495 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1496 1496
1497 return blkdev_put(bdev, filp->f_mode); 1497 return blkdev_put(bdev, filp->f_mode);
1498 } 1498 }
1499 1499
1500 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) 1500 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1501 { 1501 {
1502 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1502 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1503 fmode_t mode = file->f_mode; 1503 fmode_t mode = file->f_mode;
1504 1504
1505 /* 1505 /*
1506 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have 1506 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1507 * to update it before every ioctl. 1507 * to update it before every ioctl.
1508 */ 1508 */
1509 if (file->f_flags & O_NDELAY) 1509 if (file->f_flags & O_NDELAY)
1510 mode |= FMODE_NDELAY; 1510 mode |= FMODE_NDELAY;
1511 else 1511 else
1512 mode &= ~FMODE_NDELAY; 1512 mode &= ~FMODE_NDELAY;
1513 1513
1514 return blkdev_ioctl(bdev, mode, cmd, arg); 1514 return blkdev_ioctl(bdev, mode, cmd, arg);
1515 } 1515 }
1516 1516
1517 /* 1517 /*
1518 * Write data to the block device. Only intended for the block device itself 1518 * Write data to the block device. Only intended for the block device itself
1519 * and the raw driver, which is basically a fake block device. 1519 * and the raw driver, which is basically a fake block device.
1520 * 1520 *
1521 * Does not take i_mutex for the write and thus is not for general purpose 1521 * Does not take i_mutex for the write and thus is not for general purpose
1522 * use. 1522 * use.
1523 */ 1523 */
1524 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, 1524 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1525 unsigned long nr_segs, loff_t pos) 1525 unsigned long nr_segs, loff_t pos)
1526 { 1526 {
1527 struct file *file = iocb->ki_filp; 1527 struct file *file = iocb->ki_filp;
1528 struct blk_plug plug; 1528 struct blk_plug plug;
1529 ssize_t ret; 1529 ssize_t ret;
1530 1530
1531 BUG_ON(iocb->ki_pos != pos); 1531 BUG_ON(iocb->ki_pos != pos);
1532 1532
1533 blk_start_plug(&plug); 1533 blk_start_plug(&plug);
1534 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1534 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1535 if (ret > 0 || ret == -EIOCBQUEUED) { 1535 if (ret > 0 || ret == -EIOCBQUEUED) {
1536 ssize_t err; 1536 ssize_t err;
1537 1537
1538 err = generic_write_sync(file, pos, ret); 1538 err = generic_write_sync(file, pos, ret);
1539 if (err < 0 && ret > 0) 1539 if (err < 0 && ret > 0)
1540 ret = err; 1540 ret = err;
1541 } 1541 }
1542 blk_finish_plug(&plug); 1542 blk_finish_plug(&plug);
1543 return ret; 1543 return ret;
1544 } 1544 }
1545 EXPORT_SYMBOL_GPL(blkdev_aio_write); 1545 EXPORT_SYMBOL_GPL(blkdev_aio_write);
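The plugging idiom used above, shown in isolation as a sketch: requests submitted between the two calls are batched per-task and flushed to the driver at blk_finish_plug():

	static void my_batched_submit(void)
	{
		struct blk_plug plug;

		blk_start_plug(&plug);
		/* ... submit a batch of bios/requests here ... */
		blk_finish_plug(&plug);	/* batch handed to the driver */
	}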
1546 1546
1547 static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, 1547 static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1548 unsigned long nr_segs, loff_t pos) 1548 unsigned long nr_segs, loff_t pos)
1549 { 1549 {
1550 struct file *file = iocb->ki_filp; 1550 struct file *file = iocb->ki_filp;
1551 struct inode *bd_inode = file->f_mapping->host; 1551 struct inode *bd_inode = file->f_mapping->host;
1552 loff_t size = i_size_read(bd_inode); 1552 loff_t size = i_size_read(bd_inode);
1553 1553
1554 if (pos >= size) 1554 if (pos >= size)
1555 return 0; 1555 return 0;
1556 1556
1557 size -= pos; 1557 size -= pos;
1558 if (size < INT_MAX) 1558 if (size < INT_MAX)
1559 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); 1559 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
1560 return generic_file_aio_read(iocb, iov, nr_segs, pos); 1560 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1561 } 1561 }
1562 1562
1563 /* 1563 /*
1564 * Try to release a page associated with the block device when the system 1564 * Try to release a page associated with the block device when the system
1565 * is under memory pressure. 1565 * is under memory pressure.
1566 */ 1566 */
1567 static int blkdev_releasepage(struct page *page, gfp_t wait) 1567 static int blkdev_releasepage(struct page *page, gfp_t wait)
1568 { 1568 {
1569 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; 1569 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1570 1570
1571 if (super && super->s_op->bdev_try_to_free_page) 1571 if (super && super->s_op->bdev_try_to_free_page)
1572 return super->s_op->bdev_try_to_free_page(super, page, wait); 1572 return super->s_op->bdev_try_to_free_page(super, page, wait);
1573 1573
1574 return try_to_free_buffers(page); 1574 return try_to_free_buffers(page);
1575 } 1575 }
1576 1576
1577 static const struct address_space_operations def_blk_aops = { 1577 static const struct address_space_operations def_blk_aops = {
1578 .readpage = blkdev_readpage, 1578 .readpage = blkdev_readpage,
1579 .writepage = blkdev_writepage, 1579 .writepage = blkdev_writepage,
1580 .write_begin = blkdev_write_begin, 1580 .write_begin = blkdev_write_begin,
1581 .write_end = blkdev_write_end, 1581 .write_end = blkdev_write_end,
1582 .writepages = generic_writepages, 1582 .writepages = generic_writepages,
1583 .releasepage = blkdev_releasepage, 1583 .releasepage = blkdev_releasepage,
1584 .direct_IO = blkdev_direct_IO, 1584 .direct_IO = blkdev_direct_IO,
1585 }; 1585 };
1586 1586
1587 const struct file_operations def_blk_fops = { 1587 const struct file_operations def_blk_fops = {
1588 .open = blkdev_open, 1588 .open = blkdev_open,
1589 .release = blkdev_close, 1589 .release = blkdev_close,
1590 .llseek = block_llseek, 1590 .llseek = block_llseek,
1591 .read = do_sync_read, 1591 .read = do_sync_read,
1592 .write = do_sync_write, 1592 .write = do_sync_write,
1593 .aio_read = blkdev_aio_read, 1593 .aio_read = blkdev_aio_read,
1594 .aio_write = blkdev_aio_write, 1594 .aio_write = blkdev_aio_write,
1595 .mmap = generic_file_mmap, 1595 .mmap = generic_file_mmap,
1596 .fsync = blkdev_fsync, 1596 .fsync = blkdev_fsync,
1597 .unlocked_ioctl = block_ioctl, 1597 .unlocked_ioctl = block_ioctl,
1598 #ifdef CONFIG_COMPAT 1598 #ifdef CONFIG_COMPAT
1599 .compat_ioctl = compat_blkdev_ioctl, 1599 .compat_ioctl = compat_blkdev_ioctl,
1600 #endif 1600 #endif
1601 .splice_read = generic_file_splice_read, 1601 .splice_read = generic_file_splice_read,
1602 .splice_write = generic_file_splice_write, 1602 .splice_write = generic_file_splice_write,
1603 }; 1603 };
1604 1604
1605 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1605 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1606 { 1606 {
1607 int res; 1607 int res;
1608 mm_segment_t old_fs = get_fs(); 1608 mm_segment_t old_fs = get_fs();
1609 set_fs(KERNEL_DS); 1609 set_fs(KERNEL_DS);
1610 res = blkdev_ioctl(bdev, 0, cmd, arg); 1610 res = blkdev_ioctl(bdev, 0, cmd, arg);
1611 set_fs(old_fs); 1611 set_fs(old_fs);
1612 return res; 1612 return res;
1613 } 1613 }
1614 1614
1615 EXPORT_SYMBOL(ioctl_by_bdev); 1615 EXPORT_SYMBOL(ioctl_by_bdev);
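Because of the set_fs(KERNEL_DS) window, @arg may point into kernel memory here. A hedged example querying the device size in bytes; my_bdev_size() is an invented wrapper:

	static int my_bdev_size(struct block_device *bdev, u64 *bytes)
	{
		/* kernel pointer is fine: the ioctl runs under KERNEL_DS */
		return ioctl_by_bdev(bdev, BLKGETSIZE64, (unsigned long)bytes);
	}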
1616 1616
1617 /** 1617 /**
1618 * lookup_bdev - lookup a struct block_device by name 1618 * lookup_bdev - lookup a struct block_device by name
1619 * @pathname: special file representing the block device 1619 * @pathname: special file representing the block device
1620 * 1620 *
1621 * Get a reference to the blockdevice at @pathname in the current 1621 * Get a reference to the blockdevice at @pathname in the current
1622 * namespace if possible and return it. Return ERR_PTR(error) 1622 * namespace if possible and return it. Return ERR_PTR(error)
1623 * otherwise. 1623 * otherwise.
1624 */ 1624 */
1625 struct block_device *lookup_bdev(const char *pathname) 1625 struct block_device *lookup_bdev(const char *pathname)
1626 { 1626 {
1627 struct block_device *bdev; 1627 struct block_device *bdev;
1628 struct inode *inode; 1628 struct inode *inode;
1629 struct path path; 1629 struct path path;
1630 int error; 1630 int error;
1631 1631
1632 if (!pathname || !*pathname) 1632 if (!pathname || !*pathname)
1633 return ERR_PTR(-EINVAL); 1633 return ERR_PTR(-EINVAL);
1634 1634
1635 error = kern_path(pathname, LOOKUP_FOLLOW, &path); 1635 error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1636 if (error) 1636 if (error)
1637 return ERR_PTR(error); 1637 return ERR_PTR(error);
1638 1638
1639 inode = path.dentry->d_inode; 1639 inode = path.dentry->d_inode;
1640 error = -ENOTBLK; 1640 error = -ENOTBLK;
1641 if (!S_ISBLK(inode->i_mode)) 1641 if (!S_ISBLK(inode->i_mode))
1642 goto fail; 1642 goto fail;
1643 error = -EACCES; 1643 error = -EACCES;
1644 if (path.mnt->mnt_flags & MNT_NODEV) 1644 if (path.mnt->mnt_flags & MNT_NODEV)
1645 goto fail; 1645 goto fail;
1646 error = -ENOMEM; 1646 error = -ENOMEM;
1647 bdev = bd_acquire(inode); 1647 bdev = bd_acquire(inode);
1648 if (!bdev) 1648 if (!bdev)
1649 goto fail; 1649 goto fail;
1650 out: 1650 out:
1651 path_put(&path); 1651 path_put(&path);
1652 return bdev; 1652 return bdev;
1653 fail: 1653 fail:
1654 bdev = ERR_PTR(error); 1654 bdev = ERR_PTR(error);
1655 goto out; 1655 goto out;
1656 } 1656 }
1657 EXPORT_SYMBOL(lookup_bdev); 1657 EXPORT_SYMBOL(lookup_bdev);
1658 1658
1659 int __invalidate_device(struct block_device *bdev, bool kill_dirty) 1659 int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1660 { 1660 {
1661 struct super_block *sb = get_super(bdev); 1661 struct super_block *sb = get_super(bdev);
1662 int res = 0; 1662 int res = 0;
1663 1663
1664 if (sb) { 1664 if (sb) {
1665 /* 1665 /*
1666 * no need to lock the super, get_super holds the 1666 * no need to lock the super, get_super holds the
1667 * read mutex so the filesystem cannot go away 1667 * read mutex so the filesystem cannot go away
1668 * under us (->put_super runs with the write lock 1668 * under us (->put_super runs with the write lock
1669 * held). 1669 * held).
1670 */ 1670 */
1671 shrink_dcache_sb(sb); 1671 shrink_dcache_sb(sb);
1672 res = invalidate_inodes(sb, kill_dirty); 1672 res = invalidate_inodes(sb, kill_dirty);
1673 drop_super(sb); 1673 drop_super(sb);
1674 } 1674 }
1675 invalidate_bdev(bdev); 1675 invalidate_bdev(bdev);
1676 return res; 1676 return res;
1677 } 1677 }
1678 EXPORT_SYMBOL(__invalidate_device); 1678 EXPORT_SYMBOL(__invalidate_device);
1679 1679
1680 void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) 1680 void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
1681 { 1681 {
1682 struct inode *inode, *old_inode = NULL; 1682 struct inode *inode, *old_inode = NULL;
1683 1683
1684 spin_lock(&inode_sb_list_lock); 1684 spin_lock(&inode_sb_list_lock);
1685 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { 1685 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
1686 struct address_space *mapping = inode->i_mapping; 1686 struct address_space *mapping = inode->i_mapping;
1687 1687
1688 spin_lock(&inode->i_lock); 1688 spin_lock(&inode->i_lock);
1689 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || 1689 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
1690 mapping->nrpages == 0) { 1690 mapping->nrpages == 0) {
1691 spin_unlock(&inode->i_lock); 1691 spin_unlock(&inode->i_lock);
1692 continue; 1692 continue;
1693 } 1693 }
1694 __iget(inode); 1694 __iget(inode);
1695 spin_unlock(&inode->i_lock); 1695 spin_unlock(&inode->i_lock);
1696 spin_unlock(&inode_sb_list_lock); 1696 spin_unlock(&inode_sb_list_lock);
1697 /* 1697 /*
1698 * We hold a reference to 'inode' so it couldn't have been 1698 * We hold a reference to 'inode' so it couldn't have been
1699 * removed from s_inodes list while we dropped the 1699 * removed from s_inodes list while we dropped the
1700 * inode_sb_list_lock. We cannot iput the inode now as we can 1700 * inode_sb_list_lock. We cannot iput the inode now as we can
1701 * be holding the last reference and we cannot iput it under 1701 * be holding the last reference and we cannot iput it under
1702 * inode_sb_list_lock. So we keep the reference and iput it 1702 * inode_sb_list_lock. So we keep the reference and iput it
1703 * later. 1703 * later.
1704 */ 1704 */
1705 iput(old_inode); 1705 iput(old_inode);
1706 old_inode = inode; 1706 old_inode = inode;
1707 1707
1708 func(I_BDEV(inode), arg); 1708 func(I_BDEV(inode), arg);
1709 1709
1710 spin_lock(&inode_sb_list_lock); 1710 spin_lock(&inode_sb_list_lock);
1711 } 1711 }
1712 spin_unlock(&inode_sb_list_lock); 1712 spin_unlock(&inode_sb_list_lock);
1713 iput(old_inode); 1713 iput(old_inode);
1714 } 1714 }
1715 1715
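A sketch of how a caller might drive iterate_bdevs() (assumed usage, modeled on the sync path; flush_one_bdev is a hypothetical name). The iterator drops its spinlock and holds an inode reference across each callback, so the callback may sleep:

	static void flush_one_bdev(struct block_device *bdev, void *arg)
	{
		/* start writeback on this device's page cache */
		filemap_fdatawrite(bdev->bd_inode->i_mapping);
	}

	/* ... */
	iterate_bdevs(flush_one_bdev, NULL);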
1 /* 1 /*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation. 6 * License v2 as published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it will be useful, 8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details. 11 * General Public License for more details.
12 * 12 *
13 * You should have received a copy of the GNU General Public 13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the 14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/pagemap.h> 20 #include <linux/pagemap.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/time.h> 22 #include <linux/time.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/string.h> 24 #include <linux/string.h>
25 #include <linux/backing-dev.h> 25 #include <linux/backing-dev.h>
26 #include <linux/mpage.h> 26 #include <linux/mpage.h>
27 #include <linux/falloc.h> 27 #include <linux/falloc.h>
28 #include <linux/swap.h> 28 #include <linux/swap.h>
29 #include <linux/writeback.h> 29 #include <linux/writeback.h>
30 #include <linux/statfs.h> 30 #include <linux/statfs.h>
31 #include <linux/compat.h> 31 #include <linux/compat.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include "ctree.h" 33 #include "ctree.h"
34 #include "disk-io.h" 34 #include "disk-io.h"
35 #include "transaction.h" 35 #include "transaction.h"
36 #include "btrfs_inode.h" 36 #include "btrfs_inode.h"
37 #include "ioctl.h" 37 #include "ioctl.h"
38 #include "print-tree.h" 38 #include "print-tree.h"
39 #include "tree-log.h" 39 #include "tree-log.h"
40 #include "locking.h" 40 #include "locking.h"
41 #include "compat.h" 41 #include "compat.h"
42 #include "volumes.h" 42 #include "volumes.h"
43 43
44 /* 44 /*
45 * when auto defrag is enabled, we 45 * when auto defrag is enabled, we
46 * queue up these defrag structs to remember which 46 * queue up these defrag structs to remember which
47 * inodes need defragging passes 47 * inodes need defragging passes
48 */ 48 */
49 struct inode_defrag { 49 struct inode_defrag {
50 struct rb_node rb_node; 50 struct rb_node rb_node;
51 /* objectid */ 51 /* objectid */
52 u64 ino; 52 u64 ino;
53 /* 53 /*
54 * transid where the defrag was added; we search for 54 * transid where the defrag was added; we search for
55 * extents newer than this 55 * extents newer than this
56 */ 56 */
57 u64 transid; 57 u64 transid;
58 58
59 /* root objectid */ 59 /* root objectid */
60 u64 root; 60 u64 root;
61 61
62 /* last offset we were able to defrag */ 62 /* last offset we were able to defrag */
63 u64 last_offset; 63 u64 last_offset;
64 64
65 /* if we've wrapped around back to zero once already */ 65 /* if we've wrapped around back to zero once already */
66 int cycled; 66 int cycled;
67 }; 67 };
68 68
69 static int __compare_inode_defrag(struct inode_defrag *defrag1, 69 static int __compare_inode_defrag(struct inode_defrag *defrag1,
70 struct inode_defrag *defrag2) 70 struct inode_defrag *defrag2)
71 { 71 {
72 if (defrag1->root > defrag2->root) 72 if (defrag1->root > defrag2->root)
73 return 1; 73 return 1;
74 else if (defrag1->root < defrag2->root) 74 else if (defrag1->root < defrag2->root)
75 return -1; 75 return -1;
76 else if (defrag1->ino > defrag2->ino) 76 else if (defrag1->ino > defrag2->ino)
77 return 1; 77 return 1;
78 else if (defrag1->ino < defrag2->ino) 78 else if (defrag1->ino < defrag2->ino)
79 return -1; 79 return -1;
80 else 80 else
81 return 0; 81 return 0;
82 } 82 }
83 83
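/*
 * The comparison above is lexicographic on (root, ino): the defrag
 * rbtree is keyed first by root objectid, then by inode number
 * within that root.
 */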
84 /* insert a record for an inode into the defrag tree. The lock 84 /* insert a record for an inode into the defrag tree. The lock
85 * must be held already 85 * must be held already
86 * 86 *
87 * If you're inserting a record for an older transid than an 87 * If you're inserting a record for an older transid than an
88 * existing record, the transid already in the tree is lowered 88 * existing record, the transid already in the tree is lowered
89 * 89 *
90 * If an existing record is found the defrag item you 90 * If an existing record is found the defrag item you
91 * pass in is freed 91 * pass in is freed
92 */ 92 */
93 static void __btrfs_add_inode_defrag(struct inode *inode, 93 static void __btrfs_add_inode_defrag(struct inode *inode,
94 struct inode_defrag *defrag) 94 struct inode_defrag *defrag)
95 { 95 {
96 struct btrfs_root *root = BTRFS_I(inode)->root; 96 struct btrfs_root *root = BTRFS_I(inode)->root;
97 struct inode_defrag *entry; 97 struct inode_defrag *entry;
98 struct rb_node **p; 98 struct rb_node **p;
99 struct rb_node *parent = NULL; 99 struct rb_node *parent = NULL;
100 int ret; 100 int ret;
101 101
102 p = &root->fs_info->defrag_inodes.rb_node; 102 p = &root->fs_info->defrag_inodes.rb_node;
103 while (*p) { 103 while (*p) {
104 parent = *p; 104 parent = *p;
105 entry = rb_entry(parent, struct inode_defrag, rb_node); 105 entry = rb_entry(parent, struct inode_defrag, rb_node);
106 106
107 ret = __compare_inode_defrag(defrag, entry); 107 ret = __compare_inode_defrag(defrag, entry);
108 if (ret < 0) 108 if (ret < 0)
109 p = &parent->rb_left; 109 p = &parent->rb_left;
110 else if (ret > 0) 110 else if (ret > 0)
111 p = &parent->rb_right; 111 p = &parent->rb_right;
112 else { 112 else {
113 /* if we're reinserting an entry for 113 /* if we're reinserting an entry for
114 * an old defrag run, make sure to 114 * an old defrag run, make sure to
115 * lower the transid of our existing record 115 * lower the transid of our existing record
116 */ 116 */
117 if (defrag->transid < entry->transid) 117 if (defrag->transid < entry->transid)
118 entry->transid = defrag->transid; 118 entry->transid = defrag->transid;
119 if (defrag->last_offset > entry->last_offset) 119 if (defrag->last_offset > entry->last_offset)
120 entry->last_offset = defrag->last_offset; 120 entry->last_offset = defrag->last_offset;
121 goto exists; 121 goto exists;
122 } 122 }
123 } 123 }
124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
125 rb_link_node(&defrag->rb_node, parent, p); 125 rb_link_node(&defrag->rb_node, parent, p);
126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
127 return; 127 return;
128 128
129 exists: 129 exists:
130 kfree(defrag); 130 kfree(defrag);
131 return; 131 return;
132 132
133 } 133 }
134 134
135 /* 135 /*
136 * insert a defrag record for this inode if auto defrag is 136 * insert a defrag record for this inode if auto defrag is
137 * enabled 137 * enabled
138 */ 138 */
139 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 139 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
140 struct inode *inode) 140 struct inode *inode)
141 { 141 {
142 struct btrfs_root *root = BTRFS_I(inode)->root; 142 struct btrfs_root *root = BTRFS_I(inode)->root;
143 struct inode_defrag *defrag; 143 struct inode_defrag *defrag;
144 u64 transid; 144 u64 transid;
145 145
146 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 146 if (!btrfs_test_opt(root, AUTO_DEFRAG))
147 return 0; 147 return 0;
148 148
149 if (btrfs_fs_closing(root->fs_info)) 149 if (btrfs_fs_closing(root->fs_info))
150 return 0; 150 return 0;
151 151
152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
153 return 0; 153 return 0;
154 154
155 if (trans) 155 if (trans)
156 transid = trans->transid; 156 transid = trans->transid;
157 else 157 else
158 transid = BTRFS_I(inode)->root->last_trans; 158 transid = BTRFS_I(inode)->root->last_trans;
159 159
160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
161 if (!defrag) 161 if (!defrag)
162 return -ENOMEM; 162 return -ENOMEM;
163 163
164 defrag->ino = btrfs_ino(inode); 164 defrag->ino = btrfs_ino(inode);
165 defrag->transid = transid; 165 defrag->transid = transid;
166 defrag->root = root->root_key.objectid; 166 defrag->root = root->root_key.objectid;
167 167
168 spin_lock(&root->fs_info->defrag_inodes_lock); 168 spin_lock(&root->fs_info->defrag_inodes_lock);
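	/*
	 * Re-check IN_DEFRAG under defrag_inodes_lock: a racing caller
	 * may have inserted a record for this inode since the unlocked
	 * test above, and we must not add a duplicate.
	 */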
169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
170 __btrfs_add_inode_defrag(inode, defrag); 170 __btrfs_add_inode_defrag(inode, defrag);
171 else 171 else
172 kfree(defrag); 172 kfree(defrag);
173 spin_unlock(&root->fs_info->defrag_inodes_lock); 173 spin_unlock(&root->fs_info->defrag_inodes_lock);
174 return 0; 174 return 0;
175 } 175 }
176 176
177 /* 177 /*
178 * must be called with the defrag_inodes lock held 178 * must be called with the defrag_inodes lock held
179 */ 179 */
180 struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 180 struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
181 u64 root, u64 ino, 181 u64 root, u64 ino,
182 struct rb_node **next) 182 struct rb_node **next)
183 { 183 {
184 struct inode_defrag *entry = NULL; 184 struct inode_defrag *entry = NULL;
185 struct inode_defrag tmp; 185 struct inode_defrag tmp;
186 struct rb_node *p; 186 struct rb_node *p;
187 struct rb_node *parent = NULL; 187 struct rb_node *parent = NULL;
188 int ret; 188 int ret;
189 189
190 tmp.ino = ino; 190 tmp.ino = ino;
191 tmp.root = root; 191 tmp.root = root;
192 192
193 p = info->defrag_inodes.rb_node; 193 p = info->defrag_inodes.rb_node;
194 while (p) { 194 while (p) {
195 parent = p; 195 parent = p;
196 entry = rb_entry(parent, struct inode_defrag, rb_node); 196 entry = rb_entry(parent, struct inode_defrag, rb_node);
197 197
198 ret = __compare_inode_defrag(&tmp, entry); 198 ret = __compare_inode_defrag(&tmp, entry);
199 if (ret < 0) 199 if (ret < 0)
200 p = parent->rb_left; 200 p = parent->rb_left;
201 else if (ret > 0) 201 else if (ret > 0)
202 p = parent->rb_right; 202 p = parent->rb_right;
203 else 203 else
204 return entry; 204 return entry;
205 } 205 }
206 206
207 if (next) { 207 if (next) {
208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
209 parent = rb_next(parent); 209 parent = rb_next(parent);
210 entry = rb_entry(parent, struct inode_defrag, rb_node); 210 entry = rb_entry(parent, struct inode_defrag, rb_node);
211 } 211 }
212 *next = parent; 212 *next = parent;
213 } 213 }
214 return NULL; 214 return NULL;
215 } 215 }
216 216
217 /* 217 /*
218 * run through the list of inodes in the FS that need 218 * run through the list of inodes in the FS that need
219 * defragging 219 * defragging
220 */ 220 */
221 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 221 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
222 { 222 {
223 struct inode_defrag *defrag; 223 struct inode_defrag *defrag;
224 struct btrfs_root *inode_root; 224 struct btrfs_root *inode_root;
225 struct inode *inode; 225 struct inode *inode;
226 struct rb_node *n; 226 struct rb_node *n;
227 struct btrfs_key key; 227 struct btrfs_key key;
228 struct btrfs_ioctl_defrag_range_args range; 228 struct btrfs_ioctl_defrag_range_args range;
229 u64 first_ino = 0; 229 u64 first_ino = 0;
230 u64 root_objectid = 0; 230 u64 root_objectid = 0;
231 int num_defrag; 231 int num_defrag;
232 int defrag_batch = 1024; 232 int defrag_batch = 1024;
233 233
234 memset(&range, 0, sizeof(range)); 234 memset(&range, 0, sizeof(range));
235 range.len = (u64)-1; 235 range.len = (u64)-1;
236 236
237 atomic_inc(&fs_info->defrag_running); 237 atomic_inc(&fs_info->defrag_running);
238 spin_lock(&fs_info->defrag_inodes_lock); 238 spin_lock(&fs_info->defrag_inodes_lock);
239 while (1) { 239 while (1) {
240 n = NULL; 240 n = NULL;
241 241
242 /* find an inode to defrag */ 242 /* find an inode to defrag */
243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
244 first_ino, &n); 244 first_ino, &n);
245 if (!defrag) { 245 if (!defrag) {
246 if (n) { 246 if (n) {
247 defrag = rb_entry(n, struct inode_defrag, 247 defrag = rb_entry(n, struct inode_defrag,
248 rb_node); 248 rb_node);
249 } else if (root_objectid || first_ino) { 249 } else if (root_objectid || first_ino) {
250 root_objectid = 0; 250 root_objectid = 0;
251 first_ino = 0; 251 first_ino = 0;
252 continue; 252 continue;
253 } else { 253 } else {
254 break; 254 break;
255 } 255 }
256 } 256 }
257 257
258 /* remove it from the rbtree */ 258 /* remove it from the rbtree */
259 first_ino = defrag->ino + 1; 259 first_ino = defrag->ino + 1;
260 root_objectid = defrag->root; 260 root_objectid = defrag->root;
261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
262 262
263 if (btrfs_fs_closing(fs_info)) 263 if (btrfs_fs_closing(fs_info))
264 goto next_free; 264 goto next_free;
265 265
266 spin_unlock(&fs_info->defrag_inodes_lock); 266 spin_unlock(&fs_info->defrag_inodes_lock);
267 267
268 /* get the inode */ 268 /* get the inode */
269 key.objectid = defrag->root; 269 key.objectid = defrag->root;
270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
271 key.offset = (u64)-1; 271 key.offset = (u64)-1;
272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
273 if (IS_ERR(inode_root)) 273 if (IS_ERR(inode_root))
274 goto next; 274 goto next;
275 275
276 key.objectid = defrag->ino; 276 key.objectid = defrag->ino;
277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
278 key.offset = 0; 278 key.offset = 0;
279 279
280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
281 if (IS_ERR(inode)) 281 if (IS_ERR(inode))
282 goto next; 282 goto next;
283 283
284 /* do a chunk of defrag */ 284 /* do a chunk of defrag */
285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
286 range.start = defrag->last_offset; 286 range.start = defrag->last_offset;
287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
288 defrag_batch); 288 defrag_batch);
289 /* 289 /*
290 * if we filled the whole defrag batch, there 290 * if we filled the whole defrag batch, there
291 * must be more work to do. Queue this defrag 291 * must be more work to do. Queue this defrag
292 * again 292 * again
293 */ 293 */
294 if (num_defrag == defrag_batch) { 294 if (num_defrag == defrag_batch) {
295 defrag->last_offset = range.start; 295 defrag->last_offset = range.start;
296 __btrfs_add_inode_defrag(inode, defrag); 296 __btrfs_add_inode_defrag(inode, defrag);
297 /* 297 /*
298 * we don't want to kfree defrag, we added it back to 298 * we don't want to kfree defrag, we added it back to
299 * the rbtree 299 * the rbtree
300 */ 300 */
301 defrag = NULL; 301 defrag = NULL;
302 } else if (defrag->last_offset && !defrag->cycled) { 302 } else if (defrag->last_offset && !defrag->cycled) {
303 /* 303 /*
304 * we didn't fill our defrag batch, but 304 * we didn't fill our defrag batch, but
305 * we didn't start at zero. Make sure we loop 305 * we didn't start at zero. Make sure we loop
306 * around to the start of the file. 306 * around to the start of the file.
307 */ 307 */
308 defrag->last_offset = 0; 308 defrag->last_offset = 0;
309 defrag->cycled = 1; 309 defrag->cycled = 1;
310 __btrfs_add_inode_defrag(inode, defrag); 310 __btrfs_add_inode_defrag(inode, defrag);
311 defrag = NULL; 311 defrag = NULL;
312 } 312 }
313 313
314 iput(inode); 314 iput(inode);
315 next: 315 next:
316 spin_lock(&fs_info->defrag_inodes_lock); 316 spin_lock(&fs_info->defrag_inodes_lock);
317 next_free: 317 next_free:
318 kfree(defrag); 318 kfree(defrag);
319 } 319 }
320 spin_unlock(&fs_info->defrag_inodes_lock); 320 spin_unlock(&fs_info->defrag_inodes_lock);
321 321
322 atomic_dec(&fs_info->defrag_running); 322 atomic_dec(&fs_info->defrag_running);
323 323
324 /* 324 /*
325 * during unmount, we use the transaction_wait queue to 325 * during unmount, we use the transaction_wait queue to
326 * wait for the defragger to stop 326 * wait for the defragger to stop
327 */ 327 */
328 wake_up(&fs_info->transaction_wait); 328 wake_up(&fs_info->transaction_wait);
329 return 0; 329 return 0;
330 } 330 }
331 331
332 /* simple helper to fault in pages and copy. This should go away 332 /* simple helper to fault in pages and copy. This should go away
333 * and be replaced with calls into generic code. 333 * and be replaced with calls into generic code.
334 */ 334 */
335 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 335 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
336 size_t write_bytes, 336 size_t write_bytes,
337 struct page **prepared_pages, 337 struct page **prepared_pages,
338 struct iov_iter *i) 338 struct iov_iter *i)
339 { 339 {
340 size_t copied = 0; 340 size_t copied = 0;
341 size_t total_copied = 0; 341 size_t total_copied = 0;
342 int pg = 0; 342 int pg = 0;
343 int offset = pos & (PAGE_CACHE_SIZE - 1); 343 int offset = pos & (PAGE_CACHE_SIZE - 1);
344 344
345 while (write_bytes > 0) { 345 while (write_bytes > 0) {
346 size_t count = min_t(size_t, 346 size_t count = min_t(size_t,
347 PAGE_CACHE_SIZE - offset, write_bytes); 347 PAGE_CACHE_SIZE - offset, write_bytes);
348 struct page *page = prepared_pages[pg]; 348 struct page *page = prepared_pages[pg];
349 /* 349 /*
350 * Copy data from userspace to the current page 350 * Copy data from userspace to the current page
351 * 351 *
352 * Disable pagefault to avoid recursive lock since 352 * Disable pagefault to avoid recursive lock since
353 * the pages are already locked 353 * the pages are already locked
354 */ 354 */
355 pagefault_disable(); 355 pagefault_disable();
356 copied = iov_iter_copy_from_user_atomic(page, i, offset, count); 356 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
357 pagefault_enable(); 357 pagefault_enable();
358 358
359 /* Flush processor's dcache for this page */ 359 /* Flush processor's dcache for this page */
360 flush_dcache_page(page); 360 flush_dcache_page(page);
361 361
362 /* 362 /*
363 * if we get a partial write, we can end up with 363 * if we get a partial write, we can end up with
364 * partially up to date pages. These add 364 * partially up to date pages. These add
365 * a lot of complexity, so make sure they don't 365 * a lot of complexity, so make sure they don't
366 * happen by forcing this copy to be retried. 366 * happen by forcing this copy to be retried.
367 * 367 *
368 * The rest of the btrfs_file_write code will fall 368 * The rest of the btrfs_file_write code will fall
369 * back to page at a time copies after we return 0. 369 * back to page at a time copies after we return 0.
370 */ 370 */
371 if (!PageUptodate(page) && copied < count) 371 if (!PageUptodate(page) && copied < count)
372 copied = 0; 372 copied = 0;
373 373
374 iov_iter_advance(i, copied); 374 iov_iter_advance(i, copied);
375 write_bytes -= copied; 375 write_bytes -= copied;
376 total_copied += copied; 376 total_copied += copied;
377 377
378 /* Return to btrfs_file_aio_write to fault page */ 378 /* Return to btrfs_file_aio_write to fault page */
379 if (unlikely(copied == 0)) 379 if (unlikely(copied == 0))
380 break; 380 break;
381 381
382 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 382 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
383 offset += copied; 383 offset += copied;
384 } else { 384 } else {
385 pg++; 385 pg++;
386 offset = 0; 386 offset = 0;
387 } 387 }
388 } 388 }
389 return total_copied; 389 return total_copied;
390 } 390 }
391 391
392 /* 392 /*
393 * unlocks pages after btrfs_file_write is done with them 393 * unlocks pages after btrfs_file_write is done with them
394 */ 394 */
395 void btrfs_drop_pages(struct page **pages, size_t num_pages) 395 void btrfs_drop_pages(struct page **pages, size_t num_pages)
396 { 396 {
397 size_t i; 397 size_t i;
398 for (i = 0; i < num_pages; i++) { 398 for (i = 0; i < num_pages; i++) {
399 /* page checked is some magic around finding pages that 399 /* page checked is some magic around finding pages that
400 * have been modified without going through btrfs_set_page_dirty; 400 * have been modified without going through btrfs_set_page_dirty;
401 * clear it here 401 * clear it here
402 */ 402 */
403 ClearPageChecked(pages[i]); 403 ClearPageChecked(pages[i]);
404 unlock_page(pages[i]); 404 unlock_page(pages[i]);
405 mark_page_accessed(pages[i]); 405 mark_page_accessed(pages[i]);
406 page_cache_release(pages[i]); 406 page_cache_release(pages[i]);
407 } 407 }
408 } 408 }
409 409
410 /* 410 /*
411 * after copy_from_user, pages need to be dirtied and we need to make 411 * after copy_from_user, pages need to be dirtied and we need to make
412 * sure holes are created between the current EOF and the start of 412 * sure holes are created between the current EOF and the start of
413 * any next extents (if required). 413 * any next extents (if required).
414 * 414 *
415 * this also makes the decision about creating an inline extent vs 415 * this also makes the decision about creating an inline extent vs
416 * doing real data extents, marking pages dirty and delalloc as required. 416 * doing real data extents, marking pages dirty and delalloc as required.
417 */ 417 */
418 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, 418 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
419 struct page **pages, size_t num_pages, 419 struct page **pages, size_t num_pages,
420 loff_t pos, size_t write_bytes, 420 loff_t pos, size_t write_bytes,
421 struct extent_state **cached) 421 struct extent_state **cached)
422 { 422 {
423 int err = 0; 423 int err = 0;
424 int i; 424 int i;
425 u64 num_bytes; 425 u64 num_bytes;
426 u64 start_pos; 426 u64 start_pos;
427 u64 end_of_last_block; 427 u64 end_of_last_block;
428 u64 end_pos = pos + write_bytes; 428 u64 end_pos = pos + write_bytes;
429 loff_t isize = i_size_read(inode); 429 loff_t isize = i_size_read(inode);
430 430
431 start_pos = pos & ~((u64)root->sectorsize - 1); 431 start_pos = pos & ~((u64)root->sectorsize - 1);
432 num_bytes = (write_bytes + pos - start_pos + 432 num_bytes = (write_bytes + pos - start_pos +
433 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 433 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
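	/*
	 * Worked example: with a 4096-byte sectorsize, pos = 5000 and
	 * write_bytes = 100 give start_pos = 4096 and num_bytes = 4096,
	 * i.e. the range is widened to cover whole sectors.
	 */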
434 434
435 end_of_last_block = start_pos + num_bytes - 1; 435 end_of_last_block = start_pos + num_bytes - 1;
436 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 436 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
437 cached); 437 cached);
438 if (err) 438 if (err)
439 return err; 439 return err;
440 440
441 for (i = 0; i < num_pages; i++) { 441 for (i = 0; i < num_pages; i++) {
442 struct page *p = pages[i]; 442 struct page *p = pages[i];
443 SetPageUptodate(p); 443 SetPageUptodate(p);
444 ClearPageChecked(p); 444 ClearPageChecked(p);
445 set_page_dirty(p); 445 set_page_dirty(p);
446 } 446 }
447 447
448 /* 448 /*
449 * we've only changed i_size in ram, and we haven't updated 449 * we've only changed i_size in ram, and we haven't updated
450 * the disk i_size. There is no need to log the inode 450 * the disk i_size. There is no need to log the inode
451 * at this time. 451 * at this time.
452 */ 452 */
453 if (end_pos > isize) 453 if (end_pos > isize)
454 i_size_write(inode, end_pos); 454 i_size_write(inode, end_pos);
455 return 0; 455 return 0;
456 } 456 }
457 457
458 /* 458 /*
459 * this drops all the extents in the cache that intersect the range 459 * this drops all the extents in the cache that intersect the range
460 * [start, end]. Existing extents are split as required. 460 * [start, end]. Existing extents are split as required.
461 */ 461 */
462 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 462 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
463 int skip_pinned) 463 int skip_pinned)
464 { 464 {
465 struct extent_map *em; 465 struct extent_map *em;
466 struct extent_map *split = NULL; 466 struct extent_map *split = NULL;
467 struct extent_map *split2 = NULL; 467 struct extent_map *split2 = NULL;
468 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 468 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
469 u64 len = end - start + 1; 469 u64 len = end - start + 1;
470 u64 gen; 470 u64 gen;
471 int ret; 471 int ret;
472 int testend = 1; 472 int testend = 1;
473 unsigned long flags; 473 unsigned long flags;
474 int compressed = 0; 474 int compressed = 0;
475 475
476 WARN_ON(end < start); 476 WARN_ON(end < start);
477 if (end == (u64)-1) { 477 if (end == (u64)-1) {
478 len = (u64)-1; 478 len = (u64)-1;
479 testend = 0; 479 testend = 0;
480 } 480 }
481 while (1) { 481 while (1) {
482 int no_splits = 0; 482 int no_splits = 0;
483 483
484 if (!split) 484 if (!split)
485 split = alloc_extent_map(); 485 split = alloc_extent_map();
486 if (!split2) 486 if (!split2)
487 split2 = alloc_extent_map(); 487 split2 = alloc_extent_map();
488 if (!split || !split2) 488 if (!split || !split2)
489 no_splits = 1; 489 no_splits = 1;
490 490
491 write_lock(&em_tree->lock); 491 write_lock(&em_tree->lock);
492 em = lookup_extent_mapping(em_tree, start, len); 492 em = lookup_extent_mapping(em_tree, start, len);
493 if (!em) { 493 if (!em) {
494 write_unlock(&em_tree->lock); 494 write_unlock(&em_tree->lock);
495 break; 495 break;
496 } 496 }
497 flags = em->flags; 497 flags = em->flags;
498 gen = em->generation; 498 gen = em->generation;
499 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 499 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
500 if (testend && em->start + em->len >= start + len) { 500 if (testend && em->start + em->len >= start + len) {
501 free_extent_map(em); 501 free_extent_map(em);
502 write_unlock(&em_tree->lock); 502 write_unlock(&em_tree->lock);
503 break; 503 break;
504 } 504 }
505 start = em->start + em->len; 505 start = em->start + em->len;
506 if (testend) 506 if (testend)
507 len = start + len - (em->start + em->len); 507 len = start + len - (em->start + em->len);
508 free_extent_map(em); 508 free_extent_map(em);
509 write_unlock(&em_tree->lock); 509 write_unlock(&em_tree->lock);
510 continue; 510 continue;
511 } 511 }
512 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 512 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
513 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 513 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
514 remove_extent_mapping(em_tree, em); 514 remove_extent_mapping(em_tree, em);
515 if (no_splits) 515 if (no_splits)
516 goto next; 516 goto next;
517 517
518 if (em->block_start < EXTENT_MAP_LAST_BYTE && 518 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
519 em->start < start) { 519 em->start < start) {
520 split->start = em->start; 520 split->start = em->start;
521 split->len = start - em->start; 521 split->len = start - em->start;
522 split->orig_start = em->orig_start; 522 split->orig_start = em->orig_start;
523 split->block_start = em->block_start; 523 split->block_start = em->block_start;
524 524
525 if (compressed) 525 if (compressed)
526 split->block_len = em->block_len; 526 split->block_len = em->block_len;
527 else 527 else
528 split->block_len = split->len; 528 split->block_len = split->len;
529 split->generation = gen; 529 split->generation = gen;
530 split->bdev = em->bdev; 530 split->bdev = em->bdev;
531 split->flags = flags; 531 split->flags = flags;
532 split->compress_type = em->compress_type; 532 split->compress_type = em->compress_type;
533 ret = add_extent_mapping(em_tree, split); 533 ret = add_extent_mapping(em_tree, split);
534 BUG_ON(ret); /* Logic error */ 534 BUG_ON(ret); /* Logic error */
535 list_move(&split->list, &em_tree->modified_extents); 535 list_move(&split->list, &em_tree->modified_extents);
536 free_extent_map(split); 536 free_extent_map(split);
537 split = split2; 537 split = split2;
538 split2 = NULL; 538 split2 = NULL;
539 } 539 }
540 if (em->block_start < EXTENT_MAP_LAST_BYTE && 540 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
541 testend && em->start + em->len > start + len) { 541 testend && em->start + em->len > start + len) {
542 u64 diff = start + len - em->start; 542 u64 diff = start + len - em->start;
543 543
544 split->start = start + len; 544 split->start = start + len;
545 split->len = em->start + em->len - (start + len); 545 split->len = em->start + em->len - (start + len);
546 split->bdev = em->bdev; 546 split->bdev = em->bdev;
547 split->flags = flags; 547 split->flags = flags;
548 split->compress_type = em->compress_type; 548 split->compress_type = em->compress_type;
549 split->generation = gen; 549 split->generation = gen;
550 550
551 if (compressed) { 551 if (compressed) {
552 split->block_len = em->block_len; 552 split->block_len = em->block_len;
553 split->block_start = em->block_start; 553 split->block_start = em->block_start;
554 split->orig_start = em->orig_start; 554 split->orig_start = em->orig_start;
555 } else { 555 } else {
556 split->block_len = split->len; 556 split->block_len = split->len;
557 split->block_start = em->block_start + diff; 557 split->block_start = em->block_start + diff;
558 split->orig_start = split->start; 558 split->orig_start = split->start;
559 } 559 }
560 560
561 ret = add_extent_mapping(em_tree, split); 561 ret = add_extent_mapping(em_tree, split);
562 BUG_ON(ret); /* Logic error */ 562 BUG_ON(ret); /* Logic error */
563 list_move(&split->list, &em_tree->modified_extents); 563 list_move(&split->list, &em_tree->modified_extents);
564 free_extent_map(split); 564 free_extent_map(split);
565 split = NULL; 565 split = NULL;
566 } 566 }
567 next: 567 next:
568 write_unlock(&em_tree->lock); 568 write_unlock(&em_tree->lock);
569 569
570 /* once for us */ 570 /* once for us */
571 free_extent_map(em); 571 free_extent_map(em);
572 /* once for the tree */ 572 /* once for the tree */
573 free_extent_map(em); 573 free_extent_map(em);
574 } 574 }
575 if (split) 575 if (split)
576 free_extent_map(split); 576 free_extent_map(split);
577 if (split2) 577 if (split2)
578 free_extent_map(split2); 578 free_extent_map(split2);
579 } 579 }
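/*
 * Pictorially, dropping [start, end] out of the middle of a cached
 * mapping leaves two pieces ('split' in front, 'split2' behind):
 *
 *    before:  | --------------- em --------------- |
 *    after:   | split |      (dropped)     | split2 |
 */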
580 580
581 /* 581 /*
582 * this is very complex, but the basic idea is to drop all extents 582 * this is very complex, but the basic idea is to drop all extents
583 * in the range start - end. If drop_end is non-NULL it is set to 583 * in the range start - end. If drop_end is non-NULL it is set to
584 * the offset where dropping actually stopped. 584 * the offset where dropping actually stopped.
585 * 585 *
586 * If an extent intersects the range but is not entirely inside the range 586 * If an extent intersects the range but is not entirely inside the range
587 * it is either truncated or split. Anything entirely inside the range 587 * it is either truncated or split. Anything entirely inside the range
588 * is deleted from the tree. 588 * is deleted from the tree.
589 */ 589 */
590 int __btrfs_drop_extents(struct btrfs_trans_handle *trans, 590 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
591 struct btrfs_root *root, struct inode *inode, 591 struct btrfs_root *root, struct inode *inode,
592 struct btrfs_path *path, u64 start, u64 end, 592 struct btrfs_path *path, u64 start, u64 end,
593 u64 *drop_end, int drop_cache) 593 u64 *drop_end, int drop_cache)
594 { 594 {
595 struct extent_buffer *leaf; 595 struct extent_buffer *leaf;
596 struct btrfs_file_extent_item *fi; 596 struct btrfs_file_extent_item *fi;
597 struct btrfs_key key; 597 struct btrfs_key key;
598 struct btrfs_key new_key; 598 struct btrfs_key new_key;
599 u64 ino = btrfs_ino(inode); 599 u64 ino = btrfs_ino(inode);
600 u64 search_start = start; 600 u64 search_start = start;
601 u64 disk_bytenr = 0; 601 u64 disk_bytenr = 0;
602 u64 num_bytes = 0; 602 u64 num_bytes = 0;
603 u64 extent_offset = 0; 603 u64 extent_offset = 0;
604 u64 extent_end = 0; 604 u64 extent_end = 0;
605 int del_nr = 0; 605 int del_nr = 0;
606 int del_slot = 0; 606 int del_slot = 0;
607 int extent_type; 607 int extent_type;
608 int recow; 608 int recow;
609 int ret; 609 int ret;
610 int modify_tree = -1; 610 int modify_tree = -1;
611 int update_refs = (root->ref_cows || root == root->fs_info->tree_root); 611 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
612 int found = 0; 612 int found = 0;
613 613
614 if (drop_cache) 614 if (drop_cache)
615 btrfs_drop_extent_cache(inode, start, end - 1, 0); 615 btrfs_drop_extent_cache(inode, start, end - 1, 0);
616 616
617 if (start >= BTRFS_I(inode)->disk_i_size) 617 if (start >= BTRFS_I(inode)->disk_i_size)
618 modify_tree = 0; 618 modify_tree = 0;
619 619
620 while (1) { 620 while (1) {
621 recow = 0; 621 recow = 0;
622 ret = btrfs_lookup_file_extent(trans, root, path, ino, 622 ret = btrfs_lookup_file_extent(trans, root, path, ino,
623 search_start, modify_tree); 623 search_start, modify_tree);
624 if (ret < 0) 624 if (ret < 0)
625 break; 625 break;
626 if (ret > 0 && path->slots[0] > 0 && search_start == start) { 626 if (ret > 0 && path->slots[0] > 0 && search_start == start) {
627 leaf = path->nodes[0]; 627 leaf = path->nodes[0];
628 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); 628 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
629 if (key.objectid == ino && 629 if (key.objectid == ino &&
630 key.type == BTRFS_EXTENT_DATA_KEY) 630 key.type == BTRFS_EXTENT_DATA_KEY)
631 path->slots[0]--; 631 path->slots[0]--;
632 } 632 }
633 ret = 0; 633 ret = 0;
634 next_slot: 634 next_slot:
635 leaf = path->nodes[0]; 635 leaf = path->nodes[0];
636 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 636 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
637 BUG_ON(del_nr > 0); 637 BUG_ON(del_nr > 0);
638 ret = btrfs_next_leaf(root, path); 638 ret = btrfs_next_leaf(root, path);
639 if (ret < 0) 639 if (ret < 0)
640 break; 640 break;
641 if (ret > 0) { 641 if (ret > 0) {
642 ret = 0; 642 ret = 0;
643 break; 643 break;
644 } 644 }
645 leaf = path->nodes[0]; 645 leaf = path->nodes[0];
646 recow = 1; 646 recow = 1;
647 } 647 }
648 648
649 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 649 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
650 if (key.objectid > ino || 650 if (key.objectid > ino ||
651 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) 651 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
652 break; 652 break;
653 653
654 fi = btrfs_item_ptr(leaf, path->slots[0], 654 fi = btrfs_item_ptr(leaf, path->slots[0],
655 struct btrfs_file_extent_item); 655 struct btrfs_file_extent_item);
656 extent_type = btrfs_file_extent_type(leaf, fi); 656 extent_type = btrfs_file_extent_type(leaf, fi);
657 657
658 if (extent_type == BTRFS_FILE_EXTENT_REG || 658 if (extent_type == BTRFS_FILE_EXTENT_REG ||
659 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 659 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
660 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 660 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
661 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 661 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
662 extent_offset = btrfs_file_extent_offset(leaf, fi); 662 extent_offset = btrfs_file_extent_offset(leaf, fi);
663 extent_end = key.offset + 663 extent_end = key.offset +
664 btrfs_file_extent_num_bytes(leaf, fi); 664 btrfs_file_extent_num_bytes(leaf, fi);
665 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 665 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
666 extent_end = key.offset + 666 extent_end = key.offset +
667 btrfs_file_extent_inline_len(leaf, fi); 667 btrfs_file_extent_inline_len(leaf, fi);
668 } else { 668 } else {
669 WARN_ON(1); 669 WARN_ON(1);
670 extent_end = search_start; 670 extent_end = search_start;
671 } 671 }
672 672
673 if (extent_end <= search_start) { 673 if (extent_end <= search_start) {
674 path->slots[0]++; 674 path->slots[0]++;
675 goto next_slot; 675 goto next_slot;
676 } 676 }
677 677
678 found = 1; 678 found = 1;
679 search_start = max(key.offset, start); 679 search_start = max(key.offset, start);
680 if (recow || !modify_tree) { 680 if (recow || !modify_tree) {
681 modify_tree = -1; 681 modify_tree = -1;
682 btrfs_release_path(path); 682 btrfs_release_path(path);
683 continue; 683 continue;
684 } 684 }
685 685
686 /* 686 /*
687 * | - range to drop - | 687 * | - range to drop - |
688 * | -------- extent -------- | 688 * | -------- extent -------- |
689 */ 689 */
690 if (start > key.offset && end < extent_end) { 690 if (start > key.offset && end < extent_end) {
691 BUG_ON(del_nr > 0); 691 BUG_ON(del_nr > 0);
692 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 692 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
693 693
694 memcpy(&new_key, &key, sizeof(new_key)); 694 memcpy(&new_key, &key, sizeof(new_key));
695 new_key.offset = start; 695 new_key.offset = start;
696 ret = btrfs_duplicate_item(trans, root, path, 696 ret = btrfs_duplicate_item(trans, root, path,
697 &new_key); 697 &new_key);
698 if (ret == -EAGAIN) { 698 if (ret == -EAGAIN) {
699 btrfs_release_path(path); 699 btrfs_release_path(path);
700 continue; 700 continue;
701 } 701 }
702 if (ret < 0) 702 if (ret < 0)
703 break; 703 break;
704 704
705 leaf = path->nodes[0]; 705 leaf = path->nodes[0];
706 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 706 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
707 struct btrfs_file_extent_item); 707 struct btrfs_file_extent_item);
708 btrfs_set_file_extent_num_bytes(leaf, fi, 708 btrfs_set_file_extent_num_bytes(leaf, fi,
709 start - key.offset); 709 start - key.offset);
710 710
711 fi = btrfs_item_ptr(leaf, path->slots[0], 711 fi = btrfs_item_ptr(leaf, path->slots[0],
712 struct btrfs_file_extent_item); 712 struct btrfs_file_extent_item);
713 713
714 extent_offset += start - key.offset; 714 extent_offset += start - key.offset;
715 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 715 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
716 btrfs_set_file_extent_num_bytes(leaf, fi, 716 btrfs_set_file_extent_num_bytes(leaf, fi,
717 extent_end - start); 717 extent_end - start);
718 btrfs_mark_buffer_dirty(leaf); 718 btrfs_mark_buffer_dirty(leaf);
719 719
720 if (update_refs && disk_bytenr > 0) { 720 if (update_refs && disk_bytenr > 0) {
721 ret = btrfs_inc_extent_ref(trans, root, 721 ret = btrfs_inc_extent_ref(trans, root,
722 disk_bytenr, num_bytes, 0, 722 disk_bytenr, num_bytes, 0,
723 root->root_key.objectid, 723 root->root_key.objectid,
724 new_key.objectid, 724 new_key.objectid,
725 start - extent_offset, 0); 725 start - extent_offset, 0);
726 BUG_ON(ret); /* -ENOMEM */ 726 BUG_ON(ret); /* -ENOMEM */
727 } 727 }
728 key.offset = start; 728 key.offset = start;
729 } 729 }
730 /* 730 /*
731 * | ---- range to drop ----- | 731 * | ---- range to drop ----- |
732 * | -------- extent -------- | 732 * | -------- extent -------- |
733 */ 733 */
734 if (start <= key.offset && end < extent_end) { 734 if (start <= key.offset && end < extent_end) {
735 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 735 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
736 736
737 memcpy(&new_key, &key, sizeof(new_key)); 737 memcpy(&new_key, &key, sizeof(new_key));
738 new_key.offset = end; 738 new_key.offset = end;
739 btrfs_set_item_key_safe(trans, root, path, &new_key); 739 btrfs_set_item_key_safe(trans, root, path, &new_key);
740 740
741 extent_offset += end - key.offset; 741 extent_offset += end - key.offset;
742 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 742 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
743 btrfs_set_file_extent_num_bytes(leaf, fi, 743 btrfs_set_file_extent_num_bytes(leaf, fi,
744 extent_end - end); 744 extent_end - end);
745 btrfs_mark_buffer_dirty(leaf); 745 btrfs_mark_buffer_dirty(leaf);
746 if (update_refs && disk_bytenr > 0) 746 if (update_refs && disk_bytenr > 0)
747 inode_sub_bytes(inode, end - key.offset); 747 inode_sub_bytes(inode, end - key.offset);
748 break; 748 break;
749 } 749 }
750 750
751 search_start = extent_end; 751 search_start = extent_end;
752 /* 752 /*
753 * | ---- range to drop ----- | 753 * | ---- range to drop ----- |
754 * | -------- extent -------- | 754 * | -------- extent -------- |
755 */ 755 */
756 if (start > key.offset && end >= extent_end) { 756 if (start > key.offset && end >= extent_end) {
757 BUG_ON(del_nr > 0); 757 BUG_ON(del_nr > 0);
758 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 758 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
759 759
760 btrfs_set_file_extent_num_bytes(leaf, fi, 760 btrfs_set_file_extent_num_bytes(leaf, fi,
761 start - key.offset); 761 start - key.offset);
762 btrfs_mark_buffer_dirty(leaf); 762 btrfs_mark_buffer_dirty(leaf);
763 if (update_refs && disk_bytenr > 0) 763 if (update_refs && disk_bytenr > 0)
764 inode_sub_bytes(inode, extent_end - start); 764 inode_sub_bytes(inode, extent_end - start);
765 if (end == extent_end) 765 if (end == extent_end)
766 break; 766 break;
767 767
768 path->slots[0]++; 768 path->slots[0]++;
769 goto next_slot; 769 goto next_slot;
770 } 770 }
771 771
772 /* 772 /*
773 * | ---- range to drop ----- | 773 * | ---- range to drop ----- |
774 * | ------ extent ------ | 774 * | ------ extent ------ |
775 */ 775 */
776 if (start <= key.offset && end >= extent_end) { 776 if (start <= key.offset && end >= extent_end) {
777 if (del_nr == 0) { 777 if (del_nr == 0) {
778 del_slot = path->slots[0]; 778 del_slot = path->slots[0];
779 del_nr = 1; 779 del_nr = 1;
780 } else { 780 } else {
781 BUG_ON(del_slot + del_nr != path->slots[0]); 781 BUG_ON(del_slot + del_nr != path->slots[0]);
782 del_nr++; 782 del_nr++;
783 } 783 }
784 784
785 if (update_refs && 785 if (update_refs &&
786 extent_type == BTRFS_FILE_EXTENT_INLINE) { 786 extent_type == BTRFS_FILE_EXTENT_INLINE) {
787 inode_sub_bytes(inode, 787 inode_sub_bytes(inode,
788 extent_end - key.offset); 788 extent_end - key.offset);
789 extent_end = ALIGN(extent_end, 789 extent_end = ALIGN(extent_end,
790 root->sectorsize); 790 root->sectorsize);
791 } else if (update_refs && disk_bytenr > 0) { 791 } else if (update_refs && disk_bytenr > 0) {
792 ret = btrfs_free_extent(trans, root, 792 ret = btrfs_free_extent(trans, root,
793 disk_bytenr, num_bytes, 0, 793 disk_bytenr, num_bytes, 0,
794 root->root_key.objectid, 794 root->root_key.objectid,
795 key.objectid, key.offset - 795 key.objectid, key.offset -
796 extent_offset, 0); 796 extent_offset, 0);
797 BUG_ON(ret); /* -ENOMEM */ 797 BUG_ON(ret); /* -ENOMEM */
798 inode_sub_bytes(inode, 798 inode_sub_bytes(inode,
799 extent_end - key.offset); 799 extent_end - key.offset);
800 } 800 }
801 801
802 if (end == extent_end) 802 if (end == extent_end)
803 break; 803 break;
804 804
805 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { 805 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
806 path->slots[0]++; 806 path->slots[0]++;
807 goto next_slot; 807 goto next_slot;
808 } 808 }
809 809
810 ret = btrfs_del_items(trans, root, path, del_slot, 810 ret = btrfs_del_items(trans, root, path, del_slot,
811 del_nr); 811 del_nr);
812 if (ret) { 812 if (ret) {
813 btrfs_abort_transaction(trans, root, ret); 813 btrfs_abort_transaction(trans, root, ret);
814 break; 814 break;
815 } 815 }
816 816
817 del_nr = 0; 817 del_nr = 0;
818 del_slot = 0; 818 del_slot = 0;
819 819
820 btrfs_release_path(path); 820 btrfs_release_path(path);
821 continue; 821 continue;
822 } 822 }
823 823
824 BUG_ON(1); 824 BUG_ON(1);
825 } 825 }
826 826
827 if (!ret && del_nr > 0) { 827 if (!ret && del_nr > 0) {
828 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 828 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
829 if (ret) 829 if (ret)
830 btrfs_abort_transaction(trans, root, ret); 830 btrfs_abort_transaction(trans, root, ret);
831 } 831 }
832 832
833 if (drop_end) 833 if (drop_end)
834 *drop_end = found ? min(end, extent_end) : end; 834 *drop_end = found ? min(end, extent_end) : end;
835 btrfs_release_path(path); 835 btrfs_release_path(path);
836 return ret; 836 return ret;
837 } 837 }
838 838
839 int btrfs_drop_extents(struct btrfs_trans_handle *trans, 839 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
840 struct btrfs_root *root, struct inode *inode, u64 start, 840 struct btrfs_root *root, struct inode *inode, u64 start,
841 u64 end, int drop_cache) 841 u64 end, int drop_cache)
842 { 842 {
843 struct btrfs_path *path; 843 struct btrfs_path *path;
844 int ret; 844 int ret;
845 845
846 path = btrfs_alloc_path(); 846 path = btrfs_alloc_path();
847 if (!path) 847 if (!path)
848 return -ENOMEM; 848 return -ENOMEM;
849 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, 849 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
850 drop_cache); 850 drop_cache);
851 btrfs_free_path(path); 851 btrfs_free_path(path);
852 return ret; 852 return ret;
853 } 853 }
854 854
855 static int extent_mergeable(struct extent_buffer *leaf, int slot, 855 static int extent_mergeable(struct extent_buffer *leaf, int slot,
856 u64 objectid, u64 bytenr, u64 orig_offset, 856 u64 objectid, u64 bytenr, u64 orig_offset,
857 u64 *start, u64 *end) 857 u64 *start, u64 *end)
858 { 858 {
859 struct btrfs_file_extent_item *fi; 859 struct btrfs_file_extent_item *fi;
860 struct btrfs_key key; 860 struct btrfs_key key;
861 u64 extent_end; 861 u64 extent_end;
862 862
863 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 863 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
864 return 0; 864 return 0;
865 865
866 btrfs_item_key_to_cpu(leaf, &key, slot); 866 btrfs_item_key_to_cpu(leaf, &key, slot);
867 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) 867 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
868 return 0; 868 return 0;
869 869
870 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 870 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
871 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || 871 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
872 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || 872 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
873 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || 873 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
874 btrfs_file_extent_compression(leaf, fi) || 874 btrfs_file_extent_compression(leaf, fi) ||
875 btrfs_file_extent_encryption(leaf, fi) || 875 btrfs_file_extent_encryption(leaf, fi) ||
876 btrfs_file_extent_other_encoding(leaf, fi)) 876 btrfs_file_extent_other_encoding(leaf, fi))
877 return 0; 877 return 0;
878 878
879 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 879 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
880 if ((*start && *start != key.offset) || (*end && *end != extent_end)) 880 if ((*start && *start != key.offset) || (*end && *end != extent_end))
881 return 0; 881 return 0;
882 882
883 *start = key.offset; 883 *start = key.offset;
884 *end = extent_end; 884 *end = extent_end;
885 return 1; 885 return 1;
886 } 886 }
887 887
888 /* 888 /*
889 * Mark extent in the range start - end as written. 889 * Mark extent in the range start - end as written.
890 * 890 *
891 * This changes extent type from 'pre-allocated' to 'regular'. If only 891 * This changes extent type from 'pre-allocated' to 'regular'. If only
892 * part of extent is marked as written, the extent will be split into 892 * part of extent is marked as written, the extent will be split into
893 * two or three. 893 * two or three.
894 */ 894 */
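/*
 * Pictorially, writing the middle of a preallocated extent splits it
 * into three pieces:
 *
 *    before:  | ------------- prealloc ------------- |
 *    after:   | prealloc |     written     | prealloc |
 */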
895 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 895 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
896 struct inode *inode, u64 start, u64 end) 896 struct inode *inode, u64 start, u64 end)
897 { 897 {
898 struct btrfs_root *root = BTRFS_I(inode)->root; 898 struct btrfs_root *root = BTRFS_I(inode)->root;
899 struct extent_buffer *leaf; 899 struct extent_buffer *leaf;
900 struct btrfs_path *path; 900 struct btrfs_path *path;
901 struct btrfs_file_extent_item *fi; 901 struct btrfs_file_extent_item *fi;
902 struct btrfs_key key; 902 struct btrfs_key key;
903 struct btrfs_key new_key; 903 struct btrfs_key new_key;
904 u64 bytenr; 904 u64 bytenr;
905 u64 num_bytes; 905 u64 num_bytes;
906 u64 extent_end; 906 u64 extent_end;
907 u64 orig_offset; 907 u64 orig_offset;
908 u64 other_start; 908 u64 other_start;
909 u64 other_end; 909 u64 other_end;
910 u64 split; 910 u64 split;
911 int del_nr = 0; 911 int del_nr = 0;
912 int del_slot = 0; 912 int del_slot = 0;
913 int recow; 913 int recow;
914 int ret; 914 int ret;
915 u64 ino = btrfs_ino(inode); 915 u64 ino = btrfs_ino(inode);
916 916
917 path = btrfs_alloc_path(); 917 path = btrfs_alloc_path();
918 if (!path) 918 if (!path)
919 return -ENOMEM; 919 return -ENOMEM;
920 again: 920 again:
921 recow = 0; 921 recow = 0;
922 split = start; 922 split = start;
923 key.objectid = ino; 923 key.objectid = ino;
924 key.type = BTRFS_EXTENT_DATA_KEY; 924 key.type = BTRFS_EXTENT_DATA_KEY;
925 key.offset = split; 925 key.offset = split;
926 926
927 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 927 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
928 if (ret < 0) 928 if (ret < 0)
929 goto out; 929 goto out;
930 if (ret > 0 && path->slots[0] > 0) 930 if (ret > 0 && path->slots[0] > 0)
931 path->slots[0]--; 931 path->slots[0]--;
932 932
933 leaf = path->nodes[0]; 933 leaf = path->nodes[0];
934 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 934 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
935 BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY); 935 BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
936 fi = btrfs_item_ptr(leaf, path->slots[0], 936 fi = btrfs_item_ptr(leaf, path->slots[0],
937 struct btrfs_file_extent_item); 937 struct btrfs_file_extent_item);
938 BUG_ON(btrfs_file_extent_type(leaf, fi) != 938 BUG_ON(btrfs_file_extent_type(leaf, fi) !=
939 BTRFS_FILE_EXTENT_PREALLOC); 939 BTRFS_FILE_EXTENT_PREALLOC);
940 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 940 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
941 BUG_ON(key.offset > start || extent_end < end); 941 BUG_ON(key.offset > start || extent_end < end);
942 942
943 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 943 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
944 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 944 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
945 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); 945 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
946 memcpy(&new_key, &key, sizeof(new_key)); 946 memcpy(&new_key, &key, sizeof(new_key));
947 947
948 if (start == key.offset && end < extent_end) { 948 if (start == key.offset && end < extent_end) {
949 other_start = 0; 949 other_start = 0;
950 other_end = start; 950 other_end = start;
951 if (extent_mergeable(leaf, path->slots[0] - 1, 951 if (extent_mergeable(leaf, path->slots[0] - 1,
952 ino, bytenr, orig_offset, 952 ino, bytenr, orig_offset,
953 &other_start, &other_end)) { 953 &other_start, &other_end)) {
954 new_key.offset = end; 954 new_key.offset = end;
955 btrfs_set_item_key_safe(trans, root, path, &new_key); 955 btrfs_set_item_key_safe(trans, root, path, &new_key);
956 fi = btrfs_item_ptr(leaf, path->slots[0], 956 fi = btrfs_item_ptr(leaf, path->slots[0],
957 struct btrfs_file_extent_item); 957 struct btrfs_file_extent_item);
958 btrfs_set_file_extent_generation(leaf, fi, 958 btrfs_set_file_extent_generation(leaf, fi,
959 trans->transid); 959 trans->transid);
960 btrfs_set_file_extent_num_bytes(leaf, fi, 960 btrfs_set_file_extent_num_bytes(leaf, fi,
961 extent_end - end); 961 extent_end - end);
962 btrfs_set_file_extent_offset(leaf, fi, 962 btrfs_set_file_extent_offset(leaf, fi,
963 end - orig_offset); 963 end - orig_offset);
964 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 964 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
965 struct btrfs_file_extent_item); 965 struct btrfs_file_extent_item);
966 btrfs_set_file_extent_generation(leaf, fi, 966 btrfs_set_file_extent_generation(leaf, fi,
967 trans->transid); 967 trans->transid);
968 btrfs_set_file_extent_num_bytes(leaf, fi, 968 btrfs_set_file_extent_num_bytes(leaf, fi,
969 end - other_start); 969 end - other_start);
970 btrfs_mark_buffer_dirty(leaf); 970 btrfs_mark_buffer_dirty(leaf);
971 goto out; 971 goto out;
972 } 972 }
973 } 973 }
974 974
975 if (start > key.offset && end == extent_end) { 975 if (start > key.offset && end == extent_end) {
976 other_start = end; 976 other_start = end;
977 other_end = 0; 977 other_end = 0;
978 if (extent_mergeable(leaf, path->slots[0] + 1, 978 if (extent_mergeable(leaf, path->slots[0] + 1,
979 ino, bytenr, orig_offset, 979 ino, bytenr, orig_offset,
980 &other_start, &other_end)) { 980 &other_start, &other_end)) {
981 fi = btrfs_item_ptr(leaf, path->slots[0], 981 fi = btrfs_item_ptr(leaf, path->slots[0],
982 struct btrfs_file_extent_item); 982 struct btrfs_file_extent_item);
983 btrfs_set_file_extent_num_bytes(leaf, fi, 983 btrfs_set_file_extent_num_bytes(leaf, fi,
984 start - key.offset); 984 start - key.offset);
985 btrfs_set_file_extent_generation(leaf, fi, 985 btrfs_set_file_extent_generation(leaf, fi,
986 trans->transid); 986 trans->transid);
987 path->slots[0]++; 987 path->slots[0]++;
988 new_key.offset = start; 988 new_key.offset = start;
989 btrfs_set_item_key_safe(trans, root, path, &new_key); 989 btrfs_set_item_key_safe(trans, root, path, &new_key);
990 990
991 fi = btrfs_item_ptr(leaf, path->slots[0], 991 fi = btrfs_item_ptr(leaf, path->slots[0],
992 struct btrfs_file_extent_item); 992 struct btrfs_file_extent_item);
993 btrfs_set_file_extent_generation(leaf, fi, 993 btrfs_set_file_extent_generation(leaf, fi,
994 trans->transid); 994 trans->transid);
995 btrfs_set_file_extent_num_bytes(leaf, fi, 995 btrfs_set_file_extent_num_bytes(leaf, fi,
996 other_end - start); 996 other_end - start);
997 btrfs_set_file_extent_offset(leaf, fi, 997 btrfs_set_file_extent_offset(leaf, fi,
998 start - orig_offset); 998 start - orig_offset);
999 btrfs_mark_buffer_dirty(leaf); 999 btrfs_mark_buffer_dirty(leaf);
1000 goto out; 1000 goto out;
1001 } 1001 }
1002 } 1002 }
1003 1003
1004 while (start > key.offset || end < extent_end) { 1004 while (start > key.offset || end < extent_end) {
1005 if (key.offset == start) 1005 if (key.offset == start)
1006 split = end; 1006 split = end;
1007 1007
1008 new_key.offset = split; 1008 new_key.offset = split;
1009 ret = btrfs_duplicate_item(trans, root, path, &new_key); 1009 ret = btrfs_duplicate_item(trans, root, path, &new_key);
1010 if (ret == -EAGAIN) { 1010 if (ret == -EAGAIN) {
1011 btrfs_release_path(path); 1011 btrfs_release_path(path);
1012 goto again; 1012 goto again;
1013 } 1013 }
1014 if (ret < 0) { 1014 if (ret < 0) {
1015 btrfs_abort_transaction(trans, root, ret); 1015 btrfs_abort_transaction(trans, root, ret);
1016 goto out; 1016 goto out;
1017 } 1017 }
1018 1018
1019 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
1020 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1020 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1021 struct btrfs_file_extent_item); 1021 struct btrfs_file_extent_item);
1022 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1022 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1023 btrfs_set_file_extent_num_bytes(leaf, fi, 1023 btrfs_set_file_extent_num_bytes(leaf, fi,
1024 split - key.offset); 1024 split - key.offset);
1025 1025
1026 fi = btrfs_item_ptr(leaf, path->slots[0], 1026 fi = btrfs_item_ptr(leaf, path->slots[0],
1027 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
1028 1028
1029 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1029 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1030 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 1030 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1031 btrfs_set_file_extent_num_bytes(leaf, fi, 1031 btrfs_set_file_extent_num_bytes(leaf, fi,
1032 extent_end - split); 1032 extent_end - split);
1033 btrfs_mark_buffer_dirty(leaf); 1033 btrfs_mark_buffer_dirty(leaf);
1034 1034
1035 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 1035 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1036 root->root_key.objectid, 1036 root->root_key.objectid,
1037 ino, orig_offset, 0); 1037 ino, orig_offset, 0);
1038 BUG_ON(ret); /* -ENOMEM */ 1038 BUG_ON(ret); /* -ENOMEM */
1039 1039
1040 if (split == start) { 1040 if (split == start) {
1041 key.offset = start; 1041 key.offset = start;
1042 } else { 1042 } else {
1043 BUG_ON(start != key.offset); 1043 BUG_ON(start != key.offset);
1044 path->slots[0]--; 1044 path->slots[0]--;
1045 extent_end = end; 1045 extent_end = end;
1046 } 1046 }
1047 recow = 1; 1047 recow = 1;
1048 } 1048 }
1049 1049
1050 other_start = end; 1050 other_start = end;
1051 other_end = 0; 1051 other_end = 0;
1052 if (extent_mergeable(leaf, path->slots[0] + 1, 1052 if (extent_mergeable(leaf, path->slots[0] + 1,
1053 ino, bytenr, orig_offset, 1053 ino, bytenr, orig_offset,
1054 &other_start, &other_end)) { 1054 &other_start, &other_end)) {
1055 if (recow) { 1055 if (recow) {
1056 btrfs_release_path(path); 1056 btrfs_release_path(path);
1057 goto again; 1057 goto again;
1058 } 1058 }
1059 extent_end = other_end; 1059 extent_end = other_end;
1060 del_slot = path->slots[0] + 1; 1060 del_slot = path->slots[0] + 1;
1061 del_nr++; 1061 del_nr++;
1062 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1062 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1063 0, root->root_key.objectid, 1063 0, root->root_key.objectid,
1064 ino, orig_offset, 0); 1064 ino, orig_offset, 0);
1065 BUG_ON(ret); /* -ENOMEM */ 1065 BUG_ON(ret); /* -ENOMEM */
1066 } 1066 }
1067 other_start = 0; 1067 other_start = 0;
1068 other_end = start; 1068 other_end = start;
1069 if (extent_mergeable(leaf, path->slots[0] - 1, 1069 if (extent_mergeable(leaf, path->slots[0] - 1,
1070 ino, bytenr, orig_offset, 1070 ino, bytenr, orig_offset,
1071 &other_start, &other_end)) { 1071 &other_start, &other_end)) {
1072 if (recow) { 1072 if (recow) {
1073 btrfs_release_path(path); 1073 btrfs_release_path(path);
1074 goto again; 1074 goto again;
1075 } 1075 }
1076 key.offset = other_start; 1076 key.offset = other_start;
1077 del_slot = path->slots[0]; 1077 del_slot = path->slots[0];
1078 del_nr++; 1078 del_nr++;
1079 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1079 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1080 0, root->root_key.objectid, 1080 0, root->root_key.objectid,
1081 ino, orig_offset, 0); 1081 ino, orig_offset, 0);
1082 BUG_ON(ret); /* -ENOMEM */ 1082 BUG_ON(ret); /* -ENOMEM */
1083 } 1083 }
1084 if (del_nr == 0) { 1084 if (del_nr == 0) {
1085 fi = btrfs_item_ptr(leaf, path->slots[0], 1085 fi = btrfs_item_ptr(leaf, path->slots[0],
1086 struct btrfs_file_extent_item); 1086 struct btrfs_file_extent_item);
1087 btrfs_set_file_extent_type(leaf, fi, 1087 btrfs_set_file_extent_type(leaf, fi,
1088 BTRFS_FILE_EXTENT_REG); 1088 BTRFS_FILE_EXTENT_REG);
1089 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1089 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1090 btrfs_mark_buffer_dirty(leaf); 1090 btrfs_mark_buffer_dirty(leaf);
1091 } else { 1091 } else {
1092 fi = btrfs_item_ptr(leaf, del_slot - 1, 1092 fi = btrfs_item_ptr(leaf, del_slot - 1,
1093 struct btrfs_file_extent_item); 1093 struct btrfs_file_extent_item);
1094 btrfs_set_file_extent_type(leaf, fi, 1094 btrfs_set_file_extent_type(leaf, fi,
1095 BTRFS_FILE_EXTENT_REG); 1095 BTRFS_FILE_EXTENT_REG);
1096 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1096 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1097 btrfs_set_file_extent_num_bytes(leaf, fi, 1097 btrfs_set_file_extent_num_bytes(leaf, fi,
1098 extent_end - key.offset); 1098 extent_end - key.offset);
1099 btrfs_mark_buffer_dirty(leaf); 1099 btrfs_mark_buffer_dirty(leaf);
1100 1100
1101 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 1101 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1102 if (ret < 0) { 1102 if (ret < 0) {
1103 btrfs_abort_transaction(trans, root, ret); 1103 btrfs_abort_transaction(trans, root, ret);
1104 goto out; 1104 goto out;
1105 } 1105 }
1106 } 1106 }
1107 out: 1107 out:
1108 btrfs_free_path(path); 1108 btrfs_free_path(path);
1109 return 0; 1109 return 0;
1110 } 1110 }
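
The split arithmetic above is easier to follow with concrete numbers. A minimal userspace sketch of the same calculations (all values are made up for illustration, not taken from the kernel):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical extent [key_offset, extent_end) backed at orig_offset */
	uint64_t key_offset = 4096, extent_end = 16384, orig_offset = 4096;
	uint64_t split = 8192;	/* where btrfs_duplicate_item() splits it */

	/* left half keeps its key; only its length shrinks */
	printf("left : num_bytes=%llu\n",
	       (unsigned long long)(split - key_offset));
	/* right half starts at split; its offset into the backing extent
	 * grows by the same amount, so the data it maps is unchanged */
	printf("right: offset=%llu num_bytes=%llu\n",
	       (unsigned long long)(split - orig_offset),
	       (unsigned long long)(extent_end - split));
	return 0;
}
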
1111 1111
1112 /* 1112 /*
1113 * on error we return an unlocked page and the error value 1113 * on error we return an unlocked page and the error value
1114 * on success we return a locked page and 0 1114 * on success we return a locked page and 0
1115 */ 1115 */
1116 static int prepare_uptodate_page(struct page *page, u64 pos, 1116 static int prepare_uptodate_page(struct page *page, u64 pos,
1117 bool force_uptodate) 1117 bool force_uptodate)
1118 { 1118 {
1119 int ret = 0; 1119 int ret = 0;
1120 1120
1121 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && 1121 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1122 !PageUptodate(page)) { 1122 !PageUptodate(page)) {
1123 ret = btrfs_readpage(NULL, page); 1123 ret = btrfs_readpage(NULL, page);
1124 if (ret) 1124 if (ret)
1125 return ret; 1125 return ret;
1126 lock_page(page); 1126 lock_page(page);
1127 if (!PageUptodate(page)) { 1127 if (!PageUptodate(page)) {
1128 unlock_page(page); 1128 unlock_page(page);
1129 return -EIO; 1129 return -EIO;
1130 } 1130 }
1131 } 1131 }
1132 return 0; 1132 return 0;
1133 } 1133 }
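
The page-alignment check above decides whether a read-before-write is needed. A tiny standalone sketch of just that predicate (PAGE_SIZE here stands in for PAGE_CACHE_SIZE):

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL	/* stand-in for PAGE_CACHE_SIZE */

/* mirrors the check above: a read is only needed when the write does
 * not start on a page boundary (or when the caller forces it) */
static bool needs_readpage(uint64_t pos, bool force_uptodate)
{
	return (pos & (PAGE_SIZE - 1)) || force_uptodate;
}

int main(void)
{
	printf("%d %d %d\n",
	       needs_readpage(0, false),	/* aligned: no read needed */
	       needs_readpage(100, false),	/* mid-page: must read first */
	       needs_readpage(0, true));	/* forced by the caller */
	return 0;
}
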
1134 1134
1135 /* 1135 /*
1136 * this gets pages into the page cache and locks them down; it also properly 1136 * this gets pages into the page cache and locks them down; it also properly
1137 * waits for data=ordered extents to finish before allowing the pages to be 1137 * waits for data=ordered extents to finish before allowing the pages to be
1138 * modified. 1138 * modified.
1139 */ 1139 */
1140 static noinline int prepare_pages(struct btrfs_root *root, struct file *file, 1140 static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1141 struct page **pages, size_t num_pages, 1141 struct page **pages, size_t num_pages,
1142 loff_t pos, unsigned long first_index, 1142 loff_t pos, unsigned long first_index,
1143 size_t write_bytes, bool force_uptodate) 1143 size_t write_bytes, bool force_uptodate)
1144 { 1144 {
1145 struct extent_state *cached_state = NULL; 1145 struct extent_state *cached_state = NULL;
1146 int i; 1146 int i;
1147 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1147 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1148 struct inode *inode = fdentry(file)->d_inode; 1148 struct inode *inode = fdentry(file)->d_inode;
1149 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 1149 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1150 int err = 0; 1150 int err = 0;
1151 int faili = 0; 1151 int faili = 0;
1152 u64 start_pos; 1152 u64 start_pos;
1153 u64 last_pos; 1153 u64 last_pos;
1154 1154
1155 start_pos = pos & ~((u64)root->sectorsize - 1); 1155 start_pos = pos & ~((u64)root->sectorsize - 1);
1156 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 1156 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
1157 1157
1158 again: 1158 again:
1159 for (i = 0; i < num_pages; i++) { 1159 for (i = 0; i < num_pages; i++) {
1160 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1160 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1161 mask | __GFP_WRITE); 1161 mask | __GFP_WRITE);
1162 if (!pages[i]) { 1162 if (!pages[i]) {
1163 faili = i - 1; 1163 faili = i - 1;
1164 err = -ENOMEM; 1164 err = -ENOMEM;
1165 goto fail; 1165 goto fail;
1166 } 1166 }
1167 1167
1168 if (i == 0) 1168 if (i == 0)
1169 err = prepare_uptodate_page(pages[i], pos, 1169 err = prepare_uptodate_page(pages[i], pos,
1170 force_uptodate); 1170 force_uptodate);
1171 if (i == num_pages - 1) 1171 if (i == num_pages - 1)
1172 err = prepare_uptodate_page(pages[i], 1172 err = prepare_uptodate_page(pages[i],
1173 pos + write_bytes, false); 1173 pos + write_bytes, false);
1174 if (err) { 1174 if (err) {
1175 page_cache_release(pages[i]); 1175 page_cache_release(pages[i]);
1176 faili = i - 1; 1176 faili = i - 1;
1177 goto fail; 1177 goto fail;
1178 } 1178 }
1179 wait_on_page_writeback(pages[i]); 1179 wait_on_page_writeback(pages[i]);
1180 } 1180 }
1181 err = 0; 1181 err = 0;
1182 if (start_pos < inode->i_size) { 1182 if (start_pos < inode->i_size) {
1183 struct btrfs_ordered_extent *ordered; 1183 struct btrfs_ordered_extent *ordered;
1184 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1184 lock_extent_bits(&BTRFS_I(inode)->io_tree,
1185 start_pos, last_pos - 1, 0, &cached_state); 1185 start_pos, last_pos - 1, 0, &cached_state);
1186 ordered = btrfs_lookup_first_ordered_extent(inode, 1186 ordered = btrfs_lookup_first_ordered_extent(inode,
1187 last_pos - 1); 1187 last_pos - 1);
1188 if (ordered && 1188 if (ordered &&
1189 ordered->file_offset + ordered->len > start_pos && 1189 ordered->file_offset + ordered->len > start_pos &&
1190 ordered->file_offset < last_pos) { 1190 ordered->file_offset < last_pos) {
1191 btrfs_put_ordered_extent(ordered); 1191 btrfs_put_ordered_extent(ordered);
1192 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1192 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1193 start_pos, last_pos - 1, 1193 start_pos, last_pos - 1,
1194 &cached_state, GFP_NOFS); 1194 &cached_state, GFP_NOFS);
1195 for (i = 0; i < num_pages; i++) { 1195 for (i = 0; i < num_pages; i++) {
1196 unlock_page(pages[i]); 1196 unlock_page(pages[i]);
1197 page_cache_release(pages[i]); 1197 page_cache_release(pages[i]);
1198 } 1198 }
1199 btrfs_wait_ordered_range(inode, start_pos, 1199 btrfs_wait_ordered_range(inode, start_pos,
1200 last_pos - start_pos); 1200 last_pos - start_pos);
1201 goto again; 1201 goto again;
1202 } 1202 }
1203 if (ordered) 1203 if (ordered)
1204 btrfs_put_ordered_extent(ordered); 1204 btrfs_put_ordered_extent(ordered);
1205 1205
1206 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, 1206 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1207 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1207 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1208 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1208 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1209 0, 0, &cached_state, GFP_NOFS); 1209 0, 0, &cached_state, GFP_NOFS);
1210 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1210 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1211 start_pos, last_pos - 1, &cached_state, 1211 start_pos, last_pos - 1, &cached_state,
1212 GFP_NOFS); 1212 GFP_NOFS);
1213 } 1213 }
1214 for (i = 0; i < num_pages; i++) { 1214 for (i = 0; i < num_pages; i++) {
1215 if (clear_page_dirty_for_io(pages[i])) 1215 if (clear_page_dirty_for_io(pages[i]))
1216 account_page_redirty(pages[i]); 1216 account_page_redirty(pages[i]);
1217 set_page_extent_mapped(pages[i]); 1217 set_page_extent_mapped(pages[i]);
1218 WARN_ON(!PageLocked(pages[i])); 1218 WARN_ON(!PageLocked(pages[i]));
1219 } 1219 }
1220 return 0; 1220 return 0;
1221 fail: 1221 fail:
1222 while (faili >= 0) { 1222 while (faili >= 0) {
1223 unlock_page(pages[faili]); 1223 unlock_page(pages[faili]);
1224 page_cache_release(pages[faili]); 1224 page_cache_release(pages[faili]);
1225 faili--; 1225 faili--;
1226 } 1226 }
1227 return err; 1227 return err;
1228 1228
1229 } 1229 }
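
The ordered-extent conflict test in prepare_pages() is a standard half-open interval overlap. A self-contained sketch of the predicate, with hypothetical sample ranges:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* the test used above: an ordered extent [file_offset, file_offset + len)
 * conflicts with the write range [start_pos, last_pos) iff the two
 * half-open intervals intersect */
static bool ordered_conflicts(uint64_t file_offset, uint64_t len,
			      uint64_t start_pos, uint64_t last_pos)
{
	return file_offset + len > start_pos && file_offset < last_pos;
}

int main(void)
{
	printf("%d\n", ordered_conflicts(0, 4096, 4096, 8192));	/* 0: only touches */
	printf("%d\n", ordered_conflicts(0, 5000, 4096, 8192));	/* 1: overlaps */
	return 0;
}
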
1230 1230
1231 static noinline ssize_t __btrfs_buffered_write(struct file *file, 1231 static noinline ssize_t __btrfs_buffered_write(struct file *file,
1232 struct iov_iter *i, 1232 struct iov_iter *i,
1233 loff_t pos) 1233 loff_t pos)
1234 { 1234 {
1235 struct inode *inode = fdentry(file)->d_inode; 1235 struct inode *inode = fdentry(file)->d_inode;
1236 struct btrfs_root *root = BTRFS_I(inode)->root; 1236 struct btrfs_root *root = BTRFS_I(inode)->root;
1237 struct page **pages = NULL; 1237 struct page **pages = NULL;
1238 unsigned long first_index; 1238 unsigned long first_index;
1239 size_t num_written = 0; 1239 size_t num_written = 0;
1240 int nrptrs; 1240 int nrptrs;
1241 int ret = 0; 1241 int ret = 0;
1242 bool force_page_uptodate = false; 1242 bool force_page_uptodate = false;
1243 1243
1244 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1244 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1245 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1245 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1246 (sizeof(struct page *))); 1246 (sizeof(struct page *)));
1247 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1247 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1248 nrptrs = max(nrptrs, 8); 1248 nrptrs = max(nrptrs, 8);
1249 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1249 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1250 if (!pages) 1250 if (!pages)
1251 return -ENOMEM; 1251 return -ENOMEM;
1252 1252
1253 first_index = pos >> PAGE_CACHE_SHIFT; 1253 first_index = pos >> PAGE_CACHE_SHIFT;
1254 1254
1255 while (iov_iter_count(i) > 0) { 1255 while (iov_iter_count(i) > 0) {
1256 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1256 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1257 size_t write_bytes = min(iov_iter_count(i), 1257 size_t write_bytes = min(iov_iter_count(i),
1258 nrptrs * (size_t)PAGE_CACHE_SIZE - 1258 nrptrs * (size_t)PAGE_CACHE_SIZE -
1259 offset); 1259 offset);
1260 size_t num_pages = (write_bytes + offset + 1260 size_t num_pages = (write_bytes + offset +
1261 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1261 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1262 size_t dirty_pages; 1262 size_t dirty_pages;
1263 size_t copied; 1263 size_t copied;
1264 1264
1265 WARN_ON(num_pages > nrptrs); 1265 WARN_ON(num_pages > nrptrs);
1266 1266
1267 /* 1267 /*
1268 * Fault pages before locking them in prepare_pages 1268 * Fault pages before locking them in prepare_pages
1269 * to avoid a recursive lock 1269 * to avoid a recursive lock
1270 */ 1270 */
1271 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { 1271 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1272 ret = -EFAULT; 1272 ret = -EFAULT;
1273 break; 1273 break;
1274 } 1274 }
1275 1275
1276 ret = btrfs_delalloc_reserve_space(inode, 1276 ret = btrfs_delalloc_reserve_space(inode,
1277 num_pages << PAGE_CACHE_SHIFT); 1277 num_pages << PAGE_CACHE_SHIFT);
1278 if (ret) 1278 if (ret)
1279 break; 1279 break;
1280 1280
1281 /* 1281 /*
1282 * This is going to set up the pages array with the number of 1282 * This is going to set up the pages array with the number of
1283 * pages we want, so we don't really need to worry about the 1283 * pages we want, so we don't really need to worry about the
1284 * contents of pages from loop to loop 1284 * contents of pages from loop to loop
1285 */ 1285 */
1286 ret = prepare_pages(root, file, pages, num_pages, 1286 ret = prepare_pages(root, file, pages, num_pages,
1287 pos, first_index, write_bytes, 1287 pos, first_index, write_bytes,
1288 force_page_uptodate); 1288 force_page_uptodate);
1289 if (ret) { 1289 if (ret) {
1290 btrfs_delalloc_release_space(inode, 1290 btrfs_delalloc_release_space(inode,
1291 num_pages << PAGE_CACHE_SHIFT); 1291 num_pages << PAGE_CACHE_SHIFT);
1292 break; 1292 break;
1293 } 1293 }
1294 1294
1295 copied = btrfs_copy_from_user(pos, num_pages, 1295 copied = btrfs_copy_from_user(pos, num_pages,
1296 write_bytes, pages, i); 1296 write_bytes, pages, i);
1297 1297
1298 /* 1298 /*
1299 * if we have trouble faulting in the pages, fall 1299 * if we have trouble faulting in the pages, fall
1300 * back to one page at a time 1300 * back to one page at a time
1301 */ 1301 */
1302 if (copied < write_bytes) 1302 if (copied < write_bytes)
1303 nrptrs = 1; 1303 nrptrs = 1;
1304 1304
1305 if (copied == 0) { 1305 if (copied == 0) {
1306 force_page_uptodate = true; 1306 force_page_uptodate = true;
1307 dirty_pages = 0; 1307 dirty_pages = 0;
1308 } else { 1308 } else {
1309 force_page_uptodate = false; 1309 force_page_uptodate = false;
1310 dirty_pages = (copied + offset + 1310 dirty_pages = (copied + offset +
1311 PAGE_CACHE_SIZE - 1) >> 1311 PAGE_CACHE_SIZE - 1) >>
1312 PAGE_CACHE_SHIFT; 1312 PAGE_CACHE_SHIFT;
1313 } 1313 }
1314 1314
1315 /* 1315 /*
1316 * If we had a short copy we need to release the excess delalloc 1316 * If we had a short copy we need to release the excess delalloc
1317 * bytes we reserved. We need to increment outstanding_extents 1317 * bytes we reserved. We need to increment outstanding_extents
1318 * because btrfs_delalloc_release_space will decrement it, but 1318 * because btrfs_delalloc_release_space will decrement it, but
1319 * we still have an outstanding extent for the chunk we actually 1319 * we still have an outstanding extent for the chunk we actually
1320 * managed to copy. 1320 * managed to copy.
1321 */ 1321 */
1322 if (num_pages > dirty_pages) { 1322 if (num_pages > dirty_pages) {
1323 if (copied > 0) { 1323 if (copied > 0) {
1324 spin_lock(&BTRFS_I(inode)->lock); 1324 spin_lock(&BTRFS_I(inode)->lock);
1325 BTRFS_I(inode)->outstanding_extents++; 1325 BTRFS_I(inode)->outstanding_extents++;
1326 spin_unlock(&BTRFS_I(inode)->lock); 1326 spin_unlock(&BTRFS_I(inode)->lock);
1327 } 1327 }
1328 btrfs_delalloc_release_space(inode, 1328 btrfs_delalloc_release_space(inode,
1329 (num_pages - dirty_pages) << 1329 (num_pages - dirty_pages) <<
1330 PAGE_CACHE_SHIFT); 1330 PAGE_CACHE_SHIFT);
1331 } 1331 }
1332 1332
1333 if (copied > 0) { 1333 if (copied > 0) {
1334 ret = btrfs_dirty_pages(root, inode, pages, 1334 ret = btrfs_dirty_pages(root, inode, pages,
1335 dirty_pages, pos, copied, 1335 dirty_pages, pos, copied,
1336 NULL); 1336 NULL);
1337 if (ret) { 1337 if (ret) {
1338 btrfs_delalloc_release_space(inode, 1338 btrfs_delalloc_release_space(inode,
1339 dirty_pages << PAGE_CACHE_SHIFT); 1339 dirty_pages << PAGE_CACHE_SHIFT);
1340 btrfs_drop_pages(pages, num_pages); 1340 btrfs_drop_pages(pages, num_pages);
1341 break; 1341 break;
1342 } 1342 }
1343 } 1343 }
1344 1344
1345 btrfs_drop_pages(pages, num_pages); 1345 btrfs_drop_pages(pages, num_pages);
1346 1346
1347 cond_resched(); 1347 cond_resched();
1348 1348
1349 balance_dirty_pages_ratelimited(inode->i_mapping); 1349 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1351 btrfs_btree_balance_dirty(root, 1); 1351 btrfs_btree_balance_dirty(root, 1);
1352 1352
1353 pos += copied; 1353 pos += copied;
1354 num_written += copied; 1354 num_written += copied;
1355 } 1355 }
1356 1356
1357 kfree(pages); 1357 kfree(pages);
1358 1358
1359 return num_written ? num_written : ret; 1359 return num_written ? num_written : ret;
1360 } 1360 }
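
The nrptrs sizing at the top of __btrfs_buffered_write() caps the pointer array three ways. A userspace sketch of the same arithmetic (dirty_headroom is a stand-in for current->nr_dirtied_pause - current->nr_dirtied):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL	/* stand-in for PAGE_CACHE_SIZE */

static int size_nrptrs(size_t count, int dirty_headroom)
{
	size_t a = (count + PAGE_SIZE - 1) / PAGE_SIZE;	/* pages in the iov */
	size_t b = PAGE_SIZE / sizeof(void *);		/* one page of pointers */
	int nrptrs = (int)(a < b ? a : b);

	if (nrptrs > dirty_headroom)	/* don't outrun dirty throttling */
		nrptrs = dirty_headroom;
	if (nrptrs < 8)			/* but always batch at least 8 */
		nrptrs = 8;
	return nrptrs;
}

int main(void)
{
	/* 1MB write with headroom vs. tiny write with none */
	printf("%d %d\n", size_nrptrs(1 << 20, 1024), size_nrptrs(100, 0));
	return 0;
}
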
1361 1361
1362 static ssize_t __btrfs_direct_write(struct kiocb *iocb, 1362 static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1363 const struct iovec *iov, 1363 const struct iovec *iov,
1364 unsigned long nr_segs, loff_t pos, 1364 unsigned long nr_segs, loff_t pos,
1365 loff_t *ppos, size_t count, size_t ocount) 1365 loff_t *ppos, size_t count, size_t ocount)
1366 { 1366 {
1367 struct file *file = iocb->ki_filp; 1367 struct file *file = iocb->ki_filp;
1368 struct iov_iter i; 1368 struct iov_iter i;
1369 ssize_t written; 1369 ssize_t written;
1370 ssize_t written_buffered; 1370 ssize_t written_buffered;
1371 loff_t endbyte; 1371 loff_t endbyte;
1372 int err; 1372 int err;
1373 1373
1374 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, 1374 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1375 count, ocount); 1375 count, ocount);
1376 1376
1377 if (written < 0 || written == count) 1377 if (written < 0 || written == count)
1378 return written; 1378 return written;
1379 1379
1380 pos += written; 1380 pos += written;
1381 count -= written; 1381 count -= written;
1382 iov_iter_init(&i, iov, nr_segs, count, written); 1382 iov_iter_init(&i, iov, nr_segs, count, written);
1383 written_buffered = __btrfs_buffered_write(file, &i, pos); 1383 written_buffered = __btrfs_buffered_write(file, &i, pos);
1384 if (written_buffered < 0) { 1384 if (written_buffered < 0) {
1385 err = written_buffered; 1385 err = written_buffered;
1386 goto out; 1386 goto out;
1387 } 1387 }
1388 endbyte = pos + written_buffered - 1; 1388 endbyte = pos + written_buffered - 1;
1389 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 1389 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1390 if (err) 1390 if (err)
1391 goto out; 1391 goto out;
1392 written += written_buffered; 1392 written += written_buffered;
1393 *ppos = pos + written_buffered; 1393 *ppos = pos + written_buffered;
1394 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, 1394 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1395 endbyte >> PAGE_CACHE_SHIFT); 1395 endbyte >> PAGE_CACHE_SHIFT);
1396 out: 1396 out:
1397 return written ? written : err; 1397 return written ? written : err;
1398 } 1398 }
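
The bookkeeping after the buffered fallback is mostly offset arithmetic. A small sketch with hypothetical numbers showing which byte range gets flushed and which page range gets invalidated (PAGE_SHIFT stands in for PAGE_CACHE_SHIFT):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* stand-in for PAGE_CACHE_SHIFT */

int main(void)
{
	/* hypothetical 64K O_DIRECT write where only 16K went direct */
	uint64_t pos = 1 << 20, count = 65536;
	uint64_t written = 16384;			/* direct path */
	uint64_t written_buffered = count - written;	/* page-cache path */

	pos += written;
	uint64_t endbyte = pos + written_buffered - 1;

	/* the pages dirtied by the fallback are flushed and then dropped
	 * so later O_DIRECT reads don't see stale page cache */
	printf("flush [%llu, %llu], invalidate pages %llu..%llu\n",
	       (unsigned long long)pos, (unsigned long long)endbyte,
	       (unsigned long long)(pos >> PAGE_SHIFT),
	       (unsigned long long)(endbyte >> PAGE_SHIFT));
	return 0;
}
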
1399 1399
1400 static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1400 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1401 const struct iovec *iov, 1401 const struct iovec *iov,
1402 unsigned long nr_segs, loff_t pos) 1402 unsigned long nr_segs, loff_t pos)
1403 { 1403 {
1404 struct file *file = iocb->ki_filp; 1404 struct file *file = iocb->ki_filp;
1405 struct inode *inode = fdentry(file)->d_inode; 1405 struct inode *inode = fdentry(file)->d_inode;
1406 struct btrfs_root *root = BTRFS_I(inode)->root; 1406 struct btrfs_root *root = BTRFS_I(inode)->root;
1407 loff_t *ppos = &iocb->ki_pos; 1407 loff_t *ppos = &iocb->ki_pos;
1408 u64 start_pos; 1408 u64 start_pos;
1409 ssize_t num_written = 0; 1409 ssize_t num_written = 0;
1410 ssize_t err = 0; 1410 ssize_t err = 0;
1411 size_t count, ocount; 1411 size_t count, ocount;
1412 1412
1413 sb_start_write(inode->i_sb); 1413 sb_start_write(inode->i_sb);
1414 1414
1415 mutex_lock(&inode->i_mutex); 1415 mutex_lock(&inode->i_mutex);
1416 1416
1417 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1417 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1418 if (err) { 1418 if (err) {
1419 mutex_unlock(&inode->i_mutex); 1419 mutex_unlock(&inode->i_mutex);
1420 goto out; 1420 goto out;
1421 } 1421 }
1422 count = ocount; 1422 count = ocount;
1423 1423
1424 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1424 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1425 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1425 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1426 if (err) { 1426 if (err) {
1427 mutex_unlock(&inode->i_mutex); 1427 mutex_unlock(&inode->i_mutex);
1428 goto out; 1428 goto out;
1429 } 1429 }
1430 1430
1431 if (count == 0) { 1431 if (count == 0) {
1432 mutex_unlock(&inode->i_mutex); 1432 mutex_unlock(&inode->i_mutex);
1433 goto out; 1433 goto out;
1434 } 1434 }
1435 1435
1436 err = file_remove_suid(file); 1436 err = file_remove_suid(file);
1437 if (err) { 1437 if (err) {
1438 mutex_unlock(&inode->i_mutex); 1438 mutex_unlock(&inode->i_mutex);
1439 goto out; 1439 goto out;
1440 } 1440 }
1441 1441
1442 /* 1442 /*
1443 * If BTRFS flips readonly due to some impossible error 1443 * If BTRFS flips readonly due to some impossible error
1444 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), 1444 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1445 * although we have opened a file as writable, we have 1445 * although we have opened a file as writable, we have
1446 * to stop this write operation to ensure FS consistency. 1446 * to stop this write operation to ensure FS consistency.
1447 */ 1447 */
1448 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 1448 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1449 mutex_unlock(&inode->i_mutex); 1449 mutex_unlock(&inode->i_mutex);
1450 err = -EROFS; 1450 err = -EROFS;
1451 goto out; 1451 goto out;
1452 } 1452 }
1453 1453
1454 err = file_update_time(file); 1454 err = file_update_time(file);
1455 if (err) { 1455 if (err) {
1456 mutex_unlock(&inode->i_mutex); 1456 mutex_unlock(&inode->i_mutex);
1457 goto out; 1457 goto out;
1458 } 1458 }
1459 1459
1460 start_pos = round_down(pos, root->sectorsize); 1460 start_pos = round_down(pos, root->sectorsize);
1461 if (start_pos > i_size_read(inode)) { 1461 if (start_pos > i_size_read(inode)) {
1462 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); 1462 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
1463 if (err) { 1463 if (err) {
1464 mutex_unlock(&inode->i_mutex); 1464 mutex_unlock(&inode->i_mutex);
1465 goto out; 1465 goto out;
1466 } 1466 }
1467 } 1467 }
1468 1468
1469 if (unlikely(file->f_flags & O_DIRECT)) { 1469 if (unlikely(file->f_flags & O_DIRECT)) {
1470 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1470 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1471 pos, ppos, count, ocount); 1471 pos, ppos, count, ocount);
1472 } else { 1472 } else {
1473 struct iov_iter i; 1473 struct iov_iter i;
1474 1474
1475 iov_iter_init(&i, iov, nr_segs, count, num_written); 1475 iov_iter_init(&i, iov, nr_segs, count, num_written);
1476 1476
1477 num_written = __btrfs_buffered_write(file, &i, pos); 1477 num_written = __btrfs_buffered_write(file, &i, pos);
1478 if (num_written > 0) 1478 if (num_written > 0)
1479 *ppos = pos + num_written; 1479 *ppos = pos + num_written;
1480 } 1480 }
1481 1481
1482 mutex_unlock(&inode->i_mutex); 1482 mutex_unlock(&inode->i_mutex);
1483 1483
1484 /* 1484 /*
1485 * we want to make sure fsync finds this change 1485 * we want to make sure fsync finds this change
1486 * but we haven't joined a transaction running right now. 1486 * but we haven't joined a transaction running right now.
1487 * 1487 *
1488 * Later on, someone is sure to update the inode and get the 1488 * Later on, someone is sure to update the inode and get the
1489 * real transid recorded. 1489 * real transid recorded.
1490 * 1490 *
1491 * We set last_trans now to the fs_info generation + 1, 1491 * We set last_trans now to the fs_info generation + 1,
1492 * this will either be one more than the running transaction 1492 * this will either be one more than the running transaction
1493 * or the generation used for the next transaction if there isn't 1493 * or the generation used for the next transaction if there isn't
1494 * one running right now. 1494 * one running right now.
1495 */ 1495 */
1496 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1496 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1497 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1497 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1498 err = generic_write_sync(file, pos, num_written); 1498 err = generic_write_sync(file, pos, num_written);
1499 if (err < 0 && num_written > 0) 1499 if (err < 0 && num_written > 0)
1500 num_written = err; 1500 num_written = err;
1501 } 1501 }
1502 out: 1502 out:
1503 sb_end_write(inode->i_sb); 1503 sb_end_write(inode->i_sb);
1504 current->backing_dev_info = NULL; 1504 current->backing_dev_info = NULL;
1505 return num_written ? num_written : err; 1505 return num_written ? num_written : err;
1506 } 1506 }
1507 1507
1508 int btrfs_release_file(struct inode *inode, struct file *filp) 1508 int btrfs_release_file(struct inode *inode, struct file *filp)
1509 { 1509 {
1510 /* 1510 /*
1511 * ordered_data_close is set by setattr when we are about to truncate 1511 * ordered_data_close is set by setattr when we are about to truncate
1512 * a file from a non-zero size to a zero size. This tries to 1512 * a file from a non-zero size to a zero size. This tries to
1513 * flush down new bytes that may have been written if the 1513 * flush down new bytes that may have been written if the
1514 * application were using truncate to replace a file in place. 1514 * application were using truncate to replace a file in place.
1515 */ 1515 */
1516 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1516 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1517 &BTRFS_I(inode)->runtime_flags)) { 1517 &BTRFS_I(inode)->runtime_flags)) {
1518 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1518 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1519 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1519 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1520 filemap_flush(inode->i_mapping); 1520 filemap_flush(inode->i_mapping);
1521 } 1521 }
1522 if (filp->private_data) 1522 if (filp->private_data)
1523 btrfs_ioctl_trans_end(filp); 1523 btrfs_ioctl_trans_end(filp);
1524 return 0; 1524 return 0;
1525 } 1525 }
1526 1526
1527 /* 1527 /*
1528 * fsync call for both files and directories. This logs the inode into 1528 * fsync call for both files and directories. This logs the inode into
1529 * the tree log instead of forcing full commits whenever possible. 1529 * the tree log instead of forcing full commits whenever possible.
1530 * 1530 *
1531 * It needs to call filemap_fdatawait so that all ordered extent updates 1531 * It needs to call filemap_fdatawait so that all ordered extent updates
1532 * in the metadata btree are up to date for copying to the log. 1532 * in the metadata btree are up to date for copying to the log.
1533 * 1533 *
1534 * It drops the inode mutex before doing the tree log commit. This is an 1534 * It drops the inode mutex before doing the tree log commit. This is an
1535 * important optimization for directories because holding the mutex prevents 1535 * important optimization for directories because holding the mutex prevents
1536 * new operations on the dir while we write to disk. 1536 * new operations on the dir while we write to disk.
1537 */ 1537 */
1538 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 1538 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1539 { 1539 {
1540 struct dentry *dentry = file->f_path.dentry; 1540 struct dentry *dentry = file->f_path.dentry;
1541 struct inode *inode = dentry->d_inode; 1541 struct inode *inode = dentry->d_inode;
1542 struct btrfs_root *root = BTRFS_I(inode)->root; 1542 struct btrfs_root *root = BTRFS_I(inode)->root;
1543 int ret = 0; 1543 int ret = 0;
1544 struct btrfs_trans_handle *trans; 1544 struct btrfs_trans_handle *trans;
1545 1545
1546 trace_btrfs_sync_file(file, datasync); 1546 trace_btrfs_sync_file(file, datasync);
1547 1547
1548 /* 1548 /*
1549 * We write the dirty pages in the range and wait until they complete 1549 * We write the dirty pages in the range and wait until they complete
1550 * outside of the ->i_mutex, so that the flush can be done by multiple 1550 * outside of the ->i_mutex, so that the flush can be done by multiple
1551 * tasks in parallel to improve performance. 1551 * tasks in parallel to improve performance.
1552 */ 1552 */
1553 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1553 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1554 if (ret) 1554 if (ret)
1555 return ret; 1555 return ret;
1556 1556
1557 mutex_lock(&inode->i_mutex); 1557 mutex_lock(&inode->i_mutex);
1558 1558
1559 /* 1559 /*
1560 * We flush the dirty pages again to avoid some dirty pages in the 1560 * We flush the dirty pages again to avoid some dirty pages in the
1561 * range being left. 1561 * range being left.
1562 */ 1562 */
1563 atomic_inc(&root->log_batch); 1563 atomic_inc(&root->log_batch);
1564 btrfs_wait_ordered_range(inode, start, end); 1564 btrfs_wait_ordered_range(inode, start, end);
1565 atomic_inc(&root->log_batch); 1565 atomic_inc(&root->log_batch);
1566 1566
1567 /* 1567 /*
1568 * check the transaction that last modified this inode 1568 * check the transaction that last modified this inode
1569 * and see if it's already been committed 1569 * and see if it's already been committed
1570 */ 1570 */
1571 if (!BTRFS_I(inode)->last_trans) { 1571 if (!BTRFS_I(inode)->last_trans) {
1572 mutex_unlock(&inode->i_mutex); 1572 mutex_unlock(&inode->i_mutex);
1573 goto out; 1573 goto out;
1574 } 1574 }
1575 1575
1576 /* 1576 /*
1577 * if the last transaction that changed this file was before 1577 * if the last transaction that changed this file was before
1578 * the current transaction, we can bail out now without any 1578 * the current transaction, we can bail out now without any
1579 * syncing 1579 * syncing
1580 */ 1580 */
1581 smp_mb(); 1581 smp_mb();
1582 if (btrfs_inode_in_log(inode, root->fs_info->generation) || 1582 if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1583 BTRFS_I(inode)->last_trans <= 1583 BTRFS_I(inode)->last_trans <=
1584 root->fs_info->last_trans_committed) { 1584 root->fs_info->last_trans_committed) {
1585 BTRFS_I(inode)->last_trans = 0; 1585 BTRFS_I(inode)->last_trans = 0;
1586 1586
1587 /* 1587 /*
1588 * We've had everything committed since the last time we were 1588 * We've had everything committed since the last time we were
1589 * modified so clear this flag in case it was set for whatever 1589 * modified so clear this flag in case it was set for whatever
1590 * reason, it's no longer relevant. 1590 * reason, it's no longer relevant.
1591 */ 1591 */
1592 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1592 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1593 &BTRFS_I(inode)->runtime_flags); 1593 &BTRFS_I(inode)->runtime_flags);
1594 mutex_unlock(&inode->i_mutex); 1594 mutex_unlock(&inode->i_mutex);
1595 goto out; 1595 goto out;
1596 } 1596 }
1597 1597
1598 /* 1598 /*
1599 * ok we haven't committed the transaction yet, let's do a commit 1599 * ok we haven't committed the transaction yet, let's do a commit
1600 */ 1600 */
1601 if (file->private_data) 1601 if (file->private_data)
1602 btrfs_ioctl_trans_end(file); 1602 btrfs_ioctl_trans_end(file);
1603 1603
1604 trans = btrfs_start_transaction(root, 0); 1604 trans = btrfs_start_transaction(root, 0);
1605 if (IS_ERR(trans)) { 1605 if (IS_ERR(trans)) {
1606 ret = PTR_ERR(trans); 1606 ret = PTR_ERR(trans);
1607 mutex_unlock(&inode->i_mutex); 1607 mutex_unlock(&inode->i_mutex);
1608 goto out; 1608 goto out;
1609 } 1609 }
1610 1610
1611 ret = btrfs_log_dentry_safe(trans, root, dentry); 1611 ret = btrfs_log_dentry_safe(trans, root, dentry);
1612 if (ret < 0) { 1612 if (ret < 0) {
1613 mutex_unlock(&inode->i_mutex); 1613 mutex_unlock(&inode->i_mutex);
1614 goto out; 1614 goto out;
1615 } 1615 }
1616 1616
1617 /* we've logged all the items and now have a consistent 1617 /* we've logged all the items and now have a consistent
1618 * version of the file in the log. It is possible that 1618 * version of the file in the log. It is possible that
1619 * someone will come in and modify the file, but that's 1619 * someone will come in and modify the file, but that's
1620 * fine because the log is consistent on disk, and we 1620 * fine because the log is consistent on disk, and we
1621 * have references to all of the file's extents 1621 * have references to all of the file's extents
1622 * 1622 *
1623 * It is possible that someone will come in and log the 1623 * It is possible that someone will come in and log the
1624 * file again, but that will end up using the synchronization 1624 * file again, but that will end up using the synchronization
1625 * inside btrfs_sync_log to keep things safe. 1625 * inside btrfs_sync_log to keep things safe.
1626 */ 1626 */
1627 mutex_unlock(&inode->i_mutex); 1627 mutex_unlock(&inode->i_mutex);
1628 1628
1629 if (ret != BTRFS_NO_LOG_SYNC) { 1629 if (ret != BTRFS_NO_LOG_SYNC) {
1630 if (ret > 0) { 1630 if (ret > 0) {
1631 ret = btrfs_commit_transaction(trans, root); 1631 ret = btrfs_commit_transaction(trans, root);
1632 } else { 1632 } else {
1633 ret = btrfs_sync_log(trans, root); 1633 ret = btrfs_sync_log(trans, root);
1634 if (ret == 0) 1634 if (ret == 0)
1635 ret = btrfs_end_transaction(trans, root); 1635 ret = btrfs_end_transaction(trans, root);
1636 else 1636 else
1637 ret = btrfs_commit_transaction(trans, root); 1637 ret = btrfs_commit_transaction(trans, root);
1638 } 1638 }
1639 } else { 1639 } else {
1640 ret = btrfs_end_transaction(trans, root); 1640 ret = btrfs_end_transaction(trans, root);
1641 } 1641 }
1642 out: 1642 out:
1643 return ret > 0 ? -EIO : ret; 1643 return ret > 0 ? -EIO : ret;
1644 } 1644 }
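
The early-out near the top of btrfs_sync_file() boils down to a generation comparison. A minimal sketch of that check (simplified: it leaves out the in-log test and the memory barrier):

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* if the transaction that last touched the inode has already been
 * committed, fsync has nothing left to do */
static bool fsync_can_skip(uint64_t last_trans, uint64_t last_committed)
{
	return last_trans == 0 || last_trans <= last_committed;
}

int main(void)
{
	printf("%d %d\n",
	       fsync_can_skip(41, 42),	/* committed already: skip */
	       fsync_can_skip(43, 42));	/* newer than last commit: log it */
	return 0;
}
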
1645 1645
1646 static const struct vm_operations_struct btrfs_file_vm_ops = { 1646 static const struct vm_operations_struct btrfs_file_vm_ops = {
1647 .fault = filemap_fault, 1647 .fault = filemap_fault,
1648 .page_mkwrite = btrfs_page_mkwrite, 1648 .page_mkwrite = btrfs_page_mkwrite,
1649 .remap_pages = generic_file_remap_pages, 1649 .remap_pages = generic_file_remap_pages,
1650 }; 1650 };
1651 1651
1652 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1652 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1653 { 1653 {
1654 struct address_space *mapping = filp->f_mapping; 1654 struct address_space *mapping = filp->f_mapping;
1655 1655
1656 if (!mapping->a_ops->readpage) 1656 if (!mapping->a_ops->readpage)
1657 return -ENOEXEC; 1657 return -ENOEXEC;
1658 1658
1659 file_accessed(filp); 1659 file_accessed(filp);
1660 vma->vm_ops = &btrfs_file_vm_ops; 1660 vma->vm_ops = &btrfs_file_vm_ops;
1661 1661
1662 return 0; 1662 return 0;
1663 } 1663 }
1664 1664
1665 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, 1665 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
1666 int slot, u64 start, u64 end) 1666 int slot, u64 start, u64 end)
1667 { 1667 {
1668 struct btrfs_file_extent_item *fi; 1668 struct btrfs_file_extent_item *fi;
1669 struct btrfs_key key; 1669 struct btrfs_key key;
1670 1670
1671 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 1671 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1672 return 0; 1672 return 0;
1673 1673
1674 btrfs_item_key_to_cpu(leaf, &key, slot); 1674 btrfs_item_key_to_cpu(leaf, &key, slot);
1675 if (key.objectid != btrfs_ino(inode) || 1675 if (key.objectid != btrfs_ino(inode) ||
1676 key.type != BTRFS_EXTENT_DATA_KEY) 1676 key.type != BTRFS_EXTENT_DATA_KEY)
1677 return 0; 1677 return 0;
1678 1678
1679 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 1679 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1680 1680
1681 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 1681 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1682 return 0; 1682 return 0;
1683 1683
1684 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 1684 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1685 return 0; 1685 return 0;
1686 1686
1687 if (key.offset == end) 1687 if (key.offset == end)
1688 return 1; 1688 return 1;
1689 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 1689 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1690 return 1; 1690 return 1;
1691 return 0; 1691 return 0;
1692 } 1692 }
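
Stripped of the btree lookups, hole_mergeable() is a geometry test on the key offset. A standalone sketch with hypothetical extents:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* a neighbouring hole extent at [offset, offset + num_bytes) can absorb
 * the new hole [start, end) only if it touches one of its ends */
static bool holes_touch(uint64_t offset, uint64_t num_bytes,
			uint64_t start, uint64_t end)
{
	return offset == end || offset + num_bytes == start;
}

int main(void)
{
	printf("%d %d %d\n",
	       holes_touch(8192, 4096, 4096, 8192),	/* follows the new hole */
	       holes_touch(0, 4096, 4096, 8192),	/* precedes it */
	       holes_touch(0, 4096, 8192, 12288));	/* gap: not mergeable */
	return 0;
}
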
1693 1693
1694 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, 1694 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
1695 struct btrfs_path *path, u64 offset, u64 end) 1695 struct btrfs_path *path, u64 offset, u64 end)
1696 { 1696 {
1697 struct btrfs_root *root = BTRFS_I(inode)->root; 1697 struct btrfs_root *root = BTRFS_I(inode)->root;
1698 struct extent_buffer *leaf; 1698 struct extent_buffer *leaf;
1699 struct btrfs_file_extent_item *fi; 1699 struct btrfs_file_extent_item *fi;
1700 struct extent_map *hole_em; 1700 struct extent_map *hole_em;
1701 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 1701 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1702 struct btrfs_key key; 1702 struct btrfs_key key;
1703 int ret; 1703 int ret;
1704 1704
1705 key.objectid = btrfs_ino(inode); 1705 key.objectid = btrfs_ino(inode);
1706 key.type = BTRFS_EXTENT_DATA_KEY; 1706 key.type = BTRFS_EXTENT_DATA_KEY;
1707 key.offset = offset; 1707 key.offset = offset;
1708 1708
1709 1709
1710 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1710 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1711 if (ret < 0) 1711 if (ret < 0)
1712 return ret; 1712 return ret;
1713 BUG_ON(!ret); 1713 BUG_ON(!ret);
1714 1714
1715 leaf = path->nodes[0]; 1715 leaf = path->nodes[0];
1716 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { 1716 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
1717 u64 num_bytes; 1717 u64 num_bytes;
1718 1718
1719 path->slots[0]--; 1719 path->slots[0]--;
1720 fi = btrfs_item_ptr(leaf, path->slots[0], 1720 fi = btrfs_item_ptr(leaf, path->slots[0],
1721 struct btrfs_file_extent_item); 1721 struct btrfs_file_extent_item);
1722 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 1722 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
1723 end - offset; 1723 end - offset;
1724 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1724 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1725 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 1725 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1726 btrfs_set_file_extent_offset(leaf, fi, 0); 1726 btrfs_set_file_extent_offset(leaf, fi, 0);
1727 btrfs_mark_buffer_dirty(leaf); 1727 btrfs_mark_buffer_dirty(leaf);
1728 goto out; 1728 goto out;
1729 } 1729 }
1730 1730
1731 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 1731 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
1732 u64 num_bytes; 1732 u64 num_bytes;
1733 1733
1734 path->slots[0]++; 1734 path->slots[0]++;
1735 key.offset = offset; 1735 key.offset = offset;
1736 btrfs_set_item_key_safe(trans, root, path, &key); 1736 btrfs_set_item_key_safe(trans, root, path, &key);
1737 fi = btrfs_item_ptr(leaf, path->slots[0], 1737 fi = btrfs_item_ptr(leaf, path->slots[0],
1738 struct btrfs_file_extent_item); 1738 struct btrfs_file_extent_item);
1739 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 1739 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
1740 offset; 1740 offset;
1741 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1741 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1742 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 1742 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1743 btrfs_set_file_extent_offset(leaf, fi, 0); 1743 btrfs_set_file_extent_offset(leaf, fi, 0);
1744 btrfs_mark_buffer_dirty(leaf); 1744 btrfs_mark_buffer_dirty(leaf);
1745 goto out; 1745 goto out;
1746 } 1746 }
1747 btrfs_release_path(path); 1747 btrfs_release_path(path);
1748 1748
1749 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 1749 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
1750 0, 0, end - offset, 0, end - offset, 1750 0, 0, end - offset, 0, end - offset,
1751 0, 0, 0); 1751 0, 0, 0);
1752 if (ret) 1752 if (ret)
1753 return ret; 1753 return ret;
1754 1754
1755 out: 1755 out:
1756 btrfs_release_path(path); 1756 btrfs_release_path(path);
1757 1757
1758 hole_em = alloc_extent_map(); 1758 hole_em = alloc_extent_map();
1759 if (!hole_em) { 1759 if (!hole_em) {
1760 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 1760 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1761 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1761 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1762 &BTRFS_I(inode)->runtime_flags); 1762 &BTRFS_I(inode)->runtime_flags);
1763 } else { 1763 } else {
1764 hole_em->start = offset; 1764 hole_em->start = offset;
1765 hole_em->len = end - offset; 1765 hole_em->len = end - offset;
1766 hole_em->orig_start = offset; 1766 hole_em->orig_start = offset;
1767 1767
1768 hole_em->block_start = EXTENT_MAP_HOLE; 1768 hole_em->block_start = EXTENT_MAP_HOLE;
1769 hole_em->block_len = 0; 1769 hole_em->block_len = 0;
1770 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1770 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1771 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1771 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1772 hole_em->generation = trans->transid; 1772 hole_em->generation = trans->transid;
1773 1773
1774 do { 1774 do {
1775 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 1775 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1776 write_lock(&em_tree->lock); 1776 write_lock(&em_tree->lock);
1777 ret = add_extent_mapping(em_tree, hole_em); 1777 ret = add_extent_mapping(em_tree, hole_em);
1778 if (!ret) 1778 if (!ret)
1779 list_move(&hole_em->list, 1779 list_move(&hole_em->list,
1780 &em_tree->modified_extents); 1780 &em_tree->modified_extents);
1781 write_unlock(&em_tree->lock); 1781 write_unlock(&em_tree->lock);
1782 } while (ret == -EEXIST); 1782 } while (ret == -EEXIST);
1783 free_extent_map(hole_em); 1783 free_extent_map(hole_em);
1784 if (ret) 1784 if (ret)
1785 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1785 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1786 &BTRFS_I(inode)->runtime_flags); 1786 &BTRFS_I(inode)->runtime_flags);
1787 } 1787 }
1788 1788
1789 return 0; 1789 return 0;
1790 } 1790 }
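
The do/while at the end of fill_holes() retries the insert for as long as a racing reader repopulates the extent map. A toy sketch of the pattern, with a stub standing in for add_extent_mapping():

#include <errno.h>
#include <stdio.h>

/* stub: pretends a stale cached mapping blocks the insert twice
 * before the drop finally takes effect */
static int fake_add_mapping(void)
{
	static int stale = 2;
	return stale-- > 0 ? -EEXIST : 0;
}

int main(void)
{
	int ret;

	/* same shape as the loop above: drop whatever overlaps, try to
	 * insert, and retry as long as something raced back in */
	do {
		/* btrfs_drop_extent_cache(...) would go here */
		ret = fake_add_mapping();
	} while (ret == -EEXIST);

	printf("inserted, ret=%d\n", ret);
	return 0;
}
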
1791 1791
1792 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) 1792 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1793 { 1793 {
1794 struct btrfs_root *root = BTRFS_I(inode)->root; 1794 struct btrfs_root *root = BTRFS_I(inode)->root;
1795 struct extent_state *cached_state = NULL; 1795 struct extent_state *cached_state = NULL;
1796 struct btrfs_path *path; 1796 struct btrfs_path *path;
1797 struct btrfs_block_rsv *rsv; 1797 struct btrfs_block_rsv *rsv;
1798 struct btrfs_trans_handle *trans; 1798 struct btrfs_trans_handle *trans;
1799 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1799 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1800 u64 lockstart = (offset + mask) & ~mask; 1800 u64 lockstart = (offset + mask) & ~mask;
1801 u64 lockend = ((offset + len) & ~mask) - 1; 1801 u64 lockend = ((offset + len) & ~mask) - 1;
1802 u64 cur_offset = lockstart; 1802 u64 cur_offset = lockstart;
1803 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1803 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1804 u64 drop_end; 1804 u64 drop_end;
1805 unsigned long nr; 1805 unsigned long nr;
1806 int ret = 0; 1806 int ret = 0;
1807 int err = 0; 1807 int err = 0;
1808 bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1808 bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
1809 ((offset + len) >> PAGE_CACHE_SHIFT); 1809 ((offset + len) >> PAGE_CACHE_SHIFT);
1810 1810
1811 btrfs_wait_ordered_range(inode, offset, len); 1811 btrfs_wait_ordered_range(inode, offset, len);
1812 1812
1813 mutex_lock(&inode->i_mutex); 1813 mutex_lock(&inode->i_mutex);
1814 if (offset >= inode->i_size) { 1814 if (offset >= inode->i_size) {
1815 mutex_unlock(&inode->i_mutex); 1815 mutex_unlock(&inode->i_mutex);
1816 return 0; 1816 return 0;
1817 } 1817 }
1818 1818
1819 /* 1819 /*
1820 * Only do this if we are in the same page and we aren't doing the 1820 * Only do this if we are in the same page and we aren't doing the
1821 * entire page. 1821 * entire page.
1822 */ 1822 */
1823 if (same_page && len < PAGE_CACHE_SIZE) { 1823 if (same_page && len < PAGE_CACHE_SIZE) {
1824 ret = btrfs_truncate_page(inode, offset, len, 0); 1824 ret = btrfs_truncate_page(inode, offset, len, 0);
1825 mutex_unlock(&inode->i_mutex); 1825 mutex_unlock(&inode->i_mutex);
1826 return ret; 1826 return ret;
1827 } 1827 }
1828 1828
1829 /* zero back part of the first page */ 1829 /* zero back part of the first page */
1830 ret = btrfs_truncate_page(inode, offset, 0, 0); 1830 ret = btrfs_truncate_page(inode, offset, 0, 0);
1831 if (ret) { 1831 if (ret) {
1832 mutex_unlock(&inode->i_mutex); 1832 mutex_unlock(&inode->i_mutex);
1833 return ret; 1833 return ret;
1834 } 1834 }
1835 1835
1836 /* zero the front end of the last page */ 1836 /* zero the front end of the last page */
1837 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1837 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1838 if (ret) { 1838 if (ret) {
1839 mutex_unlock(&inode->i_mutex); 1839 mutex_unlock(&inode->i_mutex);
1840 return ret; 1840 return ret;
1841 } 1841 }
1842 1842
1843 if (lockend < lockstart) { 1843 if (lockend < lockstart) {
1844 mutex_unlock(&inode->i_mutex); 1844 mutex_unlock(&inode->i_mutex);
1845 return 0; 1845 return 0;
1846 } 1846 }
1847 1847
1848 while (1) { 1848 while (1) {
1849 struct btrfs_ordered_extent *ordered; 1849 struct btrfs_ordered_extent *ordered;
1850 1850
1851 truncate_pagecache_range(inode, lockstart, lockend); 1851 truncate_pagecache_range(inode, lockstart, lockend);
1852 1852
1853 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1853 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1854 0, &cached_state); 1854 0, &cached_state);
1855 ordered = btrfs_lookup_first_ordered_extent(inode, lockend); 1855 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
1856 1856
1857 /* 1857 /*
1858 * We need to make sure we have no ordered extents in this range 1858 * We need to make sure we have no ordered extents in this range
1859 * and nobody raced in and read a page in this range; if someone did, 1859 * and nobody raced in and read a page in this range; if someone did,
1860 * we need to try again. 1860 * we need to try again.
1861 */ 1861 */
1862 if ((!ordered || 1862 if ((!ordered ||
1863 (ordered->file_offset + ordered->len < lockstart || 1863 (ordered->file_offset + ordered->len < lockstart ||
1864 ordered->file_offset > lockend)) && 1864 ordered->file_offset > lockend)) &&
1865 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, 1865 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
1866 lockend, EXTENT_UPTODATE, 0, 1866 lockend, EXTENT_UPTODATE, 0,
1867 cached_state)) { 1867 cached_state)) {
1868 if (ordered) 1868 if (ordered)
1869 btrfs_put_ordered_extent(ordered); 1869 btrfs_put_ordered_extent(ordered);
1870 break; 1870 break;
1871 } 1871 }
1872 if (ordered) 1872 if (ordered)
1873 btrfs_put_ordered_extent(ordered); 1873 btrfs_put_ordered_extent(ordered);
1874 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, 1874 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
1875 lockend, &cached_state, GFP_NOFS); 1875 lockend, &cached_state, GFP_NOFS);
1876 btrfs_wait_ordered_range(inode, lockstart, 1876 btrfs_wait_ordered_range(inode, lockstart,
1877 lockend - lockstart + 1); 1877 lockend - lockstart + 1);
1878 } 1878 }
1879 1879
1880 path = btrfs_alloc_path(); 1880 path = btrfs_alloc_path();
1881 if (!path) { 1881 if (!path) {
1882 ret = -ENOMEM; 1882 ret = -ENOMEM;
1883 goto out; 1883 goto out;
1884 } 1884 }
1885 1885
1886 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 1886 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
1887 if (!rsv) { 1887 if (!rsv) {
1888 ret = -ENOMEM; 1888 ret = -ENOMEM;
1889 goto out_free; 1889 goto out_free;
1890 } 1890 }
1891 rsv->size = btrfs_calc_trunc_metadata_size(root, 1); 1891 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
1892 rsv->failfast = 1; 1892 rsv->failfast = 1;
1893 1893
1894 /* 1894 /*
1895 * 1 - update the inode 1895 * 1 - update the inode
1896 * 1 - removing the extents in the range 1896 * 1 - removing the extents in the range
1897 * 1 - adding the hole extent 1897 * 1 - adding the hole extent
1898 */ 1898 */
1899 trans = btrfs_start_transaction(root, 3); 1899 trans = btrfs_start_transaction(root, 3);
1900 if (IS_ERR(trans)) { 1900 if (IS_ERR(trans)) {
1901 err = PTR_ERR(trans); 1901 err = PTR_ERR(trans);
1902 goto out_free; 1902 goto out_free;
1903 } 1903 }
1904 1904
1905 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 1905 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
1906 min_size); 1906 min_size);
1907 BUG_ON(ret); 1907 BUG_ON(ret);
1908 trans->block_rsv = rsv; 1908 trans->block_rsv = rsv;
1909 1909
1910 while (cur_offset < lockend) { 1910 while (cur_offset < lockend) {
1911 ret = __btrfs_drop_extents(trans, root, inode, path, 1911 ret = __btrfs_drop_extents(trans, root, inode, path,
1912 cur_offset, lockend + 1, 1912 cur_offset, lockend + 1,
1913 &drop_end, 1); 1913 &drop_end, 1);
1914 if (ret != -ENOSPC) 1914 if (ret != -ENOSPC)
1915 break; 1915 break;
1916 1916
1917 trans->block_rsv = &root->fs_info->trans_block_rsv; 1917 trans->block_rsv = &root->fs_info->trans_block_rsv;
1918 1918
1919 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 1919 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1920 if (ret) { 1920 if (ret) {
1921 err = ret; 1921 err = ret;
1922 break; 1922 break;
1923 } 1923 }
1924 1924
1925 cur_offset = drop_end; 1925 cur_offset = drop_end;
1926 1926
1927 ret = btrfs_update_inode(trans, root, inode); 1927 ret = btrfs_update_inode(trans, root, inode);
1928 if (ret) { 1928 if (ret) {
1929 err = ret; 1929 err = ret;
1930 break; 1930 break;
1931 } 1931 }
1932 1932
1933 nr = trans->blocks_used; 1933 nr = trans->blocks_used;
1934 btrfs_end_transaction(trans, root); 1934 btrfs_end_transaction(trans, root);
1935 btrfs_btree_balance_dirty(root, nr); 1935 btrfs_btree_balance_dirty(root, nr);
1936 1936
1937 trans = btrfs_start_transaction(root, 3); 1937 trans = btrfs_start_transaction(root, 3);
1938 if (IS_ERR(trans)) { 1938 if (IS_ERR(trans)) {
1939 ret = PTR_ERR(trans); 1939 ret = PTR_ERR(trans);
1940 trans = NULL; 1940 trans = NULL;
1941 break; 1941 break;
1942 } 1942 }
1943 1943
1944 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, 1944 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
1945 rsv, min_size); 1945 rsv, min_size);
1946 BUG_ON(ret); /* shouldn't happen */ 1946 BUG_ON(ret); /* shouldn't happen */
1947 trans->block_rsv = rsv; 1947 trans->block_rsv = rsv;
1948 } 1948 }
1949 1949
1950 if (ret) { 1950 if (ret) {
1951 err = ret; 1951 err = ret;
1952 goto out_trans; 1952 goto out_trans;
1953 } 1953 }
1954 1954
1955 trans->block_rsv = &root->fs_info->trans_block_rsv; 1955 trans->block_rsv = &root->fs_info->trans_block_rsv;
1956 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 1956 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1957 if (ret) { 1957 if (ret) {
1958 err = ret; 1958 err = ret;
1959 goto out_trans; 1959 goto out_trans;
1960 } 1960 }
1961 1961
1962 out_trans: 1962 out_trans:
1963 if (!trans) 1963 if (!trans)
1964 goto out_free; 1964 goto out_free;
1965 1965
1966 trans->block_rsv = &root->fs_info->trans_block_rsv; 1966 trans->block_rsv = &root->fs_info->trans_block_rsv;
1967 ret = btrfs_update_inode(trans, root, inode); 1967 ret = btrfs_update_inode(trans, root, inode);
1968 nr = trans->blocks_used; 1968 nr = trans->blocks_used;
1969 btrfs_end_transaction(trans, root); 1969 btrfs_end_transaction(trans, root);
1970 btrfs_btree_balance_dirty(root, nr); 1970 btrfs_btree_balance_dirty(root, nr);
1971 out_free: 1971 out_free:
1972 btrfs_free_path(path); 1972 btrfs_free_path(path);
1973 btrfs_free_block_rsv(root, rsv); 1973 btrfs_free_block_rsv(root, rsv);
1974 out: 1974 out:
1975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1976 &cached_state, GFP_NOFS); 1976 &cached_state, GFP_NOFS);
1977 mutex_unlock(&inode->i_mutex); 1977 mutex_unlock(&inode->i_mutex);
1978 if (ret && !err) 1978 if (ret && !err)
1979 err = ret; 1979 err = ret;
1980 return err; 1980 return err;
1981 } 1981 }
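
The mask arithmetic in btrfs_punch_hole() above and btrfs_fallocate() below rounds in opposite directions. A worked userspace example (offset and len are arbitrary unaligned values):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectorsize = 4096, mask = sectorsize - 1;
	uint64_t offset = 1000, len = 10000;	/* deliberately unaligned */

	/* punch-hole: only whole sectors inside [offset, offset + len)
	 * may be dropped, so round the start up and the end down */
	uint64_t lockstart = (offset + mask) & ~mask;
	uint64_t lockend = ((offset + len) & ~mask) - 1;

	/* fallocate goes the other way: it must cover the caller's
	 * range, so round the start down and the end up */
	uint64_t alloc_start = offset & ~mask;
	uint64_t alloc_end = (offset + len + mask) & ~mask;

	printf("hole locks [%llu, %llu], falloc covers [%llu, %llu)\n",
	       (unsigned long long)lockstart, (unsigned long long)lockend,
	       (unsigned long long)alloc_start, (unsigned long long)alloc_end);
	return 0;
}
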
1982 1982
1983 static long btrfs_fallocate(struct file *file, int mode, 1983 static long btrfs_fallocate(struct file *file, int mode,
1984 loff_t offset, loff_t len) 1984 loff_t offset, loff_t len)
1985 { 1985 {
1986 struct inode *inode = file->f_path.dentry->d_inode; 1986 struct inode *inode = file->f_path.dentry->d_inode;
1987 struct extent_state *cached_state = NULL; 1987 struct extent_state *cached_state = NULL;
1988 u64 cur_offset; 1988 u64 cur_offset;
1989 u64 last_byte; 1989 u64 last_byte;
1990 u64 alloc_start; 1990 u64 alloc_start;
1991 u64 alloc_end; 1991 u64 alloc_end;
1992 u64 alloc_hint = 0; 1992 u64 alloc_hint = 0;
1993 u64 locked_end; 1993 u64 locked_end;
1994 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1994 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1995 struct extent_map *em; 1995 struct extent_map *em;
1996 int ret; 1996 int ret;
1997 1997
1998 alloc_start = offset & ~mask; 1998 alloc_start = offset & ~mask;
1999 alloc_end = (offset + len + mask) & ~mask; 1999 alloc_end = (offset + len + mask) & ~mask;
2000 2000
2001 /* Make sure we aren't being given some crap mode */ 2001 /* Make sure we aren't being given some crap mode */
2002 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2002 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2003 return -EOPNOTSUPP; 2003 return -EOPNOTSUPP;
2004 2004
2005 if (mode & FALLOC_FL_PUNCH_HOLE) 2005 if (mode & FALLOC_FL_PUNCH_HOLE)
2006 return btrfs_punch_hole(inode, offset, len); 2006 return btrfs_punch_hole(inode, offset, len);
2007 2007
2008 /* 2008 /*
2009 * Make sure we have enough space before we do the 2009 * Make sure we have enough space before we do the
2010 * allocation. 2010 * allocation.
2011 */ 2011 */
2012 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2012 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
2013 if (ret) 2013 if (ret)
2014 return ret; 2014 return ret;
2015 2015
2016 /* 2016 /*
2017 * wait for ordered IO before we have any locks. We'll loop again 2017 * wait for ordered IO before we have any locks. We'll loop again
2018 * below with the locks held. 2018 * below with the locks held.
2019 */ 2019 */
2020 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); 2020 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2021 2021
2022 mutex_lock(&inode->i_mutex); 2022 mutex_lock(&inode->i_mutex);
2023 ret = inode_newsize_ok(inode, alloc_end); 2023 ret = inode_newsize_ok(inode, alloc_end);
2024 if (ret) 2024 if (ret)
2025 goto out; 2025 goto out;
2026 2026
2027 if (alloc_start > inode->i_size) { 2027 if (alloc_start > inode->i_size) {
2028 ret = btrfs_cont_expand(inode, i_size_read(inode), 2028 ret = btrfs_cont_expand(inode, i_size_read(inode),
2029 alloc_start); 2029 alloc_start);
2030 if (ret) 2030 if (ret)
2031 goto out; 2031 goto out;
2032 } 2032 }
2033 2033
2034 locked_end = alloc_end - 1; 2034 locked_end = alloc_end - 1;
2035 while (1) { 2035 while (1) {
2036 struct btrfs_ordered_extent *ordered; 2036 struct btrfs_ordered_extent *ordered;
2037 2037
2038 /* the extent lock is ordered inside the running 2038 /* the extent lock is ordered inside the running
2039 * transaction 2039 * transaction
2040 */ 2040 */
2041 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, 2041 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2042 locked_end, 0, &cached_state); 2042 locked_end, 0, &cached_state);
2043 ordered = btrfs_lookup_first_ordered_extent(inode, 2043 ordered = btrfs_lookup_first_ordered_extent(inode,
2044 alloc_end - 1); 2044 alloc_end - 1);
2045 if (ordered && 2045 if (ordered &&
2046 ordered->file_offset + ordered->len > alloc_start && 2046 ordered->file_offset + ordered->len > alloc_start &&
2047 ordered->file_offset < alloc_end) { 2047 ordered->file_offset < alloc_end) {
2048 btrfs_put_ordered_extent(ordered); 2048 btrfs_put_ordered_extent(ordered);
2049 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 2049 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2050 alloc_start, locked_end, 2050 alloc_start, locked_end,
2051 &cached_state, GFP_NOFS); 2051 &cached_state, GFP_NOFS);
2052 /* 2052 /*
2053 * we can't wait on the range with the transaction 2053 * we can't wait on the range with the transaction
2054 * running or with the extent lock held 2054 * running or with the extent lock held
2055 */ 2055 */
2056 btrfs_wait_ordered_range(inode, alloc_start, 2056 btrfs_wait_ordered_range(inode, alloc_start,
2057 alloc_end - alloc_start); 2057 alloc_end - alloc_start);
2058 } else { 2058 } else {
2059 if (ordered) 2059 if (ordered)
2060 btrfs_put_ordered_extent(ordered); 2060 btrfs_put_ordered_extent(ordered);
2061 break; 2061 break;
2062 } 2062 }
2063 } 2063 }
2064 2064
2065 cur_offset = alloc_start; 2065 cur_offset = alloc_start;
2066 while (1) { 2066 while (1) {
2067 u64 actual_end; 2067 u64 actual_end;
2068 2068
2069 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 2069 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2070 alloc_end - cur_offset, 0); 2070 alloc_end - cur_offset, 0);
2071 if (IS_ERR_OR_NULL(em)) { 2071 if (IS_ERR_OR_NULL(em)) {
2072 if (!em) 2072 if (!em)
2073 ret = -ENOMEM; 2073 ret = -ENOMEM;
2074 else 2074 else
2075 ret = PTR_ERR(em); 2075 ret = PTR_ERR(em);
2076 break; 2076 break;
2077 } 2077 }
2078 last_byte = min(extent_map_end(em), alloc_end); 2078 last_byte = min(extent_map_end(em), alloc_end);
2079 actual_end = min_t(u64, extent_map_end(em), offset + len); 2079 actual_end = min_t(u64, extent_map_end(em), offset + len);
2080 last_byte = (last_byte + mask) & ~mask; 2080 last_byte = (last_byte + mask) & ~mask;
2081 2081
2082 if (em->block_start == EXTENT_MAP_HOLE || 2082 if (em->block_start == EXTENT_MAP_HOLE ||
2083 (cur_offset >= inode->i_size && 2083 (cur_offset >= inode->i_size &&
2084 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 2084 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2085 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 2085 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
2086 last_byte - cur_offset, 2086 last_byte - cur_offset,
2087 1 << inode->i_blkbits, 2087 1 << inode->i_blkbits,
2088 offset + len, 2088 offset + len,
2089 &alloc_hint); 2089 &alloc_hint);
2090 2090
2091 if (ret < 0) { 2091 if (ret < 0) {
2092 free_extent_map(em); 2092 free_extent_map(em);
2093 break; 2093 break;
2094 } 2094 }
2095 } else if (actual_end > inode->i_size && 2095 } else if (actual_end > inode->i_size &&
2096 !(mode & FALLOC_FL_KEEP_SIZE)) { 2096 !(mode & FALLOC_FL_KEEP_SIZE)) {
2097 /* 2097 /*
2098 * We didn't need to allocate any more space, but we 2098 * We didn't need to allocate any more space, but we
2099 * still extended the size of the file so we need to 2099 * still extended the size of the file so we need to
2100 * update i_size. 2100 * update i_size.
2101 */ 2101 */
2102 inode->i_ctime = CURRENT_TIME; 2102 inode->i_ctime = CURRENT_TIME;
2103 i_size_write(inode, actual_end); 2103 i_size_write(inode, actual_end);
2104 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2104 btrfs_ordered_update_i_size(inode, actual_end, NULL);
2105 } 2105 }
2106 free_extent_map(em); 2106 free_extent_map(em);
2107 2107
2108 cur_offset = last_byte; 2108 cur_offset = last_byte;
2109 if (cur_offset >= alloc_end) { 2109 if (cur_offset >= alloc_end) {
2110 ret = 0; 2110 ret = 0;
2111 break; 2111 break;
2112 } 2112 }
2113 } 2113 }
2114 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2114 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2115 &cached_state, GFP_NOFS); 2115 &cached_state, GFP_NOFS);
2116 out: 2116 out:
2117 mutex_unlock(&inode->i_mutex); 2117 mutex_unlock(&inode->i_mutex);
2118 /* Let go of our reservation. */ 2118 /* Let go of our reservation. */
2119 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2119 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
2120 return ret; 2120 return ret;
2121 } 2121 }
2122 2122
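/*
 * A small sketch of the sector-alignment arithmetic btrfs_fallocate()
 * starts with: with mask = sectorsize - 1, "offset & ~mask" rounds the
 * start down to a sector boundary and "(offset + len + mask) & ~mask"
 * rounds the end up.  The 4K sector size and offsets are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sectorsize = 4096, mask = sectorsize - 1;
	uint64_t offset = 5000, len = 3000;
	uint64_t alloc_start = offset & ~mask;              /* 4096 */
	uint64_t alloc_end = (offset + len + mask) & ~mask; /* 8192 */

	printf("aligned range [%llu, %llu)\n",
	       (unsigned long long)alloc_start,
	       (unsigned long long)alloc_end);
	return 0;
}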
2123 static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) 2123 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2124 { 2124 {
2125 struct btrfs_root *root = BTRFS_I(inode)->root; 2125 struct btrfs_root *root = BTRFS_I(inode)->root;
2126 struct extent_map *em; 2126 struct extent_map *em;
2127 struct extent_state *cached_state = NULL; 2127 struct extent_state *cached_state = NULL;
2128 u64 lockstart = *offset; 2128 u64 lockstart = *offset;
2129 u64 lockend = i_size_read(inode); 2129 u64 lockend = i_size_read(inode);
2130 u64 start = *offset; 2130 u64 start = *offset;
2131 u64 orig_start = *offset; 2131 u64 orig_start = *offset;
2132 u64 len = i_size_read(inode); 2132 u64 len = i_size_read(inode);
2133 u64 last_end = 0; 2133 u64 last_end = 0;
2134 int ret = 0; 2134 int ret = 0;
2135 2135
2136 lockend = max_t(u64, root->sectorsize, lockend); 2136 lockend = max_t(u64, root->sectorsize, lockend);
2137 if (lockend <= lockstart) 2137 if (lockend <= lockstart)
2138 lockend = lockstart + root->sectorsize; 2138 lockend = lockstart + root->sectorsize;
2139 2139
2140 len = lockend - lockstart + 1; 2140 len = lockend - lockstart + 1;
2141 2141
2142 len = max_t(u64, len, root->sectorsize); 2142 len = max_t(u64, len, root->sectorsize);
2143 if (inode->i_size == 0) 2143 if (inode->i_size == 0)
2144 return -ENXIO; 2144 return -ENXIO;
2145 2145
2146 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, 2146 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2147 &cached_state); 2147 &cached_state);
2148 2148
2149 /* 2149 /*
2150 * Delalloc is such a pain. If we have a hole and we have pending 2150 * Delalloc is such a pain. If we have a hole and we have pending
2151 * delalloc for a portion of the hole we will get back a hole that 2151 * delalloc for a portion of the hole we will get back a hole that
2152 * exists for the entire range since it hasn't been actually written 2152 * exists for the entire range since it hasn't been actually written
2153 * yet. So to take care of this case we need to look for an extent just 2153 * yet. So to take care of this case we need to look for an extent just
2154 * before the position we want in case there is outstanding delalloc 2154 * before the position we want in case there is outstanding delalloc
2155 * going on here. 2155 * going on here.
2156 */ 2156 */
2157 if (origin == SEEK_HOLE && start != 0) { 2157 if (whence == SEEK_HOLE && start != 0) {
2158 if (start <= root->sectorsize) 2158 if (start <= root->sectorsize)
2159 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, 2159 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
2160 root->sectorsize, 0); 2160 root->sectorsize, 0);
2161 else 2161 else
2162 em = btrfs_get_extent_fiemap(inode, NULL, 0, 2162 em = btrfs_get_extent_fiemap(inode, NULL, 0,
2163 start - root->sectorsize, 2163 start - root->sectorsize,
2164 root->sectorsize, 0); 2164 root->sectorsize, 0);
2165 if (IS_ERR(em)) { 2165 if (IS_ERR(em)) {
2166 ret = PTR_ERR(em); 2166 ret = PTR_ERR(em);
2167 goto out; 2167 goto out;
2168 } 2168 }
2169 last_end = em->start + em->len; 2169 last_end = em->start + em->len;
2170 if (em->block_start == EXTENT_MAP_DELALLOC) 2170 if (em->block_start == EXTENT_MAP_DELALLOC)
2171 last_end = min_t(u64, last_end, inode->i_size); 2171 last_end = min_t(u64, last_end, inode->i_size);
2172 free_extent_map(em); 2172 free_extent_map(em);
2173 } 2173 }
2174 2174
2175 while (1) { 2175 while (1) {
2176 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); 2176 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
2177 if (IS_ERR(em)) { 2177 if (IS_ERR(em)) {
2178 ret = PTR_ERR(em); 2178 ret = PTR_ERR(em);
2179 break; 2179 break;
2180 } 2180 }
2181 2181
2182 if (em->block_start == EXTENT_MAP_HOLE) { 2182 if (em->block_start == EXTENT_MAP_HOLE) {
2183 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 2183 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2184 if (last_end <= orig_start) { 2184 if (last_end <= orig_start) {
2185 free_extent_map(em); 2185 free_extent_map(em);
2186 ret = -ENXIO; 2186 ret = -ENXIO;
2187 break; 2187 break;
2188 } 2188 }
2189 } 2189 }
2190 2190
2191 if (origin == SEEK_HOLE) { 2191 if (whence == SEEK_HOLE) {
2192 *offset = start; 2192 *offset = start;
2193 free_extent_map(em); 2193 free_extent_map(em);
2194 break; 2194 break;
2195 } 2195 }
2196 } else { 2196 } else {
2197 if (origin == SEEK_DATA) { 2197 if (whence == SEEK_DATA) {
2198 if (em->block_start == EXTENT_MAP_DELALLOC) { 2198 if (em->block_start == EXTENT_MAP_DELALLOC) {
2199 if (start >= inode->i_size) { 2199 if (start >= inode->i_size) {
2200 free_extent_map(em); 2200 free_extent_map(em);
2201 ret = -ENXIO; 2201 ret = -ENXIO;
2202 break; 2202 break;
2203 } 2203 }
2204 } 2204 }
2205 2205
2206 *offset = start; 2206 *offset = start;
2207 free_extent_map(em); 2207 free_extent_map(em);
2208 break; 2208 break;
2209 } 2209 }
2210 } 2210 }
2211 2211
2212 start = em->start + em->len; 2212 start = em->start + em->len;
2213 last_end = em->start + em->len; 2213 last_end = em->start + em->len;
2214 2214
2215 if (em->block_start == EXTENT_MAP_DELALLOC) 2215 if (em->block_start == EXTENT_MAP_DELALLOC)
2216 last_end = min_t(u64, last_end, inode->i_size); 2216 last_end = min_t(u64, last_end, inode->i_size);
2217 2217
2218 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 2218 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2219 free_extent_map(em); 2219 free_extent_map(em);
2220 ret = -ENXIO; 2220 ret = -ENXIO;
2221 break; 2221 break;
2222 } 2222 }
2223 free_extent_map(em); 2223 free_extent_map(em);
2224 cond_resched(); 2224 cond_resched();
2225 } 2225 }
2226 if (!ret) 2226 if (!ret)
2227 *offset = min(*offset, inode->i_size); 2227 *offset = min(*offset, inode->i_size);
2228 out: 2228 out:
2229 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2229 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2230 &cached_state, GFP_NOFS); 2230 &cached_state, GFP_NOFS);
2231 return ret; 2231 return ret;
2232 } 2232 }
2233 2233
2234 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) 2234 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2235 { 2235 {
2236 struct inode *inode = file->f_mapping->host; 2236 struct inode *inode = file->f_mapping->host;
2237 int ret; 2237 int ret;
2238 2238
2239 mutex_lock(&inode->i_mutex); 2239 mutex_lock(&inode->i_mutex);
2240 switch (origin) { 2240 switch (whence) {
2241 case SEEK_END: 2241 case SEEK_END:
2242 case SEEK_CUR: 2242 case SEEK_CUR:
2243 offset = generic_file_llseek(file, offset, origin); 2243 offset = generic_file_llseek(file, offset, whence);
2244 goto out; 2244 goto out;
2245 case SEEK_DATA: 2245 case SEEK_DATA:
2246 case SEEK_HOLE: 2246 case SEEK_HOLE:
2247 if (offset >= i_size_read(inode)) { 2247 if (offset >= i_size_read(inode)) {
2248 mutex_unlock(&inode->i_mutex); 2248 mutex_unlock(&inode->i_mutex);
2249 return -ENXIO; 2249 return -ENXIO;
2250 } 2250 }
2251 2251
2252 ret = find_desired_extent(inode, &offset, origin); 2252 ret = find_desired_extent(inode, &offset, whence);
2253 if (ret) { 2253 if (ret) {
2254 mutex_unlock(&inode->i_mutex); 2254 mutex_unlock(&inode->i_mutex);
2255 return ret; 2255 return ret;
2256 } 2256 }
2257 } 2257 }
2258 2258
2259 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { 2259 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
2260 offset = -EINVAL; 2260 offset = -EINVAL;
2261 goto out; 2261 goto out;
2262 } 2262 }
2263 if (offset > inode->i_sb->s_maxbytes) { 2263 if (offset > inode->i_sb->s_maxbytes) {
2264 offset = -EINVAL; 2264 offset = -EINVAL;
2265 goto out; 2265 goto out;
2266 } 2266 }
2267 2267
2268 /* Special lock needed here? */ 2268 /* Special lock needed here? */
2269 if (offset != file->f_pos) { 2269 if (offset != file->f_pos) {
2270 file->f_pos = offset; 2270 file->f_pos = offset;
2271 file->f_version = 0; 2271 file->f_version = 0;
2272 } 2272 }
2273 out: 2273 out:
2274 mutex_unlock(&inode->i_mutex); 2274 mutex_unlock(&inode->i_mutex);
2275 return offset; 2275 return offset;
2276 } 2276 }
2277 2277
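/*
 * User-space view of the SEEK_DATA/SEEK_HOLE semantics that
 * btrfs_file_llseek() implements through find_desired_extent(): walk
 * a (possibly sparse) file and print each data segment.  A hedged
 * sketch; the file name is hypothetical, and SEEK_DATA past EOF fails
 * with errno == ENXIO, which ends the loop.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("sparse.img", O_RDONLY);
	off_t end, data = 0;

	if (fd < 0)
		return 1;
	end = lseek(fd, 0, SEEK_END);
	while (data < end) {
		off_t hole;

		data = lseek(fd, data, SEEK_DATA);
		if (data < 0)
			break;		/* trailing hole: ENXIO */
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data [%lld, %lld)\n", (long long)data,
		       (long long)hole);
		data = hole;
	}
	close(fd);
	return 0;
}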
2278 const struct file_operations btrfs_file_operations = { 2278 const struct file_operations btrfs_file_operations = {
2279 .llseek = btrfs_file_llseek, 2279 .llseek = btrfs_file_llseek,
2280 .read = do_sync_read, 2280 .read = do_sync_read,
2281 .write = do_sync_write, 2281 .write = do_sync_write,
2282 .aio_read = generic_file_aio_read, 2282 .aio_read = generic_file_aio_read,
2283 .splice_read = generic_file_splice_read, 2283 .splice_read = generic_file_splice_read,
2284 .aio_write = btrfs_file_aio_write, 2284 .aio_write = btrfs_file_aio_write,
2285 .mmap = btrfs_file_mmap, 2285 .mmap = btrfs_file_mmap,
2286 .open = generic_file_open, 2286 .open = generic_file_open,
2287 .release = btrfs_release_file, 2287 .release = btrfs_release_file,
2288 .fsync = btrfs_sync_file, 2288 .fsync = btrfs_sync_file,
2289 .fallocate = btrfs_fallocate, 2289 .fallocate = btrfs_fallocate,
2290 .unlocked_ioctl = btrfs_ioctl, 2290 .unlocked_ioctl = btrfs_ioctl,
2291 #ifdef CONFIG_COMPAT 2291 #ifdef CONFIG_COMPAT
2292 .compat_ioctl = btrfs_ioctl, 2292 .compat_ioctl = btrfs_ioctl,
2293 #endif 2293 #endif
2294 }; 2294 };
2295 2295
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/spinlock.h> 3 #include <linux/spinlock.h>
4 #include <linux/fs_struct.h> 4 #include <linux/fs_struct.h>
5 #include <linux/namei.h> 5 #include <linux/namei.h>
6 #include <linux/slab.h> 6 #include <linux/slab.h>
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 8
9 #include "super.h" 9 #include "super.h"
10 #include "mds_client.h" 10 #include "mds_client.h"
11 11
12 /* 12 /*
13 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
14 * rename, etc. 14 * rename, etc.
15 */ 15 */
16 16
17 /* 17 /*
18 * Ceph MDS operations are specified in terms of a base ino and 18 * Ceph MDS operations are specified in terms of a base ino and
19 * relative path. Thus, the client can specify an operation on a 19 * relative path. Thus, the client can specify an operation on a
20 * specific inode (e.g., a getattr due to fstat(2)), or as a path 20 * specific inode (e.g., a getattr due to fstat(2)), or as a path
21 * relative to, say, the root directory. 21 * relative to, say, the root directory.
22 * 22 *
23 * Normally, we limit ourselves to strict inode ops (no path component) 23 * Normally, we limit ourselves to strict inode ops (no path component)
24 * or dentry operations (a single path component relative to an ino). The 24 * or dentry operations (a single path component relative to an ino). The
25 * exception to this is open_root_dentry(), which will open the mount 25 * exception to this is open_root_dentry(), which will open the mount
26 * point by name. 26 * point by name.
27 */ 27 */
28 28
29 const struct inode_operations ceph_dir_iops; 29 const struct inode_operations ceph_dir_iops;
30 const struct file_operations ceph_dir_fops; 30 const struct file_operations ceph_dir_fops;
31 const struct dentry_operations ceph_dentry_ops; 31 const struct dentry_operations ceph_dentry_ops;
32 32
33 /* 33 /*
34 * Initialize ceph dentry state. 34 * Initialize ceph dentry state.
35 */ 35 */
36 int ceph_init_dentry(struct dentry *dentry) 36 int ceph_init_dentry(struct dentry *dentry)
37 { 37 {
38 struct ceph_dentry_info *di; 38 struct ceph_dentry_info *di;
39 39
40 if (dentry->d_fsdata) 40 if (dentry->d_fsdata)
41 return 0; 41 return 0;
42 42
43 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); 43 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
44 if (!di) 44 if (!di)
45 return -ENOMEM; /* oh well */ 45 return -ENOMEM; /* oh well */
46 46
47 spin_lock(&dentry->d_lock); 47 spin_lock(&dentry->d_lock);
48 if (dentry->d_fsdata) { 48 if (dentry->d_fsdata) {
49 /* lost a race */ 49 /* lost a race */
50 kmem_cache_free(ceph_dentry_cachep, di); 50 kmem_cache_free(ceph_dentry_cachep, di);
51 goto out_unlock; 51 goto out_unlock;
52 } 52 }
53 53
54 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) 54 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
55 d_set_d_op(dentry, &ceph_dentry_ops); 55 d_set_d_op(dentry, &ceph_dentry_ops);
56 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) 56 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
57 d_set_d_op(dentry, &ceph_snapdir_dentry_ops); 57 d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
58 else 58 else
59 d_set_d_op(dentry, &ceph_snap_dentry_ops); 59 d_set_d_op(dentry, &ceph_snap_dentry_ops);
60 60
61 di->dentry = dentry; 61 di->dentry = dentry;
62 di->lease_session = NULL; 62 di->lease_session = NULL;
63 dentry->d_time = jiffies; 63 dentry->d_time = jiffies;
64 /* avoid reordering d_fsdata setup so that the check above is safe */ 64 /* avoid reordering d_fsdata setup so that the check above is safe */
65 smp_mb(); 65 smp_mb();
66 dentry->d_fsdata = di; 66 dentry->d_fsdata = di;
67 ceph_dentry_lru_add(dentry); 67 ceph_dentry_lru_add(dentry);
68 out_unlock: 68 out_unlock:
69 spin_unlock(&dentry->d_lock); 69 spin_unlock(&dentry->d_lock);
70 return 0; 70 return 0;
71 } 71 }
72 72
73 struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) 73 struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
74 { 74 {
75 struct inode *inode = NULL; 75 struct inode *inode = NULL;
76 76
77 if (!dentry) 77 if (!dentry)
78 return NULL; 78 return NULL;
79 79
80 spin_lock(&dentry->d_lock); 80 spin_lock(&dentry->d_lock);
81 if (!IS_ROOT(dentry)) { 81 if (!IS_ROOT(dentry)) {
82 inode = dentry->d_parent->d_inode; 82 inode = dentry->d_parent->d_inode;
83 ihold(inode); 83 ihold(inode);
84 } 84 }
85 spin_unlock(&dentry->d_lock); 85 spin_unlock(&dentry->d_lock);
86 return inode; 86 return inode;
87 } 87 }
88 88
89 89
90 /* 90 /*
91 * for readdir, we encode the directory frag and offset within that 91 * for readdir, we encode the directory frag and offset within that
92 * frag into f_pos. 92 * frag into f_pos.
93 */ 93 */
94 static unsigned fpos_frag(loff_t p) 94 static unsigned fpos_frag(loff_t p)
95 { 95 {
96 return p >> 32; 96 return p >> 32;
97 } 97 }
98 static unsigned fpos_off(loff_t p) 98 static unsigned fpos_off(loff_t p)
99 { 99 {
100 return p & 0xffffffff; 100 return p & 0xffffffff;
101 } 101 }
102 102
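/*
 * Illustrative inverse of the two decoders above: a readdir position
 * packs the frag into the high 32 bits and the within-frag offset
 * into the low 32 bits (the composition the ceph_make_fpos() calls
 * later in this file rely on).  A user-space sketch:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t fpos_make(unsigned frag, unsigned off)
{
	return ((uint64_t)frag << 32) | off;
}

int main(void)
{
	uint64_t p = fpos_make(0x2a, 7);

	/* round-trips with fpos_frag()/fpos_off() above */
	printf("frag=%x off=%u\n", (unsigned)(p >> 32),
	       (unsigned)(p & 0xffffffff));
	return 0;
}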
103 /* 103 /*
104 * When possible, we try to satisfy a readdir by peeking at the 104 * When possible, we try to satisfy a readdir by peeking at the
105 * dcache. We make this work by carefully ordering dentries on 105 * dcache. We make this work by carefully ordering dentries on
106 * d_u.d_child when we initially get results back from the MDS, and 106 * d_u.d_child when we initially get results back from the MDS, and
107 * falling back to a "normal" sync readdir if any dentries in the dir 107 * falling back to a "normal" sync readdir if any dentries in the dir
108 * are dropped. 108 * are dropped.
109 * 109 *
110 * D_COMPLETE indicates we have all dentries in the dir. It is 110 * D_COMPLETE indicates we have all dentries in the dir. It is
111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
112 * the MDS if/when the directory is modified). 112 * the MDS if/when the directory is modified).
113 */ 113 */
114 static int __dcache_readdir(struct file *filp, 114 static int __dcache_readdir(struct file *filp,
115 void *dirent, filldir_t filldir) 115 void *dirent, filldir_t filldir)
116 { 116 {
117 struct ceph_file_info *fi = filp->private_data; 117 struct ceph_file_info *fi = filp->private_data;
118 struct dentry *parent = filp->f_dentry; 118 struct dentry *parent = filp->f_dentry;
119 struct inode *dir = parent->d_inode; 119 struct inode *dir = parent->d_inode;
120 struct list_head *p; 120 struct list_head *p;
121 struct dentry *dentry, *last; 121 struct dentry *dentry, *last;
122 struct ceph_dentry_info *di; 122 struct ceph_dentry_info *di;
123 int err = 0; 123 int err = 0;
124 124
125 /* claim ref on last dentry we returned */ 125 /* claim ref on last dentry we returned */
126 last = fi->dentry; 126 last = fi->dentry;
127 fi->dentry = NULL; 127 fi->dentry = NULL;
128 128
129 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 129 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
130 last); 130 last);
131 131
132 spin_lock(&parent->d_lock); 132 spin_lock(&parent->d_lock);
133 133
134 /* start at beginning? */ 134 /* start at beginning? */
135 if (filp->f_pos == 2 || last == NULL || 135 if (filp->f_pos == 2 || last == NULL ||
136 filp->f_pos < ceph_dentry(last)->offset) { 136 filp->f_pos < ceph_dentry(last)->offset) {
137 if (list_empty(&parent->d_subdirs)) 137 if (list_empty(&parent->d_subdirs))
138 goto out_unlock; 138 goto out_unlock;
139 p = parent->d_subdirs.prev; 139 p = parent->d_subdirs.prev;
140 dout(" initial p %p/%p\n", p->prev, p->next); 140 dout(" initial p %p/%p\n", p->prev, p->next);
141 } else { 141 } else {
142 p = last->d_u.d_child.prev; 142 p = last->d_u.d_child.prev;
143 } 143 }
144 144
145 more: 145 more:
146 dentry = list_entry(p, struct dentry, d_u.d_child); 146 dentry = list_entry(p, struct dentry, d_u.d_child);
147 di = ceph_dentry(dentry); 147 di = ceph_dentry(dentry);
148 while (1) { 148 while (1) {
149 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, 149 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
150 d_unhashed(dentry) ? "!hashed" : "hashed", 150 d_unhashed(dentry) ? "!hashed" : "hashed",
151 parent->d_subdirs.prev, parent->d_subdirs.next); 151 parent->d_subdirs.prev, parent->d_subdirs.next);
152 if (p == &parent->d_subdirs) { 152 if (p == &parent->d_subdirs) {
153 fi->flags |= CEPH_F_ATEND; 153 fi->flags |= CEPH_F_ATEND;
154 goto out_unlock; 154 goto out_unlock;
155 } 155 }
156 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 156 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
157 if (!d_unhashed(dentry) && dentry->d_inode && 157 if (!d_unhashed(dentry) && dentry->d_inode &&
158 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 158 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
159 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 159 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
160 filp->f_pos <= di->offset) 160 filp->f_pos <= di->offset)
161 break; 161 break;
162 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 162 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
163 dentry->d_name.len, dentry->d_name.name, di->offset, 163 dentry->d_name.len, dentry->d_name.name, di->offset,
164 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 164 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
165 !dentry->d_inode ? " null" : ""); 165 !dentry->d_inode ? " null" : "");
166 spin_unlock(&dentry->d_lock); 166 spin_unlock(&dentry->d_lock);
167 p = p->prev; 167 p = p->prev;
168 dentry = list_entry(p, struct dentry, d_u.d_child); 168 dentry = list_entry(p, struct dentry, d_u.d_child);
169 di = ceph_dentry(dentry); 169 di = ceph_dentry(dentry);
170 } 170 }
171 171
172 dget_dlock(dentry); 172 dget_dlock(dentry);
173 spin_unlock(&dentry->d_lock); 173 spin_unlock(&dentry->d_lock);
174 spin_unlock(&parent->d_lock); 174 spin_unlock(&parent->d_lock);
175 175
176 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 176 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
177 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 177 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
178 filp->f_pos = di->offset; 178 filp->f_pos = di->offset;
179 err = filldir(dirent, dentry->d_name.name, 179 err = filldir(dirent, dentry->d_name.name,
180 dentry->d_name.len, di->offset, 180 dentry->d_name.len, di->offset,
181 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), 181 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
182 dentry->d_inode->i_mode >> 12); 182 dentry->d_inode->i_mode >> 12);
183 183
184 if (last) { 184 if (last) {
185 if (err < 0) { 185 if (err < 0) {
186 /* remember our position */ 186 /* remember our position */
187 fi->dentry = last; 187 fi->dentry = last;
188 fi->next_offset = di->offset; 188 fi->next_offset = di->offset;
189 } else { 189 } else {
190 dput(last); 190 dput(last);
191 } 191 }
192 } 192 }
193 last = dentry; 193 last = dentry;
194 194
195 if (err < 0) 195 if (err < 0)
196 goto out; 196 goto out;
197 197
198 filp->f_pos++; 198 filp->f_pos++;
199 199
200 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 200 /* make sure a dentry wasn't dropped while we didn't have parent lock */
201 if (!ceph_dir_test_complete(dir)) { 201 if (!ceph_dir_test_complete(dir)) {
202 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); 202 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
203 err = -EAGAIN; 203 err = -EAGAIN;
204 goto out; 204 goto out;
205 } 205 }
206 206
207 spin_lock(&parent->d_lock); 207 spin_lock(&parent->d_lock);
208 p = p->prev; /* advance to next dentry */ 208 p = p->prev; /* advance to next dentry */
209 goto more; 209 goto more;
210 210
211 out_unlock: 211 out_unlock:
212 spin_unlock(&parent->d_lock); 212 spin_unlock(&parent->d_lock);
213 out: 213 out:
214 if (last) 214 if (last)
215 dput(last); 215 dput(last);
216 return err; 216 return err;
217 } 217 }
218 218
219 /* 219 /*
220 * make note of the last dentry we read, so we can 220 * make note of the last dentry we read, so we can
221 * continue at the same lexicographical point, 221 * continue at the same lexicographical point,
222 * regardless of what dir changes take place on the 222 * regardless of what dir changes take place on the
223 * server. 223 * server.
224 */ 224 */
225 static int note_last_dentry(struct ceph_file_info *fi, const char *name, 225 static int note_last_dentry(struct ceph_file_info *fi, const char *name,
226 int len) 226 int len)
227 { 227 {
228 kfree(fi->last_name); 228 kfree(fi->last_name);
229 fi->last_name = kmalloc(len+1, GFP_NOFS); 229 fi->last_name = kmalloc(len+1, GFP_NOFS);
230 if (!fi->last_name) 230 if (!fi->last_name)
231 return -ENOMEM; 231 return -ENOMEM;
232 memcpy(fi->last_name, name, len); 232 memcpy(fi->last_name, name, len);
233 fi->last_name[len] = 0; 233 fi->last_name[len] = 0;
234 dout("note_last_dentry '%s'\n", fi->last_name); 234 dout("note_last_dentry '%s'\n", fi->last_name);
235 return 0; 235 return 0;
236 } 236 }
237 237
238 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) 238 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
239 { 239 {
240 struct ceph_file_info *fi = filp->private_data; 240 struct ceph_file_info *fi = filp->private_data;
241 struct inode *inode = filp->f_dentry->d_inode; 241 struct inode *inode = filp->f_dentry->d_inode;
242 struct ceph_inode_info *ci = ceph_inode(inode); 242 struct ceph_inode_info *ci = ceph_inode(inode);
243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
244 struct ceph_mds_client *mdsc = fsc->mdsc; 244 struct ceph_mds_client *mdsc = fsc->mdsc;
245 unsigned frag = fpos_frag(filp->f_pos); 245 unsigned frag = fpos_frag(filp->f_pos);
246 int off = fpos_off(filp->f_pos); 246 int off = fpos_off(filp->f_pos);
247 int err; 247 int err;
248 u32 ftype; 248 u32 ftype;
249 struct ceph_mds_reply_info_parsed *rinfo; 249 struct ceph_mds_reply_info_parsed *rinfo;
250 const int max_entries = fsc->mount_options->max_readdir; 250 const int max_entries = fsc->mount_options->max_readdir;
251 const int max_bytes = fsc->mount_options->max_readdir_bytes; 251 const int max_bytes = fsc->mount_options->max_readdir_bytes;
252 252
253 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 253 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
254 if (fi->flags & CEPH_F_ATEND) 254 if (fi->flags & CEPH_F_ATEND)
255 return 0; 255 return 0;
256 256
257 /* always start with . and .. */ 257 /* always start with . and .. */
258 if (filp->f_pos == 0) { 258 if (filp->f_pos == 0) {
259 /* note dir version at start of readdir so we can tell 259 /* note dir version at start of readdir so we can tell
260 * if any dentries get dropped */ 260 * if any dentries get dropped */
261 fi->dir_release_count = ci->i_release_count; 261 fi->dir_release_count = ci->i_release_count;
262 262
263 dout("readdir off 0 -> '.'\n"); 263 dout("readdir off 0 -> '.'\n");
264 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 264 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
265 ceph_translate_ino(inode->i_sb, inode->i_ino), 265 ceph_translate_ino(inode->i_sb, inode->i_ino),
266 inode->i_mode >> 12) < 0) 266 inode->i_mode >> 12) < 0)
267 return 0; 267 return 0;
268 filp->f_pos = 1; 268 filp->f_pos = 1;
269 off = 1; 269 off = 1;
270 } 270 }
271 if (filp->f_pos == 1) { 271 if (filp->f_pos == 1) {
272 ino_t ino = parent_ino(filp->f_dentry); 272 ino_t ino = parent_ino(filp->f_dentry);
273 dout("readdir off 1 -> '..'\n"); 273 dout("readdir off 1 -> '..'\n");
274 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 274 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
275 ceph_translate_ino(inode->i_sb, ino), 275 ceph_translate_ino(inode->i_sb, ino),
276 inode->i_mode >> 12) < 0) 276 inode->i_mode >> 12) < 0)
277 return 0; 277 return 0;
278 filp->f_pos = 2; 278 filp->f_pos = 2;
279 off = 2; 279 off = 2;
280 } 280 }
281 281
282 /* can we use the dcache? */ 282 /* can we use the dcache? */
283 spin_lock(&ci->i_ceph_lock); 283 spin_lock(&ci->i_ceph_lock);
284 if ((filp->f_pos == 2 || fi->dentry) && 284 if ((filp->f_pos == 2 || fi->dentry) &&
285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
286 ceph_snap(inode) != CEPH_SNAPDIR && 286 ceph_snap(inode) != CEPH_SNAPDIR &&
287 ceph_dir_test_complete(inode) && 287 ceph_dir_test_complete(inode) &&
288 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 288 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
289 spin_unlock(&ci->i_ceph_lock); 289 spin_unlock(&ci->i_ceph_lock);
290 err = __dcache_readdir(filp, dirent, filldir); 290 err = __dcache_readdir(filp, dirent, filldir);
291 if (err != -EAGAIN) 291 if (err != -EAGAIN)
292 return err; 292 return err;
293 } else { 293 } else {
294 spin_unlock(&ci->i_ceph_lock); 294 spin_unlock(&ci->i_ceph_lock);
295 } 295 }
296 if (fi->dentry) { 296 if (fi->dentry) {
297 err = note_last_dentry(fi, fi->dentry->d_name.name, 297 err = note_last_dentry(fi, fi->dentry->d_name.name,
298 fi->dentry->d_name.len); 298 fi->dentry->d_name.len);
299 if (err) 299 if (err)
300 return err; 300 return err;
301 dput(fi->dentry); 301 dput(fi->dentry);
302 fi->dentry = NULL; 302 fi->dentry = NULL;
303 } 303 }
304 304
305 /* proceed with a normal readdir */ 305 /* proceed with a normal readdir */
306 306
307 more: 307 more:
308 /* do we have the correct frag content buffered? */ 308 /* do we have the correct frag content buffered? */
309 if (fi->frag != frag || fi->last_readdir == NULL) { 309 if (fi->frag != frag || fi->last_readdir == NULL) {
310 struct ceph_mds_request *req; 310 struct ceph_mds_request *req;
311 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 311 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
312 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 312 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
313 313
314 /* discard old result, if any */ 314 /* discard old result, if any */
315 if (fi->last_readdir) { 315 if (fi->last_readdir) {
316 ceph_mdsc_put_request(fi->last_readdir); 316 ceph_mdsc_put_request(fi->last_readdir);
317 fi->last_readdir = NULL; 317 fi->last_readdir = NULL;
318 } 318 }
319 319
320 /* requery frag tree, as the frag topology may have changed */ 320 /* requery frag tree, as the frag topology may have changed */
321 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); 321 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
322 322
323 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 323 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
324 ceph_vinop(inode), frag, fi->last_name); 324 ceph_vinop(inode), frag, fi->last_name);
325 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 325 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
326 if (IS_ERR(req)) 326 if (IS_ERR(req))
327 return PTR_ERR(req); 327 return PTR_ERR(req);
328 req->r_inode = inode; 328 req->r_inode = inode;
329 ihold(inode); 329 ihold(inode);
330 req->r_dentry = dget(filp->f_dentry); 330 req->r_dentry = dget(filp->f_dentry);
331 /* hints to request -> mds selection code */ 331 /* hints to request -> mds selection code */
332 req->r_direct_mode = USE_AUTH_MDS; 332 req->r_direct_mode = USE_AUTH_MDS;
333 req->r_direct_hash = ceph_frag_value(frag); 333 req->r_direct_hash = ceph_frag_value(frag);
334 req->r_direct_is_hash = true; 334 req->r_direct_is_hash = true;
335 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 335 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
336 req->r_readdir_offset = fi->next_offset; 336 req->r_readdir_offset = fi->next_offset;
337 req->r_args.readdir.frag = cpu_to_le32(frag); 337 req->r_args.readdir.frag = cpu_to_le32(frag);
338 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 338 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
339 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); 339 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
340 req->r_num_caps = max_entries + 1; 340 req->r_num_caps = max_entries + 1;
341 err = ceph_mdsc_do_request(mdsc, NULL, req); 341 err = ceph_mdsc_do_request(mdsc, NULL, req);
342 if (err < 0) { 342 if (err < 0) {
343 ceph_mdsc_put_request(req); 343 ceph_mdsc_put_request(req);
344 return err; 344 return err;
345 } 345 }
346 dout("readdir got and parsed readdir result=%d" 346 dout("readdir got and parsed readdir result=%d"
347 " on frag %x, end=%d, complete=%d\n", err, frag, 347 " on frag %x, end=%d, complete=%d\n", err, frag,
348 (int)req->r_reply_info.dir_end, 348 (int)req->r_reply_info.dir_end,
349 (int)req->r_reply_info.dir_complete); 349 (int)req->r_reply_info.dir_complete);
350 350
351 if (!req->r_did_prepopulate) { 351 if (!req->r_did_prepopulate) {
352 dout("readdir !did_prepopulate"); 352 dout("readdir !did_prepopulate");
353 fi->dir_release_count--; /* preclude D_COMPLETE */ 353 fi->dir_release_count--; /* preclude D_COMPLETE */
354 } 354 }
355 355
356 /* note next offset and last dentry name */ 356 /* note next offset and last dentry name */
357 fi->offset = fi->next_offset; 357 fi->offset = fi->next_offset;
358 fi->last_readdir = req; 358 fi->last_readdir = req;
359 359
360 if (req->r_reply_info.dir_end) { 360 if (req->r_reply_info.dir_end) {
361 kfree(fi->last_name); 361 kfree(fi->last_name);
362 fi->last_name = NULL; 362 fi->last_name = NULL;
363 if (ceph_frag_is_rightmost(frag)) 363 if (ceph_frag_is_rightmost(frag))
364 fi->next_offset = 2; 364 fi->next_offset = 2;
365 else 365 else
366 fi->next_offset = 0; 366 fi->next_offset = 0;
367 } else { 367 } else {
368 rinfo = &req->r_reply_info; 368 rinfo = &req->r_reply_info;
369 err = note_last_dentry(fi, 369 err = note_last_dentry(fi,
370 rinfo->dir_dname[rinfo->dir_nr-1], 370 rinfo->dir_dname[rinfo->dir_nr-1],
371 rinfo->dir_dname_len[rinfo->dir_nr-1]); 371 rinfo->dir_dname_len[rinfo->dir_nr-1]);
372 if (err) 372 if (err)
373 return err; 373 return err;
374 fi->next_offset += rinfo->dir_nr; 374 fi->next_offset += rinfo->dir_nr;
375 } 375 }
376 } 376 }
377 377
378 rinfo = &fi->last_readdir->r_reply_info; 378 rinfo = &fi->last_readdir->r_reply_info;
379 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 379 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
380 rinfo->dir_nr, off, fi->offset); 380 rinfo->dir_nr, off, fi->offset);
381 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 381 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
382 u64 pos = ceph_make_fpos(frag, off); 382 u64 pos = ceph_make_fpos(frag, off);
383 struct ceph_mds_reply_inode *in = 383 struct ceph_mds_reply_inode *in =
384 rinfo->dir_in[off - fi->offset].in; 384 rinfo->dir_in[off - fi->offset].in;
385 struct ceph_vino vino; 385 struct ceph_vino vino;
386 ino_t ino; 386 ino_t ino;
387 387
388 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 388 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
389 off, off - fi->offset, rinfo->dir_nr, pos, 389 off, off - fi->offset, rinfo->dir_nr, pos,
390 rinfo->dir_dname_len[off - fi->offset], 390 rinfo->dir_dname_len[off - fi->offset],
391 rinfo->dir_dname[off - fi->offset], in); 391 rinfo->dir_dname[off - fi->offset], in);
392 BUG_ON(!in); 392 BUG_ON(!in);
393 ftype = le32_to_cpu(in->mode) >> 12; 393 ftype = le32_to_cpu(in->mode) >> 12;
394 vino.ino = le64_to_cpu(in->ino); 394 vino.ino = le64_to_cpu(in->ino);
395 vino.snap = le64_to_cpu(in->snapid); 395 vino.snap = le64_to_cpu(in->snapid);
396 ino = ceph_vino_to_ino(vino); 396 ino = ceph_vino_to_ino(vino);
397 if (filldir(dirent, 397 if (filldir(dirent,
398 rinfo->dir_dname[off - fi->offset], 398 rinfo->dir_dname[off - fi->offset],
399 rinfo->dir_dname_len[off - fi->offset], 399 rinfo->dir_dname_len[off - fi->offset],
400 pos, 400 pos,
401 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { 401 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
402 dout("filldir stopping us...\n"); 402 dout("filldir stopping us...\n");
403 return 0; 403 return 0;
404 } 404 }
405 off++; 405 off++;
406 filp->f_pos = pos + 1; 406 filp->f_pos = pos + 1;
407 } 407 }
408 408
409 if (fi->last_name) { 409 if (fi->last_name) {
410 ceph_mdsc_put_request(fi->last_readdir); 410 ceph_mdsc_put_request(fi->last_readdir);
411 fi->last_readdir = NULL; 411 fi->last_readdir = NULL;
412 goto more; 412 goto more;
413 } 413 }
414 414
415 /* more frags? */ 415 /* more frags? */
416 if (!ceph_frag_is_rightmost(frag)) { 416 if (!ceph_frag_is_rightmost(frag)) {
417 frag = ceph_frag_next(frag); 417 frag = ceph_frag_next(frag);
418 off = 0; 418 off = 0;
419 filp->f_pos = ceph_make_fpos(frag, off); 419 filp->f_pos = ceph_make_fpos(frag, off);
420 dout("readdir next frag is %x\n", frag); 420 dout("readdir next frag is %x\n", frag);
421 goto more; 421 goto more;
422 } 422 }
423 fi->flags |= CEPH_F_ATEND; 423 fi->flags |= CEPH_F_ATEND;
424 424
425 /* 425 /*
426 * if dir_release_count still matches the dir, no dentries 426 * if dir_release_count still matches the dir, no dentries
427 * were released during the whole readdir, and we should have 427 * were released during the whole readdir, and we should have
428 * the complete dir contents in our cache. 428 * the complete dir contents in our cache.
429 */ 429 */
430 spin_lock(&ci->i_ceph_lock); 430 spin_lock(&ci->i_ceph_lock);
431 if (ci->i_release_count == fi->dir_release_count) { 431 if (ci->i_release_count == fi->dir_release_count) {
432 ceph_dir_set_complete(inode); 432 ceph_dir_set_complete(inode);
433 ci->i_max_offset = filp->f_pos; 433 ci->i_max_offset = filp->f_pos;
434 } 434 }
435 spin_unlock(&ci->i_ceph_lock); 435 spin_unlock(&ci->i_ceph_lock);
436 436
437 dout("readdir %p filp %p done.\n", inode, filp); 437 dout("readdir %p filp %p done.\n", inode, filp);
438 return 0; 438 return 0;
439 } 439 }
440 440
441 static void reset_readdir(struct ceph_file_info *fi) 441 static void reset_readdir(struct ceph_file_info *fi)
442 { 442 {
443 if (fi->last_readdir) { 443 if (fi->last_readdir) {
444 ceph_mdsc_put_request(fi->last_readdir); 444 ceph_mdsc_put_request(fi->last_readdir);
445 fi->last_readdir = NULL; 445 fi->last_readdir = NULL;
446 } 446 }
447 kfree(fi->last_name); 447 kfree(fi->last_name);
448 fi->last_name = NULL; 448 fi->last_name = NULL;
449 fi->next_offset = 2; /* compensate for . and .. */ 449 fi->next_offset = 2; /* compensate for . and .. */
450 if (fi->dentry) { 450 if (fi->dentry) {
451 dput(fi->dentry); 451 dput(fi->dentry);
452 fi->dentry = NULL; 452 fi->dentry = NULL;
453 } 453 }
454 fi->flags &= ~CEPH_F_ATEND; 454 fi->flags &= ~CEPH_F_ATEND;
455 } 455 }
456 456
457 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) 457 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
458 { 458 {
459 struct ceph_file_info *fi = file->private_data; 459 struct ceph_file_info *fi = file->private_data;
460 struct inode *inode = file->f_mapping->host; 460 struct inode *inode = file->f_mapping->host;
461 loff_t old_offset = offset; 461 loff_t old_offset = offset;
462 loff_t retval; 462 loff_t retval;
463 463
464 mutex_lock(&inode->i_mutex); 464 mutex_lock(&inode->i_mutex);
465 retval = -EINVAL; 465 retval = -EINVAL;
466 switch (origin) { 466 switch (whence) {
467 case SEEK_END: 467 case SEEK_END:
468 offset += inode->i_size + 2; /* FIXME */ 468 offset += inode->i_size + 2; /* FIXME */
469 break; 469 break;
470 case SEEK_CUR: 470 case SEEK_CUR:
471 offset += file->f_pos; 471 offset += file->f_pos;
472 case SEEK_SET: 472 case SEEK_SET:
473 break; 473 break;
474 default: 474 default:
475 goto out; 475 goto out;
476 } 476 }
477 477
478 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { 478 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
479 if (offset != file->f_pos) { 479 if (offset != file->f_pos) {
480 file->f_pos = offset; 480 file->f_pos = offset;
481 file->f_version = 0; 481 file->f_version = 0;
482 fi->flags &= ~CEPH_F_ATEND; 482 fi->flags &= ~CEPH_F_ATEND;
483 } 483 }
484 retval = offset; 484 retval = offset;
485 485
486 /* 486 /*
487 * discard buffered readdir content on seekdir(0), or 487 * discard buffered readdir content on seekdir(0), or
488 * seek to new frag, or seek prior to current chunk. 488 * seek to new frag, or seek prior to current chunk.
489 */ 489 */
490 if (offset == 0 || 490 if (offset == 0 ||
491 fpos_frag(offset) != fpos_frag(old_offset) || 491 fpos_frag(offset) != fpos_frag(old_offset) ||
492 fpos_off(offset) < fi->offset) { 492 fpos_off(offset) < fi->offset) {
493 dout("dir_llseek dropping %p content\n", file); 493 dout("dir_llseek dropping %p content\n", file);
494 reset_readdir(fi); 494 reset_readdir(fi);
495 } 495 }
496 496
497 /* bump dir_release_count if we did a forward seek */ 497 /* bump dir_release_count if we did a forward seek */
498 if (offset > old_offset) 498 if (offset > old_offset)
499 fi->dir_release_count--; 499 fi->dir_release_count--;
500 } 500 }
501 out: 501 out:
502 mutex_unlock(&inode->i_mutex); 502 mutex_unlock(&inode->i_mutex);
503 return retval; 503 return retval;
504 } 504 }
505 505
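/*
 * User-space view of the directory seek semantics implemented above
 * (a sketch; the mount path is hypothetical): telldir() cookies
 * replay via seekdir(), and rewinddir() seeks back to 0, which the
 * code above answers by discarding its buffered readdir content.
 */
#include <dirent.h>
#include <stdio.h>

int main(void)
{
	DIR *d = opendir("/mnt/ceph/somedir");
	long pos;

	if (!d)
		return 1;
	readdir(d);		/* consume an entry */
	pos = telldir(d);	/* opaque cookie; here it encodes frag and offset */
	readdir(d);
	seekdir(d, pos);	/* resume from the saved cookie */
	rewinddir(d);		/* seekdir(0): buffered state is dropped */
	closedir(d);
	return 0;
}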
506 /* 506 /*
507 * Handle lookups for the hidden .snap directory. 507 * Handle lookups for the hidden .snap directory.
508 */ 508 */
509 int ceph_handle_snapdir(struct ceph_mds_request *req, 509 int ceph_handle_snapdir(struct ceph_mds_request *req,
510 struct dentry *dentry, int err) 510 struct dentry *dentry, int err)
511 { 511 {
512 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 512 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
513 struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ 513 struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
514 514
515 /* .snap dir? */ 515 /* .snap dir? */
516 if (err == -ENOENT && 516 if (err == -ENOENT &&
517 ceph_snap(parent) == CEPH_NOSNAP && 517 ceph_snap(parent) == CEPH_NOSNAP &&
518 strcmp(dentry->d_name.name, 518 strcmp(dentry->d_name.name,
519 fsc->mount_options->snapdir_name) == 0) { 519 fsc->mount_options->snapdir_name) == 0) {
520 struct inode *inode = ceph_get_snapdir(parent); 520 struct inode *inode = ceph_get_snapdir(parent);
521 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 521 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
522 dentry, dentry->d_name.len, dentry->d_name.name, inode); 522 dentry, dentry->d_name.len, dentry->d_name.name, inode);
523 BUG_ON(!d_unhashed(dentry)); 523 BUG_ON(!d_unhashed(dentry));
524 d_add(dentry, inode); 524 d_add(dentry, inode);
525 err = 0; 525 err = 0;
526 } 526 }
527 return err; 527 return err;
528 } 528 }
529 529
530 /* 530 /*
531 * Figure out final result of a lookup/open request. 531 * Figure out final result of a lookup/open request.
532 * 532 *
533 * Mainly, make sure we return the final req->r_dentry (if it already 533 * Mainly, make sure we return the final req->r_dentry (if it already
534 * existed) in place of the original VFS-provided dentry when they 534 * existed) in place of the original VFS-provided dentry when they
535 * differ. 535 * differ.
536 * 536 *
537 * Gracefully handle the case where the MDS replies with -ENOENT and 537 * Gracefully handle the case where the MDS replies with -ENOENT and
538 * no trace (which it may do, at its discretion, e.g., if it doesn't 538 * no trace (which it may do, at its discretion, e.g., if it doesn't
539 * care to issue a lease on the negative dentry). 539 * care to issue a lease on the negative dentry).
540 */ 540 */
541 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 541 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
542 struct dentry *dentry, int err) 542 struct dentry *dentry, int err)
543 { 543 {
544 if (err == -ENOENT) { 544 if (err == -ENOENT) {
545 /* no trace? */ 545 /* no trace? */
546 err = 0; 546 err = 0;
547 if (!req->r_reply_info.head->is_dentry) { 547 if (!req->r_reply_info.head->is_dentry) {
548 dout("ENOENT and no trace, dentry %p inode %p\n", 548 dout("ENOENT and no trace, dentry %p inode %p\n",
549 dentry, dentry->d_inode); 549 dentry, dentry->d_inode);
550 if (dentry->d_inode) { 550 if (dentry->d_inode) {
551 d_drop(dentry); 551 d_drop(dentry);
552 err = -ENOENT; 552 err = -ENOENT;
553 } else { 553 } else {
554 d_add(dentry, NULL); 554 d_add(dentry, NULL);
555 } 555 }
556 } 556 }
557 } 557 }
558 if (err) 558 if (err)
559 dentry = ERR_PTR(err); 559 dentry = ERR_PTR(err);
560 else if (dentry != req->r_dentry) 560 else if (dentry != req->r_dentry)
561 dentry = dget(req->r_dentry); /* we got spliced */ 561 dentry = dget(req->r_dentry); /* we got spliced */
562 else 562 else
563 dentry = NULL; 563 dentry = NULL;
564 return dentry; 564 return dentry;
565 } 565 }
566 566
567 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 567 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
568 { 568 {
569 return ceph_ino(inode) == CEPH_INO_ROOT && 569 return ceph_ino(inode) == CEPH_INO_ROOT &&
570 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 570 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
571 } 571 }
572 572
573 /* 573 /*
574 * Look up a single dir entry. If there is a lookup intent, inform 574 * Look up a single dir entry. If there is a lookup intent, inform
575 * the MDS so that it gets our 'caps wanted' value in a single op. 575 * the MDS so that it gets our 'caps wanted' value in a single op.
576 */ 576 */
577 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 577 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
578 unsigned int flags) 578 unsigned int flags)
579 { 579 {
580 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 580 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
581 struct ceph_mds_client *mdsc = fsc->mdsc; 581 struct ceph_mds_client *mdsc = fsc->mdsc;
582 struct ceph_mds_request *req; 582 struct ceph_mds_request *req;
583 int op; 583 int op;
584 int err; 584 int err;
585 585
586 dout("lookup %p dentry %p '%.*s'\n", 586 dout("lookup %p dentry %p '%.*s'\n",
587 dir, dentry, dentry->d_name.len, dentry->d_name.name); 587 dir, dentry, dentry->d_name.len, dentry->d_name.name);
588 588
589 if (dentry->d_name.len > NAME_MAX) 589 if (dentry->d_name.len > NAME_MAX)
590 return ERR_PTR(-ENAMETOOLONG); 590 return ERR_PTR(-ENAMETOOLONG);
591 591
592 err = ceph_init_dentry(dentry); 592 err = ceph_init_dentry(dentry);
593 if (err < 0) 593 if (err < 0)
594 return ERR_PTR(err); 594 return ERR_PTR(err);
595 595
596 /* can we conclude ENOENT locally? */ 596 /* can we conclude ENOENT locally? */
597 if (dentry->d_inode == NULL) { 597 if (dentry->d_inode == NULL) {
598 struct ceph_inode_info *ci = ceph_inode(dir); 598 struct ceph_inode_info *ci = ceph_inode(dir);
599 struct ceph_dentry_info *di = ceph_dentry(dentry); 599 struct ceph_dentry_info *di = ceph_dentry(dentry);
600 600
601 spin_lock(&ci->i_ceph_lock); 601 spin_lock(&ci->i_ceph_lock);
602 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 602 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
603 if (strncmp(dentry->d_name.name, 603 if (strncmp(dentry->d_name.name,
604 fsc->mount_options->snapdir_name, 604 fsc->mount_options->snapdir_name,
605 dentry->d_name.len) && 605 dentry->d_name.len) &&
606 !is_root_ceph_dentry(dir, dentry) && 606 !is_root_ceph_dentry(dir, dentry) &&
607 ceph_dir_test_complete(dir) && 607 ceph_dir_test_complete(dir) &&
608 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 608 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
609 spin_unlock(&ci->i_ceph_lock); 609 spin_unlock(&ci->i_ceph_lock);
610 dout(" dir %p complete, -ENOENT\n", dir); 610 dout(" dir %p complete, -ENOENT\n", dir);
611 d_add(dentry, NULL); 611 d_add(dentry, NULL);
612 di->lease_shared_gen = ci->i_shared_gen; 612 di->lease_shared_gen = ci->i_shared_gen;
613 return NULL; 613 return NULL;
614 } 614 }
615 spin_unlock(&ci->i_ceph_lock); 615 spin_unlock(&ci->i_ceph_lock);
616 } 616 }
617 617
618 op = ceph_snap(dir) == CEPH_SNAPDIR ? 618 op = ceph_snap(dir) == CEPH_SNAPDIR ?
619 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 619 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
620 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 620 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
621 if (IS_ERR(req)) 621 if (IS_ERR(req))
622 return ERR_CAST(req); 622 return ERR_CAST(req);
623 req->r_dentry = dget(dentry); 623 req->r_dentry = dget(dentry);
624 req->r_num_caps = 2; 624 req->r_num_caps = 2;
625 /* we only need inode linkage */ 625 /* we only need inode linkage */
626 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 626 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
627 req->r_locked_dir = dir; 627 req->r_locked_dir = dir;
628 err = ceph_mdsc_do_request(mdsc, NULL, req); 628 err = ceph_mdsc_do_request(mdsc, NULL, req);
629 err = ceph_handle_snapdir(req, dentry, err); 629 err = ceph_handle_snapdir(req, dentry, err);
630 dentry = ceph_finish_lookup(req, dentry, err); 630 dentry = ceph_finish_lookup(req, dentry, err);
631 ceph_mdsc_put_request(req); /* will dput(dentry) */ 631 ceph_mdsc_put_request(req); /* will dput(dentry) */
632 dout("lookup result=%p\n", dentry); 632 dout("lookup result=%p\n", dentry);
633 return dentry; 633 return dentry;
634 } 634 }
635 635
636 /* 636 /*
637 * If we do a create but get no trace back from the MDS, follow up with 637 * If we do a create but get no trace back from the MDS, follow up with
638 * a lookup (the VFS expects us to link up the provided dentry). 638 * a lookup (the VFS expects us to link up the provided dentry).
639 */ 639 */
640 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) 640 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
641 { 641 {
642 struct dentry *result = ceph_lookup(dir, dentry, 0); 642 struct dentry *result = ceph_lookup(dir, dentry, 0);
643 643
644 if (result && !IS_ERR(result)) { 644 if (result && !IS_ERR(result)) {
645 /* 645 /*
646 * We created the item, then did a lookup, and found 646 * We created the item, then did a lookup, and found
647 * it was already linked to another inode we already 647 * it was already linked to another inode we already
648 * had in our cache (and thus got spliced). Link our 648 * had in our cache (and thus got spliced). Link our
649 * dentry to that inode, but don't hash it, just in 649 * dentry to that inode, but don't hash it, just in
650 * case the VFS wants to dereference it. 650 * case the VFS wants to dereference it.
651 */ 651 */
652 BUG_ON(!result->d_inode); 652 BUG_ON(!result->d_inode);
653 d_instantiate(dentry, result->d_inode); 653 d_instantiate(dentry, result->d_inode);
654 return 0; 654 return 0;
655 } 655 }
656 return PTR_ERR(result); 656 return PTR_ERR(result);
657 } 657 }
658 658
659 static int ceph_mknod(struct inode *dir, struct dentry *dentry, 659 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
660 umode_t mode, dev_t rdev) 660 umode_t mode, dev_t rdev)
661 { 661 {
662 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 662 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
663 struct ceph_mds_client *mdsc = fsc->mdsc; 663 struct ceph_mds_client *mdsc = fsc->mdsc;
664 struct ceph_mds_request *req; 664 struct ceph_mds_request *req;
665 int err; 665 int err;
666 666
667 if (ceph_snap(dir) != CEPH_NOSNAP) 667 if (ceph_snap(dir) != CEPH_NOSNAP)
668 return -EROFS; 668 return -EROFS;
669 669
670 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", 670 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
671 dir, dentry, mode, rdev); 671 dir, dentry, mode, rdev);
672 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); 672 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
673 if (IS_ERR(req)) { 673 if (IS_ERR(req)) {
674 d_drop(dentry); 674 d_drop(dentry);
675 return PTR_ERR(req); 675 return PTR_ERR(req);
676 } 676 }
677 req->r_dentry = dget(dentry); 677 req->r_dentry = dget(dentry);
678 req->r_num_caps = 2; 678 req->r_num_caps = 2;
679 req->r_locked_dir = dir; 679 req->r_locked_dir = dir;
680 req->r_args.mknod.mode = cpu_to_le32(mode); 680 req->r_args.mknod.mode = cpu_to_le32(mode);
681 req->r_args.mknod.rdev = cpu_to_le32(rdev); 681 req->r_args.mknod.rdev = cpu_to_le32(rdev);
682 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 682 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
683 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 683 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
684 err = ceph_mdsc_do_request(mdsc, dir, req); 684 err = ceph_mdsc_do_request(mdsc, dir, req);
685 if (!err && !req->r_reply_info.head->is_dentry) 685 if (!err && !req->r_reply_info.head->is_dentry)
686 err = ceph_handle_notrace_create(dir, dentry); 686 err = ceph_handle_notrace_create(dir, dentry);
687 ceph_mdsc_put_request(req); 687 ceph_mdsc_put_request(req);
688 if (err) 688 if (err)
689 d_drop(dentry); 689 d_drop(dentry);
690 return err; 690 return err;
691 } 691 }
692 692
693 static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, 693 static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
694 bool excl) 694 bool excl)
695 { 695 {
696 return ceph_mknod(dir, dentry, mode, 0); 696 return ceph_mknod(dir, dentry, mode, 0);
697 } 697 }
698 698
699 static int ceph_symlink(struct inode *dir, struct dentry *dentry, 699 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
700 const char *dest) 700 const char *dest)
701 { 701 {
702 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 702 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
703 struct ceph_mds_client *mdsc = fsc->mdsc; 703 struct ceph_mds_client *mdsc = fsc->mdsc;
704 struct ceph_mds_request *req; 704 struct ceph_mds_request *req;
705 int err; 705 int err;
706 706
707 if (ceph_snap(dir) != CEPH_NOSNAP) 707 if (ceph_snap(dir) != CEPH_NOSNAP)
708 return -EROFS; 708 return -EROFS;
709 709
710 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 710 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
711 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 711 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
712 if (IS_ERR(req)) { 712 if (IS_ERR(req)) {
713 d_drop(dentry); 713 d_drop(dentry);
714 return PTR_ERR(req); 714 return PTR_ERR(req);
715 } 715 }
716 req->r_dentry = dget(dentry); 716 req->r_dentry = dget(dentry);
717 req->r_num_caps = 2; 717 req->r_num_caps = 2;
718 req->r_path2 = kstrdup(dest, GFP_NOFS); 718 req->r_path2 = kstrdup(dest, GFP_NOFS);
719 req->r_locked_dir = dir; 719 req->r_locked_dir = dir;
720 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 720 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
721 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 721 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
722 err = ceph_mdsc_do_request(mdsc, dir, req); 722 err = ceph_mdsc_do_request(mdsc, dir, req);
723 if (!err && !req->r_reply_info.head->is_dentry) 723 if (!err && !req->r_reply_info.head->is_dentry)
724 err = ceph_handle_notrace_create(dir, dentry); 724 err = ceph_handle_notrace_create(dir, dentry);
725 ceph_mdsc_put_request(req); 725 ceph_mdsc_put_request(req);
726 if (err) 726 if (err)
727 d_drop(dentry); 727 d_drop(dentry);
728 return err; 728 return err;
729 } 729 }
730 730
731 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 731 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
732 { 732 {
733 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 733 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
734 struct ceph_mds_client *mdsc = fsc->mdsc; 734 struct ceph_mds_client *mdsc = fsc->mdsc;
735 struct ceph_mds_request *req; 735 struct ceph_mds_request *req;
736 int err = -EROFS; 736 int err = -EROFS;
737 int op; 737 int op;
738 738
739 if (ceph_snap(dir) == CEPH_SNAPDIR) { 739 if (ceph_snap(dir) == CEPH_SNAPDIR) {
740 /* mkdir .snap/foo is a MKSNAP */ 740 /* mkdir .snap/foo is a MKSNAP */
741 op = CEPH_MDS_OP_MKSNAP; 741 op = CEPH_MDS_OP_MKSNAP;
742 dout("mksnap dir %p snap '%.*s' dn %p\n", dir, 742 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
743 dentry->d_name.len, dentry->d_name.name, dentry); 743 dentry->d_name.len, dentry->d_name.name, dentry);
744 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 744 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
745 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); 745 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
746 op = CEPH_MDS_OP_MKDIR; 746 op = CEPH_MDS_OP_MKDIR;
747 } else { 747 } else {
748 goto out; 748 goto out;
749 } 749 }
750 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 750 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
751 if (IS_ERR(req)) { 751 if (IS_ERR(req)) {
752 err = PTR_ERR(req); 752 err = PTR_ERR(req);
753 goto out; 753 goto out;
754 } 754 }
755 755
756 req->r_dentry = dget(dentry); 756 req->r_dentry = dget(dentry);
757 req->r_num_caps = 2; 757 req->r_num_caps = 2;
758 req->r_locked_dir = dir; 758 req->r_locked_dir = dir;
759 req->r_args.mkdir.mode = cpu_to_le32(mode); 759 req->r_args.mkdir.mode = cpu_to_le32(mode);
760 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 760 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
761 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 761 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
762 err = ceph_mdsc_do_request(mdsc, dir, req); 762 err = ceph_mdsc_do_request(mdsc, dir, req);
763 if (!err && !req->r_reply_info.head->is_dentry) 763 if (!err && !req->r_reply_info.head->is_dentry)
764 err = ceph_handle_notrace_create(dir, dentry); 764 err = ceph_handle_notrace_create(dir, dentry);
765 ceph_mdsc_put_request(req); 765 ceph_mdsc_put_request(req);
766 out: 766 out:
767 if (err < 0) 767 if (err < 0)
768 d_drop(dentry); 768 d_drop(dentry);
769 return err; 769 return err;
770 } 770 }
771 771
772 static int ceph_link(struct dentry *old_dentry, struct inode *dir, 772 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
773 struct dentry *dentry) 773 struct dentry *dentry)
774 { 774 {
775 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 775 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
776 struct ceph_mds_client *mdsc = fsc->mdsc; 776 struct ceph_mds_client *mdsc = fsc->mdsc;
777 struct ceph_mds_request *req; 777 struct ceph_mds_request *req;
778 int err; 778 int err;
779 779
780 if (ceph_snap(dir) != CEPH_NOSNAP) 780 if (ceph_snap(dir) != CEPH_NOSNAP)
781 return -EROFS; 781 return -EROFS;
782 782
783 dout("link in dir %p old_dentry %p dentry %p\n", dir, 783 dout("link in dir %p old_dentry %p dentry %p\n", dir,
784 old_dentry, dentry); 784 old_dentry, dentry);
785 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); 785 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
786 if (IS_ERR(req)) { 786 if (IS_ERR(req)) {
787 d_drop(dentry); 787 d_drop(dentry);
788 return PTR_ERR(req); 788 return PTR_ERR(req);
789 } 789 }
790 req->r_dentry = dget(dentry); 790 req->r_dentry = dget(dentry);
791 req->r_num_caps = 2; 791 req->r_num_caps = 2;
792 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ 792 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
793 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 793 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
794 req->r_locked_dir = dir; 794 req->r_locked_dir = dir;
795 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 795 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
796 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 796 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
797 err = ceph_mdsc_do_request(mdsc, dir, req); 797 err = ceph_mdsc_do_request(mdsc, dir, req);
798 if (err) { 798 if (err) {
799 d_drop(dentry); 799 d_drop(dentry);
800 } else if (!req->r_reply_info.head->is_dentry) { 800 } else if (!req->r_reply_info.head->is_dentry) {
801 ihold(old_dentry->d_inode); 801 ihold(old_dentry->d_inode);
802 d_instantiate(dentry, old_dentry->d_inode); 802 d_instantiate(dentry, old_dentry->d_inode);
803 } 803 }
804 ceph_mdsc_put_request(req); 804 ceph_mdsc_put_request(req);
805 return err; 805 return err;
806 } 806 }
807 807
808 /* 808 /*
809 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it 809 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
810 * looks like the link count will hit 0, drop any other caps (other 810 * looks like the link count will hit 0, drop any other caps (other
811 * than PIN) we don't specifically want (due to the file still being 811 * than PIN) we don't specifically want (due to the file still being
812 * open). 812 * open).
813 */ 813 */
814 static int drop_caps_for_unlink(struct inode *inode) 814 static int drop_caps_for_unlink(struct inode *inode)
815 { 815 {
816 struct ceph_inode_info *ci = ceph_inode(inode); 816 struct ceph_inode_info *ci = ceph_inode(inode);
817 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; 817 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
818 818
819 spin_lock(&ci->i_ceph_lock); 819 spin_lock(&ci->i_ceph_lock);
820 if (inode->i_nlink == 1) { 820 if (inode->i_nlink == 1) {
821 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); 821 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
822 ci->i_ceph_flags |= CEPH_I_NODELAY; 822 ci->i_ceph_flags |= CEPH_I_NODELAY;
823 } 823 }
824 spin_unlock(&ci->i_ceph_lock); 824 spin_unlock(&ci->i_ceph_lock);
825 return drop; 825 return drop;
826 } 826 }
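
To make the mask arithmetic concrete: if the final link is going away while
the file is still open for read, __ceph_caps_wanted() might return
CEPH_CAP_FILE_CACHE, and the i_nlink == 1 branch above then computes (a
worked sketch with that assumed wanted set, not a fixed rule):

	/* start with the link caps we always release on unlink */
	drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
	/* nlink will hit 0: also drop everything not wanted, except PIN */
	drop |= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_PIN);

so the request releases every cap except the ones the still-open file
actually needs.
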
827 827
828 /* 828 /*
829 * rmdir and unlink differ only by the metadata op code 829 * rmdir and unlink differ only by the metadata op code
830 */ 830 */
831 static int ceph_unlink(struct inode *dir, struct dentry *dentry) 831 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
832 { 832 {
833 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 833 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
834 struct ceph_mds_client *mdsc = fsc->mdsc; 834 struct ceph_mds_client *mdsc = fsc->mdsc;
835 struct inode *inode = dentry->d_inode; 835 struct inode *inode = dentry->d_inode;
836 struct ceph_mds_request *req; 836 struct ceph_mds_request *req;
837 int err = -EROFS; 837 int err = -EROFS;
838 int op; 838 int op;
839 839
840 if (ceph_snap(dir) == CEPH_SNAPDIR) { 840 if (ceph_snap(dir) == CEPH_SNAPDIR) {
841 /* rmdir .snap/foo is RMSNAP */ 841 /* rmdir .snap/foo is RMSNAP */
842 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, 842 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
843 dentry->d_name.name, dentry); 843 dentry->d_name.name, dentry);
844 op = CEPH_MDS_OP_RMSNAP; 844 op = CEPH_MDS_OP_RMSNAP;
845 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 845 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
846 dout("unlink/rmdir dir %p dn %p inode %p\n", 846 dout("unlink/rmdir dir %p dn %p inode %p\n",
847 dir, dentry, inode); 847 dir, dentry, inode);
848 op = S_ISDIR(dentry->d_inode->i_mode) ? 848 op = S_ISDIR(dentry->d_inode->i_mode) ?
849 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; 849 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
850 } else 850 } else
851 goto out; 851 goto out;
852 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 852 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
853 if (IS_ERR(req)) { 853 if (IS_ERR(req)) {
854 err = PTR_ERR(req); 854 err = PTR_ERR(req);
855 goto out; 855 goto out;
856 } 856 }
857 req->r_dentry = dget(dentry); 857 req->r_dentry = dget(dentry);
858 req->r_num_caps = 2; 858 req->r_num_caps = 2;
859 req->r_locked_dir = dir; 859 req->r_locked_dir = dir;
860 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 860 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
861 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 861 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
862 req->r_inode_drop = drop_caps_for_unlink(inode); 862 req->r_inode_drop = drop_caps_for_unlink(inode);
863 err = ceph_mdsc_do_request(mdsc, dir, req); 863 err = ceph_mdsc_do_request(mdsc, dir, req);
864 if (!err && !req->r_reply_info.head->is_dentry) 864 if (!err && !req->r_reply_info.head->is_dentry)
865 d_delete(dentry); 865 d_delete(dentry);
866 ceph_mdsc_put_request(req); 866 ceph_mdsc_put_request(req);
867 out: 867 out:
868 return err; 868 return err;
869 } 869 }
870 870
871 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 871 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
872 struct inode *new_dir, struct dentry *new_dentry) 872 struct inode *new_dir, struct dentry *new_dentry)
873 { 873 {
874 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); 874 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
875 struct ceph_mds_client *mdsc = fsc->mdsc; 875 struct ceph_mds_client *mdsc = fsc->mdsc;
876 struct ceph_mds_request *req; 876 struct ceph_mds_request *req;
877 int err; 877 int err;
878 878
879 if (ceph_snap(old_dir) != ceph_snap(new_dir)) 879 if (ceph_snap(old_dir) != ceph_snap(new_dir))
880 return -EXDEV; 880 return -EXDEV;
881 if (ceph_snap(old_dir) != CEPH_NOSNAP || 881 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
882 ceph_snap(new_dir) != CEPH_NOSNAP) 882 ceph_snap(new_dir) != CEPH_NOSNAP)
883 return -EROFS; 883 return -EROFS;
884 dout("rename dir %p dentry %p to dir %p dentry %p\n", 884 dout("rename dir %p dentry %p to dir %p dentry %p\n",
885 old_dir, old_dentry, new_dir, new_dentry); 885 old_dir, old_dentry, new_dir, new_dentry);
886 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 886 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
887 if (IS_ERR(req)) 887 if (IS_ERR(req))
888 return PTR_ERR(req); 888 return PTR_ERR(req);
889 req->r_dentry = dget(new_dentry); 889 req->r_dentry = dget(new_dentry);
890 req->r_num_caps = 2; 890 req->r_num_caps = 2;
891 req->r_old_dentry = dget(old_dentry); 891 req->r_old_dentry = dget(old_dentry);
892 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 892 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
893 req->r_locked_dir = new_dir; 893 req->r_locked_dir = new_dir;
894 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; 894 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
895 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; 895 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
896 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 896 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
897 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 897 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
898 /* release LINK_RDCACHE on source inode (mds will lock it) */ 898 /* release LINK_RDCACHE on source inode (mds will lock it) */
899 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; 899 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
900 if (new_dentry->d_inode) 900 if (new_dentry->d_inode)
901 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); 901 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
902 err = ceph_mdsc_do_request(mdsc, old_dir, req); 902 err = ceph_mdsc_do_request(mdsc, old_dir, req);
903 if (!err && !req->r_reply_info.head->is_dentry) { 903 if (!err && !req->r_reply_info.head->is_dentry) {
904 /* 904 /*
905 * Normally d_move() is done by fill_trace (called by 905 * Normally d_move() is done by fill_trace (called by
906 * do_request, above). If there is no trace, we need 906 * do_request, above). If there is no trace, we need
907 * to do it here. 907 * to do it here.
908 */ 908 */
909 909
910 /* d_move screws up d_subdirs order */ 910 /* d_move screws up d_subdirs order */
911 ceph_dir_clear_complete(new_dir); 911 ceph_dir_clear_complete(new_dir);
912 912
913 d_move(old_dentry, new_dentry); 913 d_move(old_dentry, new_dentry);
914 914
915 /* ensure target dentry is invalidated, despite 915 /* ensure target dentry is invalidated, despite
916 rehashing bug in vfs_rename_dir */ 916 rehashing bug in vfs_rename_dir */
917 ceph_invalidate_dentry_lease(new_dentry); 917 ceph_invalidate_dentry_lease(new_dentry);
918 } 918 }
919 ceph_mdsc_put_request(req); 919 ceph_mdsc_put_request(req);
920 return err; 920 return err;
921 } 921 }
922 922
923 /* 923 /*
924 * Ensure a dentry lease will no longer revalidate. 924 * Ensure a dentry lease will no longer revalidate.
925 */ 925 */
926 void ceph_invalidate_dentry_lease(struct dentry *dentry) 926 void ceph_invalidate_dentry_lease(struct dentry *dentry)
927 { 927 {
928 spin_lock(&dentry->d_lock); 928 spin_lock(&dentry->d_lock);
929 dentry->d_time = jiffies; 929 dentry->d_time = jiffies;
930 ceph_dentry(dentry)->lease_shared_gen = 0; 930 ceph_dentry(dentry)->lease_shared_gen = 0;
931 spin_unlock(&dentry->d_lock); 931 spin_unlock(&dentry->d_lock);
932 } 932 }
933 933
934 /* 934 /*
935 * Check if dentry lease is valid. If not, delete the lease. Try to 935 * Check if dentry lease is valid. If not, delete the lease. Try to
936 * renew if the lease is more than half up. 936 * renew if the lease is more than half up.
937 */ 937 */
938 static int dentry_lease_is_valid(struct dentry *dentry) 938 static int dentry_lease_is_valid(struct dentry *dentry)
939 { 939 {
940 struct ceph_dentry_info *di; 940 struct ceph_dentry_info *di;
941 struct ceph_mds_session *s; 941 struct ceph_mds_session *s;
942 int valid = 0; 942 int valid = 0;
943 u32 gen; 943 u32 gen;
944 unsigned long ttl; 944 unsigned long ttl;
945 struct ceph_mds_session *session = NULL; 945 struct ceph_mds_session *session = NULL;
946 struct inode *dir = NULL; 946 struct inode *dir = NULL;
947 u32 seq = 0; 947 u32 seq = 0;
948 948
949 spin_lock(&dentry->d_lock); 949 spin_lock(&dentry->d_lock);
950 di = ceph_dentry(dentry); 950 di = ceph_dentry(dentry);
951 if (di->lease_session) { 951 if (di->lease_session) {
952 s = di->lease_session; 952 s = di->lease_session;
953 spin_lock(&s->s_gen_ttl_lock); 953 spin_lock(&s->s_gen_ttl_lock);
954 gen = s->s_cap_gen; 954 gen = s->s_cap_gen;
955 ttl = s->s_cap_ttl; 955 ttl = s->s_cap_ttl;
956 spin_unlock(&s->s_gen_ttl_lock); 956 spin_unlock(&s->s_gen_ttl_lock);
957 957
958 if (di->lease_gen == gen && 958 if (di->lease_gen == gen &&
959 time_before(jiffies, dentry->d_time) && 959 time_before(jiffies, dentry->d_time) &&
960 time_before(jiffies, ttl)) { 960 time_before(jiffies, ttl)) {
961 valid = 1; 961 valid = 1;
962 if (di->lease_renew_after && 962 if (di->lease_renew_after &&
963 time_after(jiffies, di->lease_renew_after)) { 963 time_after(jiffies, di->lease_renew_after)) {
964 /* we should renew */ 964 /* we should renew */
965 dir = dentry->d_parent->d_inode; 965 dir = dentry->d_parent->d_inode;
966 session = ceph_get_mds_session(s); 966 session = ceph_get_mds_session(s);
967 seq = di->lease_seq; 967 seq = di->lease_seq;
968 di->lease_renew_after = 0; 968 di->lease_renew_after = 0;
969 di->lease_renew_from = jiffies; 969 di->lease_renew_from = jiffies;
970 } 970 }
971 } 971 }
972 } 972 }
973 spin_unlock(&dentry->d_lock); 973 spin_unlock(&dentry->d_lock);
974 974
975 if (session) { 975 if (session) {
976 ceph_mdsc_lease_send_msg(session, dir, dentry, 976 ceph_mdsc_lease_send_msg(session, dir, dentry,
977 CEPH_MDS_LEASE_RENEW, seq); 977 CEPH_MDS_LEASE_RENEW, seq);
978 ceph_put_mds_session(session); 978 ceph_put_mds_session(session);
979 } 979 }
980 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); 980 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
981 return valid; 981 return valid;
982 } 982 }
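
The lease_renew_after threshold tested above is set when the lease is
installed, which happens outside this hunk; a minimal sketch of the
"more than half up" rule it implies (lease_start and
lease_duration_jiffies are hypothetical locals, not fields from this file):

	/* renew once half of the lease duration has elapsed */
	di->lease_renew_after = lease_start + (lease_duration_jiffies >> 1);
	/* stop trusting the lease entirely once it has fully expired */
	dentry->d_time = lease_start + lease_duration_jiffies;
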
983 983
984 /* 984 /*
985 * Check if directory-wide content lease/cap is valid. 985 * Check if directory-wide content lease/cap is valid.
986 */ 986 */
987 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) 987 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
988 { 988 {
989 struct ceph_inode_info *ci = ceph_inode(dir); 989 struct ceph_inode_info *ci = ceph_inode(dir);
990 struct ceph_dentry_info *di = ceph_dentry(dentry); 990 struct ceph_dentry_info *di = ceph_dentry(dentry);
991 int valid = 0; 991 int valid = 0;
992 992
993 spin_lock(&ci->i_ceph_lock); 993 spin_lock(&ci->i_ceph_lock);
994 if (ci->i_shared_gen == di->lease_shared_gen) 994 if (ci->i_shared_gen == di->lease_shared_gen)
995 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); 995 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
996 spin_unlock(&ci->i_ceph_lock); 996 spin_unlock(&ci->i_ceph_lock);
997 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", 997 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
998 dir, (unsigned)ci->i_shared_gen, dentry, 998 dir, (unsigned)ci->i_shared_gen, dentry,
999 (unsigned)di->lease_shared_gen, valid); 999 (unsigned)di->lease_shared_gen, valid);
1000 return valid; 1000 return valid;
1001 } 1001 }
1002 1002
1003 /* 1003 /*
1004 * Check if cached dentry can be trusted. 1004 * Check if cached dentry can be trusted.
1005 */ 1005 */
1006 static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) 1006 static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1007 { 1007 {
1008 int valid = 0; 1008 int valid = 0;
1009 struct inode *dir; 1009 struct inode *dir;
1010 1010
1011 if (flags & LOOKUP_RCU) 1011 if (flags & LOOKUP_RCU)
1012 return -ECHILD; 1012 return -ECHILD;
1013 1013
1014 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, 1014 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
1015 dentry->d_name.len, dentry->d_name.name, dentry->d_inode, 1015 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
1016 ceph_dentry(dentry)->offset); 1016 ceph_dentry(dentry)->offset);
1017 1017
1018 dir = ceph_get_dentry_parent_inode(dentry); 1018 dir = ceph_get_dentry_parent_inode(dentry);
1019 1019
1020 /* always trust cached snapped dentries, snapdir dentry */ 1020 /* always trust cached snapped dentries, snapdir dentry */
1021 if (ceph_snap(dir) != CEPH_NOSNAP) { 1021 if (ceph_snap(dir) != CEPH_NOSNAP) {
1022 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, 1022 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
1023 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 1023 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
1024 valid = 1; 1024 valid = 1;
1025 } else if (dentry->d_inode && 1025 } else if (dentry->d_inode &&
1026 ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { 1026 ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
1027 valid = 1; 1027 valid = 1;
1028 } else if (dentry_lease_is_valid(dentry) || 1028 } else if (dentry_lease_is_valid(dentry) ||
1029 dir_lease_is_valid(dir, dentry)) { 1029 dir_lease_is_valid(dir, dentry)) {
1030 valid = 1; 1030 valid = 1;
1031 } 1031 }
1032 1032
1033 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1033 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1034 if (valid) 1034 if (valid)
1035 ceph_dentry_lru_touch(dentry); 1035 ceph_dentry_lru_touch(dentry);
1036 else 1036 else
1037 d_drop(dentry); 1037 d_drop(dentry);
1038 iput(dir); 1038 iput(dir);
1039 return valid; 1039 return valid;
1040 } 1040 }
1041 1041
1042 /* 1042 /*
1043 * Release our ceph_dentry_info. 1043 * Release our ceph_dentry_info.
1044 */ 1044 */
1045 static void ceph_d_release(struct dentry *dentry) 1045 static void ceph_d_release(struct dentry *dentry)
1046 { 1046 {
1047 struct ceph_dentry_info *di = ceph_dentry(dentry); 1047 struct ceph_dentry_info *di = ceph_dentry(dentry);
1048 1048
1049 dout("d_release %p\n", dentry); 1049 dout("d_release %p\n", dentry);
1050 ceph_dentry_lru_del(dentry); 1050 ceph_dentry_lru_del(dentry);
1051 if (di->lease_session) 1051 if (di->lease_session)
1052 ceph_put_mds_session(di->lease_session); 1052 ceph_put_mds_session(di->lease_session);
1053 kmem_cache_free(ceph_dentry_cachep, di); 1053 kmem_cache_free(ceph_dentry_cachep, di);
1054 dentry->d_fsdata = NULL; 1054 dentry->d_fsdata = NULL;
1055 } 1055 }
1056 1056
1057 static int ceph_snapdir_d_revalidate(struct dentry *dentry, 1057 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1058 unsigned int flags) 1058 unsigned int flags)
1059 { 1059 {
1060 /* 1060 /*
1061 * Eventually, we'll want to revalidate snapped metadata 1061 * Eventually, we'll want to revalidate snapped metadata
1062 * too... probably... 1062 * too... probably...
1063 */ 1063 */
1064 return 1; 1064 return 1;
1065 } 1065 }
1066 1066
1067 /* 1067 /*
1068 * Set/clear/test dir complete flag on the dir's dentry. 1068 * Set/clear/test dir complete flag on the dir's dentry.
1069 */ 1069 */
1070 void ceph_dir_set_complete(struct inode *inode) 1070 void ceph_dir_set_complete(struct inode *inode)
1071 { 1071 {
1072 struct dentry *dentry = d_find_any_alias(inode); 1072 struct dentry *dentry = d_find_any_alias(inode);
1073 1073
1074 if (dentry && ceph_dentry(dentry) && 1074 if (dentry && ceph_dentry(dentry) &&
1075 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { 1075 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
1076 dout(" marking %p (%p) complete\n", inode, dentry); 1076 dout(" marking %p (%p) complete\n", inode, dentry);
1077 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); 1077 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1078 } 1078 }
1079 dput(dentry); 1079 dput(dentry);
1080 } 1080 }
1081 1081
1082 void ceph_dir_clear_complete(struct inode *inode) 1082 void ceph_dir_clear_complete(struct inode *inode)
1083 { 1083 {
1084 struct dentry *dentry = d_find_any_alias(inode); 1084 struct dentry *dentry = d_find_any_alias(inode);
1085 1085
1086 if (dentry && ceph_dentry(dentry)) { 1086 if (dentry && ceph_dentry(dentry)) {
1087 dout(" marking %p (%p) complete\n", inode, dentry); 1087 dout(" marking %p (%p) complete\n", inode, dentry);
1088 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); 1088 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1089 } 1089 }
1090 dput(dentry); 1090 dput(dentry);
1091 } 1091 }
1092 1092
1093 bool ceph_dir_test_complete(struct inode *inode) 1093 bool ceph_dir_test_complete(struct inode *inode)
1094 { 1094 {
1095 struct dentry *dentry = d_find_any_alias(inode); 1095 struct dentry *dentry = d_find_any_alias(inode);
1096 bool complete = false; 1096 bool complete = false;
1097 1097
1098 if (dentry && ceph_dentry(dentry)) 1098 if (dentry && ceph_dentry(dentry))
1099 complete = test_bit(CEPH_D_COMPLETE, 1099 complete = test_bit(CEPH_D_COMPLETE,
1100 &ceph_dentry(dentry)->flags); 1100 &ceph_dentry(dentry)->flags);
1101 dput(dentry); 1101 dput(dentry);
1102 return complete; 1102 return complete;
1103 } 1103 }
1104 1104
1105 /* 1105 /*
1106 * When the VFS prunes a dentry from the cache, we need to clear the 1106 * When the VFS prunes a dentry from the cache, we need to clear the
1107 * complete flag on the parent directory. 1107 * complete flag on the parent directory.
1108 * 1108 *
1109 * Called under dentry->d_lock. 1109 * Called under dentry->d_lock.
1110 */ 1110 */
1111 static void ceph_d_prune(struct dentry *dentry) 1111 static void ceph_d_prune(struct dentry *dentry)
1112 { 1112 {
1113 struct ceph_dentry_info *di; 1113 struct ceph_dentry_info *di;
1114 1114
1115 dout("ceph_d_prune %p\n", dentry); 1115 dout("ceph_d_prune %p\n", dentry);
1116 1116
1117 /* do we have a valid parent? */ 1117 /* do we have a valid parent? */
1118 if (IS_ROOT(dentry)) 1118 if (IS_ROOT(dentry))
1119 return; 1119 return;
1120 1120
1121 /* if we are not hashed, we don't affect D_COMPLETE */ 1121 /* if we are not hashed, we don't affect D_COMPLETE */
1122 if (d_unhashed(dentry)) 1122 if (d_unhashed(dentry))
1123 return; 1123 return;
1124 1124
1125 /* 1125 /*
1126 * we hold d_lock, so d_parent is stable, and d_fsdata is never 1126 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1127 * cleared until d_release 1127 * cleared until d_release
1128 */ 1128 */
1129 di = ceph_dentry(dentry->d_parent); 1129 di = ceph_dentry(dentry->d_parent);
1130 clear_bit(CEPH_D_COMPLETE, &di->flags); 1130 clear_bit(CEPH_D_COMPLETE, &di->flags);
1131 } 1131 }
1132 1132
1133 /* 1133 /*
1134 * read() on a dir. This weird interface hack only works if mounted 1134 * read() on a dir. This weird interface hack only works if mounted
1135 * with '-o dirstat'. 1135 * with '-o dirstat'.
1136 */ 1136 */
1137 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, 1137 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1138 loff_t *ppos) 1138 loff_t *ppos)
1139 { 1139 {
1140 struct ceph_file_info *cf = file->private_data; 1140 struct ceph_file_info *cf = file->private_data;
1141 struct inode *inode = file->f_dentry->d_inode; 1141 struct inode *inode = file->f_dentry->d_inode;
1142 struct ceph_inode_info *ci = ceph_inode(inode); 1142 struct ceph_inode_info *ci = ceph_inode(inode);
1143 int left; 1143 int left;
1144 const int bufsize = 1024; 1144 const int bufsize = 1024;
1145 1145
1146 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1146 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1147 return -EISDIR; 1147 return -EISDIR;
1148 1148
1149 if (!cf->dir_info) { 1149 if (!cf->dir_info) {
1150 cf->dir_info = kmalloc(bufsize, GFP_NOFS); 1150 cf->dir_info = kmalloc(bufsize, GFP_NOFS);
1151 if (!cf->dir_info) 1151 if (!cf->dir_info)
1152 return -ENOMEM; 1152 return -ENOMEM;
1153 cf->dir_info_len = 1153 cf->dir_info_len =
1154 snprintf(cf->dir_info, bufsize, 1154 snprintf(cf->dir_info, bufsize,
1155 "entries: %20lld\n" 1155 "entries: %20lld\n"
1156 " files: %20lld\n" 1156 " files: %20lld\n"
1157 " subdirs: %20lld\n" 1157 " subdirs: %20lld\n"
1158 "rentries: %20lld\n" 1158 "rentries: %20lld\n"
1159 " rfiles: %20lld\n" 1159 " rfiles: %20lld\n"
1160 " rsubdirs: %20lld\n" 1160 " rsubdirs: %20lld\n"
1161 "rbytes: %20lld\n" 1161 "rbytes: %20lld\n"
1162 "rctime: %10ld.%09ld\n", 1162 "rctime: %10ld.%09ld\n",
1163 ci->i_files + ci->i_subdirs, 1163 ci->i_files + ci->i_subdirs,
1164 ci->i_files, 1164 ci->i_files,
1165 ci->i_subdirs, 1165 ci->i_subdirs,
1166 ci->i_rfiles + ci->i_rsubdirs, 1166 ci->i_rfiles + ci->i_rsubdirs,
1167 ci->i_rfiles, 1167 ci->i_rfiles,
1168 ci->i_rsubdirs, 1168 ci->i_rsubdirs,
1169 ci->i_rbytes, 1169 ci->i_rbytes,
1170 (long)ci->i_rctime.tv_sec, 1170 (long)ci->i_rctime.tv_sec,
1171 (long)ci->i_rctime.tv_nsec); 1171 (long)ci->i_rctime.tv_nsec);
1172 } 1172 }
1173 1173
1174 if (*ppos >= cf->dir_info_len) 1174 if (*ppos >= cf->dir_info_len)
1175 return 0; 1175 return 0;
1176 size = min_t(unsigned, size, cf->dir_info_len-*ppos); 1176 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1177 left = copy_to_user(buf, cf->dir_info + *ppos, size); 1177 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1178 if (left == size) 1178 if (left == size)
1179 return -EFAULT; 1179 return -EFAULT;
1180 *ppos += (size - left); 1180 *ppos += (size - left);
1181 return size - left; 1181 return size - left;
1182 } 1182 }
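
Seen from userspace, this means read() on a directory descriptor returns
the stats text instead of failing with EISDIR when the filesystem was
mounted with -o dirstat. A self-contained sketch of such a consumer (the
mount point path is illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[1024];
		ssize_t n;
		int fd = open("/mnt/ceph/somedir", O_RDONLY);

		if (fd < 0)
			return 1;
		/* with -o dirstat this yields the rstats formatted above */
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		close(fd);
		return 0;
	}
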
1183 1183
1184 /* 1184 /*
1185 * an fsync() on a dir will wait for any uncommitted directory 1185 * an fsync() on a dir will wait for any uncommitted directory
1186 * operations to commit. 1186 * operations to commit.
1187 */ 1187 */
1188 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, 1188 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
1189 int datasync) 1189 int datasync)
1190 { 1190 {
1191 struct inode *inode = file->f_path.dentry->d_inode; 1191 struct inode *inode = file->f_path.dentry->d_inode;
1192 struct ceph_inode_info *ci = ceph_inode(inode); 1192 struct ceph_inode_info *ci = ceph_inode(inode);
1193 struct list_head *head = &ci->i_unsafe_dirops; 1193 struct list_head *head = &ci->i_unsafe_dirops;
1194 struct ceph_mds_request *req; 1194 struct ceph_mds_request *req;
1195 u64 last_tid; 1195 u64 last_tid;
1196 int ret = 0; 1196 int ret = 0;
1197 1197
1198 dout("dir_fsync %p\n", inode); 1198 dout("dir_fsync %p\n", inode);
1199 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1199 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1200 if (ret) 1200 if (ret)
1201 return ret; 1201 return ret;
1202 mutex_lock(&inode->i_mutex); 1202 mutex_lock(&inode->i_mutex);
1203 1203
1204 spin_lock(&ci->i_unsafe_lock); 1204 spin_lock(&ci->i_unsafe_lock);
1205 if (list_empty(head)) 1205 if (list_empty(head))
1206 goto out; 1206 goto out;
1207 1207
1208 req = list_entry(head->prev, 1208 req = list_entry(head->prev,
1209 struct ceph_mds_request, r_unsafe_dir_item); 1209 struct ceph_mds_request, r_unsafe_dir_item);
1210 last_tid = req->r_tid; 1210 last_tid = req->r_tid;
1211 1211
1212 do { 1212 do {
1213 ceph_mdsc_get_request(req); 1213 ceph_mdsc_get_request(req);
1214 spin_unlock(&ci->i_unsafe_lock); 1214 spin_unlock(&ci->i_unsafe_lock);
1215 1215
1216 dout("dir_fsync %p wait on tid %llu (until %llu)\n", 1216 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1217 inode, req->r_tid, last_tid); 1217 inode, req->r_tid, last_tid);
1218 if (req->r_timeout) { 1218 if (req->r_timeout) {
1219 ret = wait_for_completion_timeout( 1219 ret = wait_for_completion_timeout(
1220 &req->r_safe_completion, req->r_timeout); 1220 &req->r_safe_completion, req->r_timeout);
1221 if (ret > 0) 1221 if (ret > 0)
1222 ret = 0; 1222 ret = 0;
1223 else if (ret == 0) 1223 else if (ret == 0)
1224 ret = -EIO; /* timed out */ 1224 ret = -EIO; /* timed out */
1225 } else { 1225 } else {
1226 wait_for_completion(&req->r_safe_completion); 1226 wait_for_completion(&req->r_safe_completion);
1227 } 1227 }
1228 ceph_mdsc_put_request(req); 1228 ceph_mdsc_put_request(req);
1229 1229
1230 spin_lock(&ci->i_unsafe_lock); 1230 spin_lock(&ci->i_unsafe_lock);
1231 if (ret || list_empty(head)) 1231 if (ret || list_empty(head))
1232 break; 1232 break;
1233 req = list_entry(head->next, 1233 req = list_entry(head->next,
1234 struct ceph_mds_request, r_unsafe_dir_item); 1234 struct ceph_mds_request, r_unsafe_dir_item);
1235 } while (req->r_tid < last_tid); 1235 } while (req->r_tid < last_tid);
1236 out: 1236 out:
1237 spin_unlock(&ci->i_unsafe_lock); 1237 spin_unlock(&ci->i_unsafe_lock);
1238 mutex_unlock(&inode->i_mutex); 1238 mutex_unlock(&inode->i_mutex);
1239 1239
1240 return ret; 1240 return ret;
1241 } 1241 }
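
In userspace terms, an fsync() on a directory file descriptor is therefore
enough to wait out pending creates, unlinks and renames in that directory;
a short sketch (path illustrative):

	int dfd = open("/mnt/ceph/somedir", O_RDONLY | O_DIRECTORY);

	/* ... create or unlink entries in the directory ... */

	/* blocks until every uncommitted dir op is safe on the MDS */
	if (dfd >= 0)
		fsync(dfd);
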
1242 1242
1243 /* 1243 /*
1244 * We maintain a private dentry LRU. 1244 * We maintain a private dentry LRU.
1245 * 1245 *
1246 * FIXME: this needs to be changed to a per-mds lru to be useful. 1246 * FIXME: this needs to be changed to a per-mds lru to be useful.
1247 */ 1247 */
1248 void ceph_dentry_lru_add(struct dentry *dn) 1248 void ceph_dentry_lru_add(struct dentry *dn)
1249 { 1249 {
1250 struct ceph_dentry_info *di = ceph_dentry(dn); 1250 struct ceph_dentry_info *di = ceph_dentry(dn);
1251 struct ceph_mds_client *mdsc; 1251 struct ceph_mds_client *mdsc;
1252 1252
1253 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1253 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1254 dn->d_name.len, dn->d_name.name); 1254 dn->d_name.len, dn->d_name.name);
1255 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1255 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1256 spin_lock(&mdsc->dentry_lru_lock); 1256 spin_lock(&mdsc->dentry_lru_lock);
1257 list_add_tail(&di->lru, &mdsc->dentry_lru); 1257 list_add_tail(&di->lru, &mdsc->dentry_lru);
1258 mdsc->num_dentry++; 1258 mdsc->num_dentry++;
1259 spin_unlock(&mdsc->dentry_lru_lock); 1259 spin_unlock(&mdsc->dentry_lru_lock);
1260 } 1260 }
1261 1261
1262 void ceph_dentry_lru_touch(struct dentry *dn) 1262 void ceph_dentry_lru_touch(struct dentry *dn)
1263 { 1263 {
1264 struct ceph_dentry_info *di = ceph_dentry(dn); 1264 struct ceph_dentry_info *di = ceph_dentry(dn);
1265 struct ceph_mds_client *mdsc; 1265 struct ceph_mds_client *mdsc;
1266 1266
1267 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1267 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1268 dn->d_name.len, dn->d_name.name, di->offset); 1268 dn->d_name.len, dn->d_name.name, di->offset);
1269 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1269 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1270 spin_lock(&mdsc->dentry_lru_lock); 1270 spin_lock(&mdsc->dentry_lru_lock);
1271 list_move_tail(&di->lru, &mdsc->dentry_lru); 1271 list_move_tail(&di->lru, &mdsc->dentry_lru);
1272 spin_unlock(&mdsc->dentry_lru_lock); 1272 spin_unlock(&mdsc->dentry_lru_lock);
1273 } 1273 }
1274 1274
1275 void ceph_dentry_lru_del(struct dentry *dn) 1275 void ceph_dentry_lru_del(struct dentry *dn)
1276 { 1276 {
1277 struct ceph_dentry_info *di = ceph_dentry(dn); 1277 struct ceph_dentry_info *di = ceph_dentry(dn);
1278 struct ceph_mds_client *mdsc; 1278 struct ceph_mds_client *mdsc;
1279 1279
1280 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1280 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1281 dn->d_name.len, dn->d_name.name); 1281 dn->d_name.len, dn->d_name.name);
1282 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1282 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1283 spin_lock(&mdsc->dentry_lru_lock); 1283 spin_lock(&mdsc->dentry_lru_lock);
1284 list_del_init(&di->lru); 1284 list_del_init(&di->lru);
1285 mdsc->num_dentry--; 1285 mdsc->num_dentry--;
1286 spin_unlock(&mdsc->dentry_lru_lock); 1286 spin_unlock(&mdsc->dentry_lru_lock);
1287 } 1287 }
1288 1288
1289 /* 1289 /*
1290 * Return name hash for a given dentry. This is dependent on 1290 * Return name hash for a given dentry. This is dependent on
1291 * the parent directory's hash function. 1291 * the parent directory's hash function.
1292 */ 1292 */
1293 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) 1293 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
1294 { 1294 {
1295 struct ceph_inode_info *dci = ceph_inode(dir); 1295 struct ceph_inode_info *dci = ceph_inode(dir);
1296 1296
1297 switch (dci->i_dir_layout.dl_dir_hash) { 1297 switch (dci->i_dir_layout.dl_dir_hash) {
1298 case 0: /* for backward compat */ 1298 case 0: /* for backward compat */
1299 case CEPH_STR_HASH_LINUX: 1299 case CEPH_STR_HASH_LINUX:
1300 return dn->d_name.hash; 1300 return dn->d_name.hash;
1301 1301
1302 default: 1302 default:
1303 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, 1303 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
1304 dn->d_name.name, dn->d_name.len); 1304 dn->d_name.name, dn->d_name.len);
1305 } 1305 }
1306 } 1306 }
1307 1307
1308 const struct file_operations ceph_dir_fops = { 1308 const struct file_operations ceph_dir_fops = {
1309 .read = ceph_read_dir, 1309 .read = ceph_read_dir,
1310 .readdir = ceph_readdir, 1310 .readdir = ceph_readdir,
1311 .llseek = ceph_dir_llseek, 1311 .llseek = ceph_dir_llseek,
1312 .open = ceph_open, 1312 .open = ceph_open,
1313 .release = ceph_release, 1313 .release = ceph_release,
1314 .unlocked_ioctl = ceph_ioctl, 1314 .unlocked_ioctl = ceph_ioctl,
1315 .fsync = ceph_dir_fsync, 1315 .fsync = ceph_dir_fsync,
1316 }; 1316 };
1317 1317
1318 const struct inode_operations ceph_dir_iops = { 1318 const struct inode_operations ceph_dir_iops = {
1319 .lookup = ceph_lookup, 1319 .lookup = ceph_lookup,
1320 .permission = ceph_permission, 1320 .permission = ceph_permission,
1321 .getattr = ceph_getattr, 1321 .getattr = ceph_getattr,
1322 .setattr = ceph_setattr, 1322 .setattr = ceph_setattr,
1323 .setxattr = ceph_setxattr, 1323 .setxattr = ceph_setxattr,
1324 .getxattr = ceph_getxattr, 1324 .getxattr = ceph_getxattr,
1325 .listxattr = ceph_listxattr, 1325 .listxattr = ceph_listxattr,
1326 .removexattr = ceph_removexattr, 1326 .removexattr = ceph_removexattr,
1327 .mknod = ceph_mknod, 1327 .mknod = ceph_mknod,
1328 .symlink = ceph_symlink, 1328 .symlink = ceph_symlink,
1329 .mkdir = ceph_mkdir, 1329 .mkdir = ceph_mkdir,
1330 .link = ceph_link, 1330 .link = ceph_link,
1331 .unlink = ceph_unlink, 1331 .unlink = ceph_unlink,
1332 .rmdir = ceph_unlink, 1332 .rmdir = ceph_unlink,
1333 .rename = ceph_rename, 1333 .rename = ceph_rename,
1334 .create = ceph_create, 1334 .create = ceph_create,
1335 .atomic_open = ceph_atomic_open, 1335 .atomic_open = ceph_atomic_open,
1336 }; 1336 };
1337 1337
1338 const struct dentry_operations ceph_dentry_ops = { 1338 const struct dentry_operations ceph_dentry_ops = {
1339 .d_revalidate = ceph_d_revalidate, 1339 .d_revalidate = ceph_d_revalidate,
1340 .d_release = ceph_d_release, 1340 .d_release = ceph_d_release,
1341 .d_prune = ceph_d_prune, 1341 .d_prune = ceph_d_prune,
1342 }; 1342 };
1343 1343
1344 const struct dentry_operations ceph_snapdir_dentry_ops = { 1344 const struct dentry_operations ceph_snapdir_dentry_ops = {
1345 .d_revalidate = ceph_snapdir_d_revalidate, 1345 .d_revalidate = ceph_snapdir_d_revalidate,
1346 .d_release = ceph_d_release, 1346 .d_release = ceph_d_release,
1347 }; 1347 };
1348 1348
1349 const struct dentry_operations ceph_snap_dentry_ops = { 1349 const struct dentry_operations ceph_snap_dentry_ops = {
1350 .d_release = ceph_d_release, 1350 .d_release = ceph_d_release,
1351 .d_prune = ceph_d_prune, 1351 .d_prune = ceph_d_prune,
1352 }; 1352 };
1353 1353
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/module.h> 3 #include <linux/module.h>
4 #include <linux/sched.h> 4 #include <linux/sched.h>
5 #include <linux/slab.h> 5 #include <linux/slab.h>
6 #include <linux/file.h> 6 #include <linux/file.h>
7 #include <linux/mount.h> 7 #include <linux/mount.h>
8 #include <linux/namei.h> 8 #include <linux/namei.h>
9 #include <linux/writeback.h> 9 #include <linux/writeback.h>
10 10
11 #include "super.h" 11 #include "super.h"
12 #include "mds_client.h" 12 #include "mds_client.h"
13 13
14 /* 14 /*
15 * Ceph file operations 15 * Ceph file operations
16 * 16 *
17 * Implement basic open/close functionality, and implement 17 * Implement basic open/close functionality, and implement
18 * read/write. 18 * read/write.
19 * 19 *
20 * We implement three modes of file I/O: 20 * We implement three modes of file I/O:
21 * - buffered uses the generic_file_aio_{read,write} helpers 21 * - buffered uses the generic_file_aio_{read,write} helpers
22 * 22 *
23 * - synchronous is used when there is multi-client read/write 23 * - synchronous is used when there is multi-client read/write
24 * sharing, avoids the page cache, and synchronously waits for an 24 * sharing, avoids the page cache, and synchronously waits for an
25 * ack from the OSD. 25 * ack from the OSD.
26 * 26 *
27 * - direct io takes the variant of the sync path that references 27 * - direct io takes the variant of the sync path that references
28 * user pages directly. 28 * user pages directly.
29 * 29 *
30 * fsync() flushes and waits on dirty pages, but just queues metadata 30 * fsync() flushes and waits on dirty pages, but just queues metadata
31 * for writeback: since the MDS can recover size and mtime there is no 31 * for writeback: since the MDS can recover size and mtime there is no
32 * need to wait for MDS acknowledgement. 32 * need to wait for MDS acknowledgement.
33 */ 33 */
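
A condensed sketch of how a read is routed between these three modes
(illustrative pseudologic only; the real dispatch lives in the read/write
entry points further down this file):

	/*
	 * if (file->f_flags & O_DIRECT)
	 *	-> direct io: sync path referencing user pages
	 * else if (the cap to cache file data is not issued)
	 *	-> synchronous path, bypassing the page cache
	 * else
	 *	-> buffered path via generic_file_aio_read()
	 */
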
34 34
35 35
36 /* 36 /*
37 * Prepare an open request. Preallocate ceph_cap to avoid an 37 * Prepare an open request. Preallocate ceph_cap to avoid an
38 * inopportune ENOMEM later. 38 * inopportune ENOMEM later.
39 */ 39 */
40 static struct ceph_mds_request * 40 static struct ceph_mds_request *
41 prepare_open_request(struct super_block *sb, int flags, int create_mode) 41 prepare_open_request(struct super_block *sb, int flags, int create_mode)
42 { 42 {
43 struct ceph_fs_client *fsc = ceph_sb_to_client(sb); 43 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
44 struct ceph_mds_client *mdsc = fsc->mdsc; 44 struct ceph_mds_client *mdsc = fsc->mdsc;
45 struct ceph_mds_request *req; 45 struct ceph_mds_request *req;
46 int want_auth = USE_ANY_MDS; 46 int want_auth = USE_ANY_MDS;
47 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 47 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
48 48
49 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) 49 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
50 want_auth = USE_AUTH_MDS; 50 want_auth = USE_AUTH_MDS;
51 51
52 req = ceph_mdsc_create_request(mdsc, op, want_auth); 52 req = ceph_mdsc_create_request(mdsc, op, want_auth);
53 if (IS_ERR(req)) 53 if (IS_ERR(req))
54 goto out; 54 goto out;
55 req->r_fmode = ceph_flags_to_mode(flags); 55 req->r_fmode = ceph_flags_to_mode(flags);
56 req->r_args.open.flags = cpu_to_le32(flags); 56 req->r_args.open.flags = cpu_to_le32(flags);
57 req->r_args.open.mode = cpu_to_le32(create_mode); 57 req->r_args.open.mode = cpu_to_le32(create_mode);
58 out: 58 out:
59 return req; 59 return req;
60 } 60 }
61 61
62 /* 62 /*
63 * initialize private struct file data. 63 * initialize private struct file data.
64 * if we fail, clean up by dropping fmode reference on the ceph_inode 64 * if we fail, clean up by dropping fmode reference on the ceph_inode
65 */ 65 */
66 static int ceph_init_file(struct inode *inode, struct file *file, int fmode) 66 static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
67 { 67 {
68 struct ceph_file_info *cf; 68 struct ceph_file_info *cf;
69 int ret = 0; 69 int ret = 0;
70 70
71 switch (inode->i_mode & S_IFMT) { 71 switch (inode->i_mode & S_IFMT) {
72 case S_IFREG: 72 case S_IFREG:
73 case S_IFDIR: 73 case S_IFDIR:
74 dout("init_file %p %p 0%o (regular)\n", inode, file, 74 dout("init_file %p %p 0%o (regular)\n", inode, file,
75 inode->i_mode); 75 inode->i_mode);
76 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); 76 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
77 if (cf == NULL) { 77 if (cf == NULL) {
78 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 78 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
79 return -ENOMEM; 79 return -ENOMEM;
80 } 80 }
81 cf->fmode = fmode; 81 cf->fmode = fmode;
82 cf->next_offset = 2; 82 cf->next_offset = 2;
83 file->private_data = cf; 83 file->private_data = cf;
84 BUG_ON(inode->i_fop->release != ceph_release); 84 BUG_ON(inode->i_fop->release != ceph_release);
85 break; 85 break;
86 86
87 case S_IFLNK: 87 case S_IFLNK:
88 dout("init_file %p %p 0%o (symlink)\n", inode, file, 88 dout("init_file %p %p 0%o (symlink)\n", inode, file,
89 inode->i_mode); 89 inode->i_mode);
90 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 90 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
91 break; 91 break;
92 92
93 default: 93 default:
94 dout("init_file %p %p 0%o (special)\n", inode, file, 94 dout("init_file %p %p 0%o (special)\n", inode, file,
95 inode->i_mode); 95 inode->i_mode);
96 /* 96 /*
97 * we need to drop the open ref now, since we don't 97 * we need to drop the open ref now, since we don't
98 * have .release set to ceph_release. 98 * have .release set to ceph_release.
99 */ 99 */
100 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 100 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
101 BUG_ON(inode->i_fop->release == ceph_release); 101 BUG_ON(inode->i_fop->release == ceph_release);
102 102
103 /* call the proper open fop */ 103 /* call the proper open fop */
104 ret = inode->i_fop->open(inode, file); 104 ret = inode->i_fop->open(inode, file);
105 } 105 }
106 return ret; 106 return ret;
107 } 107 }
108 108
109 /* 109 /*
110 * If we already have the requisite capabilities, we can satisfy 110 * If we already have the requisite capabilities, we can satisfy
111 * the open request locally (no need to request new caps from the 111 * the open request locally (no need to request new caps from the
112 * MDS). We do, however, need to inform the MDS (asynchronously) 112 * MDS). We do, however, need to inform the MDS (asynchronously)
113 * if our wanted caps set expands. 113 * if our wanted caps set expands.
114 */ 114 */
115 int ceph_open(struct inode *inode, struct file *file) 115 int ceph_open(struct inode *inode, struct file *file)
116 { 116 {
117 struct ceph_inode_info *ci = ceph_inode(inode); 117 struct ceph_inode_info *ci = ceph_inode(inode);
118 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 118 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
119 struct ceph_mds_client *mdsc = fsc->mdsc; 119 struct ceph_mds_client *mdsc = fsc->mdsc;
120 struct ceph_mds_request *req; 120 struct ceph_mds_request *req;
121 struct ceph_file_info *cf = file->private_data; 121 struct ceph_file_info *cf = file->private_data;
122 struct inode *parent_inode = NULL; 122 struct inode *parent_inode = NULL;
123 int err; 123 int err;
124 int flags, fmode, wanted; 124 int flags, fmode, wanted;
125 125
126 if (cf) { 126 if (cf) {
127 dout("open file %p is already opened\n", file); 127 dout("open file %p is already opened\n", file);
128 return 0; 128 return 0;
129 } 129 }
130 130
131 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ 131 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
132 flags = file->f_flags & ~(O_CREAT|O_EXCL); 132 flags = file->f_flags & ~(O_CREAT|O_EXCL);
133 if (S_ISDIR(inode->i_mode)) 133 if (S_ISDIR(inode->i_mode))
134 flags = O_DIRECTORY; /* mds likes to know */ 134 flags = O_DIRECTORY; /* mds likes to know */
135 135
136 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, 136 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
137 ceph_vinop(inode), file, flags, file->f_flags); 137 ceph_vinop(inode), file, flags, file->f_flags);
138 fmode = ceph_flags_to_mode(flags); 138 fmode = ceph_flags_to_mode(flags);
139 wanted = ceph_caps_for_mode(fmode); 139 wanted = ceph_caps_for_mode(fmode);
140 140
141 /* snapped files are read-only */ 141 /* snapped files are read-only */
142 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) 142 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
143 return -EROFS; 143 return -EROFS;
144 144
145 /* trivially open snapdir */ 145 /* trivially open snapdir */
146 if (ceph_snap(inode) == CEPH_SNAPDIR) { 146 if (ceph_snap(inode) == CEPH_SNAPDIR) {
147 spin_lock(&ci->i_ceph_lock); 147 spin_lock(&ci->i_ceph_lock);
148 __ceph_get_fmode(ci, fmode); 148 __ceph_get_fmode(ci, fmode);
149 spin_unlock(&ci->i_ceph_lock); 149 spin_unlock(&ci->i_ceph_lock);
150 return ceph_init_file(inode, file, fmode); 150 return ceph_init_file(inode, file, fmode);
151 } 151 }
152 152
153 /* 153 /*
154 * No need to block if we have caps on the auth MDS (for 154 * No need to block if we have caps on the auth MDS (for
155 * write) or any MDS (for read). Update wanted set 155 * write) or any MDS (for read). Update wanted set
156 * asynchronously. 156 * asynchronously.
157 */ 157 */
158 spin_lock(&ci->i_ceph_lock); 158 spin_lock(&ci->i_ceph_lock);
159 if (__ceph_is_any_real_caps(ci) && 159 if (__ceph_is_any_real_caps(ci) &&
160 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { 160 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci); 161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL); 162 int issued = __ceph_caps_issued(ci, NULL);
163 163
164 dout("open %p fmode %d want %s issued %s using existing\n", 164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted), 165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued)); 166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode); 167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&ci->i_ceph_lock); 168 spin_unlock(&ci->i_ceph_lock);
169 169
170 /* adjust wanted? */ 170 /* adjust wanted? */
171 if ((issued & wanted) != wanted && 171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted && 172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR) 173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL); 174 ceph_check_caps(ci, 0, NULL);
175 175
176 return ceph_init_file(inode, file, fmode); 176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP && 177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) { 178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode); 179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&ci->i_ceph_lock); 180 spin_unlock(&ci->i_ceph_lock);
181 return ceph_init_file(inode, file, fmode); 181 return ceph_init_file(inode, file, fmode);
182 } 182 }
183 spin_unlock(&ci->i_ceph_lock); 183 spin_unlock(&ci->i_ceph_lock);
184 184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); 185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0); 186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) { 187 if (IS_ERR(req)) {
188 err = PTR_ERR(req); 188 err = PTR_ERR(req);
189 goto out; 189 goto out;
190 } 190 }
191 req->r_inode = inode; 191 req->r_inode = inode;
192 ihold(inode); 192 ihold(inode);
193 req->r_num_caps = 1; 193 req->r_num_caps = 1;
194 if (flags & (O_CREAT|O_TRUNC)) 194 if (flags & (O_CREAT|O_TRUNC))
195 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 195 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
196 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 196 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
197 iput(parent_inode); 197 iput(parent_inode);
198 if (!err) 198 if (!err)
199 err = ceph_init_file(inode, file, req->r_fmode); 199 err = ceph_init_file(inode, file, req->r_fmode);
200 ceph_mdsc_put_request(req); 200 ceph_mdsc_put_request(req);
201 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); 201 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
202 out: 202 out:
203 return err; 203 return err;
204 } 204 }
205 205
206 206
207 /* 207 /*
208 * Do a lookup + open with a single request. If we get a non-existent 208 * Do a lookup + open with a single request. If we get a non-existent
209 * file or symlink, return 1 so the VFS can retry. 209 * file or symlink, return 1 so the VFS can retry.
210 */ 210 */
211 int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 211 int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
212 struct file *file, unsigned flags, umode_t mode, 212 struct file *file, unsigned flags, umode_t mode,
213 int *opened) 213 int *opened)
214 { 214 {
215 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 215 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
216 struct ceph_mds_client *mdsc = fsc->mdsc; 216 struct ceph_mds_client *mdsc = fsc->mdsc;
217 struct ceph_mds_request *req; 217 struct ceph_mds_request *req;
218 struct dentry *dn; 218 struct dentry *dn;
219 int err; 219 int err;
220 220
221 dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", 221 dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
222 dir, dentry, dentry->d_name.len, dentry->d_name.name, 222 dir, dentry, dentry->d_name.len, dentry->d_name.name,
223 d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); 223 d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
224 224
225 if (dentry->d_name.len > NAME_MAX) 225 if (dentry->d_name.len > NAME_MAX)
226 return -ENAMETOOLONG; 226 return -ENAMETOOLONG;
227 227
228 err = ceph_init_dentry(dentry); 228 err = ceph_init_dentry(dentry);
229 if (err < 0) 229 if (err < 0)
230 return err; 230 return err;
231 231
232 /* do the open */ 232 /* do the open */
233 req = prepare_open_request(dir->i_sb, flags, mode); 233 req = prepare_open_request(dir->i_sb, flags, mode);
234 if (IS_ERR(req)) 234 if (IS_ERR(req))
235 return PTR_ERR(req); 235 return PTR_ERR(req);
236 req->r_dentry = dget(dentry); 236 req->r_dentry = dget(dentry);
237 req->r_num_caps = 2; 237 req->r_num_caps = 2;
238 if (flags & O_CREAT) { 238 if (flags & O_CREAT) {
239 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 239 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
240 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 240 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
241 } 241 }
242 req->r_locked_dir = dir; /* caller holds dir->i_mutex */ 242 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
243 err = ceph_mdsc_do_request(mdsc, 243 err = ceph_mdsc_do_request(mdsc,
244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
245 req); 245 req);
246 err = ceph_handle_snapdir(req, dentry, err); 246 err = ceph_handle_snapdir(req, dentry, err);
247 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 247 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
248 err = ceph_handle_notrace_create(dir, dentry); 248 err = ceph_handle_notrace_create(dir, dentry);
249 249
250 if (d_unhashed(dentry)) { 250 if (d_unhashed(dentry)) {
251 dn = ceph_finish_lookup(req, dentry, err); 251 dn = ceph_finish_lookup(req, dentry, err);
252 if (IS_ERR(dn)) 252 if (IS_ERR(dn))
253 err = PTR_ERR(dn); 253 err = PTR_ERR(dn);
254 } else { 254 } else {
255 /* we were given a hashed negative dentry */ 255 /* we were given a hashed negative dentry */
256 dn = NULL; 256 dn = NULL;
257 } 257 }
258 if (err) 258 if (err)
259 goto out_err; 259 goto out_err;
260 if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { 260 if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
261 /* make vfs retry on splice, ENOENT, or symlink */ 261 /* make vfs retry on splice, ENOENT, or symlink */
262 dout("atomic_open finish_no_open on dn %p\n", dn); 262 dout("atomic_open finish_no_open on dn %p\n", dn);
263 err = finish_no_open(file, dn); 263 err = finish_no_open(file, dn);
264 } else { 264 } else {
265 dout("atomic_open finish_open on dn %p\n", dn); 265 dout("atomic_open finish_open on dn %p\n", dn);
266 err = finish_open(file, dentry, ceph_open, opened); 266 err = finish_open(file, dentry, ceph_open, opened);
267 } 267 }
268 268
269 out_err: 269 out_err:
270 ceph_mdsc_put_request(req); 270 ceph_mdsc_put_request(req);
271 dout("atomic_open result=%d\n", err); 271 dout("atomic_open result=%d\n", err);
272 return err; 272 return err;
273 } 273 }
274 274
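An aside on the function above: ->atomic_open lets the VFS do the lookup and the open (and, with O_CREAT, the create) in one MDS round trip. A minimal user-space sketch of the behavior it serves, assuming a hypothetical path, a single open(2) call that looks up and creates the name together:

#include <fcntl.h>

/* Hedged illustration, not part of this commit: O_CREAT|O_EXCL makes
 * the create exclusive (EEXIST if the name already exists), so lookup
 * and create cannot race with each other from the caller's view. */
static int create_exclusive(const char *path)
{
	return open(path, O_CREAT | O_EXCL | O_WRONLY, 0644);
}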
275 int ceph_release(struct inode *inode, struct file *file) 275 int ceph_release(struct inode *inode, struct file *file)
276 { 276 {
277 struct ceph_inode_info *ci = ceph_inode(inode); 277 struct ceph_inode_info *ci = ceph_inode(inode);
278 struct ceph_file_info *cf = file->private_data; 278 struct ceph_file_info *cf = file->private_data;
279 279
280 dout("release inode %p file %p\n", inode, file); 280 dout("release inode %p file %p\n", inode, file);
281 ceph_put_fmode(ci, cf->fmode); 281 ceph_put_fmode(ci, cf->fmode);
282 if (cf->last_readdir) 282 if (cf->last_readdir)
283 ceph_mdsc_put_request(cf->last_readdir); 283 ceph_mdsc_put_request(cf->last_readdir);
284 kfree(cf->last_name); 284 kfree(cf->last_name);
285 kfree(cf->dir_info); 285 kfree(cf->dir_info);
286 dput(cf->dentry); 286 dput(cf->dentry);
287 kmem_cache_free(ceph_file_cachep, cf); 287 kmem_cache_free(ceph_file_cachep, cf);
288 288
289 /* wake up anyone waiting for caps on this inode */ 289 /* wake up anyone waiting for caps on this inode */
290 wake_up_all(&ci->i_cap_wq); 290 wake_up_all(&ci->i_cap_wq);
291 return 0; 291 return 0;
292 } 292 }
293 293
294 /* 294 /*
295 * Read a range of bytes striped over one or more objects. Iterate over 295 * Read a range of bytes striped over one or more objects. Iterate over
296 * objects we stripe over. (That's not atomic, but good enough for now.) 296 * objects we stripe over. (That's not atomic, but good enough for now.)
297 * 297 *
298 * If we get a short result from the OSD, check against i_size; we need to 298 * If we get a short result from the OSD, check against i_size; we need to
299 * only return a short read to the caller if we hit EOF. 299 * only return a short read to the caller if we hit EOF.
300 */ 300 */
301 static int striped_read(struct inode *inode, 301 static int striped_read(struct inode *inode,
302 u64 off, u64 len, 302 u64 off, u64 len,
303 struct page **pages, int num_pages, 303 struct page **pages, int num_pages,
304 int *checkeof, bool o_direct, 304 int *checkeof, bool o_direct,
305 unsigned long buf_align) 305 unsigned long buf_align)
306 { 306 {
307 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 307 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
308 struct ceph_inode_info *ci = ceph_inode(inode); 308 struct ceph_inode_info *ci = ceph_inode(inode);
309 u64 pos, this_len; 309 u64 pos, this_len;
310 int io_align, page_align; 310 int io_align, page_align;
311 int left, pages_left; 311 int left, pages_left;
312 int read; 312 int read;
313 struct page **page_pos; 313 struct page **page_pos;
314 int ret; 314 int ret;
315 bool hit_stripe, was_short; 315 bool hit_stripe, was_short;
316 316
317 /* 317 /*
318 * we may need to do multiple reads. not atomic, unfortunately. 318 * we may need to do multiple reads. not atomic, unfortunately.
319 */ 319 */
320 pos = off; 320 pos = off;
321 left = len; 321 left = len;
322 page_pos = pages; 322 page_pos = pages;
323 pages_left = num_pages; 323 pages_left = num_pages;
324 read = 0; 324 read = 0;
325 io_align = off & ~PAGE_MASK; 325 io_align = off & ~PAGE_MASK;
326 326
327 more: 327 more:
328 if (o_direct) 328 if (o_direct)
329 page_align = (pos - io_align + buf_align) & ~PAGE_MASK; 329 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
330 else 330 else
331 page_align = pos & ~PAGE_MASK; 331 page_align = pos & ~PAGE_MASK;
332 this_len = left; 332 this_len = left;
333 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 333 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
334 &ci->i_layout, pos, &this_len, 334 &ci->i_layout, pos, &this_len,
335 ci->i_truncate_seq, 335 ci->i_truncate_seq,
336 ci->i_truncate_size, 336 ci->i_truncate_size,
337 page_pos, pages_left, page_align); 337 page_pos, pages_left, page_align);
338 if (ret == -ENOENT) 338 if (ret == -ENOENT)
339 ret = 0; 339 ret = 0;
340 hit_stripe = this_len < left; 340 hit_stripe = this_len < left;
341 was_short = ret >= 0 && ret < this_len; 341 was_short = ret >= 0 && ret < this_len;
342 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, 342 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
343 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); 343 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
344 344
345 if (ret > 0) { 345 if (ret > 0) {
346 int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; 346 int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
347 347
348 if (read < pos - off) { 348 if (read < pos - off) {
349 dout(" zero gap %llu to %llu\n", off + read, pos); 349 dout(" zero gap %llu to %llu\n", off + read, pos);
350 ceph_zero_page_vector_range(page_align + read, 350 ceph_zero_page_vector_range(page_align + read,
351 pos - off - read, pages); 351 pos - off - read, pages);
352 } 352 }
353 pos += ret; 353 pos += ret;
354 read = pos - off; 354 read = pos - off;
355 left -= ret; 355 left -= ret;
356 page_pos += didpages; 356 page_pos += didpages;
357 pages_left -= didpages; 357 pages_left -= didpages;
358 358
359 /* hit stripe? */ 359 /* hit stripe? */
360 if (left && hit_stripe) 360 if (left && hit_stripe)
361 goto more; 361 goto more;
362 } 362 }
363 363
364 if (was_short) { 364 if (was_short) {
365 /* did we bounce off eof? */ 365 /* did we bounce off eof? */
366 if (pos + left > inode->i_size) 366 if (pos + left > inode->i_size)
367 *checkeof = 1; 367 *checkeof = 1;
368 368
369 /* zero trailing bytes (inside i_size) */ 369 /* zero trailing bytes (inside i_size) */
370 if (left > 0 && pos < inode->i_size) { 370 if (left > 0 && pos < inode->i_size) {
371 if (pos + left > inode->i_size) 371 if (pos + left > inode->i_size)
372 left = inode->i_size - pos; 372 left = inode->i_size - pos;
373 373
374 dout("zero tail %d\n", left); 374 dout("zero tail %d\n", left);
375 ceph_zero_page_vector_range(page_align + read, left, 375 ceph_zero_page_vector_range(page_align + read, left,
376 pages); 376 pages);
377 read += left; 377 read += left;
378 } 378 }
379 } 379 }
380 380
381 if (ret >= 0) 381 if (ret >= 0)
382 ret = read; 382 ret = read;
383 dout("striped_read returns %d\n", ret); 383 dout("striped_read returns %d\n", ret);
384 return ret; 384 return ret;
385 } 385 }
386 386
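The splitting logic above is easier to see in isolation. A hedged sketch under the simplifying assumption of a fixed stripe unit 'su'; in the real code the trimmed extent length comes back through ceph_osdc_readpages(), which shortens this_len at the object boundary:

#include <stdint.h>

/* How much a single OSD read can cover before hitting the stripe
 * edge; the caller loops (the "more:" label above) until 'left' is 0. */
static uint64_t chunk_len(uint64_t pos, uint64_t left, uint64_t su)
{
	uint64_t room = su - (pos % su);	/* bytes to the object boundary */
	return left < room ? left : room;
}

/* e.g. with su = 4 MiB, a 6 MiB read at offset 3 MiB splits into
 * chunks of 1 MiB, 4 MiB, and 1 MiB. */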
387 /* 387 /*
388 * Completely synchronous read and write methods. Direct from __user 388 * Completely synchronous read and write methods. Direct from __user
389 * buffer to osd, or directly to user pages (if O_DIRECT). 389 * buffer to osd, or directly to user pages (if O_DIRECT).
390 * 390 *
391 * If the read spans object boundary, just do multiple reads. 391 * If the read spans object boundary, just do multiple reads.
392 */ 392 */
393 static ssize_t ceph_sync_read(struct file *file, char __user *data, 393 static ssize_t ceph_sync_read(struct file *file, char __user *data,
394 unsigned len, loff_t *poff, int *checkeof) 394 unsigned len, loff_t *poff, int *checkeof)
395 { 395 {
396 struct inode *inode = file->f_dentry->d_inode; 396 struct inode *inode = file->f_dentry->d_inode;
397 struct page **pages; 397 struct page **pages;
398 u64 off = *poff; 398 u64 off = *poff;
399 int num_pages, ret; 399 int num_pages, ret;
400 400
401 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 401 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
402 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 402 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
403 403
404 if (file->f_flags & O_DIRECT) { 404 if (file->f_flags & O_DIRECT) {
405 num_pages = calc_pages_for((unsigned long)data, len); 405 num_pages = calc_pages_for((unsigned long)data, len);
406 pages = ceph_get_direct_page_vector(data, num_pages, true); 406 pages = ceph_get_direct_page_vector(data, num_pages, true);
407 } else { 407 } else {
408 num_pages = calc_pages_for(off, len); 408 num_pages = calc_pages_for(off, len);
409 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 409 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
410 } 410 }
411 if (IS_ERR(pages)) 411 if (IS_ERR(pages))
412 return PTR_ERR(pages); 412 return PTR_ERR(pages);
413 413
414 /* 414 /*
415 * flush any page cache pages in this range. this 415 * flush any page cache pages in this range. this
416 * will make concurrent normal and sync io slow, 416 * will make concurrent normal and sync io slow,
417 * but it will at least behave sensibly when they are 417 * but it will at least behave sensibly when they are
418 * in sequence. 418 * in sequence.
419 */ 419 */
420 ret = filemap_write_and_wait(inode->i_mapping); 420 ret = filemap_write_and_wait(inode->i_mapping);
421 if (ret < 0) 421 if (ret < 0)
422 goto done; 422 goto done;
423 423
424 ret = striped_read(inode, off, len, pages, num_pages, checkeof, 424 ret = striped_read(inode, off, len, pages, num_pages, checkeof,
425 file->f_flags & O_DIRECT, 425 file->f_flags & O_DIRECT,
426 (unsigned long)data & ~PAGE_MASK); 426 (unsigned long)data & ~PAGE_MASK);
427 427
428 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 428 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
429 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 429 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
430 if (ret >= 0) 430 if (ret >= 0)
431 *poff = off + ret; 431 *poff = off + ret;
432 432
433 done: 433 done:
434 if (file->f_flags & O_DIRECT) 434 if (file->f_flags & O_DIRECT)
435 ceph_put_page_vector(pages, num_pages, true); 435 ceph_put_page_vector(pages, num_pages, true);
436 else 436 else
437 ceph_release_page_vector(pages, num_pages); 437 ceph_release_page_vector(pages, num_pages);
438 dout("sync_read result %d\n", ret); 438 dout("sync_read result %d\n", ret);
439 return ret; 439 return ret;
440 } 440 }
441 441
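Both the O_DIRECT and buffered paths above size their page vectors with calc_pages_for(). A hedged analogue of that arithmetic, assuming 4 KiB pages: the count is the number of page frames the byte range [off, off+len) touches, including partial first and last pages:

#include <stdint.h>

#define ASSUMED_PAGE_SIZE 4096u	/* illustration only */

static uint32_t pages_spanned(uint64_t off, uint64_t len)
{
	if (len == 0)
		return 0;
	uint64_t first = off / ASSUMED_PAGE_SIZE;
	uint64_t last = (off + len - 1) / ASSUMED_PAGE_SIZE;
	return (uint32_t)(last - first + 1);
}

/* pages_spanned(4095, 2) == 2: one byte in each of two pages. */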
442 /* 442 /*
443 * Write commit callback, called if we requested both an ACK and 443 * Write commit callback, called if we requested both an ACK and
444 * ONDISK commit reply from the OSD. 444 * ONDISK commit reply from the OSD.
445 */ 445 */
446 static void sync_write_commit(struct ceph_osd_request *req, 446 static void sync_write_commit(struct ceph_osd_request *req,
447 struct ceph_msg *msg) 447 struct ceph_msg *msg)
448 { 448 {
449 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 449 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
450 450
451 dout("sync_write_commit %p tid %llu\n", req, req->r_tid); 451 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
452 spin_lock(&ci->i_unsafe_lock); 452 spin_lock(&ci->i_unsafe_lock);
453 list_del_init(&req->r_unsafe_item); 453 list_del_init(&req->r_unsafe_item);
454 spin_unlock(&ci->i_unsafe_lock); 454 spin_unlock(&ci->i_unsafe_lock);
455 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); 455 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
456 } 456 }
457 457
458 /* 458 /*
459 * Synchronous write, straight from __user pointer or user pages (if 459 * Synchronous write, straight from __user pointer or user pages (if
460 * O_DIRECT). 460 * O_DIRECT).
461 * 461 *
462 * If write spans object boundary, just do multiple writes. (For a 462 * If write spans object boundary, just do multiple writes. (For a
463 * correct atomic write, we should e.g. take write locks on all 463 * correct atomic write, we should e.g. take write locks on all
464 * objects, rollback on failure, etc.) 464 * objects, rollback on failure, etc.)
465 */ 465 */
466 static ssize_t ceph_sync_write(struct file *file, const char __user *data, 466 static ssize_t ceph_sync_write(struct file *file, const char __user *data,
467 size_t left, loff_t *offset) 467 size_t left, loff_t *offset)
468 { 468 {
469 struct inode *inode = file->f_dentry->d_inode; 469 struct inode *inode = file->f_dentry->d_inode;
470 struct ceph_inode_info *ci = ceph_inode(inode); 470 struct ceph_inode_info *ci = ceph_inode(inode);
471 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 471 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
472 struct ceph_osd_request *req; 472 struct ceph_osd_request *req;
473 struct page **pages; 473 struct page **pages;
474 int num_pages; 474 int num_pages;
475 long long unsigned pos; 475 long long unsigned pos;
476 u64 len; 476 u64 len;
477 int written = 0; 477 int written = 0;
478 int flags; 478 int flags;
479 int do_sync = 0; 479 int do_sync = 0;
480 int check_caps = 0; 480 int check_caps = 0;
481 int page_align, io_align; 481 int page_align, io_align;
482 unsigned long buf_align; 482 unsigned long buf_align;
483 int ret; 483 int ret;
484 struct timespec mtime = CURRENT_TIME; 484 struct timespec mtime = CURRENT_TIME;
485 485
486 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) 486 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
487 return -EROFS; 487 return -EROFS;
488 488
489 dout("sync_write on file %p %lld~%u %s\n", file, *offset, 489 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
490 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 490 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
491 491
492 if (file->f_flags & O_APPEND) 492 if (file->f_flags & O_APPEND)
493 pos = i_size_read(inode); 493 pos = i_size_read(inode);
494 else 494 else
495 pos = *offset; 495 pos = *offset;
496 496
497 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 497 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
498 if (ret < 0) 498 if (ret < 0)
499 return ret; 499 return ret;
500 500
501 ret = invalidate_inode_pages2_range(inode->i_mapping, 501 ret = invalidate_inode_pages2_range(inode->i_mapping,
502 pos >> PAGE_CACHE_SHIFT, 502 pos >> PAGE_CACHE_SHIFT,
503 (pos + left) >> PAGE_CACHE_SHIFT); 503 (pos + left) >> PAGE_CACHE_SHIFT);
504 if (ret < 0) 504 if (ret < 0)
505 dout("invalidate_inode_pages2_range returned %d\n", ret); 505 dout("invalidate_inode_pages2_range returned %d\n", ret);
506 506
507 flags = CEPH_OSD_FLAG_ORDERSNAP | 507 flags = CEPH_OSD_FLAG_ORDERSNAP |
508 CEPH_OSD_FLAG_ONDISK | 508 CEPH_OSD_FLAG_ONDISK |
509 CEPH_OSD_FLAG_WRITE; 509 CEPH_OSD_FLAG_WRITE;
510 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) 510 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
511 flags |= CEPH_OSD_FLAG_ACK; 511 flags |= CEPH_OSD_FLAG_ACK;
512 else 512 else
513 do_sync = 1; 513 do_sync = 1;
514 514
515 /* 515 /*
516 * we may need to do multiple writes here if we span an object 516 * we may need to do multiple writes here if we span an object
517 * boundary. this isn't atomic, unfortunately. :( 517 * boundary. this isn't atomic, unfortunately. :(
518 */ 518 */
519 more: 519 more:
520 io_align = pos & ~PAGE_MASK; 520 io_align = pos & ~PAGE_MASK;
521 buf_align = (unsigned long)data & ~PAGE_MASK; 521 buf_align = (unsigned long)data & ~PAGE_MASK;
522 len = left; 522 len = left;
523 if (file->f_flags & O_DIRECT) { 523 if (file->f_flags & O_DIRECT) {
524 /* write from beginning of first page, regardless of 524 /* write from beginning of first page, regardless of
525 io alignment */ 525 io alignment */
526 page_align = (pos - io_align + buf_align) & ~PAGE_MASK; 526 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
527 num_pages = calc_pages_for((unsigned long)data, len); 527 num_pages = calc_pages_for((unsigned long)data, len);
528 } else { 528 } else {
529 page_align = pos & ~PAGE_MASK; 529 page_align = pos & ~PAGE_MASK;
530 num_pages = calc_pages_for(pos, len); 530 num_pages = calc_pages_for(pos, len);
531 } 531 }
532 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 532 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
533 ceph_vino(inode), pos, &len, 533 ceph_vino(inode), pos, &len,
534 CEPH_OSD_OP_WRITE, flags, 534 CEPH_OSD_OP_WRITE, flags,
535 ci->i_snap_realm->cached_context, 535 ci->i_snap_realm->cached_context,
536 do_sync, 536 do_sync,
537 ci->i_truncate_seq, ci->i_truncate_size, 537 ci->i_truncate_seq, ci->i_truncate_size,
538 &mtime, false, 2, page_align); 538 &mtime, false, 2, page_align);
539 if (IS_ERR(req)) 539 if (IS_ERR(req))
540 return PTR_ERR(req); 540 return PTR_ERR(req);
541 541
542 if (file->f_flags & O_DIRECT) { 542 if (file->f_flags & O_DIRECT) {
543 pages = ceph_get_direct_page_vector(data, num_pages, false); 543 pages = ceph_get_direct_page_vector(data, num_pages, false);
544 if (IS_ERR(pages)) { 544 if (IS_ERR(pages)) {
545 ret = PTR_ERR(pages); 545 ret = PTR_ERR(pages);
546 goto out; 546 goto out;
547 } 547 }
548 548
549 /* 549 /*
550 * throw out any page cache pages in this range. this 550 * throw out any page cache pages in this range. this
551 * may block. 551 * may block.
552 */ 552 */
553 truncate_inode_pages_range(inode->i_mapping, pos, 553 truncate_inode_pages_range(inode->i_mapping, pos,
554 (pos+len) | (PAGE_CACHE_SIZE-1)); 554 (pos+len) | (PAGE_CACHE_SIZE-1));
555 } else { 555 } else {
556 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 556 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
557 if (IS_ERR(pages)) { 557 if (IS_ERR(pages)) {
558 ret = PTR_ERR(pages); 558 ret = PTR_ERR(pages);
559 goto out; 559 goto out;
560 } 560 }
561 ret = ceph_copy_user_to_page_vector(pages, data, pos, len); 561 ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
562 if (ret < 0) { 562 if (ret < 0) {
563 ceph_release_page_vector(pages, num_pages); 563 ceph_release_page_vector(pages, num_pages);
564 goto out; 564 goto out;
565 } 565 }
566 566
567 if ((file->f_flags & O_SYNC) == 0) { 567 if ((file->f_flags & O_SYNC) == 0) {
568 /* get a second commit callback */ 568 /* get a second commit callback */
569 req->r_safe_callback = sync_write_commit; 569 req->r_safe_callback = sync_write_commit;
570 req->r_own_pages = 1; 570 req->r_own_pages = 1;
571 } 571 }
572 } 572 }
573 req->r_pages = pages; 573 req->r_pages = pages;
574 req->r_num_pages = num_pages; 574 req->r_num_pages = num_pages;
575 req->r_inode = inode; 575 req->r_inode = inode;
576 576
577 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 577 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
578 if (!ret) { 578 if (!ret) {
579 if (req->r_safe_callback) { 579 if (req->r_safe_callback) {
580 /* 580 /*
581 * Add to inode unsafe list only after we 581 * Add to inode unsafe list only after we
582 * start_request so that a tid has been assigned. 582 * start_request so that a tid has been assigned.
583 */ 583 */
584 spin_lock(&ci->i_unsafe_lock); 584 spin_lock(&ci->i_unsafe_lock);
585 list_add_tail(&req->r_unsafe_item, 585 list_add_tail(&req->r_unsafe_item,
586 &ci->i_unsafe_writes); 586 &ci->i_unsafe_writes);
587 spin_unlock(&ci->i_unsafe_lock); 587 spin_unlock(&ci->i_unsafe_lock);
588 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 588 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
589 } 589 }
590 590
591 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 591 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
592 if (ret < 0 && req->r_safe_callback) { 592 if (ret < 0 && req->r_safe_callback) {
593 spin_lock(&ci->i_unsafe_lock); 593 spin_lock(&ci->i_unsafe_lock);
594 list_del_init(&req->r_unsafe_item); 594 list_del_init(&req->r_unsafe_item);
595 spin_unlock(&ci->i_unsafe_lock); 595 spin_unlock(&ci->i_unsafe_lock);
596 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); 596 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
597 } 597 }
598 } 598 }
599 599
600 if (file->f_flags & O_DIRECT) 600 if (file->f_flags & O_DIRECT)
601 ceph_put_page_vector(pages, num_pages, false); 601 ceph_put_page_vector(pages, num_pages, false);
602 else if (file->f_flags & O_SYNC) 602 else if (file->f_flags & O_SYNC)
603 ceph_release_page_vector(pages, num_pages); 603 ceph_release_page_vector(pages, num_pages);
604 604
605 out: 605 out:
606 ceph_osdc_put_request(req); 606 ceph_osdc_put_request(req);
607 if (ret == 0) { 607 if (ret == 0) {
608 pos += len; 608 pos += len;
609 written += len; 609 written += len;
610 left -= len; 610 left -= len;
611 data += written; 611 data += written;
612 if (left) 612 if (left)
613 goto more; 613 goto more;
614 614
615 ret = written; 615 ret = written;
616 *offset = pos; 616 *offset = pos;
617 if (pos > i_size_read(inode)) 617 if (pos > i_size_read(inode))
618 check_caps = ceph_inode_set_size(inode, pos); 618 check_caps = ceph_inode_set_size(inode, pos);
619 if (check_caps) 619 if (check_caps)
620 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, 620 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
621 NULL); 621 NULL);
622 } 622 }
623 return ret; 623 return ret;
624 } 624 }
625 625
626 /* 626 /*
627 * Wrap generic_file_aio_read with checks for cap bits on the inode. 627 * Wrap generic_file_aio_read with checks for cap bits on the inode.
628 * Atomically grab references, so that those bits are not released 628 * Atomically grab references, so that those bits are not released
629 * back to the MDS mid-read. 629 * back to the MDS mid-read.
630 * 630 *
631 * Hmm, the sync read case isn't actually async... should it be? 631 * Hmm, the sync read case isn't actually async... should it be?
632 */ 632 */
633 static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, 633 static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
634 unsigned long nr_segs, loff_t pos) 634 unsigned long nr_segs, loff_t pos)
635 { 635 {
636 struct file *filp = iocb->ki_filp; 636 struct file *filp = iocb->ki_filp;
637 struct ceph_file_info *fi = filp->private_data; 637 struct ceph_file_info *fi = filp->private_data;
638 loff_t *ppos = &iocb->ki_pos; 638 loff_t *ppos = &iocb->ki_pos;
639 size_t len = iov->iov_len; 639 size_t len = iov->iov_len;
640 struct inode *inode = filp->f_dentry->d_inode; 640 struct inode *inode = filp->f_dentry->d_inode;
641 struct ceph_inode_info *ci = ceph_inode(inode); 641 struct ceph_inode_info *ci = ceph_inode(inode);
642 void __user *base = iov->iov_base; 642 void __user *base = iov->iov_base;
643 ssize_t ret; 643 ssize_t ret;
644 int want, got = 0; 644 int want, got = 0;
645 int checkeof = 0, read = 0; 645 int checkeof = 0, read = 0;
646 646
647 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 647 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
648 inode, ceph_vinop(inode), pos, (unsigned)len, inode); 648 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
649 again: 649 again:
650 __ceph_do_pending_vmtruncate(inode); 650 __ceph_do_pending_vmtruncate(inode);
651 if (fi->fmode & CEPH_FILE_MODE_LAZY) 651 if (fi->fmode & CEPH_FILE_MODE_LAZY)
652 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 652 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
653 else 653 else
654 want = CEPH_CAP_FILE_CACHE; 654 want = CEPH_CAP_FILE_CACHE;
655 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 655 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
656 if (ret < 0) 656 if (ret < 0)
657 goto out; 657 goto out;
658 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 658 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
659 inode, ceph_vinop(inode), pos, (unsigned)len, 659 inode, ceph_vinop(inode), pos, (unsigned)len,
660 ceph_cap_string(got)); 660 ceph_cap_string(got));
661 661
662 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 662 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
663 (iocb->ki_filp->f_flags & O_DIRECT) || 663 (iocb->ki_filp->f_flags & O_DIRECT) ||
664 (inode->i_sb->s_flags & MS_SYNCHRONOUS) || 664 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
665 (fi->flags & CEPH_F_SYNC)) 665 (fi->flags & CEPH_F_SYNC))
666 /* hmm, this isn't really async... */ 666 /* hmm, this isn't really async... */
667 ret = ceph_sync_read(filp, base, len, ppos, &checkeof); 667 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
668 else 668 else
669 ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 669 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
670 670
671 out: 671 out:
672 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 672 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
673 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 673 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
674 ceph_put_cap_refs(ci, got); 674 ceph_put_cap_refs(ci, got);
675 675
676 if (checkeof && ret >= 0) { 676 if (checkeof && ret >= 0) {
677 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 677 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
678 678
679 /* hit EOF or hole? */ 679 /* hit EOF or hole? */
680 if (statret == 0 && *ppos < inode->i_size) { 680 if (statret == 0 && *ppos < inode->i_size) {
681 dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); 681 dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
682 read += ret; 682 read += ret;
683 base += ret; 683 base += ret;
684 len -= ret; 684 len -= ret;
685 checkeof = 0; 685 checkeof = 0;
686 goto again; 686 goto again;
687 } 687 }
688 } 688 }
689 if (ret >= 0) 689 if (ret >= 0)
690 ret += read; 690 ret += read;
691 691
692 return ret; 692 return ret;
693 } 693 }
694 694
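The checkeof/goto-again dance above re-reads after a getattr when a short read landed inside the real i_size (a hole, or a locally stale size). It is loosely analogous to the user-space rule of not treating a single short read(2) as EOF; a hedged sketch:

#include <unistd.h>

/* Keep issuing read(2) until the buffer is full or a zero return
 * confirms EOF; a short positive return just means "try again". */
static ssize_t read_full(int fd, char *buf, size_t len)
{
	size_t done = 0;
	while (done < len) {
		ssize_t n = read(fd, buf + done, len - done);
		if (n < 0)
			return n;	/* error, errno set */
		if (n == 0)
			break;		/* EOF */
		done += n;
	}
	return (ssize_t)done;
}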
695 /* 695 /*
696 * Take cap references to avoid releasing caps to MDS mid-write. 696 * Take cap references to avoid releasing caps to MDS mid-write.
697 * 697 *
698 * If we are synchronous, and write with an old snap context, the OSD 698 * If we are synchronous, and write with an old snap context, the OSD
699 * may return EOLDSNAPC. In that case, retry the write, _after_ 699 * may return EOLDSNAPC. In that case, retry the write, _after_
700 * dropping our cap refs and allowing the pending snap to logically 700 * dropping our cap refs and allowing the pending snap to logically
701 * complete _before_ this write occurs. 701 * complete _before_ this write occurs.
702 * 702 *
703 * If we are near ENOSPC, write synchronously. 703 * If we are near ENOSPC, write synchronously.
704 */ 704 */
705 static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, 705 static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
706 unsigned long nr_segs, loff_t pos) 706 unsigned long nr_segs, loff_t pos)
707 { 707 {
708 struct file *file = iocb->ki_filp; 708 struct file *file = iocb->ki_filp;
709 struct ceph_file_info *fi = file->private_data; 709 struct ceph_file_info *fi = file->private_data;
710 struct inode *inode = file->f_dentry->d_inode; 710 struct inode *inode = file->f_dentry->d_inode;
711 struct ceph_inode_info *ci = ceph_inode(inode); 711 struct ceph_inode_info *ci = ceph_inode(inode);
712 struct ceph_osd_client *osdc = 712 struct ceph_osd_client *osdc =
713 &ceph_sb_to_client(inode->i_sb)->client->osdc; 713 &ceph_sb_to_client(inode->i_sb)->client->osdc;
714 loff_t endoff = pos + iov->iov_len; 714 loff_t endoff = pos + iov->iov_len;
715 int want, got = 0; 715 int want, got = 0;
716 int ret, err; 716 int ret, err;
717 717
718 if (ceph_snap(inode) != CEPH_NOSNAP) 718 if (ceph_snap(inode) != CEPH_NOSNAP)
719 return -EROFS; 719 return -EROFS;
720 720
721 retry_snap: 721 retry_snap:
722 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 722 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
723 return -ENOSPC; 723 return -ENOSPC;
724 __ceph_do_pending_vmtruncate(inode); 724 __ceph_do_pending_vmtruncate(inode);
725 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 725 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
726 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 726 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
727 inode->i_size); 727 inode->i_size);
728 if (fi->fmode & CEPH_FILE_MODE_LAZY) 728 if (fi->fmode & CEPH_FILE_MODE_LAZY)
729 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 729 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
730 else 730 else
731 want = CEPH_CAP_FILE_BUFFER; 731 want = CEPH_CAP_FILE_BUFFER;
732 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); 732 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
733 if (ret < 0) 733 if (ret < 0)
734 goto out_put; 734 goto out_put;
735 735
736 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", 736 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
737 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 737 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
738 ceph_cap_string(got)); 738 ceph_cap_string(got));
739 739
740 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 740 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
741 (iocb->ki_filp->f_flags & O_DIRECT) || 741 (iocb->ki_filp->f_flags & O_DIRECT) ||
742 (inode->i_sb->s_flags & MS_SYNCHRONOUS) || 742 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
743 (fi->flags & CEPH_F_SYNC)) { 743 (fi->flags & CEPH_F_SYNC)) {
744 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, 744 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
745 &iocb->ki_pos); 745 &iocb->ki_pos);
746 } else { 746 } else {
747 /* 747 /*
748 * buffered write; drop Fw early to avoid slow 748 * buffered write; drop Fw early to avoid slow
749 * revocation if we get stuck on balance_dirty_pages 749 * revocation if we get stuck on balance_dirty_pages
750 */ 750 */
751 int dirty; 751 int dirty;
752 752
753 spin_lock(&ci->i_ceph_lock); 753 spin_lock(&ci->i_ceph_lock);
754 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 754 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
755 spin_unlock(&ci->i_ceph_lock); 755 spin_unlock(&ci->i_ceph_lock);
756 ceph_put_cap_refs(ci, got); 756 ceph_put_cap_refs(ci, got);
757 757
758 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 758 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
759 if ((ret >= 0 || ret == -EIOCBQUEUED) && 759 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
760 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 760 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
761 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 761 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
762 err = vfs_fsync_range(file, pos, pos + ret - 1, 1); 762 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
763 if (err < 0) 763 if (err < 0)
764 ret = err; 764 ret = err;
765 } 765 }
766 766
767 if (dirty) 767 if (dirty)
768 __mark_inode_dirty(inode, dirty); 768 __mark_inode_dirty(inode, dirty);
769 goto out; 769 goto out;
770 } 770 }
771 771
772 if (ret >= 0) { 772 if (ret >= 0) {
773 int dirty; 773 int dirty;
774 spin_lock(&ci->i_ceph_lock); 774 spin_lock(&ci->i_ceph_lock);
775 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 775 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
776 spin_unlock(&ci->i_ceph_lock); 776 spin_unlock(&ci->i_ceph_lock);
777 if (dirty) 777 if (dirty)
778 __mark_inode_dirty(inode, dirty); 778 __mark_inode_dirty(inode, dirty);
779 } 779 }
780 780
781 out_put: 781 out_put:
782 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 782 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
783 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 783 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
784 ceph_cap_string(got)); 784 ceph_cap_string(got));
785 ceph_put_cap_refs(ci, got); 785 ceph_put_cap_refs(ci, got);
786 786
787 out: 787 out:
788 if (ret == -EOLDSNAPC) { 788 if (ret == -EOLDSNAPC) {
789 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", 789 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
790 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); 790 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
791 goto retry_snap; 791 goto retry_snap;
792 } 792 }
793 793
794 return ret; 794 return ret;
795 } 795 }
796 796
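When the buffered path above routes O_SYNC (or near-full) writes through vfs_fsync_range(), it is supplying the same guarantee a user-space program requests explicitly; a hedged sketch:

#include <unistd.h>

/* write(2) alone only reaches the page cache; fsync(2) is what turns
 * the data into an on-disk (here: on-OSD) guarantee. */
static int write_durably(int fd, const void *buf, size_t len)
{
	ssize_t n = write(fd, buf, len);
	if (n < 0 || (size_t)n != len)
		return -1;
	return fsync(fd);
}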
797 /* 797 /*
798 * llseek. be sure to verify file size on SEEK_END. 798 * llseek. be sure to verify file size on SEEK_END.
799 */ 799 */
800 static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) 800 static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
801 { 801 {
802 struct inode *inode = file->f_mapping->host; 802 struct inode *inode = file->f_mapping->host;
803 int ret; 803 int ret;
804 804
805 mutex_lock(&inode->i_mutex); 805 mutex_lock(&inode->i_mutex);
806 __ceph_do_pending_vmtruncate(inode); 806 __ceph_do_pending_vmtruncate(inode);
807 807
808 if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { 808 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
810 if (ret < 0) { 810 if (ret < 0) {
811 offset = ret; 811 offset = ret;
812 goto out; 812 goto out;
813 } 813 }
814 } 814 }
815 815
816 switch (origin) { 816 switch (whence) {
817 case SEEK_END: 817 case SEEK_END:
818 offset += inode->i_size; 818 offset += inode->i_size;
819 break; 819 break;
820 case SEEK_CUR: 820 case SEEK_CUR:
821 /* 821 /*
822 * Here we special-case the lseek(fd, 0, SEEK_CUR) 822 * Here we special-case the lseek(fd, 0, SEEK_CUR)
823 * position-querying operation. Avoid rewriting the "same" 823 * position-querying operation. Avoid rewriting the "same"
824 * f_pos value back to the file because a concurrent read(), 824 * f_pos value back to the file because a concurrent read(),
825 * write() or lseek() might have altered it 825 * write() or lseek() might have altered it
826 */ 826 */
827 if (offset == 0) { 827 if (offset == 0) {
828 offset = file->f_pos; 828 offset = file->f_pos;
829 goto out; 829 goto out;
830 } 830 }
831 offset += file->f_pos; 831 offset += file->f_pos;
832 break; 832 break;
833 case SEEK_DATA: 833 case SEEK_DATA:
834 if (offset >= inode->i_size) { 834 if (offset >= inode->i_size) {
835 ret = -ENXIO; 835 ret = -ENXIO;
836 goto out; 836 goto out;
837 } 837 }
838 break; 838 break;
839 case SEEK_HOLE: 839 case SEEK_HOLE:
840 if (offset >= inode->i_size) { 840 if (offset >= inode->i_size) {
841 ret = -ENXIO; 841 ret = -ENXIO;
842 goto out; 842 goto out;
843 } 843 }
844 offset = inode->i_size; 844 offset = inode->i_size;
845 break; 845 break;
846 } 846 }
847 847
848 if (offset < 0 || offset > inode->i_sb->s_maxbytes) { 848 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
849 offset = -EINVAL; 849 offset = -EINVAL;
850 goto out; 850 goto out;
851 } 851 }
852 852
853 /* Special lock needed here? */ 853 /* Special lock needed here? */
854 if (offset != file->f_pos) { 854 if (offset != file->f_pos) {
855 file->f_pos = offset; 855 file->f_pos = offset;
856 file->f_version = 0; 856 file->f_version = 0;
857 } 857 }
858 858
859 out: 859 out:
860 mutex_unlock(&inode->i_mutex); 860 mutex_unlock(&inode->i_mutex);
861 return offset; 861 return offset;
862 } 862 }
863 863
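The zero-offset SEEK_CUR fast path above exists because lseek(fd, 0, SEEK_CUR) is the conventional way to query the current file position without moving it. A hedged user-space sketch (the path is hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/example", O_RDONLY);
	if (fd < 0)
		return 1;
	off_t pos = lseek(fd, 0, SEEK_CUR);	/* query position; no movement */
	off_t end = lseek(fd, 0, SEEK_END);	/* seeking to EOF returns the size */
	printf("pos=%lld size=%lld\n", (long long)pos, (long long)end);
	close(fd);
	return 0;
}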
864 const struct file_operations ceph_file_fops = { 864 const struct file_operations ceph_file_fops = {
865 .open = ceph_open, 865 .open = ceph_open,
866 .release = ceph_release, 866 .release = ceph_release,
867 .llseek = ceph_llseek, 867 .llseek = ceph_llseek,
868 .read = do_sync_read, 868 .read = do_sync_read,
869 .write = do_sync_write, 869 .write = do_sync_write,
870 .aio_read = ceph_aio_read, 870 .aio_read = ceph_aio_read,
871 .aio_write = ceph_aio_write, 871 .aio_write = ceph_aio_write,
872 .mmap = ceph_mmap, 872 .mmap = ceph_mmap,
873 .fsync = ceph_fsync, 873 .fsync = ceph_fsync,
874 .lock = ceph_lock, 874 .lock = ceph_lock,
875 .flock = ceph_flock, 875 .flock = ceph_flock,
876 .splice_read = generic_file_splice_read, 876 .splice_read = generic_file_splice_read,
877 .splice_write = generic_file_splice_write, 877 .splice_write = generic_file_splice_write,
878 .unlocked_ioctl = ceph_ioctl, 878 .unlocked_ioctl = ceph_ioctl,
879 .compat_ioctl = ceph_ioctl, 879 .compat_ioctl = ceph_ioctl,
880 }; 880 };
881 881
882 882
fs/cifs/cifsfs.c
1 /* 1 /*
2 * fs/cifs/cifsfs.c 2 * fs/cifs/cifsfs.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2008
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Common Internet FileSystem (CIFS) client 7 * Common Internet FileSystem (CIFS) client
8 * 8 *
9 * This library is free software; you can redistribute it and/or modify 9 * This library is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as published 10 * it under the terms of the GNU Lesser General Public License as published
11 * by the Free Software Foundation; either version 2.1 of the License, or 11 * by the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version. 12 * (at your option) any later version.
13 * 13 *
14 * This library is distributed in the hope that it will be useful, 14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17 * the GNU Lesser General Public License for more details. 17 * the GNU Lesser General Public License for more details.
18 * 18 *
19 * You should have received a copy of the GNU Lesser General Public License 19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this library; if not, write to the Free Software 20 * along with this library; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */ 22 */
23 23
24 /* Note that BB means BUGBUG (i.e., something to fix eventually) */ 24 /* Note that BB means BUGBUG (i.e., something to fix eventually) */
25 25
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/mount.h> 28 #include <linux/mount.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/init.h> 30 #include <linux/init.h>
31 #include <linux/list.h> 31 #include <linux/list.h>
32 #include <linux/seq_file.h> 32 #include <linux/seq_file.h>
33 #include <linux/vfs.h> 33 #include <linux/vfs.h>
34 #include <linux/mempool.h> 34 #include <linux/mempool.h>
35 #include <linux/delay.h> 35 #include <linux/delay.h>
36 #include <linux/kthread.h> 36 #include <linux/kthread.h>
37 #include <linux/freezer.h> 37 #include <linux/freezer.h>
38 #include <linux/namei.h> 38 #include <linux/namei.h>
39 #include <linux/random.h> 39 #include <linux/random.h>
40 #include <net/ipv6.h> 40 #include <net/ipv6.h>
41 #include "cifsfs.h" 41 #include "cifsfs.h"
42 #include "cifspdu.h" 42 #include "cifspdu.h"
43 #define DECLARE_GLOBALS_HERE 43 #define DECLARE_GLOBALS_HERE
44 #include "cifsglob.h" 44 #include "cifsglob.h"
45 #include "cifsproto.h" 45 #include "cifsproto.h"
46 #include "cifs_debug.h" 46 #include "cifs_debug.h"
47 #include "cifs_fs_sb.h" 47 #include "cifs_fs_sb.h"
48 #include <linux/mm.h> 48 #include <linux/mm.h>
49 #include <linux/key-type.h> 49 #include <linux/key-type.h>
50 #include "cifs_spnego.h" 50 #include "cifs_spnego.h"
51 #include "fscache.h" 51 #include "fscache.h"
52 #ifdef CONFIG_CIFS_SMB2 52 #ifdef CONFIG_CIFS_SMB2
53 #include "smb2pdu.h" 53 #include "smb2pdu.h"
54 #endif 54 #endif
55 55
56 int cifsFYI = 0; 56 int cifsFYI = 0;
57 int cifsERROR = 1; 57 int cifsERROR = 1;
58 int traceSMB = 0; 58 int traceSMB = 0;
59 bool enable_oplocks = true; 59 bool enable_oplocks = true;
60 unsigned int linuxExtEnabled = 1; 60 unsigned int linuxExtEnabled = 1;
61 unsigned int lookupCacheEnabled = 1; 61 unsigned int lookupCacheEnabled = 1;
62 unsigned int global_secflags = CIFSSEC_DEF; 62 unsigned int global_secflags = CIFSSEC_DEF;
63 /* unsigned int ntlmv2_support = 0; */ 63 /* unsigned int ntlmv2_support = 0; */
64 unsigned int sign_CIFS_PDUs = 1; 64 unsigned int sign_CIFS_PDUs = 1;
65 static const struct super_operations cifs_super_ops; 65 static const struct super_operations cifs_super_ops;
66 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 66 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
67 module_param(CIFSMaxBufSize, uint, 0); 67 module_param(CIFSMaxBufSize, uint, 0);
68 MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " 68 MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
69 "Default: 16384 Range: 8192 to 130048"); 69 "Default: 16384 Range: 8192 to 130048");
70 unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; 70 unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
71 module_param(cifs_min_rcv, uint, 0); 71 module_param(cifs_min_rcv, uint, 0);
72 MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " 72 MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
73 "1 to 64"); 73 "1 to 64");
74 unsigned int cifs_min_small = 30; 74 unsigned int cifs_min_small = 30;
75 module_param(cifs_min_small, uint, 0); 75 module_param(cifs_min_small, uint, 0);
76 MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " 76 MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
77 "Range: 2 to 256"); 77 "Range: 2 to 256");
78 unsigned int cifs_max_pending = CIFS_MAX_REQ; 78 unsigned int cifs_max_pending = CIFS_MAX_REQ;
79 module_param(cifs_max_pending, uint, 0444); 79 module_param(cifs_max_pending, uint, 0444);
80 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 80 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
81 "Default: 32767 Range: 2 to 32767."); 81 "Default: 32767 Range: 2 to 32767.");
82 module_param(enable_oplocks, bool, 0644); 82 module_param(enable_oplocks, bool, 0644);
83 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); 83 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
84 84
85 extern mempool_t *cifs_sm_req_poolp; 85 extern mempool_t *cifs_sm_req_poolp;
86 extern mempool_t *cifs_req_poolp; 86 extern mempool_t *cifs_req_poolp;
87 extern mempool_t *cifs_mid_poolp; 87 extern mempool_t *cifs_mid_poolp;
88 88
89 struct workqueue_struct *cifsiod_wq; 89 struct workqueue_struct *cifsiod_wq;
90 90
91 #ifdef CONFIG_CIFS_SMB2 91 #ifdef CONFIG_CIFS_SMB2
92 __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; 92 __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
93 #endif 93 #endif
94 94
95 static int 95 static int
96 cifs_read_super(struct super_block *sb) 96 cifs_read_super(struct super_block *sb)
97 { 97 {
98 struct inode *inode; 98 struct inode *inode;
99 struct cifs_sb_info *cifs_sb; 99 struct cifs_sb_info *cifs_sb;
100 int rc = 0; 100 int rc = 0;
101 101
102 cifs_sb = CIFS_SB(sb); 102 cifs_sb = CIFS_SB(sb);
103 103
104 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) 104 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
105 sb->s_flags |= MS_POSIXACL; 105 sb->s_flags |= MS_POSIXACL;
106 106
107 if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) 107 if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES)
108 sb->s_maxbytes = MAX_LFS_FILESIZE; 108 sb->s_maxbytes = MAX_LFS_FILESIZE;
109 else 109 else
110 sb->s_maxbytes = MAX_NON_LFS; 110 sb->s_maxbytes = MAX_NON_LFS;
111 111
112 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 112 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
113 sb->s_time_gran = 100; 113 sb->s_time_gran = 100;
114 114
115 sb->s_magic = CIFS_MAGIC_NUMBER; 115 sb->s_magic = CIFS_MAGIC_NUMBER;
116 sb->s_op = &cifs_super_ops; 116 sb->s_op = &cifs_super_ops;
117 sb->s_bdi = &cifs_sb->bdi; 117 sb->s_bdi = &cifs_sb->bdi;
118 sb->s_blocksize = CIFS_MAX_MSGSIZE; 118 sb->s_blocksize = CIFS_MAX_MSGSIZE;
119 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 119 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
120 inode = cifs_root_iget(sb); 120 inode = cifs_root_iget(sb);
121 121
122 if (IS_ERR(inode)) { 122 if (IS_ERR(inode)) {
123 rc = PTR_ERR(inode); 123 rc = PTR_ERR(inode);
124 goto out_no_root; 124 goto out_no_root;
125 } 125 }
126 126
127 sb->s_root = d_make_root(inode); 127 sb->s_root = d_make_root(inode);
128 if (!sb->s_root) { 128 if (!sb->s_root) {
129 rc = -ENOMEM; 129 rc = -ENOMEM;
130 goto out_no_root; 130 goto out_no_root;
131 } 131 }
132 132
133 /* do that *after* d_make_root() - we want NULL ->d_op for root here */ 133 /* do that *after* d_make_root() - we want NULL ->d_op for root here */
134 if (cifs_sb_master_tcon(cifs_sb)->nocase) 134 if (cifs_sb_master_tcon(cifs_sb)->nocase)
135 sb->s_d_op = &cifs_ci_dentry_ops; 135 sb->s_d_op = &cifs_ci_dentry_ops;
136 else 136 else
137 sb->s_d_op = &cifs_dentry_ops; 137 sb->s_d_op = &cifs_dentry_ops;
138 138
139 #ifdef CONFIG_CIFS_NFSD_EXPORT 139 #ifdef CONFIG_CIFS_NFSD_EXPORT
140 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 140 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
141 cFYI(1, "export ops supported"); 141 cFYI(1, "export ops supported");
142 sb->s_export_op = &cifs_export_ops; 142 sb->s_export_op = &cifs_export_ops;
143 } 143 }
144 #endif /* CONFIG_CIFS_NFSD_EXPORT */ 144 #endif /* CONFIG_CIFS_NFSD_EXPORT */
145 145
146 return 0; 146 return 0;
147 147
148 out_no_root: 148 out_no_root:
149 cERROR(1, "cifs_read_super: get root inode failed"); 149 cERROR(1, "cifs_read_super: get root inode failed");
150 return rc; 150 return rc;
151 } 151 }
152 152
153 static void cifs_kill_sb(struct super_block *sb) 153 static void cifs_kill_sb(struct super_block *sb)
154 { 154 {
155 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 155 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
156 kill_anon_super(sb); 156 kill_anon_super(sb);
157 cifs_umount(cifs_sb); 157 cifs_umount(cifs_sb);
158 } 158 }
159 159
160 static int 160 static int
161 cifs_statfs(struct dentry *dentry, struct kstatfs *buf) 161 cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
162 { 162 {
163 struct super_block *sb = dentry->d_sb; 163 struct super_block *sb = dentry->d_sb;
164 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 164 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
165 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); 165 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
166 struct TCP_Server_Info *server = tcon->ses->server; 166 struct TCP_Server_Info *server = tcon->ses->server;
167 unsigned int xid; 167 unsigned int xid;
168 int rc = 0; 168 int rc = 0;
169 169
170 xid = get_xid(); 170 xid = get_xid();
171 171
172 /* 172 /*
173 * PATH_MAX may be too long - it would presumably be total path, 173 * PATH_MAX may be too long - it would presumably be total path,
174 * but note that some servers (including Samba 3) have a shorter 174 * but note that some servers (including Samba 3) have a shorter
175 * maximum path. 175 * maximum path.
176 * 176 *
177 * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO. 177 * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO.
178 */ 178 */
179 buf->f_namelen = PATH_MAX; 179 buf->f_namelen = PATH_MAX;
180 buf->f_files = 0; /* undefined */ 180 buf->f_files = 0; /* undefined */
181 buf->f_ffree = 0; /* unlimited */ 181 buf->f_ffree = 0; /* unlimited */
182 182
183 if (server->ops->queryfs) 183 if (server->ops->queryfs)
184 rc = server->ops->queryfs(xid, tcon, buf); 184 rc = server->ops->queryfs(xid, tcon, buf);
185 185
186 free_xid(xid); 186 free_xid(xid);
187 return 0; 187 return 0;
188 } 188 }
189 189
190 static int cifs_permission(struct inode *inode, int mask) 190 static int cifs_permission(struct inode *inode, int mask)
191 { 191 {
192 struct cifs_sb_info *cifs_sb; 192 struct cifs_sb_info *cifs_sb;
193 193
194 cifs_sb = CIFS_SB(inode->i_sb); 194 cifs_sb = CIFS_SB(inode->i_sb);
195 195
196 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { 196 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
197 if ((mask & MAY_EXEC) && !execute_ok(inode)) 197 if ((mask & MAY_EXEC) && !execute_ok(inode))
198 return -EACCES; 198 return -EACCES;
199 else 199 else
200 return 0; 200 return 0;
201 } else /* file mode might have been restricted at mount time 201 } else /* file mode might have been restricted at mount time
202 on the client (above and beyond ACL on servers) for 202 on the client (above and beyond ACL on servers) for
203 servers which do not support setting and viewing mode bits, 203 servers which do not support setting and viewing mode bits,
204 so allowing the client to check permissions is useful */ 204 so allowing the client to check permissions is useful */
205 return generic_permission(inode, mask); 205 return generic_permission(inode, mask);
206 } 206 }
207 207
208 static struct kmem_cache *cifs_inode_cachep; 208 static struct kmem_cache *cifs_inode_cachep;
209 static struct kmem_cache *cifs_req_cachep; 209 static struct kmem_cache *cifs_req_cachep;
210 static struct kmem_cache *cifs_mid_cachep; 210 static struct kmem_cache *cifs_mid_cachep;
211 static struct kmem_cache *cifs_sm_req_cachep; 211 static struct kmem_cache *cifs_sm_req_cachep;
212 mempool_t *cifs_sm_req_poolp; 212 mempool_t *cifs_sm_req_poolp;
213 mempool_t *cifs_req_poolp; 213 mempool_t *cifs_req_poolp;
214 mempool_t *cifs_mid_poolp; 214 mempool_t *cifs_mid_poolp;
215 215
216 static struct inode * 216 static struct inode *
217 cifs_alloc_inode(struct super_block *sb) 217 cifs_alloc_inode(struct super_block *sb)
218 { 218 {
219 struct cifsInodeInfo *cifs_inode; 219 struct cifsInodeInfo *cifs_inode;
220 cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL); 220 cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL);
221 if (!cifs_inode) 221 if (!cifs_inode)
222 return NULL; 222 return NULL;
223 cifs_inode->cifsAttrs = 0x20; /* default */ 223 cifs_inode->cifsAttrs = 0x20; /* default */
224 cifs_inode->time = 0; 224 cifs_inode->time = 0;
225 /* 225 /*
226 * Until the file is open and we have gotten oplock info back from the 226 * Until the file is open and we have gotten oplock info back from the
227 * server, can not assume caching of file data or metadata. 227 * server, can not assume caching of file data or metadata.
228 */ 228 */
229 cifs_set_oplock_level(cifs_inode, 0); 229 cifs_set_oplock_level(cifs_inode, 0);
230 cifs_inode->delete_pending = false; 230 cifs_inode->delete_pending = false;
231 cifs_inode->invalid_mapping = false; 231 cifs_inode->invalid_mapping = false;
232 cifs_inode->leave_pages_clean = false; 232 cifs_inode->leave_pages_clean = false;
233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
234 cifs_inode->server_eof = 0; 234 cifs_inode->server_eof = 0;
235 cifs_inode->uniqueid = 0; 235 cifs_inode->uniqueid = 0;
236 cifs_inode->createtime = 0; 236 cifs_inode->createtime = 0;
237 #ifdef CONFIG_CIFS_SMB2 237 #ifdef CONFIG_CIFS_SMB2
238 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); 238 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE);
239 #endif 239 #endif
240 /* 240 /*
241 * Can not set i_flags here - they get immediately overwritten to zero 241 * Can not set i_flags here - they get immediately overwritten to zero
242 * by the VFS. 242 * by the VFS.
243 */ 243 */
244 /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ 244 /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
245 INIT_LIST_HEAD(&cifs_inode->openFileList); 245 INIT_LIST_HEAD(&cifs_inode->openFileList);
246 INIT_LIST_HEAD(&cifs_inode->llist); 246 INIT_LIST_HEAD(&cifs_inode->llist);
247 return &cifs_inode->vfs_inode; 247 return &cifs_inode->vfs_inode;
248 } 248 }
249 249
250 static void cifs_i_callback(struct rcu_head *head) 250 static void cifs_i_callback(struct rcu_head *head)
251 { 251 {
252 struct inode *inode = container_of(head, struct inode, i_rcu); 252 struct inode *inode = container_of(head, struct inode, i_rcu);
253 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); 253 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
254 } 254 }
255 255
256 static void 256 static void
257 cifs_destroy_inode(struct inode *inode) 257 cifs_destroy_inode(struct inode *inode)
258 { 258 {
259 call_rcu(&inode->i_rcu, cifs_i_callback); 259 call_rcu(&inode->i_rcu, cifs_i_callback);
260 } 260 }
261 261
262 static void 262 static void
263 cifs_evict_inode(struct inode *inode) 263 cifs_evict_inode(struct inode *inode)
264 { 264 {
265 truncate_inode_pages(&inode->i_data, 0); 265 truncate_inode_pages(&inode->i_data, 0);
266 clear_inode(inode); 266 clear_inode(inode);
267 cifs_fscache_release_inode_cookie(inode); 267 cifs_fscache_release_inode_cookie(inode);
268 } 268 }
269 269
270 static void 270 static void
271 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 271 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
272 { 272 {
273 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; 273 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
274 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; 274 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
275 275
276 seq_printf(s, ",addr="); 276 seq_printf(s, ",addr=");
277 277
278 switch (server->dstaddr.ss_family) { 278 switch (server->dstaddr.ss_family) {
279 case AF_INET: 279 case AF_INET:
280 seq_printf(s, "%pI4", &sa->sin_addr.s_addr); 280 seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
281 break; 281 break;
282 case AF_INET6: 282 case AF_INET6:
283 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr); 283 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
284 if (sa6->sin6_scope_id) 284 if (sa6->sin6_scope_id)
285 seq_printf(s, "%%%u", sa6->sin6_scope_id); 285 seq_printf(s, "%%%u", sa6->sin6_scope_id);
286 break; 286 break;
287 default: 287 default:
288 seq_printf(s, "(unknown)"); 288 seq_printf(s, "(unknown)");
289 } 289 }
290 } 290 }
291 291
292 static void 292 static void
293 cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) 293 cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
294 { 294 {
295 seq_printf(s, ",sec="); 295 seq_printf(s, ",sec=");
296 296
297 switch (server->secType) { 297 switch (server->secType) {
298 case LANMAN: 298 case LANMAN:
299 seq_printf(s, "lanman"); 299 seq_printf(s, "lanman");
300 break; 300 break;
301 case NTLMv2: 301 case NTLMv2:
302 seq_printf(s, "ntlmv2"); 302 seq_printf(s, "ntlmv2");
303 break; 303 break;
304 case NTLM: 304 case NTLM:
305 seq_printf(s, "ntlm"); 305 seq_printf(s, "ntlm");
306 break; 306 break;
307 case Kerberos: 307 case Kerberos:
308 seq_printf(s, "krb5"); 308 seq_printf(s, "krb5");
309 break; 309 break;
310 case RawNTLMSSP: 310 case RawNTLMSSP:
311 seq_printf(s, "ntlmssp"); 311 seq_printf(s, "ntlmssp");
312 break; 312 break;
313 default: 313 default:
314 /* shouldn't ever happen */ 314 /* shouldn't ever happen */
315 seq_printf(s, "unknown"); 315 seq_printf(s, "unknown");
316 break; 316 break;
317 } 317 }
318 318
319 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 319 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
320 seq_printf(s, "i"); 320 seq_printf(s, "i");
321 } 321 }
322 322
323 static void 323 static void
324 cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) 324 cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
325 { 325 {
326 seq_printf(s, ",cache="); 326 seq_printf(s, ",cache=");
327 327
328 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) 328 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
329 seq_printf(s, "strict"); 329 seq_printf(s, "strict");
330 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 330 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
331 seq_printf(s, "none"); 331 seq_printf(s, "none");
332 else 332 else
333 seq_printf(s, "loose"); 333 seq_printf(s, "loose");
334 } 334 }
335 335
336 /* 336 /*
337 * cifs_show_options() is for displaying mount options in /proc/mounts. 337 * cifs_show_options() is for displaying mount options in /proc/mounts.
338 * Not all settable options are displayed but most of the important 338 * Not all settable options are displayed but most of the important
339 * ones are. 339 * ones are.
340 */ 340 */
341 static int 341 static int
342 cifs_show_options(struct seq_file *s, struct dentry *root) 342 cifs_show_options(struct seq_file *s, struct dentry *root)
343 { 343 {
344 struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); 344 struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
345 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); 345 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
346 struct sockaddr *srcaddr; 346 struct sockaddr *srcaddr;
347 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; 347 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
348 348
349 seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); 349 seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
350 cifs_show_security(s, tcon->ses->server); 350 cifs_show_security(s, tcon->ses->server);
351 cifs_show_cache_flavor(s, cifs_sb); 351 cifs_show_cache_flavor(s, cifs_sb);
352 352
353 seq_printf(s, ",unc="); 353 seq_printf(s, ",unc=");
354 seq_escape(s, tcon->treeName, " \t\n\\"); 354 seq_escape(s, tcon->treeName, " \t\n\\");
355 355
356 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 356 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
357 seq_printf(s, ",multiuser"); 357 seq_printf(s, ",multiuser");
358 else if (tcon->ses->user_name) 358 else if (tcon->ses->user_name)
359 seq_printf(s, ",username=%s", tcon->ses->user_name); 359 seq_printf(s, ",username=%s", tcon->ses->user_name);
360 360
361 if (tcon->ses->domainName) 361 if (tcon->ses->domainName)
362 seq_printf(s, ",domain=%s", tcon->ses->domainName); 362 seq_printf(s, ",domain=%s", tcon->ses->domainName);
363 363
364 if (srcaddr->sa_family != AF_UNSPEC) { 364 if (srcaddr->sa_family != AF_UNSPEC) {
365 struct sockaddr_in *saddr4; 365 struct sockaddr_in *saddr4;
366 struct sockaddr_in6 *saddr6; 366 struct sockaddr_in6 *saddr6;
367 saddr4 = (struct sockaddr_in *)srcaddr; 367 saddr4 = (struct sockaddr_in *)srcaddr;
368 saddr6 = (struct sockaddr_in6 *)srcaddr; 368 saddr6 = (struct sockaddr_in6 *)srcaddr;
369 if (srcaddr->sa_family == AF_INET6) 369 if (srcaddr->sa_family == AF_INET6)
370 seq_printf(s, ",srcaddr=%pI6c", 370 seq_printf(s, ",srcaddr=%pI6c",
371 &saddr6->sin6_addr); 371 &saddr6->sin6_addr);
372 else if (srcaddr->sa_family == AF_INET) 372 else if (srcaddr->sa_family == AF_INET)
373 seq_printf(s, ",srcaddr=%pI4", 373 seq_printf(s, ",srcaddr=%pI4",
374 &saddr4->sin_addr.s_addr); 374 &saddr4->sin_addr.s_addr);
375 else 375 else
376 seq_printf(s, ",srcaddr=BAD-AF:%i", 376 seq_printf(s, ",srcaddr=BAD-AF:%i",
377 (int)(srcaddr->sa_family)); 377 (int)(srcaddr->sa_family));
378 } 378 }
379 379
380 seq_printf(s, ",uid=%u", cifs_sb->mnt_uid); 380 seq_printf(s, ",uid=%u", cifs_sb->mnt_uid);
381 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 381 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
382 seq_printf(s, ",forceuid"); 382 seq_printf(s, ",forceuid");
383 else 383 else
384 seq_printf(s, ",noforceuid"); 384 seq_printf(s, ",noforceuid");
385 385
386 seq_printf(s, ",gid=%u", cifs_sb->mnt_gid); 386 seq_printf(s, ",gid=%u", cifs_sb->mnt_gid);
387 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) 387 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
388 seq_printf(s, ",forcegid"); 388 seq_printf(s, ",forcegid");
389 else 389 else
390 seq_printf(s, ",noforcegid"); 390 seq_printf(s, ",noforcegid");
391 391
392 cifs_show_address(s, tcon->ses->server); 392 cifs_show_address(s, tcon->ses->server);
393 393
394 if (!tcon->unix_ext) 394 if (!tcon->unix_ext)
395 seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho", 395 seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho",
396 cifs_sb->mnt_file_mode, 396 cifs_sb->mnt_file_mode,
397 cifs_sb->mnt_dir_mode); 397 cifs_sb->mnt_dir_mode);
398 if (tcon->seal) 398 if (tcon->seal)
399 seq_printf(s, ",seal"); 399 seq_printf(s, ",seal");
400 if (tcon->nocase) 400 if (tcon->nocase)
401 seq_printf(s, ",nocase"); 401 seq_printf(s, ",nocase");
402 if (tcon->retry) 402 if (tcon->retry)
403 seq_printf(s, ",hard"); 403 seq_printf(s, ",hard");
404 if (tcon->unix_ext) 404 if (tcon->unix_ext)
405 seq_printf(s, ",unix"); 405 seq_printf(s, ",unix");
406 else 406 else
407 seq_printf(s, ",nounix"); 407 seq_printf(s, ",nounix");
408 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) 408 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
409 seq_printf(s, ",posixpaths"); 409 seq_printf(s, ",posixpaths");
410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) 410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
411 seq_printf(s, ",setuids"); 411 seq_printf(s, ",setuids");
412 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) 412 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
413 seq_printf(s, ",serverino"); 413 seq_printf(s, ",serverino");
414 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 414 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
415 seq_printf(s, ",rwpidforward"); 415 seq_printf(s, ",rwpidforward");
416 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) 416 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
417 seq_printf(s, ",forcemand"); 417 seq_printf(s, ",forcemand");
418 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 418 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
419 seq_printf(s, ",nouser_xattr"); 419 seq_printf(s, ",nouser_xattr");
420 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) 420 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
421 seq_printf(s, ",mapchars"); 421 seq_printf(s, ",mapchars");
422 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 422 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
423 seq_printf(s, ",sfu"); 423 seq_printf(s, ",sfu");
424 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 424 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
425 seq_printf(s, ",nobrl"); 425 seq_printf(s, ",nobrl");
426 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 426 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
427 seq_printf(s, ",cifsacl"); 427 seq_printf(s, ",cifsacl");
428 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) 428 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
429 seq_printf(s, ",dynperm"); 429 seq_printf(s, ",dynperm");
430 if (root->d_sb->s_flags & MS_POSIXACL) 430 if (root->d_sb->s_flags & MS_POSIXACL)
431 seq_printf(s, ",acl"); 431 seq_printf(s, ",acl");
432 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 432 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
433 seq_printf(s, ",mfsymlinks"); 433 seq_printf(s, ",mfsymlinks");
434 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) 434 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
435 seq_printf(s, ",fsc"); 435 seq_printf(s, ",fsc");
436 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) 436 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
437 seq_printf(s, ",nostrictsync"); 437 seq_printf(s, ",nostrictsync");
438 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) 438 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
439 seq_printf(s, ",noperm"); 439 seq_printf(s, ",noperm");
440 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) 440 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
441 seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid); 441 seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid);
442 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) 442 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
443 seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid); 443 seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid);
444 444
445 seq_printf(s, ",rsize=%u", cifs_sb->rsize); 445 seq_printf(s, ",rsize=%u", cifs_sb->rsize);
446 seq_printf(s, ",wsize=%u", cifs_sb->wsize); 446 seq_printf(s, ",wsize=%u", cifs_sb->wsize);
447 /* convert actimeo and display it in seconds */ 447 /* convert actimeo and display it in seconds */
448 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); 448 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
449 449
450 return 0; 450 return 0;
451 } 451 }
452 452
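Put together, a cifs entry in /proc/mounts produced by cifs_show_options() might look roughly like the following (a hypothetical, abridged example; note that seq_escape() above renders each backslash in unc= as the octal escape \134):

//server/share /mnt/cifs cifs rw,vers=1.0,sec=ntlmssp,cache=strict,unc=\134\134server\134share,username=guest,uid=0,noforceuid,gid=0,noforcegid,addr=192.168.0.10,file_mode=0644,dir_mode=0755,nounix,serverino,rsize=61440,wsize=65536,actimeo=1 0 0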
453 static void cifs_umount_begin(struct super_block *sb) 453 static void cifs_umount_begin(struct super_block *sb)
454 { 454 {
455 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 455 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
456 struct cifs_tcon *tcon; 456 struct cifs_tcon *tcon;
457 457
458 if (cifs_sb == NULL) 458 if (cifs_sb == NULL)
459 return; 459 return;
460 460
461 tcon = cifs_sb_master_tcon(cifs_sb); 461 tcon = cifs_sb_master_tcon(cifs_sb);
462 462
463 spin_lock(&cifs_tcp_ses_lock); 463 spin_lock(&cifs_tcp_ses_lock);
464 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { 464 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
465 /* we have other mounts to the same share, or we have 465 /* we have other mounts to the same share, or we have
466 already tried to force this umount and woken up 466 already tried to force this umount and woken up
467 all waiting network requests; nothing to do */ 467 all waiting network requests; nothing to do */
468 spin_unlock(&cifs_tcp_ses_lock); 468 spin_unlock(&cifs_tcp_ses_lock);
469 return; 469 return;
470 } else if (tcon->tc_count == 1) 470 } else if (tcon->tc_count == 1)
471 tcon->tidStatus = CifsExiting; 471 tcon->tidStatus = CifsExiting;
472 spin_unlock(&cifs_tcp_ses_lock); 472 spin_unlock(&cifs_tcp_ses_lock);
473 473
474 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 474 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
475 /* cancel_notify_requests(tcon); */ 475 /* cancel_notify_requests(tcon); */
476 if (tcon->ses && tcon->ses->server) { 476 if (tcon->ses && tcon->ses->server) {
477 cFYI(1, "wake up tasks now - umount begin not complete"); 477 cFYI(1, "wake up tasks now - umount begin not complete");
478 wake_up_all(&tcon->ses->server->request_q); 478 wake_up_all(&tcon->ses->server->request_q);
479 wake_up_all(&tcon->ses->server->response_q); 479 wake_up_all(&tcon->ses->server->response_q);
480 msleep(1); /* yield */ 480 msleep(1); /* yield */
481 /* we have to kick the requests once more */ 481 /* we have to kick the requests once more */
482 wake_up_all(&tcon->ses->server->response_q); 482 wake_up_all(&tcon->ses->server->response_q);
483 msleep(1); 483 msleep(1);
484 } 484 }
485 485
486 return; 486 return;
487 } 487 }
488 488
489 #ifdef CONFIG_CIFS_STATS2 489 #ifdef CONFIG_CIFS_STATS2
490 static int cifs_show_stats(struct seq_file *s, struct dentry *root) 490 static int cifs_show_stats(struct seq_file *s, struct dentry *root)
491 { 491 {
492 /* BB FIXME */ 492 /* BB FIXME */
493 return 0; 493 return 0;
494 } 494 }
495 #endif 495 #endif
496 496
497 static int cifs_remount(struct super_block *sb, int *flags, char *data) 497 static int cifs_remount(struct super_block *sb, int *flags, char *data)
498 { 498 {
499 *flags |= MS_NODIRATIME; 499 *flags |= MS_NODIRATIME;
500 return 0; 500 return 0;
501 } 501 }
502 502
503 static int cifs_drop_inode(struct inode *inode) 503 static int cifs_drop_inode(struct inode *inode)
504 { 504 {
505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
506 506
507 /* no serverino => unconditional eviction */ 507 /* no serverino => unconditional eviction */
508 return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || 508 return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
509 generic_drop_inode(inode); 509 generic_drop_inode(inode);
510 } 510 }
511 511
512 static const struct super_operations cifs_super_ops = { 512 static const struct super_operations cifs_super_ops = {
513 .statfs = cifs_statfs, 513 .statfs = cifs_statfs,
514 .alloc_inode = cifs_alloc_inode, 514 .alloc_inode = cifs_alloc_inode,
515 .destroy_inode = cifs_destroy_inode, 515 .destroy_inode = cifs_destroy_inode,
516 .drop_inode = cifs_drop_inode, 516 .drop_inode = cifs_drop_inode,
517 .evict_inode = cifs_evict_inode, 517 .evict_inode = cifs_evict_inode,
518 /* .delete_inode = cifs_delete_inode, */ /* Not needed unless 518 /* .delete_inode = cifs_delete_inode, */ /* Not needed unless
519 we later add lazy close of inodes, or unless the kernel forgets 519 we later add lazy close of inodes, or unless the kernel forgets
520 to call us with the same number of releases (closes) 520 to call us with the same number of releases (closes)
521 as opens */ 521 as opens */
522 .show_options = cifs_show_options, 522 .show_options = cifs_show_options,
523 .umount_begin = cifs_umount_begin, 523 .umount_begin = cifs_umount_begin,
524 .remount_fs = cifs_remount, 524 .remount_fs = cifs_remount,
525 #ifdef CONFIG_CIFS_STATS2 525 #ifdef CONFIG_CIFS_STATS2
526 .show_stats = cifs_show_stats, 526 .show_stats = cifs_show_stats,
527 #endif 527 #endif
528 }; 528 };
529 529
530 /* 530 /*
531 * Get root dentry from superblock according to prefix path mount option. 531 * Get root dentry from superblock according to prefix path mount option.
532 * Return dentry with refcount + 1 on success and an ERR_PTR otherwise. 532 * Return dentry with refcount + 1 on success and an ERR_PTR otherwise.
533 */ 533 */
534 static struct dentry * 534 static struct dentry *
535 cifs_get_root(struct smb_vol *vol, struct super_block *sb) 535 cifs_get_root(struct smb_vol *vol, struct super_block *sb)
536 { 536 {
537 struct dentry *dentry; 537 struct dentry *dentry;
538 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 538 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
539 char *full_path = NULL; 539 char *full_path = NULL;
540 char *s, *p; 540 char *s, *p;
541 char sep; 541 char sep;
542 542
543 full_path = cifs_build_path_to_root(vol, cifs_sb, 543 full_path = cifs_build_path_to_root(vol, cifs_sb,
544 cifs_sb_master_tcon(cifs_sb)); 544 cifs_sb_master_tcon(cifs_sb));
545 if (full_path == NULL) 545 if (full_path == NULL)
546 return ERR_PTR(-ENOMEM); 546 return ERR_PTR(-ENOMEM);
547 547
548 cFYI(1, "Get root dentry for %s", full_path); 548 cFYI(1, "Get root dentry for %s", full_path);
549 549
550 sep = CIFS_DIR_SEP(cifs_sb); 550 sep = CIFS_DIR_SEP(cifs_sb);
551 dentry = dget(sb->s_root); 551 dentry = dget(sb->s_root);
552 p = s = full_path; 552 p = s = full_path;
553 553
554 do { 554 do {
555 struct inode *dir = dentry->d_inode; 555 struct inode *dir = dentry->d_inode;
556 struct dentry *child; 556 struct dentry *child;
557 557
558 if (!dir) { 558 if (!dir) {
559 dput(dentry); 559 dput(dentry);
560 dentry = ERR_PTR(-ENOENT); 560 dentry = ERR_PTR(-ENOENT);
561 break; 561 break;
562 } 562 }
563 563
564 /* skip separators */ 564 /* skip separators */
565 while (*s == sep) 565 while (*s == sep)
566 s++; 566 s++;
567 if (!*s) 567 if (!*s)
568 break; 568 break;
569 p = s++; 569 p = s++;
570 /* next separator */ 570 /* next separator */
571 while (*s && *s != sep) 571 while (*s && *s != sep)
572 s++; 572 s++;
573 573
574 mutex_lock(&dir->i_mutex); 574 mutex_lock(&dir->i_mutex);
575 child = lookup_one_len(p, dentry, s - p); 575 child = lookup_one_len(p, dentry, s - p);
576 mutex_unlock(&dir->i_mutex); 576 mutex_unlock(&dir->i_mutex);
577 dput(dentry); 577 dput(dentry);
578 dentry = child; 578 dentry = child;
579 } while (!IS_ERR(dentry)); 579 } while (!IS_ERR(dentry));
580 kfree(full_path); 580 kfree(full_path);
581 return dentry; 581 return dentry;
582 } 582 }
583 583
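The loop above is a classic two-pointer component walk: skip separators, then scan ahead to the next one, so [p, s) bounds the name handed to lookup_one_len(). A self-contained userspace sketch of the same technique (not part of this commit; the prefix path is made up):

#include <stdio.h>

int main(void)
{
	char path[] = "\\dir1\\dir2\\leaf";	/* hypothetical prefix path */
	char sep = '\\';
	char *p, *s = path;

	for (;;) {
		/* skip separators */
		while (*s == sep)
			s++;
		if (!*s)
			break;
		p = s++;
		/* scan to the next separator */
		while (*s && *s != sep)
			s++;
		printf("component: %.*s\n", (int)(s - p), p);
	}
	return 0;
}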
584 static int cifs_set_super(struct super_block *sb, void *data) 584 static int cifs_set_super(struct super_block *sb, void *data)
585 { 585 {
586 struct cifs_mnt_data *mnt_data = data; 586 struct cifs_mnt_data *mnt_data = data;
587 sb->s_fs_info = mnt_data->cifs_sb; 587 sb->s_fs_info = mnt_data->cifs_sb;
588 return set_anon_super(sb, NULL); 588 return set_anon_super(sb, NULL);
589 } 589 }
590 590
591 static struct dentry * 591 static struct dentry *
592 cifs_do_mount(struct file_system_type *fs_type, 592 cifs_do_mount(struct file_system_type *fs_type,
593 int flags, const char *dev_name, void *data) 593 int flags, const char *dev_name, void *data)
594 { 594 {
595 int rc; 595 int rc;
596 struct super_block *sb; 596 struct super_block *sb;
597 struct cifs_sb_info *cifs_sb; 597 struct cifs_sb_info *cifs_sb;
598 struct smb_vol *volume_info; 598 struct smb_vol *volume_info;
599 struct cifs_mnt_data mnt_data; 599 struct cifs_mnt_data mnt_data;
600 struct dentry *root; 600 struct dentry *root;
601 601
602 cFYI(1, "Devname: %s flags: %d ", dev_name, flags); 602 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
603 603
604 volume_info = cifs_get_volume_info((char *)data, dev_name); 604 volume_info = cifs_get_volume_info((char *)data, dev_name);
605 if (IS_ERR(volume_info)) 605 if (IS_ERR(volume_info))
606 return ERR_CAST(volume_info); 606 return ERR_CAST(volume_info);
607 607
608 cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); 608 cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
609 if (cifs_sb == NULL) { 609 if (cifs_sb == NULL) {
610 root = ERR_PTR(-ENOMEM); 610 root = ERR_PTR(-ENOMEM);
611 goto out_nls; 611 goto out_nls;
612 } 612 }
613 613
614 cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); 614 cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
615 if (cifs_sb->mountdata == NULL) { 615 if (cifs_sb->mountdata == NULL) {
616 root = ERR_PTR(-ENOMEM); 616 root = ERR_PTR(-ENOMEM);
617 goto out_cifs_sb; 617 goto out_cifs_sb;
618 } 618 }
619 619
620 cifs_setup_cifs_sb(volume_info, cifs_sb); 620 cifs_setup_cifs_sb(volume_info, cifs_sb);
621 621
622 rc = cifs_mount(cifs_sb, volume_info); 622 rc = cifs_mount(cifs_sb, volume_info);
623 if (rc) { 623 if (rc) {
624 if (!(flags & MS_SILENT)) 624 if (!(flags & MS_SILENT))
625 cERROR(1, "cifs_mount failed w/return code = %d", rc); 625 cERROR(1, "cifs_mount failed w/return code = %d", rc);
626 root = ERR_PTR(rc); 626 root = ERR_PTR(rc);
627 goto out_mountdata; 627 goto out_mountdata;
628 } 628 }
629 629
630 mnt_data.vol = volume_info; 630 mnt_data.vol = volume_info;
631 mnt_data.cifs_sb = cifs_sb; 631 mnt_data.cifs_sb = cifs_sb;
632 mnt_data.flags = flags; 632 mnt_data.flags = flags;
633 633
634 /* BB should we make this contingent on mount parm? */ 634 /* BB should we make this contingent on mount parm? */
635 flags |= MS_NODIRATIME | MS_NOATIME; 635 flags |= MS_NODIRATIME | MS_NOATIME;
636 636
637 sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); 637 sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
638 if (IS_ERR(sb)) { 638 if (IS_ERR(sb)) {
639 root = ERR_CAST(sb); 639 root = ERR_CAST(sb);
640 cifs_umount(cifs_sb); 640 cifs_umount(cifs_sb);
641 goto out; 641 goto out;
642 } 642 }
643 643
644 if (sb->s_root) { 644 if (sb->s_root) {
645 cFYI(1, "Use existing superblock"); 645 cFYI(1, "Use existing superblock");
646 cifs_umount(cifs_sb); 646 cifs_umount(cifs_sb);
647 } else { 647 } else {
648 rc = cifs_read_super(sb); 648 rc = cifs_read_super(sb);
649 if (rc) { 649 if (rc) {
650 root = ERR_PTR(rc); 650 root = ERR_PTR(rc);
651 goto out_super; 651 goto out_super;
652 } 652 }
653 653
654 sb->s_flags |= MS_ACTIVE; 654 sb->s_flags |= MS_ACTIVE;
655 } 655 }
656 656
657 root = cifs_get_root(volume_info, sb); 657 root = cifs_get_root(volume_info, sb);
658 if (IS_ERR(root)) 658 if (IS_ERR(root))
659 goto out_super; 659 goto out_super;
660 660
661 cFYI(1, "dentry root is: %p", root); 661 cFYI(1, "dentry root is: %p", root);
662 goto out; 662 goto out;
663 663
664 out_super: 664 out_super:
665 deactivate_locked_super(sb); 665 deactivate_locked_super(sb);
666 out: 666 out:
667 cifs_cleanup_volume_info(volume_info); 667 cifs_cleanup_volume_info(volume_info);
668 return root; 668 return root;
669 669
670 out_mountdata: 670 out_mountdata:
671 kfree(cifs_sb->mountdata); 671 kfree(cifs_sb->mountdata);
672 out_cifs_sb: 672 out_cifs_sb:
673 kfree(cifs_sb); 673 kfree(cifs_sb);
674 out_nls: 674 out_nls:
675 unload_nls(volume_info->local_nls); 675 unload_nls(volume_info->local_nls);
676 goto out; 676 goto out;
677 } 677 }
678 678
679 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 679 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
680 unsigned long nr_segs, loff_t pos) 680 unsigned long nr_segs, loff_t pos)
681 { 681 {
682 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 682 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
683 ssize_t written; 683 ssize_t written;
684 int rc; 684 int rc;
685 685
686 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 686 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
687 687
688 if (CIFS_I(inode)->clientCanCacheAll) 688 if (CIFS_I(inode)->clientCanCacheAll)
689 return written; 689 return written;
690 690
691 rc = filemap_fdatawrite(inode->i_mapping); 691 rc = filemap_fdatawrite(inode->i_mapping);
692 if (rc) 692 if (rc)
693 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode); 693 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
694 694
695 return written; 695 return written;
696 } 696 }
697 697
698 static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) 698 static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
699 { 699 {
700 /* 700 /*
701 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 701 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
702 * the cached file length 702 * the cached file length
703 */ 703 */
704 if (origin != SEEK_SET && origin != SEEK_CUR) { 704 if (whence != SEEK_SET && whence != SEEK_CUR) {
705 int rc; 705 int rc;
706 struct inode *inode = file->f_path.dentry->d_inode; 706 struct inode *inode = file->f_path.dentry->d_inode;
707 707
708 /* 708 /*
709 * We need to be sure that all dirty pages are written and the 709 * We need to be sure that all dirty pages are written and the
710 * server has the newest file length. 710 * server has the newest file length.
711 */ 711 */
712 if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && 712 if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
713 inode->i_mapping->nrpages != 0) { 713 inode->i_mapping->nrpages != 0) {
714 rc = filemap_fdatawait(inode->i_mapping); 714 rc = filemap_fdatawait(inode->i_mapping);
715 if (rc) { 715 if (rc) {
716 mapping_set_error(inode->i_mapping, rc); 716 mapping_set_error(inode->i_mapping, rc);
717 return rc; 717 return rc;
718 } 718 }
719 } 719 }
720 /* 720 /*
721 * Some applications poll for the file length in this strange 721 * Some applications poll for the file length in this strange
722 * way so we must seek to end on non-oplocked files by 722 * way so we must seek to end on non-oplocked files by
723 * setting the revalidate time to zero. 723 * setting the revalidate time to zero.
724 */ 724 */
725 CIFS_I(inode)->time = 0; 725 CIFS_I(inode)->time = 0;
726 726
727 rc = cifs_revalidate_file_attr(file); 727 rc = cifs_revalidate_file_attr(file);
728 if (rc < 0) 728 if (rc < 0)
729 return (loff_t)rc; 729 return (loff_t)rc;
730 } 730 }
731 return generic_file_llseek(file, offset, origin); 731 return generic_file_llseek(file, offset, whence);
732 } 732 }
733 733
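The "strange" length polling the comment mentions is simply a seek to the end of the file. A minimal userspace sketch (not part of this commit; the path is hypothetical) that makes the VFS call cifs_llseek() with whence == SEEK_END, triggering the revalidation above:

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/cifs/log.txt", O_RDONLY);
	if (fd < 0)
		return 1;
	/* seeking to the end is a cheap way to poll a growing file's size */
	off_t end = lseek(fd, 0, SEEK_END);
	printf("current size: %lld bytes\n", (long long)end);
	close(fd);
	return 0;
}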
734 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 734 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
735 { 735 {
736 /* note that this is called by vfs setlease with lock_flocks held 736 /* note that this is called by vfs setlease with lock_flocks held
737 to protect *lease from going away */ 737 to protect *lease from going away */
738 struct inode *inode = file->f_path.dentry->d_inode; 738 struct inode *inode = file->f_path.dentry->d_inode;
739 struct cifsFileInfo *cfile = file->private_data; 739 struct cifsFileInfo *cfile = file->private_data;
740 740
741 if (!(S_ISREG(inode->i_mode))) 741 if (!(S_ISREG(inode->i_mode)))
742 return -EINVAL; 742 return -EINVAL;
743 743
744 /* check if file is oplocked */ 744 /* check if file is oplocked */
745 if (((arg == F_RDLCK) && 745 if (((arg == F_RDLCK) &&
746 (CIFS_I(inode)->clientCanCacheRead)) || 746 (CIFS_I(inode)->clientCanCacheRead)) ||
747 ((arg == F_WRLCK) && 747 ((arg == F_WRLCK) &&
748 (CIFS_I(inode)->clientCanCacheAll))) 748 (CIFS_I(inode)->clientCanCacheAll)))
749 return generic_setlease(file, arg, lease); 749 return generic_setlease(file, arg, lease);
750 else if (tlink_tcon(cfile->tlink)->local_lease && 750 else if (tlink_tcon(cfile->tlink)->local_lease &&
751 !CIFS_I(inode)->clientCanCacheRead) 751 !CIFS_I(inode)->clientCanCacheRead)
752 /* If the server claims to support oplocks on this 752 /* If the server claims to support oplocks on this
753 file, then we still need to check for an oplock even 753 file, then we still need to check for an oplock even
754 when the local_lease mount option is set. But there 754 when the local_lease mount option is set. But there
755 are servers which do not support oplocks, for which 755 are servers which do not support oplocks, for which
756 this mount option may be useful if the user 756 this mount option may be useful if the user
757 knows that the file won't be changed on the server 757 knows that the file won't be changed on the server
758 by anyone else */ 758 by anyone else */
759 return generic_setlease(file, arg, lease); 759 return generic_setlease(file, arg, lease);
760 else 760 else
761 return -EAGAIN; 761 return -EAGAIN;
762 } 762 }
763 763
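cifs_setlease() is reached via fcntl(F_SETLEASE); cifs only grants the lease when a matching oplock already backs it, otherwise -EAGAIN is returned as above. A minimal userspace sketch (not part of this commit; the path is hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/cifs/file.txt", O_RDONLY);
	if (fd < 0)
		return 1;
	/* ends up in cifs_setlease(file, F_RDLCK, ...) */
	if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1)
		perror("F_SETLEASE");	/* EAGAIN: no read oplock held */
	else
		puts("read lease granted");
	close(fd);
	return 0;
}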
764 struct file_system_type cifs_fs_type = { 764 struct file_system_type cifs_fs_type = {
765 .owner = THIS_MODULE, 765 .owner = THIS_MODULE,
766 .name = "cifs", 766 .name = "cifs",
767 .mount = cifs_do_mount, 767 .mount = cifs_do_mount,
768 .kill_sb = cifs_kill_sb, 768 .kill_sb = cifs_kill_sb,
769 /* .fs_flags */ 769 /* .fs_flags */
770 }; 770 };
771 const struct inode_operations cifs_dir_inode_ops = { 771 const struct inode_operations cifs_dir_inode_ops = {
772 .create = cifs_create, 772 .create = cifs_create,
773 .atomic_open = cifs_atomic_open, 773 .atomic_open = cifs_atomic_open,
774 .lookup = cifs_lookup, 774 .lookup = cifs_lookup,
775 .getattr = cifs_getattr, 775 .getattr = cifs_getattr,
776 .unlink = cifs_unlink, 776 .unlink = cifs_unlink,
777 .link = cifs_hardlink, 777 .link = cifs_hardlink,
778 .mkdir = cifs_mkdir, 778 .mkdir = cifs_mkdir,
779 .rmdir = cifs_rmdir, 779 .rmdir = cifs_rmdir,
780 .rename = cifs_rename, 780 .rename = cifs_rename,
781 .permission = cifs_permission, 781 .permission = cifs_permission,
782 /* revalidate:cifs_revalidate, */ 782 /* revalidate:cifs_revalidate, */
783 .setattr = cifs_setattr, 783 .setattr = cifs_setattr,
784 .symlink = cifs_symlink, 784 .symlink = cifs_symlink,
785 .mknod = cifs_mknod, 785 .mknod = cifs_mknod,
786 #ifdef CONFIG_CIFS_XATTR 786 #ifdef CONFIG_CIFS_XATTR
787 .setxattr = cifs_setxattr, 787 .setxattr = cifs_setxattr,
788 .getxattr = cifs_getxattr, 788 .getxattr = cifs_getxattr,
789 .listxattr = cifs_listxattr, 789 .listxattr = cifs_listxattr,
790 .removexattr = cifs_removexattr, 790 .removexattr = cifs_removexattr,
791 #endif 791 #endif
792 }; 792 };
793 793
794 const struct inode_operations cifs_file_inode_ops = { 794 const struct inode_operations cifs_file_inode_ops = {
795 /* revalidate:cifs_revalidate, */ 795 /* revalidate:cifs_revalidate, */
796 .setattr = cifs_setattr, 796 .setattr = cifs_setattr,
797 .getattr = cifs_getattr, /* do we need this anymore? */ 797 .getattr = cifs_getattr, /* do we need this anymore? */
798 .rename = cifs_rename, 798 .rename = cifs_rename,
799 .permission = cifs_permission, 799 .permission = cifs_permission,
800 #ifdef CONFIG_CIFS_XATTR 800 #ifdef CONFIG_CIFS_XATTR
801 .setxattr = cifs_setxattr, 801 .setxattr = cifs_setxattr,
802 .getxattr = cifs_getxattr, 802 .getxattr = cifs_getxattr,
803 .listxattr = cifs_listxattr, 803 .listxattr = cifs_listxattr,
804 .removexattr = cifs_removexattr, 804 .removexattr = cifs_removexattr,
805 #endif 805 #endif
806 }; 806 };
807 807
808 const struct inode_operations cifs_symlink_inode_ops = { 808 const struct inode_operations cifs_symlink_inode_ops = {
809 .readlink = generic_readlink, 809 .readlink = generic_readlink,
810 .follow_link = cifs_follow_link, 810 .follow_link = cifs_follow_link,
811 .put_link = cifs_put_link, 811 .put_link = cifs_put_link,
812 .permission = cifs_permission, 812 .permission = cifs_permission,
813 /* BB add the following two eventually */ 813 /* BB add the following two eventually */
814 /* revalidate: cifs_revalidate, 814 /* revalidate: cifs_revalidate,
815 setattr: cifs_notify_change, *//* BB do we need notify change */ 815 setattr: cifs_notify_change, *//* BB do we need notify change */
816 #ifdef CONFIG_CIFS_XATTR 816 #ifdef CONFIG_CIFS_XATTR
817 .setxattr = cifs_setxattr, 817 .setxattr = cifs_setxattr,
818 .getxattr = cifs_getxattr, 818 .getxattr = cifs_getxattr,
819 .listxattr = cifs_listxattr, 819 .listxattr = cifs_listxattr,
820 .removexattr = cifs_removexattr, 820 .removexattr = cifs_removexattr,
821 #endif 821 #endif
822 }; 822 };
823 823
824 const struct file_operations cifs_file_ops = { 824 const struct file_operations cifs_file_ops = {
825 .read = do_sync_read, 825 .read = do_sync_read,
826 .write = do_sync_write, 826 .write = do_sync_write,
827 .aio_read = generic_file_aio_read, 827 .aio_read = generic_file_aio_read,
828 .aio_write = cifs_file_aio_write, 828 .aio_write = cifs_file_aio_write,
829 .open = cifs_open, 829 .open = cifs_open,
830 .release = cifs_close, 830 .release = cifs_close,
831 .lock = cifs_lock, 831 .lock = cifs_lock,
832 .fsync = cifs_fsync, 832 .fsync = cifs_fsync,
833 .flush = cifs_flush, 833 .flush = cifs_flush,
834 .mmap = cifs_file_mmap, 834 .mmap = cifs_file_mmap,
835 .splice_read = generic_file_splice_read, 835 .splice_read = generic_file_splice_read,
836 .llseek = cifs_llseek, 836 .llseek = cifs_llseek,
837 #ifdef CONFIG_CIFS_POSIX 837 #ifdef CONFIG_CIFS_POSIX
838 .unlocked_ioctl = cifs_ioctl, 838 .unlocked_ioctl = cifs_ioctl,
839 #endif /* CONFIG_CIFS_POSIX */ 839 #endif /* CONFIG_CIFS_POSIX */
840 .setlease = cifs_setlease, 840 .setlease = cifs_setlease,
841 }; 841 };
842 842
843 const struct file_operations cifs_file_strict_ops = { 843 const struct file_operations cifs_file_strict_ops = {
844 .read = do_sync_read, 844 .read = do_sync_read,
845 .write = do_sync_write, 845 .write = do_sync_write,
846 .aio_read = cifs_strict_readv, 846 .aio_read = cifs_strict_readv,
847 .aio_write = cifs_strict_writev, 847 .aio_write = cifs_strict_writev,
848 .open = cifs_open, 848 .open = cifs_open,
849 .release = cifs_close, 849 .release = cifs_close,
850 .lock = cifs_lock, 850 .lock = cifs_lock,
851 .fsync = cifs_strict_fsync, 851 .fsync = cifs_strict_fsync,
852 .flush = cifs_flush, 852 .flush = cifs_flush,
853 .mmap = cifs_file_strict_mmap, 853 .mmap = cifs_file_strict_mmap,
854 .splice_read = generic_file_splice_read, 854 .splice_read = generic_file_splice_read,
855 .llseek = cifs_llseek, 855 .llseek = cifs_llseek,
856 #ifdef CONFIG_CIFS_POSIX 856 #ifdef CONFIG_CIFS_POSIX
857 .unlocked_ioctl = cifs_ioctl, 857 .unlocked_ioctl = cifs_ioctl,
858 #endif /* CONFIG_CIFS_POSIX */ 858 #endif /* CONFIG_CIFS_POSIX */
859 .setlease = cifs_setlease, 859 .setlease = cifs_setlease,
860 }; 860 };
861 861
862 const struct file_operations cifs_file_direct_ops = { 862 const struct file_operations cifs_file_direct_ops = {
863 /* BB reevaluate whether they can be done with directio, no cache */ 863 /* BB reevaluate whether they can be done with directio, no cache */
864 .read = do_sync_read, 864 .read = do_sync_read,
865 .write = do_sync_write, 865 .write = do_sync_write,
866 .aio_read = cifs_user_readv, 866 .aio_read = cifs_user_readv,
867 .aio_write = cifs_user_writev, 867 .aio_write = cifs_user_writev,
868 .open = cifs_open, 868 .open = cifs_open,
869 .release = cifs_close, 869 .release = cifs_close,
870 .lock = cifs_lock, 870 .lock = cifs_lock,
871 .fsync = cifs_fsync, 871 .fsync = cifs_fsync,
872 .flush = cifs_flush, 872 .flush = cifs_flush,
873 .mmap = cifs_file_mmap, 873 .mmap = cifs_file_mmap,
874 .splice_read = generic_file_splice_read, 874 .splice_read = generic_file_splice_read,
875 #ifdef CONFIG_CIFS_POSIX 875 #ifdef CONFIG_CIFS_POSIX
876 .unlocked_ioctl = cifs_ioctl, 876 .unlocked_ioctl = cifs_ioctl,
877 #endif /* CONFIG_CIFS_POSIX */ 877 #endif /* CONFIG_CIFS_POSIX */
878 .llseek = cifs_llseek, 878 .llseek = cifs_llseek,
879 .setlease = cifs_setlease, 879 .setlease = cifs_setlease,
880 }; 880 };
881 881
882 const struct file_operations cifs_file_nobrl_ops = { 882 const struct file_operations cifs_file_nobrl_ops = {
883 .read = do_sync_read, 883 .read = do_sync_read,
884 .write = do_sync_write, 884 .write = do_sync_write,
885 .aio_read = generic_file_aio_read, 885 .aio_read = generic_file_aio_read,
886 .aio_write = cifs_file_aio_write, 886 .aio_write = cifs_file_aio_write,
887 .open = cifs_open, 887 .open = cifs_open,
888 .release = cifs_close, 888 .release = cifs_close,
889 .fsync = cifs_fsync, 889 .fsync = cifs_fsync,
890 .flush = cifs_flush, 890 .flush = cifs_flush,
891 .mmap = cifs_file_mmap, 891 .mmap = cifs_file_mmap,
892 .splice_read = generic_file_splice_read, 892 .splice_read = generic_file_splice_read,
893 .llseek = cifs_llseek, 893 .llseek = cifs_llseek,
894 #ifdef CONFIG_CIFS_POSIX 894 #ifdef CONFIG_CIFS_POSIX
895 .unlocked_ioctl = cifs_ioctl, 895 .unlocked_ioctl = cifs_ioctl,
896 #endif /* CONFIG_CIFS_POSIX */ 896 #endif /* CONFIG_CIFS_POSIX */
897 .setlease = cifs_setlease, 897 .setlease = cifs_setlease,
898 }; 898 };
899 899
900 const struct file_operations cifs_file_strict_nobrl_ops = { 900 const struct file_operations cifs_file_strict_nobrl_ops = {
901 .read = do_sync_read, 901 .read = do_sync_read,
902 .write = do_sync_write, 902 .write = do_sync_write,
903 .aio_read = cifs_strict_readv, 903 .aio_read = cifs_strict_readv,
904 .aio_write = cifs_strict_writev, 904 .aio_write = cifs_strict_writev,
905 .open = cifs_open, 905 .open = cifs_open,
906 .release = cifs_close, 906 .release = cifs_close,
907 .fsync = cifs_strict_fsync, 907 .fsync = cifs_strict_fsync,
908 .flush = cifs_flush, 908 .flush = cifs_flush,
909 .mmap = cifs_file_strict_mmap, 909 .mmap = cifs_file_strict_mmap,
910 .splice_read = generic_file_splice_read, 910 .splice_read = generic_file_splice_read,
911 .llseek = cifs_llseek, 911 .llseek = cifs_llseek,
912 #ifdef CONFIG_CIFS_POSIX 912 #ifdef CONFIG_CIFS_POSIX
913 .unlocked_ioctl = cifs_ioctl, 913 .unlocked_ioctl = cifs_ioctl,
914 #endif /* CONFIG_CIFS_POSIX */ 914 #endif /* CONFIG_CIFS_POSIX */
915 .setlease = cifs_setlease, 915 .setlease = cifs_setlease,
916 }; 916 };
917 917
918 const struct file_operations cifs_file_direct_nobrl_ops = { 918 const struct file_operations cifs_file_direct_nobrl_ops = {
919 /* BB reevaluate whether they can be done with directio, no cache */ 919 /* BB reevaluate whether they can be done with directio, no cache */
920 .read = do_sync_read, 920 .read = do_sync_read,
921 .write = do_sync_write, 921 .write = do_sync_write,
922 .aio_read = cifs_user_readv, 922 .aio_read = cifs_user_readv,
923 .aio_write = cifs_user_writev, 923 .aio_write = cifs_user_writev,
924 .open = cifs_open, 924 .open = cifs_open,
925 .release = cifs_close, 925 .release = cifs_close,
926 .fsync = cifs_fsync, 926 .fsync = cifs_fsync,
927 .flush = cifs_flush, 927 .flush = cifs_flush,
928 .mmap = cifs_file_mmap, 928 .mmap = cifs_file_mmap,
929 .splice_read = generic_file_splice_read, 929 .splice_read = generic_file_splice_read,
930 #ifdef CONFIG_CIFS_POSIX 930 #ifdef CONFIG_CIFS_POSIX
931 .unlocked_ioctl = cifs_ioctl, 931 .unlocked_ioctl = cifs_ioctl,
932 #endif /* CONFIG_CIFS_POSIX */ 932 #endif /* CONFIG_CIFS_POSIX */
933 .llseek = cifs_llseek, 933 .llseek = cifs_llseek,
934 .setlease = cifs_setlease, 934 .setlease = cifs_setlease,
935 }; 935 };
936 936
937 const struct file_operations cifs_dir_ops = { 937 const struct file_operations cifs_dir_ops = {
938 .readdir = cifs_readdir, 938 .readdir = cifs_readdir,
939 .release = cifs_closedir, 939 .release = cifs_closedir,
940 .read = generic_read_dir, 940 .read = generic_read_dir,
941 .unlocked_ioctl = cifs_ioctl, 941 .unlocked_ioctl = cifs_ioctl,
942 .llseek = generic_file_llseek, 942 .llseek = generic_file_llseek,
943 }; 943 };
944 944
945 static void 945 static void
946 cifs_init_once(void *inode) 946 cifs_init_once(void *inode)
947 { 947 {
948 struct cifsInodeInfo *cifsi = inode; 948 struct cifsInodeInfo *cifsi = inode;
949 949
950 inode_init_once(&cifsi->vfs_inode); 950 inode_init_once(&cifsi->vfs_inode);
951 init_rwsem(&cifsi->lock_sem); 951 init_rwsem(&cifsi->lock_sem);
952 } 952 }
953 953
954 static int 954 static int
955 cifs_init_inodecache(void) 955 cifs_init_inodecache(void)
956 { 956 {
957 cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", 957 cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
958 sizeof(struct cifsInodeInfo), 958 sizeof(struct cifsInodeInfo),
959 0, (SLAB_RECLAIM_ACCOUNT| 959 0, (SLAB_RECLAIM_ACCOUNT|
960 SLAB_MEM_SPREAD), 960 SLAB_MEM_SPREAD),
961 cifs_init_once); 961 cifs_init_once);
962 if (cifs_inode_cachep == NULL) 962 if (cifs_inode_cachep == NULL)
963 return -ENOMEM; 963 return -ENOMEM;
964 964
965 return 0; 965 return 0;
966 } 966 }
967 967
968 static void 968 static void
969 cifs_destroy_inodecache(void) 969 cifs_destroy_inodecache(void)
970 { 970 {
971 /* 971 /*
972 * Make sure all delayed rcu free inodes are flushed before we 972 * Make sure all delayed rcu free inodes are flushed before we
973 * destroy cache. 973 * destroy cache.
974 */ 974 */
975 rcu_barrier(); 975 rcu_barrier();
976 kmem_cache_destroy(cifs_inode_cachep); 976 kmem_cache_destroy(cifs_inode_cachep);
977 } 977 }
978 978
979 static int 979 static int
980 cifs_init_request_bufs(void) 980 cifs_init_request_bufs(void)
981 { 981 {
982 size_t max_hdr_size = MAX_CIFS_HDR_SIZE; 982 size_t max_hdr_size = MAX_CIFS_HDR_SIZE;
983 #ifdef CONFIG_CIFS_SMB2 983 #ifdef CONFIG_CIFS_SMB2
984 /* 984 /*
985 * The SMB2 maximum header size is bigger than the CIFS one, so it 985 * The SMB2 maximum header size is bigger than the CIFS one, so it
986 * does no harm to allocate a few extra bytes for CIFS as well. 986 * does no harm to allocate a few extra bytes for CIFS as well.
987 */ 987 */
988 max_hdr_size = MAX_SMB2_HDR_SIZE; 988 max_hdr_size = MAX_SMB2_HDR_SIZE;
989 #endif 989 #endif
990 if (CIFSMaxBufSize < 8192) { 990 if (CIFSMaxBufSize < 8192) {
991 /* Buffer size cannot be smaller than 2 * PATH_MAX since the maximum 991 /* Buffer size cannot be smaller than 2 * PATH_MAX since the maximum
992 Unicode path name has to fit in any SMB/CIFS path-based frame */ 992 Unicode path name has to fit in any SMB/CIFS path-based frame */
993 CIFSMaxBufSize = 8192; 993 CIFSMaxBufSize = 8192;
994 } else if (CIFSMaxBufSize > 1024*127) { 994 } else if (CIFSMaxBufSize > 1024*127) {
995 CIFSMaxBufSize = 1024 * 127; 995 CIFSMaxBufSize = 1024 * 127;
996 } else { 996 } else {
997 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ 997 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
998 } 998 }
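	/* note: 0x1FE00 clears the low nine bits, so any value in the
	   accepted 8192..130048 range is rounded down to a 512-byte
	   multiple, e.g. 16000 becomes 15872 */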
999 /* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */ 999 /* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
1000 cifs_req_cachep = kmem_cache_create("cifs_request", 1000 cifs_req_cachep = kmem_cache_create("cifs_request",
1001 CIFSMaxBufSize + max_hdr_size, 0, 1001 CIFSMaxBufSize + max_hdr_size, 0,
1002 SLAB_HWCACHE_ALIGN, NULL); 1002 SLAB_HWCACHE_ALIGN, NULL);
1003 if (cifs_req_cachep == NULL) 1003 if (cifs_req_cachep == NULL)
1004 return -ENOMEM; 1004 return -ENOMEM;
1005 1005
1006 if (cifs_min_rcv < 1) 1006 if (cifs_min_rcv < 1)
1007 cifs_min_rcv = 1; 1007 cifs_min_rcv = 1;
1008 else if (cifs_min_rcv > 64) { 1008 else if (cifs_min_rcv > 64) {
1009 cifs_min_rcv = 64; 1009 cifs_min_rcv = 64;
1010 cERROR(1, "cifs_min_rcv set to maximum (64)"); 1010 cERROR(1, "cifs_min_rcv set to maximum (64)");
1011 } 1011 }
1012 1012
1013 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 1013 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
1014 cifs_req_cachep); 1014 cifs_req_cachep);
1015 1015
1016 if (cifs_req_poolp == NULL) { 1016 if (cifs_req_poolp == NULL) {
1017 kmem_cache_destroy(cifs_req_cachep); 1017 kmem_cache_destroy(cifs_req_cachep);
1018 return -ENOMEM; 1018 return -ENOMEM;
1019 } 1019 }
1020 /* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and 1020 /* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and
1021 almost all handle-based requests (but not the write response, nor is it 1021 almost all handle-based requests (but not the write response, nor is it
1022 sufficient for path-based requests). A smaller size would have 1022 sufficient for path-based requests). A smaller size would have
1023 been more efficient (compacting multiple slab items on one 4k page) 1023 been more efficient (compacting multiple slab items on one 4k page)
1024 for the case in which debug was on, but this larger size allows 1024 for the case in which debug was on, but this larger size allows
1025 more SMBs to use small buffer alloc and is still much more 1025 more SMBs to use small buffer alloc and is still much more
1026 efficient to alloc 1 per page off the slab compared to 17K (5page) 1026 efficient to alloc 1 per page off the slab compared to 17K (5page)
1027 alloc of large cifs buffers even when page debugging is on */ 1027 alloc of large cifs buffers even when page debugging is on */
1028 cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq", 1028 cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq",
1029 MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN, 1029 MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN,
1030 NULL); 1030 NULL);
1031 if (cifs_sm_req_cachep == NULL) { 1031 if (cifs_sm_req_cachep == NULL) {
1032 mempool_destroy(cifs_req_poolp); 1032 mempool_destroy(cifs_req_poolp);
1033 kmem_cache_destroy(cifs_req_cachep); 1033 kmem_cache_destroy(cifs_req_cachep);
1034 return -ENOMEM; 1034 return -ENOMEM;
1035 } 1035 }
1036 1036
1037 if (cifs_min_small < 2) 1037 if (cifs_min_small < 2)
1038 cifs_min_small = 2; 1038 cifs_min_small = 2;
1039 else if (cifs_min_small > 256) { 1039 else if (cifs_min_small > 256) {
1040 cifs_min_small = 256; 1040 cifs_min_small = 256;
1041 cFYI(1, "cifs_min_small set to maximum (256)"); 1041 cFYI(1, "cifs_min_small set to maximum (256)");
1042 } 1042 }
1043 1043
1044 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 1044 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
1045 cifs_sm_req_cachep); 1045 cifs_sm_req_cachep);
1046 1046
1047 if (cifs_sm_req_poolp == NULL) { 1047 if (cifs_sm_req_poolp == NULL) {
1048 mempool_destroy(cifs_req_poolp); 1048 mempool_destroy(cifs_req_poolp);
1049 kmem_cache_destroy(cifs_req_cachep); 1049 kmem_cache_destroy(cifs_req_cachep);
1050 kmem_cache_destroy(cifs_sm_req_cachep); 1050 kmem_cache_destroy(cifs_sm_req_cachep);
1051 return -ENOMEM; 1051 return -ENOMEM;
1052 } 1052 }
1053 1053
1054 return 0; 1054 return 0;
1055 } 1055 }
1056 1056
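The mempool pairs built above guarantee forward progress: mempool_alloc() falls back to the pre-reserved elements (cifs_min_rcv and cifs_min_small of them) when the slab allocator fails under memory pressure. A sketch of the consuming side (illustrative wrapper names, not part of this commit; mempool_alloc()/mempool_free() are the real API):

/* kernel-side sketch, assuming the pools initialized above */
void *cifs_request_buf_get(void)
{
	/* dips into the reserved elements if kmem_cache allocation fails */
	return mempool_alloc(cifs_req_poolp, GFP_NOFS);
}

void cifs_request_buf_put(void *buf)
{
	/* tops the reserve back up before freeing to the slab */
	mempool_free(buf, cifs_req_poolp);
}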
1057 static void 1057 static void
1058 cifs_destroy_request_bufs(void) 1058 cifs_destroy_request_bufs(void)
1059 { 1059 {
1060 mempool_destroy(cifs_req_poolp); 1060 mempool_destroy(cifs_req_poolp);
1061 kmem_cache_destroy(cifs_req_cachep); 1061 kmem_cache_destroy(cifs_req_cachep);
1062 mempool_destroy(cifs_sm_req_poolp); 1062 mempool_destroy(cifs_sm_req_poolp);
1063 kmem_cache_destroy(cifs_sm_req_cachep); 1063 kmem_cache_destroy(cifs_sm_req_cachep);
1064 } 1064 }
1065 1065
1066 static int 1066 static int
1067 cifs_init_mids(void) 1067 cifs_init_mids(void)
1068 { 1068 {
1069 cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids", 1069 cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids",
1070 sizeof(struct mid_q_entry), 0, 1070 sizeof(struct mid_q_entry), 0,
1071 SLAB_HWCACHE_ALIGN, NULL); 1071 SLAB_HWCACHE_ALIGN, NULL);
1072 if (cifs_mid_cachep == NULL) 1072 if (cifs_mid_cachep == NULL)
1073 return -ENOMEM; 1073 return -ENOMEM;
1074 1074
1075 /* 3 is a reasonable minimum number of simultaneous operations */ 1075 /* 3 is a reasonable minimum number of simultaneous operations */
1076 cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep); 1076 cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
1077 if (cifs_mid_poolp == NULL) { 1077 if (cifs_mid_poolp == NULL) {
1078 kmem_cache_destroy(cifs_mid_cachep); 1078 kmem_cache_destroy(cifs_mid_cachep);
1079 return -ENOMEM; 1079 return -ENOMEM;
1080 } 1080 }
1081 1081
1082 return 0; 1082 return 0;
1083 } 1083 }
1084 1084
1085 static void 1085 static void
1086 cifs_destroy_mids(void) 1086 cifs_destroy_mids(void)
1087 { 1087 {
1088 mempool_destroy(cifs_mid_poolp); 1088 mempool_destroy(cifs_mid_poolp);
1089 kmem_cache_destroy(cifs_mid_cachep); 1089 kmem_cache_destroy(cifs_mid_cachep);
1090 } 1090 }
1091 1091
1092 static int __init 1092 static int __init
1093 init_cifs(void) 1093 init_cifs(void)
1094 { 1094 {
1095 int rc = 0; 1095 int rc = 0;
1096 cifs_proc_init(); 1096 cifs_proc_init();
1097 INIT_LIST_HEAD(&cifs_tcp_ses_list); 1097 INIT_LIST_HEAD(&cifs_tcp_ses_list);
1098 #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ 1098 #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
1099 INIT_LIST_HEAD(&GlobalDnotifyReqList); 1099 INIT_LIST_HEAD(&GlobalDnotifyReqList);
1100 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); 1100 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
1101 #endif /* was needed for dnotify, and will be needed for inotify once the VFS is fixed */ 1101 #endif /* was needed for dnotify, and will be needed for inotify once the VFS is fixed */
1102 /* 1102 /*
1103 * Initialize Global counters 1103 * Initialize Global counters
1104 */ 1104 */
1105 atomic_set(&sesInfoAllocCount, 0); 1105 atomic_set(&sesInfoAllocCount, 0);
1106 atomic_set(&tconInfoAllocCount, 0); 1106 atomic_set(&tconInfoAllocCount, 0);
1107 atomic_set(&tcpSesAllocCount, 0); 1107 atomic_set(&tcpSesAllocCount, 0);
1108 atomic_set(&tcpSesReconnectCount, 0); 1108 atomic_set(&tcpSesReconnectCount, 0);
1109 atomic_set(&tconInfoReconnectCount, 0); 1109 atomic_set(&tconInfoReconnectCount, 0);
1110 1110
1111 atomic_set(&bufAllocCount, 0); 1111 atomic_set(&bufAllocCount, 0);
1112 atomic_set(&smBufAllocCount, 0); 1112 atomic_set(&smBufAllocCount, 0);
1113 #ifdef CONFIG_CIFS_STATS2 1113 #ifdef CONFIG_CIFS_STATS2
1114 atomic_set(&totBufAllocCount, 0); 1114 atomic_set(&totBufAllocCount, 0);
1115 atomic_set(&totSmBufAllocCount, 0); 1115 atomic_set(&totSmBufAllocCount, 0);
1116 #endif /* CONFIG_CIFS_STATS2 */ 1116 #endif /* CONFIG_CIFS_STATS2 */
1117 1117
1118 atomic_set(&midCount, 0); 1118 atomic_set(&midCount, 0);
1119 GlobalCurrentXid = 0; 1119 GlobalCurrentXid = 0;
1120 GlobalTotalActiveXid = 0; 1120 GlobalTotalActiveXid = 0;
1121 GlobalMaxActiveXid = 0; 1121 GlobalMaxActiveXid = 0;
1122 spin_lock_init(&cifs_tcp_ses_lock); 1122 spin_lock_init(&cifs_tcp_ses_lock);
1123 spin_lock_init(&cifs_file_list_lock); 1123 spin_lock_init(&cifs_file_list_lock);
1124 spin_lock_init(&GlobalMid_Lock); 1124 spin_lock_init(&GlobalMid_Lock);
1125 1125
1126 #ifdef CONFIG_CIFS_SMB2 1126 #ifdef CONFIG_CIFS_SMB2
1127 get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE); 1127 get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
1128 #endif 1128 #endif
1129 1129
1130 if (cifs_max_pending < 2) { 1130 if (cifs_max_pending < 2) {
1131 cifs_max_pending = 2; 1131 cifs_max_pending = 2;
1132 cFYI(1, "cifs_max_pending set to min of 2"); 1132 cFYI(1, "cifs_max_pending set to min of 2");
1133 } else if (cifs_max_pending > CIFS_MAX_REQ) { 1133 } else if (cifs_max_pending > CIFS_MAX_REQ) {
1134 cifs_max_pending = CIFS_MAX_REQ; 1134 cifs_max_pending = CIFS_MAX_REQ;
1135 cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ); 1135 cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ);
1136 } 1136 }
1137 1137
1138 cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); 1138 cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
1139 if (!cifsiod_wq) { 1139 if (!cifsiod_wq) {
1140 rc = -ENOMEM; 1140 rc = -ENOMEM;
1141 goto out_clean_proc; 1141 goto out_clean_proc;
1142 } 1142 }
1143 1143
1144 rc = cifs_fscache_register(); 1144 rc = cifs_fscache_register();
1145 if (rc) 1145 if (rc)
1146 goto out_destroy_wq; 1146 goto out_destroy_wq;
1147 1147
1148 rc = cifs_init_inodecache(); 1148 rc = cifs_init_inodecache();
1149 if (rc) 1149 if (rc)
1150 goto out_unreg_fscache; 1150 goto out_unreg_fscache;
1151 1151
1152 rc = cifs_init_mids(); 1152 rc = cifs_init_mids();
1153 if (rc) 1153 if (rc)
1154 goto out_destroy_inodecache; 1154 goto out_destroy_inodecache;
1155 1155
1156 rc = cifs_init_request_bufs(); 1156 rc = cifs_init_request_bufs();
1157 if (rc) 1157 if (rc)
1158 goto out_destroy_mids; 1158 goto out_destroy_mids;
1159 1159
1160 #ifdef CONFIG_CIFS_UPCALL 1160 #ifdef CONFIG_CIFS_UPCALL
1161 rc = register_key_type(&cifs_spnego_key_type); 1161 rc = register_key_type(&cifs_spnego_key_type);
1162 if (rc) 1162 if (rc)
1163 goto out_destroy_request_bufs; 1163 goto out_destroy_request_bufs;
1164 #endif /* CONFIG_CIFS_UPCALL */ 1164 #endif /* CONFIG_CIFS_UPCALL */
1165 1165
1166 #ifdef CONFIG_CIFS_ACL 1166 #ifdef CONFIG_CIFS_ACL
1167 rc = init_cifs_idmap(); 1167 rc = init_cifs_idmap();
1168 if (rc) 1168 if (rc)
1169 goto out_register_key_type; 1169 goto out_register_key_type;
1170 #endif /* CONFIG_CIFS_ACL */ 1170 #endif /* CONFIG_CIFS_ACL */
1171 1171
1172 rc = register_filesystem(&cifs_fs_type); 1172 rc = register_filesystem(&cifs_fs_type);
1173 if (rc) 1173 if (rc)
1174 goto out_init_cifs_idmap; 1174 goto out_init_cifs_idmap;
1175 1175
1176 return 0; 1176 return 0;
1177 1177
1178 out_init_cifs_idmap: 1178 out_init_cifs_idmap:
1179 #ifdef CONFIG_CIFS_ACL 1179 #ifdef CONFIG_CIFS_ACL
1180 exit_cifs_idmap(); 1180 exit_cifs_idmap();
1181 out_register_key_type: 1181 out_register_key_type:
1182 #endif 1182 #endif
1183 #ifdef CONFIG_CIFS_UPCALL 1183 #ifdef CONFIG_CIFS_UPCALL
1184 unregister_key_type(&cifs_spnego_key_type); 1184 unregister_key_type(&cifs_spnego_key_type);
1185 out_destroy_request_bufs: 1185 out_destroy_request_bufs:
1186 #endif 1186 #endif
1187 cifs_destroy_request_bufs(); 1187 cifs_destroy_request_bufs();
1188 out_destroy_mids: 1188 out_destroy_mids:
1189 cifs_destroy_mids(); 1189 cifs_destroy_mids();
1190 out_destroy_inodecache: 1190 out_destroy_inodecache:
1191 cifs_destroy_inodecache(); 1191 cifs_destroy_inodecache();
1192 out_unreg_fscache: 1192 out_unreg_fscache:
1193 cifs_fscache_unregister(); 1193 cifs_fscache_unregister();
1194 out_destroy_wq: 1194 out_destroy_wq:
1195 destroy_workqueue(cifsiod_wq); 1195 destroy_workqueue(cifsiod_wq);
1196 out_clean_proc: 1196 out_clean_proc:
1197 cifs_proc_clean(); 1197 cifs_proc_clean();
1198 return rc; 1198 return rc;
1199 } 1199 }
1200 1200
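init_cifs() above follows the kernel's goto-unwind idiom: each successfully initialized resource gets a cleanup label, and a failure jumps to the label for the last resource that did succeed, so teardown runs in exact reverse order of setup. A self-contained sketch of the pattern (hypothetical init_a/b/c subsystems, not part of this commit):

#include <stdio.h>

/* stand-ins for the real initializers */
static int init_a(void) { return 0; }
static int init_b(void) { return 0; }
static int init_c(void) { return 0; }
static void exit_a(void) { }
static void exit_b(void) { }

static int init_subsystems(void)
{
	int rc;

	rc = init_a();
	if (rc)
		goto out;
	rc = init_b();
	if (rc)
		goto out_a;
	rc = init_c();
	if (rc)
		goto out_b;
	return 0;		/* everything is live */

out_b:
	exit_b();		/* unwind only what was set up, in reverse */
out_a:
	exit_a();
out:
	return rc;
}

int main(void)
{
	printf("init: %d\n", init_subsystems());
	return 0;
}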
1201 static void __exit 1201 static void __exit
1202 exit_cifs(void) 1202 exit_cifs(void)
1203 { 1203 {
1204 cFYI(DBG2, "exit_cifs"); 1204 cFYI(DBG2, "exit_cifs");
1205 unregister_filesystem(&cifs_fs_type); 1205 unregister_filesystem(&cifs_fs_type);
1206 cifs_dfs_release_automount_timer(); 1206 cifs_dfs_release_automount_timer();
1207 #ifdef CONFIG_CIFS_ACL 1207 #ifdef CONFIG_CIFS_ACL
1208 exit_cifs_idmap(); 1208 exit_cifs_idmap();
1209 #endif 1209 #endif
1210 #ifdef CONFIG_CIFS_UPCALL 1210 #ifdef CONFIG_CIFS_UPCALL
1211 unregister_key_type(&cifs_spnego_key_type); 1211 unregister_key_type(&cifs_spnego_key_type);
1212 #endif 1212 #endif
1213 cifs_destroy_request_bufs(); 1213 cifs_destroy_request_bufs();
1214 cifs_destroy_mids(); 1214 cifs_destroy_mids();
1215 cifs_destroy_inodecache(); 1215 cifs_destroy_inodecache();
1216 cifs_fscache_unregister(); 1216 cifs_fscache_unregister();
1217 destroy_workqueue(cifsiod_wq); 1217 destroy_workqueue(cifsiod_wq);
1218 cifs_proc_clean(); 1218 cifs_proc_clean();
1219 } 1219 }
1220 1220
1221 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); 1221 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
1222 MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */ 1222 MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */
1223 MODULE_DESCRIPTION 1223 MODULE_DESCRIPTION
1224 ("VFS to access servers complying with the SNIA CIFS Specification " 1224 ("VFS to access servers complying with the SNIA CIFS Specification "
1225 "e.g. Samba and Windows"); 1225 "e.g. Samba and Windows");
1226 MODULE_VERSION(CIFS_VERSION); 1226 MODULE_VERSION(CIFS_VERSION);
1227 module_init(init_cifs) 1227 module_init(init_cifs)
1228 module_exit(exit_cifs) 1228 module_exit(exit_cifs)
1229 1229
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * dir.c - Operations for configfs directories. 4 * dir.c - Operations for configfs directories.
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public 7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either 8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version. 9 * version 2 of the License, or (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details. 14 * General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public 16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the 17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 * 20 *
21 * Based on sysfs: 21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel 22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 * 23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved. 24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */ 25 */
26 26
27 #undef DEBUG 27 #undef DEBUG
28 28
29 #include <linux/fs.h> 29 #include <linux/fs.h>
30 #include <linux/mount.h> 30 #include <linux/mount.h>
31 #include <linux/module.h> 31 #include <linux/module.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/err.h> 33 #include <linux/err.h>
34 34
35 #include <linux/configfs.h> 35 #include <linux/configfs.h>
36 #include "configfs_internal.h" 36 #include "configfs_internal.h"
37 37
38 DECLARE_RWSEM(configfs_rename_sem); 38 DECLARE_RWSEM(configfs_rename_sem);
39 /* 39 /*
40 * Protects mutations of configfs_dirent linkage together with proper i_mutex 40 * Protects mutations of configfs_dirent linkage together with proper i_mutex
41 * Also protects mutations of symlinks linkage to target configfs_dirent 41 * Also protects mutations of symlinks linkage to target configfs_dirent
42 * Mutators of configfs_dirent linkage must *both* have the proper inode locked 42 * Mutators of configfs_dirent linkage must *both* have the proper inode locked
43 * and configfs_dirent_lock locked, in that order. 43 * and configfs_dirent_lock locked, in that order.
44 * This allows one to safely traverse configfs_dirent trees and symlinks without 44 * This allows one to safely traverse configfs_dirent trees and symlinks without
45 * having to lock inodes. 45 * having to lock inodes.
46 * 46 *
47 * Protects setting of CONFIGFS_USET_DROPPING: checking the flag 47 * Protects setting of CONFIGFS_USET_DROPPING: checking the flag
48 * unlocked is not reliable unless in detach_groups() called from 48 * unlocked is not reliable unless in detach_groups() called from
49 * rmdir()/unregister() and from configfs_attach_group() 49 * rmdir()/unregister() and from configfs_attach_group()
50 */ 50 */
51 DEFINE_SPINLOCK(configfs_dirent_lock); 51 DEFINE_SPINLOCK(configfs_dirent_lock);
52 52
53 static void configfs_d_iput(struct dentry * dentry, 53 static void configfs_d_iput(struct dentry * dentry,
54 struct inode * inode) 54 struct inode * inode)
55 { 55 {
56 struct configfs_dirent *sd = dentry->d_fsdata; 56 struct configfs_dirent *sd = dentry->d_fsdata;
57 57
58 if (sd) { 58 if (sd) {
59 BUG_ON(sd->s_dentry != dentry); 59 BUG_ON(sd->s_dentry != dentry);
60 /* Coordinate with configfs_readdir */ 60 /* Coordinate with configfs_readdir */
61 spin_lock(&configfs_dirent_lock); 61 spin_lock(&configfs_dirent_lock);
62 sd->s_dentry = NULL; 62 sd->s_dentry = NULL;
63 spin_unlock(&configfs_dirent_lock); 63 spin_unlock(&configfs_dirent_lock);
64 configfs_put(sd); 64 configfs_put(sd);
65 } 65 }
66 iput(inode); 66 iput(inode);
67 } 67 }
68 68
69 /* 69 /*
70 * We _must_ delete our dentries on last dput, as the chain-to-parent 70 * We _must_ delete our dentries on last dput, as the chain-to-parent
71 * behavior is required to clear the parents of default_groups. 71 * behavior is required to clear the parents of default_groups.
72 */ 72 */
73 static int configfs_d_delete(const struct dentry *dentry) 73 static int configfs_d_delete(const struct dentry *dentry)
74 { 74 {
75 return 1; 75 return 1;
76 } 76 }
77 77
78 const struct dentry_operations configfs_dentry_ops = { 78 const struct dentry_operations configfs_dentry_ops = {
79 .d_iput = configfs_d_iput, 79 .d_iput = configfs_d_iput,
80 /* simple_delete_dentry() isn't exported */ 80 /* simple_delete_dentry() isn't exported */
81 .d_delete = configfs_d_delete, 81 .d_delete = configfs_d_delete,
82 }; 82 };
83 83
84 #ifdef CONFIG_LOCKDEP 84 #ifdef CONFIG_LOCKDEP
85 85
86 /* 86 /*
87 * Helpers to make lockdep happy with our recursive locking of default groups' 87 * Helpers to make lockdep happy with our recursive locking of default groups'
88 * inodes (see configfs_attach_group() and configfs_detach_group()). 88 * inodes (see configfs_attach_group() and configfs_detach_group()).
89 * We put default groups i_mutexes in separate classes according to their depth 89 * We put default groups i_mutexes in separate classes according to their depth
90 * from the youngest non-default group ancestor. 90 * from the youngest non-default group ancestor.
91 * 91 *
92 * For a non-default group A having default groups A/B, A/C, and A/C/D, default 92 * For a non-default group A having default groups A/B, A/C, and A/C/D, default
93 * groups A/B and A/C will have their inode's mutex in class 93 * groups A/B and A/C will have their inode's mutex in class
94 * default_group_class[0], and default group A/C/D will be in 94 * default_group_class[0], and default group A/C/D will be in
95 * default_group_class[1]. 95 * default_group_class[1].
96 * 96 *
97 * The lock classes are declared and assigned in inode.c, according to the 97 * The lock classes are declared and assigned in inode.c, according to the
98 * s_depth value. 98 * s_depth value.
99 * The s_depth value is initialized to -1, adjusted to >= 0 when attaching 99 * The s_depth value is initialized to -1, adjusted to >= 0 when attaching
100 * default groups, and reset to -1 when all default groups are attached. During 100 * default groups, and reset to -1 when all default groups are attached. During
101 * attachment, if configfs_create() sees s_depth > 0, the lock class of the new 101 * attachment, if configfs_create() sees s_depth > 0, the lock class of the new
102 * inode's mutex is set to default_group_class[s_depth - 1]. 102 * inode's mutex is set to default_group_class[s_depth - 1].
103 */ 103 */
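
A hedged sketch of the consumer side the comment points at: the real assignment lives in inode.c, but it looks roughly like this (the array name follows the comment; the bound here is illustrative):

/* Sketch: choose the i_mutex lock class from the depth computed below. */
static struct lock_class_key default_group_class[16];	/* illustrative bound */

static void example_set_inode_lock_class(struct configfs_dirent *sd,
					 struct inode *inode)
{
	if (sd->s_depth > 0)
		lockdep_set_class(&inode->i_mutex,
				  &default_group_class[sd->s_depth - 1]);
}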
104 104
105 static void configfs_init_dirent_depth(struct configfs_dirent *sd) 105 static void configfs_init_dirent_depth(struct configfs_dirent *sd)
106 { 106 {
107 sd->s_depth = -1; 107 sd->s_depth = -1;
108 } 108 }
109 109
110 static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, 110 static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
111 struct configfs_dirent *sd) 111 struct configfs_dirent *sd)
112 { 112 {
113 int parent_depth = parent_sd->s_depth; 113 int parent_depth = parent_sd->s_depth;
114 114
115 if (parent_depth >= 0) 115 if (parent_depth >= 0)
116 sd->s_depth = parent_depth + 1; 116 sd->s_depth = parent_depth + 1;
117 } 117 }
118 118
119 static void 119 static void
120 configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) 120 configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
121 { 121 {
122 /* 122 /*
123 * item's i_mutex class is already setup, so s_depth is now only 123 * item's i_mutex class is already setup, so s_depth is now only
124 * used to set new sub-directories s_depth, which is always done 124 * used to set new sub-directories s_depth, which is always done
125 * with item's i_mutex locked. 125 * with item's i_mutex locked.
126 */ 126 */
127 /* 127 /*
128 * sd->s_depth == -1 iff we are a non-default group. 128 * sd->s_depth == -1 iff we are a non-default group.
129 * else (we are a default group) sd->s_depth > 0 (see 129 * else (we are a default group) sd->s_depth > 0 (see
130 * create_dir()). 130 * create_dir()).
131 */ 131 */
132 if (sd->s_depth == -1) 132 if (sd->s_depth == -1)
133 /* 133 /*
134 * We are a non-default group and we are going to create 134 * We are a non-default group and we are going to create
135 * default groups. 135 * default groups.
136 */ 136 */
137 sd->s_depth = 0; 137 sd->s_depth = 0;
138 } 138 }
139 139
140 static void 140 static void
141 configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) 141 configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
142 { 142 {
143 /* We will not create default groups anymore. */ 143 /* We will not create default groups anymore. */
144 sd->s_depth = -1; 144 sd->s_depth = -1;
145 } 145 }
146 146
147 #else /* CONFIG_LOCKDEP */ 147 #else /* CONFIG_LOCKDEP */
148 148
149 static void configfs_init_dirent_depth(struct configfs_dirent *sd) 149 static void configfs_init_dirent_depth(struct configfs_dirent *sd)
150 { 150 {
151 } 151 }
152 152
153 static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, 153 static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
154 struct configfs_dirent *sd) 154 struct configfs_dirent *sd)
155 { 155 {
156 } 156 }
157 157
158 static void 158 static void
159 configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) 159 configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
160 { 160 {
161 } 161 }
162 162
163 static void 163 static void
164 configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) 164 configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
165 { 165 {
166 } 166 }
167 167
168 #endif /* CONFIG_LOCKDEP */ 168 #endif /* CONFIG_LOCKDEP */
169 169
170 /* 170 /*
171 * Allocates a new configfs_dirent and links it to the parent configfs_dirent 171 * Allocates a new configfs_dirent and links it to the parent configfs_dirent
172 */ 172 */
173 static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, 173 static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
174 void *element, int type) 174 void *element, int type)
175 { 175 {
176 struct configfs_dirent * sd; 176 struct configfs_dirent * sd;
177 177
178 sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); 178 sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL);
179 if (!sd) 179 if (!sd)
180 return ERR_PTR(-ENOMEM); 180 return ERR_PTR(-ENOMEM);
181 181
182 atomic_set(&sd->s_count, 1); 182 atomic_set(&sd->s_count, 1);
183 INIT_LIST_HEAD(&sd->s_links); 183 INIT_LIST_HEAD(&sd->s_links);
184 INIT_LIST_HEAD(&sd->s_children); 184 INIT_LIST_HEAD(&sd->s_children);
185 sd->s_element = element; 185 sd->s_element = element;
186 sd->s_type = type; 186 sd->s_type = type;
187 configfs_init_dirent_depth(sd); 187 configfs_init_dirent_depth(sd);
188 spin_lock(&configfs_dirent_lock); 188 spin_lock(&configfs_dirent_lock);
189 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { 189 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
190 spin_unlock(&configfs_dirent_lock); 190 spin_unlock(&configfs_dirent_lock);
191 kmem_cache_free(configfs_dir_cachep, sd); 191 kmem_cache_free(configfs_dir_cachep, sd);
192 return ERR_PTR(-ENOENT); 192 return ERR_PTR(-ENOENT);
193 } 193 }
194 list_add(&sd->s_sibling, &parent_sd->s_children); 194 list_add(&sd->s_sibling, &parent_sd->s_children);
195 spin_unlock(&configfs_dirent_lock); 195 spin_unlock(&configfs_dirent_lock);
196 196
197 return sd; 197 return sd;
198 } 198 }
199 199
200 /* 200 /*
201 * 201 *
202 * Return -EEXIST if there is already a configfs element with the same 202 * Return -EEXIST if there is already a configfs element with the same
203 * name for the same parent. 203 * name for the same parent.
204 * 204 *
205 * called with parent inode's i_mutex held 205 * called with parent inode's i_mutex held
206 */ 206 */
207 static int configfs_dirent_exists(struct configfs_dirent *parent_sd, 207 static int configfs_dirent_exists(struct configfs_dirent *parent_sd,
208 const unsigned char *new) 208 const unsigned char *new)
209 { 209 {
210 struct configfs_dirent * sd; 210 struct configfs_dirent * sd;
211 211
212 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 212 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
213 if (sd->s_element) { 213 if (sd->s_element) {
214 const unsigned char *existing = configfs_get_name(sd); 214 const unsigned char *existing = configfs_get_name(sd);
215 if (strcmp(existing, new)) 215 if (strcmp(existing, new))
216 continue; 216 continue;
217 else 217 else
218 return -EEXIST; 218 return -EEXIST;
219 } 219 }
220 } 220 }
221 221
222 return 0; 222 return 0;
223 } 223 }
224 224
225 225
226 int configfs_make_dirent(struct configfs_dirent * parent_sd, 226 int configfs_make_dirent(struct configfs_dirent * parent_sd,
227 struct dentry * dentry, void * element, 227 struct dentry * dentry, void * element,
228 umode_t mode, int type) 228 umode_t mode, int type)
229 { 229 {
230 struct configfs_dirent * sd; 230 struct configfs_dirent * sd;
231 231
232 sd = configfs_new_dirent(parent_sd, element, type); 232 sd = configfs_new_dirent(parent_sd, element, type);
233 if (IS_ERR(sd)) 233 if (IS_ERR(sd))
234 return PTR_ERR(sd); 234 return PTR_ERR(sd);
235 235
236 sd->s_mode = mode; 236 sd->s_mode = mode;
237 sd->s_dentry = dentry; 237 sd->s_dentry = dentry;
238 if (dentry) 238 if (dentry)
239 dentry->d_fsdata = configfs_get(sd); 239 dentry->d_fsdata = configfs_get(sd);
240 240
241 return 0; 241 return 0;
242 } 242 }
243 243
244 static int init_dir(struct inode * inode) 244 static int init_dir(struct inode * inode)
245 { 245 {
246 inode->i_op = &configfs_dir_inode_operations; 246 inode->i_op = &configfs_dir_inode_operations;
247 inode->i_fop = &configfs_dir_operations; 247 inode->i_fop = &configfs_dir_operations;
248 248
249 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 249 /* directory inodes start off with i_nlink == 2 (for "." entry) */
250 inc_nlink(inode); 250 inc_nlink(inode);
251 return 0; 251 return 0;
252 } 252 }
253 253
254 static int configfs_init_file(struct inode * inode) 254 static int configfs_init_file(struct inode * inode)
255 { 255 {
256 inode->i_size = PAGE_SIZE; 256 inode->i_size = PAGE_SIZE;
257 inode->i_fop = &configfs_file_operations; 257 inode->i_fop = &configfs_file_operations;
258 return 0; 258 return 0;
259 } 259 }
260 260
261 static int init_symlink(struct inode * inode) 261 static int init_symlink(struct inode * inode)
262 { 262 {
263 inode->i_op = &configfs_symlink_inode_operations; 263 inode->i_op = &configfs_symlink_inode_operations;
264 return 0; 264 return 0;
265 } 265 }
266 266
267 static int create_dir(struct config_item *k, struct dentry *d) 267 static int create_dir(struct config_item *k, struct dentry *d)
268 { 268 {
269 int error; 269 int error;
270 umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; 270 umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
271 struct dentry *p = d->d_parent; 271 struct dentry *p = d->d_parent;
272 272
273 BUG_ON(!k); 273 BUG_ON(!k);
274 274
275 error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); 275 error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
276 if (!error) 276 if (!error)
277 error = configfs_make_dirent(p->d_fsdata, d, k, mode, 277 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
278 CONFIGFS_DIR | CONFIGFS_USET_CREATING); 278 CONFIGFS_DIR | CONFIGFS_USET_CREATING);
279 if (!error) { 279 if (!error) {
280 configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata); 280 configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata);
281 error = configfs_create(d, mode, init_dir); 281 error = configfs_create(d, mode, init_dir);
282 if (!error) { 282 if (!error) {
283 inc_nlink(p->d_inode); 283 inc_nlink(p->d_inode);
284 } else { 284 } else {
285 struct configfs_dirent *sd = d->d_fsdata; 285 struct configfs_dirent *sd = d->d_fsdata;
286 if (sd) { 286 if (sd) {
287 spin_lock(&configfs_dirent_lock); 287 spin_lock(&configfs_dirent_lock);
288 list_del_init(&sd->s_sibling); 288 list_del_init(&sd->s_sibling);
289 spin_unlock(&configfs_dirent_lock); 289 spin_unlock(&configfs_dirent_lock);
290 configfs_put(sd); 290 configfs_put(sd);
291 } 291 }
292 } 292 }
293 } 293 }
294 return error; 294 return error;
295 } 295 }
296 296
297 297
298 /** 298 /**
299 * configfs_create_dir - create a directory for a config_item. 299 * configfs_create_dir - create a directory for a config_item.
300 * @item: config_item we're creating a directory for. 300 * @item: config_item we're creating a directory for.
301 * @dentry: config_item's dentry. 301 * @dentry: config_item's dentry.
302 * 302 *
303 * Note: user-created entries won't be allowed under this new directory 303 * Note: user-created entries won't be allowed under this new directory
304 * until it is validated by configfs_dir_set_ready() 304 * until it is validated by configfs_dir_set_ready()
305 */ 305 */
306 306
307 static int configfs_create_dir(struct config_item * item, struct dentry *dentry) 307 static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
308 { 308 {
309 int error = create_dir(item, dentry); 309 int error = create_dir(item, dentry);
310 if (!error) 310 if (!error)
311 item->ci_dentry = dentry; 311 item->ci_dentry = dentry;
312 return error; 312 return error;
313 } 313 }
314 314
315 /* 315 /*
316 * Allow userspace to create new entries under a new directory created with 316 * Allow userspace to create new entries under a new directory created with
317 * configfs_create_dir(), and under all of its child directories recursively. 317 * configfs_create_dir(), and under all of its child directories recursively.
318 * @sd configfs_dirent of the new directory to validate 318 * @sd configfs_dirent of the new directory to validate
319 * 319 *
320 * Caller must hold configfs_dirent_lock. 320 * Caller must hold configfs_dirent_lock.
321 */ 321 */
322 static void configfs_dir_set_ready(struct configfs_dirent *sd) 322 static void configfs_dir_set_ready(struct configfs_dirent *sd)
323 { 323 {
324 struct configfs_dirent *child_sd; 324 struct configfs_dirent *child_sd;
325 325
326 sd->s_type &= ~CONFIGFS_USET_CREATING; 326 sd->s_type &= ~CONFIGFS_USET_CREATING;
327 list_for_each_entry(child_sd, &sd->s_children, s_sibling) 327 list_for_each_entry(child_sd, &sd->s_children, s_sibling)
328 if (child_sd->s_type & CONFIGFS_USET_CREATING) 328 if (child_sd->s_type & CONFIGFS_USET_CREATING)
329 configfs_dir_set_ready(child_sd); 329 configfs_dir_set_ready(child_sd);
330 } 330 }
331 331
332 /* 332 /*
333 * Check that a directory does not belong to a directory hierarchy being 333 * Check that a directory does not belong to a directory hierarchy being
334 * attached and not yet validated. 334 * attached and not yet validated.
335 * @sd configfs_dirent of the directory to check 335 * @sd configfs_dirent of the directory to check
336 * 336 *
337 * @return non-zero iff the directory was validated 337 * @return non-zero iff the directory was validated
338 * 338 *
339 * Note: takes configfs_dirent_lock, so the result may change from false to true 339 * Note: takes configfs_dirent_lock, so the result may change from false to true
340 * in two consecutive calls, but never from true to false. 340 * in two consecutive calls, but never from true to false.
341 */ 341 */
342 int configfs_dirent_is_ready(struct configfs_dirent *sd) 342 int configfs_dirent_is_ready(struct configfs_dirent *sd)
343 { 343 {
344 int ret; 344 int ret;
345 345
346 spin_lock(&configfs_dirent_lock); 346 spin_lock(&configfs_dirent_lock);
347 ret = !(sd->s_type & CONFIGFS_USET_CREATING); 347 ret = !(sd->s_type & CONFIGFS_USET_CREATING);
348 spin_unlock(&configfs_dirent_lock); 348 spin_unlock(&configfs_dirent_lock);
349 349
350 return ret; 350 return ret;
351 } 351 }
352 352
353 int configfs_create_link(struct configfs_symlink *sl, 353 int configfs_create_link(struct configfs_symlink *sl,
354 struct dentry *parent, 354 struct dentry *parent,
355 struct dentry *dentry) 355 struct dentry *dentry)
356 { 356 {
357 int err = 0; 357 int err = 0;
358 umode_t mode = S_IFLNK | S_IRWXUGO; 358 umode_t mode = S_IFLNK | S_IRWXUGO;
359 359
360 err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, 360 err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
361 CONFIGFS_ITEM_LINK); 361 CONFIGFS_ITEM_LINK);
362 if (!err) { 362 if (!err) {
363 err = configfs_create(dentry, mode, init_symlink); 363 err = configfs_create(dentry, mode, init_symlink);
364 if (err) { 364 if (err) {
365 struct configfs_dirent *sd = dentry->d_fsdata; 365 struct configfs_dirent *sd = dentry->d_fsdata;
366 if (sd) { 366 if (sd) {
367 spin_lock(&configfs_dirent_lock); 367 spin_lock(&configfs_dirent_lock);
368 list_del_init(&sd->s_sibling); 368 list_del_init(&sd->s_sibling);
369 spin_unlock(&configfs_dirent_lock); 369 spin_unlock(&configfs_dirent_lock);
370 configfs_put(sd); 370 configfs_put(sd);
371 } 371 }
372 } 372 }
373 } 373 }
374 return err; 374 return err;
375 } 375 }
376 376
377 static void remove_dir(struct dentry * d) 377 static void remove_dir(struct dentry * d)
378 { 378 {
379 struct dentry * parent = dget(d->d_parent); 379 struct dentry * parent = dget(d->d_parent);
380 struct configfs_dirent * sd; 380 struct configfs_dirent * sd;
381 381
382 sd = d->d_fsdata; 382 sd = d->d_fsdata;
383 spin_lock(&configfs_dirent_lock); 383 spin_lock(&configfs_dirent_lock);
384 list_del_init(&sd->s_sibling); 384 list_del_init(&sd->s_sibling);
385 spin_unlock(&configfs_dirent_lock); 385 spin_unlock(&configfs_dirent_lock);
386 configfs_put(sd); 386 configfs_put(sd);
387 if (d->d_inode) 387 if (d->d_inode)
388 simple_rmdir(parent->d_inode, d); 388 simple_rmdir(parent->d_inode, d);
389 389
390 pr_debug(" o %s removing done (%d)\n", d->d_name.name, d->d_count); 390 pr_debug(" o %s removing done (%d)\n", d->d_name.name, d->d_count);
391 391
392 dput(parent); 392 dput(parent);
393 } 393 }
394 394
395 /** 395 /**
396 * configfs_remove_dir - remove a config_item's directory. 396 * configfs_remove_dir - remove a config_item's directory.
397 * @item: config_item we're removing. 397 * @item: config_item we're removing.
398 * 398 *
399 * The only thing special about this is that we remove any files in 399 * The only thing special about this is that we remove any files in
400 * the directory before we remove the directory, and we've inlined 400 * the directory before we remove the directory, and we've inlined
401 * what used to be configfs_rmdir() below, instead of calling separately. 401 * what used to be configfs_rmdir() below, instead of calling separately.
402 * 402 *
403 * Caller holds the mutex of the item's inode 403 * Caller holds the mutex of the item's inode
404 */ 404 */
405 405
406 static void configfs_remove_dir(struct config_item * item) 406 static void configfs_remove_dir(struct config_item * item)
407 { 407 {
408 struct dentry * dentry = dget(item->ci_dentry); 408 struct dentry * dentry = dget(item->ci_dentry);
409 409
410 if (!dentry) 410 if (!dentry)
411 return; 411 return;
412 412
413 remove_dir(dentry); 413 remove_dir(dentry);
414 /** 414 /**
415 * Drop reference from dget() on entrance. 415 * Drop reference from dget() on entrance.
416 */ 416 */
417 dput(dentry); 417 dput(dentry);
418 } 418 }
419 419
420 420
421 /* attaches attribute's configfs_dirent to the dentry corresponding to the 421 /* attaches attribute's configfs_dirent to the dentry corresponding to the
422 * attribute file 422 * attribute file
423 */ 423 */
424 static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) 424 static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
425 { 425 {
426 struct configfs_attribute * attr = sd->s_element; 426 struct configfs_attribute * attr = sd->s_element;
427 int error; 427 int error;
428 428
429 dentry->d_fsdata = configfs_get(sd); 429 dentry->d_fsdata = configfs_get(sd);
430 sd->s_dentry = dentry; 430 sd->s_dentry = dentry;
431 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, 431 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
432 configfs_init_file); 432 configfs_init_file);
433 if (error) { 433 if (error) {
434 configfs_put(sd); 434 configfs_put(sd);
435 return error; 435 return error;
436 } 436 }
437 437
438 d_rehash(dentry); 438 d_rehash(dentry);
439 439
440 return 0; 440 return 0;
441 } 441 }
442 442
443 static struct dentry * configfs_lookup(struct inode *dir, 443 static struct dentry * configfs_lookup(struct inode *dir,
444 struct dentry *dentry, 444 struct dentry *dentry,
445 unsigned int flags) 445 unsigned int flags)
446 { 446 {
447 struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; 447 struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
448 struct configfs_dirent * sd; 448 struct configfs_dirent * sd;
449 int found = 0; 449 int found = 0;
450 int err; 450 int err;
451 451
452 /* 452 /*
453 * Fake invisibility if dir belongs to a group/default groups hierarchy 453 * Fake invisibility if dir belongs to a group/default groups hierarchy
454 * being attached 454 * being attached
455 * 455 *
456 * This prevents userspace from reading/writing attributes of items which 456 * This prevents userspace from reading/writing attributes of items which
457 * may not have completed their initialization, since the dentries of the 457 * may not have completed their initialization, since the dentries of the
458 * attributes won't be instantiated. 458 * attributes won't be instantiated.
459 */ 459 */
460 err = -ENOENT; 460 err = -ENOENT;
461 if (!configfs_dirent_is_ready(parent_sd)) 461 if (!configfs_dirent_is_ready(parent_sd))
462 goto out; 462 goto out;
463 463
464 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 464 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
465 if (sd->s_type & CONFIGFS_NOT_PINNED) { 465 if (sd->s_type & CONFIGFS_NOT_PINNED) {
466 const unsigned char * name = configfs_get_name(sd); 466 const unsigned char * name = configfs_get_name(sd);
467 467
468 if (strcmp(name, dentry->d_name.name)) 468 if (strcmp(name, dentry->d_name.name))
469 continue; 469 continue;
470 470
471 found = 1; 471 found = 1;
472 err = configfs_attach_attr(sd, dentry); 472 err = configfs_attach_attr(sd, dentry);
473 break; 473 break;
474 } 474 }
475 } 475 }
476 476
477 if (!found) { 477 if (!found) {
478 /* 478 /*
479 * If it doesn't exist and it isn't a NOT_PINNED item, 479 * If it doesn't exist and it isn't a NOT_PINNED item,
480 * it must be negative. 480 * it must be negative.
481 */ 481 */
482 if (dentry->d_name.len > NAME_MAX) 482 if (dentry->d_name.len > NAME_MAX)
483 return ERR_PTR(-ENAMETOOLONG); 483 return ERR_PTR(-ENAMETOOLONG);
484 d_add(dentry, NULL); 484 d_add(dentry, NULL);
485 return NULL; 485 return NULL;
486 } 486 }
487 487
488 out: 488 out:
489 return ERR_PTR(err); 489 return ERR_PTR(err);
490 } 490 }
491 491
492 /* 492 /*
493 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are 493 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
494 * attributes and are removed by rmdir(). We recurse, setting 494 * attributes and are removed by rmdir(). We recurse, setting
495 * CONFIGFS_USET_DROPPING on all children that are candidates for 495 * CONFIGFS_USET_DROPPING on all children that are candidates for
496 * default detach. 496 * default detach.
497 * If there is an error, the caller will reset the flags via 497 * If there is an error, the caller will reset the flags via
498 * configfs_detach_rollback(). 498 * configfs_detach_rollback().
499 */ 499 */
500 static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) 500 static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
501 { 501 {
502 struct configfs_dirent *parent_sd = dentry->d_fsdata; 502 struct configfs_dirent *parent_sd = dentry->d_fsdata;
503 struct configfs_dirent *sd; 503 struct configfs_dirent *sd;
504 int ret; 504 int ret;
505 505
506 /* Mark that we're trying to drop the group */ 506 /* Mark that we're trying to drop the group */
507 parent_sd->s_type |= CONFIGFS_USET_DROPPING; 507 parent_sd->s_type |= CONFIGFS_USET_DROPPING;
508 508
509 ret = -EBUSY; 509 ret = -EBUSY;
510 if (!list_empty(&parent_sd->s_links)) 510 if (!list_empty(&parent_sd->s_links))
511 goto out; 511 goto out;
512 512
513 ret = 0; 513 ret = 0;
514 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 514 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
515 if (!sd->s_element || 515 if (!sd->s_element ||
516 (sd->s_type & CONFIGFS_NOT_PINNED)) 516 (sd->s_type & CONFIGFS_NOT_PINNED))
517 continue; 517 continue;
518 if (sd->s_type & CONFIGFS_USET_DEFAULT) { 518 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
519 /* Abort if racing with mkdir() */ 519 /* Abort if racing with mkdir() */
520 if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { 520 if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
521 if (wait_mutex) 521 if (wait_mutex)
522 *wait_mutex = &sd->s_dentry->d_inode->i_mutex; 522 *wait_mutex = &sd->s_dentry->d_inode->i_mutex;
523 return -EAGAIN; 523 return -EAGAIN;
524 } 524 }
525 525
526 /* 526 /*
527 * Yup, recursive. If there's a problem, blame 527 * Yup, recursive. If there's a problem, blame
528 * deep nesting of default_groups 528 * deep nesting of default_groups
529 */ 529 */
530 ret = configfs_detach_prep(sd->s_dentry, wait_mutex); 530 ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
531 if (!ret) 531 if (!ret)
532 continue; 532 continue;
533 } else 533 } else
534 ret = -ENOTEMPTY; 534 ret = -ENOTEMPTY;
535 535
536 break; 536 break;
537 } 537 }
538 538
539 out: 539 out:
540 return ret; 540 return ret;
541 } 541 }
542 542
543 /* 543 /*
544 * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was 544 * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was
545 * set. 545 * set.
546 */ 546 */
547 static void configfs_detach_rollback(struct dentry *dentry) 547 static void configfs_detach_rollback(struct dentry *dentry)
548 { 548 {
549 struct configfs_dirent *parent_sd = dentry->d_fsdata; 549 struct configfs_dirent *parent_sd = dentry->d_fsdata;
550 struct configfs_dirent *sd; 550 struct configfs_dirent *sd;
551 551
552 parent_sd->s_type &= ~CONFIGFS_USET_DROPPING; 552 parent_sd->s_type &= ~CONFIGFS_USET_DROPPING;
553 553
554 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) 554 list_for_each_entry(sd, &parent_sd->s_children, s_sibling)
555 if (sd->s_type & CONFIGFS_USET_DEFAULT) 555 if (sd->s_type & CONFIGFS_USET_DEFAULT)
556 configfs_detach_rollback(sd->s_dentry); 556 configfs_detach_rollback(sd->s_dentry);
557 } 557 }
558 558
559 static void detach_attrs(struct config_item * item) 559 static void detach_attrs(struct config_item * item)
560 { 560 {
561 struct dentry * dentry = dget(item->ci_dentry); 561 struct dentry * dentry = dget(item->ci_dentry);
562 struct configfs_dirent * parent_sd; 562 struct configfs_dirent * parent_sd;
563 struct configfs_dirent * sd, * tmp; 563 struct configfs_dirent * sd, * tmp;
564 564
565 if (!dentry) 565 if (!dentry)
566 return; 566 return;
567 567
568 pr_debug("configfs %s: dropping attrs for dir\n", 568 pr_debug("configfs %s: dropping attrs for dir\n",
569 dentry->d_name.name); 569 dentry->d_name.name);
570 570
571 parent_sd = dentry->d_fsdata; 571 parent_sd = dentry->d_fsdata;
572 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { 572 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
573 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) 573 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
574 continue; 574 continue;
575 spin_lock(&configfs_dirent_lock); 575 spin_lock(&configfs_dirent_lock);
576 list_del_init(&sd->s_sibling); 576 list_del_init(&sd->s_sibling);
577 spin_unlock(&configfs_dirent_lock); 577 spin_unlock(&configfs_dirent_lock);
578 configfs_drop_dentry(sd, dentry); 578 configfs_drop_dentry(sd, dentry);
579 configfs_put(sd); 579 configfs_put(sd);
580 } 580 }
581 581
582 /** 582 /**
583 * Drop reference from dget() on entrance. 583 * Drop reference from dget() on entrance.
584 */ 584 */
585 dput(dentry); 585 dput(dentry);
586 } 586 }
587 587
588 static int populate_attrs(struct config_item *item) 588 static int populate_attrs(struct config_item *item)
589 { 589 {
590 struct config_item_type *t = item->ci_type; 590 struct config_item_type *t = item->ci_type;
591 struct configfs_attribute *attr; 591 struct configfs_attribute *attr;
592 int error = 0; 592 int error = 0;
593 int i; 593 int i;
594 594
595 if (!t) 595 if (!t)
596 return -EINVAL; 596 return -EINVAL;
597 if (t->ct_attrs) { 597 if (t->ct_attrs) {
598 for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { 598 for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
599 if ((error = configfs_create_file(item, attr))) 599 if ((error = configfs_create_file(item, attr)))
600 break; 600 break;
601 } 601 }
602 } 602 }
603 603
604 if (error) 604 if (error)
605 detach_attrs(item); 605 detach_attrs(item);
606 606
607 return error; 607 return error;
608 } 608 }
609 609
610 static int configfs_attach_group(struct config_item *parent_item, 610 static int configfs_attach_group(struct config_item *parent_item,
611 struct config_item *item, 611 struct config_item *item,
612 struct dentry *dentry); 612 struct dentry *dentry);
613 static void configfs_detach_group(struct config_item *item); 613 static void configfs_detach_group(struct config_item *item);
614 614
615 static void detach_groups(struct config_group *group) 615 static void detach_groups(struct config_group *group)
616 { 616 {
617 struct dentry * dentry = dget(group->cg_item.ci_dentry); 617 struct dentry * dentry = dget(group->cg_item.ci_dentry);
618 struct dentry *child; 618 struct dentry *child;
619 struct configfs_dirent *parent_sd; 619 struct configfs_dirent *parent_sd;
620 struct configfs_dirent *sd, *tmp; 620 struct configfs_dirent *sd, *tmp;
621 621
622 if (!dentry) 622 if (!dentry)
623 return; 623 return;
624 624
625 parent_sd = dentry->d_fsdata; 625 parent_sd = dentry->d_fsdata;
626 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { 626 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
627 if (!sd->s_element || 627 if (!sd->s_element ||
628 !(sd->s_type & CONFIGFS_USET_DEFAULT)) 628 !(sd->s_type & CONFIGFS_USET_DEFAULT))
629 continue; 629 continue;
630 630
631 child = sd->s_dentry; 631 child = sd->s_dentry;
632 632
633 mutex_lock(&child->d_inode->i_mutex); 633 mutex_lock(&child->d_inode->i_mutex);
634 634
635 configfs_detach_group(sd->s_element); 635 configfs_detach_group(sd->s_element);
636 child->d_inode->i_flags |= S_DEAD; 636 child->d_inode->i_flags |= S_DEAD;
637 dont_mount(child); 637 dont_mount(child);
638 638
639 mutex_unlock(&child->d_inode->i_mutex); 639 mutex_unlock(&child->d_inode->i_mutex);
640 640
641 d_delete(child); 641 d_delete(child);
642 dput(child); 642 dput(child);
643 } 643 }
644 644
645 /** 645 /**
646 * Drop reference from dget() on entrance. 646 * Drop reference from dget() on entrance.
647 */ 647 */
648 dput(dentry); 648 dput(dentry);
649 } 649 }
650 650
651 /* 651 /*
652 * This fakes mkdir(2) on a default_groups[] entry. It 652 * This fakes mkdir(2) on a default_groups[] entry. It
653 * creates a dentry, attaches it, and then does fixup 653 * creates a dentry, attaches it, and then does fixup
654 * on the sd->s_type. 654 * on the sd->s_type.
655 * 655 *
656 * We could, perhaps, tweak our parent's ->mkdir for a minute and 656 * We could, perhaps, tweak our parent's ->mkdir for a minute and
657 * try using vfs_mkdir. Just a thought. 657 * try using vfs_mkdir. Just a thought.
658 */ 658 */
659 static int create_default_group(struct config_group *parent_group, 659 static int create_default_group(struct config_group *parent_group,
660 struct config_group *group) 660 struct config_group *group)
661 { 661 {
662 int ret; 662 int ret;
663 struct qstr name; 663 struct qstr name;
664 struct configfs_dirent *sd; 664 struct configfs_dirent *sd;
665 /* We trust the caller holds a reference to parent */ 665 /* We trust the caller holds a reference to parent */
666 struct dentry *child, *parent = parent_group->cg_item.ci_dentry; 666 struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
667 667
668 if (!group->cg_item.ci_name) 668 if (!group->cg_item.ci_name)
669 group->cg_item.ci_name = group->cg_item.ci_namebuf; 669 group->cg_item.ci_name = group->cg_item.ci_namebuf;
670 name.name = group->cg_item.ci_name; 670 name.name = group->cg_item.ci_name;
671 name.len = strlen(name.name); 671 name.len = strlen(name.name);
672 name.hash = full_name_hash(name.name, name.len); 672 name.hash = full_name_hash(name.name, name.len);
673 673
674 ret = -ENOMEM; 674 ret = -ENOMEM;
675 child = d_alloc(parent, &name); 675 child = d_alloc(parent, &name);
676 if (child) { 676 if (child) {
677 d_add(child, NULL); 677 d_add(child, NULL);
678 678
679 ret = configfs_attach_group(&parent_group->cg_item, 679 ret = configfs_attach_group(&parent_group->cg_item,
680 &group->cg_item, child); 680 &group->cg_item, child);
681 if (!ret) { 681 if (!ret) {
682 sd = child->d_fsdata; 682 sd = child->d_fsdata;
683 sd->s_type |= CONFIGFS_USET_DEFAULT; 683 sd->s_type |= CONFIGFS_USET_DEFAULT;
684 } else { 684 } else {
685 BUG_ON(child->d_inode); 685 BUG_ON(child->d_inode);
686 d_drop(child); 686 d_drop(child);
687 dput(child); 687 dput(child);
688 } 688 }
689 } 689 }
690 690
691 return ret; 691 return ret;
692 } 692 }
693 693
694 static int populate_groups(struct config_group *group) 694 static int populate_groups(struct config_group *group)
695 { 695 {
696 struct config_group *new_group; 696 struct config_group *new_group;
697 int ret = 0; 697 int ret = 0;
698 int i; 698 int i;
699 699
700 if (group->default_groups) { 700 if (group->default_groups) {
701 for (i = 0; group->default_groups[i]; i++) { 701 for (i = 0; group->default_groups[i]; i++) {
702 new_group = group->default_groups[i]; 702 new_group = group->default_groups[i];
703 703
704 ret = create_default_group(group, new_group); 704 ret = create_default_group(group, new_group);
705 if (ret) { 705 if (ret) {
706 detach_groups(group); 706 detach_groups(group);
707 break; 707 break;
708 } 708 }
709 } 709 }
710 } 710 }
711 711
712 return ret; 712 return ret;
713 } 713 }
714 714
715 /* 715 /*
716 * All of link_obj/unlink_obj/link_group/unlink_group require that 716 * All of link_obj/unlink_obj/link_group/unlink_group require that
717 * subsys->su_mutex is held. 717 * subsys->su_mutex is held.
718 */ 718 */
719 719
720 static void unlink_obj(struct config_item *item) 720 static void unlink_obj(struct config_item *item)
721 { 721 {
722 struct config_group *group; 722 struct config_group *group;
723 723
724 group = item->ci_group; 724 group = item->ci_group;
725 if (group) { 725 if (group) {
726 list_del_init(&item->ci_entry); 726 list_del_init(&item->ci_entry);
727 727
728 item->ci_group = NULL; 728 item->ci_group = NULL;
729 item->ci_parent = NULL; 729 item->ci_parent = NULL;
730 730
731 /* Drop the reference for ci_entry */ 731 /* Drop the reference for ci_entry */
732 config_item_put(item); 732 config_item_put(item);
733 733
734 /* Drop the reference for ci_parent */ 734 /* Drop the reference for ci_parent */
735 config_group_put(group); 735 config_group_put(group);
736 } 736 }
737 } 737 }
738 738
739 static void link_obj(struct config_item *parent_item, struct config_item *item) 739 static void link_obj(struct config_item *parent_item, struct config_item *item)
740 { 740 {
741 /* 741 /*
742 * Parent seems redundant with group, but it makes certain 742 * Parent seems redundant with group, but it makes certain
743 * traversals much nicer. 743 * traversals much nicer.
744 */ 744 */
745 item->ci_parent = parent_item; 745 item->ci_parent = parent_item;
746 746
747 /* 747 /*
748 * We hold a reference on the parent for the child's ci_parent 748 * We hold a reference on the parent for the child's ci_parent
749 * link. 749 * link.
750 */ 750 */
751 item->ci_group = config_group_get(to_config_group(parent_item)); 751 item->ci_group = config_group_get(to_config_group(parent_item));
752 list_add_tail(&item->ci_entry, &item->ci_group->cg_children); 752 list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
753 753
754 /* 754 /*
755 * We hold a reference on the child for ci_entry on the parent's 755 * We hold a reference on the child for ci_entry on the parent's
756 * cg_children 756 * cg_children
757 */ 757 */
758 config_item_get(item); 758 config_item_get(item);
759 } 759 }
760 760
761 static void unlink_group(struct config_group *group) 761 static void unlink_group(struct config_group *group)
762 { 762 {
763 int i; 763 int i;
764 struct config_group *new_group; 764 struct config_group *new_group;
765 765
766 if (group->default_groups) { 766 if (group->default_groups) {
767 for (i = 0; group->default_groups[i]; i++) { 767 for (i = 0; group->default_groups[i]; i++) {
768 new_group = group->default_groups[i]; 768 new_group = group->default_groups[i];
769 unlink_group(new_group); 769 unlink_group(new_group);
770 } 770 }
771 } 771 }
772 772
773 group->cg_subsys = NULL; 773 group->cg_subsys = NULL;
774 unlink_obj(&group->cg_item); 774 unlink_obj(&group->cg_item);
775 } 775 }
776 776
777 static void link_group(struct config_group *parent_group, struct config_group *group) 777 static void link_group(struct config_group *parent_group, struct config_group *group)
778 { 778 {
779 int i; 779 int i;
780 struct config_group *new_group; 780 struct config_group *new_group;
781 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ 781 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
782 782
783 link_obj(&parent_group->cg_item, &group->cg_item); 783 link_obj(&parent_group->cg_item, &group->cg_item);
784 784
785 if (parent_group->cg_subsys) 785 if (parent_group->cg_subsys)
786 subsys = parent_group->cg_subsys; 786 subsys = parent_group->cg_subsys;
787 else if (configfs_is_root(&parent_group->cg_item)) 787 else if (configfs_is_root(&parent_group->cg_item))
788 subsys = to_configfs_subsystem(group); 788 subsys = to_configfs_subsystem(group);
789 else 789 else
790 BUG(); 790 BUG();
791 group->cg_subsys = subsys; 791 group->cg_subsys = subsys;
792 792
793 if (group->default_groups) { 793 if (group->default_groups) {
794 for (i = 0; group->default_groups[i]; i++) { 794 for (i = 0; group->default_groups[i]; i++) {
795 new_group = group->default_groups[i]; 795 new_group = group->default_groups[i];
796 link_group(group, new_group); 796 link_group(group, new_group);
797 } 797 }
798 } 798 }
799 } 799 }
800 800
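A hypothetical caller obeying the su_mutex rule stated before these four helpers; the real mkdir(2)/rmdir(2) paths do the equivalent:

/* Illustrative only: hold the subsystem mutex across linkage changes. */
static void example_link_locked(struct configfs_subsystem *subsys,
				struct config_group *parent,
				struct config_group *child)
{
	mutex_lock(&subsys->su_mutex);
	link_group(parent, child);
	mutex_unlock(&subsys->su_mutex);
}
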
801 /* 801 /*
802 * The goal is that configfs_attach_item() (and 802 * The goal is that configfs_attach_item() (and
803 * configfs_attach_group()) can be called from either the VFS or this 803 * configfs_attach_group()) can be called from either the VFS or this
804 * module. That is, they assume that the items have been created, 804 * module. That is, they assume that the items have been created,
805 * the dentry allocated, and the dcache is all ready to go. 805 * the dentry allocated, and the dcache is all ready to go.
806 * 806 *
807 * If they fail, they must clean up after themselves as if they 807 * If they fail, they must clean up after themselves as if they
808 * had never been called. The caller (VFS or local function) will 808 * had never been called. The caller (VFS or local function) will
809 * handle cleaning up the dcache bits. 809 * handle cleaning up the dcache bits.
810 * 810 *
811 * configfs_detach_group() and configfs_detach_item() behave similarly on 811 * configfs_detach_group() and configfs_detach_item() behave similarly on
812 * the way out. They assume that the proper semaphores are held, they 812 * the way out. They assume that the proper semaphores are held, they
813 * clean up the configfs items, and they expect their callers will 813 * clean up the configfs items, and they expect their callers will
814 * handle the dcache bits. 814 * handle the dcache bits.
815 */ 815 */
816 static int configfs_attach_item(struct config_item *parent_item, 816 static int configfs_attach_item(struct config_item *parent_item,
817 struct config_item *item, 817 struct config_item *item,
818 struct dentry *dentry) 818 struct dentry *dentry)
819 { 819 {
820 int ret; 820 int ret;
821 821
822 ret = configfs_create_dir(item, dentry); 822 ret = configfs_create_dir(item, dentry);
823 if (!ret) { 823 if (!ret) {
824 ret = populate_attrs(item); 824 ret = populate_attrs(item);
825 if (ret) { 825 if (ret) {
826 /* 826 /*
827 * We are going to remove an inode and its dentry but 827 * We are going to remove an inode and its dentry but
828 * the VFS may already have hit and used them. Thus, 828 * the VFS may already have hit and used them. Thus,
829 * we must lock them as rmdir() would. 829 * we must lock them as rmdir() would.
830 */ 830 */
831 mutex_lock(&dentry->d_inode->i_mutex); 831 mutex_lock(&dentry->d_inode->i_mutex);
832 configfs_remove_dir(item); 832 configfs_remove_dir(item);
833 dentry->d_inode->i_flags |= S_DEAD; 833 dentry->d_inode->i_flags |= S_DEAD;
834 dont_mount(dentry); 834 dont_mount(dentry);
835 mutex_unlock(&dentry->d_inode->i_mutex); 835 mutex_unlock(&dentry->d_inode->i_mutex);
836 d_delete(dentry); 836 d_delete(dentry);
837 } 837 }
838 } 838 }
839 839
840 return ret; 840 return ret;
841 } 841 }
842 842
843 /* Caller holds the mutex of the item's inode */ 843 /* Caller holds the mutex of the item's inode */
844 static void configfs_detach_item(struct config_item *item) 844 static void configfs_detach_item(struct config_item *item)
845 { 845 {
846 detach_attrs(item); 846 detach_attrs(item);
847 configfs_remove_dir(item); 847 configfs_remove_dir(item);
848 } 848 }
849 849
850 static int configfs_attach_group(struct config_item *parent_item, 850 static int configfs_attach_group(struct config_item *parent_item,
851 struct config_item *item, 851 struct config_item *item,
852 struct dentry *dentry) 852 struct dentry *dentry)
853 { 853 {
854 int ret; 854 int ret;
855 struct configfs_dirent *sd; 855 struct configfs_dirent *sd;
856 856
857 ret = configfs_attach_item(parent_item, item, dentry); 857 ret = configfs_attach_item(parent_item, item, dentry);
858 if (!ret) { 858 if (!ret) {
859 sd = dentry->d_fsdata; 859 sd = dentry->d_fsdata;
860 sd->s_type |= CONFIGFS_USET_DIR; 860 sd->s_type |= CONFIGFS_USET_DIR;
861 861
862 /* 862 /*
863 * FYI, we're faking mkdir in populate_groups() 863 * FYI, we're faking mkdir in populate_groups()
864 * We must lock the group's inode to avoid races with the VFS 864 * We must lock the group's inode to avoid races with the VFS
865 * which can already hit the inode and try to add/remove entries 865 * which can already hit the inode and try to add/remove entries
866 * under it. 866 * under it.
867 * 867 *
868 * We must also lock the inode to remove it safely in case of 868 * We must also lock the inode to remove it safely in case of
869 * error, as rmdir() would. 869 * error, as rmdir() would.
870 */ 870 */
871 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 871 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
872 configfs_adjust_dir_dirent_depth_before_populate(sd); 872 configfs_adjust_dir_dirent_depth_before_populate(sd);
873 ret = populate_groups(to_config_group(item)); 873 ret = populate_groups(to_config_group(item));
874 if (ret) { 874 if (ret) {
875 configfs_detach_item(item); 875 configfs_detach_item(item);
876 dentry->d_inode->i_flags |= S_DEAD; 876 dentry->d_inode->i_flags |= S_DEAD;
877 dont_mount(dentry); 877 dont_mount(dentry);
878 } 878 }
879 configfs_adjust_dir_dirent_depth_after_populate(sd); 879 configfs_adjust_dir_dirent_depth_after_populate(sd);
880 mutex_unlock(&dentry->d_inode->i_mutex); 880 mutex_unlock(&dentry->d_inode->i_mutex);
881 if (ret) 881 if (ret)
882 d_delete(dentry); 882 d_delete(dentry);
883 } 883 }
884 884
885 return ret; 885 return ret;
886 } 886 }
887 887
888 /* Caller holds the mutex of the group's inode */ 888 /* Caller holds the mutex of the group's inode */
889 static void configfs_detach_group(struct config_item *item) 889 static void configfs_detach_group(struct config_item *item)
890 { 890 {
891 detach_groups(to_config_group(item)); 891 detach_groups(to_config_group(item));
892 configfs_detach_item(item); 892 configfs_detach_item(item);
893 } 893 }
894 894
895 /* 895 /*
896 * After the item has been detached from the filesystem view, we are 896 * After the item has been detached from the filesystem view, we are
897 * ready to tear it out of the hierarchy. Notify the client before 897 * ready to tear it out of the hierarchy. Notify the client before
898 * we do that so they can perform any cleanup that requires 898 * we do that so they can perform any cleanup that requires
899 * navigating the hierarchy. A client does not need to provide this 899 * navigating the hierarchy. A client does not need to provide this
900 * callback. The subsystem semaphore MUST be held by the caller, and 900 * callback. The subsystem semaphore MUST be held by the caller, and
901 * references must be valid for both items. It also assumes the 901 * references must be valid for both items. It also assumes the
902 * caller has validated ci_type. 902 * caller has validated ci_type.
903 */ 903 */
904 static void client_disconnect_notify(struct config_item *parent_item, 904 static void client_disconnect_notify(struct config_item *parent_item,
905 struct config_item *item) 905 struct config_item *item)
906 { 906 {
907 struct config_item_type *type; 907 struct config_item_type *type;
908 908
909 type = parent_item->ci_type; 909 type = parent_item->ci_type;
910 BUG_ON(!type); 910 BUG_ON(!type);
911 911
912 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) 912 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
913 type->ct_group_ops->disconnect_notify(to_config_group(parent_item), 913 type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
914 item); 914 item);
915 } 915 }
916 916
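From the client's side, the callback invoked above arrives through struct configfs_group_operations. A minimal hypothetical provider, with an illustrative handler body:

static void example_disconnect_notify(struct config_group *group,
				      struct config_item *item)
{
	/* Last chance to navigate the hierarchy before 'item' is torn out. */
	pr_debug("configfs: %s detaching from %s\n",
		 config_item_name(item), config_item_name(&group->cg_item));
}

static struct configfs_group_operations example_group_ops = {
	.disconnect_notify	= example_disconnect_notify,
	/* if .drop_item is set, it must do the final config_item_put() */
};
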
917 /* 917 /*
918 * Drop the initial reference from make_item()/make_group() 918 * Drop the initial reference from make_item()/make_group()
919 * This function assumes that reference is held on item 919 * This function assumes that reference is held on item
920 * and that item holds a valid reference to the parent. Also, it 920 * and that item holds a valid reference to the parent. Also, it
921 * assumes the caller has validated ci_type. 921 * assumes the caller has validated ci_type.
922 */ 922 */
923 static void client_drop_item(struct config_item *parent_item, 923 static void client_drop_item(struct config_item *parent_item,
924 struct config_item *item) 924 struct config_item *item)
925 { 925 {
926 struct config_item_type *type; 926 struct config_item_type *type;
927 927
928 type = parent_item->ci_type; 928 type = parent_item->ci_type;
929 BUG_ON(!type); 929 BUG_ON(!type);
930 930
931 /* 931 /*
932 * If ->drop_item() exists, it is responsible for the 932 * If ->drop_item() exists, it is responsible for the
933 * config_item_put(). 933 * config_item_put().
934 */ 934 */
935 if (type->ct_group_ops && type->ct_group_ops->drop_item) 935 if (type->ct_group_ops && type->ct_group_ops->drop_item)
936 type->ct_group_ops->drop_item(to_config_group(parent_item), 936 type->ct_group_ops->drop_item(to_config_group(parent_item),
937 item); 937 item);
938 else 938 else
939 config_item_put(item); 939 config_item_put(item);
940 } 940 }
941 941
942 #ifdef DEBUG 942 #ifdef DEBUG
943 static void configfs_dump_one(struct configfs_dirent *sd, int level) 943 static void configfs_dump_one(struct configfs_dirent *sd, int level)
944 { 944 {
945 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); 945 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
946 946
947 #define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); 947 #define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
948 type_print(CONFIGFS_ROOT); 948 type_print(CONFIGFS_ROOT);
949 type_print(CONFIGFS_DIR); 949 type_print(CONFIGFS_DIR);
950 type_print(CONFIGFS_ITEM_ATTR); 950 type_print(CONFIGFS_ITEM_ATTR);
951 type_print(CONFIGFS_ITEM_LINK); 951 type_print(CONFIGFS_ITEM_LINK);
952 type_print(CONFIGFS_USET_DIR); 952 type_print(CONFIGFS_USET_DIR);
953 type_print(CONFIGFS_USET_DEFAULT); 953 type_print(CONFIGFS_USET_DEFAULT);
954 type_print(CONFIGFS_USET_DROPPING); 954 type_print(CONFIGFS_USET_DROPPING);
955 #undef type_print 955 #undef type_print
956 } 956 }
957 957
958 static int configfs_dump(struct configfs_dirent *sd, int level) 958 static int configfs_dump(struct configfs_dirent *sd, int level)
959 { 959 {
960 struct configfs_dirent *child_sd; 960 struct configfs_dirent *child_sd;
961 int ret = 0; 961 int ret = 0;
962 962
963 configfs_dump_one(sd, level); 963 configfs_dump_one(sd, level);
964 964
965 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT))) 965 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
966 return 0; 966 return 0;
967 967
968 list_for_each_entry(child_sd, &sd->s_children, s_sibling) { 968 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
969 ret = configfs_dump(child_sd, level + 2); 969 ret = configfs_dump(child_sd, level + 2);
970 if (ret) 970 if (ret)
971 break; 971 break;
972 } 972 }
973 973
974 return ret; 974 return ret;
975 } 975 }
976 #endif 976 #endif
977 977
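Nothing in this file calls the DEBUG-only dump helpers; a plausible hook, assuming the caller wants the child lists stable while walking them, might be:

/* Hypothetical debugging hook: dump the tree rooted at 'dentry'. */
static void example_dump_tree(struct dentry *dentry)
{
	spin_lock(&configfs_dirent_lock);
	configfs_dump(dentry->d_fsdata, 0);
	spin_unlock(&configfs_dirent_lock);
}
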
978 978
979 /* 979 /*
980 * configfs_depend_item() and configfs_undepend_item() 980 * configfs_depend_item() and configfs_undepend_item()
981 * 981 *
982 * WARNING: Do not call these from a configfs callback! 982 * WARNING: Do not call these from a configfs callback!
983 * 983 *
984 * This describes these functions and their helpers. 984 * This describes these functions and their helpers.
985 * 985 *
986 * Allow another kernel system to depend on a config_item. If this 986 * Allow another kernel system to depend on a config_item. If this
987 * happens, the item cannot go away until the dependent can live without 987 * happens, the item cannot go away until the dependent can live without
988 * it. The idea is to give client modules as simple an interface as 988 * it. The idea is to give client modules as simple an interface as
989 * possible. When a system asks them to depend on an item, they just 989 * possible. When a system asks them to depend on an item, they just
990 * call configfs_depend_item(). If the item is live and the client 990 * call configfs_depend_item(). If the item is live and the client
991 * driver is in good shape, we'll happily do the work for them. 991 * driver is in good shape, we'll happily do the work for them.
992 * 992 *
993 * Why is the locking complex? Because configfs uses the VFS to handle 993 * Why is the locking complex? Because configfs uses the VFS to handle
994 * all locking, but this function is called outside the normal 994 * all locking, but this function is called outside the normal
995 * VFS->configfs path. So it must take VFS locks to prevent the 995 * VFS->configfs path. So it must take VFS locks to prevent the
996 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is 996 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
997 * why you can't call these functions underneath configfs callbacks. 997 * why you can't call these functions underneath configfs callbacks.
998 * 998 *
999 * Note, btw, that this can be called at *any* time, even when a configfs 999 * Note, btw, that this can be called at *any* time, even when a configfs
1000 * subsystem isn't registered, or when configfs is loading or unloading. 1000 * subsystem isn't registered, or when configfs is loading or unloading.
1001 * Just like configfs_register_subsystem(). So we take the same 1001 * Just like configfs_register_subsystem(). So we take the same
1002 * precautions. We pin the filesystem. We lock configfs_dirent_lock. 1002 * precautions. We pin the filesystem. We lock configfs_dirent_lock.
1003 * If we can find the target item in the 1003 * If we can find the target item in the
1004 * configfs tree, it must be part of the subsystem tree as well, so we 1004 * configfs tree, it must be part of the subsystem tree as well, so we
1005 * do not need the subsystem semaphore. Holding configfs_dirent_lock helps 1005 * do not need the subsystem semaphore. Holding configfs_dirent_lock helps
1006 * lock out mkdir() and rmdir(), which might be racing us. 1006 * lock out mkdir() and rmdir(), which might be racing us.
1007 */ 1007 */
1008 1008
1009 /* 1009 /*
1010 * configfs_depend_prep() 1010 * configfs_depend_prep()
1011 * 1011 *
1012 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are 1012 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
1013 * attributes. This is similar to, but not the same as, configfs_detach_prep(). 1013 * attributes. This is similar to, but not the same as, configfs_detach_prep().
1014 * Note that configfs_detach_prep() expects the parent to be locked when it 1014 * Note that configfs_detach_prep() expects the parent to be locked when it
1015 * is called, but we lock the parent *inside* configfs_depend_prep(). We 1015 * is called, but we lock the parent *inside* configfs_depend_prep(). We
1016 * do that so we can unlock it if we find nothing. 1016 * do that so we can unlock it if we find nothing.
1017 * 1017 *
1018 * Here we do a depth-first search of the dentry hierarchy looking for 1018 * Here we do a depth-first search of the dentry hierarchy looking for
1019 * our object. 1019 * our object.
1020 * We deliberately ignore items tagged as dropping since they are virtually 1020 * We deliberately ignore items tagged as dropping since they are virtually
1021 * dead, as well as items in the middle of attachment since they virtually 1021 * dead, as well as items in the middle of attachment since they virtually
1022 * do not exist yet. This completes the locking out of racing mkdir() and 1022 * do not exist yet. This completes the locking out of racing mkdir() and
1023 * rmdir(). 1023 * rmdir().
1024 * Note: subdirectories in the middle of attachment start with s_type = 1024 * Note: subdirectories in the middle of attachment start with s_type =
1025 * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When 1025 * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When
1026 * CONFIGFS_USET_CREATING is set, we ignore the item. The actual setting 1026 * CONFIGFS_USET_CREATING is set, we ignore the item. The actual setting
1027 * of s_type happens in configfs_new_dirent(), which holds configfs_dirent_lock. 1027 * of s_type happens in configfs_new_dirent(), which holds configfs_dirent_lock.
1028 * 1028 *
1029 * If the target is not found, -ENOENT is bubbled up. 1029 * If the target is not found, -ENOENT is bubbled up.
1030 * 1030 *
1031 * This adds a requirement that all config_items be unique! 1031 * This adds a requirement that all config_items be unique!
1032 * 1032 *
1033 * This is recursive. There isn't 1033 * This is recursive. There isn't
1034 * much on the stack, though, so folks that need this function - be careful 1034 * much on the stack, though, so folks that need this function - be careful
1035 * about your stack! Patches will be accepted to make it iterative. 1035 * about your stack! Patches will be accepted to make it iterative.
1036 */ 1036 */
1037 static int configfs_depend_prep(struct dentry *origin, 1037 static int configfs_depend_prep(struct dentry *origin,
1038 struct config_item *target) 1038 struct config_item *target)
1039 { 1039 {
1040 struct configfs_dirent *child_sd, *sd = origin->d_fsdata; 1040 struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
1041 int ret = 0; 1041 int ret = 0;
1042 1042
1043 BUG_ON(!origin || !sd); 1043 BUG_ON(!origin || !sd);
1044 1044
1045 if (sd->s_element == target) /* Boo-yah */ 1045 if (sd->s_element == target) /* Boo-yah */
1046 goto out; 1046 goto out;
1047 1047
1048 list_for_each_entry(child_sd, &sd->s_children, s_sibling) { 1048 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
1049 if ((child_sd->s_type & CONFIGFS_DIR) && 1049 if ((child_sd->s_type & CONFIGFS_DIR) &&
1050 !(child_sd->s_type & CONFIGFS_USET_DROPPING) && 1050 !(child_sd->s_type & CONFIGFS_USET_DROPPING) &&
1051 !(child_sd->s_type & CONFIGFS_USET_CREATING)) { 1051 !(child_sd->s_type & CONFIGFS_USET_CREATING)) {
1052 ret = configfs_depend_prep(child_sd->s_dentry, 1052 ret = configfs_depend_prep(child_sd->s_dentry,
1053 target); 1053 target);
1054 if (!ret) 1054 if (!ret)
1055 goto out; /* Child path boo-yah */ 1055 goto out; /* Child path boo-yah */
1056 } 1056 }
1057 } 1057 }
1058 1058
1059 /* We looped all our children and didn't find target */ 1059 /* We looped all our children and didn't find target */
1060 ret = -ENOENT; 1060 ret = -ENOENT;
1061 1061
1062 out: 1062 out:
1063 return ret; 1063 return ret;
1064 } 1064 }
1065 1065
1066 int configfs_depend_item(struct configfs_subsystem *subsys, 1066 int configfs_depend_item(struct configfs_subsystem *subsys,
1067 struct config_item *target) 1067 struct config_item *target)
1068 { 1068 {
1069 int ret; 1069 int ret;
1070 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; 1070 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
1071 struct config_item *s_item = &subsys->su_group.cg_item; 1071 struct config_item *s_item = &subsys->su_group.cg_item;
1072 struct dentry *root; 1072 struct dentry *root;
1073 1073
1074 /* 1074 /*
1075 * Pin the configfs filesystem. This means we can safely access 1075 * Pin the configfs filesystem. This means we can safely access
1076 * the root of the configfs filesystem. 1076 * the root of the configfs filesystem.
1077 */ 1077 */
1078 root = configfs_pin_fs(); 1078 root = configfs_pin_fs();
1079 if (IS_ERR(root)) 1079 if (IS_ERR(root))
1080 return PTR_ERR(root); 1080 return PTR_ERR(root);
1081 1081
1082 /* 1082 /*
1083 * Next, lock the root directory. We're going to check that the 1083 * Next, lock the root directory. We're going to check that the
1084 * subsystem is really registered, and so we need to lock out 1084 * subsystem is really registered, and so we need to lock out
1085 * configfs_[un]register_subsystem(). 1085 * configfs_[un]register_subsystem().
1086 */ 1086 */
1087 mutex_lock(&root->d_inode->i_mutex); 1087 mutex_lock(&root->d_inode->i_mutex);
1088 1088
1089 root_sd = root->d_fsdata; 1089 root_sd = root->d_fsdata;
1090 1090
1091 list_for_each_entry(p, &root_sd->s_children, s_sibling) { 1091 list_for_each_entry(p, &root_sd->s_children, s_sibling) {
1092 if (p->s_type & CONFIGFS_DIR) { 1092 if (p->s_type & CONFIGFS_DIR) {
1093 if (p->s_element == s_item) { 1093 if (p->s_element == s_item) {
1094 subsys_sd = p; 1094 subsys_sd = p;
1095 break; 1095 break;
1096 } 1096 }
1097 } 1097 }
1098 } 1098 }
1099 1099
1100 if (!subsys_sd) { 1100 if (!subsys_sd) {
1101 ret = -ENOENT; 1101 ret = -ENOENT;
1102 goto out_unlock_fs; 1102 goto out_unlock_fs;
1103 } 1103 }
1104 1104
1105 /* Ok, now we can trust subsys/s_item */ 1105 /* Ok, now we can trust subsys/s_item */
1106 1106
1107 spin_lock(&configfs_dirent_lock); 1107 spin_lock(&configfs_dirent_lock);
1108 /* Scan the tree, return 0 if found */ 1108 /* Scan the tree, return 0 if found */
1109 ret = configfs_depend_prep(subsys_sd->s_dentry, target); 1109 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
1110 if (ret) 1110 if (ret)
1111 goto out_unlock_dirent_lock; 1111 goto out_unlock_dirent_lock;
1112 1112
1113 /* 1113 /*
1114 * We are sure that the item is not about to be removed by rmdir(), and 1114 * We are sure that the item is not about to be removed by rmdir(), and
1115 * not in the middle of attachment by mkdir(). 1115 * not in the middle of attachment by mkdir().
1116 */ 1116 */
1117 p = target->ci_dentry->d_fsdata; 1117 p = target->ci_dentry->d_fsdata;
1118 p->s_dependent_count += 1; 1118 p->s_dependent_count += 1;
1119 1119
1120 out_unlock_dirent_lock: 1120 out_unlock_dirent_lock:
1121 spin_unlock(&configfs_dirent_lock); 1121 spin_unlock(&configfs_dirent_lock);
1122 out_unlock_fs: 1122 out_unlock_fs:
1123 mutex_unlock(&root->d_inode->i_mutex); 1123 mutex_unlock(&root->d_inode->i_mutex);
1124 1124
1125 /* 1125 /*
1126 * If we succeeded, the fs is pinned via other methods. If not, 1126 * If we succeeded, the fs is pinned via other methods. If not,
1127 * we're done with it anyway. So release_fs() is always right. 1127 * we're done with it anyway. So release_fs() is always right.
1128 */ 1128 */
1129 configfs_release_fs(); 1129 configfs_release_fs();
1130 1130
1131 return ret; 1131 return ret;
1132 } 1132 }
1133 EXPORT_SYMBOL(configfs_depend_item); 1133 EXPORT_SYMBOL(configfs_depend_item);
1134 1134
1135 /* 1135 /*
1136 * Release the dependent linkage. This is much simpler than 1136 * Release the dependent linkage. This is much simpler than
1137 * configfs_depend_item() because we know that the client driver is 1137 * configfs_depend_item() because we know that the client driver is
1138 * pinned, thus the subsystem is pinned, and therefore configfs is pinned. 1138 * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
1139 */ 1139 */
1140 void configfs_undepend_item(struct configfs_subsystem *subsys, 1140 void configfs_undepend_item(struct configfs_subsystem *subsys,
1141 struct config_item *target) 1141 struct config_item *target)
1142 { 1142 {
1143 struct configfs_dirent *sd; 1143 struct configfs_dirent *sd;
1144 1144
1145 /* 1145 /*
1146 * Since we can trust everything is pinned, we just need 1146 * Since we can trust everything is pinned, we just need
1147 * configfs_dirent_lock. 1147 * configfs_dirent_lock.
1148 */ 1148 */
1149 spin_lock(&configfs_dirent_lock); 1149 spin_lock(&configfs_dirent_lock);
1150 1150
1151 sd = target->ci_dentry->d_fsdata; 1151 sd = target->ci_dentry->d_fsdata;
1152 BUG_ON(sd->s_dependent_count < 1); 1152 BUG_ON(sd->s_dependent_count < 1);
1153 1153
1154 sd->s_dependent_count -= 1; 1154 sd->s_dependent_count -= 1;
1155 1155
1156 /* 1156 /*
1157 * After this unlock, we cannot trust the item to stay alive! 1157 * After this unlock, we cannot trust the item to stay alive!
1158 * DO NOT REFERENCE item after this unlock. 1158 * DO NOT REFERENCE item after this unlock.
1159 */ 1159 */
1160 spin_unlock(&configfs_dirent_lock); 1160 spin_unlock(&configfs_dirent_lock);
1161 } 1161 }
1162 EXPORT_SYMBOL(configfs_undepend_item); 1162 EXPORT_SYMBOL(configfs_undepend_item);
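
Taken together, the two calls bracket the window in which a client can safely use an item: while the dependent count is raised, configfs_rmdir() on that item fails with -EBUSY. A hedged usage sketch (my_driver_use_item() is hypothetical; only the two exported calls are real):

    #include <linux/configfs.h>

    static int my_driver_use_item(struct configfs_subsystem *subsys,
                                  struct config_item *item)
    {
            int ret;

            ret = configfs_depend_item(subsys, item);
            if (ret)
                    return ret;   /* mid-mkdir, mid-rmdir, or not found */

            /* ... item is pinned against rmdir() here; use it ... */

            configfs_undepend_item(subsys, item);
            return 0;
    }
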
1163 1163
1164 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 1164 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1165 { 1165 {
1166 int ret = 0; 1166 int ret = 0;
1167 int module_got = 0; 1167 int module_got = 0;
1168 struct config_group *group = NULL; 1168 struct config_group *group = NULL;
1169 struct config_item *item = NULL; 1169 struct config_item *item = NULL;
1170 struct config_item *parent_item; 1170 struct config_item *parent_item;
1171 struct configfs_subsystem *subsys; 1171 struct configfs_subsystem *subsys;
1172 struct configfs_dirent *sd; 1172 struct configfs_dirent *sd;
1173 struct config_item_type *type; 1173 struct config_item_type *type;
1174 struct module *subsys_owner = NULL, *new_item_owner = NULL; 1174 struct module *subsys_owner = NULL, *new_item_owner = NULL;
1175 char *name; 1175 char *name;
1176 1176
1177 sd = dentry->d_parent->d_fsdata; 1177 sd = dentry->d_parent->d_fsdata;
1178 1178
1179 /* 1179 /*
1180 * Fake invisibility if dir belongs to a group/default groups hierarchy 1180 * Fake invisibility if dir belongs to a group/default groups hierarchy
1181 * being attached 1181 * being attached
1182 */ 1182 */
1183 if (!configfs_dirent_is_ready(sd)) { 1183 if (!configfs_dirent_is_ready(sd)) {
1184 ret = -ENOENT; 1184 ret = -ENOENT;
1185 goto out; 1185 goto out;
1186 } 1186 }
1187 1187
1188 if (!(sd->s_type & CONFIGFS_USET_DIR)) { 1188 if (!(sd->s_type & CONFIGFS_USET_DIR)) {
1189 ret = -EPERM; 1189 ret = -EPERM;
1190 goto out; 1190 goto out;
1191 } 1191 }
1192 1192
1193 /* Get a working ref for the duration of this function */ 1193 /* Get a working ref for the duration of this function */
1194 parent_item = configfs_get_config_item(dentry->d_parent); 1194 parent_item = configfs_get_config_item(dentry->d_parent);
1195 type = parent_item->ci_type; 1195 type = parent_item->ci_type;
1196 subsys = to_config_group(parent_item)->cg_subsys; 1196 subsys = to_config_group(parent_item)->cg_subsys;
1197 BUG_ON(!subsys); 1197 BUG_ON(!subsys);
1198 1198
1199 if (!type || !type->ct_group_ops || 1199 if (!type || !type->ct_group_ops ||
1200 (!type->ct_group_ops->make_group && 1200 (!type->ct_group_ops->make_group &&
1201 !type->ct_group_ops->make_item)) { 1201 !type->ct_group_ops->make_item)) {
1202 ret = -EPERM; /* Lack-of-mkdir returns -EPERM */ 1202 ret = -EPERM; /* Lack-of-mkdir returns -EPERM */
1203 goto out_put; 1203 goto out_put;
1204 } 1204 }
1205 1205
1206 /* 1206 /*
1207 * The subsystem may belong to a different module than the item 1207 * The subsystem may belong to a different module than the item
1208 * being created. We don't want to safely pin the new item but 1208 * being created. We don't want to safely pin the new item but
1209 * fail to pin the subsystem it sits under. 1209 * fail to pin the subsystem it sits under.
1210 */ 1210 */
1211 if (!subsys->su_group.cg_item.ci_type) { 1211 if (!subsys->su_group.cg_item.ci_type) {
1212 ret = -EINVAL; 1212 ret = -EINVAL;
1213 goto out_put; 1213 goto out_put;
1214 } 1214 }
1215 subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; 1215 subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
1216 if (!try_module_get(subsys_owner)) { 1216 if (!try_module_get(subsys_owner)) {
1217 ret = -EINVAL; 1217 ret = -EINVAL;
1218 goto out_put; 1218 goto out_put;
1219 } 1219 }
1220 1220
1221 name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); 1221 name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
1222 if (!name) { 1222 if (!name) {
1223 ret = -ENOMEM; 1223 ret = -ENOMEM;
1224 goto out_subsys_put; 1224 goto out_subsys_put;
1225 } 1225 }
1226 1226
1227 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); 1227 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
1228 1228
1229 mutex_lock(&subsys->su_mutex); 1229 mutex_lock(&subsys->su_mutex);
1230 if (type->ct_group_ops->make_group) { 1230 if (type->ct_group_ops->make_group) {
1231 group = type->ct_group_ops->make_group(to_config_group(parent_item), name); 1231 group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
1232 if (!group) 1232 if (!group)
1233 group = ERR_PTR(-ENOMEM); 1233 group = ERR_PTR(-ENOMEM);
1234 if (!IS_ERR(group)) { 1234 if (!IS_ERR(group)) {
1235 link_group(to_config_group(parent_item), group); 1235 link_group(to_config_group(parent_item), group);
1236 item = &group->cg_item; 1236 item = &group->cg_item;
1237 } else 1237 } else
1238 ret = PTR_ERR(group); 1238 ret = PTR_ERR(group);
1239 } else { 1239 } else {
1240 item = type->ct_group_ops->make_item(to_config_group(parent_item), name); 1240 item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
1241 if (!item) 1241 if (!item)
1242 item = ERR_PTR(-ENOMEM); 1242 item = ERR_PTR(-ENOMEM);
1243 if (!IS_ERR(item)) 1243 if (!IS_ERR(item))
1244 link_obj(parent_item, item); 1244 link_obj(parent_item, item);
1245 else 1245 else
1246 ret = PTR_ERR(item); 1246 ret = PTR_ERR(item);
1247 } 1247 }
1248 mutex_unlock(&subsys->su_mutex); 1248 mutex_unlock(&subsys->su_mutex);
1249 1249
1250 kfree(name); 1250 kfree(name);
1251 if (ret) { 1251 if (ret) {
1252 /* 1252 /*
1253 * If ret != 0, then link_obj() was never called. 1253 * If ret != 0, then link_obj() was never called.
1254 * There are no extra references to clean up. 1254 * There are no extra references to clean up.
1255 */ 1255 */
1256 goto out_subsys_put; 1256 goto out_subsys_put;
1257 } 1257 }
1258 1258
1259 /* 1259 /*
1260 * link_obj() has been called (via link_group() for groups). 1260 * link_obj() has been called (via link_group() for groups).
1261 * From here on out, errors must clean that up. 1261 * From here on out, errors must clean that up.
1262 */ 1262 */
1263 1263
1264 type = item->ci_type; 1264 type = item->ci_type;
1265 if (!type) { 1265 if (!type) {
1266 ret = -EINVAL; 1266 ret = -EINVAL;
1267 goto out_unlink; 1267 goto out_unlink;
1268 } 1268 }
1269 1269
1270 new_item_owner = type->ct_owner; 1270 new_item_owner = type->ct_owner;
1271 if (!try_module_get(new_item_owner)) { 1271 if (!try_module_get(new_item_owner)) {
1272 ret = -EINVAL; 1272 ret = -EINVAL;
1273 goto out_unlink; 1273 goto out_unlink;
1274 } 1274 }
1275 1275
1276 /* 1276 /*
1277 * I hate doing it this way, but if there is 1277 * I hate doing it this way, but if there is
1278 * an error, module_put() probably should 1278 * an error, module_put() probably should
1279 * happen after any cleanup. 1279 * happen after any cleanup.
1280 */ 1280 */
1281 module_got = 1; 1281 module_got = 1;
1282 1282
1283 /* 1283 /*
1284 * Make racing rmdir() fail if it did not tag parent with 1284 * Make racing rmdir() fail if it did not tag parent with
1285 * CONFIGFS_USET_DROPPING 1285 * CONFIGFS_USET_DROPPING
1286 * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will 1286 * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will
1287 * fail and let rmdir() terminate correctly 1287 * fail and let rmdir() terminate correctly
1288 */ 1288 */
1289 spin_lock(&configfs_dirent_lock); 1289 spin_lock(&configfs_dirent_lock);
1290 /* This will make configfs_detach_prep() fail */ 1290 /* This will make configfs_detach_prep() fail */
1291 sd->s_type |= CONFIGFS_USET_IN_MKDIR; 1291 sd->s_type |= CONFIGFS_USET_IN_MKDIR;
1292 spin_unlock(&configfs_dirent_lock); 1292 spin_unlock(&configfs_dirent_lock);
1293 1293
1294 if (group) 1294 if (group)
1295 ret = configfs_attach_group(parent_item, item, dentry); 1295 ret = configfs_attach_group(parent_item, item, dentry);
1296 else 1296 else
1297 ret = configfs_attach_item(parent_item, item, dentry); 1297 ret = configfs_attach_item(parent_item, item, dentry);
1298 1298
1299 spin_lock(&configfs_dirent_lock); 1299 spin_lock(&configfs_dirent_lock);
1300 sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; 1300 sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
1301 if (!ret) 1301 if (!ret)
1302 configfs_dir_set_ready(dentry->d_fsdata); 1302 configfs_dir_set_ready(dentry->d_fsdata);
1303 spin_unlock(&configfs_dirent_lock); 1303 spin_unlock(&configfs_dirent_lock);
1304 1304
1305 out_unlink: 1305 out_unlink:
1306 if (ret) { 1306 if (ret) {
1307 /* Tear down everything we built up */ 1307 /* Tear down everything we built up */
1308 mutex_lock(&subsys->su_mutex); 1308 mutex_lock(&subsys->su_mutex);
1309 1309
1310 client_disconnect_notify(parent_item, item); 1310 client_disconnect_notify(parent_item, item);
1311 if (group) 1311 if (group)
1312 unlink_group(group); 1312 unlink_group(group);
1313 else 1313 else
1314 unlink_obj(item); 1314 unlink_obj(item);
1315 client_drop_item(parent_item, item); 1315 client_drop_item(parent_item, item);
1316 1316
1317 mutex_unlock(&subsys->su_mutex); 1317 mutex_unlock(&subsys->su_mutex);
1318 1318
1319 if (module_got) 1319 if (module_got)
1320 module_put(new_item_owner); 1320 module_put(new_item_owner);
1321 } 1321 }
1322 1322
1323 out_subsys_put: 1323 out_subsys_put:
1324 if (ret) 1324 if (ret)
1325 module_put(subsys_owner); 1325 module_put(subsys_owner);
1326 1326
1327 out_put: 1327 out_put:
1328 /* 1328 /*
1329 * link_obj()/link_group() took a reference from child->parent, 1329 * link_obj()/link_group() took a reference from child->parent,
1330 * so the parent is safely pinned. We can drop our working 1330 * so the parent is safely pinned. We can drop our working
1331 * reference. 1331 * reference.
1332 */ 1332 */
1333 config_item_put(parent_item); 1333 config_item_put(parent_item);
1334 1334
1335 out: 1335 out:
1336 return ret; 1336 return ret;
1337 } 1337 }
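
The error handling above is the usual acquire-in-order, release-in-reverse goto ladder (out_unlink, then out_subsys_put, then out_put). Distilled into a self-contained userspace sketch, with get_res()/put_res() as hypothetical stand-ins for the two module pins:

    #include <stdbool.h>
    #include <stdio.h>

    static bool get_res(const char *name) { printf("get %s\n", name); return true; }
    static void put_res(const char *name) { printf("put %s\n", name); }

    static int setup_both(void)
    {
            int ret = 0;

            if (!get_res("subsys"))
                    return -1;
            if (!get_res("item")) {
                    ret = -1;
                    goto out_put_subsys;   /* undo only what succeeded */
            }
            return 0;                      /* success: both pinned */

    out_put_subsys:
            put_res("subsys");             /* release in reverse order */
            return ret;
    }

    int main(void) { return setup_both(); }
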
1338 1338
1339 static int configfs_rmdir(struct inode *dir, struct dentry *dentry) 1339 static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1340 { 1340 {
1341 struct config_item *parent_item; 1341 struct config_item *parent_item;
1342 struct config_item *item; 1342 struct config_item *item;
1343 struct configfs_subsystem *subsys; 1343 struct configfs_subsystem *subsys;
1344 struct configfs_dirent *sd; 1344 struct configfs_dirent *sd;
1345 struct module *subsys_owner = NULL, *dead_item_owner = NULL; 1345 struct module *subsys_owner = NULL, *dead_item_owner = NULL;
1346 int ret; 1346 int ret;
1347 1347
1348 sd = dentry->d_fsdata; 1348 sd = dentry->d_fsdata;
1349 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1349 if (sd->s_type & CONFIGFS_USET_DEFAULT)
1350 return -EPERM; 1350 return -EPERM;
1351 1351
1352 /* Get a working ref until we have the child */ 1352 /* Get a working ref until we have the child */
1353 parent_item = configfs_get_config_item(dentry->d_parent); 1353 parent_item = configfs_get_config_item(dentry->d_parent);
1354 subsys = to_config_group(parent_item)->cg_subsys; 1354 subsys = to_config_group(parent_item)->cg_subsys;
1355 BUG_ON(!subsys); 1355 BUG_ON(!subsys);
1356 1356
1357 if (!parent_item->ci_type) { 1357 if (!parent_item->ci_type) {
1358 config_item_put(parent_item); 1358 config_item_put(parent_item);
1359 return -EINVAL; 1359 return -EINVAL;
1360 } 1360 }
1361 1361
1362 /* configfs_mkdir() shouldn't have allowed this */ 1362 /* configfs_mkdir() shouldn't have allowed this */
1363 BUG_ON(!subsys->su_group.cg_item.ci_type); 1363 BUG_ON(!subsys->su_group.cg_item.ci_type);
1364 subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; 1364 subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
1365 1365
1366 /* 1366 /*
1367 * Ensure that no racing symlink() will make detach_prep() fail while 1367 * Ensure that no racing symlink() will make detach_prep() fail while
1368 * the new link is temporarily attached 1368 * the new link is temporarily attached
1369 */ 1369 */
1370 do { 1370 do {
1371 struct mutex *wait_mutex; 1371 struct mutex *wait_mutex;
1372 1372
1373 mutex_lock(&configfs_symlink_mutex); 1373 mutex_lock(&configfs_symlink_mutex);
1374 spin_lock(&configfs_dirent_lock); 1374 spin_lock(&configfs_dirent_lock);
1375 /* 1375 /*
1376 * Here's where we check for dependents. We're protected by 1376 * Here's where we check for dependents. We're protected by
1377 * configfs_dirent_lock. 1377 * configfs_dirent_lock.
1378 * If no dependent, atomically tag the item as dropping. 1378 * If no dependent, atomically tag the item as dropping.
1379 */ 1379 */
1380 ret = sd->s_dependent_count ? -EBUSY : 0; 1380 ret = sd->s_dependent_count ? -EBUSY : 0;
1381 if (!ret) { 1381 if (!ret) {
1382 ret = configfs_detach_prep(dentry, &wait_mutex); 1382 ret = configfs_detach_prep(dentry, &wait_mutex);
1383 if (ret) 1383 if (ret)
1384 configfs_detach_rollback(dentry); 1384 configfs_detach_rollback(dentry);
1385 } 1385 }
1386 spin_unlock(&configfs_dirent_lock); 1386 spin_unlock(&configfs_dirent_lock);
1387 mutex_unlock(&configfs_symlink_mutex); 1387 mutex_unlock(&configfs_symlink_mutex);
1388 1388
1389 if (ret) { 1389 if (ret) {
1390 if (ret != -EAGAIN) { 1390 if (ret != -EAGAIN) {
1391 config_item_put(parent_item); 1391 config_item_put(parent_item);
1392 return ret; 1392 return ret;
1393 } 1393 }
1394 1394
1395 /* Wait until the racing operation terminates */ 1395 /* Wait until the racing operation terminates */
1396 mutex_lock(wait_mutex); 1396 mutex_lock(wait_mutex);
1397 mutex_unlock(wait_mutex); 1397 mutex_unlock(wait_mutex);
1398 } 1398 }
1399 } while (ret == -EAGAIN); 1399 } while (ret == -EAGAIN);
1400 1400
1401 /* Get a working ref for the duration of this function */ 1401 /* Get a working ref for the duration of this function */
1402 item = configfs_get_config_item(dentry); 1402 item = configfs_get_config_item(dentry);
1403 1403
1404 /* Drop reference from above, item already holds one. */ 1404 /* Drop reference from above, item already holds one. */
1405 config_item_put(parent_item); 1405 config_item_put(parent_item);
1406 1406
1407 if (item->ci_type) 1407 if (item->ci_type)
1408 dead_item_owner = item->ci_type->ct_owner; 1408 dead_item_owner = item->ci_type->ct_owner;
1409 1409
1410 if (sd->s_type & CONFIGFS_USET_DIR) { 1410 if (sd->s_type & CONFIGFS_USET_DIR) {
1411 configfs_detach_group(item); 1411 configfs_detach_group(item);
1412 1412
1413 mutex_lock(&subsys->su_mutex); 1413 mutex_lock(&subsys->su_mutex);
1414 client_disconnect_notify(parent_item, item); 1414 client_disconnect_notify(parent_item, item);
1415 unlink_group(to_config_group(item)); 1415 unlink_group(to_config_group(item));
1416 } else { 1416 } else {
1417 configfs_detach_item(item); 1417 configfs_detach_item(item);
1418 1418
1419 mutex_lock(&subsys->su_mutex); 1419 mutex_lock(&subsys->su_mutex);
1420 client_disconnect_notify(parent_item, item); 1420 client_disconnect_notify(parent_item, item);
1421 unlink_obj(item); 1421 unlink_obj(item);
1422 } 1422 }
1423 1423
1424 client_drop_item(parent_item, item); 1424 client_drop_item(parent_item, item);
1425 mutex_unlock(&subsys->su_mutex); 1425 mutex_unlock(&subsys->su_mutex);
1426 1426
1427 /* Drop our reference from above */ 1427 /* Drop our reference from above */
1428 config_item_put(item); 1428 config_item_put(item);
1429 1429
1430 module_put(dead_item_owner); 1430 module_put(dead_item_owner);
1431 module_put(subsys_owner); 1431 module_put(subsys_owner);
1432 1432
1433 return 0; 1433 return 0;
1434 } 1434 }
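
The -EAGAIN path above uses a bare lock/unlock pair purely as a barrier: the racing operation holds wait_mutex until it finishes, so taking and immediately dropping it just waits the racer out. The same idiom, sketched with pthreads:

    #include <pthread.h>

    /* Block until whoever holds *m releases it; we never need the lock. */
    static void wait_for_holder(pthread_mutex_t *m)
    {
            pthread_mutex_lock(m);     /* cannot succeed until the racer is done */
            pthread_mutex_unlock(m);   /* nothing to protect; release at once */
    }
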
1435 1435
1436 const struct inode_operations configfs_dir_inode_operations = { 1436 const struct inode_operations configfs_dir_inode_operations = {
1437 .mkdir = configfs_mkdir, 1437 .mkdir = configfs_mkdir,
1438 .rmdir = configfs_rmdir, 1438 .rmdir = configfs_rmdir,
1439 .symlink = configfs_symlink, 1439 .symlink = configfs_symlink,
1440 .unlink = configfs_unlink, 1440 .unlink = configfs_unlink,
1441 .lookup = configfs_lookup, 1441 .lookup = configfs_lookup,
1442 .setattr = configfs_setattr, 1442 .setattr = configfs_setattr,
1443 }; 1443 };
1444 1444
1445 const struct inode_operations configfs_root_inode_operations = { 1445 const struct inode_operations configfs_root_inode_operations = {
1446 .lookup = configfs_lookup, 1446 .lookup = configfs_lookup,
1447 .setattr = configfs_setattr, 1447 .setattr = configfs_setattr,
1448 }; 1448 };
1449 1449
1450 #if 0 1450 #if 0
1451 int configfs_rename_dir(struct config_item * item, const char *new_name) 1451 int configfs_rename_dir(struct config_item * item, const char *new_name)
1452 { 1452 {
1453 int error = 0; 1453 int error = 0;
1454 struct dentry * new_dentry, * parent; 1454 struct dentry * new_dentry, * parent;
1455 1455
1456 if (!strcmp(config_item_name(item), new_name)) 1456 if (!strcmp(config_item_name(item), new_name))
1457 return -EINVAL; 1457 return -EINVAL;
1458 1458
1459 if (!item->parent) 1459 if (!item->parent)
1460 return -EINVAL; 1460 return -EINVAL;
1461 1461
1462 down_write(&configfs_rename_sem); 1462 down_write(&configfs_rename_sem);
1463 parent = item->parent->dentry; 1463 parent = item->parent->dentry;
1464 1464
1465 mutex_lock(&parent->d_inode->i_mutex); 1465 mutex_lock(&parent->d_inode->i_mutex);
1466 1466
1467 new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); 1467 new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
1468 if (!IS_ERR(new_dentry)) { 1468 if (!IS_ERR(new_dentry)) {
1469 if (!new_dentry->d_inode) { 1469 if (!new_dentry->d_inode) {
1470 error = config_item_set_name(item, "%s", new_name); 1470 error = config_item_set_name(item, "%s", new_name);
1471 if (!error) { 1471 if (!error) {
1472 d_add(new_dentry, NULL); 1472 d_add(new_dentry, NULL);
1473 d_move(item->dentry, new_dentry); 1473 d_move(item->dentry, new_dentry);
1474 } 1474 }
1475 else 1475 else
1476 d_delete(new_dentry); 1476 d_delete(new_dentry);
1477 } else 1477 } else
1478 error = -EEXIST; 1478 error = -EEXIST;
1479 dput(new_dentry); 1479 dput(new_dentry);
1480 } 1480 }
1481 mutex_unlock(&parent->d_inode->i_mutex); 1481 mutex_unlock(&parent->d_inode->i_mutex);
1482 up_write(&configfs_rename_sem); 1482 up_write(&configfs_rename_sem);
1483 1483
1484 return error; 1484 return error;
1485 } 1485 }
1486 #endif 1486 #endif
1487 1487
1488 static int configfs_dir_open(struct inode *inode, struct file *file) 1488 static int configfs_dir_open(struct inode *inode, struct file *file)
1489 { 1489 {
1490 struct dentry * dentry = file->f_path.dentry; 1490 struct dentry * dentry = file->f_path.dentry;
1491 struct configfs_dirent * parent_sd = dentry->d_fsdata; 1491 struct configfs_dirent * parent_sd = dentry->d_fsdata;
1492 int err; 1492 int err;
1493 1493
1494 mutex_lock(&dentry->d_inode->i_mutex); 1494 mutex_lock(&dentry->d_inode->i_mutex);
1495 /* 1495 /*
1496 * Fake invisibility if dir belongs to a group/default groups hierarchy 1496 * Fake invisibility if dir belongs to a group/default groups hierarchy
1497 * being attached 1497 * being attached
1498 */ 1498 */
1499 err = -ENOENT; 1499 err = -ENOENT;
1500 if (configfs_dirent_is_ready(parent_sd)) { 1500 if (configfs_dirent_is_ready(parent_sd)) {
1501 file->private_data = configfs_new_dirent(parent_sd, NULL, 0); 1501 file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
1502 if (IS_ERR(file->private_data)) 1502 if (IS_ERR(file->private_data))
1503 err = PTR_ERR(file->private_data); 1503 err = PTR_ERR(file->private_data);
1504 else 1504 else
1505 err = 0; 1505 err = 0;
1506 } 1506 }
1507 mutex_unlock(&dentry->d_inode->i_mutex); 1507 mutex_unlock(&dentry->d_inode->i_mutex);
1508 1508
1509 return err; 1509 return err;
1510 } 1510 }
1511 1511
1512 static int configfs_dir_close(struct inode *inode, struct file *file) 1512 static int configfs_dir_close(struct inode *inode, struct file *file)
1513 { 1513 {
1514 struct dentry * dentry = file->f_path.dentry; 1514 struct dentry * dentry = file->f_path.dentry;
1515 struct configfs_dirent * cursor = file->private_data; 1515 struct configfs_dirent * cursor = file->private_data;
1516 1516
1517 mutex_lock(&dentry->d_inode->i_mutex); 1517 mutex_lock(&dentry->d_inode->i_mutex);
1518 spin_lock(&configfs_dirent_lock); 1518 spin_lock(&configfs_dirent_lock);
1519 list_del_init(&cursor->s_sibling); 1519 list_del_init(&cursor->s_sibling);
1520 spin_unlock(&configfs_dirent_lock); 1520 spin_unlock(&configfs_dirent_lock);
1521 mutex_unlock(&dentry->d_inode->i_mutex); 1521 mutex_unlock(&dentry->d_inode->i_mutex);
1522 1522
1523 release_configfs_dirent(cursor); 1523 release_configfs_dirent(cursor);
1524 1524
1525 return 0; 1525 return 0;
1526 } 1526 }
1527 1527
1528 /* Relationship between s_mode and the DT_xxx types */ 1528 /* Relationship between s_mode and the DT_xxx types */
1529 static inline unsigned char dt_type(struct configfs_dirent *sd) 1529 static inline unsigned char dt_type(struct configfs_dirent *sd)
1530 { 1530 {
1531 return (sd->s_mode >> 12) & 15; 1531 return (sd->s_mode >> 12) & 15;
1532 } 1532 }
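
The shift-and-mask works because the DT_* codes in dirent.h are defined as the S_IF* file-type bits shifted down by 12, so s_mode maps straight to a d_type. A standalone check of that assumption (glibc exposes DT_* under _DEFAULT_SOURCE):

    #define _DEFAULT_SOURCE
    #include <assert.h>
    #include <dirent.h>
    #include <sys/stat.h>

    int main(void)
    {
            assert(((S_IFDIR >> 12) & 15) == DT_DIR);   /* 0040000 >> 12 == 4 */
            assert(((S_IFREG >> 12) & 15) == DT_REG);   /* 0100000 >> 12 == 8 */
            assert(((S_IFLNK >> 12) & 15) == DT_LNK);   /* 0120000 >> 12 == 10 */
            return 0;
    }
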
1533 1533
1534 static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 1534 static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
1535 { 1535 {
1536 struct dentry *dentry = filp->f_path.dentry; 1536 struct dentry *dentry = filp->f_path.dentry;
1537 struct super_block *sb = dentry->d_sb; 1537 struct super_block *sb = dentry->d_sb;
1538 struct configfs_dirent * parent_sd = dentry->d_fsdata; 1538 struct configfs_dirent * parent_sd = dentry->d_fsdata;
1539 struct configfs_dirent *cursor = filp->private_data; 1539 struct configfs_dirent *cursor = filp->private_data;
1540 struct list_head *p, *q = &cursor->s_sibling; 1540 struct list_head *p, *q = &cursor->s_sibling;
1541 ino_t ino = 0; 1541 ino_t ino = 0;
1542 int i = filp->f_pos; 1542 int i = filp->f_pos;
1543 1543
1544 switch (i) { 1544 switch (i) {
1545 case 0: 1545 case 0:
1546 ino = dentry->d_inode->i_ino; 1546 ino = dentry->d_inode->i_ino;
1547 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 1547 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
1548 break; 1548 break;
1549 filp->f_pos++; 1549 filp->f_pos++;
1550 i++; 1550 i++;
1551 /* fallthrough */ 1551 /* fallthrough */
1552 case 1: 1552 case 1:
1553 ino = parent_ino(dentry); 1553 ino = parent_ino(dentry);
1554 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 1554 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
1555 break; 1555 break;
1556 filp->f_pos++; 1556 filp->f_pos++;
1557 i++; 1557 i++;
1558 /* fallthrough */ 1558 /* fallthrough */
1559 default: 1559 default:
1560 if (filp->f_pos == 2) { 1560 if (filp->f_pos == 2) {
1561 spin_lock(&configfs_dirent_lock); 1561 spin_lock(&configfs_dirent_lock);
1562 list_move(q, &parent_sd->s_children); 1562 list_move(q, &parent_sd->s_children);
1563 spin_unlock(&configfs_dirent_lock); 1563 spin_unlock(&configfs_dirent_lock);
1564 } 1564 }
1565 for (p=q->next; p!= &parent_sd->s_children; p=p->next) { 1565 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
1566 struct configfs_dirent *next; 1566 struct configfs_dirent *next;
1567 const char * name; 1567 const char * name;
1568 int len; 1568 int len;
1569 struct inode *inode = NULL; 1569 struct inode *inode = NULL;
1570 1570
1571 next = list_entry(p, struct configfs_dirent, 1571 next = list_entry(p, struct configfs_dirent,
1572 s_sibling); 1572 s_sibling);
1573 if (!next->s_element) 1573 if (!next->s_element)
1574 continue; 1574 continue;
1575 1575
1576 name = configfs_get_name(next); 1576 name = configfs_get_name(next);
1577 len = strlen(name); 1577 len = strlen(name);
1578 1578
1579 /* 1579 /*
1580 * We'll have a dentry and an inode for 1580 * We'll have a dentry and an inode for
1581 * PINNED items and for open attribute 1581 * PINNED items and for open attribute
1582 * files. We lock here to prevent a race 1582 * files. We lock here to prevent a race
1583 * with configfs_d_iput() clearing 1583 * with configfs_d_iput() clearing
1584 * s_dentry before calling iput(). 1584 * s_dentry before calling iput().
1585 * 1585 *
1586 * Why do we go to the trouble? If 1586 * Why do we go to the trouble? If
1587 * someone has an attribute file open, 1587 * someone has an attribute file open,
1588 * the inode number should match until 1588 * the inode number should match until
1589 * they close it. Beyond that, we don't 1589 * they close it. Beyond that, we don't
1590 * care. 1590 * care.
1591 */ 1591 */
1592 spin_lock(&configfs_dirent_lock); 1592 spin_lock(&configfs_dirent_lock);
1593 dentry = next->s_dentry; 1593 dentry = next->s_dentry;
1594 if (dentry) 1594 if (dentry)
1595 inode = dentry->d_inode; 1595 inode = dentry->d_inode;
1596 if (inode) 1596 if (inode)
1597 ino = inode->i_ino; 1597 ino = inode->i_ino;
1598 spin_unlock(&configfs_dirent_lock); 1598 spin_unlock(&configfs_dirent_lock);
1599 if (!inode) 1599 if (!inode)
1600 ino = iunique(sb, 2); 1600 ino = iunique(sb, 2);
1601 1601
1602 if (filldir(dirent, name, len, filp->f_pos, ino, 1602 if (filldir(dirent, name, len, filp->f_pos, ino,
1603 dt_type(next)) < 0) 1603 dt_type(next)) < 0)
1604 return 0; 1604 return 0;
1605 1605
1606 spin_lock(&configfs_dirent_lock); 1606 spin_lock(&configfs_dirent_lock);
1607 list_move(q, p); 1607 list_move(q, p);
1608 spin_unlock(&configfs_dirent_lock); 1608 spin_unlock(&configfs_dirent_lock);
1609 p = q; 1609 p = q;
1610 filp->f_pos++; 1610 filp->f_pos++;
1611 } 1611 }
1612 } 1612 }
1613 return 0; 1613 return 0;
1614 } 1614 }
1615 1615
1616 static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) 1616 static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
1617 { 1617 {
1618 struct dentry * dentry = file->f_path.dentry; 1618 struct dentry * dentry = file->f_path.dentry;
1619 1619
1620 mutex_lock(&dentry->d_inode->i_mutex); 1620 mutex_lock(&dentry->d_inode->i_mutex);
1621 switch (origin) { 1621 switch (whence) {
1622 case 1: 1622 case 1:
1623 offset += file->f_pos; 1623 offset += file->f_pos;
1624 case 0: 1624 case 0:
1625 if (offset >= 0) 1625 if (offset >= 0)
1626 break; 1626 break;
1627 default: 1627 default:
1628 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 1628 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
1629 return -EINVAL; 1629 return -EINVAL;
1630 } 1630 }
1631 if (offset != file->f_pos) { 1631 if (offset != file->f_pos) {
1632 file->f_pos = offset; 1632 file->f_pos = offset;
1633 if (file->f_pos >= 2) { 1633 if (file->f_pos >= 2) {
1634 struct configfs_dirent *sd = dentry->d_fsdata; 1634 struct configfs_dirent *sd = dentry->d_fsdata;
1635 struct configfs_dirent *cursor = file->private_data; 1635 struct configfs_dirent *cursor = file->private_data;
1636 struct list_head *p; 1636 struct list_head *p;
1637 loff_t n = file->f_pos - 2; 1637 loff_t n = file->f_pos - 2;
1638 1638
1639 spin_lock(&configfs_dirent_lock); 1639 spin_lock(&configfs_dirent_lock);
1640 list_del(&cursor->s_sibling); 1640 list_del(&cursor->s_sibling);
1641 p = sd->s_children.next; 1641 p = sd->s_children.next;
1642 while (n && p != &sd->s_children) { 1642 while (n && p != &sd->s_children) {
1643 struct configfs_dirent *next; 1643 struct configfs_dirent *next;
1644 next = list_entry(p, struct configfs_dirent, 1644 next = list_entry(p, struct configfs_dirent,
1645 s_sibling); 1645 s_sibling);
1646 if (next->s_element) 1646 if (next->s_element)
1647 n--; 1647 n--;
1648 p = p->next; 1648 p = p->next;
1649 } 1649 }
1650 list_add_tail(&cursor->s_sibling, p); 1650 list_add_tail(&cursor->s_sibling, p);
1651 spin_unlock(&configfs_dirent_lock); 1651 spin_unlock(&configfs_dirent_lock);
1652 } 1652 }
1653 } 1653 }
1654 mutex_unlock(&dentry->d_inode->i_mutex); 1654 mutex_unlock(&dentry->d_inode->i_mutex);
1655 return offset; 1655 return offset;
1656 } 1656 }
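
The bare 0 and 1 in the switch above are the standard whence values: 0 is SEEK_SET (absolute) and 1 is SEEK_CUR (relative); SEEK_END falls through to the default case and yields -EINVAL. A small userspace sketch against a hypothetical directory path, using only the standard calls:

    #include <fcntl.h>
    #include <unistd.h>

    /* Open a directory and rewind its read position; returns the fd or -1. */
    int open_dir_rewound(const char *path)
    {
            int fd = open(path, O_RDONLY | O_DIRECTORY);
            if (fd < 0)
                    return -1;
            if (lseek(fd, 0, SEEK_SET) < 0) {   /* whence == 0 in the switch above */
                    close(fd);
                    return -1;
            }
            return fd;
    }
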
1657 1657
1658 const struct file_operations configfs_dir_operations = { 1658 const struct file_operations configfs_dir_operations = {
1659 .open = configfs_dir_open, 1659 .open = configfs_dir_open,
1660 .release = configfs_dir_close, 1660 .release = configfs_dir_close,
1661 .llseek = configfs_dir_lseek, 1661 .llseek = configfs_dir_lseek,
1662 .read = generic_read_dir, 1662 .read = generic_read_dir,
1663 .readdir = configfs_readdir, 1663 .readdir = configfs_readdir,
1664 }; 1664 };
1665 1665
1666 int configfs_register_subsystem(struct configfs_subsystem *subsys) 1666 int configfs_register_subsystem(struct configfs_subsystem *subsys)
1667 { 1667 {
1668 int err; 1668 int err;
1669 struct config_group *group = &subsys->su_group; 1669 struct config_group *group = &subsys->su_group;
1670 struct qstr name; 1670 struct qstr name;
1671 struct dentry *dentry; 1671 struct dentry *dentry;
1672 struct dentry *root; 1672 struct dentry *root;
1673 struct configfs_dirent *sd; 1673 struct configfs_dirent *sd;
1674 1674
1675 root = configfs_pin_fs(); 1675 root = configfs_pin_fs();
1676 if (IS_ERR(root)) 1676 if (IS_ERR(root))
1677 return PTR_ERR(root); 1677 return PTR_ERR(root);
1678 1678
1679 if (!group->cg_item.ci_name) 1679 if (!group->cg_item.ci_name)
1680 group->cg_item.ci_name = group->cg_item.ci_namebuf; 1680 group->cg_item.ci_name = group->cg_item.ci_namebuf;
1681 1681
1682 sd = root->d_fsdata; 1682 sd = root->d_fsdata;
1683 link_group(to_config_group(sd->s_element), group); 1683 link_group(to_config_group(sd->s_element), group);
1684 1684
1685 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); 1685 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
1686 1686
1687 name.name = group->cg_item.ci_name; 1687 name.name = group->cg_item.ci_name;
1688 name.len = strlen(name.name); 1688 name.len = strlen(name.name);
1689 name.hash = full_name_hash(name.name, name.len); 1689 name.hash = full_name_hash(name.name, name.len);
1690 1690
1691 err = -ENOMEM; 1691 err = -ENOMEM;
1692 dentry = d_alloc(root, &name); 1692 dentry = d_alloc(root, &name);
1693 if (dentry) { 1693 if (dentry) {
1694 d_add(dentry, NULL); 1694 d_add(dentry, NULL);
1695 1695
1696 err = configfs_attach_group(sd->s_element, &group->cg_item, 1696 err = configfs_attach_group(sd->s_element, &group->cg_item,
1697 dentry); 1697 dentry);
1698 if (err) { 1698 if (err) {
1699 BUG_ON(dentry->d_inode); 1699 BUG_ON(dentry->d_inode);
1700 d_drop(dentry); 1700 d_drop(dentry);
1701 dput(dentry); 1701 dput(dentry);
1702 } else { 1702 } else {
1703 spin_lock(&configfs_dirent_lock); 1703 spin_lock(&configfs_dirent_lock);
1704 configfs_dir_set_ready(dentry->d_fsdata); 1704 configfs_dir_set_ready(dentry->d_fsdata);
1705 spin_unlock(&configfs_dirent_lock); 1705 spin_unlock(&configfs_dirent_lock);
1706 } 1706 }
1707 } 1707 }
1708 1708
1709 mutex_unlock(&root->d_inode->i_mutex); 1709 mutex_unlock(&root->d_inode->i_mutex);
1710 1710
1711 if (err) { 1711 if (err) {
1712 unlink_group(group); 1712 unlink_group(group);
1713 configfs_release_fs(); 1713 configfs_release_fs();
1714 } 1714 }
1715 1715
1716 return err; 1716 return err;
1717 } 1717 }
1718 1718
1719 void configfs_unregister_subsystem(struct configfs_subsystem *subsys) 1719 void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1720 { 1720 {
1721 struct config_group *group = &subsys->su_group; 1721 struct config_group *group = &subsys->su_group;
1722 struct dentry *dentry = group->cg_item.ci_dentry; 1722 struct dentry *dentry = group->cg_item.ci_dentry;
1723 struct dentry *root = dentry->d_sb->s_root; 1723 struct dentry *root = dentry->d_sb->s_root;
1724 1724
1725 if (dentry->d_parent != root) { 1725 if (dentry->d_parent != root) {
1726 printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); 1726 printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
1727 return; 1727 return;
1728 } 1728 }
1729 1729
1730 mutex_lock_nested(&root->d_inode->i_mutex, 1730 mutex_lock_nested(&root->d_inode->i_mutex,
1731 I_MUTEX_PARENT); 1731 I_MUTEX_PARENT);
1732 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 1732 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
1733 mutex_lock(&configfs_symlink_mutex); 1733 mutex_lock(&configfs_symlink_mutex);
1734 spin_lock(&configfs_dirent_lock); 1734 spin_lock(&configfs_dirent_lock);
1735 if (configfs_detach_prep(dentry, NULL)) { 1735 if (configfs_detach_prep(dentry, NULL)) {
1736 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); 1736 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
1737 } 1737 }
1738 spin_unlock(&configfs_dirent_lock); 1738 spin_unlock(&configfs_dirent_lock);
1739 mutex_unlock(&configfs_symlink_mutex); 1739 mutex_unlock(&configfs_symlink_mutex);
1740 configfs_detach_group(&group->cg_item); 1740 configfs_detach_group(&group->cg_item);
1741 dentry->d_inode->i_flags |= S_DEAD; 1741 dentry->d_inode->i_flags |= S_DEAD;
1742 dont_mount(dentry); 1742 dont_mount(dentry);
1743 mutex_unlock(&dentry->d_inode->i_mutex); 1743 mutex_unlock(&dentry->d_inode->i_mutex);
1744 1744
1745 d_delete(dentry); 1745 d_delete(dentry);
1746 1746
1747 mutex_unlock(&root->d_inode->i_mutex); 1747 mutex_unlock(&root->d_inode->i_mutex);
1748 1748
1749 dput(dentry); 1749 dput(dentry);
1750 1750
1751 unlink_group(group); 1751 unlink_group(group);
1752 configfs_release_fs(); 1752 configfs_release_fs();
1753 } 1753 }
1754 1754
1755 EXPORT_SYMBOL(configfs_register_subsystem); 1755 EXPORT_SYMBOL(configfs_register_subsystem);
1756 EXPORT_SYMBOL(configfs_unregister_subsystem); 1756 EXPORT_SYMBOL(configfs_unregister_subsystem);
1757 1757
1 /* 1 /*
2 * linux/fs/ext3/dir.c 2 * linux/fs/ext3/dir.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/dir.c 11 * linux/fs/minix/dir.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * ext3 directory handling functions 15 * ext3 directory handling functions
16 * 16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips 20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 * 21 *
22 */ 22 */
23 23
24 #include <linux/compat.h> 24 #include <linux/compat.h>
25 #include "ext3.h" 25 #include "ext3.h"
26 26
27 static unsigned char ext3_filetype_table[] = { 27 static unsigned char ext3_filetype_table[] = {
28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
29 }; 29 };
30 30
31 static int ext3_dx_readdir(struct file * filp, 31 static int ext3_dx_readdir(struct file * filp,
32 void * dirent, filldir_t filldir); 32 void * dirent, filldir_t filldir);
33 33
34 static unsigned char get_dtype(struct super_block *sb, int filetype) 34 static unsigned char get_dtype(struct super_block *sb, int filetype)
35 { 35 {
36 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || 36 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
37 (filetype >= EXT3_FT_MAX)) 37 (filetype >= EXT3_FT_MAX))
38 return DT_UNKNOWN; 38 return DT_UNKNOWN;
39 39
40 return (ext3_filetype_table[filetype]); 40 return (ext3_filetype_table[filetype]);
41 } 41 }
42 42
43 /** 43 /**
44 * Check if the given dir-inode refers to an htree-indexed directory 44 * Check if the given dir-inode refers to an htree-indexed directory
45 * (or a directory which could potentially get converted to use htree 45 * (or a directory which could potentially get converted to use htree
46 * indexing). 46 * indexing).
47 * 47 *
48 * Return 1 if it is a dx dir, 0 if not 48 * Return 1 if it is a dx dir, 0 if not
49 */ 49 */
50 static int is_dx_dir(struct inode *inode) 50 static int is_dx_dir(struct inode *inode)
51 { 51 {
52 struct super_block *sb = inode->i_sb; 52 struct super_block *sb = inode->i_sb;
53 53
54 if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, 54 if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
55 EXT3_FEATURE_COMPAT_DIR_INDEX) && 55 EXT3_FEATURE_COMPAT_DIR_INDEX) &&
56 ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || 56 ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
57 ((inode->i_size >> sb->s_blocksize_bits) == 1))) 57 ((inode->i_size >> sb->s_blocksize_bits) == 1)))
58 return 1; 58 return 1;
59 59
60 return 0; 60 return 0;
61 } 61 }
62 62
63 int ext3_check_dir_entry (const char * function, struct inode * dir, 63 int ext3_check_dir_entry (const char * function, struct inode * dir,
64 struct ext3_dir_entry_2 * de, 64 struct ext3_dir_entry_2 * de,
65 struct buffer_head * bh, 65 struct buffer_head * bh,
66 unsigned long offset) 66 unsigned long offset)
67 { 67 {
68 const char * error_msg = NULL; 68 const char * error_msg = NULL;
69 const int rlen = ext3_rec_len_from_disk(de->rec_len); 69 const int rlen = ext3_rec_len_from_disk(de->rec_len);
70 70
71 if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) 71 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
72 error_msg = "rec_len is smaller than minimal"; 72 error_msg = "rec_len is smaller than minimal";
73 else if (unlikely(rlen % 4 != 0)) 73 else if (unlikely(rlen % 4 != 0))
74 error_msg = "rec_len % 4 != 0"; 74 error_msg = "rec_len % 4 != 0";
75 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) 75 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
76 error_msg = "rec_len is too small for name_len"; 76 error_msg = "rec_len is too small for name_len";
77 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) 77 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
78 error_msg = "directory entry across blocks"; 78 error_msg = "directory entry across blocks";
79 else if (unlikely(le32_to_cpu(de->inode) > 79 else if (unlikely(le32_to_cpu(de->inode) >
80 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) 80 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
81 error_msg = "inode out of bounds"; 81 error_msg = "inode out of bounds";
82 82
83 if (unlikely(error_msg != NULL)) 83 if (unlikely(error_msg != NULL))
84 ext3_error (dir->i_sb, function, 84 ext3_error (dir->i_sb, function,
85 "bad entry in directory #%lu: %s - " 85 "bad entry in directory #%lu: %s - "
86 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 86 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
87 dir->i_ino, error_msg, offset, 87 dir->i_ino, error_msg, offset,
88 (unsigned long) le32_to_cpu(de->inode), 88 (unsigned long) le32_to_cpu(de->inode),
89 rlen, de->name_len); 89 rlen, de->name_len);
90 90
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92 } 92 }
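
For intuition on the minimal-length check: an ext3 directory entry is an 8-byte header (4-byte inode, 2-byte rec_len, 1-byte name_len, 1-byte file_type) followed by the name, rounded up to a multiple of 4. Assuming that layout, EXT3_DIR_REC_LEN can be restated and checked standalone (DIR_REC_LEN below is a local restatement, not the kernel macro):

    #include <assert.h>

    #define DIR_REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)

    int main(void)
    {
            assert(DIR_REC_LEN(1) == 12);   /* smallest legal rec_len */
            assert(DIR_REC_LEN(4) == 12);   /* still fits in the same slot */
            assert(DIR_REC_LEN(5) == 16);   /* next 4-byte step */
            return 0;
    }
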
93 93
94 static int ext3_readdir(struct file * filp, 94 static int ext3_readdir(struct file * filp,
95 void * dirent, filldir_t filldir) 95 void * dirent, filldir_t filldir)
96 { 96 {
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned long offset;
99 int i, stored; 99 int i, stored;
100 struct ext3_dir_entry_2 *de; 100 struct ext3_dir_entry_2 *de;
101 int err; 101 int err;
102 struct inode *inode = filp->f_path.dentry->d_inode; 102 struct inode *inode = filp->f_path.dentry->d_inode;
103 struct super_block *sb = inode->i_sb; 103 struct super_block *sb = inode->i_sb;
104 int ret = 0; 104 int ret = 0;
105 int dir_has_error = 0; 105 int dir_has_error = 0;
106 106
107 if (is_dx_dir(inode)) { 107 if (is_dx_dir(inode)) {
108 err = ext3_dx_readdir(filp, dirent, filldir); 108 err = ext3_dx_readdir(filp, dirent, filldir);
109 if (err != ERR_BAD_DX_DIR) { 109 if (err != ERR_BAD_DX_DIR) {
110 ret = err; 110 ret = err;
111 goto out; 111 goto out;
112 } 112 }
113 /* 113 /*
114 * We don't set the inode dirty flag since it's not 114 * We don't set the inode dirty flag since it's not
115 * critical that it get flushed back to the disk. 115 * critical that it get flushed back to the disk.
116 */ 116 */
117 EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; 117 EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
118 } 118 }
119 stored = 0; 119 stored = 0;
120 offset = filp->f_pos & (sb->s_blocksize - 1); 120 offset = filp->f_pos & (sb->s_blocksize - 1);
121 121
122 while (!error && !stored && filp->f_pos < inode->i_size) { 122 while (!error && !stored && filp->f_pos < inode->i_size) {
123 unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); 123 unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
124 struct buffer_head map_bh; 124 struct buffer_head map_bh;
125 struct buffer_head *bh = NULL; 125 struct buffer_head *bh = NULL;
126 126
127 map_bh.b_state = 0; 127 map_bh.b_state = 0;
128 err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); 128 err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
129 if (err > 0) { 129 if (err > 0) {
130 pgoff_t index = map_bh.b_blocknr >> 130 pgoff_t index = map_bh.b_blocknr >>
131 (PAGE_CACHE_SHIFT - inode->i_blkbits); 131 (PAGE_CACHE_SHIFT - inode->i_blkbits);
132 if (!ra_has_index(&filp->f_ra, index)) 132 if (!ra_has_index(&filp->f_ra, index))
133 page_cache_sync_readahead( 133 page_cache_sync_readahead(
134 sb->s_bdev->bd_inode->i_mapping, 134 sb->s_bdev->bd_inode->i_mapping,
135 &filp->f_ra, filp, 135 &filp->f_ra, filp,
136 index, 1); 136 index, 1);
137 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 137 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
138 bh = ext3_bread(NULL, inode, blk, 0, &err); 138 bh = ext3_bread(NULL, inode, blk, 0, &err);
139 } 139 }
140 140
141 /* 141 /*
142 * We ignore I/O errors on directories so users have a chance 142 * We ignore I/O errors on directories so users have a chance
143 * of recovering data when there's a bad sector 143 * of recovering data when there's a bad sector
144 */ 144 */
145 if (!bh) { 145 if (!bh) {
146 if (!dir_has_error) { 146 if (!dir_has_error) {
147 ext3_error(sb, __func__, "directory #%lu " 147 ext3_error(sb, __func__, "directory #%lu "
148 "contains a hole at offset %lld", 148 "contains a hole at offset %lld",
149 inode->i_ino, filp->f_pos); 149 inode->i_ino, filp->f_pos);
150 dir_has_error = 1; 150 dir_has_error = 1;
151 } 151 }
152 /* corrupt size? Maybe no more blocks to read */ 152 /* corrupt size? Maybe no more blocks to read */
153 if (filp->f_pos > inode->i_blocks << 9) 153 if (filp->f_pos > inode->i_blocks << 9)
154 break; 154 break;
155 filp->f_pos += sb->s_blocksize - offset; 155 filp->f_pos += sb->s_blocksize - offset;
156 continue; 156 continue;
157 } 157 }
158 158
159 revalidate: 159 revalidate:
160 /* If the dir block has changed since the last call to 160 /* If the dir block has changed since the last call to
161 * readdir(2), then we might be pointing to an invalid 161 * readdir(2), then we might be pointing to an invalid
162 * dirent right now. Scan from the start of the block 162 * dirent right now. Scan from the start of the block
163 * to make sure. */ 163 * to make sure. */
164 if (filp->f_version != inode->i_version) { 164 if (filp->f_version != inode->i_version) {
165 for (i = 0; i < sb->s_blocksize && i < offset; ) { 165 for (i = 0; i < sb->s_blocksize && i < offset; ) {
166 de = (struct ext3_dir_entry_2 *) 166 de = (struct ext3_dir_entry_2 *)
167 (bh->b_data + i); 167 (bh->b_data + i);
168 /* It's too expensive to do a full 168 /* It's too expensive to do a full
169 * dirent test each time round this 169 * dirent test each time round this
170 * loop, but we do have to test at 170 * loop, but we do have to test at
171 * least that it is non-zero. A 171 * least that it is non-zero. A
172 * failure will be detected in the 172 * failure will be detected in the
173 * dirent test below. */ 173 * dirent test below. */
174 if (ext3_rec_len_from_disk(de->rec_len) < 174 if (ext3_rec_len_from_disk(de->rec_len) <
175 EXT3_DIR_REC_LEN(1)) 175 EXT3_DIR_REC_LEN(1))
176 break; 176 break;
177 i += ext3_rec_len_from_disk(de->rec_len); 177 i += ext3_rec_len_from_disk(de->rec_len);
178 } 178 }
179 offset = i; 179 offset = i;
180 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 180 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
181 | offset; 181 | offset;
182 filp->f_version = inode->i_version; 182 filp->f_version = inode->i_version;
183 } 183 }
184 184
185 while (!error && filp->f_pos < inode->i_size 185 while (!error && filp->f_pos < inode->i_size
186 && offset < sb->s_blocksize) { 186 && offset < sb->s_blocksize) {
187 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); 187 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
188 if (!ext3_check_dir_entry ("ext3_readdir", inode, de, 188 if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
189 bh, offset)) { 189 bh, offset)) {
190 /* On error, skip the f_pos to the 190 /* On error, skip the f_pos to the
191 next block. */ 191 next block. */
192 filp->f_pos = (filp->f_pos | 192 filp->f_pos = (filp->f_pos |
193 (sb->s_blocksize - 1)) + 1; 193 (sb->s_blocksize - 1)) + 1;
194 brelse (bh); 194 brelse (bh);
195 ret = stored; 195 ret = stored;
196 goto out; 196 goto out;
197 } 197 }
198 offset += ext3_rec_len_from_disk(de->rec_len); 198 offset += ext3_rec_len_from_disk(de->rec_len);
199 if (le32_to_cpu(de->inode)) { 199 if (le32_to_cpu(de->inode)) {
200 /* We might block in the next section 200 /* We might block in the next section
201 * if the data destination is 201 * if the data destination is
202 * currently swapped out. So, use a 202 * currently swapped out. So, use a
203 * version stamp to detect whether or 203 * version stamp to detect whether or
204 * not the directory has been modified 204 * not the directory has been modified
205 * during the copy operation. 205 * during the copy operation.
206 */ 206 */
207 u64 version = filp->f_version; 207 u64 version = filp->f_version;
208 208
209 error = filldir(dirent, de->name, 209 error = filldir(dirent, de->name,
210 de->name_len, 210 de->name_len,
211 filp->f_pos, 211 filp->f_pos,
212 le32_to_cpu(de->inode), 212 le32_to_cpu(de->inode),
213 get_dtype(sb, de->file_type)); 213 get_dtype(sb, de->file_type));
214 if (error) 214 if (error)
215 break; 215 break;
216 if (version != filp->f_version) 216 if (version != filp->f_version)
217 goto revalidate; 217 goto revalidate;
218 stored ++; 218 stored ++;
219 } 219 }
220 filp->f_pos += ext3_rec_len_from_disk(de->rec_len); 220 filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
221 } 221 }
222 offset = 0; 222 offset = 0;
223 brelse (bh); 223 brelse (bh);
224 } 224 }
225 out: 225 out:
226 return ret; 226 return ret;
227 } 227 }
228 228
229 static inline int is_32bit_api(void) 229 static inline int is_32bit_api(void)
230 { 230 {
231 #ifdef CONFIG_COMPAT 231 #ifdef CONFIG_COMPAT
232 return is_compat_task(); 232 return is_compat_task();
233 #else 233 #else
234 return (BITS_PER_LONG == 32); 234 return (BITS_PER_LONG == 32);
235 #endif 235 #endif
236 } 236 }
237 237
238 /* 238 /*
239 * These functions convert from the major/minor hash to an f_pos 239 * These functions convert from the major/minor hash to an f_pos
240 * value for dx directories 240 * value for dx directories
241 * 241 *
242 * Upper layer (for example NFS) should specify FMODE_32BITHASH or 242 * Upper layer (for example NFS) should specify FMODE_32BITHASH or
243 * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted 243 * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
244 * directly on both 32-bit and 64-bit nodes, in which case neither 244 * directly on both 32-bit and 64-bit nodes, in which case neither
245 * FMODE_32BITHASH nor FMODE_64BITHASH is specified. 245 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
246 */ 246 */
247 static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) 247 static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
248 { 248 {
249 if ((filp->f_mode & FMODE_32BITHASH) || 249 if ((filp->f_mode & FMODE_32BITHASH) ||
250 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 250 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
251 return major >> 1; 251 return major >> 1;
252 else 252 else
253 return ((__u64)(major >> 1) << 32) | (__u64)minor; 253 return ((__u64)(major >> 1) << 32) | (__u64)minor;
254 } 254 }
255 255
256 static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) 256 static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
257 { 257 {
258 if ((filp->f_mode & FMODE_32BITHASH) || 258 if ((filp->f_mode & FMODE_32BITHASH) ||
259 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 259 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
260 return (pos << 1) & 0xffffffff; 260 return (pos << 1) & 0xffffffff;
261 else 261 else
262 return ((pos >> 32) << 1) & 0xffffffff; 262 return ((pos >> 32) << 1) & 0xffffffff;
263 } 263 }
264 264
265 static inline __u32 pos2min_hash(struct file *filp, loff_t pos) 265 static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
266 { 266 {
267 if ((filp->f_mode & FMODE_32BITHASH) || 267 if ((filp->f_mode & FMODE_32BITHASH) ||
268 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 268 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
269 return 0; 269 return 0;
270 else 270 else
271 return pos & 0xffffffff; 271 return pos & 0xffffffff;
272 } 272 }
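
In the 64-bit branch, hash2pos() packs the major hash (minus its low bit) into the high 32 bits of f_pos and the minor hash into the low 32, and pos2maj_hash()/pos2min_hash() invert that. A standalone arithmetic check of the round trip (using an even major hash, since the low bit is deliberately dropped):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t major = 0xdeadbeee;    /* even: low bit unused */
            uint32_t minor = 0x12345678;
            uint64_t pos = ((uint64_t)(major >> 1) << 32) | minor;

            assert((((pos >> 32) << 1) & 0xffffffff) == major);   /* pos2maj_hash */
            assert((pos & 0xffffffff) == minor);                  /* pos2min_hash */
            return 0;
    }
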
273 273
274 /* 274 /*
275 * Return 32- or 64-bit end-of-file for dx directories 275 * Return 32- or 64-bit end-of-file for dx directories
276 */ 276 */
277 static inline loff_t ext3_get_htree_eof(struct file *filp) 277 static inline loff_t ext3_get_htree_eof(struct file *filp)
278 { 278 {
279 if ((filp->f_mode & FMODE_32BITHASH) || 279 if ((filp->f_mode & FMODE_32BITHASH) ||
280 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 280 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
281 return EXT3_HTREE_EOF_32BIT; 281 return EXT3_HTREE_EOF_32BIT;
282 else 282 else
283 return EXT3_HTREE_EOF_64BIT; 283 return EXT3_HTREE_EOF_64BIT;
284 } 284 }
285 285
286 286
287 /* 287 /*
288 * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both 288 * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
289 * non-htree and htree directories, where the "offset" is in terms 289 * non-htree and htree directories, where the "offset" is in terms
290 * of the filename hash value instead of the byte offset. 290 * of the filename hash value instead of the byte offset.
291 * 291 *
292 * Because we may return a 64-bit hash that is well beyond s_maxbytes, 292 * Because we may return a 64-bit hash that is well beyond s_maxbytes,
293 * we need to pass the max hash as the maximum allowable offset in 293 * we need to pass the max hash as the maximum allowable offset in
294 * the htree directory case. 294 * the htree directory case.
295 * 295 *
296 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) 296 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
297 * will be invalid once the directory has been converted into a dx directory 297 * will be invalid once the directory has been converted into a dx directory
298 */ 298 */
299 loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) 299 loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
300 { 300 {
301 struct inode *inode = file->f_mapping->host; 301 struct inode *inode = file->f_mapping->host;
302 int dx_dir = is_dx_dir(inode); 302 int dx_dir = is_dx_dir(inode);
303 loff_t htree_max = ext3_get_htree_eof(file); 303 loff_t htree_max = ext3_get_htree_eof(file);
304 304
305 if (likely(dx_dir)) 305 if (likely(dx_dir))
306 return generic_file_llseek_size(file, offset, origin, 306 return generic_file_llseek_size(file, offset, whence,
307 htree_max, htree_max); 307 htree_max, htree_max);
308 else 308 else
309 return generic_file_llseek(file, offset, origin); 309 return generic_file_llseek(file, offset, whence);
310 } 310 }
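
From userspace these hash offsets are only ever seen as the opaque cookies of telldir(3)/seekdir(3): a position saved and restored keeps working even when, on an htree directory, it encodes a filename hash rather than a byte offset. A minimal sketch:

    #include <dirent.h>
    #include <stdio.h>

    int main(void)
    {
            DIR *d = opendir(".");
            if (!d)
                    return 1;
            readdir(d);                     /* consume one entry */
            long pos = telldir(d);          /* opaque cookie: hash or byte offset */
            readdir(d);                     /* move past it */
            seekdir(d, pos);                /* resume at the saved cookie */
            printf("resumed at %ld\n", pos);
            closedir(d);
            return 0;
    }
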
311 311
312 /* 312 /*
313 * This structure holds the nodes of the red-black tree used to store 313 * This structure holds the nodes of the red-black tree used to store
314 * the directory entry in hash order. 314 * the directory entry in hash order.
315 */ 315 */
316 struct fname { 316 struct fname {
317 __u32 hash; 317 __u32 hash;
318 __u32 minor_hash; 318 __u32 minor_hash;
319 struct rb_node rb_hash; 319 struct rb_node rb_hash;
320 struct fname *next; 320 struct fname *next;
321 __u32 inode; 321 __u32 inode;
322 __u8 name_len; 322 __u8 name_len;
323 __u8 file_type; 323 __u8 file_type;
324 char name[0]; 324 char name[0];
325 }; 325 };
326 326
327 /* 327 /*
328 * This function implements a non-recursive way of freeing all of the 328 * This function implements a non-recursive way of freeing all of the
329 * nodes in the red-black tree. 329 * nodes in the red-black tree.
330 */ 330 */
331 static void free_rb_tree_fname(struct rb_root *root) 331 static void free_rb_tree_fname(struct rb_root *root)
332 { 332 {
333 struct rb_node *n = root->rb_node; 333 struct rb_node *n = root->rb_node;
334 struct rb_node *parent; 334 struct rb_node *parent;
335 struct fname *fname; 335 struct fname *fname;
336 336
337 while (n) { 337 while (n) {
338 /* Do the node's children first */ 338 /* Do the node's children first */
339 if (n->rb_left) { 339 if (n->rb_left) {
340 n = n->rb_left; 340 n = n->rb_left;
341 continue; 341 continue;
342 } 342 }
343 if (n->rb_right) { 343 if (n->rb_right) {
344 n = n->rb_right; 344 n = n->rb_right;
345 continue; 345 continue;
346 } 346 }
347 /* 347 /*
348 * The node has no children; free it, and then zero 348 * The node has no children; free it, and then zero
349 * out parent's link to it. Finally go to the 349 * out parent's link to it. Finally go to the
350 * beginning of the loop and try to free the parent 350 * beginning of the loop and try to free the parent
351 * node. 351 * node.
352 */ 352 */
353 parent = rb_parent(n); 353 parent = rb_parent(n);
354 fname = rb_entry(n, struct fname, rb_hash); 354 fname = rb_entry(n, struct fname, rb_hash);
355 while (fname) { 355 while (fname) {
356 struct fname *old = fname; 356 struct fname *old = fname;
357 fname = fname->next; 357 fname = fname->next;
358 kfree(old); 358 kfree(old);
359 } 359 }
360 if (!parent) 360 if (!parent)
361 *root = RB_ROOT; 361 *root = RB_ROOT;
362 else if (parent->rb_left == n) 362 else if (parent->rb_left == n)
363 parent->rb_left = NULL; 363 parent->rb_left = NULL;
364 else if (parent->rb_right == n) 364 else if (parent->rb_right == n)
365 parent->rb_right = NULL; 365 parent->rb_right = NULL;
366 n = parent; 366 n = parent;
367 } 367 }
368 } 368 }
369 369
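The walk above is easier to follow outside the rb-tree machinery: descend to a childless node, free it, null the parent's link so it is never revisited, then climb back up. A self-contained user-space sketch of the same bottom-up free, with a hypothetical struct node standing in for rb_node and its helpers:

    #include <stdlib.h>

    struct node {
        struct node *left, *right, *parent;
    };

    static void free_tree(struct node **root)
    {
        struct node *n = *root;

        while (n) {
            if (n->left) {          /* descend before freeing */
                n = n->left;
                continue;
            }
            if (n->right) {
                n = n->right;
                continue;
            }
            /* childless node: free it and clear the parent's
             * link so the loop never walks back into it */
            struct node *parent = n->parent;
            if (!parent)
                *root = NULL;
            else if (parent->left == n)
                parent->left = NULL;
            else
                parent->right = NULL;
            free(n);
            n = parent;
        }
    }

    int main(void)
    {
        /* build a root with two leaves, then free everything */
        struct node *r = calloc(1, sizeof(*r));
        r->left = calloc(1, sizeof(*r));
        r->right = calloc(1, sizeof(*r));
        r->left->parent = r->right->parent = r;
        free_tree(&r);
        return r ? 1 : 0;
    }

Because the parent link is nulled as each node is freed, the traversal needs no recursion and no auxiliary stack, so it is safe for arbitrarily deep trees.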
370 370
371 static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, 371 static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
372 loff_t pos) 372 loff_t pos)
373 { 373 {
374 struct dir_private_info *p; 374 struct dir_private_info *p;
375 375
376 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); 376 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
377 if (!p) 377 if (!p)
378 return NULL; 378 return NULL;
379 p->curr_hash = pos2maj_hash(filp, pos); 379 p->curr_hash = pos2maj_hash(filp, pos);
380 p->curr_minor_hash = pos2min_hash(filp, pos); 380 p->curr_minor_hash = pos2min_hash(filp, pos);
381 return p; 381 return p;
382 } 382 }
383 383
384 void ext3_htree_free_dir_info(struct dir_private_info *p) 384 void ext3_htree_free_dir_info(struct dir_private_info *p)
385 { 385 {
386 free_rb_tree_fname(&p->root); 386 free_rb_tree_fname(&p->root);
387 kfree(p); 387 kfree(p);
388 } 388 }
389 389
390 /* 390 /*
391 * Given a directory entry, enter it into the fname rb tree. 391 * Given a directory entry, enter it into the fname rb tree.
392 */ 392 */
393 int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, 393 int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
394 __u32 minor_hash, 394 __u32 minor_hash,
395 struct ext3_dir_entry_2 *dirent) 395 struct ext3_dir_entry_2 *dirent)
396 { 396 {
397 struct rb_node **p, *parent = NULL; 397 struct rb_node **p, *parent = NULL;
398 struct fname *fname, *new_fn; 398 struct fname *fname, *new_fn;
399 struct dir_private_info *info; 399 struct dir_private_info *info;
400 int len; 400 int len;
401 401
402 info = (struct dir_private_info *) dir_file->private_data; 402 info = (struct dir_private_info *) dir_file->private_data;
403 p = &info->root.rb_node; 403 p = &info->root.rb_node;
404 404
405 /* Allocate and fill in the fname structure */ 405 /* Allocate and fill in the fname structure */
406 len = sizeof(struct fname) + dirent->name_len + 1; 406 len = sizeof(struct fname) + dirent->name_len + 1;
407 new_fn = kzalloc(len, GFP_KERNEL); 407 new_fn = kzalloc(len, GFP_KERNEL);
408 if (!new_fn) 408 if (!new_fn)
409 return -ENOMEM; 409 return -ENOMEM;
410 new_fn->hash = hash; 410 new_fn->hash = hash;
411 new_fn->minor_hash = minor_hash; 411 new_fn->minor_hash = minor_hash;
412 new_fn->inode = le32_to_cpu(dirent->inode); 412 new_fn->inode = le32_to_cpu(dirent->inode);
413 new_fn->name_len = dirent->name_len; 413 new_fn->name_len = dirent->name_len;
414 new_fn->file_type = dirent->file_type; 414 new_fn->file_type = dirent->file_type;
415 memcpy(new_fn->name, dirent->name, dirent->name_len); 415 memcpy(new_fn->name, dirent->name, dirent->name_len);
416 new_fn->name[dirent->name_len] = 0; 416 new_fn->name[dirent->name_len] = 0;
417 417
418 while (*p) { 418 while (*p) {
419 parent = *p; 419 parent = *p;
420 fname = rb_entry(parent, struct fname, rb_hash); 420 fname = rb_entry(parent, struct fname, rb_hash);
421 421
422 /* 422 /*
423 * If the hash and minor hash match up, then we put 423 * If the hash and minor hash match up, then we put
424 * them on a linked list. This rarely happens... 424 * them on a linked list. This rarely happens...
425 */ 425 */
426 if ((new_fn->hash == fname->hash) && 426 if ((new_fn->hash == fname->hash) &&
427 (new_fn->minor_hash == fname->minor_hash)) { 427 (new_fn->minor_hash == fname->minor_hash)) {
428 new_fn->next = fname->next; 428 new_fn->next = fname->next;
429 fname->next = new_fn; 429 fname->next = new_fn;
430 return 0; 430 return 0;
431 } 431 }
432 432
433 if (new_fn->hash < fname->hash) 433 if (new_fn->hash < fname->hash)
434 p = &(*p)->rb_left; 434 p = &(*p)->rb_left;
435 else if (new_fn->hash > fname->hash) 435 else if (new_fn->hash > fname->hash)
436 p = &(*p)->rb_right; 436 p = &(*p)->rb_right;
437 else if (new_fn->minor_hash < fname->minor_hash) 437 else if (new_fn->minor_hash < fname->minor_hash)
438 p = &(*p)->rb_left; 438 p = &(*p)->rb_left;
439 else /* if (new_fn->minor_hash > fname->minor_hash) */ 439 else /* if (new_fn->minor_hash > fname->minor_hash) */
440 p = &(*p)->rb_right; 440 p = &(*p)->rb_right;
441 } 441 }
442 442
443 rb_link_node(&new_fn->rb_hash, parent, p); 443 rb_link_node(&new_fn->rb_hash, parent, p);
444 rb_insert_color(&new_fn->rb_hash, &info->root); 444 rb_insert_color(&new_fn->rb_hash, &info->root);
445 return 0; 445 return 0;
446 } 446 }
447 447
448 448
449 449
450 /* 450 /*
451 * This is a helper function for ext3_dx_readdir. It calls filldir 451 * This is a helper function for ext3_dx_readdir. It calls filldir
452 * for all entries on the fname linked list. (Normally there is only 452 * for all entries on the fname linked list. (Normally there is only
453 * one entry on the linked list, unless there are 62 bit hash collisions.) 453 * one entry on the linked list, unless there are 62 bit hash collisions.)
454 */ 454 */
455 static int call_filldir(struct file *filp, void *dirent, 455 static int call_filldir(struct file *filp, void *dirent,
456 filldir_t filldir, struct fname *fname) 456 filldir_t filldir, struct fname *fname)
457 { 457 {
458 struct dir_private_info *info = filp->private_data; 458 struct dir_private_info *info = filp->private_data;
459 loff_t curr_pos; 459 loff_t curr_pos;
460 struct inode *inode = filp->f_path.dentry->d_inode; 460 struct inode *inode = filp->f_path.dentry->d_inode;
461 struct super_block *sb; 461 struct super_block *sb;
462 int error; 462 int error;
463 463
464 sb = inode->i_sb; 464 sb = inode->i_sb;
465 465
466 if (!fname) { 466 if (!fname) {
467 printk("call_filldir: called with null fname?!?\n"); 467 printk("call_filldir: called with null fname?!?\n");
468 return 0; 468 return 0;
469 } 469 }
470 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); 470 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
471 while (fname) { 471 while (fname) {
472 error = filldir(dirent, fname->name, 472 error = filldir(dirent, fname->name,
473 fname->name_len, curr_pos, 473 fname->name_len, curr_pos,
474 fname->inode, 474 fname->inode,
475 get_dtype(sb, fname->file_type)); 475 get_dtype(sb, fname->file_type));
476 if (error) { 476 if (error) {
477 filp->f_pos = curr_pos; 477 filp->f_pos = curr_pos;
478 info->extra_fname = fname; 478 info->extra_fname = fname;
479 return error; 479 return error;
480 } 480 }
481 fname = fname->next; 481 fname = fname->next;
482 } 482 }
483 return 0; 483 return 0;
484 } 484 }
485 485
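The contract call_filldir depends on: the filldir callback returns 0 to keep going and non-zero to stop, at which point the current hash position and the unconsumed fname are saved so the next readdir call resumes mid-chain. A user-space sketch of a callback with that shape — demo_buf and the trimmed-down signature are illustrative; the kernel's filldir_t carries more arguments:

    #include <stdio.h>
    #include <string.h>

    struct demo_buf {
        char *dst;
        size_t room;
    };

    /* Copy one name into the buffer; report "stop" by returning
     * non-zero once the buffer is full, just as a real filldir
     * does when the user's getdents buffer runs out. */
    static int demo_filldir(void *opaque, const char *name, int namlen)
    {
        struct demo_buf *buf = opaque;

        if ((size_t)namlen + 1 > buf->room)
            return -1;      /* caller saves its position and stops */
        memcpy(buf->dst, name, namlen);
        buf->dst[namlen] = '\0';
        buf->dst += namlen + 1;
        buf->room -= namlen + 1;
        return 0;           /* keep going */
    }

    int main(void)
    {
        char out[16];
        struct demo_buf buf = { out, sizeof(out) };
        const char *names[] = { "alpha", "beta", "gamma" };

        for (int i = 0; i < 3; i++)
            if (demo_filldir(&buf, names[i], (int)strlen(names[i])))
                printf("stopped at %s\n", names[i]);
        return 0;
    }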
486 static int ext3_dx_readdir(struct file * filp, 486 static int ext3_dx_readdir(struct file * filp,
487 void * dirent, filldir_t filldir) 487 void * dirent, filldir_t filldir)
488 { 488 {
489 struct dir_private_info *info = filp->private_data; 489 struct dir_private_info *info = filp->private_data;
490 struct inode *inode = filp->f_path.dentry->d_inode; 490 struct inode *inode = filp->f_path.dentry->d_inode;
491 struct fname *fname; 491 struct fname *fname;
492 int ret; 492 int ret;
493 493
494 if (!info) { 494 if (!info) {
495 info = ext3_htree_create_dir_info(filp, filp->f_pos); 495 info = ext3_htree_create_dir_info(filp, filp->f_pos);
496 if (!info) 496 if (!info)
497 return -ENOMEM; 497 return -ENOMEM;
498 filp->private_data = info; 498 filp->private_data = info;
499 } 499 }
500 500
501 if (filp->f_pos == ext3_get_htree_eof(filp)) 501 if (filp->f_pos == ext3_get_htree_eof(filp))
502 return 0; /* EOF */ 502 return 0; /* EOF */
503 503
504 /* Someone has messed with f_pos; reset the world */ 504 /* Someone has messed with f_pos; reset the world */
505 if (info->last_pos != filp->f_pos) { 505 if (info->last_pos != filp->f_pos) {
506 free_rb_tree_fname(&info->root); 506 free_rb_tree_fname(&info->root);
507 info->curr_node = NULL; 507 info->curr_node = NULL;
508 info->extra_fname = NULL; 508 info->extra_fname = NULL;
509 info->curr_hash = pos2maj_hash(filp, filp->f_pos); 509 info->curr_hash = pos2maj_hash(filp, filp->f_pos);
510 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); 510 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
511 } 511 }
512 512
513 /* 513 /*
514 * If there are any leftover names on the hash collision 514 * If there are any leftover names on the hash collision
515 * chain, return them first. 515 * chain, return them first.
516 */ 516 */
517 if (info->extra_fname) { 517 if (info->extra_fname) {
518 if (call_filldir(filp, dirent, filldir, info->extra_fname)) 518 if (call_filldir(filp, dirent, filldir, info->extra_fname))
519 goto finished; 519 goto finished;
520 info->extra_fname = NULL; 520 info->extra_fname = NULL;
521 goto next_node; 521 goto next_node;
522 } else if (!info->curr_node) 522 } else if (!info->curr_node)
523 info->curr_node = rb_first(&info->root); 523 info->curr_node = rb_first(&info->root);
524 524
525 while (1) { 525 while (1) {
526 /* 526 /*
527 * Fill the rbtree if we have no more entries, 527 * Fill the rbtree if we have no more entries,
528 * or the inode has changed since we last read in the 528 * or the inode has changed since we last read in the
529 * cached entries. 529 * cached entries.
530 */ 530 */
531 if ((!info->curr_node) || 531 if ((!info->curr_node) ||
532 (filp->f_version != inode->i_version)) { 532 (filp->f_version != inode->i_version)) {
533 info->curr_node = NULL; 533 info->curr_node = NULL;
534 free_rb_tree_fname(&info->root); 534 free_rb_tree_fname(&info->root);
535 filp->f_version = inode->i_version; 535 filp->f_version = inode->i_version;
536 ret = ext3_htree_fill_tree(filp, info->curr_hash, 536 ret = ext3_htree_fill_tree(filp, info->curr_hash,
537 info->curr_minor_hash, 537 info->curr_minor_hash,
538 &info->next_hash); 538 &info->next_hash);
539 if (ret < 0) 539 if (ret < 0)
540 return ret; 540 return ret;
541 if (ret == 0) { 541 if (ret == 0) {
542 filp->f_pos = ext3_get_htree_eof(filp); 542 filp->f_pos = ext3_get_htree_eof(filp);
543 break; 543 break;
544 } 544 }
545 info->curr_node = rb_first(&info->root); 545 info->curr_node = rb_first(&info->root);
546 } 546 }
547 547
548 fname = rb_entry(info->curr_node, struct fname, rb_hash); 548 fname = rb_entry(info->curr_node, struct fname, rb_hash);
549 info->curr_hash = fname->hash; 549 info->curr_hash = fname->hash;
550 info->curr_minor_hash = fname->minor_hash; 550 info->curr_minor_hash = fname->minor_hash;
551 if (call_filldir(filp, dirent, filldir, fname)) 551 if (call_filldir(filp, dirent, filldir, fname))
552 break; 552 break;
553 next_node: 553 next_node:
554 info->curr_node = rb_next(info->curr_node); 554 info->curr_node = rb_next(info->curr_node);
555 if (info->curr_node) { 555 if (info->curr_node) {
556 fname = rb_entry(info->curr_node, struct fname, 556 fname = rb_entry(info->curr_node, struct fname,
557 rb_hash); 557 rb_hash);
558 info->curr_hash = fname->hash; 558 info->curr_hash = fname->hash;
559 info->curr_minor_hash = fname->minor_hash; 559 info->curr_minor_hash = fname->minor_hash;
560 } else { 560 } else {
561 if (info->next_hash == ~0) { 561 if (info->next_hash == ~0) {
562 filp->f_pos = ext3_get_htree_eof(filp); 562 filp->f_pos = ext3_get_htree_eof(filp);
563 break; 563 break;
564 } 564 }
565 info->curr_hash = info->next_hash; 565 info->curr_hash = info->next_hash;
566 info->curr_minor_hash = 0; 566 info->curr_minor_hash = 0;
567 } 567 }
568 } 568 }
569 finished: 569 finished:
570 info->last_pos = filp->f_pos; 570 info->last_pos = filp->f_pos;
571 return 0; 571 return 0;
572 } 572 }
573 573
574 static int ext3_release_dir(struct inode *inode, struct file *filp) 574 static int ext3_release_dir(struct inode *inode, struct file *filp)
575 { 575 {
576 if (filp->private_data) 576 if (filp->private_data)
577 ext3_htree_free_dir_info(filp->private_data); 577 ext3_htree_free_dir_info(filp->private_data);
578 578
579 return 0; 579 return 0;
580 } 580 }
581 581
582 const struct file_operations ext3_dir_operations = { 582 const struct file_operations ext3_dir_operations = {
583 .llseek = ext3_dir_llseek, 583 .llseek = ext3_dir_llseek,
584 .read = generic_read_dir, 584 .read = generic_read_dir,
585 .readdir = ext3_readdir, 585 .readdir = ext3_readdir,
586 .unlocked_ioctl = ext3_ioctl, 586 .unlocked_ioctl = ext3_ioctl,
587 #ifdef CONFIG_COMPAT 587 #ifdef CONFIG_COMPAT
588 .compat_ioctl = ext3_compat_ioctl, 588 .compat_ioctl = ext3_compat_ioctl,
589 #endif 589 #endif
590 .fsync = ext3_sync_file, 590 .fsync = ext3_sync_file,
591 .release = ext3_release_dir, 591 .release = ext3_release_dir,
592 }; 592 };
593 593
1 /* 1 /*
2 * linux/fs/ext4/dir.c 2 * linux/fs/ext4/dir.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/dir.c 11 * linux/fs/minix/dir.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * ext4 directory handling functions 15 * ext4 directory handling functions
16 * 16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips 20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 * 21 *
22 */ 22 */
23 23
24 #include <linux/fs.h> 24 #include <linux/fs.h>
25 #include <linux/jbd2.h> 25 #include <linux/jbd2.h>
26 #include <linux/buffer_head.h> 26 #include <linux/buffer_head.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/rbtree.h> 28 #include <linux/rbtree.h>
29 #include "ext4.h" 29 #include "ext4.h"
30 #include "xattr.h" 30 #include "xattr.h"
31 31
32 static int ext4_dx_readdir(struct file *filp, 32 static int ext4_dx_readdir(struct file *filp,
33 void *dirent, filldir_t filldir); 33 void *dirent, filldir_t filldir);
34 34
35 /** 35 /**
36 * Check if the given dir-inode refers to an htree-indexed directory 36 * Check if the given dir-inode refers to an htree-indexed directory
37 * (or a directory which could potentially get converted to use htree 37 * (or a directory which could potentially get converted to use htree
38 * indexing). 38 * indexing).
39 * 39 *
40 * Return 1 if it is a dx dir, 0 if not 40 * Return 1 if it is a dx dir, 0 if not
41 */ 41 */
42 static int is_dx_dir(struct inode *inode) 42 static int is_dx_dir(struct inode *inode)
43 { 43 {
44 struct super_block *sb = inode->i_sb; 44 struct super_block *sb = inode->i_sb;
45 45
46 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 46 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
47 EXT4_FEATURE_COMPAT_DIR_INDEX) && 47 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
48 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || 48 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
49 ((inode->i_size >> sb->s_blocksize_bits) == 1))) 49 ((inode->i_size >> sb->s_blocksize_bits) == 1)))
50 return 1; 50 return 1;
51 51
52 return 0; 52 return 0;
53 } 53 }
54 54
55 /* 55 /*
56 * Return 0 if the directory entry is OK, and 1 if there is a problem 56 * Return 0 if the directory entry is OK, and 1 if there is a problem
57 * 57 *
58 * Note: this is the opposite of what ext2 and ext3 historically returned... 58 * Note: this is the opposite of what ext2 and ext3 historically returned...
59 * 59 *
60 * bh passed here can be an inode block or a dir data block, depending 60 * bh passed here can be an inode block or a dir data block, depending
61 * on the inode inline data flag. 61 * on the inode inline data flag.
62 */ 62 */
63 int __ext4_check_dir_entry(const char *function, unsigned int line, 63 int __ext4_check_dir_entry(const char *function, unsigned int line,
64 struct inode *dir, struct file *filp, 64 struct inode *dir, struct file *filp,
65 struct ext4_dir_entry_2 *de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head *bh, char *buf, int size, 66 struct buffer_head *bh, char *buf, int size,
67 unsigned int offset) 67 unsigned int offset)
68 { 68 {
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len, 70 const int rlen = ext4_rec_len_from_disk(de->rec_len,
71 dir->i_sb->s_blocksize); 71 dir->i_sb->s_blocksize);
72 72
73 if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) 73 if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
74 error_msg = "rec_len is smaller than minimal"; 74 error_msg = "rec_len is smaller than minimal";
75 else if (unlikely(rlen % 4 != 0)) 75 else if (unlikely(rlen % 4 != 0))
76 error_msg = "rec_len % 4 != 0"; 76 error_msg = "rec_len % 4 != 0";
77 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) 77 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
78 error_msg = "rec_len is too small for name_len"; 78 error_msg = "rec_len is too small for name_len";
79 else if (unlikely(((char *) de - buf) + rlen > size)) 79 else if (unlikely(((char *) de - buf) + rlen > size))
80 error_msg = "directory entry across range"; 80 error_msg = "directory entry across range";
81 else if (unlikely(le32_to_cpu(de->inode) > 81 else if (unlikely(le32_to_cpu(de->inode) >
82 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) 82 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 else 84 else
85 return 0; 85 return 0;
86 86
87 if (filp) 87 if (filp)
88 ext4_error_file(filp, function, line, bh->b_blocknr, 88 ext4_error_file(filp, function, line, bh->b_blocknr,
89 "bad entry in directory: %s - offset=%u(%u), " 89 "bad entry in directory: %s - offset=%u(%u), "
90 "inode=%u, rec_len=%d, name_len=%d", 90 "inode=%u, rec_len=%d, name_len=%d",
91 error_msg, (unsigned) (offset % size), 91 error_msg, (unsigned) (offset % size),
92 offset, le32_to_cpu(de->inode), 92 offset, le32_to_cpu(de->inode),
93 rlen, de->name_len); 93 rlen, de->name_len);
94 else 94 else
95 ext4_error_inode(dir, function, line, bh->b_blocknr, 95 ext4_error_inode(dir, function, line, bh->b_blocknr,
96 "bad entry in directory: %s - offset=%u(%u), " 96 "bad entry in directory: %s - offset=%u(%u), "
97 "inode=%u, rec_len=%d, name_len=%d", 97 "inode=%u, rec_len=%d, name_len=%d",
98 error_msg, (unsigned) (offset % size), 98 error_msg, (unsigned) (offset % size),
99 offset, le32_to_cpu(de->inode), 99 offset, le32_to_cpu(de->inode),
100 rlen, de->name_len); 100 rlen, de->name_len);
101 101
102 return 1; 102 return 1;
103 } 103 }
104 104
105 static int ext4_readdir(struct file *filp, 105 static int ext4_readdir(struct file *filp,
106 void *dirent, filldir_t filldir) 106 void *dirent, filldir_t filldir)
107 { 107 {
108 int error = 0; 108 int error = 0;
109 unsigned int offset; 109 unsigned int offset;
110 int i, stored; 110 int i, stored;
111 struct ext4_dir_entry_2 *de; 111 struct ext4_dir_entry_2 *de;
112 int err; 112 int err;
113 struct inode *inode = filp->f_path.dentry->d_inode; 113 struct inode *inode = filp->f_path.dentry->d_inode;
114 struct super_block *sb = inode->i_sb; 114 struct super_block *sb = inode->i_sb;
115 int ret = 0; 115 int ret = 0;
116 int dir_has_error = 0; 116 int dir_has_error = 0;
117 117
118 if (ext4_has_inline_data(inode)) { 118 if (ext4_has_inline_data(inode)) {
119 int has_inline_data = 1; 119 int has_inline_data = 1;
120 ret = ext4_read_inline_dir(filp, dirent, filldir, 120 ret = ext4_read_inline_dir(filp, dirent, filldir,
121 &has_inline_data); 121 &has_inline_data);
122 if (has_inline_data) 122 if (has_inline_data)
123 return ret; 123 return ret;
124 } 124 }
125 125
126 if (is_dx_dir(inode)) { 126 if (is_dx_dir(inode)) {
127 err = ext4_dx_readdir(filp, dirent, filldir); 127 err = ext4_dx_readdir(filp, dirent, filldir);
128 if (err != ERR_BAD_DX_DIR) { 128 if (err != ERR_BAD_DX_DIR) {
129 ret = err; 129 ret = err;
130 goto out; 130 goto out;
131 } 131 }
132 /* 132 /*
133 * We don't set the inode dirty flag since it's not 133 * We don't set the inode dirty flag since it's not
134 * critical that it get flushed back to the disk. 134 * critical that it get flushed back to the disk.
135 */ 135 */
136 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, 136 ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
137 EXT4_INODE_INDEX); 137 EXT4_INODE_INDEX);
138 } 138 }
139 stored = 0; 139 stored = 0;
140 offset = filp->f_pos & (sb->s_blocksize - 1); 140 offset = filp->f_pos & (sb->s_blocksize - 1);
141 141
142 while (!error && !stored && filp->f_pos < inode->i_size) { 142 while (!error && !stored && filp->f_pos < inode->i_size) {
143 struct ext4_map_blocks map; 143 struct ext4_map_blocks map;
144 struct buffer_head *bh = NULL; 144 struct buffer_head *bh = NULL;
145 145
146 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 146 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
147 map.m_len = 1; 147 map.m_len = 1;
148 err = ext4_map_blocks(NULL, inode, &map, 0); 148 err = ext4_map_blocks(NULL, inode, &map, 0);
149 if (err > 0) { 149 if (err > 0) {
150 pgoff_t index = map.m_pblk >> 150 pgoff_t index = map.m_pblk >>
151 (PAGE_CACHE_SHIFT - inode->i_blkbits); 151 (PAGE_CACHE_SHIFT - inode->i_blkbits);
152 if (!ra_has_index(&filp->f_ra, index)) 152 if (!ra_has_index(&filp->f_ra, index))
153 page_cache_sync_readahead( 153 page_cache_sync_readahead(
154 sb->s_bdev->bd_inode->i_mapping, 154 sb->s_bdev->bd_inode->i_mapping,
155 &filp->f_ra, filp, 155 &filp->f_ra, filp,
156 index, 1); 156 index, 1);
157 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 157 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
158 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); 158 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
159 } 159 }
160 160
161 /* 161 /*
162 * We ignore I/O errors on directories so users have a chance 162 * We ignore I/O errors on directories so users have a chance
163 * of recovering data when there's a bad sector 163 * of recovering data when there's a bad sector
164 */ 164 */
165 if (!bh) { 165 if (!bh) {
166 if (!dir_has_error) { 166 if (!dir_has_error) {
167 EXT4_ERROR_FILE(filp, 0, 167 EXT4_ERROR_FILE(filp, 0,
168 "directory contains a " 168 "directory contains a "
169 "hole at offset %llu", 169 "hole at offset %llu",
170 (unsigned long long) filp->f_pos); 170 (unsigned long long) filp->f_pos);
171 dir_has_error = 1; 171 dir_has_error = 1;
172 } 172 }
173 /* corrupt size? Maybe no more blocks to read */ 173 /* corrupt size? Maybe no more blocks to read */
174 if (filp->f_pos > inode->i_blocks << 9) 174 if (filp->f_pos > inode->i_blocks << 9)
175 break; 175 break;
176 filp->f_pos += sb->s_blocksize - offset; 176 filp->f_pos += sb->s_blocksize - offset;
177 continue; 177 continue;
178 } 178 }
179 179
180 /* Check the checksum */ 180 /* Check the checksum */
181 if (!buffer_verified(bh) && 181 if (!buffer_verified(bh) &&
182 !ext4_dirent_csum_verify(inode, 182 !ext4_dirent_csum_verify(inode,
183 (struct ext4_dir_entry *)bh->b_data)) { 183 (struct ext4_dir_entry *)bh->b_data)) {
184 EXT4_ERROR_FILE(filp, 0, "directory fails checksum " 184 EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
185 "at offset %llu", 185 "at offset %llu",
186 (unsigned long long)filp->f_pos); 186 (unsigned long long)filp->f_pos);
187 filp->f_pos += sb->s_blocksize - offset; 187 filp->f_pos += sb->s_blocksize - offset;
188 continue; 188 continue;
189 } 189 }
190 set_buffer_verified(bh); 190 set_buffer_verified(bh);
191 191
192 revalidate: 192 revalidate:
193 /* If the dir block has changed since the last call to 193 /* If the dir block has changed since the last call to
194 * readdir(2), then we might be pointing to an invalid 194 * readdir(2), then we might be pointing to an invalid
195 * dirent right now. Scan from the start of the block 195 * dirent right now. Scan from the start of the block
196 * to make sure. */ 196 * to make sure. */
197 if (filp->f_version != inode->i_version) { 197 if (filp->f_version != inode->i_version) {
198 for (i = 0; i < sb->s_blocksize && i < offset; ) { 198 for (i = 0; i < sb->s_blocksize && i < offset; ) {
199 de = (struct ext4_dir_entry_2 *) 199 de = (struct ext4_dir_entry_2 *)
200 (bh->b_data + i); 200 (bh->b_data + i);
201 /* It's too expensive to do a full 201 /* It's too expensive to do a full
202 * dirent test each time round this 202 * dirent test each time round this
203 * loop, but we do have to test at 203 * loop, but we do have to test at
204 * least that it is non-zero. A 204 * least that it is non-zero. A
205 * failure will be detected in the 205 * failure will be detected in the
206 * dirent test below. */ 206 * dirent test below. */
207 if (ext4_rec_len_from_disk(de->rec_len, 207 if (ext4_rec_len_from_disk(de->rec_len,
208 sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) 208 sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
209 break; 209 break;
210 i += ext4_rec_len_from_disk(de->rec_len, 210 i += ext4_rec_len_from_disk(de->rec_len,
211 sb->s_blocksize); 211 sb->s_blocksize);
212 } 212 }
213 offset = i; 213 offset = i;
214 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 214 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
215 | offset; 215 | offset;
216 filp->f_version = inode->i_version; 216 filp->f_version = inode->i_version;
217 } 217 }
218 218
219 while (!error && filp->f_pos < inode->i_size 219 while (!error && filp->f_pos < inode->i_size
220 && offset < sb->s_blocksize) { 220 && offset < sb->s_blocksize) {
221 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 221 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
222 if (ext4_check_dir_entry(inode, filp, de, bh, 222 if (ext4_check_dir_entry(inode, filp, de, bh,
223 bh->b_data, bh->b_size, 223 bh->b_data, bh->b_size,
224 offset)) { 224 offset)) {
225 /* 225 /*
226 * On error, skip the f_pos to the next block 226 * On error, skip the f_pos to the next block
227 */ 227 */
228 filp->f_pos = (filp->f_pos | 228 filp->f_pos = (filp->f_pos |
229 (sb->s_blocksize - 1)) + 1; 229 (sb->s_blocksize - 1)) + 1;
230 brelse(bh); 230 brelse(bh);
231 ret = stored; 231 ret = stored;
232 goto out; 232 goto out;
233 } 233 }
234 offset += ext4_rec_len_from_disk(de->rec_len, 234 offset += ext4_rec_len_from_disk(de->rec_len,
235 sb->s_blocksize); 235 sb->s_blocksize);
236 if (le32_to_cpu(de->inode)) { 236 if (le32_to_cpu(de->inode)) {
237 /* We might block in the next section 237 /* We might block in the next section
238 * if the data destination is 238 * if the data destination is
239 * currently swapped out. So, use a 239 * currently swapped out. So, use a
240 * version stamp to detect whether or 240 * version stamp to detect whether or
241 * not the directory has been modified 241 * not the directory has been modified
242 * during the copy operation. 242 * during the copy operation.
243 */ 243 */
244 u64 version = filp->f_version; 244 u64 version = filp->f_version;
245 245
246 error = filldir(dirent, de->name, 246 error = filldir(dirent, de->name,
247 de->name_len, 247 de->name_len,
248 filp->f_pos, 248 filp->f_pos,
249 le32_to_cpu(de->inode), 249 le32_to_cpu(de->inode),
250 get_dtype(sb, de->file_type)); 250 get_dtype(sb, de->file_type));
251 if (error) 251 if (error)
252 break; 252 break;
253 if (version != filp->f_version) 253 if (version != filp->f_version)
254 goto revalidate; 254 goto revalidate;
255 stored++; 255 stored++;
256 } 256 }
257 filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 257 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
258 sb->s_blocksize); 258 sb->s_blocksize);
259 } 259 }
260 offset = 0; 260 offset = 0;
261 brelse(bh); 261 brelse(bh);
262 } 262 }
263 out: 263 out:
264 return ret; 264 return ret;
265 } 265 }
266 266
267 static inline int is_32bit_api(void) 267 static inline int is_32bit_api(void)
268 { 268 {
269 #ifdef CONFIG_COMPAT 269 #ifdef CONFIG_COMPAT
270 return is_compat_task(); 270 return is_compat_task();
271 #else 271 #else
272 return (BITS_PER_LONG == 32); 272 return (BITS_PER_LONG == 32);
273 #endif 273 #endif
274 } 274 }
275 275
276 /* 276 /*
277 * These functions convert from the major/minor hash to an f_pos 277 * These functions convert from the major/minor hash to an f_pos
278 * value for dx directories 278 * value for dx directories
279 * 279 *
280 * The upper layer (for example NFS) should specify FMODE_32BITHASH or 280 * The upper layer (for example NFS) should specify FMODE_32BITHASH or
281 * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted 281 * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
282 * directly on both 32-bit and 64-bit nodes, in which case neither 282 * directly on both 32-bit and 64-bit nodes, in which case neither
283 * FMODE_32BITHASH nor FMODE_64BITHASH is specified. 283 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
284 */ 284 */
285 static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) 285 static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
286 { 286 {
287 if ((filp->f_mode & FMODE_32BITHASH) || 287 if ((filp->f_mode & FMODE_32BITHASH) ||
288 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 288 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
289 return major >> 1; 289 return major >> 1;
290 else 290 else
291 return ((__u64)(major >> 1) << 32) | (__u64)minor; 291 return ((__u64)(major >> 1) << 32) | (__u64)minor;
292 } 292 }
293 293
294 static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) 294 static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
295 { 295 {
296 if ((filp->f_mode & FMODE_32BITHASH) || 296 if ((filp->f_mode & FMODE_32BITHASH) ||
297 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 297 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
298 return (pos << 1) & 0xffffffff; 298 return (pos << 1) & 0xffffffff;
299 else 299 else
300 return ((pos >> 32) << 1) & 0xffffffff; 300 return ((pos >> 32) << 1) & 0xffffffff;
301 } 301 }
302 302
303 static inline __u32 pos2min_hash(struct file *filp, loff_t pos) 303 static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
304 { 304 {
305 if ((filp->f_mode & FMODE_32BITHASH) || 305 if ((filp->f_mode & FMODE_32BITHASH) ||
306 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 306 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
307 return 0; 307 return 0;
308 else 308 else
309 return pos & 0xffffffff; 309 return pos & 0xffffffff;
310 } 310 }
311 311
312 /* 312 /*
313 * Return 32- or 64-bit end-of-file for dx directories 313 * Return 32- or 64-bit end-of-file for dx directories
314 */ 314 */
315 static inline loff_t ext4_get_htree_eof(struct file *filp) 315 static inline loff_t ext4_get_htree_eof(struct file *filp)
316 { 316 {
317 if ((filp->f_mode & FMODE_32BITHASH) || 317 if ((filp->f_mode & FMODE_32BITHASH) ||
318 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 318 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
319 return EXT4_HTREE_EOF_32BIT; 319 return EXT4_HTREE_EOF_32BIT;
320 else 320 else
321 return EXT4_HTREE_EOF_64BIT; 321 return EXT4_HTREE_EOF_64BIT;
322 } 322 }
323 323
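A worked example of the conversions above for the 64-bit-hash case, as a self-contained user-space mirror. The helper names and sample values are made up; it assumes, as ext3/ext4's dirhash does, that the low bit of the major hash is already zero, so dropping it in hash2pos loses nothing:

    #include <stdio.h>
    #include <stdint.h>

    /* user-space mirrors of the kernel helpers, 64-bit case only */
    static uint64_t hash2pos64(uint32_t major, uint32_t minor)
    {
        return ((uint64_t)(major >> 1) << 32) | minor;
    }

    static uint32_t pos2maj64(uint64_t pos)
    {
        return ((pos >> 32) << 1) & 0xffffffff;
    }

    static uint32_t pos2min64(uint64_t pos)
    {
        return pos & 0xffffffff;
    }

    int main(void)
    {
        uint32_t major = 0x12345678, minor = 0x9abcdef0;
        uint64_t pos = hash2pos64(major, minor);

        printf("pos=%#llx maj=%#x min=%#x\n",
               (unsigned long long)pos,
               (unsigned)pos2maj64(pos), (unsigned)pos2min64(pos));
        return 0;
    }

This should print pos=0x91a2b3c9abcdef0 maj=0x12345678 min=0x9abcdef0: the major/minor pair survives the round trip through f_pos intact.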
324 324
325 /* 325 /*
326 * ext4_dir_llseek() calls generic_file_llseek_size to handle htree 326 * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
327 * directories, where the "offset" is in terms of the filename hash 327 * directories, where the "offset" is in terms of the filename hash
328 * value instead of the byte offset. 328 * value instead of the byte offset.
329 * 329 *
330 * Because we may return a 64-bit hash that is well beyond offset limits, 330 * Because we may return a 64-bit hash that is well beyond offset limits,
331 * we need to pass the max hash as the maximum allowable offset in 331 * we need to pass the max hash as the maximum allowable offset in
332 * the htree directory case. 332 * the htree directory case.
333 * 333 *
334 * For non-htree, ext4_llseek already chooses the proper max offset. 334 * For non-htree, ext4_llseek already chooses the proper max offset.
335 */ 335 */
336 loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) 336 loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
337 { 337 {
338 struct inode *inode = file->f_mapping->host; 338 struct inode *inode = file->f_mapping->host;
339 int dx_dir = is_dx_dir(inode); 339 int dx_dir = is_dx_dir(inode);
340 loff_t htree_max = ext4_get_htree_eof(file); 340 loff_t htree_max = ext4_get_htree_eof(file);
341 341
342 if (likely(dx_dir)) 342 if (likely(dx_dir))
343 return generic_file_llseek_size(file, offset, origin, 343 return generic_file_llseek_size(file, offset, whence,
344 htree_max, htree_max); 344 htree_max, htree_max);
345 else 345 else
346 return ext4_llseek(file, offset, origin); 346 return ext4_llseek(file, offset, whence);
347 } 347 }
348 348
349 /* 349 /*
350 * This structure holds the nodes of the red-black tree used to store 350 * This structure holds the nodes of the red-black tree used to store
351 * the directory entry in hash order. 351 * the directory entry in hash order.
352 */ 352 */
353 struct fname { 353 struct fname {
354 __u32 hash; 354 __u32 hash;
355 __u32 minor_hash; 355 __u32 minor_hash;
356 struct rb_node rb_hash; 356 struct rb_node rb_hash;
357 struct fname *next; 357 struct fname *next;
358 __u32 inode; 358 __u32 inode;
359 __u8 name_len; 359 __u8 name_len;
360 __u8 file_type; 360 __u8 file_type;
361 char name[0]; 361 char name[0];
362 }; 362 };
363 363
364 /* 364 /*
365 * This function implements a non-recursive way of freeing all of the 365 * This function implements a non-recursive way of freeing all of the
366 * nodes in the red-black tree. 366 * nodes in the red-black tree.
367 */ 367 */
368 static void free_rb_tree_fname(struct rb_root *root) 368 static void free_rb_tree_fname(struct rb_root *root)
369 { 369 {
370 struct rb_node *n = root->rb_node; 370 struct rb_node *n = root->rb_node;
371 struct rb_node *parent; 371 struct rb_node *parent;
372 struct fname *fname; 372 struct fname *fname;
373 373
374 while (n) { 374 while (n) {
375 /* Do the node's children first */ 375 /* Do the node's children first */
376 if (n->rb_left) { 376 if (n->rb_left) {
377 n = n->rb_left; 377 n = n->rb_left;
378 continue; 378 continue;
379 } 379 }
380 if (n->rb_right) { 380 if (n->rb_right) {
381 n = n->rb_right; 381 n = n->rb_right;
382 continue; 382 continue;
383 } 383 }
384 /* 384 /*
385 * The node has no children; free it, and then zero 385 * The node has no children; free it, and then zero
386 * out parent's link to it. Finally go to the 386 * out parent's link to it. Finally go to the
387 * beginning of the loop and try to free the parent 387 * beginning of the loop and try to free the parent
388 * node. 388 * node.
389 */ 389 */
390 parent = rb_parent(n); 390 parent = rb_parent(n);
391 fname = rb_entry(n, struct fname, rb_hash); 391 fname = rb_entry(n, struct fname, rb_hash);
392 while (fname) { 392 while (fname) {
393 struct fname *old = fname; 393 struct fname *old = fname;
394 fname = fname->next; 394 fname = fname->next;
395 kfree(old); 395 kfree(old);
396 } 396 }
397 if (!parent) 397 if (!parent)
398 *root = RB_ROOT; 398 *root = RB_ROOT;
399 else if (parent->rb_left == n) 399 else if (parent->rb_left == n)
400 parent->rb_left = NULL; 400 parent->rb_left = NULL;
401 else if (parent->rb_right == n) 401 else if (parent->rb_right == n)
402 parent->rb_right = NULL; 402 parent->rb_right = NULL;
403 n = parent; 403 n = parent;
404 } 404 }
405 } 405 }
406 406
407 407
408 static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, 408 static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
409 loff_t pos) 409 loff_t pos)
410 { 410 {
411 struct dir_private_info *p; 411 struct dir_private_info *p;
412 412
413 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); 413 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
414 if (!p) 414 if (!p)
415 return NULL; 415 return NULL;
416 p->curr_hash = pos2maj_hash(filp, pos); 416 p->curr_hash = pos2maj_hash(filp, pos);
417 p->curr_minor_hash = pos2min_hash(filp, pos); 417 p->curr_minor_hash = pos2min_hash(filp, pos);
418 return p; 418 return p;
419 } 419 }
420 420
421 void ext4_htree_free_dir_info(struct dir_private_info *p) 421 void ext4_htree_free_dir_info(struct dir_private_info *p)
422 { 422 {
423 free_rb_tree_fname(&p->root); 423 free_rb_tree_fname(&p->root);
424 kfree(p); 424 kfree(p);
425 } 425 }
426 426
427 /* 427 /*
428 * Given a directory entry, enter it into the fname rb tree. 428 * Given a directory entry, enter it into the fname rb tree.
429 */ 429 */
430 int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 430 int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
431 __u32 minor_hash, 431 __u32 minor_hash,
432 struct ext4_dir_entry_2 *dirent) 432 struct ext4_dir_entry_2 *dirent)
433 { 433 {
434 struct rb_node **p, *parent = NULL; 434 struct rb_node **p, *parent = NULL;
435 struct fname *fname, *new_fn; 435 struct fname *fname, *new_fn;
436 struct dir_private_info *info; 436 struct dir_private_info *info;
437 int len; 437 int len;
438 438
439 info = dir_file->private_data; 439 info = dir_file->private_data;
440 p = &info->root.rb_node; 440 p = &info->root.rb_node;
441 441
442 /* Allocate and fill in the fname structure */ 442 /* Allocate and fill in the fname structure */
443 len = sizeof(struct fname) + dirent->name_len + 1; 443 len = sizeof(struct fname) + dirent->name_len + 1;
444 new_fn = kzalloc(len, GFP_KERNEL); 444 new_fn = kzalloc(len, GFP_KERNEL);
445 if (!new_fn) 445 if (!new_fn)
446 return -ENOMEM; 446 return -ENOMEM;
447 new_fn->hash = hash; 447 new_fn->hash = hash;
448 new_fn->minor_hash = minor_hash; 448 new_fn->minor_hash = minor_hash;
449 new_fn->inode = le32_to_cpu(dirent->inode); 449 new_fn->inode = le32_to_cpu(dirent->inode);
450 new_fn->name_len = dirent->name_len; 450 new_fn->name_len = dirent->name_len;
451 new_fn->file_type = dirent->file_type; 451 new_fn->file_type = dirent->file_type;
452 memcpy(new_fn->name, dirent->name, dirent->name_len); 452 memcpy(new_fn->name, dirent->name, dirent->name_len);
453 new_fn->name[dirent->name_len] = 0; 453 new_fn->name[dirent->name_len] = 0;
454 454
455 while (*p) { 455 while (*p) {
456 parent = *p; 456 parent = *p;
457 fname = rb_entry(parent, struct fname, rb_hash); 457 fname = rb_entry(parent, struct fname, rb_hash);
458 458
459 /* 459 /*
460 * If the hash and minor hash match up, then we put 460 * If the hash and minor hash match up, then we put
461 * them on a linked list. This rarely happens... 461 * them on a linked list. This rarely happens...
462 */ 462 */
463 if ((new_fn->hash == fname->hash) && 463 if ((new_fn->hash == fname->hash) &&
464 (new_fn->minor_hash == fname->minor_hash)) { 464 (new_fn->minor_hash == fname->minor_hash)) {
465 new_fn->next = fname->next; 465 new_fn->next = fname->next;
466 fname->next = new_fn; 466 fname->next = new_fn;
467 return 0; 467 return 0;
468 } 468 }
469 469
470 if (new_fn->hash < fname->hash) 470 if (new_fn->hash < fname->hash)
471 p = &(*p)->rb_left; 471 p = &(*p)->rb_left;
472 else if (new_fn->hash > fname->hash) 472 else if (new_fn->hash > fname->hash)
473 p = &(*p)->rb_right; 473 p = &(*p)->rb_right;
474 else if (new_fn->minor_hash < fname->minor_hash) 474 else if (new_fn->minor_hash < fname->minor_hash)
475 p = &(*p)->rb_left; 475 p = &(*p)->rb_left;
476 else /* if (new_fn->minor_hash > fname->minor_hash) */ 476 else /* if (new_fn->minor_hash > fname->minor_hash) */
477 p = &(*p)->rb_right; 477 p = &(*p)->rb_right;
478 } 478 }
479 479
480 rb_link_node(&new_fn->rb_hash, parent, p); 480 rb_link_node(&new_fn->rb_hash, parent, p);
481 rb_insert_color(&new_fn->rb_hash, &info->root); 481 rb_insert_color(&new_fn->rb_hash, &info->root);
482 return 0; 482 return 0;
483 } 483 }
484 484
485 485
486 486
487 /* 487 /*
488 * This is a helper function for ext4_dx_readdir. It calls filldir 488 * This is a helper function for ext4_dx_readdir. It calls filldir
489 * for all entries on the fname linked list. (Normally there is only 489 * for all entries on the fname linked list. (Normally there is only
490 * one entry on the linked list, unless there are 62 bit hash collisions.) 490 * one entry on the linked list, unless there are 62 bit hash collisions.)
491 */ 491 */
492 static int call_filldir(struct file *filp, void *dirent, 492 static int call_filldir(struct file *filp, void *dirent,
493 filldir_t filldir, struct fname *fname) 493 filldir_t filldir, struct fname *fname)
494 { 494 {
495 struct dir_private_info *info = filp->private_data; 495 struct dir_private_info *info = filp->private_data;
496 loff_t curr_pos; 496 loff_t curr_pos;
497 struct inode *inode = filp->f_path.dentry->d_inode; 497 struct inode *inode = filp->f_path.dentry->d_inode;
498 struct super_block *sb; 498 struct super_block *sb;
499 int error; 499 int error;
500 500
501 sb = inode->i_sb; 501 sb = inode->i_sb;
502 502
503 if (!fname) { 503 if (!fname) {
504 ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " 504 ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
505 "called with null fname?!?", __func__, __LINE__, 505 "called with null fname?!?", __func__, __LINE__,
506 inode->i_ino, current->comm); 506 inode->i_ino, current->comm);
507 return 0; 507 return 0;
508 } 508 }
509 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); 509 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
510 while (fname) { 510 while (fname) {
511 error = filldir(dirent, fname->name, 511 error = filldir(dirent, fname->name,
512 fname->name_len, curr_pos, 512 fname->name_len, curr_pos,
513 fname->inode, 513 fname->inode,
514 get_dtype(sb, fname->file_type)); 514 get_dtype(sb, fname->file_type));
515 if (error) { 515 if (error) {
516 filp->f_pos = curr_pos; 516 filp->f_pos = curr_pos;
517 info->extra_fname = fname; 517 info->extra_fname = fname;
518 return error; 518 return error;
519 } 519 }
520 fname = fname->next; 520 fname = fname->next;
521 } 521 }
522 return 0; 522 return 0;
523 } 523 }
524 524
525 static int ext4_dx_readdir(struct file *filp, 525 static int ext4_dx_readdir(struct file *filp,
526 void *dirent, filldir_t filldir) 526 void *dirent, filldir_t filldir)
527 { 527 {
528 struct dir_private_info *info = filp->private_data; 528 struct dir_private_info *info = filp->private_data;
529 struct inode *inode = filp->f_path.dentry->d_inode; 529 struct inode *inode = filp->f_path.dentry->d_inode;
530 struct fname *fname; 530 struct fname *fname;
531 int ret; 531 int ret;
532 532
533 if (!info) { 533 if (!info) {
534 info = ext4_htree_create_dir_info(filp, filp->f_pos); 534 info = ext4_htree_create_dir_info(filp, filp->f_pos);
535 if (!info) 535 if (!info)
536 return -ENOMEM; 536 return -ENOMEM;
537 filp->private_data = info; 537 filp->private_data = info;
538 } 538 }
539 539
540 if (filp->f_pos == ext4_get_htree_eof(filp)) 540 if (filp->f_pos == ext4_get_htree_eof(filp))
541 return 0; /* EOF */ 541 return 0; /* EOF */
542 542
543 /* Someone has messed with f_pos; reset the world */ 543 /* Someone has messed with f_pos; reset the world */
544 if (info->last_pos != filp->f_pos) { 544 if (info->last_pos != filp->f_pos) {
545 free_rb_tree_fname(&info->root); 545 free_rb_tree_fname(&info->root);
546 info->curr_node = NULL; 546 info->curr_node = NULL;
547 info->extra_fname = NULL; 547 info->extra_fname = NULL;
548 info->curr_hash = pos2maj_hash(filp, filp->f_pos); 548 info->curr_hash = pos2maj_hash(filp, filp->f_pos);
549 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); 549 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
550 } 550 }
551 551
552 /* 552 /*
553 * If there are any leftover names on the hash collision 553 * If there are any leftover names on the hash collision
554 * chain, return them first. 554 * chain, return them first.
555 */ 555 */
556 if (info->extra_fname) { 556 if (info->extra_fname) {
557 if (call_filldir(filp, dirent, filldir, info->extra_fname)) 557 if (call_filldir(filp, dirent, filldir, info->extra_fname))
558 goto finished; 558 goto finished;
559 info->extra_fname = NULL; 559 info->extra_fname = NULL;
560 goto next_node; 560 goto next_node;
561 } else if (!info->curr_node) 561 } else if (!info->curr_node)
562 info->curr_node = rb_first(&info->root); 562 info->curr_node = rb_first(&info->root);
563 563
564 while (1) { 564 while (1) {
565 /* 565 /*
566 * Fill the rbtree if we have no more entries, 566 * Fill the rbtree if we have no more entries,
567 * or the inode has changed since we last read in the 567 * or the inode has changed since we last read in the
568 * cached entries. 568 * cached entries.
569 */ 569 */
570 if ((!info->curr_node) || 570 if ((!info->curr_node) ||
571 (filp->f_version != inode->i_version)) { 571 (filp->f_version != inode->i_version)) {
572 info->curr_node = NULL; 572 info->curr_node = NULL;
573 free_rb_tree_fname(&info->root); 573 free_rb_tree_fname(&info->root);
574 filp->f_version = inode->i_version; 574 filp->f_version = inode->i_version;
575 ret = ext4_htree_fill_tree(filp, info->curr_hash, 575 ret = ext4_htree_fill_tree(filp, info->curr_hash,
576 info->curr_minor_hash, 576 info->curr_minor_hash,
577 &info->next_hash); 577 &info->next_hash);
578 if (ret < 0) 578 if (ret < 0)
579 return ret; 579 return ret;
580 if (ret == 0) { 580 if (ret == 0) {
581 filp->f_pos = ext4_get_htree_eof(filp); 581 filp->f_pos = ext4_get_htree_eof(filp);
582 break; 582 break;
583 } 583 }
584 info->curr_node = rb_first(&info->root); 584 info->curr_node = rb_first(&info->root);
585 } 585 }
586 586
587 fname = rb_entry(info->curr_node, struct fname, rb_hash); 587 fname = rb_entry(info->curr_node, struct fname, rb_hash);
588 info->curr_hash = fname->hash; 588 info->curr_hash = fname->hash;
589 info->curr_minor_hash = fname->minor_hash; 589 info->curr_minor_hash = fname->minor_hash;
590 if (call_filldir(filp, dirent, filldir, fname)) 590 if (call_filldir(filp, dirent, filldir, fname))
591 break; 591 break;
592 next_node: 592 next_node:
593 info->curr_node = rb_next(info->curr_node); 593 info->curr_node = rb_next(info->curr_node);
594 if (info->curr_node) { 594 if (info->curr_node) {
595 fname = rb_entry(info->curr_node, struct fname, 595 fname = rb_entry(info->curr_node, struct fname,
596 rb_hash); 596 rb_hash);
597 info->curr_hash = fname->hash; 597 info->curr_hash = fname->hash;
598 info->curr_minor_hash = fname->minor_hash; 598 info->curr_minor_hash = fname->minor_hash;
599 } else { 599 } else {
600 if (info->next_hash == ~0) { 600 if (info->next_hash == ~0) {
601 filp->f_pos = ext4_get_htree_eof(filp); 601 filp->f_pos = ext4_get_htree_eof(filp);
602 break; 602 break;
603 } 603 }
604 info->curr_hash = info->next_hash; 604 info->curr_hash = info->next_hash;
605 info->curr_minor_hash = 0; 605 info->curr_minor_hash = 0;
606 } 606 }
607 } 607 }
608 finished: 608 finished:
609 info->last_pos = filp->f_pos; 609 info->last_pos = filp->f_pos;
610 return 0; 610 return 0;
611 } 611 }
612 612
613 static int ext4_release_dir(struct inode *inode, struct file *filp) 613 static int ext4_release_dir(struct inode *inode, struct file *filp)
614 { 614 {
615 if (filp->private_data) 615 if (filp->private_data)
616 ext4_htree_free_dir_info(filp->private_data); 616 ext4_htree_free_dir_info(filp->private_data);
617 617
618 return 0; 618 return 0;
619 } 619 }
620 620
621 const struct file_operations ext4_dir_operations = { 621 const struct file_operations ext4_dir_operations = {
622 .llseek = ext4_dir_llseek, 622 .llseek = ext4_dir_llseek,
623 .read = generic_read_dir, 623 .read = generic_read_dir,
624 .readdir = ext4_readdir, 624 .readdir = ext4_readdir,
625 .unlocked_ioctl = ext4_ioctl, 625 .unlocked_ioctl = ext4_ioctl,
626 #ifdef CONFIG_COMPAT 626 #ifdef CONFIG_COMPAT
627 .compat_ioctl = ext4_compat_ioctl, 627 .compat_ioctl = ext4_compat_ioctl,
628 #endif 628 #endif
629 .fsync = ext4_sync_file, 629 .fsync = ext4_sync_file,
630 .release = ext4_release_dir, 630 .release = ext4_release_dir,
631 }; 631 };
632 632
1 /* 1 /*
2 * linux/fs/ext4/file.c 2 * linux/fs/ext4/file.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/file.c 11 * linux/fs/minix/file.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * ext4 fs regular file handling primitives 15 * ext4 fs regular file handling primitives
16 * 16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek 17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz) 18 * (jj@sunsite.ms.mff.cuni.cz)
19 */ 19 */
20 20
21 #include <linux/time.h> 21 #include <linux/time.h>
22 #include <linux/fs.h> 22 #include <linux/fs.h>
23 #include <linux/jbd2.h> 23 #include <linux/jbd2.h>
24 #include <linux/mount.h> 24 #include <linux/mount.h>
25 #include <linux/path.h> 25 #include <linux/path.h>
26 #include <linux/quotaops.h> 26 #include <linux/quotaops.h>
27 #include <linux/pagevec.h> 27 #include <linux/pagevec.h>
28 #include "ext4.h" 28 #include "ext4.h"
29 #include "ext4_jbd2.h" 29 #include "ext4_jbd2.h"
30 #include "xattr.h" 30 #include "xattr.h"
31 #include "acl.h" 31 #include "acl.h"
32 32
33 /* 33 /*
34 * Called when an inode is released. Note that this is different 34 * Called when an inode is released. Note that this is different
35 * from ext4_file_open: open gets called at every open, but release 35 * from ext4_file_open: open gets called at every open, but release
36 * gets called only when /all/ the files are closed. 36 * gets called only when /all/ the files are closed.
37 */ 37 */
38 static int ext4_release_file(struct inode *inode, struct file *filp) 38 static int ext4_release_file(struct inode *inode, struct file *filp)
39 { 39 {
40 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { 40 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
41 ext4_alloc_da_blocks(inode); 41 ext4_alloc_da_blocks(inode);
42 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 42 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
43 } 43 }
44 /* if we are the last writer on the inode, drop the block reservation */ 44 /* if we are the last writer on the inode, drop the block reservation */
45 if ((filp->f_mode & FMODE_WRITE) && 45 if ((filp->f_mode & FMODE_WRITE) &&
46 (atomic_read(&inode->i_writecount) == 1) && 46 (atomic_read(&inode->i_writecount) == 1) &&
47 !EXT4_I(inode)->i_reserved_data_blocks) 47 !EXT4_I(inode)->i_reserved_data_blocks)
48 { 48 {
49 down_write(&EXT4_I(inode)->i_data_sem); 49 down_write(&EXT4_I(inode)->i_data_sem);
50 ext4_discard_preallocations(inode); 50 ext4_discard_preallocations(inode);
51 up_write(&EXT4_I(inode)->i_data_sem); 51 up_write(&EXT4_I(inode)->i_data_sem);
52 } 52 }
53 if (is_dx(inode) && filp->private_data) 53 if (is_dx(inode) && filp->private_data)
54 ext4_htree_free_dir_info(filp->private_data); 54 ext4_htree_free_dir_info(filp->private_data);
55 55
56 return 0; 56 return 0;
57 } 57 }
58 58
59 void ext4_unwritten_wait(struct inode *inode) 59 void ext4_unwritten_wait(struct inode *inode)
60 { 60 {
61 wait_queue_head_t *wq = ext4_ioend_wq(inode); 61 wait_queue_head_t *wq = ext4_ioend_wq(inode);
62 62
63 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); 63 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
64 } 64 }
65 65
66 /* 66 /*
67 * This tests whether the IO in question is block-aligned or not. 67 * This tests whether the IO in question is block-aligned or not.
68 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they 68 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
69 * are converted to written only after the IO is complete. Until they are 69 * are converted to written only after the IO is complete. Until they are
70 * mapped, these blocks appear as holes, so dio_zero_block() will assume that 70 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
71 * it needs to zero out portions of the start and/or end block. If 2 AIO 71 * it needs to zero out portions of the start and/or end block. If 2 AIO
72 * threads are at work on the same unwritten block, they must be synchronized 72 * threads are at work on the same unwritten block, they must be synchronized
73 * or one thread will zero the other's data, causing corruption. 73 * or one thread will zero the other's data, causing corruption.
74 */ 74 */
75 static int 75 static int
76 ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, 76 ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
77 unsigned long nr_segs, loff_t pos) 77 unsigned long nr_segs, loff_t pos)
78 { 78 {
79 struct super_block *sb = inode->i_sb; 79 struct super_block *sb = inode->i_sb;
80 int blockmask = sb->s_blocksize - 1; 80 int blockmask = sb->s_blocksize - 1;
81 size_t count = iov_length(iov, nr_segs); 81 size_t count = iov_length(iov, nr_segs);
82 loff_t final_size = pos + count; 82 loff_t final_size = pos + count;
83 83
84 if (pos >= inode->i_size) 84 if (pos >= inode->i_size)
85 return 0; 85 return 0;
86 86
87 if ((pos & blockmask) || (final_size & blockmask)) 87 if ((pos & blockmask) || (final_size & blockmask))
88 return 1; 88 return 1;
89 89
90 return 0; 90 return 0;
91 } 91 }
92 92
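A quick user-space check of the predicate above, assuming a 4096-byte block size and made-up pos/count values:

    #include <stdio.h>

    int main(void)
    {
        long long blocksize = 4096, blockmask = blocksize - 1;
        long long pos = 4096, count = 512;
        long long final_size = pos + count;

        /* pos is block-aligned but final_size is not, so this
         * I/O counts as unaligned */
        int unaligned = (pos & blockmask) || (final_size & blockmask);
        printf("unaligned = %d\n", unaligned);
        return 0;
    }

Here the write starts on a block boundary but ends 512 bytes into a block, so it would take the serialized path guarded by ext4_aio_mutex in the caller below.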
93 static ssize_t 93 static ssize_t
94 ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, 94 ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
95 unsigned long nr_segs, loff_t pos) 95 unsigned long nr_segs, loff_t pos)
96 { 96 {
97 struct file *file = iocb->ki_filp; 97 struct file *file = iocb->ki_filp;
98 struct inode *inode = file->f_mapping->host; 98 struct inode *inode = file->f_mapping->host;
99 struct blk_plug plug; 99 struct blk_plug plug;
100 int unaligned_aio = 0; 100 int unaligned_aio = 0;
101 ssize_t ret; 101 ssize_t ret;
102 int overwrite = 0; 102 int overwrite = 0;
103 size_t length = iov_length(iov, nr_segs); 103 size_t length = iov_length(iov, nr_segs);
104 104
105 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && 105 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
106 !is_sync_kiocb(iocb)) 106 !is_sync_kiocb(iocb))
107 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); 107 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
108 108
109 /* Unaligned direct AIO must be serialized; see comment above */ 109 /* Unaligned direct AIO must be serialized; see comment above */
110 if (unaligned_aio) { 110 if (unaligned_aio) {
111 static unsigned long unaligned_warn_time; 111 static unsigned long unaligned_warn_time;
112 112
113 /* Warn about this once per day */ 113 /* Warn about this once per day */
114 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) 114 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
115 ext4_msg(inode->i_sb, KERN_WARNING, 115 ext4_msg(inode->i_sb, KERN_WARNING,
116 "Unaligned AIO/DIO on inode %ld by %s; " 116 "Unaligned AIO/DIO on inode %ld by %s; "
117 "performance will be poor.", 117 "performance will be poor.",
118 inode->i_ino, current->comm); 118 inode->i_ino, current->comm);
119 mutex_lock(ext4_aio_mutex(inode)); 119 mutex_lock(ext4_aio_mutex(inode));
120 ext4_unwritten_wait(inode); 120 ext4_unwritten_wait(inode);
121 } 121 }
122 122
123 BUG_ON(iocb->ki_pos != pos); 123 BUG_ON(iocb->ki_pos != pos);
124 124
125 mutex_lock(&inode->i_mutex); 125 mutex_lock(&inode->i_mutex);
126 blk_start_plug(&plug); 126 blk_start_plug(&plug);
127 127
128 iocb->private = &overwrite; 128 iocb->private = &overwrite;
129 129
130 /* check whether we do a DIO overwrite or not */ 130 /* check whether we do a DIO overwrite or not */
131 if (ext4_should_dioread_nolock(inode) && !unaligned_aio && 131 if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
132 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { 132 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
133 struct ext4_map_blocks map; 133 struct ext4_map_blocks map;
134 unsigned int blkbits = inode->i_blkbits; 134 unsigned int blkbits = inode->i_blkbits;
135 int err, len; 135 int err, len;
136 136
137 map.m_lblk = pos >> blkbits; 137 map.m_lblk = pos >> blkbits;
138 map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) 138 map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
139 - map.m_lblk; 139 - map.m_lblk;
140 len = map.m_len; 140 len = map.m_len;
141 141
142 err = ext4_map_blocks(NULL, inode, &map, 0); 142 err = ext4_map_blocks(NULL, inode, &map, 0);
143 /* 143 /*
144 * 'err == len' means that all of the blocks have been preallocated, 144 * 'err == len' means that all of the blocks have been preallocated,
145 * whether or not they are initialized. To exclude uninitialized 145 * whether or not they are initialized. To exclude uninitialized
146 * extents, we need to check m_flags. There are two conditions 146 * extents, we need to check m_flags. There are two conditions
147 * that indicate an initialized extent: 147 * that indicate an initialized extent:
148 * 1) If we hit the extent cache, the EXT4_MAP_MAPPED flag is returned; 148 * 1) If we hit the extent cache, the EXT4_MAP_MAPPED flag is returned;
149 * 2) If we do a real lookup, no flags are returned. 149 * 2) If we do a real lookup, no flags are returned.
150 * So we should check both of these conditions. 150 * So we should check both of these conditions.
151 */ 151 */
152 if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) 152 if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
153 overwrite = 1; 153 overwrite = 1;
154 } 154 }
155 155
156 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 156 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
157 mutex_unlock(&inode->i_mutex); 157 mutex_unlock(&inode->i_mutex);
158 158
159 if (ret > 0 || ret == -EIOCBQUEUED) { 159 if (ret > 0 || ret == -EIOCBQUEUED) {
160 ssize_t err; 160 ssize_t err;
161 161
162 err = generic_write_sync(file, pos, ret); 162 err = generic_write_sync(file, pos, ret);
163 if (err < 0 && ret > 0) 163 if (err < 0 && ret > 0)
164 ret = err; 164 ret = err;
165 } 165 }
166 blk_finish_plug(&plug); 166 blk_finish_plug(&plug);
167 167
168 if (unaligned_aio) 168 if (unaligned_aio)
169 mutex_unlock(ext4_aio_mutex(inode)); 169 mutex_unlock(ext4_aio_mutex(inode));
170 170
171 return ret; 171 return ret;
172 } 172 }
173 173
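For callers, the practical upshot of the serialization above is to keep direct IO block-aligned. A hedged userspace sketch (the 4096-byte alignment and the "testfile" name are assumptions; O_DIRECT generally also wants an aligned buffer, hence posix_memalign()):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            void *buf;
            int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);

            if (fd < 0)
                    return 1;
            if (posix_memalign(&buf, 4096, 4096))   /* aligned buffer */
                    return 1;
            memset(buf, 0, 4096);
            /* Offset and length are block-aligned: avoids the slow path. */
            pwrite(fd, buf, 4096, 0);
            free(buf);
            close(fd);
            return 0;
    }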
174 static ssize_t 174 static ssize_t
175 ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 175 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
176 unsigned long nr_segs, loff_t pos) 176 unsigned long nr_segs, loff_t pos)
177 { 177 {
178 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 178 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
179 ssize_t ret; 179 ssize_t ret;
180 180
181 /* 181 /*
182 * If we have encountered a bitmap-format file, the size limit 182 * If we have encountered a bitmap-format file, the size limit
183 * is smaller than s_maxbytes, which is for extent-mapped files. 183 * is smaller than s_maxbytes, which is for extent-mapped files.
184 */ 184 */
185 185
186 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 186 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
187 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 187 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
188 size_t length = iov_length(iov, nr_segs); 188 size_t length = iov_length(iov, nr_segs);
189 189
190 if ((pos > sbi->s_bitmap_maxbytes || 190 if ((pos > sbi->s_bitmap_maxbytes ||
191 (pos == sbi->s_bitmap_maxbytes && length > 0))) 191 (pos == sbi->s_bitmap_maxbytes && length > 0)))
192 return -EFBIG; 192 return -EFBIG;
193 193
194 if (pos + length > sbi->s_bitmap_maxbytes) { 194 if (pos + length > sbi->s_bitmap_maxbytes) {
195 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 195 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
196 sbi->s_bitmap_maxbytes - pos); 196 sbi->s_bitmap_maxbytes - pos);
197 } 197 }
198 } 198 }
199 199
200 if (unlikely(iocb->ki_filp->f_flags & O_DIRECT)) 200 if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
201 ret = ext4_file_dio_write(iocb, iov, nr_segs, pos); 201 ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
202 else 202 else
203 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 203 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
204 204
205 return ret; 205 return ret;
206 } 206 }
207 207
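iov_shorten() trims the iovec array so the total length fits inside the remaining room below s_bitmap_maxbytes. A simplified sketch of the idea (not the kernel implementation): keep whole segments while they fit and truncate the segment that crosses the byte budget.

    #include <stdio.h>
    #include <sys/uio.h>

    /* Returns the new segment count; iov is modified in place. */
    static unsigned long shorten_iov(struct iovec *iov, unsigned long nr_segs,
                                     size_t to)
    {
            unsigned long seg = 0;
            size_t len = 0;

            while (seg < nr_segs) {
                    seg++;
                    if (len + iov->iov_len >= to) {
                            iov->iov_len = to - len;
                            break;
                    }
                    len += iov->iov_len;
                    iov++;
            }
            return seg;
    }

    int main(void)
    {
            char a[100], b[100];
            struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

            /* Budget of 150 bytes: keeps segment 0, truncates segment 1 to 50. */
            printf("%lu %zu\n", shorten_iov(iov, 2, 150), iov[1].iov_len);
            return 0;
    }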
208 static const struct vm_operations_struct ext4_file_vm_ops = { 208 static const struct vm_operations_struct ext4_file_vm_ops = {
209 .fault = filemap_fault, 209 .fault = filemap_fault,
210 .page_mkwrite = ext4_page_mkwrite, 210 .page_mkwrite = ext4_page_mkwrite,
211 .remap_pages = generic_file_remap_pages, 211 .remap_pages = generic_file_remap_pages,
212 }; 212 };
213 213
214 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 214 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
215 { 215 {
216 struct address_space *mapping = file->f_mapping; 216 struct address_space *mapping = file->f_mapping;
217 217
218 if (!mapping->a_ops->readpage) 218 if (!mapping->a_ops->readpage)
219 return -ENOEXEC; 219 return -ENOEXEC;
220 file_accessed(file); 220 file_accessed(file);
221 vma->vm_ops = &ext4_file_vm_ops; 221 vma->vm_ops = &ext4_file_vm_ops;
222 return 0; 222 return 0;
223 } 223 }
224 224
225 static int ext4_file_open(struct inode * inode, struct file * filp) 225 static int ext4_file_open(struct inode * inode, struct file * filp)
226 { 226 {
227 struct super_block *sb = inode->i_sb; 227 struct super_block *sb = inode->i_sb;
228 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 228 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
229 struct ext4_inode_info *ei = EXT4_I(inode); 229 struct ext4_inode_info *ei = EXT4_I(inode);
230 struct vfsmount *mnt = filp->f_path.mnt; 230 struct vfsmount *mnt = filp->f_path.mnt;
231 struct path path; 231 struct path path;
232 char buf[64], *cp; 232 char buf[64], *cp;
233 233
234 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && 234 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
235 !(sb->s_flags & MS_RDONLY))) { 235 !(sb->s_flags & MS_RDONLY))) {
236 sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; 236 sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
237 /* 237 /*
238 * Sample where the filesystem has been mounted and 238 * Sample where the filesystem has been mounted and
239 * store it in the superblock for sysadmin convenience 239 * store it in the superblock for sysadmin convenience
240 * when trying to sort through large numbers of block 240 * when trying to sort through large numbers of block
241 * devices or filesystem images. 241 * devices or filesystem images.
242 */ 242 */
243 memset(buf, 0, sizeof(buf)); 243 memset(buf, 0, sizeof(buf));
244 path.mnt = mnt; 244 path.mnt = mnt;
245 path.dentry = mnt->mnt_root; 245 path.dentry = mnt->mnt_root;
246 cp = d_path(&path, buf, sizeof(buf)); 246 cp = d_path(&path, buf, sizeof(buf));
247 if (!IS_ERR(cp)) { 247 if (!IS_ERR(cp)) {
248 handle_t *handle; 248 handle_t *handle;
249 int err; 249 int err;
250 250
251 handle = ext4_journal_start_sb(sb, 1); 251 handle = ext4_journal_start_sb(sb, 1);
252 if (IS_ERR(handle)) 252 if (IS_ERR(handle))
253 return PTR_ERR(handle); 253 return PTR_ERR(handle);
254 err = ext4_journal_get_write_access(handle, sbi->s_sbh); 254 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
255 if (err) { 255 if (err) {
256 ext4_journal_stop(handle); 256 ext4_journal_stop(handle);
257 return err; 257 return err;
258 } 258 }
259 strlcpy(sbi->s_es->s_last_mounted, cp, 259 strlcpy(sbi->s_es->s_last_mounted, cp,
260 sizeof(sbi->s_es->s_last_mounted)); 260 sizeof(sbi->s_es->s_last_mounted));
261 ext4_handle_dirty_super(handle, sb); 261 ext4_handle_dirty_super(handle, sb);
262 ext4_journal_stop(handle); 262 ext4_journal_stop(handle);
263 } 263 }
264 } 264 }
265 /* 265 /*
266 * Set up the jbd2_inode if we are opening the inode for 266 * Set up the jbd2_inode if we are opening the inode for
267 * writing and the journal is present 267 * writing and the journal is present
268 */ 268 */
269 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { 269 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
270 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); 270 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
271 271
272 spin_lock(&inode->i_lock); 272 spin_lock(&inode->i_lock);
273 if (!ei->jinode) { 273 if (!ei->jinode) {
274 if (!jinode) { 274 if (!jinode) {
275 spin_unlock(&inode->i_lock); 275 spin_unlock(&inode->i_lock);
276 return -ENOMEM; 276 return -ENOMEM;
277 } 277 }
278 ei->jinode = jinode; 278 ei->jinode = jinode;
279 jbd2_journal_init_jbd_inode(ei->jinode, inode); 279 jbd2_journal_init_jbd_inode(ei->jinode, inode);
280 jinode = NULL; 280 jinode = NULL;
281 } 281 }
282 spin_unlock(&inode->i_lock); 282 spin_unlock(&inode->i_lock);
283 if (unlikely(jinode != NULL)) 283 if (unlikely(jinode != NULL))
284 jbd2_free_inode(jinode); 284 jbd2_free_inode(jinode);
285 } 285 }
286 return dquot_file_open(inode, filp); 286 return dquot_file_open(inode, filp);
287 } 287 }
288 288
289 /* 289 /*
290 * Here we use ext4_map_blocks() rather than ext4_ext_walk_space() to get 290 * Here we use ext4_map_blocks() rather than ext4_ext_walk_space() to get
291 * a block mapping for an extent-based file, because this lets us handle 291 * a block mapping for an extent-based file, because this lets us handle
292 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same 292 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
293 * function. Once the extent status tree has been fully implemented, it 293 * function. Once the extent status tree has been fully implemented, it
294 * will track all extent status for a file, and we can use it directly to 294 * will track all extent status for a file, and we can use it directly to
295 * retrieve the offset for SEEK_DATA/SEEK_HOLE. 295 * retrieve the offset for SEEK_DATA/SEEK_HOLE.
296 */ 296 */
297 297
298 /* 298 /*
299 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look 299 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look
300 * up the page cache to check whether there is any data between 300 * up the page cache to check whether there is any data between
301 * [startoff, endoff], because if this range contains an unwritten extent, 301 * [startoff, endoff], because if this range contains an unwritten extent,
302 * we treat the extent as data or as a hole according to whether the 302 * we treat the extent as data or as a hole according to whether the
303 * page cache has data or not. 303 * page cache has data or not.
304 */ 304 */
305 static int ext4_find_unwritten_pgoff(struct inode *inode, 305 static int ext4_find_unwritten_pgoff(struct inode *inode,
306 int origin, 306 int whence,
307 struct ext4_map_blocks *map, 307 struct ext4_map_blocks *map,
308 loff_t *offset) 308 loff_t *offset)
309 { 309 {
310 struct pagevec pvec; 310 struct pagevec pvec;
311 unsigned int blkbits; 311 unsigned int blkbits;
312 pgoff_t index; 312 pgoff_t index;
313 pgoff_t end; 313 pgoff_t end;
314 loff_t endoff; 314 loff_t endoff;
315 loff_t startoff; 315 loff_t startoff;
316 loff_t lastoff; 316 loff_t lastoff;
317 int found = 0; 317 int found = 0;
318 318
319 blkbits = inode->i_sb->s_blocksize_bits; 319 blkbits = inode->i_sb->s_blocksize_bits;
320 startoff = *offset; 320 startoff = *offset;
321 lastoff = startoff; 321 lastoff = startoff;
322 endoff = (map->m_lblk + map->m_len) << blkbits; 322 endoff = (map->m_lblk + map->m_len) << blkbits;
323 323
324 index = startoff >> PAGE_CACHE_SHIFT; 324 index = startoff >> PAGE_CACHE_SHIFT;
325 end = endoff >> PAGE_CACHE_SHIFT; 325 end = endoff >> PAGE_CACHE_SHIFT;
326 326
327 pagevec_init(&pvec, 0); 327 pagevec_init(&pvec, 0);
328 do { 328 do {
329 int i, num; 329 int i, num;
330 unsigned long nr_pages; 330 unsigned long nr_pages;
331 331
332 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); 332 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
333 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, 333 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
334 (pgoff_t)num); 334 (pgoff_t)num);
335 if (nr_pages == 0) { 335 if (nr_pages == 0) {
336 if (origin == SEEK_DATA) 336 if (whence == SEEK_DATA)
337 break; 337 break;
338 338
339 BUG_ON(origin != SEEK_HOLE); 339 BUG_ON(whence != SEEK_HOLE);
340 /* 340 /*
341 * If this is the first pass through the loop and the 341 * If this is the first pass through the loop and the
342 * offset is not beyond the end offset, there is a 342 * offset is not beyond the end offset, there is a
343 * hole at this offset. 343 * hole at this offset.
344 */ 344 */
345 if (lastoff == startoff || lastoff < endoff) 345 if (lastoff == startoff || lastoff < endoff)
346 found = 1; 346 found = 1;
347 break; 347 break;
348 } 348 }
349 349
350 /* 350 /*
351 * If this is the first pass through the loop and the 351 * If this is the first pass through the loop and the
352 * offset is smaller than the first page offset, there is a 352 * offset is smaller than the first page offset, there is a
353 * hole at this offset. 353 * hole at this offset.
354 */ 354 */
355 if (lastoff == startoff && origin == SEEK_HOLE && 355 if (lastoff == startoff && whence == SEEK_HOLE &&
356 lastoff < page_offset(pvec.pages[0])) { 356 lastoff < page_offset(pvec.pages[0])) {
357 found = 1; 357 found = 1;
358 break; 358 break;
359 } 359 }
360 360
361 for (i = 0; i < nr_pages; i++) { 361 for (i = 0; i < nr_pages; i++) {
362 struct page *page = pvec.pages[i]; 362 struct page *page = pvec.pages[i];
363 struct buffer_head *bh, *head; 363 struct buffer_head *bh, *head;
364 364
365 /* 365 /*
366 * If the current offset is not beyond the end of the given 366 * If the current offset is not beyond the end of the given
367 * range, it is a hole. 367 * range, it is a hole.
368 */ 368 */
369 if (lastoff < endoff && origin == SEEK_HOLE && 369 if (lastoff < endoff && whence == SEEK_HOLE &&
370 page->index > end) { 370 page->index > end) {
371 found = 1; 371 found = 1;
372 *offset = lastoff; 372 *offset = lastoff;
373 goto out; 373 goto out;
374 } 374 }
375 375
376 lock_page(page); 376 lock_page(page);
377 377
378 if (unlikely(page->mapping != inode->i_mapping)) { 378 if (unlikely(page->mapping != inode->i_mapping)) {
379 unlock_page(page); 379 unlock_page(page);
380 continue; 380 continue;
381 } 381 }
382 382
383 if (!page_has_buffers(page)) { 383 if (!page_has_buffers(page)) {
384 unlock_page(page); 384 unlock_page(page);
385 continue; 385 continue;
386 } 386 }
387 387
388 if (page_has_buffers(page)) { 388 if (page_has_buffers(page)) {
389 lastoff = page_offset(page); 389 lastoff = page_offset(page);
390 bh = head = page_buffers(page); 390 bh = head = page_buffers(page);
391 do { 391 do {
392 if (buffer_uptodate(bh) || 392 if (buffer_uptodate(bh) ||
393 buffer_unwritten(bh)) { 393 buffer_unwritten(bh)) {
394 if (origin == SEEK_DATA) 394 if (whence == SEEK_DATA)
395 found = 1; 395 found = 1;
396 } else { 396 } else {
397 if (origin == SEEK_HOLE) 397 if (whence == SEEK_HOLE)
398 found = 1; 398 found = 1;
399 } 399 }
400 if (found) { 400 if (found) {
401 *offset = max_t(loff_t, 401 *offset = max_t(loff_t,
402 startoff, lastoff); 402 startoff, lastoff);
403 unlock_page(page); 403 unlock_page(page);
404 goto out; 404 goto out;
405 } 405 }
406 lastoff += bh->b_size; 406 lastoff += bh->b_size;
407 bh = bh->b_this_page; 407 bh = bh->b_this_page;
408 } while (bh != head); 408 } while (bh != head);
409 } 409 }
410 410
411 lastoff = page_offset(page) + PAGE_SIZE; 411 lastoff = page_offset(page) + PAGE_SIZE;
412 unlock_page(page); 412 unlock_page(page);
413 } 413 }
414 414
415 /* 415 /*
416 * Fewer pages were found than we asked for; that indicates a 416 * Fewer pages were found than we asked for; that indicates a
417 * hole in the range. 417 * hole in the range.
418 */ 418 */
419 if (nr_pages < num && origin == SEEK_HOLE) { 419 if (nr_pages < num && whence == SEEK_HOLE) {
420 found = 1; 420 found = 1;
421 *offset = lastoff; 421 *offset = lastoff;
422 break; 422 break;
423 } 423 }
424 424
425 index = pvec.pages[i - 1]->index + 1; 425 index = pvec.pages[i - 1]->index + 1;
426 pagevec_release(&pvec); 426 pagevec_release(&pvec);
427 } while (index <= end); 427 } while (index <= end);
428 428
429 out: 429 out:
430 pagevec_release(&pvec); 430 pagevec_release(&pvec);
431 return found; 431 return found;
432 } 432 }
433 433
434 /* 434 /*
435 * ext4_seek_data() retrieves the offset for SEEK_DATA. 435 * ext4_seek_data() retrieves the offset for SEEK_DATA.
436 */ 436 */
437 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 437 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
438 { 438 {
439 struct inode *inode = file->f_mapping->host; 439 struct inode *inode = file->f_mapping->host;
440 struct ext4_map_blocks map; 440 struct ext4_map_blocks map;
441 struct extent_status es; 441 struct extent_status es;
442 ext4_lblk_t start, last, end; 442 ext4_lblk_t start, last, end;
443 loff_t dataoff, isize; 443 loff_t dataoff, isize;
444 int blkbits; 444 int blkbits;
445 int ret = 0; 445 int ret = 0;
446 446
447 mutex_lock(&inode->i_mutex); 447 mutex_lock(&inode->i_mutex);
448 448
449 isize = i_size_read(inode); 449 isize = i_size_read(inode);
450 if (offset >= isize) { 450 if (offset >= isize) {
451 mutex_unlock(&inode->i_mutex); 451 mutex_unlock(&inode->i_mutex);
452 return -ENXIO; 452 return -ENXIO;
453 } 453 }
454 454
455 blkbits = inode->i_sb->s_blocksize_bits; 455 blkbits = inode->i_sb->s_blocksize_bits;
456 start = offset >> blkbits; 456 start = offset >> blkbits;
457 last = start; 457 last = start;
458 end = isize >> blkbits; 458 end = isize >> blkbits;
459 dataoff = offset; 459 dataoff = offset;
460 460
461 do { 461 do {
462 map.m_lblk = last; 462 map.m_lblk = last;
463 map.m_len = end - last + 1; 463 map.m_len = end - last + 1;
464 ret = ext4_map_blocks(NULL, inode, &map, 0); 464 ret = ext4_map_blocks(NULL, inode, &map, 0);
465 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 465 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
466 if (last != start) 466 if (last != start)
467 dataoff = last << blkbits; 467 dataoff = last << blkbits;
468 break; 468 break;
469 } 469 }
470 470
471 /* 471 /*
472 * If there is a delayed extent at this offset, 472 * If there is a delayed extent at this offset,
473 * it is treated as data. 473 * it is treated as data.
474 */ 474 */
475 es.start = last; 475 es.start = last;
476 (void)ext4_es_find_extent(inode, &es); 476 (void)ext4_es_find_extent(inode, &es);
477 if (last >= es.start && 477 if (last >= es.start &&
478 last < es.start + es.len) { 478 last < es.start + es.len) {
479 if (last != start) 479 if (last != start)
480 dataoff = last << blkbits; 480 dataoff = last << blkbits;
481 break; 481 break;
482 } 482 }
483 483
484 /* 484 /*
485 * If there is an unwritten extent at this offset, 485 * If there is an unwritten extent at this offset,
486 * it is treated as data or as a hole according to 486 * it is treated as data or as a hole according to
487 * whether the page cache has data or not. 487 * whether the page cache has data or not.
488 */ 488 */
489 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 489 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
490 int unwritten; 490 int unwritten;
491 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 491 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
492 &map, &dataoff); 492 &map, &dataoff);
493 if (unwritten) 493 if (unwritten)
494 break; 494 break;
495 } 495 }
496 496
497 last++; 497 last++;
498 dataoff = last << blkbits; 498 dataoff = last << blkbits;
499 } while (last <= end); 499 } while (last <= end);
500 500
501 mutex_unlock(&inode->i_mutex); 501 mutex_unlock(&inode->i_mutex);
502 502
503 if (dataoff > isize) 503 if (dataoff > isize)
504 return -ENXIO; 504 return -ENXIO;
505 505
506 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 506 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
507 return -EINVAL; 507 return -EINVAL;
508 if (dataoff > maxsize) 508 if (dataoff > maxsize)
509 return -EINVAL; 509 return -EINVAL;
510 510
511 if (dataoff != file->f_pos) { 511 if (dataoff != file->f_pos) {
512 file->f_pos = dataoff; 512 file->f_pos = dataoff;
513 file->f_version = 0; 513 file->f_version = 0;
514 } 514 }
515 515
516 return dataoff; 516 return dataoff;
517 } 517 }
518 518
519 /* 519 /*
520 * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 520 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
521 */ 521 */
522 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 522 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
523 { 523 {
524 struct inode *inode = file->f_mapping->host; 524 struct inode *inode = file->f_mapping->host;
525 struct ext4_map_blocks map; 525 struct ext4_map_blocks map;
526 struct extent_status es; 526 struct extent_status es;
527 ext4_lblk_t start, last, end; 527 ext4_lblk_t start, last, end;
528 loff_t holeoff, isize; 528 loff_t holeoff, isize;
529 int blkbits; 529 int blkbits;
530 int ret = 0; 530 int ret = 0;
531 531
532 mutex_lock(&inode->i_mutex); 532 mutex_lock(&inode->i_mutex);
533 533
534 isize = i_size_read(inode); 534 isize = i_size_read(inode);
535 if (offset >= isize) { 535 if (offset >= isize) {
536 mutex_unlock(&inode->i_mutex); 536 mutex_unlock(&inode->i_mutex);
537 return -ENXIO; 537 return -ENXIO;
538 } 538 }
539 539
540 blkbits = inode->i_sb->s_blocksize_bits; 540 blkbits = inode->i_sb->s_blocksize_bits;
541 start = offset >> blkbits; 541 start = offset >> blkbits;
542 last = start; 542 last = start;
543 end = isize >> blkbits; 543 end = isize >> blkbits;
544 holeoff = offset; 544 holeoff = offset;
545 545
546 do { 546 do {
547 map.m_lblk = last; 547 map.m_lblk = last;
548 map.m_len = end - last + 1; 548 map.m_len = end - last + 1;
549 ret = ext4_map_blocks(NULL, inode, &map, 0); 549 ret = ext4_map_blocks(NULL, inode, &map, 0);
550 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 550 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
551 last += ret; 551 last += ret;
552 holeoff = last << blkbits; 552 holeoff = last << blkbits;
553 continue; 553 continue;
554 } 554 }
555 555
556 /* 556 /*
557 * If there is a delayed extent at this offset, 557 * If there is a delayed extent at this offset,
558 * we skip this extent. 558 * we skip this extent.
559 */ 559 */
560 es.start = last; 560 es.start = last;
561 (void)ext4_es_find_extent(inode, &es); 561 (void)ext4_es_find_extent(inode, &es);
562 if (last >= es.start && 562 if (last >= es.start &&
563 last < es.start + es.len) { 563 last < es.start + es.len) {
564 last = es.start + es.len; 564 last = es.start + es.len;
565 holeoff = last << blkbits; 565 holeoff = last << blkbits;
566 continue; 566 continue;
567 } 567 }
568 568
569 /* 569 /*
570 * If there is an unwritten extent at this offset, 570 * If there is an unwritten extent at this offset,
571 * it is treated as data or as a hole according to 571 * it is treated as data or as a hole according to
572 * whether the page cache has data or not. 572 * whether the page cache has data or not.
573 */ 573 */
574 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 574 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
575 int unwritten; 575 int unwritten;
576 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 576 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
577 &map, &holeoff); 577 &map, &holeoff);
578 if (!unwritten) { 578 if (!unwritten) {
579 last += ret; 579 last += ret;
580 holeoff = last << blkbits; 580 holeoff = last << blkbits;
581 continue; 581 continue;
582 } 582 }
583 } 583 }
584 584
585 /* found a hole */ 585 /* found a hole */
586 break; 586 break;
587 } while (last <= end); 587 } while (last <= end);
588 588
589 mutex_unlock(&inode->i_mutex); 589 mutex_unlock(&inode->i_mutex);
590 590
591 if (holeoff > isize) 591 if (holeoff > isize)
592 holeoff = isize; 592 holeoff = isize;
593 593
594 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 594 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
595 return -EINVAL; 595 return -EINVAL;
596 if (holeoff > maxsize) 596 if (holeoff > maxsize)
597 return -EINVAL; 597 return -EINVAL;
598 598
599 if (holeoff != file->f_pos) { 599 if (holeoff != file->f_pos) {
600 file->f_pos = holeoff; 600 file->f_pos = holeoff;
601 file->f_version = 0; 601 file->f_version = 0;
602 } 602 }
603 603
604 return holeoff; 604 return holeoff;
605 } 605 }
606 606
607 /* 607 /*
608 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values 608 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
609 * by calling generic_file_llseek_size() with the appropriate maxbytes 609 * by calling generic_file_llseek_size() with the appropriate maxbytes
610 * value for each. 610 * value for each.
611 */ 611 */
612 loff_t ext4_llseek(struct file *file, loff_t offset, int origin) 612 loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
613 { 613 {
614 struct inode *inode = file->f_mapping->host; 614 struct inode *inode = file->f_mapping->host;
615 loff_t maxbytes; 615 loff_t maxbytes;
616 616
617 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 617 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
618 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; 618 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
619 else 619 else
620 maxbytes = inode->i_sb->s_maxbytes; 620 maxbytes = inode->i_sb->s_maxbytes;
621 621
622 switch (origin) { 622 switch (whence) {
623 case SEEK_SET: 623 case SEEK_SET:
624 case SEEK_CUR: 624 case SEEK_CUR:
625 case SEEK_END: 625 case SEEK_END:
626 return generic_file_llseek_size(file, offset, origin, 626 return generic_file_llseek_size(file, offset, whence,
627 maxbytes, i_size_read(inode)); 627 maxbytes, i_size_read(inode));
628 case SEEK_DATA: 628 case SEEK_DATA:
629 return ext4_seek_data(file, offset, maxbytes); 629 return ext4_seek_data(file, offset, maxbytes);
630 case SEEK_HOLE: 630 case SEEK_HOLE:
631 return ext4_seek_hole(file, offset, maxbytes); 631 return ext4_seek_hole(file, offset, maxbytes);
632 } 632 }
633 633
634 return -EINVAL; 634 return -EINVAL;
635 } 635 }
636 636
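From userspace, SEEK_DATA and SEEK_HOLE are simply two more "whence" values for lseek(2), alongside SEEK_SET/SEEK_CUR/SEEK_END. A small sketch that walks the data regions of a file (error handling kept minimal):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            off_t data = 0, hole;
            int fd;

            if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;
            /* SEEK_DATA fails with ENXIO once no data remains past the offset. */
            while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
                    hole = lseek(fd, data, SEEK_HOLE);
                    printf("data: [%lld, %lld)\n", (long long)data, (long long)hole);
                    data = hole;
            }
            close(fd);
            return 0;
    }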
637 const struct file_operations ext4_file_operations = { 637 const struct file_operations ext4_file_operations = {
638 .llseek = ext4_llseek, 638 .llseek = ext4_llseek,
639 .read = do_sync_read, 639 .read = do_sync_read,
640 .write = do_sync_write, 640 .write = do_sync_write,
641 .aio_read = generic_file_aio_read, 641 .aio_read = generic_file_aio_read,
642 .aio_write = ext4_file_write, 642 .aio_write = ext4_file_write,
643 .unlocked_ioctl = ext4_ioctl, 643 .unlocked_ioctl = ext4_ioctl,
644 #ifdef CONFIG_COMPAT 644 #ifdef CONFIG_COMPAT
645 .compat_ioctl = ext4_compat_ioctl, 645 .compat_ioctl = ext4_compat_ioctl,
646 #endif 646 #endif
647 .mmap = ext4_file_mmap, 647 .mmap = ext4_file_mmap,
648 .open = ext4_file_open, 648 .open = ext4_file_open,
649 .release = ext4_release_file, 649 .release = ext4_release_file,
650 .fsync = ext4_sync_file, 650 .fsync = ext4_sync_file,
651 .splice_read = generic_file_splice_read, 651 .splice_read = generic_file_splice_read,
652 .splice_write = generic_file_splice_write, 652 .splice_write = generic_file_splice_write,
653 .fallocate = ext4_fallocate, 653 .fallocate = ext4_fallocate,
654 }; 654 };
655 655
656 const struct inode_operations ext4_file_inode_operations = { 656 const struct inode_operations ext4_file_inode_operations = {
657 .setattr = ext4_setattr, 657 .setattr = ext4_setattr,
658 .getattr = ext4_getattr, 658 .getattr = ext4_getattr,
659 .setxattr = generic_setxattr, 659 .setxattr = generic_setxattr,
660 .getxattr = generic_getxattr, 660 .getxattr = generic_getxattr,
661 .listxattr = ext4_listxattr, 661 .listxattr = ext4_listxattr,
662 .removexattr = generic_removexattr, 662 .removexattr = generic_removexattr,
663 .get_acl = ext4_get_acl, 663 .get_acl = ext4_get_acl,
664 .fiemap = ext4_fiemap, 664 .fiemap = ext4_fiemap,
665 }; 665 };
666 666
667 667
1 /* 1 /*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
7 */ 7 */
8 8
9 #include "fuse_i.h" 9 #include "fuse_i.h"
10 10
11 #include <linux/pagemap.h> 11 #include <linux/pagemap.h>
12 #include <linux/slab.h> 12 #include <linux/slab.h>
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/sched.h> 14 #include <linux/sched.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/compat.h> 16 #include <linux/compat.h>
17 #include <linux/swap.h> 17 #include <linux/swap.h>
18 18
19 static const struct file_operations fuse_direct_io_file_operations; 19 static const struct file_operations fuse_direct_io_file_operations;
20 20
21 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 21 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
22 int opcode, struct fuse_open_out *outargp) 22 int opcode, struct fuse_open_out *outargp)
23 { 23 {
24 struct fuse_open_in inarg; 24 struct fuse_open_in inarg;
25 struct fuse_req *req; 25 struct fuse_req *req;
26 int err; 26 int err;
27 27
28 req = fuse_get_req(fc); 28 req = fuse_get_req(fc);
29 if (IS_ERR(req)) 29 if (IS_ERR(req))
30 return PTR_ERR(req); 30 return PTR_ERR(req);
31 31
32 memset(&inarg, 0, sizeof(inarg)); 32 memset(&inarg, 0, sizeof(inarg));
33 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 33 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
34 if (!fc->atomic_o_trunc) 34 if (!fc->atomic_o_trunc)
35 inarg.flags &= ~O_TRUNC; 35 inarg.flags &= ~O_TRUNC;
36 req->in.h.opcode = opcode; 36 req->in.h.opcode = opcode;
37 req->in.h.nodeid = nodeid; 37 req->in.h.nodeid = nodeid;
38 req->in.numargs = 1; 38 req->in.numargs = 1;
39 req->in.args[0].size = sizeof(inarg); 39 req->in.args[0].size = sizeof(inarg);
40 req->in.args[0].value = &inarg; 40 req->in.args[0].value = &inarg;
41 req->out.numargs = 1; 41 req->out.numargs = 1;
42 req->out.args[0].size = sizeof(*outargp); 42 req->out.args[0].size = sizeof(*outargp);
43 req->out.args[0].value = outargp; 43 req->out.args[0].value = outargp;
44 fuse_request_send(fc, req); 44 fuse_request_send(fc, req);
45 err = req->out.h.error; 45 err = req->out.h.error;
46 fuse_put_request(fc, req); 46 fuse_put_request(fc, req);
47 47
48 return err; 48 return err;
49 } 49 }
50 50
51 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) 51 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
52 { 52 {
53 struct fuse_file *ff; 53 struct fuse_file *ff;
54 54
55 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 55 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
56 if (unlikely(!ff)) 56 if (unlikely(!ff))
57 return NULL; 57 return NULL;
58 58
59 ff->fc = fc; 59 ff->fc = fc;
60 ff->reserved_req = fuse_request_alloc(); 60 ff->reserved_req = fuse_request_alloc();
61 if (unlikely(!ff->reserved_req)) { 61 if (unlikely(!ff->reserved_req)) {
62 kfree(ff); 62 kfree(ff);
63 return NULL; 63 return NULL;
64 } 64 }
65 65
66 INIT_LIST_HEAD(&ff->write_entry); 66 INIT_LIST_HEAD(&ff->write_entry);
67 atomic_set(&ff->count, 0); 67 atomic_set(&ff->count, 0);
68 RB_CLEAR_NODE(&ff->polled_node); 68 RB_CLEAR_NODE(&ff->polled_node);
69 init_waitqueue_head(&ff->poll_wait); 69 init_waitqueue_head(&ff->poll_wait);
70 70
71 spin_lock(&fc->lock); 71 spin_lock(&fc->lock);
72 ff->kh = ++fc->khctr; 72 ff->kh = ++fc->khctr;
73 spin_unlock(&fc->lock); 73 spin_unlock(&fc->lock);
74 74
75 return ff; 75 return ff;
76 } 76 }
77 77
78 void fuse_file_free(struct fuse_file *ff) 78 void fuse_file_free(struct fuse_file *ff)
79 { 79 {
80 fuse_request_free(ff->reserved_req); 80 fuse_request_free(ff->reserved_req);
81 kfree(ff); 81 kfree(ff);
82 } 82 }
83 83
84 struct fuse_file *fuse_file_get(struct fuse_file *ff) 84 struct fuse_file *fuse_file_get(struct fuse_file *ff)
85 { 85 {
86 atomic_inc(&ff->count); 86 atomic_inc(&ff->count);
87 return ff; 87 return ff;
88 } 88 }
89 89
90 static void fuse_release_async(struct work_struct *work) 90 static void fuse_release_async(struct work_struct *work)
91 { 91 {
92 struct fuse_req *req; 92 struct fuse_req *req;
93 struct fuse_conn *fc; 93 struct fuse_conn *fc;
94 struct path path; 94 struct path path;
95 95
96 req = container_of(work, struct fuse_req, misc.release.work); 96 req = container_of(work, struct fuse_req, misc.release.work);
97 path = req->misc.release.path; 97 path = req->misc.release.path;
98 fc = get_fuse_conn(path.dentry->d_inode); 98 fc = get_fuse_conn(path.dentry->d_inode);
99 99
100 fuse_put_request(fc, req); 100 fuse_put_request(fc, req);
101 path_put(&path); 101 path_put(&path);
102 } 102 }
103 103
104 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 104 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
105 { 105 {
106 if (fc->destroy_req) { 106 if (fc->destroy_req) {
107 /* 107 /*
108 * If this is a fuseblk mount, then it's possible that 108 * If this is a fuseblk mount, then it's possible that
109 * releasing the path will result in releasing the 109 * releasing the path will result in releasing the
110 * super block and sending the DESTROY request. If 110 * super block and sending the DESTROY request. If
111 * the server is single threaded, this would hang. 111 * the server is single threaded, this would hang.
112 * For this reason do the path_put() in a separate 112 * For this reason do the path_put() in a separate
113 * thread. 113 * thread.
114 */ 114 */
115 atomic_inc(&req->count); 115 atomic_inc(&req->count);
116 INIT_WORK(&req->misc.release.work, fuse_release_async); 116 INIT_WORK(&req->misc.release.work, fuse_release_async);
117 schedule_work(&req->misc.release.work); 117 schedule_work(&req->misc.release.work);
118 } else { 118 } else {
119 path_put(&req->misc.release.path); 119 path_put(&req->misc.release.path);
120 } 120 }
121 } 121 }
122 122
123 static void fuse_file_put(struct fuse_file *ff, bool sync) 123 static void fuse_file_put(struct fuse_file *ff, bool sync)
124 { 124 {
125 if (atomic_dec_and_test(&ff->count)) { 125 if (atomic_dec_and_test(&ff->count)) {
126 struct fuse_req *req = ff->reserved_req; 126 struct fuse_req *req = ff->reserved_req;
127 127
128 if (sync) { 128 if (sync) {
129 fuse_request_send(ff->fc, req); 129 fuse_request_send(ff->fc, req);
130 path_put(&req->misc.release.path); 130 path_put(&req->misc.release.path);
131 fuse_put_request(ff->fc, req); 131 fuse_put_request(ff->fc, req);
132 } else { 132 } else {
133 req->end = fuse_release_end; 133 req->end = fuse_release_end;
134 fuse_request_send_background(ff->fc, req); 134 fuse_request_send_background(ff->fc, req);
135 } 135 }
136 kfree(ff); 136 kfree(ff);
137 } 137 }
138 } 138 }
139 139
140 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 140 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
141 bool isdir) 141 bool isdir)
142 { 142 {
143 struct fuse_open_out outarg; 143 struct fuse_open_out outarg;
144 struct fuse_file *ff; 144 struct fuse_file *ff;
145 int err; 145 int err;
146 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 146 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
147 147
148 ff = fuse_file_alloc(fc); 148 ff = fuse_file_alloc(fc);
149 if (!ff) 149 if (!ff)
150 return -ENOMEM; 150 return -ENOMEM;
151 151
152 err = fuse_send_open(fc, nodeid, file, opcode, &outarg); 152 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
153 if (err) { 153 if (err) {
154 fuse_file_free(ff); 154 fuse_file_free(ff);
155 return err; 155 return err;
156 } 156 }
157 157
158 if (isdir) 158 if (isdir)
159 outarg.open_flags &= ~FOPEN_DIRECT_IO; 159 outarg.open_flags &= ~FOPEN_DIRECT_IO;
160 160
161 ff->fh = outarg.fh; 161 ff->fh = outarg.fh;
162 ff->nodeid = nodeid; 162 ff->nodeid = nodeid;
163 ff->open_flags = outarg.open_flags; 163 ff->open_flags = outarg.open_flags;
164 file->private_data = fuse_file_get(ff); 164 file->private_data = fuse_file_get(ff);
165 165
166 return 0; 166 return 0;
167 } 167 }
168 EXPORT_SYMBOL_GPL(fuse_do_open); 168 EXPORT_SYMBOL_GPL(fuse_do_open);
169 169
170 void fuse_finish_open(struct inode *inode, struct file *file) 170 void fuse_finish_open(struct inode *inode, struct file *file)
171 { 171 {
172 struct fuse_file *ff = file->private_data; 172 struct fuse_file *ff = file->private_data;
173 struct fuse_conn *fc = get_fuse_conn(inode); 173 struct fuse_conn *fc = get_fuse_conn(inode);
174 174
175 if (ff->open_flags & FOPEN_DIRECT_IO) 175 if (ff->open_flags & FOPEN_DIRECT_IO)
176 file->f_op = &fuse_direct_io_file_operations; 176 file->f_op = &fuse_direct_io_file_operations;
177 if (!(ff->open_flags & FOPEN_KEEP_CACHE)) 177 if (!(ff->open_flags & FOPEN_KEEP_CACHE))
178 invalidate_inode_pages2(inode->i_mapping); 178 invalidate_inode_pages2(inode->i_mapping);
179 if (ff->open_flags & FOPEN_NONSEEKABLE) 179 if (ff->open_flags & FOPEN_NONSEEKABLE)
180 nonseekable_open(inode, file); 180 nonseekable_open(inode, file);
181 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { 181 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
182 struct fuse_inode *fi = get_fuse_inode(inode); 182 struct fuse_inode *fi = get_fuse_inode(inode);
183 183
184 spin_lock(&fc->lock); 184 spin_lock(&fc->lock);
185 fi->attr_version = ++fc->attr_version; 185 fi->attr_version = ++fc->attr_version;
186 i_size_write(inode, 0); 186 i_size_write(inode, 0);
187 spin_unlock(&fc->lock); 187 spin_unlock(&fc->lock);
188 fuse_invalidate_attr(inode); 188 fuse_invalidate_attr(inode);
189 } 189 }
190 } 190 }
191 191
192 int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 192 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
193 { 193 {
194 struct fuse_conn *fc = get_fuse_conn(inode); 194 struct fuse_conn *fc = get_fuse_conn(inode);
195 int err; 195 int err;
196 196
197 err = generic_file_open(inode, file); 197 err = generic_file_open(inode, file);
198 if (err) 198 if (err)
199 return err; 199 return err;
200 200
201 err = fuse_do_open(fc, get_node_id(inode), file, isdir); 201 err = fuse_do_open(fc, get_node_id(inode), file, isdir);
202 if (err) 202 if (err)
203 return err; 203 return err;
204 204
205 fuse_finish_open(inode, file); 205 fuse_finish_open(inode, file);
206 206
207 return 0; 207 return 0;
208 } 208 }
209 209
210 static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) 210 static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
211 { 211 {
212 struct fuse_conn *fc = ff->fc; 212 struct fuse_conn *fc = ff->fc;
213 struct fuse_req *req = ff->reserved_req; 213 struct fuse_req *req = ff->reserved_req;
214 struct fuse_release_in *inarg = &req->misc.release.in; 214 struct fuse_release_in *inarg = &req->misc.release.in;
215 215
216 spin_lock(&fc->lock); 216 spin_lock(&fc->lock);
217 list_del(&ff->write_entry); 217 list_del(&ff->write_entry);
218 if (!RB_EMPTY_NODE(&ff->polled_node)) 218 if (!RB_EMPTY_NODE(&ff->polled_node))
219 rb_erase(&ff->polled_node, &fc->polled_files); 219 rb_erase(&ff->polled_node, &fc->polled_files);
220 spin_unlock(&fc->lock); 220 spin_unlock(&fc->lock);
221 221
222 wake_up_interruptible_all(&ff->poll_wait); 222 wake_up_interruptible_all(&ff->poll_wait);
223 223
224 inarg->fh = ff->fh; 224 inarg->fh = ff->fh;
225 inarg->flags = flags; 225 inarg->flags = flags;
226 req->in.h.opcode = opcode; 226 req->in.h.opcode = opcode;
227 req->in.h.nodeid = ff->nodeid; 227 req->in.h.nodeid = ff->nodeid;
228 req->in.numargs = 1; 228 req->in.numargs = 1;
229 req->in.args[0].size = sizeof(struct fuse_release_in); 229 req->in.args[0].size = sizeof(struct fuse_release_in);
230 req->in.args[0].value = inarg; 230 req->in.args[0].value = inarg;
231 } 231 }
232 232
233 void fuse_release_common(struct file *file, int opcode) 233 void fuse_release_common(struct file *file, int opcode)
234 { 234 {
235 struct fuse_file *ff; 235 struct fuse_file *ff;
236 struct fuse_req *req; 236 struct fuse_req *req;
237 237
238 ff = file->private_data; 238 ff = file->private_data;
239 if (unlikely(!ff)) 239 if (unlikely(!ff))
240 return; 240 return;
241 241
242 req = ff->reserved_req; 242 req = ff->reserved_req;
243 fuse_prepare_release(ff, file->f_flags, opcode); 243 fuse_prepare_release(ff, file->f_flags, opcode);
244 244
245 if (ff->flock) { 245 if (ff->flock) {
246 struct fuse_release_in *inarg = &req->misc.release.in; 246 struct fuse_release_in *inarg = &req->misc.release.in;
247 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; 247 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
248 inarg->lock_owner = fuse_lock_owner_id(ff->fc, 248 inarg->lock_owner = fuse_lock_owner_id(ff->fc,
249 (fl_owner_t) file); 249 (fl_owner_t) file);
250 } 250 }
251 /* Hold vfsmount and dentry until release is finished */ 251 /* Hold vfsmount and dentry until release is finished */
252 path_get(&file->f_path); 252 path_get(&file->f_path);
253 req->misc.release.path = file->f_path; 253 req->misc.release.path = file->f_path;
254 254
255 /* 255 /*
256 * Normally this will send the RELEASE request, however if 256 * Normally this will send the RELEASE request, however if
257 * some asynchronous READ or WRITE requests are outstanding, 257 * some asynchronous READ or WRITE requests are outstanding,
258 * the sending will be delayed. 258 * the sending will be delayed.
259 * 259 *
260 * Make the release synchronous if this is a fuseblk mount, 260 * Make the release synchronous if this is a fuseblk mount,
261 * synchronous RELEASE is allowed (and desirable) in this case 261 * synchronous RELEASE is allowed (and desirable) in this case
262 * because the server can be trusted not to screw up. 262 * because the server can be trusted not to screw up.
263 */ 263 */
264 fuse_file_put(ff, ff->fc->destroy_req != NULL); 264 fuse_file_put(ff, ff->fc->destroy_req != NULL);
265 } 265 }
266 266
267 static int fuse_open(struct inode *inode, struct file *file) 267 static int fuse_open(struct inode *inode, struct file *file)
268 { 268 {
269 return fuse_open_common(inode, file, false); 269 return fuse_open_common(inode, file, false);
270 } 270 }
271 271
272 static int fuse_release(struct inode *inode, struct file *file) 272 static int fuse_release(struct inode *inode, struct file *file)
273 { 273 {
274 fuse_release_common(file, FUSE_RELEASE); 274 fuse_release_common(file, FUSE_RELEASE);
275 275
276 /* return value is ignored by VFS */ 276 /* return value is ignored by VFS */
277 return 0; 277 return 0;
278 } 278 }
279 279
280 void fuse_sync_release(struct fuse_file *ff, int flags) 280 void fuse_sync_release(struct fuse_file *ff, int flags)
281 { 281 {
282 WARN_ON(atomic_read(&ff->count) > 1); 282 WARN_ON(atomic_read(&ff->count) > 1);
283 fuse_prepare_release(ff, flags, FUSE_RELEASE); 283 fuse_prepare_release(ff, flags, FUSE_RELEASE);
284 ff->reserved_req->force = 1; 284 ff->reserved_req->force = 1;
285 fuse_request_send(ff->fc, ff->reserved_req); 285 fuse_request_send(ff->fc, ff->reserved_req);
286 fuse_put_request(ff->fc, ff->reserved_req); 286 fuse_put_request(ff->fc, ff->reserved_req);
287 kfree(ff); 287 kfree(ff);
288 } 288 }
289 EXPORT_SYMBOL_GPL(fuse_sync_release); 289 EXPORT_SYMBOL_GPL(fuse_sync_release);
290 290
291 /* 291 /*
292 * Scramble the ID space with XTEA, so that the value of the files_struct 292 * Scramble the ID space with XTEA, so that the value of the files_struct
293 * pointer is not exposed to userspace. 293 * pointer is not exposed to userspace.
294 */ 294 */
295 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) 295 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
296 { 296 {
297 u32 *k = fc->scramble_key; 297 u32 *k = fc->scramble_key;
298 u64 v = (unsigned long) id; 298 u64 v = (unsigned long) id;
299 u32 v0 = v; 299 u32 v0 = v;
300 u32 v1 = v >> 32; 300 u32 v1 = v >> 32;
301 u32 sum = 0; 301 u32 sum = 0;
302 int i; 302 int i;
303 303
304 for (i = 0; i < 32; i++) { 304 for (i = 0; i < 32; i++) {
305 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); 305 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
306 sum += 0x9E3779B9; 306 sum += 0x9E3779B9;
307 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); 307 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
308 } 308 }
309 309
310 return (u64) v0 + ((u64) v1 << 32); 310 return (u64) v0 + ((u64) v1 << 32);
311 } 311 }
312 312
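The function above is a straight 32-round XTEA encipher of the 64-bit pointer value, keyed by the per-connection scramble_key. A userspace replica of the same rounds (the key below is a made-up placeholder, not anything the kernel uses):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t scramble(uint64_t v, const uint32_t k[4])
    {
            uint32_t v0 = (uint32_t)v;
            uint32_t v1 = (uint32_t)(v >> 32);
            uint32_t sum = 0;
            int i;

            for (i = 0; i < 32; i++) {
                    v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
                    sum += 0x9E3779B9;
                    v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum >> 11 & 3]);
            }
            return (uint64_t)v0 + ((uint64_t)v1 << 32);
    }

    int main(void)
    {
            const uint32_t key[4] = { 0x1, 0x2, 0x3, 0x4 };  /* placeholder key */

            printf("%llx\n", (unsigned long long)scramble(0xdeadbeefULL, key));
            return 0;
    }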
313 /* 313 /*
314 * Check if page is under writeback 314 * Check if page is under writeback
315 * 315 *
316 * This is currently done by walking the list of writepage requests 316 * This is currently done by walking the list of writepage requests
317 * for the inode, which can be pretty inefficient. 317 * for the inode, which can be pretty inefficient.
318 */ 318 */
319 static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) 319 static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
320 { 320 {
321 struct fuse_conn *fc = get_fuse_conn(inode); 321 struct fuse_conn *fc = get_fuse_conn(inode);
322 struct fuse_inode *fi = get_fuse_inode(inode); 322 struct fuse_inode *fi = get_fuse_inode(inode);
323 struct fuse_req *req; 323 struct fuse_req *req;
324 bool found = false; 324 bool found = false;
325 325
326 spin_lock(&fc->lock); 326 spin_lock(&fc->lock);
327 list_for_each_entry(req, &fi->writepages, writepages_entry) { 327 list_for_each_entry(req, &fi->writepages, writepages_entry) {
328 pgoff_t curr_index; 328 pgoff_t curr_index;
329 329
330 BUG_ON(req->inode != inode); 330 BUG_ON(req->inode != inode);
331 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; 331 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
332 if (curr_index == index) { 332 if (curr_index == index) {
333 found = true; 333 found = true;
334 break; 334 break;
335 } 335 }
336 } 336 }
337 spin_unlock(&fc->lock); 337 spin_unlock(&fc->lock);
338 338
339 return found; 339 return found;
340 } 340 }
341 341
342 /* 342 /*
343 * Wait for page writeback to be completed. 343 * Wait for page writeback to be completed.
344 * 344 *
345 * Since fuse doesn't rely on the VM writeback tracking, this has to 345 * Since fuse doesn't rely on the VM writeback tracking, this has to
346 * use some other means. 346 * use some other means.
347 */ 347 */
348 static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) 348 static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
349 { 349 {
350 struct fuse_inode *fi = get_fuse_inode(inode); 350 struct fuse_inode *fi = get_fuse_inode(inode);
351 351
352 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); 352 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
353 return 0; 353 return 0;
354 } 354 }
355 355
356 static int fuse_flush(struct file *file, fl_owner_t id) 356 static int fuse_flush(struct file *file, fl_owner_t id)
357 { 357 {
358 struct inode *inode = file->f_path.dentry->d_inode; 358 struct inode *inode = file->f_path.dentry->d_inode;
359 struct fuse_conn *fc = get_fuse_conn(inode); 359 struct fuse_conn *fc = get_fuse_conn(inode);
360 struct fuse_file *ff = file->private_data; 360 struct fuse_file *ff = file->private_data;
361 struct fuse_req *req; 361 struct fuse_req *req;
362 struct fuse_flush_in inarg; 362 struct fuse_flush_in inarg;
363 int err; 363 int err;
364 364
365 if (is_bad_inode(inode)) 365 if (is_bad_inode(inode))
366 return -EIO; 366 return -EIO;
367 367
368 if (fc->no_flush) 368 if (fc->no_flush)
369 return 0; 369 return 0;
370 370
371 req = fuse_get_req_nofail(fc, file); 371 req = fuse_get_req_nofail(fc, file);
372 memset(&inarg, 0, sizeof(inarg)); 372 memset(&inarg, 0, sizeof(inarg));
373 inarg.fh = ff->fh; 373 inarg.fh = ff->fh;
374 inarg.lock_owner = fuse_lock_owner_id(fc, id); 374 inarg.lock_owner = fuse_lock_owner_id(fc, id);
375 req->in.h.opcode = FUSE_FLUSH; 375 req->in.h.opcode = FUSE_FLUSH;
376 req->in.h.nodeid = get_node_id(inode); 376 req->in.h.nodeid = get_node_id(inode);
377 req->in.numargs = 1; 377 req->in.numargs = 1;
378 req->in.args[0].size = sizeof(inarg); 378 req->in.args[0].size = sizeof(inarg);
379 req->in.args[0].value = &inarg; 379 req->in.args[0].value = &inarg;
380 req->force = 1; 380 req->force = 1;
381 fuse_request_send(fc, req); 381 fuse_request_send(fc, req);
382 err = req->out.h.error; 382 err = req->out.h.error;
383 fuse_put_request(fc, req); 383 fuse_put_request(fc, req);
384 if (err == -ENOSYS) { 384 if (err == -ENOSYS) {
385 fc->no_flush = 1; 385 fc->no_flush = 1;
386 err = 0; 386 err = 0;
387 } 387 }
388 return err; 388 return err;
389 } 389 }
390 390
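fuse_flush() also shows the capability-caching idiom used throughout this file: the first -ENOSYS reply sets a per-connection flag, and later calls short-circuit to success without a round trip. A compilable sketch of the pattern, with hypothetical names (conn, send_flush_request):

    #include <errno.h>

    struct conn {
            unsigned no_flush:1;
    };

    /* Stub standing in for the real transport; always "unimplemented". */
    static int send_flush_request(struct conn *c)
    {
            (void)c;
            return -ENOSYS;
    }

    static int do_flush(struct conn *c)
    {
            int err;

            if (c->no_flush)                /* remembered from an earlier reply */
                    return 0;
            err = send_flush_request(c);
            if (err == -ENOSYS) {
                    c->no_flush = 1;        /* never ask again */
                    err = 0;
            }
            return err;
    }

    int main(void)
    {
            struct conn c = { 0 };

            return do_flush(&c);    /* caches the -ENOSYS, returns 0 */
    }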
391 /* 391 /*
392 * Wait for all pending writepages on the inode to finish. 392 * Wait for all pending writepages on the inode to finish.
393 * 393 *
394 * This is currently done by blocking further writes with FUSE_NOWRITE 394 * This is currently done by blocking further writes with FUSE_NOWRITE
395 * and waiting for all sent writes to complete. 395 * and waiting for all sent writes to complete.
396 * 396 *
397 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage 397 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
398 * could conflict with truncation. 398 * could conflict with truncation.
399 */ 399 */
400 static void fuse_sync_writes(struct inode *inode) 400 static void fuse_sync_writes(struct inode *inode)
401 { 401 {
402 fuse_set_nowrite(inode); 402 fuse_set_nowrite(inode);
403 fuse_release_nowrite(inode); 403 fuse_release_nowrite(inode);
404 } 404 }
405 405
406 int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 406 int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
407 int datasync, int isdir) 407 int datasync, int isdir)
408 { 408 {
409 struct inode *inode = file->f_mapping->host; 409 struct inode *inode = file->f_mapping->host;
410 struct fuse_conn *fc = get_fuse_conn(inode); 410 struct fuse_conn *fc = get_fuse_conn(inode);
411 struct fuse_file *ff = file->private_data; 411 struct fuse_file *ff = file->private_data;
412 struct fuse_req *req; 412 struct fuse_req *req;
413 struct fuse_fsync_in inarg; 413 struct fuse_fsync_in inarg;
414 int err; 414 int err;
415 415
416 if (is_bad_inode(inode)) 416 if (is_bad_inode(inode))
417 return -EIO; 417 return -EIO;
418 418
419 err = filemap_write_and_wait_range(inode->i_mapping, start, end); 419 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
420 if (err) 420 if (err)
421 return err; 421 return err;
422 422
423 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) 423 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
424 return 0; 424 return 0;
425 425
426 mutex_lock(&inode->i_mutex); 426 mutex_lock(&inode->i_mutex);
427 427
428 /* 428 /*
429 * Start writeback against all dirty pages of the inode, then 429 * Start writeback against all dirty pages of the inode, then
430 * wait for all outstanding writes, before sending the FSYNC 430 * wait for all outstanding writes, before sending the FSYNC
431 * request. 431 * request.
432 */ 432 */
433 err = write_inode_now(inode, 0); 433 err = write_inode_now(inode, 0);
434 if (err) 434 if (err)
435 goto out; 435 goto out;
436 436
437 fuse_sync_writes(inode); 437 fuse_sync_writes(inode);
438 438
439 req = fuse_get_req(fc); 439 req = fuse_get_req(fc);
440 if (IS_ERR(req)) { 440 if (IS_ERR(req)) {
441 err = PTR_ERR(req); 441 err = PTR_ERR(req);
442 goto out; 442 goto out;
443 } 443 }
444 444
445 memset(&inarg, 0, sizeof(inarg)); 445 memset(&inarg, 0, sizeof(inarg));
446 inarg.fh = ff->fh; 446 inarg.fh = ff->fh;
447 inarg.fsync_flags = datasync ? 1 : 0; 447 inarg.fsync_flags = datasync ? 1 : 0;
448 req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; 448 req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
449 req->in.h.nodeid = get_node_id(inode); 449 req->in.h.nodeid = get_node_id(inode);
450 req->in.numargs = 1; 450 req->in.numargs = 1;
451 req->in.args[0].size = sizeof(inarg); 451 req->in.args[0].size = sizeof(inarg);
452 req->in.args[0].value = &inarg; 452 req->in.args[0].value = &inarg;
453 fuse_request_send(fc, req); 453 fuse_request_send(fc, req);
454 err = req->out.h.error; 454 err = req->out.h.error;
455 fuse_put_request(fc, req); 455 fuse_put_request(fc, req);
456 if (err == -ENOSYS) { 456 if (err == -ENOSYS) {
457 if (isdir) 457 if (isdir)
458 fc->no_fsyncdir = 1; 458 fc->no_fsyncdir = 1;
459 else 459 else
460 fc->no_fsync = 1; 460 fc->no_fsync = 1;
461 err = 0; 461 err = 0;
462 } 462 }
463 out: 463 out:
464 mutex_unlock(&inode->i_mutex); 464 mutex_unlock(&inode->i_mutex);
465 return err; 465 return err;
466 } 466 }
467 467
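From userspace, the datasync argument above corresponds to the choice between fsync(2) and fdatasync(2). A minimal sketch (the "testfile" name is assumed):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("testfile", O_WRONLY | O_CREAT, 0644);

            if (fd < 0)
                    return 1;
            write(fd, "x", 1);
            fdatasync(fd);  /* datasync = 1: flush data (and size) only */
            fsync(fd);      /* datasync = 0: flush data plus all metadata */
            close(fd);
            return 0;
    }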
468 static int fuse_fsync(struct file *file, loff_t start, loff_t end, 468 static int fuse_fsync(struct file *file, loff_t start, loff_t end,
469 int datasync) 469 int datasync)
470 { 470 {
471 return fuse_fsync_common(file, start, end, datasync, 0); 471 return fuse_fsync_common(file, start, end, datasync, 0);
472 } 472 }
473 473
474 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 474 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
475 size_t count, int opcode) 475 size_t count, int opcode)
476 { 476 {
477 struct fuse_read_in *inarg = &req->misc.read.in; 477 struct fuse_read_in *inarg = &req->misc.read.in;
478 struct fuse_file *ff = file->private_data; 478 struct fuse_file *ff = file->private_data;
479 479
480 inarg->fh = ff->fh; 480 inarg->fh = ff->fh;
481 inarg->offset = pos; 481 inarg->offset = pos;
482 inarg->size = count; 482 inarg->size = count;
483 inarg->flags = file->f_flags; 483 inarg->flags = file->f_flags;
484 req->in.h.opcode = opcode; 484 req->in.h.opcode = opcode;
485 req->in.h.nodeid = ff->nodeid; 485 req->in.h.nodeid = ff->nodeid;
486 req->in.numargs = 1; 486 req->in.numargs = 1;
487 req->in.args[0].size = sizeof(struct fuse_read_in); 487 req->in.args[0].size = sizeof(struct fuse_read_in);
488 req->in.args[0].value = inarg; 488 req->in.args[0].value = inarg;
489 req->out.argvar = 1; 489 req->out.argvar = 1;
490 req->out.numargs = 1; 490 req->out.numargs = 1;
491 req->out.args[0].size = count; 491 req->out.args[0].size = count;
492 } 492 }
493 493
494 static size_t fuse_send_read(struct fuse_req *req, struct file *file, 494 static size_t fuse_send_read(struct fuse_req *req, struct file *file,
495 loff_t pos, size_t count, fl_owner_t owner) 495 loff_t pos, size_t count, fl_owner_t owner)
496 { 496 {
497 struct fuse_file *ff = file->private_data; 497 struct fuse_file *ff = file->private_data;
498 struct fuse_conn *fc = ff->fc; 498 struct fuse_conn *fc = ff->fc;
499 499
500 fuse_read_fill(req, file, pos, count, FUSE_READ); 500 fuse_read_fill(req, file, pos, count, FUSE_READ);
501 if (owner != NULL) { 501 if (owner != NULL) {
502 struct fuse_read_in *inarg = &req->misc.read.in; 502 struct fuse_read_in *inarg = &req->misc.read.in;
503 503
504 inarg->read_flags |= FUSE_READ_LOCKOWNER; 504 inarg->read_flags |= FUSE_READ_LOCKOWNER;
505 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 505 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
506 } 506 }
507 fuse_request_send(fc, req); 507 fuse_request_send(fc, req);
508 return req->out.args[0].size; 508 return req->out.args[0].size;
509 } 509 }
510 510
511 static void fuse_read_update_size(struct inode *inode, loff_t size, 511 static void fuse_read_update_size(struct inode *inode, loff_t size,
512 u64 attr_ver) 512 u64 attr_ver)
513 { 513 {
514 struct fuse_conn *fc = get_fuse_conn(inode); 514 struct fuse_conn *fc = get_fuse_conn(inode);
515 struct fuse_inode *fi = get_fuse_inode(inode); 515 struct fuse_inode *fi = get_fuse_inode(inode);
516 516
517 spin_lock(&fc->lock); 517 spin_lock(&fc->lock);
518 if (attr_ver == fi->attr_version && size < inode->i_size) { 518 if (attr_ver == fi->attr_version && size < inode->i_size) {
519 fi->attr_version = ++fc->attr_version; 519 fi->attr_version = ++fc->attr_version;
520 i_size_write(inode, size); 520 i_size_write(inode, size);
521 } 521 }
522 spin_unlock(&fc->lock); 522 spin_unlock(&fc->lock);
523 } 523 }
524 524
525 static int fuse_readpage(struct file *file, struct page *page) 525 static int fuse_readpage(struct file *file, struct page *page)
526 { 526 {
527 struct inode *inode = page->mapping->host; 527 struct inode *inode = page->mapping->host;
528 struct fuse_conn *fc = get_fuse_conn(inode); 528 struct fuse_conn *fc = get_fuse_conn(inode);
529 struct fuse_req *req; 529 struct fuse_req *req;
530 size_t num_read; 530 size_t num_read;
531 loff_t pos = page_offset(page); 531 loff_t pos = page_offset(page);
532 size_t count = PAGE_CACHE_SIZE; 532 size_t count = PAGE_CACHE_SIZE;
533 u64 attr_ver; 533 u64 attr_ver;
534 int err; 534 int err;
535 535
536 err = -EIO; 536 err = -EIO;
537 if (is_bad_inode(inode)) 537 if (is_bad_inode(inode))
538 goto out; 538 goto out;
539 539
540 /* 540 /*
541 * Page writeback can extend beyond the lifetime of the 541 * Page writeback can extend beyond the lifetime of the
542 * page-cache page, so make sure we read a properly synced 542 * page-cache page, so make sure we read a properly synced
543 * page. 543 * page.
544 */ 544 */
545 fuse_wait_on_page_writeback(inode, page->index); 545 fuse_wait_on_page_writeback(inode, page->index);
546 546
547 req = fuse_get_req(fc); 547 req = fuse_get_req(fc);
548 err = PTR_ERR(req); 548 err = PTR_ERR(req);
549 if (IS_ERR(req)) 549 if (IS_ERR(req))
550 goto out; 550 goto out;
551 551
552 attr_ver = fuse_get_attr_version(fc); 552 attr_ver = fuse_get_attr_version(fc);
553 553
554 req->out.page_zeroing = 1; 554 req->out.page_zeroing = 1;
555 req->out.argpages = 1; 555 req->out.argpages = 1;
556 req->num_pages = 1; 556 req->num_pages = 1;
557 req->pages[0] = page; 557 req->pages[0] = page;
558 num_read = fuse_send_read(req, file, pos, count, NULL); 558 num_read = fuse_send_read(req, file, pos, count, NULL);
559 err = req->out.h.error; 559 err = req->out.h.error;
560 fuse_put_request(fc, req); 560 fuse_put_request(fc, req);
561 561
562 if (!err) { 562 if (!err) {
563 /* 563 /*
564 * Short read means EOF. If file size is larger, truncate it 564 * Short read means EOF. If file size is larger, truncate it
565 */ 565 */
566 if (num_read < count) 566 if (num_read < count)
567 fuse_read_update_size(inode, pos + num_read, attr_ver); 567 fuse_read_update_size(inode, pos + num_read, attr_ver);
568 568
569 SetPageUptodate(page); 569 SetPageUptodate(page);
570 } 570 }
571 571
572 fuse_invalidate_attr(inode); /* atime changed */ 572 fuse_invalidate_attr(inode); /* atime changed */
573 out: 573 out:
574 unlock_page(page); 574 unlock_page(page);
575 return err; 575 return err;
576 } 576 }
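
The "short read means EOF" rule above is observable from userspace: when the FUSE server returns fewer bytes than requested, the kernel treats that as end of file and trims its cached size. A minimal sketch of the same inference against an ordinary file descriptor; the mount path is illustrative, not part of this commit:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        int fd = open("/mnt/fuse/file", O_RDONLY);      /* hypothetical path */
        if (fd < 0)
                return 1;
        ssize_t n = pread(fd, buf, sizeof(buf), 0);
        /* Fewer bytes than requested with no error: EOF, mirroring
         * fuse_read_update_size() trimming i_size above. */
        if (n >= 0 && (size_t)n < sizeof(buf))
                printf("EOF inferred at offset %zd\n", n);
        close(fd);
        return 0;
}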
577 577
578 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) 578 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
579 { 579 {
580 int i; 580 int i;
581 size_t count = req->misc.read.in.size; 581 size_t count = req->misc.read.in.size;
582 size_t num_read = req->out.args[0].size; 582 size_t num_read = req->out.args[0].size;
583 struct address_space *mapping = NULL; 583 struct address_space *mapping = NULL;
584 584
585 for (i = 0; mapping == NULL && i < req->num_pages; i++) 585 for (i = 0; mapping == NULL && i < req->num_pages; i++)
586 mapping = req->pages[i]->mapping; 586 mapping = req->pages[i]->mapping;
587 587
588 if (mapping) { 588 if (mapping) {
589 struct inode *inode = mapping->host; 589 struct inode *inode = mapping->host;
590 590
591 /* 591 /*
592 * Short read means EOF. If file size is larger, truncate it 592 * Short read means EOF. If file size is larger, truncate it
593 */ 593 */
594 if (!req->out.h.error && num_read < count) { 594 if (!req->out.h.error && num_read < count) {
595 loff_t pos; 595 loff_t pos;
596 596
597 pos = page_offset(req->pages[0]) + num_read; 597 pos = page_offset(req->pages[0]) + num_read;
598 fuse_read_update_size(inode, pos, 598 fuse_read_update_size(inode, pos,
599 req->misc.read.attr_ver); 599 req->misc.read.attr_ver);
600 } 600 }
601 fuse_invalidate_attr(inode); /* atime changed */ 601 fuse_invalidate_attr(inode); /* atime changed */
602 } 602 }
603 603
604 for (i = 0; i < req->num_pages; i++) { 604 for (i = 0; i < req->num_pages; i++) {
605 struct page *page = req->pages[i]; 605 struct page *page = req->pages[i];
606 if (!req->out.h.error) 606 if (!req->out.h.error)
607 SetPageUptodate(page); 607 SetPageUptodate(page);
608 else 608 else
609 SetPageError(page); 609 SetPageError(page);
610 unlock_page(page); 610 unlock_page(page);
611 page_cache_release(page); 611 page_cache_release(page);
612 } 612 }
613 if (req->ff) 613 if (req->ff)
614 fuse_file_put(req->ff, false); 614 fuse_file_put(req->ff, false);
615 } 615 }
616 616
617 static void fuse_send_readpages(struct fuse_req *req, struct file *file) 617 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
618 { 618 {
619 struct fuse_file *ff = file->private_data; 619 struct fuse_file *ff = file->private_data;
620 struct fuse_conn *fc = ff->fc; 620 struct fuse_conn *fc = ff->fc;
621 loff_t pos = page_offset(req->pages[0]); 621 loff_t pos = page_offset(req->pages[0]);
622 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 622 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
623 623
624 req->out.argpages = 1; 624 req->out.argpages = 1;
625 req->out.page_zeroing = 1; 625 req->out.page_zeroing = 1;
626 req->out.page_replace = 1; 626 req->out.page_replace = 1;
627 fuse_read_fill(req, file, pos, count, FUSE_READ); 627 fuse_read_fill(req, file, pos, count, FUSE_READ);
628 req->misc.read.attr_ver = fuse_get_attr_version(fc); 628 req->misc.read.attr_ver = fuse_get_attr_version(fc);
629 if (fc->async_read) { 629 if (fc->async_read) {
630 req->ff = fuse_file_get(ff); 630 req->ff = fuse_file_get(ff);
631 req->end = fuse_readpages_end; 631 req->end = fuse_readpages_end;
632 fuse_request_send_background(fc, req); 632 fuse_request_send_background(fc, req);
633 } else { 633 } else {
634 fuse_request_send(fc, req); 634 fuse_request_send(fc, req);
635 fuse_readpages_end(fc, req); 635 fuse_readpages_end(fc, req);
636 fuse_put_request(fc, req); 636 fuse_put_request(fc, req);
637 } 637 }
638 } 638 }
639 639
640 struct fuse_fill_data { 640 struct fuse_fill_data {
641 struct fuse_req *req; 641 struct fuse_req *req;
642 struct file *file; 642 struct file *file;
643 struct inode *inode; 643 struct inode *inode;
644 }; 644 };
645 645
646 static int fuse_readpages_fill(void *_data, struct page *page) 646 static int fuse_readpages_fill(void *_data, struct page *page)
647 { 647 {
648 struct fuse_fill_data *data = _data; 648 struct fuse_fill_data *data = _data;
649 struct fuse_req *req = data->req; 649 struct fuse_req *req = data->req;
650 struct inode *inode = data->inode; 650 struct inode *inode = data->inode;
651 struct fuse_conn *fc = get_fuse_conn(inode); 651 struct fuse_conn *fc = get_fuse_conn(inode);
652 652
653 fuse_wait_on_page_writeback(inode, page->index); 653 fuse_wait_on_page_writeback(inode, page->index);
654 654
655 if (req->num_pages && 655 if (req->num_pages &&
656 (req->num_pages == FUSE_MAX_PAGES_PER_REQ || 656 (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
657 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || 657 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
658 req->pages[req->num_pages - 1]->index + 1 != page->index)) { 658 req->pages[req->num_pages - 1]->index + 1 != page->index)) {
659 fuse_send_readpages(req, data->file); 659 fuse_send_readpages(req, data->file);
660 data->req = req = fuse_get_req(fc); 660 data->req = req = fuse_get_req(fc);
661 if (IS_ERR(req)) { 661 if (IS_ERR(req)) {
662 unlock_page(page); 662 unlock_page(page);
663 return PTR_ERR(req); 663 return PTR_ERR(req);
664 } 664 }
665 } 665 }
666 page_cache_get(page); 666 page_cache_get(page);
667 req->pages[req->num_pages] = page; 667 req->pages[req->num_pages] = page;
668 req->num_pages++; 668 req->num_pages++;
669 return 0; 669 return 0;
670 } 670 }
671 671
672 static int fuse_readpages(struct file *file, struct address_space *mapping, 672 static int fuse_readpages(struct file *file, struct address_space *mapping,
673 struct list_head *pages, unsigned nr_pages) 673 struct list_head *pages, unsigned nr_pages)
674 { 674 {
675 struct inode *inode = mapping->host; 675 struct inode *inode = mapping->host;
676 struct fuse_conn *fc = get_fuse_conn(inode); 676 struct fuse_conn *fc = get_fuse_conn(inode);
677 struct fuse_fill_data data; 677 struct fuse_fill_data data;
678 int err; 678 int err;
679 679
680 err = -EIO; 680 err = -EIO;
681 if (is_bad_inode(inode)) 681 if (is_bad_inode(inode))
682 goto out; 682 goto out;
683 683
684 data.file = file; 684 data.file = file;
685 data.inode = inode; 685 data.inode = inode;
686 data.req = fuse_get_req(fc); 686 data.req = fuse_get_req(fc);
687 err = PTR_ERR(data.req); 687 err = PTR_ERR(data.req);
688 if (IS_ERR(data.req)) 688 if (IS_ERR(data.req))
689 goto out; 689 goto out;
690 690
691 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); 691 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
692 if (!err) { 692 if (!err) {
693 if (data.req->num_pages) 693 if (data.req->num_pages)
694 fuse_send_readpages(data.req, file); 694 fuse_send_readpages(data.req, file);
695 else 695 else
696 fuse_put_request(fc, data.req); 696 fuse_put_request(fc, data.req);
697 } 697 }
698 out: 698 out:
699 return err; 699 return err;
700 } 700 }
701 701
702 static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, 702 static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
703 unsigned long nr_segs, loff_t pos) 703 unsigned long nr_segs, loff_t pos)
704 { 704 {
705 struct inode *inode = iocb->ki_filp->f_mapping->host; 705 struct inode *inode = iocb->ki_filp->f_mapping->host;
706 struct fuse_conn *fc = get_fuse_conn(inode); 706 struct fuse_conn *fc = get_fuse_conn(inode);
707 707
708 /* 708 /*
709 * In auto invalidate mode, always update attributes on read. 709 * In auto invalidate mode, always update attributes on read.
710 * Otherwise, only update if we attempt to read past EOF (to ensure 710 * Otherwise, only update if we attempt to read past EOF (to ensure
711 * i_size is up to date). 711 * i_size is up to date).
712 */ 712 */
713 if (fc->auto_inval_data || 713 if (fc->auto_inval_data ||
714 (pos + iov_length(iov, nr_segs) > i_size_read(inode))) { 714 (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
715 int err; 715 int err;
716 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); 716 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
717 if (err) 717 if (err)
718 return err; 718 return err;
719 } 719 }
720 720
721 return generic_file_aio_read(iocb, iov, nr_segs, pos); 721 return generic_file_aio_read(iocb, iov, nr_segs, pos);
722 } 722 }
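
The refresh condition above only matters when auto_inval_data is off: a read that stays below the cached i_size can trust the page cache, while one that could cross it must revalidate first. A worked sketch of that predicate, with illustrative values:

#include <stdio.h>

int main(void)
{
        long long i_size = 8192;                /* cached inode size */
        long long pos = 6144, len = 4096;       /* read request */

        /* Same test as above: refresh only if the read may pass EOF. */
        int refresh = (pos + len > i_size);
        printf("read [%lld, %lld) vs i_size %lld -> refresh=%d\n",
               pos, pos + len, i_size, refresh);
        return 0;
}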
723 723
724 static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, 724 static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
725 loff_t pos, size_t count) 725 loff_t pos, size_t count)
726 { 726 {
727 struct fuse_write_in *inarg = &req->misc.write.in; 727 struct fuse_write_in *inarg = &req->misc.write.in;
728 struct fuse_write_out *outarg = &req->misc.write.out; 728 struct fuse_write_out *outarg = &req->misc.write.out;
729 729
730 inarg->fh = ff->fh; 730 inarg->fh = ff->fh;
731 inarg->offset = pos; 731 inarg->offset = pos;
732 inarg->size = count; 732 inarg->size = count;
733 req->in.h.opcode = FUSE_WRITE; 733 req->in.h.opcode = FUSE_WRITE;
734 req->in.h.nodeid = ff->nodeid; 734 req->in.h.nodeid = ff->nodeid;
735 req->in.numargs = 2; 735 req->in.numargs = 2;
736 if (ff->fc->minor < 9) 736 if (ff->fc->minor < 9)
737 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 737 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
738 else 738 else
739 req->in.args[0].size = sizeof(struct fuse_write_in); 739 req->in.args[0].size = sizeof(struct fuse_write_in);
740 req->in.args[0].value = inarg; 740 req->in.args[0].value = inarg;
741 req->in.args[1].size = count; 741 req->in.args[1].size = count;
742 req->out.numargs = 1; 742 req->out.numargs = 1;
743 req->out.args[0].size = sizeof(struct fuse_write_out); 743 req->out.args[0].size = sizeof(struct fuse_write_out);
744 req->out.args[0].value = outarg; 744 req->out.args[0].value = outarg;
745 } 745 }
746 746
747 static size_t fuse_send_write(struct fuse_req *req, struct file *file, 747 static size_t fuse_send_write(struct fuse_req *req, struct file *file,
748 loff_t pos, size_t count, fl_owner_t owner) 748 loff_t pos, size_t count, fl_owner_t owner)
749 { 749 {
750 struct fuse_file *ff = file->private_data; 750 struct fuse_file *ff = file->private_data;
751 struct fuse_conn *fc = ff->fc; 751 struct fuse_conn *fc = ff->fc;
752 struct fuse_write_in *inarg = &req->misc.write.in; 752 struct fuse_write_in *inarg = &req->misc.write.in;
753 753
754 fuse_write_fill(req, ff, pos, count); 754 fuse_write_fill(req, ff, pos, count);
755 inarg->flags = file->f_flags; 755 inarg->flags = file->f_flags;
756 if (owner != NULL) { 756 if (owner != NULL) {
757 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 757 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
758 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 758 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
759 } 759 }
760 fuse_request_send(fc, req); 760 fuse_request_send(fc, req);
761 return req->misc.write.out.size; 761 return req->misc.write.out.size;
762 } 762 }
763 763
764 void fuse_write_update_size(struct inode *inode, loff_t pos) 764 void fuse_write_update_size(struct inode *inode, loff_t pos)
765 { 765 {
766 struct fuse_conn *fc = get_fuse_conn(inode); 766 struct fuse_conn *fc = get_fuse_conn(inode);
767 struct fuse_inode *fi = get_fuse_inode(inode); 767 struct fuse_inode *fi = get_fuse_inode(inode);
768 768
769 spin_lock(&fc->lock); 769 spin_lock(&fc->lock);
770 fi->attr_version = ++fc->attr_version; 770 fi->attr_version = ++fc->attr_version;
771 if (pos > inode->i_size) 771 if (pos > inode->i_size)
772 i_size_write(inode, pos); 772 i_size_write(inode, pos);
773 spin_unlock(&fc->lock); 773 spin_unlock(&fc->lock);
774 } 774 }
775 775
776 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 776 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
777 struct inode *inode, loff_t pos, 777 struct inode *inode, loff_t pos,
778 size_t count) 778 size_t count)
779 { 779 {
780 size_t res; 780 size_t res;
781 unsigned offset; 781 unsigned offset;
782 unsigned i; 782 unsigned i;
783 783
784 for (i = 0; i < req->num_pages; i++) 784 for (i = 0; i < req->num_pages; i++)
785 fuse_wait_on_page_writeback(inode, req->pages[i]->index); 785 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
786 786
787 res = fuse_send_write(req, file, pos, count, NULL); 787 res = fuse_send_write(req, file, pos, count, NULL);
788 788
789 offset = req->page_offset; 789 offset = req->page_offset;
790 count = res; 790 count = res;
791 for (i = 0; i < req->num_pages; i++) { 791 for (i = 0; i < req->num_pages; i++) {
792 struct page *page = req->pages[i]; 792 struct page *page = req->pages[i];
793 793
794 if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) 794 if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
795 SetPageUptodate(page); 795 SetPageUptodate(page);
796 796
797 if (count > PAGE_CACHE_SIZE - offset) 797 if (count > PAGE_CACHE_SIZE - offset)
798 count -= PAGE_CACHE_SIZE - offset; 798 count -= PAGE_CACHE_SIZE - offset;
799 else 799 else
800 count = 0; 800 count = 0;
801 offset = 0; 801 offset = 0;
802 802
803 unlock_page(page); 803 unlock_page(page);
804 page_cache_release(page); 804 page_cache_release(page);
805 } 805 }
806 806
807 return res; 807 return res;
808 } 808 }
809 809
810 static ssize_t fuse_fill_write_pages(struct fuse_req *req, 810 static ssize_t fuse_fill_write_pages(struct fuse_req *req,
811 struct address_space *mapping, 811 struct address_space *mapping,
812 struct iov_iter *ii, loff_t pos) 812 struct iov_iter *ii, loff_t pos)
813 { 813 {
814 struct fuse_conn *fc = get_fuse_conn(mapping->host); 814 struct fuse_conn *fc = get_fuse_conn(mapping->host);
815 unsigned offset = pos & (PAGE_CACHE_SIZE - 1); 815 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
816 size_t count = 0; 816 size_t count = 0;
817 int err; 817 int err;
818 818
819 req->in.argpages = 1; 819 req->in.argpages = 1;
820 req->page_offset = offset; 820 req->page_offset = offset;
821 821
822 do { 822 do {
823 size_t tmp; 823 size_t tmp;
824 struct page *page; 824 struct page *page;
825 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 825 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
826 size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, 826 size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
827 iov_iter_count(ii)); 827 iov_iter_count(ii));
828 828
829 bytes = min_t(size_t, bytes, fc->max_write - count); 829 bytes = min_t(size_t, bytes, fc->max_write - count);
830 830
831 again: 831 again:
832 err = -EFAULT; 832 err = -EFAULT;
833 if (iov_iter_fault_in_readable(ii, bytes)) 833 if (iov_iter_fault_in_readable(ii, bytes))
834 break; 834 break;
835 835
836 err = -ENOMEM; 836 err = -ENOMEM;
837 page = grab_cache_page_write_begin(mapping, index, 0); 837 page = grab_cache_page_write_begin(mapping, index, 0);
838 if (!page) 838 if (!page)
839 break; 839 break;
840 840
841 if (mapping_writably_mapped(mapping)) 841 if (mapping_writably_mapped(mapping))
842 flush_dcache_page(page); 842 flush_dcache_page(page);
843 843
844 pagefault_disable(); 844 pagefault_disable();
845 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 845 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
846 pagefault_enable(); 846 pagefault_enable();
847 flush_dcache_page(page); 847 flush_dcache_page(page);
848 848
849 mark_page_accessed(page); 849 mark_page_accessed(page);
850 850
851 if (!tmp) { 851 if (!tmp) {
852 unlock_page(page); 852 unlock_page(page);
853 page_cache_release(page); 853 page_cache_release(page);
854 bytes = min(bytes, iov_iter_single_seg_count(ii)); 854 bytes = min(bytes, iov_iter_single_seg_count(ii));
855 goto again; 855 goto again;
856 } 856 }
857 857
858 err = 0; 858 err = 0;
859 req->pages[req->num_pages] = page; 859 req->pages[req->num_pages] = page;
860 req->num_pages++; 860 req->num_pages++;
861 861
862 iov_iter_advance(ii, tmp); 862 iov_iter_advance(ii, tmp);
863 count += tmp; 863 count += tmp;
864 pos += tmp; 864 pos += tmp;
865 offset += tmp; 865 offset += tmp;
866 if (offset == PAGE_CACHE_SIZE) 866 if (offset == PAGE_CACHE_SIZE)
867 offset = 0; 867 offset = 0;
868 868
869 if (!fc->big_writes) 869 if (!fc->big_writes)
870 break; 870 break;
871 } while (iov_iter_count(ii) && count < fc->max_write && 871 } while (iov_iter_count(ii) && count < fc->max_write &&
872 req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); 872 req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
873 873
874 return count > 0 ? count : err; 874 return count > 0 ? count : err;
875 } 875 }
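
The copy loop above never lets a chunk cross a page boundary: the first chunk is trimmed to the end of its page, and only a chunk ending exactly on a boundary lets the loop continue. The same chunking arithmetic, reduced to a standalone sketch with 4096 standing in for PAGE_CACHE_SIZE:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long pos = 5000, count = 10000;        /* illustrative */

        while (count) {
                unsigned long offset = pos & (PAGE_SIZE - 1);
                unsigned long bytes = PAGE_SIZE - offset;

                if (bytes > count)
                        bytes = count;
                printf("page %lu: copy %lu bytes at offset %lu\n",
                       pos / PAGE_SIZE, bytes, offset);
                pos += bytes;
                count -= bytes;
        }
        return 0;
}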
876 876
877 static ssize_t fuse_perform_write(struct file *file, 877 static ssize_t fuse_perform_write(struct file *file,
878 struct address_space *mapping, 878 struct address_space *mapping,
879 struct iov_iter *ii, loff_t pos) 879 struct iov_iter *ii, loff_t pos)
880 { 880 {
881 struct inode *inode = mapping->host; 881 struct inode *inode = mapping->host;
882 struct fuse_conn *fc = get_fuse_conn(inode); 882 struct fuse_conn *fc = get_fuse_conn(inode);
883 int err = 0; 883 int err = 0;
884 ssize_t res = 0; 884 ssize_t res = 0;
885 885
886 if (is_bad_inode(inode)) 886 if (is_bad_inode(inode))
887 return -EIO; 887 return -EIO;
888 888
889 do { 889 do {
890 struct fuse_req *req; 890 struct fuse_req *req;
891 ssize_t count; 891 ssize_t count;
892 892
893 req = fuse_get_req(fc); 893 req = fuse_get_req(fc);
894 if (IS_ERR(req)) { 894 if (IS_ERR(req)) {
895 err = PTR_ERR(req); 895 err = PTR_ERR(req);
896 break; 896 break;
897 } 897 }
898 898
899 count = fuse_fill_write_pages(req, mapping, ii, pos); 899 count = fuse_fill_write_pages(req, mapping, ii, pos);
900 if (count <= 0) { 900 if (count <= 0) {
901 err = count; 901 err = count;
902 } else { 902 } else {
903 size_t num_written; 903 size_t num_written;
904 904
905 num_written = fuse_send_write_pages(req, file, inode, 905 num_written = fuse_send_write_pages(req, file, inode,
906 pos, count); 906 pos, count);
907 err = req->out.h.error; 907 err = req->out.h.error;
908 if (!err) { 908 if (!err) {
909 res += num_written; 909 res += num_written;
910 pos += num_written; 910 pos += num_written;
911 911
912 /* break out of the loop on short write */ 912 /* break out of the loop on short write */
913 if (num_written != count) 913 if (num_written != count)
914 err = -EIO; 914 err = -EIO;
915 } 915 }
916 } 916 }
917 fuse_put_request(fc, req); 917 fuse_put_request(fc, req);
918 } while (!err && iov_iter_count(ii)); 918 } while (!err && iov_iter_count(ii));
919 919
920 if (res > 0) 920 if (res > 0)
921 fuse_write_update_size(inode, pos); 921 fuse_write_update_size(inode, pos);
922 922
923 fuse_invalidate_attr(inode); 923 fuse_invalidate_attr(inode);
924 924
925 return res > 0 ? res : err; 925 return res > 0 ? res : err;
926 } 926 }
927 927
928 static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 928 static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
929 unsigned long nr_segs, loff_t pos) 929 unsigned long nr_segs, loff_t pos)
930 { 930 {
931 struct file *file = iocb->ki_filp; 931 struct file *file = iocb->ki_filp;
932 struct address_space *mapping = file->f_mapping; 932 struct address_space *mapping = file->f_mapping;
933 size_t count = 0; 933 size_t count = 0;
934 size_t ocount = 0; 934 size_t ocount = 0;
935 ssize_t written = 0; 935 ssize_t written = 0;
936 ssize_t written_buffered = 0; 936 ssize_t written_buffered = 0;
937 struct inode *inode = mapping->host; 937 struct inode *inode = mapping->host;
938 ssize_t err; 938 ssize_t err;
939 struct iov_iter i; 939 struct iov_iter i;
940 loff_t endbyte = 0; 940 loff_t endbyte = 0;
941 941
942 WARN_ON(iocb->ki_pos != pos); 942 WARN_ON(iocb->ki_pos != pos);
943 943
944 ocount = 0; 944 ocount = 0;
945 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 945 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
946 if (err) 946 if (err)
947 return err; 947 return err;
948 948
949 count = ocount; 949 count = ocount;
950 sb_start_write(inode->i_sb); 950 sb_start_write(inode->i_sb);
951 mutex_lock(&inode->i_mutex); 951 mutex_lock(&inode->i_mutex);
952 952
953 /* We can write back this queue in page reclaim */ 953 /* We can write back this queue in page reclaim */
954 current->backing_dev_info = mapping->backing_dev_info; 954 current->backing_dev_info = mapping->backing_dev_info;
955 955
956 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 956 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
957 if (err) 957 if (err)
958 goto out; 958 goto out;
959 959
960 if (count == 0) 960 if (count == 0)
961 goto out; 961 goto out;
962 962
963 err = file_remove_suid(file); 963 err = file_remove_suid(file);
964 if (err) 964 if (err)
965 goto out; 965 goto out;
966 966
967 err = file_update_time(file); 967 err = file_update_time(file);
968 if (err) 968 if (err)
969 goto out; 969 goto out;
970 970
971 if (file->f_flags & O_DIRECT) { 971 if (file->f_flags & O_DIRECT) {
972 written = generic_file_direct_write(iocb, iov, &nr_segs, 972 written = generic_file_direct_write(iocb, iov, &nr_segs,
973 pos, &iocb->ki_pos, 973 pos, &iocb->ki_pos,
974 count, ocount); 974 count, ocount);
975 if (written < 0 || written == count) 975 if (written < 0 || written == count)
976 goto out; 976 goto out;
977 977
978 pos += written; 978 pos += written;
979 count -= written; 979 count -= written;
980 980
981 iov_iter_init(&i, iov, nr_segs, count, written); 981 iov_iter_init(&i, iov, nr_segs, count, written);
982 written_buffered = fuse_perform_write(file, mapping, &i, pos); 982 written_buffered = fuse_perform_write(file, mapping, &i, pos);
983 if (written_buffered < 0) { 983 if (written_buffered < 0) {
984 err = written_buffered; 984 err = written_buffered;
985 goto out; 985 goto out;
986 } 986 }
987 endbyte = pos + written_buffered - 1; 987 endbyte = pos + written_buffered - 1;
988 988
989 err = filemap_write_and_wait_range(file->f_mapping, pos, 989 err = filemap_write_and_wait_range(file->f_mapping, pos,
990 endbyte); 990 endbyte);
991 if (err) 991 if (err)
992 goto out; 992 goto out;
993 993
994 invalidate_mapping_pages(file->f_mapping, 994 invalidate_mapping_pages(file->f_mapping,
995 pos >> PAGE_CACHE_SHIFT, 995 pos >> PAGE_CACHE_SHIFT,
996 endbyte >> PAGE_CACHE_SHIFT); 996 endbyte >> PAGE_CACHE_SHIFT);
997 997
998 written += written_buffered; 998 written += written_buffered;
999 iocb->ki_pos = pos + written_buffered; 999 iocb->ki_pos = pos + written_buffered;
1000 } else { 1000 } else {
1001 iov_iter_init(&i, iov, nr_segs, count, 0); 1001 iov_iter_init(&i, iov, nr_segs, count, 0);
1002 written = fuse_perform_write(file, mapping, &i, pos); 1002 written = fuse_perform_write(file, mapping, &i, pos);
1003 if (written >= 0) 1003 if (written >= 0)
1004 iocb->ki_pos = pos + written; 1004 iocb->ki_pos = pos + written;
1005 } 1005 }
1006 out: 1006 out:
1007 current->backing_dev_info = NULL; 1007 current->backing_dev_info = NULL;
1008 mutex_unlock(&inode->i_mutex); 1008 mutex_unlock(&inode->i_mutex);
1009 sb_end_write(inode->i_sb); 1009 sb_end_write(inode->i_sb);
1010 1010
1011 return written ? written : err; 1011 return written ? written : err;
1012 } 1012 }
1013 1013
1014 static void fuse_release_user_pages(struct fuse_req *req, int write) 1014 static void fuse_release_user_pages(struct fuse_req *req, int write)
1015 { 1015 {
1016 unsigned i; 1016 unsigned i;
1017 1017
1018 for (i = 0; i < req->num_pages; i++) { 1018 for (i = 0; i < req->num_pages; i++) {
1019 struct page *page = req->pages[i]; 1019 struct page *page = req->pages[i];
1020 if (write) 1020 if (write)
1021 set_page_dirty_lock(page); 1021 set_page_dirty_lock(page);
1022 put_page(page); 1022 put_page(page);
1023 } 1023 }
1024 } 1024 }
1025 1025
1026 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, 1026 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
1027 size_t *nbytesp, int write) 1027 size_t *nbytesp, int write)
1028 { 1028 {
1029 size_t nbytes = *nbytesp; 1029 size_t nbytes = *nbytesp;
1030 unsigned long user_addr = (unsigned long) buf; 1030 unsigned long user_addr = (unsigned long) buf;
1031 unsigned offset = user_addr & ~PAGE_MASK; 1031 unsigned offset = user_addr & ~PAGE_MASK;
1032 int npages; 1032 int npages;
1033 1033
1034 /* Special case for kernel I/O: can copy directly into the buffer */ 1034 /* Special case for kernel I/O: can copy directly into the buffer */
1035 if (segment_eq(get_fs(), KERNEL_DS)) { 1035 if (segment_eq(get_fs(), KERNEL_DS)) {
1036 if (write) 1036 if (write)
1037 req->in.args[1].value = (void *) user_addr; 1037 req->in.args[1].value = (void *) user_addr;
1038 else 1038 else
1039 req->out.args[0].value = (void *) user_addr; 1039 req->out.args[0].value = (void *) user_addr;
1040 1040
1041 return 0; 1041 return 0;
1042 } 1042 }
1043 1043
1044 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 1044 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
1045 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1045 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
1046 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 1046 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
1047 npages = get_user_pages_fast(user_addr, npages, !write, req->pages); 1047 npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
1048 if (npages < 0) 1048 if (npages < 0)
1049 return npages; 1049 return npages;
1050 1050
1051 req->num_pages = npages; 1051 req->num_pages = npages;
1052 req->page_offset = offset; 1052 req->page_offset = offset;
1053 1053
1054 if (write) 1054 if (write)
1055 req->in.argpages = 1; 1055 req->in.argpages = 1;
1056 else 1056 else
1057 req->out.argpages = 1; 1057 req->out.argpages = 1;
1058 1058
1059 nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; 1059 nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
1060 *nbytesp = min(*nbytesp, nbytes); 1060 *nbytesp = min(*nbytesp, nbytes);
1061 1061
1062 return 0; 1062 return 0;
1063 } 1063 }
1064 1064
1065 ssize_t fuse_direct_io(struct file *file, const char __user *buf, 1065 ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1066 size_t count, loff_t *ppos, int write) 1066 size_t count, loff_t *ppos, int write)
1067 { 1067 {
1068 struct fuse_file *ff = file->private_data; 1068 struct fuse_file *ff = file->private_data;
1069 struct fuse_conn *fc = ff->fc; 1069 struct fuse_conn *fc = ff->fc;
1070 size_t nmax = write ? fc->max_write : fc->max_read; 1070 size_t nmax = write ? fc->max_write : fc->max_read;
1071 loff_t pos = *ppos; 1071 loff_t pos = *ppos;
1072 ssize_t res = 0; 1072 ssize_t res = 0;
1073 struct fuse_req *req; 1073 struct fuse_req *req;
1074 1074
1075 req = fuse_get_req(fc); 1075 req = fuse_get_req(fc);
1076 if (IS_ERR(req)) 1076 if (IS_ERR(req))
1077 return PTR_ERR(req); 1077 return PTR_ERR(req);
1078 1078
1079 while (count) { 1079 while (count) {
1080 size_t nres; 1080 size_t nres;
1081 fl_owner_t owner = current->files; 1081 fl_owner_t owner = current->files;
1082 size_t nbytes = min(count, nmax); 1082 size_t nbytes = min(count, nmax);
1083 int err = fuse_get_user_pages(req, buf, &nbytes, write); 1083 int err = fuse_get_user_pages(req, buf, &nbytes, write);
1084 if (err) { 1084 if (err) {
1085 res = err; 1085 res = err;
1086 break; 1086 break;
1087 } 1087 }
1088 1088
1089 if (write) 1089 if (write)
1090 nres = fuse_send_write(req, file, pos, nbytes, owner); 1090 nres = fuse_send_write(req, file, pos, nbytes, owner);
1091 else 1091 else
1092 nres = fuse_send_read(req, file, pos, nbytes, owner); 1092 nres = fuse_send_read(req, file, pos, nbytes, owner);
1093 1093
1094 fuse_release_user_pages(req, !write); 1094 fuse_release_user_pages(req, !write);
1095 if (req->out.h.error) { 1095 if (req->out.h.error) {
1096 if (!res) 1096 if (!res)
1097 res = req->out.h.error; 1097 res = req->out.h.error;
1098 break; 1098 break;
1099 } else if (nres > nbytes) { 1099 } else if (nres > nbytes) {
1100 res = -EIO; 1100 res = -EIO;
1101 break; 1101 break;
1102 } 1102 }
1103 count -= nres; 1103 count -= nres;
1104 res += nres; 1104 res += nres;
1105 pos += nres; 1105 pos += nres;
1106 buf += nres; 1106 buf += nres;
1107 if (nres != nbytes) 1107 if (nres != nbytes)
1108 break; 1108 break;
1109 if (count) { 1109 if (count) {
1110 fuse_put_request(fc, req); 1110 fuse_put_request(fc, req);
1111 req = fuse_get_req(fc); 1111 req = fuse_get_req(fc);
1112 if (IS_ERR(req)) 1112 if (IS_ERR(req))
1113 break; 1113 break;
1114 } 1114 }
1115 } 1115 }
1116 if (!IS_ERR(req)) 1116 if (!IS_ERR(req))
1117 fuse_put_request(fc, req); 1117 fuse_put_request(fc, req);
1118 if (res > 0) 1118 if (res > 0)
1119 *ppos = pos; 1119 *ppos = pos;
1120 1120
1121 return res; 1121 return res;
1122 } 1122 }
1123 EXPORT_SYMBOL_GPL(fuse_direct_io); 1123 EXPORT_SYMBOL_GPL(fuse_direct_io);
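
fuse_direct_io() issues bounded requests and gives up as soon as one comes back short, returning whatever has already transferred. The same pattern as a userspace helper; CHUNK and the helper name are illustrative, not taken from the kernel:

#include <errno.h>
#include <unistd.h>

#define CHUNK 65536UL

/* Read up to count bytes at pos, chunk by chunk, stopping on a
 * short transfer just as the loop above does for nres != nbytes. */
ssize_t read_in_chunks(int fd, char *buf, size_t count, off_t pos)
{
        ssize_t done = 0;

        while (count) {
                size_t nbytes = count < CHUNK ? count : CHUNK;
                ssize_t nres = pread(fd, buf + done, nbytes, pos);

                if (nres < 0)
                        return done ? done : -errno;
                done += nres;
                pos += nres;
                count -= nres;
                if ((size_t)nres != nbytes)     /* short transfer: stop */
                        break;
        }
        return done;
}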
1124 1124
1125 static ssize_t fuse_direct_read(struct file *file, char __user *buf, 1125 static ssize_t fuse_direct_read(struct file *file, char __user *buf,
1126 size_t count, loff_t *ppos) 1126 size_t count, loff_t *ppos)
1127 { 1127 {
1128 ssize_t res; 1128 ssize_t res;
1129 struct inode *inode = file->f_path.dentry->d_inode; 1129 struct inode *inode = file->f_path.dentry->d_inode;
1130 1130
1131 if (is_bad_inode(inode)) 1131 if (is_bad_inode(inode))
1132 return -EIO; 1132 return -EIO;
1133 1133
1134 res = fuse_direct_io(file, buf, count, ppos, 0); 1134 res = fuse_direct_io(file, buf, count, ppos, 0);
1135 1135
1136 fuse_invalidate_attr(inode); 1136 fuse_invalidate_attr(inode);
1137 1137
1138 return res; 1138 return res;
1139 } 1139 }
1140 1140
1141 static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, 1141 static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
1142 size_t count, loff_t *ppos) 1142 size_t count, loff_t *ppos)
1143 { 1143 {
1144 struct inode *inode = file->f_path.dentry->d_inode; 1144 struct inode *inode = file->f_path.dentry->d_inode;
1145 ssize_t res; 1145 ssize_t res;
1146 1146
1147 res = generic_write_checks(file, ppos, &count, 0); 1147 res = generic_write_checks(file, ppos, &count, 0);
1148 if (!res) { 1148 if (!res) {
1149 res = fuse_direct_io(file, buf, count, ppos, 1); 1149 res = fuse_direct_io(file, buf, count, ppos, 1);
1150 if (res > 0) 1150 if (res > 0)
1151 fuse_write_update_size(inode, *ppos); 1151 fuse_write_update_size(inode, *ppos);
1152 } 1152 }
1153 1153
1154 fuse_invalidate_attr(inode); 1154 fuse_invalidate_attr(inode);
1155 1155
1156 return res; 1156 return res;
1157 } 1157 }
1158 1158
1159 static ssize_t fuse_direct_write(struct file *file, const char __user *buf, 1159 static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1160 size_t count, loff_t *ppos) 1160 size_t count, loff_t *ppos)
1161 { 1161 {
1162 struct inode *inode = file->f_path.dentry->d_inode; 1162 struct inode *inode = file->f_path.dentry->d_inode;
1163 ssize_t res; 1163 ssize_t res;
1164 1164
1165 if (is_bad_inode(inode)) 1165 if (is_bad_inode(inode))
1166 return -EIO; 1166 return -EIO;
1167 1167
1168 /* Don't allow parallel writes to the same file */ 1168 /* Don't allow parallel writes to the same file */
1169 mutex_lock(&inode->i_mutex); 1169 mutex_lock(&inode->i_mutex);
1170 res = __fuse_direct_write(file, buf, count, ppos); 1170 res = __fuse_direct_write(file, buf, count, ppos);
1171 mutex_unlock(&inode->i_mutex); 1171 mutex_unlock(&inode->i_mutex);
1172 1172
1173 return res; 1173 return res;
1174 } 1174 }
1175 1175
1176 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) 1176 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1177 { 1177 {
1178 __free_page(req->pages[0]); 1178 __free_page(req->pages[0]);
1179 fuse_file_put(req->ff, false); 1179 fuse_file_put(req->ff, false);
1180 } 1180 }
1181 1181
1182 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1182 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1183 { 1183 {
1184 struct inode *inode = req->inode; 1184 struct inode *inode = req->inode;
1185 struct fuse_inode *fi = get_fuse_inode(inode); 1185 struct fuse_inode *fi = get_fuse_inode(inode);
1186 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; 1186 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
1187 1187
1188 list_del(&req->writepages_entry); 1188 list_del(&req->writepages_entry);
1189 dec_bdi_stat(bdi, BDI_WRITEBACK); 1189 dec_bdi_stat(bdi, BDI_WRITEBACK);
1190 dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); 1190 dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
1191 bdi_writeout_inc(bdi); 1191 bdi_writeout_inc(bdi);
1192 wake_up(&fi->page_waitq); 1192 wake_up(&fi->page_waitq);
1193 } 1193 }
1194 1194
1195 /* Called under fc->lock, may release and reacquire it */ 1195 /* Called under fc->lock, may release and reacquire it */
1196 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) 1196 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1197 __releases(fc->lock) 1197 __releases(fc->lock)
1198 __acquires(fc->lock) 1198 __acquires(fc->lock)
1199 { 1199 {
1200 struct fuse_inode *fi = get_fuse_inode(req->inode); 1200 struct fuse_inode *fi = get_fuse_inode(req->inode);
1201 loff_t size = i_size_read(req->inode); 1201 loff_t size = i_size_read(req->inode);
1202 struct fuse_write_in *inarg = &req->misc.write.in; 1202 struct fuse_write_in *inarg = &req->misc.write.in;
1203 1203
1204 if (!fc->connected) 1204 if (!fc->connected)
1205 goto out_free; 1205 goto out_free;
1206 1206
1207 if (inarg->offset + PAGE_CACHE_SIZE <= size) { 1207 if (inarg->offset + PAGE_CACHE_SIZE <= size) {
1208 inarg->size = PAGE_CACHE_SIZE; 1208 inarg->size = PAGE_CACHE_SIZE;
1209 } else if (inarg->offset < size) { 1209 } else if (inarg->offset < size) {
1210 inarg->size = size & (PAGE_CACHE_SIZE - 1); 1210 inarg->size = size & (PAGE_CACHE_SIZE - 1);
1211 } else { 1211 } else {
1212 /* Got truncated off completely */ 1212 /* Got truncated off completely */
1213 goto out_free; 1213 goto out_free;
1214 } 1214 }
1215 1215
1216 req->in.args[1].size = inarg->size; 1216 req->in.args[1].size = inarg->size;
1217 fi->writectr++; 1217 fi->writectr++;
1218 fuse_request_send_background_locked(fc, req); 1218 fuse_request_send_background_locked(fc, req);
1219 return; 1219 return;
1220 1220
1221 out_free: 1221 out_free:
1222 fuse_writepage_finish(fc, req); 1222 fuse_writepage_finish(fc, req);
1223 spin_unlock(&fc->lock); 1223 spin_unlock(&fc->lock);
1224 fuse_writepage_free(fc, req); 1224 fuse_writepage_free(fc, req);
1225 fuse_put_request(fc, req); 1225 fuse_put_request(fc, req);
1226 spin_lock(&fc->lock); 1226 spin_lock(&fc->lock);
1227 } 1227 }
1228 1228
1229 /* 1229 /*
1230 * If fi->writectr is positive (no truncate or fsync going on) send 1230 * If fi->writectr is positive (no truncate or fsync going on) send
1231 * all queued writepage requests. 1231 * all queued writepage requests.
1232 * 1232 *
1233 * Called with fc->lock 1233 * Called with fc->lock
1234 */ 1234 */
1235 void fuse_flush_writepages(struct inode *inode) 1235 void fuse_flush_writepages(struct inode *inode)
1236 __releases(fc->lock) 1236 __releases(fc->lock)
1237 __acquires(fc->lock) 1237 __acquires(fc->lock)
1238 { 1238 {
1239 struct fuse_conn *fc = get_fuse_conn(inode); 1239 struct fuse_conn *fc = get_fuse_conn(inode);
1240 struct fuse_inode *fi = get_fuse_inode(inode); 1240 struct fuse_inode *fi = get_fuse_inode(inode);
1241 struct fuse_req *req; 1241 struct fuse_req *req;
1242 1242
1243 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { 1243 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1244 req = list_entry(fi->queued_writes.next, struct fuse_req, list); 1244 req = list_entry(fi->queued_writes.next, struct fuse_req, list);
1245 list_del_init(&req->list); 1245 list_del_init(&req->list);
1246 fuse_send_writepage(fc, req); 1246 fuse_send_writepage(fc, req);
1247 } 1247 }
1248 } 1248 }
1249 1249
1250 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) 1250 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
1251 { 1251 {
1252 struct inode *inode = req->inode; 1252 struct inode *inode = req->inode;
1253 struct fuse_inode *fi = get_fuse_inode(inode); 1253 struct fuse_inode *fi = get_fuse_inode(inode);
1254 1254
1255 mapping_set_error(inode->i_mapping, req->out.h.error); 1255 mapping_set_error(inode->i_mapping, req->out.h.error);
1256 spin_lock(&fc->lock); 1256 spin_lock(&fc->lock);
1257 fi->writectr--; 1257 fi->writectr--;
1258 fuse_writepage_finish(fc, req); 1258 fuse_writepage_finish(fc, req);
1259 spin_unlock(&fc->lock); 1259 spin_unlock(&fc->lock);
1260 fuse_writepage_free(fc, req); 1260 fuse_writepage_free(fc, req);
1261 } 1261 }
1262 1262
1263 static int fuse_writepage_locked(struct page *page) 1263 static int fuse_writepage_locked(struct page *page)
1264 { 1264 {
1265 struct address_space *mapping = page->mapping; 1265 struct address_space *mapping = page->mapping;
1266 struct inode *inode = mapping->host; 1266 struct inode *inode = mapping->host;
1267 struct fuse_conn *fc = get_fuse_conn(inode); 1267 struct fuse_conn *fc = get_fuse_conn(inode);
1268 struct fuse_inode *fi = get_fuse_inode(inode); 1268 struct fuse_inode *fi = get_fuse_inode(inode);
1269 struct fuse_req *req; 1269 struct fuse_req *req;
1270 struct fuse_file *ff; 1270 struct fuse_file *ff;
1271 struct page *tmp_page; 1271 struct page *tmp_page;
1272 1272
1273 set_page_writeback(page); 1273 set_page_writeback(page);
1274 1274
1275 req = fuse_request_alloc_nofs(); 1275 req = fuse_request_alloc_nofs();
1276 if (!req) 1276 if (!req)
1277 goto err; 1277 goto err;
1278 1278
1279 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1279 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1280 if (!tmp_page) 1280 if (!tmp_page)
1281 goto err_free; 1281 goto err_free;
1282 1282
1283 spin_lock(&fc->lock); 1283 spin_lock(&fc->lock);
1284 BUG_ON(list_empty(&fi->write_files)); 1284 BUG_ON(list_empty(&fi->write_files));
1285 ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); 1285 ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
1286 req->ff = fuse_file_get(ff); 1286 req->ff = fuse_file_get(ff);
1287 spin_unlock(&fc->lock); 1287 spin_unlock(&fc->lock);
1288 1288
1289 fuse_write_fill(req, ff, page_offset(page), 0); 1289 fuse_write_fill(req, ff, page_offset(page), 0);
1290 1290
1291 copy_highpage(tmp_page, page); 1291 copy_highpage(tmp_page, page);
1292 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; 1292 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
1293 req->in.argpages = 1; 1293 req->in.argpages = 1;
1294 req->num_pages = 1; 1294 req->num_pages = 1;
1295 req->pages[0] = tmp_page; 1295 req->pages[0] = tmp_page;
1296 req->page_offset = 0; 1296 req->page_offset = 0;
1297 req->end = fuse_writepage_end; 1297 req->end = fuse_writepage_end;
1298 req->inode = inode; 1298 req->inode = inode;
1299 1299
1300 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); 1300 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
1301 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1301 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1302 end_page_writeback(page); 1302 end_page_writeback(page);
1303 1303
1304 spin_lock(&fc->lock); 1304 spin_lock(&fc->lock);
1305 list_add(&req->writepages_entry, &fi->writepages); 1305 list_add(&req->writepages_entry, &fi->writepages);
1306 list_add_tail(&req->list, &fi->queued_writes); 1306 list_add_tail(&req->list, &fi->queued_writes);
1307 fuse_flush_writepages(inode); 1307 fuse_flush_writepages(inode);
1308 spin_unlock(&fc->lock); 1308 spin_unlock(&fc->lock);
1309 1309
1310 return 0; 1310 return 0;
1311 1311
1312 err_free: 1312 err_free:
1313 fuse_request_free(req); 1313 fuse_request_free(req);
1314 err: 1314 err:
1315 end_page_writeback(page); 1315 end_page_writeback(page);
1316 return -ENOMEM; 1316 return -ENOMEM;
1317 } 1317 }
1318 1318
1319 static int fuse_writepage(struct page *page, struct writeback_control *wbc) 1319 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1320 { 1320 {
1321 int err; 1321 int err;
1322 1322
1323 err = fuse_writepage_locked(page); 1323 err = fuse_writepage_locked(page);
1324 unlock_page(page); 1324 unlock_page(page);
1325 1325
1326 return err; 1326 return err;
1327 } 1327 }
1328 1328
1329 static int fuse_launder_page(struct page *page) 1329 static int fuse_launder_page(struct page *page)
1330 { 1330 {
1331 int err = 0; 1331 int err = 0;
1332 if (clear_page_dirty_for_io(page)) { 1332 if (clear_page_dirty_for_io(page)) {
1333 struct inode *inode = page->mapping->host; 1333 struct inode *inode = page->mapping->host;
1334 err = fuse_writepage_locked(page); 1334 err = fuse_writepage_locked(page);
1335 if (!err) 1335 if (!err)
1336 fuse_wait_on_page_writeback(inode, page->index); 1336 fuse_wait_on_page_writeback(inode, page->index);
1337 } 1337 }
1338 return err; 1338 return err;
1339 } 1339 }
1340 1340
1341 /* 1341 /*
1342 * Write back dirty pages now, because there may not be any suitable 1342 * Write back dirty pages now, because there may not be any suitable
1343 * open files later 1343 * open files later
1344 */ 1344 */
1345 static void fuse_vma_close(struct vm_area_struct *vma) 1345 static void fuse_vma_close(struct vm_area_struct *vma)
1346 { 1346 {
1347 filemap_write_and_wait(vma->vm_file->f_mapping); 1347 filemap_write_and_wait(vma->vm_file->f_mapping);
1348 } 1348 }
1349 1349
1350 /* 1350 /*
1351 * Wait for writeback against this page to complete before allowing it 1351 * Wait for writeback against this page to complete before allowing it
1352 * to be marked dirty again, and hence written back again, possibly 1352 * to be marked dirty again, and hence written back again, possibly
1353 * before the previous writepage completed. 1353 * before the previous writepage completed.
1354 * 1354 *
1355 * Block here, instead of in ->writepage(), so that the userspace fs 1355 * Block here, instead of in ->writepage(), so that the userspace fs
1356 * can only block processes actually operating on the filesystem. 1356 * can only block processes actually operating on the filesystem.
1357 * 1357 *
1358 * Otherwise unprivileged userspace fs would be able to block 1358 * Otherwise unprivileged userspace fs would be able to block
1359 * unrelated operations: 1359 * unrelated operations:
1360 * 1360 *
1361 * - page migration 1361 * - page migration
1362 * - sync(2) 1362 * - sync(2)
1363 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER 1363 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
1364 */ 1364 */
1365 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1365 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1366 { 1366 {
1367 struct page *page = vmf->page; 1367 struct page *page = vmf->page;
1368 /* 1368 /*
1369 * Don't use page->mapping as it may become NULL from a 1369 * Don't use page->mapping as it may become NULL from a
1370 * concurrent truncate. 1370 * concurrent truncate.
1371 */ 1371 */
1372 struct inode *inode = vma->vm_file->f_mapping->host; 1372 struct inode *inode = vma->vm_file->f_mapping->host;
1373 1373
1374 fuse_wait_on_page_writeback(inode, page->index); 1374 fuse_wait_on_page_writeback(inode, page->index);
1375 return 0; 1375 return 0;
1376 } 1376 }
1377 1377
1378 static const struct vm_operations_struct fuse_file_vm_ops = { 1378 static const struct vm_operations_struct fuse_file_vm_ops = {
1379 .close = fuse_vma_close, 1379 .close = fuse_vma_close,
1380 .fault = filemap_fault, 1380 .fault = filemap_fault,
1381 .page_mkwrite = fuse_page_mkwrite, 1381 .page_mkwrite = fuse_page_mkwrite,
1382 .remap_pages = generic_file_remap_pages, 1382 .remap_pages = generic_file_remap_pages,
1383 }; 1383 };
1384 1384
1385 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 1385 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
1386 { 1386 {
1387 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { 1387 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1388 struct inode *inode = file->f_dentry->d_inode; 1388 struct inode *inode = file->f_dentry->d_inode;
1389 struct fuse_conn *fc = get_fuse_conn(inode); 1389 struct fuse_conn *fc = get_fuse_conn(inode);
1390 struct fuse_inode *fi = get_fuse_inode(inode); 1390 struct fuse_inode *fi = get_fuse_inode(inode);
1391 struct fuse_file *ff = file->private_data; 1391 struct fuse_file *ff = file->private_data;
1392 /* 1392 /*
1393 * file may be written through mmap, so chain it onto the 1393 * file may be written through mmap, so chain it onto the
1394 * inode's write_files list 1394 * inode's write_files list
1395 */ 1395 */
1396 spin_lock(&fc->lock); 1396 spin_lock(&fc->lock);
1397 if (list_empty(&ff->write_entry)) 1397 if (list_empty(&ff->write_entry))
1398 list_add(&ff->write_entry, &fi->write_files); 1398 list_add(&ff->write_entry, &fi->write_files);
1399 spin_unlock(&fc->lock); 1399 spin_unlock(&fc->lock);
1400 } 1400 }
1401 file_accessed(file); 1401 file_accessed(file);
1402 vma->vm_ops = &fuse_file_vm_ops; 1402 vma->vm_ops = &fuse_file_vm_ops;
1403 return 0; 1403 return 0;
1404 } 1404 }
1405 1405
1406 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) 1406 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
1407 { 1407 {
1408 /* Can't provide the coherency needed for MAP_SHARED */ 1408 /* Can't provide the coherency needed for MAP_SHARED */
1409 if (vma->vm_flags & VM_MAYSHARE) 1409 if (vma->vm_flags & VM_MAYSHARE)
1410 return -ENODEV; 1410 return -ENODEV;
1411 1411
1412 invalidate_inode_pages2(file->f_mapping); 1412 invalidate_inode_pages2(file->f_mapping);
1413 1413
1414 return generic_file_mmap(file, vma); 1414 return generic_file_mmap(file, vma);
1415 } 1415 }
1416 1416
1417 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, 1417 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
1418 struct file_lock *fl) 1418 struct file_lock *fl)
1419 { 1419 {
1420 switch (ffl->type) { 1420 switch (ffl->type) {
1421 case F_UNLCK: 1421 case F_UNLCK:
1422 break; 1422 break;
1423 1423
1424 case F_RDLCK: 1424 case F_RDLCK:
1425 case F_WRLCK: 1425 case F_WRLCK:
1426 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || 1426 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
1427 ffl->end < ffl->start) 1427 ffl->end < ffl->start)
1428 return -EIO; 1428 return -EIO;
1429 1429
1430 fl->fl_start = ffl->start; 1430 fl->fl_start = ffl->start;
1431 fl->fl_end = ffl->end; 1431 fl->fl_end = ffl->end;
1432 fl->fl_pid = ffl->pid; 1432 fl->fl_pid = ffl->pid;
1433 break; 1433 break;
1434 1434
1435 default: 1435 default:
1436 return -EIO; 1436 return -EIO;
1437 } 1437 }
1438 fl->fl_type = ffl->type; 1438 fl->fl_type = ffl->type;
1439 return 0; 1439 return 0;
1440 } 1440 }
1441 1441
1442 static void fuse_lk_fill(struct fuse_req *req, struct file *file, 1442 static void fuse_lk_fill(struct fuse_req *req, struct file *file,
1443 const struct file_lock *fl, int opcode, pid_t pid, 1443 const struct file_lock *fl, int opcode, pid_t pid,
1444 int flock) 1444 int flock)
1445 { 1445 {
1446 struct inode *inode = file->f_path.dentry->d_inode; 1446 struct inode *inode = file->f_path.dentry->d_inode;
1447 struct fuse_conn *fc = get_fuse_conn(inode); 1447 struct fuse_conn *fc = get_fuse_conn(inode);
1448 struct fuse_file *ff = file->private_data; 1448 struct fuse_file *ff = file->private_data;
1449 struct fuse_lk_in *arg = &req->misc.lk_in; 1449 struct fuse_lk_in *arg = &req->misc.lk_in;
1450 1450
1451 arg->fh = ff->fh; 1451 arg->fh = ff->fh;
1452 arg->owner = fuse_lock_owner_id(fc, fl->fl_owner); 1452 arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
1453 arg->lk.start = fl->fl_start; 1453 arg->lk.start = fl->fl_start;
1454 arg->lk.end = fl->fl_end; 1454 arg->lk.end = fl->fl_end;
1455 arg->lk.type = fl->fl_type; 1455 arg->lk.type = fl->fl_type;
1456 arg->lk.pid = pid; 1456 arg->lk.pid = pid;
1457 if (flock) 1457 if (flock)
1458 arg->lk_flags |= FUSE_LK_FLOCK; 1458 arg->lk_flags |= FUSE_LK_FLOCK;
1459 req->in.h.opcode = opcode; 1459 req->in.h.opcode = opcode;
1460 req->in.h.nodeid = get_node_id(inode); 1460 req->in.h.nodeid = get_node_id(inode);
1461 req->in.numargs = 1; 1461 req->in.numargs = 1;
1462 req->in.args[0].size = sizeof(*arg); 1462 req->in.args[0].size = sizeof(*arg);
1463 req->in.args[0].value = arg; 1463 req->in.args[0].value = arg;
1464 } 1464 }
1465 1465
1466 static int fuse_getlk(struct file *file, struct file_lock *fl) 1466 static int fuse_getlk(struct file *file, struct file_lock *fl)
1467 { 1467 {
1468 struct inode *inode = file->f_path.dentry->d_inode; 1468 struct inode *inode = file->f_path.dentry->d_inode;
1469 struct fuse_conn *fc = get_fuse_conn(inode); 1469 struct fuse_conn *fc = get_fuse_conn(inode);
1470 struct fuse_req *req; 1470 struct fuse_req *req;
1471 struct fuse_lk_out outarg; 1471 struct fuse_lk_out outarg;
1472 int err; 1472 int err;
1473 1473
1474 req = fuse_get_req(fc); 1474 req = fuse_get_req(fc);
1475 if (IS_ERR(req)) 1475 if (IS_ERR(req))
1476 return PTR_ERR(req); 1476 return PTR_ERR(req);
1477 1477
1478 fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0); 1478 fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0);
1479 req->out.numargs = 1; 1479 req->out.numargs = 1;
1480 req->out.args[0].size = sizeof(outarg); 1480 req->out.args[0].size = sizeof(outarg);
1481 req->out.args[0].value = &outarg; 1481 req->out.args[0].value = &outarg;
1482 fuse_request_send(fc, req); 1482 fuse_request_send(fc, req);
1483 err = req->out.h.error; 1483 err = req->out.h.error;
1484 fuse_put_request(fc, req); 1484 fuse_put_request(fc, req);
1485 if (!err) 1485 if (!err)
1486 err = convert_fuse_file_lock(&outarg.lk, fl); 1486 err = convert_fuse_file_lock(&outarg.lk, fl);
1487 1487
1488 return err; 1488 return err;
1489 } 1489 }
1490 1490
1491 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) 1491 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1492 { 1492 {
1493 struct inode *inode = file->f_path.dentry->d_inode; 1493 struct inode *inode = file->f_path.dentry->d_inode;
1494 struct fuse_conn *fc = get_fuse_conn(inode); 1494 struct fuse_conn *fc = get_fuse_conn(inode);
1495 struct fuse_req *req; 1495 struct fuse_req *req;
1496 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; 1496 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
1497 pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; 1497 pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
1498 int err; 1498 int err;
1499 1499
1500 if (fl->fl_lmops && fl->fl_lmops->lm_grant) { 1500 if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
1501 /* NLM needs asynchronous locks, which we don't support yet */ 1501 /* NLM needs asynchronous locks, which we don't support yet */
1502 return -ENOLCK; 1502 return -ENOLCK;
1503 } 1503 }
1504 1504
1505 /* Unlock on close is handled by the flush method */ 1505 /* Unlock on close is handled by the flush method */
1506 if (fl->fl_flags & FL_CLOSE) 1506 if (fl->fl_flags & FL_CLOSE)
1507 return 0; 1507 return 0;
1508 1508
1509 req = fuse_get_req(fc); 1509 req = fuse_get_req(fc);
1510 if (IS_ERR(req)) 1510 if (IS_ERR(req))
1511 return PTR_ERR(req); 1511 return PTR_ERR(req);
1512 1512
1513 fuse_lk_fill(req, file, fl, opcode, pid, flock); 1513 fuse_lk_fill(req, file, fl, opcode, pid, flock);
1514 fuse_request_send(fc, req); 1514 fuse_request_send(fc, req);
1515 err = req->out.h.error; 1515 err = req->out.h.error;
1516 /* locking is restartable */ 1516 /* locking is restartable */
1517 if (err == -EINTR) 1517 if (err == -EINTR)
1518 err = -ERESTARTSYS; 1518 err = -ERESTARTSYS;
1519 fuse_put_request(fc, req); 1519 fuse_put_request(fc, req);
1520 return err; 1520 return err;
1521 } 1521 }
1522 1522
1523 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) 1523 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
1524 { 1524 {
1525 struct inode *inode = file->f_path.dentry->d_inode; 1525 struct inode *inode = file->f_path.dentry->d_inode;
1526 struct fuse_conn *fc = get_fuse_conn(inode); 1526 struct fuse_conn *fc = get_fuse_conn(inode);
1527 int err; 1527 int err;
1528 1528
1529 if (cmd == F_CANCELLK) { 1529 if (cmd == F_CANCELLK) {
1530 err = 0; 1530 err = 0;
1531 } else if (cmd == F_GETLK) { 1531 } else if (cmd == F_GETLK) {
1532 if (fc->no_lock) { 1532 if (fc->no_lock) {
1533 posix_test_lock(file, fl); 1533 posix_test_lock(file, fl);
1534 err = 0; 1534 err = 0;
1535 } else 1535 } else
1536 err = fuse_getlk(file, fl); 1536 err = fuse_getlk(file, fl);
1537 } else { 1537 } else {
1538 if (fc->no_lock) 1538 if (fc->no_lock)
1539 err = posix_lock_file(file, fl, NULL); 1539 err = posix_lock_file(file, fl, NULL);
1540 else 1540 else
1541 err = fuse_setlk(file, fl, 0); 1541 err = fuse_setlk(file, fl, 0);
1542 } 1542 }
1543 return err; 1543 return err;
1544 } 1544 }
1545 1545
1546 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) 1546 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
1547 { 1547 {
1548 struct inode *inode = file->f_path.dentry->d_inode; 1548 struct inode *inode = file->f_path.dentry->d_inode;
1549 struct fuse_conn *fc = get_fuse_conn(inode); 1549 struct fuse_conn *fc = get_fuse_conn(inode);
1550 int err; 1550 int err;
1551 1551
1552 if (fc->no_flock) { 1552 if (fc->no_flock) {
1553 err = flock_lock_file_wait(file, fl); 1553 err = flock_lock_file_wait(file, fl);
1554 } else { 1554 } else {
1555 struct fuse_file *ff = file->private_data; 1555 struct fuse_file *ff = file->private_data;
1556 1556
1557 /* emulate flock with POSIX locks */ 1557 /* emulate flock with POSIX locks */
1558 fl->fl_owner = (fl_owner_t) file; 1558 fl->fl_owner = (fl_owner_t) file;
1559 ff->flock = true; 1559 ff->flock = true;
1560 err = fuse_setlk(file, fl, 1); 1560 err = fuse_setlk(file, fl, 1);
1561 } 1561 }
1562 1562
1563 return err; 1563 return err;
1564 } 1564 }
1565 1565
1566 static sector_t fuse_bmap(struct address_space *mapping, sector_t block) 1566 static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1567 { 1567 {
1568 struct inode *inode = mapping->host; 1568 struct inode *inode = mapping->host;
1569 struct fuse_conn *fc = get_fuse_conn(inode); 1569 struct fuse_conn *fc = get_fuse_conn(inode);
1570 struct fuse_req *req; 1570 struct fuse_req *req;
1571 struct fuse_bmap_in inarg; 1571 struct fuse_bmap_in inarg;
1572 struct fuse_bmap_out outarg; 1572 struct fuse_bmap_out outarg;
1573 int err; 1573 int err;
1574 1574
1575 if (!inode->i_sb->s_bdev || fc->no_bmap) 1575 if (!inode->i_sb->s_bdev || fc->no_bmap)
1576 return 0; 1576 return 0;
1577 1577
1578 req = fuse_get_req(fc); 1578 req = fuse_get_req(fc);
1579 if (IS_ERR(req)) 1579 if (IS_ERR(req))
1580 return 0; 1580 return 0;
1581 1581
1582 memset(&inarg, 0, sizeof(inarg)); 1582 memset(&inarg, 0, sizeof(inarg));
1583 inarg.block = block; 1583 inarg.block = block;
1584 inarg.blocksize = inode->i_sb->s_blocksize; 1584 inarg.blocksize = inode->i_sb->s_blocksize;
1585 req->in.h.opcode = FUSE_BMAP; 1585 req->in.h.opcode = FUSE_BMAP;
1586 req->in.h.nodeid = get_node_id(inode); 1586 req->in.h.nodeid = get_node_id(inode);
1587 req->in.numargs = 1; 1587 req->in.numargs = 1;
1588 req->in.args[0].size = sizeof(inarg); 1588 req->in.args[0].size = sizeof(inarg);
1589 req->in.args[0].value = &inarg; 1589 req->in.args[0].value = &inarg;
1590 req->out.numargs = 1; 1590 req->out.numargs = 1;
1591 req->out.args[0].size = sizeof(outarg); 1591 req->out.args[0].size = sizeof(outarg);
1592 req->out.args[0].value = &outarg; 1592 req->out.args[0].value = &outarg;
1593 fuse_request_send(fc, req); 1593 fuse_request_send(fc, req);
1594 err = req->out.h.error; 1594 err = req->out.h.error;
1595 fuse_put_request(fc, req); 1595 fuse_put_request(fc, req);
1596 if (err == -ENOSYS) 1596 if (err == -ENOSYS)
1597 fc->no_bmap = 1; 1597 fc->no_bmap = 1;
1598 1598
1599 return err ? 0 : outarg.block; 1599 return err ? 0 : outarg.block;
1600 } 1600 }
1601 1601
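For context, fuse_bmap() above is what ultimately services the legacy FIBMAP ioctl, mapping a logical file block to a physical block on the backing device; the s_bdev test means it only answers for block-device-backed (fuseblk) mounts. A minimal user-space sketch, with a hypothetical mount path (FIBMAP conventionally requires CAP_SYS_RAWIO):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FIBMAP */

int main(void)
{
        int fd = open("/mnt/fuseblk/file", O_RDONLY);   /* hypothetical path */
        int block = 0;          /* in: logical block, out: physical block */

        if (fd < 0)
                return 1;
        if (ioctl(fd, FIBMAP, &block) == 0)
                printf("logical block 0 -> physical block %d\n", block);
        close(fd);
        return 0;
}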
1602 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) 1602 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
1603 { 1603 {
1604 loff_t retval; 1604 loff_t retval;
1605 struct inode *inode = file->f_path.dentry->d_inode; 1605 struct inode *inode = file->f_path.dentry->d_inode;
1606 1606
1607 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ 1607 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
1608 if (origin == SEEK_CUR || origin == SEEK_SET) 1608 if (whence == SEEK_CUR || whence == SEEK_SET)
1609 return generic_file_llseek(file, offset, origin); 1609 return generic_file_llseek(file, offset, whence);
1610 1610
1611 mutex_lock(&inode->i_mutex); 1611 mutex_lock(&inode->i_mutex);
1612 retval = fuse_update_attributes(inode, NULL, file, NULL); 1612 retval = fuse_update_attributes(inode, NULL, file, NULL);
1613 if (!retval) 1613 if (!retval)
1614 retval = generic_file_llseek(file, offset, origin); 1614 retval = generic_file_llseek(file, offset, whence);
1615 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1616 1616
1617 return retval; 1617 return retval;
1618 } 1618 }
1619 1619
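As the comment above notes, SEEK_CUR and SEEK_SET are pure f_pos arithmetic, while SEEK_END must see an up-to-date i_size, which is why fuse_file_llseek() refreshes the attributes under i_mutex first. A minimal user-space illustration of the three classic whence values, with a hypothetical path:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/fuse/file", O_RDONLY);      /* hypothetical mount */
        if (fd < 0)
                return 1;

        off_t end = lseek(fd, 0, SEEK_END);     /* needs a fresh i_size */
        off_t cur = lseek(fd, 0, SEEK_CUR);     /* f_pos arithmetic only */
        off_t set = lseek(fd, 0, SEEK_SET);     /* likewise */

        printf("end=%lld cur=%lld set=%lld\n",
               (long long)end, (long long)cur, (long long)set);
        close(fd);
        return 0;
}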
1620 static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, 1620 static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1621 unsigned int nr_segs, size_t bytes, bool to_user) 1621 unsigned int nr_segs, size_t bytes, bool to_user)
1622 { 1622 {
1623 struct iov_iter ii; 1623 struct iov_iter ii;
1624 int page_idx = 0; 1624 int page_idx = 0;
1625 1625
1626 if (!bytes) 1626 if (!bytes)
1627 return 0; 1627 return 0;
1628 1628
1629 iov_iter_init(&ii, iov, nr_segs, bytes, 0); 1629 iov_iter_init(&ii, iov, nr_segs, bytes, 0);
1630 1630
1631 while (iov_iter_count(&ii)) { 1631 while (iov_iter_count(&ii)) {
1632 struct page *page = pages[page_idx++]; 1632 struct page *page = pages[page_idx++];
1633 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); 1633 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1634 void *kaddr; 1634 void *kaddr;
1635 1635
1636 kaddr = kmap(page); 1636 kaddr = kmap(page);
1637 1637
1638 while (todo) { 1638 while (todo) {
1639 char __user *uaddr = ii.iov->iov_base + ii.iov_offset; 1639 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
1640 size_t iov_len = ii.iov->iov_len - ii.iov_offset; 1640 size_t iov_len = ii.iov->iov_len - ii.iov_offset;
1641 size_t copy = min(todo, iov_len); 1641 size_t copy = min(todo, iov_len);
1642 size_t left; 1642 size_t left;
1643 1643
1644 if (!to_user) 1644 if (!to_user)
1645 left = copy_from_user(kaddr, uaddr, copy); 1645 left = copy_from_user(kaddr, uaddr, copy);
1646 else 1646 else
1647 left = copy_to_user(uaddr, kaddr, copy); 1647 left = copy_to_user(uaddr, kaddr, copy);
1648 1648
1649 if (unlikely(left)) 1649 if (unlikely(left))
1650 return -EFAULT; 1650 return -EFAULT;
1651 1651
1652 iov_iter_advance(&ii, copy); 1652 iov_iter_advance(&ii, copy);
1653 todo -= copy; 1653 todo -= copy;
1654 kaddr += copy; 1654 kaddr += copy;
1655 } 1655 }
1656 1656
1657 kunmap(page); 1657 kunmap(page);
1658 } 1658 }
1659 1659
1660 return 0; 1660 return 0;
1661 } 1661 }
1662 1662
1663 /* 1663 /*
1664 * CUSE servers compiled on 32bit broke on 64bit kernels because the 1664 * CUSE servers compiled on 32bit broke on 64bit kernels because the
 1665 * ABI was defined to be 'struct iovec', which is different on 32bit 1665 * ABI was defined to be 'struct iovec', which is different on 32bit
1666 * and 64bit. Fortunately we can determine which structure the server 1666 * and 64bit. Fortunately we can determine which structure the server
1667 * used from the size of the reply. 1667 * used from the size of the reply.
1668 */ 1668 */
1669 static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, 1669 static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
1670 size_t transferred, unsigned count, 1670 size_t transferred, unsigned count,
1671 bool is_compat) 1671 bool is_compat)
1672 { 1672 {
1673 #ifdef CONFIG_COMPAT 1673 #ifdef CONFIG_COMPAT
1674 if (count * sizeof(struct compat_iovec) == transferred) { 1674 if (count * sizeof(struct compat_iovec) == transferred) {
1675 struct compat_iovec *ciov = src; 1675 struct compat_iovec *ciov = src;
1676 unsigned i; 1676 unsigned i;
1677 1677
1678 /* 1678 /*
1679 * With this interface a 32bit server cannot support 1679 * With this interface a 32bit server cannot support
1680 * non-compat (i.e. ones coming from 64bit apps) ioctl 1680 * non-compat (i.e. ones coming from 64bit apps) ioctl
1681 * requests 1681 * requests
1682 */ 1682 */
1683 if (!is_compat) 1683 if (!is_compat)
1684 return -EINVAL; 1684 return -EINVAL;
1685 1685
1686 for (i = 0; i < count; i++) { 1686 for (i = 0; i < count; i++) {
1687 dst[i].iov_base = compat_ptr(ciov[i].iov_base); 1687 dst[i].iov_base = compat_ptr(ciov[i].iov_base);
1688 dst[i].iov_len = ciov[i].iov_len; 1688 dst[i].iov_len = ciov[i].iov_len;
1689 } 1689 }
1690 return 0; 1690 return 0;
1691 } 1691 }
1692 #endif 1692 #endif
1693 1693
1694 if (count * sizeof(struct iovec) != transferred) 1694 if (count * sizeof(struct iovec) != transferred)
1695 return -EIO; 1695 return -EIO;
1696 1696
1697 memcpy(dst, src, transferred); 1697 memcpy(dst, src, transferred);
1698 return 0; 1698 return 0;
1699 } 1699 }
1700 1700
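The size test above is unambiguous because the two candidate layouts differ in width: on a 64-bit kernel sizeof(struct iovec) is 16 while the compat layout is 8, so for any non-zero count the two possible reply sizes never coincide. A stand-alone illustration (compat_iovec32 is a stand-in for the kernel's struct compat_iovec):

#include <stdio.h>
#include <sys/uio.h>

/* stand-in for the kernel's struct compat_iovec: two 32-bit fields */
struct compat_iovec32 {
        unsigned int iov_base;
        unsigned int iov_len;
};

int main(void)
{
        unsigned int count = 3;

        /* 48 bytes from a 64-bit server, 24 from a 32-bit one */
        printf("native: %zu, compat: %zu\n",
               count * sizeof(struct iovec),
               count * sizeof(struct compat_iovec32));
        return 0;
}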
1701 /* Make sure iov_length() won't overflow */ 1701 /* Make sure iov_length() won't overflow */
1702 static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) 1702 static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1703 { 1703 {
1704 size_t n; 1704 size_t n;
1705 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; 1705 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1706 1706
1707 for (n = 0; n < count; n++, iov++) { 1707 for (n = 0; n < count; n++, iov++) {
1708 if (iov->iov_len > (size_t) max) 1708 if (iov->iov_len > (size_t) max)
1709 return -ENOMEM; 1709 return -ENOMEM;
1710 max -= iov->iov_len; 1710 max -= iov->iov_len;
1711 } 1711 }
1712 return 0; 1712 return 0;
1713 } 1713 }
1714 1714
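fuse_verify_ioctl_iov() bounds the sum of the lengths without ever adding them up: each iov_len is checked against the remaining budget and then subtracted from it, so no intermediate total can wrap. The same pattern in isolation, as a minimal sketch:

#include <stddef.h>

/* Returns 0 iff len[0] + ... + len[count-1] <= limit, computed
 * without any intermediate sum that could overflow. */
static int lengths_fit(const size_t *len, size_t count, size_t limit)
{
        size_t n;

        for (n = 0; n < count; n++) {
                if (len[n] > limit)
                        return -1;      /* exceeds the remaining budget */
                limit -= len[n];
        }
        return 0;
}

int main(void)
{
        size_t len[2] = { (size_t)-1, 2 };      /* a naive sum would wrap */

        return lengths_fit(len, 2, 4096) == -1 ? 0 : 1;
}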
1715 static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, 1715 static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
1716 void *src, size_t transferred, unsigned count, 1716 void *src, size_t transferred, unsigned count,
1717 bool is_compat) 1717 bool is_compat)
1718 { 1718 {
1719 unsigned i; 1719 unsigned i;
1720 struct fuse_ioctl_iovec *fiov = src; 1720 struct fuse_ioctl_iovec *fiov = src;
1721 1721
1722 if (fc->minor < 16) { 1722 if (fc->minor < 16) {
1723 return fuse_copy_ioctl_iovec_old(dst, src, transferred, 1723 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
1724 count, is_compat); 1724 count, is_compat);
1725 } 1725 }
1726 1726
1727 if (count * sizeof(struct fuse_ioctl_iovec) != transferred) 1727 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
1728 return -EIO; 1728 return -EIO;
1729 1729
1730 for (i = 0; i < count; i++) { 1730 for (i = 0; i < count; i++) {
1731 /* Did the server supply an inappropriate value? */ 1731 /* Did the server supply an inappropriate value? */
1732 if (fiov[i].base != (unsigned long) fiov[i].base || 1732 if (fiov[i].base != (unsigned long) fiov[i].base ||
1733 fiov[i].len != (unsigned long) fiov[i].len) 1733 fiov[i].len != (unsigned long) fiov[i].len)
1734 return -EIO; 1734 return -EIO;
1735 1735
1736 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; 1736 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
1737 dst[i].iov_len = (size_t) fiov[i].len; 1737 dst[i].iov_len = (size_t) fiov[i].len;
1738 1738
1739 #ifdef CONFIG_COMPAT 1739 #ifdef CONFIG_COMPAT
1740 if (is_compat && 1740 if (is_compat &&
1741 (ptr_to_compat(dst[i].iov_base) != fiov[i].base || 1741 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
1742 (compat_size_t) dst[i].iov_len != fiov[i].len)) 1742 (compat_size_t) dst[i].iov_len != fiov[i].len))
1743 return -EIO; 1743 return -EIO;
1744 #endif 1744 #endif
1745 } 1745 }
1746 1746
1747 return 0; 1747 return 0;
1748 } 1748 }
1749 1749
1750 1750
1751 /* 1751 /*
1752 * For ioctls, there is no generic way to determine how much memory 1752 * For ioctls, there is no generic way to determine how much memory
1753 * needs to be read and/or written. Furthermore, ioctls are allowed 1753 * needs to be read and/or written. Furthermore, ioctls are allowed
1754 * to dereference the passed pointer, so the parameter requires deep 1754 * to dereference the passed pointer, so the parameter requires deep
1755 * copying but FUSE has no idea whatsoever about what to copy in or 1755 * copying but FUSE has no idea whatsoever about what to copy in or
1756 * out. 1756 * out.
1757 * 1757 *
1758 * This is solved by allowing FUSE server to retry ioctl with 1758 * This is solved by allowing FUSE server to retry ioctl with
1759 * necessary in/out iovecs. Let's assume the ioctl implementation 1759 * necessary in/out iovecs. Let's assume the ioctl implementation
1760 * needs to read in the following structure. 1760 * needs to read in the following structure.
1761 * 1761 *
1762 * struct a { 1762 * struct a {
1763 * char *buf; 1763 * char *buf;
1764 * size_t buflen; 1764 * size_t buflen;
1765 * } 1765 * }
1766 * 1766 *
1767 * On the first callout to FUSE server, inarg->in_size and 1767 * On the first callout to FUSE server, inarg->in_size and
 1768 * inarg->out_size will be zero; then, the server completes the ioctl 1768 * inarg->out_size will be zero; then, the server completes the ioctl
1769 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and 1769 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
1770 * the actual iov array to 1770 * the actual iov array to
1771 * 1771 *
1772 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } 1772 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
1773 * 1773 *
1774 * which tells FUSE to copy in the requested area and retry the ioctl. 1774 * which tells FUSE to copy in the requested area and retry the ioctl.
1775 * On the second round, the server has access to the structure and 1775 * On the second round, the server has access to the structure and
 1776 * from that it can tell what to look for next, so on this invocation, 1776 * from that it can tell what to look for next, so on this invocation,
1777 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to 1777 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
1778 * 1778 *
1779 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, 1779 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
1780 * { .iov_base = a.buf, .iov_len = a.buflen } } 1780 * { .iov_base = a.buf, .iov_len = a.buflen } }
1781 * 1781 *
1782 * FUSE will copy both struct a and the pointed buffer from the 1782 * FUSE will copy both struct a and the pointed buffer from the
1783 * process doing the ioctl and retry ioctl with both struct a and the 1783 * process doing the ioctl and retry ioctl with both struct a and the
1784 * buffer. 1784 * buffer.
1785 * 1785 *
1786 * This time, FUSE server has everything it needs and completes ioctl 1786 * This time, FUSE server has everything it needs and completes ioctl
1787 * without FUSE_IOCTL_RETRY which finishes the ioctl call. 1787 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
1788 * 1788 *
1789 * Copying data out works the same way. 1789 * Copying data out works the same way.
1790 * 1790 *
1791 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel 1791 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
1792 * automatically initializes in and out iovs by decoding @cmd with 1792 * automatically initializes in and out iovs by decoding @cmd with
1793 * _IOC_* macros and the server is not allowed to request RETRY. This 1793 * _IOC_* macros and the server is not allowed to request RETRY. This
1794 * limits ioctl data transfers to well-formed ioctls and is the forced 1794 * limits ioctl data transfers to well-formed ioctls and is the forced
1795 * behavior for all FUSE servers. 1795 * behavior for all FUSE servers.
1796 */ 1796 */
1797 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, 1797 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1798 unsigned int flags) 1798 unsigned int flags)
1799 { 1799 {
1800 struct fuse_file *ff = file->private_data; 1800 struct fuse_file *ff = file->private_data;
1801 struct fuse_conn *fc = ff->fc; 1801 struct fuse_conn *fc = ff->fc;
1802 struct fuse_ioctl_in inarg = { 1802 struct fuse_ioctl_in inarg = {
1803 .fh = ff->fh, 1803 .fh = ff->fh,
1804 .cmd = cmd, 1804 .cmd = cmd,
1805 .arg = arg, 1805 .arg = arg,
1806 .flags = flags 1806 .flags = flags
1807 }; 1807 };
1808 struct fuse_ioctl_out outarg; 1808 struct fuse_ioctl_out outarg;
1809 struct fuse_req *req = NULL; 1809 struct fuse_req *req = NULL;
1810 struct page **pages = NULL; 1810 struct page **pages = NULL;
1811 struct iovec *iov_page = NULL; 1811 struct iovec *iov_page = NULL;
1812 struct iovec *in_iov = NULL, *out_iov = NULL; 1812 struct iovec *in_iov = NULL, *out_iov = NULL;
1813 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 1813 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1814 size_t in_size, out_size, transferred; 1814 size_t in_size, out_size, transferred;
1815 int err; 1815 int err;
1816 1816
1817 #if BITS_PER_LONG == 32 1817 #if BITS_PER_LONG == 32
1818 inarg.flags |= FUSE_IOCTL_32BIT; 1818 inarg.flags |= FUSE_IOCTL_32BIT;
1819 #else 1819 #else
1820 if (flags & FUSE_IOCTL_COMPAT) 1820 if (flags & FUSE_IOCTL_COMPAT)
1821 inarg.flags |= FUSE_IOCTL_32BIT; 1821 inarg.flags |= FUSE_IOCTL_32BIT;
1822 #endif 1822 #endif
1823 1823
 1824 /* assume all the iovs returned by the client always fit in a page */ 1824 /* assume all the iovs returned by the client always fit in a page */
1825 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1825 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1826 1826
1827 err = -ENOMEM; 1827 err = -ENOMEM;
1828 pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); 1828 pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
1829 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); 1829 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1830 if (!pages || !iov_page) 1830 if (!pages || !iov_page)
1831 goto out; 1831 goto out;
1832 1832
1833 /* 1833 /*
1834 * If restricted, initialize IO parameters as encoded in @cmd. 1834 * If restricted, initialize IO parameters as encoded in @cmd.
1835 * RETRY from server is not allowed. 1835 * RETRY from server is not allowed.
1836 */ 1836 */
1837 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { 1837 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1838 struct iovec *iov = iov_page; 1838 struct iovec *iov = iov_page;
1839 1839
1840 iov->iov_base = (void __user *)arg; 1840 iov->iov_base = (void __user *)arg;
1841 iov->iov_len = _IOC_SIZE(cmd); 1841 iov->iov_len = _IOC_SIZE(cmd);
1842 1842
1843 if (_IOC_DIR(cmd) & _IOC_WRITE) { 1843 if (_IOC_DIR(cmd) & _IOC_WRITE) {
1844 in_iov = iov; 1844 in_iov = iov;
1845 in_iovs = 1; 1845 in_iovs = 1;
1846 } 1846 }
1847 1847
1848 if (_IOC_DIR(cmd) & _IOC_READ) { 1848 if (_IOC_DIR(cmd) & _IOC_READ) {
1849 out_iov = iov; 1849 out_iov = iov;
1850 out_iovs = 1; 1850 out_iovs = 1;
1851 } 1851 }
1852 } 1852 }
1853 1853
1854 retry: 1854 retry:
1855 inarg.in_size = in_size = iov_length(in_iov, in_iovs); 1855 inarg.in_size = in_size = iov_length(in_iov, in_iovs);
1856 inarg.out_size = out_size = iov_length(out_iov, out_iovs); 1856 inarg.out_size = out_size = iov_length(out_iov, out_iovs);
1857 1857
1858 /* 1858 /*
 1859 * Out data can be used either for actual out data or iovs; 1859 * Out data can be used either for actual out data or iovs;
 1860 * make sure there is always at least one page. 1860 * make sure there is always at least one page.
1861 */ 1861 */
1862 out_size = max_t(size_t, out_size, PAGE_SIZE); 1862 out_size = max_t(size_t, out_size, PAGE_SIZE);
1863 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); 1863 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
1864 1864
1865 /* make sure there are enough buffer pages and init request with them */ 1865 /* make sure there are enough buffer pages and init request with them */
1866 err = -ENOMEM; 1866 err = -ENOMEM;
1867 if (max_pages > FUSE_MAX_PAGES_PER_REQ) 1867 if (max_pages > FUSE_MAX_PAGES_PER_REQ)
1868 goto out; 1868 goto out;
1869 while (num_pages < max_pages) { 1869 while (num_pages < max_pages) {
1870 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 1870 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
1871 if (!pages[num_pages]) 1871 if (!pages[num_pages])
1872 goto out; 1872 goto out;
1873 num_pages++; 1873 num_pages++;
1874 } 1874 }
1875 1875
1876 req = fuse_get_req(fc); 1876 req = fuse_get_req(fc);
1877 if (IS_ERR(req)) { 1877 if (IS_ERR(req)) {
1878 err = PTR_ERR(req); 1878 err = PTR_ERR(req);
1879 req = NULL; 1879 req = NULL;
1880 goto out; 1880 goto out;
1881 } 1881 }
1882 memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); 1882 memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
1883 req->num_pages = num_pages; 1883 req->num_pages = num_pages;
1884 1884
1885 /* okay, let's send it to the client */ 1885 /* okay, let's send it to the client */
1886 req->in.h.opcode = FUSE_IOCTL; 1886 req->in.h.opcode = FUSE_IOCTL;
1887 req->in.h.nodeid = ff->nodeid; 1887 req->in.h.nodeid = ff->nodeid;
1888 req->in.numargs = 1; 1888 req->in.numargs = 1;
1889 req->in.args[0].size = sizeof(inarg); 1889 req->in.args[0].size = sizeof(inarg);
1890 req->in.args[0].value = &inarg; 1890 req->in.args[0].value = &inarg;
1891 if (in_size) { 1891 if (in_size) {
1892 req->in.numargs++; 1892 req->in.numargs++;
1893 req->in.args[1].size = in_size; 1893 req->in.args[1].size = in_size;
1894 req->in.argpages = 1; 1894 req->in.argpages = 1;
1895 1895
1896 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, 1896 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
1897 false); 1897 false);
1898 if (err) 1898 if (err)
1899 goto out; 1899 goto out;
1900 } 1900 }
1901 1901
1902 req->out.numargs = 2; 1902 req->out.numargs = 2;
1903 req->out.args[0].size = sizeof(outarg); 1903 req->out.args[0].size = sizeof(outarg);
1904 req->out.args[0].value = &outarg; 1904 req->out.args[0].value = &outarg;
1905 req->out.args[1].size = out_size; 1905 req->out.args[1].size = out_size;
1906 req->out.argpages = 1; 1906 req->out.argpages = 1;
1907 req->out.argvar = 1; 1907 req->out.argvar = 1;
1908 1908
1909 fuse_request_send(fc, req); 1909 fuse_request_send(fc, req);
1910 err = req->out.h.error; 1910 err = req->out.h.error;
1911 transferred = req->out.args[1].size; 1911 transferred = req->out.args[1].size;
1912 fuse_put_request(fc, req); 1912 fuse_put_request(fc, req);
1913 req = NULL; 1913 req = NULL;
1914 if (err) 1914 if (err)
1915 goto out; 1915 goto out;
1916 1916
1917 /* did it ask for retry? */ 1917 /* did it ask for retry? */
1918 if (outarg.flags & FUSE_IOCTL_RETRY) { 1918 if (outarg.flags & FUSE_IOCTL_RETRY) {
1919 void *vaddr; 1919 void *vaddr;
1920 1920
1921 /* no retry if in restricted mode */ 1921 /* no retry if in restricted mode */
1922 err = -EIO; 1922 err = -EIO;
1923 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) 1923 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
1924 goto out; 1924 goto out;
1925 1925
1926 in_iovs = outarg.in_iovs; 1926 in_iovs = outarg.in_iovs;
1927 out_iovs = outarg.out_iovs; 1927 out_iovs = outarg.out_iovs;
1928 1928
1929 /* 1929 /*
 1930 * Make sure everything is within bounds; the separate checks 1930 * Make sure everything is within bounds; the separate checks
 1931 * protect against overflow. 1931 * protect against overflow.
1932 */ 1932 */
1933 err = -ENOMEM; 1933 err = -ENOMEM;
1934 if (in_iovs > FUSE_IOCTL_MAX_IOV || 1934 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
1935 out_iovs > FUSE_IOCTL_MAX_IOV || 1935 out_iovs > FUSE_IOCTL_MAX_IOV ||
1936 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 1936 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1937 goto out; 1937 goto out;
1938 1938
1939 vaddr = kmap_atomic(pages[0]); 1939 vaddr = kmap_atomic(pages[0]);
1940 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, 1940 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
1941 transferred, in_iovs + out_iovs, 1941 transferred, in_iovs + out_iovs,
1942 (flags & FUSE_IOCTL_COMPAT) != 0); 1942 (flags & FUSE_IOCTL_COMPAT) != 0);
1943 kunmap_atomic(vaddr); 1943 kunmap_atomic(vaddr);
1944 if (err) 1944 if (err)
1945 goto out; 1945 goto out;
1946 1946
1947 in_iov = iov_page; 1947 in_iov = iov_page;
1948 out_iov = in_iov + in_iovs; 1948 out_iov = in_iov + in_iovs;
1949 1949
1950 err = fuse_verify_ioctl_iov(in_iov, in_iovs); 1950 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
1951 if (err) 1951 if (err)
1952 goto out; 1952 goto out;
1953 1953
1954 err = fuse_verify_ioctl_iov(out_iov, out_iovs); 1954 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
1955 if (err) 1955 if (err)
1956 goto out; 1956 goto out;
1957 1957
1958 goto retry; 1958 goto retry;
1959 } 1959 }
1960 1960
1961 err = -EIO; 1961 err = -EIO;
1962 if (transferred > inarg.out_size) 1962 if (transferred > inarg.out_size)
1963 goto out; 1963 goto out;
1964 1964
1965 err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); 1965 err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
1966 out: 1966 out:
1967 if (req) 1967 if (req)
1968 fuse_put_request(fc, req); 1968 fuse_put_request(fc, req);
1969 free_page((unsigned long) iov_page); 1969 free_page((unsigned long) iov_page);
1970 while (num_pages) 1970 while (num_pages)
1971 __free_page(pages[--num_pages]); 1971 __free_page(pages[--num_pages]);
1972 kfree(pages); 1972 kfree(pages);
1973 1973
1974 return err ? err : outarg.result; 1974 return err ? err : outarg.result;
1975 } 1975 }
1976 EXPORT_SYMBOL_GPL(fuse_do_ioctl); 1976 EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1977 1977
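To make the retry protocol concrete from the other side, here is a rough sketch of a server's FUSE_IOCTL handler for the 'struct a' example in the comment above. reply_retry() and reply_done() are hypothetical helpers that would marshal struct fuse_ioctl_out (with and without FUSE_IOCTL_RETRY) back to the kernel; they are not part of any real library API:

#include <stdint.h>
#include <sys/uio.h>
#include <linux/fuse.h>

struct a {
        char *buf;
        size_t buflen;
};

/* hypothetical reply helpers, not a real API */
void reply_retry(const struct iovec *in_iov, unsigned in_iovs,
                 const struct iovec *out_iov, unsigned out_iovs);
void reply_done(int result);

static void handle_ioctl(const struct fuse_ioctl_in *in,
                         const void *in_data, size_t in_size)
{
        if (in_size == 0) {
                /* round 1: ask the kernel to copy in struct a itself */
                struct iovec iov = {
                        .iov_base = (void *)(uintptr_t)in->arg,
                        .iov_len = sizeof(struct a),
                };
                reply_retry(&iov, 1, NULL, 0);  /* sets FUSE_IOCTL_RETRY */
        } else if (in_size == sizeof(struct a)) {
                /* round 2: struct a is visible; also ask for the buffer */
                const struct a *a = in_data;
                struct iovec iov[2] = {
                        { .iov_base = (void *)(uintptr_t)in->arg,
                          .iov_len = sizeof(*a) },
                        { .iov_base = a->buf, .iov_len = a->buflen },
                };
                reply_retry(iov, 2, NULL, 0);
        } else {
                /* round 3: everything arrived; finish without RETRY */
                reply_done(0);
        }
}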
1978 long fuse_ioctl_common(struct file *file, unsigned int cmd, 1978 long fuse_ioctl_common(struct file *file, unsigned int cmd,
1979 unsigned long arg, unsigned int flags) 1979 unsigned long arg, unsigned int flags)
1980 { 1980 {
1981 struct inode *inode = file->f_dentry->d_inode; 1981 struct inode *inode = file->f_dentry->d_inode;
1982 struct fuse_conn *fc = get_fuse_conn(inode); 1982 struct fuse_conn *fc = get_fuse_conn(inode);
1983 1983
1984 if (!fuse_allow_task(fc, current)) 1984 if (!fuse_allow_task(fc, current))
1985 return -EACCES; 1985 return -EACCES;
1986 1986
1987 if (is_bad_inode(inode)) 1987 if (is_bad_inode(inode))
1988 return -EIO; 1988 return -EIO;
1989 1989
1990 return fuse_do_ioctl(file, cmd, arg, flags); 1990 return fuse_do_ioctl(file, cmd, arg, flags);
1991 } 1991 }
1992 1992
1993 static long fuse_file_ioctl(struct file *file, unsigned int cmd, 1993 static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1994 unsigned long arg) 1994 unsigned long arg)
1995 { 1995 {
1996 return fuse_ioctl_common(file, cmd, arg, 0); 1996 return fuse_ioctl_common(file, cmd, arg, 0);
1997 } 1997 }
1998 1998
1999 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 1999 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
2000 unsigned long arg) 2000 unsigned long arg)
2001 { 2001 {
2002 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); 2002 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
2003 } 2003 }
2004 2004
2005 /* 2005 /*
 2006 * All files which have been polled are linked to the RB tree 2006 * All files which have been polled are linked to the RB tree
 2007 * fuse_conn->polled_files, which is indexed by kh. Walk the tree and 2007 * fuse_conn->polled_files, which is indexed by kh. Walk the tree and
2008 * find the matching one. 2008 * find the matching one.
2009 */ 2009 */
2010 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, 2010 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2011 struct rb_node **parent_out) 2011 struct rb_node **parent_out)
2012 { 2012 {
2013 struct rb_node **link = &fc->polled_files.rb_node; 2013 struct rb_node **link = &fc->polled_files.rb_node;
2014 struct rb_node *last = NULL; 2014 struct rb_node *last = NULL;
2015 2015
2016 while (*link) { 2016 while (*link) {
2017 struct fuse_file *ff; 2017 struct fuse_file *ff;
2018 2018
2019 last = *link; 2019 last = *link;
2020 ff = rb_entry(last, struct fuse_file, polled_node); 2020 ff = rb_entry(last, struct fuse_file, polled_node);
2021 2021
2022 if (kh < ff->kh) 2022 if (kh < ff->kh)
2023 link = &last->rb_left; 2023 link = &last->rb_left;
2024 else if (kh > ff->kh) 2024 else if (kh > ff->kh)
2025 link = &last->rb_right; 2025 link = &last->rb_right;
2026 else 2026 else
2027 return link; 2027 return link;
2028 } 2028 }
2029 2029
2030 if (parent_out) 2030 if (parent_out)
2031 *parent_out = last; 2031 *parent_out = last;
2032 return link; 2032 return link;
2033 } 2033 }
2034 2034
2035 /* 2035 /*
2036 * The file is about to be polled. Make sure it's on the polled_files 2036 * The file is about to be polled. Make sure it's on the polled_files
2037 * RB tree. Note that files once added to the polled_files tree are 2037 * RB tree. Note that files once added to the polled_files tree are
2038 * not removed before the file is released. This is because a file 2038 * not removed before the file is released. This is because a file
2039 * polled once is likely to be polled again. 2039 * polled once is likely to be polled again.
2040 */ 2040 */
2041 static void fuse_register_polled_file(struct fuse_conn *fc, 2041 static void fuse_register_polled_file(struct fuse_conn *fc,
2042 struct fuse_file *ff) 2042 struct fuse_file *ff)
2043 { 2043 {
2044 spin_lock(&fc->lock); 2044 spin_lock(&fc->lock);
2045 if (RB_EMPTY_NODE(&ff->polled_node)) { 2045 if (RB_EMPTY_NODE(&ff->polled_node)) {
2046 struct rb_node **link, *parent; 2046 struct rb_node **link, *parent;
2047 2047
2048 link = fuse_find_polled_node(fc, ff->kh, &parent); 2048 link = fuse_find_polled_node(fc, ff->kh, &parent);
2049 BUG_ON(*link); 2049 BUG_ON(*link);
2050 rb_link_node(&ff->polled_node, parent, link); 2050 rb_link_node(&ff->polled_node, parent, link);
2051 rb_insert_color(&ff->polled_node, &fc->polled_files); 2051 rb_insert_color(&ff->polled_node, &fc->polled_files);
2052 } 2052 }
2053 spin_unlock(&fc->lock); 2053 spin_unlock(&fc->lock);
2054 } 2054 }
2055 2055
2056 unsigned fuse_file_poll(struct file *file, poll_table *wait) 2056 unsigned fuse_file_poll(struct file *file, poll_table *wait)
2057 { 2057 {
2058 struct fuse_file *ff = file->private_data; 2058 struct fuse_file *ff = file->private_data;
2059 struct fuse_conn *fc = ff->fc; 2059 struct fuse_conn *fc = ff->fc;
2060 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; 2060 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2061 struct fuse_poll_out outarg; 2061 struct fuse_poll_out outarg;
2062 struct fuse_req *req; 2062 struct fuse_req *req;
2063 int err; 2063 int err;
2064 2064
2065 if (fc->no_poll) 2065 if (fc->no_poll)
2066 return DEFAULT_POLLMASK; 2066 return DEFAULT_POLLMASK;
2067 2067
2068 poll_wait(file, &ff->poll_wait, wait); 2068 poll_wait(file, &ff->poll_wait, wait);
2069 2069
2070 /* 2070 /*
2071 * Ask for notification iff there's someone waiting for it. 2071 * Ask for notification iff there's someone waiting for it.
2072 * The client may ignore the flag and always notify. 2072 * The client may ignore the flag and always notify.
2073 */ 2073 */
2074 if (waitqueue_active(&ff->poll_wait)) { 2074 if (waitqueue_active(&ff->poll_wait)) {
2075 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; 2075 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2076 fuse_register_polled_file(fc, ff); 2076 fuse_register_polled_file(fc, ff);
2077 } 2077 }
2078 2078
2079 req = fuse_get_req(fc); 2079 req = fuse_get_req(fc);
2080 if (IS_ERR(req)) 2080 if (IS_ERR(req))
2081 return POLLERR; 2081 return POLLERR;
2082 2082
2083 req->in.h.opcode = FUSE_POLL; 2083 req->in.h.opcode = FUSE_POLL;
2084 req->in.h.nodeid = ff->nodeid; 2084 req->in.h.nodeid = ff->nodeid;
2085 req->in.numargs = 1; 2085 req->in.numargs = 1;
2086 req->in.args[0].size = sizeof(inarg); 2086 req->in.args[0].size = sizeof(inarg);
2087 req->in.args[0].value = &inarg; 2087 req->in.args[0].value = &inarg;
2088 req->out.numargs = 1; 2088 req->out.numargs = 1;
2089 req->out.args[0].size = sizeof(outarg); 2089 req->out.args[0].size = sizeof(outarg);
2090 req->out.args[0].value = &outarg; 2090 req->out.args[0].value = &outarg;
2091 fuse_request_send(fc, req); 2091 fuse_request_send(fc, req);
2092 err = req->out.h.error; 2092 err = req->out.h.error;
2093 fuse_put_request(fc, req); 2093 fuse_put_request(fc, req);
2094 2094
2095 if (!err) 2095 if (!err)
2096 return outarg.revents; 2096 return outarg.revents;
2097 if (err == -ENOSYS) { 2097 if (err == -ENOSYS) {
2098 fc->no_poll = 1; 2098 fc->no_poll = 1;
2099 return DEFAULT_POLLMASK; 2099 return DEFAULT_POLLMASK;
2100 } 2100 }
2101 return POLLERR; 2101 return POLLERR;
2102 } 2102 }
2103 EXPORT_SYMBOL_GPL(fuse_file_poll); 2103 EXPORT_SYMBOL_GPL(fuse_file_poll);
2104 2104
2105 /* 2105 /*
2106 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and 2106 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
2107 * wakes up the poll waiters. 2107 * wakes up the poll waiters.
2108 */ 2108 */
2109 int fuse_notify_poll_wakeup(struct fuse_conn *fc, 2109 int fuse_notify_poll_wakeup(struct fuse_conn *fc,
2110 struct fuse_notify_poll_wakeup_out *outarg) 2110 struct fuse_notify_poll_wakeup_out *outarg)
2111 { 2111 {
2112 u64 kh = outarg->kh; 2112 u64 kh = outarg->kh;
2113 struct rb_node **link; 2113 struct rb_node **link;
2114 2114
2115 spin_lock(&fc->lock); 2115 spin_lock(&fc->lock);
2116 2116
2117 link = fuse_find_polled_node(fc, kh, NULL); 2117 link = fuse_find_polled_node(fc, kh, NULL);
2118 if (*link) { 2118 if (*link) {
2119 struct fuse_file *ff; 2119 struct fuse_file *ff;
2120 2120
2121 ff = rb_entry(*link, struct fuse_file, polled_node); 2121 ff = rb_entry(*link, struct fuse_file, polled_node);
2122 wake_up_interruptible_sync(&ff->poll_wait); 2122 wake_up_interruptible_sync(&ff->poll_wait);
2123 } 2123 }
2124 2124
2125 spin_unlock(&fc->lock); 2125 spin_unlock(&fc->lock);
2126 return 0; 2126 return 0;
2127 } 2127 }
2128 2128
2129 static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, 2129 static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov,
2130 unsigned long nr_segs, loff_t *ppos, int rw) 2130 unsigned long nr_segs, loff_t *ppos, int rw)
2131 { 2131 {
2132 const struct iovec *vector = iov; 2132 const struct iovec *vector = iov;
2133 ssize_t ret = 0; 2133 ssize_t ret = 0;
2134 2134
2135 while (nr_segs > 0) { 2135 while (nr_segs > 0) {
2136 void __user *base; 2136 void __user *base;
2137 size_t len; 2137 size_t len;
2138 ssize_t nr; 2138 ssize_t nr;
2139 2139
2140 base = vector->iov_base; 2140 base = vector->iov_base;
2141 len = vector->iov_len; 2141 len = vector->iov_len;
2142 vector++; 2142 vector++;
2143 nr_segs--; 2143 nr_segs--;
2144 2144
2145 if (rw == WRITE) 2145 if (rw == WRITE)
2146 nr = __fuse_direct_write(filp, base, len, ppos); 2146 nr = __fuse_direct_write(filp, base, len, ppos);
2147 else 2147 else
2148 nr = fuse_direct_read(filp, base, len, ppos); 2148 nr = fuse_direct_read(filp, base, len, ppos);
2149 2149
2150 if (nr < 0) { 2150 if (nr < 0) {
2151 if (!ret) 2151 if (!ret)
2152 ret = nr; 2152 ret = nr;
2153 break; 2153 break;
2154 } 2154 }
2155 ret += nr; 2155 ret += nr;
2156 if (nr != len) 2156 if (nr != len)
2157 break; 2157 break;
2158 } 2158 }
2159 2159
2160 return ret; 2160 return ret;
2161 } 2161 }
2162 2162
2163 2163
2164 static ssize_t 2164 static ssize_t
2165 fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 2165 fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2166 loff_t offset, unsigned long nr_segs) 2166 loff_t offset, unsigned long nr_segs)
2167 { 2167 {
2168 ssize_t ret = 0; 2168 ssize_t ret = 0;
2169 struct file *file = NULL; 2169 struct file *file = NULL;
2170 loff_t pos = 0; 2170 loff_t pos = 0;
2171 2171
2172 file = iocb->ki_filp; 2172 file = iocb->ki_filp;
2173 pos = offset; 2173 pos = offset;
2174 2174
2175 ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); 2175 ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw);
2176 2176
2177 return ret; 2177 return ret;
2178 } 2178 }
2179 2179
2180 long fuse_file_fallocate(struct file *file, int mode, loff_t offset, 2180 long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2181 loff_t length) 2181 loff_t length)
2182 { 2182 {
2183 struct fuse_file *ff = file->private_data; 2183 struct fuse_file *ff = file->private_data;
2184 struct fuse_conn *fc = ff->fc; 2184 struct fuse_conn *fc = ff->fc;
2185 struct fuse_req *req; 2185 struct fuse_req *req;
2186 struct fuse_fallocate_in inarg = { 2186 struct fuse_fallocate_in inarg = {
2187 .fh = ff->fh, 2187 .fh = ff->fh,
2188 .offset = offset, 2188 .offset = offset,
2189 .length = length, 2189 .length = length,
2190 .mode = mode 2190 .mode = mode
2191 }; 2191 };
2192 int err; 2192 int err;
2193 2193
2194 if (fc->no_fallocate) 2194 if (fc->no_fallocate)
2195 return -EOPNOTSUPP; 2195 return -EOPNOTSUPP;
2196 2196
2197 req = fuse_get_req(fc); 2197 req = fuse_get_req(fc);
2198 if (IS_ERR(req)) 2198 if (IS_ERR(req))
2199 return PTR_ERR(req); 2199 return PTR_ERR(req);
2200 2200
2201 req->in.h.opcode = FUSE_FALLOCATE; 2201 req->in.h.opcode = FUSE_FALLOCATE;
2202 req->in.h.nodeid = ff->nodeid; 2202 req->in.h.nodeid = ff->nodeid;
2203 req->in.numargs = 1; 2203 req->in.numargs = 1;
2204 req->in.args[0].size = sizeof(inarg); 2204 req->in.args[0].size = sizeof(inarg);
2205 req->in.args[0].value = &inarg; 2205 req->in.args[0].value = &inarg;
2206 fuse_request_send(fc, req); 2206 fuse_request_send(fc, req);
2207 err = req->out.h.error; 2207 err = req->out.h.error;
2208 if (err == -ENOSYS) { 2208 if (err == -ENOSYS) {
2209 fc->no_fallocate = 1; 2209 fc->no_fallocate = 1;
2210 err = -EOPNOTSUPP; 2210 err = -EOPNOTSUPP;
2211 } 2211 }
2212 fuse_put_request(fc, req); 2212 fuse_put_request(fc, req);
2213 2213
2214 return err; 2214 return err;
2215 } 2215 }
2216 EXPORT_SYMBOL_GPL(fuse_file_fallocate); 2216 EXPORT_SYMBOL_GPL(fuse_file_fallocate);
2217 2217
2218 static const struct file_operations fuse_file_operations = { 2218 static const struct file_operations fuse_file_operations = {
2219 .llseek = fuse_file_llseek, 2219 .llseek = fuse_file_llseek,
2220 .read = do_sync_read, 2220 .read = do_sync_read,
2221 .aio_read = fuse_file_aio_read, 2221 .aio_read = fuse_file_aio_read,
2222 .write = do_sync_write, 2222 .write = do_sync_write,
2223 .aio_write = fuse_file_aio_write, 2223 .aio_write = fuse_file_aio_write,
2224 .mmap = fuse_file_mmap, 2224 .mmap = fuse_file_mmap,
2225 .open = fuse_open, 2225 .open = fuse_open,
2226 .flush = fuse_flush, 2226 .flush = fuse_flush,
2227 .release = fuse_release, 2227 .release = fuse_release,
2228 .fsync = fuse_fsync, 2228 .fsync = fuse_fsync,
2229 .lock = fuse_file_lock, 2229 .lock = fuse_file_lock,
2230 .flock = fuse_file_flock, 2230 .flock = fuse_file_flock,
2231 .splice_read = generic_file_splice_read, 2231 .splice_read = generic_file_splice_read,
2232 .unlocked_ioctl = fuse_file_ioctl, 2232 .unlocked_ioctl = fuse_file_ioctl,
2233 .compat_ioctl = fuse_file_compat_ioctl, 2233 .compat_ioctl = fuse_file_compat_ioctl,
2234 .poll = fuse_file_poll, 2234 .poll = fuse_file_poll,
2235 .fallocate = fuse_file_fallocate, 2235 .fallocate = fuse_file_fallocate,
2236 }; 2236 };
2237 2237
2238 static const struct file_operations fuse_direct_io_file_operations = { 2238 static const struct file_operations fuse_direct_io_file_operations = {
2239 .llseek = fuse_file_llseek, 2239 .llseek = fuse_file_llseek,
2240 .read = fuse_direct_read, 2240 .read = fuse_direct_read,
2241 .write = fuse_direct_write, 2241 .write = fuse_direct_write,
2242 .mmap = fuse_direct_mmap, 2242 .mmap = fuse_direct_mmap,
2243 .open = fuse_open, 2243 .open = fuse_open,
2244 .flush = fuse_flush, 2244 .flush = fuse_flush,
2245 .release = fuse_release, 2245 .release = fuse_release,
2246 .fsync = fuse_fsync, 2246 .fsync = fuse_fsync,
2247 .lock = fuse_file_lock, 2247 .lock = fuse_file_lock,
2248 .flock = fuse_file_flock, 2248 .flock = fuse_file_flock,
2249 .unlocked_ioctl = fuse_file_ioctl, 2249 .unlocked_ioctl = fuse_file_ioctl,
2250 .compat_ioctl = fuse_file_compat_ioctl, 2250 .compat_ioctl = fuse_file_compat_ioctl,
2251 .poll = fuse_file_poll, 2251 .poll = fuse_file_poll,
2252 .fallocate = fuse_file_fallocate, 2252 .fallocate = fuse_file_fallocate,
2253 /* no splice_read */ 2253 /* no splice_read */
2254 }; 2254 };
2255 2255
2256 static const struct address_space_operations fuse_file_aops = { 2256 static const struct address_space_operations fuse_file_aops = {
2257 .readpage = fuse_readpage, 2257 .readpage = fuse_readpage,
2258 .writepage = fuse_writepage, 2258 .writepage = fuse_writepage,
2259 .launder_page = fuse_launder_page, 2259 .launder_page = fuse_launder_page,
2260 .readpages = fuse_readpages, 2260 .readpages = fuse_readpages,
2261 .set_page_dirty = __set_page_dirty_nobuffers, 2261 .set_page_dirty = __set_page_dirty_nobuffers,
2262 .bmap = fuse_bmap, 2262 .bmap = fuse_bmap,
2263 .direct_IO = fuse_direct_IO, 2263 .direct_IO = fuse_direct_IO,
2264 }; 2264 };
2265 2265
2266 void fuse_init_file_inode(struct inode *inode) 2266 void fuse_init_file_inode(struct inode *inode)
2267 { 2267 {
2268 inode->i_fop = &fuse_file_operations; 2268 inode->i_fop = &fuse_file_operations;
2269 inode->i_data.a_ops = &fuse_file_aops; 2269 inode->i_data.a_ops = &fuse_file_aops;
2270 } 2270 }
2271 2271
1 /* 1 /*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/spinlock.h> 11 #include <linux/spinlock.h>
12 #include <linux/completion.h> 12 #include <linux/completion.h>
13 #include <linux/buffer_head.h> 13 #include <linux/buffer_head.h>
14 #include <linux/pagemap.h> 14 #include <linux/pagemap.h>
15 #include <linux/uio.h> 15 #include <linux/uio.h>
16 #include <linux/blkdev.h> 16 #include <linux/blkdev.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/mount.h> 18 #include <linux/mount.h>
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/gfs2_ondisk.h> 20 #include <linux/gfs2_ondisk.h>
21 #include <linux/falloc.h> 21 #include <linux/falloc.h>
22 #include <linux/swap.h> 22 #include <linux/swap.h>
23 #include <linux/crc32.h> 23 #include <linux/crc32.h>
24 #include <linux/writeback.h> 24 #include <linux/writeback.h>
25 #include <asm/uaccess.h> 25 #include <asm/uaccess.h>
26 #include <linux/dlm.h> 26 #include <linux/dlm.h>
27 #include <linux/dlm_plock.h> 27 #include <linux/dlm_plock.h>
28 28
29 #include "gfs2.h" 29 #include "gfs2.h"
30 #include "incore.h" 30 #include "incore.h"
31 #include "bmap.h" 31 #include "bmap.h"
32 #include "dir.h" 32 #include "dir.h"
33 #include "glock.h" 33 #include "glock.h"
34 #include "glops.h" 34 #include "glops.h"
35 #include "inode.h" 35 #include "inode.h"
36 #include "log.h" 36 #include "log.h"
37 #include "meta_io.h" 37 #include "meta_io.h"
38 #include "quota.h" 38 #include "quota.h"
39 #include "rgrp.h" 39 #include "rgrp.h"
40 #include "trans.h" 40 #include "trans.h"
41 #include "util.h" 41 #include "util.h"
42 42
43 /** 43 /**
44 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
45 * @file: the file 45 * @file: the file
46 * @offset: the offset 46 * @offset: the offset
47 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) 47 * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
48 * 48 *
49 * SEEK_END requires the glock for the file because it references the 49 * SEEK_END requires the glock for the file because it references the
50 * file's size. 50 * file's size.
51 * 51 *
52 * Returns: The new offset, or errno 52 * Returns: The new offset, or errno
53 */ 53 */
54 54
55 static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) 55 static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
56 { 56 {
57 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 57 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
58 struct gfs2_holder i_gh; 58 struct gfs2_holder i_gh;
59 loff_t error; 59 loff_t error;
60 60
61 switch (origin) { 61 switch (whence) {
62 case SEEK_END: /* These reference inode->i_size */ 62 case SEEK_END: /* These reference inode->i_size */
63 case SEEK_DATA: 63 case SEEK_DATA:
64 case SEEK_HOLE: 64 case SEEK_HOLE:
65 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 65 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
66 &i_gh); 66 &i_gh);
67 if (!error) { 67 if (!error) {
68 error = generic_file_llseek(file, offset, origin); 68 error = generic_file_llseek(file, offset, whence);
69 gfs2_glock_dq_uninit(&i_gh); 69 gfs2_glock_dq_uninit(&i_gh);
70 } 70 }
71 break; 71 break;
72 case SEEK_CUR: 72 case SEEK_CUR:
73 case SEEK_SET: 73 case SEEK_SET:
74 error = generic_file_llseek(file, offset, origin); 74 error = generic_file_llseek(file, offset, whence);
75 break; 75 break;
76 default: 76 default:
77 error = -EINVAL; 77 error = -EINVAL;
78 } 78 }
79 79
80 return error; 80 return error;
81 } 81 }
82 82
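SEEK_DATA and SEEK_HOLE share the branch with SEEK_END above because, as the comment says, all three reference inode->i_size, hence the shared glock. A minimal user-space sketch of probing a file's layout this way, with a hypothetical path (SEEK_HOLE/SEEK_DATA need _GNU_SOURCE on glibc):

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/gfs2/file", O_RDONLY);      /* hypothetical mount */
        if (fd < 0)
                return 1;

        off_t data = lseek(fd, 0, SEEK_DATA);   /* first data extent */
        off_t hole = lseek(fd, data < 0 ? 0 : data, SEEK_HOLE);

        printf("data at %lld, next hole at %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
}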
83 /** 83 /**
84 * gfs2_readdir - Read directory entries from a directory 84 * gfs2_readdir - Read directory entries from a directory
85 * @file: The directory to read from 85 * @file: The directory to read from
86 * @dirent: Buffer for dirents 86 * @dirent: Buffer for dirents
87 * @filldir: Function used to do the copying 87 * @filldir: Function used to do the copying
88 * 88 *
89 * Returns: errno 89 * Returns: errno
90 */ 90 */
91 91
92 static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) 92 static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
93 { 93 {
94 struct inode *dir = file->f_mapping->host; 94 struct inode *dir = file->f_mapping->host;
95 struct gfs2_inode *dip = GFS2_I(dir); 95 struct gfs2_inode *dip = GFS2_I(dir);
96 struct gfs2_holder d_gh; 96 struct gfs2_holder d_gh;
97 u64 offset = file->f_pos; 97 u64 offset = file->f_pos;
98 int error; 98 int error;
99 99
100 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 100 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
101 error = gfs2_glock_nq(&d_gh); 101 error = gfs2_glock_nq(&d_gh);
102 if (error) { 102 if (error) {
103 gfs2_holder_uninit(&d_gh); 103 gfs2_holder_uninit(&d_gh);
104 return error; 104 return error;
105 } 105 }
106 106
107 error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); 107 error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
108 108
109 gfs2_glock_dq_uninit(&d_gh); 109 gfs2_glock_dq_uninit(&d_gh);
110 110
111 file->f_pos = offset; 111 file->f_pos = offset;
112 112
113 return error; 113 return error;
114 } 114 }
115 115
116 /** 116 /**
117 * fsflags_cvt 117 * fsflags_cvt
118 * @table: A table of 32 u32 flags 118 * @table: A table of 32 u32 flags
119 * @val: a 32 bit value to convert 119 * @val: a 32 bit value to convert
120 * 120 *
121 * This function can be used to convert between fsflags values and 121 * This function can be used to convert between fsflags values and
122 * GFS2's own flags values. 122 * GFS2's own flags values.
123 * 123 *
124 * Returns: the converted flags 124 * Returns: the converted flags
125 */ 125 */
126 static u32 fsflags_cvt(const u32 *table, u32 val) 126 static u32 fsflags_cvt(const u32 *table, u32 val)
127 { 127 {
128 u32 res = 0; 128 u32 res = 0;
129 while(val) { 129 while(val) {
130 if (val & 1) 130 if (val & 1)
131 res |= *table; 131 res |= *table;
132 table++; 132 table++;
133 val >>= 1; 133 val >>= 1;
134 } 134 }
135 return res; 135 return res;
136 } 136 }
137 137
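As a worked example of the table walk: FS_SYNC_FL is bit 3 and FS_NOATIME_FL is bit 7, so feeding (FS_SYNC_FL | FS_NOATIME_FL) through fsflags_to_gfs2 ORs in table[3] and table[7], i.e. GFS2_DIF_SYNC | GFS2_DIF_NOATIME. A user-space re-statement with stand-in constants:

#include <stdio.h>

typedef unsigned int u32;

static u32 fsflags_cvt(const u32 *table, u32 val)
{
        u32 res = 0;

        while (val) {
                if (val & 1)
                        res |= *table;
                table++;
                val >>= 1;
        }
        return res;
}

int main(void)
{
        /* stand-ins for GFS2_DIF_SYNC and GFS2_DIF_NOATIME */
        u32 table[32] = { [3] = 0x100, [7] = 0x200 };

        /* bits 3 and 7 set, as in FS_SYNC_FL | FS_NOATIME_FL */
        printf("0x%x\n", fsflags_cvt(table, 0x8 | 0x80));       /* 0x300 */
        return 0;
}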
138 static const u32 fsflags_to_gfs2[32] = { 138 static const u32 fsflags_to_gfs2[32] = {
139 [3] = GFS2_DIF_SYNC, 139 [3] = GFS2_DIF_SYNC,
140 [4] = GFS2_DIF_IMMUTABLE, 140 [4] = GFS2_DIF_IMMUTABLE,
141 [5] = GFS2_DIF_APPENDONLY, 141 [5] = GFS2_DIF_APPENDONLY,
142 [7] = GFS2_DIF_NOATIME, 142 [7] = GFS2_DIF_NOATIME,
143 [12] = GFS2_DIF_EXHASH, 143 [12] = GFS2_DIF_EXHASH,
144 [14] = GFS2_DIF_INHERIT_JDATA, 144 [14] = GFS2_DIF_INHERIT_JDATA,
145 [17] = GFS2_DIF_TOPDIR, 145 [17] = GFS2_DIF_TOPDIR,
146 }; 146 };
147 147
148 static const u32 gfs2_to_fsflags[32] = { 148 static const u32 gfs2_to_fsflags[32] = {
149 [gfs2fl_Sync] = FS_SYNC_FL, 149 [gfs2fl_Sync] = FS_SYNC_FL,
150 [gfs2fl_Immutable] = FS_IMMUTABLE_FL, 150 [gfs2fl_Immutable] = FS_IMMUTABLE_FL,
151 [gfs2fl_AppendOnly] = FS_APPEND_FL, 151 [gfs2fl_AppendOnly] = FS_APPEND_FL,
152 [gfs2fl_NoAtime] = FS_NOATIME_FL, 152 [gfs2fl_NoAtime] = FS_NOATIME_FL,
153 [gfs2fl_ExHash] = FS_INDEX_FL, 153 [gfs2fl_ExHash] = FS_INDEX_FL,
154 [gfs2fl_TopLevel] = FS_TOPDIR_FL, 154 [gfs2fl_TopLevel] = FS_TOPDIR_FL,
155 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, 155 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
156 }; 156 };
157 157
158 static int gfs2_get_flags(struct file *filp, u32 __user *ptr) 158 static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
159 { 159 {
160 struct inode *inode = filp->f_path.dentry->d_inode; 160 struct inode *inode = filp->f_path.dentry->d_inode;
161 struct gfs2_inode *ip = GFS2_I(inode); 161 struct gfs2_inode *ip = GFS2_I(inode);
162 struct gfs2_holder gh; 162 struct gfs2_holder gh;
163 int error; 163 int error;
164 u32 fsflags; 164 u32 fsflags;
165 165
166 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); 166 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
167 error = gfs2_glock_nq(&gh); 167 error = gfs2_glock_nq(&gh);
168 if (error) 168 if (error)
169 return error; 169 return error;
170 170
171 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); 171 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
172 if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) 172 if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
173 fsflags |= FS_JOURNAL_DATA_FL; 173 fsflags |= FS_JOURNAL_DATA_FL;
174 if (put_user(fsflags, ptr)) 174 if (put_user(fsflags, ptr))
175 error = -EFAULT; 175 error = -EFAULT;
176 176
177 gfs2_glock_dq(&gh); 177 gfs2_glock_dq(&gh);
178 gfs2_holder_uninit(&gh); 178 gfs2_holder_uninit(&gh);
179 return error; 179 return error;
180 } 180 }
181 181
182 void gfs2_set_inode_flags(struct inode *inode) 182 void gfs2_set_inode_flags(struct inode *inode)
183 { 183 {
184 struct gfs2_inode *ip = GFS2_I(inode); 184 struct gfs2_inode *ip = GFS2_I(inode);
185 unsigned int flags = inode->i_flags; 185 unsigned int flags = inode->i_flags;
186 186
187 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); 187 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC);
188 if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) 188 if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode))
189 inode->i_flags |= S_NOSEC; 189 inode->i_flags |= S_NOSEC;
190 if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) 190 if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
191 flags |= S_IMMUTABLE; 191 flags |= S_IMMUTABLE;
192 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) 192 if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
193 flags |= S_APPEND; 193 flags |= S_APPEND;
194 if (ip->i_diskflags & GFS2_DIF_NOATIME) 194 if (ip->i_diskflags & GFS2_DIF_NOATIME)
195 flags |= S_NOATIME; 195 flags |= S_NOATIME;
196 if (ip->i_diskflags & GFS2_DIF_SYNC) 196 if (ip->i_diskflags & GFS2_DIF_SYNC)
197 flags |= S_SYNC; 197 flags |= S_SYNC;
198 inode->i_flags = flags; 198 inode->i_flags = flags;
199 } 199 }
200 200
201 /* Flags that can be set by user space */ 201 /* Flags that can be set by user space */
202 #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ 202 #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
203 GFS2_DIF_IMMUTABLE| \ 203 GFS2_DIF_IMMUTABLE| \
204 GFS2_DIF_APPENDONLY| \ 204 GFS2_DIF_APPENDONLY| \
205 GFS2_DIF_NOATIME| \ 205 GFS2_DIF_NOATIME| \
206 GFS2_DIF_SYNC| \ 206 GFS2_DIF_SYNC| \
207 GFS2_DIF_SYSTEM| \ 207 GFS2_DIF_SYSTEM| \
208 GFS2_DIF_TOPDIR| \ 208 GFS2_DIF_TOPDIR| \
209 GFS2_DIF_INHERIT_JDATA) 209 GFS2_DIF_INHERIT_JDATA)
210 210
211 /** 211 /**
 212 * do_gfs2_set_flags - set flags on an inode 212 * do_gfs2_set_flags - set flags on an inode
 213 * @filp: The file whose inode's flags to set 213 * @filp: The file whose inode's flags to set
 214 * @reqflags: The flags to set 214 * @reqflags: The flags to set
 215 * @mask: Indicates which flags are valid 215 * @mask: Indicates which flags are valid
216 * 216 *
217 */ 217 */
218 static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) 218 static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
219 { 219 {
220 struct inode *inode = filp->f_path.dentry->d_inode; 220 struct inode *inode = filp->f_path.dentry->d_inode;
221 struct gfs2_inode *ip = GFS2_I(inode); 221 struct gfs2_inode *ip = GFS2_I(inode);
222 struct gfs2_sbd *sdp = GFS2_SB(inode); 222 struct gfs2_sbd *sdp = GFS2_SB(inode);
223 struct buffer_head *bh; 223 struct buffer_head *bh;
224 struct gfs2_holder gh; 224 struct gfs2_holder gh;
225 int error; 225 int error;
226 u32 new_flags, flags; 226 u32 new_flags, flags;
227 227
228 error = mnt_want_write_file(filp); 228 error = mnt_want_write_file(filp);
229 if (error) 229 if (error)
230 return error; 230 return error;
231 231
232 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 232 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
233 if (error) 233 if (error)
234 goto out_drop_write; 234 goto out_drop_write;
235 235
236 error = -EACCES; 236 error = -EACCES;
237 if (!inode_owner_or_capable(inode)) 237 if (!inode_owner_or_capable(inode))
238 goto out; 238 goto out;
239 239
240 error = 0; 240 error = 0;
241 flags = ip->i_diskflags; 241 flags = ip->i_diskflags;
242 new_flags = (flags & ~mask) | (reqflags & mask); 242 new_flags = (flags & ~mask) | (reqflags & mask);
243 if ((new_flags ^ flags) == 0) 243 if ((new_flags ^ flags) == 0)
244 goto out; 244 goto out;
245 245
246 error = -EINVAL; 246 error = -EINVAL;
247 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) 247 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
248 goto out; 248 goto out;
249 249
250 error = -EPERM; 250 error = -EPERM;
251 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) 251 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
252 goto out; 252 goto out;
253 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) 253 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
254 goto out; 254 goto out;
255 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && 255 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
256 !capable(CAP_LINUX_IMMUTABLE)) 256 !capable(CAP_LINUX_IMMUTABLE))
257 goto out; 257 goto out;
258 if (!IS_IMMUTABLE(inode)) { 258 if (!IS_IMMUTABLE(inode)) {
259 error = gfs2_permission(inode, MAY_WRITE); 259 error = gfs2_permission(inode, MAY_WRITE);
260 if (error) 260 if (error)
261 goto out; 261 goto out;
262 } 262 }
263 if ((flags ^ new_flags) & GFS2_DIF_JDATA) { 263 if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
264 if (flags & GFS2_DIF_JDATA) 264 if (flags & GFS2_DIF_JDATA)
265 gfs2_log_flush(sdp, ip->i_gl); 265 gfs2_log_flush(sdp, ip->i_gl);
266 error = filemap_fdatawrite(inode->i_mapping); 266 error = filemap_fdatawrite(inode->i_mapping);
267 if (error) 267 if (error)
268 goto out; 268 goto out;
269 error = filemap_fdatawait(inode->i_mapping); 269 error = filemap_fdatawait(inode->i_mapping);
270 if (error) 270 if (error)
271 goto out; 271 goto out;
272 } 272 }
273 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 273 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
274 if (error) 274 if (error)
275 goto out; 275 goto out;
276 error = gfs2_meta_inode_buffer(ip, &bh); 276 error = gfs2_meta_inode_buffer(ip, &bh);
277 if (error) 277 if (error)
278 goto out_trans_end; 278 goto out_trans_end;
279 gfs2_trans_add_bh(ip->i_gl, bh, 1); 279 gfs2_trans_add_bh(ip->i_gl, bh, 1);
280 ip->i_diskflags = new_flags; 280 ip->i_diskflags = new_flags;
281 gfs2_dinode_out(ip, bh->b_data); 281 gfs2_dinode_out(ip, bh->b_data);
282 brelse(bh); 282 brelse(bh);
283 gfs2_set_inode_flags(inode); 283 gfs2_set_inode_flags(inode);
284 gfs2_set_aops(inode); 284 gfs2_set_aops(inode);
285 out_trans_end: 285 out_trans_end:
286 gfs2_trans_end(sdp); 286 gfs2_trans_end(sdp);
287 out: 287 out:
288 gfs2_glock_dq_uninit(&gh); 288 gfs2_glock_dq_uninit(&gh);
289 out_drop_write: 289 out_drop_write:
290 mnt_drop_write_file(filp); 290 mnt_drop_write_file(filp);
291 return error; 291 return error;
292 } 292 }
293 293
294 static int gfs2_set_flags(struct file *filp, u32 __user *ptr) 294 static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
295 { 295 {
296 struct inode *inode = filp->f_path.dentry->d_inode; 296 struct inode *inode = filp->f_path.dentry->d_inode;
297 u32 fsflags, gfsflags; 297 u32 fsflags, gfsflags;
298 298
299 if (get_user(fsflags, ptr)) 299 if (get_user(fsflags, ptr))
300 return -EFAULT; 300 return -EFAULT;
301 301
302 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 302 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
303 if (!S_ISDIR(inode->i_mode)) { 303 if (!S_ISDIR(inode->i_mode)) {
304 gfsflags &= ~GFS2_DIF_TOPDIR; 304 gfsflags &= ~GFS2_DIF_TOPDIR;
305 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 305 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
306 gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); 306 gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
307 return do_gfs2_set_flags(filp, gfsflags, ~0); 307 return do_gfs2_set_flags(filp, gfsflags, ~0);
308 } 308 }
309 return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); 309 return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
310 } 310 }
311 311
312 static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 312 static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
313 { 313 {
314 switch(cmd) { 314 switch(cmd) {
315 case FS_IOC_GETFLAGS: 315 case FS_IOC_GETFLAGS:
316 return gfs2_get_flags(filp, (u32 __user *)arg); 316 return gfs2_get_flags(filp, (u32 __user *)arg);
317 case FS_IOC_SETFLAGS: 317 case FS_IOC_SETFLAGS:
318 return gfs2_set_flags(filp, (u32 __user *)arg); 318 return gfs2_set_flags(filp, (u32 __user *)arg);
319 case FITRIM: 319 case FITRIM:
320 return gfs2_fitrim(filp, (void __user *)arg); 320 return gfs2_fitrim(filp, (void __user *)arg);
321 } 321 }
322 return -ENOTTY; 322 return -ENOTTY;
323 } 323 }
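
For context: FS_IOC_GETFLAGS and FS_IOC_SETFLAGS are the generic inode-flag ioctls (the interface behind lsattr/chattr), which gfs2_ioctl translates to GFS2's on-disk flags. A minimal userspace sketch, with a hypothetical caller-supplied path (not part of this commit):

/* Sketch: query and set inode flags through the same ioctls
 * gfs2_ioctl() dispatches above; the kernel transfers 32 bits. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int make_append_only(const char *path)
{
	unsigned int flags;
	int fd = open(path, O_RDONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
		flags |= FS_APPEND_FL;
		ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}
	close(fd);
	return ret;
}
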
324 324
325 /** 325 /**
326 * gfs2_size_hint - Give a hint about the size of a write request 326 * gfs2_size_hint - Give a hint about the size of a write request
327 * @filep: The struct file 327 * @filep: The struct file
328 * @offset: The file offset of the write 328 * @offset: The file offset of the write
329 * @size: The length of the write 329 * @size: The length of the write
330 * 330 *
331 * When we are about to do a write, this function records the total 331 * When we are about to do a write, this function records the total
332 * write size in order to provide a suitable hint to the lower layers 332 * write size in order to provide a suitable hint to the lower layers
333 * about how many blocks will be required. 333 * about how many blocks will be required.
334 * 334 *
335 */ 335 */
336 336
337 static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) 337 static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
338 { 338 {
339 struct inode *inode = filep->f_dentry->d_inode; 339 struct inode *inode = filep->f_dentry->d_inode;
340 struct gfs2_sbd *sdp = GFS2_SB(inode); 340 struct gfs2_sbd *sdp = GFS2_SB(inode);
341 struct gfs2_inode *ip = GFS2_I(inode); 341 struct gfs2_inode *ip = GFS2_I(inode);
342 size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; 342 size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
343 int hint = min_t(size_t, INT_MAX, blks); 343 int hint = min_t(size_t, INT_MAX, blks);
344 344
345 atomic_set(&ip->i_res->rs_sizehint, hint); 345 atomic_set(&ip->i_res->rs_sizehint, hint);
346 } 346 }
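
The hint stored here is just the write size rounded up to whole filesystem blocks and clamped to INT_MAX. A worked sketch of that rounding, assuming 4096-byte blocks:

#include <stddef.h>

/* Sketch of the blks calculation in gfs2_size_hint(): round a byte
 * count up to whole blocks via the add-then-shift idiom, e.g.
 * 10000 bytes with 4096-byte blocks (bsize_shift == 12) -> 3 blocks. */
static size_t bytes_to_blocks(size_t size, unsigned int bsize_shift)
{
	size_t bsize = (size_t)1 << bsize_shift;

	return (size + bsize - 1) >> bsize_shift;
}
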
347 347
348 /** 348 /**
349 * gfs2_allocate_page_backing - Use bmap to allocate blocks 349 * gfs2_allocate_page_backing - Use bmap to allocate blocks
350 * @page: The (locked) page to allocate backing for 350 * @page: The (locked) page to allocate backing for
351 * 351 *
352 * We try to allocate all the blocks required for the page in 352 * We try to allocate all the blocks required for the page in
353 * one go. This might fail for various reasons, so we keep 353 * one go. This might fail for various reasons, so we keep
354 * trying until all the blocks to back this page are allocated. 354 * trying until all the blocks to back this page are allocated.
355 * If some of the blocks are already allocated, that's ok too. 355 * If some of the blocks are already allocated, that's ok too.
356 */ 356 */
357 357
358 static int gfs2_allocate_page_backing(struct page *page) 358 static int gfs2_allocate_page_backing(struct page *page)
359 { 359 {
360 struct inode *inode = page->mapping->host; 360 struct inode *inode = page->mapping->host;
361 struct buffer_head bh; 361 struct buffer_head bh;
362 unsigned long size = PAGE_CACHE_SIZE; 362 unsigned long size = PAGE_CACHE_SIZE;
363 u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 363 u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
364 364
365 do { 365 do {
366 bh.b_state = 0; 366 bh.b_state = 0;
367 bh.b_size = size; 367 bh.b_size = size;
368 gfs2_block_map(inode, lblock, &bh, 1); 368 gfs2_block_map(inode, lblock, &bh, 1);
369 if (!buffer_mapped(&bh)) 369 if (!buffer_mapped(&bh))
370 return -EIO; 370 return -EIO;
371 size -= bh.b_size; 371 size -= bh.b_size;
372 lblock += (bh.b_size >> inode->i_blkbits); 372 lblock += (bh.b_size >> inode->i_blkbits);
373 } while(size > 0); 373 } while(size > 0);
374 return 0; 374 return 0;
375 } 375 }
376 376
377 /** 377 /**
378 * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable 378 * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
379 * @vma: The virtual memory area 379 * @vma: The virtual memory area
380 * @page: The page which is about to become writable 380 * @page: The page which is about to become writable
381 * 381 *
382 * When the page becomes writable, we need to ensure that we have 382 * When the page becomes writable, we need to ensure that we have
383 * blocks allocated on disk to back that page. 383 * blocks allocated on disk to back that page.
384 */ 384 */
385 385
386 static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 386 static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
387 { 387 {
388 struct page *page = vmf->page; 388 struct page *page = vmf->page;
389 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 389 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
390 struct gfs2_inode *ip = GFS2_I(inode); 390 struct gfs2_inode *ip = GFS2_I(inode);
391 struct gfs2_sbd *sdp = GFS2_SB(inode); 391 struct gfs2_sbd *sdp = GFS2_SB(inode);
392 unsigned long last_index; 392 unsigned long last_index;
393 u64 pos = page->index << PAGE_CACHE_SHIFT; 393 u64 pos = page->index << PAGE_CACHE_SHIFT;
394 unsigned int data_blocks, ind_blocks, rblocks; 394 unsigned int data_blocks, ind_blocks, rblocks;
395 struct gfs2_holder gh; 395 struct gfs2_holder gh;
396 loff_t size; 396 loff_t size;
397 int ret; 397 int ret;
398 398
399 sb_start_pagefault(inode->i_sb); 399 sb_start_pagefault(inode->i_sb);
400 400
401 /* Update file times before taking page lock */ 401 /* Update file times before taking page lock */
402 file_update_time(vma->vm_file); 402 file_update_time(vma->vm_file);
403 403
404 ret = gfs2_rs_alloc(ip); 404 ret = gfs2_rs_alloc(ip);
405 if (ret) 405 if (ret)
406 return ret; 406 return ret;
407 407
408 gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); 408 gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
409 409
410 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 410 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
411 ret = gfs2_glock_nq(&gh); 411 ret = gfs2_glock_nq(&gh);
412 if (ret) 412 if (ret)
413 goto out; 413 goto out;
414 414
415 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); 415 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
416 set_bit(GIF_SW_PAGED, &ip->i_flags); 416 set_bit(GIF_SW_PAGED, &ip->i_flags);
417 417
418 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { 418 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
419 lock_page(page); 419 lock_page(page);
420 if (!PageUptodate(page) || page->mapping != inode->i_mapping) { 420 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
421 ret = -EAGAIN; 421 ret = -EAGAIN;
422 unlock_page(page); 422 unlock_page(page);
423 } 423 }
424 goto out_unlock; 424 goto out_unlock;
425 } 425 }
426 426
427 ret = gfs2_rindex_update(sdp); 427 ret = gfs2_rindex_update(sdp);
428 if (ret) 428 if (ret)
429 goto out_unlock; 429 goto out_unlock;
430 430
431 ret = gfs2_quota_lock_check(ip); 431 ret = gfs2_quota_lock_check(ip);
432 if (ret) 432 if (ret)
433 goto out_unlock; 433 goto out_unlock;
434 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); 434 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
435 ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); 435 ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
436 if (ret) 436 if (ret)
437 goto out_quota_unlock; 437 goto out_quota_unlock;
438 438
439 rblocks = RES_DINODE + ind_blocks; 439 rblocks = RES_DINODE + ind_blocks;
440 if (gfs2_is_jdata(ip)) 440 if (gfs2_is_jdata(ip))
441 rblocks += data_blocks ? data_blocks : 1; 441 rblocks += data_blocks ? data_blocks : 1;
442 if (ind_blocks || data_blocks) { 442 if (ind_blocks || data_blocks) {
443 rblocks += RES_STATFS + RES_QUOTA; 443 rblocks += RES_STATFS + RES_QUOTA;
444 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); 444 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
445 } 445 }
446 ret = gfs2_trans_begin(sdp, rblocks, 0); 446 ret = gfs2_trans_begin(sdp, rblocks, 0);
447 if (ret) 447 if (ret)
448 goto out_trans_fail; 448 goto out_trans_fail;
449 449
450 lock_page(page); 450 lock_page(page);
451 ret = -EINVAL; 451 ret = -EINVAL;
452 size = i_size_read(inode); 452 size = i_size_read(inode);
453 last_index = (size - 1) >> PAGE_CACHE_SHIFT; 453 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
454 /* Check page index against inode size */ 454 /* Check page index against inode size */
455 if (size == 0 || (page->index > last_index)) 455 if (size == 0 || (page->index > last_index))
456 goto out_trans_end; 456 goto out_trans_end;
457 457
458 ret = -EAGAIN; 458 ret = -EAGAIN;
459 /* If truncated, we must retry the operation; we may have raced 459 /* If truncated, we must retry the operation; we may have raced
460 * with the glock demotion code. 460 * with the glock demotion code.
461 */ 461 */
462 if (!PageUptodate(page) || page->mapping != inode->i_mapping) 462 if (!PageUptodate(page) || page->mapping != inode->i_mapping)
463 goto out_trans_end; 463 goto out_trans_end;
464 464
465 /* Unstuff, if required, and allocate backing blocks for page */ 465 /* Unstuff, if required, and allocate backing blocks for page */
466 ret = 0; 466 ret = 0;
467 if (gfs2_is_stuffed(ip)) 467 if (gfs2_is_stuffed(ip))
468 ret = gfs2_unstuff_dinode(ip, page); 468 ret = gfs2_unstuff_dinode(ip, page);
469 if (ret == 0) 469 if (ret == 0)
470 ret = gfs2_allocate_page_backing(page); 470 ret = gfs2_allocate_page_backing(page);
471 471
472 out_trans_end: 472 out_trans_end:
473 if (ret) 473 if (ret)
474 unlock_page(page); 474 unlock_page(page);
475 gfs2_trans_end(sdp); 475 gfs2_trans_end(sdp);
476 out_trans_fail: 476 out_trans_fail:
477 gfs2_inplace_release(ip); 477 gfs2_inplace_release(ip);
478 out_quota_unlock: 478 out_quota_unlock:
479 gfs2_quota_unlock(ip); 479 gfs2_quota_unlock(ip);
480 out_unlock: 480 out_unlock:
481 gfs2_glock_dq(&gh); 481 gfs2_glock_dq(&gh);
482 out: 482 out:
483 gfs2_holder_uninit(&gh); 483 gfs2_holder_uninit(&gh);
484 if (ret == 0) { 484 if (ret == 0) {
485 set_page_dirty(page); 485 set_page_dirty(page);
486 wait_on_page_writeback(page); 486 wait_on_page_writeback(page);
487 } 487 }
488 sb_end_pagefault(inode->i_sb); 488 sb_end_pagefault(inode->i_sb);
489 return block_page_mkwrite_return(ret); 489 return block_page_mkwrite_return(ret);
490 } 490 }
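
Everything above runs in the write-fault path; the trigger from userspace is simply the first store into a shared mapping. A hedged sketch (hypothetical path, file assumed to be at least one page long):

/* Sketch: the store below faults the page writable, which is what
 * invokes ->page_mkwrite(); msync() then writes the dirty page back. */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int dirty_first_page(const char *path)
{
	int fd = open(path, O_RDWR);
	char *p;

	if (fd < 0)
		return -1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);			/* the mapping keeps the file alive */
	if (p == MAP_FAILED)
		return -1;
	memcpy(p, "hello", 5);		/* write fault -> gfs2_page_mkwrite */
	return msync(p, 4096, MS_SYNC);
}
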
491 491
492 static const struct vm_operations_struct gfs2_vm_ops = { 492 static const struct vm_operations_struct gfs2_vm_ops = {
493 .fault = filemap_fault, 493 .fault = filemap_fault,
494 .page_mkwrite = gfs2_page_mkwrite, 494 .page_mkwrite = gfs2_page_mkwrite,
495 .remap_pages = generic_file_remap_pages, 495 .remap_pages = generic_file_remap_pages,
496 }; 496 };
497 497
498 /** 498 /**
499 * gfs2_mmap - set up a memory mapping for a file 499 * gfs2_mmap - set up a memory mapping for a file
500 * @file: The file to map 500 * @file: The file to map
501 * @vma: The VMA which describes the mapping 501 * @vma: The VMA which describes the mapping
502 * 502 *
503 * There is no need to get a lock here unless we should be updating 503 * There is no need to get a lock here unless we should be updating
504 * atime. We ignore any locking errors since the only consequence is 504 * atime. We ignore any locking errors since the only consequence is
505 * a missed atime update (which will just be deferred until later). 505 * a missed atime update (which will just be deferred until later).
506 * 506 *
507 * Returns: 0 507 * Returns: 0
508 */ 508 */
509 509
510 static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) 510 static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
511 { 511 {
512 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 512 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
513 513
514 if (!(file->f_flags & O_NOATIME) && 514 if (!(file->f_flags & O_NOATIME) &&
515 !IS_NOATIME(&ip->i_inode)) { 515 !IS_NOATIME(&ip->i_inode)) {
516 struct gfs2_holder i_gh; 516 struct gfs2_holder i_gh;
517 int error; 517 int error;
518 518
519 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 519 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
520 &i_gh); 520 &i_gh);
521 if (error) 521 if (error)
522 return error; 522 return error;
523 /* grab lock to update inode */ 523 /* grab lock to update inode */
524 gfs2_glock_dq_uninit(&i_gh); 524 gfs2_glock_dq_uninit(&i_gh);
525 file_accessed(file); 525 file_accessed(file);
526 } 526 }
527 vma->vm_ops = &gfs2_vm_ops; 527 vma->vm_ops = &gfs2_vm_ops;
528 528
529 return 0; 529 return 0;
530 } 530 }
531 531
532 /** 532 /**
533 * gfs2_open - open a file 533 * gfs2_open - open a file
534 * @inode: the inode to open 534 * @inode: the inode to open
535 * @file: the struct file for this opening 535 * @file: the struct file for this opening
536 * 536 *
537 * Returns: errno 537 * Returns: errno
538 */ 538 */
539 539
540 static int gfs2_open(struct inode *inode, struct file *file) 540 static int gfs2_open(struct inode *inode, struct file *file)
541 { 541 {
542 struct gfs2_inode *ip = GFS2_I(inode); 542 struct gfs2_inode *ip = GFS2_I(inode);
543 struct gfs2_holder i_gh; 543 struct gfs2_holder i_gh;
544 struct gfs2_file *fp; 544 struct gfs2_file *fp;
545 int error; 545 int error;
546 546
547 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); 547 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
548 if (!fp) 548 if (!fp)
549 return -ENOMEM; 549 return -ENOMEM;
550 550
551 mutex_init(&fp->f_fl_mutex); 551 mutex_init(&fp->f_fl_mutex);
552 552
553 gfs2_assert_warn(GFS2_SB(inode), !file->private_data); 553 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
554 file->private_data = fp; 554 file->private_data = fp;
555 555
556 if (S_ISREG(ip->i_inode.i_mode)) { 556 if (S_ISREG(ip->i_inode.i_mode)) {
557 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 557 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
558 &i_gh); 558 &i_gh);
559 if (error) 559 if (error)
560 goto fail; 560 goto fail;
561 561
562 if (!(file->f_flags & O_LARGEFILE) && 562 if (!(file->f_flags & O_LARGEFILE) &&
563 i_size_read(inode) > MAX_NON_LFS) { 563 i_size_read(inode) > MAX_NON_LFS) {
564 error = -EOVERFLOW; 564 error = -EOVERFLOW;
565 goto fail_gunlock; 565 goto fail_gunlock;
566 } 566 }
567 567
568 gfs2_glock_dq_uninit(&i_gh); 568 gfs2_glock_dq_uninit(&i_gh);
569 } 569 }
570 570
571 return 0; 571 return 0;
572 572
573 fail_gunlock: 573 fail_gunlock:
574 gfs2_glock_dq_uninit(&i_gh); 574 gfs2_glock_dq_uninit(&i_gh);
575 fail: 575 fail:
576 file->private_data = NULL; 576 file->private_data = NULL;
577 kfree(fp); 577 kfree(fp);
578 return error; 578 return error;
579 } 579 }
580 580
581 /** 581 /**
582 * gfs2_release - called to close a struct file 582 * gfs2_release - called to close a struct file
583 * @inode: the inode the struct file belongs to 583 * @inode: the inode the struct file belongs to
584 * @file: the struct file being closed 584 * @file: the struct file being closed
585 * 585 *
586 * Returns: errno 586 * Returns: errno
587 */ 587 */
588 588
589 static int gfs2_release(struct inode *inode, struct file *file) 589 static int gfs2_release(struct inode *inode, struct file *file)
590 { 590 {
591 struct gfs2_inode *ip = GFS2_I(inode); 591 struct gfs2_inode *ip = GFS2_I(inode);
592 592
593 kfree(file->private_data); 593 kfree(file->private_data);
594 file->private_data = NULL; 594 file->private_data = NULL;
595 595
596 if ((file->f_mode & FMODE_WRITE) && 596 if ((file->f_mode & FMODE_WRITE) &&
597 (atomic_read(&inode->i_writecount) == 1)) 597 (atomic_read(&inode->i_writecount) == 1))
598 gfs2_rs_delete(ip); 598 gfs2_rs_delete(ip);
599 599
600 return 0; 600 return 0;
601 } 601 }
602 602
603 /** 603 /**
604 * gfs2_fsync - sync the dirty data for a file (across the cluster) 604 * gfs2_fsync - sync the dirty data for a file (across the cluster)
605 * @file: the file that points to the dentry 605 * @file: the file that points to the dentry
606 * @start: the start position in the file to sync 606 * @start: the start position in the file to sync
607 * @end: the end position in the file to sync 607 * @end: the end position in the file to sync
608 * @datasync: set if we can ignore timestamp changes 608 * @datasync: set if we can ignore timestamp changes
609 * 609 *
610 * We split the data flushing here so that we don't wait for the data 610 * We split the data flushing here so that we don't wait for the data
611 * until after we've also sent the metadata to disk. Note that for 611 * until after we've also sent the metadata to disk. Note that for
612 * data=ordered, we will write & wait for the data at the log flush 612 * data=ordered, we will write & wait for the data at the log flush
613 * stage anyway, so this is unlikely to make much of a difference 613 * stage anyway, so this is unlikely to make much of a difference
614 * except in the data=writeback case. 614 * except in the data=writeback case.
615 * 615 *
616 * If the fdatawrite fails due to any reason except -EIO, we will 616 * If the fdatawrite fails due to any reason except -EIO, we will
617 * continue the remainder of the fsync, although we'll still report 617 * continue the remainder of the fsync, although we'll still report
618 * the error at the end. This is to match filemap_write_and_wait_range() 618 * the error at the end. This is to match filemap_write_and_wait_range()
619 * behaviour. 619 * behaviour.
620 * 620 *
621 * Returns: errno 621 * Returns: errno
622 */ 622 */
623 623
624 static int gfs2_fsync(struct file *file, loff_t start, loff_t end, 624 static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
625 int datasync) 625 int datasync)
626 { 626 {
627 struct address_space *mapping = file->f_mapping; 627 struct address_space *mapping = file->f_mapping;
628 struct inode *inode = mapping->host; 628 struct inode *inode = mapping->host;
629 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 629 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
630 struct gfs2_inode *ip = GFS2_I(inode); 630 struct gfs2_inode *ip = GFS2_I(inode);
631 int ret = 0, ret1 = 0; 631 int ret = 0, ret1 = 0;
632 632
633 if (mapping->nrpages) { 633 if (mapping->nrpages) {
634 ret1 = filemap_fdatawrite_range(mapping, start, end); 634 ret1 = filemap_fdatawrite_range(mapping, start, end);
635 if (ret1 == -EIO) 635 if (ret1 == -EIO)
636 return ret1; 636 return ret1;
637 } 637 }
638 638
639 if (datasync) 639 if (datasync)
640 sync_state &= ~I_DIRTY_SYNC; 640 sync_state &= ~I_DIRTY_SYNC;
641 641
642 if (sync_state) { 642 if (sync_state) {
643 ret = sync_inode_metadata(inode, 1); 643 ret = sync_inode_metadata(inode, 1);
644 if (ret) 644 if (ret)
645 return ret; 645 return ret;
646 if (gfs2_is_jdata(ip)) 646 if (gfs2_is_jdata(ip))
647 filemap_write_and_wait(mapping); 647 filemap_write_and_wait(mapping);
648 gfs2_ail_flush(ip->i_gl, 1); 648 gfs2_ail_flush(ip->i_gl, 1);
649 } 649 }
650 650
651 if (mapping->nrpages) 651 if (mapping->nrpages)
652 ret = filemap_fdatawait_range(mapping, start, end); 652 ret = filemap_fdatawait_range(mapping, start, end);
653 653
654 return ret ? ret : ret1; 654 return ret ? ret : ret1;
655 } 655 }
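
The datasync flag corresponds to userspace fdatasync(): when only timestamps are dirty (I_DIRTY_SYNC), the metadata sync step can be skipped. A small sketch of the two entry points:

/* Sketch: fsync() reaches gfs2_fsync() with datasync == 0,
 * fdatasync() with datasync == 1. */
#include <unistd.h>

int flush_file(int fd, int data_only)
{
	return data_only ? fdatasync(fd) : fsync(fd);
}
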
656 656
657 /** 657 /**
658 * gfs2_file_aio_write - Perform a write to a file 658 * gfs2_file_aio_write - Perform a write to a file
659 * @iocb: The io context 659 * @iocb: The io context
660 * @iov: The data to write 660 * @iov: The data to write
661 * @nr_segs: Number of @iov segments 661 * @nr_segs: Number of @iov segments
662 * @pos: The file position 662 * @pos: The file position
663 * 663 *
664 * We have to do a lock/unlock here to refresh the inode size for 664 * We have to do a lock/unlock here to refresh the inode size for
665 * O_APPEND writes, otherwise we can land up writing at the wrong 665 * O_APPEND writes, otherwise we can land up writing at the wrong
666 * offset. There is still a race, but provided the app is using its 666 * offset. There is still a race, but provided the app is using its
667 * own file locking, this will make O_APPEND work as expected. 667 * own file locking, this will make O_APPEND work as expected.
668 * 668 *
669 */ 669 */
670 670
671 static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 671 static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
672 unsigned long nr_segs, loff_t pos) 672 unsigned long nr_segs, loff_t pos)
673 { 673 {
674 struct file *file = iocb->ki_filp; 674 struct file *file = iocb->ki_filp;
675 size_t writesize = iov_length(iov, nr_segs); 675 size_t writesize = iov_length(iov, nr_segs);
676 struct dentry *dentry = file->f_dentry; 676 struct dentry *dentry = file->f_dentry;
677 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 677 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
678 int ret; 678 int ret;
679 679
680 ret = gfs2_rs_alloc(ip); 680 ret = gfs2_rs_alloc(ip);
681 if (ret) 681 if (ret)
682 return ret; 682 return ret;
683 683
684 gfs2_size_hint(file, pos, writesize); 684 gfs2_size_hint(file, pos, writesize);
685 685
686 if (file->f_flags & O_APPEND) { 686 if (file->f_flags & O_APPEND) {
687 struct gfs2_holder gh; 687 struct gfs2_holder gh;
688 688
689 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); 689 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
690 if (ret) 690 if (ret)
691 return ret; 691 return ret;
692 gfs2_glock_dq_uninit(&gh); 692 gfs2_glock_dq_uninit(&gh);
693 } 693 }
694 694
695 return generic_file_aio_write(iocb, iov, nr_segs, pos); 695 return generic_file_aio_write(iocb, iov, nr_segs, pos);
696 } 696 }
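
The shared glock cycle above exists only to refresh i_size for O_APPEND; the userspace contract being preserved is the usual one, sketched below:

/* Sketch: with O_APPEND every write starts at the current end of
 * file, even if another cluster node grew the file in between. */
#include <fcntl.h>
#include <unistd.h>

ssize_t append_record(const char *path, const void *buf, size_t len)
{
	int fd = open(path, O_WRONLY | O_APPEND);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, buf, len);	/* offset taken from i_size */
	close(fd);
	return n;
}
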
697 697
698 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, 698 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
699 int mode) 699 int mode)
700 { 700 {
701 struct gfs2_inode *ip = GFS2_I(inode); 701 struct gfs2_inode *ip = GFS2_I(inode);
702 struct buffer_head *dibh; 702 struct buffer_head *dibh;
703 int error; 703 int error;
704 loff_t size = len; 704 loff_t size = len;
705 unsigned int nr_blks; 705 unsigned int nr_blks;
706 sector_t lblock = offset >> inode->i_blkbits; 706 sector_t lblock = offset >> inode->i_blkbits;
707 707
708 error = gfs2_meta_inode_buffer(ip, &dibh); 708 error = gfs2_meta_inode_buffer(ip, &dibh);
709 if (unlikely(error)) 709 if (unlikely(error))
710 return error; 710 return error;
711 711
712 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 712 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
713 713
714 if (gfs2_is_stuffed(ip)) { 714 if (gfs2_is_stuffed(ip)) {
715 error = gfs2_unstuff_dinode(ip, NULL); 715 error = gfs2_unstuff_dinode(ip, NULL);
716 if (unlikely(error)) 716 if (unlikely(error))
717 goto out; 717 goto out;
718 } 718 }
719 719
720 while (len) { 720 while (len) {
721 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; 721 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
722 bh_map.b_size = len; 722 bh_map.b_size = len;
723 set_buffer_zeronew(&bh_map); 723 set_buffer_zeronew(&bh_map);
724 724
725 error = gfs2_block_map(inode, lblock, &bh_map, 1); 725 error = gfs2_block_map(inode, lblock, &bh_map, 1);
726 if (unlikely(error)) 726 if (unlikely(error))
727 goto out; 727 goto out;
728 len -= bh_map.b_size; 728 len -= bh_map.b_size;
729 nr_blks = bh_map.b_size >> inode->i_blkbits; 729 nr_blks = bh_map.b_size >> inode->i_blkbits;
730 lblock += nr_blks; 730 lblock += nr_blks;
731 if (!buffer_new(&bh_map)) 731 if (!buffer_new(&bh_map))
732 continue; 732 continue;
733 if (unlikely(!buffer_zeronew(&bh_map))) { 733 if (unlikely(!buffer_zeronew(&bh_map))) {
734 error = -EIO; 734 error = -EIO;
735 goto out; 735 goto out;
736 } 736 }
737 } 737 }
738 if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) 738 if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
739 i_size_write(inode, offset + size); 739 i_size_write(inode, offset + size);
740 740
741 mark_inode_dirty(inode); 741 mark_inode_dirty(inode);
742 742
743 out: 743 out:
744 brelse(dibh); 744 brelse(dibh);
745 return error; 745 return error;
746 } 746 }
747 747
748 static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, 748 static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
749 unsigned int *data_blocks, unsigned int *ind_blocks) 749 unsigned int *data_blocks, unsigned int *ind_blocks)
750 { 750 {
751 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 751 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
752 unsigned int max_blocks = ip->i_rgd->rd_free_clone; 752 unsigned int max_blocks = ip->i_rgd->rd_free_clone;
753 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); 753 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
754 754
755 for (tmp = max_data; tmp > sdp->sd_diptrs;) { 755 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
756 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); 756 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
757 max_data -= tmp; 757 max_data -= tmp;
758 } 758 }
759 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv, 759 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
760 so it might end up with fewer data blocks */ 760 so it might end up with fewer data blocks */
761 if (max_data <= *data_blocks) 761 if (max_data <= *data_blocks)
762 return; 762 return;
763 *data_blocks = max_data; 763 *data_blocks = max_data;
764 *ind_blocks = max_blocks - max_data; 764 *ind_blocks = max_blocks - max_data;
765 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; 765 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
766 if (*len > max) { 766 if (*len > max) {
767 *len = max; 767 *len = max;
768 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); 768 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
769 } 769 }
770 } 770 }
771 771
772 static long gfs2_fallocate(struct file *file, int mode, loff_t offset, 772 static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
773 loff_t len) 773 loff_t len)
774 { 774 {
775 struct inode *inode = file->f_path.dentry->d_inode; 775 struct inode *inode = file->f_path.dentry->d_inode;
776 struct gfs2_sbd *sdp = GFS2_SB(inode); 776 struct gfs2_sbd *sdp = GFS2_SB(inode);
777 struct gfs2_inode *ip = GFS2_I(inode); 777 struct gfs2_inode *ip = GFS2_I(inode);
778 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 778 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
779 loff_t bytes, max_bytes; 779 loff_t bytes, max_bytes;
780 int error; 780 int error;
781 const loff_t pos = offset; 781 const loff_t pos = offset;
782 const loff_t count = len; 782 const loff_t count = len;
783 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); 783 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
784 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; 784 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
785 loff_t max_chunk_size = UINT_MAX & bsize_mask; 785 loff_t max_chunk_size = UINT_MAX & bsize_mask;
786 next = (next + 1) << sdp->sd_sb.sb_bsize_shift; 786 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
787 787
788 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 788 /* We only support the FALLOC_FL_KEEP_SIZE mode */
789 if (mode & ~FALLOC_FL_KEEP_SIZE) 789 if (mode & ~FALLOC_FL_KEEP_SIZE)
790 return -EOPNOTSUPP; 790 return -EOPNOTSUPP;
791 791
792 offset &= bsize_mask; 792 offset &= bsize_mask;
793 793
794 len = next - offset; 794 len = next - offset;
795 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; 795 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
796 if (!bytes) 796 if (!bytes)
797 bytes = UINT_MAX; 797 bytes = UINT_MAX;
798 bytes &= bsize_mask; 798 bytes &= bsize_mask;
799 if (bytes == 0) 799 if (bytes == 0)
800 bytes = sdp->sd_sb.sb_bsize; 800 bytes = sdp->sd_sb.sb_bsize;
801 801
802 error = gfs2_rs_alloc(ip); 802 error = gfs2_rs_alloc(ip);
803 if (error) 803 if (error)
804 return error; 804 return error;
805 805
806 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 806 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
807 error = gfs2_glock_nq(&ip->i_gh); 807 error = gfs2_glock_nq(&ip->i_gh);
808 if (unlikely(error)) 808 if (unlikely(error))
809 goto out_uninit; 809 goto out_uninit;
810 810
811 gfs2_size_hint(file, offset, len); 811 gfs2_size_hint(file, offset, len);
812 812
813 while (len > 0) { 813 while (len > 0) {
814 if (len < bytes) 814 if (len < bytes)
815 bytes = len; 815 bytes = len;
816 if (!gfs2_write_alloc_required(ip, offset, bytes)) { 816 if (!gfs2_write_alloc_required(ip, offset, bytes)) {
817 len -= bytes; 817 len -= bytes;
818 offset += bytes; 818 offset += bytes;
819 continue; 819 continue;
820 } 820 }
821 error = gfs2_quota_lock_check(ip); 821 error = gfs2_quota_lock_check(ip);
822 if (error) 822 if (error)
823 goto out_unlock; 823 goto out_unlock;
824 824
825 retry: 825 retry:
826 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); 826 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
827 827
828 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); 828 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
829 if (error) { 829 if (error) {
830 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { 830 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
831 bytes >>= 1; 831 bytes >>= 1;
832 bytes &= bsize_mask; 832 bytes &= bsize_mask;
833 if (bytes == 0) 833 if (bytes == 0)
834 bytes = sdp->sd_sb.sb_bsize; 834 bytes = sdp->sd_sb.sb_bsize;
835 goto retry; 835 goto retry;
836 } 836 }
837 goto out_qunlock; 837 goto out_qunlock;
838 } 838 }
839 max_bytes = bytes; 839 max_bytes = bytes;
840 calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, 840 calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
841 &max_bytes, &data_blocks, &ind_blocks); 841 &max_bytes, &data_blocks, &ind_blocks);
842 842
843 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + 843 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
844 RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); 844 RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks);
845 if (gfs2_is_jdata(ip)) 845 if (gfs2_is_jdata(ip))
846 rblocks += data_blocks ? data_blocks : 1; 846 rblocks += data_blocks ? data_blocks : 1;
847 847
848 error = gfs2_trans_begin(sdp, rblocks, 848 error = gfs2_trans_begin(sdp, rblocks,
849 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 849 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
850 if (error) 850 if (error)
851 goto out_trans_fail; 851 goto out_trans_fail;
852 852
853 error = fallocate_chunk(inode, offset, max_bytes, mode); 853 error = fallocate_chunk(inode, offset, max_bytes, mode);
854 gfs2_trans_end(sdp); 854 gfs2_trans_end(sdp);
855 855
856 if (error) 856 if (error)
857 goto out_trans_fail; 857 goto out_trans_fail;
858 858
859 len -= max_bytes; 859 len -= max_bytes;
860 offset += max_bytes; 860 offset += max_bytes;
861 gfs2_inplace_release(ip); 861 gfs2_inplace_release(ip);
862 gfs2_quota_unlock(ip); 862 gfs2_quota_unlock(ip);
863 } 863 }
864 864
865 if (error == 0) 865 if (error == 0)
866 error = generic_write_sync(file, pos, count); 866 error = generic_write_sync(file, pos, count);
867 goto out_unlock; 867 goto out_unlock;
868 868
869 out_trans_fail: 869 out_trans_fail:
870 gfs2_inplace_release(ip); 870 gfs2_inplace_release(ip);
871 out_qunlock: 871 out_qunlock:
872 gfs2_quota_unlock(ip); 872 gfs2_quota_unlock(ip);
873 out_unlock: 873 out_unlock:
874 gfs2_glock_dq(&ip->i_gh); 874 gfs2_glock_dq(&ip->i_gh);
875 out_uninit: 875 out_uninit:
876 gfs2_holder_uninit(&ip->i_gh); 876 gfs2_holder_uninit(&ip->i_gh);
877 return error; 877 return error;
878 } 878 }
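
Since only FALLOC_FL_KEEP_SIZE is accepted (any other mode bit returns -EOPNOTSUPP), the matching userspace call is limited to preallocation that does not move i_size. A sketch:

/* Sketch: preallocate len bytes at offset without changing the file
 * size; gfs2_fallocate() above rejects all other mode bits. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

int preallocate(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_KEEP_SIZE, offset, len);
}
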
879 879
880 #ifdef CONFIG_GFS2_FS_LOCKING_DLM 880 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
881 881
882 /** 882 /**
883 * gfs2_setlease - acquire/release a file lease 883 * gfs2_setlease - acquire/release a file lease
884 * @file: the file pointer 884 * @file: the file pointer
885 * @arg: lease type 885 * @arg: lease type
886 * @fl: file lock 886 * @fl: file lock
887 * 887 *
888 * We don't currently have a way to enforce a lease across the whole 888 * We don't currently have a way to enforce a lease across the whole
889 * cluster; until we do, disable leases (by just returning -EINVAL), 889 * cluster; until we do, disable leases (by just returning -EINVAL),
890 * unless the administrator has requested purely local locking. 890 * unless the administrator has requested purely local locking.
891 * 891 *
892 * Locking: called under lock_flocks 892 * Locking: called under lock_flocks
893 * 893 *
894 * Returns: errno 894 * Returns: errno
895 */ 895 */
896 896
897 static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) 897 static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
898 { 898 {
899 return -EINVAL; 899 return -EINVAL;
900 } 900 }
901 901
902 /** 902 /**
903 * gfs2_lock - acquire/release a posix lock on a file 903 * gfs2_lock - acquire/release a posix lock on a file
904 * @file: the file pointer 904 * @file: the file pointer
905 * @cmd: either modify or retrieve lock state, possibly wait 905 * @cmd: either modify or retrieve lock state, possibly wait
906 * @fl: type and range of lock 906 * @fl: type and range of lock
907 * 907 *
908 * Returns: errno 908 * Returns: errno
909 */ 909 */
910 910
911 static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) 911 static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
912 { 912 {
913 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 913 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
914 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); 914 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
915 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 915 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
916 916
917 if (!(fl->fl_flags & FL_POSIX)) 917 if (!(fl->fl_flags & FL_POSIX))
918 return -ENOLCK; 918 return -ENOLCK;
919 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) 919 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
920 return -ENOLCK; 920 return -ENOLCK;
921 921
922 if (cmd == F_CANCELLK) { 922 if (cmd == F_CANCELLK) {
923 /* Hack: */ 923 /* Hack: */
924 cmd = F_SETLK; 924 cmd = F_SETLK;
925 fl->fl_type = F_UNLCK; 925 fl->fl_type = F_UNLCK;
926 } 926 }
927 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 927 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
928 return -EIO; 928 return -EIO;
929 if (IS_GETLK(cmd)) 929 if (IS_GETLK(cmd))
930 return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); 930 return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
931 else if (fl->fl_type == F_UNLCK) 931 else if (fl->fl_type == F_UNLCK)
932 return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); 932 return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
933 else 933 else
934 return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); 934 return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
935 } 935 }
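
These are ordinary fcntl() byte-range locks, except that gfs2 forwards them to the DLM so they apply cluster-wide. A sketch of a blocking whole-file write lock (note that struct flock spells its argument l_whence, the same word this commit restores in the lseek paths):

/* Sketch: a blocking, whole-file POSIX write lock; on gfs2 this is
 * ultimately serviced by dlm_posix_lock() above. */
#include <fcntl.h>

int lock_whole_file(int fd)
{
	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,	/* interpret l_start from offset 0 */
		.l_start  = 0,
		.l_len    = 0,		/* 0 means "through EOF" */
	};

	return fcntl(fd, F_SETLKW, &fl);	/* W = wait (blocking) */
}
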
936 936
937 static int do_flock(struct file *file, int cmd, struct file_lock *fl) 937 static int do_flock(struct file *file, int cmd, struct file_lock *fl)
938 { 938 {
939 struct gfs2_file *fp = file->private_data; 939 struct gfs2_file *fp = file->private_data;
940 struct gfs2_holder *fl_gh = &fp->f_fl_gh; 940 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
941 struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); 941 struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode);
942 struct gfs2_glock *gl; 942 struct gfs2_glock *gl;
943 unsigned int state; 943 unsigned int state;
944 int flags; 944 int flags;
945 int error = 0; 945 int error = 0;
946 946
947 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 947 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
948 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; 948 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
949 949
950 mutex_lock(&fp->f_fl_mutex); 950 mutex_lock(&fp->f_fl_mutex);
951 951
952 gl = fl_gh->gh_gl; 952 gl = fl_gh->gh_gl;
953 if (gl) { 953 if (gl) {
954 if (fl_gh->gh_state == state) 954 if (fl_gh->gh_state == state)
955 goto out; 955 goto out;
956 flock_lock_file_wait(file, 956 flock_lock_file_wait(file,
957 &(struct file_lock){.fl_type = F_UNLCK}); 957 &(struct file_lock){.fl_type = F_UNLCK});
958 gfs2_glock_dq_wait(fl_gh); 958 gfs2_glock_dq_wait(fl_gh);
959 gfs2_holder_reinit(state, flags, fl_gh); 959 gfs2_holder_reinit(state, flags, fl_gh);
960 } else { 960 } else {
961 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, 961 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
962 &gfs2_flock_glops, CREATE, &gl); 962 &gfs2_flock_glops, CREATE, &gl);
963 if (error) 963 if (error)
964 goto out; 964 goto out;
965 gfs2_holder_init(gl, state, flags, fl_gh); 965 gfs2_holder_init(gl, state, flags, fl_gh);
966 gfs2_glock_put(gl); 966 gfs2_glock_put(gl);
967 } 967 }
968 error = gfs2_glock_nq(fl_gh); 968 error = gfs2_glock_nq(fl_gh);
969 if (error) { 969 if (error) {
970 gfs2_holder_uninit(fl_gh); 970 gfs2_holder_uninit(fl_gh);
971 if (error == GLR_TRYFAILED) 971 if (error == GLR_TRYFAILED)
972 error = -EAGAIN; 972 error = -EAGAIN;
973 } else { 973 } else {
974 error = flock_lock_file_wait(file, fl); 974 error = flock_lock_file_wait(file, fl);
975 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); 975 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
976 } 976 }
977 977
978 out: 978 out:
979 mutex_unlock(&fp->f_fl_mutex); 979 mutex_unlock(&fp->f_fl_mutex);
980 return error; 980 return error;
981 } 981 }
982 982
983 static void do_unflock(struct file *file, struct file_lock *fl) 983 static void do_unflock(struct file *file, struct file_lock *fl)
984 { 984 {
985 struct gfs2_file *fp = file->private_data; 985 struct gfs2_file *fp = file->private_data;
986 struct gfs2_holder *fl_gh = &fp->f_fl_gh; 986 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
987 987
988 mutex_lock(&fp->f_fl_mutex); 988 mutex_lock(&fp->f_fl_mutex);
989 flock_lock_file_wait(file, fl); 989 flock_lock_file_wait(file, fl);
990 if (fl_gh->gh_gl) { 990 if (fl_gh->gh_gl) {
991 gfs2_glock_dq_wait(fl_gh); 991 gfs2_glock_dq_wait(fl_gh);
992 gfs2_holder_uninit(fl_gh); 992 gfs2_holder_uninit(fl_gh);
993 } 993 }
994 mutex_unlock(&fp->f_fl_mutex); 994 mutex_unlock(&fp->f_fl_mutex);
995 } 995 }
996 996
997 /** 997 /**
998 * gfs2_flock - acquire/release a flock lock on a file 998 * gfs2_flock - acquire/release a flock lock on a file
999 * @file: the file pointer 999 * @file: the file pointer
1000 * @cmd: either modify or retrieve lock state, possibly wait 1000 * @cmd: either modify or retrieve lock state, possibly wait
1001 * @fl: type and range of lock 1001 * @fl: type and range of lock
1002 * 1002 *
1003 * Returns: errno 1003 * Returns: errno
1004 */ 1004 */
1005 1005
1006 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) 1006 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
1007 { 1007 {
1008 if (!(fl->fl_flags & FL_FLOCK)) 1008 if (!(fl->fl_flags & FL_FLOCK))
1009 return -ENOLCK; 1009 return -ENOLCK;
1010 if (fl->fl_type & LOCK_MAND) 1010 if (fl->fl_type & LOCK_MAND)
1011 return -EOPNOTSUPP; 1011 return -EOPNOTSUPP;
1012 1012
1013 if (fl->fl_type == F_UNLCK) { 1013 if (fl->fl_type == F_UNLCK) {
1014 do_unflock(file, fl); 1014 do_unflock(file, fl);
1015 return 0; 1015 return 0;
1016 } else { 1016 } else {
1017 return do_flock(file, cmd, fl); 1017 return do_flock(file, cmd, fl);
1018 } 1018 }
1019 } 1019 }
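
flock() locks are whole-file and take a dedicated flock glock; a LOCK_NB request maps onto the LM_FLAG_TRY path in do_flock() and fails with -EAGAIN instead of blocking. Sketch:

/* Sketch: non-blocking exclusive flock; EAGAIN here corresponds to
 * the GLR_TRYFAILED case in do_flock() above. */
#include <sys/file.h>

int try_exclusive(int fd)
{
	return flock(fd, LOCK_EX | LOCK_NB);
}
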
1020 1020
1021 const struct file_operations gfs2_file_fops = { 1021 const struct file_operations gfs2_file_fops = {
1022 .llseek = gfs2_llseek, 1022 .llseek = gfs2_llseek,
1023 .read = do_sync_read, 1023 .read = do_sync_read,
1024 .aio_read = generic_file_aio_read, 1024 .aio_read = generic_file_aio_read,
1025 .write = do_sync_write, 1025 .write = do_sync_write,
1026 .aio_write = gfs2_file_aio_write, 1026 .aio_write = gfs2_file_aio_write,
1027 .unlocked_ioctl = gfs2_ioctl, 1027 .unlocked_ioctl = gfs2_ioctl,
1028 .mmap = gfs2_mmap, 1028 .mmap = gfs2_mmap,
1029 .open = gfs2_open, 1029 .open = gfs2_open,
1030 .release = gfs2_release, 1030 .release = gfs2_release,
1031 .fsync = gfs2_fsync, 1031 .fsync = gfs2_fsync,
1032 .lock = gfs2_lock, 1032 .lock = gfs2_lock,
1033 .flock = gfs2_flock, 1033 .flock = gfs2_flock,
1034 .splice_read = generic_file_splice_read, 1034 .splice_read = generic_file_splice_read,
1035 .splice_write = generic_file_splice_write, 1035 .splice_write = generic_file_splice_write,
1036 .setlease = gfs2_setlease, 1036 .setlease = gfs2_setlease,
1037 .fallocate = gfs2_fallocate, 1037 .fallocate = gfs2_fallocate,
1038 }; 1038 };
1039 1039
1040 const struct file_operations gfs2_dir_fops = { 1040 const struct file_operations gfs2_dir_fops = {
1041 .readdir = gfs2_readdir, 1041 .readdir = gfs2_readdir,
1042 .unlocked_ioctl = gfs2_ioctl, 1042 .unlocked_ioctl = gfs2_ioctl,
1043 .open = gfs2_open, 1043 .open = gfs2_open,
1044 .release = gfs2_release, 1044 .release = gfs2_release,
1045 .fsync = gfs2_fsync, 1045 .fsync = gfs2_fsync,
1046 .lock = gfs2_lock, 1046 .lock = gfs2_lock,
1047 .flock = gfs2_flock, 1047 .flock = gfs2_flock,
1048 .llseek = default_llseek, 1048 .llseek = default_llseek,
1049 }; 1049 };
1050 1050
1051 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 1051 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */
1052 1052
1053 const struct file_operations gfs2_file_fops_nolock = { 1053 const struct file_operations gfs2_file_fops_nolock = {
1054 .llseek = gfs2_llseek, 1054 .llseek = gfs2_llseek,
1055 .read = do_sync_read, 1055 .read = do_sync_read,
1056 .aio_read = generic_file_aio_read, 1056 .aio_read = generic_file_aio_read,
1057 .write = do_sync_write, 1057 .write = do_sync_write,
1058 .aio_write = gfs2_file_aio_write, 1058 .aio_write = gfs2_file_aio_write,
1059 .unlocked_ioctl = gfs2_ioctl, 1059 .unlocked_ioctl = gfs2_ioctl,
1060 .mmap = gfs2_mmap, 1060 .mmap = gfs2_mmap,
1061 .open = gfs2_open, 1061 .open = gfs2_open,
1062 .release = gfs2_release, 1062 .release = gfs2_release,
1063 .fsync = gfs2_fsync, 1063 .fsync = gfs2_fsync,
1064 .splice_read = generic_file_splice_read, 1064 .splice_read = generic_file_splice_read,
1065 .splice_write = generic_file_splice_write, 1065 .splice_write = generic_file_splice_write,
1066 .setlease = generic_setlease, 1066 .setlease = generic_setlease,
1067 .fallocate = gfs2_fallocate, 1067 .fallocate = gfs2_fallocate,
1068 }; 1068 };
1069 1069
1070 const struct file_operations gfs2_dir_fops_nolock = { 1070 const struct file_operations gfs2_dir_fops_nolock = {
1071 .readdir = gfs2_readdir, 1071 .readdir = gfs2_readdir,
1072 .unlocked_ioctl = gfs2_ioctl, 1072 .unlocked_ioctl = gfs2_ioctl,
1073 .open = gfs2_open, 1073 .open = gfs2_open,
1074 .release = gfs2_release, 1074 .release = gfs2_release,
1075 .fsync = gfs2_fsync, 1075 .fsync = gfs2_fsync,
1076 .llseek = default_llseek, 1076 .llseek = default_llseek,
1077 }; 1077 };
1078 1078
1079 1079
1 /* 1 /*
2 * fs/libfs.c 2 * fs/libfs.c
3 * Library for filesystem writers. 3 * Library for filesystem writers.
4 */ 4 */
5 5
6 #include <linux/export.h> 6 #include <linux/export.h>
7 #include <linux/pagemap.h> 7 #include <linux/pagemap.h>
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/mount.h> 9 #include <linux/mount.h>
10 #include <linux/vfs.h> 10 #include <linux/vfs.h>
11 #include <linux/quotaops.h> 11 #include <linux/quotaops.h>
12 #include <linux/mutex.h> 12 #include <linux/mutex.h>
13 #include <linux/exportfs.h> 13 #include <linux/exportfs.h>
14 #include <linux/writeback.h> 14 #include <linux/writeback.h>
15 #include <linux/buffer_head.h> /* sync_mapping_buffers */ 15 #include <linux/buffer_head.h> /* sync_mapping_buffers */
16 16
17 #include <asm/uaccess.h> 17 #include <asm/uaccess.h>
18 18
19 #include "internal.h" 19 #include "internal.h"
20 20
21 static inline int simple_positive(struct dentry *dentry) 21 static inline int simple_positive(struct dentry *dentry)
22 { 22 {
23 return dentry->d_inode && !d_unhashed(dentry); 23 return dentry->d_inode && !d_unhashed(dentry);
24 } 24 }
25 25
26 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, 26 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
27 struct kstat *stat) 27 struct kstat *stat)
28 { 28 {
29 struct inode *inode = dentry->d_inode; 29 struct inode *inode = dentry->d_inode;
30 generic_fillattr(inode, stat); 30 generic_fillattr(inode, stat);
31 stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); 31 stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
32 return 0; 32 return 0;
33 } 33 }
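
st_blocks is counted in 512-byte units, so the shift converts resident page-cache pages into sectors. A worked sketch, assuming 4096-byte pages:

/* Sketch of the st_blocks math in simple_getattr(): each 4096-byte
 * page (PAGE_CACHE_SHIFT == 12) is 4096/512 = 8 of the 512-byte
 * units stat(2) reports, hence the shift by (12 - 9). */
static unsigned long pages_to_stat_blocks(unsigned long nrpages)
{
	return nrpages << (12 - 9);	/* e.g. 3 pages -> 24 blocks */
}
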
34 34
35 int simple_statfs(struct dentry *dentry, struct kstatfs *buf) 35 int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
36 { 36 {
37 buf->f_type = dentry->d_sb->s_magic; 37 buf->f_type = dentry->d_sb->s_magic;
38 buf->f_bsize = PAGE_CACHE_SIZE; 38 buf->f_bsize = PAGE_CACHE_SIZE;
39 buf->f_namelen = NAME_MAX; 39 buf->f_namelen = NAME_MAX;
40 return 0; 40 return 0;
41 } 41 }
42 42
43 /* 43 /*
44 * Retaining negative dentries for an in-memory filesystem just wastes 44 * Retaining negative dentries for an in-memory filesystem just wastes
45 * memory and lookup time: arrange for them to be deleted immediately. 45 * memory and lookup time: arrange for them to be deleted immediately.
46 */ 46 */
47 static int simple_delete_dentry(const struct dentry *dentry) 47 static int simple_delete_dentry(const struct dentry *dentry)
48 { 48 {
49 return 1; 49 return 1;
50 } 50 }
51 51
52 /* 52 /*
53 * Lookup the data. This is trivial - if the dentry didn't already 53 * Lookup the data. This is trivial - if the dentry didn't already
54 * exist, we know it is negative. Set d_op to delete negative dentries. 54 * exist, we know it is negative. Set d_op to delete negative dentries.
55 */ 55 */
56 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 56 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
57 { 57 {
58 static const struct dentry_operations simple_dentry_operations = { 58 static const struct dentry_operations simple_dentry_operations = {
59 .d_delete = simple_delete_dentry, 59 .d_delete = simple_delete_dentry,
60 }; 60 };
61 61
62 if (dentry->d_name.len > NAME_MAX) 62 if (dentry->d_name.len > NAME_MAX)
63 return ERR_PTR(-ENAMETOOLONG); 63 return ERR_PTR(-ENAMETOOLONG);
64 d_set_d_op(dentry, &simple_dentry_operations); 64 d_set_d_op(dentry, &simple_dentry_operations);
65 d_add(dentry, NULL); 65 d_add(dentry, NULL);
66 return NULL; 66 return NULL;
67 } 67 }
68 68
69 int dcache_dir_open(struct inode *inode, struct file *file) 69 int dcache_dir_open(struct inode *inode, struct file *file)
70 { 70 {
71 static struct qstr cursor_name = QSTR_INIT(".", 1); 71 static struct qstr cursor_name = QSTR_INIT(".", 1);
72 72
73 file->private_data = d_alloc(file->f_path.dentry, &cursor_name); 73 file->private_data = d_alloc(file->f_path.dentry, &cursor_name);
74 74
75 return file->private_data ? 0 : -ENOMEM; 75 return file->private_data ? 0 : -ENOMEM;
76 } 76 }
77 77
78 int dcache_dir_close(struct inode *inode, struct file *file) 78 int dcache_dir_close(struct inode *inode, struct file *file)
79 { 79 {
80 dput(file->private_data); 80 dput(file->private_data);
81 return 0; 81 return 0;
82 } 82 }
83 83
84 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 84 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
85 { 85 {
86 struct dentry *dentry = file->f_path.dentry; 86 struct dentry *dentry = file->f_path.dentry;
87 mutex_lock(&dentry->d_inode->i_mutex); 87 mutex_lock(&dentry->d_inode->i_mutex);
88 switch (origin) { 88 switch (whence) {
89 case 1: 89 case 1:
90 offset += file->f_pos; 90 offset += file->f_pos;
91 case 0: 91 case 0:
92 if (offset >= 0) 92 if (offset >= 0)
93 break; 93 break;
94 default: 94 default:
95 mutex_unlock(&dentry->d_inode->i_mutex); 95 mutex_unlock(&dentry->d_inode->i_mutex);
96 return -EINVAL; 96 return -EINVAL;
97 } 97 }
98 if (offset != file->f_pos) { 98 if (offset != file->f_pos) {
99 file->f_pos = offset; 99 file->f_pos = offset;
100 if (file->f_pos >= 2) { 100 if (file->f_pos >= 2) {
101 struct list_head *p; 101 struct list_head *p;
102 struct dentry *cursor = file->private_data; 102 struct dentry *cursor = file->private_data;
103 loff_t n = file->f_pos - 2; 103 loff_t n = file->f_pos - 2;
104 104
105 spin_lock(&dentry->d_lock); 105 spin_lock(&dentry->d_lock);
106 /* d_lock not required for cursor */ 106 /* d_lock not required for cursor */
107 list_del(&cursor->d_u.d_child); 107 list_del(&cursor->d_u.d_child);
108 p = dentry->d_subdirs.next; 108 p = dentry->d_subdirs.next;
109 while (n && p != &dentry->d_subdirs) { 109 while (n && p != &dentry->d_subdirs) {
110 struct dentry *next; 110 struct dentry *next;
111 next = list_entry(p, struct dentry, d_u.d_child); 111 next = list_entry(p, struct dentry, d_u.d_child);
112 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); 112 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
113 if (simple_positive(next)) 113 if (simple_positive(next))
114 n--; 114 n--;
115 spin_unlock(&next->d_lock); 115 spin_unlock(&next->d_lock);
116 p = p->next; 116 p = p->next;
117 } 117 }
118 list_add_tail(&cursor->d_u.d_child, p); 118 list_add_tail(&cursor->d_u.d_child, p);
119 spin_unlock(&dentry->d_lock); 119 spin_unlock(&dentry->d_lock);
120 } 120 }
121 } 121 }
122 mutex_unlock(&dentry->d_inode->i_mutex); 122 mutex_unlock(&dentry->d_inode->i_mutex);
123 return offset; 123 return offset;
124 } 124 }
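
The bare 0 and 1 cases above are SEEK_SET and SEEK_CUR, i.e. the whence argument this commit renames throughout; any other value is rejected with -EINVAL. From userspace:

/* Sketch: the two whence values dcache_dir_lseek() accepts. */
#include <unistd.h>

off_t dir_offsets(int fd)
{
	lseek(fd, 2, SEEK_SET);		/* whence = 0: skip "." and ".." */
	return lseek(fd, 0, SEEK_CUR);	/* whence = 1: read back position */
}
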
125 125
126 /* Relationship between i_mode and the DT_xxx types */ 126 /* Relationship between i_mode and the DT_xxx types */
127 static inline unsigned char dt_type(struct inode *inode) 127 static inline unsigned char dt_type(struct inode *inode)
128 { 128 {
129 return (inode->i_mode >> 12) & 15; 129 return (inode->i_mode >> 12) & 15;
130 } 130 }
131 131
132 /* 132 /*
133 * Directory is locked and all positive dentries in it are safe, since 133 * Directory is locked and all positive dentries in it are safe, since
134 * for ramfs-type trees they can't go away without unlink() or rmdir(), 134 * for ramfs-type trees they can't go away without unlink() or rmdir(),
135 * both impossible due to the lock on directory. 135 * both impossible due to the lock on directory.
136 */ 136 */
137 137
138 int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) 138 int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
139 { 139 {
140 struct dentry *dentry = filp->f_path.dentry; 140 struct dentry *dentry = filp->f_path.dentry;
141 struct dentry *cursor = filp->private_data; 141 struct dentry *cursor = filp->private_data;
142 struct list_head *p, *q = &cursor->d_u.d_child; 142 struct list_head *p, *q = &cursor->d_u.d_child;
143 ino_t ino; 143 ino_t ino;
144 int i = filp->f_pos; 144 int i = filp->f_pos;
145 145
146 switch (i) { 146 switch (i) {
147 case 0: 147 case 0:
148 ino = dentry->d_inode->i_ino; 148 ino = dentry->d_inode->i_ino;
149 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 149 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
150 break; 150 break;
151 filp->f_pos++; 151 filp->f_pos++;
152 i++; 152 i++;
153 /* fallthrough */ 153 /* fallthrough */
154 case 1: 154 case 1:
155 ino = parent_ino(dentry); 155 ino = parent_ino(dentry);
156 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 156 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
157 break; 157 break;
158 filp->f_pos++; 158 filp->f_pos++;
159 i++; 159 i++;
160 /* fallthrough */ 160 /* fallthrough */
161 default: 161 default:
162 spin_lock(&dentry->d_lock); 162 spin_lock(&dentry->d_lock);
163 if (filp->f_pos == 2) 163 if (filp->f_pos == 2)
164 list_move(q, &dentry->d_subdirs); 164 list_move(q, &dentry->d_subdirs);
165 165
166 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 166 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
167 struct dentry *next; 167 struct dentry *next;
168 next = list_entry(p, struct dentry, d_u.d_child); 168 next = list_entry(p, struct dentry, d_u.d_child);
169 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); 169 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
170 if (!simple_positive(next)) { 170 if (!simple_positive(next)) {
171 spin_unlock(&next->d_lock); 171 spin_unlock(&next->d_lock);
172 continue; 172 continue;
173 } 173 }
174 174
175 spin_unlock(&next->d_lock); 175 spin_unlock(&next->d_lock);
176 spin_unlock(&dentry->d_lock); 176 spin_unlock(&dentry->d_lock);
177 if (filldir(dirent, next->d_name.name, 177 if (filldir(dirent, next->d_name.name,
178 next->d_name.len, filp->f_pos, 178 next->d_name.len, filp->f_pos,
179 next->d_inode->i_ino, 179 next->d_inode->i_ino,
180 dt_type(next->d_inode)) < 0) 180 dt_type(next->d_inode)) < 0)
181 return 0; 181 return 0;
182 spin_lock(&dentry->d_lock); 182 spin_lock(&dentry->d_lock);
183 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); 183 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
184 /* next is still alive */ 184 /* next is still alive */
185 list_move(q, p); 185 list_move(q, p);
186 spin_unlock(&next->d_lock); 186 spin_unlock(&next->d_lock);
187 p = q; 187 p = q;
188 filp->f_pos++; 188 filp->f_pos++;
189 } 189 }
190 spin_unlock(&dentry->d_lock); 190 spin_unlock(&dentry->d_lock);
191 } 191 }
192 return 0; 192 return 0;
193 } 193 }
194 194
195 ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) 195 ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
196 { 196 {
197 return -EISDIR; 197 return -EISDIR;
198 } 198 }
199 199
200 const struct file_operations simple_dir_operations = { 200 const struct file_operations simple_dir_operations = {
201 .open = dcache_dir_open, 201 .open = dcache_dir_open,
202 .release = dcache_dir_close, 202 .release = dcache_dir_close,
203 .llseek = dcache_dir_lseek, 203 .llseek = dcache_dir_lseek,
204 .read = generic_read_dir, 204 .read = generic_read_dir,
205 .readdir = dcache_readdir, 205 .readdir = dcache_readdir,
206 .fsync = noop_fsync, 206 .fsync = noop_fsync,
207 }; 207 };
208 208
209 const struct inode_operations simple_dir_inode_operations = { 209 const struct inode_operations simple_dir_inode_operations = {
210 .lookup = simple_lookup, 210 .lookup = simple_lookup,
211 }; 211 };
212 212
213 static const struct super_operations simple_super_operations = { 213 static const struct super_operations simple_super_operations = {
214 .statfs = simple_statfs, 214 .statfs = simple_statfs,
215 }; 215 };
216 216
217 /* 217 /*
218 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that 218 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
219 * will never be mountable) 219 * will never be mountable)
220 */ 220 */
221 struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, 221 struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
222 const struct super_operations *ops, 222 const struct super_operations *ops,
223 const struct dentry_operations *dops, unsigned long magic) 223 const struct dentry_operations *dops, unsigned long magic)
224 { 224 {
225 struct super_block *s; 225 struct super_block *s;
226 struct dentry *dentry; 226 struct dentry *dentry;
227 struct inode *root; 227 struct inode *root;
228 struct qstr d_name = QSTR_INIT(name, strlen(name)); 228 struct qstr d_name = QSTR_INIT(name, strlen(name));
229 229
230 s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); 230 s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL);
231 if (IS_ERR(s)) 231 if (IS_ERR(s))
232 return ERR_CAST(s); 232 return ERR_CAST(s);
233 233
234 s->s_maxbytes = MAX_LFS_FILESIZE; 234 s->s_maxbytes = MAX_LFS_FILESIZE;
235 s->s_blocksize = PAGE_SIZE; 235 s->s_blocksize = PAGE_SIZE;
236 s->s_blocksize_bits = PAGE_SHIFT; 236 s->s_blocksize_bits = PAGE_SHIFT;
237 s->s_magic = magic; 237 s->s_magic = magic;
238 s->s_op = ops ? ops : &simple_super_operations; 238 s->s_op = ops ? ops : &simple_super_operations;
239 s->s_time_gran = 1; 239 s->s_time_gran = 1;
240 root = new_inode(s); 240 root = new_inode(s);
241 if (!root) 241 if (!root)
242 goto Enomem; 242 goto Enomem;
243 /* 243 /*
244 * since this is the first inode, make it number 1. New inodes created 244 * since this is the first inode, make it number 1. New inodes created
245 * after this must take care not to collide with it (by passing 245 * after this must take care not to collide with it (by passing
246 * max_reserved of 1 to iunique). 246 * max_reserved of 1 to iunique).
247 */ 247 */
248 root->i_ino = 1; 248 root->i_ino = 1;
249 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; 249 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
250 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; 250 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
251 dentry = __d_alloc(s, &d_name); 251 dentry = __d_alloc(s, &d_name);
252 if (!dentry) { 252 if (!dentry) {
253 iput(root); 253 iput(root);
254 goto Enomem; 254 goto Enomem;
255 } 255 }
256 d_instantiate(dentry, root); 256 d_instantiate(dentry, root);
257 s->s_root = dentry; 257 s->s_root = dentry;
258 s->s_d_op = dops; 258 s->s_d_op = dops;
259 s->s_flags |= MS_ACTIVE; 259 s->s_flags |= MS_ACTIVE;
260 return dget(s->s_root); 260 return dget(s->s_root);
261 261
262 Enomem: 262 Enomem:
263 deactivate_locked_super(s); 263 deactivate_locked_super(s);
264 return ERR_PTR(-ENOMEM); 264 return ERR_PTR(-ENOMEM);
265 } 265 }
266 266
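For context, a minimal sketch (not part of this commit) of how a sockfs/pipefs-style kernel-internal pseudo-filesystem typically consumes mount_pseudo() from its .mount callback; the "examplefs" name, the EXAMPLEFS_MAGIC value and all examplefs_* identifiers are invented for illustration:

#include <linux/fs.h>

#define EXAMPLEFS_MAGIC	0x45584653	/* "EXFS", an invented magic */

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
			int flags, const char *dev_name, void *data)
{
	/* NULL ops/dops fall back to simple_super_operations and the
	 * default dentry operations */
	return mount_pseudo(fs_type, "examplefs:", NULL, NULL,
			EXAMPLEFS_MAGIC);
}

static struct file_system_type examplefs_fs_type = {
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_anon_super,
};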
267 int simple_open(struct inode *inode, struct file *file) 267 int simple_open(struct inode *inode, struct file *file)
268 { 268 {
269 if (inode->i_private) 269 if (inode->i_private)
270 file->private_data = inode->i_private; 270 file->private_data = inode->i_private;
271 return 0; 271 return 0;
272 } 272 }
273 273
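simple_open() merely publishes inode->i_private as file->private_data, so handlers opened this way can rely on private_data being set. A sketch of the usual debugfs-style wiring; example_read is a hypothetical handler, sketched further down next to simple_read_from_buffer():

static const struct file_operations example_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,	/* file->private_data = inode->i_private */
	.read	= example_read,	/* hypothetical; see sketch further down */
	.llseek	= default_llseek,
};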
274 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 274 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
275 { 275 {
276 struct inode *inode = old_dentry->d_inode; 276 struct inode *inode = old_dentry->d_inode;
277 277
278 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 278 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
279 inc_nlink(inode); 279 inc_nlink(inode);
280 ihold(inode); 280 ihold(inode);
281 dget(dentry); 281 dget(dentry);
282 d_instantiate(dentry, inode); 282 d_instantiate(dentry, inode);
283 return 0; 283 return 0;
284 } 284 }
285 285
286 int simple_empty(struct dentry *dentry) 286 int simple_empty(struct dentry *dentry)
287 { 287 {
288 struct dentry *child; 288 struct dentry *child;
289 int ret = 0; 289 int ret = 0;
290 290
291 spin_lock(&dentry->d_lock); 291 spin_lock(&dentry->d_lock);
292 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { 292 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
293 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); 293 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
294 if (simple_positive(child)) { 294 if (simple_positive(child)) {
295 spin_unlock(&child->d_lock); 295 spin_unlock(&child->d_lock);
296 goto out; 296 goto out;
297 } 297 }
298 spin_unlock(&child->d_lock); 298 spin_unlock(&child->d_lock);
299 } 299 }
300 ret = 1; 300 ret = 1;
301 out: 301 out:
302 spin_unlock(&dentry->d_lock); 302 spin_unlock(&dentry->d_lock);
303 return ret; 303 return ret;
304 } 304 }
305 305
306 int simple_unlink(struct inode *dir, struct dentry *dentry) 306 int simple_unlink(struct inode *dir, struct dentry *dentry)
307 { 307 {
308 struct inode *inode = dentry->d_inode; 308 struct inode *inode = dentry->d_inode;
309 309
310 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 310 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
311 drop_nlink(inode); 311 drop_nlink(inode);
312 dput(dentry); 312 dput(dentry);
313 return 0; 313 return 0;
314 } 314 }
315 315
316 int simple_rmdir(struct inode *dir, struct dentry *dentry) 316 int simple_rmdir(struct inode *dir, struct dentry *dentry)
317 { 317 {
318 if (!simple_empty(dentry)) 318 if (!simple_empty(dentry))
319 return -ENOTEMPTY; 319 return -ENOTEMPTY;
320 320
321 drop_nlink(dentry->d_inode); 321 drop_nlink(dentry->d_inode);
322 simple_unlink(dir, dentry); 322 simple_unlink(dir, dentry);
323 drop_nlink(dir); 323 drop_nlink(dir);
324 return 0; 324 return 0;
325 } 325 }
326 326
327 int simple_rename(struct inode *old_dir, struct dentry *old_dentry, 327 int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
328 struct inode *new_dir, struct dentry *new_dentry) 328 struct inode *new_dir, struct dentry *new_dentry)
329 { 329 {
330 struct inode *inode = old_dentry->d_inode; 330 struct inode *inode = old_dentry->d_inode;
331 int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode); 331 int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
332 332
333 if (!simple_empty(new_dentry)) 333 if (!simple_empty(new_dentry))
334 return -ENOTEMPTY; 334 return -ENOTEMPTY;
335 335
336 if (new_dentry->d_inode) { 336 if (new_dentry->d_inode) {
337 simple_unlink(new_dir, new_dentry); 337 simple_unlink(new_dir, new_dentry);
338 if (they_are_dirs) { 338 if (they_are_dirs) {
339 drop_nlink(new_dentry->d_inode); 339 drop_nlink(new_dentry->d_inode);
340 drop_nlink(old_dir); 340 drop_nlink(old_dir);
341 } 341 }
342 } else if (they_are_dirs) { 342 } else if (they_are_dirs) {
343 drop_nlink(old_dir); 343 drop_nlink(old_dir);
344 inc_nlink(new_dir); 344 inc_nlink(new_dir);
345 } 345 }
346 346
347 old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = 347 old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
348 new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; 348 new_dir->i_mtime = inode->i_ctime = CURRENT_TIME;
349 349
350 return 0; 350 return 0;
351 } 351 }
352 352
353 /** 353 /**
354 * simple_setattr - setattr for simple filesystem 354 * simple_setattr - setattr for simple filesystem
355 * @dentry: dentry 355 * @dentry: dentry
356 * @iattr: iattr structure 356 * @iattr: iattr structure
357 * 357 *
358 * Returns 0 on success, -error on failure. 358 * Returns 0 on success, -error on failure.
359 * 359 *
360 * simple_setattr is a simple ->setattr implementation without proper 360 * simple_setattr is a simple ->setattr implementation without proper
361 * support for size changes. 361 * support for size changes.
362 * 362 *
363 * It can either be used for in-memory filesystems or special files 363 * It can either be used for in-memory filesystems or special files
364 * on simple regular filesystems. Anything that needs to change on-disk 364 * on simple regular filesystems. Anything that needs to change on-disk
365 * or wire state on size changes needs its own setattr method. 365 * or wire state on size changes needs its own setattr method.
366 */ 366 */
367 int simple_setattr(struct dentry *dentry, struct iattr *iattr) 367 int simple_setattr(struct dentry *dentry, struct iattr *iattr)
368 { 368 {
369 struct inode *inode = dentry->d_inode; 369 struct inode *inode = dentry->d_inode;
370 int error; 370 int error;
371 371
372 WARN_ON_ONCE(inode->i_op->truncate); 372 WARN_ON_ONCE(inode->i_op->truncate);
373 373
374 error = inode_change_ok(inode, iattr); 374 error = inode_change_ok(inode, iattr);
375 if (error) 375 if (error)
376 return error; 376 return error;
377 377
378 if (iattr->ia_valid & ATTR_SIZE) 378 if (iattr->ia_valid & ATTR_SIZE)
379 truncate_setsize(inode, iattr->ia_size); 379 truncate_setsize(inode, iattr->ia_size);
380 setattr_copy(inode, iattr); 380 setattr_copy(inode, iattr);
381 mark_inode_dirty(inode); 381 mark_inode_dirty(inode);
382 return 0; 382 return 0;
383 } 383 }
384 EXPORT_SYMBOL(simple_setattr); 384 EXPORT_SYMBOL(simple_setattr);
385 385
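An in-memory filesystem can adopt simple_setattr() (together with simple_getattr(), exported below) as the entire inode_operations for its regular files; a sketch using the hypothetical examplefs naming from above:

static const struct inode_operations examplefs_file_inode_operations = {
	.setattr	= simple_setattr,	/* truncate_setsize() + attr copy */
	.getattr	= simple_getattr,
};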
386 int simple_readpage(struct file *file, struct page *page) 386 int simple_readpage(struct file *file, struct page *page)
387 { 387 {
388 clear_highpage(page); 388 clear_highpage(page);
389 flush_dcache_page(page); 389 flush_dcache_page(page);
390 SetPageUptodate(page); 390 SetPageUptodate(page);
391 unlock_page(page); 391 unlock_page(page);
392 return 0; 392 return 0;
393 } 393 }
394 394
395 int simple_write_begin(struct file *file, struct address_space *mapping, 395 int simple_write_begin(struct file *file, struct address_space *mapping,
396 loff_t pos, unsigned len, unsigned flags, 396 loff_t pos, unsigned len, unsigned flags,
397 struct page **pagep, void **fsdata) 397 struct page **pagep, void **fsdata)
398 { 398 {
399 struct page *page; 399 struct page *page;
400 pgoff_t index; 400 pgoff_t index;
401 401
402 index = pos >> PAGE_CACHE_SHIFT; 402 index = pos >> PAGE_CACHE_SHIFT;
403 403
404 page = grab_cache_page_write_begin(mapping, index, flags); 404 page = grab_cache_page_write_begin(mapping, index, flags);
405 if (!page) 405 if (!page)
406 return -ENOMEM; 406 return -ENOMEM;
407 407
408 *pagep = page; 408 *pagep = page;
409 409
410 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { 410 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
411 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 411 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
412 412
413 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE); 413 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
414 } 414 }
415 return 0; 415 return 0;
416 } 416 }
417 417
418 /** 418 /**
419 * simple_write_end - .write_end helper for non-block-device FSes 419 * simple_write_end - .write_end helper for non-block-device FSes
420 * @file: See .write_end of address_space_operations 420 * @file: See .write_end of address_space_operations
421 * @mapping: " 421 * @mapping: "
422 * @pos: " 422 * @pos: "
423 * @len: " 423 * @len: "
424 * @copied: " 424 * @copied: "
425 * @page: " 425 * @page: "
426 * @fsdata: " 426 * @fsdata: "
427 * 427 *
429 * simple_write_end does the minimum needed for updating a page after writing is 429 * simple_write_end does the minimum needed for updating a page after writing is
430 * done. It has the same API signature as the .write_end of 430 * done. It has the same API signature as the .write_end of
431 * address_space_operations vector. So it can just be set onto .write_end for 431 * address_space_operations vector. So it can just be set onto .write_end for
432 * FSes that don't need any other processing. i_mutex is assumed to be held. 432 * FSes that don't need any other processing. i_mutex is assumed to be held.
433 * Block based filesystems should use generic_write_end(). 433 * Block based filesystems should use generic_write_end().
434 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty 434 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
435 * is not called, so a filesystem that actually does store data in .write_inode 435 * is not called, so a filesystem that actually does store data in .write_inode
436 * should extend what's done here with a call to mark_inode_dirty() in the 436 * should extend what's done here with a call to mark_inode_dirty() in the
437 * case that i_size has changed. 437 * case that i_size has changed.
438 */ 438 */
439 int simple_write_end(struct file *file, struct address_space *mapping, 439 int simple_write_end(struct file *file, struct address_space *mapping,
440 loff_t pos, unsigned len, unsigned copied, 440 loff_t pos, unsigned len, unsigned copied,
441 struct page *page, void *fsdata) 441 struct page *page, void *fsdata)
442 { 442 {
443 struct inode *inode = page->mapping->host; 443 struct inode *inode = page->mapping->host;
444 loff_t last_pos = pos + copied; 444 loff_t last_pos = pos + copied;
445 445
446 /* zero the stale part of the page if we did a short copy */ 446 /* zero the stale part of the page if we did a short copy */
447 if (copied < len) { 447 if (copied < len) {
448 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 448 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
449 449
450 zero_user(page, from + copied, len - copied); 450 zero_user(page, from + copied, len - copied);
451 } 451 }
452 452
453 if (!PageUptodate(page)) 453 if (!PageUptodate(page))
454 SetPageUptodate(page); 454 SetPageUptodate(page);
455 /* 455 /*
456 * No need to use i_size_read() here, the i_size 456 * No need to use i_size_read() here, the i_size
457 * cannot change under us because we hold the i_mutex. 457 * cannot change under us because we hold the i_mutex.
458 */ 458 */
459 if (last_pos > inode->i_size) 459 if (last_pos > inode->i_size)
460 i_size_write(inode, last_pos); 460 i_size_write(inode, last_pos);
461 461
462 set_page_dirty(page); 462 set_page_dirty(page);
463 unlock_page(page); 463 unlock_page(page);
464 page_cache_release(page); 464 page_cache_release(page);
465 465
466 return copied; 466 return copied;
467 } 467 }
468 468
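These three page-cache helpers are designed to be installed together as a complete address_space_operations for a purely in-memory filesystem; ramfs does essentially this, additionally overriding .set_page_dirty. A sketch:

static const struct address_space_operations examplefs_aops = {
	.readpage	= simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
	/* ramfs also sets .set_page_dirty = __set_page_dirty_no_writeback */
};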
469 /* 469 /*
470 * the inodes created here are not hashed. If you use iunique to generate 470 * the inodes created here are not hashed. If you use iunique to generate
471 * unique inode values later for this filesystem, then you must take care 471 * unique inode values later for this filesystem, then you must take care
472 * to pass it an appropriate max_reserved value to avoid collisions. 472 * to pass it an appropriate max_reserved value to avoid collisions.
473 */ 473 */
474 int simple_fill_super(struct super_block *s, unsigned long magic, 474 int simple_fill_super(struct super_block *s, unsigned long magic,
475 struct tree_descr *files) 475 struct tree_descr *files)
476 { 476 {
477 struct inode *inode; 477 struct inode *inode;
478 struct dentry *root; 478 struct dentry *root;
479 struct dentry *dentry; 479 struct dentry *dentry;
480 int i; 480 int i;
481 481
482 s->s_blocksize = PAGE_CACHE_SIZE; 482 s->s_blocksize = PAGE_CACHE_SIZE;
483 s->s_blocksize_bits = PAGE_CACHE_SHIFT; 483 s->s_blocksize_bits = PAGE_CACHE_SHIFT;
484 s->s_magic = magic; 484 s->s_magic = magic;
485 s->s_op = &simple_super_operations; 485 s->s_op = &simple_super_operations;
486 s->s_time_gran = 1; 486 s->s_time_gran = 1;
487 487
488 inode = new_inode(s); 488 inode = new_inode(s);
489 if (!inode) 489 if (!inode)
490 return -ENOMEM; 490 return -ENOMEM;
491 /* 491 /*
492 * because the root inode is 1, the files array must not contain an 492 * because the root inode is 1, the files array must not contain an
493 * entry at index 1 493 * entry at index 1
494 */ 494 */
495 inode->i_ino = 1; 495 inode->i_ino = 1;
496 inode->i_mode = S_IFDIR | 0755; 496 inode->i_mode = S_IFDIR | 0755;
497 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 497 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
498 inode->i_op = &simple_dir_inode_operations; 498 inode->i_op = &simple_dir_inode_operations;
499 inode->i_fop = &simple_dir_operations; 499 inode->i_fop = &simple_dir_operations;
500 set_nlink(inode, 2); 500 set_nlink(inode, 2);
501 root = d_make_root(inode); 501 root = d_make_root(inode);
502 if (!root) 502 if (!root)
503 return -ENOMEM; 503 return -ENOMEM;
504 for (i = 0; !files->name || files->name[0]; i++, files++) { 504 for (i = 0; !files->name || files->name[0]; i++, files++) {
505 if (!files->name) 505 if (!files->name)
506 continue; 506 continue;
507 507
508 /* warn if it tries to conflict with the root inode */ 508 /* warn if it tries to conflict with the root inode */
509 if (unlikely(i == 1)) 509 if (unlikely(i == 1))
510 printk(KERN_WARNING "%s: %s passed in a files array" 510 printk(KERN_WARNING "%s: %s passed in a files array"
511 "with an index of 1!\n", __func__, 511 "with an index of 1!\n", __func__,
512 s->s_type->name); 512 s->s_type->name);
513 513
514 dentry = d_alloc_name(root, files->name); 514 dentry = d_alloc_name(root, files->name);
515 if (!dentry) 515 if (!dentry)
516 goto out; 516 goto out;
517 inode = new_inode(s); 517 inode = new_inode(s);
518 if (!inode) { 518 if (!inode) {
519 dput(dentry); 519 dput(dentry);
520 goto out; 520 goto out;
521 } 521 }
522 inode->i_mode = S_IFREG | files->mode; 522 inode->i_mode = S_IFREG | files->mode;
523 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 523 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
524 inode->i_fop = files->ops; 524 inode->i_fop = files->ops;
525 inode->i_ino = i; 525 inode->i_ino = i;
526 d_add(dentry, inode); 526 d_add(dentry, inode);
527 } 527 }
528 s->s_root = root; 528 s->s_root = root;
529 return 0; 529 return 0;
530 out: 530 out:
531 d_genocide(root); 531 d_genocide(root);
532 shrink_dcache_parent(root); 532 shrink_dcache_parent(root);
533 dput(root); 533 dput(root);
534 return -ENOMEM; 534 return -ENOMEM;
535 } 535 }
536 536
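A hedged sketch of the usual caller, modelled on binfmt_misc: a fill_super callback hands simple_fill_super() a zero-terminated tree_descr array whose index doubles as the inode number, so slots 0 and 1 stay empty per the warning above. EXAMPLEFS_MAGIC and example_fops are the hypothetical definitions from the earlier sketches:

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	static struct tree_descr examplefs_files[] = {
		/* slots 0 and 1 stay empty: index becomes i_ino,
		 * and inode 1 is the root directory */
		[2] = {"status", &example_fops, S_IRUGO},
		/* a zero-length name terminates the array */
		{""}
	};

	return simple_fill_super(sb, EXAMPLEFS_MAGIC, examplefs_files);
}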
537 static DEFINE_SPINLOCK(pin_fs_lock); 537 static DEFINE_SPINLOCK(pin_fs_lock);
538 538
539 int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) 539 int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
540 { 540 {
541 struct vfsmount *mnt = NULL; 541 struct vfsmount *mnt = NULL;
542 spin_lock(&pin_fs_lock); 542 spin_lock(&pin_fs_lock);
543 if (unlikely(!*mount)) { 543 if (unlikely(!*mount)) {
544 spin_unlock(&pin_fs_lock); 544 spin_unlock(&pin_fs_lock);
545 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL); 545 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL);
546 if (IS_ERR(mnt)) 546 if (IS_ERR(mnt))
547 return PTR_ERR(mnt); 547 return PTR_ERR(mnt);
548 spin_lock(&pin_fs_lock); 548 spin_lock(&pin_fs_lock);
549 if (!*mount) 549 if (!*mount)
550 *mount = mnt; 550 *mount = mnt;
551 } 551 }
552 mntget(*mount); 552 mntget(*mount);
553 ++*count; 553 ++*count;
554 spin_unlock(&pin_fs_lock); 554 spin_unlock(&pin_fs_lock);
555 mntput(mnt); 555 mntput(mnt);
556 return 0; 556 return 0;
557 } 557 }
558 558
559 void simple_release_fs(struct vfsmount **mount, int *count) 559 void simple_release_fs(struct vfsmount **mount, int *count)
560 { 560 {
561 struct vfsmount *mnt; 561 struct vfsmount *mnt;
562 spin_lock(&pin_fs_lock); 562 spin_lock(&pin_fs_lock);
563 mnt = *mount; 563 mnt = *mount;
564 if (!--*count) 564 if (!--*count)
565 *mount = NULL; 565 *mount = NULL;
566 spin_unlock(&pin_fs_lock); 566 spin_unlock(&pin_fs_lock);
567 mntput(mnt); 567 mntput(mnt);
568 } 568 }
569 569
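Callers conventionally keep one static vfsmount/count pair per filesystem and wrap these two helpers, as debugfs does; a sketch reusing the hypothetical examplefs_fs_type from above:

static struct vfsmount *examplefs_mnt;
static int examplefs_mnt_count;

static int examplefs_get(void)
{
	/* the first caller really mounts; later ones only take references */
	return simple_pin_fs(&examplefs_fs_type, &examplefs_mnt,
			&examplefs_mnt_count);
}

static void examplefs_put(void)
{
	/* the last caller drops the mount */
	simple_release_fs(&examplefs_mnt, &examplefs_mnt_count);
}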
570 /** 570 /**
571 * simple_read_from_buffer - copy data from the buffer to user space 571 * simple_read_from_buffer - copy data from the buffer to user space
572 * @to: the user space buffer to read to 572 * @to: the user space buffer to read to
573 * @count: the maximum number of bytes to read 573 * @count: the maximum number of bytes to read
574 * @ppos: the current position in the buffer 574 * @ppos: the current position in the buffer
575 * @from: the buffer to read from 575 * @from: the buffer to read from
576 * @available: the size of the buffer 576 * @available: the size of the buffer
577 * 577 *
578 * The simple_read_from_buffer() function reads up to @count bytes from the 578 * The simple_read_from_buffer() function reads up to @count bytes from the
579 * buffer @from at offset @ppos into the user space address starting at @to. 579 * buffer @from at offset @ppos into the user space address starting at @to.
580 * 580 *
581 * On success, the number of bytes read is returned and the offset @ppos is 581 * On success, the number of bytes read is returned and the offset @ppos is
582 * advanced by this number, or a negative value is returned on error. 582 * advanced by this number, or a negative value is returned on error.
583 **/ 583 **/
584 ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, 584 ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
585 const void *from, size_t available) 585 const void *from, size_t available)
586 { 586 {
587 loff_t pos = *ppos; 587 loff_t pos = *ppos;
588 size_t ret; 588 size_t ret;
589 589
590 if (pos < 0) 590 if (pos < 0)
591 return -EINVAL; 591 return -EINVAL;
592 if (pos >= available || !count) 592 if (pos >= available || !count)
593 return 0; 593 return 0;
594 if (count > available - pos) 594 if (count > available - pos)
595 count = available - pos; 595 count = available - pos;
596 ret = copy_to_user(to, from + pos, count); 596 ret = copy_to_user(to, from + pos, count);
597 if (ret == count) 597 if (ret == count)
598 return -EFAULT; 598 return -EFAULT;
599 count -= ret; 599 count -= ret;
600 *ppos = pos + count; 600 *ppos = pos + count;
601 return count; 601 return count;
602 } 602 }
603 603
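The hypothetical example_read handler referenced earlier reduces to a one-liner on top of this helper, which takes care of the bounds checks, short-copy handling and the *ppos update documented above:

static ssize_t example_read(struct file *file, char __user *buf,
			size_t len, loff_t *ppos)
{
	const char *msg = file->private_data;	/* set up by simple_open() */

	return simple_read_from_buffer(buf, len, ppos, msg, strlen(msg));
}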
604 /** 604 /**
605 * simple_write_to_buffer - copy data from user space to the buffer 605 * simple_write_to_buffer - copy data from user space to the buffer
606 * @to: the buffer to write to 606 * @to: the buffer to write to
607 * @available: the size of the buffer 607 * @available: the size of the buffer
608 * @ppos: the current position in the buffer 608 * @ppos: the current position in the buffer
609 * @from: the user space buffer to read from 609 * @from: the user space buffer to read from
610 * @count: the maximum number of bytes to read 610 * @count: the maximum number of bytes to read
611 * 611 *
612 * The simple_write_to_buffer() function reads up to @count bytes from the user 612 * The simple_write_to_buffer() function reads up to @count bytes from the user
613 * space address starting at @from into the buffer @to at offset @ppos. 613 * space address starting at @from into the buffer @to at offset @ppos.
614 * 614 *
615 * On success, the number of bytes written is returned and the offset @ppos is 615 * On success, the number of bytes written is returned and the offset @ppos is
616 * advanced by this number, or a negative value is returned on error. 616 * advanced by this number, or a negative value is returned on error.
617 **/ 617 **/
618 ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, 618 ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
619 const void __user *from, size_t count) 619 const void __user *from, size_t count)
620 { 620 {
621 loff_t pos = *ppos; 621 loff_t pos = *ppos;
622 size_t res; 622 size_t res;
623 623
624 if (pos < 0) 624 if (pos < 0)
625 return -EINVAL; 625 return -EINVAL;
626 if (pos >= available || !count) 626 if (pos >= available || !count)
627 return 0; 627 return 0;
628 if (count > available - pos) 628 if (count > available - pos)
629 count = available - pos; 629 count = available - pos;
630 res = copy_from_user(to + pos, from, count); 630 res = copy_from_user(to + pos, from, count);
631 if (res == count) 631 if (res == count)
632 return -EFAULT; 632 return -EFAULT;
633 count -= res; 633 count -= res;
634 *ppos = pos + count; 634 *ppos = pos + count;
635 return count; 635 return count;
636 } 636 }
637 637
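The write-side counterpart looks much the same; a sketch, where the 64-byte scratch buffer is an arbitrary assumption for illustration:

static ssize_t example_write(struct file *file, const char __user *buf,
			size_t len, loff_t *ppos)
{
	char kbuf[64] = "";	/* zero-filled, so it stays NUL-terminated */
	ssize_t ret;

	ret = simple_write_to_buffer(kbuf, sizeof(kbuf) - 1, ppos, buf, len);
	if (ret > 0)
		pr_info("example: received %zd bytes: %s\n", ret, kbuf);
	return ret;
}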
638 /** 638 /**
639 * memory_read_from_buffer - copy data from the buffer 639 * memory_read_from_buffer - copy data from the buffer
640 * @to: the kernel space buffer to read to 640 * @to: the kernel space buffer to read to
641 * @count: the maximum number of bytes to read 641 * @count: the maximum number of bytes to read
642 * @ppos: the current position in the buffer 642 * @ppos: the current position in the buffer
643 * @from: the buffer to read from 643 * @from: the buffer to read from
644 * @available: the size of the buffer 644 * @available: the size of the buffer
645 * 645 *
646 * The memory_read_from_buffer() function reads up to @count bytes from the 646 * The memory_read_from_buffer() function reads up to @count bytes from the
647 * buffer @from at offset @ppos into the kernel space address starting at @to. 647 * buffer @from at offset @ppos into the kernel space address starting at @to.
648 * 648 *
649 * On success, the number of bytes read is returned and the offset @ppos is 649 * On success, the number of bytes read is returned and the offset @ppos is
650 * advanced by this number, or a negative value is returned on error. 650 * advanced by this number, or a negative value is returned on error.
651 **/ 651 **/
652 ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, 652 ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
653 const void *from, size_t available) 653 const void *from, size_t available)
654 { 654 {
655 loff_t pos = *ppos; 655 loff_t pos = *ppos;
656 656
657 if (pos < 0) 657 if (pos < 0)
658 return -EINVAL; 658 return -EINVAL;
659 if (pos >= available) 659 if (pos >= available)
660 return 0; 660 return 0;
661 if (count > available - pos) 661 if (count > available - pos)
662 count = available - pos; 662 count = available - pos;
663 memcpy(to, from + pos, count); 663 memcpy(to, from + pos, count);
664 *ppos = pos + count; 664 *ppos = pos + count;
665 665
666 return count; 666 return count;
667 } 667 }
668 668
669 /* 669 /*
670 * Transaction-based IO. 670 * Transaction-based IO.
671 * The file expects a single write which triggers the transaction, and then 671 * The file expects a single write which triggers the transaction, and then
672 * possibly a read that collects the result, which is stored in a 672 * possibly a read that collects the result, which is stored in a
673 * file-local buffer. 673 * file-local buffer.
674 */ 674 */
675 675
676 void simple_transaction_set(struct file *file, size_t n) 676 void simple_transaction_set(struct file *file, size_t n)
677 { 677 {
678 struct simple_transaction_argresp *ar = file->private_data; 678 struct simple_transaction_argresp *ar = file->private_data;
679 679
680 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); 680 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
681 681
682 /* 682 /*
683 * The barrier ensures that ar->size will really remain zero until 683 * The barrier ensures that ar->size will really remain zero until
684 * ar->data is ready for reading. 684 * ar->data is ready for reading.
685 */ 685 */
686 smp_mb(); 686 smp_mb();
687 ar->size = n; 687 ar->size = n;
688 } 688 }
689 689
690 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) 690 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
691 { 691 {
692 struct simple_transaction_argresp *ar; 692 struct simple_transaction_argresp *ar;
693 static DEFINE_SPINLOCK(simple_transaction_lock); 693 static DEFINE_SPINLOCK(simple_transaction_lock);
694 694
695 if (size > SIMPLE_TRANSACTION_LIMIT - 1) 695 if (size > SIMPLE_TRANSACTION_LIMIT - 1)
696 return ERR_PTR(-EFBIG); 696 return ERR_PTR(-EFBIG);
697 697
698 ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); 698 ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
699 if (!ar) 699 if (!ar)
700 return ERR_PTR(-ENOMEM); 700 return ERR_PTR(-ENOMEM);
701 701
702 spin_lock(&simple_transaction_lock); 702 spin_lock(&simple_transaction_lock);
703 703
704 /* only one write allowed per open */ 704 /* only one write allowed per open */
705 if (file->private_data) { 705 if (file->private_data) {
706 spin_unlock(&simple_transaction_lock); 706 spin_unlock(&simple_transaction_lock);
707 free_page((unsigned long)ar); 707 free_page((unsigned long)ar);
708 return ERR_PTR(-EBUSY); 708 return ERR_PTR(-EBUSY);
709 } 709 }
710 710
711 file->private_data = ar; 711 file->private_data = ar;
712 712
713 spin_unlock(&simple_transaction_lock); 713 spin_unlock(&simple_transaction_lock);
714 714
715 if (copy_from_user(ar->data, buf, size)) 715 if (copy_from_user(ar->data, buf, size))
716 return ERR_PTR(-EFAULT); 716 return ERR_PTR(-EFAULT);
717 717
718 return ar->data; 718 return ar->data;
719 } 719 }
720 720
721 ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 721 ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
722 { 722 {
723 struct simple_transaction_argresp *ar = file->private_data; 723 struct simple_transaction_argresp *ar = file->private_data;
724 724
725 if (!ar) 725 if (!ar)
726 return 0; 726 return 0;
727 return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); 727 return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
728 } 728 }
729 729
730 int simple_transaction_release(struct inode *inode, struct file *file) 730 int simple_transaction_release(struct inode *inode, struct file *file)
731 { 731 {
732 free_page((unsigned long)file->private_data); 732 free_page((unsigned long)file->private_data);
733 return 0; 733 return 0;
734 } 734 }
735 735
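Putting the transaction pieces together, a hedged sketch in the style of fs/nfsd/nfsctl.c; example_handle_request() is a hypothetical function that consumes the request and writes its reply back into the same page:

static ssize_t example_transaction_write(struct file *file,
			const char __user *buf, size_t size, loff_t *pos)
{
	ssize_t rv;
	char *data = simple_transaction_get(file, buf, size);

	if (IS_ERR(data))
		return PTR_ERR(data);

	rv = example_handle_request(data, size);	/* hypothetical */
	if (rv >= 0) {
		/* publish rv bytes of reply for the subsequent read */
		simple_transaction_set(file, rv);
		rv = size;
	}
	return rv;
}

static const struct file_operations example_transaction_ops = {
	.write		= example_transaction_write,
	.read		= simple_transaction_read,
	.release	= simple_transaction_release,
	.llseek		= default_llseek,
};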
736 /* Simple attribute files */ 736 /* Simple attribute files */
737 737
738 struct simple_attr { 738 struct simple_attr {
739 int (*get)(void *, u64 *); 739 int (*get)(void *, u64 *);
740 int (*set)(void *, u64); 740 int (*set)(void *, u64);
741 char get_buf[24]; /* enough to store a u64 and "\n\0" */ 741 char get_buf[24]; /* enough to store a u64 and "\n\0" */
742 char set_buf[24]; 742 char set_buf[24];
743 void *data; 743 void *data;
744 const char *fmt; /* format for read operation */ 744 const char *fmt; /* format for read operation */
745 struct mutex mutex; /* protects access to these buffers */ 745 struct mutex mutex; /* protects access to these buffers */
746 }; 746 };
747 747
748 /* simple_attr_open is called by an actual attribute open file operation 748 /* simple_attr_open is called by an actual attribute open file operation
749 * to set the attribute-specific access operations. */ 749 * to set the attribute-specific access operations. */
750 int simple_attr_open(struct inode *inode, struct file *file, 750 int simple_attr_open(struct inode *inode, struct file *file,
751 int (*get)(void *, u64 *), int (*set)(void *, u64), 751 int (*get)(void *, u64 *), int (*set)(void *, u64),
752 const char *fmt) 752 const char *fmt)
753 { 753 {
754 struct simple_attr *attr; 754 struct simple_attr *attr;
755 755
756 attr = kmalloc(sizeof(*attr), GFP_KERNEL); 756 attr = kmalloc(sizeof(*attr), GFP_KERNEL);
757 if (!attr) 757 if (!attr)
758 return -ENOMEM; 758 return -ENOMEM;
759 759
760 attr->get = get; 760 attr->get = get;
761 attr->set = set; 761 attr->set = set;
762 attr->data = inode->i_private; 762 attr->data = inode->i_private;
763 attr->fmt = fmt; 763 attr->fmt = fmt;
764 mutex_init(&attr->mutex); 764 mutex_init(&attr->mutex);
765 765
766 file->private_data = attr; 766 file->private_data = attr;
767 767
768 return nonseekable_open(inode, file); 768 return nonseekable_open(inode, file);
769 } 769 }
770 770
771 int simple_attr_release(struct inode *inode, struct file *file) 771 int simple_attr_release(struct inode *inode, struct file *file)
772 { 772 {
773 kfree(file->private_data); 773 kfree(file->private_data);
774 return 0; 774 return 0;
775 } 775 }
776 776
777 /* read from the buffer that is filled with the get function */ 777 /* read from the buffer that is filled with the get function */
778 ssize_t simple_attr_read(struct file *file, char __user *buf, 778 ssize_t simple_attr_read(struct file *file, char __user *buf,
779 size_t len, loff_t *ppos) 779 size_t len, loff_t *ppos)
780 { 780 {
781 struct simple_attr *attr; 781 struct simple_attr *attr;
782 size_t size; 782 size_t size;
783 ssize_t ret; 783 ssize_t ret;
784 784
785 attr = file->private_data; 785 attr = file->private_data;
786 786
787 if (!attr->get) 787 if (!attr->get)
788 return -EACCES; 788 return -EACCES;
789 789
790 ret = mutex_lock_interruptible(&attr->mutex); 790 ret = mutex_lock_interruptible(&attr->mutex);
791 if (ret) 791 if (ret)
792 return ret; 792 return ret;
793 793
794 if (*ppos) { /* continued read */ 794 if (*ppos) { /* continued read */
795 size = strlen(attr->get_buf); 795 size = strlen(attr->get_buf);
796 } else { /* first read */ 796 } else { /* first read */
797 u64 val; 797 u64 val;
798 ret = attr->get(attr->data, &val); 798 ret = attr->get(attr->data, &val);
799 if (ret) 799 if (ret)
800 goto out; 800 goto out;
801 801
802 size = scnprintf(attr->get_buf, sizeof(attr->get_buf), 802 size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
803 attr->fmt, (unsigned long long)val); 803 attr->fmt, (unsigned long long)val);
804 } 804 }
805 805
806 ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); 806 ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
807 out: 807 out:
808 mutex_unlock(&attr->mutex); 808 mutex_unlock(&attr->mutex);
809 return ret; 809 return ret;
810 } 810 }
811 811
812 /* interpret the buffer as a number to call the set function with */ 812 /* interpret the buffer as a number to call the set function with */
813 ssize_t simple_attr_write(struct file *file, const char __user *buf, 813 ssize_t simple_attr_write(struct file *file, const char __user *buf,
814 size_t len, loff_t *ppos) 814 size_t len, loff_t *ppos)
815 { 815 {
816 struct simple_attr *attr; 816 struct simple_attr *attr;
817 u64 val; 817 u64 val;
818 size_t size; 818 size_t size;
819 ssize_t ret; 819 ssize_t ret;
820 820
821 attr = file->private_data; 821 attr = file->private_data;
822 if (!attr->set) 822 if (!attr->set)
823 return -EACCES; 823 return -EACCES;
824 824
825 ret = mutex_lock_interruptible(&attr->mutex); 825 ret = mutex_lock_interruptible(&attr->mutex);
826 if (ret) 826 if (ret)
827 return ret; 827 return ret;
828 828
829 ret = -EFAULT; 829 ret = -EFAULT;
830 size = min(sizeof(attr->set_buf) - 1, len); 830 size = min(sizeof(attr->set_buf) - 1, len);
831 if (copy_from_user(attr->set_buf, buf, size)) 831 if (copy_from_user(attr->set_buf, buf, size))
832 goto out; 832 goto out;
833 833
834 attr->set_buf[size] = '\0'; 834 attr->set_buf[size] = '\0';
835 val = simple_strtoll(attr->set_buf, NULL, 0); 835 val = simple_strtoll(attr->set_buf, NULL, 0);
836 ret = attr->set(attr->data, val); 836 ret = attr->set(attr->data, val);
837 if (ret == 0) 837 if (ret == 0)
838 ret = len; /* on success, claim we got the whole input */ 838 ret = len; /* on success, claim we got the whole input */
839 out: 839 out:
840 mutex_unlock(&attr->mutex); 840 mutex_unlock(&attr->mutex);
841 return ret; 841 return ret;
842 } 842 }
843 843
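In practice these four functions are generated rather than open-coded: the DEFINE_SIMPLE_ATTRIBUTE() macro in <linux/fs.h> wraps simple_attr_open/release/read/write into a complete file_operations. A sketch with invented names:

static u64 example_threshold;

static int example_threshold_get(void *data, u64 *val)
{
	*val = example_threshold;
	return 0;
}

static int example_threshold_set(void *data, u64 val)
{
	example_threshold = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(example_threshold_fops, example_threshold_get,
			example_threshold_set, "%llu\n");

The resulting example_threshold_fops would then typically be handed to debugfs_create_file().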
844 /** 844 /**
845 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation 845 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
846 * @sb: filesystem to do the file handle conversion on 846 * @sb: filesystem to do the file handle conversion on
847 * @fid: file handle to convert 847 * @fid: file handle to convert
848 * @fh_len: length of the file handle in bytes 848 * @fh_len: length of the file handle in bytes
849 * @fh_type: type of file handle 849 * @fh_type: type of file handle
850 * @get_inode: filesystem callback to retrieve inode 850 * @get_inode: filesystem callback to retrieve inode
851 * 851 *
852 * This function decodes @fid as long as it has one of the well-known 852 * This function decodes @fid as long as it has one of the well-known
853 * Linux filehandle types and calls @get_inode on it to retrieve the 853 * Linux filehandle types and calls @get_inode on it to retrieve the
854 * inode for the object specified in the file handle. 854 * inode for the object specified in the file handle.
855 */ 855 */
856 struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, 856 struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
857 int fh_len, int fh_type, struct inode *(*get_inode) 857 int fh_len, int fh_type, struct inode *(*get_inode)
858 (struct super_block *sb, u64 ino, u32 gen)) 858 (struct super_block *sb, u64 ino, u32 gen))
859 { 859 {
860 struct inode *inode = NULL; 860 struct inode *inode = NULL;
861 861
862 if (fh_len < 2) 862 if (fh_len < 2)
863 return NULL; 863 return NULL;
864 864
865 switch (fh_type) { 865 switch (fh_type) {
866 case FILEID_INO32_GEN: 866 case FILEID_INO32_GEN:
867 case FILEID_INO32_GEN_PARENT: 867 case FILEID_INO32_GEN_PARENT:
868 inode = get_inode(sb, fid->i32.ino, fid->i32.gen); 868 inode = get_inode(sb, fid->i32.ino, fid->i32.gen);
869 break; 869 break;
870 } 870 }
871 871
872 return d_obtain_alias(inode); 872 return d_obtain_alias(inode);
873 } 873 }
874 EXPORT_SYMBOL_GPL(generic_fh_to_dentry); 874 EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
875 875
876 /** 876 /**
877 * generic_fh_to_parent - generic helper for the fh_to_parent export operation 877 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
878 * @sb: filesystem to do the file handle conversion on 878 * @sb: filesystem to do the file handle conversion on
879 * @fid: file handle to convert 879 * @fid: file handle to convert
880 * @fh_len: length of the file handle in bytes 880 * @fh_len: length of the file handle in bytes
881 * @fh_type: type of file handle 881 * @fh_type: type of file handle
882 * @get_inode: filesystem callback to retrieve inode 882 * @get_inode: filesystem callback to retrieve inode
883 * 883 *
884 * This function decodes @fid as long as it has one of the well-known 884 * This function decodes @fid as long as it has one of the well-known
885 * Linux filehandle types and calls @get_inode on it to retrieve the 885 * Linux filehandle types and calls @get_inode on it to retrieve the
886 * inode for the _parent_ object specified in the file handle if it 886 * inode for the _parent_ object specified in the file handle if it
887 * is specified in the file handle, or NULL otherwise. 887 * is specified in the file handle, or NULL otherwise.
888 */ 888 */
889 struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, 889 struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
890 int fh_len, int fh_type, struct inode *(*get_inode) 890 int fh_len, int fh_type, struct inode *(*get_inode)
891 (struct super_block *sb, u64 ino, u32 gen)) 891 (struct super_block *sb, u64 ino, u32 gen))
892 { 892 {
893 struct inode *inode = NULL; 893 struct inode *inode = NULL;
894 894
895 if (fh_len <= 2) 895 if (fh_len <= 2)
896 return NULL; 896 return NULL;
897 897
898 switch (fh_type) { 898 switch (fh_type) {
899 case FILEID_INO32_GEN_PARENT: 899 case FILEID_INO32_GEN_PARENT:
900 inode = get_inode(sb, fid->i32.parent_ino, 900 inode = get_inode(sb, fid->i32.parent_ino,
901 (fh_len > 3 ? fid->i32.parent_gen : 0)); 901 (fh_len > 3 ? fid->i32.parent_gen : 0));
902 break; 902 break;
903 } 903 }
904 904
905 return d_obtain_alias(inode); 905 return d_obtain_alias(inode);
906 } 906 }
907 EXPORT_SYMBOL_GPL(generic_fh_to_parent); 907 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
908 908
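Filesystems wire these two helpers up through thin wrappers that supply their own inode lookup callback, as ext2 does; a sketch in which examplefs_nfs_get_inode() is an assumed callback with the (sb, ino, gen) signature shown above:

static struct dentry *examplefs_fh_to_dentry(struct super_block *sb,
			struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
			examplefs_nfs_get_inode);
}

static struct dentry *examplefs_fh_to_parent(struct super_block *sb,
			struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
			examplefs_nfs_get_inode);
}

static const struct export_operations examplefs_export_ops = {
	.fh_to_dentry	= examplefs_fh_to_dentry,
	.fh_to_parent	= examplefs_fh_to_parent,
};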
909 /** 909 /**
910 * generic_file_fsync - generic fsync implementation for simple filesystems 910 * generic_file_fsync - generic fsync implementation for simple filesystems
911 * @file: file to synchronize 911 * @file: file to synchronize
912 * @start: start offset of the byte range to synchronize 912 * @start: start offset of the byte range to synchronize
913 * @end: end offset of the byte range to synchronize (inclusive) 913 * @end: end offset of the byte range to synchronize (inclusive)
914 * @datasync: only synchronize essential metadata if true 914 * @datasync: only synchronize essential metadata if true
915 * 915 *
914 * This is a generic implementation of the fsync method for simple 914 * This is a generic implementation of the fsync method for simple
915 * filesystems which track all non-inode metadata in the buffers list 915 * filesystems which track all non-inode metadata in the buffers list
916 * hanging off the address_space structure. 916 * hanging off the address_space structure.
917 */ 917 */
918 int generic_file_fsync(struct file *file, loff_t start, loff_t end, 918 int generic_file_fsync(struct file *file, loff_t start, loff_t end,
919 int datasync) 919 int datasync)
920 { 920 {
921 struct inode *inode = file->f_mapping->host; 921 struct inode *inode = file->f_mapping->host;
922 int err; 922 int err;
923 int ret; 923 int ret;
924 924
925 err = filemap_write_and_wait_range(inode->i_mapping, start, end); 925 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
926 if (err) 926 if (err)
927 return err; 927 return err;
928 928
929 mutex_lock(&inode->i_mutex); 929 mutex_lock(&inode->i_mutex);
930 ret = sync_mapping_buffers(inode->i_mapping); 930 ret = sync_mapping_buffers(inode->i_mapping);
931 if (!(inode->i_state & I_DIRTY)) 931 if (!(inode->i_state & I_DIRTY))
932 goto out; 932 goto out;
933 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 933 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
934 goto out; 934 goto out;
935 935
936 err = sync_inode_metadata(inode, 1); 936 err = sync_inode_metadata(inode, 1);
937 if (ret == 0) 937 if (ret == 0)
938 ret = err; 938 ret = err;
939 out: 939 out:
940 mutex_unlock(&inode->i_mutex); 940 mutex_unlock(&inode->i_mutex);
941 return ret; 941 return ret;
942 } 942 }
943 EXPORT_SYMBOL(generic_file_fsync); 943 EXPORT_SYMBOL(generic_file_fsync);
944 944
945 /** 945 /**
946 * generic_check_addressable - Check addressability of file system 946 * generic_check_addressable - Check addressability of file system
947 * @blocksize_bits: log of file system block size 947 * @blocksize_bits: log of file system block size
948 * @num_blocks: number of blocks in file system 948 * @num_blocks: number of blocks in file system
949 * 949 *
950 * Determine whether a file system with @num_blocks blocks (and a 950 * Determine whether a file system with @num_blocks blocks (and a
951 * block size of 2**@blocksize_bits) is addressable by the sector_t 951 * block size of 2**@blocksize_bits) is addressable by the sector_t
952 * and page cache of the system. Return 0 if so and -EFBIG otherwise. 952 * and page cache of the system. Return 0 if so and -EFBIG otherwise.
953 */ 953 */
954 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) 954 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
955 { 955 {
956 u64 last_fs_block = num_blocks - 1; 956 u64 last_fs_block = num_blocks - 1;
957 u64 last_fs_page = 957 u64 last_fs_page =
958 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); 958 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
959 959
960 if (unlikely(num_blocks == 0)) 960 if (unlikely(num_blocks == 0))
961 return 0; 961 return 0;
962 962
963 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) 963 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
964 return -EINVAL; 964 return -EINVAL;
965 965
966 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || 966 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
967 (last_fs_page > (pgoff_t)(~0ULL))) { 967 (last_fs_page > (pgoff_t)(~0ULL))) {
968 return -EFBIG; 968 return -EFBIG;
969 } 969 }
970 return 0; 970 return 0;
971 } 971 }
972 EXPORT_SYMBOL(generic_check_addressable); 972 EXPORT_SYMBOL(generic_check_addressable);
973 973
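A typical call site sits in the mount path, before any I/O is issued; a sketch in which struct examplefs_super_block and its s_blocks_count field stand in for a filesystem's on-disk superblock:

static int examplefs_check_size(struct super_block *sb,
			struct examplefs_super_block *es)
{
	int err;

	/* refuse to mount a volume the page cache / sector_t can't address */
	err = generic_check_addressable(sb->s_blocksize_bits,
			le64_to_cpu(es->s_blocks_count));
	if (err)
		printk(KERN_ERR "examplefs: filesystem too large to mount\n");
	return err;
}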
974 /* 974 /*
975 * No-op implementation of ->fsync for in-memory filesystems. 975 * No-op implementation of ->fsync for in-memory filesystems.
976 */ 976 */
977 int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) 977 int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
978 { 978 {
979 return 0; 979 return 0;
980 } 980 }
981 981
982 EXPORT_SYMBOL(dcache_dir_close); 982 EXPORT_SYMBOL(dcache_dir_close);
983 EXPORT_SYMBOL(dcache_dir_lseek); 983 EXPORT_SYMBOL(dcache_dir_lseek);
984 EXPORT_SYMBOL(dcache_dir_open); 984 EXPORT_SYMBOL(dcache_dir_open);
985 EXPORT_SYMBOL(dcache_readdir); 985 EXPORT_SYMBOL(dcache_readdir);
986 EXPORT_SYMBOL(generic_read_dir); 986 EXPORT_SYMBOL(generic_read_dir);
987 EXPORT_SYMBOL(mount_pseudo); 987 EXPORT_SYMBOL(mount_pseudo);
988 EXPORT_SYMBOL(simple_write_begin); 988 EXPORT_SYMBOL(simple_write_begin);
989 EXPORT_SYMBOL(simple_write_end); 989 EXPORT_SYMBOL(simple_write_end);
990 EXPORT_SYMBOL(simple_dir_inode_operations); 990 EXPORT_SYMBOL(simple_dir_inode_operations);
991 EXPORT_SYMBOL(simple_dir_operations); 991 EXPORT_SYMBOL(simple_dir_operations);
992 EXPORT_SYMBOL(simple_empty); 992 EXPORT_SYMBOL(simple_empty);
993 EXPORT_SYMBOL(simple_fill_super); 993 EXPORT_SYMBOL(simple_fill_super);
994 EXPORT_SYMBOL(simple_getattr); 994 EXPORT_SYMBOL(simple_getattr);
995 EXPORT_SYMBOL(simple_open); 995 EXPORT_SYMBOL(simple_open);
996 EXPORT_SYMBOL(simple_link); 996 EXPORT_SYMBOL(simple_link);
997 EXPORT_SYMBOL(simple_lookup); 997 EXPORT_SYMBOL(simple_lookup);
998 EXPORT_SYMBOL(simple_pin_fs); 998 EXPORT_SYMBOL(simple_pin_fs);
999 EXPORT_SYMBOL(simple_readpage); 999 EXPORT_SYMBOL(simple_readpage);
1000 EXPORT_SYMBOL(simple_release_fs); 1000 EXPORT_SYMBOL(simple_release_fs);
1001 EXPORT_SYMBOL(simple_rename); 1001 EXPORT_SYMBOL(simple_rename);
1002 EXPORT_SYMBOL(simple_rmdir); 1002 EXPORT_SYMBOL(simple_rmdir);
1003 EXPORT_SYMBOL(simple_statfs); 1003 EXPORT_SYMBOL(simple_statfs);
1004 EXPORT_SYMBOL(noop_fsync); 1004 EXPORT_SYMBOL(noop_fsync);
1005 EXPORT_SYMBOL(simple_unlink); 1005 EXPORT_SYMBOL(simple_unlink);
1006 EXPORT_SYMBOL(simple_read_from_buffer); 1006 EXPORT_SYMBOL(simple_read_from_buffer);
1007 EXPORT_SYMBOL(simple_write_to_buffer); 1007 EXPORT_SYMBOL(simple_write_to_buffer);
1008 EXPORT_SYMBOL(memory_read_from_buffer); 1008 EXPORT_SYMBOL(memory_read_from_buffer);
1009 EXPORT_SYMBOL(simple_transaction_set); 1009 EXPORT_SYMBOL(simple_transaction_set);
1010 EXPORT_SYMBOL(simple_transaction_get); 1010 EXPORT_SYMBOL(simple_transaction_get);
1011 EXPORT_SYMBOL(simple_transaction_read); 1011 EXPORT_SYMBOL(simple_transaction_read);
1012 EXPORT_SYMBOL(simple_transaction_release); 1012 EXPORT_SYMBOL(simple_transaction_release);
1013 EXPORT_SYMBOL_GPL(simple_attr_open); 1013 EXPORT_SYMBOL_GPL(simple_attr_open);
1014 EXPORT_SYMBOL_GPL(simple_attr_release); 1014 EXPORT_SYMBOL_GPL(simple_attr_release);
1015 EXPORT_SYMBOL_GPL(simple_attr_read); 1015 EXPORT_SYMBOL_GPL(simple_attr_read);
1016 EXPORT_SYMBOL_GPL(simple_attr_write); 1016 EXPORT_SYMBOL_GPL(simple_attr_write);
1017 1017
1 /* 1 /*
2 * linux/fs/nfs/dir.c 2 * linux/fs/nfs/dir.c
3 * 3 *
4 * Copyright (C) 1992 Rick Sladkey 4 * Copyright (C) 1992 Rick Sladkey
5 * 5 *
6 * nfs directory handling functions 6 * nfs directory handling functions
7 * 7 *
8 * 10 Apr 1996 Added silly rename for unlink --okir 8 * 10 Apr 1996 Added silly rename for unlink --okir
9 * 28 Sep 1996 Improved directory cache --okir 9 * 28 Sep 1996 Improved directory cache --okir
10 * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de 10 * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de
11 * Re-implemented silly rename for unlink, newly implemented 11 * Re-implemented silly rename for unlink, newly implemented
12 * silly rename for nfs_rename() following the suggestions 12 * silly rename for nfs_rename() following the suggestions
13 * of Olaf Kirch (okir) found in this file. 13 * of Olaf Kirch (okir) found in this file.
14 * Following Linus comments on my original hack, this version 14 * Following Linus comments on my original hack, this version
15 * depends only on the dcache stuff and doesn't touch the inode 15 * depends only on the dcache stuff and doesn't touch the inode
16 * layer (iput() and friends). 16 * layer (iput() and friends).
17 * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM 17 * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM
18 */ 18 */
19 19
20 #include <linux/module.h> 20 #include <linux/module.h>
21 #include <linux/time.h> 21 #include <linux/time.h>
22 #include <linux/errno.h> 22 #include <linux/errno.h>
23 #include <linux/stat.h> 23 #include <linux/stat.h>
24 #include <linux/fcntl.h> 24 #include <linux/fcntl.h>
25 #include <linux/string.h> 25 #include <linux/string.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/mm.h> 28 #include <linux/mm.h>
29 #include <linux/sunrpc/clnt.h> 29 #include <linux/sunrpc/clnt.h>
30 #include <linux/nfs_fs.h> 30 #include <linux/nfs_fs.h>
31 #include <linux/nfs_mount.h> 31 #include <linux/nfs_mount.h>
32 #include <linux/pagemap.h> 32 #include <linux/pagemap.h>
33 #include <linux/pagevec.h> 33 #include <linux/pagevec.h>
34 #include <linux/namei.h> 34 #include <linux/namei.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/sched.h> 36 #include <linux/sched.h>
37 #include <linux/kmemleak.h> 37 #include <linux/kmemleak.h>
38 #include <linux/xattr.h> 38 #include <linux/xattr.h>
39 39
40 #include "delegation.h" 40 #include "delegation.h"
41 #include "iostat.h" 41 #include "iostat.h"
42 #include "internal.h" 42 #include "internal.h"
43 #include "fscache.h" 43 #include "fscache.h"
44 44
45 /* #define NFS_DEBUG_VERBOSE 1 */ 45 /* #define NFS_DEBUG_VERBOSE 1 */
46 46
47 static int nfs_opendir(struct inode *, struct file *); 47 static int nfs_opendir(struct inode *, struct file *);
48 static int nfs_closedir(struct inode *, struct file *); 48 static int nfs_closedir(struct inode *, struct file *);
49 static int nfs_readdir(struct file *, void *, filldir_t); 49 static int nfs_readdir(struct file *, void *, filldir_t);
50 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); 50 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
51 static loff_t nfs_llseek_dir(struct file *, loff_t, int); 51 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
52 static void nfs_readdir_clear_array(struct page*); 52 static void nfs_readdir_clear_array(struct page*);
53 53
54 const struct file_operations nfs_dir_operations = { 54 const struct file_operations nfs_dir_operations = {
55 .llseek = nfs_llseek_dir, 55 .llseek = nfs_llseek_dir,
56 .read = generic_read_dir, 56 .read = generic_read_dir,
57 .readdir = nfs_readdir, 57 .readdir = nfs_readdir,
58 .open = nfs_opendir, 58 .open = nfs_opendir,
59 .release = nfs_closedir, 59 .release = nfs_closedir,
60 .fsync = nfs_fsync_dir, 60 .fsync = nfs_fsync_dir,
61 }; 61 };
62 62
63 const struct address_space_operations nfs_dir_aops = { 63 const struct address_space_operations nfs_dir_aops = {
64 .freepage = nfs_readdir_clear_array, 64 .freepage = nfs_readdir_clear_array,
65 }; 65 };
66 66
67 static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) 67 static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
68 { 68 {
69 struct nfs_open_dir_context *ctx; 69 struct nfs_open_dir_context *ctx;
70 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 70 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
71 if (ctx != NULL) { 71 if (ctx != NULL) {
72 ctx->duped = 0; 72 ctx->duped = 0;
73 ctx->attr_gencount = NFS_I(dir)->attr_gencount; 73 ctx->attr_gencount = NFS_I(dir)->attr_gencount;
74 ctx->dir_cookie = 0; 74 ctx->dir_cookie = 0;
75 ctx->dup_cookie = 0; 75 ctx->dup_cookie = 0;
76 ctx->cred = get_rpccred(cred); 76 ctx->cred = get_rpccred(cred);
77 return ctx; 77 return ctx;
78 } 78 }
79 return ERR_PTR(-ENOMEM); 79 return ERR_PTR(-ENOMEM);
80 } 80 }
81 81
82 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) 82 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
83 { 83 {
84 put_rpccred(ctx->cred); 84 put_rpccred(ctx->cred);
85 kfree(ctx); 85 kfree(ctx);
86 } 86 }
87 87
88 /* 88 /*
89 * Open file 89 * Open file
90 */ 90 */
91 static int 91 static int
92 nfs_opendir(struct inode *inode, struct file *filp) 92 nfs_opendir(struct inode *inode, struct file *filp)
93 { 93 {
94 int res = 0; 94 int res = 0;
95 struct nfs_open_dir_context *ctx; 95 struct nfs_open_dir_context *ctx;
96 struct rpc_cred *cred; 96 struct rpc_cred *cred;
97 97
98 dfprintk(FILE, "NFS: open dir(%s/%s)\n", 98 dfprintk(FILE, "NFS: open dir(%s/%s)\n",
99 filp->f_path.dentry->d_parent->d_name.name, 99 filp->f_path.dentry->d_parent->d_name.name,
100 filp->f_path.dentry->d_name.name); 100 filp->f_path.dentry->d_name.name);
101 101
102 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 102 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
103 103
104 cred = rpc_lookup_cred(); 104 cred = rpc_lookup_cred();
105 if (IS_ERR(cred)) 105 if (IS_ERR(cred))
106 return PTR_ERR(cred); 106 return PTR_ERR(cred);
107 ctx = alloc_nfs_open_dir_context(inode, cred); 107 ctx = alloc_nfs_open_dir_context(inode, cred);
108 if (IS_ERR(ctx)) { 108 if (IS_ERR(ctx)) {
109 res = PTR_ERR(ctx); 109 res = PTR_ERR(ctx);
110 goto out; 110 goto out;
111 } 111 }
112 filp->private_data = ctx; 112 filp->private_data = ctx;
113 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { 113 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
114 /* This is a mountpoint, so d_revalidate will never 114 /* This is a mountpoint, so d_revalidate will never
115 * have been called, so we need to refresh the 115 * have been called, so we need to refresh the
116 * inode (for close-open consistency) ourselves. 116 * inode (for close-open consistency) ourselves.
117 */ 117 */
118 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 118 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
119 } 119 }
120 out: 120 out:
121 put_rpccred(cred); 121 put_rpccred(cred);
122 return res; 122 return res;
123 } 123 }
124 124
125 static int 125 static int
126 nfs_closedir(struct inode *inode, struct file *filp) 126 nfs_closedir(struct inode *inode, struct file *filp)
127 { 127 {
128 put_nfs_open_dir_context(filp->private_data); 128 put_nfs_open_dir_context(filp->private_data);
129 return 0; 129 return 0;
130 } 130 }
131 131
132 struct nfs_cache_array_entry { 132 struct nfs_cache_array_entry {
133 u64 cookie; 133 u64 cookie;
134 u64 ino; 134 u64 ino;
135 struct qstr string; 135 struct qstr string;
136 unsigned char d_type; 136 unsigned char d_type;
137 }; 137 };
138 138
139 struct nfs_cache_array { 139 struct nfs_cache_array {
140 int size; 140 int size;
141 int eof_index; 141 int eof_index;
142 u64 last_cookie; 142 u64 last_cookie;
143 struct nfs_cache_array_entry array[0]; 143 struct nfs_cache_array_entry array[0];
144 }; 144 };
145 145
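A small illustrative helper, not in the patch, making explicit how many entries the zero-length array above can hold per page:

/* Illustrative only: entries that fit in one cache page.
 * nfs_readdir_add_to_array() below performs the equivalent bound check. */
static inline unsigned int nfs_readdir_array_capacity(void)
{
	return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
		sizeof(struct nfs_cache_array_entry);
}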
146 typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); 146 typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
147 typedef struct { 147 typedef struct {
148 struct file *file; 148 struct file *file;
149 struct page *page; 149 struct page *page;
150 unsigned long page_index; 150 unsigned long page_index;
151 u64 *dir_cookie; 151 u64 *dir_cookie;
152 u64 last_cookie; 152 u64 last_cookie;
153 loff_t current_index; 153 loff_t current_index;
154 decode_dirent_t decode; 154 decode_dirent_t decode;
155 155
156 unsigned long timestamp; 156 unsigned long timestamp;
157 unsigned long gencount; 157 unsigned long gencount;
158 unsigned int cache_entry_index; 158 unsigned int cache_entry_index;
159 unsigned int plus:1; 159 unsigned int plus:1;
160 unsigned int eof:1; 160 unsigned int eof:1;
161 } nfs_readdir_descriptor_t; 161 } nfs_readdir_descriptor_t;
162 162
163 /* 163 /*
164 * The caller is responsible for calling nfs_readdir_release_array(page) 164 * The caller is responsible for calling nfs_readdir_release_array(page)
165 */ 165 */
166 static 166 static
167 struct nfs_cache_array *nfs_readdir_get_array(struct page *page) 167 struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
168 { 168 {
169 void *ptr; 169 void *ptr;
170 if (page == NULL) 170 if (page == NULL)
171 return ERR_PTR(-EIO); 171 return ERR_PTR(-EIO);
172 ptr = kmap(page); 172 ptr = kmap(page);
173 if (ptr == NULL) 173 if (ptr == NULL)
174 return ERR_PTR(-ENOMEM); 174 return ERR_PTR(-ENOMEM);
175 return ptr; 175 return ptr;
176 } 176 }
177 177
178 static 178 static
179 void nfs_readdir_release_array(struct page *page) 179 void nfs_readdir_release_array(struct page *page)
180 { 180 {
181 kunmap(page); 181 kunmap(page);
182 } 182 }
183 183
184 /* 184 /*
185 * we are freeing strings created by nfs_readdir_add_to_array() 185 * we are freeing strings created by nfs_readdir_add_to_array()
186 */ 186 */
187 static 187 static
188 void nfs_readdir_clear_array(struct page *page) 188 void nfs_readdir_clear_array(struct page *page)
189 { 189 {
190 struct nfs_cache_array *array; 190 struct nfs_cache_array *array;
191 int i; 191 int i;
192 192
193 array = kmap_atomic(page); 193 array = kmap_atomic(page);
194 for (i = 0; i < array->size; i++) 194 for (i = 0; i < array->size; i++)
195 kfree(array->array[i].string.name); 195 kfree(array->array[i].string.name);
196 kunmap_atomic(array); 196 kunmap_atomic(array);
197 } 197 }
198 198
199 /* 199 /*
200 * the caller is responsible for freeing qstr.name 200 * the caller is responsible for freeing qstr.name
201 * when called by nfs_readdir_add_to_array, the strings will be freed in 201 * when called by nfs_readdir_add_to_array, the strings will be freed in
202 * nfs_readdir_clear_array() 202 * nfs_readdir_clear_array()
203 */ 203 */
204 static 204 static
205 int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) 205 int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
206 { 206 {
207 string->len = len; 207 string->len = len;
208 string->name = kmemdup(name, len, GFP_KERNEL); 208 string->name = kmemdup(name, len, GFP_KERNEL);
209 if (string->name == NULL) 209 if (string->name == NULL)
210 return -ENOMEM; 210 return -ENOMEM;
211 /* 211 /*
212 * Avoid a kmemleak false positive. The pointer to the name is stored 212 * Avoid a kmemleak false positive. The pointer to the name is stored
213 * in a page cache page which kmemleak does not scan. 213 * in a page cache page which kmemleak does not scan.
214 */ 214 */
215 kmemleak_not_leak(string->name); 215 kmemleak_not_leak(string->name);
216 string->hash = full_name_hash(name, len); 216 string->hash = full_name_hash(name, len);
217 return 0; 217 return 0;
218 } 218 }
219 219
220 static 220 static
221 int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) 221 int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
222 { 222 {
223 struct nfs_cache_array *array = nfs_readdir_get_array(page); 223 struct nfs_cache_array *array = nfs_readdir_get_array(page);
224 struct nfs_cache_array_entry *cache_entry; 224 struct nfs_cache_array_entry *cache_entry;
225 int ret; 225 int ret;
226 226
227 if (IS_ERR(array)) 227 if (IS_ERR(array))
228 return PTR_ERR(array); 228 return PTR_ERR(array);
229 229
230 cache_entry = &array->array[array->size]; 230 cache_entry = &array->array[array->size];
231 231
232 /* Check that this entry lies within the page bounds */ 232 /* Check that this entry lies within the page bounds */
233 ret = -ENOSPC; 233 ret = -ENOSPC;
234 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) 234 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
235 goto out; 235 goto out;
236 236
237 cache_entry->cookie = entry->prev_cookie; 237 cache_entry->cookie = entry->prev_cookie;
238 cache_entry->ino = entry->ino; 238 cache_entry->ino = entry->ino;
239 cache_entry->d_type = entry->d_type; 239 cache_entry->d_type = entry->d_type;
240 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); 240 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
241 if (ret) 241 if (ret)
242 goto out; 242 goto out;
243 array->last_cookie = entry->cookie; 243 array->last_cookie = entry->cookie;
244 array->size++; 244 array->size++;
245 if (entry->eof != 0) 245 if (entry->eof != 0)
246 array->eof_index = array->size; 246 array->eof_index = array->size;
247 out: 247 out:
248 nfs_readdir_release_array(page); 248 nfs_readdir_release_array(page);
249 return ret; 249 return ret;
250 } 250 }
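The page-bounds check above is what caps an array: entries are appended until the next slot would cross PAGE_SIZE, at which point -ENOSPC tells the caller the page is full. Assuming array[] is the trailing member of struct nfs_cache_array (the definition is not shown here), the implied capacity is roughly:

	/* sketch: entries that fit in one cache page */
	max = (PAGE_SIZE - sizeof(struct nfs_cache_array))
		/ sizeof(struct nfs_cache_array_entry);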
251 251
252 static 252 static
253 int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 253 int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
254 { 254 {
255 loff_t diff = desc->file->f_pos - desc->current_index; 255 loff_t diff = desc->file->f_pos - desc->current_index;
256 unsigned int index; 256 unsigned int index;
257 257
258 if (diff < 0) 258 if (diff < 0)
259 goto out_eof; 259 goto out_eof;
260 if (diff >= array->size) { 260 if (diff >= array->size) {
261 if (array->eof_index >= 0) 261 if (array->eof_index >= 0)
262 goto out_eof; 262 goto out_eof;
263 return -EAGAIN; 263 return -EAGAIN;
264 } 264 }
265 265
266 index = (unsigned int)diff; 266 index = (unsigned int)diff;
267 *desc->dir_cookie = array->array[index].cookie; 267 *desc->dir_cookie = array->array[index].cookie;
268 desc->cache_entry_index = index; 268 desc->cache_entry_index = index;
269 return 0; 269 return 0;
270 out_eof: 270 out_eof:
271 desc->eof = 1; 271 desc->eof = 1;
272 return -EBADCOOKIE; 272 return -EBADCOOKIE;
273 } 273 }
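The position-to-slot mapping is plain arithmetic: desc->current_index is the entry number of slot 0 of this page's array, so the wanted slot is f_pos minus that base. A worked example with hypothetical numbers:

	/* f_pos = 250, current_index = 240, array->size = 20 */
	diff = 250 - 240;			/* slot 10, within this page */
	*desc->dir_cookie = array->array[10].cookie;
	/* f_pos = 265 would give diff = 25 >= size: -EAGAIN, try next page */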
274 274
275 static 275 static
276 int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 276 int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
277 { 277 {
278 int i; 278 int i;
279 loff_t new_pos; 279 loff_t new_pos;
280 int status = -EAGAIN; 280 int status = -EAGAIN;
281 281
282 for (i = 0; i < array->size; i++) { 282 for (i = 0; i < array->size; i++) {
283 if (array->array[i].cookie == *desc->dir_cookie) { 283 if (array->array[i].cookie == *desc->dir_cookie) {
284 struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); 284 struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
285 struct nfs_open_dir_context *ctx = desc->file->private_data; 285 struct nfs_open_dir_context *ctx = desc->file->private_data;
286 286
287 new_pos = desc->current_index + i; 287 new_pos = desc->current_index + i;
288 if (ctx->attr_gencount != nfsi->attr_gencount 288 if (ctx->attr_gencount != nfsi->attr_gencount
289 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { 289 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
290 ctx->duped = 0; 290 ctx->duped = 0;
291 ctx->attr_gencount = nfsi->attr_gencount; 291 ctx->attr_gencount = nfsi->attr_gencount;
292 } else if (new_pos < desc->file->f_pos) { 292 } else if (new_pos < desc->file->f_pos) {
293 if (ctx->duped > 0 293 if (ctx->duped > 0
294 && ctx->dup_cookie == *desc->dir_cookie) { 294 && ctx->dup_cookie == *desc->dir_cookie) {
295 if (printk_ratelimit()) { 295 if (printk_ratelimit()) {
296 pr_notice("NFS: directory %s/%s contains a readdir loop. " 296 pr_notice("NFS: directory %s/%s contains a readdir loop. "
297 "Please contact your server vendor. " 297 "Please contact your server vendor. "
298 "The file: %s has duplicate cookie %llu\n", 298 "The file: %s has duplicate cookie %llu\n",
299 desc->file->f_dentry->d_parent->d_name.name, 299 desc->file->f_dentry->d_parent->d_name.name,
300 desc->file->f_dentry->d_name.name, 300 desc->file->f_dentry->d_name.name,
301 array->array[i].string.name, 301 array->array[i].string.name,
302 *desc->dir_cookie); 302 *desc->dir_cookie);
303 } 303 }
304 status = -ELOOP; 304 status = -ELOOP;
305 goto out; 305 goto out;
306 } 306 }
307 ctx->dup_cookie = *desc->dir_cookie; 307 ctx->dup_cookie = *desc->dir_cookie;
308 ctx->duped = -1; 308 ctx->duped = -1;
309 } 309 }
310 desc->file->f_pos = new_pos; 310 desc->file->f_pos = new_pos;
311 desc->cache_entry_index = i; 311 desc->cache_entry_index = i;
312 return 0; 312 return 0;
313 } 313 }
314 } 314 }
315 if (array->eof_index >= 0) { 315 if (array->eof_index >= 0) {
316 status = -EBADCOOKIE; 316 status = -EBADCOOKIE;
317 if (*desc->dir_cookie == array->last_cookie) 317 if (*desc->dir_cookie == array->last_cookie)
318 desc->eof = 1; 318 desc->eof = 1;
319 } 319 }
320 out: 320 out:
321 return status; 321 return status;
322 } 322 }
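The loop detection above shares state with nfs_do_filldir() below through ctx->duped; read together, the field behaves as a three-state flag. A sketch of that reading (an interpretation of the code here, not an authoritative spec):

	/*
	 * ctx->duped ==  0: no suspected duplicate recorded
	 * ctx->duped == -1: a cookie reappeared at an earlier f_pos and was
	 *                   recorded in ctx->dup_cookie; nothing emitted yet
	 * ctx->duped ==  1: entries have been emitted since the record (set
	 *                   in nfs_do_filldir()); hitting the same cookie
	 *                   again now fails with -ELOOP
	 */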
323 323
324 static 324 static
325 int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) 325 int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
326 { 326 {
327 struct nfs_cache_array *array; 327 struct nfs_cache_array *array;
328 int status; 328 int status;
329 329
330 array = nfs_readdir_get_array(desc->page); 330 array = nfs_readdir_get_array(desc->page);
331 if (IS_ERR(array)) { 331 if (IS_ERR(array)) {
332 status = PTR_ERR(array); 332 status = PTR_ERR(array);
333 goto out; 333 goto out;
334 } 334 }
335 335
336 if (*desc->dir_cookie == 0) 336 if (*desc->dir_cookie == 0)
337 status = nfs_readdir_search_for_pos(array, desc); 337 status = nfs_readdir_search_for_pos(array, desc);
338 else 338 else
339 status = nfs_readdir_search_for_cookie(array, desc); 339 status = nfs_readdir_search_for_cookie(array, desc);
340 340
341 if (status == -EAGAIN) { 341 if (status == -EAGAIN) {
342 desc->last_cookie = array->last_cookie; 342 desc->last_cookie = array->last_cookie;
343 desc->current_index += array->size; 343 desc->current_index += array->size;
344 desc->page_index++; 344 desc->page_index++;
345 } 345 }
346 nfs_readdir_release_array(desc->page); 346 nfs_readdir_release_array(desc->page);
347 out: 347 out:
348 return status; 348 return status;
349 } 349 }
350 350
351 /* Fill a page with xdr information before transferring to the cache page */ 351 /* Fill a page with xdr information before transferring to the cache page */
352 static 352 static
353 int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, 353 int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
354 struct nfs_entry *entry, struct file *file, struct inode *inode) 354 struct nfs_entry *entry, struct file *file, struct inode *inode)
355 { 355 {
356 struct nfs_open_dir_context *ctx = file->private_data; 356 struct nfs_open_dir_context *ctx = file->private_data;
357 struct rpc_cred *cred = ctx->cred; 357 struct rpc_cred *cred = ctx->cred;
358 unsigned long timestamp, gencount; 358 unsigned long timestamp, gencount;
359 int error; 359 int error;
360 360
361 again: 361 again:
362 timestamp = jiffies; 362 timestamp = jiffies;
363 gencount = nfs_inc_attr_generation_counter(); 363 gencount = nfs_inc_attr_generation_counter();
364 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, 364 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
365 NFS_SERVER(inode)->dtsize, desc->plus); 365 NFS_SERVER(inode)->dtsize, desc->plus);
366 if (error < 0) { 366 if (error < 0) {
367 /* We requested READDIRPLUS, but the server doesn't grok it */ 367 /* We requested READDIRPLUS, but the server doesn't grok it */
368 if (error == -ENOTSUPP && desc->plus) { 368 if (error == -ENOTSUPP && desc->plus) {
369 NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; 369 NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
370 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 370 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
371 desc->plus = 0; 371 desc->plus = 0;
372 goto again; 372 goto again;
373 } 373 }
374 goto error; 374 goto error;
375 } 375 }
376 desc->timestamp = timestamp; 376 desc->timestamp = timestamp;
377 desc->gencount = gencount; 377 desc->gencount = gencount;
378 error: 378 error:
379 return error; 379 return error;
380 } 380 }
381 381
382 static int xdr_decode(nfs_readdir_descriptor_t *desc, 382 static int xdr_decode(nfs_readdir_descriptor_t *desc,
383 struct nfs_entry *entry, struct xdr_stream *xdr) 383 struct nfs_entry *entry, struct xdr_stream *xdr)
384 { 384 {
385 int error; 385 int error;
386 386
387 error = desc->decode(xdr, entry, desc->plus); 387 error = desc->decode(xdr, entry, desc->plus);
388 if (error) 388 if (error)
389 return error; 389 return error;
390 entry->fattr->time_start = desc->timestamp; 390 entry->fattr->time_start = desc->timestamp;
391 entry->fattr->gencount = desc->gencount; 391 entry->fattr->gencount = desc->gencount;
392 return 0; 392 return 0;
393 } 393 }
394 394
395 static 395 static
396 int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) 396 int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
397 { 397 {
398 if (dentry->d_inode == NULL) 398 if (dentry->d_inode == NULL)
399 goto different; 399 goto different;
400 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) 400 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
401 goto different; 401 goto different;
402 return 1; 402 return 1;
403 different: 403 different:
404 return 0; 404 return 0;
405 } 405 }
406 406
407 static 407 static
408 bool nfs_use_readdirplus(struct inode *dir, struct file *filp) 408 bool nfs_use_readdirplus(struct inode *dir, struct file *filp)
409 { 409 {
410 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) 410 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
411 return false; 411 return false;
412 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) 412 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
413 return true; 413 return true;
414 if (filp->f_pos == 0) 414 if (filp->f_pos == 0)
415 return true; 415 return true;
416 return false; 416 return false;
417 } 417 }
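Condensed, the policy is: use READDIRPLUS only against capable servers, and only when the lookup path has asked for it or a fresh traversal is starting. Note that the advise bit is consumed by test_and_clear_bit(), so one hint buys one readdir pass. As a one-line sketch (names hypothetical):

	use_plus = server_capable && (advise_bit_consumed || filp->f_pos == 0);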
418 418
419 /* 419 /*
420 * This function is called by the lookup code to request the use of 420 * This function is called by the lookup code to request the use of
421 * readdirplus to accelerate any future lookups in the same 421 * readdirplus to accelerate any future lookups in the same
422 * directory. 422 * directory.
423 */ 423 */
424 static 424 static
425 void nfs_advise_use_readdirplus(struct inode *dir) 425 void nfs_advise_use_readdirplus(struct inode *dir)
426 { 426 {
427 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); 427 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
428 } 428 }
429 429
430 static 430 static
431 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) 431 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
432 { 432 {
433 struct qstr filename = QSTR_INIT(entry->name, entry->len); 433 struct qstr filename = QSTR_INIT(entry->name, entry->len);
434 struct dentry *dentry; 434 struct dentry *dentry;
435 struct dentry *alias; 435 struct dentry *alias;
436 struct inode *dir = parent->d_inode; 436 struct inode *dir = parent->d_inode;
437 struct inode *inode; 437 struct inode *inode;
438 438
439 if (filename.name[0] == '.') { 439 if (filename.name[0] == '.') {
440 if (filename.len == 1) 440 if (filename.len == 1)
441 return; 441 return;
442 if (filename.len == 2 && filename.name[1] == '.') 442 if (filename.len == 2 && filename.name[1] == '.')
443 return; 443 return;
444 } 444 }
445 filename.hash = full_name_hash(filename.name, filename.len); 445 filename.hash = full_name_hash(filename.name, filename.len);
446 446
447 dentry = d_lookup(parent, &filename); 447 dentry = d_lookup(parent, &filename);
448 if (dentry != NULL) { 448 if (dentry != NULL) {
449 if (nfs_same_file(dentry, entry)) { 449 if (nfs_same_file(dentry, entry)) {
450 nfs_refresh_inode(dentry->d_inode, entry->fattr); 450 nfs_refresh_inode(dentry->d_inode, entry->fattr);
451 goto out; 451 goto out;
452 } else { 452 } else {
453 if (d_invalidate(dentry) != 0) 453 if (d_invalidate(dentry) != 0)
454 goto out; 454 goto out;
455 dput(dentry); 455 dput(dentry);
456 } 456 }
457 } 457 }
458 458
459 dentry = d_alloc(parent, &filename); 459 dentry = d_alloc(parent, &filename);
460 if (dentry == NULL) 460 if (dentry == NULL)
461 return; 461 return;
462 462
463 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 463 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
464 if (IS_ERR(inode)) 464 if (IS_ERR(inode))
465 goto out; 465 goto out;
466 466
467 alias = d_materialise_unique(dentry, inode); 467 alias = d_materialise_unique(dentry, inode);
468 if (IS_ERR(alias)) 468 if (IS_ERR(alias))
469 goto out; 469 goto out;
470 else if (alias) { 470 else if (alias) {
471 nfs_set_verifier(alias, nfs_save_change_attribute(dir)); 471 nfs_set_verifier(alias, nfs_save_change_attribute(dir));
472 dput(alias); 472 dput(alias);
473 } else 473 } else
474 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 474 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
475 475
476 out: 476 out:
477 dput(dentry); 477 dput(dentry);
478 } 478 }
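The '.'-handling at the top mirrors what every readdir consumer must do; pulled out on its own, the skip test is (hypothetical helper, equivalent to the checks above):

	/* hypothetical helper: true for "." and ".." */
	static bool is_dot_or_dotdot(const struct qstr *name)
	{
		return name->name[0] == '.' &&
		       (name->len == 1 ||
			(name->len == 2 && name->name[1] == '.'));
	}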
479 479
480 /* Perform conversion from xdr to cache array */ 480 /* Perform conversion from xdr to cache array */
481 static 481 static
482 int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, 482 int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
483 struct page **xdr_pages, struct page *page, unsigned int buflen) 483 struct page **xdr_pages, struct page *page, unsigned int buflen)
484 { 484 {
485 struct xdr_stream stream; 485 struct xdr_stream stream;
486 struct xdr_buf buf; 486 struct xdr_buf buf;
487 struct page *scratch; 487 struct page *scratch;
488 struct nfs_cache_array *array; 488 struct nfs_cache_array *array;
489 unsigned int count = 0; 489 unsigned int count = 0;
490 int status; 490 int status;
491 491
492 scratch = alloc_page(GFP_KERNEL); 492 scratch = alloc_page(GFP_KERNEL);
493 if (scratch == NULL) 493 if (scratch == NULL)
494 return -ENOMEM; 494 return -ENOMEM;
495 495
496 xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); 496 xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
497 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 497 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
498 498
499 do { 499 do {
500 status = xdr_decode(desc, entry, &stream); 500 status = xdr_decode(desc, entry, &stream);
501 if (status != 0) { 501 if (status != 0) {
502 if (status == -EAGAIN) 502 if (status == -EAGAIN)
503 status = 0; 503 status = 0;
504 break; 504 break;
505 } 505 }
506 506
507 count++; 507 count++;
508 508
509 if (desc->plus != 0) 509 if (desc->plus != 0)
510 nfs_prime_dcache(desc->file->f_path.dentry, entry); 510 nfs_prime_dcache(desc->file->f_path.dentry, entry);
511 511
512 status = nfs_readdir_add_to_array(entry, page); 512 status = nfs_readdir_add_to_array(entry, page);
513 if (status != 0) 513 if (status != 0)
514 break; 514 break;
515 } while (!entry->eof); 515 } while (!entry->eof);
516 516
517 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { 517 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
518 array = nfs_readdir_get_array(page); 518 array = nfs_readdir_get_array(page);
519 if (!IS_ERR(array)) { 519 if (!IS_ERR(array)) {
520 array->eof_index = array->size; 520 array->eof_index = array->size;
521 status = 0; 521 status = 0;
522 nfs_readdir_release_array(page); 522 nfs_readdir_release_array(page);
523 } else 523 } else
524 status = PTR_ERR(array); 524 status = PTR_ERR(array);
525 } 525 }
526 526
527 put_page(scratch); 527 put_page(scratch);
528 return status; 528 return status;
529 } 529 }
530 530
531 static 531 static
532 void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) 532 void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
533 { 533 {
534 unsigned int i; 534 unsigned int i;
535 for (i = 0; i < npages; i++) 535 for (i = 0; i < npages; i++)
536 put_page(pages[i]); 536 put_page(pages[i]);
537 } 537 }
538 538
539 static 539 static
540 void nfs_readdir_free_large_page(void *ptr, struct page **pages, 540 void nfs_readdir_free_large_page(void *ptr, struct page **pages,
541 unsigned int npages) 541 unsigned int npages)
542 { 542 {
543 nfs_readdir_free_pagearray(pages, npages); 543 nfs_readdir_free_pagearray(pages, npages);
544 } 544 }
545 545
546 /* 546 /*
547 * nfs_readdir_large_page will allocate pages that must be freed with a call 547 * nfs_readdir_large_page will allocate pages that must be freed with a call
548 * to nfs_readdir_free_large_page 548 * to nfs_readdir_free_large_page
549 */ 549 */
550 static 550 static
551 int nfs_readdir_large_page(struct page **pages, unsigned int npages) 551 int nfs_readdir_large_page(struct page **pages, unsigned int npages)
552 { 552 {
553 unsigned int i; 553 unsigned int i;
554 554
555 for (i = 0; i < npages; i++) { 555 for (i = 0; i < npages; i++) {
556 struct page *page = alloc_page(GFP_KERNEL); 556 struct page *page = alloc_page(GFP_KERNEL);
557 if (page == NULL) 557 if (page == NULL)
558 goto out_freepages; 558 goto out_freepages;
559 pages[i] = page; 559 pages[i] = page;
560 } 560 }
561 return 0; 561 return 0;
562 562
563 out_freepages: 563 out_freepages:
564 nfs_readdir_free_pagearray(pages, i); 564 nfs_readdir_free_pagearray(pages, i);
565 return -ENOMEM; 565 return -ENOMEM;
566 } 566 }
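As the comment above says, these pages must be released with nfs_readdir_free_large_page(); nfs_readdir_xdr_to_array() below is the one caller, and it passes a NULL ptr. A condensed sketch of the pairing:

	struct page *pages[NFS_MAX_READDIR_PAGES];

	if (nfs_readdir_large_page(pages, ARRAY_SIZE(pages)) < 0)
		return -ENOMEM;
	/* ... fill the pages with READDIR XDR replies ... */
	nfs_readdir_free_large_page(NULL, pages, ARRAY_SIZE(pages));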
567 567
568 static 568 static
569 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) 569 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
570 { 570 {
571 struct page *pages[NFS_MAX_READDIR_PAGES]; 571 struct page *pages[NFS_MAX_READDIR_PAGES];
572 void *pages_ptr = NULL; 572 void *pages_ptr = NULL;
573 struct nfs_entry entry; 573 struct nfs_entry entry;
574 struct file *file = desc->file; 574 struct file *file = desc->file;
575 struct nfs_cache_array *array; 575 struct nfs_cache_array *array;
576 int status = -ENOMEM; 576 int status = -ENOMEM;
577 unsigned int array_size = ARRAY_SIZE(pages); 577 unsigned int array_size = ARRAY_SIZE(pages);
578 578
579 entry.prev_cookie = 0; 579 entry.prev_cookie = 0;
580 entry.cookie = desc->last_cookie; 580 entry.cookie = desc->last_cookie;
581 entry.eof = 0; 581 entry.eof = 0;
582 entry.fh = nfs_alloc_fhandle(); 582 entry.fh = nfs_alloc_fhandle();
583 entry.fattr = nfs_alloc_fattr(); 583 entry.fattr = nfs_alloc_fattr();
584 entry.server = NFS_SERVER(inode); 584 entry.server = NFS_SERVER(inode);
585 if (entry.fh == NULL || entry.fattr == NULL) 585 if (entry.fh == NULL || entry.fattr == NULL)
586 goto out; 586 goto out;
587 587
588 array = nfs_readdir_get_array(page); 588 array = nfs_readdir_get_array(page);
589 if (IS_ERR(array)) { 589 if (IS_ERR(array)) {
590 status = PTR_ERR(array); 590 status = PTR_ERR(array);
591 goto out; 591 goto out;
592 } 592 }
593 memset(array, 0, sizeof(struct nfs_cache_array)); 593 memset(array, 0, sizeof(struct nfs_cache_array));
594 array->eof_index = -1; 594 array->eof_index = -1;
595 595
596 status = nfs_readdir_large_page(pages, array_size); 596 status = nfs_readdir_large_page(pages, array_size);
597 if (status < 0) 597 if (status < 0)
598 goto out_release_array; 598 goto out_release_array;
599 do { 599 do {
600 unsigned int pglen; 600 unsigned int pglen;
601 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); 601 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
602 602
603 if (status < 0) 603 if (status < 0)
604 break; 604 break;
605 pglen = status; 605 pglen = status;
606 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); 606 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
607 if (status < 0) { 607 if (status < 0) {
608 if (status == -ENOSPC) 608 if (status == -ENOSPC)
609 status = 0; 609 status = 0;
610 break; 610 break;
611 } 611 }
612 } while (array->eof_index < 0); 612 } while (array->eof_index < 0);
613 613
614 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 614 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
615 out_release_array: 615 out_release_array:
616 nfs_readdir_release_array(page); 616 nfs_readdir_release_array(page);
617 out: 617 out:
618 nfs_free_fattr(entry.fattr); 618 nfs_free_fattr(entry.fattr);
619 nfs_free_fhandle(entry.fh); 619 nfs_free_fhandle(entry.fh);
620 return status; 620 return status;
621 } 621 }
622 622
623 /* 623 /*
624 * Now we cache directories properly, by converting xdr information 624 * Now we cache directories properly, by converting xdr information
625 * to an array that can be used for lookups later. This results in 625 * to an array that can be used for lookups later. This results in
626 * fewer cache pages, since we can store more information on each page. 626 * fewer cache pages, since we can store more information on each page.
627 * We only need to convert from xdr once, so future lookups are much simpler 627 * We only need to convert from xdr once, so future lookups are much simpler
628 */ 628 */
629 static 629 static
630 int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) 630 int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
631 { 631 {
632 struct inode *inode = desc->file->f_path.dentry->d_inode; 632 struct inode *inode = desc->file->f_path.dentry->d_inode;
633 int ret; 633 int ret;
634 634
635 ret = nfs_readdir_xdr_to_array(desc, page, inode); 635 ret = nfs_readdir_xdr_to_array(desc, page, inode);
636 if (ret < 0) 636 if (ret < 0)
637 goto error; 637 goto error;
638 SetPageUptodate(page); 638 SetPageUptodate(page);
639 639
640 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { 640 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
641 /* Should never happen */ 641 /* Should never happen */
642 nfs_zap_mapping(inode, inode->i_mapping); 642 nfs_zap_mapping(inode, inode->i_mapping);
643 } 643 }
644 unlock_page(page); 644 unlock_page(page);
645 return 0; 645 return 0;
646 error: 646 error:
647 unlock_page(page); 647 unlock_page(page);
648 return ret; 648 return ret;
649 } 649 }
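Two details worth noting: both exit paths unlock the page, which matches the read_cache_page() filler convention (see get_cache_page() below), and a successful fill of page N invalidates every cached page after it, since pages past the one just rebuilt may hold stale continuations of the directory. With hypothetical indices:

	/* after refilling page 2 of the readdir cache: */
	invalidate_inode_pages2_range(inode->i_mapping, 3, -1);	/* drop 3..EOF */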
650 650
651 static 651 static
652 void cache_page_release(nfs_readdir_descriptor_t *desc) 652 void cache_page_release(nfs_readdir_descriptor_t *desc)
653 { 653 {
654 if (!desc->page->mapping) 654 if (!desc->page->mapping)
655 nfs_readdir_clear_array(desc->page); 655 nfs_readdir_clear_array(desc->page);
656 page_cache_release(desc->page); 656 page_cache_release(desc->page);
657 desc->page = NULL; 657 desc->page = NULL;
658 } 658 }
659 659
660 static 660 static
661 struct page *get_cache_page(nfs_readdir_descriptor_t *desc) 661 struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
662 { 662 {
663 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, 663 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
664 desc->page_index, (filler_t *)nfs_readdir_filler, desc); 664 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
665 } 665 }
666 666
667 /* 667 /*
668 * Returns 0 if desc->dir_cookie was found on page desc->page_index 668 * Returns 0 if desc->dir_cookie was found on page desc->page_index
669 */ 669 */
670 static 670 static
671 int find_cache_page(nfs_readdir_descriptor_t *desc) 671 int find_cache_page(nfs_readdir_descriptor_t *desc)
672 { 672 {
673 int res; 673 int res;
674 674
675 desc->page = get_cache_page(desc); 675 desc->page = get_cache_page(desc);
676 if (IS_ERR(desc->page)) 676 if (IS_ERR(desc->page))
677 return PTR_ERR(desc->page); 677 return PTR_ERR(desc->page);
678 678
679 res = nfs_readdir_search_array(desc); 679 res = nfs_readdir_search_array(desc);
680 if (res != 0) 680 if (res != 0)
681 cache_page_release(desc); 681 cache_page_release(desc);
682 return res; 682 return res;
683 } 683 }
684 684
685 /* Search for desc->dir_cookie from the beginning of the page cache */ 685 /* Search for desc->dir_cookie from the beginning of the page cache */
686 static inline 686 static inline
687 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 687 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
688 { 688 {
689 int res; 689 int res;
690 690
691 if (desc->page_index == 0) { 691 if (desc->page_index == 0) {
692 desc->current_index = 0; 692 desc->current_index = 0;
693 desc->last_cookie = 0; 693 desc->last_cookie = 0;
694 } 694 }
695 do { 695 do {
696 res = find_cache_page(desc); 696 res = find_cache_page(desc);
697 } while (res == -EAGAIN); 697 } while (res == -EAGAIN);
698 return res; 698 return res;
699 } 699 }
700 700
701 /* 701 /*
702 * Once we've found the start of the dirent within a page: fill 'er up... 702 * Once we've found the start of the dirent within a page: fill 'er up...
703 */ 703 */
704 static 704 static
705 int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, 705 int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
706 filldir_t filldir) 706 filldir_t filldir)
707 { 707 {
708 struct file *file = desc->file; 708 struct file *file = desc->file;
709 int i = 0; 709 int i = 0;
710 int res = 0; 710 int res = 0;
711 struct nfs_cache_array *array = NULL; 711 struct nfs_cache_array *array = NULL;
712 struct nfs_open_dir_context *ctx = file->private_data; 712 struct nfs_open_dir_context *ctx = file->private_data;
713 713
714 array = nfs_readdir_get_array(desc->page); 714 array = nfs_readdir_get_array(desc->page);
715 if (IS_ERR(array)) { 715 if (IS_ERR(array)) {
716 res = PTR_ERR(array); 716 res = PTR_ERR(array);
717 goto out; 717 goto out;
718 } 718 }
719 719
720 for (i = desc->cache_entry_index; i < array->size; i++) { 720 for (i = desc->cache_entry_index; i < array->size; i++) {
721 struct nfs_cache_array_entry *ent; 721 struct nfs_cache_array_entry *ent;
722 722
723 ent = &array->array[i]; 723 ent = &array->array[i];
724 if (filldir(dirent, ent->string.name, ent->string.len, 724 if (filldir(dirent, ent->string.name, ent->string.len,
725 file->f_pos, nfs_compat_user_ino64(ent->ino), 725 file->f_pos, nfs_compat_user_ino64(ent->ino),
726 ent->d_type) < 0) { 726 ent->d_type) < 0) {
727 desc->eof = 1; 727 desc->eof = 1;
728 break; 728 break;
729 } 729 }
730 file->f_pos++; 730 file->f_pos++;
731 if (i < (array->size-1)) 731 if (i < (array->size-1))
732 *desc->dir_cookie = array->array[i+1].cookie; 732 *desc->dir_cookie = array->array[i+1].cookie;
733 else 733 else
734 *desc->dir_cookie = array->last_cookie; 734 *desc->dir_cookie = array->last_cookie;
735 if (ctx->duped != 0) 735 if (ctx->duped != 0)
736 ctx->duped = 1; 736 ctx->duped = 1;
737 } 737 }
738 if (array->eof_index >= 0) 738 if (array->eof_index >= 0)
739 desc->eof = 1; 739 desc->eof = 1;
740 740
741 nfs_readdir_release_array(desc->page); 741 nfs_readdir_release_array(desc->page);
742 out: 742 out:
743 cache_page_release(desc); 743 cache_page_release(desc);
744 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 744 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
745 (unsigned long long)*desc->dir_cookie, res); 745 (unsigned long long)*desc->dir_cookie, res);
746 return res; 746 return res;
747 } 747 }
748 748
749 /* 749 /*
750 * If we cannot find a cookie in our cache, we suspect that this is 750 * If we cannot find a cookie in our cache, we suspect that this is
751 * because it points to a deleted file, so we ask the server to return 751 * because it points to a deleted file, so we ask the server to return
752 * whatever it thinks is the next entry. We then feed this to filldir. 752 * whatever it thinks is the next entry. We then feed this to filldir.
753 * If all goes well, we should then be able to find our way round the 753 * If all goes well, we should then be able to find our way round the
754 * cache on the next call to readdir_search_pagecache(); 754 * cache on the next call to readdir_search_pagecache();
755 * 755 *
756 * NOTE: we cannot add the anonymous page to the pagecache because 756 * NOTE: we cannot add the anonymous page to the pagecache because
757 * the data it contains might not be page aligned. Besides, 757 * the data it contains might not be page aligned. Besides,
758 * we should already have a complete representation of the 758 * we should already have a complete representation of the
759 * directory in the page cache by the time we get here. 759 * directory in the page cache by the time we get here.
760 */ 760 */
761 static inline 761 static inline
762 int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 762 int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
763 filldir_t filldir) 763 filldir_t filldir)
764 { 764 {
765 struct page *page = NULL; 765 struct page *page = NULL;
766 int status; 766 int status;
767 struct inode *inode = desc->file->f_path.dentry->d_inode; 767 struct inode *inode = desc->file->f_path.dentry->d_inode;
768 struct nfs_open_dir_context *ctx = desc->file->private_data; 768 struct nfs_open_dir_context *ctx = desc->file->private_data;
769 769
770 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 770 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
771 (unsigned long long)*desc->dir_cookie); 771 (unsigned long long)*desc->dir_cookie);
772 772
773 page = alloc_page(GFP_HIGHUSER); 773 page = alloc_page(GFP_HIGHUSER);
774 if (!page) { 774 if (!page) {
775 status = -ENOMEM; 775 status = -ENOMEM;
776 goto out; 776 goto out;
777 } 777 }
778 778
779 desc->page_index = 0; 779 desc->page_index = 0;
780 desc->last_cookie = *desc->dir_cookie; 780 desc->last_cookie = *desc->dir_cookie;
781 desc->page = page; 781 desc->page = page;
782 ctx->duped = 0; 782 ctx->duped = 0;
783 783
784 status = nfs_readdir_xdr_to_array(desc, page, inode); 784 status = nfs_readdir_xdr_to_array(desc, page, inode);
785 if (status < 0) 785 if (status < 0)
786 goto out_release; 786 goto out_release;
787 787
788 status = nfs_do_filldir(desc, dirent, filldir); 788 status = nfs_do_filldir(desc, dirent, filldir);
789 789
790 out: 790 out:
791 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 791 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
792 __func__, status); 792 __func__, status);
793 return status; 793 return status;
794 out_release: 794 out_release:
795 cache_page_release(desc); 795 cache_page_release(desc);
796 goto out; 796 goto out;
797 } 797 }
798 798
799 /* The file offset position represents the dirent entry number. A 799 /* The file offset position represents the dirent entry number. A
800 last cookie cache takes care of the common case of reading the 800 last cookie cache takes care of the common case of reading the
801 whole directory. 801 whole directory.
802 */ 802 */
803 static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 803 static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
804 { 804 {
805 struct dentry *dentry = filp->f_path.dentry; 805 struct dentry *dentry = filp->f_path.dentry;
806 struct inode *inode = dentry->d_inode; 806 struct inode *inode = dentry->d_inode;
807 nfs_readdir_descriptor_t my_desc, 807 nfs_readdir_descriptor_t my_desc,
808 *desc = &my_desc; 808 *desc = &my_desc;
809 struct nfs_open_dir_context *dir_ctx = filp->private_data; 809 struct nfs_open_dir_context *dir_ctx = filp->private_data;
810 int res; 810 int res;
811 811
812 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 812 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
813 dentry->d_parent->d_name.name, dentry->d_name.name, 813 dentry->d_parent->d_name.name, dentry->d_name.name,
814 (long long)filp->f_pos); 814 (long long)filp->f_pos);
815 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); 815 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
816 816
817 /* 817 /*
818 * filp->f_pos points to the dirent entry number. 818 * filp->f_pos points to the dirent entry number.
819 * *desc->dir_cookie has the cookie for the next entry. We have 819 * *desc->dir_cookie has the cookie for the next entry. We have
820 * to either find the entry with the appropriate number or 820 * to either find the entry with the appropriate number or
821 * revalidate the cookie. 821 * revalidate the cookie.
822 */ 822 */
823 memset(desc, 0, sizeof(*desc)); 823 memset(desc, 0, sizeof(*desc));
824 824
825 desc->file = filp; 825 desc->file = filp;
826 desc->dir_cookie = &dir_ctx->dir_cookie; 826 desc->dir_cookie = &dir_ctx->dir_cookie;
827 desc->decode = NFS_PROTO(inode)->decode_dirent; 827 desc->decode = NFS_PROTO(inode)->decode_dirent;
828 desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0; 828 desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0;
829 829
830 nfs_block_sillyrename(dentry); 830 nfs_block_sillyrename(dentry);
831 res = nfs_revalidate_mapping(inode, filp->f_mapping); 831 res = nfs_revalidate_mapping(inode, filp->f_mapping);
832 if (res < 0) 832 if (res < 0)
833 goto out; 833 goto out;
834 834
835 do { 835 do {
836 res = readdir_search_pagecache(desc); 836 res = readdir_search_pagecache(desc);
837 837
838 if (res == -EBADCOOKIE) { 838 if (res == -EBADCOOKIE) {
839 res = 0; 839 res = 0;
840 /* This means either the end of the directory */ 840 /* This means either the end of the directory */
841 if (*desc->dir_cookie && desc->eof == 0) { 841 if (*desc->dir_cookie && desc->eof == 0) {
842 /* Or that the server has 'lost' a cookie */ 842 /* Or that the server has 'lost' a cookie */
843 res = uncached_readdir(desc, dirent, filldir); 843 res = uncached_readdir(desc, dirent, filldir);
844 if (res == 0) 844 if (res == 0)
845 continue; 845 continue;
846 } 846 }
847 break; 847 break;
848 } 848 }
849 if (res == -ETOOSMALL && desc->plus) { 849 if (res == -ETOOSMALL && desc->plus) {
850 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 850 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
851 nfs_zap_caches(inode); 851 nfs_zap_caches(inode);
852 desc->page_index = 0; 852 desc->page_index = 0;
853 desc->plus = 0; 853 desc->plus = 0;
854 desc->eof = 0; 854 desc->eof = 0;
855 continue; 855 continue;
856 } 856 }
857 if (res < 0) 857 if (res < 0)
858 break; 858 break;
859 859
860 res = nfs_do_filldir(desc, dirent, filldir); 860 res = nfs_do_filldir(desc, dirent, filldir);
861 if (res < 0) 861 if (res < 0)
862 break; 862 break;
863 } while (!desc->eof); 863 } while (!desc->eof);
864 out: 864 out:
865 nfs_unblock_sillyrename(dentry); 865 nfs_unblock_sillyrename(dentry);
866 if (res > 0) 866 if (res > 0)
867 res = 0; 867 res = 0;
868 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", 868 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
869 dentry->d_parent->d_name.name, dentry->d_name.name, 869 dentry->d_parent->d_name.name, dentry->d_name.name,
870 res); 870 res);
871 return res; 871 return res;
872 } 872 }
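From user space all of this machinery sits behind getdents(); a plain readdir loop is what drives nfs_readdir() and, through it, the cookie cache described above. A minimal (hypothetical) consumer:

	#include <dirent.h>
	#include <stdio.h>

	void list_dir(void)
	{
		DIR *d = opendir("/mnt/nfs/somedir");	/* any NFS mount */
		struct dirent *ent;

		if (d == NULL)
			return;
		while ((ent = readdir(d)) != NULL)
			printf("%s\n", ent->d_name);
		closedir(d);
	}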
873 873
874 static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) 874 static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
875 { 875 {
876 struct dentry *dentry = filp->f_path.dentry; 876 struct dentry *dentry = filp->f_path.dentry;
877 struct inode *inode = dentry->d_inode; 877 struct inode *inode = dentry->d_inode;
878 struct nfs_open_dir_context *dir_ctx = filp->private_data; 878 struct nfs_open_dir_context *dir_ctx = filp->private_data;
879 879
880 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", 880 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
881 dentry->d_parent->d_name.name, 881 dentry->d_parent->d_name.name,
882 dentry->d_name.name, 882 dentry->d_name.name,
883 offset, origin); 883 offset, whence);
884 884
885 mutex_lock(&inode->i_mutex); 885 mutex_lock(&inode->i_mutex);
886 switch (origin) { 886 switch (whence) {
887 case 1: 887 case 1:
888 offset += filp->f_pos; 888 offset += filp->f_pos;
889 case 0: 889 case 0:
890 if (offset >= 0) 890 if (offset >= 0)
891 break; 891 break;
892 default: 892 default:
893 offset = -EINVAL; 893 offset = -EINVAL;
894 goto out; 894 goto out;
895 } 895 }
896 if (offset != filp->f_pos) { 896 if (offset != filp->f_pos) {
897 filp->f_pos = offset; 897 filp->f_pos = offset;
898 dir_ctx->dir_cookie = 0; 898 dir_ctx->dir_cookie = 0;
899 dir_ctx->duped = 0; 899 dir_ctx->duped = 0;
900 } 900 }
901 out: 901 out:
902 mutex_unlock(&inode->i_mutex); 902 mutex_unlock(&inode->i_mutex);
903 return offset; 903 return offset;
904 } 904 }
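The bare 0 and 1 in the switch are the classic whence values; with the symbolic names (SEEK_SET and SEEK_CUR, which the kernel defines with those values) the fall-through reads more plainly. An equivalent sketch of the same body:

	switch (whence) {
	case SEEK_CUR:		/* 1: offset is relative to current f_pos */
		offset += filp->f_pos;
		/* fall through */
	case SEEK_SET:		/* 0: offset is absolute */
		if (offset >= 0)
			break;
	default:
		offset = -EINVAL;
		goto out;
	}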
905 905
906 /* 906 /*
907 * All directory operations under NFS are synchronous, so fsync() 907 * All directory operations under NFS are synchronous, so fsync()
908 * is a dummy operation. 908 * is a dummy operation.
909 */ 909 */
910 static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, 910 static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
911 int datasync) 911 int datasync)
912 { 912 {
913 struct dentry *dentry = filp->f_path.dentry; 913 struct dentry *dentry = filp->f_path.dentry;
914 struct inode *inode = dentry->d_inode; 914 struct inode *inode = dentry->d_inode;
915 915
916 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 916 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
917 dentry->d_parent->d_name.name, dentry->d_name.name, 917 dentry->d_parent->d_name.name, dentry->d_name.name,
918 datasync); 918 datasync);
919 919
920 mutex_lock(&inode->i_mutex); 920 mutex_lock(&inode->i_mutex);
921 nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); 921 nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
922 mutex_unlock(&inode->i_mutex); 922 mutex_unlock(&inode->i_mutex);
923 return 0; 923 return 0;
924 } 924 }
925 925
926 /** 926 /**
927 * nfs_force_lookup_revalidate - Mark the directory as having changed 927 * nfs_force_lookup_revalidate - Mark the directory as having changed
928 * @dir: pointer to directory inode 928 * @dir: pointer to directory inode
929 * 929 *
930 * This forces the revalidation code in nfs_lookup_revalidate() to do a 930 * This forces the revalidation code in nfs_lookup_revalidate() to do a
931 * full lookup on all child dentries of 'dir' whenever a change occurs 931 * full lookup on all child dentries of 'dir' whenever a change occurs
932 * on the server that might have invalidated our dcache. 932 * on the server that might have invalidated our dcache.
933 * 933 *
934 * The caller should be holding dir->i_lock 934 * The caller should be holding dir->i_lock
935 */ 935 */
936 void nfs_force_lookup_revalidate(struct inode *dir) 936 void nfs_force_lookup_revalidate(struct inode *dir)
937 { 937 {
938 NFS_I(dir)->cache_change_attribute++; 938 NFS_I(dir)->cache_change_attribute++;
939 } 939 }
940 EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); 940 EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
941 941
942 /* 942 /*
943 * A check for whether or not the parent directory has changed. 943 * A check for whether or not the parent directory has changed.
944 * In the case it has, we assume that the dentries are untrustworthy 944 * In the case it has, we assume that the dentries are untrustworthy
945 * and may need to be looked up again. 945 * and may need to be looked up again.
946 */ 946 */
947 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) 947 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
948 { 948 {
949 if (IS_ROOT(dentry)) 949 if (IS_ROOT(dentry))
950 return 1; 950 return 1;
951 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) 951 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
952 return 0; 952 return 0;
953 if (!nfs_verify_change_attribute(dir, dentry->d_time)) 953 if (!nfs_verify_change_attribute(dir, dentry->d_time))
954 return 0; 954 return 0;
955 /* Revalidate nfsi->cache_change_attribute before we declare a match */ 955 /* Revalidate nfsi->cache_change_attribute before we declare a match */
956 if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) 956 if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
957 return 0; 957 return 0;
958 if (!nfs_verify_change_attribute(dir, dentry->d_time)) 958 if (!nfs_verify_change_attribute(dir, dentry->d_time))
959 return 0; 959 return 0;
960 return 1; 960 return 1;
961 } 961 }
962 962
963 /* 963 /*
964 * Use intent information to check whether or not we're going to do 964 * Use intent information to check whether or not we're going to do
965 * an O_EXCL create using this path component. 965 * an O_EXCL create using this path component.
966 */ 966 */
967 static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) 967 static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
968 { 968 {
969 if (NFS_PROTO(dir)->version == 2) 969 if (NFS_PROTO(dir)->version == 2)
970 return 0; 970 return 0;
971 return flags & LOOKUP_EXCL; 971 return flags & LOOKUP_EXCL;
972 } 972 }
973 973
974 /* 974 /*
975 * Inode and filehandle revalidation for lookups. 975 * Inode and filehandle revalidation for lookups.
976 * 976 *
977 * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, 977 * We force revalidation in the cases where the VFS sets LOOKUP_REVAL,
978 * or if the intent information indicates that we're about to open this 978 * or if the intent information indicates that we're about to open this
979 * particular file and the "nocto" mount flag is not set. 979 * particular file and the "nocto" mount flag is not set.
980 * 980 *
981 */ 981 */
982 static inline 982 static inline
983 int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) 983 int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
984 { 984 {
985 struct nfs_server *server = NFS_SERVER(inode); 985 struct nfs_server *server = NFS_SERVER(inode);
986 986
987 if (IS_AUTOMOUNT(inode)) 987 if (IS_AUTOMOUNT(inode))
988 return 0; 988 return 0;
989 /* VFS wants an on-the-wire revalidation */ 989 /* VFS wants an on-the-wire revalidation */
990 if (flags & LOOKUP_REVAL) 990 if (flags & LOOKUP_REVAL)
991 goto out_force; 991 goto out_force;
992 /* This is an open(2) */ 992 /* This is an open(2) */
993 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && 993 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) &&
994 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 994 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
995 goto out_force; 995 goto out_force;
996 return 0; 996 return 0;
997 out_force: 997 out_force:
998 return __nfs_revalidate_inode(server, inode); 998 return __nfs_revalidate_inode(server, inode);
999 } 999 }
1000 1000
1001 /* 1001 /*
1002 * We judge how long we want to trust negative 1002 * We judge how long we want to trust negative
1003 * dentries by looking at the parent inode mtime. 1003 * dentries by looking at the parent inode mtime.
1004 * 1004 *
1005 * If parent mtime has changed, we revalidate, else we wait for a 1005 * If parent mtime has changed, we revalidate, else we wait for a
1006 * period corresponding to the parent's attribute cache timeout value. 1006 * period corresponding to the parent's attribute cache timeout value.
1007 */ 1007 */
1008 static inline 1008 static inline
1009 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, 1009 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
1010 unsigned int flags) 1010 unsigned int flags)
1011 { 1011 {
1012 /* Don't revalidate a negative dentry if we're creating a new file */ 1012 /* Don't revalidate a negative dentry if we're creating a new file */
1013 if (flags & LOOKUP_CREATE) 1013 if (flags & LOOKUP_CREATE)
1014 return 0; 1014 return 0;
1015 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) 1015 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
1016 return 1; 1016 return 1;
1017 return !nfs_check_verifier(dir, dentry); 1017 return !nfs_check_verifier(dir, dentry);
1018 } 1018 }
1019 1019
1020 /* 1020 /*
1021 * This is called every time the dcache has a lookup hit, 1021 * This is called every time the dcache has a lookup hit,
1022 * and we should check whether we can really trust that 1022 * and we should check whether we can really trust that
1023 * lookup. 1023 * lookup.
1024 * 1024 *
1025 * NOTE! The hit can be a negative hit too, don't assume 1025 * NOTE! The hit can be a negative hit too, don't assume
1026 * we have an inode! 1026 * we have an inode!
1027 * 1027 *
1028 * If the parent directory is seen to have changed, we throw out the 1028 * If the parent directory is seen to have changed, we throw out the
1029 * cached dentry and do a new lookup. 1029 * cached dentry and do a new lookup.
1030 */ 1030 */
1031 static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) 1031 static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1032 { 1032 {
1033 struct inode *dir; 1033 struct inode *dir;
1034 struct inode *inode; 1034 struct inode *inode;
1035 struct dentry *parent; 1035 struct dentry *parent;
1036 struct nfs_fh *fhandle = NULL; 1036 struct nfs_fh *fhandle = NULL;
1037 struct nfs_fattr *fattr = NULL; 1037 struct nfs_fattr *fattr = NULL;
1038 int error; 1038 int error;
1039 1039
1040 if (flags & LOOKUP_RCU) 1040 if (flags & LOOKUP_RCU)
1041 return -ECHILD; 1041 return -ECHILD;
1042 1042
1043 parent = dget_parent(dentry); 1043 parent = dget_parent(dentry);
1044 dir = parent->d_inode; 1044 dir = parent->d_inode;
1045 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 1045 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
1046 inode = dentry->d_inode; 1046 inode = dentry->d_inode;
1047 1047
1048 if (!inode) { 1048 if (!inode) {
1049 if (nfs_neg_need_reval(dir, dentry, flags)) 1049 if (nfs_neg_need_reval(dir, dentry, flags))
1050 goto out_bad; 1050 goto out_bad;
1051 goto out_valid_noent; 1051 goto out_valid_noent;
1052 } 1052 }
1053 1053
1054 if (is_bad_inode(inode)) { 1054 if (is_bad_inode(inode)) {
1055 dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", 1055 dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n",
1056 __func__, dentry->d_parent->d_name.name, 1056 __func__, dentry->d_parent->d_name.name,
1057 dentry->d_name.name); 1057 dentry->d_name.name);
1058 goto out_bad; 1058 goto out_bad;
1059 } 1059 }
1060 1060
1061 if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) 1061 if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
1062 goto out_set_verifier; 1062 goto out_set_verifier;
1063 1063
1064 /* Force a full lookup if the parent directory has changed */ 1064 /* Force a full lookup if the parent directory has changed */
1065 if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) { 1065 if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
1066 if (nfs_lookup_verify_inode(inode, flags)) 1066 if (nfs_lookup_verify_inode(inode, flags))
1067 goto out_zap_parent; 1067 goto out_zap_parent;
1068 goto out_valid; 1068 goto out_valid;
1069 } 1069 }
1070 1070
1071 if (NFS_STALE(inode)) 1071 if (NFS_STALE(inode))
1072 goto out_bad; 1072 goto out_bad;
1073 1073
1074 error = -ENOMEM; 1074 error = -ENOMEM;
1075 fhandle = nfs_alloc_fhandle(); 1075 fhandle = nfs_alloc_fhandle();
1076 fattr = nfs_alloc_fattr(); 1076 fattr = nfs_alloc_fattr();
1077 if (fhandle == NULL || fattr == NULL) 1077 if (fhandle == NULL || fattr == NULL)
1078 goto out_error; 1078 goto out_error;
1079 1079
1080 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1080 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
1081 if (error) 1081 if (error)
1082 goto out_bad; 1082 goto out_bad;
1083 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1083 if (nfs_compare_fh(NFS_FH(inode), fhandle))
1084 goto out_bad; 1084 goto out_bad;
1085 if ((error = nfs_refresh_inode(inode, fattr)) != 0) 1085 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
1086 goto out_bad; 1086 goto out_bad;
1087 1087
1088 nfs_free_fattr(fattr); 1088 nfs_free_fattr(fattr);
1089 nfs_free_fhandle(fhandle); 1089 nfs_free_fhandle(fhandle);
1090 out_set_verifier: 1090 out_set_verifier:
1091 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1091 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1092 out_valid: 1092 out_valid:
1093 /* Success: notify readdir to use READDIRPLUS */ 1093 /* Success: notify readdir to use READDIRPLUS */
1094 nfs_advise_use_readdirplus(dir); 1094 nfs_advise_use_readdirplus(dir);
1095 out_valid_noent: 1095 out_valid_noent:
1096 dput(parent); 1096 dput(parent);
1097 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", 1097 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
1098 __func__, dentry->d_parent->d_name.name, 1098 __func__, dentry->d_parent->d_name.name,
1099 dentry->d_name.name); 1099 dentry->d_name.name);
1100 return 1; 1100 return 1;
1101 out_zap_parent: 1101 out_zap_parent:
1102 nfs_zap_caches(dir); 1102 nfs_zap_caches(dir);
1103 out_bad: 1103 out_bad:
1104 nfs_free_fattr(fattr); 1104 nfs_free_fattr(fattr);
1105 nfs_free_fhandle(fhandle); 1105 nfs_free_fhandle(fhandle);
1106 nfs_mark_for_revalidate(dir); 1106 nfs_mark_for_revalidate(dir);
1107 if (inode && S_ISDIR(inode->i_mode)) { 1107 if (inode && S_ISDIR(inode->i_mode)) {
1108 /* Purge readdir caches. */ 1108 /* Purge readdir caches. */
1109 nfs_zap_caches(inode); 1109 nfs_zap_caches(inode);
1110 /* If we have submounts, don't unhash! */ 1110 /* If we have submounts, don't unhash! */
1111 if (have_submounts(dentry)) 1111 if (have_submounts(dentry))
1112 goto out_valid; 1112 goto out_valid;
1113 if (dentry->d_flags & DCACHE_DISCONNECTED) 1113 if (dentry->d_flags & DCACHE_DISCONNECTED)
1114 goto out_valid; 1114 goto out_valid;
1115 shrink_dcache_parent(dentry); 1115 shrink_dcache_parent(dentry);
1116 } 1116 }
1117 d_drop(dentry); 1117 d_drop(dentry);
1118 dput(parent); 1118 dput(parent);
1119 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 1119 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
1120 __func__, dentry->d_parent->d_name.name, 1120 __func__, dentry->d_parent->d_name.name,
1121 dentry->d_name.name); 1121 dentry->d_name.name);
1122 return 0; 1122 return 0;
1123 out_error: 1123 out_error:
1124 nfs_free_fattr(fattr); 1124 nfs_free_fattr(fattr);
1125 nfs_free_fhandle(fhandle); 1125 nfs_free_fhandle(fhandle);
1126 dput(parent); 1126 dput(parent);
1127 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", 1127 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
1128 __func__, dentry->d_parent->d_name.name, 1128 __func__, dentry->d_parent->d_name.name,
1129 dentry->d_name.name, error); 1129 dentry->d_name.name, error);
1130 return error; 1130 return error;
1131 } 1131 }
1132 1132
1133 /* 1133 /*
1134 * This is called from dput() when d_count is going to 0. 1134 * This is called from dput() when d_count is going to 0.
1135 */ 1135 */
1136 static int nfs_dentry_delete(const struct dentry *dentry) 1136 static int nfs_dentry_delete(const struct dentry *dentry)
1137 { 1137 {
1138 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", 1138 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
1139 dentry->d_parent->d_name.name, dentry->d_name.name, 1139 dentry->d_parent->d_name.name, dentry->d_name.name,
1140 dentry->d_flags); 1140 dentry->d_flags);
1141 1141
1142 /* Unhash any dentry with a stale inode */ 1142 /* Unhash any dentry with a stale inode */
1143 if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) 1143 if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
1144 return 1; 1144 return 1;
1145 1145
1146 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1146 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1147 /* Unhash it, so that ->d_iput() would be called */ 1147 /* Unhash it, so that ->d_iput() would be called */
1148 return 1; 1148 return 1;
1149 } 1149 }
1150 if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { 1150 if (!(dentry->d_sb->s_flags & MS_ACTIVE)) {
1151 /* Unhash it, so that ancestors of killed async unlink 1151 /* Unhash it, so that ancestors of killed async unlink
1152 * files will be cleaned up during umount */ 1152 * files will be cleaned up during umount */
1153 return 1; 1153 return 1;
1154 } 1154 }
1155 return 0; 1155 return 0;
1156 1156
1157 } 1157 }
1158 1158
1159 static void nfs_drop_nlink(struct inode *inode) 1159 static void nfs_drop_nlink(struct inode *inode)
1160 { 1160 {
1161 spin_lock(&inode->i_lock); 1161 spin_lock(&inode->i_lock);
1162 if (inode->i_nlink > 0) 1162 if (inode->i_nlink > 0)
1163 drop_nlink(inode); 1163 drop_nlink(inode);
1164 spin_unlock(&inode->i_lock); 1164 spin_unlock(&inode->i_lock);
1165 } 1165 }
1166 1166
1167 /* 1167 /*
1168 * Called when the dentry loses its inode. 1168 * Called when the dentry loses its inode.
1169 * We use it to clean up silly-renamed files. 1169 * We use it to clean up silly-renamed files.
1170 */ 1170 */
1171 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) 1171 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1172 { 1172 {
1173 if (S_ISDIR(inode->i_mode)) 1173 if (S_ISDIR(inode->i_mode))
1174 /* drop any readdir cache as it could easily be old */ 1174 /* drop any readdir cache as it could easily be old */
1175 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 1175 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
1176 1176
1177 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1177 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1178 drop_nlink(inode); 1178 drop_nlink(inode);
1179 nfs_complete_unlink(dentry, inode); 1179 nfs_complete_unlink(dentry, inode);
1180 } 1180 }
1181 iput(inode); 1181 iput(inode);
1182 } 1182 }
1183 1183
1184 static void nfs_d_release(struct dentry *dentry) 1184 static void nfs_d_release(struct dentry *dentry)
1185 { 1185 {
1186 /* free cached devname value, if it survived that far */ 1186 /* free cached devname value, if it survived that far */
1187 if (unlikely(dentry->d_fsdata)) { 1187 if (unlikely(dentry->d_fsdata)) {
1188 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 1188 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1189 WARN_ON(1); 1189 WARN_ON(1);
1190 else 1190 else
1191 kfree(dentry->d_fsdata); 1191 kfree(dentry->d_fsdata);
1192 } 1192 }
1193 } 1193 }
1194 1194
1195 const struct dentry_operations nfs_dentry_operations = { 1195 const struct dentry_operations nfs_dentry_operations = {
1196 .d_revalidate = nfs_lookup_revalidate, 1196 .d_revalidate = nfs_lookup_revalidate,
1197 .d_delete = nfs_dentry_delete, 1197 .d_delete = nfs_dentry_delete,
1198 .d_iput = nfs_dentry_iput, 1198 .d_iput = nfs_dentry_iput,
1199 .d_automount = nfs_d_automount, 1199 .d_automount = nfs_d_automount,
1200 .d_release = nfs_d_release, 1200 .d_release = nfs_d_release,
1201 }; 1201 };
1202 EXPORT_SYMBOL_GPL(nfs_dentry_operations); 1202 EXPORT_SYMBOL_GPL(nfs_dentry_operations);
1203 1203
1204 struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 1204 struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
1205 { 1205 {
1206 struct dentry *res; 1206 struct dentry *res;
1207 struct dentry *parent; 1207 struct dentry *parent;
1208 struct inode *inode = NULL; 1208 struct inode *inode = NULL;
1209 struct nfs_fh *fhandle = NULL; 1209 struct nfs_fh *fhandle = NULL;
1210 struct nfs_fattr *fattr = NULL; 1210 struct nfs_fattr *fattr = NULL;
1211 int error; 1211 int error;
1212 1212
1213 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 1213 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
1214 dentry->d_parent->d_name.name, dentry->d_name.name); 1214 dentry->d_parent->d_name.name, dentry->d_name.name);
1215 nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); 1215 nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
1216 1216
1217 res = ERR_PTR(-ENAMETOOLONG); 1217 res = ERR_PTR(-ENAMETOOLONG);
1218 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1218 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
1219 goto out; 1219 goto out;
1220 1220
1221 /* 1221 /*
1222 * If we're doing an exclusive create, optimize away the lookup 1222 * If we're doing an exclusive create, optimize away the lookup
1223 * but don't hash the dentry. 1223 * but don't hash the dentry.
1224 */ 1224 */
1225 if (nfs_is_exclusive_create(dir, flags)) { 1225 if (nfs_is_exclusive_create(dir, flags)) {
1226 d_instantiate(dentry, NULL); 1226 d_instantiate(dentry, NULL);
1227 res = NULL; 1227 res = NULL;
1228 goto out; 1228 goto out;
1229 } 1229 }
1230 1230
1231 res = ERR_PTR(-ENOMEM); 1231 res = ERR_PTR(-ENOMEM);
1232 fhandle = nfs_alloc_fhandle(); 1232 fhandle = nfs_alloc_fhandle();
1233 fattr = nfs_alloc_fattr(); 1233 fattr = nfs_alloc_fattr();
1234 if (fhandle == NULL || fattr == NULL) 1234 if (fhandle == NULL || fattr == NULL)
1235 goto out; 1235 goto out;
1236 1236
1237 parent = dentry->d_parent; 1237 parent = dentry->d_parent;
1238 /* Protect against concurrent sillydeletes */ 1238 /* Protect against concurrent sillydeletes */
1239 nfs_block_sillyrename(parent); 1239 nfs_block_sillyrename(parent);
1240 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1240 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
1241 if (error == -ENOENT) 1241 if (error == -ENOENT)
1242 goto no_entry; 1242 goto no_entry;
1243 if (error < 0) { 1243 if (error < 0) {
1244 res = ERR_PTR(error); 1244 res = ERR_PTR(error);
1245 goto out_unblock_sillyrename; 1245 goto out_unblock_sillyrename;
1246 } 1246 }
1247 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1247 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1248 res = ERR_CAST(inode); 1248 res = ERR_CAST(inode);
1249 if (IS_ERR(res)) 1249 if (IS_ERR(res))
1250 goto out_unblock_sillyrename; 1250 goto out_unblock_sillyrename;
1251 1251
1252 /* Success: notify readdir to use READDIRPLUS */ 1252 /* Success: notify readdir to use READDIRPLUS */
1253 nfs_advise_use_readdirplus(dir); 1253 nfs_advise_use_readdirplus(dir);
1254 1254
1255 no_entry: 1255 no_entry:
1256 res = d_materialise_unique(dentry, inode); 1256 res = d_materialise_unique(dentry, inode);
1257 if (res != NULL) { 1257 if (res != NULL) {
1258 if (IS_ERR(res)) 1258 if (IS_ERR(res))
1259 goto out_unblock_sillyrename; 1259 goto out_unblock_sillyrename;
1260 dentry = res; 1260 dentry = res;
1261 } 1261 }
1262 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1262 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1263 out_unblock_sillyrename: 1263 out_unblock_sillyrename:
1264 nfs_unblock_sillyrename(parent); 1264 nfs_unblock_sillyrename(parent);
1265 out: 1265 out:
1266 nfs_free_fattr(fattr); 1266 nfs_free_fattr(fattr);
1267 nfs_free_fhandle(fhandle); 1267 nfs_free_fhandle(fhandle);
1268 return res; 1268 return res;
1269 } 1269 }
1270 EXPORT_SYMBOL_GPL(nfs_lookup); 1270 EXPORT_SYMBOL_GPL(nfs_lookup);
1271 1271
1272 #if IS_ENABLED(CONFIG_NFS_V4) 1272 #if IS_ENABLED(CONFIG_NFS_V4)
1273 static int nfs4_lookup_revalidate(struct dentry *, unsigned int); 1273 static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
1274 1274
1275 const struct dentry_operations nfs4_dentry_operations = { 1275 const struct dentry_operations nfs4_dentry_operations = {
1276 .d_revalidate = nfs4_lookup_revalidate, 1276 .d_revalidate = nfs4_lookup_revalidate,
1277 .d_delete = nfs_dentry_delete, 1277 .d_delete = nfs_dentry_delete,
1278 .d_iput = nfs_dentry_iput, 1278 .d_iput = nfs_dentry_iput,
1279 .d_automount = nfs_d_automount, 1279 .d_automount = nfs_d_automount,
1280 .d_release = nfs_d_release, 1280 .d_release = nfs_d_release,
1281 }; 1281 };
1282 EXPORT_SYMBOL_GPL(nfs4_dentry_operations); 1282 EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
1283 1283
1284 static fmode_t flags_to_mode(int flags) 1284 static fmode_t flags_to_mode(int flags)
1285 { 1285 {
1286 fmode_t res = (__force fmode_t)flags & FMODE_EXEC; 1286 fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
1287 if ((flags & O_ACCMODE) != O_WRONLY) 1287 if ((flags & O_ACCMODE) != O_WRONLY)
1288 res |= FMODE_READ; 1288 res |= FMODE_READ;
1289 if ((flags & O_ACCMODE) != O_RDONLY) 1289 if ((flags & O_ACCMODE) != O_RDONLY)
1290 res |= FMODE_WRITE; 1290 res |= FMODE_WRITE;
1291 return res; 1291 return res;
1292 } 1292 }
1293 1293
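flags_to_mode() above derives the fmode_t access bits from the open(2) flags: any access mode other than O_WRONLY grants FMODE_READ, any mode other than O_RDONLY grants FMODE_WRITE, so O_RDWR ends up with both. A minimal userspace sketch of the same mapping (MODE_READ/MODE_WRITE and mode_of() are illustrative stand-ins for the kernel's FMODE_* bits and flags_to_mode(), not kernel API):

/*
 * Userspace sketch, not kernel code: mirrors the flags_to_mode()
 * logic above to show how the three O_ACCMODE values map to
 * read/write permission bits.
 */
#include <fcntl.h>
#include <stdio.h>

#define MODE_READ  0x1
#define MODE_WRITE 0x2

static int mode_of(int flags)
{
	int res = 0;

	if ((flags & O_ACCMODE) != O_WRONLY)	/* O_RDONLY or O_RDWR */
		res |= MODE_READ;
	if ((flags & O_ACCMODE) != O_RDONLY)	/* O_WRONLY or O_RDWR */
		res |= MODE_WRITE;
	return res;
}

int main(void)
{
	printf("O_RDONLY -> %d\n", mode_of(O_RDONLY));	/* 1: read only  */
	printf("O_WRONLY -> %d\n", mode_of(O_WRONLY));	/* 2: write only */
	printf("O_RDWR   -> %d\n", mode_of(O_RDWR));	/* 3: read+write */
	return 0;
}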
1294 static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) 1294 static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)
1295 { 1295 {
1296 return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); 1296 return alloc_nfs_open_context(dentry, flags_to_mode(open_flags));
1297 } 1297 }
1298 1298
1299 static int do_open(struct inode *inode, struct file *filp) 1299 static int do_open(struct inode *inode, struct file *filp)
1300 { 1300 {
1301 nfs_fscache_set_inode_cookie(inode, filp); 1301 nfs_fscache_set_inode_cookie(inode, filp);
1302 return 0; 1302 return 0;
1303 } 1303 }
1304 1304
1305 static int nfs_finish_open(struct nfs_open_context *ctx, 1305 static int nfs_finish_open(struct nfs_open_context *ctx,
1306 struct dentry *dentry, 1306 struct dentry *dentry,
1307 struct file *file, unsigned open_flags, 1307 struct file *file, unsigned open_flags,
1308 int *opened) 1308 int *opened)
1309 { 1309 {
1310 int err; 1310 int err;
1311 1311
1312 if (ctx->dentry != dentry) { 1312 if (ctx->dentry != dentry) {
1313 dput(ctx->dentry); 1313 dput(ctx->dentry);
1314 ctx->dentry = dget(dentry); 1314 ctx->dentry = dget(dentry);
1315 } 1315 }
1316 1316
1317 /* If the open_intent is for execute, we have an extra check to make */ 1317 /* If the open_intent is for execute, we have an extra check to make */
1318 if (ctx->mode & FMODE_EXEC) { 1318 if (ctx->mode & FMODE_EXEC) {
1319 err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags); 1319 err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
1320 if (err < 0) 1320 if (err < 0)
1321 goto out; 1321 goto out;
1322 } 1322 }
1323 1323
1324 err = finish_open(file, dentry, do_open, opened); 1324 err = finish_open(file, dentry, do_open, opened);
1325 if (err) 1325 if (err)
1326 goto out; 1326 goto out;
1327 nfs_file_set_open_context(file, ctx); 1327 nfs_file_set_open_context(file, ctx);
1328 1328
1329 out: 1329 out:
1330 put_nfs_open_context(ctx); 1330 put_nfs_open_context(ctx);
1331 return err; 1331 return err;
1332 } 1332 }
1333 1333
1334 int nfs_atomic_open(struct inode *dir, struct dentry *dentry, 1334 int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1335 struct file *file, unsigned open_flags, 1335 struct file *file, unsigned open_flags,
1336 umode_t mode, int *opened) 1336 umode_t mode, int *opened)
1337 { 1337 {
1338 struct nfs_open_context *ctx; 1338 struct nfs_open_context *ctx;
1339 struct dentry *res; 1339 struct dentry *res;
1340 struct iattr attr = { .ia_valid = ATTR_OPEN }; 1340 struct iattr attr = { .ia_valid = ATTR_OPEN };
1341 struct inode *inode; 1341 struct inode *inode;
1342 int err; 1342 int err;
1343 1343
1344 /* Expect a negative dentry */ 1344 /* Expect a negative dentry */
1345 BUG_ON(dentry->d_inode); 1345 BUG_ON(dentry->d_inode);
1346 1346
1347 dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n", 1347 dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n",
1348 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1348 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1349 1349
1350 /* NFS only supports OPEN on regular files */ 1350 /* NFS only supports OPEN on regular files */
1351 if ((open_flags & O_DIRECTORY)) { 1351 if ((open_flags & O_DIRECTORY)) {
1352 if (!d_unhashed(dentry)) { 1352 if (!d_unhashed(dentry)) {
1353 /* 1353 /*
1354 * Hashed negative dentry with O_DIRECTORY: dentry was 1354 * Hashed negative dentry with O_DIRECTORY: dentry was
1355 * revalidated and is fine, no need to perform lookup 1355 * revalidated and is fine, no need to perform lookup
1356 * again 1356 * again
1357 */ 1357 */
1358 return -ENOENT; 1358 return -ENOENT;
1359 } 1359 }
1360 goto no_open; 1360 goto no_open;
1361 } 1361 }
1362 1362
1363 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1363 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
1364 return -ENAMETOOLONG; 1364 return -ENAMETOOLONG;
1365 1365
1366 if (open_flags & O_CREAT) { 1366 if (open_flags & O_CREAT) {
1367 attr.ia_valid |= ATTR_MODE; 1367 attr.ia_valid |= ATTR_MODE;
1368 attr.ia_mode = mode & ~current_umask(); 1368 attr.ia_mode = mode & ~current_umask();
1369 } 1369 }
1370 if (open_flags & O_TRUNC) { 1370 if (open_flags & O_TRUNC) {
1371 attr.ia_valid |= ATTR_SIZE; 1371 attr.ia_valid |= ATTR_SIZE;
1372 attr.ia_size = 0; 1372 attr.ia_size = 0;
1373 } 1373 }
1374 1374
1375 ctx = create_nfs_open_context(dentry, open_flags); 1375 ctx = create_nfs_open_context(dentry, open_flags);
1376 err = PTR_ERR(ctx); 1376 err = PTR_ERR(ctx);
1377 if (IS_ERR(ctx)) 1377 if (IS_ERR(ctx))
1378 goto out; 1378 goto out;
1379 1379
1380 nfs_block_sillyrename(dentry->d_parent); 1380 nfs_block_sillyrename(dentry->d_parent);
1381 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); 1381 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1382 d_drop(dentry); 1382 d_drop(dentry);
1383 if (IS_ERR(inode)) { 1383 if (IS_ERR(inode)) {
1384 nfs_unblock_sillyrename(dentry->d_parent); 1384 nfs_unblock_sillyrename(dentry->d_parent);
1385 put_nfs_open_context(ctx); 1385 put_nfs_open_context(ctx);
1386 err = PTR_ERR(inode); 1386 err = PTR_ERR(inode);
1387 switch (err) { 1387 switch (err) {
1388 case -ENOENT: 1388 case -ENOENT:
1389 d_add(dentry, NULL); 1389 d_add(dentry, NULL);
1390 break; 1390 break;
1391 case -EISDIR: 1391 case -EISDIR:
1392 case -ENOTDIR: 1392 case -ENOTDIR:
1393 goto no_open; 1393 goto no_open;
1394 case -ELOOP: 1394 case -ELOOP:
1395 if (!(open_flags & O_NOFOLLOW)) 1395 if (!(open_flags & O_NOFOLLOW))
1396 goto no_open; 1396 goto no_open;
1397 break; 1397 break;
1398 /* case -EINVAL: */ 1398 /* case -EINVAL: */
1399 default: 1399 default:
1400 break; 1400 break;
1401 } 1401 }
1402 goto out; 1402 goto out;
1403 } 1403 }
1404 res = d_add_unique(dentry, inode); 1404 res = d_add_unique(dentry, inode);
1405 if (res != NULL) 1405 if (res != NULL)
1406 dentry = res; 1406 dentry = res;
1407 1407
1408 nfs_unblock_sillyrename(dentry->d_parent); 1408 nfs_unblock_sillyrename(dentry->d_parent);
1409 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1409 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1410 1410
1411 err = nfs_finish_open(ctx, dentry, file, open_flags, opened); 1411 err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
1412 1412
1413 dput(res); 1413 dput(res);
1414 out: 1414 out:
1415 return err; 1415 return err;
1416 1416
1417 no_open: 1417 no_open:
1418 res = nfs_lookup(dir, dentry, 0); 1418 res = nfs_lookup(dir, dentry, 0);
1419 err = PTR_ERR(res); 1419 err = PTR_ERR(res);
1420 if (IS_ERR(res)) 1420 if (IS_ERR(res))
1421 goto out; 1421 goto out;
1422 1422
1423 return finish_no_open(file, res); 1423 return finish_no_open(file, res);
1424 } 1424 }
1425 EXPORT_SYMBOL_GPL(nfs_atomic_open); 1425 EXPORT_SYMBOL_GPL(nfs_atomic_open);
1426 1426
1427 static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) 1427 static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1428 { 1428 {
1429 struct dentry *parent = NULL; 1429 struct dentry *parent = NULL;
1430 struct inode *inode; 1430 struct inode *inode;
1431 struct inode *dir; 1431 struct inode *dir;
1432 int ret = 0; 1432 int ret = 0;
1433 1433
1434 if (flags & LOOKUP_RCU) 1434 if (flags & LOOKUP_RCU)
1435 return -ECHILD; 1435 return -ECHILD;
1436 1436
1437 if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) 1437 if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
1438 goto no_open; 1438 goto no_open;
1439 if (d_mountpoint(dentry)) 1439 if (d_mountpoint(dentry))
1440 goto no_open; 1440 goto no_open;
1441 1441
1442 inode = dentry->d_inode; 1442 inode = dentry->d_inode;
1443 parent = dget_parent(dentry); 1443 parent = dget_parent(dentry);
1444 dir = parent->d_inode; 1444 dir = parent->d_inode;
1445 1445
1446 /* We can't create new files in nfs_open_revalidate(), so we 1446 /* We can't create new files in nfs_open_revalidate(), so we
1447 * optimize away revalidation of negative dentries. 1447 * optimize away revalidation of negative dentries.
1448 */ 1448 */
1449 if (inode == NULL) { 1449 if (inode == NULL) {
1450 if (!nfs_neg_need_reval(dir, dentry, flags)) 1450 if (!nfs_neg_need_reval(dir, dentry, flags))
1451 ret = 1; 1451 ret = 1;
1452 goto out; 1452 goto out;
1453 } 1453 }
1454 1454
1455 /* NFS only supports OPEN on regular files */ 1455 /* NFS only supports OPEN on regular files */
1456 if (!S_ISREG(inode->i_mode)) 1456 if (!S_ISREG(inode->i_mode))
1457 goto no_open_dput; 1457 goto no_open_dput;
1458 /* We cannot do exclusive creation on a positive dentry */ 1458 /* We cannot do exclusive creation on a positive dentry */
1459 if (flags & LOOKUP_EXCL) 1459 if (flags & LOOKUP_EXCL)
1460 goto no_open_dput; 1460 goto no_open_dput;
1461 1461
1462 /* Let f_op->open() actually open (and revalidate) the file */ 1462 /* Let f_op->open() actually open (and revalidate) the file */
1463 ret = 1; 1463 ret = 1;
1464 1464
1465 out: 1465 out:
1466 dput(parent); 1466 dput(parent);
1467 return ret; 1467 return ret;
1468 1468
1469 no_open_dput: 1469 no_open_dput:
1470 dput(parent); 1470 dput(parent);
1471 no_open: 1471 no_open:
1472 return nfs_lookup_revalidate(dentry, flags); 1472 return nfs_lookup_revalidate(dentry, flags);
1473 } 1473 }
1474 1474
1475 #endif /* CONFIG_NFS_V4 */ 1475 #endif /* CONFIG_NFS_V4 */
1476 1476
1477 /* 1477 /*
1478 * Code common to create, mkdir, and mknod. 1478 * Code common to create, mkdir, and mknod.
1479 */ 1479 */
1480 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, 1480 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1481 struct nfs_fattr *fattr) 1481 struct nfs_fattr *fattr)
1482 { 1482 {
1483 struct dentry *parent = dget_parent(dentry); 1483 struct dentry *parent = dget_parent(dentry);
1484 struct inode *dir = parent->d_inode; 1484 struct inode *dir = parent->d_inode;
1485 struct inode *inode; 1485 struct inode *inode;
1486 int error = -EACCES; 1486 int error = -EACCES;
1487 1487
1488 d_drop(dentry); 1488 d_drop(dentry);
1489 1489
1490 /* We may have been initialized further down */ 1490 /* We may have been initialized further down */
1491 if (dentry->d_inode) 1491 if (dentry->d_inode)
1492 goto out; 1492 goto out;
1493 if (fhandle->size == 0) { 1493 if (fhandle->size == 0) {
1494 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1494 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
1495 if (error) 1495 if (error)
1496 goto out_error; 1496 goto out_error;
1497 } 1497 }
1498 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1498 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1499 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1499 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1500 struct nfs_server *server = NFS_SB(dentry->d_sb); 1500 struct nfs_server *server = NFS_SB(dentry->d_sb);
1501 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); 1501 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
1502 if (error < 0) 1502 if (error < 0)
1503 goto out_error; 1503 goto out_error;
1504 } 1504 }
1505 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1505 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1506 error = PTR_ERR(inode); 1506 error = PTR_ERR(inode);
1507 if (IS_ERR(inode)) 1507 if (IS_ERR(inode))
1508 goto out_error; 1508 goto out_error;
1509 d_add(dentry, inode); 1509 d_add(dentry, inode);
1510 out: 1510 out:
1511 dput(parent); 1511 dput(parent);
1512 return 0; 1512 return 0;
1513 out_error: 1513 out_error:
1514 nfs_mark_for_revalidate(dir); 1514 nfs_mark_for_revalidate(dir);
1515 dput(parent); 1515 dput(parent);
1516 return error; 1516 return error;
1517 } 1517 }
1518 EXPORT_SYMBOL_GPL(nfs_instantiate); 1518 EXPORT_SYMBOL_GPL(nfs_instantiate);
1519 1519
1520 /* 1520 /*
1521 * Following a failed create operation, we drop the dentry rather 1521 * Following a failed create operation, we drop the dentry rather
1522 * than retain a negative dentry. This avoids a problem in the event 1522 * than retain a negative dentry. This avoids a problem in the event
1523 * that the operation succeeded on the server, but an error in the 1523 * that the operation succeeded on the server, but an error in the
1524 * reply path made it appear to have failed. 1524 * reply path made it appear to have failed.
1525 */ 1525 */
1526 int nfs_create(struct inode *dir, struct dentry *dentry, 1526 int nfs_create(struct inode *dir, struct dentry *dentry,
1527 umode_t mode, bool excl) 1527 umode_t mode, bool excl)
1528 { 1528 {
1529 struct iattr attr; 1529 struct iattr attr;
1530 int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; 1530 int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
1531 int error; 1531 int error;
1532 1532
1533 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1533 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1534 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1534 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1535 1535
1536 attr.ia_mode = mode; 1536 attr.ia_mode = mode;
1537 attr.ia_valid = ATTR_MODE; 1537 attr.ia_valid = ATTR_MODE;
1538 1538
1539 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); 1539 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
1540 if (error != 0) 1540 if (error != 0)
1541 goto out_err; 1541 goto out_err;
1542 return 0; 1542 return 0;
1543 out_err: 1543 out_err:
1544 d_drop(dentry); 1544 d_drop(dentry);
1545 return error; 1545 return error;
1546 } 1546 }
1547 EXPORT_SYMBOL_GPL(nfs_create); 1547 EXPORT_SYMBOL_GPL(nfs_create);
1548 1548
1549 /* 1549 /*
1550 * See comments for nfs_proc_create regarding failed operations. 1550 * See comments for nfs_proc_create regarding failed operations.
1551 */ 1551 */
1552 int 1552 int
1553 nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) 1553 nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
1554 { 1554 {
1555 struct iattr attr; 1555 struct iattr attr;
1556 int status; 1556 int status;
1557 1557
1558 dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", 1558 dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n",
1559 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1559 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1560 1560
1561 if (!new_valid_dev(rdev)) 1561 if (!new_valid_dev(rdev))
1562 return -EINVAL; 1562 return -EINVAL;
1563 1563
1564 attr.ia_mode = mode; 1564 attr.ia_mode = mode;
1565 attr.ia_valid = ATTR_MODE; 1565 attr.ia_valid = ATTR_MODE;
1566 1566
1567 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); 1567 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
1568 if (status != 0) 1568 if (status != 0)
1569 goto out_err; 1569 goto out_err;
1570 return 0; 1570 return 0;
1571 out_err: 1571 out_err:
1572 d_drop(dentry); 1572 d_drop(dentry);
1573 return status; 1573 return status;
1574 } 1574 }
1575 EXPORT_SYMBOL_GPL(nfs_mknod); 1575 EXPORT_SYMBOL_GPL(nfs_mknod);
1576 1576
1577 /* 1577 /*
1578 * See comments for nfs_proc_create regarding failed operations. 1578 * See comments for nfs_proc_create regarding failed operations.
1579 */ 1579 */
1580 int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 1580 int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1581 { 1581 {
1582 struct iattr attr; 1582 struct iattr attr;
1583 int error; 1583 int error;
1584 1584
1585 dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", 1585 dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n",
1586 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1586 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1587 1587
1588 attr.ia_valid = ATTR_MODE; 1588 attr.ia_valid = ATTR_MODE;
1589 attr.ia_mode = mode | S_IFDIR; 1589 attr.ia_mode = mode | S_IFDIR;
1590 1590
1591 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); 1591 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
1592 if (error != 0) 1592 if (error != 0)
1593 goto out_err; 1593 goto out_err;
1594 return 0; 1594 return 0;
1595 out_err: 1595 out_err:
1596 d_drop(dentry); 1596 d_drop(dentry);
1597 return error; 1597 return error;
1598 } 1598 }
1599 EXPORT_SYMBOL_GPL(nfs_mkdir); 1599 EXPORT_SYMBOL_GPL(nfs_mkdir);
1600 1600
1601 static void nfs_dentry_handle_enoent(struct dentry *dentry) 1601 static void nfs_dentry_handle_enoent(struct dentry *dentry)
1602 { 1602 {
1603 if (dentry->d_inode != NULL && !d_unhashed(dentry)) 1603 if (dentry->d_inode != NULL && !d_unhashed(dentry))
1604 d_delete(dentry); 1604 d_delete(dentry);
1605 } 1605 }
1606 1606
1607 int nfs_rmdir(struct inode *dir, struct dentry *dentry) 1607 int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1608 { 1608 {
1609 int error; 1609 int error;
1610 1610
1611 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", 1611 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
1612 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1612 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1613 1613
1614 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); 1614 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
1615 /* Ensure the VFS deletes this inode */ 1615 /* Ensure the VFS deletes this inode */
1616 if (error == 0 && dentry->d_inode != NULL) 1616 if (error == 0 && dentry->d_inode != NULL)
1617 clear_nlink(dentry->d_inode); 1617 clear_nlink(dentry->d_inode);
1618 else if (error == -ENOENT) 1618 else if (error == -ENOENT)
1619 nfs_dentry_handle_enoent(dentry); 1619 nfs_dentry_handle_enoent(dentry);
1620 1620
1621 return error; 1621 return error;
1622 } 1622 }
1623 EXPORT_SYMBOL_GPL(nfs_rmdir); 1623 EXPORT_SYMBOL_GPL(nfs_rmdir);
1624 1624
1625 /* 1625 /*
1626 * Remove a file after making sure there are no pending writes, 1626 * Remove a file after making sure there are no pending writes,
1627 * and after checking that the file has only one user. 1627 * and after checking that the file has only one user.
1628 * 1628 *
1629 * We invalidate the attribute cache and free the inode prior to the operation 1629 * We invalidate the attribute cache and free the inode prior to the operation
1630 * to avoid possible races if the server reuses the inode. 1630 * to avoid possible races if the server reuses the inode.
1631 */ 1631 */
1632 static int nfs_safe_remove(struct dentry *dentry) 1632 static int nfs_safe_remove(struct dentry *dentry)
1633 { 1633 {
1634 struct inode *dir = dentry->d_parent->d_inode; 1634 struct inode *dir = dentry->d_parent->d_inode;
1635 struct inode *inode = dentry->d_inode; 1635 struct inode *inode = dentry->d_inode;
1636 int error = -EBUSY; 1636 int error = -EBUSY;
1637 1637
1638 dfprintk(VFS, "NFS: safe_remove(%s/%s)\n", 1638 dfprintk(VFS, "NFS: safe_remove(%s/%s)\n",
1639 dentry->d_parent->d_name.name, dentry->d_name.name); 1639 dentry->d_parent->d_name.name, dentry->d_name.name);
1640 1640
1641 /* If the dentry was sillyrenamed, we simply call d_delete() */ 1641 /* If the dentry was sillyrenamed, we simply call d_delete() */
1642 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1642 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1643 error = 0; 1643 error = 0;
1644 goto out; 1644 goto out;
1645 } 1645 }
1646 1646
1647 if (inode != NULL) { 1647 if (inode != NULL) {
1648 NFS_PROTO(inode)->return_delegation(inode); 1648 NFS_PROTO(inode)->return_delegation(inode);
1649 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1649 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1650 /* The VFS may want to delete this inode */ 1650 /* The VFS may want to delete this inode */
1651 if (error == 0) 1651 if (error == 0)
1652 nfs_drop_nlink(inode); 1652 nfs_drop_nlink(inode);
1653 nfs_mark_for_revalidate(inode); 1653 nfs_mark_for_revalidate(inode);
1654 } else 1654 } else
1655 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1655 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1656 if (error == -ENOENT) 1656 if (error == -ENOENT)
1657 nfs_dentry_handle_enoent(dentry); 1657 nfs_dentry_handle_enoent(dentry);
1658 out: 1658 out:
1659 return error; 1659 return error;
1660 } 1660 }
1661 1661
1662 /* We do silly rename. In case sillyrename() returns -EBUSY, the inode 1662 /* We do silly rename. In case sillyrename() returns -EBUSY, the inode
1663 * belongs to an active ".nfs..." file and we return -EBUSY. 1663 * belongs to an active ".nfs..." file and we return -EBUSY.
1664 * 1664 *
1665 * If sillyrename() returns 0, we do nothing, otherwise we unlink. 1665 * If sillyrename() returns 0, we do nothing, otherwise we unlink.
1666 */ 1666 */
1667 int nfs_unlink(struct inode *dir, struct dentry *dentry) 1667 int nfs_unlink(struct inode *dir, struct dentry *dentry)
1668 { 1668 {
1669 int error; 1669 int error;
1670 int need_rehash = 0; 1670 int need_rehash = 0;
1671 1671
1672 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1672 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1673 dir->i_ino, dentry->d_name.name); 1673 dir->i_ino, dentry->d_name.name);
1674 1674
1675 spin_lock(&dentry->d_lock); 1675 spin_lock(&dentry->d_lock);
1676 if (dentry->d_count > 1) { 1676 if (dentry->d_count > 1) {
1677 spin_unlock(&dentry->d_lock); 1677 spin_unlock(&dentry->d_lock);
1678 /* Start asynchronous writeout of the inode */ 1678 /* Start asynchronous writeout of the inode */
1679 write_inode_now(dentry->d_inode, 0); 1679 write_inode_now(dentry->d_inode, 0);
1680 error = nfs_sillyrename(dir, dentry); 1680 error = nfs_sillyrename(dir, dentry);
1681 return error; 1681 return error;
1682 } 1682 }
1683 if (!d_unhashed(dentry)) { 1683 if (!d_unhashed(dentry)) {
1684 __d_drop(dentry); 1684 __d_drop(dentry);
1685 need_rehash = 1; 1685 need_rehash = 1;
1686 } 1686 }
1687 spin_unlock(&dentry->d_lock); 1687 spin_unlock(&dentry->d_lock);
1688 error = nfs_safe_remove(dentry); 1688 error = nfs_safe_remove(dentry);
1689 if (!error || error == -ENOENT) { 1689 if (!error || error == -ENOENT) {
1690 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1690 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1691 } else if (need_rehash) 1691 } else if (need_rehash)
1692 d_rehash(dentry); 1692 d_rehash(dentry);
1693 return error; 1693 return error;
1694 } 1694 }
1695 EXPORT_SYMBOL_GPL(nfs_unlink); 1695 EXPORT_SYMBOL_GPL(nfs_unlink);
1696 1696
1697 /* 1697 /*
1698 * To create a symbolic link, most file systems instantiate a new inode, 1698 * To create a symbolic link, most file systems instantiate a new inode,
1699 * add a page to it containing the path, then write it out to the disk 1699 * add a page to it containing the path, then write it out to the disk
1700 * using prepare_write/commit_write. 1700 * using prepare_write/commit_write.
1701 * 1701 *
1702 * Unfortunately the NFS client can't create the in-core inode first 1702 * Unfortunately the NFS client can't create the in-core inode first
1703 * because it needs a file handle to create an in-core inode (see 1703 * because it needs a file handle to create an in-core inode (see
1704 * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the 1704 * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the
1705 * symlink request has completed on the server. 1705 * symlink request has completed on the server.
1706 * 1706 *
1707 * So instead we allocate a raw page, copy the symname into it, then do 1707 * So instead we allocate a raw page, copy the symname into it, then do
1708 * the SYMLINK request with the page as the buffer. If it succeeds, we 1708 * the SYMLINK request with the page as the buffer. If it succeeds, we
1709 * now have a new file handle and can instantiate an in-core NFS inode 1709 * now have a new file handle and can instantiate an in-core NFS inode
1710 * and move the raw page into its mapping. 1710 * and move the raw page into its mapping.
1711 */ 1711 */
1712 int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 1712 int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1713 { 1713 {
1714 struct pagevec lru_pvec; 1714 struct pagevec lru_pvec;
1715 struct page *page; 1715 struct page *page;
1716 char *kaddr; 1716 char *kaddr;
1717 struct iattr attr; 1717 struct iattr attr;
1718 unsigned int pathlen = strlen(symname); 1718 unsigned int pathlen = strlen(symname);
1719 int error; 1719 int error;
1720 1720
1721 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, 1721 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id,
1722 dir->i_ino, dentry->d_name.name, symname); 1722 dir->i_ino, dentry->d_name.name, symname);
1723 1723
1724 if (pathlen > PAGE_SIZE) 1724 if (pathlen > PAGE_SIZE)
1725 return -ENAMETOOLONG; 1725 return -ENAMETOOLONG;
1726 1726
1727 attr.ia_mode = S_IFLNK | S_IRWXUGO; 1727 attr.ia_mode = S_IFLNK | S_IRWXUGO;
1728 attr.ia_valid = ATTR_MODE; 1728 attr.ia_valid = ATTR_MODE;
1729 1729
1730 page = alloc_page(GFP_HIGHUSER); 1730 page = alloc_page(GFP_HIGHUSER);
1731 if (!page) 1731 if (!page)
1732 return -ENOMEM; 1732 return -ENOMEM;
1733 1733
1734 kaddr = kmap_atomic(page); 1734 kaddr = kmap_atomic(page);
1735 memcpy(kaddr, symname, pathlen); 1735 memcpy(kaddr, symname, pathlen);
1736 if (pathlen < PAGE_SIZE) 1736 if (pathlen < PAGE_SIZE)
1737 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); 1737 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
1738 kunmap_atomic(kaddr); 1738 kunmap_atomic(kaddr);
1739 1739
1740 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); 1740 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
1741 if (error != 0) { 1741 if (error != 0) {
1742 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", 1742 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
1743 dir->i_sb->s_id, dir->i_ino, 1743 dir->i_sb->s_id, dir->i_ino,
1744 dentry->d_name.name, symname, error); 1744 dentry->d_name.name, symname, error);
1745 d_drop(dentry); 1745 d_drop(dentry);
1746 __free_page(page); 1746 __free_page(page);
1747 return error; 1747 return error;
1748 } 1748 }
1749 1749
1750 /* 1750 /*
1751 * No big deal if we can't add this page to the page cache here. 1751 * No big deal if we can't add this page to the page cache here.
1752 * READLINK will get the missing page from the server if needed. 1752 * READLINK will get the missing page from the server if needed.
1753 */ 1753 */
1754 pagevec_init(&lru_pvec, 0); 1754 pagevec_init(&lru_pvec, 0);
1755 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, 1755 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
1756 GFP_KERNEL)) { 1756 GFP_KERNEL)) {
1757 pagevec_add(&lru_pvec, page); 1757 pagevec_add(&lru_pvec, page);
1758 pagevec_lru_add_file(&lru_pvec); 1758 pagevec_lru_add_file(&lru_pvec);
1759 SetPageUptodate(page); 1759 SetPageUptodate(page);
1760 unlock_page(page); 1760 unlock_page(page);
1761 } else 1761 } else
1762 __free_page(page); 1762 __free_page(page);
1763 1763
1764 return 0; 1764 return 0;
1765 } 1765 }
1766 EXPORT_SYMBOL_GPL(nfs_symlink); 1766 EXPORT_SYMBOL_GPL(nfs_symlink);
1767 1767
1768 int 1768 int
1769 nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 1769 nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1770 { 1770 {
1771 struct inode *inode = old_dentry->d_inode; 1771 struct inode *inode = old_dentry->d_inode;
1772 int error; 1772 int error;
1773 1773
1774 dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n", 1774 dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n",
1775 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1775 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1776 dentry->d_parent->d_name.name, dentry->d_name.name); 1776 dentry->d_parent->d_name.name, dentry->d_name.name);
1777 1777
1778 NFS_PROTO(inode)->return_delegation(inode); 1778 NFS_PROTO(inode)->return_delegation(inode);
1779 1779
1780 d_drop(dentry); 1780 d_drop(dentry);
1781 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1781 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1782 if (error == 0) { 1782 if (error == 0) {
1783 ihold(inode); 1783 ihold(inode);
1784 d_add(dentry, inode); 1784 d_add(dentry, inode);
1785 } 1785 }
1786 return error; 1786 return error;
1787 } 1787 }
1788 EXPORT_SYMBOL_GPL(nfs_link); 1788 EXPORT_SYMBOL_GPL(nfs_link);
1789 1789
1790 /* 1790 /*
1791 * RENAME 1791 * RENAME
1792 * FIXME: Some nfsds, like the Linux user space nfsd, may generate a 1792 * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
1793 * different file handle for the same inode after a rename (e.g. when 1793 * different file handle for the same inode after a rename (e.g. when
1794 * moving to a different directory). A fail-safe method to do so would 1794 * moving to a different directory). A fail-safe method to do so would
1795 * be to look up old_dir/old_name, create a link to new_dir/new_name and 1795 * be to look up old_dir/old_name, create a link to new_dir/new_name and
1796 * rename the old file using the sillyrename stuff. This way, the original 1796 * rename the old file using the sillyrename stuff. This way, the original
1797 * file in old_dir will go away when the last process iput()s the inode. 1797 * file in old_dir will go away when the last process iput()s the inode.
1798 * 1798 *
1799 * FIXED. 1799 * FIXED.
1800 * 1800 *
1801 * It actually works quite well. One needs to have the possibility for 1801 * It actually works quite well. One needs to have the possibility for
1802 * at least one ".nfs..." file in each directory the file ever gets 1802 * at least one ".nfs..." file in each directory the file ever gets
1803 * moved or linked to, which happens automagically with the new 1803 * moved or linked to, which happens automagically with the new
1804 * implementation that depends only on the dcache instead of 1804 * implementation that depends only on the dcache instead of
1805 * the inode layer. 1805 * the inode layer.
1806 * 1806 *
1807 * Unfortunately, things are a little more complicated than indicated 1807 * Unfortunately, things are a little more complicated than indicated
1808 * above. For a cross-directory move, we want to make sure we can get 1808 * above. For a cross-directory move, we want to make sure we can get
1809 * rid of the old inode after the operation. This means there must be 1809 * rid of the old inode after the operation. This means there must be
1810 * no pending writes (if it's a file), and the use count must be 1. 1810 * no pending writes (if it's a file), and the use count must be 1.
1811 * If these conditions are met, we can drop the dentries before doing 1811 * If these conditions are met, we can drop the dentries before doing
1812 * the rename. 1812 * the rename.
1813 */ 1813 */
1814 int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, 1814 int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1815 struct inode *new_dir, struct dentry *new_dentry) 1815 struct inode *new_dir, struct dentry *new_dentry)
1816 { 1816 {
1817 struct inode *old_inode = old_dentry->d_inode; 1817 struct inode *old_inode = old_dentry->d_inode;
1818 struct inode *new_inode = new_dentry->d_inode; 1818 struct inode *new_inode = new_dentry->d_inode;
1819 struct dentry *dentry = NULL, *rehash = NULL; 1819 struct dentry *dentry = NULL, *rehash = NULL;
1820 int error = -EBUSY; 1820 int error = -EBUSY;
1821 1821
1822 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1822 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1823 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1823 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1824 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1824 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1825 new_dentry->d_count); 1825 new_dentry->d_count);
1826 1826
1827 /* 1827 /*
1828 * For non-directories, check whether the target is busy and if so, 1828 * For non-directories, check whether the target is busy and if so,
1829 * make a copy of the dentry and then do a silly-rename. If the 1829 * make a copy of the dentry and then do a silly-rename. If the
1830 * silly-rename succeeds, the copied dentry is hashed and becomes 1830 * silly-rename succeeds, the copied dentry is hashed and becomes
1831 * the new target. 1831 * the new target.
1832 */ 1832 */
1833 if (new_inode && !S_ISDIR(new_inode->i_mode)) { 1833 if (new_inode && !S_ISDIR(new_inode->i_mode)) {
1834 /* 1834 /*
1835 * To prevent any new references to the target during the 1835 * To prevent any new references to the target during the
1836 * rename, we unhash the dentry in advance. 1836 * rename, we unhash the dentry in advance.
1837 */ 1837 */
1838 if (!d_unhashed(new_dentry)) { 1838 if (!d_unhashed(new_dentry)) {
1839 d_drop(new_dentry); 1839 d_drop(new_dentry);
1840 rehash = new_dentry; 1840 rehash = new_dentry;
1841 } 1841 }
1842 1842
1843 if (new_dentry->d_count > 2) { 1843 if (new_dentry->d_count > 2) {
1844 int err; 1844 int err;
1845 1845
1846 /* copy the target dentry's name */ 1846 /* copy the target dentry's name */
1847 dentry = d_alloc(new_dentry->d_parent, 1847 dentry = d_alloc(new_dentry->d_parent,
1848 &new_dentry->d_name); 1848 &new_dentry->d_name);
1849 if (!dentry) 1849 if (!dentry)
1850 goto out; 1850 goto out;
1851 1851
1852 /* silly-rename the existing target ... */ 1852 /* silly-rename the existing target ... */
1853 err = nfs_sillyrename(new_dir, new_dentry); 1853 err = nfs_sillyrename(new_dir, new_dentry);
1854 if (err) 1854 if (err)
1855 goto out; 1855 goto out;
1856 1856
1857 new_dentry = dentry; 1857 new_dentry = dentry;
1858 rehash = NULL; 1858 rehash = NULL;
1859 new_inode = NULL; 1859 new_inode = NULL;
1860 } 1860 }
1861 } 1861 }
1862 1862
1863 NFS_PROTO(old_inode)->return_delegation(old_inode); 1863 NFS_PROTO(old_inode)->return_delegation(old_inode);
1864 if (new_inode != NULL) 1864 if (new_inode != NULL)
1865 NFS_PROTO(new_inode)->return_delegation(new_inode); 1865 NFS_PROTO(new_inode)->return_delegation(new_inode);
1866 1866
1867 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, 1867 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
1868 new_dir, &new_dentry->d_name); 1868 new_dir, &new_dentry->d_name);
1869 nfs_mark_for_revalidate(old_inode); 1869 nfs_mark_for_revalidate(old_inode);
1870 out: 1870 out:
1871 if (rehash) 1871 if (rehash)
1872 d_rehash(rehash); 1872 d_rehash(rehash);
1873 if (!error) { 1873 if (!error) {
1874 if (new_inode != NULL) 1874 if (new_inode != NULL)
1875 nfs_drop_nlink(new_inode); 1875 nfs_drop_nlink(new_inode);
1876 d_move(old_dentry, new_dentry); 1876 d_move(old_dentry, new_dentry);
1877 nfs_set_verifier(new_dentry, 1877 nfs_set_verifier(new_dentry,
1878 nfs_save_change_attribute(new_dir)); 1878 nfs_save_change_attribute(new_dir));
1879 } else if (error == -ENOENT) 1879 } else if (error == -ENOENT)
1880 nfs_dentry_handle_enoent(old_dentry); 1880 nfs_dentry_handle_enoent(old_dentry);
1881 1881
1882 /* new dentry created? */ 1882 /* new dentry created? */
1883 if (dentry) 1883 if (dentry)
1884 dput(dentry); 1884 dput(dentry);
1885 return error; 1885 return error;
1886 } 1886 }
1887 EXPORT_SYMBOL_GPL(nfs_rename); 1887 EXPORT_SYMBOL_GPL(nfs_rename);
1888 1888
1889 static DEFINE_SPINLOCK(nfs_access_lru_lock); 1889 static DEFINE_SPINLOCK(nfs_access_lru_lock);
1890 static LIST_HEAD(nfs_access_lru_list); 1890 static LIST_HEAD(nfs_access_lru_list);
1891 static atomic_long_t nfs_access_nr_entries; 1891 static atomic_long_t nfs_access_nr_entries;
1892 1892
1893 static void nfs_access_free_entry(struct nfs_access_entry *entry) 1893 static void nfs_access_free_entry(struct nfs_access_entry *entry)
1894 { 1894 {
1895 put_rpccred(entry->cred); 1895 put_rpccred(entry->cred);
1896 kfree(entry); 1896 kfree(entry);
1897 smp_mb__before_atomic_dec(); 1897 smp_mb__before_atomic_dec();
1898 atomic_long_dec(&nfs_access_nr_entries); 1898 atomic_long_dec(&nfs_access_nr_entries);
1899 smp_mb__after_atomic_dec(); 1899 smp_mb__after_atomic_dec();
1900 } 1900 }
1901 1901
1902 static void nfs_access_free_list(struct list_head *head) 1902 static void nfs_access_free_list(struct list_head *head)
1903 { 1903 {
1904 struct nfs_access_entry *cache; 1904 struct nfs_access_entry *cache;
1905 1905
1906 while (!list_empty(head)) { 1906 while (!list_empty(head)) {
1907 cache = list_entry(head->next, struct nfs_access_entry, lru); 1907 cache = list_entry(head->next, struct nfs_access_entry, lru);
1908 list_del(&cache->lru); 1908 list_del(&cache->lru);
1909 nfs_access_free_entry(cache); 1909 nfs_access_free_entry(cache);
1910 } 1910 }
1911 } 1911 }
1912 1912
1913 int nfs_access_cache_shrinker(struct shrinker *shrink, 1913 int nfs_access_cache_shrinker(struct shrinker *shrink,
1914 struct shrink_control *sc) 1914 struct shrink_control *sc)
1915 { 1915 {
1916 LIST_HEAD(head); 1916 LIST_HEAD(head);
1917 struct nfs_inode *nfsi, *next; 1917 struct nfs_inode *nfsi, *next;
1918 struct nfs_access_entry *cache; 1918 struct nfs_access_entry *cache;
1919 int nr_to_scan = sc->nr_to_scan; 1919 int nr_to_scan = sc->nr_to_scan;
1920 gfp_t gfp_mask = sc->gfp_mask; 1920 gfp_t gfp_mask = sc->gfp_mask;
1921 1921
1922 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 1922 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1923 return (nr_to_scan == 0) ? 0 : -1; 1923 return (nr_to_scan == 0) ? 0 : -1;
1924 1924
1925 spin_lock(&nfs_access_lru_lock); 1925 spin_lock(&nfs_access_lru_lock);
1926 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { 1926 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
1927 struct inode *inode; 1927 struct inode *inode;
1928 1928
1929 if (nr_to_scan-- == 0) 1929 if (nr_to_scan-- == 0)
1930 break; 1930 break;
1931 inode = &nfsi->vfs_inode; 1931 inode = &nfsi->vfs_inode;
1932 spin_lock(&inode->i_lock); 1932 spin_lock(&inode->i_lock);
1933 if (list_empty(&nfsi->access_cache_entry_lru)) 1933 if (list_empty(&nfsi->access_cache_entry_lru))
1934 goto remove_lru_entry; 1934 goto remove_lru_entry;
1935 cache = list_entry(nfsi->access_cache_entry_lru.next, 1935 cache = list_entry(nfsi->access_cache_entry_lru.next,
1936 struct nfs_access_entry, lru); 1936 struct nfs_access_entry, lru);
1937 list_move(&cache->lru, &head); 1937 list_move(&cache->lru, &head);
1938 rb_erase(&cache->rb_node, &nfsi->access_cache); 1938 rb_erase(&cache->rb_node, &nfsi->access_cache);
1939 if (!list_empty(&nfsi->access_cache_entry_lru)) 1939 if (!list_empty(&nfsi->access_cache_entry_lru))
1940 list_move_tail(&nfsi->access_cache_inode_lru, 1940 list_move_tail(&nfsi->access_cache_inode_lru,
1941 &nfs_access_lru_list); 1941 &nfs_access_lru_list);
1942 else { 1942 else {
1943 remove_lru_entry: 1943 remove_lru_entry:
1944 list_del_init(&nfsi->access_cache_inode_lru); 1944 list_del_init(&nfsi->access_cache_inode_lru);
1945 smp_mb__before_clear_bit(); 1945 smp_mb__before_clear_bit();
1946 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1946 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1947 smp_mb__after_clear_bit(); 1947 smp_mb__after_clear_bit();
1948 } 1948 }
1949 spin_unlock(&inode->i_lock); 1949 spin_unlock(&inode->i_lock);
1950 } 1950 }
1951 spin_unlock(&nfs_access_lru_lock); 1951 spin_unlock(&nfs_access_lru_lock);
1952 nfs_access_free_list(&head); 1952 nfs_access_free_list(&head);
1953 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 1953 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1954 } 1954 }
1955 1955
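The shrinker's return value above is the usual "objects we could free" estimate scaled by sysctl_vfs_cache_pressure: for example, with the default vfs_cache_pressure of 100 and 10,000 cached access entries it reports (10000 / 100) * 100 = 10,000 freeable entries, while lowering the sysctl to 50 would halve that to 5,000 and make the access cache shrink more slowly relative to other caches.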
1956 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) 1956 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
1957 { 1957 {
1958 struct rb_root *root_node = &nfsi->access_cache; 1958 struct rb_root *root_node = &nfsi->access_cache;
1959 struct rb_node *n; 1959 struct rb_node *n;
1960 struct nfs_access_entry *entry; 1960 struct nfs_access_entry *entry;
1961 1961
1962 /* Unhook entries from the cache */ 1962 /* Unhook entries from the cache */
1963 while ((n = rb_first(root_node)) != NULL) { 1963 while ((n = rb_first(root_node)) != NULL) {
1964 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1964 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1965 rb_erase(n, root_node); 1965 rb_erase(n, root_node);
1966 list_move(&entry->lru, head); 1966 list_move(&entry->lru, head);
1967 } 1967 }
1968 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1968 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1969 } 1969 }
1970 1970
1971 void nfs_access_zap_cache(struct inode *inode) 1971 void nfs_access_zap_cache(struct inode *inode)
1972 { 1972 {
1973 LIST_HEAD(head); 1973 LIST_HEAD(head);
1974 1974
1975 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) 1975 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
1976 return; 1976 return;
1977 /* Remove from global LRU list */ 1977 /* Remove from global LRU list */
1978 spin_lock(&nfs_access_lru_lock); 1978 spin_lock(&nfs_access_lru_lock);
1979 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) 1979 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1980 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1980 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1981 1981
1982 spin_lock(&inode->i_lock); 1982 spin_lock(&inode->i_lock);
1983 __nfs_access_zap_cache(NFS_I(inode), &head); 1983 __nfs_access_zap_cache(NFS_I(inode), &head);
1984 spin_unlock(&inode->i_lock); 1984 spin_unlock(&inode->i_lock);
1985 spin_unlock(&nfs_access_lru_lock); 1985 spin_unlock(&nfs_access_lru_lock);
1986 nfs_access_free_list(&head); 1986 nfs_access_free_list(&head);
1987 } 1987 }
1988 EXPORT_SYMBOL_GPL(nfs_access_zap_cache); 1988 EXPORT_SYMBOL_GPL(nfs_access_zap_cache);
1989 1989
1990 static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) 1990 static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
1991 { 1991 {
1992 struct rb_node *n = NFS_I(inode)->access_cache.rb_node; 1992 struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
1993 struct nfs_access_entry *entry; 1993 struct nfs_access_entry *entry;
1994 1994
1995 while (n != NULL) { 1995 while (n != NULL) {
1996 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1996 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1997 1997
1998 if (cred < entry->cred) 1998 if (cred < entry->cred)
1999 n = n->rb_left; 1999 n = n->rb_left;
2000 else if (cred > entry->cred) 2000 else if (cred > entry->cred)
2001 n = n->rb_right; 2001 n = n->rb_right;
2002 else 2002 else
2003 return entry; 2003 return entry;
2004 } 2004 }
2005 return NULL; 2005 return NULL;
2006 } 2006 }
2007 2007
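nfs_access_search_rbtree() above is a textbook rbtree descent keyed on the raw rpc_cred pointer value. A self-contained sketch of the same keyed descent over a plain binary search tree (struct node and bst_search() are illustrative stand-ins, not kernel API; the pointer comparison goes through uintptr_t, which the kernel version does implicitly):

#include <stdint.h>
#include <stddef.h>

struct node {
	const void *key;		/* plays the role of entry->cred */
	struct node *left, *right;
};

/* Descend left or right on pointer order until the key matches. */
static struct node *bst_search(struct node *n, const void *key)
{
	while (n != NULL) {
		if ((uintptr_t)key < (uintptr_t)n->key)
			n = n->left;
		else if ((uintptr_t)key > (uintptr_t)n->key)
			n = n->right;
		else
			return n;
	}
	return NULL;
}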
2008 static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) 2008 static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
2009 { 2009 {
2010 struct nfs_inode *nfsi = NFS_I(inode); 2010 struct nfs_inode *nfsi = NFS_I(inode);
2011 struct nfs_access_entry *cache; 2011 struct nfs_access_entry *cache;
2012 int err = -ENOENT; 2012 int err = -ENOENT;
2013 2013
2014 spin_lock(&inode->i_lock); 2014 spin_lock(&inode->i_lock);
2015 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) 2015 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
2016 goto out_zap; 2016 goto out_zap;
2017 cache = nfs_access_search_rbtree(inode, cred); 2017 cache = nfs_access_search_rbtree(inode, cred);
2018 if (cache == NULL) 2018 if (cache == NULL)
2019 goto out; 2019 goto out;
2020 if (!nfs_have_delegated_attributes(inode) && 2020 if (!nfs_have_delegated_attributes(inode) &&
2021 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 2021 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
2022 goto out_stale; 2022 goto out_stale;
2023 res->jiffies = cache->jiffies; 2023 res->jiffies = cache->jiffies;
2024 res->cred = cache->cred; 2024 res->cred = cache->cred;
2025 res->mask = cache->mask; 2025 res->mask = cache->mask;
2026 list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); 2026 list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
2027 err = 0; 2027 err = 0;
2028 out: 2028 out:
2029 spin_unlock(&inode->i_lock); 2029 spin_unlock(&inode->i_lock);
2030 return err; 2030 return err;
2031 out_stale: 2031 out_stale:
2032 rb_erase(&cache->rb_node, &nfsi->access_cache); 2032 rb_erase(&cache->rb_node, &nfsi->access_cache);
2033 list_del(&cache->lru); 2033 list_del(&cache->lru);
2034 spin_unlock(&inode->i_lock); 2034 spin_unlock(&inode->i_lock);
2035 nfs_access_free_entry(cache); 2035 nfs_access_free_entry(cache);
2036 return -ENOENT; 2036 return -ENOENT;
2037 out_zap: 2037 out_zap:
2038 spin_unlock(&inode->i_lock); 2038 spin_unlock(&inode->i_lock);
2039 nfs_access_zap_cache(inode); 2039 nfs_access_zap_cache(inode);
2040 return -ENOENT; 2040 return -ENOENT;
2041 } 2041 }
2042 2042
2043 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) 2043 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
2044 { 2044 {
2045 struct nfs_inode *nfsi = NFS_I(inode); 2045 struct nfs_inode *nfsi = NFS_I(inode);
2046 struct rb_root *root_node = &nfsi->access_cache; 2046 struct rb_root *root_node = &nfsi->access_cache;
2047 struct rb_node **p = &root_node->rb_node; 2047 struct rb_node **p = &root_node->rb_node;
2048 struct rb_node *parent = NULL; 2048 struct rb_node *parent = NULL;
2049 struct nfs_access_entry *entry; 2049 struct nfs_access_entry *entry;
2050 2050
2051 spin_lock(&inode->i_lock); 2051 spin_lock(&inode->i_lock);
2052 while (*p != NULL) { 2052 while (*p != NULL) {
2053 parent = *p; 2053 parent = *p;
2054 entry = rb_entry(parent, struct nfs_access_entry, rb_node); 2054 entry = rb_entry(parent, struct nfs_access_entry, rb_node);
2055 2055
2056 if (set->cred < entry->cred) 2056 if (set->cred < entry->cred)
2057 p = &parent->rb_left; 2057 p = &parent->rb_left;
2058 else if (set->cred > entry->cred) 2058 else if (set->cred > entry->cred)
2059 p = &parent->rb_right; 2059 p = &parent->rb_right;
2060 else 2060 else
2061 goto found; 2061 goto found;
2062 } 2062 }
2063 rb_link_node(&set->rb_node, parent, p); 2063 rb_link_node(&set->rb_node, parent, p);
2064 rb_insert_color(&set->rb_node, root_node); 2064 rb_insert_color(&set->rb_node, root_node);
2065 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); 2065 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
2066 spin_unlock(&inode->i_lock); 2066 spin_unlock(&inode->i_lock);
2067 return; 2067 return;
2068 found: 2068 found:
2069 rb_replace_node(parent, &set->rb_node, root_node); 2069 rb_replace_node(parent, &set->rb_node, root_node);
2070 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); 2070 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
2071 list_del(&entry->lru); 2071 list_del(&entry->lru);
2072 spin_unlock(&inode->i_lock); 2072 spin_unlock(&inode->i_lock);
2073 nfs_access_free_entry(entry); 2073 nfs_access_free_entry(entry);
2074 } 2074 }
2075 2075
2076 void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 2076 void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2077 { 2077 {
2078 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); 2078 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
2079 if (cache == NULL) 2079 if (cache == NULL)
2080 return; 2080 return;
2081 RB_CLEAR_NODE(&cache->rb_node); 2081 RB_CLEAR_NODE(&cache->rb_node);
2082 cache->jiffies = set->jiffies; 2082 cache->jiffies = set->jiffies;
2083 cache->cred = get_rpccred(set->cred); 2083 cache->cred = get_rpccred(set->cred);
2084 cache->mask = set->mask; 2084 cache->mask = set->mask;
2085 2085
2086 nfs_access_add_rbtree(inode, cache); 2086 nfs_access_add_rbtree(inode, cache);
2087 2087
2088 /* Update accounting */ 2088 /* Update accounting */
2089 smp_mb__before_atomic_inc(); 2089 smp_mb__before_atomic_inc();
2090 atomic_long_inc(&nfs_access_nr_entries); 2090 atomic_long_inc(&nfs_access_nr_entries);
2091 smp_mb__after_atomic_inc(); 2091 smp_mb__after_atomic_inc();
2092 2092
2093 /* Add inode to global LRU list */ 2093 /* Add inode to global LRU list */
2094 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 2094 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
2095 spin_lock(&nfs_access_lru_lock); 2095 spin_lock(&nfs_access_lru_lock);
2096 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) 2096 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
2097 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, 2097 list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
2098 &nfs_access_lru_list); 2098 &nfs_access_lru_list);
2099 spin_unlock(&nfs_access_lru_lock); 2099 spin_unlock(&nfs_access_lru_lock);
2100 } 2100 }
2101 } 2101 }
2102 EXPORT_SYMBOL_GPL(nfs_access_add_cache); 2102 EXPORT_SYMBOL_GPL(nfs_access_add_cache);
2103 2103
2104 void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) 2104 void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
2105 { 2105 {
2106 entry->mask = 0; 2106 entry->mask = 0;
2107 if (access_result & NFS4_ACCESS_READ) 2107 if (access_result & NFS4_ACCESS_READ)
2108 entry->mask |= MAY_READ; 2108 entry->mask |= MAY_READ;
2109 if (access_result & 2109 if (access_result &
2110 (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) 2110 (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
2111 entry->mask |= MAY_WRITE; 2111 entry->mask |= MAY_WRITE;
2112 if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) 2112 if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2113 entry->mask |= MAY_EXEC; 2113 entry->mask |= MAY_EXEC;
2114 } 2114 }
2115 EXPORT_SYMBOL_GPL(nfs_access_set_mask); 2115 EXPORT_SYMBOL_GPL(nfs_access_set_mask);
2116 2116
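nfs_access_set_mask() collapses the server's NFSv4 ACCESS reply bits into the VFS MAY_* mask: any of MODIFY/EXTEND/DELETE counts as writable, and either LOOKUP or EXECUTE counts as executable. A userspace sketch of the same collapse (the ACCESS_* and PERM_* values below are local stand-ins chosen for illustration, not the kernel's NFS4_ACCESS_* or MAY_* definitions):

/* Illustrative bit values; the kernel's constants may differ. */
#define ACCESS_READ    0x01
#define ACCESS_LOOKUP  0x02
#define ACCESS_MODIFY  0x04
#define ACCESS_EXTEND  0x08
#define ACCESS_DELETE  0x10
#define ACCESS_EXECUTE 0x20

#define PERM_READ  0x1
#define PERM_WRITE 0x2
#define PERM_EXEC  0x4

static unsigned int collapse_access(unsigned int access_result)
{
	unsigned int mask = 0;

	if (access_result & ACCESS_READ)
		mask |= PERM_READ;
	if (access_result & (ACCESS_MODIFY | ACCESS_EXTEND | ACCESS_DELETE))
		mask |= PERM_WRITE;
	if (access_result & (ACCESS_LOOKUP | ACCESS_EXECUTE))
		mask |= PERM_EXEC;
	return mask;
}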
2117 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) 2117 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2118 { 2118 {
2119 struct nfs_access_entry cache; 2119 struct nfs_access_entry cache;
2120 int status; 2120 int status;
2121 2121
2122 status = nfs_access_get_cached(inode, cred, &cache); 2122 status = nfs_access_get_cached(inode, cred, &cache);
2123 if (status == 0) 2123 if (status == 0)
2124 goto out; 2124 goto out;
2125 2125
2126 /* Be clever: ask server to check for all possible rights */ 2126 /* Be clever: ask server to check for all possible rights */
2127 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; 2127 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
2128 cache.cred = cred; 2128 cache.cred = cred;
2129 cache.jiffies = jiffies; 2129 cache.jiffies = jiffies;
2130 status = NFS_PROTO(inode)->access(inode, &cache); 2130 status = NFS_PROTO(inode)->access(inode, &cache);
2131 if (status != 0) { 2131 if (status != 0) {
2132 if (status == -ESTALE) { 2132 if (status == -ESTALE) {
2133 nfs_zap_caches(inode); 2133 nfs_zap_caches(inode);
2134 if (!S_ISDIR(inode->i_mode)) 2134 if (!S_ISDIR(inode->i_mode))
2135 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 2135 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
2136 } 2136 }
2137 return status; 2137 return status;
2138 } 2138 }
2139 nfs_access_add_cache(inode, &cache); 2139 nfs_access_add_cache(inode, &cache);
2140 out: 2140 out:
2141 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2141 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
2142 return 0; 2142 return 0;
2143 return -EACCES; 2143 return -EACCES;
2144 } 2144 }
2145 2145
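Note the pattern in nfs_do_access() above: on a cache miss it asks the server for all three rights at once (MAY_EXEC | MAY_WRITE | MAY_READ), so a single ACCESS round trip can satisfy later checks for any subset. The final test then grants the request only if every requested permission bit is present in the cached mask. A sketch of that subset test in isolation (permitted() is a hypothetical helper, reusing the PERM_* stand-ins from the sketch above):

/* Grant iff no requested bit is missing from the cached rights mask. */
static int permitted(unsigned int requested, unsigned int cached)
{
	unsigned int all = PERM_READ | PERM_WRITE | PERM_EXEC;

	return (requested & ~cached & all) == 0;
}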
2146 static int nfs_open_permission_mask(int openflags) 2146 static int nfs_open_permission_mask(int openflags)
2147 { 2147 {
2148 int mask = 0; 2148 int mask = 0;
2149 2149
2150 if ((openflags & O_ACCMODE) != O_WRONLY) 2150 if ((openflags & O_ACCMODE) != O_WRONLY)
2151 mask |= MAY_READ; 2151 mask |= MAY_READ;
2152 if ((openflags & O_ACCMODE) != O_RDONLY) 2152 if ((openflags & O_ACCMODE) != O_RDONLY)
2153 mask |= MAY_WRITE; 2153 mask |= MAY_WRITE;
2154 if (openflags & __FMODE_EXEC) 2154 if (openflags & __FMODE_EXEC)
2155 mask |= MAY_EXEC; 2155 mask |= MAY_EXEC;
2156 return mask; 2156 return mask;
2157 } 2157 }
2158 2158
2159 int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) 2159 int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
2160 { 2160 {
2161 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); 2161 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
2162 } 2162 }
2163 EXPORT_SYMBOL_GPL(nfs_may_open); 2163 EXPORT_SYMBOL_GPL(nfs_may_open);
2164 2164
2165 int nfs_permission(struct inode *inode, int mask) 2165 int nfs_permission(struct inode *inode, int mask)
2166 { 2166 {
2167 struct rpc_cred *cred; 2167 struct rpc_cred *cred;
2168 int res = 0; 2168 int res = 0;
2169 2169
2170 if (mask & MAY_NOT_BLOCK) 2170 if (mask & MAY_NOT_BLOCK)
2171 return -ECHILD; 2171 return -ECHILD;
2172 2172
2173 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2173 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
2174 2174
2175 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2175 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
2176 goto out; 2176 goto out;
2177 /* Is this sys_access()? */ 2177 /* Is this sys_access()? */
2178 if (mask & (MAY_ACCESS | MAY_CHDIR)) 2178 if (mask & (MAY_ACCESS | MAY_CHDIR))
2179 goto force_lookup; 2179 goto force_lookup;
2180 2180
2181 switch (inode->i_mode & S_IFMT) { 2181 switch (inode->i_mode & S_IFMT) {
2182 case S_IFLNK: 2182 case S_IFLNK:
2183 goto out; 2183 goto out;
2184 case S_IFREG: 2184 case S_IFREG:
2185 /* NFSv4 has atomic_open... */ 2185 /* NFSv4 has atomic_open... */
2186 if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) 2186 if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
2187 && (mask & MAY_OPEN) 2187 && (mask & MAY_OPEN)
2188 && !(mask & MAY_EXEC)) 2188 && !(mask & MAY_EXEC))
2189 goto out; 2189 goto out;
2190 break; 2190 break;
2191 case S_IFDIR: 2191 case S_IFDIR:
2192 /* 2192 /*
2193 * Optimize away all write operations, since the server 2193 * Optimize away all write operations, since the server
2194 * will check permissions when we perform the op. 2194 * will check permissions when we perform the op.
2195 */ 2195 */
2196 if ((mask & MAY_WRITE) && !(mask & MAY_READ)) 2196 if ((mask & MAY_WRITE) && !(mask & MAY_READ))
2197 goto out; 2197 goto out;
2198 } 2198 }
2199 2199
2200 force_lookup: 2200 force_lookup:
2201 if (!NFS_PROTO(inode)->access) 2201 if (!NFS_PROTO(inode)->access)
2202 goto out_notsup; 2202 goto out_notsup;
2203 2203
2204 cred = rpc_lookup_cred(); 2204 cred = rpc_lookup_cred();
2205 if (!IS_ERR(cred)) { 2205 if (!IS_ERR(cred)) {
2206 res = nfs_do_access(inode, cred, mask); 2206 res = nfs_do_access(inode, cred, mask);
2207 put_rpccred(cred); 2207 put_rpccred(cred);
2208 } else 2208 } else
2209 res = PTR_ERR(cred); 2209 res = PTR_ERR(cred);
2210 out: 2210 out:
2211 if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) 2211 if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
2212 res = -EACCES; 2212 res = -EACCES;
2213 2213
2214 dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", 2214 dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
2215 inode->i_sb->s_id, inode->i_ino, mask, res); 2215 inode->i_sb->s_id, inode->i_ino, mask, res);
2216 return res; 2216 return res;
2217 out_notsup: 2217 out_notsup:
2218 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2218 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
2219 if (res == 0) 2219 if (res == 0)
2220 res = generic_permission(inode, mask); 2220 res = generic_permission(inode, mask);
2221 goto out; 2221 goto out;
2222 } 2222 }
2223 EXPORT_SYMBOL_GPL(nfs_permission); 2223 EXPORT_SYMBOL_GPL(nfs_permission);
2224 2224
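The O_ACCMODE handling in nfs_open_permission_mask() above is easy to misread: O_RDONLY, O_WRONLY and O_RDWR are values within the O_ACCMODE field, not independent bits, so the mapping to MAY_READ/MAY_WRITE is done with inequality tests rather than bitwise ANDs. A minimal standalone sketch of the same mapping, plain userspace C with illustrative MAY_* values rather than the kernel's headers:

	#include <fcntl.h>
	#include <stdio.h>

	/* Illustrative values only, not the kernel's definitions. */
	#define MAY_EXEC	0x01
	#define MAY_WRITE	0x02
	#define MAY_READ	0x04

	static int open_permission_mask(int openflags)
	{
		int mask = 0;

		if ((openflags & O_ACCMODE) != O_WRONLY)	/* O_RDONLY or O_RDWR */
			mask |= MAY_READ;
		if ((openflags & O_ACCMODE) != O_RDONLY)	/* O_WRONLY or O_RDWR */
			mask |= MAY_WRITE;
		return mask;
	}

	int main(void)
	{
		printf("O_RDONLY -> %#x\n", open_permission_mask(O_RDONLY)); /* MAY_READ */
		printf("O_WRONLY -> %#x\n", open_permission_mask(O_WRONLY)); /* MAY_WRITE */
		printf("O_RDWR   -> %#x\n", open_permission_mask(O_RDWR));   /* both */
		return 0;
	}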
2225 /* 2225 /*
2226 * Local variables: 2226 * Local variables:
2227 * version-control: t 2227 * version-control: t
2228 * kept-new-versions: 5 2228 * kept-new-versions: 5
2229 * End: 2229 * End:
2230 */ 2230 */
2231 2231
fs/nfs/file.c
1 /* 1 /*
2 * linux/fs/nfs/file.c 2 * linux/fs/nfs/file.c
3 * 3 *
4 * Copyright (C) 1992 Rick Sladkey 4 * Copyright (C) 1992 Rick Sladkey
5 * 5 *
6 * Changes Copyright (C) 1994 by Florian La Roche 6 * Changes Copyright (C) 1994 by Florian La Roche
7 * - Do not copy data too often around in the kernel. 7 * - Do not copy data too often around in the kernel.
8 * - In nfs_file_read the return value of kmalloc wasn't checked. 8 * - In nfs_file_read the return value of kmalloc wasn't checked.
9 * - Put in a better version of read look-ahead buffering. Original idea 9 * - Put in a better version of read look-ahead buffering. Original idea
10 * and implementation by Wai S Kok elekokws@ee.nus.sg. 10 * and implementation by Wai S Kok elekokws@ee.nus.sg.
11 * 11 *
12 * Expire cache on write to a file by Wai S Kok (Oct 1994). 12 * Expire cache on write to a file by Wai S Kok (Oct 1994).
13 * 13 *
14 * Total rewrite of read side for new NFS buffer cache.. Linus. 14 * Total rewrite of read side for new NFS buffer cache.. Linus.
15 * 15 *
16 * nfs regular file handling functions 16 * nfs regular file handling functions
17 */ 17 */
18 18
19 #include <linux/module.h> 19 #include <linux/module.h>
20 #include <linux/time.h> 20 #include <linux/time.h>
21 #include <linux/kernel.h> 21 #include <linux/kernel.h>
22 #include <linux/errno.h> 22 #include <linux/errno.h>
23 #include <linux/fcntl.h> 23 #include <linux/fcntl.h>
24 #include <linux/stat.h> 24 #include <linux/stat.h>
25 #include <linux/nfs_fs.h> 25 #include <linux/nfs_fs.h>
26 #include <linux/nfs_mount.h> 26 #include <linux/nfs_mount.h>
27 #include <linux/mm.h> 27 #include <linux/mm.h>
28 #include <linux/pagemap.h> 28 #include <linux/pagemap.h>
29 #include <linux/aio.h> 29 #include <linux/aio.h>
30 #include <linux/gfp.h> 30 #include <linux/gfp.h>
31 #include <linux/swap.h> 31 #include <linux/swap.h>
32 32
33 #include <asm/uaccess.h> 33 #include <asm/uaccess.h>
34 34
35 #include "delegation.h" 35 #include "delegation.h"
36 #include "internal.h" 36 #include "internal.h"
37 #include "iostat.h" 37 #include "iostat.h"
38 #include "fscache.h" 38 #include "fscache.h"
39 39
40 #define NFSDBG_FACILITY NFSDBG_FILE 40 #define NFSDBG_FACILITY NFSDBG_FILE
41 41
42 static const struct vm_operations_struct nfs_file_vm_ops; 42 static const struct vm_operations_struct nfs_file_vm_ops;
43 43
44 /* Hack for future NFS swap support */ 44 /* Hack for future NFS swap support */
45 #ifndef IS_SWAPFILE 45 #ifndef IS_SWAPFILE
46 # define IS_SWAPFILE(inode) (0) 46 # define IS_SWAPFILE(inode) (0)
47 #endif 47 #endif
48 48
49 int nfs_check_flags(int flags) 49 int nfs_check_flags(int flags)
50 { 50 {
51 if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) 51 if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
52 return -EINVAL; 52 return -EINVAL;
53 53
54 return 0; 54 return 0;
55 } 55 }
56 EXPORT_SYMBOL_GPL(nfs_check_flags); 56 EXPORT_SYMBOL_GPL(nfs_check_flags);
57 57
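nfs_check_flags() rejects O_APPEND combined with O_DIRECT, presumably because an append offset derived from the client's cached idea of the file size cannot be trusted once writes bypass the page cache. From userspace the combination simply fails at open time; a quick probe with a hypothetical path (glibc needs _GNU_SOURCE for O_DIRECT):

	#define _GNU_SOURCE
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* On an NFS mount, nfs_file_open -> nfs_check_flags returns -EINVAL. */
		int fd = open("/mnt/nfs/example", O_WRONLY | O_APPEND | O_DIRECT);
		if (fd < 0)
			printf("open: %s\n", strerror(errno));
		return 0;
	}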
58 /* 58 /*
59 * Open file 59 * Open file
60 */ 60 */
61 static int 61 static int
62 nfs_file_open(struct inode *inode, struct file *filp) 62 nfs_file_open(struct inode *inode, struct file *filp)
63 { 63 {
64 int res; 64 int res;
65 65
66 dprintk("NFS: open file(%s/%s)\n", 66 dprintk("NFS: open file(%s/%s)\n",
67 filp->f_path.dentry->d_parent->d_name.name, 67 filp->f_path.dentry->d_parent->d_name.name,
68 filp->f_path.dentry->d_name.name); 68 filp->f_path.dentry->d_name.name);
69 69
70 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 70 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
71 res = nfs_check_flags(filp->f_flags); 71 res = nfs_check_flags(filp->f_flags);
72 if (res) 72 if (res)
73 return res; 73 return res;
74 74
75 res = nfs_open(inode, filp); 75 res = nfs_open(inode, filp);
76 return res; 76 return res;
77 } 77 }
78 78
79 int 79 int
80 nfs_file_release(struct inode *inode, struct file *filp) 80 nfs_file_release(struct inode *inode, struct file *filp)
81 { 81 {
82 dprintk("NFS: release(%s/%s)\n", 82 dprintk("NFS: release(%s/%s)\n",
83 filp->f_path.dentry->d_parent->d_name.name, 83 filp->f_path.dentry->d_parent->d_name.name,
84 filp->f_path.dentry->d_name.name); 84 filp->f_path.dentry->d_name.name);
85 85
86 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 86 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
87 return nfs_release(inode, filp); 87 return nfs_release(inode, filp);
88 } 88 }
89 EXPORT_SYMBOL_GPL(nfs_file_release); 89 EXPORT_SYMBOL_GPL(nfs_file_release);
90 90
91 /** 91 /**
92 * nfs_revalidate_size - Revalidate the file size 92 * nfs_revalidate_size - Revalidate the file size
93 * @inode: pointer to inode struct 93 * @inode: pointer to inode struct
94 * @filp: pointer to struct file 94 * @filp: pointer to struct file
95 * 95 *
96 * Revalidates the file length. This is basically a wrapper around 96 * Revalidates the file length. This is basically a wrapper around
97 * nfs_revalidate_inode() that takes into account the fact that we may 97 * nfs_revalidate_inode() that takes into account the fact that we may
98 * have cached writes (in which case we don't care about the server's 98 * have cached writes (in which case we don't care about the server's
99 * idea of what the file length is), or O_DIRECT (in which case we 99 * idea of what the file length is), or O_DIRECT (in which case we
100 * shouldn't trust the cache). 100 * shouldn't trust the cache).
101 */ 101 */
102 static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) 102 static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
103 { 103 {
104 struct nfs_server *server = NFS_SERVER(inode); 104 struct nfs_server *server = NFS_SERVER(inode);
105 struct nfs_inode *nfsi = NFS_I(inode); 105 struct nfs_inode *nfsi = NFS_I(inode);
106 106
107 if (nfs_have_delegated_attributes(inode)) 107 if (nfs_have_delegated_attributes(inode))
108 goto out_noreval; 108 goto out_noreval;
109 109
110 if (filp->f_flags & O_DIRECT) 110 if (filp->f_flags & O_DIRECT)
111 goto force_reval; 111 goto force_reval;
112 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 112 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
113 goto force_reval; 113 goto force_reval;
114 if (nfs_attribute_timeout(inode)) 114 if (nfs_attribute_timeout(inode))
115 goto force_reval; 115 goto force_reval;
116 out_noreval: 116 out_noreval:
117 return 0; 117 return 0;
118 force_reval: 118 force_reval:
119 return __nfs_revalidate_inode(server, inode); 119 return __nfs_revalidate_inode(server, inode);
120 } 120 }
121 121
122 loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 122 loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
123 { 123 {
124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", 124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
125 filp->f_path.dentry->d_parent->d_name.name, 125 filp->f_path.dentry->d_parent->d_name.name,
126 filp->f_path.dentry->d_name.name, 126 filp->f_path.dentry->d_name.name,
127 offset, origin); 127 offset, whence);
128 128
129 /* 129 /*
130 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 130 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
131 * the cached file length 131 * the cached file length
132 */ 132 */
133 if (origin != SEEK_SET && origin != SEEK_CUR) { 133 if (whence != SEEK_SET && whence != SEEK_CUR) {
134 struct inode *inode = filp->f_mapping->host; 134 struct inode *inode = filp->f_mapping->host;
135 135
136 int retval = nfs_revalidate_file_size(inode, filp); 136 int retval = nfs_revalidate_file_size(inode, filp);
137 if (retval < 0) 137 if (retval < 0)
138 return (loff_t)retval; 138 return (loff_t)retval;
139 } 139 }
140 140
141 return generic_file_llseek(filp, offset, origin); 141 return generic_file_llseek(filp, offset, whence);
142 } 142 }
143 EXPORT_SYMBOL_GPL(nfs_file_llseek); 143 EXPORT_SYMBOL_GPL(nfs_file_llseek);
144 144
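This is the rename at the heart of the commit, and it also shows why the argument matters: whence selects how offset is interpreted, and SEEK_END (like SEEK_DATA and SEEK_HOLE) is relative to the file size, which an NFS client must revalidate before trusting. A userspace refresher, with a hypothetical file path:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/example", O_RDONLY);	/* hypothetical file */
		if (fd < 0)
			return 1;

		off_t a = lseek(fd, 100, SEEK_SET);	/* absolute offset */
		off_t b = lseek(fd, -50, SEEK_CUR);	/* relative to current position */
		off_t c = lseek(fd, 0, SEEK_END);	/* relative to file size: on NFS
							 * the cached size must be fresh */
		printf("%lld %lld %lld\n", (long long)a, (long long)b, (long long)c);
		close(fd);
		return 0;
	}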
145 /* 145 /*
146 * Flush all dirty pages, and check for write errors. 146 * Flush all dirty pages, and check for write errors.
147 */ 147 */
148 int 148 int
149 nfs_file_flush(struct file *file, fl_owner_t id) 149 nfs_file_flush(struct file *file, fl_owner_t id)
150 { 150 {
151 struct dentry *dentry = file->f_path.dentry; 151 struct dentry *dentry = file->f_path.dentry;
152 struct inode *inode = dentry->d_inode; 152 struct inode *inode = dentry->d_inode;
153 153
154 dprintk("NFS: flush(%s/%s)\n", 154 dprintk("NFS: flush(%s/%s)\n",
155 dentry->d_parent->d_name.name, 155 dentry->d_parent->d_name.name,
156 dentry->d_name.name); 156 dentry->d_name.name);
157 157
158 nfs_inc_stats(inode, NFSIOS_VFSFLUSH); 158 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
159 if ((file->f_mode & FMODE_WRITE) == 0) 159 if ((file->f_mode & FMODE_WRITE) == 0)
160 return 0; 160 return 0;
161 161
162 /* 162 /*
163 * If we're holding a write delegation, then just start the i/o 163 * If we're holding a write delegation, then just start the i/o
164 * but don't wait for completion (or send a commit). 164 * but don't wait for completion (or send a commit).
165 */ 165 */
166 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) 166 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
167 return filemap_fdatawrite(file->f_mapping); 167 return filemap_fdatawrite(file->f_mapping);
168 168
169 /* Flush writes to the server and return any errors */ 169 /* Flush writes to the server and return any errors */
170 return vfs_fsync(file, 0); 170 return vfs_fsync(file, 0);
171 } 171 }
172 EXPORT_SYMBOL_GPL(nfs_file_flush); 172 EXPORT_SYMBOL_GPL(nfs_file_flush);
173 173
174 ssize_t 174 ssize_t
175 nfs_file_read(struct kiocb *iocb, const struct iovec *iov, 175 nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
176 unsigned long nr_segs, loff_t pos) 176 unsigned long nr_segs, loff_t pos)
177 { 177 {
178 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 178 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
179 struct inode * inode = dentry->d_inode; 179 struct inode * inode = dentry->d_inode;
180 ssize_t result; 180 ssize_t result;
181 181
182 if (iocb->ki_filp->f_flags & O_DIRECT) 182 if (iocb->ki_filp->f_flags & O_DIRECT)
183 return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); 183 return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
184 184
185 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 185 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
186 dentry->d_parent->d_name.name, dentry->d_name.name, 186 dentry->d_parent->d_name.name, dentry->d_name.name,
187 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); 187 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
188 188
189 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 189 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
190 if (!result) { 190 if (!result) {
191 result = generic_file_aio_read(iocb, iov, nr_segs, pos); 191 result = generic_file_aio_read(iocb, iov, nr_segs, pos);
192 if (result > 0) 192 if (result > 0)
193 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); 193 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
194 } 194 }
195 return result; 195 return result;
196 } 196 }
197 EXPORT_SYMBOL_GPL(nfs_file_read); 197 EXPORT_SYMBOL_GPL(nfs_file_read);
198 198
199 ssize_t 199 ssize_t
200 nfs_file_splice_read(struct file *filp, loff_t *ppos, 200 nfs_file_splice_read(struct file *filp, loff_t *ppos,
201 struct pipe_inode_info *pipe, size_t count, 201 struct pipe_inode_info *pipe, size_t count,
202 unsigned int flags) 202 unsigned int flags)
203 { 203 {
204 struct dentry *dentry = filp->f_path.dentry; 204 struct dentry *dentry = filp->f_path.dentry;
205 struct inode *inode = dentry->d_inode; 205 struct inode *inode = dentry->d_inode;
206 ssize_t res; 206 ssize_t res;
207 207
208 dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", 208 dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
209 dentry->d_parent->d_name.name, dentry->d_name.name, 209 dentry->d_parent->d_name.name, dentry->d_name.name,
210 (unsigned long) count, (unsigned long long) *ppos); 210 (unsigned long) count, (unsigned long long) *ppos);
211 211
212 res = nfs_revalidate_mapping(inode, filp->f_mapping); 212 res = nfs_revalidate_mapping(inode, filp->f_mapping);
213 if (!res) { 213 if (!res) {
214 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 214 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
215 if (res > 0) 215 if (res > 0)
216 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); 216 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
217 } 217 }
218 return res; 218 return res;
219 } 219 }
220 EXPORT_SYMBOL_GPL(nfs_file_splice_read); 220 EXPORT_SYMBOL_GPL(nfs_file_splice_read);
221 221
222 int 222 int
223 nfs_file_mmap(struct file * file, struct vm_area_struct * vma) 223 nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
224 { 224 {
225 struct dentry *dentry = file->f_path.dentry; 225 struct dentry *dentry = file->f_path.dentry;
226 struct inode *inode = dentry->d_inode; 226 struct inode *inode = dentry->d_inode;
227 int status; 227 int status;
228 228
229 dprintk("NFS: mmap(%s/%s)\n", 229 dprintk("NFS: mmap(%s/%s)\n",
230 dentry->d_parent->d_name.name, dentry->d_name.name); 230 dentry->d_parent->d_name.name, dentry->d_name.name);
231 231
232 /* Note: generic_file_mmap() returns ENOSYS on nommu systems 232 /* Note: generic_file_mmap() returns ENOSYS on nommu systems
233 * so we call that before revalidating the mapping 233 * so we call that before revalidating the mapping
234 */ 234 */
235 status = generic_file_mmap(file, vma); 235 status = generic_file_mmap(file, vma);
236 if (!status) { 236 if (!status) {
237 vma->vm_ops = &nfs_file_vm_ops; 237 vma->vm_ops = &nfs_file_vm_ops;
238 status = nfs_revalidate_mapping(inode, file->f_mapping); 238 status = nfs_revalidate_mapping(inode, file->f_mapping);
239 } 239 }
240 return status; 240 return status;
241 } 241 }
242 EXPORT_SYMBOL_GPL(nfs_file_mmap); 242 EXPORT_SYMBOL_GPL(nfs_file_mmap);
243 243
244 /* 244 /*
245 * Flush any dirty pages for this process, and check for write errors. 245 * Flush any dirty pages for this process, and check for write errors.
246 * The return status from this call provides a reliable indication of 246 * The return status from this call provides a reliable indication of
247 * whether any write errors occurred for this process. 247 * whether any write errors occurred for this process.
248 * 248 *
249 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to 249 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
250 * disk, but it retrieves and clears ctx->error after synching, despite 250 * disk, but it retrieves and clears ctx->error after synching, despite
251 * the two being set at the same time in nfs_context_set_write_error(). 251 * the two being set at the same time in nfs_context_set_write_error().
252 * This is because the former is used to notify the _next_ call to 252 * This is because the former is used to notify the _next_ call to
253 * nfs_file_write() that a write error occurred, and hence cause it to 253 * nfs_file_write() that a write error occurred, and hence cause it to
254 * fall back to doing a synchronous write. 254 * fall back to doing a synchronous write.
255 */ 255 */
256 int 256 int
257 nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) 257 nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
258 { 258 {
259 struct dentry *dentry = file->f_path.dentry; 259 struct dentry *dentry = file->f_path.dentry;
260 struct nfs_open_context *ctx = nfs_file_open_context(file); 260 struct nfs_open_context *ctx = nfs_file_open_context(file);
261 struct inode *inode = dentry->d_inode; 261 struct inode *inode = dentry->d_inode;
262 int have_error, do_resend, status; 262 int have_error, do_resend, status;
263 int ret = 0; 263 int ret = 0;
264 264
265 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 265 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
266 dentry->d_parent->d_name.name, dentry->d_name.name, 266 dentry->d_parent->d_name.name, dentry->d_name.name,
267 datasync); 267 datasync);
268 268
269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
270 do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); 270 do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
271 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 271 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
272 status = nfs_commit_inode(inode, FLUSH_SYNC); 272 status = nfs_commit_inode(inode, FLUSH_SYNC);
273 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 273 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
274 if (have_error) { 274 if (have_error) {
275 ret = xchg(&ctx->error, 0); 275 ret = xchg(&ctx->error, 0);
276 if (ret) 276 if (ret)
277 goto out; 277 goto out;
278 } 278 }
279 if (status < 0) { 279 if (status < 0) {
280 ret = status; 280 ret = status;
281 goto out; 281 goto out;
282 } 282 }
283 do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); 283 do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
284 if (do_resend) 284 if (do_resend)
285 ret = -EAGAIN; 285 ret = -EAGAIN;
286 out: 286 out:
287 return ret; 287 return ret;
288 } 288 }
289 EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); 289 EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
290 290
291 static int 291 static int
292 nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) 292 nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
293 { 293 {
294 int ret; 294 int ret;
295 struct inode *inode = file->f_path.dentry->d_inode; 295 struct inode *inode = file->f_path.dentry->d_inode;
296 296
297 do { 297 do {
298 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 298 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
299 if (ret != 0) 299 if (ret != 0)
300 break; 300 break;
301 mutex_lock(&inode->i_mutex); 301 mutex_lock(&inode->i_mutex);
302 ret = nfs_file_fsync_commit(file, start, end, datasync); 302 ret = nfs_file_fsync_commit(file, start, end, datasync);
303 mutex_unlock(&inode->i_mutex); 303 mutex_unlock(&inode->i_mutex);
304 /* 304 /*
305 * If nfs_file_fsync_commit detected a server reboot, then 305 * If nfs_file_fsync_commit detected a server reboot, then
306 * resend all dirty pages that might have been covered by 306 * resend all dirty pages that might have been covered by
307 * the NFS_CONTEXT_RESEND_WRITES flag 307 * the NFS_CONTEXT_RESEND_WRITES flag
308 */ 308 */
309 start = 0; 309 start = 0;
310 end = LLONG_MAX; 310 end = LLONG_MAX;
311 } while (ret == -EAGAIN); 311 } while (ret == -EAGAIN);
312 312
313 return ret; 313 return ret;
314 } 314 }
315 315
316 /* 316 /*
317 * Decide whether a read/modify/write cycle may be more efficient 317 * Decide whether a read/modify/write cycle may be more efficient
318 * than a modify/write/read cycle when writing to a page in the 318 * than a modify/write/read cycle when writing to a page in the
319 * page cache. 319 * page cache.
320 * 320 *
321 * The modify/write/read cycle may occur if a page is read before 321 * The modify/write/read cycle may occur if a page is read before
322 * being completely filled by the writer. In this situation, the 322 * being completely filled by the writer. In this situation, the
323 * page must be completely written to stable storage on the server 323 * page must be completely written to stable storage on the server
324 * before it can be refilled by reading in the page from the server. 324 * before it can be refilled by reading in the page from the server.
325 * This can lead to expensive, small, FILE_SYNC mode writes being 325 * This can lead to expensive, small, FILE_SYNC mode writes being
326 * done. 326 * done.
327 * 327 *
328 * It may be more efficient to read the page first if the file is 328 * It may be more efficient to read the page first if the file is
329 * open for reading in addition to writing, the page is not marked 329 * open for reading in addition to writing, the page is not marked
330 * as Uptodate, it is not dirty or waiting to be committed, 330 * as Uptodate, it is not dirty or waiting to be committed,
331 * indicating that it was previously allocated and then modified, 331 * indicating that it was previously allocated and then modified,
332 * that there were valid bytes of data in that range of the file, 332 * that there were valid bytes of data in that range of the file,
333 * and that the new data won't completely replace the old data in 333 * and that the new data won't completely replace the old data in
334 * that range of the file. 334 * that range of the file.
335 */ 335 */
336 static int nfs_want_read_modify_write(struct file *file, struct page *page, 336 static int nfs_want_read_modify_write(struct file *file, struct page *page,
337 loff_t pos, unsigned len) 337 loff_t pos, unsigned len)
338 { 338 {
339 unsigned int pglen = nfs_page_length(page); 339 unsigned int pglen = nfs_page_length(page);
340 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); 340 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
341 unsigned int end = offset + len; 341 unsigned int end = offset + len;
342 342
343 if ((file->f_mode & FMODE_READ) && /* open for read? */ 343 if ((file->f_mode & FMODE_READ) && /* open for read? */
344 !PageUptodate(page) && /* Uptodate? */ 344 !PageUptodate(page) && /* Uptodate? */
345 !PagePrivate(page) && /* i/o request already? */ 345 !PagePrivate(page) && /* i/o request already? */
346 pglen && /* valid bytes of file? */ 346 pglen && /* valid bytes of file? */
347 (end < pglen || offset)) /* replace all valid bytes? */ 347 (end < pglen || offset)) /* replace all valid bytes? */
348 return 1; 348 return 1;
349 return 0; 349 return 0;
350 } 350 }
351 351
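The predicate above is compact enough to restate standalone. A sketch of just the arithmetic, using a hypothetical helper name, assuming 4 KiB pages, and ignoring the FMODE_READ/PageUptodate/PagePrivate tests, which need a real struct page:

	#include <stdio.h>

	#define PAGE_CACHE_SIZE 4096UL	/* assuming 4 KiB pages */

	/* Read the page first only if the write would leave previously valid
	 * bytes untouched, either before the write (offset != 0) or after it
	 * (end < pglen). pglen is the number of valid bytes in the page. */
	static int want_read_first(unsigned long long pos, unsigned len, unsigned pglen)
	{
		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
		unsigned end = offset + len;

		return pglen && (end < pglen || offset);
	}

	int main(void)
	{
		/* 100 bytes at offset 200 into a fully valid page: read first. */
		printf("%d\n", want_read_first(8192 + 200, 100, 4096));
		/* Whole-page overwrite: nothing old survives, skip the read. */
		printf("%d\n", want_read_first(8192, 4096, 4096));
		return 0;
	}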
352 /* 352 /*
353 * This does the "real" work of the write. We must allocate and lock the 353 * This does the "real" work of the write. We must allocate and lock the
354 * page to be sent back to the generic routine, which then copies the 354 * page to be sent back to the generic routine, which then copies the
355 * data from user space. 355 * data from user space.
356 * 356 *
357 * If the writer ends up delaying the write, the writer needs to 357 * If the writer ends up delaying the write, the writer needs to
358 * increment the page use counts until he is done with the page. 358 * increment the page use counts until he is done with the page.
359 */ 359 */
360 static int nfs_write_begin(struct file *file, struct address_space *mapping, 360 static int nfs_write_begin(struct file *file, struct address_space *mapping,
361 loff_t pos, unsigned len, unsigned flags, 361 loff_t pos, unsigned len, unsigned flags,
362 struct page **pagep, void **fsdata) 362 struct page **pagep, void **fsdata)
363 { 363 {
364 int ret; 364 int ret;
365 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 365 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
366 struct page *page; 366 struct page *page;
367 int once_thru = 0; 367 int once_thru = 0;
368 368
369 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", 369 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
370 file->f_path.dentry->d_parent->d_name.name, 370 file->f_path.dentry->d_parent->d_name.name,
371 file->f_path.dentry->d_name.name, 371 file->f_path.dentry->d_name.name,
372 mapping->host->i_ino, len, (long long) pos); 372 mapping->host->i_ino, len, (long long) pos);
373 373
374 start: 374 start:
375 /* 375 /*
376 * Prevent starvation issues if someone is doing a consistency 376 * Prevent starvation issues if someone is doing a consistency
377 * sync-to-disk 377 * sync-to-disk
378 */ 378 */
379 ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, 379 ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
380 nfs_wait_bit_killable, TASK_KILLABLE); 380 nfs_wait_bit_killable, TASK_KILLABLE);
381 if (ret) 381 if (ret)
382 return ret; 382 return ret;
383 383
384 page = grab_cache_page_write_begin(mapping, index, flags); 384 page = grab_cache_page_write_begin(mapping, index, flags);
385 if (!page) 385 if (!page)
386 return -ENOMEM; 386 return -ENOMEM;
387 *pagep = page; 387 *pagep = page;
388 388
389 ret = nfs_flush_incompatible(file, page); 389 ret = nfs_flush_incompatible(file, page);
390 if (ret) { 390 if (ret) {
391 unlock_page(page); 391 unlock_page(page);
392 page_cache_release(page); 392 page_cache_release(page);
393 } else if (!once_thru && 393 } else if (!once_thru &&
394 nfs_want_read_modify_write(file, page, pos, len)) { 394 nfs_want_read_modify_write(file, page, pos, len)) {
395 once_thru = 1; 395 once_thru = 1;
396 ret = nfs_readpage(file, page); 396 ret = nfs_readpage(file, page);
397 page_cache_release(page); 397 page_cache_release(page);
398 if (!ret) 398 if (!ret)
399 goto start; 399 goto start;
400 } 400 }
401 return ret; 401 return ret;
402 } 402 }
403 403
404 static int nfs_write_end(struct file *file, struct address_space *mapping, 404 static int nfs_write_end(struct file *file, struct address_space *mapping,
405 loff_t pos, unsigned len, unsigned copied, 405 loff_t pos, unsigned len, unsigned copied,
406 struct page *page, void *fsdata) 406 struct page *page, void *fsdata)
407 { 407 {
408 unsigned offset = pos & (PAGE_CACHE_SIZE - 1); 408 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
409 int status; 409 int status;
410 410
411 dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", 411 dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
412 file->f_path.dentry->d_parent->d_name.name, 412 file->f_path.dentry->d_parent->d_name.name,
413 file->f_path.dentry->d_name.name, 413 file->f_path.dentry->d_name.name,
414 mapping->host->i_ino, len, (long long) pos); 414 mapping->host->i_ino, len, (long long) pos);
415 415
416 /* 416 /*
417 * Zero any uninitialised parts of the page, and then mark the page 417 * Zero any uninitialised parts of the page, and then mark the page
418 * as up to date if it turns out that we're extending the file. 418 * as up to date if it turns out that we're extending the file.
419 */ 419 */
420 if (!PageUptodate(page)) { 420 if (!PageUptodate(page)) {
421 unsigned pglen = nfs_page_length(page); 421 unsigned pglen = nfs_page_length(page);
422 unsigned end = offset + len; 422 unsigned end = offset + len;
423 423
424 if (pglen == 0) { 424 if (pglen == 0) {
425 zero_user_segments(page, 0, offset, 425 zero_user_segments(page, 0, offset,
426 end, PAGE_CACHE_SIZE); 426 end, PAGE_CACHE_SIZE);
427 SetPageUptodate(page); 427 SetPageUptodate(page);
428 } else if (end >= pglen) { 428 } else if (end >= pglen) {
429 zero_user_segment(page, end, PAGE_CACHE_SIZE); 429 zero_user_segment(page, end, PAGE_CACHE_SIZE);
430 if (offset == 0) 430 if (offset == 0)
431 SetPageUptodate(page); 431 SetPageUptodate(page);
432 } else 432 } else
433 zero_user_segment(page, pglen, PAGE_CACHE_SIZE); 433 zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
434 } 434 }
435 435
436 status = nfs_updatepage(file, page, offset, copied); 436 status = nfs_updatepage(file, page, offset, copied);
437 437
438 unlock_page(page); 438 unlock_page(page);
439 page_cache_release(page); 439 page_cache_release(page);
440 440
441 if (status < 0) 441 if (status < 0)
442 return status; 442 return status;
443 NFS_I(mapping->host)->write_io += copied; 443 NFS_I(mapping->host)->write_io += copied;
444 return copied; 444 return copied;
445 } 445 }
446 446
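The three zeroing cases in nfs_write_end() are worth spelling out. A standalone sketch of just the decision; zero_tail and zero_range are hypothetical stand-ins for the kernel's zero_user_segment{,s}() calls, and the SetPageUptodate bookkeeping is left out:

	#include <stdio.h>

	#define PAGE_CACHE_SIZE 4096U	/* assuming 4 KiB pages */

	static void zero_range(unsigned from, unsigned to)
	{
		if (from < to)
			printf("zero [%u, %u)\n", from, to);
	}

	/* offset/end delimit the bytes just copied; pglen is how many bytes
	 * of the page fall inside i_size. */
	static void zero_tail(unsigned offset, unsigned end, unsigned pglen)
	{
		if (pglen == 0) {
			/* Page lies wholly beyond the old EOF: only the copied
			 * bytes are meaningful, so zero both sides of them. */
			zero_range(0, offset);
			zero_range(end, PAGE_CACHE_SIZE);
		} else if (end >= pglen) {
			/* Write reaches or passes the valid length: zero
			 * everything after the new data. */
			zero_range(end, PAGE_CACHE_SIZE);
		} else {
			/* Write lands inside valid data: only the region
			 * beyond the valid length needs clearing. */
			zero_range(pglen, PAGE_CACHE_SIZE);
		}
	}

	int main(void)
	{
		zero_tail(200, 300, 0);		/* beyond EOF: zero [0,200) and [300,4096) */
		zero_tail(0, 4096, 1000);	/* full-page overwrite: nothing to zero */
		return 0;
	}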
447 /* 447 /*
448 * Partially or wholly invalidate a page 448 * Partially or wholly invalidate a page
449 * - Release the private state associated with a page if undergoing complete 449 * - Release the private state associated with a page if undergoing complete
450 * page invalidation 450 * page invalidation
451 * - Called if either PG_private or PG_fscache is set on the page 451 * - Called if either PG_private or PG_fscache is set on the page
452 * - Caller holds page lock 452 * - Caller holds page lock
453 */ 453 */
454 static void nfs_invalidate_page(struct page *page, unsigned long offset) 454 static void nfs_invalidate_page(struct page *page, unsigned long offset)
455 { 455 {
456 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 456 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
457 457
458 if (offset != 0) 458 if (offset != 0)
459 return; 459 return;
460 /* Cancel any unstarted writes on this page */ 460 /* Cancel any unstarted writes on this page */
461 nfs_wb_page_cancel(page_file_mapping(page)->host, page); 461 nfs_wb_page_cancel(page_file_mapping(page)->host, page);
462 462
463 nfs_fscache_invalidate_page(page, page->mapping->host); 463 nfs_fscache_invalidate_page(page, page->mapping->host);
464 } 464 }
465 465
466 /* 466 /*
467 * Attempt to release the private state associated with a page 467 * Attempt to release the private state associated with a page
468 * - Called if either PG_private or PG_fscache is set on the page 468 * - Called if either PG_private or PG_fscache is set on the page
469 * - Caller holds page lock 469 * - Caller holds page lock
470 * - Return true (may release page) or false (may not) 470 * - Return true (may release page) or false (may not)
471 */ 471 */
472 static int nfs_release_page(struct page *page, gfp_t gfp) 472 static int nfs_release_page(struct page *page, gfp_t gfp)
473 { 473 {
474 struct address_space *mapping = page->mapping; 474 struct address_space *mapping = page->mapping;
475 475
476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
477 477
478 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not 478 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
479 * doing this memory reclaim for a fs-related allocation. 479 * doing this memory reclaim for a fs-related allocation.
480 */ 480 */
481 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && 481 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
482 !(current->flags & PF_FSTRANS)) { 482 !(current->flags & PF_FSTRANS)) {
483 int how = FLUSH_SYNC; 483 int how = FLUSH_SYNC;
484 484
485 /* Don't let kswapd deadlock waiting for OOM RPC calls */ 485 /* Don't let kswapd deadlock waiting for OOM RPC calls */
486 if (current_is_kswapd()) 486 if (current_is_kswapd())
487 how = 0; 487 how = 0;
488 nfs_commit_inode(mapping->host, how); 488 nfs_commit_inode(mapping->host, how);
489 } 489 }
490 /* If PagePrivate() is set, then the page is not freeable */ 490 /* If PagePrivate() is set, then the page is not freeable */
491 if (PagePrivate(page)) 491 if (PagePrivate(page))
492 return 0; 492 return 0;
493 return nfs_fscache_release_page(page, gfp); 493 return nfs_fscache_release_page(page, gfp);
494 } 494 }
495 495
496 /* 496 /*
497 * Attempt to clear the private state associated with a page when an error 497 * Attempt to clear the private state associated with a page when an error
498 * occurs that requires the cached contents of an inode to be written back or 498 * occurs that requires the cached contents of an inode to be written back or
499 * destroyed 499 * destroyed
500 * - Called if either PG_private or fscache is set on the page 500 * - Called if either PG_private or fscache is set on the page
501 * - Caller holds page lock 501 * - Caller holds page lock
502 * - Return 0 if successful, -error otherwise 502 * - Return 0 if successful, -error otherwise
503 */ 503 */
504 static int nfs_launder_page(struct page *page) 504 static int nfs_launder_page(struct page *page)
505 { 505 {
506 struct inode *inode = page_file_mapping(page)->host; 506 struct inode *inode = page_file_mapping(page)->host;
507 struct nfs_inode *nfsi = NFS_I(inode); 507 struct nfs_inode *nfsi = NFS_I(inode);
508 508
509 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", 509 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
510 inode->i_ino, (long long)page_offset(page)); 510 inode->i_ino, (long long)page_offset(page));
511 511
512 nfs_fscache_wait_on_page_write(nfsi, page); 512 nfs_fscache_wait_on_page_write(nfsi, page);
513 return nfs_wb_page(inode, page); 513 return nfs_wb_page(inode, page);
514 } 514 }
515 515
516 #ifdef CONFIG_NFS_SWAP 516 #ifdef CONFIG_NFS_SWAP
517 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, 517 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
518 sector_t *span) 518 sector_t *span)
519 { 519 {
520 *span = sis->pages; 520 *span = sis->pages;
521 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); 521 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
522 } 522 }
523 523
524 static void nfs_swap_deactivate(struct file *file) 524 static void nfs_swap_deactivate(struct file *file)
525 { 525 {
526 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); 526 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
527 } 527 }
528 #endif 528 #endif
529 529
530 const struct address_space_operations nfs_file_aops = { 530 const struct address_space_operations nfs_file_aops = {
531 .readpage = nfs_readpage, 531 .readpage = nfs_readpage,
532 .readpages = nfs_readpages, 532 .readpages = nfs_readpages,
533 .set_page_dirty = __set_page_dirty_nobuffers, 533 .set_page_dirty = __set_page_dirty_nobuffers,
534 .writepage = nfs_writepage, 534 .writepage = nfs_writepage,
535 .writepages = nfs_writepages, 535 .writepages = nfs_writepages,
536 .write_begin = nfs_write_begin, 536 .write_begin = nfs_write_begin,
537 .write_end = nfs_write_end, 537 .write_end = nfs_write_end,
538 .invalidatepage = nfs_invalidate_page, 538 .invalidatepage = nfs_invalidate_page,
539 .releasepage = nfs_release_page, 539 .releasepage = nfs_release_page,
540 .direct_IO = nfs_direct_IO, 540 .direct_IO = nfs_direct_IO,
541 .migratepage = nfs_migrate_page, 541 .migratepage = nfs_migrate_page,
542 .launder_page = nfs_launder_page, 542 .launder_page = nfs_launder_page,
543 .error_remove_page = generic_error_remove_page, 543 .error_remove_page = generic_error_remove_page,
544 #ifdef CONFIG_NFS_SWAP 544 #ifdef CONFIG_NFS_SWAP
545 .swap_activate = nfs_swap_activate, 545 .swap_activate = nfs_swap_activate,
546 .swap_deactivate = nfs_swap_deactivate, 546 .swap_deactivate = nfs_swap_deactivate,
547 #endif 547 #endif
548 }; 548 };
549 549
550 /* 550 /*
551 * Notification that a PTE pointing to an NFS page is about to be made 551 * Notification that a PTE pointing to an NFS page is about to be made
552 * writable, implying that someone is about to modify the page through a 552 * writable, implying that someone is about to modify the page through a
553 * shared-writable mapping 553 * shared-writable mapping
554 */ 554 */
555 static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 555 static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
556 { 556 {
557 struct page *page = vmf->page; 557 struct page *page = vmf->page;
558 struct file *filp = vma->vm_file; 558 struct file *filp = vma->vm_file;
559 struct dentry *dentry = filp->f_path.dentry; 559 struct dentry *dentry = filp->f_path.dentry;
560 unsigned pagelen; 560 unsigned pagelen;
561 int ret = VM_FAULT_NOPAGE; 561 int ret = VM_FAULT_NOPAGE;
562 struct address_space *mapping; 562 struct address_space *mapping;
563 563
564 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", 564 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
565 dentry->d_parent->d_name.name, dentry->d_name.name, 565 dentry->d_parent->d_name.name, dentry->d_name.name,
566 filp->f_mapping->host->i_ino, 566 filp->f_mapping->host->i_ino,
567 (long long)page_offset(page)); 567 (long long)page_offset(page));
568 568
569 /* make sure the cache has finished storing the page */ 569 /* make sure the cache has finished storing the page */
570 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); 570 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
571 571
572 lock_page(page); 572 lock_page(page);
573 mapping = page_file_mapping(page); 573 mapping = page_file_mapping(page);
574 if (mapping != dentry->d_inode->i_mapping) 574 if (mapping != dentry->d_inode->i_mapping)
575 goto out_unlock; 575 goto out_unlock;
576 576
577 wait_on_page_writeback(page); 577 wait_on_page_writeback(page);
578 578
579 pagelen = nfs_page_length(page); 579 pagelen = nfs_page_length(page);
580 if (pagelen == 0) 580 if (pagelen == 0)
581 goto out_unlock; 581 goto out_unlock;
582 582
583 ret = VM_FAULT_LOCKED; 583 ret = VM_FAULT_LOCKED;
584 if (nfs_flush_incompatible(filp, page) == 0 && 584 if (nfs_flush_incompatible(filp, page) == 0 &&
585 nfs_updatepage(filp, page, 0, pagelen) == 0) 585 nfs_updatepage(filp, page, 0, pagelen) == 0)
586 goto out; 586 goto out;
587 587
588 ret = VM_FAULT_SIGBUS; 588 ret = VM_FAULT_SIGBUS;
589 out_unlock: 589 out_unlock:
590 unlock_page(page); 590 unlock_page(page);
591 out: 591 out:
592 return ret; 592 return ret;
593 } 593 }
594 594
595 static const struct vm_operations_struct nfs_file_vm_ops = { 595 static const struct vm_operations_struct nfs_file_vm_ops = {
596 .fault = filemap_fault, 596 .fault = filemap_fault,
597 .page_mkwrite = nfs_vm_page_mkwrite, 597 .page_mkwrite = nfs_vm_page_mkwrite,
598 .remap_pages = generic_file_remap_pages, 598 .remap_pages = generic_file_remap_pages,
599 }; 599 };
600 600
601 static int nfs_need_sync_write(struct file *filp, struct inode *inode) 601 static int nfs_need_sync_write(struct file *filp, struct inode *inode)
602 { 602 {
603 struct nfs_open_context *ctx; 603 struct nfs_open_context *ctx;
604 604
605 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) 605 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
606 return 1; 606 return 1;
607 ctx = nfs_file_open_context(filp); 607 ctx = nfs_file_open_context(filp);
608 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) 608 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
609 return 1; 609 return 1;
610 return 0; 610 return 0;
611 } 611 }
612 612
613 ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, 613 ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
614 unsigned long nr_segs, loff_t pos) 614 unsigned long nr_segs, loff_t pos)
615 { 615 {
616 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 616 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
617 struct inode * inode = dentry->d_inode; 617 struct inode * inode = dentry->d_inode;
618 unsigned long written = 0; 618 unsigned long written = 0;
619 ssize_t result; 619 ssize_t result;
620 size_t count = iov_length(iov, nr_segs); 620 size_t count = iov_length(iov, nr_segs);
621 621
622 if (iocb->ki_filp->f_flags & O_DIRECT) 622 if (iocb->ki_filp->f_flags & O_DIRECT)
623 return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); 623 return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
624 624
625 dprintk("NFS: write(%s/%s, %lu@%Ld)\n", 625 dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
626 dentry->d_parent->d_name.name, dentry->d_name.name, 626 dentry->d_parent->d_name.name, dentry->d_name.name,
627 (unsigned long) count, (long long) pos); 627 (unsigned long) count, (long long) pos);
628 628
629 result = -EBUSY; 629 result = -EBUSY;
630 if (IS_SWAPFILE(inode)) 630 if (IS_SWAPFILE(inode))
631 goto out_swapfile; 631 goto out_swapfile;
632 /* 632 /*
633 * O_APPEND implies that we must revalidate the file length. 633 * O_APPEND implies that we must revalidate the file length.
634 */ 634 */
635 if (iocb->ki_filp->f_flags & O_APPEND) { 635 if (iocb->ki_filp->f_flags & O_APPEND) {
636 result = nfs_revalidate_file_size(inode, iocb->ki_filp); 636 result = nfs_revalidate_file_size(inode, iocb->ki_filp);
637 if (result) 637 if (result)
638 goto out; 638 goto out;
639 } 639 }
640 640
641 result = count; 641 result = count;
642 if (!count) 642 if (!count)
643 goto out; 643 goto out;
644 644
645 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 645 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
646 if (result > 0) 646 if (result > 0)
647 written = result; 647 written = result;
648 648
649 /* Return error values for O_DSYNC and IS_SYNC() */ 649 /* Return error values for O_DSYNC and IS_SYNC() */
650 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 650 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
651 int err = vfs_fsync(iocb->ki_filp, 0); 651 int err = vfs_fsync(iocb->ki_filp, 0);
652 if (err < 0) 652 if (err < 0)
653 result = err; 653 result = err;
654 } 654 }
655 if (result > 0) 655 if (result > 0)
656 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); 656 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
657 out: 657 out:
658 return result; 658 return result;
659 659
660 out_swapfile: 660 out_swapfile:
661 printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); 661 printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
662 goto out; 662 goto out;
663 } 663 }
664 EXPORT_SYMBOL_GPL(nfs_file_write); 664 EXPORT_SYMBOL_GPL(nfs_file_write);
665 665
666 ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, 666 ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
667 struct file *filp, loff_t *ppos, 667 struct file *filp, loff_t *ppos,
668 size_t count, unsigned int flags) 668 size_t count, unsigned int flags)
669 { 669 {
670 struct dentry *dentry = filp->f_path.dentry; 670 struct dentry *dentry = filp->f_path.dentry;
671 struct inode *inode = dentry->d_inode; 671 struct inode *inode = dentry->d_inode;
672 unsigned long written = 0; 672 unsigned long written = 0;
673 ssize_t ret; 673 ssize_t ret;
674 674
675 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", 675 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
676 dentry->d_parent->d_name.name, dentry->d_name.name, 676 dentry->d_parent->d_name.name, dentry->d_name.name,
677 (unsigned long) count, (unsigned long long) *ppos); 677 (unsigned long) count, (unsigned long long) *ppos);
678 678
679 /* 679 /*
680 * The combination of splice and an O_APPEND destination is disallowed. 680 * The combination of splice and an O_APPEND destination is disallowed.
681 */ 681 */
682 682
683 ret = generic_file_splice_write(pipe, filp, ppos, count, flags); 683 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
684 if (ret > 0) 684 if (ret > 0)
685 written = ret; 685 written = ret;
686 686
687 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 687 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
688 int err = vfs_fsync(filp, 0); 688 int err = vfs_fsync(filp, 0);
689 if (err < 0) 689 if (err < 0)
690 ret = err; 690 ret = err;
691 } 691 }
692 if (ret > 0) 692 if (ret > 0)
693 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); 693 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
694 return ret; 694 return ret;
695 } 695 }
696 EXPORT_SYMBOL_GPL(nfs_file_splice_write); 696 EXPORT_SYMBOL_GPL(nfs_file_splice_write);
697 697
698 static int 698 static int
699 do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 699 do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
700 { 700 {
701 struct inode *inode = filp->f_mapping->host; 701 struct inode *inode = filp->f_mapping->host;
702 int status = 0; 702 int status = 0;
703 unsigned int saved_type = fl->fl_type; 703 unsigned int saved_type = fl->fl_type;
704 704
705 /* Try local locking first */ 705 /* Try local locking first */
706 posix_test_lock(filp, fl); 706 posix_test_lock(filp, fl);
707 if (fl->fl_type != F_UNLCK) { 707 if (fl->fl_type != F_UNLCK) {
708 /* found a conflict */ 708 /* found a conflict */
709 goto out; 709 goto out;
710 } 710 }
711 fl->fl_type = saved_type; 711 fl->fl_type = saved_type;
712 712
713 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) 713 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
714 goto out_noconflict; 714 goto out_noconflict;
715 715
716 if (is_local) 716 if (is_local)
717 goto out_noconflict; 717 goto out_noconflict;
718 718
719 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 719 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
720 out: 720 out:
721 return status; 721 return status;
722 out_noconflict: 722 out_noconflict:
723 fl->fl_type = F_UNLCK; 723 fl->fl_type = F_UNLCK;
724 goto out; 724 goto out;
725 } 725 }
726 726
727 static int do_vfs_lock(struct file *file, struct file_lock *fl) 727 static int do_vfs_lock(struct file *file, struct file_lock *fl)
728 { 728 {
729 int res = 0; 729 int res = 0;
730 switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { 730 switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
731 case FL_POSIX: 731 case FL_POSIX:
732 res = posix_lock_file_wait(file, fl); 732 res = posix_lock_file_wait(file, fl);
733 break; 733 break;
734 case FL_FLOCK: 734 case FL_FLOCK:
735 res = flock_lock_file_wait(file, fl); 735 res = flock_lock_file_wait(file, fl);
736 break; 736 break;
737 default: 737 default:
738 BUG(); 738 BUG();
739 } 739 }
740 return res; 740 return res;
741 } 741 }
742 742
743 static int 743 static int
744 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 744 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
745 { 745 {
746 struct inode *inode = filp->f_mapping->host; 746 struct inode *inode = filp->f_mapping->host;
747 int status; 747 int status;
748 748
749 /* 749 /*
750 * Flush all pending writes before doing anything 750 * Flush all pending writes before doing anything
751 * with locks.. 751 * with locks..
752 */ 752 */
753 nfs_sync_mapping(filp->f_mapping); 753 nfs_sync_mapping(filp->f_mapping);
754 754
755 /* NOTE: special case 755 /* NOTE: special case
756 * If we're signalled while cleaning up locks on process exit, we 756 * If we're signalled while cleaning up locks on process exit, we
757 * still need to complete the unlock. 757 * still need to complete the unlock.
758 */ 758 */
759 /* 759 /*
760 * Use local locking if mounted with "-onolock" or with appropriate 760 * Use local locking if mounted with "-onolock" or with appropriate
761 * "-olocal_lock=" 761 * "-olocal_lock="
762 */ 762 */
763 if (!is_local) 763 if (!is_local)
764 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 764 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
765 else 765 else
766 status = do_vfs_lock(filp, fl); 766 status = do_vfs_lock(filp, fl);
767 return status; 767 return status;
768 } 768 }
769 769
770 static int 770 static int
771 is_time_granular(struct timespec *ts) { 771 is_time_granular(struct timespec *ts) {
772 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); 772 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
773 } 773 }
774 774
775 static int 775 static int
776 do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 776 do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
777 { 777 {
778 struct inode *inode = filp->f_mapping->host; 778 struct inode *inode = filp->f_mapping->host;
779 int status; 779 int status;
780 780
781 /* 781 /*
782 * Flush all pending writes before doing anything 782 * Flush all pending writes before doing anything
783 * with locks.. 783 * with locks..
784 */ 784 */
785 status = nfs_sync_mapping(filp->f_mapping); 785 status = nfs_sync_mapping(filp->f_mapping);
786 if (status != 0) 786 if (status != 0)
787 goto out; 787 goto out;
788 788
789 /* 789 /*
790 * Use local locking if mounted with "-onolock" or with appropriate 790 * Use local locking if mounted with "-onolock" or with appropriate
791 * "-olocal_lock=" 791 * "-olocal_lock="
792 */ 792 */
793 if (!is_local) 793 if (!is_local)
794 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 794 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
795 else 795 else
796 status = do_vfs_lock(filp, fl); 796 status = do_vfs_lock(filp, fl);
797 if (status < 0) 797 if (status < 0)
798 goto out; 798 goto out;
799 799
800 /* 800 /*
801 * Revalidate the cache if the server has time stamps granular 801 * Revalidate the cache if the server has time stamps granular
802 * enough to detect subsecond changes. Otherwise, clear the 802 * enough to detect subsecond changes. Otherwise, clear the
803 * cache to prevent missing any changes. 803 * cache to prevent missing any changes.
804 * 804 *
805 * This makes locking act as a cache coherency point. 805 * This makes locking act as a cache coherency point.
806 */ 806 */
807 nfs_sync_mapping(filp->f_mapping); 807 nfs_sync_mapping(filp->f_mapping);
808 if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { 808 if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
809 if (is_time_granular(&NFS_SERVER(inode)->time_delta)) 809 if (is_time_granular(&NFS_SERVER(inode)->time_delta))
810 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 810 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
811 else 811 else
812 nfs_zap_caches(inode); 812 nfs_zap_caches(inode);
813 } 813 }
814 out: 814 out:
815 return status; 815 return status;
816 } 816 }
817 817
818 /* 818 /*
819 * Lock a (portion of) a file 819 * Lock a (portion of) a file
820 */ 820 */
821 int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) 821 int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
822 { 822 {
823 struct inode *inode = filp->f_mapping->host; 823 struct inode *inode = filp->f_mapping->host;
824 int ret = -ENOLCK; 824 int ret = -ENOLCK;
825 int is_local = 0; 825 int is_local = 0;
826 826
827 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", 827 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
828 filp->f_path.dentry->d_parent->d_name.name, 828 filp->f_path.dentry->d_parent->d_name.name,
829 filp->f_path.dentry->d_name.name, 829 filp->f_path.dentry->d_name.name,
830 fl->fl_type, fl->fl_flags, 830 fl->fl_type, fl->fl_flags,
831 (long long)fl->fl_start, (long long)fl->fl_end); 831 (long long)fl->fl_start, (long long)fl->fl_end);
832 832
833 nfs_inc_stats(inode, NFSIOS_VFSLOCK); 833 nfs_inc_stats(inode, NFSIOS_VFSLOCK);
834 834
835 /* No mandatory locks over NFS */ 835 /* No mandatory locks over NFS */
836 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 836 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
837 goto out_err; 837 goto out_err;
838 838
839 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) 839 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
840 is_local = 1; 840 is_local = 1;
841 841
842 if (NFS_PROTO(inode)->lock_check_bounds != NULL) { 842 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
843 ret = NFS_PROTO(inode)->lock_check_bounds(fl); 843 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
844 if (ret < 0) 844 if (ret < 0)
845 goto out_err; 845 goto out_err;
846 } 846 }
847 847
848 if (IS_GETLK(cmd)) 848 if (IS_GETLK(cmd))
849 ret = do_getlk(filp, cmd, fl, is_local); 849 ret = do_getlk(filp, cmd, fl, is_local);
850 else if (fl->fl_type == F_UNLCK) 850 else if (fl->fl_type == F_UNLCK)
851 ret = do_unlk(filp, cmd, fl, is_local); 851 ret = do_unlk(filp, cmd, fl, is_local);
852 else 852 else
853 ret = do_setlk(filp, cmd, fl, is_local); 853 ret = do_setlk(filp, cmd, fl, is_local);
854 out_err: 854 out_err:
855 return ret; 855 return ret;
856 } 856 }
857 EXPORT_SYMBOL_GPL(nfs_lock); 857 EXPORT_SYMBOL_GPL(nfs_lock);
858 858
859 /* 859 /*
860 * Lock a (portion of) a file 860 * Lock a (portion of) a file
861 */ 861 */
862 int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 862 int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
863 { 863 {
864 struct inode *inode = filp->f_mapping->host; 864 struct inode *inode = filp->f_mapping->host;
865 int is_local = 0; 865 int is_local = 0;
866 866
867 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", 867 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
868 filp->f_path.dentry->d_parent->d_name.name, 868 filp->f_path.dentry->d_parent->d_name.name,
869 filp->f_path.dentry->d_name.name, 869 filp->f_path.dentry->d_name.name,
870 fl->fl_type, fl->fl_flags); 870 fl->fl_type, fl->fl_flags);
871 871
872 if (!(fl->fl_flags & FL_FLOCK)) 872 if (!(fl->fl_flags & FL_FLOCK))
873 return -ENOLCK; 873 return -ENOLCK;
874 874
875 /* 875 /*
876 * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of 876 * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
877 * any standard. In principle we might be able to support LOCK_MAND 877 * any standard. In principle we might be able to support LOCK_MAND
878 * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the 878 * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
879 * NFS code is not set up for it. 879 * NFS code is not set up for it.
880 */ 880 */
881 if (fl->fl_type & LOCK_MAND) 881 if (fl->fl_type & LOCK_MAND)
882 return -EINVAL; 882 return -EINVAL;
883 883
884 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) 884 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
885 is_local = 1; 885 is_local = 1;
886 886
887 /* We're simulating flock() locks using posix locks on the server */ 887 /* We're simulating flock() locks using posix locks on the server */
888 fl->fl_owner = (fl_owner_t)filp; 888 fl->fl_owner = (fl_owner_t)filp;
889 fl->fl_start = 0; 889 fl->fl_start = 0;
890 fl->fl_end = OFFSET_MAX; 890 fl->fl_end = OFFSET_MAX;
891 891
892 if (fl->fl_type == F_UNLCK) 892 if (fl->fl_type == F_UNLCK)
893 return do_unlk(filp, cmd, fl, is_local); 893 return do_unlk(filp, cmd, fl, is_local);
894 return do_setlk(filp, cmd, fl, is_local); 894 return do_setlk(filp, cmd, fl, is_local);
895 } 895 }
896 EXPORT_SYMBOL_GPL(nfs_flock); 896 EXPORT_SYMBOL_GPL(nfs_flock);
897 897
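Because nfs_flock() rewrites the request as fl_start = 0, fl_end = OFFSET_MAX with the struct file as owner, an flock() on NFS behaves roughly like the whole-file POSIX lock below (userspace sketch, hypothetical mount path). Note that the userspace lock API's naming agrees with this commit: the field is l_whence, not l_origin.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/nfs/example", O_RDWR);	/* hypothetical mount */
		if (fd < 0)
			return 1;

		struct flock fl = {
			.l_type		= F_WRLCK,
			.l_whence	= SEEK_SET,	/* userspace calls it whence, too */
			.l_start	= 0,
			.l_len		= 0,		/* 0 = to EOF, i.e. the whole file */
		};

		if (fcntl(fd, F_SETLKW, &fl) == 0)
			puts("whole-file lock held");
		close(fd);
		return 0;
	}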
898 /* 898 /*
899 * There is no protocol support for leases, so we have no way to implement 899 * There is no protocol support for leases, so we have no way to implement
900 * them correctly in the face of opens by other clients. 900 * them correctly in the face of opens by other clients.
901 */ 901 */
902 int nfs_setlease(struct file *file, long arg, struct file_lock **fl) 902 int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
903 { 903 {
904 dprintk("NFS: setlease(%s/%s, arg=%ld)\n", 904 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
905 file->f_path.dentry->d_parent->d_name.name, 905 file->f_path.dentry->d_parent->d_name.name,
906 file->f_path.dentry->d_name.name, arg); 906 file->f_path.dentry->d_name.name, arg);
907 return -EINVAL; 907 return -EINVAL;
908 } 908 }
909 EXPORT_SYMBOL_GPL(nfs_setlease); 909 EXPORT_SYMBOL_GPL(nfs_setlease);
910 910
911 const struct file_operations nfs_file_operations = { 911 const struct file_operations nfs_file_operations = {
912 .llseek = nfs_file_llseek, 912 .llseek = nfs_file_llseek,
913 .read = do_sync_read, 913 .read = do_sync_read,
914 .write = do_sync_write, 914 .write = do_sync_write,
915 .aio_read = nfs_file_read, 915 .aio_read = nfs_file_read,
916 .aio_write = nfs_file_write, 916 .aio_write = nfs_file_write,
917 .mmap = nfs_file_mmap, 917 .mmap = nfs_file_mmap,
918 .open = nfs_file_open, 918 .open = nfs_file_open,
919 .flush = nfs_file_flush, 919 .flush = nfs_file_flush,
920 .release = nfs_file_release, 920 .release = nfs_file_release,
921 .fsync = nfs_file_fsync, 921 .fsync = nfs_file_fsync,
922 .lock = nfs_lock, 922 .lock = nfs_lock,
923 .flock = nfs_flock, 923 .flock = nfs_flock,
924 .splice_read = nfs_file_splice_read, 924 .splice_read = nfs_file_splice_read,
925 .splice_write = nfs_file_splice_write, 925 .splice_write = nfs_file_splice_write,
926 .check_flags = nfs_check_flags, 926 .check_flags = nfs_check_flags,
927 .setlease = nfs_setlease, 927 .setlease = nfs_setlease,
928 }; 928 };
929 EXPORT_SYMBOL_GPL(nfs_file_operations); 929 EXPORT_SYMBOL_GPL(nfs_file_operations);
930 930
fs/ocfs2/extent_map.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * extent_map.c 4 * extent_map.c
5 * 5 *
6 * Block/Cluster mapping functions 6 * Block/Cluster mapping functions
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation. 12 * License, version 2, as published by the Free Software Foundation.
13 * 13 *
14 * This program is distributed in the hope that it will be useful, 14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details. 17 * General Public License for more details.
18 * 18 *
19 * You should have received a copy of the GNU General Public 19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the 20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA. 22 * Boston, MA 02111-1307, USA.
23 */ 23 */
24 24
25 #include <linux/fs.h> 25 #include <linux/fs.h>
26 #include <linux/init.h> 26 #include <linux/init.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/fiemap.h> 29 #include <linux/fiemap.h>
30 30
31 #include <cluster/masklog.h> 31 #include <cluster/masklog.h>
32 32
33 #include "ocfs2.h" 33 #include "ocfs2.h"
34 34
35 #include "alloc.h" 35 #include "alloc.h"
36 #include "dlmglue.h" 36 #include "dlmglue.h"
37 #include "extent_map.h" 37 #include "extent_map.h"
38 #include "inode.h" 38 #include "inode.h"
39 #include "super.h" 39 #include "super.h"
40 #include "symlink.h" 40 #include "symlink.h"
41 #include "ocfs2_trace.h" 41 #include "ocfs2_trace.h"
42 42
43 #include "buffer_head_io.h" 43 #include "buffer_head_io.h"
44 44
45 /* 45 /*
46 * The extent caching implementation is intentionally trivial. 46 * The extent caching implementation is intentionally trivial.
47 * 47 *
48 * We only cache a small number of extents stored directly on the 48 * We only cache a small number of extents stored directly on the
49 * inode, so linear order operations are acceptable. If we ever want 49 * inode, so linear order operations are acceptable. If we ever want
50 * to increase the size of the extent map, then these algorithms must 50 * to increase the size of the extent map, then these algorithms must
51 * get smarter. 51 * get smarter.
52 */ 52 */
53 53
54 void ocfs2_extent_map_init(struct inode *inode) 54 void ocfs2_extent_map_init(struct inode *inode)
55 { 55 {
56 struct ocfs2_inode_info *oi = OCFS2_I(inode); 56 struct ocfs2_inode_info *oi = OCFS2_I(inode);
57 57
58 oi->ip_extent_map.em_num_items = 0; 58 oi->ip_extent_map.em_num_items = 0;
59 INIT_LIST_HEAD(&oi->ip_extent_map.em_list); 59 INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
60 } 60 }
61 61
62 static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, 62 static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
63 unsigned int cpos, 63 unsigned int cpos,
64 struct ocfs2_extent_map_item **ret_emi) 64 struct ocfs2_extent_map_item **ret_emi)
65 { 65 {
66 unsigned int range; 66 unsigned int range;
67 struct ocfs2_extent_map_item *emi; 67 struct ocfs2_extent_map_item *emi;
68 68
69 *ret_emi = NULL; 69 *ret_emi = NULL;
70 70
71 list_for_each_entry(emi, &em->em_list, ei_list) { 71 list_for_each_entry(emi, &em->em_list, ei_list) {
72 range = emi->ei_cpos + emi->ei_clusters; 72 range = emi->ei_cpos + emi->ei_clusters;
73 73
74 if (cpos >= emi->ei_cpos && cpos < range) { 74 if (cpos >= emi->ei_cpos && cpos < range) {
75 list_move(&emi->ei_list, &em->em_list); 75 list_move(&emi->ei_list, &em->em_list);
76 76
77 *ret_emi = emi; 77 *ret_emi = emi;
78 break; 78 break;
79 } 79 }
80 } 80 }
81 } 81 }
82 82
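The list_move() in the lookup above is what gives the cache its most-recently-used ordering. A standalone sketch of the same discipline, using a plain array instead of the kernel's list_head (illustrative, simplified types):

#include <stddef.h>

/* Simplified stand-in for an extent map item: logical start and length. */
struct item { unsigned int cpos, clusters; };

static struct item *mru_lookup(struct item *items, int n, unsigned int cpos)
{
	int i, j;

	for (i = 0; i < n; i++) {
		if (cpos >= items[i].cpos &&
		    cpos < items[i].cpos + items[i].clusters) {
			/* Move the hit to the front, mirroring list_move():
			 * hot extents stay cheap to find and the tail becomes
			 * the natural victim when the map is full. */
			struct item hit = items[i];

			for (j = i; j > 0; j--)
				items[j] = items[j - 1];
			items[0] = hit;
			return &items[0];
		}
	}
	return NULL;	/* not cached */
}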
83 static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, 83 static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
84 unsigned int *phys, unsigned int *len, 84 unsigned int *phys, unsigned int *len,
85 unsigned int *flags) 85 unsigned int *flags)
86 { 86 {
87 unsigned int coff; 87 unsigned int coff;
88 struct ocfs2_inode_info *oi = OCFS2_I(inode); 88 struct ocfs2_inode_info *oi = OCFS2_I(inode);
89 struct ocfs2_extent_map_item *emi; 89 struct ocfs2_extent_map_item *emi;
90 90
91 spin_lock(&oi->ip_lock); 91 spin_lock(&oi->ip_lock);
92 92
93 __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); 93 __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
94 if (emi) { 94 if (emi) {
95 coff = cpos - emi->ei_cpos; 95 coff = cpos - emi->ei_cpos;
96 *phys = emi->ei_phys + coff; 96 *phys = emi->ei_phys + coff;
97 if (len) 97 if (len)
98 *len = emi->ei_clusters - coff; 98 *len = emi->ei_clusters - coff;
99 if (flags) 99 if (flags)
100 *flags = emi->ei_flags; 100 *flags = emi->ei_flags;
101 } 101 }
102 102
103 spin_unlock(&oi->ip_lock); 103 spin_unlock(&oi->ip_lock);
104 104
105 if (emi == NULL) 105 if (emi == NULL)
106 return -ENOENT; 106 return -ENOENT;
107 107
108 return 0; 108 return 0;
109 } 109 }
110 110
111 /* 111 /*
112 * Forget about all clusters equal to or greater than cpos. 112 * Forget about all clusters equal to or greater than cpos.
113 */ 113 */
114 void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) 114 void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
115 { 115 {
116 struct ocfs2_extent_map_item *emi, *n; 116 struct ocfs2_extent_map_item *emi, *n;
117 struct ocfs2_inode_info *oi = OCFS2_I(inode); 117 struct ocfs2_inode_info *oi = OCFS2_I(inode);
118 struct ocfs2_extent_map *em = &oi->ip_extent_map; 118 struct ocfs2_extent_map *em = &oi->ip_extent_map;
119 LIST_HEAD(tmp_list); 119 LIST_HEAD(tmp_list);
120 unsigned int range; 120 unsigned int range;
121 121
122 spin_lock(&oi->ip_lock); 122 spin_lock(&oi->ip_lock);
123 list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { 123 list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
124 if (emi->ei_cpos >= cpos) { 124 if (emi->ei_cpos >= cpos) {
125 /* Full truncate of this record. */ 125 /* Full truncate of this record. */
126 list_move(&emi->ei_list, &tmp_list); 126 list_move(&emi->ei_list, &tmp_list);
127 BUG_ON(em->em_num_items == 0); 127 BUG_ON(em->em_num_items == 0);
128 em->em_num_items--; 128 em->em_num_items--;
129 continue; 129 continue;
130 } 130 }
131 131
132 range = emi->ei_cpos + emi->ei_clusters; 132 range = emi->ei_cpos + emi->ei_clusters;
133 if (range > cpos) { 133 if (range > cpos) {
134 /* Partial truncate */ 134 /* Partial truncate */
135 emi->ei_clusters = cpos - emi->ei_cpos; 135 emi->ei_clusters = cpos - emi->ei_cpos;
136 } 136 }
137 } 137 }
138 spin_unlock(&oi->ip_lock); 138 spin_unlock(&oi->ip_lock);
139 139
140 list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { 140 list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
141 list_del(&emi->ei_list); 141 list_del(&emi->ei_list);
142 kfree(emi); 142 kfree(emi);
143 } 143 }
144 } 144 }
145 145
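A worked example of the two truncate cases above (illustrative numbers, not taken from the patch):

/*
 * With cached extents A = [cpos 0, 8) and B = [cpos 8, 16),
 * ocfs2_extent_map_trunc(inode, 4) moves B to tmp_list in full
 * (ei_cpos 8 >= 4) and clips A to [0, 4) in place (partial truncate:
 * ei_clusters = 4 - 0). The items on tmp_list are only kfree()d
 * after ip_lock has been dropped.
 */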
146 /* 146 /*
147 * Is any part of emi2 contained within emi1? 147 * Is any part of emi2 contained within emi1?
148 */ 148 */
149 static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, 149 static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
150 struct ocfs2_extent_map_item *emi2) 150 struct ocfs2_extent_map_item *emi2)
151 { 151 {
152 unsigned int range1, range2; 152 unsigned int range1, range2;
153 153
154 /* 154 /*
155 * Check if logical start of emi2 is inside emi1 155 * Check if logical start of emi2 is inside emi1
156 */ 156 */
157 range1 = emi1->ei_cpos + emi1->ei_clusters; 157 range1 = emi1->ei_cpos + emi1->ei_clusters;
158 if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) 158 if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
159 return 1; 159 return 1;
160 160
161 /* 161 /*
162 * Check if logical end of emi2 is inside emi1 162 * Check if logical end of emi2 is inside emi1
163 */ 163 */
164 range2 = emi2->ei_cpos + emi2->ei_clusters; 164 range2 = emi2->ei_cpos + emi2->ei_clusters;
165 if (range2 > emi1->ei_cpos && range2 <= range1) 165 if (range2 > emi1->ei_cpos && range2 <= range1)
166 return 1; 166 return 1;
167 167
168 return 0; 168 return 0;
169 } 169 }
170 170
171 static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, 171 static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
172 struct ocfs2_extent_map_item *src) 172 struct ocfs2_extent_map_item *src)
173 { 173 {
174 dest->ei_cpos = src->ei_cpos; 174 dest->ei_cpos = src->ei_cpos;
175 dest->ei_phys = src->ei_phys; 175 dest->ei_phys = src->ei_phys;
176 dest->ei_clusters = src->ei_clusters; 176 dest->ei_clusters = src->ei_clusters;
177 dest->ei_flags = src->ei_flags; 177 dest->ei_flags = src->ei_flags;
178 } 178 }
179 179
180 /* 180 /*
181 * Try to merge emi with ins. Returns 1 if merge succeeds, zero 181 * Try to merge emi with ins. Returns 1 if merge succeeds, zero
182 * otherwise. 182 * otherwise.
183 */ 183 */
184 static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, 184 static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
185 struct ocfs2_extent_map_item *ins) 185 struct ocfs2_extent_map_item *ins)
186 { 186 {
187 /* 187 /*
188 * Handle contiguousness 188 * Handle contiguousness
189 */ 189 */
190 if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && 190 if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
191 ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && 191 ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
192 ins->ei_flags == emi->ei_flags) { 192 ins->ei_flags == emi->ei_flags) {
193 emi->ei_clusters += ins->ei_clusters; 193 emi->ei_clusters += ins->ei_clusters;
194 return 1; 194 return 1;
195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos && 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
197 ins->ei_flags == emi->ei_flags) { 197 ins->ei_flags == emi->ei_flags) {
198 emi->ei_phys = ins->ei_phys; 198 emi->ei_phys = ins->ei_phys;
199 emi->ei_cpos = ins->ei_cpos; 199 emi->ei_cpos = ins->ei_cpos;
200 emi->ei_clusters += ins->ei_clusters; 200 emi->ei_clusters += ins->ei_clusters;
201 return 1; 201 return 1;
202 } 202 }
203 203
204 /* 204 /*
205 * Overlapping extents - this shouldn't happen unless we've 205 * Overlapping extents - this shouldn't happen unless we've
206 * split an extent to change its flags. That is exceedingly 206 * split an extent to change its flags. That is exceedingly
207 * rare, so there's no sense in trying to optimize it yet. 207 * rare, so there's no sense in trying to optimize it yet.
208 */ 208 */
209 if (ocfs2_ei_is_contained(emi, ins) || 209 if (ocfs2_ei_is_contained(emi, ins) ||
210 ocfs2_ei_is_contained(ins, emi)) { 210 ocfs2_ei_is_contained(ins, emi)) {
211 ocfs2_copy_emi_fields(emi, ins); 211 ocfs2_copy_emi_fields(emi, ins);
212 return 1; 212 return 1;
213 } 213 }
214 214
215 /* No merge was possible. */ 215 /* No merge was possible. */
216 return 0; 216 return 0;
217 } 217 }
218 218
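To make the contiguity rules above concrete, a minimal standalone sketch (simplified fields, not the kernel structures); the comments walk one merge in each direction:

struct ext { unsigned int cpos, phys, clusters, flags; };

/* emi = {cpos 10, phys 100, len 5} followed immediately by
 * ins = {cpos 15, phys 105, len 3} with equal flags merges to
 * {cpos 10, phys 100, len 8}; the mirror case prepends instead. */
static int try_merge(struct ext *emi, const struct ext *ins)
{
	if (ins->phys == emi->phys + emi->clusters &&
	    ins->cpos == emi->cpos + emi->clusters &&
	    ins->flags == emi->flags) {
		emi->clusters += ins->clusters;		/* append */
		return 1;
	}
	if (ins->phys + ins->clusters == emi->phys &&
	    ins->cpos + ins->clusters == emi->cpos &&
	    ins->flags == emi->flags) {
		emi->cpos = ins->cpos;			/* prepend */
		emi->phys = ins->phys;
		emi->clusters += ins->clusters;
		return 1;
	}
	return 0;
}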
219 /* 219 /*
220 * In order to reduce complexity on the caller, this insert function 220 * In order to reduce complexity on the caller, this insert function
221 * is intentionally liberal in what it will accept. 221 * is intentionally liberal in what it will accept.
222 * 222 *
223 * The only rule is that the truncate call *must* be used whenever 223 * The only rule is that the truncate call *must* be used whenever
224 * records have been deleted. This avoids inserting overlapping 224 * records have been deleted. This avoids inserting overlapping
225 * records with different physical mappings. 225 * records with different physical mappings.
226 */ 226 */
227 void ocfs2_extent_map_insert_rec(struct inode *inode, 227 void ocfs2_extent_map_insert_rec(struct inode *inode,
228 struct ocfs2_extent_rec *rec) 228 struct ocfs2_extent_rec *rec)
229 { 229 {
230 struct ocfs2_inode_info *oi = OCFS2_I(inode); 230 struct ocfs2_inode_info *oi = OCFS2_I(inode);
231 struct ocfs2_extent_map *em = &oi->ip_extent_map; 231 struct ocfs2_extent_map *em = &oi->ip_extent_map;
232 struct ocfs2_extent_map_item *emi, *new_emi = NULL; 232 struct ocfs2_extent_map_item *emi, *new_emi = NULL;
233 struct ocfs2_extent_map_item ins; 233 struct ocfs2_extent_map_item ins;
234 234
235 ins.ei_cpos = le32_to_cpu(rec->e_cpos); 235 ins.ei_cpos = le32_to_cpu(rec->e_cpos);
236 ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, 236 ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
237 le64_to_cpu(rec->e_blkno)); 237 le64_to_cpu(rec->e_blkno));
238 ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); 238 ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
239 ins.ei_flags = rec->e_flags; 239 ins.ei_flags = rec->e_flags;
240 240
241 search: 241 search:
242 spin_lock(&oi->ip_lock); 242 spin_lock(&oi->ip_lock);
243 243
244 list_for_each_entry(emi, &em->em_list, ei_list) { 244 list_for_each_entry(emi, &em->em_list, ei_list) {
245 if (ocfs2_try_to_merge_extent_map(emi, &ins)) { 245 if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
246 list_move(&emi->ei_list, &em->em_list); 246 list_move(&emi->ei_list, &em->em_list);
247 spin_unlock(&oi->ip_lock); 247 spin_unlock(&oi->ip_lock);
248 goto out; 248 goto out;
249 } 249 }
250 } 250 }
251 251
252 /* 252 /*
253 * No item could be merged. 253 * No item could be merged.
254 * 254 *
255 * Either allocate and add a new item, or overwrite the least 255 * Either allocate and add a new item, or overwrite the least
256 * recently used item. 256 * recently used item.
257 */ 257 */
258 258
259 if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { 259 if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
260 if (new_emi == NULL) { 260 if (new_emi == NULL) {
261 spin_unlock(&oi->ip_lock); 261 spin_unlock(&oi->ip_lock);
262 262
263 new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); 263 new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
264 if (new_emi == NULL) 264 if (new_emi == NULL)
265 goto out; 265 goto out;
266 266
267 goto search; 267 goto search;
268 } 268 }
269 269
270 ocfs2_copy_emi_fields(new_emi, &ins); 270 ocfs2_copy_emi_fields(new_emi, &ins);
271 list_add(&new_emi->ei_list, &em->em_list); 271 list_add(&new_emi->ei_list, &em->em_list);
272 em->em_num_items++; 272 em->em_num_items++;
273 new_emi = NULL; 273 new_emi = NULL;
274 } else { 274 } else {
275 BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); 275 BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
276 emi = list_entry(em->em_list.prev, 276 emi = list_entry(em->em_list.prev,
277 struct ocfs2_extent_map_item, ei_list); 277 struct ocfs2_extent_map_item, ei_list);
278 list_move(&emi->ei_list, &em->em_list); 278 list_move(&emi->ei_list, &em->em_list);
279 ocfs2_copy_emi_fields(emi, &ins); 279 ocfs2_copy_emi_fields(emi, &ins);
280 } 280 }
281 281
282 spin_unlock(&oi->ip_lock); 282 spin_unlock(&oi->ip_lock);
283 283
284 out: 284 out:
285 if (new_emi) 285 if (new_emi)
286 kfree(new_emi); 286 kfree(new_emi);
287 } 287 }
288 288
289 static int ocfs2_last_eb_is_empty(struct inode *inode, 289 static int ocfs2_last_eb_is_empty(struct inode *inode,
290 struct ocfs2_dinode *di) 290 struct ocfs2_dinode *di)
291 { 291 {
292 int ret, next_free; 292 int ret, next_free;
293 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk); 293 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
294 struct buffer_head *eb_bh = NULL; 294 struct buffer_head *eb_bh = NULL;
295 struct ocfs2_extent_block *eb; 295 struct ocfs2_extent_block *eb;
296 struct ocfs2_extent_list *el; 296 struct ocfs2_extent_list *el;
297 297
298 ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh); 298 ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
299 if (ret) { 299 if (ret) {
300 mlog_errno(ret); 300 mlog_errno(ret);
301 goto out; 301 goto out;
302 } 302 }
303 303
304 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 304 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
305 el = &eb->h_list; 305 el = &eb->h_list;
306 306
307 if (el->l_tree_depth) { 307 if (el->l_tree_depth) {
308 ocfs2_error(inode->i_sb, 308 ocfs2_error(inode->i_sb,
309 "Inode %lu has non zero tree depth in " 309 "Inode %lu has non zero tree depth in "
310 "leaf block %llu\n", inode->i_ino, 310 "leaf block %llu\n", inode->i_ino,
311 (unsigned long long)eb_bh->b_blocknr); 311 (unsigned long long)eb_bh->b_blocknr);
312 ret = -EROFS; 312 ret = -EROFS;
313 goto out; 313 goto out;
314 } 314 }
315 315
316 next_free = le16_to_cpu(el->l_next_free_rec); 316 next_free = le16_to_cpu(el->l_next_free_rec);
317 317
318 if (next_free == 0 || 318 if (next_free == 0 ||
319 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) 319 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
320 ret = 1; 320 ret = 1;
321 321
322 out: 322 out:
323 brelse(eb_bh); 323 brelse(eb_bh);
324 return ret; 324 return ret;
325 } 325 }
326 326
327 /* 327 /*
328 * Return the 1st index within el which contains an extent start 328 * Return the 1st index within el which contains an extent start
329 * larger than v_cluster. 329 * larger than v_cluster.
330 */ 330 */
331 static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, 331 static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
332 u32 v_cluster) 332 u32 v_cluster)
333 { 333 {
334 int i; 334 int i;
335 struct ocfs2_extent_rec *rec; 335 struct ocfs2_extent_rec *rec;
336 336
337 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 337 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
338 rec = &el->l_recs[i]; 338 rec = &el->l_recs[i];
339 339
340 if (v_cluster < le32_to_cpu(rec->e_cpos)) 340 if (v_cluster < le32_to_cpu(rec->e_cpos))
341 break; 341 break;
342 } 342 }
343 343
344 return i; 344 return i;
345 } 345 }
346 346
347 /* 347 /*
348 * Figure out the size of a hole which starts at v_cluster within the given 348 * Figure out the size of a hole which starts at v_cluster within the given
349 * extent list. 349 * extent list.
350 * 350 *
351 * If there is no more allocation past v_cluster, we return the maximum 351 * If there is no more allocation past v_cluster, we return the maximum
352 * cluster count (UINT_MAX) minus v_cluster. 352 * cluster count (UINT_MAX) minus v_cluster.
353 * 353 *
354 * If we have in-inode extents, then el points to the dinode list and 354 * If we have in-inode extents, then el points to the dinode list and
355 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block 355 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
356 * containing el. 356 * containing el.
357 */ 357 */
358 int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, 358 int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
359 struct ocfs2_extent_list *el, 359 struct ocfs2_extent_list *el,
360 struct buffer_head *eb_bh, 360 struct buffer_head *eb_bh,
361 u32 v_cluster, 361 u32 v_cluster,
362 u32 *num_clusters) 362 u32 *num_clusters)
363 { 363 {
364 int ret, i; 364 int ret, i;
365 struct buffer_head *next_eb_bh = NULL; 365 struct buffer_head *next_eb_bh = NULL;
366 struct ocfs2_extent_block *eb, *next_eb; 366 struct ocfs2_extent_block *eb, *next_eb;
367 367
368 i = ocfs2_search_for_hole_index(el, v_cluster); 368 i = ocfs2_search_for_hole_index(el, v_cluster);
369 369
370 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { 370 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
371 eb = (struct ocfs2_extent_block *)eb_bh->b_data; 371 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
372 372
373 /* 373 /*
374 * Check the next leaf for any extents. 374 * Check the next leaf for any extents.
375 */ 375 */
376 376
377 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 377 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
378 goto no_more_extents; 378 goto no_more_extents;
379 379
380 ret = ocfs2_read_extent_block(ci, 380 ret = ocfs2_read_extent_block(ci,
381 le64_to_cpu(eb->h_next_leaf_blk), 381 le64_to_cpu(eb->h_next_leaf_blk),
382 &next_eb_bh); 382 &next_eb_bh);
383 if (ret) { 383 if (ret) {
384 mlog_errno(ret); 384 mlog_errno(ret);
385 goto out; 385 goto out;
386 } 386 }
387 387
388 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; 388 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
389 el = &next_eb->h_list; 389 el = &next_eb->h_list;
390 i = ocfs2_search_for_hole_index(el, v_cluster); 390 i = ocfs2_search_for_hole_index(el, v_cluster);
391 } 391 }
392 392
393 no_more_extents: 393 no_more_extents:
394 if (i == le16_to_cpu(el->l_next_free_rec)) { 394 if (i == le16_to_cpu(el->l_next_free_rec)) {
395 /* 395 /*
396 * We're at the end of our existing allocation. Just 396 * We're at the end of our existing allocation. Just
397 * return the maximum number of clusters we could 397 * return the maximum number of clusters we could
398 * possibly allocate. 398 * possibly allocate.
399 */ 399 */
400 *num_clusters = UINT_MAX - v_cluster; 400 *num_clusters = UINT_MAX - v_cluster;
401 } else { 401 } else {
402 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; 402 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
403 } 403 }
404 404
405 ret = 0; 405 ret = 0;
406 out: 406 out:
407 brelse(next_eb_bh); 407 brelse(next_eb_bh);
408 return ret; 408 return ret;
409 } 409 }
410 410
411 static int ocfs2_get_clusters_nocache(struct inode *inode, 411 static int ocfs2_get_clusters_nocache(struct inode *inode,
412 struct buffer_head *di_bh, 412 struct buffer_head *di_bh,
413 u32 v_cluster, unsigned int *hole_len, 413 u32 v_cluster, unsigned int *hole_len,
414 struct ocfs2_extent_rec *ret_rec, 414 struct ocfs2_extent_rec *ret_rec,
415 unsigned int *is_last) 415 unsigned int *is_last)
416 { 416 {
417 int i, ret, tree_height, len; 417 int i, ret, tree_height, len;
418 struct ocfs2_dinode *di; 418 struct ocfs2_dinode *di;
419 struct ocfs2_extent_block *uninitialized_var(eb); 419 struct ocfs2_extent_block *uninitialized_var(eb);
420 struct ocfs2_extent_list *el; 420 struct ocfs2_extent_list *el;
421 struct ocfs2_extent_rec *rec; 421 struct ocfs2_extent_rec *rec;
422 struct buffer_head *eb_bh = NULL; 422 struct buffer_head *eb_bh = NULL;
423 423
424 memset(ret_rec, 0, sizeof(*ret_rec)); 424 memset(ret_rec, 0, sizeof(*ret_rec));
425 if (is_last) 425 if (is_last)
426 *is_last = 0; 426 *is_last = 0;
427 427
428 di = (struct ocfs2_dinode *) di_bh->b_data; 428 di = (struct ocfs2_dinode *) di_bh->b_data;
429 el = &di->id2.i_list; 429 el = &di->id2.i_list;
430 tree_height = le16_to_cpu(el->l_tree_depth); 430 tree_height = le16_to_cpu(el->l_tree_depth);
431 431
432 if (tree_height > 0) { 432 if (tree_height > 0) {
433 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, 433 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
434 &eb_bh); 434 &eb_bh);
435 if (ret) { 435 if (ret) {
436 mlog_errno(ret); 436 mlog_errno(ret);
437 goto out; 437 goto out;
438 } 438 }
439 439
440 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 440 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
441 el = &eb->h_list; 441 el = &eb->h_list;
442 442
443 if (el->l_tree_depth) { 443 if (el->l_tree_depth) {
444 ocfs2_error(inode->i_sb, 444 ocfs2_error(inode->i_sb,
445 "Inode %lu has non zero tree depth in " 445 "Inode %lu has non zero tree depth in "
446 "leaf block %llu\n", inode->i_ino, 446 "leaf block %llu\n", inode->i_ino,
447 (unsigned long long)eb_bh->b_blocknr); 447 (unsigned long long)eb_bh->b_blocknr);
448 ret = -EROFS; 448 ret = -EROFS;
449 goto out; 449 goto out;
450 } 450 }
451 } 451 }
452 452
453 i = ocfs2_search_extent_list(el, v_cluster); 453 i = ocfs2_search_extent_list(el, v_cluster);
454 if (i == -1) { 454 if (i == -1) {
455 /* 455 /*
456 * Holes can be larger than the maximum size of an 456 * Holes can be larger than the maximum size of an
457 * extent, so we return their lengths in a separate 457 * extent, so we return their lengths in a separate
458 * field. 458 * field.
459 */ 459 */
460 if (hole_len) { 460 if (hole_len) {
461 ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), 461 ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
462 el, eb_bh, 462 el, eb_bh,
463 v_cluster, &len); 463 v_cluster, &len);
464 if (ret) { 464 if (ret) {
465 mlog_errno(ret); 465 mlog_errno(ret);
466 goto out; 466 goto out;
467 } 467 }
468 468
469 *hole_len = len; 469 *hole_len = len;
470 } 470 }
471 goto out_hole; 471 goto out_hole;
472 } 472 }
473 473
474 rec = &el->l_recs[i]; 474 rec = &el->l_recs[i];
475 475
476 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 476 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
477 477
478 if (!rec->e_blkno) { 478 if (!rec->e_blkno) {
479 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 479 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
480 "record (%u, %u, 0)", inode->i_ino, 480 "record (%u, %u, 0)", inode->i_ino,
481 le32_to_cpu(rec->e_cpos), 481 le32_to_cpu(rec->e_cpos),
482 ocfs2_rec_clusters(el, rec)); 482 ocfs2_rec_clusters(el, rec));
483 ret = -EROFS; 483 ret = -EROFS;
484 goto out; 484 goto out;
485 } 485 }
486 486
487 *ret_rec = *rec; 487 *ret_rec = *rec;
488 488
489 /* 489 /*
490 * Checking for last extent is potentially expensive - we 490 * Checking for last extent is potentially expensive - we
491 * might have to look at the next leaf over to see if it's 491 * might have to look at the next leaf over to see if it's
492 * empty. 492 * empty.
493 * 493 *
494 * The first two checks are to see whether the caller even 494 * The first two checks are to see whether the caller even
495 * cares for this information, and if the extent is at least 495 * cares for this information, and if the extent is at least
496 * the last in its list. 496 * the last in its list.
497 * 497 *
498 * If those hold true, then the extent is last if any of the 498 * If those hold true, then the extent is last if any of the
499 * additional conditions hold true: 499 * additional conditions hold true:
500 * - Extent list is in-inode 500 * - Extent list is in-inode
501 * - Extent list is right-most 501 * - Extent list is right-most
502 * - Extent list is 2nd to rightmost, with empty right-most 502 * - Extent list is 2nd to rightmost, with empty right-most
503 */ 503 */
504 if (is_last) { 504 if (is_last) {
505 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) { 505 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
506 if (tree_height == 0) 506 if (tree_height == 0)
507 *is_last = 1; 507 *is_last = 1;
508 else if (eb->h_blkno == di->i_last_eb_blk) 508 else if (eb->h_blkno == di->i_last_eb_blk)
509 *is_last = 1; 509 *is_last = 1;
510 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) { 510 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
511 ret = ocfs2_last_eb_is_empty(inode, di); 511 ret = ocfs2_last_eb_is_empty(inode, di);
512 if (ret < 0) { 512 if (ret < 0) {
513 mlog_errno(ret); 513 mlog_errno(ret);
514 goto out; 514 goto out;
515 } 515 }
516 if (ret == 1) 516 if (ret == 1)
517 *is_last = 1; 517 *is_last = 1;
518 } 518 }
519 } 519 }
520 } 520 }
521 521
522 out_hole: 522 out_hole:
523 ret = 0; 523 ret = 0;
524 out: 524 out:
525 brelse(eb_bh); 525 brelse(eb_bh);
526 return ret; 526 return ret;
527 } 527 }
528 528
529 static void ocfs2_relative_extent_offsets(struct super_block *sb, 529 static void ocfs2_relative_extent_offsets(struct super_block *sb,
530 u32 v_cluster, 530 u32 v_cluster,
531 struct ocfs2_extent_rec *rec, 531 struct ocfs2_extent_rec *rec,
532 u32 *p_cluster, u32 *num_clusters) 532 u32 *p_cluster, u32 *num_clusters)
533 533
534 { 534 {
535 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos); 535 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
536 536
537 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno)); 537 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
538 *p_cluster = *p_cluster + coff; 538 *p_cluster = *p_cluster + coff;
539 539
540 if (num_clusters) 540 if (num_clusters)
541 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; 541 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
542 } 542 }
543 543
544 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 544 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
545 u32 *p_cluster, u32 *num_clusters, 545 u32 *p_cluster, u32 *num_clusters,
546 struct ocfs2_extent_list *el, 546 struct ocfs2_extent_list *el,
547 unsigned int *extent_flags) 547 unsigned int *extent_flags)
548 { 548 {
549 int ret = 0, i; 549 int ret = 0, i;
550 struct buffer_head *eb_bh = NULL; 550 struct buffer_head *eb_bh = NULL;
551 struct ocfs2_extent_block *eb; 551 struct ocfs2_extent_block *eb;
552 struct ocfs2_extent_rec *rec; 552 struct ocfs2_extent_rec *rec;
553 u32 coff; 553 u32 coff;
554 554
555 if (el->l_tree_depth) { 555 if (el->l_tree_depth) {
556 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, 556 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
557 &eb_bh); 557 &eb_bh);
558 if (ret) { 558 if (ret) {
559 mlog_errno(ret); 559 mlog_errno(ret);
560 goto out; 560 goto out;
561 } 561 }
562 562
563 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 563 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
564 el = &eb->h_list; 564 el = &eb->h_list;
565 565
566 if (el->l_tree_depth) { 566 if (el->l_tree_depth) {
567 ocfs2_error(inode->i_sb, 567 ocfs2_error(inode->i_sb,
568 "Inode %lu has non zero tree depth in " 568 "Inode %lu has non zero tree depth in "
569 "xattr leaf block %llu\n", inode->i_ino, 569 "xattr leaf block %llu\n", inode->i_ino,
570 (unsigned long long)eb_bh->b_blocknr); 570 (unsigned long long)eb_bh->b_blocknr);
571 ret = -EROFS; 571 ret = -EROFS;
572 goto out; 572 goto out;
573 } 573 }
574 } 574 }
575 575
576 i = ocfs2_search_extent_list(el, v_cluster); 576 i = ocfs2_search_extent_list(el, v_cluster);
577 if (i == -1) { 577 if (i == -1) {
578 ret = -EROFS; 578 ret = -EROFS;
579 mlog_errno(ret); 579 mlog_errno(ret);
580 goto out; 580 goto out;
581 } else { 581 } else {
582 rec = &el->l_recs[i]; 582 rec = &el->l_recs[i];
583 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 583 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
584 584
585 if (!rec->e_blkno) { 585 if (!rec->e_blkno) {
586 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 586 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
587 "record (%u, %u, 0) in xattr", inode->i_ino, 587 "record (%u, %u, 0) in xattr", inode->i_ino,
588 le32_to_cpu(rec->e_cpos), 588 le32_to_cpu(rec->e_cpos),
589 ocfs2_rec_clusters(el, rec)); 589 ocfs2_rec_clusters(el, rec));
590 ret = -EROFS; 590 ret = -EROFS;
591 goto out; 591 goto out;
592 } 592 }
593 coff = v_cluster - le32_to_cpu(rec->e_cpos); 593 coff = v_cluster - le32_to_cpu(rec->e_cpos);
594 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, 594 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
595 le64_to_cpu(rec->e_blkno)); 595 le64_to_cpu(rec->e_blkno));
596 *p_cluster = *p_cluster + coff; 596 *p_cluster = *p_cluster + coff;
597 if (num_clusters) 597 if (num_clusters)
598 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 598 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
599 599
600 if (extent_flags) 600 if (extent_flags)
601 *extent_flags = rec->e_flags; 601 *extent_flags = rec->e_flags;
602 } 602 }
603 out: 603 out:
604 if (eb_bh) 604 if (eb_bh)
605 brelse(eb_bh); 605 brelse(eb_bh);
606 return ret; 606 return ret;
607 } 607 }
608 608
609 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 609 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
610 u32 *p_cluster, u32 *num_clusters, 610 u32 *p_cluster, u32 *num_clusters,
611 unsigned int *extent_flags) 611 unsigned int *extent_flags)
612 { 612 {
613 int ret; 613 int ret;
614 unsigned int uninitialized_var(hole_len), flags = 0; 614 unsigned int uninitialized_var(hole_len), flags = 0;
615 struct buffer_head *di_bh = NULL; 615 struct buffer_head *di_bh = NULL;
616 struct ocfs2_extent_rec rec; 616 struct ocfs2_extent_rec rec;
617 617
618 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 618 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
619 ret = -ERANGE; 619 ret = -ERANGE;
620 mlog_errno(ret); 620 mlog_errno(ret);
621 goto out; 621 goto out;
622 } 622 }
623 623
624 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, 624 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
625 num_clusters, extent_flags); 625 num_clusters, extent_flags);
626 if (ret == 0) 626 if (ret == 0)
627 goto out; 627 goto out;
628 628
629 ret = ocfs2_read_inode_block(inode, &di_bh); 629 ret = ocfs2_read_inode_block(inode, &di_bh);
630 if (ret) { 630 if (ret) {
631 mlog_errno(ret); 631 mlog_errno(ret);
632 goto out; 632 goto out;
633 } 633 }
634 634
635 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len, 635 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
636 &rec, NULL); 636 &rec, NULL);
637 if (ret) { 637 if (ret) {
638 mlog_errno(ret); 638 mlog_errno(ret);
639 goto out; 639 goto out;
640 } 640 }
641 641
642 if (rec.e_blkno == 0ULL) { 642 if (rec.e_blkno == 0ULL) {
643 /* 643 /*
644 * A hole was found. Return some canned values that 644 * A hole was found. Return some canned values that
645 * callers can key on. If asked for, num_clusters will 645 * callers can key on. If asked for, num_clusters will
646 * be populated with the size of the hole. 646 * be populated with the size of the hole.
647 */ 647 */
648 *p_cluster = 0; 648 *p_cluster = 0;
649 if (num_clusters) { 649 if (num_clusters) {
650 *num_clusters = hole_len; 650 *num_clusters = hole_len;
651 } 651 }
652 } else { 652 } else {
653 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec, 653 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
654 p_cluster, num_clusters); 654 p_cluster, num_clusters);
655 flags = rec.e_flags; 655 flags = rec.e_flags;
656 656
657 ocfs2_extent_map_insert_rec(inode, &rec); 657 ocfs2_extent_map_insert_rec(inode, &rec);
658 } 658 }
659 659
660 if (extent_flags) 660 if (extent_flags)
661 *extent_flags = flags; 661 *extent_flags = flags;
662 662
663 out: 663 out:
664 brelse(di_bh); 664 brelse(di_bh);
665 return ret; 665 return ret;
666 } 666 }
667 667
668 /* 668 /*
669 * This expects alloc_sem to be held. The allocation cannot change at 669 * This expects alloc_sem to be held. The allocation cannot change at
670 * all while the map is in the process of being updated. 670 * all while the map is in the process of being updated.
671 */ 671 */
672 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 672 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
673 u64 *ret_count, unsigned int *extent_flags) 673 u64 *ret_count, unsigned int *extent_flags)
674 { 674 {
675 int ret; 675 int ret;
676 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 676 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
677 u32 cpos, num_clusters, p_cluster; 677 u32 cpos, num_clusters, p_cluster;
678 u64 boff = 0; 678 u64 boff = 0;
679 679
680 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); 680 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
681 681
682 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, 682 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
683 extent_flags); 683 extent_flags);
684 if (ret) { 684 if (ret) {
685 mlog_errno(ret); 685 mlog_errno(ret);
686 goto out; 686 goto out;
687 } 687 }
688 688
689 /* 689 /*
690 * p_cluster == 0 indicates a hole. 690 * p_cluster == 0 indicates a hole.
691 */ 691 */
692 if (p_cluster) { 692 if (p_cluster) {
693 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 693 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
694 boff += (v_blkno & (u64)(bpc - 1)); 694 boff += (v_blkno & (u64)(bpc - 1));
695 } 695 }
696 696
697 *p_blkno = boff; 697 *p_blkno = boff;
698 698
699 if (ret_count) { 699 if (ret_count) {
700 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); 700 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
701 *ret_count -= v_blkno & (u64)(bpc - 1); 701 *ret_count -= v_blkno & (u64)(bpc - 1);
702 } 702 }
703 703
704 out: 704 out:
705 return ret; 705 return ret;
706 } 706 }
707 707
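The block/cluster arithmetic above is easiest to follow with numbers; an illustrative example assuming 4 KB blocks and 32 KB clusters (values not taken from the patch):

/*
 * With 4 KB blocks and 32 KB clusters, bpc = 8. For v_blkno = 21:
 *   cpos              = 21 / 8       = 2  (ocfs2_blocks_to_clusters)
 *   offset in cluster = 21 & (8 - 1) = 5
 * If virtual cluster 2 maps to physical cluster 100, then
 *   boff      = 100 * 8 + 5 = 805
 * and with num_clusters = 1 the contiguous run is
 *   ret_count = 1 * 8 - 5   = 3 blocks.
 */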
708 /* 708 /*
709 * The name ocfs2_fiemap_inline() may be a little misleading: it 709 * The name ocfs2_fiemap_inline() may be a little misleading: it
710 * handles fiemap not only for inline-data files but also for 710 * handles fiemap not only for inline-data files but also for
711 * fast symlinks, because the two are identical as far as extent 711 * fast symlinks, because the two are identical as far as extent
712 * mapping is concerned. 712 * mapping is concerned.
713 */ 713 */
714 static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, 714 static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
715 struct fiemap_extent_info *fieinfo, 715 struct fiemap_extent_info *fieinfo,
716 u64 map_start) 716 u64 map_start)
717 { 717 {
718 int ret; 718 int ret;
719 unsigned int id_count; 719 unsigned int id_count;
720 struct ocfs2_dinode *di; 720 struct ocfs2_dinode *di;
721 u64 phys; 721 u64 phys;
722 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; 722 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
723 struct ocfs2_inode_info *oi = OCFS2_I(inode); 723 struct ocfs2_inode_info *oi = OCFS2_I(inode);
724 724
725 di = (struct ocfs2_dinode *)di_bh->b_data; 725 di = (struct ocfs2_dinode *)di_bh->b_data;
726 if (ocfs2_inode_is_fast_symlink(inode)) 726 if (ocfs2_inode_is_fast_symlink(inode))
727 id_count = ocfs2_fast_symlink_chars(inode->i_sb); 727 id_count = ocfs2_fast_symlink_chars(inode->i_sb);
728 else 728 else
729 id_count = le16_to_cpu(di->id2.i_data.id_count); 729 id_count = le16_to_cpu(di->id2.i_data.id_count);
730 730
731 if (map_start < id_count) { 731 if (map_start < id_count) {
732 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; 732 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
733 if (ocfs2_inode_is_fast_symlink(inode)) 733 if (ocfs2_inode_is_fast_symlink(inode))
734 phys += offsetof(struct ocfs2_dinode, id2.i_symlink); 734 phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
735 else 735 else
736 phys += offsetof(struct ocfs2_dinode, 736 phys += offsetof(struct ocfs2_dinode,
737 id2.i_data.id_data); 737 id2.i_data.id_data);
738 738
739 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, 739 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
740 flags); 740 flags);
741 if (ret < 0) 741 if (ret < 0)
742 return ret; 742 return ret;
743 } 743 }
744 744
745 return 0; 745 return 0;
746 } 746 }
747 747
748 #define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 748 #define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
749 749
750 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 750 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
751 u64 map_start, u64 map_len) 751 u64 map_start, u64 map_len)
752 { 752 {
753 int ret, is_last; 753 int ret, is_last;
754 u32 mapping_end, cpos; 754 u32 mapping_end, cpos;
755 unsigned int hole_size; 755 unsigned int hole_size;
756 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 756 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
757 u64 len_bytes, phys_bytes, virt_bytes; 757 u64 len_bytes, phys_bytes, virt_bytes;
758 struct buffer_head *di_bh = NULL; 758 struct buffer_head *di_bh = NULL;
759 struct ocfs2_extent_rec rec; 759 struct ocfs2_extent_rec rec;
760 760
761 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); 761 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
762 if (ret) 762 if (ret)
763 return ret; 763 return ret;
764 764
765 ret = ocfs2_inode_lock(inode, &di_bh, 0); 765 ret = ocfs2_inode_lock(inode, &di_bh, 0);
766 if (ret) { 766 if (ret) {
767 mlog_errno(ret); 767 mlog_errno(ret);
768 goto out; 768 goto out;
769 } 769 }
770 770
771 down_read(&OCFS2_I(inode)->ip_alloc_sem); 771 down_read(&OCFS2_I(inode)->ip_alloc_sem);
772 772
773 /* 773 /*
774 * Handle inline-data and fast symlink separately. 774 * Handle inline-data and fast symlink separately.
775 */ 775 */
776 if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || 776 if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
777 ocfs2_inode_is_fast_symlink(inode)) { 777 ocfs2_inode_is_fast_symlink(inode)) {
778 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); 778 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
779 goto out_unlock; 779 goto out_unlock;
780 } 780 }
781 781
782 cpos = map_start >> osb->s_clustersize_bits; 782 cpos = map_start >> osb->s_clustersize_bits;
783 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, 783 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
784 map_start + map_len); 784 map_start + map_len);
785 mapping_end -= cpos; 785 mapping_end -= cpos;
786 is_last = 0; 786 is_last = 0;
787 while (cpos < mapping_end && !is_last) { 787 while (cpos < mapping_end && !is_last) {
788 u32 fe_flags; 788 u32 fe_flags;
789 789
790 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, 790 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
791 &hole_size, &rec, &is_last); 791 &hole_size, &rec, &is_last);
792 if (ret) { 792 if (ret) {
793 mlog_errno(ret); 793 mlog_errno(ret);
794 goto out; 794 goto out;
795 } 795 }
796 796
797 if (rec.e_blkno == 0ULL) { 797 if (rec.e_blkno == 0ULL) {
798 cpos += hole_size; 798 cpos += hole_size;
799 continue; 799 continue;
800 } 800 }
801 801
802 fe_flags = 0; 802 fe_flags = 0;
803 if (rec.e_flags & OCFS2_EXT_UNWRITTEN) 803 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
804 fe_flags |= FIEMAP_EXTENT_UNWRITTEN; 804 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
805 if (rec.e_flags & OCFS2_EXT_REFCOUNTED) 805 if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
806 fe_flags |= FIEMAP_EXTENT_SHARED; 806 fe_flags |= FIEMAP_EXTENT_SHARED;
807 if (is_last) 807 if (is_last)
808 fe_flags |= FIEMAP_EXTENT_LAST; 808 fe_flags |= FIEMAP_EXTENT_LAST;
809 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; 809 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
810 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; 810 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
811 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; 811 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
812 812
813 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, 813 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
814 len_bytes, fe_flags); 814 len_bytes, fe_flags);
815 if (ret) 815 if (ret)
816 break; 816 break;
817 817
818 cpos = le32_to_cpu(rec.e_cpos) + le16_to_cpu(rec.e_leaf_clusters); 818 cpos = le32_to_cpu(rec.e_cpos) + le16_to_cpu(rec.e_leaf_clusters);
819 } 819 }
820 820
821 if (ret > 0) 821 if (ret > 0)
822 ret = 0; 822 ret = 0;
823 823
824 out_unlock: 824 out_unlock:
825 brelse(di_bh); 825 brelse(di_bh);
826 826
827 up_read(&OCFS2_I(inode)->ip_alloc_sem); 827 up_read(&OCFS2_I(inode)->ip_alloc_sem);
828 828
829 ocfs2_inode_unlock(inode, 0); 829 ocfs2_inode_unlock(inode, 0);
830 out: 830 out:
831 831
832 return ret; 832 return ret;
833 } 833 }
834 834
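For reference, the userspace side that drives ocfs2_fiemap() — a hedged sketch using the generic FS_IOC_FIEMAP interface, with minimal error handling:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>

/* Ask the kernel for up to 16 extents covering the whole file. Each
 * returned fiemap_extent carries the fe_flags (UNWRITTEN, SHARED,
 * LAST, ...) that the loop above sets via fiemap_fill_next_extent(). */
int dump_extents(int fd)
{
	struct fiemap *fm;
	int ret;

	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	if (!fm)
		return -1;

	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* the only flag ocfs2 accepts */
	fm->fm_extent_count = 16;

	ret = ioctl(fd, FS_IOC_FIEMAP, fm);
	free(fm);
	return ret;
}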
835 int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) 835 int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
836 { 836 {
837 struct inode *inode = file->f_mapping->host; 837 struct inode *inode = file->f_mapping->host;
838 int ret; 838 int ret;
839 unsigned int is_last = 0, is_data = 0; 839 unsigned int is_last = 0, is_data = 0;
840 u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; 840 u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
841 u32 cpos, cend, clen, hole_size; 841 u32 cpos, cend, clen, hole_size;
842 u64 extoff, extlen; 842 u64 extoff, extlen;
843 struct buffer_head *di_bh = NULL; 843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec; 844 struct ocfs2_extent_rec rec;
845 845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); 846 BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
847 847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0); 848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) { 849 if (ret) {
850 mlog_errno(ret); 850 mlog_errno(ret);
851 goto out; 851 goto out;
852 } 852 }
853 853
854 down_read(&OCFS2_I(inode)->ip_alloc_sem); 854 down_read(&OCFS2_I(inode)->ip_alloc_sem);
855 855
856 if (*offset >= inode->i_size) { 856 if (*offset >= inode->i_size) {
857 ret = -ENXIO; 857 ret = -ENXIO;
858 goto out_unlock; 858 goto out_unlock;
859 } 859 }
860 860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE) 862 if (whence == SEEK_HOLE)
863 *offset = inode->i_size; 863 *offset = inode->i_size;
864 goto out_unlock; 864 goto out_unlock;
865 } 865 }
866 866
867 clen = 0; 867 clen = 0;
868 cpos = *offset >> cs_bits; 868 cpos = *offset >> cs_bits;
869 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); 869 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
870 870
871 while (cpos < cend && !is_last) { 871 while (cpos < cend && !is_last) {
872 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, 872 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
873 &rec, &is_last); 873 &rec, &is_last);
874 if (ret) { 874 if (ret) {
875 mlog_errno(ret); 875 mlog_errno(ret);
876 goto out_unlock; 876 goto out_unlock;
877 } 877 }
878 878
879 extoff = cpos; 879 extoff = cpos;
880 extoff <<= cs_bits; 880 extoff <<= cs_bits;
881 881
882 if (rec.e_blkno == 0ULL) { 882 if (rec.e_blkno == 0ULL) {
883 clen = hole_size; 883 clen = hole_size;
884 is_data = 0; 884 is_data = 0;
885 } else { 885 } else {
886 clen = le16_to_cpu(rec.e_leaf_clusters) - 886 clen = le16_to_cpu(rec.e_leaf_clusters) -
887 (cpos - le32_to_cpu(rec.e_cpos)); 887 (cpos - le32_to_cpu(rec.e_cpos));
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; 888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 } 889 }
890 890
891 if ((!is_data && origin == SEEK_HOLE) || 891 if ((!is_data && whence == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) { 892 (is_data && whence == SEEK_DATA)) {
893 if (extoff > *offset) 893 if (extoff > *offset)
894 *offset = extoff; 894 *offset = extoff;
895 goto out_unlock; 895 goto out_unlock;
896 } 896 }
897 897
898 if (!is_last) 898 if (!is_last)
899 cpos += clen; 899 cpos += clen;
900 } 900 }
901 901
902 if (origin == SEEK_HOLE) { 902 if (whence == SEEK_HOLE) {
903 extoff = cpos; 903 extoff = cpos;
904 extoff <<= cs_bits; 904 extoff <<= cs_bits;
905 extlen = clen; 905 extlen = clen;
906 extlen <<= cs_bits; 906 extlen <<= cs_bits;
907 907
908 if ((extoff + extlen) > inode->i_size) 908 if ((extoff + extlen) > inode->i_size)
909 extlen = inode->i_size - extoff; 909 extlen = inode->i_size - extoff;
910 extoff += extlen; 910 extoff += extlen;
911 if (extoff > *offset) 911 if (extoff > *offset)
912 *offset = extoff; 912 *offset = extoff;
913 goto out_unlock; 913 goto out_unlock;
914 } 914 }
915 915
916 ret = -ENXIO; 916 ret = -ENXIO;
917 917
918 out_unlock: 918 out_unlock:
919 919
920 brelse(di_bh); 920 brelse(di_bh);
921 921
922 up_read(&OCFS2_I(inode)->ip_alloc_sem); 922 up_read(&OCFS2_I(inode)->ip_alloc_sem);
923 923
924 ocfs2_inode_unlock(inode, 0); 924 ocfs2_inode_unlock(inode, 0);
925 out: 925 out:
926 return ret; 926 return ret;
927 } 927 }
928 928
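The semantics implemented above are visible from userspace through lseek(2); a small illustrative example (assumes a kernel and filesystem with SEEK_HOLE/SEEK_DATA support):

#define _GNU_SOURCE	/* SEEK_HOLE / SEEK_DATA with glibc */
#include <unistd.h>

/* SEEK_HOLE: offset of the first hole at or after off (at latest
 * i_size, per the tail of the function above). SEEK_DATA: offset of
 * the first data region, or -1 with errno ENXIO past end of file. */
off_t next_hole(int fd, off_t off)
{
	return lseek(fd, off, SEEK_HOLE);
}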
929 int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 929 int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
930 struct buffer_head *bhs[], int flags, 930 struct buffer_head *bhs[], int flags,
931 int (*validate)(struct super_block *sb, 931 int (*validate)(struct super_block *sb,
932 struct buffer_head *bh)) 932 struct buffer_head *bh))
933 { 933 {
934 int rc = 0; 934 int rc = 0;
935 u64 p_block, p_count; 935 u64 p_block, p_count;
936 int i, count, done = 0; 936 int i, count, done = 0;
937 937
938 trace_ocfs2_read_virt_blocks( 938 trace_ocfs2_read_virt_blocks(
939 inode, (unsigned long long)v_block, nr, bhs, flags, 939 inode, (unsigned long long)v_block, nr, bhs, flags,
940 validate); 940 validate);
941 941
942 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= 942 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
943 i_size_read(inode)) { 943 i_size_read(inode)) {
944 BUG_ON(!(flags & OCFS2_BH_READAHEAD)); 944 BUG_ON(!(flags & OCFS2_BH_READAHEAD));
945 goto out; 945 goto out;
946 } 946 }
947 947
948 while (done < nr) { 948 while (done < nr) {
949 down_read(&OCFS2_I(inode)->ip_alloc_sem); 949 down_read(&OCFS2_I(inode)->ip_alloc_sem);
950 rc = ocfs2_extent_map_get_blocks(inode, v_block + done, 950 rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
951 &p_block, &p_count, NULL); 951 &p_block, &p_count, NULL);
952 up_read(&OCFS2_I(inode)->ip_alloc_sem); 952 up_read(&OCFS2_I(inode)->ip_alloc_sem);
953 if (rc) { 953 if (rc) {
954 mlog_errno(rc); 954 mlog_errno(rc);
955 break; 955 break;
956 } 956 }
957 957
958 if (!p_block) { 958 if (!p_block) {
959 rc = -EIO; 959 rc = -EIO;
960 mlog(ML_ERROR, 960 mlog(ML_ERROR,
961 "Inode #%llu contains a hole at offset %llu\n", 961 "Inode #%llu contains a hole at offset %llu\n",
962 (unsigned long long)OCFS2_I(inode)->ip_blkno, 962 (unsigned long long)OCFS2_I(inode)->ip_blkno,
963 (unsigned long long)(v_block + done) << 963 (unsigned long long)(v_block + done) <<
964 inode->i_sb->s_blocksize_bits); 964 inode->i_sb->s_blocksize_bits);
965 break; 965 break;
966 } 966 }
967 967
968 count = nr - done; 968 count = nr - done;
969 if (p_count < count) 969 if (p_count < count)
970 count = p_count; 970 count = p_count;
971 971
972 /* 972 /*
973 * If the caller passed us bhs, they should have come 973 * If the caller passed us bhs, they should have come
974 * from a previous readahead call to this function. Thus, 974 * from a previous readahead call to this function. Thus,
975 * they should have the right b_blocknr. 975 * they should have the right b_blocknr.
976 */ 976 */
977 for (i = 0; i < count; i++) { 977 for (i = 0; i < count; i++) {
978 if (!bhs[done + i]) 978 if (!bhs[done + i])
979 continue; 979 continue;
980 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); 980 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
981 } 981 }
982 982
983 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count, 983 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
984 bhs + done, flags, validate); 984 bhs + done, flags, validate);
985 if (rc) { 985 if (rc) {
986 mlog_errno(rc); 986 mlog_errno(rc);
987 break; 987 break;
988 } 988 }
989 done += count; 989 done += count;
990 } 990 }
991 991
992 out: 992 out:
993 return rc; 993 return rc;
994 } 994 }
995 995
996 996
997 997
fs/ocfs2/file.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA. 23 * Boston, MA 02111-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 #include <linux/falloc.h> 37 #include <linux/falloc.h>
38 #include <linux/quotaops.h> 38 #include <linux/quotaops.h>
39 #include <linux/blkdev.h> 39 #include <linux/blkdev.h>
40 40
41 #include <cluster/masklog.h> 41 #include <cluster/masklog.h>
42 42
43 #include "ocfs2.h" 43 #include "ocfs2.h"
44 44
45 #include "alloc.h" 45 #include "alloc.h"
46 #include "aops.h" 46 #include "aops.h"
47 #include "dir.h" 47 #include "dir.h"
48 #include "dlmglue.h" 48 #include "dlmglue.h"
49 #include "extent_map.h" 49 #include "extent_map.h"
50 #include "file.h" 50 #include "file.h"
51 #include "sysfile.h" 51 #include "sysfile.h"
52 #include "inode.h" 52 #include "inode.h"
53 #include "ioctl.h" 53 #include "ioctl.h"
54 #include "journal.h" 54 #include "journal.h"
55 #include "locks.h" 55 #include "locks.h"
56 #include "mmap.h" 56 #include "mmap.h"
57 #include "suballoc.h" 57 #include "suballoc.h"
58 #include "super.h" 58 #include "super.h"
59 #include "xattr.h" 59 #include "xattr.h"
60 #include "acl.h" 60 #include "acl.h"
61 #include "quota.h" 61 #include "quota.h"
62 #include "refcounttree.h" 62 #include "refcounttree.h"
63 #include "ocfs2_trace.h" 63 #include "ocfs2_trace.h"
64 64
65 #include "buffer_head_io.h" 65 #include "buffer_head_io.h"
66 66
67 static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68 { 68 {
69 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
70 70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); 71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp) 72 if (!fp)
73 return -ENOMEM; 73 return -ENOMEM;
74 74
75 fp->fp_file = file; 75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex); 76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp); 77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp; 78 file->private_data = fp;
79 79
80 return 0; 80 return 0;
81 } 81 }
82 82
83 static void ocfs2_free_file_private(struct inode *inode, struct file *file) 83 static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84 { 84 {
85 struct ocfs2_file_private *fp = file->private_data; 85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87 87
88 if (fp) { 88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock); 89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock); 90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp); 91 kfree(fp);
92 file->private_data = NULL; 92 file->private_data = NULL;
93 } 93 }
94 } 94 }
95 95
96 static int ocfs2_file_open(struct inode *inode, struct file *file) 96 static int ocfs2_file_open(struct inode *inode, struct file *file)
97 { 97 {
98 int status; 98 int status;
99 int mode = file->f_flags; 99 int mode = file->f_flags;
100 struct ocfs2_inode_info *oi = OCFS2_I(inode); 100 struct ocfs2_inode_info *oi = OCFS2_I(inode);
101 101
102 trace_ocfs2_file_open(inode, file, file->f_path.dentry, 102 trace_ocfs2_file_open(inode, file, file->f_path.dentry,
103 (unsigned long long)OCFS2_I(inode)->ip_blkno, 103 (unsigned long long)OCFS2_I(inode)->ip_blkno,
104 file->f_path.dentry->d_name.len, 104 file->f_path.dentry->d_name.len,
105 file->f_path.dentry->d_name.name, mode); 105 file->f_path.dentry->d_name.name, mode);
106 106
107 if (file->f_mode & FMODE_WRITE) 107 if (file->f_mode & FMODE_WRITE)
108 dquot_initialize(inode); 108 dquot_initialize(inode);
109 109
110 spin_lock(&oi->ip_lock); 110 spin_lock(&oi->ip_lock);
111 111
112 /* Check that the inode hasn't been wiped from disk by another 112 /* Check that the inode hasn't been wiped from disk by another
113 * node. If it hasn't then we're safe as long as we hold the 113 * node. If it hasn't then we're safe as long as we hold the
114 * spin lock until our increment of open count. */ 114 * spin lock until our increment of open count. */
115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
116 spin_unlock(&oi->ip_lock); 116 spin_unlock(&oi->ip_lock);
117 117
118 status = -ENOENT; 118 status = -ENOENT;
119 goto leave; 119 goto leave;
120 } 120 }
121 121
122 if (mode & O_DIRECT) 122 if (mode & O_DIRECT)
123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
124 124
125 oi->ip_open_count++; 125 oi->ip_open_count++;
126 spin_unlock(&oi->ip_lock); 126 spin_unlock(&oi->ip_lock);
127 127
128 status = ocfs2_init_file_private(inode, file); 128 status = ocfs2_init_file_private(inode, file);
129 if (status) { 129 if (status) {
130 /* 130 /*
131 * We want to set open count back if we're failing the 131 * We want to set open count back if we're failing the
132 * open. 132 * open.
133 */ 133 */
134 spin_lock(&oi->ip_lock); 134 spin_lock(&oi->ip_lock);
135 oi->ip_open_count--; 135 oi->ip_open_count--;
136 spin_unlock(&oi->ip_lock); 136 spin_unlock(&oi->ip_lock);
137 } 137 }
138 138
139 leave: 139 leave:
140 return status; 140 return status;
141 } 141 }
142 142
143 static int ocfs2_file_release(struct inode *inode, struct file *file) 143 static int ocfs2_file_release(struct inode *inode, struct file *file)
144 { 144 {
145 struct ocfs2_inode_info *oi = OCFS2_I(inode); 145 struct ocfs2_inode_info *oi = OCFS2_I(inode);
146 146
147 spin_lock(&oi->ip_lock); 147 spin_lock(&oi->ip_lock);
148 if (!--oi->ip_open_count) 148 if (!--oi->ip_open_count)
149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
150 150
151 trace_ocfs2_file_release(inode, file, file->f_path.dentry, 151 trace_ocfs2_file_release(inode, file, file->f_path.dentry,
152 oi->ip_blkno, 152 oi->ip_blkno,
153 file->f_path.dentry->d_name.len, 153 file->f_path.dentry->d_name.len,
154 file->f_path.dentry->d_name.name, 154 file->f_path.dentry->d_name.name,
155 oi->ip_open_count); 155 oi->ip_open_count);
156 spin_unlock(&oi->ip_lock); 156 spin_unlock(&oi->ip_lock);
157 157
158 ocfs2_free_file_private(inode, file); 158 ocfs2_free_file_private(inode, file);
159 159
160 return 0; 160 return 0;
161 } 161 }
162 162
163 static int ocfs2_dir_open(struct inode *inode, struct file *file) 163 static int ocfs2_dir_open(struct inode *inode, struct file *file)
164 { 164 {
165 return ocfs2_init_file_private(inode, file); 165 return ocfs2_init_file_private(inode, file);
166 } 166 }
167 167
168 static int ocfs2_dir_release(struct inode *inode, struct file *file) 168 static int ocfs2_dir_release(struct inode *inode, struct file *file)
169 { 169 {
170 ocfs2_free_file_private(inode, file); 170 ocfs2_free_file_private(inode, file);
171 return 0; 171 return 0;
172 } 172 }
173 173
174 static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, 174 static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
175 int datasync) 175 int datasync)
176 { 176 {
177 int err = 0; 177 int err = 0;
178 journal_t *journal; 178 journal_t *journal;
179 struct inode *inode = file->f_mapping->host; 179 struct inode *inode = file->f_mapping->host;
180 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 180 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
181 181
182 trace_ocfs2_sync_file(inode, file, file->f_path.dentry, 182 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
183 OCFS2_I(inode)->ip_blkno, 183 OCFS2_I(inode)->ip_blkno,
184 file->f_path.dentry->d_name.len, 184 file->f_path.dentry->d_name.len,
185 file->f_path.dentry->d_name.name, 185 file->f_path.dentry->d_name.name,
186 (unsigned long long)datasync); 186 (unsigned long long)datasync);
187 187
188 err = filemap_write_and_wait_range(inode->i_mapping, start, end); 188 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
189 if (err) 189 if (err)
190 return err; 190 return err;
191 191
192 /* 192 /*
193 * Probably don't need the i_mutex at all in here, just putting it here 193 * Probably don't need the i_mutex at all in here, just putting it here
194 * to be consistent with how fsync used to be called; someone more 194 * to be consistent with how fsync used to be called; someone more
195 * familiar with the fs could possibly remove it. 195 * familiar with the fs could possibly remove it.
196 */ 196 */
197 mutex_lock(&inode->i_mutex); 197 mutex_lock(&inode->i_mutex);
198 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 198 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
199 /* 199 /*
200 * We still have to flush the drive's caches to get data to the 200 * We still have to flush the drive's caches to get data to the
201 * platter 201 * platter
202 */ 202 */
203 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 203 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
204 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 204 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
205 goto bail; 205 goto bail;
206 } 206 }
207 207
208 journal = osb->journal->j_journal; 208 journal = osb->journal->j_journal;
209 err = jbd2_journal_force_commit(journal); 209 err = jbd2_journal_force_commit(journal);
210 210
211 bail: 211 bail:
212 if (err) 212 if (err)
213 mlog_errno(err); 213 mlog_errno(err);
214 mutex_unlock(&inode->i_mutex); 214 mutex_unlock(&inode->i_mutex);
215 215
216 return (err < 0) ? -EIO : 0; 216 return (err < 0) ? -EIO : 0;
217 } 217 }
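/*
 * Decision summary for ocfs2_sync_file() above: after the pagecache
 * writeback, a datasync on an inode with no dirty non-timestamp state
 * (I_DIRTY_DATASYNC clear) skips the journal commit entirely and, on
 * barrier mounts, only issues a device cache flush; every other case
 * forces a jbd2 commit of the whole journal.
 */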
218 218
219 int ocfs2_should_update_atime(struct inode *inode, 219 int ocfs2_should_update_atime(struct inode *inode,
220 struct vfsmount *vfsmnt) 220 struct vfsmount *vfsmnt)
221 { 221 {
222 struct timespec now; 222 struct timespec now;
223 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 223 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
224 224
225 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 225 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
226 return 0; 226 return 0;
227 227
228 if ((inode->i_flags & S_NOATIME) || 228 if ((inode->i_flags & S_NOATIME) ||
229 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 229 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
230 return 0; 230 return 0;
231 231
232 /* 232 /*
233 * We can be called with no vfsmnt structure - NFSD will 233 * We can be called with no vfsmnt structure - NFSD will
234 * sometimes do this. 234 * sometimes do this.
235 * 235 *
236 * Note that our action here is different from touch_atime() - 236 * Note that our action here is different from touch_atime() -
237 * if we can't tell whether this is a noatime mount, then we 237 * if we can't tell whether this is a noatime mount, then we
238 * don't know whether to trust the value of s_atime_quantum. 238 * don't know whether to trust the value of s_atime_quantum.
239 */ 239 */
240 if (vfsmnt == NULL) 240 if (vfsmnt == NULL)
241 return 0; 241 return 0;
242 242
243 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 243 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
244 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 244 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
245 return 0; 245 return 0;
246 246
247 if (vfsmnt->mnt_flags & MNT_RELATIME) { 247 if (vfsmnt->mnt_flags & MNT_RELATIME) {
248 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 248 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
249 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 249 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
250 return 1; 250 return 1;
251 251
252 return 0; 252 return 0;
253 } 253 }
254 254
255 now = CURRENT_TIME; 255 now = CURRENT_TIME;
256 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 256 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
257 return 0; 257 return 0;
258 else 258 else
259 return 1; 259 return 1;
260 } 260 }
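/*
 * Worked example for the checks above (illustrative numbers only):
 * on a relatime mount, a read updates atime whenever atime <= mtime
 * or atime <= ctime; on a plain mount with s_atime_quantum = 60, a
 * read 30s after the last atime update returns 0 (skip), while a
 * read 61s after it returns 1 (update).
 */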
261 261
262 int ocfs2_update_inode_atime(struct inode *inode, 262 int ocfs2_update_inode_atime(struct inode *inode,
263 struct buffer_head *bh) 263 struct buffer_head *bh)
264 { 264 {
265 int ret; 265 int ret;
266 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 266 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
267 handle_t *handle; 267 handle_t *handle;
268 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 268 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
269 269
270 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 270 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
271 if (IS_ERR(handle)) { 271 if (IS_ERR(handle)) {
272 ret = PTR_ERR(handle); 272 ret = PTR_ERR(handle);
273 mlog_errno(ret); 273 mlog_errno(ret);
274 goto out; 274 goto out;
275 } 275 }
276 276
277 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 277 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
278 OCFS2_JOURNAL_ACCESS_WRITE); 278 OCFS2_JOURNAL_ACCESS_WRITE);
279 if (ret) { 279 if (ret) {
280 mlog_errno(ret); 280 mlog_errno(ret);
281 goto out_commit; 281 goto out_commit;
282 } 282 }
283 283
284 /* 284 /*
285 * Don't use ocfs2_mark_inode_dirty() here as we don't always 285 * Don't use ocfs2_mark_inode_dirty() here as we don't always
286 * have i_mutex to guard against concurrent changes to other 286 * have i_mutex to guard against concurrent changes to other
287 * inode fields. 287 * inode fields.
288 */ 288 */
289 inode->i_atime = CURRENT_TIME; 289 inode->i_atime = CURRENT_TIME;
290 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 290 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
291 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 291 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
292 ocfs2_journal_dirty(handle, bh); 292 ocfs2_journal_dirty(handle, bh);
293 293
294 out_commit: 294 out_commit:
295 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 295 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
296 out: 296 out:
297 return ret; 297 return ret;
298 } 298 }
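/*
 * A minimal sketch of how the two atime helpers compose in a read
 * path; locking and buffer_head acquisition are elided, and di_bh
 * stands in for whatever dinode buffer the caller already holds:
 *
 *	if (ocfs2_should_update_atime(inode, file->f_path.mnt))
 *		ocfs2_update_inode_atime(inode, di_bh);
 */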
299 299
300 static int ocfs2_set_inode_size(handle_t *handle, 300 static int ocfs2_set_inode_size(handle_t *handle,
301 struct inode *inode, 301 struct inode *inode,
302 struct buffer_head *fe_bh, 302 struct buffer_head *fe_bh,
303 u64 new_i_size) 303 u64 new_i_size)
304 { 304 {
305 int status; 305 int status;
306 306
307 i_size_write(inode, new_i_size); 307 i_size_write(inode, new_i_size);
308 inode->i_blocks = ocfs2_inode_sector_count(inode); 308 inode->i_blocks = ocfs2_inode_sector_count(inode);
309 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 309 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
310 310
311 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 311 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
312 if (status < 0) { 312 if (status < 0) {
313 mlog_errno(status); 313 mlog_errno(status);
314 goto bail; 314 goto bail;
315 } 315 }
316 316
317 bail: 317 bail:
318 return status; 318 return status;
319 } 319 }
320 320
321 int ocfs2_simple_size_update(struct inode *inode, 321 int ocfs2_simple_size_update(struct inode *inode,
322 struct buffer_head *di_bh, 322 struct buffer_head *di_bh,
323 u64 new_i_size) 323 u64 new_i_size)
324 { 324 {
325 int ret; 325 int ret;
326 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 326 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
327 handle_t *handle = NULL; 327 handle_t *handle = NULL;
328 328
329 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 329 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
330 if (IS_ERR(handle)) { 330 if (IS_ERR(handle)) {
331 ret = PTR_ERR(handle); 331 ret = PTR_ERR(handle);
332 mlog_errno(ret); 332 mlog_errno(ret);
333 goto out; 333 goto out;
334 } 334 }
335 335
336 ret = ocfs2_set_inode_size(handle, inode, di_bh, 336 ret = ocfs2_set_inode_size(handle, inode, di_bh,
337 new_i_size); 337 new_i_size);
338 if (ret < 0) 338 if (ret < 0)
339 mlog_errno(ret); 339 mlog_errno(ret);
340 340
341 ocfs2_commit_trans(osb, handle); 341 ocfs2_commit_trans(osb, handle);
342 out: 342 out:
343 return ret; 343 return ret;
344 } 344 }
345 345
346 static int ocfs2_cow_file_pos(struct inode *inode, 346 static int ocfs2_cow_file_pos(struct inode *inode,
347 struct buffer_head *fe_bh, 347 struct buffer_head *fe_bh,
348 u64 offset) 348 u64 offset)
349 { 349 {
350 int status; 350 int status;
351 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 351 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
352 unsigned int num_clusters = 0; 352 unsigned int num_clusters = 0;
353 unsigned int ext_flags = 0; 353 unsigned int ext_flags = 0;
354 354
355 /* 355 /*
356 * If the new offset is aligned to a cluster boundary, there is 356 * If the new offset is aligned to a cluster boundary, there is
357 * no space for ocfs2_zero_range_for_truncate to fill, so no need to 357 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
358 * CoW either. 358 * CoW either.
359 */ 359 */
360 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) 360 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
361 return 0; 361 return 0;
362 362
363 status = ocfs2_get_clusters(inode, cpos, &phys, 363 status = ocfs2_get_clusters(inode, cpos, &phys,
364 &num_clusters, &ext_flags); 364 &num_clusters, &ext_flags);
365 if (status) { 365 if (status) {
366 mlog_errno(status); 366 mlog_errno(status);
367 goto out; 367 goto out;
368 } 368 }
369 369
370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
371 goto out; 371 goto out;
372 372
373 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); 373 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
374 374
375 out: 375 out:
376 return status; 376 return status;
377 } 377 }
378 378
379 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 379 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
380 struct inode *inode, 380 struct inode *inode,
381 struct buffer_head *fe_bh, 381 struct buffer_head *fe_bh,
382 u64 new_i_size) 382 u64 new_i_size)
383 { 383 {
384 int status; 384 int status;
385 handle_t *handle; 385 handle_t *handle;
386 struct ocfs2_dinode *di; 386 struct ocfs2_dinode *di;
387 u64 cluster_bytes; 387 u64 cluster_bytes;
388 388
389 /* 389 /*
390 * We need to CoW the cluster that contains the offset if it is reflinked 390 * We need to CoW the cluster that contains the offset if it is reflinked
391 * since we will call ocfs2_zero_range_for_truncate later, which will 391 * since we will call ocfs2_zero_range_for_truncate later, which will
392 * write zeroes from the offset to the end of the cluster. 392 * write zeroes from the offset to the end of the cluster.
393 */ 393 */
394 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); 394 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
395 if (status) { 395 if (status) {
396 mlog_errno(status); 396 mlog_errno(status);
397 return status; 397 return status;
398 } 398 }
399 399
400 /* TODO: This needs to actually orphan the inode in this 400 /* TODO: This needs to actually orphan the inode in this
401 * transaction. */ 401 * transaction. */
402 402
403 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 403 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
404 if (IS_ERR(handle)) { 404 if (IS_ERR(handle)) {
405 status = PTR_ERR(handle); 405 status = PTR_ERR(handle);
406 mlog_errno(status); 406 mlog_errno(status);
407 goto out; 407 goto out;
408 } 408 }
409 409
410 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 410 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
411 OCFS2_JOURNAL_ACCESS_WRITE); 411 OCFS2_JOURNAL_ACCESS_WRITE);
412 if (status < 0) { 412 if (status < 0) {
413 mlog_errno(status); 413 mlog_errno(status);
414 goto out_commit; 414 goto out_commit;
415 } 415 }
416 416
417 /* 417 /*
418 * Do this before setting i_size. 418 * Do this before setting i_size.
419 */ 419 */
420 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 420 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
421 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 421 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
422 cluster_bytes); 422 cluster_bytes);
423 if (status) { 423 if (status) {
424 mlog_errno(status); 424 mlog_errno(status);
425 goto out_commit; 425 goto out_commit;
426 } 426 }
427 427
428 i_size_write(inode, new_i_size); 428 i_size_write(inode, new_i_size);
429 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 429 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
430 430
431 di = (struct ocfs2_dinode *) fe_bh->b_data; 431 di = (struct ocfs2_dinode *) fe_bh->b_data;
432 di->i_size = cpu_to_le64(new_i_size); 432 di->i_size = cpu_to_le64(new_i_size);
433 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 433 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
434 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 434 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
435 435
436 ocfs2_journal_dirty(handle, fe_bh); 436 ocfs2_journal_dirty(handle, fe_bh);
437 437
438 out_commit: 438 out_commit:
439 ocfs2_commit_trans(osb, handle); 439 ocfs2_commit_trans(osb, handle);
440 out: 440 out:
441 return status; 441 return status;
442 } 442 }
443 443
444 static int ocfs2_truncate_file(struct inode *inode, 444 static int ocfs2_truncate_file(struct inode *inode,
445 struct buffer_head *di_bh, 445 struct buffer_head *di_bh,
446 u64 new_i_size) 446 u64 new_i_size)
447 { 447 {
448 int status = 0; 448 int status = 0;
449 struct ocfs2_dinode *fe = NULL; 449 struct ocfs2_dinode *fe = NULL;
450 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 450 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
451 451
452 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 452 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
453 * already validated it */ 453 * already validated it */
454 fe = (struct ocfs2_dinode *) di_bh->b_data; 454 fe = (struct ocfs2_dinode *) di_bh->b_data;
455 455
456 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno, 456 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
457 (unsigned long long)le64_to_cpu(fe->i_size), 457 (unsigned long long)le64_to_cpu(fe->i_size),
458 (unsigned long long)new_i_size); 458 (unsigned long long)new_i_size);
459 459
460 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 460 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
461 "Inode %llu, inode i_size = %lld != di " 461 "Inode %llu, inode i_size = %lld != di "
462 "i_size = %llu, i_flags = 0x%x\n", 462 "i_size = %llu, i_flags = 0x%x\n",
463 (unsigned long long)OCFS2_I(inode)->ip_blkno, 463 (unsigned long long)OCFS2_I(inode)->ip_blkno,
464 i_size_read(inode), 464 i_size_read(inode),
465 (unsigned long long)le64_to_cpu(fe->i_size), 465 (unsigned long long)le64_to_cpu(fe->i_size),
466 le32_to_cpu(fe->i_flags)); 466 le32_to_cpu(fe->i_flags));
467 467
468 if (new_i_size > le64_to_cpu(fe->i_size)) { 468 if (new_i_size > le64_to_cpu(fe->i_size)) {
469 trace_ocfs2_truncate_file_error( 469 trace_ocfs2_truncate_file_error(
470 (unsigned long long)le64_to_cpu(fe->i_size), 470 (unsigned long long)le64_to_cpu(fe->i_size),
471 (unsigned long long)new_i_size); 471 (unsigned long long)new_i_size);
472 status = -EINVAL; 472 status = -EINVAL;
473 mlog_errno(status); 473 mlog_errno(status);
474 goto bail; 474 goto bail;
475 } 475 }
476 476
477 /* let's handle the simple truncate cases before doing any more 477 /* let's handle the simple truncate cases before doing any more
478 * cluster locking. */ 478 * cluster locking. */
479 if (new_i_size == le64_to_cpu(fe->i_size)) 479 if (new_i_size == le64_to_cpu(fe->i_size))
480 goto bail; 480 goto bail;
481 481
482 down_write(&OCFS2_I(inode)->ip_alloc_sem); 482 down_write(&OCFS2_I(inode)->ip_alloc_sem);
483 483
484 ocfs2_resv_discard(&osb->osb_la_resmap, 484 ocfs2_resv_discard(&osb->osb_la_resmap,
485 &OCFS2_I(inode)->ip_la_data_resv); 485 &OCFS2_I(inode)->ip_la_data_resv);
486 486
487 /* 487 /*
488 * The inode lock forced other nodes to sync and drop their 488 * The inode lock forced other nodes to sync and drop their
489 * pages, which (correctly) happens even if we have a truncate 489 * pages, which (correctly) happens even if we have a truncate
490 * without allocation change - ocfs2 cluster sizes can be much 490 * without allocation change - ocfs2 cluster sizes can be much
491 * greater than page size, so we have to truncate them 491 * greater than page size, so we have to truncate them
492 * anyway. 492 * anyway.
493 */ 493 */
494 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 494 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
495 truncate_inode_pages(inode->i_mapping, new_i_size); 495 truncate_inode_pages(inode->i_mapping, new_i_size);
496 496
497 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 497 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
498 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 498 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
499 i_size_read(inode), 1); 499 i_size_read(inode), 1);
500 if (status) 500 if (status)
501 mlog_errno(status); 501 mlog_errno(status);
502 502
503 goto bail_unlock_sem; 503 goto bail_unlock_sem;
504 } 504 }
505 505
506 /* Alright, we're going to need to do a full-blown alloc size 506 /* Alright, we're going to need to do a full-blown alloc size
507 * change. Orphan the inode so that recovery can complete the 507 * change. Orphan the inode so that recovery can complete the
508 * truncate if necessary. This also takes care of updating 508 * truncate if necessary. This also takes care of updating
509 * i_size. */ 509 * i_size. */
510 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 510 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
511 if (status < 0) { 511 if (status < 0) {
512 mlog_errno(status); 512 mlog_errno(status);
513 goto bail_unlock_sem; 513 goto bail_unlock_sem;
514 } 514 }
515 515
516 status = ocfs2_commit_truncate(osb, inode, di_bh); 516 status = ocfs2_commit_truncate(osb, inode, di_bh);
517 if (status < 0) { 517 if (status < 0) {
518 mlog_errno(status); 518 mlog_errno(status);
519 goto bail_unlock_sem; 519 goto bail_unlock_sem;
520 } 520 }
521 521
522 /* TODO: orphan dir cleanup here. */ 522 /* TODO: orphan dir cleanup here. */
523 bail_unlock_sem: 523 bail_unlock_sem:
524 up_write(&OCFS2_I(inode)->ip_alloc_sem); 524 up_write(&OCFS2_I(inode)->ip_alloc_sem);
525 525
526 bail: 526 bail:
527 if (!status && OCFS2_I(inode)->ip_clusters == 0) 527 if (!status && OCFS2_I(inode)->ip_clusters == 0)
528 status = ocfs2_try_remove_refcount_tree(inode, di_bh); 528 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
529 529
530 return status; 530 return status;
531 } 531 }
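/*
 * For reference, the order of operations in ocfs2_truncate_file()
 * above: unmap and drop pagecache past the new size, then either
 * shrink the inline-data region directly or, for extent-based files,
 * zero the tail and mark i_size via ocfs2_orphan_for_truncate()
 * before ocfs2_commit_truncate() removes the actual extents.
 */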
532 532
533 /* 533 /*
534 * Extend file allocation only here. 534 * Extend file allocation only here.
535 * We'll update all the disk stuff, and oip->alloc_size. 535 * We'll update all the disk stuff, and oip->alloc_size.
536 * 536 *
537 * Expect stuff to be locked, a transaction started and enough data / 537 * Expect stuff to be locked, a transaction started and enough data /
538 * metadata reservations in the contexts. 538 * metadata reservations in the contexts.
539 * 539 *
540 * Will return -EAGAIN, and a reason if a restart is needed. 540 * Will return -EAGAIN, and a reason if a restart is needed.
541 * If passed in, *reason_ret will always be set, even on error. 541 * If passed in, *reason_ret will always be set, even on error.
542 */ 542 */
543 int ocfs2_add_inode_data(struct ocfs2_super *osb, 543 int ocfs2_add_inode_data(struct ocfs2_super *osb,
544 struct inode *inode, 544 struct inode *inode,
545 u32 *logical_offset, 545 u32 *logical_offset,
546 u32 clusters_to_add, 546 u32 clusters_to_add,
547 int mark_unwritten, 547 int mark_unwritten,
548 struct buffer_head *fe_bh, 548 struct buffer_head *fe_bh,
549 handle_t *handle, 549 handle_t *handle,
550 struct ocfs2_alloc_context *data_ac, 550 struct ocfs2_alloc_context *data_ac,
551 struct ocfs2_alloc_context *meta_ac, 551 struct ocfs2_alloc_context *meta_ac,
552 enum ocfs2_alloc_restarted *reason_ret) 552 enum ocfs2_alloc_restarted *reason_ret)
553 { 553 {
554 int ret; 554 int ret;
555 struct ocfs2_extent_tree et; 555 struct ocfs2_extent_tree et;
556 556
557 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); 557 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
558 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, 558 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
559 clusters_to_add, mark_unwritten, 559 clusters_to_add, mark_unwritten,
560 data_ac, meta_ac, reason_ret); 560 data_ac, meta_ac, reason_ret);
561 561
562 return ret; 562 return ret;
563 } 563 }
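/*
 * The restart contract documented above is easiest to see as a caller
 * loop. This is a sketch only - the canonical version, with credit
 * extension and quota handling, is __ocfs2_extend_allocation() below:
 *
 *	enum ocfs2_alloc_restarted why = RESTART_NONE;
 *	int ret;
 *
 *	ret = ocfs2_add_inode_data(osb, inode, &logical_start,
 *				   clusters_to_add, 0, bh, handle,
 *				   data_ac, meta_ac, &why);
 *	if (ret == -EAGAIN && why == RESTART_TRANS)
 *		... extend the transaction, then retry the call ...
 */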
564 564
565 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, 565 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
566 u32 clusters_to_add, int mark_unwritten) 566 u32 clusters_to_add, int mark_unwritten)
567 { 567 {
568 int status = 0; 568 int status = 0;
569 int restart_func = 0; 569 int restart_func = 0;
570 int credits; 570 int credits;
571 u32 prev_clusters; 571 u32 prev_clusters;
572 struct buffer_head *bh = NULL; 572 struct buffer_head *bh = NULL;
573 struct ocfs2_dinode *fe = NULL; 573 struct ocfs2_dinode *fe = NULL;
574 handle_t *handle = NULL; 574 handle_t *handle = NULL;
575 struct ocfs2_alloc_context *data_ac = NULL; 575 struct ocfs2_alloc_context *data_ac = NULL;
576 struct ocfs2_alloc_context *meta_ac = NULL; 576 struct ocfs2_alloc_context *meta_ac = NULL;
577 enum ocfs2_alloc_restarted why; 577 enum ocfs2_alloc_restarted why;
578 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 578 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
579 struct ocfs2_extent_tree et; 579 struct ocfs2_extent_tree et;
580 int did_quota = 0; 580 int did_quota = 0;
581 581
582 /* 582 /*
583 * Unwritten extents only exist on file systems which 583 * Unwritten extents only exist on file systems which
584 * support holes. 584 * support holes.
585 */ 585 */
586 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 586 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
587 587
588 status = ocfs2_read_inode_block(inode, &bh); 588 status = ocfs2_read_inode_block(inode, &bh);
589 if (status < 0) { 589 if (status < 0) {
590 mlog_errno(status); 590 mlog_errno(status);
591 goto leave; 591 goto leave;
592 } 592 }
593 fe = (struct ocfs2_dinode *) bh->b_data; 593 fe = (struct ocfs2_dinode *) bh->b_data;
594 594
595 restart_all: 595 restart_all:
596 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 596 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
597 597
598 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); 598 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
599 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 599 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
600 &data_ac, &meta_ac); 600 &data_ac, &meta_ac);
601 if (status) { 601 if (status) {
602 mlog_errno(status); 602 mlog_errno(status);
603 goto leave; 603 goto leave;
604 } 604 }
605 605
606 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, 606 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
607 clusters_to_add); 607 clusters_to_add);
608 handle = ocfs2_start_trans(osb, credits); 608 handle = ocfs2_start_trans(osb, credits);
609 if (IS_ERR(handle)) { 609 if (IS_ERR(handle)) {
610 status = PTR_ERR(handle); 610 status = PTR_ERR(handle);
611 handle = NULL; 611 handle = NULL;
612 mlog_errno(status); 612 mlog_errno(status);
613 goto leave; 613 goto leave;
614 } 614 }
615 615
616 restarted_transaction: 616 restarted_transaction:
617 trace_ocfs2_extend_allocation( 617 trace_ocfs2_extend_allocation(
618 (unsigned long long)OCFS2_I(inode)->ip_blkno, 618 (unsigned long long)OCFS2_I(inode)->ip_blkno,
619 (unsigned long long)i_size_read(inode), 619 (unsigned long long)i_size_read(inode),
620 le32_to_cpu(fe->i_clusters), clusters_to_add, 620 le32_to_cpu(fe->i_clusters), clusters_to_add,
621 why, restart_func); 621 why, restart_func);
622 622
623 status = dquot_alloc_space_nodirty(inode, 623 status = dquot_alloc_space_nodirty(inode,
624 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 624 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
625 if (status) 625 if (status)
626 goto leave; 626 goto leave;
627 did_quota = 1; 627 did_quota = 1;
628 628
629 /* reserve a write to the file entry early on - so that if we 629 /* reserve a write to the file entry early on - so that if we
630 * run out of credits in the allocation path, we can still 630 * run out of credits in the allocation path, we can still
631 * update i_size. */ 631 * update i_size. */
632 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 632 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
633 OCFS2_JOURNAL_ACCESS_WRITE); 633 OCFS2_JOURNAL_ACCESS_WRITE);
634 if (status < 0) { 634 if (status < 0) {
635 mlog_errno(status); 635 mlog_errno(status);
636 goto leave; 636 goto leave;
637 } 637 }
638 638
639 prev_clusters = OCFS2_I(inode)->ip_clusters; 639 prev_clusters = OCFS2_I(inode)->ip_clusters;
640 640
641 status = ocfs2_add_inode_data(osb, 641 status = ocfs2_add_inode_data(osb,
642 inode, 642 inode,
643 &logical_start, 643 &logical_start,
644 clusters_to_add, 644 clusters_to_add,
645 mark_unwritten, 645 mark_unwritten,
646 bh, 646 bh,
647 handle, 647 handle,
648 data_ac, 648 data_ac,
649 meta_ac, 649 meta_ac,
650 &why); 650 &why);
651 if ((status < 0) && (status != -EAGAIN)) { 651 if ((status < 0) && (status != -EAGAIN)) {
652 if (status != -ENOSPC) 652 if (status != -ENOSPC)
653 mlog_errno(status); 653 mlog_errno(status);
654 goto leave; 654 goto leave;
655 } 655 }
656 656
657 ocfs2_journal_dirty(handle, bh); 657 ocfs2_journal_dirty(handle, bh);
658 658
659 spin_lock(&OCFS2_I(inode)->ip_lock); 659 spin_lock(&OCFS2_I(inode)->ip_lock);
660 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 660 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
661 spin_unlock(&OCFS2_I(inode)->ip_lock); 661 spin_unlock(&OCFS2_I(inode)->ip_lock);
662 /* Release unused quota reservation */ 662 /* Release unused quota reservation */
663 dquot_free_space(inode, 663 dquot_free_space(inode,
664 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 664 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
665 did_quota = 0; 665 did_quota = 0;
666 666
667 if (why != RESTART_NONE && clusters_to_add) { 667 if (why != RESTART_NONE && clusters_to_add) {
668 if (why == RESTART_META) { 668 if (why == RESTART_META) {
669 restart_func = 1; 669 restart_func = 1;
670 status = 0; 670 status = 0;
671 } else { 671 } else {
672 BUG_ON(why != RESTART_TRANS); 672 BUG_ON(why != RESTART_TRANS);
673 673
674 /* TODO: This can be more intelligent. */ 674 /* TODO: This can be more intelligent. */
675 credits = ocfs2_calc_extend_credits(osb->sb, 675 credits = ocfs2_calc_extend_credits(osb->sb,
676 &fe->id2.i_list, 676 &fe->id2.i_list,
677 clusters_to_add); 677 clusters_to_add);
678 status = ocfs2_extend_trans(handle, credits); 678 status = ocfs2_extend_trans(handle, credits);
679 if (status < 0) { 679 if (status < 0) {
680 /* handle still has to be committed at 680 /* handle still has to be committed at
681 * this point. */ 681 * this point. */
682 status = -ENOMEM; 682 status = -ENOMEM;
683 mlog_errno(status); 683 mlog_errno(status);
684 goto leave; 684 goto leave;
685 } 685 }
686 goto restarted_transaction; 686 goto restarted_transaction;
687 } 687 }
688 } 688 }
689 689
690 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno, 690 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
691 le32_to_cpu(fe->i_clusters), 691 le32_to_cpu(fe->i_clusters),
692 (unsigned long long)le64_to_cpu(fe->i_size), 692 (unsigned long long)le64_to_cpu(fe->i_size),
693 OCFS2_I(inode)->ip_clusters, 693 OCFS2_I(inode)->ip_clusters,
694 (unsigned long long)i_size_read(inode)); 694 (unsigned long long)i_size_read(inode));
695 695
696 leave: 696 leave:
697 if (status < 0 && did_quota) 697 if (status < 0 && did_quota)
698 dquot_free_space(inode, 698 dquot_free_space(inode,
699 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 699 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
700 if (handle) { 700 if (handle) {
701 ocfs2_commit_trans(osb, handle); 701 ocfs2_commit_trans(osb, handle);
702 handle = NULL; 702 handle = NULL;
703 } 703 }
704 if (data_ac) { 704 if (data_ac) {
705 ocfs2_free_alloc_context(data_ac); 705 ocfs2_free_alloc_context(data_ac);
706 data_ac = NULL; 706 data_ac = NULL;
707 } 707 }
708 if (meta_ac) { 708 if (meta_ac) {
709 ocfs2_free_alloc_context(meta_ac); 709 ocfs2_free_alloc_context(meta_ac);
710 meta_ac = NULL; 710 meta_ac = NULL;
711 } 711 }
712 if ((!status) && restart_func) { 712 if ((!status) && restart_func) {
713 restart_func = 0; 713 restart_func = 0;
714 goto restart_all; 714 goto restart_all;
715 } 715 }
716 brelse(bh); 716 brelse(bh);
717 bh = NULL; 717 bh = NULL;
718 718
719 return status; 719 return status;
720 } 720 }
721 721
722 /* 722 /*
723 * While a write will already be ordering the data, a truncate will not. 723 * While a write will already be ordering the data, a truncate will not.
724 * Thus, we need to explicitly order the zeroed pages. 724 * Thus, we need to explicitly order the zeroed pages.
725 */ 725 */
726 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) 726 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
727 { 727 {
728 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 728 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
729 handle_t *handle = NULL; 729 handle_t *handle = NULL;
730 int ret = 0; 730 int ret = 0;
731 731
732 if (!ocfs2_should_order_data(inode)) 732 if (!ocfs2_should_order_data(inode))
733 goto out; 733 goto out;
734 734
735 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 735 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
736 if (IS_ERR(handle)) { 736 if (IS_ERR(handle)) {
737 ret = -ENOMEM; 737 ret = -ENOMEM;
738 mlog_errno(ret); 738 mlog_errno(ret);
739 goto out; 739 goto out;
740 } 740 }
741 741
742 ret = ocfs2_jbd2_file_inode(handle, inode); 742 ret = ocfs2_jbd2_file_inode(handle, inode);
743 if (ret < 0) 743 if (ret < 0)
744 mlog_errno(ret); 744 mlog_errno(ret);
745 745
746 out: 746 out:
747 if (ret) { 747 if (ret) {
748 if (!IS_ERR(handle)) 748 if (!IS_ERR(handle))
749 ocfs2_commit_trans(osb, handle); 749 ocfs2_commit_trans(osb, handle);
750 handle = ERR_PTR(ret); 750 handle = ERR_PTR(ret);
751 } 751 }
752 return handle; 752 return handle;
753 } 753 }
754 754
755 /* Some parts of this were taken from generic_cont_expand, which turned out 755 /* Some parts of this were taken from generic_cont_expand, which turned out
756 * to be too fragile to do exactly what we need without us having to 756 * to be too fragile to do exactly what we need without us having to
757 * worry about recursive locking in ->write_begin() and ->write_end(). */ 757 * worry about recursive locking in ->write_begin() and ->write_end(). */
758 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, 758 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
759 u64 abs_to) 759 u64 abs_to)
760 { 760 {
761 struct address_space *mapping = inode->i_mapping; 761 struct address_space *mapping = inode->i_mapping;
762 struct page *page; 762 struct page *page;
763 unsigned long index = abs_from >> PAGE_CACHE_SHIFT; 763 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
764 handle_t *handle = NULL; 764 handle_t *handle = NULL;
765 int ret = 0; 765 int ret = 0;
766 unsigned zero_from, zero_to, block_start, block_end; 766 unsigned zero_from, zero_to, block_start, block_end;
767 767
768 BUG_ON(abs_from >= abs_to); 768 BUG_ON(abs_from >= abs_to);
769 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 769 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
770 BUG_ON(abs_from & (inode->i_blkbits - 1)); 770 BUG_ON(abs_from & (inode->i_blkbits - 1));
771 771
772 page = find_or_create_page(mapping, index, GFP_NOFS); 772 page = find_or_create_page(mapping, index, GFP_NOFS);
773 if (!page) { 773 if (!page) {
774 ret = -ENOMEM; 774 ret = -ENOMEM;
775 mlog_errno(ret); 775 mlog_errno(ret);
776 goto out; 776 goto out;
777 } 777 }
778 778
779 /* Get the offsets within the page that we want to zero */ 779 /* Get the offsets within the page that we want to zero */
780 zero_from = abs_from & (PAGE_CACHE_SIZE - 1); 780 zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
781 zero_to = abs_to & (PAGE_CACHE_SIZE - 1); 781 zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
782 if (!zero_to) 782 if (!zero_to)
783 zero_to = PAGE_CACHE_SIZE; 783 zero_to = PAGE_CACHE_SIZE;
784 784
785 trace_ocfs2_write_zero_page( 785 trace_ocfs2_write_zero_page(
786 (unsigned long long)OCFS2_I(inode)->ip_blkno, 786 (unsigned long long)OCFS2_I(inode)->ip_blkno,
787 (unsigned long long)abs_from, 787 (unsigned long long)abs_from,
788 (unsigned long long)abs_to, 788 (unsigned long long)abs_to,
789 index, zero_from, zero_to); 789 index, zero_from, zero_to);
790 790
791 /* We know that zero_from is block-aligned */ 791 /* We know that zero_from is block-aligned */
792 for (block_start = zero_from; block_start < zero_to; 792 for (block_start = zero_from; block_start < zero_to;
793 block_start = block_end) { 793 block_start = block_end) {
794 block_end = block_start + (1 << inode->i_blkbits); 794 block_end = block_start + (1 << inode->i_blkbits);
795 795
796 /* 796 /*
797 * block_start is block-aligned. Bump it by one to force 797 * block_start is block-aligned. Bump it by one to force
798 * __block_write_begin and block_commit_write to zero the 798 * __block_write_begin and block_commit_write to zero the
799 * whole block. 799 * whole block.
800 */ 800 */
801 ret = __block_write_begin(page, block_start + 1, 0, 801 ret = __block_write_begin(page, block_start + 1, 0,
802 ocfs2_get_block); 802 ocfs2_get_block);
803 if (ret < 0) { 803 if (ret < 0) {
804 mlog_errno(ret); 804 mlog_errno(ret);
805 goto out_unlock; 805 goto out_unlock;
806 } 806 }
807 807
808 if (!handle) { 808 if (!handle) {
809 handle = ocfs2_zero_start_ordered_transaction(inode); 809 handle = ocfs2_zero_start_ordered_transaction(inode);
810 if (IS_ERR(handle)) { 810 if (IS_ERR(handle)) {
811 ret = PTR_ERR(handle); 811 ret = PTR_ERR(handle);
812 handle = NULL; 812 handle = NULL;
813 break; 813 break;
814 } 814 }
815 } 815 }
816 816
817 /* must not update i_size! */ 817 /* must not update i_size! */
818 ret = block_commit_write(page, block_start + 1, 818 ret = block_commit_write(page, block_start + 1,
819 block_start + 1); 819 block_start + 1);
820 if (ret < 0) 820 if (ret < 0)
821 mlog_errno(ret); 821 mlog_errno(ret);
822 else 822 else
823 ret = 0; 823 ret = 0;
824 } 824 }
825 825
826 if (handle) 826 if (handle)
827 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 827 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
828 828
829 out_unlock: 829 out_unlock:
830 unlock_page(page); 830 unlock_page(page);
831 page_cache_release(page); 831 page_cache_release(page);
832 out: 832 out:
833 return ret; 833 return ret;
834 } 834 }
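/*
 * Concrete example of the "bump by one" trick used above, with
 * illustrative sizes and our reading of __block_write_begin(): with
 * 512-byte blocks, zeroing the block at page offset 512 passes
 * pos = 513, len = 0. The empty write range then lands strictly
 * inside the block, so both sides of it - i.e. the whole block - get
 * zeroed, whereas an aligned pos = 512 would skip the block entirely.
 */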
835 835
836 /* 836 /*
837 * Find the next range to zero. We do this in terms of bytes because 837 * Find the next range to zero. We do this in terms of bytes because
838 * that's what ocfs2_zero_extend() wants, and it is dealing with the 838 * that's what ocfs2_zero_extend() wants, and it is dealing with the
839 * pagecache. A single returned range may span multiple extents. 839 * pagecache. A single returned range may span multiple extents.
840 * 840 *
841 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what 841 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
842 * needs to be zeroed. range_start and range_end return the next zeroing 842 * needs to be zeroed. range_start and range_end return the next zeroing
843 * range. A subsequent call should pass the previous range_end as its 843 * range. A subsequent call should pass the previous range_end as its
844 * zero_start. If range_end is 0, there's nothing to do. 844 * zero_start. If range_end is 0, there's nothing to do.
845 * 845 *
846 * Unwritten extents are skipped over. Refcounted extents are CoW'd. 846 * Unwritten extents are skipped over. Refcounted extents are CoW'd.
847 */ 847 */
848 static int ocfs2_zero_extend_get_range(struct inode *inode, 848 static int ocfs2_zero_extend_get_range(struct inode *inode,
849 struct buffer_head *di_bh, 849 struct buffer_head *di_bh,
850 u64 zero_start, u64 zero_end, 850 u64 zero_start, u64 zero_end,
851 u64 *range_start, u64 *range_end) 851 u64 *range_start, u64 *range_end)
852 { 852 {
853 int rc = 0, needs_cow = 0; 853 int rc = 0, needs_cow = 0;
854 u32 p_cpos, zero_clusters = 0; 854 u32 p_cpos, zero_clusters = 0;
855 u32 zero_cpos = 855 u32 zero_cpos =
856 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 856 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
857 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); 857 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
858 unsigned int num_clusters = 0; 858 unsigned int num_clusters = 0;
859 unsigned int ext_flags = 0; 859 unsigned int ext_flags = 0;
860 860
861 while (zero_cpos < last_cpos) { 861 while (zero_cpos < last_cpos) {
862 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, 862 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
863 &num_clusters, &ext_flags); 863 &num_clusters, &ext_flags);
864 if (rc) { 864 if (rc) {
865 mlog_errno(rc); 865 mlog_errno(rc);
866 goto out; 866 goto out;
867 } 867 }
868 868
869 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { 869 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
870 zero_clusters = num_clusters; 870 zero_clusters = num_clusters;
871 if (ext_flags & OCFS2_EXT_REFCOUNTED) 871 if (ext_flags & OCFS2_EXT_REFCOUNTED)
872 needs_cow = 1; 872 needs_cow = 1;
873 break; 873 break;
874 } 874 }
875 875
876 zero_cpos += num_clusters; 876 zero_cpos += num_clusters;
877 } 877 }
878 if (!zero_clusters) { 878 if (!zero_clusters) {
879 *range_end = 0; 879 *range_end = 0;
880 goto out; 880 goto out;
881 } 881 }
882 882
883 while ((zero_cpos + zero_clusters) < last_cpos) { 883 while ((zero_cpos + zero_clusters) < last_cpos) {
884 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, 884 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
885 &p_cpos, &num_clusters, 885 &p_cpos, &num_clusters,
886 &ext_flags); 886 &ext_flags);
887 if (rc) { 887 if (rc) {
888 mlog_errno(rc); 888 mlog_errno(rc);
889 goto out; 889 goto out;
890 } 890 }
891 891
892 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) 892 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
893 break; 893 break;
894 if (ext_flags & OCFS2_EXT_REFCOUNTED) 894 if (ext_flags & OCFS2_EXT_REFCOUNTED)
895 needs_cow = 1; 895 needs_cow = 1;
896 zero_clusters += num_clusters; 896 zero_clusters += num_clusters;
897 } 897 }
898 if ((zero_cpos + zero_clusters) > last_cpos) 898 if ((zero_cpos + zero_clusters) > last_cpos)
899 zero_clusters = last_cpos - zero_cpos; 899 zero_clusters = last_cpos - zero_cpos;
900 900
901 if (needs_cow) { 901 if (needs_cow) {
902 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, 902 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
903 zero_clusters, UINT_MAX); 903 zero_clusters, UINT_MAX);
904 if (rc) { 904 if (rc) {
905 mlog_errno(rc); 905 mlog_errno(rc);
906 goto out; 906 goto out;
907 } 907 }
908 } 908 }
909 909
910 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); 910 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
911 *range_end = ocfs2_clusters_to_bytes(inode->i_sb, 911 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
912 zero_cpos + zero_clusters); 912 zero_cpos + zero_clusters);
913 913
914 out: 914 out:
915 return rc; 915 return rc;
916 } 916 }
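/*
 * Example of the contract above, for a hypothetical layout where the
 * clusters from zero_start onward are [hole][written][refcounted]
 * [unwritten]: the first call skips the hole, CoWs the refcounted run,
 * and returns the written + refcounted span in *range_start and
 * *range_end; a second call starting at that range_end sees only the
 * unwritten extent and sets *range_end = 0 - nothing left to zero.
 */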
917 917
918 /* 918 /*
919 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller 919 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
920 * has made sure that the entire range needs zeroing. 920 * has made sure that the entire range needs zeroing.
921 */ 921 */
922 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, 922 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
923 u64 range_end) 923 u64 range_end)
924 { 924 {
925 int rc = 0; 925 int rc = 0;
926 u64 next_pos; 926 u64 next_pos;
927 u64 zero_pos = range_start; 927 u64 zero_pos = range_start;
928 928
929 trace_ocfs2_zero_extend_range( 929 trace_ocfs2_zero_extend_range(
930 (unsigned long long)OCFS2_I(inode)->ip_blkno, 930 (unsigned long long)OCFS2_I(inode)->ip_blkno,
931 (unsigned long long)range_start, 931 (unsigned long long)range_start,
932 (unsigned long long)range_end); 932 (unsigned long long)range_end);
933 BUG_ON(range_start >= range_end); 933 BUG_ON(range_start >= range_end);
934 934
935 while (zero_pos < range_end) { 935 while (zero_pos < range_end) {
936 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; 936 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
937 if (next_pos > range_end) 937 if (next_pos > range_end)
938 next_pos = range_end; 938 next_pos = range_end;
939 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); 939 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
940 if (rc < 0) { 940 if (rc < 0) {
941 mlog_errno(rc); 941 mlog_errno(rc);
942 break; 942 break;
943 } 943 }
944 zero_pos = next_pos; 944 zero_pos = next_pos;
945 945
946 /* 946 /*
947 * Very large extends have the potential to lock up 947 * Very large extends have the potential to lock up
948 * the cpu for extended periods of time. 948 * the cpu for extended periods of time.
949 */ 949 */
950 cond_resched(); 950 cond_resched();
951 } 951 }
952 952
953 return rc; 953 return rc;
954 } 954 }
955 955
956 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, 956 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
957 loff_t zero_to_size) 957 loff_t zero_to_size)
958 { 958 {
959 int ret = 0; 959 int ret = 0;
960 u64 zero_start, range_start = 0, range_end = 0; 960 u64 zero_start, range_start = 0, range_end = 0;
961 struct super_block *sb = inode->i_sb; 961 struct super_block *sb = inode->i_sb;
962 962
963 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 963 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
964 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno, 964 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
965 (unsigned long long)zero_start, 965 (unsigned long long)zero_start,
966 (unsigned long long)i_size_read(inode)); 966 (unsigned long long)i_size_read(inode));
967 while (zero_start < zero_to_size) { 967 while (zero_start < zero_to_size) {
968 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, 968 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
969 zero_to_size, 969 zero_to_size,
970 &range_start, 970 &range_start,
971 &range_end); 971 &range_end);
972 if (ret) { 972 if (ret) {
973 mlog_errno(ret); 973 mlog_errno(ret);
974 break; 974 break;
975 } 975 }
976 if (!range_end) 976 if (!range_end)
977 break; 977 break;
978 /* Trim the ends */ 978 /* Trim the ends */
979 if (range_start < zero_start) 979 if (range_start < zero_start)
980 range_start = zero_start; 980 range_start = zero_start;
981 if (range_end > zero_to_size) 981 if (range_end > zero_to_size)
982 range_end = zero_to_size; 982 range_end = zero_to_size;
983 983
984 ret = ocfs2_zero_extend_range(inode, range_start, 984 ret = ocfs2_zero_extend_range(inode, range_start,
985 range_end); 985 range_end);
986 if (ret) { 986 if (ret) {
987 mlog_errno(ret); 987 mlog_errno(ret);
988 break; 988 break;
989 } 989 }
990 zero_start = range_end; 990 zero_start = range_end;
991 } 991 }
992 992
993 return ret; 993 return ret;
994 } 994 }
995 995
996 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, 996 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
997 u64 new_i_size, u64 zero_to) 997 u64 new_i_size, u64 zero_to)
998 { 998 {
999 int ret; 999 int ret;
1000 u32 clusters_to_add; 1000 u32 clusters_to_add;
1001 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1001 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1002 1002
1003 /* 1003 /*
1004 * Only quota files call this without a bh, and they can't be 1004 * Only quota files call this without a bh, and they can't be
1005 * refcounted. 1005 * refcounted.
1006 */ 1006 */
1007 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 1007 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
1008 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); 1008 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1009 1009
1010 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 1010 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
1011 if (clusters_to_add < oi->ip_clusters) 1011 if (clusters_to_add < oi->ip_clusters)
1012 clusters_to_add = 0; 1012 clusters_to_add = 0;
1013 else 1013 else
1014 clusters_to_add -= oi->ip_clusters; 1014 clusters_to_add -= oi->ip_clusters;
1015 1015
1016 if (clusters_to_add) { 1016 if (clusters_to_add) {
1017 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, 1017 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
1018 clusters_to_add, 0); 1018 clusters_to_add, 0);
1019 if (ret) { 1019 if (ret) {
1020 mlog_errno(ret); 1020 mlog_errno(ret);
1021 goto out; 1021 goto out;
1022 } 1022 }
1023 } 1023 }
1024 1024
1025 /* 1025 /*
1026 * Call this even if we don't add any clusters to the tree. We 1026 * Call this even if we don't add any clusters to the tree. We
1027 * still need to zero the area between the old i_size and the 1027 * still need to zero the area between the old i_size and the
1028 * new i_size. 1028 * new i_size.
1029 */ 1029 */
1030 ret = ocfs2_zero_extend(inode, di_bh, zero_to); 1030 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1031 if (ret < 0) 1031 if (ret < 0)
1032 mlog_errno(ret); 1032 mlog_errno(ret);
1033 1033
1034 out: 1034 out:
1035 return ret; 1035 return ret;
1036 } 1036 }
1037 1037
1038 static int ocfs2_extend_file(struct inode *inode, 1038 static int ocfs2_extend_file(struct inode *inode,
1039 struct buffer_head *di_bh, 1039 struct buffer_head *di_bh,
1040 u64 new_i_size) 1040 u64 new_i_size)
1041 { 1041 {
1042 int ret = 0; 1042 int ret = 0;
1043 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1043 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1044 1044
1045 BUG_ON(!di_bh); 1045 BUG_ON(!di_bh);
1046 1046
1047 /* setattr sometimes calls us like this. */ 1047 /* setattr sometimes calls us like this. */
1048 if (new_i_size == 0) 1048 if (new_i_size == 0)
1049 goto out; 1049 goto out;
1050 1050
1051 if (i_size_read(inode) == new_i_size) 1051 if (i_size_read(inode) == new_i_size)
1052 goto out; 1052 goto out;
1053 BUG_ON(new_i_size < i_size_read(inode)); 1053 BUG_ON(new_i_size < i_size_read(inode));
1054 1054
1055 /* 1055 /*
1056 * The alloc sem blocks people in read/write from reading our 1056 * The alloc sem blocks people in read/write from reading our
1057 * allocation until we're done changing it. We depend on 1057 * allocation until we're done changing it. We depend on
1058 * i_mutex to block other extend/truncate calls while we're 1058 * i_mutex to block other extend/truncate calls while we're
1059 * here. We even have to hold it for sparse files because there 1059 * here. We even have to hold it for sparse files because there
1060 * might be some tail zeroing. 1060 * might be some tail zeroing.
1061 */ 1061 */
1062 down_write(&oi->ip_alloc_sem); 1062 down_write(&oi->ip_alloc_sem);
1063 1063
1064 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1064 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1065 /* 1065 /*
1066 * We can optimize small extends by keeping the inode's 1066 * We can optimize small extends by keeping the inode's
1067 * inline data. 1067 * inline data.
1068 */ 1068 */
1069 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { 1069 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1070 up_write(&oi->ip_alloc_sem); 1070 up_write(&oi->ip_alloc_sem);
1071 goto out_update_size; 1071 goto out_update_size;
1072 } 1072 }
1073 1073
1074 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1074 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1075 if (ret) { 1075 if (ret) {
1076 up_write(&oi->ip_alloc_sem); 1076 up_write(&oi->ip_alloc_sem);
1077 mlog_errno(ret); 1077 mlog_errno(ret);
1078 goto out; 1078 goto out;
1079 } 1079 }
1080 } 1080 }
1081 1081
1082 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1082 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1083 ret = ocfs2_zero_extend(inode, di_bh, new_i_size); 1083 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1084 else 1084 else
1085 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, 1085 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1086 new_i_size); 1086 new_i_size);
1087 1087
1088 up_write(&oi->ip_alloc_sem); 1088 up_write(&oi->ip_alloc_sem);
1089 1089
1090 if (ret < 0) { 1090 if (ret < 0) {
1091 mlog_errno(ret); 1091 mlog_errno(ret);
1092 goto out; 1092 goto out;
1093 } 1093 }
1094 1094
1095 out_update_size: 1095 out_update_size:
1096 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 1096 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1097 if (ret < 0) 1097 if (ret < 0)
1098 mlog_errno(ret); 1098 mlog_errno(ret);
1099 1099
1100 out: 1100 out:
1101 return ret; 1101 return ret;
1102 } 1102 }
1103 1103
1104 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 1104 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1105 { 1105 {
1106 int status = 0, size_change; 1106 int status = 0, size_change;
1107 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1108 struct super_block *sb = inode->i_sb; 1108 struct super_block *sb = inode->i_sb;
1109 struct ocfs2_super *osb = OCFS2_SB(sb); 1109 struct ocfs2_super *osb = OCFS2_SB(sb);
1110 struct buffer_head *bh = NULL; 1110 struct buffer_head *bh = NULL;
1111 handle_t *handle = NULL; 1111 handle_t *handle = NULL;
1112 struct dquot *transfer_to[MAXQUOTAS] = { }; 1112 struct dquot *transfer_to[MAXQUOTAS] = { };
1113 int qtype; 1113 int qtype;
1114 1114
1115 trace_ocfs2_setattr(inode, dentry, 1115 trace_ocfs2_setattr(inode, dentry,
1116 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1116 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1117 dentry->d_name.len, dentry->d_name.name, 1117 dentry->d_name.len, dentry->d_name.name,
1118 attr->ia_valid, attr->ia_mode, 1118 attr->ia_valid, attr->ia_mode,
1119 attr->ia_uid, attr->ia_gid); 1119 attr->ia_uid, attr->ia_gid);
1120 1120
1121 /* ensuring we don't even attempt to truncate a symlink */ 1121 /* ensuring we don't even attempt to truncate a symlink */
1122 if (S_ISLNK(inode->i_mode)) 1122 if (S_ISLNK(inode->i_mode))
1123 attr->ia_valid &= ~ATTR_SIZE; 1123 attr->ia_valid &= ~ATTR_SIZE;
1124 1124
1125 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 1125 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1126 | ATTR_GID | ATTR_UID | ATTR_MODE) 1126 | ATTR_GID | ATTR_UID | ATTR_MODE)
1127 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) 1127 if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1128 return 0; 1128 return 0;
1129 1129
1130 status = inode_change_ok(inode, attr); 1130 status = inode_change_ok(inode, attr);
1131 if (status) 1131 if (status)
1132 return status; 1132 return status;
1133 1133
1134 if (is_quota_modification(inode, attr)) 1134 if (is_quota_modification(inode, attr))
1135 dquot_initialize(inode); 1135 dquot_initialize(inode);
1136 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 1136 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1137 if (size_change) { 1137 if (size_change) {
1138 status = ocfs2_rw_lock(inode, 1); 1138 status = ocfs2_rw_lock(inode, 1);
1139 if (status < 0) { 1139 if (status < 0) {
1140 mlog_errno(status); 1140 mlog_errno(status);
1141 goto bail; 1141 goto bail;
1142 } 1142 }
1143 } 1143 }
1144 1144
1145 status = ocfs2_inode_lock(inode, &bh, 1); 1145 status = ocfs2_inode_lock(inode, &bh, 1);
1146 if (status < 0) { 1146 if (status < 0) {
1147 if (status != -ENOENT) 1147 if (status != -ENOENT)
1148 mlog_errno(status); 1148 mlog_errno(status);
1149 goto bail_unlock_rw; 1149 goto bail_unlock_rw;
1150 } 1150 }
1151 1151
1152 if (size_change && attr->ia_size != i_size_read(inode)) { 1152 if (size_change && attr->ia_size != i_size_read(inode)) {
1153 status = inode_newsize_ok(inode, attr->ia_size); 1153 status = inode_newsize_ok(inode, attr->ia_size);
1154 if (status) 1154 if (status)
1155 goto bail_unlock; 1155 goto bail_unlock;
1156 1156
1157 inode_dio_wait(inode); 1157 inode_dio_wait(inode);
1158 1158
1159 if (i_size_read(inode) > attr->ia_size) { 1159 if (i_size_read(inode) > attr->ia_size) {
1160 if (ocfs2_should_order_data(inode)) { 1160 if (ocfs2_should_order_data(inode)) {
1161 status = ocfs2_begin_ordered_truncate(inode, 1161 status = ocfs2_begin_ordered_truncate(inode,
1162 attr->ia_size); 1162 attr->ia_size);
1163 if (status) 1163 if (status)
1164 goto bail_unlock; 1164 goto bail_unlock;
1165 } 1165 }
1166 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 1166 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1167 } else 1167 } else
1168 status = ocfs2_extend_file(inode, bh, attr->ia_size); 1168 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1169 if (status < 0) { 1169 if (status < 0) {
1170 if (status != -ENOSPC) 1170 if (status != -ENOSPC)
1171 mlog_errno(status); 1171 mlog_errno(status);
1172 status = -ENOSPC; 1172 status = -ENOSPC;
1173 goto bail_unlock; 1173 goto bail_unlock;
1174 } 1174 }
1175 } 1175 }
1176 1176
1177 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 1177 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
1178 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 1178 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
1179 /* 1179 /*
1180 * Gather pointers to quota structures so that allocation / 1180 * Gather pointers to quota structures so that allocation /
1181 * freeing of quota structures happens here and not inside 1181 * freeing of quota structures happens here and not inside
1182 * dquot_transfer() where we have problems with lock ordering 1182 * dquot_transfer() where we have problems with lock ordering
1183 */ 1183 */
1184 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1184 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1185 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1185 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1186 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1186 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1187 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); 1187 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1188 if (!transfer_to[USRQUOTA]) { 1188 if (!transfer_to[USRQUOTA]) {
1189 status = -ESRCH; 1189 status = -ESRCH;
1190 goto bail_unlock; 1190 goto bail_unlock;
1191 } 1191 }
1192 } 1192 }
1193 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 1193 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
1194 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1194 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1195 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1195 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1196 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); 1196 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1197 if (!transfer_to[GRPQUOTA]) { 1197 if (!transfer_to[GRPQUOTA]) {
1198 status = -ESRCH; 1198 status = -ESRCH;
1199 goto bail_unlock; 1199 goto bail_unlock;
1200 } 1200 }
1201 } 1201 }
1202 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 1202 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1203 2 * ocfs2_quota_trans_credits(sb)); 1203 2 * ocfs2_quota_trans_credits(sb));
1204 if (IS_ERR(handle)) { 1204 if (IS_ERR(handle)) {
1205 status = PTR_ERR(handle); 1205 status = PTR_ERR(handle);
1206 mlog_errno(status); 1206 mlog_errno(status);
1207 goto bail_unlock; 1207 goto bail_unlock;
1208 } 1208 }
1209 status = __dquot_transfer(inode, transfer_to); 1209 status = __dquot_transfer(inode, transfer_to);
1210 if (status < 0) 1210 if (status < 0)
1211 goto bail_commit; 1211 goto bail_commit;
1212 } else { 1212 } else {
1213 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1213 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1214 if (IS_ERR(handle)) { 1214 if (IS_ERR(handle)) {
1215 status = PTR_ERR(handle); 1215 status = PTR_ERR(handle);
1216 mlog_errno(status); 1216 mlog_errno(status);
1217 goto bail_unlock; 1217 goto bail_unlock;
1218 } 1218 }
1219 } 1219 }
1220 1220
1221 /* 1221 /*
1222 * This will intentionally not wind up calling truncate_setsize(), 1222 * This will intentionally not wind up calling truncate_setsize(),
1223 * since all the work for a size change has been done above. 1223 * since all the work for a size change has been done above.
1224 * Otherwise, we could get into problems with truncate as 1224 * Otherwise, we could get into problems with truncate as
1225 * ip_alloc_sem is used there to protect against i_size 1225 * ip_alloc_sem is used there to protect against i_size
1226 * changes. 1226 * changes.
1227 * 1227 *
1228 * XXX: this means the conditional below can probably be removed. 1228 * XXX: this means the conditional below can probably be removed.
1229 */ 1229 */
1230 if ((attr->ia_valid & ATTR_SIZE) && 1230 if ((attr->ia_valid & ATTR_SIZE) &&
1231 attr->ia_size != i_size_read(inode)) { 1231 attr->ia_size != i_size_read(inode)) {
1232 status = vmtruncate(inode, attr->ia_size); 1232 status = vmtruncate(inode, attr->ia_size);
1233 if (status) { 1233 if (status) {
1234 mlog_errno(status); 1234 mlog_errno(status);
1235 goto bail_commit; 1235 goto bail_commit;
1236 } 1236 }
1237 } 1237 }
1238 1238
1239 setattr_copy(inode, attr); 1239 setattr_copy(inode, attr);
1240 mark_inode_dirty(inode); 1240 mark_inode_dirty(inode);
1241 1241
1242 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1242 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1243 if (status < 0) 1243 if (status < 0)
1244 mlog_errno(status); 1244 mlog_errno(status);
1245 1245
1246 bail_commit: 1246 bail_commit:
1247 ocfs2_commit_trans(osb, handle); 1247 ocfs2_commit_trans(osb, handle);
1248 bail_unlock: 1248 bail_unlock:
1249 ocfs2_inode_unlock(inode, 1); 1249 ocfs2_inode_unlock(inode, 1);
1250 bail_unlock_rw: 1250 bail_unlock_rw:
1251 if (size_change) 1251 if (size_change)
1252 ocfs2_rw_unlock(inode, 1); 1252 ocfs2_rw_unlock(inode, 1);
1253 bail: 1253 bail:
1254 brelse(bh); 1254 brelse(bh);
1255 1255
1256 /* Release quota pointers in case we acquired them */ 1256 /* Release quota pointers in case we acquired them */
1257 for (qtype = 0; qtype < MAXQUOTAS; qtype++) 1257 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1258 dqput(transfer_to[qtype]); 1258 dqput(transfer_to[qtype]);
1259 1259
1260 if (!status && attr->ia_valid & ATTR_MODE) { 1260 if (!status && attr->ia_valid & ATTR_MODE) {
1261 status = ocfs2_acl_chmod(inode); 1261 status = ocfs2_acl_chmod(inode);
1262 if (status < 0) 1262 if (status < 0)
1263 mlog_errno(status); 1263 mlog_errno(status);
1264 } 1264 }
1265 1265
1266 return status; 1266 return status;
1267 } 1267 }
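
For context, ocfs2_setattr() is reached through the VFS, and a size change is what the rw/inode cluster locking above protects. A minimal userspace sketch (the mount path is hypothetical) that exercises both the extend and the truncate branch:

        /* Userspace sketch: truncate(2)/ftruncate(2) arrive at
         * ocfs2_setattr() with ATTR_SIZE set.  Path is hypothetical. */
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/mnt/ocfs2/testfile", O_RDWR | O_CREAT, 0644);

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                /* Growing the file takes the extend path... */
                if (ftruncate(fd, 1 << 20) < 0)
                        perror("ftruncate (extend)");
                /* ...shrinking it takes the truncate path. */
                if (ftruncate(fd, 4096) < 0)
                        perror("ftruncate (shrink)");
                close(fd);
                return 0;
        }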
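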
1268 1268
1269 int ocfs2_getattr(struct vfsmount *mnt, 1269 int ocfs2_getattr(struct vfsmount *mnt,
1270 struct dentry *dentry, 1270 struct dentry *dentry,
1271 struct kstat *stat) 1271 struct kstat *stat)
1272 { 1272 {
1273 struct inode *inode = dentry->d_inode; 1273 struct inode *inode = dentry->d_inode;
1274 struct super_block *sb = dentry->d_inode->i_sb; 1274 struct super_block *sb = dentry->d_inode->i_sb;
1275 struct ocfs2_super *osb = sb->s_fs_info; 1275 struct ocfs2_super *osb = sb->s_fs_info;
1276 int err; 1276 int err;
1277 1277
1278 err = ocfs2_inode_revalidate(dentry); 1278 err = ocfs2_inode_revalidate(dentry);
1279 if (err) { 1279 if (err) {
1280 if (err != -ENOENT) 1280 if (err != -ENOENT)
1281 mlog_errno(err); 1281 mlog_errno(err);
1282 goto bail; 1282 goto bail;
1283 } 1283 }
1284 1284
1285 generic_fillattr(inode, stat); 1285 generic_fillattr(inode, stat);
1286 1286
1287 /* We set the blksize from the cluster size for performance */ 1287 /* We set the blksize from the cluster size for performance */
1288 stat->blksize = osb->s_clustersize; 1288 stat->blksize = osb->s_clustersize;
1289 1289
1290 bail: 1290 bail:
1291 return err; 1291 return err;
1292 } 1292 }
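
Because ocfs2_getattr() reports the cluster size as st_blksize, applications that size their buffers from stat() naturally do whole-cluster I/O. A small sketch (hypothetical path) reading that value back:

        #include <stdio.h>
        #include <sys/stat.h>

        int main(void)
        {
                struct stat st;

                if (stat("/mnt/ocfs2/testfile", &st) < 0) {
                        perror("stat");
                        return 1;
                }
                /* On ocfs2 this is the cluster size, not the block size. */
                printf("preferred I/O size: %ld bytes\n", (long)st.st_blksize);
                return 0;
        }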
1293 1293
1294 int ocfs2_permission(struct inode *inode, int mask) 1294 int ocfs2_permission(struct inode *inode, int mask)
1295 { 1295 {
1296 int ret; 1296 int ret;
1297 1297
1298 if (mask & MAY_NOT_BLOCK) 1298 if (mask & MAY_NOT_BLOCK)
1299 return -ECHILD; 1299 return -ECHILD;
1300 1300
1301 ret = ocfs2_inode_lock(inode, NULL, 0); 1301 ret = ocfs2_inode_lock(inode, NULL, 0);
1302 if (ret) { 1302 if (ret) {
1303 if (ret != -ENOENT) 1303 if (ret != -ENOENT)
1304 mlog_errno(ret); 1304 mlog_errno(ret);
1305 goto out; 1305 goto out;
1306 } 1306 }
1307 1307
1308 ret = generic_permission(inode, mask); 1308 ret = generic_permission(inode, mask);
1309 1309
1310 ocfs2_inode_unlock(inode, 0); 1310 ocfs2_inode_unlock(inode, 0);
1311 out: 1311 out:
1312 return ret; 1312 return ret;
1313 } 1313 }
1314 1314
1315 static int __ocfs2_write_remove_suid(struct inode *inode, 1315 static int __ocfs2_write_remove_suid(struct inode *inode,
1316 struct buffer_head *bh) 1316 struct buffer_head *bh)
1317 { 1317 {
1318 int ret; 1318 int ret;
1319 handle_t *handle; 1319 handle_t *handle;
1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1321 struct ocfs2_dinode *di; 1321 struct ocfs2_dinode *di;
1322 1322
1323 trace_ocfs2_write_remove_suid( 1323 trace_ocfs2_write_remove_suid(
1324 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1324 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1325 inode->i_mode); 1325 inode->i_mode);
1326 1326
1327 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1327 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1328 if (IS_ERR(handle)) { 1328 if (IS_ERR(handle)) {
1329 ret = PTR_ERR(handle); 1329 ret = PTR_ERR(handle);
1330 mlog_errno(ret); 1330 mlog_errno(ret);
1331 goto out; 1331 goto out;
1332 } 1332 }
1333 1333
1334 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1334 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1335 OCFS2_JOURNAL_ACCESS_WRITE); 1335 OCFS2_JOURNAL_ACCESS_WRITE);
1336 if (ret < 0) { 1336 if (ret < 0) {
1337 mlog_errno(ret); 1337 mlog_errno(ret);
1338 goto out_trans; 1338 goto out_trans;
1339 } 1339 }
1340 1340
1341 inode->i_mode &= ~S_ISUID; 1341 inode->i_mode &= ~S_ISUID;
1342 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1342 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1343 inode->i_mode &= ~S_ISGID; 1343 inode->i_mode &= ~S_ISGID;
1344 1344
1345 di = (struct ocfs2_dinode *) bh->b_data; 1345 di = (struct ocfs2_dinode *) bh->b_data;
1346 di->i_mode = cpu_to_le16(inode->i_mode); 1346 di->i_mode = cpu_to_le16(inode->i_mode);
1347 1347
1348 ocfs2_journal_dirty(handle, bh); 1348 ocfs2_journal_dirty(handle, bh);
1349 1349
1350 out_trans: 1350 out_trans:
1351 ocfs2_commit_trans(osb, handle); 1351 ocfs2_commit_trans(osb, handle);
1352 out: 1352 out:
1353 return ret; 1353 return ret;
1354 } 1354 }
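
The mode update above follows the usual setuid-clearing rule: S_ISUID is always dropped, while S_ISGID is dropped only when group-execute is set (a setgid bit without S_IXGRP denotes mandatory locking, not a setgid executable). A standalone restatement of just that transformation:

        #include <stdio.h>
        #include <sys/stat.h>

        /* Mirror of the mode change in __ocfs2_write_remove_suid(). */
        static mode_t remove_suid_mode(mode_t mode)
        {
                mode &= ~S_ISUID;
                if ((mode & S_ISGID) && (mode & S_IXGRP))
                        mode &= ~S_ISGID;
                return mode;
        }

        int main(void)
        {
                /* setgid executable: both special bits cleared. */
                printf("%04o -> %04o\n", 06775u, (unsigned)remove_suid_mode(06775));
                /* setgid without group execute: left alone. */
                printf("%04o -> %04o\n", 02664u, (unsigned)remove_suid_mode(02664));
                return 0;
        }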
1355 1355
1356 /* 1356 /*
1357 * Will look for holes and unwritten extents in the range starting at 1357 * Will look for holes and unwritten extents in the range starting at
1358 * pos for count bytes (inclusive). 1358 * pos for count bytes (inclusive).
1359 */ 1359 */
1360 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1360 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1361 size_t count) 1361 size_t count)
1362 { 1362 {
1363 int ret = 0; 1363 int ret = 0;
1364 unsigned int extent_flags; 1364 unsigned int extent_flags;
1365 u32 cpos, clusters, extent_len, phys_cpos; 1365 u32 cpos, clusters, extent_len, phys_cpos;
1366 struct super_block *sb = inode->i_sb; 1366 struct super_block *sb = inode->i_sb;
1367 1367
1368 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1368 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1369 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1369 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1370 1370
1371 while (clusters) { 1371 while (clusters) {
1372 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1372 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1373 &extent_flags); 1373 &extent_flags);
1374 if (ret < 0) { 1374 if (ret < 0) {
1375 mlog_errno(ret); 1375 mlog_errno(ret);
1376 goto out; 1376 goto out;
1377 } 1377 }
1378 1378
1379 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1379 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1380 ret = 1; 1380 ret = 1;
1381 break; 1381 break;
1382 } 1382 }
1383 1383
1384 if (extent_len > clusters) 1384 if (extent_len > clusters)
1385 extent_len = clusters; 1385 extent_len = clusters;
1386 1386
1387 clusters -= extent_len; 1387 clusters -= extent_len;
1388 cpos += extent_len; 1388 cpos += extent_len;
1389 } 1389 }
1390 out: 1390 out:
1391 return ret; 1391 return ret;
1392 } 1392 }
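
The arithmetic above converts a byte range into the cluster range to scan. A self-contained sketch of the same math, assuming a 32KB cluster size (illustrative only; ocfs2 cluster sizes vary per filesystem):

        #include <stdio.h>

        #define CLUSTERSIZE_BITS 15u    /* assumed 32KB clusters */

        /* Round a byte count up to whole clusters, as
         * ocfs2_clusters_for_bytes() does. */
        static unsigned int clusters_for_bytes(unsigned long long bytes)
        {
                return (bytes + (1ull << CLUSTERSIZE_BITS) - 1) >> CLUSTERSIZE_BITS;
        }

        int main(void)
        {
                unsigned long long pos = 40000, count = 100000;
                unsigned int cpos = pos >> CLUSTERSIZE_BITS;
                unsigned int clusters = clusters_for_bytes(pos + count) - cpos;

                /* 40000 falls in cluster 1; 140000 needs 5 clusters, so scan 4. */
                printf("cpos=%u clusters=%u\n", cpos, clusters);
                return 0;
        }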
1393 1393
1394 static int ocfs2_write_remove_suid(struct inode *inode) 1394 static int ocfs2_write_remove_suid(struct inode *inode)
1395 { 1395 {
1396 int ret; 1396 int ret;
1397 struct buffer_head *bh = NULL; 1397 struct buffer_head *bh = NULL;
1398 1398
1399 ret = ocfs2_read_inode_block(inode, &bh); 1399 ret = ocfs2_read_inode_block(inode, &bh);
1400 if (ret < 0) { 1400 if (ret < 0) {
1401 mlog_errno(ret); 1401 mlog_errno(ret);
1402 goto out; 1402 goto out;
1403 } 1403 }
1404 1404
1405 ret = __ocfs2_write_remove_suid(inode, bh); 1405 ret = __ocfs2_write_remove_suid(inode, bh);
1406 out: 1406 out:
1407 brelse(bh); 1407 brelse(bh);
1408 return ret; 1408 return ret;
1409 } 1409 }
1410 1410
1411 /* 1411 /*
1412 * Allocate enough extents to cover the region starting at byte offset 1412 * Allocate enough extents to cover the region starting at byte offset
1413 * start for len bytes. Existing extents are skipped, any extents 1413 * start for len bytes. Existing extents are skipped, any extents
1414 * added are marked as "unwritten". 1414 * added are marked as "unwritten".
1415 */ 1415 */
1416 static int ocfs2_allocate_unwritten_extents(struct inode *inode, 1416 static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1417 u64 start, u64 len) 1417 u64 start, u64 len)
1418 { 1418 {
1419 int ret; 1419 int ret;
1420 u32 cpos, phys_cpos, clusters, alloc_size; 1420 u32 cpos, phys_cpos, clusters, alloc_size;
1421 u64 end = start + len; 1421 u64 end = start + len;
1422 struct buffer_head *di_bh = NULL; 1422 struct buffer_head *di_bh = NULL;
1423 1423
1424 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1424 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1425 ret = ocfs2_read_inode_block(inode, &di_bh); 1425 ret = ocfs2_read_inode_block(inode, &di_bh);
1426 if (ret) { 1426 if (ret) {
1427 mlog_errno(ret); 1427 mlog_errno(ret);
1428 goto out; 1428 goto out;
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * Nothing to do if the requested reservation range 1432 * Nothing to do if the requested reservation range
1433 * fits within the inode's inline data. 1433 * fits within the inode's inline data.
1434 */ 1434 */
1435 if (ocfs2_size_fits_inline_data(di_bh, end)) 1435 if (ocfs2_size_fits_inline_data(di_bh, end))
1436 goto out; 1436 goto out;
1437 1437
1438 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1438 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1439 if (ret) { 1439 if (ret) {
1440 mlog_errno(ret); 1440 mlog_errno(ret);
1441 goto out; 1441 goto out;
1442 } 1442 }
1443 } 1443 }
1444 1444
1445 /* 1445 /*
1446 * We consider both start and len to be inclusive. 1446 * We consider both start and len to be inclusive.
1447 */ 1447 */
1448 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1448 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1449 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); 1449 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1450 clusters -= cpos; 1450 clusters -= cpos;
1451 1451
1452 while (clusters) { 1452 while (clusters) {
1453 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1453 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1454 &alloc_size, NULL); 1454 &alloc_size, NULL);
1455 if (ret) { 1455 if (ret) {
1456 mlog_errno(ret); 1456 mlog_errno(ret);
1457 goto out; 1457 goto out;
1458 } 1458 }
1459 1459
1460 /* 1460 /*
1461 * Hole or existing extent len can be arbitrary, so 1461 * Hole or existing extent len can be arbitrary, so
1462 * cap it to our own allocation request. 1462 * cap it to our own allocation request.
1463 */ 1463 */
1464 if (alloc_size > clusters) 1464 if (alloc_size > clusters)
1465 alloc_size = clusters; 1465 alloc_size = clusters;
1466 1466
1467 if (phys_cpos) { 1467 if (phys_cpos) {
1468 /* 1468 /*
1469 * We already have an allocation at this 1469 * We already have an allocation at this
1470 * region so we can safely skip it. 1470 * region so we can safely skip it.
1471 */ 1471 */
1472 goto next; 1472 goto next;
1473 } 1473 }
1474 1474
1475 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); 1475 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1476 if (ret) { 1476 if (ret) {
1477 if (ret != -ENOSPC) 1477 if (ret != -ENOSPC)
1478 mlog_errno(ret); 1478 mlog_errno(ret);
1479 goto out; 1479 goto out;
1480 } 1480 }
1481 1481
1482 next: 1482 next:
1483 cpos += alloc_size; 1483 cpos += alloc_size;
1484 clusters -= alloc_size; 1484 clusters -= alloc_size;
1485 } 1485 }
1486 1486
1487 ret = 0; 1487 ret = 0;
1488 out: 1488 out:
1489 1489
1490 brelse(di_bh); 1490 brelse(di_bh);
1491 return ret; 1491 return ret;
1492 } 1492 }
1493 1493
1494 /* 1494 /*
1495 * Truncate a byte range, avoiding pages within partial clusters. This 1495 * Truncate a byte range, avoiding pages within partial clusters. This
1496 * preserves those pages for the zeroing code to write to. 1496 * preserves those pages for the zeroing code to write to.
1497 */ 1497 */
1498 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1498 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1499 u64 byte_len) 1499 u64 byte_len)
1500 { 1500 {
1501 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1501 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1502 loff_t start, end; 1502 loff_t start, end;
1503 struct address_space *mapping = inode->i_mapping; 1503 struct address_space *mapping = inode->i_mapping;
1504 1504
1505 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1505 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1506 end = byte_start + byte_len; 1506 end = byte_start + byte_len;
1507 end = end & ~(osb->s_clustersize - 1); 1507 end = end & ~(osb->s_clustersize - 1);
1508 1508
1509 if (start < end) { 1509 if (start < end) {
1510 unmap_mapping_range(mapping, start, end - start, 0); 1510 unmap_mapping_range(mapping, start, end - start, 0);
1511 truncate_inode_pages_range(mapping, start, end - 1); 1511 truncate_inode_pages_range(mapping, start, end - 1);
1512 } 1512 }
1513 } 1513 }
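
The alignment above rounds the start up and the end down to cluster boundaries, so pages in the partial edge clusters survive for the zeroing pass. A sketch with an assumed 32KB cluster:

        #include <stdio.h>

        #define CSIZE 32768ull  /* assumed cluster size */

        int main(void)
        {
                unsigned long long byte_start = 10000, byte_len = 200000;
                /* Round start up, end down, as the function above does. */
                unsigned long long start = (byte_start + CSIZE - 1) & ~(CSIZE - 1);
                unsigned long long end = (byte_start + byte_len) & ~(CSIZE - 1);

                if (start < end)
                        printf("drop pages in [%llu, %llu)\n", start, end);
                else
                        printf("range lies within one cluster; nothing to drop\n");
                return 0;
        }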
1514 1514
1515 static int ocfs2_zero_partial_clusters(struct inode *inode, 1515 static int ocfs2_zero_partial_clusters(struct inode *inode,
1516 u64 start, u64 len) 1516 u64 start, u64 len)
1517 { 1517 {
1518 int ret = 0; 1518 int ret = 0;
1519 u64 tmpend, end = start + len; 1519 u64 tmpend, end = start + len;
1520 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1520 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1521 unsigned int csize = osb->s_clustersize; 1521 unsigned int csize = osb->s_clustersize;
1522 handle_t *handle; 1522 handle_t *handle;
1523 1523
1524 /* 1524 /*
1525 * The "start" and "end" values are NOT necessarily part of 1525 * The "start" and "end" values are NOT necessarily part of
1526 * the range whose allocation is being deleted. Rather, this 1526 * the range whose allocation is being deleted. Rather, this
1527 * is what the user passed in with the request. We must zero 1527 * is what the user passed in with the request. We must zero
1528 * partial clusters here. There's no need to worry about 1528 * partial clusters here. There's no need to worry about
1529 * physical allocation - the zeroing code knows to skip holes. 1529 * physical allocation - the zeroing code knows to skip holes.
1530 */ 1530 */
1531 trace_ocfs2_zero_partial_clusters( 1531 trace_ocfs2_zero_partial_clusters(
1532 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1532 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1533 (unsigned long long)start, (unsigned long long)end); 1533 (unsigned long long)start, (unsigned long long)end);
1534 1534
1535 /* 1535 /*
1536 * If both edges are on a cluster boundary then there's no 1536 * If both edges are on a cluster boundary then there's no
1537 * zeroing required as the region is part of the allocation to 1537 * zeroing required as the region is part of the allocation to
1538 * be truncated. 1538 * be truncated.
1539 */ 1539 */
1540 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1540 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1541 goto out; 1541 goto out;
1542 1542
1543 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1543 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1544 if (IS_ERR(handle)) { 1544 if (IS_ERR(handle)) {
1545 ret = PTR_ERR(handle); 1545 ret = PTR_ERR(handle);
1546 mlog_errno(ret); 1546 mlog_errno(ret);
1547 goto out; 1547 goto out;
1548 } 1548 }
1549 1549
1550 /* 1550 /*
1551 * We want to get the byte offset of the end of the 1st cluster. 1551 * We want to get the byte offset of the end of the 1st cluster.
1552 */ 1552 */
1553 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1553 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1554 if (tmpend > end) 1554 if (tmpend > end)
1555 tmpend = end; 1555 tmpend = end;
1556 1556
1557 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, 1557 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
1558 (unsigned long long)tmpend); 1558 (unsigned long long)tmpend);
1559 1559
1560 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1560 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1561 if (ret) 1561 if (ret)
1562 mlog_errno(ret); 1562 mlog_errno(ret);
1563 1563
1564 if (tmpend < end) { 1564 if (tmpend < end) {
1565 /* 1565 /*
1566 * This may make start and end equal, but the zeroing 1566 * This may make start and end equal, but the zeroing
1567 * code will skip any work in that case so there's no 1567 * code will skip any work in that case so there's no
1568 * need to special-case it here. 1568 * need to special-case it here.
1569 */ 1569 */
1570 start = end & ~(osb->s_clustersize - 1); 1570 start = end & ~(osb->s_clustersize - 1);
1571 1571
1572 trace_ocfs2_zero_partial_clusters_range2( 1572 trace_ocfs2_zero_partial_clusters_range2(
1573 (unsigned long long)start, (unsigned long long)end); 1573 (unsigned long long)start, (unsigned long long)end);
1574 1574
1575 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1575 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1576 if (ret) 1576 if (ret)
1577 mlog_errno(ret); 1577 mlog_errno(ret);
1578 } 1578 }
1579 1579
1580 ocfs2_commit_trans(osb, handle); 1580 ocfs2_commit_trans(osb, handle);
1581 out: 1581 out:
1582 return ret; 1582 return ret;
1583 } 1583 }
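
Concretely, at most two sub-ranges are zeroed: from start to the end of its cluster, then from the last cluster boundary up to end. A worked sketch of those two computations, again assuming 32KB clusters:

        #include <stdio.h>

        #define CSIZE 32768ull  /* assumed cluster size */

        int main(void)
        {
                unsigned long long start = 10000, len = 200000;
                unsigned long long end = start + len;   /* 210000 */
                unsigned long long tmpend;

                if ((start & (CSIZE - 1)) == 0 && (end & (CSIZE - 1)) == 0) {
                        printf("cluster aligned; nothing to zero\n");
                        return 0;
                }
                /* Zero from start to the end of its cluster (capped at end)... */
                tmpend = CSIZE + (start & ~(CSIZE - 1));
                if (tmpend > end)
                        tmpend = end;
                printf("zero [%llu, %llu)\n", start, tmpend);
                /* ...then from the last cluster boundary up to end. */
                if (tmpend < end)
                        printf("zero [%llu, %llu)\n", end & ~(CSIZE - 1), end);
                return 0;
        }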
1584 1584
1585 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) 1585 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1586 { 1586 {
1587 int i; 1587 int i;
1588 struct ocfs2_extent_rec *rec = NULL; 1588 struct ocfs2_extent_rec *rec = NULL;
1589 1589
1590 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { 1590 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1591 1591
1592 rec = &el->l_recs[i]; 1592 rec = &el->l_recs[i];
1593 1593
1594 if (le32_to_cpu(rec->e_cpos) < pos) 1594 if (le32_to_cpu(rec->e_cpos) < pos)
1595 break; 1595 break;
1596 } 1596 }
1597 1597
1598 return i; 1598 return i;
1599 } 1599 }
1600 1600
1601 /* 1601 /*
1602 * Helper to calculate the punching pos and length in one run; we handle the 1602 * Helper to calculate the punching pos and length in one run; we handle the
1603 * following three cases in order: 1603 * following three cases in order:
1604 * 1604 *
1605 * - remove the entire record 1605 * - remove the entire record
1606 * - remove a partial record 1606 * - remove a partial record
1607 * - no record needs to be removed (hole-punching completed) 1607 * - no record needs to be removed (hole-punching completed)
1608 */ 1608 */
1609 static void ocfs2_calc_trunc_pos(struct inode *inode, 1609 static void ocfs2_calc_trunc_pos(struct inode *inode,
1610 struct ocfs2_extent_list *el, 1610 struct ocfs2_extent_list *el,
1611 struct ocfs2_extent_rec *rec, 1611 struct ocfs2_extent_rec *rec,
1612 u32 trunc_start, u32 *trunc_cpos, 1612 u32 trunc_start, u32 *trunc_cpos,
1613 u32 *trunc_len, u32 *trunc_end, 1613 u32 *trunc_len, u32 *trunc_end,
1614 u64 *blkno, int *done) 1614 u64 *blkno, int *done)
1615 { 1615 {
1616 int ret = 0; 1616 int ret = 0;
1617 u32 coff, range; 1617 u32 coff, range;
1618 1618
1619 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 1619 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1620 1620
1621 if (le32_to_cpu(rec->e_cpos) >= trunc_start) { 1621 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1622 /* 1622 /*
1623 * remove an entire extent record. 1623 * remove an entire extent record.
1624 */ 1624 */
1625 *trunc_cpos = le32_to_cpu(rec->e_cpos); 1625 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1626 /* 1626 /*
1627 * Skip holes if any. 1627 * Skip holes if any.
1628 */ 1628 */
1629 if (range < *trunc_end) 1629 if (range < *trunc_end)
1630 *trunc_end = range; 1630 *trunc_end = range;
1631 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); 1631 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1632 *blkno = le64_to_cpu(rec->e_blkno); 1632 *blkno = le64_to_cpu(rec->e_blkno);
1633 *trunc_end = le32_to_cpu(rec->e_cpos); 1633 *trunc_end = le32_to_cpu(rec->e_cpos);
1634 } else if (range > trunc_start) { 1634 } else if (range > trunc_start) {
1635 /* 1635 /*
1636 * remove a partial extent record, which means we're 1636 * remove a partial extent record, which means we're
1637 * removing the last extent record. 1637 * removing the last extent record.
1638 */ 1638 */
1639 *trunc_cpos = trunc_start; 1639 *trunc_cpos = trunc_start;
1640 /* 1640 /*
1641 * skip hole if any. 1641 * skip hole if any.
1642 */ 1642 */
1643 if (range < *trunc_end) 1643 if (range < *trunc_end)
1644 *trunc_end = range; 1644 *trunc_end = range;
1645 *trunc_len = *trunc_end - trunc_start; 1645 *trunc_len = *trunc_end - trunc_start;
1646 coff = trunc_start - le32_to_cpu(rec->e_cpos); 1646 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1647 *blkno = le64_to_cpu(rec->e_blkno) + 1647 *blkno = le64_to_cpu(rec->e_blkno) +
1648 ocfs2_clusters_to_blocks(inode->i_sb, coff); 1648 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1649 *trunc_end = trunc_start; 1649 *trunc_end = trunc_start;
1650 } else { 1650 } else {
1651 /* 1651 /*
1652 * There are two possibilities here: 1652 * There are two possibilities here:
1653 * 1653 *
1654 * - last record has been removed 1654 * - last record has been removed
1655 * - trunc_start was within a hole 1655 * - trunc_start was within a hole
1656 * 1656 *
1657 * either case means hole punching is complete. 1657 * either case means hole punching is complete.
1658 */ 1658 */
1659 ret = 1; 1659 ret = 1;
1660 } 1660 }
1661 1661
1662 *done = ret; 1662 *done = ret;
1663 } 1663 }
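
A plain-integer rendering of the three cases may make them concrete; the record and truncation window below are hypothetical:

        #include <stdio.h>

        /* The record covers clusters [e_cpos, range). */
        int main(void)
        {
                unsigned int e_cpos = 100, rec_clusters = 50;   /* record [100, 150) */
                unsigned int trunc_start = 80, trunc_end = 160;
                unsigned int range = e_cpos + rec_clusters;
                unsigned int trunc_cpos, trunc_len;

                if (e_cpos >= trunc_start) {
                        /* Case 1: the whole record goes; clamp past any hole. */
                        trunc_cpos = e_cpos;
                        if (range < trunc_end)
                                trunc_end = range;
                        trunc_len = trunc_end - e_cpos;
                        trunc_end = e_cpos;
                } else if (range > trunc_start) {
                        /* Case 2: only the record's tail goes (the last record). */
                        trunc_cpos = trunc_start;
                        if (range < trunc_end)
                                trunc_end = range;
                        trunc_len = trunc_end - trunc_start;
                        trunc_end = trunc_start;
                } else {
                        /* Case 3: nothing left to remove; punching is done. */
                        printf("done\n");
                        return 0;
                }
                printf("remove %u clusters at cpos %u, next trunc_end %u\n",
                       trunc_len, trunc_cpos, trunc_end);
                return 0;
        }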
1664 1664
1665 static int ocfs2_remove_inode_range(struct inode *inode, 1665 static int ocfs2_remove_inode_range(struct inode *inode,
1666 struct buffer_head *di_bh, u64 byte_start, 1666 struct buffer_head *di_bh, u64 byte_start,
1667 u64 byte_len) 1667 u64 byte_len)
1668 { 1668 {
1669 int ret = 0, flags = 0, done = 0, i; 1669 int ret = 0, flags = 0, done = 0, i;
1670 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; 1670 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1671 u32 cluster_in_el; 1671 u32 cluster_in_el;
1672 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1672 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1673 struct ocfs2_cached_dealloc_ctxt dealloc; 1673 struct ocfs2_cached_dealloc_ctxt dealloc;
1674 struct address_space *mapping = inode->i_mapping; 1674 struct address_space *mapping = inode->i_mapping;
1675 struct ocfs2_extent_tree et; 1675 struct ocfs2_extent_tree et;
1676 struct ocfs2_path *path = NULL; 1676 struct ocfs2_path *path = NULL;
1677 struct ocfs2_extent_list *el = NULL; 1677 struct ocfs2_extent_list *el = NULL;
1678 struct ocfs2_extent_rec *rec = NULL; 1678 struct ocfs2_extent_rec *rec = NULL;
1679 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1679 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1680 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); 1680 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1681 1681
1682 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1682 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1683 ocfs2_init_dealloc_ctxt(&dealloc); 1683 ocfs2_init_dealloc_ctxt(&dealloc);
1684 1684
1685 trace_ocfs2_remove_inode_range( 1685 trace_ocfs2_remove_inode_range(
1686 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1686 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1687 (unsigned long long)byte_start, 1687 (unsigned long long)byte_start,
1688 (unsigned long long)byte_len); 1688 (unsigned long long)byte_len);
1689 1689
1690 if (byte_len == 0) 1690 if (byte_len == 0)
1691 return 0; 1691 return 0;
1692 1692
1693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1694 ret = ocfs2_truncate_inline(inode, di_bh, byte_start, 1694 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1695 byte_start + byte_len, 0); 1695 byte_start + byte_len, 0);
1696 if (ret) { 1696 if (ret) {
1697 mlog_errno(ret); 1697 mlog_errno(ret);
1698 goto out; 1698 goto out;
1699 } 1699 }
1700 /* 1700 /*
1701 * There's no need to get fancy with the page cache 1701 * There's no need to get fancy with the page cache
1702 * truncate of an inline-data inode. We're talking 1702 * truncate of an inline-data inode. We're talking
1703 * about less than a page here, which will be cached 1703 * about less than a page here, which will be cached
1704 * in the dinode buffer anyway. 1704 * in the dinode buffer anyway.
1705 */ 1705 */
1706 unmap_mapping_range(mapping, 0, 0, 0); 1706 unmap_mapping_range(mapping, 0, 0, 0);
1707 truncate_inode_pages(mapping, 0); 1707 truncate_inode_pages(mapping, 0);
1708 goto out; 1708 goto out;
1709 } 1709 }
1710 1710
1711 /* 1711 /*
1712 * For reflinks, we may need to CoW two clusters which might be 1712 * For reflinks, we may need to CoW two clusters which might be
1713 * partially zeroed later, if the hole's start and end offsets fall 1713 * partially zeroed later, if the hole's start and end offsets fall
1714 * within one cluster (i.e. are not exactly aligned to the cluster size). 1714 * within one cluster (i.e. are not exactly aligned to the cluster size).
1715 */ 1715 */
1716 1716
1717 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { 1717 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1718 1718
1719 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); 1719 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1720 if (ret) { 1720 if (ret) {
1721 mlog_errno(ret); 1721 mlog_errno(ret);
1722 goto out; 1722 goto out;
1723 } 1723 }
1724 1724
1725 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); 1725 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1726 if (ret) { 1726 if (ret) {
1727 mlog_errno(ret); 1727 mlog_errno(ret);
1728 goto out; 1728 goto out;
1729 } 1729 }
1730 } 1730 }
1731 1731
1732 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1732 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1733 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; 1733 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1734 cluster_in_el = trunc_end; 1734 cluster_in_el = trunc_end;
1735 1735
1736 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1736 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1737 if (ret) { 1737 if (ret) {
1738 mlog_errno(ret); 1738 mlog_errno(ret);
1739 goto out; 1739 goto out;
1740 } 1740 }
1741 1741
1742 path = ocfs2_new_path_from_et(&et); 1742 path = ocfs2_new_path_from_et(&et);
1743 if (!path) { 1743 if (!path) {
1744 ret = -ENOMEM; 1744 ret = -ENOMEM;
1745 mlog_errno(ret); 1745 mlog_errno(ret);
1746 goto out; 1746 goto out;
1747 } 1747 }
1748 1748
1749 while (trunc_end > trunc_start) { 1749 while (trunc_end > trunc_start) {
1750 1750
1751 ret = ocfs2_find_path(INODE_CACHE(inode), path, 1751 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1752 cluster_in_el); 1752 cluster_in_el);
1753 if (ret) { 1753 if (ret) {
1754 mlog_errno(ret); 1754 mlog_errno(ret);
1755 goto out; 1755 goto out;
1756 } 1756 }
1757 1757
1758 el = path_leaf_el(path); 1758 el = path_leaf_el(path);
1759 1759
1760 i = ocfs2_find_rec(el, trunc_end); 1760 i = ocfs2_find_rec(el, trunc_end);
1761 /* 1761 /*
1762 * Need to go to previous extent block. 1762 * Need to go to previous extent block.
1763 */ 1763 */
1764 if (i < 0) { 1764 if (i < 0) {
1765 if (path->p_tree_depth == 0) 1765 if (path->p_tree_depth == 0)
1766 break; 1766 break;
1767 1767
1768 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 1768 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1769 path, 1769 path,
1770 &cluster_in_el); 1770 &cluster_in_el);
1771 if (ret) { 1771 if (ret) {
1772 mlog_errno(ret); 1772 mlog_errno(ret);
1773 goto out; 1773 goto out;
1774 } 1774 }
1775 1775
1776 /* 1776 /*
1777 * We've reached the leftmost extent block, 1777 * We've reached the leftmost extent block,
1778 * it's safe to leave. 1778 * it's safe to leave.
1779 */ 1779 */
1780 if (cluster_in_el == 0) 1780 if (cluster_in_el == 0)
1781 break; 1781 break;
1782 1782
1783 /* 1783 /*
1784 * The 'pos' used to search for the previous extent block 1784 * The 'pos' used to search for the previous extent block
1785 * is always one cluster less than the actual trunc_end. 1785 * is always one cluster less than the actual trunc_end.
1786 */ 1786 */
1787 trunc_end = cluster_in_el + 1; 1787 trunc_end = cluster_in_el + 1;
1788 1788
1789 ocfs2_reinit_path(path, 1); 1789 ocfs2_reinit_path(path, 1);
1790 1790
1791 continue; 1791 continue;
1792 1792
1793 } else 1793 } else
1794 rec = &el->l_recs[i]; 1794 rec = &el->l_recs[i];
1795 1795
1796 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, 1796 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1797 &trunc_len, &trunc_end, &blkno, &done); 1797 &trunc_len, &trunc_end, &blkno, &done);
1798 if (done) 1798 if (done)
1799 break; 1799 break;
1800 1800
1801 flags = rec->e_flags; 1801 flags = rec->e_flags;
1802 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 1802 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1803 1803
1804 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, 1804 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1805 phys_cpos, trunc_len, flags, 1805 phys_cpos, trunc_len, flags,
1806 &dealloc, refcount_loc); 1806 &dealloc, refcount_loc);
1807 if (ret < 0) { 1807 if (ret < 0) {
1808 mlog_errno(ret); 1808 mlog_errno(ret);
1809 goto out; 1809 goto out;
1810 } 1810 }
1811 1811
1812 cluster_in_el = trunc_end; 1812 cluster_in_el = trunc_end;
1813 1813
1814 ocfs2_reinit_path(path, 1); 1814 ocfs2_reinit_path(path, 1);
1815 } 1815 }
1816 1816
1817 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1817 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1818 1818
1819 out: 1819 out:
1820 ocfs2_schedule_truncate_log_flush(osb, 1); 1820 ocfs2_schedule_truncate_log_flush(osb, 1);
1821 ocfs2_run_deallocs(osb, &dealloc); 1821 ocfs2_run_deallocs(osb, &dealloc);
1822 1822
1823 return ret; 1823 return ret;
1824 } 1824 }
1825 1825
1826 /* 1826 /*
1827 * Parts of this function taken from xfs_change_file_space() 1827 * Parts of this function taken from xfs_change_file_space()
1828 */ 1828 */
1829 static int __ocfs2_change_file_space(struct file *file, struct inode *inode, 1829 static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1830 loff_t f_pos, unsigned int cmd, 1830 loff_t f_pos, unsigned int cmd,
1831 struct ocfs2_space_resv *sr, 1831 struct ocfs2_space_resv *sr,
1832 int change_size) 1832 int change_size)
1833 { 1833 {
1834 int ret; 1834 int ret;
1835 s64 llen; 1835 s64 llen;
1836 loff_t size; 1836 loff_t size;
1837 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1837 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1838 struct buffer_head *di_bh = NULL; 1838 struct buffer_head *di_bh = NULL;
1839 handle_t *handle; 1839 handle_t *handle;
1840 unsigned long long max_off = inode->i_sb->s_maxbytes; 1840 unsigned long long max_off = inode->i_sb->s_maxbytes;
1841 1841
1842 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 1842 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1843 return -EROFS; 1843 return -EROFS;
1844 1844
1845 mutex_lock(&inode->i_mutex); 1845 mutex_lock(&inode->i_mutex);
1846 1846
1847 /* 1847 /*
1848 * This prevents concurrent writes from other nodes 1848 * This prevents concurrent writes from other nodes
1849 */ 1849 */
1850 ret = ocfs2_rw_lock(inode, 1); 1850 ret = ocfs2_rw_lock(inode, 1);
1851 if (ret) { 1851 if (ret) {
1852 mlog_errno(ret); 1852 mlog_errno(ret);
1853 goto out; 1853 goto out;
1854 } 1854 }
1855 1855
1856 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1856 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1857 if (ret) { 1857 if (ret) {
1858 mlog_errno(ret); 1858 mlog_errno(ret);
1859 goto out_rw_unlock; 1859 goto out_rw_unlock;
1860 } 1860 }
1861 1861
1862 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1862 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1863 ret = -EPERM; 1863 ret = -EPERM;
1864 goto out_inode_unlock; 1864 goto out_inode_unlock;
1865 } 1865 }
1866 1866
1867 switch (sr->l_whence) { 1867 switch (sr->l_whence) {
1868 case 0: /*SEEK_SET*/ 1868 case 0: /*SEEK_SET*/
1869 break; 1869 break;
1870 case 1: /*SEEK_CUR*/ 1870 case 1: /*SEEK_CUR*/
1871 sr->l_start += f_pos; 1871 sr->l_start += f_pos;
1872 break; 1872 break;
1873 case 2: /*SEEK_END*/ 1873 case 2: /*SEEK_END*/
1874 sr->l_start += i_size_read(inode); 1874 sr->l_start += i_size_read(inode);
1875 break; 1875 break;
1876 default: 1876 default:
1877 ret = -EINVAL; 1877 ret = -EINVAL;
1878 goto out_inode_unlock; 1878 goto out_inode_unlock;
1879 } 1879 }
1880 sr->l_whence = 0; 1880 sr->l_whence = 0;
1881 1881
1882 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; 1882 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1883 1883
1884 if (sr->l_start < 0 1884 if (sr->l_start < 0
1885 || sr->l_start > max_off 1885 || sr->l_start > max_off
1886 || (sr->l_start + llen) < 0 1886 || (sr->l_start + llen) < 0
1887 || (sr->l_start + llen) > max_off) { 1887 || (sr->l_start + llen) > max_off) {
1888 ret = -EINVAL; 1888 ret = -EINVAL;
1889 goto out_inode_unlock; 1889 goto out_inode_unlock;
1890 } 1890 }
1891 size = sr->l_start + sr->l_len; 1891 size = sr->l_start + sr->l_len;
1892 1892
1893 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1893 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1894 if (sr->l_len <= 0) { 1894 if (sr->l_len <= 0) {
1895 ret = -EINVAL; 1895 ret = -EINVAL;
1896 goto out_inode_unlock; 1896 goto out_inode_unlock;
1897 } 1897 }
1898 } 1898 }
1899 1899
1900 if (file && should_remove_suid(file->f_path.dentry)) { 1900 if (file && should_remove_suid(file->f_path.dentry)) {
1901 ret = __ocfs2_write_remove_suid(inode, di_bh); 1901 ret = __ocfs2_write_remove_suid(inode, di_bh);
1902 if (ret) { 1902 if (ret) {
1903 mlog_errno(ret); 1903 mlog_errno(ret);
1904 goto out_inode_unlock; 1904 goto out_inode_unlock;
1905 } 1905 }
1906 } 1906 }
1907 1907
1908 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1908 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1909 switch (cmd) { 1909 switch (cmd) {
1910 case OCFS2_IOC_RESVSP: 1910 case OCFS2_IOC_RESVSP:
1911 case OCFS2_IOC_RESVSP64: 1911 case OCFS2_IOC_RESVSP64:
1912 /* 1912 /*
1913 * This takes unsigned offsets, but the signed ones we 1913 * This takes unsigned offsets, but the signed ones we
1914 * pass have been checked against overflow above. 1914 * pass have been checked against overflow above.
1915 */ 1915 */
1916 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, 1916 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1917 sr->l_len); 1917 sr->l_len);
1918 break; 1918 break;
1919 case OCFS2_IOC_UNRESVSP: 1919 case OCFS2_IOC_UNRESVSP:
1920 case OCFS2_IOC_UNRESVSP64: 1920 case OCFS2_IOC_UNRESVSP64:
1921 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, 1921 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1922 sr->l_len); 1922 sr->l_len);
1923 break; 1923 break;
1924 default: 1924 default:
1925 ret = -EINVAL; 1925 ret = -EINVAL;
1926 } 1926 }
1927 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1927 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1928 if (ret) { 1928 if (ret) {
1929 mlog_errno(ret); 1929 mlog_errno(ret);
1930 goto out_inode_unlock; 1930 goto out_inode_unlock;
1931 } 1931 }
1932 1932
1933 /* 1933 /*
1934 * We update c/mtime for these changes 1934 * We update c/mtime for these changes
1935 */ 1935 */
1936 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1936 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1937 if (IS_ERR(handle)) { 1937 if (IS_ERR(handle)) {
1938 ret = PTR_ERR(handle); 1938 ret = PTR_ERR(handle);
1939 mlog_errno(ret); 1939 mlog_errno(ret);
1940 goto out_inode_unlock; 1940 goto out_inode_unlock;
1941 } 1941 }
1942 1942
1943 if (change_size && i_size_read(inode) < size) 1943 if (change_size && i_size_read(inode) < size)
1944 i_size_write(inode, size); 1944 i_size_write(inode, size);
1945 1945
1946 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1946 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1947 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 1947 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1948 if (ret < 0) 1948 if (ret < 0)
1949 mlog_errno(ret); 1949 mlog_errno(ret);
1950 1950
1951 if (file && (file->f_flags & O_SYNC)) 1951 if (file && (file->f_flags & O_SYNC))
1952 handle->h_sync = 1; 1952 handle->h_sync = 1;
1953 1953
1954 ocfs2_commit_trans(osb, handle); 1954 ocfs2_commit_trans(osb, handle);
1955 1955
1956 out_inode_unlock: 1956 out_inode_unlock:
1957 brelse(di_bh); 1957 brelse(di_bh);
1958 ocfs2_inode_unlock(inode, 1); 1958 ocfs2_inode_unlock(inode, 1);
1959 out_rw_unlock: 1959 out_rw_unlock:
1960 ocfs2_rw_unlock(inode, 1); 1960 ocfs2_rw_unlock(inode, 1);
1961 1961
1962 out: 1962 out:
1963 mutex_unlock(&inode->i_mutex); 1963 mutex_unlock(&inode->i_mutex);
1964 return ret; 1964 return ret;
1965 } 1965 }
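
The l_whence normalization above mirrors lseek()'s whence semantics (fitting, given this commit). A standalone restatement of just that switch:

        #include <stdio.h>

        /* Resolve a reservation start to an absolute offset, as the
         * l_whence switch in __ocfs2_change_file_space() does; the
         * values in main() are hypothetical. */
        static long long resolve_start(int whence, long long l_start,
                                       long long f_pos, long long i_size)
        {
                switch (whence) {
                case 0: return l_start;          /* SEEK_SET: absolute */
                case 1: return l_start + f_pos;  /* SEEK_CUR: from file pos */
                case 2: return l_start + i_size; /* SEEK_END: from EOF */
                default: return -1;              /* rejected with -EINVAL */
                }
        }

        int main(void)
        {
                /* 4KB back from a 1MB EOF -> 1044480. */
                printf("%lld\n", resolve_start(2, -4096, 0, 1 << 20));
                return 0;
        }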
1966 1966
1967 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1967 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1968 struct ocfs2_space_resv *sr) 1968 struct ocfs2_space_resv *sr)
1969 { 1969 {
1970 struct inode *inode = file->f_path.dentry->d_inode; 1970 struct inode *inode = file->f_path.dentry->d_inode;
1971 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1971 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1972 int ret; 1972 int ret;
1973 1973
1974 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1974 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1975 !ocfs2_writes_unwritten_extents(osb)) 1975 !ocfs2_writes_unwritten_extents(osb))
1976 return -ENOTTY; 1976 return -ENOTTY;
1977 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1977 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1978 !ocfs2_sparse_alloc(osb)) 1978 !ocfs2_sparse_alloc(osb))
1979 return -ENOTTY; 1979 return -ENOTTY;
1980 1980
1981 if (!S_ISREG(inode->i_mode)) 1981 if (!S_ISREG(inode->i_mode))
1982 return -EINVAL; 1982 return -EINVAL;
1983 1983
1984 if (!(file->f_mode & FMODE_WRITE)) 1984 if (!(file->f_mode & FMODE_WRITE))
1985 return -EBADF; 1985 return -EBADF;
1986 1986
1987 ret = mnt_want_write_file(file); 1987 ret = mnt_want_write_file(file);
1988 if (ret) 1988 if (ret)
1989 return ret; 1989 return ret;
1990 ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1990 ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1991 mnt_drop_write_file(file); 1991 mnt_drop_write_file(file);
1992 return ret; 1992 return ret;
1993 } 1993 }
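
For completeness, a hedged userspace sketch of the UNRESVSP64 path. It assumes struct ocfs2_space_resv and OCFS2_IOC_UNRESVSP64 are available from the ocfs2 headers (in the kernel tree they live in ocfs2_fs.h), and the file path is hypothetical:

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>
        /* Assumes struct ocfs2_space_resv and OCFS2_IOC_UNRESVSP64
         * come from the ocfs2 headers (ocfs2_fs.h in-tree). */

        int main(void)
        {
                struct ocfs2_space_resv sr = {
                        .l_whence = 0,          /* l_start is absolute */
                        .l_start = 1 << 20,
                        .l_len = 1 << 20,       /* deallocate 1MB at 1MB */
                };
                int fd = open("/mnt/ocfs2/testfile", O_WRONLY);

                if (fd < 0 || ioctl(fd, OCFS2_IOC_UNRESVSP64, &sr) < 0)
                        perror("unresvsp64");
                close(fd);
                return 0;
        }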
1994 1994
1995 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, 1995 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1996 loff_t len) 1996 loff_t len)
1997 { 1997 {
1998 struct inode *inode = file->f_path.dentry->d_inode; 1998 struct inode *inode = file->f_path.dentry->d_inode;
1999 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1999 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2000 struct ocfs2_space_resv sr; 2000 struct ocfs2_space_resv sr;
2001 int change_size = 1; 2001 int change_size = 1;
2002 int cmd = OCFS2_IOC_RESVSP64; 2002 int cmd = OCFS2_IOC_RESVSP64;
2003 2003
2004 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2004 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2005 return -EOPNOTSUPP; 2005 return -EOPNOTSUPP;
2006 if (!ocfs2_writes_unwritten_extents(osb)) 2006 if (!ocfs2_writes_unwritten_extents(osb))
2007 return -EOPNOTSUPP; 2007 return -EOPNOTSUPP;
2008 2008
2009 if (mode & FALLOC_FL_KEEP_SIZE) 2009 if (mode & FALLOC_FL_KEEP_SIZE)
2010 change_size = 0; 2010 change_size = 0;
2011 2011
2012 if (mode & FALLOC_FL_PUNCH_HOLE) 2012 if (mode & FALLOC_FL_PUNCH_HOLE)
2013 cmd = OCFS2_IOC_UNRESVSP64; 2013 cmd = OCFS2_IOC_UNRESVSP64;
2014 2014
2015 sr.l_whence = 0; 2015 sr.l_whence = 0;
2016 sr.l_start = (s64)offset; 2016 sr.l_start = (s64)offset;
2017 sr.l_len = (s64)len; 2017 sr.l_len = (s64)len;
2018 2018
2019 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, 2019 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2020 change_size); 2020 change_size);
2021 } 2021 }
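
So on ocfs2 a plain fallocate() reserves unwritten extents, FALLOC_FL_KEEP_SIZE leaves i_size untouched, and FALLOC_FL_PUNCH_HOLE routes to the deallocation path. A userspace sketch (hypothetical path):

        #define _GNU_SOURCE             /* for fallocate() */
        #include <fcntl.h>
        #include <linux/falloc.h>       /* FALLOC_FL_* flags */
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/mnt/ocfs2/testfile", O_WRONLY);

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                /* Reserve 4MB of unwritten extents without moving i_size. */
                if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 << 20) < 0)
                        perror("fallocate");
                /* Punch the first 1MB back out; i_size again stays put. */
                if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                              0, 1 << 20) < 0)
                        perror("punch");
                close(fd);
                return 0;
        }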
2022 2022
2023 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2023 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2024 size_t count) 2024 size_t count)
2025 { 2025 {
2026 int ret = 0; 2026 int ret = 0;
2027 unsigned int extent_flags; 2027 unsigned int extent_flags;
2028 u32 cpos, clusters, extent_len, phys_cpos; 2028 u32 cpos, clusters, extent_len, phys_cpos;
2029 struct super_block *sb = inode->i_sb; 2029 struct super_block *sb = inode->i_sb;
2030 2030
2031 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || 2031 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2032 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || 2032 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
2033 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2033 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2034 return 0; 2034 return 0;
2035 2035
2036 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 2036 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2037 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 2037 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2038 2038
2039 while (clusters) { 2039 while (clusters) {
2040 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 2040 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2041 &extent_flags); 2041 &extent_flags);
2042 if (ret < 0) { 2042 if (ret < 0) {
2043 mlog_errno(ret); 2043 mlog_errno(ret);
2044 goto out; 2044 goto out;
2045 } 2045 }
2046 2046
2047 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { 2047 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2048 ret = 1; 2048 ret = 1;
2049 break; 2049 break;
2050 } 2050 }
2051 2051
2052 if (extent_len > clusters) 2052 if (extent_len > clusters)
2053 extent_len = clusters; 2053 extent_len = clusters;
2054 2054
2055 clusters -= extent_len; 2055 clusters -= extent_len;
2056 cpos += extent_len; 2056 cpos += extent_len;
2057 } 2057 }
2058 out: 2058 out:
2059 return ret; 2059 return ret;
2060 } 2060 }
2061 2061
2062 static void ocfs2_aiodio_wait(struct inode *inode) 2062 static void ocfs2_aiodio_wait(struct inode *inode)
2063 { 2063 {
2064 wait_queue_head_t *wq = ocfs2_ioend_wq(inode); 2064 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2065 2065
2066 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); 2066 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2067 } 2067 }
2068 2068
2069 static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) 2069 static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2070 { 2070 {
2071 int blockmask = inode->i_sb->s_blocksize - 1; 2071 int blockmask = inode->i_sb->s_blocksize - 1;
2072 loff_t final_size = pos + count; 2072 loff_t final_size = pos + count;
2073 2073
2074 if ((pos & blockmask) || (final_size & blockmask)) 2074 if ((pos & blockmask) || (final_size & blockmask))
2075 return 1; 2075 return 1;
2076 return 0; 2076 return 0;
2077 } 2077 }
2078 2078
2079 static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2079 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2080 struct file *file, 2080 struct file *file,
2081 loff_t pos, size_t count, 2081 loff_t pos, size_t count,
2082 int *meta_level) 2082 int *meta_level)
2083 { 2083 {
2084 int ret; 2084 int ret;
2085 struct buffer_head *di_bh = NULL; 2085 struct buffer_head *di_bh = NULL;
2086 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 2086 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2087 u32 clusters = 2087 u32 clusters =
2088 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; 2088 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2089 2089
2090 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2090 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2091 if (ret) { 2091 if (ret) {
2092 mlog_errno(ret); 2092 mlog_errno(ret);
2093 goto out; 2093 goto out;
2094 } 2094 }
2095 2095
2096 *meta_level = 1; 2096 *meta_level = 1;
2097 2097
2098 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); 2098 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2099 if (ret) 2099 if (ret)
2100 mlog_errno(ret); 2100 mlog_errno(ret);
2101 out: 2101 out:
2102 brelse(di_bh); 2102 brelse(di_bh);
2103 return ret; 2103 return ret;
2104 } 2104 }
2105 2105
2106 static int ocfs2_prepare_inode_for_write(struct file *file, 2106 static int ocfs2_prepare_inode_for_write(struct file *file,
2107 loff_t *ppos, 2107 loff_t *ppos,
2108 size_t count, 2108 size_t count,
2109 int appending, 2109 int appending,
2110 int *direct_io, 2110 int *direct_io,
2111 int *has_refcount) 2111 int *has_refcount)
2112 { 2112 {
2113 int ret = 0, meta_level = 0; 2113 int ret = 0, meta_level = 0;
2114 struct dentry *dentry = file->f_path.dentry; 2114 struct dentry *dentry = file->f_path.dentry;
2115 struct inode *inode = dentry->d_inode; 2115 struct inode *inode = dentry->d_inode;
2116 loff_t saved_pos = 0, end; 2116 loff_t saved_pos = 0, end;
2117 2117
2118 /* 2118 /*
2119 * We start with a read-level meta lock and only jump to an 2119 * We start with a read-level meta lock and only jump to an
2120 * exclusive lock if we need to make modifications here. 2120 * exclusive lock if we need to make modifications here.
2121 */ 2121 */
2122 for(;;) { 2122 for(;;) {
2123 ret = ocfs2_inode_lock(inode, NULL, meta_level); 2123 ret = ocfs2_inode_lock(inode, NULL, meta_level);
2124 if (ret < 0) { 2124 if (ret < 0) {
2125 meta_level = -1; 2125 meta_level = -1;
2126 mlog_errno(ret); 2126 mlog_errno(ret);
2127 goto out; 2127 goto out;
2128 } 2128 }
2129 2129
2130 /* Clear suid / sgid if necessary. We do this here 2130 /* Clear suid / sgid if necessary. We do this here
2131 * instead of later in the write path because 2131 * instead of later in the write path because
2132 * remove_suid() calls ->setattr without any hint that 2132 * remove_suid() calls ->setattr without any hint that
2133 * we may have already done our cluster locking. Since 2133 * we may have already done our cluster locking. Since
2134 * ocfs2_setattr() *must* take cluster locks to 2134 * ocfs2_setattr() *must* take cluster locks to
2135 * proceed, this will lead us to recursively lock the 2135 * proceed, this will lead us to recursively lock the
2136 * inode. There's also the dinode i_size state which 2136 * inode. There's also the dinode i_size state which
2137 * can be lost via setattr during extending writes (we 2137 * can be lost via setattr during extending writes (we
2138 * set inode->i_size at the end of a write). */ 2138 * set inode->i_size at the end of a write). */
2139 if (should_remove_suid(dentry)) { 2139 if (should_remove_suid(dentry)) {
2140 if (meta_level == 0) { 2140 if (meta_level == 0) {
2141 ocfs2_inode_unlock(inode, meta_level); 2141 ocfs2_inode_unlock(inode, meta_level);
2142 meta_level = 1; 2142 meta_level = 1;
2143 continue; 2143 continue;
2144 } 2144 }
2145 2145
2146 ret = ocfs2_write_remove_suid(inode); 2146 ret = ocfs2_write_remove_suid(inode);
2147 if (ret < 0) { 2147 if (ret < 0) {
2148 mlog_errno(ret); 2148 mlog_errno(ret);
2149 goto out_unlock; 2149 goto out_unlock;
2150 } 2150 }
2151 } 2151 }
2152 2152
2153 /* work on a copy of ppos until we're sure that we won't have 2153 /* work on a copy of ppos until we're sure that we won't have
2154 * to recalculate it due to relocking. */ 2154 * to recalculate it due to relocking. */
2155 if (appending) 2155 if (appending)
2156 saved_pos = i_size_read(inode); 2156 saved_pos = i_size_read(inode);
2157 else 2157 else
2158 saved_pos = *ppos; 2158 saved_pos = *ppos;
2159 2159
2160 end = saved_pos + count; 2160 end = saved_pos + count;
2161 2161
2162 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); 2162 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
2163 if (ret == 1) { 2163 if (ret == 1) {
2164 ocfs2_inode_unlock(inode, meta_level); 2164 ocfs2_inode_unlock(inode, meta_level);
2165 meta_level = -1; 2165 meta_level = -1;
2166 2166
2167 ret = ocfs2_prepare_inode_for_refcount(inode, 2167 ret = ocfs2_prepare_inode_for_refcount(inode,
2168 file, 2168 file,
2169 saved_pos, 2169 saved_pos,
2170 count, 2170 count,
2171 &meta_level); 2171 &meta_level);
2172 if (has_refcount) 2172 if (has_refcount)
2173 *has_refcount = 1; 2173 *has_refcount = 1;
2174 if (direct_io) 2174 if (direct_io)
2175 *direct_io = 0; 2175 *direct_io = 0;
2176 } 2176 }
2177 2177
2178 if (ret < 0) { 2178 if (ret < 0) {
2179 mlog_errno(ret); 2179 mlog_errno(ret);
2180 goto out_unlock; 2180 goto out_unlock;
2181 } 2181 }
2182 2182
2183 /* 2183 /*
2184 * Skip the O_DIRECT checks if we don't need 2184 * Skip the O_DIRECT checks if we don't need
2185 * them. 2185 * them.
2186 */ 2186 */
2187 if (!direct_io || !(*direct_io)) 2187 if (!direct_io || !(*direct_io))
2188 break; 2188 break;
2189 2189
2190 /* 2190 /*
2191 * There's no sane way to do direct writes to an inode 2191 * There's no sane way to do direct writes to an inode
2192 * with inline data. 2192 * with inline data.
2193 */ 2193 */
2194 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2194 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2195 *direct_io = 0; 2195 *direct_io = 0;
2196 break; 2196 break;
2197 } 2197 }
2198 2198
2199 /* 2199 /*
2200 * Allowing concurrent direct writes means 2200 * Allowing concurrent direct writes means
2201 * i_size changes wouldn't be synchronized, so 2201 * i_size changes wouldn't be synchronized, so
2202 * one node could wind up truncating another 2202 * one node could wind up truncating another
2203 * node's writes. 2203 * node's writes.
2204 */ 2204 */
2205 if (end > i_size_read(inode)) { 2205 if (end > i_size_read(inode)) {
2206 *direct_io = 0; 2206 *direct_io = 0;
2207 break; 2207 break;
2208 } 2208 }
2209 2209
2210 /* 2210 /*
2211 * We don't fill holes during direct io, so 2211 * We don't fill holes during direct io, so
2212 * check for them here. If any are found, the 2212 * check for them here. If any are found, the
2213 * caller will have to retake some cluster 2213 * caller will have to retake some cluster
2214 * locks and initiate the io as buffered. 2214 * locks and initiate the io as buffered.
2215 */ 2215 */
2216 ret = ocfs2_check_range_for_holes(inode, saved_pos, count); 2216 ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
2217 if (ret == 1) { 2217 if (ret == 1) {
2218 *direct_io = 0; 2218 *direct_io = 0;
2219 ret = 0; 2219 ret = 0;
2220 } else if (ret < 0) 2220 } else if (ret < 0)
2221 mlog_errno(ret); 2221 mlog_errno(ret);
2222 break; 2222 break;
2223 } 2223 }
2224 2224
2225 if (appending) 2225 if (appending)
2226 *ppos = saved_pos; 2226 *ppos = saved_pos;
2227 2227
2228 out_unlock: 2228 out_unlock:
2229 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2229 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2230 saved_pos, appending, count, 2230 saved_pos, appending, count,
2231 direct_io, has_refcount); 2231 direct_io, has_refcount);
2232 2232
2233 if (meta_level >= 0) 2233 if (meta_level >= 0)
2234 ocfs2_inode_unlock(inode, meta_level); 2234 ocfs2_inode_unlock(inode, meta_level);
2235 2235
2236 out: 2236 out:
2237 return ret; 2237 return ret;
2238 } 2238 }
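The loop above uses a drop-and-upgrade locking pattern: the checks run under the shared meta lock (meta_level 0), and when removing the SUID bit demands the exclusive level, the lock is dropped and retaken at level 1, after which every check is redone because the inode may have changed while unlocked. A minimal self-contained sketch of that shape, with hypothetical take_lock()/drop_lock()/needs_exclusive() stand-ins rather than any real ocfs2 call:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for cluster lock operations. */
static void take_lock(int level) { printf("lock taken at level %d\n", level); }
static void drop_lock(int level) { printf("lock dropped at level %d\n", level); }
static bool needs_exclusive(void) { return true; }

int main(void)
{
	int level = 0;		/* 0 = shared, 1 = exclusive */

	for (;;) {
		take_lock(level);
		if (level == 0 && needs_exclusive()) {
			/* Lock too weak: drop it, upgrade, and redo all
			 * checks, since state may move while unlocked. */
			drop_lock(level);
			level = 1;
			continue;
		}
		break;		/* all checks passed at this level */
	}
	drop_lock(level);
	return 0;
}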
2239 2239
2240 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 2240 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2241 const struct iovec *iov, 2241 const struct iovec *iov,
2242 unsigned long nr_segs, 2242 unsigned long nr_segs,
2243 loff_t pos) 2243 loff_t pos)
2244 { 2244 {
2245 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 2245 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
2246 int can_do_direct, has_refcount = 0; 2246 int can_do_direct, has_refcount = 0;
2247 ssize_t written = 0; 2247 ssize_t written = 0;
2248 size_t ocount; /* original count */ 2248 size_t ocount; /* original count */
2249 size_t count; /* after file limit checks */ 2249 size_t count; /* after file limit checks */
2250 loff_t old_size, *ppos = &iocb->ki_pos; 2250 loff_t old_size, *ppos = &iocb->ki_pos;
2251 u32 old_clusters; 2251 u32 old_clusters;
2252 struct file *file = iocb->ki_filp; 2252 struct file *file = iocb->ki_filp;
2253 struct inode *inode = file->f_path.dentry->d_inode; 2253 struct inode *inode = file->f_path.dentry->d_inode;
2254 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2254 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2255 int full_coherency = !(osb->s_mount_opt & 2255 int full_coherency = !(osb->s_mount_opt &
2256 OCFS2_MOUNT_COHERENCY_BUFFERED); 2256 OCFS2_MOUNT_COHERENCY_BUFFERED);
2257 int unaligned_dio = 0; 2257 int unaligned_dio = 0;
2258 2258
2259 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2259 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2260 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2260 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2261 file->f_path.dentry->d_name.len, 2261 file->f_path.dentry->d_name.len,
2262 file->f_path.dentry->d_name.name, 2262 file->f_path.dentry->d_name.name,
2263 (unsigned int)nr_segs); 2263 (unsigned int)nr_segs);
2264 2264
2265 if (iocb->ki_left == 0) 2265 if (iocb->ki_left == 0)
2266 return 0; 2266 return 0;
2267 2267
2268 sb_start_write(inode->i_sb); 2268 sb_start_write(inode->i_sb);
2269 2269
2270 appending = file->f_flags & O_APPEND ? 1 : 0; 2270 appending = file->f_flags & O_APPEND ? 1 : 0;
2271 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 2271 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
2272 2272
2273 mutex_lock(&inode->i_mutex); 2273 mutex_lock(&inode->i_mutex);
2274 2274
2275 ocfs2_iocb_clear_sem_locked(iocb); 2275 ocfs2_iocb_clear_sem_locked(iocb);
2276 2276
2277 relock: 2277 relock:
2278 /* to match setattr's i_mutex -> rw_lock ordering */ 2278 /* to match setattr's i_mutex -> rw_lock ordering */
2279 if (direct_io) { 2279 if (direct_io) {
2280 have_alloc_sem = 1; 2280 have_alloc_sem = 1;
2281 /* communicate with ocfs2_dio_end_io */ 2281 /* communicate with ocfs2_dio_end_io */
2282 ocfs2_iocb_set_sem_locked(iocb); 2282 ocfs2_iocb_set_sem_locked(iocb);
2283 } 2283 }
2284 2284
2285 /* 2285 /*
2286 * Concurrent O_DIRECT writes are allowed with 2286 * Concurrent O_DIRECT writes are allowed with
2287 * mount option "coherency=buffered". 2287 * mount option "coherency=buffered".
2288 */ 2288 */
2289 rw_level = (!direct_io || full_coherency); 2289 rw_level = (!direct_io || full_coherency);
2290 2290
2291 ret = ocfs2_rw_lock(inode, rw_level); 2291 ret = ocfs2_rw_lock(inode, rw_level);
2292 if (ret < 0) { 2292 if (ret < 0) {
2293 mlog_errno(ret); 2293 mlog_errno(ret);
2294 goto out_sems; 2294 goto out_sems;
2295 } 2295 }
2296 2296
2297 /* 2297 /*
2298 * O_DIRECT writes with "coherency=full" need to take EX cluster 2298 * O_DIRECT writes with "coherency=full" need to take EX cluster
2299 * inode_lock to guarantee coherency. 2299 * inode_lock to guarantee coherency.
2300 */ 2300 */
2301 if (direct_io && full_coherency) { 2301 if (direct_io && full_coherency) {
2302 /* 2302 /*
2303 * We need to take and drop the inode lock to force 2303 * We need to take and drop the inode lock to force
2304 * other nodes to drop their caches. Buffered I/O 2304 * other nodes to drop their caches. Buffered I/O
2305 * already does this in write_begin(). 2305 * already does this in write_begin().
2306 */ 2306 */
2307 ret = ocfs2_inode_lock(inode, NULL, 1); 2307 ret = ocfs2_inode_lock(inode, NULL, 1);
2308 if (ret < 0) { 2308 if (ret < 0) {
2309 mlog_errno(ret); 2309 mlog_errno(ret);
2310 goto out_sems; 2310 goto out_sems;
2311 } 2311 }
2312 2312
2313 ocfs2_inode_unlock(inode, 1); 2313 ocfs2_inode_unlock(inode, 1);
2314 } 2314 }
2315 2315
2316 can_do_direct = direct_io; 2316 can_do_direct = direct_io;
2317 ret = ocfs2_prepare_inode_for_write(file, ppos, 2317 ret = ocfs2_prepare_inode_for_write(file, ppos,
2318 iocb->ki_left, appending, 2318 iocb->ki_left, appending,
2319 &can_do_direct, &has_refcount); 2319 &can_do_direct, &has_refcount);
2320 if (ret < 0) { 2320 if (ret < 0) {
2321 mlog_errno(ret); 2321 mlog_errno(ret);
2322 goto out; 2322 goto out;
2323 } 2323 }
2324 2324
2325 if (direct_io && !is_sync_kiocb(iocb)) 2325 if (direct_io && !is_sync_kiocb(iocb))
2326 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, 2326 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
2327 *ppos); 2327 *ppos);
2328 2328
2329 /* 2329 /*
2330 * We can't complete the direct I/O as requested, fall back to 2330 * We can't complete the direct I/O as requested, fall back to
2331 * buffered I/O. 2331 * buffered I/O.
2332 */ 2332 */
2333 if (direct_io && !can_do_direct) { 2333 if (direct_io && !can_do_direct) {
2334 ocfs2_rw_unlock(inode, rw_level); 2334 ocfs2_rw_unlock(inode, rw_level);
2335 2335
2336 have_alloc_sem = 0; 2336 have_alloc_sem = 0;
2337 rw_level = -1; 2337 rw_level = -1;
2338 2338
2339 direct_io = 0; 2339 direct_io = 0;
2340 goto relock; 2340 goto relock;
2341 } 2341 }
2342 2342
2343 if (unaligned_dio) { 2343 if (unaligned_dio) {
2344 /* 2344 /*
2345 * Wait on previous unaligned aio to complete before 2345 * Wait on previous unaligned aio to complete before
2346 * proceeding. 2346 * proceeding.
2347 */ 2347 */
2348 ocfs2_aiodio_wait(inode); 2348 ocfs2_aiodio_wait(inode);
2349 2349
2350 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ 2350 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2351 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); 2351 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2352 ocfs2_iocb_set_unaligned_aio(iocb); 2352 ocfs2_iocb_set_unaligned_aio(iocb);
2353 } 2353 }
2354 2354
2355 /* 2355 /*
2356 * To later detect whether a journal commit for sync writes is 2356 * To later detect whether a journal commit for sync writes is
2357 * necessary, we sample i_size and cluster count here. 2357 * necessary, we sample i_size and cluster count here.
2358 */ 2358 */
2359 old_size = i_size_read(inode); 2359 old_size = i_size_read(inode);
2360 old_clusters = OCFS2_I(inode)->ip_clusters; 2360 old_clusters = OCFS2_I(inode)->ip_clusters;
2361 2361
2362 /* communicate with ocfs2_dio_end_io */ 2362 /* communicate with ocfs2_dio_end_io */
2363 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2363 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2364 2364
2365 ret = generic_segment_checks(iov, &nr_segs, &ocount, 2365 ret = generic_segment_checks(iov, &nr_segs, &ocount,
2366 VERIFY_READ); 2366 VERIFY_READ);
2367 if (ret) 2367 if (ret)
2368 goto out_dio; 2368 goto out_dio;
2369 2369
2370 count = ocount; 2370 count = ocount;
2371 ret = generic_write_checks(file, ppos, &count, 2371 ret = generic_write_checks(file, ppos, &count,
2372 S_ISBLK(inode->i_mode)); 2372 S_ISBLK(inode->i_mode));
2373 if (ret) 2373 if (ret)
2374 goto out_dio; 2374 goto out_dio;
2375 2375
2376 if (direct_io) { 2376 if (direct_io) {
2377 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2377 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2378 ppos, count, ocount); 2378 ppos, count, ocount);
2379 if (written < 0) { 2379 if (written < 0) {
2380 ret = written; 2380 ret = written;
2381 goto out_dio; 2381 goto out_dio;
2382 } 2382 }
2383 } else { 2383 } else {
2384 current->backing_dev_info = file->f_mapping->backing_dev_info; 2384 current->backing_dev_info = file->f_mapping->backing_dev_info;
2385 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, 2385 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2386 ppos, count, 0); 2386 ppos, count, 0);
2387 current->backing_dev_info = NULL; 2387 current->backing_dev_info = NULL;
2388 } 2388 }
2389 2389
2390 out_dio: 2390 out_dio:
2391 /* buffered aio wouldn't have proper lock coverage today */ 2391 /* buffered aio wouldn't have proper lock coverage today */
2392 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2392 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2393 2393
2394 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2394 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2395 ((file->f_flags & O_DIRECT) && !direct_io)) { 2395 ((file->f_flags & O_DIRECT) && !direct_io)) {
2396 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2396 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2397 pos + count - 1); 2397 pos + count - 1);
2398 if (ret < 0) 2398 if (ret < 0)
2399 written = ret; 2399 written = ret;
2400 2400
2401 if (!ret && ((old_size != i_size_read(inode)) || 2401 if (!ret && ((old_size != i_size_read(inode)) ||
2402 (old_clusters != OCFS2_I(inode)->ip_clusters) || 2402 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2403 has_refcount)) { 2403 has_refcount)) {
2404 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2404 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2405 if (ret < 0) 2405 if (ret < 0)
2406 written = ret; 2406 written = ret;
2407 } 2407 }
2408 2408
2409 if (!ret) 2409 if (!ret)
2410 ret = filemap_fdatawait_range(file->f_mapping, pos, 2410 ret = filemap_fdatawait_range(file->f_mapping, pos,
2411 pos + count - 1); 2411 pos + count - 1);
2412 } 2412 }
2413 2413
2414 /* 2414 /*
2415 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io 2415 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2416 * function pointer which is called when o_direct io completes so that 2416 * function pointer which is called when o_direct io completes so that
2417 * it can unlock our rw lock. 2417 * it can unlock our rw lock.
2418 * Unfortunately there are error cases which call end_io and others 2418 * Unfortunately there are error cases which call end_io and others
2419 * that don't, so we don't have to unlock the rw_lock if either an 2419 * that don't, so we don't have to unlock the rw_lock if either an
2420 * async dio is going to do it in the future or an end_io after an 2420 * async dio is going to do it in the future or an end_io after an
2421 * error has already done it. 2421 * error has already done it.
2422 */ 2422 */
2423 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2423 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2424 rw_level = -1; 2424 rw_level = -1;
2425 have_alloc_sem = 0; 2425 have_alloc_sem = 0;
2426 unaligned_dio = 0; 2426 unaligned_dio = 0;
2427 } 2427 }
2428 2428
2429 if (unaligned_dio) { 2429 if (unaligned_dio) {
2430 ocfs2_iocb_clear_unaligned_aio(iocb); 2430 ocfs2_iocb_clear_unaligned_aio(iocb);
2431 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); 2431 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
2432 } 2432 }
2433 2433
2434 out: 2434 out:
2435 if (rw_level != -1) 2435 if (rw_level != -1)
2436 ocfs2_rw_unlock(inode, rw_level); 2436 ocfs2_rw_unlock(inode, rw_level);
2437 2437
2438 out_sems: 2438 out_sems:
2439 if (have_alloc_sem) 2439 if (have_alloc_sem)
2440 ocfs2_iocb_clear_sem_locked(iocb); 2440 ocfs2_iocb_clear_sem_locked(iocb);
2441 2441
2442 mutex_unlock(&inode->i_mutex); 2442 mutex_unlock(&inode->i_mutex);
2443 sb_end_write(inode->i_sb); 2443 sb_end_write(inode->i_sb);
2444 2444
2445 if (written) 2445 if (written)
2446 ret = written; 2446 ret = written;
2447 return ret; 2447 return ret;
2448 } 2448 }
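The flush condition in the out_dio block above is worth restating: writeback plus a wait are forced whenever durability was requested (O_DSYNC, an O_DIRECT flag, or a sync inode) but the data actually went through the buffered path, either by design or via the direct-I/O fallback. A hedged restatement of that predicate, with an illustrative must_flush() name that is not part of the kernel:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the condition guarding filemap_fdatawrite_range() above. */
static bool must_flush(bool o_dsync, bool o_direct, bool is_sync,
		       bool did_direct_io)
{
	return (o_dsync && !did_direct_io) ||
	       is_sync ||
	       (o_direct && !did_direct_io);
}

int main(void)
{
	/* O_DIRECT was requested but the write fell back to buffered:
	 * the data sits in the page cache and must be flushed. */
	printf("fallback flush: %d\n", must_flush(false, true, false, false));
	/* A plain buffered write with no sync semantics: no flush here. */
	printf("plain buffered: %d\n", must_flush(false, false, false, false));
	return 0;
}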
2449 2449
2450 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 2450 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2451 struct file *out, 2451 struct file *out,
2452 struct splice_desc *sd) 2452 struct splice_desc *sd)
2453 { 2453 {
2454 int ret; 2454 int ret;
2455 2455
2456 ret = ocfs2_prepare_inode_for_write(out, &sd->pos, 2456 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2457 sd->total_len, 0, NULL, NULL); 2457 sd->total_len, 0, NULL, NULL);
2458 if (ret < 0) { 2458 if (ret < 0) {
2459 mlog_errno(ret); 2459 mlog_errno(ret);
2460 return ret; 2460 return ret;
2461 } 2461 }
2462 2462
2463 return splice_from_pipe_feed(pipe, sd, pipe_to_file); 2463 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
2464 } 2464 }
2465 2465
2466 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2466 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2467 struct file *out, 2467 struct file *out,
2468 loff_t *ppos, 2468 loff_t *ppos,
2469 size_t len, 2469 size_t len,
2470 unsigned int flags) 2470 unsigned int flags)
2471 { 2471 {
2472 int ret; 2472 int ret;
2473 struct address_space *mapping = out->f_mapping; 2473 struct address_space *mapping = out->f_mapping;
2474 struct inode *inode = mapping->host; 2474 struct inode *inode = mapping->host;
2475 struct splice_desc sd = { 2475 struct splice_desc sd = {
2476 .total_len = len, 2476 .total_len = len,
2477 .flags = flags, 2477 .flags = flags,
2478 .pos = *ppos, 2478 .pos = *ppos,
2479 .u.file = out, 2479 .u.file = out,
2480 }; 2480 };
2481 2481
2482 2482
2483 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, 2483 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
2484 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2484 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2485 out->f_path.dentry->d_name.len, 2485 out->f_path.dentry->d_name.len,
2486 out->f_path.dentry->d_name.name, len); 2486 out->f_path.dentry->d_name.name, len);
2487 2487
2488 if (pipe->inode) 2488 if (pipe->inode)
2489 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 2489 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
2490 2490
2491 splice_from_pipe_begin(&sd); 2491 splice_from_pipe_begin(&sd);
2492 do { 2492 do {
2493 ret = splice_from_pipe_next(pipe, &sd); 2493 ret = splice_from_pipe_next(pipe, &sd);
2494 if (ret <= 0) 2494 if (ret <= 0)
2495 break; 2495 break;
2496 2496
2497 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2497 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2498 ret = ocfs2_rw_lock(inode, 1); 2498 ret = ocfs2_rw_lock(inode, 1);
2499 if (ret < 0) 2499 if (ret < 0)
2500 mlog_errno(ret); 2500 mlog_errno(ret);
2501 else { 2501 else {
2502 ret = ocfs2_splice_to_file(pipe, out, &sd); 2502 ret = ocfs2_splice_to_file(pipe, out, &sd);
2503 ocfs2_rw_unlock(inode, 1); 2503 ocfs2_rw_unlock(inode, 1);
2504 } 2504 }
2505 mutex_unlock(&inode->i_mutex); 2505 mutex_unlock(&inode->i_mutex);
2506 } while (ret > 0); 2506 } while (ret > 0);
2507 splice_from_pipe_end(pipe, &sd); 2507 splice_from_pipe_end(pipe, &sd);
2508 2508
2509 if (pipe->inode) 2509 if (pipe->inode)
2510 mutex_unlock(&pipe->inode->i_mutex); 2510 mutex_unlock(&pipe->inode->i_mutex);
2511 2511
2512 if (sd.num_spliced) 2512 if (sd.num_spliced)
2513 ret = sd.num_spliced; 2513 ret = sd.num_spliced;
2514 2514
2515 if (ret > 0) { 2515 if (ret > 0) {
2516 int err; 2516 int err;
2517 2517
2518 err = generic_write_sync(out, *ppos, ret); 2518 err = generic_write_sync(out, *ppos, ret);
2519 if (err) 2519 if (err)
2520 ret = err; 2520 ret = err;
2521 else 2521 else
2522 *ppos += ret; 2522 *ppos += ret;
2523 2523
2524 balance_dirty_pages_ratelimited(mapping); 2524 balance_dirty_pages_ratelimited(mapping);
2525 } 2525 }
2526 2526
2527 return ret; 2527 return ret;
2528 } 2528 }
2529 2529
2530 static ssize_t ocfs2_file_splice_read(struct file *in, 2530 static ssize_t ocfs2_file_splice_read(struct file *in,
2531 loff_t *ppos, 2531 loff_t *ppos,
2532 struct pipe_inode_info *pipe, 2532 struct pipe_inode_info *pipe,
2533 size_t len, 2533 size_t len,
2534 unsigned int flags) 2534 unsigned int flags)
2535 { 2535 {
2536 int ret = 0, lock_level = 0; 2536 int ret = 0, lock_level = 0;
2537 struct inode *inode = in->f_path.dentry->d_inode; 2537 struct inode *inode = in->f_path.dentry->d_inode;
2538 2538
2539 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, 2539 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2540 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2540 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2541 in->f_path.dentry->d_name.len, 2541 in->f_path.dentry->d_name.len,
2542 in->f_path.dentry->d_name.name, len); 2542 in->f_path.dentry->d_name.name, len);
2543 2543
2544 /* 2544 /*
2545 * See the comment in ocfs2_file_aio_read() 2545 * See the comment in ocfs2_file_aio_read()
2546 */ 2546 */
2547 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); 2547 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2548 if (ret < 0) { 2548 if (ret < 0) {
2549 mlog_errno(ret); 2549 mlog_errno(ret);
2550 goto bail; 2550 goto bail;
2551 } 2551 }
2552 ocfs2_inode_unlock(inode, lock_level); 2552 ocfs2_inode_unlock(inode, lock_level);
2553 2553
2554 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2554 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2555 2555
2556 bail: 2556 bail:
2557 return ret; 2557 return ret;
2558 } 2558 }
2559 2559
2560 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 2560 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2561 const struct iovec *iov, 2561 const struct iovec *iov,
2562 unsigned long nr_segs, 2562 unsigned long nr_segs,
2563 loff_t pos) 2563 loff_t pos)
2564 { 2564 {
2565 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2565 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
2566 struct file *filp = iocb->ki_filp; 2566 struct file *filp = iocb->ki_filp;
2567 struct inode *inode = filp->f_path.dentry->d_inode; 2567 struct inode *inode = filp->f_path.dentry->d_inode;
2568 2568
2569 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, 2569 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
2570 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2570 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2571 filp->f_path.dentry->d_name.len, 2571 filp->f_path.dentry->d_name.len,
2572 filp->f_path.dentry->d_name.name, nr_segs); 2572 filp->f_path.dentry->d_name.name, nr_segs);
2573 2573
2574 2574
2575 if (!inode) { 2575 if (!inode) {
2576 ret = -EINVAL; 2576 ret = -EINVAL;
2577 mlog_errno(ret); 2577 mlog_errno(ret);
2578 goto bail; 2578 goto bail;
2579 } 2579 }
2580 2580
2581 ocfs2_iocb_clear_sem_locked(iocb); 2581 ocfs2_iocb_clear_sem_locked(iocb);
2582 2582
2583 /* 2583 /*
2584 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2584 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2585 * need locks to protect pending reads from racing with truncate. 2585 * need locks to protect pending reads from racing with truncate.
2586 */ 2586 */
2587 if (filp->f_flags & O_DIRECT) { 2587 if (filp->f_flags & O_DIRECT) {
2588 have_alloc_sem = 1; 2588 have_alloc_sem = 1;
2589 ocfs2_iocb_set_sem_locked(iocb); 2589 ocfs2_iocb_set_sem_locked(iocb);
2590 2590
2591 ret = ocfs2_rw_lock(inode, 0); 2591 ret = ocfs2_rw_lock(inode, 0);
2592 if (ret < 0) { 2592 if (ret < 0) {
2593 mlog_errno(ret); 2593 mlog_errno(ret);
2594 goto bail; 2594 goto bail;
2595 } 2595 }
2596 rw_level = 0; 2596 rw_level = 0;
2597 /* communicate with ocfs2_dio_end_io */ 2597 /* communicate with ocfs2_dio_end_io */
2598 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2598 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2599 } 2599 }
2600 2600
2601 /* 2601 /*
2602 * We're fine letting folks race truncates and extending 2602 * We're fine letting folks race truncates and extending
2603 * writes with read across the cluster, just like they can 2603 * writes with read across the cluster, just like they can
2604 * locally. Hence no rw_lock during read. 2604 * locally. Hence no rw_lock during read.
2605 * 2605 *
2606 * Take and drop the meta data lock to update inode fields 2606 * Take and drop the meta data lock to update inode fields
2607 * like i_size. This gives the checks below in 2607 * like i_size. This gives the checks below in
2608 * generic_file_aio_read() a chance of actually working. 2608 * generic_file_aio_read() a chance of actually working.
2609 */ 2609 */
2610 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2610 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2611 if (ret < 0) { 2611 if (ret < 0) {
2612 mlog_errno(ret); 2612 mlog_errno(ret);
2613 goto bail; 2613 goto bail;
2614 } 2614 }
2615 ocfs2_inode_unlock(inode, lock_level); 2615 ocfs2_inode_unlock(inode, lock_level);
2616 2616
2617 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2617 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2618 trace_generic_file_aio_read_ret(ret); 2618 trace_generic_file_aio_read_ret(ret);
2619 2619
2620 /* buffered aio wouldn't have proper lock coverage today */ 2620 /* buffered aio wouldn't have proper lock coverage today */
2621 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2621 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
2622 2622
2623 /* see ocfs2_file_aio_write */ 2623 /* see ocfs2_file_aio_write */
2624 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2624 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2625 rw_level = -1; 2625 rw_level = -1;
2626 have_alloc_sem = 0; 2626 have_alloc_sem = 0;
2627 } 2627 }
2628 2628
2629 bail: 2629 bail:
2630 if (have_alloc_sem) 2630 if (have_alloc_sem)
2631 ocfs2_iocb_clear_sem_locked(iocb); 2631 ocfs2_iocb_clear_sem_locked(iocb);
2632 2632
2633 if (rw_level != -1) 2633 if (rw_level != -1)
2634 ocfs2_rw_unlock(inode, rw_level); 2634 ocfs2_rw_unlock(inode, rw_level);
2635 2635
2636 return ret; 2636 return ret;
2637 } 2637 }
2638 2638
2639 /* Refer to generic_file_llseek_unlocked() */ 2639 /* Refer to generic_file_llseek_unlocked() */
2640 static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) 2640 static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2641 { 2641 {
2642 struct inode *inode = file->f_mapping->host; 2642 struct inode *inode = file->f_mapping->host;
2643 int ret = 0; 2643 int ret = 0;
2644 2644
2645 mutex_lock(&inode->i_mutex); 2645 mutex_lock(&inode->i_mutex);
2646 2646
2647 switch (origin) { 2647 switch (whence) {
2648 case SEEK_SET: 2648 case SEEK_SET:
2649 break; 2649 break;
2650 case SEEK_END: 2650 case SEEK_END:
2651 offset += inode->i_size; 2651 offset += inode->i_size;
2652 break; 2652 break;
2653 case SEEK_CUR: 2653 case SEEK_CUR:
2654 if (offset == 0) { 2654 if (offset == 0) {
2655 offset = file->f_pos; 2655 offset = file->f_pos;
2656 goto out; 2656 goto out;
2657 } 2657 }
2658 offset += file->f_pos; 2658 offset += file->f_pos;
2659 break; 2659 break;
2660 case SEEK_DATA: 2660 case SEEK_DATA:
2661 case SEEK_HOLE: 2661 case SEEK_HOLE:
2662 ret = ocfs2_seek_data_hole_offset(file, &offset, origin); 2662 ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2663 if (ret) 2663 if (ret)
2664 goto out; 2664 goto out;
2665 break; 2665 break;
2666 default: 2666 default:
2667 ret = -EINVAL; 2667 ret = -EINVAL;
2668 goto out; 2668 goto out;
2669 } 2669 }
2670 2670
2671 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 2671 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2672 ret = -EINVAL; 2672 ret = -EINVAL;
2673 if (!ret && offset > inode->i_sb->s_maxbytes) 2673 if (!ret && offset > inode->i_sb->s_maxbytes)
2674 ret = -EINVAL; 2674 ret = -EINVAL;
2675 if (ret) 2675 if (ret)
2676 goto out; 2676 goto out;
2677 2677
2678 if (offset != file->f_pos) { 2678 if (offset != file->f_pos) {
2679 file->f_pos = offset; 2679 file->f_pos = offset;
2680 file->f_version = 0; 2680 file->f_version = 0;
2681 } 2681 }
2682 2682
2683 out: 2683 out:
2684 mutex_unlock(&inode->i_mutex); 2684 mutex_unlock(&inode->i_mutex);
2685 if (ret) 2685 if (ret)
2686 return ret; 2686 return ret;
2687 return offset; 2687 return offset;
2688 } 2688 }
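The switch above implements the userspace contract of lseek(2)'s whence argument, including the SEEK_DATA/SEEK_HOLE extensions backed by ocfs2_seek_data_hole_offset(). A small userspace sketch of the corresponding calls; the path is hypothetical, and SEEK_DATA/SEEK_HOLE need _GNU_SOURCE plus a filesystem that supports them:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/file", O_RDONLY);	/* hypothetical path */
	if (fd < 0)
		return 1;

	off_t end  = lseek(fd, 0, SEEK_END);	/* offset += i_size */
	off_t cur  = lseek(fd, 0, SEEK_CUR);	/* fast path: f_pos as-is */
	off_t data = lseek(fd, 0, SEEK_DATA);	/* first data at/after 0 */
	off_t hole = lseek(fd, 0, SEEK_HOLE);	/* first hole at/after 0 */

	printf("end=%lld cur=%lld data=%lld hole=%lld\n",
	       (long long)end, (long long)cur,
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}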
2689 2689
2690 const struct inode_operations ocfs2_file_iops = { 2690 const struct inode_operations ocfs2_file_iops = {
2691 .setattr = ocfs2_setattr, 2691 .setattr = ocfs2_setattr,
2692 .getattr = ocfs2_getattr, 2692 .getattr = ocfs2_getattr,
2693 .permission = ocfs2_permission, 2693 .permission = ocfs2_permission,
2694 .setxattr = generic_setxattr, 2694 .setxattr = generic_setxattr,
2695 .getxattr = generic_getxattr, 2695 .getxattr = generic_getxattr,
2696 .listxattr = ocfs2_listxattr, 2696 .listxattr = ocfs2_listxattr,
2697 .removexattr = generic_removexattr, 2697 .removexattr = generic_removexattr,
2698 .fiemap = ocfs2_fiemap, 2698 .fiemap = ocfs2_fiemap,
2699 .get_acl = ocfs2_iop_get_acl, 2699 .get_acl = ocfs2_iop_get_acl,
2700 }; 2700 };
2701 2701
2702 const struct inode_operations ocfs2_special_file_iops = { 2702 const struct inode_operations ocfs2_special_file_iops = {
2703 .setattr = ocfs2_setattr, 2703 .setattr = ocfs2_setattr,
2704 .getattr = ocfs2_getattr, 2704 .getattr = ocfs2_getattr,
2705 .permission = ocfs2_permission, 2705 .permission = ocfs2_permission,
2706 .get_acl = ocfs2_iop_get_acl, 2706 .get_acl = ocfs2_iop_get_acl,
2707 }; 2707 };
2708 2708
2709 /* 2709 /*
2710 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with 2710 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2711 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2711 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2712 */ 2712 */
2713 const struct file_operations ocfs2_fops = { 2713 const struct file_operations ocfs2_fops = {
2714 .llseek = ocfs2_file_llseek, 2714 .llseek = ocfs2_file_llseek,
2715 .read = do_sync_read, 2715 .read = do_sync_read,
2716 .write = do_sync_write, 2716 .write = do_sync_write,
2717 .mmap = ocfs2_mmap, 2717 .mmap = ocfs2_mmap,
2718 .fsync = ocfs2_sync_file, 2718 .fsync = ocfs2_sync_file,
2719 .release = ocfs2_file_release, 2719 .release = ocfs2_file_release,
2720 .open = ocfs2_file_open, 2720 .open = ocfs2_file_open,
2721 .aio_read = ocfs2_file_aio_read, 2721 .aio_read = ocfs2_file_aio_read,
2722 .aio_write = ocfs2_file_aio_write, 2722 .aio_write = ocfs2_file_aio_write,
2723 .unlocked_ioctl = ocfs2_ioctl, 2723 .unlocked_ioctl = ocfs2_ioctl,
2724 #ifdef CONFIG_COMPAT 2724 #ifdef CONFIG_COMPAT
2725 .compat_ioctl = ocfs2_compat_ioctl, 2725 .compat_ioctl = ocfs2_compat_ioctl,
2726 #endif 2726 #endif
2727 .lock = ocfs2_lock, 2727 .lock = ocfs2_lock,
2728 .flock = ocfs2_flock, 2728 .flock = ocfs2_flock,
2729 .splice_read = ocfs2_file_splice_read, 2729 .splice_read = ocfs2_file_splice_read,
2730 .splice_write = ocfs2_file_splice_write, 2730 .splice_write = ocfs2_file_splice_write,
2731 .fallocate = ocfs2_fallocate, 2731 .fallocate = ocfs2_fallocate,
2732 }; 2732 };
2733 2733
2734 const struct file_operations ocfs2_dops = { 2734 const struct file_operations ocfs2_dops = {
2735 .llseek = generic_file_llseek, 2735 .llseek = generic_file_llseek,
2736 .read = generic_read_dir, 2736 .read = generic_read_dir,
2737 .readdir = ocfs2_readdir, 2737 .readdir = ocfs2_readdir,
2738 .fsync = ocfs2_sync_file, 2738 .fsync = ocfs2_sync_file,
2739 .release = ocfs2_dir_release, 2739 .release = ocfs2_dir_release,
2740 .open = ocfs2_dir_open, 2740 .open = ocfs2_dir_open,
2741 .unlocked_ioctl = ocfs2_ioctl, 2741 .unlocked_ioctl = ocfs2_ioctl,
2742 #ifdef CONFIG_COMPAT 2742 #ifdef CONFIG_COMPAT
2743 .compat_ioctl = ocfs2_compat_ioctl, 2743 .compat_ioctl = ocfs2_compat_ioctl,
2744 #endif 2744 #endif
2745 .lock = ocfs2_lock, 2745 .lock = ocfs2_lock,
2746 .flock = ocfs2_flock, 2746 .flock = ocfs2_flock,
2747 }; 2747 };
2748 2748
2749 /* 2749 /*
2750 * POSIX-lockless variants of our file_operations. 2750 * POSIX-lockless variants of our file_operations.
2751 * 2751 *
2752 * These will be used if the underlying cluster stack does not support 2752 * These will be used if the underlying cluster stack does not support
2753 * posix file locking, if the user passes the "localflocks" mount 2753 * posix file locking, if the user passes the "localflocks" mount
2754 * option, or if we have a local-only fs. 2754 * option, or if we have a local-only fs.
2755 * 2755 *
2756 * ocfs2_flock is in here because all stacks handle UNIX file locks, 2756 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2757 * so we still want it in the case of no stack support for 2757 * so we still want it in the case of no stack support for
2758 * plocks. Internally, it will do the right thing when asked to ignore 2758 * plocks. Internally, it will do the right thing when asked to ignore
2759 * the cluster. 2759 * the cluster.
2760 */ 2760 */
2761 const struct file_operations ocfs2_fops_no_plocks = { 2761 const struct file_operations ocfs2_fops_no_plocks = {
2762 .llseek = ocfs2_file_llseek, 2762 .llseek = ocfs2_file_llseek,
2763 .read = do_sync_read, 2763 .read = do_sync_read,
2764 .write = do_sync_write, 2764 .write = do_sync_write,
2765 .mmap = ocfs2_mmap, 2765 .mmap = ocfs2_mmap,
2766 .fsync = ocfs2_sync_file, 2766 .fsync = ocfs2_sync_file,
2767 .release = ocfs2_file_release, 2767 .release = ocfs2_file_release,
2768 .open = ocfs2_file_open, 2768 .open = ocfs2_file_open,
2769 .aio_read = ocfs2_file_aio_read, 2769 .aio_read = ocfs2_file_aio_read,
2770 .aio_write = ocfs2_file_aio_write, 2770 .aio_write = ocfs2_file_aio_write,
2771 .unlocked_ioctl = ocfs2_ioctl, 2771 .unlocked_ioctl = ocfs2_ioctl,
2772 #ifdef CONFIG_COMPAT 2772 #ifdef CONFIG_COMPAT
2773 .compat_ioctl = ocfs2_compat_ioctl, 2773 .compat_ioctl = ocfs2_compat_ioctl,
2774 #endif 2774 #endif
2775 .flock = ocfs2_flock, 2775 .flock = ocfs2_flock,
2776 .splice_read = ocfs2_file_splice_read, 2776 .splice_read = ocfs2_file_splice_read,
2777 .splice_write = ocfs2_file_splice_write, 2777 .splice_write = ocfs2_file_splice_write,
2778 .fallocate = ocfs2_fallocate, 2778 .fallocate = ocfs2_fallocate,
2779 }; 2779 };
2780 2780
2781 const struct file_operations ocfs2_dops_no_plocks = { 2781 const struct file_operations ocfs2_dops_no_plocks = {
2782 .llseek = generic_file_llseek, 2782 .llseek = generic_file_llseek,
2783 .read = generic_read_dir, 2783 .read = generic_read_dir,
2784 .readdir = ocfs2_readdir, 2784 .readdir = ocfs2_readdir,
2785 .fsync = ocfs2_sync_file, 2785 .fsync = ocfs2_sync_file,
2786 .release = ocfs2_dir_release, 2786 .release = ocfs2_dir_release,
2787 .open = ocfs2_dir_open, 2787 .open = ocfs2_dir_open,
2788 .unlocked_ioctl = ocfs2_ioctl, 2788 .unlocked_ioctl = ocfs2_ioctl,
2789 #ifdef CONFIG_COMPAT 2789 #ifdef CONFIG_COMPAT
2790 .compat_ioctl = ocfs2_compat_ioctl, 2790 .compat_ioctl = ocfs2_compat_ioctl,
2791 #endif 2791 #endif
2792 .flock = ocfs2_flock, 2792 .flock = ocfs2_flock,
2793 }; 2793 };
2794 2794
1 /* 1 /*
2 * Persistent Storage - ramfs parts. 2 * Persistent Storage - ramfs parts.
3 * 3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com> 4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details. 13 * GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20 #include <linux/module.h> 20 #include <linux/module.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/fsnotify.h> 22 #include <linux/fsnotify.h>
23 #include <linux/pagemap.h> 23 #include <linux/pagemap.h>
24 #include <linux/highmem.h> 24 #include <linux/highmem.h>
25 #include <linux/time.h> 25 #include <linux/time.h>
26 #include <linux/init.h> 26 #include <linux/init.h>
27 #include <linux/list.h> 27 #include <linux/list.h>
28 #include <linux/string.h> 28 #include <linux/string.h>
29 #include <linux/mount.h> 29 #include <linux/mount.h>
30 #include <linux/seq_file.h> 30 #include <linux/seq_file.h>
31 #include <linux/ramfs.h> 31 #include <linux/ramfs.h>
32 #include <linux/parser.h> 32 #include <linux/parser.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/magic.h> 34 #include <linux/magic.h>
35 #include <linux/pstore.h> 35 #include <linux/pstore.h>
36 #include <linux/slab.h> 36 #include <linux/slab.h>
37 #include <linux/spinlock.h> 37 #include <linux/spinlock.h>
38 #include <linux/uaccess.h> 38 #include <linux/uaccess.h>
39 39
40 #include "internal.h" 40 #include "internal.h"
41 41
42 #define PSTORE_NAMELEN 64 42 #define PSTORE_NAMELEN 64
43 43
44 static DEFINE_SPINLOCK(allpstore_lock); 44 static DEFINE_SPINLOCK(allpstore_lock);
45 static LIST_HEAD(allpstore); 45 static LIST_HEAD(allpstore);
46 46
47 struct pstore_private { 47 struct pstore_private {
48 struct list_head list; 48 struct list_head list;
49 struct pstore_info *psi; 49 struct pstore_info *psi;
50 enum pstore_type_id type; 50 enum pstore_type_id type;
51 u64 id; 51 u64 id;
52 int count; 52 int count;
53 ssize_t size; 53 ssize_t size;
54 char data[]; 54 char data[];
55 }; 55 };
56 56
57 struct pstore_ftrace_seq_data { 57 struct pstore_ftrace_seq_data {
58 const void *ptr; 58 const void *ptr;
59 size_t off; 59 size_t off;
60 size_t size; 60 size_t size;
61 }; 61 };
62 62
63 #define REC_SIZE sizeof(struct pstore_ftrace_record) 63 #define REC_SIZE sizeof(struct pstore_ftrace_record)
64 64
65 static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) 65 static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
66 { 66 {
67 struct pstore_private *ps = s->private; 67 struct pstore_private *ps = s->private;
68 struct pstore_ftrace_seq_data *data; 68 struct pstore_ftrace_seq_data *data;
69 69
70 data = kzalloc(sizeof(*data), GFP_KERNEL); 70 data = kzalloc(sizeof(*data), GFP_KERNEL);
71 if (!data) 71 if (!data)
72 return NULL; 72 return NULL;
73 73
74 data->off = ps->size % REC_SIZE; 74 data->off = ps->size % REC_SIZE;
75 data->off += *pos * REC_SIZE; 75 data->off += *pos * REC_SIZE;
76 if (data->off + REC_SIZE > ps->size) { 76 if (data->off + REC_SIZE > ps->size) {
77 kfree(data); 77 kfree(data);
78 return NULL; 78 return NULL;
79 } 79 }
80 80
81 return data; 81 return data;
82 82
83 } 83 }
84 84
85 static void pstore_ftrace_seq_stop(struct seq_file *s, void *v) 85 static void pstore_ftrace_seq_stop(struct seq_file *s, void *v)
86 { 86 {
87 kfree(v); 87 kfree(v);
88 } 88 }
89 89
90 static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) 90 static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
91 { 91 {
92 struct pstore_private *ps = s->private; 92 struct pstore_private *ps = s->private;
93 struct pstore_ftrace_seq_data *data = v; 93 struct pstore_ftrace_seq_data *data = v;
94 94
95 data->off += REC_SIZE; 95 data->off += REC_SIZE;
96 if (data->off + REC_SIZE > ps->size) 96 if (data->off + REC_SIZE > ps->size)
97 return NULL; 97 return NULL;
98 98
99 (*pos)++; 99 (*pos)++;
100 return data; 100 return data;
101 } 101 }
102 102
103 static int pstore_ftrace_seq_show(struct seq_file *s, void *v) 103 static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
104 { 104 {
105 struct pstore_private *ps = s->private; 105 struct pstore_private *ps = s->private;
106 struct pstore_ftrace_seq_data *data = v; 106 struct pstore_ftrace_seq_data *data = v;
107 struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); 107 struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
108 108
109 seq_printf(s, "%d %08lx %08lx %pf <- %pF\n", 109 seq_printf(s, "%d %08lx %08lx %pf <- %pF\n",
110 pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip, 110 pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
111 (void *)rec->ip, (void *)rec->parent_ip); 111 (void *)rec->ip, (void *)rec->parent_ip);
112 112
113 return 0; 113 return 0;
114 } 114 }
115 115
116 static const struct seq_operations pstore_ftrace_seq_ops = { 116 static const struct seq_operations pstore_ftrace_seq_ops = {
117 .start = pstore_ftrace_seq_start, 117 .start = pstore_ftrace_seq_start,
118 .next = pstore_ftrace_seq_next, 118 .next = pstore_ftrace_seq_next,
119 .stop = pstore_ftrace_seq_stop, 119 .stop = pstore_ftrace_seq_stop,
120 .show = pstore_ftrace_seq_show, 120 .show = pstore_ftrace_seq_show,
121 }; 121 };
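The ->start/->next hooks above iterate fixed-size ftrace records: ps->size % REC_SIZE skips any partial record left at the front of the buffer, and iteration ends as soon as one more whole record would run past ps->size. The boundary arithmetic, demonstrated with illustrative numbers (not the real record size):

#include <stdio.h>

#define REC_SIZE 16			/* illustrative, not the real size */

int main(void)
{
	size_t size = 1000;		/* stands in for ps->size */
	size_t off = size % REC_SIZE;	/* skip the leading partial record */
	unsigned int n = 0;

	while (off + REC_SIZE <= size) {	/* same bound the seq ops use */
		n++;
		off += REC_SIZE;
	}
	printf("%u whole records, first at offset %zu\n", n, size % REC_SIZE);
	return 0;
}

For these numbers the walk starts at offset 8 and visits 62 whole 16-byte records, matching the data->off + REC_SIZE > ps->size cut-off used in both pstore_ftrace_seq_start() and pstore_ftrace_seq_next().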
122 122
123 static ssize_t pstore_file_read(struct file *file, char __user *userbuf, 123 static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
124 size_t count, loff_t *ppos) 124 size_t count, loff_t *ppos)
125 { 125 {
126 struct seq_file *sf = file->private_data; 126 struct seq_file *sf = file->private_data;
127 struct pstore_private *ps = sf->private; 127 struct pstore_private *ps = sf->private;
128 128
129 if (ps->type == PSTORE_TYPE_FTRACE) 129 if (ps->type == PSTORE_TYPE_FTRACE)
130 return seq_read(file, userbuf, count, ppos); 130 return seq_read(file, userbuf, count, ppos);
131 return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); 131 return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
132 } 132 }
133 133
134 static int pstore_file_open(struct inode *inode, struct file *file) 134 static int pstore_file_open(struct inode *inode, struct file *file)
135 { 135 {
136 struct pstore_private *ps = inode->i_private; 136 struct pstore_private *ps = inode->i_private;
137 struct seq_file *sf; 137 struct seq_file *sf;
138 int err; 138 int err;
139 const struct seq_operations *sops = NULL; 139 const struct seq_operations *sops = NULL;
140 140
141 if (ps->type == PSTORE_TYPE_FTRACE) 141 if (ps->type == PSTORE_TYPE_FTRACE)
142 sops = &pstore_ftrace_seq_ops; 142 sops = &pstore_ftrace_seq_ops;
143 143
144 err = seq_open(file, sops); 144 err = seq_open(file, sops);
145 if (err < 0) 145 if (err < 0)
146 return err; 146 return err;
147 147
148 sf = file->private_data; 148 sf = file->private_data;
149 sf->private = ps; 149 sf->private = ps;
150 150
151 return 0; 151 return 0;
152 } 152 }
153 153
154 static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin) 154 static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
155 { 155 {
156 struct seq_file *sf = file->private_data; 156 struct seq_file *sf = file->private_data;
157 157
158 if (sf->op) 158 if (sf->op)
159 return seq_lseek(file, off, origin); 159 return seq_lseek(file, off, whence);
160 return default_llseek(file, off, origin); 160 return default_llseek(file, off, whence);
161 } 161 }
162 162
163 static const struct file_operations pstore_file_operations = { 163 static const struct file_operations pstore_file_operations = {
164 .open = pstore_file_open, 164 .open = pstore_file_open,
165 .read = pstore_file_read, 165 .read = pstore_file_read,
166 .llseek = pstore_file_llseek, 166 .llseek = pstore_file_llseek,
167 .release = seq_release, 167 .release = seq_release,
168 }; 168 };
169 169
170 /* 170 /*
171 * When a file is unlinked from our file system we call the 171 * When a file is unlinked from our file system we call the
172 * platform driver to erase the record from persistent store. 172 * platform driver to erase the record from persistent store.
173 */ 173 */
174 static int pstore_unlink(struct inode *dir, struct dentry *dentry) 174 static int pstore_unlink(struct inode *dir, struct dentry *dentry)
175 { 175 {
176 struct pstore_private *p = dentry->d_inode->i_private; 176 struct pstore_private *p = dentry->d_inode->i_private;
177 177
178 if (p->psi->erase) 178 if (p->psi->erase)
179 p->psi->erase(p->type, p->id, p->count, 179 p->psi->erase(p->type, p->id, p->count,
180 dentry->d_inode->i_ctime, p->psi); 180 dentry->d_inode->i_ctime, p->psi);
181 181
182 return simple_unlink(dir, dentry); 182 return simple_unlink(dir, dentry);
183 } 183 }
184 184
185 static void pstore_evict_inode(struct inode *inode) 185 static void pstore_evict_inode(struct inode *inode)
186 { 186 {
187 struct pstore_private *p = inode->i_private; 187 struct pstore_private *p = inode->i_private;
188 unsigned long flags; 188 unsigned long flags;
189 189
190 clear_inode(inode); 190 clear_inode(inode);
191 if (p) { 191 if (p) {
192 spin_lock_irqsave(&allpstore_lock, flags); 192 spin_lock_irqsave(&allpstore_lock, flags);
193 list_del(&p->list); 193 list_del(&p->list);
194 spin_unlock_irqrestore(&allpstore_lock, flags); 194 spin_unlock_irqrestore(&allpstore_lock, flags);
195 kfree(p); 195 kfree(p);
196 } 196 }
197 } 197 }
198 198
199 static const struct inode_operations pstore_dir_inode_operations = { 199 static const struct inode_operations pstore_dir_inode_operations = {
200 .lookup = simple_lookup, 200 .lookup = simple_lookup,
201 .unlink = pstore_unlink, 201 .unlink = pstore_unlink,
202 }; 202 };
203 203
204 static struct inode *pstore_get_inode(struct super_block *sb) 204 static struct inode *pstore_get_inode(struct super_block *sb)
205 { 205 {
206 struct inode *inode = new_inode(sb); 206 struct inode *inode = new_inode(sb);
207 if (inode) { 207 if (inode) {
208 inode->i_ino = get_next_ino(); 208 inode->i_ino = get_next_ino();
209 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 209 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
210 } 210 }
211 return inode; 211 return inode;
212 } 212 }
213 213
214 enum { 214 enum {
215 Opt_kmsg_bytes, Opt_err 215 Opt_kmsg_bytes, Opt_err
216 }; 216 };
217 217
218 static const match_table_t tokens = { 218 static const match_table_t tokens = {
219 {Opt_kmsg_bytes, "kmsg_bytes=%u"}, 219 {Opt_kmsg_bytes, "kmsg_bytes=%u"},
220 {Opt_err, NULL} 220 {Opt_err, NULL}
221 }; 221 };
222 222
223 static void parse_options(char *options) 223 static void parse_options(char *options)
224 { 224 {
225 char *p; 225 char *p;
226 substring_t args[MAX_OPT_ARGS]; 226 substring_t args[MAX_OPT_ARGS];
227 int option; 227 int option;
228 228
229 if (!options) 229 if (!options)
230 return; 230 return;
231 231
232 while ((p = strsep(&options, ",")) != NULL) { 232 while ((p = strsep(&options, ",")) != NULL) {
233 int token; 233 int token;
234 234
235 if (!*p) 235 if (!*p)
236 continue; 236 continue;
237 237
238 token = match_token(p, tokens, args); 238 token = match_token(p, tokens, args);
239 switch (token) { 239 switch (token) {
240 case Opt_kmsg_bytes: 240 case Opt_kmsg_bytes:
241 if (!match_int(&args[0], &option)) 241 if (!match_int(&args[0], &option))
242 pstore_set_kmsg_bytes(option); 242 pstore_set_kmsg_bytes(option);
243 break; 243 break;
244 } 244 }
245 } 245 }
246 } 246 }
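parse_options() receives the raw mount data string, so kmsg_bytes can be set at mount time and changed later through pstore_remount(). A hedged userspace sketch that passes the option via mount(2); the target directory is a conventional choice rather than anything mandated by the code above, and the call requires CAP_SYS_ADMIN:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* The data string is what ends up in parse_options() above. */
	if (mount("pstore", "/sys/fs/pstore", "pstore", 0,
		  "kmsg_bytes=16384") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}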
247 247
248 static int pstore_remount(struct super_block *sb, int *flags, char *data) 248 static int pstore_remount(struct super_block *sb, int *flags, char *data)
249 { 249 {
250 parse_options(data); 250 parse_options(data);
251 251
252 return 0; 252 return 0;
253 } 253 }
254 254
255 static const struct super_operations pstore_ops = { 255 static const struct super_operations pstore_ops = {
256 .statfs = simple_statfs, 256 .statfs = simple_statfs,
257 .drop_inode = generic_delete_inode, 257 .drop_inode = generic_delete_inode,
258 .evict_inode = pstore_evict_inode, 258 .evict_inode = pstore_evict_inode,
259 .remount_fs = pstore_remount, 259 .remount_fs = pstore_remount,
260 .show_options = generic_show_options, 260 .show_options = generic_show_options,
261 }; 261 };
262 262
263 static struct super_block *pstore_sb; 263 static struct super_block *pstore_sb;
264 264
265 int pstore_is_mounted(void) 265 int pstore_is_mounted(void)
266 { 266 {
267 return pstore_sb != NULL; 267 return pstore_sb != NULL;
268 } 268 }
269 269
270 /* 270 /*
271 * Make a regular file in the root directory of our file system. 271 * Make a regular file in the root directory of our file system.
272 * Load it up with "size" bytes of data from "buf". 272 * Load it up with "size" bytes of data from "buf".
273 * Set the mtime & ctime to the date that this record was originally stored. 273 * Set the mtime & ctime to the date that this record was originally stored.
274 */ 274 */
275 int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, 275 int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
276 char *data, size_t size, struct timespec time, 276 char *data, size_t size, struct timespec time,
277 struct pstore_info *psi) 277 struct pstore_info *psi)
278 { 278 {
279 struct dentry *root = pstore_sb->s_root; 279 struct dentry *root = pstore_sb->s_root;
280 struct dentry *dentry; 280 struct dentry *dentry;
281 struct inode *inode; 281 struct inode *inode;
282 int rc = 0; 282 int rc = 0;
283 char name[PSTORE_NAMELEN]; 283 char name[PSTORE_NAMELEN];
284 struct pstore_private *private, *pos; 284 struct pstore_private *private, *pos;
285 unsigned long flags; 285 unsigned long flags;
286 286
287 spin_lock_irqsave(&allpstore_lock, flags); 287 spin_lock_irqsave(&allpstore_lock, flags);
288 list_for_each_entry(pos, &allpstore, list) { 288 list_for_each_entry(pos, &allpstore, list) {
289 if (pos->type == type && 289 if (pos->type == type &&
290 pos->id == id && 290 pos->id == id &&
291 pos->psi == psi) { 291 pos->psi == psi) {
292 rc = -EEXIST; 292 rc = -EEXIST;
293 break; 293 break;
294 } 294 }
295 } 295 }
296 spin_unlock_irqrestore(&allpstore_lock, flags); 296 spin_unlock_irqrestore(&allpstore_lock, flags);
297 if (rc) 297 if (rc)
298 return rc; 298 return rc;
299 299
300 rc = -ENOMEM; 300 rc = -ENOMEM;
301 inode = pstore_get_inode(pstore_sb); 301 inode = pstore_get_inode(pstore_sb);
302 if (!inode) 302 if (!inode)
303 goto fail; 303 goto fail;
304 inode->i_mode = S_IFREG | 0444; 304 inode->i_mode = S_IFREG | 0444;
305 inode->i_fop = &pstore_file_operations; 305 inode->i_fop = &pstore_file_operations;
306 private = kmalloc(sizeof *private + size, GFP_KERNEL); 306 private = kmalloc(sizeof *private + size, GFP_KERNEL);
307 if (!private) 307 if (!private)
308 goto fail_alloc; 308 goto fail_alloc;
309 private->type = type; 309 private->type = type;
310 private->id = id; 310 private->id = id;
311 private->count = count; 311 private->count = count;
312 private->psi = psi; 312 private->psi = psi;
313 313
314 switch (type) { 314 switch (type) {
315 case PSTORE_TYPE_DMESG: 315 case PSTORE_TYPE_DMESG:
316 sprintf(name, "dmesg-%s-%lld", psname, id); 316 sprintf(name, "dmesg-%s-%lld", psname, id);
317 break; 317 break;
318 case PSTORE_TYPE_CONSOLE: 318 case PSTORE_TYPE_CONSOLE:
319 sprintf(name, "console-%s", psname); 319 sprintf(name, "console-%s", psname);
320 break; 320 break;
321 case PSTORE_TYPE_FTRACE: 321 case PSTORE_TYPE_FTRACE:
322 sprintf(name, "ftrace-%s", psname); 322 sprintf(name, "ftrace-%s", psname);
323 break; 323 break;
324 case PSTORE_TYPE_MCE: 324 case PSTORE_TYPE_MCE:
325 sprintf(name, "mce-%s-%lld", psname, id); 325 sprintf(name, "mce-%s-%lld", psname, id);
326 break; 326 break;
327 case PSTORE_TYPE_UNKNOWN: 327 case PSTORE_TYPE_UNKNOWN:
328 sprintf(name, "unknown-%s-%lld", psname, id); 328 sprintf(name, "unknown-%s-%lld", psname, id);
329 break; 329 break;
330 default: 330 default:
331 sprintf(name, "type%d-%s-%lld", type, psname, id); 331 sprintf(name, "type%d-%s-%lld", type, psname, id);
332 break; 332 break;
333 } 333 }
334 334
335 mutex_lock(&root->d_inode->i_mutex); 335 mutex_lock(&root->d_inode->i_mutex);
336 336
337 rc = -ENOSPC; 337 rc = -ENOSPC;
338 dentry = d_alloc_name(root, name); 338 dentry = d_alloc_name(root, name);
339 if (IS_ERR(dentry)) 339 if (IS_ERR(dentry))
340 goto fail_lockedalloc; 340 goto fail_lockedalloc;
341 341
342 memcpy(private->data, data, size); 342 memcpy(private->data, data, size);
343 inode->i_size = private->size = size; 343 inode->i_size = private->size = size;
344 344
345 inode->i_private = private; 345 inode->i_private = private;
346 346
347 if (time.tv_sec) 347 if (time.tv_sec)
348 inode->i_mtime = inode->i_ctime = time; 348 inode->i_mtime = inode->i_ctime = time;
349 349
350 d_add(dentry, inode); 350 d_add(dentry, inode);
351 351
352 spin_lock_irqsave(&allpstore_lock, flags); 352 spin_lock_irqsave(&allpstore_lock, flags);
353 list_add(&private->list, &allpstore); 353 list_add(&private->list, &allpstore);
354 spin_unlock_irqrestore(&allpstore_lock, flags); 354 spin_unlock_irqrestore(&allpstore_lock, flags);
355 355
356 mutex_unlock(&root->d_inode->i_mutex); 356 mutex_unlock(&root->d_inode->i_mutex);
357 357
358 return 0; 358 return 0;
359 359
360 fail_lockedalloc: 360 fail_lockedalloc:
361 mutex_unlock(&root->d_inode->i_mutex); 361 mutex_unlock(&root->d_inode->i_mutex);
362 kfree(private); 362 kfree(private);
363 fail_alloc: 363 fail_alloc:
364 iput(inode); 364 iput(inode);
365 365
366 fail: 366 fail:
367 return rc; 367 return rc;
368 } 368 }
369 369
370 static int pstore_fill_super(struct super_block *sb, void *data, int silent) 370 static int pstore_fill_super(struct super_block *sb, void *data, int silent)
371 { 371 {
372 struct inode *inode; 372 struct inode *inode;
373 373
374 save_mount_options(sb, data); 374 save_mount_options(sb, data);
375 375
376 pstore_sb = sb; 376 pstore_sb = sb;
377 377
378 sb->s_maxbytes = MAX_LFS_FILESIZE; 378 sb->s_maxbytes = MAX_LFS_FILESIZE;
379 sb->s_blocksize = PAGE_CACHE_SIZE; 379 sb->s_blocksize = PAGE_CACHE_SIZE;
380 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 380 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
381 sb->s_magic = PSTOREFS_MAGIC; 381 sb->s_magic = PSTOREFS_MAGIC;
382 sb->s_op = &pstore_ops; 382 sb->s_op = &pstore_ops;
383 sb->s_time_gran = 1; 383 sb->s_time_gran = 1;
384 384
385 parse_options(data); 385 parse_options(data);
386 386
387 inode = pstore_get_inode(sb); 387 inode = pstore_get_inode(sb);
388 if (inode) { 388 if (inode) {
389 inode->i_mode = S_IFDIR | 0755; 389 inode->i_mode = S_IFDIR | 0755;
390 inode->i_op = &pstore_dir_inode_operations; 390 inode->i_op = &pstore_dir_inode_operations;
391 inode->i_fop = &simple_dir_operations; 391 inode->i_fop = &simple_dir_operations;
392 inc_nlink(inode); 392 inc_nlink(inode);
393 } 393 }
394 sb->s_root = d_make_root(inode); 394 sb->s_root = d_make_root(inode);
395 if (!sb->s_root) 395 if (!sb->s_root)
396 return -ENOMEM; 396 return -ENOMEM;
397 397
398 pstore_get_records(0); 398 pstore_get_records(0);
399 399
400 return 0; 400 return 0;
401 } 401 }
402 402
403 static struct dentry *pstore_mount(struct file_system_type *fs_type, 403 static struct dentry *pstore_mount(struct file_system_type *fs_type,
404 int flags, const char *dev_name, void *data) 404 int flags, const char *dev_name, void *data)
405 { 405 {
406 return mount_single(fs_type, flags, data, pstore_fill_super); 406 return mount_single(fs_type, flags, data, pstore_fill_super);
407 } 407 }
408 408
409 static void pstore_kill_sb(struct super_block *sb) 409 static void pstore_kill_sb(struct super_block *sb)
410 { 410 {
411 kill_litter_super(sb); 411 kill_litter_super(sb);
412 pstore_sb = NULL; 412 pstore_sb = NULL;
413 } 413 }
414 414
415 static struct file_system_type pstore_fs_type = { 415 static struct file_system_type pstore_fs_type = {
416 .name = "pstore", 416 .name = "pstore",
417 .mount = pstore_mount, 417 .mount = pstore_mount,
418 .kill_sb = pstore_kill_sb, 418 .kill_sb = pstore_kill_sb,
419 }; 419 };
420 420
421 static int __init init_pstore_fs(void) 421 static int __init init_pstore_fs(void)
422 { 422 {
423 return register_filesystem(&pstore_fs_type); 423 return register_filesystem(&pstore_fs_type);
424 } 424 }
425 module_init(init_pstore_fs) 425 module_init(init_pstore_fs)
426 426
427 MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); 427 MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
428 MODULE_LICENSE("GPL"); 428 MODULE_LICENSE("GPL");
429 429
1 /* 1 /*
2 * linux/fs/read_write.c 2 * linux/fs/read_write.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/slab.h> 7 #include <linux/slab.h>
8 #include <linux/stat.h> 8 #include <linux/stat.h>
9 #include <linux/fcntl.h> 9 #include <linux/fcntl.h>
10 #include <linux/file.h> 10 #include <linux/file.h>
11 #include <linux/uio.h> 11 #include <linux/uio.h>
12 #include <linux/fsnotify.h> 12 #include <linux/fsnotify.h>
13 #include <linux/security.h> 13 #include <linux/security.h>
14 #include <linux/export.h> 14 #include <linux/export.h>
15 #include <linux/syscalls.h> 15 #include <linux/syscalls.h>
16 #include <linux/pagemap.h> 16 #include <linux/pagemap.h>
17 #include <linux/splice.h> 17 #include <linux/splice.h>
18 #include "read_write.h" 18 #include "read_write.h"
19 19
20 #include <asm/uaccess.h> 20 #include <asm/uaccess.h>
21 #include <asm/unistd.h> 21 #include <asm/unistd.h>
22 22
23 const struct file_operations generic_ro_fops = { 23 const struct file_operations generic_ro_fops = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = do_sync_read, 25 .read = do_sync_read,
26 .aio_read = generic_file_aio_read, 26 .aio_read = generic_file_aio_read,
27 .mmap = generic_file_readonly_mmap, 27 .mmap = generic_file_readonly_mmap,
28 .splice_read = generic_file_splice_read, 28 .splice_read = generic_file_splice_read,
29 }; 29 };
30 30
31 EXPORT_SYMBOL(generic_ro_fops); 31 EXPORT_SYMBOL(generic_ro_fops);
32 32
33 static inline int unsigned_offsets(struct file *file) 33 static inline int unsigned_offsets(struct file *file)
34 { 34 {
35 return file->f_mode & FMODE_UNSIGNED_OFFSET; 35 return file->f_mode & FMODE_UNSIGNED_OFFSET;
36 } 36 }
37 37
38 static loff_t lseek_execute(struct file *file, struct inode *inode, 38 static loff_t lseek_execute(struct file *file, struct inode *inode,
39 loff_t offset, loff_t maxsize) 39 loff_t offset, loff_t maxsize)
40 { 40 {
41 if (offset < 0 && !unsigned_offsets(file)) 41 if (offset < 0 && !unsigned_offsets(file))
42 return -EINVAL; 42 return -EINVAL;
43 if (offset > maxsize) 43 if (offset > maxsize)
44 return -EINVAL; 44 return -EINVAL;
45 45
46 if (offset != file->f_pos) { 46 if (offset != file->f_pos) {
47 file->f_pos = offset; 47 file->f_pos = offset;
48 file->f_version = 0; 48 file->f_version = 0;
49 } 49 }
50 return offset; 50 return offset;
51 } 51 }
52 52
53 /** 53 /**
54 * generic_file_llseek_size - generic llseek implementation for regular files 54 * generic_file_llseek_size - generic llseek implementation for regular files
55 * @file: file structure to seek on 55 * @file: file structure to seek on
56 * @offset: file offset to seek to 56 * @offset: file offset to seek to
57 * @origin: type of seek 57 * @whence: type of seek
58 * @size: max size of this file in file system 58 * @size: max size of this file in file system
59 * @eof: offset used for SEEK_END position 59 * @eof: offset used for SEEK_END position
60 * 60 *
61 * This is a variant of generic_file_llseek that allows passing in a custom 61 * This is a variant of generic_file_llseek that allows passing in a custom
62 * maximum file size and a custom EOF position, e.g. for hashed directories 62 * maximum file size and a custom EOF position, e.g. for hashed directories
63 * 63 *
64 * Synchronization: 64 * Synchronization:
65 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) 65 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
66 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. 66 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
67 * read/writes behave like SEEK_SET against seeks. 67 * read/writes behave like SEEK_SET against seeks.
68 */ 68 */
69 loff_t 69 loff_t
70 generic_file_llseek_size(struct file *file, loff_t offset, int origin, 70 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
71 loff_t maxsize, loff_t eof) 71 loff_t maxsize, loff_t eof)
72 { 72 {
73 struct inode *inode = file->f_mapping->host; 73 struct inode *inode = file->f_mapping->host;
74 74
75 switch (origin) { 75 switch (whence) {
76 case SEEK_END: 76 case SEEK_END:
77 offset += eof; 77 offset += eof;
78 break; 78 break;
79 case SEEK_CUR: 79 case SEEK_CUR:
80 /* 80 /*
81 * Here we special-case the lseek(fd, 0, SEEK_CUR) 81 * Here we special-case the lseek(fd, 0, SEEK_CUR)
82 * position-querying operation. Avoid rewriting the "same" 82 * position-querying operation. Avoid rewriting the "same"
83 * f_pos value back to the file because a concurrent read(), 83 * f_pos value back to the file because a concurrent read(),
84 * write() or lseek() might have altered it 84 * write() or lseek() might have altered it
85 */ 85 */
86 if (offset == 0) 86 if (offset == 0)
87 return file->f_pos; 87 return file->f_pos;
88 /* 88 /*
89 * f_lock protects against read/modify/write race with other 89 * f_lock protects against read/modify/write race with other
90 * SEEK_CURs. Note that parallel writes and reads behave 90 * SEEK_CURs. Note that parallel writes and reads behave
91 * like SEEK_SET. 91 * like SEEK_SET.
92 */ 92 */
93 spin_lock(&file->f_lock); 93 spin_lock(&file->f_lock);
94 offset = lseek_execute(file, inode, file->f_pos + offset, 94 offset = lseek_execute(file, inode, file->f_pos + offset,
95 maxsize); 95 maxsize);
96 spin_unlock(&file->f_lock); 96 spin_unlock(&file->f_lock);
97 return offset; 97 return offset;
98 case SEEK_DATA: 98 case SEEK_DATA:
99 /* 99 /*
100 * In the generic case the entire file is data, so as long as 100 * In the generic case the entire file is data, so as long as
101 * offset isn't at the end of the file then the offset is data. 101 * offset isn't at the end of the file then the offset is data.
102 */ 102 */
103 if (offset >= eof) 103 if (offset >= eof)
104 return -ENXIO; 104 return -ENXIO;
105 break; 105 break;
106 case SEEK_HOLE: 106 case SEEK_HOLE:
107 /* 107 /*
108 * There is a virtual hole at the end of the file, so as long as 108 * There is a virtual hole at the end of the file, so as long as
109 * offset isn't i_size or larger, return i_size. 109 * offset isn't i_size or larger, return i_size.
110 */ 110 */
111 if (offset >= eof) 111 if (offset >= eof)
112 return -ENXIO; 112 return -ENXIO;
113 offset = eof; 113 offset = eof;
114 break; 114 break;
115 } 115 }
116 116
117 return lseek_execute(file, inode, offset, maxsize); 117 return lseek_execute(file, inode, offset, maxsize);
118 } 118 }
119 EXPORT_SYMBOL(generic_file_llseek_size); 119 EXPORT_SYMBOL(generic_file_llseek_size);
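For context (not part of this commit): a filesystem that enforces its own size limit, such as the hashed-directory case the comment above mentions, can route ->llseek through this helper. A minimal sketch, where example_fs_llseek and EXAMPLE_FS_MAX_BYTES are hypothetical names:

        /* Sketch: clamp seeks to a filesystem-specific limit instead of
         * the superblock's s_maxbytes. EXAMPLE_FS_MAX_BYTES is hypothetical. */
        static loff_t example_fs_llseek(struct file *file, loff_t offset, int whence)
        {
                struct inode *inode = file->f_mapping->host;

                return generic_file_llseek_size(file, offset, whence,
                                                EXAMPLE_FS_MAX_BYTES,
                                                i_size_read(inode));
        }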
120 120
121 /** 121 /**
122 * generic_file_llseek - generic llseek implementation for regular files 122 * generic_file_llseek - generic llseek implementation for regular files
123 * @file: file structure to seek on 123 * @file: file structure to seek on
124 * @offset: file offset to seek to 124 * @offset: file offset to seek to
125 * @origin: type of seek 125 * @whence: type of seek
126 * 126 *
127 * This is a generic implementation of ->llseek usable for all normal local 127 * This is a generic implementation of ->llseek usable for all normal local
128 * filesystems. It just updates the file offset to the value specified by 128 * filesystems. It just updates the file offset to the value specified by
129 * @offset and @origin under i_mutex. 129 * @offset and @whence under i_mutex.
130 */ 130 */
131 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) 131 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
132 { 132 {
133 struct inode *inode = file->f_mapping->host; 133 struct inode *inode = file->f_mapping->host;
134 134
135 return generic_file_llseek_size(file, offset, origin, 135 return generic_file_llseek_size(file, offset, whence,
136 inode->i_sb->s_maxbytes, 136 inode->i_sb->s_maxbytes,
137 i_size_read(inode)); 137 i_size_read(inode));
138 } 138 }
139 EXPORT_SYMBOL(generic_file_llseek); 139 EXPORT_SYMBOL(generic_file_llseek);
140 140
141 /** 141 /**
142 * noop_llseek - No Operation Performed llseek implementation 142 * noop_llseek - No Operation Performed llseek implementation
143 * @file: file structure to seek on 143 * @file: file structure to seek on
144 * @offset: file offset to seek to 144 * @offset: file offset to seek to
145 * @origin: type of seek 145 * @whence: type of seek
146 * 146 *
147 * This is an implementation of ->llseek usable for the rare special case when 147 * This is an implementation of ->llseek usable for the rare special case when
148 * userspace expects the seek to succeed but the (device) file is actually not 148 * userspace expects the seek to succeed but the (device) file is actually not
149 * able to perform the seek. In this case you use noop_llseek() instead of 149 * able to perform the seek. In this case you use noop_llseek() instead of
150 * falling back to the default implementation of ->llseek. 150 * falling back to the default implementation of ->llseek.
151 */ 151 */
152 loff_t noop_llseek(struct file *file, loff_t offset, int origin) 152 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
153 { 153 {
154 return file->f_pos; 154 return file->f_pos;
155 } 155 }
156 EXPORT_SYMBOL(noop_llseek); 156 EXPORT_SYMBOL(noop_llseek);
157 157
158 loff_t no_llseek(struct file *file, loff_t offset, int origin) 158 loff_t no_llseek(struct file *file, loff_t offset, int whence)
159 { 159 {
160 return -ESPIPE; 160 return -ESPIPE;
161 } 161 }
162 EXPORT_SYMBOL(no_llseek); 162 EXPORT_SYMBOL(no_llseek);
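For context (not part of this commit): the practical difference between the two stubs above is that noop_llseek() reports success without moving f_pos, while no_llseek() rejects seeking with -ESPIPE. A sketch; every example_* name is hypothetical:

        /* Hypothetical stub so the ops tables below are self-contained */
        static ssize_t example_read(struct file *filp, char __user *buf,
                                    size_t len, loff_t *ppos)
        {
                return 0;                       /* always EOF */
        }

        static const struct file_operations example_noseek_dev_fops = {
                .read   = example_read,
                .llseek = noop_llseek,          /* seek "succeeds", position unchanged */
        };

        static const struct file_operations example_pipe_like_fops = {
                .read   = example_read,
                .llseek = no_llseek,            /* seek fails with -ESPIPE */
        };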
163 163
164 loff_t default_llseek(struct file *file, loff_t offset, int origin) 164 loff_t default_llseek(struct file *file, loff_t offset, int whence)
165 { 165 {
166 struct inode *inode = file->f_path.dentry->d_inode; 166 struct inode *inode = file->f_path.dentry->d_inode;
167 loff_t retval; 167 loff_t retval;
168 168
169 mutex_lock(&inode->i_mutex); 169 mutex_lock(&inode->i_mutex);
170 switch (origin) { 170 switch (whence) {
171 case SEEK_END: 171 case SEEK_END:
172 offset += i_size_read(inode); 172 offset += i_size_read(inode);
173 break; 173 break;
174 case SEEK_CUR: 174 case SEEK_CUR:
175 if (offset == 0) { 175 if (offset == 0) {
176 retval = file->f_pos; 176 retval = file->f_pos;
177 goto out; 177 goto out;
178 } 178 }
179 offset += file->f_pos; 179 offset += file->f_pos;
180 break; 180 break;
181 case SEEK_DATA: 181 case SEEK_DATA:
182 /* 182 /*
183 * In the generic case the entire file is data, so as 183 * In the generic case the entire file is data, so as
184 * long as offset isn't at the end of the file then the 184 * long as offset isn't at the end of the file then the
185 * offset is data. 185 * offset is data.
186 */ 186 */
187 if (offset >= inode->i_size) { 187 if (offset >= inode->i_size) {
188 retval = -ENXIO; 188 retval = -ENXIO;
189 goto out; 189 goto out;
190 } 190 }
191 break; 191 break;
192 case SEEK_HOLE: 192 case SEEK_HOLE:
193 /* 193 /*
194 * There is a virtual hole at the end of the file, so 194 * There is a virtual hole at the end of the file, so
195 * as long as offset isn't i_size or larger, return 195 * as long as offset isn't i_size or larger, return
196 * i_size. 196 * i_size.
197 */ 197 */
198 if (offset >= inode->i_size) { 198 if (offset >= inode->i_size) {
199 retval = -ENXIO; 199 retval = -ENXIO;
200 goto out; 200 goto out;
201 } 201 }
202 offset = inode->i_size; 202 offset = inode->i_size;
203 break; 203 break;
204 } 204 }
205 retval = -EINVAL; 205 retval = -EINVAL;
206 if (offset >= 0 || unsigned_offsets(file)) { 206 if (offset >= 0 || unsigned_offsets(file)) {
207 if (offset != file->f_pos) { 207 if (offset != file->f_pos) {
208 file->f_pos = offset; 208 file->f_pos = offset;
209 file->f_version = 0; 209 file->f_version = 0;
210 } 210 }
211 retval = offset; 211 retval = offset;
212 } 212 }
213 out: 213 out:
214 mutex_unlock(&inode->i_mutex); 214 mutex_unlock(&inode->i_mutex);
215 return retval; 215 return retval;
216 } 216 }
217 EXPORT_SYMBOL(default_llseek); 217 EXPORT_SYMBOL(default_llseek);
218 218
219 loff_t vfs_llseek(struct file *file, loff_t offset, int origin) 219 loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
220 { 220 {
221 loff_t (*fn)(struct file *, loff_t, int); 221 loff_t (*fn)(struct file *, loff_t, int);
222 222
223 fn = no_llseek; 223 fn = no_llseek;
224 if (file->f_mode & FMODE_LSEEK) { 224 if (file->f_mode & FMODE_LSEEK) {
225 if (file->f_op && file->f_op->llseek) 225 if (file->f_op && file->f_op->llseek)
226 fn = file->f_op->llseek; 226 fn = file->f_op->llseek;
227 } 227 }
228 return fn(file, offset, origin); 228 return fn(file, offset, whence);
229 } 229 }
230 EXPORT_SYMBOL(vfs_llseek); 230 EXPORT_SYMBOL(vfs_llseek);
231 231
232 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) 232 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
233 { 233 {
234 off_t retval; 234 off_t retval;
235 struct fd f = fdget(fd); 235 struct fd f = fdget(fd);
236 if (!f.file) 236 if (!f.file)
237 return -EBADF; 237 return -EBADF;
238 238
239 retval = -EINVAL; 239 retval = -EINVAL;
240 if (origin <= SEEK_MAX) { 240 if (whence <= SEEK_MAX) {
241 loff_t res = vfs_llseek(f.file, offset, origin); 241 loff_t res = vfs_llseek(f.file, offset, whence);
242 retval = res; 242 retval = res;
243 if (res != (loff_t)retval) 243 if (res != (loff_t)retval)
244 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ 244 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
245 } 245 }
246 fdput(f); 246 fdput(f);
247 return retval; 247 return retval;
248 } 248 }
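From userspace the syscall above is lseek(2); lseek(fd, 0, SEEK_CUR) is the position-query fast path that generic_file_llseek_size() special-cases. A self-contained sketch:

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/etc/hostname", O_RDONLY);
                off_t size, pos;

                if (fd < 0)
                        return 1;
                size = lseek(fd, 0, SEEK_END);  /* offset is added to EOF */
                pos = lseek(fd, 0, SEEK_CUR);   /* query position, moves nothing */
                printf("size=%lld pos=%lld\n", (long long)size, (long long)pos);
                close(fd);
                return 0;
        }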
249 249
250 #ifdef __ARCH_WANT_SYS_LLSEEK 250 #ifdef __ARCH_WANT_SYS_LLSEEK
251 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, 251 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
252 unsigned long, offset_low, loff_t __user *, result, 252 unsigned long, offset_low, loff_t __user *, result,
253 unsigned int, origin) 253 unsigned int, whence)
254 { 254 {
255 int retval; 255 int retval;
256 struct fd f = fdget(fd); 256 struct fd f = fdget(fd);
257 loff_t offset; 257 loff_t offset;
258 258
259 if (!f.file) 259 if (!f.file)
260 return -EBADF; 260 return -EBADF;
261 261
262 retval = -EINVAL; 262 retval = -EINVAL;
263 if (origin > SEEK_MAX) 263 if (whence > SEEK_MAX)
264 goto out_putf; 264 goto out_putf;
265 265
266 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, 266 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
267 origin); 267 whence);
268 268
269 retval = (int)offset; 269 retval = (int)offset;
270 if (offset >= 0) { 270 if (offset >= 0) {
271 retval = -EFAULT; 271 retval = -EFAULT;
272 if (!copy_to_user(result, &offset, sizeof(offset))) 272 if (!copy_to_user(result, &offset, sizeof(offset)))
273 retval = 0; 273 retval = 0;
274 } 274 }
275 out_putf: 275 out_putf:
276 fdput(f); 276 fdput(f);
277 return retval; 277 return retval;
278 } 278 }
279 #endif 279 #endif
280 280
281 281
282 /* 282 /*
283 * rw_verify_area doesn't like huge counts. We limit 283 * rw_verify_area doesn't like huge counts. We limit
284 * them to something that fits in "int" so that others 284 * them to something that fits in "int" so that others
285 * won't have to do range checks all the time. 285 * won't have to do range checks all the time.
286 */ 286 */
287 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) 287 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
288 { 288 {
289 struct inode *inode; 289 struct inode *inode;
290 loff_t pos; 290 loff_t pos;
291 int retval = -EINVAL; 291 int retval = -EINVAL;
292 292
293 inode = file->f_path.dentry->d_inode; 293 inode = file->f_path.dentry->d_inode;
294 if (unlikely((ssize_t) count < 0)) 294 if (unlikely((ssize_t) count < 0))
295 return retval; 295 return retval;
296 pos = *ppos; 296 pos = *ppos;
297 if (unlikely(pos < 0)) { 297 if (unlikely(pos < 0)) {
298 if (!unsigned_offsets(file)) 298 if (!unsigned_offsets(file))
299 return retval; 299 return retval;
300 if (count >= -pos) /* both values are in 0..LLONG_MAX */ 300 if (count >= -pos) /* both values are in 0..LLONG_MAX */
301 return -EOVERFLOW; 301 return -EOVERFLOW;
302 } else if (unlikely((loff_t) (pos + count) < 0)) { 302 } else if (unlikely((loff_t) (pos + count) < 0)) {
303 if (!unsigned_offsets(file)) 303 if (!unsigned_offsets(file))
304 return retval; 304 return retval;
305 } 305 }
306 306
307 if (unlikely(inode->i_flock && mandatory_lock(inode))) { 307 if (unlikely(inode->i_flock && mandatory_lock(inode))) {
308 retval = locks_mandatory_area( 308 retval = locks_mandatory_area(
309 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, 309 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
310 inode, file, pos, count); 310 inode, file, pos, count);
311 if (retval < 0) 311 if (retval < 0)
312 return retval; 312 return retval;
313 } 313 }
314 retval = security_file_permission(file, 314 retval = security_file_permission(file,
315 read_write == READ ? MAY_READ : MAY_WRITE); 315 read_write == READ ? MAY_READ : MAY_WRITE);
316 if (retval) 316 if (retval)
317 return retval; 317 return retval;
318 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; 318 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
319 } 319 }
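Note the cap in the return value: MAX_RW_COUNT is INT_MAX & PAGE_MASK (0x7ffff000 with 4 KiB pages), so a single read() or write() can transfer less than requested even on a regular file. A userspace sketch of the loop callers need; read_full is a hypothetical helper:

        #include <unistd.h>

        /* Hypothetical helper: keep reading until count bytes, EOF or error */
        static ssize_t read_full(int fd, char *buf, size_t count)
        {
                size_t done = 0;

                while (done < count) {
                        ssize_t n = read(fd, buf + done, count - done);

                        if (n < 0)
                                return done ? (ssize_t)done : -1;
                        if (n == 0)             /* EOF */
                                break;
                        done += n;              /* short read, e.g. the MAX_RW_COUNT cap */
                }
                return done;
        }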
320 320
321 static void wait_on_retry_sync_kiocb(struct kiocb *iocb) 321 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
322 { 322 {
323 set_current_state(TASK_UNINTERRUPTIBLE); 323 set_current_state(TASK_UNINTERRUPTIBLE);
324 if (!kiocbIsKicked(iocb)) 324 if (!kiocbIsKicked(iocb))
325 schedule(); 325 schedule();
326 else 326 else
327 kiocbClearKicked(iocb); 327 kiocbClearKicked(iocb);
328 __set_current_state(TASK_RUNNING); 328 __set_current_state(TASK_RUNNING);
329 } 329 }
330 330
331 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 331 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
332 { 332 {
333 struct iovec iov = { .iov_base = buf, .iov_len = len }; 333 struct iovec iov = { .iov_base = buf, .iov_len = len };
334 struct kiocb kiocb; 334 struct kiocb kiocb;
335 ssize_t ret; 335 ssize_t ret;
336 336
337 init_sync_kiocb(&kiocb, filp); 337 init_sync_kiocb(&kiocb, filp);
338 kiocb.ki_pos = *ppos; 338 kiocb.ki_pos = *ppos;
339 kiocb.ki_left = len; 339 kiocb.ki_left = len;
340 kiocb.ki_nbytes = len; 340 kiocb.ki_nbytes = len;
341 341
342 for (;;) { 342 for (;;) {
343 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 343 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
344 if (ret != -EIOCBRETRY) 344 if (ret != -EIOCBRETRY)
345 break; 345 break;
346 wait_on_retry_sync_kiocb(&kiocb); 346 wait_on_retry_sync_kiocb(&kiocb);
347 } 347 }
348 348
349 if (-EIOCBQUEUED == ret) 349 if (-EIOCBQUEUED == ret)
350 ret = wait_on_sync_kiocb(&kiocb); 350 ret = wait_on_sync_kiocb(&kiocb);
351 *ppos = kiocb.ki_pos; 351 *ppos = kiocb.ki_pos;
352 return ret; 352 return ret;
353 } 353 }
354 354
355 EXPORT_SYMBOL(do_sync_read); 355 EXPORT_SYMBOL(do_sync_read);
356 356
357 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) 357 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
358 { 358 {
359 ssize_t ret; 359 ssize_t ret;
360 360
361 if (!(file->f_mode & FMODE_READ)) 361 if (!(file->f_mode & FMODE_READ))
362 return -EBADF; 362 return -EBADF;
363 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) 363 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
364 return -EINVAL; 364 return -EINVAL;
365 if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) 365 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
366 return -EFAULT; 366 return -EFAULT;
367 367
368 ret = rw_verify_area(READ, file, pos, count); 368 ret = rw_verify_area(READ, file, pos, count);
369 if (ret >= 0) { 369 if (ret >= 0) {
370 count = ret; 370 count = ret;
371 if (file->f_op->read) 371 if (file->f_op->read)
372 ret = file->f_op->read(file, buf, count, pos); 372 ret = file->f_op->read(file, buf, count, pos);
373 else 373 else
374 ret = do_sync_read(file, buf, count, pos); 374 ret = do_sync_read(file, buf, count, pos);
375 if (ret > 0) { 375 if (ret > 0) {
376 fsnotify_access(file); 376 fsnotify_access(file);
377 add_rchar(current, ret); 377 add_rchar(current, ret);
378 } 378 }
379 inc_syscr(current); 379 inc_syscr(current);
380 } 380 }
381 381
382 return ret; 382 return ret;
383 } 383 }
384 384
385 EXPORT_SYMBOL(vfs_read); 385 EXPORT_SYMBOL(vfs_read);
386 386
387 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) 387 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
388 { 388 {
389 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; 389 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
390 struct kiocb kiocb; 390 struct kiocb kiocb;
391 ssize_t ret; 391 ssize_t ret;
392 392
393 init_sync_kiocb(&kiocb, filp); 393 init_sync_kiocb(&kiocb, filp);
394 kiocb.ki_pos = *ppos; 394 kiocb.ki_pos = *ppos;
395 kiocb.ki_left = len; 395 kiocb.ki_left = len;
396 kiocb.ki_nbytes = len; 396 kiocb.ki_nbytes = len;
397 397
398 for (;;) { 398 for (;;) {
399 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 399 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
400 if (ret != -EIOCBRETRY) 400 if (ret != -EIOCBRETRY)
401 break; 401 break;
402 wait_on_retry_sync_kiocb(&kiocb); 402 wait_on_retry_sync_kiocb(&kiocb);
403 } 403 }
404 404
405 if (-EIOCBQUEUED == ret) 405 if (-EIOCBQUEUED == ret)
406 ret = wait_on_sync_kiocb(&kiocb); 406 ret = wait_on_sync_kiocb(&kiocb);
407 *ppos = kiocb.ki_pos; 407 *ppos = kiocb.ki_pos;
408 return ret; 408 return ret;
409 } 409 }
410 410
411 EXPORT_SYMBOL(do_sync_write); 411 EXPORT_SYMBOL(do_sync_write);
412 412
413 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) 413 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
414 { 414 {
415 ssize_t ret; 415 ssize_t ret;
416 416
417 if (!(file->f_mode & FMODE_WRITE)) 417 if (!(file->f_mode & FMODE_WRITE))
418 return -EBADF; 418 return -EBADF;
419 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) 419 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
420 return -EINVAL; 420 return -EINVAL;
421 if (unlikely(!access_ok(VERIFY_READ, buf, count))) 421 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
422 return -EFAULT; 422 return -EFAULT;
423 423
424 ret = rw_verify_area(WRITE, file, pos, count); 424 ret = rw_verify_area(WRITE, file, pos, count);
425 if (ret >= 0) { 425 if (ret >= 0) {
426 count = ret; 426 count = ret;
427 if (file->f_op->write) 427 if (file->f_op->write)
428 ret = file->f_op->write(file, buf, count, pos); 428 ret = file->f_op->write(file, buf, count, pos);
429 else 429 else
430 ret = do_sync_write(file, buf, count, pos); 430 ret = do_sync_write(file, buf, count, pos);
431 if (ret > 0) { 431 if (ret > 0) {
432 fsnotify_modify(file); 432 fsnotify_modify(file);
433 add_wchar(current, ret); 433 add_wchar(current, ret);
434 } 434 }
435 inc_syscw(current); 435 inc_syscw(current);
436 } 436 }
437 437
438 return ret; 438 return ret;
439 } 439 }
440 440
441 EXPORT_SYMBOL(vfs_write); 441 EXPORT_SYMBOL(vfs_write);
442 442
443 static inline loff_t file_pos_read(struct file *file) 443 static inline loff_t file_pos_read(struct file *file)
444 { 444 {
445 return file->f_pos; 445 return file->f_pos;
446 } 446 }
447 447
448 static inline void file_pos_write(struct file *file, loff_t pos) 448 static inline void file_pos_write(struct file *file, loff_t pos)
449 { 449 {
450 file->f_pos = pos; 450 file->f_pos = pos;
451 } 451 }
452 452
453 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 453 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
454 { 454 {
455 struct fd f = fdget(fd); 455 struct fd f = fdget(fd);
456 ssize_t ret = -EBADF; 456 ssize_t ret = -EBADF;
457 457
458 if (f.file) { 458 if (f.file) {
459 loff_t pos = file_pos_read(f.file); 459 loff_t pos = file_pos_read(f.file);
460 ret = vfs_read(f.file, buf, count, &pos); 460 ret = vfs_read(f.file, buf, count, &pos);
461 file_pos_write(f.file, pos); 461 file_pos_write(f.file, pos);
462 fdput(f); 462 fdput(f);
463 } 463 }
464 return ret; 464 return ret;
465 } 465 }
466 466
467 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, 467 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
468 size_t, count) 468 size_t, count)
469 { 469 {
470 struct fd f = fdget(fd); 470 struct fd f = fdget(fd);
471 ssize_t ret = -EBADF; 471 ssize_t ret = -EBADF;
472 472
473 if (f.file) { 473 if (f.file) {
474 loff_t pos = file_pos_read(f.file); 474 loff_t pos = file_pos_read(f.file);
475 ret = vfs_write(f.file, buf, count, &pos); 475 ret = vfs_write(f.file, buf, count, &pos);
476 file_pos_write(f.file, pos); 476 file_pos_write(f.file, pos);
477 fdput(f); 477 fdput(f);
478 } 478 }
479 479
480 return ret; 480 return ret;
481 } 481 }
482 482
483 SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, 483 SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
484 size_t count, loff_t pos) 484 size_t count, loff_t pos)
485 { 485 {
486 struct fd f; 486 struct fd f;
487 ssize_t ret = -EBADF; 487 ssize_t ret = -EBADF;
488 488
489 if (pos < 0) 489 if (pos < 0)
490 return -EINVAL; 490 return -EINVAL;
491 491
492 f = fdget(fd); 492 f = fdget(fd);
493 if (f.file) { 493 if (f.file) {
494 ret = -ESPIPE; 494 ret = -ESPIPE;
495 if (f.file->f_mode & FMODE_PREAD) 495 if (f.file->f_mode & FMODE_PREAD)
496 ret = vfs_read(f.file, buf, count, &pos); 496 ret = vfs_read(f.file, buf, count, &pos);
497 fdput(f); 497 fdput(f);
498 } 498 }
499 499
500 return ret; 500 return ret;
501 } 501 }
502 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 502 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
503 asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) 503 asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
504 { 504 {
505 return SYSC_pread64((unsigned int) fd, (char __user *) buf, 505 return SYSC_pread64((unsigned int) fd, (char __user *) buf,
506 (size_t) count, pos); 506 (size_t) count, pos);
507 } 507 }
508 SYSCALL_ALIAS(sys_pread64, SyS_pread64); 508 SYSCALL_ALIAS(sys_pread64, SyS_pread64);
509 #endif 509 #endif
510 510
511 SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, 511 SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
512 size_t count, loff_t pos) 512 size_t count, loff_t pos)
513 { 513 {
514 struct fd f; 514 struct fd f;
515 ssize_t ret = -EBADF; 515 ssize_t ret = -EBADF;
516 516
517 if (pos < 0) 517 if (pos < 0)
518 return -EINVAL; 518 return -EINVAL;
519 519
520 f = fdget(fd); 520 f = fdget(fd);
521 if (f.file) { 521 if (f.file) {
522 ret = -ESPIPE; 522 ret = -ESPIPE;
523 if (f.file->f_mode & FMODE_PWRITE) 523 if (f.file->f_mode & FMODE_PWRITE)
524 ret = vfs_write(f.file, buf, count, &pos); 524 ret = vfs_write(f.file, buf, count, &pos);
525 fdput(f); 525 fdput(f);
526 } 526 }
527 527
528 return ret; 528 return ret;
529 } 529 }
530 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 530 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
531 asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) 531 asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
532 { 532 {
533 return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, 533 return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
534 (size_t) count, pos); 534 (size_t) count, pos);
535 } 535 }
536 SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); 536 SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
537 #endif 537 #endif
538 538
539 /* 539 /*
540 * Reduce an iovec's length in-place. Return the resulting number of segments. 540 * Reduce an iovec's length in-place. Return the resulting number of segments.
541 */ 541 */
542 unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) 542 unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
543 { 543 {
544 unsigned long seg = 0; 544 unsigned long seg = 0;
545 size_t len = 0; 545 size_t len = 0;
546 546
547 while (seg < nr_segs) { 547 while (seg < nr_segs) {
548 seg++; 548 seg++;
549 if (len + iov->iov_len >= to) { 549 if (len + iov->iov_len >= to) {
550 iov->iov_len = to - len; 550 iov->iov_len = to - len;
551 break; 551 break;
552 } 552 }
553 len += iov->iov_len; 553 len += iov->iov_len;
554 iov++; 554 iov++;
555 } 555 }
556 return seg; 556 return seg;
557 } 557 }
558 EXPORT_SYMBOL(iov_shorten); 558 EXPORT_SYMBOL(iov_shorten);
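A worked example of the helper's semantics (a reading aid, not part of the commit), with a, b and c standing for arbitrary buffers:

        struct iovec v[3] = {
                { .iov_base = a, .iov_len = 100 },
                { .iov_base = b, .iov_len = 100 },
                { .iov_base = c, .iov_len = 100 },
        };
        unsigned long segs = iov_shorten(v, 3, 150);
        /* segs == 2; v[1].iov_len is truncated to 50, v[2] drops out */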
559 559
560 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, 560 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
561 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) 561 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
562 { 562 {
563 struct kiocb kiocb; 563 struct kiocb kiocb;
564 ssize_t ret; 564 ssize_t ret;
565 565
566 init_sync_kiocb(&kiocb, filp); 566 init_sync_kiocb(&kiocb, filp);
567 kiocb.ki_pos = *ppos; 567 kiocb.ki_pos = *ppos;
568 kiocb.ki_left = len; 568 kiocb.ki_left = len;
569 kiocb.ki_nbytes = len; 569 kiocb.ki_nbytes = len;
570 570
571 for (;;) { 571 for (;;) {
572 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); 572 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
573 if (ret != -EIOCBRETRY) 573 if (ret != -EIOCBRETRY)
574 break; 574 break;
575 wait_on_retry_sync_kiocb(&kiocb); 575 wait_on_retry_sync_kiocb(&kiocb);
576 } 576 }
577 577
578 if (ret == -EIOCBQUEUED) 578 if (ret == -EIOCBQUEUED)
579 ret = wait_on_sync_kiocb(&kiocb); 579 ret = wait_on_sync_kiocb(&kiocb);
580 *ppos = kiocb.ki_pos; 580 *ppos = kiocb.ki_pos;
581 return ret; 581 return ret;
582 } 582 }
583 583
584 /* Do it by hand, with file-ops */ 584 /* Do it by hand, with file-ops */
585 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, 585 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
586 unsigned long nr_segs, loff_t *ppos, io_fn_t fn) 586 unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
587 { 587 {
588 struct iovec *vector = iov; 588 struct iovec *vector = iov;
589 ssize_t ret = 0; 589 ssize_t ret = 0;
590 590
591 while (nr_segs > 0) { 591 while (nr_segs > 0) {
592 void __user *base; 592 void __user *base;
593 size_t len; 593 size_t len;
594 ssize_t nr; 594 ssize_t nr;
595 595
596 base = vector->iov_base; 596 base = vector->iov_base;
597 len = vector->iov_len; 597 len = vector->iov_len;
598 vector++; 598 vector++;
599 nr_segs--; 599 nr_segs--;
600 600
601 nr = fn(filp, base, len, ppos); 601 nr = fn(filp, base, len, ppos);
602 602
603 if (nr < 0) { 603 if (nr < 0) {
604 if (!ret) 604 if (!ret)
605 ret = nr; 605 ret = nr;
606 break; 606 break;
607 } 607 }
608 ret += nr; 608 ret += nr;
609 if (nr != len) 609 if (nr != len)
610 break; 610 break;
611 } 611 }
612 612
613 return ret; 613 return ret;
614 } 614 }
615 615
616 /* A write operation does a read from user space and vice versa */ 616 /* A write operation does a read from user space and vice versa */
617 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) 617 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
618 618
619 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 619 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
620 unsigned long nr_segs, unsigned long fast_segs, 620 unsigned long nr_segs, unsigned long fast_segs,
621 struct iovec *fast_pointer, 621 struct iovec *fast_pointer,
622 struct iovec **ret_pointer) 622 struct iovec **ret_pointer)
623 { 623 {
624 unsigned long seg; 624 unsigned long seg;
625 ssize_t ret; 625 ssize_t ret;
626 struct iovec *iov = fast_pointer; 626 struct iovec *iov = fast_pointer;
627 627
628 /* 628 /*
629 * SuS says "The readv() function *may* fail if the iovcnt argument 629 * SuS says "The readv() function *may* fail if the iovcnt argument
630 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has 630 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has
631 * traditionally returned zero for zero segments, so... 631 * traditionally returned zero for zero segments, so...
632 */ 632 */
633 if (nr_segs == 0) { 633 if (nr_segs == 0) {
634 ret = 0; 634 ret = 0;
635 goto out; 635 goto out;
636 } 636 }
637 637
638 /* 638 /*
639 * First get the "struct iovec" from user memory and 639 * First get the "struct iovec" from user memory and
640 * verify all the pointers 640 * verify all the pointers
641 */ 641 */
642 if (nr_segs > UIO_MAXIOV) { 642 if (nr_segs > UIO_MAXIOV) {
643 ret = -EINVAL; 643 ret = -EINVAL;
644 goto out; 644 goto out;
645 } 645 }
646 if (nr_segs > fast_segs) { 646 if (nr_segs > fast_segs) {
647 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 647 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
648 if (iov == NULL) { 648 if (iov == NULL) {
649 ret = -ENOMEM; 649 ret = -ENOMEM;
650 goto out; 650 goto out;
651 } 651 }
652 } 652 }
653 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { 653 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
654 ret = -EFAULT; 654 ret = -EFAULT;
655 goto out; 655 goto out;
656 } 656 }
657 657
658 /* 658 /*
659 * According to the Single Unix Specification we should return EINVAL 659 * According to the Single Unix Specification we should return EINVAL
660 * if an element length is < 0 when cast to ssize_t or if the 660 * if an element length is < 0 when cast to ssize_t or if the
661 * total length would overflow the ssize_t return value of the 661 * total length would overflow the ssize_t return value of the
662 * system call. 662 * system call.
663 * 663 *
664 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the 664 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
665 * overflow case. 665 * overflow case.
666 */ 666 */
667 ret = 0; 667 ret = 0;
668 for (seg = 0; seg < nr_segs; seg++) { 668 for (seg = 0; seg < nr_segs; seg++) {
669 void __user *buf = iov[seg].iov_base; 669 void __user *buf = iov[seg].iov_base;
670 ssize_t len = (ssize_t)iov[seg].iov_len; 670 ssize_t len = (ssize_t)iov[seg].iov_len;
671 671
672 /* see if we're about to use an invalid len or if 672 /* see if we're about to use an invalid len or if
673 * it's about to overflow ssize_t */ 673 * it's about to overflow ssize_t */
674 if (len < 0) { 674 if (len < 0) {
675 ret = -EINVAL; 675 ret = -EINVAL;
676 goto out; 676 goto out;
677 } 677 }
678 if (type >= 0 678 if (type >= 0
679 && unlikely(!access_ok(vrfy_dir(type), buf, len))) { 679 && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
680 ret = -EFAULT; 680 ret = -EFAULT;
681 goto out; 681 goto out;
682 } 682 }
683 if (len > MAX_RW_COUNT - ret) { 683 if (len > MAX_RW_COUNT - ret) {
684 len = MAX_RW_COUNT - ret; 684 len = MAX_RW_COUNT - ret;
685 iov[seg].iov_len = len; 685 iov[seg].iov_len = len;
686 } 686 }
687 ret += len; 687 ret += len;
688 } 688 }
689 out: 689 out:
690 *ret_pointer = iov; 690 *ret_pointer = iov;
691 return ret; 691 return ret;
692 } 692 }
693 693
694 static ssize_t do_readv_writev(int type, struct file *file, 694 static ssize_t do_readv_writev(int type, struct file *file,
695 const struct iovec __user * uvector, 695 const struct iovec __user * uvector,
696 unsigned long nr_segs, loff_t *pos) 696 unsigned long nr_segs, loff_t *pos)
697 { 697 {
698 size_t tot_len; 698 size_t tot_len;
699 struct iovec iovstack[UIO_FASTIOV]; 699 struct iovec iovstack[UIO_FASTIOV];
700 struct iovec *iov = iovstack; 700 struct iovec *iov = iovstack;
701 ssize_t ret; 701 ssize_t ret;
702 io_fn_t fn; 702 io_fn_t fn;
703 iov_fn_t fnv; 703 iov_fn_t fnv;
704 704
705 if (!file->f_op) { 705 if (!file->f_op) {
706 ret = -EINVAL; 706 ret = -EINVAL;
707 goto out; 707 goto out;
708 } 708 }
709 709
710 ret = rw_copy_check_uvector(type, uvector, nr_segs, 710 ret = rw_copy_check_uvector(type, uvector, nr_segs,
711 ARRAY_SIZE(iovstack), iovstack, &iov); 711 ARRAY_SIZE(iovstack), iovstack, &iov);
712 if (ret <= 0) 712 if (ret <= 0)
713 goto out; 713 goto out;
714 714
715 tot_len = ret; 715 tot_len = ret;
716 ret = rw_verify_area(type, file, pos, tot_len); 716 ret = rw_verify_area(type, file, pos, tot_len);
717 if (ret < 0) 717 if (ret < 0)
718 goto out; 718 goto out;
719 719
720 fnv = NULL; 720 fnv = NULL;
721 if (type == READ) { 721 if (type == READ) {
722 fn = file->f_op->read; 722 fn = file->f_op->read;
723 fnv = file->f_op->aio_read; 723 fnv = file->f_op->aio_read;
724 } else { 724 } else {
725 fn = (io_fn_t)file->f_op->write; 725 fn = (io_fn_t)file->f_op->write;
726 fnv = file->f_op->aio_write; 726 fnv = file->f_op->aio_write;
727 } 727 }
728 728
729 if (fnv) 729 if (fnv)
730 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, 730 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
731 pos, fnv); 731 pos, fnv);
732 else 732 else
733 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); 733 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
734 734
735 out: 735 out:
736 if (iov != iovstack) 736 if (iov != iovstack)
737 kfree(iov); 737 kfree(iov);
738 if ((ret + (type == READ)) > 0) { 738 if ((ret + (type == READ)) > 0) {
739 if (type == READ) 739 if (type == READ)
740 fsnotify_access(file); 740 fsnotify_access(file);
741 else 741 else
742 fsnotify_modify(file); 742 fsnotify_modify(file);
743 } 743 }
744 return ret; 744 return ret;
745 } 745 }
746 746
747 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 747 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
748 unsigned long vlen, loff_t *pos) 748 unsigned long vlen, loff_t *pos)
749 { 749 {
750 if (!(file->f_mode & FMODE_READ)) 750 if (!(file->f_mode & FMODE_READ))
751 return -EBADF; 751 return -EBADF;
752 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) 752 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
753 return -EINVAL; 753 return -EINVAL;
754 754
755 return do_readv_writev(READ, file, vec, vlen, pos); 755 return do_readv_writev(READ, file, vec, vlen, pos);
756 } 756 }
757 757
758 EXPORT_SYMBOL(vfs_readv); 758 EXPORT_SYMBOL(vfs_readv);
759 759
760 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, 760 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
761 unsigned long vlen, loff_t *pos) 761 unsigned long vlen, loff_t *pos)
762 { 762 {
763 if (!(file->f_mode & FMODE_WRITE)) 763 if (!(file->f_mode & FMODE_WRITE))
764 return -EBADF; 764 return -EBADF;
765 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) 765 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
766 return -EINVAL; 766 return -EINVAL;
767 767
768 return do_readv_writev(WRITE, file, vec, vlen, pos); 768 return do_readv_writev(WRITE, file, vec, vlen, pos);
769 } 769 }
770 770
771 EXPORT_SYMBOL(vfs_writev); 771 EXPORT_SYMBOL(vfs_writev);
772 772
773 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 773 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
774 unsigned long, vlen) 774 unsigned long, vlen)
775 { 775 {
776 struct fd f = fdget(fd); 776 struct fd f = fdget(fd);
777 ssize_t ret = -EBADF; 777 ssize_t ret = -EBADF;
778 778
779 if (f.file) { 779 if (f.file) {
780 loff_t pos = file_pos_read(f.file); 780 loff_t pos = file_pos_read(f.file);
781 ret = vfs_readv(f.file, vec, vlen, &pos); 781 ret = vfs_readv(f.file, vec, vlen, &pos);
782 file_pos_write(f.file, pos); 782 file_pos_write(f.file, pos);
783 fdput(f); 783 fdput(f);
784 } 784 }
785 785
786 if (ret > 0) 786 if (ret > 0)
787 add_rchar(current, ret); 787 add_rchar(current, ret);
788 inc_syscr(current); 788 inc_syscr(current);
789 return ret; 789 return ret;
790 } 790 }
791 791
792 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 792 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
793 unsigned long, vlen) 793 unsigned long, vlen)
794 { 794 {
795 struct fd f = fdget(fd); 795 struct fd f = fdget(fd);
796 ssize_t ret = -EBADF; 796 ssize_t ret = -EBADF;
797 797
798 if (f.file) { 798 if (f.file) {
799 loff_t pos = file_pos_read(f.file); 799 loff_t pos = file_pos_read(f.file);
800 ret = vfs_writev(f.file, vec, vlen, &pos); 800 ret = vfs_writev(f.file, vec, vlen, &pos);
801 file_pos_write(f.file, pos); 801 file_pos_write(f.file, pos);
802 fdput(f); 802 fdput(f);
803 } 803 }
804 804
805 if (ret > 0) 805 if (ret > 0)
806 add_wchar(current, ret); 806 add_wchar(current, ret);
807 inc_syscw(current); 807 inc_syscw(current);
808 return ret; 808 return ret;
809 } 809 }
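The two syscalls above are readv(2)/writev(2): one call transfers every segment at the shared file position. A minimal userspace sketch of the write side:

        #include <sys/uio.h>
        #include <unistd.h>

        int main(void)
        {
                struct iovec iov[2] = {
                        { .iov_base = "hello, ", .iov_len = 7 },
                        { .iov_base = "writev\n", .iov_len = 7 },
                };

                /* one syscall, both segments, at the current file position */
                ssize_t n = writev(STDOUT_FILENO, iov, 2);

                return n < 0;
        }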
810 810
811 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) 811 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
812 { 812 {
813 #define HALF_LONG_BITS (BITS_PER_LONG / 2) 813 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
814 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; 814 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
815 } 815 }
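A note on pos_from_hilo() above (reading aid, not part of the commit): shifting twice by HALF_LONG_BITS instead of once by BITS_PER_LONG avoids undefined behaviour on 64-bit, where a single shift by 64 would equal the width of loff_t. The two well-defined 32-bit shifts push the high word out entirely there (64-bit userspace passes the whole offset in pos_l), while on 32-bit they place it in the upper half of the loff_t.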
816 816
817 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, 817 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
818 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 818 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
819 { 819 {
820 loff_t pos = pos_from_hilo(pos_h, pos_l); 820 loff_t pos = pos_from_hilo(pos_h, pos_l);
821 struct fd f; 821 struct fd f;
822 ssize_t ret = -EBADF; 822 ssize_t ret = -EBADF;
823 823
824 if (pos < 0) 824 if (pos < 0)
825 return -EINVAL; 825 return -EINVAL;
826 826
827 f = fdget(fd); 827 f = fdget(fd);
828 if (f.file) { 828 if (f.file) {
829 ret = -ESPIPE; 829 ret = -ESPIPE;
830 if (f.file->f_mode & FMODE_PREAD) 830 if (f.file->f_mode & FMODE_PREAD)
831 ret = vfs_readv(f.file, vec, vlen, &pos); 831 ret = vfs_readv(f.file, vec, vlen, &pos);
832 fdput(f); 832 fdput(f);
833 } 833 }
834 834
835 if (ret > 0) 835 if (ret > 0)
836 add_rchar(current, ret); 836 add_rchar(current, ret);
837 inc_syscr(current); 837 inc_syscr(current);
838 return ret; 838 return ret;
839 } 839 }
840 840
841 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, 841 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
842 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 842 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
843 { 843 {
844 loff_t pos = pos_from_hilo(pos_h, pos_l); 844 loff_t pos = pos_from_hilo(pos_h, pos_l);
845 struct fd f; 845 struct fd f;
846 ssize_t ret = -EBADF; 846 ssize_t ret = -EBADF;
847 847
848 if (pos < 0) 848 if (pos < 0)
849 return -EINVAL; 849 return -EINVAL;
850 850
851 f = fdget(fd); 851 f = fdget(fd);
852 if (f.file) { 852 if (f.file) {
853 ret = -ESPIPE; 853 ret = -ESPIPE;
854 if (f.file->f_mode & FMODE_PWRITE) 854 if (f.file->f_mode & FMODE_PWRITE)
855 ret = vfs_writev(f.file, vec, vlen, &pos); 855 ret = vfs_writev(f.file, vec, vlen, &pos);
856 fdput(f); 856 fdput(f);
857 } 857 }
858 858
859 if (ret > 0) 859 if (ret > 0)
860 add_wchar(current, ret); 860 add_wchar(current, ret);
861 inc_syscw(current); 861 inc_syscw(current);
862 return ret; 862 return ret;
863 } 863 }
864 864
865 ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, 865 ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
866 loff_t max) 866 loff_t max)
867 { 867 {
868 struct fd in, out; 868 struct fd in, out;
869 struct inode *in_inode, *out_inode; 869 struct inode *in_inode, *out_inode;
870 loff_t pos; 870 loff_t pos;
871 ssize_t retval; 871 ssize_t retval;
872 int fl; 872 int fl;
873 873
874 /* 874 /*
875 * Get input file, and verify that it is ok.. 875 * Get input file, and verify that it is ok..
876 */ 876 */
877 retval = -EBADF; 877 retval = -EBADF;
878 in = fdget(in_fd); 878 in = fdget(in_fd);
879 if (!in.file) 879 if (!in.file)
880 goto out; 880 goto out;
881 if (!(in.file->f_mode & FMODE_READ)) 881 if (!(in.file->f_mode & FMODE_READ))
882 goto fput_in; 882 goto fput_in;
883 retval = -ESPIPE; 883 retval = -ESPIPE;
884 if (!ppos) 884 if (!ppos)
885 ppos = &in.file->f_pos; 885 ppos = &in.file->f_pos;
886 else 886 else
887 if (!(in.file->f_mode & FMODE_PREAD)) 887 if (!(in.file->f_mode & FMODE_PREAD))
888 goto fput_in; 888 goto fput_in;
889 retval = rw_verify_area(READ, in.file, ppos, count); 889 retval = rw_verify_area(READ, in.file, ppos, count);
890 if (retval < 0) 890 if (retval < 0)
891 goto fput_in; 891 goto fput_in;
892 count = retval; 892 count = retval;
893 893
894 /* 894 /*
895 * Get output file, and verify that it is ok.. 895 * Get output file, and verify that it is ok..
896 */ 896 */
897 retval = -EBADF; 897 retval = -EBADF;
898 out = fdget(out_fd); 898 out = fdget(out_fd);
899 if (!out.file) 899 if (!out.file)
900 goto fput_in; 900 goto fput_in;
901 if (!(out.file->f_mode & FMODE_WRITE)) 901 if (!(out.file->f_mode & FMODE_WRITE))
902 goto fput_out; 902 goto fput_out;
903 retval = -EINVAL; 903 retval = -EINVAL;
904 in_inode = in.file->f_path.dentry->d_inode; 904 in_inode = in.file->f_path.dentry->d_inode;
905 out_inode = out.file->f_path.dentry->d_inode; 905 out_inode = out.file->f_path.dentry->d_inode;
906 retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); 906 retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
907 if (retval < 0) 907 if (retval < 0)
908 goto fput_out; 908 goto fput_out;
909 count = retval; 909 count = retval;
910 910
911 if (!max) 911 if (!max)
912 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 912 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
913 913
914 pos = *ppos; 914 pos = *ppos;
915 if (unlikely(pos + count > max)) { 915 if (unlikely(pos + count > max)) {
916 retval = -EOVERFLOW; 916 retval = -EOVERFLOW;
917 if (pos >= max) 917 if (pos >= max)
918 goto fput_out; 918 goto fput_out;
919 count = max - pos; 919 count = max - pos;
920 } 920 }
921 921
922 fl = 0; 922 fl = 0;
923 #if 0 923 #if 0
924 /* 924 /*
925 * We need to debate whether we can enable this or not. The 925 * We need to debate whether we can enable this or not. The
926 * man page documents EAGAIN return for the output at least, 926 * man page documents EAGAIN return for the output at least,
927 * and the application is arguably buggy if it doesn't expect 927 * and the application is arguably buggy if it doesn't expect
928 * EAGAIN on a non-blocking file descriptor. 928 * EAGAIN on a non-blocking file descriptor.
929 */ 929 */
930 if (in.file->f_flags & O_NONBLOCK) 930 if (in.file->f_flags & O_NONBLOCK)
931 fl = SPLICE_F_NONBLOCK; 931 fl = SPLICE_F_NONBLOCK;
932 #endif 932 #endif
933 retval = do_splice_direct(in.file, ppos, out.file, count, fl); 933 retval = do_splice_direct(in.file, ppos, out.file, count, fl);
934 934
935 if (retval > 0) { 935 if (retval > 0) {
936 add_rchar(current, retval); 936 add_rchar(current, retval);
937 add_wchar(current, retval); 937 add_wchar(current, retval);
938 } 938 }
939 939
940 inc_syscr(current); 940 inc_syscr(current);
941 inc_syscw(current); 941 inc_syscw(current);
942 if (*ppos > max) 942 if (*ppos > max)
943 retval = -EOVERFLOW; 943 retval = -EOVERFLOW;
944 944
945 fput_out: 945 fput_out:
946 fdput(out); 946 fdput(out);
947 fput_in: 947 fput_in:
948 fdput(in); 948 fdput(in);
949 out: 949 out:
950 return retval; 950 return retval;
951 } 951 }
952 952
953 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) 953 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
954 { 954 {
955 loff_t pos; 955 loff_t pos;
956 off_t off; 956 off_t off;
957 ssize_t ret; 957 ssize_t ret;
958 958
959 if (offset) { 959 if (offset) {
960 if (unlikely(get_user(off, offset))) 960 if (unlikely(get_user(off, offset)))
961 return -EFAULT; 961 return -EFAULT;
962 pos = off; 962 pos = off;
963 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 963 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
964 if (unlikely(put_user(pos, offset))) 964 if (unlikely(put_user(pos, offset)))
965 return -EFAULT; 965 return -EFAULT;
966 return ret; 966 return ret;
967 } 967 }
968 968
969 return do_sendfile(out_fd, in_fd, NULL, count, 0); 969 return do_sendfile(out_fd, in_fd, NULL, count, 0);
970 } 970 }
971 971
972 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) 972 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
973 { 973 {
974 loff_t pos; 974 loff_t pos;
975 ssize_t ret; 975 ssize_t ret;
976 976
977 if (offset) { 977 if (offset) {
978 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 978 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
979 return -EFAULT; 979 return -EFAULT;
980 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 980 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
981 if (unlikely(put_user(pos, offset))) 981 if (unlikely(put_user(pos, offset)))
982 return -EFAULT; 982 return -EFAULT;
983 return ret; 983 return ret;
984 } 984 }
985 985
986 return do_sendfile(out_fd, in_fd, NULL, count, 0); 986 return do_sendfile(out_fd, in_fd, NULL, count, 0);
987 } 987 }
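Userspace reaches do_sendfile() through sendfile(2); as the code above shows, supplying an offset pointer leaves the input file's f_pos untouched. A sketch with error handling trimmed:

        #include <fcntl.h>
        #include <sys/sendfile.h>
        #include <sys/stat.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                struct stat st;
                off_t off = 0;
                int in, out;

                if (argc < 3)
                        return 1;
                in = open(argv[1], O_RDONLY);
                out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
                if (in < 0 || out < 0 || fstat(in, &st) < 0)
                        return 1;
                while (off < st.st_size)        /* &off: in's f_pos stays untouched */
                        if (sendfile(out, in, &off, st.st_size - off) < 0)
                                return 1;
                return 0;
        }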
988 988
1 /* 1 /*
2 * linux/fs/seq_file.c 2 * linux/fs/seq_file.c
3 * 3 *
4 * helper functions for making synthetic files from sequences of records. 4 * helper functions for making synthetic files from sequences of records.
5 * initial implementation -- AV, Oct 2001. 5 * initial implementation -- AV, Oct 2001.
6 */ 6 */
7 7
8 #include <linux/fs.h> 8 #include <linux/fs.h>
9 #include <linux/export.h> 9 #include <linux/export.h>
10 #include <linux/seq_file.h> 10 #include <linux/seq_file.h>
11 #include <linux/slab.h> 11 #include <linux/slab.h>
12 #include <linux/cred.h> 12 #include <linux/cred.h>
13 13
14 #include <asm/uaccess.h> 14 #include <asm/uaccess.h>
15 #include <asm/page.h> 15 #include <asm/page.h>
16 16
17 17
18 /* 18 /*
19 * seq_files have a buffer which may overflow. When this happens a larger 19 * seq_files have a buffer which may overflow. When this happens a larger
20 * buffer is reallocated and all the data will be printed again. 20 * buffer is reallocated and all the data will be printed again.
21 * The overflow state is true when m->count == m->size. 21 * The overflow state is true when m->count == m->size.
22 */ 22 */
23 static bool seq_overflow(struct seq_file *m) 23 static bool seq_overflow(struct seq_file *m)
24 { 24 {
25 return m->count == m->size; 25 return m->count == m->size;
26 } 26 }
27 27
28 static void seq_set_overflow(struct seq_file *m) 28 static void seq_set_overflow(struct seq_file *m)
29 { 29 {
30 m->count = m->size; 30 m->count = m->size;
31 } 31 }
32 32
33 /** 33 /**
34 * seq_open - initialize sequential file 34 * seq_open - initialize sequential file
35 * @file: file we initialize 35 * @file: file we initialize
36 * @op: method table describing the sequence 36 * @op: method table describing the sequence
37 * 37 *
38 * seq_open() sets @file, associating it with a sequence described 38 * seq_open() sets @file, associating it with a sequence described
39 * by @op. @op->start() sets the iterator up and returns the first 39 * by @op. @op->start() sets the iterator up and returns the first
40 * element of sequence. @op->stop() shuts it down. @op->next() 40 * element of sequence. @op->stop() shuts it down. @op->next()
41 * returns the next element of sequence. @op->show() prints element 41 * returns the next element of sequence. @op->show() prints element
42 * into the buffer. In case of error ->start() and ->next() return 42 * into the buffer. In case of error ->start() and ->next() return
43 * ERR_PTR(error). In the end of sequence they return %NULL. ->show() 43 * ERR_PTR(error). In the end of sequence they return %NULL. ->show()
44 * returns 0 in case of success and negative number in case of error. 44 * returns 0 in case of success and negative number in case of error.
45 * Returning SEQ_SKIP means "discard this element and move on". 45 * Returning SEQ_SKIP means "discard this element and move on".
46 */ 46 */
47 int seq_open(struct file *file, const struct seq_operations *op) 47 int seq_open(struct file *file, const struct seq_operations *op)
48 { 48 {
49 struct seq_file *p = file->private_data; 49 struct seq_file *p = file->private_data;
50 50
51 if (!p) { 51 if (!p) {
52 p = kmalloc(sizeof(*p), GFP_KERNEL); 52 p = kmalloc(sizeof(*p), GFP_KERNEL);
53 if (!p) 53 if (!p)
54 return -ENOMEM; 54 return -ENOMEM;
55 file->private_data = p; 55 file->private_data = p;
56 } 56 }
57 memset(p, 0, sizeof(*p)); 57 memset(p, 0, sizeof(*p));
58 mutex_init(&p->lock); 58 mutex_init(&p->lock);
59 p->op = op; 59 p->op = op;
60 #ifdef CONFIG_USER_NS 60 #ifdef CONFIG_USER_NS
61 p->user_ns = file->f_cred->user_ns; 61 p->user_ns = file->f_cred->user_ns;
62 #endif 62 #endif
63 63
64 /* 64 /*
65 * Wrappers around seq_open() (e.g. swaps_open) need to be 65 * Wrappers around seq_open() (e.g. swaps_open) need to be
66 * aware of this. If they set f_version themselves, they 66 * aware of this. If they set f_version themselves, they
67 * should call seq_open first and then set f_version. 67 * should call seq_open first and then set f_version.
68 */ 68 */
69 file->f_version = 0; 69 file->f_version = 0;
70 70
71 /* 71 /*
72 * seq_files support lseek() and pread(). They do not implement 72 * seq_files support lseek() and pread(). They do not implement
73 * write() at all, but we clear FMODE_PWRITE here for historical 73 * write() at all, but we clear FMODE_PWRITE here for historical
74 * reasons. 74 * reasons.
75 * 75 *
76 * If a client of seq_files a) implements file.write() and b) wishes to 76 * If a client of seq_files a) implements file.write() and b) wishes to
77 * support pwrite() then that client will need to implement its own 77 * support pwrite() then that client will need to implement its own
78 * file.open() which calls seq_open() and then sets FMODE_PWRITE. 78 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
79 */ 79 */
80 file->f_mode &= ~FMODE_PWRITE; 80 file->f_mode &= ~FMODE_PWRITE;
81 return 0; 81 return 0;
82 } 82 }
83 EXPORT_SYMBOL(seq_open); 83 EXPORT_SYMBOL(seq_open);
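A minimal sketch of the start/next/stop/show contract documented above, walking a fixed three-record sequence; every example_* name is hypothetical. A real user would also set .read = seq_read, .llseek = seq_lseek and .release = seq_release in its file_operations.

        static void *example_start(struct seq_file *m, loff_t *pos)
        {
                return *pos < 3 ? pos : NULL;   /* NULL ends the sequence */
        }

        static void *example_next(struct seq_file *m, void *v, loff_t *pos)
        {
                ++*pos;
                return *pos < 3 ? pos : NULL;
        }

        static void example_stop(struct seq_file *m, void *v)
        {
        }

        static int example_show(struct seq_file *m, void *v)
        {
                seq_printf(m, "record %lld\n", *(loff_t *)v);
                return 0;                       /* or SEQ_SKIP to drop this record */
        }

        static const struct seq_operations example_seq_ops = {
                .start  = example_start,
                .next   = example_next,
                .stop   = example_stop,
                .show   = example_show,
        };

        static int example_open(struct inode *inode, struct file *file)
        {
                return seq_open(file, &example_seq_ops);
        }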
84 84
85 static int traverse(struct seq_file *m, loff_t offset) 85 static int traverse(struct seq_file *m, loff_t offset)
86 { 86 {
87 loff_t pos = 0, index; 87 loff_t pos = 0, index;
88 int error = 0; 88 int error = 0;
89 void *p; 89 void *p;
90 90
91 m->version = 0; 91 m->version = 0;
92 index = 0; 92 index = 0;
93 m->count = m->from = 0; 93 m->count = m->from = 0;
94 if (!offset) { 94 if (!offset) {
95 m->index = index; 95 m->index = index;
96 return 0; 96 return 0;
97 } 97 }
98 if (!m->buf) { 98 if (!m->buf) {
99 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 99 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
100 if (!m->buf) 100 if (!m->buf)
101 return -ENOMEM; 101 return -ENOMEM;
102 } 102 }
103 p = m->op->start(m, &index); 103 p = m->op->start(m, &index);
104 while (p) { 104 while (p) {
105 error = PTR_ERR(p); 105 error = PTR_ERR(p);
106 if (IS_ERR(p)) 106 if (IS_ERR(p))
107 break; 107 break;
108 error = m->op->show(m, p); 108 error = m->op->show(m, p);
109 if (error < 0) 109 if (error < 0)
110 break; 110 break;
111 if (unlikely(error)) { 111 if (unlikely(error)) {
112 error = 0; 112 error = 0;
113 m->count = 0; 113 m->count = 0;
114 } 114 }
115 if (seq_overflow(m)) 115 if (seq_overflow(m))
116 goto Eoverflow; 116 goto Eoverflow;
117 if (pos + m->count > offset) { 117 if (pos + m->count > offset) {
118 m->from = offset - pos; 118 m->from = offset - pos;
119 m->count -= m->from; 119 m->count -= m->from;
120 m->index = index; 120 m->index = index;
121 break; 121 break;
122 } 122 }
123 pos += m->count; 123 pos += m->count;
124 m->count = 0; 124 m->count = 0;
125 if (pos == offset) { 125 if (pos == offset) {
126 index++; 126 index++;
127 m->index = index; 127 m->index = index;
128 break; 128 break;
129 } 129 }
130 p = m->op->next(m, p, &index); 130 p = m->op->next(m, p, &index);
131 } 131 }
132 m->op->stop(m, p); 132 m->op->stop(m, p);
133 m->index = index; 133 m->index = index;
134 return error; 134 return error;
135 135
136 Eoverflow: 136 Eoverflow:
137 m->op->stop(m, p); 137 m->op->stop(m, p);
138 kfree(m->buf); 138 kfree(m->buf);
139 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); 139 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
140 return !m->buf ? -ENOMEM : -EAGAIN; 140 return !m->buf ? -ENOMEM : -EAGAIN;
141 } 141 }
142 142
143 /** 143 /**
144 * seq_read - ->read() method for sequential files. 144 * seq_read - ->read() method for sequential files.
145 * @file: the file to read from 145 * @file: the file to read from
146 * @buf: the buffer to read to 146 * @buf: the buffer to read to
147 * @size: the maximum number of bytes to read 147 * @size: the maximum number of bytes to read
148 * @ppos: the current position in the file 148 * @ppos: the current position in the file
149 * 149 *
150 * Ready-made ->f_op->read() 150 * Ready-made ->f_op->read()
151 */ 151 */
152 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) 152 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
153 { 153 {
154 struct seq_file *m = file->private_data; 154 struct seq_file *m = file->private_data;
155 size_t copied = 0; 155 size_t copied = 0;
156 loff_t pos; 156 loff_t pos;
157 size_t n; 157 size_t n;
158 void *p; 158 void *p;
159 int err = 0; 159 int err = 0;
160 160
161 mutex_lock(&m->lock); 161 mutex_lock(&m->lock);
162 162
163 /* 163 /*
164 * seq_file->op->..m_start/m_stop/m_next may do special actions 164 * seq_file->op->..m_start/m_stop/m_next may do special actions
165 * or optimisations based on the file->f_version, so we want to 165 * or optimisations based on the file->f_version, so we want to
166 * pass the file->f_version to those methods. 166 * pass the file->f_version to those methods.
167 * 167 *
168 * seq_file->version is just a copy of f_version, and seq_file 168 * seq_file->version is just a copy of f_version, and seq_file
169 * methods can treat it simply as file version. 169 * methods can treat it simply as file version.
170 * It is copied in first and copied out after all operations. 170 * It is copied in first and copied out after all operations.
171 * It is convenient to have it as part of the structure to avoid the 171 * It is convenient to have it as part of the structure to avoid the
172 * need to pass another argument to all the seq_file methods. 172 * need to pass another argument to all the seq_file methods.
173 */ 173 */
174 m->version = file->f_version; 174 m->version = file->f_version;
175 175
176 /* Don't assume *ppos is where we left it */ 176 /* Don't assume *ppos is where we left it */
177 if (unlikely(*ppos != m->read_pos)) { 177 if (unlikely(*ppos != m->read_pos)) {
178 while ((err = traverse(m, *ppos)) == -EAGAIN) 178 while ((err = traverse(m, *ppos)) == -EAGAIN)
179 ; 179 ;
180 if (err) { 180 if (err) {
181 /* With prejudice... */ 181 /* With prejudice... */
182 m->read_pos = 0; 182 m->read_pos = 0;
183 m->version = 0; 183 m->version = 0;
184 m->index = 0; 184 m->index = 0;
185 m->count = 0; 185 m->count = 0;
186 goto Done; 186 goto Done;
187 } else { 187 } else {
188 m->read_pos = *ppos; 188 m->read_pos = *ppos;
189 } 189 }
190 } 190 }
191 191
192 /* grab buffer if we didn't have one */ 192 /* grab buffer if we didn't have one */
193 if (!m->buf) { 193 if (!m->buf) {
194 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 194 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
195 if (!m->buf) 195 if (!m->buf)
196 goto Enomem; 196 goto Enomem;
197 } 197 }
198 /* if not empty - flush it first */ 198 /* if not empty - flush it first */
199 if (m->count) { 199 if (m->count) {
200 n = min(m->count, size); 200 n = min(m->count, size);
201 err = copy_to_user(buf, m->buf + m->from, n); 201 err = copy_to_user(buf, m->buf + m->from, n);
202 if (err) 202 if (err)
203 goto Efault; 203 goto Efault;
204 m->count -= n; 204 m->count -= n;
205 m->from += n; 205 m->from += n;
206 size -= n; 206 size -= n;
207 buf += n; 207 buf += n;
208 copied += n; 208 copied += n;
209 if (!m->count) 209 if (!m->count)
210 m->index++; 210 m->index++;
211 if (!size) 211 if (!size)
212 goto Done; 212 goto Done;
213 } 213 }
214 /* we need at least one record in buffer */ 214 /* we need at least one record in buffer */
215 pos = m->index; 215 pos = m->index;
216 p = m->op->start(m, &pos); 216 p = m->op->start(m, &pos);
217 while (1) { 217 while (1) {
218 err = PTR_ERR(p); 218 err = PTR_ERR(p);
219 if (!p || IS_ERR(p)) 219 if (!p || IS_ERR(p))
220 break; 220 break;
221 err = m->op->show(m, p); 221 err = m->op->show(m, p);
222 if (err < 0) 222 if (err < 0)
223 break; 223 break;
224 if (unlikely(err)) 224 if (unlikely(err))
225 m->count = 0; 225 m->count = 0;
226 if (unlikely(!m->count)) { 226 if (unlikely(!m->count)) {
227 p = m->op->next(m, p, &pos); 227 p = m->op->next(m, p, &pos);
228 m->index = pos; 228 m->index = pos;
229 continue; 229 continue;
230 } 230 }
231 if (m->count < m->size) 231 if (m->count < m->size)
232 goto Fill; 232 goto Fill;
233 m->op->stop(m, p); 233 m->op->stop(m, p);
234 kfree(m->buf); 234 kfree(m->buf);
235 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); 235 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
236 if (!m->buf) 236 if (!m->buf)
237 goto Enomem; 237 goto Enomem;
238 m->count = 0; 238 m->count = 0;
239 m->version = 0; 239 m->version = 0;
240 pos = m->index; 240 pos = m->index;
241 p = m->op->start(m, &pos); 241 p = m->op->start(m, &pos);
242 } 242 }
243 m->op->stop(m, p); 243 m->op->stop(m, p);
244 m->count = 0; 244 m->count = 0;
245 goto Done; 245 goto Done;
246 Fill: 246 Fill:
247 /* they want more? let's try to get some more */ 247 /* they want more? let's try to get some more */
248 while (m->count < size) { 248 while (m->count < size) {
249 size_t offs = m->count; 249 size_t offs = m->count;
250 loff_t next = pos; 250 loff_t next = pos;
251 p = m->op->next(m, p, &next); 251 p = m->op->next(m, p, &next);
252 if (!p || IS_ERR(p)) { 252 if (!p || IS_ERR(p)) {
253 err = PTR_ERR(p); 253 err = PTR_ERR(p);
254 break; 254 break;
255 } 255 }
256 err = m->op->show(m, p); 256 err = m->op->show(m, p);
257 if (seq_overflow(m) || err) { 257 if (seq_overflow(m) || err) {
258 m->count = offs; 258 m->count = offs;
259 if (likely(err <= 0)) 259 if (likely(err <= 0))
260 break; 260 break;
261 } 261 }
262 pos = next; 262 pos = next;
263 } 263 }
264 m->op->stop(m, p); 264 m->op->stop(m, p);
265 n = min(m->count, size); 265 n = min(m->count, size);
266 err = copy_to_user(buf, m->buf, n); 266 err = copy_to_user(buf, m->buf, n);
267 if (err) 267 if (err)
268 goto Efault; 268 goto Efault;
269 copied += n; 269 copied += n;
270 m->count -= n; 270 m->count -= n;
271 if (m->count) 271 if (m->count)
272 m->from = n; 272 m->from = n;
273 else 273 else
274 pos++; 274 pos++;
275 m->index = pos; 275 m->index = pos;
276 Done: 276 Done:
277 if (!copied) 277 if (!copied)
278 copied = err; 278 copied = err;
279 else { 279 else {
280 *ppos += copied; 280 *ppos += copied;
281 m->read_pos += copied; 281 m->read_pos += copied;
282 } 282 }
283 file->f_version = m->version; 283 file->f_version = m->version;
284 mutex_unlock(&m->lock); 284 mutex_unlock(&m->lock);
285 return copied; 285 return copied;
286 Enomem: 286 Enomem:
287 err = -ENOMEM; 287 err = -ENOMEM;
288 goto Done; 288 goto Done;
289 Efault: 289 Efault:
290 err = -EFAULT; 290 err = -EFAULT;
291 goto Done; 291 goto Done;
292 } 292 }
293 EXPORT_SYMBOL(seq_read); 293 EXPORT_SYMBOL(seq_read);
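Since seq_read() is the ready-made ->read(), a virtual file normally just wires it into its file_operations next to seq_lseek() and seq_release(). A minimal sketch, assuming a hypothetical foo_seq_ops iterator:

    static int foo_open(struct inode *inode, struct file *file)
    {
            return seq_open(file, &foo_seq_ops); /* start/next/stop/show */
    }

    static const struct file_operations foo_fops = {
            .owner   = THIS_MODULE,
            .open    = foo_open,
            .read    = seq_read,    /* all the buffering above for free */
            .llseek  = seq_lseek,
            .release = seq_release,
    };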
294 294
295 /** 295 /**
296 * seq_lseek - ->llseek() method for sequential files. 296 * seq_lseek - ->llseek() method for sequential files.
297 * @file: the file in question 297 * @file: the file in question
298 * @offset: new position 298 * @offset: new position
299 * @origin: 0 for absolute, 1 for relative position 299 * @whence: 0 for absolute, 1 for relative position
300 * 300 *
301 * Ready-made ->f_op->llseek() 301 * Ready-made ->f_op->llseek()
302 */ 302 */
303 loff_t seq_lseek(struct file *file, loff_t offset, int origin) 303 loff_t seq_lseek(struct file *file, loff_t offset, int whence)
304 { 304 {
305 struct seq_file *m = file->private_data; 305 struct seq_file *m = file->private_data;
306 loff_t retval = -EINVAL; 306 loff_t retval = -EINVAL;
307 307
308 mutex_lock(&m->lock); 308 mutex_lock(&m->lock);
309 m->version = file->f_version; 309 m->version = file->f_version;
310 switch (origin) { 310 switch (whence) {
311 case 1: 311 case 1:
312 offset += file->f_pos; 312 offset += file->f_pos;
313 case 0: 313 case 0:
314 if (offset < 0) 314 if (offset < 0)
315 break; 315 break;
316 retval = offset; 316 retval = offset;
317 if (offset != m->read_pos) { 317 if (offset != m->read_pos) {
318 while ((retval=traverse(m, offset)) == -EAGAIN) 318 while ((retval=traverse(m, offset)) == -EAGAIN)
319 ; 319 ;
320 if (retval) { 320 if (retval) {
321 /* with extreme prejudice... */ 321 /* with extreme prejudice... */
322 file->f_pos = 0; 322 file->f_pos = 0;
323 m->read_pos = 0; 323 m->read_pos = 0;
324 m->version = 0; 324 m->version = 0;
325 m->index = 0; 325 m->index = 0;
326 m->count = 0; 326 m->count = 0;
327 } else { 327 } else {
328 m->read_pos = offset; 328 m->read_pos = offset;
329 retval = file->f_pos = offset; 329 retval = file->f_pos = offset;
330 } 330 }
331 } 331 }
332 } 332 }
333 file->f_version = m->version; 333 file->f_version = m->version;
334 mutex_unlock(&m->lock); 334 mutex_unlock(&m->lock);
335 return retval; 335 return retval;
336 } 336 }
337 EXPORT_SYMBOL(seq_lseek); 337 EXPORT_SYMBOL(seq_lseek);
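Note that the switch above honours only the numeric whence values 0 (SEEK_SET) and 1 (SEEK_CUR); anything else falls out of the switch and yields -EINVAL. A hedged userspace illustration, assuming fd is an open seq_file-backed descriptor such as /proc/mounts:

    #include <unistd.h>   /* lseek(), SEEK_SET/SEEK_CUR/SEEK_END */

    lseek(fd, 0, SEEK_SET);   /* ok: traverse() re-walks to the new offset */
    lseek(fd, 64, SEEK_CUR);  /* ok: offset += f_pos, then the same check */
    lseek(fd, 0, SEEK_END);   /* fails with EINVAL: no case 2 in the switch */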
338 338
339 /** 339 /**
340 * seq_release - free the structures associated with sequential file. 340 * seq_release - free the structures associated with sequential file.
341 * @file: file in question 341 * @file: file in question
342 * @inode: file->f_path.dentry->d_inode 342 * @inode: file->f_path.dentry->d_inode
343 * 343 *
344 * Frees the structures associated with sequential file; can be used 344 * Frees the structures associated with sequential file; can be used
345 * as ->f_op->release() if you don't have private data to destroy. 345 * as ->f_op->release() if you don't have private data to destroy.
346 */ 346 */
347 int seq_release(struct inode *inode, struct file *file) 347 int seq_release(struct inode *inode, struct file *file)
348 { 348 {
349 struct seq_file *m = file->private_data; 349 struct seq_file *m = file->private_data;
350 kfree(m->buf); 350 kfree(m->buf);
351 kfree(m); 351 kfree(m);
352 return 0; 352 return 0;
353 } 353 }
354 EXPORT_SYMBOL(seq_release); 354 EXPORT_SYMBOL(seq_release);
355 355
356 /** 356 /**
357 * seq_escape - print string into buffer, escaping some characters 357 * seq_escape - print string into buffer, escaping some characters
358 * @m: target buffer 358 * @m: target buffer
359 * @s: string 359 * @s: string
360 * @esc: set of characters that need escaping 360 * @esc: set of characters that need escaping
361 * 361 *
362 * Puts string into buffer, replacing each occurrence of character from 362 * Puts string into buffer, replacing each occurrence of character from
363 * @esc with usual octal escape. Returns 0 in case of success, -1 in 363 * @esc with usual octal escape. Returns 0 in case of success, -1 in
364 * case of overflow. 364 * case of overflow.
365 */ 365 */
366 int seq_escape(struct seq_file *m, const char *s, const char *esc) 366 int seq_escape(struct seq_file *m, const char *s, const char *esc)
367 { 367 {
368 char *end = m->buf + m->size; 368 char *end = m->buf + m->size;
369 char *p; 369 char *p;
370 char c; 370 char c;
371 371
372 for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { 372 for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
373 if (!strchr(esc, c)) { 373 if (!strchr(esc, c)) {
374 *p++ = c; 374 *p++ = c;
375 continue; 375 continue;
376 } 376 }
377 if (p + 3 < end) { 377 if (p + 3 < end) {
378 *p++ = '\\'; 378 *p++ = '\\';
379 *p++ = '0' + ((c & 0300) >> 6); 379 *p++ = '0' + ((c & 0300) >> 6);
380 *p++ = '0' + ((c & 070) >> 3); 380 *p++ = '0' + ((c & 070) >> 3);
381 *p++ = '0' + (c & 07); 381 *p++ = '0' + (c & 07);
382 continue; 382 continue;
383 } 383 }
384 seq_set_overflow(m); 384 seq_set_overflow(m);
385 return -1; 385 return -1;
386 } 386 }
387 m->count = p - m->buf; 387 m->count = p - m->buf;
388 return 0; 388 return 0;
389 } 389 }
390 EXPORT_SYMBOL(seq_escape); 390 EXPORT_SYMBOL(seq_escape);
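seq_escape() is what keeps records parseable when names contain separators, e.g. mount points with spaces in /proc/mounts. A hypothetical call from a ->show() method:

    /* "a b" is emitted as "a\040b"; name is a hypothetical string */
    seq_escape(m, name, " \t\n\\");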
391 391
392 int seq_vprintf(struct seq_file *m, const char *f, va_list args) 392 int seq_vprintf(struct seq_file *m, const char *f, va_list args)
393 { 393 {
394 int len; 394 int len;
395 395
396 if (m->count < m->size) { 396 if (m->count < m->size) {
397 len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); 397 len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
398 if (m->count + len < m->size) { 398 if (m->count + len < m->size) {
399 m->count += len; 399 m->count += len;
400 return 0; 400 return 0;
401 } 401 }
402 } 402 }
403 seq_set_overflow(m); 403 seq_set_overflow(m);
404 return -1; 404 return -1;
405 } 405 }
406 EXPORT_SYMBOL(seq_vprintf); 406 EXPORT_SYMBOL(seq_vprintf);
407 407
408 int seq_printf(struct seq_file *m, const char *f, ...) 408 int seq_printf(struct seq_file *m, const char *f, ...)
409 { 409 {
410 int ret; 410 int ret;
411 va_list args; 411 va_list args;
412 412
413 va_start(args, f); 413 va_start(args, f);
414 ret = seq_vprintf(m, f, args); 414 ret = seq_vprintf(m, f, args);
415 va_end(args); 415 va_end(args);
416 416
417 return ret; 417 return ret;
418 } 418 }
419 EXPORT_SYMBOL(seq_printf); 419 EXPORT_SYMBOL(seq_printf);
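A typical ->show() method builds its record with seq_printf(); on overflow it returns -1, seq_read() notices via seq_overflow(), doubles the buffer and retries the record. A sketch with hypothetical foo_* names:

    static int foo_show(struct seq_file *m, void *v)
    {
            struct foo_entry *e = v; /* hypothetical record type */

            seq_printf(m, "%s %u\n", e->name, e->count);
            return 0; /* overflow is handled by seq_read()'s retry loop */
    }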
420 420
421 /** 421 /**
422 * mangle_path - mangle and copy path to buffer beginning 422 * mangle_path - mangle and copy path to buffer beginning
423 * @s: buffer start 423 * @s: buffer start
424 * @p: beginning of path in above buffer 424 * @p: beginning of path in above buffer
425 * @esc: set of characters that need escaping 425 * @esc: set of characters that need escaping
426 * 426 *
427 * Copy the path from @p to @s, replacing each occurrence of character from 427 * Copy the path from @p to @s, replacing each occurrence of character from
428 * @esc with usual octal escape. 428 * @esc with usual octal escape.
429 * Returns pointer past last written character in @s, or NULL in case of 429 * Returns pointer past last written character in @s, or NULL in case of
430 * failure. 430 * failure.
431 */ 431 */
432 char *mangle_path(char *s, const char *p, const char *esc) 432 char *mangle_path(char *s, const char *p, const char *esc)
433 { 433 {
434 while (s <= p) { 434 while (s <= p) {
435 char c = *p++; 435 char c = *p++;
436 if (!c) { 436 if (!c) {
437 return s; 437 return s;
438 } else if (!strchr(esc, c)) { 438 } else if (!strchr(esc, c)) {
439 *s++ = c; 439 *s++ = c;
440 } else if (s + 4 > p) { 440 } else if (s + 4 > p) {
441 break; 441 break;
442 } else { 442 } else {
443 *s++ = '\\'; 443 *s++ = '\\';
444 *s++ = '0' + ((c & 0300) >> 6); 444 *s++ = '0' + ((c & 0300) >> 6);
445 *s++ = '0' + ((c & 070) >> 3); 445 *s++ = '0' + ((c & 070) >> 3);
446 *s++ = '0' + (c & 07); 446 *s++ = '0' + (c & 07);
447 } 447 }
448 } 448 }
449 return NULL; 449 return NULL;
450 } 450 }
451 EXPORT_SYMBOL(mangle_path); 451 EXPORT_SYMBOL(mangle_path);
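mangle_path() relies on the d_path() convention of composing the name at the *end* of the caller's buffer: it copies the string forward to the buffer start, and the s + 4 > p check guarantees the growing escaped copy never overtakes the source. A contrived sketch:

    char buf[64];
    char *p = buf + 40;                         /* pretend d_path() returned this */
    strcpy(p, "/tmp/a b");
    char *end = mangle_path(buf, p, " \t\n\\");
    /* buf now begins with "/tmp/a\040b"; end points just past the 'b',
       so the caller takes end - buf as the length */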
452 452
453 /** 453 /**
454 * seq_path - seq_file interface to print a pathname 454 * seq_path - seq_file interface to print a pathname
455 * @m: the seq_file handle 455 * @m: the seq_file handle
456 * @path: the struct path to print 456 * @path: the struct path to print
457 * @esc: set of characters to escape in the output 457 * @esc: set of characters to escape in the output
458 * 458 *
459 * Returns the absolute path of 'path', as represented by the 459 * Returns the absolute path of 'path', as represented by the
460 * dentry / mnt pair in the path parameter. 460 * dentry / mnt pair in the path parameter.
461 */ 461 */
462 int seq_path(struct seq_file *m, const struct path *path, const char *esc) 462 int seq_path(struct seq_file *m, const struct path *path, const char *esc)
463 { 463 {
464 char *buf; 464 char *buf;
465 size_t size = seq_get_buf(m, &buf); 465 size_t size = seq_get_buf(m, &buf);
466 int res = -1; 466 int res = -1;
467 467
468 if (size) { 468 if (size) {
469 char *p = d_path(path, buf, size); 469 char *p = d_path(path, buf, size);
470 if (!IS_ERR(p)) { 470 if (!IS_ERR(p)) {
471 char *end = mangle_path(buf, p, esc); 471 char *end = mangle_path(buf, p, esc);
472 if (end) 472 if (end)
473 res = end - buf; 473 res = end - buf;
474 } 474 }
475 } 475 }
476 seq_commit(m, res); 476 seq_commit(m, res);
477 477
478 return res; 478 return res;
479 } 479 }
480 EXPORT_SYMBOL(seq_path); 480 EXPORT_SYMBOL(seq_path);
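So a ->show() method that wants to print the pathname of an open file, escaping whitespace, might do (hypothetical fragment):

    seq_path(m, &file->f_path, " \t\n\\");
    seq_putc(m, '\n');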
481 481
482 /* 482 /*
483 * Same as seq_path, but relative to supplied root. 483 * Same as seq_path, but relative to supplied root.
484 */ 484 */
485 int seq_path_root(struct seq_file *m, const struct path *path, 485 int seq_path_root(struct seq_file *m, const struct path *path,
486 const struct path *root, const char *esc) 486 const struct path *root, const char *esc)
487 { 487 {
488 char *buf; 488 char *buf;
489 size_t size = seq_get_buf(m, &buf); 489 size_t size = seq_get_buf(m, &buf);
490 int res = -ENAMETOOLONG; 490 int res = -ENAMETOOLONG;
491 491
492 if (size) { 492 if (size) {
493 char *p; 493 char *p;
494 494
495 p = __d_path(path, root, buf, size); 495 p = __d_path(path, root, buf, size);
496 if (!p) 496 if (!p)
497 return SEQ_SKIP; 497 return SEQ_SKIP;
498 res = PTR_ERR(p); 498 res = PTR_ERR(p);
499 if (!IS_ERR(p)) { 499 if (!IS_ERR(p)) {
500 char *end = mangle_path(buf, p, esc); 500 char *end = mangle_path(buf, p, esc);
501 if (end) 501 if (end)
502 res = end - buf; 502 res = end - buf;
503 else 503 else
504 res = -ENAMETOOLONG; 504 res = -ENAMETOOLONG;
505 } 505 }
506 } 506 }
507 seq_commit(m, res); 507 seq_commit(m, res);
508 508
509 return res < 0 && res != -ENAMETOOLONG ? res : 0; 509 return res < 0 && res != -ENAMETOOLONG ? res : 0;
510 } 510 }
511 511
512 /* 512 /*
513 * returns the path of the 'dentry' from the root of its filesystem. 513 * returns the path of the 'dentry' from the root of its filesystem.
514 */ 514 */
515 int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) 515 int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
516 { 516 {
517 char *buf; 517 char *buf;
518 size_t size = seq_get_buf(m, &buf); 518 size_t size = seq_get_buf(m, &buf);
519 int res = -1; 519 int res = -1;
520 520
521 if (size) { 521 if (size) {
522 char *p = dentry_path(dentry, buf, size); 522 char *p = dentry_path(dentry, buf, size);
523 if (!IS_ERR(p)) { 523 if (!IS_ERR(p)) {
524 char *end = mangle_path(buf, p, esc); 524 char *end = mangle_path(buf, p, esc);
525 if (end) 525 if (end)
526 res = end - buf; 526 res = end - buf;
527 } 527 }
528 } 528 }
529 seq_commit(m, res); 529 seq_commit(m, res);
530 530
531 return res; 531 return res;
532 } 532 }
533 533
534 int seq_bitmap(struct seq_file *m, const unsigned long *bits, 534 int seq_bitmap(struct seq_file *m, const unsigned long *bits,
535 unsigned int nr_bits) 535 unsigned int nr_bits)
536 { 536 {
537 if (m->count < m->size) { 537 if (m->count < m->size) {
538 int len = bitmap_scnprintf(m->buf + m->count, 538 int len = bitmap_scnprintf(m->buf + m->count,
539 m->size - m->count, bits, nr_bits); 539 m->size - m->count, bits, nr_bits);
540 if (m->count + len < m->size) { 540 if (m->count + len < m->size) {
541 m->count += len; 541 m->count += len;
542 return 0; 542 return 0;
543 } 543 }
544 } 544 }
545 seq_set_overflow(m); 545 seq_set_overflow(m);
546 return -1; 546 return -1;
547 } 547 }
548 EXPORT_SYMBOL(seq_bitmap); 548 EXPORT_SYMBOL(seq_bitmap);
549 549
550 int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, 550 int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
551 unsigned int nr_bits) 551 unsigned int nr_bits)
552 { 552 {
553 if (m->count < m->size) { 553 if (m->count < m->size) {
554 int len = bitmap_scnlistprintf(m->buf + m->count, 554 int len = bitmap_scnlistprintf(m->buf + m->count,
555 m->size - m->count, bits, nr_bits); 555 m->size - m->count, bits, nr_bits);
556 if (m->count + len < m->size) { 556 if (m->count + len < m->size) {
557 m->count += len; 557 m->count += len;
558 return 0; 558 return 0;
559 } 559 }
560 } 560 }
561 seq_set_overflow(m); 561 seq_set_overflow(m);
562 return -1; 562 return -1;
563 } 563 }
564 EXPORT_SYMBOL(seq_bitmap_list); 564 EXPORT_SYMBOL(seq_bitmap_list);
565 565
566 static void *single_start(struct seq_file *p, loff_t *pos) 566 static void *single_start(struct seq_file *p, loff_t *pos)
567 { 567 {
568 return NULL + (*pos == 0); 568 return NULL + (*pos == 0);
569 } 569 }
570 570
571 static void *single_next(struct seq_file *p, void *v, loff_t *pos) 571 static void *single_next(struct seq_file *p, void *v, loff_t *pos)
572 { 572 {
573 ++*pos; 573 ++*pos;
574 return NULL; 574 return NULL;
575 } 575 }
576 576
577 static void single_stop(struct seq_file *p, void *v) 577 static void single_stop(struct seq_file *p, void *v)
578 { 578 {
579 } 579 }
580 580
581 int single_open(struct file *file, int (*show)(struct seq_file *, void *), 581 int single_open(struct file *file, int (*show)(struct seq_file *, void *),
582 void *data) 582 void *data)
583 { 583 {
584 struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); 584 struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
585 int res = -ENOMEM; 585 int res = -ENOMEM;
586 586
587 if (op) { 587 if (op) {
588 op->start = single_start; 588 op->start = single_start;
589 op->next = single_next; 589 op->next = single_next;
590 op->stop = single_stop; 590 op->stop = single_stop;
591 op->show = show; 591 op->show = show;
592 res = seq_open(file, op); 592 res = seq_open(file, op);
593 if (!res) 593 if (!res)
594 ((struct seq_file *)file->private_data)->private = data; 594 ((struct seq_file *)file->private_data)->private = data;
595 else 595 else
596 kfree(op); 596 kfree(op);
597 } 597 }
598 return res; 598 return res;
599 } 599 }
600 EXPORT_SYMBOL(single_open); 600 EXPORT_SYMBOL(single_open);
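single_open() is the degenerate case: one show() call produces the entire file, since single_start() returns a non-NULL token only for pos 0. The canonical pairing, with hypothetical foo_* names:

    static int foo_show(struct seq_file *m, void *v)
    {
            seq_puts(m, "hello\n");
            return 0;
    }

    static int foo_open(struct inode *inode, struct file *file)
    {
            return single_open(file, foo_show, NULL);
    }

    static const struct file_operations foo_fops = {
            .owner   = THIS_MODULE,
            .open    = foo_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release, /* frees the ops single_open() allocated */
    };

Such a file would typically be registered with something like proc_create("foo", 0444, NULL, &foo_fops).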
601 601
602 int single_release(struct inode *inode, struct file *file) 602 int single_release(struct inode *inode, struct file *file)
603 { 603 {
604 const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; 604 const struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
605 int res = seq_release(inode, file); 605 int res = seq_release(inode, file);
606 kfree(op); 606 kfree(op);
607 return res; 607 return res;
608 } 608 }
609 EXPORT_SYMBOL(single_release); 609 EXPORT_SYMBOL(single_release);
610 610
611 int seq_release_private(struct inode *inode, struct file *file) 611 int seq_release_private(struct inode *inode, struct file *file)
612 { 612 {
613 struct seq_file *seq = file->private_data; 613 struct seq_file *seq = file->private_data;
614 614
615 kfree(seq->private); 615 kfree(seq->private);
616 seq->private = NULL; 616 seq->private = NULL;
617 return seq_release(inode, file); 617 return seq_release(inode, file);
618 } 618 }
619 EXPORT_SYMBOL(seq_release_private); 619 EXPORT_SYMBOL(seq_release_private);
620 620
621 void *__seq_open_private(struct file *f, const struct seq_operations *ops, 621 void *__seq_open_private(struct file *f, const struct seq_operations *ops,
622 int psize) 622 int psize)
623 { 623 {
624 int rc; 624 int rc;
625 void *private; 625 void *private;
626 struct seq_file *seq; 626 struct seq_file *seq;
627 627
628 private = kzalloc(psize, GFP_KERNEL); 628 private = kzalloc(psize, GFP_KERNEL);
629 if (private == NULL) 629 if (private == NULL)
630 goto out; 630 goto out;
631 631
632 rc = seq_open(f, ops); 632 rc = seq_open(f, ops);
633 if (rc < 0) 633 if (rc < 0)
634 goto out_free; 634 goto out_free;
635 635
636 seq = f->private_data; 636 seq = f->private_data;
637 seq->private = private; 637 seq->private = private;
638 return private; 638 return private;
639 639
640 out_free: 640 out_free:
641 kfree(private); 641 kfree(private);
642 out: 642 out:
643 return NULL; 643 return NULL;
644 } 644 }
645 EXPORT_SYMBOL(__seq_open_private); 645 EXPORT_SYMBOL(__seq_open_private);
646 646
647 int seq_open_private(struct file *filp, const struct seq_operations *ops, 647 int seq_open_private(struct file *filp, const struct seq_operations *ops,
648 int psize) 648 int psize)
649 { 649 {
650 return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; 650 return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM;
651 } 651 }
652 EXPORT_SYMBOL(seq_open_private); 652 EXPORT_SYMBOL(seq_open_private);
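seq_open_private() rolls the common "iterator needs a scratch structure" pattern into one call: the zeroed allocation lands in seq->private and is freed again by seq_release_private(). A sketch, with struct foo_iter hypothetical:

    static int foo_open(struct inode *inode, struct file *file)
    {
            /* kzalloc()s a struct foo_iter and parks it in seq->private */
            return seq_open_private(file, &foo_seq_ops,
                                    sizeof(struct foo_iter));
    }
    /* pair with .release = seq_release_private in the file_operations */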
653 653
654 int seq_putc(struct seq_file *m, char c) 654 int seq_putc(struct seq_file *m, char c)
655 { 655 {
656 if (m->count < m->size) { 656 if (m->count < m->size) {
657 m->buf[m->count++] = c; 657 m->buf[m->count++] = c;
658 return 0; 658 return 0;
659 } 659 }
660 return -1; 660 return -1;
661 } 661 }
662 EXPORT_SYMBOL(seq_putc); 662 EXPORT_SYMBOL(seq_putc);
663 663
664 int seq_puts(struct seq_file *m, const char *s) 664 int seq_puts(struct seq_file *m, const char *s)
665 { 665 {
666 int len = strlen(s); 666 int len = strlen(s);
667 if (m->count + len < m->size) { 667 if (m->count + len < m->size) {
668 memcpy(m->buf + m->count, s, len); 668 memcpy(m->buf + m->count, s, len);
669 m->count += len; 669 m->count += len;
670 return 0; 670 return 0;
671 } 671 }
672 seq_set_overflow(m); 672 seq_set_overflow(m);
673 return -1; 673 return -1;
674 } 674 }
675 EXPORT_SYMBOL(seq_puts); 675 EXPORT_SYMBOL(seq_puts);
676 676
677 /* 677 /*
678 * A helper routine for putting decimal numbers without the rich format of printf(). 678 * A helper routine for putting decimal numbers without the rich format of printf().
679 * Only 'unsigned long long' is supported. 679 * Only 'unsigned long long' is supported.
680 * This routine will put one byte delimiter + number into seq_file. 680 * This routine will put one byte delimiter + number into seq_file.
681 * This routine is very quick when you show lots of numbers. 681 * This routine is very quick when you show lots of numbers.
682 * In usual cases, it will be better to use seq_printf(). It's easier to read. 682 * In usual cases, it will be better to use seq_printf(). It's easier to read.
683 */ 683 */
684 int seq_put_decimal_ull(struct seq_file *m, char delimiter, 684 int seq_put_decimal_ull(struct seq_file *m, char delimiter,
685 unsigned long long num) 685 unsigned long long num)
686 { 686 {
687 int len; 687 int len;
688 688
689 if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ 689 if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
690 goto overflow; 690 goto overflow;
691 691
692 if (delimiter) 692 if (delimiter)
693 m->buf[m->count++] = delimiter; 693 m->buf[m->count++] = delimiter;
694 694
695 if (num < 10) { 695 if (num < 10) {
696 m->buf[m->count++] = num + '0'; 696 m->buf[m->count++] = num + '0';
697 return 0; 697 return 0;
698 } 698 }
699 699
700 len = num_to_str(m->buf + m->count, m->size - m->count, num); 700 len = num_to_str(m->buf + m->count, m->size - m->count, num);
701 if (!len) 701 if (!len)
702 goto overflow; 702 goto overflow;
703 m->count += len; 703 m->count += len;
704 return 0; 704 return 0;
705 overflow: 705 overflow:
706 seq_set_overflow(m); 706 seq_set_overflow(m);
707 return -1; 707 return -1;
708 } 708 }
709 EXPORT_SYMBOL(seq_put_decimal_ull); 709 EXPORT_SYMBOL(seq_put_decimal_ull);
710 710
711 int seq_put_decimal_ll(struct seq_file *m, char delimiter, 711 int seq_put_decimal_ll(struct seq_file *m, char delimiter,
712 long long num) 712 long long num)
713 { 713 {
714 if (num < 0) { 714 if (num < 0) {
715 if (m->count + 3 >= m->size) { 715 if (m->count + 3 >= m->size) {
716 seq_set_overflow(m); 716 seq_set_overflow(m);
717 return -1; 717 return -1;
718 } 718 }
719 if (delimiter) 719 if (delimiter)
720 m->buf[m->count++] = delimiter; 720 m->buf[m->count++] = delimiter;
721 num = -num; 721 num = -num;
722 delimiter = '-'; 722 delimiter = '-';
723 } 723 }
724 return seq_put_decimal_ull(m, delimiter, num); 724 return seq_put_decimal_ull(m, delimiter, num);
725 725
726 } 726 }
727 EXPORT_SYMBOL(seq_put_decimal_ll); 727 EXPORT_SYMBOL(seq_put_decimal_ll);
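These two are the fast path for hot files such as /proc/<pid>/stat, where a single line carries dozens of numbers. A hedged sketch of that style, with hypothetical counter variables:

    /* emit space-separated fields without going through vsnprintf() */
    seq_put_decimal_ull(m, ' ', min_flt);
    seq_put_decimal_ull(m, ' ', maj_flt);
    seq_put_decimal_ll(m, ' ', nice);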
728 728
729 /** 729 /**
730 * seq_write - write arbitrary data to buffer 730 * seq_write - write arbitrary data to buffer
731 * @seq: seq_file identifying the buffer to which data should be written 731 * @seq: seq_file identifying the buffer to which data should be written
732 * @data: data address 732 * @data: data address
733 * @len: number of bytes 733 * @len: number of bytes
734 * 734 *
735 * Return 0 on success, non-zero otherwise. 735 * Return 0 on success, non-zero otherwise.
736 */ 736 */
737 int seq_write(struct seq_file *seq, const void *data, size_t len) 737 int seq_write(struct seq_file *seq, const void *data, size_t len)
738 { 738 {
739 if (seq->count + len < seq->size) { 739 if (seq->count + len < seq->size) {
740 memcpy(seq->buf + seq->count, data, len); 740 memcpy(seq->buf + seq->count, data, len);
741 seq->count += len; 741 seq->count += len;
742 return 0; 742 return 0;
743 } 743 }
744 seq_set_overflow(seq); 744 seq_set_overflow(seq);
745 return -1; 745 return -1;
746 } 746 }
747 EXPORT_SYMBOL(seq_write); 747 EXPORT_SYMBOL(seq_write);
748 748
749 struct list_head *seq_list_start(struct list_head *head, loff_t pos) 749 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
750 { 750 {
751 struct list_head *lh; 751 struct list_head *lh;
752 752
753 list_for_each(lh, head) 753 list_for_each(lh, head)
754 if (pos-- == 0) 754 if (pos-- == 0)
755 return lh; 755 return lh;
756 756
757 return NULL; 757 return NULL;
758 } 758 }
759 EXPORT_SYMBOL(seq_list_start); 759 EXPORT_SYMBOL(seq_list_start);
760 760
761 struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) 761 struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
762 { 762 {
763 if (!pos) 763 if (!pos)
764 return head; 764 return head;
765 765
766 return seq_list_start(head, pos - 1); 766 return seq_list_start(head, pos - 1);
767 } 767 }
768 EXPORT_SYMBOL(seq_list_start_head); 768 EXPORT_SYMBOL(seq_list_start_head);
769 769
770 struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) 770 struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
771 { 771 {
772 struct list_head *lh; 772 struct list_head *lh;
773 773
774 lh = ((struct list_head *)v)->next; 774 lh = ((struct list_head *)v)->next;
775 ++*ppos; 775 ++*ppos;
776 return lh == head ? NULL : lh; 776 return lh == head ? NULL : lh;
777 } 777 }
778 EXPORT_SYMBOL(seq_list_next); 778 EXPORT_SYMBOL(seq_list_next);
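The seq_list_* helpers make a plain list_head-backed table trivially exportable; start() re-walks the list to *pos on every cycle, which is O(n) but correct as long as a lock is held from start() to stop(). A sketch, assuming hypothetical foo_list/foo_lock:

    static void *foo_start(struct seq_file *m, loff_t *pos)
    {
            mutex_lock(&foo_lock); /* held for the whole start..stop cycle */
            return seq_list_start(&foo_list, *pos);
    }

    static void *foo_next(struct seq_file *m, void *v, loff_t *pos)
    {
            return seq_list_next(v, &foo_list, pos);
    }

    static void foo_stop(struct seq_file *m, void *v)
    {
            mutex_unlock(&foo_lock);
    }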
779 779
780 /** 780 /**
781 * seq_hlist_start - start an iteration of a hlist 781 * seq_hlist_start - start an iteration of a hlist
782 * @head: the head of the hlist 782 * @head: the head of the hlist
783 * @pos: the start position of the sequence 783 * @pos: the start position of the sequence
784 * 784 *
785 * Called at seq_file->op->start(). 785 * Called at seq_file->op->start().
786 */ 786 */
787 struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) 787 struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
788 { 788 {
789 struct hlist_node *node; 789 struct hlist_node *node;
790 790
791 hlist_for_each(node, head) 791 hlist_for_each(node, head)
792 if (pos-- == 0) 792 if (pos-- == 0)
793 return node; 793 return node;
794 return NULL; 794 return NULL;
795 } 795 }
796 EXPORT_SYMBOL(seq_hlist_start); 796 EXPORT_SYMBOL(seq_hlist_start);
797 797
798 /** 798 /**
799 * seq_hlist_start_head - start an iteration of a hlist 799 * seq_hlist_start_head - start an iteration of a hlist
800 * @head: the head of the hlist 800 * @head: the head of the hlist
801 * @pos: the start position of the sequence 801 * @pos: the start position of the sequence
802 * 802 *
803 * Called at seq_file->op->start(). Call this function if you want to 803 * Called at seq_file->op->start(). Call this function if you want to
804 * print a header at the top of the output. 804 * print a header at the top of the output.
805 */ 805 */
806 struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) 806 struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
807 { 807 {
808 if (!pos) 808 if (!pos)
809 return SEQ_START_TOKEN; 809 return SEQ_START_TOKEN;
810 810
811 return seq_hlist_start(head, pos - 1); 811 return seq_hlist_start(head, pos - 1);
812 } 812 }
813 EXPORT_SYMBOL(seq_hlist_start_head); 813 EXPORT_SYMBOL(seq_hlist_start_head);
814 814
815 /** 815 /**
816 * seq_hlist_next - move to the next position of the hlist 816 * seq_hlist_next - move to the next position of the hlist
817 * @v: the current iterator 817 * @v: the current iterator
818 * @head: the head of the hlist 818 * @head: the head of the hlist
819 * @ppos: the current position 819 * @ppos: the current position
820 * 820 *
821 * Called at seq_file->op->next(). 821 * Called at seq_file->op->next().
822 */ 822 */
823 struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, 823 struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
824 loff_t *ppos) 824 loff_t *ppos)
825 { 825 {
826 struct hlist_node *node = v; 826 struct hlist_node *node = v;
827 827
828 ++*ppos; 828 ++*ppos;
829 if (v == SEQ_START_TOKEN) 829 if (v == SEQ_START_TOKEN)
830 return head->first; 830 return head->first;
831 else 831 else
832 return node->next; 832 return node->next;
833 } 833 }
834 EXPORT_SYMBOL(seq_hlist_next); 834 EXPORT_SYMBOL(seq_hlist_next);
835 835
836 /** 836 /**
837 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU 837 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
838 * @head: the head of the hlist 838 * @head: the head of the hlist
839 * @pos: the start position of the sequence 839 * @pos: the start position of the sequence
840 * 840 *
841 * Called at seq_file->op->start(). 841 * Called at seq_file->op->start().
842 * 842 *
843 * This list-traversal primitive may safely run concurrently with 843 * This list-traversal primitive may safely run concurrently with
844 * the _rcu list-mutation primitives such as hlist_add_head_rcu() 844 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
845 * as long as the traversal is guarded by rcu_read_lock(). 845 * as long as the traversal is guarded by rcu_read_lock().
846 */ 846 */
847 struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, 847 struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
848 loff_t pos) 848 loff_t pos)
849 { 849 {
850 struct hlist_node *node; 850 struct hlist_node *node;
851 851
852 __hlist_for_each_rcu(node, head) 852 __hlist_for_each_rcu(node, head)
853 if (pos-- == 0) 853 if (pos-- == 0)
854 return node; 854 return node;
855 return NULL; 855 return NULL;
856 } 856 }
857 EXPORT_SYMBOL(seq_hlist_start_rcu); 857 EXPORT_SYMBOL(seq_hlist_start_rcu);
858 858
859 /** 859 /**
860 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU 860 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
861 * @head: the head of the hlist 861 * @head: the head of the hlist
862 * @pos: the start position of the sequence 862 * @pos: the start position of the sequence
863 * 863 *
864 * Called at seq_file->op->start(). Call this function if you want to 864 * Called at seq_file->op->start(). Call this function if you want to
865 * print a header at the top of the output. 865 * print a header at the top of the output.
866 * 866 *
867 * This list-traversal primitive may safely run concurrently with 867 * This list-traversal primitive may safely run concurrently with
868 * the _rcu list-mutation primitives such as hlist_add_head_rcu() 868 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
869 * as long as the traversal is guarded by rcu_read_lock(). 869 * as long as the traversal is guarded by rcu_read_lock().
870 */ 870 */
871 struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, 871 struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
872 loff_t pos) 872 loff_t pos)
873 { 873 {
874 if (!pos) 874 if (!pos)
875 return SEQ_START_TOKEN; 875 return SEQ_START_TOKEN;
876 876
877 return seq_hlist_start_rcu(head, pos - 1); 877 return seq_hlist_start_rcu(head, pos - 1);
878 } 878 }
879 EXPORT_SYMBOL(seq_hlist_start_head_rcu); 879 EXPORT_SYMBOL(seq_hlist_start_head_rcu);
880 880
881 /** 881 /**
882 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU 882 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
883 * @v: the current iterator 883 * @v: the current iterator
884 * @head: the head of the hlist 884 * @head: the head of the hlist
885 * @ppos: the current position 885 * @ppos: the current position
886 * 886 *
887 * Called at seq_file->op->next(). 887 * Called at seq_file->op->next().
888 * 888 *
889 * This list-traversal primitive may safely run concurrently with 889 * This list-traversal primitive may safely run concurrently with
890 * the _rcu list-mutation primitives such as hlist_add_head_rcu() 890 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
891 * as long as the traversal is guarded by rcu_read_lock(). 891 * as long as the traversal is guarded by rcu_read_lock().
892 */ 892 */
893 struct hlist_node *seq_hlist_next_rcu(void *v, 893 struct hlist_node *seq_hlist_next_rcu(void *v,
894 struct hlist_head *head, 894 struct hlist_head *head,
895 loff_t *ppos) 895 loff_t *ppos)
896 { 896 {
897 struct hlist_node *node = v; 897 struct hlist_node *node = v;
898 898
899 ++*ppos; 899 ++*ppos;
900 if (v == SEQ_START_TOKEN) 900 if (v == SEQ_START_TOKEN)
901 return rcu_dereference(head->first); 901 return rcu_dereference(head->first);
902 else 902 else
903 return rcu_dereference(node->next); 903 return rcu_dereference(node->next);
904 } 904 }
905 EXPORT_SYMBOL(seq_hlist_next_rcu); 905 EXPORT_SYMBOL(seq_hlist_next_rcu);
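The RCU variants follow the same shape, with rcu_read_lock() taken in start() and dropped in stop(); the _head variant's SEQ_START_TOKEN lets show() emit a header line first. A sketch over a hypothetical foo_hash hlist:

    static void *foo_start(struct seq_file *m, loff_t *pos)
    {
            rcu_read_lock();
            return seq_hlist_start_head_rcu(&foo_hash, *pos);
    }

    static void *foo_next(struct seq_file *m, void *v, loff_t *pos)
    {
            return seq_hlist_next_rcu(v, &foo_hash, pos);
    }

    static void foo_stop(struct seq_file *m, void *v)
    {
            rcu_read_unlock();
    }

    static int foo_show(struct seq_file *m, void *v)
    {
            if (v == SEQ_START_TOKEN) {
                    seq_puts(m, "name count\n");
                    return 0;
            }
            /* otherwise v is the hlist_node embedded in a real record */
            return 0;
    }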
906 906
1 /* This file is part of UBIFS. 1 /* This file is part of UBIFS.
2 * 2 *
3 * Copyright (C) 2006-2008 Nokia Corporation. 3 * Copyright (C) 2006-2008 Nokia Corporation.
4 * Copyright (C) 2006, 2007 University of Szeged, Hungary 4 * Copyright (C) 2006, 2007 University of Szeged, Hungary
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by 7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation. 8 * the Free Software Foundation.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT 10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details. 13 * more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License along with 15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51 16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 * 18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём) 19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter 20 * Adrian Hunter
21 * Zoltan Sogor 21 * Zoltan Sogor
22 */ 22 */
23 23
24 /* 24 /*
25 * This file implements directory operations. 25 * This file implements directory operations.
26 * 26 *
27 * All FS operations in this file allocate budget before writing anything to the 27 * All FS operations in this file allocate budget before writing anything to the
28 * media. If they fail to allocate it, the error is returned. The only 28 * media. If they fail to allocate it, the error is returned. The only
29 * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()', which keep working even 29 * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()', which keep working even
30 * if they are unable to allocate the budget, because an %-ENOSPC failure on 30 * if they are unable to allocate the budget, because an %-ENOSPC failure on
31 * deletion is not what users usually expect. The UBIFS budgeting subsystem has some 31 * deletion is not what users usually expect. The UBIFS budgeting subsystem has some
32 * space reserved for these purposes. 32 * space reserved for these purposes.
33 * 33 *
34 * All operations in this file write all inodes which they change straight 34 * All operations in this file write all inodes which they change straight
35 * away, instead of marking them dirty. For example, 'ubifs_link()' changes 35 * away, instead of marking them dirty. For example, 'ubifs_link()' changes
36 * @i_size of the parent inode and writes the parent inode together with the 36 * @i_size of the parent inode and writes the parent inode together with the
37 * target inode. This was done to simplify file-system recovery which would 37 * target inode. This was done to simplify file-system recovery which would
38 * otherwise be very difficult to do. The only exception is rename, which marks 38 * otherwise be very difficult to do. The only exception is rename, which marks
39 * the re-named inode dirty (because its @i_ctime is updated) but does not 39 * the re-named inode dirty (because its @i_ctime is updated) but does not
40 * write it to the media. 40 * write it to the media.
41 */ 41 */
42 42
43 #include "ubifs.h" 43 #include "ubifs.h"
44 44
45 /** 45 /**
46 * inherit_flags - inherit flags of the parent inode. 46 * inherit_flags - inherit flags of the parent inode.
47 * @dir: parent inode 47 * @dir: parent inode
48 * @mode: new inode mode flags 48 * @mode: new inode mode flags
49 * 49 *
50 * This is a helper function for 'ubifs_new_inode()' which inherits flags of the 50 * This is a helper function for 'ubifs_new_inode()' which inherits flags of the
51 * parent directory inode @dir. UBIFS inodes inherit the following flags: 51 * parent directory inode @dir. UBIFS inodes inherit the following flags:
52 * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on a 52 * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on a
53 * sub-directory basis; 53 * sub-directory basis;
54 * o %UBIFS_SYNC_FL - useful for the same reasons; 54 * o %UBIFS_SYNC_FL - useful for the same reasons;
55 * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories. 55 * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories.
56 * 56 *
57 * This function returns the inherited flags. 57 * This function returns the inherited flags.
58 */ 58 */
59 static int inherit_flags(const struct inode *dir, umode_t mode) 59 static int inherit_flags(const struct inode *dir, umode_t mode)
60 { 60 {
61 int flags; 61 int flags;
62 const struct ubifs_inode *ui = ubifs_inode(dir); 62 const struct ubifs_inode *ui = ubifs_inode(dir);
63 63
64 if (!S_ISDIR(dir->i_mode)) 64 if (!S_ISDIR(dir->i_mode))
65 /* 65 /*
66 * The parent is not a directory, which means that an extended 66 * The parent is not a directory, which means that an extended
67 * attribute inode is being created. No flags. 67 * attribute inode is being created. No flags.
68 */ 68 */
69 return 0; 69 return 0;
70 70
71 flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL); 71 flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL);
72 if (!S_ISDIR(mode)) 72 if (!S_ISDIR(mode))
73 /* The "DIRSYNC" flag only applies to directories */ 73 /* The "DIRSYNC" flag only applies to directories */
74 flags &= ~UBIFS_DIRSYNC_FL; 74 flags &= ~UBIFS_DIRSYNC_FL;
75 return flags; 75 return flags;
76 } 76 }
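For instance, a regular file created in a directory that carries both the compression and DIRSYNC flags inherits only the former (hypothetical illustration):

    /* dir's flags include UBIFS_COMPR_FL | UBIFS_DIRSYNC_FL */
    flags = inherit_flags(dir, S_IFREG | 0644);
    /* flags == UBIFS_COMPR_FL: DIRSYNC is masked off for non-directories */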
77 77
78 /** 78 /**
79 * ubifs_new_inode - allocate new UBIFS inode object. 79 * ubifs_new_inode - allocate new UBIFS inode object.
80 * @c: UBIFS file-system description object 80 * @c: UBIFS file-system description object
81 * @dir: parent directory inode 81 * @dir: parent directory inode
82 * @mode: inode mode flags 82 * @mode: inode mode flags
83 * 83 *
84 * This function finds an unused inode number, allocates a new inode and 84 * This function finds an unused inode number, allocates a new inode and
85 * initializes it. Returns the new inode in case of success and an error code in 85 * initializes it. Returns the new inode in case of success and an error code in
86 * case of failure. 86 * case of failure.
87 */ 87 */
88 struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, 88 struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
89 umode_t mode) 89 umode_t mode)
90 { 90 {
91 struct inode *inode; 91 struct inode *inode;
92 struct ubifs_inode *ui; 92 struct ubifs_inode *ui;
93 93
94 inode = new_inode(c->vfs_sb); 94 inode = new_inode(c->vfs_sb);
95 ui = ubifs_inode(inode); 95 ui = ubifs_inode(inode);
96 if (!inode) 96 if (!inode)
97 return ERR_PTR(-ENOMEM); 97 return ERR_PTR(-ENOMEM);
98 98
99 /* 99 /*
100 * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and 100 * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and
101 * marking them dirty in file write path (see 'file_update_time()'). 101 * marking them dirty in file write path (see 'file_update_time()').
102 * UBIFS has to fully control "clean <-> dirty" transitions of inodes 102 * UBIFS has to fully control "clean <-> dirty" transitions of inodes
103 * to make budgeting work. 103 * to make budgeting work.
104 */ 104 */
105 inode->i_flags |= S_NOCMTIME; 105 inode->i_flags |= S_NOCMTIME;
106 106
107 inode_init_owner(inode, dir, mode); 107 inode_init_owner(inode, dir, mode);
108 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
109 ubifs_current_time(inode); 109 ubifs_current_time(inode);
110 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
111 /* Disable readahead */ 111 /* Disable readahead */
112 inode->i_mapping->backing_dev_info = &c->bdi; 112 inode->i_mapping->backing_dev_info = &c->bdi;
113 113
114 switch (mode & S_IFMT) { 114 switch (mode & S_IFMT) {
115 case S_IFREG: 115 case S_IFREG:
116 inode->i_mapping->a_ops = &ubifs_file_address_operations; 116 inode->i_mapping->a_ops = &ubifs_file_address_operations;
117 inode->i_op = &ubifs_file_inode_operations; 117 inode->i_op = &ubifs_file_inode_operations;
118 inode->i_fop = &ubifs_file_operations; 118 inode->i_fop = &ubifs_file_operations;
119 break; 119 break;
120 case S_IFDIR: 120 case S_IFDIR:
121 inode->i_op = &ubifs_dir_inode_operations; 121 inode->i_op = &ubifs_dir_inode_operations;
122 inode->i_fop = &ubifs_dir_operations; 122 inode->i_fop = &ubifs_dir_operations;
123 inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ; 123 inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
124 break; 124 break;
125 case S_IFLNK: 125 case S_IFLNK:
126 inode->i_op = &ubifs_symlink_inode_operations; 126 inode->i_op = &ubifs_symlink_inode_operations;
127 break; 127 break;
128 case S_IFSOCK: 128 case S_IFSOCK:
129 case S_IFIFO: 129 case S_IFIFO:
130 case S_IFBLK: 130 case S_IFBLK:
131 case S_IFCHR: 131 case S_IFCHR:
132 inode->i_op = &ubifs_file_inode_operations; 132 inode->i_op = &ubifs_file_inode_operations;
133 break; 133 break;
134 default: 134 default:
135 BUG(); 135 BUG();
136 } 136 }
137 137
138 ui->flags = inherit_flags(dir, mode); 138 ui->flags = inherit_flags(dir, mode);
139 ubifs_set_inode_flags(inode); 139 ubifs_set_inode_flags(inode);
140 if (S_ISREG(mode)) 140 if (S_ISREG(mode))
141 ui->compr_type = c->default_compr; 141 ui->compr_type = c->default_compr;
142 else 142 else
143 ui->compr_type = UBIFS_COMPR_NONE; 143 ui->compr_type = UBIFS_COMPR_NONE;
144 ui->synced_i_size = 0; 144 ui->synced_i_size = 0;
145 145
146 spin_lock(&c->cnt_lock); 146 spin_lock(&c->cnt_lock);
147 /* Inode number overflow is currently not supported */ 147 /* Inode number overflow is currently not supported */
148 if (c->highest_inum >= INUM_WARN_WATERMARK) { 148 if (c->highest_inum >= INUM_WARN_WATERMARK) {
149 if (c->highest_inum >= INUM_WATERMARK) { 149 if (c->highest_inum >= INUM_WATERMARK) {
150 spin_unlock(&c->cnt_lock); 150 spin_unlock(&c->cnt_lock);
151 ubifs_err("out of inode numbers"); 151 ubifs_err("out of inode numbers");
152 make_bad_inode(inode); 152 make_bad_inode(inode);
153 iput(inode); 153 iput(inode);
154 return ERR_PTR(-EINVAL); 154 return ERR_PTR(-EINVAL);
155 } 155 }
156 ubifs_warn("running out of inode numbers (current %lu, max %d)", 156 ubifs_warn("running out of inode numbers (current %lu, max %d)",
157 (unsigned long)c->highest_inum, INUM_WATERMARK); 157 (unsigned long)c->highest_inum, INUM_WATERMARK);
158 } 158 }
159 159
160 inode->i_ino = ++c->highest_inum; 160 inode->i_ino = ++c->highest_inum;
161 /* 161 /*
162 * The creation sequence number remains with this inode for its 162 * The creation sequence number remains with this inode for its
163 * lifetime. All nodes for this inode have a greater sequence number, 163 * lifetime. All nodes for this inode have a greater sequence number,
164 * and so it is possible to distinguish obsolete nodes belonging to a 164 * and so it is possible to distinguish obsolete nodes belonging to a
165 * previous incarnation of the same inode number - for example, for the 165 * previous incarnation of the same inode number - for example, for the
166 * purpose of rebuilding the index. 166 * purpose of rebuilding the index.
167 */ 167 */
168 ui->creat_sqnum = ++c->max_sqnum; 168 ui->creat_sqnum = ++c->max_sqnum;
169 spin_unlock(&c->cnt_lock); 169 spin_unlock(&c->cnt_lock);
170 return inode; 170 return inode;
171 } 171 }
172 172
173 static int dbg_check_name(const struct ubifs_info *c, 173 static int dbg_check_name(const struct ubifs_info *c,
174 const struct ubifs_dent_node *dent, 174 const struct ubifs_dent_node *dent,
175 const struct qstr *nm) 175 const struct qstr *nm)
176 { 176 {
177 if (!dbg_is_chk_gen(c)) 177 if (!dbg_is_chk_gen(c))
178 return 0; 178 return 0;
179 if (le16_to_cpu(dent->nlen) != nm->len) 179 if (le16_to_cpu(dent->nlen) != nm->len)
180 return -EINVAL; 180 return -EINVAL;
181 if (memcmp(dent->name, nm->name, nm->len)) 181 if (memcmp(dent->name, nm->name, nm->len))
182 return -EINVAL; 182 return -EINVAL;
183 return 0; 183 return 0;
184 } 184 }
185 185
186 static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, 186 static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
187 unsigned int flags) 187 unsigned int flags)
188 { 188 {
189 int err; 189 int err;
190 union ubifs_key key; 190 union ubifs_key key;
191 struct inode *inode = NULL; 191 struct inode *inode = NULL;
192 struct ubifs_dent_node *dent; 192 struct ubifs_dent_node *dent;
193 struct ubifs_info *c = dir->i_sb->s_fs_info; 193 struct ubifs_info *c = dir->i_sb->s_fs_info;
194 194
195 dbg_gen("'%.*s' in dir ino %lu", 195 dbg_gen("'%.*s' in dir ino %lu",
196 dentry->d_name.len, dentry->d_name.name, dir->i_ino); 196 dentry->d_name.len, dentry->d_name.name, dir->i_ino);
197 197
198 if (dentry->d_name.len > UBIFS_MAX_NLEN) 198 if (dentry->d_name.len > UBIFS_MAX_NLEN)
199 return ERR_PTR(-ENAMETOOLONG); 199 return ERR_PTR(-ENAMETOOLONG);
200 200
201 dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); 201 dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
202 if (!dent) 202 if (!dent)
203 return ERR_PTR(-ENOMEM); 203 return ERR_PTR(-ENOMEM);
204 204
205 dent_key_init(c, &key, dir->i_ino, &dentry->d_name); 205 dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
206 206
207 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); 207 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
208 if (err) { 208 if (err) {
209 if (err == -ENOENT) { 209 if (err == -ENOENT) {
210 dbg_gen("not found"); 210 dbg_gen("not found");
211 goto done; 211 goto done;
212 } 212 }
213 goto out; 213 goto out;
214 } 214 }
215 215
216 if (dbg_check_name(c, dent, &dentry->d_name)) { 216 if (dbg_check_name(c, dent, &dentry->d_name)) {
217 err = -EINVAL; 217 err = -EINVAL;
218 goto out; 218 goto out;
219 } 219 }
220 220
221 inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); 221 inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
222 if (IS_ERR(inode)) { 222 if (IS_ERR(inode)) {
223 /* 223 /*
224 * This should not happen. Probably the file-system needs 224 * This should not happen. Probably the file-system needs
225 * checking. 225 * checking.
226 */ 226 */
227 err = PTR_ERR(inode); 227 err = PTR_ERR(inode);
228 ubifs_err("dead directory entry '%.*s', error %d", 228 ubifs_err("dead directory entry '%.*s', error %d",
229 dentry->d_name.len, dentry->d_name.name, err); 229 dentry->d_name.len, dentry->d_name.name, err);
230 ubifs_ro_mode(c, err); 230 ubifs_ro_mode(c, err);
231 goto out; 231 goto out;
232 } 232 }
233 233
234 done: 234 done:
235 kfree(dent); 235 kfree(dent);
236 /* 236 /*
237 * Note, d_splice_alias() would be required instead if we supported 237 * Note, d_splice_alias() would be required instead if we supported
238 * NFS. 238 * NFS.
239 */ 239 */
240 d_add(dentry, inode); 240 d_add(dentry, inode);
241 return NULL; 241 return NULL;
242 242
243 out: 243 out:
244 kfree(dent); 244 kfree(dent);
245 return ERR_PTR(err); 245 return ERR_PTR(err);
246 } 246 }
247 247
248 static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 248 static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
249 bool excl) 249 bool excl)
250 { 250 {
251 struct inode *inode; 251 struct inode *inode;
252 struct ubifs_info *c = dir->i_sb->s_fs_info; 252 struct ubifs_info *c = dir->i_sb->s_fs_info;
253 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 253 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
254 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 254 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
255 .dirtied_ino = 1 }; 255 .dirtied_ino = 1 };
256 struct ubifs_inode *dir_ui = ubifs_inode(dir); 256 struct ubifs_inode *dir_ui = ubifs_inode(dir);
257 257
258 /* 258 /*
259 * Budget request settings: new inode, new direntry, changing the 259 * Budget request settings: new inode, new direntry, changing the
260 * parent directory inode. 260 * parent directory inode.
261 */ 261 */
262 262
263 dbg_gen("dent '%.*s', mode %#hx in dir ino %lu", 263 dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
264 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); 264 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
265 265
266 err = ubifs_budget_space(c, &req); 266 err = ubifs_budget_space(c, &req);
267 if (err) 267 if (err)
268 return err; 268 return err;
269 269
270 inode = ubifs_new_inode(c, dir, mode); 270 inode = ubifs_new_inode(c, dir, mode);
271 if (IS_ERR(inode)) { 271 if (IS_ERR(inode)) {
272 err = PTR_ERR(inode); 272 err = PTR_ERR(inode);
273 goto out_budg; 273 goto out_budg;
274 } 274 }
275 275
276 mutex_lock(&dir_ui->ui_mutex); 276 mutex_lock(&dir_ui->ui_mutex);
277 dir->i_size += sz_change; 277 dir->i_size += sz_change;
278 dir_ui->ui_size = dir->i_size; 278 dir_ui->ui_size = dir->i_size;
279 dir->i_mtime = dir->i_ctime = inode->i_ctime; 279 dir->i_mtime = dir->i_ctime = inode->i_ctime;
280 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 280 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
281 if (err) 281 if (err)
282 goto out_cancel; 282 goto out_cancel;
283 mutex_unlock(&dir_ui->ui_mutex); 283 mutex_unlock(&dir_ui->ui_mutex);
284 284
285 ubifs_release_budget(c, &req); 285 ubifs_release_budget(c, &req);
286 insert_inode_hash(inode); 286 insert_inode_hash(inode);
287 d_instantiate(dentry, inode); 287 d_instantiate(dentry, inode);
288 return 0; 288 return 0;
289 289
290 out_cancel: 290 out_cancel:
291 dir->i_size -= sz_change; 291 dir->i_size -= sz_change;
292 dir_ui->ui_size = dir->i_size; 292 dir_ui->ui_size = dir->i_size;
293 mutex_unlock(&dir_ui->ui_mutex); 293 mutex_unlock(&dir_ui->ui_mutex);
294 make_bad_inode(inode); 294 make_bad_inode(inode);
295 iput(inode); 295 iput(inode);
296 out_budg: 296 out_budg:
297 ubifs_release_budget(c, &req); 297 ubifs_release_budget(c, &req);
298 ubifs_err("cannot create regular file, error %d", err); 298 ubifs_err("cannot create regular file, error %d", err);
299 return err; 299 return err;
300 } 300 }
301 301
302 /** 302 /**
303 * vfs_dent_type - get VFS directory entry type. 303 * vfs_dent_type - get VFS directory entry type.
304 * @type: UBIFS directory entry type 304 * @type: UBIFS directory entry type
305 * 305 *
306 * This function converts UBIFS directory entry type into VFS directory entry 306 * This function converts UBIFS directory entry type into VFS directory entry
307 * type. 307 * type.
308 */ 308 */
309 static unsigned int vfs_dent_type(uint8_t type) 309 static unsigned int vfs_dent_type(uint8_t type)
310 { 310 {
311 switch (type) { 311 switch (type) {
312 case UBIFS_ITYPE_REG: 312 case UBIFS_ITYPE_REG:
313 return DT_REG; 313 return DT_REG;
314 case UBIFS_ITYPE_DIR: 314 case UBIFS_ITYPE_DIR:
315 return DT_DIR; 315 return DT_DIR;
316 case UBIFS_ITYPE_LNK: 316 case UBIFS_ITYPE_LNK:
317 return DT_LNK; 317 return DT_LNK;
318 case UBIFS_ITYPE_BLK: 318 case UBIFS_ITYPE_BLK:
319 return DT_BLK; 319 return DT_BLK;
320 case UBIFS_ITYPE_CHR: 320 case UBIFS_ITYPE_CHR:
321 return DT_CHR; 321 return DT_CHR;
322 case UBIFS_ITYPE_FIFO: 322 case UBIFS_ITYPE_FIFO:
323 return DT_FIFO; 323 return DT_FIFO;
324 case UBIFS_ITYPE_SOCK: 324 case UBIFS_ITYPE_SOCK:
325 return DT_SOCK; 325 return DT_SOCK;
326 default: 326 default:
327 BUG(); 327 BUG();
328 } 328 }
329 return 0; 329 return 0;
330 } 330 }
331 331
332 /* 332 /*
333 * The classical Unix view of a directory is that it is a linear array of 333 * The classical Unix view of a directory is that it is a linear array of
334 * (name, inode number) entries. Linux/VFS assumes this model as well. 334 * (name, inode number) entries. Linux/VFS assumes this model as well.
335 * Particularly, the 'readdir()' call wants us to return a directory entry offset 335 * Particularly, the 'readdir()' call wants us to return a directory entry offset
336 * which later may be used to continue 'readdir()'ing the directory or to 336 * which later may be used to continue 'readdir()'ing the directory or to
337 * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this 337 * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this
338 * model because directory entries are identified by keys, which may collide. 338 * model because directory entries are identified by keys, which may collide.
339 * 339 *
340 * UBIFS uses directory entry hash value for directory offsets, so 340 * UBIFS uses directory entry hash value for directory offsets, so
341 * 'seekdir()'/'telldir()' may not always work because of possible key 341 * 'seekdir()'/'telldir()' may not always work because of possible key
342 * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work 342 * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
343 * properly by means of saving the full directory entry name in the private field 343 * properly by means of saving the full directory entry name in the private field
344 * of the file description object. 344 * of the file description object.
345 * 345 *
346 * This means that UBIFS cannot support NFS which requires full 346 * This means that UBIFS cannot support NFS which requires full
347 * 'seekdir()'/'telldir()' support. 347 * 'seekdir()'/'telldir()' support.
348 */ 348 */
349 static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) 349 static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
350 { 350 {
351 int err, over = 0; 351 int err, over = 0;
352 struct qstr nm; 352 struct qstr nm;
353 union ubifs_key key; 353 union ubifs_key key;
354 struct ubifs_dent_node *dent; 354 struct ubifs_dent_node *dent;
355 struct inode *dir = file->f_path.dentry->d_inode; 355 struct inode *dir = file->f_path.dentry->d_inode;
356 struct ubifs_info *c = dir->i_sb->s_fs_info; 356 struct ubifs_info *c = dir->i_sb->s_fs_info;
357 357
358 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); 358 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
359 359
360 if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) 360 if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
361 /* 361 /*
362 * The directory was seek'ed to a senseless position or there 362 * The directory was seek'ed to a senseless position or there
363 * are no more entries. 363 * are no more entries.
364 */ 364 */
365 return 0; 365 return 0;
366 366
367 /* File positions 0 and 1 correspond to "." and ".." */ 367 /* File positions 0 and 1 correspond to "." and ".." */
368 if (file->f_pos == 0) { 368 if (file->f_pos == 0) {
369 ubifs_assert(!file->private_data); 369 ubifs_assert(!file->private_data);
370 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); 370 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
371 if (over) 371 if (over)
372 return 0; 372 return 0;
373 file->f_pos = 1; 373 file->f_pos = 1;
374 } 374 }
375 375
376 if (file->f_pos == 1) { 376 if (file->f_pos == 1) {
377 ubifs_assert(!file->private_data); 377 ubifs_assert(!file->private_data);
378 over = filldir(dirent, "..", 2, 1, 378 over = filldir(dirent, "..", 2, 1,
379 parent_ino(file->f_path.dentry), DT_DIR); 379 parent_ino(file->f_path.dentry), DT_DIR);
380 if (over) 380 if (over)
381 return 0; 381 return 0;
382 382
383 /* Find the first entry in TNC and save it */ 383 /* Find the first entry in TNC and save it */
384 lowest_dent_key(c, &key, dir->i_ino); 384 lowest_dent_key(c, &key, dir->i_ino);
385 nm.name = NULL; 385 nm.name = NULL;
386 dent = ubifs_tnc_next_ent(c, &key, &nm); 386 dent = ubifs_tnc_next_ent(c, &key, &nm);
387 if (IS_ERR(dent)) { 387 if (IS_ERR(dent)) {
388 err = PTR_ERR(dent); 388 err = PTR_ERR(dent);
389 goto out; 389 goto out;
390 } 390 }
391 391
392 file->f_pos = key_hash_flash(c, &dent->key); 392 file->f_pos = key_hash_flash(c, &dent->key);
393 file->private_data = dent; 393 file->private_data = dent;
394 } 394 }
395 395
396 dent = file->private_data; 396 dent = file->private_data;
397 if (!dent) { 397 if (!dent) {
398 /* 398 /*
399 * The directory was seek'ed to and is now readdir'ed. 399 * The directory was seek'ed to and is now readdir'ed.
400 * Find the entry corresponding to @file->f_pos or the 400 * Find the entry corresponding to @file->f_pos or the
401 * closest one. 401 * closest one.
402 */ 402 */
403 dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); 403 dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
404 nm.name = NULL; 404 nm.name = NULL;
405 dent = ubifs_tnc_next_ent(c, &key, &nm); 405 dent = ubifs_tnc_next_ent(c, &key, &nm);
406 if (IS_ERR(dent)) { 406 if (IS_ERR(dent)) {
407 err = PTR_ERR(dent); 407 err = PTR_ERR(dent);
408 goto out; 408 goto out;
409 } 409 }
410 file->f_pos = key_hash_flash(c, &dent->key); 410 file->f_pos = key_hash_flash(c, &dent->key);
411 file->private_data = dent; 411 file->private_data = dent;
412 } 412 }
413 413
414 while (1) { 414 while (1) {
415 dbg_gen("feed '%s', ino %llu, new f_pos %#x", 415 dbg_gen("feed '%s', ino %llu, new f_pos %#x",
416 dent->name, (unsigned long long)le64_to_cpu(dent->inum), 416 dent->name, (unsigned long long)le64_to_cpu(dent->inum),
417 key_hash_flash(c, &dent->key)); 417 key_hash_flash(c, &dent->key));
418 ubifs_assert(le64_to_cpu(dent->ch.sqnum) > 418 ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
419 ubifs_inode(dir)->creat_sqnum); 419 ubifs_inode(dir)->creat_sqnum);
420 420
421 nm.len = le16_to_cpu(dent->nlen); 421 nm.len = le16_to_cpu(dent->nlen);
422 over = filldir(dirent, dent->name, nm.len, file->f_pos, 422 over = filldir(dirent, dent->name, nm.len, file->f_pos,
423 le64_to_cpu(dent->inum), 423 le64_to_cpu(dent->inum),
424 vfs_dent_type(dent->type)); 424 vfs_dent_type(dent->type));
425 if (over) 425 if (over)
426 return 0; 426 return 0;
427 427
428 /* Switch to the next entry */ 428 /* Switch to the next entry */
429 key_read(c, &dent->key, &key); 429 key_read(c, &dent->key, &key);
430 nm.name = dent->name; 430 nm.name = dent->name;
431 dent = ubifs_tnc_next_ent(c, &key, &nm); 431 dent = ubifs_tnc_next_ent(c, &key, &nm);
432 if (IS_ERR(dent)) { 432 if (IS_ERR(dent)) {
433 err = PTR_ERR(dent); 433 err = PTR_ERR(dent);
434 goto out; 434 goto out;
435 } 435 }
436 436
437 kfree(file->private_data); 437 kfree(file->private_data);
438 file->f_pos = key_hash_flash(c, &dent->key); 438 file->f_pos = key_hash_flash(c, &dent->key);
439 file->private_data = dent; 439 file->private_data = dent;
440 cond_resched(); 440 cond_resched();
441 } 441 }
442 442
443 out: 443 out:
444 if (err != -ENOENT) { 444 if (err != -ENOENT) {
445 ubifs_err("cannot find next direntry, error %d", err); 445 ubifs_err("cannot find next direntry, error %d", err);
446 return err; 446 return err;
447 } 447 }
448 448
449 kfree(file->private_data); 449 kfree(file->private_data);
450 file->private_data = NULL; 450 file->private_data = NULL;
451 file->f_pos = 2; 451 file->f_pos = 2;
452 return 0; 452 return 0;
453 } 453 }
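
A minimal userspace sketch of the scheme the comment above describes — entries kept in (hash, name) order, a deliberately weak hash that collides, and a cursor that optionally carries the last returned full name. All names and the toy hash are hypothetical illustrations, not UBIFS code; the point is that hash + name resumes iteration exactly, while a bare hash (all that 'seekdir()' can hand us) is ambiguous:

	#include <stdio.h>
	#include <string.h>

	/* Toy direntry: a 1-bit "hash" guarantees collisions. */
	struct toy_dent { unsigned hash; const char *name; };

	static unsigned toy_hash(const char *s) { return strlen(s) & 1; }

	/* Entries kept sorted by (hash, name), as UBIFS keys dents by hash. */
	static struct toy_dent dir[] = {
		{ 0, "bb" }, { 0, "dddd" }, { 1, "a" }, { 1, "ccc" },
	};
	#define NDENTS (sizeof(dir) / sizeof(dir[0]))

	/*
	 * Resume iteration from a saved cursor. With only the hash
	 * (name == NULL) we can merely find the first entry whose hash is
	 * not smaller, which is ambiguous under collisions; with the full
	 * saved name the strictly-next entry is found exactly. This mirrors
	 * keeping the last dent in file->private_data.
	 */
	static struct toy_dent *toy_next(unsigned hash, const char *name)
	{
		for (unsigned i = 0; i < NDENTS; i++) {
			if (dir[i].hash < hash)
				continue;
			if (dir[i].hash > hash)
				return &dir[i];
			if (!name || strcmp(dir[i].name, name) > 0)
				return &dir[i];
		}
		return NULL;
	}

	int main(void)
	{
		/* Exact resume after "bb": hash plus name disambiguates. */
		struct toy_dent *d = toy_next(toy_hash("bb"), "bb");
		printf("after bb: %s\n", d->name);   /* dddd */

		/* Hash-only resume replays the colliding entry. */
		d = toy_next(toy_hash("bb"), NULL);
		printf("hash-only: %s\n", d->name);  /* bb again */
		return 0;
	}
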
454 454
455 /* If a directory is seek'ed, we have to free the saved readdir() state */ 455 /* If a directory is seek'ed, we have to free the saved readdir() state */
456 static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) 456 static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
457 { 457 {
458 kfree(file->private_data); 458 kfree(file->private_data);
459 file->private_data = NULL; 459 file->private_data = NULL;
460 return generic_file_llseek(file, offset, origin); 460 return generic_file_llseek(file, offset, whence);
461 } 461 }
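
For reference, this is the naming the patch converges on: in POSIX and in the lseek(2) man page the third argument has always been called "whence", and its three standard values change how the offset is interpreted. A small userspace sketch (the file path is just an example):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/etc/hostname", O_RDONLY);
		if (fd < 0)
			return 1;

		off_t end = lseek(fd, 0, SEEK_END);   /* offset from end of file */
		off_t cur = lseek(fd, -1, SEEK_CUR);  /* relative to current position */
		off_t set = lseek(fd, 0, SEEK_SET);   /* absolute offset */

		printf("end=%lld cur=%lld set=%lld\n",
		       (long long)end, (long long)cur, (long long)set);
		close(fd);
		return 0;
	}
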
462 462
463 /* Free saved readdir() state when the directory is closed */ 463 /* Free saved readdir() state when the directory is closed */
464 static int ubifs_dir_release(struct inode *dir, struct file *file) 464 static int ubifs_dir_release(struct inode *dir, struct file *file)
465 { 465 {
466 kfree(file->private_data); 466 kfree(file->private_data);
467 file->private_data = NULL; 467 file->private_data = NULL;
468 return 0; 468 return 0;
469 } 469 }
470 470
471 /** 471 /**
472 * lock_2_inodes - a wrapper for locking two UBIFS inodes. 472 * lock_2_inodes - a wrapper for locking two UBIFS inodes.
473 * @inode1: first inode 473 * @inode1: first inode
474 * @inode2: second inode 474 * @inode2: second inode
475 * 475 *
476 * We do not implement any tricks to guarantee strict lock ordering, because 476 * We do not implement any tricks to guarantee strict lock ordering, because
477 * VFS has already done it for us on the @i_mutex. So this is just a simple 477 * VFS has already done it for us on the @i_mutex. So this is just a simple
478 * wrapper function. 478 * wrapper function.
479 */ 479 */
480 static void lock_2_inodes(struct inode *inode1, struct inode *inode2) 480 static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
481 { 481 {
482 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); 482 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
483 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); 483 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
484 } 484 }
485 485
486 /** 486 /**
487 * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. 487 * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
488 * @inode1: first inode 488 * @inode1: first inode
489 * @inode2: second inode 489 * @inode2: second inode
490 */ 490 */
491 static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) 491 static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
492 { 492 {
493 mutex_unlock(&ubifs_inode(inode2)->ui_mutex); 493 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
494 mutex_unlock(&ubifs_inode(inode1)->ui_mutex); 494 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
495 } 495 }
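
The two wrappers above encode one discipline: take both locks in the order the caller hands them over, release in the reverse order, and use distinct lockdep subclasses (WB_MUTEX_1/WB_MUTEX_2) only to tell lockdep that holding two locks of the same class is intentional. A userspace analogy with pthreads — not kernel code, and the struct is hypothetical — compiled with -pthread:

	#include <pthread.h>
	#include <stdio.h>

	struct obj { pthread_mutex_t lock; int val; };

	/* Caller (like the VFS via i_mutex) is trusted to pass a safe order. */
	static void lock_2_objs(struct obj *a, struct obj *b)
	{
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	}

	static void unlock_2_objs(struct obj *a, struct obj *b)
	{
		pthread_mutex_unlock(&b->lock);  /* reverse order of acquisition */
		pthread_mutex_unlock(&a->lock);
	}

	int main(void)
	{
		struct obj dir   = { PTHREAD_MUTEX_INITIALIZER, 0 };
		struct obj inode = { PTHREAD_MUTEX_INITIALIZER, 0 };

		lock_2_objs(&dir, &inode);
		dir.val++;        /* both objects mutated under both locks */
		inode.val++;
		unlock_2_objs(&dir, &inode);

		printf("%d %d\n", dir.val, inode.val);
		return 0;
	}
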
496 496
497 static int ubifs_link(struct dentry *old_dentry, struct inode *dir, 497 static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
498 struct dentry *dentry) 498 struct dentry *dentry)
499 { 499 {
500 struct ubifs_info *c = dir->i_sb->s_fs_info; 500 struct ubifs_info *c = dir->i_sb->s_fs_info;
501 struct inode *inode = old_dentry->d_inode; 501 struct inode *inode = old_dentry->d_inode;
502 struct ubifs_inode *ui = ubifs_inode(inode); 502 struct ubifs_inode *ui = ubifs_inode(inode);
503 struct ubifs_inode *dir_ui = ubifs_inode(dir); 503 struct ubifs_inode *dir_ui = ubifs_inode(dir);
504 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 504 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
505 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, 505 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
506 .dirtied_ino_d = ALIGN(ui->data_len, 8) }; 506 .dirtied_ino_d = ALIGN(ui->data_len, 8) };
507 507
508 /* 508 /*
509 * Budget request settings: new direntry, changing the target inode, 509 * Budget request settings: new direntry, changing the target inode,
510 * changing the parent inode. 510 * changing the parent inode.
511 */ 511 */
512 512
513 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", 513 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
514 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 514 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
515 inode->i_nlink, dir->i_ino); 515 inode->i_nlink, dir->i_ino);
516 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 516 ubifs_assert(mutex_is_locked(&dir->i_mutex));
517 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 517 ubifs_assert(mutex_is_locked(&inode->i_mutex));
518 518
519 err = dbg_check_synced_i_size(c, inode); 519 err = dbg_check_synced_i_size(c, inode);
520 if (err) 520 if (err)
521 return err; 521 return err;
522 522
523 err = ubifs_budget_space(c, &req); 523 err = ubifs_budget_space(c, &req);
524 if (err) 524 if (err)
525 return err; 525 return err;
526 526
527 lock_2_inodes(dir, inode); 527 lock_2_inodes(dir, inode);
528 inc_nlink(inode); 528 inc_nlink(inode);
529 ihold(inode); 529 ihold(inode);
530 inode->i_ctime = ubifs_current_time(inode); 530 inode->i_ctime = ubifs_current_time(inode);
531 dir->i_size += sz_change; 531 dir->i_size += sz_change;
532 dir_ui->ui_size = dir->i_size; 532 dir_ui->ui_size = dir->i_size;
533 dir->i_mtime = dir->i_ctime = inode->i_ctime; 533 dir->i_mtime = dir->i_ctime = inode->i_ctime;
534 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 534 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
535 if (err) 535 if (err)
536 goto out_cancel; 536 goto out_cancel;
537 unlock_2_inodes(dir, inode); 537 unlock_2_inodes(dir, inode);
538 538
539 ubifs_release_budget(c, &req); 539 ubifs_release_budget(c, &req);
540 d_instantiate(dentry, inode); 540 d_instantiate(dentry, inode);
541 return 0; 541 return 0;
542 542
543 out_cancel: 543 out_cancel:
544 dir->i_size -= sz_change; 544 dir->i_size -= sz_change;
545 dir_ui->ui_size = dir->i_size; 545 dir_ui->ui_size = dir->i_size;
546 drop_nlink(inode); 546 drop_nlink(inode);
547 unlock_2_inodes(dir, inode); 547 unlock_2_inodes(dir, inode);
548 ubifs_release_budget(c, &req); 548 ubifs_release_budget(c, &req);
549 iput(inode); 549 iput(inode);
550 return err; 550 return err;
551 } 551 }
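
The budget/cancel dance in ubifs_link() (and in the unlink/rmdir/mkdir/mknod/symlink paths below) follows one shape: reserve worst-case space up front, apply the in-memory changes, try the journal update, and on failure undo the in-memory changes before dropping the reservation. A minimal sketch of that shape under hypothetical names — none of these are the UBIFS API:

	#include <stdio.h>

	struct space   { long free, reserved; };
	struct toy_dir { long size; };

	static int budget(struct space *s, long need)
	{
		if (s->free - s->reserved < need)
			return -1;             /* stands in for -ENOSPC */
		s->reserved += need;
		return 0;
	}

	static void release(struct space *s, long need)
	{
		s->reserved -= need;
	}

	static int toy_link(struct space *s, struct toy_dir *dir, long sz, int fail)
	{
		if (budget(s, sz))             /* 1. reserve worst case first */
			return -1;

		dir->size += sz;               /* 2. apply the in-memory change */
		if (fail) {                    /* 3. journal update "failed" */
			dir->size -= sz;       /*    out_cancel: roll it back */
			release(s, sz);
			return -1;
		}
		release(s, sz);                /* 4. success: drop the reservation */
		return 0;
	}

	int main(void)
	{
		struct space   s = { 4096, 0 };
		struct toy_dir d = { 0 };

		int ok = toy_link(&s, &d, 160, 0);
		printf("ok=%d size=%ld\n", ok, d.size);      /* ok=0 size=160 */
		int bad = toy_link(&s, &d, 160, 1);
		printf("fail=%d size=%ld\n", bad, d.size);   /* fail=-1 size=160 */
		return 0;
	}

Note that ubifs_unlink() and ubifs_rmdir() deliberately bend step 1: on -ENOSPC they proceed unbudgeted, because deleting things frees space and extra space is reserved for deletions.
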
552 552
553 static int ubifs_unlink(struct inode *dir, struct dentry *dentry) 553 static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
554 { 554 {
555 struct ubifs_info *c = dir->i_sb->s_fs_info; 555 struct ubifs_info *c = dir->i_sb->s_fs_info;
556 struct inode *inode = dentry->d_inode; 556 struct inode *inode = dentry->d_inode;
557 struct ubifs_inode *dir_ui = ubifs_inode(dir); 557 struct ubifs_inode *dir_ui = ubifs_inode(dir);
558 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 558 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
559 int err, budgeted = 1; 559 int err, budgeted = 1;
560 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 560 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
561 unsigned int saved_nlink = inode->i_nlink; 561 unsigned int saved_nlink = inode->i_nlink;
562 562
563 /* 563 /*
564 * Budget request settings: deletion direntry, deletion inode (+1 for 564 * Budget request settings: deletion direntry, deletion inode (+1 for
565 * @dirtied_ino), changing the parent directory inode. If budgeting 565 * @dirtied_ino), changing the parent directory inode. If budgeting
566 * fails, go ahead anyway because we have extra space reserved for 566 * fails, go ahead anyway because we have extra space reserved for
567 * deletions. 567 * deletions.
568 */ 568 */
569 569
570 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", 570 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
571 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 571 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
572 inode->i_nlink, dir->i_ino); 572 inode->i_nlink, dir->i_ino);
573 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 573 ubifs_assert(mutex_is_locked(&dir->i_mutex));
574 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 574 ubifs_assert(mutex_is_locked(&inode->i_mutex));
575 err = dbg_check_synced_i_size(c, inode); 575 err = dbg_check_synced_i_size(c, inode);
576 if (err) 576 if (err)
577 return err; 577 return err;
578 578
579 err = ubifs_budget_space(c, &req); 579 err = ubifs_budget_space(c, &req);
580 if (err) { 580 if (err) {
581 if (err != -ENOSPC) 581 if (err != -ENOSPC)
582 return err; 582 return err;
583 budgeted = 0; 583 budgeted = 0;
584 } 584 }
585 585
586 lock_2_inodes(dir, inode); 586 lock_2_inodes(dir, inode);
587 inode->i_ctime = ubifs_current_time(dir); 587 inode->i_ctime = ubifs_current_time(dir);
588 drop_nlink(inode); 588 drop_nlink(inode);
589 dir->i_size -= sz_change; 589 dir->i_size -= sz_change;
590 dir_ui->ui_size = dir->i_size; 590 dir_ui->ui_size = dir->i_size;
591 dir->i_mtime = dir->i_ctime = inode->i_ctime; 591 dir->i_mtime = dir->i_ctime = inode->i_ctime;
592 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); 592 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
593 if (err) 593 if (err)
594 goto out_cancel; 594 goto out_cancel;
595 unlock_2_inodes(dir, inode); 595 unlock_2_inodes(dir, inode);
596 596
597 if (budgeted) 597 if (budgeted)
598 ubifs_release_budget(c, &req); 598 ubifs_release_budget(c, &req);
599 else { 599 else {
600 /* We've deleted something - clean the "no space" flags */ 600 /* We've deleted something - clean the "no space" flags */
601 c->bi.nospace = c->bi.nospace_rp = 0; 601 c->bi.nospace = c->bi.nospace_rp = 0;
602 smp_wmb(); 602 smp_wmb();
603 } 603 }
604 return 0; 604 return 0;
605 605
606 out_cancel: 606 out_cancel:
607 dir->i_size += sz_change; 607 dir->i_size += sz_change;
608 dir_ui->ui_size = dir->i_size; 608 dir_ui->ui_size = dir->i_size;
609 set_nlink(inode, saved_nlink); 609 set_nlink(inode, saved_nlink);
610 unlock_2_inodes(dir, inode); 610 unlock_2_inodes(dir, inode);
611 if (budgeted) 611 if (budgeted)
612 ubifs_release_budget(c, &req); 612 ubifs_release_budget(c, &req);
613 return err; 613 return err;
614 } 614 }
615 615
616 /** 616 /**
617 * check_dir_empty - check if a directory is empty or not. 617 * check_dir_empty - check if a directory is empty or not.
618 * @c: UBIFS file-system description object 618 * @c: UBIFS file-system description object
619 * @dir: VFS inode object of the directory to check 619 * @dir: VFS inode object of the directory to check
620 * 620 *
621 * This function checks if directory @dir is empty. Returns zero if the 621 * This function checks if directory @dir is empty. Returns zero if the
622 * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes 622 * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
623 * in case of errors. 623 * in case of errors.
624 */ 624 */
625 static int check_dir_empty(struct ubifs_info *c, struct inode *dir) 625 static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
626 { 626 {
627 struct qstr nm = { .name = NULL }; 627 struct qstr nm = { .name = NULL };
628 struct ubifs_dent_node *dent; 628 struct ubifs_dent_node *dent;
629 union ubifs_key key; 629 union ubifs_key key;
630 int err; 630 int err;
631 631
632 lowest_dent_key(c, &key, dir->i_ino); 632 lowest_dent_key(c, &key, dir->i_ino);
633 dent = ubifs_tnc_next_ent(c, &key, &nm); 633 dent = ubifs_tnc_next_ent(c, &key, &nm);
634 if (IS_ERR(dent)) { 634 if (IS_ERR(dent)) {
635 err = PTR_ERR(dent); 635 err = PTR_ERR(dent);
636 if (err == -ENOENT) 636 if (err == -ENOENT)
637 err = 0; 637 err = 0;
638 } else { 638 } else {
639 kfree(dent); 639 kfree(dent);
640 err = -ENOTEMPTY; 640 err = -ENOTEMPTY;
641 } 641 }
642 return err; 642 return err;
643 } 643 }
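
check_dir_empty() probes for the lowest possible entry key and maps the outcome: "no entry" means empty, "found one" means not empty, anything else is a real error. A toy version of that error mapping over an array — lookup_first() is a hypothetical stand-in for ubifs_tnc_next_ent():

	#include <errno.h>
	#include <stdio.h>

	static const char *lookup_first(const char **entries, int n, int *err)
	{
		if (n < 0) {             /* simulate an I/O failure */
			*err = -EIO;
			return NULL;
		}
		if (n == 0) {
			*err = -ENOENT;  /* no entries at all */
			return NULL;
		}
		*err = 0;
		return entries[0];
	}

	static int dir_is_empty(const char **entries, int n)
	{
		int err;
		const char *first = lookup_first(entries, n, &err);

		if (!first)
			return err == -ENOENT ? 0 : err;  /* empty vs. real error */
		return -ENOTEMPTY;
	}

	int main(void)
	{
		const char *e[] = { "a" };

		printf("%d %d %d\n",
		       dir_is_empty(e, 1),    /* -ENOTEMPTY */
		       dir_is_empty(e, 0),    /*  0: empty  */
		       dir_is_empty(e, -1));  /* -EIO       */
		return 0;
	}
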
644 644
645 static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) 645 static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
646 { 646 {
647 struct ubifs_info *c = dir->i_sb->s_fs_info; 647 struct ubifs_info *c = dir->i_sb->s_fs_info;
648 struct inode *inode = dentry->d_inode; 648 struct inode *inode = dentry->d_inode;
649 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 649 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
650 int err, budgeted = 1; 650 int err, budgeted = 1;
651 struct ubifs_inode *dir_ui = ubifs_inode(dir); 651 struct ubifs_inode *dir_ui = ubifs_inode(dir);
652 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 652 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
653 653
654 /* 654 /*
655 * Budget request settings: deletion direntry, deletion inode and 655 * Budget request settings: deletion direntry, deletion inode and
656 * changing the parent inode. If budgeting fails, go ahead anyway 656 * changing the parent inode. If budgeting fails, go ahead anyway
657 * because we have extra space reserved for deletions. 657 * because we have extra space reserved for deletions.
658 */ 658 */
659 659
660 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, 660 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
661 dentry->d_name.name, inode->i_ino, dir->i_ino); 661 dentry->d_name.name, inode->i_ino, dir->i_ino);
662 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 662 ubifs_assert(mutex_is_locked(&dir->i_mutex));
663 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 663 ubifs_assert(mutex_is_locked(&inode->i_mutex));
664 err = check_dir_empty(c, dentry->d_inode); 664 err = check_dir_empty(c, dentry->d_inode);
665 if (err) 665 if (err)
666 return err; 666 return err;
667 667
668 err = ubifs_budget_space(c, &req); 668 err = ubifs_budget_space(c, &req);
669 if (err) { 669 if (err) {
670 if (err != -ENOSPC) 670 if (err != -ENOSPC)
671 return err; 671 return err;
672 budgeted = 0; 672 budgeted = 0;
673 } 673 }
674 674
675 lock_2_inodes(dir, inode); 675 lock_2_inodes(dir, inode);
676 inode->i_ctime = ubifs_current_time(dir); 676 inode->i_ctime = ubifs_current_time(dir);
677 clear_nlink(inode); 677 clear_nlink(inode);
678 drop_nlink(dir); 678 drop_nlink(dir);
679 dir->i_size -= sz_change; 679 dir->i_size -= sz_change;
680 dir_ui->ui_size = dir->i_size; 680 dir_ui->ui_size = dir->i_size;
681 dir->i_mtime = dir->i_ctime = inode->i_ctime; 681 dir->i_mtime = dir->i_ctime = inode->i_ctime;
682 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); 682 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
683 if (err) 683 if (err)
684 goto out_cancel; 684 goto out_cancel;
685 unlock_2_inodes(dir, inode); 685 unlock_2_inodes(dir, inode);
686 686
687 if (budgeted) 687 if (budgeted)
688 ubifs_release_budget(c, &req); 688 ubifs_release_budget(c, &req);
689 else { 689 else {
690 /* We've deleted something - clean the "no space" flags */ 690 /* We've deleted something - clean the "no space" flags */
691 c->bi.nospace = c->bi.nospace_rp = 0; 691 c->bi.nospace = c->bi.nospace_rp = 0;
692 smp_wmb(); 692 smp_wmb();
693 } 693 }
694 return 0; 694 return 0;
695 695
696 out_cancel: 696 out_cancel:
697 dir->i_size += sz_change; 697 dir->i_size += sz_change;
698 dir_ui->ui_size = dir->i_size; 698 dir_ui->ui_size = dir->i_size;
699 inc_nlink(dir); 699 inc_nlink(dir);
700 set_nlink(inode, 2); 700 set_nlink(inode, 2);
701 unlock_2_inodes(dir, inode); 701 unlock_2_inodes(dir, inode);
702 if (budgeted) 702 if (budgeted)
703 ubifs_release_budget(c, &req); 703 ubifs_release_budget(c, &req);
704 return err; 704 return err;
705 } 705 }
706 706
707 static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 707 static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
708 { 708 {
709 struct inode *inode; 709 struct inode *inode;
710 struct ubifs_inode *dir_ui = ubifs_inode(dir); 710 struct ubifs_inode *dir_ui = ubifs_inode(dir);
711 struct ubifs_info *c = dir->i_sb->s_fs_info; 711 struct ubifs_info *c = dir->i_sb->s_fs_info;
712 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 712 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
713 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; 713 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
714 714
715 /* 715 /*
716 * Budget request settings: new inode, new direntry and changing parent 716 * Budget request settings: new inode, new direntry and changing parent
717 * directory inode. 717 * directory inode.
718 */ 718 */
719 719
720 dbg_gen("dent '%.*s', mode %#hx in dir ino %lu", 720 dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
721 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); 721 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
722 722
723 err = ubifs_budget_space(c, &req); 723 err = ubifs_budget_space(c, &req);
724 if (err) 724 if (err)
725 return err; 725 return err;
726 726
727 inode = ubifs_new_inode(c, dir, S_IFDIR | mode); 727 inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
728 if (IS_ERR(inode)) { 728 if (IS_ERR(inode)) {
729 err = PTR_ERR(inode); 729 err = PTR_ERR(inode);
730 goto out_budg; 730 goto out_budg;
731 } 731 }
732 732
733 mutex_lock(&dir_ui->ui_mutex); 733 mutex_lock(&dir_ui->ui_mutex);
734 insert_inode_hash(inode); 734 insert_inode_hash(inode);
735 inc_nlink(inode); 735 inc_nlink(inode);
736 inc_nlink(dir); 736 inc_nlink(dir);
737 dir->i_size += sz_change; 737 dir->i_size += sz_change;
738 dir_ui->ui_size = dir->i_size; 738 dir_ui->ui_size = dir->i_size;
739 dir->i_mtime = dir->i_ctime = inode->i_ctime; 739 dir->i_mtime = dir->i_ctime = inode->i_ctime;
740 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 740 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
741 if (err) { 741 if (err) {
742 ubifs_err("cannot create directory, error %d", err); 742 ubifs_err("cannot create directory, error %d", err);
743 goto out_cancel; 743 goto out_cancel;
744 } 744 }
745 mutex_unlock(&dir_ui->ui_mutex); 745 mutex_unlock(&dir_ui->ui_mutex);
746 746
747 ubifs_release_budget(c, &req); 747 ubifs_release_budget(c, &req);
748 d_instantiate(dentry, inode); 748 d_instantiate(dentry, inode);
749 return 0; 749 return 0;
750 750
751 out_cancel: 751 out_cancel:
752 dir->i_size -= sz_change; 752 dir->i_size -= sz_change;
753 dir_ui->ui_size = dir->i_size; 753 dir_ui->ui_size = dir->i_size;
754 drop_nlink(dir); 754 drop_nlink(dir);
755 mutex_unlock(&dir_ui->ui_mutex); 755 mutex_unlock(&dir_ui->ui_mutex);
756 make_bad_inode(inode); 756 make_bad_inode(inode);
757 iput(inode); 757 iput(inode);
758 out_budg: 758 out_budg:
759 ubifs_release_budget(c, &req); 759 ubifs_release_budget(c, &req);
760 return err; 760 return err;
761 } 761 }
762 762
763 static int ubifs_mknod(struct inode *dir, struct dentry *dentry, 763 static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
764 umode_t mode, dev_t rdev) 764 umode_t mode, dev_t rdev)
765 { 765 {
766 struct inode *inode; 766 struct inode *inode;
767 struct ubifs_inode *ui; 767 struct ubifs_inode *ui;
768 struct ubifs_inode *dir_ui = ubifs_inode(dir); 768 struct ubifs_inode *dir_ui = ubifs_inode(dir);
769 struct ubifs_info *c = dir->i_sb->s_fs_info; 769 struct ubifs_info *c = dir->i_sb->s_fs_info;
770 union ubifs_dev_desc *dev = NULL; 770 union ubifs_dev_desc *dev = NULL;
771 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 771 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
772 int err, devlen = 0; 772 int err, devlen = 0;
773 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 773 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
774 .new_ino_d = ALIGN(devlen, 8), 774 .new_ino_d = ALIGN(devlen, 8),
775 .dirtied_ino = 1 }; 775 .dirtied_ino = 1 };
776 776
777 /* 777 /*
778 * Budget request settings: new inode, new direntry and changing parent 778 * Budget request settings: new inode, new direntry and changing parent
779 * directory inode. 779 * directory inode.
780 */ 780 */
781 781
782 dbg_gen("dent '%.*s' in dir ino %lu", 782 dbg_gen("dent '%.*s' in dir ino %lu",
783 dentry->d_name.len, dentry->d_name.name, dir->i_ino); 783 dentry->d_name.len, dentry->d_name.name, dir->i_ino);
784 784
785 if (!new_valid_dev(rdev)) 785 if (!new_valid_dev(rdev))
786 return -EINVAL; 786 return -EINVAL;
787 787
788 if (S_ISBLK(mode) || S_ISCHR(mode)) { 788 if (S_ISBLK(mode) || S_ISCHR(mode)) {
789 dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); 789 dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
790 if (!dev) 790 if (!dev)
791 return -ENOMEM; 791 return -ENOMEM;
792 devlen = ubifs_encode_dev(dev, rdev); 792 devlen = ubifs_encode_dev(dev, rdev);
793 } 793 }
794 794
795 err = ubifs_budget_space(c, &req); 795 err = ubifs_budget_space(c, &req);
796 if (err) { 796 if (err) {
797 kfree(dev); 797 kfree(dev);
798 return err; 798 return err;
799 } 799 }
800 800
801 inode = ubifs_new_inode(c, dir, mode); 801 inode = ubifs_new_inode(c, dir, mode);
802 if (IS_ERR(inode)) { 802 if (IS_ERR(inode)) {
803 kfree(dev); 803 kfree(dev);
804 err = PTR_ERR(inode); 804 err = PTR_ERR(inode);
805 goto out_budg; 805 goto out_budg;
806 } 806 }
807 807
808 init_special_inode(inode, inode->i_mode, rdev); 808 init_special_inode(inode, inode->i_mode, rdev);
809 inode->i_size = ubifs_inode(inode)->ui_size = devlen; 809 inode->i_size = ubifs_inode(inode)->ui_size = devlen;
810 ui = ubifs_inode(inode); 810 ui = ubifs_inode(inode);
811 ui->data = dev; 811 ui->data = dev;
812 ui->data_len = devlen; 812 ui->data_len = devlen;
813 813
814 mutex_lock(&dir_ui->ui_mutex); 814 mutex_lock(&dir_ui->ui_mutex);
815 dir->i_size += sz_change; 815 dir->i_size += sz_change;
816 dir_ui->ui_size = dir->i_size; 816 dir_ui->ui_size = dir->i_size;
817 dir->i_mtime = dir->i_ctime = inode->i_ctime; 817 dir->i_mtime = dir->i_ctime = inode->i_ctime;
818 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 818 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
819 if (err) 819 if (err)
820 goto out_cancel; 820 goto out_cancel;
821 mutex_unlock(&dir_ui->ui_mutex); 821 mutex_unlock(&dir_ui->ui_mutex);
822 822
823 ubifs_release_budget(c, &req); 823 ubifs_release_budget(c, &req);
824 insert_inode_hash(inode); 824 insert_inode_hash(inode);
825 d_instantiate(dentry, inode); 825 d_instantiate(dentry, inode);
826 return 0; 826 return 0;
827 827
828 out_cancel: 828 out_cancel:
829 dir->i_size -= sz_change; 829 dir->i_size -= sz_change;
830 dir_ui->ui_size = dir->i_size; 830 dir_ui->ui_size = dir->i_size;
831 mutex_unlock(&dir_ui->ui_mutex); 831 mutex_unlock(&dir_ui->ui_mutex);
832 make_bad_inode(inode); 832 make_bad_inode(inode);
833 iput(inode); 833 iput(inode);
834 out_budg: 834 out_budg:
835 ubifs_release_budget(c, &req); 835 ubifs_release_budget(c, &req);
836 return err; 836 return err;
837 } 837 }
838 838
839 static int ubifs_symlink(struct inode *dir, struct dentry *dentry, 839 static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
840 const char *symname) 840 const char *symname)
841 { 841 {
842 struct inode *inode; 842 struct inode *inode;
843 struct ubifs_inode *ui; 843 struct ubifs_inode *ui;
844 struct ubifs_inode *dir_ui = ubifs_inode(dir); 844 struct ubifs_inode *dir_ui = ubifs_inode(dir);
845 struct ubifs_info *c = dir->i_sb->s_fs_info; 845 struct ubifs_info *c = dir->i_sb->s_fs_info;
846 int err, len = strlen(symname); 846 int err, len = strlen(symname);
847 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 847 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
848 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 848 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
849 .new_ino_d = ALIGN(len, 8), 849 .new_ino_d = ALIGN(len, 8),
850 .dirtied_ino = 1 }; 850 .dirtied_ino = 1 };
851 851
852 /* 852 /*
853 * Budget request settings: new inode, new direntry and changing parent 853 * Budget request settings: new inode, new direntry and changing parent
854 * directory inode. 854 * directory inode.
855 */ 855 */
856 856
857 dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len, 857 dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
858 dentry->d_name.name, symname, dir->i_ino); 858 dentry->d_name.name, symname, dir->i_ino);
859 859
860 if (len > UBIFS_MAX_INO_DATA) 860 if (len > UBIFS_MAX_INO_DATA)
861 return -ENAMETOOLONG; 861 return -ENAMETOOLONG;
862 862
863 err = ubifs_budget_space(c, &req); 863 err = ubifs_budget_space(c, &req);
864 if (err) 864 if (err)
865 return err; 865 return err;
866 866
867 inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); 867 inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
868 if (IS_ERR(inode)) { 868 if (IS_ERR(inode)) {
869 err = PTR_ERR(inode); 869 err = PTR_ERR(inode);
870 goto out_budg; 870 goto out_budg;
871 } 871 }
872 872
873 ui = ubifs_inode(inode); 873 ui = ubifs_inode(inode);
874 ui->data = kmalloc(len + 1, GFP_NOFS); 874 ui->data = kmalloc(len + 1, GFP_NOFS);
875 if (!ui->data) { 875 if (!ui->data) {
876 err = -ENOMEM; 876 err = -ENOMEM;
877 goto out_inode; 877 goto out_inode;
878 } 878 }
879 879
880 memcpy(ui->data, symname, len); 880 memcpy(ui->data, symname, len);
881 ((char *)ui->data)[len] = '\0'; 881 ((char *)ui->data)[len] = '\0';
882 /* 882 /*
883 * The terminating zero byte is not written to the flash media and it 883 * The terminating zero byte is not written to the flash media and it
884 * is put just to make later in-memory string processing simpler. Thus, 884 * is put just to make later in-memory string processing simpler. Thus,
885 * data length is @len, not @len + %1. 885 * data length is @len, not @len + %1.
886 */ 886 */
887 ui->data_len = len; 887 ui->data_len = len;
888 inode->i_size = ubifs_inode(inode)->ui_size = len; 888 inode->i_size = ubifs_inode(inode)->ui_size = len;
889 889
890 mutex_lock(&dir_ui->ui_mutex); 890 mutex_lock(&dir_ui->ui_mutex);
891 dir->i_size += sz_change; 891 dir->i_size += sz_change;
892 dir_ui->ui_size = dir->i_size; 892 dir_ui->ui_size = dir->i_size;
893 dir->i_mtime = dir->i_ctime = inode->i_ctime; 893 dir->i_mtime = dir->i_ctime = inode->i_ctime;
894 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 894 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
895 if (err) 895 if (err)
896 goto out_cancel; 896 goto out_cancel;
897 mutex_unlock(&dir_ui->ui_mutex); 897 mutex_unlock(&dir_ui->ui_mutex);
898 898
899 ubifs_release_budget(c, &req); 899 ubifs_release_budget(c, &req);
900 insert_inode_hash(inode); 900 insert_inode_hash(inode);
901 d_instantiate(dentry, inode); 901 d_instantiate(dentry, inode);
902 return 0; 902 return 0;
903 903
904 out_cancel: 904 out_cancel:
905 dir->i_size -= sz_change; 905 dir->i_size -= sz_change;
906 dir_ui->ui_size = dir->i_size; 906 dir_ui->ui_size = dir->i_size;
907 mutex_unlock(&dir_ui->ui_mutex); 907 mutex_unlock(&dir_ui->ui_mutex);
908 out_inode: 908 out_inode:
909 make_bad_inode(inode); 909 make_bad_inode(inode);
910 iput(inode); 910 iput(inode);
911 out_budg: 911 out_budg:
912 ubifs_release_budget(c, &req); 912 ubifs_release_budget(c, &req);
913 return err; 913 return err;
914 } 914 }
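
The symlink-target convention above — allocate len + 1 bytes and NUL-terminate so in-memory code can treat the target as a C string, but record only len as the data length because the terminator never reaches the media — can be sketched in isolation (the target string is just an example):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		const char *symname = "/some/target";   /* example target */
		size_t len = strlen(symname);

		char *data = malloc(len + 1);
		if (!data)
			return 1;
		memcpy(data, symname, len);
		data[len] = '\0';        /* convenience byte, not part of the data */

		size_t data_len = len;   /* what would be written to flash */
		printf("in-memory \"%s\", on-media %zu bytes\n", data, data_len);
		free(data);
		return 0;
	}
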
915 915
916 /** 916 /**
917 * lock_3_inodes - a wrapper for locking three UBIFS inodes. 917 * lock_3_inodes - a wrapper for locking three UBIFS inodes.
918 * @inode1: first inode 918 * @inode1: first inode
919 * @inode2: second inode 919 * @inode2: second inode
920 * @inode3: third inode 920 * @inode3: third inode
921 * 921 *
922 * This function is used for 'ubifs_rename()' and @inode1 may be the same as 922 * This function is used for 'ubifs_rename()' and @inode1 may be the same as
923 * @inode2 whereas @inode3 may be %NULL. 923 * @inode2 whereas @inode3 may be %NULL.
924 * 924 *
925 * We do not implement any tricks to guarantee strict lock ordering, because 925 * We do not implement any tricks to guarantee strict lock ordering, because
926 * VFS has already done it for us on the @i_mutex. So this is just a simple 926 * VFS has already done it for us on the @i_mutex. So this is just a simple
927 * wrapper function. 927 * wrapper function.
928 */ 928 */
929 static void lock_3_inodes(struct inode *inode1, struct inode *inode2, 929 static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
930 struct inode *inode3) 930 struct inode *inode3)
931 { 931 {
932 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); 932 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
933 if (inode2 != inode1) 933 if (inode2 != inode1)
934 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); 934 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
935 if (inode3) 935 if (inode3)
936 mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); 936 mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
937 } 937 }
938 938
939 /** 939 /**
940 * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. 940 * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
941 * @inode1: first inode 941 * @inode1: first inode
942 * @inode2: second inode 942 * @inode2: second inode
943 * @inode3: third inode 943 * @inode3: third inode
944 */ 944 */
945 static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, 945 static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
946 struct inode *inode3) 946 struct inode *inode3)
947 { 947 {
948 if (inode3) 948 if (inode3)
949 mutex_unlock(&ubifs_inode(inode3)->ui_mutex); 949 mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
950 if (inode1 != inode2) 950 if (inode1 != inode2)
951 mutex_unlock(&ubifs_inode(inode2)->ui_mutex); 951 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
952 mutex_unlock(&ubifs_inode(inode1)->ui_mutex); 952 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
953 } 953 }
954 954
955 static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, 955 static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
956 struct inode *new_dir, struct dentry *new_dentry) 956 struct inode *new_dir, struct dentry *new_dentry)
957 { 957 {
958 struct ubifs_info *c = old_dir->i_sb->s_fs_info; 958 struct ubifs_info *c = old_dir->i_sb->s_fs_info;
959 struct inode *old_inode = old_dentry->d_inode; 959 struct inode *old_inode = old_dentry->d_inode;
960 struct inode *new_inode = new_dentry->d_inode; 960 struct inode *new_inode = new_dentry->d_inode;
961 struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); 961 struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
962 int err, release, sync = 0, move = (new_dir != old_dir); 962 int err, release, sync = 0, move = (new_dir != old_dir);
963 int is_dir = S_ISDIR(old_inode->i_mode); 963 int is_dir = S_ISDIR(old_inode->i_mode);
964 int unlink = !!new_inode; 964 int unlink = !!new_inode;
965 int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); 965 int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
966 int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); 966 int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
967 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, 967 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
968 .dirtied_ino = 3 }; 968 .dirtied_ino = 3 };
969 struct ubifs_budget_req ino_req = { .dirtied_ino = 1, 969 struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
970 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 970 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
971 struct timespec time; 971 struct timespec time;
972 unsigned int uninitialized_var(saved_nlink); 972 unsigned int uninitialized_var(saved_nlink);
973 973
974 /* 974 /*
975 * Budget request settings: deletion direntry, new direntry, removing 975 * Budget request settings: deletion direntry, new direntry, removing
976 * the old inode, and changing old and new parent directory inodes. 976 * the old inode, and changing old and new parent directory inodes.
977 * 977 *
978 * However, this operation also marks the target inode as dirty and 978 * However, this operation also marks the target inode as dirty and
979 * does not write it, so we allocate budget for the target inode 979 * does not write it, so we allocate budget for the target inode
980 * separately. 980 * separately.
981 */ 981 */
982 982
983 dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu", 983 dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu",
984 old_dentry->d_name.len, old_dentry->d_name.name, 984 old_dentry->d_name.len, old_dentry->d_name.name,
985 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, 985 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
986 new_dentry->d_name.name, new_dir->i_ino); 986 new_dentry->d_name.name, new_dir->i_ino);
987 ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); 987 ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
988 ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); 988 ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
989 if (unlink) 989 if (unlink)
990 ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); 990 ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
991 991
992 992
993 if (unlink && is_dir) { 993 if (unlink && is_dir) {
994 err = check_dir_empty(c, new_inode); 994 err = check_dir_empty(c, new_inode);
995 if (err) 995 if (err)
996 return err; 996 return err;
997 } 997 }
998 998
999 err = ubifs_budget_space(c, &req); 999 err = ubifs_budget_space(c, &req);
1000 if (err) 1000 if (err)
1001 return err; 1001 return err;
1002 err = ubifs_budget_space(c, &ino_req); 1002 err = ubifs_budget_space(c, &ino_req);
1003 if (err) { 1003 if (err) {
1004 ubifs_release_budget(c, &req); 1004 ubifs_release_budget(c, &req);
1005 return err; 1005 return err;
1006 } 1006 }
1007 1007
1008 lock_3_inodes(old_dir, new_dir, new_inode); 1008 lock_3_inodes(old_dir, new_dir, new_inode);
1009 1009
1010 /* 1010 /*
1011 * Like most other Unix systems, set the @i_ctime for inodes on a 1011 * Like most other Unix systems, set the @i_ctime for inodes on a
1012 * rename. 1012 * rename.
1013 */ 1013 */
1014 time = ubifs_current_time(old_dir); 1014 time = ubifs_current_time(old_dir);
1015 old_inode->i_ctime = time; 1015 old_inode->i_ctime = time;
1016 1016
1017 /* We must adjust parent link count when renaming directories */ 1017 /* We must adjust parent link count when renaming directories */
1018 if (is_dir) { 1018 if (is_dir) {
1019 if (move) { 1019 if (move) {
1020 /* 1020 /*
1021 * @old_dir loses a link because we are moving 1021 * @old_dir loses a link because we are moving
1022 * @old_inode to a different directory. 1022 * @old_inode to a different directory.
1023 */ 1023 */
1024 drop_nlink(old_dir); 1024 drop_nlink(old_dir);
1025 /* 1025 /*
1026 * @new_dir only gains a link if we are not also 1026 * @new_dir only gains a link if we are not also
1027 * overwriting an existing directory. 1027 * overwriting an existing directory.
1028 */ 1028 */
1029 if (!unlink) 1029 if (!unlink)
1030 inc_nlink(new_dir); 1030 inc_nlink(new_dir);
1031 } else { 1031 } else {
1032 /* 1032 /*
1033 * @old_inode is not moving to a different directory, 1033 * @old_inode is not moving to a different directory,
1034 * but @old_dir still loses a link if we are 1034 * but @old_dir still loses a link if we are
1035 * overwriting an existing directory. 1035 * overwriting an existing directory.
1036 */ 1036 */
1037 if (unlink) 1037 if (unlink)
1038 drop_nlink(old_dir); 1038 drop_nlink(old_dir);
1039 } 1039 }
1040 } 1040 }
1041 1041
1042 old_dir->i_size -= old_sz; 1042 old_dir->i_size -= old_sz;
1043 ubifs_inode(old_dir)->ui_size = old_dir->i_size; 1043 ubifs_inode(old_dir)->ui_size = old_dir->i_size;
1044 old_dir->i_mtime = old_dir->i_ctime = time; 1044 old_dir->i_mtime = old_dir->i_ctime = time;
1045 new_dir->i_mtime = new_dir->i_ctime = time; 1045 new_dir->i_mtime = new_dir->i_ctime = time;
1046 1046
1047 /* 1047 /*
1048 * And finally, if we unlinked a direntry which happened to have the 1048 * And finally, if we unlinked a direntry which happened to have the
1049 * same name as the moved direntry, we have to decrement @i_nlink of 1049 * same name as the moved direntry, we have to decrement @i_nlink of
1050 * the unlinked inode and change its ctime. 1050 * the unlinked inode and change its ctime.
1051 */ 1051 */
1052 if (unlink) { 1052 if (unlink) {
1053 /* 1053 /*
1054 * Directories cannot have hard-links, so if this is a 1054 * Directories cannot have hard-links, so if this is a
1055 * directory, just clear @i_nlink. 1055 * directory, just clear @i_nlink.
1056 */ 1056 */
1057 saved_nlink = new_inode->i_nlink; 1057 saved_nlink = new_inode->i_nlink;
1058 if (is_dir) 1058 if (is_dir)
1059 clear_nlink(new_inode); 1059 clear_nlink(new_inode);
1060 else 1060 else
1061 drop_nlink(new_inode); 1061 drop_nlink(new_inode);
1062 new_inode->i_ctime = time; 1062 new_inode->i_ctime = time;
1063 } else { 1063 } else {
1064 new_dir->i_size += new_sz; 1064 new_dir->i_size += new_sz;
1065 ubifs_inode(new_dir)->ui_size = new_dir->i_size; 1065 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
1066 } 1066 }
1067 1067
1068 /* 1068 /*
1069 * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode 1069 * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode
1070 * is dirty, because this will be done later on at the end of 1070 * is dirty, because this will be done later on at the end of
1071 * 'ubifs_rename()'. 1071 * 'ubifs_rename()'.
1072 */ 1072 */
1073 if (IS_SYNC(old_inode)) { 1073 if (IS_SYNC(old_inode)) {
1074 sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); 1074 sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
1075 if (unlink && IS_SYNC(new_inode)) 1075 if (unlink && IS_SYNC(new_inode))
1076 sync = 1; 1076 sync = 1;
1077 } 1077 }
1078 err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, 1078 err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry,
1079 sync); 1079 sync);
1080 if (err) 1080 if (err)
1081 goto out_cancel; 1081 goto out_cancel;
1082 1082
1083 unlock_3_inodes(old_dir, new_dir, new_inode); 1083 unlock_3_inodes(old_dir, new_dir, new_inode);
1084 ubifs_release_budget(c, &req); 1084 ubifs_release_budget(c, &req);
1085 1085
1086 mutex_lock(&old_inode_ui->ui_mutex); 1086 mutex_lock(&old_inode_ui->ui_mutex);
1087 release = old_inode_ui->dirty; 1087 release = old_inode_ui->dirty;
1088 mark_inode_dirty_sync(old_inode); 1088 mark_inode_dirty_sync(old_inode);
1089 mutex_unlock(&old_inode_ui->ui_mutex); 1089 mutex_unlock(&old_inode_ui->ui_mutex);
1090 1090
1091 if (release) 1091 if (release)
1092 ubifs_release_budget(c, &ino_req); 1092 ubifs_release_budget(c, &ino_req);
1093 if (IS_SYNC(old_inode)) 1093 if (IS_SYNC(old_inode))
1094 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); 1094 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
1095 return err; 1095 return err;
1096 1096
1097 out_cancel: 1097 out_cancel:
1098 if (unlink) { 1098 if (unlink) {
1099 set_nlink(new_inode, saved_nlink); 1099 set_nlink(new_inode, saved_nlink);
1100 } else { 1100 } else {
1101 new_dir->i_size -= new_sz; 1101 new_dir->i_size -= new_sz;
1102 ubifs_inode(new_dir)->ui_size = new_dir->i_size; 1102 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
1103 } 1103 }
1104 old_dir->i_size += old_sz; 1104 old_dir->i_size += old_sz;
1105 ubifs_inode(old_dir)->ui_size = old_dir->i_size; 1105 ubifs_inode(old_dir)->ui_size = old_dir->i_size;
1106 if (is_dir) { 1106 if (is_dir) {
1107 if (move) { 1107 if (move) {
1108 inc_nlink(old_dir); 1108 inc_nlink(old_dir);
1109 if (!unlink) 1109 if (!unlink)
1110 drop_nlink(new_dir); 1110 drop_nlink(new_dir);
1111 } else { 1111 } else {
1112 if (unlink) 1112 if (unlink)
1113 inc_nlink(old_dir); 1113 inc_nlink(old_dir);
1114 } 1114 }
1115 } 1115 }
1116 unlock_3_inodes(old_dir, new_dir, new_inode); 1116 unlock_3_inodes(old_dir, new_dir, new_inode);
1117 ubifs_release_budget(c, &ino_req); 1117 ubifs_release_budget(c, &ino_req);
1118 ubifs_release_budget(c, &req); 1118 ubifs_release_budget(c, &req);
1119 return err; 1119 return err;
1120 } 1120 }
1121 1121
1122 int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1122 int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1123 struct kstat *stat) 1123 struct kstat *stat)
1124 { 1124 {
1125 loff_t size; 1125 loff_t size;
1126 struct inode *inode = dentry->d_inode; 1126 struct inode *inode = dentry->d_inode;
1127 struct ubifs_inode *ui = ubifs_inode(inode); 1127 struct ubifs_inode *ui = ubifs_inode(inode);
1128 1128
1129 mutex_lock(&ui->ui_mutex); 1129 mutex_lock(&ui->ui_mutex);
1130 generic_fillattr(inode, stat); 1130 generic_fillattr(inode, stat);
1131 stat->blksize = UBIFS_BLOCK_SIZE; 1131 stat->blksize = UBIFS_BLOCK_SIZE;
1132 stat->size = ui->ui_size; 1132 stat->size = ui->ui_size;
1133 1133
1134 /* 1134 /*
1135 * Unfortunately, the 'stat()' system call was designed for block 1135 * Unfortunately, the 'stat()' system call was designed for block
1136 * device based file systems, and it is not appropriate for UBIFS, 1136 * device based file systems, and it is not appropriate for UBIFS,
1137 * because UBIFS does not have a notion of a "block". For example, it is 1137 * because UBIFS does not have a notion of a "block". For example, it is
1138 * difficult to tell how many blocks a directory takes - it actually 1138 * difficult to tell how many blocks a directory takes - it actually
1139 * takes less than 300 bytes, but we have to round it up to the block size, 1139 * takes less than 300 bytes, but we have to round it up to the block size,
1140 * which introduces a large error. This makes utilities like 'du' 1140 * which introduces a large error. This makes utilities like 'du'
1141 * report completely senseless numbers. This is the reason why UBIFS 1141 * report completely senseless numbers. This is the reason why UBIFS
1142 * goes the same way as JFFS2 - it reports zero blocks for everything 1142 * goes the same way as JFFS2 - it reports zero blocks for everything
1143 * but regular files, which makes more sense than reporting completely 1143 * but regular files, which makes more sense than reporting completely
1144 * wrong sizes. 1144 * wrong sizes.
1145 */ 1145 */
1146 if (S_ISREG(inode->i_mode)) { 1146 if (S_ISREG(inode->i_mode)) {
1147 size = ui->xattr_size; 1147 size = ui->xattr_size;
1148 size += stat->size; 1148 size += stat->size;
1149 size = ALIGN(size, UBIFS_BLOCK_SIZE); 1149 size = ALIGN(size, UBIFS_BLOCK_SIZE);
1150 /* 1150 /*
1151 * Note, user-space expects the blocks count in 512-byte units, irrespective 1151 * Note, user-space expects the blocks count in 512-byte units, irrespective
1152 * of what was reported in @stat->size. 1152 * of what was reported in @stat->size.
1153 */ 1153 */
1154 stat->blocks = size >> 9; 1154 stat->blocks = size >> 9;
1155 } else 1155 } else
1156 stat->blocks = 0; 1156 stat->blocks = 0;
1157 mutex_unlock(&ui->ui_mutex); 1157 mutex_unlock(&ui->ui_mutex);
1158 return 0; 1158 return 0;
1159 } 1159 }
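
The block computation above reports st_blocks in 512-byte units regardless of the filesystem block size: round the byte count up to a whole filesystem block, then shift right by 9. A standalone sketch — UBIFS_BLOCK_SIZE is 4096 in UBIFS, but the constant here is an assumption of this example:

	#include <stdio.h>

	#define TOY_BLOCK_SIZE 4096ULL

	static unsigned long long blocks_512(unsigned long long bytes)
	{
		unsigned long long rounded =
			(bytes + TOY_BLOCK_SIZE - 1) & ~(TOY_BLOCK_SIZE - 1);
		return rounded >> 9;     /* 512-byte units */
	}

	int main(void)
	{
		/* A 1-byte file still occupies one 4 KiB block = eight 512 B units. */
		printf("%llu %llu\n", blocks_512(1), blocks_512(8192)); /* 8 16 */
		return 0;
	}
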
1160 1160
1161 const struct inode_operations ubifs_dir_inode_operations = { 1161 const struct inode_operations ubifs_dir_inode_operations = {
1162 .lookup = ubifs_lookup, 1162 .lookup = ubifs_lookup,
1163 .create = ubifs_create, 1163 .create = ubifs_create,
1164 .link = ubifs_link, 1164 .link = ubifs_link,
1165 .symlink = ubifs_symlink, 1165 .symlink = ubifs_symlink,
1166 .unlink = ubifs_unlink, 1166 .unlink = ubifs_unlink,
1167 .mkdir = ubifs_mkdir, 1167 .mkdir = ubifs_mkdir,
1168 .rmdir = ubifs_rmdir, 1168 .rmdir = ubifs_rmdir,
1169 .mknod = ubifs_mknod, 1169 .mknod = ubifs_mknod,
1170 .rename = ubifs_rename, 1170 .rename = ubifs_rename,
1171 .setattr = ubifs_setattr, 1171 .setattr = ubifs_setattr,
1172 .getattr = ubifs_getattr, 1172 .getattr = ubifs_getattr,
1173 .setxattr = ubifs_setxattr, 1173 .setxattr = ubifs_setxattr,
1174 .getxattr = ubifs_getxattr, 1174 .getxattr = ubifs_getxattr,
1175 .listxattr = ubifs_listxattr, 1175 .listxattr = ubifs_listxattr,
1176 .removexattr = ubifs_removexattr, 1176 .removexattr = ubifs_removexattr,
1177 }; 1177 };
1178 1178
1179 const struct file_operations ubifs_dir_operations = { 1179 const struct file_operations ubifs_dir_operations = {
1180 .llseek = ubifs_dir_llseek, 1180 .llseek = ubifs_dir_llseek,
1181 .release = ubifs_dir_release, 1181 .release = ubifs_dir_release,
1182 .read = generic_read_dir, 1182 .read = generic_read_dir,
1183 .readdir = ubifs_readdir, 1183 .readdir = ubifs_readdir,
1184 .fsync = ubifs_fsync, 1184 .fsync = ubifs_fsync,
1185 .unlocked_ioctl = ubifs_ioctl, 1185 .unlocked_ioctl = ubifs_ioctl,
1186 #ifdef CONFIG_COMPAT 1186 #ifdef CONFIG_COMPAT
1187 .compat_ioctl = ubifs_compat_ioctl, 1187 .compat_ioctl = ubifs_compat_ioctl,
1188 #endif 1188 #endif
1189 }; 1189 };
1190 1190
1 #ifndef _LINUX_FS_H 1 #ifndef _LINUX_FS_H
2 #define _LINUX_FS_H 2 #define _LINUX_FS_H
3 3
4 4
5 #include <linux/linkage.h> 5 #include <linux/linkage.h>
6 #include <linux/wait.h> 6 #include <linux/wait.h>
7 #include <linux/kdev_t.h> 7 #include <linux/kdev_t.h>
8 #include <linux/dcache.h> 8 #include <linux/dcache.h>
9 #include <linux/path.h> 9 #include <linux/path.h>
10 #include <linux/stat.h> 10 #include <linux/stat.h>
11 #include <linux/cache.h> 11 #include <linux/cache.h>
12 #include <linux/list.h> 12 #include <linux/list.h>
13 #include <linux/radix-tree.h> 13 #include <linux/radix-tree.h>
14 #include <linux/rbtree.h> 14 #include <linux/rbtree.h>
15 #include <linux/init.h> 15 #include <linux/init.h>
16 #include <linux/pid.h> 16 #include <linux/pid.h>
17 #include <linux/bug.h> 17 #include <linux/bug.h>
18 #include <linux/mutex.h> 18 #include <linux/mutex.h>
19 #include <linux/capability.h> 19 #include <linux/capability.h>
20 #include <linux/semaphore.h> 20 #include <linux/semaphore.h>
21 #include <linux/fiemap.h> 21 #include <linux/fiemap.h>
22 #include <linux/rculist_bl.h> 22 #include <linux/rculist_bl.h>
23 #include <linux/atomic.h> 23 #include <linux/atomic.h>
24 #include <linux/shrinker.h> 24 #include <linux/shrinker.h>
25 #include <linux/migrate_mode.h> 25 #include <linux/migrate_mode.h>
26 #include <linux/uidgid.h> 26 #include <linux/uidgid.h>
27 #include <linux/lockdep.h> 27 #include <linux/lockdep.h>
28 #include <linux/percpu-rwsem.h> 28 #include <linux/percpu-rwsem.h>
29 #include <linux/blk_types.h> 29 #include <linux/blk_types.h>
30 30
31 #include <asm/byteorder.h> 31 #include <asm/byteorder.h>
32 #include <uapi/linux/fs.h> 32 #include <uapi/linux/fs.h>
33 33
34 struct export_operations; 34 struct export_operations;
35 struct hd_geometry; 35 struct hd_geometry;
36 struct iovec; 36 struct iovec;
37 struct nameidata; 37 struct nameidata;
38 struct kiocb; 38 struct kiocb;
39 struct kobject; 39 struct kobject;
40 struct pipe_inode_info; 40 struct pipe_inode_info;
41 struct poll_table_struct; 41 struct poll_table_struct;
42 struct kstatfs; 42 struct kstatfs;
43 struct vm_area_struct; 43 struct vm_area_struct;
44 struct vfsmount; 44 struct vfsmount;
45 struct cred; 45 struct cred;
46 struct swap_info_struct; 46 struct swap_info_struct;
47 47
48 extern void __init inode_init(void); 48 extern void __init inode_init(void);
49 extern void __init inode_init_early(void); 49 extern void __init inode_init_early(void);
50 extern void __init files_init(unsigned long); 50 extern void __init files_init(unsigned long);
51 51
52 extern struct files_stat_struct files_stat; 52 extern struct files_stat_struct files_stat;
53 extern unsigned long get_max_files(void); 53 extern unsigned long get_max_files(void);
54 extern int sysctl_nr_open; 54 extern int sysctl_nr_open;
55 extern struct inodes_stat_t inodes_stat; 55 extern struct inodes_stat_t inodes_stat;
56 extern int leases_enable, lease_break_time; 56 extern int leases_enable, lease_break_time;
57 extern int sysctl_protected_symlinks; 57 extern int sysctl_protected_symlinks;
58 extern int sysctl_protected_hardlinks; 58 extern int sysctl_protected_hardlinks;
59 59
60 struct buffer_head; 60 struct buffer_head;
61 typedef int (get_block_t)(struct inode *inode, sector_t iblock, 61 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
62 struct buffer_head *bh_result, int create); 62 struct buffer_head *bh_result, int create);
63 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, 63 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
64 ssize_t bytes, void *private, int ret, 64 ssize_t bytes, void *private, int ret,
65 bool is_async); 65 bool is_async);
66 66
67 #define MAY_EXEC 0x00000001 67 #define MAY_EXEC 0x00000001
68 #define MAY_WRITE 0x00000002 68 #define MAY_WRITE 0x00000002
69 #define MAY_READ 0x00000004 69 #define MAY_READ 0x00000004
70 #define MAY_APPEND 0x00000008 70 #define MAY_APPEND 0x00000008
71 #define MAY_ACCESS 0x00000010 71 #define MAY_ACCESS 0x00000010
72 #define MAY_OPEN 0x00000020 72 #define MAY_OPEN 0x00000020
73 #define MAY_CHDIR 0x00000040 73 #define MAY_CHDIR 0x00000040
74 /* called from RCU mode, don't block */ 74 /* called from RCU mode, don't block */
75 #define MAY_NOT_BLOCK 0x00000080 75 #define MAY_NOT_BLOCK 0x00000080
76 76
77 /* 77 /*
78 * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond 78 * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond
79 * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() 79 * to O_WRONLY and O_RDWR via the strange trick in __dentry_open()
80 */ 80 */
81 81
82 /* file is open for reading */ 82 /* file is open for reading */
83 #define FMODE_READ ((__force fmode_t)0x1) 83 #define FMODE_READ ((__force fmode_t)0x1)
84 /* file is open for writing */ 84 /* file is open for writing */
85 #define FMODE_WRITE ((__force fmode_t)0x2) 85 #define FMODE_WRITE ((__force fmode_t)0x2)
86 /* file is seekable */ 86 /* file is seekable */
87 #define FMODE_LSEEK ((__force fmode_t)0x4) 87 #define FMODE_LSEEK ((__force fmode_t)0x4)
88 /* file can be accessed using pread */ 88 /* file can be accessed using pread */
89 #define FMODE_PREAD ((__force fmode_t)0x8) 89 #define FMODE_PREAD ((__force fmode_t)0x8)
90 /* file can be accessed using pwrite */ 90 /* file can be accessed using pwrite */
91 #define FMODE_PWRITE ((__force fmode_t)0x10) 91 #define FMODE_PWRITE ((__force fmode_t)0x10)
92 /* File is opened for execution with sys_execve / sys_uselib */ 92 /* File is opened for execution with sys_execve / sys_uselib */
93 #define FMODE_EXEC ((__force fmode_t)0x20) 93 #define FMODE_EXEC ((__force fmode_t)0x20)
94 /* File is opened with O_NDELAY (only set for block devices) */ 94 /* File is opened with O_NDELAY (only set for block devices) */
95 #define FMODE_NDELAY ((__force fmode_t)0x40) 95 #define FMODE_NDELAY ((__force fmode_t)0x40)
96 /* File is opened with O_EXCL (only set for block devices) */ 96 /* File is opened with O_EXCL (only set for block devices) */
97 #define FMODE_EXCL ((__force fmode_t)0x80) 97 #define FMODE_EXCL ((__force fmode_t)0x80)
98 /* File is opened using open(.., 3, ..) and is writable only for ioctls 98 /* File is opened using open(.., 3, ..) and is writable only for ioctls
99 (special hack for floppy.c) */ 99 (special hack for floppy.c) */
100 #define FMODE_WRITE_IOCTL ((__force fmode_t)0x100) 100 #define FMODE_WRITE_IOCTL ((__force fmode_t)0x100)
101 /* 32bit hashes as llseek() offset (for directories) */ 101 /* 32bit hashes as llseek() offset (for directories) */
102 #define FMODE_32BITHASH ((__force fmode_t)0x200) 102 #define FMODE_32BITHASH ((__force fmode_t)0x200)
103 /* 64bit hashes as llseek() offset (for directories) */ 103 /* 64bit hashes as llseek() offset (for directories) */
104 #define FMODE_64BITHASH ((__force fmode_t)0x400) 104 #define FMODE_64BITHASH ((__force fmode_t)0x400)
105 105
106 /* 106 /*
107 * Don't update ctime and mtime. 107 * Don't update ctime and mtime.
108 * 108 *
109 * Currently a special hack for the XFS open_by_handle ioctl, but we'll 109 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
110 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. 110 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
111 */ 111 */
112 #define FMODE_NOCMTIME ((__force fmode_t)0x800) 112 #define FMODE_NOCMTIME ((__force fmode_t)0x800)
113 113
114 /* Expect random access pattern */ 114 /* Expect random access pattern */
115 #define FMODE_RANDOM ((__force fmode_t)0x1000) 115 #define FMODE_RANDOM ((__force fmode_t)0x1000)
116 116
117 /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ 117 /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
118 #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) 118 #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
119 119
120 /* File is opened with O_PATH; almost nothing can be done with it */ 120 /* File is opened with O_PATH; almost nothing can be done with it */
121 #define FMODE_PATH ((__force fmode_t)0x4000) 121 #define FMODE_PATH ((__force fmode_t)0x4000)
122 122
123 /* File was opened by fanotify and shouldn't generate fanotify events */ 123 /* File was opened by fanotify and shouldn't generate fanotify events */
124 #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) 124 #define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
125 125
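A hedged sketch of how these fmode_t bits are consumed (illustration only, not part of this commit; example_write is a made-up method): the generic VFS entry points gate operations on them before calling into the filesystem.

	#include <linux/fs.h>

	/* Sketch only: vfs_write() applies this same FMODE_WRITE gate
	 * before it ever calls a filesystem's ->write() method. */
	static ssize_t example_write(struct file *filp, const char __user *buf,
				     size_t len, loff_t *ppos)
	{
		if (!(filp->f_mode & FMODE_WRITE))
			return -EBADF;	/* not opened for writing */
		/* (sys_pwrite64() similarly refuses files lacking
		 * FMODE_PWRITE before reaching this point) */
		return len;		/* a real method would copy the data */
	}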
126 /* 126 /*
127 * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector 127 * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
128 * that indicates that they should check the contents of the iovec are 128 * that indicates that they should check the contents of the iovec are
129 * valid, but not check the memory that the iovec elements 129 * valid, but not check the memory that the iovec elements
130 * point to. 130 * point to.
131 */ 131 */
132 #define CHECK_IOVEC_ONLY -1 132 #define CHECK_IOVEC_ONLY -1
133 133
134 /* 134 /*
135 * The below are the various read and write types that we support. Some of 135 * The below are the various read and write types that we support. Some of
136 * them include behavioral modifiers that send information down to the 136 * them include behavioral modifiers that send information down to the
137 * block layer and IO scheduler. Terminology: 137 * block layer and IO scheduler. Terminology:
138 * 138 *
139 * The block layer uses device plugging to defer IO a little bit, in 139 * The block layer uses device plugging to defer IO a little bit, in
140 * the hope that we will see more IO very shortly. This increases 140 * the hope that we will see more IO very shortly. This increases
141 * coalescing of adjacent IO and thus reduces the number of IOs we 141 * coalescing of adjacent IO and thus reduces the number of IOs we
142 * have to send to the device. It also allows for better queuing, 142 * have to send to the device. It also allows for better queuing,
143 * if the IO isn't mergeable. If the caller is going to be waiting 143 * if the IO isn't mergeable. If the caller is going to be waiting
144 * for the IO, then it must ensure that the device is unplugged so 144 * for the IO, then it must ensure that the device is unplugged so
145 * that the IO is dispatched to the driver. 145 * that the IO is dispatched to the driver.
146 * 146 *
147 * All IO is handled async in Linux. This is fine for background 147 * All IO is handled async in Linux. This is fine for background
148 * writes, but for reads or writes that someone waits for completion 148 * writes, but for reads or writes that someone waits for completion
149 * on, we want to notify the block layer and IO scheduler so that they 149 * on, we want to notify the block layer and IO scheduler so that they
150 * know about it. That allows them to make better scheduling 150 * know about it. That allows them to make better scheduling
151 * decisions. So when the below references 'sync' and 'async', it 151 * decisions. So when the below references 'sync' and 'async', it
152 * is referencing this priority hint. 152 * is referencing this priority hint.
153 * 153 *
154 * With that in mind, the available types are: 154 * With that in mind, the available types are:
155 * 155 *
156 * READ A normal read operation. Device will be plugged. 156 * READ A normal read operation. Device will be plugged.
157 * READ_SYNC A synchronous read. Device is not plugged, caller can 157 * READ_SYNC A synchronous read. Device is not plugged, caller can
158 * immediately wait on this read without caring about 158 * immediately wait on this read without caring about
159 * unplugging. 159 * unplugging.
160 * READA Used for read-ahead operations. Lower priority, and the 160 * READA Used for read-ahead operations. Lower priority, and the
161 * block layer could (in theory) choose to ignore this 161 * block layer could (in theory) choose to ignore this
162 * request if it runs into resource problems. 162 * request if it runs into resource problems.
163 * WRITE A normal async write. Device will be plugged. 163 * WRITE A normal async write. Device will be plugged.
164 * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down 164 * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down
165 * the hint that someone will be waiting on this IO 165 * the hint that someone will be waiting on this IO
166 * shortly. The write equivalent of READ_SYNC. 166 * shortly. The write equivalent of READ_SYNC.
167 * WRITE_ODIRECT Special case write for O_DIRECT only. 167 * WRITE_ODIRECT Special case write for O_DIRECT only.
168 * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. 168 * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush.
169 * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on 169 * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on
170 * non-volatile media on completion. 170 * non-volatile media on completion.
171 * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded 171 * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
172 * by a cache flush and data is guaranteed to be on 172 * by a cache flush and data is guaranteed to be on
173 * non-volatile media on completion. 173 * non-volatile media on completion.
174 * 174 *
175 */ 175 */
176 #define RW_MASK REQ_WRITE 176 #define RW_MASK REQ_WRITE
177 #define RWA_MASK REQ_RAHEAD 177 #define RWA_MASK REQ_RAHEAD
178 178
179 #define READ 0 179 #define READ 0
180 #define WRITE RW_MASK 180 #define WRITE RW_MASK
181 #define READA RWA_MASK 181 #define READA RWA_MASK
182 #define KERNEL_READ (READ|REQ_KERNEL) 182 #define KERNEL_READ (READ|REQ_KERNEL)
183 #define KERNEL_WRITE (WRITE|REQ_KERNEL) 183 #define KERNEL_WRITE (WRITE|REQ_KERNEL)
184 184
185 #define READ_SYNC (READ | REQ_SYNC) 185 #define READ_SYNC (READ | REQ_SYNC)
186 #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) 186 #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE)
187 #define WRITE_ODIRECT (WRITE | REQ_SYNC) 187 #define WRITE_ODIRECT (WRITE | REQ_SYNC)
188 #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) 188 #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
189 #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) 189 #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
190 #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) 190 #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
191 191
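As a hedged illustration of the sync/async hint described above (not part of this diff; example_submit is hypothetical), a caller that intends to wait for a buffer write passes WRITE_SYNC rather than plain WRITE, in the style of __sync_dirty_buffer():

	#include <linux/buffer_head.h>

	/* Sketch only: choose the request type based on whether the
	 * caller will wait on this IO, per the comment above. */
	static int example_submit(struct buffer_head *bh, bool will_wait)
	{
		lock_buffer(bh);
		bh->b_end_io = end_buffer_write_sync;
		get_bh(bh);			/* reference for the bio */
		return submit_bh(will_wait ? WRITE_SYNC : WRITE, bh);
	}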
192 /* 192 /*
193 * Attribute flags. These should be or-ed together to figure out what 193 * Attribute flags. These should be or-ed together to figure out what
194 * has been changed! 194 * has been changed!
195 */ 195 */
196 #define ATTR_MODE (1 << 0) 196 #define ATTR_MODE (1 << 0)
197 #define ATTR_UID (1 << 1) 197 #define ATTR_UID (1 << 1)
198 #define ATTR_GID (1 << 2) 198 #define ATTR_GID (1 << 2)
199 #define ATTR_SIZE (1 << 3) 199 #define ATTR_SIZE (1 << 3)
200 #define ATTR_ATIME (1 << 4) 200 #define ATTR_ATIME (1 << 4)
201 #define ATTR_MTIME (1 << 5) 201 #define ATTR_MTIME (1 << 5)
202 #define ATTR_CTIME (1 << 6) 202 #define ATTR_CTIME (1 << 6)
203 #define ATTR_ATIME_SET (1 << 7) 203 #define ATTR_ATIME_SET (1 << 7)
204 #define ATTR_MTIME_SET (1 << 8) 204 #define ATTR_MTIME_SET (1 << 8)
205 #define ATTR_FORCE (1 << 9) /* Not a change, but force the change */ 205 #define ATTR_FORCE (1 << 9) /* Not a change, but force the change */
206 #define ATTR_ATTR_FLAG (1 << 10) 206 #define ATTR_ATTR_FLAG (1 << 10)
207 #define ATTR_KILL_SUID (1 << 11) 207 #define ATTR_KILL_SUID (1 << 11)
208 #define ATTR_KILL_SGID (1 << 12) 208 #define ATTR_KILL_SGID (1 << 12)
209 #define ATTR_FILE (1 << 13) 209 #define ATTR_FILE (1 << 13)
210 #define ATTR_KILL_PRIV (1 << 14) 210 #define ATTR_KILL_PRIV (1 << 14)
211 #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ 211 #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */
212 #define ATTR_TIMES_SET (1 << 16) 212 #define ATTR_TIMES_SET (1 << 16)
213 213
214 /* 214 /*
215 * This is the Inode Attributes structure, used for notify_change(). It 215 * This is the Inode Attributes structure, used for notify_change(). It
216 * uses the above definitions as flags, to know which values have changed. 216 * uses the above definitions as flags, to know which values have changed.
217 * Also, in this manner, a Filesystem can look at only the values it cares 217 * Also, in this manner, a Filesystem can look at only the values it cares
218 * about. Basically, these are the attributes that the VFS layer can 218 * about. Basically, these are the attributes that the VFS layer can
219 * request to change from the FS layer. 219 * request to change from the FS layer.
220 * 220 *
221 * Derek Atkins <warlord@MIT.EDU> 94-10-20 221 * Derek Atkins <warlord@MIT.EDU> 94-10-20
222 */ 222 */
223 struct iattr { 223 struct iattr {
224 unsigned int ia_valid; 224 unsigned int ia_valid;
225 umode_t ia_mode; 225 umode_t ia_mode;
226 kuid_t ia_uid; 226 kuid_t ia_uid;
227 kgid_t ia_gid; 227 kgid_t ia_gid;
228 loff_t ia_size; 228 loff_t ia_size;
229 struct timespec ia_atime; 229 struct timespec ia_atime;
230 struct timespec ia_mtime; 230 struct timespec ia_mtime;
231 struct timespec ia_ctime; 231 struct timespec ia_ctime;
232 232
233 /* 233 /*
234 * Not an attribute, but an auxiliary info for filesystems wanting to 234 * Not an attribute, but an auxiliary info for filesystems wanting to
235 * implement an ftruncate() like method. NOTE: filesystem should 235 * implement an ftruncate() like method. NOTE: filesystem should
236 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). 236 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
237 */ 237 */
238 struct file *ia_file; 238 struct file *ia_file;
239 }; 239 };
240 240
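A hedged sketch of a typical iattr use (not part of this commit; example_truncate is hypothetical, modeled loosely on fs/open.c:do_truncate()): set the attributes you changed, mark them in ia_valid, and hand the whole thing to notify_change().

	#include <linux/fs.h>

	/* Sketch only: truncate a file by sending ATTR_SIZE through
	 * notify_change().  The caller must hold the inode's i_mutex;
	 * the unset time fields are filled in from the current time. */
	static int example_truncate(struct dentry *dentry, loff_t length)
	{
		struct iattr newattrs;

		newattrs.ia_size = length;
		newattrs.ia_valid = ATTR_SIZE | ATTR_MTIME | ATTR_CTIME;
		return notify_change(dentry, &newattrs);
	}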
241 /* 241 /*
242 * Includes for diskquotas. 242 * Includes for diskquotas.
243 */ 243 */
244 #include <linux/quota.h> 244 #include <linux/quota.h>
245 245
246 /** 246 /**
247 * enum positive_aop_returns - aop return codes with specific semantics 247 * enum positive_aop_returns - aop return codes with specific semantics
248 * 248 *
249 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has 249 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
250 * completed, that the page is still locked, and 250 * completed, that the page is still locked, and
251 * should be considered active. The VM uses this hint 251 * should be considered active. The VM uses this hint
252 * to return the page to the active list -- it won't 252 * to return the page to the active list -- it won't
253 * be a candidate for writeback again in the near 253 * be a candidate for writeback again in the near
254 * future. Other callers must be careful to unlock 254 * future. Other callers must be careful to unlock
255 * the page if they get this return. Returned by 255 * the page if they get this return. Returned by
256 * writepage(). 256 * writepage().
257 * 257 *
258 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has 258 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
259 * unlocked it and the page might have been truncated. 259 * unlocked it and the page might have been truncated.
260 * The caller should back up to acquiring a new page and 260 * The caller should back up to acquiring a new page and
261 * trying again. The aop will be taking reasonable 261 * trying again. The aop will be taking reasonable
262 * precautions not to livelock. If the caller held a page 262 * precautions not to livelock. If the caller held a page
263 * reference, it should drop it before retrying. Returned 263 * reference, it should drop it before retrying. Returned
264 * by readpage(). 264 * by readpage().
265 * 265 *
266 * address_space_operation functions return these large constants to indicate 266 * address_space_operation functions return these large constants to indicate
267 * special semantics to the caller. These are much larger than the bytes in a 267 * special semantics to the caller. These are much larger than the bytes in a
268 * page to allow for functions that return the number of bytes operated on in a 268 * page to allow for functions that return the number of bytes operated on in a
269 * given page. 269 * given page.
270 */ 270 */
271 271
272 enum positive_aop_returns { 272 enum positive_aop_returns {
273 AOP_WRITEPAGE_ACTIVATE = 0x80000, 273 AOP_WRITEPAGE_ACTIVATE = 0x80000,
274 AOP_TRUNCATED_PAGE = 0x80001, 274 AOP_TRUNCATED_PAGE = 0x80001,
275 }; 275 };
276 276
277 #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ 277 #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */
278 #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ 278 #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */
279 #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct 279 #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct
280 * helper code (eg buffer layer) 280 * helper code (eg buffer layer)
281 * to clear GFP_FS from alloc */ 281 * to clear GFP_FS from alloc */
282 282
283 /* 283 /*
284 * oh the beauties of C type declarations. 284 * oh the beauties of C type declarations.
285 */ 285 */
286 struct page; 286 struct page;
287 struct address_space; 287 struct address_space;
288 struct writeback_control; 288 struct writeback_control;
289 289
290 struct iov_iter { 290 struct iov_iter {
291 const struct iovec *iov; 291 const struct iovec *iov;
292 unsigned long nr_segs; 292 unsigned long nr_segs;
293 size_t iov_offset; 293 size_t iov_offset;
294 size_t count; 294 size_t count;
295 }; 295 };
296 296
297 size_t iov_iter_copy_from_user_atomic(struct page *page, 297 size_t iov_iter_copy_from_user_atomic(struct page *page,
298 struct iov_iter *i, unsigned long offset, size_t bytes); 298 struct iov_iter *i, unsigned long offset, size_t bytes);
299 size_t iov_iter_copy_from_user(struct page *page, 299 size_t iov_iter_copy_from_user(struct page *page,
300 struct iov_iter *i, unsigned long offset, size_t bytes); 300 struct iov_iter *i, unsigned long offset, size_t bytes);
301 void iov_iter_advance(struct iov_iter *i, size_t bytes); 301 void iov_iter_advance(struct iov_iter *i, size_t bytes);
302 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); 302 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
303 size_t iov_iter_single_seg_count(struct iov_iter *i); 303 size_t iov_iter_single_seg_count(struct iov_iter *i);
304 304
305 static inline void iov_iter_init(struct iov_iter *i, 305 static inline void iov_iter_init(struct iov_iter *i,
306 const struct iovec *iov, unsigned long nr_segs, 306 const struct iovec *iov, unsigned long nr_segs,
307 size_t count, size_t written) 307 size_t count, size_t written)
308 { 308 {
309 i->iov = iov; 309 i->iov = iov;
310 i->nr_segs = nr_segs; 310 i->nr_segs = nr_segs;
311 i->iov_offset = 0; 311 i->iov_offset = 0;
312 i->count = count + written; 312 i->count = count + written;
313 313
314 iov_iter_advance(i, written); 314 iov_iter_advance(i, written);
315 } 315 }
316 316
317 static inline size_t iov_iter_count(struct iov_iter *i) 317 static inline size_t iov_iter_count(struct iov_iter *i)
318 { 318 {
319 return i->count; 319 return i->count;
320 } 320 }
321 321
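A hedged sketch of the iov_iter helpers declared above (illustration only; example_fill_page is hypothetical): wrap a single user buffer in an iovec, then copy and advance.

	/* Sketch only: copy one user buffer into a page via an iov_iter. */
	static size_t example_fill_page(struct page *page,
					char __user *buf, size_t len)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct iov_iter i;
		size_t copied;

		iov_iter_init(&i, &iov, 1, len, 0);	/* nothing written yet */
		copied = iov_iter_copy_from_user(page, &i, 0, len);
		iov_iter_advance(&i, copied);
		return iov_iter_count(&i);		/* bytes left uncopied */
	}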
322 /* 322 /*
323 * "descriptor" for what we're up to with a read. 323 * "descriptor" for what we're up to with a read.
324 * This allows us to use the same read code yet 324 * This allows us to use the same read code yet
325 * have multiple different users of the data that 325 * have multiple different users of the data that
326 * we read from a file. 326 * we read from a file.
327 * 327 *
328 * The simplest case just copies the data to user 328 * The simplest case just copies the data to user
329 * mode. 329 * mode.
330 */ 330 */
331 typedef struct { 331 typedef struct {
332 size_t written; 332 size_t written;
333 size_t count; 333 size_t count;
334 union { 334 union {
335 char __user *buf; 335 char __user *buf;
336 void *data; 336 void *data;
337 } arg; 337 } arg;
338 int error; 338 int error;
339 } read_descriptor_t; 339 } read_descriptor_t;
340 340
341 typedef int (*read_actor_t)(read_descriptor_t *, struct page *, 341 typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
342 unsigned long, unsigned long); 342 unsigned long, unsigned long);
343 343
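A hedged sketch of a read_actor_t (not part of this diff; example_read_actor is hypothetical, loosely after mm/filemap.c:file_read_actor()): copy page contents to the user buffer in desc->arg.buf and account for what was transferred.

	#include <linux/highmem.h>
	#include <asm/uaccess.h>

	/* Sketch only: the "simplest case" the comment above mentions,
	 * copying the data to user mode. */
	static int example_read_actor(read_descriptor_t *desc, struct page *page,
				      unsigned long offset, unsigned long size)
	{
		unsigned long left, count = desc->count;
		char *kaddr;

		if (size > count)
			size = count;
		kaddr = kmap(page);
		left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
		kunmap(page);
		if (left) {
			size -= left;
			desc->error = -EFAULT;
		}
		desc->count = count - size;
		desc->written += size;
		desc->arg.buf += size;
		return size;
	}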
344 struct address_space_operations { 344 struct address_space_operations {
345 int (*writepage)(struct page *page, struct writeback_control *wbc); 345 int (*writepage)(struct page *page, struct writeback_control *wbc);
346 int (*readpage)(struct file *, struct page *); 346 int (*readpage)(struct file *, struct page *);
347 347
348 /* Write back some dirty pages from this mapping. */ 348 /* Write back some dirty pages from this mapping. */
349 int (*writepages)(struct address_space *, struct writeback_control *); 349 int (*writepages)(struct address_space *, struct writeback_control *);
350 350
351 /* Set a page dirty. Return true if this dirtied it */ 351 /* Set a page dirty. Return true if this dirtied it */
352 int (*set_page_dirty)(struct page *page); 352 int (*set_page_dirty)(struct page *page);
353 353
354 int (*readpages)(struct file *filp, struct address_space *mapping, 354 int (*readpages)(struct file *filp, struct address_space *mapping,
355 struct list_head *pages, unsigned nr_pages); 355 struct list_head *pages, unsigned nr_pages);
356 356
357 int (*write_begin)(struct file *, struct address_space *mapping, 357 int (*write_begin)(struct file *, struct address_space *mapping,
358 loff_t pos, unsigned len, unsigned flags, 358 loff_t pos, unsigned len, unsigned flags,
359 struct page **pagep, void **fsdata); 359 struct page **pagep, void **fsdata);
360 int (*write_end)(struct file *, struct address_space *mapping, 360 int (*write_end)(struct file *, struct address_space *mapping,
361 loff_t pos, unsigned len, unsigned copied, 361 loff_t pos, unsigned len, unsigned copied,
362 struct page *page, void *fsdata); 362 struct page *page, void *fsdata);
363 363
364 /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ 364 /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
365 sector_t (*bmap)(struct address_space *, sector_t); 365 sector_t (*bmap)(struct address_space *, sector_t);
366 void (*invalidatepage) (struct page *, unsigned long); 366 void (*invalidatepage) (struct page *, unsigned long);
367 int (*releasepage) (struct page *, gfp_t); 367 int (*releasepage) (struct page *, gfp_t);
368 void (*freepage)(struct page *); 368 void (*freepage)(struct page *);
369 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 369 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
370 loff_t offset, unsigned long nr_segs); 370 loff_t offset, unsigned long nr_segs);
371 int (*get_xip_mem)(struct address_space *, pgoff_t, int, 371 int (*get_xip_mem)(struct address_space *, pgoff_t, int,
372 void **, unsigned long *); 372 void **, unsigned long *);
373 /* 373 /*
374 * migrate the contents of a page to the specified target. If sync 374 * migrate the contents of a page to the specified target. If sync
375 * is false, it must not block. 375 * is false, it must not block.
376 */ 376 */
377 int (*migratepage) (struct address_space *, 377 int (*migratepage) (struct address_space *,
378 struct page *, struct page *, enum migrate_mode); 378 struct page *, struct page *, enum migrate_mode);
379 int (*launder_page) (struct page *); 379 int (*launder_page) (struct page *);
380 int (*is_partially_uptodate) (struct page *, read_descriptor_t *, 380 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
381 unsigned long); 381 unsigned long);
382 int (*error_remove_page)(struct address_space *, struct page *); 382 int (*error_remove_page)(struct address_space *, struct page *);
383 383
384 /* swapfile support */ 384 /* swapfile support */
385 int (*swap_activate)(struct swap_info_struct *sis, struct file *file, 385 int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
386 sector_t *span); 386 sector_t *span);
387 void (*swap_deactivate)(struct file *file); 387 void (*swap_deactivate)(struct file *file);
388 }; 388 };
389 389
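A hedged sketch of a minimal address_space_operations table (illustration only; all example_* names are hypothetical): a simple block-backed filesystem can wire the methods to the generic buffer-layer helpers, supplying only its own block mapper.

	#include <linux/buffer_head.h>

	/* example_get_block stands in for the filesystem's real block
	 * mapper; a real one would fill bh_result from on-disk metadata. */
	static int example_get_block(struct inode *inode, sector_t iblock,
				     struct buffer_head *bh_result, int create)
	{
		return -EIO;
	}

	static int example_readpage(struct file *file, struct page *page)
	{
		return block_read_full_page(page, example_get_block);
	}

	static int example_writepage(struct page *page,
				     struct writeback_control *wbc)
	{
		return block_write_full_page(page, example_get_block, wbc);
	}

	static const struct address_space_operations example_aops = {
		.readpage	= example_readpage,
		.writepage	= example_writepage,
	};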
390 extern const struct address_space_operations empty_aops; 390 extern const struct address_space_operations empty_aops;
391 391
392 /* 392 /*
393 * pagecache_write_begin/pagecache_write_end must be used by general code 393 * pagecache_write_begin/pagecache_write_end must be used by general code
394 * to write into the pagecache. 394 * to write into the pagecache.
395 */ 395 */
396 int pagecache_write_begin(struct file *, struct address_space *mapping, 396 int pagecache_write_begin(struct file *, struct address_space *mapping,
397 loff_t pos, unsigned len, unsigned flags, 397 loff_t pos, unsigned len, unsigned flags,
398 struct page **pagep, void **fsdata); 398 struct page **pagep, void **fsdata);
399 399
400 int pagecache_write_end(struct file *, struct address_space *mapping, 400 int pagecache_write_end(struct file *, struct address_space *mapping,
401 loff_t pos, unsigned len, unsigned copied, 401 loff_t pos, unsigned len, unsigned copied,
402 struct page *page, void *fsdata); 402 struct page *page, void *fsdata);
403 403
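A hedged sketch of the begin/copy/end protocol required above (not part of this commit; example_pagecache_write is hypothetical and assumes the run of bytes fits within one page):

	#include <linux/pagemap.h>
	#include <linux/highmem.h>

	/* Sketch only: write one short, page-contained run of bytes
	 * into the pagecache using the mandated helpers. */
	static int example_pagecache_write(struct file *file, loff_t pos,
					   const char *src, unsigned len)
	{
		struct address_space *mapping = file->f_mapping;
		struct page *page;
		void *fsdata;
		char *kaddr;
		int status;

		status = pagecache_write_begin(file, mapping, pos, len,
					       0, &page, &fsdata);
		if (status)
			return status;
		kaddr = kmap(page);
		memcpy(kaddr + (pos & (PAGE_CACHE_SIZE - 1)), src, len);
		kunmap(page);
		status = pagecache_write_end(file, mapping, pos, len, len,
					     page, fsdata);
		return status < 0 ? status : 0;
	}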
404 struct backing_dev_info; 404 struct backing_dev_info;
405 struct address_space { 405 struct address_space {
406 struct inode *host; /* owner: inode, block_device */ 406 struct inode *host; /* owner: inode, block_device */
407 struct radix_tree_root page_tree; /* radix tree of all pages */ 407 struct radix_tree_root page_tree; /* radix tree of all pages */
408 spinlock_t tree_lock; /* and lock protecting it */ 408 spinlock_t tree_lock; /* and lock protecting it */
409 unsigned int i_mmap_writable;/* count VM_SHARED mappings */ 409 unsigned int i_mmap_writable;/* count VM_SHARED mappings */
410 struct rb_root i_mmap; /* tree of private and shared mappings */ 410 struct rb_root i_mmap; /* tree of private and shared mappings */
411 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ 411 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
412 struct mutex i_mmap_mutex; /* protect tree, count, list */ 412 struct mutex i_mmap_mutex; /* protect tree, count, list */
413 /* Protected by tree_lock together with the radix tree */ 413 /* Protected by tree_lock together with the radix tree */
414 unsigned long nrpages; /* number of total pages */ 414 unsigned long nrpages; /* number of total pages */
415 pgoff_t writeback_index;/* writeback starts here */ 415 pgoff_t writeback_index;/* writeback starts here */
416 const struct address_space_operations *a_ops; /* methods */ 416 const struct address_space_operations *a_ops; /* methods */
417 unsigned long flags; /* error bits/gfp mask */ 417 unsigned long flags; /* error bits/gfp mask */
418 struct backing_dev_info *backing_dev_info; /* device readahead, etc */ 418 struct backing_dev_info *backing_dev_info; /* device readahead, etc */
419 spinlock_t private_lock; /* for use by the address_space */ 419 spinlock_t private_lock; /* for use by the address_space */
420 struct list_head private_list; /* ditto */ 420 struct list_head private_list; /* ditto */
421 void *private_data; /* ditto */ 421 void *private_data; /* ditto */
422 } __attribute__((aligned(sizeof(long)))); 422 } __attribute__((aligned(sizeof(long))));
423 /* 423 /*
424 * On most architectures that alignment is already the case; but 424 * On most architectures that alignment is already the case; but
425 * must be enforced here for CRIS, to let the least significant bit 425 * must be enforced here for CRIS, to let the least significant bit
426 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. 426 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
427 */ 427 */
428 struct request_queue; 428 struct request_queue;
429 429
430 struct block_device { 430 struct block_device {
431 dev_t bd_dev; /* not a kdev_t - it's a search key */ 431 dev_t bd_dev; /* not a kdev_t - it's a search key */
432 int bd_openers; 432 int bd_openers;
433 struct inode * bd_inode; /* will die */ 433 struct inode * bd_inode; /* will die */
434 struct super_block * bd_super; 434 struct super_block * bd_super;
435 struct mutex bd_mutex; /* open/close mutex */ 435 struct mutex bd_mutex; /* open/close mutex */
436 struct list_head bd_inodes; 436 struct list_head bd_inodes;
437 void * bd_claiming; 437 void * bd_claiming;
438 void * bd_holder; 438 void * bd_holder;
439 int bd_holders; 439 int bd_holders;
440 bool bd_write_holder; 440 bool bd_write_holder;
441 #ifdef CONFIG_SYSFS 441 #ifdef CONFIG_SYSFS
442 struct list_head bd_holder_disks; 442 struct list_head bd_holder_disks;
443 #endif 443 #endif
444 struct block_device * bd_contains; 444 struct block_device * bd_contains;
445 unsigned bd_block_size; 445 unsigned bd_block_size;
446 struct hd_struct * bd_part; 446 struct hd_struct * bd_part;
447 /* number of times partitions within this device have been opened. */ 447 /* number of times partitions within this device have been opened. */
448 unsigned bd_part_count; 448 unsigned bd_part_count;
449 int bd_invalidated; 449 int bd_invalidated;
450 struct gendisk * bd_disk; 450 struct gendisk * bd_disk;
451 struct request_queue * bd_queue; 451 struct request_queue * bd_queue;
452 struct list_head bd_list; 452 struct list_head bd_list;
453 /* 453 /*
454 * Private data. You must have bd_claim'ed the block_device 454 * Private data. You must have bd_claim'ed the block_device
455 * to use this. NOTE: bd_claim allows an owner to claim 455 * to use this. NOTE: bd_claim allows an owner to claim
456 * the same device multiple times, the owner must take special 456 * the same device multiple times, the owner must take special
457 * care to not mess up bd_private for that case. 457 * care to not mess up bd_private for that case.
458 */ 458 */
459 unsigned long bd_private; 459 unsigned long bd_private;
460 460
461 /* The counter of freeze processes */ 461 /* The counter of freeze processes */
462 int bd_fsfreeze_count; 462 int bd_fsfreeze_count;
463 /* Mutex for freeze */ 463 /* Mutex for freeze */
464 struct mutex bd_fsfreeze_mutex; 464 struct mutex bd_fsfreeze_mutex;
465 }; 465 };
466 466
467 /* 467 /*
468 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache 468 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
469 * radix trees 469 * radix trees
470 */ 470 */
471 #define PAGECACHE_TAG_DIRTY 0 471 #define PAGECACHE_TAG_DIRTY 0
472 #define PAGECACHE_TAG_WRITEBACK 1 472 #define PAGECACHE_TAG_WRITEBACK 1
473 #define PAGECACHE_TAG_TOWRITE 2 473 #define PAGECACHE_TAG_TOWRITE 2
474 474
475 int mapping_tagged(struct address_space *mapping, int tag); 475 int mapping_tagged(struct address_space *mapping, int tag);
476 476
477 /* 477 /*
478 * Might pages of this file be mapped into userspace? 478 * Might pages of this file be mapped into userspace?
479 */ 479 */
480 static inline int mapping_mapped(struct address_space *mapping) 480 static inline int mapping_mapped(struct address_space *mapping)
481 { 481 {
482 return !RB_EMPTY_ROOT(&mapping->i_mmap) || 482 return !RB_EMPTY_ROOT(&mapping->i_mmap) ||
483 !list_empty(&mapping->i_mmap_nonlinear); 483 !list_empty(&mapping->i_mmap_nonlinear);
484 } 484 }
485 485
486 /* 486 /*
487 * Might pages of this file have been modified in userspace? 487 * Might pages of this file have been modified in userspace?
488 * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff 488 * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff
489 * marks vma as VM_SHARED if it is shared, and the file was opened for 489 * marks vma as VM_SHARED if it is shared, and the file was opened for
490 * writing, i.e. the vma may be mprotected writable even if now read-only. 490 * writing, i.e. the vma may be mprotected writable even if now read-only.
491 */ 491 */
492 static inline int mapping_writably_mapped(struct address_space *mapping) 492 static inline int mapping_writably_mapped(struct address_space *mapping)
493 { 493 {
494 return mapping->i_mmap_writable != 0; 494 return mapping->i_mmap_writable != 0;
495 } 495 }
496 496
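A hedged sketch of a typical caller (illustration only; example_prepare_copy is hypothetical, after the pattern used in mm/filemap.c's read path):

	#include <linux/highmem.h>	/* flush_dcache_page() */

	/* Sketch only: if userspace may have dirtied the page through a
	 * shared mapping, flush the d-cache before the kernel reads it. */
	static void example_prepare_copy(struct address_space *mapping,
					 struct page *page)
	{
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);
	}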
497 /* 497 /*
498 * Use sequence counter to get consistent i_size on 32-bit processors. 498 * Use sequence counter to get consistent i_size on 32-bit processors.
499 */ 499 */
500 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 500 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
501 #include <linux/seqlock.h> 501 #include <linux/seqlock.h>
502 #define __NEED_I_SIZE_ORDERED 502 #define __NEED_I_SIZE_ORDERED
503 #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) 503 #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
504 #else 504 #else
505 #define i_size_ordered_init(inode) do { } while (0) 505 #define i_size_ordered_init(inode) do { } while (0)
506 #endif 506 #endif
507 507
508 struct posix_acl; 508 struct posix_acl;
509 #define ACL_NOT_CACHED ((void *)(-1)) 509 #define ACL_NOT_CACHED ((void *)(-1))
510 510
511 #define IOP_FASTPERM 0x0001 511 #define IOP_FASTPERM 0x0001
512 #define IOP_LOOKUP 0x0002 512 #define IOP_LOOKUP 0x0002
513 #define IOP_NOFOLLOW 0x0004 513 #define IOP_NOFOLLOW 0x0004
514 514
515 /* 515 /*
516 * Keep mostly read-only and often accessed (especially for 516 * Keep mostly read-only and often accessed (especially for
517 * the RCU path lookup and 'stat' data) fields at the beginning 517 * the RCU path lookup and 'stat' data) fields at the beginning
518 * of the 'struct inode' 518 * of the 'struct inode'
519 */ 519 */
520 struct inode { 520 struct inode {
521 umode_t i_mode; 521 umode_t i_mode;
522 unsigned short i_opflags; 522 unsigned short i_opflags;
523 kuid_t i_uid; 523 kuid_t i_uid;
524 kgid_t i_gid; 524 kgid_t i_gid;
525 unsigned int i_flags; 525 unsigned int i_flags;
526 526
527 #ifdef CONFIG_FS_POSIX_ACL 527 #ifdef CONFIG_FS_POSIX_ACL
528 struct posix_acl *i_acl; 528 struct posix_acl *i_acl;
529 struct posix_acl *i_default_acl; 529 struct posix_acl *i_default_acl;
530 #endif 530 #endif
531 531
532 const struct inode_operations *i_op; 532 const struct inode_operations *i_op;
533 struct super_block *i_sb; 533 struct super_block *i_sb;
534 struct address_space *i_mapping; 534 struct address_space *i_mapping;
535 535
536 #ifdef CONFIG_SECURITY 536 #ifdef CONFIG_SECURITY
537 void *i_security; 537 void *i_security;
538 #endif 538 #endif
539 539
540 /* Stat data, not accessed from path walking */ 540 /* Stat data, not accessed from path walking */
541 unsigned long i_ino; 541 unsigned long i_ino;
542 /* 542 /*
543 * Filesystems may only read i_nlink directly. They shall use the 543 * Filesystems may only read i_nlink directly. They shall use the
544 * following functions for modification: 544 * following functions for modification:
545 * 545 *
546 * (set|clear|inc|drop)_nlink 546 * (set|clear|inc|drop)_nlink
547 * inode_(inc|dec)_link_count 547 * inode_(inc|dec)_link_count
548 */ 548 */
549 union { 549 union {
550 const unsigned int i_nlink; 550 const unsigned int i_nlink;
551 unsigned int __i_nlink; 551 unsigned int __i_nlink;
552 }; 552 };
553 dev_t i_rdev; 553 dev_t i_rdev;
554 loff_t i_size; 554 loff_t i_size;
555 struct timespec i_atime; 555 struct timespec i_atime;
556 struct timespec i_mtime; 556 struct timespec i_mtime;
557 struct timespec i_ctime; 557 struct timespec i_ctime;
558 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 558 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
559 unsigned short i_bytes; 559 unsigned short i_bytes;
560 unsigned int i_blkbits; 560 unsigned int i_blkbits;
561 blkcnt_t i_blocks; 561 blkcnt_t i_blocks;
562 562
563 #ifdef __NEED_I_SIZE_ORDERED 563 #ifdef __NEED_I_SIZE_ORDERED
564 seqcount_t i_size_seqcount; 564 seqcount_t i_size_seqcount;
565 #endif 565 #endif
566 566
567 /* Misc */ 567 /* Misc */
568 unsigned long i_state; 568 unsigned long i_state;
569 struct mutex i_mutex; 569 struct mutex i_mutex;
570 570
571 unsigned long dirtied_when; /* jiffies of first dirtying */ 571 unsigned long dirtied_when; /* jiffies of first dirtying */
572 572
573 struct hlist_node i_hash; 573 struct hlist_node i_hash;
574 struct list_head i_wb_list; /* backing dev IO list */ 574 struct list_head i_wb_list; /* backing dev IO list */
575 struct list_head i_lru; /* inode LRU list */ 575 struct list_head i_lru; /* inode LRU list */
576 struct list_head i_sb_list; 576 struct list_head i_sb_list;
577 union { 577 union {
578 struct hlist_head i_dentry; 578 struct hlist_head i_dentry;
579 struct rcu_head i_rcu; 579 struct rcu_head i_rcu;
580 }; 580 };
581 u64 i_version; 581 u64 i_version;
582 atomic_t i_count; 582 atomic_t i_count;
583 atomic_t i_dio_count; 583 atomic_t i_dio_count;
584 atomic_t i_writecount; 584 atomic_t i_writecount;
585 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 585 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
586 struct file_lock *i_flock; 586 struct file_lock *i_flock;
587 struct address_space i_data; 587 struct address_space i_data;
588 #ifdef CONFIG_QUOTA 588 #ifdef CONFIG_QUOTA
589 struct dquot *i_dquot[MAXQUOTAS]; 589 struct dquot *i_dquot[MAXQUOTAS];
590 #endif 590 #endif
591 struct list_head i_devices; 591 struct list_head i_devices;
592 union { 592 union {
593 struct pipe_inode_info *i_pipe; 593 struct pipe_inode_info *i_pipe;
594 struct block_device *i_bdev; 594 struct block_device *i_bdev;
595 struct cdev *i_cdev; 595 struct cdev *i_cdev;
596 }; 596 };
597 597
598 __u32 i_generation; 598 __u32 i_generation;
599 599
600 #ifdef CONFIG_FSNOTIFY 600 #ifdef CONFIG_FSNOTIFY
601 __u32 i_fsnotify_mask; /* all events this inode cares about */ 601 __u32 i_fsnotify_mask; /* all events this inode cares about */
602 struct hlist_head i_fsnotify_marks; 602 struct hlist_head i_fsnotify_marks;
603 #endif 603 #endif
604 604
605 #ifdef CONFIG_IMA 605 #ifdef CONFIG_IMA
606 atomic_t i_readcount; /* struct files open RO */ 606 atomic_t i_readcount; /* struct files open RO */
607 #endif 607 #endif
608 void *i_private; /* fs or device private pointer */ 608 void *i_private; /* fs or device private pointer */
609 }; 609 };
610 610
611 static inline int inode_unhashed(struct inode *inode) 611 static inline int inode_unhashed(struct inode *inode)
612 { 612 {
613 return hlist_unhashed(&inode->i_hash); 613 return hlist_unhashed(&inode->i_hash);
614 } 614 }
615 615
616 /* 616 /*
617 * inode->i_mutex nesting subclasses for the lock validator: 617 * inode->i_mutex nesting subclasses for the lock validator:
618 * 618 *
619 * 0: the object of the current VFS operation 619 * 0: the object of the current VFS operation
620 * 1: parent 620 * 1: parent
621 * 2: child/target 621 * 2: child/target
622 * 3: quota file 622 * 3: quota file
623 * 623 *
624 * The locking order between these classes is 624 * The locking order between these classes is
625 * parent -> child -> normal -> xattr -> quota 625 * parent -> child -> normal -> xattr -> quota
626 */ 626 */
627 enum inode_i_mutex_lock_class 627 enum inode_i_mutex_lock_class
628 { 628 {
629 I_MUTEX_NORMAL, 629 I_MUTEX_NORMAL,
630 I_MUTEX_PARENT, 630 I_MUTEX_PARENT,
631 I_MUTEX_CHILD, 631 I_MUTEX_CHILD,
632 I_MUTEX_XATTR, 632 I_MUTEX_XATTR,
633 I_MUTEX_QUOTA 633 I_MUTEX_QUOTA
634 }; 634 };
635 635
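A hedged sketch of these subclasses in use (not part of this diff; example_lock_pair is hypothetical, in the style of fs/namei.c:lock_rename()): when two i_mutexes must be held at once, distinct lockdep subclasses keep the validator satisfied.

	#include <linux/mutex.h>

	/* Sketch only: take parent then child, honouring the locking
	 * order documented above. */
	static void example_lock_pair(struct inode *parent, struct inode *child)
	{
		mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&child->i_mutex, I_MUTEX_CHILD);
	}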
636 /* 636 /*
637 * NOTE: on a 32bit arch with a preemptible kernel and 637 * NOTE: on a 32bit arch with a preemptible kernel and
638 * a UP compile, i_size_read/write must be atomic 638 * a UP compile, i_size_read/write must be atomic
639 * with respect to the local cpu (unlike with preempt disabled), 639 * with respect to the local cpu (unlike with preempt disabled),
640 * but they don't need to be atomic with respect to other cpus like in 640 * but they don't need to be atomic with respect to other cpus like in
641 * true SMP (so they either need to locally disable irqs around 641 * true SMP (so they either need to locally disable irqs around
642 * the read, or, for example on x86, they can still be implemented as a 642 * the read, or, for example on x86, they can still be implemented as a
643 * cmpxchg8b without the need of the lock prefix). For SMP compiles 643 * cmpxchg8b without the need of the lock prefix). For SMP compiles
644 * and 64bit archs it makes no difference if preempt is enabled or not. 644 * and 64bit archs it makes no difference if preempt is enabled or not.
645 */ 645 */
646 static inline loff_t i_size_read(const struct inode *inode) 646 static inline loff_t i_size_read(const struct inode *inode)
647 { 647 {
648 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 648 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
649 loff_t i_size; 649 loff_t i_size;
650 unsigned int seq; 650 unsigned int seq;
651 651
652 do { 652 do {
653 seq = read_seqcount_begin(&inode->i_size_seqcount); 653 seq = read_seqcount_begin(&inode->i_size_seqcount);
654 i_size = inode->i_size; 654 i_size = inode->i_size;
655 } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); 655 } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
656 return i_size; 656 return i_size;
657 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) 657 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
658 loff_t i_size; 658 loff_t i_size;
659 659
660 preempt_disable(); 660 preempt_disable();
661 i_size = inode->i_size; 661 i_size = inode->i_size;
662 preempt_enable(); 662 preempt_enable();
663 return i_size; 663 return i_size;
664 #else 664 #else
665 return inode->i_size; 665 return inode->i_size;
666 #endif 666 #endif
667 } 667 }
668 668
669 /* 669 /*
670 * NOTE: unlike i_size_read(), i_size_write() does need locking around it 670 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
671 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount 671 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
672 * can be lost, resulting in subsequent i_size_read() calls spinning forever. 672 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
673 */ 673 */
674 static inline void i_size_write(struct inode *inode, loff_t i_size) 674 static inline void i_size_write(struct inode *inode, loff_t i_size)
675 { 675 {
676 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 676 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
677 write_seqcount_begin(&inode->i_size_seqcount); 677 write_seqcount_begin(&inode->i_size_seqcount);
678 inode->i_size = i_size; 678 inode->i_size = i_size;
679 write_seqcount_end(&inode->i_size_seqcount); 679 write_seqcount_end(&inode->i_size_seqcount);
680 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) 680 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
681 preempt_disable(); 681 preempt_disable();
682 inode->i_size = i_size; 682 inode->i_size = i_size;
683 preempt_enable(); 683 preempt_enable();
684 #else 684 #else
685 inode->i_size = i_size; 685 inode->i_size = i_size;
686 #endif 686 #endif
687 } 687 }
688 688
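A hedged sketch honouring the locking rule just stated (illustration only; example_extend is hypothetical):

	/* Sketch only: update i_size under i_mutex so the seqcount
	 * update on 32bit/SMP cannot be lost. */
	static void example_extend(struct inode *inode, loff_t new_size)
	{
		mutex_lock(&inode->i_mutex);
		if (new_size > i_size_read(inode))
			i_size_write(inode, new_size);
		mutex_unlock(&inode->i_mutex);
	}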
689 /* Helper functions so that in most cases filesystems will 689 /* Helper functions so that in most cases filesystems will
690 * not need to deal directly with kuid_t and kgid_t and can 690 * not need to deal directly with kuid_t and kgid_t and can
691 * instead deal with the raw numeric values that are stored 691 * instead deal with the raw numeric values that are stored
692 * in the filesystem. 692 * in the filesystem.
693 */ 693 */
694 static inline uid_t i_uid_read(const struct inode *inode) 694 static inline uid_t i_uid_read(const struct inode *inode)
695 { 695 {
696 return from_kuid(&init_user_ns, inode->i_uid); 696 return from_kuid(&init_user_ns, inode->i_uid);
697 } 697 }
698 698
699 static inline gid_t i_gid_read(const struct inode *inode) 699 static inline gid_t i_gid_read(const struct inode *inode)
700 { 700 {
701 return from_kgid(&init_user_ns, inode->i_gid); 701 return from_kgid(&init_user_ns, inode->i_gid);
702 } 702 }
703 703
704 static inline void i_uid_write(struct inode *inode, uid_t uid) 704 static inline void i_uid_write(struct inode *inode, uid_t uid)
705 { 705 {
706 inode->i_uid = make_kuid(&init_user_ns, uid); 706 inode->i_uid = make_kuid(&init_user_ns, uid);
707 } 707 }
708 708
709 static inline void i_gid_write(struct inode *inode, gid_t gid) 709 static inline void i_gid_write(struct inode *inode, gid_t gid)
710 { 710 {
711 inode->i_gid = make_kgid(&init_user_ns, gid); 711 inode->i_gid = make_kgid(&init_user_ns, gid);
712 } 712 }
713 713
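A hedged sketch of the helpers above in a filesystem's inode loader (illustration only; struct example_disk_inode and example_load_ids are hypothetical):

	#include <linux/types.h>

	/* A made-up little-endian on-disk inode with raw numeric ids. */
	struct example_disk_inode {
		__le32 di_uid;
		__le32 di_gid;
	};

	/* Sketch only: convert raw ids through the init_user_ns helpers. */
	static void example_load_ids(struct inode *inode,
				     const struct example_disk_inode *raw)
	{
		i_uid_write(inode, le32_to_cpu(raw->di_uid));
		i_gid_write(inode, le32_to_cpu(raw->di_gid));
	}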
714 static inline unsigned iminor(const struct inode *inode) 714 static inline unsigned iminor(const struct inode *inode)
715 { 715 {
716 return MINOR(inode->i_rdev); 716 return MINOR(inode->i_rdev);
717 } 717 }
718 718
719 static inline unsigned imajor(const struct inode *inode) 719 static inline unsigned imajor(const struct inode *inode)
720 { 720 {
721 return MAJOR(inode->i_rdev); 721 return MAJOR(inode->i_rdev);
722 } 722 }
723 723
724 extern struct block_device *I_BDEV(struct inode *inode); 724 extern struct block_device *I_BDEV(struct inode *inode);
725 725
726 struct fown_struct { 726 struct fown_struct {
727 rwlock_t lock; /* protects pid, uid, euid fields */ 727 rwlock_t lock; /* protects pid, uid, euid fields */
728 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ 728 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
729 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ 729 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
730 kuid_t uid, euid; /* uid/euid of process setting the owner */ 730 kuid_t uid, euid; /* uid/euid of process setting the owner */
731 int signum; /* posix.1b rt signal to be delivered on IO */ 731 int signum; /* posix.1b rt signal to be delivered on IO */
732 }; 732 };
733 733
734 /* 734 /*
735 * Track a single file's readahead state 735 * Track a single file's readahead state
736 */ 736 */
737 struct file_ra_state { 737 struct file_ra_state {
738 pgoff_t start; /* where readahead started */ 738 pgoff_t start; /* where readahead started */
739 unsigned int size; /* # of readahead pages */ 739 unsigned int size; /* # of readahead pages */
740 unsigned int async_size; /* do asynchronous readahead when 740 unsigned int async_size; /* do asynchronous readahead when
741 there are only # of pages ahead */ 741 there are only # of pages ahead */
742 742
743 unsigned int ra_pages; /* Maximum readahead window */ 743 unsigned int ra_pages; /* Maximum readahead window */
744 unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ 744 unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
745 loff_t prev_pos; /* Cache last read() position */ 745 loff_t prev_pos; /* Cache last read() position */
746 }; 746 };
747 747
748 /* 748 /*
749 * Check if @index falls in the readahead windows. 749 * Check if @index falls in the readahead windows.
750 */ 750 */
751 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) 751 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
752 { 752 {
753 return (index >= ra->start && 753 return (index >= ra->start &&
754 index < ra->start + ra->size); 754 index < ra->start + ra->size);
755 } 755 }
756 756
757 #define FILE_MNT_WRITE_TAKEN 1 757 #define FILE_MNT_WRITE_TAKEN 1
758 #define FILE_MNT_WRITE_RELEASED 2 758 #define FILE_MNT_WRITE_RELEASED 2
759 759
760 struct file { 760 struct file {
761 /* 761 /*
762 * fu_list becomes invalid after file_free is called and queued via 762 * fu_list becomes invalid after file_free is called and queued via
763 * fu_rcuhead for RCU freeing 763 * fu_rcuhead for RCU freeing
764 */ 764 */
765 union { 765 union {
766 struct list_head fu_list; 766 struct list_head fu_list;
767 struct rcu_head fu_rcuhead; 767 struct rcu_head fu_rcuhead;
768 } f_u; 768 } f_u;
769 struct path f_path; 769 struct path f_path;
770 #define f_dentry f_path.dentry 770 #define f_dentry f_path.dentry
771 #define f_vfsmnt f_path.mnt 771 #define f_vfsmnt f_path.mnt
772 const struct file_operations *f_op; 772 const struct file_operations *f_op;
773 773
774 /* 774 /*
775 * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. 775 * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
776 * Must not be taken from IRQ context. 776 * Must not be taken from IRQ context.
777 */ 777 */
778 spinlock_t f_lock; 778 spinlock_t f_lock;
779 #ifdef CONFIG_SMP 779 #ifdef CONFIG_SMP
780 int f_sb_list_cpu; 780 int f_sb_list_cpu;
781 #endif 781 #endif
782 atomic_long_t f_count; 782 atomic_long_t f_count;
783 unsigned int f_flags; 783 unsigned int f_flags;
784 fmode_t f_mode; 784 fmode_t f_mode;
785 loff_t f_pos; 785 loff_t f_pos;
786 struct fown_struct f_owner; 786 struct fown_struct f_owner;
787 const struct cred *f_cred; 787 const struct cred *f_cred;
788 struct file_ra_state f_ra; 788 struct file_ra_state f_ra;
789 789
790 u64 f_version; 790 u64 f_version;
791 #ifdef CONFIG_SECURITY 791 #ifdef CONFIG_SECURITY
792 void *f_security; 792 void *f_security;
793 #endif 793 #endif
794 /* needed for tty driver, and maybe others */ 794 /* needed for tty driver, and maybe others */
795 void *private_data; 795 void *private_data;
796 796
797 #ifdef CONFIG_EPOLL 797 #ifdef CONFIG_EPOLL
798 /* Used by fs/eventpoll.c to link all the hooks to this file */ 798 /* Used by fs/eventpoll.c to link all the hooks to this file */
799 struct list_head f_ep_links; 799 struct list_head f_ep_links;
800 struct list_head f_tfile_llink; 800 struct list_head f_tfile_llink;
801 #endif /* #ifdef CONFIG_EPOLL */ 801 #endif /* #ifdef CONFIG_EPOLL */
802 struct address_space *f_mapping; 802 struct address_space *f_mapping;
803 #ifdef CONFIG_DEBUG_WRITECOUNT 803 #ifdef CONFIG_DEBUG_WRITECOUNT
804 unsigned long f_mnt_write_state; 804 unsigned long f_mnt_write_state;
805 #endif 805 #endif
806 }; 806 };
807 807
808 struct file_handle { 808 struct file_handle {
809 __u32 handle_bytes; 809 __u32 handle_bytes;
810 int handle_type; 810 int handle_type;
811 /* file identifier */ 811 /* file identifier */
812 unsigned char f_handle[0]; 812 unsigned char f_handle[0];
813 }; 813 };
814 814
815 static inline struct file *get_file(struct file *f) 815 static inline struct file *get_file(struct file *f)
816 { 816 {
817 atomic_long_inc(&f->f_count); 817 atomic_long_inc(&f->f_count);
818 return f; 818 return f;
819 } 819 }
820 #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) 820 #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1)
821 #define file_count(x) atomic_long_read(&(x)->f_count) 821 #define file_count(x) atomic_long_read(&(x)->f_count)
822 822
823 #ifdef CONFIG_DEBUG_WRITECOUNT 823 #ifdef CONFIG_DEBUG_WRITECOUNT
824 static inline void file_take_write(struct file *f) 824 static inline void file_take_write(struct file *f)
825 { 825 {
826 WARN_ON(f->f_mnt_write_state != 0); 826 WARN_ON(f->f_mnt_write_state != 0);
827 f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; 827 f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
828 } 828 }
829 static inline void file_release_write(struct file *f) 829 static inline void file_release_write(struct file *f)
830 { 830 {
831 f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; 831 f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED;
832 } 832 }
833 static inline void file_reset_write(struct file *f) 833 static inline void file_reset_write(struct file *f)
834 { 834 {
835 f->f_mnt_write_state = 0; 835 f->f_mnt_write_state = 0;
836 } 836 }
837 static inline void file_check_state(struct file *f) 837 static inline void file_check_state(struct file *f)
838 { 838 {
839 /* 839 /*
840 * At this point, either both or neither of these bits 840 * At this point, either both or neither of these bits
841 * should be set. 841 * should be set.
842 */ 842 */
843 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); 843 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN);
844 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); 844 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED);
845 } 845 }
846 static inline int file_check_writeable(struct file *f) 846 static inline int file_check_writeable(struct file *f)
847 { 847 {
848 if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) 848 if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN)
849 return 0; 849 return 0;
850 printk(KERN_WARNING "writeable file with no " 850 printk(KERN_WARNING "writeable file with no "
851 "mnt_want_write()\n"); 851 "mnt_want_write()\n");
852 WARN_ON(1); 852 WARN_ON(1);
853 return -EINVAL; 853 return -EINVAL;
854 } 854 }
855 #else /* !CONFIG_DEBUG_WRITECOUNT */ 855 #else /* !CONFIG_DEBUG_WRITECOUNT */
856 static inline void file_take_write(struct file *filp) {} 856 static inline void file_take_write(struct file *filp) {}
857 static inline void file_release_write(struct file *filp) {} 857 static inline void file_release_write(struct file *filp) {}
858 static inline void file_reset_write(struct file *filp) {} 858 static inline void file_reset_write(struct file *filp) {}
859 static inline void file_check_state(struct file *filp) {} 859 static inline void file_check_state(struct file *filp) {}
860 static inline int file_check_writeable(struct file *filp) 860 static inline int file_check_writeable(struct file *filp)
861 { 861 {
862 return 0; 862 return 0;
863 } 863 }
864 #endif /* CONFIG_DEBUG_WRITECOUNT */ 864 #endif /* CONFIG_DEBUG_WRITECOUNT */
865 865
866 #define MAX_NON_LFS ((1UL<<31) - 1) 866 #define MAX_NON_LFS ((1UL<<31) - 1)
867 867
868 /* Page cache limit. The filesystems should put that into their s_maxbytes 868 /* Page cache limit. The filesystems should put that into their s_maxbytes
869 limits, otherwise bad things can happen in VM. */ 869 limits, otherwise bad things can happen in VM. */
870 #if BITS_PER_LONG==32 870 #if BITS_PER_LONG==32
871 #define MAX_LFS_FILESIZE (((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) 871 #define MAX_LFS_FILESIZE (((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
872 #elif BITS_PER_LONG==64 872 #elif BITS_PER_LONG==64
873 #define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL) 873 #define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL)
874 #endif 874 #endif
875 875
876 #define FL_POSIX 1 876 #define FL_POSIX 1
877 #define FL_FLOCK 2 877 #define FL_FLOCK 2
878 #define FL_ACCESS 8 /* not trying to lock, just looking */ 878 #define FL_ACCESS 8 /* not trying to lock, just looking */
879 #define FL_EXISTS 16 /* when unlocking, test for existence */ 879 #define FL_EXISTS 16 /* when unlocking, test for existence */
880 #define FL_LEASE 32 /* lease held on this file */ 880 #define FL_LEASE 32 /* lease held on this file */
881 #define FL_CLOSE 64 /* unlock on close */ 881 #define FL_CLOSE 64 /* unlock on close */
882 #define FL_SLEEP 128 /* A blocking lock */ 882 #define FL_SLEEP 128 /* A blocking lock */
883 #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ 883 #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */
884 #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ 884 #define FL_UNLOCK_PENDING 512 /* Lease is being broken */
885 885
886 /* 886 /*
887 * Special return value from posix_lock_file() and vfs_lock_file() for 887 * Special return value from posix_lock_file() and vfs_lock_file() for
888 * asynchronous locking. 888 * asynchronous locking.
889 */ 889 */
890 #define FILE_LOCK_DEFERRED 1 890 #define FILE_LOCK_DEFERRED 1
891 891
892 /* 892 /*
893 * The POSIX file lock owner is determined by 893 * The POSIX file lock owner is determined by
894 * the "struct files_struct" in the thread group 894 * the "struct files_struct" in the thread group
895 * (or NULL for no owner - BSD locks). 895 * (or NULL for no owner - BSD locks).
896 * 896 *
897 * Lockd stuffs a "host" pointer into this. 897 * Lockd stuffs a "host" pointer into this.
898 */ 898 */
899 typedef struct files_struct *fl_owner_t; 899 typedef struct files_struct *fl_owner_t;
900 900
901 struct file_lock_operations { 901 struct file_lock_operations {
902 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 902 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
903 void (*fl_release_private)(struct file_lock *); 903 void (*fl_release_private)(struct file_lock *);
904 }; 904 };
905 905
906 struct lock_manager_operations { 906 struct lock_manager_operations {
907 int (*lm_compare_owner)(struct file_lock *, struct file_lock *); 907 int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
908 void (*lm_notify)(struct file_lock *); /* unblock callback */ 908 void (*lm_notify)(struct file_lock *); /* unblock callback */
909 int (*lm_grant)(struct file_lock *, struct file_lock *, int); 909 int (*lm_grant)(struct file_lock *, struct file_lock *, int);
910 void (*lm_break)(struct file_lock *); 910 void (*lm_break)(struct file_lock *);
911 int (*lm_change)(struct file_lock **, int); 911 int (*lm_change)(struct file_lock **, int);
912 }; 912 };
913 913
914 struct lock_manager { 914 struct lock_manager {
915 struct list_head list; 915 struct list_head list;
916 }; 916 };
917 917
918 struct net; 918 struct net;
919 void locks_start_grace(struct net *, struct lock_manager *); 919 void locks_start_grace(struct net *, struct lock_manager *);
920 void locks_end_grace(struct lock_manager *); 920 void locks_end_grace(struct lock_manager *);
921 int locks_in_grace(struct net *); 921 int locks_in_grace(struct net *);
922 922
923 /* that will die - we need it for nfs_lock_info */ 923 /* that will die - we need it for nfs_lock_info */
924 #include <linux/nfs_fs_i.h> 924 #include <linux/nfs_fs_i.h>
925 925
926 struct file_lock { 926 struct file_lock {
927 struct file_lock *fl_next; /* singly linked list for this inode */ 927 struct file_lock *fl_next; /* singly linked list for this inode */
928 struct list_head fl_link; /* doubly linked list of all locks */ 928 struct list_head fl_link; /* doubly linked list of all locks */
929 struct list_head fl_block; /* circular list of blocked processes */ 929 struct list_head fl_block; /* circular list of blocked processes */
930 fl_owner_t fl_owner; 930 fl_owner_t fl_owner;
931 unsigned int fl_flags; 931 unsigned int fl_flags;
932 unsigned char fl_type; 932 unsigned char fl_type;
933 unsigned int fl_pid; 933 unsigned int fl_pid;
934 struct pid *fl_nspid; 934 struct pid *fl_nspid;
935 wait_queue_head_t fl_wait; 935 wait_queue_head_t fl_wait;
936 struct file *fl_file; 936 struct file *fl_file;
937 loff_t fl_start; 937 loff_t fl_start;
938 loff_t fl_end; 938 loff_t fl_end;
939 939
940 struct fasync_struct * fl_fasync; /* for lease break notifications */ 940 struct fasync_struct * fl_fasync; /* for lease break notifications */
941 /* for lease breaks: */ 941 /* for lease breaks: */
942 unsigned long fl_break_time; 942 unsigned long fl_break_time;
943 unsigned long fl_downgrade_time; 943 unsigned long fl_downgrade_time;
944 944
945 const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ 945 const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */
946 const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ 946 const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */
947 union { 947 union {
948 struct nfs_lock_info nfs_fl; 948 struct nfs_lock_info nfs_fl;
949 struct nfs4_lock_info nfs4_fl; 949 struct nfs4_lock_info nfs4_fl;
950 struct { 950 struct {
951 struct list_head link; /* link in AFS vnode's pending_locks list */ 951 struct list_head link; /* link in AFS vnode's pending_locks list */
952 int state; /* state of grant or error if -ve */ 952 int state; /* state of grant or error if -ve */
953 } afs; 953 } afs;
954 } fl_u; 954 } fl_u;
955 }; 955 };
956 956
957 /* The following constant reflects the upper bound of the file/locking space */ 957 /* The following constant reflects the upper bound of the file/locking space */
958 #ifndef OFFSET_MAX 958 #ifndef OFFSET_MAX
959 #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) 959 #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1)))
960 #define OFFSET_MAX INT_LIMIT(loff_t) 960 #define OFFSET_MAX INT_LIMIT(loff_t)
961 #define OFFT_OFFSET_MAX INT_LIMIT(off_t) 961 #define OFFT_OFFSET_MAX INT_LIMIT(off_t)
962 #endif 962 #endif

#include <linux/fcntl.h>

extern void send_sigio(struct fown_struct *fown, int fd, int band);

#ifdef CONFIG_FILE_LOCKING
extern int fcntl_getlk(struct file *, struct flock __user *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
			struct flock __user *);

#if BITS_PER_LONG == 32
extern int fcntl_getlk64(struct file *, struct flock64 __user *);
extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
			struct flock64 __user *);
#endif

extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);

/* fs/locks.c */
void locks_free_lock(struct file_lock *fl);
extern void locks_init_lock(struct file_lock *);
extern struct file_lock * locks_alloc_lock(void);
extern void locks_copy_lock(struct file_lock *, struct file_lock *);
extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
extern void locks_remove_posix(struct file *, fl_owner_t);
extern void locks_remove_flock(struct file *);
extern void locks_release_private(struct file_lock *);
extern void posix_test_lock(struct file *, struct file_lock *);
extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
extern int posix_lock_file_wait(struct file *, struct file_lock *);
extern int posix_unblock_lock(struct file *, struct file_lock *);
extern int vfs_test_lock(struct file *, struct file_lock *);
extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
extern int __break_lease(struct inode *inode, unsigned int flags);
extern void lease_get_mtime(struct inode *, struct timespec *time);
extern int generic_setlease(struct file *, long, struct file_lock **);
extern int vfs_setlease(struct file *, long, struct file_lock **);
extern int lease_modify(struct file_lock **, int);
extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
extern void locks_delete_block(struct file_lock *waiter);
extern void lock_flocks(void);
extern void unlock_flocks(void);
#else /* !CONFIG_FILE_LOCKING */
static inline int fcntl_getlk(struct file *file, struct flock __user *user)
{
	return -EINVAL;
}

static inline int fcntl_setlk(unsigned int fd, struct file *file,
			      unsigned int cmd, struct flock __user *user)
{
	return -EACCES;
}

#if BITS_PER_LONG == 32
static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user)
{
	return -EINVAL;
}

static inline int fcntl_setlk64(unsigned int fd, struct file *file,
				unsigned int cmd, struct flock64 __user *user)
{
	return -EACCES;
}
#endif
static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
	return 0;
}

static inline int fcntl_getlease(struct file *filp)
{
	return 0;
}

static inline void locks_init_lock(struct file_lock *fl)
{
	return;
}

static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
	return;
}

static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
	return;
}

static inline void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
	return;
}

static inline void locks_remove_flock(struct file *filp)
{
	return;
}

static inline void posix_test_lock(struct file *filp, struct file_lock *fl)
{
	return;
}

static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
				  struct file_lock *conflock)
{
	return -ENOLCK;
}

static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
{
	return -ENOLCK;
}

static inline int posix_unblock_lock(struct file *filp,
				     struct file_lock *waiter)
{
	return -ENOENT;
}

static inline int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
	return 0;
}

static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
				struct file_lock *fl, struct file_lock *conf)
{
	return -ENOLCK;
}

static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
	return 0;
}

static inline int flock_lock_file_wait(struct file *filp,
				       struct file_lock *request)
{
	return -ENOLCK;
}

static inline int __break_lease(struct inode *inode, unsigned int mode)
{
	return 0;
}

static inline void lease_get_mtime(struct inode *inode, struct timespec *time)
{
	return;
}

static inline int generic_setlease(struct file *filp, long arg,
				   struct file_lock **flp)
{
	return -EINVAL;
}

static inline int vfs_setlease(struct file *filp, long arg,
			       struct file_lock **lease)
{
	return -EINVAL;
}

static inline int lease_modify(struct file_lock **before, int arg)
{
	return -EINVAL;
}

static inline int lock_may_read(struct inode *inode, loff_t start,
				unsigned long len)
{
	return 1;
}

static inline int lock_may_write(struct inode *inode, loff_t start,
				 unsigned long len)
{
	return 1;
}

static inline void locks_delete_block(struct file_lock *waiter)
{
}

static inline void lock_flocks(void)
{
}

static inline void unlock_flocks(void)
{
}

#endif /* !CONFIG_FILE_LOCKING */
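Together the two branches give callers one locking API whether or not CONFIG_FILE_LOCKING is built in: the real implementations live in fs/locks.c, while the stubs above collapse to cheap constants and no-ops. As a rough illustration of the test-lock pattern (a sketch, not kernel code: filp is assumed to be an open struct file, error handling is elided, and under !CONFIG_FILE_LOCKING posix_test_lock() is a no-op so fl_type is left untouched), a whole-file write-lock probe might look like:

	struct file_lock fl;

	locks_init_lock(&fl);
	fl.fl_owner = current->files;	/* the standard POSIX lock owner */
	fl.fl_pid = current->tgid;
	fl.fl_flags = FL_POSIX;
	fl.fl_type = F_WRLCK;
	fl.fl_start = 0;
	fl.fl_end = OFFSET_MAX;		/* whole file */

	posix_test_lock(filp, &fl);	/* sets fl.fl_type = F_UNLCK if nothing conflicts */
	if (fl.fl_type != F_UNLCK)
		pr_info("range [%lld, %lld] locked by pid %u\n",
			(long long)fl.fl_start, (long long)fl.fl_end, fl.fl_pid);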


struct fasync_struct {
	spinlock_t		fa_lock;
	int			magic;
	int			fa_fd;
	struct fasync_struct	*fa_next; /* singly linked list */
	struct file		*fa_file;
	struct rcu_head		fa_rcu;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers: */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);
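The usual driver pattern keeps a fasync_struct list head per device, registers and unregisters readers through fasync_helper() from the file_operations ->fasync hook, and signals them with kill_fasync() when data arrives. A minimal sketch (mydev and its fasync field are hypothetical, only the two helper calls are real API):

	static int mydev_fasync(int fd, struct file *filp, int on)
	{
		struct mydev *dev = filp->private_data;

		/* add or remove this file on the device's async-notify list */
		return fasync_helper(fd, filp, on, &dev->fasync);
	}

	/* later, e.g. from the interrupt handler, when data is ready: */
	kill_fasync(&dev->fasync, SIGIO, POLL_IN);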
extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, unsigned long arg, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);

struct mm_struct;

/*
 * Umount options
 */
#define MNT_FORCE	0x00000001	/* Attempt to forcibly umount */
#define MNT_DETACH	0x00000002	/* Just detach from the tree */
#define MNT_EXPIRE	0x00000004	/* Mark for expiry */
#define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
#define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */

extern struct list_head super_blocks;
extern spinlock_t sb_lock;

/* Possible states of 'frozen' field */
enum {
	SB_UNFROZEN = 0,		/* FS is unfrozen */
	SB_FREEZE_WRITE	= 1,		/* Writes, dir ops, ioctls frozen */
	SB_FREEZE_PAGEFAULT = 2,	/* Page faults stopped as well */
	SB_FREEZE_FS = 3,		/* For internal FS use (e.g. to stop
					 * internal threads if needed) */
	SB_FREEZE_COMPLETE = 4,		/* ->freeze_fs finished successfully */
};

#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

struct sb_writers {
	/* Counters for counting writers at each level */
	struct percpu_counter	counter[SB_FREEZE_LEVELS];
	wait_queue_head_t	wait;		/* queue for waiting for
						   writers / faults to finish */
	int			frozen;		/* Is sb frozen? */
	wait_queue_head_t	wait_unfrozen;	/* queue for waiting for
						   sb to be thawed */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map	lock_map[SB_FREEZE_LEVELS];
#endif
};
struct super_block {
	struct list_head	s_list;		/* Keep this first */
	dev_t			s_dev;		/* search index; _not_ kdev_t */
	unsigned char		s_blocksize_bits;
	unsigned long		s_blocksize;
	loff_t			s_maxbytes;	/* Max file size */
	struct file_system_type	*s_type;
	const struct super_operations	*s_op;
	const struct dquot_operations	*dq_op;
	const struct quotactl_ops	*s_qcop;
	const struct export_operations *s_export_op;
	unsigned long		s_flags;
	unsigned long		s_magic;
	struct dentry		*s_root;
	struct rw_semaphore	s_umount;
	int			s_count;
	atomic_t		s_active;
#ifdef CONFIG_SECURITY
	void			*s_security;
#endif
	const struct xattr_handler **s_xattr;

	struct list_head	s_inodes;	/* all inodes */
	struct hlist_bl_head	s_anon;		/* anonymous dentries for (nfs) exporting */
#ifdef CONFIG_SMP
	struct list_head __percpu *s_files;
#else
	struct list_head	s_files;
#endif
	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
	/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
	struct list_head	s_dentry_lru;	/* unused dentry lru */
	int			s_nr_dentry_unused;	/* # of dentry on lru */

	/* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
	spinlock_t		s_inode_lru_lock ____cacheline_aligned_in_smp;
	struct list_head	s_inode_lru;		/* unused inode lru */
	int			s_nr_inodes_unused;	/* # of inodes on lru */

	struct block_device	*s_bdev;
	struct backing_dev_info *s_bdi;
	struct mtd_info		*s_mtd;
	struct hlist_node	s_instances;
	struct quota_info	s_dquot;	/* Diskquota specific options */

	struct sb_writers	s_writers;

	char s_id[32];				/* Informational name */
	u8 s_uuid[16];				/* UUID */

	void			*s_fs_info;	/* Filesystem private info */
	unsigned int		s_max_links;
	fmode_t			s_mode;

	/* Granularity of c/m/atime in ns.
	   Cannot be worse than a second */
	u32			s_time_gran;

	/*
	 * The next field is for VFS *only*. No filesystems have any business
	 * even looking at it. You have been warned.
	 */
	struct mutex		s_vfs_rename_mutex;	/* Kludge */

	/*
	 * Filesystem subtype. If non-empty the filesystem type field
	 * in /proc/mounts will be "type.subtype"
	 */
	char *s_subtype;

	/*
	 * Saved mount options for lazy filesystems using
	 * generic_show_options()
	 */
	char __rcu *s_options;
	const struct dentry_operations *s_d_op; /* default d_op for dentries */

	/*
	 * Saved pool identifier for cleancache (-1 means none)
	 */
	int cleancache_poolid;

	struct shrinker s_shrink;	/* per-sb shrinker handle */

	/* Number of inodes with nlink == 0 but still referenced */
	atomic_long_t s_remove_count;

	/* Being remounted read-only */
	int s_readonly_remount;
};
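By way of illustration, a filesystem populates the core fields from its fill_super callback at mount time. This is a hedged sketch only: myfs_fill_super, MYFS_MAGIC, myfs_sops and myfs_make_root_inode are all hypothetical names, while PAGE_CACHE_SIZE, MAX_LFS_FILESIZE and d_make_root() are real kernel symbols of this era:

	static int myfs_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct inode *root;

		sb->s_blocksize = PAGE_CACHE_SIZE;
		sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
		sb->s_maxbytes = MAX_LFS_FILESIZE;
		sb->s_magic = MYFS_MAGIC;
		sb->s_op = &myfs_sops;
		sb->s_time_gran = 1;		/* timestamps good to 1 ns */

		root = myfs_make_root_inode(sb);	/* hypothetical helper */
		sb->s_root = d_make_root(root);
		if (!sb->s_root)
			return -ENOMEM;
		return 0;
	}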

/* superblock cache pruning functions */
extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);

extern struct timespec current_fs_time(struct super_block *sb);

/*
 * Snapshotting support.
 */

void __sb_end_write(struct super_block *sb, int level);
int __sb_start_write(struct super_block *sb, int level, bool wait);

/**
 * sb_end_write - drop write access to a superblock
 * @sb: the super we wrote to
 *
 * Decrement number of writers to the filesystem. Wake up possible waiters
 * wanting to freeze the filesystem.
 */
static inline void sb_end_write(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_WRITE);
}

/**
 * sb_end_pagefault - drop write access to a superblock from a page fault
 * @sb: the super we wrote to
 *
 * Decrement number of processes handling write page fault to the filesystem.
 * Wake up possible waiters wanting to freeze the filesystem.
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_PAGEFAULT);
}

/**
 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
 * @sb: the super we wrote to
 *
 * Decrement fs-internal number of writers to the filesystem. Wake up possible
 * waiters wanting to freeze the filesystem.
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_FS);
}

/**
 * sb_start_write - get write access to a superblock
 * @sb: the super we write to
 *
 * When a process wants to write data or metadata to a file system (i.e. dirty
 * a page or an inode), it should embed the operation in a sb_start_write() -
 * sb_end_write() pair to get exclusion against file system freezing. This
 * function increments number of writers preventing freezing. If the file
 * system is already frozen, the function waits until the file system is
 * thawed.
 *
 * Since freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. Generally,
 * freeze protection should be the outermost lock. In particular, we have:
 *
 * sb_start_write
 *   -> i_mutex			(write path, truncate, directory ops, ...)
 *   -> s_umount		(freeze_super, thaw_super)
 */
static inline void sb_start_write(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_WRITE, true);
}

static inline int sb_start_write_trylock(struct super_block *sb)
{
	return __sb_start_write(sb, SB_FREEZE_WRITE, false);
}

/**
 * sb_start_pagefault - get write access to a superblock from a page fault
 * @sb: the super we write to
 *
 * When a process starts handling write page fault, it should embed the
 * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
 * exclusion against file system freezing. This is needed since the page fault
 * is going to dirty a page. This function increments number of running page
 * faults preventing freezing. If the file system is already frozen, the
 * function waits until the file system is thawed.
 *
 * Since page fault freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. It is advised to
 * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault
 * handling code implies lock dependency:
 *
 * mmap_sem
 *   -> sb_start_pagefault
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_PAGEFAULT, true);
}

/*
 * sb_start_intwrite - get write access to a superblock for internal fs purposes
 * @sb: the super we write to
 *
 * This is the third level of protection against filesystem freezing. It is
 * free for use by a filesystem. The only requirement is that it must rank
 * below sb_start_pagefault.
 *
 * For example, a filesystem can call sb_start_intwrite() when starting a
 * transaction, which somewhat eases handling of freezing for internal sources
 * of filesystem changes (internal fs threads, discarding preallocation on file
 * close, etc.).
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_FS, true);
}


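A quick sketch of the intended pairing (inode is assumed to be in scope; real write paths also take i_mutex inside the freeze protection, per the lock ordering documented above):

	sb_start_write(inode->i_sb);		/* blocks if the fs is frozen */
	mutex_lock(&inode->i_mutex);
	/* ... dirty pages / update metadata ... */
	mutex_unlock(&inode->i_mutex);
	sb_end_write(inode->i_sb);		/* wake any pending freezer */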
extern bool inode_owner_or_capable(const struct inode *inode);

/* not quite ready to be deprecated, but... */
extern void lock_super(struct super_block *);
extern void unlock_super(struct super_block *);

/*
 * VFS helper functions.
 */
extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
extern int vfs_rmdir(struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);

/*
 * VFS dentry helper functions.
 */
extern void dentry_unhash(struct dentry *dentry);

/*
 * VFS file helper functions.
 */
extern void inode_init_owner(struct inode *inode, const struct inode *dir,
			umode_t mode);
/*
 * VFS FS_IOC_FIEMAP helper definitions.
 */
struct fiemap_extent_info {
	unsigned int fi_flags;		/* Flags as passed from user */
	unsigned int fi_extents_mapped;	/* Number of mapped extents */
	unsigned int fi_extents_max;	/* Size of fiemap_extent array */
	struct fiemap_extent __user *fi_extents_start; /* Start of
							fiemap_extent array */
};
int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
			    u64 phys, u64 len, u32 flags);
int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);

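A rough sketch of a ->fiemap implementation built on these helpers. myfs_fiemap and myfs_block_for() are hypothetical; fiemap_check_flags() rejects flags the filesystem does not support, and fiemap_fill_next_extent() returns 0 to continue, 1 once fi_extents_max entries have been filled, or a negative error:

	static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
			       u64 start, u64 len)
	{
		int ret;

		ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
		if (ret)
			return ret;

		/* report one illustrative extent covering [start, start + len) */
		ret = fiemap_fill_next_extent(fieinfo, start,
					      myfs_block_for(inode, start),	/* hypothetical */
					      len, FIEMAP_EXTENT_LAST);
		return ret < 0 ? ret : 0;
	}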
/*
 * File types
 *
 * NOTE! These match bits 12..15 of stat.st_mode
 * (i.e. "(i_mode >> 12) & 15").
 */
#define DT_UNKNOWN	0
#define DT_FIFO		1
#define DT_CHR		2
#define DT_DIR		4
#define DT_BLK		6
#define DT_REG		8
#define DT_LNK		10
#define DT_SOCK		12
#define DT_WHT		14

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 */
typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
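The callback arguments are (buf, name, namelen, offset, ino, d_type), and a negative return conventionally means the caller's buffer is full. A bare-bones ->readdir sketch, emitting only the "." entry for brevity (myfs_readdir is hypothetical):

	static int myfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;

		if (filp->f_pos == 0) {
			if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
				return 0;	/* buffer full; resume here next call */
			filp->f_pos++;
		}
		return 0;
	}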
struct block_device_operations;

/* These macros are for out-of-kernel modules to test that
 * the kernel supports the unlocked_ioctl and compat_ioctl
 * fields in struct file_operations. */
#define HAVE_COMPAT_IOCTL 1
#define HAVE_UNLOCKED_IOCTL 1

struct file_operations {
	struct module *owner;
	loff_t (*llseek) (struct file *, loff_t, int);
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
	int (*readdir) (struct file *, void *, filldir_t);
	unsigned int (*poll) (struct file *, struct poll_table_struct *);
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
	int (*mmap) (struct file *, struct vm_area_struct *);
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *, fl_owner_t id);
	int (*release) (struct inode *, struct file *);
	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
	int (*aio_fsync) (struct kiocb *, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **);
	long (*fallocate)(struct file *file, int mode, loff_t offset,
			  loff_t len);
};

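Note the final int argument of ->llseek: that is the "whence" value this commit renames throughout the tree. A minimal driver-side table might look like this sketch (the mydev_* handlers are hypothetical; no_llseek is a real stock helper that returns -ESPIPE for any whence value, and mydev_fasync is the handler sketched earlier):

	static const struct file_operations mydev_fops = {
		.owner		= THIS_MODULE,
		.llseek		= no_llseek,	/* device is not seekable */
		.read		= mydev_read,
		.unlocked_ioctl	= mydev_ioctl,
		.open		= mydev_open,
		.release	= mydev_release,
		.fasync		= mydev_fasync,
	};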
struct inode_operations {
	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
	void * (*follow_link) (struct dentry *, struct nameidata *);
	int (*permission) (struct inode *, int);
	struct posix_acl * (*get_acl)(struct inode *, int);

	int (*readlink) (struct dentry *, char __user *,int);
	void (*put_link) (struct dentry *, struct nameidata *, void *);

	int (*create) (struct inode *,struct dentry *, umode_t, bool);
	int (*link) (struct dentry *,struct inode *,struct dentry *);
	int (*unlink) (struct inode *,struct dentry *);
	int (*symlink) (struct inode *,struct dentry *,const char *);
	int (*mkdir) (struct inode *,struct dentry *,umode_t);
	int (*rmdir) (struct inode *,struct dentry *);
	int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
	int (*rename) (struct inode *, struct dentry *,
			struct inode *, struct dentry *);
	void (*truncate) (struct inode *);
	int (*setattr) (struct dentry *, struct iattr *);
	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
	ssize_t (*listxattr) (struct dentry *, char *, size_t);
	int (*removexattr) (struct dentry *, const char *);
	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
		      u64 len);
	int (*update_time)(struct inode *, struct timespec *, int);
	int (*atomic_open)(struct inode *, struct dentry *,
			   struct file *, unsigned open_flag,
			   umode_t create_mode, int *opened);
} ____cacheline_aligned;

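For simple in-memory filesystems, many of these hooks can be filled from the stock helpers in fs/libfs.c. A hedged sketch of a directory table (the simple_* helpers are real libfs functions; myfs_create and myfs_mkdir are assumptions):

	static const struct inode_operations myfs_dir_iops = {
		.lookup	= simple_lookup,
		.create	= myfs_create,
		.mkdir	= myfs_mkdir,
		.unlink	= simple_unlink,
		.rmdir	= simple_rmdir,
		.rename	= simple_rename,
	};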
struct seq_file;

ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
				unsigned long nr_segs, unsigned long fast_segs,
				struct iovec *fast_pointer,
				struct iovec **ret_pointer);

extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
		unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
		unsigned long, loff_t *);

struct super_operations {
	struct inode *(*alloc_inode)(struct super_block *sb);
	void (*destroy_inode)(struct inode *);

	void (*dirty_inode) (struct inode *, int flags);
	int (*write_inode) (struct inode *, struct writeback_control *wbc);
	int (*drop_inode) (struct inode *);
	void (*evict_inode) (struct inode *);
	void (*put_super) (struct super_block *);
	int (*sync_fs)(struct super_block *sb, int wait);
	int (*freeze_fs) (struct super_block *);
	int (*unfreeze_fs) (struct super_block *);
	int (*statfs) (struct dentry *, struct kstatfs *);
	int (*remount_fs) (struct super_block *, int *, char *);
	void (*umount_begin) (struct super_block *);

	int (*show_options)(struct seq_file *, struct dentry *);
	int (*show_devname)(struct seq_file *, struct dentry *);
	int (*show_path)(struct seq_file *, struct dentry *);
	int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
#endif
	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
	int (*nr_cached_objects)(struct super_block *);
	void (*free_cached_objects)(struct super_block *, int);
};

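Continuing the hypothetical myfs from the fill_super sketch above, a small table that leans on generic helpers (simple_statfs, generic_delete_inode and generic_show_options are real kernel functions; the myfs_* entries are assumptions):

	static const struct super_operations myfs_sops = {
		.alloc_inode	= myfs_alloc_inode,
		.destroy_inode	= myfs_destroy_inode,
		.evict_inode	= myfs_evict_inode,
		.statfs		= simple_statfs,
		.drop_inode	= generic_delete_inode,
		.show_options	= generic_show_options,
	};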
/*
 * Inode flags - they have no relation to superblock flags now
 */
#define S_SYNC		1	/* Writes are synced at once */
#define S_NOATIME	2	/* Do not update access times */
#define S_APPEND	4	/* Append-only file */
#define S_IMMUTABLE	8	/* Immutable file */
#define S_DEAD		16	/* removed, but still open directory */
#define S_NOQUOTA	32	/* Inode is not counted to quota */
#define S_DIRSYNC	64	/* Directory modifications are synchronous */
#define S_NOCMTIME	128	/* Do not update file c/mtime */
#define S_SWAPFILE	256	/* Do not truncate: swapon got its bmaps */
#define S_PRIVATE	512	/* Inode is fs-internal */
#define S_IMA		1024	/* Inode has an associated IMA struct */
#define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
#define S_NOSEC		4096	/* no suid or xattr security attributes */

/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: MS_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystem's flags while it is
 * mounted and has files in use. This means that not all of the inodes will
 * have their i_flags updated. Hence, i_flags no longer inherit the superblock
 * mount flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
 */
#define __IS_FLG(inode, flg)	((inode)->i_sb->s_flags & (flg))

#define IS_RDONLY(inode)	((inode)->i_sb->s_flags & MS_RDONLY)
#define IS_SYNC(inode)		(__IS_FLG(inode, MS_SYNCHRONOUS) || \
					((inode)->i_flags & S_SYNC))
#define IS_DIRSYNC(inode)	(__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
					((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)	__IS_FLG(inode, MS_MANDLOCK)
#define IS_NOATIME(inode)	__IS_FLG(inode, MS_RDONLY|MS_NOATIME)
#define IS_I_VERSION(inode)	__IS_FLG(inode, MS_I_VERSION)

#define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
#define IS_POSIXACL(inode)	__IS_FLG(inode, MS_POSIXACL)

#define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
#define IS_SWAPFILE(inode)	((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode)	((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode)		((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode)		((inode)->i_flags & S_NOSEC)

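These predicates encode the split described in the note above: per-mount state comes from s_flags, per-inode state from i_flags, and callers must check the right one. A typical permission-style check, sketched from the common pattern in write and setattr paths:

	if (IS_RDONLY(inode))				/* mount-wide: MS_RDONLY */
		return -EROFS;
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))	/* per-inode: i_flags */
		return -EPERM;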
1678 /* 1678 /*
1679 * Inode state bits. Protected by inode->i_lock 1679 * Inode state bits. Protected by inode->i_lock
1680 * 1680 *
1681 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, 1681 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
1682 * I_DIRTY_DATASYNC and I_DIRTY_PAGES. 1682 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
1683 * 1683 *
1684 * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, 1684 * Four bits define the lifetime of an inode. Initially, inodes are I_NEW,
1685 * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at 1685 * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at
1686 * various stages of removing an inode. 1686 * various stages of removing an inode.
1687 * 1687 *
1688 * Two bits are used for locking and completion notification, I_NEW and I_SYNC. 1688 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
1689 * 1689 *
1690 * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on 1690 * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on
1691 * fdatasync(). i_atime is the usual cause. 1691 * fdatasync(). i_atime is the usual cause.
1692 * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of 1692 * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of
1693 * these changes separately from I_DIRTY_SYNC so that we 1693 * these changes separately from I_DIRTY_SYNC so that we
1694 * don't have to write inode on fdatasync() when only 1694 * don't have to write inode on fdatasync() when only
1695 * mtime has changed in it. 1695 * mtime has changed in it.
1696 * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. 1696 * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean.
1697 * I_NEW Serves as both a mutex and completion notification. 1697 * I_NEW Serves as both a mutex and completion notification.
1698 * New inodes set I_NEW. If two processes both create 1698 * New inodes set I_NEW. If two processes both create
1699 * the same inode, one of them will release its inode and 1699 * the same inode, one of them will release its inode and
1700 * wait for I_NEW to be released before returning. 1700 * wait for I_NEW to be released before returning.
1701 * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can 1701 * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
1702 * also cause waiting on I_NEW, without I_NEW actually 1702 * also cause waiting on I_NEW, without I_NEW actually
1703 * being set. find_inode() uses this to prevent returning 1703 * being set. find_inode() uses this to prevent returning
1704 * nearly-dead inodes. 1704 * nearly-dead inodes.
1705 * I_WILL_FREE Must be set when calling write_inode_now() if i_count 1705 * I_WILL_FREE Must be set when calling write_inode_now() if i_count
1706 * is zero. I_FREEING must be set when I_WILL_FREE is 1706 * is zero. I_FREEING must be set when I_WILL_FREE is
1707 * cleared. 1707 * cleared.
1708 * I_FREEING Set when inode is about to be freed but still has dirty 1708 * I_FREEING Set when inode is about to be freed but still has dirty
1709 * pages or buffers attached or the inode itself is still 1709 * pages or buffers attached or the inode itself is still
1710 * dirty. 1710 * dirty.
1711 * I_CLEAR Added by clear_inode(). In this state the inode is 1711 * I_CLEAR Added by clear_inode(). In this state the inode is
1712 * clean and can be destroyed. Inode keeps I_FREEING. 1712 * clean and can be destroyed. Inode keeps I_FREEING.
1713 * 1713 *
1714 * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are 1714 * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
1715 * prohibited for many purposes. iget() must wait for 1715 * prohibited for many purposes. iget() must wait for
1716 * the inode to be completely released, then create it 1716 * the inode to be completely released, then create it
1717 * anew. Other functions will just ignore such inodes, 1717 * anew. Other functions will just ignore such inodes,
1718 * if appropriate. I_NEW is used for waiting. 1718 * if appropriate. I_NEW is used for waiting.
1719 * 1719 *
1720 * I_SYNC Writeback of inode is running. The bit is set during 1720 * I_SYNC Writeback of inode is running. The bit is set during
1721 * data writeback, and cleared with a wakeup on the bit 1721 * data writeback, and cleared with a wakeup on the bit
1722 * address once it is done. The bit is also used to pin 1722 * address once it is done. The bit is also used to pin
1723 * the inode in memory for the flusher thread. 1723 * the inode in memory for the flusher thread.
1724 * 1724 *
1725 * I_REFERENCED Marks the inode as recently referenced on the LRU list. 1725 * I_REFERENCED Marks the inode as recently referenced on the LRU list.
1726 * 1726 *
1727 * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). 1727 * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
1728 * 1728 *
1729 * Q: What is the difference between I_WILL_FREE and I_FREEING? 1729 * Q: What is the difference between I_WILL_FREE and I_FREEING?
1730 */ 1730 */
1731 #define I_DIRTY_SYNC (1 << 0) 1731 #define I_DIRTY_SYNC (1 << 0)
1732 #define I_DIRTY_DATASYNC (1 << 1) 1732 #define I_DIRTY_DATASYNC (1 << 1)
1733 #define I_DIRTY_PAGES (1 << 2) 1733 #define I_DIRTY_PAGES (1 << 2)
1734 #define __I_NEW 3 1734 #define __I_NEW 3
1735 #define I_NEW (1 << __I_NEW) 1735 #define I_NEW (1 << __I_NEW)
1736 #define I_WILL_FREE (1 << 4) 1736 #define I_WILL_FREE (1 << 4)
1737 #define I_FREEING (1 << 5) 1737 #define I_FREEING (1 << 5)
1738 #define I_CLEAR (1 << 6) 1738 #define I_CLEAR (1 << 6)
1739 #define __I_SYNC 7 1739 #define __I_SYNC 7
1740 #define I_SYNC (1 << __I_SYNC) 1740 #define I_SYNC (1 << __I_SYNC)
1741 #define I_REFERENCED (1 << 8) 1741 #define I_REFERENCED (1 << 8)
1742 #define __I_DIO_WAKEUP 9 1742 #define __I_DIO_WAKEUP 9
1743 #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) 1743 #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP)
1744 1744
1745 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) 1745 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
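
The block above describes a small state machine; as a minimal illustrative sketch (not part of this commit), a filesystem could test the combined dirty mask under i_lock, per the locking rule stated at the top of the comment:

static bool example_inode_is_dirty(struct inode *inode)
{
	bool dirty;

	spin_lock(&inode->i_lock);		/* i_state is protected by i_lock */
	dirty = inode->i_state & I_DIRTY;	/* any of the three dirty bits */
	spin_unlock(&inode->i_lock);
	return dirty;
}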
1746 1746
1747 extern void __mark_inode_dirty(struct inode *, int); 1747 extern void __mark_inode_dirty(struct inode *, int);
1748 static inline void mark_inode_dirty(struct inode *inode) 1748 static inline void mark_inode_dirty(struct inode *inode)
1749 { 1749 {
1750 __mark_inode_dirty(inode, I_DIRTY); 1750 __mark_inode_dirty(inode, I_DIRTY);
1751 } 1751 }
1752 1752
1753 static inline void mark_inode_dirty_sync(struct inode *inode) 1753 static inline void mark_inode_dirty_sync(struct inode *inode)
1754 { 1754 {
1755 __mark_inode_dirty(inode, I_DIRTY_SYNC); 1755 __mark_inode_dirty(inode, I_DIRTY_SYNC);
1756 } 1756 }
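
As a hypothetical example of choosing between the two helpers: a timestamp-only change matches the I_DIRTY_SYNC description above (no fdatasync() writeout needed), so:

static void example_touch_ctime(struct inode *inode)
{
	inode->i_ctime = CURRENT_TIME;	/* metadata-only change */
	mark_inode_dirty_sync(inode);	/* sets I_DIRTY_SYNC, not I_DIRTY_PAGES */
}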
1757 1757
1758 extern void inc_nlink(struct inode *inode); 1758 extern void inc_nlink(struct inode *inode);
1759 extern void drop_nlink(struct inode *inode); 1759 extern void drop_nlink(struct inode *inode);
1760 extern void clear_nlink(struct inode *inode); 1760 extern void clear_nlink(struct inode *inode);
1761 extern void set_nlink(struct inode *inode, unsigned int nlink); 1761 extern void set_nlink(struct inode *inode, unsigned int nlink);
1762 1762
1763 static inline void inode_inc_link_count(struct inode *inode) 1763 static inline void inode_inc_link_count(struct inode *inode)
1764 { 1764 {
1765 inc_nlink(inode); 1765 inc_nlink(inode);
1766 mark_inode_dirty(inode); 1766 mark_inode_dirty(inode);
1767 } 1767 }
1768 1768
1769 static inline void inode_dec_link_count(struct inode *inode) 1769 static inline void inode_dec_link_count(struct inode *inode)
1770 { 1770 {
1771 drop_nlink(inode); 1771 drop_nlink(inode);
1772 mark_inode_dirty(inode); 1772 mark_inode_dirty(inode);
1773 } 1773 }
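
These wrappers pair the nlink change with dirtying the inode; a minimal ->unlink() sketch, where example_delete_entry() is a hypothetical fs-specific helper:

static int example_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	int err = example_delete_entry(dir, dentry);	/* hypothetical */

	if (err)
		return err;
	inode->i_ctime = dir->i_ctime;
	inode_dec_link_count(inode);	/* drop_nlink() + mark dirty */
	return 0;
}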
1774 1774
1775 /** 1775 /**
1776 * inode_inc_iversion - increments i_version 1776 * inode_inc_iversion - increments i_version
1777 * @inode: inode that needs to be updated 1777 * @inode: inode that needs to be updated
1778 * 1778 *
1779 * Every time the inode is modified, the i_version field will be incremented. 1779 * Every time the inode is modified, the i_version field will be incremented.
1780 * The filesystem has to be mounted with the i_version mount option. 1780 * The filesystem has to be mounted with the i_version mount option.
1781 */ 1781 */
1782 1782
1783 static inline void inode_inc_iversion(struct inode *inode) 1783 static inline void inode_inc_iversion(struct inode *inode)
1784 { 1784 {
1785 spin_lock(&inode->i_lock); 1785 spin_lock(&inode->i_lock);
1786 inode->i_version++; 1786 inode->i_version++;
1787 spin_unlock(&inode->i_lock); 1787 spin_unlock(&inode->i_lock);
1788 } 1788 }
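
Callers normally gate the increment on the mount option; a hedged sketch:

static void example_bump_version(struct inode *inode)
{
	if (IS_I_VERSION(inode))	/* sb mounted with MS_I_VERSION */
		inode_inc_iversion(inode);
}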
1789 1789
1790 enum file_time_flags { 1790 enum file_time_flags {
1791 S_ATIME = 1, 1791 S_ATIME = 1,
1792 S_MTIME = 2, 1792 S_MTIME = 2,
1793 S_CTIME = 4, 1793 S_CTIME = 4,
1794 S_VERSION = 8, 1794 S_VERSION = 8,
1795 }; 1795 };
1796 1796
1797 extern void touch_atime(struct path *); 1797 extern void touch_atime(struct path *);
1798 static inline void file_accessed(struct file *file) 1798 static inline void file_accessed(struct file *file)
1799 { 1799 {
1800 if (!(file->f_flags & O_NOATIME)) 1800 if (!(file->f_flags & O_NOATIME))
1801 touch_atime(&file->f_path); 1801 touch_atime(&file->f_path);
1802 } 1802 }
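
A hypothetical read path showing where file_accessed() slots in (do_example_copy() stands in for the real work):

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t len, loff_t *ppos)
{
	ssize_t ret = do_example_copy(file, buf, len, ppos);	/* hypothetical */

	if (ret > 0)
		file_accessed(file);	/* honours O_NOATIME via the check above */
	return ret;
}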
1803 1803
1804 int sync_inode(struct inode *inode, struct writeback_control *wbc); 1804 int sync_inode(struct inode *inode, struct writeback_control *wbc);
1805 int sync_inode_metadata(struct inode *inode, int wait); 1805 int sync_inode_metadata(struct inode *inode, int wait);
1806 1806
1807 struct file_system_type { 1807 struct file_system_type {
1808 const char *name; 1808 const char *name;
1809 int fs_flags; 1809 int fs_flags;
1810 #define FS_REQUIRES_DEV 1 1810 #define FS_REQUIRES_DEV 1
1811 #define FS_BINARY_MOUNTDATA 2 1811 #define FS_BINARY_MOUNTDATA 2
1812 #define FS_HAS_SUBTYPE 4 1812 #define FS_HAS_SUBTYPE 4
1813 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ 1813 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
1814 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ 1814 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
1815 struct dentry *(*mount) (struct file_system_type *, int, 1815 struct dentry *(*mount) (struct file_system_type *, int,
1816 const char *, void *); 1816 const char *, void *);
1817 void (*kill_sb) (struct super_block *); 1817 void (*kill_sb) (struct super_block *);
1818 struct module *owner; 1818 struct module *owner;
1819 struct file_system_type * next; 1819 struct file_system_type * next;
1820 struct hlist_head fs_supers; 1820 struct hlist_head fs_supers;
1821 1821
1822 struct lock_class_key s_lock_key; 1822 struct lock_class_key s_lock_key;
1823 struct lock_class_key s_umount_key; 1823 struct lock_class_key s_umount_key;
1824 struct lock_class_key s_vfs_rename_key; 1824 struct lock_class_key s_vfs_rename_key;
1825 struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; 1825 struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
1826 1826
1827 struct lock_class_key i_lock_key; 1827 struct lock_class_key i_lock_key;
1828 struct lock_class_key i_mutex_key; 1828 struct lock_class_key i_mutex_key;
1829 struct lock_class_key i_mutex_dir_key; 1829 struct lock_class_key i_mutex_dir_key;
1830 }; 1830 };
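
To show how the pieces of file_system_type fit together, a skeletal in-memory filesystem might look like this (example_fill_super() is assumed, and registration uses register_filesystem() declared further down):

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
	.owner   = THIS_MODULE,
	.name    = "examplefs",
	.mount   = example_mount,
	.kill_sb = kill_anon_super,
};

/* at module init: err = register_filesystem(&example_fs_type); */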
1831 1831
1832 extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, 1832 extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
1833 void *data, int (*fill_super)(struct super_block *, void *, int)); 1833 void *data, int (*fill_super)(struct super_block *, void *, int));
1834 extern struct dentry *mount_bdev(struct file_system_type *fs_type, 1834 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
1835 int flags, const char *dev_name, void *data, 1835 int flags, const char *dev_name, void *data,
1836 int (*fill_super)(struct super_block *, void *, int)); 1836 int (*fill_super)(struct super_block *, void *, int));
1837 extern struct dentry *mount_single(struct file_system_type *fs_type, 1837 extern struct dentry *mount_single(struct file_system_type *fs_type,
1838 int flags, void *data, 1838 int flags, void *data,
1839 int (*fill_super)(struct super_block *, void *, int)); 1839 int (*fill_super)(struct super_block *, void *, int));
1840 extern struct dentry *mount_nodev(struct file_system_type *fs_type, 1840 extern struct dentry *mount_nodev(struct file_system_type *fs_type,
1841 int flags, void *data, 1841 int flags, void *data,
1842 int (*fill_super)(struct super_block *, void *, int)); 1842 int (*fill_super)(struct super_block *, void *, int));
1843 extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); 1843 extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
1844 void generic_shutdown_super(struct super_block *sb); 1844 void generic_shutdown_super(struct super_block *sb);
1845 void kill_block_super(struct super_block *sb); 1845 void kill_block_super(struct super_block *sb);
1846 void kill_anon_super(struct super_block *sb); 1846 void kill_anon_super(struct super_block *sb);
1847 void kill_litter_super(struct super_block *sb); 1847 void kill_litter_super(struct super_block *sb);
1848 void deactivate_super(struct super_block *sb); 1848 void deactivate_super(struct super_block *sb);
1849 void deactivate_locked_super(struct super_block *sb); 1849 void deactivate_locked_super(struct super_block *sb);
1850 int set_anon_super(struct super_block *s, void *data); 1850 int set_anon_super(struct super_block *s, void *data);
1851 int get_anon_bdev(dev_t *); 1851 int get_anon_bdev(dev_t *);
1852 void free_anon_bdev(dev_t); 1852 void free_anon_bdev(dev_t);
1853 struct super_block *sget(struct file_system_type *type, 1853 struct super_block *sget(struct file_system_type *type,
1854 int (*test)(struct super_block *,void *), 1854 int (*test)(struct super_block *,void *),
1855 int (*set)(struct super_block *,void *), 1855 int (*set)(struct super_block *,void *),
1856 int flags, void *data); 1856 int flags, void *data);
1857 extern struct dentry *mount_pseudo(struct file_system_type *, char *, 1857 extern struct dentry *mount_pseudo(struct file_system_type *, char *,
1858 const struct super_operations *ops, 1858 const struct super_operations *ops,
1859 const struct dentry_operations *dops, 1859 const struct dentry_operations *dops,
1860 unsigned long); 1860 unsigned long);
1861 1861
1862 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ 1862 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
1863 #define fops_get(fops) \ 1863 #define fops_get(fops) \
1864 (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) 1864 (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
1865 #define fops_put(fops) \ 1865 #define fops_put(fops) \
1866 do { if (fops) module_put((fops)->owner); } while(0) 1866 do { if (fops) module_put((fops)->owner); } while(0)
1867 1867
1868 extern int register_filesystem(struct file_system_type *); 1868 extern int register_filesystem(struct file_system_type *);
1869 extern int unregister_filesystem(struct file_system_type *); 1869 extern int unregister_filesystem(struct file_system_type *);
1870 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); 1870 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
1871 #define kern_mount(type) kern_mount_data(type, NULL) 1871 #define kern_mount(type) kern_mount_data(type, NULL)
1872 extern void kern_unmount(struct vfsmount *mnt); 1872 extern void kern_unmount(struct vfsmount *mnt);
1873 extern int may_umount_tree(struct vfsmount *); 1873 extern int may_umount_tree(struct vfsmount *);
1874 extern int may_umount(struct vfsmount *); 1874 extern int may_umount(struct vfsmount *);
1875 extern long do_mount(const char *, const char *, const char *, unsigned long, void *); 1875 extern long do_mount(const char *, const char *, const char *, unsigned long, void *);
1876 extern struct vfsmount *collect_mounts(struct path *); 1876 extern struct vfsmount *collect_mounts(struct path *);
1877 extern void drop_collected_mounts(struct vfsmount *); 1877 extern void drop_collected_mounts(struct vfsmount *);
1878 extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, 1878 extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
1879 struct vfsmount *); 1879 struct vfsmount *);
1880 extern int vfs_statfs(struct path *, struct kstatfs *); 1880 extern int vfs_statfs(struct path *, struct kstatfs *);
1881 extern int user_statfs(const char __user *, struct kstatfs *); 1881 extern int user_statfs(const char __user *, struct kstatfs *);
1882 extern int fd_statfs(int, struct kstatfs *); 1882 extern int fd_statfs(int, struct kstatfs *);
1883 extern int vfs_ustat(dev_t, struct kstatfs *); 1883 extern int vfs_ustat(dev_t, struct kstatfs *);
1884 extern int freeze_super(struct super_block *super); 1884 extern int freeze_super(struct super_block *super);
1885 extern int thaw_super(struct super_block *super); 1885 extern int thaw_super(struct super_block *super);
1886 extern bool our_mnt(struct vfsmount *mnt); 1886 extern bool our_mnt(struct vfsmount *mnt);
1887 1887
1888 extern int current_umask(void); 1888 extern int current_umask(void);
1889 1889
1890 /* /sys/fs */ 1890 /* /sys/fs */
1891 extern struct kobject *fs_kobj; 1891 extern struct kobject *fs_kobj;
1892 1892
1893 #define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK) 1893 #define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
1894 extern int rw_verify_area(int, struct file *, loff_t *, size_t); 1894 extern int rw_verify_area(int, struct file *, loff_t *, size_t);
1895 1895
1896 #define FLOCK_VERIFY_READ 1 1896 #define FLOCK_VERIFY_READ 1
1897 #define FLOCK_VERIFY_WRITE 2 1897 #define FLOCK_VERIFY_WRITE 2
1898 1898
1899 #ifdef CONFIG_FILE_LOCKING 1899 #ifdef CONFIG_FILE_LOCKING
1900 extern int locks_mandatory_locked(struct inode *); 1900 extern int locks_mandatory_locked(struct inode *);
1901 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); 1901 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
1902 1902
1903 /* 1903 /*
1904 * Candidates for mandatory locking have the setgid bit set 1904 * Candidates for mandatory locking have the setgid bit set
1905 * but no group execute bit - an otherwise meaningless combination. 1905 * but no group execute bit - an otherwise meaningless combination.
1906 */ 1906 */
1907 1907
1908 static inline int __mandatory_lock(struct inode *ino) 1908 static inline int __mandatory_lock(struct inode *ino)
1909 { 1909 {
1910 return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; 1910 return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
1911 } 1911 }
1912 1912
1913 /* 1913 /*
1914 * ... and these candidates should be on MS_MANDLOCK mounted fs, 1914 * ... and these candidates should be on MS_MANDLOCK mounted fs,
1915 * otherwise these will be advisory locks 1915 * otherwise these will be advisory locks
1916 */ 1916 */
1917 1917
1918 static inline int mandatory_lock(struct inode *ino) 1918 static inline int mandatory_lock(struct inode *ino)
1919 { 1919 {
1920 return IS_MANDLOCK(ino) && __mandatory_lock(ino); 1920 return IS_MANDLOCK(ino) && __mandatory_lock(ino);
1921 } 1921 }
1922 1922
1923 static inline int locks_verify_locked(struct inode *inode) 1923 static inline int locks_verify_locked(struct inode *inode)
1924 { 1924 {
1925 if (mandatory_lock(inode)) 1925 if (mandatory_lock(inode))
1926 return locks_mandatory_locked(inode); 1926 return locks_mandatory_locked(inode);
1927 return 0; 1927 return 0;
1928 } 1928 }
1929 1929
1930 static inline int locks_verify_truncate(struct inode *inode, 1930 static inline int locks_verify_truncate(struct inode *inode,
1931 struct file *filp, 1931 struct file *filp,
1932 loff_t size) 1932 loff_t size)
1933 { 1933 {
1934 if (inode->i_flock && mandatory_lock(inode)) 1934 if (inode->i_flock && mandatory_lock(inode))
1935 return locks_mandatory_area( 1935 return locks_mandatory_area(
1936 FLOCK_VERIFY_WRITE, inode, filp, 1936 FLOCK_VERIFY_WRITE, inode, filp,
1937 size < inode->i_size ? size : inode->i_size, 1937 size < inode->i_size ? size : inode->i_size,
1938 (size < inode->i_size ? inode->i_size - size 1938 (size < inode->i_size ? inode->i_size - size
1939 : size - inode->i_size) 1939 : size - inode->i_size)
1940 ); 1940 );
1941 return 0; 1941 return 0;
1942 } 1942 }
1943 1943
1944 static inline int break_lease(struct inode *inode, unsigned int mode) 1944 static inline int break_lease(struct inode *inode, unsigned int mode)
1945 { 1945 {
1946 if (inode->i_flock) 1946 if (inode->i_flock)
1947 return __break_lease(inode, mode); 1947 return __break_lease(inode, mode);
1948 return 0; 1948 return 0;
1949 } 1949 }
1950 #else /* !CONFIG_FILE_LOCKING */ 1950 #else /* !CONFIG_FILE_LOCKING */
1951 static inline int locks_mandatory_locked(struct inode *inode) 1951 static inline int locks_mandatory_locked(struct inode *inode)
1952 { 1952 {
1953 return 0; 1953 return 0;
1954 } 1954 }
1955 1955
1956 static inline int locks_mandatory_area(int rw, struct inode *inode, 1956 static inline int locks_mandatory_area(int rw, struct inode *inode,
1957 struct file *filp, loff_t offset, 1957 struct file *filp, loff_t offset,
1958 size_t count) 1958 size_t count)
1959 { 1959 {
1960 return 0; 1960 return 0;
1961 } 1961 }
1962 1962
1963 static inline int __mandatory_lock(struct inode *inode) 1963 static inline int __mandatory_lock(struct inode *inode)
1964 { 1964 {
1965 return 0; 1965 return 0;
1966 } 1966 }
1967 1967
1968 static inline int mandatory_lock(struct inode *inode) 1968 static inline int mandatory_lock(struct inode *inode)
1969 { 1969 {
1970 return 0; 1970 return 0;
1971 } 1971 }
1972 1972
1973 static inline int locks_verify_locked(struct inode *inode) 1973 static inline int locks_verify_locked(struct inode *inode)
1974 { 1974 {
1975 return 0; 1975 return 0;
1976 } 1976 }
1977 1977
1978 static inline int locks_verify_truncate(struct inode *inode, struct file *filp, 1978 static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
1979 size_t size) 1979 size_t size)
1980 { 1980 {
1981 return 0; 1981 return 0;
1982 } 1982 }
1983 1983
1984 static inline int break_lease(struct inode *inode, unsigned int mode) 1984 static inline int break_lease(struct inode *inode, unsigned int mode)
1985 { 1985 {
1986 return 0; 1986 return 0;
1987 } 1987 }
1988 1988
1989 #endif /* CONFIG_FILE_LOCKING */ 1989 #endif /* CONFIG_FILE_LOCKING */
1990 1990
1991 /* fs/open.c */ 1991 /* fs/open.c */
1992 struct audit_names; 1992 struct audit_names;
1993 struct filename { 1993 struct filename {
1994 const char *name; /* pointer to actual string */ 1994 const char *name; /* pointer to actual string */
1995 const __user char *uptr; /* original userland pointer */ 1995 const __user char *uptr; /* original userland pointer */
1996 struct audit_names *aname; 1996 struct audit_names *aname;
1997 bool separate; /* should "name" be freed? */ 1997 bool separate; /* should "name" be freed? */
1998 }; 1998 };
1999 1999
2000 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, 2000 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
2001 struct file *filp); 2001 struct file *filp);
2002 extern int do_fallocate(struct file *file, int mode, loff_t offset, 2002 extern int do_fallocate(struct file *file, int mode, loff_t offset,
2003 loff_t len); 2003 loff_t len);
2004 extern long do_sys_open(int dfd, const char __user *filename, int flags, 2004 extern long do_sys_open(int dfd, const char __user *filename, int flags,
2005 umode_t mode); 2005 umode_t mode);
2006 extern struct file *file_open_name(struct filename *, int, umode_t); 2006 extern struct file *file_open_name(struct filename *, int, umode_t);
2007 extern struct file *filp_open(const char *, int, umode_t); 2007 extern struct file *filp_open(const char *, int, umode_t);
2008 extern struct file *file_open_root(struct dentry *, struct vfsmount *, 2008 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
2009 const char *, int); 2009 const char *, int);
2010 extern struct file * dentry_open(const struct path *, int, const struct cred *); 2010 extern struct file * dentry_open(const struct path *, int, const struct cred *);
2011 extern int filp_close(struct file *, fl_owner_t id); 2011 extern int filp_close(struct file *, fl_owner_t id);
2012 2012
2013 extern struct filename *getname(const char __user *); 2013 extern struct filename *getname(const char __user *);
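
getname() hands back a struct filename as described above; the usual syscall pattern, sketched with a hypothetical do_example():

SYSCALL_DEFINE1(example, const char __user *, pathname)
{
	struct filename *name = getname(pathname);
	long err;

	if (IS_ERR(name))
		return PTR_ERR(name);
	err = do_example(name->name);	/* kernel-space copy of the path */
	putname(name);
	return err;
}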
2014 2014
2015 enum { 2015 enum {
2016 FILE_CREATED = 1, 2016 FILE_CREATED = 1,
2017 FILE_OPENED = 2 2017 FILE_OPENED = 2
2018 }; 2018 };
2019 extern int finish_open(struct file *file, struct dentry *dentry, 2019 extern int finish_open(struct file *file, struct dentry *dentry,
2020 int (*open)(struct inode *, struct file *), 2020 int (*open)(struct inode *, struct file *),
2021 int *opened); 2021 int *opened);
2022 extern int finish_no_open(struct file *file, struct dentry *dentry); 2022 extern int finish_no_open(struct file *file, struct dentry *dentry);
2023 2023
2024 /* fs/ioctl.c */ 2024 /* fs/ioctl.c */
2025 2025
2026 extern int ioctl_preallocate(struct file *filp, void __user *argp); 2026 extern int ioctl_preallocate(struct file *filp, void __user *argp);
2027 2027
2028 /* fs/dcache.c */ 2028 /* fs/dcache.c */
2029 extern void __init vfs_caches_init_early(void); 2029 extern void __init vfs_caches_init_early(void);
2030 extern void __init vfs_caches_init(unsigned long); 2030 extern void __init vfs_caches_init(unsigned long);
2031 2031
2032 extern struct kmem_cache *names_cachep; 2032 extern struct kmem_cache *names_cachep;
2033 2033
2034 extern void final_putname(struct filename *name); 2034 extern void final_putname(struct filename *name);
2035 2035
2036 #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) 2036 #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL)
2037 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) 2037 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
2038 #ifndef CONFIG_AUDITSYSCALL 2038 #ifndef CONFIG_AUDITSYSCALL
2039 #define putname(name) final_putname(name) 2039 #define putname(name) final_putname(name)
2040 #else 2040 #else
2041 extern void putname(struct filename *name); 2041 extern void putname(struct filename *name);
2042 #endif 2042 #endif
2043 2043
2044 #ifdef CONFIG_BLOCK 2044 #ifdef CONFIG_BLOCK
2045 extern int register_blkdev(unsigned int, const char *); 2045 extern int register_blkdev(unsigned int, const char *);
2046 extern void unregister_blkdev(unsigned int, const char *); 2046 extern void unregister_blkdev(unsigned int, const char *);
2047 extern struct block_device *bdget(dev_t); 2047 extern struct block_device *bdget(dev_t);
2048 extern struct block_device *bdgrab(struct block_device *bdev); 2048 extern struct block_device *bdgrab(struct block_device *bdev);
2049 extern void bd_set_size(struct block_device *, loff_t size); 2049 extern void bd_set_size(struct block_device *, loff_t size);
2050 extern void bd_forget(struct inode *inode); 2050 extern void bd_forget(struct inode *inode);
2051 extern void bdput(struct block_device *); 2051 extern void bdput(struct block_device *);
2052 extern void invalidate_bdev(struct block_device *); 2052 extern void invalidate_bdev(struct block_device *);
2053 extern void iterate_bdevs(void (*)(struct block_device *, void *), void *); 2053 extern void iterate_bdevs(void (*)(struct block_device *, void *), void *);
2054 extern int sync_blockdev(struct block_device *bdev); 2054 extern int sync_blockdev(struct block_device *bdev);
2055 extern void kill_bdev(struct block_device *); 2055 extern void kill_bdev(struct block_device *);
2056 extern struct super_block *freeze_bdev(struct block_device *); 2056 extern struct super_block *freeze_bdev(struct block_device *);
2057 extern void emergency_thaw_all(void); 2057 extern void emergency_thaw_all(void);
2058 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); 2058 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
2059 extern int fsync_bdev(struct block_device *); 2059 extern int fsync_bdev(struct block_device *);
2060 #else 2060 #else
2061 static inline void bd_forget(struct inode *inode) {} 2061 static inline void bd_forget(struct inode *inode) {}
2062 static inline int sync_blockdev(struct block_device *bdev) { return 0; } 2062 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
2063 static inline void kill_bdev(struct block_device *bdev) {} 2063 static inline void kill_bdev(struct block_device *bdev) {}
2064 static inline void invalidate_bdev(struct block_device *bdev) {} 2064 static inline void invalidate_bdev(struct block_device *bdev) {}
2065 2065
2066 static inline struct super_block *freeze_bdev(struct block_device *sb) 2066 static inline struct super_block *freeze_bdev(struct block_device *sb)
2067 { 2067 {
2068 return NULL; 2068 return NULL;
2069 } 2069 }
2070 2070
2071 static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) 2071 static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb)
2072 { 2072 {
2073 return 0; 2073 return 0;
2074 } 2074 }
2075 2075
2076 static inline void iterate_bdevs(void (*f)(struct block_device *, void *), void *arg) 2076 static inline void iterate_bdevs(void (*f)(struct block_device *, void *), void *arg)
2077 { 2077 {
2078 } 2078 }
2079 #endif 2079 #endif
2080 extern int sync_filesystem(struct super_block *); 2080 extern int sync_filesystem(struct super_block *);
2081 extern const struct file_operations def_blk_fops; 2081 extern const struct file_operations def_blk_fops;
2082 extern const struct file_operations def_chr_fops; 2082 extern const struct file_operations def_chr_fops;
2083 extern const struct file_operations bad_sock_fops; 2083 extern const struct file_operations bad_sock_fops;
2084 extern const struct file_operations def_fifo_fops; 2084 extern const struct file_operations def_fifo_fops;
2085 #ifdef CONFIG_BLOCK 2085 #ifdef CONFIG_BLOCK
2086 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); 2086 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
2087 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); 2087 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
2088 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); 2088 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
2089 extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); 2089 extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
2090 extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, 2090 extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
2091 void *holder); 2091 void *holder);
2092 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, 2092 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
2093 void *holder); 2093 void *holder);
2094 extern int blkdev_put(struct block_device *bdev, fmode_t mode); 2094 extern int blkdev_put(struct block_device *bdev, fmode_t mode);
2095 #ifdef CONFIG_SYSFS 2095 #ifdef CONFIG_SYSFS
2096 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); 2096 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
2097 extern void bd_unlink_disk_holder(struct block_device *bdev, 2097 extern void bd_unlink_disk_holder(struct block_device *bdev,
2098 struct gendisk *disk); 2098 struct gendisk *disk);
2099 #else 2099 #else
2100 static inline int bd_link_disk_holder(struct block_device *bdev, 2100 static inline int bd_link_disk_holder(struct block_device *bdev,
2101 struct gendisk *disk) 2101 struct gendisk *disk)
2102 { 2102 {
2103 return 0; 2103 return 0;
2104 } 2104 }
2105 static inline void bd_unlink_disk_holder(struct block_device *bdev, 2105 static inline void bd_unlink_disk_holder(struct block_device *bdev,
2106 struct gendisk *disk) 2106 struct gendisk *disk)
2107 { 2107 {
2108 } 2108 }
2109 #endif 2109 #endif
2110 #endif 2110 #endif
2111 2111
2112 /* fs/char_dev.c */ 2112 /* fs/char_dev.c */
2113 #define CHRDEV_MAJOR_HASH_SIZE 255 2113 #define CHRDEV_MAJOR_HASH_SIZE 255
2114 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); 2114 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
2115 extern int register_chrdev_region(dev_t, unsigned, const char *); 2115 extern int register_chrdev_region(dev_t, unsigned, const char *);
2116 extern int __register_chrdev(unsigned int major, unsigned int baseminor, 2116 extern int __register_chrdev(unsigned int major, unsigned int baseminor,
2117 unsigned int count, const char *name, 2117 unsigned int count, const char *name,
2118 const struct file_operations *fops); 2118 const struct file_operations *fops);
2119 extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, 2119 extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
2120 unsigned int count, const char *name); 2120 unsigned int count, const char *name);
2121 extern void unregister_chrdev_region(dev_t, unsigned); 2121 extern void unregister_chrdev_region(dev_t, unsigned);
2122 extern void chrdev_show(struct seq_file *,off_t); 2122 extern void chrdev_show(struct seq_file *,off_t);
2123 2123
2124 static inline int register_chrdev(unsigned int major, const char *name, 2124 static inline int register_chrdev(unsigned int major, const char *name,
2125 const struct file_operations *fops) 2125 const struct file_operations *fops)
2126 { 2126 {
2127 return __register_chrdev(major, 0, 256, name, fops); 2127 return __register_chrdev(major, 0, 256, name, fops);
2128 } 2128 }
2129 2129
2130 static inline void unregister_chrdev(unsigned int major, const char *name) 2130 static inline void unregister_chrdev(unsigned int major, const char *name)
2131 { 2131 {
2132 __unregister_chrdev(major, 0, 256, name); 2132 __unregister_chrdev(major, 0, 256, name);
2133 } 2133 }
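
Passing 0 as the major asks for dynamic allocation, in which case the positive return value is the major that was assigned; a hedged module sketch (example_fops assumed):

static int example_major;

static int __init example_init(void)
{
	example_major = register_chrdev(0, "example", &example_fops);
	return example_major < 0 ? example_major : 0;
}

static void __exit example_exit(void)
{
	unregister_chrdev(example_major, "example");
}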
2134 2134
2135 /* fs/block_dev.c */ 2135 /* fs/block_dev.c */
2136 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ 2136 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */
2137 #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ 2137 #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */
2138 2138
2139 #ifdef CONFIG_BLOCK 2139 #ifdef CONFIG_BLOCK
2140 #define BLKDEV_MAJOR_HASH_SIZE 255 2140 #define BLKDEV_MAJOR_HASH_SIZE 255
2141 extern const char *__bdevname(dev_t, char *buffer); 2141 extern const char *__bdevname(dev_t, char *buffer);
2142 extern const char *bdevname(struct block_device *bdev, char *buffer); 2142 extern const char *bdevname(struct block_device *bdev, char *buffer);
2143 extern struct block_device *lookup_bdev(const char *); 2143 extern struct block_device *lookup_bdev(const char *);
2144 extern void blkdev_show(struct seq_file *,off_t); 2144 extern void blkdev_show(struct seq_file *,off_t);
2145 2145
2146 #else 2146 #else
2147 #define BLKDEV_MAJOR_HASH_SIZE 0 2147 #define BLKDEV_MAJOR_HASH_SIZE 0
2148 #endif 2148 #endif
2149 2149
2150 extern void init_special_inode(struct inode *, umode_t, dev_t); 2150 extern void init_special_inode(struct inode *, umode_t, dev_t);
2151 2151
2152 /* Invalid inode operations -- fs/bad_inode.c */ 2152 /* Invalid inode operations -- fs/bad_inode.c */
2153 extern void make_bad_inode(struct inode *); 2153 extern void make_bad_inode(struct inode *);
2154 extern int is_bad_inode(struct inode *); 2154 extern int is_bad_inode(struct inode *);
2155 2155
2156 extern const struct file_operations read_pipefifo_fops; 2156 extern const struct file_operations read_pipefifo_fops;
2157 extern const struct file_operations write_pipefifo_fops; 2157 extern const struct file_operations write_pipefifo_fops;
2158 extern const struct file_operations rdwr_pipefifo_fops; 2158 extern const struct file_operations rdwr_pipefifo_fops;
2159 2159
2160 #ifdef CONFIG_BLOCK 2160 #ifdef CONFIG_BLOCK
2161 /* 2161 /*
2162 * return READ, READA, or WRITE 2162 * return READ, READA, or WRITE
2163 */ 2163 */
2164 #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) 2164 #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK))
2165 2165
2166 /* 2166 /*
2167 * return data direction, READ or WRITE 2167 * return data direction, READ or WRITE
2168 */ 2168 */
2169 #define bio_data_dir(bio) ((bio)->bi_rw & 1) 2169 #define bio_data_dir(bio) ((bio)->bi_rw & 1)
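
A small illustration of the two macros in a completion path (purely a sketch):

static void example_trace_bio(struct bio *bio)
{
	if (bio_data_dir(bio) == WRITE)
		pr_debug("write at sector %llu\n",
			 (unsigned long long)bio->bi_sector);
	else		/* READ and READA both count as reads here */
		pr_debug("read at sector %llu\n",
			 (unsigned long long)bio->bi_sector);
}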
2170 2170
2171 extern void check_disk_size_change(struct gendisk *disk, 2171 extern void check_disk_size_change(struct gendisk *disk,
2172 struct block_device *bdev); 2172 struct block_device *bdev);
2173 extern int revalidate_disk(struct gendisk *); 2173 extern int revalidate_disk(struct gendisk *);
2174 extern int check_disk_change(struct block_device *); 2174 extern int check_disk_change(struct block_device *);
2175 extern int __invalidate_device(struct block_device *, bool); 2175 extern int __invalidate_device(struct block_device *, bool);
2176 extern int invalidate_partition(struct gendisk *, int); 2176 extern int invalidate_partition(struct gendisk *, int);
2177 #endif 2177 #endif
2178 unsigned long invalidate_mapping_pages(struct address_space *mapping, 2178 unsigned long invalidate_mapping_pages(struct address_space *mapping,
2179 pgoff_t start, pgoff_t end); 2179 pgoff_t start, pgoff_t end);
2180 2180
2181 static inline void invalidate_remote_inode(struct inode *inode) 2181 static inline void invalidate_remote_inode(struct inode *inode)
2182 { 2182 {
2183 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2183 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2184 S_ISLNK(inode->i_mode)) 2184 S_ISLNK(inode->i_mode))
2185 invalidate_mapping_pages(inode->i_mapping, 0, -1); 2185 invalidate_mapping_pages(inode->i_mapping, 0, -1);
2186 } 2186 }
2187 extern int invalidate_inode_pages2(struct address_space *mapping); 2187 extern int invalidate_inode_pages2(struct address_space *mapping);
2188 extern int invalidate_inode_pages2_range(struct address_space *mapping, 2188 extern int invalidate_inode_pages2_range(struct address_space *mapping,
2189 pgoff_t start, pgoff_t end); 2189 pgoff_t start, pgoff_t end);
2190 extern int write_inode_now(struct inode *, int); 2190 extern int write_inode_now(struct inode *, int);
2191 extern int filemap_fdatawrite(struct address_space *); 2191 extern int filemap_fdatawrite(struct address_space *);
2192 extern int filemap_flush(struct address_space *); 2192 extern int filemap_flush(struct address_space *);
2193 extern int filemap_fdatawait(struct address_space *); 2193 extern int filemap_fdatawait(struct address_space *);
2194 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, 2194 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
2195 loff_t lend); 2195 loff_t lend);
2196 extern int filemap_write_and_wait(struct address_space *mapping); 2196 extern int filemap_write_and_wait(struct address_space *mapping);
2197 extern int filemap_write_and_wait_range(struct address_space *mapping, 2197 extern int filemap_write_and_wait_range(struct address_space *mapping,
2198 loff_t lstart, loff_t lend); 2198 loff_t lstart, loff_t lend);
2199 extern int __filemap_fdatawrite_range(struct address_space *mapping, 2199 extern int __filemap_fdatawrite_range(struct address_space *mapping,
2200 loff_t start, loff_t end, int sync_mode); 2200 loff_t start, loff_t end, int sync_mode);
2201 extern int filemap_fdatawrite_range(struct address_space *mapping, 2201 extern int filemap_fdatawrite_range(struct address_space *mapping,
2202 loff_t start, loff_t end); 2202 loff_t start, loff_t end);
2203 2203
2204 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, 2204 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
2205 int datasync); 2205 int datasync);
2206 extern int vfs_fsync(struct file *file, int datasync); 2206 extern int vfs_fsync(struct file *file, int datasync);
2207 extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); 2207 extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
2208 extern void emergency_sync(void); 2208 extern void emergency_sync(void);
2209 extern void emergency_remount(void); 2209 extern void emergency_remount(void);
2210 #ifdef CONFIG_BLOCK 2210 #ifdef CONFIG_BLOCK
2211 extern sector_t bmap(struct inode *, sector_t); 2211 extern sector_t bmap(struct inode *, sector_t);
2212 #endif 2212 #endif
2213 extern int notify_change(struct dentry *, struct iattr *); 2213 extern int notify_change(struct dentry *, struct iattr *);
2214 extern int inode_permission(struct inode *, int); 2214 extern int inode_permission(struct inode *, int);
2215 extern int generic_permission(struct inode *, int); 2215 extern int generic_permission(struct inode *, int);
2216 2216
2217 static inline bool execute_ok(struct inode *inode) 2217 static inline bool execute_ok(struct inode *inode)
2218 { 2218 {
2219 return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); 2219 return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
2220 } 2220 }
2221 2221
2222 /* 2222 /*
2223 * get_write_access() gets write permission for a file. 2223 * get_write_access() gets write permission for a file.
2224 * put_write_access() releases this write permission. 2224 * put_write_access() releases this write permission.
2225 * This is used for regular files. 2225 * This is used for regular files.
2226 * We cannot support write (and maybe mmap read-write shared) accesses and 2226 * We cannot support write (and maybe mmap read-write shared) accesses and
2227 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode 2227 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
2228 * can have the following values: 2228 * can have the following values:
2229 * 0: no writers, no VM_DENYWRITE mappings 2229 * 0: no writers, no VM_DENYWRITE mappings
2230 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist 2230 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
2231 * > 0: (i_writecount) users are writing to the file. 2231 * > 0: (i_writecount) users are writing to the file.
2232 * 2232 *
2233 * Normally we operate on that counter with atomic_{inc,dec} and it's safe 2233 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
2234 * except for the cases where we don't hold i_writecount yet. Then we need to 2234 * except for the cases where we don't hold i_writecount yet. Then we need to
2235 * use {get,deny}_write_access() - these functions check the sign and refuse 2235 * use {get,deny}_write_access() - these functions check the sign and refuse
2236 * to do the change if sign is wrong. 2236 * to do the change if sign is wrong.
2237 */ 2237 */
2238 static inline int get_write_access(struct inode *inode) 2238 static inline int get_write_access(struct inode *inode)
2239 { 2239 {
2240 return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; 2240 return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY;
2241 } 2241 }
2242 static inline int deny_write_access(struct file *file) 2242 static inline int deny_write_access(struct file *file)
2243 { 2243 {
2244 struct inode *inode = file->f_path.dentry->d_inode; 2244 struct inode *inode = file->f_path.dentry->d_inode;
2245 return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; 2245 return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
2246 } 2246 }
2247 static inline void put_write_access(struct inode * inode) 2247 static inline void put_write_access(struct inode * inode)
2248 { 2248 {
2249 atomic_dec(&inode->i_writecount); 2249 atomic_dec(&inode->i_writecount);
2250 } 2250 }
2251 static inline void allow_write_access(struct file *file) 2251 static inline void allow_write_access(struct file *file)
2252 { 2252 {
2253 if (file) 2253 if (file)
2254 atomic_inc(&file->f_path.dentry->d_inode->i_writecount); 2254 atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
2255 } 2255 }
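
The sign-based protocol above is what makes the classic ETXTBSY dance work; a hypothetical sketch around an exec-style mapping (do_example_mapping() assumed):

static int example_map_for_exec(struct file *file)
{
	int err = deny_write_access(file);	/* -ETXTBSY if writers exist */

	if (err)
		return err;
	err = do_example_mapping(file);		/* hypothetical */
	allow_write_access(file);		/* pairs with the deny above */
	return err;
}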
2256 #ifdef CONFIG_IMA 2256 #ifdef CONFIG_IMA
2257 static inline void i_readcount_dec(struct inode *inode) 2257 static inline void i_readcount_dec(struct inode *inode)
2258 { 2258 {
2259 BUG_ON(!atomic_read(&inode->i_readcount)); 2259 BUG_ON(!atomic_read(&inode->i_readcount));
2260 atomic_dec(&inode->i_readcount); 2260 atomic_dec(&inode->i_readcount);
2261 } 2261 }
2262 static inline void i_readcount_inc(struct inode *inode) 2262 static inline void i_readcount_inc(struct inode *inode)
2263 { 2263 {
2264 atomic_inc(&inode->i_readcount); 2264 atomic_inc(&inode->i_readcount);
2265 } 2265 }
2266 #else 2266 #else
2267 static inline void i_readcount_dec(struct inode *inode) 2267 static inline void i_readcount_dec(struct inode *inode)
2268 { 2268 {
2269 return; 2269 return;
2270 } 2270 }
2271 static inline void i_readcount_inc(struct inode *inode) 2271 static inline void i_readcount_inc(struct inode *inode)
2272 { 2272 {
2273 return; 2273 return;
2274 } 2274 }
2275 #endif 2275 #endif
2276 extern int do_pipe_flags(int *, int); 2276 extern int do_pipe_flags(int *, int);
2277 2277
2278 extern int kernel_read(struct file *, loff_t, char *, unsigned long); 2278 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
2279 extern struct file * open_exec(const char *); 2279 extern struct file * open_exec(const char *);
2280 2280
2281 /* fs/dcache.c -- generic fs support functions */ 2281 /* fs/dcache.c -- generic fs support functions */
2282 extern int is_subdir(struct dentry *, struct dentry *); 2282 extern int is_subdir(struct dentry *, struct dentry *);
2283 extern int path_is_under(struct path *, struct path *); 2283 extern int path_is_under(struct path *, struct path *);
2284 extern ino_t find_inode_number(struct dentry *, struct qstr *); 2284 extern ino_t find_inode_number(struct dentry *, struct qstr *);
2285 2285
2286 #include <linux/err.h> 2286 #include <linux/err.h>
2287 2287
2288 /* needed for stackable file system support */ 2288 /* needed for stackable file system support */
2289 extern loff_t default_llseek(struct file *file, loff_t offset, int origin); 2289 extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
2290 2290
2291 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); 2291 extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
2292 2292
2293 extern int inode_init_always(struct super_block *, struct inode *); 2293 extern int inode_init_always(struct super_block *, struct inode *);
2294 extern void inode_init_once(struct inode *); 2294 extern void inode_init_once(struct inode *);
2295 extern void address_space_init_once(struct address_space *mapping); 2295 extern void address_space_init_once(struct address_space *mapping);
2296 extern void ihold(struct inode * inode); 2296 extern void ihold(struct inode * inode);
2297 extern void iput(struct inode *); 2297 extern void iput(struct inode *);
2298 extern struct inode * igrab(struct inode *); 2298 extern struct inode * igrab(struct inode *);
2299 extern ino_t iunique(struct super_block *, ino_t); 2299 extern ino_t iunique(struct super_block *, ino_t);
2300 extern int inode_needs_sync(struct inode *inode); 2300 extern int inode_needs_sync(struct inode *inode);
2301 extern int generic_delete_inode(struct inode *inode); 2301 extern int generic_delete_inode(struct inode *inode);
2302 static inline int generic_drop_inode(struct inode *inode) 2302 static inline int generic_drop_inode(struct inode *inode)
2303 { 2303 {
2304 return !inode->i_nlink || inode_unhashed(inode); 2304 return !inode->i_nlink || inode_unhashed(inode);
2305 } 2305 }
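
A filesystem that wants the inode evicted on the final iput() regardless of the hash/nlink test can point ->drop_inode at generic_delete_inode(); sketched:

static const struct super_operations example_sops = {
	.drop_inode = generic_delete_inode,	/* always returns 1: evict now */
};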
2306 2306
2307 extern struct inode *ilookup5_nowait(struct super_block *sb, 2307 extern struct inode *ilookup5_nowait(struct super_block *sb,
2308 unsigned long hashval, int (*test)(struct inode *, void *), 2308 unsigned long hashval, int (*test)(struct inode *, void *),
2309 void *data); 2309 void *data);
2310 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 2310 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
2311 int (*test)(struct inode *, void *), void *data); 2311 int (*test)(struct inode *, void *), void *data);
2312 extern struct inode *ilookup(struct super_block *sb, unsigned long ino); 2312 extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
2313 2313
2314 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); 2314 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
2315 extern struct inode * iget_locked(struct super_block *, unsigned long); 2315 extern struct inode * iget_locked(struct super_block *, unsigned long);
2316 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); 2316 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
2317 extern int insert_inode_locked(struct inode *); 2317 extern int insert_inode_locked(struct inode *);
2318 #ifdef CONFIG_DEBUG_LOCK_ALLOC 2318 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2319 extern void lockdep_annotate_inode_mutex_key(struct inode *inode); 2319 extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
2320 #else 2320 #else
2321 static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; 2321 static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { };
2322 #endif 2322 #endif
2323 extern void unlock_new_inode(struct inode *); 2323 extern void unlock_new_inode(struct inode *);
2324 extern unsigned int get_next_ino(void); 2324 extern unsigned int get_next_ino(void);
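
The I_NEW handshake documented earlier drives the standard lookup pattern; a sketch with a hypothetical example_read_inode():

static struct inode *example_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit, already initialized */

	example_read_inode(inode);	/* hypothetical: fill from disk */
	unlock_new_inode(inode);	/* clears I_NEW and wakes waiters */
	return inode;
}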
2325 2325
2326 extern void __iget(struct inode * inode); 2326 extern void __iget(struct inode * inode);
2327 extern void iget_failed(struct inode *); 2327 extern void iget_failed(struct inode *);
2328 extern void clear_inode(struct inode *); 2328 extern void clear_inode(struct inode *);
2329 extern void __destroy_inode(struct inode *); 2329 extern void __destroy_inode(struct inode *);
2330 extern struct inode *new_inode_pseudo(struct super_block *sb); 2330 extern struct inode *new_inode_pseudo(struct super_block *sb);
2331 extern struct inode *new_inode(struct super_block *sb); 2331 extern struct inode *new_inode(struct super_block *sb);
2332 extern void free_inode_nonrcu(struct inode *inode); 2332 extern void free_inode_nonrcu(struct inode *inode);
2333 extern int should_remove_suid(struct dentry *); 2333 extern int should_remove_suid(struct dentry *);
2334 extern int file_remove_suid(struct file *); 2334 extern int file_remove_suid(struct file *);
2335 2335
2336 extern void __insert_inode_hash(struct inode *, unsigned long hashval); 2336 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
2337 static inline void insert_inode_hash(struct inode *inode) 2337 static inline void insert_inode_hash(struct inode *inode)
2338 { 2338 {
2339 __insert_inode_hash(inode, inode->i_ino); 2339 __insert_inode_hash(inode, inode->i_ino);
2340 } 2340 }
2341 2341
2342 extern void __remove_inode_hash(struct inode *); 2342 extern void __remove_inode_hash(struct inode *);
2343 static inline void remove_inode_hash(struct inode *inode) 2343 static inline void remove_inode_hash(struct inode *inode)
2344 { 2344 {
2345 if (!inode_unhashed(inode)) 2345 if (!inode_unhashed(inode))
2346 __remove_inode_hash(inode); 2346 __remove_inode_hash(inode);
2347 } 2347 }
2348 2348
2349 extern void inode_sb_list_add(struct inode *inode); 2349 extern void inode_sb_list_add(struct inode *inode);
2350 2350
2351 #ifdef CONFIG_BLOCK 2351 #ifdef CONFIG_BLOCK
2352 extern void submit_bio(int, struct bio *); 2352 extern void submit_bio(int, struct bio *);
2353 extern int bdev_read_only(struct block_device *); 2353 extern int bdev_read_only(struct block_device *);
2354 #endif 2354 #endif
2355 extern int set_blocksize(struct block_device *, int); 2355 extern int set_blocksize(struct block_device *, int);
2356 extern int sb_set_blocksize(struct super_block *, int); 2356 extern int sb_set_blocksize(struct super_block *, int);
2357 extern int sb_min_blocksize(struct super_block *, int); 2357 extern int sb_min_blocksize(struct super_block *, int);
2358 2358
2359 extern int generic_file_mmap(struct file *, struct vm_area_struct *); 2359 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
2360 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); 2360 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
2361 extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, 2361 extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
2362 unsigned long size, pgoff_t pgoff); 2362 unsigned long size, pgoff_t pgoff);
2363 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); 2363 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
2364 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); 2364 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
2365 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); 2365 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
2366 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, 2366 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
2367 loff_t *); 2367 loff_t *);
2368 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); 2368 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
2369 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, 2369 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
2370 unsigned long *, loff_t, loff_t *, size_t, size_t); 2370 unsigned long *, loff_t, loff_t *, size_t, size_t);
2371 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, 2371 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
2372 unsigned long, loff_t, loff_t *, size_t, ssize_t); 2372 unsigned long, loff_t, loff_t *, size_t, ssize_t);
2373 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); 2373 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
2374 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); 2374 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
2375 extern int generic_segment_checks(const struct iovec *iov, 2375 extern int generic_segment_checks(const struct iovec *iov,
2376 unsigned long *nr_segs, size_t *count, int access_flags); 2376 unsigned long *nr_segs, size_t *count, int access_flags);
2377 2377
2378 /* fs/block_dev.c */ 2378 /* fs/block_dev.c */
2379 extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, 2379 extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
2380 unsigned long nr_segs, loff_t pos); 2380 unsigned long nr_segs, loff_t pos);
2381 extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, 2381 extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
2382 int datasync); 2382 int datasync);
2383 extern void block_sync_page(struct page *page); 2383 extern void block_sync_page(struct page *page);
2384 2384
2385 /* fs/splice.c */ 2385 /* fs/splice.c */
2386 extern ssize_t generic_file_splice_read(struct file *, loff_t *, 2386 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
2387 struct pipe_inode_info *, size_t, unsigned int); 2387 struct pipe_inode_info *, size_t, unsigned int);
2388 extern ssize_t default_file_splice_read(struct file *, loff_t *, 2388 extern ssize_t default_file_splice_read(struct file *, loff_t *,
2389 struct pipe_inode_info *, size_t, unsigned int); 2389 struct pipe_inode_info *, size_t, unsigned int);
2390 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, 2390 extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
2391 struct file *, loff_t *, size_t, unsigned int); 2391 struct file *, loff_t *, size_t, unsigned int);
2392 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 2392 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
2393 struct file *out, loff_t *, size_t len, unsigned int flags); 2393 struct file *out, loff_t *, size_t len, unsigned int flags);
2394 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 2394 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
2395 size_t len, unsigned int flags); 2395 size_t len, unsigned int flags);
2396 2396
2397 extern void 2397 extern void
2398 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); 2398 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
2399 extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); 2399 extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
2400 extern loff_t no_llseek(struct file *file, loff_t offset, int origin); 2400 extern loff_t no_llseek(struct file *file, loff_t offset, int whence);
2401 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); 2401 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
2402 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, 2402 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
2403 int origin, loff_t maxsize, loff_t eof); 2403 int whence, loff_t maxsize, loff_t eof);
2404 extern int generic_file_open(struct inode * inode, struct file * filp); 2404 extern int generic_file_open(struct inode * inode, struct file * filp);
2405 extern int nonseekable_open(struct inode * inode, struct file * filp); 2405 extern int nonseekable_open(struct inode * inode, struct file * filp);
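
Fitting this commit's theme, a filesystem ->llseek() typically just forwards the whence argument to the size-aware generic helper; a minimal sketch:

static loff_t example_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	return generic_file_llseek_size(file, offset, whence,
					inode->i_sb->s_maxbytes,
					i_size_read(inode));
}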
2406 2406
2407 #ifdef CONFIG_FS_XIP 2407 #ifdef CONFIG_FS_XIP
2408 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, 2408 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
2409 loff_t *ppos); 2409 loff_t *ppos);
2410 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); 2410 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
2411 extern ssize_t xip_file_write(struct file *filp, const char __user *buf, 2411 extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
2412 size_t len, loff_t *ppos); 2412 size_t len, loff_t *ppos);
2413 extern int xip_truncate_page(struct address_space *mapping, loff_t from); 2413 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
2414 #else 2414 #else
2415 static inline int xip_truncate_page(struct address_space *mapping, loff_t from) 2415 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
2416 { 2416 {
2417 return 0; 2417 return 0;
2418 } 2418 }
2419 #endif 2419 #endif
2420 2420
2421 #ifdef CONFIG_BLOCK 2421 #ifdef CONFIG_BLOCK
2422 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, 2422 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
2423 loff_t file_offset); 2423 loff_t file_offset);
2424 2424
2425 enum { 2425 enum {
2426 /* need locking between buffered and direct access */ 2426 /* need locking between buffered and direct access */
2427 DIO_LOCKING = 0x01, 2427 DIO_LOCKING = 0x01,
2428 2428
2429 /* filesystem does not support filling holes */ 2429 /* filesystem does not support filling holes */
2430 DIO_SKIP_HOLES = 0x02, 2430 DIO_SKIP_HOLES = 0x02,
2431 }; 2431 };
2432 2432
2433 void dio_end_io(struct bio *bio, int error); 2433 void dio_end_io(struct bio *bio, int error);
2434 2434
2435 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 2435 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
2436 struct block_device *bdev, const struct iovec *iov, loff_t offset, 2436 struct block_device *bdev, const struct iovec *iov, loff_t offset,
2437 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 2437 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
2438 dio_submit_t submit_io, int flags); 2438 dio_submit_t submit_io, int flags);
2439 2439
2440 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, 2440 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
2441 struct inode *inode, const struct iovec *iov, loff_t offset, 2441 struct inode *inode, const struct iovec *iov, loff_t offset,
2442 unsigned long nr_segs, get_block_t get_block) 2442 unsigned long nr_segs, get_block_t get_block)
2443 { 2443 {
2444 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 2444 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2445 offset, nr_segs, get_block, NULL, NULL, 2445 offset, nr_segs, get_block, NULL, NULL,
2446 DIO_LOCKING | DIO_SKIP_HOLES); 2446 DIO_LOCKING | DIO_SKIP_HOLES);
2447 } 2447 }
2448 #endif 2448 #endif
2449 2449
2450 void inode_dio_wait(struct inode *inode); 2450 void inode_dio_wait(struct inode *inode);
2451 void inode_dio_done(struct inode *inode); 2451 void inode_dio_done(struct inode *inode);
2452 2452
2453 extern const struct file_operations generic_ro_fops; 2453 extern const struct file_operations generic_ro_fops;
2454 2454
2455 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) 2455 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
2456 2456
2457 extern int vfs_readlink(struct dentry *, char __user *, int, const char *); 2457 extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
2458 extern int vfs_follow_link(struct nameidata *, const char *); 2458 extern int vfs_follow_link(struct nameidata *, const char *);
2459 extern int page_readlink(struct dentry *, char __user *, int); 2459 extern int page_readlink(struct dentry *, char __user *, int);
2460 extern void *page_follow_link_light(struct dentry *, struct nameidata *); 2460 extern void *page_follow_link_light(struct dentry *, struct nameidata *);
2461 extern void page_put_link(struct dentry *, struct nameidata *, void *); 2461 extern void page_put_link(struct dentry *, struct nameidata *, void *);
2462 extern int __page_symlink(struct inode *inode, const char *symname, int len, 2462 extern int __page_symlink(struct inode *inode, const char *symname, int len,
2463 int nofs); 2463 int nofs);
2464 extern int page_symlink(struct inode *inode, const char *symname, int len); 2464 extern int page_symlink(struct inode *inode, const char *symname, int len);
2465 extern const struct inode_operations page_symlink_inode_operations; 2465 extern const struct inode_operations page_symlink_inode_operations;
2466 extern int generic_readlink(struct dentry *, char __user *, int); 2466 extern int generic_readlink(struct dentry *, char __user *, int);
2467 extern void generic_fillattr(struct inode *, struct kstat *); 2467 extern void generic_fillattr(struct inode *, struct kstat *);
2468 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 2468 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
2469 void __inode_add_bytes(struct inode *inode, loff_t bytes); 2469 void __inode_add_bytes(struct inode *inode, loff_t bytes);
2470 void inode_add_bytes(struct inode *inode, loff_t bytes); 2470 void inode_add_bytes(struct inode *inode, loff_t bytes);
2471 void inode_sub_bytes(struct inode *inode, loff_t bytes); 2471 void inode_sub_bytes(struct inode *inode, loff_t bytes);
2472 loff_t inode_get_bytes(struct inode *inode); 2472 loff_t inode_get_bytes(struct inode *inode);
2473 void inode_set_bytes(struct inode *inode, loff_t bytes); 2473 void inode_set_bytes(struct inode *inode, loff_t bytes);
2474 2474
2475 extern int vfs_readdir(struct file *, filldir_t, void *); 2475 extern int vfs_readdir(struct file *, filldir_t, void *);
2476 2476
2477 extern int vfs_stat(const char __user *, struct kstat *); 2477 extern int vfs_stat(const char __user *, struct kstat *);
2478 extern int vfs_lstat(const char __user *, struct kstat *); 2478 extern int vfs_lstat(const char __user *, struct kstat *);
2479 extern int vfs_fstat(unsigned int, struct kstat *); 2479 extern int vfs_fstat(unsigned int, struct kstat *);
2480 extern int vfs_fstatat(int, const char __user *, struct kstat *, int); 2480 extern int vfs_fstatat(int, const char __user *, struct kstat *, int);
2481 2481
2482 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, 2482 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
2483 unsigned long arg); 2483 unsigned long arg);
2484 extern int __generic_block_fiemap(struct inode *inode, 2484 extern int __generic_block_fiemap(struct inode *inode,
2485 struct fiemap_extent_info *fieinfo, 2485 struct fiemap_extent_info *fieinfo,
2486 loff_t start, loff_t len, 2486 loff_t start, loff_t len,
2487 get_block_t *get_block); 2487 get_block_t *get_block);
2488 extern int generic_block_fiemap(struct inode *inode, 2488 extern int generic_block_fiemap(struct inode *inode,
2489 struct fiemap_extent_info *fieinfo, u64 start, 2489 struct fiemap_extent_info *fieinfo, u64 start,
2490 u64 len, get_block_t *get_block); 2490 u64 len, get_block_t *get_block);
2491 2491
2492 extern void get_filesystem(struct file_system_type *fs); 2492 extern void get_filesystem(struct file_system_type *fs);
2493 extern void put_filesystem(struct file_system_type *fs); 2493 extern void put_filesystem(struct file_system_type *fs);
2494 extern struct file_system_type *get_fs_type(const char *name); 2494 extern struct file_system_type *get_fs_type(const char *name);
2495 extern struct super_block *get_super(struct block_device *); 2495 extern struct super_block *get_super(struct block_device *);
2496 extern struct super_block *get_super_thawed(struct block_device *); 2496 extern struct super_block *get_super_thawed(struct block_device *);
2497 extern struct super_block *get_active_super(struct block_device *bdev); 2497 extern struct super_block *get_active_super(struct block_device *bdev);
2498 extern void drop_super(struct super_block *sb); 2498 extern void drop_super(struct super_block *sb);
2499 extern void iterate_supers(void (*)(struct super_block *, void *), void *); 2499 extern void iterate_supers(void (*)(struct super_block *, void *), void *);
2500 extern void iterate_supers_type(struct file_system_type *, 2500 extern void iterate_supers_type(struct file_system_type *,
2501 void (*)(struct super_block *, void *), void *); 2501 void (*)(struct super_block *, void *), void *);
2502 2502
2503 extern int dcache_dir_open(struct inode *, struct file *); 2503 extern int dcache_dir_open(struct inode *, struct file *);
2504 extern int dcache_dir_close(struct inode *, struct file *); 2504 extern int dcache_dir_close(struct inode *, struct file *);
2505 extern loff_t dcache_dir_lseek(struct file *, loff_t, int); 2505 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
2506 extern int dcache_readdir(struct file *, void *, filldir_t); 2506 extern int dcache_readdir(struct file *, void *, filldir_t);
2507 extern int simple_setattr(struct dentry *, struct iattr *); 2507 extern int simple_setattr(struct dentry *, struct iattr *);
2508 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); 2508 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
2509 extern int simple_statfs(struct dentry *, struct kstatfs *); 2509 extern int simple_statfs(struct dentry *, struct kstatfs *);
2510 extern int simple_open(struct inode *inode, struct file *file); 2510 extern int simple_open(struct inode *inode, struct file *file);
2511 extern int simple_link(struct dentry *, struct inode *, struct dentry *); 2511 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
2512 extern int simple_unlink(struct inode *, struct dentry *); 2512 extern int simple_unlink(struct inode *, struct dentry *);
2513 extern int simple_rmdir(struct inode *, struct dentry *); 2513 extern int simple_rmdir(struct inode *, struct dentry *);
2514 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); 2514 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
2515 extern int noop_fsync(struct file *, loff_t, loff_t, int); 2515 extern int noop_fsync(struct file *, loff_t, loff_t, int);
2516 extern int simple_empty(struct dentry *); 2516 extern int simple_empty(struct dentry *);
2517 extern int simple_readpage(struct file *file, struct page *page); 2517 extern int simple_readpage(struct file *file, struct page *page);
2518 extern int simple_write_begin(struct file *file, struct address_space *mapping, 2518 extern int simple_write_begin(struct file *file, struct address_space *mapping,
2519 loff_t pos, unsigned len, unsigned flags, 2519 loff_t pos, unsigned len, unsigned flags,
2520 struct page **pagep, void **fsdata); 2520 struct page **pagep, void **fsdata);
2521 extern int simple_write_end(struct file *file, struct address_space *mapping, 2521 extern int simple_write_end(struct file *file, struct address_space *mapping,
2522 loff_t pos, unsigned len, unsigned copied, 2522 loff_t pos, unsigned len, unsigned copied,
2523 struct page *page, void *fsdata); 2523 struct page *page, void *fsdata);
2524 2524
2525 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); 2525 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
2526 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); 2526 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
2527 extern const struct file_operations simple_dir_operations; 2527 extern const struct file_operations simple_dir_operations;
2528 extern const struct inode_operations simple_dir_inode_operations; 2528 extern const struct inode_operations simple_dir_inode_operations;
2529 struct tree_descr { char *name; const struct file_operations *ops; int mode; }; 2529 struct tree_descr { char *name; const struct file_operations *ops; int mode; };
2530 struct dentry *d_alloc_name(struct dentry *, const char *); 2530 struct dentry *d_alloc_name(struct dentry *, const char *);
2531 extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *); 2531 extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *);
2532 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); 2532 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
2533 extern void simple_release_fs(struct vfsmount **mount, int *count); 2533 extern void simple_release_fs(struct vfsmount **mount, int *count);
2534 2534
2535 extern ssize_t simple_read_from_buffer(void __user *to, size_t count, 2535 extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
2536 loff_t *ppos, const void *from, size_t available); 2536 loff_t *ppos, const void *from, size_t available);
2537 extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, 2537 extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
2538 const void __user *from, size_t count); 2538 const void __user *from, size_t count);
2539 2539
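simple_read_from_buffer() copies from a kernel buffer to user space starting at *ppos, advancing *ppos and clamping to the available size, which is exactly what a trivial read file operation needs. A minimal sketch (hypothetical hello_read):

static ssize_t hello_read(struct file *file, char __user *buf,
			  size_t count, loff_t *ppos)
{
	static const char msg[] = "hello\n";

	/* copies at most count bytes from msg + *ppos, advances *ppos */
	return simple_read_from_buffer(buf, count, ppos, msg, sizeof(msg) - 1);
}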
2540 extern int generic_file_fsync(struct file *, loff_t, loff_t, int); 2540 extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
2541 2541
2542 extern int generic_check_addressable(unsigned, u64); 2542 extern int generic_check_addressable(unsigned, u64);
2543 2543
2544 #ifdef CONFIG_MIGRATION 2544 #ifdef CONFIG_MIGRATION
2545 extern int buffer_migrate_page(struct address_space *, 2545 extern int buffer_migrate_page(struct address_space *,
2546 struct page *, struct page *, 2546 struct page *, struct page *,
2547 enum migrate_mode); 2547 enum migrate_mode);
2548 #else 2548 #else
2549 #define buffer_migrate_page NULL 2549 #define buffer_migrate_page NULL
2550 #endif 2550 #endif
2551 2551
2552 extern int inode_change_ok(const struct inode *, struct iattr *); 2552 extern int inode_change_ok(const struct inode *, struct iattr *);
2553 extern int inode_newsize_ok(const struct inode *, loff_t offset); 2553 extern int inode_newsize_ok(const struct inode *, loff_t offset);
2554 extern void setattr_copy(struct inode *inode, const struct iattr *attr); 2554 extern void setattr_copy(struct inode *inode, const struct iattr *attr);
2555 2555
2556 extern int file_update_time(struct file *file); 2556 extern int file_update_time(struct file *file);
2557 2557
2558 extern int generic_show_options(struct seq_file *m, struct dentry *root); 2558 extern int generic_show_options(struct seq_file *m, struct dentry *root);
2559 extern void save_mount_options(struct super_block *sb, char *options); 2559 extern void save_mount_options(struct super_block *sb, char *options);
2560 extern void replace_mount_options(struct super_block *sb, char *options); 2560 extern void replace_mount_options(struct super_block *sb, char *options);
2561 2561
2562 static inline ino_t parent_ino(struct dentry *dentry) 2562 static inline ino_t parent_ino(struct dentry *dentry)
2563 { 2563 {
2564 ino_t res; 2564 ino_t res;
2565 2565
2566 /* 2566 /*
2567 * Don't strictly need d_lock here? If the parent ino could change 2567 * Don't strictly need d_lock here? If the parent ino could change
2568 * then surely we'd have a deeper race in the caller? 2568 * then surely we'd have a deeper race in the caller?
2569 */ 2569 */
2570 spin_lock(&dentry->d_lock); 2570 spin_lock(&dentry->d_lock);
2571 res = dentry->d_parent->d_inode->i_ino; 2571 res = dentry->d_parent->d_inode->i_ino;
2572 spin_unlock(&dentry->d_lock); 2572 spin_unlock(&dentry->d_lock);
2573 return res; 2573 return res;
2574 } 2574 }
2575 2575
2576 /* Transaction based IO helpers */ 2576 /* Transaction based IO helpers */
2577 2577
2578 /* 2578 /*
2579 * An argresp is stored in an allocated page and holds the 2579 * An argresp is stored in an allocated page and holds the
2580 * size of the argument or response, along with its content 2580 * size of the argument or response, along with its content
2581 */ 2581 */
2582 struct simple_transaction_argresp { 2582 struct simple_transaction_argresp {
2583 ssize_t size; 2583 ssize_t size;
2584 char data[0]; 2584 char data[0];
2585 }; 2585 };
2586 2586
2587 #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) 2587 #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))
2588 2588
2589 char *simple_transaction_get(struct file *file, const char __user *buf, 2589 char *simple_transaction_get(struct file *file, const char __user *buf,
2590 size_t size); 2590 size_t size);
2591 ssize_t simple_transaction_read(struct file *file, char __user *buf, 2591 ssize_t simple_transaction_read(struct file *file, char __user *buf,
2592 size_t size, loff_t *pos); 2592 size_t size, loff_t *pos);
2593 int simple_transaction_release(struct inode *inode, struct file *file); 2593 int simple_transaction_release(struct inode *inode, struct file *file);
2594 2594
2595 void simple_transaction_set(struct file *file, size_t n); 2595 void simple_transaction_set(struct file *file, size_t n);
2596 2596
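A write handler typically calls simple_transaction_get() to copy the user buffer into the page-sized argresp, processes the request in place, and records the response size with simple_transaction_set(); a later read then returns the response through simple_transaction_read(). A minimal sketch (hypothetical trans_write, replying "ok" to every request):

static ssize_t trans_write(struct file *file, const char __user *buf,
			   size_t size, loff_t *pos)
{
	char *data = simple_transaction_get(file, buf, size);

	if (IS_ERR(data))
		return PTR_ERR(data);

	/* the request is now in data[]; build the reply in place */
	strcpy(data, "ok");
	simple_transaction_set(file, strlen(data));
	return size;
}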
2597 /* 2597 /*
2598 * simple attribute files 2598 * simple attribute files
2599 * 2599 *
2600 * These attributes behave similarly to those in sysfs: 2600 * These attributes behave similarly to those in sysfs:
2601 * 2601 *
2602 * Writing to an attribute immediately sets a value; an open file can be 2602 * Writing to an attribute immediately sets a value; an open file can be
2603 * written to multiple times. 2603 * written to multiple times.
2604 * 2604 *
2605 * Reading from an attribute creates a buffer from the value that might get 2605 * Reading from an attribute creates a buffer from the value that might get
2606 * read with multiple read calls. When the attribute has been read 2606 * read with multiple read calls. When the attribute has been read
2607 * completely, no further read calls are possible until the file is opened 2607 * completely, no further read calls are possible until the file is opened
2608 * again. 2608 * again.
2609 * 2609 *
2610 * All attributes contain a text representation of a numeric value 2610 * All attributes contain a text representation of a numeric value
2611 * that is accessed with the get() and set() functions. 2611 * that is accessed with the get() and set() functions.
2612 */ 2612 */
2613 #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ 2613 #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \
2614 static int __fops ## _open(struct inode *inode, struct file *file) \ 2614 static int __fops ## _open(struct inode *inode, struct file *file) \
2615 { \ 2615 { \
2616 __simple_attr_check_format(__fmt, 0ull); \ 2616 __simple_attr_check_format(__fmt, 0ull); \
2617 return simple_attr_open(inode, file, __get, __set, __fmt); \ 2617 return simple_attr_open(inode, file, __get, __set, __fmt); \
2618 } \ 2618 } \
2619 static const struct file_operations __fops = { \ 2619 static const struct file_operations __fops = { \
2620 .owner = THIS_MODULE, \ 2620 .owner = THIS_MODULE, \
2621 .open = __fops ## _open, \ 2621 .open = __fops ## _open, \
2622 .release = simple_attr_release, \ 2622 .release = simple_attr_release, \
2623 .read = simple_attr_read, \ 2623 .read = simple_attr_read, \
2624 .write = simple_attr_write, \ 2624 .write = simple_attr_write, \
2625 .llseek = generic_file_llseek, \ 2625 .llseek = generic_file_llseek, \
2626 }; 2626 };
2627 2627
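In practice a user supplies only the get() and set() callbacks and lets the macro generate the file_operations, which is then usually handed to debugfs_create_file(). A minimal sketch (hypothetical threshold attribute):

static u64 threshold;

static int threshold_get(void *data, u64 *val)
{
	*val = threshold;
	return 0;
}

static int threshold_set(void *data, u64 val)
{
	threshold = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(threshold_fops, threshold_get, threshold_set, "%llu\n");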
2628 static inline __printf(1, 2) 2628 static inline __printf(1, 2)
2629 void __simple_attr_check_format(const char *fmt, ...) 2629 void __simple_attr_check_format(const char *fmt, ...)
2630 { 2630 {
2631 /* don't do anything, just let the compiler check the arguments; */ 2631 /* don't do anything, just let the compiler check the arguments; */
2632 } 2632 }
2633 2633
2634 int simple_attr_open(struct inode *inode, struct file *file, 2634 int simple_attr_open(struct inode *inode, struct file *file,
2635 int (*get)(void *, u64 *), int (*set)(void *, u64), 2635 int (*get)(void *, u64 *), int (*set)(void *, u64),
2636 const char *fmt); 2636 const char *fmt);
2637 int simple_attr_release(struct inode *inode, struct file *file); 2637 int simple_attr_release(struct inode *inode, struct file *file);
2638 ssize_t simple_attr_read(struct file *file, char __user *buf, 2638 ssize_t simple_attr_read(struct file *file, char __user *buf,
2639 size_t len, loff_t *ppos); 2639 size_t len, loff_t *ppos);
2640 ssize_t simple_attr_write(struct file *file, const char __user *buf, 2640 ssize_t simple_attr_write(struct file *file, const char __user *buf,
2641 size_t len, loff_t *ppos); 2641 size_t len, loff_t *ppos);
2642 2642
2643 struct ctl_table; 2643 struct ctl_table;
2644 int proc_nr_files(struct ctl_table *table, int write, 2644 int proc_nr_files(struct ctl_table *table, int write,
2645 void __user *buffer, size_t *lenp, loff_t *ppos); 2645 void __user *buffer, size_t *lenp, loff_t *ppos);
2646 int proc_nr_dentry(struct ctl_table *table, int write, 2646 int proc_nr_dentry(struct ctl_table *table, int write,
2647 void __user *buffer, size_t *lenp, loff_t *ppos); 2647 void __user *buffer, size_t *lenp, loff_t *ppos);
2648 int proc_nr_inodes(struct ctl_table *table, int write, 2648 int proc_nr_inodes(struct ctl_table *table, int write,
2649 void __user *buffer, size_t *lenp, loff_t *ppos); 2649 void __user *buffer, size_t *lenp, loff_t *ppos);
2650 int __init get_filesystem_list(char *buf); 2650 int __init get_filesystem_list(char *buf);
2651 2651
2652 #define __FMODE_EXEC ((__force int) FMODE_EXEC) 2652 #define __FMODE_EXEC ((__force int) FMODE_EXEC)
2653 #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) 2653 #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY)
2654 2654
2655 #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) 2655 #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
2656 #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ 2656 #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
2657 (flag & __FMODE_NONOTIFY))) 2657 (flag & __FMODE_NONOTIFY)))
2658 2658
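ACC_MODE() treats the octal string as a four-entry lookup table indexed by the O_ACCMODE bits of the open flags:

/*
 * ACC_MODE(O_RDONLY) == 04  (MAY_READ)
 * ACC_MODE(O_WRONLY) == 02  (MAY_WRITE)
 * ACC_MODE(O_RDWR)   == 06  (MAY_READ | MAY_WRITE)
 * index 3 is an invalid open mode but maps to 06 as well
 */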
2659 static inline int is_sxid(umode_t mode) 2659 static inline int is_sxid(umode_t mode)
2660 { 2660 {
2661 return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); 2661 return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
2662 } 2662 }
2663 2663
2664 static inline void inode_has_no_xattr(struct inode *inode) 2664 static inline void inode_has_no_xattr(struct inode *inode)
2665 { 2665 {
2666 if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) 2666 if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC))
2667 inode->i_flags |= S_NOSEC; 2667 inode->i_flags |= S_NOSEC;
2668 } 2668 }
2669 2669
2670 #endif /* _LINUX_FS_H */ 2670 #endif /* _LINUX_FS_H */
2671 2671
include/linux/ftrace.h
1 /* 1 /*
2 * Ftrace header. For implementation details beyond the random comments 2 * Ftrace header. For implementation details beyond the random comments
3 * scattered below, see: Documentation/trace/ftrace-design.txt 3 * scattered below, see: Documentation/trace/ftrace-design.txt
4 */ 4 */
5 5
6 #ifndef _LINUX_FTRACE_H 6 #ifndef _LINUX_FTRACE_H
7 #define _LINUX_FTRACE_H 7 #define _LINUX_FTRACE_H
8 8
9 #include <linux/trace_clock.h> 9 #include <linux/trace_clock.h>
10 #include <linux/kallsyms.h> 10 #include <linux/kallsyms.h>
11 #include <linux/linkage.h> 11 #include <linux/linkage.h>
12 #include <linux/bitops.h> 12 #include <linux/bitops.h>
13 #include <linux/ptrace.h> 13 #include <linux/ptrace.h>
14 #include <linux/ktime.h> 14 #include <linux/ktime.h>
15 #include <linux/sched.h> 15 #include <linux/sched.h>
16 #include <linux/types.h> 16 #include <linux/types.h>
17 #include <linux/init.h> 17 #include <linux/init.h>
18 #include <linux/fs.h> 18 #include <linux/fs.h>
19 19
20 #include <asm/ftrace.h> 20 #include <asm/ftrace.h>
21 21
22 /* 22 /*
23 * If the arch supports passing the variable contents of 23 * If the arch supports passing the variable contents of
24 * function_trace_op as the third parameter back from the 24 * function_trace_op as the third parameter back from the
25 * mcount call, then the arch should define this as 1. 25 * mcount call, then the arch should define this as 1.
26 */ 26 */
27 #ifndef ARCH_SUPPORTS_FTRACE_OPS 27 #ifndef ARCH_SUPPORTS_FTRACE_OPS
28 #define ARCH_SUPPORTS_FTRACE_OPS 0 28 #define ARCH_SUPPORTS_FTRACE_OPS 0
29 #endif 29 #endif
30 30
31 /* 31 /*
32 * If the arch's mcount caller does not support all of ftrace's 32 * If the arch's mcount caller does not support all of ftrace's
33 * features, then it must call an indirect function that 33 * features, then it must call an indirect function that
34 * does, or at least does enough to prevent any unwelcome side effects. 34 * does, or at least does enough to prevent any unwelcome side effects.
35 */ 35 */
36 #if !defined(CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST) || \ 36 #if !defined(CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST) || \
37 !ARCH_SUPPORTS_FTRACE_OPS 37 !ARCH_SUPPORTS_FTRACE_OPS
38 # define FTRACE_FORCE_LIST_FUNC 1 38 # define FTRACE_FORCE_LIST_FUNC 1
39 #else 39 #else
40 # define FTRACE_FORCE_LIST_FUNC 0 40 # define FTRACE_FORCE_LIST_FUNC 0
41 #endif 41 #endif
42 42
43 43
44 struct module; 44 struct module;
45 struct ftrace_hash; 45 struct ftrace_hash;
46 46
47 #ifdef CONFIG_FUNCTION_TRACER 47 #ifdef CONFIG_FUNCTION_TRACER
48 48
49 extern int ftrace_enabled; 49 extern int ftrace_enabled;
50 extern int 50 extern int
51 ftrace_enable_sysctl(struct ctl_table *table, int write, 51 ftrace_enable_sysctl(struct ctl_table *table, int write,
52 void __user *buffer, size_t *lenp, 52 void __user *buffer, size_t *lenp,
53 loff_t *ppos); 53 loff_t *ppos);
54 54
55 struct ftrace_ops; 55 struct ftrace_ops;
56 56
57 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, 57 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
58 struct ftrace_ops *op, struct pt_regs *regs); 58 struct ftrace_ops *op, struct pt_regs *regs);
59 59
60 /* 60 /*
61 * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are 61 * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are
62 * set in the flags member. 62 * set in the flags member.
63 * 63 *
64 * ENABLED - set/unset when ftrace_ops is registered/unregistered 64 * ENABLED - set/unset when ftrace_ops is registered/unregistered
65 * GLOBAL - set manually by ftrace_ops user to denote the ftrace_ops 65 * GLOBAL - set manually by ftrace_ops user to denote the ftrace_ops
66 * is part of the global tracers sharing the same filter 66 * is part of the global tracers sharing the same filter
67 * via set_ftrace_* debugfs files. 67 * via set_ftrace_* debugfs files.
68 * DYNAMIC - set when ftrace_ops is registered to denote dynamically 68 * DYNAMIC - set when ftrace_ops is registered to denote dynamically
69 * allocated ftrace_ops which need special care 69 * allocated ftrace_ops which need special care
70 * CONTROL - set manually by ftrace_ops user to denote the ftrace_ops 70 * CONTROL - set manually by ftrace_ops user to denote the ftrace_ops
71 * could be controlled by the following calls: 71 * could be controlled by the following calls:
72 * ftrace_function_local_enable 72 * ftrace_function_local_enable
73 * ftrace_function_local_disable 73 * ftrace_function_local_disable
74 * SAVE_REGS - The ftrace_ops wants regs saved at each function called 74 * SAVE_REGS - The ftrace_ops wants regs saved at each function called
75 * and passed to the callback. If this flag is set, but the 75 * and passed to the callback. If this flag is set, but the
76 * architecture does not support passing regs 76 * architecture does not support passing regs
77 * (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the 77 * (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the
78 * ftrace_ops will fail to register, unless the next flag 78 * ftrace_ops will fail to register, unless the next flag
79 * is set. 79 * is set.
80 * SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the 80 * SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the
81 * handler can handle an arch that does not save regs 81 * handler can handle an arch that does not save regs
82 * (the handler tests if regs == NULL), then it can set 82 * (the handler tests if regs == NULL), then it can set
83 * this flag instead. It will not fail registering the ftrace_ops, 83 * this flag instead. It will not fail registering the ftrace_ops,
84 * but the regs field will be NULL if the arch does not support 84 * but the regs field will be NULL if the arch does not support
85 * passing regs to the handler. 85 * passing regs to the handler.
86 * Note, if this flag is set, the SAVE_REGS flag will automatically 86 * Note, if this flag is set, the SAVE_REGS flag will automatically
87 * get set upon registering the ftrace_ops, if the arch supports it. 87 * get set upon registering the ftrace_ops, if the arch supports it.
88 * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure 88 * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure
89 * that the callback has its own recursion protection. If it does 89 * that the callback has its own recursion protection. If it does
90 * not set this, then the ftrace infrastructure will add recursion 90 * not set this, then the ftrace infrastructure will add recursion
91 * protection for the caller. 91 * protection for the caller.
92 */ 92 */
93 enum { 93 enum {
94 FTRACE_OPS_FL_ENABLED = 1 << 0, 94 FTRACE_OPS_FL_ENABLED = 1 << 0,
95 FTRACE_OPS_FL_GLOBAL = 1 << 1, 95 FTRACE_OPS_FL_GLOBAL = 1 << 1,
96 FTRACE_OPS_FL_DYNAMIC = 1 << 2, 96 FTRACE_OPS_FL_DYNAMIC = 1 << 2,
97 FTRACE_OPS_FL_CONTROL = 1 << 3, 97 FTRACE_OPS_FL_CONTROL = 1 << 3,
98 FTRACE_OPS_FL_SAVE_REGS = 1 << 4, 98 FTRACE_OPS_FL_SAVE_REGS = 1 << 4,
99 FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, 99 FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5,
100 FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, 100 FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6,
101 }; 101 };
102 102
103 struct ftrace_ops { 103 struct ftrace_ops {
104 ftrace_func_t func; 104 ftrace_func_t func;
105 struct ftrace_ops *next; 105 struct ftrace_ops *next;
106 unsigned long flags; 106 unsigned long flags;
107 int __percpu *disabled; 107 int __percpu *disabled;
108 #ifdef CONFIG_DYNAMIC_FTRACE 108 #ifdef CONFIG_DYNAMIC_FTRACE
109 struct ftrace_hash *notrace_hash; 109 struct ftrace_hash *notrace_hash;
110 struct ftrace_hash *filter_hash; 110 struct ftrace_hash *filter_hash;
111 #endif 111 #endif
112 }; 112 };
113 113
114 extern int function_trace_stop; 114 extern int function_trace_stop;
115 115
116 /* 116 /*
117 * Type of the current tracing. 117 * Type of the current tracing.
118 */ 118 */
119 enum ftrace_tracing_type_t { 119 enum ftrace_tracing_type_t {
120 FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */ 120 FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */
121 FTRACE_TYPE_RETURN, /* Hook the return of the function */ 121 FTRACE_TYPE_RETURN, /* Hook the return of the function */
122 }; 122 };
123 123
124 /* Current tracing type, default is FTRACE_TYPE_ENTER */ 124 /* Current tracing type, default is FTRACE_TYPE_ENTER */
125 extern enum ftrace_tracing_type_t ftrace_tracing_type; 125 extern enum ftrace_tracing_type_t ftrace_tracing_type;
126 126
127 /** 127 /**
128 * ftrace_stop - stop function tracer. 128 * ftrace_stop - stop function tracer.
129 * 129 *
130 * A quick way to stop the function tracer. Note this is an on/off switch; 130 * A quick way to stop the function tracer. Note this is an on/off switch;
131 * it is not something that is recursive like preempt_disable. 131 * it is not something that is recursive like preempt_disable.
132 * This does not disable the calling of mcount, it only stops the 132 * This does not disable the calling of mcount, it only stops the
133 * calling of functions from mcount. 133 * calling of functions from mcount.
134 */ 134 */
135 static inline void ftrace_stop(void) 135 static inline void ftrace_stop(void)
136 { 136 {
137 function_trace_stop = 1; 137 function_trace_stop = 1;
138 } 138 }
139 139
140 /** 140 /**
141 * ftrace_start - start the function tracer. 141 * ftrace_start - start the function tracer.
142 * 142 *
143 * This function is the inverse of ftrace_stop. This does not enable 143 * This function is the inverse of ftrace_stop. This does not enable
144 * function tracing if the function tracer is disabled. This only 144 * function tracing if the function tracer is disabled. This only
145 * sets the function tracer flag to continue calling the functions 145 * sets the function tracer flag to continue calling the functions
146 * from mcount. 146 * from mcount.
147 */ 147 */
148 static inline void ftrace_start(void) 148 static inline void ftrace_start(void)
149 { 149 {
150 function_trace_stop = 0; 150 function_trace_stop = 0;
151 } 151 }
152 152
153 /* 153 /*
154 * The ftrace_ops must be static and should also 154 * The ftrace_ops must be static and should also
155 * be read_mostly. These functions do modify read_mostly variables 155 * be read_mostly. These functions do modify read_mostly variables
156 * so use them sparingly. Never free an ftrace_ops or modify the 156 * so use them sparingly. Never free an ftrace_ops or modify the
157 * next pointer after it has been registered. Even after unregistering 157 * next pointer after it has been registered. Even after unregistering
158 * it, the next pointer may still be used internally. 158 * it, the next pointer may still be used internally.
159 */ 159 */
160 int register_ftrace_function(struct ftrace_ops *ops); 160 int register_ftrace_function(struct ftrace_ops *ops);
161 int unregister_ftrace_function(struct ftrace_ops *ops); 161 int unregister_ftrace_function(struct ftrace_ops *ops);
162 void clear_ftrace_function(void); 162 void clear_ftrace_function(void);
163 163
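Registering a callback therefore looks like the sketch below, with the ftrace_ops in static storage and marked __read_mostly (hypothetical sample_ops; the callback signature matches ftrace_func_t above):

static void sample_trace_func(unsigned long ip, unsigned long parent_ip,
			      struct ftrace_ops *op, struct pt_regs *regs)
{
	/* called on entry to every traced function */
}

static struct ftrace_ops sample_ops __read_mostly = {
	.func	= sample_trace_func,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};

/* in init code: register_ftrace_function(&sample_ops); */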
164 /** 164 /**
165 * ftrace_function_local_enable - enable controlled ftrace_ops on current cpu 165 * ftrace_function_local_enable - enable controlled ftrace_ops on current cpu
166 * 166 *
167 * This function enables tracing on current cpu by decreasing 167 * This function enables tracing on current cpu by decreasing
168 * the per cpu control variable. 168 * the per cpu control variable.
169 * It must be called with preemption disabled and only on ftrace_ops 169 * It must be called with preemption disabled and only on ftrace_ops
170 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption 170 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption
171 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. 171 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
172 */ 172 */
173 static inline void ftrace_function_local_enable(struct ftrace_ops *ops) 173 static inline void ftrace_function_local_enable(struct ftrace_ops *ops)
174 { 174 {
175 if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) 175 if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)))
176 return; 176 return;
177 177
178 (*this_cpu_ptr(ops->disabled))--; 178 (*this_cpu_ptr(ops->disabled))--;
179 } 179 }
180 180
181 /** 181 /**
182 * ftrace_function_local_disable - disable controlled ftrace_ops on current cpu 182 * ftrace_function_local_disable - disable controlled ftrace_ops on current cpu
183 * 183 *
184 * This function disables tracing on current cpu by increasing 184 * This function disables tracing on current cpu by increasing
185 * the per cpu control variable. 185 * the per cpu control variable.
186 * It must be called with preemption disabled and only on ftrace_ops 186 * It must be called with preemption disabled and only on ftrace_ops
187 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption 187 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption
188 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. 188 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
189 */ 189 */
190 static inline void ftrace_function_local_disable(struct ftrace_ops *ops) 190 static inline void ftrace_function_local_disable(struct ftrace_ops *ops)
191 { 191 {
192 if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) 192 if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)))
193 return; 193 return;
194 194
195 (*this_cpu_ptr(ops->disabled))++; 195 (*this_cpu_ptr(ops->disabled))++;
196 } 196 }
197 197
198 /** 198 /**
199 * ftrace_function_local_disabled - returns ftrace_ops disabled value 199 * ftrace_function_local_disabled - returns ftrace_ops disabled value
200 * on current cpu 200 * on current cpu
201 * 201 *
202 * This function returns value of ftrace_ops::disabled on current cpu. 202 * This function returns value of ftrace_ops::disabled on current cpu.
203 * It must be called with preemption disabled and only on ftrace_ops 203 * It must be called with preemption disabled and only on ftrace_ops
204 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption 204 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption
205 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. 205 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
206 */ 206 */
207 static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) 207 static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
208 { 208 {
209 WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)); 209 WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL));
210 return *this_cpu_ptr(ops->disabled); 210 return *this_cpu_ptr(ops->disabled);
211 } 211 }
212 212
213 extern void ftrace_stub(unsigned long a0, unsigned long a1, 213 extern void ftrace_stub(unsigned long a0, unsigned long a1,
214 struct ftrace_ops *op, struct pt_regs *regs); 214 struct ftrace_ops *op, struct pt_regs *regs);
215 215
216 #else /* !CONFIG_FUNCTION_TRACER */ 216 #else /* !CONFIG_FUNCTION_TRACER */
217 /* 217 /*
218 * (un)register_ftrace_function must be a macro since the ops parameter 218 * (un)register_ftrace_function must be a macro since the ops parameter
219 * must not be evaluated. 219 * must not be evaluated.
220 */ 220 */
221 #define register_ftrace_function(ops) ({ 0; }) 221 #define register_ftrace_function(ops) ({ 0; })
222 #define unregister_ftrace_function(ops) ({ 0; }) 222 #define unregister_ftrace_function(ops) ({ 0; })
223 static inline int ftrace_nr_registered_ops(void) 223 static inline int ftrace_nr_registered_ops(void)
224 { 224 {
225 return 0; 225 return 0;
226 } 226 }
227 static inline void clear_ftrace_function(void) { } 227 static inline void clear_ftrace_function(void) { }
228 static inline void ftrace_kill(void) { } 228 static inline void ftrace_kill(void) { }
229 static inline void ftrace_stop(void) { } 229 static inline void ftrace_stop(void) { }
230 static inline void ftrace_start(void) { } 230 static inline void ftrace_start(void) { }
231 #endif /* CONFIG_FUNCTION_TRACER */ 231 #endif /* CONFIG_FUNCTION_TRACER */
232 232
233 #ifdef CONFIG_STACK_TRACER 233 #ifdef CONFIG_STACK_TRACER
234 extern int stack_tracer_enabled; 234 extern int stack_tracer_enabled;
235 int 235 int
236 stack_trace_sysctl(struct ctl_table *table, int write, 236 stack_trace_sysctl(struct ctl_table *table, int write,
237 void __user *buffer, size_t *lenp, 237 void __user *buffer, size_t *lenp,
238 loff_t *ppos); 238 loff_t *ppos);
239 #endif 239 #endif
240 240
241 struct ftrace_func_command { 241 struct ftrace_func_command {
242 struct list_head list; 242 struct list_head list;
243 char *name; 243 char *name;
244 int (*func)(struct ftrace_hash *hash, 244 int (*func)(struct ftrace_hash *hash,
245 char *func, char *cmd, 245 char *func, char *cmd,
246 char *params, int enable); 246 char *params, int enable);
247 }; 247 };
248 248
249 #ifdef CONFIG_DYNAMIC_FTRACE 249 #ifdef CONFIG_DYNAMIC_FTRACE
250 250
251 int ftrace_arch_code_modify_prepare(void); 251 int ftrace_arch_code_modify_prepare(void);
252 int ftrace_arch_code_modify_post_process(void); 252 int ftrace_arch_code_modify_post_process(void);
253 253
254 void ftrace_bug(int err, unsigned long ip); 254 void ftrace_bug(int err, unsigned long ip);
255 255
256 struct seq_file; 256 struct seq_file;
257 257
258 struct ftrace_probe_ops { 258 struct ftrace_probe_ops {
259 void (*func)(unsigned long ip, 259 void (*func)(unsigned long ip,
260 unsigned long parent_ip, 260 unsigned long parent_ip,
261 void **data); 261 void **data);
262 int (*callback)(unsigned long ip, void **data); 262 int (*callback)(unsigned long ip, void **data);
263 void (*free)(void **data); 263 void (*free)(void **data);
264 int (*print)(struct seq_file *m, 264 int (*print)(struct seq_file *m,
265 unsigned long ip, 265 unsigned long ip,
266 struct ftrace_probe_ops *ops, 266 struct ftrace_probe_ops *ops,
267 void *data); 267 void *data);
268 }; 268 };
269 269
270 extern int 270 extern int
271 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 271 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
272 void *data); 272 void *data);
273 extern void 273 extern void
274 unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 274 unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
275 void *data); 275 void *data);
276 extern void 276 extern void
277 unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); 277 unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops);
278 extern void unregister_ftrace_function_probe_all(char *glob); 278 extern void unregister_ftrace_function_probe_all(char *glob);
279 279
280 extern int ftrace_text_reserved(void *start, void *end); 280 extern int ftrace_text_reserved(void *start, void *end);
281 281
282 extern int ftrace_nr_registered_ops(void); 282 extern int ftrace_nr_registered_ops(void);
283 283
284 /* 284 /*
285 * The dyn_ftrace record's flags field is split into two parts. 285 * The dyn_ftrace record's flags field is split into two parts.
286 * The first part, which is '0-FTRACE_REF_MAX', is a counter of 286 * The first part, which is '0-FTRACE_REF_MAX', is a counter of
287 * the number of callbacks that have registered the function that 287 * the number of callbacks that have registered the function that
288 * the dyn_ftrace descriptor represents. 288 * the dyn_ftrace descriptor represents.
289 * 289 *
290 * The second part is a mask: 290 * The second part is a mask:
291 * ENABLED - the function is being traced 291 * ENABLED - the function is being traced
292 * REGS - the record wants the function to save regs 292 * REGS - the record wants the function to save regs
293 * REGS_EN - the function is set up to save regs. 293 * REGS_EN - the function is set up to save regs.
294 * 294 *
295 * When a new ftrace_ops is registered and wants a function to save 295 * When a new ftrace_ops is registered and wants a function to save
296 * pt_regs, the rec->flag REGS is set. When the function has been 296 * pt_regs, the rec->flag REGS is set. When the function has been
297 * set up to save regs, the REGS_EN flag is set. Once a function 297 * set up to save regs, the REGS_EN flag is set. Once a function
298 * starts saving regs it will do so until all ftrace_ops are removed 298 * starts saving regs it will do so until all ftrace_ops are removed
299 * from tracing that function. 299 * from tracing that function.
300 */ 300 */
301 enum { 301 enum {
302 FTRACE_FL_ENABLED = (1UL << 29), 302 FTRACE_FL_ENABLED = (1UL << 29),
303 FTRACE_FL_REGS = (1UL << 30), 303 FTRACE_FL_REGS = (1UL << 30),
304 FTRACE_FL_REGS_EN = (1UL << 31) 304 FTRACE_FL_REGS_EN = (1UL << 31)
305 }; 305 };
306 306
307 #define FTRACE_FL_MASK (0x7UL << 29) 307 #define FTRACE_FL_MASK (0x7UL << 29)
308 #define FTRACE_REF_MAX ((1UL << 29) - 1) 308 #define FTRACE_REF_MAX ((1UL << 29) - 1)
309 309
310 struct dyn_ftrace { 310 struct dyn_ftrace {
311 union { 311 union {
312 unsigned long ip; /* address of mcount call-site */ 312 unsigned long ip; /* address of mcount call-site */
313 struct dyn_ftrace *freelist; 313 struct dyn_ftrace *freelist;
314 }; 314 };
315 unsigned long flags; 315 unsigned long flags;
316 struct dyn_arch_ftrace arch; 316 struct dyn_arch_ftrace arch;
317 }; 317 };
318 318
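The split means the low 29 bits and the high three bits are read with different masks; hypothetical helpers illustrating the layout described above:

static inline unsigned long sample_rec_count(struct dyn_ftrace *rec)
{
	return rec->flags & FTRACE_REF_MAX;	/* bits 0-28: callback count */
}

static inline bool sample_rec_enabled(struct dyn_ftrace *rec)
{
	return rec->flags & FTRACE_FL_ENABLED;	/* bit 29: being traced */
}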
319 int ftrace_force_update(void); 319 int ftrace_force_update(void);
320 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, 320 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
321 int remove, int reset); 321 int remove, int reset);
322 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 322 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
323 int len, int reset); 323 int len, int reset);
324 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 324 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
325 int len, int reset); 325 int len, int reset);
326 void ftrace_set_global_filter(unsigned char *buf, int len, int reset); 326 void ftrace_set_global_filter(unsigned char *buf, int len, int reset);
327 void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); 327 void ftrace_set_global_notrace(unsigned char *buf, int len, int reset);
328 void ftrace_free_filter(struct ftrace_ops *ops); 328 void ftrace_free_filter(struct ftrace_ops *ops);
329 329
330 int register_ftrace_command(struct ftrace_func_command *cmd); 330 int register_ftrace_command(struct ftrace_func_command *cmd);
331 int unregister_ftrace_command(struct ftrace_func_command *cmd); 331 int unregister_ftrace_command(struct ftrace_func_command *cmd);
332 332
333 enum { 333 enum {
334 FTRACE_UPDATE_CALLS = (1 << 0), 334 FTRACE_UPDATE_CALLS = (1 << 0),
335 FTRACE_DISABLE_CALLS = (1 << 1), 335 FTRACE_DISABLE_CALLS = (1 << 1),
336 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 336 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
337 FTRACE_START_FUNC_RET = (1 << 3), 337 FTRACE_START_FUNC_RET = (1 << 3),
338 FTRACE_STOP_FUNC_RET = (1 << 4), 338 FTRACE_STOP_FUNC_RET = (1 << 4),
339 }; 339 };
340 340
341 /* 341 /*
342 * The FTRACE_UPDATE_* enum is used to pass information back 342 * The FTRACE_UPDATE_* enum is used to pass information back
343 * from the ftrace_update_record() and ftrace_test_record() 343 * from the ftrace_update_record() and ftrace_test_record()
344 * functions. These are called by the code update routines 344 * functions. These are called by the code update routines
345 * to find out what is to be done for a given function. 345 * to find out what is to be done for a given function.
346 * 346 *
347 * IGNORE - The function is already what we want it to be 347 * IGNORE - The function is already what we want it to be
348 * MAKE_CALL - Start tracing the function 348 * MAKE_CALL - Start tracing the function
349 * MODIFY_CALL - Stop saving regs for the function 349 * MODIFY_CALL - Stop saving regs for the function
350 * MODIFY_CALL_REGS - Start saving regs for the function 350 * MODIFY_CALL_REGS - Start saving regs for the function
351 * MAKE_NOP - Stop tracing the function 351 * MAKE_NOP - Stop tracing the function
352 */ 352 */
353 enum { 353 enum {
354 FTRACE_UPDATE_IGNORE, 354 FTRACE_UPDATE_IGNORE,
355 FTRACE_UPDATE_MAKE_CALL, 355 FTRACE_UPDATE_MAKE_CALL,
356 FTRACE_UPDATE_MODIFY_CALL, 356 FTRACE_UPDATE_MODIFY_CALL,
357 FTRACE_UPDATE_MODIFY_CALL_REGS, 357 FTRACE_UPDATE_MODIFY_CALL_REGS,
358 FTRACE_UPDATE_MAKE_NOP, 358 FTRACE_UPDATE_MAKE_NOP,
359 }; 359 };
360 360
361 enum { 361 enum {
362 FTRACE_ITER_FILTER = (1 << 0), 362 FTRACE_ITER_FILTER = (1 << 0),
363 FTRACE_ITER_NOTRACE = (1 << 1), 363 FTRACE_ITER_NOTRACE = (1 << 1),
364 FTRACE_ITER_PRINTALL = (1 << 2), 364 FTRACE_ITER_PRINTALL = (1 << 2),
365 FTRACE_ITER_DO_HASH = (1 << 3), 365 FTRACE_ITER_DO_HASH = (1 << 3),
366 FTRACE_ITER_HASH = (1 << 4), 366 FTRACE_ITER_HASH = (1 << 4),
367 FTRACE_ITER_ENABLED = (1 << 5), 367 FTRACE_ITER_ENABLED = (1 << 5),
368 }; 368 };
369 369
370 void arch_ftrace_update_code(int command); 370 void arch_ftrace_update_code(int command);
371 371
372 struct ftrace_rec_iter; 372 struct ftrace_rec_iter;
373 373
374 struct ftrace_rec_iter *ftrace_rec_iter_start(void); 374 struct ftrace_rec_iter *ftrace_rec_iter_start(void);
375 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter); 375 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter);
376 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); 376 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
377 377
378 #define for_ftrace_rec_iter(iter) \ 378 #define for_ftrace_rec_iter(iter) \
379 for (iter = ftrace_rec_iter_start(); \ 379 for (iter = ftrace_rec_iter_start(); \
380 iter; \ 380 iter; \
381 iter = ftrace_rec_iter_next(iter)) 381 iter = ftrace_rec_iter_next(iter))
382 382
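Typical use of the iterator walks every dyn_ftrace record; a minimal sketch:

struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;

for_ftrace_rec_iter(iter) {
	rec = ftrace_rec_iter_record(iter);
	/* inspect rec->ip, the address of the mcount call site */
}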
383 383
384 int ftrace_update_record(struct dyn_ftrace *rec, int enable); 384 int ftrace_update_record(struct dyn_ftrace *rec, int enable);
385 int ftrace_test_record(struct dyn_ftrace *rec, int enable); 385 int ftrace_test_record(struct dyn_ftrace *rec, int enable);
386 void ftrace_run_stop_machine(int command); 386 void ftrace_run_stop_machine(int command);
387 unsigned long ftrace_location(unsigned long ip); 387 unsigned long ftrace_location(unsigned long ip);
388 388
389 extern ftrace_func_t ftrace_trace_function; 389 extern ftrace_func_t ftrace_trace_function;
390 390
391 int ftrace_regex_open(struct ftrace_ops *ops, int flag, 391 int ftrace_regex_open(struct ftrace_ops *ops, int flag,
392 struct inode *inode, struct file *file); 392 struct inode *inode, struct file *file);
393 ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, 393 ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
394 size_t cnt, loff_t *ppos); 394 size_t cnt, loff_t *ppos);
395 ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, 395 ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
396 size_t cnt, loff_t *ppos); 396 size_t cnt, loff_t *ppos);
397 loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); 397 loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence);
398 int ftrace_regex_release(struct inode *inode, struct file *file); 398 int ftrace_regex_release(struct inode *inode, struct file *file);
399 399
400 void __init 400 void __init
401 ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); 401 ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable);
402 402
403 /* defined in arch */ 403 /* defined in arch */
404 extern int ftrace_ip_converted(unsigned long ip); 404 extern int ftrace_ip_converted(unsigned long ip);
405 extern int ftrace_dyn_arch_init(void *data); 405 extern int ftrace_dyn_arch_init(void *data);
406 extern void ftrace_replace_code(int enable); 406 extern void ftrace_replace_code(int enable);
407 extern int ftrace_update_ftrace_func(ftrace_func_t func); 407 extern int ftrace_update_ftrace_func(ftrace_func_t func);
408 extern void ftrace_caller(void); 408 extern void ftrace_caller(void);
409 extern void ftrace_regs_caller(void); 409 extern void ftrace_regs_caller(void);
410 extern void ftrace_call(void); 410 extern void ftrace_call(void);
411 extern void ftrace_regs_call(void); 411 extern void ftrace_regs_call(void);
412 extern void mcount_call(void); 412 extern void mcount_call(void);
413 413
414 void ftrace_modify_all_code(int command); 414 void ftrace_modify_all_code(int command);
415 415
416 #ifndef FTRACE_ADDR 416 #ifndef FTRACE_ADDR
417 #define FTRACE_ADDR ((unsigned long)ftrace_caller) 417 #define FTRACE_ADDR ((unsigned long)ftrace_caller)
418 #endif 418 #endif
419 419
420 #ifndef FTRACE_REGS_ADDR 420 #ifndef FTRACE_REGS_ADDR
421 #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 421 #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
422 # define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller) 422 # define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller)
423 #else 423 #else
424 # define FTRACE_REGS_ADDR FTRACE_ADDR 424 # define FTRACE_REGS_ADDR FTRACE_ADDR
425 #endif 425 #endif
426 #endif 426 #endif
427 427
428 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 428 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
429 extern void ftrace_graph_caller(void); 429 extern void ftrace_graph_caller(void);
430 extern int ftrace_enable_ftrace_graph_caller(void); 430 extern int ftrace_enable_ftrace_graph_caller(void);
431 extern int ftrace_disable_ftrace_graph_caller(void); 431 extern int ftrace_disable_ftrace_graph_caller(void);
432 #else 432 #else
433 static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; } 433 static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; }
434 static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } 434 static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
435 #endif 435 #endif
436 436
437 /** 437 /**
438 * ftrace_make_nop - convert code into nop 438 * ftrace_make_nop - convert code into nop
439 * @mod: module structure if called by module load initialization 439 * @mod: module structure if called by module load initialization
440 * @rec: the mcount call site record 440 * @rec: the mcount call site record
441 * @addr: the address that the call site should be calling 441 * @addr: the address that the call site should be calling
442 * 442 *
443 * This is a very sensitive operation and great care needs 443 * This is a very sensitive operation and great care needs
444 * to be taken by the arch. The operation should carefully 444 * to be taken by the arch. The operation should carefully
445 * read the location, check to see if what is read is indeed 445 * read the location, check to see if what is read is indeed
446 * what we expect it to be, and then on success of the compare, 446 * what we expect it to be, and then on success of the compare,
447 * it should write to the location. 447 * it should write to the location.
448 * 448 *
449 * The code segment at @rec->ip should be a caller to @addr 449 * The code segment at @rec->ip should be a caller to @addr
450 * 450 *
451 * Return must be: 451 * Return must be:
452 * 0 on success 452 * 0 on success
453 * -EFAULT on error reading the location 453 * -EFAULT on error reading the location
454 * -EINVAL on a failed compare of the contents 454 * -EINVAL on a failed compare of the contents
455 * -EPERM on error writing to the location 455 * -EPERM on error writing to the location
456 * Any other value will be considered a failure. 456 * Any other value will be considered a failure.
457 */ 457 */
458 extern int ftrace_make_nop(struct module *mod, 458 extern int ftrace_make_nop(struct module *mod,
459 struct dyn_ftrace *rec, unsigned long addr); 459 struct dyn_ftrace *rec, unsigned long addr);
460 460
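The read/compare/write discipline the comment above prescribes, and the matching error codes, can be sketched as an arch-neutral helper (hypothetical sample_modify_code, assuming the instruction fits in 16 bytes; real arches use their fixed MCOUNT_INSN_SIZE and their own text-patching primitives):

static int sample_modify_code(unsigned long ip, const void *expected,
			      const void *replacement, size_t size)
{
	unsigned char cur[16];

	if (size > sizeof(cur))
		return -EINVAL;
	if (probe_kernel_read(cur, (void *)ip, size))
		return -EFAULT;		/* error reading the location */
	if (memcmp(cur, expected, size) != 0)
		return -EINVAL;		/* failed compare of the contents */
	if (probe_kernel_write((void *)ip, replacement, size))
		return -EPERM;		/* error writing to the location */
	return 0;
}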
461 /** 461 /**
462 * ftrace_make_call - convert a nop call site into a call to addr 462 * ftrace_make_call - convert a nop call site into a call to addr
463 * @rec: the mcount call site record 463 * @rec: the mcount call site record
464 * @addr: the address that the call site should call 464 * @addr: the address that the call site should call
465 * 465 *
466 * This is a very sensitive operation and great care needs 466 * This is a very sensitive operation and great care needs
467 * to be taken by the arch. The operation should carefully 467 * to be taken by the arch. The operation should carefully
468 * read the location, check to see if what is read is indeed 468 * read the location, check to see if what is read is indeed
469 * what we expect it to be, and then on success of the compare, 469 * what we expect it to be, and then on success of the compare,
470 * it should write to the location. 470 * it should write to the location.
471 * 471 *
472 * The code segment at @rec->ip should be a nop 472 * The code segment at @rec->ip should be a nop
473 * 473 *
474 * Return must be: 474 * Return must be:
475 * 0 on success 475 * 0 on success
476 * -EFAULT on error reading the location 476 * -EFAULT on error reading the location
477 * -EINVAL on a failed compare of the contents 477 * -EINVAL on a failed compare of the contents
478 * -EPERM on error writing to the location 478 * -EPERM on error writing to the location
479 * Any other value will be considered a failure. 479 * Any other value will be considered a failure.
480 */ 480 */
481 extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); 481 extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
482 482
483 #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 483 #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
484 /** 484 /**
485 * ftrace_modify_call - convert from one addr to another (no nop) 485 * ftrace_modify_call - convert from one addr to another (no nop)
486 * @rec: the mcount call site record 486 * @rec: the mcount call site record
487 * @old_addr: the address expected to be currently called to 487 * @old_addr: the address expected to be currently called to
488 * @addr: the address to change to 488 * @addr: the address to change to
489 * 489 *
490 * This is a very sensitive operation and great care needs 490 * This is a very sensitive operation and great care needs
491 * to be taken by the arch. The operation should carefully 491 * to be taken by the arch. The operation should carefully
492 * read the location, check to see if what is read is indeed 492 * read the location, check to see if what is read is indeed
493 * what we expect it to be, and then on success of the compare, 493 * what we expect it to be, and then on success of the compare,
494 * it should write to the location. 494 * it should write to the location.
495 * 495 *
496 * The code segment at @rec->ip should be a caller to @old_addr 496 * The code segment at @rec->ip should be a caller to @old_addr
497 * 497 *
498 * Return must be: 498 * Return must be:
499 * 0 on success 499 * 0 on success
500 * -EFAULT on error reading the location 500 * -EFAULT on error reading the location
501 * -EINVAL on a failed compare of the contents 501 * -EINVAL on a failed compare of the contents
502 * -EPERM on error writing to the location 502 * -EPERM on error writing to the location
503 * Any other value will be considered a failure. 503 * Any other value will be considered a failure.
504 */ 504 */
505 extern int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, 505 extern int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
506 unsigned long addr); 506 unsigned long addr);
507 #else 507 #else
508 /* Should never be called */ 508 /* Should never be called */
509 static inline int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, 509 static inline int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
510 unsigned long addr) 510 unsigned long addr)
511 { 511 {
512 return -EINVAL; 512 return -EINVAL;
513 } 513 }
514 #endif 514 #endif
515 515
516 /* May be defined in arch */ 516 /* May be defined in arch */
517 extern int ftrace_arch_read_dyn_info(char *buf, int size); 517 extern int ftrace_arch_read_dyn_info(char *buf, int size);
518 518
519 extern int skip_trace(unsigned long ip); 519 extern int skip_trace(unsigned long ip);
520 520
521 extern void ftrace_disable_daemon(void); 521 extern void ftrace_disable_daemon(void);
522 extern void ftrace_enable_daemon(void); 522 extern void ftrace_enable_daemon(void);
523 #else /* CONFIG_DYNAMIC_FTRACE */ 523 #else /* CONFIG_DYNAMIC_FTRACE */
524 static inline int skip_trace(unsigned long ip) { return 0; } 524 static inline int skip_trace(unsigned long ip) { return 0; }
525 static inline int ftrace_force_update(void) { return 0; } 525 static inline int ftrace_force_update(void) { return 0; }
526 static inline void ftrace_disable_daemon(void) { } 526 static inline void ftrace_disable_daemon(void) { }
527 static inline void ftrace_enable_daemon(void) { } 527 static inline void ftrace_enable_daemon(void) { }
528 static inline void ftrace_release_mod(struct module *mod) {} 528 static inline void ftrace_release_mod(struct module *mod) {}
529 static inline int register_ftrace_command(struct ftrace_func_command *cmd) 529 static inline int register_ftrace_command(struct ftrace_func_command *cmd)
530 { 530 {
531 return -EINVAL; 531 return -EINVAL;
532 } 532 }
533 static inline int unregister_ftrace_command(char *cmd_name) 533 static inline int unregister_ftrace_command(char *cmd_name)
534 { 534 {
535 return -EINVAL; 535 return -EINVAL;
536 } 536 }
537 static inline int ftrace_text_reserved(void *start, void *end) 537 static inline int ftrace_text_reserved(void *start, void *end)
538 { 538 {
539 return 0; 539 return 0;
540 } 540 }
541 static inline unsigned long ftrace_location(unsigned long ip) 541 static inline unsigned long ftrace_location(unsigned long ip)
542 { 542 {
543 return 0; 543 return 0;
544 } 544 }
545 545
546 /* 546 /*
547 * Again, users of functions that take an ftrace_ops may not 547 * Again, users of functions that take an ftrace_ops may not
548 * have one defined when ftrace is not enabled, but these 548 * have one defined when ftrace is not enabled, but these
549 * functions may still be called. Use macros instead of inlines. 549 * functions may still be called. Use macros instead of inlines.
550 */ 550 */
551 #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) 551 #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; })
552 #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) 552 #define ftrace_set_early_filter(ops, buf, enable) do { } while (0)
553 #define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; }) 553 #define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; })
554 #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) 554 #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; })
555 #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) 555 #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; })
556 #define ftrace_free_filter(ops) do { } while (0) 556 #define ftrace_free_filter(ops) do { } while (0)
557 557
558 static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, 558 static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
559 size_t cnt, loff_t *ppos) { return -ENODEV; } 559 size_t cnt, loff_t *ppos) { return -ENODEV; }
560 static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, 560 static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
561 size_t cnt, loff_t *ppos) { return -ENODEV; } 561 size_t cnt, loff_t *ppos) { return -ENODEV; }
562 static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 562 static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
563 { 563 {
564 return -ENODEV; 564 return -ENODEV;
565 } 565 }
566 static inline int 566 static inline int
567 ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } 567 ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; }
568 #endif /* CONFIG_DYNAMIC_FTRACE */ 568 #endif /* CONFIG_DYNAMIC_FTRACE */
569 569
570 /* totally disable ftrace - cannot be re-enabled after this */ 570 /* totally disable ftrace - cannot be re-enabled after this */
571 void ftrace_kill(void); 571 void ftrace_kill(void);
572 572
573 static inline void tracer_disable(void) 573 static inline void tracer_disable(void)
574 { 574 {
575 #ifdef CONFIG_FUNCTION_TRACER 575 #ifdef CONFIG_FUNCTION_TRACER
576 ftrace_enabled = 0; 576 ftrace_enabled = 0;
577 #endif 577 #endif
578 } 578 }
579 579
580 /* 580 /*
581 * Ftrace disable/restore without lock. Some synchronization mechanism 581 * Ftrace disable/restore without lock. Some synchronization mechanism
582 * must be used to prevent ftrace_enabled from being changed between 582 * must be used to prevent ftrace_enabled from being changed between
583 * disable/restore. 583 * disable/restore.
584 */ 584 */
585 static inline int __ftrace_enabled_save(void) 585 static inline int __ftrace_enabled_save(void)
586 { 586 {
587 #ifdef CONFIG_FUNCTION_TRACER 587 #ifdef CONFIG_FUNCTION_TRACER
588 int saved_ftrace_enabled = ftrace_enabled; 588 int saved_ftrace_enabled = ftrace_enabled;
589 ftrace_enabled = 0; 589 ftrace_enabled = 0;
590 return saved_ftrace_enabled; 590 return saved_ftrace_enabled;
591 #else 591 #else
592 return 0; 592 return 0;
593 #endif 593 #endif
594 } 594 }
595 595
596 static inline void __ftrace_enabled_restore(int enabled) 596 static inline void __ftrace_enabled_restore(int enabled)
597 { 597 {
598 #ifdef CONFIG_FUNCTION_TRACER 598 #ifdef CONFIG_FUNCTION_TRACER
599 ftrace_enabled = enabled; 599 ftrace_enabled = enabled;
600 #endif 600 #endif
601 } 601 }
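A usage sketch (not part of this header): since there is no locking, a caller brackets the critical region itself and supplies whatever serialization the comment above demands. do_sensitive_work() is a hypothetical caller:

static void do_sensitive_work(void)
{
	int saved;

	/* caller provides its own serialization around the pair */
	saved = __ftrace_enabled_save();

	/* ... work that must run with the function tracer disabled ... */

	__ftrace_enabled_restore(saved);
}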
602 602
603 #ifndef HAVE_ARCH_CALLER_ADDR 603 #ifndef HAVE_ARCH_CALLER_ADDR
604 # ifdef CONFIG_FRAME_POINTER 604 # ifdef CONFIG_FRAME_POINTER
605 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 605 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
606 # define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) 606 # define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
607 # define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) 607 # define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
608 # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) 608 # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
609 # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) 609 # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
610 # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) 610 # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
611 # define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) 611 # define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6))
612 # else 612 # else
613 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 613 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
614 # define CALLER_ADDR1 0UL 614 # define CALLER_ADDR1 0UL
615 # define CALLER_ADDR2 0UL 615 # define CALLER_ADDR2 0UL
616 # define CALLER_ADDR3 0UL 616 # define CALLER_ADDR3 0UL
617 # define CALLER_ADDR4 0UL 617 # define CALLER_ADDR4 0UL
618 # define CALLER_ADDR5 0UL 618 # define CALLER_ADDR5 0UL
619 # define CALLER_ADDR6 0UL 619 # define CALLER_ADDR6 0UL
620 # endif 620 # endif
621 #endif /* ifndef HAVE_ARCH_CALLER_ADDR */ 621 #endif /* ifndef HAVE_ARCH_CALLER_ADDR */
622 622
623 #ifdef CONFIG_IRQSOFF_TRACER 623 #ifdef CONFIG_IRQSOFF_TRACER
624 extern void time_hardirqs_on(unsigned long a0, unsigned long a1); 624 extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
625 extern void time_hardirqs_off(unsigned long a0, unsigned long a1); 625 extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
626 #else 626 #else
627 static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { } 627 static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { }
628 static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { } 628 static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
629 #endif 629 #endif
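For illustration, a hypothetical call site: the (a0, a1) pair is conventionally the current and parent return addresses, which is where the CALLER_ADDR macros above come in. traced_irq_enable() is not a real kernel function:

/* Hypothetical wrapper, for illustration only. */
static inline void traced_irq_enable(void)
{
	time_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
	raw_local_irq_enable();
}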
630 630
631 #ifdef CONFIG_PREEMPT_TRACER 631 #ifdef CONFIG_PREEMPT_TRACER
632 extern void trace_preempt_on(unsigned long a0, unsigned long a1); 632 extern void trace_preempt_on(unsigned long a0, unsigned long a1);
633 extern void trace_preempt_off(unsigned long a0, unsigned long a1); 633 extern void trace_preempt_off(unsigned long a0, unsigned long a1);
634 #else 634 #else
635 /* 635 /*
636 * Use defines instead of static inlines because some arches will generate code 636 * Use defines instead of static inlines because some arches will generate code
637 * for the CALLER_ADDR macros, when we really want these to be real nops. 637 * for the CALLER_ADDR macros, when we really want these to be real nops.
638 */ 638 */
639 # define trace_preempt_on(a0, a1) do { } while (0) 639 # define trace_preempt_on(a0, a1) do { } while (0)
640 # define trace_preempt_off(a0, a1) do { } while (0) 640 # define trace_preempt_off(a0, a1) do { } while (0)
641 #endif 641 #endif
642 642
643 #ifdef CONFIG_FTRACE_MCOUNT_RECORD 643 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
644 extern void ftrace_init(void); 644 extern void ftrace_init(void);
645 #else 645 #else
646 static inline void ftrace_init(void) { } 646 static inline void ftrace_init(void) { }
647 #endif 647 #endif
648 648
649 /* 649 /*
650 * Structure that defines an entry function trace. 650 * Structure that defines an entry function trace.
651 */ 651 */
652 struct ftrace_graph_ent { 652 struct ftrace_graph_ent {
653 unsigned long func; /* Current function */ 653 unsigned long func; /* Current function */
654 int depth; 654 int depth;
655 }; 655 };
656 656
657 /* 657 /*
658 * Structure that defines a return function trace. 658 * Structure that defines a return function trace.
659 */ 659 */
660 struct ftrace_graph_ret { 660 struct ftrace_graph_ret {
661 unsigned long func; /* Current function */ 661 unsigned long func; /* Current function */
662 unsigned long long calltime; 662 unsigned long long calltime;
663 unsigned long long rettime; 663 unsigned long long rettime;
664 /* Number of functions that overran the depth limit for current task */ 664 /* Number of functions that overran the depth limit for current task */
665 unsigned long overrun; 665 unsigned long overrun;
666 int depth; 666 int depth;
667 }; 667 };
668 668
669 /* Type of the callback handlers for tracing function graph */ 669 /* Type of the callback handlers for tracing function graph */
670 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ 670 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
671 typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ 671 typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
672 672
673 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 673 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
674 674
675 /* for init task */ 675 /* for init task */
676 #define INIT_FTRACE_GRAPH .ret_stack = NULL, 676 #define INIT_FTRACE_GRAPH .ret_stack = NULL,
677 677
678 /* 678 /*
679 * Stack of return addresses for functions 679 * Stack of return addresses for functions
680 * of a thread. 680 * of a thread.
681 * Used in struct thread_info 681 * Used in struct thread_info
682 */ 682 */
683 struct ftrace_ret_stack { 683 struct ftrace_ret_stack {
684 unsigned long ret; 684 unsigned long ret;
685 unsigned long func; 685 unsigned long func;
686 unsigned long long calltime; 686 unsigned long long calltime;
687 unsigned long long subtime; 687 unsigned long long subtime;
688 unsigned long fp; 688 unsigned long fp;
689 }; 689 };
690 690
691 /* 691 /*
692 * Primary handler of a function return. 692 * Primary handler of a function return.
693 * It relies on ftrace_return_to_handler. 693 * It relies on ftrace_return_to_handler.
694 * Defined in entry_32/64.S 694 * Defined in entry_32/64.S
695 */ 695 */
696 extern void return_to_handler(void); 696 extern void return_to_handler(void);
697 697
698 extern int 698 extern int
699 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, 699 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
700 unsigned long frame_pointer); 700 unsigned long frame_pointer);
701 701
702 /* 702 /*
703 * Sometimes we don't want to trace a function with the function 703 * Sometimes we don't want to trace a function with the function
704 * graph tracer but we still want it to be traced by the usual function 704 * graph tracer but we still want it to be traced by the usual function
705 * tracer if the function graph tracer is not configured. 705 * tracer if the function graph tracer is not configured.
706 */ 706 */
707 #define __notrace_funcgraph notrace 707 #define __notrace_funcgraph notrace
708 708
709 /* 709 /*
710 * We want to know which function is an entrypoint of a hardirq. 710 * We want to know which function is an entrypoint of a hardirq.
711 * That will help us mark it in the trace output. 711 * That will help us mark it in the trace output.
712 */ 712 */
713 #define __irq_entry __attribute__((__section__(".irqentry.text"))) 713 #define __irq_entry __attribute__((__section__(".irqentry.text")))
714 714
715 /* Limits of hardirq entrypoints */ 715 /* Limits of hardirq entrypoints */
716 extern char __irqentry_text_start[]; 716 extern char __irqentry_text_start[];
717 extern char __irqentry_text_end[]; 717 extern char __irqentry_text_end[];
718 718
719 #define FTRACE_RETFUNC_DEPTH 50 719 #define FTRACE_RETFUNC_DEPTH 50
720 #define FTRACE_RETSTACK_ALLOC_SIZE 32 720 #define FTRACE_RETSTACK_ALLOC_SIZE 32
721 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, 721 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
722 trace_func_graph_ent_t entryfunc); 722 trace_func_graph_ent_t entryfunc);
723 723
724 extern void ftrace_graph_stop(void); 724 extern void ftrace_graph_stop(void);
725 725
726 /* The current handlers in use */ 726 /* The current handlers in use */
727 extern trace_func_graph_ret_t ftrace_graph_return; 727 extern trace_func_graph_ret_t ftrace_graph_return;
728 extern trace_func_graph_ent_t ftrace_graph_entry; 728 extern trace_func_graph_ent_t ftrace_graph_entry;
729 729
730 extern void unregister_ftrace_graph(void); 730 extern void unregister_ftrace_graph(void);
731 731
732 extern void ftrace_graph_init_task(struct task_struct *t); 732 extern void ftrace_graph_init_task(struct task_struct *t);
733 extern void ftrace_graph_exit_task(struct task_struct *t); 733 extern void ftrace_graph_exit_task(struct task_struct *t);
734 extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu); 734 extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu);
735 735
736 static inline int task_curr_ret_stack(struct task_struct *t) 736 static inline int task_curr_ret_stack(struct task_struct *t)
737 { 737 {
738 return t->curr_ret_stack; 738 return t->curr_ret_stack;
739 } 739 }
740 740
741 static inline void pause_graph_tracing(void) 741 static inline void pause_graph_tracing(void)
742 { 742 {
743 atomic_inc(&current->tracing_graph_pause); 743 atomic_inc(&current->tracing_graph_pause);
744 } 744 }
745 745
746 static inline void unpause_graph_tracing(void) 746 static inline void unpause_graph_tracing(void)
747 { 747 {
748 atomic_dec(&current->tracing_graph_pause); 748 atomic_dec(&current->tracing_graph_pause);
749 } 749 }
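Because tracing_graph_pause is an atomic counter on the task, the pair nests safely. A sketch of the usual pattern (emit_trace_text() is hypothetical):

static void emit_trace_text(void)
{
	/* keep the graph tracer from recursing into the output path */
	pause_graph_tracing();

	/* ... formatting/printing that must not itself be graph-traced ... */

	unpause_graph_tracing();
}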
750 #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ 750 #else /* !CONFIG_FUNCTION_GRAPH_TRACER */
751 751
752 #define __notrace_funcgraph 752 #define __notrace_funcgraph
753 #define __irq_entry 753 #define __irq_entry
754 #define INIT_FTRACE_GRAPH 754 #define INIT_FTRACE_GRAPH
755 755
756 static inline void ftrace_graph_init_task(struct task_struct *t) { } 756 static inline void ftrace_graph_init_task(struct task_struct *t) { }
757 static inline void ftrace_graph_exit_task(struct task_struct *t) { } 757 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
758 static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { } 758 static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { }
759 759
760 static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc, 760 static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
761 trace_func_graph_ent_t entryfunc) 761 trace_func_graph_ent_t entryfunc)
762 { 762 {
763 return -1; 763 return -1;
764 } 764 }
765 static inline void unregister_ftrace_graph(void) { } 765 static inline void unregister_ftrace_graph(void) { }
766 766
767 static inline int task_curr_ret_stack(struct task_struct *tsk) 767 static inline int task_curr_ret_stack(struct task_struct *tsk)
768 { 768 {
769 return -1; 769 return -1;
770 } 770 }
771 771
772 static inline void pause_graph_tracing(void) { } 772 static inline void pause_graph_tracing(void) { }
773 static inline void unpause_graph_tracing(void) { } 773 static inline void unpause_graph_tracing(void) { }
774 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 774 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
775 775
776 #ifdef CONFIG_TRACING 776 #ifdef CONFIG_TRACING
777 777
778 /* flags for current->trace */ 778 /* flags for current->trace */
779 enum { 779 enum {
780 TSK_TRACE_FL_TRACE_BIT = 0, 780 TSK_TRACE_FL_TRACE_BIT = 0,
781 TSK_TRACE_FL_GRAPH_BIT = 1, 781 TSK_TRACE_FL_GRAPH_BIT = 1,
782 }; 782 };
783 enum { 783 enum {
784 TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT, 784 TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT,
785 TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT, 785 TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT,
786 }; 786 };
787 787
788 static inline void set_tsk_trace_trace(struct task_struct *tsk) 788 static inline void set_tsk_trace_trace(struct task_struct *tsk)
789 { 789 {
790 set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); 790 set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
791 } 791 }
792 792
793 static inline void clear_tsk_trace_trace(struct task_struct *tsk) 793 static inline void clear_tsk_trace_trace(struct task_struct *tsk)
794 { 794 {
795 clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); 795 clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
796 } 796 }
797 797
798 static inline int test_tsk_trace_trace(struct task_struct *tsk) 798 static inline int test_tsk_trace_trace(struct task_struct *tsk)
799 { 799 {
800 return tsk->trace & TSK_TRACE_FL_TRACE; 800 return tsk->trace & TSK_TRACE_FL_TRACE;
801 } 801 }
802 802
803 static inline void set_tsk_trace_graph(struct task_struct *tsk) 803 static inline void set_tsk_trace_graph(struct task_struct *tsk)
804 { 804 {
805 set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); 805 set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
806 } 806 }
807 807
808 static inline void clear_tsk_trace_graph(struct task_struct *tsk) 808 static inline void clear_tsk_trace_graph(struct task_struct *tsk)
809 { 809 {
810 clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); 810 clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
811 } 811 }
812 812
813 static inline int test_tsk_trace_graph(struct task_struct *tsk) 813 static inline int test_tsk_trace_graph(struct task_struct *tsk)
814 { 814 {
815 return tsk->trace & TSK_TRACE_FL_GRAPH; 815 return tsk->trace & TSK_TRACE_FL_GRAPH;
816 } 816 }
817 817
818 enum ftrace_dump_mode; 818 enum ftrace_dump_mode;
819 819
820 extern enum ftrace_dump_mode ftrace_dump_on_oops; 820 extern enum ftrace_dump_mode ftrace_dump_on_oops;
821 821
822 #ifdef CONFIG_PREEMPT 822 #ifdef CONFIG_PREEMPT
823 #define INIT_TRACE_RECURSION .trace_recursion = 0, 823 #define INIT_TRACE_RECURSION .trace_recursion = 0,
824 #endif 824 #endif
825 825
826 #endif /* CONFIG_TRACING */ 826 #endif /* CONFIG_TRACING */
827 827
828 #ifndef INIT_TRACE_RECURSION 828 #ifndef INIT_TRACE_RECURSION
829 #define INIT_TRACE_RECURSION 829 #define INIT_TRACE_RECURSION
830 #endif 830 #endif
831 831
832 #ifdef CONFIG_FTRACE_SYSCALLS 832 #ifdef CONFIG_FTRACE_SYSCALLS
833 833
834 unsigned long arch_syscall_addr(int nr); 834 unsigned long arch_syscall_addr(int nr);
835 835
836 #endif /* CONFIG_FTRACE_SYSCALLS */ 836 #endif /* CONFIG_FTRACE_SYSCALLS */
837 837
838 #endif /* _LINUX_FTRACE_H */ 838 #endif /* _LINUX_FTRACE_H */
839 839
include/linux/syscalls.h
1 /* 1 /*
2 * syscalls.h - Linux syscall interfaces (non-arch-specific) 2 * syscalls.h - Linux syscall interfaces (non-arch-specific)
3 * 3 *
4 * Copyright (c) 2004 Randy Dunlap 4 * Copyright (c) 2004 Randy Dunlap
5 * Copyright (c) 2004 Open Source Development Labs 5 * Copyright (c) 2004 Open Source Development Labs
6 * 6 *
7 * This file is released under the GPLv2. 7 * This file is released under the GPLv2.
8 * See the file COPYING for more details. 8 * See the file COPYING for more details.
9 */ 9 */
10 10
11 #ifndef _LINUX_SYSCALLS_H 11 #ifndef _LINUX_SYSCALLS_H
12 #define _LINUX_SYSCALLS_H 12 #define _LINUX_SYSCALLS_H
13 13
14 struct epoll_event; 14 struct epoll_event;
15 struct iattr; 15 struct iattr;
16 struct inode; 16 struct inode;
17 struct iocb; 17 struct iocb;
18 struct io_event; 18 struct io_event;
19 struct iovec; 19 struct iovec;
20 struct itimerspec; 20 struct itimerspec;
21 struct itimerval; 21 struct itimerval;
22 struct kexec_segment; 22 struct kexec_segment;
23 struct linux_dirent; 23 struct linux_dirent;
24 struct linux_dirent64; 24 struct linux_dirent64;
25 struct list_head; 25 struct list_head;
26 struct mmap_arg_struct; 26 struct mmap_arg_struct;
27 struct msgbuf; 27 struct msgbuf;
28 struct msghdr; 28 struct msghdr;
29 struct mmsghdr; 29 struct mmsghdr;
30 struct msqid_ds; 30 struct msqid_ds;
31 struct new_utsname; 31 struct new_utsname;
32 struct nfsctl_arg; 32 struct nfsctl_arg;
33 struct __old_kernel_stat; 33 struct __old_kernel_stat;
34 struct oldold_utsname; 34 struct oldold_utsname;
35 struct old_utsname; 35 struct old_utsname;
36 struct pollfd; 36 struct pollfd;
37 struct rlimit; 37 struct rlimit;
38 struct rlimit64; 38 struct rlimit64;
39 struct rusage; 39 struct rusage;
40 struct sched_param; 40 struct sched_param;
41 struct sel_arg_struct; 41 struct sel_arg_struct;
42 struct semaphore; 42 struct semaphore;
43 struct sembuf; 43 struct sembuf;
44 struct shmid_ds; 44 struct shmid_ds;
45 struct sockaddr; 45 struct sockaddr;
46 struct stat; 46 struct stat;
47 struct stat64; 47 struct stat64;
48 struct statfs; 48 struct statfs;
49 struct statfs64; 49 struct statfs64;
50 struct __sysctl_args; 50 struct __sysctl_args;
51 struct sysinfo; 51 struct sysinfo;
52 struct timespec; 52 struct timespec;
53 struct timeval; 53 struct timeval;
54 struct timex; 54 struct timex;
55 struct timezone; 55 struct timezone;
56 struct tms; 56 struct tms;
57 struct utimbuf; 57 struct utimbuf;
58 struct mq_attr; 58 struct mq_attr;
59 struct compat_stat; 59 struct compat_stat;
60 struct compat_timeval; 60 struct compat_timeval;
61 struct robust_list_head; 61 struct robust_list_head;
62 struct getcpu_cache; 62 struct getcpu_cache;
63 struct old_linux_dirent; 63 struct old_linux_dirent;
64 struct perf_event_attr; 64 struct perf_event_attr;
65 struct file_handle; 65 struct file_handle;
66 66
67 #include <linux/types.h> 67 #include <linux/types.h>
68 #include <linux/aio_abi.h> 68 #include <linux/aio_abi.h>
69 #include <linux/capability.h> 69 #include <linux/capability.h>
70 #include <linux/list.h> 70 #include <linux/list.h>
71 #include <linux/bug.h> 71 #include <linux/bug.h>
72 #include <linux/sem.h> 72 #include <linux/sem.h>
73 #include <asm/siginfo.h> 73 #include <asm/siginfo.h>
74 #include <asm/signal.h> 74 #include <asm/signal.h>
75 #include <linux/unistd.h> 75 #include <linux/unistd.h>
76 #include <linux/quota.h> 76 #include <linux/quota.h>
77 #include <linux/key.h> 77 #include <linux/key.h>
78 #include <trace/syscall.h> 78 #include <trace/syscall.h>
79 79
80 #define __SC_DECL1(t1, a1) t1 a1 80 #define __SC_DECL1(t1, a1) t1 a1
81 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) 81 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
82 #define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__) 82 #define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__)
83 #define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__) 83 #define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__)
84 #define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__) 84 #define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__)
85 #define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__) 85 #define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__)
86 86
87 #define __SC_LONG1(t1, a1) long a1 87 #define __SC_LONG1(t1, a1) long a1
88 #define __SC_LONG2(t2, a2, ...) long a2, __SC_LONG1(__VA_ARGS__) 88 #define __SC_LONG2(t2, a2, ...) long a2, __SC_LONG1(__VA_ARGS__)
89 #define __SC_LONG3(t3, a3, ...) long a3, __SC_LONG2(__VA_ARGS__) 89 #define __SC_LONG3(t3, a3, ...) long a3, __SC_LONG2(__VA_ARGS__)
90 #define __SC_LONG4(t4, a4, ...) long a4, __SC_LONG3(__VA_ARGS__) 90 #define __SC_LONG4(t4, a4, ...) long a4, __SC_LONG3(__VA_ARGS__)
91 #define __SC_LONG5(t5, a5, ...) long a5, __SC_LONG4(__VA_ARGS__) 91 #define __SC_LONG5(t5, a5, ...) long a5, __SC_LONG4(__VA_ARGS__)
92 #define __SC_LONG6(t6, a6, ...) long a6, __SC_LONG5(__VA_ARGS__) 92 #define __SC_LONG6(t6, a6, ...) long a6, __SC_LONG5(__VA_ARGS__)
93 93
94 #define __SC_CAST1(t1, a1) (t1) a1 94 #define __SC_CAST1(t1, a1) (t1) a1
95 #define __SC_CAST2(t2, a2, ...) (t2) a2, __SC_CAST1(__VA_ARGS__) 95 #define __SC_CAST2(t2, a2, ...) (t2) a2, __SC_CAST1(__VA_ARGS__)
96 #define __SC_CAST3(t3, a3, ...) (t3) a3, __SC_CAST2(__VA_ARGS__) 96 #define __SC_CAST3(t3, a3, ...) (t3) a3, __SC_CAST2(__VA_ARGS__)
97 #define __SC_CAST4(t4, a4, ...) (t4) a4, __SC_CAST3(__VA_ARGS__) 97 #define __SC_CAST4(t4, a4, ...) (t4) a4, __SC_CAST3(__VA_ARGS__)
98 #define __SC_CAST5(t5, a5, ...) (t5) a5, __SC_CAST4(__VA_ARGS__) 98 #define __SC_CAST5(t5, a5, ...) (t5) a5, __SC_CAST4(__VA_ARGS__)
99 #define __SC_CAST6(t6, a6, ...) (t6) a6, __SC_CAST5(__VA_ARGS__) 99 #define __SC_CAST6(t6, a6, ...) (t6) a6, __SC_CAST5(__VA_ARGS__)
100 100
101 #define __SC_TEST(type) BUILD_BUG_ON(sizeof(type) > sizeof(long)) 101 #define __SC_TEST(type) BUILD_BUG_ON(sizeof(type) > sizeof(long))
102 #define __SC_TEST1(t1, a1) __SC_TEST(t1) 102 #define __SC_TEST1(t1, a1) __SC_TEST(t1)
103 #define __SC_TEST2(t2, a2, ...) __SC_TEST(t2); __SC_TEST1(__VA_ARGS__) 103 #define __SC_TEST2(t2, a2, ...) __SC_TEST(t2); __SC_TEST1(__VA_ARGS__)
104 #define __SC_TEST3(t3, a3, ...) __SC_TEST(t3); __SC_TEST2(__VA_ARGS__) 104 #define __SC_TEST3(t3, a3, ...) __SC_TEST(t3); __SC_TEST2(__VA_ARGS__)
105 #define __SC_TEST4(t4, a4, ...) __SC_TEST(t4); __SC_TEST3(__VA_ARGS__) 105 #define __SC_TEST4(t4, a4, ...) __SC_TEST(t4); __SC_TEST3(__VA_ARGS__)
106 #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) 106 #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
107 #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) 107 #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
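These families unwind pair-wise, one (type, name) pair per step. A hand expansion for a two-argument case, written out here for illustration:

/*
 * Hand expansion for two (type, name) pairs, for illustration:
 *
 *   __SC_DECL2(unsigned int, fd, char __user *, buf)
 *	=> unsigned int fd, char __user * buf
 *
 *   __SC_LONG2(unsigned int, fd, char __user *, buf)
 *	=> long fd, long buf
 *
 *   __SC_CAST2(unsigned int, fd, char __user *, buf)
 *	=> (unsigned int) fd, (char __user *) buf
 *
 *   __SC_TEST2(unsigned int, fd, char __user *, buf)
 *	=> BUILD_BUG_ON(sizeof(unsigned int) > sizeof(long));
 *	   BUILD_BUG_ON(sizeof(char __user *) > sizeof(long))
 */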
108 108
109 #ifdef CONFIG_FTRACE_SYSCALLS 109 #ifdef CONFIG_FTRACE_SYSCALLS
110 #define __SC_STR_ADECL1(t, a) #a 110 #define __SC_STR_ADECL1(t, a) #a
111 #define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) 111 #define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
112 #define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__) 112 #define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__)
113 #define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__) 113 #define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__)
114 #define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__) 114 #define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__)
115 #define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__) 115 #define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__)
116 116
117 #define __SC_STR_TDECL1(t, a) #t 117 #define __SC_STR_TDECL1(t, a) #t
118 #define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__) 118 #define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__)
119 #define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__) 119 #define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__)
120 #define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__) 120 #define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__)
121 #define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) 121 #define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
122 #define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) 122 #define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
123 123
124 extern struct ftrace_event_class event_class_syscall_enter; 124 extern struct ftrace_event_class event_class_syscall_enter;
125 extern struct ftrace_event_class event_class_syscall_exit; 125 extern struct ftrace_event_class event_class_syscall_exit;
126 extern struct trace_event_functions enter_syscall_print_funcs; 126 extern struct trace_event_functions enter_syscall_print_funcs;
127 extern struct trace_event_functions exit_syscall_print_funcs; 127 extern struct trace_event_functions exit_syscall_print_funcs;
128 128
129 #define SYSCALL_TRACE_ENTER_EVENT(sname) \ 129 #define SYSCALL_TRACE_ENTER_EVENT(sname) \
130 static struct syscall_metadata __syscall_meta_##sname; \ 130 static struct syscall_metadata __syscall_meta_##sname; \
131 static struct ftrace_event_call __used \ 131 static struct ftrace_event_call __used \
132 event_enter_##sname = { \ 132 event_enter_##sname = { \
133 .name = "sys_enter"#sname, \ 133 .name = "sys_enter"#sname, \
134 .class = &event_class_syscall_enter, \ 134 .class = &event_class_syscall_enter, \
135 .event.funcs = &enter_syscall_print_funcs, \ 135 .event.funcs = &enter_syscall_print_funcs, \
136 .data = (void *)&__syscall_meta_##sname,\ 136 .data = (void *)&__syscall_meta_##sname,\
137 .flags = TRACE_EVENT_FL_CAP_ANY, \ 137 .flags = TRACE_EVENT_FL_CAP_ANY, \
138 }; \ 138 }; \
139 static struct ftrace_event_call __used \ 139 static struct ftrace_event_call __used \
140 __attribute__((section("_ftrace_events"))) \ 140 __attribute__((section("_ftrace_events"))) \
141 *__event_enter_##sname = &event_enter_##sname; 141 *__event_enter_##sname = &event_enter_##sname;
142 142
143 #define SYSCALL_TRACE_EXIT_EVENT(sname) \ 143 #define SYSCALL_TRACE_EXIT_EVENT(sname) \
144 static struct syscall_metadata __syscall_meta_##sname; \ 144 static struct syscall_metadata __syscall_meta_##sname; \
145 static struct ftrace_event_call __used \ 145 static struct ftrace_event_call __used \
146 event_exit_##sname = { \ 146 event_exit_##sname = { \
147 .name = "sys_exit"#sname, \ 147 .name = "sys_exit"#sname, \
148 .class = &event_class_syscall_exit, \ 148 .class = &event_class_syscall_exit, \
149 .event.funcs = &exit_syscall_print_funcs, \ 149 .event.funcs = &exit_syscall_print_funcs, \
150 .data = (void *)&__syscall_meta_##sname,\ 150 .data = (void *)&__syscall_meta_##sname,\
151 .flags = TRACE_EVENT_FL_CAP_ANY, \ 151 .flags = TRACE_EVENT_FL_CAP_ANY, \
152 }; \ 152 }; \
153 static struct ftrace_event_call __used \ 153 static struct ftrace_event_call __used \
154 __attribute__((section("_ftrace_events"))) \ 154 __attribute__((section("_ftrace_events"))) \
155 *__event_exit_##sname = &event_exit_##sname; 155 *__event_exit_##sname = &event_exit_##sname;
156 156
157 #define SYSCALL_METADATA(sname, nb) \ 157 #define SYSCALL_METADATA(sname, nb) \
158 SYSCALL_TRACE_ENTER_EVENT(sname); \ 158 SYSCALL_TRACE_ENTER_EVENT(sname); \
159 SYSCALL_TRACE_EXIT_EVENT(sname); \ 159 SYSCALL_TRACE_EXIT_EVENT(sname); \
160 static struct syscall_metadata __used \ 160 static struct syscall_metadata __used \
161 __syscall_meta_##sname = { \ 161 __syscall_meta_##sname = { \
162 .name = "sys"#sname, \ 162 .name = "sys"#sname, \
163 .syscall_nr = -1, /* Filled in at boot */ \ 163 .syscall_nr = -1, /* Filled in at boot */ \
164 .nb_args = nb, \ 164 .nb_args = nb, \
165 .types = types_##sname, \ 165 .types = types_##sname, \
166 .args = args_##sname, \ 166 .args = args_##sname, \
167 .enter_event = &event_enter_##sname, \ 167 .enter_event = &event_enter_##sname, \
168 .exit_event = &event_exit_##sname, \ 168 .exit_event = &event_exit_##sname, \
169 .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \ 169 .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
170 }; \ 170 }; \
171 static struct syscall_metadata __used \ 171 static struct syscall_metadata __used \
172 __attribute__((section("__syscalls_metadata"))) \ 172 __attribute__((section("__syscalls_metadata"))) \
173 *__p_syscall_meta_##sname = &__syscall_meta_##sname; 173 *__p_syscall_meta_##sname = &__syscall_meta_##sname;
174 174
175 #define SYSCALL_DEFINE0(sname) \ 175 #define SYSCALL_DEFINE0(sname) \
176 SYSCALL_TRACE_ENTER_EVENT(_##sname); \ 176 SYSCALL_TRACE_ENTER_EVENT(_##sname); \
177 SYSCALL_TRACE_EXIT_EVENT(_##sname); \ 177 SYSCALL_TRACE_EXIT_EVENT(_##sname); \
178 static struct syscall_metadata __used \ 178 static struct syscall_metadata __used \
179 __syscall_meta__##sname = { \ 179 __syscall_meta__##sname = { \
180 .name = "sys_"#sname, \ 180 .name = "sys_"#sname, \
181 .syscall_nr = -1, /* Filled in at boot */ \ 181 .syscall_nr = -1, /* Filled in at boot */ \
182 .nb_args = 0, \ 182 .nb_args = 0, \
183 .enter_event = &event_enter__##sname, \ 183 .enter_event = &event_enter__##sname, \
184 .exit_event = &event_exit__##sname, \ 184 .exit_event = &event_exit__##sname, \
185 .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \ 185 .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
186 }; \ 186 }; \
187 static struct syscall_metadata __used \ 187 static struct syscall_metadata __used \
188 __attribute__((section("__syscalls_metadata"))) \ 188 __attribute__((section("__syscalls_metadata"))) \
189 *__p_syscall_meta_##sname = &__syscall_meta__##sname; \ 189 *__p_syscall_meta_##sname = &__syscall_meta__##sname; \
190 asmlinkage long sys_##sname(void) 190 asmlinkage long sys_##sname(void)
191 #else 191 #else
192 #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) 192 #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void)
193 #endif 193 #endif
194 194
195 #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) 195 #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
196 #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) 196 #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
197 #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) 197 #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
198 #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) 198 #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
199 #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) 199 #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
200 #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) 200 #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
201 201
202 #ifdef CONFIG_PPC64 202 #ifdef CONFIG_PPC64
203 #define SYSCALL_ALIAS(alias, name) \ 203 #define SYSCALL_ALIAS(alias, name) \
204 asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \ 204 asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \
205 "\t.globl ." #alias "\n\t.set ." #alias ", ." #name) 205 "\t.globl ." #alias "\n\t.set ." #alias ", ." #name)
206 #else 206 #else
207 #if defined(CONFIG_ALPHA) || defined(CONFIG_MIPS) 207 #if defined(CONFIG_ALPHA) || defined(CONFIG_MIPS)
208 #define SYSCALL_ALIAS(alias, name) \ 208 #define SYSCALL_ALIAS(alias, name) \
209 asm ( #alias " = " #name "\n\t.globl " #alias) 209 asm ( #alias " = " #name "\n\t.globl " #alias)
210 #else 210 #else
211 #define SYSCALL_ALIAS(alias, name) \ 211 #define SYSCALL_ALIAS(alias, name) \
212 asm ("\t.globl " #alias "\n\t.set " #alias ", " #name) 212 asm ("\t.globl " #alias "\n\t.set " #alias ", " #name)
213 #endif 213 #endif
214 #endif 214 #endif
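Concretely, on the default (non-PPC64, non-Alpha/MIPS) branch the alias is a pure assembler-level rename. A hand expansion, for illustration:

/*
 * SYSCALL_ALIAS(sys_dup2, SyS_dup2) expands (default branch) to:
 *
 *   asm("\t.globl sys_dup2\n\t.set sys_dup2, SyS_dup2");
 *
 * i.e. sys_dup2 becomes an assembler alias for the SyS_ wrapper
 * emitted by __SYSCALL_DEFINEx() below.
 */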
215 215
216 #ifdef CONFIG_FTRACE_SYSCALLS 216 #ifdef CONFIG_FTRACE_SYSCALLS
217 #define SYSCALL_DEFINEx(x, sname, ...) \ 217 #define SYSCALL_DEFINEx(x, sname, ...) \
218 static const char *types_##sname[] = { \ 218 static const char *types_##sname[] = { \
219 __SC_STR_TDECL##x(__VA_ARGS__) \ 219 __SC_STR_TDECL##x(__VA_ARGS__) \
220 }; \ 220 }; \
221 static const char *args_##sname[] = { \ 221 static const char *args_##sname[] = { \
222 __SC_STR_ADECL##x(__VA_ARGS__) \ 222 __SC_STR_ADECL##x(__VA_ARGS__) \
223 }; \ 223 }; \
224 SYSCALL_METADATA(sname, x); \ 224 SYSCALL_METADATA(sname, x); \
225 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) 225 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
226 #else 226 #else
227 #define SYSCALL_DEFINEx(x, sname, ...) \ 227 #define SYSCALL_DEFINEx(x, sname, ...) \
228 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) 228 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
229 #endif 229 #endif
230 230
231 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 231 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
232 232
233 #define SYSCALL_DEFINE(name) static inline long SYSC_##name 233 #define SYSCALL_DEFINE(name) static inline long SYSC_##name
234 234
235 #define __SYSCALL_DEFINEx(x, name, ...) \ 235 #define __SYSCALL_DEFINEx(x, name, ...) \
236 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ 236 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \
237 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ 237 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \
238 asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ 238 asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \
239 { \ 239 { \
240 __SC_TEST##x(__VA_ARGS__); \ 240 __SC_TEST##x(__VA_ARGS__); \
241 return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \ 241 return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \
242 } \ 242 } \
243 SYSCALL_ALIAS(sys##name, SyS##name); \ 243 SYSCALL_ALIAS(sys##name, SyS##name); \
244 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)) 244 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
245 245
246 #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ 246 #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
247 247
248 #define SYSCALL_DEFINE(name) asmlinkage long sys_##name 248 #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
249 #define __SYSCALL_DEFINEx(x, name, ...) \ 249 #define __SYSCALL_DEFINEx(x, name, ...) \
250 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) 250 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
251 251
252 #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ 252 #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
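Putting the pieces together: with CONFIG_HAVE_SYSCALL_WRAPPERS=y and CONFIG_FTRACE_SYSCALLS=n, a hand expansion (illustrative, using fs/fcntl.c's SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)) comes out as:

/*
 * Hand expansion, for illustration:
 *
 *   asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd);
 *   static inline long SYSC_dup2(unsigned int oldfd, unsigned int newfd);
 *   asmlinkage long SyS_dup2(long oldfd, long newfd)
 *   {
 *	BUILD_BUG_ON(sizeof(unsigned int) > sizeof(long));
 *	BUILD_BUG_ON(sizeof(unsigned int) > sizeof(long));
 *	return (long) SYSC_dup2((unsigned int) oldfd, (unsigned int) newfd);
 *   }
 *   SYSCALL_ALIAS(sys_dup2, SyS_dup2);
 *   static inline long SYSC_dup2(unsigned int oldfd, unsigned int newfd)
 *   { ... the body written at the SYSCALL_DEFINE2() site ... }
 *
 * The SyS_ wrapper takes each argument as a register-width long and
 * casts down, so 32-bit arguments are narrowed in one well-defined
 * place on 64-bit architectures.
 */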
253 253
254 asmlinkage long sys_time(time_t __user *tloc); 254 asmlinkage long sys_time(time_t __user *tloc);
255 asmlinkage long sys_stime(time_t __user *tptr); 255 asmlinkage long sys_stime(time_t __user *tptr);
256 asmlinkage long sys_gettimeofday(struct timeval __user *tv, 256 asmlinkage long sys_gettimeofday(struct timeval __user *tv,
257 struct timezone __user *tz); 257 struct timezone __user *tz);
258 asmlinkage long sys_settimeofday(struct timeval __user *tv, 258 asmlinkage long sys_settimeofday(struct timeval __user *tv,
259 struct timezone __user *tz); 259 struct timezone __user *tz);
260 asmlinkage long sys_adjtimex(struct timex __user *txc_p); 260 asmlinkage long sys_adjtimex(struct timex __user *txc_p);
261 261
262 asmlinkage long sys_times(struct tms __user *tbuf); 262 asmlinkage long sys_times(struct tms __user *tbuf);
263 263
264 asmlinkage long sys_gettid(void); 264 asmlinkage long sys_gettid(void);
265 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp); 265 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp);
266 asmlinkage long sys_alarm(unsigned int seconds); 266 asmlinkage long sys_alarm(unsigned int seconds);
267 asmlinkage long sys_getpid(void); 267 asmlinkage long sys_getpid(void);
268 asmlinkage long sys_getppid(void); 268 asmlinkage long sys_getppid(void);
269 asmlinkage long sys_getuid(void); 269 asmlinkage long sys_getuid(void);
270 asmlinkage long sys_geteuid(void); 270 asmlinkage long sys_geteuid(void);
271 asmlinkage long sys_getgid(void); 271 asmlinkage long sys_getgid(void);
272 asmlinkage long sys_getegid(void); 272 asmlinkage long sys_getegid(void);
273 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid); 273 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid);
274 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid); 274 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid);
275 asmlinkage long sys_getpgid(pid_t pid); 275 asmlinkage long sys_getpgid(pid_t pid);
276 asmlinkage long sys_getpgrp(void); 276 asmlinkage long sys_getpgrp(void);
277 asmlinkage long sys_getsid(pid_t pid); 277 asmlinkage long sys_getsid(pid_t pid);
278 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist); 278 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist);
279 279
280 asmlinkage long sys_setregid(gid_t rgid, gid_t egid); 280 asmlinkage long sys_setregid(gid_t rgid, gid_t egid);
281 asmlinkage long sys_setgid(gid_t gid); 281 asmlinkage long sys_setgid(gid_t gid);
282 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid); 282 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid);
283 asmlinkage long sys_setuid(uid_t uid); 283 asmlinkage long sys_setuid(uid_t uid);
284 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid); 284 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid);
285 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid); 285 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid);
286 asmlinkage long sys_setfsuid(uid_t uid); 286 asmlinkage long sys_setfsuid(uid_t uid);
287 asmlinkage long sys_setfsgid(gid_t gid); 287 asmlinkage long sys_setfsgid(gid_t gid);
288 asmlinkage long sys_setpgid(pid_t pid, pid_t pgid); 288 asmlinkage long sys_setpgid(pid_t pid, pid_t pgid);
289 asmlinkage long sys_setsid(void); 289 asmlinkage long sys_setsid(void);
290 asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist); 290 asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist);
291 291
292 asmlinkage long sys_acct(const char __user *name); 292 asmlinkage long sys_acct(const char __user *name);
293 asmlinkage long sys_capget(cap_user_header_t header, 293 asmlinkage long sys_capget(cap_user_header_t header,
294 cap_user_data_t dataptr); 294 cap_user_data_t dataptr);
295 asmlinkage long sys_capset(cap_user_header_t header, 295 asmlinkage long sys_capset(cap_user_header_t header,
296 const cap_user_data_t data); 296 const cap_user_data_t data);
297 asmlinkage long sys_personality(unsigned int personality); 297 asmlinkage long sys_personality(unsigned int personality);
298 298
299 asmlinkage long sys_sigpending(old_sigset_t __user *set); 299 asmlinkage long sys_sigpending(old_sigset_t __user *set);
300 asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set, 300 asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set,
301 old_sigset_t __user *oset); 301 old_sigset_t __user *oset);
302 asmlinkage long sys_getitimer(int which, struct itimerval __user *value); 302 asmlinkage long sys_getitimer(int which, struct itimerval __user *value);
303 asmlinkage long sys_setitimer(int which, 303 asmlinkage long sys_setitimer(int which,
304 struct itimerval __user *value, 304 struct itimerval __user *value,
305 struct itimerval __user *ovalue); 305 struct itimerval __user *ovalue);
306 asmlinkage long sys_timer_create(clockid_t which_clock, 306 asmlinkage long sys_timer_create(clockid_t which_clock,
307 struct sigevent __user *timer_event_spec, 307 struct sigevent __user *timer_event_spec,
308 timer_t __user * created_timer_id); 308 timer_t __user * created_timer_id);
309 asmlinkage long sys_timer_gettime(timer_t timer_id, 309 asmlinkage long sys_timer_gettime(timer_t timer_id,
310 struct itimerspec __user *setting); 310 struct itimerspec __user *setting);
311 asmlinkage long sys_timer_getoverrun(timer_t timer_id); 311 asmlinkage long sys_timer_getoverrun(timer_t timer_id);
312 asmlinkage long sys_timer_settime(timer_t timer_id, int flags, 312 asmlinkage long sys_timer_settime(timer_t timer_id, int flags,
313 const struct itimerspec __user *new_setting, 313 const struct itimerspec __user *new_setting,
314 struct itimerspec __user *old_setting); 314 struct itimerspec __user *old_setting);
315 asmlinkage long sys_timer_delete(timer_t timer_id); 315 asmlinkage long sys_timer_delete(timer_t timer_id);
316 asmlinkage long sys_clock_settime(clockid_t which_clock, 316 asmlinkage long sys_clock_settime(clockid_t which_clock,
317 const struct timespec __user *tp); 317 const struct timespec __user *tp);
318 asmlinkage long sys_clock_gettime(clockid_t which_clock, 318 asmlinkage long sys_clock_gettime(clockid_t which_clock,
319 struct timespec __user *tp); 319 struct timespec __user *tp);
320 asmlinkage long sys_clock_adjtime(clockid_t which_clock, 320 asmlinkage long sys_clock_adjtime(clockid_t which_clock,
321 struct timex __user *tx); 321 struct timex __user *tx);
322 asmlinkage long sys_clock_getres(clockid_t which_clock, 322 asmlinkage long sys_clock_getres(clockid_t which_clock,
323 struct timespec __user *tp); 323 struct timespec __user *tp);
324 asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, 324 asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
325 const struct timespec __user *rqtp, 325 const struct timespec __user *rqtp,
326 struct timespec __user *rmtp); 326 struct timespec __user *rmtp);
327 327
328 asmlinkage long sys_nice(int increment); 328 asmlinkage long sys_nice(int increment);
329 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 329 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
330 struct sched_param __user *param); 330 struct sched_param __user *param);
331 asmlinkage long sys_sched_setparam(pid_t pid, 331 asmlinkage long sys_sched_setparam(pid_t pid,
332 struct sched_param __user *param); 332 struct sched_param __user *param);
333 asmlinkage long sys_sched_getscheduler(pid_t pid); 333 asmlinkage long sys_sched_getscheduler(pid_t pid);
334 asmlinkage long sys_sched_getparam(pid_t pid, 334 asmlinkage long sys_sched_getparam(pid_t pid,
335 struct sched_param __user *param); 335 struct sched_param __user *param);
336 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 336 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
337 unsigned long __user *user_mask_ptr); 337 unsigned long __user *user_mask_ptr);
338 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 338 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
339 unsigned long __user *user_mask_ptr); 339 unsigned long __user *user_mask_ptr);
340 asmlinkage long sys_sched_yield(void); 340 asmlinkage long sys_sched_yield(void);
341 asmlinkage long sys_sched_get_priority_max(int policy); 341 asmlinkage long sys_sched_get_priority_max(int policy);
342 asmlinkage long sys_sched_get_priority_min(int policy); 342 asmlinkage long sys_sched_get_priority_min(int policy);
343 asmlinkage long sys_sched_rr_get_interval(pid_t pid, 343 asmlinkage long sys_sched_rr_get_interval(pid_t pid,
344 struct timespec __user *interval); 344 struct timespec __user *interval);
345 asmlinkage long sys_setpriority(int which, int who, int niceval); 345 asmlinkage long sys_setpriority(int which, int who, int niceval);
346 asmlinkage long sys_getpriority(int which, int who); 346 asmlinkage long sys_getpriority(int which, int who);
347 347
348 asmlinkage long sys_shutdown(int, int); 348 asmlinkage long sys_shutdown(int, int);
349 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, 349 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
350 void __user *arg); 350 void __user *arg);
351 asmlinkage long sys_restart_syscall(void); 351 asmlinkage long sys_restart_syscall(void);
352 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 352 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
353 struct kexec_segment __user *segments, 353 struct kexec_segment __user *segments,
354 unsigned long flags); 354 unsigned long flags);
355 355
356 asmlinkage long sys_exit(int error_code); 356 asmlinkage long sys_exit(int error_code);
357 asmlinkage long sys_exit_group(int error_code); 357 asmlinkage long sys_exit_group(int error_code);
358 asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, 358 asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
359 int options, struct rusage __user *ru); 359 int options, struct rusage __user *ru);
360 asmlinkage long sys_waitid(int which, pid_t pid, 360 asmlinkage long sys_waitid(int which, pid_t pid,
361 struct siginfo __user *infop, 361 struct siginfo __user *infop,
362 int options, struct rusage __user *ru); 362 int options, struct rusage __user *ru);
363 asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options); 363 asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options);
364 asmlinkage long sys_set_tid_address(int __user *tidptr); 364 asmlinkage long sys_set_tid_address(int __user *tidptr);
365 asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, 365 asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
366 struct timespec __user *utime, u32 __user *uaddr2, 366 struct timespec __user *utime, u32 __user *uaddr2,
367 u32 val3); 367 u32 val3);
368 368
369 asmlinkage long sys_init_module(void __user *umod, unsigned long len, 369 asmlinkage long sys_init_module(void __user *umod, unsigned long len,
370 const char __user *uargs); 370 const char __user *uargs);
371 asmlinkage long sys_delete_module(const char __user *name_user, 371 asmlinkage long sys_delete_module(const char __user *name_user,
372 unsigned int flags); 372 unsigned int flags);
373 373
374 asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set, 374 asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set,
375 sigset_t __user *oset, size_t sigsetsize); 375 sigset_t __user *oset, size_t sigsetsize);
376 asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize); 376 asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize);
377 asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, 377 asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
378 siginfo_t __user *uinfo, 378 siginfo_t __user *uinfo,
379 const struct timespec __user *uts, 379 const struct timespec __user *uts,
380 size_t sigsetsize); 380 size_t sigsetsize);
381 asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, 381 asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
382 siginfo_t __user *uinfo); 382 siginfo_t __user *uinfo);
383 asmlinkage long sys_kill(int pid, int sig); 383 asmlinkage long sys_kill(int pid, int sig);
384 asmlinkage long sys_tgkill(int tgid, int pid, int sig); 384 asmlinkage long sys_tgkill(int tgid, int pid, int sig);
385 asmlinkage long sys_tkill(int pid, int sig); 385 asmlinkage long sys_tkill(int pid, int sig);
386 asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); 386 asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo);
387 asmlinkage long sys_sgetmask(void); 387 asmlinkage long sys_sgetmask(void);
388 asmlinkage long sys_ssetmask(int newmask); 388 asmlinkage long sys_ssetmask(int newmask);
389 asmlinkage long sys_signal(int sig, __sighandler_t handler); 389 asmlinkage long sys_signal(int sig, __sighandler_t handler);
390 asmlinkage long sys_pause(void); 390 asmlinkage long sys_pause(void);
391 391
392 asmlinkage long sys_sync(void); 392 asmlinkage long sys_sync(void);
393 asmlinkage long sys_fsync(unsigned int fd); 393 asmlinkage long sys_fsync(unsigned int fd);
394 asmlinkage long sys_fdatasync(unsigned int fd); 394 asmlinkage long sys_fdatasync(unsigned int fd);
395 asmlinkage long sys_bdflush(int func, long data); 395 asmlinkage long sys_bdflush(int func, long data);
396 asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name, 396 asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name,
397 char __user *type, unsigned long flags, 397 char __user *type, unsigned long flags,
398 void __user *data); 398 void __user *data);
399 asmlinkage long sys_umount(char __user *name, int flags); 399 asmlinkage long sys_umount(char __user *name, int flags);
400 asmlinkage long sys_oldumount(char __user *name); 400 asmlinkage long sys_oldumount(char __user *name);
401 asmlinkage long sys_truncate(const char __user *path, long length); 401 asmlinkage long sys_truncate(const char __user *path, long length);
402 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); 402 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
403 asmlinkage long sys_stat(const char __user *filename, 403 asmlinkage long sys_stat(const char __user *filename,
404 struct __old_kernel_stat __user *statbuf); 404 struct __old_kernel_stat __user *statbuf);
405 asmlinkage long sys_statfs(const char __user * path, 405 asmlinkage long sys_statfs(const char __user * path,
406 struct statfs __user *buf); 406 struct statfs __user *buf);
407 asmlinkage long sys_statfs64(const char __user *path, size_t sz, 407 asmlinkage long sys_statfs64(const char __user *path, size_t sz,
408 struct statfs64 __user *buf); 408 struct statfs64 __user *buf);
409 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf); 409 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
410 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, 410 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
411 struct statfs64 __user *buf); 411 struct statfs64 __user *buf);
412 asmlinkage long sys_lstat(const char __user *filename, 412 asmlinkage long sys_lstat(const char __user *filename,
413 struct __old_kernel_stat __user *statbuf); 413 struct __old_kernel_stat __user *statbuf);
414 asmlinkage long sys_fstat(unsigned int fd, 414 asmlinkage long sys_fstat(unsigned int fd,
415 struct __old_kernel_stat __user *statbuf); 415 struct __old_kernel_stat __user *statbuf);
416 asmlinkage long sys_newstat(const char __user *filename, 416 asmlinkage long sys_newstat(const char __user *filename,
417 struct stat __user *statbuf); 417 struct stat __user *statbuf);
418 asmlinkage long sys_newlstat(const char __user *filename, 418 asmlinkage long sys_newlstat(const char __user *filename,
419 struct stat __user *statbuf); 419 struct stat __user *statbuf);
420 asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf); 420 asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
421 asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf); 421 asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf);
422 #if BITS_PER_LONG == 32 422 #if BITS_PER_LONG == 32
423 asmlinkage long sys_stat64(const char __user *filename, 423 asmlinkage long sys_stat64(const char __user *filename,
424 struct stat64 __user *statbuf); 424 struct stat64 __user *statbuf);
425 asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf); 425 asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
426 asmlinkage long sys_lstat64(const char __user *filename, 426 asmlinkage long sys_lstat64(const char __user *filename,
427 struct stat64 __user *statbuf); 427 struct stat64 __user *statbuf);
428 asmlinkage long sys_truncate64(const char __user *path, loff_t length); 428 asmlinkage long sys_truncate64(const char __user *path, loff_t length);
429 asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); 429 asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length);
430 #endif 430 #endif
431 431
432 asmlinkage long sys_setxattr(const char __user *path, const char __user *name, 432 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
433 const void __user *value, size_t size, int flags); 433 const void __user *value, size_t size, int flags);
434 asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name, 434 asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name,
435 const void __user *value, size_t size, int flags); 435 const void __user *value, size_t size, int flags);
436 asmlinkage long sys_fsetxattr(int fd, const char __user *name, 436 asmlinkage long sys_fsetxattr(int fd, const char __user *name,
437 const void __user *value, size_t size, int flags); 437 const void __user *value, size_t size, int flags);
438 asmlinkage long sys_getxattr(const char __user *path, const char __user *name, 438 asmlinkage long sys_getxattr(const char __user *path, const char __user *name,
439 void __user *value, size_t size); 439 void __user *value, size_t size);
440 asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name, 440 asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name,
441 void __user *value, size_t size); 441 void __user *value, size_t size);
442 asmlinkage long sys_fgetxattr(int fd, const char __user *name, 442 asmlinkage long sys_fgetxattr(int fd, const char __user *name,
443 void __user *value, size_t size); 443 void __user *value, size_t size);
444 asmlinkage long sys_listxattr(const char __user *path, char __user *list, 444 asmlinkage long sys_listxattr(const char __user *path, char __user *list,
445 size_t size); 445 size_t size);
446 asmlinkage long sys_llistxattr(const char __user *path, char __user *list, 446 asmlinkage long sys_llistxattr(const char __user *path, char __user *list,
447 size_t size); 447 size_t size);
448 asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size); 448 asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size);
449 asmlinkage long sys_removexattr(const char __user *path, 449 asmlinkage long sys_removexattr(const char __user *path,
450 const char __user *name); 450 const char __user *name);
451 asmlinkage long sys_lremovexattr(const char __user *path, 451 asmlinkage long sys_lremovexattr(const char __user *path,
452 const char __user *name); 452 const char __user *name);
453 asmlinkage long sys_fremovexattr(int fd, const char __user *name); 453 asmlinkage long sys_fremovexattr(int fd, const char __user *name);
454 454
455 asmlinkage long sys_brk(unsigned long brk); 455 asmlinkage long sys_brk(unsigned long brk);
456 asmlinkage long sys_mprotect(unsigned long start, size_t len, 456 asmlinkage long sys_mprotect(unsigned long start, size_t len,
457 unsigned long prot); 457 unsigned long prot);
458 asmlinkage long sys_mremap(unsigned long addr, 458 asmlinkage long sys_mremap(unsigned long addr,
459 unsigned long old_len, unsigned long new_len, 459 unsigned long old_len, unsigned long new_len,
460 unsigned long flags, unsigned long new_addr); 460 unsigned long flags, unsigned long new_addr);
461 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, 461 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
462 unsigned long prot, unsigned long pgoff, 462 unsigned long prot, unsigned long pgoff,
463 unsigned long flags); 463 unsigned long flags);
464 asmlinkage long sys_msync(unsigned long start, size_t len, int flags); 464 asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
465 asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice); 465 asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice);
466 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); 466 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);
467 asmlinkage long sys_munmap(unsigned long addr, size_t len); 467 asmlinkage long sys_munmap(unsigned long addr, size_t len);
468 asmlinkage long sys_mlock(unsigned long start, size_t len); 468 asmlinkage long sys_mlock(unsigned long start, size_t len);
469 asmlinkage long sys_munlock(unsigned long start, size_t len); 469 asmlinkage long sys_munlock(unsigned long start, size_t len);
470 asmlinkage long sys_mlockall(int flags); 470 asmlinkage long sys_mlockall(int flags);
471 asmlinkage long sys_munlockall(void); 471 asmlinkage long sys_munlockall(void);
472 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); 472 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
473 asmlinkage long sys_mincore(unsigned long start, size_t len, 473 asmlinkage long sys_mincore(unsigned long start, size_t len,
474 unsigned char __user * vec); 474 unsigned char __user * vec);
475 475
476 asmlinkage long sys_pivot_root(const char __user *new_root, 476 asmlinkage long sys_pivot_root(const char __user *new_root,
477 const char __user *put_old); 477 const char __user *put_old);
478 asmlinkage long sys_chroot(const char __user *filename); 478 asmlinkage long sys_chroot(const char __user *filename);
479 asmlinkage long sys_mknod(const char __user *filename, umode_t mode, 479 asmlinkage long sys_mknod(const char __user *filename, umode_t mode,
480 unsigned dev); 480 unsigned dev);
481 asmlinkage long sys_link(const char __user *oldname, 481 asmlinkage long sys_link(const char __user *oldname,
482 const char __user *newname); 482 const char __user *newname);
483 asmlinkage long sys_symlink(const char __user *old, const char __user *new); 483 asmlinkage long sys_symlink(const char __user *old, const char __user *new);
484 asmlinkage long sys_unlink(const char __user *pathname); 484 asmlinkage long sys_unlink(const char __user *pathname);
485 asmlinkage long sys_rename(const char __user *oldname, 485 asmlinkage long sys_rename(const char __user *oldname,
486 const char __user *newname); 486 const char __user *newname);
487 asmlinkage long sys_chmod(const char __user *filename, umode_t mode); 487 asmlinkage long sys_chmod(const char __user *filename, umode_t mode);
488 asmlinkage long sys_fchmod(unsigned int fd, umode_t mode); 488 asmlinkage long sys_fchmod(unsigned int fd, umode_t mode);
489 489
490 asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg); 490 asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg);
491 #if BITS_PER_LONG == 32 491 #if BITS_PER_LONG == 32
492 asmlinkage long sys_fcntl64(unsigned int fd, 492 asmlinkage long sys_fcntl64(unsigned int fd,
493 unsigned int cmd, unsigned long arg); 493 unsigned int cmd, unsigned long arg);
494 #endif 494 #endif
495 asmlinkage long sys_pipe(int __user *fildes); 495 asmlinkage long sys_pipe(int __user *fildes);
496 asmlinkage long sys_pipe2(int __user *fildes, int flags); 496 asmlinkage long sys_pipe2(int __user *fildes, int flags);
497 asmlinkage long sys_dup(unsigned int fildes); 497 asmlinkage long sys_dup(unsigned int fildes);
498 asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); 498 asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd);
499 asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags); 499 asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags);
500 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); 500 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
501 asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, 501 asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd,
502 unsigned long arg); 502 unsigned long arg);
503 asmlinkage long sys_flock(unsigned int fd, unsigned int cmd); 503 asmlinkage long sys_flock(unsigned int fd, unsigned int cmd);
504 asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx); 504 asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx);
505 asmlinkage long sys_io_destroy(aio_context_t ctx); 505 asmlinkage long sys_io_destroy(aio_context_t ctx);
506 asmlinkage long sys_io_getevents(aio_context_t ctx_id, 506 asmlinkage long sys_io_getevents(aio_context_t ctx_id,
507 long min_nr, 507 long min_nr,
508 long nr, 508 long nr,
509 struct io_event __user *events, 509 struct io_event __user *events,
510 struct timespec __user *timeout); 510 struct timespec __user *timeout);
511 asmlinkage long sys_io_submit(aio_context_t, long, 511 asmlinkage long sys_io_submit(aio_context_t, long,
512 struct iocb __user * __user *); 512 struct iocb __user * __user *);
513 asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, 513 asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
514 struct io_event __user *result); 514 struct io_event __user *result);
515 asmlinkage long sys_sendfile(int out_fd, int in_fd, 515 asmlinkage long sys_sendfile(int out_fd, int in_fd,
516 off_t __user *offset, size_t count); 516 off_t __user *offset, size_t count);
517 asmlinkage long sys_sendfile64(int out_fd, int in_fd, 517 asmlinkage long sys_sendfile64(int out_fd, int in_fd,
518 loff_t __user *offset, size_t count); 518 loff_t __user *offset, size_t count);
519 asmlinkage long sys_readlink(const char __user *path, 519 asmlinkage long sys_readlink(const char __user *path,
520 char __user *buf, int bufsiz); 520 char __user *buf, int bufsiz);
521 asmlinkage long sys_creat(const char __user *pathname, umode_t mode); 521 asmlinkage long sys_creat(const char __user *pathname, umode_t mode);
522 asmlinkage long sys_open(const char __user *filename, 522 asmlinkage long sys_open(const char __user *filename,
523 int flags, umode_t mode); 523 int flags, umode_t mode);
524 asmlinkage long sys_close(unsigned int fd); 524 asmlinkage long sys_close(unsigned int fd);
525 asmlinkage long sys_access(const char __user *filename, int mode); 525 asmlinkage long sys_access(const char __user *filename, int mode);
526 asmlinkage long sys_vhangup(void); 526 asmlinkage long sys_vhangup(void);
527 asmlinkage long sys_chown(const char __user *filename, 527 asmlinkage long sys_chown(const char __user *filename,
528 uid_t user, gid_t group); 528 uid_t user, gid_t group);
529 asmlinkage long sys_lchown(const char __user *filename, 529 asmlinkage long sys_lchown(const char __user *filename,
530 uid_t user, gid_t group); 530 uid_t user, gid_t group);
531 asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); 531 asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
532 #ifdef CONFIG_UID16 532 #ifdef CONFIG_UID16
533 asmlinkage long sys_chown16(const char __user *filename, 533 asmlinkage long sys_chown16(const char __user *filename,
534 old_uid_t user, old_gid_t group); 534 old_uid_t user, old_gid_t group);
535 asmlinkage long sys_lchown16(const char __user *filename, 535 asmlinkage long sys_lchown16(const char __user *filename,
536 old_uid_t user, old_gid_t group); 536 old_uid_t user, old_gid_t group);
537 asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group); 537 asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group);
538 asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid); 538 asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid);
539 asmlinkage long sys_setgid16(old_gid_t gid); 539 asmlinkage long sys_setgid16(old_gid_t gid);
540 asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid); 540 asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid);
541 asmlinkage long sys_setuid16(old_uid_t uid); 541 asmlinkage long sys_setuid16(old_uid_t uid);
542 asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid); 542 asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid);
543 asmlinkage long sys_getresuid16(old_uid_t __user *ruid, 543 asmlinkage long sys_getresuid16(old_uid_t __user *ruid,
544 old_uid_t __user *euid, old_uid_t __user *suid); 544 old_uid_t __user *euid, old_uid_t __user *suid);
545 asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid); 545 asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid);
546 asmlinkage long sys_getresgid16(old_gid_t __user *rgid, 546 asmlinkage long sys_getresgid16(old_gid_t __user *rgid,
547 old_gid_t __user *egid, old_gid_t __user *sgid); 547 old_gid_t __user *egid, old_gid_t __user *sgid);
548 asmlinkage long sys_setfsuid16(old_uid_t uid); 548 asmlinkage long sys_setfsuid16(old_uid_t uid);
549 asmlinkage long sys_setfsgid16(old_gid_t gid); 549 asmlinkage long sys_setfsgid16(old_gid_t gid);
550 asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist); 550 asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist);
551 asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist); 551 asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist);
552 asmlinkage long sys_getuid16(void); 552 asmlinkage long sys_getuid16(void);
553 asmlinkage long sys_geteuid16(void); 553 asmlinkage long sys_geteuid16(void);
554 asmlinkage long sys_getgid16(void); 554 asmlinkage long sys_getgid16(void);
555 asmlinkage long sys_getegid16(void); 555 asmlinkage long sys_getegid16(void);
556 #endif 556 #endif
557 557
558 asmlinkage long sys_utime(char __user *filename, 558 asmlinkage long sys_utime(char __user *filename,
559 struct utimbuf __user *times); 559 struct utimbuf __user *times);
560 asmlinkage long sys_utimes(char __user *filename, 560 asmlinkage long sys_utimes(char __user *filename,
561 struct timeval __user *utimes); 561 struct timeval __user *utimes);
562 asmlinkage long sys_lseek(unsigned int fd, off_t offset, 562 asmlinkage long sys_lseek(unsigned int fd, off_t offset,
563 unsigned int origin); 563 unsigned int whence);
564 asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, 564 asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
565 unsigned long offset_low, loff_t __user *result, 565 unsigned long offset_low, loff_t __user *result,
566 unsigned int origin); 566 unsigned int whence);
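The two declarations above are the heart of this commit in syscalls.h: the third argument of sys_lseek and the final argument of sys_llseek are renamed from "origin" to "whence". A minimal userspace sketch, assuming only standard <unistd.h>/<fcntl.h> and a hypothetical file name, shows why "whence" is the conventional name: the argument selects the reference point the offset is measured from (SEEK_SET, SEEK_CUR, or SEEK_END). Note that sys_llseek is the 32-bit-ABI variant: it rebuilds the 64-bit offset from offset_high/offset_low and writes the resulting position through *result.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("example.dat", O_RDONLY);	/* hypothetical file */
		if (fd < 0)
			return 1;

		off_t beg = lseek(fd, 0, SEEK_SET);	/* whence = SEEK_SET: from the start */
		off_t cur = lseek(fd, 8, SEEK_CUR);	/* whence = SEEK_CUR: from the current position */
		off_t end = lseek(fd, 0, SEEK_END);	/* whence = SEEK_END: from the end, i.e. the file size */

		printf("beg=%lld cur=%lld size=%lld\n",
		       (long long)beg, (long long)cur, (long long)end);
		close(fd);
		return 0;
	}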
567 asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); 567 asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);
568 asmlinkage long sys_readahead(int fd, loff_t offset, size_t count); 568 asmlinkage long sys_readahead(int fd, loff_t offset, size_t count);
569 asmlinkage long sys_readv(unsigned long fd, 569 asmlinkage long sys_readv(unsigned long fd,
570 const struct iovec __user *vec, 570 const struct iovec __user *vec,
571 unsigned long vlen); 571 unsigned long vlen);
572 asmlinkage long sys_write(unsigned int fd, const char __user *buf, 572 asmlinkage long sys_write(unsigned int fd, const char __user *buf,
573 size_t count); 573 size_t count);
574 asmlinkage long sys_writev(unsigned long fd, 574 asmlinkage long sys_writev(unsigned long fd,
575 const struct iovec __user *vec, 575 const struct iovec __user *vec,
576 unsigned long vlen); 576 unsigned long vlen);
577 asmlinkage long sys_pread64(unsigned int fd, char __user *buf, 577 asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
578 size_t count, loff_t pos); 578 size_t count, loff_t pos);
579 asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, 579 asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
580 size_t count, loff_t pos); 580 size_t count, loff_t pos);
581 asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, 581 asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
582 unsigned long vlen, unsigned long pos_l, unsigned long pos_h); 582 unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
583 asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, 583 asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
584 unsigned long vlen, unsigned long pos_l, unsigned long pos_h); 584 unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
585 asmlinkage long sys_getcwd(char __user *buf, unsigned long size); 585 asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
586 asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode); 586 asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode);
587 asmlinkage long sys_chdir(const char __user *filename); 587 asmlinkage long sys_chdir(const char __user *filename);
588 asmlinkage long sys_fchdir(unsigned int fd); 588 asmlinkage long sys_fchdir(unsigned int fd);
589 asmlinkage long sys_rmdir(const char __user *pathname); 589 asmlinkage long sys_rmdir(const char __user *pathname);
590 asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len); 590 asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len);
591 asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, 591 asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special,
592 qid_t id, void __user *addr); 592 qid_t id, void __user *addr);
593 asmlinkage long sys_getdents(unsigned int fd, 593 asmlinkage long sys_getdents(unsigned int fd,
594 struct linux_dirent __user *dirent, 594 struct linux_dirent __user *dirent,
595 unsigned int count); 595 unsigned int count);
596 asmlinkage long sys_getdents64(unsigned int fd, 596 asmlinkage long sys_getdents64(unsigned int fd,
597 struct linux_dirent64 __user *dirent, 597 struct linux_dirent64 __user *dirent,
598 unsigned int count); 598 unsigned int count);
599 599
600 asmlinkage long sys_setsockopt(int fd, int level, int optname, 600 asmlinkage long sys_setsockopt(int fd, int level, int optname,
601 char __user *optval, int optlen); 601 char __user *optval, int optlen);
602 asmlinkage long sys_getsockopt(int fd, int level, int optname, 602 asmlinkage long sys_getsockopt(int fd, int level, int optname,
603 char __user *optval, int __user *optlen); 603 char __user *optval, int __user *optlen);
604 asmlinkage long sys_bind(int, struct sockaddr __user *, int); 604 asmlinkage long sys_bind(int, struct sockaddr __user *, int);
605 asmlinkage long sys_connect(int, struct sockaddr __user *, int); 605 asmlinkage long sys_connect(int, struct sockaddr __user *, int);
606 asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *); 606 asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *);
607 asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int); 607 asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
608 asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *); 608 asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *);
609 asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *); 609 asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *);
610 asmlinkage long sys_send(int, void __user *, size_t, unsigned); 610 asmlinkage long sys_send(int, void __user *, size_t, unsigned);
611 asmlinkage long sys_sendto(int, void __user *, size_t, unsigned, 611 asmlinkage long sys_sendto(int, void __user *, size_t, unsigned,
612 struct sockaddr __user *, int); 612 struct sockaddr __user *, int);
613 asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags); 613 asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
614 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg, 614 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
615 unsigned int vlen, unsigned flags); 615 unsigned int vlen, unsigned flags);
616 asmlinkage long sys_recv(int, void __user *, size_t, unsigned); 616 asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
617 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned, 617 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
618 struct sockaddr __user *, int __user *); 618 struct sockaddr __user *, int __user *);
619 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags); 619 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
620 asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg, 620 asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
621 unsigned int vlen, unsigned flags, 621 unsigned int vlen, unsigned flags,
622 struct timespec __user *timeout); 622 struct timespec __user *timeout);
623 asmlinkage long sys_socket(int, int, int); 623 asmlinkage long sys_socket(int, int, int);
624 asmlinkage long sys_socketpair(int, int, int, int __user *); 624 asmlinkage long sys_socketpair(int, int, int, int __user *);
625 asmlinkage long sys_socketcall(int call, unsigned long __user *args); 625 asmlinkage long sys_socketcall(int call, unsigned long __user *args);
626 asmlinkage long sys_listen(int, int); 626 asmlinkage long sys_listen(int, int);
627 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, 627 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
628 int timeout); 628 int timeout);
629 asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, 629 asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
630 fd_set __user *exp, struct timeval __user *tvp); 630 fd_set __user *exp, struct timeval __user *tvp);
631 asmlinkage long sys_old_select(struct sel_arg_struct __user *arg); 631 asmlinkage long sys_old_select(struct sel_arg_struct __user *arg);
632 asmlinkage long sys_epoll_create(int size); 632 asmlinkage long sys_epoll_create(int size);
633 asmlinkage long sys_epoll_create1(int flags); 633 asmlinkage long sys_epoll_create1(int flags);
634 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, 634 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
635 struct epoll_event __user *event); 635 struct epoll_event __user *event);
636 asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, 636 asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
637 int maxevents, int timeout); 637 int maxevents, int timeout);
638 asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, 638 asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
639 int maxevents, int timeout, 639 int maxevents, int timeout,
640 const sigset_t __user *sigmask, 640 const sigset_t __user *sigmask,
641 size_t sigsetsize); 641 size_t sigsetsize);
642 asmlinkage long sys_gethostname(char __user *name, int len); 642 asmlinkage long sys_gethostname(char __user *name, int len);
643 asmlinkage long sys_sethostname(char __user *name, int len); 643 asmlinkage long sys_sethostname(char __user *name, int len);
644 asmlinkage long sys_setdomainname(char __user *name, int len); 644 asmlinkage long sys_setdomainname(char __user *name, int len);
645 asmlinkage long sys_newuname(struct new_utsname __user *name); 645 asmlinkage long sys_newuname(struct new_utsname __user *name);
646 asmlinkage long sys_uname(struct old_utsname __user *); 646 asmlinkage long sys_uname(struct old_utsname __user *);
647 asmlinkage long sys_olduname(struct oldold_utsname __user *); 647 asmlinkage long sys_olduname(struct oldold_utsname __user *);
648 648
649 asmlinkage long sys_getrlimit(unsigned int resource, 649 asmlinkage long sys_getrlimit(unsigned int resource,
650 struct rlimit __user *rlim); 650 struct rlimit __user *rlim);
651 #if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64)) 651 #if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64))
652 asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim); 652 asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim);
653 #endif 653 #endif
654 asmlinkage long sys_setrlimit(unsigned int resource, 654 asmlinkage long sys_setrlimit(unsigned int resource,
655 struct rlimit __user *rlim); 655 struct rlimit __user *rlim);
656 asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource, 656 asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
657 const struct rlimit64 __user *new_rlim, 657 const struct rlimit64 __user *new_rlim,
658 struct rlimit64 __user *old_rlim); 658 struct rlimit64 __user *old_rlim);
659 asmlinkage long sys_getrusage(int who, struct rusage __user *ru); 659 asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
660 asmlinkage long sys_umask(int mask); 660 asmlinkage long sys_umask(int mask);
661 661
662 asmlinkage long sys_msgget(key_t key, int msgflg); 662 asmlinkage long sys_msgget(key_t key, int msgflg);
663 asmlinkage long sys_msgsnd(int msqid, struct msgbuf __user *msgp, 663 asmlinkage long sys_msgsnd(int msqid, struct msgbuf __user *msgp,
664 size_t msgsz, int msgflg); 664 size_t msgsz, int msgflg);
665 asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp, 665 asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp,
666 size_t msgsz, long msgtyp, int msgflg); 666 size_t msgsz, long msgtyp, int msgflg);
667 asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf); 667 asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf);
668 668
669 asmlinkage long sys_semget(key_t key, int nsems, int semflg); 669 asmlinkage long sys_semget(key_t key, int nsems, int semflg);
670 asmlinkage long sys_semop(int semid, struct sembuf __user *sops, 670 asmlinkage long sys_semop(int semid, struct sembuf __user *sops,
671 unsigned nsops); 671 unsigned nsops);
672 asmlinkage long sys_semctl(int semid, int semnum, int cmd, union semun arg); 672 asmlinkage long sys_semctl(int semid, int semnum, int cmd, union semun arg);
673 asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops, 673 asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops,
674 unsigned nsops, 674 unsigned nsops,
675 const struct timespec __user *timeout); 675 const struct timespec __user *timeout);
676 asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg); 676 asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
677 asmlinkage long sys_shmget(key_t key, size_t size, int flag); 677 asmlinkage long sys_shmget(key_t key, size_t size, int flag);
678 asmlinkage long sys_shmdt(char __user *shmaddr); 678 asmlinkage long sys_shmdt(char __user *shmaddr);
679 asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); 679 asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
680 asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second, 680 asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second,
681 unsigned long third, void __user *ptr, long fifth); 681 unsigned long third, void __user *ptr, long fifth);
682 682
683 asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr); 683 asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr);
684 asmlinkage long sys_mq_unlink(const char __user *name); 684 asmlinkage long sys_mq_unlink(const char __user *name);
685 asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); 685 asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
686 asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); 686 asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
687 asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); 687 asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
688 asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); 688 asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
689 689
690 asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn); 690 asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn);
691 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, 691 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
692 unsigned long off, unsigned long len, 692 unsigned long off, unsigned long len,
693 void __user *buf); 693 void __user *buf);
694 asmlinkage long sys_pciconfig_write(unsigned long bus, unsigned long dfn, 694 asmlinkage long sys_pciconfig_write(unsigned long bus, unsigned long dfn,
695 unsigned long off, unsigned long len, 695 unsigned long off, unsigned long len,
696 void __user *buf); 696 void __user *buf);
697 697
698 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 698 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
699 unsigned long arg4, unsigned long arg5); 699 unsigned long arg4, unsigned long arg5);
700 asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags); 700 asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
701 asmlinkage long sys_swapoff(const char __user *specialfile); 701 asmlinkage long sys_swapoff(const char __user *specialfile);
702 asmlinkage long sys_sysctl(struct __sysctl_args __user *args); 702 asmlinkage long sys_sysctl(struct __sysctl_args __user *args);
703 asmlinkage long sys_sysinfo(struct sysinfo __user *info); 703 asmlinkage long sys_sysinfo(struct sysinfo __user *info);
704 asmlinkage long sys_sysfs(int option, 704 asmlinkage long sys_sysfs(int option,
705 unsigned long arg1, unsigned long arg2); 705 unsigned long arg1, unsigned long arg2);
706 asmlinkage long sys_syslog(int type, char __user *buf, int len); 706 asmlinkage long sys_syslog(int type, char __user *buf, int len);
707 asmlinkage long sys_uselib(const char __user *library); 707 asmlinkage long sys_uselib(const char __user *library);
708 asmlinkage long sys_ni_syscall(void); 708 asmlinkage long sys_ni_syscall(void);
709 asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, 709 asmlinkage long sys_ptrace(long request, long pid, unsigned long addr,
710 unsigned long data); 710 unsigned long data);
711 711
712 asmlinkage long sys_add_key(const char __user *_type, 712 asmlinkage long sys_add_key(const char __user *_type,
713 const char __user *_description, 713 const char __user *_description,
714 const void __user *_payload, 714 const void __user *_payload,
715 size_t plen, 715 size_t plen,
716 key_serial_t destringid); 716 key_serial_t destringid);
717 717
718 asmlinkage long sys_request_key(const char __user *_type, 718 asmlinkage long sys_request_key(const char __user *_type,
719 const char __user *_description, 719 const char __user *_description,
720 const char __user *_callout_info, 720 const char __user *_callout_info,
721 key_serial_t destringid); 721 key_serial_t destringid);
722 722
723 asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3, 723 asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
724 unsigned long arg4, unsigned long arg5); 724 unsigned long arg4, unsigned long arg5);
725 725
726 asmlinkage long sys_ioprio_set(int which, int who, int ioprio); 726 asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
727 asmlinkage long sys_ioprio_get(int which, int who); 727 asmlinkage long sys_ioprio_get(int which, int who);
728 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 728 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
729 unsigned long maxnode); 729 unsigned long maxnode);
730 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, 730 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
731 const unsigned long __user *from, 731 const unsigned long __user *from,
732 const unsigned long __user *to); 732 const unsigned long __user *to);
733 asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, 733 asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
734 const void __user * __user *pages, 734 const void __user * __user *pages,
735 const int __user *nodes, 735 const int __user *nodes,
736 int __user *status, 736 int __user *status,
737 int flags); 737 int flags);
738 asmlinkage long sys_mbind(unsigned long start, unsigned long len, 738 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
739 unsigned long mode, 739 unsigned long mode,
740 unsigned long __user *nmask, 740 unsigned long __user *nmask,
741 unsigned long maxnode, 741 unsigned long maxnode,
742 unsigned flags); 742 unsigned flags);
743 asmlinkage long sys_get_mempolicy(int __user *policy, 743 asmlinkage long sys_get_mempolicy(int __user *policy,
744 unsigned long __user *nmask, 744 unsigned long __user *nmask,
745 unsigned long maxnode, 745 unsigned long maxnode,
746 unsigned long addr, unsigned long flags); 746 unsigned long addr, unsigned long flags);
747 747
748 asmlinkage long sys_inotify_init(void); 748 asmlinkage long sys_inotify_init(void);
749 asmlinkage long sys_inotify_init1(int flags); 749 asmlinkage long sys_inotify_init1(int flags);
750 asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, 750 asmlinkage long sys_inotify_add_watch(int fd, const char __user *path,
751 u32 mask); 751 u32 mask);
752 asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd); 752 asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd);
753 753
754 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, 754 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
755 __u32 __user *ustatus); 755 __u32 __user *ustatus);
756 asmlinkage long sys_spu_create(const char __user *name, 756 asmlinkage long sys_spu_create(const char __user *name,
757 unsigned int flags, umode_t mode, int fd); 757 unsigned int flags, umode_t mode, int fd);
758 758
759 asmlinkage long sys_mknodat(int dfd, const char __user * filename, umode_t mode, 759 asmlinkage long sys_mknodat(int dfd, const char __user * filename, umode_t mode,
760 unsigned dev); 760 unsigned dev);
761 asmlinkage long sys_mkdirat(int dfd, const char __user * pathname, umode_t mode); 761 asmlinkage long sys_mkdirat(int dfd, const char __user * pathname, umode_t mode);
762 asmlinkage long sys_unlinkat(int dfd, const char __user * pathname, int flag); 762 asmlinkage long sys_unlinkat(int dfd, const char __user * pathname, int flag);
763 asmlinkage long sys_symlinkat(const char __user * oldname, 763 asmlinkage long sys_symlinkat(const char __user * oldname,
764 int newdfd, const char __user * newname); 764 int newdfd, const char __user * newname);
765 asmlinkage long sys_linkat(int olddfd, const char __user *oldname, 765 asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
766 int newdfd, const char __user *newname, int flags); 766 int newdfd, const char __user *newname, int flags);
767 asmlinkage long sys_renameat(int olddfd, const char __user * oldname, 767 asmlinkage long sys_renameat(int olddfd, const char __user * oldname,
768 int newdfd, const char __user * newname); 768 int newdfd, const char __user * newname);
769 asmlinkage long sys_futimesat(int dfd, const char __user *filename, 769 asmlinkage long sys_futimesat(int dfd, const char __user *filename,
770 struct timeval __user *utimes); 770 struct timeval __user *utimes);
771 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode); 771 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
772 asmlinkage long sys_fchmodat(int dfd, const char __user * filename, 772 asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
773 umode_t mode); 773 umode_t mode);
774 asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, 774 asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
775 gid_t group, int flag); 775 gid_t group, int flag);
776 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, 776 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
777 umode_t mode); 777 umode_t mode);
778 asmlinkage long sys_newfstatat(int dfd, const char __user *filename, 778 asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
779 struct stat __user *statbuf, int flag); 779 struct stat __user *statbuf, int flag);
780 asmlinkage long sys_fstatat64(int dfd, const char __user *filename, 780 asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
781 struct stat64 __user *statbuf, int flag); 781 struct stat64 __user *statbuf, int flag);
782 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, 782 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
783 int bufsiz); 783 int bufsiz);
784 asmlinkage long sys_utimensat(int dfd, const char __user *filename, 784 asmlinkage long sys_utimensat(int dfd, const char __user *filename,
785 struct timespec __user *utimes, int flags); 785 struct timespec __user *utimes, int flags);
786 asmlinkage long sys_unshare(unsigned long unshare_flags); 786 asmlinkage long sys_unshare(unsigned long unshare_flags);
787 787
788 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 788 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
789 int fd_out, loff_t __user *off_out, 789 int fd_out, loff_t __user *off_out,
790 size_t len, unsigned int flags); 790 size_t len, unsigned int flags);
791 791
792 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 792 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
793 unsigned long nr_segs, unsigned int flags); 793 unsigned long nr_segs, unsigned int flags);
794 794
795 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags); 795 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
796 796
797 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, 797 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
798 unsigned int flags); 798 unsigned int flags);
799 asmlinkage long sys_sync_file_range2(int fd, unsigned int flags, 799 asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
800 loff_t offset, loff_t nbytes); 800 loff_t offset, loff_t nbytes);
801 asmlinkage long sys_get_robust_list(int pid, 801 asmlinkage long sys_get_robust_list(int pid,
802 struct robust_list_head __user * __user *head_ptr, 802 struct robust_list_head __user * __user *head_ptr,
803 size_t __user *len_ptr); 803 size_t __user *len_ptr);
804 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, 804 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
805 size_t len); 805 size_t len);
806 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache); 806 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
807 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask); 807 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask);
808 asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags); 808 asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags);
809 asmlinkage long sys_timerfd_create(int clockid, int flags); 809 asmlinkage long sys_timerfd_create(int clockid, int flags);
810 asmlinkage long sys_timerfd_settime(int ufd, int flags, 810 asmlinkage long sys_timerfd_settime(int ufd, int flags,
811 const struct itimerspec __user *utmr, 811 const struct itimerspec __user *utmr,
812 struct itimerspec __user *otmr); 812 struct itimerspec __user *otmr);
813 asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); 813 asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
814 asmlinkage long sys_eventfd(unsigned int count); 814 asmlinkage long sys_eventfd(unsigned int count);
815 asmlinkage long sys_eventfd2(unsigned int count, int flags); 815 asmlinkage long sys_eventfd2(unsigned int count, int flags);
816 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); 816 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
817 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); 817 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
818 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, 818 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
819 fd_set __user *, struct timespec __user *, 819 fd_set __user *, struct timespec __user *,
820 void __user *); 820 void __user *);
821 asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, 821 asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
822 struct timespec __user *, const sigset_t __user *, 822 struct timespec __user *, const sigset_t __user *,
823 size_t); 823 size_t);
824 asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags); 824 asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
825 asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, 825 asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
826 u64 mask, int fd, 826 u64 mask, int fd,
827 const char __user *pathname); 827 const char __user *pathname);
828 asmlinkage long sys_syncfs(int fd); 828 asmlinkage long sys_syncfs(int fd);
829 829
830 #ifndef CONFIG_GENERIC_KERNEL_EXECVE 830 #ifndef CONFIG_GENERIC_KERNEL_EXECVE
831 int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]); 831 int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]);
832 #else 832 #else
833 #define kernel_execve(filename, argv, envp) \ 833 #define kernel_execve(filename, argv, envp) \
834 do_execve(filename, \ 834 do_execve(filename, \
835 (const char __user *const __user *)argv, \ 835 (const char __user *const __user *)argv, \
836 (const char __user *const __user *)envp) 836 (const char __user *const __user *)envp)
837 #endif 837 #endif
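When CONFIG_GENERIC_KERNEL_EXECVE is set, the macro above forwards kernel-resident argv/envp arrays to do_execve() by force-casting them to __user pointers; otherwise an arch-provided kernel_execve() is declared. A hedged in-kernel sketch of a caller follows (the helper name and the argument strings are illustrative, not from this commit):

	static int run_helper(void)
	{
		static const char *const argv[] = { "/sbin/modprobe", "-q", "dummy", NULL };
		static const char *const envp[] = { "HOME=/", "PATH=/sbin:/bin", NULL };

		/* Expands to do_execve(...) under CONFIG_GENERIC_KERNEL_EXECVE,
		 * otherwise calls the arch implementation of kernel_execve(). */
		return kernel_execve(argv[0], argv, envp);
	}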
838 838
839 asmlinkage long sys_fork(void); 839 asmlinkage long sys_fork(void);
840 asmlinkage long sys_vfork(void); 840 asmlinkage long sys_vfork(void);
841 #ifdef CONFIG_CLONE_BACKWARDS 841 #ifdef CONFIG_CLONE_BACKWARDS
842 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, 842 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
843 int __user *); 843 int __user *);
844 #else 844 #else
845 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, 845 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
846 int __user *, int); 846 int __user *, int);
847 #endif 847 #endif
848 848
849 asmlinkage long sys_execve(const char __user *filename, 849 asmlinkage long sys_execve(const char __user *filename,
850 const char __user *const __user *argv, 850 const char __user *const __user *argv,
851 const char __user *const __user *envp); 851 const char __user *const __user *envp);
852 852
853 asmlinkage long sys_perf_event_open( 853 asmlinkage long sys_perf_event_open(
854 struct perf_event_attr __user *attr_uptr, 854 struct perf_event_attr __user *attr_uptr,
855 pid_t pid, int cpu, int group_fd, unsigned long flags); 855 pid_t pid, int cpu, int group_fd, unsigned long flags);
856 856
857 asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len, 857 asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
858 unsigned long prot, unsigned long flags, 858 unsigned long prot, unsigned long flags,
859 unsigned long fd, unsigned long pgoff); 859 unsigned long fd, unsigned long pgoff);
860 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); 860 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
861 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, 861 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
862 struct file_handle __user *handle, 862 struct file_handle __user *handle,
863 int __user *mnt_id, int flag); 863 int __user *mnt_id, int flag);
864 asmlinkage long sys_open_by_handle_at(int mountdirfd, 864 asmlinkage long sys_open_by_handle_at(int mountdirfd,
865 struct file_handle __user *handle, 865 struct file_handle __user *handle,
866 int flags); 866 int flags);
867 asmlinkage long sys_setns(int fd, int nstype); 867 asmlinkage long sys_setns(int fd, int nstype);
868 asmlinkage long sys_process_vm_readv(pid_t pid, 868 asmlinkage long sys_process_vm_readv(pid_t pid,
869 const struct iovec __user *lvec, 869 const struct iovec __user *lvec,
870 unsigned long liovcnt, 870 unsigned long liovcnt,
871 const struct iovec __user *rvec, 871 const struct iovec __user *rvec,
872 unsigned long riovcnt, 872 unsigned long riovcnt,
873 unsigned long flags); 873 unsigned long flags);
874 asmlinkage long sys_process_vm_writev(pid_t pid, 874 asmlinkage long sys_process_vm_writev(pid_t pid,
875 const struct iovec __user *lvec, 875 const struct iovec __user *lvec,
876 unsigned long liovcnt, 876 unsigned long liovcnt,
877 const struct iovec __user *rvec, 877 const struct iovec __user *rvec,
878 unsigned long riovcnt, 878 unsigned long riovcnt,
879 unsigned long flags); 879 unsigned long flags);
880 880
881 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, 881 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
882 unsigned long idx1, unsigned long idx2); 882 unsigned long idx1, unsigned long idx2);
883 #endif 883 #endif
884 884
kernel/trace/ftrace.c
1 /* 1 /*
2 * Infrastructure for profiling code inserted by 'gcc -pg'. 2 * Infrastructure for profiling code inserted by 'gcc -pg'.
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
6 * 6 *
7 * Originally ported from the -rt patch by: 7 * Originally ported from the -rt patch by:
8 * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> 8 * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
9 * 9 *
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 Nadia Yvette Chambers 13 * Copyright (C) 2004 Nadia Yvette Chambers
14 */ 14 */
15 15
16 #include <linux/stop_machine.h> 16 #include <linux/stop_machine.h>
17 #include <linux/clocksource.h> 17 #include <linux/clocksource.h>
18 #include <linux/kallsyms.h> 18 #include <linux/kallsyms.h>
19 #include <linux/seq_file.h> 19 #include <linux/seq_file.h>
20 #include <linux/suspend.h> 20 #include <linux/suspend.h>
21 #include <linux/debugfs.h> 21 #include <linux/debugfs.h>
22 #include <linux/hardirq.h> 22 #include <linux/hardirq.h>
23 #include <linux/kthread.h> 23 #include <linux/kthread.h>
24 #include <linux/uaccess.h> 24 #include <linux/uaccess.h>
25 #include <linux/bsearch.h> 25 #include <linux/bsearch.h>
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/ftrace.h> 27 #include <linux/ftrace.h>
28 #include <linux/sysctl.h> 28 #include <linux/sysctl.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/ctype.h> 30 #include <linux/ctype.h>
31 #include <linux/sort.h> 31 #include <linux/sort.h>
32 #include <linux/list.h> 32 #include <linux/list.h>
33 #include <linux/hash.h> 33 #include <linux/hash.h>
34 #include <linux/rcupdate.h> 34 #include <linux/rcupdate.h>
35 35
36 #include <trace/events/sched.h> 36 #include <trace/events/sched.h>
37 37
38 #include <asm/setup.h> 38 #include <asm/setup.h>
39 39
40 #include "trace_output.h" 40 #include "trace_output.h"
41 #include "trace_stat.h" 41 #include "trace_stat.h"
42 42
43 #define FTRACE_WARN_ON(cond) \ 43 #define FTRACE_WARN_ON(cond) \
44 ({ \ 44 ({ \
45 int ___r = cond; \ 45 int ___r = cond; \
46 if (WARN_ON(___r)) \ 46 if (WARN_ON(___r)) \
47 ftrace_kill(); \ 47 ftrace_kill(); \
48 ___r; \ 48 ___r; \
49 }) 49 })
50 50
51 #define FTRACE_WARN_ON_ONCE(cond) \ 51 #define FTRACE_WARN_ON_ONCE(cond) \
52 ({ \ 52 ({ \
53 int ___r = cond; \ 53 int ___r = cond; \
54 if (WARN_ON_ONCE(___r)) \ 54 if (WARN_ON_ONCE(___r)) \
55 ftrace_kill(); \ 55 ftrace_kill(); \
56 ___r; \ 56 ___r; \
57 }) 57 })
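Both macros above are GNU statement expressions: they evaluate the condition once, shut tracing down via ftrace_kill() if the warning fires, and still yield the condition's value, so callers can branch on them directly. A hedged usage sketch (the checking function is hypothetical):

	static int check_rec(struct dyn_ftrace *rec)
	{
		/* WARN_ON_ONCE fires and ftrace_kill() runs inside the macro;
		 * the condition's value is still returned for the branch. */
		if (FTRACE_WARN_ON_ONCE(!rec))
			return -EINVAL;
		return 0;
	}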
58 58
59 /* hash bits for specific function selection */ 59 /* hash bits for specific function selection */
60 #define FTRACE_HASH_BITS 7 60 #define FTRACE_HASH_BITS 7
61 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 61 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
62 #define FTRACE_HASH_DEFAULT_BITS 10 62 #define FTRACE_HASH_DEFAULT_BITS 10
63 #define FTRACE_HASH_MAX_BITS 12 63 #define FTRACE_HASH_MAX_BITS 12
64 64
65 #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) 65 #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66 66
67 static struct ftrace_ops ftrace_list_end __read_mostly = { 67 static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub, 68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 69 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
70 }; 70 };
71 71
72 /* ftrace_enabled is a method to turn ftrace on or off */ 72 /* ftrace_enabled is a method to turn ftrace on or off */
73 int ftrace_enabled __read_mostly; 73 int ftrace_enabled __read_mostly;
74 static int last_ftrace_enabled; 74 static int last_ftrace_enabled;
75 75
76 /* Quick disabling of function tracer. */ 76 /* Quick disabling of function tracer. */
77 int function_trace_stop __read_mostly; 77 int function_trace_stop __read_mostly;
78 78
79 /* Current function tracing op */ 79 /* Current function tracing op */
80 struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; 80 struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
81 81
82 /* List for set_ftrace_pid's pids. */ 82 /* List for set_ftrace_pid's pids. */
83 LIST_HEAD(ftrace_pids); 83 LIST_HEAD(ftrace_pids);
84 struct ftrace_pid { 84 struct ftrace_pid {
85 struct list_head list; 85 struct list_head list;
86 struct pid *pid; 86 struct pid *pid;
87 }; 87 };
88 88
89 /* 89 /*
90 * ftrace_disabled is set when an anomaly is discovered. 90 * ftrace_disabled is set when an anomaly is discovered.
91 * ftrace_disabled is much stronger than ftrace_enabled. 91 * ftrace_disabled is much stronger than ftrace_enabled.
92 */ 92 */
93 static int ftrace_disabled __read_mostly; 93 static int ftrace_disabled __read_mostly;
94 94
95 static DEFINE_MUTEX(ftrace_lock); 95 static DEFINE_MUTEX(ftrace_lock);
96 96
97 static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 97 static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
98 static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 98 static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
99 static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 99 static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
100 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 100 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
101 ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 101 ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
102 static struct ftrace_ops global_ops; 102 static struct ftrace_ops global_ops;
103 static struct ftrace_ops control_ops; 103 static struct ftrace_ops control_ops;
104 104
105 #if ARCH_SUPPORTS_FTRACE_OPS 105 #if ARCH_SUPPORTS_FTRACE_OPS
106 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 106 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
107 struct ftrace_ops *op, struct pt_regs *regs); 107 struct ftrace_ops *op, struct pt_regs *regs);
108 #else 108 #else
109 /* See comment below, where ftrace_ops_list_func is defined */ 109 /* See comment below, where ftrace_ops_list_func is defined */
110 static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); 110 static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111 #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) 111 #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112 #endif 112 #endif
113 113
114 /** 114 /**
115 * ftrace_nr_registered_ops - return number of ops registered 115 * ftrace_nr_registered_ops - return number of ops registered
116 * 116 *
117 * Returns the number of ftrace_ops registered and tracing functions 117 * Returns the number of ftrace_ops registered and tracing functions
118 */ 118 */
119 int ftrace_nr_registered_ops(void) 119 int ftrace_nr_registered_ops(void)
120 { 120 {
121 struct ftrace_ops *ops; 121 struct ftrace_ops *ops;
122 int cnt = 0; 122 int cnt = 0;
123 123
124 mutex_lock(&ftrace_lock); 124 mutex_lock(&ftrace_lock);
125 125
126 for (ops = ftrace_ops_list; 126 for (ops = ftrace_ops_list;
127 ops != &ftrace_list_end; ops = ops->next) 127 ops != &ftrace_list_end; ops = ops->next)
128 cnt++; 128 cnt++;
129 129
130 mutex_unlock(&ftrace_lock); 130 mutex_unlock(&ftrace_lock);
131 131
132 return cnt; 132 return cnt;
133 } 133 }
134 134
135 /* 135 /*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we 136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
137 * can use rcu_dereference_raw() is that elements removed from this list 137 * can use rcu_dereference_raw() is that elements removed from this list
138 * are simply leaked, so there is no need to interact with a grace-period 138 * are simply leaked, so there is no need to interact with a grace-period
139 * mechanism. The rcu_dereference_raw() calls are needed to handle 139 * mechanism. The rcu_dereference_raw() calls are needed to handle
140 * concurrent insertions into the ftrace_global_list. 140 * concurrent insertions into the ftrace_global_list.
141 * 141 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations! 142 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */ 143 */
144 static void 144 static void
145 ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 145 ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs) 146 struct ftrace_ops *op, struct pt_regs *regs)
147 { 147 {
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
149 return; 149 return;
150 150
151 trace_recursion_set(TRACE_GLOBAL_BIT); 151 trace_recursion_set(TRACE_GLOBAL_BIT);
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/ 152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) { 153 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 154 op->func(ip, parent_ip, op, regs);
155 op = rcu_dereference_raw(op->next); /*see above*/ 155 op = rcu_dereference_raw(op->next); /*see above*/
156 }; 156 };
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 157 trace_recursion_clear(TRACE_GLOBAL_BIT);
158 } 158 }
159 159
160 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 160 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
161 struct ftrace_ops *op, struct pt_regs *regs) 161 struct ftrace_ops *op, struct pt_regs *regs)
162 { 162 {
163 if (!test_tsk_trace_trace(current)) 163 if (!test_tsk_trace_trace(current))
164 return; 164 return;
165 165
166 ftrace_pid_function(ip, parent_ip, op, regs); 166 ftrace_pid_function(ip, parent_ip, op, regs);
167 } 167 }
168 168
169 static void set_ftrace_pid_function(ftrace_func_t func) 169 static void set_ftrace_pid_function(ftrace_func_t func)
170 { 170 {
171 /* do not set ftrace_pid_function to itself! */ 171 /* do not set ftrace_pid_function to itself! */
172 if (func != ftrace_pid_func) 172 if (func != ftrace_pid_func)
173 ftrace_pid_function = func; 173 ftrace_pid_function = func;
174 } 174 }
175 175
176 /** 176 /**
177 * clear_ftrace_function - reset the ftrace function 177 * clear_ftrace_function - reset the ftrace function
178 * 178 *
179 * This NULLs the ftrace function and in essence stops 179 * This NULLs the ftrace function and in essence stops
180 * tracing. There may be a lag before all CPUs see the stub. 180 * tracing. There may be a lag before all CPUs see the stub.
181 */ 181 */
182 void clear_ftrace_function(void) 182 void clear_ftrace_function(void)
183 { 183 {
184 ftrace_trace_function = ftrace_stub; 184 ftrace_trace_function = ftrace_stub;
185 ftrace_pid_function = ftrace_stub; 185 ftrace_pid_function = ftrace_stub;
186 } 186 }
187 187
188 static void control_ops_disable_all(struct ftrace_ops *ops) 188 static void control_ops_disable_all(struct ftrace_ops *ops)
189 { 189 {
190 int cpu; 190 int cpu;
191 191
192 for_each_possible_cpu(cpu) 192 for_each_possible_cpu(cpu)
193 *per_cpu_ptr(ops->disabled, cpu) = 1; 193 *per_cpu_ptr(ops->disabled, cpu) = 1;
194 } 194 }
195 195
196 static int control_ops_alloc(struct ftrace_ops *ops) 196 static int control_ops_alloc(struct ftrace_ops *ops)
197 { 197 {
198 int __percpu *disabled; 198 int __percpu *disabled;
199 199
200 disabled = alloc_percpu(int); 200 disabled = alloc_percpu(int);
201 if (!disabled) 201 if (!disabled)
202 return -ENOMEM; 202 return -ENOMEM;
203 203
204 ops->disabled = disabled; 204 ops->disabled = disabled;
205 control_ops_disable_all(ops); 205 control_ops_disable_all(ops);
206 return 0; 206 return 0;
207 } 207 }
208 208
209 static void control_ops_free(struct ftrace_ops *ops) 209 static void control_ops_free(struct ftrace_ops *ops)
210 { 210 {
211 free_percpu(ops->disabled); 211 free_percpu(ops->disabled);
212 } 212 }
213 213
214 static void update_global_ops(void) 214 static void update_global_ops(void)
215 { 215 {
216 ftrace_func_t func; 216 ftrace_func_t func;
217 217
218 /* 218 /*
219 * If there's only one function registered, then call that 219 * If there's only one function registered, then call that
220 * function directly. Otherwise, we need to iterate over the 220 * function directly. Otherwise, we need to iterate over the
221 * registered callers. 221 * registered callers.
222 */ 222 */
223 if (ftrace_global_list == &ftrace_list_end || 223 if (ftrace_global_list == &ftrace_list_end ||
224 ftrace_global_list->next == &ftrace_list_end) 224 ftrace_global_list->next == &ftrace_list_end)
225 func = ftrace_global_list->func; 225 func = ftrace_global_list->func;
226 else 226 else
227 func = ftrace_global_list_func; 227 func = ftrace_global_list_func;
228 228
229 /* If we filter on pids, update to use the pid function */ 229 /* If we filter on pids, update to use the pid function */
230 if (!list_empty(&ftrace_pids)) { 230 if (!list_empty(&ftrace_pids)) {
231 set_ftrace_pid_function(func); 231 set_ftrace_pid_function(func);
232 func = ftrace_pid_func; 232 func = ftrace_pid_func;
233 } 233 }
234 234
235 global_ops.func = func; 235 global_ops.func = func;
236 } 236 }
237 237
238 static void update_ftrace_function(void) 238 static void update_ftrace_function(void)
239 { 239 {
240 ftrace_func_t func; 240 ftrace_func_t func;
241 241
242 update_global_ops(); 242 update_global_ops();
243 243
244 /* 244 /*
245 * If we are at the end of the list and this ops is 245 * If we are at the end of the list and this ops is
246 * recursion safe and not dynamic and the arch supports passing ops, 246 * recursion safe and not dynamic and the arch supports passing ops,
247 * then have the mcount trampoline call the function directly. 247 * then have the mcount trampoline call the function directly.
248 */ 248 */
249 if (ftrace_ops_list == &ftrace_list_end || 249 if (ftrace_ops_list == &ftrace_list_end ||
250 (ftrace_ops_list->next == &ftrace_list_end && 250 (ftrace_ops_list->next == &ftrace_list_end &&
251 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && 251 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
252 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && 252 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
253 !FTRACE_FORCE_LIST_FUNC)) { 253 !FTRACE_FORCE_LIST_FUNC)) {
254 /* Set the ftrace_ops that the arch callback uses */ 254 /* Set the ftrace_ops that the arch callback uses */
255 if (ftrace_ops_list == &global_ops) 255 if (ftrace_ops_list == &global_ops)
256 function_trace_op = ftrace_global_list; 256 function_trace_op = ftrace_global_list;
257 else 257 else
258 function_trace_op = ftrace_ops_list; 258 function_trace_op = ftrace_ops_list;
259 func = ftrace_ops_list->func; 259 func = ftrace_ops_list->func;
260 } else { 260 } else {
261 /* Just use the default ftrace_ops */ 261 /* Just use the default ftrace_ops */
262 function_trace_op = &ftrace_list_end; 262 function_trace_op = &ftrace_list_end;
263 func = ftrace_ops_list_func; 263 func = ftrace_ops_list_func;
264 } 264 }
265 265
266 ftrace_trace_function = func; 266 ftrace_trace_function = func;
267 } 267 }
268 268
269 static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 269 static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
270 { 270 {
271 ops->next = *list; 271 ops->next = *list;
272 /* 272 /*
273 * We are entering ops into the list but another 273 * We are entering ops into the list but another
274 * CPU might be walking that list. We need to make sure 274 * CPU might be walking that list. We need to make sure
275 * the ops->next pointer is valid before another CPU sees 275 * the ops->next pointer is valid before another CPU sees
276 * the ops pointer included into the list. 276 * the ops pointer included into the list.
277 */ 277 */
278 rcu_assign_pointer(*list, ops); 278 rcu_assign_pointer(*list, ops);
279 } 279 }
280 280
281 static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 281 static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
282 { 282 {
283 struct ftrace_ops **p; 283 struct ftrace_ops **p;
284 284
285 /* 285 /*
286 * If we are removing the last function, then simply point 286 * If we are removing the last function, then simply point
287 * to the ftrace_stub. 287 * to the ftrace_stub.
288 */ 288 */
289 if (*list == ops && ops->next == &ftrace_list_end) { 289 if (*list == ops && ops->next == &ftrace_list_end) {
290 *list = &ftrace_list_end; 290 *list = &ftrace_list_end;
291 return 0; 291 return 0;
292 } 292 }
293 293
294 for (p = list; *p != &ftrace_list_end; p = &(*p)->next) 294 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
295 if (*p == ops) 295 if (*p == ops)
296 break; 296 break;
297 297
298 if (*p != ops) 298 if (*p != ops)
299 return -1; 299 return -1;
300 300
301 *p = (*p)->next; 301 *p = (*p)->next;
302 return 0; 302 return 0;
303 } 303 }
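The search-and-unlink loop above is the indirect-pointer idiom: p walks the link fields themselves (first the list head, then each ->next), so the matching node is spliced out with a single store and no "prev" bookkeeping. A standalone sketch with illustrative names:

        struct node { struct node *next; };

        static int list_unlink(struct node **head, struct node *victim)
        {
                struct node **p;

                for (p = head; *p; p = &(*p)->next) {
                        if (*p == victim) {
                                *p = victim->next;      /* one store splices it out */
                                return 0;
                        }
                }
                return -1;                              /* not found */
        }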
304 304
305 static void add_ftrace_list_ops(struct ftrace_ops **list, 305 static void add_ftrace_list_ops(struct ftrace_ops **list,
306 struct ftrace_ops *main_ops, 306 struct ftrace_ops *main_ops,
307 struct ftrace_ops *ops) 307 struct ftrace_ops *ops)
308 { 308 {
309 int first = *list == &ftrace_list_end; 309 int first = *list == &ftrace_list_end;
310 add_ftrace_ops(list, ops); 310 add_ftrace_ops(list, ops);
311 if (first) 311 if (first)
312 add_ftrace_ops(&ftrace_ops_list, main_ops); 312 add_ftrace_ops(&ftrace_ops_list, main_ops);
313 } 313 }
314 314
315 static int remove_ftrace_list_ops(struct ftrace_ops **list, 315 static int remove_ftrace_list_ops(struct ftrace_ops **list,
316 struct ftrace_ops *main_ops, 316 struct ftrace_ops *main_ops,
317 struct ftrace_ops *ops) 317 struct ftrace_ops *ops)
318 { 318 {
319 int ret = remove_ftrace_ops(list, ops); 319 int ret = remove_ftrace_ops(list, ops);
320 if (!ret && *list == &ftrace_list_end) 320 if (!ret && *list == &ftrace_list_end)
321 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); 321 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
322 return ret; 322 return ret;
323 } 323 }
324 324
325 static int __register_ftrace_function(struct ftrace_ops *ops) 325 static int __register_ftrace_function(struct ftrace_ops *ops)
326 { 326 {
327 if (unlikely(ftrace_disabled)) 327 if (unlikely(ftrace_disabled))
328 return -ENODEV; 328 return -ENODEV;
329 329
330 if (FTRACE_WARN_ON(ops == &global_ops)) 330 if (FTRACE_WARN_ON(ops == &global_ops))
331 return -EINVAL; 331 return -EINVAL;
332 332
333 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 333 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
334 return -EBUSY; 334 return -EBUSY;
335 335
336 /* We don't support both control and global flags set. */ 336 /* We don't support both control and global flags set. */
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL; 338 return -EINVAL;
339 339
340 #ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS 340 #ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
341 /* 341 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
344 * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. 344 * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
345 */ 345 */
346 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && 346 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
347 !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) 347 !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
348 return -EINVAL; 348 return -EINVAL;
349 349
350 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) 350 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
351 ops->flags |= FTRACE_OPS_FL_SAVE_REGS; 351 ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
352 #endif 352 #endif
353 353
354 if (!core_kernel_data((unsigned long)ops)) 354 if (!core_kernel_data((unsigned long)ops))
355 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 355 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
356 356
357 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 357 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
358 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); 358 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
359 ops->flags |= FTRACE_OPS_FL_ENABLED; 359 ops->flags |= FTRACE_OPS_FL_ENABLED;
360 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 360 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
361 if (control_ops_alloc(ops)) 361 if (control_ops_alloc(ops))
362 return -ENOMEM; 362 return -ENOMEM;
363 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 363 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
364 } else 364 } else
365 add_ftrace_ops(&ftrace_ops_list, ops); 365 add_ftrace_ops(&ftrace_ops_list, ops);
366 366
367 if (ftrace_enabled) 367 if (ftrace_enabled)
368 update_ftrace_function(); 368 update_ftrace_function();
369 369
370 return 0; 370 return 0;
371 } 371 }
372 372
373 static int __unregister_ftrace_function(struct ftrace_ops *ops) 373 static int __unregister_ftrace_function(struct ftrace_ops *ops)
374 { 374 {
375 int ret; 375 int ret;
376 376
377 if (ftrace_disabled) 377 if (ftrace_disabled)
378 return -ENODEV; 378 return -ENODEV;
379 379
380 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 380 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
381 return -EBUSY; 381 return -EBUSY;
382 382
383 if (FTRACE_WARN_ON(ops == &global_ops)) 383 if (FTRACE_WARN_ON(ops == &global_ops))
384 return -EINVAL; 384 return -EINVAL;
385 385
386 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 386 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
387 ret = remove_ftrace_list_ops(&ftrace_global_list, 387 ret = remove_ftrace_list_ops(&ftrace_global_list,
388 &global_ops, ops); 388 &global_ops, ops);
389 if (!ret) 389 if (!ret)
390 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 390 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
391 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 391 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
392 ret = remove_ftrace_list_ops(&ftrace_control_list, 392 ret = remove_ftrace_list_ops(&ftrace_control_list,
393 &control_ops, ops); 393 &control_ops, ops);
394 if (!ret) { 394 if (!ret) {
395 /* 395 /*
396 * The ftrace_ops is now removed from the list, 396 * The ftrace_ops is now removed from the list,
397 * so there'll be no new users. We must ensure 397 * so there'll be no new users. We must ensure
398 * all current users are done before we free 398 * all current users are done before we free
399 * the control data. 399 * the control data.
400 */ 400 */
401 synchronize_sched(); 401 synchronize_sched();
402 control_ops_free(ops); 402 control_ops_free(ops);
403 } 403 }
404 } else 404 } else
405 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 405 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
406 406
407 if (ret < 0) 407 if (ret < 0)
408 return ret; 408 return ret;
409 409
410 if (ftrace_enabled) 410 if (ftrace_enabled)
411 update_ftrace_function(); 411 update_ftrace_function();
412 412
413 /* 413 /*
414 * Dynamic ops may be freed; we must make sure that all 414 * Dynamic ops may be freed; we must make sure that all
415 * callers are done before leaving this function. 415 * callers are done before leaving this function.
416 */ 416 */
417 if (ops->flags & FTRACE_OPS_FL_DYNAMIC) 417 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
418 synchronize_sched(); 418 synchronize_sched();
419 419
420 return 0; 420 return 0;
421 } 421 }
422 422
423 static void ftrace_update_pid_func(void) 423 static void ftrace_update_pid_func(void)
424 { 424 {
425 /* Only do something if we are tracing something */ 425 /* Only do something if we are tracing something */
426 if (ftrace_trace_function == ftrace_stub) 426 if (ftrace_trace_function == ftrace_stub)
427 return; 427 return;
428 428
429 update_ftrace_function(); 429 update_ftrace_function();
430 } 430 }
431 431
432 #ifdef CONFIG_FUNCTION_PROFILER 432 #ifdef CONFIG_FUNCTION_PROFILER
433 struct ftrace_profile { 433 struct ftrace_profile {
434 struct hlist_node node; 434 struct hlist_node node;
435 unsigned long ip; 435 unsigned long ip;
436 unsigned long counter; 436 unsigned long counter;
437 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 437 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
438 unsigned long long time; 438 unsigned long long time;
439 unsigned long long time_squared; 439 unsigned long long time_squared;
440 #endif 440 #endif
441 }; 441 };
442 442
443 struct ftrace_profile_page { 443 struct ftrace_profile_page {
444 struct ftrace_profile_page *next; 444 struct ftrace_profile_page *next;
445 unsigned long index; 445 unsigned long index;
446 struct ftrace_profile records[]; 446 struct ftrace_profile records[];
447 }; 447 };
448 448
449 struct ftrace_profile_stat { 449 struct ftrace_profile_stat {
450 atomic_t disabled; 450 atomic_t disabled;
451 struct hlist_head *hash; 451 struct hlist_head *hash;
452 struct ftrace_profile_page *pages; 452 struct ftrace_profile_page *pages;
453 struct ftrace_profile_page *start; 453 struct ftrace_profile_page *start;
454 struct tracer_stat stat; 454 struct tracer_stat stat;
455 }; 455 };
456 456
457 #define PROFILE_RECORDS_SIZE \ 457 #define PROFILE_RECORDS_SIZE \
458 (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) 458 (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
459 459
460 #define PROFILES_PER_PAGE \ 460 #define PROFILES_PER_PAGE \
461 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) 461 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
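To make the sizing concrete, assuming 4 KiB pages on a 64-bit build with CONFIG_FUNCTION_GRAPH_TRACER (struct ftrace_profile is then 48 bytes: a 16-byte hlist_node plus four 8-byte fields, and the page header, next plus index, is 16 bytes):

        PROFILE_RECORDS_SIZE = 4096 - 16 = 4080        /* bytes left for records[]  */
        PROFILES_PER_PAGE    = 4080 / 48 =   85        /* whole records in one page */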
462 462
463 static int ftrace_profile_bits __read_mostly; 463 static int ftrace_profile_bits __read_mostly;
464 static int ftrace_profile_enabled __read_mostly; 464 static int ftrace_profile_enabled __read_mostly;
465 465
466 /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ 466 /* ftrace_profile_lock - synchronize the enable and disable of the profiler */
467 static DEFINE_MUTEX(ftrace_profile_lock); 467 static DEFINE_MUTEX(ftrace_profile_lock);
468 468
469 static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); 469 static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
470 470
471 #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ 471 #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
472 472
473 static void * 473 static void *
474 function_stat_next(void *v, int idx) 474 function_stat_next(void *v, int idx)
475 { 475 {
476 struct ftrace_profile *rec = v; 476 struct ftrace_profile *rec = v;
477 struct ftrace_profile_page *pg; 477 struct ftrace_profile_page *pg;
478 478
479 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 479 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
480 480
481 again: 481 again:
482 if (idx != 0) 482 if (idx != 0)
483 rec++; 483 rec++;
484 484
485 if ((void *)rec >= (void *)&pg->records[pg->index]) { 485 if ((void *)rec >= (void *)&pg->records[pg->index]) {
486 pg = pg->next; 486 pg = pg->next;
487 if (!pg) 487 if (!pg)
488 return NULL; 488 return NULL;
489 rec = &pg->records[0]; 489 rec = &pg->records[0];
490 if (!rec->counter) 490 if (!rec->counter)
491 goto again; 491 goto again;
492 } 492 }
493 493
494 return rec; 494 return rec;
495 } 495 }
496 496
497 static void *function_stat_start(struct tracer_stat *trace) 497 static void *function_stat_start(struct tracer_stat *trace)
498 { 498 {
499 struct ftrace_profile_stat *stat = 499 struct ftrace_profile_stat *stat =
500 container_of(trace, struct ftrace_profile_stat, stat); 500 container_of(trace, struct ftrace_profile_stat, stat);
501 501
502 if (!stat || !stat->start) 502 if (!stat || !stat->start)
503 return NULL; 503 return NULL;
504 504
505 return function_stat_next(&stat->start->records[0], 0); 505 return function_stat_next(&stat->start->records[0], 0);
506 } 506 }
507 507
508 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 508 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
509 /* function graph compares on total time */ 509 /* function graph compares on total time */
510 static int function_stat_cmp(void *p1, void *p2) 510 static int function_stat_cmp(void *p1, void *p2)
511 { 511 {
512 struct ftrace_profile *a = p1; 512 struct ftrace_profile *a = p1;
513 struct ftrace_profile *b = p2; 513 struct ftrace_profile *b = p2;
514 514
515 if (a->time < b->time) 515 if (a->time < b->time)
516 return -1; 516 return -1;
517 if (a->time > b->time) 517 if (a->time > b->time)
518 return 1; 518 return 1;
519 else 519 else
520 return 0; 520 return 0;
521 } 521 }
522 #else 522 #else
523 /* without function graph, compare against hits */ 523 /* without function graph, compare against hits */
524 static int function_stat_cmp(void *p1, void *p2) 524 static int function_stat_cmp(void *p1, void *p2)
525 { 525 {
526 struct ftrace_profile *a = p1; 526 struct ftrace_profile *a = p1;
527 struct ftrace_profile *b = p2; 527 struct ftrace_profile *b = p2;
528 528
529 if (a->counter < b->counter) 529 if (a->counter < b->counter)
530 return -1; 530 return -1;
531 if (a->counter > b->counter) 531 if (a->counter > b->counter)
532 return 1; 532 return 1;
533 else 533 else
534 return 0; 534 return 0;
535 } 535 }
536 #endif 536 #endif
537 537
538 static int function_stat_headers(struct seq_file *m) 538 static int function_stat_headers(struct seq_file *m)
539 { 539 {
540 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 540 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
541 seq_printf(m, " Function " 541 seq_printf(m, " Function "
542 "Hit Time Avg s^2\n" 542 "Hit Time Avg s^2\n"
543 " -------- " 543 " -------- "
544 "--- ---- --- ---\n"); 544 "--- ---- --- ---\n");
545 #else 545 #else
546 seq_printf(m, " Function Hit\n" 546 seq_printf(m, " Function Hit\n"
547 " -------- ---\n"); 547 " -------- ---\n");
548 #endif 548 #endif
549 return 0; 549 return 0;
550 } 550 }
551 551
552 static int function_stat_show(struct seq_file *m, void *v) 552 static int function_stat_show(struct seq_file *m, void *v)
553 { 553 {
554 struct ftrace_profile *rec = v; 554 struct ftrace_profile *rec = v;
555 char str[KSYM_SYMBOL_LEN]; 555 char str[KSYM_SYMBOL_LEN];
556 int ret = 0; 556 int ret = 0;
557 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 557 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
558 static struct trace_seq s; 558 static struct trace_seq s;
559 unsigned long long avg; 559 unsigned long long avg;
560 unsigned long long stddev; 560 unsigned long long stddev;
561 #endif 561 #endif
562 mutex_lock(&ftrace_profile_lock); 562 mutex_lock(&ftrace_profile_lock);
563 563
564 /* we raced with function_profile_reset() */ 564 /* we raced with function_profile_reset() */
565 if (unlikely(rec->counter == 0)) { 565 if (unlikely(rec->counter == 0)) {
566 ret = -EBUSY; 566 ret = -EBUSY;
567 goto out; 567 goto out;
568 } 568 }
569 569
570 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 570 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
571 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 571 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
572 572
573 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 573 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
574 seq_printf(m, " "); 574 seq_printf(m, " ");
575 avg = rec->time; 575 avg = rec->time;
576 do_div(avg, rec->counter); 576 do_div(avg, rec->counter);
577 577
578 /* Sample variance (s^2) */ 578 /* Sample variance (s^2) */
579 if (rec->counter <= 1) 579 if (rec->counter <= 1)
580 stddev = 0; 580 stddev = 0;
581 else { 581 else {
582 stddev = rec->time_squared - rec->counter * avg * avg; 582 stddev = rec->time_squared - rec->counter * avg * avg;
583 /* 583 /*
584 * Divide by only 1000 here for the ns^2 -> us^2 conversion; 584 * Divide by only 1000 here for the ns^2 -> us^2 conversion;
585 * trace_print_graph_duration will divide by 1000 again. 585 * trace_print_graph_duration will divide by 1000 again.
586 */ 586 */
587 do_div(stddev, (rec->counter - 1) * 1000); 587 do_div(stddev, (rec->counter - 1) * 1000);
588 } 588 }
589 589
590 trace_seq_init(&s); 590 trace_seq_init(&s);
591 trace_print_graph_duration(rec->time, &s); 591 trace_print_graph_duration(rec->time, &s);
592 trace_seq_puts(&s, " "); 592 trace_seq_puts(&s, " ");
593 trace_print_graph_duration(avg, &s); 593 trace_print_graph_duration(avg, &s);
594 trace_seq_puts(&s, " "); 594 trace_seq_puts(&s, " ");
595 trace_print_graph_duration(stddev, &s); 595 trace_print_graph_duration(stddev, &s);
596 trace_print_seq(m, &s); 596 trace_print_seq(m, &s);
597 #endif 597 #endif
598 seq_putc(m, '\n'); 598 seq_putc(m, '\n');
599 out: 599 out:
600 mutex_unlock(&ftrace_profile_lock); 600 mutex_unlock(&ftrace_profile_lock);
601 601
602 return ret; 602 return ret;
603 } 603 }
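For reference, the "stddev" computed above is the single-pass sample variance (up to the integer truncation of avg):

        s^2 = (sum(t_i^2) - n * avg^2) / (n - 1)       /* in ns^2 */

The extra factor of 1000 in the do_div() pre-scales the result so that trace_print_graph_duration()'s own division by 1000 completes the ns^2 -> us^2 conversion.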
604 604
605 static void ftrace_profile_reset(struct ftrace_profile_stat *stat) 605 static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
606 { 606 {
607 struct ftrace_profile_page *pg; 607 struct ftrace_profile_page *pg;
608 608
609 pg = stat->pages = stat->start; 609 pg = stat->pages = stat->start;
610 610
611 while (pg) { 611 while (pg) {
612 memset(pg->records, 0, PROFILE_RECORDS_SIZE); 612 memset(pg->records, 0, PROFILE_RECORDS_SIZE);
613 pg->index = 0; 613 pg->index = 0;
614 pg = pg->next; 614 pg = pg->next;
615 } 615 }
616 616
617 memset(stat->hash, 0, 617 memset(stat->hash, 0,
618 FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); 618 FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
619 } 619 }
620 620
621 int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) 621 int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
622 { 622 {
623 struct ftrace_profile_page *pg; 623 struct ftrace_profile_page *pg;
624 int functions; 624 int functions;
625 int pages; 625 int pages;
626 int i; 626 int i;
627 627
628 /* If we already allocated, do nothing */ 628 /* If we already allocated, do nothing */
629 if (stat->pages) 629 if (stat->pages)
630 return 0; 630 return 0;
631 631
632 stat->pages = (void *)get_zeroed_page(GFP_KERNEL); 632 stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
633 if (!stat->pages) 633 if (!stat->pages)
634 return -ENOMEM; 634 return -ENOMEM;
635 635
636 #ifdef CONFIG_DYNAMIC_FTRACE 636 #ifdef CONFIG_DYNAMIC_FTRACE
637 functions = ftrace_update_tot_cnt; 637 functions = ftrace_update_tot_cnt;
638 #else 638 #else
639 /* 639 /*
640 * We do not know the number of functions that exist because 640 * We do not know the number of functions that exist because
641 * dynamic tracing is what counts them. From past experience 641 * dynamic tracing is what counts them. From past experience
642 * we see around 20K functions. That should be more than enough. 642 * we see around 20K functions. That should be more than enough.
643 * It is highly unlikely we will execute every function in 643 * It is highly unlikely we will execute every function in
644 * the kernel. 644 * the kernel.
645 */ 645 */
646 functions = 20000; 646 functions = 20000;
647 #endif 647 #endif
648 648
649 pg = stat->start = stat->pages; 649 pg = stat->start = stat->pages;
650 650
651 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); 651 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
652 652
653 for (i = 0; i < pages; i++) { 653 for (i = 0; i < pages; i++) {
654 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 654 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
655 if (!pg->next) 655 if (!pg->next)
656 goto out_free; 656 goto out_free;
657 pg = pg->next; 657 pg = pg->next;
658 } 658 }
659 659
660 return 0; 660 return 0;
661 661
662 out_free: 662 out_free:
663 pg = stat->start; 663 pg = stat->start;
664 while (pg) { 664 while (pg) {
665 unsigned long tmp = (unsigned long)pg; 665 unsigned long tmp = (unsigned long)pg;
666 666
667 pg = pg->next; 667 pg = pg->next;
668 free_page(tmp); 668 free_page(tmp);
669 } 669 }
670 670
671 free_page((unsigned long)stat->pages); 671 free_page((unsigned long)stat->pages);
672 stat->pages = NULL; 672 stat->pages = NULL;
673 stat->start = NULL; 673 stat->start = NULL;
674 674
675 return -ENOMEM; 675 return -ENOMEM;
676 } 676 }
677 677
678 static int ftrace_profile_init_cpu(int cpu) 678 static int ftrace_profile_init_cpu(int cpu)
679 { 679 {
680 struct ftrace_profile_stat *stat; 680 struct ftrace_profile_stat *stat;
681 int size; 681 int size;
682 682
683 stat = &per_cpu(ftrace_profile_stats, cpu); 683 stat = &per_cpu(ftrace_profile_stats, cpu);
684 684
685 if (stat->hash) { 685 if (stat->hash) {
686 /* If the profile is already created, simply reset it */ 686 /* If the profile is already created, simply reset it */
687 ftrace_profile_reset(stat); 687 ftrace_profile_reset(stat);
688 return 0; 688 return 0;
689 } 689 }
690 690
691 /* 691 /*
692 * We are profiling all functions, but usually only a few thousand 692 * We are profiling all functions, but usually only a few thousand
693 * functions are hit. We'll make a hash of 1024 items. 693 * functions are hit. We'll make a hash of 1024 items.
694 */ 694 */
695 size = FTRACE_PROFILE_HASH_SIZE; 695 size = FTRACE_PROFILE_HASH_SIZE;
696 696
697 stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); 697 stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
698 698
699 if (!stat->hash) 699 if (!stat->hash)
700 return -ENOMEM; 700 return -ENOMEM;
701 701
702 if (!ftrace_profile_bits) { 702 if (!ftrace_profile_bits) {
703 size--; 703 size--;
704 704
705 for (; size; size >>= 1) 705 for (; size; size >>= 1)
706 ftrace_profile_bits++; 706 ftrace_profile_bits++;
707 } 707 }
708 708
709 /* Preallocate the function profiling pages */ 709 /* Preallocate the function profiling pages */
710 if (ftrace_profile_pages_init(stat) < 0) { 710 if (ftrace_profile_pages_init(stat) < 0) {
711 kfree(stat->hash); 711 kfree(stat->hash);
712 stat->hash = NULL; 712 stat->hash = NULL;
713 return -ENOMEM; 713 return -ENOMEM;
714 } 714 }
715 715
716 return 0; 716 return 0;
717 } 717 }
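The decrement-and-shift loop above is just a hand-rolled ilog2() of the power-of-two table size. With FTRACE_PROFILE_HASH_SIZE = 1024:

        size = 1024; size--;    /* 1023 == 0b1111111111                      */
                                /* ten non-zero shifts: 1023, 511, ..., 3, 1 */
                                /* so ftrace_profile_bits ends up as 10      */

hash_long(ip, 10) then folds an instruction pointer into one of the 2^10 buckets.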
718 718
719 static int ftrace_profile_init(void) 719 static int ftrace_profile_init(void)
720 { 720 {
721 int cpu; 721 int cpu;
722 int ret = 0; 722 int ret = 0;
723 723
724 for_each_online_cpu(cpu) { 724 for_each_online_cpu(cpu) {
725 ret = ftrace_profile_init_cpu(cpu); 725 ret = ftrace_profile_init_cpu(cpu);
726 if (ret) 726 if (ret)
727 break; 727 break;
728 } 728 }
729 729
730 return ret; 730 return ret;
731 } 731 }
732 732
733 /* interrupts must be disabled */ 733 /* interrupts must be disabled */
734 static struct ftrace_profile * 734 static struct ftrace_profile *
735 ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) 735 ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
736 { 736 {
737 struct ftrace_profile *rec; 737 struct ftrace_profile *rec;
738 struct hlist_head *hhd; 738 struct hlist_head *hhd;
739 struct hlist_node *n; 739 struct hlist_node *n;
740 unsigned long key; 740 unsigned long key;
741 741
742 key = hash_long(ip, ftrace_profile_bits); 742 key = hash_long(ip, ftrace_profile_bits);
743 hhd = &stat->hash[key]; 743 hhd = &stat->hash[key];
744 744
745 if (hlist_empty(hhd)) 745 if (hlist_empty(hhd))
746 return NULL; 746 return NULL;
747 747
748 hlist_for_each_entry_rcu(rec, n, hhd, node) { 748 hlist_for_each_entry_rcu(rec, n, hhd, node) {
749 if (rec->ip == ip) 749 if (rec->ip == ip)
750 return rec; 750 return rec;
751 } 751 }
752 752
753 return NULL; 753 return NULL;
754 } 754 }
755 755
756 static void ftrace_add_profile(struct ftrace_profile_stat *stat, 756 static void ftrace_add_profile(struct ftrace_profile_stat *stat,
757 struct ftrace_profile *rec) 757 struct ftrace_profile *rec)
758 { 758 {
759 unsigned long key; 759 unsigned long key;
760 760
761 key = hash_long(rec->ip, ftrace_profile_bits); 761 key = hash_long(rec->ip, ftrace_profile_bits);
762 hlist_add_head_rcu(&rec->node, &stat->hash[key]); 762 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
763 } 763 }
764 764
765 /* 765 /*
766 * The memory is already allocated; this simply finds a new record to use. 766 * The memory is already allocated; this simply finds a new record to use.
767 */ 767 */
768 static struct ftrace_profile * 768 static struct ftrace_profile *
769 ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) 769 ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
770 { 770 {
771 struct ftrace_profile *rec = NULL; 771 struct ftrace_profile *rec = NULL;
772 772
773 /* prevent recursion (from NMIs) */ 773 /* prevent recursion (from NMIs) */
774 if (atomic_inc_return(&stat->disabled) != 1) 774 if (atomic_inc_return(&stat->disabled) != 1)
775 goto out; 775 goto out;
776 776
777 /* 777 /*
778 * Try to find the function again since an NMI 778 * Try to find the function again since an NMI
779 * could have added it 779 * could have added it
780 */ 780 */
781 rec = ftrace_find_profiled_func(stat, ip); 781 rec = ftrace_find_profiled_func(stat, ip);
782 if (rec) 782 if (rec)
783 goto out; 783 goto out;
784 784
785 if (stat->pages->index == PROFILES_PER_PAGE) { 785 if (stat->pages->index == PROFILES_PER_PAGE) {
786 if (!stat->pages->next) 786 if (!stat->pages->next)
787 goto out; 787 goto out;
788 stat->pages = stat->pages->next; 788 stat->pages = stat->pages->next;
789 } 789 }
790 790
791 rec = &stat->pages->records[stat->pages->index++]; 791 rec = &stat->pages->records[stat->pages->index++];
792 rec->ip = ip; 792 rec->ip = ip;
793 ftrace_add_profile(stat, rec); 793 ftrace_add_profile(stat, rec);
794 794
795 out: 795 out:
796 atomic_dec(&stat->disabled); 796 atomic_dec(&stat->disabled);
797 797
798 return rec; 798 return rec;
799 } 799 }
800 800
801 static void 801 static void
802 function_profile_call(unsigned long ip, unsigned long parent_ip, 802 function_profile_call(unsigned long ip, unsigned long parent_ip,
803 struct ftrace_ops *ops, struct pt_regs *regs) 803 struct ftrace_ops *ops, struct pt_regs *regs)
804 { 804 {
805 struct ftrace_profile_stat *stat; 805 struct ftrace_profile_stat *stat;
806 struct ftrace_profile *rec; 806 struct ftrace_profile *rec;
807 unsigned long flags; 807 unsigned long flags;
808 808
809 if (!ftrace_profile_enabled) 809 if (!ftrace_profile_enabled)
810 return; 810 return;
811 811
812 local_irq_save(flags); 812 local_irq_save(flags);
813 813
814 stat = &__get_cpu_var(ftrace_profile_stats); 814 stat = &__get_cpu_var(ftrace_profile_stats);
815 if (!stat->hash || !ftrace_profile_enabled) 815 if (!stat->hash || !ftrace_profile_enabled)
816 goto out; 816 goto out;
817 817
818 rec = ftrace_find_profiled_func(stat, ip); 818 rec = ftrace_find_profiled_func(stat, ip);
819 if (!rec) { 819 if (!rec) {
820 rec = ftrace_profile_alloc(stat, ip); 820 rec = ftrace_profile_alloc(stat, ip);
821 if (!rec) 821 if (!rec)
822 goto out; 822 goto out;
823 } 823 }
824 824
825 rec->counter++; 825 rec->counter++;
826 out: 826 out:
827 local_irq_restore(flags); 827 local_irq_restore(flags);
828 } 828 }
829 829
830 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 830 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
831 static int profile_graph_entry(struct ftrace_graph_ent *trace) 831 static int profile_graph_entry(struct ftrace_graph_ent *trace)
832 { 832 {
833 function_profile_call(trace->func, 0, NULL, NULL); 833 function_profile_call(trace->func, 0, NULL, NULL);
834 return 1; 834 return 1;
835 } 835 }
836 836
837 static void profile_graph_return(struct ftrace_graph_ret *trace) 837 static void profile_graph_return(struct ftrace_graph_ret *trace)
838 { 838 {
839 struct ftrace_profile_stat *stat; 839 struct ftrace_profile_stat *stat;
840 unsigned long long calltime; 840 unsigned long long calltime;
841 struct ftrace_profile *rec; 841 struct ftrace_profile *rec;
842 unsigned long flags; 842 unsigned long flags;
843 843
844 local_irq_save(flags); 844 local_irq_save(flags);
845 stat = &__get_cpu_var(ftrace_profile_stats); 845 stat = &__get_cpu_var(ftrace_profile_stats);
846 if (!stat->hash || !ftrace_profile_enabled) 846 if (!stat->hash || !ftrace_profile_enabled)
847 goto out; 847 goto out;
848 848
849 /* If the calltime was zero'd, ignore it */ 849 /* If the calltime was zero'd, ignore it */
850 if (!trace->calltime) 850 if (!trace->calltime)
851 goto out; 851 goto out;
852 852
853 calltime = trace->rettime - trace->calltime; 853 calltime = trace->rettime - trace->calltime;
854 854
855 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 855 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
856 int index; 856 int index;
857 857
858 index = trace->depth; 858 index = trace->depth;
859 859
860 /* Append this call time to the parent time to subtract */ 860 /* Append this call time to the parent time to subtract */
861 if (index) 861 if (index)
862 current->ret_stack[index - 1].subtime += calltime; 862 current->ret_stack[index - 1].subtime += calltime;
863 863
864 if (current->ret_stack[index].subtime < calltime) 864 if (current->ret_stack[index].subtime < calltime)
865 calltime -= current->ret_stack[index].subtime; 865 calltime -= current->ret_stack[index].subtime;
866 else 866 else
867 calltime = 0; 867 calltime = 0;
868 } 868 }
869 869
870 rec = ftrace_find_profiled_func(stat, trace->func); 870 rec = ftrace_find_profiled_func(stat, trace->func);
871 if (rec) { 871 if (rec) {
872 rec->time += calltime; 872 rec->time += calltime;
873 rec->time_squared += calltime * calltime; 873 rec->time_squared += calltime * calltime;
874 } 874 }
875 875
876 out: 876 out:
877 local_irq_restore(flags); 877 local_irq_restore(flags);
878 } 878 }
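A worked example of the subtime bookkeeping above, with made-up numbers (graph time disabled, i.e. TRACE_ITER_GRAPH_TIME clear):

        /* P calls C:
         *   C returns: rettime - calltime = 4000ns
         *              ret_stack[depth of P].subtime += 4000ns
         *   P returns: rettime - calltime = 10000ns
         *              10000ns - 4000ns subtime = 6000ns of self time for P
         */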
879 879
880 static int register_ftrace_profiler(void) 880 static int register_ftrace_profiler(void)
881 { 881 {
882 return register_ftrace_graph(&profile_graph_return, 882 return register_ftrace_graph(&profile_graph_return,
883 &profile_graph_entry); 883 &profile_graph_entry);
884 } 884 }
885 885
886 static void unregister_ftrace_profiler(void) 886 static void unregister_ftrace_profiler(void)
887 { 887 {
888 unregister_ftrace_graph(); 888 unregister_ftrace_graph();
889 } 889 }
890 #else 890 #else
891 static struct ftrace_ops ftrace_profile_ops __read_mostly = { 891 static struct ftrace_ops ftrace_profile_ops __read_mostly = {
892 .func = function_profile_call, 892 .func = function_profile_call,
893 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 893 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
894 }; 894 };
895 895
896 static int register_ftrace_profiler(void) 896 static int register_ftrace_profiler(void)
897 { 897 {
898 return register_ftrace_function(&ftrace_profile_ops); 898 return register_ftrace_function(&ftrace_profile_ops);
899 } 899 }
900 900
901 static void unregister_ftrace_profiler(void) 901 static void unregister_ftrace_profiler(void)
902 { 902 {
903 unregister_ftrace_function(&ftrace_profile_ops); 903 unregister_ftrace_function(&ftrace_profile_ops);
904 } 904 }
905 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 905 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
906 906
907 static ssize_t 907 static ssize_t
908 ftrace_profile_write(struct file *filp, const char __user *ubuf, 908 ftrace_profile_write(struct file *filp, const char __user *ubuf,
909 size_t cnt, loff_t *ppos) 909 size_t cnt, loff_t *ppos)
910 { 910 {
911 unsigned long val; 911 unsigned long val;
912 int ret; 912 int ret;
913 913
914 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 914 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
915 if (ret) 915 if (ret)
916 return ret; 916 return ret;
917 917
918 val = !!val; 918 val = !!val;
919 919
920 mutex_lock(&ftrace_profile_lock); 920 mutex_lock(&ftrace_profile_lock);
921 if (ftrace_profile_enabled ^ val) { 921 if (ftrace_profile_enabled ^ val) {
922 if (val) { 922 if (val) {
923 ret = ftrace_profile_init(); 923 ret = ftrace_profile_init();
924 if (ret < 0) { 924 if (ret < 0) {
925 cnt = ret; 925 cnt = ret;
926 goto out; 926 goto out;
927 } 927 }
928 928
929 ret = register_ftrace_profiler(); 929 ret = register_ftrace_profiler();
930 if (ret < 0) { 930 if (ret < 0) {
931 cnt = ret; 931 cnt = ret;
932 goto out; 932 goto out;
933 } 933 }
934 ftrace_profile_enabled = 1; 934 ftrace_profile_enabled = 1;
935 } else { 935 } else {
936 ftrace_profile_enabled = 0; 936 ftrace_profile_enabled = 0;
937 /* 937 /*
938 * unregister_ftrace_profiler calls stop_machine 938 * unregister_ftrace_profiler calls stop_machine
939 * so this acts like a synchronize_sched(). 939 * so this acts like a synchronize_sched().
940 */ 940 */
941 unregister_ftrace_profiler(); 941 unregister_ftrace_profiler();
942 } 942 }
943 } 943 }
944 out: 944 out:
945 mutex_unlock(&ftrace_profile_lock); 945 mutex_unlock(&ftrace_profile_lock);
946 946
947 *ppos += cnt; 947 *ppos += cnt;
948 948
949 return cnt; 949 return cnt;
950 } 950 }
951 951
952 static ssize_t 952 static ssize_t
953 ftrace_profile_read(struct file *filp, char __user *ubuf, 953 ftrace_profile_read(struct file *filp, char __user *ubuf,
954 size_t cnt, loff_t *ppos) 954 size_t cnt, loff_t *ppos)
955 { 955 {
956 char buf[64]; /* big enough to hold a number */ 956 char buf[64]; /* big enough to hold a number */
957 int r; 957 int r;
958 958
959 r = sprintf(buf, "%u\n", ftrace_profile_enabled); 959 r = sprintf(buf, "%u\n", ftrace_profile_enabled);
960 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 960 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
961 } 961 }
962 962
963 static const struct file_operations ftrace_profile_fops = { 963 static const struct file_operations ftrace_profile_fops = {
964 .open = tracing_open_generic, 964 .open = tracing_open_generic,
965 .read = ftrace_profile_read, 965 .read = ftrace_profile_read,
966 .write = ftrace_profile_write, 966 .write = ftrace_profile_write,
967 .llseek = default_llseek, 967 .llseek = default_llseek,
968 }; 968 };
969 969
970 /* used to initialize the real stat files */ 970 /* used to initialize the real stat files */
971 static struct tracer_stat function_stats __initdata = { 971 static struct tracer_stat function_stats __initdata = {
972 .name = "functions", 972 .name = "functions",
973 .stat_start = function_stat_start, 973 .stat_start = function_stat_start,
974 .stat_next = function_stat_next, 974 .stat_next = function_stat_next,
975 .stat_cmp = function_stat_cmp, 975 .stat_cmp = function_stat_cmp,
976 .stat_headers = function_stat_headers, 976 .stat_headers = function_stat_headers,
977 .stat_show = function_stat_show 977 .stat_show = function_stat_show
978 }; 978 };
979 979
980 static __init void ftrace_profile_debugfs(struct dentry *d_tracer) 980 static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
981 { 981 {
982 struct ftrace_profile_stat *stat; 982 struct ftrace_profile_stat *stat;
983 struct dentry *entry; 983 struct dentry *entry;
984 char *name; 984 char *name;
985 int ret; 985 int ret;
986 int cpu; 986 int cpu;
987 987
988 for_each_possible_cpu(cpu) { 988 for_each_possible_cpu(cpu) {
989 stat = &per_cpu(ftrace_profile_stats, cpu); 989 stat = &per_cpu(ftrace_profile_stats, cpu);
990 990
991 /* allocate enough for function name + cpu number */ 991 /* allocate enough for function name + cpu number */
992 name = kmalloc(32, GFP_KERNEL); 992 name = kmalloc(32, GFP_KERNEL);
993 if (!name) { 993 if (!name) {
994 /* 994 /*
995 * The files created are permanent; if something goes wrong 995 * The files created are permanent; if something goes wrong
996 * here, we still do not free the memory. 996 * here, we still do not free the memory.
997 */ 997 */
998 WARN(1, 998 WARN(1,
999 "Could not allocate stat file for cpu %d\n", 999 "Could not allocate stat file for cpu %d\n",
1000 cpu); 1000 cpu);
1001 return; 1001 return;
1002 } 1002 }
1003 stat->stat = function_stats; 1003 stat->stat = function_stats;
1004 snprintf(name, 32, "function%d", cpu); 1004 snprintf(name, 32, "function%d", cpu);
1005 stat->stat.name = name; 1005 stat->stat.name = name;
1006 ret = register_stat_tracer(&stat->stat); 1006 ret = register_stat_tracer(&stat->stat);
1007 if (ret) { 1007 if (ret) {
1008 WARN(1, 1008 WARN(1,
1009 "Could not register function stat for cpu %d\n", 1009 "Could not register function stat for cpu %d\n",
1010 cpu); 1010 cpu);
1011 kfree(name); 1011 kfree(name);
1012 return; 1012 return;
1013 } 1013 }
1014 } 1014 }
1015 1015
1016 entry = debugfs_create_file("function_profile_enabled", 0644, 1016 entry = debugfs_create_file("function_profile_enabled", 0644,
1017 d_tracer, NULL, &ftrace_profile_fops); 1017 d_tracer, NULL, &ftrace_profile_fops);
1018 if (!entry) 1018 if (!entry)
1019 pr_warning("Could not create debugfs " 1019 pr_warning("Could not create debugfs "
1020 "'function_profile_enabled' entry\n"); 1020 "'function_profile_enabled' entry\n");
1021 } 1021 }
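Putting the pieces together, a typical session (assuming debugfs is mounted at the usual /sys/kernel/debug) looks like:

        # echo 1 > /sys/kernel/debug/tracing/function_profile_enabled
        # cat /sys/kernel/debug/tracing/trace_stat/function0    # per-cpu stats
        # echo 0 > /sys/kernel/debug/tracing/function_profile_enabled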
1022 1022
1023 #else /* CONFIG_FUNCTION_PROFILER */ 1023 #else /* CONFIG_FUNCTION_PROFILER */
1024 static __init void ftrace_profile_debugfs(struct dentry *d_tracer) 1024 static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1025 { 1025 {
1026 } 1026 }
1027 #endif /* CONFIG_FUNCTION_PROFILER */ 1027 #endif /* CONFIG_FUNCTION_PROFILER */
1028 1028
1029 static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1029 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1030 1030
1031 #ifdef CONFIG_DYNAMIC_FTRACE 1031 #ifdef CONFIG_DYNAMIC_FTRACE
1032 1032
1033 #ifndef CONFIG_FTRACE_MCOUNT_RECORD 1033 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
1034 # error Dynamic ftrace depends on MCOUNT_RECORD 1034 # error Dynamic ftrace depends on MCOUNT_RECORD
1035 #endif 1035 #endif
1036 1036
1037 static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; 1037 static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
1038 1038
1039 struct ftrace_func_probe { 1039 struct ftrace_func_probe {
1040 struct hlist_node node; 1040 struct hlist_node node;
1041 struct ftrace_probe_ops *ops; 1041 struct ftrace_probe_ops *ops;
1042 unsigned long flags; 1042 unsigned long flags;
1043 unsigned long ip; 1043 unsigned long ip;
1044 void *data; 1044 void *data;
1045 struct rcu_head rcu; 1045 struct rcu_head rcu;
1046 }; 1046 };
1047 1047
1048 struct ftrace_func_entry { 1048 struct ftrace_func_entry {
1049 struct hlist_node hlist; 1049 struct hlist_node hlist;
1050 unsigned long ip; 1050 unsigned long ip;
1051 }; 1051 };
1052 1052
1053 struct ftrace_hash { 1053 struct ftrace_hash {
1054 unsigned long size_bits; 1054 unsigned long size_bits;
1055 struct hlist_head *buckets; 1055 struct hlist_head *buckets;
1056 unsigned long count; 1056 unsigned long count;
1057 struct rcu_head rcu; 1057 struct rcu_head rcu;
1058 }; 1058 };
1059 1059
1060 /* 1060 /*
1061 * We make these constant because no one should touch them; 1061 * We make these constant because no one should touch them;
1062 * they are used as the default "empty hash" to avoid allocating 1062 * they are used as the default "empty hash" to avoid allocating
1063 * it all the time. They live in a read-only section so that if 1063 * it all the time. They live in a read-only section so that if
1064 * anyone does try to modify them, it will cause an exception. 1064 * anyone does try to modify them, it will cause an exception.
1065 */ 1065 */
1066 static const struct hlist_head empty_buckets[1]; 1066 static const struct hlist_head empty_buckets[1];
1067 static const struct ftrace_hash empty_hash = { 1067 static const struct ftrace_hash empty_hash = {
1068 .buckets = (struct hlist_head *)empty_buckets, 1068 .buckets = (struct hlist_head *)empty_buckets,
1069 }; 1069 };
1070 #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) 1070 #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
1071 1071
1072 static struct ftrace_ops global_ops = { 1072 static struct ftrace_ops global_ops = {
1073 .func = ftrace_stub, 1073 .func = ftrace_stub,
1074 .notrace_hash = EMPTY_HASH, 1074 .notrace_hash = EMPTY_HASH,
1075 .filter_hash = EMPTY_HASH, 1075 .filter_hash = EMPTY_HASH,
1076 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 1076 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
1077 }; 1077 };
1078 1078
1079 static DEFINE_MUTEX(ftrace_regex_lock); 1079 static DEFINE_MUTEX(ftrace_regex_lock);
1080 1080
1081 struct ftrace_page { 1081 struct ftrace_page {
1082 struct ftrace_page *next; 1082 struct ftrace_page *next;
1083 struct dyn_ftrace *records; 1083 struct dyn_ftrace *records;
1084 int index; 1084 int index;
1085 int size; 1085 int size;
1086 }; 1086 };
1087 1087
1088 static struct ftrace_page *ftrace_new_pgs; 1088 static struct ftrace_page *ftrace_new_pgs;
1089 1089
1090 #define ENTRY_SIZE sizeof(struct dyn_ftrace) 1090 #define ENTRY_SIZE sizeof(struct dyn_ftrace)
1091 #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) 1091 #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
1092 1092
1093 /* estimate from running different kernels */ 1093 /* estimate from running different kernels */
1094 #define NR_TO_INIT 10000 1094 #define NR_TO_INIT 10000
1095 1095
1096 static struct ftrace_page *ftrace_pages_start; 1096 static struct ftrace_page *ftrace_pages_start;
1097 static struct ftrace_page *ftrace_pages; 1097 static struct ftrace_page *ftrace_pages;
1098 1098
1099 static bool ftrace_hash_empty(struct ftrace_hash *hash) 1099 static bool ftrace_hash_empty(struct ftrace_hash *hash)
1100 { 1100 {
1101 return !hash || !hash->count; 1101 return !hash || !hash->count;
1102 } 1102 }
1103 1103
1104 static struct ftrace_func_entry * 1104 static struct ftrace_func_entry *
1105 ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) 1105 ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1106 { 1106 {
1107 unsigned long key; 1107 unsigned long key;
1108 struct ftrace_func_entry *entry; 1108 struct ftrace_func_entry *entry;
1109 struct hlist_head *hhd; 1109 struct hlist_head *hhd;
1110 struct hlist_node *n; 1110 struct hlist_node *n;
1111 1111
1112 if (ftrace_hash_empty(hash)) 1112 if (ftrace_hash_empty(hash))
1113 return NULL; 1113 return NULL;
1114 1114
1115 if (hash->size_bits > 0) 1115 if (hash->size_bits > 0)
1116 key = hash_long(ip, hash->size_bits); 1116 key = hash_long(ip, hash->size_bits);
1117 else 1117 else
1118 key = 0; 1118 key = 0;
1119 1119
1120 hhd = &hash->buckets[key]; 1120 hhd = &hash->buckets[key];
1121 1121
1122 hlist_for_each_entry_rcu(entry, n, hhd, hlist) { 1122 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1123 if (entry->ip == ip) 1123 if (entry->ip == ip)
1124 return entry; 1124 return entry;
1125 } 1125 }
1126 return NULL; 1126 return NULL;
1127 } 1127 }
1128 1128
1129 static void __add_hash_entry(struct ftrace_hash *hash, 1129 static void __add_hash_entry(struct ftrace_hash *hash,
1130 struct ftrace_func_entry *entry) 1130 struct ftrace_func_entry *entry)
1131 { 1131 {
1132 struct hlist_head *hhd; 1132 struct hlist_head *hhd;
1133 unsigned long key; 1133 unsigned long key;
1134 1134
1135 if (hash->size_bits) 1135 if (hash->size_bits)
1136 key = hash_long(entry->ip, hash->size_bits); 1136 key = hash_long(entry->ip, hash->size_bits);
1137 else 1137 else
1138 key = 0; 1138 key = 0;
1139 1139
1140 hhd = &hash->buckets[key]; 1140 hhd = &hash->buckets[key];
1141 hlist_add_head(&entry->hlist, hhd); 1141 hlist_add_head(&entry->hlist, hhd);
1142 hash->count++; 1142 hash->count++;
1143 } 1143 }
1144 1144
1145 static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) 1145 static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1146 { 1146 {
1147 struct ftrace_func_entry *entry; 1147 struct ftrace_func_entry *entry;
1148 1148
1149 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 1149 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1150 if (!entry) 1150 if (!entry)
1151 return -ENOMEM; 1151 return -ENOMEM;
1152 1152
1153 entry->ip = ip; 1153 entry->ip = ip;
1154 __add_hash_entry(hash, entry); 1154 __add_hash_entry(hash, entry);
1155 1155
1156 return 0; 1156 return 0;
1157 } 1157 }
1158 1158
1159 static void 1159 static void
1160 free_hash_entry(struct ftrace_hash *hash, 1160 free_hash_entry(struct ftrace_hash *hash,
1161 struct ftrace_func_entry *entry) 1161 struct ftrace_func_entry *entry)
1162 { 1162 {
1163 hlist_del(&entry->hlist); 1163 hlist_del(&entry->hlist);
1164 kfree(entry); 1164 kfree(entry);
1165 hash->count--; 1165 hash->count--;
1166 } 1166 }
1167 1167
1168 static void 1168 static void
1169 remove_hash_entry(struct ftrace_hash *hash, 1169 remove_hash_entry(struct ftrace_hash *hash,
1170 struct ftrace_func_entry *entry) 1170 struct ftrace_func_entry *entry)
1171 { 1171 {
1172 hlist_del(&entry->hlist); 1172 hlist_del(&entry->hlist);
1173 hash->count--; 1173 hash->count--;
1174 } 1174 }
1175 1175
1176 static void ftrace_hash_clear(struct ftrace_hash *hash) 1176 static void ftrace_hash_clear(struct ftrace_hash *hash)
1177 { 1177 {
1178 struct hlist_head *hhd; 1178 struct hlist_head *hhd;
1179 struct hlist_node *tp, *tn; 1179 struct hlist_node *tp, *tn;
1180 struct ftrace_func_entry *entry; 1180 struct ftrace_func_entry *entry;
1181 int size = 1 << hash->size_bits; 1181 int size = 1 << hash->size_bits;
1182 int i; 1182 int i;
1183 1183
1184 if (!hash->count) 1184 if (!hash->count)
1185 return; 1185 return;
1186 1186
1187 for (i = 0; i < size; i++) { 1187 for (i = 0; i < size; i++) {
1188 hhd = &hash->buckets[i]; 1188 hhd = &hash->buckets[i];
1189 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) 1189 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1190 free_hash_entry(hash, entry); 1190 free_hash_entry(hash, entry);
1191 } 1191 }
1192 FTRACE_WARN_ON(hash->count); 1192 FTRACE_WARN_ON(hash->count);
1193 } 1193 }
1194 1194
1195 static void free_ftrace_hash(struct ftrace_hash *hash) 1195 static void free_ftrace_hash(struct ftrace_hash *hash)
1196 { 1196 {
1197 if (!hash || hash == EMPTY_HASH) 1197 if (!hash || hash == EMPTY_HASH)
1198 return; 1198 return;
1199 ftrace_hash_clear(hash); 1199 ftrace_hash_clear(hash);
1200 kfree(hash->buckets); 1200 kfree(hash->buckets);
1201 kfree(hash); 1201 kfree(hash);
1202 } 1202 }
1203 1203
1204 static void __free_ftrace_hash_rcu(struct rcu_head *rcu) 1204 static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1205 { 1205 {
1206 struct ftrace_hash *hash; 1206 struct ftrace_hash *hash;
1207 1207
1208 hash = container_of(rcu, struct ftrace_hash, rcu); 1208 hash = container_of(rcu, struct ftrace_hash, rcu);
1209 free_ftrace_hash(hash); 1209 free_ftrace_hash(hash);
1210 } 1210 }
1211 1211
1212 static void free_ftrace_hash_rcu(struct ftrace_hash *hash) 1212 static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1213 { 1213 {
1214 if (!hash || hash == EMPTY_HASH) 1214 if (!hash || hash == EMPTY_HASH)
1215 return; 1215 return;
1216 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); 1216 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1217 } 1217 }
1218 1218
1219 void ftrace_free_filter(struct ftrace_ops *ops) 1219 void ftrace_free_filter(struct ftrace_ops *ops)
1220 { 1220 {
1221 free_ftrace_hash(ops->filter_hash); 1221 free_ftrace_hash(ops->filter_hash);
1222 free_ftrace_hash(ops->notrace_hash); 1222 free_ftrace_hash(ops->notrace_hash);
1223 } 1223 }
1224 1224
1225 static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1225 static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1226 { 1226 {
1227 struct ftrace_hash *hash; 1227 struct ftrace_hash *hash;
1228 int size; 1228 int size;
1229 1229
1230 hash = kzalloc(sizeof(*hash), GFP_KERNEL); 1230 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1231 if (!hash) 1231 if (!hash)
1232 return NULL; 1232 return NULL;
1233 1233
1234 size = 1 << size_bits; 1234 size = 1 << size_bits;
1235 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); 1235 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
1236 1236
1237 if (!hash->buckets) { 1237 if (!hash->buckets) {
1238 kfree(hash); 1238 kfree(hash);
1239 return NULL; 1239 return NULL;
1240 } 1240 }
1241 1241
1242 hash->size_bits = size_bits; 1242 hash->size_bits = size_bits;
1243 1243
1244 return hash; 1244 return hash;
1245 } 1245 }
1246 1246
1247 static struct ftrace_hash * 1247 static struct ftrace_hash *
1248 alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) 1248 alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1249 { 1249 {
1250 struct ftrace_func_entry *entry; 1250 struct ftrace_func_entry *entry;
1251 struct ftrace_hash *new_hash; 1251 struct ftrace_hash *new_hash;
1252 struct hlist_node *tp; 1252 struct hlist_node *tp;
1253 int size; 1253 int size;
1254 int ret; 1254 int ret;
1255 int i; 1255 int i;
1256 1256
1257 new_hash = alloc_ftrace_hash(size_bits); 1257 new_hash = alloc_ftrace_hash(size_bits);
1258 if (!new_hash) 1258 if (!new_hash)
1259 return NULL; 1259 return NULL;
1260 1260
1261 /* Empty hash? */ 1261 /* Empty hash? */
1262 if (ftrace_hash_empty(hash)) 1262 if (ftrace_hash_empty(hash))
1263 return new_hash; 1263 return new_hash;
1264 1264
1265 size = 1 << hash->size_bits; 1265 size = 1 << hash->size_bits;
1266 for (i = 0; i < size; i++) { 1266 for (i = 0; i < size; i++) {
1267 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { 1267 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1268 ret = add_hash_entry(new_hash, entry->ip); 1268 ret = add_hash_entry(new_hash, entry->ip);
1269 if (ret < 0) 1269 if (ret < 0)
1270 goto free_hash; 1270 goto free_hash;
1271 } 1271 }
1272 } 1272 }
1273 1273
1274 FTRACE_WARN_ON(new_hash->count != hash->count); 1274 FTRACE_WARN_ON(new_hash->count != hash->count);
1275 1275
1276 return new_hash; 1276 return new_hash;
1277 1277
1278 free_hash: 1278 free_hash:
1279 free_ftrace_hash(new_hash); 1279 free_ftrace_hash(new_hash);
1280 return NULL; 1280 return NULL;
1281 } 1281 }
1282 1282
1283 static void 1283 static void
1284 ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); 1284 ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1285 static void 1285 static void
1286 ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); 1286 ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1287 1287
1288 static int 1288 static int
1289 ftrace_hash_move(struct ftrace_ops *ops, int enable, 1289 ftrace_hash_move(struct ftrace_ops *ops, int enable,
1290 struct ftrace_hash **dst, struct ftrace_hash *src) 1290 struct ftrace_hash **dst, struct ftrace_hash *src)
1291 { 1291 {
1292 struct ftrace_func_entry *entry; 1292 struct ftrace_func_entry *entry;
1293 struct hlist_node *tp, *tn; 1293 struct hlist_node *tp, *tn;
1294 struct hlist_head *hhd; 1294 struct hlist_head *hhd;
1295 struct ftrace_hash *old_hash; 1295 struct ftrace_hash *old_hash;
1296 struct ftrace_hash *new_hash; 1296 struct ftrace_hash *new_hash;
1297 unsigned long key; 1297 unsigned long key;
1298 int size = src->count; 1298 int size = src->count;
1299 int bits = 0; 1299 int bits = 0;
1300 int ret; 1300 int ret;
1301 int i; 1301 int i;
1302 1302
1303 /* 1303 /*
1304 * Remove the current set, update the hash and add 1304 * Remove the current set, update the hash and add
1305 * them back. 1305 * them back.
1306 */ 1306 */
1307 ftrace_hash_rec_disable(ops, enable); 1307 ftrace_hash_rec_disable(ops, enable);
1308 1308
1309 /* 1309 /*
1310 * If the new source is empty, just free dst and assign it 1310 * If the new source is empty, just free dst and assign it
1311 * the empty_hash. 1311 * the empty_hash.
1312 */ 1312 */
1313 if (!src->count) { 1313 if (!src->count) {
1314 free_ftrace_hash_rcu(*dst); 1314 free_ftrace_hash_rcu(*dst);
1315 rcu_assign_pointer(*dst, EMPTY_HASH); 1315 rcu_assign_pointer(*dst, EMPTY_HASH);
1316 /* still need to update the function records */ 1316 /* still need to update the function records */
1317 ret = 0; 1317 ret = 0;
1318 goto out; 1318 goto out;
1319 } 1319 }
1320 1320
1321 /* 1321 /*
1322 * Make the hash size about half the number of entries found. 1322 * Make the hash size about half the number of entries found.
1323 */ 1323 */
1324 for (size /= 2; size; size >>= 1) 1324 for (size /= 2; size; size >>= 1)
1325 bits++; 1325 bits++;
1326 1326
1327 /* Don't allocate too much */ 1327 /* Don't allocate too much */
1328 if (bits > FTRACE_HASH_MAX_BITS) 1328 if (bits > FTRACE_HASH_MAX_BITS)
1329 bits = FTRACE_HASH_MAX_BITS; 1329 bits = FTRACE_HASH_MAX_BITS;
1330 1330
1331 ret = -ENOMEM; 1331 ret = -ENOMEM;
1332 new_hash = alloc_ftrace_hash(bits); 1332 new_hash = alloc_ftrace_hash(bits);
1333 if (!new_hash) 1333 if (!new_hash)
1334 goto out; 1334 goto out;
1335 1335
1336 size = 1 << src->size_bits; 1336 size = 1 << src->size_bits;
1337 for (i = 0; i < size; i++) { 1337 for (i = 0; i < size; i++) {
1338 hhd = &src->buckets[i]; 1338 hhd = &src->buckets[i];
1339 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { 1339 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1340 if (bits > 0) 1340 if (bits > 0)
1341 key = hash_long(entry->ip, bits); 1341 key = hash_long(entry->ip, bits);
1342 else 1342 else
1343 key = 0; 1343 key = 0;
1344 remove_hash_entry(src, entry); 1344 remove_hash_entry(src, entry);
1345 __add_hash_entry(new_hash, entry); 1345 __add_hash_entry(new_hash, entry);
1346 } 1346 }
1347 } 1347 }
1348 1348
1349 old_hash = *dst; 1349 old_hash = *dst;
1350 rcu_assign_pointer(*dst, new_hash); 1350 rcu_assign_pointer(*dst, new_hash);
1351 free_ftrace_hash_rcu(old_hash); 1351 free_ftrace_hash_rcu(old_hash);
1352 1352
1353 ret = 0; 1353 ret = 0;
1354 out: 1354 out:
1355 /* 1355 /*
1356 * Enable regardless of ret: 1356 * Enable regardless of ret:
1357 * On success, we enable the new hash. 1357 * On success, we enable the new hash.
1358 * On failure, we re-enable the original hash. 1358 * On failure, we re-enable the original hash.
1359 */ 1359 */
1360 ftrace_hash_rec_enable(ops, enable); 1360 ftrace_hash_rec_enable(ops, enable);
1361 1361
1362 return ret; 1362 return ret;
1363 } 1363 }
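
The sizing loop above ("Make the hash size about 1/2 the # found") computes bits as roughly log2(count / 2), capped by FTRACE_HASH_MAX_BITS, so the rebuilt hash gets about half as many buckets as it has entries. A minimal stand-alone sketch of that arithmetic (user-space C; the 12-bit cap is an assumption standing in for the kernel's FTRACE_HASH_MAX_BITS):

        #include <stdio.h>

        #define MAX_BITS 12     /* assumed stand-in for FTRACE_HASH_MAX_BITS */

        /* Mirror of the sizing loop in ftrace_hash_move() */
        static int hash_bits(int count)
        {
                int size, bits = 0;

                for (size = count / 2; size; size >>= 1)
                        bits++;
                return bits > MAX_BITS ? MAX_BITS : bits;
        }

        int main(void)
        {
                int counts[] = { 1, 8, 100, 100000 };
                unsigned int i;

                for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
                        printf("count=%6d -> bits=%d (%d buckets)\n", counts[i],
                               hash_bits(counts[i]), 1 << hash_bits(counts[i]));
                return 0;
        }

For count=100 this yields bits=6, i.e. 64 buckets for 100 entries; very large counts saturate at the cap.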
1364 1364
1365 /* 1365 /*
1366 * Test the hashes for this ops to see if we want to call 1366 * Test the hashes for this ops to see if we want to call
1367 * the ops->func or not. 1367 * the ops->func or not.
1368 * 1368 *
1369 * It's a match if the ip is in the ops->filter_hash or 1369 * It's a match if the ip is in the ops->filter_hash or
1370 * the filter_hash does not exist or is empty, 1370 * the filter_hash does not exist or is empty,
1371 * AND 1371 * AND
1372 * the ip is not in the ops->notrace_hash. 1372 * the ip is not in the ops->notrace_hash.
1373 * 1373 *
1374 * This needs to be called with preemption disabled as 1374 * This needs to be called with preemption disabled as
1375 * the hashes are freed with call_rcu_sched(). 1375 * the hashes are freed with call_rcu_sched().
1376 */ 1376 */
1377 static int 1377 static int
1378 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 1378 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1379 { 1379 {
1380 struct ftrace_hash *filter_hash; 1380 struct ftrace_hash *filter_hash;
1381 struct ftrace_hash *notrace_hash; 1381 struct ftrace_hash *notrace_hash;
1382 int ret; 1382 int ret;
1383 1383
1384 filter_hash = rcu_dereference_raw(ops->filter_hash); 1384 filter_hash = rcu_dereference_raw(ops->filter_hash);
1385 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1385 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1386 1386
1387 if ((ftrace_hash_empty(filter_hash) || 1387 if ((ftrace_hash_empty(filter_hash) ||
1388 ftrace_lookup_ip(filter_hash, ip)) && 1388 ftrace_lookup_ip(filter_hash, ip)) &&
1389 (ftrace_hash_empty(notrace_hash) || 1389 (ftrace_hash_empty(notrace_hash) ||
1390 !ftrace_lookup_ip(notrace_hash, ip))) 1390 !ftrace_lookup_ip(notrace_hash, ip)))
1391 ret = 1; 1391 ret = 1;
1392 else 1392 else
1393 ret = 0; 1393 ret = 0;
1394 1394
1395 return ret; 1395 return ret;
1396 } 1396 }
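
The predicate above reads as: traced by the filter hash (or the filter is empty, which means trace everything), AND not blocked by the notrace hash. A tiny stand-alone model of just that boolean logic, using hypothetical helper names rather than the kernel API:

        #include <stdio.h>
        #include <stdbool.h>

        /* Model of the ftrace_ops_test() predicate over two hash lookups */
        static bool ops_match(bool filter_empty, bool in_filter,
                              bool notrace_empty, bool in_notrace)
        {
                return (filter_empty || in_filter) && (notrace_empty || !in_notrace);
        }

        int main(void)
        {
                /* An empty filter hash traces everything; notrace always wins. */
                printf("%d\n", ops_match(true,  false, true,  false)); /* 1: trace all */
                printf("%d\n", ops_match(false, true,  true,  false)); /* 1: in filter */
                printf("%d\n", ops_match(false, true,  false, true));  /* 0: notrace blocks */
                printf("%d\n", ops_match(false, false, true,  false)); /* 0: not in filter */
                return 0;
        }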
1397 1397
1398 /* 1398 /*
1399  * This is a double for loop. Do not use 'break' to break out of the loop; 1399  * This is a double for loop. Do not use 'break' to break out of the loop;
1400  * you must use a goto. 1400  * you must use a goto.
1401 */ 1401 */
1402 #define do_for_each_ftrace_rec(pg, rec) \ 1402 #define do_for_each_ftrace_rec(pg, rec) \
1403 for (pg = ftrace_pages_start; pg; pg = pg->next) { \ 1403 for (pg = ftrace_pages_start; pg; pg = pg->next) { \
1404 int _____i; \ 1404 int _____i; \
1405 for (_____i = 0; _____i < pg->index; _____i++) { \ 1405 for (_____i = 0; _____i < pg->index; _____i++) { \
1406 rec = &pg->records[_____i]; 1406 rec = &pg->records[_____i];
1407 1407
1408 #define while_for_each_ftrace_rec() \ 1408 #define while_for_each_ftrace_rec() \
1409 } \ 1409 } \
1410 } 1410 }
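
Because the macro pair opens two nested for loops, a 'break' in the body would only leave the inner loop over a single page's records; escaping early requires a goto past while_for_each_ftrace_rec(), which is exactly how __ftrace_hash_rec_update() below bails out. A self-contained toy with mock types showing the same shape (not kernel code):

        #include <stdio.h>

        struct rec { int ip; };
        struct page { struct rec recs[4]; int index; struct page *next; };

        /* Same double-loop shape as do_for_each_ftrace_rec()/while_for_each_ftrace_rec() */
        #define do_for_each_rec(pg, r, start)                   \
                for (pg = (start); pg; pg = pg->next) {         \
                        int _i;                                 \
                        for (_i = 0; _i < pg->index; _i++) {    \
                                r = &pg->recs[_i];

        #define while_for_each_rec()                            \
                        }                                       \
                }

        int main(void)
        {
                struct page p2 = { { {30}, {40} }, 2, NULL };
                struct page p1 = { { {10}, {20} }, 2, &p2 };
                struct page *pg;
                struct rec *r;

                do_for_each_rec(pg, r, &p1) {
                        if (r->ip == 30)
                                goto found;     /* 'break' would only leave the inner loop */
                        printf("visit %d\n", r->ip);
                } while_for_each_rec();
                printf("not found\n");
                return 1;
         found:
                printf("found %d\n", r->ip);
                return 0;
        }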
1411 1411
1412 1412
1413 static int ftrace_cmp_recs(const void *a, const void *b) 1413 static int ftrace_cmp_recs(const void *a, const void *b)
1414 { 1414 {
1415 const struct dyn_ftrace *key = a; 1415 const struct dyn_ftrace *key = a;
1416 const struct dyn_ftrace *rec = b; 1416 const struct dyn_ftrace *rec = b;
1417 1417
1418 if (key->flags < rec->ip) 1418 if (key->flags < rec->ip)
1419 return -1; 1419 return -1;
1420 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) 1420 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
1421 return 1; 1421 return 1;
1422 return 0; 1422 return 0;
1423 } 1423 }
1424 1424
1425 static unsigned long ftrace_location_range(unsigned long start, unsigned long end) 1425 static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
1426 { 1426 {
1427 struct ftrace_page *pg; 1427 struct ftrace_page *pg;
1428 struct dyn_ftrace *rec; 1428 struct dyn_ftrace *rec;
1429 struct dyn_ftrace key; 1429 struct dyn_ftrace key;
1430 1430
1431 key.ip = start; 1431 key.ip = start;
1432 key.flags = end; /* overload flags, as it is unsigned long */ 1432 key.flags = end; /* overload flags, as it is unsigned long */
1433 1433
1434 for (pg = ftrace_pages_start; pg; pg = pg->next) { 1434 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1435 if (end < pg->records[0].ip || 1435 if (end < pg->records[0].ip ||
1436 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) 1436 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
1437 continue; 1437 continue;
1438 rec = bsearch(&key, pg->records, pg->index, 1438 rec = bsearch(&key, pg->records, pg->index,
1439 sizeof(struct dyn_ftrace), 1439 sizeof(struct dyn_ftrace),
1440 ftrace_cmp_recs); 1440 ftrace_cmp_recs);
1441 if (rec) 1441 if (rec)
1442 return rec->ip; 1442 return rec->ip;
1443 } 1443 }
1444 1444
1445 return 0; 1445 return 0;
1446 } 1446 }
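
ftrace_cmp_recs() makes bsearch() answer a range query: the key's flags field is overloaded as the end address, so a record matches when its [ip, ip + MCOUNT_INSN_SIZE) window intersects [start, end]. The same trick, demonstrated with libc bsearch() in user space (the instruction size and addresses here are made up):

        #include <stdio.h>
        #include <stdlib.h>

        #define INSN_SIZE 5     /* stand-in for MCOUNT_INSN_SIZE */

        struct rec { unsigned long ip; unsigned long flags; };

        /* a is the key: ip = start of range, flags overloaded as end of range */
        static int cmp_recs(const void *a, const void *b)
        {
                const struct rec *key = a;
                const struct rec *rec = b;

                if (key->flags < rec->ip)
                        return -1;
                if (key->ip >= rec->ip + INSN_SIZE)
                        return 1;
                return 0;
        }

        int main(void)
        {
                struct rec recs[] = { {100}, {200}, {300} };    /* sorted by ip */
                struct rec key = { .ip = 203, .flags = 203 };   /* single-address probe */
                struct rec *hit;

                hit = bsearch(&key, recs, 3, sizeof(recs[0]), cmp_recs);
                printf("203 -> %lu\n", hit ? hit->ip : 0UL);    /* inside [200,205): prints 200 */

                key.ip = 150; key.flags = 180;                  /* range probe, no overlap */
                hit = bsearch(&key, recs, 3, sizeof(recs[0]), cmp_recs);
                printf("[150,180] -> %lu\n", hit ? hit->ip : 0UL); /* prints 0 */
                return 0;
        }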
1447 1447
1448 /** 1448 /**
1449  * ftrace_location - return true if the ip given is a traced location 1449  * ftrace_location - return true if the ip given is a traced location
1450 * @ip: the instruction pointer to check 1450 * @ip: the instruction pointer to check
1451 * 1451 *
1452  * Returns rec->ip if the @ip given is a pointer to an ftrace location. 1452  * Returns rec->ip if the @ip given is a pointer to an ftrace location.
1453 * That is, the instruction that is either a NOP or call to 1453 * That is, the instruction that is either a NOP or call to
1454 * the function tracer. It checks the ftrace internal tables to 1454 * the function tracer. It checks the ftrace internal tables to
1455 * determine if the address belongs or not. 1455 * determine if the address belongs or not.
1456 */ 1456 */
1457 unsigned long ftrace_location(unsigned long ip) 1457 unsigned long ftrace_location(unsigned long ip)
1458 { 1458 {
1459 return ftrace_location_range(ip, ip); 1459 return ftrace_location_range(ip, ip);
1460 } 1460 }
1461 1461
1462 /** 1462 /**
1463 * ftrace_text_reserved - return true if range contains an ftrace location 1463 * ftrace_text_reserved - return true if range contains an ftrace location
1464 * @start: start of range to search 1464 * @start: start of range to search
1465 * @end: end of range to search (inclusive). @end points to the last byte to check. 1465 * @end: end of range to search (inclusive). @end points to the last byte to check.
1466 * 1466 *
1467  * Returns 1 if the range from @start to @end contains an ftrace location. 1467  * Returns 1 if the range from @start to @end contains an ftrace location.
1468 * That is, the instruction that is either a NOP or call to 1468 * That is, the instruction that is either a NOP or call to
1469 * the function tracer. It checks the ftrace internal tables to 1469 * the function tracer. It checks the ftrace internal tables to
1470 * determine if the address belongs or not. 1470 * determine if the address belongs or not.
1471 */ 1471 */
1472 int ftrace_text_reserved(void *start, void *end) 1472 int ftrace_text_reserved(void *start, void *end)
1473 { 1473 {
1474 unsigned long ret; 1474 unsigned long ret;
1475 1475
1476 ret = ftrace_location_range((unsigned long)start, 1476 ret = ftrace_location_range((unsigned long)start,
1477 (unsigned long)end); 1477 (unsigned long)end);
1478 1478
1479 return (int)!!ret; 1479 return (int)!!ret;
1480 } 1480 }
1481 1481
1482 static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1482 static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1483 int filter_hash, 1483 int filter_hash,
1484 bool inc) 1484 bool inc)
1485 { 1485 {
1486 struct ftrace_hash *hash; 1486 struct ftrace_hash *hash;
1487 struct ftrace_hash *other_hash; 1487 struct ftrace_hash *other_hash;
1488 struct ftrace_page *pg; 1488 struct ftrace_page *pg;
1489 struct dyn_ftrace *rec; 1489 struct dyn_ftrace *rec;
1490 int count = 0; 1490 int count = 0;
1491 int all = 0; 1491 int all = 0;
1492 1492
1493 /* Only update if the ops has been registered */ 1493 /* Only update if the ops has been registered */
1494 if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) 1494 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1495 return; 1495 return;
1496 1496
1497 /* 1497 /*
1498 * In the filter_hash case: 1498 * In the filter_hash case:
1499 * If the count is zero, we update all records. 1499 * If the count is zero, we update all records.
1500 * Otherwise we just update the items in the hash. 1500 * Otherwise we just update the items in the hash.
1501 * 1501 *
1502 * In the notrace_hash case: 1502 * In the notrace_hash case:
1503 * We enable the update in the hash. 1503 * We enable the update in the hash.
1504 * As disabling notrace means enabling the tracing, 1504 * As disabling notrace means enabling the tracing,
1505 * and enabling notrace means disabling, the inc variable 1505 * and enabling notrace means disabling, the inc variable
1506  * gets inverted. 1506  * gets inverted.
1507 */ 1507 */
1508 if (filter_hash) { 1508 if (filter_hash) {
1509 hash = ops->filter_hash; 1509 hash = ops->filter_hash;
1510 other_hash = ops->notrace_hash; 1510 other_hash = ops->notrace_hash;
1511 if (ftrace_hash_empty(hash)) 1511 if (ftrace_hash_empty(hash))
1512 all = 1; 1512 all = 1;
1513 } else { 1513 } else {
1514 inc = !inc; 1514 inc = !inc;
1515 hash = ops->notrace_hash; 1515 hash = ops->notrace_hash;
1516 other_hash = ops->filter_hash; 1516 other_hash = ops->filter_hash;
1517 /* 1517 /*
1518 * If the notrace hash has no items, 1518 * If the notrace hash has no items,
1519 * then there's nothing to do. 1519 * then there's nothing to do.
1520 */ 1520 */
1521 if (ftrace_hash_empty(hash)) 1521 if (ftrace_hash_empty(hash))
1522 return; 1522 return;
1523 } 1523 }
1524 1524
1525 do_for_each_ftrace_rec(pg, rec) { 1525 do_for_each_ftrace_rec(pg, rec) {
1526 int in_other_hash = 0; 1526 int in_other_hash = 0;
1527 int in_hash = 0; 1527 int in_hash = 0;
1528 int match = 0; 1528 int match = 0;
1529 1529
1530 if (all) { 1530 if (all) {
1531 /* 1531 /*
1532 * Only the filter_hash affects all records. 1532 * Only the filter_hash affects all records.
1533 * Update if the record is not in the notrace hash. 1533 * Update if the record is not in the notrace hash.
1534 */ 1534 */
1535 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) 1535 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1536 match = 1; 1536 match = 1;
1537 } else { 1537 } else {
1538 in_hash = !!ftrace_lookup_ip(hash, rec->ip); 1538 in_hash = !!ftrace_lookup_ip(hash, rec->ip);
1539 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); 1539 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1540 1540
1541 /* 1541 /*
1542  * With filter_hash set: match records that are in this hash but not in the other (notrace) hash. Otherwise (a notrace update): match records in this hash that the filter hash also traces, i.e. that are in the other hash or the other hash is empty. 1542  * With filter_hash set: match records that are in this hash but not in the other (notrace) hash. Otherwise (a notrace update): match records in this hash that the filter hash also traces, i.e. that are in the other hash or the other hash is empty.
1543  */ 1543  */
1544 if (filter_hash && in_hash && !in_other_hash) 1544 if (filter_hash && in_hash && !in_other_hash)
1545 match = 1; 1545 match = 1;
1546 else if (!filter_hash && in_hash && 1546 else if (!filter_hash && in_hash &&
1547 (in_other_hash || ftrace_hash_empty(other_hash))) 1547 (in_other_hash || ftrace_hash_empty(other_hash)))
1548 match = 1; 1548 match = 1;
1549 } 1549 }
1550 if (!match) 1550 if (!match)
1551 continue; 1551 continue;
1552 1552
1553 if (inc) { 1553 if (inc) {
1554 rec->flags++; 1554 rec->flags++;
1555 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) 1555 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1556 return; 1556 return;
1557 /* 1557 /*
1558 * If any ops wants regs saved for this function 1558 * If any ops wants regs saved for this function
1559 * then all ops will get saved regs. 1559 * then all ops will get saved regs.
1560 */ 1560 */
1561 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) 1561 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
1562 rec->flags |= FTRACE_FL_REGS; 1562 rec->flags |= FTRACE_FL_REGS;
1563 } else { 1563 } else {
1564 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) 1564 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1565 return; 1565 return;
1566 rec->flags--; 1566 rec->flags--;
1567 } 1567 }
1568 count++; 1568 count++;
1569 /* Shortcut, if we handled all records, we are done. */ 1569 /* Shortcut, if we handled all records, we are done. */
1570 if (!all && count == hash->count) 1570 if (!all && count == hash->count)
1571 return; 1571 return;
1572 } while_for_each_ftrace_rec(); 1572 } while_for_each_ftrace_rec();
1573 } 1573 }
1574 1574
1575 static void ftrace_hash_rec_disable(struct ftrace_ops *ops, 1575 static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1576 int filter_hash) 1576 int filter_hash)
1577 { 1577 {
1578 __ftrace_hash_rec_update(ops, filter_hash, 0); 1578 __ftrace_hash_rec_update(ops, filter_hash, 0);
1579 } 1579 }
1580 1580
1581 static void ftrace_hash_rec_enable(struct ftrace_ops *ops, 1581 static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1582 int filter_hash) 1582 int filter_hash)
1583 { 1583 {
1584 __ftrace_hash_rec_update(ops, filter_hash, 1); 1584 __ftrace_hash_rec_update(ops, filter_hash, 1);
1585 } 1585 }
1586 1586
1587 static void print_ip_ins(const char *fmt, unsigned char *p) 1587 static void print_ip_ins(const char *fmt, unsigned char *p)
1588 { 1588 {
1589 int i; 1589 int i;
1590 1590
1591 printk(KERN_CONT "%s", fmt); 1591 printk(KERN_CONT "%s", fmt);
1592 1592
1593 for (i = 0; i < MCOUNT_INSN_SIZE; i++) 1593 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
1594 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1594 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1595 } 1595 }
1596 1596
1597 /** 1597 /**
1598 * ftrace_bug - report and shutdown function tracer 1598 * ftrace_bug - report and shutdown function tracer
1599 * @failed: The failed type (EFAULT, EINVAL, EPERM) 1599 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1600 * @ip: The address that failed 1600 * @ip: The address that failed
1601 * 1601 *
1602 * The arch code that enables or disables the function tracing 1602 * The arch code that enables or disables the function tracing
1603 * can call ftrace_bug() when it has detected a problem in 1603 * can call ftrace_bug() when it has detected a problem in
1604 * modifying the code. @failed should be one of either: 1604 * modifying the code. @failed should be one of either:
1605 * EFAULT - if the problem happens on reading the @ip address 1605 * EFAULT - if the problem happens on reading the @ip address
1606 * EINVAL - if what is read at @ip is not what was expected 1606 * EINVAL - if what is read at @ip is not what was expected
1607  *  EPERM - if the problem happens on writing to the @ip address 1607  *  EPERM - if the problem happens on writing to the @ip address
1608 */ 1608 */
1609 void ftrace_bug(int failed, unsigned long ip) 1609 void ftrace_bug(int failed, unsigned long ip)
1610 { 1610 {
1611 switch (failed) { 1611 switch (failed) {
1612 case -EFAULT: 1612 case -EFAULT:
1613 FTRACE_WARN_ON_ONCE(1); 1613 FTRACE_WARN_ON_ONCE(1);
1614 pr_info("ftrace faulted on modifying "); 1614 pr_info("ftrace faulted on modifying ");
1615 print_ip_sym(ip); 1615 print_ip_sym(ip);
1616 break; 1616 break;
1617 case -EINVAL: 1617 case -EINVAL:
1618 FTRACE_WARN_ON_ONCE(1); 1618 FTRACE_WARN_ON_ONCE(1);
1619 pr_info("ftrace failed to modify "); 1619 pr_info("ftrace failed to modify ");
1620 print_ip_sym(ip); 1620 print_ip_sym(ip);
1621 print_ip_ins(" actual: ", (unsigned char *)ip); 1621 print_ip_ins(" actual: ", (unsigned char *)ip);
1622 printk(KERN_CONT "\n"); 1622 printk(KERN_CONT "\n");
1623 break; 1623 break;
1624 case -EPERM: 1624 case -EPERM:
1625 FTRACE_WARN_ON_ONCE(1); 1625 FTRACE_WARN_ON_ONCE(1);
1626 pr_info("ftrace faulted on writing "); 1626 pr_info("ftrace faulted on writing ");
1627 print_ip_sym(ip); 1627 print_ip_sym(ip);
1628 break; 1628 break;
1629 default: 1629 default:
1630 FTRACE_WARN_ON_ONCE(1); 1630 FTRACE_WARN_ON_ONCE(1);
1631 pr_info("ftrace faulted on unknown error "); 1631 pr_info("ftrace faulted on unknown error ");
1632 print_ip_sym(ip); 1632 print_ip_sym(ip);
1633 } 1633 }
1634 } 1634 }
1635 1635
1636 static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1636 static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1637 { 1637 {
1638 unsigned long flag = 0UL; 1638 unsigned long flag = 0UL;
1639 1639
1640 /* 1640 /*
1641 * If we are updating calls: 1641 * If we are updating calls:
1642 * 1642 *
1643 * If the record has a ref count, then we need to enable it 1643 * If the record has a ref count, then we need to enable it
1644 * because someone is using it. 1644 * because someone is using it.
1645 * 1645 *
1646  * Otherwise we make sure it's disabled. 1646  * Otherwise we make sure it's disabled.
1647 * 1647 *
1648 * If we are disabling calls, then disable all records that 1648 * If we are disabling calls, then disable all records that
1649 * are enabled. 1649 * are enabled.
1650 */ 1650 */
1651 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1651 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1652 flag = FTRACE_FL_ENABLED; 1652 flag = FTRACE_FL_ENABLED;
1653 1653
1654 /* 1654 /*
1655 * If enabling and the REGS flag does not match the REGS_EN, then 1655 * If enabling and the REGS flag does not match the REGS_EN, then
1656 * do not ignore this record. Set flags to fail the compare against 1656 * do not ignore this record. Set flags to fail the compare against
1657 * ENABLED. 1657 * ENABLED.
1658 */ 1658 */
1659 if (flag && 1659 if (flag &&
1660 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) 1660 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
1661 flag |= FTRACE_FL_REGS; 1661 flag |= FTRACE_FL_REGS;
1662 1662
1663 /* If the state of this record hasn't changed, then do nothing */ 1663 /* If the state of this record hasn't changed, then do nothing */
1664 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1664 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1665 return FTRACE_UPDATE_IGNORE; 1665 return FTRACE_UPDATE_IGNORE;
1666 1666
1667 if (flag) { 1667 if (flag) {
1668 /* Save off if rec is being enabled (for return value) */ 1668 /* Save off if rec is being enabled (for return value) */
1669 flag ^= rec->flags & FTRACE_FL_ENABLED; 1669 flag ^= rec->flags & FTRACE_FL_ENABLED;
1670 1670
1671 if (update) { 1671 if (update) {
1672 rec->flags |= FTRACE_FL_ENABLED; 1672 rec->flags |= FTRACE_FL_ENABLED;
1673 if (flag & FTRACE_FL_REGS) { 1673 if (flag & FTRACE_FL_REGS) {
1674 if (rec->flags & FTRACE_FL_REGS) 1674 if (rec->flags & FTRACE_FL_REGS)
1675 rec->flags |= FTRACE_FL_REGS_EN; 1675 rec->flags |= FTRACE_FL_REGS_EN;
1676 else 1676 else
1677 rec->flags &= ~FTRACE_FL_REGS_EN; 1677 rec->flags &= ~FTRACE_FL_REGS_EN;
1678 } 1678 }
1679 } 1679 }
1680 1680
1681 /* 1681 /*
1682 * If this record is being updated from a nop, then 1682 * If this record is being updated from a nop, then
1683 * return UPDATE_MAKE_CALL. 1683 * return UPDATE_MAKE_CALL.
1684 * Otherwise, if the EN flag is set, then return 1684 * Otherwise, if the EN flag is set, then return
1685 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert 1685 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1686 * from the non-save regs, to a save regs function. 1686 * from the non-save regs, to a save regs function.
1687 * Otherwise, 1687 * Otherwise,
1688 * return UPDATE_MODIFY_CALL to tell the caller to convert 1688 * return UPDATE_MODIFY_CALL to tell the caller to convert
1689 * from the save regs, to a non-save regs function. 1689 * from the save regs, to a non-save regs function.
1690 */ 1690 */
1691 if (flag & FTRACE_FL_ENABLED) 1691 if (flag & FTRACE_FL_ENABLED)
1692 return FTRACE_UPDATE_MAKE_CALL; 1692 return FTRACE_UPDATE_MAKE_CALL;
1693 else if (rec->flags & FTRACE_FL_REGS_EN) 1693 else if (rec->flags & FTRACE_FL_REGS_EN)
1694 return FTRACE_UPDATE_MODIFY_CALL_REGS; 1694 return FTRACE_UPDATE_MODIFY_CALL_REGS;
1695 else 1695 else
1696 return FTRACE_UPDATE_MODIFY_CALL; 1696 return FTRACE_UPDATE_MODIFY_CALL;
1697 } 1697 }
1698 1698
1699 if (update) { 1699 if (update) {
1700 /* If there's no more users, clear all flags */ 1700 /* If there's no more users, clear all flags */
1701 if (!(rec->flags & ~FTRACE_FL_MASK)) 1701 if (!(rec->flags & ~FTRACE_FL_MASK))
1702 rec->flags = 0; 1702 rec->flags = 0;
1703 else 1703 else
1704 /* Just disable the record (keep REGS state) */ 1704 /* Just disable the record (keep REGS state) */
1705 rec->flags &= ~FTRACE_FL_ENABLED; 1705 rec->flags &= ~FTRACE_FL_ENABLED;
1706 } 1706 }
1707 1707
1708 return FTRACE_UPDATE_MAKE_NOP; 1708 return FTRACE_UPDATE_MAKE_NOP;
1709 } 1709 }
1710 1710
1711 /** 1711 /**
1712  * ftrace_update_record, set a record that is now tracing or not 1712  * ftrace_update_record, set a record that is now tracing or not
1713 * @rec: the record to update 1713 * @rec: the record to update
1714 * @enable: set to 1 if the record is tracing, zero to force disable 1714 * @enable: set to 1 if the record is tracing, zero to force disable
1715 * 1715 *
1716 * The records that represent all functions that can be traced need 1716 * The records that represent all functions that can be traced need
1717 * to be updated when tracing has been enabled. 1717 * to be updated when tracing has been enabled.
1718 */ 1718 */
1719 int ftrace_update_record(struct dyn_ftrace *rec, int enable) 1719 int ftrace_update_record(struct dyn_ftrace *rec, int enable)
1720 { 1720 {
1721 return ftrace_check_record(rec, enable, 1); 1721 return ftrace_check_record(rec, enable, 1);
1722 } 1722 }
1723 1723
1724 /** 1724 /**
1725 * ftrace_test_record, check if the record has been enabled or not 1725 * ftrace_test_record, check if the record has been enabled or not
1726 * @rec: the record to test 1726 * @rec: the record to test
1727 * @enable: set to 1 to check if enabled, 0 if it is disabled 1727 * @enable: set to 1 to check if enabled, 0 if it is disabled
1728 * 1728 *
1729 * The arch code may need to test if a record is already set to 1729 * The arch code may need to test if a record is already set to
1730 * tracing to determine how to modify the function code that it 1730 * tracing to determine how to modify the function code that it
1731 * represents. 1731 * represents.
1732 */ 1732 */
1733 int ftrace_test_record(struct dyn_ftrace *rec, int enable) 1733 int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1734 { 1734 {
1735 return ftrace_check_record(rec, enable, 0); 1735 return ftrace_check_record(rec, enable, 0);
1736 } 1736 }
1737 1737
1738 static int 1738 static int
1739 __ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1739 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1740 { 1740 {
1741 unsigned long ftrace_old_addr; 1741 unsigned long ftrace_old_addr;
1742 unsigned long ftrace_addr; 1742 unsigned long ftrace_addr;
1743 int ret; 1743 int ret;
1744 1744
1745 ret = ftrace_update_record(rec, enable); 1745 ret = ftrace_update_record(rec, enable);
1746 1746
1747 if (rec->flags & FTRACE_FL_REGS) 1747 if (rec->flags & FTRACE_FL_REGS)
1748 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; 1748 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
1749 else 1749 else
1750 ftrace_addr = (unsigned long)FTRACE_ADDR; 1750 ftrace_addr = (unsigned long)FTRACE_ADDR;
1751 1751
1752 switch (ret) { 1752 switch (ret) {
1753 case FTRACE_UPDATE_IGNORE: 1753 case FTRACE_UPDATE_IGNORE:
1754 return 0; 1754 return 0;
1755 1755
1756 case FTRACE_UPDATE_MAKE_CALL: 1756 case FTRACE_UPDATE_MAKE_CALL:
1757 return ftrace_make_call(rec, ftrace_addr); 1757 return ftrace_make_call(rec, ftrace_addr);
1758 1758
1759 case FTRACE_UPDATE_MAKE_NOP: 1759 case FTRACE_UPDATE_MAKE_NOP:
1760 return ftrace_make_nop(NULL, rec, ftrace_addr); 1760 return ftrace_make_nop(NULL, rec, ftrace_addr);
1761 1761
1762 case FTRACE_UPDATE_MODIFY_CALL_REGS: 1762 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1763 case FTRACE_UPDATE_MODIFY_CALL: 1763 case FTRACE_UPDATE_MODIFY_CALL:
1764 if (rec->flags & FTRACE_FL_REGS) 1764 if (rec->flags & FTRACE_FL_REGS)
1765 ftrace_old_addr = (unsigned long)FTRACE_ADDR; 1765 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1766 else 1766 else
1767 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; 1767 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1768 1768
1769 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 1769 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1770 } 1770 }
1771 1771
1772 	return -1; /* unknown ftrace bug */ 1772 	return -1; /* unknown ftrace bug */
1773 } 1773 }
1774 1774
1775 void __weak ftrace_replace_code(int enable) 1775 void __weak ftrace_replace_code(int enable)
1776 { 1776 {
1777 struct dyn_ftrace *rec; 1777 struct dyn_ftrace *rec;
1778 struct ftrace_page *pg; 1778 struct ftrace_page *pg;
1779 int failed; 1779 int failed;
1780 1780
1781 if (unlikely(ftrace_disabled)) 1781 if (unlikely(ftrace_disabled))
1782 return; 1782 return;
1783 1783
1784 do_for_each_ftrace_rec(pg, rec) { 1784 do_for_each_ftrace_rec(pg, rec) {
1785 failed = __ftrace_replace_code(rec, enable); 1785 failed = __ftrace_replace_code(rec, enable);
1786 if (failed) { 1786 if (failed) {
1787 ftrace_bug(failed, rec->ip); 1787 ftrace_bug(failed, rec->ip);
1788 /* Stop processing */ 1788 /* Stop processing */
1789 return; 1789 return;
1790 } 1790 }
1791 } while_for_each_ftrace_rec(); 1791 } while_for_each_ftrace_rec();
1792 } 1792 }
1793 1793
1794 struct ftrace_rec_iter { 1794 struct ftrace_rec_iter {
1795 struct ftrace_page *pg; 1795 struct ftrace_page *pg;
1796 int index; 1796 int index;
1797 }; 1797 };
1798 1798
1799 /** 1799 /**
1800 * ftrace_rec_iter_start, start up iterating over traced functions 1800 * ftrace_rec_iter_start, start up iterating over traced functions
1801 * 1801 *
1802 * Returns an iterator handle that is used to iterate over all 1802 * Returns an iterator handle that is used to iterate over all
1803 * the records that represent address locations where functions 1803 * the records that represent address locations where functions
1804 * are traced. 1804 * are traced.
1805 * 1805 *
1806 * May return NULL if no records are available. 1806 * May return NULL if no records are available.
1807 */ 1807 */
1808 struct ftrace_rec_iter *ftrace_rec_iter_start(void) 1808 struct ftrace_rec_iter *ftrace_rec_iter_start(void)
1809 { 1809 {
1810 /* 1810 /*
1811 * We only use a single iterator. 1811 * We only use a single iterator.
1812 * Protected by the ftrace_lock mutex. 1812 * Protected by the ftrace_lock mutex.
1813 */ 1813 */
1814 static struct ftrace_rec_iter ftrace_rec_iter; 1814 static struct ftrace_rec_iter ftrace_rec_iter;
1815 struct ftrace_rec_iter *iter = &ftrace_rec_iter; 1815 struct ftrace_rec_iter *iter = &ftrace_rec_iter;
1816 1816
1817 iter->pg = ftrace_pages_start; 1817 iter->pg = ftrace_pages_start;
1818 iter->index = 0; 1818 iter->index = 0;
1819 1819
1820 /* Could have empty pages */ 1820 /* Could have empty pages */
1821 while (iter->pg && !iter->pg->index) 1821 while (iter->pg && !iter->pg->index)
1822 iter->pg = iter->pg->next; 1822 iter->pg = iter->pg->next;
1823 1823
1824 if (!iter->pg) 1824 if (!iter->pg)
1825 return NULL; 1825 return NULL;
1826 1826
1827 return iter; 1827 return iter;
1828 } 1828 }
1829 1829
1830 /** 1830 /**
1831 * ftrace_rec_iter_next, get the next record to process. 1831 * ftrace_rec_iter_next, get the next record to process.
1832 * @iter: The handle to the iterator. 1832 * @iter: The handle to the iterator.
1833 * 1833 *
1834 * Returns the next iterator after the given iterator @iter. 1834 * Returns the next iterator after the given iterator @iter.
1835 */ 1835 */
1836 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) 1836 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
1837 { 1837 {
1838 iter->index++; 1838 iter->index++;
1839 1839
1840 if (iter->index >= iter->pg->index) { 1840 if (iter->index >= iter->pg->index) {
1841 iter->pg = iter->pg->next; 1841 iter->pg = iter->pg->next;
1842 iter->index = 0; 1842 iter->index = 0;
1843 1843
1844 /* Could have empty pages */ 1844 /* Could have empty pages */
1845 while (iter->pg && !iter->pg->index) 1845 while (iter->pg && !iter->pg->index)
1846 iter->pg = iter->pg->next; 1846 iter->pg = iter->pg->next;
1847 } 1847 }
1848 1848
1849 if (!iter->pg) 1849 if (!iter->pg)
1850 return NULL; 1850 return NULL;
1851 1851
1852 return iter; 1852 return iter;
1853 } 1853 }
1854 1854
1855 /** 1855 /**
1856 * ftrace_rec_iter_record, get the record at the iterator location 1856 * ftrace_rec_iter_record, get the record at the iterator location
1857 * @iter: The current iterator location 1857 * @iter: The current iterator location
1858 * 1858 *
1859 * Returns the record that the current @iter is at. 1859 * Returns the record that the current @iter is at.
1860 */ 1860 */
1861 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) 1861 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
1862 { 1862 {
1863 return &iter->pg->records[iter->index]; 1863 return &iter->pg->records[iter->index];
1864 } 1864 }
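
Together the three helpers form a simple cursor over every traced record. A sketch of how arch code might walk it (a kernel-context fragment, not standalone; it assumes the caller holds ftrace_lock, which the iterator's single static instance requires):

        /* Sketch only: visit every record under ftrace_lock */
        struct ftrace_rec_iter *iter;
        struct dyn_ftrace *rec;

        for (iter = ftrace_rec_iter_start(); iter;
             iter = ftrace_rec_iter_next(iter)) {
                rec = ftrace_rec_iter_record(iter);
                /* inspect or patch the call site at rec->ip here */
        }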
1865 1865
1866 static int 1866 static int
1867 ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 1867 ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1868 { 1868 {
1869 unsigned long ip; 1869 unsigned long ip;
1870 int ret; 1870 int ret;
1871 1871
1872 ip = rec->ip; 1872 ip = rec->ip;
1873 1873
1874 if (unlikely(ftrace_disabled)) 1874 if (unlikely(ftrace_disabled))
1875 return 0; 1875 return 0;
1876 1876
1877 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1877 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1878 if (ret) { 1878 if (ret) {
1879 ftrace_bug(ret, ip); 1879 ftrace_bug(ret, ip);
1880 return 0; 1880 return 0;
1881 } 1881 }
1882 return 1; 1882 return 1;
1883 } 1883 }
1884 1884
1885 /* 1885 /*
1886 * archs can override this function if they must do something 1886 * archs can override this function if they must do something
1887 * before the modifying code is performed. 1887 * before the modifying code is performed.
1888 */ 1888 */
1889 int __weak ftrace_arch_code_modify_prepare(void) 1889 int __weak ftrace_arch_code_modify_prepare(void)
1890 { 1890 {
1891 return 0; 1891 return 0;
1892 } 1892 }
1893 1893
1894 /* 1894 /*
1895 * archs can override this function if they must do something 1895 * archs can override this function if they must do something
1896 * after the modifying code is performed. 1896 * after the modifying code is performed.
1897 */ 1897 */
1898 int __weak ftrace_arch_code_modify_post_process(void) 1898 int __weak ftrace_arch_code_modify_post_process(void)
1899 { 1899 {
1900 return 0; 1900 return 0;
1901 } 1901 }
1902 1902
1903 void ftrace_modify_all_code(int command) 1903 void ftrace_modify_all_code(int command)
1904 { 1904 {
1905 if (command & FTRACE_UPDATE_CALLS) 1905 if (command & FTRACE_UPDATE_CALLS)
1906 ftrace_replace_code(1); 1906 ftrace_replace_code(1);
1907 else if (command & FTRACE_DISABLE_CALLS) 1907 else if (command & FTRACE_DISABLE_CALLS)
1908 ftrace_replace_code(0); 1908 ftrace_replace_code(0);
1909 1909
1910 if (command & FTRACE_UPDATE_TRACE_FUNC) 1910 if (command & FTRACE_UPDATE_TRACE_FUNC)
1911 ftrace_update_ftrace_func(ftrace_trace_function); 1911 ftrace_update_ftrace_func(ftrace_trace_function);
1912 1912
1913 if (command & FTRACE_START_FUNC_RET) 1913 if (command & FTRACE_START_FUNC_RET)
1914 ftrace_enable_ftrace_graph_caller(); 1914 ftrace_enable_ftrace_graph_caller();
1915 else if (command & FTRACE_STOP_FUNC_RET) 1915 else if (command & FTRACE_STOP_FUNC_RET)
1916 ftrace_disable_ftrace_graph_caller(); 1916 ftrace_disable_ftrace_graph_caller();
1917 } 1917 }
1918 1918
1919 static int __ftrace_modify_code(void *data) 1919 static int __ftrace_modify_code(void *data)
1920 { 1920 {
1921 int *command = data; 1921 int *command = data;
1922 1922
1923 ftrace_modify_all_code(*command); 1923 ftrace_modify_all_code(*command);
1924 1924
1925 return 0; 1925 return 0;
1926 } 1926 }
1927 1927
1928 /** 1928 /**
1929 * ftrace_run_stop_machine, go back to the stop machine method 1929 * ftrace_run_stop_machine, go back to the stop machine method
1930 * @command: The command to tell ftrace what to do 1930 * @command: The command to tell ftrace what to do
1931 * 1931 *
1932  * If an arch needs to fall back to the stop machine method, then 1932  * If an arch needs to fall back to the stop machine method, then
1933 * it can call this function. 1933 * it can call this function.
1934 */ 1934 */
1935 void ftrace_run_stop_machine(int command) 1935 void ftrace_run_stop_machine(int command)
1936 { 1936 {
1937 stop_machine(__ftrace_modify_code, &command, NULL); 1937 stop_machine(__ftrace_modify_code, &command, NULL);
1938 } 1938 }
1939 1939
1940 /** 1940 /**
1941 * arch_ftrace_update_code, modify the code to trace or not trace 1941 * arch_ftrace_update_code, modify the code to trace or not trace
1942 * @command: The command that needs to be done 1942 * @command: The command that needs to be done
1943 * 1943 *
1944  * Archs can override this function if they do not need to 1944  * Archs can override this function if they do not need to
1945 * run stop_machine() to modify code. 1945 * run stop_machine() to modify code.
1946 */ 1946 */
1947 void __weak arch_ftrace_update_code(int command) 1947 void __weak arch_ftrace_update_code(int command)
1948 { 1948 {
1949 ftrace_run_stop_machine(command); 1949 ftrace_run_stop_machine(command);
1950 } 1950 }
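
An arch that can patch its text safely while other CPUs run would override this weak hook and drive ftrace_modify_all_code() directly instead of going through stop_machine(). A hypothetical minimal override (a sketch, not a real port; the preparation comment is a placeholder for whatever synchronization the arch needs):

        /* Hypothetical arch override: modify code without stop_machine() */
        void arch_ftrace_update_code(int command)
        {
                /* arch-specific synchronization/prep would go here */
                ftrace_modify_all_code(command);
        }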
1951 1951
1952 static void ftrace_run_update_code(int command) 1952 static void ftrace_run_update_code(int command)
1953 { 1953 {
1954 int ret; 1954 int ret;
1955 1955
1956 ret = ftrace_arch_code_modify_prepare(); 1956 ret = ftrace_arch_code_modify_prepare();
1957 FTRACE_WARN_ON(ret); 1957 FTRACE_WARN_ON(ret);
1958 if (ret) 1958 if (ret)
1959 return; 1959 return;
1960 /* 1960 /*
1961 * Do not call function tracer while we update the code. 1961 * Do not call function tracer while we update the code.
1962 * We are in stop machine. 1962 * We are in stop machine.
1963 */ 1963 */
1964 function_trace_stop++; 1964 function_trace_stop++;
1965 1965
1966 /* 1966 /*
1967 * By default we use stop_machine() to modify the code. 1967 * By default we use stop_machine() to modify the code.
1968 	 * But archs can do whatever they want as long as it 1968 	 * But archs can do whatever they want as long as it
1969 	 * is safe. stop_machine() is the safest, but also 1969 	 * is safe. stop_machine() is the safest, but also
1970 * produces the most overhead. 1970 * produces the most overhead.
1971 */ 1971 */
1972 arch_ftrace_update_code(command); 1972 arch_ftrace_update_code(command);
1973 1973
1974 function_trace_stop--; 1974 function_trace_stop--;
1975 1975
1976 ret = ftrace_arch_code_modify_post_process(); 1976 ret = ftrace_arch_code_modify_post_process();
1977 FTRACE_WARN_ON(ret); 1977 FTRACE_WARN_ON(ret);
1978 } 1978 }
1979 1979
1980 static ftrace_func_t saved_ftrace_func; 1980 static ftrace_func_t saved_ftrace_func;
1981 static int ftrace_start_up; 1981 static int ftrace_start_up;
1982 static int global_start_up; 1982 static int global_start_up;
1983 1983
1984 static void ftrace_startup_enable(int command) 1984 static void ftrace_startup_enable(int command)
1985 { 1985 {
1986 if (saved_ftrace_func != ftrace_trace_function) { 1986 if (saved_ftrace_func != ftrace_trace_function) {
1987 saved_ftrace_func = ftrace_trace_function; 1987 saved_ftrace_func = ftrace_trace_function;
1988 command |= FTRACE_UPDATE_TRACE_FUNC; 1988 command |= FTRACE_UPDATE_TRACE_FUNC;
1989 } 1989 }
1990 1990
1991 if (!command || !ftrace_enabled) 1991 if (!command || !ftrace_enabled)
1992 return; 1992 return;
1993 1993
1994 ftrace_run_update_code(command); 1994 ftrace_run_update_code(command);
1995 } 1995 }
1996 1996
1997 static int ftrace_startup(struct ftrace_ops *ops, int command) 1997 static int ftrace_startup(struct ftrace_ops *ops, int command)
1998 { 1998 {
1999 bool hash_enable = true; 1999 bool hash_enable = true;
2000 2000
2001 if (unlikely(ftrace_disabled)) 2001 if (unlikely(ftrace_disabled))
2002 return -ENODEV; 2002 return -ENODEV;
2003 2003
2004 ftrace_start_up++; 2004 ftrace_start_up++;
2005 command |= FTRACE_UPDATE_CALLS; 2005 command |= FTRACE_UPDATE_CALLS;
2006 2006
2007 /* ops marked global share the filter hashes */ 2007 /* ops marked global share the filter hashes */
2008 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 2008 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
2009 ops = &global_ops; 2009 ops = &global_ops;
2010 /* Don't update hash if global is already set */ 2010 /* Don't update hash if global is already set */
2011 if (global_start_up) 2011 if (global_start_up)
2012 hash_enable = false; 2012 hash_enable = false;
2013 global_start_up++; 2013 global_start_up++;
2014 } 2014 }
2015 2015
2016 ops->flags |= FTRACE_OPS_FL_ENABLED; 2016 ops->flags |= FTRACE_OPS_FL_ENABLED;
2017 if (hash_enable) 2017 if (hash_enable)
2018 ftrace_hash_rec_enable(ops, 1); 2018 ftrace_hash_rec_enable(ops, 1);
2019 2019
2020 ftrace_startup_enable(command); 2020 ftrace_startup_enable(command);
2021 2021
2022 return 0; 2022 return 0;
2023 } 2023 }
2024 2024
2025 static void ftrace_shutdown(struct ftrace_ops *ops, int command) 2025 static void ftrace_shutdown(struct ftrace_ops *ops, int command)
2026 { 2026 {
2027 bool hash_disable = true; 2027 bool hash_disable = true;
2028 2028
2029 if (unlikely(ftrace_disabled)) 2029 if (unlikely(ftrace_disabled))
2030 return; 2030 return;
2031 2031
2032 ftrace_start_up--; 2032 ftrace_start_up--;
2033 /* 2033 /*
2034 	 * Just warn in case of imbalance; no need to kill ftrace, it's not 2034 	 * Just warn in case of imbalance; no need to kill ftrace, it's not
2035 	 * critical, but the ftrace_call callers may never be nopped again after 2035 	 * critical, but the ftrace_call callers may never be nopped again after
2036 * further ftrace uses. 2036 * further ftrace uses.
2037 */ 2037 */
2038 WARN_ON_ONCE(ftrace_start_up < 0); 2038 WARN_ON_ONCE(ftrace_start_up < 0);
2039 2039
2040 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 2040 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
2041 ops = &global_ops; 2041 ops = &global_ops;
2042 global_start_up--; 2042 global_start_up--;
2043 WARN_ON_ONCE(global_start_up < 0); 2043 WARN_ON_ONCE(global_start_up < 0);
2044 /* Don't update hash if global still has users */ 2044 /* Don't update hash if global still has users */
2045 if (global_start_up) { 2045 if (global_start_up) {
2046 WARN_ON_ONCE(!ftrace_start_up); 2046 WARN_ON_ONCE(!ftrace_start_up);
2047 hash_disable = false; 2047 hash_disable = false;
2048 } 2048 }
2049 } 2049 }
2050 2050
2051 if (hash_disable) 2051 if (hash_disable)
2052 ftrace_hash_rec_disable(ops, 1); 2052 ftrace_hash_rec_disable(ops, 1);
2053 2053
2054 if (ops != &global_ops || !global_start_up) 2054 if (ops != &global_ops || !global_start_up)
2055 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2055 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2056 2056
2057 command |= FTRACE_UPDATE_CALLS; 2057 command |= FTRACE_UPDATE_CALLS;
2058 2058
2059 if (saved_ftrace_func != ftrace_trace_function) { 2059 if (saved_ftrace_func != ftrace_trace_function) {
2060 saved_ftrace_func = ftrace_trace_function; 2060 saved_ftrace_func = ftrace_trace_function;
2061 command |= FTRACE_UPDATE_TRACE_FUNC; 2061 command |= FTRACE_UPDATE_TRACE_FUNC;
2062 } 2062 }
2063 2063
2064 if (!command || !ftrace_enabled) 2064 if (!command || !ftrace_enabled)
2065 return; 2065 return;
2066 2066
2067 ftrace_run_update_code(command); 2067 ftrace_run_update_code(command);
2068 } 2068 }
2069 2069
2070 static void ftrace_startup_sysctl(void) 2070 static void ftrace_startup_sysctl(void)
2071 { 2071 {
2072 if (unlikely(ftrace_disabled)) 2072 if (unlikely(ftrace_disabled))
2073 return; 2073 return;
2074 2074
2075 /* Force update next time */ 2075 /* Force update next time */
2076 saved_ftrace_func = NULL; 2076 saved_ftrace_func = NULL;
2077 /* ftrace_start_up is true if we want ftrace running */ 2077 /* ftrace_start_up is true if we want ftrace running */
2078 if (ftrace_start_up) 2078 if (ftrace_start_up)
2079 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 2079 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2080 } 2080 }
2081 2081
2082 static void ftrace_shutdown_sysctl(void) 2082 static void ftrace_shutdown_sysctl(void)
2083 { 2083 {
2084 if (unlikely(ftrace_disabled)) 2084 if (unlikely(ftrace_disabled))
2085 return; 2085 return;
2086 2086
2087 /* ftrace_start_up is true if ftrace is running */ 2087 /* ftrace_start_up is true if ftrace is running */
2088 if (ftrace_start_up) 2088 if (ftrace_start_up)
2089 ftrace_run_update_code(FTRACE_DISABLE_CALLS); 2089 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
2090 } 2090 }
2091 2091
2092 static cycle_t ftrace_update_time; 2092 static cycle_t ftrace_update_time;
2093 static unsigned long ftrace_update_cnt; 2093 static unsigned long ftrace_update_cnt;
2094 unsigned long ftrace_update_tot_cnt; 2094 unsigned long ftrace_update_tot_cnt;
2095 2095
2096 static int ops_traces_mod(struct ftrace_ops *ops) 2096 static int ops_traces_mod(struct ftrace_ops *ops)
2097 { 2097 {
2098 struct ftrace_hash *hash; 2098 struct ftrace_hash *hash;
2099 2099
2100 hash = ops->filter_hash; 2100 hash = ops->filter_hash;
2101 return ftrace_hash_empty(hash); 2101 return ftrace_hash_empty(hash);
2102 } 2102 }
2103 2103
2104 static int ftrace_update_code(struct module *mod) 2104 static int ftrace_update_code(struct module *mod)
2105 { 2105 {
2106 struct ftrace_page *pg; 2106 struct ftrace_page *pg;
2107 struct dyn_ftrace *p; 2107 struct dyn_ftrace *p;
2108 cycle_t start, stop; 2108 cycle_t start, stop;
2109 unsigned long ref = 0; 2109 unsigned long ref = 0;
2110 int i; 2110 int i;
2111 2111
2112 /* 2112 /*
2113 * When adding a module, we need to check if tracers are 2113 * When adding a module, we need to check if tracers are
2114 * currently enabled and if they are set to trace all functions. 2114 * currently enabled and if they are set to trace all functions.
2115 * If they are, we need to enable the module functions as well 2115 * If they are, we need to enable the module functions as well
2116 * as update the reference counts for those function records. 2116 * as update the reference counts for those function records.
2117 */ 2117 */
2118 if (mod) { 2118 if (mod) {
2119 struct ftrace_ops *ops; 2119 struct ftrace_ops *ops;
2120 2120
2121 for (ops = ftrace_ops_list; 2121 for (ops = ftrace_ops_list;
2122 ops != &ftrace_list_end; ops = ops->next) { 2122 ops != &ftrace_list_end; ops = ops->next) {
2123 if (ops->flags & FTRACE_OPS_FL_ENABLED && 2123 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
2124 ops_traces_mod(ops)) 2124 ops_traces_mod(ops))
2125 ref++; 2125 ref++;
2126 } 2126 }
2127 } 2127 }
2128 2128
2129 start = ftrace_now(raw_smp_processor_id()); 2129 start = ftrace_now(raw_smp_processor_id());
2130 ftrace_update_cnt = 0; 2130 ftrace_update_cnt = 0;
2131 2131
2132 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 2132 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
2133 2133
2134 for (i = 0; i < pg->index; i++) { 2134 for (i = 0; i < pg->index; i++) {
2135 /* If something went wrong, bail without enabling anything */ 2135 /* If something went wrong, bail without enabling anything */
2136 if (unlikely(ftrace_disabled)) 2136 if (unlikely(ftrace_disabled))
2137 return -1; 2137 return -1;
2138 2138
2139 p = &pg->records[i]; 2139 p = &pg->records[i];
2140 p->flags = ref; 2140 p->flags = ref;
2141 2141
2142 /* 2142 /*
2143 * Do the initial record conversion from mcount jump 2143 * Do the initial record conversion from mcount jump
2144 * to the NOP instructions. 2144 * to the NOP instructions.
2145 */ 2145 */
2146 if (!ftrace_code_disable(mod, p)) 2146 if (!ftrace_code_disable(mod, p))
2147 break; 2147 break;
2148 2148
2149 ftrace_update_cnt++; 2149 ftrace_update_cnt++;
2150 2150
2151 /* 2151 /*
2152 * If the tracing is enabled, go ahead and enable the record. 2152 * If the tracing is enabled, go ahead and enable the record.
2153 * 2153 *
2154 			 * The reason not to enable the record immediately is the 2154 			 * The reason not to enable the record immediately is the
2155 			 * inherent check of ftrace_make_nop/ftrace_make_call for 2155 			 * inherent check of ftrace_make_nop/ftrace_make_call for
2156 			 * correct previous instructions. Doing the NOP 2156 			 * correct previous instructions. Doing the NOP
2157 			 * conversion first puts the module into the correct state, thus 2157 			 * conversion first puts the module into the correct state, thus
2158 * passing the ftrace_make_call check. 2158 * passing the ftrace_make_call check.
2159 */ 2159 */
2160 if (ftrace_start_up && ref) { 2160 if (ftrace_start_up && ref) {
2161 int failed = __ftrace_replace_code(p, 1); 2161 int failed = __ftrace_replace_code(p, 1);
2162 if (failed) 2162 if (failed)
2163 ftrace_bug(failed, p->ip); 2163 ftrace_bug(failed, p->ip);
2164 } 2164 }
2165 } 2165 }
2166 } 2166 }
2167 2167
2168 ftrace_new_pgs = NULL; 2168 ftrace_new_pgs = NULL;
2169 2169
2170 stop = ftrace_now(raw_smp_processor_id()); 2170 stop = ftrace_now(raw_smp_processor_id());
2171 ftrace_update_time = stop - start; 2171 ftrace_update_time = stop - start;
2172 ftrace_update_tot_cnt += ftrace_update_cnt; 2172 ftrace_update_tot_cnt += ftrace_update_cnt;
2173 2173
2174 return 0; 2174 return 0;
2175 } 2175 }
2176 2176
2177 static int ftrace_allocate_records(struct ftrace_page *pg, int count) 2177 static int ftrace_allocate_records(struct ftrace_page *pg, int count)
2178 { 2178 {
2179 int order; 2179 int order;
2180 int cnt; 2180 int cnt;
2181 2181
2182 if (WARN_ON(!count)) 2182 if (WARN_ON(!count))
2183 return -EINVAL; 2183 return -EINVAL;
2184 2184
2185 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); 2185 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
2186 2186
2187 /* 2187 /*
2188 * We want to fill as much as possible. No more than a page 2188 * We want to fill as much as possible. No more than a page
2189 * may be empty. 2189 * may be empty.
2190 */ 2190 */
2191 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) 2191 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
2192 order--; 2192 order--;
2193 2193
2194 again: 2194 again:
2195 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); 2195 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
2196 2196
2197 if (!pg->records) { 2197 if (!pg->records) {
2198 /* if we can't allocate this size, try something smaller */ 2198 /* if we can't allocate this size, try something smaller */
2199 if (!order) 2199 if (!order)
2200 return -ENOMEM; 2200 return -ENOMEM;
2201 order >>= 1; 2201 order >>= 1;
2202 goto again; 2202 goto again;
2203 } 2203 }
2204 2204
2205 cnt = (PAGE_SIZE << order) / ENTRY_SIZE; 2205 cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
2206 pg->size = cnt; 2206 pg->size = cnt;
2207 2207
2208 if (cnt > count) 2208 if (cnt > count)
2209 cnt = count; 2209 cnt = count;
2210 2210
2211 return cnt; 2211 return cnt;
2212 } 2212 }
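
The order computation starts from the smallest power-of-two block of pages that can hold count records, then shrinks it while the block would leave at least a full page of unused slots; if the final block still holds fewer than count, the function simply returns how many fit and the caller chains another ftrace_page. A runnable user-space rendition (ENTRY_SIZE here is an assumed sizeof(struct dyn_ftrace) on 64-bit):

        #include <stdio.h>

        #define PAGE_SIZE       4096UL
        #define ENTRY_SIZE      16UL    /* assumed sizeof(struct dyn_ftrace) */
        #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)

        /* ceil(log2(n)) for n > 0, like the kernel's get_count_order() */
        static int count_order(unsigned long n)
        {
                int order = 0;

                while ((1UL << order) < n)
                        order++;
                return order;
        }

        int main(void)
        {
                unsigned long count = 1000;
                int order = count_order((count + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE);
                unsigned long slots;

                /* Shrink while the block would leave >= one full page of slack. */
                while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
                        order--;

                slots = (PAGE_SIZE << order) / ENTRY_SIZE;
                printf("count=%lu -> order=%d, %lu slots\n", count, order, slots);
                /* If slots < count, the caller loops and allocates another page group. */
                return 0;
        }

With these assumed sizes, count=1000 gives order=2: four pages holding 1024 records, wasting less than a page.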
2213 2213
2214 static struct ftrace_page * 2214 static struct ftrace_page *
2215 ftrace_allocate_pages(unsigned long num_to_init) 2215 ftrace_allocate_pages(unsigned long num_to_init)
2216 { 2216 {
2217 struct ftrace_page *start_pg; 2217 struct ftrace_page *start_pg;
2218 struct ftrace_page *pg; 2218 struct ftrace_page *pg;
2219 int order; 2219 int order;
2220 int cnt; 2220 int cnt;
2221 2221
2222 if (!num_to_init) 2222 if (!num_to_init)
2223 return 0; 2223 return 0;
2224 2224
2225 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); 2225 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
2226 if (!pg) 2226 if (!pg)
2227 return NULL; 2227 return NULL;
2228 2228
2229 /* 2229 /*
2230 	 * Try to allocate as much as possible in one contiguous 2230 	 * Try to allocate as much as possible in one contiguous
2231 * location that fills in all of the space. We want to 2231 * location that fills in all of the space. We want to
2232 * waste as little space as possible. 2232 * waste as little space as possible.
2233 */ 2233 */
2234 for (;;) { 2234 for (;;) {
2235 cnt = ftrace_allocate_records(pg, num_to_init); 2235 cnt = ftrace_allocate_records(pg, num_to_init);
2236 if (cnt < 0) 2236 if (cnt < 0)
2237 goto free_pages; 2237 goto free_pages;
2238 2238
2239 num_to_init -= cnt; 2239 num_to_init -= cnt;
2240 if (!num_to_init) 2240 if (!num_to_init)
2241 break; 2241 break;
2242 2242
2243 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); 2243 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
2244 if (!pg->next) 2244 if (!pg->next)
2245 goto free_pages; 2245 goto free_pages;
2246 2246
2247 pg = pg->next; 2247 pg = pg->next;
2248 } 2248 }
2249 2249
2250 return start_pg; 2250 return start_pg;
2251 2251
2252 free_pages: 2252 free_pages:
2253 while (start_pg) { 2253 while (start_pg) {
2254 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 2254 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2255 free_pages((unsigned long)pg->records, order); 2255 free_pages((unsigned long)pg->records, order);
2256 start_pg = pg->next; 2256 start_pg = pg->next;
2257 kfree(pg); 2257 kfree(pg);
2258 pg = start_pg; 2258 pg = start_pg;
2259 } 2259 }
2260 pr_info("ftrace: FAILED to allocate memory for functions\n"); 2260 pr_info("ftrace: FAILED to allocate memory for functions\n");
2261 return NULL; 2261 return NULL;
2262 } 2262 }
2263 2263
2264 static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) 2264 static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2265 { 2265 {
2266 int cnt; 2266 int cnt;
2267 2267
2268 if (!num_to_init) { 2268 if (!num_to_init) {
2269 pr_info("ftrace: No functions to be traced?\n"); 2269 pr_info("ftrace: No functions to be traced?\n");
2270 return -1; 2270 return -1;
2271 } 2271 }
2272 2272
2273 cnt = num_to_init / ENTRIES_PER_PAGE; 2273 cnt = num_to_init / ENTRIES_PER_PAGE;
2274 pr_info("ftrace: allocating %ld entries in %d pages\n", 2274 pr_info("ftrace: allocating %ld entries in %d pages\n",
2275 num_to_init, cnt + 1); 2275 num_to_init, cnt + 1);
2276 2276
2277 return 0; 2277 return 0;
2278 } 2278 }
2279 2279
2280 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2280 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
2281 2281
2282 struct ftrace_iterator { 2282 struct ftrace_iterator {
2283 loff_t pos; 2283 loff_t pos;
2284 loff_t func_pos; 2284 loff_t func_pos;
2285 struct ftrace_page *pg; 2285 struct ftrace_page *pg;
2286 struct dyn_ftrace *func; 2286 struct dyn_ftrace *func;
2287 struct ftrace_func_probe *probe; 2287 struct ftrace_func_probe *probe;
2288 struct trace_parser parser; 2288 struct trace_parser parser;
2289 struct ftrace_hash *hash; 2289 struct ftrace_hash *hash;
2290 struct ftrace_ops *ops; 2290 struct ftrace_ops *ops;
2291 int hidx; 2291 int hidx;
2292 int idx; 2292 int idx;
2293 unsigned flags; 2293 unsigned flags;
2294 }; 2294 };
2295 2295
2296 static void * 2296 static void *
2297 t_hash_next(struct seq_file *m, loff_t *pos) 2297 t_hash_next(struct seq_file *m, loff_t *pos)
2298 { 2298 {
2299 struct ftrace_iterator *iter = m->private; 2299 struct ftrace_iterator *iter = m->private;
2300 struct hlist_node *hnd = NULL; 2300 struct hlist_node *hnd = NULL;
2301 struct hlist_head *hhd; 2301 struct hlist_head *hhd;
2302 2302
2303 (*pos)++; 2303 (*pos)++;
2304 iter->pos = *pos; 2304 iter->pos = *pos;
2305 2305
2306 if (iter->probe) 2306 if (iter->probe)
2307 hnd = &iter->probe->node; 2307 hnd = &iter->probe->node;
2308 retry: 2308 retry:
2309 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 2309 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
2310 return NULL; 2310 return NULL;
2311 2311
2312 hhd = &ftrace_func_hash[iter->hidx]; 2312 hhd = &ftrace_func_hash[iter->hidx];
2313 2313
2314 if (hlist_empty(hhd)) { 2314 if (hlist_empty(hhd)) {
2315 iter->hidx++; 2315 iter->hidx++;
2316 hnd = NULL; 2316 hnd = NULL;
2317 goto retry; 2317 goto retry;
2318 } 2318 }
2319 2319
2320 if (!hnd) 2320 if (!hnd)
2321 hnd = hhd->first; 2321 hnd = hhd->first;
2322 else { 2322 else {
2323 hnd = hnd->next; 2323 hnd = hnd->next;
2324 if (!hnd) { 2324 if (!hnd) {
2325 iter->hidx++; 2325 iter->hidx++;
2326 goto retry; 2326 goto retry;
2327 } 2327 }
2328 } 2328 }
2329 2329
2330 if (WARN_ON_ONCE(!hnd)) 2330 if (WARN_ON_ONCE(!hnd))
2331 return NULL; 2331 return NULL;
2332 2332
2333 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); 2333 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
2334 2334
2335 return iter; 2335 return iter;
2336 } 2336 }
2337 2337
2338 static void *t_hash_start(struct seq_file *m, loff_t *pos) 2338 static void *t_hash_start(struct seq_file *m, loff_t *pos)
2339 { 2339 {
2340 struct ftrace_iterator *iter = m->private; 2340 struct ftrace_iterator *iter = m->private;
2341 void *p = NULL; 2341 void *p = NULL;
2342 loff_t l; 2342 loff_t l;
2343 2343
2344 if (!(iter->flags & FTRACE_ITER_DO_HASH)) 2344 if (!(iter->flags & FTRACE_ITER_DO_HASH))
2345 return NULL; 2345 return NULL;
2346 2346
2347 if (iter->func_pos > *pos) 2347 if (iter->func_pos > *pos)
2348 return NULL; 2348 return NULL;
2349 2349
2350 iter->hidx = 0; 2350 iter->hidx = 0;
2351 for (l = 0; l <= (*pos - iter->func_pos); ) { 2351 for (l = 0; l <= (*pos - iter->func_pos); ) {
2352 p = t_hash_next(m, &l); 2352 p = t_hash_next(m, &l);
2353 if (!p) 2353 if (!p)
2354 break; 2354 break;
2355 } 2355 }
2356 if (!p) 2356 if (!p)
2357 return NULL; 2357 return NULL;
2358 2358
2359 /* Only set this if we have an item */ 2359 /* Only set this if we have an item */
2360 iter->flags |= FTRACE_ITER_HASH; 2360 iter->flags |= FTRACE_ITER_HASH;
2361 2361
2362 return iter; 2362 return iter;
2363 } 2363 }
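
t_hash_start() follows the seq_file contract: the start() callback must return the element at *pos, and it does so by rewinding and replaying t_hash_next() until that position is reached. A stand-alone model of the replay pattern (plain long stands in for loff_t, and a flat array stands in for the hash buckets):

        #include <stdio.h>

        static int data[] = { 10, 20, 30, 40 };
        static int cursor;

        /* next(): advance the cursor, return NULL at the end */
        static int *demo_next(long *pos)
        {
                (*pos)++;
                cursor++;
                return cursor < 4 ? &data[cursor] : NULL;
        }

        /* start(): position at *pos by replaying next() from the beginning */
        static int *demo_start(long *pos)
        {
                long l;
                int *p = &data[0];

                cursor = 0;
                for (l = 0; l < *pos && p; )
                        p = demo_next(&l);
                return p;
        }

        int main(void)
        {
                long pos = 2;   /* resume mid-stream, as after an lseek on the file */
                int *p = demo_start(&pos);

                while (p) {
                        printf("%d\n", *p);     /* prints 30, then 40 */
                        p = demo_next(&pos);
                }
                return 0;
        }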
2364 2364
2365 static int 2365 static int
2366 t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) 2366 t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
2367 { 2367 {
2368 struct ftrace_func_probe *rec; 2368 struct ftrace_func_probe *rec;
2369 2369
2370 rec = iter->probe; 2370 rec = iter->probe;
2371 if (WARN_ON_ONCE(!rec)) 2371 if (WARN_ON_ONCE(!rec))
2372 return -EIO; 2372 return -EIO;
2373 2373
2374 if (rec->ops->print) 2374 if (rec->ops->print)
2375 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 2375 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
2376 2376
2377 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); 2377 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
2378 2378
2379 if (rec->data) 2379 if (rec->data)
2380 seq_printf(m, ":%p", rec->data); 2380 seq_printf(m, ":%p", rec->data);
2381 seq_putc(m, '\n'); 2381 seq_putc(m, '\n');
2382 2382
2383 return 0; 2383 return 0;
2384 } 2384 }
2385 2385
2386 static void * 2386 static void *
2387 t_next(struct seq_file *m, void *v, loff_t *pos) 2387 t_next(struct seq_file *m, void *v, loff_t *pos)
2388 { 2388 {
2389 struct ftrace_iterator *iter = m->private; 2389 struct ftrace_iterator *iter = m->private;
2390 struct ftrace_ops *ops = iter->ops; 2390 struct ftrace_ops *ops = iter->ops;
2391 struct dyn_ftrace *rec = NULL; 2391 struct dyn_ftrace *rec = NULL;
2392 2392
2393 if (unlikely(ftrace_disabled)) 2393 if (unlikely(ftrace_disabled))
2394 return NULL; 2394 return NULL;
2395 2395
2396 if (iter->flags & FTRACE_ITER_HASH) 2396 if (iter->flags & FTRACE_ITER_HASH)
2397 return t_hash_next(m, pos); 2397 return t_hash_next(m, pos);
2398 2398
2399 (*pos)++; 2399 (*pos)++;
2400 iter->pos = iter->func_pos = *pos; 2400 iter->pos = iter->func_pos = *pos;
2401 2401
2402 if (iter->flags & FTRACE_ITER_PRINTALL) 2402 if (iter->flags & FTRACE_ITER_PRINTALL)
2403 return t_hash_start(m, pos); 2403 return t_hash_start(m, pos);
2404 2404
2405 retry: 2405 retry:
2406 if (iter->idx >= iter->pg->index) { 2406 if (iter->idx >= iter->pg->index) {
2407 if (iter->pg->next) { 2407 if (iter->pg->next) {
2408 iter->pg = iter->pg->next; 2408 iter->pg = iter->pg->next;
2409 iter->idx = 0; 2409 iter->idx = 0;
2410 goto retry; 2410 goto retry;
2411 } 2411 }
2412 } else { 2412 } else {
2413 rec = &iter->pg->records[iter->idx++]; 2413 rec = &iter->pg->records[iter->idx++];
2414 if (((iter->flags & FTRACE_ITER_FILTER) && 2414 if (((iter->flags & FTRACE_ITER_FILTER) &&
2415 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2415 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
2416 2416
2417 ((iter->flags & FTRACE_ITER_NOTRACE) && 2417 ((iter->flags & FTRACE_ITER_NOTRACE) &&
2418 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || 2418 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
2419 2419
2420 ((iter->flags & FTRACE_ITER_ENABLED) && 2420 ((iter->flags & FTRACE_ITER_ENABLED) &&
2421 !(rec->flags & ~FTRACE_FL_MASK))) { 2421 !(rec->flags & ~FTRACE_FL_MASK))) {
2422 2422
2423 rec = NULL; 2423 rec = NULL;
2424 goto retry; 2424 goto retry;
2425 } 2425 }
2426 } 2426 }
2427 2427
2428 if (!rec) 2428 if (!rec)
2429 return t_hash_start(m, pos); 2429 return t_hash_start(m, pos);
2430 2430
2431 iter->func = rec; 2431 iter->func = rec;
2432 2432
2433 return iter; 2433 return iter;
2434 } 2434 }
2435 2435
2436 static void reset_iter_read(struct ftrace_iterator *iter) 2436 static void reset_iter_read(struct ftrace_iterator *iter)
2437 { 2437 {
2438 iter->pos = 0; 2438 iter->pos = 0;
2439 iter->func_pos = 0; 2439 iter->func_pos = 0;
2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); 2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
2441 } 2441 }
2442 2442
2443 static void *t_start(struct seq_file *m, loff_t *pos) 2443 static void *t_start(struct seq_file *m, loff_t *pos)
2444 { 2444 {
2445 struct ftrace_iterator *iter = m->private; 2445 struct ftrace_iterator *iter = m->private;
2446 struct ftrace_ops *ops = iter->ops; 2446 struct ftrace_ops *ops = iter->ops;
2447 void *p = NULL; 2447 void *p = NULL;
2448 loff_t l; 2448 loff_t l;
2449 2449
2450 mutex_lock(&ftrace_lock); 2450 mutex_lock(&ftrace_lock);
2451 2451
2452 if (unlikely(ftrace_disabled)) 2452 if (unlikely(ftrace_disabled))
2453 return NULL; 2453 return NULL;
2454 2454
2455 /* 2455 /*
2456 * If an lseek was done, then reset and start from beginning. 2456 * If an lseek was done, then reset and start from beginning.
2457 */ 2457 */
2458 if (*pos < iter->pos) 2458 if (*pos < iter->pos)
2459 reset_iter_read(iter); 2459 reset_iter_read(iter);
2460 2460
2461 /* 2461 /*
2462 * For set_ftrace_filter reading, if we have the filter 2462 * For set_ftrace_filter reading, if we have the filter
2463 	 * off, we can short-cut and just print out that all 2463 	 * off, we can short-cut and just print out that all
2464 * functions are enabled. 2464 * functions are enabled.
2465 */ 2465 */
2466 if (iter->flags & FTRACE_ITER_FILTER && 2466 if (iter->flags & FTRACE_ITER_FILTER &&
2467 ftrace_hash_empty(ops->filter_hash)) { 2467 ftrace_hash_empty(ops->filter_hash)) {
2468 if (*pos > 0) 2468 if (*pos > 0)
2469 return t_hash_start(m, pos); 2469 return t_hash_start(m, pos);
2470 iter->flags |= FTRACE_ITER_PRINTALL; 2470 iter->flags |= FTRACE_ITER_PRINTALL;
2471 /* reset in case of seek/pread */ 2471 /* reset in case of seek/pread */
2472 iter->flags &= ~FTRACE_ITER_HASH; 2472 iter->flags &= ~FTRACE_ITER_HASH;
2473 return iter; 2473 return iter;
2474 } 2474 }
2475 2475
2476 if (iter->flags & FTRACE_ITER_HASH) 2476 if (iter->flags & FTRACE_ITER_HASH)
2477 return t_hash_start(m, pos); 2477 return t_hash_start(m, pos);
2478 2478
2479 /* 2479 /*
2480 * Unfortunately, we need to restart at ftrace_pages_start 2480 * Unfortunately, we need to restart at ftrace_pages_start
2481 * every time we let go of the ftrace_mutex. This is because 2481 * every time we let go of the ftrace_mutex. This is because
2482 * those pointers can change without the lock. 2482 * those pointers can change without the lock.
2483 */ 2483 */
2484 iter->pg = ftrace_pages_start; 2484 iter->pg = ftrace_pages_start;
2485 iter->idx = 0; 2485 iter->idx = 0;
2486 for (l = 0; l <= *pos; ) { 2486 for (l = 0; l <= *pos; ) {
2487 p = t_next(m, p, &l); 2487 p = t_next(m, p, &l);
2488 if (!p) 2488 if (!p)
2489 break; 2489 break;
2490 } 2490 }
2491 2491
2492 if (!p) 2492 if (!p)
2493 return t_hash_start(m, pos); 2493 return t_hash_start(m, pos);
2494 2494
2495 return iter; 2495 return iter;
2496 } 2496 }
2497 2497
2498 static void t_stop(struct seq_file *m, void *p) 2498 static void t_stop(struct seq_file *m, void *p)
2499 { 2499 {
2500 mutex_unlock(&ftrace_lock); 2500 mutex_unlock(&ftrace_lock);
2501 } 2501 }
2502 2502
2503 static int t_show(struct seq_file *m, void *v) 2503 static int t_show(struct seq_file *m, void *v)
2504 { 2504 {
2505 struct ftrace_iterator *iter = m->private; 2505 struct ftrace_iterator *iter = m->private;
2506 struct dyn_ftrace *rec; 2506 struct dyn_ftrace *rec;
2507 2507
2508 if (iter->flags & FTRACE_ITER_HASH) 2508 if (iter->flags & FTRACE_ITER_HASH)
2509 return t_hash_show(m, iter); 2509 return t_hash_show(m, iter);
2510 2510
2511 if (iter->flags & FTRACE_ITER_PRINTALL) { 2511 if (iter->flags & FTRACE_ITER_PRINTALL) {
2512 seq_printf(m, "#### all functions enabled ####\n"); 2512 seq_printf(m, "#### all functions enabled ####\n");
2513 return 0; 2513 return 0;
2514 } 2514 }
2515 2515
2516 rec = iter->func; 2516 rec = iter->func;
2517 2517
2518 if (!rec) 2518 if (!rec)
2519 return 0; 2519 return 0;
2520 2520
2521 seq_printf(m, "%ps", (void *)rec->ip); 2521 seq_printf(m, "%ps", (void *)rec->ip);
2522 if (iter->flags & FTRACE_ITER_ENABLED) 2522 if (iter->flags & FTRACE_ITER_ENABLED)
2523 seq_printf(m, " (%ld)%s", 2523 seq_printf(m, " (%ld)%s",
2524 rec->flags & ~FTRACE_FL_MASK, 2524 rec->flags & ~FTRACE_FL_MASK,
2525 rec->flags & FTRACE_FL_REGS ? " R" : ""); 2525 rec->flags & FTRACE_FL_REGS ? " R" : "");
2526 seq_printf(m, "\n"); 2526 seq_printf(m, "\n");
2527 2527
2528 return 0; 2528 return 0;
2529 } 2529 }
2530 2530
2531 static const struct seq_operations show_ftrace_seq_ops = { 2531 static const struct seq_operations show_ftrace_seq_ops = {
2532 .start = t_start, 2532 .start = t_start,
2533 .next = t_next, 2533 .next = t_next,
2534 .stop = t_stop, 2534 .stop = t_stop,
2535 .show = t_show, 2535 .show = t_show,
2536 }; 2536 };
2537 2537
2538 static int 2538 static int
2539 ftrace_avail_open(struct inode *inode, struct file *file) 2539 ftrace_avail_open(struct inode *inode, struct file *file)
2540 { 2540 {
2541 struct ftrace_iterator *iter; 2541 struct ftrace_iterator *iter;
2542 2542
2543 if (unlikely(ftrace_disabled)) 2543 if (unlikely(ftrace_disabled))
2544 return -ENODEV; 2544 return -ENODEV;
2545 2545
2546 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 2546 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2547 if (iter) { 2547 if (iter) {
2548 iter->pg = ftrace_pages_start; 2548 iter->pg = ftrace_pages_start;
2549 iter->ops = &global_ops; 2549 iter->ops = &global_ops;
2550 } 2550 }
2551 2551
2552 return iter ? 0 : -ENOMEM; 2552 return iter ? 0 : -ENOMEM;
2553 } 2553 }
2554 2554
2555 static int 2555 static int
2556 ftrace_enabled_open(struct inode *inode, struct file *file) 2556 ftrace_enabled_open(struct inode *inode, struct file *file)
2557 { 2557 {
2558 struct ftrace_iterator *iter; 2558 struct ftrace_iterator *iter;
2559 2559
2560 if (unlikely(ftrace_disabled)) 2560 if (unlikely(ftrace_disabled))
2561 return -ENODEV; 2561 return -ENODEV;
2562 2562
2563 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 2563 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2564 if (iter) { 2564 if (iter) {
2565 iter->pg = ftrace_pages_start; 2565 iter->pg = ftrace_pages_start;
2566 iter->flags = FTRACE_ITER_ENABLED; 2566 iter->flags = FTRACE_ITER_ENABLED;
2567 iter->ops = &global_ops; 2567 iter->ops = &global_ops;
2568 } 2568 }
2569 2569
2570 return iter ? 0 : -ENOMEM; 2570 return iter ? 0 : -ENOMEM;
2571 } 2571 }
2572 2572
2573 static void ftrace_filter_reset(struct ftrace_hash *hash) 2573 static void ftrace_filter_reset(struct ftrace_hash *hash)
2574 { 2574 {
2575 mutex_lock(&ftrace_lock); 2575 mutex_lock(&ftrace_lock);
2576 ftrace_hash_clear(hash); 2576 ftrace_hash_clear(hash);
2577 mutex_unlock(&ftrace_lock); 2577 mutex_unlock(&ftrace_lock);
2578 } 2578 }
2579 2579
2580 /** 2580 /**
2581 * ftrace_regex_open - initialize function tracer filter files 2581 * ftrace_regex_open - initialize function tracer filter files
2582 * @ops: The ftrace_ops that hold the hash filters 2582 * @ops: The ftrace_ops that hold the hash filters
2583 * @flag: The type of filter to process 2583 * @flag: The type of filter to process
2584 * @inode: The inode, usually passed in to your open routine 2584 * @inode: The inode, usually passed in to your open routine
2585 * @file: The file, usually passed in to your open routine 2585 * @file: The file, usually passed in to your open routine
2586 * 2586 *
2587 * ftrace_regex_open() initializes the filter files for the 2587 * ftrace_regex_open() initializes the filter files for the
2588 * @ops. Depending on @flag it may process the filter hash or 2588 * @ops. Depending on @flag it may process the filter hash or
2589 * the notrace hash of @ops. With this called from the open 2589 * the notrace hash of @ops. With this called from the open
2590 * routine, you can use ftrace_filter_write() for the write 2590 * routine, you can use ftrace_filter_write() for the write
2591 * routine if @flag has FTRACE_ITER_FILTER set, or 2591 * routine if @flag has FTRACE_ITER_FILTER set, or
2592 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2592 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2593 * ftrace_regex_lseek() should be used as the lseek routine, and 2593 * ftrace_regex_lseek() should be used as the lseek routine, and
2594 * release must call ftrace_regex_release(). 2594 * release must call ftrace_regex_release().
2595 */ 2595 */
2596 int 2596 int
2597 ftrace_regex_open(struct ftrace_ops *ops, int flag, 2597 ftrace_regex_open(struct ftrace_ops *ops, int flag,
2598 struct inode *inode, struct file *file) 2598 struct inode *inode, struct file *file)
2599 { 2599 {
2600 struct ftrace_iterator *iter; 2600 struct ftrace_iterator *iter;
2601 struct ftrace_hash *hash; 2601 struct ftrace_hash *hash;
2602 int ret = 0; 2602 int ret = 0;
2603 2603
2604 if (unlikely(ftrace_disabled)) 2604 if (unlikely(ftrace_disabled))
2605 return -ENODEV; 2605 return -ENODEV;
2606 2606
2607 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2607 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2608 if (!iter) 2608 if (!iter)
2609 return -ENOMEM; 2609 return -ENOMEM;
2610 2610
2611 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { 2611 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
2612 kfree(iter); 2612 kfree(iter);
2613 return -ENOMEM; 2613 return -ENOMEM;
2614 } 2614 }
2615 2615
2616 if (flag & FTRACE_ITER_NOTRACE) 2616 if (flag & FTRACE_ITER_NOTRACE)
2617 hash = ops->notrace_hash; 2617 hash = ops->notrace_hash;
2618 else 2618 else
2619 hash = ops->filter_hash; 2619 hash = ops->filter_hash;
2620 2620
2621 iter->ops = ops; 2621 iter->ops = ops;
2622 iter->flags = flag; 2622 iter->flags = flag;
2623 2623
2624 if (file->f_mode & FMODE_WRITE) { 2624 if (file->f_mode & FMODE_WRITE) {
2625 mutex_lock(&ftrace_lock); 2625 mutex_lock(&ftrace_lock);
2626 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); 2626 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2627 mutex_unlock(&ftrace_lock); 2627 mutex_unlock(&ftrace_lock);
2628 2628
2629 if (!iter->hash) { 2629 if (!iter->hash) {
2630 trace_parser_put(&iter->parser); 2630 trace_parser_put(&iter->parser);
2631 kfree(iter); 2631 kfree(iter);
2632 return -ENOMEM; 2632 return -ENOMEM;
2633 } 2633 }
2634 } 2634 }
2635 2635
2636 mutex_lock(&ftrace_regex_lock); 2636 mutex_lock(&ftrace_regex_lock);
2637 2637
2638 if ((file->f_mode & FMODE_WRITE) && 2638 if ((file->f_mode & FMODE_WRITE) &&
2639 (file->f_flags & O_TRUNC)) 2639 (file->f_flags & O_TRUNC))
2640 ftrace_filter_reset(iter->hash); 2640 ftrace_filter_reset(iter->hash);
2641 2641
2642 if (file->f_mode & FMODE_READ) { 2642 if (file->f_mode & FMODE_READ) {
2643 iter->pg = ftrace_pages_start; 2643 iter->pg = ftrace_pages_start;
2644 2644
2645 ret = seq_open(file, &show_ftrace_seq_ops); 2645 ret = seq_open(file, &show_ftrace_seq_ops);
2646 if (!ret) { 2646 if (!ret) {
2647 struct seq_file *m = file->private_data; 2647 struct seq_file *m = file->private_data;
2648 m->private = iter; 2648 m->private = iter;
2649 } else { 2649 } else {
2650 /* Failed */ 2650 /* Failed */
2651 free_ftrace_hash(iter->hash); 2651 free_ftrace_hash(iter->hash);
2652 trace_parser_put(&iter->parser); 2652 trace_parser_put(&iter->parser);
2653 kfree(iter); 2653 kfree(iter);
2654 } 2654 }
2655 } else 2655 } else
2656 file->private_data = iter; 2656 file->private_data = iter;
2657 mutex_unlock(&ftrace_regex_lock); 2657 mutex_unlock(&ftrace_regex_lock);
2658 2658
2659 return ret; 2659 return ret;
2660 } 2660 }
2661 2661
2662 static int 2662 static int
2663 ftrace_filter_open(struct inode *inode, struct file *file) 2663 ftrace_filter_open(struct inode *inode, struct file *file)
2664 { 2664 {
2665 return ftrace_regex_open(&global_ops, 2665 return ftrace_regex_open(&global_ops,
2666 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 2666 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2667 inode, file); 2667 inode, file);
2668 } 2668 }
2669 2669
2670 static int 2670 static int
2671 ftrace_notrace_open(struct inode *inode, struct file *file) 2671 ftrace_notrace_open(struct inode *inode, struct file *file)
2672 { 2672 {
2673 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, 2673 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2674 inode, file); 2674 inode, file);
2675 } 2675 }
2676 2676
2677 loff_t 2677 loff_t
2678 ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2678 ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2679 { 2679 {
2680 loff_t ret; 2680 loff_t ret;
2681 2681
2682 if (file->f_mode & FMODE_READ) 2682 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, origin); 2683 ret = seq_lseek(file, offset, whence);
2684 else 2684 else
2685 file->f_pos = ret = 1; 2685 file->f_pos = ret = 1;
2686 2686
2687 return ret; 2687 return ret;
2689 2689
2690 static int ftrace_match(char *str, char *regex, int len, int type) 2690 static int ftrace_match(char *str, char *regex, int len, int type)
2691 { 2691 {
2692 int matched = 0; 2692 int matched = 0;
2693 int slen; 2693 int slen;
2694 2694
2695 switch (type) { 2695 switch (type) {
2696 case MATCH_FULL: 2696 case MATCH_FULL:
2697 if (strcmp(str, regex) == 0) 2697 if (strcmp(str, regex) == 0)
2698 matched = 1; 2698 matched = 1;
2699 break; 2699 break;
2700 case MATCH_FRONT_ONLY: 2700 case MATCH_FRONT_ONLY:
2701 if (strncmp(str, regex, len) == 0) 2701 if (strncmp(str, regex, len) == 0)
2702 matched = 1; 2702 matched = 1;
2703 break; 2703 break;
2704 case MATCH_MIDDLE_ONLY: 2704 case MATCH_MIDDLE_ONLY:
2705 if (strstr(str, regex)) 2705 if (strstr(str, regex))
2706 matched = 1; 2706 matched = 1;
2707 break; 2707 break;
2708 case MATCH_END_ONLY: 2708 case MATCH_END_ONLY:
2709 slen = strlen(str); 2709 slen = strlen(str);
2710 if (slen >= len && memcmp(str + slen - len, regex, len) == 0) 2710 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
2711 matched = 1; 2711 matched = 1;
2712 break; 2712 break;
2713 } 2713 }
2714 2714
2715 return matched; 2715 return matched;
2716 } 2716 }
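The type handled here comes from filter_parse_regex(), which strips the '*' wildcards from the user's glob before the compare. A sketch of the mapping, with illustrative patterns:

/*
 *	"sched_switch"	-> MATCH_FULL		strcmp() on the whole name
 *	"sched_*"	-> MATCH_FRONT_ONLY	strncmp() on the first len bytes
 *	"*_switch"	-> MATCH_END_ONLY	memcmp() on the trailing len bytes
 *	"*sched*"	-> MATCH_MIDDLE_ONLY	strstr() anywhere in the name
 */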
2717 2717
2718 static int 2718 static int
2719 enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) 2719 enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2720 { 2720 {
2721 struct ftrace_func_entry *entry; 2721 struct ftrace_func_entry *entry;
2722 int ret = 0; 2722 int ret = 0;
2723 2723
2724 entry = ftrace_lookup_ip(hash, rec->ip); 2724 entry = ftrace_lookup_ip(hash, rec->ip);
2725 if (not) { 2725 if (not) {
2726 /* Do nothing if it doesn't exist */ 2726 /* Do nothing if it doesn't exist */
2727 if (!entry) 2727 if (!entry)
2728 return 0; 2728 return 0;
2729 2729
2730 free_hash_entry(hash, entry); 2730 free_hash_entry(hash, entry);
2731 } else { 2731 } else {
2732 /* Do nothing if it exists */ 2732 /* Do nothing if it exists */
2733 if (entry) 2733 if (entry)
2734 return 0; 2734 return 0;
2735 2735
2736 ret = add_hash_entry(hash, rec->ip); 2736 ret = add_hash_entry(hash, rec->ip);
2737 } 2737 }
2738 return ret; 2738 return ret;
2739 } 2739 }
2740 2740
2741 static int 2741 static int
2742 ftrace_match_record(struct dyn_ftrace *rec, char *mod, 2742 ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2743 char *regex, int len, int type) 2743 char *regex, int len, int type)
2744 { 2744 {
2745 char str[KSYM_SYMBOL_LEN]; 2745 char str[KSYM_SYMBOL_LEN];
2746 char *modname; 2746 char *modname;
2747 2747
2748 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); 2748 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2749 2749
2750 if (mod) { 2750 if (mod) {
2751 /* module lookup requires matching the module */ 2751 /* module lookup requires matching the module */
2752 if (!modname || strcmp(modname, mod)) 2752 if (!modname || strcmp(modname, mod))
2753 return 0; 2753 return 0;
2754 2754
2755 /* blank search means to match all funcs in the mod */ 2755 /* blank search means to match all funcs in the mod */
2756 if (!len) 2756 if (!len)
2757 return 1; 2757 return 1;
2758 } 2758 }
2759 2759
2760 return ftrace_match(str, regex, len, type); 2760 return ftrace_match(str, regex, len, type);
2761 } 2761 }
2762 2762
2763 static int 2763 static int
2764 match_records(struct ftrace_hash *hash, char *buff, 2764 match_records(struct ftrace_hash *hash, char *buff,
2765 int len, char *mod, int not) 2765 int len, char *mod, int not)
2766 { 2766 {
2767 unsigned search_len = 0; 2767 unsigned search_len = 0;
2768 struct ftrace_page *pg; 2768 struct ftrace_page *pg;
2769 struct dyn_ftrace *rec; 2769 struct dyn_ftrace *rec;
2770 int type = MATCH_FULL; 2770 int type = MATCH_FULL;
2771 char *search = buff; 2771 char *search = buff;
2772 int found = 0; 2772 int found = 0;
2773 int ret; 2773 int ret;
2774 2774
2775 if (len) { 2775 if (len) {
2776 type = filter_parse_regex(buff, len, &search, &not); 2776 type = filter_parse_regex(buff, len, &search, &not);
2777 search_len = strlen(search); 2777 search_len = strlen(search);
2778 } 2778 }
2779 2779
2780 mutex_lock(&ftrace_lock); 2780 mutex_lock(&ftrace_lock);
2781 2781
2782 if (unlikely(ftrace_disabled)) 2782 if (unlikely(ftrace_disabled))
2783 goto out_unlock; 2783 goto out_unlock;
2784 2784
2785 do_for_each_ftrace_rec(pg, rec) { 2785 do_for_each_ftrace_rec(pg, rec) {
2786 if (ftrace_match_record(rec, mod, search, search_len, type)) { 2786 if (ftrace_match_record(rec, mod, search, search_len, type)) {
2787 ret = enter_record(hash, rec, not); 2787 ret = enter_record(hash, rec, not);
2788 if (ret < 0) { 2788 if (ret < 0) {
2789 found = ret; 2789 found = ret;
2790 goto out_unlock; 2790 goto out_unlock;
2791 } 2791 }
2792 found = 1; 2792 found = 1;
2793 } 2793 }
2794 } while_for_each_ftrace_rec(); 2794 } while_for_each_ftrace_rec();
2795 out_unlock: 2795 out_unlock:
2796 mutex_unlock(&ftrace_lock); 2796 mutex_unlock(&ftrace_lock);
2797 2797
2798 return found; 2798 return found;
2799 } 2799 }
2800 2800
2801 static int 2801 static int
2802 ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) 2802 ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
2803 { 2803 {
2804 return match_records(hash, buff, len, NULL, 0); 2804 return match_records(hash, buff, len, NULL, 0);
2805 } 2805 }
2806 2806
2807 static int 2807 static int
2808 ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) 2808 ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
2809 { 2809 {
2810 int not = 0; 2810 int not = 0;
2811 2811
2812 /* blank or '*' mean the same */ 2812 /* blank or '*' mean the same */
2813 if (strcmp(buff, "*") == 0) 2813 if (strcmp(buff, "*") == 0)
2814 buff[0] = 0; 2814 buff[0] = 0;
2815 2815
2816 	/* handle the case of 'don't filter this module' */ 2816 	/* handle the case of 'don't filter this module' */
2817 if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { 2817 if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
2818 buff[0] = 0; 2818 buff[0] = 0;
2819 not = 1; 2819 not = 1;
2820 } 2820 }
2821 2821
2822 return match_records(hash, buff, strlen(buff), mod, not); 2822 return match_records(hash, buff, strlen(buff), mod, not);
2823 } 2823 }
2824 2824
2825 /* 2825 /*
2826 * We register the module command as a template to show others how 2826 * We register the module command as a template to show others how
2827 * to register a command as well. 2827 * to register a command as well.
2828 */ 2828 */
2829 2829
2830 static int 2830 static int
2831 ftrace_mod_callback(struct ftrace_hash *hash, 2831 ftrace_mod_callback(struct ftrace_hash *hash,
2832 char *func, char *cmd, char *param, int enable) 2832 char *func, char *cmd, char *param, int enable)
2833 { 2833 {
2834 char *mod; 2834 char *mod;
2835 int ret = -EINVAL; 2835 int ret = -EINVAL;
2836 2836
2837 /* 2837 /*
2838 * cmd == 'mod' because we only registered this func 2838 * cmd == 'mod' because we only registered this func
2839 * for the 'mod' ftrace_func_command. 2839 * for the 'mod' ftrace_func_command.
2840 * But if you register one func with multiple commands, 2840 * But if you register one func with multiple commands,
2841 * you can tell which command was used by the cmd 2841 * you can tell which command was used by the cmd
2842 * parameter. 2842 * parameter.
2843 */ 2843 */
2844 2844
2845 /* we must have a module name */ 2845 /* we must have a module name */
2846 if (!param) 2846 if (!param)
2847 return ret; 2847 return ret;
2848 2848
2849 mod = strsep(&param, ":"); 2849 mod = strsep(&param, ":");
2850 if (!strlen(mod)) 2850 if (!strlen(mod))
2851 return ret; 2851 return ret;
2852 2852
2853 ret = ftrace_match_module_records(hash, func, mod); 2853 ret = ftrace_match_module_records(hash, func, mod);
2854 if (!ret) 2854 if (!ret)
2855 ret = -EINVAL; 2855 ret = -EINVAL;
2856 if (ret < 0) 2856 if (ret < 0)
2857 return ret; 2857 return ret;
2858 2858
2859 return 0; 2859 return 0;
2860 } 2860 }
2861 2861
2862 static struct ftrace_func_command ftrace_mod_cmd = { 2862 static struct ftrace_func_command ftrace_mod_cmd = {
2863 .name = "mod", 2863 .name = "mod",
2864 .func = ftrace_mod_callback, 2864 .func = ftrace_mod_callback,
2865 }; 2865 };
2866 2866
2867 static int __init ftrace_mod_cmd_init(void) 2867 static int __init ftrace_mod_cmd_init(void)
2868 { 2868 {
2869 return register_ftrace_command(&ftrace_mod_cmd); 2869 return register_ftrace_command(&ftrace_mod_cmd);
2870 } 2870 }
2871 core_initcall(ftrace_mod_cmd_init); 2871 core_initcall(ftrace_mod_cmd_init);
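Following that template, a new command needs only a callback with the same signature plus a registration call. A hypothetical sketch, not part of this commit:

static int noop_cmd_callback(struct ftrace_hash *hash,
			     char *func, char *cmd, char *param, int enable)
{
	/* invoked for writes of the form "<func>:noop[:param]" */
	return 0;
}

static struct ftrace_func_command noop_cmd = {
	.name	= "noop",
	.func	= noop_cmd_callback,
};

static int __init noop_cmd_init(void)
{
	return register_ftrace_command(&noop_cmd);
}
core_initcall(noop_cmd_init);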
2872 2872
2873 static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 2873 static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2874 struct ftrace_ops *op, struct pt_regs *pt_regs) 2874 struct ftrace_ops *op, struct pt_regs *pt_regs)
2875 { 2875 {
2876 struct ftrace_func_probe *entry; 2876 struct ftrace_func_probe *entry;
2877 struct hlist_head *hhd; 2877 struct hlist_head *hhd;
2878 struct hlist_node *n; 2878 struct hlist_node *n;
2879 unsigned long key; 2879 unsigned long key;
2880 2880
2881 key = hash_long(ip, FTRACE_HASH_BITS); 2881 key = hash_long(ip, FTRACE_HASH_BITS);
2882 2882
2883 hhd = &ftrace_func_hash[key]; 2883 hhd = &ftrace_func_hash[key];
2884 2884
2885 if (hlist_empty(hhd)) 2885 if (hlist_empty(hhd))
2886 return; 2886 return;
2887 2887
2888 /* 2888 /*
2889 	 * Disable preemption for these calls to prevent an RCU grace 2889 	 * Disable preemption for these calls to prevent an RCU grace
2890 * period. This syncs the hash iteration and freeing of items 2890 * period. This syncs the hash iteration and freeing of items
2891 * on the hash. rcu_read_lock is too dangerous here. 2891 * on the hash. rcu_read_lock is too dangerous here.
2892 */ 2892 */
2893 preempt_disable_notrace(); 2893 preempt_disable_notrace();
2894 hlist_for_each_entry_rcu(entry, n, hhd, node) { 2894 hlist_for_each_entry_rcu(entry, n, hhd, node) {
2895 if (entry->ip == ip) 2895 if (entry->ip == ip)
2896 entry->ops->func(ip, parent_ip, &entry->data); 2896 entry->ops->func(ip, parent_ip, &entry->data);
2897 } 2897 }
2898 preempt_enable_notrace(); 2898 preempt_enable_notrace();
2899 } 2899 }
2900 2900
2901 static struct ftrace_ops trace_probe_ops __read_mostly = 2901 static struct ftrace_ops trace_probe_ops __read_mostly =
2902 { 2902 {
2903 .func = function_trace_probe_call, 2903 .func = function_trace_probe_call,
2904 }; 2904 };
2905 2905
2906 static int ftrace_probe_registered; 2906 static int ftrace_probe_registered;
2907 2907
2908 static void __enable_ftrace_function_probe(void) 2908 static void __enable_ftrace_function_probe(void)
2909 { 2909 {
2910 int ret; 2910 int ret;
2911 int i; 2911 int i;
2912 2912
2913 if (ftrace_probe_registered) 2913 if (ftrace_probe_registered)
2914 return; 2914 return;
2915 2915
2916 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 2916 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
2917 struct hlist_head *hhd = &ftrace_func_hash[i]; 2917 struct hlist_head *hhd = &ftrace_func_hash[i];
2918 if (hhd->first) 2918 if (hhd->first)
2919 break; 2919 break;
2920 } 2920 }
2921 /* Nothing registered? */ 2921 /* Nothing registered? */
2922 if (i == FTRACE_FUNC_HASHSIZE) 2922 if (i == FTRACE_FUNC_HASHSIZE)
2923 return; 2923 return;
2924 2924
2925 ret = __register_ftrace_function(&trace_probe_ops); 2925 ret = __register_ftrace_function(&trace_probe_ops);
2926 if (!ret) 2926 if (!ret)
2927 ret = ftrace_startup(&trace_probe_ops, 0); 2927 ret = ftrace_startup(&trace_probe_ops, 0);
2928 2928
2929 ftrace_probe_registered = 1; 2929 ftrace_probe_registered = 1;
2930 } 2930 }
2931 2931
2932 static void __disable_ftrace_function_probe(void) 2932 static void __disable_ftrace_function_probe(void)
2933 { 2933 {
2934 int ret; 2934 int ret;
2935 int i; 2935 int i;
2936 2936
2937 if (!ftrace_probe_registered) 2937 if (!ftrace_probe_registered)
2938 return; 2938 return;
2939 2939
2940 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 2940 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
2941 struct hlist_head *hhd = &ftrace_func_hash[i]; 2941 struct hlist_head *hhd = &ftrace_func_hash[i];
2942 if (hhd->first) 2942 if (hhd->first)
2943 return; 2943 return;
2944 } 2944 }
2945 2945
2946 /* no more funcs left */ 2946 /* no more funcs left */
2947 ret = __unregister_ftrace_function(&trace_probe_ops); 2947 ret = __unregister_ftrace_function(&trace_probe_ops);
2948 if (!ret) 2948 if (!ret)
2949 ftrace_shutdown(&trace_probe_ops, 0); 2949 ftrace_shutdown(&trace_probe_ops, 0);
2950 2950
2951 ftrace_probe_registered = 0; 2951 ftrace_probe_registered = 0;
2952 } 2952 }
2953 2953
2954 2954
2955 static void ftrace_free_entry_rcu(struct rcu_head *rhp) 2955 static void ftrace_free_entry_rcu(struct rcu_head *rhp)
2956 { 2956 {
2957 struct ftrace_func_probe *entry = 2957 struct ftrace_func_probe *entry =
2958 container_of(rhp, struct ftrace_func_probe, rcu); 2958 container_of(rhp, struct ftrace_func_probe, rcu);
2959 2959
2960 if (entry->ops->free) 2960 if (entry->ops->free)
2961 entry->ops->free(&entry->data); 2961 entry->ops->free(&entry->data);
2962 kfree(entry); 2962 kfree(entry);
2963 } 2963 }
2964 2964
2965 2965
2966 int 2966 int
2967 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 2967 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2968 void *data) 2968 void *data)
2969 { 2969 {
2970 struct ftrace_func_probe *entry; 2970 struct ftrace_func_probe *entry;
2971 struct ftrace_page *pg; 2971 struct ftrace_page *pg;
2972 struct dyn_ftrace *rec; 2972 struct dyn_ftrace *rec;
2973 int type, len, not; 2973 int type, len, not;
2974 unsigned long key; 2974 unsigned long key;
2975 int count = 0; 2975 int count = 0;
2976 char *search; 2976 char *search;
2977 2977
2978 type = filter_parse_regex(glob, strlen(glob), &search, &not); 2978 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2979 len = strlen(search); 2979 len = strlen(search);
2980 2980
2981 /* we do not support '!' for function probes */ 2981 /* we do not support '!' for function probes */
2982 if (WARN_ON(not)) 2982 if (WARN_ON(not))
2983 return -EINVAL; 2983 return -EINVAL;
2984 2984
2985 mutex_lock(&ftrace_lock); 2985 mutex_lock(&ftrace_lock);
2986 2986
2987 if (unlikely(ftrace_disabled)) 2987 if (unlikely(ftrace_disabled))
2988 goto out_unlock; 2988 goto out_unlock;
2989 2989
2990 do_for_each_ftrace_rec(pg, rec) { 2990 do_for_each_ftrace_rec(pg, rec) {
2991 2991
2992 if (!ftrace_match_record(rec, NULL, search, len, type)) 2992 if (!ftrace_match_record(rec, NULL, search, len, type))
2993 continue; 2993 continue;
2994 2994
2995 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2995 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
2996 if (!entry) { 2996 if (!entry) {
2997 /* If we did not process any, then return error */ 2997 /* If we did not process any, then return error */
2998 if (!count) 2998 if (!count)
2999 count = -ENOMEM; 2999 count = -ENOMEM;
3000 goto out_unlock; 3000 goto out_unlock;
3001 } 3001 }
3002 3002
3003 count++; 3003 count++;
3004 3004
3005 entry->data = data; 3005 entry->data = data;
3006 3006
3007 /* 3007 /*
3008 * The caller might want to do something special 3008 * The caller might want to do something special
3009 * for each function we find. We call the callback 3009 * for each function we find. We call the callback
3010 * to give the caller an opportunity to do so. 3010 * to give the caller an opportunity to do so.
3011 */ 3011 */
3012 if (ops->callback) { 3012 if (ops->callback) {
3013 if (ops->callback(rec->ip, &entry->data) < 0) { 3013 if (ops->callback(rec->ip, &entry->data) < 0) {
3014 /* caller does not like this func */ 3014 /* caller does not like this func */
3015 kfree(entry); 3015 kfree(entry);
3016 continue; 3016 continue;
3017 } 3017 }
3018 } 3018 }
3019 3019
3020 entry->ops = ops; 3020 entry->ops = ops;
3021 entry->ip = rec->ip; 3021 entry->ip = rec->ip;
3022 3022
3023 key = hash_long(entry->ip, FTRACE_HASH_BITS); 3023 key = hash_long(entry->ip, FTRACE_HASH_BITS);
3024 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); 3024 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3025 3025
3026 } while_for_each_ftrace_rec(); 3026 } while_for_each_ftrace_rec();
3027 __enable_ftrace_function_probe(); 3027 __enable_ftrace_function_probe();
3028 3028
3029 out_unlock: 3029 out_unlock:
3030 mutex_unlock(&ftrace_lock); 3030 mutex_unlock(&ftrace_lock);
3031 3031
3032 return count; 3032 return count;
3033 } 3033 }
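For reference, a hedged sketch of what a caller might register; the hook signatures follow the calls made in this file (ops->func from function_trace_probe_call(), ops->callback in this function, ops->free and ops->print in the helpers above), and the names are hypothetical:

static void my_probe_func(unsigned long ip, unsigned long parent_ip,
			  void **data)
{
	/* runs each time a matched function is hit */
}

static struct ftrace_probe_ops my_probe_ops = {
	.func	= my_probe_func,
	/* .callback, .free and .print are optional hooks */
};

static int __init my_probe_init(void)
{
	/* attach the probe to every function matching the glob */
	int count = register_ftrace_function_probe("vfs_*", &my_probe_ops, NULL);

	return count < 0 ? count : 0;
}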
3034 3034
3035 enum { 3035 enum {
3036 PROBE_TEST_FUNC = 1, 3036 PROBE_TEST_FUNC = 1,
3037 PROBE_TEST_DATA = 2 3037 PROBE_TEST_DATA = 2
3038 }; 3038 };
3039 3039
3040 static void 3040 static void
3041 __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3041 __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3042 void *data, int flags) 3042 void *data, int flags)
3043 { 3043 {
3044 struct ftrace_func_probe *entry; 3044 struct ftrace_func_probe *entry;
3045 struct hlist_node *n, *tmp; 3045 struct hlist_node *n, *tmp;
3046 char str[KSYM_SYMBOL_LEN]; 3046 char str[KSYM_SYMBOL_LEN];
3047 int type = MATCH_FULL; 3047 int type = MATCH_FULL;
3048 int i, len = 0; 3048 int i, len = 0;
3049 char *search; 3049 char *search;
3050 3050
3051 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) 3051 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
3052 glob = NULL; 3052 glob = NULL;
3053 else if (glob) { 3053 else if (glob) {
3054 int not; 3054 int not;
3055 3055
3056 type = filter_parse_regex(glob, strlen(glob), &search, &not); 3056 type = filter_parse_regex(glob, strlen(glob), &search, &not);
3057 len = strlen(search); 3057 len = strlen(search);
3058 3058
3059 /* we do not support '!' for function probes */ 3059 /* we do not support '!' for function probes */
3060 if (WARN_ON(not)) 3060 if (WARN_ON(not))
3061 return; 3061 return;
3062 } 3062 }
3063 3063
3064 mutex_lock(&ftrace_lock); 3064 mutex_lock(&ftrace_lock);
3065 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3065 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3066 struct hlist_head *hhd = &ftrace_func_hash[i]; 3066 struct hlist_head *hhd = &ftrace_func_hash[i];
3067 3067
3068 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { 3068 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
3069 3069
3070 /* break up if statements for readability */ 3070 /* break up if statements for readability */
3071 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) 3071 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
3072 continue; 3072 continue;
3073 3073
3074 if ((flags & PROBE_TEST_DATA) && entry->data != data) 3074 if ((flags & PROBE_TEST_DATA) && entry->data != data)
3075 continue; 3075 continue;
3076 3076
3077 /* do this last, since it is the most expensive */ 3077 /* do this last, since it is the most expensive */
3078 if (glob) { 3078 if (glob) {
3079 kallsyms_lookup(entry->ip, NULL, NULL, 3079 kallsyms_lookup(entry->ip, NULL, NULL,
3080 NULL, str); 3080 NULL, str);
3081 if (!ftrace_match(str, glob, len, type)) 3081 if (!ftrace_match(str, glob, len, type))
3082 continue; 3082 continue;
3083 } 3083 }
3084 3084
3085 hlist_del(&entry->node); 3085 hlist_del(&entry->node);
3086 call_rcu(&entry->rcu, ftrace_free_entry_rcu); 3086 call_rcu(&entry->rcu, ftrace_free_entry_rcu);
3087 } 3087 }
3088 } 3088 }
3089 __disable_ftrace_function_probe(); 3089 __disable_ftrace_function_probe();
3090 mutex_unlock(&ftrace_lock); 3090 mutex_unlock(&ftrace_lock);
3091 } 3091 }
3092 3092
3093 void 3093 void
3094 unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3094 unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3095 void *data) 3095 void *data)
3096 { 3096 {
3097 __unregister_ftrace_function_probe(glob, ops, data, 3097 __unregister_ftrace_function_probe(glob, ops, data,
3098 PROBE_TEST_FUNC | PROBE_TEST_DATA); 3098 PROBE_TEST_FUNC | PROBE_TEST_DATA);
3099 } 3099 }
3100 3100
3101 void 3101 void
3102 unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) 3102 unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
3103 { 3103 {
3104 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); 3104 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
3105 } 3105 }
3106 3106
3107 void unregister_ftrace_function_probe_all(char *glob) 3107 void unregister_ftrace_function_probe_all(char *glob)
3108 { 3108 {
3109 __unregister_ftrace_function_probe(glob, NULL, NULL, 0); 3109 __unregister_ftrace_function_probe(glob, NULL, NULL, 0);
3110 } 3110 }
3111 3111
3112 static LIST_HEAD(ftrace_commands); 3112 static LIST_HEAD(ftrace_commands);
3113 static DEFINE_MUTEX(ftrace_cmd_mutex); 3113 static DEFINE_MUTEX(ftrace_cmd_mutex);
3114 3114
3115 int register_ftrace_command(struct ftrace_func_command *cmd) 3115 int register_ftrace_command(struct ftrace_func_command *cmd)
3116 { 3116 {
3117 struct ftrace_func_command *p; 3117 struct ftrace_func_command *p;
3118 int ret = 0; 3118 int ret = 0;
3119 3119
3120 mutex_lock(&ftrace_cmd_mutex); 3120 mutex_lock(&ftrace_cmd_mutex);
3121 list_for_each_entry(p, &ftrace_commands, list) { 3121 list_for_each_entry(p, &ftrace_commands, list) {
3122 if (strcmp(cmd->name, p->name) == 0) { 3122 if (strcmp(cmd->name, p->name) == 0) {
3123 ret = -EBUSY; 3123 ret = -EBUSY;
3124 goto out_unlock; 3124 goto out_unlock;
3125 } 3125 }
3126 } 3126 }
3127 list_add(&cmd->list, &ftrace_commands); 3127 list_add(&cmd->list, &ftrace_commands);
3128 out_unlock: 3128 out_unlock:
3129 mutex_unlock(&ftrace_cmd_mutex); 3129 mutex_unlock(&ftrace_cmd_mutex);
3130 3130
3131 return ret; 3131 return ret;
3132 } 3132 }
3133 3133
3134 int unregister_ftrace_command(struct ftrace_func_command *cmd) 3134 int unregister_ftrace_command(struct ftrace_func_command *cmd)
3135 { 3135 {
3136 struct ftrace_func_command *p, *n; 3136 struct ftrace_func_command *p, *n;
3137 int ret = -ENODEV; 3137 int ret = -ENODEV;
3138 3138
3139 mutex_lock(&ftrace_cmd_mutex); 3139 mutex_lock(&ftrace_cmd_mutex);
3140 list_for_each_entry_safe(p, n, &ftrace_commands, list) { 3140 list_for_each_entry_safe(p, n, &ftrace_commands, list) {
3141 if (strcmp(cmd->name, p->name) == 0) { 3141 if (strcmp(cmd->name, p->name) == 0) {
3142 ret = 0; 3142 ret = 0;
3143 list_del_init(&p->list); 3143 list_del_init(&p->list);
3144 goto out_unlock; 3144 goto out_unlock;
3145 } 3145 }
3146 } 3146 }
3147 out_unlock: 3147 out_unlock:
3148 mutex_unlock(&ftrace_cmd_mutex); 3148 mutex_unlock(&ftrace_cmd_mutex);
3149 3149
3150 return ret; 3150 return ret;
3151 } 3151 }
3152 3152
3153 static int ftrace_process_regex(struct ftrace_hash *hash, 3153 static int ftrace_process_regex(struct ftrace_hash *hash,
3154 char *buff, int len, int enable) 3154 char *buff, int len, int enable)
3155 { 3155 {
3156 char *func, *command, *next = buff; 3156 char *func, *command, *next = buff;
3157 struct ftrace_func_command *p; 3157 struct ftrace_func_command *p;
3158 int ret = -EINVAL; 3158 int ret = -EINVAL;
3159 3159
3160 func = strsep(&next, ":"); 3160 func = strsep(&next, ":");
3161 3161
3162 if (!next) { 3162 if (!next) {
3163 ret = ftrace_match_records(hash, func, len); 3163 ret = ftrace_match_records(hash, func, len);
3164 if (!ret) 3164 if (!ret)
3165 ret = -EINVAL; 3165 ret = -EINVAL;
3166 if (ret < 0) 3166 if (ret < 0)
3167 return ret; 3167 return ret;
3168 return 0; 3168 return 0;
3169 } 3169 }
3170 3170
3171 /* command found */ 3171 /* command found */
3172 3172
3173 command = strsep(&next, ":"); 3173 command = strsep(&next, ":");
3174 3174
3175 mutex_lock(&ftrace_cmd_mutex); 3175 mutex_lock(&ftrace_cmd_mutex);
3176 list_for_each_entry(p, &ftrace_commands, list) { 3176 list_for_each_entry(p, &ftrace_commands, list) {
3177 if (strcmp(p->name, command) == 0) { 3177 if (strcmp(p->name, command) == 0) {
3178 ret = p->func(hash, func, command, next, enable); 3178 ret = p->func(hash, func, command, next, enable);
3179 goto out_unlock; 3179 goto out_unlock;
3180 } 3180 }
3181 } 3181 }
3182 out_unlock: 3182 out_unlock:
3183 mutex_unlock(&ftrace_cmd_mutex); 3183 mutex_unlock(&ftrace_cmd_mutex);
3184 3184
3185 return ret; 3185 return ret;
3186 } 3186 }
3187 3187
3188 static ssize_t 3188 static ssize_t
3189 ftrace_regex_write(struct file *file, const char __user *ubuf, 3189 ftrace_regex_write(struct file *file, const char __user *ubuf,
3190 size_t cnt, loff_t *ppos, int enable) 3190 size_t cnt, loff_t *ppos, int enable)
3191 { 3191 {
3192 struct ftrace_iterator *iter; 3192 struct ftrace_iterator *iter;
3193 struct trace_parser *parser; 3193 struct trace_parser *parser;
3194 ssize_t ret, read; 3194 ssize_t ret, read;
3195 3195
3196 if (!cnt) 3196 if (!cnt)
3197 return 0; 3197 return 0;
3198 3198
3199 mutex_lock(&ftrace_regex_lock); 3199 mutex_lock(&ftrace_regex_lock);
3200 3200
3201 ret = -ENODEV; 3201 ret = -ENODEV;
3202 if (unlikely(ftrace_disabled)) 3202 if (unlikely(ftrace_disabled))
3203 goto out_unlock; 3203 goto out_unlock;
3204 3204
3205 if (file->f_mode & FMODE_READ) { 3205 if (file->f_mode & FMODE_READ) {
3206 struct seq_file *m = file->private_data; 3206 struct seq_file *m = file->private_data;
3207 iter = m->private; 3207 iter = m->private;
3208 } else 3208 } else
3209 iter = file->private_data; 3209 iter = file->private_data;
3210 3210
3211 parser = &iter->parser; 3211 parser = &iter->parser;
3212 read = trace_get_user(parser, ubuf, cnt, ppos); 3212 read = trace_get_user(parser, ubuf, cnt, ppos);
3213 3213
3214 if (read >= 0 && trace_parser_loaded(parser) && 3214 if (read >= 0 && trace_parser_loaded(parser) &&
3215 !trace_parser_cont(parser)) { 3215 !trace_parser_cont(parser)) {
3216 ret = ftrace_process_regex(iter->hash, parser->buffer, 3216 ret = ftrace_process_regex(iter->hash, parser->buffer,
3217 parser->idx, enable); 3217 parser->idx, enable);
3218 trace_parser_clear(parser); 3218 trace_parser_clear(parser);
3219 if (ret) 3219 if (ret)
3220 goto out_unlock; 3220 goto out_unlock;
3221 } 3221 }
3222 3222
3223 ret = read; 3223 ret = read;
3224 out_unlock: 3224 out_unlock:
3225 mutex_unlock(&ftrace_regex_lock); 3225 mutex_unlock(&ftrace_regex_lock);
3226 3226
3227 return ret; 3227 return ret;
3228 } 3228 }
3229 3229
3230 ssize_t 3230 ssize_t
3231 ftrace_filter_write(struct file *file, const char __user *ubuf, 3231 ftrace_filter_write(struct file *file, const char __user *ubuf,
3232 size_t cnt, loff_t *ppos) 3232 size_t cnt, loff_t *ppos)
3233 { 3233 {
3234 return ftrace_regex_write(file, ubuf, cnt, ppos, 1); 3234 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
3235 } 3235 }
3236 3236
3237 ssize_t 3237 ssize_t
3238 ftrace_notrace_write(struct file *file, const char __user *ubuf, 3238 ftrace_notrace_write(struct file *file, const char __user *ubuf,
3239 size_t cnt, loff_t *ppos) 3239 size_t cnt, loff_t *ppos)
3240 { 3240 {
3241 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 3241 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
3242 } 3242 }
3243 3243
3244 static int 3244 static int
3245 ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) 3245 ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3246 { 3246 {
3247 struct ftrace_func_entry *entry; 3247 struct ftrace_func_entry *entry;
3248 3248
3249 if (!ftrace_location(ip)) 3249 if (!ftrace_location(ip))
3250 return -EINVAL; 3250 return -EINVAL;
3251 3251
3252 if (remove) { 3252 if (remove) {
3253 entry = ftrace_lookup_ip(hash, ip); 3253 entry = ftrace_lookup_ip(hash, ip);
3254 if (!entry) 3254 if (!entry)
3255 return -ENOENT; 3255 return -ENOENT;
3256 free_hash_entry(hash, entry); 3256 free_hash_entry(hash, entry);
3257 return 0; 3257 return 0;
3258 } 3258 }
3259 3259
3260 return add_hash_entry(hash, ip); 3260 return add_hash_entry(hash, ip);
3261 } 3261 }
3262 3262
3263 static int 3263 static int
3264 ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, 3264 ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3265 unsigned long ip, int remove, int reset, int enable) 3265 unsigned long ip, int remove, int reset, int enable)
3266 { 3266 {
3267 struct ftrace_hash **orig_hash; 3267 struct ftrace_hash **orig_hash;
3268 struct ftrace_hash *hash; 3268 struct ftrace_hash *hash;
3269 int ret; 3269 int ret;
3270 3270
3271 	/* All global ops use the global ops filters */ 3271 	/* All global ops use the global ops filters */
3272 if (ops->flags & FTRACE_OPS_FL_GLOBAL) 3272 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
3273 ops = &global_ops; 3273 ops = &global_ops;
3274 3274
3275 if (unlikely(ftrace_disabled)) 3275 if (unlikely(ftrace_disabled))
3276 return -ENODEV; 3276 return -ENODEV;
3277 3277
3278 if (enable) 3278 if (enable)
3279 orig_hash = &ops->filter_hash; 3279 orig_hash = &ops->filter_hash;
3280 else 3280 else
3281 orig_hash = &ops->notrace_hash; 3281 orig_hash = &ops->notrace_hash;
3282 3282
3283 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3283 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3284 if (!hash) 3284 if (!hash)
3285 return -ENOMEM; 3285 return -ENOMEM;
3286 3286
3287 mutex_lock(&ftrace_regex_lock); 3287 mutex_lock(&ftrace_regex_lock);
3288 if (reset) 3288 if (reset)
3289 ftrace_filter_reset(hash); 3289 ftrace_filter_reset(hash);
3290 if (buf && !ftrace_match_records(hash, buf, len)) { 3290 if (buf && !ftrace_match_records(hash, buf, len)) {
3291 ret = -EINVAL; 3291 ret = -EINVAL;
3292 goto out_regex_unlock; 3292 goto out_regex_unlock;
3293 } 3293 }
3294 if (ip) { 3294 if (ip) {
3295 ret = ftrace_match_addr(hash, ip, remove); 3295 ret = ftrace_match_addr(hash, ip, remove);
3296 if (ret < 0) 3296 if (ret < 0)
3297 goto out_regex_unlock; 3297 goto out_regex_unlock;
3298 } 3298 }
3299 3299
3300 mutex_lock(&ftrace_lock); 3300 mutex_lock(&ftrace_lock);
3301 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3301 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3302 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3302 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
3303 && ftrace_enabled) 3303 && ftrace_enabled)
3304 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3304 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3305 3305
3306 mutex_unlock(&ftrace_lock); 3306 mutex_unlock(&ftrace_lock);
3307 3307
3308 out_regex_unlock: 3308 out_regex_unlock:
3309 mutex_unlock(&ftrace_regex_lock); 3309 mutex_unlock(&ftrace_regex_lock);
3310 3310
3311 free_ftrace_hash(hash); 3311 free_ftrace_hash(hash);
3312 return ret; 3312 return ret;
3313 } 3313 }
3314 3314
3315 static int 3315 static int
3316 ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, 3316 ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
3317 int reset, int enable) 3317 int reset, int enable)
3318 { 3318 {
3319 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); 3319 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
3320 } 3320 }
3321 3321
3322 /** 3322 /**
3323 * ftrace_set_filter_ip - set a function to filter on in ftrace by address 3323 * ftrace_set_filter_ip - set a function to filter on in ftrace by address
3324 * @ops - the ops to set the filter with 3324 * @ops - the ops to set the filter with
3325 * @ip - the address to add to or remove from the filter. 3325 * @ip - the address to add to or remove from the filter.
3326 * @remove - non zero to remove the ip from the filter 3326 * @remove - non zero to remove the ip from the filter
3327 * @reset - non zero to reset all filters before applying this filter. 3327 * @reset - non zero to reset all filters before applying this filter.
3328 * 3328 *
3329 * Filters denote which functions should be enabled when tracing is enabled 3329 * Filters denote which functions should be enabled when tracing is enabled
3330 * If @ip is zero, it fails to update the filter. 3330 * If @ip is zero, it fails to update the filter.
3331 */ 3331 */
3332 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, 3332 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
3333 int remove, int reset) 3333 int remove, int reset)
3334 { 3334 {
3335 return ftrace_set_addr(ops, ip, remove, reset, 1); 3335 return ftrace_set_addr(ops, ip, remove, reset, 1);
3336 } 3336 }
3337 EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); 3337 EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
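A hypothetical usage sketch: my_ops is an illustrative ftrace_ops defined elsewhere, and kallsyms_lookup_name() is just one way a caller could obtain the address:

static int filter_one_function(void)
{
	unsigned long ip = kallsyms_lookup_name("do_fork");

	if (!ip)
		return -ENODEV;
	/* reset any existing filter, then trace only this address */
	return ftrace_set_filter_ip(&my_ops, ip, 0, 1);
}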
3338 3338
3339 static int 3339 static int
3340 ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, 3340 ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3341 int reset, int enable) 3341 int reset, int enable)
3342 { 3342 {
3343 return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); 3343 return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
3344 } 3344 }
3345 3345
3346 /** 3346 /**
3347 * ftrace_set_filter - set a function to filter on in ftrace 3347 * ftrace_set_filter - set a function to filter on in ftrace
3348 * @ops - the ops to set the filter with 3348 * @ops - the ops to set the filter with
3349 * @buf - the string that holds the function filter text. 3349 * @buf - the string that holds the function filter text.
3350 * @len - the length of the string. 3350 * @len - the length of the string.
3351 * @reset - non zero to reset all filters before applying this filter. 3351 * @reset - non zero to reset all filters before applying this filter.
3352 * 3352 *
3353 * Filters denote which functions should be enabled when tracing is enabled. 3353 * Filters denote which functions should be enabled when tracing is enabled.
3354 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 3354 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
3355 */ 3355 */
3356 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 3356 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3357 int len, int reset) 3357 int len, int reset)
3358 { 3358 {
3359 return ftrace_set_regex(ops, buf, len, reset, 1); 3359 return ftrace_set_regex(ops, buf, len, reset, 1);
3360 } 3360 }
3361 EXPORT_SYMBOL_GPL(ftrace_set_filter); 3361 EXPORT_SYMBOL_GPL(ftrace_set_filter);
3362 3362
3363 /** 3363 /**
3364 * ftrace_set_notrace - set a function to not trace in ftrace 3364 * ftrace_set_notrace - set a function to not trace in ftrace
3365 * @ops - the ops to set the notrace filter with 3365 * @ops - the ops to set the notrace filter with
3366 * @buf - the string that holds the function notrace text. 3366 * @buf - the string that holds the function notrace text.
3367 * @len - the length of the string. 3367 * @len - the length of the string.
3368 * @reset - non zero to reset all filters before applying this filter. 3368 * @reset - non zero to reset all filters before applying this filter.
3369 * 3369 *
3370 * Notrace Filters denote which functions should not be enabled when tracing 3370 * Notrace Filters denote which functions should not be enabled when tracing
3371 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 3371 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
3372 * for tracing. 3372 * for tracing.
3373 */ 3373 */
3374 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 3374 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3375 int len, int reset) 3375 int len, int reset)
3376 { 3376 {
3377 return ftrace_set_regex(ops, buf, len, reset, 0); 3377 return ftrace_set_regex(ops, buf, len, reset, 0);
3378 } 3378 }
3379 EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3379 EXPORT_SYMBOL_GPL(ftrace_set_notrace);
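A short sketch of the two calls together, again with an illustrative my_ops: trace every vfs_* function except vfs_write. Note that both take the buffer length explicitly:

static void setup_vfs_tracing(void)
{
	/* reset, then enable everything matching the glob... */
	ftrace_set_filter(&my_ops, "vfs_*", strlen("vfs_*"), 1);
	/* ...but never trace vfs_write */
	ftrace_set_notrace(&my_ops, "vfs_write", strlen("vfs_write"), 0);
}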
3380 /** 3380 /**
3381 * ftrace_set_global_filter - set a function to filter on with global tracers 3381 * ftrace_set_global_filter - set a function to filter on with global tracers
3382 * @buf - the string that holds the function filter text. 3382 * @buf - the string that holds the function filter text.
3383 * @len - the length of the string. 3383 * @len - the length of the string.
3384 * @reset - non zero to reset all filters before applying this filter. 3384 * @reset - non zero to reset all filters before applying this filter.
3385 * 3385 *
3386 * Filters denote which functions should be enabled when tracing is 3386 * Filters denote which functions should be enabled when tracing is
3387 * enabled with the global set of ftrace_ops. If @buf is NULL and reset 3387 * enabled with the global set of ftrace_ops. If @buf is NULL and reset
3388 * is set, all functions will be enabled for tracing. 3388 * is set, all functions will be enabled for tracing.
3389 */ 3389 */
3390 void ftrace_set_global_filter(unsigned char *buf, int len, int reset) 3390 void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
3391 { 3391 {
3392 ftrace_set_regex(&global_ops, buf, len, reset, 1); 3392 ftrace_set_regex(&global_ops, buf, len, reset, 1);
3393 } 3393 }
3394 EXPORT_SYMBOL_GPL(ftrace_set_global_filter); 3394 EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
3395 3395
3396 /** 3396 /**
3397 * ftrace_set_global_notrace - set a function to not trace with global tracers 3397 * ftrace_set_global_notrace - set a function to not trace with global tracers
3398 * @buf - the string that holds the function notrace text. 3398 * @buf - the string that holds the function notrace text.
3399 * @len - the length of the string. 3399 * @len - the length of the string.
3400 * @reset - non zero to reset all filters before applying this filter. 3400 * @reset - non zero to reset all filters before applying this filter.
3401 * 3401 *
3402 * Notrace Filters denote which functions should not be enabled 3402 * Notrace Filters denote which functions should not be enabled
3403 * when tracing is enabled with the global set of ftrace_ops. 3403 * when tracing is enabled with the global set of ftrace_ops.
3404 * If @buf is NULL and reset is set, all functions will be 3404 * If @buf is NULL and reset is set, all functions will be
3405 * enabled for tracing. 3405 * enabled for tracing.
3406 */ 3406 */
3407 void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) 3407 void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
3408 { 3408 {
3409 ftrace_set_regex(&global_ops, buf, len, reset, 0); 3409 ftrace_set_regex(&global_ops, buf, len, reset, 0);
3410 } 3410 }
3411 EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); 3411 EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
3412 3412
3413 /* 3413 /*
3414 * command line interface to allow users to set filters on boot up. 3414 * command line interface to allow users to set filters on boot up.
3415 */ 3415 */
3416 #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE 3416 #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
3417 static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 3417 static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3418 static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; 3418 static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3419 3419
3420 static int __init set_ftrace_notrace(char *str) 3420 static int __init set_ftrace_notrace(char *str)
3421 { 3421 {
3422 	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3422 	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3423 return 1; 3423 return 1;
3424 } 3424 }
3425 __setup("ftrace_notrace=", set_ftrace_notrace); 3425 __setup("ftrace_notrace=", set_ftrace_notrace);
3426 3426
3427 static int __init set_ftrace_filter(char *str) 3427 static int __init set_ftrace_filter(char *str)
3428 { 3428 {
3429 	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3429 	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3430 return 1; 3430 return 1;
3431 } 3431 }
3432 __setup("ftrace_filter=", set_ftrace_filter); 3432 __setup("ftrace_filter=", set_ftrace_filter);
3433 3433
3434 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 3434 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
3435 static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 3435 static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3436 static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 3436 static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
3437 3437
3438 static int __init set_graph_function(char *str) 3438 static int __init set_graph_function(char *str)
3439 { 3439 {
3440 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 3440 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
3441 return 1; 3441 return 1;
3442 } 3442 }
3443 __setup("ftrace_graph_filter=", set_graph_function); 3443 __setup("ftrace_graph_filter=", set_graph_function);
3444 3444
3445 static void __init set_ftrace_early_graph(char *buf) 3445 static void __init set_ftrace_early_graph(char *buf)
3446 { 3446 {
3447 int ret; 3447 int ret;
3448 char *func; 3448 char *func;
3449 3449
3450 while (buf) { 3450 while (buf) {
3451 func = strsep(&buf, ","); 3451 func = strsep(&buf, ",");
3452 /* we allow only one expression at a time */ 3452 /* we allow only one expression at a time */
3453 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3453 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
3454 func); 3454 func);
3455 if (ret) 3455 if (ret)
3456 printk(KERN_DEBUG "ftrace: function %s not " 3456 printk(KERN_DEBUG "ftrace: function %s not "
3457 "traceable\n", func); 3457 "traceable\n", func);
3458 } 3458 }
3459 } 3459 }
3460 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3460 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3461 3461
3462 void __init 3462 void __init
3463 ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) 3463 ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3464 { 3464 {
3465 char *func; 3465 char *func;
3466 3466
3467 while (buf) { 3467 while (buf) {
3468 func = strsep(&buf, ","); 3468 func = strsep(&buf, ",");
3469 ftrace_set_regex(ops, func, strlen(func), 0, enable); 3469 ftrace_set_regex(ops, func, strlen(func), 0, enable);
3470 } 3470 }
3471 } 3471 }
3472 3472
3473 static void __init set_ftrace_early_filters(void) 3473 static void __init set_ftrace_early_filters(void)
3474 { 3474 {
3475 if (ftrace_filter_buf[0]) 3475 if (ftrace_filter_buf[0])
3476 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); 3476 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
3477 if (ftrace_notrace_buf[0]) 3477 if (ftrace_notrace_buf[0])
3478 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); 3478 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3479 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 3479 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
3480 if (ftrace_graph_buf[0]) 3480 if (ftrace_graph_buf[0])
3481 set_ftrace_early_graph(ftrace_graph_buf); 3481 set_ftrace_early_graph(ftrace_graph_buf);
3482 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3482 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3483 } 3483 }
3484 3484
3485 int ftrace_regex_release(struct inode *inode, struct file *file) 3485 int ftrace_regex_release(struct inode *inode, struct file *file)
3486 { 3486 {
3487 struct seq_file *m = (struct seq_file *)file->private_data; 3487 struct seq_file *m = (struct seq_file *)file->private_data;
3488 struct ftrace_iterator *iter; 3488 struct ftrace_iterator *iter;
3489 struct ftrace_hash **orig_hash; 3489 struct ftrace_hash **orig_hash;
3490 struct trace_parser *parser; 3490 struct trace_parser *parser;
3491 int filter_hash; 3491 int filter_hash;
3492 int ret; 3492 int ret;
3493 3493
3494 mutex_lock(&ftrace_regex_lock); 3494 mutex_lock(&ftrace_regex_lock);
3495 if (file->f_mode & FMODE_READ) { 3495 if (file->f_mode & FMODE_READ) {
3496 iter = m->private; 3496 iter = m->private;
3497 3497
3498 seq_release(inode, file); 3498 seq_release(inode, file);
3499 } else 3499 } else
3500 iter = file->private_data; 3500 iter = file->private_data;
3501 3501
3502 parser = &iter->parser; 3502 parser = &iter->parser;
3503 if (trace_parser_loaded(parser)) { 3503 if (trace_parser_loaded(parser)) {
3504 parser->buffer[parser->idx] = 0; 3504 parser->buffer[parser->idx] = 0;
3505 ftrace_match_records(iter->hash, parser->buffer, parser->idx); 3505 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
3506 } 3506 }
3507 3507
3508 trace_parser_put(parser); 3508 trace_parser_put(parser);
3509 3509
3510 if (file->f_mode & FMODE_WRITE) { 3510 if (file->f_mode & FMODE_WRITE) {
3511 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); 3511 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3512 3512
3513 if (filter_hash) 3513 if (filter_hash)
3514 orig_hash = &iter->ops->filter_hash; 3514 orig_hash = &iter->ops->filter_hash;
3515 else 3515 else
3516 orig_hash = &iter->ops->notrace_hash; 3516 orig_hash = &iter->ops->notrace_hash;
3517 3517
3518 mutex_lock(&ftrace_lock); 3518 mutex_lock(&ftrace_lock);
3519 ret = ftrace_hash_move(iter->ops, filter_hash, 3519 ret = ftrace_hash_move(iter->ops, filter_hash,
3520 orig_hash, iter->hash); 3520 orig_hash, iter->hash);
3521 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3521 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3522 && ftrace_enabled) 3522 && ftrace_enabled)
3523 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3523 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3524 3524
3525 mutex_unlock(&ftrace_lock); 3525 mutex_unlock(&ftrace_lock);
3526 } 3526 }
3527 free_ftrace_hash(iter->hash); 3527 free_ftrace_hash(iter->hash);
3528 kfree(iter); 3528 kfree(iter);
3529 3529
3530 mutex_unlock(&ftrace_regex_lock); 3530 mutex_unlock(&ftrace_regex_lock);
3531 return 0; 3531 return 0;
3532 } 3532 }
3533 3533
3534 static const struct file_operations ftrace_avail_fops = { 3534 static const struct file_operations ftrace_avail_fops = {
3535 .open = ftrace_avail_open, 3535 .open = ftrace_avail_open,
3536 .read = seq_read, 3536 .read = seq_read,
3537 .llseek = seq_lseek, 3537 .llseek = seq_lseek,
3538 .release = seq_release_private, 3538 .release = seq_release_private,
3539 }; 3539 };
3540 3540
3541 static const struct file_operations ftrace_enabled_fops = { 3541 static const struct file_operations ftrace_enabled_fops = {
3542 .open = ftrace_enabled_open, 3542 .open = ftrace_enabled_open,
3543 .read = seq_read, 3543 .read = seq_read,
3544 .llseek = seq_lseek, 3544 .llseek = seq_lseek,
3545 .release = seq_release_private, 3545 .release = seq_release_private,
3546 }; 3546 };
3547 3547
3548 static const struct file_operations ftrace_filter_fops = { 3548 static const struct file_operations ftrace_filter_fops = {
3549 .open = ftrace_filter_open, 3549 .open = ftrace_filter_open,
3550 .read = seq_read, 3550 .read = seq_read,
3551 .write = ftrace_filter_write, 3551 .write = ftrace_filter_write,
3552 .llseek = ftrace_regex_lseek, 3552 .llseek = ftrace_regex_lseek,
3553 .release = ftrace_regex_release, 3553 .release = ftrace_regex_release,
3554 }; 3554 };
3555 3555
3556 static const struct file_operations ftrace_notrace_fops = { 3556 static const struct file_operations ftrace_notrace_fops = {
3557 .open = ftrace_notrace_open, 3557 .open = ftrace_notrace_open,
3558 .read = seq_read, 3558 .read = seq_read,
3559 .write = ftrace_notrace_write, 3559 .write = ftrace_notrace_write,
3560 .llseek = ftrace_regex_lseek, 3560 .llseek = ftrace_regex_lseek,
3561 .release = ftrace_regex_release, 3561 .release = ftrace_regex_release,
3562 }; 3562 };
3563 3563
3564 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 3564 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
3565 3565
3566 static DEFINE_MUTEX(graph_lock); 3566 static DEFINE_MUTEX(graph_lock);
3567 3567
3568 int ftrace_graph_count; 3568 int ftrace_graph_count;
3569 int ftrace_graph_filter_enabled; 3569 int ftrace_graph_filter_enabled;
3570 unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 3570 unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
3571 3571
3572 static void * 3572 static void *
3573 __g_next(struct seq_file *m, loff_t *pos) 3573 __g_next(struct seq_file *m, loff_t *pos)
3574 { 3574 {
3575 if (*pos >= ftrace_graph_count) 3575 if (*pos >= ftrace_graph_count)
3576 return NULL; 3576 return NULL;
3577 return &ftrace_graph_funcs[*pos]; 3577 return &ftrace_graph_funcs[*pos];
3578 } 3578 }
3579 3579
3580 static void * 3580 static void *
3581 g_next(struct seq_file *m, void *v, loff_t *pos) 3581 g_next(struct seq_file *m, void *v, loff_t *pos)
3582 { 3582 {
3583 (*pos)++; 3583 (*pos)++;
3584 return __g_next(m, pos); 3584 return __g_next(m, pos);
3585 } 3585 }
3586 3586
3587 static void *g_start(struct seq_file *m, loff_t *pos) 3587 static void *g_start(struct seq_file *m, loff_t *pos)
3588 { 3588 {
3589 mutex_lock(&graph_lock); 3589 mutex_lock(&graph_lock);
3590 3590
3591 /* Nothing set; tell g_show to print that all functions are enabled */ 3591 /* Nothing set; tell g_show to print that all functions are enabled */
3592 if (!ftrace_graph_filter_enabled && !*pos) 3592 if (!ftrace_graph_filter_enabled && !*pos)
3593 return (void *)1; 3593 return (void *)1;
3594 3594
3595 return __g_next(m, pos); 3595 return __g_next(m, pos);
3596 } 3596 }
3597 3597
3598 static void g_stop(struct seq_file *m, void *p) 3598 static void g_stop(struct seq_file *m, void *p)
3599 { 3599 {
3600 mutex_unlock(&graph_lock); 3600 mutex_unlock(&graph_lock);
3601 } 3601 }
3602 3602
3603 static int g_show(struct seq_file *m, void *v) 3603 static int g_show(struct seq_file *m, void *v)
3604 { 3604 {
3605 unsigned long *ptr = v; 3605 unsigned long *ptr = v;
3606 3606
3607 if (!ptr) 3607 if (!ptr)
3608 return 0; 3608 return 0;
3609 3609
3610 if (ptr == (unsigned long *)1) { 3610 if (ptr == (unsigned long *)1) {
3611 seq_printf(m, "#### all functions enabled ####\n"); 3611 seq_printf(m, "#### all functions enabled ####\n");
3612 return 0; 3612 return 0;
3613 } 3613 }
3614 3614
3615 seq_printf(m, "%ps\n", (void *)*ptr); 3615 seq_printf(m, "%ps\n", (void *)*ptr);
3616 3616
3617 return 0; 3617 return 0;
3618 } 3618 }
3619 3619
3620 static const struct seq_operations ftrace_graph_seq_ops = { 3620 static const struct seq_operations ftrace_graph_seq_ops = {
3621 .start = g_start, 3621 .start = g_start,
3622 .next = g_next, 3622 .next = g_next,
3623 .stop = g_stop, 3623 .stop = g_stop,
3624 .show = g_show, 3624 .show = g_show,
3625 }; 3625 };
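These seq_operations follow the standard seq_file contract: g_start() positions the cursor (returning the (void *)1 sentinel when no filter is set, so g_show() prints the "all functions enabled" banner), g_next() advances, g_show() emits one record, and g_stop() drops graph_lock. A stripped-down userspace sketch of the loop the seq_file core drives (no locking or sentinel; names mirror the kernel ones but are local to the example):

    #include <stdio.h>

    static unsigned long funcs[] = { 0x1000, 0x2000, 0x3000 };

    static void *g_start(long *pos)
    {
            return *pos < 3 ? (void *)&funcs[*pos] : NULL;
    }

    static void *g_next(long *pos)
    {
            (*pos)++;
            return g_start(pos);
    }

    static void g_show(void *v)
    {
            printf("%#lx\n", *(unsigned long *)v);
    }

    int main(void)
    {
            long pos = 0;
            void *v;

            for (v = g_start(&pos); v; v = g_next(&pos))
                    g_show(v);
            return 0;
    }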
3626 3626
3627 static int 3627 static int
3628 ftrace_graph_open(struct inode *inode, struct file *file) 3628 ftrace_graph_open(struct inode *inode, struct file *file)
3629 { 3629 {
3630 int ret = 0; 3630 int ret = 0;
3631 3631
3632 if (unlikely(ftrace_disabled)) 3632 if (unlikely(ftrace_disabled))
3633 return -ENODEV; 3633 return -ENODEV;
3634 3634
3635 mutex_lock(&graph_lock); 3635 mutex_lock(&graph_lock);
3636 if ((file->f_mode & FMODE_WRITE) && 3636 if ((file->f_mode & FMODE_WRITE) &&
3637 (file->f_flags & O_TRUNC)) { 3637 (file->f_flags & O_TRUNC)) {
3638 ftrace_graph_filter_enabled = 0; 3638 ftrace_graph_filter_enabled = 0;
3639 ftrace_graph_count = 0; 3639 ftrace_graph_count = 0;
3640 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 3640 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
3641 } 3641 }
3642 mutex_unlock(&graph_lock); 3642 mutex_unlock(&graph_lock);
3643 3643
3644 if (file->f_mode & FMODE_READ) 3644 if (file->f_mode & FMODE_READ)
3645 ret = seq_open(file, &ftrace_graph_seq_ops); 3645 ret = seq_open(file, &ftrace_graph_seq_ops);
3646 3646
3647 return ret; 3647 return ret;
3648 } 3648 }
3649 3649
3650 static int 3650 static int
3651 ftrace_graph_release(struct inode *inode, struct file *file) 3651 ftrace_graph_release(struct inode *inode, struct file *file)
3652 { 3652 {
3653 if (file->f_mode & FMODE_READ) 3653 if (file->f_mode & FMODE_READ)
3654 seq_release(inode, file); 3654 seq_release(inode, file);
3655 return 0; 3655 return 0;
3656 } 3656 }
3657 3657
3658 static int 3658 static int
3659 ftrace_set_func(unsigned long *array, int *idx, char *buffer) 3659 ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3660 { 3660 {
3661 struct dyn_ftrace *rec; 3661 struct dyn_ftrace *rec;
3662 struct ftrace_page *pg; 3662 struct ftrace_page *pg;
3663 int search_len; 3663 int search_len;
3664 int fail = 1; 3664 int fail = 1;
3665 int type, not; 3665 int type, not;
3666 char *search; 3666 char *search;
3667 bool exists; 3667 bool exists;
3668 int i; 3668 int i;
3669 3669
3670 /* decode regex */ 3670 /* decode regex */
3671 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3671 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
3672 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3672 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
3673 return -EBUSY; 3673 return -EBUSY;
3674 3674
3675 search_len = strlen(search); 3675 search_len = strlen(search);
3676 3676
3677 mutex_lock(&ftrace_lock); 3677 mutex_lock(&ftrace_lock);
3678 3678
3679 if (unlikely(ftrace_disabled)) { 3679 if (unlikely(ftrace_disabled)) {
3680 mutex_unlock(&ftrace_lock); 3680 mutex_unlock(&ftrace_lock);
3681 return -ENODEV; 3681 return -ENODEV;
3682 } 3682 }
3683 3683
3684 do_for_each_ftrace_rec(pg, rec) { 3684 do_for_each_ftrace_rec(pg, rec) {
3685 3685
3686 if (ftrace_match_record(rec, NULL, search, search_len, type)) { 3686 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
3687 /* if it is in the array */ 3687 /* if it is in the array */
3688 exists = false; 3688 exists = false;
3689 for (i = 0; i < *idx; i++) { 3689 for (i = 0; i < *idx; i++) {
3690 if (array[i] == rec->ip) { 3690 if (array[i] == rec->ip) {
3691 exists = true; 3691 exists = true;
3692 break; 3692 break;
3693 } 3693 }
3694 } 3694 }
3695 3695
3696 if (!not) { 3696 if (!not) {
3697 fail = 0; 3697 fail = 0;
3698 if (!exists) { 3698 if (!exists) {
3699 array[(*idx)++] = rec->ip; 3699 array[(*idx)++] = rec->ip;
3700 if (*idx >= FTRACE_GRAPH_MAX_FUNCS) 3700 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
3701 goto out; 3701 goto out;
3702 } 3702 }
3703 } else { 3703 } else {
3704 if (exists) { 3704 if (exists) {
3705 array[i] = array[--(*idx)]; 3705 array[i] = array[--(*idx)];
3706 array[*idx] = 0; 3706 array[*idx] = 0;
3707 fail = 0; 3707 fail = 0;
3708 } 3708 }
3709 } 3709 }
3710 } 3710 }
3711 } while_for_each_ftrace_rec(); 3711 } while_for_each_ftrace_rec();
3712 out: 3712 out:
3713 mutex_unlock(&ftrace_lock); 3713 mutex_unlock(&ftrace_lock);
3714 3714
3715 if (fail) 3715 if (fail)
3716 return -EINVAL; 3716 return -EINVAL;
3717 3717
3718 ftrace_graph_filter_enabled = 1; 3718 ftrace_graph_filter_enabled = 1;
3719 return 0; 3719 return 0;
3720 } 3720 }
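The removal branch above, array[i] = array[--(*idx)], is the classic O(1) delete from an unordered array: the last entry is moved into the vacated slot and the count shrinks, at the cost of ordering. In isolation (values made up):

    #include <stdio.h>

    static void remove_at(unsigned long *array, int *idx, int i)
    {
            array[i] = array[--(*idx)];     /* last entry fills the hole */
            array[*idx] = 0;
    }

    int main(void)
    {
            unsigned long funcs[] = { 0x100, 0x200, 0x300, 0x400 };
            int count = 4;
            int i;

            remove_at(funcs, &count, 1);    /* drop 0x200; 0x400 takes its slot */
            for (i = 0; i < count; i++)
                    printf("%#lx\n", funcs[i]);
            return 0;
    }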
3721 3721
3722 static ssize_t 3722 static ssize_t
3723 ftrace_graph_write(struct file *file, const char __user *ubuf, 3723 ftrace_graph_write(struct file *file, const char __user *ubuf,
3724 size_t cnt, loff_t *ppos) 3724 size_t cnt, loff_t *ppos)
3725 { 3725 {
3726 struct trace_parser parser; 3726 struct trace_parser parser;
3727 ssize_t read, ret; 3727 ssize_t read, ret;
3728 3728
3729 if (!cnt) 3729 if (!cnt)
3730 return 0; 3730 return 0;
3731 3731
3732 mutex_lock(&graph_lock); 3732 mutex_lock(&graph_lock);
3733 3733
3734 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 3734 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
3735 ret = -ENOMEM; 3735 ret = -ENOMEM;
3736 goto out_unlock; 3736 goto out_unlock;
3737 } 3737 }
3738 3738
3739 read = trace_get_user(&parser, ubuf, cnt, ppos); 3739 read = trace_get_user(&parser, ubuf, cnt, ppos);
3740 3740
3741 if (read >= 0 && trace_parser_loaded(&parser)) { 3741 if (read >= 0 && trace_parser_loaded(&parser)) {
3742 parser.buffer[parser.idx] = 0; 3742 parser.buffer[parser.idx] = 0;
3743 3743
3744 /* we allow only one expression at a time */ 3744 /* we allow only one expression at a time */
3745 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3745 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
3746 parser.buffer); 3746 parser.buffer);
3747 if (ret) 3747 if (ret)
3748 goto out_free; 3748 goto out_free;
3749 } 3749 }
3750 3750
3751 ret = read; 3751 ret = read;
3752 3752
3753 out_free: 3753 out_free:
3754 trace_parser_put(&parser); 3754 trace_parser_put(&parser);
3755 out_unlock: 3755 out_unlock:
3756 mutex_unlock(&graph_lock); 3756 mutex_unlock(&graph_lock);
3757 3757
3758 return ret; 3758 return ret;
3759 } 3759 }
3760 3760
3761 static const struct file_operations ftrace_graph_fops = { 3761 static const struct file_operations ftrace_graph_fops = {
3762 .open = ftrace_graph_open, 3762 .open = ftrace_graph_open,
3763 .read = seq_read, 3763 .read = seq_read,
3764 .write = ftrace_graph_write, 3764 .write = ftrace_graph_write,
3765 .release = ftrace_graph_release, 3765 .release = ftrace_graph_release,
3766 .llseek = seq_lseek, 3766 .llseek = seq_lseek,
3767 }; 3767 };
3768 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3768 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3769 3769
3770 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 3770 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3771 { 3771 {
3772 3772
3773 trace_create_file("available_filter_functions", 0444, 3773 trace_create_file("available_filter_functions", 0444,
3774 d_tracer, NULL, &ftrace_avail_fops); 3774 d_tracer, NULL, &ftrace_avail_fops);
3775 3775
3776 trace_create_file("enabled_functions", 0444, 3776 trace_create_file("enabled_functions", 0444,
3777 d_tracer, NULL, &ftrace_enabled_fops); 3777 d_tracer, NULL, &ftrace_enabled_fops);
3778 3778
3779 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3779 trace_create_file("set_ftrace_filter", 0644, d_tracer,
3780 NULL, &ftrace_filter_fops); 3780 NULL, &ftrace_filter_fops);
3781 3781
3782 trace_create_file("set_ftrace_notrace", 0644, d_tracer, 3782 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
3783 NULL, &ftrace_notrace_fops); 3783 NULL, &ftrace_notrace_fops);
3784 3784
3785 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 3785 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
3786 trace_create_file("set_graph_function", 0444, d_tracer, 3786 trace_create_file("set_graph_function", 0444, d_tracer,
3787 NULL, 3787 NULL,
3788 &ftrace_graph_fops); 3788 &ftrace_graph_fops);
3789 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3789 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3790 3790
3791 return 0; 3791 return 0;
3792 } 3792 }
3793 3793
3794 static int ftrace_cmp_ips(const void *a, const void *b) 3794 static int ftrace_cmp_ips(const void *a, const void *b)
3795 { 3795 {
3796 const unsigned long *ipa = a; 3796 const unsigned long *ipa = a;
3797 const unsigned long *ipb = b; 3797 const unsigned long *ipb = b;
3798 3798
3799 if (*ipa > *ipb) 3799 if (*ipa > *ipb)
3800 return 1; 3800 return 1;
3801 if (*ipa < *ipb) 3801 if (*ipa < *ipb)
3802 return -1; 3802 return -1;
3803 return 0; 3803 return 0;
3804 } 3804 }
3805 3805
3806 static void ftrace_swap_ips(void *a, void *b, int size) 3806 static void ftrace_swap_ips(void *a, void *b, int size)
3807 { 3807 {
3808 unsigned long *ipa = a; 3808 unsigned long *ipa = a;
3809 unsigned long *ipb = b; 3809 unsigned long *ipb = b;
3810 unsigned long t; 3810 unsigned long t;
3811 3811
3812 t = *ipa; 3812 t = *ipa;
3813 *ipa = *ipb; 3813 *ipa = *ipb;
3814 *ipb = t; 3814 *ipb = t;
3815 } 3815 }
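ftrace_cmp_ips() and ftrace_swap_ips() exist because the kernel's sort() wants an explicit swap callback; the same three-way comparison drops straight into userspace qsort(), which supplies its own swap. A sketch (sample addresses invented; the (a > b) - (a < b) form is equivalent to the if/else chain above):

    #include <stdio.h>
    #include <stdlib.h>

    static int cmp_ips(const void *a, const void *b)
    {
            const unsigned long *ipa = a;
            const unsigned long *ipb = b;

            return (*ipa > *ipb) - (*ipa < *ipb);
    }

    int main(void)
    {
            unsigned long ips[] = { 0xc0de, 0xbeef, 0xcafe };
            int i;

            qsort(ips, 3, sizeof(ips[0]), cmp_ips);
            for (i = 0; i < 3; i++)
                    printf("%#lx\n", ips[i]);
            return 0;
    }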
3816 3816
3817 static int ftrace_process_locs(struct module *mod, 3817 static int ftrace_process_locs(struct module *mod,
3818 unsigned long *start, 3818 unsigned long *start,
3819 unsigned long *end) 3819 unsigned long *end)
3820 { 3820 {
3821 struct ftrace_page *start_pg; 3821 struct ftrace_page *start_pg;
3822 struct ftrace_page *pg; 3822 struct ftrace_page *pg;
3823 struct dyn_ftrace *rec; 3823 struct dyn_ftrace *rec;
3824 unsigned long count; 3824 unsigned long count;
3825 unsigned long *p; 3825 unsigned long *p;
3826 unsigned long addr; 3826 unsigned long addr;
3827 unsigned long flags = 0; /* Shut up gcc */ 3827 unsigned long flags = 0; /* Shut up gcc */
3828 int ret = -ENOMEM; 3828 int ret = -ENOMEM;
3829 3829
3830 count = end - start; 3830 count = end - start;
3831 3831
3832 if (!count) 3832 if (!count)
3833 return 0; 3833 return 0;
3834 3834
3835 sort(start, count, sizeof(*start), 3835 sort(start, count, sizeof(*start),
3836 ftrace_cmp_ips, ftrace_swap_ips); 3836 ftrace_cmp_ips, ftrace_swap_ips);
3837 3837
3838 start_pg = ftrace_allocate_pages(count); 3838 start_pg = ftrace_allocate_pages(count);
3839 if (!start_pg) 3839 if (!start_pg)
3840 return -ENOMEM; 3840 return -ENOMEM;
3841 3841
3842 mutex_lock(&ftrace_lock); 3842 mutex_lock(&ftrace_lock);
3843 3843
3844 /* 3844 /*
3845 * Core and each module need their own pages, as 3845 * Core and each module need their own pages, as
3846 * modules will free them when they are removed. 3846 * modules will free them when they are removed.
3847 * Force a new page to be allocated for modules. 3847 * Force a new page to be allocated for modules.
3848 */ 3848 */
3849 if (!mod) { 3849 if (!mod) {
3850 WARN_ON(ftrace_pages || ftrace_pages_start); 3850 WARN_ON(ftrace_pages || ftrace_pages_start);
3851 /* First initialization */ 3851 /* First initialization */
3852 ftrace_pages = ftrace_pages_start = start_pg; 3852 ftrace_pages = ftrace_pages_start = start_pg;
3853 } else { 3853 } else {
3854 if (!ftrace_pages) 3854 if (!ftrace_pages)
3855 goto out; 3855 goto out;
3856 3856
3857 if (WARN_ON(ftrace_pages->next)) { 3857 if (WARN_ON(ftrace_pages->next)) {
3858 /* Hmm, we have free pages? */ 3858 /* Hmm, we have free pages? */
3859 while (ftrace_pages->next) 3859 while (ftrace_pages->next)
3860 ftrace_pages = ftrace_pages->next; 3860 ftrace_pages = ftrace_pages->next;
3861 } 3861 }
3862 3862
3863 ftrace_pages->next = start_pg; 3863 ftrace_pages->next = start_pg;
3864 } 3864 }
3865 3865
3866 p = start; 3866 p = start;
3867 pg = start_pg; 3867 pg = start_pg;
3868 while (p < end) { 3868 while (p < end) {
3869 addr = ftrace_call_adjust(*p++); 3869 addr = ftrace_call_adjust(*p++);
3870 /* 3870 /*
3871 * Some architecture linkers will pad between 3871 * Some architecture linkers will pad between
3872 * the different mcount_loc sections of different 3872 * the different mcount_loc sections of different
3873 * object files to satisfy alignments. 3873 * object files to satisfy alignments.
3874 * Skip any NULL pointers. 3874 * Skip any NULL pointers.
3875 */ 3875 */
3876 if (!addr) 3876 if (!addr)
3877 continue; 3877 continue;
3878 3878
3879 if (pg->index == pg->size) { 3879 if (pg->index == pg->size) {
3880 /* We should have allocated enough */ 3880 /* We should have allocated enough */
3881 if (WARN_ON(!pg->next)) 3881 if (WARN_ON(!pg->next))
3882 break; 3882 break;
3883 pg = pg->next; 3883 pg = pg->next;
3884 } 3884 }
3885 3885
3886 rec = &pg->records[pg->index++]; 3886 rec = &pg->records[pg->index++];
3887 rec->ip = addr; 3887 rec->ip = addr;
3888 } 3888 }
3889 3889
3890 /* We should have used all pages */ 3890 /* We should have used all pages */
3891 WARN_ON(pg->next); 3891 WARN_ON(pg->next);
3892 3892
3893 /* Assign the last page to ftrace_pages */ 3893 /* Assign the last page to ftrace_pages */
3894 ftrace_pages = pg; 3894 ftrace_pages = pg;
3895 3895
3896 /* These new locations need to be initialized */ 3896 /* These new locations need to be initialized */
3897 ftrace_new_pgs = start_pg; 3897 ftrace_new_pgs = start_pg;
3898 3898
3899 /* 3899 /*
3900 * We only need to disable interrupts on start up 3900 * We only need to disable interrupts on start up
3901 * because we are modifying code that an interrupt 3901 * because we are modifying code that an interrupt
3902 * may execute, and the modification is not atomic. 3902 * may execute, and the modification is not atomic.
3903 * But for modules, nothing runs the code we modify 3903 * But for modules, nothing runs the code we modify
3904 * until we are finished with it, and there's no 3904 * until we are finished with it, and there's no
3905 * reason to cause large interrupt latencies while we do it. 3905 * reason to cause large interrupt latencies while we do it.
3906 */ 3906 */
3907 if (!mod) 3907 if (!mod)
3908 local_irq_save(flags); 3908 local_irq_save(flags);
3909 ftrace_update_code(mod); 3909 ftrace_update_code(mod);
3910 if (!mod) 3910 if (!mod)
3911 local_irq_restore(flags); 3911 local_irq_restore(flags);
3912 ret = 0; 3912 ret = 0;
3913 out: 3913 out:
3914 mutex_unlock(&ftrace_lock); 3914 mutex_unlock(&ftrace_lock);
3915 3915
3916 return ret; 3916 return ret;
3917 } 3917 }
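The fill loop above tolerates zero entries because linkers may pad between the mcount_loc sections of different objects; packing the surviving addresses into fixed-size pages looks like this in isolation (toy sizes, invented values):

    #include <stdio.h>

    #define PER_PAGE 2

    int main(void)
    {
            unsigned long locs[] = { 0x10, 0, 0x20, 0, 0, 0x30 };   /* 0 = pad */
            unsigned long pages[3][PER_PAGE];
            int pg = 0, idx = 0, i;

            for (i = 0; i < 6; i++) {
                    if (!locs[i])
                            continue;               /* skip linker padding */
                    if (idx == PER_PAGE) {          /* page full: move on */
                            pg++;
                            idx = 0;
                    }
                    pages[pg][idx++] = locs[i];
            }
            printf("%d page(s); last record %#lx\n", pg + 1, pages[pg][idx - 1]);
            return 0;
    }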
3918 3918
3919 #ifdef CONFIG_MODULES 3919 #ifdef CONFIG_MODULES
3920 3920
3921 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) 3921 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
3922 3922
3923 void ftrace_release_mod(struct module *mod) 3923 void ftrace_release_mod(struct module *mod)
3924 { 3924 {
3925 struct dyn_ftrace *rec; 3925 struct dyn_ftrace *rec;
3926 struct ftrace_page **last_pg; 3926 struct ftrace_page **last_pg;
3927 struct ftrace_page *pg; 3927 struct ftrace_page *pg;
3928 int order; 3928 int order;
3929 3929
3930 mutex_lock(&ftrace_lock); 3930 mutex_lock(&ftrace_lock);
3931 3931
3932 if (ftrace_disabled) 3932 if (ftrace_disabled)
3933 goto out_unlock; 3933 goto out_unlock;
3934 3934
3935 /* 3935 /*
3936 * Each module has its own ftrace_pages, remove 3936 * Each module has its own ftrace_pages, remove
3937 * them from the list. 3937 * them from the list.
3938 */ 3938 */
3939 last_pg = &ftrace_pages_start; 3939 last_pg = &ftrace_pages_start;
3940 for (pg = ftrace_pages_start; pg; pg = *last_pg) { 3940 for (pg = ftrace_pages_start; pg; pg = *last_pg) {
3941 rec = &pg->records[0]; 3941 rec = &pg->records[0];
3942 if (within_module_core(rec->ip, mod)) { 3942 if (within_module_core(rec->ip, mod)) {
3943 /* 3943 /*
3944 * As core pages are first, the first 3944 * As core pages are first, the first
3945 * page should never be a module page. 3945 * page should never be a module page.
3946 */ 3946 */
3947 if (WARN_ON(pg == ftrace_pages_start)) 3947 if (WARN_ON(pg == ftrace_pages_start))
3948 goto out_unlock; 3948 goto out_unlock;
3949 3949
3950 /* Check if we are deleting the last page */ 3950 /* Check if we are deleting the last page */
3951 if (pg == ftrace_pages) 3951 if (pg == ftrace_pages)
3952 ftrace_pages = next_to_ftrace_page(last_pg); 3952 ftrace_pages = next_to_ftrace_page(last_pg);
3953 3953
3954 *last_pg = pg->next; 3954 *last_pg = pg->next;
3955 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 3955 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
3956 free_pages((unsigned long)pg->records, order); 3956 free_pages((unsigned long)pg->records, order);
3957 kfree(pg); 3957 kfree(pg);
3958 } else 3958 } else
3959 last_pg = &pg->next; 3959 last_pg = &pg->next;
3960 } 3960 }
3961 out_unlock: 3961 out_unlock:
3962 mutex_unlock(&ftrace_lock); 3962 mutex_unlock(&ftrace_lock);
3963 } 3963 }
3964 3964
3965 static void ftrace_init_module(struct module *mod, 3965 static void ftrace_init_module(struct module *mod,
3966 unsigned long *start, unsigned long *end) 3966 unsigned long *start, unsigned long *end)
3967 { 3967 {
3968 if (ftrace_disabled || start == end) 3968 if (ftrace_disabled || start == end)
3969 return; 3969 return;
3970 ftrace_process_locs(mod, start, end); 3970 ftrace_process_locs(mod, start, end);
3971 } 3971 }
3972 3972
3973 static int ftrace_module_notify(struct notifier_block *self, 3973 static int ftrace_module_notify(struct notifier_block *self,
3974 unsigned long val, void *data) 3974 unsigned long val, void *data)
3975 { 3975 {
3976 struct module *mod = data; 3976 struct module *mod = data;
3977 3977
3978 switch (val) { 3978 switch (val) {
3979 case MODULE_STATE_COMING: 3979 case MODULE_STATE_COMING:
3980 ftrace_init_module(mod, mod->ftrace_callsites, 3980 ftrace_init_module(mod, mod->ftrace_callsites,
3981 mod->ftrace_callsites + 3981 mod->ftrace_callsites +
3982 mod->num_ftrace_callsites); 3982 mod->num_ftrace_callsites);
3983 break; 3983 break;
3984 case MODULE_STATE_GOING: 3984 case MODULE_STATE_GOING:
3985 ftrace_release_mod(mod); 3985 ftrace_release_mod(mod);
3986 break; 3986 break;
3987 } 3987 }
3988 3988
3989 return 0; 3989 return 0;
3990 } 3990 }
3991 #else 3991 #else
3992 static int ftrace_module_notify(struct notifier_block *self, 3992 static int ftrace_module_notify(struct notifier_block *self,
3993 unsigned long val, void *data) 3993 unsigned long val, void *data)
3994 { 3994 {
3995 return 0; 3995 return 0;
3996 } 3996 }
3997 #endif /* CONFIG_MODULES */ 3997 #endif /* CONFIG_MODULES */
3998 3998
3999 struct notifier_block ftrace_module_nb = { 3999 struct notifier_block ftrace_module_nb = {
4000 .notifier_call = ftrace_module_notify, 4000 .notifier_call = ftrace_module_notify,
4001 .priority = 0, 4001 .priority = 0,
4002 }; 4002 };
4003 4003
4004 extern unsigned long __start_mcount_loc[]; 4004 extern unsigned long __start_mcount_loc[];
4005 extern unsigned long __stop_mcount_loc[]; 4005 extern unsigned long __stop_mcount_loc[];
4006 4006
4007 void __init ftrace_init(void) 4007 void __init ftrace_init(void)
4008 { 4008 {
4009 unsigned long count, addr, flags; 4009 unsigned long count, addr, flags;
4010 int ret; 4010 int ret;
4011 4011
4012 /* Keep the ftrace pointer to the stub */ 4012 /* Keep the ftrace pointer to the stub */
4013 addr = (unsigned long)ftrace_stub; 4013 addr = (unsigned long)ftrace_stub;
4014 4014
4015 local_irq_save(flags); 4015 local_irq_save(flags);
4016 ftrace_dyn_arch_init(&addr); 4016 ftrace_dyn_arch_init(&addr);
4017 local_irq_restore(flags); 4017 local_irq_restore(flags);
4018 4018
4019 /* ftrace_dyn_arch_init places the return code in addr */ 4019 /* ftrace_dyn_arch_init places the return code in addr */
4020 if (addr) 4020 if (addr)
4021 goto failed; 4021 goto failed;
4022 4022
4023 count = __stop_mcount_loc - __start_mcount_loc; 4023 count = __stop_mcount_loc - __start_mcount_loc;
4024 4024
4025 ret = ftrace_dyn_table_alloc(count); 4025 ret = ftrace_dyn_table_alloc(count);
4026 if (ret) 4026 if (ret)
4027 goto failed; 4027 goto failed;
4028 4028
4029 last_ftrace_enabled = ftrace_enabled = 1; 4029 last_ftrace_enabled = ftrace_enabled = 1;
4030 4030
4031 ret = ftrace_process_locs(NULL, 4031 ret = ftrace_process_locs(NULL,
4032 __start_mcount_loc, 4032 __start_mcount_loc,
4033 __stop_mcount_loc); 4033 __stop_mcount_loc);
4034 4034
4035 ret = register_module_notifier(&ftrace_module_nb); 4035 ret = register_module_notifier(&ftrace_module_nb);
4036 if (ret) 4036 if (ret)
4037 pr_warning("Failed to register trace ftrace module notifier\n"); 4037 pr_warning("Failed to register trace ftrace module notifier\n");
4038 4038
4039 set_ftrace_early_filters(); 4039 set_ftrace_early_filters();
4040 4040
4041 return; 4041 return;
4042 failed: 4042 failed:
4043 ftrace_disabled = 1; 4043 ftrace_disabled = 1;
4044 } 4044 }
4045 4045
4046 #else 4046 #else
4047 4047
4048 static struct ftrace_ops global_ops = { 4048 static struct ftrace_ops global_ops = {
4049 .func = ftrace_stub, 4049 .func = ftrace_stub,
4050 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 4050 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
4051 }; 4051 };
4052 4052
4053 static int __init ftrace_nodyn_init(void) 4053 static int __init ftrace_nodyn_init(void)
4054 { 4054 {
4055 ftrace_enabled = 1; 4055 ftrace_enabled = 1;
4056 return 0; 4056 return 0;
4057 } 4057 }
4058 core_initcall(ftrace_nodyn_init); 4058 core_initcall(ftrace_nodyn_init);
4059 4059
4060 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4060 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4061 static inline void ftrace_startup_enable(int command) { } 4061 static inline void ftrace_startup_enable(int command) { }
4062 /* Keep as macros so we do not need to define the commands */ 4062 /* Keep as macros so we do not need to define the commands */
4063 # define ftrace_startup(ops, command) \ 4063 # define ftrace_startup(ops, command) \
4064 ({ \ 4064 ({ \
4065 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4065 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4066 0; \ 4066 0; \
4067 }) 4067 })
4068 # define ftrace_shutdown(ops, command) do { } while (0) 4068 # define ftrace_shutdown(ops, command) do { } while (0)
4069 # define ftrace_startup_sysctl() do { } while (0) 4069 # define ftrace_startup_sysctl() do { } while (0)
4070 # define ftrace_shutdown_sysctl() do { } while (0) 4070 # define ftrace_shutdown_sysctl() do { } while (0)
4071 4071
4072 static inline int 4072 static inline int
4073 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 4073 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
4074 { 4074 {
4075 return 1; 4075 return 1;
4076 } 4076 }
4077 4077
4078 #endif /* CONFIG_DYNAMIC_FTRACE */ 4078 #endif /* CONFIG_DYNAMIC_FTRACE */
4079 4079
4080 static void 4080 static void
4081 ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, 4081 ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4082 struct ftrace_ops *op, struct pt_regs *regs) 4082 struct ftrace_ops *op, struct pt_regs *regs)
4083 { 4083 {
4084 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) 4084 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
4085 return; 4085 return;
4086 4086
4087 /* 4087 /*
4088 * Some of the ops may be dynamically allocated, 4088 * Some of the ops may be dynamically allocated,
4089 * they must be freed after a synchronize_sched(). 4089 * they must be freed after a synchronize_sched().
4090 */ 4090 */
4091 preempt_disable_notrace(); 4091 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT); 4092 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list); 4093 op = rcu_dereference_raw(ftrace_control_list);
4094 while (op != &ftrace_list_end) { 4094 while (op != &ftrace_list_end) {
4095 if (!ftrace_function_local_disabled(op) && 4095 if (!ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip)) 4096 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs); 4097 op->func(ip, parent_ip, op, regs);
4098 4098
4099 op = rcu_dereference_raw(op->next); 4099 op = rcu_dereference_raw(op->next);
4100 } 4100 }
4101 trace_recursion_clear(TRACE_CONTROL_BIT); 4101 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace(); 4102 preempt_enable_notrace();
4103 } 4103 }
4104 4104
4105 static struct ftrace_ops control_ops = { 4105 static struct ftrace_ops control_ops = {
4106 .func = ftrace_ops_control_func, 4106 .func = ftrace_ops_control_func,
4107 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 4107 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
4108 }; 4108 };
4109 4109
4110 static inline void 4110 static inline void
4111 __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 4111 __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs) 4112 struct ftrace_ops *ignored, struct pt_regs *regs)
4113 { 4113 {
4114 struct ftrace_ops *op; 4114 struct ftrace_ops *op;
4115 4115
4116 if (function_trace_stop) 4116 if (function_trace_stop)
4117 return; 4117 return;
4118 4118
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
4120 return; 4120 return;
4121 4121
4122 trace_recursion_set(TRACE_INTERNAL_BIT); 4122 trace_recursion_set(TRACE_INTERNAL_BIT);
4123 /* 4123 /*
4124 * Some of the ops may be dynamically allocated, 4124 * Some of the ops may be dynamically allocated,
4125 * they must be freed after a synchronize_sched(). 4125 * they must be freed after a synchronize_sched().
4126 */ 4126 */
4127 preempt_disable_notrace(); 4127 preempt_disable_notrace();
4128 op = rcu_dereference_raw(ftrace_ops_list); 4128 op = rcu_dereference_raw(ftrace_ops_list);
4129 while (op != &ftrace_list_end) { 4129 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 4130 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 4131 op->func(ip, parent_ip, op, regs);
4132 op = rcu_dereference_raw(op->next); 4132 op = rcu_dereference_raw(op->next);
4133 } 4133 }
4134 preempt_enable_notrace(); 4134 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 4135 trace_recursion_clear(TRACE_INTERNAL_BIT);
4136 } 4136 }
4137 4137
4138 /* 4138 /*
4139 * Some archs only support passing ip and parent_ip. Even though 4139 * Some archs only support passing ip and parent_ip. Even though
4140 * the list function ignores the op parameter, we do not want any 4140 * the list function ignores the op parameter, we do not want any
4141 * C side effects, where a function is called without the caller 4141 * C side effects, where a function is called without the caller
4142 * sending a third parameter. 4142 * sending a third parameter.
4143 * Archs are to support both the regs and ftrace_ops at the same time. 4143 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs. 4144 * If they support ftrace_ops, it is assumed they support regs.
4145 * If callbacks want to use regs, they must either check for regs 4145 * If callbacks want to use regs, they must either check for regs
4146 * being NULL, or check ARCH_SUPPORTS_FTRACE_SAVE_REGS. 4146 * being NULL, or check ARCH_SUPPORTS_FTRACE_SAVE_REGS.
4147 * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full regs to be saved. 4147 * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still 4148 * An architecture can pass partial regs with ftrace_ops and still
4149 * set ARCH_SUPPORTS_FTRACE_OPS. 4149 * set ARCH_SUPPORTS_FTRACE_OPS.
4150 */ 4150 */
4151 #if ARCH_SUPPORTS_FTRACE_OPS 4151 #if ARCH_SUPPORTS_FTRACE_OPS
4152 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 4152 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4153 struct ftrace_ops *op, struct pt_regs *regs) 4153 struct ftrace_ops *op, struct pt_regs *regs)
4154 { 4154 {
4155 __ftrace_ops_list_func(ip, parent_ip, NULL, regs); 4155 __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
4156 } 4156 }
4157 #else 4157 #else
4158 static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) 4158 static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
4159 { 4159 {
4160 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); 4160 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
4161 } 4161 }
4162 #endif 4162 #endif
4163 4163
4164 static void clear_ftrace_swapper(void) 4164 static void clear_ftrace_swapper(void)
4165 { 4165 {
4166 struct task_struct *p; 4166 struct task_struct *p;
4167 int cpu; 4167 int cpu;
4168 4168
4169 get_online_cpus(); 4169 get_online_cpus();
4170 for_each_online_cpu(cpu) { 4170 for_each_online_cpu(cpu) {
4171 p = idle_task(cpu); 4171 p = idle_task(cpu);
4172 clear_tsk_trace_trace(p); 4172 clear_tsk_trace_trace(p);
4173 } 4173 }
4174 put_online_cpus(); 4174 put_online_cpus();
4175 } 4175 }
4176 4176
4177 static void set_ftrace_swapper(void) 4177 static void set_ftrace_swapper(void)
4178 { 4178 {
4179 struct task_struct *p; 4179 struct task_struct *p;
4180 int cpu; 4180 int cpu;
4181 4181
4182 get_online_cpus(); 4182 get_online_cpus();
4183 for_each_online_cpu(cpu) { 4183 for_each_online_cpu(cpu) {
4184 p = idle_task(cpu); 4184 p = idle_task(cpu);
4185 set_tsk_trace_trace(p); 4185 set_tsk_trace_trace(p);
4186 } 4186 }
4187 put_online_cpus(); 4187 put_online_cpus();
4188 } 4188 }
4189 4189
4190 static void clear_ftrace_pid(struct pid *pid) 4190 static void clear_ftrace_pid(struct pid *pid)
4191 { 4191 {
4192 struct task_struct *p; 4192 struct task_struct *p;
4193 4193
4194 rcu_read_lock(); 4194 rcu_read_lock();
4195 do_each_pid_task(pid, PIDTYPE_PID, p) { 4195 do_each_pid_task(pid, PIDTYPE_PID, p) {
4196 clear_tsk_trace_trace(p); 4196 clear_tsk_trace_trace(p);
4197 } while_each_pid_task(pid, PIDTYPE_PID, p); 4197 } while_each_pid_task(pid, PIDTYPE_PID, p);
4198 rcu_read_unlock(); 4198 rcu_read_unlock();
4199 4199
4200 put_pid(pid); 4200 put_pid(pid);
4201 } 4201 }
4202 4202
4203 static void set_ftrace_pid(struct pid *pid) 4203 static void set_ftrace_pid(struct pid *pid)
4204 { 4204 {
4205 struct task_struct *p; 4205 struct task_struct *p;
4206 4206
4207 rcu_read_lock(); 4207 rcu_read_lock();
4208 do_each_pid_task(pid, PIDTYPE_PID, p) { 4208 do_each_pid_task(pid, PIDTYPE_PID, p) {
4209 set_tsk_trace_trace(p); 4209 set_tsk_trace_trace(p);
4210 } while_each_pid_task(pid, PIDTYPE_PID, p); 4210 } while_each_pid_task(pid, PIDTYPE_PID, p);
4211 rcu_read_unlock(); 4211 rcu_read_unlock();
4212 } 4212 }
4213 4213
4214 static void clear_ftrace_pid_task(struct pid *pid) 4214 static void clear_ftrace_pid_task(struct pid *pid)
4215 { 4215 {
4216 if (pid == ftrace_swapper_pid) 4216 if (pid == ftrace_swapper_pid)
4217 clear_ftrace_swapper(); 4217 clear_ftrace_swapper();
4218 else 4218 else
4219 clear_ftrace_pid(pid); 4219 clear_ftrace_pid(pid);
4220 } 4220 }
4221 4221
4222 static void set_ftrace_pid_task(struct pid *pid) 4222 static void set_ftrace_pid_task(struct pid *pid)
4223 { 4223 {
4224 if (pid == ftrace_swapper_pid) 4224 if (pid == ftrace_swapper_pid)
4225 set_ftrace_swapper(); 4225 set_ftrace_swapper();
4226 else 4226 else
4227 set_ftrace_pid(pid); 4227 set_ftrace_pid(pid);
4228 } 4228 }
4229 4229
4230 static int ftrace_pid_add(int p) 4230 static int ftrace_pid_add(int p)
4231 { 4231 {
4232 struct pid *pid; 4232 struct pid *pid;
4233 struct ftrace_pid *fpid; 4233 struct ftrace_pid *fpid;
4234 int ret = -EINVAL; 4234 int ret = -EINVAL;
4235 4235
4236 mutex_lock(&ftrace_lock); 4236 mutex_lock(&ftrace_lock);
4237 4237
4238 if (!p) 4238 if (!p)
4239 pid = ftrace_swapper_pid; 4239 pid = ftrace_swapper_pid;
4240 else 4240 else
4241 pid = find_get_pid(p); 4241 pid = find_get_pid(p);
4242 4242
4243 if (!pid) 4243 if (!pid)
4244 goto out; 4244 goto out;
4245 4245
4246 ret = 0; 4246 ret = 0;
4247 4247
4248 list_for_each_entry(fpid, &ftrace_pids, list) 4248 list_for_each_entry(fpid, &ftrace_pids, list)
4249 if (fpid->pid == pid) 4249 if (fpid->pid == pid)
4250 goto out_put; 4250 goto out_put;
4251 4251
4252 ret = -ENOMEM; 4252 ret = -ENOMEM;
4253 4253
4254 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); 4254 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
4255 if (!fpid) 4255 if (!fpid)
4256 goto out_put; 4256 goto out_put;
4257 4257
4258 list_add(&fpid->list, &ftrace_pids); 4258 list_add(&fpid->list, &ftrace_pids);
4259 fpid->pid = pid; 4259 fpid->pid = pid;
4260 4260
4261 set_ftrace_pid_task(pid); 4261 set_ftrace_pid_task(pid);
4262 4262
4263 ftrace_update_pid_func(); 4263 ftrace_update_pid_func();
4264 ftrace_startup_enable(0); 4264 ftrace_startup_enable(0);
4265 4265
4266 mutex_unlock(&ftrace_lock); 4266 mutex_unlock(&ftrace_lock);
4267 return 0; 4267 return 0;
4268 4268
4269 out_put: 4269 out_put:
4270 if (pid != ftrace_swapper_pid) 4270 if (pid != ftrace_swapper_pid)
4271 put_pid(pid); 4271 put_pid(pid);
4272 4272
4273 out: 4273 out:
4274 mutex_unlock(&ftrace_lock); 4274 mutex_unlock(&ftrace_lock);
4275 return ret; 4275 return ret;
4276 } 4276 }
4277 4277
4278 static void ftrace_pid_reset(void) 4278 static void ftrace_pid_reset(void)
4279 { 4279 {
4280 struct ftrace_pid *fpid, *safe; 4280 struct ftrace_pid *fpid, *safe;
4281 4281
4282 mutex_lock(&ftrace_lock); 4282 mutex_lock(&ftrace_lock);
4283 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { 4283 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
4284 struct pid *pid = fpid->pid; 4284 struct pid *pid = fpid->pid;
4285 4285
4286 clear_ftrace_pid_task(pid); 4286 clear_ftrace_pid_task(pid);
4287 4287
4288 list_del(&fpid->list); 4288 list_del(&fpid->list);
4289 kfree(fpid); 4289 kfree(fpid);
4290 } 4290 }
4291 4291
4292 ftrace_update_pid_func(); 4292 ftrace_update_pid_func();
4293 ftrace_startup_enable(0); 4293 ftrace_startup_enable(0);
4294 4294
4295 mutex_unlock(&ftrace_lock); 4295 mutex_unlock(&ftrace_lock);
4296 } 4296 }
4297 4297
4298 static void *fpid_start(struct seq_file *m, loff_t *pos) 4298 static void *fpid_start(struct seq_file *m, loff_t *pos)
4299 { 4299 {
4300 mutex_lock(&ftrace_lock); 4300 mutex_lock(&ftrace_lock);
4301 4301
4302 if (list_empty(&ftrace_pids) && (!*pos)) 4302 if (list_empty(&ftrace_pids) && (!*pos))
4303 return (void *) 1; 4303 return (void *) 1;
4304 4304
4305 return seq_list_start(&ftrace_pids, *pos); 4305 return seq_list_start(&ftrace_pids, *pos);
4306 } 4306 }
4307 4307
4308 static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) 4308 static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
4309 { 4309 {
4310 if (v == (void *)1) 4310 if (v == (void *)1)
4311 return NULL; 4311 return NULL;
4312 4312
4313 return seq_list_next(v, &ftrace_pids, pos); 4313 return seq_list_next(v, &ftrace_pids, pos);
4314 } 4314 }
4315 4315
4316 static void fpid_stop(struct seq_file *m, void *p) 4316 static void fpid_stop(struct seq_file *m, void *p)
4317 { 4317 {
4318 mutex_unlock(&ftrace_lock); 4318 mutex_unlock(&ftrace_lock);
4319 } 4319 }
4320 4320
4321 static int fpid_show(struct seq_file *m, void *v) 4321 static int fpid_show(struct seq_file *m, void *v)
4322 { 4322 {
4323 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 4323 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
4324 4324
4325 if (v == (void *)1) { 4325 if (v == (void *)1) {
4326 seq_printf(m, "no pid\n"); 4326 seq_printf(m, "no pid\n");
4327 return 0; 4327 return 0;
4328 } 4328 }
4329 4329
4330 if (fpid->pid == ftrace_swapper_pid) 4330 if (fpid->pid == ftrace_swapper_pid)
4331 seq_printf(m, "swapper tasks\n"); 4331 seq_printf(m, "swapper tasks\n");
4332 else 4332 else
4333 seq_printf(m, "%u\n", pid_vnr(fpid->pid)); 4333 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
4334 4334
4335 return 0; 4335 return 0;
4336 } 4336 }
4337 4337
4338 static const struct seq_operations ftrace_pid_sops = { 4338 static const struct seq_operations ftrace_pid_sops = {
4339 .start = fpid_start, 4339 .start = fpid_start,
4340 .next = fpid_next, 4340 .next = fpid_next,
4341 .stop = fpid_stop, 4341 .stop = fpid_stop,
4342 .show = fpid_show, 4342 .show = fpid_show,
4343 }; 4343 };
4344 4344
4345 static int 4345 static int
4346 ftrace_pid_open(struct inode *inode, struct file *file) 4346 ftrace_pid_open(struct inode *inode, struct file *file)
4347 { 4347 {
4348 int ret = 0; 4348 int ret = 0;
4349 4349
4350 if ((file->f_mode & FMODE_WRITE) && 4350 if ((file->f_mode & FMODE_WRITE) &&
4351 (file->f_flags & O_TRUNC)) 4351 (file->f_flags & O_TRUNC))
4352 ftrace_pid_reset(); 4352 ftrace_pid_reset();
4353 4353
4354 if (file->f_mode & FMODE_READ) 4354 if (file->f_mode & FMODE_READ)
4355 ret = seq_open(file, &ftrace_pid_sops); 4355 ret = seq_open(file, &ftrace_pid_sops);
4356 4356
4357 return ret; 4357 return ret;
4358 } 4358 }
4359 4359
4360 static ssize_t 4360 static ssize_t
4361 ftrace_pid_write(struct file *filp, const char __user *ubuf, 4361 ftrace_pid_write(struct file *filp, const char __user *ubuf,
4362 size_t cnt, loff_t *ppos) 4362 size_t cnt, loff_t *ppos)
4363 { 4363 {
4364 char buf[64], *tmp; 4364 char buf[64], *tmp;
4365 long val; 4365 long val;
4366 int ret; 4366 int ret;
4367 4367
4368 if (cnt >= sizeof(buf)) 4368 if (cnt >= sizeof(buf))
4369 return -EINVAL; 4369 return -EINVAL;
4370 4370
4371 if (copy_from_user(&buf, ubuf, cnt)) 4371 if (copy_from_user(&buf, ubuf, cnt))
4372 return -EFAULT; 4372 return -EFAULT;
4373 4373
4374 buf[cnt] = 0; 4374 buf[cnt] = 0;
4375 4375
4376 /* 4376 /*
4377 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" 4377 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
4378 * to clear the filter quietly. 4378 * to clear the filter quietly.
4379 */ 4379 */
4380 tmp = strstrip(buf); 4380 tmp = strstrip(buf);
4381 if (strlen(tmp) == 0) 4381 if (strlen(tmp) == 0)
4382 return 1; 4382 return 1;
4383 4383
4384 ret = kstrtol(tmp, 10, &val); 4384 ret = kstrtol(tmp, 10, &val);
4385 if (ret < 0) 4385 if (ret < 0)
4386 return ret; 4386 return ret;
4387 4387
4388 ret = ftrace_pid_add(val); 4388 ret = ftrace_pid_add(val);
4389 4389
4390 return ret ? ret : cnt; 4390 return ret ? ret : cnt;
4391 } 4391 }
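ftrace_pid_write() is the usual small-debugfs-write pattern: bound the count, copy in, NUL-terminate, strip whitespace (an empty write clears the filter quietly), then parse. A userspace approximation; kstrtol(), unlike bare strtol(), rejects trailing junk, so the sketch checks the end pointer (parse_pid is an invented helper):

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int parse_pid(const char *ubuf, size_t cnt, long *val)
    {
            char buf[64], *tmp, *end;

            if (cnt >= sizeof(buf))
                    return -1;
            memcpy(buf, ubuf, cnt);
            buf[cnt] = 0;

            tmp = buf;                              /* strstrip() by hand */
            while (isspace((unsigned char)*tmp))
                    tmp++;
            end = tmp + strlen(tmp);
            while (end > tmp && isspace((unsigned char)end[-1]))
                    *--end = 0;
            if (!*tmp)
                    return 1;                       /* empty: clear filter */

            *val = strtol(tmp, &end, 10);
            return *end ? -1 : 0;                   /* kstrtol rejects junk */
    }

    int main(void)
    {
            long pid;

            if (parse_pid(" 1234\n", 6, &pid) == 0)
                    printf("pid %ld\n", pid);
            return 0;
    }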
4392 4392
4393 static int 4393 static int
4394 ftrace_pid_release(struct inode *inode, struct file *file) 4394 ftrace_pid_release(struct inode *inode, struct file *file)
4395 { 4395 {
4396 if (file->f_mode & FMODE_READ) 4396 if (file->f_mode & FMODE_READ)
4397 seq_release(inode, file); 4397 seq_release(inode, file);
4398 4398
4399 return 0; 4399 return 0;
4400 } 4400 }
4401 4401
4402 static const struct file_operations ftrace_pid_fops = { 4402 static const struct file_operations ftrace_pid_fops = {
4403 .open = ftrace_pid_open, 4403 .open = ftrace_pid_open,
4404 .write = ftrace_pid_write, 4404 .write = ftrace_pid_write,
4405 .read = seq_read, 4405 .read = seq_read,
4406 .llseek = seq_lseek, 4406 .llseek = seq_lseek,
4407 .release = ftrace_pid_release, 4407 .release = ftrace_pid_release,
4408 }; 4408 };
4409 4409
4410 static __init int ftrace_init_debugfs(void) 4410 static __init int ftrace_init_debugfs(void)
4411 { 4411 {
4412 struct dentry *d_tracer; 4412 struct dentry *d_tracer;
4413 4413
4414 d_tracer = tracing_init_dentry(); 4414 d_tracer = tracing_init_dentry();
4415 if (!d_tracer) 4415 if (!d_tracer)
4416 return 0; 4416 return 0;
4417 4417
4418 ftrace_init_dyn_debugfs(d_tracer); 4418 ftrace_init_dyn_debugfs(d_tracer);
4419 4419
4420 trace_create_file("set_ftrace_pid", 0644, d_tracer, 4420 trace_create_file("set_ftrace_pid", 0644, d_tracer,
4421 NULL, &ftrace_pid_fops); 4421 NULL, &ftrace_pid_fops);
4422 4422
4423 ftrace_profile_debugfs(d_tracer); 4423 ftrace_profile_debugfs(d_tracer);
4424 4424
4425 return 0; 4425 return 0;
4426 } 4426 }
4427 fs_initcall(ftrace_init_debugfs); 4427 fs_initcall(ftrace_init_debugfs);
4428 4428
4429 /** 4429 /**
4430 * ftrace_kill - kill ftrace 4430 * ftrace_kill - kill ftrace
4431 * 4431 *
4432 * This function should be used by panic code. It stops ftrace 4432 * This function should be used by panic code. It stops ftrace
4433 * but in a not-so-nice way. If you simply need to kill ftrace 4433 * but in a not-so-nice way. If you simply need to kill ftrace
4434 * from a non-atomic section, this is the function to use. 4434 * from a non-atomic section, this is the function to use.
4435 */ 4435 */
4436 void ftrace_kill(void) 4436 void ftrace_kill(void)
4437 { 4437 {
4438 ftrace_disabled = 1; 4438 ftrace_disabled = 1;
4439 ftrace_enabled = 0; 4439 ftrace_enabled = 0;
4440 clear_ftrace_function(); 4440 clear_ftrace_function();
4441 } 4441 }
4442 4442
4443 /** 4443 /**
4444 * ftrace_is_dead - Test if ftrace is dead or not. 4444 * ftrace_is_dead - Test if ftrace is dead or not.
4445 */ 4445 */
4446 int ftrace_is_dead(void) 4446 int ftrace_is_dead(void)
4447 { 4447 {
4448 return ftrace_disabled; 4448 return ftrace_disabled;
4449 } 4449 }
4450 4450
4451 /** 4451 /**
4452 * register_ftrace_function - register a function for profiling 4452 * register_ftrace_function - register a function for profiling
4453 * @ops: ops structure that holds the function for profiling. 4453 * @ops: ops structure that holds the function for profiling.
4454 * 4454 *
4455 * Register a function to be called by all functions in the 4455 * Register a function to be called by all functions in the
4456 * kernel. 4456 * kernel.
4457 * 4457 *
4458 * Note: @ops->func and all the functions it calls must be labeled 4458 * Note: @ops->func and all the functions it calls must be labeled
4459 * with "notrace", otherwise it will go into a 4459 * with "notrace", otherwise it will go into a
4460 * recursive loop. 4460 * recursive loop.
4461 */ 4461 */
4462 int register_ftrace_function(struct ftrace_ops *ops) 4462 int register_ftrace_function(struct ftrace_ops *ops)
4463 { 4463 {
4464 int ret = -1; 4464 int ret = -1;
4465 4465
4466 mutex_lock(&ftrace_lock); 4466 mutex_lock(&ftrace_lock);
4467 4467
4468 ret = __register_ftrace_function(ops); 4468 ret = __register_ftrace_function(ops);
4469 if (!ret) 4469 if (!ret)
4470 ret = ftrace_startup(ops, 0); 4470 ret = ftrace_startup(ops, 0);
4471 4471
4472 mutex_unlock(&ftrace_lock); 4472 mutex_unlock(&ftrace_lock);
4473 4473
4474 return ret; 4474 return ret;
4475 } 4475 }
4476 EXPORT_SYMBOL_GPL(register_ftrace_function); 4476 EXPORT_SYMBOL_GPL(register_ftrace_function);
4477 4477
4478 /** 4478 /**
4479 * unregister_ftrace_function - unregister a function for profiling. 4479 * unregister_ftrace_function - unregister a function for profiling.
4480 * @ops: ops structure that holds the function to unregister 4480 * @ops: ops structure that holds the function to unregister
4481 * 4481 *
4482 * Unregister a function that was added to be called by ftrace profiling. 4482 * Unregister a function that was added to be called by ftrace profiling.
4483 */ 4483 */
4484 int unregister_ftrace_function(struct ftrace_ops *ops) 4484 int unregister_ftrace_function(struct ftrace_ops *ops)
4485 { 4485 {
4486 int ret; 4486 int ret;
4487 4487
4488 mutex_lock(&ftrace_lock); 4488 mutex_lock(&ftrace_lock);
4489 ret = __unregister_ftrace_function(ops); 4489 ret = __unregister_ftrace_function(ops);
4490 if (!ret) 4490 if (!ret)
4491 ftrace_shutdown(ops, 0); 4491 ftrace_shutdown(ops, 0);
4492 mutex_unlock(&ftrace_lock); 4492 mutex_unlock(&ftrace_lock);
4493 4493
4494 return ret; 4494 return ret;
4495 } 4495 }
4496 EXPORT_SYMBOL_GPL(unregister_ftrace_function); 4496 EXPORT_SYMBOL_GPL(unregister_ftrace_function);
4497 4497
4498 int 4498 int
4499 ftrace_enable_sysctl(struct ctl_table *table, int write, 4499 ftrace_enable_sysctl(struct ctl_table *table, int write,
4500 void __user *buffer, size_t *lenp, 4500 void __user *buffer, size_t *lenp,
4501 loff_t *ppos) 4501 loff_t *ppos)
4502 { 4502 {
4503 int ret = -ENODEV; 4503 int ret = -ENODEV;
4504 4504
4505 mutex_lock(&ftrace_lock); 4505 mutex_lock(&ftrace_lock);
4506 4506
4507 if (unlikely(ftrace_disabled)) 4507 if (unlikely(ftrace_disabled))
4508 goto out; 4508 goto out;
4509 4509
4510 ret = proc_dointvec(table, write, buffer, lenp, ppos); 4510 ret = proc_dointvec(table, write, buffer, lenp, ppos);
4511 4511
4512 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 4512 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
4513 goto out; 4513 goto out;
4514 4514
4515 last_ftrace_enabled = !!ftrace_enabled; 4515 last_ftrace_enabled = !!ftrace_enabled;
4516 4516
4517 if (ftrace_enabled) { 4517 if (ftrace_enabled) {
4518 4518
4519 ftrace_startup_sysctl(); 4519 ftrace_startup_sysctl();
4520 4520
4521 /* we are starting ftrace again */ 4521 /* we are starting ftrace again */
4522 if (ftrace_ops_list != &ftrace_list_end) { 4522 if (ftrace_ops_list != &ftrace_list_end) {
4523 if (ftrace_ops_list->next == &ftrace_list_end) 4523 if (ftrace_ops_list->next == &ftrace_list_end)
4524 ftrace_trace_function = ftrace_ops_list->func; 4524 ftrace_trace_function = ftrace_ops_list->func;
4525 else 4525 else
4526 ftrace_trace_function = ftrace_ops_list_func; 4526 ftrace_trace_function = ftrace_ops_list_func;
4527 } 4527 }
4528 4528
4529 } else { 4529 } else {
4530 /* stopping ftrace calls (just send to ftrace_stub) */ 4530 /* stopping ftrace calls (just send to ftrace_stub) */
4531 ftrace_trace_function = ftrace_stub; 4531 ftrace_trace_function = ftrace_stub;
4532 4532
4533 ftrace_shutdown_sysctl(); 4533 ftrace_shutdown_sysctl();
4534 } 4534 }
4535 4535
4536 out: 4536 out:
4537 mutex_unlock(&ftrace_lock); 4537 mutex_unlock(&ftrace_lock);
4538 return ret; 4538 return ret;
4539 } 4539 }
4540 4540
4541 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 4541 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
4542 4542
4543 static int ftrace_graph_active; 4543 static int ftrace_graph_active;
4544 static struct notifier_block ftrace_suspend_notifier; 4544 static struct notifier_block ftrace_suspend_notifier;
4545 4545
4546 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 4546 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4547 { 4547 {
4548 return 0; 4548 return 0;
4549 } 4549 }
4550 4550
4551 /* The callbacks that hook a function */ 4551 /* The callbacks that hook a function */
4552 trace_func_graph_ret_t ftrace_graph_return = 4552 trace_func_graph_ret_t ftrace_graph_return =
4553 (trace_func_graph_ret_t)ftrace_stub; 4553 (trace_func_graph_ret_t)ftrace_stub;
4554 trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; 4554 trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
4555 4555
4556 /* Try to assign a return stack to each of FTRACE_RETSTACK_ALLOC_SIZE tasks. */ 4556 /* Try to assign a return stack to each of FTRACE_RETSTACK_ALLOC_SIZE tasks. */
4557 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) 4557 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
4558 { 4558 {
4559 int i; 4559 int i;
4560 int ret = 0; 4560 int ret = 0;
4561 unsigned long flags; 4561 unsigned long flags;
4562 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; 4562 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
4563 struct task_struct *g, *t; 4563 struct task_struct *g, *t;
4564 4564
4565 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { 4565 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
4566 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH 4566 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
4567 * sizeof(struct ftrace_ret_stack), 4567 * sizeof(struct ftrace_ret_stack),
4568 GFP_KERNEL); 4568 GFP_KERNEL);
4569 if (!ret_stack_list[i]) { 4569 if (!ret_stack_list[i]) {
4570 start = 0; 4570 start = 0;
4571 end = i; 4571 end = i;
4572 ret = -ENOMEM; 4572 ret = -ENOMEM;
4573 goto free; 4573 goto free;
4574 } 4574 }
4575 } 4575 }
4576 4576
4577 read_lock_irqsave(&tasklist_lock, flags); 4577 read_lock_irqsave(&tasklist_lock, flags);
4578 do_each_thread(g, t) { 4578 do_each_thread(g, t) {
4579 if (start == end) { 4579 if (start == end) {
4580 ret = -EAGAIN; 4580 ret = -EAGAIN;
4581 goto unlock; 4581 goto unlock;
4582 } 4582 }
4583 4583
4584 if (t->ret_stack == NULL) { 4584 if (t->ret_stack == NULL) {
4585 atomic_set(&t->tracing_graph_pause, 0); 4585 atomic_set(&t->tracing_graph_pause, 0);
4586 atomic_set(&t->trace_overrun, 0); 4586 atomic_set(&t->trace_overrun, 0);
4587 t->curr_ret_stack = -1; 4587 t->curr_ret_stack = -1;
4588 /* Make sure the tasks see the -1 first: */ 4588 /* Make sure the tasks see the -1 first: */
4589 smp_wmb(); 4589 smp_wmb();
4590 t->ret_stack = ret_stack_list[start++]; 4590 t->ret_stack = ret_stack_list[start++];
4591 } 4591 }
4592 } while_each_thread(g, t); 4592 } while_each_thread(g, t);
4593 4593
4594 unlock: 4594 unlock:
4595 read_unlock_irqrestore(&tasklist_lock, flags); 4595 read_unlock_irqrestore(&tasklist_lock, flags);
4596 free: 4596 free:
4597 for (i = start; i < end; i++) 4597 for (i = start; i < end; i++)
4598 kfree(ret_stack_list[i]); 4598 kfree(ret_stack_list[i]);
4599 return ret; 4599 return ret;
4600 } 4600 }
4601 4601
4602 static void 4602 static void
4603 ftrace_graph_probe_sched_switch(void *ignore, 4603 ftrace_graph_probe_sched_switch(void *ignore,
4604 struct task_struct *prev, struct task_struct *next) 4604 struct task_struct *prev, struct task_struct *next)
4605 { 4605 {
4606 unsigned long long timestamp; 4606 unsigned long long timestamp;
4607 int index; 4607 int index;
4608 4608
4609 /* 4609 /*
4610 * Does the user want to count the time a function was asleep? 4610 * Does the user want to count the time a function was asleep?
4611 * If so, do not update the time stamps. 4611 * If so, do not update the time stamps.
4612 */ 4612 */
4613 if (trace_flags & TRACE_ITER_SLEEP_TIME) 4613 if (trace_flags & TRACE_ITER_SLEEP_TIME)
4614 return; 4614 return;
4615 4615
4616 timestamp = trace_clock_local(); 4616 timestamp = trace_clock_local();
4617 4617
4618 prev->ftrace_timestamp = timestamp; 4618 prev->ftrace_timestamp = timestamp;
4619 4619
4620 /* only process tasks that we timestamped */ 4620 /* only process tasks that we timestamped */
4621 if (!next->ftrace_timestamp) 4621 if (!next->ftrace_timestamp)
4622 return; 4622 return;
4623 4623
4624 /* 4624 /*
4625 * Update all the counters in next to make up for the 4625 * Update all the counters in next to make up for the
4626 * time next was sleeping. 4626 * time next was sleeping.
4627 */ 4627 */
4628 timestamp -= next->ftrace_timestamp; 4628 timestamp -= next->ftrace_timestamp;
4629 4629
4630 for (index = next->curr_ret_stack; index >= 0; index--) 4630 for (index = next->curr_ret_stack; index >= 0; index--)
4631 next->ret_stack[index].calltime += timestamp; 4631 next->ret_stack[index].calltime += timestamp;
4632 } 4632 }
4633 4633
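The adjustment at the bottom of ftrace_graph_probe_sched_switch() is easiest to see with numbers: adding the slept interval to calltime pushes the recorded start later, so the eventual rettime - calltime reports on-CPU time only. A tiny sketch with made-up timestamps:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long calltime = 100;  /* function entered           */
            unsigned long long slept = 30;      /* switched out 120, back 150 */
            unsigned long long rettime = 160;   /* function returned          */

            printf("wall-clock: %llu\n", rettime - calltime);   /* 60 */
            calltime += slept;  /* ret_stack[index].calltime += timestamp */
            printf("on-CPU:     %llu\n", rettime - calltime);   /* 30 */
            return 0;
    }
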
4634 /* Allocate a return stack for each task */ 4634 /* Allocate a return stack for each task */
4635 static int start_graph_tracing(void) 4635 static int start_graph_tracing(void)
4636 { 4636 {
4637 struct ftrace_ret_stack **ret_stack_list; 4637 struct ftrace_ret_stack **ret_stack_list;
4638 int ret, cpu; 4638 int ret, cpu;
4639 4639
4640 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * 4640 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
4641 sizeof(struct ftrace_ret_stack *), 4641 sizeof(struct ftrace_ret_stack *),
4642 GFP_KERNEL); 4642 GFP_KERNEL);
4643 4643
4644 if (!ret_stack_list) 4644 if (!ret_stack_list)
4645 return -ENOMEM; 4645 return -ENOMEM;
4646 4646
4647 /* The boot CPU init_task->ret_stack will never be freed */ 4647 /* The boot CPU init_task->ret_stack will never be freed */
4648 for_each_online_cpu(cpu) { 4648 for_each_online_cpu(cpu) {
4649 if (!idle_task(cpu)->ret_stack) 4649 if (!idle_task(cpu)->ret_stack)
4650 ftrace_graph_init_idle_task(idle_task(cpu), cpu); 4650 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
4651 } 4651 }
4652 4652
4653 do { 4653 do {
4654 ret = alloc_retstack_tasklist(ret_stack_list); 4654 ret = alloc_retstack_tasklist(ret_stack_list);
4655 } while (ret == -EAGAIN); 4655 } while (ret == -EAGAIN);
4656 4656
4657 if (!ret) { 4657 if (!ret) {
4658 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4658 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
4659 if (ret) 4659 if (ret)
4660 pr_info("ftrace_graph: Couldn't activate tracepoint" 4660 pr_info("ftrace_graph: Couldn't activate tracepoint"
4661 " probe to kernel_sched_switch\n"); 4661 " probe to kernel_sched_switch\n");
4662 } 4662 }
4663 4663
4664 kfree(ret_stack_list); 4664 kfree(ret_stack_list);
4665 return ret; 4665 return ret;
4666 } 4666 }
4667 4667
4668 /* 4668 /*
4669 * Hibernation protection. 4669 * Hibernation protection.
4670 * The state of the current task is too unstable during 4670 * The state of the current task is too unstable during
4671 * suspend/restore to disk. We want to protect against that. 4671 * suspend/restore to disk. We want to protect against that.
4672 */ 4672 */
4673 static int 4673 static int
4674 ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, 4674 ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
4675 void *unused) 4675 void *unused)
4676 { 4676 {
4677 switch (state) { 4677 switch (state) {
4678 case PM_HIBERNATION_PREPARE: 4678 case PM_HIBERNATION_PREPARE:
4679 pause_graph_tracing(); 4679 pause_graph_tracing();
4680 break; 4680 break;
4681 4681
4682 case PM_POST_HIBERNATION: 4682 case PM_POST_HIBERNATION:
4683 unpause_graph_tracing(); 4683 unpause_graph_tracing();
4684 break; 4684 break;
4685 } 4685 }
4686 return NOTIFY_DONE; 4686 return NOTIFY_DONE;
4687 } 4687 }
4688 4688
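Stripped of the PM machinery, the notifier above is just a callback switched on an event code that brackets the unsafe window. A stand-alone model of the dispatch; constants and state here are illustrative:

    #include <stdio.h>

    #define NOTIFY_DONE 0
    enum { PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION };

    static int tracing_paused;

    static int suspend_call(unsigned long state)
    {
            switch (state) {
            case PM_HIBERNATION_PREPARE:
                    tracing_paused = 1;     /* pause_graph_tracing()   */
                    break;
            case PM_POST_HIBERNATION:
                    tracing_paused = 0;     /* unpause_graph_tracing() */
                    break;
            }
            return NOTIFY_DONE;
    }

    int main(void)
    {
            suspend_call(PM_HIBERNATION_PREPARE);
            printf("paused=%d\n", tracing_paused);  /* 1 */
            suspend_call(PM_POST_HIBERNATION);
            printf("paused=%d\n", tracing_paused);  /* 0 */
            return 0;
    }
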
4689 int register_ftrace_graph(trace_func_graph_ret_t retfunc, 4689 int register_ftrace_graph(trace_func_graph_ret_t retfunc,
4690 trace_func_graph_ent_t entryfunc) 4690 trace_func_graph_ent_t entryfunc)
4691 { 4691 {
4692 int ret = 0; 4692 int ret = 0;
4693 4693
4694 mutex_lock(&ftrace_lock); 4694 mutex_lock(&ftrace_lock);
4695 4695
4696 /* we currently allow only one tracer registered at a time */ 4696 /* we currently allow only one tracer registered at a time */
4697 if (ftrace_graph_active) { 4697 if (ftrace_graph_active) {
4698 ret = -EBUSY; 4698 ret = -EBUSY;
4699 goto out; 4699 goto out;
4700 } 4700 }
4701 4701
4702 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; 4702 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
4703 register_pm_notifier(&ftrace_suspend_notifier); 4703 register_pm_notifier(&ftrace_suspend_notifier);
4704 4704
4705 ftrace_graph_active++; 4705 ftrace_graph_active++;
4706 ret = start_graph_tracing(); 4706 ret = start_graph_tracing();
4707 if (ret) { 4707 if (ret) {
4708 ftrace_graph_active--; 4708 ftrace_graph_active--;
4709 goto out; 4709 goto out;
4710 } 4710 }
4711 4711
4712 ftrace_graph_return = retfunc; 4712 ftrace_graph_return = retfunc;
4713 ftrace_graph_entry = entryfunc; 4713 ftrace_graph_entry = entryfunc;
4714 4714
4715 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 4715 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
4716 4716
4717 out: 4717 out:
4718 mutex_unlock(&ftrace_lock); 4718 mutex_unlock(&ftrace_lock);
4719 return ret; 4719 return ret;
4720 } 4720 }
4721 4721
4722 void unregister_ftrace_graph(void) 4722 void unregister_ftrace_graph(void)
4723 { 4723 {
4724 mutex_lock(&ftrace_lock); 4724 mutex_lock(&ftrace_lock);
4725 4725
4726 if (unlikely(!ftrace_graph_active)) 4726 if (unlikely(!ftrace_graph_active))
4727 goto out; 4727 goto out;
4728 4728
4729 ftrace_graph_active--; 4729 ftrace_graph_active--;
4730 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4730 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
4731 ftrace_graph_entry = ftrace_graph_entry_stub; 4731 ftrace_graph_entry = ftrace_graph_entry_stub;
4732 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 4732 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
4733 unregister_pm_notifier(&ftrace_suspend_notifier); 4733 unregister_pm_notifier(&ftrace_suspend_notifier);
4734 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4734 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
4735 4735
4736 out: 4736 out:
4737 mutex_unlock(&ftrace_lock); 4737 mutex_unlock(&ftrace_lock);
4738 } 4738 }
4739 4739
4740 static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); 4740 static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
4741 4741
4742 static void 4742 static void
4743 graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) 4743 graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
4744 { 4744 {
4745 atomic_set(&t->tracing_graph_pause, 0); 4745 atomic_set(&t->tracing_graph_pause, 0);
4746 atomic_set(&t->trace_overrun, 0); 4746 atomic_set(&t->trace_overrun, 0);
4747 t->ftrace_timestamp = 0; 4747 t->ftrace_timestamp = 0;
4748 /* make curr_ret_stack visible before we add the ret_stack */ 4748 /* make curr_ret_stack visible before we add the ret_stack */
4749 smp_wmb(); 4749 smp_wmb();
4750 t->ret_stack = ret_stack; 4750 t->ret_stack = ret_stack;
4751 } 4751 }
4752 4752
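graph_init_task() and the assignment loop in alloc_retstack_tasklist() both follow the initialize-then-publish idiom: every field is written before the barrier, and only then does ret_stack become non-NULL. A user-space sketch of the same guarantee, with C11 release/acquire standing in for smp_wmb() and the matching read barrier:

    #include <stdatomic.h>
    #include <stddef.h>

    struct task {
            int curr_ret_stack;
            _Atomic(void *) ret_stack;
    };

    static void publish(struct task *t, void *stack)
    {
            t->curr_ret_stack = -1;         /* initialize first...    */
            atomic_store_explicit(&t->ret_stack, stack,
                                  memory_order_release);  /* ...then publish */
    }

    static int reader(const struct task *t)
    {
            void *s = atomic_load_explicit(&t->ret_stack,
                                           memory_order_acquire);

            if (!s)
                    return 0;
            /* A reader that saw s != NULL also sees curr_ret_stack == -1. */
            return t->curr_ret_stack;
    }
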
4753 /* 4753 /*
4754 * Allocate a return stack for the idle task. May be the first 4754 * Allocate a return stack for the idle task. May be the first
4755 * time through, or it may be done when a CPU comes online via hotplug. 4755 * time through, or it may be done when a CPU comes online via hotplug.
4756 */ 4756 */
4757 void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) 4757 void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
4758 { 4758 {
4759 t->curr_ret_stack = -1; 4759 t->curr_ret_stack = -1;
4760 /* 4760 /*
4761 * The idle task has no parent, it either has its own 4761 * The idle task has no parent, it either has its own
4762 * stack or no stack at all. 4762 * stack or no stack at all.
4763 */ 4763 */
4764 if (t->ret_stack) 4764 if (t->ret_stack)
4765 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); 4765 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
4766 4766
4767 if (ftrace_graph_active) { 4767 if (ftrace_graph_active) {
4768 struct ftrace_ret_stack *ret_stack; 4768 struct ftrace_ret_stack *ret_stack;
4769 4769
4770 ret_stack = per_cpu(idle_ret_stack, cpu); 4770 ret_stack = per_cpu(idle_ret_stack, cpu);
4771 if (!ret_stack) { 4771 if (!ret_stack) {
4772 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 4772 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
4773 * sizeof(struct ftrace_ret_stack), 4773 * sizeof(struct ftrace_ret_stack),
4774 GFP_KERNEL); 4774 GFP_KERNEL);
4775 if (!ret_stack) 4775 if (!ret_stack)
4776 return; 4776 return;
4777 per_cpu(idle_ret_stack, cpu) = ret_stack; 4777 per_cpu(idle_ret_stack, cpu) = ret_stack;
4778 } 4778 }
4779 graph_init_task(t, ret_stack); 4779 graph_init_task(t, ret_stack);
4780 } 4780 }
4781 } 4781 }
4782 4782
4783 /* Allocate a return stack for newly created task */ 4783 /* Allocate a return stack for newly created task */
4784 void ftrace_graph_init_task(struct task_struct *t) 4784 void ftrace_graph_init_task(struct task_struct *t)
4785 { 4785 {
4786 /* Make sure we do not use the parent ret_stack */ 4786 /* Make sure we do not use the parent ret_stack */
4787 t->ret_stack = NULL; 4787 t->ret_stack = NULL;
4788 t->curr_ret_stack = -1; 4788 t->curr_ret_stack = -1;
4789 4789
4790 if (ftrace_graph_active) { 4790 if (ftrace_graph_active) {
4791 struct ftrace_ret_stack *ret_stack; 4791 struct ftrace_ret_stack *ret_stack;
4792 4792
4793 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 4793 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
4794 * sizeof(struct ftrace_ret_stack), 4794 * sizeof(struct ftrace_ret_stack),
4795 GFP_KERNEL); 4795 GFP_KERNEL);
4796 if (!ret_stack) 4796 if (!ret_stack)
4797 return; 4797 return;
4798 graph_init_task(t, ret_stack); 4798 graph_init_task(t, ret_stack);
4799 } 4799 }
4800 } 4800 }
4801 4801
4802 void ftrace_graph_exit_task(struct task_struct *t) 4802 void ftrace_graph_exit_task(struct task_struct *t)
4803 { 4803 {
4804 struct ftrace_ret_stack *ret_stack = t->ret_stack; 4804 struct ftrace_ret_stack *ret_stack = t->ret_stack;
4805 4805
4806 t->ret_stack = NULL; 4806 t->ret_stack = NULL;
4807 /* NULL must become visible to IRQs before we free it: */ 4807 /* NULL must become visible to IRQs before we free it: */
4808 barrier(); 4808 barrier();
4809 4809
4810 kfree(ret_stack); 4810 kfree(ret_stack);
4811 } 4811 }
4812 4812
4813 void ftrace_graph_stop(void) 4813 void ftrace_graph_stop(void)
4814 { 4814 {
4815 ftrace_stop(); 4815 ftrace_stop();
4816 } 4816 }
4817 #endif 4817 #endif
4818 4818
1 /* 1 /*
2 * Resizable virtual memory filesystem for Linux. 2 * Resizable virtual memory filesystem for Linux.
3 * 3 *
4 * Copyright (C) 2000 Linus Torvalds. 4 * Copyright (C) 2000 Linus Torvalds.
5 * 2000 Transmeta Corp. 5 * 2000 Transmeta Corp.
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2011 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc. 10 * Copyright (C) 2011 Google Inc.
11 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
12 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
13 * 13 *
14 * Extended attribute support for tmpfs: 14 * Extended attribute support for tmpfs:
15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17 * 17 *
18 * tiny-shmem: 18 * tiny-shmem:
19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> 19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20 * 20 *
21 * This file is released under the GPL. 21 * This file is released under the GPL.
22 */ 22 */
23 23
24 #include <linux/fs.h> 24 #include <linux/fs.h>
25 #include <linux/init.h> 25 #include <linux/init.h>
26 #include <linux/vfs.h> 26 #include <linux/vfs.h>
27 #include <linux/mount.h> 27 #include <linux/mount.h>
28 #include <linux/pagemap.h> 28 #include <linux/pagemap.h>
29 #include <linux/file.h> 29 #include <linux/file.h>
30 #include <linux/mm.h> 30 #include <linux/mm.h>
31 #include <linux/export.h> 31 #include <linux/export.h>
32 #include <linux/swap.h> 32 #include <linux/swap.h>
33 33
34 static struct vfsmount *shm_mnt; 34 static struct vfsmount *shm_mnt;
35 35
36 #ifdef CONFIG_SHMEM 36 #ifdef CONFIG_SHMEM
37 /* 37 /*
38 * This virtual memory filesystem is heavily based on the ramfs. It 38 * This virtual memory filesystem is heavily based on the ramfs. It
39 * extends ramfs by the ability to use swap and honor resource limits 39 * extends ramfs by the ability to use swap and honor resource limits
40 * which makes it a completely usable filesystem. 40 * which makes it a completely usable filesystem.
41 */ 41 */
42 42
43 #include <linux/xattr.h> 43 #include <linux/xattr.h>
44 #include <linux/exportfs.h> 44 #include <linux/exportfs.h>
45 #include <linux/posix_acl.h> 45 #include <linux/posix_acl.h>
46 #include <linux/generic_acl.h> 46 #include <linux/generic_acl.h>
47 #include <linux/mman.h> 47 #include <linux/mman.h>
48 #include <linux/string.h> 48 #include <linux/string.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 #include <linux/backing-dev.h> 50 #include <linux/backing-dev.h>
51 #include <linux/shmem_fs.h> 51 #include <linux/shmem_fs.h>
52 #include <linux/writeback.h> 52 #include <linux/writeback.h>
53 #include <linux/blkdev.h> 53 #include <linux/blkdev.h>
54 #include <linux/pagevec.h> 54 #include <linux/pagevec.h>
55 #include <linux/percpu_counter.h> 55 #include <linux/percpu_counter.h>
56 #include <linux/falloc.h> 56 #include <linux/falloc.h>
57 #include <linux/splice.h> 57 #include <linux/splice.h>
58 #include <linux/security.h> 58 #include <linux/security.h>
59 #include <linux/swapops.h> 59 #include <linux/swapops.h>
60 #include <linux/mempolicy.h> 60 #include <linux/mempolicy.h>
61 #include <linux/namei.h> 61 #include <linux/namei.h>
62 #include <linux/ctype.h> 62 #include <linux/ctype.h>
63 #include <linux/migrate.h> 63 #include <linux/migrate.h>
64 #include <linux/highmem.h> 64 #include <linux/highmem.h>
65 #include <linux/seq_file.h> 65 #include <linux/seq_file.h>
66 #include <linux/magic.h> 66 #include <linux/magic.h>
67 67
68 #include <asm/uaccess.h> 68 #include <asm/uaccess.h>
69 #include <asm/pgtable.h> 69 #include <asm/pgtable.h>
70 70
71 #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 71 #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
72 #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 72 #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
73 73
74 /* Pretend that each entry is of this size in directory's i_size */ 74 /* Pretend that each entry is of this size in directory's i_size */
75 #define BOGO_DIRENT_SIZE 20 75 #define BOGO_DIRENT_SIZE 20
76 76
77 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ 77 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
78 #define SHORT_SYMLINK_LEN 128 78 #define SHORT_SYMLINK_LEN 128
79 79
80 /* 80 /*
81 * shmem_fallocate and shmem_writepage communicate via inode->i_private 81 * shmem_fallocate and shmem_writepage communicate via inode->i_private
82 * (with i_mutex making sure that it has only one user at a time): 82 * (with i_mutex making sure that it has only one user at a time):
83 * we would prefer not to enlarge the shmem inode just for that. 83 * we would prefer not to enlarge the shmem inode just for that.
84 */ 84 */
85 struct shmem_falloc { 85 struct shmem_falloc {
86 pgoff_t start; /* start of range currently being fallocated */ 86 pgoff_t start; /* start of range currently being fallocated */
87 pgoff_t next; /* the next page offset to be fallocated */ 87 pgoff_t next; /* the next page offset to be fallocated */
88 pgoff_t nr_falloced; /* how many new pages have been fallocated */ 88 pgoff_t nr_falloced; /* how many new pages have been fallocated */
89 pgoff_t nr_unswapped; /* how often writepage refused to swap out */ 89 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
90 }; 90 };
91 91
92 /* Flag allocation requirements to shmem_getpage */ 92 /* Flag allocation requirements to shmem_getpage */
93 enum sgp_type { 93 enum sgp_type {
94 SGP_READ, /* don't exceed i_size, don't allocate page */ 94 SGP_READ, /* don't exceed i_size, don't allocate page */
95 SGP_CACHE, /* don't exceed i_size, may allocate page */ 95 SGP_CACHE, /* don't exceed i_size, may allocate page */
96 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 96 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
97 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ 97 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
98 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ 98 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
99 }; 99 };
100 100
101 #ifdef CONFIG_TMPFS 101 #ifdef CONFIG_TMPFS
102 static unsigned long shmem_default_max_blocks(void) 102 static unsigned long shmem_default_max_blocks(void)
103 { 103 {
104 return totalram_pages / 2; 104 return totalram_pages / 2;
105 } 105 }
106 106
107 static unsigned long shmem_default_max_inodes(void) 107 static unsigned long shmem_default_max_inodes(void)
108 { 108 {
109 return min(totalram_pages - totalhigh_pages, totalram_pages / 2); 109 return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
110 } 110 }
111 #endif 111 #endif
112 112
113 static bool shmem_should_replace_page(struct page *page, gfp_t gfp); 113 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
114 static int shmem_replace_page(struct page **pagep, gfp_t gfp, 114 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
115 struct shmem_inode_info *info, pgoff_t index); 115 struct shmem_inode_info *info, pgoff_t index);
116 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 116 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
117 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 117 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
118 118
119 static inline int shmem_getpage(struct inode *inode, pgoff_t index, 119 static inline int shmem_getpage(struct inode *inode, pgoff_t index,
120 struct page **pagep, enum sgp_type sgp, int *fault_type) 120 struct page **pagep, enum sgp_type sgp, int *fault_type)
121 { 121 {
122 return shmem_getpage_gfp(inode, index, pagep, sgp, 122 return shmem_getpage_gfp(inode, index, pagep, sgp,
123 mapping_gfp_mask(inode->i_mapping), fault_type); 123 mapping_gfp_mask(inode->i_mapping), fault_type);
124 } 124 }
125 125
126 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 126 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
127 { 127 {
128 return sb->s_fs_info; 128 return sb->s_fs_info;
129 } 129 }
130 130
131 /* 131 /*
132 * shmem_file_setup pre-accounts the whole fixed size of a VM object, 132 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
133 * for shared memory and for shared anonymous (/dev/zero) mappings 133 * for shared memory and for shared anonymous (/dev/zero) mappings
134 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), 134 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
135 * consistent with the pre-accounting of private mappings ... 135 * consistent with the pre-accounting of private mappings ...
136 */ 136 */
137 static inline int shmem_acct_size(unsigned long flags, loff_t size) 137 static inline int shmem_acct_size(unsigned long flags, loff_t size)
138 { 138 {
139 return (flags & VM_NORESERVE) ? 139 return (flags & VM_NORESERVE) ?
140 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); 140 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
141 } 141 }
142 142
143 static inline void shmem_unacct_size(unsigned long flags, loff_t size) 143 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
144 { 144 {
145 if (!(flags & VM_NORESERVE)) 145 if (!(flags & VM_NORESERVE))
146 vm_unacct_memory(VM_ACCT(size)); 146 vm_unacct_memory(VM_ACCT(size));
147 } 147 }
148 148
149 /* 149 /*
150 * ... whereas tmpfs objects are accounted incrementally as 150 * ... whereas tmpfs objects are accounted incrementally as
151 * pages are allocated, in order to allow huge sparse files. 151 * pages are allocated, in order to allow huge sparse files.
152 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, 152 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
153 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 153 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
154 */ 154 */
155 static inline int shmem_acct_block(unsigned long flags) 155 static inline int shmem_acct_block(unsigned long flags)
156 { 156 {
157 return (flags & VM_NORESERVE) ? 157 return (flags & VM_NORESERVE) ?
158 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; 158 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
159 } 159 }
160 160
161 static inline void shmem_unacct_blocks(unsigned long flags, long pages) 161 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
162 { 162 {
163 if (flags & VM_NORESERVE) 163 if (flags & VM_NORESERVE)
164 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); 164 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
165 } 165 }
166 166
167 static const struct super_operations shmem_ops; 167 static const struct super_operations shmem_ops;
168 static const struct address_space_operations shmem_aops; 168 static const struct address_space_operations shmem_aops;
169 static const struct file_operations shmem_file_operations; 169 static const struct file_operations shmem_file_operations;
170 static const struct inode_operations shmem_inode_operations; 170 static const struct inode_operations shmem_inode_operations;
171 static const struct inode_operations shmem_dir_inode_operations; 171 static const struct inode_operations shmem_dir_inode_operations;
172 static const struct inode_operations shmem_special_inode_operations; 172 static const struct inode_operations shmem_special_inode_operations;
173 static const struct vm_operations_struct shmem_vm_ops; 173 static const struct vm_operations_struct shmem_vm_ops;
174 174
175 static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 175 static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
176 .ra_pages = 0, /* No readahead */ 176 .ra_pages = 0, /* No readahead */
177 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 177 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
178 }; 178 };
179 179
180 static LIST_HEAD(shmem_swaplist); 180 static LIST_HEAD(shmem_swaplist);
181 static DEFINE_MUTEX(shmem_swaplist_mutex); 181 static DEFINE_MUTEX(shmem_swaplist_mutex);
182 182
183 static int shmem_reserve_inode(struct super_block *sb) 183 static int shmem_reserve_inode(struct super_block *sb)
184 { 184 {
185 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 185 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
186 if (sbinfo->max_inodes) { 186 if (sbinfo->max_inodes) {
187 spin_lock(&sbinfo->stat_lock); 187 spin_lock(&sbinfo->stat_lock);
188 if (!sbinfo->free_inodes) { 188 if (!sbinfo->free_inodes) {
189 spin_unlock(&sbinfo->stat_lock); 189 spin_unlock(&sbinfo->stat_lock);
190 return -ENOSPC; 190 return -ENOSPC;
191 } 191 }
192 sbinfo->free_inodes--; 192 sbinfo->free_inodes--;
193 spin_unlock(&sbinfo->stat_lock); 193 spin_unlock(&sbinfo->stat_lock);
194 } 194 }
195 return 0; 195 return 0;
196 } 196 }
197 197
198 static void shmem_free_inode(struct super_block *sb) 198 static void shmem_free_inode(struct super_block *sb)
199 { 199 {
200 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 200 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
201 if (sbinfo->max_inodes) { 201 if (sbinfo->max_inodes) {
202 spin_lock(&sbinfo->stat_lock); 202 spin_lock(&sbinfo->stat_lock);
203 sbinfo->free_inodes++; 203 sbinfo->free_inodes++;
204 spin_unlock(&sbinfo->stat_lock); 204 spin_unlock(&sbinfo->stat_lock);
205 } 205 }
206 } 206 }
207 207
208 /** 208 /**
209 * shmem_recalc_inode - recalculate the block usage of an inode 209 * shmem_recalc_inode - recalculate the block usage of an inode
210 * @inode: inode to recalc 210 * @inode: inode to recalc
211 * 211 *
212 * We have to calculate the free blocks since the mm can drop 212 * We have to calculate the free blocks since the mm can drop
213 * undirtied hole pages behind our back. 213 * undirtied hole pages behind our back.
214 * 214 *
215 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped 215 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
216 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) 216 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
217 * 217 *
218 * It has to be called with the spinlock held. 218 * It has to be called with the spinlock held.
219 */ 219 */
220 static void shmem_recalc_inode(struct inode *inode) 220 static void shmem_recalc_inode(struct inode *inode)
221 { 221 {
222 struct shmem_inode_info *info = SHMEM_I(inode); 222 struct shmem_inode_info *info = SHMEM_I(inode);
223 long freed; 223 long freed;
224 224
225 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 225 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
226 if (freed > 0) { 226 if (freed > 0) {
227 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 227 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
228 if (sbinfo->max_blocks) 228 if (sbinfo->max_blocks)
229 percpu_counter_add(&sbinfo->used_blocks, -freed); 229 percpu_counter_add(&sbinfo->used_blocks, -freed);
230 info->alloced -= freed; 230 info->alloced -= freed;
231 inode->i_blocks -= freed * BLOCKS_PER_PAGE; 231 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
232 shmem_unacct_blocks(info->flags, freed); 232 shmem_unacct_blocks(info->flags, freed);
233 } 233 }
234 } 234 }
235 235
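A worked example of that subtraction, with illustrative numbers: whatever was accounted as allocated but is no longer resident or in swap must be hole pages the mm dropped.

    #include <stdio.h>

    int main(void)
    {
            long alloced = 10;  /* info->alloced             */
            long swapped = 3;   /* info->swapped             */
            long nrpages = 5;   /* inode->i_mapping->nrpages */

            long freed = alloced - swapped - nrpages;
            if (freed > 0) {    /* 2 undirtied hole pages were dropped */
                    alloced -= freed;
                    printf("freed=%ld alloced=%ld\n", freed, alloced); /* 2 8 */
            }
            return 0;
    }
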
236 /* 236 /*
237 * Replace item expected in radix tree by a new item, while holding tree lock. 237 * Replace item expected in radix tree by a new item, while holding tree lock.
238 */ 238 */
239 static int shmem_radix_tree_replace(struct address_space *mapping, 239 static int shmem_radix_tree_replace(struct address_space *mapping,
240 pgoff_t index, void *expected, void *replacement) 240 pgoff_t index, void *expected, void *replacement)
241 { 241 {
242 void **pslot; 242 void **pslot;
243 void *item = NULL; 243 void *item = NULL;
244 244
245 VM_BUG_ON(!expected); 245 VM_BUG_ON(!expected);
246 pslot = radix_tree_lookup_slot(&mapping->page_tree, index); 246 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
247 if (pslot) 247 if (pslot)
248 item = radix_tree_deref_slot_protected(pslot, 248 item = radix_tree_deref_slot_protected(pslot,
249 &mapping->tree_lock); 249 &mapping->tree_lock);
250 if (item != expected) 250 if (item != expected)
251 return -ENOENT; 251 return -ENOENT;
252 if (replacement) 252 if (replacement)
253 radix_tree_replace_slot(pslot, replacement); 253 radix_tree_replace_slot(pslot, replacement);
254 else 254 else
255 radix_tree_delete(&mapping->page_tree, index); 255 radix_tree_delete(&mapping->page_tree, index);
256 return 0; 256 return 0;
257 } 257 }
258 258
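The function above is a compare-and-replace: under the tree lock, the slot must still hold exactly the expected item, otherwise the caller learns it raced and gets -ENOENT. The contract in miniature, with a plain array standing in for the radix tree (illustrative only):

    #include <errno.h>
    #include <stddef.h>

    static void *slots[64];

    static int replace_expected(size_t index, void *expected, void *replacement)
    {
            /* Caller is assumed to hold whatever lock protects slots[]. */
            if (slots[index] != expected)
                    return -ENOENT;         /* somebody changed the slot  */
            slots[index] = replacement;     /* NULL replacement == delete */
            return 0;
    }
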
259 /* 259 /*
260 * Sometimes, before we decide whether to proceed or to fail, we must check 260 * Sometimes, before we decide whether to proceed or to fail, we must check
261 * that an entry was not already brought back from swap by a racing thread. 261 * that an entry was not already brought back from swap by a racing thread.
262 * 262 *
263 * Checking page is not enough: by the time a SwapCache page is locked, it 263 * Checking page is not enough: by the time a SwapCache page is locked, it
264 * might be reused, and again be SwapCache, using the same swap as before. 264 * might be reused, and again be SwapCache, using the same swap as before.
265 */ 265 */
266 static bool shmem_confirm_swap(struct address_space *mapping, 266 static bool shmem_confirm_swap(struct address_space *mapping,
267 pgoff_t index, swp_entry_t swap) 267 pgoff_t index, swp_entry_t swap)
268 { 268 {
269 void *item; 269 void *item;
270 270
271 rcu_read_lock(); 271 rcu_read_lock();
272 item = radix_tree_lookup(&mapping->page_tree, index); 272 item = radix_tree_lookup(&mapping->page_tree, index);
273 rcu_read_unlock(); 273 rcu_read_unlock();
274 return item == swp_to_radix_entry(swap); 274 return item == swp_to_radix_entry(swap);
275 } 275 }
276 276
277 /* 277 /*
278 * Like add_to_page_cache_locked, but error if expected item has gone. 278 * Like add_to_page_cache_locked, but error if expected item has gone.
279 */ 279 */
280 static int shmem_add_to_page_cache(struct page *page, 280 static int shmem_add_to_page_cache(struct page *page,
281 struct address_space *mapping, 281 struct address_space *mapping,
282 pgoff_t index, gfp_t gfp, void *expected) 282 pgoff_t index, gfp_t gfp, void *expected)
283 { 283 {
284 int error; 284 int error;
285 285
286 VM_BUG_ON(!PageLocked(page)); 286 VM_BUG_ON(!PageLocked(page));
287 VM_BUG_ON(!PageSwapBacked(page)); 287 VM_BUG_ON(!PageSwapBacked(page));
288 288
289 page_cache_get(page); 289 page_cache_get(page);
290 page->mapping = mapping; 290 page->mapping = mapping;
291 page->index = index; 291 page->index = index;
292 292
293 spin_lock_irq(&mapping->tree_lock); 293 spin_lock_irq(&mapping->tree_lock);
294 if (!expected) 294 if (!expected)
295 error = radix_tree_insert(&mapping->page_tree, index, page); 295 error = radix_tree_insert(&mapping->page_tree, index, page);
296 else 296 else
297 error = shmem_radix_tree_replace(mapping, index, expected, 297 error = shmem_radix_tree_replace(mapping, index, expected,
298 page); 298 page);
299 if (!error) { 299 if (!error) {
300 mapping->nrpages++; 300 mapping->nrpages++;
301 __inc_zone_page_state(page, NR_FILE_PAGES); 301 __inc_zone_page_state(page, NR_FILE_PAGES);
302 __inc_zone_page_state(page, NR_SHMEM); 302 __inc_zone_page_state(page, NR_SHMEM);
303 spin_unlock_irq(&mapping->tree_lock); 303 spin_unlock_irq(&mapping->tree_lock);
304 } else { 304 } else {
305 page->mapping = NULL; 305 page->mapping = NULL;
306 spin_unlock_irq(&mapping->tree_lock); 306 spin_unlock_irq(&mapping->tree_lock);
307 page_cache_release(page); 307 page_cache_release(page);
308 } 308 }
309 return error; 309 return error;
310 } 310 }
311 311
312 /* 312 /*
313 * Like delete_from_page_cache, but substitutes swap for page. 313 * Like delete_from_page_cache, but substitutes swap for page.
314 */ 314 */
315 static void shmem_delete_from_page_cache(struct page *page, void *radswap) 315 static void shmem_delete_from_page_cache(struct page *page, void *radswap)
316 { 316 {
317 struct address_space *mapping = page->mapping; 317 struct address_space *mapping = page->mapping;
318 int error; 318 int error;
319 319
320 spin_lock_irq(&mapping->tree_lock); 320 spin_lock_irq(&mapping->tree_lock);
321 error = shmem_radix_tree_replace(mapping, page->index, page, radswap); 321 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
322 page->mapping = NULL; 322 page->mapping = NULL;
323 mapping->nrpages--; 323 mapping->nrpages--;
324 __dec_zone_page_state(page, NR_FILE_PAGES); 324 __dec_zone_page_state(page, NR_FILE_PAGES);
325 __dec_zone_page_state(page, NR_SHMEM); 325 __dec_zone_page_state(page, NR_SHMEM);
326 spin_unlock_irq(&mapping->tree_lock); 326 spin_unlock_irq(&mapping->tree_lock);
327 page_cache_release(page); 327 page_cache_release(page);
328 BUG_ON(error); 328 BUG_ON(error);
329 } 329 }
330 330
331 /* 331 /*
332 * Like find_get_pages, but collecting swap entries as well as pages. 332 * Like find_get_pages, but collecting swap entries as well as pages.
333 */ 333 */
334 static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, 334 static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
335 pgoff_t start, unsigned int nr_pages, 335 pgoff_t start, unsigned int nr_pages,
336 struct page **pages, pgoff_t *indices) 336 struct page **pages, pgoff_t *indices)
337 { 337 {
338 unsigned int i; 338 unsigned int i;
339 unsigned int ret; 339 unsigned int ret;
340 unsigned int nr_found; 340 unsigned int nr_found;
341 341
342 rcu_read_lock(); 342 rcu_read_lock();
343 restart: 343 restart:
344 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 344 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
345 (void ***)pages, indices, start, nr_pages); 345 (void ***)pages, indices, start, nr_pages);
346 ret = 0; 346 ret = 0;
347 for (i = 0; i < nr_found; i++) { 347 for (i = 0; i < nr_found; i++) {
348 struct page *page; 348 struct page *page;
349 repeat: 349 repeat:
350 page = radix_tree_deref_slot((void **)pages[i]); 350 page = radix_tree_deref_slot((void **)pages[i]);
351 if (unlikely(!page)) 351 if (unlikely(!page))
352 continue; 352 continue;
353 if (radix_tree_exception(page)) { 353 if (radix_tree_exception(page)) {
354 if (radix_tree_deref_retry(page)) 354 if (radix_tree_deref_retry(page))
355 goto restart; 355 goto restart;
356 /* 356 /*
357 * Otherwise, we must be storing a swap entry 357 * Otherwise, we must be storing a swap entry
358 * here as an exceptional entry: so return it 358 * here as an exceptional entry: so return it
359 * without attempting to raise page count. 359 * without attempting to raise page count.
360 */ 360 */
361 goto export; 361 goto export;
362 } 362 }
363 if (!page_cache_get_speculative(page)) 363 if (!page_cache_get_speculative(page))
364 goto repeat; 364 goto repeat;
365 365
366 /* Has the page moved? */ 366 /* Has the page moved? */
367 if (unlikely(page != *((void **)pages[i]))) { 367 if (unlikely(page != *((void **)pages[i]))) {
368 page_cache_release(page); 368 page_cache_release(page);
369 goto repeat; 369 goto repeat;
370 } 370 }
371 export: 371 export:
372 indices[ret] = indices[i]; 372 indices[ret] = indices[i];
373 pages[ret] = page; 373 pages[ret] = page;
374 ret++; 374 ret++;
375 } 375 }
376 if (unlikely(!ret && nr_found)) 376 if (unlikely(!ret && nr_found))
377 goto restart; 377 goto restart;
378 rcu_read_unlock(); 378 rcu_read_unlock();
379 return ret; 379 return ret;
380 } 380 }
381 381
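The exceptional entries that loop exports are tagged pointers: a swap entry is stored with a low tag bit set, which a real struct page pointer (always word-aligned) can never carry. A sketch of such an encoding; the exact bit layout here is illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define EXCEPTIONAL 0x2UL

    static void *swp_to_entry(unsigned long swp)
    {
            return (void *)((swp << 2) | EXCEPTIONAL);
    }

    static int entry_is_swap(const void *item)
    {
            return ((uintptr_t)item & EXCEPTIONAL) != 0;
    }

    static unsigned long entry_to_swp(const void *item)
    {
            return (uintptr_t)item >> 2;
    }

    int main(void)
    {
            void *item = swp_to_entry(42);

            if (entry_is_swap(item))
                    printf("swap offset %lu\n", entry_to_swp(item)); /* 42 */
            return 0;
    }
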
382 /* 382 /*
383 * Remove swap entry from radix tree, free the swap and its page cache. 383 * Remove swap entry from radix tree, free the swap and its page cache.
384 */ 384 */
385 static int shmem_free_swap(struct address_space *mapping, 385 static int shmem_free_swap(struct address_space *mapping,
386 pgoff_t index, void *radswap) 386 pgoff_t index, void *radswap)
387 { 387 {
388 int error; 388 int error;
389 389
390 spin_lock_irq(&mapping->tree_lock); 390 spin_lock_irq(&mapping->tree_lock);
391 error = shmem_radix_tree_replace(mapping, index, radswap, NULL); 391 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
392 spin_unlock_irq(&mapping->tree_lock); 392 spin_unlock_irq(&mapping->tree_lock);
393 if (!error) 393 if (!error)
394 free_swap_and_cache(radix_to_swp_entry(radswap)); 394 free_swap_and_cache(radix_to_swp_entry(radswap));
395 return error; 395 return error;
396 } 396 }
397 397
398 /* 398 /*
399 * Pagevec may contain swap entries, so shuffle up pages before releasing. 399 * Pagevec may contain swap entries, so shuffle up pages before releasing.
400 */ 400 */
401 static void shmem_deswap_pagevec(struct pagevec *pvec) 401 static void shmem_deswap_pagevec(struct pagevec *pvec)
402 { 402 {
403 int i, j; 403 int i, j;
404 404
405 for (i = 0, j = 0; i < pagevec_count(pvec); i++) { 405 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
406 struct page *page = pvec->pages[i]; 406 struct page *page = pvec->pages[i];
407 if (!radix_tree_exceptional_entry(page)) 407 if (!radix_tree_exceptional_entry(page))
408 pvec->pages[j++] = page; 408 pvec->pages[j++] = page;
409 } 409 }
410 pvec->nr = j; 410 pvec->nr = j;
411 } 411 }
412 412
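That is the classic two-index in-place filter: i scans every slot, j compacts the survivors to the front, and nr shrinks to the kept count. The same shape stand-alone:

    #include <stdio.h>

    static int is_page(int v)
    {
            return v >= 0;  /* stand-in: negatives play "swap entries" */
    }

    int main(void)
    {
            int entries[] = { 1, -7, 2, -9, 3 };
            int nr = 5, i, j;

            for (i = 0, j = 0; i < nr; i++) {
                    if (is_page(entries[i]))
                            entries[j++] = entries[i];
            }
            nr = j;

            for (i = 0; i < nr; i++)
                    printf("%d ", entries[i]);      /* 1 2 3 */
            printf("\n");
            return 0;
    }
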
413 /* 413 /*
414 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists. 414 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
415 */ 415 */
416 void shmem_unlock_mapping(struct address_space *mapping) 416 void shmem_unlock_mapping(struct address_space *mapping)
417 { 417 {
418 struct pagevec pvec; 418 struct pagevec pvec;
419 pgoff_t indices[PAGEVEC_SIZE]; 419 pgoff_t indices[PAGEVEC_SIZE];
420 pgoff_t index = 0; 420 pgoff_t index = 0;
421 421
422 pagevec_init(&pvec, 0); 422 pagevec_init(&pvec, 0);
423 /* 423 /*
424 * Minor point, but we might as well stop if someone else SHM_LOCKs it. 424 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
425 */ 425 */
426 while (!mapping_unevictable(mapping)) { 426 while (!mapping_unevictable(mapping)) {
427 /* 427 /*
428 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it 428 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
429 * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 429 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
430 */ 430 */
431 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 431 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
432 PAGEVEC_SIZE, pvec.pages, indices); 432 PAGEVEC_SIZE, pvec.pages, indices);
433 if (!pvec.nr) 433 if (!pvec.nr)
434 break; 434 break;
435 index = indices[pvec.nr - 1] + 1; 435 index = indices[pvec.nr - 1] + 1;
436 shmem_deswap_pagevec(&pvec); 436 shmem_deswap_pagevec(&pvec);
437 check_move_unevictable_pages(pvec.pages, pvec.nr); 437 check_move_unevictable_pages(pvec.pages, pvec.nr);
438 pagevec_release(&pvec); 438 pagevec_release(&pvec);
439 cond_resched(); 439 cond_resched();
440 } 440 }
441 } 441 }
442 442
443 /* 443 /*
444 * Remove range of pages and swap entries from radix tree, and free them. 444 * Remove range of pages and swap entries from radix tree, and free them.
445 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 445 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
446 */ 446 */
447 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, 447 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
448 bool unfalloc) 448 bool unfalloc)
449 { 449 {
450 struct address_space *mapping = inode->i_mapping; 450 struct address_space *mapping = inode->i_mapping;
451 struct shmem_inode_info *info = SHMEM_I(inode); 451 struct shmem_inode_info *info = SHMEM_I(inode);
452 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 452 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
453 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; 453 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
454 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); 454 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
455 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); 455 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
456 struct pagevec pvec; 456 struct pagevec pvec;
457 pgoff_t indices[PAGEVEC_SIZE]; 457 pgoff_t indices[PAGEVEC_SIZE];
458 long nr_swaps_freed = 0; 458 long nr_swaps_freed = 0;
459 pgoff_t index; 459 pgoff_t index;
460 int i; 460 int i;
461 461
462 if (lend == -1) 462 if (lend == -1)
463 end = -1; /* unsigned, so actually very big */ 463 end = -1; /* unsigned, so actually very big */
464 464
465 pagevec_init(&pvec, 0); 465 pagevec_init(&pvec, 0);
466 index = start; 466 index = start;
467 while (index < end) { 467 while (index < end) {
468 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 468 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
469 min(end - index, (pgoff_t)PAGEVEC_SIZE), 469 min(end - index, (pgoff_t)PAGEVEC_SIZE),
470 pvec.pages, indices); 470 pvec.pages, indices);
471 if (!pvec.nr) 471 if (!pvec.nr)
472 break; 472 break;
473 mem_cgroup_uncharge_start(); 473 mem_cgroup_uncharge_start();
474 for (i = 0; i < pagevec_count(&pvec); i++) { 474 for (i = 0; i < pagevec_count(&pvec); i++) {
475 struct page *page = pvec.pages[i]; 475 struct page *page = pvec.pages[i];
476 476
477 index = indices[i]; 477 index = indices[i];
478 if (index >= end) 478 if (index >= end)
479 break; 479 break;
480 480
481 if (radix_tree_exceptional_entry(page)) { 481 if (radix_tree_exceptional_entry(page)) {
482 if (unfalloc) 482 if (unfalloc)
483 continue; 483 continue;
484 nr_swaps_freed += !shmem_free_swap(mapping, 484 nr_swaps_freed += !shmem_free_swap(mapping,
485 index, page); 485 index, page);
486 continue; 486 continue;
487 } 487 }
488 488
489 if (!trylock_page(page)) 489 if (!trylock_page(page))
490 continue; 490 continue;
491 if (!unfalloc || !PageUptodate(page)) { 491 if (!unfalloc || !PageUptodate(page)) {
492 if (page->mapping == mapping) { 492 if (page->mapping == mapping) {
493 VM_BUG_ON(PageWriteback(page)); 493 VM_BUG_ON(PageWriteback(page));
494 truncate_inode_page(mapping, page); 494 truncate_inode_page(mapping, page);
495 } 495 }
496 } 496 }
497 unlock_page(page); 497 unlock_page(page);
498 } 498 }
499 shmem_deswap_pagevec(&pvec); 499 shmem_deswap_pagevec(&pvec);
500 pagevec_release(&pvec); 500 pagevec_release(&pvec);
501 mem_cgroup_uncharge_end(); 501 mem_cgroup_uncharge_end();
502 cond_resched(); 502 cond_resched();
503 index++; 503 index++;
504 } 504 }
505 505
506 if (partial_start) { 506 if (partial_start) {
507 struct page *page = NULL; 507 struct page *page = NULL;
508 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); 508 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
509 if (page) { 509 if (page) {
510 unsigned int top = PAGE_CACHE_SIZE; 510 unsigned int top = PAGE_CACHE_SIZE;
511 if (start > end) { 511 if (start > end) {
512 top = partial_end; 512 top = partial_end;
513 partial_end = 0; 513 partial_end = 0;
514 } 514 }
515 zero_user_segment(page, partial_start, top); 515 zero_user_segment(page, partial_start, top);
516 set_page_dirty(page); 516 set_page_dirty(page);
517 unlock_page(page); 517 unlock_page(page);
518 page_cache_release(page); 518 page_cache_release(page);
519 } 519 }
520 } 520 }
521 if (partial_end) { 521 if (partial_end) {
522 struct page *page = NULL; 522 struct page *page = NULL;
523 shmem_getpage(inode, end, &page, SGP_READ, NULL); 523 shmem_getpage(inode, end, &page, SGP_READ, NULL);
524 if (page) { 524 if (page) {
525 zero_user_segment(page, 0, partial_end); 525 zero_user_segment(page, 0, partial_end);
526 set_page_dirty(page); 526 set_page_dirty(page);
527 unlock_page(page); 527 unlock_page(page);
528 page_cache_release(page); 528 page_cache_release(page);
529 } 529 }
530 } 530 }
531 if (start >= end) 531 if (start >= end)
532 return; 532 return;
533 533
534 index = start; 534 index = start;
535 for ( ; ; ) { 535 for ( ; ; ) {
536 cond_resched(); 536 cond_resched();
537 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 537 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
538 min(end - index, (pgoff_t)PAGEVEC_SIZE), 538 min(end - index, (pgoff_t)PAGEVEC_SIZE),
539 pvec.pages, indices); 539 pvec.pages, indices);
540 if (!pvec.nr) { 540 if (!pvec.nr) {
541 if (index == start || unfalloc) 541 if (index == start || unfalloc)
542 break; 542 break;
543 index = start; 543 index = start;
544 continue; 544 continue;
545 } 545 }
546 if ((index == start || unfalloc) && indices[0] >= end) { 546 if ((index == start || unfalloc) && indices[0] >= end) {
547 shmem_deswap_pagevec(&pvec); 547 shmem_deswap_pagevec(&pvec);
548 pagevec_release(&pvec); 548 pagevec_release(&pvec);
549 break; 549 break;
550 } 550 }
551 mem_cgroup_uncharge_start(); 551 mem_cgroup_uncharge_start();
552 for (i = 0; i < pagevec_count(&pvec); i++) { 552 for (i = 0; i < pagevec_count(&pvec); i++) {
553 struct page *page = pvec.pages[i]; 553 struct page *page = pvec.pages[i];
554 554
555 index = indices[i]; 555 index = indices[i];
556 if (index >= end) 556 if (index >= end)
557 break; 557 break;
558 558
559 if (radix_tree_exceptional_entry(page)) { 559 if (radix_tree_exceptional_entry(page)) {
560 if (unfalloc) 560 if (unfalloc)
561 continue; 561 continue;
562 nr_swaps_freed += !shmem_free_swap(mapping, 562 nr_swaps_freed += !shmem_free_swap(mapping,
563 index, page); 563 index, page);
564 continue; 564 continue;
565 } 565 }
566 566
567 lock_page(page); 567 lock_page(page);
568 if (!unfalloc || !PageUptodate(page)) { 568 if (!unfalloc || !PageUptodate(page)) {
569 if (page->mapping == mapping) { 569 if (page->mapping == mapping) {
570 VM_BUG_ON(PageWriteback(page)); 570 VM_BUG_ON(PageWriteback(page));
571 truncate_inode_page(mapping, page); 571 truncate_inode_page(mapping, page);
572 } 572 }
573 } 573 }
574 unlock_page(page); 574 unlock_page(page);
575 } 575 }
576 shmem_deswap_pagevec(&pvec); 576 shmem_deswap_pagevec(&pvec);
577 pagevec_release(&pvec); 577 pagevec_release(&pvec);
578 mem_cgroup_uncharge_end(); 578 mem_cgroup_uncharge_end();
579 index++; 579 index++;
580 } 580 }
581 581
582 spin_lock(&info->lock); 582 spin_lock(&info->lock);
583 info->swapped -= nr_swaps_freed; 583 info->swapped -= nr_swaps_freed;
584 shmem_recalc_inode(inode); 584 shmem_recalc_inode(inode);
585 spin_unlock(&info->lock); 585 spin_unlock(&info->lock);
586 } 586 }
587 587
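The partial-page arithmetic at the top of shmem_undo_range() is easiest to verify with numbers. A worked example assuming 4096-byte pages and a hole punched over bytes 1000..9999 of the file:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
            unsigned long lstart = 1000, lend = 9999;

            unsigned long start = (lstart + PAGE_SIZE - 1) / PAGE_SIZE; /* 1    */
            unsigned long end = (lend + 1) / PAGE_SIZE;                 /* 2    */
            unsigned long partial_start = lstart % PAGE_SIZE;           /* 1000 */
            unsigned long partial_end = (lend + 1) % PAGE_SIZE;         /* 1808 */

            /* Whole pages [1, 2) are truncated; page 0 has bytes 1000..4095
             * zeroed; page 2 has bytes 0..1807 zeroed. */
            printf("whole pages [%lu,%lu) partial %lu/%lu\n",
                   start, end, partial_start, partial_end);
            return 0;
    }
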
588 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 588 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
589 { 589 {
590 shmem_undo_range(inode, lstart, lend, false); 590 shmem_undo_range(inode, lstart, lend, false);
591 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 591 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
592 } 592 }
593 EXPORT_SYMBOL_GPL(shmem_truncate_range); 593 EXPORT_SYMBOL_GPL(shmem_truncate_range);
594 594
595 static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 595 static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
596 { 596 {
597 struct inode *inode = dentry->d_inode; 597 struct inode *inode = dentry->d_inode;
598 int error; 598 int error;
599 599
600 error = inode_change_ok(inode, attr); 600 error = inode_change_ok(inode, attr);
601 if (error) 601 if (error)
602 return error; 602 return error;
603 603
604 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 604 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
605 loff_t oldsize = inode->i_size; 605 loff_t oldsize = inode->i_size;
606 loff_t newsize = attr->ia_size; 606 loff_t newsize = attr->ia_size;
607 607
608 if (newsize != oldsize) { 608 if (newsize != oldsize) {
609 i_size_write(inode, newsize); 609 i_size_write(inode, newsize);
610 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 610 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
611 } 611 }
612 if (newsize < oldsize) { 612 if (newsize < oldsize) {
613 loff_t holebegin = round_up(newsize, PAGE_SIZE); 613 loff_t holebegin = round_up(newsize, PAGE_SIZE);
614 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 614 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
615 shmem_truncate_range(inode, newsize, (loff_t)-1); 615 shmem_truncate_range(inode, newsize, (loff_t)-1);
616 /* unmap again to remove racily COWed private pages */ 616 /* unmap again to remove racily COWed private pages */
617 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 617 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
618 } 618 }
619 } 619 }
620 620
621 setattr_copy(inode, attr); 621 setattr_copy(inode, attr);
622 #ifdef CONFIG_TMPFS_POSIX_ACL 622 #ifdef CONFIG_TMPFS_POSIX_ACL
623 if (attr->ia_valid & ATTR_MODE) 623 if (attr->ia_valid & ATTR_MODE)
624 error = generic_acl_chmod(inode); 624 error = generic_acl_chmod(inode);
625 #endif 625 #endif
626 return error; 626 return error;
627 } 627 }
628 628
629 static void shmem_evict_inode(struct inode *inode) 629 static void shmem_evict_inode(struct inode *inode)
630 { 630 {
631 struct shmem_inode_info *info = SHMEM_I(inode); 631 struct shmem_inode_info *info = SHMEM_I(inode);
632 632
633 if (inode->i_mapping->a_ops == &shmem_aops) { 633 if (inode->i_mapping->a_ops == &shmem_aops) {
634 shmem_unacct_size(info->flags, inode->i_size); 634 shmem_unacct_size(info->flags, inode->i_size);
635 inode->i_size = 0; 635 inode->i_size = 0;
636 shmem_truncate_range(inode, 0, (loff_t)-1); 636 shmem_truncate_range(inode, 0, (loff_t)-1);
637 if (!list_empty(&info->swaplist)) { 637 if (!list_empty(&info->swaplist)) {
638 mutex_lock(&shmem_swaplist_mutex); 638 mutex_lock(&shmem_swaplist_mutex);
639 list_del_init(&info->swaplist); 639 list_del_init(&info->swaplist);
640 mutex_unlock(&shmem_swaplist_mutex); 640 mutex_unlock(&shmem_swaplist_mutex);
641 } 641 }
642 } else 642 } else
643 kfree(info->symlink); 643 kfree(info->symlink);
644 644
645 simple_xattrs_free(&info->xattrs); 645 simple_xattrs_free(&info->xattrs);
646 WARN_ON(inode->i_blocks); 646 WARN_ON(inode->i_blocks);
647 shmem_free_inode(inode->i_sb); 647 shmem_free_inode(inode->i_sb);
648 clear_inode(inode); 648 clear_inode(inode);
649 } 649 }
650 650
651 /* 651 /*
652 * If swap is found in the inode, free it and move the page from swapcache to filecache. 652 * If swap is found in the inode, free it and move the page from swapcache to filecache.
653 */ 653 */
654 static int shmem_unuse_inode(struct shmem_inode_info *info, 654 static int shmem_unuse_inode(struct shmem_inode_info *info,
655 swp_entry_t swap, struct page **pagep) 655 swp_entry_t swap, struct page **pagep)
656 { 656 {
657 struct address_space *mapping = info->vfs_inode.i_mapping; 657 struct address_space *mapping = info->vfs_inode.i_mapping;
658 void *radswap; 658 void *radswap;
659 pgoff_t index; 659 pgoff_t index;
660 gfp_t gfp; 660 gfp_t gfp;
661 int error = 0; 661 int error = 0;
662 662
663 radswap = swp_to_radix_entry(swap); 663 radswap = swp_to_radix_entry(swap);
664 index = radix_tree_locate_item(&mapping->page_tree, radswap); 664 index = radix_tree_locate_item(&mapping->page_tree, radswap);
665 if (index == -1) 665 if (index == -1)
666 return 0; 666 return 0;
667 667
668 /* 668 /*
669 * Move _head_ to start search for next from here. 669 * Move _head_ to start search for next from here.
670 * But be careful: shmem_evict_inode checks list_empty without taking 670 * But be careful: shmem_evict_inode checks list_empty without taking
671 * mutex, and there's an instant in list_move_tail when info->swaplist 671 * mutex, and there's an instant in list_move_tail when info->swaplist
672 * would appear empty, if it were the only one on shmem_swaplist. 672 * would appear empty, if it were the only one on shmem_swaplist.
673 */ 673 */
674 if (shmem_swaplist.next != &info->swaplist) 674 if (shmem_swaplist.next != &info->swaplist)
675 list_move_tail(&shmem_swaplist, &info->swaplist); 675 list_move_tail(&shmem_swaplist, &info->swaplist);
676 676
677 gfp = mapping_gfp_mask(mapping); 677 gfp = mapping_gfp_mask(mapping);
678 if (shmem_should_replace_page(*pagep, gfp)) { 678 if (shmem_should_replace_page(*pagep, gfp)) {
679 mutex_unlock(&shmem_swaplist_mutex); 679 mutex_unlock(&shmem_swaplist_mutex);
680 error = shmem_replace_page(pagep, gfp, info, index); 680 error = shmem_replace_page(pagep, gfp, info, index);
681 mutex_lock(&shmem_swaplist_mutex); 681 mutex_lock(&shmem_swaplist_mutex);
682 /* 682 /*
683 * We needed to drop mutex to make that restrictive page 683 * We needed to drop mutex to make that restrictive page
684 * allocation, but the inode might have been freed while we 684 * allocation, but the inode might have been freed while we
685 * dropped it: although a racing shmem_evict_inode() cannot 685 * dropped it: although a racing shmem_evict_inode() cannot
686 * complete without emptying the radix_tree, our page lock 686 * complete without emptying the radix_tree, our page lock
687 * on this swapcache page is not enough to prevent that - 687 * on this swapcache page is not enough to prevent that -
688 * free_swap_and_cache() of our swap entry will only 688 * free_swap_and_cache() of our swap entry will only
689 * trylock_page(), removing swap from radix_tree whatever. 689 * trylock_page(), removing swap from radix_tree whatever.
690 * 690 *
691 * We must not proceed to shmem_add_to_page_cache() if the 691 * We must not proceed to shmem_add_to_page_cache() if the
692 * inode has been freed, but of course we cannot rely on 692 * inode has been freed, but of course we cannot rely on
693 * inode or mapping or info to check that. However, we can 693 * inode or mapping or info to check that. However, we can
694 * safely check if our swap entry is still in use (and here 694 * safely check if our swap entry is still in use (and here
695 * it can't have got reused for another page): if it's still 695 * it can't have got reused for another page): if it's still
696 * in use, then the inode cannot have been freed yet, and we 696 * in use, then the inode cannot have been freed yet, and we
697 * can safely proceed (if it's no longer in use, that tells 697 * can safely proceed (if it's no longer in use, that tells
698 * nothing about the inode, but we don't need to unuse swap). 698 * nothing about the inode, but we don't need to unuse swap).
699 */ 699 */
700 if (!page_swapcount(*pagep)) 700 if (!page_swapcount(*pagep))
701 error = -ENOENT; 701 error = -ENOENT;
702 } 702 }
703 703
704 /* 704 /*
705 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 705 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
706 * but also to hold up shmem_evict_inode(): so inode cannot be freed 706 * but also to hold up shmem_evict_inode(): so inode cannot be freed
707 * beneath us (pagelock doesn't help until the page is in pagecache). 707 * beneath us (pagelock doesn't help until the page is in pagecache).
708 */ 708 */
709 if (!error) 709 if (!error)
710 error = shmem_add_to_page_cache(*pagep, mapping, index, 710 error = shmem_add_to_page_cache(*pagep, mapping, index,
711 GFP_NOWAIT, radswap); 711 GFP_NOWAIT, radswap);
712 if (error != -ENOMEM) { 712 if (error != -ENOMEM) {
713 /* 713 /*
714 * Truncation and eviction use free_swap_and_cache(), which 714 * Truncation and eviction use free_swap_and_cache(), which
715 * only does trylock page: if we raced, best clean up here. 715 * only does trylock page: if we raced, best clean up here.
716 */ 716 */
717 delete_from_swap_cache(*pagep); 717 delete_from_swap_cache(*pagep);
718 set_page_dirty(*pagep); 718 set_page_dirty(*pagep);
719 if (!error) { 719 if (!error) {
720 spin_lock(&info->lock); 720 spin_lock(&info->lock);
721 info->swapped--; 721 info->swapped--;
722 spin_unlock(&info->lock); 722 spin_unlock(&info->lock);
723 swap_free(swap); 723 swap_free(swap);
724 } 724 }
725 error = 1; /* not an error, but entry was found */ 725 error = 1; /* not an error, but entry was found */
726 } 726 }
727 return error; 727 return error;
728 } 728 }
729 729
730 /* 730 /*
731 * Search through swapped inodes to find and replace swap by page. 731 * Search through swapped inodes to find and replace swap by page.
732 */ 732 */
733 int shmem_unuse(swp_entry_t swap, struct page *page) 733 int shmem_unuse(swp_entry_t swap, struct page *page)
734 { 734 {
735 struct list_head *this, *next; 735 struct list_head *this, *next;
736 struct shmem_inode_info *info; 736 struct shmem_inode_info *info;
737 int found = 0; 737 int found = 0;
738 int error = 0; 738 int error = 0;
739 739
740 /* 740 /*
741 * There's a faint possibility that swap page was replaced before 741 * There's a faint possibility that swap page was replaced before
742 * caller locked it: caller will come back later with the right page. 742 * caller locked it: caller will come back later with the right page.
743 */ 743 */
744 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) 744 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
745 goto out; 745 goto out;
746 746
747 /* 747 /*
748 * Charge page using GFP_KERNEL while we can wait, before taking 748 * Charge page using GFP_KERNEL while we can wait, before taking
749 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 749 * the shmem_swaplist_mutex which might hold up shmem_writepage().
750 * Charged back to the user (not to caller) when swap account is used. 750 * Charged back to the user (not to caller) when swap account is used.
751 */ 751 */
752 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 752 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
753 if (error) 753 if (error)
754 goto out; 754 goto out;
755 /* No radix_tree_preload: swap entry keeps a place for page in tree */ 755 /* No radix_tree_preload: swap entry keeps a place for page in tree */
756 756
757 mutex_lock(&shmem_swaplist_mutex); 757 mutex_lock(&shmem_swaplist_mutex);
758 list_for_each_safe(this, next, &shmem_swaplist) { 758 list_for_each_safe(this, next, &shmem_swaplist) {
759 info = list_entry(this, struct shmem_inode_info, swaplist); 759 info = list_entry(this, struct shmem_inode_info, swaplist);
760 if (info->swapped) 760 if (info->swapped)
761 found = shmem_unuse_inode(info, swap, &page); 761 found = shmem_unuse_inode(info, swap, &page);
762 else 762 else
763 list_del_init(&info->swaplist); 763 list_del_init(&info->swaplist);
764 cond_resched(); 764 cond_resched();
765 if (found) 765 if (found)
766 break; 766 break;
767 } 767 }
768 mutex_unlock(&shmem_swaplist_mutex); 768 mutex_unlock(&shmem_swaplist_mutex);
769 769
770 if (found < 0) 770 if (found < 0)
771 error = found; 771 error = found;
772 out: 772 out:
773 unlock_page(page); 773 unlock_page(page);
774 page_cache_release(page); 774 page_cache_release(page);
775 return error; 775 return error;
776 } 776 }
777 777
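shmem_unuse() deletes entries from shmem_swaplist while walking it, which is why it uses list_for_each_safe(): the next pointer is cached before the loop body runs, so list_del_init() on the current entry cannot derail the walk. Below is a minimal userspace sketch of the same delete-during-traversal idiom; the hand-rolled singly linked list is an assumption for self-containment, since the kernel's list.h is not available here.

#include <stdio.h>
#include <stdlib.h>

struct node {
	int swapped;            /* toy stand-in for info->swapped */
	struct node *next;
};

int main(void)
{
	int counts[] = { 0, 2, 0, 1 };
	struct node *head = NULL, **tail = &head;
	for (unsigned i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
		struct node *n = malloc(sizeof(*n));
		n->swapped = counts[i];
		n->next = NULL;
		*tail = n;
		tail = &n->next;
	}

	/*
	 * Safe traversal: cache "next" before visiting, so the current
	 * node may be unlinked and freed -- the same reason shmem_unuse()
	 * wraps list_del_init() in list_for_each_safe().
	 */
	struct node **link = &head;
	for (struct node *this = head, *next; this; this = next) {
		next = this->next;
		if (this->swapped) {
			printf("keep node with swapped=%d\n", this->swapped);
			link = &this->next;
		} else {
			*link = next;   /* prune, like list_del_init() */
			free(this);
		}
	}
	return 0;
}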
778 /* 778 /*
779 * Move the page from the page cache to the swap cache. 779 * Move the page from the page cache to the swap cache.
780 */ 780 */
781 static int shmem_writepage(struct page *page, struct writeback_control *wbc) 781 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
782 { 782 {
783 struct shmem_inode_info *info; 783 struct shmem_inode_info *info;
784 struct address_space *mapping; 784 struct address_space *mapping;
785 struct inode *inode; 785 struct inode *inode;
786 swp_entry_t swap; 786 swp_entry_t swap;
787 pgoff_t index; 787 pgoff_t index;
788 788
789 BUG_ON(!PageLocked(page)); 789 BUG_ON(!PageLocked(page));
790 mapping = page->mapping; 790 mapping = page->mapping;
791 index = page->index; 791 index = page->index;
792 inode = mapping->host; 792 inode = mapping->host;
793 info = SHMEM_I(inode); 793 info = SHMEM_I(inode);
794 if (info->flags & VM_LOCKED) 794 if (info->flags & VM_LOCKED)
795 goto redirty; 795 goto redirty;
796 if (!total_swap_pages) 796 if (!total_swap_pages)
797 goto redirty; 797 goto redirty;
798 798
799 /* 799 /*
800 * shmem_backing_dev_info's capabilities prevent regular writeback or 800 * shmem_backing_dev_info's capabilities prevent regular writeback or
801 * sync from ever calling shmem_writepage; but a stacking filesystem 801 * sync from ever calling shmem_writepage; but a stacking filesystem
802 * might use ->writepage of its underlying filesystem, in which case 802 * might use ->writepage of its underlying filesystem, in which case
803 * tmpfs should write out to swap only in response to memory pressure, 803 * tmpfs should write out to swap only in response to memory pressure,
804 * and not for the writeback threads or sync. 804 * and not for the writeback threads or sync.
805 */ 805 */
806 if (!wbc->for_reclaim) { 806 if (!wbc->for_reclaim) {
807 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 807 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
808 goto redirty; 808 goto redirty;
809 } 809 }
810 810
811 /* 811 /*
812 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 812 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
813 * value into swapfile.c, the only way we can correctly account for a 813 * value into swapfile.c, the only way we can correctly account for a
814 * fallocated page arriving here is now to initialize it and write it. 814 * fallocated page arriving here is now to initialize it and write it.
815 * 815 *
816 * That's okay for a page already fallocated earlier, but if we have 816 * That's okay for a page already fallocated earlier, but if we have
817 * not yet completed the fallocation, then (a) we want to keep track 817 * not yet completed the fallocation, then (a) we want to keep track
818 * of this page in case we have to undo it, and (b) it may not be a 818 * of this page in case we have to undo it, and (b) it may not be a
819 * good idea to continue anyway, once we're pushing into swap. So 819 * good idea to continue anyway, once we're pushing into swap. So
820 * reactivate the page, and let shmem_fallocate() quit when too many. 820 * reactivate the page, and let shmem_fallocate() quit when too many.
821 */ 821 */
822 if (!PageUptodate(page)) { 822 if (!PageUptodate(page)) {
823 if (inode->i_private) { 823 if (inode->i_private) {
824 struct shmem_falloc *shmem_falloc; 824 struct shmem_falloc *shmem_falloc;
825 spin_lock(&inode->i_lock); 825 spin_lock(&inode->i_lock);
826 shmem_falloc = inode->i_private; 826 shmem_falloc = inode->i_private;
827 if (shmem_falloc && 827 if (shmem_falloc &&
828 index >= shmem_falloc->start && 828 index >= shmem_falloc->start &&
829 index < shmem_falloc->next) 829 index < shmem_falloc->next)
830 shmem_falloc->nr_unswapped++; 830 shmem_falloc->nr_unswapped++;
831 else 831 else
832 shmem_falloc = NULL; 832 shmem_falloc = NULL;
833 spin_unlock(&inode->i_lock); 833 spin_unlock(&inode->i_lock);
834 if (shmem_falloc) 834 if (shmem_falloc)
835 goto redirty; 835 goto redirty;
836 } 836 }
837 clear_highpage(page); 837 clear_highpage(page);
838 flush_dcache_page(page); 838 flush_dcache_page(page);
839 SetPageUptodate(page); 839 SetPageUptodate(page);
840 } 840 }
841 841
842 swap = get_swap_page(); 842 swap = get_swap_page();
843 if (!swap.val) 843 if (!swap.val)
844 goto redirty; 844 goto redirty;
845 845
846 /* 846 /*
847 * Add inode to shmem_unuse()'s list of swapped-out inodes, 847 * Add inode to shmem_unuse()'s list of swapped-out inodes,
848 * if it's not already there. Do it now before the page is 848 * if it's not already there. Do it now before the page is
849 * moved to swap cache, when its pagelock no longer protects 849 * moved to swap cache, when its pagelock no longer protects
850 * the inode from eviction. But don't unlock the mutex until 850 * the inode from eviction. But don't unlock the mutex until
851 * we've incremented swapped, because shmem_unuse_inode() will 851 * we've incremented swapped, because shmem_unuse_inode() will
852 * prune a !swapped inode from the swaplist under this mutex. 852 * prune a !swapped inode from the swaplist under this mutex.
853 */ 853 */
854 mutex_lock(&shmem_swaplist_mutex); 854 mutex_lock(&shmem_swaplist_mutex);
855 if (list_empty(&info->swaplist)) 855 if (list_empty(&info->swaplist))
856 list_add_tail(&info->swaplist, &shmem_swaplist); 856 list_add_tail(&info->swaplist, &shmem_swaplist);
857 857
858 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 858 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
859 swap_shmem_alloc(swap); 859 swap_shmem_alloc(swap);
860 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); 860 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
861 861
862 spin_lock(&info->lock); 862 spin_lock(&info->lock);
863 info->swapped++; 863 info->swapped++;
864 shmem_recalc_inode(inode); 864 shmem_recalc_inode(inode);
865 spin_unlock(&info->lock); 865 spin_unlock(&info->lock);
866 866
867 mutex_unlock(&shmem_swaplist_mutex); 867 mutex_unlock(&shmem_swaplist_mutex);
868 BUG_ON(page_mapped(page)); 868 BUG_ON(page_mapped(page));
869 swap_writepage(page, wbc); 869 swap_writepage(page, wbc);
870 return 0; 870 return 0;
871 } 871 }
872 872
873 mutex_unlock(&shmem_swaplist_mutex); 873 mutex_unlock(&shmem_swaplist_mutex);
874 swapcache_free(swap, NULL); 874 swapcache_free(swap, NULL);
875 redirty: 875 redirty:
876 set_page_dirty(page); 876 set_page_dirty(page);
877 if (wbc->for_reclaim) 877 if (wbc->for_reclaim)
878 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ 878 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
879 unlock_page(page); 879 unlock_page(page);
880 return 0; 880 return 0;
881 } 881 }
882 882
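The early exits in shmem_writepage() amount to a small triage: mlocked shmem (VM_LOCKED), a system with no swap configured, or a caller that is not reclaiming all end in redirty; only memory pressure may push a page to swap. A toy model of just that decision, with hypothetical boolean parameters standing in for the inode flags and wbc state:

#include <stdbool.h>
#include <stdio.h>

enum action { WRITE_TO_SWAP, REDIRTY };

/* Simplified model of shmem_writepage()'s early checks. */
static enum action triage(bool vm_locked, bool have_swap, bool for_reclaim)
{
	if (vm_locked)          /* mlocked shmem never goes to swap */
		return REDIRTY;
	if (!have_swap)         /* no swap space configured at all */
		return REDIRTY;
	if (!for_reclaim)       /* writeback threads and sync must not */
		return REDIRTY;
	return WRITE_TO_SWAP;
}

int main(void)
{
	printf("%d\n", triage(false, true, true));   /* 0: write to swap */
	printf("%d\n", triage(true,  true, true));   /* 1: redirty */
	printf("%d\n", triage(false, true, false));  /* 1: redirty */
	return 0;
}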
883 #ifdef CONFIG_NUMA 883 #ifdef CONFIG_NUMA
884 #ifdef CONFIG_TMPFS 884 #ifdef CONFIG_TMPFS
885 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 885 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
886 { 886 {
887 char buffer[64]; 887 char buffer[64];
888 888
889 if (!mpol || mpol->mode == MPOL_DEFAULT) 889 if (!mpol || mpol->mode == MPOL_DEFAULT)
890 return; /* show nothing */ 890 return; /* show nothing */
891 891
892 mpol_to_str(buffer, sizeof(buffer), mpol, 1); 892 mpol_to_str(buffer, sizeof(buffer), mpol, 1);
893 893
894 seq_printf(seq, ",mpol=%s", buffer); 894 seq_printf(seq, ",mpol=%s", buffer);
895 } 895 }
896 896
897 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 897 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
898 { 898 {
899 struct mempolicy *mpol = NULL; 899 struct mempolicy *mpol = NULL;
900 if (sbinfo->mpol) { 900 if (sbinfo->mpol) {
901 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 901 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
902 mpol = sbinfo->mpol; 902 mpol = sbinfo->mpol;
903 mpol_get(mpol); 903 mpol_get(mpol);
904 spin_unlock(&sbinfo->stat_lock); 904 spin_unlock(&sbinfo->stat_lock);
905 } 905 }
906 return mpol; 906 return mpol;
907 } 907 }
908 #endif /* CONFIG_TMPFS */ 908 #endif /* CONFIG_TMPFS */
909 909
910 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 910 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
911 struct shmem_inode_info *info, pgoff_t index) 911 struct shmem_inode_info *info, pgoff_t index)
912 { 912 {
913 struct vm_area_struct pvma; 913 struct vm_area_struct pvma;
914 struct page *page; 914 struct page *page;
915 915
916 /* Create a pseudo vma that just contains the policy */ 916 /* Create a pseudo vma that just contains the policy */
917 pvma.vm_start = 0; 917 pvma.vm_start = 0;
918 /* Bias interleave by inode number to distribute better across nodes */ 918 /* Bias interleave by inode number to distribute better across nodes */
919 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 919 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
920 pvma.vm_ops = NULL; 920 pvma.vm_ops = NULL;
921 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 921 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
922 922
923 page = swapin_readahead(swap, gfp, &pvma, 0); 923 page = swapin_readahead(swap, gfp, &pvma, 0);
924 924
925 /* Drop reference taken by mpol_shared_policy_lookup() */ 925 /* Drop reference taken by mpol_shared_policy_lookup() */
926 mpol_cond_put(pvma.vm_policy); 926 mpol_cond_put(pvma.vm_policy);
927 927
928 return page; 928 return page;
929 } 929 }
930 930
931 static struct page *shmem_alloc_page(gfp_t gfp, 931 static struct page *shmem_alloc_page(gfp_t gfp,
932 struct shmem_inode_info *info, pgoff_t index) 932 struct shmem_inode_info *info, pgoff_t index)
933 { 933 {
934 struct vm_area_struct pvma; 934 struct vm_area_struct pvma;
935 struct page *page; 935 struct page *page;
936 936
937 /* Create a pseudo vma that just contains the policy */ 937 /* Create a pseudo vma that just contains the policy */
938 pvma.vm_start = 0; 938 pvma.vm_start = 0;
939 /* Bias interleave by inode number to distribute better across nodes */ 939 /* Bias interleave by inode number to distribute better across nodes */
940 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 940 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
941 pvma.vm_ops = NULL; 941 pvma.vm_ops = NULL;
942 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 942 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
943 943
944 page = alloc_page_vma(gfp, &pvma, 0); 944 page = alloc_page_vma(gfp, &pvma, 0);
945 945
946 /* Drop reference taken by mpol_shared_policy_lookup() */ 946 /* Drop reference taken by mpol_shared_policy_lookup() */
947 mpol_cond_put(pvma.vm_policy); 947 mpol_cond_put(pvma.vm_policy);
948 948
949 return page; 949 return page;
950 } 950 }
951 #else /* !CONFIG_NUMA */ 951 #else /* !CONFIG_NUMA */
952 #ifdef CONFIG_TMPFS 952 #ifdef CONFIG_TMPFS
953 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 953 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
954 { 954 {
955 } 955 }
956 #endif /* CONFIG_TMPFS */ 956 #endif /* CONFIG_TMPFS */
957 957
958 static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 958 static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
959 struct shmem_inode_info *info, pgoff_t index) 959 struct shmem_inode_info *info, pgoff_t index)
960 { 960 {
961 return swapin_readahead(swap, gfp, NULL, 0); 961 return swapin_readahead(swap, gfp, NULL, 0);
962 } 962 }
963 963
964 static inline struct page *shmem_alloc_page(gfp_t gfp, 964 static inline struct page *shmem_alloc_page(gfp_t gfp,
965 struct shmem_inode_info *info, pgoff_t index) 965 struct shmem_inode_info *info, pgoff_t index)
966 { 966 {
967 return alloc_page(gfp); 967 return alloc_page(gfp);
968 } 968 }
969 #endif /* CONFIG_NUMA */ 969 #endif /* CONFIG_NUMA */
970 970
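Both NUMA allocators above bias the pseudo-vma's vm_pgoff by i_ino so that MPOL_INTERLEAVE does not start every inode's page 0 on the same node. Under the simplified assumption that interleave picks node (pgoff % nr_nodes), a stand-in for the real policy code, the spreading effect is easy to see:

#include <stdio.h>

int main(void)
{
	unsigned long nr_nodes = 4;
	unsigned long inodes[] = { 100, 101 };  /* hypothetical i_ino values */

	for (int f = 0; f < 2; f++) {
		printf("ino %lu:", inodes[f]);
		for (unsigned long index = 0; index < 6; index++) {
			/* pvma.vm_pgoff = index + info->vfs_inode.i_ino */
			unsigned long pgoff = index + inodes[f];
			printf(" %lu", pgoff % nr_nodes);
		}
		printf("\n");
	}
	/* Without the i_ino bias, every file's page 0 would land on node 0. */
	return 0;
}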
971 #if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) 971 #if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
972 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 972 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
973 { 973 {
974 return NULL; 974 return NULL;
975 } 975 }
976 #endif 976 #endif
977 977
978 /* 978 /*
979 * When a page is moved from swapcache to shmem filecache (either by the 979 * When a page is moved from swapcache to shmem filecache (either by the
980 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of 980 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
981 * shmem_unuse_inode()), it may have been read in earlier from swap, in 981 * shmem_unuse_inode()), it may have been read in earlier from swap, in
982 * ignorance of the mapping it belongs to. If that mapping has special 982 * ignorance of the mapping it belongs to. If that mapping has special
983 * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 983 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
984 * we may need to copy to a suitable page before moving to filecache. 984 * we may need to copy to a suitable page before moving to filecache.
985 * 985 *
986 * In a future release, this may well be extended to respect cpuset and 986 * In a future release, this may well be extended to respect cpuset and
987 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 987 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
988 * but for now it is a simple matter of zone. 988 * but for now it is a simple matter of zone.
989 */ 989 */
990 static bool shmem_should_replace_page(struct page *page, gfp_t gfp) 990 static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
991 { 991 {
992 return page_zonenum(page) > gfp_zone(gfp); 992 return page_zonenum(page) > gfp_zone(gfp);
993 } 993 }
994 994
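shmem_should_replace_page() is a single comparison: a page sitting in a higher zone than the gfp mask permits must be copied down. A toy model with a stub zone ordering mirroring the kernel's ascending zone numbers; both the enum and should_replace() here are simplified stand-ins, not kernel API:

#include <stdbool.h>
#include <stdio.h>

enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL };

/* Toy: the zone a page is in vs the highest zone a gfp mask allows. */
static bool should_replace(enum zone page_zone, enum zone gfp_highest)
{
	return page_zone > gfp_highest;   /* same test as the kernel's */
}

int main(void)
{
	/* gma500-style constraint: allocations must come from below 4GB */
	enum zone gfp_highest = ZONE_DMA32;

	printf("%d\n", should_replace(ZONE_NORMAL, gfp_highest)); /* 1: copy */
	printf("%d\n", should_replace(ZONE_DMA32,  gfp_highest)); /* 0: keep */
	return 0;
}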
995 static int shmem_replace_page(struct page **pagep, gfp_t gfp, 995 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
996 struct shmem_inode_info *info, pgoff_t index) 996 struct shmem_inode_info *info, pgoff_t index)
997 { 997 {
998 struct page *oldpage, *newpage; 998 struct page *oldpage, *newpage;
999 struct address_space *swap_mapping; 999 struct address_space *swap_mapping;
1000 pgoff_t swap_index; 1000 pgoff_t swap_index;
1001 int error; 1001 int error;
1002 1002
1003 oldpage = *pagep; 1003 oldpage = *pagep;
1004 swap_index = page_private(oldpage); 1004 swap_index = page_private(oldpage);
1005 swap_mapping = page_mapping(oldpage); 1005 swap_mapping = page_mapping(oldpage);
1006 1006
1007 /* 1007 /*
1008 * We have arrived here because our zones are constrained, so don't 1008 * We have arrived here because our zones are constrained, so don't
1009 * limit chance of success by further cpuset and node constraints. 1009 * limit chance of success by further cpuset and node constraints.
1010 */ 1010 */
1011 gfp &= ~GFP_CONSTRAINT_MASK; 1011 gfp &= ~GFP_CONSTRAINT_MASK;
1012 newpage = shmem_alloc_page(gfp, info, index); 1012 newpage = shmem_alloc_page(gfp, info, index);
1013 if (!newpage) 1013 if (!newpage)
1014 return -ENOMEM; 1014 return -ENOMEM;
1015 1015
1016 page_cache_get(newpage); 1016 page_cache_get(newpage);
1017 copy_highpage(newpage, oldpage); 1017 copy_highpage(newpage, oldpage);
1018 flush_dcache_page(newpage); 1018 flush_dcache_page(newpage);
1019 1019
1020 __set_page_locked(newpage); 1020 __set_page_locked(newpage);
1021 SetPageUptodate(newpage); 1021 SetPageUptodate(newpage);
1022 SetPageSwapBacked(newpage); 1022 SetPageSwapBacked(newpage);
1023 set_page_private(newpage, swap_index); 1023 set_page_private(newpage, swap_index);
1024 SetPageSwapCache(newpage); 1024 SetPageSwapCache(newpage);
1025 1025
1026 /* 1026 /*
1027 * Our caller will very soon move newpage out of swapcache, but it's 1027 * Our caller will very soon move newpage out of swapcache, but it's
1028 * a nice clean interface for us to replace oldpage by newpage there. 1028 * a nice clean interface for us to replace oldpage by newpage there.
1029 */ 1029 */
1030 spin_lock_irq(&swap_mapping->tree_lock); 1030 spin_lock_irq(&swap_mapping->tree_lock);
1031 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, 1031 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1032 newpage); 1032 newpage);
1033 if (!error) { 1033 if (!error) {
1034 __inc_zone_page_state(newpage, NR_FILE_PAGES); 1034 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1035 __dec_zone_page_state(oldpage, NR_FILE_PAGES); 1035 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1036 } 1036 }
1037 spin_unlock_irq(&swap_mapping->tree_lock); 1037 spin_unlock_irq(&swap_mapping->tree_lock);
1038 1038
1039 if (unlikely(error)) { 1039 if (unlikely(error)) {
1040 /* 1040 /*
1041 * Is this possible? I think not, now that our callers check 1041 * Is this possible? I think not, now that our callers check
1042 * both PageSwapCache and page_private after getting page lock; 1042 * both PageSwapCache and page_private after getting page lock;
1043 * but be defensive. Reverse old to newpage for clear and free. 1043 * but be defensive. Reverse old to newpage for clear and free.
1044 */ 1044 */
1045 oldpage = newpage; 1045 oldpage = newpage;
1046 } else { 1046 } else {
1047 mem_cgroup_replace_page_cache(oldpage, newpage); 1047 mem_cgroup_replace_page_cache(oldpage, newpage);
1048 lru_cache_add_anon(newpage); 1048 lru_cache_add_anon(newpage);
1049 *pagep = newpage; 1049 *pagep = newpage;
1050 } 1050 }
1051 1051
1052 ClearPageSwapCache(oldpage); 1052 ClearPageSwapCache(oldpage);
1053 set_page_private(oldpage, 0); 1053 set_page_private(oldpage, 0);
1054 1054
1055 unlock_page(oldpage); 1055 unlock_page(oldpage);
1056 page_cache_release(oldpage); 1056 page_cache_release(oldpage);
1057 page_cache_release(oldpage); 1057 page_cache_release(oldpage);
1058 return error; 1058 return error;
1059 } 1059 }
1060 1060
1061 /* 1061 /*
1062 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1062 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1063 * 1063 *
1064 * If we allocate a new one we do not mark it dirty. That's up to the 1064 * If we allocate a new one we do not mark it dirty. That's up to the
1065 * vm. If we swap it in we mark it dirty, since we also free the swap 1065 * vm. If we swap it in we mark it dirty, since we also free the swap
1066 * entry: a page cannot live in both the swap cache and page cache. 1066 * entry: a page cannot live in both the swap cache and page cache.
1067 */ 1067 */
1068 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 1068 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1069 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) 1069 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1070 { 1070 {
1071 struct address_space *mapping = inode->i_mapping; 1071 struct address_space *mapping = inode->i_mapping;
1072 struct shmem_inode_info *info; 1072 struct shmem_inode_info *info;
1073 struct shmem_sb_info *sbinfo; 1073 struct shmem_sb_info *sbinfo;
1074 struct page *page; 1074 struct page *page;
1075 swp_entry_t swap; 1075 swp_entry_t swap;
1076 int error; 1076 int error;
1077 int once = 0; 1077 int once = 0;
1078 int alloced = 0; 1078 int alloced = 0;
1079 1079
1080 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 1080 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1081 return -EFBIG; 1081 return -EFBIG;
1082 repeat: 1082 repeat:
1083 swap.val = 0; 1083 swap.val = 0;
1084 page = find_lock_page(mapping, index); 1084 page = find_lock_page(mapping, index);
1085 if (radix_tree_exceptional_entry(page)) { 1085 if (radix_tree_exceptional_entry(page)) {
1086 swap = radix_to_swp_entry(page); 1086 swap = radix_to_swp_entry(page);
1087 page = NULL; 1087 page = NULL;
1088 } 1088 }
1089 1089
1090 if (sgp != SGP_WRITE && sgp != SGP_FALLOC && 1090 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1091 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1091 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1092 error = -EINVAL; 1092 error = -EINVAL;
1093 goto failed; 1093 goto failed;
1094 } 1094 }
1095 1095
1096 /* fallocated page? */ 1096 /* fallocated page? */
1097 if (page && !PageUptodate(page)) { 1097 if (page && !PageUptodate(page)) {
1098 if (sgp != SGP_READ) 1098 if (sgp != SGP_READ)
1099 goto clear; 1099 goto clear;
1100 unlock_page(page); 1100 unlock_page(page);
1101 page_cache_release(page); 1101 page_cache_release(page);
1102 page = NULL; 1102 page = NULL;
1103 } 1103 }
1104 if (page || (sgp == SGP_READ && !swap.val)) { 1104 if (page || (sgp == SGP_READ && !swap.val)) {
1105 *pagep = page; 1105 *pagep = page;
1106 return 0; 1106 return 0;
1107 } 1107 }
1108 1108
1109 /* 1109 /*
1110 * Fast cache lookup did not find it: 1110 * Fast cache lookup did not find it:
1111 * bring it back from swap or allocate. 1111 * bring it back from swap or allocate.
1112 */ 1112 */
1113 info = SHMEM_I(inode); 1113 info = SHMEM_I(inode);
1114 sbinfo = SHMEM_SB(inode->i_sb); 1114 sbinfo = SHMEM_SB(inode->i_sb);
1115 1115
1116 if (swap.val) { 1116 if (swap.val) {
1117 /* Look it up and read it in... */ 1117 /* Look it up and read it in... */
1118 page = lookup_swap_cache(swap); 1118 page = lookup_swap_cache(swap);
1119 if (!page) { 1119 if (!page) {
1120 /* here we actually do the io */ 1120 /* here we actually do the io */
1121 if (fault_type) 1121 if (fault_type)
1122 *fault_type |= VM_FAULT_MAJOR; 1122 *fault_type |= VM_FAULT_MAJOR;
1123 page = shmem_swapin(swap, gfp, info, index); 1123 page = shmem_swapin(swap, gfp, info, index);
1124 if (!page) { 1124 if (!page) {
1125 error = -ENOMEM; 1125 error = -ENOMEM;
1126 goto failed; 1126 goto failed;
1127 } 1127 }
1128 } 1128 }
1129 1129
1130 /* We have to do this with page locked to prevent races */ 1130 /* We have to do this with page locked to prevent races */
1131 lock_page(page); 1131 lock_page(page);
1132 if (!PageSwapCache(page) || page_private(page) != swap.val || 1132 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1133 !shmem_confirm_swap(mapping, index, swap)) { 1133 !shmem_confirm_swap(mapping, index, swap)) {
1134 error = -EEXIST; /* try again */ 1134 error = -EEXIST; /* try again */
1135 goto unlock; 1135 goto unlock;
1136 } 1136 }
1137 if (!PageUptodate(page)) { 1137 if (!PageUptodate(page)) {
1138 error = -EIO; 1138 error = -EIO;
1139 goto failed; 1139 goto failed;
1140 } 1140 }
1141 wait_on_page_writeback(page); 1141 wait_on_page_writeback(page);
1142 1142
1143 if (shmem_should_replace_page(page, gfp)) { 1143 if (shmem_should_replace_page(page, gfp)) {
1144 error = shmem_replace_page(&page, gfp, info, index); 1144 error = shmem_replace_page(&page, gfp, info, index);
1145 if (error) 1145 if (error)
1146 goto failed; 1146 goto failed;
1147 } 1147 }
1148 1148
1149 error = mem_cgroup_cache_charge(page, current->mm, 1149 error = mem_cgroup_cache_charge(page, current->mm,
1150 gfp & GFP_RECLAIM_MASK); 1150 gfp & GFP_RECLAIM_MASK);
1151 if (!error) { 1151 if (!error) {
1152 error = shmem_add_to_page_cache(page, mapping, index, 1152 error = shmem_add_to_page_cache(page, mapping, index,
1153 gfp, swp_to_radix_entry(swap)); 1153 gfp, swp_to_radix_entry(swap));
1154 /* 1154 /*
1155 * We already confirmed swap under page lock, and make 1155 * We already confirmed swap under page lock, and make
1156 * no memory allocation here, so usually no possibility 1156 * no memory allocation here, so usually no possibility
1157 * of error; but free_swap_and_cache() only trylocks a 1157 * of error; but free_swap_and_cache() only trylocks a
1158 * page, so it is just possible that the entry has been 1158 * page, so it is just possible that the entry has been
1159 * truncated or holepunched since swap was confirmed. 1159 * truncated or holepunched since swap was confirmed.
1160 * shmem_undo_range() will have done some of the 1160 * shmem_undo_range() will have done some of the
1161 * unaccounting, now delete_from_swap_cache() will do 1161 * unaccounting, now delete_from_swap_cache() will do
1162 * the rest (including mem_cgroup_uncharge_swapcache). 1162 * the rest (including mem_cgroup_uncharge_swapcache).
1163 * Reset swap.val? No, leave it so "failed" goes back to 1163 * Reset swap.val? No, leave it so "failed" goes back to
1164 * "repeat": reading a hole and writing should succeed. 1164 * "repeat": reading a hole and writing should succeed.
1165 */ 1165 */
1166 if (error) 1166 if (error)
1167 delete_from_swap_cache(page); 1167 delete_from_swap_cache(page);
1168 } 1168 }
1169 if (error) 1169 if (error)
1170 goto failed; 1170 goto failed;
1171 1171
1172 spin_lock(&info->lock); 1172 spin_lock(&info->lock);
1173 info->swapped--; 1173 info->swapped--;
1174 shmem_recalc_inode(inode); 1174 shmem_recalc_inode(inode);
1175 spin_unlock(&info->lock); 1175 spin_unlock(&info->lock);
1176 1176
1177 delete_from_swap_cache(page); 1177 delete_from_swap_cache(page);
1178 set_page_dirty(page); 1178 set_page_dirty(page);
1179 swap_free(swap); 1179 swap_free(swap);
1180 1180
1181 } else { 1181 } else {
1182 if (shmem_acct_block(info->flags)) { 1182 if (shmem_acct_block(info->flags)) {
1183 error = -ENOSPC; 1183 error = -ENOSPC;
1184 goto failed; 1184 goto failed;
1185 } 1185 }
1186 if (sbinfo->max_blocks) { 1186 if (sbinfo->max_blocks) {
1187 if (percpu_counter_compare(&sbinfo->used_blocks, 1187 if (percpu_counter_compare(&sbinfo->used_blocks,
1188 sbinfo->max_blocks) >= 0) { 1188 sbinfo->max_blocks) >= 0) {
1189 error = -ENOSPC; 1189 error = -ENOSPC;
1190 goto unacct; 1190 goto unacct;
1191 } 1191 }
1192 percpu_counter_inc(&sbinfo->used_blocks); 1192 percpu_counter_inc(&sbinfo->used_blocks);
1193 } 1193 }
1194 1194
1195 page = shmem_alloc_page(gfp, info, index); 1195 page = shmem_alloc_page(gfp, info, index);
1196 if (!page) { 1196 if (!page) {
1197 error = -ENOMEM; 1197 error = -ENOMEM;
1198 goto decused; 1198 goto decused;
1199 } 1199 }
1200 1200
1201 SetPageSwapBacked(page); 1201 SetPageSwapBacked(page);
1202 __set_page_locked(page); 1202 __set_page_locked(page);
1203 error = mem_cgroup_cache_charge(page, current->mm, 1203 error = mem_cgroup_cache_charge(page, current->mm,
1204 gfp & GFP_RECLAIM_MASK); 1204 gfp & GFP_RECLAIM_MASK);
1205 if (error) 1205 if (error)
1206 goto decused; 1206 goto decused;
1207 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 1207 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1208 if (!error) { 1208 if (!error) {
1209 error = shmem_add_to_page_cache(page, mapping, index, 1209 error = shmem_add_to_page_cache(page, mapping, index,
1210 gfp, NULL); 1210 gfp, NULL);
1211 radix_tree_preload_end(); 1211 radix_tree_preload_end();
1212 } 1212 }
1213 if (error) { 1213 if (error) {
1214 mem_cgroup_uncharge_cache_page(page); 1214 mem_cgroup_uncharge_cache_page(page);
1215 goto decused; 1215 goto decused;
1216 } 1216 }
1217 lru_cache_add_anon(page); 1217 lru_cache_add_anon(page);
1218 1218
1219 spin_lock(&info->lock); 1219 spin_lock(&info->lock);
1220 info->alloced++; 1220 info->alloced++;
1221 inode->i_blocks += BLOCKS_PER_PAGE; 1221 inode->i_blocks += BLOCKS_PER_PAGE;
1222 shmem_recalc_inode(inode); 1222 shmem_recalc_inode(inode);
1223 spin_unlock(&info->lock); 1223 spin_unlock(&info->lock);
1224 alloced = true; 1224 alloced = true;
1225 1225
1226 /* 1226 /*
1227 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1227 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1228 */ 1228 */
1229 if (sgp == SGP_FALLOC) 1229 if (sgp == SGP_FALLOC)
1230 sgp = SGP_WRITE; 1230 sgp = SGP_WRITE;
1231 clear: 1231 clear:
1232 /* 1232 /*
1233 * Let SGP_WRITE caller clear ends if write does not fill page; 1233 * Let SGP_WRITE caller clear ends if write does not fill page;
1234 * but SGP_FALLOC on a page fallocated earlier must initialize 1234 * but SGP_FALLOC on a page fallocated earlier must initialize
1235 * it now, lest undo on failure cancel our earlier guarantee. 1235 * it now, lest undo on failure cancel our earlier guarantee.
1236 */ 1236 */
1237 if (sgp != SGP_WRITE) { 1237 if (sgp != SGP_WRITE) {
1238 clear_highpage(page); 1238 clear_highpage(page);
1239 flush_dcache_page(page); 1239 flush_dcache_page(page);
1240 SetPageUptodate(page); 1240 SetPageUptodate(page);
1241 } 1241 }
1242 if (sgp == SGP_DIRTY) 1242 if (sgp == SGP_DIRTY)
1243 set_page_dirty(page); 1243 set_page_dirty(page);
1244 } 1244 }
1245 1245
1246 /* Perhaps the file has been truncated since we checked */ 1246 /* Perhaps the file has been truncated since we checked */
1247 if (sgp != SGP_WRITE && sgp != SGP_FALLOC && 1247 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1248 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1248 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1249 error = -EINVAL; 1249 error = -EINVAL;
1250 if (alloced) 1250 if (alloced)
1251 goto trunc; 1251 goto trunc;
1252 else 1252 else
1253 goto failed; 1253 goto failed;
1254 } 1254 }
1255 *pagep = page; 1255 *pagep = page;
1256 return 0; 1256 return 0;
1257 1257
1258 /* 1258 /*
1259 * Error recovery. 1259 * Error recovery.
1260 */ 1260 */
1261 trunc: 1261 trunc:
1262 info = SHMEM_I(inode); 1262 info = SHMEM_I(inode);
1263 ClearPageDirty(page); 1263 ClearPageDirty(page);
1264 delete_from_page_cache(page); 1264 delete_from_page_cache(page);
1265 spin_lock(&info->lock); 1265 spin_lock(&info->lock);
1266 info->alloced--; 1266 info->alloced--;
1267 inode->i_blocks -= BLOCKS_PER_PAGE; 1267 inode->i_blocks -= BLOCKS_PER_PAGE;
1268 spin_unlock(&info->lock); 1268 spin_unlock(&info->lock);
1269 decused: 1269 decused:
1270 sbinfo = SHMEM_SB(inode->i_sb); 1270 sbinfo = SHMEM_SB(inode->i_sb);
1271 if (sbinfo->max_blocks) 1271 if (sbinfo->max_blocks)
1272 percpu_counter_add(&sbinfo->used_blocks, -1); 1272 percpu_counter_add(&sbinfo->used_blocks, -1);
1273 unacct: 1273 unacct:
1274 shmem_unacct_blocks(info->flags, 1); 1274 shmem_unacct_blocks(info->flags, 1);
1275 failed: 1275 failed:
1276 if (swap.val && error != -EINVAL && 1276 if (swap.val && error != -EINVAL &&
1277 !shmem_confirm_swap(mapping, index, swap)) 1277 !shmem_confirm_swap(mapping, index, swap))
1278 error = -EEXIST; 1278 error = -EEXIST;
1279 unlock: 1279 unlock:
1280 if (page) { 1280 if (page) {
1281 unlock_page(page); 1281 unlock_page(page);
1282 page_cache_release(page); 1282 page_cache_release(page);
1283 } 1283 }
1284 if (error == -ENOSPC && !once++) { 1284 if (error == -ENOSPC && !once++) {
1285 info = SHMEM_I(inode); 1285 info = SHMEM_I(inode);
1286 spin_lock(&info->lock); 1286 spin_lock(&info->lock);
1287 shmem_recalc_inode(inode); 1287 shmem_recalc_inode(inode);
1288 spin_unlock(&info->lock); 1288 spin_unlock(&info->lock);
1289 goto repeat; 1289 goto repeat;
1290 } 1290 }
1291 if (error == -EEXIST) /* from above or from radix_tree_insert */ 1291 if (error == -EEXIST) /* from above or from radix_tree_insert */
1292 goto repeat; 1292 goto repeat;
1293 return error; 1293 return error;
1294 } 1294 }
1295 1295
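The tail of shmem_getpage_gfp() loops back through "repeat" in two cases: once only after -ENOSPC, hoping shmem_recalc_inode() released blocks, and unconditionally on -EEXIST, which means the radix-tree slot changed under us. A compact sketch of that retry shape; get_page_once() is a hypothetical stand-in for a single lookup pass:

#include <errno.h>
#include <stdio.h>

/* Hypothetical single attempt: fails twice with -EEXIST, then succeeds. */
static int get_page_once(void)
{
	static int calls;
	return (calls++ < 2) ? -EEXIST : 0;
}

int main(void)
{
	int once = 0;
	int error;
repeat:
	error = get_page_once();
	if (error == -ENOSPC && !once++)
		goto repeat;    /* one retry after recalculating usage */
	if (error == -EEXIST)
		goto repeat;    /* raced with truncate/swap: look up again */
	printf("done: error=%d\n", error);
	return 0;
}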
1296 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1296 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1297 { 1297 {
1298 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1298 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1299 int error; 1299 int error;
1300 int ret = VM_FAULT_LOCKED; 1300 int ret = VM_FAULT_LOCKED;
1301 1301
1302 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1302 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1303 if (error) 1303 if (error)
1304 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1304 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1305 1305
1306 if (ret & VM_FAULT_MAJOR) { 1306 if (ret & VM_FAULT_MAJOR) {
1307 count_vm_event(PGMAJFAULT); 1307 count_vm_event(PGMAJFAULT);
1308 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1308 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1309 } 1309 }
1310 return ret; 1310 return ret;
1311 } 1311 }
1312 1312
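Seen from userspace, shmem_fault() runs on the first touch of a mapped tmpfs page. A minimal demo, assuming a tmpfs mount at /dev/shm (standard on most Linux systems) and a hypothetical file name:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/demo", O_RDWR | O_CREAT, 0600);
	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* First touch of the page faults into shmem_fault() in the kernel */
	strcpy(p, "hello tmpfs");
	printf("%s\n", p);

	munmap(p, 4096);
	close(fd);
	unlink("/dev/shm/demo");
	return 0;
}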
1313 #ifdef CONFIG_NUMA 1313 #ifdef CONFIG_NUMA
1314 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 1314 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1315 { 1315 {
1316 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1316 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1317 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 1317 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1318 } 1318 }
1319 1319
1320 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1320 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1321 unsigned long addr) 1321 unsigned long addr)
1322 { 1322 {
1323 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1323 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1324 pgoff_t index; 1324 pgoff_t index;
1325 1325
1326 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1326 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1327 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 1327 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1328 } 1328 }
1329 #endif 1329 #endif
1330 1330
1331 int shmem_lock(struct file *file, int lock, struct user_struct *user) 1331 int shmem_lock(struct file *file, int lock, struct user_struct *user)
1332 { 1332 {
1333 struct inode *inode = file->f_path.dentry->d_inode; 1333 struct inode *inode = file->f_path.dentry->d_inode;
1334 struct shmem_inode_info *info = SHMEM_I(inode); 1334 struct shmem_inode_info *info = SHMEM_I(inode);
1335 int retval = -ENOMEM; 1335 int retval = -ENOMEM;
1336 1336
1337 spin_lock(&info->lock); 1337 spin_lock(&info->lock);
1338 if (lock && !(info->flags & VM_LOCKED)) { 1338 if (lock && !(info->flags & VM_LOCKED)) {
1339 if (!user_shm_lock(inode->i_size, user)) 1339 if (!user_shm_lock(inode->i_size, user))
1340 goto out_nomem; 1340 goto out_nomem;
1341 info->flags |= VM_LOCKED; 1341 info->flags |= VM_LOCKED;
1342 mapping_set_unevictable(file->f_mapping); 1342 mapping_set_unevictable(file->f_mapping);
1343 } 1343 }
1344 if (!lock && (info->flags & VM_LOCKED) && user) { 1344 if (!lock && (info->flags & VM_LOCKED) && user) {
1345 user_shm_unlock(inode->i_size, user); 1345 user_shm_unlock(inode->i_size, user);
1346 info->flags &= ~VM_LOCKED; 1346 info->flags &= ~VM_LOCKED;
1347 mapping_clear_unevictable(file->f_mapping); 1347 mapping_clear_unevictable(file->f_mapping);
1348 } 1348 }
1349 retval = 0; 1349 retval = 0;
1350 1350
1351 out_nomem: 1351 out_nomem:
1352 spin_unlock(&info->lock); 1352 spin_unlock(&info->lock);
1353 return retval; 1353 return retval;
1354 } 1354 }
1355 1355
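shmem_lock() is reached from the SysV IPC path when userspace calls shmctl(SHM_LOCK), pinning the segment's pages (subject to RLIMIT_MEMLOCK) and marking the mapping unevictable. A minimal caller:

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	/* Private 64KB SysV shared memory segment */
	int id = shmget(IPC_PRIVATE, 64 * 1024, IPC_CREAT | 0600);
	if (id < 0) {
		perror("shmget");
		return 1;
	}

	/* Ends up in shmem_lock(file, 1, user): pages become unevictable */
	if (shmctl(id, SHM_LOCK, NULL) < 0)
		perror("shmctl(SHM_LOCK)");   /* may fail on RLIMIT_MEMLOCK */
	else if (shmctl(id, SHM_UNLOCK, NULL) < 0)
		perror("shmctl(SHM_UNLOCK)");

	shmctl(id, IPC_RMID, NULL);
	return 0;
}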
1356 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1356 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1357 { 1357 {
1358 file_accessed(file); 1358 file_accessed(file);
1359 vma->vm_ops = &shmem_vm_ops; 1359 vma->vm_ops = &shmem_vm_ops;
1360 return 0; 1360 return 0;
1361 } 1361 }
1362 1362
1363 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, 1363 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
1364 umode_t mode, dev_t dev, unsigned long flags) 1364 umode_t mode, dev_t dev, unsigned long flags)
1365 { 1365 {
1366 struct inode *inode; 1366 struct inode *inode;
1367 struct shmem_inode_info *info; 1367 struct shmem_inode_info *info;
1368 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1368 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1369 1369
1370 if (shmem_reserve_inode(sb)) 1370 if (shmem_reserve_inode(sb))
1371 return NULL; 1371 return NULL;
1372 1372
1373 inode = new_inode(sb); 1373 inode = new_inode(sb);
1374 if (inode) { 1374 if (inode) {
1375 inode->i_ino = get_next_ino(); 1375 inode->i_ino = get_next_ino();
1376 inode_init_owner(inode, dir, mode); 1376 inode_init_owner(inode, dir, mode);
1377 inode->i_blocks = 0; 1377 inode->i_blocks = 0;
1378 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1378 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1379 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1379 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1380 inode->i_generation = get_seconds(); 1380 inode->i_generation = get_seconds();
1381 info = SHMEM_I(inode); 1381 info = SHMEM_I(inode);
1382 memset(info, 0, (char *)inode - (char *)info); 1382 memset(info, 0, (char *)inode - (char *)info);
1383 spin_lock_init(&info->lock); 1383 spin_lock_init(&info->lock);
1384 info->flags = flags & VM_NORESERVE; 1384 info->flags = flags & VM_NORESERVE;
1385 INIT_LIST_HEAD(&info->swaplist); 1385 INIT_LIST_HEAD(&info->swaplist);
1386 simple_xattrs_init(&info->xattrs); 1386 simple_xattrs_init(&info->xattrs);
1387 cache_no_acl(inode); 1387 cache_no_acl(inode);
1388 1388
1389 switch (mode & S_IFMT) { 1389 switch (mode & S_IFMT) {
1390 default: 1390 default:
1391 inode->i_op = &shmem_special_inode_operations; 1391 inode->i_op = &shmem_special_inode_operations;
1392 init_special_inode(inode, mode, dev); 1392 init_special_inode(inode, mode, dev);
1393 break; 1393 break;
1394 case S_IFREG: 1394 case S_IFREG:
1395 inode->i_mapping->a_ops = &shmem_aops; 1395 inode->i_mapping->a_ops = &shmem_aops;
1396 inode->i_op = &shmem_inode_operations; 1396 inode->i_op = &shmem_inode_operations;
1397 inode->i_fop = &shmem_file_operations; 1397 inode->i_fop = &shmem_file_operations;
1398 mpol_shared_policy_init(&info->policy, 1398 mpol_shared_policy_init(&info->policy,
1399 shmem_get_sbmpol(sbinfo)); 1399 shmem_get_sbmpol(sbinfo));
1400 break; 1400 break;
1401 case S_IFDIR: 1401 case S_IFDIR:
1402 inc_nlink(inode); 1402 inc_nlink(inode);
1403 /* Some things misbehave if size == 0 on a directory */ 1403 /* Some things misbehave if size == 0 on a directory */
1404 inode->i_size = 2 * BOGO_DIRENT_SIZE; 1404 inode->i_size = 2 * BOGO_DIRENT_SIZE;
1405 inode->i_op = &shmem_dir_inode_operations; 1405 inode->i_op = &shmem_dir_inode_operations;
1406 inode->i_fop = &simple_dir_operations; 1406 inode->i_fop = &simple_dir_operations;
1407 break; 1407 break;
1408 case S_IFLNK: 1408 case S_IFLNK:
1409 /* 1409 /*
1410 * Must not load anything in the rbtree, 1410 * Must not load anything in the rbtree,
1411 * mpol_free_shared_policy will not be called. 1411 * mpol_free_shared_policy will not be called.
1412 */ 1412 */
1413 mpol_shared_policy_init(&info->policy, NULL); 1413 mpol_shared_policy_init(&info->policy, NULL);
1414 break; 1414 break;
1415 } 1415 }
1416 } else 1416 } else
1417 shmem_free_inode(sb); 1417 shmem_free_inode(sb);
1418 return inode; 1418 return inode;
1419 } 1419 }
1420 1420
1421 #ifdef CONFIG_TMPFS 1421 #ifdef CONFIG_TMPFS
1422 static const struct inode_operations shmem_symlink_inode_operations; 1422 static const struct inode_operations shmem_symlink_inode_operations;
1423 static const struct inode_operations shmem_short_symlink_operations; 1423 static const struct inode_operations shmem_short_symlink_operations;
1424 1424
1425 #ifdef CONFIG_TMPFS_XATTR 1425 #ifdef CONFIG_TMPFS_XATTR
1426 static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 1426 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1427 #else 1427 #else
1428 #define shmem_initxattrs NULL 1428 #define shmem_initxattrs NULL
1429 #endif 1429 #endif
1430 1430
1431 static int 1431 static int
1432 shmem_write_begin(struct file *file, struct address_space *mapping, 1432 shmem_write_begin(struct file *file, struct address_space *mapping,
1433 loff_t pos, unsigned len, unsigned flags, 1433 loff_t pos, unsigned len, unsigned flags,
1434 struct page **pagep, void **fsdata) 1434 struct page **pagep, void **fsdata)
1435 { 1435 {
1436 struct inode *inode = mapping->host; 1436 struct inode *inode = mapping->host;
1437 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1437 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1438 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1438 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1439 } 1439 }
1440 1440
1441 static int 1441 static int
1442 shmem_write_end(struct file *file, struct address_space *mapping, 1442 shmem_write_end(struct file *file, struct address_space *mapping,
1443 loff_t pos, unsigned len, unsigned copied, 1443 loff_t pos, unsigned len, unsigned copied,
1444 struct page *page, void *fsdata) 1444 struct page *page, void *fsdata)
1445 { 1445 {
1446 struct inode *inode = mapping->host; 1446 struct inode *inode = mapping->host;
1447 1447
1448 if (pos + copied > inode->i_size) 1448 if (pos + copied > inode->i_size)
1449 i_size_write(inode, pos + copied); 1449 i_size_write(inode, pos + copied);
1450 1450
1451 if (!PageUptodate(page)) { 1451 if (!PageUptodate(page)) {
1452 if (copied < PAGE_CACHE_SIZE) { 1452 if (copied < PAGE_CACHE_SIZE) {
1453 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1453 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1454 zero_user_segments(page, 0, from, 1454 zero_user_segments(page, 0, from,
1455 from + copied, PAGE_CACHE_SIZE); 1455 from + copied, PAGE_CACHE_SIZE);
1456 } 1456 }
1457 SetPageUptodate(page); 1457 SetPageUptodate(page);
1458 } 1458 }
1459 set_page_dirty(page); 1459 set_page_dirty(page);
1460 unlock_page(page); 1460 unlock_page(page);
1461 page_cache_release(page); 1461 page_cache_release(page);
1462 1462
1463 return copied; 1463 return copied;
1464 } 1464 }
1465 1465
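In the !PageUptodate branch of shmem_write_end(), a short write must zero the two slices of the page it did not cover: [0, from) and [from + copied, PAGE_CACHE_SIZE). A quick standalone check of that arithmetic, assuming 4096-byte pages and hypothetical pos/copied values:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long pos = 8300, copied = 100;   /* hypothetical write */

	unsigned long from = pos & (PAGE_SIZE - 1);
	printf("zero [0, %lu) and [%lu, %lu)\n",
	       from, from + copied, PAGE_SIZE);
	/* pos 8300 -> from 108: zero [0,108) and [208,4096) */
	return 0;
}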
1466 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1466 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1467 { 1467 {
1468 struct inode *inode = filp->f_path.dentry->d_inode; 1468 struct inode *inode = filp->f_path.dentry->d_inode;
1469 struct address_space *mapping = inode->i_mapping; 1469 struct address_space *mapping = inode->i_mapping;
1470 pgoff_t index; 1470 pgoff_t index;
1471 unsigned long offset; 1471 unsigned long offset;
1472 enum sgp_type sgp = SGP_READ; 1472 enum sgp_type sgp = SGP_READ;
1473 1473
1474 /* 1474 /*
1475 * Might this read be for a stacking filesystem? Then when reading 1475 * Might this read be for a stacking filesystem? Then when reading
1476 * holes of a sparse file, we actually need to allocate those pages, 1476 * holes of a sparse file, we actually need to allocate those pages,
1477 * and even mark them dirty, so it cannot exceed the max_blocks limit. 1477 * and even mark them dirty, so it cannot exceed the max_blocks limit.
1478 */ 1478 */
1479 if (segment_eq(get_fs(), KERNEL_DS)) 1479 if (segment_eq(get_fs(), KERNEL_DS))
1480 sgp = SGP_DIRTY; 1480 sgp = SGP_DIRTY;
1481 1481
1482 index = *ppos >> PAGE_CACHE_SHIFT; 1482 index = *ppos >> PAGE_CACHE_SHIFT;
1483 offset = *ppos & ~PAGE_CACHE_MASK; 1483 offset = *ppos & ~PAGE_CACHE_MASK;
1484 1484
1485 for (;;) { 1485 for (;;) {
1486 struct page *page = NULL; 1486 struct page *page = NULL;
1487 pgoff_t end_index; 1487 pgoff_t end_index;
1488 unsigned long nr, ret; 1488 unsigned long nr, ret;
1489 loff_t i_size = i_size_read(inode); 1489 loff_t i_size = i_size_read(inode);
1490 1490
1491 end_index = i_size >> PAGE_CACHE_SHIFT; 1491 end_index = i_size >> PAGE_CACHE_SHIFT;
1492 if (index > end_index) 1492 if (index > end_index)
1493 break; 1493 break;
1494 if (index == end_index) { 1494 if (index == end_index) {
1495 nr = i_size & ~PAGE_CACHE_MASK; 1495 nr = i_size & ~PAGE_CACHE_MASK;
1496 if (nr <= offset) 1496 if (nr <= offset)
1497 break; 1497 break;
1498 } 1498 }
1499 1499
1500 desc->error = shmem_getpage(inode, index, &page, sgp, NULL); 1500 desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
1501 if (desc->error) { 1501 if (desc->error) {
1502 if (desc->error == -EINVAL) 1502 if (desc->error == -EINVAL)
1503 desc->error = 0; 1503 desc->error = 0;
1504 break; 1504 break;
1505 } 1505 }
1506 if (page) 1506 if (page)
1507 unlock_page(page); 1507 unlock_page(page);
1508 1508
1509 /* 1509 /*
1510 * We must evaluate i_size afterwards, since reads (unlike writes) 1510 * We must evaluate i_size afterwards, since reads (unlike writes)
1511 * are called without i_mutex protection against truncate. 1511 * are called without i_mutex protection against truncate.
1512 */ 1512 */
1513 nr = PAGE_CACHE_SIZE; 1513 nr = PAGE_CACHE_SIZE;
1514 i_size = i_size_read(inode); 1514 i_size = i_size_read(inode);
1515 end_index = i_size >> PAGE_CACHE_SHIFT; 1515 end_index = i_size >> PAGE_CACHE_SHIFT;
1516 if (index == end_index) { 1516 if (index == end_index) {
1517 nr = i_size & ~PAGE_CACHE_MASK; 1517 nr = i_size & ~PAGE_CACHE_MASK;
1518 if (nr <= offset) { 1518 if (nr <= offset) {
1519 if (page) 1519 if (page)
1520 page_cache_release(page); 1520 page_cache_release(page);
1521 break; 1521 break;
1522 } 1522 }
1523 } 1523 }
1524 nr -= offset; 1524 nr -= offset;
1525 1525
1526 if (page) { 1526 if (page) {
1527 /* 1527 /*
1528 * If users can be writing to this page using arbitrary 1528 * If users can be writing to this page using arbitrary
1529 * virtual addresses, take care about potential aliasing 1529 * virtual addresses, take care about potential aliasing
1530 * before reading the page on the kernel side. 1530 * before reading the page on the kernel side.
1531 */ 1531 */
1532 if (mapping_writably_mapped(mapping)) 1532 if (mapping_writably_mapped(mapping))
1533 flush_dcache_page(page); 1533 flush_dcache_page(page);
1534 /* 1534 /*
1535 * Mark the page accessed if we read the beginning. 1535 * Mark the page accessed if we read the beginning.
1536 */ 1536 */
1537 if (!offset) 1537 if (!offset)
1538 mark_page_accessed(page); 1538 mark_page_accessed(page);
1539 } else { 1539 } else {
1540 page = ZERO_PAGE(0); 1540 page = ZERO_PAGE(0);
1541 page_cache_get(page); 1541 page_cache_get(page);
1542 } 1542 }
1543 1543
1544 /* 1544 /*
1545 * Ok, we have the page, and it's up-to-date, so 1545 * Ok, we have the page, and it's up-to-date, so
1546 * now we can copy it to user space... 1546 * now we can copy it to user space...
1547 * 1547 *
1548 * The actor routine returns how many bytes were actually used. 1548 * The actor routine returns how many bytes were actually used.
1549 * NOTE! This may not be the same as how much of a user buffer 1549 * NOTE! This may not be the same as how much of a user buffer
1550 * we filled up (we may be padding etc), so we can only update 1550 * we filled up (we may be padding etc), so we can only update
1551 * "pos" here (the actor routine has to update the user buffer 1551 * "pos" here (the actor routine has to update the user buffer
1552 * pointers and the remaining count). 1552 * pointers and the remaining count).
1553 */ 1553 */
1554 ret = actor(desc, page, offset, nr); 1554 ret = actor(desc, page, offset, nr);
1555 offset += ret; 1555 offset += ret;
1556 index += offset >> PAGE_CACHE_SHIFT; 1556 index += offset >> PAGE_CACHE_SHIFT;
1557 offset &= ~PAGE_CACHE_MASK; 1557 offset &= ~PAGE_CACHE_MASK;
1558 1558
1559 page_cache_release(page); 1559 page_cache_release(page);
1560 if (ret != nr || !desc->count) 1560 if (ret != nr || !desc->count)
1561 break; 1561 break;
1562 1562
1563 cond_resched(); 1563 cond_resched();
1564 } 1564 }
1565 1565
1566 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1566 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1567 file_accessed(filp); 1567 file_accessed(filp);
1568 } 1568 }
1569 1569
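All the position bookkeeping in do_shmem_file_read() is shift-and-mask: index is *ppos's page number, offset the byte within that page, and after the actor consumes ret bytes any overflow past a page boundary is carried into index. A standalone check of those three lines, taking PAGE_CACHE_SHIFT as 12:

#include <stdio.h>

#define SHIFT 12
#define MASK  (~((1UL << SHIFT) - 1))

int main(void)
{
	unsigned long long ppos = 12000;   /* hypothetical file position */
	unsigned long index  = ppos >> SHIFT;    /* page 2 */
	unsigned long offset = ppos & ~MASK;     /* byte 3808 within it */

	unsigned long ret = 500;           /* bytes the actor consumed */
	offset += ret;
	index  += offset >> SHIFT;         /* carry whole pages */
	offset &= ~MASK;

	printf("index=%lu offset=%lu\n", index, offset);
	/* 12000 + 500 = 12500 -> index 3, offset 212 */
	return 0;
}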
1570 static ssize_t shmem_file_aio_read(struct kiocb *iocb, 1570 static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1571 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 1571 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1572 { 1572 {
1573 struct file *filp = iocb->ki_filp; 1573 struct file *filp = iocb->ki_filp;
1574 ssize_t retval; 1574 ssize_t retval;
1575 unsigned long seg; 1575 unsigned long seg;
1576 size_t count; 1576 size_t count;
1577 loff_t *ppos = &iocb->ki_pos; 1577 loff_t *ppos = &iocb->ki_pos;
1578 1578
1579 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1579 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1580 if (retval) 1580 if (retval)
1581 return retval; 1581 return retval;
1582 1582
1583 for (seg = 0; seg < nr_segs; seg++) { 1583 for (seg = 0; seg < nr_segs; seg++) {
1584 read_descriptor_t desc; 1584 read_descriptor_t desc;
1585 1585
1586 desc.written = 0; 1586 desc.written = 0;
1587 desc.arg.buf = iov[seg].iov_base; 1587 desc.arg.buf = iov[seg].iov_base;
1588 desc.count = iov[seg].iov_len; 1588 desc.count = iov[seg].iov_len;
1589 if (desc.count == 0) 1589 if (desc.count == 0)
1590 continue; 1590 continue;
1591 desc.error = 0; 1591 desc.error = 0;
1592 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1592 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1593 retval += desc.written; 1593 retval += desc.written;
1594 if (desc.error) { 1594 if (desc.error) {
1595 retval = retval ?: desc.error; 1595 retval = retval ?: desc.error;
1596 break; 1596 break;
1597 } 1597 }
1598 if (desc.count > 0) 1598 if (desc.count > 0)
1599 break; 1599 break;
1600 } 1600 }
1601 return retval; 1601 return retval;
1602 } 1602 }
1603 1603
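Because shmem_file_aio_read() walks the iovec one segment at a time, a vectored read maps directly onto it: each non-empty segment becomes one do_shmem_file_read() pass. A minimal readv() caller against a hypothetical tmpfs file:

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/demo", O_RDONLY);
	if (fd < 0)
		return 1;

	char a[8], b[16];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};

	/* Each segment becomes one do_shmem_file_read() pass */
	ssize_t n = readv(fd, iov, 2);
	printf("read %zd bytes across 2 segments\n", n);

	close(fd);
	return 0;
}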
1604 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 1604 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1605 struct pipe_inode_info *pipe, size_t len, 1605 struct pipe_inode_info *pipe, size_t len,
1606 unsigned int flags) 1606 unsigned int flags)
1607 { 1607 {
1608 struct address_space *mapping = in->f_mapping; 1608 struct address_space *mapping = in->f_mapping;
1609 struct inode *inode = mapping->host; 1609 struct inode *inode = mapping->host;
1610 unsigned int loff, nr_pages, req_pages; 1610 unsigned int loff, nr_pages, req_pages;
1611 struct page *pages[PIPE_DEF_BUFFERS]; 1611 struct page *pages[PIPE_DEF_BUFFERS];
1612 struct partial_page partial[PIPE_DEF_BUFFERS]; 1612 struct partial_page partial[PIPE_DEF_BUFFERS];
1613 struct page *page; 1613 struct page *page;
1614 pgoff_t index, end_index; 1614 pgoff_t index, end_index;
1615 loff_t isize, left; 1615 loff_t isize, left;
1616 int error, page_nr; 1616 int error, page_nr;
1617 struct splice_pipe_desc spd = { 1617 struct splice_pipe_desc spd = {
1618 .pages = pages, 1618 .pages = pages,
1619 .partial = partial, 1619 .partial = partial,
1620 .nr_pages_max = PIPE_DEF_BUFFERS, 1620 .nr_pages_max = PIPE_DEF_BUFFERS,
1621 .flags = flags, 1621 .flags = flags,
1622 .ops = &page_cache_pipe_buf_ops, 1622 .ops = &page_cache_pipe_buf_ops,
1623 .spd_release = spd_release_page, 1623 .spd_release = spd_release_page,
1624 }; 1624 };
1625 1625
1626 isize = i_size_read(inode); 1626 isize = i_size_read(inode);
1627 if (unlikely(*ppos >= isize)) 1627 if (unlikely(*ppos >= isize))
1628 return 0; 1628 return 0;
1629 1629
1630 left = isize - *ppos; 1630 left = isize - *ppos;
1631 if (unlikely(left < len)) 1631 if (unlikely(left < len))
1632 len = left; 1632 len = left;
1633 1633
1634 if (splice_grow_spd(pipe, &spd)) 1634 if (splice_grow_spd(pipe, &spd))
1635 return -ENOMEM; 1635 return -ENOMEM;
1636 1636
1637 index = *ppos >> PAGE_CACHE_SHIFT; 1637 index = *ppos >> PAGE_CACHE_SHIFT;
1638 loff = *ppos & ~PAGE_CACHE_MASK; 1638 loff = *ppos & ~PAGE_CACHE_MASK;
1639 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1639 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1640 nr_pages = min(req_pages, pipe->buffers); 1640 nr_pages = min(req_pages, pipe->buffers);
1641 1641
1642 spd.nr_pages = find_get_pages_contig(mapping, index, 1642 spd.nr_pages = find_get_pages_contig(mapping, index,
1643 nr_pages, spd.pages); 1643 nr_pages, spd.pages);
1644 index += spd.nr_pages; 1644 index += spd.nr_pages;
1645 error = 0; 1645 error = 0;
1646 1646
1647 while (spd.nr_pages < nr_pages) { 1647 while (spd.nr_pages < nr_pages) {
1648 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); 1648 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1649 if (error) 1649 if (error)
1650 break; 1650 break;
1651 unlock_page(page); 1651 unlock_page(page);
1652 spd.pages[spd.nr_pages++] = page; 1652 spd.pages[spd.nr_pages++] = page;
1653 index++; 1653 index++;
1654 } 1654 }
1655 1655
1656 index = *ppos >> PAGE_CACHE_SHIFT; 1656 index = *ppos >> PAGE_CACHE_SHIFT;
1657 nr_pages = spd.nr_pages; 1657 nr_pages = spd.nr_pages;
1658 spd.nr_pages = 0; 1658 spd.nr_pages = 0;
1659 1659
1660 for (page_nr = 0; page_nr < nr_pages; page_nr++) { 1660 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1661 unsigned int this_len; 1661 unsigned int this_len;
1662 1662
1663 if (!len) 1663 if (!len)
1664 break; 1664 break;
1665 1665
1666 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 1666 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1667 page = spd.pages[page_nr]; 1667 page = spd.pages[page_nr];
1668 1668
1669 if (!PageUptodate(page) || page->mapping != mapping) { 1669 if (!PageUptodate(page) || page->mapping != mapping) {
1670 error = shmem_getpage(inode, index, &page, 1670 error = shmem_getpage(inode, index, &page,
1671 SGP_CACHE, NULL); 1671 SGP_CACHE, NULL);
1672 if (error) 1672 if (error)
1673 break; 1673 break;
1674 unlock_page(page); 1674 unlock_page(page);
1675 page_cache_release(spd.pages[page_nr]); 1675 page_cache_release(spd.pages[page_nr]);
1676 spd.pages[page_nr] = page; 1676 spd.pages[page_nr] = page;
1677 } 1677 }
1678 1678
1679 isize = i_size_read(inode); 1679 isize = i_size_read(inode);
1680 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 1680 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1681 if (unlikely(!isize || index > end_index)) 1681 if (unlikely(!isize || index > end_index))
1682 break; 1682 break;
1683 1683
1684 if (end_index == index) { 1684 if (end_index == index) {
1685 unsigned int plen; 1685 unsigned int plen;
1686 1686
1687 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 1687 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1688 if (plen <= loff) 1688 if (plen <= loff)
1689 break; 1689 break;
1690 1690
1691 this_len = min(this_len, plen - loff); 1691 this_len = min(this_len, plen - loff);
1692 len = this_len; 1692 len = this_len;
1693 } 1693 }
1694 1694
1695 spd.partial[page_nr].offset = loff; 1695 spd.partial[page_nr].offset = loff;
1696 spd.partial[page_nr].len = this_len; 1696 spd.partial[page_nr].len = this_len;
1697 len -= this_len; 1697 len -= this_len;
1698 loff = 0; 1698 loff = 0;
1699 spd.nr_pages++; 1699 spd.nr_pages++;
1700 index++; 1700 index++;
1701 } 1701 }
1702 1702
1703 while (page_nr < nr_pages) 1703 while (page_nr < nr_pages)
1704 page_cache_release(spd.pages[page_nr++]); 1704 page_cache_release(spd.pages[page_nr++]);
1705 1705
1706 if (spd.nr_pages) 1706 if (spd.nr_pages)
1707 error = splice_to_pipe(pipe, &spd); 1707 error = splice_to_pipe(pipe, &spd);
1708 1708
1709 splice_shrink_spd(&spd); 1709 splice_shrink_spd(&spd);
1710 1710
1711 if (error > 0) { 1711 if (error > 0) {
1712 *ppos += error; 1712 *ppos += error;
1713 file_accessed(in); 1713 file_accessed(in);
1714 } 1714 }
1715 return error; 1715 return error;
1716 } 1716 }
1717 1717
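shmem_file_splice_read() backs splice() when the source is a tmpfs file and the sink is a pipe, handing page-cache pages to pipe buffers without a copy through userspace. A minimal mover, file path hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/demo", O_RDONLY);
	int pipefd[2];
	if (fd < 0 || pipe(pipefd) < 0)
		return 1;

	/* Pages go from shmem's page cache straight into pipe buffers */
	ssize_t n = splice(fd, NULL, pipefd[1], NULL, 4096, 0);
	printf("spliced %zd bytes\n", n);

	close(pipefd[0]);
	close(pipefd[1]);
	close(fd);
	return 0;
}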
1718 /* 1718 /*
1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1720 */ 1720 */
1721 static pgoff_t shmem_seek_hole_data(struct address_space *mapping, 1721 static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1722 pgoff_t index, pgoff_t end, int origin) 1722 pgoff_t index, pgoff_t end, int whence)
1723 { 1723 {
1724 struct page *page; 1724 struct page *page;
1725 struct pagevec pvec; 1725 struct pagevec pvec;
1726 pgoff_t indices[PAGEVEC_SIZE]; 1726 pgoff_t indices[PAGEVEC_SIZE];
1727 bool done = false; 1727 bool done = false;
1728 int i; 1728 int i;
1729 1729
1730 pagevec_init(&pvec, 0); 1730 pagevec_init(&pvec, 0);
1731 pvec.nr = 1; /* start small: we may be there already */ 1731 pvec.nr = 1; /* start small: we may be there already */
1732 while (!done) { 1732 while (!done) {
1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1734 pvec.nr, pvec.pages, indices); 1734 pvec.nr, pvec.pages, indices);
1735 if (!pvec.nr) { 1735 if (!pvec.nr) {
1736 if (origin == SEEK_DATA) 1736 if (whence == SEEK_DATA)
1737 index = end; 1737 index = end;
1738 break; 1738 break;
1739 } 1739 }
1740 for (i = 0; i < pvec.nr; i++, index++) { 1740 for (i = 0; i < pvec.nr; i++, index++) {
1741 if (index < indices[i]) { 1741 if (index < indices[i]) {
1742 if (origin == SEEK_HOLE) { 1742 if (whence == SEEK_HOLE) {
1743 done = true; 1743 done = true;
1744 break; 1744 break;
1745 } 1745 }
1746 index = indices[i]; 1746 index = indices[i];
1747 } 1747 }
1748 page = pvec.pages[i]; 1748 page = pvec.pages[i];
1749 if (page && !radix_tree_exceptional_entry(page)) { 1749 if (page && !radix_tree_exceptional_entry(page)) {
1750 if (!PageUptodate(page)) 1750 if (!PageUptodate(page))
1751 page = NULL; 1751 page = NULL;
1752 } 1752 }
1753 if (index >= end || 1753 if (index >= end ||
1754 (page && origin == SEEK_DATA) || 1754 (page && whence == SEEK_DATA) ||
1755 (!page && origin == SEEK_HOLE)) { 1755 (!page && whence == SEEK_HOLE)) {
1756 done = true; 1756 done = true;
1757 break; 1757 break;
1758 } 1758 }
1759 } 1759 }
1760 shmem_deswap_pagevec(&pvec); 1760 shmem_deswap_pagevec(&pvec);
1761 pagevec_release(&pvec); 1761 pagevec_release(&pvec);
1762 pvec.nr = PAGEVEC_SIZE; 1762 pvec.nr = PAGEVEC_SIZE;
1763 cond_resched(); 1763 cond_resched();
1764 } 1764 }
1765 return index; 1765 return index;
1766 } 1766 }
1767 1767
1768 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) 1768 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1769 { 1769 {
1770 struct address_space *mapping = file->f_mapping; 1770 struct address_space *mapping = file->f_mapping;
1771 struct inode *inode = mapping->host; 1771 struct inode *inode = mapping->host;
1772 pgoff_t start, end; 1772 pgoff_t start, end;
1773 loff_t new_offset; 1773 loff_t new_offset;
1774 1774
1775 if (origin != SEEK_DATA && origin != SEEK_HOLE) 1775 if (whence != SEEK_DATA && whence != SEEK_HOLE)
1776 return generic_file_llseek_size(file, offset, origin, 1776 return generic_file_llseek_size(file, offset, whence,
1777 MAX_LFS_FILESIZE, i_size_read(inode)); 1777 MAX_LFS_FILESIZE, i_size_read(inode));
1778 mutex_lock(&inode->i_mutex); 1778 mutex_lock(&inode->i_mutex);
1779 /* We're holding i_mutex so we can access i_size directly */ 1779 /* We're holding i_mutex so we can access i_size directly */
1780 1780
1781 if (offset < 0) 1781 if (offset < 0)
1782 offset = -EINVAL; 1782 offset = -EINVAL;
1783 else if (offset >= inode->i_size) 1783 else if (offset >= inode->i_size)
1784 offset = -ENXIO; 1784 offset = -ENXIO;
1785 else { 1785 else {
1786 start = offset >> PAGE_CACHE_SHIFT; 1786 start = offset >> PAGE_CACHE_SHIFT;
1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1788 new_offset = shmem_seek_hole_data(mapping, start, end, origin); 1788 new_offset = shmem_seek_hole_data(mapping, start, end, whence);
1789 new_offset <<= PAGE_CACHE_SHIFT; 1789 new_offset <<= PAGE_CACHE_SHIFT;
1790 if (new_offset > offset) { 1790 if (new_offset > offset) {
1791 if (new_offset < inode->i_size) 1791 if (new_offset < inode->i_size)
1792 offset = new_offset; 1792 offset = new_offset;
1793 else if (origin == SEEK_DATA) 1793 else if (whence == SEEK_DATA)
1794 offset = -ENXIO; 1794 offset = -ENXIO;
1795 else 1795 else
1796 offset = inode->i_size; 1796 offset = inode->i_size;
1797 } 1797 }
1798 } 1798 }
1799 1799
1800 if (offset >= 0 && offset != file->f_pos) { 1800 if (offset >= 0 && offset != file->f_pos) {
1801 file->f_pos = offset; 1801 file->f_pos = offset;
1802 file->f_version = 0; 1802 file->f_version = 0;
1803 } 1803 }
1804 mutex_unlock(&inode->i_mutex); 1804 mutex_unlock(&inode->i_mutex);
1805 return offset; 1805 return offset;
1806 } 1806 }
1807 1807
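A minimal sketch of the llseek behaviour above, assuming a tmpfs mount at /dev/shm: SEEK_DATA/SEEK_HOLE results land on page boundaries because shmem_seek_hole_data() scans whole radix-tree pages.

#define _GNU_SOURCE            /* SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/seek-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0) { perror("open"); return 1; }

	/* Sparse file: 1MiB long, one byte of data at 64KiB. */
	if (ftruncate(fd, 1 << 20) < 0) { perror("ftruncate"); return 1; }
	if (pwrite(fd, "x", 1, 64 * 1024) != 1) { perror("pwrite"); return 1; }

	off_t data = lseek(fd, 0, SEEK_DATA);    /* -> 64KiB, page-aligned */
	off_t hole = lseek(fd, data, SEEK_HOLE); /* -> end of that data page */
	printf("data at %lld, hole at %lld\n", (long long)data, (long long)hole);

	close(fd);
	return 0;
}

Seeking SEEK_DATA at or past i_size fails with ENXIO, matching the error path in shmem_file_llseek().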
1808 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1808 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1809 loff_t len) 1809 loff_t len)
1810 { 1810 {
1811 struct inode *inode = file->f_path.dentry->d_inode; 1811 struct inode *inode = file->f_path.dentry->d_inode;
1812 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1812 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1813 struct shmem_falloc shmem_falloc; 1813 struct shmem_falloc shmem_falloc;
1814 pgoff_t start, index, end; 1814 pgoff_t start, index, end;
1815 int error; 1815 int error;
1816 1816
1817 mutex_lock(&inode->i_mutex); 1817 mutex_lock(&inode->i_mutex);
1818 1818
1819 if (mode & FALLOC_FL_PUNCH_HOLE) { 1819 if (mode & FALLOC_FL_PUNCH_HOLE) {
1820 struct address_space *mapping = file->f_mapping; 1820 struct address_space *mapping = file->f_mapping;
1821 loff_t unmap_start = round_up(offset, PAGE_SIZE); 1821 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1822 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 1822 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1823 1823
1824 if ((u64)unmap_end > (u64)unmap_start) 1824 if ((u64)unmap_end > (u64)unmap_start)
1825 unmap_mapping_range(mapping, unmap_start, 1825 unmap_mapping_range(mapping, unmap_start,
1826 1 + unmap_end - unmap_start, 0); 1826 1 + unmap_end - unmap_start, 0);
1827 shmem_truncate_range(inode, offset, offset + len - 1); 1827 shmem_truncate_range(inode, offset, offset + len - 1);
1828 /* No need to unmap again: hole-punching leaves COWed pages */ 1828 /* No need to unmap again: hole-punching leaves COWed pages */
1829 error = 0; 1829 error = 0;
1830 goto out; 1830 goto out;
1831 } 1831 }
1832 1832
1833 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 1833 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1834 error = inode_newsize_ok(inode, offset + len); 1834 error = inode_newsize_ok(inode, offset + len);
1835 if (error) 1835 if (error)
1836 goto out; 1836 goto out;
1837 1837
1838 start = offset >> PAGE_CACHE_SHIFT; 1838 start = offset >> PAGE_CACHE_SHIFT;
1839 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1839 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1840 /* Try to avoid a swapstorm if len is impossible to satisfy */ 1840 /* Try to avoid a swapstorm if len is impossible to satisfy */
1841 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 1841 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1842 error = -ENOSPC; 1842 error = -ENOSPC;
1843 goto out; 1843 goto out;
1844 } 1844 }
1845 1845
1846 shmem_falloc.start = start; 1846 shmem_falloc.start = start;
1847 shmem_falloc.next = start; 1847 shmem_falloc.next = start;
1848 shmem_falloc.nr_falloced = 0; 1848 shmem_falloc.nr_falloced = 0;
1849 shmem_falloc.nr_unswapped = 0; 1849 shmem_falloc.nr_unswapped = 0;
1850 spin_lock(&inode->i_lock); 1850 spin_lock(&inode->i_lock);
1851 inode->i_private = &shmem_falloc; 1851 inode->i_private = &shmem_falloc;
1852 spin_unlock(&inode->i_lock); 1852 spin_unlock(&inode->i_lock);
1853 1853
1854 for (index = start; index < end; index++) { 1854 for (index = start; index < end; index++) {
1855 struct page *page; 1855 struct page *page;
1856 1856
1857 /* 1857 /*
1858 * Good, the fallocate(2) manpage permits EINTR: we may have 1858 * Good, the fallocate(2) manpage permits EINTR: we may have
1859 * been interrupted because we are using up too much memory. 1859 * been interrupted because we are using up too much memory.
1860 */ 1860 */
1861 if (signal_pending(current)) 1861 if (signal_pending(current))
1862 error = -EINTR; 1862 error = -EINTR;
1863 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 1863 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
1864 error = -ENOMEM; 1864 error = -ENOMEM;
1865 else 1865 else
1866 error = shmem_getpage(inode, index, &page, SGP_FALLOC, 1866 error = shmem_getpage(inode, index, &page, SGP_FALLOC,
1867 NULL); 1867 NULL);
1868 if (error) { 1868 if (error) {
1869 /* Remove the !PageUptodate pages we added */ 1869 /* Remove the !PageUptodate pages we added */
1870 shmem_undo_range(inode, 1870 shmem_undo_range(inode,
1871 (loff_t)start << PAGE_CACHE_SHIFT, 1871 (loff_t)start << PAGE_CACHE_SHIFT,
1872 (loff_t)index << PAGE_CACHE_SHIFT, true); 1872 (loff_t)index << PAGE_CACHE_SHIFT, true);
1873 goto undone; 1873 goto undone;
1874 } 1874 }
1875 1875
1876 /* 1876 /*
1877 * Inform shmem_writepage() how far we have reached. 1877 * Inform shmem_writepage() how far we have reached.
1878 * No need for lock or barrier: we have the page lock. 1878 * No need for lock or barrier: we have the page lock.
1879 */ 1879 */
1880 shmem_falloc.next++; 1880 shmem_falloc.next++;
1881 if (!PageUptodate(page)) 1881 if (!PageUptodate(page))
1882 shmem_falloc.nr_falloced++; 1882 shmem_falloc.nr_falloced++;
1883 1883
1884 /* 1884 /*
1885 * If !PageUptodate, leave it that way so that freeable pages 1885 * If !PageUptodate, leave it that way so that freeable pages
1886 * can be recognized if we need to rollback on error later. 1886 * can be recognized if we need to rollback on error later.
1887 * But set_page_dirty so that memory pressure will swap rather 1887 * But set_page_dirty so that memory pressure will swap rather
1888 * than free the pages we are allocating (and SGP_CACHE pages 1888 * than free the pages we are allocating (and SGP_CACHE pages
1889 * might still be clean: we now need to mark those dirty too). 1889 * might still be clean: we now need to mark those dirty too).
1890 */ 1890 */
1891 set_page_dirty(page); 1891 set_page_dirty(page);
1892 unlock_page(page); 1892 unlock_page(page);
1893 page_cache_release(page); 1893 page_cache_release(page);
1894 cond_resched(); 1894 cond_resched();
1895 } 1895 }
1896 1896
1897 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 1897 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1898 i_size_write(inode, offset + len); 1898 i_size_write(inode, offset + len);
1899 inode->i_ctime = CURRENT_TIME; 1899 inode->i_ctime = CURRENT_TIME;
1900 undone: 1900 undone:
1901 spin_lock(&inode->i_lock); 1901 spin_lock(&inode->i_lock);
1902 inode->i_private = NULL; 1902 inode->i_private = NULL;
1903 spin_unlock(&inode->i_lock); 1903 spin_unlock(&inode->i_lock);
1904 out: 1904 out:
1905 mutex_unlock(&inode->i_mutex); 1905 mutex_unlock(&inode->i_mutex);
1906 return error; 1906 return error;
1907 } 1907 }
1908 1908
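How the two fallocate branches above look from userspace; a sketch with the path and sizes invented for illustration. Note that the VFS requires FALLOC_FL_KEEP_SIZE alongside FALLOC_FL_PUNCH_HOLE before this handler ever runs.

#define _GNU_SOURCE            /* fallocate() */
#include <fcntl.h>
#include <linux/falloc.h>      /* FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0) { perror("open"); return 1; }

	/* Preallocation: walks the SGP_FALLOC loop above page by page. */
	if (fallocate(fd, 0, 0, 1 << 20) < 0)
		perror("fallocate");

	/* Hole punch: takes the FALLOC_FL_PUNCH_HOLE branch, unmapping
	 * and then truncating the 512KiB range starting at 256KiB. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      256 * 1024, 512 * 1024) < 0)
		perror("punch hole");

	close(fd);
	return 0;
}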
1909 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1909 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1910 { 1910 {
1911 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1911 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1912 1912
1913 buf->f_type = TMPFS_MAGIC; 1913 buf->f_type = TMPFS_MAGIC;
1914 buf->f_bsize = PAGE_CACHE_SIZE; 1914 buf->f_bsize = PAGE_CACHE_SIZE;
1915 buf->f_namelen = NAME_MAX; 1915 buf->f_namelen = NAME_MAX;
1916 if (sbinfo->max_blocks) { 1916 if (sbinfo->max_blocks) {
1917 buf->f_blocks = sbinfo->max_blocks; 1917 buf->f_blocks = sbinfo->max_blocks;
1918 buf->f_bavail = 1918 buf->f_bavail =
1919 buf->f_bfree = sbinfo->max_blocks - 1919 buf->f_bfree = sbinfo->max_blocks -
1920 percpu_counter_sum(&sbinfo->used_blocks); 1920 percpu_counter_sum(&sbinfo->used_blocks);
1921 } 1921 }
1922 if (sbinfo->max_inodes) { 1922 if (sbinfo->max_inodes) {
1923 buf->f_files = sbinfo->max_inodes; 1923 buf->f_files = sbinfo->max_inodes;
1924 buf->f_ffree = sbinfo->free_inodes; 1924 buf->f_ffree = sbinfo->free_inodes;
1925 } 1925 }
1926 /* else leave those fields 0 like simple_statfs */ 1926 /* else leave those fields 0 like simple_statfs */
1927 return 0; 1927 return 0;
1928 } 1928 }
1929 1929
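The fields shmem_statfs() fills in surface through statvfs(3); a small sketch, assuming /dev/shm is a size-limited tmpfs (unlimited mounts leave the block and inode fields 0, like simple_statfs).

#include <stdio.h>
#include <sys/statvfs.h>

int main(void)
{
	struct statvfs sv;

	if (statvfs("/dev/shm", &sv) < 0) {
		perror("statvfs");
		return 1;
	}
	/* f_blocks mirrors max_blocks; f_bfree subtracts the summed
	 * used_blocks percpu counter, as in shmem_statfs() above. */
	printf("bsize=%lu blocks=%llu bfree=%llu files=%llu ffree=%llu\n",
	       sv.f_bsize,
	       (unsigned long long)sv.f_blocks,
	       (unsigned long long)sv.f_bfree,
	       (unsigned long long)sv.f_files,
	       (unsigned long long)sv.f_ffree);
	return 0;
}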
1930 /* 1930 /*
1931 * File creation. Allocate an inode, and we're done.. 1931 * File creation. Allocate an inode, and we're done..
1932 */ 1932 */
1933 static int 1933 static int
1934 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 1934 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1935 { 1935 {
1936 struct inode *inode; 1936 struct inode *inode;
1937 int error = -ENOSPC; 1937 int error = -ENOSPC;
1938 1938
1939 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1939 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1940 if (inode) { 1940 if (inode) {
1941 error = security_inode_init_security(inode, dir, 1941 error = security_inode_init_security(inode, dir,
1942 &dentry->d_name, 1942 &dentry->d_name,
1943 shmem_initxattrs, NULL); 1943 shmem_initxattrs, NULL);
1944 if (error) { 1944 if (error) {
1945 if (error != -EOPNOTSUPP) { 1945 if (error != -EOPNOTSUPP) {
1946 iput(inode); 1946 iput(inode);
1947 return error; 1947 return error;
1948 } 1948 }
1949 } 1949 }
1950 #ifdef CONFIG_TMPFS_POSIX_ACL 1950 #ifdef CONFIG_TMPFS_POSIX_ACL
1951 error = generic_acl_init(inode, dir); 1951 error = generic_acl_init(inode, dir);
1952 if (error) { 1952 if (error) {
1953 iput(inode); 1953 iput(inode);
1954 return error; 1954 return error;
1955 } 1955 }
1956 #else 1956 #else
1957 error = 0; 1957 error = 0;
1958 #endif 1958 #endif
1959 dir->i_size += BOGO_DIRENT_SIZE; 1959 dir->i_size += BOGO_DIRENT_SIZE;
1960 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1960 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1961 d_instantiate(dentry, inode); 1961 d_instantiate(dentry, inode);
1962 dget(dentry); /* Extra count - pin the dentry in core */ 1962 dget(dentry); /* Extra count - pin the dentry in core */
1963 } 1963 }
1964 return error; 1964 return error;
1965 } 1965 }
1966 1966
1967 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 1967 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1968 { 1968 {
1969 int error; 1969 int error;
1970 1970
1971 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 1971 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1972 return error; 1972 return error;
1973 inc_nlink(dir); 1973 inc_nlink(dir);
1974 return 0; 1974 return 0;
1975 } 1975 }
1976 1976
1977 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 1977 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1978 bool excl) 1978 bool excl)
1979 { 1979 {
1980 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 1980 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1981 } 1981 }
1982 1982
1983 /* 1983 /*
1984 * Link a file.. 1984 * Link a file..
1985 */ 1985 */
1986 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 1986 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1987 { 1987 {
1988 struct inode *inode = old_dentry->d_inode; 1988 struct inode *inode = old_dentry->d_inode;
1989 int ret; 1989 int ret;
1990 1990
1991 /* 1991 /*
1992 * No ordinary (disk based) filesystem counts links as inodes; 1992 * No ordinary (disk based) filesystem counts links as inodes;
1993 * but each new link needs a new dentry, pinning lowmem, and 1993 * but each new link needs a new dentry, pinning lowmem, and
1994 * tmpfs dentries cannot be pruned until they are unlinked. 1994 * tmpfs dentries cannot be pruned until they are unlinked.
1995 */ 1995 */
1996 ret = shmem_reserve_inode(inode->i_sb); 1996 ret = shmem_reserve_inode(inode->i_sb);
1997 if (ret) 1997 if (ret)
1998 goto out; 1998 goto out;
1999 1999
2000 dir->i_size += BOGO_DIRENT_SIZE; 2000 dir->i_size += BOGO_DIRENT_SIZE;
2001 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2001 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2002 inc_nlink(inode); 2002 inc_nlink(inode);
2003 ihold(inode); /* New dentry reference */ 2003 ihold(inode); /* New dentry reference */
2004 dget(dentry); /* Extra pinning count for the created dentry */ 2004 dget(dentry); /* Extra pinning count for the created dentry */
2005 d_instantiate(dentry, inode); 2005 d_instantiate(dentry, inode);
2006 out: 2006 out:
2007 return ret; 2007 return ret;
2008 } 2008 }
2009 2009
2010 static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2010 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2011 { 2011 {
2012 struct inode *inode = dentry->d_inode; 2012 struct inode *inode = dentry->d_inode;
2013 2013
2014 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2014 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2015 shmem_free_inode(inode->i_sb); 2015 shmem_free_inode(inode->i_sb);
2016 2016
2017 dir->i_size -= BOGO_DIRENT_SIZE; 2017 dir->i_size -= BOGO_DIRENT_SIZE;
2018 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2018 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2019 drop_nlink(inode); 2019 drop_nlink(inode);
2020 dput(dentry); /* Undo the count from "create" - this does all the work */ 2020 dput(dentry); /* Undo the count from "create" - this does all the work */
2021 return 0; 2021 return 0;
2022 } 2022 }
2023 2023
2024 static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 2024 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2025 { 2025 {
2026 if (!simple_empty(dentry)) 2026 if (!simple_empty(dentry))
2027 return -ENOTEMPTY; 2027 return -ENOTEMPTY;
2028 2028
2029 drop_nlink(dentry->d_inode); 2029 drop_nlink(dentry->d_inode);
2030 drop_nlink(dir); 2030 drop_nlink(dir);
2031 return shmem_unlink(dir, dentry); 2031 return shmem_unlink(dir, dentry);
2032 } 2032 }
2033 2033
2034 /* 2034 /*
2035 * The VFS layer already does all the dentry stuff for rename, 2035 * The VFS layer already does all the dentry stuff for rename,
2036 * we just have to decrement the usage count for the target if 2036 * we just have to decrement the usage count for the target if
2037 * it exists so that the VFS layer correctly frees it when it 2037 * it exists so that the VFS layer correctly frees it when it
2038 * gets overwritten. 2038 * gets overwritten.
2039 */ 2039 */
2040 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2040 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2041 { 2041 {
2042 struct inode *inode = old_dentry->d_inode; 2042 struct inode *inode = old_dentry->d_inode;
2043 int they_are_dirs = S_ISDIR(inode->i_mode); 2043 int they_are_dirs = S_ISDIR(inode->i_mode);
2044 2044
2045 if (!simple_empty(new_dentry)) 2045 if (!simple_empty(new_dentry))
2046 return -ENOTEMPTY; 2046 return -ENOTEMPTY;
2047 2047
2048 if (new_dentry->d_inode) { 2048 if (new_dentry->d_inode) {
2049 (void) shmem_unlink(new_dir, new_dentry); 2049 (void) shmem_unlink(new_dir, new_dentry);
2050 if (they_are_dirs) 2050 if (they_are_dirs)
2051 drop_nlink(old_dir); 2051 drop_nlink(old_dir);
2052 } else if (they_are_dirs) { 2052 } else if (they_are_dirs) {
2053 drop_nlink(old_dir); 2053 drop_nlink(old_dir);
2054 inc_nlink(new_dir); 2054 inc_nlink(new_dir);
2055 } 2055 }
2056 2056
2057 old_dir->i_size -= BOGO_DIRENT_SIZE; 2057 old_dir->i_size -= BOGO_DIRENT_SIZE;
2058 new_dir->i_size += BOGO_DIRENT_SIZE; 2058 new_dir->i_size += BOGO_DIRENT_SIZE;
2059 old_dir->i_ctime = old_dir->i_mtime = 2059 old_dir->i_ctime = old_dir->i_mtime =
2060 new_dir->i_ctime = new_dir->i_mtime = 2060 new_dir->i_ctime = new_dir->i_mtime =
2061 inode->i_ctime = CURRENT_TIME; 2061 inode->i_ctime = CURRENT_TIME;
2062 return 0; 2062 return 0;
2063 } 2063 }
2064 2064
2065 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 2065 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
2066 { 2066 {
2067 int error; 2067 int error;
2068 int len; 2068 int len;
2069 struct inode *inode; 2069 struct inode *inode;
2070 struct page *page; 2070 struct page *page;
2071 char *kaddr; 2071 char *kaddr;
2072 struct shmem_inode_info *info; 2072 struct shmem_inode_info *info;
2073 2073
2074 len = strlen(symname) + 1; 2074 len = strlen(symname) + 1;
2075 if (len > PAGE_CACHE_SIZE) 2075 if (len > PAGE_CACHE_SIZE)
2076 return -ENAMETOOLONG; 2076 return -ENAMETOOLONG;
2077 2077
2078 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 2078 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
2079 if (!inode) 2079 if (!inode)
2080 return -ENOSPC; 2080 return -ENOSPC;
2081 2081
2082 error = security_inode_init_security(inode, dir, &dentry->d_name, 2082 error = security_inode_init_security(inode, dir, &dentry->d_name,
2083 shmem_initxattrs, NULL); 2083 shmem_initxattrs, NULL);
2084 if (error) { 2084 if (error) {
2085 if (error != -EOPNOTSUPP) { 2085 if (error != -EOPNOTSUPP) {
2086 iput(inode); 2086 iput(inode);
2087 return error; 2087 return error;
2088 } 2088 }
2089 error = 0; 2089 error = 0;
2090 } 2090 }
2091 2091
2092 info = SHMEM_I(inode); 2092 info = SHMEM_I(inode);
2093 inode->i_size = len-1; 2093 inode->i_size = len-1;
2094 if (len <= SHORT_SYMLINK_LEN) { 2094 if (len <= SHORT_SYMLINK_LEN) {
2095 info->symlink = kmemdup(symname, len, GFP_KERNEL); 2095 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2096 if (!info->symlink) { 2096 if (!info->symlink) {
2097 iput(inode); 2097 iput(inode);
2098 return -ENOMEM; 2098 return -ENOMEM;
2099 } 2099 }
2100 inode->i_op = &shmem_short_symlink_operations; 2100 inode->i_op = &shmem_short_symlink_operations;
2101 } else { 2101 } else {
2102 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2102 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2103 if (error) { 2103 if (error) {
2104 iput(inode); 2104 iput(inode);
2105 return error; 2105 return error;
2106 } 2106 }
2107 inode->i_mapping->a_ops = &shmem_aops; 2107 inode->i_mapping->a_ops = &shmem_aops;
2108 inode->i_op = &shmem_symlink_inode_operations; 2108 inode->i_op = &shmem_symlink_inode_operations;
2109 kaddr = kmap_atomic(page); 2109 kaddr = kmap_atomic(page);
2110 memcpy(kaddr, symname, len); 2110 memcpy(kaddr, symname, len);
2111 kunmap_atomic(kaddr); 2111 kunmap_atomic(kaddr);
2112 SetPageUptodate(page); 2112 SetPageUptodate(page);
2113 set_page_dirty(page); 2113 set_page_dirty(page);
2114 unlock_page(page); 2114 unlock_page(page);
2115 page_cache_release(page); 2115 page_cache_release(page);
2116 } 2116 }
2117 dir->i_size += BOGO_DIRENT_SIZE; 2117 dir->i_size += BOGO_DIRENT_SIZE;
2118 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2118 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2119 d_instantiate(dentry, inode); 2119 d_instantiate(dentry, inode);
2120 dget(dentry); 2120 dget(dentry);
2121 return 0; 2121 return 0;
2122 } 2122 }
2123 2123
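From userspace the two symlink representations are indistinguishable; a sketch (paths assumed) whose target is well under SHORT_SYMLINK_LEN, so it lands in the kmemdup'ed short-symlink case above rather than the page-backed one:

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unlink("/dev/shm/link-demo");   /* ignore errors: may not exist */
	if (symlink("target-file", "/dev/shm/link-demo") < 0) {
		perror("symlink");
		return 1;
	}
	char buf[PATH_MAX];
	ssize_t n = readlink("/dev/shm/link-demo", buf, sizeof(buf) - 1);
	if (n < 0) { perror("readlink"); return 1; }
	buf[n] = '\0';
	printf("link-demo -> %s\n", buf);
	return 0;
}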
2124 static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) 2124 static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2125 { 2125 {
2126 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); 2126 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2127 return NULL; 2127 return NULL;
2128 } 2128 }
2129 2129
2130 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 2130 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2131 { 2131 {
2132 struct page *page = NULL; 2132 struct page *page = NULL;
2133 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 2133 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2134 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); 2134 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2135 if (page) 2135 if (page)
2136 unlock_page(page); 2136 unlock_page(page);
2137 return page; 2137 return page;
2138 } 2138 }
2139 2139
2140 static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 2140 static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2141 { 2141 {
2142 if (!IS_ERR(nd_get_link(nd))) { 2142 if (!IS_ERR(nd_get_link(nd))) {
2143 struct page *page = cookie; 2143 struct page *page = cookie;
2144 kunmap(page); 2144 kunmap(page);
2145 mark_page_accessed(page); 2145 mark_page_accessed(page);
2146 page_cache_release(page); 2146 page_cache_release(page);
2147 } 2147 }
2148 } 2148 }
2149 2149
2150 #ifdef CONFIG_TMPFS_XATTR 2150 #ifdef CONFIG_TMPFS_XATTR
2151 /* 2151 /*
2152 * Superblocks without xattr inode operations may get some security.* xattr 2152 * Superblocks without xattr inode operations may get some security.* xattr
2153 * support from the LSM "for free". As soon as we have any other xattrs 2153 * support from the LSM "for free". As soon as we have any other xattrs
2154 * like ACLs, we also need to implement the security.* handlers at 2154 * like ACLs, we also need to implement the security.* handlers at
2155 * filesystem level, though. 2155 * filesystem level, though.
2156 */ 2156 */
2157 2157
2158 /* 2158 /*
2159 * Callback for security_inode_init_security() for acquiring xattrs. 2159 * Callback for security_inode_init_security() for acquiring xattrs.
2160 */ 2160 */
2161 static int shmem_initxattrs(struct inode *inode, 2161 static int shmem_initxattrs(struct inode *inode,
2162 const struct xattr *xattr_array, 2162 const struct xattr *xattr_array,
2163 void *fs_info) 2163 void *fs_info)
2164 { 2164 {
2165 struct shmem_inode_info *info = SHMEM_I(inode); 2165 struct shmem_inode_info *info = SHMEM_I(inode);
2166 const struct xattr *xattr; 2166 const struct xattr *xattr;
2167 struct simple_xattr *new_xattr; 2167 struct simple_xattr *new_xattr;
2168 size_t len; 2168 size_t len;
2169 2169
2170 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 2170 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
2171 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 2171 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
2172 if (!new_xattr) 2172 if (!new_xattr)
2173 return -ENOMEM; 2173 return -ENOMEM;
2174 2174
2175 len = strlen(xattr->name) + 1; 2175 len = strlen(xattr->name) + 1;
2176 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 2176 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
2177 GFP_KERNEL); 2177 GFP_KERNEL);
2178 if (!new_xattr->name) { 2178 if (!new_xattr->name) {
2179 kfree(new_xattr); 2179 kfree(new_xattr);
2180 return -ENOMEM; 2180 return -ENOMEM;
2181 } 2181 }
2182 2182
2183 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 2183 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
2184 XATTR_SECURITY_PREFIX_LEN); 2184 XATTR_SECURITY_PREFIX_LEN);
2185 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 2185 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
2186 xattr->name, len); 2186 xattr->name, len);
2187 2187
2188 simple_xattr_list_add(&info->xattrs, new_xattr); 2188 simple_xattr_list_add(&info->xattrs, new_xattr);
2189 } 2189 }
2190 2190
2191 return 0; 2191 return 0;
2192 } 2192 }
2193 2193
2194 static const struct xattr_handler *shmem_xattr_handlers[] = { 2194 static const struct xattr_handler *shmem_xattr_handlers[] = {
2195 #ifdef CONFIG_TMPFS_POSIX_ACL 2195 #ifdef CONFIG_TMPFS_POSIX_ACL
2196 &generic_acl_access_handler, 2196 &generic_acl_access_handler,
2197 &generic_acl_default_handler, 2197 &generic_acl_default_handler,
2198 #endif 2198 #endif
2199 NULL 2199 NULL
2200 }; 2200 };
2201 2201
2202 static int shmem_xattr_validate(const char *name) 2202 static int shmem_xattr_validate(const char *name)
2203 { 2203 {
2204 struct { const char *prefix; size_t len; } arr[] = { 2204 struct { const char *prefix; size_t len; } arr[] = {
2205 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, 2205 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2206 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } 2206 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2207 }; 2207 };
2208 int i; 2208 int i;
2209 2209
2210 for (i = 0; i < ARRAY_SIZE(arr); i++) { 2210 for (i = 0; i < ARRAY_SIZE(arr); i++) {
2211 size_t preflen = arr[i].len; 2211 size_t preflen = arr[i].len;
2212 if (strncmp(name, arr[i].prefix, preflen) == 0) { 2212 if (strncmp(name, arr[i].prefix, preflen) == 0) {
2213 if (!name[preflen]) 2213 if (!name[preflen])
2214 return -EINVAL; 2214 return -EINVAL;
2215 return 0; 2215 return 0;
2216 } 2216 }
2217 } 2217 }
2218 return -EOPNOTSUPP; 2218 return -EOPNOTSUPP;
2219 } 2219 }
2220 2220
2221 static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2221 static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2222 void *buffer, size_t size) 2222 void *buffer, size_t size)
2223 { 2223 {
2224 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2224 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2225 int err; 2225 int err;
2226 2226
2227 /* 2227 /*
2228 * If this is a request for a synthetic attribute in the system.* 2228 * If this is a request for a synthetic attribute in the system.*
2229 * namespace use the generic infrastructure to resolve a handler 2229 * namespace use the generic infrastructure to resolve a handler
2230 * for it via sb->s_xattr. 2230 * for it via sb->s_xattr.
2231 */ 2231 */
2232 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2232 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2233 return generic_getxattr(dentry, name, buffer, size); 2233 return generic_getxattr(dentry, name, buffer, size);
2234 2234
2235 err = shmem_xattr_validate(name); 2235 err = shmem_xattr_validate(name);
2236 if (err) 2236 if (err)
2237 return err; 2237 return err;
2238 2238
2239 return simple_xattr_get(&info->xattrs, name, buffer, size); 2239 return simple_xattr_get(&info->xattrs, name, buffer, size);
2240 } 2240 }
2241 2241
2242 static int shmem_setxattr(struct dentry *dentry, const char *name, 2242 static int shmem_setxattr(struct dentry *dentry, const char *name,
2243 const void *value, size_t size, int flags) 2243 const void *value, size_t size, int flags)
2244 { 2244 {
2245 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2245 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2246 int err; 2246 int err;
2247 2247
2248 /* 2248 /*
2249 * If this is a request for a synthetic attribute in the system.* 2249 * If this is a request for a synthetic attribute in the system.*
2250 * namespace use the generic infrastructure to resolve a handler 2250 * namespace use the generic infrastructure to resolve a handler
2251 * for it via sb->s_xattr. 2251 * for it via sb->s_xattr.
2252 */ 2252 */
2253 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2253 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2254 return generic_setxattr(dentry, name, value, size, flags); 2254 return generic_setxattr(dentry, name, value, size, flags);
2255 2255
2256 err = shmem_xattr_validate(name); 2256 err = shmem_xattr_validate(name);
2257 if (err) 2257 if (err)
2258 return err; 2258 return err;
2259 2259
2260 return simple_xattr_set(&info->xattrs, name, value, size, flags); 2260 return simple_xattr_set(&info->xattrs, name, value, size, flags);
2261 } 2261 }
2262 2262
2263 static int shmem_removexattr(struct dentry *dentry, const char *name) 2263 static int shmem_removexattr(struct dentry *dentry, const char *name)
2264 { 2264 {
2265 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2265 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2266 int err; 2266 int err;
2267 2267
2268 /* 2268 /*
2269 * If this is a request for a synthetic attribute in the system.* 2269 * If this is a request for a synthetic attribute in the system.*
2270 * namespace use the generic infrastructure to resolve a handler 2270 * namespace use the generic infrastructure to resolve a handler
2271 * for it via sb->s_xattr. 2271 * for it via sb->s_xattr.
2272 */ 2272 */
2273 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2273 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2274 return generic_removexattr(dentry, name); 2274 return generic_removexattr(dentry, name);
2275 2275
2276 err = shmem_xattr_validate(name); 2276 err = shmem_xattr_validate(name);
2277 if (err) 2277 if (err)
2278 return err; 2278 return err;
2279 2279
2280 return simple_xattr_remove(&info->xattrs, name); 2280 return simple_xattr_remove(&info->xattrs, name);
2281 } 2281 }
2282 2282
2283 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2283 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2284 { 2284 {
2285 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2285 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2286 return simple_xattr_list(&info->xattrs, buffer, size); 2286 return simple_xattr_list(&info->xattrs, buffer, size);
2287 } 2287 }
2288 #endif /* CONFIG_TMPFS_XATTR */ 2288 #endif /* CONFIG_TMPFS_XATTR */
2289 2289
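A sketch of the prefix routing enforced by shmem_xattr_validate(): user.* has no handler here and is refused, while trusted.* validates but needs CAP_SYS_ADMIN. The path is an assumption; the errno values are what the code above suggests, not guarantees.

#include <fcntl.h>
#include <stdio.h>
#include <sys/xattr.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/dev/shm/xattr-demo";
	int fd = open(path, O_RDWR | O_CREAT, 0600);
	if (fd < 0) { perror("open"); return 1; }
	close(fd);

	/* Falls through shmem_xattr_validate() to -EOPNOTSUPP. */
	if (setxattr(path, "user.demo", "1", 1, 0) < 0)
		perror("user.demo");

	/* Matches XATTR_TRUSTED_PREFIX; stored via simple_xattr_set(). */
	if (setxattr(path, "trusted.demo", "1", 1, 0) < 0)
		perror("trusted.demo");      /* EPERM without CAP_SYS_ADMIN */

	char buf[64];
	ssize_t n = getxattr(path, "trusted.demo", buf, sizeof(buf));
	if (n >= 0)
		printf("trusted.demo = %.*s\n", (int)n, buf);
	return 0;
}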
2290 static const struct inode_operations shmem_short_symlink_operations = { 2290 static const struct inode_operations shmem_short_symlink_operations = {
2291 .readlink = generic_readlink, 2291 .readlink = generic_readlink,
2292 .follow_link = shmem_follow_short_symlink, 2292 .follow_link = shmem_follow_short_symlink,
2293 #ifdef CONFIG_TMPFS_XATTR 2293 #ifdef CONFIG_TMPFS_XATTR
2294 .setxattr = shmem_setxattr, 2294 .setxattr = shmem_setxattr,
2295 .getxattr = shmem_getxattr, 2295 .getxattr = shmem_getxattr,
2296 .listxattr = shmem_listxattr, 2296 .listxattr = shmem_listxattr,
2297 .removexattr = shmem_removexattr, 2297 .removexattr = shmem_removexattr,
2298 #endif 2298 #endif
2299 }; 2299 };
2300 2300
2301 static const struct inode_operations shmem_symlink_inode_operations = { 2301 static const struct inode_operations shmem_symlink_inode_operations = {
2302 .readlink = generic_readlink, 2302 .readlink = generic_readlink,
2303 .follow_link = shmem_follow_link, 2303 .follow_link = shmem_follow_link,
2304 .put_link = shmem_put_link, 2304 .put_link = shmem_put_link,
2305 #ifdef CONFIG_TMPFS_XATTR 2305 #ifdef CONFIG_TMPFS_XATTR
2306 .setxattr = shmem_setxattr, 2306 .setxattr = shmem_setxattr,
2307 .getxattr = shmem_getxattr, 2307 .getxattr = shmem_getxattr,
2308 .listxattr = shmem_listxattr, 2308 .listxattr = shmem_listxattr,
2309 .removexattr = shmem_removexattr, 2309 .removexattr = shmem_removexattr,
2310 #endif 2310 #endif
2311 }; 2311 };
2312 2312
2313 static struct dentry *shmem_get_parent(struct dentry *child) 2313 static struct dentry *shmem_get_parent(struct dentry *child)
2314 { 2314 {
2315 return ERR_PTR(-ESTALE); 2315 return ERR_PTR(-ESTALE);
2316 } 2316 }
2317 2317
2318 static int shmem_match(struct inode *ino, void *vfh) 2318 static int shmem_match(struct inode *ino, void *vfh)
2319 { 2319 {
2320 __u32 *fh = vfh; 2320 __u32 *fh = vfh;
2321 __u64 inum = fh[2]; 2321 __u64 inum = fh[2];
2322 inum = (inum << 32) | fh[1]; 2322 inum = (inum << 32) | fh[1];
2323 return ino->i_ino == inum && fh[0] == ino->i_generation; 2323 return ino->i_ino == inum && fh[0] == ino->i_generation;
2324 } 2324 }
2325 2325
2326 static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 2326 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2327 struct fid *fid, int fh_len, int fh_type) 2327 struct fid *fid, int fh_len, int fh_type)
2328 { 2328 {
2329 struct inode *inode; 2329 struct inode *inode;
2330 struct dentry *dentry = NULL; 2330 struct dentry *dentry = NULL;
2331 u64 inum; 2331 u64 inum;
2332 2332
2333 if (fh_len < 3) 2333 if (fh_len < 3)
2334 return NULL; 2334 return NULL;
2335 2335
2336 inum = fid->raw[2]; 2336 inum = fid->raw[2];
2337 inum = (inum << 32) | fid->raw[1]; 2337 inum = (inum << 32) | fid->raw[1];
2338 2338
2339 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2339 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2340 shmem_match, fid->raw); 2340 shmem_match, fid->raw);
2341 if (inode) { 2341 if (inode) {
2342 dentry = d_find_alias(inode); 2342 dentry = d_find_alias(inode);
2343 iput(inode); 2343 iput(inode);
2344 } 2344 }
2345 2345
2346 return dentry; 2346 return dentry;
2347 } 2347 }
2348 2348
2349 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 2349 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2350 struct inode *parent) 2350 struct inode *parent)
2351 { 2351 {
2352 if (*len < 3) { 2352 if (*len < 3) {
2353 *len = 3; 2353 *len = 3;
2354 return 255; 2354 return 255;
2355 } 2355 }
2356 2356
2357 if (inode_unhashed(inode)) { 2357 if (inode_unhashed(inode)) {
2358 /* Unfortunately insert_inode_hash is not idempotent, 2358 /* Unfortunately insert_inode_hash is not idempotent,
2359 * so as we hash inodes here rather than at creation 2359 * so as we hash inodes here rather than at creation
2360 * time, we need a lock to ensure we only try 2360 * time, we need a lock to ensure we only try
2361 * to do it once 2361 * to do it once
2362 */ 2362 */
2363 static DEFINE_SPINLOCK(lock); 2363 static DEFINE_SPINLOCK(lock);
2364 spin_lock(&lock); 2364 spin_lock(&lock);
2365 if (inode_unhashed(inode)) 2365 if (inode_unhashed(inode))
2366 __insert_inode_hash(inode, 2366 __insert_inode_hash(inode,
2367 inode->i_ino + inode->i_generation); 2367 inode->i_ino + inode->i_generation);
2368 spin_unlock(&lock); 2368 spin_unlock(&lock);
2369 } 2369 }
2370 2370
2371 fh[0] = inode->i_generation; 2371 fh[0] = inode->i_generation;
2372 fh[1] = inode->i_ino; 2372 fh[1] = inode->i_ino;
2373 fh[2] = ((__u64)inode->i_ino) >> 32; 2373 fh[2] = ((__u64)inode->i_ino) >> 32;
2374 2374
2375 *len = 3; 2375 *len = 3;
2376 return 1; 2376 return 1;
2377 } 2377 }
2378 2378
2379 static const struct export_operations shmem_export_ops = { 2379 static const struct export_operations shmem_export_ops = {
2380 .get_parent = shmem_get_parent, 2380 .get_parent = shmem_get_parent,
2381 .encode_fh = shmem_encode_fh, 2381 .encode_fh = shmem_encode_fh,
2382 .fh_to_dentry = shmem_fh_to_dentry, 2382 .fh_to_dentry = shmem_fh_to_dentry,
2383 }; 2383 };
2384 2384
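These export ops also back the open-by-handle syscalls; a sketch assuming a tmpfs at /dev/shm. shmem_encode_fh() emits three 32-bit words (generation plus a 64-bit inode number), reopening needs CAP_DAC_READ_SEARCH, and a cold inode cache can still yield ESTALE since shmem_fh_to_dentry() only does an ilookup5().

#define _GNU_SOURCE      /* name_to_handle_at(), open_by_handle_at() */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/dev/shm/handle-demo";
	int fd = open(path, O_RDWR | O_CREAT, 0600);
	if (fd < 0) { perror("open"); return 1; }
	close(fd);

	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh) return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;

	int mount_id;
	if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}
	printf("handle: %u bytes, type %d\n", fh->handle_bytes, fh->handle_type);

	/* mount_fd may be any fd on the same tmpfs. */
	int mfd = open("/dev/shm", O_RDONLY);
	int fd2 = open_by_handle_at(mfd, fh, O_RDWR);
	if (fd2 < 0)
		perror("open_by_handle_at"); /* EPERM without the capability */
	free(fh);
	return 0;
}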
2385 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 2385 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2386 bool remount) 2386 bool remount)
2387 { 2387 {
2388 char *this_char, *value, *rest; 2388 char *this_char, *value, *rest;
2389 uid_t uid; 2389 uid_t uid;
2390 gid_t gid; 2390 gid_t gid;
2391 2391
2392 while (options != NULL) { 2392 while (options != NULL) {
2393 this_char = options; 2393 this_char = options;
2394 for (;;) { 2394 for (;;) {
2395 /* 2395 /*
2396 * NUL-terminate this option: unfortunately, 2396 * NUL-terminate this option: unfortunately,
2397 * mount options form a comma-separated list, 2397 * mount options form a comma-separated list,
2398 * but mpol's nodelist may also contain commas. 2398 * but mpol's nodelist may also contain commas.
2399 */ 2399 */
2400 options = strchr(options, ','); 2400 options = strchr(options, ',');
2401 if (options == NULL) 2401 if (options == NULL)
2402 break; 2402 break;
2403 options++; 2403 options++;
2404 if (!isdigit(*options)) { 2404 if (!isdigit(*options)) {
2405 options[-1] = '\0'; 2405 options[-1] = '\0';
2406 break; 2406 break;
2407 } 2407 }
2408 } 2408 }
2409 if (!*this_char) 2409 if (!*this_char)
2410 continue; 2410 continue;
2411 if ((value = strchr(this_char,'=')) != NULL) { 2411 if ((value = strchr(this_char,'=')) != NULL) {
2412 *value++ = 0; 2412 *value++ = 0;
2413 } else { 2413 } else {
2414 printk(KERN_ERR 2414 printk(KERN_ERR
2415 "tmpfs: No value for mount option '%s'\n", 2415 "tmpfs: No value for mount option '%s'\n",
2416 this_char); 2416 this_char);
2417 return 1; 2417 return 1;
2418 } 2418 }
2419 2419
2420 if (!strcmp(this_char,"size")) { 2420 if (!strcmp(this_char,"size")) {
2421 unsigned long long size; 2421 unsigned long long size;
2422 size = memparse(value,&rest); 2422 size = memparse(value,&rest);
2423 if (*rest == '%') { 2423 if (*rest == '%') {
2424 size <<= PAGE_SHIFT; 2424 size <<= PAGE_SHIFT;
2425 size *= totalram_pages; 2425 size *= totalram_pages;
2426 do_div(size, 100); 2426 do_div(size, 100);
2427 rest++; 2427 rest++;
2428 } 2428 }
2429 if (*rest) 2429 if (*rest)
2430 goto bad_val; 2430 goto bad_val;
2431 sbinfo->max_blocks = 2431 sbinfo->max_blocks =
2432 DIV_ROUND_UP(size, PAGE_CACHE_SIZE); 2432 DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2433 } else if (!strcmp(this_char,"nr_blocks")) { 2433 } else if (!strcmp(this_char,"nr_blocks")) {
2434 sbinfo->max_blocks = memparse(value, &rest); 2434 sbinfo->max_blocks = memparse(value, &rest);
2435 if (*rest) 2435 if (*rest)
2436 goto bad_val; 2436 goto bad_val;
2437 } else if (!strcmp(this_char,"nr_inodes")) { 2437 } else if (!strcmp(this_char,"nr_inodes")) {
2438 sbinfo->max_inodes = memparse(value, &rest); 2438 sbinfo->max_inodes = memparse(value, &rest);
2439 if (*rest) 2439 if (*rest)
2440 goto bad_val; 2440 goto bad_val;
2441 } else if (!strcmp(this_char,"mode")) { 2441 } else if (!strcmp(this_char,"mode")) {
2442 if (remount) 2442 if (remount)
2443 continue; 2443 continue;
2444 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 2444 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
2445 if (*rest) 2445 if (*rest)
2446 goto bad_val; 2446 goto bad_val;
2447 } else if (!strcmp(this_char,"uid")) { 2447 } else if (!strcmp(this_char,"uid")) {
2448 if (remount) 2448 if (remount)
2449 continue; 2449 continue;
2450 uid = simple_strtoul(value, &rest, 0); 2450 uid = simple_strtoul(value, &rest, 0);
2451 if (*rest) 2451 if (*rest)
2452 goto bad_val; 2452 goto bad_val;
2453 sbinfo->uid = make_kuid(current_user_ns(), uid); 2453 sbinfo->uid = make_kuid(current_user_ns(), uid);
2454 if (!uid_valid(sbinfo->uid)) 2454 if (!uid_valid(sbinfo->uid))
2455 goto bad_val; 2455 goto bad_val;
2456 } else if (!strcmp(this_char,"gid")) { 2456 } else if (!strcmp(this_char,"gid")) {
2457 if (remount) 2457 if (remount)
2458 continue; 2458 continue;
2459 gid = simple_strtoul(value, &rest, 0); 2459 gid = simple_strtoul(value, &rest, 0);
2460 if (*rest) 2460 if (*rest)
2461 goto bad_val; 2461 goto bad_val;
2462 sbinfo->gid = make_kgid(current_user_ns(), gid); 2462 sbinfo->gid = make_kgid(current_user_ns(), gid);
2463 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2464 goto bad_val; 2464 goto bad_val;
2465 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2466 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2466 if (mpol_parse_str(value, &sbinfo->mpol, 1))
2467 goto bad_val; 2467 goto bad_val;
2468 } else { 2468 } else {
2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2470 this_char); 2470 this_char);
2471 return 1; 2471 return 1;
2472 } 2472 }
2473 } 2473 }
2474 return 0; 2474 return 0;
2475 2475
2476 bad_val: 2476 bad_val:
2477 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2477 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2478 value, this_char); 2478 value, this_char);
2479 return 1; 2479 return 1;
2480 2480
2481 } 2481 }
2482 2482
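The option string parsed above is whatever arrives as the data argument of mount(2); a sketch, with the mount point and sizes assumed:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Comma-separated options in the form shmem_parse_options()
	 * expects; "size" takes k/m/g suffixes or a trailing '%'
	 * meaning a percentage of total RAM. */
	const char *opts = "size=64m,nr_inodes=4096,mode=1777";

	if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0, opts) < 0) {
		perror("mount");     /* needs CAP_SYS_ADMIN */
		return 1;
	}
	return 0;
}

Passing MS_REMOUNT in the flags reaches shmem_remount_fs() instead, which re-parses the options with remount=true so that mode/uid/gid changes are skipped.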
2483 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 2483 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2484 { 2484 {
2485 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2485 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2486 struct shmem_sb_info config = *sbinfo; 2486 struct shmem_sb_info config = *sbinfo;
2487 unsigned long inodes; 2487 unsigned long inodes;
2488 int error = -EINVAL; 2488 int error = -EINVAL;
2489 2489
2490 if (shmem_parse_options(data, &config, true)) 2490 if (shmem_parse_options(data, &config, true))
2491 return error; 2491 return error;
2492 2492
2493 spin_lock(&sbinfo->stat_lock); 2493 spin_lock(&sbinfo->stat_lock);
2494 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2494 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2495 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 2495 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2496 goto out; 2496 goto out;
2497 if (config.max_inodes < inodes) 2497 if (config.max_inodes < inodes)
2498 goto out; 2498 goto out;
2499 /* 2499 /*
2500 * Those tests disallow limited->unlimited while any are in use; 2500 * Those tests disallow limited->unlimited while any are in use;
2501 * but we must separately disallow unlimited->limited, because 2501 * but we must separately disallow unlimited->limited, because
2502 * in that case we have no record of how much is already in use. 2502 * in that case we have no record of how much is already in use.
2503 */ 2503 */
2504 if (config.max_blocks && !sbinfo->max_blocks) 2504 if (config.max_blocks && !sbinfo->max_blocks)
2505 goto out; 2505 goto out;
2506 if (config.max_inodes && !sbinfo->max_inodes) 2506 if (config.max_inodes && !sbinfo->max_inodes)
2507 goto out; 2507 goto out;
2508 2508
2509 error = 0; 2509 error = 0;
2510 sbinfo->max_blocks = config.max_blocks; 2510 sbinfo->max_blocks = config.max_blocks;
2511 sbinfo->max_inodes = config.max_inodes; 2511 sbinfo->max_inodes = config.max_inodes;
2512 sbinfo->free_inodes = config.max_inodes - inodes; 2512 sbinfo->free_inodes = config.max_inodes - inodes;
2513 2513
2514 mpol_put(sbinfo->mpol); 2514 mpol_put(sbinfo->mpol);
2515 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2515 sbinfo->mpol = config.mpol; /* transfers initial ref */
2516 out: 2516 out:
2517 spin_unlock(&sbinfo->stat_lock); 2517 spin_unlock(&sbinfo->stat_lock);
2518 return error; 2518 return error;
2519 } 2519 }
2520 2520
2521 static int shmem_show_options(struct seq_file *seq, struct dentry *root) 2521 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2522 { 2522 {
2523 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 2523 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
2524 2524
2525 if (sbinfo->max_blocks != shmem_default_max_blocks()) 2525 if (sbinfo->max_blocks != shmem_default_max_blocks())
2526 seq_printf(seq, ",size=%luk", 2526 seq_printf(seq, ",size=%luk",
2527 sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); 2527 sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
2528 if (sbinfo->max_inodes != shmem_default_max_inodes()) 2528 if (sbinfo->max_inodes != shmem_default_max_inodes())
2529 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 2529 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
2530 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 2530 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
2531 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 2531 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
2532 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 2532 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
2533 seq_printf(seq, ",uid=%u", 2533 seq_printf(seq, ",uid=%u",
2534 from_kuid_munged(&init_user_ns, sbinfo->uid)); 2534 from_kuid_munged(&init_user_ns, sbinfo->uid));
2535 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 2535 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
2536 seq_printf(seq, ",gid=%u", 2536 seq_printf(seq, ",gid=%u",
2537 from_kgid_munged(&init_user_ns, sbinfo->gid)); 2537 from_kgid_munged(&init_user_ns, sbinfo->gid));
2538 shmem_show_mpol(seq, sbinfo->mpol); 2538 shmem_show_mpol(seq, sbinfo->mpol);
2539 return 0; 2539 return 0;
2540 } 2540 }
2541 #endif /* CONFIG_TMPFS */ 2541 #endif /* CONFIG_TMPFS */
2542 2542
2543 static void shmem_put_super(struct super_block *sb) 2543 static void shmem_put_super(struct super_block *sb)
2544 { 2544 {
2545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2546 2546
2547 percpu_counter_destroy(&sbinfo->used_blocks); 2547 percpu_counter_destroy(&sbinfo->used_blocks);
2548 kfree(sbinfo); 2548 kfree(sbinfo);
2549 sb->s_fs_info = NULL; 2549 sb->s_fs_info = NULL;
2550 } 2550 }
2551 2551
2552 int shmem_fill_super(struct super_block *sb, void *data, int silent) 2552 int shmem_fill_super(struct super_block *sb, void *data, int silent)
2553 { 2553 {
2554 struct inode *inode; 2554 struct inode *inode;
2555 struct shmem_sb_info *sbinfo; 2555 struct shmem_sb_info *sbinfo;
2556 int err = -ENOMEM; 2556 int err = -ENOMEM;
2557 2557
2558 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2558 /* Round up to L1_CACHE_BYTES to resist false sharing */
2559 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 2559 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2560 L1_CACHE_BYTES), GFP_KERNEL); 2560 L1_CACHE_BYTES), GFP_KERNEL);
2561 if (!sbinfo) 2561 if (!sbinfo)
2562 return -ENOMEM; 2562 return -ENOMEM;
2563 2563
2564 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2564 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2565 sbinfo->uid = current_fsuid(); 2565 sbinfo->uid = current_fsuid();
2566 sbinfo->gid = current_fsgid(); 2566 sbinfo->gid = current_fsgid();
2567 sb->s_fs_info = sbinfo; 2567 sb->s_fs_info = sbinfo;
2568 2568
2569 #ifdef CONFIG_TMPFS 2569 #ifdef CONFIG_TMPFS
2570 /* 2570 /*
2571 * Per default we only allow half of the physical ram per 2571 * Per default we only allow half of the physical ram per
2572 * tmpfs instance, limiting inodes to one per page of lowmem; 2572 * tmpfs instance, limiting inodes to one per page of lowmem;
2573 * but the internal instance is left unlimited. 2573 * but the internal instance is left unlimited.
2574 */ 2574 */
2575 if (!(sb->s_flags & MS_NOUSER)) { 2575 if (!(sb->s_flags & MS_NOUSER)) {
2576 sbinfo->max_blocks = shmem_default_max_blocks(); 2576 sbinfo->max_blocks = shmem_default_max_blocks();
2577 sbinfo->max_inodes = shmem_default_max_inodes(); 2577 sbinfo->max_inodes = shmem_default_max_inodes();
2578 if (shmem_parse_options(data, sbinfo, false)) { 2578 if (shmem_parse_options(data, sbinfo, false)) {
2579 err = -EINVAL; 2579 err = -EINVAL;
2580 goto failed; 2580 goto failed;
2581 } 2581 }
2582 } 2582 }
2583 sb->s_export_op = &shmem_export_ops; 2583 sb->s_export_op = &shmem_export_ops;
2584 sb->s_flags |= MS_NOSEC; 2584 sb->s_flags |= MS_NOSEC;
2585 #else 2585 #else
2586 sb->s_flags |= MS_NOUSER; 2586 sb->s_flags |= MS_NOUSER;
2587 #endif 2587 #endif
2588 2588
2589 spin_lock_init(&sbinfo->stat_lock); 2589 spin_lock_init(&sbinfo->stat_lock);
2590 if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2590 if (percpu_counter_init(&sbinfo->used_blocks, 0))
2591 goto failed; 2591 goto failed;
2592 sbinfo->free_inodes = sbinfo->max_inodes; 2592 sbinfo->free_inodes = sbinfo->max_inodes;
2593 2593
2594 sb->s_maxbytes = MAX_LFS_FILESIZE; 2594 sb->s_maxbytes = MAX_LFS_FILESIZE;
2595 sb->s_blocksize = PAGE_CACHE_SIZE; 2595 sb->s_blocksize = PAGE_CACHE_SIZE;
2596 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2596 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2597 sb->s_magic = TMPFS_MAGIC; 2597 sb->s_magic = TMPFS_MAGIC;
2598 sb->s_op = &shmem_ops; 2598 sb->s_op = &shmem_ops;
2599 sb->s_time_gran = 1; 2599 sb->s_time_gran = 1;
2600 #ifdef CONFIG_TMPFS_XATTR 2600 #ifdef CONFIG_TMPFS_XATTR
2601 sb->s_xattr = shmem_xattr_handlers; 2601 sb->s_xattr = shmem_xattr_handlers;
2602 #endif 2602 #endif
2603 #ifdef CONFIG_TMPFS_POSIX_ACL 2603 #ifdef CONFIG_TMPFS_POSIX_ACL
2604 sb->s_flags |= MS_POSIXACL; 2604 sb->s_flags |= MS_POSIXACL;
2605 #endif 2605 #endif
2606 2606
2607 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 2607 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
2608 if (!inode) 2608 if (!inode)
2609 goto failed; 2609 goto failed;
2610 inode->i_uid = sbinfo->uid; 2610 inode->i_uid = sbinfo->uid;
2611 inode->i_gid = sbinfo->gid; 2611 inode->i_gid = sbinfo->gid;
2612 sb->s_root = d_make_root(inode); 2612 sb->s_root = d_make_root(inode);
2613 if (!sb->s_root) 2613 if (!sb->s_root)
2614 goto failed; 2614 goto failed;
2615 return 0; 2615 return 0;
2616 2616
2617 failed: 2617 failed:
2618 shmem_put_super(sb); 2618 shmem_put_super(sb);
2619 return err; 2619 return err;
2620 } 2620 }
2621 2621
2622 static struct kmem_cache *shmem_inode_cachep; 2622 static struct kmem_cache *shmem_inode_cachep;
2623 2623
2624 static struct inode *shmem_alloc_inode(struct super_block *sb) 2624 static struct inode *shmem_alloc_inode(struct super_block *sb)
2625 { 2625 {
2626 struct shmem_inode_info *info; 2626 struct shmem_inode_info *info;
2627 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2627 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2628 if (!info) 2628 if (!info)
2629 return NULL; 2629 return NULL;
2630 return &info->vfs_inode; 2630 return &info->vfs_inode;
2631 } 2631 }
2632 2632
2633 static void shmem_destroy_callback(struct rcu_head *head) 2633 static void shmem_destroy_callback(struct rcu_head *head)
2634 { 2634 {
2635 struct inode *inode = container_of(head, struct inode, i_rcu); 2635 struct inode *inode = container_of(head, struct inode, i_rcu);
2636 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2636 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2637 } 2637 }
2638 2638
2639 static void shmem_destroy_inode(struct inode *inode) 2639 static void shmem_destroy_inode(struct inode *inode)
2640 { 2640 {
2641 if (S_ISREG(inode->i_mode)) 2641 if (S_ISREG(inode->i_mode))
2642 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2642 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2643 call_rcu(&inode->i_rcu, shmem_destroy_callback); 2643 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2644 } 2644 }
2645 2645
2646 static void shmem_init_inode(void *foo) 2646 static void shmem_init_inode(void *foo)
2647 { 2647 {
2648 struct shmem_inode_info *info = foo; 2648 struct shmem_inode_info *info = foo;
2649 inode_init_once(&info->vfs_inode); 2649 inode_init_once(&info->vfs_inode);
2650 } 2650 }
2651 2651
2652 static int shmem_init_inodecache(void) 2652 static int shmem_init_inodecache(void)
2653 { 2653 {
2654 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2654 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2655 sizeof(struct shmem_inode_info), 2655 sizeof(struct shmem_inode_info),
2656 0, SLAB_PANIC, shmem_init_inode); 2656 0, SLAB_PANIC, shmem_init_inode);
2657 return 0; 2657 return 0;
2658 } 2658 }
2659 2659
2660 static void shmem_destroy_inodecache(void) 2660 static void shmem_destroy_inodecache(void)
2661 { 2661 {
2662 kmem_cache_destroy(shmem_inode_cachep); 2662 kmem_cache_destroy(shmem_inode_cachep);
2663 } 2663 }
2664 2664
2665 static const struct address_space_operations shmem_aops = { 2665 static const struct address_space_operations shmem_aops = {
2666 .writepage = shmem_writepage, 2666 .writepage = shmem_writepage,
2667 .set_page_dirty = __set_page_dirty_no_writeback, 2667 .set_page_dirty = __set_page_dirty_no_writeback,
2668 #ifdef CONFIG_TMPFS 2668 #ifdef CONFIG_TMPFS
2669 .write_begin = shmem_write_begin, 2669 .write_begin = shmem_write_begin,
2670 .write_end = shmem_write_end, 2670 .write_end = shmem_write_end,
2671 #endif 2671 #endif
2672 .migratepage = migrate_page, 2672 .migratepage = migrate_page,
2673 .error_remove_page = generic_error_remove_page, 2673 .error_remove_page = generic_error_remove_page,
2674 }; 2674 };
2675 2675
2676 static const struct file_operations shmem_file_operations = { 2676 static const struct file_operations shmem_file_operations = {
2677 .mmap = shmem_mmap, 2677 .mmap = shmem_mmap,
2678 #ifdef CONFIG_TMPFS 2678 #ifdef CONFIG_TMPFS
2679 .llseek = shmem_file_llseek, 2679 .llseek = shmem_file_llseek,
2680 .read = do_sync_read, 2680 .read = do_sync_read,
2681 .write = do_sync_write, 2681 .write = do_sync_write,
2682 .aio_read = shmem_file_aio_read, 2682 .aio_read = shmem_file_aio_read,
2683 .aio_write = generic_file_aio_write, 2683 .aio_write = generic_file_aio_write,
2684 .fsync = noop_fsync, 2684 .fsync = noop_fsync,
2685 .splice_read = shmem_file_splice_read, 2685 .splice_read = shmem_file_splice_read,
2686 .splice_write = generic_file_splice_write, 2686 .splice_write = generic_file_splice_write,
2687 .fallocate = shmem_fallocate, 2687 .fallocate = shmem_fallocate,
2688 #endif 2688 #endif
2689 }; 2689 };
2690 2690
2691 static const struct inode_operations shmem_inode_operations = { 2691 static const struct inode_operations shmem_inode_operations = {
2692 .setattr = shmem_setattr, 2692 .setattr = shmem_setattr,
2693 #ifdef CONFIG_TMPFS_XATTR 2693 #ifdef CONFIG_TMPFS_XATTR
2694 .setxattr = shmem_setxattr, 2694 .setxattr = shmem_setxattr,
2695 .getxattr = shmem_getxattr, 2695 .getxattr = shmem_getxattr,
2696 .listxattr = shmem_listxattr, 2696 .listxattr = shmem_listxattr,
2697 .removexattr = shmem_removexattr, 2697 .removexattr = shmem_removexattr,
2698 #endif 2698 #endif
2699 }; 2699 };
2700 2700
2701 static const struct inode_operations shmem_dir_inode_operations = { 2701 static const struct inode_operations shmem_dir_inode_operations = {
2702 #ifdef CONFIG_TMPFS 2702 #ifdef CONFIG_TMPFS
2703 .create = shmem_create, 2703 .create = shmem_create,
2704 .lookup = simple_lookup, 2704 .lookup = simple_lookup,
2705 .link = shmem_link, 2705 .link = shmem_link,
2706 .unlink = shmem_unlink, 2706 .unlink = shmem_unlink,
2707 .symlink = shmem_symlink, 2707 .symlink = shmem_symlink,
2708 .mkdir = shmem_mkdir, 2708 .mkdir = shmem_mkdir,
2709 .rmdir = shmem_rmdir, 2709 .rmdir = shmem_rmdir,
2710 .mknod = shmem_mknod, 2710 .mknod = shmem_mknod,
2711 .rename = shmem_rename, 2711 .rename = shmem_rename,
2712 #endif 2712 #endif
2713 #ifdef CONFIG_TMPFS_XATTR 2713 #ifdef CONFIG_TMPFS_XATTR
2714 .setxattr = shmem_setxattr, 2714 .setxattr = shmem_setxattr,
2715 .getxattr = shmem_getxattr, 2715 .getxattr = shmem_getxattr,
2716 .listxattr = shmem_listxattr, 2716 .listxattr = shmem_listxattr,
2717 .removexattr = shmem_removexattr, 2717 .removexattr = shmem_removexattr,
2718 #endif 2718 #endif
2719 #ifdef CONFIG_TMPFS_POSIX_ACL 2719 #ifdef CONFIG_TMPFS_POSIX_ACL
2720 .setattr = shmem_setattr, 2720 .setattr = shmem_setattr,
2721 #endif 2721 #endif
2722 }; 2722 };
2723 2723
2724 static const struct inode_operations shmem_special_inode_operations = { 2724 static const struct inode_operations shmem_special_inode_operations = {
2725 #ifdef CONFIG_TMPFS_XATTR 2725 #ifdef CONFIG_TMPFS_XATTR
2726 .setxattr = shmem_setxattr, 2726 .setxattr = shmem_setxattr,
2727 .getxattr = shmem_getxattr, 2727 .getxattr = shmem_getxattr,
2728 .listxattr = shmem_listxattr, 2728 .listxattr = shmem_listxattr,
2729 .removexattr = shmem_removexattr, 2729 .removexattr = shmem_removexattr,
2730 #endif 2730 #endif
2731 #ifdef CONFIG_TMPFS_POSIX_ACL 2731 #ifdef CONFIG_TMPFS_POSIX_ACL
2732 .setattr = shmem_setattr, 2732 .setattr = shmem_setattr,
2733 #endif 2733 #endif
2734 }; 2734 };
2735 2735
2736 static const struct super_operations shmem_ops = { 2736 static const struct super_operations shmem_ops = {
2737 .alloc_inode = shmem_alloc_inode, 2737 .alloc_inode = shmem_alloc_inode,
2738 .destroy_inode = shmem_destroy_inode, 2738 .destroy_inode = shmem_destroy_inode,
2739 #ifdef CONFIG_TMPFS 2739 #ifdef CONFIG_TMPFS
2740 .statfs = shmem_statfs, 2740 .statfs = shmem_statfs,
2741 .remount_fs = shmem_remount_fs, 2741 .remount_fs = shmem_remount_fs,
2742 .show_options = shmem_show_options, 2742 .show_options = shmem_show_options,
2743 #endif 2743 #endif
2744 .evict_inode = shmem_evict_inode, 2744 .evict_inode = shmem_evict_inode,
2745 .drop_inode = generic_delete_inode, 2745 .drop_inode = generic_delete_inode,
2746 .put_super = shmem_put_super, 2746 .put_super = shmem_put_super,
2747 }; 2747 };
2748 2748
2749 static const struct vm_operations_struct shmem_vm_ops = { 2749 static const struct vm_operations_struct shmem_vm_ops = {
2750 .fault = shmem_fault, 2750 .fault = shmem_fault,
2751 #ifdef CONFIG_NUMA 2751 #ifdef CONFIG_NUMA
2752 .set_policy = shmem_set_policy, 2752 .set_policy = shmem_set_policy,
2753 .get_policy = shmem_get_policy, 2753 .get_policy = shmem_get_policy,
2754 #endif 2754 #endif
2755 .remap_pages = generic_file_remap_pages, 2755 .remap_pages = generic_file_remap_pages,
2756 }; 2756 };
2757 2757
2758 static struct dentry *shmem_mount(struct file_system_type *fs_type, 2758 static struct dentry *shmem_mount(struct file_system_type *fs_type,
2759 int flags, const char *dev_name, void *data) 2759 int flags, const char *dev_name, void *data)
2760 { 2760 {
2761 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2761 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2762 } 2762 }
2763 2763
2764 static struct file_system_type shmem_fs_type = { 2764 static struct file_system_type shmem_fs_type = {
2765 .owner = THIS_MODULE, 2765 .owner = THIS_MODULE,
2766 .name = "tmpfs", 2766 .name = "tmpfs",
2767 .mount = shmem_mount, 2767 .mount = shmem_mount,
2768 .kill_sb = kill_litter_super, 2768 .kill_sb = kill_litter_super,
2769 }; 2769 };
2770 2770
2771 int __init shmem_init(void) 2771 int __init shmem_init(void)
2772 { 2772 {
2773 int error; 2773 int error;
2774 2774
2775 error = bdi_init(&shmem_backing_dev_info); 2775 error = bdi_init(&shmem_backing_dev_info);
2776 if (error) 2776 if (error)
2777 goto out4; 2777 goto out4;
2778 2778
2779 error = shmem_init_inodecache(); 2779 error = shmem_init_inodecache();
2780 if (error) 2780 if (error)
2781 goto out3; 2781 goto out3;
2782 2782
2783 error = register_filesystem(&shmem_fs_type); 2783 error = register_filesystem(&shmem_fs_type);
2784 if (error) { 2784 if (error) {
2785 printk(KERN_ERR "Could not register tmpfs\n"); 2785 printk(KERN_ERR "Could not register tmpfs\n");
2786 goto out2; 2786 goto out2;
2787 } 2787 }
2788 2788
2789 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, 2789 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2790 shmem_fs_type.name, NULL); 2790 shmem_fs_type.name, NULL);
2791 if (IS_ERR(shm_mnt)) { 2791 if (IS_ERR(shm_mnt)) {
2792 error = PTR_ERR(shm_mnt); 2792 error = PTR_ERR(shm_mnt);
2793 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2793 printk(KERN_ERR "Could not kern_mount tmpfs\n");
2794 goto out1; 2794 goto out1;
2795 } 2795 }
2796 return 0; 2796 return 0;
2797 2797
2798 out1: 2798 out1:
2799 unregister_filesystem(&shmem_fs_type); 2799 unregister_filesystem(&shmem_fs_type);
2800 out2: 2800 out2:
2801 shmem_destroy_inodecache(); 2801 shmem_destroy_inodecache();
2802 out3: 2802 out3:
2803 bdi_destroy(&shmem_backing_dev_info); 2803 bdi_destroy(&shmem_backing_dev_info);
2804 out4: 2804 out4:
2805 shm_mnt = ERR_PTR(error); 2805 shm_mnt = ERR_PTR(error);
2806 return error; 2806 return error;
2807 } 2807 }
2808 2808
2809 #else /* !CONFIG_SHMEM */ 2809 #else /* !CONFIG_SHMEM */
2810 2810
2811 /* 2811 /*
2812 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 2812 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2813 * 2813 *
2814 * This is intended for small systems where the benefits of the full 2814 * This is intended for small systems where the benefits of the full
2815 * shmem code (swap-backed and resource-limited) are outweighed by 2815 * shmem code (swap-backed and resource-limited) are outweighed by
2816 * its complexity. On systems without swap this code should be 2816 * its complexity. On systems without swap this code should be
2817 * effectively equivalent, but much lighter weight. 2817 * effectively equivalent, but much lighter weight.
2818 */ 2818 */
2819 2819
2820 #include <linux/ramfs.h> 2820 #include <linux/ramfs.h>
2821 2821
2822 static struct file_system_type shmem_fs_type = { 2822 static struct file_system_type shmem_fs_type = {
2823 .name = "tmpfs", 2823 .name = "tmpfs",
2824 .mount = ramfs_mount, 2824 .mount = ramfs_mount,
2825 .kill_sb = kill_litter_super, 2825 .kill_sb = kill_litter_super,
2826 }; 2826 };
2827 2827
2828 int __init shmem_init(void) 2828 int __init shmem_init(void)
2829 { 2829 {
2830 BUG_ON(register_filesystem(&shmem_fs_type) != 0); 2830 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2831 2831
2832 shm_mnt = kern_mount(&shmem_fs_type); 2832 shm_mnt = kern_mount(&shmem_fs_type);
2833 BUG_ON(IS_ERR(shm_mnt)); 2833 BUG_ON(IS_ERR(shm_mnt));
2834 2834
2835 return 0; 2835 return 0;
2836 } 2836 }
2837 2837
2838 int shmem_unuse(swp_entry_t swap, struct page *page) 2838 int shmem_unuse(swp_entry_t swap, struct page *page)
2839 { 2839 {
2840 return 0; 2840 return 0;
2841 } 2841 }
2842 2842
2843 int shmem_lock(struct file *file, int lock, struct user_struct *user) 2843 int shmem_lock(struct file *file, int lock, struct user_struct *user)
2844 { 2844 {
2845 return 0; 2845 return 0;
2846 } 2846 }
2847 2847
2848 void shmem_unlock_mapping(struct address_space *mapping) 2848 void shmem_unlock_mapping(struct address_space *mapping)
2849 { 2849 {
2850 } 2850 }
2851 2851
2852 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 2852 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2853 { 2853 {
2854 truncate_inode_pages_range(inode->i_mapping, lstart, lend); 2854 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2855 } 2855 }
2856 EXPORT_SYMBOL_GPL(shmem_truncate_range); 2856 EXPORT_SYMBOL_GPL(shmem_truncate_range);
2857 2857
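/*
 * With !CONFIG_SHMEM the shmem entry points used by the common code below
 * are plain aliases: faults and file operations fall through to the generic
 * and ramfs implementations, and size accounting becomes a no-op.
 */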
2858 #define shmem_vm_ops generic_file_vm_ops 2858 #define shmem_vm_ops generic_file_vm_ops
2859 #define shmem_file_operations ramfs_file_operations 2859 #define shmem_file_operations ramfs_file_operations
2860 #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2860 #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2861 #define shmem_acct_size(flags, size) 0 2861 #define shmem_acct_size(flags, size) 0
2862 #define shmem_unacct_size(flags, size) do {} while (0) 2862 #define shmem_unacct_size(flags, size) do {} while (0)
2863 2863
2864 #endif /* CONFIG_SHMEM */ 2864 #endif /* CONFIG_SHMEM */
2865 2865
2866 /* common code */ 2866 /* common code */
2867 2867
2868 /** 2868 /**
2869 * shmem_file_setup - get an unlinked file living in tmpfs 2869 * shmem_file_setup - get an unlinked file living in tmpfs
2870 * @name: name for dentry (to be seen in /proc/<pid>/maps) 2870 * @name: name for dentry (to be seen in /proc/<pid>/maps)
2871 * @size: size to be set for the file 2871 * @size: size to be set for the file
2872 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2872 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2873 */ 2873 */
2874 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 2874 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2875 { 2875 {
2876 int error; 2876 int error;
2877 struct file *file; 2877 struct file *file;
2878 struct inode *inode; 2878 struct inode *inode;
2879 struct path path; 2879 struct path path;
2880 struct dentry *root; 2880 struct dentry *root;
2881 struct qstr this; 2881 struct qstr this;
2882 2882
2883 if (IS_ERR(shm_mnt)) 2883 if (IS_ERR(shm_mnt))
2884 return (void *)shm_mnt; 2884 return (void *)shm_mnt;
2885 2885
2886 if (size < 0 || size > MAX_LFS_FILESIZE) 2886 if (size < 0 || size > MAX_LFS_FILESIZE)
2887 return ERR_PTR(-EINVAL); 2887 return ERR_PTR(-EINVAL);
2888 2888
2889 if (shmem_acct_size(flags, size)) 2889 if (shmem_acct_size(flags, size))
2890 return ERR_PTR(-ENOMEM); 2890 return ERR_PTR(-ENOMEM);
2891 2891
2892 error = -ENOMEM; 2892 error = -ENOMEM;
2893 this.name = name; 2893 this.name = name;
2894 this.len = strlen(name); 2894 this.len = strlen(name);
2895 this.hash = 0; /* will go */ 2895 this.hash = 0; /* will go */
2896 root = shm_mnt->mnt_root; 2896 root = shm_mnt->mnt_root;
2897 path.dentry = d_alloc(root, &this); 2897 path.dentry = d_alloc(root, &this);
2898 if (!path.dentry) 2898 if (!path.dentry)
2899 goto put_memory; 2899 goto put_memory;
2900 path.mnt = mntget(shm_mnt); 2900 path.mnt = mntget(shm_mnt);
2901 2901
2902 error = -ENOSPC; 2902 error = -ENOSPC;
2903 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 2903 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2904 if (!inode) 2904 if (!inode)
2905 goto put_dentry; 2905 goto put_dentry;
2906 2906
2907 d_instantiate(path.dentry, inode); 2907 d_instantiate(path.dentry, inode);
2908 inode->i_size = size; 2908 inode->i_size = size;
2909 clear_nlink(inode); /* It is unlinked */ 2909 clear_nlink(inode); /* It is unlinked */
2910 #ifndef CONFIG_MMU 2910 #ifndef CONFIG_MMU
2911 error = ramfs_nommu_expand_for_mapping(inode, size); 2911 error = ramfs_nommu_expand_for_mapping(inode, size);
2912 if (error) 2912 if (error)
2913 goto put_dentry; 2913 goto put_dentry;
2914 #endif 2914 #endif
2915 2915
2916 error = -ENFILE; 2916 error = -ENFILE;
2917 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 2917 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2918 &shmem_file_operations); 2918 &shmem_file_operations);
2919 if (!file) 2919 if (!file)
2920 goto put_dentry; 2920 goto put_dentry;
2921 2921
2922 return file; 2922 return file;
2923 2923
2924 put_dentry: 2924 put_dentry:
2925 path_put(&path); 2925 path_put(&path);
2926 put_memory: 2926 put_memory:
2927 shmem_unacct_size(flags, size); 2927 shmem_unacct_size(flags, size);
2928 return ERR_PTR(error); 2928 return ERR_PTR(error);
2929 } 2929 }
2930 EXPORT_SYMBOL_GPL(shmem_file_setup); 2930 EXPORT_SYMBOL_GPL(shmem_file_setup);
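A minimal sketch of a hypothetical in-kernel caller (illustrative only, not part of this commit) shows the contract documented above: the returned file is already unlinked and sized, errors come back as ERR_PTR(), and the only cleanup needed is fput().

/* Illustrative only: create an unlinked tmpfs file, use it, drop it. */
static int shmem_file_setup_example(void)
{
	struct file *file;

	/* name appears in /proc/<pid>/maps; VM_NORESERVE skips pre-accounting */
	file = shmem_file_setup("example", PAGE_SIZE, VM_NORESERVE);
	if (IS_ERR(file))
		return PTR_ERR(file);	/* -EINVAL, -ENOMEM, -ENOSPC or -ENFILE */

	/* ... read, write or mmap through file->f_mapping ... */

	fput(file);	/* last reference gone: the unlinked inode is freed */
	return 0;
}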
2931 2931
2932 /** 2932 /**
2933 * shmem_zero_setup - setup a shared anonymous mapping 2933 * shmem_zero_setup - setup a shared anonymous mapping
2934 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff 2934 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff
2935 */ 2935 */
2936 int shmem_zero_setup(struct vm_area_struct *vma) 2936 int shmem_zero_setup(struct vm_area_struct *vma)
2937 { 2937 {
2938 struct file *file; 2938 struct file *file;
2939 loff_t size = vma->vm_end - vma->vm_start; 2939 loff_t size = vma->vm_end - vma->vm_start;
2940 2940
2941 file = shmem_file_setup("dev/zero", size, vma->vm_flags); 2941 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2942 if (IS_ERR(file)) 2942 if (IS_ERR(file))
2943 return PTR_ERR(file); 2943 return PTR_ERR(file);
2944 2944
2945 if (vma->vm_file) 2945 if (vma->vm_file)
2946 fput(vma->vm_file); 2946 fput(vma->vm_file);
2947 vma->vm_file = file; 2947 vma->vm_file = file;
2948 vma->vm_ops = &shmem_vm_ops; 2948 vma->vm_ops = &shmem_vm_ops;
2949 return 0; 2949 return 0;
2950 } 2950 }
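For orientation, a rough sketch of the caller's side (variable and label names invented here; the real logic lives in mm/mmap.c): a MAP_SHARED anonymous mapping arrives with no backing file, so the vma is given its tmpfs file before being inserted into the address space.

	/* hypothetical excerpt from the mmap path */
	if (!file && (vm_flags & VM_SHARED)) {
		error = shmem_zero_setup(vma);	/* attaches "dev/zero" file and shmem_vm_ops */
		if (error)
			goto free_vma;		/* invented cleanup label */
	}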
2951 2951
2952 /** 2952 /**
2953 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 2953 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
2954 * @mapping: the page's address_space 2954 * @mapping: the page's address_space
2955 * @index: the page index 2955 * @index: the page index
2956 * @gfp: the page allocator flags to use if allocating 2956 * @gfp: the page allocator flags to use if allocating
2957 * 2957 *
2958 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 2958 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
2959 * with any new page allocations done using the specified allocation flags. 2959 * with any new page allocations done using the specified allocation flags.
2960 * But read_cache_page_gfp() uses the ->readpage() method, which does not 2960 * But read_cache_page_gfp() uses the ->readpage() method, which does not
2961 * suit tmpfs, since it may have pages in swapcache, and needs to find those 2961 * suit tmpfs, since it may have pages in swapcache, and needs to find those
2962 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 2962 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
2963 * 2963 *
2964 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 2964 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
2965 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 2965 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
2966 */ 2966 */
2967 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 2967 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
2968 pgoff_t index, gfp_t gfp) 2968 pgoff_t index, gfp_t gfp)
2969 { 2969 {
2970 #ifdef CONFIG_SHMEM 2970 #ifdef CONFIG_SHMEM
2971 struct inode *inode = mapping->host; 2971 struct inode *inode = mapping->host;
2972 struct page *page; 2972 struct page *page;
2973 int error; 2973 int error;
2974 2974
2975 BUG_ON(mapping->a_ops != &shmem_aops); 2975 BUG_ON(mapping->a_ops != &shmem_aops);
2976 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); 2976 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
2977 if (error) 2977 if (error)
2978 page = ERR_PTR(error); 2978 page = ERR_PTR(error);
2979 else 2979 else
2980 unlock_page(page); 2980 unlock_page(page);
2981 return page; 2981 return page;
2982 #else 2982 #else
2983 /* 2983 /*
2984 * The tiny !SHMEM case uses ramfs without swap 2984 * The tiny !SHMEM case uses ramfs without swap
2985 */ 2985 */
2986 return read_cache_page_gfp(mapping, index, gfp); 2986 return read_cache_page_gfp(mapping, index, gfp);
2987 #endif 2987 #endif
2988 } 2988 }
2989 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 2989 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
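The driver-side pattern mentioned in the comment above can be sketched as follows (hypothetical helper name; mapping_gfp_mask() and the gfp flags are existing kernel interfaces): soften the mapping's allocation mask so that memory pressure produces a soft failure the driver can handle, rather than retries or an OOM kill.

/* Illustrative only: read one object page through shmem, failing softly. */
static struct page *example_read_shmem_page(struct address_space *mapping,
					    pgoff_t index)
{
	/* no retries, no OOM killer, no allocation-failure warning */
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	/* returns the page unlocked with a reference held, or ERR_PTR() */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}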
2990 2990