Commit 965c8e59cfcf845ecde2265a1d1bfee5f011d302

Authored by Andrew Morton
Committed by Linus Torvalds
1 parent c0f041602c

lseek: the "whence" argument is called "whence"

But the kernel decided to call it "origin" instead.  Fix most of the
sites.

Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 26 changed files with 116 additions and 116 deletions
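For reference, the userspace prototype the commit title alludes to (see lseek(2)) has always named the third argument "whence", and the VFS ->llseek hook mirrors it; the renames below bring the kernel's implementations in line:

/* Userspace, per POSIX / <unistd.h>: */
off_t lseek(int fd, off_t offset, int whence);

/* Kernel VFS hook in struct file_operations (signature as of this commit): */
loff_t (*llseek) (struct file *, loff_t, int);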

--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c

/*
 * linux/fs/bad_inode.c
 *
 * Copyright (C) 1997, Stephen Tweedie
 *
 * Provide stub functions for unreadable inodes
 *
 * Fabian Frederick : August 2003 - All file operations assigned to EIO
 */

#include <linux/fs.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/time.h>
#include <linux/namei.h>
#include <linux/poll.h>


-static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
{
	return -EIO;
}

static ssize_t bad_file_read(struct file *filp, char __user *buf,
			size_t size, loff_t *ppos)
{
	return -EIO;
}

static ssize_t bad_file_write(struct file *filp, const char __user *buf,
			size_t siz, loff_t *ppos)
{
	return -EIO;
}

static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
			unsigned long nr_segs, loff_t pos)
{
	return -EIO;
}

static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
			unsigned long nr_segs, loff_t pos)
{
	return -EIO;
}

static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	return -EIO;
}

static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
{
	return POLLERR;
}

static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
			unsigned long arg)
{
	return -EIO;
}

static long bad_file_compat_ioctl(struct file *file, unsigned int cmd,
			unsigned long arg)
{
	return -EIO;
}

static int bad_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -EIO;
}

static int bad_file_open(struct inode *inode, struct file *filp)
{
	return -EIO;
}

static int bad_file_flush(struct file *file, fl_owner_t id)
{
	return -EIO;
}

static int bad_file_release(struct inode *inode, struct file *filp)
{
	return -EIO;
}

static int bad_file_fsync(struct file *file, loff_t start, loff_t end,
			int datasync)
{
	return -EIO;
}

static int bad_file_aio_fsync(struct kiocb *iocb, int datasync)
{
	return -EIO;
}

static int bad_file_fasync(int fd, struct file *filp, int on)
{
	return -EIO;
}

static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl)
{
	return -EIO;
}

static ssize_t bad_file_sendpage(struct file *file, struct page *page,
			int off, size_t len, loff_t *pos, int more)
{
	return -EIO;
}

static unsigned long bad_file_get_unmapped_area(struct file *file,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags)
{
	return -EIO;
}

static int bad_file_check_flags(int flags)
{
	return -EIO;
}

static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
{
	return -EIO;
}

static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe,
			struct file *out, loff_t *ppos, size_t len,
			unsigned int flags)
{
	return -EIO;
}

static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	return -EIO;
}

static const struct file_operations bad_file_ops =
{
	.llseek		= bad_file_llseek,
	.read		= bad_file_read,
	.write		= bad_file_write,
	.aio_read	= bad_file_aio_read,
	.aio_write	= bad_file_aio_write,
	.readdir	= bad_file_readdir,
	.poll		= bad_file_poll,
	.unlocked_ioctl	= bad_file_unlocked_ioctl,
	.compat_ioctl	= bad_file_compat_ioctl,
	.mmap		= bad_file_mmap,
	.open		= bad_file_open,
	.flush		= bad_file_flush,
	.release	= bad_file_release,
	.fsync		= bad_file_fsync,
	.aio_fsync	= bad_file_aio_fsync,
	.fasync		= bad_file_fasync,
	.lock		= bad_file_lock,
	.sendpage	= bad_file_sendpage,
	.get_unmapped_area = bad_file_get_unmapped_area,
	.check_flags	= bad_file_check_flags,
	.flock		= bad_file_flock,
	.splice_write	= bad_file_splice_write,
	.splice_read	= bad_file_splice_read,
};

static int bad_inode_create (struct inode *dir, struct dentry *dentry,
		umode_t mode, bool excl)
{
	return -EIO;
}

static struct dentry *bad_inode_lookup(struct inode *dir,
			struct dentry *dentry, unsigned int flags)
{
	return ERR_PTR(-EIO);
}

static int bad_inode_link (struct dentry *old_dentry, struct inode *dir,
		struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_unlink(struct inode *dir, struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
		const char *symname)
{
	return -EIO;
}

static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
			umode_t mode)
{
	return -EIO;
}

static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
			umode_t mode, dev_t rdev)
{
	return -EIO;
}

static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry,
		struct inode *new_dir, struct dentry *new_dentry)
{
	return -EIO;
}

static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
		int buflen)
{
	return -EIO;
}

static int bad_inode_permission(struct inode *inode, int mask)
{
	return -EIO;
}

static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
			struct kstat *stat)
{
	return -EIO;
}

static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
{
	return -EIO;
}

static int bad_inode_setxattr(struct dentry *dentry, const char *name,
		const void *value, size_t size, int flags)
{
	return -EIO;
}

static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name,
			void *buffer, size_t size)
{
	return -EIO;
}

static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer,
			size_t buffer_size)
{
	return -EIO;
}

static int bad_inode_removexattr(struct dentry *dentry, const char *name)
{
	return -EIO;
}

static const struct inode_operations bad_inode_ops =
{
	.create		= bad_inode_create,
	.lookup		= bad_inode_lookup,
	.link		= bad_inode_link,
	.unlink		= bad_inode_unlink,
	.symlink	= bad_inode_symlink,
	.mkdir		= bad_inode_mkdir,
	.rmdir		= bad_inode_rmdir,
	.mknod		= bad_inode_mknod,
	.rename		= bad_inode_rename,
	.readlink	= bad_inode_readlink,
	/* follow_link must be no-op, otherwise unmounting this inode
	   won't work */
	/* put_link returns void */
	/* truncate returns void */
	.permission	= bad_inode_permission,
	.getattr	= bad_inode_getattr,
	.setattr	= bad_inode_setattr,
	.setxattr	= bad_inode_setxattr,
	.getxattr	= bad_inode_getxattr,
	.listxattr	= bad_inode_listxattr,
	.removexattr	= bad_inode_removexattr,
};


/*
 * When a filesystem is unable to read an inode due to an I/O error in
 * its read_inode() function, it can call make_bad_inode() to return a
 * set of stubs which will return EIO errors as required.
 *
 * We only need to do limited initialisation: all other fields are
 * preinitialised to zero automatically.
 */

/**
 * make_bad_inode - mark an inode bad due to an I/O error
 * @inode: Inode to mark bad
 *
 * When an inode cannot be read due to a media or remote network
 * failure this function makes the inode "bad" and causes I/O operations
 * on it to fail from this point on.
 */

void make_bad_inode(struct inode *inode)
{
	remove_inode_hash(inode);

	inode->i_mode = S_IFREG;
	inode->i_atime = inode->i_mtime = inode->i_ctime =
		current_fs_time(inode->i_sb);
	inode->i_op = &bad_inode_ops;
	inode->i_fop = &bad_file_ops;
}
EXPORT_SYMBOL(make_bad_inode);

/*
 * This tests whether an inode has been flagged as bad. The test uses
 * &bad_inode_ops to cover the case of invalidated inodes as well as
 * those created by make_bad_inode() above.
 */

/**
 * is_bad_inode - is an inode errored
 * @inode: inode to test
 *
 * Returns true if the inode in question has been marked as bad.
 */

int is_bad_inode(struct inode *inode)
{
	return (inode->i_op == &bad_inode_ops);
}

EXPORT_SYMBOL(is_bad_inode);

/**
 * iget_failed - Mark an under-construction inode as dead and release it
 * @inode: The inode to discard
 *
 * Mark an under-construction inode as dead and release it.
 */
void iget_failed(struct inode *inode)
{
	make_bad_inode(inode);
	unlock_new_inode(inode);
	iput(inode);
}
EXPORT_SYMBOL(iget_failed);
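For context, the usual caller of iget_failed() is a filesystem's inode-lookup helper: it reads the on-disk inode and, on I/O failure, hands the half-constructed inode back, which marks it bad, unlocks it, and drops it. A minimal sketch of that pattern; foo_iget() and foo_read_inode() are hypothetical names, not part of this commit:

struct inode *foo_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);		/* cached or new, locked, inode */
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;			/* already read in earlier */

	if (foo_read_inode(inode)) {		/* hypothetical on-disk read */
		iget_failed(inode);		/* make_bad_inode + unlock + iput */
		return ERR_PTR(-EIO);
	}
	unlock_new_inode(inode);
	return inode;
}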
--- a/fs/block_dev.c
+++ b/fs/block_dev.c

/*
 * linux/fs/block_dev.c
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static const struct address_space_operations def_blk_aops;

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

/*
 * Move the inode from its current bdi to a new bdi. If the inode is dirty we
 * need to move it onto the dirty list of @dst so that the inode is always on
 * the right list.
 */
static void bdev_inode_switch_bdi(struct inode *inode,
			struct backing_dev_info *dst)
{
	struct backing_dev_info *old = inode->i_data.backing_dev_info;

	if (unlikely(dst == old))		/* deadlock avoidance */
		return;
	bdi_lock_two(&old->wb, &dst->wb);
	spin_lock(&inode->i_lock);
	inode->i_data.backing_dev_info = dst;
	if (inode->i_state & I_DIRTY)
		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
	spin_unlock(&inode->i_lock);
	spin_unlock(&old->wb.list_lock);
	spin_unlock(&dst->wb.list_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not.. */
void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(kill_bdev);

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	lru_add_drain_all();	/* make sure all lru add caches are flushed */
	invalidate_mapping_pages(mapping, 0, -1);
	/* 99% of the time, we don't need to flush the cleancache on the bdev.
	 * But, for the strange corners, lets be cautious
	 */
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is power of two
	 * and it's value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
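Typical use of sb_min_blocksize() is at mount time: a fill_super implementation asks for its preferred block size, the helper rounds up to the device's logical block size, and 0 signals failure (per sb_set_blocksize() above). A sketch under those assumptions; foo_fill_super() is a hypothetical name:

static int foo_fill_super(struct super_block *sb, void *data, int silent)
{
	int blocksize;

	/* at least 1024 bytes, rounded up to the device's logical block size */
	blocksize = sb_min_blocksize(sb, 1024);
	if (!blocksize)
		return -EINVAL;		/* device rejected the block size */

	/* ... read the on-disk superblock with sb_bread() etc. ... */
	return 0;
}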
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
				    nr_segs, blkdev_get_block, NULL, NULL, 0);
}

int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev -- lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can unfreeze the frozen filesystem actually when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
 * actually.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1) {
		/*
		 * We don't even need to grab a reference - the first call
		 * to freeze_bdev grab an active reference and only the last
		 * thaw_bdev drops it.
		 */
		sb = get_super(bdev);
		drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return sb;
	}

	sb = get_active_super(bdev);
	if (!sb)
		goto out;
	error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return ERR_PTR(error);
	}
	deactivate_super(sb);
 out:
	sync_blockdev(bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return sb;	/* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	if (!sb)
		goto out;

	error = thaw_super(sb);
	if (error) {
		bdev->bd_fsfreeze_count++;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return error;
	}
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return 0;
}
EXPORT_SYMBOL(thaw_bdev);
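A snapshotting caller pairs the two functions: freeze_bdev() returns the superblock (possibly NULL if none was found, or an ERR_PTR on failure) with writes quiesced, and thaw_bdev() releases the freeze. A hedged sketch of the calling pattern, not code from this commit:

struct super_block *sb;

sb = freeze_bdev(bdev);			/* quiesce writes; NULL if no sb found */
if (IS_ERR(sb))
	return PTR_ERR(sb);

/* ... take the snapshot while the filesystem is consistent ... */

thaw_bdev(bdev, sb);			/* matching unfreeze */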
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
	page_cache_release(page);

	return ret;
}

/*
 * private llseek:
 * for a block special file file->f_path.dentry->d_inode->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
-static loff_t block_llseek(struct file *file, loff_t offset, int origin)
+static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *bd_inode = file->f_mapping->host;
	loff_t size;
	loff_t retval;

	mutex_lock(&bd_inode->i_mutex);
	size = i_size_read(bd_inode);

	retval = -EINVAL;
-	switch (origin) {
+	switch (whence) {
		case SEEK_END:
			offset += size;
			break;
		case SEEK_CUR:
			offset += file->f_pos;
		case SEEK_SET:
			break;
		default:
			goto out;
	}
	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
		}
		retval = offset;
	}
out:
	mutex_unlock(&bd_inode->i_mutex);
	return retval;
}
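From userspace, the three whence values behave as usual on a block device, with SEEK_END resolving against the size computed by hand above rather than the (zero) inode size. A small illustrative program; /dev/sda is a placeholder device node:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/sda", O_RDONLY);	/* placeholder device node */
	if (fd < 0)
		return 1;

	off_t end = lseek(fd, 0, SEEK_END);	/* device size, per block_llseek */
	lseek(fd, 512, SEEK_SET);		/* absolute position */
	lseek(fd, -512, SEEK_CUR);		/* relative to current position */

	printf("device size: %lld bytes\n", (long long)end);
	close(fd);
	return 0;
}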
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *bd_inode = filp->f_mapping->host;
	struct block_device *bdev = I_BDEV(bd_inode);
	int error;

	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
	if (error)
		return error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	if (error == -EOPNOTSUPP)
		error = 0;

	return error;
}
EXPORT_SYMBOL(blkdev_fsync);

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	struct bdev_inode *bdi = BDEV_I(inode);

	kmem_cache_free(bdev_cachep, bdi);
}

static void bdev_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, bdev_i_callback);
}

static void init_once(void *foo)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	memset(bdev, 0, sizeof(*bdev));
	mutex_init(&bdev->bd_mutex);
	INIT_LIST_HEAD(&bdev->bd_inodes);
	INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
	INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
	inode_init_once(&ei->vfs_inode);
	/* Initialize mutex for freeze. */
	mutex_init(&bdev->bd_fsfreeze_mutex);
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

static void bdev_evict_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
	truncate_inode_pages(&inode->i_data, 0);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static struct dentry *bd_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.mount		= bd_mount,
	.kill_sb	= kill_anon_super,
};

static struct super_block *blockdev_superblock __read_mostly;

void __init bdev_cache_init(void)
{
	int err;
	static struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody will need really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(blockdev_superblock, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_super = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		inode->i_data.backing_dev_info = &default_backing_dev_info;
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);
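Callers pair bdget() with bdput() below: the former returns the (possibly freshly initialised) block_device for a dev_t and takes an inode reference, the latter drops it. A minimal sketch; foo_probe() is a hypothetical caller:

static int foo_probe(dev_t dev)
{
	struct block_device *bdev;

	bdev = bdget(dev);	/* takes a reference; NULL on allocation failure */
	if (!bdev)
		return -ENOMEM;

	/* ... inspect or open the device ... */

	bdput(bdev);		/* drop the reference taken by bdget() */
	return 0;
}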
/**
 * bdgrab -- Grab a reference to an already referenced block device
 * @bdev:	Block device to grab a reference to.
 */
struct block_device *bdgrab(struct block_device *bdev)
{
	ihold(bdev->bd_inode);
	return bdev;
}

long nr_blockdev_pages(void)
{
	struct block_device *bdev;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each_entry(bdev, &all_bdevs, bd_list) {
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);

static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		ihold(bdev->bd_inode);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional reference to bd_inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			ihold(bdev->bd_inode);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

static inline int sb_is_blkdev_sb(struct super_block *sb)
{
	return sb == blockdev_superblock;
}

/* Call when you free inode */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		if (!sb_is_blkdev_sb(inode->i_sb))
			bdev = inode->i_bdev;
		__bd_forget(inode);
	}
	spin_unlock(&bdev_lock);

	if (bdev)
		iput(bdev->bd_inode);
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
{
	if (bdev->bd_holder == holder)
		return true;	 /* already a holder */
	else if (bdev->bd_holder != NULL)
		return false;	 /* held by someone else */
	else if (bdev->bd_contains == bdev)
		return true;	 /* is a whole device which isn't held */

	else if (whole->bd_holder == bd_may_claim)
		return true;	 /* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	 /* is a partition of a held device */
	else
		return true;	 /* is a partition of an un-held device */
}
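The claim machinery here is internal to block_dev.c; external code gets the same exclusion by opening with FMODE_EXCL and a holder cookie, e.g. via blkdev_get_by_path() in kernels of this era. A hedged sketch of that usage, not part of this commit; the path, foo_open_exclusive(), and my_holder are placeholders:

static int foo_open_exclusive(void *my_holder)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;

	bdev = blkdev_get_by_path("/dev/sdb1", mode, my_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* other FMODE_EXCL opens with a different holder now fail with -EBUSY */

	blkdev_put(bdev, mode);		/* release the claim */
	return 0;
}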
/**
 * bd_prepare_to_claim - prepare to claim a block device
 * @bdev: block device of interest
 * @whole: the whole device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Prepare to claim @bdev.  This function fails if @bdev is already
 * claimed by another holder and waits if another claiming is in
 * progress.  This function doesn't actually claim.  On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
 * it multiple times.
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
static int bd_prepare_to_claim(struct block_device *bdev,
			       struct block_device *whole, void *holder)
{
retry:
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder))
		return -EBUSY;

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		spin_lock(&bdev_lock);
		goto retry;
	}

	/* yay, all mine */
	return 0;
}

/**
 * bd_start_claiming - start claiming a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * @bdev is about to be opened exclusively.  Check @bdev can be opened
 * exclusively and mark that an exclusive open is in progress.  Each
 * successful call to this function must be matched with a call to
 * either bd_finish_claiming() or bd_abort_claiming() (which do not
 * fail).
 *
 * This function is used to gain exclusive access to the block device
 * without actually causing other exclusive open attempts to fail. It
 * should be used when the open sequence itself requires exclusive
 * access but may subsequently fail.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to the block device containing @bdev on success, ERR_PTR()
 * value on failure.
 */
static struct block_device *bd_start_claiming(struct block_device *bdev,
					      void *holder)
{
	struct gendisk *disk;
	struct block_device *whole;
	int partno, err;

	might_sleep();

	/*
	 * @bdev might not have been initialized properly yet, look up
	 * and grab the outer block device the hard way.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		return ERR_PTR(-ENXIO);

	/*
	 * Normally, @bdev should equal what's returned from bdget_disk()
	 * if partno is 0; however, some drivers (floppy) use multiple
	 * bdev's for the same physical device and @bdev may be one of the
	 * aliases.  Keep @bdev if partno is 0.  This means claimer
	 * tracking is broken for those devices but it has always been that
751 * way. 751 * way.
752 */ 752 */
753 if (partno) 753 if (partno)
754 whole = bdget_disk(disk, 0); 754 whole = bdget_disk(disk, 0);
755 else 755 else
756 whole = bdgrab(bdev); 756 whole = bdgrab(bdev);
757 757
758 module_put(disk->fops->owner); 758 module_put(disk->fops->owner);
759 put_disk(disk); 759 put_disk(disk);
760 if (!whole) 760 if (!whole)
761 return ERR_PTR(-ENOMEM); 761 return ERR_PTR(-ENOMEM);
762 762
763 /* prepare to claim, if successful, mark claiming in progress */ 763 /* prepare to claim, if successful, mark claiming in progress */
764 spin_lock(&bdev_lock); 764 spin_lock(&bdev_lock);
765 765
766 err = bd_prepare_to_claim(bdev, whole, holder); 766 err = bd_prepare_to_claim(bdev, whole, holder);
767 if (err == 0) { 767 if (err == 0) {
768 whole->bd_claiming = holder; 768 whole->bd_claiming = holder;
769 spin_unlock(&bdev_lock); 769 spin_unlock(&bdev_lock);
770 return whole; 770 return whole;
771 } else { 771 } else {
772 spin_unlock(&bdev_lock); 772 spin_unlock(&bdev_lock);
773 bdput(whole); 773 bdput(whole);
774 return ERR_PTR(err); 774 return ERR_PTR(err);
775 } 775 }
776 } 776 }
777 777
778 #ifdef CONFIG_SYSFS 778 #ifdef CONFIG_SYSFS
779 struct bd_holder_disk { 779 struct bd_holder_disk {
780 struct list_head list; 780 struct list_head list;
781 struct gendisk *disk; 781 struct gendisk *disk;
782 int refcnt; 782 int refcnt;
783 }; 783 };
784 784
785 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, 785 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
786 struct gendisk *disk) 786 struct gendisk *disk)
787 { 787 {
788 struct bd_holder_disk *holder; 788 struct bd_holder_disk *holder;
789 789
790 list_for_each_entry(holder, &bdev->bd_holder_disks, list) 790 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
791 if (holder->disk == disk) 791 if (holder->disk == disk)
792 return holder; 792 return holder;
793 return NULL; 793 return NULL;
794 } 794 }
795 795
796 static int add_symlink(struct kobject *from, struct kobject *to) 796 static int add_symlink(struct kobject *from, struct kobject *to)
797 { 797 {
798 return sysfs_create_link(from, to, kobject_name(to)); 798 return sysfs_create_link(from, to, kobject_name(to));
799 } 799 }
800 800
801 static void del_symlink(struct kobject *from, struct kobject *to) 801 static void del_symlink(struct kobject *from, struct kobject *to)
802 { 802 {
803 sysfs_remove_link(from, kobject_name(to)); 803 sysfs_remove_link(from, kobject_name(to));
804 } 804 }
805 805
806 /** 806 /**
807 * bd_link_disk_holder - create symlinks between holding disk and slave bdev 807 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
808 * @bdev: the claimed slave bdev 808 * @bdev: the claimed slave bdev
809 * @disk: the holding disk 809 * @disk: the holding disk
810 * 810 *
811 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. 811 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
812 * 812 *
813 * This function creates the following sysfs symlinks. 813 * This function creates the following sysfs symlinks.
814 * 814 *
815 * - from "slaves" directory of the holder @disk to the claimed @bdev 815 * - from "slaves" directory of the holder @disk to the claimed @bdev
816 * - from "holders" directory of the @bdev to the holder @disk 816 * - from "holders" directory of the @bdev to the holder @disk
817 * 817 *
818 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is 818 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
819 * passed to bd_link_disk_holder(), then: 819 * passed to bd_link_disk_holder(), then:
820 * 820 *
821 * /sys/block/dm-0/slaves/sda --> /sys/block/sda 821 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
822 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 822 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
823 * 823 *
824 * The caller must have claimed @bdev before calling this function and 824 * The caller must have claimed @bdev before calling this function and
825 * ensure that both @bdev and @disk are valid during the creation and 825 * ensure that both @bdev and @disk are valid during the creation and
826 * lifetime of these symlinks. 826 * lifetime of these symlinks.
827 * 827 *
828 * CONTEXT: 828 * CONTEXT:
829 * Might sleep. 829 * Might sleep.
830 * 830 *
831 * RETURNS: 831 * RETURNS:
832 * 0 on success, -errno on failure. 832 * 0 on success, -errno on failure.
833 */ 833 */
834 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) 834 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
835 { 835 {
836 struct bd_holder_disk *holder; 836 struct bd_holder_disk *holder;
837 int ret = 0; 837 int ret = 0;
838 838
839 mutex_lock(&bdev->bd_mutex); 839 mutex_lock(&bdev->bd_mutex);
840 840
841 WARN_ON_ONCE(!bdev->bd_holder); 841 WARN_ON_ONCE(!bdev->bd_holder);
842 842
843 /* FIXME: remove the following once add_disk() handles errors */ 843 /* FIXME: remove the following once add_disk() handles errors */
844 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) 844 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
845 goto out_unlock; 845 goto out_unlock;
846 846
847 holder = bd_find_holder_disk(bdev, disk); 847 holder = bd_find_holder_disk(bdev, disk);
848 if (holder) { 848 if (holder) {
849 holder->refcnt++; 849 holder->refcnt++;
850 goto out_unlock; 850 goto out_unlock;
851 } 851 }
852 852
853 holder = kzalloc(sizeof(*holder), GFP_KERNEL); 853 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
854 if (!holder) { 854 if (!holder) {
855 ret = -ENOMEM; 855 ret = -ENOMEM;
856 goto out_unlock; 856 goto out_unlock;
857 } 857 }
858 858
859 INIT_LIST_HEAD(&holder->list); 859 INIT_LIST_HEAD(&holder->list);
860 holder->disk = disk; 860 holder->disk = disk;
861 holder->refcnt = 1; 861 holder->refcnt = 1;
862 862
863 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 863 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
864 if (ret) 864 if (ret)
865 goto out_free; 865 goto out_free;
866 866
867 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 867 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
868 if (ret) 868 if (ret)
869 goto out_del; 869 goto out_del;
870 /* 870 /*
871 * bdev could be deleted beneath us which would implicitly destroy 871 * bdev could be deleted beneath us which would implicitly destroy
872 * the holder directory. Hold on to it. 872 * the holder directory. Hold on to it.
873 */ 873 */
874 kobject_get(bdev->bd_part->holder_dir); 874 kobject_get(bdev->bd_part->holder_dir);
875 875
876 list_add(&holder->list, &bdev->bd_holder_disks); 876 list_add(&holder->list, &bdev->bd_holder_disks);
877 goto out_unlock; 877 goto out_unlock;
878 878
879 out_del: 879 out_del:
880 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 880 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
881 out_free: 881 out_free:
882 kfree(holder); 882 kfree(holder);
883 out_unlock: 883 out_unlock:
884 mutex_unlock(&bdev->bd_mutex); 884 mutex_unlock(&bdev->bd_mutex);
885 return ret; 885 return ret;
886 } 886 }
887 EXPORT_SYMBOL_GPL(bd_link_disk_holder); 887 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
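For context, a minimal sketch of how a stacking driver (md/dm style) might pair an exclusive open with this holder API; my_add_member() and my_gendisk are illustrative names, not kernel interfaces:

	/*
	 * Hedged sketch: claim a member device exclusively, then publish
	 * the holder relationship in sysfs.  Names are invented for
	 * illustration; the calls themselves match this tree's API.
	 */
	static struct block_device *my_add_member(const char *path,
						  struct gendisk *my_gendisk)
	{
		struct block_device *bdev;
		int ret;

		bdev = blkdev_get_by_path(path,
					  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
					  my_gendisk);	/* disk doubles as holder */
		if (IS_ERR(bdev))
			return bdev;

		ret = bd_link_disk_holder(bdev, my_gendisk);
		if (ret) {
			blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
			return ERR_PTR(ret);
		}
		return bdev;
	}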
888 888
889 /** 889 /**
890 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() 890 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
891 * @bdev: the claimed slave bdev 891 * @bdev: the claimed slave bdev
892 * @disk: the holding disk 892 * @disk: the holding disk
893 * 893 *
894 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. 894 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
895 * 895 *
896 * CONTEXT: 896 * CONTEXT:
897 * Might sleep. 897 * Might sleep.
898 */ 898 */
899 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) 899 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
900 { 900 {
901 struct bd_holder_disk *holder; 901 struct bd_holder_disk *holder;
902 902
903 mutex_lock(&bdev->bd_mutex); 903 mutex_lock(&bdev->bd_mutex);
904 904
905 holder = bd_find_holder_disk(bdev, disk); 905 holder = bd_find_holder_disk(bdev, disk);
906 906
907 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { 907 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
908 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 908 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
909 del_symlink(bdev->bd_part->holder_dir, 909 del_symlink(bdev->bd_part->holder_dir,
910 &disk_to_dev(disk)->kobj); 910 &disk_to_dev(disk)->kobj);
911 kobject_put(bdev->bd_part->holder_dir); 911 kobject_put(bdev->bd_part->holder_dir);
912 list_del_init(&holder->list); 912 list_del_init(&holder->list);
913 kfree(holder); 913 kfree(holder);
914 } 914 }
915 915
916 mutex_unlock(&bdev->bd_mutex); 916 mutex_unlock(&bdev->bd_mutex);
917 } 917 }
918 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); 918 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
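The matching teardown for the sketch above, again with invented names, unlinks the sysfs relationship before dropping the exclusive reference:

	static void my_remove_member(struct block_device *bdev,
				     struct gendisk *my_gendisk)
	{
		bd_unlink_disk_holder(bdev, my_gendisk);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}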
919 #endif 919 #endif
920 920
921 /** 921 /**
922 * flush_disk - invalidates all buffer-cache entries on a disk 922 * flush_disk - invalidates all buffer-cache entries on a disk
923 * 923 *
924 * @bdev: struct block device to be flushed 924 * @bdev: struct block device to be flushed
925 * @kill_dirty: flag to guide handling of dirty inodes 925 * @kill_dirty: flag to guide handling of dirty inodes
926 * 926 *
927 * Invalidates all buffer-cache entries on a disk. It should be called 927 * Invalidates all buffer-cache entries on a disk. It should be called
928 * when a disk has been changed -- either by a media change or online 928 * when a disk has been changed -- either by a media change or online
929 * resize. 929 * resize.
930 */ 930 */
931 static void flush_disk(struct block_device *bdev, bool kill_dirty) 931 static void flush_disk(struct block_device *bdev, bool kill_dirty)
932 { 932 {
933 if (__invalidate_device(bdev, kill_dirty)) { 933 if (__invalidate_device(bdev, kill_dirty)) {
934 char name[BDEVNAME_SIZE] = ""; 934 char name[BDEVNAME_SIZE] = "";
935 935
936 if (bdev->bd_disk) 936 if (bdev->bd_disk)
937 disk_name(bdev->bd_disk, 0, name); 937 disk_name(bdev->bd_disk, 0, name);
938 printk(KERN_WARNING "VFS: busy inodes on changed media or " 938 printk(KERN_WARNING "VFS: busy inodes on changed media or "
939 "resized disk %s\n", name); 939 "resized disk %s\n", name);
940 } 940 }
941 941
942 if (!bdev->bd_disk) 942 if (!bdev->bd_disk)
943 return; 943 return;
944 if (disk_part_scan_enabled(bdev->bd_disk)) 944 if (disk_part_scan_enabled(bdev->bd_disk))
945 bdev->bd_invalidated = 1; 945 bdev->bd_invalidated = 1;
946 } 946 }
947 947
948 /** 948 /**
949 * check_disk_size_change - checks for disk size change and adjusts bdev size. 949 * check_disk_size_change - checks for disk size change and adjusts bdev size.
950 * @disk: struct gendisk to check 950 * @disk: struct gendisk to check
951 * @bdev: struct bdev to adjust. 951 * @bdev: struct bdev to adjust.
952 * 952 *
953 * This routine checks whether the bdev size matches the disk size 953 * This routine checks whether the bdev size matches the disk size
954 * and adjusts the bdev size if it differs. 954 * and adjusts the bdev size if it differs.
955 */ 955 */
956 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) 956 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
957 { 957 {
958 loff_t disk_size, bdev_size; 958 loff_t disk_size, bdev_size;
959 959
960 disk_size = (loff_t)get_capacity(disk) << 9; 960 disk_size = (loff_t)get_capacity(disk) << 9;
961 bdev_size = i_size_read(bdev->bd_inode); 961 bdev_size = i_size_read(bdev->bd_inode);
962 if (disk_size != bdev_size) { 962 if (disk_size != bdev_size) {
963 char name[BDEVNAME_SIZE]; 963 char name[BDEVNAME_SIZE];
964 964
965 disk_name(disk, 0, name); 965 disk_name(disk, 0, name);
966 printk(KERN_INFO 966 printk(KERN_INFO
967 "%s: detected capacity change from %lld to %lld\n", 967 "%s: detected capacity change from %lld to %lld\n",
968 name, bdev_size, disk_size); 968 name, bdev_size, disk_size);
969 i_size_write(bdev->bd_inode, disk_size); 969 i_size_write(bdev->bd_inode, disk_size);
970 flush_disk(bdev, false); 970 flush_disk(bdev, false);
971 } 971 }
972 } 972 }
973 EXPORT_SYMBOL(check_disk_size_change); 973 EXPORT_SYMBOL(check_disk_size_change);
974 974
975 /** 975 /**
976 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back 976 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
977 * @disk: struct gendisk to be revalidated 977 * @disk: struct gendisk to be revalidated
978 * 978 *
979 * This routine is a wrapper for the lower-level driver's revalidate_disk 979 * This routine is a wrapper for the lower-level driver's revalidate_disk
980 * call-back. It is used to perform the common pre- and post-operations needed 980 * call-back. It is used to perform the common pre- and post-operations needed
981 * for all revalidate_disk operations. 981 * for all revalidate_disk operations.
982 */ 982 */
983 int revalidate_disk(struct gendisk *disk) 983 int revalidate_disk(struct gendisk *disk)
984 { 984 {
985 struct block_device *bdev; 985 struct block_device *bdev;
986 int ret = 0; 986 int ret = 0;
987 987
988 if (disk->fops->revalidate_disk) 988 if (disk->fops->revalidate_disk)
989 ret = disk->fops->revalidate_disk(disk); 989 ret = disk->fops->revalidate_disk(disk);
990 990
991 bdev = bdget_disk(disk, 0); 991 bdev = bdget_disk(disk, 0);
992 if (!bdev) 992 if (!bdev)
993 return ret; 993 return ret;
994 994
995 mutex_lock(&bdev->bd_mutex); 995 mutex_lock(&bdev->bd_mutex);
996 check_disk_size_change(disk, bdev); 996 check_disk_size_change(disk, bdev);
997 mutex_unlock(&bdev->bd_mutex); 997 mutex_unlock(&bdev->bd_mutex);
998 bdput(bdev); 998 bdput(bdev);
999 return ret; 999 return ret;
1000 } 1000 }
1001 EXPORT_SYMBOL(revalidate_disk); 1001 EXPORT_SYMBOL(revalidate_disk);
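A hedged illustration of how these pieces are typically driven: a driver that learns its backing store grew records the new capacity in 512-byte sectors (hence the << 9 in check_disk_size_change()) and then revalidates. my_disk_resized() is an invented name:

	static void my_disk_resized(struct gendisk *disk, sector_t new_sectors)
	{
		set_capacity(disk, new_sectors);	/* 512-byte units */
		revalidate_disk(disk);	/* ends up in check_disk_size_change() */
	}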
1002 1002
1003 /* 1003 /*
1004 * This routine checks whether a removable media has been changed, 1004 * This routine checks whether a removable media has been changed,
1005 * and invalidates all buffer-cache-entries in that case. This 1005 * and invalidates all buffer-cache-entries in that case. This
1006 * is a relatively slow routine, so we have to try to minimize using 1006 * is a relatively slow routine, so we have to try to minimize using
1007 * it. Thus it is called only upon a 'mount' or 'open'. This 1007 * it. Thus it is called only upon a 'mount' or 'open'. This
1008 * is the best way of combining speed and utility, I think. 1008 * is the best way of combining speed and utility, I think.
1009 * People changing diskettes in the middle of an operation deserve 1009 * People changing diskettes in the middle of an operation deserve
1010 * to lose :-) 1010 * to lose :-)
1011 */ 1011 */
1012 int check_disk_change(struct block_device *bdev) 1012 int check_disk_change(struct block_device *bdev)
1013 { 1013 {
1014 struct gendisk *disk = bdev->bd_disk; 1014 struct gendisk *disk = bdev->bd_disk;
1015 const struct block_device_operations *bdops = disk->fops; 1015 const struct block_device_operations *bdops = disk->fops;
1016 unsigned int events; 1016 unsigned int events;
1017 1017
1018 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | 1018 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1019 DISK_EVENT_EJECT_REQUEST); 1019 DISK_EVENT_EJECT_REQUEST);
1020 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1020 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1021 return 0; 1021 return 0;
1022 1022
1023 flush_disk(bdev, true); 1023 flush_disk(bdev, true);
1024 if (bdops->revalidate_disk) 1024 if (bdops->revalidate_disk)
1025 bdops->revalidate_disk(bdev->bd_disk); 1025 bdops->revalidate_disk(bdev->bd_disk);
1026 return 1; 1026 return 1;
1027 } 1027 }
1028 1028
1029 EXPORT_SYMBOL(check_disk_change); 1029 EXPORT_SYMBOL(check_disk_change);
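A minimal sketch of the usual call site, assuming a removable-media driver; my_open() is illustrative, with the block_device_operations->open signature used in this tree:

	static int my_open(struct block_device *bdev, fmode_t mode)
	{
		check_disk_change(bdev);	/* may flush and set bd_invalidated */
		return 0;
	}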
1030 1030
1031 void bd_set_size(struct block_device *bdev, loff_t size) 1031 void bd_set_size(struct block_device *bdev, loff_t size)
1032 { 1032 {
1033 unsigned bsize = bdev_logical_block_size(bdev); 1033 unsigned bsize = bdev_logical_block_size(bdev);
1034 1034
1035 bdev->bd_inode->i_size = size; 1035 bdev->bd_inode->i_size = size;
1036 while (bsize < PAGE_CACHE_SIZE) { 1036 while (bsize < PAGE_CACHE_SIZE) {
1037 if (size & bsize) 1037 if (size & bsize)
1038 break; 1038 break;
1039 bsize <<= 1; 1039 bsize <<= 1;
1040 } 1040 }
1041 bdev->bd_block_size = bsize; 1041 bdev->bd_block_size = bsize;
1042 bdev->bd_inode->i_blkbits = blksize_bits(bsize); 1042 bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1043 } 1043 }
1044 EXPORT_SYMBOL(bd_set_size); 1044 EXPORT_SYMBOL(bd_set_size);
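The loop above selects the largest power-of-two block size that still divides the device size, capped at the page size. A stand-alone rendering of the same computation (assuming 4096-byte pages), with a worked example:

	#include <stdio.h>

	/* Userspace sketch of bd_set_size()'s block-size selection. */
	static unsigned pick_block_size(unsigned long long size, unsigned bsize)
	{
		while (bsize < 4096) {		/* PAGE_CACHE_SIZE stand-in */
			if (size & bsize)	/* bsize no longer divides size */
				break;
			bsize <<= 1;
		}
		return bsize;
	}

	int main(void)
	{
		/* 1954 sectors * 512 bytes = 1000448 bytes -> prints 1024 */
		printf("%u\n", pick_block_size(1954ULL * 512, 512));
		return 0;
	}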
1045 1045
1046 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); 1046 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1047 1047
1048 /* 1048 /*
1049 * bd_mutex locking: 1049 * bd_mutex locking:
1050 * 1050 *
1051 * mutex_lock(part->bd_mutex) 1051 * mutex_lock(part->bd_mutex)
1052 * mutex_lock_nested(whole->bd_mutex, 1) 1052 * mutex_lock_nested(whole->bd_mutex, 1)
1053 */ 1053 */
1054 1054
1055 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) 1055 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1056 { 1056 {
1057 struct gendisk *disk; 1057 struct gendisk *disk;
1058 struct module *owner; 1058 struct module *owner;
1059 int ret; 1059 int ret;
1060 int partno; 1060 int partno;
1061 int perm = 0; 1061 int perm = 0;
1062 1062
1063 if (mode & FMODE_READ) 1063 if (mode & FMODE_READ)
1064 perm |= MAY_READ; 1064 perm |= MAY_READ;
1065 if (mode & FMODE_WRITE) 1065 if (mode & FMODE_WRITE)
1066 perm |= MAY_WRITE; 1066 perm |= MAY_WRITE;
1067 /* 1067 /*
1068 * hooks: /n/, see "layering violations". 1068 * hooks: /n/, see "layering violations".
1069 */ 1069 */
1070 if (!for_part) { 1070 if (!for_part) {
1071 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1071 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1072 if (ret != 0) { 1072 if (ret != 0) {
1073 bdput(bdev); 1073 bdput(bdev);
1074 return ret; 1074 return ret;
1075 } 1075 }
1076 } 1076 }
1077 1077
1078 restart: 1078 restart:
1079 1079
1080 ret = -ENXIO; 1080 ret = -ENXIO;
1081 disk = get_gendisk(bdev->bd_dev, &partno); 1081 disk = get_gendisk(bdev->bd_dev, &partno);
1082 if (!disk) 1082 if (!disk)
1083 goto out; 1083 goto out;
1084 owner = disk->fops->owner; 1084 owner = disk->fops->owner;
1085 1085
1086 disk_block_events(disk); 1086 disk_block_events(disk);
1087 mutex_lock_nested(&bdev->bd_mutex, for_part); 1087 mutex_lock_nested(&bdev->bd_mutex, for_part);
1088 if (!bdev->bd_openers) { 1088 if (!bdev->bd_openers) {
1089 bdev->bd_disk = disk; 1089 bdev->bd_disk = disk;
1090 bdev->bd_queue = disk->queue; 1090 bdev->bd_queue = disk->queue;
1091 bdev->bd_contains = bdev; 1091 bdev->bd_contains = bdev;
1092 if (!partno) { 1092 if (!partno) {
1093 struct backing_dev_info *bdi; 1093 struct backing_dev_info *bdi;
1094 1094
1095 ret = -ENXIO; 1095 ret = -ENXIO;
1096 bdev->bd_part = disk_get_part(disk, partno); 1096 bdev->bd_part = disk_get_part(disk, partno);
1097 if (!bdev->bd_part) 1097 if (!bdev->bd_part)
1098 goto out_clear; 1098 goto out_clear;
1099 1099
1100 ret = 0; 1100 ret = 0;
1101 if (disk->fops->open) { 1101 if (disk->fops->open) {
1102 ret = disk->fops->open(bdev, mode); 1102 ret = disk->fops->open(bdev, mode);
1103 if (ret == -ERESTARTSYS) { 1103 if (ret == -ERESTARTSYS) {
1104 /* Lost a race with 'disk' being 1104 /* Lost a race with 'disk' being
1105 * deleted, try again. 1105 * deleted, try again.
1106 * See md.c 1106 * See md.c
1107 */ 1107 */
1108 disk_put_part(bdev->bd_part); 1108 disk_put_part(bdev->bd_part);
1109 bdev->bd_part = NULL; 1109 bdev->bd_part = NULL;
1110 bdev->bd_disk = NULL; 1110 bdev->bd_disk = NULL;
1111 bdev->bd_queue = NULL; 1111 bdev->bd_queue = NULL;
1112 mutex_unlock(&bdev->bd_mutex); 1112 mutex_unlock(&bdev->bd_mutex);
1113 disk_unblock_events(disk); 1113 disk_unblock_events(disk);
1114 put_disk(disk); 1114 put_disk(disk);
1115 module_put(owner); 1115 module_put(owner);
1116 goto restart; 1116 goto restart;
1117 } 1117 }
1118 } 1118 }
1119 1119
1120 if (!ret && !bdev->bd_openers) { 1120 if (!ret && !bdev->bd_openers) {
1121 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1121 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1122 bdi = blk_get_backing_dev_info(bdev); 1122 bdi = blk_get_backing_dev_info(bdev);
1123 if (bdi == NULL) 1123 if (bdi == NULL)
1124 bdi = &default_backing_dev_info; 1124 bdi = &default_backing_dev_info;
1125 bdev_inode_switch_bdi(bdev->bd_inode, bdi); 1125 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1126 } 1126 }
1127 1127
1128 /* 1128 /*
1129 * If the device is invalidated, rescan partition 1129 * If the device is invalidated, rescan partition
1130 * if open succeeded or failed with -ENOMEDIUM. 1130 * if open succeeded or failed with -ENOMEDIUM.
1131 * The latter is necessary to prevent ghost 1131 * The latter is necessary to prevent ghost
1132 * partitions on a removed medium. 1132 * partitions on a removed medium.
1133 */ 1133 */
1134 if (bdev->bd_invalidated) { 1134 if (bdev->bd_invalidated) {
1135 if (!ret) 1135 if (!ret)
1136 rescan_partitions(disk, bdev); 1136 rescan_partitions(disk, bdev);
1137 else if (ret == -ENOMEDIUM) 1137 else if (ret == -ENOMEDIUM)
1138 invalidate_partitions(disk, bdev); 1138 invalidate_partitions(disk, bdev);
1139 } 1139 }
1140 if (ret) 1140 if (ret)
1141 goto out_clear; 1141 goto out_clear;
1142 } else { 1142 } else {
1143 struct block_device *whole; 1143 struct block_device *whole;
1144 whole = bdget_disk(disk, 0); 1144 whole = bdget_disk(disk, 0);
1145 ret = -ENOMEM; 1145 ret = -ENOMEM;
1146 if (!whole) 1146 if (!whole)
1147 goto out_clear; 1147 goto out_clear;
1148 BUG_ON(for_part); 1148 BUG_ON(for_part);
1149 ret = __blkdev_get(whole, mode, 1); 1149 ret = __blkdev_get(whole, mode, 1);
1150 if (ret) 1150 if (ret)
1151 goto out_clear; 1151 goto out_clear;
1152 bdev->bd_contains = whole; 1152 bdev->bd_contains = whole;
1153 bdev_inode_switch_bdi(bdev->bd_inode, 1153 bdev_inode_switch_bdi(bdev->bd_inode,
1154 whole->bd_inode->i_data.backing_dev_info); 1154 whole->bd_inode->i_data.backing_dev_info);
1155 bdev->bd_part = disk_get_part(disk, partno); 1155 bdev->bd_part = disk_get_part(disk, partno);
1156 if (!(disk->flags & GENHD_FL_UP) || 1156 if (!(disk->flags & GENHD_FL_UP) ||
1157 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1157 !bdev->bd_part || !bdev->bd_part->nr_sects) {
1158 ret = -ENXIO; 1158 ret = -ENXIO;
1159 goto out_clear; 1159 goto out_clear;
1160 } 1160 }
1161 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1161 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1162 } 1162 }
1163 } else { 1163 } else {
1164 if (bdev->bd_contains == bdev) { 1164 if (bdev->bd_contains == bdev) {
1165 ret = 0; 1165 ret = 0;
1166 if (bdev->bd_disk->fops->open) 1166 if (bdev->bd_disk->fops->open)
1167 ret = bdev->bd_disk->fops->open(bdev, mode); 1167 ret = bdev->bd_disk->fops->open(bdev, mode);
1168 /* the same as first opener case, read comment there */ 1168 /* the same as first opener case, read comment there */
1169 if (bdev->bd_invalidated) { 1169 if (bdev->bd_invalidated) {
1170 if (!ret) 1170 if (!ret)
1171 rescan_partitions(bdev->bd_disk, bdev); 1171 rescan_partitions(bdev->bd_disk, bdev);
1172 else if (ret == -ENOMEDIUM) 1172 else if (ret == -ENOMEDIUM)
1173 invalidate_partitions(bdev->bd_disk, bdev); 1173 invalidate_partitions(bdev->bd_disk, bdev);
1174 } 1174 }
1175 if (ret) 1175 if (ret)
1176 goto out_unlock_bdev; 1176 goto out_unlock_bdev;
1177 } 1177 }
1178 /* only one opener holds refs to the module and disk */ 1178 /* only one opener holds refs to the module and disk */
1179 put_disk(disk); 1179 put_disk(disk);
1180 module_put(owner); 1180 module_put(owner);
1181 } 1181 }
1182 bdev->bd_openers++; 1182 bdev->bd_openers++;
1183 if (for_part) 1183 if (for_part)
1184 bdev->bd_part_count++; 1184 bdev->bd_part_count++;
1185 mutex_unlock(&bdev->bd_mutex); 1185 mutex_unlock(&bdev->bd_mutex);
1186 disk_unblock_events(disk); 1186 disk_unblock_events(disk);
1187 return 0; 1187 return 0;
1188 1188
1189 out_clear: 1189 out_clear:
1190 disk_put_part(bdev->bd_part); 1190 disk_put_part(bdev->bd_part);
1191 bdev->bd_disk = NULL; 1191 bdev->bd_disk = NULL;
1192 bdev->bd_part = NULL; 1192 bdev->bd_part = NULL;
1193 bdev->bd_queue = NULL; 1193 bdev->bd_queue = NULL;
1194 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); 1194 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1195 if (bdev != bdev->bd_contains) 1195 if (bdev != bdev->bd_contains)
1196 __blkdev_put(bdev->bd_contains, mode, 1); 1196 __blkdev_put(bdev->bd_contains, mode, 1);
1197 bdev->bd_contains = NULL; 1197 bdev->bd_contains = NULL;
1198 out_unlock_bdev: 1198 out_unlock_bdev:
1199 mutex_unlock(&bdev->bd_mutex); 1199 mutex_unlock(&bdev->bd_mutex);
1200 disk_unblock_events(disk); 1200 disk_unblock_events(disk);
1201 put_disk(disk); 1201 put_disk(disk);
1202 module_put(owner); 1202 module_put(owner);
1203 out: 1203 out:
1204 bdput(bdev); 1204 bdput(bdev);
1205 1205
1206 return ret; 1206 return ret;
1207 } 1207 }
1208 1208
1209 /** 1209 /**
1210 * blkdev_get - open a block device 1210 * blkdev_get - open a block device
1211 * @bdev: block_device to open 1211 * @bdev: block_device to open
1212 * @mode: FMODE_* mask 1212 * @mode: FMODE_* mask
1213 * @holder: exclusive holder identifier 1213 * @holder: exclusive holder identifier
1214 * 1214 *
1215 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is 1215 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1216 * open with exclusive access. Specifying %FMODE_EXCL with %NULL 1216 * open with exclusive access. Specifying %FMODE_EXCL with %NULL
1217 * @holder is invalid. Exclusive opens may nest for the same @holder. 1217 * @holder is invalid. Exclusive opens may nest for the same @holder.
1218 * 1218 *
1219 * On success, the reference count of @bdev is unchanged. On failure, 1219 * On success, the reference count of @bdev is unchanged. On failure,
1220 * @bdev is put. 1220 * @bdev is put.
1221 * 1221 *
1222 * CONTEXT: 1222 * CONTEXT:
1223 * Might sleep. 1223 * Might sleep.
1224 * 1224 *
1225 * RETURNS: 1225 * RETURNS:
1226 * 0 on success, -errno on failure. 1226 * 0 on success, -errno on failure.
1227 */ 1227 */
1228 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) 1228 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1229 { 1229 {
1230 struct block_device *whole = NULL; 1230 struct block_device *whole = NULL;
1231 int res; 1231 int res;
1232 1232
1233 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); 1233 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1234 1234
1235 if ((mode & FMODE_EXCL) && holder) { 1235 if ((mode & FMODE_EXCL) && holder) {
1236 whole = bd_start_claiming(bdev, holder); 1236 whole = bd_start_claiming(bdev, holder);
1237 if (IS_ERR(whole)) { 1237 if (IS_ERR(whole)) {
1238 bdput(bdev); 1238 bdput(bdev);
1239 return PTR_ERR(whole); 1239 return PTR_ERR(whole);
1240 } 1240 }
1241 } 1241 }
1242 1242
1243 res = __blkdev_get(bdev, mode, 0); 1243 res = __blkdev_get(bdev, mode, 0);
1244 1244
1245 if (whole) { 1245 if (whole) {
1246 struct gendisk *disk = whole->bd_disk; 1246 struct gendisk *disk = whole->bd_disk;
1247 1247
1248 /* finish claiming */ 1248 /* finish claiming */
1249 mutex_lock(&bdev->bd_mutex); 1249 mutex_lock(&bdev->bd_mutex);
1250 spin_lock(&bdev_lock); 1250 spin_lock(&bdev_lock);
1251 1251
1252 if (!res) { 1252 if (!res) {
1253 BUG_ON(!bd_may_claim(bdev, whole, holder)); 1253 BUG_ON(!bd_may_claim(bdev, whole, holder));
1254 /* 1254 /*
1255 * Note that for a whole device bd_holders 1255 * Note that for a whole device bd_holders
1256 * will be incremented twice, and bd_holder 1256 * will be incremented twice, and bd_holder
1257 * will be set to bd_may_claim before being 1257 * will be set to bd_may_claim before being
1258 * set to holder 1258 * set to holder
1259 */ 1259 */
1260 whole->bd_holders++; 1260 whole->bd_holders++;
1261 whole->bd_holder = bd_may_claim; 1261 whole->bd_holder = bd_may_claim;
1262 bdev->bd_holders++; 1262 bdev->bd_holders++;
1263 bdev->bd_holder = holder; 1263 bdev->bd_holder = holder;
1264 } 1264 }
1265 1265
1266 /* tell others that we're done */ 1266 /* tell others that we're done */
1267 BUG_ON(whole->bd_claiming != holder); 1267 BUG_ON(whole->bd_claiming != holder);
1268 whole->bd_claiming = NULL; 1268 whole->bd_claiming = NULL;
1269 wake_up_bit(&whole->bd_claiming, 0); 1269 wake_up_bit(&whole->bd_claiming, 0);
1270 1270
1271 spin_unlock(&bdev_lock); 1271 spin_unlock(&bdev_lock);
1272 1272
1273 /* 1273 /*
1274 * Block event polling for write claims if requested. Any 1274 * Block event polling for write claims if requested. Any
1275 * write holder makes the write_holder state stick until 1275 * write holder makes the write_holder state stick until
1276 * all are released. This is good enough and tracking 1276 * all are released. This is good enough and tracking
1277 * individual writeable references is too fragile given the 1277 * individual writeable references is too fragile given the
1278 * way @mode is used in blkdev_get/put(). 1278 * way @mode is used in blkdev_get/put().
1279 */ 1279 */
1280 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && 1280 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1281 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { 1281 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
1282 bdev->bd_write_holder = true; 1282 bdev->bd_write_holder = true;
1283 disk_block_events(disk); 1283 disk_block_events(disk);
1284 } 1284 }
1285 1285
1286 mutex_unlock(&bdev->bd_mutex); 1286 mutex_unlock(&bdev->bd_mutex);
1287 bdput(whole); 1287 bdput(whole);
1288 } 1288 }
1289 1289
1290 return res; 1290 return res;
1291 } 1291 }
1292 EXPORT_SYMBOL(blkdev_get); 1292 EXPORT_SYMBOL(blkdev_get);
1293 1293
1294 /** 1294 /**
1295 * blkdev_get_by_path - open a block device by name 1295 * blkdev_get_by_path - open a block device by name
1296 * @path: path to the block device to open 1296 * @path: path to the block device to open
1297 * @mode: FMODE_* mask 1297 * @mode: FMODE_* mask
1298 * @holder: exclusive holder identifier 1298 * @holder: exclusive holder identifier
1299 * 1299 *
1300 * Open the blockdevice described by the device file at @path. @mode 1300 * Open the blockdevice described by the device file at @path. @mode
1301 * and @holder are identical to blkdev_get(). 1301 * and @holder are identical to blkdev_get().
1302 * 1302 *
1303 * On success, the returned block_device has reference count of one. 1303 * On success, the returned block_device has reference count of one.
1304 * 1304 *
1305 * CONTEXT: 1305 * CONTEXT:
1306 * Might sleep. 1306 * Might sleep.
1307 * 1307 *
1308 * RETURNS: 1308 * RETURNS:
1309 * Pointer to block_device on success, ERR_PTR(-errno) on failure. 1309 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1310 */ 1310 */
1311 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, 1311 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1312 void *holder) 1312 void *holder)
1313 { 1313 {
1314 struct block_device *bdev; 1314 struct block_device *bdev;
1315 int err; 1315 int err;
1316 1316
1317 bdev = lookup_bdev(path); 1317 bdev = lookup_bdev(path);
1318 if (IS_ERR(bdev)) 1318 if (IS_ERR(bdev))
1319 return bdev; 1319 return bdev;
1320 1320
1321 err = blkdev_get(bdev, mode, holder); 1321 err = blkdev_get(bdev, mode, holder);
1322 if (err) 1322 if (err)
1323 return ERR_PTR(err); 1323 return ERR_PTR(err);
1324 1324
1325 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { 1325 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1326 blkdev_put(bdev, mode); 1326 blkdev_put(bdev, mode);
1327 return ERR_PTR(-EACCES); 1327 return ERR_PTR(-EACCES);
1328 } 1328 }
1329 1329
1330 return bdev; 1330 return bdev;
1331 } 1331 }
1332 EXPORT_SYMBOL(blkdev_get_by_path); 1332 EXPORT_SYMBOL(blkdev_get_by_path);
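A hedged usage sketch, close to what a mount path does: open exclusively by path with a unique holder token, and pair it with a blkdev_put() using the same mode. my_fs_type and my_mount_like_open() are illustrative:

	static int my_mount_like_open(void)
	{
		struct block_device *bdev;

		bdev = blkdev_get_by_path("/dev/sda1",
					  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
					  &my_fs_type);	/* holder token */
		if (IS_ERR(bdev))
			return PTR_ERR(bdev);
		/* ... exclusive access to the device ... */
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
		return 0;
	}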
1333 1333
1334 /** 1334 /**
1335 * blkdev_get_by_dev - open a block device by device number 1335 * blkdev_get_by_dev - open a block device by device number
1336 * @dev: device number of block device to open 1336 * @dev: device number of block device to open
1337 * @mode: FMODE_* mask 1337 * @mode: FMODE_* mask
1338 * @holder: exclusive holder identifier 1338 * @holder: exclusive holder identifier
1339 * 1339 *
1340 * Open the blockdevice described by device number @dev. @mode and 1340 * Open the blockdevice described by device number @dev. @mode and
1341 * @holder are identical to blkdev_get(). 1341 * @holder are identical to blkdev_get().
1342 * 1342 *
1343 * Use it ONLY if you really do not have anything better - i.e. when 1343 * Use it ONLY if you really do not have anything better - i.e. when
1344 * you are behind a truly sucky interface and all you are given is a 1344 * you are behind a truly sucky interface and all you are given is a
1345 * device number. _Never_ to be used for internal purposes. If you 1345 * device number. _Never_ to be used for internal purposes. If you
1346 * ever need it - reconsider your API. 1346 * ever need it - reconsider your API.
1347 * 1347 *
1348 * On success, the returned block_device has reference count of one. 1348 * On success, the returned block_device has reference count of one.
1349 * 1349 *
1350 * CONTEXT: 1350 * CONTEXT:
1351 * Might sleep. 1351 * Might sleep.
1352 * 1352 *
1353 * RETURNS: 1353 * RETURNS:
1354 * Pointer to block_device on success, ERR_PTR(-errno) on failure. 1354 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1355 */ 1355 */
1356 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) 1356 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1357 { 1357 {
1358 struct block_device *bdev; 1358 struct block_device *bdev;
1359 int err; 1359 int err;
1360 1360
1361 bdev = bdget(dev); 1361 bdev = bdget(dev);
1362 if (!bdev) 1362 if (!bdev)
1363 return ERR_PTR(-ENOMEM); 1363 return ERR_PTR(-ENOMEM);
1364 1364
1365 err = blkdev_get(bdev, mode, holder); 1365 err = blkdev_get(bdev, mode, holder);
1366 if (err) 1366 if (err)
1367 return ERR_PTR(err); 1367 return ERR_PTR(err);
1368 1368
1369 return bdev; 1369 return bdev;
1370 } 1370 }
1371 EXPORT_SYMBOL(blkdev_get_by_dev); 1371 EXPORT_SYMBOL(blkdev_get_by_dev);
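And the by-number variant, e.g. when all you were handed is a dev_t from a module parameter; MKDEV(8, 0) (sda) is only an example, and a NULL holder means a non-exclusive open:

	static int my_open_by_devt(void)
	{
		struct block_device *bdev;

		bdev = blkdev_get_by_dev(MKDEV(8, 0), FMODE_READ, NULL);
		if (IS_ERR(bdev))
			return PTR_ERR(bdev);
		/* ... read-only, non-exclusive access ... */
		blkdev_put(bdev, FMODE_READ);
		return 0;
	}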
1372 1372
1373 static int blkdev_open(struct inode * inode, struct file * filp) 1373 static int blkdev_open(struct inode * inode, struct file * filp)
1374 { 1374 {
1375 struct block_device *bdev; 1375 struct block_device *bdev;
1376 1376
1377 /* 1377 /*
1378 * Preserve backwards compatibility and allow large file access 1378 * Preserve backwards compatibility and allow large file access
1379 * even if userspace doesn't ask for it explicitly. Some mkfs 1379 * even if userspace doesn't ask for it explicitly. Some mkfs
1380 * binaries need it. We might want to drop this workaround 1380 * binaries need it. We might want to drop this workaround
1381 * during an unstable branch. 1381 * during an unstable branch.
1382 */ 1382 */
1383 filp->f_flags |= O_LARGEFILE; 1383 filp->f_flags |= O_LARGEFILE;
1384 1384
1385 if (filp->f_flags & O_NDELAY) 1385 if (filp->f_flags & O_NDELAY)
1386 filp->f_mode |= FMODE_NDELAY; 1386 filp->f_mode |= FMODE_NDELAY;
1387 if (filp->f_flags & O_EXCL) 1387 if (filp->f_flags & O_EXCL)
1388 filp->f_mode |= FMODE_EXCL; 1388 filp->f_mode |= FMODE_EXCL;
1389 if ((filp->f_flags & O_ACCMODE) == 3) 1389 if ((filp->f_flags & O_ACCMODE) == 3)
1390 filp->f_mode |= FMODE_WRITE_IOCTL; 1390 filp->f_mode |= FMODE_WRITE_IOCTL;
1391 1391
1392 bdev = bd_acquire(inode); 1392 bdev = bd_acquire(inode);
1393 if (bdev == NULL) 1393 if (bdev == NULL)
1394 return -ENOMEM; 1394 return -ENOMEM;
1395 1395
1396 filp->f_mapping = bdev->bd_inode->i_mapping; 1396 filp->f_mapping = bdev->bd_inode->i_mapping;
1397 1397
1398 return blkdev_get(bdev, filp->f_mode, filp); 1398 return blkdev_get(bdev, filp->f_mode, filp);
1399 } 1399 }
1400 1400
1401 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1401 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1402 { 1402 {
1403 int ret = 0; 1403 int ret = 0;
1404 struct gendisk *disk = bdev->bd_disk; 1404 struct gendisk *disk = bdev->bd_disk;
1405 struct block_device *victim = NULL; 1405 struct block_device *victim = NULL;
1406 1406
1407 mutex_lock_nested(&bdev->bd_mutex, for_part); 1407 mutex_lock_nested(&bdev->bd_mutex, for_part);
1408 if (for_part) 1408 if (for_part)
1409 bdev->bd_part_count--; 1409 bdev->bd_part_count--;
1410 1410
1411 if (!--bdev->bd_openers) { 1411 if (!--bdev->bd_openers) {
1412 WARN_ON_ONCE(bdev->bd_holders); 1412 WARN_ON_ONCE(bdev->bd_holders);
1413 sync_blockdev(bdev); 1413 sync_blockdev(bdev);
1414 kill_bdev(bdev); 1414 kill_bdev(bdev);
1415 /* ->release can cause the old bdi to disappear, 1415 /* ->release can cause the old bdi to disappear,
1416 * so must switch it out first 1416 * so must switch it out first
1417 */ 1417 */
1418 bdev_inode_switch_bdi(bdev->bd_inode, 1418 bdev_inode_switch_bdi(bdev->bd_inode,
1419 &default_backing_dev_info); 1419 &default_backing_dev_info);
1420 } 1420 }
1421 if (bdev->bd_contains == bdev) { 1421 if (bdev->bd_contains == bdev) {
1422 if (disk->fops->release) 1422 if (disk->fops->release)
1423 ret = disk->fops->release(disk, mode); 1423 ret = disk->fops->release(disk, mode);
1424 } 1424 }
1425 if (!bdev->bd_openers) { 1425 if (!bdev->bd_openers) {
1426 struct module *owner = disk->fops->owner; 1426 struct module *owner = disk->fops->owner;
1427 1427
1428 disk_put_part(bdev->bd_part); 1428 disk_put_part(bdev->bd_part);
1429 bdev->bd_part = NULL; 1429 bdev->bd_part = NULL;
1430 bdev->bd_disk = NULL; 1430 bdev->bd_disk = NULL;
1431 if (bdev != bdev->bd_contains) 1431 if (bdev != bdev->bd_contains)
1432 victim = bdev->bd_contains; 1432 victim = bdev->bd_contains;
1433 bdev->bd_contains = NULL; 1433 bdev->bd_contains = NULL;
1434 1434
1435 put_disk(disk); 1435 put_disk(disk);
1436 module_put(owner); 1436 module_put(owner);
1437 } 1437 }
1438 mutex_unlock(&bdev->bd_mutex); 1438 mutex_unlock(&bdev->bd_mutex);
1439 bdput(bdev); 1439 bdput(bdev);
1440 if (victim) 1440 if (victim)
1441 __blkdev_put(victim, mode, 1); 1441 __blkdev_put(victim, mode, 1);
1442 return ret; 1442 return ret;
1443 } 1443 }
1444 1444
1445 int blkdev_put(struct block_device *bdev, fmode_t mode) 1445 int blkdev_put(struct block_device *bdev, fmode_t mode)
1446 { 1446 {
1447 mutex_lock(&bdev->bd_mutex); 1447 mutex_lock(&bdev->bd_mutex);
1448 1448
1449 if (mode & FMODE_EXCL) { 1449 if (mode & FMODE_EXCL) {
1450 bool bdev_free; 1450 bool bdev_free;
1451 1451
1452 /* 1452 /*
1453 * Release a claim on the device. The holder fields 1453 * Release a claim on the device. The holder fields
1454 * are protected with bdev_lock. bd_mutex is to 1454 * are protected with bdev_lock. bd_mutex is to
1455 * synchronize disk_holder unlinking. 1455 * synchronize disk_holder unlinking.
1456 */ 1456 */
1457 spin_lock(&bdev_lock); 1457 spin_lock(&bdev_lock);
1458 1458
1459 WARN_ON_ONCE(--bdev->bd_holders < 0); 1459 WARN_ON_ONCE(--bdev->bd_holders < 0);
1460 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); 1460 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1461 1461
1462 /* bd_contains might point to self, check in a separate step */ 1462 /* bd_contains might point to self, check in a separate step */
1463 if ((bdev_free = !bdev->bd_holders)) 1463 if ((bdev_free = !bdev->bd_holders))
1464 bdev->bd_holder = NULL; 1464 bdev->bd_holder = NULL;
1465 if (!bdev->bd_contains->bd_holders) 1465 if (!bdev->bd_contains->bd_holders)
1466 bdev->bd_contains->bd_holder = NULL; 1466 bdev->bd_contains->bd_holder = NULL;
1467 1467
1468 spin_unlock(&bdev_lock); 1468 spin_unlock(&bdev_lock);
1469 1469
1470 /* 1470 /*
1471 * If this was the last claim, remove holder link and 1471 * If this was the last claim, remove holder link and
1472 * unblock event polling if it was a write holder. 1472 * unblock event polling if it was a write holder.
1473 */ 1473 */
1474 if (bdev_free && bdev->bd_write_holder) { 1474 if (bdev_free && bdev->bd_write_holder) {
1475 disk_unblock_events(bdev->bd_disk); 1475 disk_unblock_events(bdev->bd_disk);
1476 bdev->bd_write_holder = false; 1476 bdev->bd_write_holder = false;
1477 } 1477 }
1478 } 1478 }
1479 1479
1480 /* 1480 /*
1481 * Trigger event checking and tell drivers to flush MEDIA_CHANGE 1481 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
1482 * event. This is to ensure detection of media removal commanded 1482 * event. This is to ensure detection of media removal commanded
1483 * from userland - e.g. eject(1). 1483 * from userland - e.g. eject(1).
1484 */ 1484 */
1485 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); 1485 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
1486 1486
1487 mutex_unlock(&bdev->bd_mutex); 1487 mutex_unlock(&bdev->bd_mutex);
1488 1488
1489 return __blkdev_put(bdev, mode, 0); 1489 return __blkdev_put(bdev, mode, 0);
1490 } 1490 }
1491 EXPORT_SYMBOL(blkdev_put); 1491 EXPORT_SYMBOL(blkdev_put);
1492 1492
1493 static int blkdev_close(struct inode * inode, struct file * filp) 1493 static int blkdev_close(struct inode * inode, struct file * filp)
1494 { 1494 {
1495 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1495 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1496 1496
1497 return blkdev_put(bdev, filp->f_mode); 1497 return blkdev_put(bdev, filp->f_mode);
1498 } 1498 }
1499 1499
1500 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) 1500 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1501 { 1501 {
1502 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1502 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1503 fmode_t mode = file->f_mode; 1503 fmode_t mode = file->f_mode;
1504 1504
1505 /* 1505 /*
1506 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have 1506 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1507 * to update it before every ioctl. 1507 * to update it before every ioctl.
1508 */ 1508 */
1509 if (file->f_flags & O_NDELAY) 1509 if (file->f_flags & O_NDELAY)
1510 mode |= FMODE_NDELAY; 1510 mode |= FMODE_NDELAY;
1511 else 1511 else
1512 mode &= ~FMODE_NDELAY; 1512 mode &= ~FMODE_NDELAY;
1513 1513
1514 return blkdev_ioctl(bdev, mode, cmd, arg); 1514 return blkdev_ioctl(bdev, mode, cmd, arg);
1515 } 1515 }
1516 1516
1517 /* 1517 /*
1518 * Write data to the block device. Only intended for the block device itself 1518 * Write data to the block device. Only intended for the block device itself
1519 * and the raw driver, which is basically a fake block device. 1519 * and the raw driver, which is basically a fake block device.
1520 * 1520 *
1521 * Does not take i_mutex for the write and thus is not for general purpose 1521 * Does not take i_mutex for the write and thus is not for general purpose
1522 * use. 1522 * use.
1523 */ 1523 */
1524 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, 1524 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1525 unsigned long nr_segs, loff_t pos) 1525 unsigned long nr_segs, loff_t pos)
1526 { 1526 {
1527 struct file *file = iocb->ki_filp; 1527 struct file *file = iocb->ki_filp;
1528 struct blk_plug plug; 1528 struct blk_plug plug;
1529 ssize_t ret; 1529 ssize_t ret;
1530 1530
1531 BUG_ON(iocb->ki_pos != pos); 1531 BUG_ON(iocb->ki_pos != pos);
1532 1532
1533 blk_start_plug(&plug); 1533 blk_start_plug(&plug);
1534 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1534 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1535 if (ret > 0 || ret == -EIOCBQUEUED) { 1535 if (ret > 0 || ret == -EIOCBQUEUED) {
1536 ssize_t err; 1536 ssize_t err;
1537 1537
1538 err = generic_write_sync(file, pos, ret); 1538 err = generic_write_sync(file, pos, ret);
1539 if (err < 0 && ret > 0) 1539 if (err < 0 && ret > 0)
1540 ret = err; 1540 ret = err;
1541 } 1541 }
1542 blk_finish_plug(&plug); 1542 blk_finish_plug(&plug);
1543 return ret; 1543 return ret;
1544 } 1544 }
1545 EXPORT_SYMBOL_GPL(blkdev_aio_write); 1545 EXPORT_SYMBOL_GPL(blkdev_aio_write);
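The plugging idiom used above, shown in isolation as a sketch: requests submitted between the two calls are batched per-task and flushed to the driver at blk_finish_plug():

	static void my_batched_submit(void)
	{
		struct blk_plug plug;

		blk_start_plug(&plug);
		/* ... submit a batch of bios/requests here ... */
		blk_finish_plug(&plug);	/* batch handed to the driver */
	}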
1546 1546
1547 static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, 1547 static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1548 unsigned long nr_segs, loff_t pos) 1548 unsigned long nr_segs, loff_t pos)
1549 { 1549 {
1550 struct file *file = iocb->ki_filp; 1550 struct file *file = iocb->ki_filp;
1551 struct inode *bd_inode = file->f_mapping->host; 1551 struct inode *bd_inode = file->f_mapping->host;
1552 loff_t size = i_size_read(bd_inode); 1552 loff_t size = i_size_read(bd_inode);
1553 1553
1554 if (pos >= size) 1554 if (pos >= size)
1555 return 0; 1555 return 0;
1556 1556
1557 size -= pos; 1557 size -= pos;
1558 if (size < INT_MAX) 1558 if (size < INT_MAX)
1559 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); 1559 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
1560 return generic_file_aio_read(iocb, iov, nr_segs, pos); 1560 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1561 } 1561 }
1562 1562
1563 /* 1563 /*
1564 * Try to release a page associated with the block device when the system 1564 * Try to release a page associated with the block device when the system
1565 * is under memory pressure. 1565 * is under memory pressure.
1566 */ 1566 */
1567 static int blkdev_releasepage(struct page *page, gfp_t wait) 1567 static int blkdev_releasepage(struct page *page, gfp_t wait)
1568 { 1568 {
1569 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; 1569 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1570 1570
1571 if (super && super->s_op->bdev_try_to_free_page) 1571 if (super && super->s_op->bdev_try_to_free_page)
1572 return super->s_op->bdev_try_to_free_page(super, page, wait); 1572 return super->s_op->bdev_try_to_free_page(super, page, wait);
1573 1573
1574 return try_to_free_buffers(page); 1574 return try_to_free_buffers(page);
1575 } 1575 }
1576 1576
1577 static const struct address_space_operations def_blk_aops = { 1577 static const struct address_space_operations def_blk_aops = {
1578 .readpage = blkdev_readpage, 1578 .readpage = blkdev_readpage,
1579 .writepage = blkdev_writepage, 1579 .writepage = blkdev_writepage,
1580 .write_begin = blkdev_write_begin, 1580 .write_begin = blkdev_write_begin,
1581 .write_end = blkdev_write_end, 1581 .write_end = blkdev_write_end,
1582 .writepages = generic_writepages, 1582 .writepages = generic_writepages,
1583 .releasepage = blkdev_releasepage, 1583 .releasepage = blkdev_releasepage,
1584 .direct_IO = blkdev_direct_IO, 1584 .direct_IO = blkdev_direct_IO,
1585 }; 1585 };
1586 1586
1587 const struct file_operations def_blk_fops = { 1587 const struct file_operations def_blk_fops = {
1588 .open = blkdev_open, 1588 .open = blkdev_open,
1589 .release = blkdev_close, 1589 .release = blkdev_close,
1590 .llseek = block_llseek, 1590 .llseek = block_llseek,
1591 .read = do_sync_read, 1591 .read = do_sync_read,
1592 .write = do_sync_write, 1592 .write = do_sync_write,
1593 .aio_read = blkdev_aio_read, 1593 .aio_read = blkdev_aio_read,
1594 .aio_write = blkdev_aio_write, 1594 .aio_write = blkdev_aio_write,
1595 .mmap = generic_file_mmap, 1595 .mmap = generic_file_mmap,
1596 .fsync = blkdev_fsync, 1596 .fsync = blkdev_fsync,
1597 .unlocked_ioctl = block_ioctl, 1597 .unlocked_ioctl = block_ioctl,
1598 #ifdef CONFIG_COMPAT 1598 #ifdef CONFIG_COMPAT
1599 .compat_ioctl = compat_blkdev_ioctl, 1599 .compat_ioctl = compat_blkdev_ioctl,
1600 #endif 1600 #endif
1601 .splice_read = generic_file_splice_read, 1601 .splice_read = generic_file_splice_read,
1602 .splice_write = generic_file_splice_write, 1602 .splice_write = generic_file_splice_write,
1603 }; 1603 };
1604 1604
1605 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1605 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1606 { 1606 {
1607 int res; 1607 int res;
1608 mm_segment_t old_fs = get_fs(); 1608 mm_segment_t old_fs = get_fs();
1609 set_fs(KERNEL_DS); 1609 set_fs(KERNEL_DS);
1610 res = blkdev_ioctl(bdev, 0, cmd, arg); 1610 res = blkdev_ioctl(bdev, 0, cmd, arg);
1611 set_fs(old_fs); 1611 set_fs(old_fs);
1612 return res; 1612 return res;
1613 } 1613 }
1614 1614
1615 EXPORT_SYMBOL(ioctl_by_bdev); 1615 EXPORT_SYMBOL(ioctl_by_bdev);
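Because of the set_fs(KERNEL_DS) window, @arg may point into kernel memory here. A hedged example querying the device size in bytes; my_bdev_size() is an invented wrapper:

	static int my_bdev_size(struct block_device *bdev, u64 *bytes)
	{
		/* kernel pointer is fine: the ioctl runs under KERNEL_DS */
		return ioctl_by_bdev(bdev, BLKGETSIZE64, (unsigned long)bytes);
	}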
1616 1616
1617 /** 1617 /**
1618 * lookup_bdev - lookup a struct block_device by name 1618 * lookup_bdev - lookup a struct block_device by name
1619 * @pathname: special file representing the block device 1619 * @pathname: special file representing the block device
1620 * 1620 *
1621 * Get a reference to the blockdevice at @pathname in the current 1621 * Get a reference to the blockdevice at @pathname in the current
1622 * namespace if possible and return it. Return ERR_PTR(error) 1622 * namespace if possible and return it. Return ERR_PTR(error)
1623 * otherwise. 1623 * otherwise.
1624 */ 1624 */
1625 struct block_device *lookup_bdev(const char *pathname) 1625 struct block_device *lookup_bdev(const char *pathname)
1626 { 1626 {
1627 struct block_device *bdev; 1627 struct block_device *bdev;
1628 struct inode *inode; 1628 struct inode *inode;
1629 struct path path; 1629 struct path path;
1630 int error; 1630 int error;
1631 1631
1632 if (!pathname || !*pathname) 1632 if (!pathname || !*pathname)
1633 return ERR_PTR(-EINVAL); 1633 return ERR_PTR(-EINVAL);
1634 1634
1635 error = kern_path(pathname, LOOKUP_FOLLOW, &path); 1635 error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1636 if (error) 1636 if (error)
1637 return ERR_PTR(error); 1637 return ERR_PTR(error);
1638 1638
1639 inode = path.dentry->d_inode; 1639 inode = path.dentry->d_inode;
1640 error = -ENOTBLK; 1640 error = -ENOTBLK;
1641 if (!S_ISBLK(inode->i_mode)) 1641 if (!S_ISBLK(inode->i_mode))
1642 goto fail; 1642 goto fail;
1643 error = -EACCES; 1643 error = -EACCES;
1644 if (path.mnt->mnt_flags & MNT_NODEV) 1644 if (path.mnt->mnt_flags & MNT_NODEV)
1645 goto fail; 1645 goto fail;
1646 error = -ENOMEM; 1646 error = -ENOMEM;
1647 bdev = bd_acquire(inode); 1647 bdev = bd_acquire(inode);
1648 if (!bdev) 1648 if (!bdev)
1649 goto fail; 1649 goto fail;
1650 out: 1650 out:
1651 path_put(&path); 1651 path_put(&path);
1652 return bdev; 1652 return bdev;
1653 fail: 1653 fail:
1654 bdev = ERR_PTR(error); 1654 bdev = ERR_PTR(error);
1655 goto out; 1655 goto out;
1656 } 1656 }
1657 EXPORT_SYMBOL(lookup_bdev); 1657 EXPORT_SYMBOL(lookup_bdev);
1658 1658
1659 int __invalidate_device(struct block_device *bdev, bool kill_dirty) 1659 int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1660 { 1660 {
1661 struct super_block *sb = get_super(bdev); 1661 struct super_block *sb = get_super(bdev);
1662 int res = 0; 1662 int res = 0;
1663 1663
1664 if (sb) { 1664 if (sb) {
1665 /* 1665 /*
1666 * no need to lock the super, get_super holds the 1666 * no need to lock the super, get_super holds the
1667 * read mutex so the filesystem cannot go away 1667 * read mutex so the filesystem cannot go away
1668 * under us (->put_super runs with the write lock 1668 * under us (->put_super runs with the write lock
1669 * held). 1669 * held).
1670 */ 1670 */
1671 shrink_dcache_sb(sb); 1671 shrink_dcache_sb(sb);
1672 res = invalidate_inodes(sb, kill_dirty); 1672 res = invalidate_inodes(sb, kill_dirty);
1673 drop_super(sb); 1673 drop_super(sb);
1674 } 1674 }
1675 invalidate_bdev(bdev); 1675 invalidate_bdev(bdev);
1676 return res; 1676 return res;
1677 } 1677 }
1678 EXPORT_SYMBOL(__invalidate_device); 1678 EXPORT_SYMBOL(__invalidate_device);
1679 1679
1680 void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) 1680 void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
1681 { 1681 {
1682 struct inode *inode, *old_inode = NULL; 1682 struct inode *inode, *old_inode = NULL;
1683 1683
1684 spin_lock(&inode_sb_list_lock); 1684 spin_lock(&inode_sb_list_lock);
1685 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { 1685 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
1686 struct address_space *mapping = inode->i_mapping; 1686 struct address_space *mapping = inode->i_mapping;
1687 1687
1688 spin_lock(&inode->i_lock); 1688 spin_lock(&inode->i_lock);
1689 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || 1689 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
1690 mapping->nrpages == 0) { 1690 mapping->nrpages == 0) {
1691 spin_unlock(&inode->i_lock); 1691 spin_unlock(&inode->i_lock);
1692 continue; 1692 continue;
1693 } 1693 }
1694 __iget(inode); 1694 __iget(inode);
1695 spin_unlock(&inode->i_lock); 1695 spin_unlock(&inode->i_lock);
1696 spin_unlock(&inode_sb_list_lock); 1696 spin_unlock(&inode_sb_list_lock);
1697 /* 1697 /*
1698 * We hold a reference to 'inode' so it couldn't have been 1698 * We hold a reference to 'inode' so it couldn't have been
1699 * removed from s_inodes list while we dropped the 1699 * removed from s_inodes list while we dropped the
1700 * inode_sb_list_lock. We cannot iput the inode now as we can 1700 * inode_sb_list_lock. We cannot iput the inode now as we can
1701 * be holding the last reference and we cannot iput it under 1701 * be holding the last reference and we cannot iput it under
1702 * inode_sb_list_lock. So we keep the reference and iput it 1702 * inode_sb_list_lock. So we keep the reference and iput it
1703 * later. 1703 * later.
1704 */ 1704 */
1705 iput(old_inode); 1705 iput(old_inode);
1706 old_inode = inode; 1706 old_inode = inode;
1707 1707
1708 func(I_BDEV(inode), arg); 1708 func(I_BDEV(inode), arg);
1709 1709
1710 spin_lock(&inode_sb_list_lock); 1710 spin_lock(&inode_sb_list_lock);
1711 } 1711 }
1712 spin_unlock(&inode_sb_list_lock); 1712 spin_unlock(&inode_sb_list_lock);
1713 iput(old_inode); 1713 iput(old_inode);
1714 } 1714 }
1715 1715
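A sketch of how a caller might drive iterate_bdevs() (assumed usage, modeled on the sync path; flush_one_bdev is a hypothetical name). The iterator drops its spinlock and holds an inode reference across each callback, so the callback may sleep:

	static void flush_one_bdev(struct block_device *bdev, void *arg)
	{
		/* start writeback on this device's page cache */
		filemap_fdatawrite(bdev->bd_inode->i_mapping);
	}

	/* ... */
	iterate_bdevs(flush_one_bdev, NULL);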
1 /* 1 /*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation. 6 * License v2 as published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it will be useful, 8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details. 11 * General Public License for more details.
12 * 12 *
13 * You should have received a copy of the GNU General Public 13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the 14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/pagemap.h> 20 #include <linux/pagemap.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/time.h> 22 #include <linux/time.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/string.h> 24 #include <linux/string.h>
25 #include <linux/backing-dev.h> 25 #include <linux/backing-dev.h>
26 #include <linux/mpage.h> 26 #include <linux/mpage.h>
27 #include <linux/falloc.h> 27 #include <linux/falloc.h>
28 #include <linux/swap.h> 28 #include <linux/swap.h>
29 #include <linux/writeback.h> 29 #include <linux/writeback.h>
30 #include <linux/statfs.h> 30 #include <linux/statfs.h>
31 #include <linux/compat.h> 31 #include <linux/compat.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include "ctree.h" 33 #include "ctree.h"
34 #include "disk-io.h" 34 #include "disk-io.h"
35 #include "transaction.h" 35 #include "transaction.h"
36 #include "btrfs_inode.h" 36 #include "btrfs_inode.h"
37 #include "ioctl.h" 37 #include "ioctl.h"
38 #include "print-tree.h" 38 #include "print-tree.h"
39 #include "tree-log.h" 39 #include "tree-log.h"
40 #include "locking.h" 40 #include "locking.h"
41 #include "compat.h" 41 #include "compat.h"
42 #include "volumes.h" 42 #include "volumes.h"
43 43
44 /* 44 /*
45 * when auto defrag is enabled, we 45 * when auto defrag is enabled, we
46 * queue up these defrag structs to remember which 46 * queue up these defrag structs to remember which
47 * inodes need defragging passes 47 * inodes need defragging passes
48 */ 48 */
49 struct inode_defrag { 49 struct inode_defrag {
50 struct rb_node rb_node; 50 struct rb_node rb_node;
51 /* objectid */ 51 /* objectid */
52 u64 ino; 52 u64 ino;
53 /* 53 /*
54 * transid where the defrag was added; we search for 54 * transid where the defrag was added; we search for
55 * extents newer than this 55 * extents newer than this
56 */ 56 */
57 u64 transid; 57 u64 transid;
58 58
59 /* root objectid */ 59 /* root objectid */
60 u64 root; 60 u64 root;
61 61
62 /* last offset we were able to defrag */ 62 /* last offset we were able to defrag */
63 u64 last_offset; 63 u64 last_offset;
64 64
65 /* if we've wrapped around back to zero once already */ 65 /* if we've wrapped around back to zero once already */
66 int cycled; 66 int cycled;
67 }; 67 };
68 68
69 static int __compare_inode_defrag(struct inode_defrag *defrag1, 69 static int __compare_inode_defrag(struct inode_defrag *defrag1,
70 struct inode_defrag *defrag2) 70 struct inode_defrag *defrag2)
71 { 71 {
72 if (defrag1->root > defrag2->root) 72 if (defrag1->root > defrag2->root)
73 return 1; 73 return 1;
74 else if (defrag1->root < defrag2->root) 74 else if (defrag1->root < defrag2->root)
75 return -1; 75 return -1;
76 else if (defrag1->ino > defrag2->ino) 76 else if (defrag1->ino > defrag2->ino)
77 return 1; 77 return 1;
78 else if (defrag1->ino < defrag2->ino) 78 else if (defrag1->ino < defrag2->ino)
79 return -1; 79 return -1;
80 else 80 else
81 return 0; 81 return 0;
82 } 82 }
83 83
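/*
 * The comparison above is lexicographic on (root, ino): the defrag
 * rbtree is keyed first by root objectid, then by inode number
 * within that root.
 */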
84 /* insert a record for an inode into the defrag tree. The lock 84 /* insert a record for an inode into the defrag tree. The lock
85 * must be held already 85 * must be held already
86 * 86 *
87 * If you're inserting a record for an older transid than an 87 * If you're inserting a record for an older transid than an
88 * existing record, the transid already in the tree is lowered 88 * existing record, the transid already in the tree is lowered
89 * 89 *
90 * If an existing record is found the defrag item you 90 * If an existing record is found the defrag item you
91 * pass in is freed 91 * pass in is freed
92 */ 92 */
93 static void __btrfs_add_inode_defrag(struct inode *inode, 93 static void __btrfs_add_inode_defrag(struct inode *inode,
94 struct inode_defrag *defrag) 94 struct inode_defrag *defrag)
95 { 95 {
96 struct btrfs_root *root = BTRFS_I(inode)->root; 96 struct btrfs_root *root = BTRFS_I(inode)->root;
97 struct inode_defrag *entry; 97 struct inode_defrag *entry;
98 struct rb_node **p; 98 struct rb_node **p;
99 struct rb_node *parent = NULL; 99 struct rb_node *parent = NULL;
100 int ret; 100 int ret;
101 101
102 p = &root->fs_info->defrag_inodes.rb_node; 102 p = &root->fs_info->defrag_inodes.rb_node;
103 while (*p) { 103 while (*p) {
104 parent = *p; 104 parent = *p;
105 entry = rb_entry(parent, struct inode_defrag, rb_node); 105 entry = rb_entry(parent, struct inode_defrag, rb_node);
106 106
107 ret = __compare_inode_defrag(defrag, entry); 107 ret = __compare_inode_defrag(defrag, entry);
108 if (ret < 0) 108 if (ret < 0)
109 p = &parent->rb_left; 109 p = &parent->rb_left;
110 else if (ret > 0) 110 else if (ret > 0)
111 p = &parent->rb_right; 111 p = &parent->rb_right;
112 else { 112 else {
113 /* if we're reinserting an entry for 113 /* if we're reinserting an entry for
114 * an old defrag run, make sure to 114 * an old defrag run, make sure to
115 * lower the transid of our existing record 115 * lower the transid of our existing record
116 */ 116 */
117 if (defrag->transid < entry->transid) 117 if (defrag->transid < entry->transid)
118 entry->transid = defrag->transid; 118 entry->transid = defrag->transid;
119 if (defrag->last_offset > entry->last_offset) 119 if (defrag->last_offset > entry->last_offset)
120 entry->last_offset = defrag->last_offset; 120 entry->last_offset = defrag->last_offset;
121 goto exists; 121 goto exists;
122 } 122 }
123 } 123 }
124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
125 rb_link_node(&defrag->rb_node, parent, p); 125 rb_link_node(&defrag->rb_node, parent, p);
126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
127 return; 127 return;
128 128
129 exists: 129 exists:
130 kfree(defrag); 130 kfree(defrag);
131 return; 131 return;
132 132
133 } 133 }
134 134
135 /* 135 /*
136 * insert a defrag record for this inode if auto defrag is 136 * insert a defrag record for this inode if auto defrag is
137 * enabled 137 * enabled
138 */ 138 */
139 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 139 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
140 struct inode *inode) 140 struct inode *inode)
141 { 141 {
142 struct btrfs_root *root = BTRFS_I(inode)->root; 142 struct btrfs_root *root = BTRFS_I(inode)->root;
143 struct inode_defrag *defrag; 143 struct inode_defrag *defrag;
144 u64 transid; 144 u64 transid;
145 145
146 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 146 if (!btrfs_test_opt(root, AUTO_DEFRAG))
147 return 0; 147 return 0;
148 148
149 if (btrfs_fs_closing(root->fs_info)) 149 if (btrfs_fs_closing(root->fs_info))
150 return 0; 150 return 0;
151 151
152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
153 return 0; 153 return 0;
154 154
155 if (trans) 155 if (trans)
156 transid = trans->transid; 156 transid = trans->transid;
157 else 157 else
158 transid = BTRFS_I(inode)->root->last_trans; 158 transid = BTRFS_I(inode)->root->last_trans;
159 159
160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
161 if (!defrag) 161 if (!defrag)
162 return -ENOMEM; 162 return -ENOMEM;
163 163
164 defrag->ino = btrfs_ino(inode); 164 defrag->ino = btrfs_ino(inode);
165 defrag->transid = transid; 165 defrag->transid = transid;
166 defrag->root = root->root_key.objectid; 166 defrag->root = root->root_key.objectid;
167 167
168 spin_lock(&root->fs_info->defrag_inodes_lock); 168 spin_lock(&root->fs_info->defrag_inodes_lock);
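	/*
	 * Re-check IN_DEFRAG under defrag_inodes_lock: a racing caller
	 * may have inserted a record for this inode since the unlocked
	 * test above, and we must not add a duplicate.
	 */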
169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
170 __btrfs_add_inode_defrag(inode, defrag); 170 __btrfs_add_inode_defrag(inode, defrag);
171 else 171 else
172 kfree(defrag); 172 kfree(defrag);
173 spin_unlock(&root->fs_info->defrag_inodes_lock); 173 spin_unlock(&root->fs_info->defrag_inodes_lock);
174 return 0; 174 return 0;
175 } 175 }
176 176
177 /* 177 /*
178 * must be called with the defrag_inodes lock held 178 * must be called with the defrag_inodes lock held
179 */ 179 */
180 struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 180 struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
181 u64 root, u64 ino, 181 u64 root, u64 ino,
182 struct rb_node **next) 182 struct rb_node **next)
183 { 183 {
184 struct inode_defrag *entry = NULL; 184 struct inode_defrag *entry = NULL;
185 struct inode_defrag tmp; 185 struct inode_defrag tmp;
186 struct rb_node *p; 186 struct rb_node *p;
187 struct rb_node *parent = NULL; 187 struct rb_node *parent = NULL;
188 int ret; 188 int ret;
189 189
190 tmp.ino = ino; 190 tmp.ino = ino;
191 tmp.root = root; 191 tmp.root = root;
192 192
193 p = info->defrag_inodes.rb_node; 193 p = info->defrag_inodes.rb_node;
194 while (p) { 194 while (p) {
195 parent = p; 195 parent = p;
196 entry = rb_entry(parent, struct inode_defrag, rb_node); 196 entry = rb_entry(parent, struct inode_defrag, rb_node);
197 197
198 ret = __compare_inode_defrag(&tmp, entry); 198 ret = __compare_inode_defrag(&tmp, entry);
199 if (ret < 0) 199 if (ret < 0)
200 p = parent->rb_left; 200 p = parent->rb_left;
201 else if (ret > 0) 201 else if (ret > 0)
202 p = parent->rb_right; 202 p = parent->rb_right;
203 else 203 else
204 return entry; 204 return entry;
205 } 205 }
206 206
207 if (next) { 207 if (next) {
208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
209 parent = rb_next(parent); 209 parent = rb_next(parent);
210 entry = rb_entry(parent, struct inode_defrag, rb_node); 210 entry = rb_entry(parent, struct inode_defrag, rb_node);
211 } 211 }
212 *next = parent; 212 *next = parent;
213 } 213 }
214 return NULL; 214 return NULL;
215 } 215 }
216 216
217 /* 217 /*
218 * run through the list of inodes in the FS that need 218 * run through the list of inodes in the FS that need
219 * defragging 219 * defragging
220 */ 220 */
221 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 221 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
222 { 222 {
223 struct inode_defrag *defrag; 223 struct inode_defrag *defrag;
224 struct btrfs_root *inode_root; 224 struct btrfs_root *inode_root;
225 struct inode *inode; 225 struct inode *inode;
226 struct rb_node *n; 226 struct rb_node *n;
227 struct btrfs_key key; 227 struct btrfs_key key;
228 struct btrfs_ioctl_defrag_range_args range; 228 struct btrfs_ioctl_defrag_range_args range;
229 u64 first_ino = 0; 229 u64 first_ino = 0;
230 u64 root_objectid = 0; 230 u64 root_objectid = 0;
231 int num_defrag; 231 int num_defrag;
232 int defrag_batch = 1024; 232 int defrag_batch = 1024;
233 233
234 memset(&range, 0, sizeof(range)); 234 memset(&range, 0, sizeof(range));
235 range.len = (u64)-1; 235 range.len = (u64)-1;
236 236
237 atomic_inc(&fs_info->defrag_running); 237 atomic_inc(&fs_info->defrag_running);
238 spin_lock(&fs_info->defrag_inodes_lock); 238 spin_lock(&fs_info->defrag_inodes_lock);
239 while (1) { 239 while (1) {
240 n = NULL; 240 n = NULL;
241 241
242 /* find an inode to defrag */ 242 /* find an inode to defrag */
243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
244 first_ino, &n); 244 first_ino, &n);
245 if (!defrag) { 245 if (!defrag) {
246 if (n) { 246 if (n) {
247 defrag = rb_entry(n, struct inode_defrag, 247 defrag = rb_entry(n, struct inode_defrag,
248 rb_node); 248 rb_node);
249 } else if (root_objectid || first_ino) { 249 } else if (root_objectid || first_ino) {
250 root_objectid = 0; 250 root_objectid = 0;
251 first_ino = 0; 251 first_ino = 0;
252 continue; 252 continue;
253 } else { 253 } else {
254 break; 254 break;
255 } 255 }
256 } 256 }
257 257
258 /* remove it from the rbtree */ 258 /* remove it from the rbtree */
259 first_ino = defrag->ino + 1; 259 first_ino = defrag->ino + 1;
260 root_objectid = defrag->root; 260 root_objectid = defrag->root;
261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
262 262
263 if (btrfs_fs_closing(fs_info)) 263 if (btrfs_fs_closing(fs_info))
264 goto next_free; 264 goto next_free;
265 265
266 spin_unlock(&fs_info->defrag_inodes_lock); 266 spin_unlock(&fs_info->defrag_inodes_lock);
267 267
268 /* get the inode */ 268 /* get the inode */
269 key.objectid = defrag->root; 269 key.objectid = defrag->root;
270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
271 key.offset = (u64)-1; 271 key.offset = (u64)-1;
272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
273 if (IS_ERR(inode_root)) 273 if (IS_ERR(inode_root))
274 goto next; 274 goto next;
275 275
276 key.objectid = defrag->ino; 276 key.objectid = defrag->ino;
277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
278 key.offset = 0; 278 key.offset = 0;
279 279
280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
281 if (IS_ERR(inode)) 281 if (IS_ERR(inode))
282 goto next; 282 goto next;
283 283
284 /* do a chunk of defrag */ 284 /* do a chunk of defrag */
285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
286 range.start = defrag->last_offset; 286 range.start = defrag->last_offset;
287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
288 defrag_batch); 288 defrag_batch);
289 /* 289 /*
290 * if we filled the whole defrag batch, there 290 * if we filled the whole defrag batch, there
291 * must be more work to do. Queue this defrag 291 * must be more work to do. Queue this defrag
292 * again 292 * again
293 */ 293 */
294 if (num_defrag == defrag_batch) { 294 if (num_defrag == defrag_batch) {
295 defrag->last_offset = range.start; 295 defrag->last_offset = range.start;
296 __btrfs_add_inode_defrag(inode, defrag); 296 __btrfs_add_inode_defrag(inode, defrag);
297 /* 297 /*
298 * we don't want to kfree defrag, we added it back to 298 * we don't want to kfree defrag, we added it back to
299 * the rbtree 299 * the rbtree
300 */ 300 */
301 defrag = NULL; 301 defrag = NULL;
302 } else if (defrag->last_offset && !defrag->cycled) { 302 } else if (defrag->last_offset && !defrag->cycled) {
303 /* 303 /*
304 * we didn't fill our defrag batch, but 304 * we didn't fill our defrag batch, but
305 * we didn't start at zero. Make sure we loop 305 * we didn't start at zero. Make sure we loop
306 * around to the start of the file. 306 * around to the start of the file.
307 */ 307 */
308 defrag->last_offset = 0; 308 defrag->last_offset = 0;
309 defrag->cycled = 1; 309 defrag->cycled = 1;
310 __btrfs_add_inode_defrag(inode, defrag); 310 __btrfs_add_inode_defrag(inode, defrag);
311 defrag = NULL; 311 defrag = NULL;
312 } 312 }
313 313
314 iput(inode); 314 iput(inode);
315 next: 315 next:
316 spin_lock(&fs_info->defrag_inodes_lock); 316 spin_lock(&fs_info->defrag_inodes_lock);
317 next_free: 317 next_free:
318 kfree(defrag); 318 kfree(defrag);
319 } 319 }
320 spin_unlock(&fs_info->defrag_inodes_lock); 320 spin_unlock(&fs_info->defrag_inodes_lock);
321 321
322 atomic_dec(&fs_info->defrag_running); 322 atomic_dec(&fs_info->defrag_running);
323 323
324 /* 324 /*
325 * during unmount, we use the transaction_wait queue to 325 * during unmount, we use the transaction_wait queue to
326 * wait for the defragger to stop 326 * wait for the defragger to stop
327 */ 327 */
328 wake_up(&fs_info->transaction_wait); 328 wake_up(&fs_info->transaction_wait);
329 return 0; 329 return 0;
330 } 330 }
331 331
332 /* simple helper to fault in pages and copy. This should go away 332 /* simple helper to fault in pages and copy. This should go away
333 * and be replaced with calls into generic code. 333 * and be replaced with calls into generic code.
334 */ 334 */
335 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 335 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
336 size_t write_bytes, 336 size_t write_bytes,
337 struct page **prepared_pages, 337 struct page **prepared_pages,
338 struct iov_iter *i) 338 struct iov_iter *i)
339 { 339 {
340 size_t copied = 0; 340 size_t copied = 0;
341 size_t total_copied = 0; 341 size_t total_copied = 0;
342 int pg = 0; 342 int pg = 0;
343 int offset = pos & (PAGE_CACHE_SIZE - 1); 343 int offset = pos & (PAGE_CACHE_SIZE - 1);
344 344
345 while (write_bytes > 0) { 345 while (write_bytes > 0) {
346 size_t count = min_t(size_t, 346 size_t count = min_t(size_t,
347 PAGE_CACHE_SIZE - offset, write_bytes); 347 PAGE_CACHE_SIZE - offset, write_bytes);
348 struct page *page = prepared_pages[pg]; 348 struct page *page = prepared_pages[pg];
349 /* 349 /*
350 * Copy data from userspace to the current page 350 * Copy data from userspace to the current page
351 * 351 *
352 * Disable pagefault to avoid recursive lock since 352 * Disable pagefault to avoid recursive lock since
353 * the pages are already locked 353 * the pages are already locked
354 */ 354 */
355 pagefault_disable(); 355 pagefault_disable();
356 copied = iov_iter_copy_from_user_atomic(page, i, offset, count); 356 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
357 pagefault_enable(); 357 pagefault_enable();
358 358
359 /* Flush processor's dcache for this page */ 359 /* Flush processor's dcache for this page */
360 flush_dcache_page(page); 360 flush_dcache_page(page);
361 361
362 /* 362 /*
363 * if we get a partial write, we can end up with 363 * if we get a partial write, we can end up with
364 * partially up to date pages. These add 364 * partially up to date pages. These add
365 * a lot of complexity, so make sure they don't 365 * a lot of complexity, so make sure they don't
366 * happen by forcing this copy to be retried. 366 * happen by forcing this copy to be retried.
367 * 367 *
368 * The rest of the btrfs_file_write code will fall 368 * The rest of the btrfs_file_write code will fall
369 * back to page at a time copies after we return 0. 369 * back to page at a time copies after we return 0.
370 */ 370 */
371 if (!PageUptodate(page) && copied < count) 371 if (!PageUptodate(page) && copied < count)
372 copied = 0; 372 copied = 0;
373 373
374 iov_iter_advance(i, copied); 374 iov_iter_advance(i, copied);
375 write_bytes -= copied; 375 write_bytes -= copied;
376 total_copied += copied; 376 total_copied += copied;
377 377
378 /* Return to btrfs_file_aio_write to fault page */ 378 /* Return to btrfs_file_aio_write to fault page */
379 if (unlikely(copied == 0)) 379 if (unlikely(copied == 0))
380 break; 380 break;
381 381
382 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 382 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
383 offset += copied; 383 offset += copied;
384 } else { 384 } else {
385 pg++; 385 pg++;
386 offset = 0; 386 offset = 0;
387 } 387 }
388 } 388 }
389 return total_copied; 389 return total_copied;
390 } 390 }
391 391
392 /* 392 /*
393 * unlocks pages after btrfs_file_write is done with them 393 * unlocks pages after btrfs_file_write is done with them
394 */ 394 */
395 void btrfs_drop_pages(struct page **pages, size_t num_pages) 395 void btrfs_drop_pages(struct page **pages, size_t num_pages)
396 { 396 {
397 size_t i; 397 size_t i;
398 for (i = 0; i < num_pages; i++) { 398 for (i = 0; i < num_pages; i++) {
399 /* page checked is some magic around finding pages that 399 /* page checked is some magic around finding pages that
400 * have been modified without going through btrfs_set_page_dirty; 400 * have been modified without going through btrfs_set_page_dirty;
401 * clear it here 401 * clear it here
402 */ 402 */
403 ClearPageChecked(pages[i]); 403 ClearPageChecked(pages[i]);
404 unlock_page(pages[i]); 404 unlock_page(pages[i]);
405 mark_page_accessed(pages[i]); 405 mark_page_accessed(pages[i]);
406 page_cache_release(pages[i]); 406 page_cache_release(pages[i]);
407 } 407 }
408 } 408 }
409 409
410 /* 410 /*
411 * after copy_from_user, pages need to be dirtied and we need to make 411 * after copy_from_user, pages need to be dirtied and we need to make
412 * sure holes are created between the current EOF and the start of 412 * sure holes are created between the current EOF and the start of
413 * any next extents (if required). 413 * any next extents (if required).
414 * 414 *
415 * this also makes the decision about creating an inline extent vs 415 * this also makes the decision about creating an inline extent vs
416 * doing real data extents, marking pages dirty and delalloc as required. 416 * doing real data extents, marking pages dirty and delalloc as required.
417 */ 417 */
418 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, 418 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
419 struct page **pages, size_t num_pages, 419 struct page **pages, size_t num_pages,
420 loff_t pos, size_t write_bytes, 420 loff_t pos, size_t write_bytes,
421 struct extent_state **cached) 421 struct extent_state **cached)
422 { 422 {
423 int err = 0; 423 int err = 0;
424 int i; 424 int i;
425 u64 num_bytes; 425 u64 num_bytes;
426 u64 start_pos; 426 u64 start_pos;
427 u64 end_of_last_block; 427 u64 end_of_last_block;
428 u64 end_pos = pos + write_bytes; 428 u64 end_pos = pos + write_bytes;
429 loff_t isize = i_size_read(inode); 429 loff_t isize = i_size_read(inode);
430 430
431 start_pos = pos & ~((u64)root->sectorsize - 1); 431 start_pos = pos & ~((u64)root->sectorsize - 1);
432 num_bytes = (write_bytes + pos - start_pos + 432 num_bytes = (write_bytes + pos - start_pos +
433 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 433 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
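	/*
	 * Worked example: with a 4096-byte sectorsize, pos = 5000 and
	 * write_bytes = 100 give start_pos = 4096 and num_bytes = 4096,
	 * i.e. the range is widened to cover whole sectors.
	 */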
434 434
435 end_of_last_block = start_pos + num_bytes - 1; 435 end_of_last_block = start_pos + num_bytes - 1;
436 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 436 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
437 cached); 437 cached);
438 if (err) 438 if (err)
439 return err; 439 return err;
440 440
441 for (i = 0; i < num_pages; i++) { 441 for (i = 0; i < num_pages; i++) {
442 struct page *p = pages[i]; 442 struct page *p = pages[i];
443 SetPageUptodate(p); 443 SetPageUptodate(p);
444 ClearPageChecked(p); 444 ClearPageChecked(p);
445 set_page_dirty(p); 445 set_page_dirty(p);
446 } 446 }
447 447
448 /* 448 /*
449 * we've only changed i_size in ram, and we haven't updated 449 * we've only changed i_size in ram, and we haven't updated
450 * the disk i_size. There is no need to log the inode 450 * the disk i_size. There is no need to log the inode
451 * at this time. 451 * at this time.
452 */ 452 */
453 if (end_pos > isize) 453 if (end_pos > isize)
454 i_size_write(inode, end_pos); 454 i_size_write(inode, end_pos);
455 return 0; 455 return 0;
456 } 456 }
457 457
458 /* 458 /*
459 * this drops all the extents in the cache that intersect the range 459 * this drops all the extents in the cache that intersect the range
460 * [start, end]. Existing extents are split as required. 460 * [start, end]. Existing extents are split as required.
461 */ 461 */
462 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 462 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
463 int skip_pinned) 463 int skip_pinned)
464 { 464 {
465 struct extent_map *em; 465 struct extent_map *em;
466 struct extent_map *split = NULL; 466 struct extent_map *split = NULL;
467 struct extent_map *split2 = NULL; 467 struct extent_map *split2 = NULL;
468 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 468 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
469 u64 len = end - start + 1; 469 u64 len = end - start + 1;
470 u64 gen; 470 u64 gen;
471 int ret; 471 int ret;
472 int testend = 1; 472 int testend = 1;
473 unsigned long flags; 473 unsigned long flags;
474 int compressed = 0; 474 int compressed = 0;
475 475
476 WARN_ON(end < start); 476 WARN_ON(end < start);
477 if (end == (u64)-1) { 477 if (end == (u64)-1) {
478 len = (u64)-1; 478 len = (u64)-1;
479 testend = 0; 479 testend = 0;
480 } 480 }
481 while (1) { 481 while (1) {
482 int no_splits = 0; 482 int no_splits = 0;
483 483
484 if (!split) 484 if (!split)
485 split = alloc_extent_map(); 485 split = alloc_extent_map();
486 if (!split2) 486 if (!split2)
487 split2 = alloc_extent_map(); 487 split2 = alloc_extent_map();
488 if (!split || !split2) 488 if (!split || !split2)
489 no_splits = 1; 489 no_splits = 1;
490 490
491 write_lock(&em_tree->lock); 491 write_lock(&em_tree->lock);
492 em = lookup_extent_mapping(em_tree, start, len); 492 em = lookup_extent_mapping(em_tree, start, len);
493 if (!em) { 493 if (!em) {
494 write_unlock(&em_tree->lock); 494 write_unlock(&em_tree->lock);
495 break; 495 break;
496 } 496 }
497 flags = em->flags; 497 flags = em->flags;
498 gen = em->generation; 498 gen = em->generation;
499 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 499 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
500 if (testend && em->start + em->len >= start + len) { 500 if (testend && em->start + em->len >= start + len) {
501 free_extent_map(em); 501 free_extent_map(em);
502 write_unlock(&em_tree->lock); 502 write_unlock(&em_tree->lock);
503 break; 503 break;
504 } 504 }
505 start = em->start + em->len; 505 start = em->start + em->len;
506 if (testend) 506 if (testend)
507 len = start + len - (em->start + em->len); 507 len = start + len - (em->start + em->len);
508 free_extent_map(em); 508 free_extent_map(em);
509 write_unlock(&em_tree->lock); 509 write_unlock(&em_tree->lock);
510 continue; 510 continue;
511 } 511 }
512 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 512 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
513 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 513 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
514 remove_extent_mapping(em_tree, em); 514 remove_extent_mapping(em_tree, em);
515 if (no_splits) 515 if (no_splits)
516 goto next; 516 goto next;
517 517
518 if (em->block_start < EXTENT_MAP_LAST_BYTE && 518 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
519 em->start < start) { 519 em->start < start) {
520 split->start = em->start; 520 split->start = em->start;
521 split->len = start - em->start; 521 split->len = start - em->start;
522 split->orig_start = em->orig_start; 522 split->orig_start = em->orig_start;
523 split->block_start = em->block_start; 523 split->block_start = em->block_start;
524 524
525 if (compressed) 525 if (compressed)
526 split->block_len = em->block_len; 526 split->block_len = em->block_len;
527 else 527 else
528 split->block_len = split->len; 528 split->block_len = split->len;
529 split->generation = gen; 529 split->generation = gen;
530 split->bdev = em->bdev; 530 split->bdev = em->bdev;
531 split->flags = flags; 531 split->flags = flags;
532 split->compress_type = em->compress_type; 532 split->compress_type = em->compress_type;
533 ret = add_extent_mapping(em_tree, split); 533 ret = add_extent_mapping(em_tree, split);
534 BUG_ON(ret); /* Logic error */ 534 BUG_ON(ret); /* Logic error */
535 list_move(&split->list, &em_tree->modified_extents); 535 list_move(&split->list, &em_tree->modified_extents);
536 free_extent_map(split); 536 free_extent_map(split);
537 split = split2; 537 split = split2;
538 split2 = NULL; 538 split2 = NULL;
539 } 539 }
540 if (em->block_start < EXTENT_MAP_LAST_BYTE && 540 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
541 testend && em->start + em->len > start + len) { 541 testend && em->start + em->len > start + len) {
542 u64 diff = start + len - em->start; 542 u64 diff = start + len - em->start;
543 543
544 split->start = start + len; 544 split->start = start + len;
545 split->len = em->start + em->len - (start + len); 545 split->len = em->start + em->len - (start + len);
546 split->bdev = em->bdev; 546 split->bdev = em->bdev;
547 split->flags = flags; 547 split->flags = flags;
548 split->compress_type = em->compress_type; 548 split->compress_type = em->compress_type;
549 split->generation = gen; 549 split->generation = gen;
550 550
551 if (compressed) { 551 if (compressed) {
552 split->block_len = em->block_len; 552 split->block_len = em->block_len;
553 split->block_start = em->block_start; 553 split->block_start = em->block_start;
554 split->orig_start = em->orig_start; 554 split->orig_start = em->orig_start;
555 } else { 555 } else {
556 split->block_len = split->len; 556 split->block_len = split->len;
557 split->block_start = em->block_start + diff; 557 split->block_start = em->block_start + diff;
558 split->orig_start = split->start; 558 split->orig_start = split->start;
559 } 559 }
560 560
561 ret = add_extent_mapping(em_tree, split); 561 ret = add_extent_mapping(em_tree, split);
562 BUG_ON(ret); /* Logic error */ 562 BUG_ON(ret); /* Logic error */
563 list_move(&split->list, &em_tree->modified_extents); 563 list_move(&split->list, &em_tree->modified_extents);
564 free_extent_map(split); 564 free_extent_map(split);
565 split = NULL; 565 split = NULL;
566 } 566 }
567 next: 567 next:
568 write_unlock(&em_tree->lock); 568 write_unlock(&em_tree->lock);
569 569
570 /* once for us */ 570 /* once for us */
571 free_extent_map(em); 571 free_extent_map(em);
572 /* once for the tree */ 572 /* once for the tree */
573 free_extent_map(em); 573 free_extent_map(em);
574 } 574 }
575 if (split) 575 if (split)
576 free_extent_map(split); 576 free_extent_map(split);
577 if (split2) 577 if (split2)
578 free_extent_map(split2); 578 free_extent_map(split2);
579 } 579 }
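/*
 * Pictorially, dropping [start, end] out of the middle of a cached
 * mapping leaves two pieces ('split' in front, 'split2' behind):
 *
 *    before:  | --------------- em --------------- |
 *    after:   | split |      (dropped)     | split2 |
 */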
580 580
581 /* 581 /*
582 * this is very complex, but the basic idea is to drop all extents 582 * this is very complex, but the basic idea is to drop all extents
583 * in the range start - end. If drop_end is non-NULL it is set to 583 * in the range start - end. If drop_end is non-NULL it is set to
584 * the offset where dropping actually stopped. 584 * the offset where dropping actually stopped.
585 * 585 *
586 * If an extent intersects the range but is not entirely inside the range 586 * If an extent intersects the range but is not entirely inside the range
587 * it is either truncated or split. Anything entirely inside the range 587 * it is either truncated or split. Anything entirely inside the range
588 * is deleted from the tree. 588 * is deleted from the tree.
589 */ 589 */
590 int __btrfs_drop_extents(struct btrfs_trans_handle *trans, 590 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
591 struct btrfs_root *root, struct inode *inode, 591 struct btrfs_root *root, struct inode *inode,
592 struct btrfs_path *path, u64 start, u64 end, 592 struct btrfs_path *path, u64 start, u64 end,
593 u64 *drop_end, int drop_cache) 593 u64 *drop_end, int drop_cache)
594 { 594 {
595 struct extent_buffer *leaf; 595 struct extent_buffer *leaf;
596 struct btrfs_file_extent_item *fi; 596 struct btrfs_file_extent_item *fi;
597 struct btrfs_key key; 597 struct btrfs_key key;
598 struct btrfs_key new_key; 598 struct btrfs_key new_key;
599 u64 ino = btrfs_ino(inode); 599 u64 ino = btrfs_ino(inode);
600 u64 search_start = start; 600 u64 search_start = start;
601 u64 disk_bytenr = 0; 601 u64 disk_bytenr = 0;
602 u64 num_bytes = 0; 602 u64 num_bytes = 0;
603 u64 extent_offset = 0; 603 u64 extent_offset = 0;
604 u64 extent_end = 0; 604 u64 extent_end = 0;
605 int del_nr = 0; 605 int del_nr = 0;
606 int del_slot = 0; 606 int del_slot = 0;
607 int extent_type; 607 int extent_type;
608 int recow; 608 int recow;
609 int ret; 609 int ret;
610 int modify_tree = -1; 610 int modify_tree = -1;
611 int update_refs = (root->ref_cows || root == root->fs_info->tree_root); 611 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
612 int found = 0; 612 int found = 0;
613 613
614 if (drop_cache) 614 if (drop_cache)
615 btrfs_drop_extent_cache(inode, start, end - 1, 0); 615 btrfs_drop_extent_cache(inode, start, end - 1, 0);
616 616
617 if (start >= BTRFS_I(inode)->disk_i_size) 617 if (start >= BTRFS_I(inode)->disk_i_size)
618 modify_tree = 0; 618 modify_tree = 0;
619 619
620 while (1) { 620 while (1) {
621 recow = 0; 621 recow = 0;
622 ret = btrfs_lookup_file_extent(trans, root, path, ino, 622 ret = btrfs_lookup_file_extent(trans, root, path, ino,
623 search_start, modify_tree); 623 search_start, modify_tree);
624 if (ret < 0) 624 if (ret < 0)
625 break; 625 break;
626 if (ret > 0 && path->slots[0] > 0 && search_start == start) { 626 if (ret > 0 && path->slots[0] > 0 && search_start == start) {
627 leaf = path->nodes[0]; 627 leaf = path->nodes[0];
628 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); 628 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
629 if (key.objectid == ino && 629 if (key.objectid == ino &&
630 key.type == BTRFS_EXTENT_DATA_KEY) 630 key.type == BTRFS_EXTENT_DATA_KEY)
631 path->slots[0]--; 631 path->slots[0]--;
632 } 632 }
633 ret = 0; 633 ret = 0;
634 next_slot: 634 next_slot:
635 leaf = path->nodes[0]; 635 leaf = path->nodes[0];
636 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 636 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
637 BUG_ON(del_nr > 0); 637 BUG_ON(del_nr > 0);
638 ret = btrfs_next_leaf(root, path); 638 ret = btrfs_next_leaf(root, path);
639 if (ret < 0) 639 if (ret < 0)
640 break; 640 break;
641 if (ret > 0) { 641 if (ret > 0) {
642 ret = 0; 642 ret = 0;
643 break; 643 break;
644 } 644 }
645 leaf = path->nodes[0]; 645 leaf = path->nodes[0];
646 recow = 1; 646 recow = 1;
647 } 647 }
648 648
649 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 649 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
650 if (key.objectid > ino || 650 if (key.objectid > ino ||
651 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) 651 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
652 break; 652 break;
653 653
654 fi = btrfs_item_ptr(leaf, path->slots[0], 654 fi = btrfs_item_ptr(leaf, path->slots[0],
655 struct btrfs_file_extent_item); 655 struct btrfs_file_extent_item);
656 extent_type = btrfs_file_extent_type(leaf, fi); 656 extent_type = btrfs_file_extent_type(leaf, fi);
657 657
658 if (extent_type == BTRFS_FILE_EXTENT_REG || 658 if (extent_type == BTRFS_FILE_EXTENT_REG ||
659 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 659 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
660 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 660 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
661 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 661 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
662 extent_offset = btrfs_file_extent_offset(leaf, fi); 662 extent_offset = btrfs_file_extent_offset(leaf, fi);
663 extent_end = key.offset + 663 extent_end = key.offset +
664 btrfs_file_extent_num_bytes(leaf, fi); 664 btrfs_file_extent_num_bytes(leaf, fi);
665 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 665 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
666 extent_end = key.offset + 666 extent_end = key.offset +
667 btrfs_file_extent_inline_len(leaf, fi); 667 btrfs_file_extent_inline_len(leaf, fi);
668 } else { 668 } else {
669 WARN_ON(1); 669 WARN_ON(1);
670 extent_end = search_start; 670 extent_end = search_start;
671 } 671 }
672 672
673 if (extent_end <= search_start) { 673 if (extent_end <= search_start) {
674 path->slots[0]++; 674 path->slots[0]++;
675 goto next_slot; 675 goto next_slot;
676 } 676 }
677 677
678 found = 1; 678 found = 1;
679 search_start = max(key.offset, start); 679 search_start = max(key.offset, start);
680 if (recow || !modify_tree) { 680 if (recow || !modify_tree) {
681 modify_tree = -1; 681 modify_tree = -1;
682 btrfs_release_path(path); 682 btrfs_release_path(path);
683 continue; 683 continue;
684 } 684 }
685 685
686 /* 686 /*
687 * | - range to drop - | 687 * | - range to drop - |
688 * | -------- extent -------- | 688 * | -------- extent -------- |
689 */ 689 */
690 if (start > key.offset && end < extent_end) { 690 if (start > key.offset && end < extent_end) {
691 BUG_ON(del_nr > 0); 691 BUG_ON(del_nr > 0);
692 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 692 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
693 693
694 memcpy(&new_key, &key, sizeof(new_key)); 694 memcpy(&new_key, &key, sizeof(new_key));
695 new_key.offset = start; 695 new_key.offset = start;
696 ret = btrfs_duplicate_item(trans, root, path, 696 ret = btrfs_duplicate_item(trans, root, path,
697 &new_key); 697 &new_key);
698 if (ret == -EAGAIN) { 698 if (ret == -EAGAIN) {
699 btrfs_release_path(path); 699 btrfs_release_path(path);
700 continue; 700 continue;
701 } 701 }
702 if (ret < 0) 702 if (ret < 0)
703 break; 703 break;
704 704
705 leaf = path->nodes[0]; 705 leaf = path->nodes[0];
706 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 706 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
707 struct btrfs_file_extent_item); 707 struct btrfs_file_extent_item);
708 btrfs_set_file_extent_num_bytes(leaf, fi, 708 btrfs_set_file_extent_num_bytes(leaf, fi,
709 start - key.offset); 709 start - key.offset);
710 710
711 fi = btrfs_item_ptr(leaf, path->slots[0], 711 fi = btrfs_item_ptr(leaf, path->slots[0],
712 struct btrfs_file_extent_item); 712 struct btrfs_file_extent_item);
713 713
714 extent_offset += start - key.offset; 714 extent_offset += start - key.offset;
715 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 715 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
716 btrfs_set_file_extent_num_bytes(leaf, fi, 716 btrfs_set_file_extent_num_bytes(leaf, fi,
717 extent_end - start); 717 extent_end - start);
718 btrfs_mark_buffer_dirty(leaf); 718 btrfs_mark_buffer_dirty(leaf);
719 719
720 if (update_refs && disk_bytenr > 0) { 720 if (update_refs && disk_bytenr > 0) {
721 ret = btrfs_inc_extent_ref(trans, root, 721 ret = btrfs_inc_extent_ref(trans, root,
722 disk_bytenr, num_bytes, 0, 722 disk_bytenr, num_bytes, 0,
723 root->root_key.objectid, 723 root->root_key.objectid,
724 new_key.objectid, 724 new_key.objectid,
725 start - extent_offset, 0); 725 start - extent_offset, 0);
726 BUG_ON(ret); /* -ENOMEM */ 726 BUG_ON(ret); /* -ENOMEM */
727 } 727 }
728 key.offset = start; 728 key.offset = start;
729 } 729 }
730 /* 730 /*
731 * | ---- range to drop ----- | 731 * | ---- range to drop ----- |
732 * | -------- extent -------- | 732 * | -------- extent -------- |
733 */ 733 */
734 if (start <= key.offset && end < extent_end) { 734 if (start <= key.offset && end < extent_end) {
735 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 735 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
736 736
737 memcpy(&new_key, &key, sizeof(new_key)); 737 memcpy(&new_key, &key, sizeof(new_key));
738 new_key.offset = end; 738 new_key.offset = end;
739 btrfs_set_item_key_safe(trans, root, path, &new_key); 739 btrfs_set_item_key_safe(trans, root, path, &new_key);
740 740
741 extent_offset += end - key.offset; 741 extent_offset += end - key.offset;
742 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 742 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
743 btrfs_set_file_extent_num_bytes(leaf, fi, 743 btrfs_set_file_extent_num_bytes(leaf, fi,
744 extent_end - end); 744 extent_end - end);
745 btrfs_mark_buffer_dirty(leaf); 745 btrfs_mark_buffer_dirty(leaf);
746 if (update_refs && disk_bytenr > 0) 746 if (update_refs && disk_bytenr > 0)
747 inode_sub_bytes(inode, end - key.offset); 747 inode_sub_bytes(inode, end - key.offset);
748 break; 748 break;
749 } 749 }
750 750
751 search_start = extent_end; 751 search_start = extent_end;
752 /* 752 /*
753 * | ---- range to drop ----- | 753 * | ---- range to drop ----- |
754 * | -------- extent -------- | 754 * | -------- extent -------- |
755 */ 755 */
756 if (start > key.offset && end >= extent_end) { 756 if (start > key.offset && end >= extent_end) {
757 BUG_ON(del_nr > 0); 757 BUG_ON(del_nr > 0);
758 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 758 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
759 759
760 btrfs_set_file_extent_num_bytes(leaf, fi, 760 btrfs_set_file_extent_num_bytes(leaf, fi,
761 start - key.offset); 761 start - key.offset);
762 btrfs_mark_buffer_dirty(leaf); 762 btrfs_mark_buffer_dirty(leaf);
763 if (update_refs && disk_bytenr > 0) 763 if (update_refs && disk_bytenr > 0)
764 inode_sub_bytes(inode, extent_end - start); 764 inode_sub_bytes(inode, extent_end - start);
765 if (end == extent_end) 765 if (end == extent_end)
766 break; 766 break;
767 767
768 path->slots[0]++; 768 path->slots[0]++;
769 goto next_slot; 769 goto next_slot;
770 } 770 }
771 771
772 /* 772 /*
773 * | ---- range to drop ----- | 773 * | ---- range to drop ----- |
774 * | ------ extent ------ | 774 * | ------ extent ------ |
775 */ 775 */
776 if (start <= key.offset && end >= extent_end) { 776 if (start <= key.offset && end >= extent_end) {
777 if (del_nr == 0) { 777 if (del_nr == 0) {
778 del_slot = path->slots[0]; 778 del_slot = path->slots[0];
779 del_nr = 1; 779 del_nr = 1;
780 } else { 780 } else {
781 BUG_ON(del_slot + del_nr != path->slots[0]); 781 BUG_ON(del_slot + del_nr != path->slots[0]);
782 del_nr++; 782 del_nr++;
783 } 783 }
784 784
785 if (update_refs && 785 if (update_refs &&
786 extent_type == BTRFS_FILE_EXTENT_INLINE) { 786 extent_type == BTRFS_FILE_EXTENT_INLINE) {
787 inode_sub_bytes(inode, 787 inode_sub_bytes(inode,
788 extent_end - key.offset); 788 extent_end - key.offset);
789 extent_end = ALIGN(extent_end, 789 extent_end = ALIGN(extent_end,
790 root->sectorsize); 790 root->sectorsize);
791 } else if (update_refs && disk_bytenr > 0) { 791 } else if (update_refs && disk_bytenr > 0) {
792 ret = btrfs_free_extent(trans, root, 792 ret = btrfs_free_extent(trans, root,
793 disk_bytenr, num_bytes, 0, 793 disk_bytenr, num_bytes, 0,
794 root->root_key.objectid, 794 root->root_key.objectid,
795 key.objectid, key.offset - 795 key.objectid, key.offset -
796 extent_offset, 0); 796 extent_offset, 0);
797 BUG_ON(ret); /* -ENOMEM */ 797 BUG_ON(ret); /* -ENOMEM */
798 inode_sub_bytes(inode, 798 inode_sub_bytes(inode,
799 extent_end - key.offset); 799 extent_end - key.offset);
800 } 800 }
801 801
802 if (end == extent_end) 802 if (end == extent_end)
803 break; 803 break;
804 804
805 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { 805 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
806 path->slots[0]++; 806 path->slots[0]++;
807 goto next_slot; 807 goto next_slot;
808 } 808 }
809 809
810 ret = btrfs_del_items(trans, root, path, del_slot, 810 ret = btrfs_del_items(trans, root, path, del_slot,
811 del_nr); 811 del_nr);
812 if (ret) { 812 if (ret) {
813 btrfs_abort_transaction(trans, root, ret); 813 btrfs_abort_transaction(trans, root, ret);
814 break; 814 break;
815 } 815 }
816 816
817 del_nr = 0; 817 del_nr = 0;
818 del_slot = 0; 818 del_slot = 0;
819 819
820 btrfs_release_path(path); 820 btrfs_release_path(path);
821 continue; 821 continue;
822 } 822 }
823 823
824 BUG_ON(1); 824 BUG_ON(1);
825 } 825 }
826 826
827 if (!ret && del_nr > 0) { 827 if (!ret && del_nr > 0) {
828 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 828 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
829 if (ret) 829 if (ret)
830 btrfs_abort_transaction(trans, root, ret); 830 btrfs_abort_transaction(trans, root, ret);
831 } 831 }
832 832
833 if (drop_end) 833 if (drop_end)
834 *drop_end = found ? min(end, extent_end) : end; 834 *drop_end = found ? min(end, extent_end) : end;
835 btrfs_release_path(path); 835 btrfs_release_path(path);
836 return ret; 836 return ret;
837 } 837 }
838 838
839 int btrfs_drop_extents(struct btrfs_trans_handle *trans, 839 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
840 struct btrfs_root *root, struct inode *inode, u64 start, 840 struct btrfs_root *root, struct inode *inode, u64 start,
841 u64 end, int drop_cache) 841 u64 end, int drop_cache)
842 { 842 {
843 struct btrfs_path *path; 843 struct btrfs_path *path;
844 int ret; 844 int ret;
845 845
846 path = btrfs_alloc_path(); 846 path = btrfs_alloc_path();
847 if (!path) 847 if (!path)
848 return -ENOMEM; 848 return -ENOMEM;
849 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, 849 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
850 drop_cache); 850 drop_cache);
851 btrfs_free_path(path); 851 btrfs_free_path(path);
852 return ret; 852 return ret;
853 } 853 }
854 854
855 static int extent_mergeable(struct extent_buffer *leaf, int slot, 855 static int extent_mergeable(struct extent_buffer *leaf, int slot,
856 u64 objectid, u64 bytenr, u64 orig_offset, 856 u64 objectid, u64 bytenr, u64 orig_offset,
857 u64 *start, u64 *end) 857 u64 *start, u64 *end)
858 { 858 {
859 struct btrfs_file_extent_item *fi; 859 struct btrfs_file_extent_item *fi;
860 struct btrfs_key key; 860 struct btrfs_key key;
861 u64 extent_end; 861 u64 extent_end;
862 862
863 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 863 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
864 return 0; 864 return 0;
865 865
866 btrfs_item_key_to_cpu(leaf, &key, slot); 866 btrfs_item_key_to_cpu(leaf, &key, slot);
867 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) 867 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
868 return 0; 868 return 0;
869 869
870 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 870 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
871 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || 871 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
872 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || 872 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
873 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || 873 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
874 btrfs_file_extent_compression(leaf, fi) || 874 btrfs_file_extent_compression(leaf, fi) ||
875 btrfs_file_extent_encryption(leaf, fi) || 875 btrfs_file_extent_encryption(leaf, fi) ||
876 btrfs_file_extent_other_encoding(leaf, fi)) 876 btrfs_file_extent_other_encoding(leaf, fi))
877 return 0; 877 return 0;
878 878
879 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 879 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
880 if ((*start && *start != key.offset) || (*end && *end != extent_end)) 880 if ((*start && *start != key.offset) || (*end && *end != extent_end))
881 return 0; 881 return 0;
882 882
883 *start = key.offset; 883 *start = key.offset;
884 *end = extent_end; 884 *end = extent_end;
885 return 1; 885 return 1;
886 } 886 }
887 887
888 /* 888 /*
889 * Mark extent in the range start - end as written. 889 * Mark extent in the range start - end as written.
890 * 890 *
891 * This changes extent type from 'pre-allocated' to 'regular'. If only 891 * This changes extent type from 'pre-allocated' to 'regular'. If only
892 * part of extent is marked as written, the extent will be split into 892 * part of extent is marked as written, the extent will be split into
893 * two or three. 893 * two or three.
894 */ 894 */
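/*
 * Pictorially, writing the middle of a preallocated extent splits it
 * into three pieces:
 *
 *    before:  | ------------- prealloc ------------- |
 *    after:   | prealloc |     written     | prealloc |
 */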
895 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 895 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
896 struct inode *inode, u64 start, u64 end) 896 struct inode *inode, u64 start, u64 end)
897 { 897 {
898 struct btrfs_root *root = BTRFS_I(inode)->root; 898 struct btrfs_root *root = BTRFS_I(inode)->root;
899 struct extent_buffer *leaf; 899 struct extent_buffer *leaf;
900 struct btrfs_path *path; 900 struct btrfs_path *path;
901 struct btrfs_file_extent_item *fi; 901 struct btrfs_file_extent_item *fi;
902 struct btrfs_key key; 902 struct btrfs_key key;
903 struct btrfs_key new_key; 903 struct btrfs_key new_key;
904 u64 bytenr; 904 u64 bytenr;
905 u64 num_bytes; 905 u64 num_bytes;
906 u64 extent_end; 906 u64 extent_end;
907 u64 orig_offset; 907 u64 orig_offset;
908 u64 other_start; 908 u64 other_start;
909 u64 other_end; 909 u64 other_end;
910 u64 split; 910 u64 split;
911 int del_nr = 0; 911 int del_nr = 0;
912 int del_slot = 0; 912 int del_slot = 0;
913 int recow; 913 int recow;
914 int ret; 914 int ret;
915 u64 ino = btrfs_ino(inode); 915 u64 ino = btrfs_ino(inode);
916 916
917 path = btrfs_alloc_path(); 917 path = btrfs_alloc_path();
918 if (!path) 918 if (!path)
919 return -ENOMEM; 919 return -ENOMEM;
920 again: 920 again:
921 recow = 0; 921 recow = 0;
922 split = start; 922 split = start;
923 key.objectid = ino; 923 key.objectid = ino;
924 key.type = BTRFS_EXTENT_DATA_KEY; 924 key.type = BTRFS_EXTENT_DATA_KEY;
925 key.offset = split; 925 key.offset = split;
926 926
927 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 927 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
928 if (ret < 0) 928 if (ret < 0)
929 goto out; 929 goto out;
930 if (ret > 0 && path->slots[0] > 0) 930 if (ret > 0 && path->slots[0] > 0)
931 path->slots[0]--; 931 path->slots[0]--;
932 932
933 leaf = path->nodes[0]; 933 leaf = path->nodes[0];
934 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 934 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
935 BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY); 935 BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
936 fi = btrfs_item_ptr(leaf, path->slots[0], 936 fi = btrfs_item_ptr(leaf, path->slots[0],
937 struct btrfs_file_extent_item); 937 struct btrfs_file_extent_item);
938 BUG_ON(btrfs_file_extent_type(leaf, fi) != 938 BUG_ON(btrfs_file_extent_type(leaf, fi) !=
939 BTRFS_FILE_EXTENT_PREALLOC); 939 BTRFS_FILE_EXTENT_PREALLOC);
940 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 940 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
941 BUG_ON(key.offset > start || extent_end < end); 941 BUG_ON(key.offset > start || extent_end < end);
942 942
943 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 943 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
944 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 944 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
945 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); 945 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
946 memcpy(&new_key, &key, sizeof(new_key)); 946 memcpy(&new_key, &key, sizeof(new_key));
947 947
948 if (start == key.offset && end < extent_end) { 948 if (start == key.offset && end < extent_end) {
949 other_start = 0; 949 other_start = 0;
950 other_end = start; 950 other_end = start;
951 if (extent_mergeable(leaf, path->slots[0] - 1, 951 if (extent_mergeable(leaf, path->slots[0] - 1,
952 ino, bytenr, orig_offset, 952 ino, bytenr, orig_offset,
953 &other_start, &other_end)) { 953 &other_start, &other_end)) {
954 new_key.offset = end; 954 new_key.offset = end;
955 btrfs_set_item_key_safe(trans, root, path, &new_key); 955 btrfs_set_item_key_safe(trans, root, path, &new_key);
956 fi = btrfs_item_ptr(leaf, path->slots[0], 956 fi = btrfs_item_ptr(leaf, path->slots[0],
957 struct btrfs_file_extent_item); 957 struct btrfs_file_extent_item);
958 btrfs_set_file_extent_generation(leaf, fi, 958 btrfs_set_file_extent_generation(leaf, fi,
959 trans->transid); 959 trans->transid);
960 btrfs_set_file_extent_num_bytes(leaf, fi, 960 btrfs_set_file_extent_num_bytes(leaf, fi,
961 extent_end - end); 961 extent_end - end);
962 btrfs_set_file_extent_offset(leaf, fi, 962 btrfs_set_file_extent_offset(leaf, fi,
963 end - orig_offset); 963 end - orig_offset);
964 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 964 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
965 struct btrfs_file_extent_item); 965 struct btrfs_file_extent_item);
966 btrfs_set_file_extent_generation(leaf, fi, 966 btrfs_set_file_extent_generation(leaf, fi,
967 trans->transid); 967 trans->transid);
968 btrfs_set_file_extent_num_bytes(leaf, fi, 968 btrfs_set_file_extent_num_bytes(leaf, fi,
969 end - other_start); 969 end - other_start);
970 btrfs_mark_buffer_dirty(leaf); 970 btrfs_mark_buffer_dirty(leaf);
971 goto out; 971 goto out;
972 } 972 }
973 } 973 }
974 974
975 if (start > key.offset && end == extent_end) { 975 if (start > key.offset && end == extent_end) {
976 other_start = end; 976 other_start = end;
977 other_end = 0; 977 other_end = 0;
978 if (extent_mergeable(leaf, path->slots[0] + 1, 978 if (extent_mergeable(leaf, path->slots[0] + 1,
979 ino, bytenr, orig_offset, 979 ino, bytenr, orig_offset,
980 &other_start, &other_end)) { 980 &other_start, &other_end)) {
981 fi = btrfs_item_ptr(leaf, path->slots[0], 981 fi = btrfs_item_ptr(leaf, path->slots[0],
982 struct btrfs_file_extent_item); 982 struct btrfs_file_extent_item);
983 btrfs_set_file_extent_num_bytes(leaf, fi, 983 btrfs_set_file_extent_num_bytes(leaf, fi,
984 start - key.offset); 984 start - key.offset);
985 btrfs_set_file_extent_generation(leaf, fi, 985 btrfs_set_file_extent_generation(leaf, fi,
986 trans->transid); 986 trans->transid);
987 path->slots[0]++; 987 path->slots[0]++;
988 new_key.offset = start; 988 new_key.offset = start;
989 btrfs_set_item_key_safe(trans, root, path, &new_key); 989 btrfs_set_item_key_safe(trans, root, path, &new_key);
990 990
991 fi = btrfs_item_ptr(leaf, path->slots[0], 991 fi = btrfs_item_ptr(leaf, path->slots[0],
992 struct btrfs_file_extent_item); 992 struct btrfs_file_extent_item);
993 btrfs_set_file_extent_generation(leaf, fi, 993 btrfs_set_file_extent_generation(leaf, fi,
994 trans->transid); 994 trans->transid);
995 btrfs_set_file_extent_num_bytes(leaf, fi, 995 btrfs_set_file_extent_num_bytes(leaf, fi,
996 other_end - start); 996 other_end - start);
997 btrfs_set_file_extent_offset(leaf, fi, 997 btrfs_set_file_extent_offset(leaf, fi,
998 start - orig_offset); 998 start - orig_offset);
999 btrfs_mark_buffer_dirty(leaf); 999 btrfs_mark_buffer_dirty(leaf);
1000 goto out; 1000 goto out;
1001 } 1001 }
1002 } 1002 }
1003 1003
1004 while (start > key.offset || end < extent_end) { 1004 while (start > key.offset || end < extent_end) {
1005 if (key.offset == start) 1005 if (key.offset == start)
1006 split = end; 1006 split = end;
1007 1007
1008 new_key.offset = split; 1008 new_key.offset = split;
1009 ret = btrfs_duplicate_item(trans, root, path, &new_key); 1009 ret = btrfs_duplicate_item(trans, root, path, &new_key);
1010 if (ret == -EAGAIN) { 1010 if (ret == -EAGAIN) {
1011 btrfs_release_path(path); 1011 btrfs_release_path(path);
1012 goto again; 1012 goto again;
1013 } 1013 }
1014 if (ret < 0) { 1014 if (ret < 0) {
1015 btrfs_abort_transaction(trans, root, ret); 1015 btrfs_abort_transaction(trans, root, ret);
1016 goto out; 1016 goto out;
1017 } 1017 }
1018 1018
1019 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
1020 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1020 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1021 struct btrfs_file_extent_item); 1021 struct btrfs_file_extent_item);
1022 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1022 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1023 btrfs_set_file_extent_num_bytes(leaf, fi, 1023 btrfs_set_file_extent_num_bytes(leaf, fi,
1024 split - key.offset); 1024 split - key.offset);
1025 1025
1026 fi = btrfs_item_ptr(leaf, path->slots[0], 1026 fi = btrfs_item_ptr(leaf, path->slots[0],
1027 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
1028 1028
1029 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1029 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1030 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 1030 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1031 btrfs_set_file_extent_num_bytes(leaf, fi, 1031 btrfs_set_file_extent_num_bytes(leaf, fi,
1032 extent_end - split); 1032 extent_end - split);
1033 btrfs_mark_buffer_dirty(leaf); 1033 btrfs_mark_buffer_dirty(leaf);
1034 1034
1035 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 1035 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1036 root->root_key.objectid, 1036 root->root_key.objectid,
1037 ino, orig_offset, 0); 1037 ino, orig_offset, 0);
1038 BUG_ON(ret); /* -ENOMEM */ 1038 BUG_ON(ret); /* -ENOMEM */
1039 1039
1040 if (split == start) { 1040 if (split == start) {
1041 key.offset = start; 1041 key.offset = start;
1042 } else { 1042 } else {
1043 BUG_ON(start != key.offset); 1043 BUG_ON(start != key.offset);
1044 path->slots[0]--; 1044 path->slots[0]--;
1045 extent_end = end; 1045 extent_end = end;
1046 } 1046 }
1047 recow = 1; 1047 recow = 1;
1048 } 1048 }
1049 1049
1050 other_start = end; 1050 other_start = end;
1051 other_end = 0; 1051 other_end = 0;
1052 if (extent_mergeable(leaf, path->slots[0] + 1, 1052 if (extent_mergeable(leaf, path->slots[0] + 1,
1053 ino, bytenr, orig_offset, 1053 ino, bytenr, orig_offset,
1054 &other_start, &other_end)) { 1054 &other_start, &other_end)) {
1055 if (recow) { 1055 if (recow) {
1056 btrfs_release_path(path); 1056 btrfs_release_path(path);
1057 goto again; 1057 goto again;
1058 } 1058 }
1059 extent_end = other_end; 1059 extent_end = other_end;
1060 del_slot = path->slots[0] + 1; 1060 del_slot = path->slots[0] + 1;
1061 del_nr++; 1061 del_nr++;
1062 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1062 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1063 0, root->root_key.objectid, 1063 0, root->root_key.objectid,
1064 ino, orig_offset, 0); 1064 ino, orig_offset, 0);
1065 BUG_ON(ret); /* -ENOMEM */ 1065 BUG_ON(ret); /* -ENOMEM */
1066 } 1066 }
1067 other_start = 0; 1067 other_start = 0;
1068 other_end = start; 1068 other_end = start;
1069 if (extent_mergeable(leaf, path->slots[0] - 1, 1069 if (extent_mergeable(leaf, path->slots[0] - 1,
1070 ino, bytenr, orig_offset, 1070 ino, bytenr, orig_offset,
1071 &other_start, &other_end)) { 1071 &other_start, &other_end)) {
1072 if (recow) { 1072 if (recow) {
1073 btrfs_release_path(path); 1073 btrfs_release_path(path);
1074 goto again; 1074 goto again;
1075 } 1075 }
1076 key.offset = other_start; 1076 key.offset = other_start;
1077 del_slot = path->slots[0]; 1077 del_slot = path->slots[0];
1078 del_nr++; 1078 del_nr++;
1079 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1079 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1080 0, root->root_key.objectid, 1080 0, root->root_key.objectid,
1081 ino, orig_offset, 0); 1081 ino, orig_offset, 0);
1082 BUG_ON(ret); /* -ENOMEM */ 1082 BUG_ON(ret); /* -ENOMEM */
1083 } 1083 }
1084 if (del_nr == 0) { 1084 if (del_nr == 0) {
1085 fi = btrfs_item_ptr(leaf, path->slots[0], 1085 fi = btrfs_item_ptr(leaf, path->slots[0],
1086 struct btrfs_file_extent_item); 1086 struct btrfs_file_extent_item);
1087 btrfs_set_file_extent_type(leaf, fi, 1087 btrfs_set_file_extent_type(leaf, fi,
1088 BTRFS_FILE_EXTENT_REG); 1088 BTRFS_FILE_EXTENT_REG);
1089 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1089 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1090 btrfs_mark_buffer_dirty(leaf); 1090 btrfs_mark_buffer_dirty(leaf);
1091 } else { 1091 } else {
1092 fi = btrfs_item_ptr(leaf, del_slot - 1, 1092 fi = btrfs_item_ptr(leaf, del_slot - 1,
1093 struct btrfs_file_extent_item); 1093 struct btrfs_file_extent_item);
1094 btrfs_set_file_extent_type(leaf, fi, 1094 btrfs_set_file_extent_type(leaf, fi,
1095 BTRFS_FILE_EXTENT_REG); 1095 BTRFS_FILE_EXTENT_REG);
1096 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1096 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1097 btrfs_set_file_extent_num_bytes(leaf, fi, 1097 btrfs_set_file_extent_num_bytes(leaf, fi,
1098 extent_end - key.offset); 1098 extent_end - key.offset);
1099 btrfs_mark_buffer_dirty(leaf); 1099 btrfs_mark_buffer_dirty(leaf);
1100 1100
1101 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 1101 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1102 if (ret < 0) { 1102 if (ret < 0) {
1103 btrfs_abort_transaction(trans, root, ret); 1103 btrfs_abort_transaction(trans, root, ret);
1104 goto out; 1104 goto out;
1105 } 1105 }
1106 } 1106 }
1107 out: 1107 out:
1108 btrfs_free_path(path); 1108 btrfs_free_path(path);
1109 return 0; 1109 return 0;
1110 } 1110 }
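
The split arithmetic above is easier to follow with concrete numbers. A minimal userspace sketch of the same calculations (all values are made up for illustration, not taken from the kernel):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical extent [key_offset, extent_end) backed at orig_offset */
	uint64_t key_offset = 4096, extent_end = 16384, orig_offset = 4096;
	uint64_t split = 8192;	/* where btrfs_duplicate_item() splits it */

	/* left half keeps its key; only its length shrinks */
	printf("left : num_bytes=%llu\n",
	       (unsigned long long)(split - key_offset));
	/* right half starts at split; its offset into the backing extent
	 * grows by the same amount, so the data it maps is unchanged */
	printf("right: offset=%llu num_bytes=%llu\n",
	       (unsigned long long)(split - orig_offset),
	       (unsigned long long)(extent_end - split));
	return 0;
}
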
1111 1111
1112 /* 1112 /*
1113 * on error we return an unlocked page and the error value 1113 * on error we return an unlocked page and the error value
1114 * on success we return a locked page and 0 1114 * on success we return a locked page and 0
1115 */ 1115 */
1116 static int prepare_uptodate_page(struct page *page, u64 pos, 1116 static int prepare_uptodate_page(struct page *page, u64 pos,
1117 bool force_uptodate) 1117 bool force_uptodate)
1118 { 1118 {
1119 int ret = 0; 1119 int ret = 0;
1120 1120
1121 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && 1121 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1122 !PageUptodate(page)) { 1122 !PageUptodate(page)) {
1123 ret = btrfs_readpage(NULL, page); 1123 ret = btrfs_readpage(NULL, page);
1124 if (ret) 1124 if (ret)
1125 return ret; 1125 return ret;
1126 lock_page(page); 1126 lock_page(page);
1127 if (!PageUptodate(page)) { 1127 if (!PageUptodate(page)) {
1128 unlock_page(page); 1128 unlock_page(page);
1129 return -EIO; 1129 return -EIO;
1130 } 1130 }
1131 } 1131 }
1132 return 0; 1132 return 0;
1133 } 1133 }
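
The page-alignment check above decides whether a read-before-write is needed. A tiny standalone sketch of just that predicate (PAGE_SIZE here stands in for PAGE_CACHE_SIZE):

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL	/* stand-in for PAGE_CACHE_SIZE */

/* mirrors the check above: a read is only needed when the write does
 * not start on a page boundary (or when the caller forces it) */
static bool needs_readpage(uint64_t pos, bool force_uptodate)
{
	return (pos & (PAGE_SIZE - 1)) || force_uptodate;
}

int main(void)
{
	printf("%d %d %d\n",
	       needs_readpage(0, false),	/* aligned: no read needed */
	       needs_readpage(100, false),	/* mid-page: must read first */
	       needs_readpage(0, true));	/* forced by the caller */
	return 0;
}
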
1134 1134
1135 /* 1135 /*
1136 * this gets pages into the page cache and locks them down; it also properly 1136 * this gets pages into the page cache and locks them down; it also properly
1137 * waits for data=ordered extents to finish before allowing the pages to be 1137 * waits for data=ordered extents to finish before allowing the pages to be
1138 * modified. 1138 * modified.
1139 */ 1139 */
1140 static noinline int prepare_pages(struct btrfs_root *root, struct file *file, 1140 static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1141 struct page **pages, size_t num_pages, 1141 struct page **pages, size_t num_pages,
1142 loff_t pos, unsigned long first_index, 1142 loff_t pos, unsigned long first_index,
1143 size_t write_bytes, bool force_uptodate) 1143 size_t write_bytes, bool force_uptodate)
1144 { 1144 {
1145 struct extent_state *cached_state = NULL; 1145 struct extent_state *cached_state = NULL;
1146 int i; 1146 int i;
1147 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1147 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1148 struct inode *inode = fdentry(file)->d_inode; 1148 struct inode *inode = fdentry(file)->d_inode;
1149 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 1149 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1150 int err = 0; 1150 int err = 0;
1151 int faili = 0; 1151 int faili = 0;
1152 u64 start_pos; 1152 u64 start_pos;
1153 u64 last_pos; 1153 u64 last_pos;
1154 1154
1155 start_pos = pos & ~((u64)root->sectorsize - 1); 1155 start_pos = pos & ~((u64)root->sectorsize - 1);
1156 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 1156 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
1157 1157
1158 again: 1158 again:
1159 for (i = 0; i < num_pages; i++) { 1159 for (i = 0; i < num_pages; i++) {
1160 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1160 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1161 mask | __GFP_WRITE); 1161 mask | __GFP_WRITE);
1162 if (!pages[i]) { 1162 if (!pages[i]) {
1163 faili = i - 1; 1163 faili = i - 1;
1164 err = -ENOMEM; 1164 err = -ENOMEM;
1165 goto fail; 1165 goto fail;
1166 } 1166 }
1167 1167
1168 if (i == 0) 1168 if (i == 0)
1169 err = prepare_uptodate_page(pages[i], pos, 1169 err = prepare_uptodate_page(pages[i], pos,
1170 force_uptodate); 1170 force_uptodate);
1171 if (i == num_pages - 1) 1171 if (i == num_pages - 1)
1172 err = prepare_uptodate_page(pages[i], 1172 err = prepare_uptodate_page(pages[i],
1173 pos + write_bytes, false); 1173 pos + write_bytes, false);
1174 if (err) { 1174 if (err) {
1175 page_cache_release(pages[i]); 1175 page_cache_release(pages[i]);
1176 faili = i - 1; 1176 faili = i - 1;
1177 goto fail; 1177 goto fail;
1178 } 1178 }
1179 wait_on_page_writeback(pages[i]); 1179 wait_on_page_writeback(pages[i]);
1180 } 1180 }
1181 err = 0; 1181 err = 0;
1182 if (start_pos < inode->i_size) { 1182 if (start_pos < inode->i_size) {
1183 struct btrfs_ordered_extent *ordered; 1183 struct btrfs_ordered_extent *ordered;
1184 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1184 lock_extent_bits(&BTRFS_I(inode)->io_tree,
1185 start_pos, last_pos - 1, 0, &cached_state); 1185 start_pos, last_pos - 1, 0, &cached_state);
1186 ordered = btrfs_lookup_first_ordered_extent(inode, 1186 ordered = btrfs_lookup_first_ordered_extent(inode,
1187 last_pos - 1); 1187 last_pos - 1);
1188 if (ordered && 1188 if (ordered &&
1189 ordered->file_offset + ordered->len > start_pos && 1189 ordered->file_offset + ordered->len > start_pos &&
1190 ordered->file_offset < last_pos) { 1190 ordered->file_offset < last_pos) {
1191 btrfs_put_ordered_extent(ordered); 1191 btrfs_put_ordered_extent(ordered);
1192 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1192 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1193 start_pos, last_pos - 1, 1193 start_pos, last_pos - 1,
1194 &cached_state, GFP_NOFS); 1194 &cached_state, GFP_NOFS);
1195 for (i = 0; i < num_pages; i++) { 1195 for (i = 0; i < num_pages; i++) {
1196 unlock_page(pages[i]); 1196 unlock_page(pages[i]);
1197 page_cache_release(pages[i]); 1197 page_cache_release(pages[i]);
1198 } 1198 }
1199 btrfs_wait_ordered_range(inode, start_pos, 1199 btrfs_wait_ordered_range(inode, start_pos,
1200 last_pos - start_pos); 1200 last_pos - start_pos);
1201 goto again; 1201 goto again;
1202 } 1202 }
1203 if (ordered) 1203 if (ordered)
1204 btrfs_put_ordered_extent(ordered); 1204 btrfs_put_ordered_extent(ordered);
1205 1205
1206 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, 1206 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1207 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1207 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1208 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1208 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1209 0, 0, &cached_state, GFP_NOFS); 1209 0, 0, &cached_state, GFP_NOFS);
1210 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1210 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1211 start_pos, last_pos - 1, &cached_state, 1211 start_pos, last_pos - 1, &cached_state,
1212 GFP_NOFS); 1212 GFP_NOFS);
1213 } 1213 }
1214 for (i = 0; i < num_pages; i++) { 1214 for (i = 0; i < num_pages; i++) {
1215 if (clear_page_dirty_for_io(pages[i])) 1215 if (clear_page_dirty_for_io(pages[i]))
1216 account_page_redirty(pages[i]); 1216 account_page_redirty(pages[i]);
1217 set_page_extent_mapped(pages[i]); 1217 set_page_extent_mapped(pages[i]);
1218 WARN_ON(!PageLocked(pages[i])); 1218 WARN_ON(!PageLocked(pages[i]));
1219 } 1219 }
1220 return 0; 1220 return 0;
1221 fail: 1221 fail:
1222 while (faili >= 0) { 1222 while (faili >= 0) {
1223 unlock_page(pages[faili]); 1223 unlock_page(pages[faili]);
1224 page_cache_release(pages[faili]); 1224 page_cache_release(pages[faili]);
1225 faili--; 1225 faili--;
1226 } 1226 }
1227 return err; 1227 return err;
1228 1228
1229 } 1229 }
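
The ordered-extent conflict test in prepare_pages() is a standard half-open interval overlap. A self-contained sketch of the predicate, with hypothetical sample ranges:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* the test used above: an ordered extent [file_offset, file_offset + len)
 * conflicts with the write range [start_pos, last_pos) iff the two
 * half-open intervals intersect */
static bool ordered_conflicts(uint64_t file_offset, uint64_t len,
			      uint64_t start_pos, uint64_t last_pos)
{
	return file_offset + len > start_pos && file_offset < last_pos;
}

int main(void)
{
	printf("%d\n", ordered_conflicts(0, 4096, 4096, 8192));	/* 0: only touches */
	printf("%d\n", ordered_conflicts(0, 5000, 4096, 8192));	/* 1: overlaps */
	return 0;
}
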
1230 1230
1231 static noinline ssize_t __btrfs_buffered_write(struct file *file, 1231 static noinline ssize_t __btrfs_buffered_write(struct file *file,
1232 struct iov_iter *i, 1232 struct iov_iter *i,
1233 loff_t pos) 1233 loff_t pos)
1234 { 1234 {
1235 struct inode *inode = fdentry(file)->d_inode; 1235 struct inode *inode = fdentry(file)->d_inode;
1236 struct btrfs_root *root = BTRFS_I(inode)->root; 1236 struct btrfs_root *root = BTRFS_I(inode)->root;
1237 struct page **pages = NULL; 1237 struct page **pages = NULL;
1238 unsigned long first_index; 1238 unsigned long first_index;
1239 size_t num_written = 0; 1239 size_t num_written = 0;
1240 int nrptrs; 1240 int nrptrs;
1241 int ret = 0; 1241 int ret = 0;
1242 bool force_page_uptodate = false; 1242 bool force_page_uptodate = false;
1243 1243
1244 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1244 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1245 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1245 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1246 (sizeof(struct page *))); 1246 (sizeof(struct page *)));
1247 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1247 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1248 nrptrs = max(nrptrs, 8); 1248 nrptrs = max(nrptrs, 8);
1249 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1249 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1250 if (!pages) 1250 if (!pages)
1251 return -ENOMEM; 1251 return -ENOMEM;
1252 1252
1253 first_index = pos >> PAGE_CACHE_SHIFT; 1253 first_index = pos >> PAGE_CACHE_SHIFT;
1254 1254
1255 while (iov_iter_count(i) > 0) { 1255 while (iov_iter_count(i) > 0) {
1256 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1256 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1257 size_t write_bytes = min(iov_iter_count(i), 1257 size_t write_bytes = min(iov_iter_count(i),
1258 nrptrs * (size_t)PAGE_CACHE_SIZE - 1258 nrptrs * (size_t)PAGE_CACHE_SIZE -
1259 offset); 1259 offset);
1260 size_t num_pages = (write_bytes + offset + 1260 size_t num_pages = (write_bytes + offset +
1261 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1261 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1262 size_t dirty_pages; 1262 size_t dirty_pages;
1263 size_t copied; 1263 size_t copied;
1264 1264
1265 WARN_ON(num_pages > nrptrs); 1265 WARN_ON(num_pages > nrptrs);
1266 1266
1267 /* 1267 /*
1268 * Fault pages before locking them in prepare_pages 1268 * Fault pages before locking them in prepare_pages
1269 * to avoid a recursive lock 1269 * to avoid a recursive lock
1270 */ 1270 */
1271 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { 1271 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1272 ret = -EFAULT; 1272 ret = -EFAULT;
1273 break; 1273 break;
1274 } 1274 }
1275 1275
1276 ret = btrfs_delalloc_reserve_space(inode, 1276 ret = btrfs_delalloc_reserve_space(inode,
1277 num_pages << PAGE_CACHE_SHIFT); 1277 num_pages << PAGE_CACHE_SHIFT);
1278 if (ret) 1278 if (ret)
1279 break; 1279 break;
1280 1280
1281 /* 1281 /*
1282 * This is going to set up the pages array with the number of 1282 * This is going to set up the pages array with the number of
1283 * pages we want, so we don't really need to worry about the 1283 * pages we want, so we don't really need to worry about the
1284 * contents of pages from loop to loop 1284 * contents of pages from loop to loop
1285 */ 1285 */
1286 ret = prepare_pages(root, file, pages, num_pages, 1286 ret = prepare_pages(root, file, pages, num_pages,
1287 pos, first_index, write_bytes, 1287 pos, first_index, write_bytes,
1288 force_page_uptodate); 1288 force_page_uptodate);
1289 if (ret) { 1289 if (ret) {
1290 btrfs_delalloc_release_space(inode, 1290 btrfs_delalloc_release_space(inode,
1291 num_pages << PAGE_CACHE_SHIFT); 1291 num_pages << PAGE_CACHE_SHIFT);
1292 break; 1292 break;
1293 } 1293 }
1294 1294
1295 copied = btrfs_copy_from_user(pos, num_pages, 1295 copied = btrfs_copy_from_user(pos, num_pages,
1296 write_bytes, pages, i); 1296 write_bytes, pages, i);
1297 1297
1298 /* 1298 /*
1299 * if we have trouble faulting in the pages, fall 1299 * if we have trouble faulting in the pages, fall
1300 * back to one page at a time 1300 * back to one page at a time
1301 */ 1301 */
1302 if (copied < write_bytes) 1302 if (copied < write_bytes)
1303 nrptrs = 1; 1303 nrptrs = 1;
1304 1304
1305 if (copied == 0) { 1305 if (copied == 0) {
1306 force_page_uptodate = true; 1306 force_page_uptodate = true;
1307 dirty_pages = 0; 1307 dirty_pages = 0;
1308 } else { 1308 } else {
1309 force_page_uptodate = false; 1309 force_page_uptodate = false;
1310 dirty_pages = (copied + offset + 1310 dirty_pages = (copied + offset +
1311 PAGE_CACHE_SIZE - 1) >> 1311 PAGE_CACHE_SIZE - 1) >>
1312 PAGE_CACHE_SHIFT; 1312 PAGE_CACHE_SHIFT;
1313 } 1313 }
1314 1314
1315 /* 1315 /*
1316 * If we had a short copy we need to release the excess delalloc 1316 * If we had a short copy we need to release the excess delalloc
1317 * bytes we reserved. We need to increment outstanding_extents 1317 * bytes we reserved. We need to increment outstanding_extents
1318 * because btrfs_delalloc_release_space will decrement it, but 1318 * because btrfs_delalloc_release_space will decrement it, but
1319 * we still have an outstanding extent for the chunk we actually 1319 * we still have an outstanding extent for the chunk we actually
1320 * managed to copy. 1320 * managed to copy.
1321 */ 1321 */
1322 if (num_pages > dirty_pages) { 1322 if (num_pages > dirty_pages) {
1323 if (copied > 0) { 1323 if (copied > 0) {
1324 spin_lock(&BTRFS_I(inode)->lock); 1324 spin_lock(&BTRFS_I(inode)->lock);
1325 BTRFS_I(inode)->outstanding_extents++; 1325 BTRFS_I(inode)->outstanding_extents++;
1326 spin_unlock(&BTRFS_I(inode)->lock); 1326 spin_unlock(&BTRFS_I(inode)->lock);
1327 } 1327 }
1328 btrfs_delalloc_release_space(inode, 1328 btrfs_delalloc_release_space(inode,
1329 (num_pages - dirty_pages) << 1329 (num_pages - dirty_pages) <<
1330 PAGE_CACHE_SHIFT); 1330 PAGE_CACHE_SHIFT);
1331 } 1331 }
1332 1332
1333 if (copied > 0) { 1333 if (copied > 0) {
1334 ret = btrfs_dirty_pages(root, inode, pages, 1334 ret = btrfs_dirty_pages(root, inode, pages,
1335 dirty_pages, pos, copied, 1335 dirty_pages, pos, copied,
1336 NULL); 1336 NULL);
1337 if (ret) { 1337 if (ret) {
1338 btrfs_delalloc_release_space(inode, 1338 btrfs_delalloc_release_space(inode,
1339 dirty_pages << PAGE_CACHE_SHIFT); 1339 dirty_pages << PAGE_CACHE_SHIFT);
1340 btrfs_drop_pages(pages, num_pages); 1340 btrfs_drop_pages(pages, num_pages);
1341 break; 1341 break;
1342 } 1342 }
1343 } 1343 }
1344 1344
1345 btrfs_drop_pages(pages, num_pages); 1345 btrfs_drop_pages(pages, num_pages);
1346 1346
1347 cond_resched(); 1347 cond_resched();
1348 1348
1349 balance_dirty_pages_ratelimited(inode->i_mapping); 1349 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1351 btrfs_btree_balance_dirty(root, 1); 1351 btrfs_btree_balance_dirty(root, 1);
1352 1352
1353 pos += copied; 1353 pos += copied;
1354 num_written += copied; 1354 num_written += copied;
1355 } 1355 }
1356 1356
1357 kfree(pages); 1357 kfree(pages);
1358 1358
1359 return num_written ? num_written : ret; 1359 return num_written ? num_written : ret;
1360 } 1360 }
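
The nrptrs sizing at the top of __btrfs_buffered_write() caps the pointer array three ways. A userspace sketch of the same arithmetic (dirty_headroom is a stand-in for current->nr_dirtied_pause - current->nr_dirtied):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL	/* stand-in for PAGE_CACHE_SIZE */

static int size_nrptrs(size_t count, int dirty_headroom)
{
	size_t a = (count + PAGE_SIZE - 1) / PAGE_SIZE;	/* pages in the iov */
	size_t b = PAGE_SIZE / sizeof(void *);		/* one page of pointers */
	int nrptrs = (int)(a < b ? a : b);

	if (nrptrs > dirty_headroom)	/* don't outrun dirty throttling */
		nrptrs = dirty_headroom;
	if (nrptrs < 8)			/* but always batch at least 8 */
		nrptrs = 8;
	return nrptrs;
}

int main(void)
{
	/* 1MB write with headroom vs. tiny write with none */
	printf("%d %d\n", size_nrptrs(1 << 20, 1024), size_nrptrs(100, 0));
	return 0;
}
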
1361 1361
1362 static ssize_t __btrfs_direct_write(struct kiocb *iocb, 1362 static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1363 const struct iovec *iov, 1363 const struct iovec *iov,
1364 unsigned long nr_segs, loff_t pos, 1364 unsigned long nr_segs, loff_t pos,
1365 loff_t *ppos, size_t count, size_t ocount) 1365 loff_t *ppos, size_t count, size_t ocount)
1366 { 1366 {
1367 struct file *file = iocb->ki_filp; 1367 struct file *file = iocb->ki_filp;
1368 struct iov_iter i; 1368 struct iov_iter i;
1369 ssize_t written; 1369 ssize_t written;
1370 ssize_t written_buffered; 1370 ssize_t written_buffered;
1371 loff_t endbyte; 1371 loff_t endbyte;
1372 int err; 1372 int err;
1373 1373
1374 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, 1374 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1375 count, ocount); 1375 count, ocount);
1376 1376
1377 if (written < 0 || written == count) 1377 if (written < 0 || written == count)
1378 return written; 1378 return written;
1379 1379
1380 pos += written; 1380 pos += written;
1381 count -= written; 1381 count -= written;
1382 iov_iter_init(&i, iov, nr_segs, count, written); 1382 iov_iter_init(&i, iov, nr_segs, count, written);
1383 written_buffered = __btrfs_buffered_write(file, &i, pos); 1383 written_buffered = __btrfs_buffered_write(file, &i, pos);
1384 if (written_buffered < 0) { 1384 if (written_buffered < 0) {
1385 err = written_buffered; 1385 err = written_buffered;
1386 goto out; 1386 goto out;
1387 } 1387 }
1388 endbyte = pos + written_buffered - 1; 1388 endbyte = pos + written_buffered - 1;
1389 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 1389 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1390 if (err) 1390 if (err)
1391 goto out; 1391 goto out;
1392 written += written_buffered; 1392 written += written_buffered;
1393 *ppos = pos + written_buffered; 1393 *ppos = pos + written_buffered;
1394 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, 1394 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1395 endbyte >> PAGE_CACHE_SHIFT); 1395 endbyte >> PAGE_CACHE_SHIFT);
1396 out: 1396 out:
1397 return written ? written : err; 1397 return written ? written : err;
1398 } 1398 }
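
The bookkeeping after the buffered fallback is mostly offset arithmetic. A small sketch with hypothetical numbers showing which byte range gets flushed and which page range gets invalidated (PAGE_SHIFT stands in for PAGE_CACHE_SHIFT):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* stand-in for PAGE_CACHE_SHIFT */

int main(void)
{
	/* hypothetical 64K O_DIRECT write where only 16K went direct */
	uint64_t pos = 1 << 20, count = 65536;
	uint64_t written = 16384;			/* direct path */
	uint64_t written_buffered = count - written;	/* page-cache path */

	pos += written;
	uint64_t endbyte = pos + written_buffered - 1;

	/* the pages dirtied by the fallback are flushed and then dropped
	 * so later O_DIRECT reads don't see stale page cache */
	printf("flush [%llu, %llu], invalidate pages %llu..%llu\n",
	       (unsigned long long)pos, (unsigned long long)endbyte,
	       (unsigned long long)(pos >> PAGE_SHIFT),
	       (unsigned long long)(endbyte >> PAGE_SHIFT));
	return 0;
}
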
1399 1399
1400 static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1400 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1401 const struct iovec *iov, 1401 const struct iovec *iov,
1402 unsigned long nr_segs, loff_t pos) 1402 unsigned long nr_segs, loff_t pos)
1403 { 1403 {
1404 struct file *file = iocb->ki_filp; 1404 struct file *file = iocb->ki_filp;
1405 struct inode *inode = fdentry(file)->d_inode; 1405 struct inode *inode = fdentry(file)->d_inode;
1406 struct btrfs_root *root = BTRFS_I(inode)->root; 1406 struct btrfs_root *root = BTRFS_I(inode)->root;
1407 loff_t *ppos = &iocb->ki_pos; 1407 loff_t *ppos = &iocb->ki_pos;
1408 u64 start_pos; 1408 u64 start_pos;
1409 ssize_t num_written = 0; 1409 ssize_t num_written = 0;
1410 ssize_t err = 0; 1410 ssize_t err = 0;
1411 size_t count, ocount; 1411 size_t count, ocount;
1412 1412
1413 sb_start_write(inode->i_sb); 1413 sb_start_write(inode->i_sb);
1414 1414
1415 mutex_lock(&inode->i_mutex); 1415 mutex_lock(&inode->i_mutex);
1416 1416
1417 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1417 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1418 if (err) { 1418 if (err) {
1419 mutex_unlock(&inode->i_mutex); 1419 mutex_unlock(&inode->i_mutex);
1420 goto out; 1420 goto out;
1421 } 1421 }
1422 count = ocount; 1422 count = ocount;
1423 1423
1424 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1424 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1425 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1425 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1426 if (err) { 1426 if (err) {
1427 mutex_unlock(&inode->i_mutex); 1427 mutex_unlock(&inode->i_mutex);
1428 goto out; 1428 goto out;
1429 } 1429 }
1430 1430
1431 if (count == 0) { 1431 if (count == 0) {
1432 mutex_unlock(&inode->i_mutex); 1432 mutex_unlock(&inode->i_mutex);
1433 goto out; 1433 goto out;
1434 } 1434 }
1435 1435
1436 err = file_remove_suid(file); 1436 err = file_remove_suid(file);
1437 if (err) { 1437 if (err) {
1438 mutex_unlock(&inode->i_mutex); 1438 mutex_unlock(&inode->i_mutex);
1439 goto out; 1439 goto out;
1440 } 1440 }
1441 1441
1442 /* 1442 /*
1443 * If BTRFS flips readonly due to some impossible error 1443 * If BTRFS flips readonly due to some impossible error
1444 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), 1444 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1445 * although we have opened a file as writable, we have 1445 * although we have opened a file as writable, we have
1446 * to stop this write operation to ensure FS consistency. 1446 * to stop this write operation to ensure FS consistency.
1447 */ 1447 */
1448 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 1448 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1449 mutex_unlock(&inode->i_mutex); 1449 mutex_unlock(&inode->i_mutex);
1450 err = -EROFS; 1450 err = -EROFS;
1451 goto out; 1451 goto out;
1452 } 1452 }
1453 1453
1454 err = file_update_time(file); 1454 err = file_update_time(file);
1455 if (err) { 1455 if (err) {
1456 mutex_unlock(&inode->i_mutex); 1456 mutex_unlock(&inode->i_mutex);
1457 goto out; 1457 goto out;
1458 } 1458 }
1459 1459
1460 start_pos = round_down(pos, root->sectorsize); 1460 start_pos = round_down(pos, root->sectorsize);
1461 if (start_pos > i_size_read(inode)) { 1461 if (start_pos > i_size_read(inode)) {
1462 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); 1462 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
1463 if (err) { 1463 if (err) {
1464 mutex_unlock(&inode->i_mutex); 1464 mutex_unlock(&inode->i_mutex);
1465 goto out; 1465 goto out;
1466 } 1466 }
1467 } 1467 }
1468 1468
1469 if (unlikely(file->f_flags & O_DIRECT)) { 1469 if (unlikely(file->f_flags & O_DIRECT)) {
1470 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1470 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1471 pos, ppos, count, ocount); 1471 pos, ppos, count, ocount);
1472 } else { 1472 } else {
1473 struct iov_iter i; 1473 struct iov_iter i;
1474 1474
1475 iov_iter_init(&i, iov, nr_segs, count, num_written); 1475 iov_iter_init(&i, iov, nr_segs, count, num_written);
1476 1476
1477 num_written = __btrfs_buffered_write(file, &i, pos); 1477 num_written = __btrfs_buffered_write(file, &i, pos);
1478 if (num_written > 0) 1478 if (num_written > 0)
1479 *ppos = pos + num_written; 1479 *ppos = pos + num_written;
1480 } 1480 }
1481 1481
1482 mutex_unlock(&inode->i_mutex); 1482 mutex_unlock(&inode->i_mutex);
1483 1483
1484 /* 1484 /*
1485 * we want to make sure fsync finds this change 1485 * we want to make sure fsync finds this change
1486 * but we haven't joined a transaction running right now. 1486 * but we haven't joined a transaction running right now.
1487 * 1487 *
1488 * Later on, someone is sure to update the inode and get the 1488 * Later on, someone is sure to update the inode and get the
1489 * real transid recorded. 1489 * real transid recorded.
1490 * 1490 *
1491 * We set last_trans now to the fs_info generation + 1, 1491 * We set last_trans now to the fs_info generation + 1,
1492 * this will either be one more than the running transaction 1492 * this will either be one more than the running transaction
1493 * or the generation used for the next transaction if there isn't 1493 * or the generation used for the next transaction if there isn't
1494 * one running right now. 1494 * one running right now.
1495 */ 1495 */
1496 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1496 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1497 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1497 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1498 err = generic_write_sync(file, pos, num_written); 1498 err = generic_write_sync(file, pos, num_written);
1499 if (err < 0 && num_written > 0) 1499 if (err < 0 && num_written > 0)
1500 num_written = err; 1500 num_written = err;
1501 } 1501 }
1502 out: 1502 out:
1503 sb_end_write(inode->i_sb); 1503 sb_end_write(inode->i_sb);
1504 current->backing_dev_info = NULL; 1504 current->backing_dev_info = NULL;
1505 return num_written ? num_written : err; 1505 return num_written ? num_written : err;
1506 } 1506 }
1507 1507
1508 int btrfs_release_file(struct inode *inode, struct file *filp) 1508 int btrfs_release_file(struct inode *inode, struct file *filp)
1509 { 1509 {
1510 /* 1510 /*
1511 * ordered_data_close is set by setattr when we are about to truncate 1511 * ordered_data_close is set by setattr when we are about to truncate
1512 * a file from a non-zero size to a zero size. This tries to 1512 * a file from a non-zero size to a zero size. This tries to
1513 * flush down new bytes that may have been written if the 1513 * flush down new bytes that may have been written if the
1514 * application were using truncate to replace a file in place. 1514 * application were using truncate to replace a file in place.
1515 */ 1515 */
1516 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1516 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1517 &BTRFS_I(inode)->runtime_flags)) { 1517 &BTRFS_I(inode)->runtime_flags)) {
1518 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1518 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1519 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1519 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1520 filemap_flush(inode->i_mapping); 1520 filemap_flush(inode->i_mapping);
1521 } 1521 }
1522 if (filp->private_data) 1522 if (filp->private_data)
1523 btrfs_ioctl_trans_end(filp); 1523 btrfs_ioctl_trans_end(filp);
1524 return 0; 1524 return 0;
1525 } 1525 }
1526 1526
1527 /* 1527 /*
1528 * fsync call for both files and directories. This logs the inode into 1528 * fsync call for both files and directories. This logs the inode into
1529 * the tree log instead of forcing full commits whenever possible. 1529 * the tree log instead of forcing full commits whenever possible.
1530 * 1530 *
1531 * It needs to call filemap_fdatawait so that all ordered extent updates 1531 * It needs to call filemap_fdatawait so that all ordered extent updates
1532 * in the metadata btree are up to date for copying to the log. 1532 * in the metadata btree are up to date for copying to the log.
1533 * 1533 *
1534 * It drops the inode mutex before doing the tree log commit. This is an 1534 * It drops the inode mutex before doing the tree log commit. This is an
1535 * important optimization for directories because holding the mutex prevents 1535 * important optimization for directories because holding the mutex prevents
1536 * new operations on the dir while we write to disk. 1536 * new operations on the dir while we write to disk.
1537 */ 1537 */
1538 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 1538 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1539 { 1539 {
1540 struct dentry *dentry = file->f_path.dentry; 1540 struct dentry *dentry = file->f_path.dentry;
1541 struct inode *inode = dentry->d_inode; 1541 struct inode *inode = dentry->d_inode;
1542 struct btrfs_root *root = BTRFS_I(inode)->root; 1542 struct btrfs_root *root = BTRFS_I(inode)->root;
1543 int ret = 0; 1543 int ret = 0;
1544 struct btrfs_trans_handle *trans; 1544 struct btrfs_trans_handle *trans;
1545 1545
1546 trace_btrfs_sync_file(file, datasync); 1546 trace_btrfs_sync_file(file, datasync);
1547 1547
1548 /* 1548 /*
1549 * We write the dirty pages in the range and wait until they complete 1549 * We write the dirty pages in the range and wait until they complete
1550 * outside of the ->i_mutex, so that the flush can be done by multiple 1550 * outside of the ->i_mutex, so that the flush can be done by multiple
1551 * tasks in parallel to improve performance. 1551 * tasks in parallel to improve performance.
1552 */ 1552 */
1553 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1553 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1554 if (ret) 1554 if (ret)
1555 return ret; 1555 return ret;
1556 1556
1557 mutex_lock(&inode->i_mutex); 1557 mutex_lock(&inode->i_mutex);
1558 1558
1559 /* 1559 /*
1560 * We flush the dirty pages again to avoid some dirty pages in the 1560 * We flush the dirty pages again to avoid some dirty pages in the
1561 * range being left. 1561 * range being left.
1562 */ 1562 */
1563 atomic_inc(&root->log_batch); 1563 atomic_inc(&root->log_batch);
1564 btrfs_wait_ordered_range(inode, start, end); 1564 btrfs_wait_ordered_range(inode, start, end);
1565 atomic_inc(&root->log_batch); 1565 atomic_inc(&root->log_batch);
1566 1566
1567 /* 1567 /*
1568 * check the transaction that last modified this inode 1568 * check the transaction that last modified this inode
1569 * and see if it's already been committed 1569 * and see if it's already been committed
1570 */ 1570 */
1571 if (!BTRFS_I(inode)->last_trans) { 1571 if (!BTRFS_I(inode)->last_trans) {
1572 mutex_unlock(&inode->i_mutex); 1572 mutex_unlock(&inode->i_mutex);
1573 goto out; 1573 goto out;
1574 } 1574 }
1575 1575
1576 /* 1576 /*
1577 * if the last transaction that changed this file was before 1577 * if the last transaction that changed this file was before
1578 * the current transaction, we can bail out now without any 1578 * the current transaction, we can bail out now without any
1579 * syncing 1579 * syncing
1580 */ 1580 */
1581 smp_mb(); 1581 smp_mb();
1582 if (btrfs_inode_in_log(inode, root->fs_info->generation) || 1582 if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1583 BTRFS_I(inode)->last_trans <= 1583 BTRFS_I(inode)->last_trans <=
1584 root->fs_info->last_trans_committed) { 1584 root->fs_info->last_trans_committed) {
1585 BTRFS_I(inode)->last_trans = 0; 1585 BTRFS_I(inode)->last_trans = 0;
1586 1586
1587 /* 1587 /*
1588 * We've had everything committed since the last time we were 1588 * We've had everything committed since the last time we were
1589 * modified so clear this flag in case it was set for whatever 1589 * modified so clear this flag in case it was set for whatever
1590 * reason, it's no longer relevant. 1590 * reason, it's no longer relevant.
1591 */ 1591 */
1592 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1592 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1593 &BTRFS_I(inode)->runtime_flags); 1593 &BTRFS_I(inode)->runtime_flags);
1594 mutex_unlock(&inode->i_mutex); 1594 mutex_unlock(&inode->i_mutex);
1595 goto out; 1595 goto out;
1596 } 1596 }
1597 1597
1598 /* 1598 /*
1599 * ok we haven't committed the transaction yet, let's do a commit 1599 * ok we haven't committed the transaction yet, let's do a commit
1600 */ 1600 */
1601 if (file->private_data) 1601 if (file->private_data)
1602 btrfs_ioctl_trans_end(file); 1602 btrfs_ioctl_trans_end(file);
1603 1603
1604 trans = btrfs_start_transaction(root, 0); 1604 trans = btrfs_start_transaction(root, 0);
1605 if (IS_ERR(trans)) { 1605 if (IS_ERR(trans)) {
1606 ret = PTR_ERR(trans); 1606 ret = PTR_ERR(trans);
1607 mutex_unlock(&inode->i_mutex); 1607 mutex_unlock(&inode->i_mutex);
1608 goto out; 1608 goto out;
1609 } 1609 }
1610 1610
1611 ret = btrfs_log_dentry_safe(trans, root, dentry); 1611 ret = btrfs_log_dentry_safe(trans, root, dentry);
1612 if (ret < 0) { 1612 if (ret < 0) {
1613 mutex_unlock(&inode->i_mutex); 1613 mutex_unlock(&inode->i_mutex);
1614 goto out; 1614 goto out;
1615 } 1615 }
1616 1616
1617 /* we've logged all the items and now have a consistent 1617 /* we've logged all the items and now have a consistent
1618 * version of the file in the log. It is possible that 1618 * version of the file in the log. It is possible that
1619 * someone will come in and modify the file, but that's 1619 * someone will come in and modify the file, but that's
1620 * fine because the log is consistent on disk, and we 1620 * fine because the log is consistent on disk, and we
1621 * have references to all of the file's extents 1621 * have references to all of the file's extents
1622 * 1622 *
1623 * It is possible that someone will come in and log the 1623 * It is possible that someone will come in and log the
1624 * file again, but that will end up using the synchronization 1624 * file again, but that will end up using the synchronization
1625 * inside btrfs_sync_log to keep things safe. 1625 * inside btrfs_sync_log to keep things safe.
1626 */ 1626 */
1627 mutex_unlock(&inode->i_mutex); 1627 mutex_unlock(&inode->i_mutex);
1628 1628
1629 if (ret != BTRFS_NO_LOG_SYNC) { 1629 if (ret != BTRFS_NO_LOG_SYNC) {
1630 if (ret > 0) { 1630 if (ret > 0) {
1631 ret = btrfs_commit_transaction(trans, root); 1631 ret = btrfs_commit_transaction(trans, root);
1632 } else { 1632 } else {
1633 ret = btrfs_sync_log(trans, root); 1633 ret = btrfs_sync_log(trans, root);
1634 if (ret == 0) 1634 if (ret == 0)
1635 ret = btrfs_end_transaction(trans, root); 1635 ret = btrfs_end_transaction(trans, root);
1636 else 1636 else
1637 ret = btrfs_commit_transaction(trans, root); 1637 ret = btrfs_commit_transaction(trans, root);
1638 } 1638 }
1639 } else { 1639 } else {
1640 ret = btrfs_end_transaction(trans, root); 1640 ret = btrfs_end_transaction(trans, root);
1641 } 1641 }
1642 out: 1642 out:
1643 return ret > 0 ? -EIO : ret; 1643 return ret > 0 ? -EIO : ret;
1644 } 1644 }
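
The early-out near the top of btrfs_sync_file() boils down to a generation comparison. A minimal sketch of that check (simplified: it leaves out the in-log test and the memory barrier):

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* if the transaction that last touched the inode has already been
 * committed, fsync has nothing left to do */
static bool fsync_can_skip(uint64_t last_trans, uint64_t last_committed)
{
	return last_trans == 0 || last_trans <= last_committed;
}

int main(void)
{
	printf("%d %d\n",
	       fsync_can_skip(41, 42),	/* committed already: skip */
	       fsync_can_skip(43, 42));	/* newer than last commit: log it */
	return 0;
}
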
1645 1645
1646 static const struct vm_operations_struct btrfs_file_vm_ops = { 1646 static const struct vm_operations_struct btrfs_file_vm_ops = {
1647 .fault = filemap_fault, 1647 .fault = filemap_fault,
1648 .page_mkwrite = btrfs_page_mkwrite, 1648 .page_mkwrite = btrfs_page_mkwrite,
1649 .remap_pages = generic_file_remap_pages, 1649 .remap_pages = generic_file_remap_pages,
1650 }; 1650 };
1651 1651
1652 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1652 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1653 { 1653 {
1654 struct address_space *mapping = filp->f_mapping; 1654 struct address_space *mapping = filp->f_mapping;
1655 1655
1656 if (!mapping->a_ops->readpage) 1656 if (!mapping->a_ops->readpage)
1657 return -ENOEXEC; 1657 return -ENOEXEC;
1658 1658
1659 file_accessed(filp); 1659 file_accessed(filp);
1660 vma->vm_ops = &btrfs_file_vm_ops; 1660 vma->vm_ops = &btrfs_file_vm_ops;
1661 1661
1662 return 0; 1662 return 0;
1663 } 1663 }
1664 1664
1665 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, 1665 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
1666 int slot, u64 start, u64 end) 1666 int slot, u64 start, u64 end)
1667 { 1667 {
1668 struct btrfs_file_extent_item *fi; 1668 struct btrfs_file_extent_item *fi;
1669 struct btrfs_key key; 1669 struct btrfs_key key;
1670 1670
1671 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 1671 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1672 return 0; 1672 return 0;
1673 1673
1674 btrfs_item_key_to_cpu(leaf, &key, slot); 1674 btrfs_item_key_to_cpu(leaf, &key, slot);
1675 if (key.objectid != btrfs_ino(inode) || 1675 if (key.objectid != btrfs_ino(inode) ||
1676 key.type != BTRFS_EXTENT_DATA_KEY) 1676 key.type != BTRFS_EXTENT_DATA_KEY)
1677 return 0; 1677 return 0;
1678 1678
1679 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 1679 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1680 1680
1681 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 1681 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1682 return 0; 1682 return 0;
1683 1683
1684 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 1684 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1685 return 0; 1685 return 0;
1686 1686
1687 if (key.offset == end) 1687 if (key.offset == end)
1688 return 1; 1688 return 1;
1689 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 1689 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1690 return 1; 1690 return 1;
1691 return 0; 1691 return 0;
1692 } 1692 }
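
Stripped of the btree lookups, hole_mergeable() is a geometry test on the key offset. A standalone sketch with hypothetical extents:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* a neighbouring hole extent at [offset, offset + num_bytes) can absorb
 * the new hole [start, end) only if it touches one of its ends */
static bool holes_touch(uint64_t offset, uint64_t num_bytes,
			uint64_t start, uint64_t end)
{
	return offset == end || offset + num_bytes == start;
}

int main(void)
{
	printf("%d %d %d\n",
	       holes_touch(8192, 4096, 4096, 8192),	/* follows the new hole */
	       holes_touch(0, 4096, 4096, 8192),	/* precedes it */
	       holes_touch(0, 4096, 8192, 12288));	/* gap: not mergeable */
	return 0;
}
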
1693 1693
1694 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, 1694 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
1695 struct btrfs_path *path, u64 offset, u64 end) 1695 struct btrfs_path *path, u64 offset, u64 end)
1696 { 1696 {
1697 struct btrfs_root *root = BTRFS_I(inode)->root; 1697 struct btrfs_root *root = BTRFS_I(inode)->root;
1698 struct extent_buffer *leaf; 1698 struct extent_buffer *leaf;
1699 struct btrfs_file_extent_item *fi; 1699 struct btrfs_file_extent_item *fi;
1700 struct extent_map *hole_em; 1700 struct extent_map *hole_em;
1701 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 1701 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1702 struct btrfs_key key; 1702 struct btrfs_key key;
1703 int ret; 1703 int ret;
1704 1704
1705 key.objectid = btrfs_ino(inode); 1705 key.objectid = btrfs_ino(inode);
1706 key.type = BTRFS_EXTENT_DATA_KEY; 1706 key.type = BTRFS_EXTENT_DATA_KEY;
1707 key.offset = offset; 1707 key.offset = offset;
1708 1708
1709 1709
1710 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1710 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1711 if (ret < 0) 1711 if (ret < 0)
1712 return ret; 1712 return ret;
1713 BUG_ON(!ret); 1713 BUG_ON(!ret);
1714 1714
1715 leaf = path->nodes[0]; 1715 leaf = path->nodes[0];
1716 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { 1716 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
1717 u64 num_bytes; 1717 u64 num_bytes;
1718 1718
1719 path->slots[0]--; 1719 path->slots[0]--;
1720 fi = btrfs_item_ptr(leaf, path->slots[0], 1720 fi = btrfs_item_ptr(leaf, path->slots[0],
1721 struct btrfs_file_extent_item); 1721 struct btrfs_file_extent_item);
1722 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 1722 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
1723 end - offset; 1723 end - offset;
1724 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1724 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1725 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 1725 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1726 btrfs_set_file_extent_offset(leaf, fi, 0); 1726 btrfs_set_file_extent_offset(leaf, fi, 0);
1727 btrfs_mark_buffer_dirty(leaf); 1727 btrfs_mark_buffer_dirty(leaf);
1728 goto out; 1728 goto out;
1729 } 1729 }
1730 1730
1731 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 1731 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
1732 u64 num_bytes; 1732 u64 num_bytes;
1733 1733
1734 path->slots[0]++; 1734 path->slots[0]++;
1735 key.offset = offset; 1735 key.offset = offset;
1736 btrfs_set_item_key_safe(trans, root, path, &key); 1736 btrfs_set_item_key_safe(trans, root, path, &key);
1737 fi = btrfs_item_ptr(leaf, path->slots[0], 1737 fi = btrfs_item_ptr(leaf, path->slots[0],
1738 struct btrfs_file_extent_item); 1738 struct btrfs_file_extent_item);
1739 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 1739 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
1740 offset; 1740 offset;
1741 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1741 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1742 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 1742 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1743 btrfs_set_file_extent_offset(leaf, fi, 0); 1743 btrfs_set_file_extent_offset(leaf, fi, 0);
1744 btrfs_mark_buffer_dirty(leaf); 1744 btrfs_mark_buffer_dirty(leaf);
1745 goto out; 1745 goto out;
1746 } 1746 }
1747 btrfs_release_path(path); 1747 btrfs_release_path(path);
1748 1748
1749 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 1749 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
1750 0, 0, end - offset, 0, end - offset, 1750 0, 0, end - offset, 0, end - offset,
1751 0, 0, 0); 1751 0, 0, 0);
1752 if (ret) 1752 if (ret)
1753 return ret; 1753 return ret;
1754 1754
1755 out: 1755 out:
1756 btrfs_release_path(path); 1756 btrfs_release_path(path);
1757 1757
1758 hole_em = alloc_extent_map(); 1758 hole_em = alloc_extent_map();
1759 if (!hole_em) { 1759 if (!hole_em) {
1760 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 1760 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1761 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1761 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1762 &BTRFS_I(inode)->runtime_flags); 1762 &BTRFS_I(inode)->runtime_flags);
1763 } else { 1763 } else {
1764 hole_em->start = offset; 1764 hole_em->start = offset;
1765 hole_em->len = end - offset; 1765 hole_em->len = end - offset;
1766 hole_em->orig_start = offset; 1766 hole_em->orig_start = offset;
1767 1767
1768 hole_em->block_start = EXTENT_MAP_HOLE; 1768 hole_em->block_start = EXTENT_MAP_HOLE;
1769 hole_em->block_len = 0; 1769 hole_em->block_len = 0;
1770 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1770 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1771 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1771 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1772 hole_em->generation = trans->transid; 1772 hole_em->generation = trans->transid;
1773 1773
1774 do { 1774 do {
1775 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 1775 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1776 write_lock(&em_tree->lock); 1776 write_lock(&em_tree->lock);
1777 ret = add_extent_mapping(em_tree, hole_em); 1777 ret = add_extent_mapping(em_tree, hole_em);
1778 if (!ret) 1778 if (!ret)
1779 list_move(&hole_em->list, 1779 list_move(&hole_em->list,
1780 &em_tree->modified_extents); 1780 &em_tree->modified_extents);
1781 write_unlock(&em_tree->lock); 1781 write_unlock(&em_tree->lock);
1782 } while (ret == -EEXIST); 1782 } while (ret == -EEXIST);
1783 free_extent_map(hole_em); 1783 free_extent_map(hole_em);
1784 if (ret) 1784 if (ret)
1785 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1785 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1786 &BTRFS_I(inode)->runtime_flags); 1786 &BTRFS_I(inode)->runtime_flags);
1787 } 1787 }
1788 1788
1789 return 0; 1789 return 0;
1790 } 1790 }
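
The do/while at the end of fill_holes() retries the insert for as long as a racing reader repopulates the extent map. A toy sketch of the pattern, with a stub standing in for add_extent_mapping():

#include <errno.h>
#include <stdio.h>

/* stub: pretends a stale cached mapping blocks the insert twice
 * before the drop finally takes effect */
static int fake_add_mapping(void)
{
	static int stale = 2;
	return stale-- > 0 ? -EEXIST : 0;
}

int main(void)
{
	int ret;

	/* same shape as the loop above: drop whatever overlaps, try to
	 * insert, and retry as long as something raced back in */
	do {
		/* btrfs_drop_extent_cache(...) would go here */
		ret = fake_add_mapping();
	} while (ret == -EEXIST);

	printf("inserted, ret=%d\n", ret);
	return 0;
}
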
1791 1791
1792 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) 1792 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1793 { 1793 {
1794 struct btrfs_root *root = BTRFS_I(inode)->root; 1794 struct btrfs_root *root = BTRFS_I(inode)->root;
1795 struct extent_state *cached_state = NULL; 1795 struct extent_state *cached_state = NULL;
1796 struct btrfs_path *path; 1796 struct btrfs_path *path;
1797 struct btrfs_block_rsv *rsv; 1797 struct btrfs_block_rsv *rsv;
1798 struct btrfs_trans_handle *trans; 1798 struct btrfs_trans_handle *trans;
1799 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1799 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1800 u64 lockstart = (offset + mask) & ~mask; 1800 u64 lockstart = (offset + mask) & ~mask;
1801 u64 lockend = ((offset + len) & ~mask) - 1; 1801 u64 lockend = ((offset + len) & ~mask) - 1;
1802 u64 cur_offset = lockstart; 1802 u64 cur_offset = lockstart;
1803 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1803 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1804 u64 drop_end; 1804 u64 drop_end;
1805 unsigned long nr; 1805 unsigned long nr;
1806 int ret = 0; 1806 int ret = 0;
1807 int err = 0; 1807 int err = 0;
1808 bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1808 bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
1809 ((offset + len) >> PAGE_CACHE_SHIFT); 1809 ((offset + len) >> PAGE_CACHE_SHIFT);
1810 1810
1811 btrfs_wait_ordered_range(inode, offset, len); 1811 btrfs_wait_ordered_range(inode, offset, len);
1812 1812
1813 mutex_lock(&inode->i_mutex); 1813 mutex_lock(&inode->i_mutex);
1814 if (offset >= inode->i_size) { 1814 if (offset >= inode->i_size) {
1815 mutex_unlock(&inode->i_mutex); 1815 mutex_unlock(&inode->i_mutex);
1816 return 0; 1816 return 0;
1817 } 1817 }
1818 1818
1819 /* 1819 /*
1820 * Only do this if we are in the same page and we aren't doing the 1820 * Only do this if we are in the same page and we aren't doing the
1821 * entire page. 1821 * entire page.
1822 */ 1822 */
1823 if (same_page && len < PAGE_CACHE_SIZE) { 1823 if (same_page && len < PAGE_CACHE_SIZE) {
1824 ret = btrfs_truncate_page(inode, offset, len, 0); 1824 ret = btrfs_truncate_page(inode, offset, len, 0);
1825 mutex_unlock(&inode->i_mutex); 1825 mutex_unlock(&inode->i_mutex);
1826 return ret; 1826 return ret;
1827 } 1827 }
1828 1828
1829 /* zero back part of the first page */ 1829 /* zero back part of the first page */
1830 ret = btrfs_truncate_page(inode, offset, 0, 0); 1830 ret = btrfs_truncate_page(inode, offset, 0, 0);
1831 if (ret) { 1831 if (ret) {
1832 mutex_unlock(&inode->i_mutex); 1832 mutex_unlock(&inode->i_mutex);
1833 return ret; 1833 return ret;
1834 } 1834 }
1835 1835
1836 /* zero the front end of the last page */ 1836 /* zero the front end of the last page */
1837 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1837 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1838 if (ret) { 1838 if (ret) {
1839 mutex_unlock(&inode->i_mutex); 1839 mutex_unlock(&inode->i_mutex);
1840 return ret; 1840 return ret;
1841 } 1841 }
1842 1842
1843 if (lockend < lockstart) { 1843 if (lockend < lockstart) {
1844 mutex_unlock(&inode->i_mutex); 1844 mutex_unlock(&inode->i_mutex);
1845 return 0; 1845 return 0;
1846 } 1846 }
1847 1847
1848 while (1) { 1848 while (1) {
1849 struct btrfs_ordered_extent *ordered; 1849 struct btrfs_ordered_extent *ordered;
1850 1850
1851 truncate_pagecache_range(inode, lockstart, lockend); 1851 truncate_pagecache_range(inode, lockstart, lockend);
1852 1852
1853 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1853 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1854 0, &cached_state); 1854 0, &cached_state);
1855 ordered = btrfs_lookup_first_ordered_extent(inode, lockend); 1855 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
1856 1856
1857 /* 1857 /*
1858 * We need to make sure we have no ordered extents in this range 1858 * We need to make sure we have no ordered extents in this range
1859 * and nobody raced in and read a page in this range; if someone did, 1859 * and nobody raced in and read a page in this range; if someone did,
1860 * we need to try again. 1860 * we need to try again.
1861 */ 1861 */
1862 if ((!ordered || 1862 if ((!ordered ||
1863 (ordered->file_offset + ordered->len < lockstart || 1863 (ordered->file_offset + ordered->len < lockstart ||
1864 ordered->file_offset > lockend)) && 1864 ordered->file_offset > lockend)) &&
1865 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, 1865 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
1866 lockend, EXTENT_UPTODATE, 0, 1866 lockend, EXTENT_UPTODATE, 0,
1867 cached_state)) { 1867 cached_state)) {
1868 if (ordered) 1868 if (ordered)
1869 btrfs_put_ordered_extent(ordered); 1869 btrfs_put_ordered_extent(ordered);
1870 break; 1870 break;
1871 } 1871 }
1872 if (ordered) 1872 if (ordered)
1873 btrfs_put_ordered_extent(ordered); 1873 btrfs_put_ordered_extent(ordered);
1874 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, 1874 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
1875 lockend, &cached_state, GFP_NOFS); 1875 lockend, &cached_state, GFP_NOFS);
1876 btrfs_wait_ordered_range(inode, lockstart, 1876 btrfs_wait_ordered_range(inode, lockstart,
1877 lockend - lockstart + 1); 1877 lockend - lockstart + 1);
1878 } 1878 }
1879 1879
1880 path = btrfs_alloc_path(); 1880 path = btrfs_alloc_path();
1881 if (!path) { 1881 if (!path) {
1882 ret = -ENOMEM; 1882 ret = -ENOMEM;
1883 goto out; 1883 goto out;
1884 } 1884 }
1885 1885
1886 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 1886 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
1887 if (!rsv) { 1887 if (!rsv) {
1888 ret = -ENOMEM; 1888 ret = -ENOMEM;
1889 goto out_free; 1889 goto out_free;
1890 } 1890 }
1891 rsv->size = btrfs_calc_trunc_metadata_size(root, 1); 1891 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
1892 rsv->failfast = 1; 1892 rsv->failfast = 1;
1893 1893
1894 /* 1894 /*
1895 * 1 - update the inode 1895 * 1 - update the inode
1896 * 1 - removing the extents in the range 1896 * 1 - removing the extents in the range
1897 * 1 - adding the hole extent 1897 * 1 - adding the hole extent
1898 */ 1898 */
1899 trans = btrfs_start_transaction(root, 3); 1899 trans = btrfs_start_transaction(root, 3);
1900 if (IS_ERR(trans)) { 1900 if (IS_ERR(trans)) {
1901 err = PTR_ERR(trans); 1901 err = PTR_ERR(trans);
1902 goto out_free; 1902 goto out_free;
1903 } 1903 }
1904 1904
1905 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 1905 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
1906 min_size); 1906 min_size);
1907 BUG_ON(ret); 1907 BUG_ON(ret);
1908 trans->block_rsv = rsv; 1908 trans->block_rsv = rsv;
1909 1909
1910 while (cur_offset < lockend) { 1910 while (cur_offset < lockend) {
1911 ret = __btrfs_drop_extents(trans, root, inode, path, 1911 ret = __btrfs_drop_extents(trans, root, inode, path,
1912 cur_offset, lockend + 1, 1912 cur_offset, lockend + 1,
1913 &drop_end, 1); 1913 &drop_end, 1);
1914 if (ret != -ENOSPC) 1914 if (ret != -ENOSPC)
1915 break; 1915 break;
1916 1916
1917 trans->block_rsv = &root->fs_info->trans_block_rsv; 1917 trans->block_rsv = &root->fs_info->trans_block_rsv;
1918 1918
1919 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 1919 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1920 if (ret) { 1920 if (ret) {
1921 err = ret; 1921 err = ret;
1922 break; 1922 break;
1923 } 1923 }
1924 1924
1925 cur_offset = drop_end; 1925 cur_offset = drop_end;
1926 1926
1927 ret = btrfs_update_inode(trans, root, inode); 1927 ret = btrfs_update_inode(trans, root, inode);
1928 if (ret) { 1928 if (ret) {
1929 err = ret; 1929 err = ret;
1930 break; 1930 break;
1931 } 1931 }
1932 1932
1933 nr = trans->blocks_used; 1933 nr = trans->blocks_used;
1934 btrfs_end_transaction(trans, root); 1934 btrfs_end_transaction(trans, root);
1935 btrfs_btree_balance_dirty(root, nr); 1935 btrfs_btree_balance_dirty(root, nr);
1936 1936
1937 trans = btrfs_start_transaction(root, 3); 1937 trans = btrfs_start_transaction(root, 3);
1938 if (IS_ERR(trans)) { 1938 if (IS_ERR(trans)) {
1939 ret = PTR_ERR(trans); 1939 ret = PTR_ERR(trans);
1940 trans = NULL; 1940 trans = NULL;
1941 break; 1941 break;
1942 } 1942 }
1943 1943
1944 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, 1944 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
1945 rsv, min_size); 1945 rsv, min_size);
1946 BUG_ON(ret); /* shouldn't happen */ 1946 BUG_ON(ret); /* shouldn't happen */
1947 trans->block_rsv = rsv; 1947 trans->block_rsv = rsv;
1948 } 1948 }
1949 1949
1950 if (ret) { 1950 if (ret) {
1951 err = ret; 1951 err = ret;
1952 goto out_trans; 1952 goto out_trans;
1953 } 1953 }
1954 1954
1955 trans->block_rsv = &root->fs_info->trans_block_rsv; 1955 trans->block_rsv = &root->fs_info->trans_block_rsv;
1956 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 1956 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1957 if (ret) { 1957 if (ret) {
1958 err = ret; 1958 err = ret;
1959 goto out_trans; 1959 goto out_trans;
1960 } 1960 }
1961 1961
1962 out_trans: 1962 out_trans:
1963 if (!trans) 1963 if (!trans)
1964 goto out_free; 1964 goto out_free;
1965 1965
1966 trans->block_rsv = &root->fs_info->trans_block_rsv; 1966 trans->block_rsv = &root->fs_info->trans_block_rsv;
1967 ret = btrfs_update_inode(trans, root, inode); 1967 ret = btrfs_update_inode(trans, root, inode);
1968 nr = trans->blocks_used; 1968 nr = trans->blocks_used;
1969 btrfs_end_transaction(trans, root); 1969 btrfs_end_transaction(trans, root);
1970 btrfs_btree_balance_dirty(root, nr); 1970 btrfs_btree_balance_dirty(root, nr);
1971 out_free: 1971 out_free:
1972 btrfs_free_path(path); 1972 btrfs_free_path(path);
1973 btrfs_free_block_rsv(root, rsv); 1973 btrfs_free_block_rsv(root, rsv);
1974 out: 1974 out:
1975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1976 &cached_state, GFP_NOFS); 1976 &cached_state, GFP_NOFS);
1977 mutex_unlock(&inode->i_mutex); 1977 mutex_unlock(&inode->i_mutex);
1978 if (ret && !err) 1978 if (ret && !err)
1979 err = ret; 1979 err = ret;
1980 return err; 1980 return err;
1981 } 1981 }
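
The mask arithmetic in btrfs_punch_hole() above and btrfs_fallocate() below rounds in opposite directions. A worked userspace example (offset and len are arbitrary unaligned values):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectorsize = 4096, mask = sectorsize - 1;
	uint64_t offset = 1000, len = 10000;	/* deliberately unaligned */

	/* punch-hole: only whole sectors inside [offset, offset + len)
	 * may be dropped, so round the start up and the end down */
	uint64_t lockstart = (offset + mask) & ~mask;
	uint64_t lockend = ((offset + len) & ~mask) - 1;

	/* fallocate goes the other way: it must cover the caller's
	 * range, so round the start down and the end up */
	uint64_t alloc_start = offset & ~mask;
	uint64_t alloc_end = (offset + len + mask) & ~mask;

	printf("hole locks [%llu, %llu], falloc covers [%llu, %llu)\n",
	       (unsigned long long)lockstart, (unsigned long long)lockend,
	       (unsigned long long)alloc_start, (unsigned long long)alloc_end);
	return 0;
}
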
1982 1982
1983 static long btrfs_fallocate(struct file *file, int mode, 1983 static long btrfs_fallocate(struct file *file, int mode,
1984 loff_t offset, loff_t len) 1984 loff_t offset, loff_t len)
1985 { 1985 {
1986 struct inode *inode = file->f_path.dentry->d_inode; 1986 struct inode *inode = file->f_path.dentry->d_inode;
1987 struct extent_state *cached_state = NULL; 1987 struct extent_state *cached_state = NULL;
1988 u64 cur_offset; 1988 u64 cur_offset;
1989 u64 last_byte; 1989 u64 last_byte;
1990 u64 alloc_start; 1990 u64 alloc_start;
1991 u64 alloc_end; 1991 u64 alloc_end;
1992 u64 alloc_hint = 0; 1992 u64 alloc_hint = 0;
1993 u64 locked_end; 1993 u64 locked_end;
1994 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1994 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1995 struct extent_map *em; 1995 struct extent_map *em;
1996 int ret; 1996 int ret;
1997 1997
1998 alloc_start = offset & ~mask; 1998 alloc_start = offset & ~mask;
1999 alloc_end = (offset + len + mask) & ~mask; 1999 alloc_end = (offset + len + mask) & ~mask;
2000 2000
2001 /* Make sure we aren't being given some crap mode */ 2001 /* Make sure we aren't being given some crap mode */
2002 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2002 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2003 return -EOPNOTSUPP; 2003 return -EOPNOTSUPP;
2004 2004
2005 if (mode & FALLOC_FL_PUNCH_HOLE) 2005 if (mode & FALLOC_FL_PUNCH_HOLE)
2006 return btrfs_punch_hole(inode, offset, len); 2006 return btrfs_punch_hole(inode, offset, len);
2007 2007
2008 /* 2008 /*
2009 * Make sure we have enough space before we do the 2009 * Make sure we have enough space before we do the
2010 * allocation. 2010 * allocation.
2011 */ 2011 */
2012 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2012 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
2013 if (ret) 2013 if (ret)
2014 return ret; 2014 return ret;
2015 2015
2016 /* 2016 /*
2017 * wait for ordered IO before we have any locks. We'll loop again 2017 * wait for ordered IO before we have any locks. We'll loop again
2018 * below with the locks held. 2018 * below with the locks held.
2019 */ 2019 */
2020 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); 2020 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2021 2021
2022 mutex_lock(&inode->i_mutex); 2022 mutex_lock(&inode->i_mutex);
2023 ret = inode_newsize_ok(inode, alloc_end); 2023 ret = inode_newsize_ok(inode, alloc_end);
2024 if (ret) 2024 if (ret)
2025 goto out; 2025 goto out;
2026 2026
2027 if (alloc_start > inode->i_size) { 2027 if (alloc_start > inode->i_size) {
2028 ret = btrfs_cont_expand(inode, i_size_read(inode), 2028 ret = btrfs_cont_expand(inode, i_size_read(inode),
2029 alloc_start); 2029 alloc_start);
2030 if (ret) 2030 if (ret)
2031 goto out; 2031 goto out;
2032 } 2032 }
2033 2033
2034 locked_end = alloc_end - 1; 2034 locked_end = alloc_end - 1;
2035 while (1) { 2035 while (1) {
2036 struct btrfs_ordered_extent *ordered; 2036 struct btrfs_ordered_extent *ordered;
2037 2037
2038 /* the extent lock is ordered inside the running 2038 /* the extent lock is ordered inside the running
2039 * transaction 2039 * transaction
2040 */ 2040 */
2041 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, 2041 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2042 locked_end, 0, &cached_state); 2042 locked_end, 0, &cached_state);
2043 ordered = btrfs_lookup_first_ordered_extent(inode, 2043 ordered = btrfs_lookup_first_ordered_extent(inode,
2044 alloc_end - 1); 2044 alloc_end - 1);
2045 if (ordered && 2045 if (ordered &&
2046 ordered->file_offset + ordered->len > alloc_start && 2046 ordered->file_offset + ordered->len > alloc_start &&
2047 ordered->file_offset < alloc_end) { 2047 ordered->file_offset < alloc_end) {
2048 btrfs_put_ordered_extent(ordered); 2048 btrfs_put_ordered_extent(ordered);
2049 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 2049 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2050 alloc_start, locked_end, 2050 alloc_start, locked_end,
2051 &cached_state, GFP_NOFS); 2051 &cached_state, GFP_NOFS);
2052 /* 2052 /*
2053 * we can't wait on the range with the transaction 2053 * we can't wait on the range with the transaction
2054 * running or with the extent lock held 2054 * running or with the extent lock held
2055 */ 2055 */
2056 btrfs_wait_ordered_range(inode, alloc_start, 2056 btrfs_wait_ordered_range(inode, alloc_start,
2057 alloc_end - alloc_start); 2057 alloc_end - alloc_start);
2058 } else { 2058 } else {
2059 if (ordered) 2059 if (ordered)
2060 btrfs_put_ordered_extent(ordered); 2060 btrfs_put_ordered_extent(ordered);
2061 break; 2061 break;
2062 } 2062 }
2063 } 2063 }
2064 2064
2065 cur_offset = alloc_start; 2065 cur_offset = alloc_start;
2066 while (1) { 2066 while (1) {
2067 u64 actual_end; 2067 u64 actual_end;
2068 2068
2069 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 2069 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2070 alloc_end - cur_offset, 0); 2070 alloc_end - cur_offset, 0);
2071 if (IS_ERR_OR_NULL(em)) { 2071 if (IS_ERR_OR_NULL(em)) {
2072 if (!em) 2072 if (!em)
2073 ret = -ENOMEM; 2073 ret = -ENOMEM;
2074 else 2074 else
2075 ret = PTR_ERR(em); 2075 ret = PTR_ERR(em);
2076 break; 2076 break;
2077 } 2077 }
2078 last_byte = min(extent_map_end(em), alloc_end); 2078 last_byte = min(extent_map_end(em), alloc_end);
2079 actual_end = min_t(u64, extent_map_end(em), offset + len); 2079 actual_end = min_t(u64, extent_map_end(em), offset + len);
2080 last_byte = (last_byte + mask) & ~mask; 2080 last_byte = (last_byte + mask) & ~mask;
2081 2081
2082 if (em->block_start == EXTENT_MAP_HOLE || 2082 if (em->block_start == EXTENT_MAP_HOLE ||
2083 (cur_offset >= inode->i_size && 2083 (cur_offset >= inode->i_size &&
2084 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 2084 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2085 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 2085 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
2086 last_byte - cur_offset, 2086 last_byte - cur_offset,
2087 1 << inode->i_blkbits, 2087 1 << inode->i_blkbits,
2088 offset + len, 2088 offset + len,
2089 &alloc_hint); 2089 &alloc_hint);
2090 2090
2091 if (ret < 0) { 2091 if (ret < 0) {
2092 free_extent_map(em); 2092 free_extent_map(em);
2093 break; 2093 break;
2094 } 2094 }
2095 } else if (actual_end > inode->i_size && 2095 } else if (actual_end > inode->i_size &&
2096 !(mode & FALLOC_FL_KEEP_SIZE)) { 2096 !(mode & FALLOC_FL_KEEP_SIZE)) {
2097 /* 2097 /*
2098 * We didn't need to allocate any more space, but we 2098 * We didn't need to allocate any more space, but we
2099 * still extended the size of the file so we need to 2099 * still extended the size of the file so we need to
2100 * update i_size. 2100 * update i_size.
2101 */ 2101 */
2102 inode->i_ctime = CURRENT_TIME; 2102 inode->i_ctime = CURRENT_TIME;
2103 i_size_write(inode, actual_end); 2103 i_size_write(inode, actual_end);
2104 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2104 btrfs_ordered_update_i_size(inode, actual_end, NULL);
2105 } 2105 }
2106 free_extent_map(em); 2106 free_extent_map(em);
2107 2107
2108 cur_offset = last_byte; 2108 cur_offset = last_byte;
2109 if (cur_offset >= alloc_end) { 2109 if (cur_offset >= alloc_end) {
2110 ret = 0; 2110 ret = 0;
2111 break; 2111 break;
2112 } 2112 }
2113 } 2113 }
2114 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2114 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2115 &cached_state, GFP_NOFS); 2115 &cached_state, GFP_NOFS);
2116 out: 2116 out:
2117 mutex_unlock(&inode->i_mutex); 2117 mutex_unlock(&inode->i_mutex);
2118 /* Let go of our reservation. */ 2118 /* Let go of our reservation. */
2119 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2119 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
2120 return ret; 2120 return ret;
2121 } 2121 }
2122 2122
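/*
 * A small sketch of the sector-alignment arithmetic btrfs_fallocate()
 * starts with: with mask = sectorsize - 1, "offset & ~mask" rounds the
 * start down to a sector boundary and "(offset + len + mask) & ~mask"
 * rounds the end up.  The 4K sector size and offsets are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sectorsize = 4096, mask = sectorsize - 1;
	uint64_t offset = 5000, len = 3000;
	uint64_t alloc_start = offset & ~mask;              /* 4096 */
	uint64_t alloc_end = (offset + len + mask) & ~mask; /* 8192 */

	printf("aligned range [%llu, %llu)\n",
	       (unsigned long long)alloc_start,
	       (unsigned long long)alloc_end);
	return 0;
}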
2123 static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) 2123 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2124 { 2124 {
2125 struct btrfs_root *root = BTRFS_I(inode)->root; 2125 struct btrfs_root *root = BTRFS_I(inode)->root;
2126 struct extent_map *em; 2126 struct extent_map *em;
2127 struct extent_state *cached_state = NULL; 2127 struct extent_state *cached_state = NULL;
2128 u64 lockstart = *offset; 2128 u64 lockstart = *offset;
2129 u64 lockend = i_size_read(inode); 2129 u64 lockend = i_size_read(inode);
2130 u64 start = *offset; 2130 u64 start = *offset;
2131 u64 orig_start = *offset; 2131 u64 orig_start = *offset;
2132 u64 len = i_size_read(inode); 2132 u64 len = i_size_read(inode);
2133 u64 last_end = 0; 2133 u64 last_end = 0;
2134 int ret = 0; 2134 int ret = 0;
2135 2135
2136 lockend = max_t(u64, root->sectorsize, lockend); 2136 lockend = max_t(u64, root->sectorsize, lockend);
2137 if (lockend <= lockstart) 2137 if (lockend <= lockstart)
2138 lockend = lockstart + root->sectorsize; 2138 lockend = lockstart + root->sectorsize;
2139 2139
2140 len = lockend - lockstart + 1; 2140 len = lockend - lockstart + 1;
2141 2141
2142 len = max_t(u64, len, root->sectorsize); 2142 len = max_t(u64, len, root->sectorsize);
2143 if (inode->i_size == 0) 2143 if (inode->i_size == 0)
2144 return -ENXIO; 2144 return -ENXIO;
2145 2145
2146 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, 2146 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2147 &cached_state); 2147 &cached_state);
2148 2148
2149 /* 2149 /*
2150 * Delalloc is such a pain. If we have a hole and we have pending 2150 * Delalloc is such a pain. If we have a hole and we have pending
2151 * delalloc for a portion of the hole we will get back a hole that 2151 * delalloc for a portion of the hole we will get back a hole that
2152 * exists for the entire range since it hasn't been actually written 2152 * exists for the entire range since it hasn't been actually written
2153 * yet. So to take care of this case we need to look for an extent just 2153 * yet. So to take care of this case we need to look for an extent just
2154 * before the position we want in case there is outstanding delalloc 2154 * before the position we want in case there is outstanding delalloc
2155 * going on here. 2155 * going on here.
2156 */ 2156 */
2157 if (origin == SEEK_HOLE && start != 0) { 2157 if (whence == SEEK_HOLE && start != 0) {
2158 if (start <= root->sectorsize) 2158 if (start <= root->sectorsize)
2159 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, 2159 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
2160 root->sectorsize, 0); 2160 root->sectorsize, 0);
2161 else 2161 else
2162 em = btrfs_get_extent_fiemap(inode, NULL, 0, 2162 em = btrfs_get_extent_fiemap(inode, NULL, 0,
2163 start - root->sectorsize, 2163 start - root->sectorsize,
2164 root->sectorsize, 0); 2164 root->sectorsize, 0);
2165 if (IS_ERR(em)) { 2165 if (IS_ERR(em)) {
2166 ret = PTR_ERR(em); 2166 ret = PTR_ERR(em);
2167 goto out; 2167 goto out;
2168 } 2168 }
2169 last_end = em->start + em->len; 2169 last_end = em->start + em->len;
2170 if (em->block_start == EXTENT_MAP_DELALLOC) 2170 if (em->block_start == EXTENT_MAP_DELALLOC)
2171 last_end = min_t(u64, last_end, inode->i_size); 2171 last_end = min_t(u64, last_end, inode->i_size);
2172 free_extent_map(em); 2172 free_extent_map(em);
2173 } 2173 }
2174 2174
2175 while (1) { 2175 while (1) {
2176 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); 2176 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
2177 if (IS_ERR(em)) { 2177 if (IS_ERR(em)) {
2178 ret = PTR_ERR(em); 2178 ret = PTR_ERR(em);
2179 break; 2179 break;
2180 } 2180 }
2181 2181
2182 if (em->block_start == EXTENT_MAP_HOLE) { 2182 if (em->block_start == EXTENT_MAP_HOLE) {
2183 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 2183 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2184 if (last_end <= orig_start) { 2184 if (last_end <= orig_start) {
2185 free_extent_map(em); 2185 free_extent_map(em);
2186 ret = -ENXIO; 2186 ret = -ENXIO;
2187 break; 2187 break;
2188 } 2188 }
2189 } 2189 }
2190 2190
2191 if (origin == SEEK_HOLE) { 2191 if (whence == SEEK_HOLE) {
2192 *offset = start; 2192 *offset = start;
2193 free_extent_map(em); 2193 free_extent_map(em);
2194 break; 2194 break;
2195 } 2195 }
2196 } else { 2196 } else {
2197 if (origin == SEEK_DATA) { 2197 if (whence == SEEK_DATA) {
2198 if (em->block_start == EXTENT_MAP_DELALLOC) { 2198 if (em->block_start == EXTENT_MAP_DELALLOC) {
2199 if (start >= inode->i_size) { 2199 if (start >= inode->i_size) {
2200 free_extent_map(em); 2200 free_extent_map(em);
2201 ret = -ENXIO; 2201 ret = -ENXIO;
2202 break; 2202 break;
2203 } 2203 }
2204 } 2204 }
2205 2205
2206 *offset = start; 2206 *offset = start;
2207 free_extent_map(em); 2207 free_extent_map(em);
2208 break; 2208 break;
2209 } 2209 }
2210 } 2210 }
2211 2211
2212 start = em->start + em->len; 2212 start = em->start + em->len;
2213 last_end = em->start + em->len; 2213 last_end = em->start + em->len;
2214 2214
2215 if (em->block_start == EXTENT_MAP_DELALLOC) 2215 if (em->block_start == EXTENT_MAP_DELALLOC)
2216 last_end = min_t(u64, last_end, inode->i_size); 2216 last_end = min_t(u64, last_end, inode->i_size);
2217 2217
2218 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 2218 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2219 free_extent_map(em); 2219 free_extent_map(em);
2220 ret = -ENXIO; 2220 ret = -ENXIO;
2221 break; 2221 break;
2222 } 2222 }
2223 free_extent_map(em); 2223 free_extent_map(em);
2224 cond_resched(); 2224 cond_resched();
2225 } 2225 }
2226 if (!ret) 2226 if (!ret)
2227 *offset = min(*offset, inode->i_size); 2227 *offset = min(*offset, inode->i_size);
2228 out: 2228 out:
2229 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2229 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2230 &cached_state, GFP_NOFS); 2230 &cached_state, GFP_NOFS);
2231 return ret; 2231 return ret;
2232 } 2232 }
2233 2233
2234 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) 2234 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2235 { 2235 {
2236 struct inode *inode = file->f_mapping->host; 2236 struct inode *inode = file->f_mapping->host;
2237 int ret; 2237 int ret;
2238 2238
2239 mutex_lock(&inode->i_mutex); 2239 mutex_lock(&inode->i_mutex);
2240 switch (origin) { 2240 switch (whence) {
2241 case SEEK_END: 2241 case SEEK_END:
2242 case SEEK_CUR: 2242 case SEEK_CUR:
2243 offset = generic_file_llseek(file, offset, origin); 2243 offset = generic_file_llseek(file, offset, whence);
2244 goto out; 2244 goto out;
2245 case SEEK_DATA: 2245 case SEEK_DATA:
2246 case SEEK_HOLE: 2246 case SEEK_HOLE:
2247 if (offset >= i_size_read(inode)) { 2247 if (offset >= i_size_read(inode)) {
2248 mutex_unlock(&inode->i_mutex); 2248 mutex_unlock(&inode->i_mutex);
2249 return -ENXIO; 2249 return -ENXIO;
2250 } 2250 }
2251 2251
2252 ret = find_desired_extent(inode, &offset, origin); 2252 ret = find_desired_extent(inode, &offset, whence);
2253 if (ret) { 2253 if (ret) {
2254 mutex_unlock(&inode->i_mutex); 2254 mutex_unlock(&inode->i_mutex);
2255 return ret; 2255 return ret;
2256 } 2256 }
2257 } 2257 }
2258 2258
2259 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { 2259 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
2260 offset = -EINVAL; 2260 offset = -EINVAL;
2261 goto out; 2261 goto out;
2262 } 2262 }
2263 if (offset > inode->i_sb->s_maxbytes) { 2263 if (offset > inode->i_sb->s_maxbytes) {
2264 offset = -EINVAL; 2264 offset = -EINVAL;
2265 goto out; 2265 goto out;
2266 } 2266 }
2267 2267
2268 /* Special lock needed here? */ 2268 /* Special lock needed here? */
2269 if (offset != file->f_pos) { 2269 if (offset != file->f_pos) {
2270 file->f_pos = offset; 2270 file->f_pos = offset;
2271 file->f_version = 0; 2271 file->f_version = 0;
2272 } 2272 }
2273 out: 2273 out:
2274 mutex_unlock(&inode->i_mutex); 2274 mutex_unlock(&inode->i_mutex);
2275 return offset; 2275 return offset;
2276 } 2276 }
2277 2277
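/*
 * User-space view of the SEEK_DATA/SEEK_HOLE semantics that
 * btrfs_file_llseek() implements through find_desired_extent(): walk
 * a (possibly sparse) file and print each data segment.  A hedged
 * sketch; the file name is hypothetical, and SEEK_DATA past EOF fails
 * with errno == ENXIO, which ends the loop.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("sparse.img", O_RDONLY);
	off_t end, data = 0;

	if (fd < 0)
		return 1;
	end = lseek(fd, 0, SEEK_END);
	while (data < end) {
		off_t hole;

		data = lseek(fd, data, SEEK_DATA);
		if (data < 0)
			break;		/* trailing hole: ENXIO */
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data [%lld, %lld)\n", (long long)data,
		       (long long)hole);
		data = hole;
	}
	close(fd);
	return 0;
}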
2278 const struct file_operations btrfs_file_operations = { 2278 const struct file_operations btrfs_file_operations = {
2279 .llseek = btrfs_file_llseek, 2279 .llseek = btrfs_file_llseek,
2280 .read = do_sync_read, 2280 .read = do_sync_read,
2281 .write = do_sync_write, 2281 .write = do_sync_write,
2282 .aio_read = generic_file_aio_read, 2282 .aio_read = generic_file_aio_read,
2283 .splice_read = generic_file_splice_read, 2283 .splice_read = generic_file_splice_read,
2284 .aio_write = btrfs_file_aio_write, 2284 .aio_write = btrfs_file_aio_write,
2285 .mmap = btrfs_file_mmap, 2285 .mmap = btrfs_file_mmap,
2286 .open = generic_file_open, 2286 .open = generic_file_open,
2287 .release = btrfs_release_file, 2287 .release = btrfs_release_file,
2288 .fsync = btrfs_sync_file, 2288 .fsync = btrfs_sync_file,
2289 .fallocate = btrfs_fallocate, 2289 .fallocate = btrfs_fallocate,
2290 .unlocked_ioctl = btrfs_ioctl, 2290 .unlocked_ioctl = btrfs_ioctl,
2291 #ifdef CONFIG_COMPAT 2291 #ifdef CONFIG_COMPAT
2292 .compat_ioctl = btrfs_ioctl, 2292 .compat_ioctl = btrfs_ioctl,
2293 #endif 2293 #endif
2294 }; 2294 };
2295 2295
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/spinlock.h> 3 #include <linux/spinlock.h>
4 #include <linux/fs_struct.h> 4 #include <linux/fs_struct.h>
5 #include <linux/namei.h> 5 #include <linux/namei.h>
6 #include <linux/slab.h> 6 #include <linux/slab.h>
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 8
9 #include "super.h" 9 #include "super.h"
10 #include "mds_client.h" 10 #include "mds_client.h"
11 11
12 /* 12 /*
13 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
14 * rename, etc. 14 * rename, etc.
15 */ 15 */
16 16
17 /* 17 /*
18 * Ceph MDS operations are specified in terms of a base ino and 18 * Ceph MDS operations are specified in terms of a base ino and
19 * relative path. Thus, the client can specify an operation on a 19 * relative path. Thus, the client can specify an operation on a
20 * specific inode (e.g., a getattr due to fstat(2)), or as a path 20 * specific inode (e.g., a getattr due to fstat(2)), or as a path
21 * relative to, say, the root directory. 21 * relative to, say, the root directory.
22 * 22 *
23 * Normally, we limit ourselves to strict inode ops (no path component) 23 * Normally, we limit ourselves to strict inode ops (no path component)
24 * or dentry operations (a single path component relative to an ino). The 24 * or dentry operations (a single path component relative to an ino). The
25 * exception to this is open_root_dentry(), which will open the mount 25 * exception to this is open_root_dentry(), which will open the mount
26 * point by name. 26 * point by name.
27 */ 27 */
28 28
29 const struct inode_operations ceph_dir_iops; 29 const struct inode_operations ceph_dir_iops;
30 const struct file_operations ceph_dir_fops; 30 const struct file_operations ceph_dir_fops;
31 const struct dentry_operations ceph_dentry_ops; 31 const struct dentry_operations ceph_dentry_ops;
32 32
33 /* 33 /*
34 * Initialize ceph dentry state. 34 * Initialize ceph dentry state.
35 */ 35 */
36 int ceph_init_dentry(struct dentry *dentry) 36 int ceph_init_dentry(struct dentry *dentry)
37 { 37 {
38 struct ceph_dentry_info *di; 38 struct ceph_dentry_info *di;
39 39
40 if (dentry->d_fsdata) 40 if (dentry->d_fsdata)
41 return 0; 41 return 0;
42 42
43 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); 43 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
44 if (!di) 44 if (!di)
45 return -ENOMEM; /* oh well */ 45 return -ENOMEM; /* oh well */
46 46
47 spin_lock(&dentry->d_lock); 47 spin_lock(&dentry->d_lock);
48 if (dentry->d_fsdata) { 48 if (dentry->d_fsdata) {
49 /* lost a race */ 49 /* lost a race */
50 kmem_cache_free(ceph_dentry_cachep, di); 50 kmem_cache_free(ceph_dentry_cachep, di);
51 goto out_unlock; 51 goto out_unlock;
52 } 52 }
53 53
54 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) 54 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
55 d_set_d_op(dentry, &ceph_dentry_ops); 55 d_set_d_op(dentry, &ceph_dentry_ops);
56 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) 56 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
57 d_set_d_op(dentry, &ceph_snapdir_dentry_ops); 57 d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
58 else 58 else
59 d_set_d_op(dentry, &ceph_snap_dentry_ops); 59 d_set_d_op(dentry, &ceph_snap_dentry_ops);
60 60
61 di->dentry = dentry; 61 di->dentry = dentry;
62 di->lease_session = NULL; 62 di->lease_session = NULL;
63 dentry->d_time = jiffies; 63 dentry->d_time = jiffies;
64 /* avoid reordering d_fsdata setup so that the check above is safe */ 64 /* avoid reordering d_fsdata setup so that the check above is safe */
65 smp_mb(); 65 smp_mb();
66 dentry->d_fsdata = di; 66 dentry->d_fsdata = di;
67 ceph_dentry_lru_add(dentry); 67 ceph_dentry_lru_add(dentry);
68 out_unlock: 68 out_unlock:
69 spin_unlock(&dentry->d_lock); 69 spin_unlock(&dentry->d_lock);
70 return 0; 70 return 0;
71 } 71 }
72 72
73 struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) 73 struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
74 { 74 {
75 struct inode *inode = NULL; 75 struct inode *inode = NULL;
76 76
77 if (!dentry) 77 if (!dentry)
78 return NULL; 78 return NULL;
79 79
80 spin_lock(&dentry->d_lock); 80 spin_lock(&dentry->d_lock);
81 if (!IS_ROOT(dentry)) { 81 if (!IS_ROOT(dentry)) {
82 inode = dentry->d_parent->d_inode; 82 inode = dentry->d_parent->d_inode;
83 ihold(inode); 83 ihold(inode);
84 } 84 }
85 spin_unlock(&dentry->d_lock); 85 spin_unlock(&dentry->d_lock);
86 return inode; 86 return inode;
87 } 87 }
88 88
89 89
90 /* 90 /*
91 * for readdir, we encode the directory frag and offset within that 91 * for readdir, we encode the directory frag and offset within that
92 * frag into f_pos. 92 * frag into f_pos.
93 */ 93 */
94 static unsigned fpos_frag(loff_t p) 94 static unsigned fpos_frag(loff_t p)
95 { 95 {
96 return p >> 32; 96 return p >> 32;
97 } 97 }
98 static unsigned fpos_off(loff_t p) 98 static unsigned fpos_off(loff_t p)
99 { 99 {
100 return p & 0xffffffff; 100 return p & 0xffffffff;
101 } 101 }
102 102
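/*
 * Illustrative inverse of the two decoders above: a readdir position
 * packs the frag into the high 32 bits and the within-frag offset
 * into the low 32 bits (the composition the ceph_make_fpos() calls
 * later in this file rely on).  A user-space sketch:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t fpos_make(unsigned frag, unsigned off)
{
	return ((uint64_t)frag << 32) | off;
}

int main(void)
{
	uint64_t p = fpos_make(0x2a, 7);

	/* round-trips with fpos_frag()/fpos_off() above */
	printf("frag=%x off=%u\n", (unsigned)(p >> 32),
	       (unsigned)(p & 0xffffffff));
	return 0;
}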
103 /* 103 /*
104 * When possible, we try to satisfy a readdir by peeking at the 104 * When possible, we try to satisfy a readdir by peeking at the
105 * dcache. We make this work by carefully ordering dentries on 105 * dcache. We make this work by carefully ordering dentries on
106 * d_u.d_child when we initially get results back from the MDS, and 106 * d_u.d_child when we initially get results back from the MDS, and
107 * falling back to a "normal" sync readdir if any dentries in the dir 107 * falling back to a "normal" sync readdir if any dentries in the dir
108 * are dropped. 108 * are dropped.
109 * 109 *
110 * D_COMPLETE indicates we have all dentries in the dir. It is 110 * D_COMPLETE indicates we have all dentries in the dir. It is
111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
112 * the MDS if/when the directory is modified). 112 * the MDS if/when the directory is modified).
113 */ 113 */
114 static int __dcache_readdir(struct file *filp, 114 static int __dcache_readdir(struct file *filp,
115 void *dirent, filldir_t filldir) 115 void *dirent, filldir_t filldir)
116 { 116 {
117 struct ceph_file_info *fi = filp->private_data; 117 struct ceph_file_info *fi = filp->private_data;
118 struct dentry *parent = filp->f_dentry; 118 struct dentry *parent = filp->f_dentry;
119 struct inode *dir = parent->d_inode; 119 struct inode *dir = parent->d_inode;
120 struct list_head *p; 120 struct list_head *p;
121 struct dentry *dentry, *last; 121 struct dentry *dentry, *last;
122 struct ceph_dentry_info *di; 122 struct ceph_dentry_info *di;
123 int err = 0; 123 int err = 0;
124 124
125 /* claim ref on last dentry we returned */ 125 /* claim ref on last dentry we returned */
126 last = fi->dentry; 126 last = fi->dentry;
127 fi->dentry = NULL; 127 fi->dentry = NULL;
128 128
129 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 129 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
130 last); 130 last);
131 131
132 spin_lock(&parent->d_lock); 132 spin_lock(&parent->d_lock);
133 133
134 /* start at beginning? */ 134 /* start at beginning? */
135 if (filp->f_pos == 2 || last == NULL || 135 if (filp->f_pos == 2 || last == NULL ||
136 filp->f_pos < ceph_dentry(last)->offset) { 136 filp->f_pos < ceph_dentry(last)->offset) {
137 if (list_empty(&parent->d_subdirs)) 137 if (list_empty(&parent->d_subdirs))
138 goto out_unlock; 138 goto out_unlock;
139 p = parent->d_subdirs.prev; 139 p = parent->d_subdirs.prev;
140 dout(" initial p %p/%p\n", p->prev, p->next); 140 dout(" initial p %p/%p\n", p->prev, p->next);
141 } else { 141 } else {
142 p = last->d_u.d_child.prev; 142 p = last->d_u.d_child.prev;
143 } 143 }
144 144
145 more: 145 more:
146 dentry = list_entry(p, struct dentry, d_u.d_child); 146 dentry = list_entry(p, struct dentry, d_u.d_child);
147 di = ceph_dentry(dentry); 147 di = ceph_dentry(dentry);
148 while (1) { 148 while (1) {
149 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, 149 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
150 d_unhashed(dentry) ? "!hashed" : "hashed", 150 d_unhashed(dentry) ? "!hashed" : "hashed",
151 parent->d_subdirs.prev, parent->d_subdirs.next); 151 parent->d_subdirs.prev, parent->d_subdirs.next);
152 if (p == &parent->d_subdirs) { 152 if (p == &parent->d_subdirs) {
153 fi->flags |= CEPH_F_ATEND; 153 fi->flags |= CEPH_F_ATEND;
154 goto out_unlock; 154 goto out_unlock;
155 } 155 }
156 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 156 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
157 if (!d_unhashed(dentry) && dentry->d_inode && 157 if (!d_unhashed(dentry) && dentry->d_inode &&
158 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 158 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
159 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 159 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
160 filp->f_pos <= di->offset) 160 filp->f_pos <= di->offset)
161 break; 161 break;
162 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 162 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
163 dentry->d_name.len, dentry->d_name.name, di->offset, 163 dentry->d_name.len, dentry->d_name.name, di->offset,
164 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 164 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
165 !dentry->d_inode ? " null" : ""); 165 !dentry->d_inode ? " null" : "");
166 spin_unlock(&dentry->d_lock); 166 spin_unlock(&dentry->d_lock);
167 p = p->prev; 167 p = p->prev;
168 dentry = list_entry(p, struct dentry, d_u.d_child); 168 dentry = list_entry(p, struct dentry, d_u.d_child);
169 di = ceph_dentry(dentry); 169 di = ceph_dentry(dentry);
170 } 170 }
171 171
172 dget_dlock(dentry); 172 dget_dlock(dentry);
173 spin_unlock(&dentry->d_lock); 173 spin_unlock(&dentry->d_lock);
174 spin_unlock(&parent->d_lock); 174 spin_unlock(&parent->d_lock);
175 175
176 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 176 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
177 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 177 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
178 filp->f_pos = di->offset; 178 filp->f_pos = di->offset;
179 err = filldir(dirent, dentry->d_name.name, 179 err = filldir(dirent, dentry->d_name.name,
180 dentry->d_name.len, di->offset, 180 dentry->d_name.len, di->offset,
181 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), 181 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
182 dentry->d_inode->i_mode >> 12); 182 dentry->d_inode->i_mode >> 12);
183 183
184 if (last) { 184 if (last) {
185 if (err < 0) { 185 if (err < 0) {
186 /* remember our position */ 186 /* remember our position */
187 fi->dentry = last; 187 fi->dentry = last;
188 fi->next_offset = di->offset; 188 fi->next_offset = di->offset;
189 } else { 189 } else {
190 dput(last); 190 dput(last);
191 } 191 }
192 } 192 }
193 last = dentry; 193 last = dentry;
194 194
195 if (err < 0) 195 if (err < 0)
196 goto out; 196 goto out;
197 197
198 filp->f_pos++; 198 filp->f_pos++;
199 199
200 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 200 /* make sure a dentry wasn't dropped while we didn't have parent lock */
201 if (!ceph_dir_test_complete(dir)) { 201 if (!ceph_dir_test_complete(dir)) {
202 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); 202 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
203 err = -EAGAIN; 203 err = -EAGAIN;
204 goto out; 204 goto out;
205 } 205 }
206 206
207 spin_lock(&parent->d_lock); 207 spin_lock(&parent->d_lock);
208 p = p->prev; /* advance to next dentry */ 208 p = p->prev; /* advance to next dentry */
209 goto more; 209 goto more;
210 210
211 out_unlock: 211 out_unlock:
212 spin_unlock(&parent->d_lock); 212 spin_unlock(&parent->d_lock);
213 out: 213 out:
214 if (last) 214 if (last)
215 dput(last); 215 dput(last);
216 return err; 216 return err;
217 } 217 }
218 218
219 /* 219 /*
220 * make note of the last dentry we read, so we can 220 * make note of the last dentry we read, so we can
221 * continue at the same lexicographical point, 221 * continue at the same lexicographical point,
222 * regardless of what dir changes take place on the 222 * regardless of what dir changes take place on the
223 * server. 223 * server.
224 */ 224 */
225 static int note_last_dentry(struct ceph_file_info *fi, const char *name, 225 static int note_last_dentry(struct ceph_file_info *fi, const char *name,
226 int len) 226 int len)
227 { 227 {
228 kfree(fi->last_name); 228 kfree(fi->last_name);
229 fi->last_name = kmalloc(len+1, GFP_NOFS); 229 fi->last_name = kmalloc(len+1, GFP_NOFS);
230 if (!fi->last_name) 230 if (!fi->last_name)
231 return -ENOMEM; 231 return -ENOMEM;
232 memcpy(fi->last_name, name, len); 232 memcpy(fi->last_name, name, len);
233 fi->last_name[len] = 0; 233 fi->last_name[len] = 0;
234 dout("note_last_dentry '%s'\n", fi->last_name); 234 dout("note_last_dentry '%s'\n", fi->last_name);
235 return 0; 235 return 0;
236 } 236 }
237 237
238 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) 238 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
239 { 239 {
240 struct ceph_file_info *fi = filp->private_data; 240 struct ceph_file_info *fi = filp->private_data;
241 struct inode *inode = filp->f_dentry->d_inode; 241 struct inode *inode = filp->f_dentry->d_inode;
242 struct ceph_inode_info *ci = ceph_inode(inode); 242 struct ceph_inode_info *ci = ceph_inode(inode);
243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
244 struct ceph_mds_client *mdsc = fsc->mdsc; 244 struct ceph_mds_client *mdsc = fsc->mdsc;
245 unsigned frag = fpos_frag(filp->f_pos); 245 unsigned frag = fpos_frag(filp->f_pos);
246 int off = fpos_off(filp->f_pos); 246 int off = fpos_off(filp->f_pos);
247 int err; 247 int err;
248 u32 ftype; 248 u32 ftype;
249 struct ceph_mds_reply_info_parsed *rinfo; 249 struct ceph_mds_reply_info_parsed *rinfo;
250 const int max_entries = fsc->mount_options->max_readdir; 250 const int max_entries = fsc->mount_options->max_readdir;
251 const int max_bytes = fsc->mount_options->max_readdir_bytes; 251 const int max_bytes = fsc->mount_options->max_readdir_bytes;
252 252
253 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 253 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
254 if (fi->flags & CEPH_F_ATEND) 254 if (fi->flags & CEPH_F_ATEND)
255 return 0; 255 return 0;
256 256
257 /* always start with . and .. */ 257 /* always start with . and .. */
258 if (filp->f_pos == 0) { 258 if (filp->f_pos == 0) {
259 /* note dir version at start of readdir so we can tell 259 /* note dir version at start of readdir so we can tell
260 * if any dentries get dropped */ 260 * if any dentries get dropped */
261 fi->dir_release_count = ci->i_release_count; 261 fi->dir_release_count = ci->i_release_count;
262 262
263 dout("readdir off 0 -> '.'\n"); 263 dout("readdir off 0 -> '.'\n");
264 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 264 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
265 ceph_translate_ino(inode->i_sb, inode->i_ino), 265 ceph_translate_ino(inode->i_sb, inode->i_ino),
266 inode->i_mode >> 12) < 0) 266 inode->i_mode >> 12) < 0)
267 return 0; 267 return 0;
268 filp->f_pos = 1; 268 filp->f_pos = 1;
269 off = 1; 269 off = 1;
270 } 270 }
271 if (filp->f_pos == 1) { 271 if (filp->f_pos == 1) {
272 ino_t ino = parent_ino(filp->f_dentry); 272 ino_t ino = parent_ino(filp->f_dentry);
273 dout("readdir off 1 -> '..'\n"); 273 dout("readdir off 1 -> '..'\n");
274 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 274 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
275 ceph_translate_ino(inode->i_sb, ino), 275 ceph_translate_ino(inode->i_sb, ino),
276 inode->i_mode >> 12) < 0) 276 inode->i_mode >> 12) < 0)
277 return 0; 277 return 0;
278 filp->f_pos = 2; 278 filp->f_pos = 2;
279 off = 2; 279 off = 2;
280 } 280 }
281 281
282 /* can we use the dcache? */ 282 /* can we use the dcache? */
283 spin_lock(&ci->i_ceph_lock); 283 spin_lock(&ci->i_ceph_lock);
284 if ((filp->f_pos == 2 || fi->dentry) && 284 if ((filp->f_pos == 2 || fi->dentry) &&
285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
286 ceph_snap(inode) != CEPH_SNAPDIR && 286 ceph_snap(inode) != CEPH_SNAPDIR &&
287 ceph_dir_test_complete(inode) && 287 ceph_dir_test_complete(inode) &&
288 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 288 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
289 spin_unlock(&ci->i_ceph_lock); 289 spin_unlock(&ci->i_ceph_lock);
290 err = __dcache_readdir(filp, dirent, filldir); 290 err = __dcache_readdir(filp, dirent, filldir);
291 if (err != -EAGAIN) 291 if (err != -EAGAIN)
292 return err; 292 return err;
293 } else { 293 } else {
294 spin_unlock(&ci->i_ceph_lock); 294 spin_unlock(&ci->i_ceph_lock);
295 } 295 }
296 if (fi->dentry) { 296 if (fi->dentry) {
297 err = note_last_dentry(fi, fi->dentry->d_name.name, 297 err = note_last_dentry(fi, fi->dentry->d_name.name,
298 fi->dentry->d_name.len); 298 fi->dentry->d_name.len);
299 if (err) 299 if (err)
300 return err; 300 return err;
301 dput(fi->dentry); 301 dput(fi->dentry);
302 fi->dentry = NULL; 302 fi->dentry = NULL;
303 } 303 }
304 304
305 /* proceed with a normal readdir */ 305 /* proceed with a normal readdir */
306 306
307 more: 307 more:
308 /* do we have the correct frag content buffered? */ 308 /* do we have the correct frag content buffered? */
309 if (fi->frag != frag || fi->last_readdir == NULL) { 309 if (fi->frag != frag || fi->last_readdir == NULL) {
310 struct ceph_mds_request *req; 310 struct ceph_mds_request *req;
311 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 311 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
312 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 312 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
313 313
314 /* discard old result, if any */ 314 /* discard old result, if any */
315 if (fi->last_readdir) { 315 if (fi->last_readdir) {
316 ceph_mdsc_put_request(fi->last_readdir); 316 ceph_mdsc_put_request(fi->last_readdir);
317 fi->last_readdir = NULL; 317 fi->last_readdir = NULL;
318 } 318 }
319 319
320 /* requery frag tree, as the frag topology may have changed */ 320 /* requery frag tree, as the frag topology may have changed */
321 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); 321 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
322 322
323 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 323 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
324 ceph_vinop(inode), frag, fi->last_name); 324 ceph_vinop(inode), frag, fi->last_name);
325 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 325 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
326 if (IS_ERR(req)) 326 if (IS_ERR(req))
327 return PTR_ERR(req); 327 return PTR_ERR(req);
328 req->r_inode = inode; 328 req->r_inode = inode;
329 ihold(inode); 329 ihold(inode);
330 req->r_dentry = dget(filp->f_dentry); 330 req->r_dentry = dget(filp->f_dentry);
331 /* hints to request -> mds selection code */ 331 /* hints to request -> mds selection code */
332 req->r_direct_mode = USE_AUTH_MDS; 332 req->r_direct_mode = USE_AUTH_MDS;
333 req->r_direct_hash = ceph_frag_value(frag); 333 req->r_direct_hash = ceph_frag_value(frag);
334 req->r_direct_is_hash = true; 334 req->r_direct_is_hash = true;
335 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 335 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
336 req->r_readdir_offset = fi->next_offset; 336 req->r_readdir_offset = fi->next_offset;
337 req->r_args.readdir.frag = cpu_to_le32(frag); 337 req->r_args.readdir.frag = cpu_to_le32(frag);
338 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 338 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
339 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); 339 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
340 req->r_num_caps = max_entries + 1; 340 req->r_num_caps = max_entries + 1;
341 err = ceph_mdsc_do_request(mdsc, NULL, req); 341 err = ceph_mdsc_do_request(mdsc, NULL, req);
342 if (err < 0) { 342 if (err < 0) {
343 ceph_mdsc_put_request(req); 343 ceph_mdsc_put_request(req);
344 return err; 344 return err;
345 } 345 }
346 dout("readdir got and parsed readdir result=%d" 346 dout("readdir got and parsed readdir result=%d"
347 " on frag %x, end=%d, complete=%d\n", err, frag, 347 " on frag %x, end=%d, complete=%d\n", err, frag,
348 (int)req->r_reply_info.dir_end, 348 (int)req->r_reply_info.dir_end,
349 (int)req->r_reply_info.dir_complete); 349 (int)req->r_reply_info.dir_complete);
350 350
351 if (!req->r_did_prepopulate) { 351 if (!req->r_did_prepopulate) {
352 dout("readdir !did_prepopulate"); 352 dout("readdir !did_prepopulate");
353 fi->dir_release_count--; /* preclude D_COMPLETE */ 353 fi->dir_release_count--; /* preclude D_COMPLETE */
354 } 354 }
355 355
356 /* note next offset and last dentry name */ 356 /* note next offset and last dentry name */
357 fi->offset = fi->next_offset; 357 fi->offset = fi->next_offset;
358 fi->last_readdir = req; 358 fi->last_readdir = req;
359 359
360 if (req->r_reply_info.dir_end) { 360 if (req->r_reply_info.dir_end) {
361 kfree(fi->last_name); 361 kfree(fi->last_name);
362 fi->last_name = NULL; 362 fi->last_name = NULL;
363 if (ceph_frag_is_rightmost(frag)) 363 if (ceph_frag_is_rightmost(frag))
364 fi->next_offset = 2; 364 fi->next_offset = 2;
365 else 365 else
366 fi->next_offset = 0; 366 fi->next_offset = 0;
367 } else { 367 } else {
368 rinfo = &req->r_reply_info; 368 rinfo = &req->r_reply_info;
369 err = note_last_dentry(fi, 369 err = note_last_dentry(fi,
370 rinfo->dir_dname[rinfo->dir_nr-1], 370 rinfo->dir_dname[rinfo->dir_nr-1],
371 rinfo->dir_dname_len[rinfo->dir_nr-1]); 371 rinfo->dir_dname_len[rinfo->dir_nr-1]);
372 if (err) 372 if (err)
373 return err; 373 return err;
374 fi->next_offset += rinfo->dir_nr; 374 fi->next_offset += rinfo->dir_nr;
375 } 375 }
376 } 376 }
377 377
378 rinfo = &fi->last_readdir->r_reply_info; 378 rinfo = &fi->last_readdir->r_reply_info;
379 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 379 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
380 rinfo->dir_nr, off, fi->offset); 380 rinfo->dir_nr, off, fi->offset);
381 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 381 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
382 u64 pos = ceph_make_fpos(frag, off); 382 u64 pos = ceph_make_fpos(frag, off);
383 struct ceph_mds_reply_inode *in = 383 struct ceph_mds_reply_inode *in =
384 rinfo->dir_in[off - fi->offset].in; 384 rinfo->dir_in[off - fi->offset].in;
385 struct ceph_vino vino; 385 struct ceph_vino vino;
386 ino_t ino; 386 ino_t ino;
387 387
388 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 388 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
389 off, off - fi->offset, rinfo->dir_nr, pos, 389 off, off - fi->offset, rinfo->dir_nr, pos,
390 rinfo->dir_dname_len[off - fi->offset], 390 rinfo->dir_dname_len[off - fi->offset],
391 rinfo->dir_dname[off - fi->offset], in); 391 rinfo->dir_dname[off - fi->offset], in);
392 BUG_ON(!in); 392 BUG_ON(!in);
393 ftype = le32_to_cpu(in->mode) >> 12; 393 ftype = le32_to_cpu(in->mode) >> 12;
394 vino.ino = le64_to_cpu(in->ino); 394 vino.ino = le64_to_cpu(in->ino);
395 vino.snap = le64_to_cpu(in->snapid); 395 vino.snap = le64_to_cpu(in->snapid);
396 ino = ceph_vino_to_ino(vino); 396 ino = ceph_vino_to_ino(vino);
397 if (filldir(dirent, 397 if (filldir(dirent,
398 rinfo->dir_dname[off - fi->offset], 398 rinfo->dir_dname[off - fi->offset],
399 rinfo->dir_dname_len[off - fi->offset], 399 rinfo->dir_dname_len[off - fi->offset],
400 pos, 400 pos,
401 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { 401 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
402 dout("filldir stopping us...\n"); 402 dout("filldir stopping us...\n");
403 return 0; 403 return 0;
404 } 404 }
405 off++; 405 off++;
406 filp->f_pos = pos + 1; 406 filp->f_pos = pos + 1;
407 } 407 }
408 408
409 if (fi->last_name) { 409 if (fi->last_name) {
410 ceph_mdsc_put_request(fi->last_readdir); 410 ceph_mdsc_put_request(fi->last_readdir);
411 fi->last_readdir = NULL; 411 fi->last_readdir = NULL;
412 goto more; 412 goto more;
413 } 413 }
414 414
415 /* more frags? */ 415 /* more frags? */
416 if (!ceph_frag_is_rightmost(frag)) { 416 if (!ceph_frag_is_rightmost(frag)) {
417 frag = ceph_frag_next(frag); 417 frag = ceph_frag_next(frag);
418 off = 0; 418 off = 0;
419 filp->f_pos = ceph_make_fpos(frag, off); 419 filp->f_pos = ceph_make_fpos(frag, off);
420 dout("readdir next frag is %x\n", frag); 420 dout("readdir next frag is %x\n", frag);
421 goto more; 421 goto more;
422 } 422 }
423 fi->flags |= CEPH_F_ATEND; 423 fi->flags |= CEPH_F_ATEND;
424 424
425 /* 425 /*
426 * if dir_release_count still matches the dir, no dentries 426 * if dir_release_count still matches the dir, no dentries
427 * were released during the whole readdir, and we should have 427 * were released during the whole readdir, and we should have
428 * the complete dir contents in our cache. 428 * the complete dir contents in our cache.
429 */ 429 */
430 spin_lock(&ci->i_ceph_lock); 430 spin_lock(&ci->i_ceph_lock);
431 if (ci->i_release_count == fi->dir_release_count) { 431 if (ci->i_release_count == fi->dir_release_count) {
432 ceph_dir_set_complete(inode); 432 ceph_dir_set_complete(inode);
433 ci->i_max_offset = filp->f_pos; 433 ci->i_max_offset = filp->f_pos;
434 } 434 }
435 spin_unlock(&ci->i_ceph_lock); 435 spin_unlock(&ci->i_ceph_lock);
436 436
437 dout("readdir %p filp %p done.\n", inode, filp); 437 dout("readdir %p filp %p done.\n", inode, filp);
438 return 0; 438 return 0;
439 } 439 }
440 440
441 static void reset_readdir(struct ceph_file_info *fi) 441 static void reset_readdir(struct ceph_file_info *fi)
442 { 442 {
443 if (fi->last_readdir) { 443 if (fi->last_readdir) {
444 ceph_mdsc_put_request(fi->last_readdir); 444 ceph_mdsc_put_request(fi->last_readdir);
445 fi->last_readdir = NULL; 445 fi->last_readdir = NULL;
446 } 446 }
447 kfree(fi->last_name); 447 kfree(fi->last_name);
448 fi->last_name = NULL; 448 fi->last_name = NULL;
449 fi->next_offset = 2; /* compensate for . and .. */ 449 fi->next_offset = 2; /* compensate for . and .. */
450 if (fi->dentry) { 450 if (fi->dentry) {
451 dput(fi->dentry); 451 dput(fi->dentry);
452 fi->dentry = NULL; 452 fi->dentry = NULL;
453 } 453 }
454 fi->flags &= ~CEPH_F_ATEND; 454 fi->flags &= ~CEPH_F_ATEND;
455 } 455 }
456 456
457 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) 457 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
458 { 458 {
459 struct ceph_file_info *fi = file->private_data; 459 struct ceph_file_info *fi = file->private_data;
460 struct inode *inode = file->f_mapping->host; 460 struct inode *inode = file->f_mapping->host;
461 loff_t old_offset = offset; 461 loff_t old_offset = offset;
462 loff_t retval; 462 loff_t retval;
463 463
464 mutex_lock(&inode->i_mutex); 464 mutex_lock(&inode->i_mutex);
465 retval = -EINVAL; 465 retval = -EINVAL;
466 switch (origin) { 466 switch (whence) {
467 case SEEK_END: 467 case SEEK_END:
468 offset += inode->i_size + 2; /* FIXME */ 468 offset += inode->i_size + 2; /* FIXME */
469 break; 469 break;
470 case SEEK_CUR: 470 case SEEK_CUR:
471 offset += file->f_pos; 471 offset += file->f_pos;
472 case SEEK_SET: 472 case SEEK_SET:
473 break; 473 break;
474 default: 474 default:
475 goto out; 475 goto out;
476 } 476 }
477 477
478 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { 478 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
479 if (offset != file->f_pos) { 479 if (offset != file->f_pos) {
480 file->f_pos = offset; 480 file->f_pos = offset;
481 file->f_version = 0; 481 file->f_version = 0;
482 fi->flags &= ~CEPH_F_ATEND; 482 fi->flags &= ~CEPH_F_ATEND;
483 } 483 }
484 retval = offset; 484 retval = offset;
485 485
486 /* 486 /*
487 * discard buffered readdir content on seekdir(0), or 487 * discard buffered readdir content on seekdir(0), or
488 * seek to new frag, or seek prior to current chunk. 488 * seek to new frag, or seek prior to current chunk.
489 */ 489 */
490 if (offset == 0 || 490 if (offset == 0 ||
491 fpos_frag(offset) != fpos_frag(old_offset) || 491 fpos_frag(offset) != fpos_frag(old_offset) ||
492 fpos_off(offset) < fi->offset) { 492 fpos_off(offset) < fi->offset) {
493 dout("dir_llseek dropping %p content\n", file); 493 dout("dir_llseek dropping %p content\n", file);
494 reset_readdir(fi); 494 reset_readdir(fi);
495 } 495 }
496 496
497 /* bump dir_release_count if we did a forward seek */ 497 /* bump dir_release_count if we did a forward seek */
498 if (offset > old_offset) 498 if (offset > old_offset)
499 fi->dir_release_count--; 499 fi->dir_release_count--;
500 } 500 }
501 out: 501 out:
502 mutex_unlock(&inode->i_mutex); 502 mutex_unlock(&inode->i_mutex);
503 return retval; 503 return retval;
504 } 504 }
505 505
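/*
 * User-space view of the directory seek semantics implemented above
 * (a sketch; the mount path is hypothetical): telldir() cookies
 * replay via seekdir(), and rewinddir() seeks back to 0, which the
 * code above answers by discarding its buffered readdir content.
 */
#include <dirent.h>
#include <stdio.h>

int main(void)
{
	DIR *d = opendir("/mnt/ceph/somedir");
	long pos;

	if (!d)
		return 1;
	readdir(d);		/* consume an entry */
	pos = telldir(d);	/* opaque cookie; here it encodes frag and offset */
	readdir(d);
	seekdir(d, pos);	/* resume from the saved cookie */
	rewinddir(d);		/* seekdir(0): buffered state is dropped */
	closedir(d);
	return 0;
}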
506 /* 506 /*
507 * Handle lookups for the hidden .snap directory. 507 * Handle lookups for the hidden .snap directory.
508 */ 508 */
509 int ceph_handle_snapdir(struct ceph_mds_request *req, 509 int ceph_handle_snapdir(struct ceph_mds_request *req,
510 struct dentry *dentry, int err) 510 struct dentry *dentry, int err)
511 { 511 {
512 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 512 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
513 struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ 513 struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
514 514
515 /* .snap dir? */ 515 /* .snap dir? */
516 if (err == -ENOENT && 516 if (err == -ENOENT &&
517 ceph_snap(parent) == CEPH_NOSNAP && 517 ceph_snap(parent) == CEPH_NOSNAP &&
518 strcmp(dentry->d_name.name, 518 strcmp(dentry->d_name.name,
519 fsc->mount_options->snapdir_name) == 0) { 519 fsc->mount_options->snapdir_name) == 0) {
520 struct inode *inode = ceph_get_snapdir(parent); 520 struct inode *inode = ceph_get_snapdir(parent);
521 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 521 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
522 dentry, dentry->d_name.len, dentry->d_name.name, inode); 522 dentry, dentry->d_name.len, dentry->d_name.name, inode);
523 BUG_ON(!d_unhashed(dentry)); 523 BUG_ON(!d_unhashed(dentry));
524 d_add(dentry, inode); 524 d_add(dentry, inode);
525 err = 0; 525 err = 0;
526 } 526 }
527 return err; 527 return err;
528 } 528 }
529 529
530 /* 530 /*
531 * Figure out final result of a lookup/open request. 531 * Figure out final result of a lookup/open request.
532 * 532 *
533 * Mainly, make sure we return the final req->r_dentry (if it already 533 * Mainly, make sure we return the final req->r_dentry (if it already
534 * existed) in place of the original VFS-provided dentry when they 534 * existed) in place of the original VFS-provided dentry when they
535 * differ. 535 * differ.
536 * 536 *
537 * Gracefully handle the case where the MDS replies with -ENOENT and 537 * Gracefully handle the case where the MDS replies with -ENOENT and
538 * no trace (which it may do, at its discretion, e.g., if it doesn't 538 * no trace (which it may do, at its discretion, e.g., if it doesn't
539 * care to issue a lease on the negative dentry). 539 * care to issue a lease on the negative dentry).
540 */ 540 */
541 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 541 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
542 struct dentry *dentry, int err) 542 struct dentry *dentry, int err)
543 { 543 {
544 if (err == -ENOENT) { 544 if (err == -ENOENT) {
545 /* no trace? */ 545 /* no trace? */
546 err = 0; 546 err = 0;
547 if (!req->r_reply_info.head->is_dentry) { 547 if (!req->r_reply_info.head->is_dentry) {
548 dout("ENOENT and no trace, dentry %p inode %p\n", 548 dout("ENOENT and no trace, dentry %p inode %p\n",
549 dentry, dentry->d_inode); 549 dentry, dentry->d_inode);
550 if (dentry->d_inode) { 550 if (dentry->d_inode) {
551 d_drop(dentry); 551 d_drop(dentry);
552 err = -ENOENT; 552 err = -ENOENT;
553 } else { 553 } else {
554 d_add(dentry, NULL); 554 d_add(dentry, NULL);
555 } 555 }
556 } 556 }
557 } 557 }
558 if (err) 558 if (err)
559 dentry = ERR_PTR(err); 559 dentry = ERR_PTR(err);
560 else if (dentry != req->r_dentry) 560 else if (dentry != req->r_dentry)
561 dentry = dget(req->r_dentry); /* we got spliced */ 561 dentry = dget(req->r_dentry); /* we got spliced */
562 else 562 else
563 dentry = NULL; 563 dentry = NULL;
564 return dentry; 564 return dentry;
565 } 565 }
566 566
567 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 567 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
568 { 568 {
569 return ceph_ino(inode) == CEPH_INO_ROOT && 569 return ceph_ino(inode) == CEPH_INO_ROOT &&
570 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 570 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
571 } 571 }
572 572
573 /* 573 /*
574 * Look up a single dir entry. If there is a lookup intent, inform 574 * Look up a single dir entry. If there is a lookup intent, inform
575 * the MDS so that it gets our 'caps wanted' value in a single op. 575 * the MDS so that it gets our 'caps wanted' value in a single op.
576 */ 576 */
577 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 577 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
578 unsigned int flags) 578 unsigned int flags)
579 { 579 {
580 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 580 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
581 struct ceph_mds_client *mdsc = fsc->mdsc; 581 struct ceph_mds_client *mdsc = fsc->mdsc;
582 struct ceph_mds_request *req; 582 struct ceph_mds_request *req;
583 int op; 583 int op;
584 int err; 584 int err;
585 585
586 dout("lookup %p dentry %p '%.*s'\n", 586 dout("lookup %p dentry %p '%.*s'\n",
587 dir, dentry, dentry->d_name.len, dentry->d_name.name); 587 dir, dentry, dentry->d_name.len, dentry->d_name.name);
588 588
589 if (dentry->d_name.len > NAME_MAX) 589 if (dentry->d_name.len > NAME_MAX)
590 return ERR_PTR(-ENAMETOOLONG); 590 return ERR_PTR(-ENAMETOOLONG);
591 591
592 err = ceph_init_dentry(dentry); 592 err = ceph_init_dentry(dentry);
593 if (err < 0) 593 if (err < 0)
594 return ERR_PTR(err); 594 return ERR_PTR(err);
595 595
596 /* can we conclude ENOENT locally? */ 596 /* can we conclude ENOENT locally? */
597 if (dentry->d_inode == NULL) { 597 if (dentry->d_inode == NULL) {
598 struct ceph_inode_info *ci = ceph_inode(dir); 598 struct ceph_inode_info *ci = ceph_inode(dir);
599 struct ceph_dentry_info *di = ceph_dentry(dentry); 599 struct ceph_dentry_info *di = ceph_dentry(dentry);
600 600
601 spin_lock(&ci->i_ceph_lock); 601 spin_lock(&ci->i_ceph_lock);
602 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 602 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
603 if (strncmp(dentry->d_name.name, 603 if (strncmp(dentry->d_name.name,
604 fsc->mount_options->snapdir_name, 604 fsc->mount_options->snapdir_name,
605 dentry->d_name.len) && 605 dentry->d_name.len) &&
606 !is_root_ceph_dentry(dir, dentry) && 606 !is_root_ceph_dentry(dir, dentry) &&
607 ceph_dir_test_complete(dir) && 607 ceph_dir_test_complete(dir) &&
608 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 608 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
609 spin_unlock(&ci->i_ceph_lock); 609 spin_unlock(&ci->i_ceph_lock);
610 dout(" dir %p complete, -ENOENT\n", dir); 610 dout(" dir %p complete, -ENOENT\n", dir);
611 d_add(dentry, NULL); 611 d_add(dentry, NULL);
612 di->lease_shared_gen = ci->i_shared_gen; 612 di->lease_shared_gen = ci->i_shared_gen;
613 return NULL; 613 return NULL;
614 } 614 }
615 spin_unlock(&ci->i_ceph_lock); 615 spin_unlock(&ci->i_ceph_lock);
616 } 616 }
617 617
618 op = ceph_snap(dir) == CEPH_SNAPDIR ? 618 op = ceph_snap(dir) == CEPH_SNAPDIR ?
619 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 619 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
620 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 620 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
621 if (IS_ERR(req)) 621 if (IS_ERR(req))
622 return ERR_CAST(req); 622 return ERR_CAST(req);
623 req->r_dentry = dget(dentry); 623 req->r_dentry = dget(dentry);
624 req->r_num_caps = 2; 624 req->r_num_caps = 2;
625 /* we only need inode linkage */ 625 /* we only need inode linkage */
626 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 626 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
627 req->r_locked_dir = dir; 627 req->r_locked_dir = dir;
628 err = ceph_mdsc_do_request(mdsc, NULL, req); 628 err = ceph_mdsc_do_request(mdsc, NULL, req);
629 err = ceph_handle_snapdir(req, dentry, err); 629 err = ceph_handle_snapdir(req, dentry, err);
630 dentry = ceph_finish_lookup(req, dentry, err); 630 dentry = ceph_finish_lookup(req, dentry, err);
631 ceph_mdsc_put_request(req); /* will dput(dentry) */ 631 ceph_mdsc_put_request(req); /* will dput(dentry) */
632 dout("lookup result=%p\n", dentry); 632 dout("lookup result=%p\n", dentry);
633 return dentry; 633 return dentry;
634 } 634 }
635 635
636 /* 636 /*
637 * If we do a create but get no trace back from the MDS, follow up with 637 * If we do a create but get no trace back from the MDS, follow up with
638 * a lookup (the VFS expects us to link up the provided dentry). 638 * a lookup (the VFS expects us to link up the provided dentry).
639 */ 639 */
640 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) 640 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
641 { 641 {
642 struct dentry *result = ceph_lookup(dir, dentry, 0); 642 struct dentry *result = ceph_lookup(dir, dentry, 0);
643 643
644 if (result && !IS_ERR(result)) { 644 if (result && !IS_ERR(result)) {
645 /* 645 /*
646 * We created the item, then did a lookup, and found 646 * We created the item, then did a lookup, and found
647 * it was already linked to another inode we already 647 * it was already linked to another inode we already
648 * had in our cache (and thus got spliced). Link our 648 * had in our cache (and thus got spliced). Link our
649 * dentry to that inode, but don't hash it, just in 649 * dentry to that inode, but don't hash it, just in
650 * case the VFS wants to dereference it. 650 * case the VFS wants to dereference it.
651 */ 651 */
652 BUG_ON(!result->d_inode); 652 BUG_ON(!result->d_inode);
653 d_instantiate(dentry, result->d_inode); 653 d_instantiate(dentry, result->d_inode);
654 return 0; 654 return 0;
655 } 655 }
656 return PTR_ERR(result); 656 return PTR_ERR(result);
657 } 657 }
658 658
659 static int ceph_mknod(struct inode *dir, struct dentry *dentry, 659 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
660 umode_t mode, dev_t rdev) 660 umode_t mode, dev_t rdev)
661 { 661 {
662 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 662 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
663 struct ceph_mds_client *mdsc = fsc->mdsc; 663 struct ceph_mds_client *mdsc = fsc->mdsc;
664 struct ceph_mds_request *req; 664 struct ceph_mds_request *req;
665 int err; 665 int err;
666 666
667 if (ceph_snap(dir) != CEPH_NOSNAP) 667 if (ceph_snap(dir) != CEPH_NOSNAP)
668 return -EROFS; 668 return -EROFS;
669 669
670 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", 670 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
671 dir, dentry, mode, rdev); 671 dir, dentry, mode, rdev);
672 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); 672 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
673 if (IS_ERR(req)) { 673 if (IS_ERR(req)) {
674 d_drop(dentry); 674 d_drop(dentry);
675 return PTR_ERR(req); 675 return PTR_ERR(req);
676 } 676 }
677 req->r_dentry = dget(dentry); 677 req->r_dentry = dget(dentry);
678 req->r_num_caps = 2; 678 req->r_num_caps = 2;
679 req->r_locked_dir = dir; 679 req->r_locked_dir = dir;
680 req->r_args.mknod.mode = cpu_to_le32(mode); 680 req->r_args.mknod.mode = cpu_to_le32(mode);
681 req->r_args.mknod.rdev = cpu_to_le32(rdev); 681 req->r_args.mknod.rdev = cpu_to_le32(rdev);
682 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 682 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
683 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 683 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
684 err = ceph_mdsc_do_request(mdsc, dir, req); 684 err = ceph_mdsc_do_request(mdsc, dir, req);
685 if (!err && !req->r_reply_info.head->is_dentry) 685 if (!err && !req->r_reply_info.head->is_dentry)
686 err = ceph_handle_notrace_create(dir, dentry); 686 err = ceph_handle_notrace_create(dir, dentry);
687 ceph_mdsc_put_request(req); 687 ceph_mdsc_put_request(req);
688 if (err) 688 if (err)
689 d_drop(dentry); 689 d_drop(dentry);
690 return err; 690 return err;
691 } 691 }
692 692
693 static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, 693 static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
694 bool excl) 694 bool excl)
695 { 695 {
696 return ceph_mknod(dir, dentry, mode, 0); 696 return ceph_mknod(dir, dentry, mode, 0);
697 } 697 }
698 698
699 static int ceph_symlink(struct inode *dir, struct dentry *dentry, 699 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
700 const char *dest) 700 const char *dest)
701 { 701 {
702 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 702 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
703 struct ceph_mds_client *mdsc = fsc->mdsc; 703 struct ceph_mds_client *mdsc = fsc->mdsc;
704 struct ceph_mds_request *req; 704 struct ceph_mds_request *req;
705 int err; 705 int err;
706 706
707 if (ceph_snap(dir) != CEPH_NOSNAP) 707 if (ceph_snap(dir) != CEPH_NOSNAP)
708 return -EROFS; 708 return -EROFS;
709 709
710 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 710 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
711 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 711 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
712 if (IS_ERR(req)) { 712 if (IS_ERR(req)) {
713 d_drop(dentry); 713 d_drop(dentry);
714 return PTR_ERR(req); 714 return PTR_ERR(req);
715 } 715 }
716 req->r_dentry = dget(dentry); 716 req->r_dentry = dget(dentry);
717 req->r_num_caps = 2; 717 req->r_num_caps = 2;
718 req->r_path2 = kstrdup(dest, GFP_NOFS); 718 req->r_path2 = kstrdup(dest, GFP_NOFS);
719 req->r_locked_dir = dir; 719 req->r_locked_dir = dir;
720 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 720 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
721 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 721 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
722 err = ceph_mdsc_do_request(mdsc, dir, req); 722 err = ceph_mdsc_do_request(mdsc, dir, req);
723 if (!err && !req->r_reply_info.head->is_dentry) 723 if (!err && !req->r_reply_info.head->is_dentry)
724 err = ceph_handle_notrace_create(dir, dentry); 724 err = ceph_handle_notrace_create(dir, dentry);
725 ceph_mdsc_put_request(req); 725 ceph_mdsc_put_request(req);
726 if (err) 726 if (err)
727 d_drop(dentry); 727 d_drop(dentry);
728 return err; 728 return err;
729 } 729 }
730 730
731 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 731 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
732 { 732 {
733 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 733 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
734 struct ceph_mds_client *mdsc = fsc->mdsc; 734 struct ceph_mds_client *mdsc = fsc->mdsc;
735 struct ceph_mds_request *req; 735 struct ceph_mds_request *req;
736 int err = -EROFS; 736 int err = -EROFS;
737 int op; 737 int op;
738 738
739 if (ceph_snap(dir) == CEPH_SNAPDIR) { 739 if (ceph_snap(dir) == CEPH_SNAPDIR) {
740 /* mkdir .snap/foo is a MKSNAP */ 740 /* mkdir .snap/foo is a MKSNAP */
741 op = CEPH_MDS_OP_MKSNAP; 741 op = CEPH_MDS_OP_MKSNAP;
742 dout("mksnap dir %p snap '%.*s' dn %p\n", dir, 742 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
743 dentry->d_name.len, dentry->d_name.name, dentry); 743 dentry->d_name.len, dentry->d_name.name, dentry);
744 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 744 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
745 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); 745 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
746 op = CEPH_MDS_OP_MKDIR; 746 op = CEPH_MDS_OP_MKDIR;
747 } else { 747 } else {
748 goto out; 748 goto out;
749 } 749 }
750 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 750 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
751 if (IS_ERR(req)) { 751 if (IS_ERR(req)) {
752 err = PTR_ERR(req); 752 err = PTR_ERR(req);
753 goto out; 753 goto out;
754 } 754 }
755 755
756 req->r_dentry = dget(dentry); 756 req->r_dentry = dget(dentry);
757 req->r_num_caps = 2; 757 req->r_num_caps = 2;
758 req->r_locked_dir = dir; 758 req->r_locked_dir = dir;
759 req->r_args.mkdir.mode = cpu_to_le32(mode); 759 req->r_args.mkdir.mode = cpu_to_le32(mode);
760 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 760 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
761 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 761 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
762 err = ceph_mdsc_do_request(mdsc, dir, req); 762 err = ceph_mdsc_do_request(mdsc, dir, req);
763 if (!err && !req->r_reply_info.head->is_dentry) 763 if (!err && !req->r_reply_info.head->is_dentry)
764 err = ceph_handle_notrace_create(dir, dentry); 764 err = ceph_handle_notrace_create(dir, dentry);
765 ceph_mdsc_put_request(req); 765 ceph_mdsc_put_request(req);
766 out: 766 out:
767 if (err < 0) 767 if (err < 0)
768 d_drop(dentry); 768 d_drop(dentry);
769 return err; 769 return err;
770 } 770 }
771 771
772 static int ceph_link(struct dentry *old_dentry, struct inode *dir, 772 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
773 struct dentry *dentry) 773 struct dentry *dentry)
774 { 774 {
775 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 775 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
776 struct ceph_mds_client *mdsc = fsc->mdsc; 776 struct ceph_mds_client *mdsc = fsc->mdsc;
777 struct ceph_mds_request *req; 777 struct ceph_mds_request *req;
778 int err; 778 int err;
779 779
780 if (ceph_snap(dir) != CEPH_NOSNAP) 780 if (ceph_snap(dir) != CEPH_NOSNAP)
781 return -EROFS; 781 return -EROFS;
782 782
783 dout("link in dir %p old_dentry %p dentry %p\n", dir, 783 dout("link in dir %p old_dentry %p dentry %p\n", dir,
784 old_dentry, dentry); 784 old_dentry, dentry);
785 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); 785 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
786 if (IS_ERR(req)) { 786 if (IS_ERR(req)) {
787 d_drop(dentry); 787 d_drop(dentry);
788 return PTR_ERR(req); 788 return PTR_ERR(req);
789 } 789 }
790 req->r_dentry = dget(dentry); 790 req->r_dentry = dget(dentry);
791 req->r_num_caps = 2; 791 req->r_num_caps = 2;
792 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ 792 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
793 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 793 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
794 req->r_locked_dir = dir; 794 req->r_locked_dir = dir;
795 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 795 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
796 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 796 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
797 err = ceph_mdsc_do_request(mdsc, dir, req); 797 err = ceph_mdsc_do_request(mdsc, dir, req);
798 if (err) { 798 if (err) {
799 d_drop(dentry); 799 d_drop(dentry);
800 } else if (!req->r_reply_info.head->is_dentry) { 800 } else if (!req->r_reply_info.head->is_dentry) {
801 ihold(old_dentry->d_inode); 801 ihold(old_dentry->d_inode);
802 d_instantiate(dentry, old_dentry->d_inode); 802 d_instantiate(dentry, old_dentry->d_inode);
803 } 803 }
804 ceph_mdsc_put_request(req); 804 ceph_mdsc_put_request(req);
805 return err; 805 return err;
806 } 806 }
807 807
808 /* 808 /*
809 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it 809 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
810 * looks like the link count will hit 0, drop any other caps (other 810 * looks like the link count will hit 0, drop any other caps (other
811 * than PIN) we don't specifically want (due to the file still being 811 * than PIN) we don't specifically want (due to the file still being
812 * open). 812 * open).
813 */ 813 */
814 static int drop_caps_for_unlink(struct inode *inode) 814 static int drop_caps_for_unlink(struct inode *inode)
815 { 815 {
816 struct ceph_inode_info *ci = ceph_inode(inode); 816 struct ceph_inode_info *ci = ceph_inode(inode);
817 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; 817 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
818 818
819 spin_lock(&ci->i_ceph_lock); 819 spin_lock(&ci->i_ceph_lock);
820 if (inode->i_nlink == 1) { 820 if (inode->i_nlink == 1) {
821 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); 821 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
822 ci->i_ceph_flags |= CEPH_I_NODELAY; 822 ci->i_ceph_flags |= CEPH_I_NODELAY;
823 } 823 }
824 spin_unlock(&ci->i_ceph_lock); 824 spin_unlock(&ci->i_ceph_lock);
825 return drop; 825 return drop;
826 } 826 }
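
To make the mask arithmetic concrete: if the final link is going away while
the file is still open for read, __ceph_caps_wanted() might return
CEPH_CAP_FILE_CACHE, and the i_nlink == 1 branch above then computes (a
worked sketch with that assumed wanted set, not a fixed rule):

	/* start with the link caps we always release on unlink */
	drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
	/* nlink will hit 0: also drop everything not wanted, except PIN */
	drop |= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_PIN);

so the request releases every cap except the ones the still-open file
actually needs.
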
827 827
828 /* 828 /*
829 * rmdir and unlink differ only by the metadata op code 829 * rmdir and unlink differ only by the metadata op code
830 */ 830 */
831 static int ceph_unlink(struct inode *dir, struct dentry *dentry) 831 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
832 { 832 {
833 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 833 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
834 struct ceph_mds_client *mdsc = fsc->mdsc; 834 struct ceph_mds_client *mdsc = fsc->mdsc;
835 struct inode *inode = dentry->d_inode; 835 struct inode *inode = dentry->d_inode;
836 struct ceph_mds_request *req; 836 struct ceph_mds_request *req;
837 int err = -EROFS; 837 int err = -EROFS;
838 int op; 838 int op;
839 839
840 if (ceph_snap(dir) == CEPH_SNAPDIR) { 840 if (ceph_snap(dir) == CEPH_SNAPDIR) {
841 /* rmdir .snap/foo is RMSNAP */ 841 /* rmdir .snap/foo is RMSNAP */
842 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, 842 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
843 dentry->d_name.name, dentry); 843 dentry->d_name.name, dentry);
844 op = CEPH_MDS_OP_RMSNAP; 844 op = CEPH_MDS_OP_RMSNAP;
845 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 845 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
846 dout("unlink/rmdir dir %p dn %p inode %p\n", 846 dout("unlink/rmdir dir %p dn %p inode %p\n",
847 dir, dentry, inode); 847 dir, dentry, inode);
848 op = S_ISDIR(dentry->d_inode->i_mode) ? 848 op = S_ISDIR(dentry->d_inode->i_mode) ?
849 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; 849 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
850 } else 850 } else
851 goto out; 851 goto out;
852 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 852 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
853 if (IS_ERR(req)) { 853 if (IS_ERR(req)) {
854 err = PTR_ERR(req); 854 err = PTR_ERR(req);
855 goto out; 855 goto out;
856 } 856 }
857 req->r_dentry = dget(dentry); 857 req->r_dentry = dget(dentry);
858 req->r_num_caps = 2; 858 req->r_num_caps = 2;
859 req->r_locked_dir = dir; 859 req->r_locked_dir = dir;
860 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 860 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
861 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 861 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
862 req->r_inode_drop = drop_caps_for_unlink(inode); 862 req->r_inode_drop = drop_caps_for_unlink(inode);
863 err = ceph_mdsc_do_request(mdsc, dir, req); 863 err = ceph_mdsc_do_request(mdsc, dir, req);
864 if (!err && !req->r_reply_info.head->is_dentry) 864 if (!err && !req->r_reply_info.head->is_dentry)
865 d_delete(dentry); 865 d_delete(dentry);
866 ceph_mdsc_put_request(req); 866 ceph_mdsc_put_request(req);
867 out: 867 out:
868 return err; 868 return err;
869 } 869 }
870 870
871 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 871 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
872 struct inode *new_dir, struct dentry *new_dentry) 872 struct inode *new_dir, struct dentry *new_dentry)
873 { 873 {
874 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); 874 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
875 struct ceph_mds_client *mdsc = fsc->mdsc; 875 struct ceph_mds_client *mdsc = fsc->mdsc;
876 struct ceph_mds_request *req; 876 struct ceph_mds_request *req;
877 int err; 877 int err;
878 878
879 if (ceph_snap(old_dir) != ceph_snap(new_dir)) 879 if (ceph_snap(old_dir) != ceph_snap(new_dir))
880 return -EXDEV; 880 return -EXDEV;
881 if (ceph_snap(old_dir) != CEPH_NOSNAP || 881 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
882 ceph_snap(new_dir) != CEPH_NOSNAP) 882 ceph_snap(new_dir) != CEPH_NOSNAP)
883 return -EROFS; 883 return -EROFS;
884 dout("rename dir %p dentry %p to dir %p dentry %p\n", 884 dout("rename dir %p dentry %p to dir %p dentry %p\n",
885 old_dir, old_dentry, new_dir, new_dentry); 885 old_dir, old_dentry, new_dir, new_dentry);
886 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 886 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
887 if (IS_ERR(req)) 887 if (IS_ERR(req))
888 return PTR_ERR(req); 888 return PTR_ERR(req);
889 req->r_dentry = dget(new_dentry); 889 req->r_dentry = dget(new_dentry);
890 req->r_num_caps = 2; 890 req->r_num_caps = 2;
891 req->r_old_dentry = dget(old_dentry); 891 req->r_old_dentry = dget(old_dentry);
892 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 892 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
893 req->r_locked_dir = new_dir; 893 req->r_locked_dir = new_dir;
894 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; 894 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
895 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; 895 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
896 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 896 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
897 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 897 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
898 /* release LINK_RDCACHE on source inode (mds will lock it) */ 898 /* release LINK_RDCACHE on source inode (mds will lock it) */
899 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; 899 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
900 if (new_dentry->d_inode) 900 if (new_dentry->d_inode)
901 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); 901 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
902 err = ceph_mdsc_do_request(mdsc, old_dir, req); 902 err = ceph_mdsc_do_request(mdsc, old_dir, req);
903 if (!err && !req->r_reply_info.head->is_dentry) { 903 if (!err && !req->r_reply_info.head->is_dentry) {
904 /* 904 /*
905 * Normally d_move() is done by fill_trace (called by 905 * Normally d_move() is done by fill_trace (called by
906 * do_request, above). If there is no trace, we need 906 * do_request, above). If there is no trace, we need
907 * to do it here. 907 * to do it here.
908 */ 908 */
909 909
910 /* d_move screws up d_subdirs order */ 910 /* d_move screws up d_subdirs order */
911 ceph_dir_clear_complete(new_dir); 911 ceph_dir_clear_complete(new_dir);
912 912
913 d_move(old_dentry, new_dentry); 913 d_move(old_dentry, new_dentry);
914 914
915 /* ensure target dentry is invalidated, despite 915 /* ensure target dentry is invalidated, despite
916 rehashing bug in vfs_rename_dir */ 916 rehashing bug in vfs_rename_dir */
917 ceph_invalidate_dentry_lease(new_dentry); 917 ceph_invalidate_dentry_lease(new_dentry);
918 } 918 }
919 ceph_mdsc_put_request(req); 919 ceph_mdsc_put_request(req);
920 return err; 920 return err;
921 } 921 }
922 922
923 /* 923 /*
924 * Ensure a dentry lease will no longer revalidate. 924 * Ensure a dentry lease will no longer revalidate.
925 */ 925 */
926 void ceph_invalidate_dentry_lease(struct dentry *dentry) 926 void ceph_invalidate_dentry_lease(struct dentry *dentry)
927 { 927 {
928 spin_lock(&dentry->d_lock); 928 spin_lock(&dentry->d_lock);
929 dentry->d_time = jiffies; 929 dentry->d_time = jiffies;
930 ceph_dentry(dentry)->lease_shared_gen = 0; 930 ceph_dentry(dentry)->lease_shared_gen = 0;
931 spin_unlock(&dentry->d_lock); 931 spin_unlock(&dentry->d_lock);
932 } 932 }
933 933
934 /* 934 /*
935 * Check if dentry lease is valid. If not, delete the lease. Try to 935 * Check if dentry lease is valid. If not, delete the lease. Try to
936 * renew if the lease is more than half up. 936 * renew if the lease is more than half up.
937 */ 937 */
938 static int dentry_lease_is_valid(struct dentry *dentry) 938 static int dentry_lease_is_valid(struct dentry *dentry)
939 { 939 {
940 struct ceph_dentry_info *di; 940 struct ceph_dentry_info *di;
941 struct ceph_mds_session *s; 941 struct ceph_mds_session *s;
942 int valid = 0; 942 int valid = 0;
943 u32 gen; 943 u32 gen;
944 unsigned long ttl; 944 unsigned long ttl;
945 struct ceph_mds_session *session = NULL; 945 struct ceph_mds_session *session = NULL;
946 struct inode *dir = NULL; 946 struct inode *dir = NULL;
947 u32 seq = 0; 947 u32 seq = 0;
948 948
949 spin_lock(&dentry->d_lock); 949 spin_lock(&dentry->d_lock);
950 di = ceph_dentry(dentry); 950 di = ceph_dentry(dentry);
951 if (di->lease_session) { 951 if (di->lease_session) {
952 s = di->lease_session; 952 s = di->lease_session;
953 spin_lock(&s->s_gen_ttl_lock); 953 spin_lock(&s->s_gen_ttl_lock);
954 gen = s->s_cap_gen; 954 gen = s->s_cap_gen;
955 ttl = s->s_cap_ttl; 955 ttl = s->s_cap_ttl;
956 spin_unlock(&s->s_gen_ttl_lock); 956 spin_unlock(&s->s_gen_ttl_lock);
957 957
958 if (di->lease_gen == gen && 958 if (di->lease_gen == gen &&
959 time_before(jiffies, dentry->d_time) && 959 time_before(jiffies, dentry->d_time) &&
960 time_before(jiffies, ttl)) { 960 time_before(jiffies, ttl)) {
961 valid = 1; 961 valid = 1;
962 if (di->lease_renew_after && 962 if (di->lease_renew_after &&
963 time_after(jiffies, di->lease_renew_after)) { 963 time_after(jiffies, di->lease_renew_after)) {
964 /* we should renew */ 964 /* we should renew */
965 dir = dentry->d_parent->d_inode; 965 dir = dentry->d_parent->d_inode;
966 session = ceph_get_mds_session(s); 966 session = ceph_get_mds_session(s);
967 seq = di->lease_seq; 967 seq = di->lease_seq;
968 di->lease_renew_after = 0; 968 di->lease_renew_after = 0;
969 di->lease_renew_from = jiffies; 969 di->lease_renew_from = jiffies;
970 } 970 }
971 } 971 }
972 } 972 }
973 spin_unlock(&dentry->d_lock); 973 spin_unlock(&dentry->d_lock);
974 974
975 if (session) { 975 if (session) {
976 ceph_mdsc_lease_send_msg(session, dir, dentry, 976 ceph_mdsc_lease_send_msg(session, dir, dentry,
977 CEPH_MDS_LEASE_RENEW, seq); 977 CEPH_MDS_LEASE_RENEW, seq);
978 ceph_put_mds_session(session); 978 ceph_put_mds_session(session);
979 } 979 }
980 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); 980 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
981 return valid; 981 return valid;
982 } 982 }
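
The lease_renew_after threshold tested above is set when the lease is
installed, which happens outside this hunk; a minimal sketch of the
"more than half up" rule it implies (lease_start and
lease_duration_jiffies are hypothetical locals, not fields from this file):

	/* renew once half of the lease duration has elapsed */
	di->lease_renew_after = lease_start + (lease_duration_jiffies >> 1);
	/* stop trusting the lease entirely once it has fully expired */
	dentry->d_time = lease_start + lease_duration_jiffies;
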
983 983
984 /* 984 /*
985 * Check if directory-wide content lease/cap is valid. 985 * Check if directory-wide content lease/cap is valid.
986 */ 986 */
987 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) 987 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
988 { 988 {
989 struct ceph_inode_info *ci = ceph_inode(dir); 989 struct ceph_inode_info *ci = ceph_inode(dir);
990 struct ceph_dentry_info *di = ceph_dentry(dentry); 990 struct ceph_dentry_info *di = ceph_dentry(dentry);
991 int valid = 0; 991 int valid = 0;
992 992
993 spin_lock(&ci->i_ceph_lock); 993 spin_lock(&ci->i_ceph_lock);
994 if (ci->i_shared_gen == di->lease_shared_gen) 994 if (ci->i_shared_gen == di->lease_shared_gen)
995 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); 995 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
996 spin_unlock(&ci->i_ceph_lock); 996 spin_unlock(&ci->i_ceph_lock);
997 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", 997 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
998 dir, (unsigned)ci->i_shared_gen, dentry, 998 dir, (unsigned)ci->i_shared_gen, dentry,
999 (unsigned)di->lease_shared_gen, valid); 999 (unsigned)di->lease_shared_gen, valid);
1000 return valid; 1000 return valid;
1001 } 1001 }
1002 1002
1003 /* 1003 /*
1004 * Check if cached dentry can be trusted. 1004 * Check if cached dentry can be trusted.
1005 */ 1005 */
1006 static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) 1006 static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1007 { 1007 {
1008 int valid = 0; 1008 int valid = 0;
1009 struct inode *dir; 1009 struct inode *dir;
1010 1010
1011 if (flags & LOOKUP_RCU) 1011 if (flags & LOOKUP_RCU)
1012 return -ECHILD; 1012 return -ECHILD;
1013 1013
1014 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, 1014 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
1015 dentry->d_name.len, dentry->d_name.name, dentry->d_inode, 1015 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
1016 ceph_dentry(dentry)->offset); 1016 ceph_dentry(dentry)->offset);
1017 1017
1018 dir = ceph_get_dentry_parent_inode(dentry); 1018 dir = ceph_get_dentry_parent_inode(dentry);
1019 1019
1020 /* always trust cached snapped dentries, snapdir dentry */ 1020 /* always trust cached snapped dentries, snapdir dentry */
1021 if (ceph_snap(dir) != CEPH_NOSNAP) { 1021 if (ceph_snap(dir) != CEPH_NOSNAP) {
1022 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, 1022 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
1023 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 1023 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
1024 valid = 1; 1024 valid = 1;
1025 } else if (dentry->d_inode && 1025 } else if (dentry->d_inode &&
1026 ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { 1026 ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
1027 valid = 1; 1027 valid = 1;
1028 } else if (dentry_lease_is_valid(dentry) || 1028 } else if (dentry_lease_is_valid(dentry) ||
1029 dir_lease_is_valid(dir, dentry)) { 1029 dir_lease_is_valid(dir, dentry)) {
1030 valid = 1; 1030 valid = 1;
1031 } 1031 }
1032 1032
1033 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1033 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1034 if (valid) 1034 if (valid)
1035 ceph_dentry_lru_touch(dentry); 1035 ceph_dentry_lru_touch(dentry);
1036 else 1036 else
1037 d_drop(dentry); 1037 d_drop(dentry);
1038 iput(dir); 1038 iput(dir);
1039 return valid; 1039 return valid;
1040 } 1040 }
1041 1041
1042 /* 1042 /*
1043 * Release our ceph_dentry_info. 1043 * Release our ceph_dentry_info.
1044 */ 1044 */
1045 static void ceph_d_release(struct dentry *dentry) 1045 static void ceph_d_release(struct dentry *dentry)
1046 { 1046 {
1047 struct ceph_dentry_info *di = ceph_dentry(dentry); 1047 struct ceph_dentry_info *di = ceph_dentry(dentry);
1048 1048
1049 dout("d_release %p\n", dentry); 1049 dout("d_release %p\n", dentry);
1050 ceph_dentry_lru_del(dentry); 1050 ceph_dentry_lru_del(dentry);
1051 if (di->lease_session) 1051 if (di->lease_session)
1052 ceph_put_mds_session(di->lease_session); 1052 ceph_put_mds_session(di->lease_session);
1053 kmem_cache_free(ceph_dentry_cachep, di); 1053 kmem_cache_free(ceph_dentry_cachep, di);
1054 dentry->d_fsdata = NULL; 1054 dentry->d_fsdata = NULL;
1055 } 1055 }
1056 1056
1057 static int ceph_snapdir_d_revalidate(struct dentry *dentry, 1057 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1058 unsigned int flags) 1058 unsigned int flags)
1059 { 1059 {
1060 /* 1060 /*
1061 * Eventually, we'll want to revalidate snapped metadata 1061 * Eventually, we'll want to revalidate snapped metadata
1062 * too... probably... 1062 * too... probably...
1063 */ 1063 */
1064 return 1; 1064 return 1;
1065 } 1065 }
1066 1066
1067 /* 1067 /*
1068 * Set/clear/test dir complete flag on the dir's dentry. 1068 * Set/clear/test dir complete flag on the dir's dentry.
1069 */ 1069 */
1070 void ceph_dir_set_complete(struct inode *inode) 1070 void ceph_dir_set_complete(struct inode *inode)
1071 { 1071 {
1072 struct dentry *dentry = d_find_any_alias(inode); 1072 struct dentry *dentry = d_find_any_alias(inode);
1073 1073
1074 if (dentry && ceph_dentry(dentry) && 1074 if (dentry && ceph_dentry(dentry) &&
1075 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { 1075 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
1076 dout(" marking %p (%p) complete\n", inode, dentry); 1076 dout(" marking %p (%p) complete\n", inode, dentry);
1077 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); 1077 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1078 } 1078 }
1079 dput(dentry); 1079 dput(dentry);
1080 } 1080 }
1081 1081
1082 void ceph_dir_clear_complete(struct inode *inode) 1082 void ceph_dir_clear_complete(struct inode *inode)
1083 { 1083 {
1084 struct dentry *dentry = d_find_any_alias(inode); 1084 struct dentry *dentry = d_find_any_alias(inode);
1085 1085
1086 if (dentry && ceph_dentry(dentry)) { 1086 if (dentry && ceph_dentry(dentry)) {
1087 dout(" marking %p (%p) complete\n", inode, dentry); 1087 dout(" marking %p (%p) complete\n", inode, dentry);
1088 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); 1088 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1089 } 1089 }
1090 dput(dentry); 1090 dput(dentry);
1091 } 1091 }
1092 1092
1093 bool ceph_dir_test_complete(struct inode *inode) 1093 bool ceph_dir_test_complete(struct inode *inode)
1094 { 1094 {
1095 struct dentry *dentry = d_find_any_alias(inode); 1095 struct dentry *dentry = d_find_any_alias(inode);
1096 bool complete = false; 1096 bool complete = false;
1097 1097
1098 if (dentry && ceph_dentry(dentry)) 1098 if (dentry && ceph_dentry(dentry))
1099 complete = test_bit(CEPH_D_COMPLETE, 1099 complete = test_bit(CEPH_D_COMPLETE,
1100 &ceph_dentry(dentry)->flags); 1100 &ceph_dentry(dentry)->flags);
1101 dput(dentry); 1101 dput(dentry);
1102 return complete; 1102 return complete;
1103 } 1103 }
1104 1104
1105 /* 1105 /*
1106 * When the VFS prunes a dentry from the cache, we need to clear the 1106 * When the VFS prunes a dentry from the cache, we need to clear the
1107 * complete flag on the parent directory. 1107 * complete flag on the parent directory.
1108 * 1108 *
1109 * Called under dentry->d_lock. 1109 * Called under dentry->d_lock.
1110 */ 1110 */
1111 static void ceph_d_prune(struct dentry *dentry) 1111 static void ceph_d_prune(struct dentry *dentry)
1112 { 1112 {
1113 struct ceph_dentry_info *di; 1113 struct ceph_dentry_info *di;
1114 1114
1115 dout("ceph_d_prune %p\n", dentry); 1115 dout("ceph_d_prune %p\n", dentry);
1116 1116
1117 /* do we have a valid parent? */ 1117 /* do we have a valid parent? */
1118 if (IS_ROOT(dentry)) 1118 if (IS_ROOT(dentry))
1119 return; 1119 return;
1120 1120
1121 /* if we are not hashed, we don't affect D_COMPLETE */ 1121 /* if we are not hashed, we don't affect D_COMPLETE */
1122 if (d_unhashed(dentry)) 1122 if (d_unhashed(dentry))
1123 return; 1123 return;
1124 1124
1125 /* 1125 /*
1126 * we hold d_lock, so d_parent is stable, and d_fsdata is never 1126 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1127 * cleared until d_release 1127 * cleared until d_release
1128 */ 1128 */
1129 di = ceph_dentry(dentry->d_parent); 1129 di = ceph_dentry(dentry->d_parent);
1130 clear_bit(CEPH_D_COMPLETE, &di->flags); 1130 clear_bit(CEPH_D_COMPLETE, &di->flags);
1131 } 1131 }
1132 1132
1133 /* 1133 /*
1134 * read() on a dir. This weird interface hack only works if mounted 1134 * read() on a dir. This weird interface hack only works if mounted
1135 * with '-o dirstat'. 1135 * with '-o dirstat'.
1136 */ 1136 */
1137 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, 1137 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1138 loff_t *ppos) 1138 loff_t *ppos)
1139 { 1139 {
1140 struct ceph_file_info *cf = file->private_data; 1140 struct ceph_file_info *cf = file->private_data;
1141 struct inode *inode = file->f_dentry->d_inode; 1141 struct inode *inode = file->f_dentry->d_inode;
1142 struct ceph_inode_info *ci = ceph_inode(inode); 1142 struct ceph_inode_info *ci = ceph_inode(inode);
1143 int left; 1143 int left;
1144 const int bufsize = 1024; 1144 const int bufsize = 1024;
1145 1145
1146 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1146 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1147 return -EISDIR; 1147 return -EISDIR;
1148 1148
1149 if (!cf->dir_info) { 1149 if (!cf->dir_info) {
1150 cf->dir_info = kmalloc(bufsize, GFP_NOFS); 1150 cf->dir_info = kmalloc(bufsize, GFP_NOFS);
1151 if (!cf->dir_info) 1151 if (!cf->dir_info)
1152 return -ENOMEM; 1152 return -ENOMEM;
1153 cf->dir_info_len = 1153 cf->dir_info_len =
1154 snprintf(cf->dir_info, bufsize, 1154 snprintf(cf->dir_info, bufsize,
1155 "entries: %20lld\n" 1155 "entries: %20lld\n"
1156 " files: %20lld\n" 1156 " files: %20lld\n"
1157 " subdirs: %20lld\n" 1157 " subdirs: %20lld\n"
1158 "rentries: %20lld\n" 1158 "rentries: %20lld\n"
1159 " rfiles: %20lld\n" 1159 " rfiles: %20lld\n"
1160 " rsubdirs: %20lld\n" 1160 " rsubdirs: %20lld\n"
1161 "rbytes: %20lld\n" 1161 "rbytes: %20lld\n"
1162 "rctime: %10ld.%09ld\n", 1162 "rctime: %10ld.%09ld\n",
1163 ci->i_files + ci->i_subdirs, 1163 ci->i_files + ci->i_subdirs,
1164 ci->i_files, 1164 ci->i_files,
1165 ci->i_subdirs, 1165 ci->i_subdirs,
1166 ci->i_rfiles + ci->i_rsubdirs, 1166 ci->i_rfiles + ci->i_rsubdirs,
1167 ci->i_rfiles, 1167 ci->i_rfiles,
1168 ci->i_rsubdirs, 1168 ci->i_rsubdirs,
1169 ci->i_rbytes, 1169 ci->i_rbytes,
1170 (long)ci->i_rctime.tv_sec, 1170 (long)ci->i_rctime.tv_sec,
1171 (long)ci->i_rctime.tv_nsec); 1171 (long)ci->i_rctime.tv_nsec);
1172 } 1172 }
1173 1173
1174 if (*ppos >= cf->dir_info_len) 1174 if (*ppos >= cf->dir_info_len)
1175 return 0; 1175 return 0;
1176 size = min_t(unsigned, size, cf->dir_info_len-*ppos); 1176 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1177 left = copy_to_user(buf, cf->dir_info + *ppos, size); 1177 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1178 if (left == size) 1178 if (left == size)
1179 return -EFAULT; 1179 return -EFAULT;
1180 *ppos += (size - left); 1180 *ppos += (size - left);
1181 return size - left; 1181 return size - left;
1182 } 1182 }
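
Seen from userspace, this means read() on a directory descriptor returns
the stats text instead of failing with EISDIR when the filesystem was
mounted with -o dirstat. A self-contained sketch of such a consumer (the
mount point path is illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[1024];
		ssize_t n;
		int fd = open("/mnt/ceph/somedir", O_RDONLY);

		if (fd < 0)
			return 1;
		/* with -o dirstat this yields the rstats formatted above */
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		close(fd);
		return 0;
	}
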
1183 1183
1184 /* 1184 /*
1185 * an fsync() on a dir will wait for any uncommitted directory 1185 * an fsync() on a dir will wait for any uncommitted directory
1186 * operations to commit. 1186 * operations to commit.
1187 */ 1187 */
1188 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, 1188 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
1189 int datasync) 1189 int datasync)
1190 { 1190 {
1191 struct inode *inode = file->f_path.dentry->d_inode; 1191 struct inode *inode = file->f_path.dentry->d_inode;
1192 struct ceph_inode_info *ci = ceph_inode(inode); 1192 struct ceph_inode_info *ci = ceph_inode(inode);
1193 struct list_head *head = &ci->i_unsafe_dirops; 1193 struct list_head *head = &ci->i_unsafe_dirops;
1194 struct ceph_mds_request *req; 1194 struct ceph_mds_request *req;
1195 u64 last_tid; 1195 u64 last_tid;
1196 int ret = 0; 1196 int ret = 0;
1197 1197
1198 dout("dir_fsync %p\n", inode); 1198 dout("dir_fsync %p\n", inode);
1199 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1199 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1200 if (ret) 1200 if (ret)
1201 return ret; 1201 return ret;
1202 mutex_lock(&inode->i_mutex); 1202 mutex_lock(&inode->i_mutex);
1203 1203
1204 spin_lock(&ci->i_unsafe_lock); 1204 spin_lock(&ci->i_unsafe_lock);
1205 if (list_empty(head)) 1205 if (list_empty(head))
1206 goto out; 1206 goto out;
1207 1207
1208 req = list_entry(head->prev, 1208 req = list_entry(head->prev,
1209 struct ceph_mds_request, r_unsafe_dir_item); 1209 struct ceph_mds_request, r_unsafe_dir_item);
1210 last_tid = req->r_tid; 1210 last_tid = req->r_tid;
1211 1211
1212 do { 1212 do {
1213 ceph_mdsc_get_request(req); 1213 ceph_mdsc_get_request(req);
1214 spin_unlock(&ci->i_unsafe_lock); 1214 spin_unlock(&ci->i_unsafe_lock);
1215 1215
1216 dout("dir_fsync %p wait on tid %llu (until %llu)\n", 1216 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1217 inode, req->r_tid, last_tid); 1217 inode, req->r_tid, last_tid);
1218 if (req->r_timeout) { 1218 if (req->r_timeout) {
1219 ret = wait_for_completion_timeout( 1219 ret = wait_for_completion_timeout(
1220 &req->r_safe_completion, req->r_timeout); 1220 &req->r_safe_completion, req->r_timeout);
1221 if (ret > 0) 1221 if (ret > 0)
1222 ret = 0; 1222 ret = 0;
1223 else if (ret == 0) 1223 else if (ret == 0)
1224 ret = -EIO; /* timed out */ 1224 ret = -EIO; /* timed out */
1225 } else { 1225 } else {
1226 wait_for_completion(&req->r_safe_completion); 1226 wait_for_completion(&req->r_safe_completion);
1227 } 1227 }
1228 ceph_mdsc_put_request(req); 1228 ceph_mdsc_put_request(req);
1229 1229
1230 spin_lock(&ci->i_unsafe_lock); 1230 spin_lock(&ci->i_unsafe_lock);
1231 if (ret || list_empty(head)) 1231 if (ret || list_empty(head))
1232 break; 1232 break;
1233 req = list_entry(head->next, 1233 req = list_entry(head->next,
1234 struct ceph_mds_request, r_unsafe_dir_item); 1234 struct ceph_mds_request, r_unsafe_dir_item);
1235 } while (req->r_tid < last_tid); 1235 } while (req->r_tid < last_tid);
1236 out: 1236 out:
1237 spin_unlock(&ci->i_unsafe_lock); 1237 spin_unlock(&ci->i_unsafe_lock);
1238 mutex_unlock(&inode->i_mutex); 1238 mutex_unlock(&inode->i_mutex);
1239 1239
1240 return ret; 1240 return ret;
1241 } 1241 }
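
In userspace terms, an fsync() on a directory file descriptor is therefore
enough to wait out pending creates, unlinks and renames in that directory;
a short sketch (path illustrative):

	int dfd = open("/mnt/ceph/somedir", O_RDONLY | O_DIRECTORY);

	/* ... create or unlink entries in the directory ... */

	/* blocks until every uncommitted dir op is safe on the MDS */
	if (dfd >= 0)
		fsync(dfd);
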
1242 1242
1243 /* 1243 /*
1244 * We maintain a private dentry LRU. 1244 * We maintain a private dentry LRU.
1245 * 1245 *
1246 * FIXME: this needs to be changed to a per-mds lru to be useful. 1246 * FIXME: this needs to be changed to a per-mds lru to be useful.
1247 */ 1247 */
1248 void ceph_dentry_lru_add(struct dentry *dn) 1248 void ceph_dentry_lru_add(struct dentry *dn)
1249 { 1249 {
1250 struct ceph_dentry_info *di = ceph_dentry(dn); 1250 struct ceph_dentry_info *di = ceph_dentry(dn);
1251 struct ceph_mds_client *mdsc; 1251 struct ceph_mds_client *mdsc;
1252 1252
1253 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1253 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1254 dn->d_name.len, dn->d_name.name); 1254 dn->d_name.len, dn->d_name.name);
1255 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1255 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1256 spin_lock(&mdsc->dentry_lru_lock); 1256 spin_lock(&mdsc->dentry_lru_lock);
1257 list_add_tail(&di->lru, &mdsc->dentry_lru); 1257 list_add_tail(&di->lru, &mdsc->dentry_lru);
1258 mdsc->num_dentry++; 1258 mdsc->num_dentry++;
1259 spin_unlock(&mdsc->dentry_lru_lock); 1259 spin_unlock(&mdsc->dentry_lru_lock);
1260 } 1260 }
1261 1261
1262 void ceph_dentry_lru_touch(struct dentry *dn) 1262 void ceph_dentry_lru_touch(struct dentry *dn)
1263 { 1263 {
1264 struct ceph_dentry_info *di = ceph_dentry(dn); 1264 struct ceph_dentry_info *di = ceph_dentry(dn);
1265 struct ceph_mds_client *mdsc; 1265 struct ceph_mds_client *mdsc;
1266 1266
1267 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1267 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1268 dn->d_name.len, dn->d_name.name, di->offset); 1268 dn->d_name.len, dn->d_name.name, di->offset);
1269 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1269 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1270 spin_lock(&mdsc->dentry_lru_lock); 1270 spin_lock(&mdsc->dentry_lru_lock);
1271 list_move_tail(&di->lru, &mdsc->dentry_lru); 1271 list_move_tail(&di->lru, &mdsc->dentry_lru);
1272 spin_unlock(&mdsc->dentry_lru_lock); 1272 spin_unlock(&mdsc->dentry_lru_lock);
1273 } 1273 }
1274 1274
1275 void ceph_dentry_lru_del(struct dentry *dn) 1275 void ceph_dentry_lru_del(struct dentry *dn)
1276 { 1276 {
1277 struct ceph_dentry_info *di = ceph_dentry(dn); 1277 struct ceph_dentry_info *di = ceph_dentry(dn);
1278 struct ceph_mds_client *mdsc; 1278 struct ceph_mds_client *mdsc;
1279 1279
1280 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1280 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1281 dn->d_name.len, dn->d_name.name); 1281 dn->d_name.len, dn->d_name.name);
1282 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1282 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1283 spin_lock(&mdsc->dentry_lru_lock); 1283 spin_lock(&mdsc->dentry_lru_lock);
1284 list_del_init(&di->lru); 1284 list_del_init(&di->lru);
1285 mdsc->num_dentry--; 1285 mdsc->num_dentry--;
1286 spin_unlock(&mdsc->dentry_lru_lock); 1286 spin_unlock(&mdsc->dentry_lru_lock);
1287 } 1287 }
1288 1288
1289 /* 1289 /*
1290 * Return name hash for a given dentry. This is dependent on 1290 * Return name hash for a given dentry. This is dependent on
1291 * the parent directory's hash function. 1291 * the parent directory's hash function.
1292 */ 1292 */
1293 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) 1293 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
1294 { 1294 {
1295 struct ceph_inode_info *dci = ceph_inode(dir); 1295 struct ceph_inode_info *dci = ceph_inode(dir);
1296 1296
1297 switch (dci->i_dir_layout.dl_dir_hash) { 1297 switch (dci->i_dir_layout.dl_dir_hash) {
1298 case 0: /* for backward compat */ 1298 case 0: /* for backward compat */
1299 case CEPH_STR_HASH_LINUX: 1299 case CEPH_STR_HASH_LINUX:
1300 return dn->d_name.hash; 1300 return dn->d_name.hash;
1301 1301
1302 default: 1302 default:
1303 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, 1303 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
1304 dn->d_name.name, dn->d_name.len); 1304 dn->d_name.name, dn->d_name.len);
1305 } 1305 }
1306 } 1306 }
1307 1307
1308 const struct file_operations ceph_dir_fops = { 1308 const struct file_operations ceph_dir_fops = {
1309 .read = ceph_read_dir, 1309 .read = ceph_read_dir,
1310 .readdir = ceph_readdir, 1310 .readdir = ceph_readdir,
1311 .llseek = ceph_dir_llseek, 1311 .llseek = ceph_dir_llseek,
1312 .open = ceph_open, 1312 .open = ceph_open,
1313 .release = ceph_release, 1313 .release = ceph_release,
1314 .unlocked_ioctl = ceph_ioctl, 1314 .unlocked_ioctl = ceph_ioctl,
1315 .fsync = ceph_dir_fsync, 1315 .fsync = ceph_dir_fsync,
1316 }; 1316 };
1317 1317
1318 const struct inode_operations ceph_dir_iops = { 1318 const struct inode_operations ceph_dir_iops = {
1319 .lookup = ceph_lookup, 1319 .lookup = ceph_lookup,
1320 .permission = ceph_permission, 1320 .permission = ceph_permission,
1321 .getattr = ceph_getattr, 1321 .getattr = ceph_getattr,
1322 .setattr = ceph_setattr, 1322 .setattr = ceph_setattr,
1323 .setxattr = ceph_setxattr, 1323 .setxattr = ceph_setxattr,
1324 .getxattr = ceph_getxattr, 1324 .getxattr = ceph_getxattr,
1325 .listxattr = ceph_listxattr, 1325 .listxattr = ceph_listxattr,
1326 .removexattr = ceph_removexattr, 1326 .removexattr = ceph_removexattr,
1327 .mknod = ceph_mknod, 1327 .mknod = ceph_mknod,
1328 .symlink = ceph_symlink, 1328 .symlink = ceph_symlink,
1329 .mkdir = ceph_mkdir, 1329 .mkdir = ceph_mkdir,
1330 .link = ceph_link, 1330 .link = ceph_link,
1331 .unlink = ceph_unlink, 1331 .unlink = ceph_unlink,
1332 .rmdir = ceph_unlink, 1332 .rmdir = ceph_unlink,
1333 .rename = ceph_rename, 1333 .rename = ceph_rename,
1334 .create = ceph_create, 1334 .create = ceph_create,
1335 .atomic_open = ceph_atomic_open, 1335 .atomic_open = ceph_atomic_open,
1336 }; 1336 };
1337 1337
1338 const struct dentry_operations ceph_dentry_ops = { 1338 const struct dentry_operations ceph_dentry_ops = {
1339 .d_revalidate = ceph_d_revalidate, 1339 .d_revalidate = ceph_d_revalidate,
1340 .d_release = ceph_d_release, 1340 .d_release = ceph_d_release,
1341 .d_prune = ceph_d_prune, 1341 .d_prune = ceph_d_prune,
1342 }; 1342 };
1343 1343
1344 const struct dentry_operations ceph_snapdir_dentry_ops = { 1344 const struct dentry_operations ceph_snapdir_dentry_ops = {
1345 .d_revalidate = ceph_snapdir_d_revalidate, 1345 .d_revalidate = ceph_snapdir_d_revalidate,
1346 .d_release = ceph_d_release, 1346 .d_release = ceph_d_release,
1347 }; 1347 };
1348 1348
1349 const struct dentry_operations ceph_snap_dentry_ops = { 1349 const struct dentry_operations ceph_snap_dentry_ops = {
1350 .d_release = ceph_d_release, 1350 .d_release = ceph_d_release,
1351 .d_prune = ceph_d_prune, 1351 .d_prune = ceph_d_prune,
1352 }; 1352 };
1353 1353
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/module.h> 3 #include <linux/module.h>
4 #include <linux/sched.h> 4 #include <linux/sched.h>
5 #include <linux/slab.h> 5 #include <linux/slab.h>
6 #include <linux/file.h> 6 #include <linux/file.h>
7 #include <linux/mount.h> 7 #include <linux/mount.h>
8 #include <linux/namei.h> 8 #include <linux/namei.h>
9 #include <linux/writeback.h> 9 #include <linux/writeback.h>
10 10
11 #include "super.h" 11 #include "super.h"
12 #include "mds_client.h" 12 #include "mds_client.h"
13 13
14 /* 14 /*
15 * Ceph file operations 15 * Ceph file operations
16 * 16 *
17 * Implement basic open/close functionality, and implement 17 * Implement basic open/close functionality, and implement
18 * read/write. 18 * read/write.
19 * 19 *
20 * We implement three modes of file I/O: 20 * We implement three modes of file I/O:
21 * - buffered uses the generic_file_aio_{read,write} helpers 21 * - buffered uses the generic_file_aio_{read,write} helpers
22 * 22 *
23 * - synchronous is used when there is multi-client read/write 23 * - synchronous is used when there is multi-client read/write
24 * sharing, avoids the page cache, and synchronously waits for an 24 * sharing, avoids the page cache, and synchronously waits for an
25 * ack from the OSD. 25 * ack from the OSD.
26 * 26 *
27 * - direct io takes the variant of the sync path that references 27 * - direct io takes the variant of the sync path that references
28 * user pages directly. 28 * user pages directly.
29 * 29 *
30 * fsync() flushes and waits on dirty pages, but just queues metadata 30 * fsync() flushes and waits on dirty pages, but just queues metadata
31 * for writeback: since the MDS can recover size and mtime there is no 31 * for writeback: since the MDS can recover size and mtime there is no
32 * need to wait for MDS acknowledgement. 32 * need to wait for MDS acknowledgement.
33 */ 33 */
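
A condensed sketch of how a read is routed between these three modes
(illustrative pseudologic only; the real dispatch lives in the read/write
entry points further down this file):

	/*
	 * if (file->f_flags & O_DIRECT)
	 *	-> direct io: sync path referencing user pages
	 * else if (the cap to cache file data is not issued)
	 *	-> synchronous path, bypassing the page cache
	 * else
	 *	-> buffered path via generic_file_aio_read()
	 */
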
34 34
35 35
36 /* 36 /*
37 * Prepare an open request. Preallocate ceph_cap to avoid an 37 * Prepare an open request. Preallocate ceph_cap to avoid an
38 * inopportune ENOMEM later. 38 * inopportune ENOMEM later.
39 */ 39 */
40 static struct ceph_mds_request * 40 static struct ceph_mds_request *
41 prepare_open_request(struct super_block *sb, int flags, int create_mode) 41 prepare_open_request(struct super_block *sb, int flags, int create_mode)
42 { 42 {
43 struct ceph_fs_client *fsc = ceph_sb_to_client(sb); 43 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
44 struct ceph_mds_client *mdsc = fsc->mdsc; 44 struct ceph_mds_client *mdsc = fsc->mdsc;
45 struct ceph_mds_request *req; 45 struct ceph_mds_request *req;
46 int want_auth = USE_ANY_MDS; 46 int want_auth = USE_ANY_MDS;
47 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 47 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
48 48
49 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) 49 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
50 want_auth = USE_AUTH_MDS; 50 want_auth = USE_AUTH_MDS;
51 51
52 req = ceph_mdsc_create_request(mdsc, op, want_auth); 52 req = ceph_mdsc_create_request(mdsc, op, want_auth);
53 if (IS_ERR(req)) 53 if (IS_ERR(req))
54 goto out; 54 goto out;
55 req->r_fmode = ceph_flags_to_mode(flags); 55 req->r_fmode = ceph_flags_to_mode(flags);
56 req->r_args.open.flags = cpu_to_le32(flags); 56 req->r_args.open.flags = cpu_to_le32(flags);
57 req->r_args.open.mode = cpu_to_le32(create_mode); 57 req->r_args.open.mode = cpu_to_le32(create_mode);
58 out: 58 out:
59 return req; 59 return req;
60 } 60 }
61 61
62 /* 62 /*
63 * initialize private struct file data. 63 * initialize private struct file data.
64 * if we fail, clean up by dropping fmode reference on the ceph_inode 64 * if we fail, clean up by dropping fmode reference on the ceph_inode
65 */ 65 */
66 static int ceph_init_file(struct inode *inode, struct file *file, int fmode) 66 static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
67 { 67 {
68 struct ceph_file_info *cf; 68 struct ceph_file_info *cf;
69 int ret = 0; 69 int ret = 0;
70 70
71 switch (inode->i_mode & S_IFMT) { 71 switch (inode->i_mode & S_IFMT) {
72 case S_IFREG: 72 case S_IFREG:
73 case S_IFDIR: 73 case S_IFDIR:
74 dout("init_file %p %p 0%o (regular)\n", inode, file, 74 dout("init_file %p %p 0%o (regular)\n", inode, file,
75 inode->i_mode); 75 inode->i_mode);
76 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); 76 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
77 if (cf == NULL) { 77 if (cf == NULL) {
78 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 78 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
79 return -ENOMEM; 79 return -ENOMEM;
80 } 80 }
81 cf->fmode = fmode; 81 cf->fmode = fmode;
82 cf->next_offset = 2; 82 cf->next_offset = 2;
83 file->private_data = cf; 83 file->private_data = cf;
84 BUG_ON(inode->i_fop->release != ceph_release); 84 BUG_ON(inode->i_fop->release != ceph_release);
85 break; 85 break;
86 86
87 case S_IFLNK: 87 case S_IFLNK:
88 dout("init_file %p %p 0%o (symlink)\n", inode, file, 88 dout("init_file %p %p 0%o (symlink)\n", inode, file,
89 inode->i_mode); 89 inode->i_mode);
90 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 90 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
91 break; 91 break;
92 92
93 default: 93 default:
94 dout("init_file %p %p 0%o (special)\n", inode, file, 94 dout("init_file %p %p 0%o (special)\n", inode, file,
95 inode->i_mode); 95 inode->i_mode);
96 /* 96 /*
97 * we need to drop the open ref now, since we don't 97 * we need to drop the open ref now, since we don't
98 * have .release set to ceph_release. 98 * have .release set to ceph_release.
99 */ 99 */
100 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 100 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
101 BUG_ON(inode->i_fop->release == ceph_release); 101 BUG_ON(inode->i_fop->release == ceph_release);
102 102
103 /* call the proper open fop */ 103 /* call the proper open fop */
104 ret = inode->i_fop->open(inode, file); 104 ret = inode->i_fop->open(inode, file);
105 } 105 }
106 return ret; 106 return ret;
107 } 107 }
108 108
109 /* 109 /*
110 * If we already have the requisite capabilities, we can satisfy 110 * If we already have the requisite capabilities, we can satisfy
111 * the open request locally (no need to request new caps from the 111 * the open request locally (no need to request new caps from the
112 * MDS). We do, however, need to inform the MDS (asynchronously) 112 * MDS). We do, however, need to inform the MDS (asynchronously)
113 * if our wanted caps set expands. 113 * if our wanted caps set expands.
114 */ 114 */
115 int ceph_open(struct inode *inode, struct file *file) 115 int ceph_open(struct inode *inode, struct file *file)
116 { 116 {
117 struct ceph_inode_info *ci = ceph_inode(inode); 117 struct ceph_inode_info *ci = ceph_inode(inode);
118 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 118 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
119 struct ceph_mds_client *mdsc = fsc->mdsc; 119 struct ceph_mds_client *mdsc = fsc->mdsc;
120 struct ceph_mds_request *req; 120 struct ceph_mds_request *req;
121 struct ceph_file_info *cf = file->private_data; 121 struct ceph_file_info *cf = file->private_data;
122 struct inode *parent_inode = NULL; 122 struct inode *parent_inode = NULL;
123 int err; 123 int err;
124 int flags, fmode, wanted; 124 int flags, fmode, wanted;
125 125
126 if (cf) { 126 if (cf) {
127 dout("open file %p is already opened\n", file); 127 dout("open file %p is already opened\n", file);
128 return 0; 128 return 0;
129 } 129 }
130 130
131 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ 131 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
132 flags = file->f_flags & ~(O_CREAT|O_EXCL); 132 flags = file->f_flags & ~(O_CREAT|O_EXCL);
133 if (S_ISDIR(inode->i_mode)) 133 if (S_ISDIR(inode->i_mode))
134 flags = O_DIRECTORY; /* mds likes to know */ 134 flags = O_DIRECTORY; /* mds likes to know */
135 135
136 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, 136 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
137 ceph_vinop(inode), file, flags, file->f_flags); 137 ceph_vinop(inode), file, flags, file->f_flags);
138 fmode = ceph_flags_to_mode(flags); 138 fmode = ceph_flags_to_mode(flags);
139 wanted = ceph_caps_for_mode(fmode); 139 wanted = ceph_caps_for_mode(fmode);
140 140
141 /* snapped files are read-only */ 141 /* snapped files are read-only */
142 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) 142 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
143 return -EROFS; 143 return -EROFS;
144 144
145 /* trivially open snapdir */ 145 /* trivially open snapdir */
146 if (ceph_snap(inode) == CEPH_SNAPDIR) { 146 if (ceph_snap(inode) == CEPH_SNAPDIR) {
147 spin_lock(&ci->i_ceph_lock); 147 spin_lock(&ci->i_ceph_lock);
148 __ceph_get_fmode(ci, fmode); 148 __ceph_get_fmode(ci, fmode);
149 spin_unlock(&ci->i_ceph_lock); 149 spin_unlock(&ci->i_ceph_lock);
150 return ceph_init_file(inode, file, fmode); 150 return ceph_init_file(inode, file, fmode);
151 } 151 }
152 152
153 /* 153 /*
154 * No need to block if we have caps on the auth MDS (for 154 * No need to block if we have caps on the auth MDS (for
155 * write) or any MDS (for read). Update wanted set 155 * write) or any MDS (for read). Update wanted set
156 * asynchronously. 156 * asynchronously.
157 */ 157 */
158 spin_lock(&ci->i_ceph_lock); 158 spin_lock(&ci->i_ceph_lock);
159 if (__ceph_is_any_real_caps(ci) && 159 if (__ceph_is_any_real_caps(ci) &&
160 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { 160 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci); 161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL); 162 int issued = __ceph_caps_issued(ci, NULL);
163 163
164 dout("open %p fmode %d want %s issued %s using existing\n", 164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted), 165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued)); 166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode); 167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&ci->i_ceph_lock); 168 spin_unlock(&ci->i_ceph_lock);
169 169
170 /* adjust wanted? */ 170 /* adjust wanted? */
171 if ((issued & wanted) != wanted && 171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted && 172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR) 173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL); 174 ceph_check_caps(ci, 0, NULL);
175 175
176 return ceph_init_file(inode, file, fmode); 176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP && 177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) { 178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode); 179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&ci->i_ceph_lock); 180 spin_unlock(&ci->i_ceph_lock);
181 return ceph_init_file(inode, file, fmode); 181 return ceph_init_file(inode, file, fmode);
182 } 182 }
183 spin_unlock(&ci->i_ceph_lock); 183 spin_unlock(&ci->i_ceph_lock);
184 184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); 185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0); 186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) { 187 if (IS_ERR(req)) {
188 err = PTR_ERR(req); 188 err = PTR_ERR(req);
189 goto out; 189 goto out;
190 } 190 }
191 req->r_inode = inode; 191 req->r_inode = inode;
192 ihold(inode); 192 ihold(inode);
193 req->r_num_caps = 1; 193 req->r_num_caps = 1;
194 if (flags & (O_CREAT|O_TRUNC)) 194 if (flags & (O_CREAT|O_TRUNC))
195 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 195 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
196 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 196 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
197 iput(parent_inode); 197 iput(parent_inode);
198 if (!err) 198 if (!err)
199 err = ceph_init_file(inode, file, req->r_fmode); 199 err = ceph_init_file(inode, file, req->r_fmode);
200 ceph_mdsc_put_request(req); 200 ceph_mdsc_put_request(req);
201 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); 201 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
202 out: 202 out:
203 return err; 203 return err;
204 } 204 }
205 205
206 206
207 /* 207 /*
208 * Do a lookup + open with a single request. If we get a non-existent 208 * Do a lookup + open with a single request. If we get a non-existent
209 * file or symlink, return 1 so the VFS can retry. 209 * file or symlink, return 1 so the VFS can retry.
210 */ 210 */
211 int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 211 int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
212 struct file *file, unsigned flags, umode_t mode, 212 struct file *file, unsigned flags, umode_t mode,
213 int *opened) 213 int *opened)
214 { 214 {
215 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 215 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
216 struct ceph_mds_client *mdsc = fsc->mdsc; 216 struct ceph_mds_client *mdsc = fsc->mdsc;
217 struct ceph_mds_request *req; 217 struct ceph_mds_request *req;
218 struct dentry *dn; 218 struct dentry *dn;
219 int err; 219 int err;
220 220
221 dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", 221 dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
222 dir, dentry, dentry->d_name.len, dentry->d_name.name, 222 dir, dentry, dentry->d_name.len, dentry->d_name.name,
223 d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); 223 d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
224 224
225 if (dentry->d_name.len > NAME_MAX) 225 if (dentry->d_name.len > NAME_MAX)
226 return -ENAMETOOLONG; 226 return -ENAMETOOLONG;
227 227
228 err = ceph_init_dentry(dentry); 228 err = ceph_init_dentry(dentry);
229 if (err < 0) 229 if (err < 0)
230 return err; 230 return err;
231 231
232 /* do the open */ 232 /* do the open */
233 req = prepare_open_request(dir->i_sb, flags, mode); 233 req = prepare_open_request(dir->i_sb, flags, mode);
234 if (IS_ERR(req)) 234 if (IS_ERR(req))
235 return PTR_ERR(req); 235 return PTR_ERR(req);
236 req->r_dentry = dget(dentry); 236 req->r_dentry = dget(dentry);
237 req->r_num_caps = 2; 237 req->r_num_caps = 2;
238 if (flags & O_CREAT) { 238 if (flags & O_CREAT) {
239 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 239 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
240 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 240 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
241 } 241 }
242 req->r_locked_dir = dir; /* caller holds dir->i_mutex */ 242 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
243 err = ceph_mdsc_do_request(mdsc, 243 err = ceph_mdsc_do_request(mdsc,
244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
245 req); 245 req);
246 err = ceph_handle_snapdir(req, dentry, err); 246 err = ceph_handle_snapdir(req, dentry, err);
247 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 247 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
248 err = ceph_handle_notrace_create(dir, dentry); 248 err = ceph_handle_notrace_create(dir, dentry);
249 249
250 if (d_unhashed(dentry)) { 250 if (d_unhashed(dentry)) {
251 dn = ceph_finish_lookup(req, dentry, err); 251 dn = ceph_finish_lookup(req, dentry, err);
252 if (IS_ERR(dn)) 252 if (IS_ERR(dn))
253 err = PTR_ERR(dn); 253 err = PTR_ERR(dn);
254 } else { 254 } else {
255 /* we were given a hashed negative dentry */ 255 /* we were given a hashed negative dentry */
256 dn = NULL; 256 dn = NULL;
257 } 257 }
258 if (err) 258 if (err)
259 goto out_err; 259 goto out_err;
260 if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { 260 if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
261 /* make vfs retry on splice, ENOENT, or symlink */ 261 /* make vfs retry on splice, ENOENT, or symlink */
262 dout("atomic_open finish_no_open on dn %p\n", dn); 262 dout("atomic_open finish_no_open on dn %p\n", dn);
263 err = finish_no_open(file, dn); 263 err = finish_no_open(file, dn);
264 } else { 264 } else {
265 dout("atomic_open finish_open on dn %p\n", dn); 265 dout("atomic_open finish_open on dn %p\n", dn);
266 err = finish_open(file, dentry, ceph_open, opened); 266 err = finish_open(file, dentry, ceph_open, opened);
267 } 267 }
268 268
269 out_err: 269 out_err:
270 ceph_mdsc_put_request(req); 270 ceph_mdsc_put_request(req);
271 dout("atomic_open result=%d\n", err); 271 dout("atomic_open result=%d\n", err);
272 return err; 272 return err;
273 } 273 }
274 274
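An aside on the function above: ->atomic_open lets the VFS do the lookup and the open (and, with O_CREAT, the create) in one MDS round trip. A minimal user-space sketch of the behavior it serves, assuming a hypothetical path, a single open(2) call that looks up and creates the name together:

#include <fcntl.h>

/* Hedged illustration, not part of this commit: O_CREAT|O_EXCL makes
 * the create exclusive (EEXIST if the name already exists), so lookup
 * and create cannot race with each other from the caller's view. */
static int create_exclusive(const char *path)
{
	return open(path, O_CREAT | O_EXCL | O_WRONLY, 0644);
}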
275 int ceph_release(struct inode *inode, struct file *file) 275 int ceph_release(struct inode *inode, struct file *file)
276 { 276 {
277 struct ceph_inode_info *ci = ceph_inode(inode); 277 struct ceph_inode_info *ci = ceph_inode(inode);
278 struct ceph_file_info *cf = file->private_data; 278 struct ceph_file_info *cf = file->private_data;
279 279
280 dout("release inode %p file %p\n", inode, file); 280 dout("release inode %p file %p\n", inode, file);
281 ceph_put_fmode(ci, cf->fmode); 281 ceph_put_fmode(ci, cf->fmode);
282 if (cf->last_readdir) 282 if (cf->last_readdir)
283 ceph_mdsc_put_request(cf->last_readdir); 283 ceph_mdsc_put_request(cf->last_readdir);
284 kfree(cf->last_name); 284 kfree(cf->last_name);
285 kfree(cf->dir_info); 285 kfree(cf->dir_info);
286 dput(cf->dentry); 286 dput(cf->dentry);
287 kmem_cache_free(ceph_file_cachep, cf); 287 kmem_cache_free(ceph_file_cachep, cf);
288 288
289 /* wake up anyone waiting for caps on this inode */ 289 /* wake up anyone waiting for caps on this inode */
290 wake_up_all(&ci->i_cap_wq); 290 wake_up_all(&ci->i_cap_wq);
291 return 0; 291 return 0;
292 } 292 }
293 293
294 /* 294 /*
295 * Read a range of bytes striped over one or more objects. Iterate over 295 * Read a range of bytes striped over one or more objects. Iterate over
296 * objects we stripe over. (That's not atomic, but good enough for now.) 296 * objects we stripe over. (That's not atomic, but good enough for now.)
297 * 297 *
298 * If we get a short result from the OSD, check against i_size; we need to 298 * If we get a short result from the OSD, check against i_size; we need to
299 * only return a short read to the caller if we hit EOF. 299 * only return a short read to the caller if we hit EOF.
300 */ 300 */
301 static int striped_read(struct inode *inode, 301 static int striped_read(struct inode *inode,
302 u64 off, u64 len, 302 u64 off, u64 len,
303 struct page **pages, int num_pages, 303 struct page **pages, int num_pages,
304 int *checkeof, bool o_direct, 304 int *checkeof, bool o_direct,
305 unsigned long buf_align) 305 unsigned long buf_align)
306 { 306 {
307 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 307 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
308 struct ceph_inode_info *ci = ceph_inode(inode); 308 struct ceph_inode_info *ci = ceph_inode(inode);
309 u64 pos, this_len; 309 u64 pos, this_len;
310 int io_align, page_align; 310 int io_align, page_align;
311 int left, pages_left; 311 int left, pages_left;
312 int read; 312 int read;
313 struct page **page_pos; 313 struct page **page_pos;
314 int ret; 314 int ret;
315 bool hit_stripe, was_short; 315 bool hit_stripe, was_short;
316 316
317 /* 317 /*
318 * we may need to do multiple reads. not atomic, unfortunately. 318 * we may need to do multiple reads. not atomic, unfortunately.
319 */ 319 */
320 pos = off; 320 pos = off;
321 left = len; 321 left = len;
322 page_pos = pages; 322 page_pos = pages;
323 pages_left = num_pages; 323 pages_left = num_pages;
324 read = 0; 324 read = 0;
325 io_align = off & ~PAGE_MASK; 325 io_align = off & ~PAGE_MASK;
326 326
327 more: 327 more:
328 if (o_direct) 328 if (o_direct)
329 page_align = (pos - io_align + buf_align) & ~PAGE_MASK; 329 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
330 else 330 else
331 page_align = pos & ~PAGE_MASK; 331 page_align = pos & ~PAGE_MASK;
332 this_len = left; 332 this_len = left;
333 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 333 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
334 &ci->i_layout, pos, &this_len, 334 &ci->i_layout, pos, &this_len,
335 ci->i_truncate_seq, 335 ci->i_truncate_seq,
336 ci->i_truncate_size, 336 ci->i_truncate_size,
337 page_pos, pages_left, page_align); 337 page_pos, pages_left, page_align);
338 if (ret == -ENOENT) 338 if (ret == -ENOENT)
339 ret = 0; 339 ret = 0;
340 hit_stripe = this_len < left; 340 hit_stripe = this_len < left;
341 was_short = ret >= 0 && ret < this_len; 341 was_short = ret >= 0 && ret < this_len;
342 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, 342 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
343 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); 343 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
344 344
345 if (ret > 0) { 345 if (ret > 0) {
346 int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; 346 int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
347 347
348 if (read < pos - off) { 348 if (read < pos - off) {
349 dout(" zero gap %llu to %llu\n", off + read, pos); 349 dout(" zero gap %llu to %llu\n", off + read, pos);
350 ceph_zero_page_vector_range(page_align + read, 350 ceph_zero_page_vector_range(page_align + read,
351 pos - off - read, pages); 351 pos - off - read, pages);
352 } 352 }
353 pos += ret; 353 pos += ret;
354 read = pos - off; 354 read = pos - off;
355 left -= ret; 355 left -= ret;
356 page_pos += didpages; 356 page_pos += didpages;
357 pages_left -= didpages; 357 pages_left -= didpages;
358 358
359 /* hit stripe? */ 359 /* hit stripe? */
360 if (left && hit_stripe) 360 if (left && hit_stripe)
361 goto more; 361 goto more;
362 } 362 }
363 363
364 if (was_short) { 364 if (was_short) {
365 /* did we bounce off eof? */ 365 /* did we bounce off eof? */
366 if (pos + left > inode->i_size) 366 if (pos + left > inode->i_size)
367 *checkeof = 1; 367 *checkeof = 1;
368 368
369 /* zero trailing bytes (inside i_size) */ 369 /* zero trailing bytes (inside i_size) */
370 if (left > 0 && pos < inode->i_size) { 370 if (left > 0 && pos < inode->i_size) {
371 if (pos + left > inode->i_size) 371 if (pos + left > inode->i_size)
372 left = inode->i_size - pos; 372 left = inode->i_size - pos;
373 373
374 dout("zero tail %d\n", left); 374 dout("zero tail %d\n", left);
375 ceph_zero_page_vector_range(page_align + read, left, 375 ceph_zero_page_vector_range(page_align + read, left,
376 pages); 376 pages);
377 read += left; 377 read += left;
378 } 378 }
379 } 379 }
380 380
381 if (ret >= 0) 381 if (ret >= 0)
382 ret = read; 382 ret = read;
383 dout("striped_read returns %d\n", ret); 383 dout("striped_read returns %d\n", ret);
384 return ret; 384 return ret;
385 } 385 }
386 386
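The splitting logic above is easier to see in isolation. A hedged sketch under the simplifying assumption of a fixed stripe unit 'su'; in the real code the trimmed extent length comes back through ceph_osdc_readpages(), which shortens this_len at the object boundary:

#include <stdint.h>

/* How much a single OSD read can cover before hitting the stripe
 * edge; the caller loops (the "more:" label above) until 'left' is 0. */
static uint64_t chunk_len(uint64_t pos, uint64_t left, uint64_t su)
{
	uint64_t room = su - (pos % su);	/* bytes to the object boundary */
	return left < room ? left : room;
}

/* e.g. with su = 4 MiB, a 6 MiB read at offset 3 MiB splits into
 * chunks of 1 MiB, 4 MiB, and 1 MiB. */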
387 /* 387 /*
388 * Completely synchronous read and write methods. Direct from __user 388 * Completely synchronous read and write methods. Direct from __user
389 * buffer to osd, or directly to user pages (if O_DIRECT). 389 * buffer to osd, or directly to user pages (if O_DIRECT).
390 * 390 *
391 * If the read spans object boundary, just do multiple reads. 391 * If the read spans object boundary, just do multiple reads.
392 */ 392 */
393 static ssize_t ceph_sync_read(struct file *file, char __user *data, 393 static ssize_t ceph_sync_read(struct file *file, char __user *data,
394 unsigned len, loff_t *poff, int *checkeof) 394 unsigned len, loff_t *poff, int *checkeof)
395 { 395 {
396 struct inode *inode = file->f_dentry->d_inode; 396 struct inode *inode = file->f_dentry->d_inode;
397 struct page **pages; 397 struct page **pages;
398 u64 off = *poff; 398 u64 off = *poff;
399 int num_pages, ret; 399 int num_pages, ret;
400 400
401 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 401 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
402 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 402 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
403 403
404 if (file->f_flags & O_DIRECT) { 404 if (file->f_flags & O_DIRECT) {
405 num_pages = calc_pages_for((unsigned long)data, len); 405 num_pages = calc_pages_for((unsigned long)data, len);
406 pages = ceph_get_direct_page_vector(data, num_pages, true); 406 pages = ceph_get_direct_page_vector(data, num_pages, true);
407 } else { 407 } else {
408 num_pages = calc_pages_for(off, len); 408 num_pages = calc_pages_for(off, len);
409 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 409 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
410 } 410 }
411 if (IS_ERR(pages)) 411 if (IS_ERR(pages))
412 return PTR_ERR(pages); 412 return PTR_ERR(pages);
413 413
414 /* 414 /*
415 * flush any page cache pages in this range. this 415 * flush any page cache pages in this range. this
416 * will make concurrent normal and sync io slow, 416 * will make concurrent normal and sync io slow,
417 * but it will at least behave sensibly when they are 417 * but it will at least behave sensibly when they are
418 * in sequence. 418 * in sequence.
419 */ 419 */
420 ret = filemap_write_and_wait(inode->i_mapping); 420 ret = filemap_write_and_wait(inode->i_mapping);
421 if (ret < 0) 421 if (ret < 0)
422 goto done; 422 goto done;
423 423
424 ret = striped_read(inode, off, len, pages, num_pages, checkeof, 424 ret = striped_read(inode, off, len, pages, num_pages, checkeof,
425 file->f_flags & O_DIRECT, 425 file->f_flags & O_DIRECT,
426 (unsigned long)data & ~PAGE_MASK); 426 (unsigned long)data & ~PAGE_MASK);
427 427
428 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 428 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
429 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 429 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
430 if (ret >= 0) 430 if (ret >= 0)
431 *poff = off + ret; 431 *poff = off + ret;
432 432
433 done: 433 done:
434 if (file->f_flags & O_DIRECT) 434 if (file->f_flags & O_DIRECT)
435 ceph_put_page_vector(pages, num_pages, true); 435 ceph_put_page_vector(pages, num_pages, true);
436 else 436 else
437 ceph_release_page_vector(pages, num_pages); 437 ceph_release_page_vector(pages, num_pages);
438 dout("sync_read result %d\n", ret); 438 dout("sync_read result %d\n", ret);
439 return ret; 439 return ret;
440 } 440 }
441 441
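Both the O_DIRECT and buffered paths above size their page vectors with calc_pages_for(). A hedged analogue of that arithmetic, assuming 4 KiB pages: the count is the number of page frames the byte range [off, off+len) touches, including partial first and last pages:

#include <stdint.h>

#define ASSUMED_PAGE_SIZE 4096u	/* illustration only */

static uint32_t pages_spanned(uint64_t off, uint64_t len)
{
	if (len == 0)
		return 0;
	uint64_t first = off / ASSUMED_PAGE_SIZE;
	uint64_t last = (off + len - 1) / ASSUMED_PAGE_SIZE;
	return (uint32_t)(last - first + 1);
}

/* pages_spanned(4095, 2) == 2: one byte in each of two pages. */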
442 /* 442 /*
443 * Write commit callback, called if we requested both an ACK and 443 * Write commit callback, called if we requested both an ACK and
444 * ONDISK commit reply from the OSD. 444 * ONDISK commit reply from the OSD.
445 */ 445 */
446 static void sync_write_commit(struct ceph_osd_request *req, 446 static void sync_write_commit(struct ceph_osd_request *req,
447 struct ceph_msg *msg) 447 struct ceph_msg *msg)
448 { 448 {
449 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 449 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
450 450
451 dout("sync_write_commit %p tid %llu\n", req, req->r_tid); 451 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
452 spin_lock(&ci->i_unsafe_lock); 452 spin_lock(&ci->i_unsafe_lock);
453 list_del_init(&req->r_unsafe_item); 453 list_del_init(&req->r_unsafe_item);
454 spin_unlock(&ci->i_unsafe_lock); 454 spin_unlock(&ci->i_unsafe_lock);
455 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); 455 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
456 } 456 }
457 457
458 /* 458 /*
459 * Synchronous write, straight from __user pointer or user pages (if 459 * Synchronous write, straight from __user pointer or user pages (if
460 * O_DIRECT). 460 * O_DIRECT).
461 * 461 *
462 * If write spans object boundary, just do multiple writes. (For a 462 * If write spans object boundary, just do multiple writes. (For a
463 * correct atomic write, we should e.g. take write locks on all 463 * correct atomic write, we should e.g. take write locks on all
464 * objects, rollback on failure, etc.) 464 * objects, rollback on failure, etc.)
465 */ 465 */
466 static ssize_t ceph_sync_write(struct file *file, const char __user *data, 466 static ssize_t ceph_sync_write(struct file *file, const char __user *data,
467 size_t left, loff_t *offset) 467 size_t left, loff_t *offset)
468 { 468 {
469 struct inode *inode = file->f_dentry->d_inode; 469 struct inode *inode = file->f_dentry->d_inode;
470 struct ceph_inode_info *ci = ceph_inode(inode); 470 struct ceph_inode_info *ci = ceph_inode(inode);
471 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 471 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
472 struct ceph_osd_request *req; 472 struct ceph_osd_request *req;
473 struct page **pages; 473 struct page **pages;
474 int num_pages; 474 int num_pages;
475 long long unsigned pos; 475 long long unsigned pos;
476 u64 len; 476 u64 len;
477 int written = 0; 477 int written = 0;
478 int flags; 478 int flags;
479 int do_sync = 0; 479 int do_sync = 0;
480 int check_caps = 0; 480 int check_caps = 0;
481 int page_align, io_align; 481 int page_align, io_align;
482 unsigned long buf_align; 482 unsigned long buf_align;
483 int ret; 483 int ret;
484 struct timespec mtime = CURRENT_TIME; 484 struct timespec mtime = CURRENT_TIME;
485 485
486 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) 486 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
487 return -EROFS; 487 return -EROFS;
488 488
489 dout("sync_write on file %p %lld~%u %s\n", file, *offset, 489 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
490 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 490 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
491 491
492 if (file->f_flags & O_APPEND) 492 if (file->f_flags & O_APPEND)
493 pos = i_size_read(inode); 493 pos = i_size_read(inode);
494 else 494 else
495 pos = *offset; 495 pos = *offset;
496 496
497 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 497 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
498 if (ret < 0) 498 if (ret < 0)
499 return ret; 499 return ret;
500 500
501 ret = invalidate_inode_pages2_range(inode->i_mapping, 501 ret = invalidate_inode_pages2_range(inode->i_mapping,
502 pos >> PAGE_CACHE_SHIFT, 502 pos >> PAGE_CACHE_SHIFT,
503 (pos + left) >> PAGE_CACHE_SHIFT); 503 (pos + left) >> PAGE_CACHE_SHIFT);
504 if (ret < 0) 504 if (ret < 0)
505 dout("invalidate_inode_pages2_range returned %d\n", ret); 505 dout("invalidate_inode_pages2_range returned %d\n", ret);
506 506
507 flags = CEPH_OSD_FLAG_ORDERSNAP | 507 flags = CEPH_OSD_FLAG_ORDERSNAP |
508 CEPH_OSD_FLAG_ONDISK | 508 CEPH_OSD_FLAG_ONDISK |
509 CEPH_OSD_FLAG_WRITE; 509 CEPH_OSD_FLAG_WRITE;
510 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) 510 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
511 flags |= CEPH_OSD_FLAG_ACK; 511 flags |= CEPH_OSD_FLAG_ACK;
512 else 512 else
513 do_sync = 1; 513 do_sync = 1;
514 514
515 /* 515 /*
516 * we may need to do multiple writes here if we span an object 516 * we may need to do multiple writes here if we span an object
517 * boundary. this isn't atomic, unfortunately. :( 517 * boundary. this isn't atomic, unfortunately. :(
518 */ 518 */
519 more: 519 more:
520 io_align = pos & ~PAGE_MASK; 520 io_align = pos & ~PAGE_MASK;
521 buf_align = (unsigned long)data & ~PAGE_MASK; 521 buf_align = (unsigned long)data & ~PAGE_MASK;
522 len = left; 522 len = left;
523 if (file->f_flags & O_DIRECT) { 523 if (file->f_flags & O_DIRECT) {
524 /* write from beginning of first page, regardless of 524 /* write from beginning of first page, regardless of
525 io alignment */ 525 io alignment */
526 page_align = (pos - io_align + buf_align) & ~PAGE_MASK; 526 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
527 num_pages = calc_pages_for((unsigned long)data, len); 527 num_pages = calc_pages_for((unsigned long)data, len);
528 } else { 528 } else {
529 page_align = pos & ~PAGE_MASK; 529 page_align = pos & ~PAGE_MASK;
530 num_pages = calc_pages_for(pos, len); 530 num_pages = calc_pages_for(pos, len);
531 } 531 }
532 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 532 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
533 ceph_vino(inode), pos, &len, 533 ceph_vino(inode), pos, &len,
534 CEPH_OSD_OP_WRITE, flags, 534 CEPH_OSD_OP_WRITE, flags,
535 ci->i_snap_realm->cached_context, 535 ci->i_snap_realm->cached_context,
536 do_sync, 536 do_sync,
537 ci->i_truncate_seq, ci->i_truncate_size, 537 ci->i_truncate_seq, ci->i_truncate_size,
538 &mtime, false, 2, page_align); 538 &mtime, false, 2, page_align);
539 if (IS_ERR(req)) 539 if (IS_ERR(req))
540 return PTR_ERR(req); 540 return PTR_ERR(req);
541 541
542 if (file->f_flags & O_DIRECT) { 542 if (file->f_flags & O_DIRECT) {
543 pages = ceph_get_direct_page_vector(data, num_pages, false); 543 pages = ceph_get_direct_page_vector(data, num_pages, false);
544 if (IS_ERR(pages)) { 544 if (IS_ERR(pages)) {
545 ret = PTR_ERR(pages); 545 ret = PTR_ERR(pages);
546 goto out; 546 goto out;
547 } 547 }
548 548
549 /* 549 /*
550 * throw out any page cache pages in this range. this 550 * throw out any page cache pages in this range. this
551 * may block. 551 * may block.
552 */ 552 */
553 truncate_inode_pages_range(inode->i_mapping, pos, 553 truncate_inode_pages_range(inode->i_mapping, pos,
554 (pos+len) | (PAGE_CACHE_SIZE-1)); 554 (pos+len) | (PAGE_CACHE_SIZE-1));
555 } else { 555 } else {
556 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 556 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
557 if (IS_ERR(pages)) { 557 if (IS_ERR(pages)) {
558 ret = PTR_ERR(pages); 558 ret = PTR_ERR(pages);
559 goto out; 559 goto out;
560 } 560 }
561 ret = ceph_copy_user_to_page_vector(pages, data, pos, len); 561 ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
562 if (ret < 0) { 562 if (ret < 0) {
563 ceph_release_page_vector(pages, num_pages); 563 ceph_release_page_vector(pages, num_pages);
564 goto out; 564 goto out;
565 } 565 }
566 566
567 if ((file->f_flags & O_SYNC) == 0) { 567 if ((file->f_flags & O_SYNC) == 0) {
568 /* get a second commit callback */ 568 /* get a second commit callback */
569 req->r_safe_callback = sync_write_commit; 569 req->r_safe_callback = sync_write_commit;
570 req->r_own_pages = 1; 570 req->r_own_pages = 1;
571 } 571 }
572 } 572 }
573 req->r_pages = pages; 573 req->r_pages = pages;
574 req->r_num_pages = num_pages; 574 req->r_num_pages = num_pages;
575 req->r_inode = inode; 575 req->r_inode = inode;
576 576
577 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 577 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
578 if (!ret) { 578 if (!ret) {
579 if (req->r_safe_callback) { 579 if (req->r_safe_callback) {
580 /* 580 /*
581 * Add to inode unsafe list only after we 581 * Add to inode unsafe list only after we
582 * start_request so that a tid has been assigned. 582 * start_request so that a tid has been assigned.
583 */ 583 */
584 spin_lock(&ci->i_unsafe_lock); 584 spin_lock(&ci->i_unsafe_lock);
585 list_add_tail(&req->r_unsafe_item, 585 list_add_tail(&req->r_unsafe_item,
586 &ci->i_unsafe_writes); 586 &ci->i_unsafe_writes);
587 spin_unlock(&ci->i_unsafe_lock); 587 spin_unlock(&ci->i_unsafe_lock);
588 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 588 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
589 } 589 }
590 590
591 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 591 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
592 if (ret < 0 && req->r_safe_callback) { 592 if (ret < 0 && req->r_safe_callback) {
593 spin_lock(&ci->i_unsafe_lock); 593 spin_lock(&ci->i_unsafe_lock);
594 list_del_init(&req->r_unsafe_item); 594 list_del_init(&req->r_unsafe_item);
595 spin_unlock(&ci->i_unsafe_lock); 595 spin_unlock(&ci->i_unsafe_lock);
596 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); 596 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
597 } 597 }
598 } 598 }
599 599
600 if (file->f_flags & O_DIRECT) 600 if (file->f_flags & O_DIRECT)
601 ceph_put_page_vector(pages, num_pages, false); 601 ceph_put_page_vector(pages, num_pages, false);
602 else if (file->f_flags & O_SYNC) 602 else if (file->f_flags & O_SYNC)
603 ceph_release_page_vector(pages, num_pages); 603 ceph_release_page_vector(pages, num_pages);
604 604
605 out: 605 out:
606 ceph_osdc_put_request(req); 606 ceph_osdc_put_request(req);
607 if (ret == 0) { 607 if (ret == 0) {
608 pos += len; 608 pos += len;
609 written += len; 609 written += len;
610 left -= len; 610 left -= len;
611 data += written; 611 data += written;
612 if (left) 612 if (left)
613 goto more; 613 goto more;
614 614
615 ret = written; 615 ret = written;
616 *offset = pos; 616 *offset = pos;
617 if (pos > i_size_read(inode)) 617 if (pos > i_size_read(inode))
618 check_caps = ceph_inode_set_size(inode, pos); 618 check_caps = ceph_inode_set_size(inode, pos);
619 if (check_caps) 619 if (check_caps)
620 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, 620 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
621 NULL); 621 NULL);
622 } 622 }
623 return ret; 623 return ret;
624 } 624 }
625 625
626 /* 626 /*
627 * Wrap generic_file_aio_read with checks for cap bits on the inode. 627 * Wrap generic_file_aio_read with checks for cap bits on the inode.
628 * Atomically grab references, so that those bits are not released 628 * Atomically grab references, so that those bits are not released
629 * back to the MDS mid-read. 629 * back to the MDS mid-read.
630 * 630 *
631 * Hmm, the sync read case isn't actually async... should it be? 631 * Hmm, the sync read case isn't actually async... should it be?
632 */ 632 */
633 static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, 633 static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
634 unsigned long nr_segs, loff_t pos) 634 unsigned long nr_segs, loff_t pos)
635 { 635 {
636 struct file *filp = iocb->ki_filp; 636 struct file *filp = iocb->ki_filp;
637 struct ceph_file_info *fi = filp->private_data; 637 struct ceph_file_info *fi = filp->private_data;
638 loff_t *ppos = &iocb->ki_pos; 638 loff_t *ppos = &iocb->ki_pos;
639 size_t len = iov->iov_len; 639 size_t len = iov->iov_len;
640 struct inode *inode = filp->f_dentry->d_inode; 640 struct inode *inode = filp->f_dentry->d_inode;
641 struct ceph_inode_info *ci = ceph_inode(inode); 641 struct ceph_inode_info *ci = ceph_inode(inode);
642 void __user *base = iov->iov_base; 642 void __user *base = iov->iov_base;
643 ssize_t ret; 643 ssize_t ret;
644 int want, got = 0; 644 int want, got = 0;
645 int checkeof = 0, read = 0; 645 int checkeof = 0, read = 0;
646 646
647 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 647 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
648 inode, ceph_vinop(inode), pos, (unsigned)len, inode); 648 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
649 again: 649 again:
650 __ceph_do_pending_vmtruncate(inode); 650 __ceph_do_pending_vmtruncate(inode);
651 if (fi->fmode & CEPH_FILE_MODE_LAZY) 651 if (fi->fmode & CEPH_FILE_MODE_LAZY)
652 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 652 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
653 else 653 else
654 want = CEPH_CAP_FILE_CACHE; 654 want = CEPH_CAP_FILE_CACHE;
655 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 655 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
656 if (ret < 0) 656 if (ret < 0)
657 goto out; 657 goto out;
658 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 658 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
659 inode, ceph_vinop(inode), pos, (unsigned)len, 659 inode, ceph_vinop(inode), pos, (unsigned)len,
660 ceph_cap_string(got)); 660 ceph_cap_string(got));
661 661
662 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 662 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
663 (iocb->ki_filp->f_flags & O_DIRECT) || 663 (iocb->ki_filp->f_flags & O_DIRECT) ||
664 (inode->i_sb->s_flags & MS_SYNCHRONOUS) || 664 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
665 (fi->flags & CEPH_F_SYNC)) 665 (fi->flags & CEPH_F_SYNC))
666 /* hmm, this isn't really async... */ 666 /* hmm, this isn't really async... */
667 ret = ceph_sync_read(filp, base, len, ppos, &checkeof); 667 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
668 else 668 else
669 ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 669 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
670 670
671 out: 671 out:
672 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 672 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
673 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 673 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
674 ceph_put_cap_refs(ci, got); 674 ceph_put_cap_refs(ci, got);
675 675
676 if (checkeof && ret >= 0) { 676 if (checkeof && ret >= 0) {
677 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 677 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
678 678
679 /* hit EOF or hole? */ 679 /* hit EOF or hole? */
680 if (statret == 0 && *ppos < inode->i_size) { 680 if (statret == 0 && *ppos < inode->i_size) {
681 dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); 681 dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
682 read += ret; 682 read += ret;
683 base += ret; 683 base += ret;
684 len -= ret; 684 len -= ret;
685 checkeof = 0; 685 checkeof = 0;
686 goto again; 686 goto again;
687 } 687 }
688 } 688 }
689 if (ret >= 0) 689 if (ret >= 0)
690 ret += read; 690 ret += read;
691 691
692 return ret; 692 return ret;
693 } 693 }
694 694
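The checkeof/goto-again dance above re-reads after a getattr when a short read landed inside the real i_size (a hole, or a locally stale size). It is loosely analogous to the user-space rule of not treating a single short read(2) as EOF; a hedged sketch:

#include <unistd.h>

/* Keep issuing read(2) until the buffer is full or a zero return
 * confirms EOF; a short positive return just means "try again". */
static ssize_t read_full(int fd, char *buf, size_t len)
{
	size_t done = 0;
	while (done < len) {
		ssize_t n = read(fd, buf + done, len - done);
		if (n < 0)
			return n;	/* error, errno set */
		if (n == 0)
			break;		/* EOF */
		done += n;
	}
	return (ssize_t)done;
}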
695 /* 695 /*
696 * Take cap references to avoid releasing caps to MDS mid-write. 696 * Take cap references to avoid releasing caps to MDS mid-write.
697 * 697 *
698 * If we are synchronous, and write with an old snap context, the OSD 698 * If we are synchronous, and write with an old snap context, the OSD
699 * may return EOLDSNAPC. In that case, retry the write, _after_ 699 * may return EOLDSNAPC. In that case, retry the write, _after_
700 * dropping our cap refs and allowing the pending snap to logically 700 * dropping our cap refs and allowing the pending snap to logically
701 * complete _before_ this write occurs. 701 * complete _before_ this write occurs.
702 * 702 *
703 * If we are near ENOSPC, write synchronously. 703 * If we are near ENOSPC, write synchronously.
704 */ 704 */
705 static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, 705 static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
706 unsigned long nr_segs, loff_t pos) 706 unsigned long nr_segs, loff_t pos)
707 { 707 {
708 struct file *file = iocb->ki_filp; 708 struct file *file = iocb->ki_filp;
709 struct ceph_file_info *fi = file->private_data; 709 struct ceph_file_info *fi = file->private_data;
710 struct inode *inode = file->f_dentry->d_inode; 710 struct inode *inode = file->f_dentry->d_inode;
711 struct ceph_inode_info *ci = ceph_inode(inode); 711 struct ceph_inode_info *ci = ceph_inode(inode);
712 struct ceph_osd_client *osdc = 712 struct ceph_osd_client *osdc =
713 &ceph_sb_to_client(inode->i_sb)->client->osdc; 713 &ceph_sb_to_client(inode->i_sb)->client->osdc;
714 loff_t endoff = pos + iov->iov_len; 714 loff_t endoff = pos + iov->iov_len;
715 int want, got = 0; 715 int want, got = 0;
716 int ret, err; 716 int ret, err;
717 717
718 if (ceph_snap(inode) != CEPH_NOSNAP) 718 if (ceph_snap(inode) != CEPH_NOSNAP)
719 return -EROFS; 719 return -EROFS;
720 720
721 retry_snap: 721 retry_snap:
722 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 722 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
723 return -ENOSPC; 723 return -ENOSPC;
724 __ceph_do_pending_vmtruncate(inode); 724 __ceph_do_pending_vmtruncate(inode);
725 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 725 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
726 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 726 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
727 inode->i_size); 727 inode->i_size);
728 if (fi->fmode & CEPH_FILE_MODE_LAZY) 728 if (fi->fmode & CEPH_FILE_MODE_LAZY)
729 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 729 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
730 else 730 else
731 want = CEPH_CAP_FILE_BUFFER; 731 want = CEPH_CAP_FILE_BUFFER;
732 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); 732 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
733 if (ret < 0) 733 if (ret < 0)
734 goto out_put; 734 goto out_put;
735 735
736 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", 736 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
737 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 737 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
738 ceph_cap_string(got)); 738 ceph_cap_string(got));
739 739
740 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 740 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
741 (iocb->ki_filp->f_flags & O_DIRECT) || 741 (iocb->ki_filp->f_flags & O_DIRECT) ||
742 (inode->i_sb->s_flags & MS_SYNCHRONOUS) || 742 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
743 (fi->flags & CEPH_F_SYNC)) { 743 (fi->flags & CEPH_F_SYNC)) {
744 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, 744 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
745 &iocb->ki_pos); 745 &iocb->ki_pos);
746 } else { 746 } else {
747 /* 747 /*
748 * buffered write; drop Fw early to avoid slow 748 * buffered write; drop Fw early to avoid slow
749 * revocation if we get stuck on balance_dirty_pages 749 * revocation if we get stuck on balance_dirty_pages
750 */ 750 */
751 int dirty; 751 int dirty;
752 752
753 spin_lock(&ci->i_ceph_lock); 753 spin_lock(&ci->i_ceph_lock);
754 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 754 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
755 spin_unlock(&ci->i_ceph_lock); 755 spin_unlock(&ci->i_ceph_lock);
756 ceph_put_cap_refs(ci, got); 756 ceph_put_cap_refs(ci, got);
757 757
758 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 758 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
759 if ((ret >= 0 || ret == -EIOCBQUEUED) && 759 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
760 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 760 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
761 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 761 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
762 err = vfs_fsync_range(file, pos, pos + ret - 1, 1); 762 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
763 if (err < 0) 763 if (err < 0)
764 ret = err; 764 ret = err;
765 } 765 }
766 766
767 if (dirty) 767 if (dirty)
768 __mark_inode_dirty(inode, dirty); 768 __mark_inode_dirty(inode, dirty);
769 goto out; 769 goto out;
770 } 770 }
771 771
772 if (ret >= 0) { 772 if (ret >= 0) {
773 int dirty; 773 int dirty;
774 spin_lock(&ci->i_ceph_lock); 774 spin_lock(&ci->i_ceph_lock);
775 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 775 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
776 spin_unlock(&ci->i_ceph_lock); 776 spin_unlock(&ci->i_ceph_lock);
777 if (dirty) 777 if (dirty)
778 __mark_inode_dirty(inode, dirty); 778 __mark_inode_dirty(inode, dirty);
779 } 779 }
780 780
781 out_put: 781 out_put:
782 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 782 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
783 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 783 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
784 ceph_cap_string(got)); 784 ceph_cap_string(got));
785 ceph_put_cap_refs(ci, got); 785 ceph_put_cap_refs(ci, got);
786 786
787 out: 787 out:
788 if (ret == -EOLDSNAPC) { 788 if (ret == -EOLDSNAPC) {
789 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", 789 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
790 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); 790 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
791 goto retry_snap; 791 goto retry_snap;
792 } 792 }
793 793
794 return ret; 794 return ret;
795 } 795 }
796 796
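When the buffered path above routes O_SYNC (or near-full) writes through vfs_fsync_range(), it is supplying the same guarantee a user-space program requests explicitly; a hedged sketch:

#include <unistd.h>

/* write(2) alone only reaches the page cache; fsync(2) is what turns
 * the data into an on-disk (here: on-OSD) guarantee. */
static int write_durably(int fd, const void *buf, size_t len)
{
	ssize_t n = write(fd, buf, len);
	if (n < 0 || (size_t)n != len)
		return -1;
	return fsync(fd);
}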
797 /* 797 /*
798 * llseek. be sure to verify file size on SEEK_END. 798 * llseek. be sure to verify file size on SEEK_END.
799 */ 799 */
800 static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) 800 static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
801 { 801 {
802 struct inode *inode = file->f_mapping->host; 802 struct inode *inode = file->f_mapping->host;
803 int ret; 803 int ret;
804 804
805 mutex_lock(&inode->i_mutex); 805 mutex_lock(&inode->i_mutex);
806 __ceph_do_pending_vmtruncate(inode); 806 __ceph_do_pending_vmtruncate(inode);
807 807
808 if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { 808 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
810 if (ret < 0) { 810 if (ret < 0) {
811 offset = ret; 811 offset = ret;
812 goto out; 812 goto out;
813 } 813 }
814 } 814 }
815 815
816 switch (origin) { 816 switch (whence) {
817 case SEEK_END: 817 case SEEK_END:
818 offset += inode->i_size; 818 offset += inode->i_size;
819 break; 819 break;
820 case SEEK_CUR: 820 case SEEK_CUR:
821 /* 821 /*
822 * Here we special-case the lseek(fd, 0, SEEK_CUR) 822 * Here we special-case the lseek(fd, 0, SEEK_CUR)
823 * position-querying operation. Avoid rewriting the "same" 823 * position-querying operation. Avoid rewriting the "same"
824 * f_pos value back to the file because a concurrent read(), 824 * f_pos value back to the file because a concurrent read(),
825 * write() or lseek() might have altered it 825 * write() or lseek() might have altered it
826 */ 826 */
827 if (offset == 0) { 827 if (offset == 0) {
828 offset = file->f_pos; 828 offset = file->f_pos;
829 goto out; 829 goto out;
830 } 830 }
831 offset += file->f_pos; 831 offset += file->f_pos;
832 break; 832 break;
833 case SEEK_DATA: 833 case SEEK_DATA:
834 if (offset >= inode->i_size) { 834 if (offset >= inode->i_size) {
835 ret = -ENXIO; 835 ret = -ENXIO;
836 goto out; 836 goto out;
837 } 837 }
838 break; 838 break;
839 case SEEK_HOLE: 839 case SEEK_HOLE:
840 if (offset >= inode->i_size) { 840 if (offset >= inode->i_size) {
841 ret = -ENXIO; 841 ret = -ENXIO;
842 goto out; 842 goto out;
843 } 843 }
844 offset = inode->i_size; 844 offset = inode->i_size;
845 break; 845 break;
846 } 846 }
847 847
848 if (offset < 0 || offset > inode->i_sb->s_maxbytes) { 848 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
849 offset = -EINVAL; 849 offset = -EINVAL;
850 goto out; 850 goto out;
851 } 851 }
852 852
853 /* Special lock needed here? */ 853 /* Special lock needed here? */
854 if (offset != file->f_pos) { 854 if (offset != file->f_pos) {
855 file->f_pos = offset; 855 file->f_pos = offset;
856 file->f_version = 0; 856 file->f_version = 0;
857 } 857 }
858 858
859 out: 859 out:
860 mutex_unlock(&inode->i_mutex); 860 mutex_unlock(&inode->i_mutex);
861 return offset; 861 return offset;
862 } 862 }
863 863
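The zero-offset SEEK_CUR fast path above exists because lseek(fd, 0, SEEK_CUR) is the conventional way to query the current file position without moving it. A hedged user-space sketch (the path is hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/example", O_RDONLY);
	if (fd < 0)
		return 1;
	off_t pos = lseek(fd, 0, SEEK_CUR);	/* query position; no movement */
	off_t end = lseek(fd, 0, SEEK_END);	/* seeking to EOF returns the size */
	printf("pos=%lld size=%lld\n", (long long)pos, (long long)end);
	close(fd);
	return 0;
}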
864 const struct file_operations ceph_file_fops = { 864 const struct file_operations ceph_file_fops = {
865 .open = ceph_open, 865 .open = ceph_open,
866 .release = ceph_release, 866 .release = ceph_release,
867 .llseek = ceph_llseek, 867 .llseek = ceph_llseek,
868 .read = do_sync_read, 868 .read = do_sync_read,
869 .write = do_sync_write, 869 .write = do_sync_write,
870 .aio_read = ceph_aio_read, 870 .aio_read = ceph_aio_read,
871 .aio_write = ceph_aio_write, 871 .aio_write = ceph_aio_write,
872 .mmap = ceph_mmap, 872 .mmap = ceph_mmap,
873 .fsync = ceph_fsync, 873 .fsync = ceph_fsync,
874 .lock = ceph_lock, 874 .lock = ceph_lock,
875 .flock = ceph_flock, 875 .flock = ceph_flock,
876 .splice_read = generic_file_splice_read, 876 .splice_read = generic_file_splice_read,
877 .splice_write = generic_file_splice_write, 877 .splice_write = generic_file_splice_write,
878 .unlocked_ioctl = ceph_ioctl, 878 .unlocked_ioctl = ceph_ioctl,
879 .compat_ioctl = ceph_ioctl, 879 .compat_ioctl = ceph_ioctl,
880 }; 880 };
881 881
882 882
fs/cifs/cifsfs.c
1 /* 1 /*
2 * fs/cifs/cifsfs.c 2 * fs/cifs/cifsfs.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2008
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Common Internet FileSystem (CIFS) client 7 * Common Internet FileSystem (CIFS) client
8 * 8 *
9 * This library is free software; you can redistribute it and/or modify 9 * This library is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as published 10 * it under the terms of the GNU Lesser General Public License as published
11 * by the Free Software Foundation; either version 2.1 of the License, or 11 * by the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version. 12 * (at your option) any later version.
13 * 13 *
14 * This library is distributed in the hope that it will be useful, 14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17 * the GNU Lesser General Public License for more details. 17 * the GNU Lesser General Public License for more details.
18 * 18 *
19 * You should have received a copy of the GNU Lesser General Public License 19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this library; if not, write to the Free Software 20 * along with this library; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */ 22 */
23 23
24 /* Note that BB means BUGBUG (i.e., something to fix eventually) */ 24 /* Note that BB means BUGBUG (i.e., something to fix eventually) */
25 25
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/mount.h> 28 #include <linux/mount.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/init.h> 30 #include <linux/init.h>
31 #include <linux/list.h> 31 #include <linux/list.h>
32 #include <linux/seq_file.h> 32 #include <linux/seq_file.h>
33 #include <linux/vfs.h> 33 #include <linux/vfs.h>
34 #include <linux/mempool.h> 34 #include <linux/mempool.h>
35 #include <linux/delay.h> 35 #include <linux/delay.h>
36 #include <linux/kthread.h> 36 #include <linux/kthread.h>
37 #include <linux/freezer.h> 37 #include <linux/freezer.h>
38 #include <linux/namei.h> 38 #include <linux/namei.h>
39 #include <linux/random.h> 39 #include <linux/random.h>
40 #include <net/ipv6.h> 40 #include <net/ipv6.h>
41 #include "cifsfs.h" 41 #include "cifsfs.h"
42 #include "cifspdu.h" 42 #include "cifspdu.h"
43 #define DECLARE_GLOBALS_HERE 43 #define DECLARE_GLOBALS_HERE
44 #include "cifsglob.h" 44 #include "cifsglob.h"
45 #include "cifsproto.h" 45 #include "cifsproto.h"
46 #include "cifs_debug.h" 46 #include "cifs_debug.h"
47 #include "cifs_fs_sb.h" 47 #include "cifs_fs_sb.h"
48 #include <linux/mm.h> 48 #include <linux/mm.h>
49 #include <linux/key-type.h> 49 #include <linux/key-type.h>
50 #include "cifs_spnego.h" 50 #include "cifs_spnego.h"
51 #include "fscache.h" 51 #include "fscache.h"
52 #ifdef CONFIG_CIFS_SMB2 52 #ifdef CONFIG_CIFS_SMB2
53 #include "smb2pdu.h" 53 #include "smb2pdu.h"
54 #endif 54 #endif
55 55
56 int cifsFYI = 0; 56 int cifsFYI = 0;
57 int cifsERROR = 1; 57 int cifsERROR = 1;
58 int traceSMB = 0; 58 int traceSMB = 0;
59 bool enable_oplocks = true; 59 bool enable_oplocks = true;
60 unsigned int linuxExtEnabled = 1; 60 unsigned int linuxExtEnabled = 1;
61 unsigned int lookupCacheEnabled = 1; 61 unsigned int lookupCacheEnabled = 1;
62 unsigned int global_secflags = CIFSSEC_DEF; 62 unsigned int global_secflags = CIFSSEC_DEF;
63 /* unsigned int ntlmv2_support = 0; */ 63 /* unsigned int ntlmv2_support = 0; */
64 unsigned int sign_CIFS_PDUs = 1; 64 unsigned int sign_CIFS_PDUs = 1;
65 static const struct super_operations cifs_super_ops; 65 static const struct super_operations cifs_super_ops;
66 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 66 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
67 module_param(CIFSMaxBufSize, uint, 0); 67 module_param(CIFSMaxBufSize, uint, 0);
68 MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " 68 MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
69 "Default: 16384 Range: 8192 to 130048"); 69 "Default: 16384 Range: 8192 to 130048");
70 unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; 70 unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
71 module_param(cifs_min_rcv, uint, 0); 71 module_param(cifs_min_rcv, uint, 0);
72 MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " 72 MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
73 "1 to 64"); 73 "1 to 64");
74 unsigned int cifs_min_small = 30; 74 unsigned int cifs_min_small = 30;
75 module_param(cifs_min_small, uint, 0); 75 module_param(cifs_min_small, uint, 0);
76 MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " 76 MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
77 "Range: 2 to 256"); 77 "Range: 2 to 256");
78 unsigned int cifs_max_pending = CIFS_MAX_REQ; 78 unsigned int cifs_max_pending = CIFS_MAX_REQ;
79 module_param(cifs_max_pending, uint, 0444); 79 module_param(cifs_max_pending, uint, 0444);
80 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 80 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
81 "Default: 32767 Range: 2 to 32767."); 81 "Default: 32767 Range: 2 to 32767.");
82 module_param(enable_oplocks, bool, 0644); 82 module_param(enable_oplocks, bool, 0644);
83 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); 83 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
84 84
85 extern mempool_t *cifs_sm_req_poolp; 85 extern mempool_t *cifs_sm_req_poolp;
86 extern mempool_t *cifs_req_poolp; 86 extern mempool_t *cifs_req_poolp;
87 extern mempool_t *cifs_mid_poolp; 87 extern mempool_t *cifs_mid_poolp;
88 88
89 struct workqueue_struct *cifsiod_wq; 89 struct workqueue_struct *cifsiod_wq;
90 90
91 #ifdef CONFIG_CIFS_SMB2 91 #ifdef CONFIG_CIFS_SMB2
92 __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; 92 __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
93 #endif 93 #endif
94 94
95 static int 95 static int
96 cifs_read_super(struct super_block *sb) 96 cifs_read_super(struct super_block *sb)
97 { 97 {
98 struct inode *inode; 98 struct inode *inode;
99 struct cifs_sb_info *cifs_sb; 99 struct cifs_sb_info *cifs_sb;
100 int rc = 0; 100 int rc = 0;
101 101
102 cifs_sb = CIFS_SB(sb); 102 cifs_sb = CIFS_SB(sb);
103 103
104 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) 104 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
105 sb->s_flags |= MS_POSIXACL; 105 sb->s_flags |= MS_POSIXACL;
106 106
107 if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) 107 if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES)
108 sb->s_maxbytes = MAX_LFS_FILESIZE; 108 sb->s_maxbytes = MAX_LFS_FILESIZE;
109 else 109 else
110 sb->s_maxbytes = MAX_NON_LFS; 110 sb->s_maxbytes = MAX_NON_LFS;
111 111
112 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 112 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
113 sb->s_time_gran = 100; 113 sb->s_time_gran = 100;
114 114
115 sb->s_magic = CIFS_MAGIC_NUMBER; 115 sb->s_magic = CIFS_MAGIC_NUMBER;
116 sb->s_op = &cifs_super_ops; 116 sb->s_op = &cifs_super_ops;
117 sb->s_bdi = &cifs_sb->bdi; 117 sb->s_bdi = &cifs_sb->bdi;
118 sb->s_blocksize = CIFS_MAX_MSGSIZE; 118 sb->s_blocksize = CIFS_MAX_MSGSIZE;
119 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 119 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
120 inode = cifs_root_iget(sb); 120 inode = cifs_root_iget(sb);
121 121
122 if (IS_ERR(inode)) { 122 if (IS_ERR(inode)) {
123 rc = PTR_ERR(inode); 123 rc = PTR_ERR(inode);
124 goto out_no_root; 124 goto out_no_root;
125 } 125 }
126 126
127 sb->s_root = d_make_root(inode); 127 sb->s_root = d_make_root(inode);
128 if (!sb->s_root) { 128 if (!sb->s_root) {
129 rc = -ENOMEM; 129 rc = -ENOMEM;
130 goto out_no_root; 130 goto out_no_root;
131 } 131 }
132 132
133 /* do that *after* d_make_root() - we want NULL ->d_op for root here */ 133 /* do that *after* d_make_root() - we want NULL ->d_op for root here */
134 if (cifs_sb_master_tcon(cifs_sb)->nocase) 134 if (cifs_sb_master_tcon(cifs_sb)->nocase)
135 sb->s_d_op = &cifs_ci_dentry_ops; 135 sb->s_d_op = &cifs_ci_dentry_ops;
136 else 136 else
137 sb->s_d_op = &cifs_dentry_ops; 137 sb->s_d_op = &cifs_dentry_ops;
138 138
139 #ifdef CONFIG_CIFS_NFSD_EXPORT 139 #ifdef CONFIG_CIFS_NFSD_EXPORT
140 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 140 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
141 cFYI(1, "export ops supported"); 141 cFYI(1, "export ops supported");
142 sb->s_export_op = &cifs_export_ops; 142 sb->s_export_op = &cifs_export_ops;
143 } 143 }
144 #endif /* CONFIG_CIFS_NFSD_EXPORT */ 144 #endif /* CONFIG_CIFS_NFSD_EXPORT */
145 145
146 return 0; 146 return 0;
147 147
148 out_no_root: 148 out_no_root:
149 cERROR(1, "cifs_read_super: get root inode failed"); 149 cERROR(1, "cifs_read_super: get root inode failed");
150 return rc; 150 return rc;
151 } 151 }
152 152
153 static void cifs_kill_sb(struct super_block *sb) 153 static void cifs_kill_sb(struct super_block *sb)
154 { 154 {
155 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 155 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
156 kill_anon_super(sb); 156 kill_anon_super(sb);
157 cifs_umount(cifs_sb); 157 cifs_umount(cifs_sb);
158 } 158 }
159 159
160 static int 160 static int
161 cifs_statfs(struct dentry *dentry, struct kstatfs *buf) 161 cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
162 { 162 {
163 struct super_block *sb = dentry->d_sb; 163 struct super_block *sb = dentry->d_sb;
164 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 164 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
165 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); 165 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
166 struct TCP_Server_Info *server = tcon->ses->server; 166 struct TCP_Server_Info *server = tcon->ses->server;
167 unsigned int xid; 167 unsigned int xid;
168 int rc = 0; 168 int rc = 0;
169 169
170 xid = get_xid(); 170 xid = get_xid();
171 171
172 /* 172 /*
173 * PATH_MAX may be too long - it would presumably be total path, 173 * PATH_MAX may be too long - it would presumably be total path,
174 * but note that some servers (including Samba 3) have a shorter 174 * but note that some servers (including Samba 3) have a shorter
175 * maximum path. 175 * maximum path.
176 * 176 *
177 * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO. 177 * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO.
178 */ 178 */
179 buf->f_namelen = PATH_MAX; 179 buf->f_namelen = PATH_MAX;
180 buf->f_files = 0; /* undefined */ 180 buf->f_files = 0; /* undefined */
181 buf->f_ffree = 0; /* unlimited */ 181 buf->f_ffree = 0; /* unlimited */
182 182
183 if (server->ops->queryfs) 183 if (server->ops->queryfs)
184 rc = server->ops->queryfs(xid, tcon, buf); 184 rc = server->ops->queryfs(xid, tcon, buf);
185 185
186 free_xid(xid); 186 free_xid(xid);
187 return 0; 187 return 0;
188 } 188 }
189 189
190 static int cifs_permission(struct inode *inode, int mask) 190 static int cifs_permission(struct inode *inode, int mask)
191 { 191 {
192 struct cifs_sb_info *cifs_sb; 192 struct cifs_sb_info *cifs_sb;
193 193
194 cifs_sb = CIFS_SB(inode->i_sb); 194 cifs_sb = CIFS_SB(inode->i_sb);
195 195
196 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { 196 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
197 if ((mask & MAY_EXEC) && !execute_ok(inode)) 197 if ((mask & MAY_EXEC) && !execute_ok(inode))
198 return -EACCES; 198 return -EACCES;
199 else 199 else
200 return 0; 200 return 0;
201 } else /* file mode might have been restricted at mount time 201 } else /* file mode might have been restricted at mount time
202 on the client (above and beyond ACL on servers) for 202 on the client (above and beyond ACL on servers) for
203 servers which do not support setting and viewing mode bits, 203 servers which do not support setting and viewing mode bits,
204 so allowing the client to check permissions is useful */ 204 so allowing the client to check permissions is useful */
205 return generic_permission(inode, mask); 205 return generic_permission(inode, mask);
206 } 206 }
207 207
208 static struct kmem_cache *cifs_inode_cachep; 208 static struct kmem_cache *cifs_inode_cachep;
209 static struct kmem_cache *cifs_req_cachep; 209 static struct kmem_cache *cifs_req_cachep;
210 static struct kmem_cache *cifs_mid_cachep; 210 static struct kmem_cache *cifs_mid_cachep;
211 static struct kmem_cache *cifs_sm_req_cachep; 211 static struct kmem_cache *cifs_sm_req_cachep;
212 mempool_t *cifs_sm_req_poolp; 212 mempool_t *cifs_sm_req_poolp;
213 mempool_t *cifs_req_poolp; 213 mempool_t *cifs_req_poolp;
214 mempool_t *cifs_mid_poolp; 214 mempool_t *cifs_mid_poolp;
215 215
216 static struct inode * 216 static struct inode *
217 cifs_alloc_inode(struct super_block *sb) 217 cifs_alloc_inode(struct super_block *sb)
218 { 218 {
219 struct cifsInodeInfo *cifs_inode; 219 struct cifsInodeInfo *cifs_inode;
220 cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL); 220 cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL);
221 if (!cifs_inode) 221 if (!cifs_inode)
222 return NULL; 222 return NULL;
223 cifs_inode->cifsAttrs = 0x20; /* default */ 223 cifs_inode->cifsAttrs = 0x20; /* default */
224 cifs_inode->time = 0; 224 cifs_inode->time = 0;
225 /* 225 /*
226 * Until the file is open and we have gotten oplock info back from the 226 * Until the file is open and we have gotten oplock info back from the
227 * server, can not assume caching of file data or metadata. 227 * server, can not assume caching of file data or metadata.
228 */ 228 */
229 cifs_set_oplock_level(cifs_inode, 0); 229 cifs_set_oplock_level(cifs_inode, 0);
230 cifs_inode->delete_pending = false; 230 cifs_inode->delete_pending = false;
231 cifs_inode->invalid_mapping = false; 231 cifs_inode->invalid_mapping = false;
232 cifs_inode->leave_pages_clean = false; 232 cifs_inode->leave_pages_clean = false;
233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
234 cifs_inode->server_eof = 0; 234 cifs_inode->server_eof = 0;
235 cifs_inode->uniqueid = 0; 235 cifs_inode->uniqueid = 0;
236 cifs_inode->createtime = 0; 236 cifs_inode->createtime = 0;
237 #ifdef CONFIG_CIFS_SMB2 237 #ifdef CONFIG_CIFS_SMB2
238 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); 238 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE);
239 #endif 239 #endif
240 /* 240 /*
241 * Can not set i_flags here - they get immediately overwritten to zero 241 * Can not set i_flags here - they get immediately overwritten to zero
242 * by the VFS. 242 * by the VFS.
243 */ 243 */
244 /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ 244 /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
245 INIT_LIST_HEAD(&cifs_inode->openFileList); 245 INIT_LIST_HEAD(&cifs_inode->openFileList);
246 INIT_LIST_HEAD(&cifs_inode->llist); 246 INIT_LIST_HEAD(&cifs_inode->llist);
247 return &cifs_inode->vfs_inode; 247 return &cifs_inode->vfs_inode;
248 } 248 }
249 249
250 static void cifs_i_callback(struct rcu_head *head) 250 static void cifs_i_callback(struct rcu_head *head)
251 { 251 {
252 struct inode *inode = container_of(head, struct inode, i_rcu); 252 struct inode *inode = container_of(head, struct inode, i_rcu);
253 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); 253 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
254 } 254 }
255 255
256 static void 256 static void
257 cifs_destroy_inode(struct inode *inode) 257 cifs_destroy_inode(struct inode *inode)
258 { 258 {
259 call_rcu(&inode->i_rcu, cifs_i_callback); 259 call_rcu(&inode->i_rcu, cifs_i_callback);
260 } 260 }
261 261
262 static void 262 static void
263 cifs_evict_inode(struct inode *inode) 263 cifs_evict_inode(struct inode *inode)
264 { 264 {
265 truncate_inode_pages(&inode->i_data, 0); 265 truncate_inode_pages(&inode->i_data, 0);
266 clear_inode(inode); 266 clear_inode(inode);
267 cifs_fscache_release_inode_cookie(inode); 267 cifs_fscache_release_inode_cookie(inode);
268 } 268 }
269 269
270 static void 270 static void
271 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 271 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
272 { 272 {
273 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; 273 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
274 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; 274 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
275 275
276 seq_printf(s, ",addr="); 276 seq_printf(s, ",addr=");
277 277
278 switch (server->dstaddr.ss_family) { 278 switch (server->dstaddr.ss_family) {
279 case AF_INET: 279 case AF_INET:
280 seq_printf(s, "%pI4", &sa->sin_addr.s_addr); 280 seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
281 break; 281 break;
282 case AF_INET6: 282 case AF_INET6:
283 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr); 283 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
284 if (sa6->sin6_scope_id) 284 if (sa6->sin6_scope_id)
285 seq_printf(s, "%%%u", sa6->sin6_scope_id); 285 seq_printf(s, "%%%u", sa6->sin6_scope_id);
286 break; 286 break;
287 default: 287 default:
288 seq_printf(s, "(unknown)"); 288 seq_printf(s, "(unknown)");
289 } 289 }
290 } 290 }
291 291
292 static void 292 static void
293 cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) 293 cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
294 { 294 {
295 seq_printf(s, ",sec="); 295 seq_printf(s, ",sec=");
296 296
297 switch (server->secType) { 297 switch (server->secType) {
298 case LANMAN: 298 case LANMAN:
299 seq_printf(s, "lanman"); 299 seq_printf(s, "lanman");
300 break; 300 break;
301 case NTLMv2: 301 case NTLMv2:
302 seq_printf(s, "ntlmv2"); 302 seq_printf(s, "ntlmv2");
303 break; 303 break;
304 case NTLM: 304 case NTLM:
305 seq_printf(s, "ntlm"); 305 seq_printf(s, "ntlm");
306 break; 306 break;
307 case Kerberos: 307 case Kerberos:
308 seq_printf(s, "krb5"); 308 seq_printf(s, "krb5");
309 break; 309 break;
310 case RawNTLMSSP: 310 case RawNTLMSSP:
311 seq_printf(s, "ntlmssp"); 311 seq_printf(s, "ntlmssp");
312 break; 312 break;
313 default: 313 default:
314 /* shouldn't ever happen */ 314 /* shouldn't ever happen */
315 seq_printf(s, "unknown"); 315 seq_printf(s, "unknown");
316 break; 316 break;
317 } 317 }
318 318
319 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 319 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
320 seq_printf(s, "i"); 320 seq_printf(s, "i");
321 } 321 }
322 322
323 static void 323 static void
324 cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) 324 cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
325 { 325 {
326 seq_printf(s, ",cache="); 326 seq_printf(s, ",cache=");
327 327
328 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) 328 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
329 seq_printf(s, "strict"); 329 seq_printf(s, "strict");
330 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 330 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
331 seq_printf(s, "none"); 331 seq_printf(s, "none");
332 else 332 else
333 seq_printf(s, "loose"); 333 seq_printf(s, "loose");
334 } 334 }
335 335
336 /* 336 /*
337 * cifs_show_options() is for displaying mount options in /proc/mounts. 337 * cifs_show_options() is for displaying mount options in /proc/mounts.
338 * Not all settable options are displayed but most of the important 338 * Not all settable options are displayed but most of the important
339 * ones are. 339 * ones are.
340 */ 340 */
341 static int 341 static int
342 cifs_show_options(struct seq_file *s, struct dentry *root) 342 cifs_show_options(struct seq_file *s, struct dentry *root)
343 { 343 {
344 struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); 344 struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
345 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); 345 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
346 struct sockaddr *srcaddr; 346 struct sockaddr *srcaddr;
347 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; 347 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
348 348
349 seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); 349 seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
350 cifs_show_security(s, tcon->ses->server); 350 cifs_show_security(s, tcon->ses->server);
351 cifs_show_cache_flavor(s, cifs_sb); 351 cifs_show_cache_flavor(s, cifs_sb);
352 352
353 seq_printf(s, ",unc="); 353 seq_printf(s, ",unc=");
354 seq_escape(s, tcon->treeName, " \t\n\\"); 354 seq_escape(s, tcon->treeName, " \t\n\\");
355 355
356 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 356 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
357 seq_printf(s, ",multiuser"); 357 seq_printf(s, ",multiuser");
358 else if (tcon->ses->user_name) 358 else if (tcon->ses->user_name)
359 seq_printf(s, ",username=%s", tcon->ses->user_name); 359 seq_printf(s, ",username=%s", tcon->ses->user_name);
360 360
361 if (tcon->ses->domainName) 361 if (tcon->ses->domainName)
362 seq_printf(s, ",domain=%s", tcon->ses->domainName); 362 seq_printf(s, ",domain=%s", tcon->ses->domainName);
363 363
364 if (srcaddr->sa_family != AF_UNSPEC) { 364 if (srcaddr->sa_family != AF_UNSPEC) {
365 struct sockaddr_in *saddr4; 365 struct sockaddr_in *saddr4;
366 struct sockaddr_in6 *saddr6; 366 struct sockaddr_in6 *saddr6;
367 saddr4 = (struct sockaddr_in *)srcaddr; 367 saddr4 = (struct sockaddr_in *)srcaddr;
368 saddr6 = (struct sockaddr_in6 *)srcaddr; 368 saddr6 = (struct sockaddr_in6 *)srcaddr;
369 if (srcaddr->sa_family == AF_INET6) 369 if (srcaddr->sa_family == AF_INET6)
370 seq_printf(s, ",srcaddr=%pI6c", 370 seq_printf(s, ",srcaddr=%pI6c",
371 &saddr6->sin6_addr); 371 &saddr6->sin6_addr);
372 else if (srcaddr->sa_family == AF_INET) 372 else if (srcaddr->sa_family == AF_INET)
373 seq_printf(s, ",srcaddr=%pI4", 373 seq_printf(s, ",srcaddr=%pI4",
374 &saddr4->sin_addr.s_addr); 374 &saddr4->sin_addr.s_addr);
375 else 375 else
376 seq_printf(s, ",srcaddr=BAD-AF:%i", 376 seq_printf(s, ",srcaddr=BAD-AF:%i",
377 (int)(srcaddr->sa_family)); 377 (int)(srcaddr->sa_family));
378 } 378 }
379 379
380 seq_printf(s, ",uid=%u", cifs_sb->mnt_uid); 380 seq_printf(s, ",uid=%u", cifs_sb->mnt_uid);
381 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 381 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
382 seq_printf(s, ",forceuid"); 382 seq_printf(s, ",forceuid");
383 else 383 else
384 seq_printf(s, ",noforceuid"); 384 seq_printf(s, ",noforceuid");
385 385
386 seq_printf(s, ",gid=%u", cifs_sb->mnt_gid); 386 seq_printf(s, ",gid=%u", cifs_sb->mnt_gid);
387 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) 387 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
388 seq_printf(s, ",forcegid"); 388 seq_printf(s, ",forcegid");
389 else 389 else
390 seq_printf(s, ",noforcegid"); 390 seq_printf(s, ",noforcegid");
391 391
392 cifs_show_address(s, tcon->ses->server); 392 cifs_show_address(s, tcon->ses->server);
393 393
394 if (!tcon->unix_ext) 394 if (!tcon->unix_ext)
395 seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho", 395 seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho",
396 cifs_sb->mnt_file_mode, 396 cifs_sb->mnt_file_mode,
397 cifs_sb->mnt_dir_mode); 397 cifs_sb->mnt_dir_mode);
398 if (tcon->seal) 398 if (tcon->seal)
399 seq_printf(s, ",seal"); 399 seq_printf(s, ",seal");
400 if (tcon->nocase) 400 if (tcon->nocase)
401 seq_printf(s, ",nocase"); 401 seq_printf(s, ",nocase");
402 if (tcon->retry) 402 if (tcon->retry)
403 seq_printf(s, ",hard"); 403 seq_printf(s, ",hard");
404 if (tcon->unix_ext) 404 if (tcon->unix_ext)
405 seq_printf(s, ",unix"); 405 seq_printf(s, ",unix");
406 else 406 else
407 seq_printf(s, ",nounix"); 407 seq_printf(s, ",nounix");
408 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) 408 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
409 seq_printf(s, ",posixpaths"); 409 seq_printf(s, ",posixpaths");
410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) 410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
411 seq_printf(s, ",setuids"); 411 seq_printf(s, ",setuids");
412 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) 412 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
413 seq_printf(s, ",serverino"); 413 seq_printf(s, ",serverino");
414 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 414 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
415 seq_printf(s, ",rwpidforward"); 415 seq_printf(s, ",rwpidforward");
416 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) 416 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
417 seq_printf(s, ",forcemand"); 417 seq_printf(s, ",forcemand");
418 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 418 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
419 seq_printf(s, ",nouser_xattr"); 419 seq_printf(s, ",nouser_xattr");
420 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) 420 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
421 seq_printf(s, ",mapchars"); 421 seq_printf(s, ",mapchars");
422 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 422 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
423 seq_printf(s, ",sfu"); 423 seq_printf(s, ",sfu");
424 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 424 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
425 seq_printf(s, ",nobrl"); 425 seq_printf(s, ",nobrl");
426 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 426 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
427 seq_printf(s, ",cifsacl"); 427 seq_printf(s, ",cifsacl");
428 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) 428 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
429 seq_printf(s, ",dynperm"); 429 seq_printf(s, ",dynperm");
430 if (root->d_sb->s_flags & MS_POSIXACL) 430 if (root->d_sb->s_flags & MS_POSIXACL)
431 seq_printf(s, ",acl"); 431 seq_printf(s, ",acl");
432 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 432 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
433 seq_printf(s, ",mfsymlinks"); 433 seq_printf(s, ",mfsymlinks");
434 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) 434 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
435 seq_printf(s, ",fsc"); 435 seq_printf(s, ",fsc");
436 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) 436 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
437 seq_printf(s, ",nostrictsync"); 437 seq_printf(s, ",nostrictsync");
438 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) 438 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
439 seq_printf(s, ",noperm"); 439 seq_printf(s, ",noperm");
440 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) 440 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
441 seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid); 441 seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid);
442 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) 442 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
443 seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid); 443 seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid);
444 444
445 seq_printf(s, ",rsize=%u", cifs_sb->rsize); 445 seq_printf(s, ",rsize=%u", cifs_sb->rsize);
446 seq_printf(s, ",wsize=%u", cifs_sb->wsize); 446 seq_printf(s, ",wsize=%u", cifs_sb->wsize);
447 /* convert actimeo and display it in seconds */ 447 /* convert actimeo and display it in seconds */
448 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); 448 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
449 449
450 return 0; 450 return 0;
451 } 451 }
452 452
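Put together, a cifs entry in /proc/mounts produced by cifs_show_options() might look roughly like the following (a hypothetical, abridged example; note that seq_escape() above renders each backslash in unc= as the octal escape \134):

//server/share /mnt/cifs cifs rw,vers=1.0,sec=ntlmssp,cache=strict,unc=\134\134server\134share,username=guest,uid=0,noforceuid,gid=0,noforcegid,addr=192.168.0.10,file_mode=0644,dir_mode=0755,nounix,serverino,rsize=61440,wsize=65536,actimeo=1 0 0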
453 static void cifs_umount_begin(struct super_block *sb) 453 static void cifs_umount_begin(struct super_block *sb)
454 { 454 {
455 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 455 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
456 struct cifs_tcon *tcon; 456 struct cifs_tcon *tcon;
457 457
458 if (cifs_sb == NULL) 458 if (cifs_sb == NULL)
459 return; 459 return;
460 460
461 tcon = cifs_sb_master_tcon(cifs_sb); 461 tcon = cifs_sb_master_tcon(cifs_sb);
462 462
463 spin_lock(&cifs_tcp_ses_lock); 463 spin_lock(&cifs_tcp_ses_lock);
464 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { 464 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
465 /* we have other mounts to the same share, or we have 465 /* we have other mounts to the same share, or we have
466 already tried to force this umount and woken up 466 already tried to force this umount and woken up
467 all waiting network requests; nothing to do */ 467 all waiting network requests; nothing to do */
468 spin_unlock(&cifs_tcp_ses_lock); 468 spin_unlock(&cifs_tcp_ses_lock);
469 return; 469 return;
470 } else if (tcon->tc_count == 1) 470 } else if (tcon->tc_count == 1)
471 tcon->tidStatus = CifsExiting; 471 tcon->tidStatus = CifsExiting;
472 spin_unlock(&cifs_tcp_ses_lock); 472 spin_unlock(&cifs_tcp_ses_lock);
473 473
474 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 474 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
475 /* cancel_notify_requests(tcon); */ 475 /* cancel_notify_requests(tcon); */
476 if (tcon->ses && tcon->ses->server) { 476 if (tcon->ses && tcon->ses->server) {
477 cFYI(1, "wake up tasks now - umount begin not complete"); 477 cFYI(1, "wake up tasks now - umount begin not complete");
478 wake_up_all(&tcon->ses->server->request_q); 478 wake_up_all(&tcon->ses->server->request_q);
479 wake_up_all(&tcon->ses->server->response_q); 479 wake_up_all(&tcon->ses->server->response_q);
480 msleep(1); /* yield */ 480 msleep(1); /* yield */
481 /* we have to kick the requests once more */ 481 /* we have to kick the requests once more */
482 wake_up_all(&tcon->ses->server->response_q); 482 wake_up_all(&tcon->ses->server->response_q);
483 msleep(1); 483 msleep(1);
484 } 484 }
485 485
486 return; 486 return;
487 } 487 }
488 488
489 #ifdef CONFIG_CIFS_STATS2 489 #ifdef CONFIG_CIFS_STATS2
490 static int cifs_show_stats(struct seq_file *s, struct dentry *root) 490 static int cifs_show_stats(struct seq_file *s, struct dentry *root)
491 { 491 {
492 /* BB FIXME */ 492 /* BB FIXME */
493 return 0; 493 return 0;
494 } 494 }
495 #endif 495 #endif
496 496
497 static int cifs_remount(struct super_block *sb, int *flags, char *data) 497 static int cifs_remount(struct super_block *sb, int *flags, char *data)
498 { 498 {
499 *flags |= MS_NODIRATIME; 499 *flags |= MS_NODIRATIME;
500 return 0; 500 return 0;
501 } 501 }
502 502
503 static int cifs_drop_inode(struct inode *inode) 503 static int cifs_drop_inode(struct inode *inode)
504 { 504 {
505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
506 506
507 /* no serverino => unconditional eviction */ 507 /* no serverino => unconditional eviction */
508 return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || 508 return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
509 generic_drop_inode(inode); 509 generic_drop_inode(inode);
510 } 510 }
511 511
512 static const struct super_operations cifs_super_ops = { 512 static const struct super_operations cifs_super_ops = {
513 .statfs = cifs_statfs, 513 .statfs = cifs_statfs,
514 .alloc_inode = cifs_alloc_inode, 514 .alloc_inode = cifs_alloc_inode,
515 .destroy_inode = cifs_destroy_inode, 515 .destroy_inode = cifs_destroy_inode,
516 .drop_inode = cifs_drop_inode, 516 .drop_inode = cifs_drop_inode,
517 .evict_inode = cifs_evict_inode, 517 .evict_inode = cifs_evict_inode,
518 /* .delete_inode = cifs_delete_inode, */ /* Not needed unless 518 /* .delete_inode = cifs_delete_inode, */ /* Not needed unless
519 we later add lazy close of inodes, or unless the kernel forgets 519 we later add lazy close of inodes, or unless the kernel forgets
520 to call us with the same number of releases (closes) 520 to call us with the same number of releases (closes)
521 as opens */ 521 as opens */
522 .show_options = cifs_show_options, 522 .show_options = cifs_show_options,
523 .umount_begin = cifs_umount_begin, 523 .umount_begin = cifs_umount_begin,
524 .remount_fs = cifs_remount, 524 .remount_fs = cifs_remount,
525 #ifdef CONFIG_CIFS_STATS2 525 #ifdef CONFIG_CIFS_STATS2
526 .show_stats = cifs_show_stats, 526 .show_stats = cifs_show_stats,
527 #endif 527 #endif
528 }; 528 };
529 529
530 /* 530 /*
531 * Get root dentry from superblock according to prefix path mount option. 531 * Get root dentry from superblock according to prefix path mount option.
532 * Return dentry with refcount + 1 on success and an ERR_PTR otherwise. 532 * Return dentry with refcount + 1 on success and an ERR_PTR otherwise.
533 */ 533 */
534 static struct dentry * 534 static struct dentry *
535 cifs_get_root(struct smb_vol *vol, struct super_block *sb) 535 cifs_get_root(struct smb_vol *vol, struct super_block *sb)
536 { 536 {
537 struct dentry *dentry; 537 struct dentry *dentry;
538 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 538 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
539 char *full_path = NULL; 539 char *full_path = NULL;
540 char *s, *p; 540 char *s, *p;
541 char sep; 541 char sep;
542 542
543 full_path = cifs_build_path_to_root(vol, cifs_sb, 543 full_path = cifs_build_path_to_root(vol, cifs_sb,
544 cifs_sb_master_tcon(cifs_sb)); 544 cifs_sb_master_tcon(cifs_sb));
545 if (full_path == NULL) 545 if (full_path == NULL)
546 return ERR_PTR(-ENOMEM); 546 return ERR_PTR(-ENOMEM);
547 547
548 cFYI(1, "Get root dentry for %s", full_path); 548 cFYI(1, "Get root dentry for %s", full_path);
549 549
550 sep = CIFS_DIR_SEP(cifs_sb); 550 sep = CIFS_DIR_SEP(cifs_sb);
551 dentry = dget(sb->s_root); 551 dentry = dget(sb->s_root);
552 p = s = full_path; 552 p = s = full_path;
553 553
554 do { 554 do {
555 struct inode *dir = dentry->d_inode; 555 struct inode *dir = dentry->d_inode;
556 struct dentry *child; 556 struct dentry *child;
557 557
558 if (!dir) { 558 if (!dir) {
559 dput(dentry); 559 dput(dentry);
560 dentry = ERR_PTR(-ENOENT); 560 dentry = ERR_PTR(-ENOENT);
561 break; 561 break;
562 } 562 }
563 563
564 /* skip separators */ 564 /* skip separators */
565 while (*s == sep) 565 while (*s == sep)
566 s++; 566 s++;
567 if (!*s) 567 if (!*s)
568 break; 568 break;
569 p = s++; 569 p = s++;
570 /* next separator */ 570 /* next separator */
571 while (*s && *s != sep) 571 while (*s && *s != sep)
572 s++; 572 s++;
573 573
574 mutex_lock(&dir->i_mutex); 574 mutex_lock(&dir->i_mutex);
575 child = lookup_one_len(p, dentry, s - p); 575 child = lookup_one_len(p, dentry, s - p);
576 mutex_unlock(&dir->i_mutex); 576 mutex_unlock(&dir->i_mutex);
577 dput(dentry); 577 dput(dentry);
578 dentry = child; 578 dentry = child;
579 } while (!IS_ERR(dentry)); 579 } while (!IS_ERR(dentry));
580 kfree(full_path); 580 kfree(full_path);
581 return dentry; 581 return dentry;
582 } 582 }
583 583
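The loop above is a classic two-pointer component walk: skip separators, then scan ahead to the next one, so [p, s) bounds the name handed to lookup_one_len(). A self-contained userspace sketch of the same technique (not part of this commit; the prefix path is made up):

#include <stdio.h>

int main(void)
{
	char path[] = "\\dir1\\dir2\\leaf";	/* hypothetical prefix path */
	char sep = '\\';
	char *p, *s = path;

	for (;;) {
		/* skip separators */
		while (*s == sep)
			s++;
		if (!*s)
			break;
		p = s++;
		/* scan to the next separator */
		while (*s && *s != sep)
			s++;
		printf("component: %.*s\n", (int)(s - p), p);
	}
	return 0;
}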
584 static int cifs_set_super(struct super_block *sb, void *data) 584 static int cifs_set_super(struct super_block *sb, void *data)
585 { 585 {
586 struct cifs_mnt_data *mnt_data = data; 586 struct cifs_mnt_data *mnt_data = data;
587 sb->s_fs_info = mnt_data->cifs_sb; 587 sb->s_fs_info = mnt_data->cifs_sb;
588 return set_anon_super(sb, NULL); 588 return set_anon_super(sb, NULL);
589 } 589 }
590 590
591 static struct dentry * 591 static struct dentry *
592 cifs_do_mount(struct file_system_type *fs_type, 592 cifs_do_mount(struct file_system_type *fs_type,
593 int flags, const char *dev_name, void *data) 593 int flags, const char *dev_name, void *data)
594 { 594 {
595 int rc; 595 int rc;
596 struct super_block *sb; 596 struct super_block *sb;
597 struct cifs_sb_info *cifs_sb; 597 struct cifs_sb_info *cifs_sb;
598 struct smb_vol *volume_info; 598 struct smb_vol *volume_info;
599 struct cifs_mnt_data mnt_data; 599 struct cifs_mnt_data mnt_data;
600 struct dentry *root; 600 struct dentry *root;
601 601
602 cFYI(1, "Devname: %s flags: %d ", dev_name, flags); 602 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
603 603
604 volume_info = cifs_get_volume_info((char *)data, dev_name); 604 volume_info = cifs_get_volume_info((char *)data, dev_name);
605 if (IS_ERR(volume_info)) 605 if (IS_ERR(volume_info))
606 return ERR_CAST(volume_info); 606 return ERR_CAST(volume_info);
607 607
608 cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); 608 cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
609 if (cifs_sb == NULL) { 609 if (cifs_sb == NULL) {
610 root = ERR_PTR(-ENOMEM); 610 root = ERR_PTR(-ENOMEM);
611 goto out_nls; 611 goto out_nls;
612 } 612 }
613 613
614 cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); 614 cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
615 if (cifs_sb->mountdata == NULL) { 615 if (cifs_sb->mountdata == NULL) {
616 root = ERR_PTR(-ENOMEM); 616 root = ERR_PTR(-ENOMEM);
617 goto out_cifs_sb; 617 goto out_cifs_sb;
618 } 618 }
619 619
620 cifs_setup_cifs_sb(volume_info, cifs_sb); 620 cifs_setup_cifs_sb(volume_info, cifs_sb);
621 621
622 rc = cifs_mount(cifs_sb, volume_info); 622 rc = cifs_mount(cifs_sb, volume_info);
623 if (rc) { 623 if (rc) {
624 if (!(flags & MS_SILENT)) 624 if (!(flags & MS_SILENT))
625 cERROR(1, "cifs_mount failed w/return code = %d", rc); 625 cERROR(1, "cifs_mount failed w/return code = %d", rc);
626 root = ERR_PTR(rc); 626 root = ERR_PTR(rc);
627 goto out_mountdata; 627 goto out_mountdata;
628 } 628 }
629 629
630 mnt_data.vol = volume_info; 630 mnt_data.vol = volume_info;
631 mnt_data.cifs_sb = cifs_sb; 631 mnt_data.cifs_sb = cifs_sb;
632 mnt_data.flags = flags; 632 mnt_data.flags = flags;
633 633
634 /* BB should we make this contingent on mount parm? */ 634 /* BB should we make this contingent on mount parm? */
635 flags |= MS_NODIRATIME | MS_NOATIME; 635 flags |= MS_NODIRATIME | MS_NOATIME;
636 636
637 sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); 637 sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
638 if (IS_ERR(sb)) { 638 if (IS_ERR(sb)) {
639 root = ERR_CAST(sb); 639 root = ERR_CAST(sb);
640 cifs_umount(cifs_sb); 640 cifs_umount(cifs_sb);
641 goto out; 641 goto out;
642 } 642 }
643 643
644 if (sb->s_root) { 644 if (sb->s_root) {
645 cFYI(1, "Use existing superblock"); 645 cFYI(1, "Use existing superblock");
646 cifs_umount(cifs_sb); 646 cifs_umount(cifs_sb);
647 } else { 647 } else {
648 rc = cifs_read_super(sb); 648 rc = cifs_read_super(sb);
649 if (rc) { 649 if (rc) {
650 root = ERR_PTR(rc); 650 root = ERR_PTR(rc);
651 goto out_super; 651 goto out_super;
652 } 652 }
653 653
654 sb->s_flags |= MS_ACTIVE; 654 sb->s_flags |= MS_ACTIVE;
655 } 655 }
656 656
657 root = cifs_get_root(volume_info, sb); 657 root = cifs_get_root(volume_info, sb);
658 if (IS_ERR(root)) 658 if (IS_ERR(root))
659 goto out_super; 659 goto out_super;
660 660
661 cFYI(1, "dentry root is: %p", root); 661 cFYI(1, "dentry root is: %p", root);
662 goto out; 662 goto out;
663 663
664 out_super: 664 out_super:
665 deactivate_locked_super(sb); 665 deactivate_locked_super(sb);
666 out: 666 out:
667 cifs_cleanup_volume_info(volume_info); 667 cifs_cleanup_volume_info(volume_info);
668 return root; 668 return root;
669 669
670 out_mountdata: 670 out_mountdata:
671 kfree(cifs_sb->mountdata); 671 kfree(cifs_sb->mountdata);
672 out_cifs_sb: 672 out_cifs_sb:
673 kfree(cifs_sb); 673 kfree(cifs_sb);
674 out_nls: 674 out_nls:
675 unload_nls(volume_info->local_nls); 675 unload_nls(volume_info->local_nls);
676 goto out; 676 goto out;
677 } 677 }
678 678
679 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 679 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
680 unsigned long nr_segs, loff_t pos) 680 unsigned long nr_segs, loff_t pos)
681 { 681 {
682 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 682 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
683 ssize_t written; 683 ssize_t written;
684 int rc; 684 int rc;
685 685
686 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 686 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
687 687
688 if (CIFS_I(inode)->clientCanCacheAll) 688 if (CIFS_I(inode)->clientCanCacheAll)
689 return written; 689 return written;
690 690
691 rc = filemap_fdatawrite(inode->i_mapping); 691 rc = filemap_fdatawrite(inode->i_mapping);
692 if (rc) 692 if (rc)
693 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode); 693 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
694 694
695 return written; 695 return written;
696 } 696 }
697 697
698 static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) 698 static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
699 { 699 {
700 /* 700 /*
701 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 701 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
702 * the cached file length 702 * the cached file length
703 */ 703 */
704 if (origin != SEEK_SET && origin != SEEK_CUR) { 704 if (whence != SEEK_SET && whence != SEEK_CUR) {
705 int rc; 705 int rc;
706 struct inode *inode = file->f_path.dentry->d_inode; 706 struct inode *inode = file->f_path.dentry->d_inode;
707 707
708 /* 708 /*
709 * We need to be sure that all dirty pages are written and the 709 * We need to be sure that all dirty pages are written and the
710 * server has the newest file length. 710 * server has the newest file length.
711 */ 711 */
712 if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && 712 if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
713 inode->i_mapping->nrpages != 0) { 713 inode->i_mapping->nrpages != 0) {
714 rc = filemap_fdatawait(inode->i_mapping); 714 rc = filemap_fdatawait(inode->i_mapping);
715 if (rc) { 715 if (rc) {
716 mapping_set_error(inode->i_mapping, rc); 716 mapping_set_error(inode->i_mapping, rc);
717 return rc; 717 return rc;
718 } 718 }
719 } 719 }
720 /* 720 /*
721 * Some applications poll for the file length in this strange 721 * Some applications poll for the file length in this strange
722 * way so we must seek to end on non-oplocked files by 722 * way so we must seek to end on non-oplocked files by
723 * setting the revalidate time to zero. 723 * setting the revalidate time to zero.
724 */ 724 */
725 CIFS_I(inode)->time = 0; 725 CIFS_I(inode)->time = 0;
726 726
727 rc = cifs_revalidate_file_attr(file); 727 rc = cifs_revalidate_file_attr(file);
728 if (rc < 0) 728 if (rc < 0)
729 return (loff_t)rc; 729 return (loff_t)rc;
730 } 730 }
731 return generic_file_llseek(file, offset, origin); 731 return generic_file_llseek(file, offset, whence);
732 } 732 }
733 733
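The "strange" length polling the comment mentions is simply a seek to the end of the file. A minimal userspace sketch (not part of this commit; the path is hypothetical) that makes the VFS call cifs_llseek() with whence == SEEK_END, triggering the revalidation above:

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/cifs/log.txt", O_RDONLY);
	if (fd < 0)
		return 1;
	/* seeking to the end is a cheap way to poll a growing file's size */
	off_t end = lseek(fd, 0, SEEK_END);
	printf("current size: %lld bytes\n", (long long)end);
	close(fd);
	return 0;
}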
734 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 734 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
735 { 735 {
736 /* note that this is called by vfs setlease with lock_flocks held 736 /* note that this is called by vfs setlease with lock_flocks held
737 to protect *lease from going away */ 737 to protect *lease from going away */
738 struct inode *inode = file->f_path.dentry->d_inode; 738 struct inode *inode = file->f_path.dentry->d_inode;
739 struct cifsFileInfo *cfile = file->private_data; 739 struct cifsFileInfo *cfile = file->private_data;
740 740
741 if (!(S_ISREG(inode->i_mode))) 741 if (!(S_ISREG(inode->i_mode)))
742 return -EINVAL; 742 return -EINVAL;
743 743
744 /* check if file is oplocked */ 744 /* check if file is oplocked */
745 if (((arg == F_RDLCK) && 745 if (((arg == F_RDLCK) &&
746 (CIFS_I(inode)->clientCanCacheRead)) || 746 (CIFS_I(inode)->clientCanCacheRead)) ||
747 ((arg == F_WRLCK) && 747 ((arg == F_WRLCK) &&
748 (CIFS_I(inode)->clientCanCacheAll))) 748 (CIFS_I(inode)->clientCanCacheAll)))
749 return generic_setlease(file, arg, lease); 749 return generic_setlease(file, arg, lease);
750 else if (tlink_tcon(cfile->tlink)->local_lease && 750 else if (tlink_tcon(cfile->tlink)->local_lease &&
751 !CIFS_I(inode)->clientCanCacheRead) 751 !CIFS_I(inode)->clientCanCacheRead)
752 /* If the server claims to support oplocks on this 752 /* If the server claims to support oplocks on this
753 file, then we still need to check for an oplock even 753 file, then we still need to check for an oplock even
754 when the local_lease mount option is set. But there 754 when the local_lease mount option is set. But there
755 are servers which do not support oplocks, for which 755 are servers which do not support oplocks, for which
756 this mount option may be useful if the user 756 this mount option may be useful if the user
757 knows that the file won't be changed on the server 757 knows that the file won't be changed on the server
758 by anyone else */ 758 by anyone else */
759 return generic_setlease(file, arg, lease); 759 return generic_setlease(file, arg, lease);
760 else 760 else
761 return -EAGAIN; 761 return -EAGAIN;
762 } 762 }
763 763
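cifs_setlease() is reached via fcntl(F_SETLEASE); cifs only grants the lease when a matching oplock already backs it, otherwise -EAGAIN is returned as above. A minimal userspace sketch (not part of this commit; the path is hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/cifs/file.txt", O_RDONLY);
	if (fd < 0)
		return 1;
	/* ends up in cifs_setlease(file, F_RDLCK, ...) */
	if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1)
		perror("F_SETLEASE");	/* EAGAIN: no read oplock held */
	else
		puts("read lease granted");
	close(fd);
	return 0;
}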
764 struct file_system_type cifs_fs_type = { 764 struct file_system_type cifs_fs_type = {
765 .owner = THIS_MODULE, 765 .owner = THIS_MODULE,
766 .name = "cifs", 766 .name = "cifs",
767 .mount = cifs_do_mount, 767 .mount = cifs_do_mount,
768 .kill_sb = cifs_kill_sb, 768 .kill_sb = cifs_kill_sb,
769 /* .fs_flags */ 769 /* .fs_flags */
770 }; 770 };
771 const struct inode_operations cifs_dir_inode_ops = { 771 const struct inode_operations cifs_dir_inode_ops = {
772 .create = cifs_create, 772 .create = cifs_create,
773 .atomic_open = cifs_atomic_open, 773 .atomic_open = cifs_atomic_open,
774 .lookup = cifs_lookup, 774 .lookup = cifs_lookup,
775 .getattr = cifs_getattr, 775 .getattr = cifs_getattr,
776 .unlink = cifs_unlink, 776 .unlink = cifs_unlink,
777 .link = cifs_hardlink, 777 .link = cifs_hardlink,
778 .mkdir = cifs_mkdir, 778 .mkdir = cifs_mkdir,
779 .rmdir = cifs_rmdir, 779 .rmdir = cifs_rmdir,
780 .rename = cifs_rename, 780 .rename = cifs_rename,
781 .permission = cifs_permission, 781 .permission = cifs_permission,
782 /* revalidate:cifs_revalidate, */ 782 /* revalidate:cifs_revalidate, */
783 .setattr = cifs_setattr, 783 .setattr = cifs_setattr,
784 .symlink = cifs_symlink, 784 .symlink = cifs_symlink,
785 .mknod = cifs_mknod, 785 .mknod = cifs_mknod,
786 #ifdef CONFIG_CIFS_XATTR 786 #ifdef CONFIG_CIFS_XATTR
787 .setxattr = cifs_setxattr, 787 .setxattr = cifs_setxattr,
788 .getxattr = cifs_getxattr, 788 .getxattr = cifs_getxattr,
789 .listxattr = cifs_listxattr, 789 .listxattr = cifs_listxattr,
790 .removexattr = cifs_removexattr, 790 .removexattr = cifs_removexattr,
791 #endif 791 #endif
792 }; 792 };
793 793
794 const struct inode_operations cifs_file_inode_ops = { 794 const struct inode_operations cifs_file_inode_ops = {
795 /* revalidate:cifs_revalidate, */ 795 /* revalidate:cifs_revalidate, */
796 .setattr = cifs_setattr, 796 .setattr = cifs_setattr,
797 .getattr = cifs_getattr, /* do we need this anymore? */ 797 .getattr = cifs_getattr, /* do we need this anymore? */
798 .rename = cifs_rename, 798 .rename = cifs_rename,
799 .permission = cifs_permission, 799 .permission = cifs_permission,
800 #ifdef CONFIG_CIFS_XATTR 800 #ifdef CONFIG_CIFS_XATTR
801 .setxattr = cifs_setxattr, 801 .setxattr = cifs_setxattr,
802 .getxattr = cifs_getxattr, 802 .getxattr = cifs_getxattr,
803 .listxattr = cifs_listxattr, 803 .listxattr = cifs_listxattr,
804 .removexattr = cifs_removexattr, 804 .removexattr = cifs_removexattr,
805 #endif 805 #endif
806 }; 806 };
807 807
808 const struct inode_operations cifs_symlink_inode_ops = { 808 const struct inode_operations cifs_symlink_inode_ops = {
809 .readlink = generic_readlink, 809 .readlink = generic_readlink,
810 .follow_link = cifs_follow_link, 810 .follow_link = cifs_follow_link,
811 .put_link = cifs_put_link, 811 .put_link = cifs_put_link,
812 .permission = cifs_permission, 812 .permission = cifs_permission,
813 /* BB add the following two eventually */ 813 /* BB add the following two eventually */
814 /* revalidate: cifs_revalidate, 814 /* revalidate: cifs_revalidate,
815 setattr: cifs_notify_change, *//* BB do we need notify change */ 815 setattr: cifs_notify_change, *//* BB do we need notify change */
816 #ifdef CONFIG_CIFS_XATTR 816 #ifdef CONFIG_CIFS_XATTR
817 .setxattr = cifs_setxattr, 817 .setxattr = cifs_setxattr,
818 .getxattr = cifs_getxattr, 818 .getxattr = cifs_getxattr,
819 .listxattr = cifs_listxattr, 819 .listxattr = cifs_listxattr,
820 .removexattr = cifs_removexattr, 820 .removexattr = cifs_removexattr,
821 #endif 821 #endif
822 }; 822 };
823 823
824 const struct file_operations cifs_file_ops = { 824 const struct file_operations cifs_file_ops = {
825 .read = do_sync_read, 825 .read = do_sync_read,
826 .write = do_sync_write, 826 .write = do_sync_write,
827 .aio_read = generic_file_aio_read, 827 .aio_read = generic_file_aio_read,
828 .aio_write = cifs_file_aio_write, 828 .aio_write = cifs_file_aio_write,
829 .open = cifs_open, 829 .open = cifs_open,
830 .release = cifs_close, 830 .release = cifs_close,
831 .lock = cifs_lock, 831 .lock = cifs_lock,
832 .fsync = cifs_fsync, 832 .fsync = cifs_fsync,
833 .flush = cifs_flush, 833 .flush = cifs_flush,
834 .mmap = cifs_file_mmap, 834 .mmap = cifs_file_mmap,
835 .splice_read = generic_file_splice_read, 835 .splice_read = generic_file_splice_read,
836 .llseek = cifs_llseek, 836 .llseek = cifs_llseek,
837 #ifdef CONFIG_CIFS_POSIX 837 #ifdef CONFIG_CIFS_POSIX
838 .unlocked_ioctl = cifs_ioctl, 838 .unlocked_ioctl = cifs_ioctl,
839 #endif /* CONFIG_CIFS_POSIX */ 839 #endif /* CONFIG_CIFS_POSIX */
840 .setlease = cifs_setlease, 840 .setlease = cifs_setlease,
841 }; 841 };
842 842
843 const struct file_operations cifs_file_strict_ops = { 843 const struct file_operations cifs_file_strict_ops = {
844 .read = do_sync_read, 844 .read = do_sync_read,
845 .write = do_sync_write, 845 .write = do_sync_write,
846 .aio_read = cifs_strict_readv, 846 .aio_read = cifs_strict_readv,
847 .aio_write = cifs_strict_writev, 847 .aio_write = cifs_strict_writev,
848 .open = cifs_open, 848 .open = cifs_open,
849 .release = cifs_close, 849 .release = cifs_close,
850 .lock = cifs_lock, 850 .lock = cifs_lock,
851 .fsync = cifs_strict_fsync, 851 .fsync = cifs_strict_fsync,
852 .flush = cifs_flush, 852 .flush = cifs_flush,
853 .mmap = cifs_file_strict_mmap, 853 .mmap = cifs_file_strict_mmap,
854 .splice_read = generic_file_splice_read, 854 .splice_read = generic_file_splice_read,
855 .llseek = cifs_llseek, 855 .llseek = cifs_llseek,
856 #ifdef CONFIG_CIFS_POSIX 856 #ifdef CONFIG_CIFS_POSIX
857 .unlocked_ioctl = cifs_ioctl, 857 .unlocked_ioctl = cifs_ioctl,
858 #endif /* CONFIG_CIFS_POSIX */ 858 #endif /* CONFIG_CIFS_POSIX */
859 .setlease = cifs_setlease, 859 .setlease = cifs_setlease,
860 }; 860 };
861 861
862 const struct file_operations cifs_file_direct_ops = { 862 const struct file_operations cifs_file_direct_ops = {
863 /* BB reevaluate whether they can be done with directio, no cache */ 863 /* BB reevaluate whether they can be done with directio, no cache */
864 .read = do_sync_read, 864 .read = do_sync_read,
865 .write = do_sync_write, 865 .write = do_sync_write,
866 .aio_read = cifs_user_readv, 866 .aio_read = cifs_user_readv,
867 .aio_write = cifs_user_writev, 867 .aio_write = cifs_user_writev,
868 .open = cifs_open, 868 .open = cifs_open,
869 .release = cifs_close, 869 .release = cifs_close,
870 .lock = cifs_lock, 870 .lock = cifs_lock,
871 .fsync = cifs_fsync, 871 .fsync = cifs_fsync,
872 .flush = cifs_flush, 872 .flush = cifs_flush,
873 .mmap = cifs_file_mmap, 873 .mmap = cifs_file_mmap,
874 .splice_read = generic_file_splice_read, 874 .splice_read = generic_file_splice_read,
875 #ifdef CONFIG_CIFS_POSIX 875 #ifdef CONFIG_CIFS_POSIX
876 .unlocked_ioctl = cifs_ioctl, 876 .unlocked_ioctl = cifs_ioctl,
877 #endif /* CONFIG_CIFS_POSIX */ 877 #endif /* CONFIG_CIFS_POSIX */
878 .llseek = cifs_llseek, 878 .llseek = cifs_llseek,
879 .setlease = cifs_setlease, 879 .setlease = cifs_setlease,
880 }; 880 };
881 881
882 const struct file_operations cifs_file_nobrl_ops = { 882 const struct file_operations cifs_file_nobrl_ops = {
883 .read = do_sync_read, 883 .read = do_sync_read,
884 .write = do_sync_write, 884 .write = do_sync_write,
885 .aio_read = generic_file_aio_read, 885 .aio_read = generic_file_aio_read,
886 .aio_write = cifs_file_aio_write, 886 .aio_write = cifs_file_aio_write,
887 .open = cifs_open, 887 .open = cifs_open,
888 .release = cifs_close, 888 .release = cifs_close,
889 .fsync = cifs_fsync, 889 .fsync = cifs_fsync,
890 .flush = cifs_flush, 890 .flush = cifs_flush,
891 .mmap = cifs_file_mmap, 891 .mmap = cifs_file_mmap,
892 .splice_read = generic_file_splice_read, 892 .splice_read = generic_file_splice_read,
893 .llseek = cifs_llseek, 893 .llseek = cifs_llseek,
894 #ifdef CONFIG_CIFS_POSIX 894 #ifdef CONFIG_CIFS_POSIX
895 .unlocked_ioctl = cifs_ioctl, 895 .unlocked_ioctl = cifs_ioctl,
896 #endif /* CONFIG_CIFS_POSIX */ 896 #endif /* CONFIG_CIFS_POSIX */
897 .setlease = cifs_setlease, 897 .setlease = cifs_setlease,
898 }; 898 };
899 899
900 const struct file_operations cifs_file_strict_nobrl_ops = { 900 const struct file_operations cifs_file_strict_nobrl_ops = {
901 .read = do_sync_read, 901 .read = do_sync_read,
902 .write = do_sync_write, 902 .write = do_sync_write,
903 .aio_read = cifs_strict_readv, 903 .aio_read = cifs_strict_readv,
904 .aio_write = cifs_strict_writev, 904 .aio_write = cifs_strict_writev,
905 .open = cifs_open, 905 .open = cifs_open,
906 .release = cifs_close, 906 .release = cifs_close,
907 .fsync = cifs_strict_fsync, 907 .fsync = cifs_strict_fsync,
908 .flush = cifs_flush, 908 .flush = cifs_flush,
909 .mmap = cifs_file_strict_mmap, 909 .mmap = cifs_file_strict_mmap,
910 .splice_read = generic_file_splice_read, 910 .splice_read = generic_file_splice_read,
911 .llseek = cifs_llseek, 911 .llseek = cifs_llseek,
912 #ifdef CONFIG_CIFS_POSIX 912 #ifdef CONFIG_CIFS_POSIX
913 .unlocked_ioctl = cifs_ioctl, 913 .unlocked_ioctl = cifs_ioctl,
914 #endif /* CONFIG_CIFS_POSIX */ 914 #endif /* CONFIG_CIFS_POSIX */
915 .setlease = cifs_setlease, 915 .setlease = cifs_setlease,
916 }; 916 };
917 917
918 const struct file_operations cifs_file_direct_nobrl_ops = { 918 const struct file_operations cifs_file_direct_nobrl_ops = {
919 /* BB reevaluate whether they can be done with directio, no cache */ 919 /* BB reevaluate whether they can be done with directio, no cache */
920 .read = do_sync_read, 920 .read = do_sync_read,
921 .write = do_sync_write, 921 .write = do_sync_write,
922 .aio_read = cifs_user_readv, 922 .aio_read = cifs_user_readv,
923 .aio_write = cifs_user_writev, 923 .aio_write = cifs_user_writev,
924 .open = cifs_open, 924 .open = cifs_open,
925 .release = cifs_close, 925 .release = cifs_close,
926 .fsync = cifs_fsync, 926 .fsync = cifs_fsync,
927 .flush = cifs_flush, 927 .flush = cifs_flush,
928 .mmap = cifs_file_mmap, 928 .mmap = cifs_file_mmap,
929 .splice_read = generic_file_splice_read, 929 .splice_read = generic_file_splice_read,
930 #ifdef CONFIG_CIFS_POSIX 930 #ifdef CONFIG_CIFS_POSIX
931 .unlocked_ioctl = cifs_ioctl, 931 .unlocked_ioctl = cifs_ioctl,
932 #endif /* CONFIG_CIFS_POSIX */ 932 #endif /* CONFIG_CIFS_POSIX */
933 .llseek = cifs_llseek, 933 .llseek = cifs_llseek,
934 .setlease = cifs_setlease, 934 .setlease = cifs_setlease,
935 }; 935 };
936 936
937 const struct file_operations cifs_dir_ops = { 937 const struct file_operations cifs_dir_ops = {
938 .readdir = cifs_readdir, 938 .readdir = cifs_readdir,
939 .release = cifs_closedir, 939 .release = cifs_closedir,
940 .read = generic_read_dir, 940 .read = generic_read_dir,
941 .unlocked_ioctl = cifs_ioctl, 941 .unlocked_ioctl = cifs_ioctl,
942 .llseek = generic_file_llseek, 942 .llseek = generic_file_llseek,
943 }; 943 };
944 944
945 static void 945 static void
946 cifs_init_once(void *inode) 946 cifs_init_once(void *inode)
947 { 947 {
948 struct cifsInodeInfo *cifsi = inode; 948 struct cifsInodeInfo *cifsi = inode;
949 949
950 inode_init_once(&cifsi->vfs_inode); 950 inode_init_once(&cifsi->vfs_inode);
951 init_rwsem(&cifsi->lock_sem); 951 init_rwsem(&cifsi->lock_sem);
952 } 952 }
953 953
954 static int 954 static int
955 cifs_init_inodecache(void) 955 cifs_init_inodecache(void)
956 { 956 {
957 cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", 957 cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
958 sizeof(struct cifsInodeInfo), 958 sizeof(struct cifsInodeInfo),
959 0, (SLAB_RECLAIM_ACCOUNT| 959 0, (SLAB_RECLAIM_ACCOUNT|
960 SLAB_MEM_SPREAD), 960 SLAB_MEM_SPREAD),
961 cifs_init_once); 961 cifs_init_once);
962 if (cifs_inode_cachep == NULL) 962 if (cifs_inode_cachep == NULL)
963 return -ENOMEM; 963 return -ENOMEM;
964 964
965 return 0; 965 return 0;
966 } 966 }
967 967
968 static void 968 static void
969 cifs_destroy_inodecache(void) 969 cifs_destroy_inodecache(void)
970 { 970 {
971 /* 971 /*
972 * Make sure all delayed rcu free inodes are flushed before we 972 * Make sure all delayed rcu free inodes are flushed before we
973 * destroy cache. 973 * destroy cache.
974 */ 974 */
975 rcu_barrier(); 975 rcu_barrier();
976 kmem_cache_destroy(cifs_inode_cachep); 976 kmem_cache_destroy(cifs_inode_cachep);
977 } 977 }
978 978
979 static int 979 static int
980 cifs_init_request_bufs(void) 980 cifs_init_request_bufs(void)
981 { 981 {
982 size_t max_hdr_size = MAX_CIFS_HDR_SIZE; 982 size_t max_hdr_size = MAX_CIFS_HDR_SIZE;
983 #ifdef CONFIG_CIFS_SMB2 983 #ifdef CONFIG_CIFS_SMB2
984 /* 984 /*
985 * The SMB2 maximum header size is bigger than the CIFS one, so it 985 * The SMB2 maximum header size is bigger than the CIFS one, so it
986 * does no harm to allocate a few extra bytes for CIFS as well. 986 * does no harm to allocate a few extra bytes for CIFS as well.
987 */ 987 */
988 max_hdr_size = MAX_SMB2_HDR_SIZE; 988 max_hdr_size = MAX_SMB2_HDR_SIZE;
989 #endif 989 #endif
990 if (CIFSMaxBufSize < 8192) { 990 if (CIFSMaxBufSize < 8192) {
991 /* Buffer size cannot be smaller than 2 * PATH_MAX since the maximum 991 /* Buffer size cannot be smaller than 2 * PATH_MAX since the maximum
992 Unicode path name has to fit in any SMB/CIFS path-based frame */ 992 Unicode path name has to fit in any SMB/CIFS path-based frame */
993 CIFSMaxBufSize = 8192; 993 CIFSMaxBufSize = 8192;
994 } else if (CIFSMaxBufSize > 1024*127) { 994 } else if (CIFSMaxBufSize > 1024*127) {
995 CIFSMaxBufSize = 1024 * 127; 995 CIFSMaxBufSize = 1024 * 127;
996 } else { 996 } else {
997 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ 997 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
998 } 998 }
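	/* note: 0x1FE00 clears the low nine bits, so any value in the
	   accepted 8192..130048 range is rounded down to a 512-byte
	   multiple, e.g. 16000 becomes 15872 */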
999 /* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */ 999 /* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
1000 cifs_req_cachep = kmem_cache_create("cifs_request", 1000 cifs_req_cachep = kmem_cache_create("cifs_request",
1001 CIFSMaxBufSize + max_hdr_size, 0, 1001 CIFSMaxBufSize + max_hdr_size, 0,
1002 SLAB_HWCACHE_ALIGN, NULL); 1002 SLAB_HWCACHE_ALIGN, NULL);
1003 if (cifs_req_cachep == NULL) 1003 if (cifs_req_cachep == NULL)
1004 return -ENOMEM; 1004 return -ENOMEM;
1005 1005
1006 if (cifs_min_rcv < 1) 1006 if (cifs_min_rcv < 1)
1007 cifs_min_rcv = 1; 1007 cifs_min_rcv = 1;
1008 else if (cifs_min_rcv > 64) { 1008 else if (cifs_min_rcv > 64) {
1009 cifs_min_rcv = 64; 1009 cifs_min_rcv = 64;
1010 cERROR(1, "cifs_min_rcv set to maximum (64)"); 1010 cERROR(1, "cifs_min_rcv set to maximum (64)");
1011 } 1011 }
1012 1012
1013 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 1013 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
1014 cifs_req_cachep); 1014 cifs_req_cachep);
1015 1015
1016 if (cifs_req_poolp == NULL) { 1016 if (cifs_req_poolp == NULL) {
1017 kmem_cache_destroy(cifs_req_cachep); 1017 kmem_cache_destroy(cifs_req_cachep);
1018 return -ENOMEM; 1018 return -ENOMEM;
1019 } 1019 }
1020 /* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and 1020 /* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and
1021 almost all handle-based requests (but not the write response, nor is it 1021 almost all handle-based requests (but not the write response, nor is it
1022 sufficient for path-based requests). A smaller size would have 1022 sufficient for path-based requests). A smaller size would have
1023 been more efficient (compacting multiple slab items on one 4k page) 1023 been more efficient (compacting multiple slab items on one 4k page)
1024 for the case in which debug was on, but this larger size allows 1024 for the case in which debug was on, but this larger size allows
1025 more SMBs to use small buffer alloc and is still much more 1025 more SMBs to use small buffer alloc and is still much more
1026 efficient to alloc 1 per page off the slab compared to 17K (5page) 1026 efficient to alloc 1 per page off the slab compared to 17K (5page)
1027 alloc of large cifs buffers even when page debugging is on */ 1027 alloc of large cifs buffers even when page debugging is on */
1028 cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq", 1028 cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq",
1029 MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN, 1029 MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN,
1030 NULL); 1030 NULL);
1031 if (cifs_sm_req_cachep == NULL) { 1031 if (cifs_sm_req_cachep == NULL) {
1032 mempool_destroy(cifs_req_poolp); 1032 mempool_destroy(cifs_req_poolp);
1033 kmem_cache_destroy(cifs_req_cachep); 1033 kmem_cache_destroy(cifs_req_cachep);
1034 return -ENOMEM; 1034 return -ENOMEM;
1035 } 1035 }
1036 1036
1037 if (cifs_min_small < 2) 1037 if (cifs_min_small < 2)
1038 cifs_min_small = 2; 1038 cifs_min_small = 2;
1039 else if (cifs_min_small > 256) { 1039 else if (cifs_min_small > 256) {
1040 cifs_min_small = 256; 1040 cifs_min_small = 256;
1041 cFYI(1, "cifs_min_small set to maximum (256)"); 1041 cFYI(1, "cifs_min_small set to maximum (256)");
1042 } 1042 }
1043 1043
1044 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 1044 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
1045 cifs_sm_req_cachep); 1045 cifs_sm_req_cachep);
1046 1046
1047 if (cifs_sm_req_poolp == NULL) { 1047 if (cifs_sm_req_poolp == NULL) {
1048 mempool_destroy(cifs_req_poolp); 1048 mempool_destroy(cifs_req_poolp);
1049 kmem_cache_destroy(cifs_req_cachep); 1049 kmem_cache_destroy(cifs_req_cachep);
1050 kmem_cache_destroy(cifs_sm_req_cachep); 1050 kmem_cache_destroy(cifs_sm_req_cachep);
1051 return -ENOMEM; 1051 return -ENOMEM;
1052 } 1052 }
1053 1053
1054 return 0; 1054 return 0;
1055 } 1055 }
1056 1056
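The mempool pairs built above guarantee forward progress: mempool_alloc() falls back to the pre-reserved elements (cifs_min_rcv and cifs_min_small of them) when the slab allocator fails under memory pressure. A sketch of the consuming side (illustrative wrapper names, not part of this commit; mempool_alloc()/mempool_free() are the real API):

/* kernel-side sketch, assuming the pools initialized above */
void *cifs_request_buf_get(void)
{
	/* dips into the reserved elements if kmem_cache allocation fails */
	return mempool_alloc(cifs_req_poolp, GFP_NOFS);
}

void cifs_request_buf_put(void *buf)
{
	/* tops the reserve back up before freeing to the slab */
	mempool_free(buf, cifs_req_poolp);
}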
1057 static void 1057 static void
1058 cifs_destroy_request_bufs(void) 1058 cifs_destroy_request_bufs(void)
1059 { 1059 {
1060 mempool_destroy(cifs_req_poolp); 1060 mempool_destroy(cifs_req_poolp);
1061 kmem_cache_destroy(cifs_req_cachep); 1061 kmem_cache_destroy(cifs_req_cachep);
1062 mempool_destroy(cifs_sm_req_poolp); 1062 mempool_destroy(cifs_sm_req_poolp);
1063 kmem_cache_destroy(cifs_sm_req_cachep); 1063 kmem_cache_destroy(cifs_sm_req_cachep);
1064 } 1064 }
1065 1065
1066 static int 1066 static int
1067 cifs_init_mids(void) 1067 cifs_init_mids(void)
1068 { 1068 {
1069 cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids", 1069 cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids",
1070 sizeof(struct mid_q_entry), 0, 1070 sizeof(struct mid_q_entry), 0,
1071 SLAB_HWCACHE_ALIGN, NULL); 1071 SLAB_HWCACHE_ALIGN, NULL);
1072 if (cifs_mid_cachep == NULL) 1072 if (cifs_mid_cachep == NULL)
1073 return -ENOMEM; 1073 return -ENOMEM;
1074 1074
1075 /* 3 is a reasonable minimum number of simultaneous operations */ 1075 /* 3 is a reasonable minimum number of simultaneous operations */
1076 cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep); 1076 cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
1077 if (cifs_mid_poolp == NULL) { 1077 if (cifs_mid_poolp == NULL) {
1078 kmem_cache_destroy(cifs_mid_cachep); 1078 kmem_cache_destroy(cifs_mid_cachep);
1079 return -ENOMEM; 1079 return -ENOMEM;
1080 } 1080 }
1081 1081
1082 return 0; 1082 return 0;
1083 } 1083 }
1084 1084
1085 static void 1085 static void
1086 cifs_destroy_mids(void) 1086 cifs_destroy_mids(void)
1087 { 1087 {
1088 mempool_destroy(cifs_mid_poolp); 1088 mempool_destroy(cifs_mid_poolp);
1089 kmem_cache_destroy(cifs_mid_cachep); 1089 kmem_cache_destroy(cifs_mid_cachep);
1090 } 1090 }
1091 1091
1092 static int __init 1092 static int __init
1093 init_cifs(void) 1093 init_cifs(void)
1094 { 1094 {
1095 int rc = 0; 1095 int rc = 0;
1096 cifs_proc_init(); 1096 cifs_proc_init();
1097 INIT_LIST_HEAD(&cifs_tcp_ses_list); 1097 INIT_LIST_HEAD(&cifs_tcp_ses_list);
1098 #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ 1098 #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
1099 INIT_LIST_HEAD(&GlobalDnotifyReqList); 1099 INIT_LIST_HEAD(&GlobalDnotifyReqList);
1100 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); 1100 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
1101 #endif /* was needed for dnotify, and will be needed for inotify once the VFS is fixed */ 1101 #endif /* was needed for dnotify, and will be needed for inotify once the VFS is fixed */
1102 /* 1102 /*
1103 * Initialize Global counters 1103 * Initialize Global counters
1104 */ 1104 */
1105 atomic_set(&sesInfoAllocCount, 0); 1105 atomic_set(&sesInfoAllocCount, 0);
1106 atomic_set(&tconInfoAllocCount, 0); 1106 atomic_set(&tconInfoAllocCount, 0);
1107 atomic_set(&tcpSesAllocCount, 0); 1107 atomic_set(&tcpSesAllocCount, 0);
1108 atomic_set(&tcpSesReconnectCount, 0); 1108 atomic_set(&tcpSesReconnectCount, 0);
1109 atomic_set(&tconInfoReconnectCount, 0); 1109 atomic_set(&tconInfoReconnectCount, 0);
1110 1110
1111 atomic_set(&bufAllocCount, 0); 1111 atomic_set(&bufAllocCount, 0);
1112 atomic_set(&smBufAllocCount, 0); 1112 atomic_set(&smBufAllocCount, 0);
1113 #ifdef CONFIG_CIFS_STATS2 1113 #ifdef CONFIG_CIFS_STATS2
1114 atomic_set(&totBufAllocCount, 0); 1114 atomic_set(&totBufAllocCount, 0);
1115 atomic_set(&totSmBufAllocCount, 0); 1115 atomic_set(&totSmBufAllocCount, 0);
1116 #endif /* CONFIG_CIFS_STATS2 */ 1116 #endif /* CONFIG_CIFS_STATS2 */
1117 1117
1118 atomic_set(&midCount, 0); 1118 atomic_set(&midCount, 0);
1119 GlobalCurrentXid = 0; 1119 GlobalCurrentXid = 0;
1120 GlobalTotalActiveXid = 0; 1120 GlobalTotalActiveXid = 0;
1121 GlobalMaxActiveXid = 0; 1121 GlobalMaxActiveXid = 0;
1122 spin_lock_init(&cifs_tcp_ses_lock); 1122 spin_lock_init(&cifs_tcp_ses_lock);
1123 spin_lock_init(&cifs_file_list_lock); 1123 spin_lock_init(&cifs_file_list_lock);
1124 spin_lock_init(&GlobalMid_Lock); 1124 spin_lock_init(&GlobalMid_Lock);
1125 1125
1126 #ifdef CONFIG_CIFS_SMB2 1126 #ifdef CONFIG_CIFS_SMB2
1127 get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE); 1127 get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
1128 #endif 1128 #endif
1129 1129
1130 if (cifs_max_pending < 2) { 1130 if (cifs_max_pending < 2) {
1131 cifs_max_pending = 2; 1131 cifs_max_pending = 2;
1132 cFYI(1, "cifs_max_pending set to min of 2"); 1132 cFYI(1, "cifs_max_pending set to min of 2");
1133 } else if (cifs_max_pending > CIFS_MAX_REQ) { 1133 } else if (cifs_max_pending > CIFS_MAX_REQ) {
1134 cifs_max_pending = CIFS_MAX_REQ; 1134 cifs_max_pending = CIFS_MAX_REQ;
1135 cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ); 1135 cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ);
1136 } 1136 }
1137 1137
1138 cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); 1138 cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
1139 if (!cifsiod_wq) { 1139 if (!cifsiod_wq) {
1140 rc = -ENOMEM; 1140 rc = -ENOMEM;
1141 goto out_clean_proc; 1141 goto out_clean_proc;
1142 } 1142 }
1143 1143
1144 rc = cifs_fscache_register(); 1144 rc = cifs_fscache_register();
1145 if (rc) 1145 if (rc)
1146 goto out_destroy_wq; 1146 goto out_destroy_wq;
1147 1147
1148 rc = cifs_init_inodecache(); 1148 rc = cifs_init_inodecache();
1149 if (rc) 1149 if (rc)
1150 goto out_unreg_fscache; 1150 goto out_unreg_fscache;
1151 1151
1152 rc = cifs_init_mids(); 1152 rc = cifs_init_mids();
1153 if (rc) 1153 if (rc)
1154 goto out_destroy_inodecache; 1154 goto out_destroy_inodecache;
1155 1155
1156 rc = cifs_init_request_bufs(); 1156 rc = cifs_init_request_bufs();
1157 if (rc) 1157 if (rc)
1158 goto out_destroy_mids; 1158 goto out_destroy_mids;
1159 1159
1160 #ifdef CONFIG_CIFS_UPCALL 1160 #ifdef CONFIG_CIFS_UPCALL
1161 rc = register_key_type(&cifs_spnego_key_type); 1161 rc = register_key_type(&cifs_spnego_key_type);
1162 if (rc) 1162 if (rc)
1163 goto out_destroy_request_bufs; 1163 goto out_destroy_request_bufs;
1164 #endif /* CONFIG_CIFS_UPCALL */ 1164 #endif /* CONFIG_CIFS_UPCALL */
1165 1165
1166 #ifdef CONFIG_CIFS_ACL 1166 #ifdef CONFIG_CIFS_ACL
1167 rc = init_cifs_idmap(); 1167 rc = init_cifs_idmap();
1168 if (rc) 1168 if (rc)
1169 goto out_register_key_type; 1169 goto out_register_key_type;
1170 #endif /* CONFIG_CIFS_ACL */ 1170 #endif /* CONFIG_CIFS_ACL */
1171 1171
1172 rc = register_filesystem(&cifs_fs_type); 1172 rc = register_filesystem(&cifs_fs_type);
1173 if (rc) 1173 if (rc)
1174 goto out_init_cifs_idmap; 1174 goto out_init_cifs_idmap;
1175 1175
1176 return 0; 1176 return 0;
1177 1177
1178 out_init_cifs_idmap: 1178 out_init_cifs_idmap:
1179 #ifdef CONFIG_CIFS_ACL 1179 #ifdef CONFIG_CIFS_ACL
1180 exit_cifs_idmap(); 1180 exit_cifs_idmap();
1181 out_register_key_type: 1181 out_register_key_type:
1182 #endif 1182 #endif
1183 #ifdef CONFIG_CIFS_UPCALL 1183 #ifdef CONFIG_CIFS_UPCALL
1184 unregister_key_type(&cifs_spnego_key_type); 1184 unregister_key_type(&cifs_spnego_key_type);
1185 out_destroy_request_bufs: 1185 out_destroy_request_bufs:
1186 #endif 1186 #endif
1187 cifs_destroy_request_bufs(); 1187 cifs_destroy_request_bufs();
1188 out_destroy_mids: 1188 out_destroy_mids:
1189 cifs_destroy_mids(); 1189 cifs_destroy_mids();
1190 out_destroy_inodecache: 1190 out_destroy_inodecache:
1191 cifs_destroy_inodecache(); 1191 cifs_destroy_inodecache();
1192 out_unreg_fscache: 1192 out_unreg_fscache:
1193 cifs_fscache_unregister(); 1193 cifs_fscache_unregister();
1194 out_destroy_wq: 1194 out_destroy_wq:
1195 destroy_workqueue(cifsiod_wq); 1195 destroy_workqueue(cifsiod_wq);
1196 out_clean_proc: 1196 out_clean_proc:
1197 cifs_proc_clean(); 1197 cifs_proc_clean();
1198 return rc; 1198 return rc;
1199 } 1199 }
1200 1200
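init_cifs() above follows the kernel's goto-unwind idiom: each successfully initialized resource gets a cleanup label, and a failure jumps to the label for the last resource that did succeed, so teardown runs in exact reverse order of setup. A self-contained sketch of the pattern (hypothetical init_a/b/c subsystems, not part of this commit):

#include <stdio.h>

/* stand-ins for the real initializers */
static int init_a(void) { return 0; }
static int init_b(void) { return 0; }
static int init_c(void) { return 0; }
static void exit_a(void) { }
static void exit_b(void) { }

static int init_subsystems(void)
{
	int rc;

	rc = init_a();
	if (rc)
		goto out;
	rc = init_b();
	if (rc)
		goto out_a;
	rc = init_c();
	if (rc)
		goto out_b;
	return 0;		/* everything is live */

out_b:
	exit_b();		/* unwind only what was set up, in reverse */
out_a:
	exit_a();
out:
	return rc;
}

int main(void)
{
	printf("init: %d\n", init_subsystems());
	return 0;
}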
1201 static void __exit 1201 static void __exit
1202 exit_cifs(void) 1202 exit_cifs(void)
1203 { 1203 {
1204 cFYI(DBG2, "exit_cifs"); 1204 cFYI(DBG2, "exit_cifs");
1205 unregister_filesystem(&cifs_fs_type); 1205 unregister_filesystem(&cifs_fs_type);
1206 cifs_dfs_release_automount_timer(); 1206 cifs_dfs_release_automount_timer();
1207 #ifdef CONFIG_CIFS_ACL 1207 #ifdef CONFIG_CIFS_ACL
1208 exit_cifs_idmap(); 1208 exit_cifs_idmap();
1209 #endif 1209 #endif
1210 #ifdef CONFIG_CIFS_UPCALL 1210 #ifdef CONFIG_CIFS_UPCALL
1211 unregister_key_type(&cifs_spnego_key_type); 1211 unregister_key_type(&cifs_spnego_key_type);
1212 #endif 1212 #endif
1213 cifs_destroy_request_bufs(); 1213 cifs_destroy_request_bufs();
1214 cifs_destroy_mids(); 1214 cifs_destroy_mids();
1215 cifs_destroy_inodecache(); 1215 cifs_destroy_inodecache();
1216 cifs_fscache_unregister(); 1216 cifs_fscache_unregister();
1217 destroy_workqueue(cifsiod_wq); 1217 destroy_workqueue(cifsiod_wq);
1218 cifs_proc_clean(); 1218 cifs_proc_clean();
1219 } 1219 }
1220 1220
1221 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); 1221 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
1222 MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */ 1222 MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */
1223 MODULE_DESCRIPTION 1223 MODULE_DESCRIPTION
1224 ("VFS to access servers complying with the SNIA CIFS Specification " 1224 ("VFS to access servers complying with the SNIA CIFS Specification "
1225 "e.g. Samba and Windows"); 1225 "e.g. Samba and Windows");
1226 MODULE_VERSION(CIFS_VERSION); 1226 MODULE_VERSION(CIFS_VERSION);
1227 module_init(init_cifs) 1227 module_init(init_cifs)
1228 module_exit(exit_cifs) 1228 module_exit(exit_cifs)
1229 1229
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * dir.c - Operations for configfs directories. 4 * dir.c - Operations for configfs directories.
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public 7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either 8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version. 9 * version 2 of the License, or (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details. 14 * General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public 16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the 17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 * 20 *
21 * Based on sysfs: 21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel 22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 * 23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved. 24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */ 25 */
26 26
27 #undef DEBUG 27 #undef DEBUG
28 28
29 #include <linux/fs.h> 29 #include <linux/fs.h>
30 #include <linux/mount.h> 30 #include <linux/mount.h>
31 #include <linux/module.h> 31 #include <linux/module.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/err.h> 33 #include <linux/err.h>
34 34
35 #include <linux/configfs.h> 35 #include <linux/configfs.h>
36 #include "configfs_internal.h" 36 #include "configfs_internal.h"
37 37
38 DECLARE_RWSEM(configfs_rename_sem); 38 DECLARE_RWSEM(configfs_rename_sem);
39 /* 39 /*
40 * Protects mutations of configfs_dirent linkage together with proper i_mutex 40 * Protects mutations of configfs_dirent linkage together with proper i_mutex
41 * Also protects mutations of symlinks linkage to target configfs_dirent 41 * Also protects mutations of symlinks linkage to target configfs_dirent
42 * Mutators of configfs_dirent linkage must *both* have the proper inode locked 42 * Mutators of configfs_dirent linkage must *both* have the proper inode locked
43 * and configfs_dirent_lock locked, in that order. 43 * and configfs_dirent_lock locked, in that order.
44 * This allows one to safely traverse configfs_dirent trees and symlinks without 44 * This allows one to safely traverse configfs_dirent trees and symlinks without
45 * having to lock inodes. 45 * having to lock inodes.
46 * 46 *
47 * Protects setting of CONFIGFS_USET_DROPPING: checking the flag 47 * Protects setting of CONFIGFS_USET_DROPPING: checking the flag
48 * unlocked is not reliable unless in detach_groups() called from 48 * unlocked is not reliable unless in detach_groups() called from
49 * rmdir()/unregister() and from configfs_attach_group() 49 * rmdir()/unregister() and from configfs_attach_group()
50 */ 50 */
51 DEFINE_SPINLOCK(configfs_dirent_lock); 51 DEFINE_SPINLOCK(configfs_dirent_lock);
52 52
53 static void configfs_d_iput(struct dentry * dentry, 53 static void configfs_d_iput(struct dentry * dentry,
54 struct inode * inode) 54 struct inode * inode)
55 { 55 {
56 struct configfs_dirent *sd = dentry->d_fsdata; 56 struct configfs_dirent *sd = dentry->d_fsdata;
57 57
58 if (sd) { 58 if (sd) {
59 BUG_ON(sd->s_dentry != dentry); 59 BUG_ON(sd->s_dentry != dentry);
60 /* Coordinate with configfs_readdir */ 60 /* Coordinate with configfs_readdir */
61 spin_lock(&configfs_dirent_lock); 61 spin_lock(&configfs_dirent_lock);
62 sd->s_dentry = NULL; 62 sd->s_dentry = NULL;
63 spin_unlock(&configfs_dirent_lock); 63 spin_unlock(&configfs_dirent_lock);
64 configfs_put(sd); 64 configfs_put(sd);
65 } 65 }
66 iput(inode); 66 iput(inode);
67 } 67 }
68 68
69 /* 69 /*
70 * We _must_ delete our dentries on last dput, as the chain-to-parent 70 * We _must_ delete our dentries on last dput, as the chain-to-parent
71 * behavior is required to clear the parents of default_groups. 71 * behavior is required to clear the parents of default_groups.
72 */ 72 */
73 static int configfs_d_delete(const struct dentry *dentry) 73 static int configfs_d_delete(const struct dentry *dentry)
74 { 74 {
75 return 1; 75 return 1;
76 } 76 }
77 77
78 const struct dentry_operations configfs_dentry_ops = { 78 const struct dentry_operations configfs_dentry_ops = {
79 .d_iput = configfs_d_iput, 79 .d_iput = configfs_d_iput,
80 /* simple_delete_dentry() isn't exported */ 80 /* simple_delete_dentry() isn't exported */
81 .d_delete = configfs_d_delete, 81 .d_delete = configfs_d_delete,
82 }; 82 };
83 83
84 #ifdef CONFIG_LOCKDEP 84 #ifdef CONFIG_LOCKDEP
85 85
86 /* 86 /*
87 * Helpers to make lockdep happy with our recursive locking of default groups' 87 * Helpers to make lockdep happy with our recursive locking of default groups'
88 * inodes (see configfs_attach_group() and configfs_detach_group()). 88 * inodes (see configfs_attach_group() and configfs_detach_group()).
89 * We put default groups i_mutexes in separate classes according to their depth 89 * We put default groups i_mutexes in separate classes according to their depth
90 * from the youngest non-default group ancestor. 90 * from the youngest non-default group ancestor.
91 * 91 *
92 * For a non-default group A having default groups A/B, A/C, and A/C/D, default 92 * For a non-default group A having default groups A/B, A/C, and A/C/D, default
93 * groups A/B and A/C will have their inode's mutex in class 93 * groups A/B and A/C will have their inode's mutex in class
94 * default_group_class[0], and default group A/C/D will be in 94 * default_group_class[0], and default group A/C/D will be in
95 * default_group_class[1]. 95 * default_group_class[1].
96 * 96 *
97 * The lock classes are declared and assigned in inode.c, according to the 97 * The lock classes are declared and assigned in inode.c, according to the
98 * s_depth value. 98 * s_depth value.
99 * The s_depth value is initialized to -1, adjusted to >= 0 when attaching 99 * The s_depth value is initialized to -1, adjusted to >= 0 when attaching
100 * default groups, and reset to -1 when all default groups are attached. During 100 * default groups, and reset to -1 when all default groups are attached. During
101 * attachment, if configfs_create() sees s_depth > 0, the lock class of the new 101 * attachment, if configfs_create() sees s_depth > 0, the lock class of the new
102 * inode's mutex is set to default_group_class[s_depth - 1]. 102 * inode's mutex is set to default_group_class[s_depth - 1].
103 */ 103 */
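
A hedged sketch of the consumer side the comment points at: the real assignment lives in inode.c, but it looks roughly like this (the array name follows the comment; the bound here is illustrative):

/* Sketch: choose the i_mutex lock class from the depth computed below. */
static struct lock_class_key default_group_class[16];	/* illustrative bound */

static void example_set_inode_lock_class(struct configfs_dirent *sd,
					 struct inode *inode)
{
	if (sd->s_depth > 0)
		lockdep_set_class(&inode->i_mutex,
				  &default_group_class[sd->s_depth - 1]);
}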
104 104
105 static void configfs_init_dirent_depth(struct configfs_dirent *sd) 105 static void configfs_init_dirent_depth(struct configfs_dirent *sd)
106 { 106 {
107 sd->s_depth = -1; 107 sd->s_depth = -1;
108 } 108 }
109 109
110 static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, 110 static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
111 struct configfs_dirent *sd) 111 struct configfs_dirent *sd)
112 { 112 {
113 int parent_depth = parent_sd->s_depth; 113 int parent_depth = parent_sd->s_depth;
114 114
115 if (parent_depth >= 0) 115 if (parent_depth >= 0)
116 sd->s_depth = parent_depth + 1; 116 sd->s_depth = parent_depth + 1;
117 } 117 }
118 118
119 static void 119 static void
120 configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) 120 configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
121 { 121 {
122 /* 122 /*
123 * item's i_mutex class is already setup, so s_depth is now only 123 * item's i_mutex class is already setup, so s_depth is now only
124 * used to set new sub-directories s_depth, which is always done 124 * used to set new sub-directories s_depth, which is always done
125 * with item's i_mutex locked. 125 * with item's i_mutex locked.
126 */ 126 */
127 /* 127 /*
128 * sd->s_depth == -1 iff we are a non-default group. 128 * sd->s_depth == -1 iff we are a non-default group.
129 * else (we are a default group) sd->s_depth > 0 (see 129 * else (we are a default group) sd->s_depth > 0 (see
130 * create_dir()). 130 * create_dir()).
131 */ 131 */
132 if (sd->s_depth == -1) 132 if (sd->s_depth == -1)
133 /* 133 /*
134 * We are a non-default group and we are going to create 134 * We are a non-default group and we are going to create
135 * default groups. 135 * default groups.
136 */ 136 */
137 sd->s_depth = 0; 137 sd->s_depth = 0;
138 } 138 }
139 139
140 static void 140 static void
141 configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) 141 configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
142 { 142 {
143 /* We will not create default groups anymore. */ 143 /* We will not create default groups anymore. */
144 sd->s_depth = -1; 144 sd->s_depth = -1;
145 } 145 }
146 146
147 #else /* CONFIG_LOCKDEP */ 147 #else /* CONFIG_LOCKDEP */
148 148
149 static void configfs_init_dirent_depth(struct configfs_dirent *sd) 149 static void configfs_init_dirent_depth(struct configfs_dirent *sd)
150 { 150 {
151 } 151 }
152 152
153 static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, 153 static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
154 struct configfs_dirent *sd) 154 struct configfs_dirent *sd)
155 { 155 {
156 } 156 }
157 157
158 static void 158 static void
159 configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) 159 configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
160 { 160 {
161 } 161 }
162 162
163 static void 163 static void
164 configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) 164 configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
165 { 165 {
166 } 166 }
167 167
168 #endif /* CONFIG_LOCKDEP */ 168 #endif /* CONFIG_LOCKDEP */
169 169
170 /* 170 /*
171 * Allocates a new configfs_dirent and links it to the parent configfs_dirent 171 * Allocates a new configfs_dirent and links it to the parent configfs_dirent
172 */ 172 */
173 static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, 173 static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
174 void *element, int type) 174 void *element, int type)
175 { 175 {
176 struct configfs_dirent * sd; 176 struct configfs_dirent * sd;
177 177
178 sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); 178 sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL);
179 if (!sd) 179 if (!sd)
180 return ERR_PTR(-ENOMEM); 180 return ERR_PTR(-ENOMEM);
181 181
182 atomic_set(&sd->s_count, 1); 182 atomic_set(&sd->s_count, 1);
183 INIT_LIST_HEAD(&sd->s_links); 183 INIT_LIST_HEAD(&sd->s_links);
184 INIT_LIST_HEAD(&sd->s_children); 184 INIT_LIST_HEAD(&sd->s_children);
185 sd->s_element = element; 185 sd->s_element = element;
186 sd->s_type = type; 186 sd->s_type = type;
187 configfs_init_dirent_depth(sd); 187 configfs_init_dirent_depth(sd);
188 spin_lock(&configfs_dirent_lock); 188 spin_lock(&configfs_dirent_lock);
189 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { 189 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
190 spin_unlock(&configfs_dirent_lock); 190 spin_unlock(&configfs_dirent_lock);
191 kmem_cache_free(configfs_dir_cachep, sd); 191 kmem_cache_free(configfs_dir_cachep, sd);
192 return ERR_PTR(-ENOENT); 192 return ERR_PTR(-ENOENT);
193 } 193 }
194 list_add(&sd->s_sibling, &parent_sd->s_children); 194 list_add(&sd->s_sibling, &parent_sd->s_children);
195 spin_unlock(&configfs_dirent_lock); 195 spin_unlock(&configfs_dirent_lock);
196 196
197 return sd; 197 return sd;
198 } 198 }
199 199
200 /* 200 /*
201 * 201 *
202 * Return -EEXIST if there is already a configfs element with the same 202 * Return -EEXIST if there is already a configfs element with the same
203 * name for the same parent. 203 * name for the same parent.
204 * 204 *
205 * called with parent inode's i_mutex held 205 * called with parent inode's i_mutex held
206 */ 206 */
207 static int configfs_dirent_exists(struct configfs_dirent *parent_sd, 207 static int configfs_dirent_exists(struct configfs_dirent *parent_sd,
208 const unsigned char *new) 208 const unsigned char *new)
209 { 209 {
210 struct configfs_dirent * sd; 210 struct configfs_dirent * sd;
211 211
212 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 212 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
213 if (sd->s_element) { 213 if (sd->s_element) {
214 const unsigned char *existing = configfs_get_name(sd); 214 const unsigned char *existing = configfs_get_name(sd);
215 if (strcmp(existing, new)) 215 if (strcmp(existing, new))
216 continue; 216 continue;
217 else 217 else
218 return -EEXIST; 218 return -EEXIST;
219 } 219 }
220 } 220 }
221 221
222 return 0; 222 return 0;
223 } 223 }
224 224
225 225
226 int configfs_make_dirent(struct configfs_dirent * parent_sd, 226 int configfs_make_dirent(struct configfs_dirent * parent_sd,
227 struct dentry * dentry, void * element, 227 struct dentry * dentry, void * element,
228 umode_t mode, int type) 228 umode_t mode, int type)
229 { 229 {
230 struct configfs_dirent * sd; 230 struct configfs_dirent * sd;
231 231
232 sd = configfs_new_dirent(parent_sd, element, type); 232 sd = configfs_new_dirent(parent_sd, element, type);
233 if (IS_ERR(sd)) 233 if (IS_ERR(sd))
234 return PTR_ERR(sd); 234 return PTR_ERR(sd);
235 235
236 sd->s_mode = mode; 236 sd->s_mode = mode;
237 sd->s_dentry = dentry; 237 sd->s_dentry = dentry;
238 if (dentry) 238 if (dentry)
239 dentry->d_fsdata = configfs_get(sd); 239 dentry->d_fsdata = configfs_get(sd);
240 240
241 return 0; 241 return 0;
242 } 242 }
243 243
244 static int init_dir(struct inode * inode) 244 static int init_dir(struct inode * inode)
245 { 245 {
246 inode->i_op = &configfs_dir_inode_operations; 246 inode->i_op = &configfs_dir_inode_operations;
247 inode->i_fop = &configfs_dir_operations; 247 inode->i_fop = &configfs_dir_operations;
248 248
249 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 249 /* directory inodes start off with i_nlink == 2 (for "." entry) */
250 inc_nlink(inode); 250 inc_nlink(inode);
251 return 0; 251 return 0;
252 } 252 }
253 253
254 static int configfs_init_file(struct inode * inode) 254 static int configfs_init_file(struct inode * inode)
255 { 255 {
256 inode->i_size = PAGE_SIZE; 256 inode->i_size = PAGE_SIZE;
257 inode->i_fop = &configfs_file_operations; 257 inode->i_fop = &configfs_file_operations;
258 return 0; 258 return 0;
259 } 259 }
260 260
261 static int init_symlink(struct inode * inode) 261 static int init_symlink(struct inode * inode)
262 { 262 {
263 inode->i_op = &configfs_symlink_inode_operations; 263 inode->i_op = &configfs_symlink_inode_operations;
264 return 0; 264 return 0;
265 } 265 }
266 266
267 static int create_dir(struct config_item *k, struct dentry *d) 267 static int create_dir(struct config_item *k, struct dentry *d)
268 { 268 {
269 int error; 269 int error;
270 umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; 270 umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
271 struct dentry *p = d->d_parent; 271 struct dentry *p = d->d_parent;
272 272
273 BUG_ON(!k); 273 BUG_ON(!k);
274 274
275 error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); 275 error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
276 if (!error) 276 if (!error)
277 error = configfs_make_dirent(p->d_fsdata, d, k, mode, 277 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
278 CONFIGFS_DIR | CONFIGFS_USET_CREATING); 278 CONFIGFS_DIR | CONFIGFS_USET_CREATING);
279 if (!error) { 279 if (!error) {
280 configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata); 280 configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata);
281 error = configfs_create(d, mode, init_dir); 281 error = configfs_create(d, mode, init_dir);
282 if (!error) { 282 if (!error) {
283 inc_nlink(p->d_inode); 283 inc_nlink(p->d_inode);
284 } else { 284 } else {
285 struct configfs_dirent *sd = d->d_fsdata; 285 struct configfs_dirent *sd = d->d_fsdata;
286 if (sd) { 286 if (sd) {
287 spin_lock(&configfs_dirent_lock); 287 spin_lock(&configfs_dirent_lock);
288 list_del_init(&sd->s_sibling); 288 list_del_init(&sd->s_sibling);
289 spin_unlock(&configfs_dirent_lock); 289 spin_unlock(&configfs_dirent_lock);
290 configfs_put(sd); 290 configfs_put(sd);
291 } 291 }
292 } 292 }
293 } 293 }
294 return error; 294 return error;
295 } 295 }
296 296
297 297
298 /** 298 /**
299 * configfs_create_dir - create a directory for a config_item. 299 * configfs_create_dir - create a directory for a config_item.
300 * @item: config_item we're creating a directory for. 300 * @item: config_item we're creating a directory for.
301 * @dentry: config_item's dentry. 301 * @dentry: config_item's dentry.
302 * 302 *
303 * Note: user-created entries won't be allowed under this new directory 303 * Note: user-created entries won't be allowed under this new directory
304 * until it is validated by configfs_dir_set_ready() 304 * until it is validated by configfs_dir_set_ready()
305 */ 305 */
306 306
307 static int configfs_create_dir(struct config_item * item, struct dentry *dentry) 307 static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
308 { 308 {
309 int error = create_dir(item, dentry); 309 int error = create_dir(item, dentry);
310 if (!error) 310 if (!error)
311 item->ci_dentry = dentry; 311 item->ci_dentry = dentry;
312 return error; 312 return error;
313 } 313 }
314 314
315 /* 315 /*
316 * Allow userspace to create new entries under a new directory created with 316 * Allow userspace to create new entries under a new directory created with
317 * configfs_create_dir(), and under all of its child directories recursively. 317 * configfs_create_dir(), and under all of its child directories recursively.
318 * @sd configfs_dirent of the new directory to validate 318 * @sd configfs_dirent of the new directory to validate
319 * 319 *
320 * Caller must hold configfs_dirent_lock. 320 * Caller must hold configfs_dirent_lock.
321 */ 321 */
322 static void configfs_dir_set_ready(struct configfs_dirent *sd) 322 static void configfs_dir_set_ready(struct configfs_dirent *sd)
323 { 323 {
324 struct configfs_dirent *child_sd; 324 struct configfs_dirent *child_sd;
325 325
326 sd->s_type &= ~CONFIGFS_USET_CREATING; 326 sd->s_type &= ~CONFIGFS_USET_CREATING;
327 list_for_each_entry(child_sd, &sd->s_children, s_sibling) 327 list_for_each_entry(child_sd, &sd->s_children, s_sibling)
328 if (child_sd->s_type & CONFIGFS_USET_CREATING) 328 if (child_sd->s_type & CONFIGFS_USET_CREATING)
329 configfs_dir_set_ready(child_sd); 329 configfs_dir_set_ready(child_sd);
330 } 330 }
331 331
332 /* 332 /*
333 * Check that a directory does not belong to a directory hierarchy being 333 * Check that a directory does not belong to a directory hierarchy being
334 * attached and not yet validated. 334 * attached and not yet validated.
335 * @sd configfs_dirent of the directory to check 335 * @sd configfs_dirent of the directory to check
336 * 336 *
337 * @return non-zero iff the directory was validated 337 * @return non-zero iff the directory was validated
338 * 338 *
339 * Note: takes configfs_dirent_lock, so the result may change from false to true 339 * Note: takes configfs_dirent_lock, so the result may change from false to true
340 * in two consecutive calls, but never from true to false. 340 * in two consecutive calls, but never from true to false.
341 */ 341 */
342 int configfs_dirent_is_ready(struct configfs_dirent *sd) 342 int configfs_dirent_is_ready(struct configfs_dirent *sd)
343 { 343 {
344 int ret; 344 int ret;
345 345
346 spin_lock(&configfs_dirent_lock); 346 spin_lock(&configfs_dirent_lock);
347 ret = !(sd->s_type & CONFIGFS_USET_CREATING); 347 ret = !(sd->s_type & CONFIGFS_USET_CREATING);
348 spin_unlock(&configfs_dirent_lock); 348 spin_unlock(&configfs_dirent_lock);
349 349
350 return ret; 350 return ret;
351 } 351 }
352 352
353 int configfs_create_link(struct configfs_symlink *sl, 353 int configfs_create_link(struct configfs_symlink *sl,
354 struct dentry *parent, 354 struct dentry *parent,
355 struct dentry *dentry) 355 struct dentry *dentry)
356 { 356 {
357 int err = 0; 357 int err = 0;
358 umode_t mode = S_IFLNK | S_IRWXUGO; 358 umode_t mode = S_IFLNK | S_IRWXUGO;
359 359
360 err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, 360 err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
361 CONFIGFS_ITEM_LINK); 361 CONFIGFS_ITEM_LINK);
362 if (!err) { 362 if (!err) {
363 err = configfs_create(dentry, mode, init_symlink); 363 err = configfs_create(dentry, mode, init_symlink);
364 if (err) { 364 if (err) {
365 struct configfs_dirent *sd = dentry->d_fsdata; 365 struct configfs_dirent *sd = dentry->d_fsdata;
366 if (sd) { 366 if (sd) {
367 spin_lock(&configfs_dirent_lock); 367 spin_lock(&configfs_dirent_lock);
368 list_del_init(&sd->s_sibling); 368 list_del_init(&sd->s_sibling);
369 spin_unlock(&configfs_dirent_lock); 369 spin_unlock(&configfs_dirent_lock);
370 configfs_put(sd); 370 configfs_put(sd);
371 } 371 }
372 } 372 }
373 } 373 }
374 return err; 374 return err;
375 } 375 }
376 376
377 static void remove_dir(struct dentry * d) 377 static void remove_dir(struct dentry * d)
378 { 378 {
379 struct dentry * parent = dget(d->d_parent); 379 struct dentry * parent = dget(d->d_parent);
380 struct configfs_dirent * sd; 380 struct configfs_dirent * sd;
381 381
382 sd = d->d_fsdata; 382 sd = d->d_fsdata;
383 spin_lock(&configfs_dirent_lock); 383 spin_lock(&configfs_dirent_lock);
384 list_del_init(&sd->s_sibling); 384 list_del_init(&sd->s_sibling);
385 spin_unlock(&configfs_dirent_lock); 385 spin_unlock(&configfs_dirent_lock);
386 configfs_put(sd); 386 configfs_put(sd);
387 if (d->d_inode) 387 if (d->d_inode)
388 simple_rmdir(parent->d_inode, d); 388 simple_rmdir(parent->d_inode, d);
389 389
390 pr_debug(" o %s removing done (%d)\n", d->d_name.name, d->d_count); 390 pr_debug(" o %s removing done (%d)\n", d->d_name.name, d->d_count);
391 391
392 dput(parent); 392 dput(parent);
393 } 393 }
394 394
395 /** 395 /**
396 * configfs_remove_dir - remove a config_item's directory. 396 * configfs_remove_dir - remove a config_item's directory.
397 * @item: config_item we're removing. 397 * @item: config_item we're removing.
398 * 398 *
399 * The only thing special about this is that we remove any files in 399 * The only thing special about this is that we remove any files in
400 * the directory before we remove the directory, and we've inlined 400 * the directory before we remove the directory, and we've inlined
401 * what used to be configfs_rmdir() below, instead of calling separately. 401 * what used to be configfs_rmdir() below, instead of calling separately.
402 * 402 *
403 * Caller holds the mutex of the item's inode 403 * Caller holds the mutex of the item's inode
404 */ 404 */
405 405
406 static void configfs_remove_dir(struct config_item * item) 406 static void configfs_remove_dir(struct config_item * item)
407 { 407 {
408 struct dentry * dentry = dget(item->ci_dentry); 408 struct dentry * dentry = dget(item->ci_dentry);
409 409
410 if (!dentry) 410 if (!dentry)
411 return; 411 return;
412 412
413 remove_dir(dentry); 413 remove_dir(dentry);
414 /** 414 /**
415 * Drop reference from dget() on entrance. 415 * Drop reference from dget() on entrance.
416 */ 416 */
417 dput(dentry); 417 dput(dentry);
418 } 418 }
419 419
420 420
421 /* attaches attribute's configfs_dirent to the dentry corresponding to the 421 /* attaches attribute's configfs_dirent to the dentry corresponding to the
422 * attribute file 422 * attribute file
423 */ 423 */
424 static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) 424 static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
425 { 425 {
426 struct configfs_attribute * attr = sd->s_element; 426 struct configfs_attribute * attr = sd->s_element;
427 int error; 427 int error;
428 428
429 dentry->d_fsdata = configfs_get(sd); 429 dentry->d_fsdata = configfs_get(sd);
430 sd->s_dentry = dentry; 430 sd->s_dentry = dentry;
431 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, 431 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
432 configfs_init_file); 432 configfs_init_file);
433 if (error) { 433 if (error) {
434 configfs_put(sd); 434 configfs_put(sd);
435 return error; 435 return error;
436 } 436 }
437 437
438 d_rehash(dentry); 438 d_rehash(dentry);
439 439
440 return 0; 440 return 0;
441 } 441 }
442 442
443 static struct dentry * configfs_lookup(struct inode *dir, 443 static struct dentry * configfs_lookup(struct inode *dir,
444 struct dentry *dentry, 444 struct dentry *dentry,
445 unsigned int flags) 445 unsigned int flags)
446 { 446 {
447 struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; 447 struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
448 struct configfs_dirent * sd; 448 struct configfs_dirent * sd;
449 int found = 0; 449 int found = 0;
450 int err; 450 int err;
451 451
452 /* 452 /*
453 * Fake invisibility if dir belongs to a group/default groups hierarchy 453 * Fake invisibility if dir belongs to a group/default groups hierarchy
454 * being attached 454 * being attached
455 * 455 *
456 * This prevents userspace from reading/writing attributes of items which 456 * This prevents userspace from reading/writing attributes of items which
457 * may not have completed their initialization, since the dentries of the 457 * may not have completed their initialization, since the dentries of the
458 * attributes won't be instantiated. 458 * attributes won't be instantiated.
459 */ 459 */
460 err = -ENOENT; 460 err = -ENOENT;
461 if (!configfs_dirent_is_ready(parent_sd)) 461 if (!configfs_dirent_is_ready(parent_sd))
462 goto out; 462 goto out;
463 463
464 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 464 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
465 if (sd->s_type & CONFIGFS_NOT_PINNED) { 465 if (sd->s_type & CONFIGFS_NOT_PINNED) {
466 const unsigned char * name = configfs_get_name(sd); 466 const unsigned char * name = configfs_get_name(sd);
467 467
468 if (strcmp(name, dentry->d_name.name)) 468 if (strcmp(name, dentry->d_name.name))
469 continue; 469 continue;
470 470
471 found = 1; 471 found = 1;
472 err = configfs_attach_attr(sd, dentry); 472 err = configfs_attach_attr(sd, dentry);
473 break; 473 break;
474 } 474 }
475 } 475 }
476 476
477 if (!found) { 477 if (!found) {
478 /* 478 /*
479 * If it doesn't exist and it isn't a NOT_PINNED item, 479 * If it doesn't exist and it isn't a NOT_PINNED item,
480 * it must be negative. 480 * it must be negative.
481 */ 481 */
482 if (dentry->d_name.len > NAME_MAX) 482 if (dentry->d_name.len > NAME_MAX)
483 return ERR_PTR(-ENAMETOOLONG); 483 return ERR_PTR(-ENAMETOOLONG);
484 d_add(dentry, NULL); 484 d_add(dentry, NULL);
485 return NULL; 485 return NULL;
486 } 486 }
487 487
488 out: 488 out:
489 return ERR_PTR(err); 489 return ERR_PTR(err);
490 } 490 }
491 491
492 /* 492 /*
493 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are 493 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
494 * attributes and are removed by rmdir(). We recurse, setting 494 * attributes and are removed by rmdir(). We recurse, setting
495 * CONFIGFS_USET_DROPPING on all children that are candidates for 495 * CONFIGFS_USET_DROPPING on all children that are candidates for
496 * default detach. 496 * default detach.
497 * If there is an error, the caller will reset the flags via 497 * If there is an error, the caller will reset the flags via
498 * configfs_detach_rollback(). 498 * configfs_detach_rollback().
499 */ 499 */
500 static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) 500 static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
501 { 501 {
502 struct configfs_dirent *parent_sd = dentry->d_fsdata; 502 struct configfs_dirent *parent_sd = dentry->d_fsdata;
503 struct configfs_dirent *sd; 503 struct configfs_dirent *sd;
504 int ret; 504 int ret;
505 505
506 /* Mark that we're trying to drop the group */ 506 /* Mark that we're trying to drop the group */
507 parent_sd->s_type |= CONFIGFS_USET_DROPPING; 507 parent_sd->s_type |= CONFIGFS_USET_DROPPING;
508 508
509 ret = -EBUSY; 509 ret = -EBUSY;
510 if (!list_empty(&parent_sd->s_links)) 510 if (!list_empty(&parent_sd->s_links))
511 goto out; 511 goto out;
512 512
513 ret = 0; 513 ret = 0;
514 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 514 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
515 if (!sd->s_element || 515 if (!sd->s_element ||
516 (sd->s_type & CONFIGFS_NOT_PINNED)) 516 (sd->s_type & CONFIGFS_NOT_PINNED))
517 continue; 517 continue;
518 if (sd->s_type & CONFIGFS_USET_DEFAULT) { 518 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
519 /* Abort if racing with mkdir() */ 519 /* Abort if racing with mkdir() */
520 if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { 520 if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
521 if (wait_mutex) 521 if (wait_mutex)
522 *wait_mutex = &sd->s_dentry->d_inode->i_mutex; 522 *wait_mutex = &sd->s_dentry->d_inode->i_mutex;
523 return -EAGAIN; 523 return -EAGAIN;
524 } 524 }
525 525
526 /* 526 /*
527 * Yup, recursive. If there's a problem, blame 527 * Yup, recursive. If there's a problem, blame
528 * deep nesting of default_groups 528 * deep nesting of default_groups
529 */ 529 */
530 ret = configfs_detach_prep(sd->s_dentry, wait_mutex); 530 ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
531 if (!ret) 531 if (!ret)
532 continue; 532 continue;
533 } else 533 } else
534 ret = -ENOTEMPTY; 534 ret = -ENOTEMPTY;
535 535
536 break; 536 break;
537 } 537 }
538 538
539 out: 539 out:
540 return ret; 540 return ret;
541 } 541 }
542 542
543 /* 543 /*
544 * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was 544 * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was
545 * set. 545 * set.
546 */ 546 */
547 static void configfs_detach_rollback(struct dentry *dentry) 547 static void configfs_detach_rollback(struct dentry *dentry)
548 { 548 {
549 struct configfs_dirent *parent_sd = dentry->d_fsdata; 549 struct configfs_dirent *parent_sd = dentry->d_fsdata;
550 struct configfs_dirent *sd; 550 struct configfs_dirent *sd;
551 551
552 parent_sd->s_type &= ~CONFIGFS_USET_DROPPING; 552 parent_sd->s_type &= ~CONFIGFS_USET_DROPPING;
553 553
554 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) 554 list_for_each_entry(sd, &parent_sd->s_children, s_sibling)
555 if (sd->s_type & CONFIGFS_USET_DEFAULT) 555 if (sd->s_type & CONFIGFS_USET_DEFAULT)
556 configfs_detach_rollback(sd->s_dentry); 556 configfs_detach_rollback(sd->s_dentry);
557 } 557 }
558 558
559 static void detach_attrs(struct config_item * item) 559 static void detach_attrs(struct config_item * item)
560 { 560 {
561 struct dentry * dentry = dget(item->ci_dentry); 561 struct dentry * dentry = dget(item->ci_dentry);
562 struct configfs_dirent * parent_sd; 562 struct configfs_dirent * parent_sd;
563 struct configfs_dirent * sd, * tmp; 563 struct configfs_dirent * sd, * tmp;
564 564
565 if (!dentry) 565 if (!dentry)
566 return; 566 return;
567 567
568 pr_debug("configfs %s: dropping attrs for dir\n", 568 pr_debug("configfs %s: dropping attrs for dir\n",
569 dentry->d_name.name); 569 dentry->d_name.name);
570 570
571 parent_sd = dentry->d_fsdata; 571 parent_sd = dentry->d_fsdata;
572 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { 572 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
573 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) 573 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
574 continue; 574 continue;
575 spin_lock(&configfs_dirent_lock); 575 spin_lock(&configfs_dirent_lock);
576 list_del_init(&sd->s_sibling); 576 list_del_init(&sd->s_sibling);
577 spin_unlock(&configfs_dirent_lock); 577 spin_unlock(&configfs_dirent_lock);
578 configfs_drop_dentry(sd, dentry); 578 configfs_drop_dentry(sd, dentry);
579 configfs_put(sd); 579 configfs_put(sd);
580 } 580 }
581 581
582 /** 582 /**
583 * Drop reference from dget() on entrance. 583 * Drop reference from dget() on entrance.
584 */ 584 */
585 dput(dentry); 585 dput(dentry);
586 } 586 }
587 587
588 static int populate_attrs(struct config_item *item) 588 static int populate_attrs(struct config_item *item)
589 { 589 {
590 struct config_item_type *t = item->ci_type; 590 struct config_item_type *t = item->ci_type;
591 struct configfs_attribute *attr; 591 struct configfs_attribute *attr;
592 int error = 0; 592 int error = 0;
593 int i; 593 int i;
594 594
595 if (!t) 595 if (!t)
596 return -EINVAL; 596 return -EINVAL;
597 if (t->ct_attrs) { 597 if (t->ct_attrs) {
598 for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { 598 for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
599 if ((error = configfs_create_file(item, attr))) 599 if ((error = configfs_create_file(item, attr)))
600 break; 600 break;
601 } 601 }
602 } 602 }
603 603
604 if (error) 604 if (error)
605 detach_attrs(item); 605 detach_attrs(item);
606 606
607 return error; 607 return error;
608 } 608 }
609 609
610 static int configfs_attach_group(struct config_item *parent_item, 610 static int configfs_attach_group(struct config_item *parent_item,
611 struct config_item *item, 611 struct config_item *item,
612 struct dentry *dentry); 612 struct dentry *dentry);
613 static void configfs_detach_group(struct config_item *item); 613 static void configfs_detach_group(struct config_item *item);
614 614
615 static void detach_groups(struct config_group *group) 615 static void detach_groups(struct config_group *group)
616 { 616 {
617 struct dentry * dentry = dget(group->cg_item.ci_dentry); 617 struct dentry * dentry = dget(group->cg_item.ci_dentry);
618 struct dentry *child; 618 struct dentry *child;
619 struct configfs_dirent *parent_sd; 619 struct configfs_dirent *parent_sd;
620 struct configfs_dirent *sd, *tmp; 620 struct configfs_dirent *sd, *tmp;
621 621
622 if (!dentry) 622 if (!dentry)
623 return; 623 return;
624 624
625 parent_sd = dentry->d_fsdata; 625 parent_sd = dentry->d_fsdata;
626 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { 626 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
627 if (!sd->s_element || 627 if (!sd->s_element ||
628 !(sd->s_type & CONFIGFS_USET_DEFAULT)) 628 !(sd->s_type & CONFIGFS_USET_DEFAULT))
629 continue; 629 continue;
630 630
631 child = sd->s_dentry; 631 child = sd->s_dentry;
632 632
633 mutex_lock(&child->d_inode->i_mutex); 633 mutex_lock(&child->d_inode->i_mutex);
634 634
635 configfs_detach_group(sd->s_element); 635 configfs_detach_group(sd->s_element);
636 child->d_inode->i_flags |= S_DEAD; 636 child->d_inode->i_flags |= S_DEAD;
637 dont_mount(child); 637 dont_mount(child);
638 638
639 mutex_unlock(&child->d_inode->i_mutex); 639 mutex_unlock(&child->d_inode->i_mutex);
640 640
641 d_delete(child); 641 d_delete(child);
642 dput(child); 642 dput(child);
643 } 643 }
644 644
645 /** 645 /**
646 * Drop reference from dget() on entrance. 646 * Drop reference from dget() on entrance.
647 */ 647 */
648 dput(dentry); 648 dput(dentry);
649 } 649 }
650 650
651 /* 651 /*
652 * This fakes mkdir(2) on a default_groups[] entry. It 652 * This fakes mkdir(2) on a default_groups[] entry. It
653 * creates a dentry, attaches it, and then does fixup 653 * creates a dentry, attaches it, and then does fixup
654 * on the sd->s_type. 654 * on the sd->s_type.
655 * 655 *
656 * We could, perhaps, tweak our parent's ->mkdir for a minute and 656 * We could, perhaps, tweak our parent's ->mkdir for a minute and
657 * try using vfs_mkdir. Just a thought. 657 * try using vfs_mkdir. Just a thought.
658 */ 658 */
659 static int create_default_group(struct config_group *parent_group, 659 static int create_default_group(struct config_group *parent_group,
660 struct config_group *group) 660 struct config_group *group)
661 { 661 {
662 int ret; 662 int ret;
663 struct qstr name; 663 struct qstr name;
664 struct configfs_dirent *sd; 664 struct configfs_dirent *sd;
665 /* We trust the caller holds a reference to parent */ 665 /* We trust the caller holds a reference to parent */
666 struct dentry *child, *parent = parent_group->cg_item.ci_dentry; 666 struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
667 667
668 if (!group->cg_item.ci_name) 668 if (!group->cg_item.ci_name)
669 group->cg_item.ci_name = group->cg_item.ci_namebuf; 669 group->cg_item.ci_name = group->cg_item.ci_namebuf;
670 name.name = group->cg_item.ci_name; 670 name.name = group->cg_item.ci_name;
671 name.len = strlen(name.name); 671 name.len = strlen(name.name);
672 name.hash = full_name_hash(name.name, name.len); 672 name.hash = full_name_hash(name.name, name.len);
673 673
674 ret = -ENOMEM; 674 ret = -ENOMEM;
675 child = d_alloc(parent, &name); 675 child = d_alloc(parent, &name);
676 if (child) { 676 if (child) {
677 d_add(child, NULL); 677 d_add(child, NULL);
678 678
679 ret = configfs_attach_group(&parent_group->cg_item, 679 ret = configfs_attach_group(&parent_group->cg_item,
680 &group->cg_item, child); 680 &group->cg_item, child);
681 if (!ret) { 681 if (!ret) {
682 sd = child->d_fsdata; 682 sd = child->d_fsdata;
683 sd->s_type |= CONFIGFS_USET_DEFAULT; 683 sd->s_type |= CONFIGFS_USET_DEFAULT;
684 } else { 684 } else {
685 BUG_ON(child->d_inode); 685 BUG_ON(child->d_inode);
686 d_drop(child); 686 d_drop(child);
687 dput(child); 687 dput(child);
688 } 688 }
689 } 689 }
690 690
691 return ret; 691 return ret;
692 } 692 }
693 693
694 static int populate_groups(struct config_group *group) 694 static int populate_groups(struct config_group *group)
695 { 695 {
696 struct config_group *new_group; 696 struct config_group *new_group;
697 int ret = 0; 697 int ret = 0;
698 int i; 698 int i;
699 699
700 if (group->default_groups) { 700 if (group->default_groups) {
701 for (i = 0; group->default_groups[i]; i++) { 701 for (i = 0; group->default_groups[i]; i++) {
702 new_group = group->default_groups[i]; 702 new_group = group->default_groups[i];
703 703
704 ret = create_default_group(group, new_group); 704 ret = create_default_group(group, new_group);
705 if (ret) { 705 if (ret) {
706 detach_groups(group); 706 detach_groups(group);
707 break; 707 break;
708 } 708 }
709 } 709 }
710 } 710 }
711 711
712 return ret; 712 return ret;
713 } 713 }
714 714
715 /* 715 /*
716 * All of link_obj/unlink_obj/link_group/unlink_group require that 716 * All of link_obj/unlink_obj/link_group/unlink_group require that
717 * subsys->su_mutex is held. 717 * subsys->su_mutex is held.
718 */ 718 */
719 719
720 static void unlink_obj(struct config_item *item) 720 static void unlink_obj(struct config_item *item)
721 { 721 {
722 struct config_group *group; 722 struct config_group *group;
723 723
724 group = item->ci_group; 724 group = item->ci_group;
725 if (group) { 725 if (group) {
726 list_del_init(&item->ci_entry); 726 list_del_init(&item->ci_entry);
727 727
728 item->ci_group = NULL; 728 item->ci_group = NULL;
729 item->ci_parent = NULL; 729 item->ci_parent = NULL;
730 730
731 /* Drop the reference for ci_entry */ 731 /* Drop the reference for ci_entry */
732 config_item_put(item); 732 config_item_put(item);
733 733
734 /* Drop the reference for ci_parent */ 734 /* Drop the reference for ci_parent */
735 config_group_put(group); 735 config_group_put(group);
736 } 736 }
737 } 737 }
738 738
739 static void link_obj(struct config_item *parent_item, struct config_item *item) 739 static void link_obj(struct config_item *parent_item, struct config_item *item)
740 { 740 {
741 /* 741 /*
742 * Parent seems redundant with group, but it makes certain 742 * Parent seems redundant with group, but it makes certain
743 * traversals much nicer. 743 * traversals much nicer.
744 */ 744 */
745 item->ci_parent = parent_item; 745 item->ci_parent = parent_item;
746 746
747 /* 747 /*
748 * We hold a reference on the parent for the child's ci_parent 748 * We hold a reference on the parent for the child's ci_parent
749 * link. 749 * link.
750 */ 750 */
751 item->ci_group = config_group_get(to_config_group(parent_item)); 751 item->ci_group = config_group_get(to_config_group(parent_item));
752 list_add_tail(&item->ci_entry, &item->ci_group->cg_children); 752 list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
753 753
754 /* 754 /*
755 * We hold a reference on the child for ci_entry on the parent's 755 * We hold a reference on the child for ci_entry on the parent's
756 * cg_children 756 * cg_children
757 */ 757 */
758 config_item_get(item); 758 config_item_get(item);
759 } 759 }
760 760
761 static void unlink_group(struct config_group *group) 761 static void unlink_group(struct config_group *group)
762 { 762 {
763 int i; 763 int i;
764 struct config_group *new_group; 764 struct config_group *new_group;
765 765
766 if (group->default_groups) { 766 if (group->default_groups) {
767 for (i = 0; group->default_groups[i]; i++) { 767 for (i = 0; group->default_groups[i]; i++) {
768 new_group = group->default_groups[i]; 768 new_group = group->default_groups[i];
769 unlink_group(new_group); 769 unlink_group(new_group);
770 } 770 }
771 } 771 }
772 772
773 group->cg_subsys = NULL; 773 group->cg_subsys = NULL;
774 unlink_obj(&group->cg_item); 774 unlink_obj(&group->cg_item);
775 } 775 }
776 776
777 static void link_group(struct config_group *parent_group, struct config_group *group) 777 static void link_group(struct config_group *parent_group, struct config_group *group)
778 { 778 {
779 int i; 779 int i;
780 struct config_group *new_group; 780 struct config_group *new_group;
781 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ 781 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
782 782
783 link_obj(&parent_group->cg_item, &group->cg_item); 783 link_obj(&parent_group->cg_item, &group->cg_item);
784 784
785 if (parent_group->cg_subsys) 785 if (parent_group->cg_subsys)
786 subsys = parent_group->cg_subsys; 786 subsys = parent_group->cg_subsys;
787 else if (configfs_is_root(&parent_group->cg_item)) 787 else if (configfs_is_root(&parent_group->cg_item))
788 subsys = to_configfs_subsystem(group); 788 subsys = to_configfs_subsystem(group);
789 else 789 else
790 BUG(); 790 BUG();
791 group->cg_subsys = subsys; 791 group->cg_subsys = subsys;
792 792
793 if (group->default_groups) { 793 if (group->default_groups) {
794 for (i = 0; group->default_groups[i]; i++) { 794 for (i = 0; group->default_groups[i]; i++) {
795 new_group = group->default_groups[i]; 795 new_group = group->default_groups[i];
796 link_group(group, new_group); 796 link_group(group, new_group);
797 } 797 }
798 } 798 }
799 } 799 }
800 800
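A hypothetical caller obeying the su_mutex rule stated before these four helpers; the real mkdir(2)/rmdir(2) paths do the equivalent:

/* Illustrative only: hold the subsystem mutex across linkage changes. */
static void example_link_locked(struct configfs_subsystem *subsys,
				struct config_group *parent,
				struct config_group *child)
{
	mutex_lock(&subsys->su_mutex);
	link_group(parent, child);
	mutex_unlock(&subsys->su_mutex);
}
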
801 /* 801 /*
802 * The goal is that configfs_attach_item() (and 802 * The goal is that configfs_attach_item() (and
803 * configfs_attach_group()) can be called from either the VFS or this 803 * configfs_attach_group()) can be called from either the VFS or this
804 * module. That is, they assume that the items have been created, 804 * module. That is, they assume that the items have been created,
805 * the dentry allocated, and the dcache is all ready to go. 805 * the dentry allocated, and the dcache is all ready to go.
806 * 806 *
807 * If they fail, they must clean up after themselves as if they 807 * If they fail, they must clean up after themselves as if they
808 * had never been called. The caller (VFS or local function) will 808 * had never been called. The caller (VFS or local function) will
809 * handle cleaning up the dcache bits. 809 * handle cleaning up the dcache bits.
810 * 810 *
811 * configfs_detach_group() and configfs_detach_item() behave similarly on 811 * configfs_detach_group() and configfs_detach_item() behave similarly on
812 * the way out. They assume that the proper semaphores are held, they 812 * the way out. They assume that the proper semaphores are held, they
813 * clean up the configfs items, and they expect their callers will 813 * clean up the configfs items, and they expect their callers will
814 * handle the dcache bits. 814 * handle the dcache bits.
815 */ 815 */
816 static int configfs_attach_item(struct config_item *parent_item, 816 static int configfs_attach_item(struct config_item *parent_item,
817 struct config_item *item, 817 struct config_item *item,
818 struct dentry *dentry) 818 struct dentry *dentry)
819 { 819 {
820 int ret; 820 int ret;
821 821
822 ret = configfs_create_dir(item, dentry); 822 ret = configfs_create_dir(item, dentry);
823 if (!ret) { 823 if (!ret) {
824 ret = populate_attrs(item); 824 ret = populate_attrs(item);
825 if (ret) { 825 if (ret) {
826 /* 826 /*
827 * We are going to remove an inode and its dentry but 827 * We are going to remove an inode and its dentry but
828 * the VFS may already have hit and used them. Thus, 828 * the VFS may already have hit and used them. Thus,
829 * we must lock them as rmdir() would. 829 * we must lock them as rmdir() would.
830 */ 830 */
831 mutex_lock(&dentry->d_inode->i_mutex); 831 mutex_lock(&dentry->d_inode->i_mutex);
832 configfs_remove_dir(item); 832 configfs_remove_dir(item);
833 dentry->d_inode->i_flags |= S_DEAD; 833 dentry->d_inode->i_flags |= S_DEAD;
834 dont_mount(dentry); 834 dont_mount(dentry);
835 mutex_unlock(&dentry->d_inode->i_mutex); 835 mutex_unlock(&dentry->d_inode->i_mutex);
836 d_delete(dentry); 836 d_delete(dentry);
837 } 837 }
838 } 838 }
839 839
840 return ret; 840 return ret;
841 } 841 }
842 842
843 /* Caller holds the mutex of the item's inode */ 843 /* Caller holds the mutex of the item's inode */
844 static void configfs_detach_item(struct config_item *item) 844 static void configfs_detach_item(struct config_item *item)
845 { 845 {
846 detach_attrs(item); 846 detach_attrs(item);
847 configfs_remove_dir(item); 847 configfs_remove_dir(item);
848 } 848 }
849 849
850 static int configfs_attach_group(struct config_item *parent_item, 850 static int configfs_attach_group(struct config_item *parent_item,
851 struct config_item *item, 851 struct config_item *item,
852 struct dentry *dentry) 852 struct dentry *dentry)
853 { 853 {
854 int ret; 854 int ret;
855 struct configfs_dirent *sd; 855 struct configfs_dirent *sd;
856 856
857 ret = configfs_attach_item(parent_item, item, dentry); 857 ret = configfs_attach_item(parent_item, item, dentry);
858 if (!ret) { 858 if (!ret) {
859 sd = dentry->d_fsdata; 859 sd = dentry->d_fsdata;
860 sd->s_type |= CONFIGFS_USET_DIR; 860 sd->s_type |= CONFIGFS_USET_DIR;
861 861
862 /* 862 /*
863 * FYI, we're faking mkdir in populate_groups() 863 * FYI, we're faking mkdir in populate_groups()
864 * We must lock the group's inode to avoid races with the VFS 864 * We must lock the group's inode to avoid races with the VFS
865 * which can already hit the inode and try to add/remove entries 865 * which can already hit the inode and try to add/remove entries
866 * under it. 866 * under it.
867 * 867 *
868 * We must also lock the inode to remove it safely in case of 868 * We must also lock the inode to remove it safely in case of
869 * error, as rmdir() would. 869 * error, as rmdir() would.
870 */ 870 */
871 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 871 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
872 configfs_adjust_dir_dirent_depth_before_populate(sd); 872 configfs_adjust_dir_dirent_depth_before_populate(sd);
873 ret = populate_groups(to_config_group(item)); 873 ret = populate_groups(to_config_group(item));
874 if (ret) { 874 if (ret) {
875 configfs_detach_item(item); 875 configfs_detach_item(item);
876 dentry->d_inode->i_flags |= S_DEAD; 876 dentry->d_inode->i_flags |= S_DEAD;
877 dont_mount(dentry); 877 dont_mount(dentry);
878 } 878 }
879 configfs_adjust_dir_dirent_depth_after_populate(sd); 879 configfs_adjust_dir_dirent_depth_after_populate(sd);
880 mutex_unlock(&dentry->d_inode->i_mutex); 880 mutex_unlock(&dentry->d_inode->i_mutex);
881 if (ret) 881 if (ret)
882 d_delete(dentry); 882 d_delete(dentry);
883 } 883 }
884 884
885 return ret; 885 return ret;
886 } 886 }
887 887
888 /* Caller holds the mutex of the group's inode */ 888 /* Caller holds the mutex of the group's inode */
889 static void configfs_detach_group(struct config_item *item) 889 static void configfs_detach_group(struct config_item *item)
890 { 890 {
891 detach_groups(to_config_group(item)); 891 detach_groups(to_config_group(item));
892 configfs_detach_item(item); 892 configfs_detach_item(item);
893 } 893 }
894 894
895 /* 895 /*
896 * After the item has been detached from the filesystem view, we are 896 * After the item has been detached from the filesystem view, we are
897 * ready to tear it out of the hierarchy. Notify the client before 897 * ready to tear it out of the hierarchy. Notify the client before
898 * we do that so they can perform any cleanup that requires 898 * we do that so they can perform any cleanup that requires
899 * navigating the hierarchy. A client does not need to provide this 899 * navigating the hierarchy. A client does not need to provide this
900 * callback. The subsystem semaphore MUST be held by the caller, and 900 * callback. The subsystem semaphore MUST be held by the caller, and
901 * references must be valid for both items. It also assumes the 901 * references must be valid for both items. It also assumes the
902 * caller has validated ci_type. 902 * caller has validated ci_type.
903 */ 903 */
904 static void client_disconnect_notify(struct config_item *parent_item, 904 static void client_disconnect_notify(struct config_item *parent_item,
905 struct config_item *item) 905 struct config_item *item)
906 { 906 {
907 struct config_item_type *type; 907 struct config_item_type *type;
908 908
909 type = parent_item->ci_type; 909 type = parent_item->ci_type;
910 BUG_ON(!type); 910 BUG_ON(!type);
911 911
912 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) 912 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
913 type->ct_group_ops->disconnect_notify(to_config_group(parent_item), 913 type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
914 item); 914 item);
915 } 915 }
916 916
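From the client's side, the callback invoked above arrives through struct configfs_group_operations. A minimal hypothetical provider, with an illustrative handler body:

static void example_disconnect_notify(struct config_group *group,
				      struct config_item *item)
{
	/* Last chance to navigate the hierarchy before 'item' is torn out. */
	pr_debug("configfs: %s detaching from %s\n",
		 config_item_name(item), config_item_name(&group->cg_item));
}

static struct configfs_group_operations example_group_ops = {
	.disconnect_notify	= example_disconnect_notify,
	/* if .drop_item is set, it must do the final config_item_put() */
};
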
917 /* 917 /*
918 * Drop the initial reference from make_item()/make_group() 918 * Drop the initial reference from make_item()/make_group()
919 * This function assumes that reference is held on item 919 * This function assumes that reference is held on item
920 * and that item holds a valid reference to the parent. Also, it 920 * and that item holds a valid reference to the parent. Also, it
921 * assumes the caller has validated ci_type. 921 * assumes the caller has validated ci_type.
922 */ 922 */
923 static void client_drop_item(struct config_item *parent_item, 923 static void client_drop_item(struct config_item *parent_item,
924 struct config_item *item) 924 struct config_item *item)
925 { 925 {
926 struct config_item_type *type; 926 struct config_item_type *type;
927 927
928 type = parent_item->ci_type; 928 type = parent_item->ci_type;
929 BUG_ON(!type); 929 BUG_ON(!type);
930 930
931 /* 931 /*
932 * If ->drop_item() exists, it is responsible for the 932 * If ->drop_item() exists, it is responsible for the
933 * config_item_put(). 933 * config_item_put().
934 */ 934 */
935 if (type->ct_group_ops && type->ct_group_ops->drop_item) 935 if (type->ct_group_ops && type->ct_group_ops->drop_item)
936 type->ct_group_ops->drop_item(to_config_group(parent_item), 936 type->ct_group_ops->drop_item(to_config_group(parent_item),
937 item); 937 item);
938 else 938 else
939 config_item_put(item); 939 config_item_put(item);
940 } 940 }
941 941
942 #ifdef DEBUG 942 #ifdef DEBUG
943 static void configfs_dump_one(struct configfs_dirent *sd, int level) 943 static void configfs_dump_one(struct configfs_dirent *sd, int level)
944 { 944 {
945 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); 945 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
946 946
947 #define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); 947 #define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
948 type_print(CONFIGFS_ROOT); 948 type_print(CONFIGFS_ROOT);
949 type_print(CONFIGFS_DIR); 949 type_print(CONFIGFS_DIR);
950 type_print(CONFIGFS_ITEM_ATTR); 950 type_print(CONFIGFS_ITEM_ATTR);
951 type_print(CONFIGFS_ITEM_LINK); 951 type_print(CONFIGFS_ITEM_LINK);
952 type_print(CONFIGFS_USET_DIR); 952 type_print(CONFIGFS_USET_DIR);
953 type_print(CONFIGFS_USET_DEFAULT); 953 type_print(CONFIGFS_USET_DEFAULT);
954 type_print(CONFIGFS_USET_DROPPING); 954 type_print(CONFIGFS_USET_DROPPING);
955 #undef type_print 955 #undef type_print
956 } 956 }
957 957
958 static int configfs_dump(struct configfs_dirent *sd, int level) 958 static int configfs_dump(struct configfs_dirent *sd, int level)
959 { 959 {
960 struct configfs_dirent *child_sd; 960 struct configfs_dirent *child_sd;
961 int ret = 0; 961 int ret = 0;
962 962
963 configfs_dump_one(sd, level); 963 configfs_dump_one(sd, level);
964 964
965 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT))) 965 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
966 return 0; 966 return 0;
967 967
968 list_for_each_entry(child_sd, &sd->s_children, s_sibling) { 968 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
969 ret = configfs_dump(child_sd, level + 2); 969 ret = configfs_dump(child_sd, level + 2);
970 if (ret) 970 if (ret)
971 break; 971 break;
972 } 972 }
973 973
974 return ret; 974 return ret;
975 } 975 }
976 #endif 976 #endif
977 977
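Nothing in this file calls the DEBUG-only dump helpers; a plausible hook, assuming the caller wants the child lists stable while walking them, might be:

/* Hypothetical debugging hook: dump the tree rooted at 'dentry'. */
static void example_dump_tree(struct dentry *dentry)
{
	spin_lock(&configfs_dirent_lock);
	configfs_dump(dentry->d_fsdata, 0);
	spin_unlock(&configfs_dirent_lock);
}
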
978 978
979 /* 979 /*
980 * configfs_depend_item() and configfs_undepend_item() 980 * configfs_depend_item() and configfs_undepend_item()
981 * 981 *
982 * WARNING: Do not call these from a configfs callback! 982 * WARNING: Do not call these from a configfs callback!
983 * 983 *
984 * This describes these functions and their helpers. 984 * This describes these functions and their helpers.
985 * 985 *
986 * Allow another kernel system to depend on a config_item. If this 986 * Allow another kernel system to depend on a config_item. If this
987 * happens, the item cannot go away until the dependent can live without 987 * happens, the item cannot go away until the dependent can live without
988 * it. The idea is to give client modules as simple an interface as 988 * it. The idea is to give client modules as simple an interface as
989 * possible. When a system asks them to depend on an item, they just 989 * possible. When a system asks them to depend on an item, they just
990 * call configfs_depend_item(). If the item is live and the client 990 * call configfs_depend_item(). If the item is live and the client
991 * driver is in good shape, we'll happily do the work for them. 991 * driver is in good shape, we'll happily do the work for them.
992 * 992 *
993 * Why is the locking complex? Because configfs uses the VFS to handle 993 * Why is the locking complex? Because configfs uses the VFS to handle
994 * all locking, but this function is called outside the normal 994 * all locking, but this function is called outside the normal
995 * VFS->configfs path. So it must take VFS locks to prevent the 995 * VFS->configfs path. So it must take VFS locks to prevent the
996 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is 996 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
997 * why you can't call these functions underneath configfs callbacks. 997 * why you can't call these functions underneath configfs callbacks.
998 * 998 *
999 * Note, btw, that this can be called at *any* time, even when a configfs 999 * Note, btw, that this can be called at *any* time, even when a configfs
1000 * subsystem isn't registered, or when configfs is loading or unloading. 1000 * subsystem isn't registered, or when configfs is loading or unloading.
1001 * Just like configfs_register_subsystem(). So we take the same 1001 * Just like configfs_register_subsystem(). So we take the same
1002 * precautions. We pin the filesystem. We lock configfs_dirent_lock. 1002 * precautions. We pin the filesystem. We lock configfs_dirent_lock.
1003 * If we can find the target item in the 1003 * If we can find the target item in the
1004 * configfs tree, it must be part of the subsystem tree as well, so we 1004 * configfs tree, it must be part of the subsystem tree as well, so we
1005 * do not need the subsystem semaphore. Holding configfs_dirent_lock helps 1005 * do not need the subsystem semaphore. Holding configfs_dirent_lock helps
1006 * lock out mkdir() and rmdir(), which might be racing us. 1006 * lock out mkdir() and rmdir(), which might be racing us.
1007 */ 1007 */
1008 1008
1009 /* 1009 /*
1010 * configfs_depend_prep() 1010 * configfs_depend_prep()
1011 * 1011 *
1012 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are 1012 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
1013 * attributes. This is similar to, but not the same as, configfs_detach_prep(). 1013 * attributes. This is similar to, but not the same as, configfs_detach_prep().
1014 * Note that configfs_detach_prep() expects the parent to be locked when it 1014 * Note that configfs_detach_prep() expects the parent to be locked when it
1015 * is called, but we lock the parent *inside* configfs_depend_prep(). We 1015 * is called, but we lock the parent *inside* configfs_depend_prep(). We
1016 * do that so we can unlock it if we find nothing. 1016 * do that so we can unlock it if we find nothing.
1017 * 1017 *
1018 * Here we do a depth-first search of the dentry hierarchy looking for 1018 * Here we do a depth-first search of the dentry hierarchy looking for
1019 * our object. 1019 * our object.
1020 * We deliberately ignore items tagged as dropping since they are virtually 1020 * We deliberately ignore items tagged as dropping since they are virtually
1021 * dead, as well as items in the middle of attachment since they virtually 1021 * dead, as well as items in the middle of attachment since they virtually
1022 * do not exist yet. This completes the locking out of racing mkdir() and 1022 * do not exist yet. This completes the locking out of racing mkdir() and
1023 * rmdir(). 1023 * rmdir().
1024 * Note: subdirectories in the middle of attachment start with s_type = 1024 * Note: subdirectories in the middle of attachment start with s_type =
1025 * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When 1025 * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When
1026 * CONFIGFS_USET_CREATING is set, we ignore the item. The actual setting 1026 * CONFIGFS_USET_CREATING is set, we ignore the item. The actual setting
1027 * of s_type happens in configfs_new_dirent(), which holds configfs_dirent_lock. 1027 * of s_type happens in configfs_new_dirent(), which holds configfs_dirent_lock.
1028 * 1028 *
1029 * If the target is not found, -ENOENT is bubbled up. 1029 * If the target is not found, -ENOENT is bubbled up.
1030 * 1030 *
1031 * This adds a requirement that all config_items be unique! 1031 * This adds a requirement that all config_items be unique!
1032 * 1032 *
1033 * This is recursive. There isn't 1033 * This is recursive. There isn't
1034 * much on the stack, though, so folks that need this function - be careful 1034 * much on the stack, though, so folks that need this function - be careful
1035 * about your stack! Patches will be accepted to make it iterative. 1035 * about your stack! Patches will be accepted to make it iterative.
1036 */ 1036 */
1037 static int configfs_depend_prep(struct dentry *origin, 1037 static int configfs_depend_prep(struct dentry *origin,
1038 struct config_item *target) 1038 struct config_item *target)
1039 { 1039 {
1040 struct configfs_dirent *child_sd, *sd = origin->d_fsdata; 1040 struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
1041 int ret = 0; 1041 int ret = 0;
1042 1042
1043 BUG_ON(!origin || !sd); 1043 BUG_ON(!origin || !sd);
1044 1044
1045 if (sd->s_element == target) /* Boo-yah */ 1045 if (sd->s_element == target) /* Boo-yah */
1046 goto out; 1046 goto out;
1047 1047
1048 list_for_each_entry(child_sd, &sd->s_children, s_sibling) { 1048 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
1049 if ((child_sd->s_type & CONFIGFS_DIR) && 1049 if ((child_sd->s_type & CONFIGFS_DIR) &&
1050 !(child_sd->s_type & CONFIGFS_USET_DROPPING) && 1050 !(child_sd->s_type & CONFIGFS_USET_DROPPING) &&
1051 !(child_sd->s_type & CONFIGFS_USET_CREATING)) { 1051 !(child_sd->s_type & CONFIGFS_USET_CREATING)) {
1052 ret = configfs_depend_prep(child_sd->s_dentry, 1052 ret = configfs_depend_prep(child_sd->s_dentry,
1053 target); 1053 target);
1054 if (!ret) 1054 if (!ret)
1055 goto out; /* Child path boo-yah */ 1055 goto out; /* Child path boo-yah */
1056 } 1056 }
1057 } 1057 }
1058 1058
1059 /* We looped all our children and didn't find target */ 1059 /* We looped all our children and didn't find target */
1060 ret = -ENOENT; 1060 ret = -ENOENT;
1061 1061
1062 out: 1062 out:
1063 return ret; 1063 return ret;
1064 } 1064 }
1065 1065
1066 int configfs_depend_item(struct configfs_subsystem *subsys, 1066 int configfs_depend_item(struct configfs_subsystem *subsys,
1067 struct config_item *target) 1067 struct config_item *target)
1068 { 1068 {
1069 int ret; 1069 int ret;
1070 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; 1070 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
1071 struct config_item *s_item = &subsys->su_group.cg_item; 1071 struct config_item *s_item = &subsys->su_group.cg_item;
1072 struct dentry *root; 1072 struct dentry *root;
1073 1073
1074 /* 1074 /*
1075 * Pin the configfs filesystem. This means we can safely access 1075 * Pin the configfs filesystem. This means we can safely access
1076 * the root of the configfs filesystem. 1076 * the root of the configfs filesystem.
1077 */ 1077 */
1078 root = configfs_pin_fs(); 1078 root = configfs_pin_fs();
1079 if (IS_ERR(root)) 1079 if (IS_ERR(root))
1080 return PTR_ERR(root); 1080 return PTR_ERR(root);
1081 1081
1082 /* 1082 /*
1083 * Next, lock the root directory. We're going to check that the 1083 * Next, lock the root directory. We're going to check that the
1084 * subsystem is really registered, and so we need to lock out 1084 * subsystem is really registered, and so we need to lock out
1085 * configfs_[un]register_subsystem(). 1085 * configfs_[un]register_subsystem().
1086 */ 1086 */
1087 mutex_lock(&root->d_inode->i_mutex); 1087 mutex_lock(&root->d_inode->i_mutex);
1088 1088
1089 root_sd = root->d_fsdata; 1089 root_sd = root->d_fsdata;
1090 1090
1091 list_for_each_entry(p, &root_sd->s_children, s_sibling) { 1091 list_for_each_entry(p, &root_sd->s_children, s_sibling) {
1092 if (p->s_type & CONFIGFS_DIR) { 1092 if (p->s_type & CONFIGFS_DIR) {
1093 if (p->s_element == s_item) { 1093 if (p->s_element == s_item) {
1094 subsys_sd = p; 1094 subsys_sd = p;
1095 break; 1095 break;
1096 } 1096 }
1097 } 1097 }
1098 } 1098 }
1099 1099
1100 if (!subsys_sd) { 1100 if (!subsys_sd) {
1101 ret = -ENOENT; 1101 ret = -ENOENT;
1102 goto out_unlock_fs; 1102 goto out_unlock_fs;
1103 } 1103 }
1104 1104
1105 /* Ok, now we can trust subsys/s_item */ 1105 /* Ok, now we can trust subsys/s_item */
1106 1106
1107 spin_lock(&configfs_dirent_lock); 1107 spin_lock(&configfs_dirent_lock);
1108 /* Scan the tree, return 0 if found */ 1108 /* Scan the tree, return 0 if found */
1109 ret = configfs_depend_prep(subsys_sd->s_dentry, target); 1109 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
1110 if (ret) 1110 if (ret)
1111 goto out_unlock_dirent_lock; 1111 goto out_unlock_dirent_lock;
1112 1112
1113 /* 1113 /*
1114 * We are sure that the item is not about to be removed by rmdir(), and 1114 * We are sure that the item is not about to be removed by rmdir(), and
1115 * not in the middle of attachment by mkdir(). 1115 * not in the middle of attachment by mkdir().
1116 */ 1116 */
1117 p = target->ci_dentry->d_fsdata; 1117 p = target->ci_dentry->d_fsdata;
1118 p->s_dependent_count += 1; 1118 p->s_dependent_count += 1;
1119 1119
1120 out_unlock_dirent_lock: 1120 out_unlock_dirent_lock:
1121 spin_unlock(&configfs_dirent_lock); 1121 spin_unlock(&configfs_dirent_lock);
1122 out_unlock_fs: 1122 out_unlock_fs:
1123 mutex_unlock(&root->d_inode->i_mutex); 1123 mutex_unlock(&root->d_inode->i_mutex);
1124 1124
1125 /* 1125 /*
1126 * If we succeeded, the fs is pinned via other methods. If not, 1126 * If we succeeded, the fs is pinned via other methods. If not,
1127 * we're done with it anyway. So release_fs() is always right. 1127 * we're done with it anyway. So release_fs() is always right.
1128 */ 1128 */
1129 configfs_release_fs(); 1129 configfs_release_fs();
1130 1130
1131 return ret; 1131 return ret;
1132 } 1132 }
1133 EXPORT_SYMBOL(configfs_depend_item); 1133 EXPORT_SYMBOL(configfs_depend_item);
1134 1134
1135 /* 1135 /*
1136 * Release the dependent linkage. This is much simpler than 1136 * Release the dependent linkage. This is much simpler than
1137 * configfs_depend_item() because we know that the client driver is 1137 * configfs_depend_item() because we know that the client driver is
1138 * pinned, thus the subsystem is pinned, and therefore configfs is pinned. 1138 * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
1139 */ 1139 */
1140 void configfs_undepend_item(struct configfs_subsystem *subsys, 1140 void configfs_undepend_item(struct configfs_subsystem *subsys,
1141 struct config_item *target) 1141 struct config_item *target)
1142 { 1142 {
1143 struct configfs_dirent *sd; 1143 struct configfs_dirent *sd;
1144 1144
1145 /* 1145 /*
1146 * Since we can trust everything is pinned, we just need 1146 * Since we can trust everything is pinned, we just need
1147 * configfs_dirent_lock. 1147 * configfs_dirent_lock.
1148 */ 1148 */
1149 spin_lock(&configfs_dirent_lock); 1149 spin_lock(&configfs_dirent_lock);
1150 1150
1151 sd = target->ci_dentry->d_fsdata; 1151 sd = target->ci_dentry->d_fsdata;
1152 BUG_ON(sd->s_dependent_count < 1); 1152 BUG_ON(sd->s_dependent_count < 1);
1153 1153
1154 sd->s_dependent_count -= 1; 1154 sd->s_dependent_count -= 1;
1155 1155
1156 /* 1156 /*
1157 * After this unlock, we cannot trust the item to stay alive! 1157 * After this unlock, we cannot trust the item to stay alive!
1158 * DO NOT REFERENCE item after this unlock. 1158 * DO NOT REFERENCE item after this unlock.
1159 */ 1159 */
1160 spin_unlock(&configfs_dirent_lock); 1160 spin_unlock(&configfs_dirent_lock);
1161 } 1161 }
1162 EXPORT_SYMBOL(configfs_undepend_item); 1162 EXPORT_SYMBOL(configfs_undepend_item);
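
Taken together, the two calls bracket the window in which a client can safely use an item: while the dependent count is raised, configfs_rmdir() on that item fails with -EBUSY. A hedged usage sketch (my_driver_use_item() is hypothetical; only the two exported calls are real):

    #include <linux/configfs.h>

    static int my_driver_use_item(struct configfs_subsystem *subsys,
                                  struct config_item *item)
    {
            int ret;

            ret = configfs_depend_item(subsys, item);
            if (ret)
                    return ret;   /* mid-mkdir, mid-rmdir, or not found */

            /* ... item is pinned against rmdir() here; use it ... */

            configfs_undepend_item(subsys, item);
            return 0;
    }
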
1163 1163
1164 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 1164 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1165 { 1165 {
1166 int ret = 0; 1166 int ret = 0;
1167 int module_got = 0; 1167 int module_got = 0;
1168 struct config_group *group = NULL; 1168 struct config_group *group = NULL;
1169 struct config_item *item = NULL; 1169 struct config_item *item = NULL;
1170 struct config_item *parent_item; 1170 struct config_item *parent_item;
1171 struct configfs_subsystem *subsys; 1171 struct configfs_subsystem *subsys;
1172 struct configfs_dirent *sd; 1172 struct configfs_dirent *sd;
1173 struct config_item_type *type; 1173 struct config_item_type *type;
1174 struct module *subsys_owner = NULL, *new_item_owner = NULL; 1174 struct module *subsys_owner = NULL, *new_item_owner = NULL;
1175 char *name; 1175 char *name;
1176 1176
1177 sd = dentry->d_parent->d_fsdata; 1177 sd = dentry->d_parent->d_fsdata;
1178 1178
1179 /* 1179 /*
1180 * Fake invisibility if dir belongs to a group/default groups hierarchy 1180 * Fake invisibility if dir belongs to a group/default groups hierarchy
1181 * being attached 1181 * being attached
1182 */ 1182 */
1183 if (!configfs_dirent_is_ready(sd)) { 1183 if (!configfs_dirent_is_ready(sd)) {
1184 ret = -ENOENT; 1184 ret = -ENOENT;
1185 goto out; 1185 goto out;
1186 } 1186 }
1187 1187
1188 if (!(sd->s_type & CONFIGFS_USET_DIR)) { 1188 if (!(sd->s_type & CONFIGFS_USET_DIR)) {
1189 ret = -EPERM; 1189 ret = -EPERM;
1190 goto out; 1190 goto out;
1191 } 1191 }
1192 1192
1193 /* Get a working ref for the duration of this function */ 1193 /* Get a working ref for the duration of this function */
1194 parent_item = configfs_get_config_item(dentry->d_parent); 1194 parent_item = configfs_get_config_item(dentry->d_parent);
1195 type = parent_item->ci_type; 1195 type = parent_item->ci_type;
1196 subsys = to_config_group(parent_item)->cg_subsys; 1196 subsys = to_config_group(parent_item)->cg_subsys;
1197 BUG_ON(!subsys); 1197 BUG_ON(!subsys);
1198 1198
1199 if (!type || !type->ct_group_ops || 1199 if (!type || !type->ct_group_ops ||
1200 (!type->ct_group_ops->make_group && 1200 (!type->ct_group_ops->make_group &&
1201 !type->ct_group_ops->make_item)) { 1201 !type->ct_group_ops->make_item)) {
1202 ret = -EPERM; /* Lack-of-mkdir returns -EPERM */ 1202 ret = -EPERM; /* Lack-of-mkdir returns -EPERM */
1203 goto out_put; 1203 goto out_put;
1204 } 1204 }
1205 1205
1206 /* 1206 /*
1207 * The subsystem may belong to a different module than the item 1207 * The subsystem may belong to a different module than the item
1208 * being created. We don't want to safely pin the new item but 1208 * being created. We don't want to safely pin the new item but
1209 * fail to pin the subsystem it sits under. 1209 * fail to pin the subsystem it sits under.
1210 */ 1210 */
1211 if (!subsys->su_group.cg_item.ci_type) { 1211 if (!subsys->su_group.cg_item.ci_type) {
1212 ret = -EINVAL; 1212 ret = -EINVAL;
1213 goto out_put; 1213 goto out_put;
1214 } 1214 }
1215 subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; 1215 subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
1216 if (!try_module_get(subsys_owner)) { 1216 if (!try_module_get(subsys_owner)) {
1217 ret = -EINVAL; 1217 ret = -EINVAL;
1218 goto out_put; 1218 goto out_put;
1219 } 1219 }
1220 1220
1221 name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); 1221 name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
1222 if (!name) { 1222 if (!name) {
1223 ret = -ENOMEM; 1223 ret = -ENOMEM;
1224 goto out_subsys_put; 1224 goto out_subsys_put;
1225 } 1225 }
1226 1226
1227 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); 1227 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
1228 1228
1229 mutex_lock(&subsys->su_mutex); 1229 mutex_lock(&subsys->su_mutex);
1230 if (type->ct_group_ops->make_group) { 1230 if (type->ct_group_ops->make_group) {
1231 group = type->ct_group_ops->make_group(to_config_group(parent_item), name); 1231 group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
1232 if (!group) 1232 if (!group)
1233 group = ERR_PTR(-ENOMEM); 1233 group = ERR_PTR(-ENOMEM);
1234 if (!IS_ERR(group)) { 1234 if (!IS_ERR(group)) {
1235 link_group(to_config_group(parent_item), group); 1235 link_group(to_config_group(parent_item), group);
1236 item = &group->cg_item; 1236 item = &group->cg_item;
1237 } else 1237 } else
1238 ret = PTR_ERR(group); 1238 ret = PTR_ERR(group);
1239 } else { 1239 } else {
1240 item = type->ct_group_ops->make_item(to_config_group(parent_item), name); 1240 item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
1241 if (!item) 1241 if (!item)
1242 item = ERR_PTR(-ENOMEM); 1242 item = ERR_PTR(-ENOMEM);
1243 if (!IS_ERR(item)) 1243 if (!IS_ERR(item))
1244 link_obj(parent_item, item); 1244 link_obj(parent_item, item);
1245 else 1245 else
1246 ret = PTR_ERR(item); 1246 ret = PTR_ERR(item);
1247 } 1247 }
1248 mutex_unlock(&subsys->su_mutex); 1248 mutex_unlock(&subsys->su_mutex);
1249 1249
1250 kfree(name); 1250 kfree(name);
1251 if (ret) { 1251 if (ret) {
1252 /* 1252 /*
1253 * If ret != 0, then link_obj() was never called. 1253 * If ret != 0, then link_obj() was never called.
1254 * There are no extra references to clean up. 1254 * There are no extra references to clean up.
1255 */ 1255 */
1256 goto out_subsys_put; 1256 goto out_subsys_put;
1257 } 1257 }
1258 1258
1259 /* 1259 /*
1260 * link_obj() has been called (via link_group() for groups). 1260 * link_obj() has been called (via link_group() for groups).
1261 * From here on out, errors must clean that up. 1261 * From here on out, errors must clean that up.
1262 */ 1262 */
1263 1263
1264 type = item->ci_type; 1264 type = item->ci_type;
1265 if (!type) { 1265 if (!type) {
1266 ret = -EINVAL; 1266 ret = -EINVAL;
1267 goto out_unlink; 1267 goto out_unlink;
1268 } 1268 }
1269 1269
1270 new_item_owner = type->ct_owner; 1270 new_item_owner = type->ct_owner;
1271 if (!try_module_get(new_item_owner)) { 1271 if (!try_module_get(new_item_owner)) {
1272 ret = -EINVAL; 1272 ret = -EINVAL;
1273 goto out_unlink; 1273 goto out_unlink;
1274 } 1274 }
1275 1275
1276 /* 1276 /*
1277 * I hate doing it this way, but if there is 1277 * I hate doing it this way, but if there is
1278 * an error, module_put() probably should 1278 * an error, module_put() probably should
1279 * happen after any cleanup. 1279 * happen after any cleanup.
1280 */ 1280 */
1281 module_got = 1; 1281 module_got = 1;
1282 1282
1283 /* 1283 /*
1284 * Make racing rmdir() fail if it did not tag parent with 1284 * Make racing rmdir() fail if it did not tag parent with
1285 * CONFIGFS_USET_DROPPING 1285 * CONFIGFS_USET_DROPPING
1286 * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will 1286 * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will
1287 * fail and let rmdir() terminate correctly 1287 * fail and let rmdir() terminate correctly
1288 */ 1288 */
1289 spin_lock(&configfs_dirent_lock); 1289 spin_lock(&configfs_dirent_lock);
1290 /* This will make configfs_detach_prep() fail */ 1290 /* This will make configfs_detach_prep() fail */
1291 sd->s_type |= CONFIGFS_USET_IN_MKDIR; 1291 sd->s_type |= CONFIGFS_USET_IN_MKDIR;
1292 spin_unlock(&configfs_dirent_lock); 1292 spin_unlock(&configfs_dirent_lock);
1293 1293
1294 if (group) 1294 if (group)
1295 ret = configfs_attach_group(parent_item, item, dentry); 1295 ret = configfs_attach_group(parent_item, item, dentry);
1296 else 1296 else
1297 ret = configfs_attach_item(parent_item, item, dentry); 1297 ret = configfs_attach_item(parent_item, item, dentry);
1298 1298
1299 spin_lock(&configfs_dirent_lock); 1299 spin_lock(&configfs_dirent_lock);
1300 sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; 1300 sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
1301 if (!ret) 1301 if (!ret)
1302 configfs_dir_set_ready(dentry->d_fsdata); 1302 configfs_dir_set_ready(dentry->d_fsdata);
1303 spin_unlock(&configfs_dirent_lock); 1303 spin_unlock(&configfs_dirent_lock);
1304 1304
1305 out_unlink: 1305 out_unlink:
1306 if (ret) { 1306 if (ret) {
1307 /* Tear down everything we built up */ 1307 /* Tear down everything we built up */
1308 mutex_lock(&subsys->su_mutex); 1308 mutex_lock(&subsys->su_mutex);
1309 1309
1310 client_disconnect_notify(parent_item, item); 1310 client_disconnect_notify(parent_item, item);
1311 if (group) 1311 if (group)
1312 unlink_group(group); 1312 unlink_group(group);
1313 else 1313 else
1314 unlink_obj(item); 1314 unlink_obj(item);
1315 client_drop_item(parent_item, item); 1315 client_drop_item(parent_item, item);
1316 1316
1317 mutex_unlock(&subsys->su_mutex); 1317 mutex_unlock(&subsys->su_mutex);
1318 1318
1319 if (module_got) 1319 if (module_got)
1320 module_put(new_item_owner); 1320 module_put(new_item_owner);
1321 } 1321 }
1322 1322
1323 out_subsys_put: 1323 out_subsys_put:
1324 if (ret) 1324 if (ret)
1325 module_put(subsys_owner); 1325 module_put(subsys_owner);
1326 1326
1327 out_put: 1327 out_put:
1328 /* 1328 /*
1329 * link_obj()/link_group() took a reference from child->parent, 1329 * link_obj()/link_group() took a reference from child->parent,
1330 * so the parent is safely pinned. We can drop our working 1330 * so the parent is safely pinned. We can drop our working
1331 * reference. 1331 * reference.
1332 */ 1332 */
1333 config_item_put(parent_item); 1333 config_item_put(parent_item);
1334 1334
1335 out: 1335 out:
1336 return ret; 1336 return ret;
1337 } 1337 }
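
The error handling above is the usual acquire-in-order, release-in-reverse goto ladder (out_unlink, then out_subsys_put, then out_put). Distilled into a self-contained userspace sketch, with get_res()/put_res() as hypothetical stand-ins for the two module pins:

    #include <stdbool.h>
    #include <stdio.h>

    static bool get_res(const char *name) { printf("get %s\n", name); return true; }
    static void put_res(const char *name) { printf("put %s\n", name); }

    static int setup_both(void)
    {
            int ret = 0;

            if (!get_res("subsys"))
                    return -1;
            if (!get_res("item")) {
                    ret = -1;
                    goto out_put_subsys;   /* undo only what succeeded */
            }
            return 0;                      /* success: both pinned */

    out_put_subsys:
            put_res("subsys");             /* release in reverse order */
            return ret;
    }

    int main(void) { return setup_both(); }
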
1338 1338
1339 static int configfs_rmdir(struct inode *dir, struct dentry *dentry) 1339 static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1340 { 1340 {
1341 struct config_item *parent_item; 1341 struct config_item *parent_item;
1342 struct config_item *item; 1342 struct config_item *item;
1343 struct configfs_subsystem *subsys; 1343 struct configfs_subsystem *subsys;
1344 struct configfs_dirent *sd; 1344 struct configfs_dirent *sd;
1345 struct module *subsys_owner = NULL, *dead_item_owner = NULL; 1345 struct module *subsys_owner = NULL, *dead_item_owner = NULL;
1346 int ret; 1346 int ret;
1347 1347
1348 sd = dentry->d_fsdata; 1348 sd = dentry->d_fsdata;
1349 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1349 if (sd->s_type & CONFIGFS_USET_DEFAULT)
1350 return -EPERM; 1350 return -EPERM;
1351 1351
1352 /* Get a working ref until we have the child */ 1352 /* Get a working ref until we have the child */
1353 parent_item = configfs_get_config_item(dentry->d_parent); 1353 parent_item = configfs_get_config_item(dentry->d_parent);
1354 subsys = to_config_group(parent_item)->cg_subsys; 1354 subsys = to_config_group(parent_item)->cg_subsys;
1355 BUG_ON(!subsys); 1355 BUG_ON(!subsys);
1356 1356
1357 if (!parent_item->ci_type) { 1357 if (!parent_item->ci_type) {
1358 config_item_put(parent_item); 1358 config_item_put(parent_item);
1359 return -EINVAL; 1359 return -EINVAL;
1360 } 1360 }
1361 1361
1362 /* configfs_mkdir() shouldn't have allowed this */ 1362 /* configfs_mkdir() shouldn't have allowed this */
1363 BUG_ON(!subsys->su_group.cg_item.ci_type); 1363 BUG_ON(!subsys->su_group.cg_item.ci_type);
1364 subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; 1364 subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
1365 1365
1366 /* 1366 /*
1367 * Ensure that no racing symlink() will make detach_prep() fail while 1367 * Ensure that no racing symlink() will make detach_prep() fail while
1368 * the new link is temporarily attached 1368 * the new link is temporarily attached
1369 */ 1369 */
1370 do { 1370 do {
1371 struct mutex *wait_mutex; 1371 struct mutex *wait_mutex;
1372 1372
1373 mutex_lock(&configfs_symlink_mutex); 1373 mutex_lock(&configfs_symlink_mutex);
1374 spin_lock(&configfs_dirent_lock); 1374 spin_lock(&configfs_dirent_lock);
1375 /* 1375 /*
1376 * Here's where we check for dependents. We're protected by 1376 * Here's where we check for dependents. We're protected by
1377 * configfs_dirent_lock. 1377 * configfs_dirent_lock.
1378 * If no dependent, atomically tag the item as dropping. 1378 * If no dependent, atomically tag the item as dropping.
1379 */ 1379 */
1380 ret = sd->s_dependent_count ? -EBUSY : 0; 1380 ret = sd->s_dependent_count ? -EBUSY : 0;
1381 if (!ret) { 1381 if (!ret) {
1382 ret = configfs_detach_prep(dentry, &wait_mutex); 1382 ret = configfs_detach_prep(dentry, &wait_mutex);
1383 if (ret) 1383 if (ret)
1384 configfs_detach_rollback(dentry); 1384 configfs_detach_rollback(dentry);
1385 } 1385 }
1386 spin_unlock(&configfs_dirent_lock); 1386 spin_unlock(&configfs_dirent_lock);
1387 mutex_unlock(&configfs_symlink_mutex); 1387 mutex_unlock(&configfs_symlink_mutex);
1388 1388
1389 if (ret) { 1389 if (ret) {
1390 if (ret != -EAGAIN) { 1390 if (ret != -EAGAIN) {
1391 config_item_put(parent_item); 1391 config_item_put(parent_item);
1392 return ret; 1392 return ret;
1393 } 1393 }
1394 1394
1395 /* Wait until the racing operation terminates */ 1395 /* Wait until the racing operation terminates */
1396 mutex_lock(wait_mutex); 1396 mutex_lock(wait_mutex);
1397 mutex_unlock(wait_mutex); 1397 mutex_unlock(wait_mutex);
1398 } 1398 }
1399 } while (ret == -EAGAIN); 1399 } while (ret == -EAGAIN);
1400 1400
1401 /* Get a working ref for the duration of this function */ 1401 /* Get a working ref for the duration of this function */
1402 item = configfs_get_config_item(dentry); 1402 item = configfs_get_config_item(dentry);
1403 1403
1404 /* Drop reference from above, item already holds one. */ 1404 /* Drop reference from above, item already holds one. */
1405 config_item_put(parent_item); 1405 config_item_put(parent_item);
1406 1406
1407 if (item->ci_type) 1407 if (item->ci_type)
1408 dead_item_owner = item->ci_type->ct_owner; 1408 dead_item_owner = item->ci_type->ct_owner;
1409 1409
1410 if (sd->s_type & CONFIGFS_USET_DIR) { 1410 if (sd->s_type & CONFIGFS_USET_DIR) {
1411 configfs_detach_group(item); 1411 configfs_detach_group(item);
1412 1412
1413 mutex_lock(&subsys->su_mutex); 1413 mutex_lock(&subsys->su_mutex);
1414 client_disconnect_notify(parent_item, item); 1414 client_disconnect_notify(parent_item, item);
1415 unlink_group(to_config_group(item)); 1415 unlink_group(to_config_group(item));
1416 } else { 1416 } else {
1417 configfs_detach_item(item); 1417 configfs_detach_item(item);
1418 1418
1419 mutex_lock(&subsys->su_mutex); 1419 mutex_lock(&subsys->su_mutex);
1420 client_disconnect_notify(parent_item, item); 1420 client_disconnect_notify(parent_item, item);
1421 unlink_obj(item); 1421 unlink_obj(item);
1422 } 1422 }
1423 1423
1424 client_drop_item(parent_item, item); 1424 client_drop_item(parent_item, item);
1425 mutex_unlock(&subsys->su_mutex); 1425 mutex_unlock(&subsys->su_mutex);
1426 1426
1427 /* Drop our reference from above */ 1427 /* Drop our reference from above */
1428 config_item_put(item); 1428 config_item_put(item);
1429 1429
1430 module_put(dead_item_owner); 1430 module_put(dead_item_owner);
1431 module_put(subsys_owner); 1431 module_put(subsys_owner);
1432 1432
1433 return 0; 1433 return 0;
1434 } 1434 }
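
The -EAGAIN path above uses a bare lock/unlock pair purely as a barrier: the racing operation holds wait_mutex until it finishes, so taking and immediately dropping it just waits the racer out. The same idiom, sketched with pthreads:

    #include <pthread.h>

    /* Block until whoever holds *m releases it; we never need the lock. */
    static void wait_for_holder(pthread_mutex_t *m)
    {
            pthread_mutex_lock(m);     /* cannot succeed until the racer is done */
            pthread_mutex_unlock(m);   /* nothing to protect; release at once */
    }
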
1435 1435
1436 const struct inode_operations configfs_dir_inode_operations = { 1436 const struct inode_operations configfs_dir_inode_operations = {
1437 .mkdir = configfs_mkdir, 1437 .mkdir = configfs_mkdir,
1438 .rmdir = configfs_rmdir, 1438 .rmdir = configfs_rmdir,
1439 .symlink = configfs_symlink, 1439 .symlink = configfs_symlink,
1440 .unlink = configfs_unlink, 1440 .unlink = configfs_unlink,
1441 .lookup = configfs_lookup, 1441 .lookup = configfs_lookup,
1442 .setattr = configfs_setattr, 1442 .setattr = configfs_setattr,
1443 }; 1443 };
1444 1444
1445 const struct inode_operations configfs_root_inode_operations = { 1445 const struct inode_operations configfs_root_inode_operations = {
1446 .lookup = configfs_lookup, 1446 .lookup = configfs_lookup,
1447 .setattr = configfs_setattr, 1447 .setattr = configfs_setattr,
1448 }; 1448 };
1449 1449
1450 #if 0 1450 #if 0
1451 int configfs_rename_dir(struct config_item * item, const char *new_name) 1451 int configfs_rename_dir(struct config_item * item, const char *new_name)
1452 { 1452 {
1453 int error = 0; 1453 int error = 0;
1454 struct dentry * new_dentry, * parent; 1454 struct dentry * new_dentry, * parent;
1455 1455
1456 if (!strcmp(config_item_name(item), new_name)) 1456 if (!strcmp(config_item_name(item), new_name))
1457 return -EINVAL; 1457 return -EINVAL;
1458 1458
1459 if (!item->parent) 1459 if (!item->parent)
1460 return -EINVAL; 1460 return -EINVAL;
1461 1461
1462 down_write(&configfs_rename_sem); 1462 down_write(&configfs_rename_sem);
1463 parent = item->parent->dentry; 1463 parent = item->parent->dentry;
1464 1464
1465 mutex_lock(&parent->d_inode->i_mutex); 1465 mutex_lock(&parent->d_inode->i_mutex);
1466 1466
1467 new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); 1467 new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
1468 if (!IS_ERR(new_dentry)) { 1468 if (!IS_ERR(new_dentry)) {
1469 if (!new_dentry->d_inode) { 1469 if (!new_dentry->d_inode) {
1470 error = config_item_set_name(item, "%s", new_name); 1470 error = config_item_set_name(item, "%s", new_name);
1471 if (!error) { 1471 if (!error) {
1472 d_add(new_dentry, NULL); 1472 d_add(new_dentry, NULL);
1473 d_move(item->dentry, new_dentry); 1473 d_move(item->dentry, new_dentry);
1474 } 1474 }
1475 else 1475 else
1476 d_delete(new_dentry); 1476 d_delete(new_dentry);
1477 } else 1477 } else
1478 error = -EEXIST; 1478 error = -EEXIST;
1479 dput(new_dentry); 1479 dput(new_dentry);
1480 } 1480 }
1481 mutex_unlock(&parent->d_inode->i_mutex); 1481 mutex_unlock(&parent->d_inode->i_mutex);
1482 up_write(&configfs_rename_sem); 1482 up_write(&configfs_rename_sem);
1483 1483
1484 return error; 1484 return error;
1485 } 1485 }
1486 #endif 1486 #endif
1487 1487
1488 static int configfs_dir_open(struct inode *inode, struct file *file) 1488 static int configfs_dir_open(struct inode *inode, struct file *file)
1489 { 1489 {
1490 struct dentry * dentry = file->f_path.dentry; 1490 struct dentry * dentry = file->f_path.dentry;
1491 struct configfs_dirent * parent_sd = dentry->d_fsdata; 1491 struct configfs_dirent * parent_sd = dentry->d_fsdata;
1492 int err; 1492 int err;
1493 1493
1494 mutex_lock(&dentry->d_inode->i_mutex); 1494 mutex_lock(&dentry->d_inode->i_mutex);
1495 /* 1495 /*
1496 * Fake invisibility if dir belongs to a group/default groups hierarchy 1496 * Fake invisibility if dir belongs to a group/default groups hierarchy
1497 * being attached 1497 * being attached
1498 */ 1498 */
1499 err = -ENOENT; 1499 err = -ENOENT;
1500 if (configfs_dirent_is_ready(parent_sd)) { 1500 if (configfs_dirent_is_ready(parent_sd)) {
1501 file->private_data = configfs_new_dirent(parent_sd, NULL, 0); 1501 file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
1502 if (IS_ERR(file->private_data)) 1502 if (IS_ERR(file->private_data))
1503 err = PTR_ERR(file->private_data); 1503 err = PTR_ERR(file->private_data);
1504 else 1504 else
1505 err = 0; 1505 err = 0;
1506 } 1506 }
1507 mutex_unlock(&dentry->d_inode->i_mutex); 1507 mutex_unlock(&dentry->d_inode->i_mutex);
1508 1508
1509 return err; 1509 return err;
1510 } 1510 }
1511 1511
1512 static int configfs_dir_close(struct inode *inode, struct file *file) 1512 static int configfs_dir_close(struct inode *inode, struct file *file)
1513 { 1513 {
1514 struct dentry * dentry = file->f_path.dentry; 1514 struct dentry * dentry = file->f_path.dentry;
1515 struct configfs_dirent * cursor = file->private_data; 1515 struct configfs_dirent * cursor = file->private_data;
1516 1516
1517 mutex_lock(&dentry->d_inode->i_mutex); 1517 mutex_lock(&dentry->d_inode->i_mutex);
1518 spin_lock(&configfs_dirent_lock); 1518 spin_lock(&configfs_dirent_lock);
1519 list_del_init(&cursor->s_sibling); 1519 list_del_init(&cursor->s_sibling);
1520 spin_unlock(&configfs_dirent_lock); 1520 spin_unlock(&configfs_dirent_lock);
1521 mutex_unlock(&dentry->d_inode->i_mutex); 1521 mutex_unlock(&dentry->d_inode->i_mutex);
1522 1522
1523 release_configfs_dirent(cursor); 1523 release_configfs_dirent(cursor);
1524 1524
1525 return 0; 1525 return 0;
1526 } 1526 }
1527 1527
1528 /* Relationship between s_mode and the DT_xxx types */ 1528 /* Relationship between s_mode and the DT_xxx types */
1529 static inline unsigned char dt_type(struct configfs_dirent *sd) 1529 static inline unsigned char dt_type(struct configfs_dirent *sd)
1530 { 1530 {
1531 return (sd->s_mode >> 12) & 15; 1531 return (sd->s_mode >> 12) & 15;
1532 } 1532 }
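
The shift-and-mask works because the DT_* codes in dirent.h are defined as the S_IF* file-type bits shifted down by 12, so s_mode maps straight to a d_type. A standalone check of that assumption (glibc exposes DT_* under _DEFAULT_SOURCE):

    #define _DEFAULT_SOURCE
    #include <assert.h>
    #include <dirent.h>
    #include <sys/stat.h>

    int main(void)
    {
            assert(((S_IFDIR >> 12) & 15) == DT_DIR);   /* 0040000 >> 12 == 4 */
            assert(((S_IFREG >> 12) & 15) == DT_REG);   /* 0100000 >> 12 == 8 */
            assert(((S_IFLNK >> 12) & 15) == DT_LNK);   /* 0120000 >> 12 == 10 */
            return 0;
    }
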
1533 1533
1534 static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 1534 static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
1535 { 1535 {
1536 struct dentry *dentry = filp->f_path.dentry; 1536 struct dentry *dentry = filp->f_path.dentry;
1537 struct super_block *sb = dentry->d_sb; 1537 struct super_block *sb = dentry->d_sb;
1538 struct configfs_dirent * parent_sd = dentry->d_fsdata; 1538 struct configfs_dirent * parent_sd = dentry->d_fsdata;
1539 struct configfs_dirent *cursor = filp->private_data; 1539 struct configfs_dirent *cursor = filp->private_data;
1540 struct list_head *p, *q = &cursor->s_sibling; 1540 struct list_head *p, *q = &cursor->s_sibling;
1541 ino_t ino = 0; 1541 ino_t ino = 0;
1542 int i = filp->f_pos; 1542 int i = filp->f_pos;
1543 1543
1544 switch (i) { 1544 switch (i) {
1545 case 0: 1545 case 0:
1546 ino = dentry->d_inode->i_ino; 1546 ino = dentry->d_inode->i_ino;
1547 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 1547 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
1548 break; 1548 break;
1549 filp->f_pos++; 1549 filp->f_pos++;
1550 i++; 1550 i++;
1551 /* fallthrough */ 1551 /* fallthrough */
1552 case 1: 1552 case 1:
1553 ino = parent_ino(dentry); 1553 ino = parent_ino(dentry);
1554 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 1554 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
1555 break; 1555 break;
1556 filp->f_pos++; 1556 filp->f_pos++;
1557 i++; 1557 i++;
1558 /* fallthrough */ 1558 /* fallthrough */
1559 default: 1559 default:
1560 if (filp->f_pos == 2) { 1560 if (filp->f_pos == 2) {
1561 spin_lock(&configfs_dirent_lock); 1561 spin_lock(&configfs_dirent_lock);
1562 list_move(q, &parent_sd->s_children); 1562 list_move(q, &parent_sd->s_children);
1563 spin_unlock(&configfs_dirent_lock); 1563 spin_unlock(&configfs_dirent_lock);
1564 } 1564 }
1565 for (p=q->next; p!= &parent_sd->s_children; p=p->next) { 1565 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
1566 struct configfs_dirent *next; 1566 struct configfs_dirent *next;
1567 const char * name; 1567 const char * name;
1568 int len; 1568 int len;
1569 struct inode *inode = NULL; 1569 struct inode *inode = NULL;
1570 1570
1571 next = list_entry(p, struct configfs_dirent, 1571 next = list_entry(p, struct configfs_dirent,
1572 s_sibling); 1572 s_sibling);
1573 if (!next->s_element) 1573 if (!next->s_element)
1574 continue; 1574 continue;
1575 1575
1576 name = configfs_get_name(next); 1576 name = configfs_get_name(next);
1577 len = strlen(name); 1577 len = strlen(name);
1578 1578
1579 /* 1579 /*
1580 * We'll have a dentry and an inode for 1580 * We'll have a dentry and an inode for
1581 * PINNED items and for open attribute 1581 * PINNED items and for open attribute
1582 * files. We lock here to prevent a race 1582 * files. We lock here to prevent a race
1583 * with configfs_d_iput() clearing 1583 * with configfs_d_iput() clearing
1584 * s_dentry before calling iput(). 1584 * s_dentry before calling iput().
1585 * 1585 *
1586 * Why do we go to the trouble? If 1586 * Why do we go to the trouble? If
1587 * someone has an attribute file open, 1587 * someone has an attribute file open,
1588 * the inode number should match until 1588 * the inode number should match until
1589 * they close it. Beyond that, we don't 1589 * they close it. Beyond that, we don't
1590 * care. 1590 * care.
1591 */ 1591 */
1592 spin_lock(&configfs_dirent_lock); 1592 spin_lock(&configfs_dirent_lock);
1593 dentry = next->s_dentry; 1593 dentry = next->s_dentry;
1594 if (dentry) 1594 if (dentry)
1595 inode = dentry->d_inode; 1595 inode = dentry->d_inode;
1596 if (inode) 1596 if (inode)
1597 ino = inode->i_ino; 1597 ino = inode->i_ino;
1598 spin_unlock(&configfs_dirent_lock); 1598 spin_unlock(&configfs_dirent_lock);
1599 if (!inode) 1599 if (!inode)
1600 ino = iunique(sb, 2); 1600 ino = iunique(sb, 2);
1601 1601
1602 if (filldir(dirent, name, len, filp->f_pos, ino, 1602 if (filldir(dirent, name, len, filp->f_pos, ino,
1603 dt_type(next)) < 0) 1603 dt_type(next)) < 0)
1604 return 0; 1604 return 0;
1605 1605
1606 spin_lock(&configfs_dirent_lock); 1606 spin_lock(&configfs_dirent_lock);
1607 list_move(q, p); 1607 list_move(q, p);
1608 spin_unlock(&configfs_dirent_lock); 1608 spin_unlock(&configfs_dirent_lock);
1609 p = q; 1609 p = q;
1610 filp->f_pos++; 1610 filp->f_pos++;
1611 } 1611 }
1612 } 1612 }
1613 return 0; 1613 return 0;
1614 } 1614 }
1615 1615
1616 static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) 1616 static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
1617 { 1617 {
1618 struct dentry * dentry = file->f_path.dentry; 1618 struct dentry * dentry = file->f_path.dentry;
1619 1619
1620 mutex_lock(&dentry->d_inode->i_mutex); 1620 mutex_lock(&dentry->d_inode->i_mutex);
1621 switch (origin) { 1621 switch (whence) {
1622 case 1: 1622 case 1:
1623 offset += file->f_pos; 1623 offset += file->f_pos;
1624 case 0: 1624 case 0:
1625 if (offset >= 0) 1625 if (offset >= 0)
1626 break; 1626 break;
1627 default: 1627 default:
1628 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 1628 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
1629 return -EINVAL; 1629 return -EINVAL;
1630 } 1630 }
1631 if (offset != file->f_pos) { 1631 if (offset != file->f_pos) {
1632 file->f_pos = offset; 1632 file->f_pos = offset;
1633 if (file->f_pos >= 2) { 1633 if (file->f_pos >= 2) {
1634 struct configfs_dirent *sd = dentry->d_fsdata; 1634 struct configfs_dirent *sd = dentry->d_fsdata;
1635 struct configfs_dirent *cursor = file->private_data; 1635 struct configfs_dirent *cursor = file->private_data;
1636 struct list_head *p; 1636 struct list_head *p;
1637 loff_t n = file->f_pos - 2; 1637 loff_t n = file->f_pos - 2;
1638 1638
1639 spin_lock(&configfs_dirent_lock); 1639 spin_lock(&configfs_dirent_lock);
1640 list_del(&cursor->s_sibling); 1640 list_del(&cursor->s_sibling);
1641 p = sd->s_children.next; 1641 p = sd->s_children.next;
1642 while (n && p != &sd->s_children) { 1642 while (n && p != &sd->s_children) {
1643 struct configfs_dirent *next; 1643 struct configfs_dirent *next;
1644 next = list_entry(p, struct configfs_dirent, 1644 next = list_entry(p, struct configfs_dirent,
1645 s_sibling); 1645 s_sibling);
1646 if (next->s_element) 1646 if (next->s_element)
1647 n--; 1647 n--;
1648 p = p->next; 1648 p = p->next;
1649 } 1649 }
1650 list_add_tail(&cursor->s_sibling, p); 1650 list_add_tail(&cursor->s_sibling, p);
1651 spin_unlock(&configfs_dirent_lock); 1651 spin_unlock(&configfs_dirent_lock);
1652 } 1652 }
1653 } 1653 }
1654 mutex_unlock(&dentry->d_inode->i_mutex); 1654 mutex_unlock(&dentry->d_inode->i_mutex);
1655 return offset; 1655 return offset;
1656 } 1656 }
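
The bare 0 and 1 in the switch above are the standard whence values: 0 is SEEK_SET (absolute) and 1 is SEEK_CUR (relative); SEEK_END falls through to the default case and yields -EINVAL. A small userspace sketch against a hypothetical directory path, using only the standard calls:

    #include <fcntl.h>
    #include <unistd.h>

    /* Open a directory and rewind its read position; returns the fd or -1. */
    int open_dir_rewound(const char *path)
    {
            int fd = open(path, O_RDONLY | O_DIRECTORY);
            if (fd < 0)
                    return -1;
            if (lseek(fd, 0, SEEK_SET) < 0) {   /* whence == 0 in the switch above */
                    close(fd);
                    return -1;
            }
            return fd;
    }
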
1657 1657
1658 const struct file_operations configfs_dir_operations = { 1658 const struct file_operations configfs_dir_operations = {
1659 .open = configfs_dir_open, 1659 .open = configfs_dir_open,
1660 .release = configfs_dir_close, 1660 .release = configfs_dir_close,
1661 .llseek = configfs_dir_lseek, 1661 .llseek = configfs_dir_lseek,
1662 .read = generic_read_dir, 1662 .read = generic_read_dir,
1663 .readdir = configfs_readdir, 1663 .readdir = configfs_readdir,
1664 }; 1664 };
1665 1665
1666 int configfs_register_subsystem(struct configfs_subsystem *subsys) 1666 int configfs_register_subsystem(struct configfs_subsystem *subsys)
1667 { 1667 {
1668 int err; 1668 int err;
1669 struct config_group *group = &subsys->su_group; 1669 struct config_group *group = &subsys->su_group;
1670 struct qstr name; 1670 struct qstr name;
1671 struct dentry *dentry; 1671 struct dentry *dentry;
1672 struct dentry *root; 1672 struct dentry *root;
1673 struct configfs_dirent *sd; 1673 struct configfs_dirent *sd;
1674 1674
1675 root = configfs_pin_fs(); 1675 root = configfs_pin_fs();
1676 if (IS_ERR(root)) 1676 if (IS_ERR(root))
1677 return PTR_ERR(root); 1677 return PTR_ERR(root);
1678 1678
1679 if (!group->cg_item.ci_name) 1679 if (!group->cg_item.ci_name)
1680 group->cg_item.ci_name = group->cg_item.ci_namebuf; 1680 group->cg_item.ci_name = group->cg_item.ci_namebuf;
1681 1681
1682 sd = root->d_fsdata; 1682 sd = root->d_fsdata;
1683 link_group(to_config_group(sd->s_element), group); 1683 link_group(to_config_group(sd->s_element), group);
1684 1684
1685 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); 1685 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
1686 1686
1687 name.name = group->cg_item.ci_name; 1687 name.name = group->cg_item.ci_name;
1688 name.len = strlen(name.name); 1688 name.len = strlen(name.name);
1689 name.hash = full_name_hash(name.name, name.len); 1689 name.hash = full_name_hash(name.name, name.len);
1690 1690
1691 err = -ENOMEM; 1691 err = -ENOMEM;
1692 dentry = d_alloc(root, &name); 1692 dentry = d_alloc(root, &name);
1693 if (dentry) { 1693 if (dentry) {
1694 d_add(dentry, NULL); 1694 d_add(dentry, NULL);
1695 1695
1696 err = configfs_attach_group(sd->s_element, &group->cg_item, 1696 err = configfs_attach_group(sd->s_element, &group->cg_item,
1697 dentry); 1697 dentry);
1698 if (err) { 1698 if (err) {
1699 BUG_ON(dentry->d_inode); 1699 BUG_ON(dentry->d_inode);
1700 d_drop(dentry); 1700 d_drop(dentry);
1701 dput(dentry); 1701 dput(dentry);
1702 } else { 1702 } else {
1703 spin_lock(&configfs_dirent_lock); 1703 spin_lock(&configfs_dirent_lock);
1704 configfs_dir_set_ready(dentry->d_fsdata); 1704 configfs_dir_set_ready(dentry->d_fsdata);
1705 spin_unlock(&configfs_dirent_lock); 1705 spin_unlock(&configfs_dirent_lock);
1706 } 1706 }
1707 } 1707 }
1708 1708
1709 mutex_unlock(&root->d_inode->i_mutex); 1709 mutex_unlock(&root->d_inode->i_mutex);
1710 1710
1711 if (err) { 1711 if (err) {
1712 unlink_group(group); 1712 unlink_group(group);
1713 configfs_release_fs(); 1713 configfs_release_fs();
1714 } 1714 }
1715 1715
1716 return err; 1716 return err;
1717 } 1717 }
1718 1718
1719 void configfs_unregister_subsystem(struct configfs_subsystem *subsys) 1719 void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1720 { 1720 {
1721 struct config_group *group = &subsys->su_group; 1721 struct config_group *group = &subsys->su_group;
1722 struct dentry *dentry = group->cg_item.ci_dentry; 1722 struct dentry *dentry = group->cg_item.ci_dentry;
1723 struct dentry *root = dentry->d_sb->s_root; 1723 struct dentry *root = dentry->d_sb->s_root;
1724 1724
1725 if (dentry->d_parent != root) { 1725 if (dentry->d_parent != root) {
1726 printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); 1726 printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
1727 return; 1727 return;
1728 } 1728 }
1729 1729
1730 mutex_lock_nested(&root->d_inode->i_mutex, 1730 mutex_lock_nested(&root->d_inode->i_mutex,
1731 I_MUTEX_PARENT); 1731 I_MUTEX_PARENT);
1732 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 1732 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
1733 mutex_lock(&configfs_symlink_mutex); 1733 mutex_lock(&configfs_symlink_mutex);
1734 spin_lock(&configfs_dirent_lock); 1734 spin_lock(&configfs_dirent_lock);
1735 if (configfs_detach_prep(dentry, NULL)) { 1735 if (configfs_detach_prep(dentry, NULL)) {
1736 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); 1736 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
1737 } 1737 }
1738 spin_unlock(&configfs_dirent_lock); 1738 spin_unlock(&configfs_dirent_lock);
1739 mutex_unlock(&configfs_symlink_mutex); 1739 mutex_unlock(&configfs_symlink_mutex);
1740 configfs_detach_group(&group->cg_item); 1740 configfs_detach_group(&group->cg_item);
1741 dentry->d_inode->i_flags |= S_DEAD; 1741 dentry->d_inode->i_flags |= S_DEAD;
1742 dont_mount(dentry); 1742 dont_mount(dentry);
1743 mutex_unlock(&dentry->d_inode->i_mutex); 1743 mutex_unlock(&dentry->d_inode->i_mutex);
1744 1744
1745 d_delete(dentry); 1745 d_delete(dentry);
1746 1746
1747 mutex_unlock(&root->d_inode->i_mutex); 1747 mutex_unlock(&root->d_inode->i_mutex);
1748 1748
1749 dput(dentry); 1749 dput(dentry);
1750 1750
1751 unlink_group(group); 1751 unlink_group(group);
1752 configfs_release_fs(); 1752 configfs_release_fs();
1753 } 1753 }
1754 1754
1755 EXPORT_SYMBOL(configfs_register_subsystem); 1755 EXPORT_SYMBOL(configfs_register_subsystem);
1756 EXPORT_SYMBOL(configfs_unregister_subsystem); 1756 EXPORT_SYMBOL(configfs_unregister_subsystem);
1757 1757
1 /* 1 /*
2 * linux/fs/ext3/dir.c 2 * linux/fs/ext3/dir.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/dir.c 11 * linux/fs/minix/dir.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * ext3 directory handling functions 15 * ext3 directory handling functions
16 * 16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips 20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 * 21 *
22 */ 22 */
23 23
24 #include <linux/compat.h> 24 #include <linux/compat.h>
25 #include "ext3.h" 25 #include "ext3.h"
26 26
27 static unsigned char ext3_filetype_table[] = { 27 static unsigned char ext3_filetype_table[] = {
28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
29 }; 29 };
30 30
31 static int ext3_dx_readdir(struct file * filp, 31 static int ext3_dx_readdir(struct file * filp,
32 void * dirent, filldir_t filldir); 32 void * dirent, filldir_t filldir);
33 33
34 static unsigned char get_dtype(struct super_block *sb, int filetype) 34 static unsigned char get_dtype(struct super_block *sb, int filetype)
35 { 35 {
36 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || 36 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
37 (filetype >= EXT3_FT_MAX)) 37 (filetype >= EXT3_FT_MAX))
38 return DT_UNKNOWN; 38 return DT_UNKNOWN;
39 39
40 return (ext3_filetype_table[filetype]); 40 return (ext3_filetype_table[filetype]);
41 } 41 }
42 42
43 /** 43 /**
44 * Check if the given dir-inode refers to an htree-indexed directory 44 * Check if the given dir-inode refers to an htree-indexed directory
45 * (or a directory which could potentially get converted to use htree 45 * (or a directory which could potentially get converted to use htree
46 * indexing). 46 * indexing).
47 * 47 *
48 * Return 1 if it is a dx dir, 0 if not 48 * Return 1 if it is a dx dir, 0 if not
49 */ 49 */
50 static int is_dx_dir(struct inode *inode) 50 static int is_dx_dir(struct inode *inode)
51 { 51 {
52 struct super_block *sb = inode->i_sb; 52 struct super_block *sb = inode->i_sb;
53 53
54 if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, 54 if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
55 EXT3_FEATURE_COMPAT_DIR_INDEX) && 55 EXT3_FEATURE_COMPAT_DIR_INDEX) &&
56 ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || 56 ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
57 ((inode->i_size >> sb->s_blocksize_bits) == 1))) 57 ((inode->i_size >> sb->s_blocksize_bits) == 1)))
58 return 1; 58 return 1;
59 59
60 return 0; 60 return 0;
61 } 61 }
62 62
63 int ext3_check_dir_entry (const char * function, struct inode * dir, 63 int ext3_check_dir_entry (const char * function, struct inode * dir,
64 struct ext3_dir_entry_2 * de, 64 struct ext3_dir_entry_2 * de,
65 struct buffer_head * bh, 65 struct buffer_head * bh,
66 unsigned long offset) 66 unsigned long offset)
67 { 67 {
68 const char * error_msg = NULL; 68 const char * error_msg = NULL;
69 const int rlen = ext3_rec_len_from_disk(de->rec_len); 69 const int rlen = ext3_rec_len_from_disk(de->rec_len);
70 70
71 if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) 71 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
72 error_msg = "rec_len is smaller than minimal"; 72 error_msg = "rec_len is smaller than minimal";
73 else if (unlikely(rlen % 4 != 0)) 73 else if (unlikely(rlen % 4 != 0))
74 error_msg = "rec_len % 4 != 0"; 74 error_msg = "rec_len % 4 != 0";
75 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) 75 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
76 error_msg = "rec_len is too small for name_len"; 76 error_msg = "rec_len is too small for name_len";
77 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) 77 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
78 error_msg = "directory entry across blocks"; 78 error_msg = "directory entry across blocks";
79 else if (unlikely(le32_to_cpu(de->inode) > 79 else if (unlikely(le32_to_cpu(de->inode) >
80 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) 80 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
81 error_msg = "inode out of bounds"; 81 error_msg = "inode out of bounds";
82 82
83 if (unlikely(error_msg != NULL)) 83 if (unlikely(error_msg != NULL))
84 ext3_error (dir->i_sb, function, 84 ext3_error (dir->i_sb, function,
85 "bad entry in directory #%lu: %s - " 85 "bad entry in directory #%lu: %s - "
86 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 86 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
87 dir->i_ino, error_msg, offset, 87 dir->i_ino, error_msg, offset,
88 (unsigned long) le32_to_cpu(de->inode), 88 (unsigned long) le32_to_cpu(de->inode),
89 rlen, de->name_len); 89 rlen, de->name_len);
90 90
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92 } 92 }
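
For intuition on the minimal-length check: an ext3 directory entry is an 8-byte header (4-byte inode, 2-byte rec_len, 1-byte name_len, 1-byte file_type) followed by the name, rounded up to a multiple of 4. Assuming that layout, EXT3_DIR_REC_LEN can be restated and checked standalone (DIR_REC_LEN below is a local restatement, not the kernel macro):

    #include <assert.h>

    #define DIR_REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)

    int main(void)
    {
            assert(DIR_REC_LEN(1) == 12);   /* smallest legal rec_len */
            assert(DIR_REC_LEN(4) == 12);   /* still fits in the same slot */
            assert(DIR_REC_LEN(5) == 16);   /* next 4-byte step */
            return 0;
    }
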
93 93
94 static int ext3_readdir(struct file * filp, 94 static int ext3_readdir(struct file * filp,
95 void * dirent, filldir_t filldir) 95 void * dirent, filldir_t filldir)
96 { 96 {
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned long offset;
99 int i, stored; 99 int i, stored;
100 struct ext3_dir_entry_2 *de; 100 struct ext3_dir_entry_2 *de;
101 int err; 101 int err;
102 struct inode *inode = filp->f_path.dentry->d_inode; 102 struct inode *inode = filp->f_path.dentry->d_inode;
103 struct super_block *sb = inode->i_sb; 103 struct super_block *sb = inode->i_sb;
104 int ret = 0; 104 int ret = 0;
105 int dir_has_error = 0; 105 int dir_has_error = 0;
106 106
107 if (is_dx_dir(inode)) { 107 if (is_dx_dir(inode)) {
108 err = ext3_dx_readdir(filp, dirent, filldir); 108 err = ext3_dx_readdir(filp, dirent, filldir);
109 if (err != ERR_BAD_DX_DIR) { 109 if (err != ERR_BAD_DX_DIR) {
110 ret = err; 110 ret = err;
111 goto out; 111 goto out;
112 } 112 }
113 /* 113 /*
114 * We don't set the inode dirty flag since it's not 114 * We don't set the inode dirty flag since it's not
115 * critical that it get flushed back to the disk. 115 * critical that it get flushed back to the disk.
116 */ 116 */
117 EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; 117 EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
118 } 118 }
119 stored = 0; 119 stored = 0;
120 offset = filp->f_pos & (sb->s_blocksize - 1); 120 offset = filp->f_pos & (sb->s_blocksize - 1);
121 121
122 while (!error && !stored && filp->f_pos < inode->i_size) { 122 while (!error && !stored && filp->f_pos < inode->i_size) {
123 unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); 123 unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
124 struct buffer_head map_bh; 124 struct buffer_head map_bh;
125 struct buffer_head *bh = NULL; 125 struct buffer_head *bh = NULL;
126 126
127 map_bh.b_state = 0; 127 map_bh.b_state = 0;
128 err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); 128 err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
129 if (err > 0) { 129 if (err > 0) {
130 pgoff_t index = map_bh.b_blocknr >> 130 pgoff_t index = map_bh.b_blocknr >>
131 (PAGE_CACHE_SHIFT - inode->i_blkbits); 131 (PAGE_CACHE_SHIFT - inode->i_blkbits);
132 if (!ra_has_index(&filp->f_ra, index)) 132 if (!ra_has_index(&filp->f_ra, index))
133 page_cache_sync_readahead( 133 page_cache_sync_readahead(
134 sb->s_bdev->bd_inode->i_mapping, 134 sb->s_bdev->bd_inode->i_mapping,
135 &filp->f_ra, filp, 135 &filp->f_ra, filp,
136 index, 1); 136 index, 1);
137 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 137 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
138 bh = ext3_bread(NULL, inode, blk, 0, &err); 138 bh = ext3_bread(NULL, inode, blk, 0, &err);
139 } 139 }
140 140
141 /* 141 /*
142 * We ignore I/O errors on directories so users have a chance 142 * We ignore I/O errors on directories so users have a chance
143 * of recovering data when there's a bad sector 143 * of recovering data when there's a bad sector
144 */ 144 */
145 if (!bh) { 145 if (!bh) {
146 if (!dir_has_error) { 146 if (!dir_has_error) {
147 ext3_error(sb, __func__, "directory #%lu " 147 ext3_error(sb, __func__, "directory #%lu "
148 "contains a hole at offset %lld", 148 "contains a hole at offset %lld",
149 inode->i_ino, filp->f_pos); 149 inode->i_ino, filp->f_pos);
150 dir_has_error = 1; 150 dir_has_error = 1;
151 } 151 }
152 /* corrupt size? Maybe no more blocks to read */ 152 /* corrupt size? Maybe no more blocks to read */
153 if (filp->f_pos > inode->i_blocks << 9) 153 if (filp->f_pos > inode->i_blocks << 9)
154 break; 154 break;
155 filp->f_pos += sb->s_blocksize - offset; 155 filp->f_pos += sb->s_blocksize - offset;
156 continue; 156 continue;
157 } 157 }
158 158
159 revalidate: 159 revalidate:
160 /* If the dir block has changed since the last call to 160 /* If the dir block has changed since the last call to
161 * readdir(2), then we might be pointing to an invalid 161 * readdir(2), then we might be pointing to an invalid
162 * dirent right now. Scan from the start of the block 162 * dirent right now. Scan from the start of the block
163 * to make sure. */ 163 * to make sure. */
164 if (filp->f_version != inode->i_version) { 164 if (filp->f_version != inode->i_version) {
165 for (i = 0; i < sb->s_blocksize && i < offset; ) { 165 for (i = 0; i < sb->s_blocksize && i < offset; ) {
166 de = (struct ext3_dir_entry_2 *) 166 de = (struct ext3_dir_entry_2 *)
167 (bh->b_data + i); 167 (bh->b_data + i);
168 /* It's too expensive to do a full 168 /* It's too expensive to do a full
169 * dirent test each time round this 169 * dirent test each time round this
170 * loop, but we do have to test at 170 * loop, but we do have to test at
171 * least that it is non-zero. A 171 * least that it is non-zero. A
172 * failure will be detected in the 172 * failure will be detected in the
173 * dirent test below. */ 173 * dirent test below. */
174 if (ext3_rec_len_from_disk(de->rec_len) < 174 if (ext3_rec_len_from_disk(de->rec_len) <
175 EXT3_DIR_REC_LEN(1)) 175 EXT3_DIR_REC_LEN(1))
176 break; 176 break;
177 i += ext3_rec_len_from_disk(de->rec_len); 177 i += ext3_rec_len_from_disk(de->rec_len);
178 } 178 }
179 offset = i; 179 offset = i;
180 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 180 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
181 | offset; 181 | offset;
182 filp->f_version = inode->i_version; 182 filp->f_version = inode->i_version;
183 } 183 }
184 184
185 while (!error && filp->f_pos < inode->i_size 185 while (!error && filp->f_pos < inode->i_size
186 && offset < sb->s_blocksize) { 186 && offset < sb->s_blocksize) {
187 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); 187 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
188 if (!ext3_check_dir_entry ("ext3_readdir", inode, de, 188 if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
189 bh, offset)) { 189 bh, offset)) {
190 /* On error, skip the f_pos to the 190 /* On error, skip the f_pos to the
191 next block. */ 191 next block. */
192 filp->f_pos = (filp->f_pos | 192 filp->f_pos = (filp->f_pos |
193 (sb->s_blocksize - 1)) + 1; 193 (sb->s_blocksize - 1)) + 1;
194 brelse (bh); 194 brelse (bh);
195 ret = stored; 195 ret = stored;
196 goto out; 196 goto out;
197 } 197 }
198 offset += ext3_rec_len_from_disk(de->rec_len); 198 offset += ext3_rec_len_from_disk(de->rec_len);
199 if (le32_to_cpu(de->inode)) { 199 if (le32_to_cpu(de->inode)) {
200 /* We might block in the next section 200 /* We might block in the next section
201 * if the data destination is 201 * if the data destination is
202 * currently swapped out. So, use a 202 * currently swapped out. So, use a
203 * version stamp to detect whether or 203 * version stamp to detect whether or
204 * not the directory has been modified 204 * not the directory has been modified
205 * during the copy operation. 205 * during the copy operation.
206 */ 206 */
207 u64 version = filp->f_version; 207 u64 version = filp->f_version;
208 208
209 error = filldir(dirent, de->name, 209 error = filldir(dirent, de->name,
210 de->name_len, 210 de->name_len,
211 filp->f_pos, 211 filp->f_pos,
212 le32_to_cpu(de->inode), 212 le32_to_cpu(de->inode),
213 get_dtype(sb, de->file_type)); 213 get_dtype(sb, de->file_type));
214 if (error) 214 if (error)
215 break; 215 break;
216 if (version != filp->f_version) 216 if (version != filp->f_version)
217 goto revalidate; 217 goto revalidate;
218 stored ++; 218 stored ++;
219 } 219 }
220 filp->f_pos += ext3_rec_len_from_disk(de->rec_len); 220 filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
221 } 221 }
222 offset = 0; 222 offset = 0;
223 brelse (bh); 223 brelse (bh);
224 } 224 }
225 out: 225 out:
226 return ret; 226 return ret;
227 } 227 }
228 228
229 static inline int is_32bit_api(void) 229 static inline int is_32bit_api(void)
230 { 230 {
231 #ifdef CONFIG_COMPAT 231 #ifdef CONFIG_COMPAT
232 return is_compat_task(); 232 return is_compat_task();
233 #else 233 #else
234 return (BITS_PER_LONG == 32); 234 return (BITS_PER_LONG == 32);
235 #endif 235 #endif
236 } 236 }
237 237
238 /* 238 /*
239 * These functions convert from the major/minor hash to an f_pos 239 * These functions convert from the major/minor hash to an f_pos
240 * value for dx directories 240 * value for dx directories
241 * 241 *
242 * Upper layer (for example NFS) should specify FMODE_32BITHASH or 242 * Upper layer (for example NFS) should specify FMODE_32BITHASH or
243 * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted 243 * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
244 * directly on both 32-bit and 64-bit nodes, in which case neither 244 * directly on both 32-bit and 64-bit nodes, in which case neither
245 * FMODE_32BITHASH nor FMODE_64BITHASH is specified. 245 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
246 */ 246 */
247 static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) 247 static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
248 { 248 {
249 if ((filp->f_mode & FMODE_32BITHASH) || 249 if ((filp->f_mode & FMODE_32BITHASH) ||
250 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 250 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
251 return major >> 1; 251 return major >> 1;
252 else 252 else
253 return ((__u64)(major >> 1) << 32) | (__u64)minor; 253 return ((__u64)(major >> 1) << 32) | (__u64)minor;
254 } 254 }
255 255
256 static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) 256 static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
257 { 257 {
258 if ((filp->f_mode & FMODE_32BITHASH) || 258 if ((filp->f_mode & FMODE_32BITHASH) ||
259 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 259 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
260 return (pos << 1) & 0xffffffff; 260 return (pos << 1) & 0xffffffff;
261 else 261 else
262 return ((pos >> 32) << 1) & 0xffffffff; 262 return ((pos >> 32) << 1) & 0xffffffff;
263 } 263 }
264 264
265 static inline __u32 pos2min_hash(struct file *filp, loff_t pos) 265 static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
266 { 266 {
267 if ((filp->f_mode & FMODE_32BITHASH) || 267 if ((filp->f_mode & FMODE_32BITHASH) ||
268 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 268 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
269 return 0; 269 return 0;
270 else 270 else
271 return pos & 0xffffffff; 271 return pos & 0xffffffff;
272 } 272 }
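
In the 64-bit branch, hash2pos() packs the major hash (minus its low bit) into the high 32 bits of f_pos and the minor hash into the low 32, and pos2maj_hash()/pos2min_hash() invert that. A standalone arithmetic check of the round trip (using an even major hash, since the low bit is deliberately dropped):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t major = 0xdeadbeee;    /* even: low bit unused */
            uint32_t minor = 0x12345678;
            uint64_t pos = ((uint64_t)(major >> 1) << 32) | minor;

            assert((((pos >> 32) << 1) & 0xffffffff) == major);   /* pos2maj_hash */
            assert((pos & 0xffffffff) == minor);                  /* pos2min_hash */
            return 0;
    }
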
273 273
274 /* 274 /*
275 * Return 32- or 64-bit end-of-file for dx directories 275 * Return 32- or 64-bit end-of-file for dx directories
276 */ 276 */
277 static inline loff_t ext3_get_htree_eof(struct file *filp) 277 static inline loff_t ext3_get_htree_eof(struct file *filp)
278 { 278 {
279 if ((filp->f_mode & FMODE_32BITHASH) || 279 if ((filp->f_mode & FMODE_32BITHASH) ||
280 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 280 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
281 return EXT3_HTREE_EOF_32BIT; 281 return EXT3_HTREE_EOF_32BIT;
282 else 282 else
283 return EXT3_HTREE_EOF_64BIT; 283 return EXT3_HTREE_EOF_64BIT;
284 } 284 }
285 285
286 286
287 /* 287 /*
288 * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both 288 * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
289 * non-htree and htree directories, where the "offset" is in terms 289 * non-htree and htree directories, where the "offset" is in terms
290 * of the filename hash value instead of the byte offset. 290 * of the filename hash value instead of the byte offset.
291 * 291 *
292 * Because we may return a 64-bit hash that is well beyond s_maxbytes, 292 * Because we may return a 64-bit hash that is well beyond s_maxbytes,
293 * we need to pass the max hash as the maximum allowable offset in 293 * we need to pass the max hash as the maximum allowable offset in
294 * the htree directory case. 294 * the htree directory case.
295 * 295 *
296 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) 296 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
297 * will be invalid once the directory has been converted into a dx directory 297 * will be invalid once the directory has been converted into a dx directory
298 */ 298 */
299 loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) 299 loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
300 { 300 {
301 struct inode *inode = file->f_mapping->host; 301 struct inode *inode = file->f_mapping->host;
302 int dx_dir = is_dx_dir(inode); 302 int dx_dir = is_dx_dir(inode);
303 loff_t htree_max = ext3_get_htree_eof(file); 303 loff_t htree_max = ext3_get_htree_eof(file);
304 304
305 if (likely(dx_dir)) 305 if (likely(dx_dir))
306 return generic_file_llseek_size(file, offset, origin, 306 return generic_file_llseek_size(file, offset, whence,
307 htree_max, htree_max); 307 htree_max, htree_max);
308 else 308 else
309 return generic_file_llseek(file, offset, origin); 309 return generic_file_llseek(file, offset, whence);
310 } 310 }
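
From userspace these hash offsets are only ever seen as the opaque cookies of telldir(3)/seekdir(3): a position saved and restored keeps working even when, on an htree directory, it encodes a filename hash rather than a byte offset. A minimal sketch:

    #include <dirent.h>
    #include <stdio.h>

    int main(void)
    {
            DIR *d = opendir(".");
            if (!d)
                    return 1;
            readdir(d);                     /* consume one entry */
            long pos = telldir(d);          /* opaque cookie: hash or byte offset */
            readdir(d);                     /* move past it */
            seekdir(d, pos);                /* resume at the saved cookie */
            printf("resumed at %ld\n", pos);
            closedir(d);
            return 0;
    }
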
311 311
312 /* 312 /*
313 * This structure holds the nodes of the red-black tree used to store 313 * This structure holds the nodes of the red-black tree used to store
314 * the directory entry in hash order. 314 * the directory entry in hash order.
315 */ 315 */
316 struct fname { 316 struct fname {
317 __u32 hash; 317 __u32 hash;
318 __u32 minor_hash; 318 __u32 minor_hash;
319 struct rb_node rb_hash; 319 struct rb_node rb_hash;
320 struct fname *next; 320 struct fname *next;
321 __u32 inode; 321 __u32 inode;
322 __u8 name_len; 322 __u8 name_len;
323 __u8 file_type; 323 __u8 file_type;
324 char name[0]; 324 char name[0];
325 }; 325 };
326 326
327 /* 327 /*
328 * This function implements a non-recursive way of freeing all of the 328 * This function implements a non-recursive way of freeing all of the
329 * nodes in the red-black tree. 329 * nodes in the red-black tree.
330 */ 330 */
331 static void free_rb_tree_fname(struct rb_root *root) 331 static void free_rb_tree_fname(struct rb_root *root)
332 { 332 {
333 struct rb_node *n = root->rb_node; 333 struct rb_node *n = root->rb_node;
334 struct rb_node *parent; 334 struct rb_node *parent;
335 struct fname *fname; 335 struct fname *fname;
336 336
337 while (n) { 337 while (n) {
338 /* Do the node's children first */ 338 /* Do the node's children first */
339 if (n->rb_left) { 339 if (n->rb_left) {
340 n = n->rb_left; 340 n = n->rb_left;
341 continue; 341 continue;
342 } 342 }
343 if (n->rb_right) { 343 if (n->rb_right) {
344 n = n->rb_right; 344 n = n->rb_right;
345 continue; 345 continue;
346 } 346 }
347 /* 347 /*
348 * The node has no children; free it, and then zero 348 * The node has no children; free it, and then zero
349 * out parent's link to it. Finally go to the 349 * out parent's link to it. Finally go to the
350 * beginning of the loop and try to free the parent 350 * beginning of the loop and try to free the parent
351 * node. 351 * node.
352 */ 352 */
353 parent = rb_parent(n); 353 parent = rb_parent(n);
354 fname = rb_entry(n, struct fname, rb_hash); 354 fname = rb_entry(n, struct fname, rb_hash);
355 while (fname) { 355 while (fname) {
356 struct fname *old = fname; 356 struct fname *old = fname;
357 fname = fname->next; 357 fname = fname->next;
358 kfree(old); 358 kfree(old);
359 } 359 }
360 if (!parent) 360 if (!parent)
361 *root = RB_ROOT; 361 *root = RB_ROOT;
362 else if (parent->rb_left == n) 362 else if (parent->rb_left == n)
363 parent->rb_left = NULL; 363 parent->rb_left = NULL;
364 else if (parent->rb_right == n) 364 else if (parent->rb_right == n)
365 parent->rb_right = NULL; 365 parent->rb_right = NULL;
366 n = parent; 366 n = parent;
367 } 367 }
368 } 368 }
369 369
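The walk above is easier to follow outside the rb-tree machinery: descend to a childless node, free it, null the parent's link so it is never revisited, then climb back up. A self-contained user-space sketch of the same bottom-up free, with a hypothetical struct node standing in for rb_node and its helpers:

    #include <stdlib.h>

    struct node {
        struct node *left, *right, *parent;
    };

    static void free_tree(struct node **root)
    {
        struct node *n = *root;

        while (n) {
            if (n->left) {          /* descend before freeing */
                n = n->left;
                continue;
            }
            if (n->right) {
                n = n->right;
                continue;
            }
            /* childless node: free it and clear the parent's
             * link so the loop never walks back into it */
            struct node *parent = n->parent;
            if (!parent)
                *root = NULL;
            else if (parent->left == n)
                parent->left = NULL;
            else
                parent->right = NULL;
            free(n);
            n = parent;
        }
    }

    int main(void)
    {
        /* build a root with two leaves, then free everything */
        struct node *r = calloc(1, sizeof(*r));
        r->left = calloc(1, sizeof(*r));
        r->right = calloc(1, sizeof(*r));
        r->left->parent = r->right->parent = r;
        free_tree(&r);
        return r ? 1 : 0;
    }

Because the parent link is nulled as each node is freed, the traversal needs no recursion and no auxiliary stack, so it is safe for arbitrarily deep trees.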
370 370
371 static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, 371 static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
372 loff_t pos) 372 loff_t pos)
373 { 373 {
374 struct dir_private_info *p; 374 struct dir_private_info *p;
375 375
376 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); 376 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
377 if (!p) 377 if (!p)
378 return NULL; 378 return NULL;
379 p->curr_hash = pos2maj_hash(filp, pos); 379 p->curr_hash = pos2maj_hash(filp, pos);
380 p->curr_minor_hash = pos2min_hash(filp, pos); 380 p->curr_minor_hash = pos2min_hash(filp, pos);
381 return p; 381 return p;
382 } 382 }
383 383
384 void ext3_htree_free_dir_info(struct dir_private_info *p) 384 void ext3_htree_free_dir_info(struct dir_private_info *p)
385 { 385 {
386 free_rb_tree_fname(&p->root); 386 free_rb_tree_fname(&p->root);
387 kfree(p); 387 kfree(p);
388 } 388 }
389 389
390 /* 390 /*
391 * Given a directory entry, enter it into the fname rb tree. 391 * Given a directory entry, enter it into the fname rb tree.
392 */ 392 */
393 int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, 393 int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
394 __u32 minor_hash, 394 __u32 minor_hash,
395 struct ext3_dir_entry_2 *dirent) 395 struct ext3_dir_entry_2 *dirent)
396 { 396 {
397 struct rb_node **p, *parent = NULL; 397 struct rb_node **p, *parent = NULL;
398 struct fname *fname, *new_fn; 398 struct fname *fname, *new_fn;
399 struct dir_private_info *info; 399 struct dir_private_info *info;
400 int len; 400 int len;
401 401
402 info = (struct dir_private_info *) dir_file->private_data; 402 info = (struct dir_private_info *) dir_file->private_data;
403 p = &info->root.rb_node; 403 p = &info->root.rb_node;
404 404
405 /* Allocate and fill in the fname structure */ 405 /* Allocate and fill in the fname structure */
406 len = sizeof(struct fname) + dirent->name_len + 1; 406 len = sizeof(struct fname) + dirent->name_len + 1;
407 new_fn = kzalloc(len, GFP_KERNEL); 407 new_fn = kzalloc(len, GFP_KERNEL);
408 if (!new_fn) 408 if (!new_fn)
409 return -ENOMEM; 409 return -ENOMEM;
410 new_fn->hash = hash; 410 new_fn->hash = hash;
411 new_fn->minor_hash = minor_hash; 411 new_fn->minor_hash = minor_hash;
412 new_fn->inode = le32_to_cpu(dirent->inode); 412 new_fn->inode = le32_to_cpu(dirent->inode);
413 new_fn->name_len = dirent->name_len; 413 new_fn->name_len = dirent->name_len;
414 new_fn->file_type = dirent->file_type; 414 new_fn->file_type = dirent->file_type;
415 memcpy(new_fn->name, dirent->name, dirent->name_len); 415 memcpy(new_fn->name, dirent->name, dirent->name_len);
416 new_fn->name[dirent->name_len] = 0; 416 new_fn->name[dirent->name_len] = 0;
417 417
418 while (*p) { 418 while (*p) {
419 parent = *p; 419 parent = *p;
420 fname = rb_entry(parent, struct fname, rb_hash); 420 fname = rb_entry(parent, struct fname, rb_hash);
421 421
422 /* 422 /*
423 * If the hash and minor hash match up, then we put 423 * If the hash and minor hash match up, then we put
424 * them on a linked list. This rarely happens... 424 * them on a linked list. This rarely happens...
425 */ 425 */
426 if ((new_fn->hash == fname->hash) && 426 if ((new_fn->hash == fname->hash) &&
427 (new_fn->minor_hash == fname->minor_hash)) { 427 (new_fn->minor_hash == fname->minor_hash)) {
428 new_fn->next = fname->next; 428 new_fn->next = fname->next;
429 fname->next = new_fn; 429 fname->next = new_fn;
430 return 0; 430 return 0;
431 } 431 }
432 432
433 if (new_fn->hash < fname->hash) 433 if (new_fn->hash < fname->hash)
434 p = &(*p)->rb_left; 434 p = &(*p)->rb_left;
435 else if (new_fn->hash > fname->hash) 435 else if (new_fn->hash > fname->hash)
436 p = &(*p)->rb_right; 436 p = &(*p)->rb_right;
437 else if (new_fn->minor_hash < fname->minor_hash) 437 else if (new_fn->minor_hash < fname->minor_hash)
438 p = &(*p)->rb_left; 438 p = &(*p)->rb_left;
439 else /* if (new_fn->minor_hash > fname->minor_hash) */ 439 else /* if (new_fn->minor_hash > fname->minor_hash) */
440 p = &(*p)->rb_right; 440 p = &(*p)->rb_right;
441 } 441 }
442 442
443 rb_link_node(&new_fn->rb_hash, parent, p); 443 rb_link_node(&new_fn->rb_hash, parent, p);
444 rb_insert_color(&new_fn->rb_hash, &info->root); 444 rb_insert_color(&new_fn->rb_hash, &info->root);
445 return 0; 445 return 0;
446 } 446 }
447 447
448 448
449 449
450 /* 450 /*
451 * This is a helper function for ext3_dx_readdir. It calls filldir 451 * This is a helper function for ext3_dx_readdir. It calls filldir
452 * for all entries on the fname linked list. (Normally there is only 452 * for all entries on the fname linked list. (Normally there is only
453 * one entry on the linked list, unless there are 62 bit hash collisions.) 453 * one entry on the linked list, unless there are 62 bit hash collisions.)
454 */ 454 */
455 static int call_filldir(struct file *filp, void *dirent, 455 static int call_filldir(struct file *filp, void *dirent,
456 filldir_t filldir, struct fname *fname) 456 filldir_t filldir, struct fname *fname)
457 { 457 {
458 struct dir_private_info *info = filp->private_data; 458 struct dir_private_info *info = filp->private_data;
459 loff_t curr_pos; 459 loff_t curr_pos;
460 struct inode *inode = filp->f_path.dentry->d_inode; 460 struct inode *inode = filp->f_path.dentry->d_inode;
461 struct super_block *sb; 461 struct super_block *sb;
462 int error; 462 int error;
463 463
464 sb = inode->i_sb; 464 sb = inode->i_sb;
465 465
466 if (!fname) { 466 if (!fname) {
467 printk("call_filldir: called with null fname?!?\n"); 467 printk("call_filldir: called with null fname?!?\n");
468 return 0; 468 return 0;
469 } 469 }
470 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); 470 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
471 while (fname) { 471 while (fname) {
472 error = filldir(dirent, fname->name, 472 error = filldir(dirent, fname->name,
473 fname->name_len, curr_pos, 473 fname->name_len, curr_pos,
474 fname->inode, 474 fname->inode,
475 get_dtype(sb, fname->file_type)); 475 get_dtype(sb, fname->file_type));
476 if (error) { 476 if (error) {
477 filp->f_pos = curr_pos; 477 filp->f_pos = curr_pos;
478 info->extra_fname = fname; 478 info->extra_fname = fname;
479 return error; 479 return error;
480 } 480 }
481 fname = fname->next; 481 fname = fname->next;
482 } 482 }
483 return 0; 483 return 0;
484 } 484 }
485 485
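The contract call_filldir depends on: the filldir callback returns 0 to keep going and non-zero to stop, at which point the current hash position and the unconsumed fname are saved so the next readdir call resumes mid-chain. A user-space sketch of a callback with that shape — demo_buf and the trimmed-down signature are illustrative; the kernel's filldir_t carries more arguments:

    #include <stdio.h>
    #include <string.h>

    struct demo_buf {
        char *dst;
        size_t room;
    };

    /* Copy one name into the buffer; report "stop" by returning
     * non-zero once the buffer is full, just as a real filldir
     * does when the user's getdents buffer runs out. */
    static int demo_filldir(void *opaque, const char *name, int namlen)
    {
        struct demo_buf *buf = opaque;

        if ((size_t)namlen + 1 > buf->room)
            return -1;      /* caller saves its position and stops */
        memcpy(buf->dst, name, namlen);
        buf->dst[namlen] = '\0';
        buf->dst += namlen + 1;
        buf->room -= namlen + 1;
        return 0;           /* keep going */
    }

    int main(void)
    {
        char out[16];
        struct demo_buf buf = { out, sizeof(out) };
        const char *names[] = { "alpha", "beta", "gamma" };

        for (int i = 0; i < 3; i++)
            if (demo_filldir(&buf, names[i], (int)strlen(names[i])))
                printf("stopped at %s\n", names[i]);
        return 0;
    }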
486 static int ext3_dx_readdir(struct file * filp, 486 static int ext3_dx_readdir(struct file * filp,
487 void * dirent, filldir_t filldir) 487 void * dirent, filldir_t filldir)
488 { 488 {
489 struct dir_private_info *info = filp->private_data; 489 struct dir_private_info *info = filp->private_data;
490 struct inode *inode = filp->f_path.dentry->d_inode; 490 struct inode *inode = filp->f_path.dentry->d_inode;
491 struct fname *fname; 491 struct fname *fname;
492 int ret; 492 int ret;
493 493
494 if (!info) { 494 if (!info) {
495 info = ext3_htree_create_dir_info(filp, filp->f_pos); 495 info = ext3_htree_create_dir_info(filp, filp->f_pos);
496 if (!info) 496 if (!info)
497 return -ENOMEM; 497 return -ENOMEM;
498 filp->private_data = info; 498 filp->private_data = info;
499 } 499 }
500 500
501 if (filp->f_pos == ext3_get_htree_eof(filp)) 501 if (filp->f_pos == ext3_get_htree_eof(filp))
502 return 0; /* EOF */ 502 return 0; /* EOF */
503 503
504 /* Someone has messed with f_pos; reset the world */ 504 /* Someone has messed with f_pos; reset the world */
505 if (info->last_pos != filp->f_pos) { 505 if (info->last_pos != filp->f_pos) {
506 free_rb_tree_fname(&info->root); 506 free_rb_tree_fname(&info->root);
507 info->curr_node = NULL; 507 info->curr_node = NULL;
508 info->extra_fname = NULL; 508 info->extra_fname = NULL;
509 info->curr_hash = pos2maj_hash(filp, filp->f_pos); 509 info->curr_hash = pos2maj_hash(filp, filp->f_pos);
510 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); 510 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
511 } 511 }
512 512
513 /* 513 /*
514 * If there are any leftover names on the hash collision 514 * If there are any leftover names on the hash collision
515 * chain, return them first. 515 * chain, return them first.
516 */ 516 */
517 if (info->extra_fname) { 517 if (info->extra_fname) {
518 if (call_filldir(filp, dirent, filldir, info->extra_fname)) 518 if (call_filldir(filp, dirent, filldir, info->extra_fname))
519 goto finished; 519 goto finished;
520 info->extra_fname = NULL; 520 info->extra_fname = NULL;
521 goto next_node; 521 goto next_node;
522 } else if (!info->curr_node) 522 } else if (!info->curr_node)
523 info->curr_node = rb_first(&info->root); 523 info->curr_node = rb_first(&info->root);
524 524
525 while (1) { 525 while (1) {
526 /* 526 /*
527 * Fill the rbtree if we have no more entries, 527 * Fill the rbtree if we have no more entries,
528 * or the inode has changed since we last read in the 528 * or the inode has changed since we last read in the
529 * cached entries. 529 * cached entries.
530 */ 530 */
531 if ((!info->curr_node) || 531 if ((!info->curr_node) ||
532 (filp->f_version != inode->i_version)) { 532 (filp->f_version != inode->i_version)) {
533 info->curr_node = NULL; 533 info->curr_node = NULL;
534 free_rb_tree_fname(&info->root); 534 free_rb_tree_fname(&info->root);
535 filp->f_version = inode->i_version; 535 filp->f_version = inode->i_version;
536 ret = ext3_htree_fill_tree(filp, info->curr_hash, 536 ret = ext3_htree_fill_tree(filp, info->curr_hash,
537 info->curr_minor_hash, 537 info->curr_minor_hash,
538 &info->next_hash); 538 &info->next_hash);
539 if (ret < 0) 539 if (ret < 0)
540 return ret; 540 return ret;
541 if (ret == 0) { 541 if (ret == 0) {
542 filp->f_pos = ext3_get_htree_eof(filp); 542 filp->f_pos = ext3_get_htree_eof(filp);
543 break; 543 break;
544 } 544 }
545 info->curr_node = rb_first(&info->root); 545 info->curr_node = rb_first(&info->root);
546 } 546 }
547 547
548 fname = rb_entry(info->curr_node, struct fname, rb_hash); 548 fname = rb_entry(info->curr_node, struct fname, rb_hash);
549 info->curr_hash = fname->hash; 549 info->curr_hash = fname->hash;
550 info->curr_minor_hash = fname->minor_hash; 550 info->curr_minor_hash = fname->minor_hash;
551 if (call_filldir(filp, dirent, filldir, fname)) 551 if (call_filldir(filp, dirent, filldir, fname))
552 break; 552 break;
553 next_node: 553 next_node:
554 info->curr_node = rb_next(info->curr_node); 554 info->curr_node = rb_next(info->curr_node);
555 if (info->curr_node) { 555 if (info->curr_node) {
556 fname = rb_entry(info->curr_node, struct fname, 556 fname = rb_entry(info->curr_node, struct fname,
557 rb_hash); 557 rb_hash);
558 info->curr_hash = fname->hash; 558 info->curr_hash = fname->hash;
559 info->curr_minor_hash = fname->minor_hash; 559 info->curr_minor_hash = fname->minor_hash;
560 } else { 560 } else {
561 if (info->next_hash == ~0) { 561 if (info->next_hash == ~0) {
562 filp->f_pos = ext3_get_htree_eof(filp); 562 filp->f_pos = ext3_get_htree_eof(filp);
563 break; 563 break;
564 } 564 }
565 info->curr_hash = info->next_hash; 565 info->curr_hash = info->next_hash;
566 info->curr_minor_hash = 0; 566 info->curr_minor_hash = 0;
567 } 567 }
568 } 568 }
569 finished: 569 finished:
570 info->last_pos = filp->f_pos; 570 info->last_pos = filp->f_pos;
571 return 0; 571 return 0;
572 } 572 }
573 573
574 static int ext3_release_dir(struct inode *inode, struct file *filp) 574 static int ext3_release_dir(struct inode *inode, struct file *filp)
575 { 575 {
576 if (filp->private_data) 576 if (filp->private_data)
577 ext3_htree_free_dir_info(filp->private_data); 577 ext3_htree_free_dir_info(filp->private_data);
578 578
579 return 0; 579 return 0;
580 } 580 }
581 581
582 const struct file_operations ext3_dir_operations = { 582 const struct file_operations ext3_dir_operations = {
583 .llseek = ext3_dir_llseek, 583 .llseek = ext3_dir_llseek,
584 .read = generic_read_dir, 584 .read = generic_read_dir,
585 .readdir = ext3_readdir, 585 .readdir = ext3_readdir,
586 .unlocked_ioctl = ext3_ioctl, 586 .unlocked_ioctl = ext3_ioctl,
587 #ifdef CONFIG_COMPAT 587 #ifdef CONFIG_COMPAT
588 .compat_ioctl = ext3_compat_ioctl, 588 .compat_ioctl = ext3_compat_ioctl,
589 #endif 589 #endif
590 .fsync = ext3_sync_file, 590 .fsync = ext3_sync_file,
591 .release = ext3_release_dir, 591 .release = ext3_release_dir,
592 }; 592 };
593 593
1 /* 1 /*
2 * linux/fs/ext4/dir.c 2 * linux/fs/ext4/dir.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/dir.c 11 * linux/fs/minix/dir.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * ext4 directory handling functions 15 * ext4 directory handling functions
16 * 16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips 20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 * 21 *
22 */ 22 */
23 23
24 #include <linux/fs.h> 24 #include <linux/fs.h>
25 #include <linux/jbd2.h> 25 #include <linux/jbd2.h>
26 #include <linux/buffer_head.h> 26 #include <linux/buffer_head.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/rbtree.h> 28 #include <linux/rbtree.h>
29 #include "ext4.h" 29 #include "ext4.h"
30 #include "xattr.h" 30 #include "xattr.h"
31 31
32 static int ext4_dx_readdir(struct file *filp, 32 static int ext4_dx_readdir(struct file *filp,
33 void *dirent, filldir_t filldir); 33 void *dirent, filldir_t filldir);
34 34
35 /** 35 /**
36 * Check if the given dir-inode refers to an htree-indexed directory 36 * Check if the given dir-inode refers to an htree-indexed directory
37 * (or a directory which could potentially get converted to use htree 37 * (or a directory which could potentially get converted to use htree
38 * indexing). 38 * indexing).
39 * 39 *
40 * Return 1 if it is a dx dir, 0 if not 40 * Return 1 if it is a dx dir, 0 if not
41 */ 41 */
42 static int is_dx_dir(struct inode *inode) 42 static int is_dx_dir(struct inode *inode)
43 { 43 {
44 struct super_block *sb = inode->i_sb; 44 struct super_block *sb = inode->i_sb;
45 45
46 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 46 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
47 EXT4_FEATURE_COMPAT_DIR_INDEX) && 47 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
48 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || 48 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
49 ((inode->i_size >> sb->s_blocksize_bits) == 1))) 49 ((inode->i_size >> sb->s_blocksize_bits) == 1)))
50 return 1; 50 return 1;
51 51
52 return 0; 52 return 0;
53 } 53 }
54 54
55 /* 55 /*
56 * Return 0 if the directory entry is OK, and 1 if there is a problem 56 * Return 0 if the directory entry is OK, and 1 if there is a problem
57 * 57 *
58 * Note: this is the opposite of what ext2 and ext3 historically returned... 58 * Note: this is the opposite of what ext2 and ext3 historically returned...
59 * 59 *
60 * bh passed here can be an inode block or a dir data block, depending 60 * bh passed here can be an inode block or a dir data block, depending
61 * on the inode inline data flag. 61 * on the inode inline data flag.
62 */ 62 */
63 int __ext4_check_dir_entry(const char *function, unsigned int line, 63 int __ext4_check_dir_entry(const char *function, unsigned int line,
64 struct inode *dir, struct file *filp, 64 struct inode *dir, struct file *filp,
65 struct ext4_dir_entry_2 *de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head *bh, char *buf, int size, 66 struct buffer_head *bh, char *buf, int size,
67 unsigned int offset) 67 unsigned int offset)
68 { 68 {
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len, 70 const int rlen = ext4_rec_len_from_disk(de->rec_len,
71 dir->i_sb->s_blocksize); 71 dir->i_sb->s_blocksize);
72 72
73 if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) 73 if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
74 error_msg = "rec_len is smaller than minimal"; 74 error_msg = "rec_len is smaller than minimal";
75 else if (unlikely(rlen % 4 != 0)) 75 else if (unlikely(rlen % 4 != 0))
76 error_msg = "rec_len % 4 != 0"; 76 error_msg = "rec_len % 4 != 0";
77 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) 77 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
78 error_msg = "rec_len is too small for name_len"; 78 error_msg = "rec_len is too small for name_len";
79 else if (unlikely(((char *) de - buf) + rlen > size)) 79 else if (unlikely(((char *) de - buf) + rlen > size))
80 error_msg = "directory entry across range"; 80 error_msg = "directory entry across range";
81 else if (unlikely(le32_to_cpu(de->inode) > 81 else if (unlikely(le32_to_cpu(de->inode) >
82 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) 82 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 else 84 else
85 return 0; 85 return 0;
86 86
87 if (filp) 87 if (filp)
88 ext4_error_file(filp, function, line, bh->b_blocknr, 88 ext4_error_file(filp, function, line, bh->b_blocknr,
89 "bad entry in directory: %s - offset=%u(%u), " 89 "bad entry in directory: %s - offset=%u(%u), "
90 "inode=%u, rec_len=%d, name_len=%d", 90 "inode=%u, rec_len=%d, name_len=%d",
91 error_msg, (unsigned) (offset % size), 91 error_msg, (unsigned) (offset % size),
92 offset, le32_to_cpu(de->inode), 92 offset, le32_to_cpu(de->inode),
93 rlen, de->name_len); 93 rlen, de->name_len);
94 else 94 else
95 ext4_error_inode(dir, function, line, bh->b_blocknr, 95 ext4_error_inode(dir, function, line, bh->b_blocknr,
96 "bad entry in directory: %s - offset=%u(%u), " 96 "bad entry in directory: %s - offset=%u(%u), "
97 "inode=%u, rec_len=%d, name_len=%d", 97 "inode=%u, rec_len=%d, name_len=%d",
98 error_msg, (unsigned) (offset % size), 98 error_msg, (unsigned) (offset % size),
99 offset, le32_to_cpu(de->inode), 99 offset, le32_to_cpu(de->inode),
100 rlen, de->name_len); 100 rlen, de->name_len);
101 101
102 return 1; 102 return 1;
103 } 103 }
104 104
105 static int ext4_readdir(struct file *filp, 105 static int ext4_readdir(struct file *filp,
106 void *dirent, filldir_t filldir) 106 void *dirent, filldir_t filldir)
107 { 107 {
108 int error = 0; 108 int error = 0;
109 unsigned int offset; 109 unsigned int offset;
110 int i, stored; 110 int i, stored;
111 struct ext4_dir_entry_2 *de; 111 struct ext4_dir_entry_2 *de;
112 int err; 112 int err;
113 struct inode *inode = filp->f_path.dentry->d_inode; 113 struct inode *inode = filp->f_path.dentry->d_inode;
114 struct super_block *sb = inode->i_sb; 114 struct super_block *sb = inode->i_sb;
115 int ret = 0; 115 int ret = 0;
116 int dir_has_error = 0; 116 int dir_has_error = 0;
117 117
118 if (ext4_has_inline_data(inode)) { 118 if (ext4_has_inline_data(inode)) {
119 int has_inline_data = 1; 119 int has_inline_data = 1;
120 ret = ext4_read_inline_dir(filp, dirent, filldir, 120 ret = ext4_read_inline_dir(filp, dirent, filldir,
121 &has_inline_data); 121 &has_inline_data);
122 if (has_inline_data) 122 if (has_inline_data)
123 return ret; 123 return ret;
124 } 124 }
125 125
126 if (is_dx_dir(inode)) { 126 if (is_dx_dir(inode)) {
127 err = ext4_dx_readdir(filp, dirent, filldir); 127 err = ext4_dx_readdir(filp, dirent, filldir);
128 if (err != ERR_BAD_DX_DIR) { 128 if (err != ERR_BAD_DX_DIR) {
129 ret = err; 129 ret = err;
130 goto out; 130 goto out;
131 } 131 }
132 /* 132 /*
133 * We don't set the inode dirty flag since it's not 133 * We don't set the inode dirty flag since it's not
134 * critical that it get flushed back to the disk. 134 * critical that it get flushed back to the disk.
135 */ 135 */
136 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, 136 ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
137 EXT4_INODE_INDEX); 137 EXT4_INODE_INDEX);
138 } 138 }
139 stored = 0; 139 stored = 0;
140 offset = filp->f_pos & (sb->s_blocksize - 1); 140 offset = filp->f_pos & (sb->s_blocksize - 1);
141 141
142 while (!error && !stored && filp->f_pos < inode->i_size) { 142 while (!error && !stored && filp->f_pos < inode->i_size) {
143 struct ext4_map_blocks map; 143 struct ext4_map_blocks map;
144 struct buffer_head *bh = NULL; 144 struct buffer_head *bh = NULL;
145 145
146 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 146 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
147 map.m_len = 1; 147 map.m_len = 1;
148 err = ext4_map_blocks(NULL, inode, &map, 0); 148 err = ext4_map_blocks(NULL, inode, &map, 0);
149 if (err > 0) { 149 if (err > 0) {
150 pgoff_t index = map.m_pblk >> 150 pgoff_t index = map.m_pblk >>
151 (PAGE_CACHE_SHIFT - inode->i_blkbits); 151 (PAGE_CACHE_SHIFT - inode->i_blkbits);
152 if (!ra_has_index(&filp->f_ra, index)) 152 if (!ra_has_index(&filp->f_ra, index))
153 page_cache_sync_readahead( 153 page_cache_sync_readahead(
154 sb->s_bdev->bd_inode->i_mapping, 154 sb->s_bdev->bd_inode->i_mapping,
155 &filp->f_ra, filp, 155 &filp->f_ra, filp,
156 index, 1); 156 index, 1);
157 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 157 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
158 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); 158 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
159 } 159 }
160 160
161 /* 161 /*
162 * We ignore I/O errors on directories so users have a chance 162 * We ignore I/O errors on directories so users have a chance
163 * of recovering data when there's a bad sector 163 * of recovering data when there's a bad sector
164 */ 164 */
165 if (!bh) { 165 if (!bh) {
166 if (!dir_has_error) { 166 if (!dir_has_error) {
167 EXT4_ERROR_FILE(filp, 0, 167 EXT4_ERROR_FILE(filp, 0,
168 "directory contains a " 168 "directory contains a "
169 "hole at offset %llu", 169 "hole at offset %llu",
170 (unsigned long long) filp->f_pos); 170 (unsigned long long) filp->f_pos);
171 dir_has_error = 1; 171 dir_has_error = 1;
172 } 172 }
173 /* corrupt size? Maybe no more blocks to read */ 173 /* corrupt size? Maybe no more blocks to read */
174 if (filp->f_pos > inode->i_blocks << 9) 174 if (filp->f_pos > inode->i_blocks << 9)
175 break; 175 break;
176 filp->f_pos += sb->s_blocksize - offset; 176 filp->f_pos += sb->s_blocksize - offset;
177 continue; 177 continue;
178 } 178 }
179 179
180 /* Check the checksum */ 180 /* Check the checksum */
181 if (!buffer_verified(bh) && 181 if (!buffer_verified(bh) &&
182 !ext4_dirent_csum_verify(inode, 182 !ext4_dirent_csum_verify(inode,
183 (struct ext4_dir_entry *)bh->b_data)) { 183 (struct ext4_dir_entry *)bh->b_data)) {
184 EXT4_ERROR_FILE(filp, 0, "directory fails checksum " 184 EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
185 "at offset %llu", 185 "at offset %llu",
186 (unsigned long long)filp->f_pos); 186 (unsigned long long)filp->f_pos);
187 filp->f_pos += sb->s_blocksize - offset; 187 filp->f_pos += sb->s_blocksize - offset;
188 continue; 188 continue;
189 } 189 }
190 set_buffer_verified(bh); 190 set_buffer_verified(bh);
191 191
192 revalidate: 192 revalidate:
193 /* If the dir block has changed since the last call to 193 /* If the dir block has changed since the last call to
194 * readdir(2), then we might be pointing to an invalid 194 * readdir(2), then we might be pointing to an invalid
195 * dirent right now. Scan from the start of the block 195 * dirent right now. Scan from the start of the block
196 * to make sure. */ 196 * to make sure. */
197 if (filp->f_version != inode->i_version) { 197 if (filp->f_version != inode->i_version) {
198 for (i = 0; i < sb->s_blocksize && i < offset; ) { 198 for (i = 0; i < sb->s_blocksize && i < offset; ) {
199 de = (struct ext4_dir_entry_2 *) 199 de = (struct ext4_dir_entry_2 *)
200 (bh->b_data + i); 200 (bh->b_data + i);
201 /* It's too expensive to do a full 201 /* It's too expensive to do a full
202 * dirent test each time round this 202 * dirent test each time round this
203 * loop, but we do have to test at 203 * loop, but we do have to test at
204 * least that it is non-zero. A 204 * least that it is non-zero. A
205 * failure will be detected in the 205 * failure will be detected in the
206 * dirent test below. */ 206 * dirent test below. */
207 if (ext4_rec_len_from_disk(de->rec_len, 207 if (ext4_rec_len_from_disk(de->rec_len,
208 sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) 208 sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
209 break; 209 break;
210 i += ext4_rec_len_from_disk(de->rec_len, 210 i += ext4_rec_len_from_disk(de->rec_len,
211 sb->s_blocksize); 211 sb->s_blocksize);
212 } 212 }
213 offset = i; 213 offset = i;
214 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 214 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
215 | offset; 215 | offset;
216 filp->f_version = inode->i_version; 216 filp->f_version = inode->i_version;
217 } 217 }
218 218
219 while (!error && filp->f_pos < inode->i_size 219 while (!error && filp->f_pos < inode->i_size
220 && offset < sb->s_blocksize) { 220 && offset < sb->s_blocksize) {
221 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 221 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
222 if (ext4_check_dir_entry(inode, filp, de, bh, 222 if (ext4_check_dir_entry(inode, filp, de, bh,
223 bh->b_data, bh->b_size, 223 bh->b_data, bh->b_size,
224 offset)) { 224 offset)) {
225 /* 225 /*
226 * On error, skip the f_pos to the next block 226 * On error, skip the f_pos to the next block
227 */ 227 */
228 filp->f_pos = (filp->f_pos | 228 filp->f_pos = (filp->f_pos |
229 (sb->s_blocksize - 1)) + 1; 229 (sb->s_blocksize - 1)) + 1;
230 brelse(bh); 230 brelse(bh);
231 ret = stored; 231 ret = stored;
232 goto out; 232 goto out;
233 } 233 }
234 offset += ext4_rec_len_from_disk(de->rec_len, 234 offset += ext4_rec_len_from_disk(de->rec_len,
235 sb->s_blocksize); 235 sb->s_blocksize);
236 if (le32_to_cpu(de->inode)) { 236 if (le32_to_cpu(de->inode)) {
237 /* We might block in the next section 237 /* We might block in the next section
238 * if the data destination is 238 * if the data destination is
239 * currently swapped out. So, use a 239 * currently swapped out. So, use a
240 * version stamp to detect whether or 240 * version stamp to detect whether or
241 * not the directory has been modified 241 * not the directory has been modified
242 * during the copy operation. 242 * during the copy operation.
243 */ 243 */
244 u64 version = filp->f_version; 244 u64 version = filp->f_version;
245 245
246 error = filldir(dirent, de->name, 246 error = filldir(dirent, de->name,
247 de->name_len, 247 de->name_len,
248 filp->f_pos, 248 filp->f_pos,
249 le32_to_cpu(de->inode), 249 le32_to_cpu(de->inode),
250 get_dtype(sb, de->file_type)); 250 get_dtype(sb, de->file_type));
251 if (error) 251 if (error)
252 break; 252 break;
253 if (version != filp->f_version) 253 if (version != filp->f_version)
254 goto revalidate; 254 goto revalidate;
255 stored++; 255 stored++;
256 } 256 }
257 filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 257 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
258 sb->s_blocksize); 258 sb->s_blocksize);
259 } 259 }
260 offset = 0; 260 offset = 0;
261 brelse(bh); 261 brelse(bh);
262 } 262 }
263 out: 263 out:
264 return ret; 264 return ret;
265 } 265 }
266 266
267 static inline int is_32bit_api(void) 267 static inline int is_32bit_api(void)
268 { 268 {
269 #ifdef CONFIG_COMPAT 269 #ifdef CONFIG_COMPAT
270 return is_compat_task(); 270 return is_compat_task();
271 #else 271 #else
272 return (BITS_PER_LONG == 32); 272 return (BITS_PER_LONG == 32);
273 #endif 273 #endif
274 } 274 }
275 275
276 /* 276 /*
277 * These functions convert from the major/minor hash to an f_pos 277 * These functions convert from the major/minor hash to an f_pos
278 * value for dx directories 278 * value for dx directories
279 * 279 *
280 * The upper layer (for example NFS) should specify FMODE_32BITHASH or 280 * The upper layer (for example NFS) should specify FMODE_32BITHASH or
281 * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted 281 * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
282 * directly on both 32-bit and 64-bit nodes, in which case neither 282 * directly on both 32-bit and 64-bit nodes, in which case neither
283 * FMODE_32BITHASH nor FMODE_64BITHASH is specified. 283 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
284 */ 284 */
285 static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) 285 static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
286 { 286 {
287 if ((filp->f_mode & FMODE_32BITHASH) || 287 if ((filp->f_mode & FMODE_32BITHASH) ||
288 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 288 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
289 return major >> 1; 289 return major >> 1;
290 else 290 else
291 return ((__u64)(major >> 1) << 32) | (__u64)minor; 291 return ((__u64)(major >> 1) << 32) | (__u64)minor;
292 } 292 }
293 293
294 static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) 294 static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
295 { 295 {
296 if ((filp->f_mode & FMODE_32BITHASH) || 296 if ((filp->f_mode & FMODE_32BITHASH) ||
297 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 297 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
298 return (pos << 1) & 0xffffffff; 298 return (pos << 1) & 0xffffffff;
299 else 299 else
300 return ((pos >> 32) << 1) & 0xffffffff; 300 return ((pos >> 32) << 1) & 0xffffffff;
301 } 301 }
302 302
303 static inline __u32 pos2min_hash(struct file *filp, loff_t pos) 303 static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
304 { 304 {
305 if ((filp->f_mode & FMODE_32BITHASH) || 305 if ((filp->f_mode & FMODE_32BITHASH) ||
306 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 306 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
307 return 0; 307 return 0;
308 else 308 else
309 return pos & 0xffffffff; 309 return pos & 0xffffffff;
310 } 310 }
311 311
312 /* 312 /*
313 * Return 32- or 64-bit end-of-file for dx directories 313 * Return 32- or 64-bit end-of-file for dx directories
314 */ 314 */
315 static inline loff_t ext4_get_htree_eof(struct file *filp) 315 static inline loff_t ext4_get_htree_eof(struct file *filp)
316 { 316 {
317 if ((filp->f_mode & FMODE_32BITHASH) || 317 if ((filp->f_mode & FMODE_32BITHASH) ||
318 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 318 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
319 return EXT4_HTREE_EOF_32BIT; 319 return EXT4_HTREE_EOF_32BIT;
320 else 320 else
321 return EXT4_HTREE_EOF_64BIT; 321 return EXT4_HTREE_EOF_64BIT;
322 } 322 }
323 323
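A worked example of the conversions above for the 64-bit-hash case, as a self-contained user-space mirror. The helper names and sample values are made up; it assumes, as ext3/ext4's dirhash does, that the low bit of the major hash is already zero, so dropping it in hash2pos loses nothing:

    #include <stdio.h>
    #include <stdint.h>

    /* user-space mirrors of the kernel helpers, 64-bit case only */
    static uint64_t hash2pos64(uint32_t major, uint32_t minor)
    {
        return ((uint64_t)(major >> 1) << 32) | minor;
    }

    static uint32_t pos2maj64(uint64_t pos)
    {
        return ((pos >> 32) << 1) & 0xffffffff;
    }

    static uint32_t pos2min64(uint64_t pos)
    {
        return pos & 0xffffffff;
    }

    int main(void)
    {
        uint32_t major = 0x12345678, minor = 0x9abcdef0;
        uint64_t pos = hash2pos64(major, minor);

        printf("pos=%#llx maj=%#x min=%#x\n",
               (unsigned long long)pos,
               (unsigned)pos2maj64(pos), (unsigned)pos2min64(pos));
        return 0;
    }

This should print pos=0x91a2b3c9abcdef0 maj=0x12345678 min=0x9abcdef0: the major/minor pair survives the round trip through f_pos intact.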
324 324
325 /* 325 /*
326 * ext4_dir_llseek() calls generic_file_llseek_size to handle htree 326 * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
327 * directories, where the "offset" is in terms of the filename hash 327 * directories, where the "offset" is in terms of the filename hash
328 * value instead of the byte offset. 328 * value instead of the byte offset.
329 * 329 *
330 * Because we may return a 64-bit hash that is well beyond offset limits, 330 * Because we may return a 64-bit hash that is well beyond offset limits,
331 * we need to pass the max hash as the maximum allowable offset in 331 * we need to pass the max hash as the maximum allowable offset in
332 * the htree directory case. 332 * the htree directory case.
333 * 333 *
334 * For non-htree, ext4_llseek already chooses the proper max offset. 334 * For non-htree, ext4_llseek already chooses the proper max offset.
335 */ 335 */
336 loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) 336 loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
337 { 337 {
338 struct inode *inode = file->f_mapping->host; 338 struct inode *inode = file->f_mapping->host;
339 int dx_dir = is_dx_dir(inode); 339 int dx_dir = is_dx_dir(inode);
340 loff_t htree_max = ext4_get_htree_eof(file); 340 loff_t htree_max = ext4_get_htree_eof(file);
341 341
342 if (likely(dx_dir)) 342 if (likely(dx_dir))
343 return generic_file_llseek_size(file, offset, origin, 343 return generic_file_llseek_size(file, offset, whence,
344 htree_max, htree_max); 344 htree_max, htree_max);
345 else 345 else
346 return ext4_llseek(file, offset, origin); 346 return ext4_llseek(file, offset, whence);
347 } 347 }
348 348
349 /* 349 /*
350 * This structure holds the nodes of the red-black tree used to store 350 * This structure holds the nodes of the red-black tree used to store
351 * the directory entry in hash order. 351 * the directory entry in hash order.
352 */ 352 */
353 struct fname { 353 struct fname {
354 __u32 hash; 354 __u32 hash;
355 __u32 minor_hash; 355 __u32 minor_hash;
356 struct rb_node rb_hash; 356 struct rb_node rb_hash;
357 struct fname *next; 357 struct fname *next;
358 __u32 inode; 358 __u32 inode;
359 __u8 name_len; 359 __u8 name_len;
360 __u8 file_type; 360 __u8 file_type;
361 char name[0]; 361 char name[0];
362 }; 362 };
363 363
364 /* 364 /*
365 * This function implements a non-recursive way of freeing all of the 365 * This function implements a non-recursive way of freeing all of the
366 * nodes in the red-black tree. 366 * nodes in the red-black tree.
367 */ 367 */
368 static void free_rb_tree_fname(struct rb_root *root) 368 static void free_rb_tree_fname(struct rb_root *root)
369 { 369 {
370 struct rb_node *n = root->rb_node; 370 struct rb_node *n = root->rb_node;
371 struct rb_node *parent; 371 struct rb_node *parent;
372 struct fname *fname; 372 struct fname *fname;
373 373
374 while (n) { 374 while (n) {
375 /* Do the node's children first */ 375 /* Do the node's children first */
376 if (n->rb_left) { 376 if (n->rb_left) {
377 n = n->rb_left; 377 n = n->rb_left;
378 continue; 378 continue;
379 } 379 }
380 if (n->rb_right) { 380 if (n->rb_right) {
381 n = n->rb_right; 381 n = n->rb_right;
382 continue; 382 continue;
383 } 383 }
384 /* 384 /*
385 * The node has no children; free it, and then zero 385 * The node has no children; free it, and then zero
386 * out parent's link to it. Finally go to the 386 * out parent's link to it. Finally go to the
387 * beginning of the loop and try to free the parent 387 * beginning of the loop and try to free the parent
388 * node. 388 * node.
389 */ 389 */
390 parent = rb_parent(n); 390 parent = rb_parent(n);
391 fname = rb_entry(n, struct fname, rb_hash); 391 fname = rb_entry(n, struct fname, rb_hash);
392 while (fname) { 392 while (fname) {
393 struct fname *old = fname; 393 struct fname *old = fname;
394 fname = fname->next; 394 fname = fname->next;
395 kfree(old); 395 kfree(old);
396 } 396 }
397 if (!parent) 397 if (!parent)
398 *root = RB_ROOT; 398 *root = RB_ROOT;
399 else if (parent->rb_left == n) 399 else if (parent->rb_left == n)
400 parent->rb_left = NULL; 400 parent->rb_left = NULL;
401 else if (parent->rb_right == n) 401 else if (parent->rb_right == n)
402 parent->rb_right = NULL; 402 parent->rb_right = NULL;
403 n = parent; 403 n = parent;
404 } 404 }
405 } 405 }
406 406
407 407
408 static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, 408 static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
409 loff_t pos) 409 loff_t pos)
410 { 410 {
411 struct dir_private_info *p; 411 struct dir_private_info *p;
412 412
413 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); 413 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
414 if (!p) 414 if (!p)
415 return NULL; 415 return NULL;
416 p->curr_hash = pos2maj_hash(filp, pos); 416 p->curr_hash = pos2maj_hash(filp, pos);
417 p->curr_minor_hash = pos2min_hash(filp, pos); 417 p->curr_minor_hash = pos2min_hash(filp, pos);
418 return p; 418 return p;
419 } 419 }
420 420
421 void ext4_htree_free_dir_info(struct dir_private_info *p) 421 void ext4_htree_free_dir_info(struct dir_private_info *p)
422 { 422 {
423 free_rb_tree_fname(&p->root); 423 free_rb_tree_fname(&p->root);
424 kfree(p); 424 kfree(p);
425 } 425 }
426 426
427 /* 427 /*
428 * Given a directory entry, enter it into the fname rb tree. 428 * Given a directory entry, enter it into the fname rb tree.
429 */ 429 */
430 int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 430 int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
431 __u32 minor_hash, 431 __u32 minor_hash,
432 struct ext4_dir_entry_2 *dirent) 432 struct ext4_dir_entry_2 *dirent)
433 { 433 {
434 struct rb_node **p, *parent = NULL; 434 struct rb_node **p, *parent = NULL;
435 struct fname *fname, *new_fn; 435 struct fname *fname, *new_fn;
436 struct dir_private_info *info; 436 struct dir_private_info *info;
437 int len; 437 int len;
438 438
439 info = dir_file->private_data; 439 info = dir_file->private_data;
440 p = &info->root.rb_node; 440 p = &info->root.rb_node;
441 441
442 /* Allocate and fill in the fname structure */ 442 /* Allocate and fill in the fname structure */
443 len = sizeof(struct fname) + dirent->name_len + 1; 443 len = sizeof(struct fname) + dirent->name_len + 1;
444 new_fn = kzalloc(len, GFP_KERNEL); 444 new_fn = kzalloc(len, GFP_KERNEL);
445 if (!new_fn) 445 if (!new_fn)
446 return -ENOMEM; 446 return -ENOMEM;
447 new_fn->hash = hash; 447 new_fn->hash = hash;
448 new_fn->minor_hash = minor_hash; 448 new_fn->minor_hash = minor_hash;
449 new_fn->inode = le32_to_cpu(dirent->inode); 449 new_fn->inode = le32_to_cpu(dirent->inode);
450 new_fn->name_len = dirent->name_len; 450 new_fn->name_len = dirent->name_len;
451 new_fn->file_type = dirent->file_type; 451 new_fn->file_type = dirent->file_type;
452 memcpy(new_fn->name, dirent->name, dirent->name_len); 452 memcpy(new_fn->name, dirent->name, dirent->name_len);
453 new_fn->name[dirent->name_len] = 0; 453 new_fn->name[dirent->name_len] = 0;
454 454
455 while (*p) { 455 while (*p) {
456 parent = *p; 456 parent = *p;
457 fname = rb_entry(parent, struct fname, rb_hash); 457 fname = rb_entry(parent, struct fname, rb_hash);
458 458
459 /* 459 /*
460 * If the hash and minor hash match up, then we put 460 * If the hash and minor hash match up, then we put
461 * them on a linked list. This rarely happens... 461 * them on a linked list. This rarely happens...
462 */ 462 */
463 if ((new_fn->hash == fname->hash) && 463 if ((new_fn->hash == fname->hash) &&
464 (new_fn->minor_hash == fname->minor_hash)) { 464 (new_fn->minor_hash == fname->minor_hash)) {
465 new_fn->next = fname->next; 465 new_fn->next = fname->next;
466 fname->next = new_fn; 466 fname->next = new_fn;
467 return 0; 467 return 0;
468 } 468 }
469 469
470 if (new_fn->hash < fname->hash) 470 if (new_fn->hash < fname->hash)
471 p = &(*p)->rb_left; 471 p = &(*p)->rb_left;
472 else if (new_fn->hash > fname->hash) 472 else if (new_fn->hash > fname->hash)
473 p = &(*p)->rb_right; 473 p = &(*p)->rb_right;
474 else if (new_fn->minor_hash < fname->minor_hash) 474 else if (new_fn->minor_hash < fname->minor_hash)
475 p = &(*p)->rb_left; 475 p = &(*p)->rb_left;
476 else /* if (new_fn->minor_hash > fname->minor_hash) */ 476 else /* if (new_fn->minor_hash > fname->minor_hash) */
477 p = &(*p)->rb_right; 477 p = &(*p)->rb_right;
478 } 478 }
479 479
480 rb_link_node(&new_fn->rb_hash, parent, p); 480 rb_link_node(&new_fn->rb_hash, parent, p);
481 rb_insert_color(&new_fn->rb_hash, &info->root); 481 rb_insert_color(&new_fn->rb_hash, &info->root);
482 return 0; 482 return 0;
483 } 483 }
484 484
485 485
486 486
487 /* 487 /*
488 * This is a helper function for ext4_dx_readdir. It calls filldir 488 * This is a helper function for ext4_dx_readdir. It calls filldir
489 * for all entries on the fname linked list. (Normally there is only 489 * for all entries on the fname linked list. (Normally there is only
490 * one entry on the linked list, unless there are 62 bit hash collisions.) 490 * one entry on the linked list, unless there are 62 bit hash collisions.)
491 */ 491 */
492 static int call_filldir(struct file *filp, void *dirent, 492 static int call_filldir(struct file *filp, void *dirent,
493 filldir_t filldir, struct fname *fname) 493 filldir_t filldir, struct fname *fname)
494 { 494 {
495 struct dir_private_info *info = filp->private_data; 495 struct dir_private_info *info = filp->private_data;
496 loff_t curr_pos; 496 loff_t curr_pos;
497 struct inode *inode = filp->f_path.dentry->d_inode; 497 struct inode *inode = filp->f_path.dentry->d_inode;
498 struct super_block *sb; 498 struct super_block *sb;
499 int error; 499 int error;
500 500
501 sb = inode->i_sb; 501 sb = inode->i_sb;
502 502
503 if (!fname) { 503 if (!fname) {
504 ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " 504 ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
505 "called with null fname?!?", __func__, __LINE__, 505 "called with null fname?!?", __func__, __LINE__,
506 inode->i_ino, current->comm); 506 inode->i_ino, current->comm);
507 return 0; 507 return 0;
508 } 508 }
509 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); 509 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
510 while (fname) { 510 while (fname) {
511 error = filldir(dirent, fname->name, 511 error = filldir(dirent, fname->name,
512 fname->name_len, curr_pos, 512 fname->name_len, curr_pos,
513 fname->inode, 513 fname->inode,
514 get_dtype(sb, fname->file_type)); 514 get_dtype(sb, fname->file_type));
515 if (error) { 515 if (error) {
516 filp->f_pos = curr_pos; 516 filp->f_pos = curr_pos;
517 info->extra_fname = fname; 517 info->extra_fname = fname;
518 return error; 518 return error;
519 } 519 }
520 fname = fname->next; 520 fname = fname->next;
521 } 521 }
522 return 0; 522 return 0;
523 } 523 }
524 524
525 static int ext4_dx_readdir(struct file *filp, 525 static int ext4_dx_readdir(struct file *filp,
526 void *dirent, filldir_t filldir) 526 void *dirent, filldir_t filldir)
527 { 527 {
528 struct dir_private_info *info = filp->private_data; 528 struct dir_private_info *info = filp->private_data;
529 struct inode *inode = filp->f_path.dentry->d_inode; 529 struct inode *inode = filp->f_path.dentry->d_inode;
530 struct fname *fname; 530 struct fname *fname;
531 int ret; 531 int ret;
532 532
533 if (!info) { 533 if (!info) {
534 info = ext4_htree_create_dir_info(filp, filp->f_pos); 534 info = ext4_htree_create_dir_info(filp, filp->f_pos);
535 if (!info) 535 if (!info)
536 return -ENOMEM; 536 return -ENOMEM;
537 filp->private_data = info; 537 filp->private_data = info;
538 } 538 }
539 539
540 if (filp->f_pos == ext4_get_htree_eof(filp)) 540 if (filp->f_pos == ext4_get_htree_eof(filp))
541 return 0; /* EOF */ 541 return 0; /* EOF */
542 542
543 /* Someone has messed with f_pos; reset the world */ 543 /* Someone has messed with f_pos; reset the world */
544 if (info->last_pos != filp->f_pos) { 544 if (info->last_pos != filp->f_pos) {
545 free_rb_tree_fname(&info->root); 545 free_rb_tree_fname(&info->root);
546 info->curr_node = NULL; 546 info->curr_node = NULL;
547 info->extra_fname = NULL; 547 info->extra_fname = NULL;
548 info->curr_hash = pos2maj_hash(filp, filp->f_pos); 548 info->curr_hash = pos2maj_hash(filp, filp->f_pos);
549 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); 549 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
550 } 550 }
551 551
552 /* 552 /*
553 * If there are any leftover names on the hash collision 553 * If there are any leftover names on the hash collision
554 * chain, return them first. 554 * chain, return them first.
555 */ 555 */
556 if (info->extra_fname) { 556 if (info->extra_fname) {
557 if (call_filldir(filp, dirent, filldir, info->extra_fname)) 557 if (call_filldir(filp, dirent, filldir, info->extra_fname))
558 goto finished; 558 goto finished;
559 info->extra_fname = NULL; 559 info->extra_fname = NULL;
560 goto next_node; 560 goto next_node;
561 } else if (!info->curr_node) 561 } else if (!info->curr_node)
562 info->curr_node = rb_first(&info->root); 562 info->curr_node = rb_first(&info->root);
563 563
564 while (1) { 564 while (1) {
565 /* 565 /*
566 * Fill the rbtree if we have no more entries, 566 * Fill the rbtree if we have no more entries,
567 * or the inode has changed since we last read in the 567 * or the inode has changed since we last read in the
568 * cached entries. 568 * cached entries.
569 */ 569 */
570 if ((!info->curr_node) || 570 if ((!info->curr_node) ||
571 (filp->f_version != inode->i_version)) { 571 (filp->f_version != inode->i_version)) {
572 info->curr_node = NULL; 572 info->curr_node = NULL;
573 free_rb_tree_fname(&info->root); 573 free_rb_tree_fname(&info->root);
574 filp->f_version = inode->i_version; 574 filp->f_version = inode->i_version;
575 ret = ext4_htree_fill_tree(filp, info->curr_hash, 575 ret = ext4_htree_fill_tree(filp, info->curr_hash,
576 info->curr_minor_hash, 576 info->curr_minor_hash,
577 &info->next_hash); 577 &info->next_hash);
578 if (ret < 0) 578 if (ret < 0)
579 return ret; 579 return ret;
580 if (ret == 0) { 580 if (ret == 0) {
581 filp->f_pos = ext4_get_htree_eof(filp); 581 filp->f_pos = ext4_get_htree_eof(filp);
582 break; 582 break;
583 } 583 }
584 info->curr_node = rb_first(&info->root); 584 info->curr_node = rb_first(&info->root);
585 } 585 }
586 586
587 fname = rb_entry(info->curr_node, struct fname, rb_hash); 587 fname = rb_entry(info->curr_node, struct fname, rb_hash);
588 info->curr_hash = fname->hash; 588 info->curr_hash = fname->hash;
589 info->curr_minor_hash = fname->minor_hash; 589 info->curr_minor_hash = fname->minor_hash;
590 if (call_filldir(filp, dirent, filldir, fname)) 590 if (call_filldir(filp, dirent, filldir, fname))
591 break; 591 break;
592 next_node: 592 next_node:
593 info->curr_node = rb_next(info->curr_node); 593 info->curr_node = rb_next(info->curr_node);
594 if (info->curr_node) { 594 if (info->curr_node) {
595 fname = rb_entry(info->curr_node, struct fname, 595 fname = rb_entry(info->curr_node, struct fname,
596 rb_hash); 596 rb_hash);
597 info->curr_hash = fname->hash; 597 info->curr_hash = fname->hash;
598 info->curr_minor_hash = fname->minor_hash; 598 info->curr_minor_hash = fname->minor_hash;
599 } else { 599 } else {
600 if (info->next_hash == ~0) { 600 if (info->next_hash == ~0) {
601 filp->f_pos = ext4_get_htree_eof(filp); 601 filp->f_pos = ext4_get_htree_eof(filp);
602 break; 602 break;
603 } 603 }
604 info->curr_hash = info->next_hash; 604 info->curr_hash = info->next_hash;
605 info->curr_minor_hash = 0; 605 info->curr_minor_hash = 0;
606 } 606 }
607 } 607 }
608 finished: 608 finished:
609 info->last_pos = filp->f_pos; 609 info->last_pos = filp->f_pos;
610 return 0; 610 return 0;
611 } 611 }
612 612
613 static int ext4_release_dir(struct inode *inode, struct file *filp) 613 static int ext4_release_dir(struct inode *inode, struct file *filp)
614 { 614 {
615 if (filp->private_data) 615 if (filp->private_data)
616 ext4_htree_free_dir_info(filp->private_data); 616 ext4_htree_free_dir_info(filp->private_data);
617 617
618 return 0; 618 return 0;
619 } 619 }
620 620
621 const struct file_operations ext4_dir_operations = { 621 const struct file_operations ext4_dir_operations = {
622 .llseek = ext4_dir_llseek, 622 .llseek = ext4_dir_llseek,
623 .read = generic_read_dir, 623 .read = generic_read_dir,
624 .readdir = ext4_readdir, 624 .readdir = ext4_readdir,
625 .unlocked_ioctl = ext4_ioctl, 625 .unlocked_ioctl = ext4_ioctl,
626 #ifdef CONFIG_COMPAT 626 #ifdef CONFIG_COMPAT
627 .compat_ioctl = ext4_compat_ioctl, 627 .compat_ioctl = ext4_compat_ioctl,
628 #endif 628 #endif
629 .fsync = ext4_sync_file, 629 .fsync = ext4_sync_file,
630 .release = ext4_release_dir, 630 .release = ext4_release_dir,
631 }; 631 };
632 632
1 /* 1 /*
2 * linux/fs/ext4/file.c 2 * linux/fs/ext4/file.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/file.c 11 * linux/fs/minix/file.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * ext4 fs regular file handling primitives 15 * ext4 fs regular file handling primitives
16 * 16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek 17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz) 18 * (jj@sunsite.ms.mff.cuni.cz)
19 */ 19 */
20 20
21 #include <linux/time.h> 21 #include <linux/time.h>
22 #include <linux/fs.h> 22 #include <linux/fs.h>
23 #include <linux/jbd2.h> 23 #include <linux/jbd2.h>
24 #include <linux/mount.h> 24 #include <linux/mount.h>
25 #include <linux/path.h> 25 #include <linux/path.h>
26 #include <linux/quotaops.h> 26 #include <linux/quotaops.h>
27 #include <linux/pagevec.h> 27 #include <linux/pagevec.h>
28 #include "ext4.h" 28 #include "ext4.h"
29 #include "ext4_jbd2.h" 29 #include "ext4_jbd2.h"
30 #include "xattr.h" 30 #include "xattr.h"
31 #include "acl.h" 31 #include "acl.h"
32 32
33 /* 33 /*
34 * Called when an inode is released. Note that this is different 34 * Called when an inode is released. Note that this is different
35 * from ext4_file_open: open gets called at every open, but release 35 * from ext4_file_open: open gets called at every open, but release
36 * gets called only when /all/ the files are closed. 36 * gets called only when /all/ the files are closed.
37 */ 37 */
38 static int ext4_release_file(struct inode *inode, struct file *filp) 38 static int ext4_release_file(struct inode *inode, struct file *filp)
39 { 39 {
40 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { 40 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
41 ext4_alloc_da_blocks(inode); 41 ext4_alloc_da_blocks(inode);
42 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 42 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
43 } 43 }
44 /* if we are the last writer on the inode, drop the block reservation */ 44 /* if we are the last writer on the inode, drop the block reservation */
45 if ((filp->f_mode & FMODE_WRITE) && 45 if ((filp->f_mode & FMODE_WRITE) &&
46 (atomic_read(&inode->i_writecount) == 1) && 46 (atomic_read(&inode->i_writecount) == 1) &&
47 !EXT4_I(inode)->i_reserved_data_blocks) 47 !EXT4_I(inode)->i_reserved_data_blocks)
48 { 48 {
49 down_write(&EXT4_I(inode)->i_data_sem); 49 down_write(&EXT4_I(inode)->i_data_sem);
50 ext4_discard_preallocations(inode); 50 ext4_discard_preallocations(inode);
51 up_write(&EXT4_I(inode)->i_data_sem); 51 up_write(&EXT4_I(inode)->i_data_sem);
52 } 52 }
53 if (is_dx(inode) && filp->private_data) 53 if (is_dx(inode) && filp->private_data)
54 ext4_htree_free_dir_info(filp->private_data); 54 ext4_htree_free_dir_info(filp->private_data);
55 55
56 return 0; 56 return 0;
57 } 57 }
58 58
59 void ext4_unwritten_wait(struct inode *inode) 59 void ext4_unwritten_wait(struct inode *inode)
60 { 60 {
61 wait_queue_head_t *wq = ext4_ioend_wq(inode); 61 wait_queue_head_t *wq = ext4_ioend_wq(inode);
62 62
63 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); 63 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
64 } 64 }
65 65
66 /* 66 /*
67 * This tests whether the IO in question is block-aligned or not. 67 * This tests whether the IO in question is block-aligned or not.
68 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they 68 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
69 * are converted to written only after the IO is complete. Until they are 69 * are converted to written only after the IO is complete. Until they are
70 * mapped, these blocks appear as holes, so dio_zero_block() will assume that 70 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
71 * it needs to zero out portions of the start and/or end block. If 2 AIO 71 * it needs to zero out portions of the start and/or end block. If 2 AIO
72 * threads are at work on the same unwritten block, they must be synchronized 72 * threads are at work on the same unwritten block, they must be synchronized
73 * or one thread will zero the other's data, causing corruption. 73 * or one thread will zero the other's data, causing corruption.
74 */ 74 */
75 static int 75 static int
76 ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, 76 ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
77 unsigned long nr_segs, loff_t pos) 77 unsigned long nr_segs, loff_t pos)
78 { 78 {
79 struct super_block *sb = inode->i_sb; 79 struct super_block *sb = inode->i_sb;
80 int blockmask = sb->s_blocksize - 1; 80 int blockmask = sb->s_blocksize - 1;
81 size_t count = iov_length(iov, nr_segs); 81 size_t count = iov_length(iov, nr_segs);
82 loff_t final_size = pos + count; 82 loff_t final_size = pos + count;
83 83
84 if (pos >= inode->i_size) 84 if (pos >= inode->i_size)
85 return 0; 85 return 0;
86 86
87 if ((pos & blockmask) || (final_size & blockmask)) 87 if ((pos & blockmask) || (final_size & blockmask))
88 return 1; 88 return 1;
89 89
90 return 0; 90 return 0;
91 } 91 }
92 92
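A quick user-space check of the predicate above, assuming a 4096-byte block size and made-up pos/count values:

    #include <stdio.h>

    int main(void)
    {
        long long blocksize = 4096, blockmask = blocksize - 1;
        long long pos = 4096, count = 512;
        long long final_size = pos + count;

        /* pos is block-aligned but final_size is not, so this
         * I/O counts as unaligned */
        int unaligned = (pos & blockmask) || (final_size & blockmask);
        printf("unaligned = %d\n", unaligned);
        return 0;
    }

Here the write starts on a block boundary but ends 512 bytes into a block, so it would take the serialized path guarded by ext4_aio_mutex in the caller below.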
93 static ssize_t 93 static ssize_t
94 ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, 94 ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
95 unsigned long nr_segs, loff_t pos) 95 unsigned long nr_segs, loff_t pos)
96 { 96 {
97 struct file *file = iocb->ki_filp; 97 struct file *file = iocb->ki_filp;
98 struct inode *inode = file->f_mapping->host; 98 struct inode *inode = file->f_mapping->host;
99 struct blk_plug plug; 99 struct blk_plug plug;
100 int unaligned_aio = 0; 100 int unaligned_aio = 0;
101 ssize_t ret; 101 ssize_t ret;
102 int overwrite = 0; 102 int overwrite = 0;
103 size_t length = iov_length(iov, nr_segs); 103 size_t length = iov_length(iov, nr_segs);
104 104
105 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && 105 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
106 !is_sync_kiocb(iocb)) 106 !is_sync_kiocb(iocb))
107 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); 107 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
108 108
109 /* Unaligned direct AIO must be serialized; see comment above */ 109 /* Unaligned direct AIO must be serialized; see comment above */
110 if (unaligned_aio) { 110 if (unaligned_aio) {
111 static unsigned long unaligned_warn_time; 111 static unsigned long unaligned_warn_time;
112 112
113 /* Warn about this once per day */ 113 /* Warn about this once per day */
114 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) 114 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
115 ext4_msg(inode->i_sb, KERN_WARNING, 115 ext4_msg(inode->i_sb, KERN_WARNING,
116 "Unaligned AIO/DIO on inode %ld by %s; " 116 "Unaligned AIO/DIO on inode %ld by %s; "
117 "performance will be poor.", 117 "performance will be poor.",
118 inode->i_ino, current->comm); 118 inode->i_ino, current->comm);
119 mutex_lock(ext4_aio_mutex(inode)); 119 mutex_lock(ext4_aio_mutex(inode));
120 ext4_unwritten_wait(inode); 120 ext4_unwritten_wait(inode);
121 } 121 }
122 122
123 BUG_ON(iocb->ki_pos != pos); 123 BUG_ON(iocb->ki_pos != pos);
124 124
125 mutex_lock(&inode->i_mutex); 125 mutex_lock(&inode->i_mutex);
126 blk_start_plug(&plug); 126 blk_start_plug(&plug);
127 127
128 iocb->private = &overwrite; 128 iocb->private = &overwrite;
129 129
130 /* check whether we do a DIO overwrite or not */ 130 /* check whether we do a DIO overwrite or not */
131 if (ext4_should_dioread_nolock(inode) && !unaligned_aio && 131 if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
132 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { 132 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
133 struct ext4_map_blocks map; 133 struct ext4_map_blocks map;
134 unsigned int blkbits = inode->i_blkbits; 134 unsigned int blkbits = inode->i_blkbits;
135 int err, len; 135 int err, len;
136 136
137 map.m_lblk = pos >> blkbits; 137 map.m_lblk = pos >> blkbits;
138 map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) 138 map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
139 - map.m_lblk; 139 - map.m_lblk;
140 len = map.m_len; 140 len = map.m_len;
141 141
142 err = ext4_map_blocks(NULL, inode, &map, 0); 142 err = ext4_map_blocks(NULL, inode, &map, 0);
143 /* 143 /*
144 * 'err == len' means that all of the blocks have been preallocated, 144 * 'err == len' means that all of the blocks have been preallocated,
145 * whether or not they are initialized. To exclude uninitialized 145 * whether or not they are initialized. To exclude uninitialized
146 * extents, we need to check m_flags. There are two conditions 146 * extents, we need to check m_flags. There are two conditions
147 * that indicate an initialized extent: 147 * that indicate an initialized extent:
148 * 1) If we hit the extent cache, the EXT4_MAP_MAPPED flag is returned; 148 * 1) If we hit the extent cache, the EXT4_MAP_MAPPED flag is returned;
149 * 2) If we do a real lookup, no flags are returned. 149 * 2) If we do a real lookup, no flags are returned.
150 * So we should check both of these conditions. 150 * So we should check both of these conditions.
151 */ 151 */
152 if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) 152 if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
153 overwrite = 1; 153 overwrite = 1;
154 } 154 }
155 155
156 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 156 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
157 mutex_unlock(&inode->i_mutex); 157 mutex_unlock(&inode->i_mutex);
158 158
159 if (ret > 0 || ret == -EIOCBQUEUED) { 159 if (ret > 0 || ret == -EIOCBQUEUED) {
160 ssize_t err; 160 ssize_t err;
161 161
162 err = generic_write_sync(file, pos, ret); 162 err = generic_write_sync(file, pos, ret);
163 if (err < 0 && ret > 0) 163 if (err < 0 && ret > 0)
164 ret = err; 164 ret = err;
165 } 165 }
166 blk_finish_plug(&plug); 166 blk_finish_plug(&plug);
167 167
168 if (unaligned_aio) 168 if (unaligned_aio)
169 mutex_unlock(ext4_aio_mutex(inode)); 169 mutex_unlock(ext4_aio_mutex(inode));
170 170
171 return ret; 171 return ret;
172 } 172 }
173 173
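For callers, the practical upshot of the serialization above is to keep direct IO block-aligned. A hedged userspace sketch (the 4096-byte alignment and the "testfile" name are assumptions; O_DIRECT generally also wants an aligned buffer, hence posix_memalign()):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            void *buf;
            int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);

            if (fd < 0)
                    return 1;
            if (posix_memalign(&buf, 4096, 4096))   /* aligned buffer */
                    return 1;
            memset(buf, 0, 4096);
            /* Offset and length are block-aligned: avoids the slow path. */
            pwrite(fd, buf, 4096, 0);
            free(buf);
            close(fd);
            return 0;
    }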
174 static ssize_t 174 static ssize_t
175 ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 175 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
176 unsigned long nr_segs, loff_t pos) 176 unsigned long nr_segs, loff_t pos)
177 { 177 {
178 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 178 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
179 ssize_t ret; 179 ssize_t ret;
180 180
181 /* 181 /*
182 * If we have encountered a bitmap-format file, the size limit 182 * If we have encountered a bitmap-format file, the size limit
183 * is smaller than s_maxbytes, which is for extent-mapped files. 183 * is smaller than s_maxbytes, which is for extent-mapped files.
184 */ 184 */
185 185
186 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 186 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
187 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 187 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
188 size_t length = iov_length(iov, nr_segs); 188 size_t length = iov_length(iov, nr_segs);
189 189
190 if ((pos > sbi->s_bitmap_maxbytes || 190 if ((pos > sbi->s_bitmap_maxbytes ||
191 (pos == sbi->s_bitmap_maxbytes && length > 0))) 191 (pos == sbi->s_bitmap_maxbytes && length > 0)))
192 return -EFBIG; 192 return -EFBIG;
193 193
194 if (pos + length > sbi->s_bitmap_maxbytes) { 194 if (pos + length > sbi->s_bitmap_maxbytes) {
195 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 195 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
196 sbi->s_bitmap_maxbytes - pos); 196 sbi->s_bitmap_maxbytes - pos);
197 } 197 }
198 } 198 }
199 199
200 if (unlikely(iocb->ki_filp->f_flags & O_DIRECT)) 200 if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
201 ret = ext4_file_dio_write(iocb, iov, nr_segs, pos); 201 ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
202 else 202 else
203 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 203 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
204 204
205 return ret; 205 return ret;
206 } 206 }
207 207
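iov_shorten() trims the iovec array so the total length fits inside the remaining room below s_bitmap_maxbytes. A simplified sketch of the idea (not the kernel implementation): keep whole segments while they fit and truncate the segment that crosses the byte budget.

    #include <stdio.h>
    #include <sys/uio.h>

    /* Returns the new segment count; iov is modified in place. */
    static unsigned long shorten_iov(struct iovec *iov, unsigned long nr_segs,
                                     size_t to)
    {
            unsigned long seg = 0;
            size_t len = 0;

            while (seg < nr_segs) {
                    seg++;
                    if (len + iov->iov_len >= to) {
                            iov->iov_len = to - len;
                            break;
                    }
                    len += iov->iov_len;
                    iov++;
            }
            return seg;
    }

    int main(void)
    {
            char a[100], b[100];
            struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

            /* Budget of 150 bytes: keeps segment 0, truncates segment 1 to 50. */
            printf("%lu %zu\n", shorten_iov(iov, 2, 150), iov[1].iov_len);
            return 0;
    }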
208 static const struct vm_operations_struct ext4_file_vm_ops = { 208 static const struct vm_operations_struct ext4_file_vm_ops = {
209 .fault = filemap_fault, 209 .fault = filemap_fault,
210 .page_mkwrite = ext4_page_mkwrite, 210 .page_mkwrite = ext4_page_mkwrite,
211 .remap_pages = generic_file_remap_pages, 211 .remap_pages = generic_file_remap_pages,
212 }; 212 };
213 213
214 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 214 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
215 { 215 {
216 struct address_space *mapping = file->f_mapping; 216 struct address_space *mapping = file->f_mapping;
217 217
218 if (!mapping->a_ops->readpage) 218 if (!mapping->a_ops->readpage)
219 return -ENOEXEC; 219 return -ENOEXEC;
220 file_accessed(file); 220 file_accessed(file);
221 vma->vm_ops = &ext4_file_vm_ops; 221 vma->vm_ops = &ext4_file_vm_ops;
222 return 0; 222 return 0;
223 } 223 }
224 224
225 static int ext4_file_open(struct inode * inode, struct file * filp) 225 static int ext4_file_open(struct inode * inode, struct file * filp)
226 { 226 {
227 struct super_block *sb = inode->i_sb; 227 struct super_block *sb = inode->i_sb;
228 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 228 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
229 struct ext4_inode_info *ei = EXT4_I(inode); 229 struct ext4_inode_info *ei = EXT4_I(inode);
230 struct vfsmount *mnt = filp->f_path.mnt; 230 struct vfsmount *mnt = filp->f_path.mnt;
231 struct path path; 231 struct path path;
232 char buf[64], *cp; 232 char buf[64], *cp;
233 233
234 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && 234 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
235 !(sb->s_flags & MS_RDONLY))) { 235 !(sb->s_flags & MS_RDONLY))) {
236 sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; 236 sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
237 /* 237 /*
238 * Sample where the filesystem has been mounted and 238 * Sample where the filesystem has been mounted and
239 * store it in the superblock for sysadmin convenience 239 * store it in the superblock for sysadmin convenience
240 * when trying to sort through large numbers of block 240 * when trying to sort through large numbers of block
241 * devices or filesystem images. 241 * devices or filesystem images.
242 */ 242 */
243 memset(buf, 0, sizeof(buf)); 243 memset(buf, 0, sizeof(buf));
244 path.mnt = mnt; 244 path.mnt = mnt;
245 path.dentry = mnt->mnt_root; 245 path.dentry = mnt->mnt_root;
246 cp = d_path(&path, buf, sizeof(buf)); 246 cp = d_path(&path, buf, sizeof(buf));
247 if (!IS_ERR(cp)) { 247 if (!IS_ERR(cp)) {
248 handle_t *handle; 248 handle_t *handle;
249 int err; 249 int err;
250 250
251 handle = ext4_journal_start_sb(sb, 1); 251 handle = ext4_journal_start_sb(sb, 1);
252 if (IS_ERR(handle)) 252 if (IS_ERR(handle))
253 return PTR_ERR(handle); 253 return PTR_ERR(handle);
254 err = ext4_journal_get_write_access(handle, sbi->s_sbh); 254 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
255 if (err) { 255 if (err) {
256 ext4_journal_stop(handle); 256 ext4_journal_stop(handle);
257 return err; 257 return err;
258 } 258 }
259 strlcpy(sbi->s_es->s_last_mounted, cp, 259 strlcpy(sbi->s_es->s_last_mounted, cp,
260 sizeof(sbi->s_es->s_last_mounted)); 260 sizeof(sbi->s_es->s_last_mounted));
261 ext4_handle_dirty_super(handle, sb); 261 ext4_handle_dirty_super(handle, sb);
262 ext4_journal_stop(handle); 262 ext4_journal_stop(handle);
263 } 263 }
264 } 264 }
265 /* 265 /*
266 * Set up the jbd2_inode if we are opening the inode for 266 * Set up the jbd2_inode if we are opening the inode for
267 * writing and the journal is present 267 * writing and the journal is present
268 */ 268 */
269 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { 269 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
270 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); 270 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
271 271
272 spin_lock(&inode->i_lock); 272 spin_lock(&inode->i_lock);
273 if (!ei->jinode) { 273 if (!ei->jinode) {
274 if (!jinode) { 274 if (!jinode) {
275 spin_unlock(&inode->i_lock); 275 spin_unlock(&inode->i_lock);
276 return -ENOMEM; 276 return -ENOMEM;
277 } 277 }
278 ei->jinode = jinode; 278 ei->jinode = jinode;
279 jbd2_journal_init_jbd_inode(ei->jinode, inode); 279 jbd2_journal_init_jbd_inode(ei->jinode, inode);
280 jinode = NULL; 280 jinode = NULL;
281 } 281 }
282 spin_unlock(&inode->i_lock); 282 spin_unlock(&inode->i_lock);
283 if (unlikely(jinode != NULL)) 283 if (unlikely(jinode != NULL))
284 jbd2_free_inode(jinode); 284 jbd2_free_inode(jinode);
285 } 285 }
286 return dquot_file_open(inode, filp); 286 return dquot_file_open(inode, filp);
287 } 287 }
288 288
289 /* 289 /*
290 * Here we use ext4_map_blocks() rather than ext4_ext_walk_space() to get 290 * Here we use ext4_map_blocks() rather than ext4_ext_walk_space() to get
291 * a block mapping for an extent-based file, because this lets us handle 291 * a block mapping for an extent-based file, because this lets us handle
292 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same 292 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
293 * function. Once the extent status tree has been fully implemented, it 293 * function. Once the extent status tree has been fully implemented, it
294 * will track all extent status for a file, and we can use it directly to 294 * will track all extent status for a file, and we can use it directly to
295 * retrieve the offset for SEEK_DATA/SEEK_HOLE. 295 * retrieve the offset for SEEK_DATA/SEEK_HOLE.
296 */ 296 */
297 297
298 /* 298 /*
299 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look 299 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look
300 * up the page cache to check whether there is any data between 300 * up the page cache to check whether there is any data between
301 * [startoff, endoff], because if this range contains an unwritten extent, 301 * [startoff, endoff], because if this range contains an unwritten extent,
302 * we treat the extent as data or as a hole according to whether the 302 * we treat the extent as data or as a hole according to whether the
303 * page cache has data or not. 303 * page cache has data or not.
304 */ 304 */
305 static int ext4_find_unwritten_pgoff(struct inode *inode, 305 static int ext4_find_unwritten_pgoff(struct inode *inode,
306 int origin, 306 int whence,
307 struct ext4_map_blocks *map, 307 struct ext4_map_blocks *map,
308 loff_t *offset) 308 loff_t *offset)
309 { 309 {
310 struct pagevec pvec; 310 struct pagevec pvec;
311 unsigned int blkbits; 311 unsigned int blkbits;
312 pgoff_t index; 312 pgoff_t index;
313 pgoff_t end; 313 pgoff_t end;
314 loff_t endoff; 314 loff_t endoff;
315 loff_t startoff; 315 loff_t startoff;
316 loff_t lastoff; 316 loff_t lastoff;
317 int found = 0; 317 int found = 0;
318 318
319 blkbits = inode->i_sb->s_blocksize_bits; 319 blkbits = inode->i_sb->s_blocksize_bits;
320 startoff = *offset; 320 startoff = *offset;
321 lastoff = startoff; 321 lastoff = startoff;
322 endoff = (map->m_lblk + map->m_len) << blkbits; 322 endoff = (map->m_lblk + map->m_len) << blkbits;
323 323
324 index = startoff >> PAGE_CACHE_SHIFT; 324 index = startoff >> PAGE_CACHE_SHIFT;
325 end = endoff >> PAGE_CACHE_SHIFT; 325 end = endoff >> PAGE_CACHE_SHIFT;
326 326
327 pagevec_init(&pvec, 0); 327 pagevec_init(&pvec, 0);
328 do { 328 do {
329 int i, num; 329 int i, num;
330 unsigned long nr_pages; 330 unsigned long nr_pages;
331 331
332 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); 332 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
333 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, 333 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
334 (pgoff_t)num); 334 (pgoff_t)num);
335 if (nr_pages == 0) { 335 if (nr_pages == 0) {
336 if (origin == SEEK_DATA) 336 if (whence == SEEK_DATA)
337 break; 337 break;
338 338
339 BUG_ON(origin != SEEK_HOLE); 339 BUG_ON(whence != SEEK_HOLE);
340 /* 340 /*
341 * If this is the first pass through the loop and the 341 * If this is the first pass through the loop and the
342 * offset is not beyond the end offset, there is a 342 * offset is not beyond the end offset, there is a
343 * hole at this offset. 343 * hole at this offset.
344 */ 344 */
345 if (lastoff == startoff || lastoff < endoff) 345 if (lastoff == startoff || lastoff < endoff)
346 found = 1; 346 found = 1;
347 break; 347 break;
348 } 348 }
349 349
350 /* 350 /*
351 * If this is the first pass through the loop and the 351 * If this is the first pass through the loop and the
352 * offset is smaller than the first page offset, there is a 352 * offset is smaller than the first page offset, there is a
353 * hole at this offset. 353 * hole at this offset.
354 */ 354 */
355 if (lastoff == startoff && origin == SEEK_HOLE && 355 if (lastoff == startoff && whence == SEEK_HOLE &&
356 lastoff < page_offset(pvec.pages[0])) { 356 lastoff < page_offset(pvec.pages[0])) {
357 found = 1; 357 found = 1;
358 break; 358 break;
359 } 359 }
360 360
361 for (i = 0; i < nr_pages; i++) { 361 for (i = 0; i < nr_pages; i++) {
362 struct page *page = pvec.pages[i]; 362 struct page *page = pvec.pages[i];
363 struct buffer_head *bh, *head; 363 struct buffer_head *bh, *head;
364 364
365 /* 365 /*
366 * If the current offset is not beyond the end of the given 366 * If the current offset is not beyond the end of the given
367 * range, it is a hole. 367 * range, it is a hole.
368 */ 368 */
369 if (lastoff < endoff && origin == SEEK_HOLE && 369 if (lastoff < endoff && whence == SEEK_HOLE &&
370 page->index > end) { 370 page->index > end) {
371 found = 1; 371 found = 1;
372 *offset = lastoff; 372 *offset = lastoff;
373 goto out; 373 goto out;
374 } 374 }
375 375
376 lock_page(page); 376 lock_page(page);
377 377
378 if (unlikely(page->mapping != inode->i_mapping)) { 378 if (unlikely(page->mapping != inode->i_mapping)) {
379 unlock_page(page); 379 unlock_page(page);
380 continue; 380 continue;
381 } 381 }
382 382
383 if (!page_has_buffers(page)) { 383 if (!page_has_buffers(page)) {
384 unlock_page(page); 384 unlock_page(page);
385 continue; 385 continue;
386 } 386 }
387 387
388 if (page_has_buffers(page)) { 388 if (page_has_buffers(page)) {
389 lastoff = page_offset(page); 389 lastoff = page_offset(page);
390 bh = head = page_buffers(page); 390 bh = head = page_buffers(page);
391 do { 391 do {
392 if (buffer_uptodate(bh) || 392 if (buffer_uptodate(bh) ||
393 buffer_unwritten(bh)) { 393 buffer_unwritten(bh)) {
394 if (origin == SEEK_DATA) 394 if (whence == SEEK_DATA)
395 found = 1; 395 found = 1;
396 } else { 396 } else {
397 if (origin == SEEK_HOLE) 397 if (whence == SEEK_HOLE)
398 found = 1; 398 found = 1;
399 } 399 }
400 if (found) { 400 if (found) {
401 *offset = max_t(loff_t, 401 *offset = max_t(loff_t,
402 startoff, lastoff); 402 startoff, lastoff);
403 unlock_page(page); 403 unlock_page(page);
404 goto out; 404 goto out;
405 } 405 }
406 lastoff += bh->b_size; 406 lastoff += bh->b_size;
407 bh = bh->b_this_page; 407 bh = bh->b_this_page;
408 } while (bh != head); 408 } while (bh != head);
409 } 409 }
410 410
411 lastoff = page_offset(page) + PAGE_SIZE; 411 lastoff = page_offset(page) + PAGE_SIZE;
412 unlock_page(page); 412 unlock_page(page);
413 } 413 }
414 414
415 /* 415 /*
416 * Fewer pages were found than we asked for; that indicates a 416 * Fewer pages were found than we asked for; that indicates a
417 * hole in the range. 417 * hole in the range.
418 */ 418 */
419 if (nr_pages < num && origin == SEEK_HOLE) { 419 if (nr_pages < num && whence == SEEK_HOLE) {
420 found = 1; 420 found = 1;
421 *offset = lastoff; 421 *offset = lastoff;
422 break; 422 break;
423 } 423 }
424 424
425 index = pvec.pages[i - 1]->index + 1; 425 index = pvec.pages[i - 1]->index + 1;
426 pagevec_release(&pvec); 426 pagevec_release(&pvec);
427 } while (index <= end); 427 } while (index <= end);
428 428
429 out: 429 out:
430 pagevec_release(&pvec); 430 pagevec_release(&pvec);
431 return found; 431 return found;
432 } 432 }
433 433
434 /* 434 /*
435 * ext4_seek_data() retrieves the offset for SEEK_DATA. 435 * ext4_seek_data() retrieves the offset for SEEK_DATA.
436 */ 436 */
437 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 437 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
438 { 438 {
439 struct inode *inode = file->f_mapping->host; 439 struct inode *inode = file->f_mapping->host;
440 struct ext4_map_blocks map; 440 struct ext4_map_blocks map;
441 struct extent_status es; 441 struct extent_status es;
442 ext4_lblk_t start, last, end; 442 ext4_lblk_t start, last, end;
443 loff_t dataoff, isize; 443 loff_t dataoff, isize;
444 int blkbits; 444 int blkbits;
445 int ret = 0; 445 int ret = 0;
446 446
447 mutex_lock(&inode->i_mutex); 447 mutex_lock(&inode->i_mutex);
448 448
449 isize = i_size_read(inode); 449 isize = i_size_read(inode);
450 if (offset >= isize) { 450 if (offset >= isize) {
451 mutex_unlock(&inode->i_mutex); 451 mutex_unlock(&inode->i_mutex);
452 return -ENXIO; 452 return -ENXIO;
453 } 453 }
454 454
455 blkbits = inode->i_sb->s_blocksize_bits; 455 blkbits = inode->i_sb->s_blocksize_bits;
456 start = offset >> blkbits; 456 start = offset >> blkbits;
457 last = start; 457 last = start;
458 end = isize >> blkbits; 458 end = isize >> blkbits;
459 dataoff = offset; 459 dataoff = offset;
460 460
461 do { 461 do {
462 map.m_lblk = last; 462 map.m_lblk = last;
463 map.m_len = end - last + 1; 463 map.m_len = end - last + 1;
464 ret = ext4_map_blocks(NULL, inode, &map, 0); 464 ret = ext4_map_blocks(NULL, inode, &map, 0);
465 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 465 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
466 if (last != start) 466 if (last != start)
467 dataoff = last << blkbits; 467 dataoff = last << blkbits;
468 break; 468 break;
469 } 469 }
470 470
471 /* 471 /*
472 * If there is a delayed extent at this offset, 472 * If there is a delayed extent at this offset,
473 * it is treated as data. 473 * it is treated as data.
474 */ 474 */
475 es.start = last; 475 es.start = last;
476 (void)ext4_es_find_extent(inode, &es); 476 (void)ext4_es_find_extent(inode, &es);
477 if (last >= es.start && 477 if (last >= es.start &&
478 last < es.start + es.len) { 478 last < es.start + es.len) {
479 if (last != start) 479 if (last != start)
480 dataoff = last << blkbits; 480 dataoff = last << blkbits;
481 break; 481 break;
482 } 482 }
483 483
484 /* 484 /*
485 * If there is an unwritten extent at this offset, 485 * If there is an unwritten extent at this offset,
486 * it is treated as data or as a hole according to 486 * it is treated as data or as a hole according to
487 * whether the page cache has data or not. 487 * whether the page cache has data or not.
488 */ 488 */
489 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 489 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
490 int unwritten; 490 int unwritten;
491 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 491 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
492 &map, &dataoff); 492 &map, &dataoff);
493 if (unwritten) 493 if (unwritten)
494 break; 494 break;
495 } 495 }
496 496
497 last++; 497 last++;
498 dataoff = last << blkbits; 498 dataoff = last << blkbits;
499 } while (last <= end); 499 } while (last <= end);
500 500
501 mutex_unlock(&inode->i_mutex); 501 mutex_unlock(&inode->i_mutex);
502 502
503 if (dataoff > isize) 503 if (dataoff > isize)
504 return -ENXIO; 504 return -ENXIO;
505 505
506 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 506 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
507 return -EINVAL; 507 return -EINVAL;
508 if (dataoff > maxsize) 508 if (dataoff > maxsize)
509 return -EINVAL; 509 return -EINVAL;
510 510
511 if (dataoff != file->f_pos) { 511 if (dataoff != file->f_pos) {
512 file->f_pos = dataoff; 512 file->f_pos = dataoff;
513 file->f_version = 0; 513 file->f_version = 0;
514 } 514 }
515 515
516 return dataoff; 516 return dataoff;
517 } 517 }
518 518
519 /* 519 /*
520 * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 520 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
521 */ 521 */
522 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 522 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
523 { 523 {
524 struct inode *inode = file->f_mapping->host; 524 struct inode *inode = file->f_mapping->host;
525 struct ext4_map_blocks map; 525 struct ext4_map_blocks map;
526 struct extent_status es; 526 struct extent_status es;
527 ext4_lblk_t start, last, end; 527 ext4_lblk_t start, last, end;
528 loff_t holeoff, isize; 528 loff_t holeoff, isize;
529 int blkbits; 529 int blkbits;
530 int ret = 0; 530 int ret = 0;
531 531
532 mutex_lock(&inode->i_mutex); 532 mutex_lock(&inode->i_mutex);
533 533
534 isize = i_size_read(inode); 534 isize = i_size_read(inode);
535 if (offset >= isize) { 535 if (offset >= isize) {
536 mutex_unlock(&inode->i_mutex); 536 mutex_unlock(&inode->i_mutex);
537 return -ENXIO; 537 return -ENXIO;
538 } 538 }
539 539
540 blkbits = inode->i_sb->s_blocksize_bits; 540 blkbits = inode->i_sb->s_blocksize_bits;
541 start = offset >> blkbits; 541 start = offset >> blkbits;
542 last = start; 542 last = start;
543 end = isize >> blkbits; 543 end = isize >> blkbits;
544 holeoff = offset; 544 holeoff = offset;
545 545
546 do { 546 do {
547 map.m_lblk = last; 547 map.m_lblk = last;
548 map.m_len = end - last + 1; 548 map.m_len = end - last + 1;
549 ret = ext4_map_blocks(NULL, inode, &map, 0); 549 ret = ext4_map_blocks(NULL, inode, &map, 0);
550 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 550 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
551 last += ret; 551 last += ret;
552 holeoff = last << blkbits; 552 holeoff = last << blkbits;
553 continue; 553 continue;
554 } 554 }
555 555
556 /* 556 /*
557 * If there is a delayed extent at this offset, 557 * If there is a delayed extent at this offset,
558 * we skip this extent. 558 * we skip this extent.
559 */ 559 */
560 es.start = last; 560 es.start = last;
561 (void)ext4_es_find_extent(inode, &es); 561 (void)ext4_es_find_extent(inode, &es);
562 if (last >= es.start && 562 if (last >= es.start &&
563 last < es.start + es.len) { 563 last < es.start + es.len) {
564 last = es.start + es.len; 564 last = es.start + es.len;
565 holeoff = last << blkbits; 565 holeoff = last << blkbits;
566 continue; 566 continue;
567 } 567 }
568 568
569 /* 569 /*
570 * If there is an unwritten extent at this offset, 570 * If there is an unwritten extent at this offset,
571 * it is treated as data or as a hole according to 571 * it is treated as data or as a hole according to
572 * whether the page cache has data or not. 572 * whether the page cache has data or not.
573 */ 573 */
574 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 574 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
575 int unwritten; 575 int unwritten;
576 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 576 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
577 &map, &holeoff); 577 &map, &holeoff);
578 if (!unwritten) { 578 if (!unwritten) {
579 last += ret; 579 last += ret;
580 holeoff = last << blkbits; 580 holeoff = last << blkbits;
581 continue; 581 continue;
582 } 582 }
583 } 583 }
584 584
585 /* found a hole */ 585 /* found a hole */
586 break; 586 break;
587 } while (last <= end); 587 } while (last <= end);
588 588
589 mutex_unlock(&inode->i_mutex); 589 mutex_unlock(&inode->i_mutex);
590 590
591 if (holeoff > isize) 591 if (holeoff > isize)
592 holeoff = isize; 592 holeoff = isize;
593 593
594 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 594 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
595 return -EINVAL; 595 return -EINVAL;
596 if (holeoff > maxsize) 596 if (holeoff > maxsize)
597 return -EINVAL; 597 return -EINVAL;
598 598
599 if (holeoff != file->f_pos) { 599 if (holeoff != file->f_pos) {
600 file->f_pos = holeoff; 600 file->f_pos = holeoff;
601 file->f_version = 0; 601 file->f_version = 0;
602 } 602 }
603 603
604 return holeoff; 604 return holeoff;
605 } 605 }
606 606
607 /* 607 /*
608 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values 608 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
609 * by calling generic_file_llseek_size() with the appropriate maxbytes 609 * by calling generic_file_llseek_size() with the appropriate maxbytes
610 * value for each. 610 * value for each.
611 */ 611 */
612 loff_t ext4_llseek(struct file *file, loff_t offset, int origin) 612 loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
613 { 613 {
614 struct inode *inode = file->f_mapping->host; 614 struct inode *inode = file->f_mapping->host;
615 loff_t maxbytes; 615 loff_t maxbytes;
616 616
617 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 617 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
618 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; 618 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
619 else 619 else
620 maxbytes = inode->i_sb->s_maxbytes; 620 maxbytes = inode->i_sb->s_maxbytes;
621 621
622 switch (origin) { 622 switch (whence) {
623 case SEEK_SET: 623 case SEEK_SET:
624 case SEEK_CUR: 624 case SEEK_CUR:
625 case SEEK_END: 625 case SEEK_END:
626 return generic_file_llseek_size(file, offset, origin, 626 return generic_file_llseek_size(file, offset, whence,
627 maxbytes, i_size_read(inode)); 627 maxbytes, i_size_read(inode));
628 case SEEK_DATA: 628 case SEEK_DATA:
629 return ext4_seek_data(file, offset, maxbytes); 629 return ext4_seek_data(file, offset, maxbytes);
630 case SEEK_HOLE: 630 case SEEK_HOLE:
631 return ext4_seek_hole(file, offset, maxbytes); 631 return ext4_seek_hole(file, offset, maxbytes);
632 } 632 }
633 633
634 return -EINVAL; 634 return -EINVAL;
635 } 635 }
636 636
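From userspace, SEEK_DATA and SEEK_HOLE are simply two more "whence" values for lseek(2), alongside SEEK_SET/SEEK_CUR/SEEK_END. A small sketch that walks the data regions of a file (error handling kept minimal):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            off_t data = 0, hole;
            int fd;

            if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;
            /* SEEK_DATA fails with ENXIO once no data remains past the offset. */
            while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
                    hole = lseek(fd, data, SEEK_HOLE);
                    printf("data: [%lld, %lld)\n", (long long)data, (long long)hole);
                    data = hole;
            }
            close(fd);
            return 0;
    }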
637 const struct file_operations ext4_file_operations = { 637 const struct file_operations ext4_file_operations = {
638 .llseek = ext4_llseek, 638 .llseek = ext4_llseek,
639 .read = do_sync_read, 639 .read = do_sync_read,
640 .write = do_sync_write, 640 .write = do_sync_write,
641 .aio_read = generic_file_aio_read, 641 .aio_read = generic_file_aio_read,
642 .aio_write = ext4_file_write, 642 .aio_write = ext4_file_write,
643 .unlocked_ioctl = ext4_ioctl, 643 .unlocked_ioctl = ext4_ioctl,
644 #ifdef CONFIG_COMPAT 644 #ifdef CONFIG_COMPAT
645 .compat_ioctl = ext4_compat_ioctl, 645 .compat_ioctl = ext4_compat_ioctl,
646 #endif 646 #endif
647 .mmap = ext4_file_mmap, 647 .mmap = ext4_file_mmap,
648 .open = ext4_file_open, 648 .open = ext4_file_open,
649 .release = ext4_release_file, 649 .release = ext4_release_file,
650 .fsync = ext4_sync_file, 650 .fsync = ext4_sync_file,
651 .splice_read = generic_file_splice_read, 651 .splice_read = generic_file_splice_read,
652 .splice_write = generic_file_splice_write, 652 .splice_write = generic_file_splice_write,
653 .fallocate = ext4_fallocate, 653 .fallocate = ext4_fallocate,
654 }; 654 };
655 655
656 const struct inode_operations ext4_file_inode_operations = { 656 const struct inode_operations ext4_file_inode_operations = {
657 .setattr = ext4_setattr, 657 .setattr = ext4_setattr,
658 .getattr = ext4_getattr, 658 .getattr = ext4_getattr,
659 .setxattr = generic_setxattr, 659 .setxattr = generic_setxattr,
660 .getxattr = generic_getxattr, 660 .getxattr = generic_getxattr,
661 .listxattr = ext4_listxattr, 661 .listxattr = ext4_listxattr,
662 .removexattr = generic_removexattr, 662 .removexattr = generic_removexattr,
663 .get_acl = ext4_get_acl, 663 .get_acl = ext4_get_acl,
664 .fiemap = ext4_fiemap, 664 .fiemap = ext4_fiemap,
665 }; 665 };
666 666
667 667
1 /* 1 /*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
7 */ 7 */
8 8
9 #include "fuse_i.h" 9 #include "fuse_i.h"
10 10
11 #include <linux/pagemap.h> 11 #include <linux/pagemap.h>
12 #include <linux/slab.h> 12 #include <linux/slab.h>
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/sched.h> 14 #include <linux/sched.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/compat.h> 16 #include <linux/compat.h>
17 #include <linux/swap.h> 17 #include <linux/swap.h>
18 18
19 static const struct file_operations fuse_direct_io_file_operations; 19 static const struct file_operations fuse_direct_io_file_operations;
20 20
21 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 21 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
22 int opcode, struct fuse_open_out *outargp) 22 int opcode, struct fuse_open_out *outargp)
23 { 23 {
24 struct fuse_open_in inarg; 24 struct fuse_open_in inarg;
25 struct fuse_req *req; 25 struct fuse_req *req;
26 int err; 26 int err;
27 27
28 req = fuse_get_req(fc); 28 req = fuse_get_req(fc);
29 if (IS_ERR(req)) 29 if (IS_ERR(req))
30 return PTR_ERR(req); 30 return PTR_ERR(req);
31 31
32 memset(&inarg, 0, sizeof(inarg)); 32 memset(&inarg, 0, sizeof(inarg));
33 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 33 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
34 if (!fc->atomic_o_trunc) 34 if (!fc->atomic_o_trunc)
35 inarg.flags &= ~O_TRUNC; 35 inarg.flags &= ~O_TRUNC;
36 req->in.h.opcode = opcode; 36 req->in.h.opcode = opcode;
37 req->in.h.nodeid = nodeid; 37 req->in.h.nodeid = nodeid;
38 req->in.numargs = 1; 38 req->in.numargs = 1;
39 req->in.args[0].size = sizeof(inarg); 39 req->in.args[0].size = sizeof(inarg);
40 req->in.args[0].value = &inarg; 40 req->in.args[0].value = &inarg;
41 req->out.numargs = 1; 41 req->out.numargs = 1;
42 req->out.args[0].size = sizeof(*outargp); 42 req->out.args[0].size = sizeof(*outargp);
43 req->out.args[0].value = outargp; 43 req->out.args[0].value = outargp;
44 fuse_request_send(fc, req); 44 fuse_request_send(fc, req);
45 err = req->out.h.error; 45 err = req->out.h.error;
46 fuse_put_request(fc, req); 46 fuse_put_request(fc, req);
47 47
48 return err; 48 return err;
49 } 49 }
50 50
51 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) 51 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
52 { 52 {
53 struct fuse_file *ff; 53 struct fuse_file *ff;
54 54
55 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 55 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
56 if (unlikely(!ff)) 56 if (unlikely(!ff))
57 return NULL; 57 return NULL;
58 58
59 ff->fc = fc; 59 ff->fc = fc;
60 ff->reserved_req = fuse_request_alloc(); 60 ff->reserved_req = fuse_request_alloc();
61 if (unlikely(!ff->reserved_req)) { 61 if (unlikely(!ff->reserved_req)) {
62 kfree(ff); 62 kfree(ff);
63 return NULL; 63 return NULL;
64 } 64 }
65 65
66 INIT_LIST_HEAD(&ff->write_entry); 66 INIT_LIST_HEAD(&ff->write_entry);
67 atomic_set(&ff->count, 0); 67 atomic_set(&ff->count, 0);
68 RB_CLEAR_NODE(&ff->polled_node); 68 RB_CLEAR_NODE(&ff->polled_node);
69 init_waitqueue_head(&ff->poll_wait); 69 init_waitqueue_head(&ff->poll_wait);
70 70
71 spin_lock(&fc->lock); 71 spin_lock(&fc->lock);
72 ff->kh = ++fc->khctr; 72 ff->kh = ++fc->khctr;
73 spin_unlock(&fc->lock); 73 spin_unlock(&fc->lock);
74 74
75 return ff; 75 return ff;
76 } 76 }
77 77
78 void fuse_file_free(struct fuse_file *ff) 78 void fuse_file_free(struct fuse_file *ff)
79 { 79 {
80 fuse_request_free(ff->reserved_req); 80 fuse_request_free(ff->reserved_req);
81 kfree(ff); 81 kfree(ff);
82 } 82 }
83 83
84 struct fuse_file *fuse_file_get(struct fuse_file *ff) 84 struct fuse_file *fuse_file_get(struct fuse_file *ff)
85 { 85 {
86 atomic_inc(&ff->count); 86 atomic_inc(&ff->count);
87 return ff; 87 return ff;
88 } 88 }
89 89
90 static void fuse_release_async(struct work_struct *work) 90 static void fuse_release_async(struct work_struct *work)
91 { 91 {
92 struct fuse_req *req; 92 struct fuse_req *req;
93 struct fuse_conn *fc; 93 struct fuse_conn *fc;
94 struct path path; 94 struct path path;
95 95
96 req = container_of(work, struct fuse_req, misc.release.work); 96 req = container_of(work, struct fuse_req, misc.release.work);
97 path = req->misc.release.path; 97 path = req->misc.release.path;
98 fc = get_fuse_conn(path.dentry->d_inode); 98 fc = get_fuse_conn(path.dentry->d_inode);
99 99
100 fuse_put_request(fc, req); 100 fuse_put_request(fc, req);
101 path_put(&path); 101 path_put(&path);
102 } 102 }
103 103
104 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 104 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
105 { 105 {
106 if (fc->destroy_req) { 106 if (fc->destroy_req) {
107 /* 107 /*
108 * If this is a fuseblk mount, then it's possible that 108 * If this is a fuseblk mount, then it's possible that
109 * releasing the path will result in releasing the 109 * releasing the path will result in releasing the
110 * super block and sending the DESTROY request. If 110 * super block and sending the DESTROY request. If
111 * the server is single threaded, this would hang. 111 * the server is single threaded, this would hang.
112 * For this reason do the path_put() in a separate 112 * For this reason do the path_put() in a separate
113 * thread. 113 * thread.
114 */ 114 */
115 atomic_inc(&req->count); 115 atomic_inc(&req->count);
116 INIT_WORK(&req->misc.release.work, fuse_release_async); 116 INIT_WORK(&req->misc.release.work, fuse_release_async);
117 schedule_work(&req->misc.release.work); 117 schedule_work(&req->misc.release.work);
118 } else { 118 } else {
119 path_put(&req->misc.release.path); 119 path_put(&req->misc.release.path);
120 } 120 }
121 } 121 }
122 122
123 static void fuse_file_put(struct fuse_file *ff, bool sync) 123 static void fuse_file_put(struct fuse_file *ff, bool sync)
124 { 124 {
125 if (atomic_dec_and_test(&ff->count)) { 125 if (atomic_dec_and_test(&ff->count)) {
126 struct fuse_req *req = ff->reserved_req; 126 struct fuse_req *req = ff->reserved_req;
127 127
128 if (sync) { 128 if (sync) {
129 fuse_request_send(ff->fc, req); 129 fuse_request_send(ff->fc, req);
130 path_put(&req->misc.release.path); 130 path_put(&req->misc.release.path);
131 fuse_put_request(ff->fc, req); 131 fuse_put_request(ff->fc, req);
132 } else { 132 } else {
133 req->end = fuse_release_end; 133 req->end = fuse_release_end;
134 fuse_request_send_background(ff->fc, req); 134 fuse_request_send_background(ff->fc, req);
135 } 135 }
136 kfree(ff); 136 kfree(ff);
137 } 137 }
138 } 138 }
139 139
140 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 140 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
141 bool isdir) 141 bool isdir)
142 { 142 {
143 struct fuse_open_out outarg; 143 struct fuse_open_out outarg;
144 struct fuse_file *ff; 144 struct fuse_file *ff;
145 int err; 145 int err;
146 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 146 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
147 147
148 ff = fuse_file_alloc(fc); 148 ff = fuse_file_alloc(fc);
149 if (!ff) 149 if (!ff)
150 return -ENOMEM; 150 return -ENOMEM;
151 151
152 err = fuse_send_open(fc, nodeid, file, opcode, &outarg); 152 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
153 if (err) { 153 if (err) {
154 fuse_file_free(ff); 154 fuse_file_free(ff);
155 return err; 155 return err;
156 } 156 }
157 157
158 if (isdir) 158 if (isdir)
159 outarg.open_flags &= ~FOPEN_DIRECT_IO; 159 outarg.open_flags &= ~FOPEN_DIRECT_IO;
160 160
161 ff->fh = outarg.fh; 161 ff->fh = outarg.fh;
162 ff->nodeid = nodeid; 162 ff->nodeid = nodeid;
163 ff->open_flags = outarg.open_flags; 163 ff->open_flags = outarg.open_flags;
164 file->private_data = fuse_file_get(ff); 164 file->private_data = fuse_file_get(ff);
165 165
166 return 0; 166 return 0;
167 } 167 }
168 EXPORT_SYMBOL_GPL(fuse_do_open); 168 EXPORT_SYMBOL_GPL(fuse_do_open);
169 169
170 void fuse_finish_open(struct inode *inode, struct file *file) 170 void fuse_finish_open(struct inode *inode, struct file *file)
171 { 171 {
172 struct fuse_file *ff = file->private_data; 172 struct fuse_file *ff = file->private_data;
173 struct fuse_conn *fc = get_fuse_conn(inode); 173 struct fuse_conn *fc = get_fuse_conn(inode);
174 174
175 if (ff->open_flags & FOPEN_DIRECT_IO) 175 if (ff->open_flags & FOPEN_DIRECT_IO)
176 file->f_op = &fuse_direct_io_file_operations; 176 file->f_op = &fuse_direct_io_file_operations;
177 if (!(ff->open_flags & FOPEN_KEEP_CACHE)) 177 if (!(ff->open_flags & FOPEN_KEEP_CACHE))
178 invalidate_inode_pages2(inode->i_mapping); 178 invalidate_inode_pages2(inode->i_mapping);
179 if (ff->open_flags & FOPEN_NONSEEKABLE) 179 if (ff->open_flags & FOPEN_NONSEEKABLE)
180 nonseekable_open(inode, file); 180 nonseekable_open(inode, file);
181 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { 181 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
182 struct fuse_inode *fi = get_fuse_inode(inode); 182 struct fuse_inode *fi = get_fuse_inode(inode);
183 183
184 spin_lock(&fc->lock); 184 spin_lock(&fc->lock);
185 fi->attr_version = ++fc->attr_version; 185 fi->attr_version = ++fc->attr_version;
186 i_size_write(inode, 0); 186 i_size_write(inode, 0);
187 spin_unlock(&fc->lock); 187 spin_unlock(&fc->lock);
188 fuse_invalidate_attr(inode); 188 fuse_invalidate_attr(inode);
189 } 189 }
190 } 190 }
191 191
192 int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 192 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
193 { 193 {
194 struct fuse_conn *fc = get_fuse_conn(inode); 194 struct fuse_conn *fc = get_fuse_conn(inode);
195 int err; 195 int err;
196 196
197 err = generic_file_open(inode, file); 197 err = generic_file_open(inode, file);
198 if (err) 198 if (err)
199 return err; 199 return err;
200 200
201 err = fuse_do_open(fc, get_node_id(inode), file, isdir); 201 err = fuse_do_open(fc, get_node_id(inode), file, isdir);
202 if (err) 202 if (err)
203 return err; 203 return err;
204 204
205 fuse_finish_open(inode, file); 205 fuse_finish_open(inode, file);
206 206
207 return 0; 207 return 0;
208 } 208 }
209 209
210 static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) 210 static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
211 { 211 {
212 struct fuse_conn *fc = ff->fc; 212 struct fuse_conn *fc = ff->fc;
213 struct fuse_req *req = ff->reserved_req; 213 struct fuse_req *req = ff->reserved_req;
214 struct fuse_release_in *inarg = &req->misc.release.in; 214 struct fuse_release_in *inarg = &req->misc.release.in;
215 215
216 spin_lock(&fc->lock); 216 spin_lock(&fc->lock);
217 list_del(&ff->write_entry); 217 list_del(&ff->write_entry);
218 if (!RB_EMPTY_NODE(&ff->polled_node)) 218 if (!RB_EMPTY_NODE(&ff->polled_node))
219 rb_erase(&ff->polled_node, &fc->polled_files); 219 rb_erase(&ff->polled_node, &fc->polled_files);
220 spin_unlock(&fc->lock); 220 spin_unlock(&fc->lock);
221 221
222 wake_up_interruptible_all(&ff->poll_wait); 222 wake_up_interruptible_all(&ff->poll_wait);
223 223
224 inarg->fh = ff->fh; 224 inarg->fh = ff->fh;
225 inarg->flags = flags; 225 inarg->flags = flags;
226 req->in.h.opcode = opcode; 226 req->in.h.opcode = opcode;
227 req->in.h.nodeid = ff->nodeid; 227 req->in.h.nodeid = ff->nodeid;
228 req->in.numargs = 1; 228 req->in.numargs = 1;
229 req->in.args[0].size = sizeof(struct fuse_release_in); 229 req->in.args[0].size = sizeof(struct fuse_release_in);
230 req->in.args[0].value = inarg; 230 req->in.args[0].value = inarg;
231 } 231 }
232 232
233 void fuse_release_common(struct file *file, int opcode) 233 void fuse_release_common(struct file *file, int opcode)
234 { 234 {
235 struct fuse_file *ff; 235 struct fuse_file *ff;
236 struct fuse_req *req; 236 struct fuse_req *req;
237 237
238 ff = file->private_data; 238 ff = file->private_data;
239 if (unlikely(!ff)) 239 if (unlikely(!ff))
240 return; 240 return;
241 241
242 req = ff->reserved_req; 242 req = ff->reserved_req;
243 fuse_prepare_release(ff, file->f_flags, opcode); 243 fuse_prepare_release(ff, file->f_flags, opcode);
244 244
245 if (ff->flock) { 245 if (ff->flock) {
246 struct fuse_release_in *inarg = &req->misc.release.in; 246 struct fuse_release_in *inarg = &req->misc.release.in;
247 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; 247 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
248 inarg->lock_owner = fuse_lock_owner_id(ff->fc, 248 inarg->lock_owner = fuse_lock_owner_id(ff->fc,
249 (fl_owner_t) file); 249 (fl_owner_t) file);
250 } 250 }
251 /* Hold vfsmount and dentry until release is finished */ 251 /* Hold vfsmount and dentry until release is finished */
252 path_get(&file->f_path); 252 path_get(&file->f_path);
253 req->misc.release.path = file->f_path; 253 req->misc.release.path = file->f_path;
254 254
255 /* 255 /*
256 * Normally this will send the RELEASE request, however if 256 * Normally this will send the RELEASE request, however if
257 * some asynchronous READ or WRITE requests are outstanding, 257 * some asynchronous READ or WRITE requests are outstanding,
258 * the sending will be delayed. 258 * the sending will be delayed.
259 * 259 *
260 * Make the release synchronous if this is a fuseblk mount, 260 * Make the release synchronous if this is a fuseblk mount,
261 * synchronous RELEASE is allowed (and desirable) in this case 261 * synchronous RELEASE is allowed (and desirable) in this case
262 * because the server can be trusted not to screw up. 262 * because the server can be trusted not to screw up.
263 */ 263 */
264 fuse_file_put(ff, ff->fc->destroy_req != NULL); 264 fuse_file_put(ff, ff->fc->destroy_req != NULL);
265 } 265 }
266 266
267 static int fuse_open(struct inode *inode, struct file *file) 267 static int fuse_open(struct inode *inode, struct file *file)
268 { 268 {
269 return fuse_open_common(inode, file, false); 269 return fuse_open_common(inode, file, false);
270 } 270 }
271 271
272 static int fuse_release(struct inode *inode, struct file *file) 272 static int fuse_release(struct inode *inode, struct file *file)
273 { 273 {
274 fuse_release_common(file, FUSE_RELEASE); 274 fuse_release_common(file, FUSE_RELEASE);
275 275
276 /* return value is ignored by VFS */ 276 /* return value is ignored by VFS */
277 return 0; 277 return 0;
278 } 278 }
279 279
280 void fuse_sync_release(struct fuse_file *ff, int flags) 280 void fuse_sync_release(struct fuse_file *ff, int flags)
281 { 281 {
282 WARN_ON(atomic_read(&ff->count) > 1); 282 WARN_ON(atomic_read(&ff->count) > 1);
283 fuse_prepare_release(ff, flags, FUSE_RELEASE); 283 fuse_prepare_release(ff, flags, FUSE_RELEASE);
284 ff->reserved_req->force = 1; 284 ff->reserved_req->force = 1;
285 fuse_request_send(ff->fc, ff->reserved_req); 285 fuse_request_send(ff->fc, ff->reserved_req);
286 fuse_put_request(ff->fc, ff->reserved_req); 286 fuse_put_request(ff->fc, ff->reserved_req);
287 kfree(ff); 287 kfree(ff);
288 } 288 }
289 EXPORT_SYMBOL_GPL(fuse_sync_release); 289 EXPORT_SYMBOL_GPL(fuse_sync_release);
290 290
291 /* 291 /*
292 * Scramble the ID space with XTEA, so that the value of the files_struct 292 * Scramble the ID space with XTEA, so that the value of the files_struct
293 * pointer is not exposed to userspace. 293 * pointer is not exposed to userspace.
294 */ 294 */
295 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) 295 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
296 { 296 {
297 u32 *k = fc->scramble_key; 297 u32 *k = fc->scramble_key;
298 u64 v = (unsigned long) id; 298 u64 v = (unsigned long) id;
299 u32 v0 = v; 299 u32 v0 = v;
300 u32 v1 = v >> 32; 300 u32 v1 = v >> 32;
301 u32 sum = 0; 301 u32 sum = 0;
302 int i; 302 int i;
303 303
304 for (i = 0; i < 32; i++) { 304 for (i = 0; i < 32; i++) {
305 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); 305 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
306 sum += 0x9E3779B9; 306 sum += 0x9E3779B9;
307 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); 307 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
308 } 308 }
309 309
310 return (u64) v0 + ((u64) v1 << 32); 310 return (u64) v0 + ((u64) v1 << 32);
311 } 311 }
312 312
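The function above is a straight 32-round XTEA encipher of the 64-bit pointer value, keyed by the per-connection scramble_key. A userspace replica of the same rounds (the key below is a made-up placeholder, not anything the kernel uses):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t scramble(uint64_t v, const uint32_t k[4])
    {
            uint32_t v0 = (uint32_t)v;
            uint32_t v1 = (uint32_t)(v >> 32);
            uint32_t sum = 0;
            int i;

            for (i = 0; i < 32; i++) {
                    v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
                    sum += 0x9E3779B9;
                    v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum >> 11 & 3]);
            }
            return (uint64_t)v0 + ((uint64_t)v1 << 32);
    }

    int main(void)
    {
            const uint32_t key[4] = { 0x1, 0x2, 0x3, 0x4 };  /* placeholder key */

            printf("%llx\n", (unsigned long long)scramble(0xdeadbeefULL, key));
            return 0;
    }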
313 /* 313 /*
314 * Check if page is under writeback 314 * Check if page is under writeback
315 * 315 *
316 * This is currently done by walking the list of writepage requests 316 * This is currently done by walking the list of writepage requests
317 * for the inode, which can be pretty inefficient. 317 * for the inode, which can be pretty inefficient.
318 */ 318 */
319 static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) 319 static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
320 { 320 {
321 struct fuse_conn *fc = get_fuse_conn(inode); 321 struct fuse_conn *fc = get_fuse_conn(inode);
322 struct fuse_inode *fi = get_fuse_inode(inode); 322 struct fuse_inode *fi = get_fuse_inode(inode);
323 struct fuse_req *req; 323 struct fuse_req *req;
324 bool found = false; 324 bool found = false;
325 325
326 spin_lock(&fc->lock); 326 spin_lock(&fc->lock);
327 list_for_each_entry(req, &fi->writepages, writepages_entry) { 327 list_for_each_entry(req, &fi->writepages, writepages_entry) {
328 pgoff_t curr_index; 328 pgoff_t curr_index;
329 329
330 BUG_ON(req->inode != inode); 330 BUG_ON(req->inode != inode);
331 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; 331 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
332 if (curr_index == index) { 332 if (curr_index == index) {
333 found = true; 333 found = true;
334 break; 334 break;
335 } 335 }
336 } 336 }
337 spin_unlock(&fc->lock); 337 spin_unlock(&fc->lock);
338 338
339 return found; 339 return found;
340 } 340 }
341 341
342 /* 342 /*
343 * Wait for page writeback to be completed. 343 * Wait for page writeback to be completed.
344 * 344 *
345 * Since fuse doesn't rely on the VM writeback tracking, this has to 345 * Since fuse doesn't rely on the VM writeback tracking, this has to
346 * use some other means. 346 * use some other means.
347 */ 347 */
348 static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) 348 static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
349 { 349 {
350 struct fuse_inode *fi = get_fuse_inode(inode); 350 struct fuse_inode *fi = get_fuse_inode(inode);
351 351
352 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); 352 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
353 return 0; 353 return 0;
354 } 354 }
355 355
356 static int fuse_flush(struct file *file, fl_owner_t id) 356 static int fuse_flush(struct file *file, fl_owner_t id)
357 { 357 {
358 struct inode *inode = file->f_path.dentry->d_inode; 358 struct inode *inode = file->f_path.dentry->d_inode;
359 struct fuse_conn *fc = get_fuse_conn(inode); 359 struct fuse_conn *fc = get_fuse_conn(inode);
360 struct fuse_file *ff = file->private_data; 360 struct fuse_file *ff = file->private_data;
361 struct fuse_req *req; 361 struct fuse_req *req;
362 struct fuse_flush_in inarg; 362 struct fuse_flush_in inarg;
363 int err; 363 int err;
364 364
365 if (is_bad_inode(inode)) 365 if (is_bad_inode(inode))
366 return -EIO; 366 return -EIO;
367 367
368 if (fc->no_flush) 368 if (fc->no_flush)
369 return 0; 369 return 0;
370 370
371 req = fuse_get_req_nofail(fc, file); 371 req = fuse_get_req_nofail(fc, file);
372 memset(&inarg, 0, sizeof(inarg)); 372 memset(&inarg, 0, sizeof(inarg));
373 inarg.fh = ff->fh; 373 inarg.fh = ff->fh;
374 inarg.lock_owner = fuse_lock_owner_id(fc, id); 374 inarg.lock_owner = fuse_lock_owner_id(fc, id);
375 req->in.h.opcode = FUSE_FLUSH; 375 req->in.h.opcode = FUSE_FLUSH;
376 req->in.h.nodeid = get_node_id(inode); 376 req->in.h.nodeid = get_node_id(inode);
377 req->in.numargs = 1; 377 req->in.numargs = 1;
378 req->in.args[0].size = sizeof(inarg); 378 req->in.args[0].size = sizeof(inarg);
379 req->in.args[0].value = &inarg; 379 req->in.args[0].value = &inarg;
380 req->force = 1; 380 req->force = 1;
381 fuse_request_send(fc, req); 381 fuse_request_send(fc, req);
382 err = req->out.h.error; 382 err = req->out.h.error;
383 fuse_put_request(fc, req); 383 fuse_put_request(fc, req);
384 if (err == -ENOSYS) { 384 if (err == -ENOSYS) {
385 fc->no_flush = 1; 385 fc->no_flush = 1;
386 err = 0; 386 err = 0;
387 } 387 }
388 return err; 388 return err;
389 } 389 }
390 390
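fuse_flush() also shows the capability-caching idiom used throughout this file: the first -ENOSYS reply sets a per-connection flag, and later calls short-circuit to success without a round trip. A compilable sketch of the pattern, with hypothetical names (conn, send_flush_request):

    #include <errno.h>

    struct conn {
            unsigned no_flush:1;
    };

    /* Stub standing in for the real transport; always "unimplemented". */
    static int send_flush_request(struct conn *c)
    {
            (void)c;
            return -ENOSYS;
    }

    static int do_flush(struct conn *c)
    {
            int err;

            if (c->no_flush)                /* remembered from an earlier reply */
                    return 0;
            err = send_flush_request(c);
            if (err == -ENOSYS) {
                    c->no_flush = 1;        /* never ask again */
                    err = 0;
            }
            return err;
    }

    int main(void)
    {
            struct conn c = { 0 };

            return do_flush(&c);    /* caches the -ENOSYS, returns 0 */
    }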
391 /* 391 /*
392 * Wait for all pending writepages on the inode to finish. 392 * Wait for all pending writepages on the inode to finish.
393 * 393 *
394 * This is currently done by blocking further writes with FUSE_NOWRITE 394 * This is currently done by blocking further writes with FUSE_NOWRITE
395 * and waiting for all sent writes to complete. 395 * and waiting for all sent writes to complete.
396 * 396 *
397 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage 397 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
398 * could conflict with truncation. 398 * could conflict with truncation.
399 */ 399 */
400 static void fuse_sync_writes(struct inode *inode) 400 static void fuse_sync_writes(struct inode *inode)
401 { 401 {
402 fuse_set_nowrite(inode); 402 fuse_set_nowrite(inode);
403 fuse_release_nowrite(inode); 403 fuse_release_nowrite(inode);
404 } 404 }
405 405
406 int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 406 int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
407 int datasync, int isdir) 407 int datasync, int isdir)
408 { 408 {
409 struct inode *inode = file->f_mapping->host; 409 struct inode *inode = file->f_mapping->host;
410 struct fuse_conn *fc = get_fuse_conn(inode); 410 struct fuse_conn *fc = get_fuse_conn(inode);
411 struct fuse_file *ff = file->private_data; 411 struct fuse_file *ff = file->private_data;
412 struct fuse_req *req; 412 struct fuse_req *req;
413 struct fuse_fsync_in inarg; 413 struct fuse_fsync_in inarg;
414 int err; 414 int err;
415 415
416 if (is_bad_inode(inode)) 416 if (is_bad_inode(inode))
417 return -EIO; 417 return -EIO;
418 418
419 err = filemap_write_and_wait_range(inode->i_mapping, start, end); 419 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
420 if (err) 420 if (err)
421 return err; 421 return err;
422 422
423 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) 423 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
424 return 0; 424 return 0;
425 425
426 mutex_lock(&inode->i_mutex); 426 mutex_lock(&inode->i_mutex);
427 427
428 /* 428 /*
429 * Start writeback against all dirty pages of the inode, then 429 * Start writeback against all dirty pages of the inode, then
430 * wait for all outstanding writes, before sending the FSYNC 430 * wait for all outstanding writes, before sending the FSYNC
431 * request. 431 * request.
432 */ 432 */
433 err = write_inode_now(inode, 0); 433 err = write_inode_now(inode, 0);
434 if (err) 434 if (err)
435 goto out; 435 goto out;
436 436
437 fuse_sync_writes(inode); 437 fuse_sync_writes(inode);
438 438
439 req = fuse_get_req(fc); 439 req = fuse_get_req(fc);
440 if (IS_ERR(req)) { 440 if (IS_ERR(req)) {
441 err = PTR_ERR(req); 441 err = PTR_ERR(req);
442 goto out; 442 goto out;
443 } 443 }
444 444
445 memset(&inarg, 0, sizeof(inarg)); 445 memset(&inarg, 0, sizeof(inarg));
446 inarg.fh = ff->fh; 446 inarg.fh = ff->fh;
447 inarg.fsync_flags = datasync ? 1 : 0; 447 inarg.fsync_flags = datasync ? 1 : 0;
448 req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; 448 req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
449 req->in.h.nodeid = get_node_id(inode); 449 req->in.h.nodeid = get_node_id(inode);
450 req->in.numargs = 1; 450 req->in.numargs = 1;
451 req->in.args[0].size = sizeof(inarg); 451 req->in.args[0].size = sizeof(inarg);
452 req->in.args[0].value = &inarg; 452 req->in.args[0].value = &inarg;
453 fuse_request_send(fc, req); 453 fuse_request_send(fc, req);
454 err = req->out.h.error; 454 err = req->out.h.error;
455 fuse_put_request(fc, req); 455 fuse_put_request(fc, req);
456 if (err == -ENOSYS) { 456 if (err == -ENOSYS) {
457 if (isdir) 457 if (isdir)
458 fc->no_fsyncdir = 1; 458 fc->no_fsyncdir = 1;
459 else 459 else
460 fc->no_fsync = 1; 460 fc->no_fsync = 1;
461 err = 0; 461 err = 0;
462 } 462 }
463 out: 463 out:
464 mutex_unlock(&inode->i_mutex); 464 mutex_unlock(&inode->i_mutex);
465 return err; 465 return err;
466 } 466 }
467 467
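From userspace, the datasync argument above corresponds to the choice between fsync(2) and fdatasync(2). A minimal sketch (the "testfile" name is assumed):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("testfile", O_WRONLY | O_CREAT, 0644);

            if (fd < 0)
                    return 1;
            write(fd, "x", 1);
            fdatasync(fd);  /* datasync = 1: flush data (and size) only */
            fsync(fd);      /* datasync = 0: flush data plus all metadata */
            close(fd);
            return 0;
    }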
468 static int fuse_fsync(struct file *file, loff_t start, loff_t end, 468 static int fuse_fsync(struct file *file, loff_t start, loff_t end,
469 int datasync) 469 int datasync)
470 { 470 {
471 return fuse_fsync_common(file, start, end, datasync, 0); 471 return fuse_fsync_common(file, start, end, datasync, 0);
472 } 472 }
473 473
474 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 474 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
475 size_t count, int opcode) 475 size_t count, int opcode)
476 { 476 {
477 struct fuse_read_in *inarg = &req->misc.read.in; 477 struct fuse_read_in *inarg = &req->misc.read.in;
478 struct fuse_file *ff = file->private_data; 478 struct fuse_file *ff = file->private_data;
479 479
480 inarg->fh = ff->fh; 480 inarg->fh = ff->fh;
481 inarg->offset = pos; 481 inarg->offset = pos;
482 inarg->size = count; 482 inarg->size = count;
483 inarg->flags = file->f_flags; 483 inarg->flags = file->f_flags;
484 req->in.h.opcode = opcode; 484 req->in.h.opcode = opcode;
485 req->in.h.nodeid = ff->nodeid; 485 req->in.h.nodeid = ff->nodeid;
486 req->in.numargs = 1; 486 req->in.numargs = 1;
487 req->in.args[0].size = sizeof(struct fuse_read_in); 487 req->in.args[0].size = sizeof(struct fuse_read_in);
488 req->in.args[0].value = inarg; 488 req->in.args[0].value = inarg;
489 req->out.argvar = 1; 489 req->out.argvar = 1;
490 req->out.numargs = 1; 490 req->out.numargs = 1;
491 req->out.args[0].size = count; 491 req->out.args[0].size = count;
492 } 492 }
493 493
494 static size_t fuse_send_read(struct fuse_req *req, struct file *file, 494 static size_t fuse_send_read(struct fuse_req *req, struct file *file,
495 loff_t pos, size_t count, fl_owner_t owner) 495 loff_t pos, size_t count, fl_owner_t owner)
496 { 496 {
497 struct fuse_file *ff = file->private_data; 497 struct fuse_file *ff = file->private_data;
498 struct fuse_conn *fc = ff->fc; 498 struct fuse_conn *fc = ff->fc;
499 499
500 fuse_read_fill(req, file, pos, count, FUSE_READ); 500 fuse_read_fill(req, file, pos, count, FUSE_READ);
501 if (owner != NULL) { 501 if (owner != NULL) {
502 struct fuse_read_in *inarg = &req->misc.read.in; 502 struct fuse_read_in *inarg = &req->misc.read.in;
503 503
504 inarg->read_flags |= FUSE_READ_LOCKOWNER; 504 inarg->read_flags |= FUSE_READ_LOCKOWNER;
505 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 505 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
506 } 506 }
507 fuse_request_send(fc, req); 507 fuse_request_send(fc, req);
508 return req->out.args[0].size; 508 return req->out.args[0].size;
509 } 509 }
510 510
511 static void fuse_read_update_size(struct inode *inode, loff_t size, 511 static void fuse_read_update_size(struct inode *inode, loff_t size,
512 u64 attr_ver) 512 u64 attr_ver)
513 { 513 {
514 struct fuse_conn *fc = get_fuse_conn(inode); 514 struct fuse_conn *fc = get_fuse_conn(inode);
515 struct fuse_inode *fi = get_fuse_inode(inode); 515 struct fuse_inode *fi = get_fuse_inode(inode);
516 516
517 spin_lock(&fc->lock); 517 spin_lock(&fc->lock);
518 if (attr_ver == fi->attr_version && size < inode->i_size) { 518 if (attr_ver == fi->attr_version && size < inode->i_size) {
519 fi->attr_version = ++fc->attr_version; 519 fi->attr_version = ++fc->attr_version;
520 i_size_write(inode, size); 520 i_size_write(inode, size);
521 } 521 }
522 spin_unlock(&fc->lock); 522 spin_unlock(&fc->lock);
523 } 523 }
524 524
525 static int fuse_readpage(struct file *file, struct page *page) 525 static int fuse_readpage(struct file *file, struct page *page)
526 { 526 {
527 struct inode *inode = page->mapping->host; 527 struct inode *inode = page->mapping->host;
528 struct fuse_conn *fc = get_fuse_conn(inode); 528 struct fuse_conn *fc = get_fuse_conn(inode);
529 struct fuse_req *req; 529 struct fuse_req *req;
530 size_t num_read; 530 size_t num_read;
531 loff_t pos = page_offset(page); 531 loff_t pos = page_offset(page);
532 size_t count = PAGE_CACHE_SIZE; 532 size_t count = PAGE_CACHE_SIZE;
533 u64 attr_ver; 533 u64 attr_ver;
534 int err; 534 int err;
535 535
536 err = -EIO; 536 err = -EIO;
537 if (is_bad_inode(inode)) 537 if (is_bad_inode(inode))
538 goto out; 538 goto out;
539 539
540 /* 540 /*
541 * Page writeback can extend beyond the lifetime of the 541 * Page writeback can extend beyond the lifetime of the
542 * page-cache page, so make sure we read a properly synced 542 * page-cache page, so make sure we read a properly synced
543 * page. 543 * page.
544 */ 544 */
545 fuse_wait_on_page_writeback(inode, page->index); 545 fuse_wait_on_page_writeback(inode, page->index);
546 546
547 req = fuse_get_req(fc); 547 req = fuse_get_req(fc);
548 err = PTR_ERR(req); 548 err = PTR_ERR(req);
549 if (IS_ERR(req)) 549 if (IS_ERR(req))
550 goto out; 550 goto out;
551 551
552 attr_ver = fuse_get_attr_version(fc); 552 attr_ver = fuse_get_attr_version(fc);
553 553
554 req->out.page_zeroing = 1; 554 req->out.page_zeroing = 1;
555 req->out.argpages = 1; 555 req->out.argpages = 1;
556 req->num_pages = 1; 556 req->num_pages = 1;
557 req->pages[0] = page; 557 req->pages[0] = page;
558 num_read = fuse_send_read(req, file, pos, count, NULL); 558 num_read = fuse_send_read(req, file, pos, count, NULL);
559 err = req->out.h.error; 559 err = req->out.h.error;
560 fuse_put_request(fc, req); 560 fuse_put_request(fc, req);
561 561
562 if (!err) { 562 if (!err) {
563 /* 563 /*
564 * Short read means EOF. If file size is larger, truncate it 564 * Short read means EOF. If file size is larger, truncate it
565 */ 565 */
566 if (num_read < count) 566 if (num_read < count)
567 fuse_read_update_size(inode, pos + num_read, attr_ver); 567 fuse_read_update_size(inode, pos + num_read, attr_ver);
568 568
569 SetPageUptodate(page); 569 SetPageUptodate(page);
570 } 570 }
571 571
572 fuse_invalidate_attr(inode); /* atime changed */ 572 fuse_invalidate_attr(inode); /* atime changed */
573 out: 573 out:
574 unlock_page(page); 574 unlock_page(page);
575 return err; 575 return err;
576 } 576 }
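
The "short read means EOF" rule above is observable from userspace: when the FUSE server returns fewer bytes than requested, the kernel treats that as end of file and trims its cached size. A minimal sketch of the same inference against an ordinary file descriptor; the mount path is illustrative, not part of this commit:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        int fd = open("/mnt/fuse/file", O_RDONLY);      /* hypothetical path */
        if (fd < 0)
                return 1;
        ssize_t n = pread(fd, buf, sizeof(buf), 0);
        /* Fewer bytes than requested with no error: EOF, mirroring
         * fuse_read_update_size() trimming i_size above. */
        if (n >= 0 && (size_t)n < sizeof(buf))
                printf("EOF inferred at offset %zd\n", n);
        close(fd);
        return 0;
}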
577 577
578 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) 578 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
579 { 579 {
580 int i; 580 int i;
581 size_t count = req->misc.read.in.size; 581 size_t count = req->misc.read.in.size;
582 size_t num_read = req->out.args[0].size; 582 size_t num_read = req->out.args[0].size;
583 struct address_space *mapping = NULL; 583 struct address_space *mapping = NULL;
584 584
585 for (i = 0; mapping == NULL && i < req->num_pages; i++) 585 for (i = 0; mapping == NULL && i < req->num_pages; i++)
586 mapping = req->pages[i]->mapping; 586 mapping = req->pages[i]->mapping;
587 587
588 if (mapping) { 588 if (mapping) {
589 struct inode *inode = mapping->host; 589 struct inode *inode = mapping->host;
590 590
591 /* 591 /*
592 * Short read means EOF. If file size is larger, truncate it 592 * Short read means EOF. If file size is larger, truncate it
593 */ 593 */
594 if (!req->out.h.error && num_read < count) { 594 if (!req->out.h.error && num_read < count) {
595 loff_t pos; 595 loff_t pos;
596 596
597 pos = page_offset(req->pages[0]) + num_read; 597 pos = page_offset(req->pages[0]) + num_read;
598 fuse_read_update_size(inode, pos, 598 fuse_read_update_size(inode, pos,
599 req->misc.read.attr_ver); 599 req->misc.read.attr_ver);
600 } 600 }
601 fuse_invalidate_attr(inode); /* atime changed */ 601 fuse_invalidate_attr(inode); /* atime changed */
602 } 602 }
603 603
604 for (i = 0; i < req->num_pages; i++) { 604 for (i = 0; i < req->num_pages; i++) {
605 struct page *page = req->pages[i]; 605 struct page *page = req->pages[i];
606 if (!req->out.h.error) 606 if (!req->out.h.error)
607 SetPageUptodate(page); 607 SetPageUptodate(page);
608 else 608 else
609 SetPageError(page); 609 SetPageError(page);
610 unlock_page(page); 610 unlock_page(page);
611 page_cache_release(page); 611 page_cache_release(page);
612 } 612 }
613 if (req->ff) 613 if (req->ff)
614 fuse_file_put(req->ff, false); 614 fuse_file_put(req->ff, false);
615 } 615 }
616 616
617 static void fuse_send_readpages(struct fuse_req *req, struct file *file) 617 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
618 { 618 {
619 struct fuse_file *ff = file->private_data; 619 struct fuse_file *ff = file->private_data;
620 struct fuse_conn *fc = ff->fc; 620 struct fuse_conn *fc = ff->fc;
621 loff_t pos = page_offset(req->pages[0]); 621 loff_t pos = page_offset(req->pages[0]);
622 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 622 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
623 623
624 req->out.argpages = 1; 624 req->out.argpages = 1;
625 req->out.page_zeroing = 1; 625 req->out.page_zeroing = 1;
626 req->out.page_replace = 1; 626 req->out.page_replace = 1;
627 fuse_read_fill(req, file, pos, count, FUSE_READ); 627 fuse_read_fill(req, file, pos, count, FUSE_READ);
628 req->misc.read.attr_ver = fuse_get_attr_version(fc); 628 req->misc.read.attr_ver = fuse_get_attr_version(fc);
629 if (fc->async_read) { 629 if (fc->async_read) {
630 req->ff = fuse_file_get(ff); 630 req->ff = fuse_file_get(ff);
631 req->end = fuse_readpages_end; 631 req->end = fuse_readpages_end;
632 fuse_request_send_background(fc, req); 632 fuse_request_send_background(fc, req);
633 } else { 633 } else {
634 fuse_request_send(fc, req); 634 fuse_request_send(fc, req);
635 fuse_readpages_end(fc, req); 635 fuse_readpages_end(fc, req);
636 fuse_put_request(fc, req); 636 fuse_put_request(fc, req);
637 } 637 }
638 } 638 }
639 639
640 struct fuse_fill_data { 640 struct fuse_fill_data {
641 struct fuse_req *req; 641 struct fuse_req *req;
642 struct file *file; 642 struct file *file;
643 struct inode *inode; 643 struct inode *inode;
644 }; 644 };
645 645
646 static int fuse_readpages_fill(void *_data, struct page *page) 646 static int fuse_readpages_fill(void *_data, struct page *page)
647 { 647 {
648 struct fuse_fill_data *data = _data; 648 struct fuse_fill_data *data = _data;
649 struct fuse_req *req = data->req; 649 struct fuse_req *req = data->req;
650 struct inode *inode = data->inode; 650 struct inode *inode = data->inode;
651 struct fuse_conn *fc = get_fuse_conn(inode); 651 struct fuse_conn *fc = get_fuse_conn(inode);
652 652
653 fuse_wait_on_page_writeback(inode, page->index); 653 fuse_wait_on_page_writeback(inode, page->index);
654 654
655 if (req->num_pages && 655 if (req->num_pages &&
656 (req->num_pages == FUSE_MAX_PAGES_PER_REQ || 656 (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
657 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || 657 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
658 req->pages[req->num_pages - 1]->index + 1 != page->index)) { 658 req->pages[req->num_pages - 1]->index + 1 != page->index)) {
659 fuse_send_readpages(req, data->file); 659 fuse_send_readpages(req, data->file);
660 data->req = req = fuse_get_req(fc); 660 data->req = req = fuse_get_req(fc);
661 if (IS_ERR(req)) { 661 if (IS_ERR(req)) {
662 unlock_page(page); 662 unlock_page(page);
663 return PTR_ERR(req); 663 return PTR_ERR(req);
664 } 664 }
665 } 665 }
666 page_cache_get(page); 666 page_cache_get(page);
667 req->pages[req->num_pages] = page; 667 req->pages[req->num_pages] = page;
668 req->num_pages++; 668 req->num_pages++;
669 return 0; 669 return 0;
670 } 670 }
671 671
672 static int fuse_readpages(struct file *file, struct address_space *mapping, 672 static int fuse_readpages(struct file *file, struct address_space *mapping,
673 struct list_head *pages, unsigned nr_pages) 673 struct list_head *pages, unsigned nr_pages)
674 { 674 {
675 struct inode *inode = mapping->host; 675 struct inode *inode = mapping->host;
676 struct fuse_conn *fc = get_fuse_conn(inode); 676 struct fuse_conn *fc = get_fuse_conn(inode);
677 struct fuse_fill_data data; 677 struct fuse_fill_data data;
678 int err; 678 int err;
679 679
680 err = -EIO; 680 err = -EIO;
681 if (is_bad_inode(inode)) 681 if (is_bad_inode(inode))
682 goto out; 682 goto out;
683 683
684 data.file = file; 684 data.file = file;
685 data.inode = inode; 685 data.inode = inode;
686 data.req = fuse_get_req(fc); 686 data.req = fuse_get_req(fc);
687 err = PTR_ERR(data.req); 687 err = PTR_ERR(data.req);
688 if (IS_ERR(data.req)) 688 if (IS_ERR(data.req))
689 goto out; 689 goto out;
690 690
691 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); 691 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
692 if (!err) { 692 if (!err) {
693 if (data.req->num_pages) 693 if (data.req->num_pages)
694 fuse_send_readpages(data.req, file); 694 fuse_send_readpages(data.req, file);
695 else 695 else
696 fuse_put_request(fc, data.req); 696 fuse_put_request(fc, data.req);
697 } 697 }
698 out: 698 out:
699 return err; 699 return err;
700 } 700 }
701 701
702 static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, 702 static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
703 unsigned long nr_segs, loff_t pos) 703 unsigned long nr_segs, loff_t pos)
704 { 704 {
705 struct inode *inode = iocb->ki_filp->f_mapping->host; 705 struct inode *inode = iocb->ki_filp->f_mapping->host;
706 struct fuse_conn *fc = get_fuse_conn(inode); 706 struct fuse_conn *fc = get_fuse_conn(inode);
707 707
708 /* 708 /*
709 * In auto invalidate mode, always update attributes on read. 709 * In auto invalidate mode, always update attributes on read.
710 * Otherwise, only update if we attempt to read past EOF (to ensure 710 * Otherwise, only update if we attempt to read past EOF (to ensure
711 * i_size is up to date). 711 * i_size is up to date).
712 */ 712 */
713 if (fc->auto_inval_data || 713 if (fc->auto_inval_data ||
714 (pos + iov_length(iov, nr_segs) > i_size_read(inode))) { 714 (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
715 int err; 715 int err;
716 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); 716 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
717 if (err) 717 if (err)
718 return err; 718 return err;
719 } 719 }
720 720
721 return generic_file_aio_read(iocb, iov, nr_segs, pos); 721 return generic_file_aio_read(iocb, iov, nr_segs, pos);
722 } 722 }
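
The refresh condition above only matters when auto_inval_data is off: a read that stays below the cached i_size can trust the page cache, while one that could cross it must revalidate first. A worked sketch of that predicate, with illustrative values:

#include <stdio.h>

int main(void)
{
        long long i_size = 8192;                /* cached inode size */
        long long pos = 6144, len = 4096;       /* read request */

        /* Same test as above: refresh only if the read may pass EOF. */
        int refresh = (pos + len > i_size);
        printf("read [%lld, %lld) vs i_size %lld -> refresh=%d\n",
               pos, pos + len, i_size, refresh);
        return 0;
}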
723 723
724 static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, 724 static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
725 loff_t pos, size_t count) 725 loff_t pos, size_t count)
726 { 726 {
727 struct fuse_write_in *inarg = &req->misc.write.in; 727 struct fuse_write_in *inarg = &req->misc.write.in;
728 struct fuse_write_out *outarg = &req->misc.write.out; 728 struct fuse_write_out *outarg = &req->misc.write.out;
729 729
730 inarg->fh = ff->fh; 730 inarg->fh = ff->fh;
731 inarg->offset = pos; 731 inarg->offset = pos;
732 inarg->size = count; 732 inarg->size = count;
733 req->in.h.opcode = FUSE_WRITE; 733 req->in.h.opcode = FUSE_WRITE;
734 req->in.h.nodeid = ff->nodeid; 734 req->in.h.nodeid = ff->nodeid;
735 req->in.numargs = 2; 735 req->in.numargs = 2;
736 if (ff->fc->minor < 9) 736 if (ff->fc->minor < 9)
737 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 737 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
738 else 738 else
739 req->in.args[0].size = sizeof(struct fuse_write_in); 739 req->in.args[0].size = sizeof(struct fuse_write_in);
740 req->in.args[0].value = inarg; 740 req->in.args[0].value = inarg;
741 req->in.args[1].size = count; 741 req->in.args[1].size = count;
742 req->out.numargs = 1; 742 req->out.numargs = 1;
743 req->out.args[0].size = sizeof(struct fuse_write_out); 743 req->out.args[0].size = sizeof(struct fuse_write_out);
744 req->out.args[0].value = outarg; 744 req->out.args[0].value = outarg;
745 } 745 }
746 746
747 static size_t fuse_send_write(struct fuse_req *req, struct file *file, 747 static size_t fuse_send_write(struct fuse_req *req, struct file *file,
748 loff_t pos, size_t count, fl_owner_t owner) 748 loff_t pos, size_t count, fl_owner_t owner)
749 { 749 {
750 struct fuse_file *ff = file->private_data; 750 struct fuse_file *ff = file->private_data;
751 struct fuse_conn *fc = ff->fc; 751 struct fuse_conn *fc = ff->fc;
752 struct fuse_write_in *inarg = &req->misc.write.in; 752 struct fuse_write_in *inarg = &req->misc.write.in;
753 753
754 fuse_write_fill(req, ff, pos, count); 754 fuse_write_fill(req, ff, pos, count);
755 inarg->flags = file->f_flags; 755 inarg->flags = file->f_flags;
756 if (owner != NULL) { 756 if (owner != NULL) {
757 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 757 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
758 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 758 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
759 } 759 }
760 fuse_request_send(fc, req); 760 fuse_request_send(fc, req);
761 return req->misc.write.out.size; 761 return req->misc.write.out.size;
762 } 762 }
763 763
764 void fuse_write_update_size(struct inode *inode, loff_t pos) 764 void fuse_write_update_size(struct inode *inode, loff_t pos)
765 { 765 {
766 struct fuse_conn *fc = get_fuse_conn(inode); 766 struct fuse_conn *fc = get_fuse_conn(inode);
767 struct fuse_inode *fi = get_fuse_inode(inode); 767 struct fuse_inode *fi = get_fuse_inode(inode);
768 768
769 spin_lock(&fc->lock); 769 spin_lock(&fc->lock);
770 fi->attr_version = ++fc->attr_version; 770 fi->attr_version = ++fc->attr_version;
771 if (pos > inode->i_size) 771 if (pos > inode->i_size)
772 i_size_write(inode, pos); 772 i_size_write(inode, pos);
773 spin_unlock(&fc->lock); 773 spin_unlock(&fc->lock);
774 } 774 }
775 775
776 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 776 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
777 struct inode *inode, loff_t pos, 777 struct inode *inode, loff_t pos,
778 size_t count) 778 size_t count)
779 { 779 {
780 size_t res; 780 size_t res;
781 unsigned offset; 781 unsigned offset;
782 unsigned i; 782 unsigned i;
783 783
784 for (i = 0; i < req->num_pages; i++) 784 for (i = 0; i < req->num_pages; i++)
785 fuse_wait_on_page_writeback(inode, req->pages[i]->index); 785 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
786 786
787 res = fuse_send_write(req, file, pos, count, NULL); 787 res = fuse_send_write(req, file, pos, count, NULL);
788 788
789 offset = req->page_offset; 789 offset = req->page_offset;
790 count = res; 790 count = res;
791 for (i = 0; i < req->num_pages; i++) { 791 for (i = 0; i < req->num_pages; i++) {
792 struct page *page = req->pages[i]; 792 struct page *page = req->pages[i];
793 793
794 if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) 794 if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
795 SetPageUptodate(page); 795 SetPageUptodate(page);
796 796
797 if (count > PAGE_CACHE_SIZE - offset) 797 if (count > PAGE_CACHE_SIZE - offset)
798 count -= PAGE_CACHE_SIZE - offset; 798 count -= PAGE_CACHE_SIZE - offset;
799 else 799 else
800 count = 0; 800 count = 0;
801 offset = 0; 801 offset = 0;
802 802
803 unlock_page(page); 803 unlock_page(page);
804 page_cache_release(page); 804 page_cache_release(page);
805 } 805 }
806 806
807 return res; 807 return res;
808 } 808 }
809 809
810 static ssize_t fuse_fill_write_pages(struct fuse_req *req, 810 static ssize_t fuse_fill_write_pages(struct fuse_req *req,
811 struct address_space *mapping, 811 struct address_space *mapping,
812 struct iov_iter *ii, loff_t pos) 812 struct iov_iter *ii, loff_t pos)
813 { 813 {
814 struct fuse_conn *fc = get_fuse_conn(mapping->host); 814 struct fuse_conn *fc = get_fuse_conn(mapping->host);
815 unsigned offset = pos & (PAGE_CACHE_SIZE - 1); 815 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
816 size_t count = 0; 816 size_t count = 0;
817 int err; 817 int err;
818 818
819 req->in.argpages = 1; 819 req->in.argpages = 1;
820 req->page_offset = offset; 820 req->page_offset = offset;
821 821
822 do { 822 do {
823 size_t tmp; 823 size_t tmp;
824 struct page *page; 824 struct page *page;
825 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 825 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
826 size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, 826 size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
827 iov_iter_count(ii)); 827 iov_iter_count(ii));
828 828
829 bytes = min_t(size_t, bytes, fc->max_write - count); 829 bytes = min_t(size_t, bytes, fc->max_write - count);
830 830
831 again: 831 again:
832 err = -EFAULT; 832 err = -EFAULT;
833 if (iov_iter_fault_in_readable(ii, bytes)) 833 if (iov_iter_fault_in_readable(ii, bytes))
834 break; 834 break;
835 835
836 err = -ENOMEM; 836 err = -ENOMEM;
837 page = grab_cache_page_write_begin(mapping, index, 0); 837 page = grab_cache_page_write_begin(mapping, index, 0);
838 if (!page) 838 if (!page)
839 break; 839 break;
840 840
841 if (mapping_writably_mapped(mapping)) 841 if (mapping_writably_mapped(mapping))
842 flush_dcache_page(page); 842 flush_dcache_page(page);
843 843
844 pagefault_disable(); 844 pagefault_disable();
845 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 845 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
846 pagefault_enable(); 846 pagefault_enable();
847 flush_dcache_page(page); 847 flush_dcache_page(page);
848 848
849 mark_page_accessed(page); 849 mark_page_accessed(page);
850 850
851 if (!tmp) { 851 if (!tmp) {
852 unlock_page(page); 852 unlock_page(page);
853 page_cache_release(page); 853 page_cache_release(page);
854 bytes = min(bytes, iov_iter_single_seg_count(ii)); 854 bytes = min(bytes, iov_iter_single_seg_count(ii));
855 goto again; 855 goto again;
856 } 856 }
857 857
858 err = 0; 858 err = 0;
859 req->pages[req->num_pages] = page; 859 req->pages[req->num_pages] = page;
860 req->num_pages++; 860 req->num_pages++;
861 861
862 iov_iter_advance(ii, tmp); 862 iov_iter_advance(ii, tmp);
863 count += tmp; 863 count += tmp;
864 pos += tmp; 864 pos += tmp;
865 offset += tmp; 865 offset += tmp;
866 if (offset == PAGE_CACHE_SIZE) 866 if (offset == PAGE_CACHE_SIZE)
867 offset = 0; 867 offset = 0;
868 868
869 if (!fc->big_writes) 869 if (!fc->big_writes)
870 break; 870 break;
871 } while (iov_iter_count(ii) && count < fc->max_write && 871 } while (iov_iter_count(ii) && count < fc->max_write &&
872 req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); 872 req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
873 873
874 return count > 0 ? count : err; 874 return count > 0 ? count : err;
875 } 875 }
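
The copy loop above never lets a chunk cross a page boundary: the first chunk is trimmed to the end of its page, and only a chunk ending exactly on a boundary lets the loop continue. The same chunking arithmetic, reduced to a standalone sketch with 4096 standing in for PAGE_CACHE_SIZE:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long pos = 5000, count = 10000;        /* illustrative */

        while (count) {
                unsigned long offset = pos & (PAGE_SIZE - 1);
                unsigned long bytes = PAGE_SIZE - offset;

                if (bytes > count)
                        bytes = count;
                printf("page %lu: copy %lu bytes at offset %lu\n",
                       pos / PAGE_SIZE, bytes, offset);
                pos += bytes;
                count -= bytes;
        }
        return 0;
}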
876 876
877 static ssize_t fuse_perform_write(struct file *file, 877 static ssize_t fuse_perform_write(struct file *file,
878 struct address_space *mapping, 878 struct address_space *mapping,
879 struct iov_iter *ii, loff_t pos) 879 struct iov_iter *ii, loff_t pos)
880 { 880 {
881 struct inode *inode = mapping->host; 881 struct inode *inode = mapping->host;
882 struct fuse_conn *fc = get_fuse_conn(inode); 882 struct fuse_conn *fc = get_fuse_conn(inode);
883 int err = 0; 883 int err = 0;
884 ssize_t res = 0; 884 ssize_t res = 0;
885 885
886 if (is_bad_inode(inode)) 886 if (is_bad_inode(inode))
887 return -EIO; 887 return -EIO;
888 888
889 do { 889 do {
890 struct fuse_req *req; 890 struct fuse_req *req;
891 ssize_t count; 891 ssize_t count;
892 892
893 req = fuse_get_req(fc); 893 req = fuse_get_req(fc);
894 if (IS_ERR(req)) { 894 if (IS_ERR(req)) {
895 err = PTR_ERR(req); 895 err = PTR_ERR(req);
896 break; 896 break;
897 } 897 }
898 898
899 count = fuse_fill_write_pages(req, mapping, ii, pos); 899 count = fuse_fill_write_pages(req, mapping, ii, pos);
900 if (count <= 0) { 900 if (count <= 0) {
901 err = count; 901 err = count;
902 } else { 902 } else {
903 size_t num_written; 903 size_t num_written;
904 904
905 num_written = fuse_send_write_pages(req, file, inode, 905 num_written = fuse_send_write_pages(req, file, inode,
906 pos, count); 906 pos, count);
907 err = req->out.h.error; 907 err = req->out.h.error;
908 if (!err) { 908 if (!err) {
909 res += num_written; 909 res += num_written;
910 pos += num_written; 910 pos += num_written;
911 911
912 /* break out of the loop on short write */ 912 /* break out of the loop on short write */
913 if (num_written != count) 913 if (num_written != count)
914 err = -EIO; 914 err = -EIO;
915 } 915 }
916 } 916 }
917 fuse_put_request(fc, req); 917 fuse_put_request(fc, req);
918 } while (!err && iov_iter_count(ii)); 918 } while (!err && iov_iter_count(ii));
919 919
920 if (res > 0) 920 if (res > 0)
921 fuse_write_update_size(inode, pos); 921 fuse_write_update_size(inode, pos);
922 922
923 fuse_invalidate_attr(inode); 923 fuse_invalidate_attr(inode);
924 924
925 return res > 0 ? res : err; 925 return res > 0 ? res : err;
926 } 926 }
927 927
928 static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 928 static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
929 unsigned long nr_segs, loff_t pos) 929 unsigned long nr_segs, loff_t pos)
930 { 930 {
931 struct file *file = iocb->ki_filp; 931 struct file *file = iocb->ki_filp;
932 struct address_space *mapping = file->f_mapping; 932 struct address_space *mapping = file->f_mapping;
933 size_t count = 0; 933 size_t count = 0;
934 size_t ocount = 0; 934 size_t ocount = 0;
935 ssize_t written = 0; 935 ssize_t written = 0;
936 ssize_t written_buffered = 0; 936 ssize_t written_buffered = 0;
937 struct inode *inode = mapping->host; 937 struct inode *inode = mapping->host;
938 ssize_t err; 938 ssize_t err;
939 struct iov_iter i; 939 struct iov_iter i;
940 loff_t endbyte = 0; 940 loff_t endbyte = 0;
941 941
942 WARN_ON(iocb->ki_pos != pos); 942 WARN_ON(iocb->ki_pos != pos);
943 943
944 ocount = 0; 944 ocount = 0;
945 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 945 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
946 if (err) 946 if (err)
947 return err; 947 return err;
948 948
949 count = ocount; 949 count = ocount;
950 sb_start_write(inode->i_sb); 950 sb_start_write(inode->i_sb);
951 mutex_lock(&inode->i_mutex); 951 mutex_lock(&inode->i_mutex);
952 952
953 /* We can write back this queue in page reclaim */ 953 /* We can write back this queue in page reclaim */
954 current->backing_dev_info = mapping->backing_dev_info; 954 current->backing_dev_info = mapping->backing_dev_info;
955 955
956 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 956 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
957 if (err) 957 if (err)
958 goto out; 958 goto out;
959 959
960 if (count == 0) 960 if (count == 0)
961 goto out; 961 goto out;
962 962
963 err = file_remove_suid(file); 963 err = file_remove_suid(file);
964 if (err) 964 if (err)
965 goto out; 965 goto out;
966 966
967 err = file_update_time(file); 967 err = file_update_time(file);
968 if (err) 968 if (err)
969 goto out; 969 goto out;
970 970
971 if (file->f_flags & O_DIRECT) { 971 if (file->f_flags & O_DIRECT) {
972 written = generic_file_direct_write(iocb, iov, &nr_segs, 972 written = generic_file_direct_write(iocb, iov, &nr_segs,
973 pos, &iocb->ki_pos, 973 pos, &iocb->ki_pos,
974 count, ocount); 974 count, ocount);
975 if (written < 0 || written == count) 975 if (written < 0 || written == count)
976 goto out; 976 goto out;
977 977
978 pos += written; 978 pos += written;
979 count -= written; 979 count -= written;
980 980
981 iov_iter_init(&i, iov, nr_segs, count, written); 981 iov_iter_init(&i, iov, nr_segs, count, written);
982 written_buffered = fuse_perform_write(file, mapping, &i, pos); 982 written_buffered = fuse_perform_write(file, mapping, &i, pos);
983 if (written_buffered < 0) { 983 if (written_buffered < 0) {
984 err = written_buffered; 984 err = written_buffered;
985 goto out; 985 goto out;
986 } 986 }
987 endbyte = pos + written_buffered - 1; 987 endbyte = pos + written_buffered - 1;
988 988
989 err = filemap_write_and_wait_range(file->f_mapping, pos, 989 err = filemap_write_and_wait_range(file->f_mapping, pos,
990 endbyte); 990 endbyte);
991 if (err) 991 if (err)
992 goto out; 992 goto out;
993 993
994 invalidate_mapping_pages(file->f_mapping, 994 invalidate_mapping_pages(file->f_mapping,
995 pos >> PAGE_CACHE_SHIFT, 995 pos >> PAGE_CACHE_SHIFT,
996 endbyte >> PAGE_CACHE_SHIFT); 996 endbyte >> PAGE_CACHE_SHIFT);
997 997
998 written += written_buffered; 998 written += written_buffered;
999 iocb->ki_pos = pos + written_buffered; 999 iocb->ki_pos = pos + written_buffered;
1000 } else { 1000 } else {
1001 iov_iter_init(&i, iov, nr_segs, count, 0); 1001 iov_iter_init(&i, iov, nr_segs, count, 0);
1002 written = fuse_perform_write(file, mapping, &i, pos); 1002 written = fuse_perform_write(file, mapping, &i, pos);
1003 if (written >= 0) 1003 if (written >= 0)
1004 iocb->ki_pos = pos + written; 1004 iocb->ki_pos = pos + written;
1005 } 1005 }
1006 out: 1006 out:
1007 current->backing_dev_info = NULL; 1007 current->backing_dev_info = NULL;
1008 mutex_unlock(&inode->i_mutex); 1008 mutex_unlock(&inode->i_mutex);
1009 sb_end_write(inode->i_sb); 1009 sb_end_write(inode->i_sb);
1010 1010
1011 return written ? written : err; 1011 return written ? written : err;
1012 } 1012 }
1013 1013
1014 static void fuse_release_user_pages(struct fuse_req *req, int write) 1014 static void fuse_release_user_pages(struct fuse_req *req, int write)
1015 { 1015 {
1016 unsigned i; 1016 unsigned i;
1017 1017
1018 for (i = 0; i < req->num_pages; i++) { 1018 for (i = 0; i < req->num_pages; i++) {
1019 struct page *page = req->pages[i]; 1019 struct page *page = req->pages[i];
1020 if (write) 1020 if (write)
1021 set_page_dirty_lock(page); 1021 set_page_dirty_lock(page);
1022 put_page(page); 1022 put_page(page);
1023 } 1023 }
1024 } 1024 }
1025 1025
1026 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, 1026 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
1027 size_t *nbytesp, int write) 1027 size_t *nbytesp, int write)
1028 { 1028 {
1029 size_t nbytes = *nbytesp; 1029 size_t nbytes = *nbytesp;
1030 unsigned long user_addr = (unsigned long) buf; 1030 unsigned long user_addr = (unsigned long) buf;
1031 unsigned offset = user_addr & ~PAGE_MASK; 1031 unsigned offset = user_addr & ~PAGE_MASK;
1032 int npages; 1032 int npages;
1033 1033
1034 /* Special case for kernel I/O: can copy directly into the buffer */ 1034 /* Special case for kernel I/O: can copy directly into the buffer */
1035 if (segment_eq(get_fs(), KERNEL_DS)) { 1035 if (segment_eq(get_fs(), KERNEL_DS)) {
1036 if (write) 1036 if (write)
1037 req->in.args[1].value = (void *) user_addr; 1037 req->in.args[1].value = (void *) user_addr;
1038 else 1038 else
1039 req->out.args[0].value = (void *) user_addr; 1039 req->out.args[0].value = (void *) user_addr;
1040 1040
1041 return 0; 1041 return 0;
1042 } 1042 }
1043 1043
1044 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 1044 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
1045 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1045 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
1046 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 1046 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
1047 npages = get_user_pages_fast(user_addr, npages, !write, req->pages); 1047 npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
1048 if (npages < 0) 1048 if (npages < 0)
1049 return npages; 1049 return npages;
1050 1050
1051 req->num_pages = npages; 1051 req->num_pages = npages;
1052 req->page_offset = offset; 1052 req->page_offset = offset;
1053 1053
1054 if (write) 1054 if (write)
1055 req->in.argpages = 1; 1055 req->in.argpages = 1;
1056 else 1056 else
1057 req->out.argpages = 1; 1057 req->out.argpages = 1;
1058 1058
1059 nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; 1059 nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
1060 *nbytesp = min(*nbytesp, nbytes); 1060 *nbytesp = min(*nbytesp, nbytes);
1061 1061
1062 return 0; 1062 return 0;
1063 } 1063 }
1064 1064
1065 ssize_t fuse_direct_io(struct file *file, const char __user *buf, 1065 ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1066 size_t count, loff_t *ppos, int write) 1066 size_t count, loff_t *ppos, int write)
1067 { 1067 {
1068 struct fuse_file *ff = file->private_data; 1068 struct fuse_file *ff = file->private_data;
1069 struct fuse_conn *fc = ff->fc; 1069 struct fuse_conn *fc = ff->fc;
1070 size_t nmax = write ? fc->max_write : fc->max_read; 1070 size_t nmax = write ? fc->max_write : fc->max_read;
1071 loff_t pos = *ppos; 1071 loff_t pos = *ppos;
1072 ssize_t res = 0; 1072 ssize_t res = 0;
1073 struct fuse_req *req; 1073 struct fuse_req *req;
1074 1074
1075 req = fuse_get_req(fc); 1075 req = fuse_get_req(fc);
1076 if (IS_ERR(req)) 1076 if (IS_ERR(req))
1077 return PTR_ERR(req); 1077 return PTR_ERR(req);
1078 1078
1079 while (count) { 1079 while (count) {
1080 size_t nres; 1080 size_t nres;
1081 fl_owner_t owner = current->files; 1081 fl_owner_t owner = current->files;
1082 size_t nbytes = min(count, nmax); 1082 size_t nbytes = min(count, nmax);
1083 int err = fuse_get_user_pages(req, buf, &nbytes, write); 1083 int err = fuse_get_user_pages(req, buf, &nbytes, write);
1084 if (err) { 1084 if (err) {
1085 res = err; 1085 res = err;
1086 break; 1086 break;
1087 } 1087 }
1088 1088
1089 if (write) 1089 if (write)
1090 nres = fuse_send_write(req, file, pos, nbytes, owner); 1090 nres = fuse_send_write(req, file, pos, nbytes, owner);
1091 else 1091 else
1092 nres = fuse_send_read(req, file, pos, nbytes, owner); 1092 nres = fuse_send_read(req, file, pos, nbytes, owner);
1093 1093
1094 fuse_release_user_pages(req, !write); 1094 fuse_release_user_pages(req, !write);
1095 if (req->out.h.error) { 1095 if (req->out.h.error) {
1096 if (!res) 1096 if (!res)
1097 res = req->out.h.error; 1097 res = req->out.h.error;
1098 break; 1098 break;
1099 } else if (nres > nbytes) { 1099 } else if (nres > nbytes) {
1100 res = -EIO; 1100 res = -EIO;
1101 break; 1101 break;
1102 } 1102 }
1103 count -= nres; 1103 count -= nres;
1104 res += nres; 1104 res += nres;
1105 pos += nres; 1105 pos += nres;
1106 buf += nres; 1106 buf += nres;
1107 if (nres != nbytes) 1107 if (nres != nbytes)
1108 break; 1108 break;
1109 if (count) { 1109 if (count) {
1110 fuse_put_request(fc, req); 1110 fuse_put_request(fc, req);
1111 req = fuse_get_req(fc); 1111 req = fuse_get_req(fc);
1112 if (IS_ERR(req)) 1112 if (IS_ERR(req))
1113 break; 1113 break;
1114 } 1114 }
1115 } 1115 }
1116 if (!IS_ERR(req)) 1116 if (!IS_ERR(req))
1117 fuse_put_request(fc, req); 1117 fuse_put_request(fc, req);
1118 if (res > 0) 1118 if (res > 0)
1119 *ppos = pos; 1119 *ppos = pos;
1120 1120
1121 return res; 1121 return res;
1122 } 1122 }
1123 EXPORT_SYMBOL_GPL(fuse_direct_io); 1123 EXPORT_SYMBOL_GPL(fuse_direct_io);
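
fuse_direct_io() issues bounded requests and gives up as soon as one comes back short, returning whatever has already transferred. The same pattern as a userspace helper; CHUNK and the helper name are illustrative, not taken from the kernel:

#include <errno.h>
#include <unistd.h>

#define CHUNK 65536UL

/* Read up to count bytes at pos, chunk by chunk, stopping on a
 * short transfer just as the loop above does for nres != nbytes. */
ssize_t read_in_chunks(int fd, char *buf, size_t count, off_t pos)
{
        ssize_t done = 0;

        while (count) {
                size_t nbytes = count < CHUNK ? count : CHUNK;
                ssize_t nres = pread(fd, buf + done, nbytes, pos);

                if (nres < 0)
                        return done ? done : -errno;
                done += nres;
                pos += nres;
                count -= nres;
                if ((size_t)nres != nbytes)     /* short transfer: stop */
                        break;
        }
        return done;
}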
1124 1124
1125 static ssize_t fuse_direct_read(struct file *file, char __user *buf, 1125 static ssize_t fuse_direct_read(struct file *file, char __user *buf,
1126 size_t count, loff_t *ppos) 1126 size_t count, loff_t *ppos)
1127 { 1127 {
1128 ssize_t res; 1128 ssize_t res;
1129 struct inode *inode = file->f_path.dentry->d_inode; 1129 struct inode *inode = file->f_path.dentry->d_inode;
1130 1130
1131 if (is_bad_inode(inode)) 1131 if (is_bad_inode(inode))
1132 return -EIO; 1132 return -EIO;
1133 1133
1134 res = fuse_direct_io(file, buf, count, ppos, 0); 1134 res = fuse_direct_io(file, buf, count, ppos, 0);
1135 1135
1136 fuse_invalidate_attr(inode); 1136 fuse_invalidate_attr(inode);
1137 1137
1138 return res; 1138 return res;
1139 } 1139 }
1140 1140
1141 static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, 1141 static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
1142 size_t count, loff_t *ppos) 1142 size_t count, loff_t *ppos)
1143 { 1143 {
1144 struct inode *inode = file->f_path.dentry->d_inode; 1144 struct inode *inode = file->f_path.dentry->d_inode;
1145 ssize_t res; 1145 ssize_t res;
1146 1146
1147 res = generic_write_checks(file, ppos, &count, 0); 1147 res = generic_write_checks(file, ppos, &count, 0);
1148 if (!res) { 1148 if (!res) {
1149 res = fuse_direct_io(file, buf, count, ppos, 1); 1149 res = fuse_direct_io(file, buf, count, ppos, 1);
1150 if (res > 0) 1150 if (res > 0)
1151 fuse_write_update_size(inode, *ppos); 1151 fuse_write_update_size(inode, *ppos);
1152 } 1152 }
1153 1153
1154 fuse_invalidate_attr(inode); 1154 fuse_invalidate_attr(inode);
1155 1155
1156 return res; 1156 return res;
1157 } 1157 }
1158 1158
1159 static ssize_t fuse_direct_write(struct file *file, const char __user *buf, 1159 static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1160 size_t count, loff_t *ppos) 1160 size_t count, loff_t *ppos)
1161 { 1161 {
1162 struct inode *inode = file->f_path.dentry->d_inode; 1162 struct inode *inode = file->f_path.dentry->d_inode;
1163 ssize_t res; 1163 ssize_t res;
1164 1164
1165 if (is_bad_inode(inode)) 1165 if (is_bad_inode(inode))
1166 return -EIO; 1166 return -EIO;
1167 1167
1168 /* Don't allow parallel writes to the same file */ 1168 /* Don't allow parallel writes to the same file */
1169 mutex_lock(&inode->i_mutex); 1169 mutex_lock(&inode->i_mutex);
1170 res = __fuse_direct_write(file, buf, count, ppos); 1170 res = __fuse_direct_write(file, buf, count, ppos);
1171 mutex_unlock(&inode->i_mutex); 1171 mutex_unlock(&inode->i_mutex);
1172 1172
1173 return res; 1173 return res;
1174 } 1174 }
1175 1175
1176 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) 1176 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1177 { 1177 {
1178 __free_page(req->pages[0]); 1178 __free_page(req->pages[0]);
1179 fuse_file_put(req->ff, false); 1179 fuse_file_put(req->ff, false);
1180 } 1180 }
1181 1181
1182 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1182 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1183 { 1183 {
1184 struct inode *inode = req->inode; 1184 struct inode *inode = req->inode;
1185 struct fuse_inode *fi = get_fuse_inode(inode); 1185 struct fuse_inode *fi = get_fuse_inode(inode);
1186 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; 1186 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
1187 1187
1188 list_del(&req->writepages_entry); 1188 list_del(&req->writepages_entry);
1189 dec_bdi_stat(bdi, BDI_WRITEBACK); 1189 dec_bdi_stat(bdi, BDI_WRITEBACK);
1190 dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); 1190 dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
1191 bdi_writeout_inc(bdi); 1191 bdi_writeout_inc(bdi);
1192 wake_up(&fi->page_waitq); 1192 wake_up(&fi->page_waitq);
1193 } 1193 }
1194 1194
1195 /* Called under fc->lock, may release and reacquire it */ 1195 /* Called under fc->lock, may release and reacquire it */
1196 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) 1196 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1197 __releases(fc->lock) 1197 __releases(fc->lock)
1198 __acquires(fc->lock) 1198 __acquires(fc->lock)
1199 { 1199 {
1200 struct fuse_inode *fi = get_fuse_inode(req->inode); 1200 struct fuse_inode *fi = get_fuse_inode(req->inode);
1201 loff_t size = i_size_read(req->inode); 1201 loff_t size = i_size_read(req->inode);
1202 struct fuse_write_in *inarg = &req->misc.write.in; 1202 struct fuse_write_in *inarg = &req->misc.write.in;
1203 1203
1204 if (!fc->connected) 1204 if (!fc->connected)
1205 goto out_free; 1205 goto out_free;
1206 1206
1207 if (inarg->offset + PAGE_CACHE_SIZE <= size) { 1207 if (inarg->offset + PAGE_CACHE_SIZE <= size) {
1208 inarg->size = PAGE_CACHE_SIZE; 1208 inarg->size = PAGE_CACHE_SIZE;
1209 } else if (inarg->offset < size) { 1209 } else if (inarg->offset < size) {
1210 inarg->size = size & (PAGE_CACHE_SIZE - 1); 1210 inarg->size = size & (PAGE_CACHE_SIZE - 1);
1211 } else { 1211 } else {
1212 /* Got truncated off completely */ 1212 /* Got truncated off completely */
1213 goto out_free; 1213 goto out_free;
1214 } 1214 }
1215 1215
1216 req->in.args[1].size = inarg->size; 1216 req->in.args[1].size = inarg->size;
1217 fi->writectr++; 1217 fi->writectr++;
1218 fuse_request_send_background_locked(fc, req); 1218 fuse_request_send_background_locked(fc, req);
1219 return; 1219 return;
1220 1220
1221 out_free: 1221 out_free:
1222 fuse_writepage_finish(fc, req); 1222 fuse_writepage_finish(fc, req);
1223 spin_unlock(&fc->lock); 1223 spin_unlock(&fc->lock);
1224 fuse_writepage_free(fc, req); 1224 fuse_writepage_free(fc, req);
1225 fuse_put_request(fc, req); 1225 fuse_put_request(fc, req);
1226 spin_lock(&fc->lock); 1226 spin_lock(&fc->lock);
1227 } 1227 }
1228 1228
1229 /* 1229 /*
1230 * If fi->writectr is positive (no truncate or fsync going on) send 1230 * If fi->writectr is positive (no truncate or fsync going on) send
1231 * all queued writepage requests. 1231 * all queued writepage requests.
1232 * 1232 *
1233 * Called with fc->lock 1233 * Called with fc->lock
1234 */ 1234 */
1235 void fuse_flush_writepages(struct inode *inode) 1235 void fuse_flush_writepages(struct inode *inode)
1236 __releases(fc->lock) 1236 __releases(fc->lock)
1237 __acquires(fc->lock) 1237 __acquires(fc->lock)
1238 { 1238 {
1239 struct fuse_conn *fc = get_fuse_conn(inode); 1239 struct fuse_conn *fc = get_fuse_conn(inode);
1240 struct fuse_inode *fi = get_fuse_inode(inode); 1240 struct fuse_inode *fi = get_fuse_inode(inode);
1241 struct fuse_req *req; 1241 struct fuse_req *req;
1242 1242
1243 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { 1243 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1244 req = list_entry(fi->queued_writes.next, struct fuse_req, list); 1244 req = list_entry(fi->queued_writes.next, struct fuse_req, list);
1245 list_del_init(&req->list); 1245 list_del_init(&req->list);
1246 fuse_send_writepage(fc, req); 1246 fuse_send_writepage(fc, req);
1247 } 1247 }
1248 } 1248 }
1249 1249
1250 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) 1250 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
1251 { 1251 {
1252 struct inode *inode = req->inode; 1252 struct inode *inode = req->inode;
1253 struct fuse_inode *fi = get_fuse_inode(inode); 1253 struct fuse_inode *fi = get_fuse_inode(inode);
1254 1254
1255 mapping_set_error(inode->i_mapping, req->out.h.error); 1255 mapping_set_error(inode->i_mapping, req->out.h.error);
1256 spin_lock(&fc->lock); 1256 spin_lock(&fc->lock);
1257 fi->writectr--; 1257 fi->writectr--;
1258 fuse_writepage_finish(fc, req); 1258 fuse_writepage_finish(fc, req);
1259 spin_unlock(&fc->lock); 1259 spin_unlock(&fc->lock);
1260 fuse_writepage_free(fc, req); 1260 fuse_writepage_free(fc, req);
1261 } 1261 }
1262 1262
1263 static int fuse_writepage_locked(struct page *page) 1263 static int fuse_writepage_locked(struct page *page)
1264 { 1264 {
1265 struct address_space *mapping = page->mapping; 1265 struct address_space *mapping = page->mapping;
1266 struct inode *inode = mapping->host; 1266 struct inode *inode = mapping->host;
1267 struct fuse_conn *fc = get_fuse_conn(inode); 1267 struct fuse_conn *fc = get_fuse_conn(inode);
1268 struct fuse_inode *fi = get_fuse_inode(inode); 1268 struct fuse_inode *fi = get_fuse_inode(inode);
1269 struct fuse_req *req; 1269 struct fuse_req *req;
1270 struct fuse_file *ff; 1270 struct fuse_file *ff;
1271 struct page *tmp_page; 1271 struct page *tmp_page;
1272 1272
1273 set_page_writeback(page); 1273 set_page_writeback(page);
1274 1274
1275 req = fuse_request_alloc_nofs(); 1275 req = fuse_request_alloc_nofs();
1276 if (!req) 1276 if (!req)
1277 goto err; 1277 goto err;
1278 1278
1279 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1279 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1280 if (!tmp_page) 1280 if (!tmp_page)
1281 goto err_free; 1281 goto err_free;
1282 1282
1283 spin_lock(&fc->lock); 1283 spin_lock(&fc->lock);
1284 BUG_ON(list_empty(&fi->write_files)); 1284 BUG_ON(list_empty(&fi->write_files));
1285 ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); 1285 ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
1286 req->ff = fuse_file_get(ff); 1286 req->ff = fuse_file_get(ff);
1287 spin_unlock(&fc->lock); 1287 spin_unlock(&fc->lock);
1288 1288
1289 fuse_write_fill(req, ff, page_offset(page), 0); 1289 fuse_write_fill(req, ff, page_offset(page), 0);
1290 1290
1291 copy_highpage(tmp_page, page); 1291 copy_highpage(tmp_page, page);
1292 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; 1292 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
1293 req->in.argpages = 1; 1293 req->in.argpages = 1;
1294 req->num_pages = 1; 1294 req->num_pages = 1;
1295 req->pages[0] = tmp_page; 1295 req->pages[0] = tmp_page;
1296 req->page_offset = 0; 1296 req->page_offset = 0;
1297 req->end = fuse_writepage_end; 1297 req->end = fuse_writepage_end;
1298 req->inode = inode; 1298 req->inode = inode;
1299 1299
1300 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); 1300 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
1301 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1301 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1302 end_page_writeback(page); 1302 end_page_writeback(page);
1303 1303
1304 spin_lock(&fc->lock); 1304 spin_lock(&fc->lock);
1305 list_add(&req->writepages_entry, &fi->writepages); 1305 list_add(&req->writepages_entry, &fi->writepages);
1306 list_add_tail(&req->list, &fi->queued_writes); 1306 list_add_tail(&req->list, &fi->queued_writes);
1307 fuse_flush_writepages(inode); 1307 fuse_flush_writepages(inode);
1308 spin_unlock(&fc->lock); 1308 spin_unlock(&fc->lock);
1309 1309
1310 return 0; 1310 return 0;
1311 1311
1312 err_free: 1312 err_free:
1313 fuse_request_free(req); 1313 fuse_request_free(req);
1314 err: 1314 err:
1315 end_page_writeback(page); 1315 end_page_writeback(page);
1316 return -ENOMEM; 1316 return -ENOMEM;
1317 } 1317 }
1318 1318
1319 static int fuse_writepage(struct page *page, struct writeback_control *wbc) 1319 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1320 { 1320 {
1321 int err; 1321 int err;
1322 1322
1323 err = fuse_writepage_locked(page); 1323 err = fuse_writepage_locked(page);
1324 unlock_page(page); 1324 unlock_page(page);
1325 1325
1326 return err; 1326 return err;
1327 } 1327 }
1328 1328
1329 static int fuse_launder_page(struct page *page) 1329 static int fuse_launder_page(struct page *page)
1330 { 1330 {
1331 int err = 0; 1331 int err = 0;
1332 if (clear_page_dirty_for_io(page)) { 1332 if (clear_page_dirty_for_io(page)) {
1333 struct inode *inode = page->mapping->host; 1333 struct inode *inode = page->mapping->host;
1334 err = fuse_writepage_locked(page); 1334 err = fuse_writepage_locked(page);
1335 if (!err) 1335 if (!err)
1336 fuse_wait_on_page_writeback(inode, page->index); 1336 fuse_wait_on_page_writeback(inode, page->index);
1337 } 1337 }
1338 return err; 1338 return err;
1339 } 1339 }
1340 1340
1341 /* 1341 /*
1342 * Write back dirty pages now, because there may not be any suitable 1342 * Write back dirty pages now, because there may not be any suitable
1343 * open files later 1343 * open files later
1344 */ 1344 */
1345 static void fuse_vma_close(struct vm_area_struct *vma) 1345 static void fuse_vma_close(struct vm_area_struct *vma)
1346 { 1346 {
1347 filemap_write_and_wait(vma->vm_file->f_mapping); 1347 filemap_write_and_wait(vma->vm_file->f_mapping);
1348 } 1348 }
1349 1349
1350 /* 1350 /*
1351 * Wait for writeback against this page to complete before allowing it 1351 * Wait for writeback against this page to complete before allowing it
1352 * to be marked dirty again, and hence written back again, possibly 1352 * to be marked dirty again, and hence written back again, possibly
1353 * before the previous writepage completed. 1353 * before the previous writepage completed.
1354 * 1354 *
1355 * Block here, instead of in ->writepage(), so that the userspace fs 1355 * Block here, instead of in ->writepage(), so that the userspace fs
1356 * can only block processes actually operating on the filesystem. 1356 * can only block processes actually operating on the filesystem.
1357 * 1357 *
1358 * Otherwise unprivileged userspace fs would be able to block 1358 * Otherwise unprivileged userspace fs would be able to block
1359 * unrelated operations: 1359 * unrelated operations:
1360 * 1360 *
1361 * - page migration 1361 * - page migration
1362 * - sync(2) 1362 * - sync(2)
1363 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER 1363 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
1364 */ 1364 */
1365 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1365 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1366 { 1366 {
1367 struct page *page = vmf->page; 1367 struct page *page = vmf->page;
1368 /* 1368 /*
1369 * Don't use page->mapping as it may become NULL from a 1369 * Don't use page->mapping as it may become NULL from a
1370 * concurrent truncate. 1370 * concurrent truncate.
1371 */ 1371 */
1372 struct inode *inode = vma->vm_file->f_mapping->host; 1372 struct inode *inode = vma->vm_file->f_mapping->host;
1373 1373
1374 fuse_wait_on_page_writeback(inode, page->index); 1374 fuse_wait_on_page_writeback(inode, page->index);
1375 return 0; 1375 return 0;
1376 } 1376 }
1377 1377
1378 static const struct vm_operations_struct fuse_file_vm_ops = { 1378 static const struct vm_operations_struct fuse_file_vm_ops = {
1379 .close = fuse_vma_close, 1379 .close = fuse_vma_close,
1380 .fault = filemap_fault, 1380 .fault = filemap_fault,
1381 .page_mkwrite = fuse_page_mkwrite, 1381 .page_mkwrite = fuse_page_mkwrite,
1382 .remap_pages = generic_file_remap_pages, 1382 .remap_pages = generic_file_remap_pages,
1383 }; 1383 };
1384 1384
1385 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 1385 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
1386 { 1386 {
1387 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { 1387 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1388 struct inode *inode = file->f_dentry->d_inode; 1388 struct inode *inode = file->f_dentry->d_inode;
1389 struct fuse_conn *fc = get_fuse_conn(inode); 1389 struct fuse_conn *fc = get_fuse_conn(inode);
1390 struct fuse_inode *fi = get_fuse_inode(inode); 1390 struct fuse_inode *fi = get_fuse_inode(inode);
1391 struct fuse_file *ff = file->private_data; 1391 struct fuse_file *ff = file->private_data;
1392 /* 1392 /*
1393 * file may be written through mmap, so chain it onto the 1393 * file may be written through mmap, so chain it onto the
1394 * inode's write_files list 1394 * inode's write_files list
1395 */ 1395 */
1396 spin_lock(&fc->lock); 1396 spin_lock(&fc->lock);
1397 if (list_empty(&ff->write_entry)) 1397 if (list_empty(&ff->write_entry))
1398 list_add(&ff->write_entry, &fi->write_files); 1398 list_add(&ff->write_entry, &fi->write_files);
1399 spin_unlock(&fc->lock); 1399 spin_unlock(&fc->lock);
1400 } 1400 }
1401 file_accessed(file); 1401 file_accessed(file);
1402 vma->vm_ops = &fuse_file_vm_ops; 1402 vma->vm_ops = &fuse_file_vm_ops;
1403 return 0; 1403 return 0;
1404 } 1404 }
1405 1405
1406 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) 1406 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
1407 { 1407 {
1408 /* Can't provide the coherency needed for MAP_SHARED */ 1408 /* Can't provide the coherency needed for MAP_SHARED */
1409 if (vma->vm_flags & VM_MAYSHARE) 1409 if (vma->vm_flags & VM_MAYSHARE)
1410 return -ENODEV; 1410 return -ENODEV;
1411 1411
1412 invalidate_inode_pages2(file->f_mapping); 1412 invalidate_inode_pages2(file->f_mapping);
1413 1413
1414 return generic_file_mmap(file, vma); 1414 return generic_file_mmap(file, vma);
1415 } 1415 }
1416 1416
1417 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, 1417 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
1418 struct file_lock *fl) 1418 struct file_lock *fl)
1419 { 1419 {
1420 switch (ffl->type) { 1420 switch (ffl->type) {
1421 case F_UNLCK: 1421 case F_UNLCK:
1422 break; 1422 break;
1423 1423
1424 case F_RDLCK: 1424 case F_RDLCK:
1425 case F_WRLCK: 1425 case F_WRLCK:
1426 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || 1426 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
1427 ffl->end < ffl->start) 1427 ffl->end < ffl->start)
1428 return -EIO; 1428 return -EIO;
1429 1429
1430 fl->fl_start = ffl->start; 1430 fl->fl_start = ffl->start;
1431 fl->fl_end = ffl->end; 1431 fl->fl_end = ffl->end;
1432 fl->fl_pid = ffl->pid; 1432 fl->fl_pid = ffl->pid;
1433 break; 1433 break;
1434 1434
1435 default: 1435 default:
1436 return -EIO; 1436 return -EIO;
1437 } 1437 }
1438 fl->fl_type = ffl->type; 1438 fl->fl_type = ffl->type;
1439 return 0; 1439 return 0;
1440 } 1440 }
1441 1441
1442 static void fuse_lk_fill(struct fuse_req *req, struct file *file, 1442 static void fuse_lk_fill(struct fuse_req *req, struct file *file,
1443 const struct file_lock *fl, int opcode, pid_t pid, 1443 const struct file_lock *fl, int opcode, pid_t pid,
1444 int flock) 1444 int flock)
1445 { 1445 {
1446 struct inode *inode = file->f_path.dentry->d_inode; 1446 struct inode *inode = file->f_path.dentry->d_inode;
1447 struct fuse_conn *fc = get_fuse_conn(inode); 1447 struct fuse_conn *fc = get_fuse_conn(inode);
1448 struct fuse_file *ff = file->private_data; 1448 struct fuse_file *ff = file->private_data;
1449 struct fuse_lk_in *arg = &req->misc.lk_in; 1449 struct fuse_lk_in *arg = &req->misc.lk_in;
1450 1450
1451 arg->fh = ff->fh; 1451 arg->fh = ff->fh;
1452 arg->owner = fuse_lock_owner_id(fc, fl->fl_owner); 1452 arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
1453 arg->lk.start = fl->fl_start; 1453 arg->lk.start = fl->fl_start;
1454 arg->lk.end = fl->fl_end; 1454 arg->lk.end = fl->fl_end;
1455 arg->lk.type = fl->fl_type; 1455 arg->lk.type = fl->fl_type;
1456 arg->lk.pid = pid; 1456 arg->lk.pid = pid;
1457 if (flock) 1457 if (flock)
1458 arg->lk_flags |= FUSE_LK_FLOCK; 1458 arg->lk_flags |= FUSE_LK_FLOCK;
1459 req->in.h.opcode = opcode; 1459 req->in.h.opcode = opcode;
1460 req->in.h.nodeid = get_node_id(inode); 1460 req->in.h.nodeid = get_node_id(inode);
1461 req->in.numargs = 1; 1461 req->in.numargs = 1;
1462 req->in.args[0].size = sizeof(*arg); 1462 req->in.args[0].size = sizeof(*arg);
1463 req->in.args[0].value = arg; 1463 req->in.args[0].value = arg;
1464 } 1464 }
1465 1465
1466 static int fuse_getlk(struct file *file, struct file_lock *fl) 1466 static int fuse_getlk(struct file *file, struct file_lock *fl)
1467 { 1467 {
1468 struct inode *inode = file->f_path.dentry->d_inode; 1468 struct inode *inode = file->f_path.dentry->d_inode;
1469 struct fuse_conn *fc = get_fuse_conn(inode); 1469 struct fuse_conn *fc = get_fuse_conn(inode);
1470 struct fuse_req *req; 1470 struct fuse_req *req;
1471 struct fuse_lk_out outarg; 1471 struct fuse_lk_out outarg;
1472 int err; 1472 int err;
1473 1473
1474 req = fuse_get_req(fc); 1474 req = fuse_get_req(fc);
1475 if (IS_ERR(req)) 1475 if (IS_ERR(req))
1476 return PTR_ERR(req); 1476 return PTR_ERR(req);
1477 1477
1478 fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0); 1478 fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0);
1479 req->out.numargs = 1; 1479 req->out.numargs = 1;
1480 req->out.args[0].size = sizeof(outarg); 1480 req->out.args[0].size = sizeof(outarg);
1481 req->out.args[0].value = &outarg; 1481 req->out.args[0].value = &outarg;
1482 fuse_request_send(fc, req); 1482 fuse_request_send(fc, req);
1483 err = req->out.h.error; 1483 err = req->out.h.error;
1484 fuse_put_request(fc, req); 1484 fuse_put_request(fc, req);
1485 if (!err) 1485 if (!err)
1486 err = convert_fuse_file_lock(&outarg.lk, fl); 1486 err = convert_fuse_file_lock(&outarg.lk, fl);
1487 1487
1488 return err; 1488 return err;
1489 } 1489 }
1490 1490
1491 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) 1491 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1492 { 1492 {
1493 struct inode *inode = file->f_path.dentry->d_inode; 1493 struct inode *inode = file->f_path.dentry->d_inode;
1494 struct fuse_conn *fc = get_fuse_conn(inode); 1494 struct fuse_conn *fc = get_fuse_conn(inode);
1495 struct fuse_req *req; 1495 struct fuse_req *req;
1496 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; 1496 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
1497 pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; 1497 pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
1498 int err; 1498 int err;
1499 1499
1500 if (fl->fl_lmops && fl->fl_lmops->lm_grant) { 1500 if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
1501 /* NLM needs asynchronous locks, which we don't support yet */ 1501 /* NLM needs asynchronous locks, which we don't support yet */
1502 return -ENOLCK; 1502 return -ENOLCK;
1503 } 1503 }
1504 1504
1505 /* Unlock on close is handled by the flush method */ 1505 /* Unlock on close is handled by the flush method */
1506 if (fl->fl_flags & FL_CLOSE) 1506 if (fl->fl_flags & FL_CLOSE)
1507 return 0; 1507 return 0;
1508 1508
1509 req = fuse_get_req(fc); 1509 req = fuse_get_req(fc);
1510 if (IS_ERR(req)) 1510 if (IS_ERR(req))
1511 return PTR_ERR(req); 1511 return PTR_ERR(req);
1512 1512
1513 fuse_lk_fill(req, file, fl, opcode, pid, flock); 1513 fuse_lk_fill(req, file, fl, opcode, pid, flock);
1514 fuse_request_send(fc, req); 1514 fuse_request_send(fc, req);
1515 err = req->out.h.error; 1515 err = req->out.h.error;
1516 /* locking is restartable */ 1516 /* locking is restartable */
1517 if (err == -EINTR) 1517 if (err == -EINTR)
1518 err = -ERESTARTSYS; 1518 err = -ERESTARTSYS;
1519 fuse_put_request(fc, req); 1519 fuse_put_request(fc, req);
1520 return err; 1520 return err;
1521 } 1521 }
1522 1522
1523 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) 1523 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
1524 { 1524 {
1525 struct inode *inode = file->f_path.dentry->d_inode; 1525 struct inode *inode = file->f_path.dentry->d_inode;
1526 struct fuse_conn *fc = get_fuse_conn(inode); 1526 struct fuse_conn *fc = get_fuse_conn(inode);
1527 int err; 1527 int err;
1528 1528
1529 if (cmd == F_CANCELLK) { 1529 if (cmd == F_CANCELLK) {
1530 err = 0; 1530 err = 0;
1531 } else if (cmd == F_GETLK) { 1531 } else if (cmd == F_GETLK) {
1532 if (fc->no_lock) { 1532 if (fc->no_lock) {
1533 posix_test_lock(file, fl); 1533 posix_test_lock(file, fl);
1534 err = 0; 1534 err = 0;
1535 } else 1535 } else
1536 err = fuse_getlk(file, fl); 1536 err = fuse_getlk(file, fl);
1537 } else { 1537 } else {
1538 if (fc->no_lock) 1538 if (fc->no_lock)
1539 err = posix_lock_file(file, fl, NULL); 1539 err = posix_lock_file(file, fl, NULL);
1540 else 1540 else
1541 err = fuse_setlk(file, fl, 0); 1541 err = fuse_setlk(file, fl, 0);
1542 } 1542 }
1543 return err; 1543 return err;
1544 } 1544 }
1545 1545
1546 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) 1546 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
1547 { 1547 {
1548 struct inode *inode = file->f_path.dentry->d_inode; 1548 struct inode *inode = file->f_path.dentry->d_inode;
1549 struct fuse_conn *fc = get_fuse_conn(inode); 1549 struct fuse_conn *fc = get_fuse_conn(inode);
1550 int err; 1550 int err;
1551 1551
1552 if (fc->no_flock) { 1552 if (fc->no_flock) {
1553 err = flock_lock_file_wait(file, fl); 1553 err = flock_lock_file_wait(file, fl);
1554 } else { 1554 } else {
1555 struct fuse_file *ff = file->private_data; 1555 struct fuse_file *ff = file->private_data;
1556 1556
1557 /* emulate flock with POSIX locks */ 1557 /* emulate flock with POSIX locks */
1558 fl->fl_owner = (fl_owner_t) file; 1558 fl->fl_owner = (fl_owner_t) file;
1559 ff->flock = true; 1559 ff->flock = true;
1560 err = fuse_setlk(file, fl, 1); 1560 err = fuse_setlk(file, fl, 1);
1561 } 1561 }
1562 1562
1563 return err; 1563 return err;
1564 } 1564 }
1565 1565
1566 static sector_t fuse_bmap(struct address_space *mapping, sector_t block) 1566 static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1567 { 1567 {
1568 struct inode *inode = mapping->host; 1568 struct inode *inode = mapping->host;
1569 struct fuse_conn *fc = get_fuse_conn(inode); 1569 struct fuse_conn *fc = get_fuse_conn(inode);
1570 struct fuse_req *req; 1570 struct fuse_req *req;
1571 struct fuse_bmap_in inarg; 1571 struct fuse_bmap_in inarg;
1572 struct fuse_bmap_out outarg; 1572 struct fuse_bmap_out outarg;
1573 int err; 1573 int err;
1574 1574
1575 if (!inode->i_sb->s_bdev || fc->no_bmap) 1575 if (!inode->i_sb->s_bdev || fc->no_bmap)
1576 return 0; 1576 return 0;
1577 1577
1578 req = fuse_get_req(fc); 1578 req = fuse_get_req(fc);
1579 if (IS_ERR(req)) 1579 if (IS_ERR(req))
1580 return 0; 1580 return 0;
1581 1581
1582 memset(&inarg, 0, sizeof(inarg)); 1582 memset(&inarg, 0, sizeof(inarg));
1583 inarg.block = block; 1583 inarg.block = block;
1584 inarg.blocksize = inode->i_sb->s_blocksize; 1584 inarg.blocksize = inode->i_sb->s_blocksize;
1585 req->in.h.opcode = FUSE_BMAP; 1585 req->in.h.opcode = FUSE_BMAP;
1586 req->in.h.nodeid = get_node_id(inode); 1586 req->in.h.nodeid = get_node_id(inode);
1587 req->in.numargs = 1; 1587 req->in.numargs = 1;
1588 req->in.args[0].size = sizeof(inarg); 1588 req->in.args[0].size = sizeof(inarg);
1589 req->in.args[0].value = &inarg; 1589 req->in.args[0].value = &inarg;
1590 req->out.numargs = 1; 1590 req->out.numargs = 1;
1591 req->out.args[0].size = sizeof(outarg); 1591 req->out.args[0].size = sizeof(outarg);
1592 req->out.args[0].value = &outarg; 1592 req->out.args[0].value = &outarg;
1593 fuse_request_send(fc, req); 1593 fuse_request_send(fc, req);
1594 err = req->out.h.error; 1594 err = req->out.h.error;
1595 fuse_put_request(fc, req); 1595 fuse_put_request(fc, req);
1596 if (err == -ENOSYS) 1596 if (err == -ENOSYS)
1597 fc->no_bmap = 1; 1597 fc->no_bmap = 1;
1598 1598
1599 return err ? 0 : outarg.block; 1599 return err ? 0 : outarg.block;
1600 } 1600 }
1601 1601
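For context, fuse_bmap() above is what ultimately services the legacy FIBMAP ioctl, mapping a logical file block to a physical block on the backing device; the s_bdev test means it only answers for block-device-backed (fuseblk) mounts. A minimal user-space sketch, with a hypothetical mount path (FIBMAP conventionally requires CAP_SYS_RAWIO):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FIBMAP */

int main(void)
{
        int fd = open("/mnt/fuseblk/file", O_RDONLY);   /* hypothetical path */
        int block = 0;          /* in: logical block, out: physical block */

        if (fd < 0)
                return 1;
        if (ioctl(fd, FIBMAP, &block) == 0)
                printf("logical block 0 -> physical block %d\n", block);
        close(fd);
        return 0;
}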
1602 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) 1602 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
1603 { 1603 {
1604 loff_t retval; 1604 loff_t retval;
1605 struct inode *inode = file->f_path.dentry->d_inode; 1605 struct inode *inode = file->f_path.dentry->d_inode;
1606 1606
1607 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ 1607 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
1608 if (origin == SEEK_CUR || origin == SEEK_SET) 1608 if (whence == SEEK_CUR || whence == SEEK_SET)
1609 return generic_file_llseek(file, offset, origin); 1609 return generic_file_llseek(file, offset, whence);
1610 1610
1611 mutex_lock(&inode->i_mutex); 1611 mutex_lock(&inode->i_mutex);
1612 retval = fuse_update_attributes(inode, NULL, file, NULL); 1612 retval = fuse_update_attributes(inode, NULL, file, NULL);
1613 if (!retval) 1613 if (!retval)
1614 retval = generic_file_llseek(file, offset, origin); 1614 retval = generic_file_llseek(file, offset, whence);
1615 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1616 1616
1617 return retval; 1617 return retval;
1618 } 1618 }
1619 1619
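As the comment above notes, SEEK_CUR and SEEK_SET are pure f_pos arithmetic, while SEEK_END must see an up-to-date i_size, which is why fuse_file_llseek() refreshes the attributes under i_mutex first. A minimal user-space illustration of the three classic whence values, with a hypothetical path:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/fuse/file", O_RDONLY);      /* hypothetical mount */
        if (fd < 0)
                return 1;

        off_t end = lseek(fd, 0, SEEK_END);     /* needs a fresh i_size */
        off_t cur = lseek(fd, 0, SEEK_CUR);     /* f_pos arithmetic only */
        off_t set = lseek(fd, 0, SEEK_SET);     /* likewise */

        printf("end=%lld cur=%lld set=%lld\n",
               (long long)end, (long long)cur, (long long)set);
        close(fd);
        return 0;
}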
1620 static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, 1620 static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1621 unsigned int nr_segs, size_t bytes, bool to_user) 1621 unsigned int nr_segs, size_t bytes, bool to_user)
1622 { 1622 {
1623 struct iov_iter ii; 1623 struct iov_iter ii;
1624 int page_idx = 0; 1624 int page_idx = 0;
1625 1625
1626 if (!bytes) 1626 if (!bytes)
1627 return 0; 1627 return 0;
1628 1628
1629 iov_iter_init(&ii, iov, nr_segs, bytes, 0); 1629 iov_iter_init(&ii, iov, nr_segs, bytes, 0);
1630 1630
1631 while (iov_iter_count(&ii)) { 1631 while (iov_iter_count(&ii)) {
1632 struct page *page = pages[page_idx++]; 1632 struct page *page = pages[page_idx++];
1633 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); 1633 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1634 void *kaddr; 1634 void *kaddr;
1635 1635
1636 kaddr = kmap(page); 1636 kaddr = kmap(page);
1637 1637
1638 while (todo) { 1638 while (todo) {
1639 char __user *uaddr = ii.iov->iov_base + ii.iov_offset; 1639 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
1640 size_t iov_len = ii.iov->iov_len - ii.iov_offset; 1640 size_t iov_len = ii.iov->iov_len - ii.iov_offset;
1641 size_t copy = min(todo, iov_len); 1641 size_t copy = min(todo, iov_len);
1642 size_t left; 1642 size_t left;
1643 1643
1644 if (!to_user) 1644 if (!to_user)
1645 left = copy_from_user(kaddr, uaddr, copy); 1645 left = copy_from_user(kaddr, uaddr, copy);
1646 else 1646 else
1647 left = copy_to_user(uaddr, kaddr, copy); 1647 left = copy_to_user(uaddr, kaddr, copy);
1648 1648
1649 if (unlikely(left)) 1649 if (unlikely(left))
1650 return -EFAULT; 1650 return -EFAULT;
1651 1651
1652 iov_iter_advance(&ii, copy); 1652 iov_iter_advance(&ii, copy);
1653 todo -= copy; 1653 todo -= copy;
1654 kaddr += copy; 1654 kaddr += copy;
1655 } 1655 }
1656 1656
1657 kunmap(page); 1657 kunmap(page);
1658 } 1658 }
1659 1659
1660 return 0; 1660 return 0;
1661 } 1661 }
1662 1662
1663 /* 1663 /*
1664 * CUSE servers compiled on 32bit broke on 64bit kernels because the 1664 * CUSE servers compiled on 32bit broke on 64bit kernels because the
 1665 * ABI was defined to be 'struct iovec', which is different on 32bit 1665 * ABI was defined to be 'struct iovec', which is different on 32bit
1666 * and 64bit. Fortunately we can determine which structure the server 1666 * and 64bit. Fortunately we can determine which structure the server
1667 * used from the size of the reply. 1667 * used from the size of the reply.
1668 */ 1668 */
1669 static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, 1669 static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
1670 size_t transferred, unsigned count, 1670 size_t transferred, unsigned count,
1671 bool is_compat) 1671 bool is_compat)
1672 { 1672 {
1673 #ifdef CONFIG_COMPAT 1673 #ifdef CONFIG_COMPAT
1674 if (count * sizeof(struct compat_iovec) == transferred) { 1674 if (count * sizeof(struct compat_iovec) == transferred) {
1675 struct compat_iovec *ciov = src; 1675 struct compat_iovec *ciov = src;
1676 unsigned i; 1676 unsigned i;
1677 1677
1678 /* 1678 /*
1679 * With this interface a 32bit server cannot support 1679 * With this interface a 32bit server cannot support
1680 * non-compat (i.e. ones coming from 64bit apps) ioctl 1680 * non-compat (i.e. ones coming from 64bit apps) ioctl
1681 * requests 1681 * requests
1682 */ 1682 */
1683 if (!is_compat) 1683 if (!is_compat)
1684 return -EINVAL; 1684 return -EINVAL;
1685 1685
1686 for (i = 0; i < count; i++) { 1686 for (i = 0; i < count; i++) {
1687 dst[i].iov_base = compat_ptr(ciov[i].iov_base); 1687 dst[i].iov_base = compat_ptr(ciov[i].iov_base);
1688 dst[i].iov_len = ciov[i].iov_len; 1688 dst[i].iov_len = ciov[i].iov_len;
1689 } 1689 }
1690 return 0; 1690 return 0;
1691 } 1691 }
1692 #endif 1692 #endif
1693 1693
1694 if (count * sizeof(struct iovec) != transferred) 1694 if (count * sizeof(struct iovec) != transferred)
1695 return -EIO; 1695 return -EIO;
1696 1696
1697 memcpy(dst, src, transferred); 1697 memcpy(dst, src, transferred);
1698 return 0; 1698 return 0;
1699 } 1699 }
1700 1700
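The size test above is unambiguous because the two candidate layouts differ in width: on a 64-bit kernel sizeof(struct iovec) is 16 while the compat layout is 8, so for any non-zero count the two possible reply sizes never coincide. A stand-alone illustration (compat_iovec32 is a stand-in for the kernel's struct compat_iovec):

#include <stdio.h>
#include <sys/uio.h>

/* stand-in for the kernel's struct compat_iovec: two 32-bit fields */
struct compat_iovec32 {
        unsigned int iov_base;
        unsigned int iov_len;
};

int main(void)
{
        unsigned int count = 3;

        /* 48 bytes from a 64-bit server, 24 from a 32-bit one */
        printf("native: %zu, compat: %zu\n",
               count * sizeof(struct iovec),
               count * sizeof(struct compat_iovec32));
        return 0;
}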
1701 /* Make sure iov_length() won't overflow */ 1701 /* Make sure iov_length() won't overflow */
1702 static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) 1702 static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1703 { 1703 {
1704 size_t n; 1704 size_t n;
1705 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; 1705 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1706 1706
1707 for (n = 0; n < count; n++, iov++) { 1707 for (n = 0; n < count; n++, iov++) {
1708 if (iov->iov_len > (size_t) max) 1708 if (iov->iov_len > (size_t) max)
1709 return -ENOMEM; 1709 return -ENOMEM;
1710 max -= iov->iov_len; 1710 max -= iov->iov_len;
1711 } 1711 }
1712 return 0; 1712 return 0;
1713 } 1713 }
1714 1714
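fuse_verify_ioctl_iov() bounds the sum of the lengths without ever adding them up: each iov_len is checked against the remaining budget and then subtracted from it, so no intermediate total can wrap. The same pattern in isolation, as a minimal sketch:

#include <stddef.h>

/* Returns 0 iff len[0] + ... + len[count-1] <= limit, computed
 * without any intermediate sum that could overflow. */
static int lengths_fit(const size_t *len, size_t count, size_t limit)
{
        size_t n;

        for (n = 0; n < count; n++) {
                if (len[n] > limit)
                        return -1;      /* exceeds the remaining budget */
                limit -= len[n];
        }
        return 0;
}

int main(void)
{
        size_t len[2] = { (size_t)-1, 2 };      /* a naive sum would wrap */

        return lengths_fit(len, 2, 4096) == -1 ? 0 : 1;
}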
1715 static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, 1715 static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
1716 void *src, size_t transferred, unsigned count, 1716 void *src, size_t transferred, unsigned count,
1717 bool is_compat) 1717 bool is_compat)
1718 { 1718 {
1719 unsigned i; 1719 unsigned i;
1720 struct fuse_ioctl_iovec *fiov = src; 1720 struct fuse_ioctl_iovec *fiov = src;
1721 1721
1722 if (fc->minor < 16) { 1722 if (fc->minor < 16) {
1723 return fuse_copy_ioctl_iovec_old(dst, src, transferred, 1723 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
1724 count, is_compat); 1724 count, is_compat);
1725 } 1725 }
1726 1726
1727 if (count * sizeof(struct fuse_ioctl_iovec) != transferred) 1727 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
1728 return -EIO; 1728 return -EIO;
1729 1729
1730 for (i = 0; i < count; i++) { 1730 for (i = 0; i < count; i++) {
1731 /* Did the server supply an inappropriate value? */ 1731 /* Did the server supply an inappropriate value? */
1732 if (fiov[i].base != (unsigned long) fiov[i].base || 1732 if (fiov[i].base != (unsigned long) fiov[i].base ||
1733 fiov[i].len != (unsigned long) fiov[i].len) 1733 fiov[i].len != (unsigned long) fiov[i].len)
1734 return -EIO; 1734 return -EIO;
1735 1735
1736 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; 1736 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
1737 dst[i].iov_len = (size_t) fiov[i].len; 1737 dst[i].iov_len = (size_t) fiov[i].len;
1738 1738
1739 #ifdef CONFIG_COMPAT 1739 #ifdef CONFIG_COMPAT
1740 if (is_compat && 1740 if (is_compat &&
1741 (ptr_to_compat(dst[i].iov_base) != fiov[i].base || 1741 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
1742 (compat_size_t) dst[i].iov_len != fiov[i].len)) 1742 (compat_size_t) dst[i].iov_len != fiov[i].len))
1743 return -EIO; 1743 return -EIO;
1744 #endif 1744 #endif
1745 } 1745 }
1746 1746
1747 return 0; 1747 return 0;
1748 } 1748 }
1749 1749
1750 1750
1751 /* 1751 /*
1752 * For ioctls, there is no generic way to determine how much memory 1752 * For ioctls, there is no generic way to determine how much memory
1753 * needs to be read and/or written. Furthermore, ioctls are allowed 1753 * needs to be read and/or written. Furthermore, ioctls are allowed
1754 * to dereference the passed pointer, so the parameter requires deep 1754 * to dereference the passed pointer, so the parameter requires deep
1755 * copying but FUSE has no idea whatsoever about what to copy in or 1755 * copying but FUSE has no idea whatsoever about what to copy in or
1756 * out. 1756 * out.
1757 * 1757 *
1758 * This is solved by allowing FUSE server to retry ioctl with 1758 * This is solved by allowing FUSE server to retry ioctl with
1759 * necessary in/out iovecs. Let's assume the ioctl implementation 1759 * necessary in/out iovecs. Let's assume the ioctl implementation
1760 * needs to read in the following structure. 1760 * needs to read in the following structure.
1761 * 1761 *
1762 * struct a { 1762 * struct a {
1763 * char *buf; 1763 * char *buf;
1764 * size_t buflen; 1764 * size_t buflen;
1765 * } 1765 * }
1766 * 1766 *
1767 * On the first callout to FUSE server, inarg->in_size and 1767 * On the first callout to FUSE server, inarg->in_size and
 1768 * inarg->out_size will be zero; then, the server completes the ioctl 1768 * inarg->out_size will be zero; then, the server completes the ioctl
1769 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and 1769 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
1770 * the actual iov array to 1770 * the actual iov array to
1771 * 1771 *
1772 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } 1772 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
1773 * 1773 *
1774 * which tells FUSE to copy in the requested area and retry the ioctl. 1774 * which tells FUSE to copy in the requested area and retry the ioctl.
1775 * On the second round, the server has access to the structure and 1775 * On the second round, the server has access to the structure and
 1776 * from that it can tell what to look for next, so on this invocation, 1776 * from that it can tell what to look for next, so on this invocation,
1777 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to 1777 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
1778 * 1778 *
1779 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, 1779 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
1780 * { .iov_base = a.buf, .iov_len = a.buflen } } 1780 * { .iov_base = a.buf, .iov_len = a.buflen } }
1781 * 1781 *
1782 * FUSE will copy both struct a and the pointed buffer from the 1782 * FUSE will copy both struct a and the pointed buffer from the
1783 * process doing the ioctl and retry ioctl with both struct a and the 1783 * process doing the ioctl and retry ioctl with both struct a and the
1784 * buffer. 1784 * buffer.
1785 * 1785 *
1786 * This time, FUSE server has everything it needs and completes ioctl 1786 * This time, FUSE server has everything it needs and completes ioctl
1787 * without FUSE_IOCTL_RETRY which finishes the ioctl call. 1787 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
1788 * 1788 *
1789 * Copying data out works the same way. 1789 * Copying data out works the same way.
1790 * 1790 *
1791 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel 1791 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
1792 * automatically initializes in and out iovs by decoding @cmd with 1792 * automatically initializes in and out iovs by decoding @cmd with
1793 * _IOC_* macros and the server is not allowed to request RETRY. This 1793 * _IOC_* macros and the server is not allowed to request RETRY. This
1794 * limits ioctl data transfers to well-formed ioctls and is the forced 1794 * limits ioctl data transfers to well-formed ioctls and is the forced
1795 * behavior for all FUSE servers. 1795 * behavior for all FUSE servers.
1796 */ 1796 */
1797 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, 1797 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1798 unsigned int flags) 1798 unsigned int flags)
1799 { 1799 {
1800 struct fuse_file *ff = file->private_data; 1800 struct fuse_file *ff = file->private_data;
1801 struct fuse_conn *fc = ff->fc; 1801 struct fuse_conn *fc = ff->fc;
1802 struct fuse_ioctl_in inarg = { 1802 struct fuse_ioctl_in inarg = {
1803 .fh = ff->fh, 1803 .fh = ff->fh,
1804 .cmd = cmd, 1804 .cmd = cmd,
1805 .arg = arg, 1805 .arg = arg,
1806 .flags = flags 1806 .flags = flags
1807 }; 1807 };
1808 struct fuse_ioctl_out outarg; 1808 struct fuse_ioctl_out outarg;
1809 struct fuse_req *req = NULL; 1809 struct fuse_req *req = NULL;
1810 struct page **pages = NULL; 1810 struct page **pages = NULL;
1811 struct iovec *iov_page = NULL; 1811 struct iovec *iov_page = NULL;
1812 struct iovec *in_iov = NULL, *out_iov = NULL; 1812 struct iovec *in_iov = NULL, *out_iov = NULL;
1813 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 1813 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1814 size_t in_size, out_size, transferred; 1814 size_t in_size, out_size, transferred;
1815 int err; 1815 int err;
1816 1816
1817 #if BITS_PER_LONG == 32 1817 #if BITS_PER_LONG == 32
1818 inarg.flags |= FUSE_IOCTL_32BIT; 1818 inarg.flags |= FUSE_IOCTL_32BIT;
1819 #else 1819 #else
1820 if (flags & FUSE_IOCTL_COMPAT) 1820 if (flags & FUSE_IOCTL_COMPAT)
1821 inarg.flags |= FUSE_IOCTL_32BIT; 1821 inarg.flags |= FUSE_IOCTL_32BIT;
1822 #endif 1822 #endif
1823 1823
 1824 /* assume all the iovs returned by the client always fit in a page */ 1824 /* assume all the iovs returned by the client always fit in a page */
1825 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1825 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1826 1826
1827 err = -ENOMEM; 1827 err = -ENOMEM;
1828 pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); 1828 pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
1829 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); 1829 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1830 if (!pages || !iov_page) 1830 if (!pages || !iov_page)
1831 goto out; 1831 goto out;
1832 1832
1833 /* 1833 /*
1834 * If restricted, initialize IO parameters as encoded in @cmd. 1834 * If restricted, initialize IO parameters as encoded in @cmd.
1835 * RETRY from server is not allowed. 1835 * RETRY from server is not allowed.
1836 */ 1836 */
1837 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { 1837 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1838 struct iovec *iov = iov_page; 1838 struct iovec *iov = iov_page;
1839 1839
1840 iov->iov_base = (void __user *)arg; 1840 iov->iov_base = (void __user *)arg;
1841 iov->iov_len = _IOC_SIZE(cmd); 1841 iov->iov_len = _IOC_SIZE(cmd);
1842 1842
1843 if (_IOC_DIR(cmd) & _IOC_WRITE) { 1843 if (_IOC_DIR(cmd) & _IOC_WRITE) {
1844 in_iov = iov; 1844 in_iov = iov;
1845 in_iovs = 1; 1845 in_iovs = 1;
1846 } 1846 }
1847 1847
1848 if (_IOC_DIR(cmd) & _IOC_READ) { 1848 if (_IOC_DIR(cmd) & _IOC_READ) {
1849 out_iov = iov; 1849 out_iov = iov;
1850 out_iovs = 1; 1850 out_iovs = 1;
1851 } 1851 }
1852 } 1852 }
1853 1853
1854 retry: 1854 retry:
1855 inarg.in_size = in_size = iov_length(in_iov, in_iovs); 1855 inarg.in_size = in_size = iov_length(in_iov, in_iovs);
1856 inarg.out_size = out_size = iov_length(out_iov, out_iovs); 1856 inarg.out_size = out_size = iov_length(out_iov, out_iovs);
1857 1857
1858 /* 1858 /*
 1859 * Out data can be used either for actual out data or iovs; 1859 * Out data can be used either for actual out data or iovs;
 1860 * make sure there is always at least one page. 1860 * make sure there is always at least one page.
1861 */ 1861 */
1862 out_size = max_t(size_t, out_size, PAGE_SIZE); 1862 out_size = max_t(size_t, out_size, PAGE_SIZE);
1863 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); 1863 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
1864 1864
1865 /* make sure there are enough buffer pages and init request with them */ 1865 /* make sure there are enough buffer pages and init request with them */
1866 err = -ENOMEM; 1866 err = -ENOMEM;
1867 if (max_pages > FUSE_MAX_PAGES_PER_REQ) 1867 if (max_pages > FUSE_MAX_PAGES_PER_REQ)
1868 goto out; 1868 goto out;
1869 while (num_pages < max_pages) { 1869 while (num_pages < max_pages) {
1870 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 1870 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
1871 if (!pages[num_pages]) 1871 if (!pages[num_pages])
1872 goto out; 1872 goto out;
1873 num_pages++; 1873 num_pages++;
1874 } 1874 }
1875 1875
1876 req = fuse_get_req(fc); 1876 req = fuse_get_req(fc);
1877 if (IS_ERR(req)) { 1877 if (IS_ERR(req)) {
1878 err = PTR_ERR(req); 1878 err = PTR_ERR(req);
1879 req = NULL; 1879 req = NULL;
1880 goto out; 1880 goto out;
1881 } 1881 }
1882 memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); 1882 memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
1883 req->num_pages = num_pages; 1883 req->num_pages = num_pages;
1884 1884
1885 /* okay, let's send it to the client */ 1885 /* okay, let's send it to the client */
1886 req->in.h.opcode = FUSE_IOCTL; 1886 req->in.h.opcode = FUSE_IOCTL;
1887 req->in.h.nodeid = ff->nodeid; 1887 req->in.h.nodeid = ff->nodeid;
1888 req->in.numargs = 1; 1888 req->in.numargs = 1;
1889 req->in.args[0].size = sizeof(inarg); 1889 req->in.args[0].size = sizeof(inarg);
1890 req->in.args[0].value = &inarg; 1890 req->in.args[0].value = &inarg;
1891 if (in_size) { 1891 if (in_size) {
1892 req->in.numargs++; 1892 req->in.numargs++;
1893 req->in.args[1].size = in_size; 1893 req->in.args[1].size = in_size;
1894 req->in.argpages = 1; 1894 req->in.argpages = 1;
1895 1895
1896 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, 1896 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
1897 false); 1897 false);
1898 if (err) 1898 if (err)
1899 goto out; 1899 goto out;
1900 } 1900 }
1901 1901
1902 req->out.numargs = 2; 1902 req->out.numargs = 2;
1903 req->out.args[0].size = sizeof(outarg); 1903 req->out.args[0].size = sizeof(outarg);
1904 req->out.args[0].value = &outarg; 1904 req->out.args[0].value = &outarg;
1905 req->out.args[1].size = out_size; 1905 req->out.args[1].size = out_size;
1906 req->out.argpages = 1; 1906 req->out.argpages = 1;
1907 req->out.argvar = 1; 1907 req->out.argvar = 1;
1908 1908
1909 fuse_request_send(fc, req); 1909 fuse_request_send(fc, req);
1910 err = req->out.h.error; 1910 err = req->out.h.error;
1911 transferred = req->out.args[1].size; 1911 transferred = req->out.args[1].size;
1912 fuse_put_request(fc, req); 1912 fuse_put_request(fc, req);
1913 req = NULL; 1913 req = NULL;
1914 if (err) 1914 if (err)
1915 goto out; 1915 goto out;
1916 1916
1917 /* did it ask for retry? */ 1917 /* did it ask for retry? */
1918 if (outarg.flags & FUSE_IOCTL_RETRY) { 1918 if (outarg.flags & FUSE_IOCTL_RETRY) {
1919 void *vaddr; 1919 void *vaddr;
1920 1920
1921 /* no retry if in restricted mode */ 1921 /* no retry if in restricted mode */
1922 err = -EIO; 1922 err = -EIO;
1923 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) 1923 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
1924 goto out; 1924 goto out;
1925 1925
1926 in_iovs = outarg.in_iovs; 1926 in_iovs = outarg.in_iovs;
1927 out_iovs = outarg.out_iovs; 1927 out_iovs = outarg.out_iovs;
1928 1928
1929 /* 1929 /*
 1930 * Make sure everything is within bounds; the separate checks 1930 * Make sure everything is within bounds; the separate checks
 1931 * protect against overflow. 1931 * protect against overflow.
1932 */ 1932 */
1933 err = -ENOMEM; 1933 err = -ENOMEM;
1934 if (in_iovs > FUSE_IOCTL_MAX_IOV || 1934 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
1935 out_iovs > FUSE_IOCTL_MAX_IOV || 1935 out_iovs > FUSE_IOCTL_MAX_IOV ||
1936 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 1936 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1937 goto out; 1937 goto out;
1938 1938
1939 vaddr = kmap_atomic(pages[0]); 1939 vaddr = kmap_atomic(pages[0]);
1940 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, 1940 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
1941 transferred, in_iovs + out_iovs, 1941 transferred, in_iovs + out_iovs,
1942 (flags & FUSE_IOCTL_COMPAT) != 0); 1942 (flags & FUSE_IOCTL_COMPAT) != 0);
1943 kunmap_atomic(vaddr); 1943 kunmap_atomic(vaddr);
1944 if (err) 1944 if (err)
1945 goto out; 1945 goto out;
1946 1946
1947 in_iov = iov_page; 1947 in_iov = iov_page;
1948 out_iov = in_iov + in_iovs; 1948 out_iov = in_iov + in_iovs;
1949 1949
1950 err = fuse_verify_ioctl_iov(in_iov, in_iovs); 1950 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
1951 if (err) 1951 if (err)
1952 goto out; 1952 goto out;
1953 1953
1954 err = fuse_verify_ioctl_iov(out_iov, out_iovs); 1954 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
1955 if (err) 1955 if (err)
1956 goto out; 1956 goto out;
1957 1957
1958 goto retry; 1958 goto retry;
1959 } 1959 }
1960 1960
1961 err = -EIO; 1961 err = -EIO;
1962 if (transferred > inarg.out_size) 1962 if (transferred > inarg.out_size)
1963 goto out; 1963 goto out;
1964 1964
1965 err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); 1965 err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
1966 out: 1966 out:
1967 if (req) 1967 if (req)
1968 fuse_put_request(fc, req); 1968 fuse_put_request(fc, req);
1969 free_page((unsigned long) iov_page); 1969 free_page((unsigned long) iov_page);
1970 while (num_pages) 1970 while (num_pages)
1971 __free_page(pages[--num_pages]); 1971 __free_page(pages[--num_pages]);
1972 kfree(pages); 1972 kfree(pages);
1973 1973
1974 return err ? err : outarg.result; 1974 return err ? err : outarg.result;
1975 } 1975 }
1976 EXPORT_SYMBOL_GPL(fuse_do_ioctl); 1976 EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1977 1977
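To make the retry protocol concrete from the other side, here is a rough sketch of a server's FUSE_IOCTL handler for the 'struct a' example in the comment above. reply_retry() and reply_done() are hypothetical helpers that would marshal struct fuse_ioctl_out (with and without FUSE_IOCTL_RETRY) back to the kernel; they are not part of any real library API:

#include <stdint.h>
#include <sys/uio.h>
#include <linux/fuse.h>

struct a {
        char *buf;
        size_t buflen;
};

/* hypothetical reply helpers, not a real API */
void reply_retry(const struct iovec *in_iov, unsigned in_iovs,
                 const struct iovec *out_iov, unsigned out_iovs);
void reply_done(int result);

static void handle_ioctl(const struct fuse_ioctl_in *in,
                         const void *in_data, size_t in_size)
{
        if (in_size == 0) {
                /* round 1: ask the kernel to copy in struct a itself */
                struct iovec iov = {
                        .iov_base = (void *)(uintptr_t)in->arg,
                        .iov_len = sizeof(struct a),
                };
                reply_retry(&iov, 1, NULL, 0);  /* sets FUSE_IOCTL_RETRY */
        } else if (in_size == sizeof(struct a)) {
                /* round 2: struct a is visible; also ask for the buffer */
                const struct a *a = in_data;
                struct iovec iov[2] = {
                        { .iov_base = (void *)(uintptr_t)in->arg,
                          .iov_len = sizeof(*a) },
                        { .iov_base = a->buf, .iov_len = a->buflen },
                };
                reply_retry(iov, 2, NULL, 0);
        } else {
                /* round 3: everything arrived; finish without RETRY */
                reply_done(0);
        }
}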
1978 long fuse_ioctl_common(struct file *file, unsigned int cmd, 1978 long fuse_ioctl_common(struct file *file, unsigned int cmd,
1979 unsigned long arg, unsigned int flags) 1979 unsigned long arg, unsigned int flags)
1980 { 1980 {
1981 struct inode *inode = file->f_dentry->d_inode; 1981 struct inode *inode = file->f_dentry->d_inode;
1982 struct fuse_conn *fc = get_fuse_conn(inode); 1982 struct fuse_conn *fc = get_fuse_conn(inode);
1983 1983
1984 if (!fuse_allow_task(fc, current)) 1984 if (!fuse_allow_task(fc, current))
1985 return -EACCES; 1985 return -EACCES;
1986 1986
1987 if (is_bad_inode(inode)) 1987 if (is_bad_inode(inode))
1988 return -EIO; 1988 return -EIO;
1989 1989
1990 return fuse_do_ioctl(file, cmd, arg, flags); 1990 return fuse_do_ioctl(file, cmd, arg, flags);
1991 } 1991 }
1992 1992
1993 static long fuse_file_ioctl(struct file *file, unsigned int cmd, 1993 static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1994 unsigned long arg) 1994 unsigned long arg)
1995 { 1995 {
1996 return fuse_ioctl_common(file, cmd, arg, 0); 1996 return fuse_ioctl_common(file, cmd, arg, 0);
1997 } 1997 }
1998 1998
1999 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 1999 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
2000 unsigned long arg) 2000 unsigned long arg)
2001 { 2001 {
2002 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); 2002 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
2003 } 2003 }
2004 2004
2005 /* 2005 /*
 2006 * All files which have been polled are linked to the RB tree 2006 * All files which have been polled are linked to the RB tree
 2007 * fuse_conn->polled_files, which is indexed by kh. Walk the tree and 2007 * fuse_conn->polled_files, which is indexed by kh. Walk the tree and
2008 * find the matching one. 2008 * find the matching one.
2009 */ 2009 */
2010 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, 2010 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2011 struct rb_node **parent_out) 2011 struct rb_node **parent_out)
2012 { 2012 {
2013 struct rb_node **link = &fc->polled_files.rb_node; 2013 struct rb_node **link = &fc->polled_files.rb_node;
2014 struct rb_node *last = NULL; 2014 struct rb_node *last = NULL;
2015 2015
2016 while (*link) { 2016 while (*link) {
2017 struct fuse_file *ff; 2017 struct fuse_file *ff;
2018 2018
2019 last = *link; 2019 last = *link;
2020 ff = rb_entry(last, struct fuse_file, polled_node); 2020 ff = rb_entry(last, struct fuse_file, polled_node);
2021 2021
2022 if (kh < ff->kh) 2022 if (kh < ff->kh)
2023 link = &last->rb_left; 2023 link = &last->rb_left;
2024 else if (kh > ff->kh) 2024 else if (kh > ff->kh)
2025 link = &last->rb_right; 2025 link = &last->rb_right;
2026 else 2026 else
2027 return link; 2027 return link;
2028 } 2028 }
2029 2029
2030 if (parent_out) 2030 if (parent_out)
2031 *parent_out = last; 2031 *parent_out = last;
2032 return link; 2032 return link;
2033 } 2033 }
2034 2034
2035 /* 2035 /*
2036 * The file is about to be polled. Make sure it's on the polled_files 2036 * The file is about to be polled. Make sure it's on the polled_files
2037 * RB tree. Note that files once added to the polled_files tree are 2037 * RB tree. Note that files once added to the polled_files tree are
2038 * not removed before the file is released. This is because a file 2038 * not removed before the file is released. This is because a file
2039 * polled once is likely to be polled again. 2039 * polled once is likely to be polled again.
2040 */ 2040 */
2041 static void fuse_register_polled_file(struct fuse_conn *fc, 2041 static void fuse_register_polled_file(struct fuse_conn *fc,
2042 struct fuse_file *ff) 2042 struct fuse_file *ff)
2043 { 2043 {
2044 spin_lock(&fc->lock); 2044 spin_lock(&fc->lock);
2045 if (RB_EMPTY_NODE(&ff->polled_node)) { 2045 if (RB_EMPTY_NODE(&ff->polled_node)) {
2046 struct rb_node **link, *parent; 2046 struct rb_node **link, *parent;
2047 2047
2048 link = fuse_find_polled_node(fc, ff->kh, &parent); 2048 link = fuse_find_polled_node(fc, ff->kh, &parent);
2049 BUG_ON(*link); 2049 BUG_ON(*link);
2050 rb_link_node(&ff->polled_node, parent, link); 2050 rb_link_node(&ff->polled_node, parent, link);
2051 rb_insert_color(&ff->polled_node, &fc->polled_files); 2051 rb_insert_color(&ff->polled_node, &fc->polled_files);
2052 } 2052 }
2053 spin_unlock(&fc->lock); 2053 spin_unlock(&fc->lock);
2054 } 2054 }
2055 2055
2056 unsigned fuse_file_poll(struct file *file, poll_table *wait) 2056 unsigned fuse_file_poll(struct file *file, poll_table *wait)
2057 { 2057 {
2058 struct fuse_file *ff = file->private_data; 2058 struct fuse_file *ff = file->private_data;
2059 struct fuse_conn *fc = ff->fc; 2059 struct fuse_conn *fc = ff->fc;
2060 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; 2060 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2061 struct fuse_poll_out outarg; 2061 struct fuse_poll_out outarg;
2062 struct fuse_req *req; 2062 struct fuse_req *req;
2063 int err; 2063 int err;
2064 2064
2065 if (fc->no_poll) 2065 if (fc->no_poll)
2066 return DEFAULT_POLLMASK; 2066 return DEFAULT_POLLMASK;
2067 2067
2068 poll_wait(file, &ff->poll_wait, wait); 2068 poll_wait(file, &ff->poll_wait, wait);
2069 2069
2070 /* 2070 /*
2071 * Ask for notification iff there's someone waiting for it. 2071 * Ask for notification iff there's someone waiting for it.
2072 * The client may ignore the flag and always notify. 2072 * The client may ignore the flag and always notify.
2073 */ 2073 */
2074 if (waitqueue_active(&ff->poll_wait)) { 2074 if (waitqueue_active(&ff->poll_wait)) {
2075 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; 2075 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2076 fuse_register_polled_file(fc, ff); 2076 fuse_register_polled_file(fc, ff);
2077 } 2077 }
2078 2078
2079 req = fuse_get_req(fc); 2079 req = fuse_get_req(fc);
2080 if (IS_ERR(req)) 2080 if (IS_ERR(req))
2081 return POLLERR; 2081 return POLLERR;
2082 2082
2083 req->in.h.opcode = FUSE_POLL; 2083 req->in.h.opcode = FUSE_POLL;
2084 req->in.h.nodeid = ff->nodeid; 2084 req->in.h.nodeid = ff->nodeid;
2085 req->in.numargs = 1; 2085 req->in.numargs = 1;
2086 req->in.args[0].size = sizeof(inarg); 2086 req->in.args[0].size = sizeof(inarg);
2087 req->in.args[0].value = &inarg; 2087 req->in.args[0].value = &inarg;
2088 req->out.numargs = 1; 2088 req->out.numargs = 1;
2089 req->out.args[0].size = sizeof(outarg); 2089 req->out.args[0].size = sizeof(outarg);
2090 req->out.args[0].value = &outarg; 2090 req->out.args[0].value = &outarg;
2091 fuse_request_send(fc, req); 2091 fuse_request_send(fc, req);
2092 err = req->out.h.error; 2092 err = req->out.h.error;
2093 fuse_put_request(fc, req); 2093 fuse_put_request(fc, req);
2094 2094
2095 if (!err) 2095 if (!err)
2096 return outarg.revents; 2096 return outarg.revents;
2097 if (err == -ENOSYS) { 2097 if (err == -ENOSYS) {
2098 fc->no_poll = 1; 2098 fc->no_poll = 1;
2099 return DEFAULT_POLLMASK; 2099 return DEFAULT_POLLMASK;
2100 } 2100 }
2101 return POLLERR; 2101 return POLLERR;
2102 } 2102 }
2103 EXPORT_SYMBOL_GPL(fuse_file_poll); 2103 EXPORT_SYMBOL_GPL(fuse_file_poll);
2104 2104
2105 /* 2105 /*
2106 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and 2106 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
2107 * wakes up the poll waiters. 2107 * wakes up the poll waiters.
2108 */ 2108 */
2109 int fuse_notify_poll_wakeup(struct fuse_conn *fc, 2109 int fuse_notify_poll_wakeup(struct fuse_conn *fc,
2110 struct fuse_notify_poll_wakeup_out *outarg) 2110 struct fuse_notify_poll_wakeup_out *outarg)
2111 { 2111 {
2112 u64 kh = outarg->kh; 2112 u64 kh = outarg->kh;
2113 struct rb_node **link; 2113 struct rb_node **link;
2114 2114
2115 spin_lock(&fc->lock); 2115 spin_lock(&fc->lock);
2116 2116
2117 link = fuse_find_polled_node(fc, kh, NULL); 2117 link = fuse_find_polled_node(fc, kh, NULL);
2118 if (*link) { 2118 if (*link) {
2119 struct fuse_file *ff; 2119 struct fuse_file *ff;
2120 2120
2121 ff = rb_entry(*link, struct fuse_file, polled_node); 2121 ff = rb_entry(*link, struct fuse_file, polled_node);
2122 wake_up_interruptible_sync(&ff->poll_wait); 2122 wake_up_interruptible_sync(&ff->poll_wait);
2123 } 2123 }
2124 2124
2125 spin_unlock(&fc->lock); 2125 spin_unlock(&fc->lock);
2126 return 0; 2126 return 0;
2127 } 2127 }
2128 2128
2129 static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, 2129 static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov,
2130 unsigned long nr_segs, loff_t *ppos, int rw) 2130 unsigned long nr_segs, loff_t *ppos, int rw)
2131 { 2131 {
2132 const struct iovec *vector = iov; 2132 const struct iovec *vector = iov;
2133 ssize_t ret = 0; 2133 ssize_t ret = 0;
2134 2134
2135 while (nr_segs > 0) { 2135 while (nr_segs > 0) {
2136 void __user *base; 2136 void __user *base;
2137 size_t len; 2137 size_t len;
2138 ssize_t nr; 2138 ssize_t nr;
2139 2139
2140 base = vector->iov_base; 2140 base = vector->iov_base;
2141 len = vector->iov_len; 2141 len = vector->iov_len;
2142 vector++; 2142 vector++;
2143 nr_segs--; 2143 nr_segs--;
2144 2144
2145 if (rw == WRITE) 2145 if (rw == WRITE)
2146 nr = __fuse_direct_write(filp, base, len, ppos); 2146 nr = __fuse_direct_write(filp, base, len, ppos);
2147 else 2147 else
2148 nr = fuse_direct_read(filp, base, len, ppos); 2148 nr = fuse_direct_read(filp, base, len, ppos);
2149 2149
2150 if (nr < 0) { 2150 if (nr < 0) {
2151 if (!ret) 2151 if (!ret)
2152 ret = nr; 2152 ret = nr;
2153 break; 2153 break;
2154 } 2154 }
2155 ret += nr; 2155 ret += nr;
2156 if (nr != len) 2156 if (nr != len)
2157 break; 2157 break;
2158 } 2158 }
2159 2159
2160 return ret; 2160 return ret;
2161 } 2161 }
2162 2162
2163 2163
2164 static ssize_t 2164 static ssize_t
2165 fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 2165 fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2166 loff_t offset, unsigned long nr_segs) 2166 loff_t offset, unsigned long nr_segs)
2167 { 2167 {
2168 ssize_t ret = 0; 2168 ssize_t ret = 0;
2169 struct file *file = NULL; 2169 struct file *file = NULL;
2170 loff_t pos = 0; 2170 loff_t pos = 0;
2171 2171
2172 file = iocb->ki_filp; 2172 file = iocb->ki_filp;
2173 pos = offset; 2173 pos = offset;
2174 2174
2175 ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); 2175 ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw);
2176 2176
2177 return ret; 2177 return ret;
2178 } 2178 }
2179 2179
2180 long fuse_file_fallocate(struct file *file, int mode, loff_t offset, 2180 long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2181 loff_t length) 2181 loff_t length)
2182 { 2182 {
2183 struct fuse_file *ff = file->private_data; 2183 struct fuse_file *ff = file->private_data;
2184 struct fuse_conn *fc = ff->fc; 2184 struct fuse_conn *fc = ff->fc;
2185 struct fuse_req *req; 2185 struct fuse_req *req;
2186 struct fuse_fallocate_in inarg = { 2186 struct fuse_fallocate_in inarg = {
2187 .fh = ff->fh, 2187 .fh = ff->fh,
2188 .offset = offset, 2188 .offset = offset,
2189 .length = length, 2189 .length = length,
2190 .mode = mode 2190 .mode = mode
2191 }; 2191 };
2192 int err; 2192 int err;
2193 2193
2194 if (fc->no_fallocate) 2194 if (fc->no_fallocate)
2195 return -EOPNOTSUPP; 2195 return -EOPNOTSUPP;
2196 2196
2197 req = fuse_get_req(fc); 2197 req = fuse_get_req(fc);
2198 if (IS_ERR(req)) 2198 if (IS_ERR(req))
2199 return PTR_ERR(req); 2199 return PTR_ERR(req);
2200 2200
2201 req->in.h.opcode = FUSE_FALLOCATE; 2201 req->in.h.opcode = FUSE_FALLOCATE;
2202 req->in.h.nodeid = ff->nodeid; 2202 req->in.h.nodeid = ff->nodeid;
2203 req->in.numargs = 1; 2203 req->in.numargs = 1;
2204 req->in.args[0].size = sizeof(inarg); 2204 req->in.args[0].size = sizeof(inarg);
2205 req->in.args[0].value = &inarg; 2205 req->in.args[0].value = &inarg;
2206 fuse_request_send(fc, req); 2206 fuse_request_send(fc, req);
2207 err = req->out.h.error; 2207 err = req->out.h.error;
2208 if (err == -ENOSYS) { 2208 if (err == -ENOSYS) {
2209 fc->no_fallocate = 1; 2209 fc->no_fallocate = 1;
2210 err = -EOPNOTSUPP; 2210 err = -EOPNOTSUPP;
2211 } 2211 }
2212 fuse_put_request(fc, req); 2212 fuse_put_request(fc, req);
2213 2213
2214 return err; 2214 return err;
2215 } 2215 }
2216 EXPORT_SYMBOL_GPL(fuse_file_fallocate); 2216 EXPORT_SYMBOL_GPL(fuse_file_fallocate);
2217 2217
2218 static const struct file_operations fuse_file_operations = { 2218 static const struct file_operations fuse_file_operations = {
2219 .llseek = fuse_file_llseek, 2219 .llseek = fuse_file_llseek,
2220 .read = do_sync_read, 2220 .read = do_sync_read,
2221 .aio_read = fuse_file_aio_read, 2221 .aio_read = fuse_file_aio_read,
2222 .write = do_sync_write, 2222 .write = do_sync_write,
2223 .aio_write = fuse_file_aio_write, 2223 .aio_write = fuse_file_aio_write,
2224 .mmap = fuse_file_mmap, 2224 .mmap = fuse_file_mmap,
2225 .open = fuse_open, 2225 .open = fuse_open,
2226 .flush = fuse_flush, 2226 .flush = fuse_flush,
2227 .release = fuse_release, 2227 .release = fuse_release,
2228 .fsync = fuse_fsync, 2228 .fsync = fuse_fsync,
2229 .lock = fuse_file_lock, 2229 .lock = fuse_file_lock,
2230 .flock = fuse_file_flock, 2230 .flock = fuse_file_flock,
2231 .splice_read = generic_file_splice_read, 2231 .splice_read = generic_file_splice_read,
2232 .unlocked_ioctl = fuse_file_ioctl, 2232 .unlocked_ioctl = fuse_file_ioctl,
2233 .compat_ioctl = fuse_file_compat_ioctl, 2233 .compat_ioctl = fuse_file_compat_ioctl,
2234 .poll = fuse_file_poll, 2234 .poll = fuse_file_poll,
2235 .fallocate = fuse_file_fallocate, 2235 .fallocate = fuse_file_fallocate,
2236 }; 2236 };
2237 2237
2238 static const struct file_operations fuse_direct_io_file_operations = { 2238 static const struct file_operations fuse_direct_io_file_operations = {
2239 .llseek = fuse_file_llseek, 2239 .llseek = fuse_file_llseek,
2240 .read = fuse_direct_read, 2240 .read = fuse_direct_read,
2241 .write = fuse_direct_write, 2241 .write = fuse_direct_write,
2242 .mmap = fuse_direct_mmap, 2242 .mmap = fuse_direct_mmap,
2243 .open = fuse_open, 2243 .open = fuse_open,
2244 .flush = fuse_flush, 2244 .flush = fuse_flush,
2245 .release = fuse_release, 2245 .release = fuse_release,
2246 .fsync = fuse_fsync, 2246 .fsync = fuse_fsync,
2247 .lock = fuse_file_lock, 2247 .lock = fuse_file_lock,
2248 .flock = fuse_file_flock, 2248 .flock = fuse_file_flock,
2249 .unlocked_ioctl = fuse_file_ioctl, 2249 .unlocked_ioctl = fuse_file_ioctl,
2250 .compat_ioctl = fuse_file_compat_ioctl, 2250 .compat_ioctl = fuse_file_compat_ioctl,
2251 .poll = fuse_file_poll, 2251 .poll = fuse_file_poll,
2252 .fallocate = fuse_file_fallocate, 2252 .fallocate = fuse_file_fallocate,
2253 /* no splice_read */ 2253 /* no splice_read */
2254 }; 2254 };
2255 2255
2256 static const struct address_space_operations fuse_file_aops = { 2256 static const struct address_space_operations fuse_file_aops = {
2257 .readpage = fuse_readpage, 2257 .readpage = fuse_readpage,
2258 .writepage = fuse_writepage, 2258 .writepage = fuse_writepage,
2259 .launder_page = fuse_launder_page, 2259 .launder_page = fuse_launder_page,
2260 .readpages = fuse_readpages, 2260 .readpages = fuse_readpages,
2261 .set_page_dirty = __set_page_dirty_nobuffers, 2261 .set_page_dirty = __set_page_dirty_nobuffers,
2262 .bmap = fuse_bmap, 2262 .bmap = fuse_bmap,
2263 .direct_IO = fuse_direct_IO, 2263 .direct_IO = fuse_direct_IO,
2264 }; 2264 };
2265 2265
2266 void fuse_init_file_inode(struct inode *inode) 2266 void fuse_init_file_inode(struct inode *inode)
2267 { 2267 {
2268 inode->i_fop = &fuse_file_operations; 2268 inode->i_fop = &fuse_file_operations;
2269 inode->i_data.a_ops = &fuse_file_aops; 2269 inode->i_data.a_ops = &fuse_file_aops;
2270 } 2270 }
2271 2271
1 /* 1 /*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/spinlock.h> 11 #include <linux/spinlock.h>
12 #include <linux/completion.h> 12 #include <linux/completion.h>
13 #include <linux/buffer_head.h> 13 #include <linux/buffer_head.h>
14 #include <linux/pagemap.h> 14 #include <linux/pagemap.h>
15 #include <linux/uio.h> 15 #include <linux/uio.h>
16 #include <linux/blkdev.h> 16 #include <linux/blkdev.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/mount.h> 18 #include <linux/mount.h>
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/gfs2_ondisk.h> 20 #include <linux/gfs2_ondisk.h>
21 #include <linux/falloc.h> 21 #include <linux/falloc.h>
22 #include <linux/swap.h> 22 #include <linux/swap.h>
23 #include <linux/crc32.h> 23 #include <linux/crc32.h>
24 #include <linux/writeback.h> 24 #include <linux/writeback.h>
25 #include <asm/uaccess.h> 25 #include <asm/uaccess.h>
26 #include <linux/dlm.h> 26 #include <linux/dlm.h>
27 #include <linux/dlm_plock.h> 27 #include <linux/dlm_plock.h>
28 28
29 #include "gfs2.h" 29 #include "gfs2.h"
30 #include "incore.h" 30 #include "incore.h"
31 #include "bmap.h" 31 #include "bmap.h"
32 #include "dir.h" 32 #include "dir.h"
33 #include "glock.h" 33 #include "glock.h"
34 #include "glops.h" 34 #include "glops.h"
35 #include "inode.h" 35 #include "inode.h"
36 #include "log.h" 36 #include "log.h"
37 #include "meta_io.h" 37 #include "meta_io.h"
38 #include "quota.h" 38 #include "quota.h"
39 #include "rgrp.h" 39 #include "rgrp.h"
40 #include "trans.h" 40 #include "trans.h"
41 #include "util.h" 41 #include "util.h"
42 42
43 /** 43 /**
44 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
45 * @file: the file 45 * @file: the file
46 * @offset: the offset 46 * @offset: the offset
47 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) 47 * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
48 * 48 *
49 * SEEK_END requires the glock for the file because it references the 49 * SEEK_END requires the glock for the file because it references the
50 * file's size. 50 * file's size.
51 * 51 *
52 * Returns: The new offset, or errno 52 * Returns: The new offset, or errno
53 */ 53 */
54 54
55 static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) 55 static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
56 { 56 {
57 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 57 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
58 struct gfs2_holder i_gh; 58 struct gfs2_holder i_gh;
59 loff_t error; 59 loff_t error;
60 60
61 switch (origin) { 61 switch (whence) {
62 case SEEK_END: /* These reference inode->i_size */ 62 case SEEK_END: /* These reference inode->i_size */
63 case SEEK_DATA: 63 case SEEK_DATA:
64 case SEEK_HOLE: 64 case SEEK_HOLE:
65 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 65 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
66 &i_gh); 66 &i_gh);
67 if (!error) { 67 if (!error) {
68 error = generic_file_llseek(file, offset, origin); 68 error = generic_file_llseek(file, offset, whence);
69 gfs2_glock_dq_uninit(&i_gh); 69 gfs2_glock_dq_uninit(&i_gh);
70 } 70 }
71 break; 71 break;
72 case SEEK_CUR: 72 case SEEK_CUR:
73 case SEEK_SET: 73 case SEEK_SET:
74 error = generic_file_llseek(file, offset, origin); 74 error = generic_file_llseek(file, offset, whence);
75 break; 75 break;
76 default: 76 default:
77 error = -EINVAL; 77 error = -EINVAL;
78 } 78 }
79 79
80 return error; 80 return error;
81 } 81 }
82 82
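SEEK_DATA and SEEK_HOLE share the branch with SEEK_END above because, as the comment says, all three reference inode->i_size, hence the shared glock. A minimal user-space sketch of probing a file's layout this way, with a hypothetical path (SEEK_HOLE/SEEK_DATA need _GNU_SOURCE on glibc):

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/gfs2/file", O_RDONLY);      /* hypothetical mount */
        if (fd < 0)
                return 1;

        off_t data = lseek(fd, 0, SEEK_DATA);   /* first data extent */
        off_t hole = lseek(fd, data < 0 ? 0 : data, SEEK_HOLE);

        printf("data at %lld, next hole at %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
}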
83 /** 83 /**
84 * gfs2_readdir - Read directory entries from a directory 84 * gfs2_readdir - Read directory entries from a directory
85 * @file: The directory to read from 85 * @file: The directory to read from
86 * @dirent: Buffer for dirents 86 * @dirent: Buffer for dirents
87 * @filldir: Function used to do the copying 87 * @filldir: Function used to do the copying
88 * 88 *
89 * Returns: errno 89 * Returns: errno
90 */ 90 */
91 91
92 static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) 92 static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
93 { 93 {
94 struct inode *dir = file->f_mapping->host; 94 struct inode *dir = file->f_mapping->host;
95 struct gfs2_inode *dip = GFS2_I(dir); 95 struct gfs2_inode *dip = GFS2_I(dir);
96 struct gfs2_holder d_gh; 96 struct gfs2_holder d_gh;
97 u64 offset = file->f_pos; 97 u64 offset = file->f_pos;
98 int error; 98 int error;
99 99
100 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 100 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
101 error = gfs2_glock_nq(&d_gh); 101 error = gfs2_glock_nq(&d_gh);
102 if (error) { 102 if (error) {
103 gfs2_holder_uninit(&d_gh); 103 gfs2_holder_uninit(&d_gh);
104 return error; 104 return error;
105 } 105 }
106 106
107 error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); 107 error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
108 108
109 gfs2_glock_dq_uninit(&d_gh); 109 gfs2_glock_dq_uninit(&d_gh);
110 110
111 file->f_pos = offset; 111 file->f_pos = offset;
112 112
113 return error; 113 return error;
114 } 114 }
115 115
116 /** 116 /**
117 * fsflags_cvt 117 * fsflags_cvt
118 * @table: A table of 32 u32 flags 118 * @table: A table of 32 u32 flags
119 * @val: a 32 bit value to convert 119 * @val: a 32 bit value to convert
120 * 120 *
121 * This function can be used to convert between fsflags values and 121 * This function can be used to convert between fsflags values and
122 * GFS2's own flags values. 122 * GFS2's own flags values.
123 * 123 *
124 * Returns: the converted flags 124 * Returns: the converted flags
125 */ 125 */
126 static u32 fsflags_cvt(const u32 *table, u32 val) 126 static u32 fsflags_cvt(const u32 *table, u32 val)
127 { 127 {
128 u32 res = 0; 128 u32 res = 0;
129 while(val) { 129 while(val) {
130 if (val & 1) 130 if (val & 1)
131 res |= *table; 131 res |= *table;
132 table++; 132 table++;
133 val >>= 1; 133 val >>= 1;
134 } 134 }
135 return res; 135 return res;
136 } 136 }
137 137
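As a worked example of the table walk: FS_SYNC_FL is bit 3 and FS_NOATIME_FL is bit 7, so feeding (FS_SYNC_FL | FS_NOATIME_FL) through fsflags_to_gfs2 ORs in table[3] and table[7], i.e. GFS2_DIF_SYNC | GFS2_DIF_NOATIME. A user-space re-statement with stand-in constants:

#include <stdio.h>

typedef unsigned int u32;

static u32 fsflags_cvt(const u32 *table, u32 val)
{
        u32 res = 0;

        while (val) {
                if (val & 1)
                        res |= *table;
                table++;
                val >>= 1;
        }
        return res;
}

int main(void)
{
        /* stand-ins for GFS2_DIF_SYNC and GFS2_DIF_NOATIME */
        u32 table[32] = { [3] = 0x100, [7] = 0x200 };

        /* bits 3 and 7 set, as in FS_SYNC_FL | FS_NOATIME_FL */
        printf("0x%x\n", fsflags_cvt(table, 0x8 | 0x80));       /* 0x300 */
        return 0;
}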
138 static const u32 fsflags_to_gfs2[32] = { 138 static const u32 fsflags_to_gfs2[32] = {
139 [3] = GFS2_DIF_SYNC, 139 [3] = GFS2_DIF_SYNC,
140 [4] = GFS2_DIF_IMMUTABLE, 140 [4] = GFS2_DIF_IMMUTABLE,
141 [5] = GFS2_DIF_APPENDONLY, 141 [5] = GFS2_DIF_APPENDONLY,
142 [7] = GFS2_DIF_NOATIME, 142 [7] = GFS2_DIF_NOATIME,
143 [12] = GFS2_DIF_EXHASH, 143 [12] = GFS2_DIF_EXHASH,
144 [14] = GFS2_DIF_INHERIT_JDATA, 144 [14] = GFS2_DIF_INHERIT_JDATA,
145 [17] = GFS2_DIF_TOPDIR, 145 [17] = GFS2_DIF_TOPDIR,
146 }; 146 };
147 147
148 static const u32 gfs2_to_fsflags[32] = { 148 static const u32 gfs2_to_fsflags[32] = {
149 [gfs2fl_Sync] = FS_SYNC_FL, 149 [gfs2fl_Sync] = FS_SYNC_FL,
150 [gfs2fl_Immutable] = FS_IMMUTABLE_FL, 150 [gfs2fl_Immutable] = FS_IMMUTABLE_FL,
151 [gfs2fl_AppendOnly] = FS_APPEND_FL, 151 [gfs2fl_AppendOnly] = FS_APPEND_FL,
152 [gfs2fl_NoAtime] = FS_NOATIME_FL, 152 [gfs2fl_NoAtime] = FS_NOATIME_FL,
153 [gfs2fl_ExHash] = FS_INDEX_FL, 153 [gfs2fl_ExHash] = FS_INDEX_FL,
154 [gfs2fl_TopLevel] = FS_TOPDIR_FL, 154 [gfs2fl_TopLevel] = FS_TOPDIR_FL,
155 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, 155 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
156 }; 156 };
157 157
158 static int gfs2_get_flags(struct file *filp, u32 __user *ptr) 158 static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
159 { 159 {
160 struct inode *inode = filp->f_path.dentry->d_inode; 160 struct inode *inode = filp->f_path.dentry->d_inode;
161 struct gfs2_inode *ip = GFS2_I(inode); 161 struct gfs2_inode *ip = GFS2_I(inode);
162 struct gfs2_holder gh; 162 struct gfs2_holder gh;
163 int error; 163 int error;
164 u32 fsflags; 164 u32 fsflags;
165 165
166 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); 166 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
167 error = gfs2_glock_nq(&gh); 167 error = gfs2_glock_nq(&gh);
168 if (error) 168 if (error)
169 return error; 169 return error;
170 170
171 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); 171 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
172 if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) 172 if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
173 fsflags |= FS_JOURNAL_DATA_FL; 173 fsflags |= FS_JOURNAL_DATA_FL;
174 if (put_user(fsflags, ptr)) 174 if (put_user(fsflags, ptr))
175 error = -EFAULT; 175 error = -EFAULT;
176 176
177 gfs2_glock_dq(&gh); 177 gfs2_glock_dq(&gh);
178 gfs2_holder_uninit(&gh); 178 gfs2_holder_uninit(&gh);
179 return error; 179 return error;
180 } 180 }
181 181
182 void gfs2_set_inode_flags(struct inode *inode) 182 void gfs2_set_inode_flags(struct inode *inode)
183 { 183 {
184 struct gfs2_inode *ip = GFS2_I(inode); 184 struct gfs2_inode *ip = GFS2_I(inode);
185 unsigned int flags = inode->i_flags; 185 unsigned int flags = inode->i_flags;
186 186
187 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); 187 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC);
188 if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) 188 if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode))
189 inode->i_flags |= S_NOSEC; 189 inode->i_flags |= S_NOSEC;
190 if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) 190 if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
191 flags |= S_IMMUTABLE; 191 flags |= S_IMMUTABLE;
192 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) 192 if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
193 flags |= S_APPEND; 193 flags |= S_APPEND;
194 if (ip->i_diskflags & GFS2_DIF_NOATIME) 194 if (ip->i_diskflags & GFS2_DIF_NOATIME)
195 flags |= S_NOATIME; 195 flags |= S_NOATIME;
196 if (ip->i_diskflags & GFS2_DIF_SYNC) 196 if (ip->i_diskflags & GFS2_DIF_SYNC)
197 flags |= S_SYNC; 197 flags |= S_SYNC;
198 inode->i_flags = flags; 198 inode->i_flags = flags;
199 } 199 }
200 200
201 /* Flags that can be set by user space */ 201 /* Flags that can be set by user space */
202 #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ 202 #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
203 GFS2_DIF_IMMUTABLE| \ 203 GFS2_DIF_IMMUTABLE| \
204 GFS2_DIF_APPENDONLY| \ 204 GFS2_DIF_APPENDONLY| \
205 GFS2_DIF_NOATIME| \ 205 GFS2_DIF_NOATIME| \
206 GFS2_DIF_SYNC| \ 206 GFS2_DIF_SYNC| \
207 GFS2_DIF_SYSTEM| \ 207 GFS2_DIF_SYSTEM| \
208 GFS2_DIF_TOPDIR| \ 208 GFS2_DIF_TOPDIR| \
209 GFS2_DIF_INHERIT_JDATA) 209 GFS2_DIF_INHERIT_JDATA)
210 210
211 /** 211 /**
 212 * do_gfs2_set_flags - set flags on an inode 212 * do_gfs2_set_flags - set flags on an inode
 213 * @filp: The file whose inode's flags to set 213 * @filp: The file whose inode's flags to set
 214 * @reqflags: The flags to set 214 * @reqflags: The flags to set
 215 * @mask: Indicates which flags are valid 215 * @mask: Indicates which flags are valid
216 * 216 *
217 */ 217 */
218 static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) 218 static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
219 { 219 {
220 struct inode *inode = filp->f_path.dentry->d_inode; 220 struct inode *inode = filp->f_path.dentry->d_inode;
221 struct gfs2_inode *ip = GFS2_I(inode); 221 struct gfs2_inode *ip = GFS2_I(inode);
222 struct gfs2_sbd *sdp = GFS2_SB(inode); 222 struct gfs2_sbd *sdp = GFS2_SB(inode);
223 struct buffer_head *bh; 223 struct buffer_head *bh;
224 struct gfs2_holder gh; 224 struct gfs2_holder gh;
225 int error; 225 int error;
226 u32 new_flags, flags; 226 u32 new_flags, flags;
227 227
228 error = mnt_want_write_file(filp); 228 error = mnt_want_write_file(filp);
229 if (error) 229 if (error)
230 return error; 230 return error;
231 231
232 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 232 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
233 if (error) 233 if (error)
234 goto out_drop_write; 234 goto out_drop_write;
235 235
236 error = -EACCES; 236 error = -EACCES;
237 if (!inode_owner_or_capable(inode)) 237 if (!inode_owner_or_capable(inode))
238 goto out; 238 goto out;
239 239
240 error = 0; 240 error = 0;
241 flags = ip->i_diskflags; 241 flags = ip->i_diskflags;
242 new_flags = (flags & ~mask) | (reqflags & mask); 242 new_flags = (flags & ~mask) | (reqflags & mask);
243 if ((new_flags ^ flags) == 0) 243 if ((new_flags ^ flags) == 0)
244 goto out; 244 goto out;
245 245
246 error = -EINVAL; 246 error = -EINVAL;
247 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) 247 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
248 goto out; 248 goto out;
249 249
250 error = -EPERM; 250 error = -EPERM;
251 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) 251 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
252 goto out; 252 goto out;
253 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) 253 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
254 goto out; 254 goto out;
255 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && 255 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
256 !capable(CAP_LINUX_IMMUTABLE)) 256 !capable(CAP_LINUX_IMMUTABLE))
257 goto out; 257 goto out;
258 if (!IS_IMMUTABLE(inode)) { 258 if (!IS_IMMUTABLE(inode)) {
259 error = gfs2_permission(inode, MAY_WRITE); 259 error = gfs2_permission(inode, MAY_WRITE);
260 if (error) 260 if (error)
261 goto out; 261 goto out;
262 } 262 }
263 if ((flags ^ new_flags) & GFS2_DIF_JDATA) { 263 if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
264 if (flags & GFS2_DIF_JDATA) 264 if (flags & GFS2_DIF_JDATA)
265 gfs2_log_flush(sdp, ip->i_gl); 265 gfs2_log_flush(sdp, ip->i_gl);
266 error = filemap_fdatawrite(inode->i_mapping); 266 error = filemap_fdatawrite(inode->i_mapping);
267 if (error) 267 if (error)
268 goto out; 268 goto out;
269 error = filemap_fdatawait(inode->i_mapping); 269 error = filemap_fdatawait(inode->i_mapping);
270 if (error) 270 if (error)
271 goto out; 271 goto out;
272 } 272 }
273 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 273 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
274 if (error) 274 if (error)
275 goto out; 275 goto out;
276 error = gfs2_meta_inode_buffer(ip, &bh); 276 error = gfs2_meta_inode_buffer(ip, &bh);
277 if (error) 277 if (error)
278 goto out_trans_end; 278 goto out_trans_end;
279 gfs2_trans_add_bh(ip->i_gl, bh, 1); 279 gfs2_trans_add_bh(ip->i_gl, bh, 1);
280 ip->i_diskflags = new_flags; 280 ip->i_diskflags = new_flags;
281 gfs2_dinode_out(ip, bh->b_data); 281 gfs2_dinode_out(ip, bh->b_data);
282 brelse(bh); 282 brelse(bh);
283 gfs2_set_inode_flags(inode); 283 gfs2_set_inode_flags(inode);
284 gfs2_set_aops(inode); 284 gfs2_set_aops(inode);
285 out_trans_end: 285 out_trans_end:
286 gfs2_trans_end(sdp); 286 gfs2_trans_end(sdp);
287 out: 287 out:
288 gfs2_glock_dq_uninit(&gh); 288 gfs2_glock_dq_uninit(&gh);
289 out_drop_write: 289 out_drop_write:
290 mnt_drop_write_file(filp); 290 mnt_drop_write_file(filp);
291 return error; 291 return error;
292 } 292 }
293 293
294 static int gfs2_set_flags(struct file *filp, u32 __user *ptr) 294 static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
295 { 295 {
296 struct inode *inode = filp->f_path.dentry->d_inode; 296 struct inode *inode = filp->f_path.dentry->d_inode;
297 u32 fsflags, gfsflags; 297 u32 fsflags, gfsflags;
298 298
299 if (get_user(fsflags, ptr)) 299 if (get_user(fsflags, ptr))
300 return -EFAULT; 300 return -EFAULT;
301 301
302 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 302 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
303 if (!S_ISDIR(inode->i_mode)) { 303 if (!S_ISDIR(inode->i_mode)) {
304 gfsflags &= ~GFS2_DIF_TOPDIR; 304 gfsflags &= ~GFS2_DIF_TOPDIR;
305 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 305 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
306 gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); 306 gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
307 return do_gfs2_set_flags(filp, gfsflags, ~0); 307 return do_gfs2_set_flags(filp, gfsflags, ~0);
308 } 308 }
309 return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); 309 return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
310 } 310 }
311 311
312 static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 312 static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
313 { 313 {
314 switch(cmd) { 314 switch(cmd) {
315 case FS_IOC_GETFLAGS: 315 case FS_IOC_GETFLAGS:
316 return gfs2_get_flags(filp, (u32 __user *)arg); 316 return gfs2_get_flags(filp, (u32 __user *)arg);
317 case FS_IOC_SETFLAGS: 317 case FS_IOC_SETFLAGS:
318 return gfs2_set_flags(filp, (u32 __user *)arg); 318 return gfs2_set_flags(filp, (u32 __user *)arg);
319 case FITRIM: 319 case FITRIM:
320 return gfs2_fitrim(filp, (void __user *)arg); 320 return gfs2_fitrim(filp, (void __user *)arg);
321 } 321 }
322 return -ENOTTY; 322 return -ENOTTY;
323 } 323 }
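
For context: FS_IOC_GETFLAGS and FS_IOC_SETFLAGS are the generic inode-flag ioctls (the interface behind lsattr/chattr), which gfs2_ioctl translates to GFS2's on-disk flags. A minimal userspace sketch, with a hypothetical caller-supplied path (not part of this commit):

/* Sketch: query and set inode flags through the same ioctls
 * gfs2_ioctl() dispatches above; the kernel transfers 32 bits. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int make_append_only(const char *path)
{
	unsigned int flags;
	int fd = open(path, O_RDONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
		flags |= FS_APPEND_FL;
		ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}
	close(fd);
	return ret;
}
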
324 324
325 /** 325 /**
326 * gfs2_size_hint - Give a hint about the size of a write request 326 * gfs2_size_hint - Give a hint about the size of a write request
327 * @filep: The struct file 327 * @filep: The struct file
328 * @offset: The file offset of the write 328 * @offset: The file offset of the write
329 * @size: The length of the write 329 * @size: The length of the write
330 * 330 *
331 * When we are about to do a write, this function records the total 331 * When we are about to do a write, this function records the total
332 * write size in order to provide a suitable hint to the lower layers 332 * write size in order to provide a suitable hint to the lower layers
333 * about how many blocks will be required. 333 * about how many blocks will be required.
334 * 334 *
335 */ 335 */
336 336
337 static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) 337 static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
338 { 338 {
339 struct inode *inode = filep->f_dentry->d_inode; 339 struct inode *inode = filep->f_dentry->d_inode;
340 struct gfs2_sbd *sdp = GFS2_SB(inode); 340 struct gfs2_sbd *sdp = GFS2_SB(inode);
341 struct gfs2_inode *ip = GFS2_I(inode); 341 struct gfs2_inode *ip = GFS2_I(inode);
342 size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; 342 size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
343 int hint = min_t(size_t, INT_MAX, blks); 343 int hint = min_t(size_t, INT_MAX, blks);
344 344
345 atomic_set(&ip->i_res->rs_sizehint, hint); 345 atomic_set(&ip->i_res->rs_sizehint, hint);
346 } 346 }
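
The hint stored here is just the write size rounded up to whole filesystem blocks and clamped to INT_MAX. A worked sketch of that rounding, assuming 4096-byte blocks:

#include <stddef.h>

/* Sketch of the blks calculation in gfs2_size_hint(): round a byte
 * count up to whole blocks via the add-then-shift idiom, e.g.
 * 10000 bytes with 4096-byte blocks (bsize_shift == 12) -> 3 blocks. */
static size_t bytes_to_blocks(size_t size, unsigned int bsize_shift)
{
	size_t bsize = (size_t)1 << bsize_shift;

	return (size + bsize - 1) >> bsize_shift;
}
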
347 347
348 /** 348 /**
349 * gfs2_allocate_page_backing - Use bmap to allocate blocks 349 * gfs2_allocate_page_backing - Use bmap to allocate blocks
350 * @page: The (locked) page to allocate backing for 350 * @page: The (locked) page to allocate backing for
351 * 351 *
352 * We try to allocate all the blocks required for the page in 352 * We try to allocate all the blocks required for the page in
353 * one go. This might fail for various reasons, so we keep 353 * one go. This might fail for various reasons, so we keep
354 * trying until all the blocks to back this page are allocated. 354 * trying until all the blocks to back this page are allocated.
355 * If some of the blocks are already allocated, that's ok too. 355 * If some of the blocks are already allocated, that's ok too.
356 */ 356 */
357 357
358 static int gfs2_allocate_page_backing(struct page *page) 358 static int gfs2_allocate_page_backing(struct page *page)
359 { 359 {
360 struct inode *inode = page->mapping->host; 360 struct inode *inode = page->mapping->host;
361 struct buffer_head bh; 361 struct buffer_head bh;
362 unsigned long size = PAGE_CACHE_SIZE; 362 unsigned long size = PAGE_CACHE_SIZE;
363 u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 363 u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
364 364
365 do { 365 do {
366 bh.b_state = 0; 366 bh.b_state = 0;
367 bh.b_size = size; 367 bh.b_size = size;
368 gfs2_block_map(inode, lblock, &bh, 1); 368 gfs2_block_map(inode, lblock, &bh, 1);
369 if (!buffer_mapped(&bh)) 369 if (!buffer_mapped(&bh))
370 return -EIO; 370 return -EIO;
371 size -= bh.b_size; 371 size -= bh.b_size;
372 lblock += (bh.b_size >> inode->i_blkbits); 372 lblock += (bh.b_size >> inode->i_blkbits);
373 } while(size > 0); 373 } while(size > 0);
374 return 0; 374 return 0;
375 } 375 }
376 376
377 /** 377 /**
378 * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable 378 * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
379 * @vma: The virtual memory area 379 * @vma: The virtual memory area
380 * @page: The page which is about to become writable 380 * @page: The page which is about to become writable
381 * 381 *
382 * When the page becomes writable, we need to ensure that we have 382 * When the page becomes writable, we need to ensure that we have
383 * blocks allocated on disk to back that page. 383 * blocks allocated on disk to back that page.
384 */ 384 */
385 385
386 static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 386 static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
387 { 387 {
388 struct page *page = vmf->page; 388 struct page *page = vmf->page;
389 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 389 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
390 struct gfs2_inode *ip = GFS2_I(inode); 390 struct gfs2_inode *ip = GFS2_I(inode);
391 struct gfs2_sbd *sdp = GFS2_SB(inode); 391 struct gfs2_sbd *sdp = GFS2_SB(inode);
392 unsigned long last_index; 392 unsigned long last_index;
393 u64 pos = page->index << PAGE_CACHE_SHIFT; 393 u64 pos = page->index << PAGE_CACHE_SHIFT;
394 unsigned int data_blocks, ind_blocks, rblocks; 394 unsigned int data_blocks, ind_blocks, rblocks;
395 struct gfs2_holder gh; 395 struct gfs2_holder gh;
396 loff_t size; 396 loff_t size;
397 int ret; 397 int ret;
398 398
399 sb_start_pagefault(inode->i_sb); 399 sb_start_pagefault(inode->i_sb);
400 400
401 /* Update file times before taking page lock */ 401 /* Update file times before taking page lock */
402 file_update_time(vma->vm_file); 402 file_update_time(vma->vm_file);
403 403
404 ret = gfs2_rs_alloc(ip); 404 ret = gfs2_rs_alloc(ip);
405 if (ret) 405 if (ret)
406 return ret; 406 return ret;
407 407
408 gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); 408 gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
409 409
410 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 410 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
411 ret = gfs2_glock_nq(&gh); 411 ret = gfs2_glock_nq(&gh);
412 if (ret) 412 if (ret)
413 goto out; 413 goto out;
414 414
415 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); 415 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
416 set_bit(GIF_SW_PAGED, &ip->i_flags); 416 set_bit(GIF_SW_PAGED, &ip->i_flags);
417 417
418 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { 418 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
419 lock_page(page); 419 lock_page(page);
420 if (!PageUptodate(page) || page->mapping != inode->i_mapping) { 420 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
421 ret = -EAGAIN; 421 ret = -EAGAIN;
422 unlock_page(page); 422 unlock_page(page);
423 } 423 }
424 goto out_unlock; 424 goto out_unlock;
425 } 425 }
426 426
427 ret = gfs2_rindex_update(sdp); 427 ret = gfs2_rindex_update(sdp);
428 if (ret) 428 if (ret)
429 goto out_unlock; 429 goto out_unlock;
430 430
431 ret = gfs2_quota_lock_check(ip); 431 ret = gfs2_quota_lock_check(ip);
432 if (ret) 432 if (ret)
433 goto out_unlock; 433 goto out_unlock;
434 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); 434 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
435 ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); 435 ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
436 if (ret) 436 if (ret)
437 goto out_quota_unlock; 437 goto out_quota_unlock;
438 438
439 rblocks = RES_DINODE + ind_blocks; 439 rblocks = RES_DINODE + ind_blocks;
440 if (gfs2_is_jdata(ip)) 440 if (gfs2_is_jdata(ip))
441 rblocks += data_blocks ? data_blocks : 1; 441 rblocks += data_blocks ? data_blocks : 1;
442 if (ind_blocks || data_blocks) { 442 if (ind_blocks || data_blocks) {
443 rblocks += RES_STATFS + RES_QUOTA; 443 rblocks += RES_STATFS + RES_QUOTA;
444 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); 444 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
445 } 445 }
446 ret = gfs2_trans_begin(sdp, rblocks, 0); 446 ret = gfs2_trans_begin(sdp, rblocks, 0);
447 if (ret) 447 if (ret)
448 goto out_trans_fail; 448 goto out_trans_fail;
449 449
450 lock_page(page); 450 lock_page(page);
451 ret = -EINVAL; 451 ret = -EINVAL;
452 size = i_size_read(inode); 452 size = i_size_read(inode);
453 last_index = (size - 1) >> PAGE_CACHE_SHIFT; 453 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
454 /* Check page index against inode size */ 454 /* Check page index against inode size */
455 if (size == 0 || (page->index > last_index)) 455 if (size == 0 || (page->index > last_index))
456 goto out_trans_end; 456 goto out_trans_end;
457 457
458 ret = -EAGAIN; 458 ret = -EAGAIN;
459 /* If truncated, we must retry the operation; we may have raced 459 /* If truncated, we must retry the operation; we may have raced
460 * with the glock demotion code. 460 * with the glock demotion code.
461 */ 461 */
462 if (!PageUptodate(page) || page->mapping != inode->i_mapping) 462 if (!PageUptodate(page) || page->mapping != inode->i_mapping)
463 goto out_trans_end; 463 goto out_trans_end;
464 464
465 /* Unstuff, if required, and allocate backing blocks for page */ 465 /* Unstuff, if required, and allocate backing blocks for page */
466 ret = 0; 466 ret = 0;
467 if (gfs2_is_stuffed(ip)) 467 if (gfs2_is_stuffed(ip))
468 ret = gfs2_unstuff_dinode(ip, page); 468 ret = gfs2_unstuff_dinode(ip, page);
469 if (ret == 0) 469 if (ret == 0)
470 ret = gfs2_allocate_page_backing(page); 470 ret = gfs2_allocate_page_backing(page);
471 471
472 out_trans_end: 472 out_trans_end:
473 if (ret) 473 if (ret)
474 unlock_page(page); 474 unlock_page(page);
475 gfs2_trans_end(sdp); 475 gfs2_trans_end(sdp);
476 out_trans_fail: 476 out_trans_fail:
477 gfs2_inplace_release(ip); 477 gfs2_inplace_release(ip);
478 out_quota_unlock: 478 out_quota_unlock:
479 gfs2_quota_unlock(ip); 479 gfs2_quota_unlock(ip);
480 out_unlock: 480 out_unlock:
481 gfs2_glock_dq(&gh); 481 gfs2_glock_dq(&gh);
482 out: 482 out:
483 gfs2_holder_uninit(&gh); 483 gfs2_holder_uninit(&gh);
484 if (ret == 0) { 484 if (ret == 0) {
485 set_page_dirty(page); 485 set_page_dirty(page);
486 wait_on_page_writeback(page); 486 wait_on_page_writeback(page);
487 } 487 }
488 sb_end_pagefault(inode->i_sb); 488 sb_end_pagefault(inode->i_sb);
489 return block_page_mkwrite_return(ret); 489 return block_page_mkwrite_return(ret);
490 } 490 }
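
Everything above runs in the write-fault path; the trigger from userspace is simply the first store into a shared mapping. A hedged sketch (hypothetical path, file assumed to be at least one page long):

/* Sketch: the store below faults the page writable, which is what
 * invokes ->page_mkwrite(); msync() then writes the dirty page back. */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int dirty_first_page(const char *path)
{
	int fd = open(path, O_RDWR);
	char *p;

	if (fd < 0)
		return -1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);			/* the mapping keeps the file alive */
	if (p == MAP_FAILED)
		return -1;
	memcpy(p, "hello", 5);		/* write fault -> gfs2_page_mkwrite */
	return msync(p, 4096, MS_SYNC);
}
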
491 491
492 static const struct vm_operations_struct gfs2_vm_ops = { 492 static const struct vm_operations_struct gfs2_vm_ops = {
493 .fault = filemap_fault, 493 .fault = filemap_fault,
494 .page_mkwrite = gfs2_page_mkwrite, 494 .page_mkwrite = gfs2_page_mkwrite,
495 .remap_pages = generic_file_remap_pages, 495 .remap_pages = generic_file_remap_pages,
496 }; 496 };
497 497
498 /** 498 /**
499 * gfs2_mmap - set up a memory mapping for a file 499 * gfs2_mmap - set up a memory mapping for a file
500 * @file: The file to map 500 * @file: The file to map
501 * @vma: The VMA which describes the mapping 501 * @vma: The VMA which describes the mapping
502 * 502 *
503 * There is no need to get a lock here unless we should be updating 503 * There is no need to get a lock here unless we should be updating
504 * atime. We ignore any locking errors since the only consequence is 504 * atime. We ignore any locking errors since the only consequence is
505 * a missed atime update (which will just be deferred until later). 505 * a missed atime update (which will just be deferred until later).
506 * 506 *
507 * Returns: 0 507 * Returns: 0
508 */ 508 */
509 509
510 static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) 510 static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
511 { 511 {
512 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 512 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
513 513
514 if (!(file->f_flags & O_NOATIME) && 514 if (!(file->f_flags & O_NOATIME) &&
515 !IS_NOATIME(&ip->i_inode)) { 515 !IS_NOATIME(&ip->i_inode)) {
516 struct gfs2_holder i_gh; 516 struct gfs2_holder i_gh;
517 int error; 517 int error;
518 518
519 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 519 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
520 &i_gh); 520 &i_gh);
521 if (error) 521 if (error)
522 return error; 522 return error;
523 /* grab lock to update inode */ 523 /* grab lock to update inode */
524 gfs2_glock_dq_uninit(&i_gh); 524 gfs2_glock_dq_uninit(&i_gh);
525 file_accessed(file); 525 file_accessed(file);
526 } 526 }
527 vma->vm_ops = &gfs2_vm_ops; 527 vma->vm_ops = &gfs2_vm_ops;
528 528
529 return 0; 529 return 0;
530 } 530 }
531 531
532 /** 532 /**
533 * gfs2_open - open a file 533 * gfs2_open - open a file
534 * @inode: the inode to open 534 * @inode: the inode to open
535 * @file: the struct file for this opening 535 * @file: the struct file for this opening
536 * 536 *
537 * Returns: errno 537 * Returns: errno
538 */ 538 */
539 539
540 static int gfs2_open(struct inode *inode, struct file *file) 540 static int gfs2_open(struct inode *inode, struct file *file)
541 { 541 {
542 struct gfs2_inode *ip = GFS2_I(inode); 542 struct gfs2_inode *ip = GFS2_I(inode);
543 struct gfs2_holder i_gh; 543 struct gfs2_holder i_gh;
544 struct gfs2_file *fp; 544 struct gfs2_file *fp;
545 int error; 545 int error;
546 546
547 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); 547 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
548 if (!fp) 548 if (!fp)
549 return -ENOMEM; 549 return -ENOMEM;
550 550
551 mutex_init(&fp->f_fl_mutex); 551 mutex_init(&fp->f_fl_mutex);
552 552
553 gfs2_assert_warn(GFS2_SB(inode), !file->private_data); 553 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
554 file->private_data = fp; 554 file->private_data = fp;
555 555
556 if (S_ISREG(ip->i_inode.i_mode)) { 556 if (S_ISREG(ip->i_inode.i_mode)) {
557 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 557 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
558 &i_gh); 558 &i_gh);
559 if (error) 559 if (error)
560 goto fail; 560 goto fail;
561 561
562 if (!(file->f_flags & O_LARGEFILE) && 562 if (!(file->f_flags & O_LARGEFILE) &&
563 i_size_read(inode) > MAX_NON_LFS) { 563 i_size_read(inode) > MAX_NON_LFS) {
564 error = -EOVERFLOW; 564 error = -EOVERFLOW;
565 goto fail_gunlock; 565 goto fail_gunlock;
566 } 566 }
567 567
568 gfs2_glock_dq_uninit(&i_gh); 568 gfs2_glock_dq_uninit(&i_gh);
569 } 569 }
570 570
571 return 0; 571 return 0;
572 572
573 fail_gunlock: 573 fail_gunlock:
574 gfs2_glock_dq_uninit(&i_gh); 574 gfs2_glock_dq_uninit(&i_gh);
575 fail: 575 fail:
576 file->private_data = NULL; 576 file->private_data = NULL;
577 kfree(fp); 577 kfree(fp);
578 return error; 578 return error;
579 } 579 }
580 580
581 /** 581 /**
582 * gfs2_release - called to close a struct file 582 * gfs2_release - called to close a struct file
583 * @inode: the inode the struct file belongs to 583 * @inode: the inode the struct file belongs to
584 * @file: the struct file being closed 584 * @file: the struct file being closed
585 * 585 *
586 * Returns: errno 586 * Returns: errno
587 */ 587 */
588 588
589 static int gfs2_release(struct inode *inode, struct file *file) 589 static int gfs2_release(struct inode *inode, struct file *file)
590 { 590 {
591 struct gfs2_inode *ip = GFS2_I(inode); 591 struct gfs2_inode *ip = GFS2_I(inode);
592 592
593 kfree(file->private_data); 593 kfree(file->private_data);
594 file->private_data = NULL; 594 file->private_data = NULL;
595 595
596 if ((file->f_mode & FMODE_WRITE) && 596 if ((file->f_mode & FMODE_WRITE) &&
597 (atomic_read(&inode->i_writecount) == 1)) 597 (atomic_read(&inode->i_writecount) == 1))
598 gfs2_rs_delete(ip); 598 gfs2_rs_delete(ip);
599 599
600 return 0; 600 return 0;
601 } 601 }
602 602
603 /** 603 /**
604 * gfs2_fsync - sync the dirty data for a file (across the cluster) 604 * gfs2_fsync - sync the dirty data for a file (across the cluster)
605 * @file: the file that points to the dentry 605 * @file: the file that points to the dentry
606 * @start: the start position in the file to sync 606 * @start: the start position in the file to sync
607 * @end: the end position in the file to sync 607 * @end: the end position in the file to sync
608 * @datasync: set if we can ignore timestamp changes 608 * @datasync: set if we can ignore timestamp changes
609 * 609 *
610 * We split the data flushing here so that we don't wait for the data 610 * We split the data flushing here so that we don't wait for the data
611 * until after we've also sent the metadata to disk. Note that for 611 * until after we've also sent the metadata to disk. Note that for
612 * data=ordered, we will write & wait for the data at the log flush 612 * data=ordered, we will write & wait for the data at the log flush
613 * stage anyway, so this is unlikely to make much of a difference 613 * stage anyway, so this is unlikely to make much of a difference
614 * except in the data=writeback case. 614 * except in the data=writeback case.
615 * 615 *
616 * If the fdatawrite fails due to any reason except -EIO, we will 616 * If the fdatawrite fails due to any reason except -EIO, we will
617 * continue the remainder of the fsync, although we'll still report 617 * continue the remainder of the fsync, although we'll still report
618 * the error at the end. This is to match filemap_write_and_wait_range() 618 * the error at the end. This is to match filemap_write_and_wait_range()
619 * behaviour. 619 * behaviour.
620 * 620 *
621 * Returns: errno 621 * Returns: errno
622 */ 622 */
623 623
624 static int gfs2_fsync(struct file *file, loff_t start, loff_t end, 624 static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
625 int datasync) 625 int datasync)
626 { 626 {
627 struct address_space *mapping = file->f_mapping; 627 struct address_space *mapping = file->f_mapping;
628 struct inode *inode = mapping->host; 628 struct inode *inode = mapping->host;
629 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 629 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
630 struct gfs2_inode *ip = GFS2_I(inode); 630 struct gfs2_inode *ip = GFS2_I(inode);
631 int ret = 0, ret1 = 0; 631 int ret = 0, ret1 = 0;
632 632
633 if (mapping->nrpages) { 633 if (mapping->nrpages) {
634 ret1 = filemap_fdatawrite_range(mapping, start, end); 634 ret1 = filemap_fdatawrite_range(mapping, start, end);
635 if (ret1 == -EIO) 635 if (ret1 == -EIO)
636 return ret1; 636 return ret1;
637 } 637 }
638 638
639 if (datasync) 639 if (datasync)
640 sync_state &= ~I_DIRTY_SYNC; 640 sync_state &= ~I_DIRTY_SYNC;
641 641
642 if (sync_state) { 642 if (sync_state) {
643 ret = sync_inode_metadata(inode, 1); 643 ret = sync_inode_metadata(inode, 1);
644 if (ret) 644 if (ret)
645 return ret; 645 return ret;
646 if (gfs2_is_jdata(ip)) 646 if (gfs2_is_jdata(ip))
647 filemap_write_and_wait(mapping); 647 filemap_write_and_wait(mapping);
648 gfs2_ail_flush(ip->i_gl, 1); 648 gfs2_ail_flush(ip->i_gl, 1);
649 } 649 }
650 650
651 if (mapping->nrpages) 651 if (mapping->nrpages)
652 ret = filemap_fdatawait_range(mapping, start, end); 652 ret = filemap_fdatawait_range(mapping, start, end);
653 653
654 return ret ? ret : ret1; 654 return ret ? ret : ret1;
655 } 655 }
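
The datasync flag corresponds to userspace fdatasync(): when only timestamps are dirty (I_DIRTY_SYNC), the metadata sync step can be skipped. A small sketch of the two entry points:

/* Sketch: fsync() reaches gfs2_fsync() with datasync == 0,
 * fdatasync() with datasync == 1. */
#include <unistd.h>

int flush_file(int fd, int data_only)
{
	return data_only ? fdatasync(fd) : fsync(fd);
}
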
656 656
657 /** 657 /**
658 * gfs2_file_aio_write - Perform a write to a file 658 * gfs2_file_aio_write - Perform a write to a file
659 * @iocb: The io context 659 * @iocb: The io context
660 * @iov: The data to write 660 * @iov: The data to write
661 * @nr_segs: Number of @iov segments 661 * @nr_segs: Number of @iov segments
662 * @pos: The file position 662 * @pos: The file position
663 * 663 *
664 * We have to do a lock/unlock here to refresh the inode size for 664 * We have to do a lock/unlock here to refresh the inode size for
665 * O_APPEND writes, otherwise we can land up writing at the wrong 665 * O_APPEND writes, otherwise we can land up writing at the wrong
666 * offset. There is still a race, but provided the app is using its 666 * offset. There is still a race, but provided the app is using its
667 * own file locking, this will make O_APPEND work as expected. 667 * own file locking, this will make O_APPEND work as expected.
668 * 668 *
669 */ 669 */
670 670
671 static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 671 static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
672 unsigned long nr_segs, loff_t pos) 672 unsigned long nr_segs, loff_t pos)
673 { 673 {
674 struct file *file = iocb->ki_filp; 674 struct file *file = iocb->ki_filp;
675 size_t writesize = iov_length(iov, nr_segs); 675 size_t writesize = iov_length(iov, nr_segs);
676 struct dentry *dentry = file->f_dentry; 676 struct dentry *dentry = file->f_dentry;
677 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 677 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
678 int ret; 678 int ret;
679 679
680 ret = gfs2_rs_alloc(ip); 680 ret = gfs2_rs_alloc(ip);
681 if (ret) 681 if (ret)
682 return ret; 682 return ret;
683 683
684 gfs2_size_hint(file, pos, writesize); 684 gfs2_size_hint(file, pos, writesize);
685 685
686 if (file->f_flags & O_APPEND) { 686 if (file->f_flags & O_APPEND) {
687 struct gfs2_holder gh; 687 struct gfs2_holder gh;
688 688
689 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); 689 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
690 if (ret) 690 if (ret)
691 return ret; 691 return ret;
692 gfs2_glock_dq_uninit(&gh); 692 gfs2_glock_dq_uninit(&gh);
693 } 693 }
694 694
695 return generic_file_aio_write(iocb, iov, nr_segs, pos); 695 return generic_file_aio_write(iocb, iov, nr_segs, pos);
696 } 696 }
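
The shared glock cycle above exists only to refresh i_size for O_APPEND; the userspace contract being preserved is the usual one, sketched below:

/* Sketch: with O_APPEND every write starts at the current end of
 * file, even if another cluster node grew the file in between. */
#include <fcntl.h>
#include <unistd.h>

ssize_t append_record(const char *path, const void *buf, size_t len)
{
	int fd = open(path, O_WRONLY | O_APPEND);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, buf, len);	/* offset taken from i_size */
	close(fd);
	return n;
}
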
697 697
698 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, 698 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
699 int mode) 699 int mode)
700 { 700 {
701 struct gfs2_inode *ip = GFS2_I(inode); 701 struct gfs2_inode *ip = GFS2_I(inode);
702 struct buffer_head *dibh; 702 struct buffer_head *dibh;
703 int error; 703 int error;
704 loff_t size = len; 704 loff_t size = len;
705 unsigned int nr_blks; 705 unsigned int nr_blks;
706 sector_t lblock = offset >> inode->i_blkbits; 706 sector_t lblock = offset >> inode->i_blkbits;
707 707
708 error = gfs2_meta_inode_buffer(ip, &dibh); 708 error = gfs2_meta_inode_buffer(ip, &dibh);
709 if (unlikely(error)) 709 if (unlikely(error))
710 return error; 710 return error;
711 711
712 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 712 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
713 713
714 if (gfs2_is_stuffed(ip)) { 714 if (gfs2_is_stuffed(ip)) {
715 error = gfs2_unstuff_dinode(ip, NULL); 715 error = gfs2_unstuff_dinode(ip, NULL);
716 if (unlikely(error)) 716 if (unlikely(error))
717 goto out; 717 goto out;
718 } 718 }
719 719
720 while (len) { 720 while (len) {
721 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; 721 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
722 bh_map.b_size = len; 722 bh_map.b_size = len;
723 set_buffer_zeronew(&bh_map); 723 set_buffer_zeronew(&bh_map);
724 724
725 error = gfs2_block_map(inode, lblock, &bh_map, 1); 725 error = gfs2_block_map(inode, lblock, &bh_map, 1);
726 if (unlikely(error)) 726 if (unlikely(error))
727 goto out; 727 goto out;
728 len -= bh_map.b_size; 728 len -= bh_map.b_size;
729 nr_blks = bh_map.b_size >> inode->i_blkbits; 729 nr_blks = bh_map.b_size >> inode->i_blkbits;
730 lblock += nr_blks; 730 lblock += nr_blks;
731 if (!buffer_new(&bh_map)) 731 if (!buffer_new(&bh_map))
732 continue; 732 continue;
733 if (unlikely(!buffer_zeronew(&bh_map))) { 733 if (unlikely(!buffer_zeronew(&bh_map))) {
734 error = -EIO; 734 error = -EIO;
735 goto out; 735 goto out;
736 } 736 }
737 } 737 }
738 if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) 738 if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
739 i_size_write(inode, offset + size); 739 i_size_write(inode, offset + size);
740 740
741 mark_inode_dirty(inode); 741 mark_inode_dirty(inode);
742 742
743 out: 743 out:
744 brelse(dibh); 744 brelse(dibh);
745 return error; 745 return error;
746 } 746 }
747 747
748 static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, 748 static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
749 unsigned int *data_blocks, unsigned int *ind_blocks) 749 unsigned int *data_blocks, unsigned int *ind_blocks)
750 { 750 {
751 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 751 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
752 unsigned int max_blocks = ip->i_rgd->rd_free_clone; 752 unsigned int max_blocks = ip->i_rgd->rd_free_clone;
753 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); 753 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
754 754
755 for (tmp = max_data; tmp > sdp->sd_diptrs;) { 755 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
756 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); 756 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
757 max_data -= tmp; 757 max_data -= tmp;
758 } 758 }
759 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv, 759 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
760 so it might end up with fewer data blocks */ 760 so it might end up with fewer data blocks */
761 if (max_data <= *data_blocks) 761 if (max_data <= *data_blocks)
762 return; 762 return;
763 *data_blocks = max_data; 763 *data_blocks = max_data;
764 *ind_blocks = max_blocks - max_data; 764 *ind_blocks = max_blocks - max_data;
765 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; 765 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
766 if (*len > max) { 766 if (*len > max) {
767 *len = max; 767 *len = max;
768 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); 768 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
769 } 769 }
770 } 770 }
771 771
772 static long gfs2_fallocate(struct file *file, int mode, loff_t offset, 772 static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
773 loff_t len) 773 loff_t len)
774 { 774 {
775 struct inode *inode = file->f_path.dentry->d_inode; 775 struct inode *inode = file->f_path.dentry->d_inode;
776 struct gfs2_sbd *sdp = GFS2_SB(inode); 776 struct gfs2_sbd *sdp = GFS2_SB(inode);
777 struct gfs2_inode *ip = GFS2_I(inode); 777 struct gfs2_inode *ip = GFS2_I(inode);
778 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 778 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
779 loff_t bytes, max_bytes; 779 loff_t bytes, max_bytes;
780 int error; 780 int error;
781 const loff_t pos = offset; 781 const loff_t pos = offset;
782 const loff_t count = len; 782 const loff_t count = len;
783 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); 783 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
784 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; 784 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
785 loff_t max_chunk_size = UINT_MAX & bsize_mask; 785 loff_t max_chunk_size = UINT_MAX & bsize_mask;
786 next = (next + 1) << sdp->sd_sb.sb_bsize_shift; 786 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
787 787
788 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 788 /* We only support the FALLOC_FL_KEEP_SIZE mode */
789 if (mode & ~FALLOC_FL_KEEP_SIZE) 789 if (mode & ~FALLOC_FL_KEEP_SIZE)
790 return -EOPNOTSUPP; 790 return -EOPNOTSUPP;
791 791
792 offset &= bsize_mask; 792 offset &= bsize_mask;
793 793
794 len = next - offset; 794 len = next - offset;
795 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; 795 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
796 if (!bytes) 796 if (!bytes)
797 bytes = UINT_MAX; 797 bytes = UINT_MAX;
798 bytes &= bsize_mask; 798 bytes &= bsize_mask;
799 if (bytes == 0) 799 if (bytes == 0)
800 bytes = sdp->sd_sb.sb_bsize; 800 bytes = sdp->sd_sb.sb_bsize;
801 801
802 error = gfs2_rs_alloc(ip); 802 error = gfs2_rs_alloc(ip);
803 if (error) 803 if (error)
804 return error; 804 return error;
805 805
806 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 806 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
807 error = gfs2_glock_nq(&ip->i_gh); 807 error = gfs2_glock_nq(&ip->i_gh);
808 if (unlikely(error)) 808 if (unlikely(error))
809 goto out_uninit; 809 goto out_uninit;
810 810
811 gfs2_size_hint(file, offset, len); 811 gfs2_size_hint(file, offset, len);
812 812
813 while (len > 0) { 813 while (len > 0) {
814 if (len < bytes) 814 if (len < bytes)
815 bytes = len; 815 bytes = len;
816 if (!gfs2_write_alloc_required(ip, offset, bytes)) { 816 if (!gfs2_write_alloc_required(ip, offset, bytes)) {
817 len -= bytes; 817 len -= bytes;
818 offset += bytes; 818 offset += bytes;
819 continue; 819 continue;
820 } 820 }
821 error = gfs2_quota_lock_check(ip); 821 error = gfs2_quota_lock_check(ip);
822 if (error) 822 if (error)
823 goto out_unlock; 823 goto out_unlock;
824 824
825 retry: 825 retry:
826 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); 826 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
827 827
828 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); 828 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
829 if (error) { 829 if (error) {
830 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { 830 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
831 bytes >>= 1; 831 bytes >>= 1;
832 bytes &= bsize_mask; 832 bytes &= bsize_mask;
833 if (bytes == 0) 833 if (bytes == 0)
834 bytes = sdp->sd_sb.sb_bsize; 834 bytes = sdp->sd_sb.sb_bsize;
835 goto retry; 835 goto retry;
836 } 836 }
837 goto out_qunlock; 837 goto out_qunlock;
838 } 838 }
839 max_bytes = bytes; 839 max_bytes = bytes;
840 calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, 840 calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
841 &max_bytes, &data_blocks, &ind_blocks); 841 &max_bytes, &data_blocks, &ind_blocks);
842 842
843 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + 843 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
844 RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); 844 RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks);
845 if (gfs2_is_jdata(ip)) 845 if (gfs2_is_jdata(ip))
846 rblocks += data_blocks ? data_blocks : 1; 846 rblocks += data_blocks ? data_blocks : 1;
847 847
848 error = gfs2_trans_begin(sdp, rblocks, 848 error = gfs2_trans_begin(sdp, rblocks,
849 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 849 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
850 if (error) 850 if (error)
851 goto out_trans_fail; 851 goto out_trans_fail;
852 852
853 error = fallocate_chunk(inode, offset, max_bytes, mode); 853 error = fallocate_chunk(inode, offset, max_bytes, mode);
854 gfs2_trans_end(sdp); 854 gfs2_trans_end(sdp);
855 855
856 if (error) 856 if (error)
857 goto out_trans_fail; 857 goto out_trans_fail;
858 858
859 len -= max_bytes; 859 len -= max_bytes;
860 offset += max_bytes; 860 offset += max_bytes;
861 gfs2_inplace_release(ip); 861 gfs2_inplace_release(ip);
862 gfs2_quota_unlock(ip); 862 gfs2_quota_unlock(ip);
863 } 863 }
864 864
865 if (error == 0) 865 if (error == 0)
866 error = generic_write_sync(file, pos, count); 866 error = generic_write_sync(file, pos, count);
867 goto out_unlock; 867 goto out_unlock;
868 868
869 out_trans_fail: 869 out_trans_fail:
870 gfs2_inplace_release(ip); 870 gfs2_inplace_release(ip);
871 out_qunlock: 871 out_qunlock:
872 gfs2_quota_unlock(ip); 872 gfs2_quota_unlock(ip);
873 out_unlock: 873 out_unlock:
874 gfs2_glock_dq(&ip->i_gh); 874 gfs2_glock_dq(&ip->i_gh);
875 out_uninit: 875 out_uninit:
876 gfs2_holder_uninit(&ip->i_gh); 876 gfs2_holder_uninit(&ip->i_gh);
877 return error; 877 return error;
878 } 878 }
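
Since only FALLOC_FL_KEEP_SIZE is accepted (any other mode bit returns -EOPNOTSUPP), the matching userspace call is limited to preallocation that does not move i_size. A sketch:

/* Sketch: preallocate len bytes at offset without changing the file
 * size; gfs2_fallocate() above rejects all other mode bits. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

int preallocate(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_KEEP_SIZE, offset, len);
}
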
879 879
880 #ifdef CONFIG_GFS2_FS_LOCKING_DLM 880 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
881 881
882 /** 882 /**
883 * gfs2_setlease - acquire/release a file lease 883 * gfs2_setlease - acquire/release a file lease
884 * @file: the file pointer 884 * @file: the file pointer
885 * @arg: lease type 885 * @arg: lease type
886 * @fl: file lock 886 * @fl: file lock
887 * 887 *
888 * We don't currently have a way to enforce a lease across the whole 888 * We don't currently have a way to enforce a lease across the whole
889 * cluster; until we do, disable leases (by just returning -EINVAL), 889 * cluster; until we do, disable leases (by just returning -EINVAL),
890 * unless the administrator has requested purely local locking. 890 * unless the administrator has requested purely local locking.
891 * 891 *
892 * Locking: called under lock_flocks 892 * Locking: called under lock_flocks
893 * 893 *
894 * Returns: errno 894 * Returns: errno
895 */ 895 */
896 896
897 static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) 897 static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
898 { 898 {
899 return -EINVAL; 899 return -EINVAL;
900 } 900 }
901 901
902 /** 902 /**
903 * gfs2_lock - acquire/release a posix lock on a file 903 * gfs2_lock - acquire/release a posix lock on a file
904 * @file: the file pointer 904 * @file: the file pointer
905 * @cmd: either modify or retrieve lock state, possibly wait 905 * @cmd: either modify or retrieve lock state, possibly wait
906 * @fl: type and range of lock 906 * @fl: type and range of lock
907 * 907 *
908 * Returns: errno 908 * Returns: errno
909 */ 909 */
910 910
911 static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) 911 static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
912 { 912 {
913 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 913 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
914 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); 914 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
915 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 915 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
916 916
917 if (!(fl->fl_flags & FL_POSIX)) 917 if (!(fl->fl_flags & FL_POSIX))
918 return -ENOLCK; 918 return -ENOLCK;
919 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) 919 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
920 return -ENOLCK; 920 return -ENOLCK;
921 921
922 if (cmd == F_CANCELLK) { 922 if (cmd == F_CANCELLK) {
923 /* Hack: */ 923 /* Hack: */
924 cmd = F_SETLK; 924 cmd = F_SETLK;
925 fl->fl_type = F_UNLCK; 925 fl->fl_type = F_UNLCK;
926 } 926 }
927 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 927 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
928 return -EIO; 928 return -EIO;
929 if (IS_GETLK(cmd)) 929 if (IS_GETLK(cmd))
930 return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); 930 return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
931 else if (fl->fl_type == F_UNLCK) 931 else if (fl->fl_type == F_UNLCK)
932 return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); 932 return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
933 else 933 else
934 return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); 934 return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
935 } 935 }
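
These are ordinary fcntl() byte-range locks, except that gfs2 forwards them to the DLM so they apply cluster-wide. A sketch of a blocking whole-file write lock (note that struct flock spells its argument l_whence, the same word this commit restores in the lseek paths):

/* Sketch: a blocking, whole-file POSIX write lock; on gfs2 this is
 * ultimately serviced by dlm_posix_lock() above. */
#include <fcntl.h>

int lock_whole_file(int fd)
{
	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,	/* interpret l_start from offset 0 */
		.l_start  = 0,
		.l_len    = 0,		/* 0 means "through EOF" */
	};

	return fcntl(fd, F_SETLKW, &fl);	/* W = wait (blocking) */
}
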
936 936
937 static int do_flock(struct file *file, int cmd, struct file_lock *fl) 937 static int do_flock(struct file *file, int cmd, struct file_lock *fl)
938 { 938 {
939 struct gfs2_file *fp = file->private_data; 939 struct gfs2_file *fp = file->private_data;
940 struct gfs2_holder *fl_gh = &fp->f_fl_gh; 940 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
941 struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); 941 struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode);
942 struct gfs2_glock *gl; 942 struct gfs2_glock *gl;
943 unsigned int state; 943 unsigned int state;
944 int flags; 944 int flags;
945 int error = 0; 945 int error = 0;
946 946
947 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 947 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
948 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; 948 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
949 949
950 mutex_lock(&fp->f_fl_mutex); 950 mutex_lock(&fp->f_fl_mutex);
951 951
952 gl = fl_gh->gh_gl; 952 gl = fl_gh->gh_gl;
953 if (gl) { 953 if (gl) {
954 if (fl_gh->gh_state == state) 954 if (fl_gh->gh_state == state)
955 goto out; 955 goto out;
956 flock_lock_file_wait(file, 956 flock_lock_file_wait(file,
957 &(struct file_lock){.fl_type = F_UNLCK}); 957 &(struct file_lock){.fl_type = F_UNLCK});
958 gfs2_glock_dq_wait(fl_gh); 958 gfs2_glock_dq_wait(fl_gh);
959 gfs2_holder_reinit(state, flags, fl_gh); 959 gfs2_holder_reinit(state, flags, fl_gh);
960 } else { 960 } else {
961 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, 961 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
962 &gfs2_flock_glops, CREATE, &gl); 962 &gfs2_flock_glops, CREATE, &gl);
963 if (error) 963 if (error)
964 goto out; 964 goto out;
965 gfs2_holder_init(gl, state, flags, fl_gh); 965 gfs2_holder_init(gl, state, flags, fl_gh);
966 gfs2_glock_put(gl); 966 gfs2_glock_put(gl);
967 } 967 }
968 error = gfs2_glock_nq(fl_gh); 968 error = gfs2_glock_nq(fl_gh);
969 if (error) { 969 if (error) {
970 gfs2_holder_uninit(fl_gh); 970 gfs2_holder_uninit(fl_gh);
971 if (error == GLR_TRYFAILED) 971 if (error == GLR_TRYFAILED)
972 error = -EAGAIN; 972 error = -EAGAIN;
973 } else { 973 } else {
974 error = flock_lock_file_wait(file, fl); 974 error = flock_lock_file_wait(file, fl);
975 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); 975 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
976 } 976 }
977 977
978 out: 978 out:
979 mutex_unlock(&fp->f_fl_mutex); 979 mutex_unlock(&fp->f_fl_mutex);
980 return error; 980 return error;
981 } 981 }
982 982
983 static void do_unflock(struct file *file, struct file_lock *fl) 983 static void do_unflock(struct file *file, struct file_lock *fl)
984 { 984 {
985 struct gfs2_file *fp = file->private_data; 985 struct gfs2_file *fp = file->private_data;
986 struct gfs2_holder *fl_gh = &fp->f_fl_gh; 986 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
987 987
988 mutex_lock(&fp->f_fl_mutex); 988 mutex_lock(&fp->f_fl_mutex);
989 flock_lock_file_wait(file, fl); 989 flock_lock_file_wait(file, fl);
990 if (fl_gh->gh_gl) { 990 if (fl_gh->gh_gl) {
991 gfs2_glock_dq_wait(fl_gh); 991 gfs2_glock_dq_wait(fl_gh);
992 gfs2_holder_uninit(fl_gh); 992 gfs2_holder_uninit(fl_gh);
993 } 993 }
994 mutex_unlock(&fp->f_fl_mutex); 994 mutex_unlock(&fp->f_fl_mutex);
995 } 995 }
996 996
997 /** 997 /**
998 * gfs2_flock - acquire/release a flock lock on a file 998 * gfs2_flock - acquire/release a flock lock on a file
999 * @file: the file pointer 999 * @file: the file pointer
1000 * @cmd: either modify or retrieve lock state, possibly wait 1000 * @cmd: either modify or retrieve lock state, possibly wait
1001 * @fl: type and range of lock 1001 * @fl: type and range of lock
1002 * 1002 *
1003 * Returns: errno 1003 * Returns: errno
1004 */ 1004 */
1005 1005
1006 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) 1006 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
1007 { 1007 {
1008 if (!(fl->fl_flags & FL_FLOCK)) 1008 if (!(fl->fl_flags & FL_FLOCK))
1009 return -ENOLCK; 1009 return -ENOLCK;
1010 if (fl->fl_type & LOCK_MAND) 1010 if (fl->fl_type & LOCK_MAND)
1011 return -EOPNOTSUPP; 1011 return -EOPNOTSUPP;
1012 1012
1013 if (fl->fl_type == F_UNLCK) { 1013 if (fl->fl_type == F_UNLCK) {
1014 do_unflock(file, fl); 1014 do_unflock(file, fl);
1015 return 0; 1015 return 0;
1016 } else { 1016 } else {
1017 return do_flock(file, cmd, fl); 1017 return do_flock(file, cmd, fl);
1018 } 1018 }
1019 } 1019 }
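
flock() locks are whole-file and take a dedicated flock glock; a LOCK_NB request maps onto the LM_FLAG_TRY path in do_flock() and fails with -EAGAIN instead of blocking. Sketch:

/* Sketch: non-blocking exclusive flock; EAGAIN here corresponds to
 * the GLR_TRYFAILED case in do_flock() above. */
#include <sys/file.h>

int try_exclusive(int fd)
{
	return flock(fd, LOCK_EX | LOCK_NB);
}
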
1020 1020
1021 const struct file_operations gfs2_file_fops = { 1021 const struct file_operations gfs2_file_fops = {
1022 .llseek = gfs2_llseek, 1022 .llseek = gfs2_llseek,
1023 .read = do_sync_read, 1023 .read = do_sync_read,
1024 .aio_read = generic_file_aio_read, 1024 .aio_read = generic_file_aio_read,
1025 .write = do_sync_write, 1025 .write = do_sync_write,
1026 .aio_write = gfs2_file_aio_write, 1026 .aio_write = gfs2_file_aio_write,
1027 .unlocked_ioctl = gfs2_ioctl, 1027 .unlocked_ioctl = gfs2_ioctl,
1028 .mmap = gfs2_mmap, 1028 .mmap = gfs2_mmap,
1029 .open = gfs2_open, 1029 .open = gfs2_open,
1030 .release = gfs2_release, 1030 .release = gfs2_release,
1031 .fsync = gfs2_fsync, 1031 .fsync = gfs2_fsync,
1032 .lock = gfs2_lock, 1032 .lock = gfs2_lock,
1033 .flock = gfs2_flock, 1033 .flock = gfs2_flock,
1034 .splice_read = generic_file_splice_read, 1034 .splice_read = generic_file_splice_read,
1035 .splice_write = generic_file_splice_write, 1035 .splice_write = generic_file_splice_write,
1036 .setlease = gfs2_setlease, 1036 .setlease = gfs2_setlease,
1037 .fallocate = gfs2_fallocate, 1037 .fallocate = gfs2_fallocate,
1038 }; 1038 };
1039 1039
1040 const struct file_operations gfs2_dir_fops = { 1040 const struct file_operations gfs2_dir_fops = {
1041 .readdir = gfs2_readdir, 1041 .readdir = gfs2_readdir,
1042 .unlocked_ioctl = gfs2_ioctl, 1042 .unlocked_ioctl = gfs2_ioctl,
1043 .open = gfs2_open, 1043 .open = gfs2_open,
1044 .release = gfs2_release, 1044 .release = gfs2_release,
1045 .fsync = gfs2_fsync, 1045 .fsync = gfs2_fsync,
1046 .lock = gfs2_lock, 1046 .lock = gfs2_lock,
1047 .flock = gfs2_flock, 1047 .flock = gfs2_flock,
1048 .llseek = default_llseek, 1048 .llseek = default_llseek,
1049 }; 1049 };
1050 1050
1051 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 1051 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */
1052 1052
1053 const struct file_operations gfs2_file_fops_nolock = { 1053 const struct file_operations gfs2_file_fops_nolock = {
1054 .llseek = gfs2_llseek, 1054 .llseek = gfs2_llseek,
1055 .read = do_sync_read, 1055 .read = do_sync_read,
1056 .aio_read = generic_file_aio_read, 1056 .aio_read = generic_file_aio_read,
1057 .write = do_sync_write, 1057 .write = do_sync_write,
1058 .aio_write = gfs2_file_aio_write, 1058 .aio_write = gfs2_file_aio_write,
1059 .unlocked_ioctl = gfs2_ioctl, 1059 .unlocked_ioctl = gfs2_ioctl,
1060 .mmap = gfs2_mmap, 1060 .mmap = gfs2_mmap,
1061 .open = gfs2_open, 1061 .open = gfs2_open,
1062 .release = gfs2_release, 1062 .release = gfs2_release,
1063 .fsync = gfs2_fsync, 1063 .fsync = gfs2_fsync,
1064 .splice_read = generic_file_splice_read, 1064 .splice_read = generic_file_splice_read,
1065 .splice_write = generic_file_splice_write, 1065 .splice_write = generic_file_splice_write,
1066 .setlease = generic_setlease, 1066 .setlease = generic_setlease,
1067 .fallocate = gfs2_fallocate, 1067 .fallocate = gfs2_fallocate,
1068 }; 1068 };
1069 1069
1070 const struct file_operations gfs2_dir_fops_nolock = { 1070 const struct file_operations gfs2_dir_fops_nolock = {
1071 .readdir = gfs2_readdir, 1071 .readdir = gfs2_readdir,
1072 .unlocked_ioctl = gfs2_ioctl, 1072 .unlocked_ioctl = gfs2_ioctl,
1073 .open = gfs2_open, 1073 .open = gfs2_open,
1074 .release = gfs2_release, 1074 .release = gfs2_release,
1075 .fsync = gfs2_fsync, 1075 .fsync = gfs2_fsync,
1076 .llseek = default_llseek, 1076 .llseek = default_llseek,
1077 }; 1077 };
1078 1078
1079 1079
1 /* 1 /*
2 * fs/libfs.c 2 * fs/libfs.c
3 * Library for filesystem writers. 3 * Library for filesystem writers.
4 */ 4 */
5 5
6 #include <linux/export.h> 6 #include <linux/export.h>
7 #include <linux/pagemap.h> 7 #include <linux/pagemap.h>
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/mount.h> 9 #include <linux/mount.h>
10 #include <linux/vfs.h> 10 #include <linux/vfs.h>
11 #include <linux/quotaops.h> 11 #include <linux/quotaops.h>
12 #include <linux/mutex.h> 12 #include <linux/mutex.h>
13 #include <linux/exportfs.h> 13 #include <linux/exportfs.h>
14 #include <linux/writeback.h> 14 #include <linux/writeback.h>
15 #include <linux/buffer_head.h> /* sync_mapping_buffers */ 15 #include <linux/buffer_head.h> /* sync_mapping_buffers */
16 16
17 #include <asm/uaccess.h> 17 #include <asm/uaccess.h>
18 18
19 #include "internal.h" 19 #include "internal.h"
20 20
21 static inline int simple_positive(struct dentry *dentry) 21 static inline int simple_positive(struct dentry *dentry)
22 { 22 {
23 return dentry->d_inode && !d_unhashed(dentry); 23 return dentry->d_inode && !d_unhashed(dentry);
24 } 24 }
25 25
26 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, 26 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
27 struct kstat *stat) 27 struct kstat *stat)
28 { 28 {
29 struct inode *inode = dentry->d_inode; 29 struct inode *inode = dentry->d_inode;
30 generic_fillattr(inode, stat); 30 generic_fillattr(inode, stat);
31 stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); 31 stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
32 return 0; 32 return 0;
33 } 33 }
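
st_blocks is counted in 512-byte units, so the shift converts resident page-cache pages into sectors. A worked sketch, assuming 4096-byte pages:

/* Sketch of the st_blocks math in simple_getattr(): each 4096-byte
 * page (PAGE_CACHE_SHIFT == 12) is 4096/512 = 8 of the 512-byte
 * units stat(2) reports, hence the shift by (12 - 9). */
static unsigned long pages_to_stat_blocks(unsigned long nrpages)
{
	return nrpages << (12 - 9);	/* e.g. 3 pages -> 24 blocks */
}
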
34 34
35 int simple_statfs(struct dentry *dentry, struct kstatfs *buf) 35 int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
36 { 36 {
37 buf->f_type = dentry->d_sb->s_magic; 37 buf->f_type = dentry->d_sb->s_magic;
38 buf->f_bsize = PAGE_CACHE_SIZE; 38 buf->f_bsize = PAGE_CACHE_SIZE;
39 buf->f_namelen = NAME_MAX; 39 buf->f_namelen = NAME_MAX;
40 return 0; 40 return 0;
41 } 41 }
42 42
43 /* 43 /*
44 * Retaining negative dentries for an in-memory filesystem just wastes 44 * Retaining negative dentries for an in-memory filesystem just wastes
45 * memory and lookup time: arrange for them to be deleted immediately. 45 * memory and lookup time: arrange for them to be deleted immediately.
46 */ 46 */
47 static int simple_delete_dentry(const struct dentry *dentry) 47 static int simple_delete_dentry(const struct dentry *dentry)
48 { 48 {
49 return 1; 49 return 1;
50 } 50 }
51 51
52 /* 52 /*
53 * Lookup the data. This is trivial - if the dentry didn't already 53 * Lookup the data. This is trivial - if the dentry didn't already
54 * exist, we know it is negative. Set d_op to delete negative dentries. 54 * exist, we know it is negative. Set d_op to delete negative dentries.
55 */ 55 */
56 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 56 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
57 { 57 {
58 static const struct dentry_operations simple_dentry_operations = { 58 static const struct dentry_operations simple_dentry_operations = {
59 .d_delete = simple_delete_dentry, 59 .d_delete = simple_delete_dentry,
60 }; 60 };
61 61
62 if (dentry->d_name.len > NAME_MAX) 62 if (dentry->d_name.len > NAME_MAX)
63 return ERR_PTR(-ENAMETOOLONG); 63 return ERR_PTR(-ENAMETOOLONG);
64 d_set_d_op(dentry, &simple_dentry_operations); 64 d_set_d_op(dentry, &simple_dentry_operations);
65 d_add(dentry, NULL); 65 d_add(dentry, NULL);
66 return NULL; 66 return NULL;
67 } 67 }
68 68
69 int dcache_dir_open(struct inode *inode, struct file *file) 69 int dcache_dir_open(struct inode *inode, struct file *file)
70 { 70 {
71 static struct qstr cursor_name = QSTR_INIT(".", 1); 71 static struct qstr cursor_name = QSTR_INIT(".", 1);
72 72
73 file->private_data = d_alloc(file->f_path.dentry, &cursor_name); 73 file->private_data = d_alloc(file->f_path.dentry, &cursor_name);
74 74
75 return file->private_data ? 0 : -ENOMEM; 75 return file->private_data ? 0 : -ENOMEM;
76 } 76 }
77 77
78 int dcache_dir_close(struct inode *inode, struct file *file) 78 int dcache_dir_close(struct inode *inode, struct file *file)
79 { 79 {
80 dput(file->private_data); 80 dput(file->private_data);
81 return 0; 81 return 0;
82 } 82 }
83 83
84 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 84 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
85 { 85 {
86 struct dentry *dentry = file->f_path.dentry; 86 struct dentry *dentry = file->f_path.dentry;
87 mutex_lock(&dentry->d_inode->i_mutex); 87 mutex_lock(&dentry->d_inode->i_mutex);
88 switch (origin) { 88 switch (whence) {
89 case 1: 89 case 1:
90 offset += file->f_pos; 90 offset += file->f_pos;
91 case 0: 91 case 0:
92 if (offset >= 0) 92 if (offset >= 0)
93 break; 93 break;
94 default: 94 default:
95 mutex_unlock(&dentry->d_inode->i_mutex); 95 mutex_unlock(&dentry->d_inode->i_mutex);
96 return -EINVAL; 96 return -EINVAL;
97 } 97 }
98 if (offset != file->f_pos) { 98 if (offset != file->f_pos) {
99 file->f_pos = offset; 99 file->f_pos = offset;
100 if (file->f_pos >= 2) { 100 if (file->f_pos >= 2) {
101 struct list_head *p; 101 struct list_head *p;
102 struct dentry *cursor = file->private_data; 102 struct dentry *cursor = file->private_data;
103 loff_t n = file->f_pos - 2; 103 loff_t n = file->f_pos - 2;
104 104
105 spin_lock(&dentry->d_lock); 105 spin_lock(&dentry->d_lock);
106 /* d_lock not required for cursor */ 106 /* d_lock not required for cursor */
107 list_del(&cursor->d_u.d_child); 107 list_del(&cursor->d_u.d_child);
108 p = dentry->d_subdirs.next; 108 p = dentry->d_subdirs.next;
109 while (n && p != &dentry->d_subdirs) { 109 while (n && p != &dentry->d_subdirs) {
110 struct dentry *next; 110 struct dentry *next;
111 next = list_entry(p, struct dentry, d_u.d_child); 111 next = list_entry(p, struct dentry, d_u.d_child);
112 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); 112 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
113 if (simple_positive(next)) 113 if (simple_positive(next))
114 n--; 114 n--;
115 spin_unlock(&next->d_lock); 115 spin_unlock(&next->d_lock);
116 p = p->next; 116 p = p->next;
117 } 117 }
118 list_add_tail(&cursor->d_u.d_child, p); 118 list_add_tail(&cursor->d_u.d_child, p);
119 spin_unlock(&dentry->d_lock); 119 spin_unlock(&dentry->d_lock);
120 } 120 }
121 } 121 }
122 mutex_unlock(&dentry->d_inode->i_mutex); 122 mutex_unlock(&dentry->d_inode->i_mutex);
123 return offset; 123 return offset;
124 } 124 }
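
The bare 0 and 1 cases above are SEEK_SET and SEEK_CUR, i.e. the whence argument this commit renames throughout; any other value is rejected with -EINVAL. From userspace:

/* Sketch: the two whence values dcache_dir_lseek() accepts. */
#include <unistd.h>

off_t dir_offsets(int fd)
{
	lseek(fd, 2, SEEK_SET);		/* whence = 0: skip "." and ".." */
	return lseek(fd, 0, SEEK_CUR);	/* whence = 1: read back position */
}
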
125 125
126 /* Relationship between i_mode and the DT_xxx types */ 126 /* Relationship between i_mode and the DT_xxx types */
127 static inline unsigned char dt_type(struct inode *inode) 127 static inline unsigned char dt_type(struct inode *inode)
128 { 128 {
129 return (inode->i_mode >> 12) & 15; 129 return (inode->i_mode >> 12) & 15;
130 } 130 }
131 131
132 /* 132 /*
133 * Directory is locked and all positive dentries in it are safe, since 133 * Directory is locked and all positive dentries in it are safe, since
134 * for ramfs-type trees they can't go away without unlink() or rmdir(), 134 * for ramfs-type trees they can't go away without unlink() or rmdir(),
135 * both impossible due to the lock on directory. 135 * both impossible due to the lock on directory.
136 */ 136 */
137 137
138 int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) 138 int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
139 { 139 {
140 struct dentry *dentry = filp->f_path.dentry; 140 struct dentry *dentry = filp->f_path.dentry;
141 struct dentry *cursor = filp->private_data; 141 struct dentry *cursor = filp->private_data;
142 struct list_head *p, *q = &cursor->d_u.d_child; 142 struct list_head *p, *q = &cursor->d_u.d_child;
143 ino_t ino; 143 ino_t ino;
144 int i = filp->f_pos; 144 int i = filp->f_pos;
145 145
146 switch (i) { 146 switch (i) {
147 case 0: 147 case 0:
148 ino = dentry->d_inode->i_ino; 148 ino = dentry->d_inode->i_ino;
149 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 149 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
150 break; 150 break;
151 filp->f_pos++; 151 filp->f_pos++;
152 i++; 152 i++;
153 /* fallthrough */ 153 /* fallthrough */
154 case 1: 154 case 1:
155 ino = parent_ino(dentry); 155 ino = parent_ino(dentry);
156 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 156 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
157 break; 157 break;
158 filp->f_pos++; 158 filp->f_pos++;
159 i++; 159 i++;
160 /* fallthrough */ 160 /* fallthrough */
161 default: 161 default:
162 spin_lock(&dentry->d_lock); 162 spin_lock(&dentry->d_lock);
163 if (filp->f_pos == 2) 163 if (filp->f_pos == 2)
164 list_move(q, &dentry->d_subdirs); 164 list_move(q, &dentry->d_subdirs);
165 165
166 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 166 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
167 struct dentry *next; 167 struct dentry *next;
168 next = list_entry(p, struct dentry, d_u.d_child); 168 next = list_entry(p, struct dentry, d_u.d_child);
169 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); 169 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
170 if (!simple_positive(next)) { 170 if (!simple_positive(next)) {
171 spin_unlock(&next->d_lock); 171 spin_unlock(&next->d_lock);
172 continue; 172 continue;
173 } 173 }
174 174
175 spin_unlock(&next->d_lock); 175 spin_unlock(&next->d_lock);
176 spin_unlock(&dentry->d_lock); 176 spin_unlock(&dentry->d_lock);
177 if (filldir(dirent, next->d_name.name, 177 if (filldir(dirent, next->d_name.name,
178 next->d_name.len, filp->f_pos, 178 next->d_name.len, filp->f_pos,
179 next->d_inode->i_ino, 179 next->d_inode->i_ino,
180 dt_type(next->d_inode)) < 0) 180 dt_type(next->d_inode)) < 0)
181 return 0; 181 return 0;
182 spin_lock(&dentry->d_lock); 182 spin_lock(&dentry->d_lock);
183 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); 183 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
184 /* next is still alive */ 184 /* next is still alive */
185 list_move(q, p); 185 list_move(q, p);
186 spin_unlock(&next->d_lock); 186 spin_unlock(&next->d_lock);
187 p = q; 187 p = q;
188 filp->f_pos++; 188 filp->f_pos++;
189 } 189 }
190 spin_unlock(&dentry->d_lock); 190 spin_unlock(&dentry->d_lock);
191 } 191 }
192 return 0; 192 return 0;
193 } 193 }
194 194
195 ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) 195 ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
196 { 196 {
197 return -EISDIR; 197 return -EISDIR;
198 } 198 }
199 199
200 const struct file_operations simple_dir_operations = { 200 const struct file_operations simple_dir_operations = {
201 .open = dcache_dir_open, 201 .open = dcache_dir_open,
202 .release = dcache_dir_close, 202 .release = dcache_dir_close,
203 .llseek = dcache_dir_lseek, 203 .llseek = dcache_dir_lseek,
204 .read = generic_read_dir, 204 .read = generic_read_dir,
205 .readdir = dcache_readdir, 205 .readdir = dcache_readdir,
206 .fsync = noop_fsync, 206 .fsync = noop_fsync,
207 }; 207 };
208 208
209 const struct inode_operations simple_dir_inode_operations = { 209 const struct inode_operations simple_dir_inode_operations = {
210 .lookup = simple_lookup, 210 .lookup = simple_lookup,
211 }; 211 };
212 212
213 static const struct super_operations simple_super_operations = { 213 static const struct super_operations simple_super_operations = {
214 .statfs = simple_statfs, 214 .statfs = simple_statfs,
215 }; 215 };
216 216
217 /* 217 /*
218 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that 218 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
219 * will never be mountable) 219 * will never be mountable)
220 */ 220 */
221 struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, 221 struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
222 const struct super_operations *ops, 222 const struct super_operations *ops,
223 const struct dentry_operations *dops, unsigned long magic) 223 const struct dentry_operations *dops, unsigned long magic)
224 { 224 {
225 struct super_block *s; 225 struct super_block *s;
226 struct dentry *dentry; 226 struct dentry *dentry;
227 struct inode *root; 227 struct inode *root;
228 struct qstr d_name = QSTR_INIT(name, strlen(name)); 228 struct qstr d_name = QSTR_INIT(name, strlen(name));
229 229
230 s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); 230 s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL);
231 if (IS_ERR(s)) 231 if (IS_ERR(s))
232 return ERR_CAST(s); 232 return ERR_CAST(s);
233 233
234 s->s_maxbytes = MAX_LFS_FILESIZE; 234 s->s_maxbytes = MAX_LFS_FILESIZE;
235 s->s_blocksize = PAGE_SIZE; 235 s->s_blocksize = PAGE_SIZE;
236 s->s_blocksize_bits = PAGE_SHIFT; 236 s->s_blocksize_bits = PAGE_SHIFT;
237 s->s_magic = magic; 237 s->s_magic = magic;
238 s->s_op = ops ? ops : &simple_super_operations; 238 s->s_op = ops ? ops : &simple_super_operations;
239 s->s_time_gran = 1; 239 s->s_time_gran = 1;
240 root = new_inode(s); 240 root = new_inode(s);
241 if (!root) 241 if (!root)
242 goto Enomem; 242 goto Enomem;
243 /* 243 /*
244 * since this is the first inode, make it number 1. New inodes created 244 * since this is the first inode, make it number 1. New inodes created
245 * after this must take care not to collide with it (by passing 245 * after this must take care not to collide with it (by passing
246 * max_reserved of 1 to iunique). 246 * max_reserved of 1 to iunique).
247 */ 247 */
248 root->i_ino = 1; 248 root->i_ino = 1;
249 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; 249 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
250 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; 250 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
251 dentry = __d_alloc(s, &d_name); 251 dentry = __d_alloc(s, &d_name);
252 if (!dentry) { 252 if (!dentry) {
253 iput(root); 253 iput(root);
254 goto Enomem; 254 goto Enomem;
255 } 255 }
256 d_instantiate(dentry, root); 256 d_instantiate(dentry, root);
257 s->s_root = dentry; 257 s->s_root = dentry;
258 s->s_d_op = dops; 258 s->s_d_op = dops;
259 s->s_flags |= MS_ACTIVE; 259 s->s_flags |= MS_ACTIVE;
260 return dget(s->s_root); 260 return dget(s->s_root);
261 261
262 Enomem: 262 Enomem:
263 deactivate_locked_super(s); 263 deactivate_locked_super(s);
264 return ERR_PTR(-ENOMEM); 264 return ERR_PTR(-ENOMEM);
265 } 265 }
266 266
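For context, a minimal sketch (not part of this commit) of how a sockfs/pipefs-style kernel-internal pseudo-filesystem typically consumes mount_pseudo() from its .mount callback; the "examplefs" name, the EXAMPLEFS_MAGIC value and all examplefs_* identifiers are invented for illustration:

#include <linux/fs.h>

#define EXAMPLEFS_MAGIC	0x45584653	/* "EXFS", an invented magic */

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
			int flags, const char *dev_name, void *data)
{
	/* NULL ops/dops fall back to simple_super_operations and the
	 * default dentry operations */
	return mount_pseudo(fs_type, "examplefs:", NULL, NULL,
			EXAMPLEFS_MAGIC);
}

static struct file_system_type examplefs_fs_type = {
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_anon_super,
};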
267 int simple_open(struct inode *inode, struct file *file) 267 int simple_open(struct inode *inode, struct file *file)
268 { 268 {
269 if (inode->i_private) 269 if (inode->i_private)
270 file->private_data = inode->i_private; 270 file->private_data = inode->i_private;
271 return 0; 271 return 0;
272 } 272 }
273 273
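simple_open() merely publishes inode->i_private as file->private_data, so handlers opened this way can rely on private_data being set. A sketch of the usual debugfs-style wiring; example_read is a hypothetical handler, sketched further down next to simple_read_from_buffer():

static const struct file_operations example_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,	/* file->private_data = inode->i_private */
	.read	= example_read,	/* hypothetical; see sketch further down */
	.llseek	= default_llseek,
};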
274 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 274 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
275 { 275 {
276 struct inode *inode = old_dentry->d_inode; 276 struct inode *inode = old_dentry->d_inode;
277 277
278 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 278 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
279 inc_nlink(inode); 279 inc_nlink(inode);
280 ihold(inode); 280 ihold(inode);
281 dget(dentry); 281 dget(dentry);
282 d_instantiate(dentry, inode); 282 d_instantiate(dentry, inode);
283 return 0; 283 return 0;
284 } 284 }
285 285
286 int simple_empty(struct dentry *dentry) 286 int simple_empty(struct dentry *dentry)
287 { 287 {
288 struct dentry *child; 288 struct dentry *child;
289 int ret = 0; 289 int ret = 0;
290 290
291 spin_lock(&dentry->d_lock); 291 spin_lock(&dentry->d_lock);
292 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { 292 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
293 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); 293 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
294 if (simple_positive(child)) { 294 if (simple_positive(child)) {
295 spin_unlock(&child->d_lock); 295 spin_unlock(&child->d_lock);
296 goto out; 296 goto out;
297 } 297 }
298 spin_unlock(&child->d_lock); 298 spin_unlock(&child->d_lock);
299 } 299 }
300 ret = 1; 300 ret = 1;
301 out: 301 out:
302 spin_unlock(&dentry->d_lock); 302 spin_unlock(&dentry->d_lock);
303 return ret; 303 return ret;
304 } 304 }
305 305
306 int simple_unlink(struct inode *dir, struct dentry *dentry) 306 int simple_unlink(struct inode *dir, struct dentry *dentry)
307 { 307 {
308 struct inode *inode = dentry->d_inode; 308 struct inode *inode = dentry->d_inode;
309 309
310 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 310 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
311 drop_nlink(inode); 311 drop_nlink(inode);
312 dput(dentry); 312 dput(dentry);
313 return 0; 313 return 0;
314 } 314 }
315 315
316 int simple_rmdir(struct inode *dir, struct dentry *dentry) 316 int simple_rmdir(struct inode *dir, struct dentry *dentry)
317 { 317 {
318 if (!simple_empty(dentry)) 318 if (!simple_empty(dentry))
319 return -ENOTEMPTY; 319 return -ENOTEMPTY;
320 320
321 drop_nlink(dentry->d_inode); 321 drop_nlink(dentry->d_inode);
322 simple_unlink(dir, dentry); 322 simple_unlink(dir, dentry);
323 drop_nlink(dir); 323 drop_nlink(dir);
324 return 0; 324 return 0;
325 } 325 }
326 326
327 int simple_rename(struct inode *old_dir, struct dentry *old_dentry, 327 int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
328 struct inode *new_dir, struct dentry *new_dentry) 328 struct inode *new_dir, struct dentry *new_dentry)
329 { 329 {
330 struct inode *inode = old_dentry->d_inode; 330 struct inode *inode = old_dentry->d_inode;
331 int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode); 331 int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
332 332
333 if (!simple_empty(new_dentry)) 333 if (!simple_empty(new_dentry))
334 return -ENOTEMPTY; 334 return -ENOTEMPTY;
335 335
336 if (new_dentry->d_inode) { 336 if (new_dentry->d_inode) {
337 simple_unlink(new_dir, new_dentry); 337 simple_unlink(new_dir, new_dentry);
338 if (they_are_dirs) { 338 if (they_are_dirs) {
339 drop_nlink(new_dentry->d_inode); 339 drop_nlink(new_dentry->d_inode);
340 drop_nlink(old_dir); 340 drop_nlink(old_dir);
341 } 341 }
342 } else if (they_are_dirs) { 342 } else if (they_are_dirs) {
343 drop_nlink(old_dir); 343 drop_nlink(old_dir);
344 inc_nlink(new_dir); 344 inc_nlink(new_dir);
345 } 345 }
346 346
347 old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = 347 old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
348 new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; 348 new_dir->i_mtime = inode->i_ctime = CURRENT_TIME;
349 349
350 return 0; 350 return 0;
351 } 351 }
352 352
353 /** 353 /**
354 * simple_setattr - setattr for simple filesystem 354 * simple_setattr - setattr for simple filesystem
355 * @dentry: dentry 355 * @dentry: dentry
356 * @iattr: iattr structure 356 * @iattr: iattr structure
357 * 357 *
358 * Returns 0 on success, -error on failure. 358 * Returns 0 on success, -error on failure.
359 * 359 *
360 * simple_setattr is a simple ->setattr implementation without proper 360 * simple_setattr is a simple ->setattr implementation without proper
361 * support for size changes. 361 * support for size changes.
362 * 362 *
363 * It can either be used for in-memory filesystems or special files 363 * It can either be used for in-memory filesystems or special files
364 * on simple regular filesystems. Anything that needs to change on-disk 364 * on simple regular filesystems. Anything that needs to change on-disk
365 * or wire state on size changes needs its own setattr method. 365 * or wire state on size changes needs its own setattr method.
366 */ 366 */
367 int simple_setattr(struct dentry *dentry, struct iattr *iattr) 367 int simple_setattr(struct dentry *dentry, struct iattr *iattr)
368 { 368 {
369 struct inode *inode = dentry->d_inode; 369 struct inode *inode = dentry->d_inode;
370 int error; 370 int error;
371 371
372 WARN_ON_ONCE(inode->i_op->truncate); 372 WARN_ON_ONCE(inode->i_op->truncate);
373 373
374 error = inode_change_ok(inode, iattr); 374 error = inode_change_ok(inode, iattr);
375 if (error) 375 if (error)
376 return error; 376 return error;
377 377
378 if (iattr->ia_valid & ATTR_SIZE) 378 if (iattr->ia_valid & ATTR_SIZE)
379 truncate_setsize(inode, iattr->ia_size); 379 truncate_setsize(inode, iattr->ia_size);
380 setattr_copy(inode, iattr); 380 setattr_copy(inode, iattr);
381 mark_inode_dirty(inode); 381 mark_inode_dirty(inode);
382 return 0; 382 return 0;
383 } 383 }
384 EXPORT_SYMBOL(simple_setattr); 384 EXPORT_SYMBOL(simple_setattr);
385 385
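An in-memory filesystem can adopt simple_setattr() (together with simple_getattr(), exported below) as the entire inode_operations for its regular files; a sketch using the hypothetical examplefs naming from above:

static const struct inode_operations examplefs_file_inode_operations = {
	.setattr	= simple_setattr,	/* truncate_setsize() + attr copy */
	.getattr	= simple_getattr,
};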
386 int simple_readpage(struct file *file, struct page *page) 386 int simple_readpage(struct file *file, struct page *page)
387 { 387 {
388 clear_highpage(page); 388 clear_highpage(page);
389 flush_dcache_page(page); 389 flush_dcache_page(page);
390 SetPageUptodate(page); 390 SetPageUptodate(page);
391 unlock_page(page); 391 unlock_page(page);
392 return 0; 392 return 0;
393 } 393 }
394 394
395 int simple_write_begin(struct file *file, struct address_space *mapping, 395 int simple_write_begin(struct file *file, struct address_space *mapping,
396 loff_t pos, unsigned len, unsigned flags, 396 loff_t pos, unsigned len, unsigned flags,
397 struct page **pagep, void **fsdata) 397 struct page **pagep, void **fsdata)
398 { 398 {
399 struct page *page; 399 struct page *page;
400 pgoff_t index; 400 pgoff_t index;
401 401
402 index = pos >> PAGE_CACHE_SHIFT; 402 index = pos >> PAGE_CACHE_SHIFT;
403 403
404 page = grab_cache_page_write_begin(mapping, index, flags); 404 page = grab_cache_page_write_begin(mapping, index, flags);
405 if (!page) 405 if (!page)
406 return -ENOMEM; 406 return -ENOMEM;
407 407
408 *pagep = page; 408 *pagep = page;
409 409
410 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { 410 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
411 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 411 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
412 412
413 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE); 413 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
414 } 414 }
415 return 0; 415 return 0;
416 } 416 }
417 417
418 /** 418 /**
419 * simple_write_end - .write_end helper for non-block-device FSes 419 * simple_write_end - .write_end helper for non-block-device FSes
420 * @file: See .write_end of address_space_operations 420 * @file: See .write_end of address_space_operations
421 * @mapping: " 421 * @mapping: "
422 * @pos: " 422 * @pos: "
423 * @len: " 423 * @len: "
424 * @copied: " 424 * @copied: "
425 * @page: " 425 * @page: "
426 * @fsdata: " 426 * @fsdata: "
427 * 427 *
429 * simple_write_end does the minimum needed for updating a page after writing is 429 * simple_write_end does the minimum needed for updating a page after writing is
430 * done. It has the same API signature as the .write_end of 430 * done. It has the same API signature as the .write_end of
431 * address_space_operations vector. So it can just be set onto .write_end for 431 * address_space_operations vector. So it can just be set onto .write_end for
432 * FSes that don't need any other processing. i_mutex is assumed to be held. 432 * FSes that don't need any other processing. i_mutex is assumed to be held.
433 * Block based filesystems should use generic_write_end(). 433 * Block based filesystems should use generic_write_end().
434 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty 434 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
435 * is not called, so a filesystem that actually does store data in .write_inode 435 * is not called, so a filesystem that actually does store data in .write_inode
436 * should extend what's done here with a call to mark_inode_dirty() in the 436 * should extend what's done here with a call to mark_inode_dirty() in the
437 * case that i_size has changed. 437 * case that i_size has changed.
438 */ 438 */
439 int simple_write_end(struct file *file, struct address_space *mapping, 439 int simple_write_end(struct file *file, struct address_space *mapping,
440 loff_t pos, unsigned len, unsigned copied, 440 loff_t pos, unsigned len, unsigned copied,
441 struct page *page, void *fsdata) 441 struct page *page, void *fsdata)
442 { 442 {
443 struct inode *inode = page->mapping->host; 443 struct inode *inode = page->mapping->host;
444 loff_t last_pos = pos + copied; 444 loff_t last_pos = pos + copied;
445 445
446 /* zero the stale part of the page if we did a short copy */ 446 /* zero the stale part of the page if we did a short copy */
447 if (copied < len) { 447 if (copied < len) {
448 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 448 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
449 449
450 zero_user(page, from + copied, len - copied); 450 zero_user(page, from + copied, len - copied);
451 } 451 }
452 452
453 if (!PageUptodate(page)) 453 if (!PageUptodate(page))
454 SetPageUptodate(page); 454 SetPageUptodate(page);
455 /* 455 /*
456 * No need to use i_size_read() here, the i_size 456 * No need to use i_size_read() here, the i_size
457 * cannot change under us because we hold the i_mutex. 457 * cannot change under us because we hold the i_mutex.
458 */ 458 */
459 if (last_pos > inode->i_size) 459 if (last_pos > inode->i_size)
460 i_size_write(inode, last_pos); 460 i_size_write(inode, last_pos);
461 461
462 set_page_dirty(page); 462 set_page_dirty(page);
463 unlock_page(page); 463 unlock_page(page);
464 page_cache_release(page); 464 page_cache_release(page);
465 465
466 return copied; 466 return copied;
467 } 467 }
468 468
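These three page-cache helpers are designed to be installed together as a complete address_space_operations for a purely in-memory filesystem; ramfs does essentially this, additionally overriding .set_page_dirty. A sketch:

static const struct address_space_operations examplefs_aops = {
	.readpage	= simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
	/* ramfs also sets .set_page_dirty = __set_page_dirty_no_writeback */
};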
469 /* 469 /*
470 * the inodes created here are not hashed. If you use iunique to generate 470 * the inodes created here are not hashed. If you use iunique to generate
471 * unique inode values later for this filesystem, then you must take care 471 * unique inode values later for this filesystem, then you must take care
472 * to pass it an appropriate max_reserved value to avoid collisions. 472 * to pass it an appropriate max_reserved value to avoid collisions.
473 */ 473 */
474 int simple_fill_super(struct super_block *s, unsigned long magic, 474 int simple_fill_super(struct super_block *s, unsigned long magic,
475 struct tree_descr *files) 475 struct tree_descr *files)
476 { 476 {
477 struct inode *inode; 477 struct inode *inode;
478 struct dentry *root; 478 struct dentry *root;
479 struct dentry *dentry; 479 struct dentry *dentry;
480 int i; 480 int i;
481 481
482 s->s_blocksize = PAGE_CACHE_SIZE; 482 s->s_blocksize = PAGE_CACHE_SIZE;
483 s->s_blocksize_bits = PAGE_CACHE_SHIFT; 483 s->s_blocksize_bits = PAGE_CACHE_SHIFT;
484 s->s_magic = magic; 484 s->s_magic = magic;
485 s->s_op = &simple_super_operations; 485 s->s_op = &simple_super_operations;
486 s->s_time_gran = 1; 486 s->s_time_gran = 1;
487 487
488 inode = new_inode(s); 488 inode = new_inode(s);
489 if (!inode) 489 if (!inode)
490 return -ENOMEM; 490 return -ENOMEM;
491 /* 491 /*
492 * because the root inode is 1, the files array must not contain an 492 * because the root inode is 1, the files array must not contain an
493 * entry at index 1 493 * entry at index 1
494 */ 494 */
495 inode->i_ino = 1; 495 inode->i_ino = 1;
496 inode->i_mode = S_IFDIR | 0755; 496 inode->i_mode = S_IFDIR | 0755;
497 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 497 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
498 inode->i_op = &simple_dir_inode_operations; 498 inode->i_op = &simple_dir_inode_operations;
499 inode->i_fop = &simple_dir_operations; 499 inode->i_fop = &simple_dir_operations;
500 set_nlink(inode, 2); 500 set_nlink(inode, 2);
501 root = d_make_root(inode); 501 root = d_make_root(inode);
502 if (!root) 502 if (!root)
503 return -ENOMEM; 503 return -ENOMEM;
504 for (i = 0; !files->name || files->name[0]; i++, files++) { 504 for (i = 0; !files->name || files->name[0]; i++, files++) {
505 if (!files->name) 505 if (!files->name)
506 continue; 506 continue;
507 507
508 /* warn if it tries to conflict with the root inode */ 508 /* warn if it tries to conflict with the root inode */
509 if (unlikely(i == 1)) 509 if (unlikely(i == 1))
510 printk(KERN_WARNING "%s: %s passed in a files array" 510 printk(KERN_WARNING "%s: %s passed in a files array"
511 "with an index of 1!\n", __func__, 511 "with an index of 1!\n", __func__,
512 s->s_type->name); 512 s->s_type->name);
513 513
514 dentry = d_alloc_name(root, files->name); 514 dentry = d_alloc_name(root, files->name);
515 if (!dentry) 515 if (!dentry)
516 goto out; 516 goto out;
517 inode = new_inode(s); 517 inode = new_inode(s);
518 if (!inode) { 518 if (!inode) {
519 dput(dentry); 519 dput(dentry);
520 goto out; 520 goto out;
521 } 521 }
522 inode->i_mode = S_IFREG | files->mode; 522 inode->i_mode = S_IFREG | files->mode;
523 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 523 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
524 inode->i_fop = files->ops; 524 inode->i_fop = files->ops;
525 inode->i_ino = i; 525 inode->i_ino = i;
526 d_add(dentry, inode); 526 d_add(dentry, inode);
527 } 527 }
528 s->s_root = root; 528 s->s_root = root;
529 return 0; 529 return 0;
530 out: 530 out:
531 d_genocide(root); 531 d_genocide(root);
532 shrink_dcache_parent(root); 532 shrink_dcache_parent(root);
533 dput(root); 533 dput(root);
534 return -ENOMEM; 534 return -ENOMEM;
535 } 535 }
536 536
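A hedged sketch of the usual caller, modelled on binfmt_misc: a fill_super callback hands simple_fill_super() a zero-terminated tree_descr array whose index doubles as the inode number, so slots 0 and 1 stay empty per the warning above. EXAMPLEFS_MAGIC and example_fops are the hypothetical definitions from the earlier sketches:

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	static struct tree_descr examplefs_files[] = {
		/* slots 0 and 1 stay empty: index becomes i_ino,
		 * and inode 1 is the root directory */
		[2] = {"status", &example_fops, S_IRUGO},
		/* a zero-length name terminates the array */
		{""}
	};

	return simple_fill_super(sb, EXAMPLEFS_MAGIC, examplefs_files);
}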
537 static DEFINE_SPINLOCK(pin_fs_lock); 537 static DEFINE_SPINLOCK(pin_fs_lock);
538 538
539 int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) 539 int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
540 { 540 {
541 struct vfsmount *mnt = NULL; 541 struct vfsmount *mnt = NULL;
542 spin_lock(&pin_fs_lock); 542 spin_lock(&pin_fs_lock);
543 if (unlikely(!*mount)) { 543 if (unlikely(!*mount)) {
544 spin_unlock(&pin_fs_lock); 544 spin_unlock(&pin_fs_lock);
545 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL); 545 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL);
546 if (IS_ERR(mnt)) 546 if (IS_ERR(mnt))
547 return PTR_ERR(mnt); 547 return PTR_ERR(mnt);
548 spin_lock(&pin_fs_lock); 548 spin_lock(&pin_fs_lock);
549 if (!*mount) 549 if (!*mount)
550 *mount = mnt; 550 *mount = mnt;
551 } 551 }
552 mntget(*mount); 552 mntget(*mount);
553 ++*count; 553 ++*count;
554 spin_unlock(&pin_fs_lock); 554 spin_unlock(&pin_fs_lock);
555 mntput(mnt); 555 mntput(mnt);
556 return 0; 556 return 0;
557 } 557 }
558 558
559 void simple_release_fs(struct vfsmount **mount, int *count) 559 void simple_release_fs(struct vfsmount **mount, int *count)
560 { 560 {
561 struct vfsmount *mnt; 561 struct vfsmount *mnt;
562 spin_lock(&pin_fs_lock); 562 spin_lock(&pin_fs_lock);
563 mnt = *mount; 563 mnt = *mount;
564 if (!--*count) 564 if (!--*count)
565 *mount = NULL; 565 *mount = NULL;
566 spin_unlock(&pin_fs_lock); 566 spin_unlock(&pin_fs_lock);
567 mntput(mnt); 567 mntput(mnt);
568 } 568 }
569 569
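Callers conventionally keep one static vfsmount/count pair per filesystem and wrap these two helpers, as debugfs does; a sketch reusing the hypothetical examplefs_fs_type from above:

static struct vfsmount *examplefs_mnt;
static int examplefs_mnt_count;

static int examplefs_get(void)
{
	/* the first caller really mounts; later ones only take references */
	return simple_pin_fs(&examplefs_fs_type, &examplefs_mnt,
			&examplefs_mnt_count);
}

static void examplefs_put(void)
{
	/* the last caller drops the mount */
	simple_release_fs(&examplefs_mnt, &examplefs_mnt_count);
}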
570 /** 570 /**
571 * simple_read_from_buffer - copy data from the buffer to user space 571 * simple_read_from_buffer - copy data from the buffer to user space
572 * @to: the user space buffer to read to 572 * @to: the user space buffer to read to
573 * @count: the maximum number of bytes to read 573 * @count: the maximum number of bytes to read
574 * @ppos: the current position in the buffer 574 * @ppos: the current position in the buffer
575 * @from: the buffer to read from 575 * @from: the buffer to read from
576 * @available: the size of the buffer 576 * @available: the size of the buffer
577 * 577 *
578 * The simple_read_from_buffer() function reads up to @count bytes from the 578 * The simple_read_from_buffer() function reads up to @count bytes from the
579 * buffer @from at offset @ppos into the user space address starting at @to. 579 * buffer @from at offset @ppos into the user space address starting at @to.
580 * 580 *
581 * On success, the number of bytes read is returned and the offset @ppos is 581 * On success, the number of bytes read is returned and the offset @ppos is
582 * advanced by this number, or a negative value is returned on error. 582 * advanced by this number, or a negative value is returned on error.
583 **/ 583 **/
584 ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, 584 ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
585 const void *from, size_t available) 585 const void *from, size_t available)
586 { 586 {
587 loff_t pos = *ppos; 587 loff_t pos = *ppos;
588 size_t ret; 588 size_t ret;
589 589
590 if (pos < 0) 590 if (pos < 0)
591 return -EINVAL; 591 return -EINVAL;
592 if (pos >= available || !count) 592 if (pos >= available || !count)
593 return 0; 593 return 0;
594 if (count > available - pos) 594 if (count > available - pos)
595 count = available - pos; 595 count = available - pos;
596 ret = copy_to_user(to, from + pos, count); 596 ret = copy_to_user(to, from + pos, count);
597 if (ret == count) 597 if (ret == count)
598 return -EFAULT; 598 return -EFAULT;
599 count -= ret; 599 count -= ret;
600 *ppos = pos + count; 600 *ppos = pos + count;
601 return count; 601 return count;
602 } 602 }
603 603
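The hypothetical example_read handler referenced earlier reduces to a one-liner on top of this helper, which takes care of the bounds checks, short-copy handling and the *ppos update documented above:

static ssize_t example_read(struct file *file, char __user *buf,
			size_t len, loff_t *ppos)
{
	const char *msg = file->private_data;	/* set up by simple_open() */

	return simple_read_from_buffer(buf, len, ppos, msg, strlen(msg));
}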
604 /** 604 /**
605 * simple_write_to_buffer - copy data from user space to the buffer 605 * simple_write_to_buffer - copy data from user space to the buffer
606 * @to: the buffer to write to 606 * @to: the buffer to write to
607 * @available: the size of the buffer 607 * @available: the size of the buffer
608 * @ppos: the current position in the buffer 608 * @ppos: the current position in the buffer
609 * @from: the user space buffer to read from 609 * @from: the user space buffer to read from
610 * @count: the maximum number of bytes to read 610 * @count: the maximum number of bytes to read
611 * 611 *
612 * The simple_write_to_buffer() function reads up to @count bytes from the user 612 * The simple_write_to_buffer() function reads up to @count bytes from the user
613 * space address starting at @from into the buffer @to at offset @ppos. 613 * space address starting at @from into the buffer @to at offset @ppos.
614 * 614 *
615 * On success, the number of bytes written is returned and the offset @ppos is 615 * On success, the number of bytes written is returned and the offset @ppos is
616 * advanced by this number, or a negative value is returned on error. 616 * advanced by this number, or a negative value is returned on error.
617 **/ 617 **/
618 ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, 618 ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
619 const void __user *from, size_t count) 619 const void __user *from, size_t count)
620 { 620 {
621 loff_t pos = *ppos; 621 loff_t pos = *ppos;
622 size_t res; 622 size_t res;
623 623
624 if (pos < 0) 624 if (pos < 0)
625 return -EINVAL; 625 return -EINVAL;
626 if (pos >= available || !count) 626 if (pos >= available || !count)
627 return 0; 627 return 0;
628 if (count > available - pos) 628 if (count > available - pos)
629 count = available - pos; 629 count = available - pos;
630 res = copy_from_user(to + pos, from, count); 630 res = copy_from_user(to + pos, from, count);
631 if (res == count) 631 if (res == count)
632 return -EFAULT; 632 return -EFAULT;
633 count -= res; 633 count -= res;
634 *ppos = pos + count; 634 *ppos = pos + count;
635 return count; 635 return count;
636 } 636 }
637 637
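The write-side counterpart looks much the same; a sketch, where the 64-byte scratch buffer is an arbitrary assumption for illustration:

static ssize_t example_write(struct file *file, const char __user *buf,
			size_t len, loff_t *ppos)
{
	char kbuf[64] = "";	/* zero-filled, so it stays NUL-terminated */
	ssize_t ret;

	ret = simple_write_to_buffer(kbuf, sizeof(kbuf) - 1, ppos, buf, len);
	if (ret > 0)
		pr_info("example: received %zd bytes: %s\n", ret, kbuf);
	return ret;
}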
638 /** 638 /**
639 * memory_read_from_buffer - copy data from the buffer 639 * memory_read_from_buffer - copy data from the buffer
640 * @to: the kernel space buffer to read to 640 * @to: the kernel space buffer to read to
641 * @count: the maximum number of bytes to read 641 * @count: the maximum number of bytes to read
642 * @ppos: the current position in the buffer 642 * @ppos: the current position in the buffer
643 * @from: the buffer to read from 643 * @from: the buffer to read from
644 * @available: the size of the buffer 644 * @available: the size of the buffer
645 * 645 *
646 * The memory_read_from_buffer() function reads up to @count bytes from the 646 * The memory_read_from_buffer() function reads up to @count bytes from the
647 * buffer @from at offset @ppos into the kernel space address starting at @to. 647 * buffer @from at offset @ppos into the kernel space address starting at @to.
648 * 648 *
649 * On success, the number of bytes read is returned and the offset @ppos is 649 * On success, the number of bytes read is returned and the offset @ppos is
650 * advanced by this number, or a negative value is returned on error. 650 * advanced by this number, or a negative value is returned on error.
651 **/ 651 **/
652 ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, 652 ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
653 const void *from, size_t available) 653 const void *from, size_t available)
654 { 654 {
655 loff_t pos = *ppos; 655 loff_t pos = *ppos;
656 656
657 if (pos < 0) 657 if (pos < 0)
658 return -EINVAL; 658 return -EINVAL;
659 if (pos >= available) 659 if (pos >= available)
660 return 0; 660 return 0;
661 if (count > available - pos) 661 if (count > available - pos)
662 count = available - pos; 662 count = available - pos;
663 memcpy(to, from + pos, count); 663 memcpy(to, from + pos, count);
664 *ppos = pos + count; 664 *ppos = pos + count;
665 665
666 return count; 666 return count;
667 } 667 }
668 668
669 /* 669 /*
670 * Transaction-based IO. 670 * Transaction-based IO.
671 * The file expects a single write which triggers the transaction, and then 671 * The file expects a single write which triggers the transaction, and then
672 * possibly a read that collects the result, which is stored in a 672 * possibly a read that collects the result, which is stored in a
673 * file-local buffer. 673 * file-local buffer.
674 */ 674 */
675 675
676 void simple_transaction_set(struct file *file, size_t n) 676 void simple_transaction_set(struct file *file, size_t n)
677 { 677 {
678 struct simple_transaction_argresp *ar = file->private_data; 678 struct simple_transaction_argresp *ar = file->private_data;
679 679
680 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); 680 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
681 681
682 /* 682 /*
683 * The barrier ensures that ar->size will really remain zero until 683 * The barrier ensures that ar->size will really remain zero until
684 * ar->data is ready for reading. 684 * ar->data is ready for reading.
685 */ 685 */
686 smp_mb(); 686 smp_mb();
687 ar->size = n; 687 ar->size = n;
688 } 688 }
689 689
690 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) 690 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
691 { 691 {
692 struct simple_transaction_argresp *ar; 692 struct simple_transaction_argresp *ar;
693 static DEFINE_SPINLOCK(simple_transaction_lock); 693 static DEFINE_SPINLOCK(simple_transaction_lock);
694 694
695 if (size > SIMPLE_TRANSACTION_LIMIT - 1) 695 if (size > SIMPLE_TRANSACTION_LIMIT - 1)
696 return ERR_PTR(-EFBIG); 696 return ERR_PTR(-EFBIG);
697 697
698 ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); 698 ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
699 if (!ar) 699 if (!ar)
700 return ERR_PTR(-ENOMEM); 700 return ERR_PTR(-ENOMEM);
701 701
702 spin_lock(&simple_transaction_lock); 702 spin_lock(&simple_transaction_lock);
703 703
704 /* only one write allowed per open */ 704 /* only one write allowed per open */
705 if (file->private_data) { 705 if (file->private_data) {
706 spin_unlock(&simple_transaction_lock); 706 spin_unlock(&simple_transaction_lock);
707 free_page((unsigned long)ar); 707 free_page((unsigned long)ar);
708 return ERR_PTR(-EBUSY); 708 return ERR_PTR(-EBUSY);
709 } 709 }
710 710
711 file->private_data = ar; 711 file->private_data = ar;
712 712
713 spin_unlock(&simple_transaction_lock); 713 spin_unlock(&simple_transaction_lock);
714 714
715 if (copy_from_user(ar->data, buf, size)) 715 if (copy_from_user(ar->data, buf, size))
716 return ERR_PTR(-EFAULT); 716 return ERR_PTR(-EFAULT);
717 717
718 return ar->data; 718 return ar->data;
719 } 719 }
720 720
721 ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 721 ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
722 { 722 {
723 struct simple_transaction_argresp *ar = file->private_data; 723 struct simple_transaction_argresp *ar = file->private_data;
724 724
725 if (!ar) 725 if (!ar)
726 return 0; 726 return 0;
727 return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); 727 return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
728 } 728 }
729 729
730 int simple_transaction_release(struct inode *inode, struct file *file) 730 int simple_transaction_release(struct inode *inode, struct file *file)
731 { 731 {
732 free_page((unsigned long)file->private_data); 732 free_page((unsigned long)file->private_data);
733 return 0; 733 return 0;
734 } 734 }
735 735
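Putting the transaction pieces together, a hedged sketch in the style of fs/nfsd/nfsctl.c; example_handle_request() is a hypothetical function that consumes the request and writes its reply back into the same page:

static ssize_t example_transaction_write(struct file *file,
			const char __user *buf, size_t size, loff_t *pos)
{
	ssize_t rv;
	char *data = simple_transaction_get(file, buf, size);

	if (IS_ERR(data))
		return PTR_ERR(data);

	rv = example_handle_request(data, size);	/* hypothetical */
	if (rv >= 0) {
		/* publish rv bytes of reply for the subsequent read */
		simple_transaction_set(file, rv);
		rv = size;
	}
	return rv;
}

static const struct file_operations example_transaction_ops = {
	.write		= example_transaction_write,
	.read		= simple_transaction_read,
	.release	= simple_transaction_release,
	.llseek		= default_llseek,
};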
736 /* Simple attribute files */ 736 /* Simple attribute files */
737 737
738 struct simple_attr { 738 struct simple_attr {
739 int (*get)(void *, u64 *); 739 int (*get)(void *, u64 *);
740 int (*set)(void *, u64); 740 int (*set)(void *, u64);
741 char get_buf[24]; /* enough to store a u64 and "\n\0" */ 741 char get_buf[24]; /* enough to store a u64 and "\n\0" */
742 char set_buf[24]; 742 char set_buf[24];
743 void *data; 743 void *data;
744 const char *fmt; /* format for read operation */ 744 const char *fmt; /* format for read operation */
745 struct mutex mutex; /* protects access to these buffers */ 745 struct mutex mutex; /* protects access to these buffers */
746 }; 746 };
747 747
748 /* simple_attr_open is called by an actual attribute open file operation 748 /* simple_attr_open is called by an actual attribute open file operation
749 * to set the attribute-specific access operations. */ 749 * to set the attribute-specific access operations. */
750 int simple_attr_open(struct inode *inode, struct file *file, 750 int simple_attr_open(struct inode *inode, struct file *file,
751 int (*get)(void *, u64 *), int (*set)(void *, u64), 751 int (*get)(void *, u64 *), int (*set)(void *, u64),
752 const char *fmt) 752 const char *fmt)
753 { 753 {
754 struct simple_attr *attr; 754 struct simple_attr *attr;
755 755
756 attr = kmalloc(sizeof(*attr), GFP_KERNEL); 756 attr = kmalloc(sizeof(*attr), GFP_KERNEL);
757 if (!attr) 757 if (!attr)
758 return -ENOMEM; 758 return -ENOMEM;
759 759
760 attr->get = get; 760 attr->get = get;
761 attr->set = set; 761 attr->set = set;
762 attr->data = inode->i_private; 762 attr->data = inode->i_private;
763 attr->fmt = fmt; 763 attr->fmt = fmt;
764 mutex_init(&attr->mutex); 764 mutex_init(&attr->mutex);
765 765
766 file->private_data = attr; 766 file->private_data = attr;
767 767
768 return nonseekable_open(inode, file); 768 return nonseekable_open(inode, file);
769 } 769 }
770 770
771 int simple_attr_release(struct inode *inode, struct file *file) 771 int simple_attr_release(struct inode *inode, struct file *file)
772 { 772 {
773 kfree(file->private_data); 773 kfree(file->private_data);
774 return 0; 774 return 0;
775 } 775 }
776 776
777 /* read from the buffer that is filled with the get function */ 777 /* read from the buffer that is filled with the get function */
778 ssize_t simple_attr_read(struct file *file, char __user *buf, 778 ssize_t simple_attr_read(struct file *file, char __user *buf,
779 size_t len, loff_t *ppos) 779 size_t len, loff_t *ppos)
780 { 780 {
781 struct simple_attr *attr; 781 struct simple_attr *attr;
782 size_t size; 782 size_t size;
783 ssize_t ret; 783 ssize_t ret;
784 784
785 attr = file->private_data; 785 attr = file->private_data;
786 786
787 if (!attr->get) 787 if (!attr->get)
788 return -EACCES; 788 return -EACCES;
789 789
790 ret = mutex_lock_interruptible(&attr->mutex); 790 ret = mutex_lock_interruptible(&attr->mutex);
791 if (ret) 791 if (ret)
792 return ret; 792 return ret;
793 793
794 if (*ppos) { /* continued read */ 794 if (*ppos) { /* continued read */
795 size = strlen(attr->get_buf); 795 size = strlen(attr->get_buf);
796 } else { /* first read */ 796 } else { /* first read */
797 u64 val; 797 u64 val;
798 ret = attr->get(attr->data, &val); 798 ret = attr->get(attr->data, &val);
799 if (ret) 799 if (ret)
800 goto out; 800 goto out;
801 801
802 size = scnprintf(attr->get_buf, sizeof(attr->get_buf), 802 size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
803 attr->fmt, (unsigned long long)val); 803 attr->fmt, (unsigned long long)val);
804 } 804 }
805 805
806 ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); 806 ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
807 out: 807 out:
808 mutex_unlock(&attr->mutex); 808 mutex_unlock(&attr->mutex);
809 return ret; 809 return ret;
810 } 810 }
811 811
812 /* interpret the buffer as a number to call the set function with */ 812 /* interpret the buffer as a number to call the set function with */
813 ssize_t simple_attr_write(struct file *file, const char __user *buf, 813 ssize_t simple_attr_write(struct file *file, const char __user *buf,
814 size_t len, loff_t *ppos) 814 size_t len, loff_t *ppos)
815 { 815 {
816 struct simple_attr *attr; 816 struct simple_attr *attr;
817 u64 val; 817 u64 val;
818 size_t size; 818 size_t size;
819 ssize_t ret; 819 ssize_t ret;
820 820
821 attr = file->private_data; 821 attr = file->private_data;
822 if (!attr->set) 822 if (!attr->set)
823 return -EACCES; 823 return -EACCES;
824 824
825 ret = mutex_lock_interruptible(&attr->mutex); 825 ret = mutex_lock_interruptible(&attr->mutex);
826 if (ret) 826 if (ret)
827 return ret; 827 return ret;
828 828
829 ret = -EFAULT; 829 ret = -EFAULT;
830 size = min(sizeof(attr->set_buf) - 1, len); 830 size = min(sizeof(attr->set_buf) - 1, len);
831 if (copy_from_user(attr->set_buf, buf, size)) 831 if (copy_from_user(attr->set_buf, buf, size))
832 goto out; 832 goto out;
833 833
834 attr->set_buf[size] = '\0'; 834 attr->set_buf[size] = '\0';
835 val = simple_strtoll(attr->set_buf, NULL, 0); 835 val = simple_strtoll(attr->set_buf, NULL, 0);
836 ret = attr->set(attr->data, val); 836 ret = attr->set(attr->data, val);
837 if (ret == 0) 837 if (ret == 0)
838 ret = len; /* on success, claim we got the whole input */ 838 ret = len; /* on success, claim we got the whole input */
839 out: 839 out:
840 mutex_unlock(&attr->mutex); 840 mutex_unlock(&attr->mutex);
841 return ret; 841 return ret;
842 } 842 }
843 843
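In practice these four functions are generated rather than open-coded: the DEFINE_SIMPLE_ATTRIBUTE() macro in <linux/fs.h> wraps simple_attr_open/release/read/write into a complete file_operations. A sketch with invented names:

static u64 example_threshold;

static int example_threshold_get(void *data, u64 *val)
{
	*val = example_threshold;
	return 0;
}

static int example_threshold_set(void *data, u64 val)
{
	example_threshold = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(example_threshold_fops, example_threshold_get,
			example_threshold_set, "%llu\n");

The resulting example_threshold_fops would then typically be handed to debugfs_create_file().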
844 /** 844 /**
845 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation 845 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
846 * @sb: filesystem to do the file handle conversion on 846 * @sb: filesystem to do the file handle conversion on
847 * @fid: file handle to convert 847 * @fid: file handle to convert
848 * @fh_len: length of the file handle in bytes 848 * @fh_len: length of the file handle in bytes
849 * @fh_type: type of file handle 849 * @fh_type: type of file handle
850 * @get_inode: filesystem callback to retrieve inode 850 * @get_inode: filesystem callback to retrieve inode
851 * 851 *
852 * This function decodes @fid as long as it has one of the well-known 852 * This function decodes @fid as long as it has one of the well-known
853 * Linux filehandle types and calls @get_inode on it to retrieve the 853 * Linux filehandle types and calls @get_inode on it to retrieve the
854 * inode for the object specified in the file handle. 854 * inode for the object specified in the file handle.
855 */ 855 */
856 struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, 856 struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
857 int fh_len, int fh_type, struct inode *(*get_inode) 857 int fh_len, int fh_type, struct inode *(*get_inode)
858 (struct super_block *sb, u64 ino, u32 gen)) 858 (struct super_block *sb, u64 ino, u32 gen))
859 { 859 {
860 struct inode *inode = NULL; 860 struct inode *inode = NULL;
861 861
862 if (fh_len < 2) 862 if (fh_len < 2)
863 return NULL; 863 return NULL;
864 864
865 switch (fh_type) { 865 switch (fh_type) {
866 case FILEID_INO32_GEN: 866 case FILEID_INO32_GEN:
867 case FILEID_INO32_GEN_PARENT: 867 case FILEID_INO32_GEN_PARENT:
868 inode = get_inode(sb, fid->i32.ino, fid->i32.gen); 868 inode = get_inode(sb, fid->i32.ino, fid->i32.gen);
869 break; 869 break;
870 } 870 }
871 871
872 return d_obtain_alias(inode); 872 return d_obtain_alias(inode);
873 } 873 }
874 EXPORT_SYMBOL_GPL(generic_fh_to_dentry); 874 EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
875 875
876 /** 876 /**
877 * generic_fh_to_parent - generic helper for the fh_to_parent export operation 877 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
878 * @sb: filesystem to do the file handle conversion on 878 * @sb: filesystem to do the file handle conversion on
879 * @fid: file handle to convert 879 * @fid: file handle to convert
880 * @fh_len: length of the file handle in bytes 880 * @fh_len: length of the file handle in bytes
881 * @fh_type: type of file handle 881 * @fh_type: type of file handle
882 * @get_inode: filesystem callback to retrieve inode 882 * @get_inode: filesystem callback to retrieve inode
883 * 883 *
884 * This function decodes @fid as long as it has one of the well-known 884 * This function decodes @fid as long as it has one of the well-known
885 * Linux filehandle types and calls @get_inode on it to retrieve the 885 * Linux filehandle types and calls @get_inode on it to retrieve the
886 * inode for the _parent_ object specified in the file handle if it 886 * inode for the _parent_ object specified in the file handle if it
887 * is specified in the file handle, or NULL otherwise. 887 * is specified in the file handle, or NULL otherwise.
888 */ 888 */
889 struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, 889 struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
890 int fh_len, int fh_type, struct inode *(*get_inode) 890 int fh_len, int fh_type, struct inode *(*get_inode)
891 (struct super_block *sb, u64 ino, u32 gen)) 891 (struct super_block *sb, u64 ino, u32 gen))
892 { 892 {
893 struct inode *inode = NULL; 893 struct inode *inode = NULL;
894 894
895 if (fh_len <= 2) 895 if (fh_len <= 2)
896 return NULL; 896 return NULL;
897 897
898 switch (fh_type) { 898 switch (fh_type) {
899 case FILEID_INO32_GEN_PARENT: 899 case FILEID_INO32_GEN_PARENT:
900 inode = get_inode(sb, fid->i32.parent_ino, 900 inode = get_inode(sb, fid->i32.parent_ino,
901 (fh_len > 3 ? fid->i32.parent_gen : 0)); 901 (fh_len > 3 ? fid->i32.parent_gen : 0));
902 break; 902 break;
903 } 903 }
904 904
905 return d_obtain_alias(inode); 905 return d_obtain_alias(inode);
906 } 906 }
907 EXPORT_SYMBOL_GPL(generic_fh_to_parent); 907 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
908 908
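Filesystems wire these two helpers up through thin wrappers that supply their own inode lookup callback, as ext2 does; a sketch in which examplefs_nfs_get_inode() is an assumed callback with the (sb, ino, gen) signature shown above:

static struct dentry *examplefs_fh_to_dentry(struct super_block *sb,
			struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
			examplefs_nfs_get_inode);
}

static struct dentry *examplefs_fh_to_parent(struct super_block *sb,
			struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
			examplefs_nfs_get_inode);
}

static const struct export_operations examplefs_export_ops = {
	.fh_to_dentry	= examplefs_fh_to_dentry,
	.fh_to_parent	= examplefs_fh_to_parent,
};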
909 /** 909 /**
910 * generic_file_fsync - generic fsync implementation for simple filesystems 910 * generic_file_fsync - generic fsync implementation for simple filesystems
911 * @file: file to synchronize 911 * @file: file to synchronize
912 * @start: start offset of the byte range to synchronize 912 * @start: start offset of the byte range to synchronize
913 * @end: end offset of the byte range to synchronize (inclusive) 913 * @end: end offset of the byte range to synchronize (inclusive)
914 * @datasync: only synchronize essential metadata if true 914 * @datasync: only synchronize essential metadata if true
915 * 915 *
914 * This is a generic implementation of the fsync method for simple 914 * This is a generic implementation of the fsync method for simple
915 * filesystems which track all non-inode metadata in the buffers list 915 * filesystems which track all non-inode metadata in the buffers list
916 * hanging off the address_space structure. 916 * hanging off the address_space structure.
917 */ 917 */
918 int generic_file_fsync(struct file *file, loff_t start, loff_t end, 918 int generic_file_fsync(struct file *file, loff_t start, loff_t end,
919 int datasync) 919 int datasync)
920 { 920 {
921 struct inode *inode = file->f_mapping->host; 921 struct inode *inode = file->f_mapping->host;
922 int err; 922 int err;
923 int ret; 923 int ret;
924 924
925 err = filemap_write_and_wait_range(inode->i_mapping, start, end); 925 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
926 if (err) 926 if (err)
927 return err; 927 return err;
928 928
929 mutex_lock(&inode->i_mutex); 929 mutex_lock(&inode->i_mutex);
930 ret = sync_mapping_buffers(inode->i_mapping); 930 ret = sync_mapping_buffers(inode->i_mapping);
931 if (!(inode->i_state & I_DIRTY)) 931 if (!(inode->i_state & I_DIRTY))
932 goto out; 932 goto out;
933 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 933 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
934 goto out; 934 goto out;
935 935
936 err = sync_inode_metadata(inode, 1); 936 err = sync_inode_metadata(inode, 1);
937 if (ret == 0) 937 if (ret == 0)
938 ret = err; 938 ret = err;
939 out: 939 out:
940 mutex_unlock(&inode->i_mutex); 940 mutex_unlock(&inode->i_mutex);
941 return ret; 941 return ret;
942 } 942 }
943 EXPORT_SYMBOL(generic_file_fsync); 943 EXPORT_SYMBOL(generic_file_fsync);
944 944
945 /** 945 /**
946 * generic_check_addressable - Check addressability of file system 946 * generic_check_addressable - Check addressability of file system
947 * @blocksize_bits: log of file system block size 947 * @blocksize_bits: log of file system block size
948 * @num_blocks: number of blocks in file system 948 * @num_blocks: number of blocks in file system
949 * 949 *
950 * Determine whether a file system with @num_blocks blocks (and a 950 * Determine whether a file system with @num_blocks blocks (and a
951 * block size of 2**@blocksize_bits) is addressable by the sector_t 951 * block size of 2**@blocksize_bits) is addressable by the sector_t
952 * and page cache of the system. Return 0 if so and -EFBIG otherwise. 952 * and page cache of the system. Return 0 if so and -EFBIG otherwise.
953 */ 953 */
954 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) 954 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
955 { 955 {
956 u64 last_fs_block = num_blocks - 1; 956 u64 last_fs_block = num_blocks - 1;
957 u64 last_fs_page = 957 u64 last_fs_page =
958 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); 958 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
959 959
960 if (unlikely(num_blocks == 0)) 960 if (unlikely(num_blocks == 0))
961 return 0; 961 return 0;
962 962
963 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) 963 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
964 return -EINVAL; 964 return -EINVAL;
965 965
966 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || 966 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
967 (last_fs_page > (pgoff_t)(~0ULL))) { 967 (last_fs_page > (pgoff_t)(~0ULL))) {
968 return -EFBIG; 968 return -EFBIG;
969 } 969 }
970 return 0; 970 return 0;
971 } 971 }
972 EXPORT_SYMBOL(generic_check_addressable); 972 EXPORT_SYMBOL(generic_check_addressable);
973 973
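A typical call site sits in the mount path, before any I/O is issued; a sketch in which struct examplefs_super_block and its s_blocks_count field stand in for a filesystem's on-disk superblock:

static int examplefs_check_size(struct super_block *sb,
			struct examplefs_super_block *es)
{
	int err;

	/* refuse to mount a volume the page cache / sector_t can't address */
	err = generic_check_addressable(sb->s_blocksize_bits,
			le64_to_cpu(es->s_blocks_count));
	if (err)
		printk(KERN_ERR "examplefs: filesystem too large to mount\n");
	return err;
}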
974 /* 974 /*
975 * No-op implementation of ->fsync for in-memory filesystems. 975 * No-op implementation of ->fsync for in-memory filesystems.
976 */ 976 */
977 int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) 977 int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
978 { 978 {
979 return 0; 979 return 0;
980 } 980 }
981 981
982 EXPORT_SYMBOL(dcache_dir_close); 982 EXPORT_SYMBOL(dcache_dir_close);
983 EXPORT_SYMBOL(dcache_dir_lseek); 983 EXPORT_SYMBOL(dcache_dir_lseek);
984 EXPORT_SYMBOL(dcache_dir_open); 984 EXPORT_SYMBOL(dcache_dir_open);
985 EXPORT_SYMBOL(dcache_readdir); 985 EXPORT_SYMBOL(dcache_readdir);
986 EXPORT_SYMBOL(generic_read_dir); 986 EXPORT_SYMBOL(generic_read_dir);
987 EXPORT_SYMBOL(mount_pseudo); 987 EXPORT_SYMBOL(mount_pseudo);
988 EXPORT_SYMBOL(simple_write_begin); 988 EXPORT_SYMBOL(simple_write_begin);
989 EXPORT_SYMBOL(simple_write_end); 989 EXPORT_SYMBOL(simple_write_end);
990 EXPORT_SYMBOL(simple_dir_inode_operations); 990 EXPORT_SYMBOL(simple_dir_inode_operations);
991 EXPORT_SYMBOL(simple_dir_operations); 991 EXPORT_SYMBOL(simple_dir_operations);
992 EXPORT_SYMBOL(simple_empty); 992 EXPORT_SYMBOL(simple_empty);
993 EXPORT_SYMBOL(simple_fill_super); 993 EXPORT_SYMBOL(simple_fill_super);
994 EXPORT_SYMBOL(simple_getattr); 994 EXPORT_SYMBOL(simple_getattr);
995 EXPORT_SYMBOL(simple_open); 995 EXPORT_SYMBOL(simple_open);
996 EXPORT_SYMBOL(simple_link); 996 EXPORT_SYMBOL(simple_link);
997 EXPORT_SYMBOL(simple_lookup); 997 EXPORT_SYMBOL(simple_lookup);
998 EXPORT_SYMBOL(simple_pin_fs); 998 EXPORT_SYMBOL(simple_pin_fs);
999 EXPORT_SYMBOL(simple_readpage); 999 EXPORT_SYMBOL(simple_readpage);
1000 EXPORT_SYMBOL(simple_release_fs); 1000 EXPORT_SYMBOL(simple_release_fs);
1001 EXPORT_SYMBOL(simple_rename); 1001 EXPORT_SYMBOL(simple_rename);
1002 EXPORT_SYMBOL(simple_rmdir); 1002 EXPORT_SYMBOL(simple_rmdir);
1003 EXPORT_SYMBOL(simple_statfs); 1003 EXPORT_SYMBOL(simple_statfs);
1004 EXPORT_SYMBOL(noop_fsync); 1004 EXPORT_SYMBOL(noop_fsync);
1005 EXPORT_SYMBOL(simple_unlink); 1005 EXPORT_SYMBOL(simple_unlink);
1006 EXPORT_SYMBOL(simple_read_from_buffer); 1006 EXPORT_SYMBOL(simple_read_from_buffer);
1007 EXPORT_SYMBOL(simple_write_to_buffer); 1007 EXPORT_SYMBOL(simple_write_to_buffer);
1008 EXPORT_SYMBOL(memory_read_from_buffer); 1008 EXPORT_SYMBOL(memory_read_from_buffer);
1009 EXPORT_SYMBOL(simple_transaction_set); 1009 EXPORT_SYMBOL(simple_transaction_set);
1010 EXPORT_SYMBOL(simple_transaction_get); 1010 EXPORT_SYMBOL(simple_transaction_get);
1011 EXPORT_SYMBOL(simple_transaction_read); 1011 EXPORT_SYMBOL(simple_transaction_read);
1012 EXPORT_SYMBOL(simple_transaction_release); 1012 EXPORT_SYMBOL(simple_transaction_release);
1013 EXPORT_SYMBOL_GPL(simple_attr_open); 1013 EXPORT_SYMBOL_GPL(simple_attr_open);
1014 EXPORT_SYMBOL_GPL(simple_attr_release); 1014 EXPORT_SYMBOL_GPL(simple_attr_release);
1015 EXPORT_SYMBOL_GPL(simple_attr_read); 1015 EXPORT_SYMBOL_GPL(simple_attr_read);
1016 EXPORT_SYMBOL_GPL(simple_attr_write); 1016 EXPORT_SYMBOL_GPL(simple_attr_write);
1017 1017
1 /* 1 /*
2 * linux/fs/nfs/dir.c 2 * linux/fs/nfs/dir.c
3 * 3 *
4 * Copyright (C) 1992 Rick Sladkey 4 * Copyright (C) 1992 Rick Sladkey
5 * 5 *
6 * nfs directory handling functions 6 * nfs directory handling functions
7 * 7 *
8 * 10 Apr 1996 Added silly rename for unlink --okir 8 * 10 Apr 1996 Added silly rename for unlink --okir
9 * 28 Sep 1996 Improved directory cache --okir 9 * 28 Sep 1996 Improved directory cache --okir
10 * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de 10 * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de
11 * Re-implemented silly rename for unlink, newly implemented 11 * Re-implemented silly rename for unlink, newly implemented
12 * silly rename for nfs_rename() following the suggestions 12 * silly rename for nfs_rename() following the suggestions
13 * of Olaf Kirch (okir) found in this file. 13 * of Olaf Kirch (okir) found in this file.
14 * Following Linus comments on my original hack, this version 14 * Following Linus comments on my original hack, this version
15 * depends only on the dcache stuff and doesn't touch the inode 15 * depends only on the dcache stuff and doesn't touch the inode
16 * layer (iput() and friends). 16 * layer (iput() and friends).
17 * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM 17 * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM
18 */ 18 */
19 19
20 #include <linux/module.h> 20 #include <linux/module.h>
21 #include <linux/time.h> 21 #include <linux/time.h>
22 #include <linux/errno.h> 22 #include <linux/errno.h>
23 #include <linux/stat.h> 23 #include <linux/stat.h>
24 #include <linux/fcntl.h> 24 #include <linux/fcntl.h>
25 #include <linux/string.h> 25 #include <linux/string.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/mm.h> 28 #include <linux/mm.h>
29 #include <linux/sunrpc/clnt.h> 29 #include <linux/sunrpc/clnt.h>
30 #include <linux/nfs_fs.h> 30 #include <linux/nfs_fs.h>
31 #include <linux/nfs_mount.h> 31 #include <linux/nfs_mount.h>
32 #include <linux/pagemap.h> 32 #include <linux/pagemap.h>
33 #include <linux/pagevec.h> 33 #include <linux/pagevec.h>
34 #include <linux/namei.h> 34 #include <linux/namei.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/sched.h> 36 #include <linux/sched.h>
37 #include <linux/kmemleak.h> 37 #include <linux/kmemleak.h>
38 #include <linux/xattr.h> 38 #include <linux/xattr.h>
39 39
40 #include "delegation.h" 40 #include "delegation.h"
41 #include "iostat.h" 41 #include "iostat.h"
42 #include "internal.h" 42 #include "internal.h"
43 #include "fscache.h" 43 #include "fscache.h"
44 44
45 /* #define NFS_DEBUG_VERBOSE 1 */ 45 /* #define NFS_DEBUG_VERBOSE 1 */
46 46
47 static int nfs_opendir(struct inode *, struct file *); 47 static int nfs_opendir(struct inode *, struct file *);
48 static int nfs_closedir(struct inode *, struct file *); 48 static int nfs_closedir(struct inode *, struct file *);
49 static int nfs_readdir(struct file *, void *, filldir_t); 49 static int nfs_readdir(struct file *, void *, filldir_t);
50 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); 50 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
51 static loff_t nfs_llseek_dir(struct file *, loff_t, int); 51 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
52 static void nfs_readdir_clear_array(struct page*); 52 static void nfs_readdir_clear_array(struct page*);
53 53
54 const struct file_operations nfs_dir_operations = { 54 const struct file_operations nfs_dir_operations = {
55 .llseek = nfs_llseek_dir, 55 .llseek = nfs_llseek_dir,
56 .read = generic_read_dir, 56 .read = generic_read_dir,
57 .readdir = nfs_readdir, 57 .readdir = nfs_readdir,
58 .open = nfs_opendir, 58 .open = nfs_opendir,
59 .release = nfs_closedir, 59 .release = nfs_closedir,
60 .fsync = nfs_fsync_dir, 60 .fsync = nfs_fsync_dir,
61 }; 61 };
62 62
63 const struct address_space_operations nfs_dir_aops = { 63 const struct address_space_operations nfs_dir_aops = {
64 .freepage = nfs_readdir_clear_array, 64 .freepage = nfs_readdir_clear_array,
65 }; 65 };
66 66
67 static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) 67 static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
68 { 68 {
69 struct nfs_open_dir_context *ctx; 69 struct nfs_open_dir_context *ctx;
70 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 70 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
71 if (ctx != NULL) { 71 if (ctx != NULL) {
72 ctx->duped = 0; 72 ctx->duped = 0;
73 ctx->attr_gencount = NFS_I(dir)->attr_gencount; 73 ctx->attr_gencount = NFS_I(dir)->attr_gencount;
74 ctx->dir_cookie = 0; 74 ctx->dir_cookie = 0;
75 ctx->dup_cookie = 0; 75 ctx->dup_cookie = 0;
76 ctx->cred = get_rpccred(cred); 76 ctx->cred = get_rpccred(cred);
77 return ctx; 77 return ctx;
78 } 78 }
79 return ERR_PTR(-ENOMEM); 79 return ERR_PTR(-ENOMEM);
80 } 80 }
81 81
82 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) 82 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
83 { 83 {
84 put_rpccred(ctx->cred); 84 put_rpccred(ctx->cred);
85 kfree(ctx); 85 kfree(ctx);
86 } 86 }
87 87
88 /* 88 /*
89 * Open file 89 * Open file
90 */ 90 */
91 static int 91 static int
92 nfs_opendir(struct inode *inode, struct file *filp) 92 nfs_opendir(struct inode *inode, struct file *filp)
93 { 93 {
94 int res = 0; 94 int res = 0;
95 struct nfs_open_dir_context *ctx; 95 struct nfs_open_dir_context *ctx;
96 struct rpc_cred *cred; 96 struct rpc_cred *cred;
97 97
98 dfprintk(FILE, "NFS: open dir(%s/%s)\n", 98 dfprintk(FILE, "NFS: open dir(%s/%s)\n",
99 filp->f_path.dentry->d_parent->d_name.name, 99 filp->f_path.dentry->d_parent->d_name.name,
100 filp->f_path.dentry->d_name.name); 100 filp->f_path.dentry->d_name.name);
101 101
102 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 102 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
103 103
104 cred = rpc_lookup_cred(); 104 cred = rpc_lookup_cred();
105 if (IS_ERR(cred)) 105 if (IS_ERR(cred))
106 return PTR_ERR(cred); 106 return PTR_ERR(cred);
107 ctx = alloc_nfs_open_dir_context(inode, cred); 107 ctx = alloc_nfs_open_dir_context(inode, cred);
108 if (IS_ERR(ctx)) { 108 if (IS_ERR(ctx)) {
109 res = PTR_ERR(ctx); 109 res = PTR_ERR(ctx);
110 goto out; 110 goto out;
111 } 111 }
112 filp->private_data = ctx; 112 filp->private_data = ctx;
113 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { 113 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
114 /* This is a mountpoint, so d_revalidate will never 114 /* This is a mountpoint, so d_revalidate will never
115 * have been called, so we need to refresh the 115 * have been called, so we need to refresh the
116 * inode (for close-open consistency) ourselves. 116 * inode (for close-open consistency) ourselves.
117 */ 117 */
118 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 118 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
119 } 119 }
120 out: 120 out:
121 put_rpccred(cred); 121 put_rpccred(cred);
122 return res; 122 return res;
123 } 123 }
124 124
125 static int 125 static int
126 nfs_closedir(struct inode *inode, struct file *filp) 126 nfs_closedir(struct inode *inode, struct file *filp)
127 { 127 {
128 put_nfs_open_dir_context(filp->private_data); 128 put_nfs_open_dir_context(filp->private_data);
129 return 0; 129 return 0;
130 } 130 }
131 131
132 struct nfs_cache_array_entry { 132 struct nfs_cache_array_entry {
133 u64 cookie; 133 u64 cookie;
134 u64 ino; 134 u64 ino;
135 struct qstr string; 135 struct qstr string;
136 unsigned char d_type; 136 unsigned char d_type;
137 }; 137 };
138 138
139 struct nfs_cache_array { 139 struct nfs_cache_array {
140 int size; 140 int size;
141 int eof_index; 141 int eof_index;
142 u64 last_cookie; 142 u64 last_cookie;
143 struct nfs_cache_array_entry array[0]; 143 struct nfs_cache_array_entry array[0];
144 }; 144 };
145 145
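A small illustrative helper, not in the patch, making explicit how many entries the zero-length array above can hold per page:

/* Illustrative only: entries that fit in one cache page.
 * nfs_readdir_add_to_array() below performs the equivalent bound check. */
static inline unsigned int nfs_readdir_array_capacity(void)
{
	return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
		sizeof(struct nfs_cache_array_entry);
}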
146 typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); 146 typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
147 typedef struct { 147 typedef struct {
148 struct file *file; 148 struct file *file;
149 struct page *page; 149 struct page *page;
150 unsigned long page_index; 150 unsigned long page_index;
151 u64 *dir_cookie; 151 u64 *dir_cookie;
152 u64 last_cookie; 152 u64 last_cookie;
153 loff_t current_index; 153 loff_t current_index;
154 decode_dirent_t decode; 154 decode_dirent_t decode;
155 155
156 unsigned long timestamp; 156 unsigned long timestamp;
157 unsigned long gencount; 157 unsigned long gencount;
158 unsigned int cache_entry_index; 158 unsigned int cache_entry_index;
159 unsigned int plus:1; 159 unsigned int plus:1;
160 unsigned int eof:1; 160 unsigned int eof:1;
161 } nfs_readdir_descriptor_t; 161 } nfs_readdir_descriptor_t;
162 162
163 /* 163 /*
164 * The caller is responsible for calling nfs_readdir_release_array(page) 164 * The caller is responsible for calling nfs_readdir_release_array(page)
165 */ 165 */
166 static 166 static
167 struct nfs_cache_array *nfs_readdir_get_array(struct page *page) 167 struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
168 { 168 {
169 void *ptr; 169 void *ptr;
170 if (page == NULL) 170 if (page == NULL)
171 return ERR_PTR(-EIO); 171 return ERR_PTR(-EIO);
172 ptr = kmap(page); 172 ptr = kmap(page);
173 if (ptr == NULL) 173 if (ptr == NULL)
174 return ERR_PTR(-ENOMEM); 174 return ERR_PTR(-ENOMEM);
175 return ptr; 175 return ptr;
176 } 176 }
177 177
178 static 178 static
179 void nfs_readdir_release_array(struct page *page) 179 void nfs_readdir_release_array(struct page *page)
180 { 180 {
181 kunmap(page); 181 kunmap(page);
182 } 182 }
183 183
184 /* 184 /*
185 * we are freeing strings created by nfs_readdir_add_to_array() 185 * we are freeing strings created by nfs_readdir_add_to_array()
186 */ 186 */
187 static 187 static
188 void nfs_readdir_clear_array(struct page *page) 188 void nfs_readdir_clear_array(struct page *page)
189 { 189 {
190 struct nfs_cache_array *array; 190 struct nfs_cache_array *array;
191 int i; 191 int i;
192 192
193 array = kmap_atomic(page); 193 array = kmap_atomic(page);
194 for (i = 0; i < array->size; i++) 194 for (i = 0; i < array->size; i++)
195 kfree(array->array[i].string.name); 195 kfree(array->array[i].string.name);
196 kunmap_atomic(array); 196 kunmap_atomic(array);
197 } 197 }
198 198
199 /* 199 /*
200 * the caller is responsible for freeing qstr.name 200 * the caller is responsible for freeing qstr.name
201 * when called by nfs_readdir_add_to_array, the strings will be freed in 201 * when called by nfs_readdir_add_to_array, the strings will be freed in
202 * nfs_readdir_clear_array() 202 * nfs_readdir_clear_array()
203 */ 203 */
204 static 204 static
205 int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) 205 int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
206 { 206 {
207 string->len = len; 207 string->len = len;
208 string->name = kmemdup(name, len, GFP_KERNEL); 208 string->name = kmemdup(name, len, GFP_KERNEL);
209 if (string->name == NULL) 209 if (string->name == NULL)
210 return -ENOMEM; 210 return -ENOMEM;
211 /* 211 /*
212 * Avoid a kmemleak false positive. The pointer to the name is stored 212 * Avoid a kmemleak false positive. The pointer to the name is stored
213 * in a page cache page which kmemleak does not scan. 213 * in a page cache page which kmemleak does not scan.
214 */ 214 */
215 kmemleak_not_leak(string->name); 215 kmemleak_not_leak(string->name);
216 string->hash = full_name_hash(name, len); 216 string->hash = full_name_hash(name, len);
217 return 0; 217 return 0;
218 } 218 }
219 219
220 static 220 static
221 int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) 221 int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
222 { 222 {
223 struct nfs_cache_array *array = nfs_readdir_get_array(page); 223 struct nfs_cache_array *array = nfs_readdir_get_array(page);
224 struct nfs_cache_array_entry *cache_entry; 224 struct nfs_cache_array_entry *cache_entry;
225 int ret; 225 int ret;
226 226
227 if (IS_ERR(array)) 227 if (IS_ERR(array))
228 return PTR_ERR(array); 228 return PTR_ERR(array);
229 229
230 cache_entry = &array->array[array->size]; 230 cache_entry = &array->array[array->size];
231 231
232 /* Check that this entry lies within the page bounds */ 232 /* Check that this entry lies within the page bounds */
233 ret = -ENOSPC; 233 ret = -ENOSPC;
234 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) 234 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
235 goto out; 235 goto out;
236 236
237 cache_entry->cookie = entry->prev_cookie; 237 cache_entry->cookie = entry->prev_cookie;
238 cache_entry->ino = entry->ino; 238 cache_entry->ino = entry->ino;
239 cache_entry->d_type = entry->d_type; 239 cache_entry->d_type = entry->d_type;
240 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); 240 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
241 if (ret) 241 if (ret)
242 goto out; 242 goto out;
243 array->last_cookie = entry->cookie; 243 array->last_cookie = entry->cookie;
244 array->size++; 244 array->size++;
245 if (entry->eof != 0) 245 if (entry->eof != 0)
246 array->eof_index = array->size; 246 array->eof_index = array->size;
247 out: 247 out:
248 nfs_readdir_release_array(page); 248 nfs_readdir_release_array(page);
249 return ret; 249 return ret;
250 } 250 }
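The page-bounds check above is what caps an array: entries are appended until the next slot would cross PAGE_SIZE, at which point -ENOSPC tells the caller the page is full. Assuming array[] is the trailing member of struct nfs_cache_array (the definition is not shown here), the implied capacity is roughly:

	/* sketch: entries that fit in one cache page */
	max = (PAGE_SIZE - sizeof(struct nfs_cache_array))
		/ sizeof(struct nfs_cache_array_entry);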
251 251
252 static 252 static
253 int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 253 int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
254 { 254 {
255 loff_t diff = desc->file->f_pos - desc->current_index; 255 loff_t diff = desc->file->f_pos - desc->current_index;
256 unsigned int index; 256 unsigned int index;
257 257
258 if (diff < 0) 258 if (diff < 0)
259 goto out_eof; 259 goto out_eof;
260 if (diff >= array->size) { 260 if (diff >= array->size) {
261 if (array->eof_index >= 0) 261 if (array->eof_index >= 0)
262 goto out_eof; 262 goto out_eof;
263 return -EAGAIN; 263 return -EAGAIN;
264 } 264 }
265 265
266 index = (unsigned int)diff; 266 index = (unsigned int)diff;
267 *desc->dir_cookie = array->array[index].cookie; 267 *desc->dir_cookie = array->array[index].cookie;
268 desc->cache_entry_index = index; 268 desc->cache_entry_index = index;
269 return 0; 269 return 0;
270 out_eof: 270 out_eof:
271 desc->eof = 1; 271 desc->eof = 1;
272 return -EBADCOOKIE; 272 return -EBADCOOKIE;
273 } 273 }
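The position-to-slot mapping is plain arithmetic: desc->current_index is the entry number of slot 0 of this page's array, so the wanted slot is f_pos minus that base. A worked example with hypothetical numbers:

	/* f_pos = 250, current_index = 240, array->size = 20 */
	diff = 250 - 240;			/* slot 10, within this page */
	*desc->dir_cookie = array->array[10].cookie;
	/* f_pos = 265 would give diff = 25 >= size: -EAGAIN, try next page */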
274 274
275 static 275 static
276 int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 276 int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
277 { 277 {
278 int i; 278 int i;
279 loff_t new_pos; 279 loff_t new_pos;
280 int status = -EAGAIN; 280 int status = -EAGAIN;
281 281
282 for (i = 0; i < array->size; i++) { 282 for (i = 0; i < array->size; i++) {
283 if (array->array[i].cookie == *desc->dir_cookie) { 283 if (array->array[i].cookie == *desc->dir_cookie) {
284 struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); 284 struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
285 struct nfs_open_dir_context *ctx = desc->file->private_data; 285 struct nfs_open_dir_context *ctx = desc->file->private_data;
286 286
287 new_pos = desc->current_index + i; 287 new_pos = desc->current_index + i;
288 if (ctx->attr_gencount != nfsi->attr_gencount 288 if (ctx->attr_gencount != nfsi->attr_gencount
289 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { 289 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
290 ctx->duped = 0; 290 ctx->duped = 0;
291 ctx->attr_gencount = nfsi->attr_gencount; 291 ctx->attr_gencount = nfsi->attr_gencount;
292 } else if (new_pos < desc->file->f_pos) { 292 } else if (new_pos < desc->file->f_pos) {
293 if (ctx->duped > 0 293 if (ctx->duped > 0
294 && ctx->dup_cookie == *desc->dir_cookie) { 294 && ctx->dup_cookie == *desc->dir_cookie) {
295 if (printk_ratelimit()) { 295 if (printk_ratelimit()) {
296 pr_notice("NFS: directory %s/%s contains a readdir loop. " 296 pr_notice("NFS: directory %s/%s contains a readdir loop. "
297 "Please contact your server vendor. " 297 "Please contact your server vendor. "
298 "The file: %s has duplicate cookie %llu\n", 298 "The file: %s has duplicate cookie %llu\n",
299 desc->file->f_dentry->d_parent->d_name.name, 299 desc->file->f_dentry->d_parent->d_name.name,
300 desc->file->f_dentry->d_name.name, 300 desc->file->f_dentry->d_name.name,
301 array->array[i].string.name, 301 array->array[i].string.name,
302 *desc->dir_cookie); 302 *desc->dir_cookie);
303 } 303 }
304 status = -ELOOP; 304 status = -ELOOP;
305 goto out; 305 goto out;
306 } 306 }
307 ctx->dup_cookie = *desc->dir_cookie; 307 ctx->dup_cookie = *desc->dir_cookie;
308 ctx->duped = -1; 308 ctx->duped = -1;
309 } 309 }
310 desc->file->f_pos = new_pos; 310 desc->file->f_pos = new_pos;
311 desc->cache_entry_index = i; 311 desc->cache_entry_index = i;
312 return 0; 312 return 0;
313 } 313 }
314 } 314 }
315 if (array->eof_index >= 0) { 315 if (array->eof_index >= 0) {
316 status = -EBADCOOKIE; 316 status = -EBADCOOKIE;
317 if (*desc->dir_cookie == array->last_cookie) 317 if (*desc->dir_cookie == array->last_cookie)
318 desc->eof = 1; 318 desc->eof = 1;
319 } 319 }
320 out: 320 out:
321 return status; 321 return status;
322 } 322 }
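The loop detection above shares state with nfs_do_filldir() below through ctx->duped; read together, the field behaves as a three-state flag. A sketch of that reading (an interpretation of the code here, not an authoritative spec):

	/*
	 * ctx->duped ==  0: no suspected duplicate recorded
	 * ctx->duped == -1: a cookie reappeared at an earlier f_pos and was
	 *                   recorded in ctx->dup_cookie; nothing emitted yet
	 * ctx->duped ==  1: entries have been emitted since the record (set
	 *                   in nfs_do_filldir()); hitting the same cookie
	 *                   again now fails with -ELOOP
	 */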
323 323
324 static 324 static
325 int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) 325 int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
326 { 326 {
327 struct nfs_cache_array *array; 327 struct nfs_cache_array *array;
328 int status; 328 int status;
329 329
330 array = nfs_readdir_get_array(desc->page); 330 array = nfs_readdir_get_array(desc->page);
331 if (IS_ERR(array)) { 331 if (IS_ERR(array)) {
332 status = PTR_ERR(array); 332 status = PTR_ERR(array);
333 goto out; 333 goto out;
334 } 334 }
335 335
336 if (*desc->dir_cookie == 0) 336 if (*desc->dir_cookie == 0)
337 status = nfs_readdir_search_for_pos(array, desc); 337 status = nfs_readdir_search_for_pos(array, desc);
338 else 338 else
339 status = nfs_readdir_search_for_cookie(array, desc); 339 status = nfs_readdir_search_for_cookie(array, desc);
340 340
341 if (status == -EAGAIN) { 341 if (status == -EAGAIN) {
342 desc->last_cookie = array->last_cookie; 342 desc->last_cookie = array->last_cookie;
343 desc->current_index += array->size; 343 desc->current_index += array->size;
344 desc->page_index++; 344 desc->page_index++;
345 } 345 }
346 nfs_readdir_release_array(desc->page); 346 nfs_readdir_release_array(desc->page);
347 out: 347 out:
348 return status; 348 return status;
349 } 349 }
350 350
351 /* Fill a page with xdr information before transferring to the cache page */ 351 /* Fill a page with xdr information before transferring to the cache page */
352 static 352 static
353 int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, 353 int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
354 struct nfs_entry *entry, struct file *file, struct inode *inode) 354 struct nfs_entry *entry, struct file *file, struct inode *inode)
355 { 355 {
356 struct nfs_open_dir_context *ctx = file->private_data; 356 struct nfs_open_dir_context *ctx = file->private_data;
357 struct rpc_cred *cred = ctx->cred; 357 struct rpc_cred *cred = ctx->cred;
358 unsigned long timestamp, gencount; 358 unsigned long timestamp, gencount;
359 int error; 359 int error;
360 360
361 again: 361 again:
362 timestamp = jiffies; 362 timestamp = jiffies;
363 gencount = nfs_inc_attr_generation_counter(); 363 gencount = nfs_inc_attr_generation_counter();
364 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, 364 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
365 NFS_SERVER(inode)->dtsize, desc->plus); 365 NFS_SERVER(inode)->dtsize, desc->plus);
366 if (error < 0) { 366 if (error < 0) {
367 /* We requested READDIRPLUS, but the server doesn't grok it */ 367 /* We requested READDIRPLUS, but the server doesn't grok it */
368 if (error == -ENOTSUPP && desc->plus) { 368 if (error == -ENOTSUPP && desc->plus) {
369 NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; 369 NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
370 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 370 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
371 desc->plus = 0; 371 desc->plus = 0;
372 goto again; 372 goto again;
373 } 373 }
374 goto error; 374 goto error;
375 } 375 }
376 desc->timestamp = timestamp; 376 desc->timestamp = timestamp;
377 desc->gencount = gencount; 377 desc->gencount = gencount;
378 error: 378 error:
379 return error; 379 return error;
380 } 380 }
381 381
382 static int xdr_decode(nfs_readdir_descriptor_t *desc, 382 static int xdr_decode(nfs_readdir_descriptor_t *desc,
383 struct nfs_entry *entry, struct xdr_stream *xdr) 383 struct nfs_entry *entry, struct xdr_stream *xdr)
384 { 384 {
385 int error; 385 int error;
386 386
387 error = desc->decode(xdr, entry, desc->plus); 387 error = desc->decode(xdr, entry, desc->plus);
388 if (error) 388 if (error)
389 return error; 389 return error;
390 entry->fattr->time_start = desc->timestamp; 390 entry->fattr->time_start = desc->timestamp;
391 entry->fattr->gencount = desc->gencount; 391 entry->fattr->gencount = desc->gencount;
392 return 0; 392 return 0;
393 } 393 }
394 394
395 static 395 static
396 int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) 396 int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
397 { 397 {
398 if (dentry->d_inode == NULL) 398 if (dentry->d_inode == NULL)
399 goto different; 399 goto different;
400 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) 400 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
401 goto different; 401 goto different;
402 return 1; 402 return 1;
403 different: 403 different:
404 return 0; 404 return 0;
405 } 405 }
406 406
407 static 407 static
408 bool nfs_use_readdirplus(struct inode *dir, struct file *filp) 408 bool nfs_use_readdirplus(struct inode *dir, struct file *filp)
409 { 409 {
410 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) 410 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
411 return false; 411 return false;
412 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) 412 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
413 return true; 413 return true;
414 if (filp->f_pos == 0) 414 if (filp->f_pos == 0)
415 return true; 415 return true;
416 return false; 416 return false;
417 } 417 }
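Condensed, the policy is: use READDIRPLUS only against capable servers, and only when the lookup path has asked for it or a fresh traversal is starting. Note that the advise bit is consumed by test_and_clear_bit(), so one hint buys one readdir pass. As a one-line sketch (names hypothetical):

	use_plus = server_capable && (advise_bit_consumed || filp->f_pos == 0);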
418 418
419 /* 419 /*
420 * This function is called by the lookup code to request the use of 420 * This function is called by the lookup code to request the use of
421 * readdirplus to accelerate any future lookups in the same 421 * readdirplus to accelerate any future lookups in the same
422 * directory. 422 * directory.
423 */ 423 */
424 static 424 static
425 void nfs_advise_use_readdirplus(struct inode *dir) 425 void nfs_advise_use_readdirplus(struct inode *dir)
426 { 426 {
427 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); 427 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
428 } 428 }
429 429
430 static 430 static
431 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) 431 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
432 { 432 {
433 struct qstr filename = QSTR_INIT(entry->name, entry->len); 433 struct qstr filename = QSTR_INIT(entry->name, entry->len);
434 struct dentry *dentry; 434 struct dentry *dentry;
435 struct dentry *alias; 435 struct dentry *alias;
436 struct inode *dir = parent->d_inode; 436 struct inode *dir = parent->d_inode;
437 struct inode *inode; 437 struct inode *inode;
438 438
439 if (filename.name[0] == '.') { 439 if (filename.name[0] == '.') {
440 if (filename.len == 1) 440 if (filename.len == 1)
441 return; 441 return;
442 if (filename.len == 2 && filename.name[1] == '.') 442 if (filename.len == 2 && filename.name[1] == '.')
443 return; 443 return;
444 } 444 }
445 filename.hash = full_name_hash(filename.name, filename.len); 445 filename.hash = full_name_hash(filename.name, filename.len);
446 446
447 dentry = d_lookup(parent, &filename); 447 dentry = d_lookup(parent, &filename);
448 if (dentry != NULL) { 448 if (dentry != NULL) {
449 if (nfs_same_file(dentry, entry)) { 449 if (nfs_same_file(dentry, entry)) {
450 nfs_refresh_inode(dentry->d_inode, entry->fattr); 450 nfs_refresh_inode(dentry->d_inode, entry->fattr);
451 goto out; 451 goto out;
452 } else { 452 } else {
453 if (d_invalidate(dentry) != 0) 453 if (d_invalidate(dentry) != 0)
454 goto out; 454 goto out;
455 dput(dentry); 455 dput(dentry);
456 } 456 }
457 } 457 }
458 458
459 dentry = d_alloc(parent, &filename); 459 dentry = d_alloc(parent, &filename);
460 if (dentry == NULL) 460 if (dentry == NULL)
461 return; 461 return;
462 462
463 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 463 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
464 if (IS_ERR(inode)) 464 if (IS_ERR(inode))
465 goto out; 465 goto out;
466 466
467 alias = d_materialise_unique(dentry, inode); 467 alias = d_materialise_unique(dentry, inode);
468 if (IS_ERR(alias)) 468 if (IS_ERR(alias))
469 goto out; 469 goto out;
470 else if (alias) { 470 else if (alias) {
471 nfs_set_verifier(alias, nfs_save_change_attribute(dir)); 471 nfs_set_verifier(alias, nfs_save_change_attribute(dir));
472 dput(alias); 472 dput(alias);
473 } else 473 } else
474 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 474 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
475 475
476 out: 476 out:
477 dput(dentry); 477 dput(dentry);
478 } 478 }
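The '.'-handling at the top mirrors what every readdir consumer must do; pulled out on its own, the skip test is (hypothetical helper, equivalent to the checks above):

	/* hypothetical helper: true for "." and ".." */
	static bool is_dot_or_dotdot(const struct qstr *name)
	{
		return name->name[0] == '.' &&
		       (name->len == 1 ||
			(name->len == 2 && name->name[1] == '.'));
	}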
479 479
480 /* Perform conversion from xdr to cache array */ 480 /* Perform conversion from xdr to cache array */
481 static 481 static
482 int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, 482 int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
483 struct page **xdr_pages, struct page *page, unsigned int buflen) 483 struct page **xdr_pages, struct page *page, unsigned int buflen)
484 { 484 {
485 struct xdr_stream stream; 485 struct xdr_stream stream;
486 struct xdr_buf buf; 486 struct xdr_buf buf;
487 struct page *scratch; 487 struct page *scratch;
488 struct nfs_cache_array *array; 488 struct nfs_cache_array *array;
489 unsigned int count = 0; 489 unsigned int count = 0;
490 int status; 490 int status;
491 491
492 scratch = alloc_page(GFP_KERNEL); 492 scratch = alloc_page(GFP_KERNEL);
493 if (scratch == NULL) 493 if (scratch == NULL)
494 return -ENOMEM; 494 return -ENOMEM;
495 495
496 xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); 496 xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
497 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 497 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
498 498
499 do { 499 do {
500 status = xdr_decode(desc, entry, &stream); 500 status = xdr_decode(desc, entry, &stream);
501 if (status != 0) { 501 if (status != 0) {
502 if (status == -EAGAIN) 502 if (status == -EAGAIN)
503 status = 0; 503 status = 0;
504 break; 504 break;
505 } 505 }
506 506
507 count++; 507 count++;
508 508
509 if (desc->plus != 0) 509 if (desc->plus != 0)
510 nfs_prime_dcache(desc->file->f_path.dentry, entry); 510 nfs_prime_dcache(desc->file->f_path.dentry, entry);
511 511
512 status = nfs_readdir_add_to_array(entry, page); 512 status = nfs_readdir_add_to_array(entry, page);
513 if (status != 0) 513 if (status != 0)
514 break; 514 break;
515 } while (!entry->eof); 515 } while (!entry->eof);
516 516
517 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { 517 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
518 array = nfs_readdir_get_array(page); 518 array = nfs_readdir_get_array(page);
519 if (!IS_ERR(array)) { 519 if (!IS_ERR(array)) {
520 array->eof_index = array->size; 520 array->eof_index = array->size;
521 status = 0; 521 status = 0;
522 nfs_readdir_release_array(page); 522 nfs_readdir_release_array(page);
523 } else 523 } else
524 status = PTR_ERR(array); 524 status = PTR_ERR(array);
525 } 525 }
526 526
527 put_page(scratch); 527 put_page(scratch);
528 return status; 528 return status;
529 } 529 }
530 530
531 static 531 static
532 void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) 532 void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
533 { 533 {
534 unsigned int i; 534 unsigned int i;
535 for (i = 0; i < npages; i++) 535 for (i = 0; i < npages; i++)
536 put_page(pages[i]); 536 put_page(pages[i]);
537 } 537 }
538 538
539 static 539 static
540 void nfs_readdir_free_large_page(void *ptr, struct page **pages, 540 void nfs_readdir_free_large_page(void *ptr, struct page **pages,
541 unsigned int npages) 541 unsigned int npages)
542 { 542 {
543 nfs_readdir_free_pagearray(pages, npages); 543 nfs_readdir_free_pagearray(pages, npages);
544 } 544 }
545 545
546 /* 546 /*
547 * nfs_readdir_large_page will allocate pages that must be freed with a call 547 * nfs_readdir_large_page will allocate pages that must be freed with a call
548 * to nfs_readdir_free_large_page 548 * to nfs_readdir_free_large_page
549 */ 549 */
550 static 550 static
551 int nfs_readdir_large_page(struct page **pages, unsigned int npages) 551 int nfs_readdir_large_page(struct page **pages, unsigned int npages)
552 { 552 {
553 unsigned int i; 553 unsigned int i;
554 554
555 for (i = 0; i < npages; i++) { 555 for (i = 0; i < npages; i++) {
556 struct page *page = alloc_page(GFP_KERNEL); 556 struct page *page = alloc_page(GFP_KERNEL);
557 if (page == NULL) 557 if (page == NULL)
558 goto out_freepages; 558 goto out_freepages;
559 pages[i] = page; 559 pages[i] = page;
560 } 560 }
561 return 0; 561 return 0;
562 562
563 out_freepages: 563 out_freepages:
564 nfs_readdir_free_pagearray(pages, i); 564 nfs_readdir_free_pagearray(pages, i);
565 return -ENOMEM; 565 return -ENOMEM;
566 } 566 }
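As the comment above says, these pages must be released with nfs_readdir_free_large_page(); nfs_readdir_xdr_to_array() below is the one caller, and it passes a NULL ptr. A condensed sketch of the pairing:

	struct page *pages[NFS_MAX_READDIR_PAGES];

	if (nfs_readdir_large_page(pages, ARRAY_SIZE(pages)) < 0)
		return -ENOMEM;
	/* ... fill the pages with READDIR XDR replies ... */
	nfs_readdir_free_large_page(NULL, pages, ARRAY_SIZE(pages));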
567 567
568 static 568 static
569 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) 569 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
570 { 570 {
571 struct page *pages[NFS_MAX_READDIR_PAGES]; 571 struct page *pages[NFS_MAX_READDIR_PAGES];
572 void *pages_ptr = NULL; 572 void *pages_ptr = NULL;
573 struct nfs_entry entry; 573 struct nfs_entry entry;
574 struct file *file = desc->file; 574 struct file *file = desc->file;
575 struct nfs_cache_array *array; 575 struct nfs_cache_array *array;
576 int status = -ENOMEM; 576 int status = -ENOMEM;
577 unsigned int array_size = ARRAY_SIZE(pages); 577 unsigned int array_size = ARRAY_SIZE(pages);
578 578
579 entry.prev_cookie = 0; 579 entry.prev_cookie = 0;
580 entry.cookie = desc->last_cookie; 580 entry.cookie = desc->last_cookie;
581 entry.eof = 0; 581 entry.eof = 0;
582 entry.fh = nfs_alloc_fhandle(); 582 entry.fh = nfs_alloc_fhandle();
583 entry.fattr = nfs_alloc_fattr(); 583 entry.fattr = nfs_alloc_fattr();
584 entry.server = NFS_SERVER(inode); 584 entry.server = NFS_SERVER(inode);
585 if (entry.fh == NULL || entry.fattr == NULL) 585 if (entry.fh == NULL || entry.fattr == NULL)
586 goto out; 586 goto out;
587 587
588 array = nfs_readdir_get_array(page); 588 array = nfs_readdir_get_array(page);
589 if (IS_ERR(array)) { 589 if (IS_ERR(array)) {
590 status = PTR_ERR(array); 590 status = PTR_ERR(array);
591 goto out; 591 goto out;
592 } 592 }
593 memset(array, 0, sizeof(struct nfs_cache_array)); 593 memset(array, 0, sizeof(struct nfs_cache_array));
594 array->eof_index = -1; 594 array->eof_index = -1;
595 595
596 status = nfs_readdir_large_page(pages, array_size); 596 status = nfs_readdir_large_page(pages, array_size);
597 if (status < 0) 597 if (status < 0)
598 goto out_release_array; 598 goto out_release_array;
599 do { 599 do {
600 unsigned int pglen; 600 unsigned int pglen;
601 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); 601 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
602 602
603 if (status < 0) 603 if (status < 0)
604 break; 604 break;
605 pglen = status; 605 pglen = status;
606 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); 606 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
607 if (status < 0) { 607 if (status < 0) {
608 if (status == -ENOSPC) 608 if (status == -ENOSPC)
609 status = 0; 609 status = 0;
610 break; 610 break;
611 } 611 }
612 } while (array->eof_index < 0); 612 } while (array->eof_index < 0);
613 613
614 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 614 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
615 out_release_array: 615 out_release_array:
616 nfs_readdir_release_array(page); 616 nfs_readdir_release_array(page);
617 out: 617 out:
618 nfs_free_fattr(entry.fattr); 618 nfs_free_fattr(entry.fattr);
619 nfs_free_fhandle(entry.fh); 619 nfs_free_fhandle(entry.fh);
620 return status; 620 return status;
621 } 621 }
622 622
623 /* 623 /*
624 * Now we cache directories properly, by converting xdr information 624 * Now we cache directories properly, by converting xdr information
625 * to an array that can be used for lookups later. This results in 625 * to an array that can be used for lookups later. This results in
626 * fewer cache pages, since we can store more information on each page. 626 * fewer cache pages, since we can store more information on each page.
627 * We only need to convert from xdr once, so future lookups are much simpler 627 * We only need to convert from xdr once, so future lookups are much simpler
628 */ 628 */
629 static 629 static
630 int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) 630 int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
631 { 631 {
632 struct inode *inode = desc->file->f_path.dentry->d_inode; 632 struct inode *inode = desc->file->f_path.dentry->d_inode;
633 int ret; 633 int ret;
634 634
635 ret = nfs_readdir_xdr_to_array(desc, page, inode); 635 ret = nfs_readdir_xdr_to_array(desc, page, inode);
636 if (ret < 0) 636 if (ret < 0)
637 goto error; 637 goto error;
638 SetPageUptodate(page); 638 SetPageUptodate(page);
639 639
640 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { 640 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
641 /* Should never happen */ 641 /* Should never happen */
642 nfs_zap_mapping(inode, inode->i_mapping); 642 nfs_zap_mapping(inode, inode->i_mapping);
643 } 643 }
644 unlock_page(page); 644 unlock_page(page);
645 return 0; 645 return 0;
646 error: 646 error:
647 unlock_page(page); 647 unlock_page(page);
648 return ret; 648 return ret;
649 } 649 }
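Two details worth noting: both exit paths unlock the page, which matches the read_cache_page() filler convention (see get_cache_page() below), and a successful fill of page N invalidates every cached page after it, since pages past the one just rebuilt may hold stale continuations of the directory. With hypothetical indices:

	/* after refilling page 2 of the readdir cache: */
	invalidate_inode_pages2_range(inode->i_mapping, 3, -1);	/* drop 3..EOF */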
650 650
651 static 651 static
652 void cache_page_release(nfs_readdir_descriptor_t *desc) 652 void cache_page_release(nfs_readdir_descriptor_t *desc)
653 { 653 {
654 if (!desc->page->mapping) 654 if (!desc->page->mapping)
655 nfs_readdir_clear_array(desc->page); 655 nfs_readdir_clear_array(desc->page);
656 page_cache_release(desc->page); 656 page_cache_release(desc->page);
657 desc->page = NULL; 657 desc->page = NULL;
658 } 658 }
659 659
660 static 660 static
661 struct page *get_cache_page(nfs_readdir_descriptor_t *desc) 661 struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
662 { 662 {
663 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, 663 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
664 desc->page_index, (filler_t *)nfs_readdir_filler, desc); 664 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
665 } 665 }
666 666
667 /* 667 /*
668 * Returns 0 if desc->dir_cookie was found on page desc->page_index 668 * Returns 0 if desc->dir_cookie was found on page desc->page_index
669 */ 669 */
670 static 670 static
671 int find_cache_page(nfs_readdir_descriptor_t *desc) 671 int find_cache_page(nfs_readdir_descriptor_t *desc)
672 { 672 {
673 int res; 673 int res;
674 674
675 desc->page = get_cache_page(desc); 675 desc->page = get_cache_page(desc);
676 if (IS_ERR(desc->page)) 676 if (IS_ERR(desc->page))
677 return PTR_ERR(desc->page); 677 return PTR_ERR(desc->page);
678 678
679 res = nfs_readdir_search_array(desc); 679 res = nfs_readdir_search_array(desc);
680 if (res != 0) 680 if (res != 0)
681 cache_page_release(desc); 681 cache_page_release(desc);
682 return res; 682 return res;
683 } 683 }
684 684
685 /* Search for desc->dir_cookie from the beginning of the page cache */ 685 /* Search for desc->dir_cookie from the beginning of the page cache */
686 static inline 686 static inline
687 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 687 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
688 { 688 {
689 int res; 689 int res;
690 690
691 if (desc->page_index == 0) { 691 if (desc->page_index == 0) {
692 desc->current_index = 0; 692 desc->current_index = 0;
693 desc->last_cookie = 0; 693 desc->last_cookie = 0;
694 } 694 }
695 do { 695 do {
696 res = find_cache_page(desc); 696 res = find_cache_page(desc);
697 } while (res == -EAGAIN); 697 } while (res == -EAGAIN);
698 return res; 698 return res;
699 } 699 }
700 700
701 /* 701 /*
702 * Once we've found the start of the dirent within a page: fill 'er up... 702 * Once we've found the start of the dirent within a page: fill 'er up...
703 */ 703 */
704 static 704 static
705 int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, 705 int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
706 filldir_t filldir) 706 filldir_t filldir)
707 { 707 {
708 struct file *file = desc->file; 708 struct file *file = desc->file;
709 int i = 0; 709 int i = 0;
710 int res = 0; 710 int res = 0;
711 struct nfs_cache_array *array = NULL; 711 struct nfs_cache_array *array = NULL;
712 struct nfs_open_dir_context *ctx = file->private_data; 712 struct nfs_open_dir_context *ctx = file->private_data;
713 713
714 array = nfs_readdir_get_array(desc->page); 714 array = nfs_readdir_get_array(desc->page);
715 if (IS_ERR(array)) { 715 if (IS_ERR(array)) {
716 res = PTR_ERR(array); 716 res = PTR_ERR(array);
717 goto out; 717 goto out;
718 } 718 }
719 719
720 for (i = desc->cache_entry_index; i < array->size; i++) { 720 for (i = desc->cache_entry_index; i < array->size; i++) {
721 struct nfs_cache_array_entry *ent; 721 struct nfs_cache_array_entry *ent;
722 722
723 ent = &array->array[i]; 723 ent = &array->array[i];
724 if (filldir(dirent, ent->string.name, ent->string.len, 724 if (filldir(dirent, ent->string.name, ent->string.len,
725 file->f_pos, nfs_compat_user_ino64(ent->ino), 725 file->f_pos, nfs_compat_user_ino64(ent->ino),
726 ent->d_type) < 0) { 726 ent->d_type) < 0) {
727 desc->eof = 1; 727 desc->eof = 1;
728 break; 728 break;
729 } 729 }
730 file->f_pos++; 730 file->f_pos++;
731 if (i < (array->size-1)) 731 if (i < (array->size-1))
732 *desc->dir_cookie = array->array[i+1].cookie; 732 *desc->dir_cookie = array->array[i+1].cookie;
733 else 733 else
734 *desc->dir_cookie = array->last_cookie; 734 *desc->dir_cookie = array->last_cookie;
735 if (ctx->duped != 0) 735 if (ctx->duped != 0)
736 ctx->duped = 1; 736 ctx->duped = 1;
737 } 737 }
738 if (array->eof_index >= 0) 738 if (array->eof_index >= 0)
739 desc->eof = 1; 739 desc->eof = 1;
740 740
741 nfs_readdir_release_array(desc->page); 741 nfs_readdir_release_array(desc->page);
742 out: 742 out:
743 cache_page_release(desc); 743 cache_page_release(desc);
744 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 744 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
745 (unsigned long long)*desc->dir_cookie, res); 745 (unsigned long long)*desc->dir_cookie, res);
746 return res; 746 return res;
747 } 747 }
748 748
749 /* 749 /*
750 * If we cannot find a cookie in our cache, we suspect that this is 750 * If we cannot find a cookie in our cache, we suspect that this is
751 * because it points to a deleted file, so we ask the server to return 751 * because it points to a deleted file, so we ask the server to return
752 * whatever it thinks is the next entry. We then feed this to filldir. 752 * whatever it thinks is the next entry. We then feed this to filldir.
753 * If all goes well, we should then be able to find our way round the 753 * If all goes well, we should then be able to find our way round the
754 * cache on the next call to readdir_search_pagecache(); 754 * cache on the next call to readdir_search_pagecache();
755 * 755 *
756 * NOTE: we cannot add the anonymous page to the pagecache because 756 * NOTE: we cannot add the anonymous page to the pagecache because
757 * the data it contains might not be page aligned. Besides, 757 * the data it contains might not be page aligned. Besides,
758 * we should already have a complete representation of the 758 * we should already have a complete representation of the
759 * directory in the page cache by the time we get here. 759 * directory in the page cache by the time we get here.
760 */ 760 */
761 static inline 761 static inline
762 int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 762 int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
763 filldir_t filldir) 763 filldir_t filldir)
764 { 764 {
765 struct page *page = NULL; 765 struct page *page = NULL;
766 int status; 766 int status;
767 struct inode *inode = desc->file->f_path.dentry->d_inode; 767 struct inode *inode = desc->file->f_path.dentry->d_inode;
768 struct nfs_open_dir_context *ctx = desc->file->private_data; 768 struct nfs_open_dir_context *ctx = desc->file->private_data;
769 769
770 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 770 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
771 (unsigned long long)*desc->dir_cookie); 771 (unsigned long long)*desc->dir_cookie);
772 772
773 page = alloc_page(GFP_HIGHUSER); 773 page = alloc_page(GFP_HIGHUSER);
774 if (!page) { 774 if (!page) {
775 status = -ENOMEM; 775 status = -ENOMEM;
776 goto out; 776 goto out;
777 } 777 }
778 778
779 desc->page_index = 0; 779 desc->page_index = 0;
780 desc->last_cookie = *desc->dir_cookie; 780 desc->last_cookie = *desc->dir_cookie;
781 desc->page = page; 781 desc->page = page;
782 ctx->duped = 0; 782 ctx->duped = 0;
783 783
784 status = nfs_readdir_xdr_to_array(desc, page, inode); 784 status = nfs_readdir_xdr_to_array(desc, page, inode);
785 if (status < 0) 785 if (status < 0)
786 goto out_release; 786 goto out_release;
787 787
788 status = nfs_do_filldir(desc, dirent, filldir); 788 status = nfs_do_filldir(desc, dirent, filldir);
789 789
790 out: 790 out:
791 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 791 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
792 __func__, status); 792 __func__, status);
793 return status; 793 return status;
794 out_release: 794 out_release:
795 cache_page_release(desc); 795 cache_page_release(desc);
796 goto out; 796 goto out;
797 } 797 }
798 798
799 /* The file offset position represents the dirent entry number. A 799 /* The file offset position represents the dirent entry number. A
800 last cookie cache takes care of the common case of reading the 800 last cookie cache takes care of the common case of reading the
801 whole directory. 801 whole directory.
802 */ 802 */
803 static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 803 static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
804 { 804 {
805 struct dentry *dentry = filp->f_path.dentry; 805 struct dentry *dentry = filp->f_path.dentry;
806 struct inode *inode = dentry->d_inode; 806 struct inode *inode = dentry->d_inode;
807 nfs_readdir_descriptor_t my_desc, 807 nfs_readdir_descriptor_t my_desc,
808 *desc = &my_desc; 808 *desc = &my_desc;
809 struct nfs_open_dir_context *dir_ctx = filp->private_data; 809 struct nfs_open_dir_context *dir_ctx = filp->private_data;
810 int res; 810 int res;
811 811
812 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 812 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
813 dentry->d_parent->d_name.name, dentry->d_name.name, 813 dentry->d_parent->d_name.name, dentry->d_name.name,
814 (long long)filp->f_pos); 814 (long long)filp->f_pos);
815 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); 815 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
816 816
817 /* 817 /*
818 * filp->f_pos points to the dirent entry number. 818 * filp->f_pos points to the dirent entry number.
819 * *desc->dir_cookie has the cookie for the next entry. We have 819 * *desc->dir_cookie has the cookie for the next entry. We have
820 * to either find the entry with the appropriate number or 820 * to either find the entry with the appropriate number or
821 * revalidate the cookie. 821 * revalidate the cookie.
822 */ 822 */
823 memset(desc, 0, sizeof(*desc)); 823 memset(desc, 0, sizeof(*desc));
824 824
825 desc->file = filp; 825 desc->file = filp;
826 desc->dir_cookie = &dir_ctx->dir_cookie; 826 desc->dir_cookie = &dir_ctx->dir_cookie;
827 desc->decode = NFS_PROTO(inode)->decode_dirent; 827 desc->decode = NFS_PROTO(inode)->decode_dirent;
828 desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0; 828 desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0;
829 829
830 nfs_block_sillyrename(dentry); 830 nfs_block_sillyrename(dentry);
831 res = nfs_revalidate_mapping(inode, filp->f_mapping); 831 res = nfs_revalidate_mapping(inode, filp->f_mapping);
832 if (res < 0) 832 if (res < 0)
833 goto out; 833 goto out;
834 834
835 do { 835 do {
836 res = readdir_search_pagecache(desc); 836 res = readdir_search_pagecache(desc);
837 837
838 if (res == -EBADCOOKIE) { 838 if (res == -EBADCOOKIE) {
839 res = 0; 839 res = 0;
840 /* This means either the end of the directory */ 840 /* This means either the end of the directory */
841 if (*desc->dir_cookie && desc->eof == 0) { 841 if (*desc->dir_cookie && desc->eof == 0) {
842 /* Or that the server has 'lost' a cookie */ 842 /* Or that the server has 'lost' a cookie */
843 res = uncached_readdir(desc, dirent, filldir); 843 res = uncached_readdir(desc, dirent, filldir);
844 if (res == 0) 844 if (res == 0)
845 continue; 845 continue;
846 } 846 }
847 break; 847 break;
848 } 848 }
849 if (res == -ETOOSMALL && desc->plus) { 849 if (res == -ETOOSMALL && desc->plus) {
850 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 850 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
851 nfs_zap_caches(inode); 851 nfs_zap_caches(inode);
852 desc->page_index = 0; 852 desc->page_index = 0;
853 desc->plus = 0; 853 desc->plus = 0;
854 desc->eof = 0; 854 desc->eof = 0;
855 continue; 855 continue;
856 } 856 }
857 if (res < 0) 857 if (res < 0)
858 break; 858 break;
859 859
860 res = nfs_do_filldir(desc, dirent, filldir); 860 res = nfs_do_filldir(desc, dirent, filldir);
861 if (res < 0) 861 if (res < 0)
862 break; 862 break;
863 } while (!desc->eof); 863 } while (!desc->eof);
864 out: 864 out:
865 nfs_unblock_sillyrename(dentry); 865 nfs_unblock_sillyrename(dentry);
866 if (res > 0) 866 if (res > 0)
867 res = 0; 867 res = 0;
868 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", 868 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
869 dentry->d_parent->d_name.name, dentry->d_name.name, 869 dentry->d_parent->d_name.name, dentry->d_name.name,
870 res); 870 res);
871 return res; 871 return res;
872 } 872 }
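From user space all of this machinery sits behind getdents(); a plain readdir loop is what drives nfs_readdir() and, through it, the cookie cache described above. A minimal (hypothetical) consumer:

	#include <dirent.h>
	#include <stdio.h>

	void list_dir(void)
	{
		DIR *d = opendir("/mnt/nfs/somedir");	/* any NFS mount */
		struct dirent *ent;

		if (d == NULL)
			return;
		while ((ent = readdir(d)) != NULL)
			printf("%s\n", ent->d_name);
		closedir(d);
	}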
873 873
874 static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) 874 static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
875 { 875 {
876 struct dentry *dentry = filp->f_path.dentry; 876 struct dentry *dentry = filp->f_path.dentry;
877 struct inode *inode = dentry->d_inode; 877 struct inode *inode = dentry->d_inode;
878 struct nfs_open_dir_context *dir_ctx = filp->private_data; 878 struct nfs_open_dir_context *dir_ctx = filp->private_data;
879 879
880 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", 880 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
881 dentry->d_parent->d_name.name, 881 dentry->d_parent->d_name.name,
882 dentry->d_name.name, 882 dentry->d_name.name,
883 offset, origin); 883 offset, whence);
884 884
885 mutex_lock(&inode->i_mutex); 885 mutex_lock(&inode->i_mutex);
886 switch (origin) { 886 switch (whence) {
887 case 1: 887 case 1:
888 offset += filp->f_pos; 888 offset += filp->f_pos;
889 case 0: 889 case 0:
890 if (offset >= 0) 890 if (offset >= 0)
891 break; 891 break;
892 default: 892 default:
893 offset = -EINVAL; 893 offset = -EINVAL;
894 goto out; 894 goto out;
895 } 895 }
896 if (offset != filp->f_pos) { 896 if (offset != filp->f_pos) {
897 filp->f_pos = offset; 897 filp->f_pos = offset;
898 dir_ctx->dir_cookie = 0; 898 dir_ctx->dir_cookie = 0;
899 dir_ctx->duped = 0; 899 dir_ctx->duped = 0;
900 } 900 }
901 out: 901 out:
902 mutex_unlock(&inode->i_mutex); 902 mutex_unlock(&inode->i_mutex);
903 return offset; 903 return offset;
904 } 904 }
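The bare 0 and 1 in the switch are the classic whence values; with the symbolic names (SEEK_SET and SEEK_CUR, which the kernel defines with those values) the fall-through reads more plainly. An equivalent sketch of the same body:

	switch (whence) {
	case SEEK_CUR:		/* 1: offset is relative to current f_pos */
		offset += filp->f_pos;
		/* fall through */
	case SEEK_SET:		/* 0: offset is absolute */
		if (offset >= 0)
			break;
	default:
		offset = -EINVAL;
		goto out;
	}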
905 905
906 /* 906 /*
907 * All directory operations under NFS are synchronous, so fsync() 907 * All directory operations under NFS are synchronous, so fsync()
908 * is a dummy operation. 908 * is a dummy operation.
909 */ 909 */
910 static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, 910 static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
911 int datasync) 911 int datasync)
912 { 912 {
913 struct dentry *dentry = filp->f_path.dentry; 913 struct dentry *dentry = filp->f_path.dentry;
914 struct inode *inode = dentry->d_inode; 914 struct inode *inode = dentry->d_inode;
915 915
916 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 916 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
917 dentry->d_parent->d_name.name, dentry->d_name.name, 917 dentry->d_parent->d_name.name, dentry->d_name.name,
918 datasync); 918 datasync);
919 919
920 mutex_lock(&inode->i_mutex); 920 mutex_lock(&inode->i_mutex);
921 nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); 921 nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
922 mutex_unlock(&inode->i_mutex); 922 mutex_unlock(&inode->i_mutex);
923 return 0; 923 return 0;
924 } 924 }
925 925
926 /** 926 /**
927 * nfs_force_lookup_revalidate - Mark the directory as having changed 927 * nfs_force_lookup_revalidate - Mark the directory as having changed
928 * @dir: pointer to directory inode 928 * @dir: pointer to directory inode
929 * 929 *
930 * This forces the revalidation code in nfs_lookup_revalidate() to do a 930 * This forces the revalidation code in nfs_lookup_revalidate() to do a
931 * full lookup on all child dentries of 'dir' whenever a change occurs 931 * full lookup on all child dentries of 'dir' whenever a change occurs
932 * on the server that might have invalidated our dcache. 932 * on the server that might have invalidated our dcache.
933 * 933 *
934 * The caller should be holding dir->i_lock 934 * The caller should be holding dir->i_lock
935 */ 935 */
936 void nfs_force_lookup_revalidate(struct inode *dir) 936 void nfs_force_lookup_revalidate(struct inode *dir)
937 { 937 {
938 NFS_I(dir)->cache_change_attribute++; 938 NFS_I(dir)->cache_change_attribute++;
939 } 939 }
940 EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); 940 EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
941 941
942 /* 942 /*
943 * A check for whether or not the parent directory has changed. 943 * A check for whether or not the parent directory has changed.
944 * In the case it has, we assume that the dentries are untrustworthy 944 * In the case it has, we assume that the dentries are untrustworthy
945 * and may need to be looked up again. 945 * and may need to be looked up again.
946 */ 946 */
947 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) 947 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
948 { 948 {
949 if (IS_ROOT(dentry)) 949 if (IS_ROOT(dentry))
950 return 1; 950 return 1;
951 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) 951 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
952 return 0; 952 return 0;
953 if (!nfs_verify_change_attribute(dir, dentry->d_time)) 953 if (!nfs_verify_change_attribute(dir, dentry->d_time))
954 return 0; 954 return 0;
955 /* Revalidate nfsi->cache_change_attribute before we declare a match */ 955 /* Revalidate nfsi->cache_change_attribute before we declare a match */
956 if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) 956 if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
957 return 0; 957 return 0;
958 if (!nfs_verify_change_attribute(dir, dentry->d_time)) 958 if (!nfs_verify_change_attribute(dir, dentry->d_time))
959 return 0; 959 return 0;
960 return 1; 960 return 1;
961 } 961 }
962 962
963 /* 963 /*
964 * Use intent information to check whether or not we're going to do 964 * Use intent information to check whether or not we're going to do
965 * an O_EXCL create using this path component. 965 * an O_EXCL create using this path component.
966 */ 966 */
967 static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) 967 static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
968 { 968 {
969 if (NFS_PROTO(dir)->version == 2) 969 if (NFS_PROTO(dir)->version == 2)
970 return 0; 970 return 0;
971 return flags & LOOKUP_EXCL; 971 return flags & LOOKUP_EXCL;
972 } 972 }
973 973
974 /* 974 /*
975 * Inode and filehandle revalidation for lookups. 975 * Inode and filehandle revalidation for lookups.
976 * 976 *
977 * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, 977 * We force revalidation in the cases where the VFS sets LOOKUP_REVAL,
978 * or if the intent information indicates that we're about to open this 978 * or if the intent information indicates that we're about to open this
979 * particular file and the "nocto" mount flag is not set. 979 * particular file and the "nocto" mount flag is not set.
980 * 980 *
981 */ 981 */
982 static inline 982 static inline
983 int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) 983 int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
984 { 984 {
985 struct nfs_server *server = NFS_SERVER(inode); 985 struct nfs_server *server = NFS_SERVER(inode);
986 986
987 if (IS_AUTOMOUNT(inode)) 987 if (IS_AUTOMOUNT(inode))
988 return 0; 988 return 0;
989 /* VFS wants an on-the-wire revalidation */ 989 /* VFS wants an on-the-wire revalidation */
990 if (flags & LOOKUP_REVAL) 990 if (flags & LOOKUP_REVAL)
991 goto out_force; 991 goto out_force;
992 /* This is an open(2) */ 992 /* This is an open(2) */
993 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && 993 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) &&
994 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 994 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
995 goto out_force; 995 goto out_force;
996 return 0; 996 return 0;
997 out_force: 997 out_force:
998 return __nfs_revalidate_inode(server, inode); 998 return __nfs_revalidate_inode(server, inode);
999 } 999 }
1000 1000
1001 /* 1001 /*
1002 * We judge how long we want to trust negative 1002 * We judge how long we want to trust negative
1003 * dentries by looking at the parent inode mtime. 1003 * dentries by looking at the parent inode mtime.
1004 * 1004 *
1005 * If parent mtime has changed, we revalidate, else we wait for a 1005 * If parent mtime has changed, we revalidate, else we wait for a
1006 * period corresponding to the parent's attribute cache timeout value. 1006 * period corresponding to the parent's attribute cache timeout value.
1007 */ 1007 */
1008 static inline 1008 static inline
1009 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, 1009 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
1010 unsigned int flags) 1010 unsigned int flags)
1011 { 1011 {
1012 /* Don't revalidate a negative dentry if we're creating a new file */ 1012 /* Don't revalidate a negative dentry if we're creating a new file */
1013 if (flags & LOOKUP_CREATE) 1013 if (flags & LOOKUP_CREATE)
1014 return 0; 1014 return 0;
1015 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) 1015 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
1016 return 1; 1016 return 1;
1017 return !nfs_check_verifier(dir, dentry); 1017 return !nfs_check_verifier(dir, dentry);
1018 } 1018 }
1019 1019
1020 /* 1020 /*
1021 * This is called every time the dcache has a lookup hit, 1021 * This is called every time the dcache has a lookup hit,
1022 * and we should check whether we can really trust that 1022 * and we should check whether we can really trust that
1023 * lookup. 1023 * lookup.
1024 * 1024 *
1025 * NOTE! The hit can be a negative hit too, don't assume 1025 * NOTE! The hit can be a negative hit too, don't assume
1026 * we have an inode! 1026 * we have an inode!
1027 * 1027 *
1028 * If the parent directory is seen to have changed, we throw out the 1028 * If the parent directory is seen to have changed, we throw out the
1029 * cached dentry and do a new lookup. 1029 * cached dentry and do a new lookup.
1030 */ 1030 */
1031 static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) 1031 static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1032 { 1032 {
1033 struct inode *dir; 1033 struct inode *dir;
1034 struct inode *inode; 1034 struct inode *inode;
1035 struct dentry *parent; 1035 struct dentry *parent;
1036 struct nfs_fh *fhandle = NULL; 1036 struct nfs_fh *fhandle = NULL;
1037 struct nfs_fattr *fattr = NULL; 1037 struct nfs_fattr *fattr = NULL;
1038 int error; 1038 int error;
1039 1039
1040 if (flags & LOOKUP_RCU) 1040 if (flags & LOOKUP_RCU)
1041 return -ECHILD; 1041 return -ECHILD;
1042 1042
1043 parent = dget_parent(dentry); 1043 parent = dget_parent(dentry);
1044 dir = parent->d_inode; 1044 dir = parent->d_inode;
1045 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 1045 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
1046 inode = dentry->d_inode; 1046 inode = dentry->d_inode;
1047 1047
1048 if (!inode) { 1048 if (!inode) {
1049 if (nfs_neg_need_reval(dir, dentry, flags)) 1049 if (nfs_neg_need_reval(dir, dentry, flags))
1050 goto out_bad; 1050 goto out_bad;
1051 goto out_valid_noent; 1051 goto out_valid_noent;
1052 } 1052 }
1053 1053
1054 if (is_bad_inode(inode)) { 1054 if (is_bad_inode(inode)) {
1055 dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", 1055 dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n",
1056 __func__, dentry->d_parent->d_name.name, 1056 __func__, dentry->d_parent->d_name.name,
1057 dentry->d_name.name); 1057 dentry->d_name.name);
1058 goto out_bad; 1058 goto out_bad;
1059 } 1059 }
1060 1060
1061 if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) 1061 if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
1062 goto out_set_verifier; 1062 goto out_set_verifier;
1063 1063
1064 /* Force a full lookup if the parent directory has changed */ 1064 /* Force a full lookup if the parent directory has changed */
1065 if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) { 1065 if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
1066 if (nfs_lookup_verify_inode(inode, flags)) 1066 if (nfs_lookup_verify_inode(inode, flags))
1067 goto out_zap_parent; 1067 goto out_zap_parent;
1068 goto out_valid; 1068 goto out_valid;
1069 } 1069 }
1070 1070
1071 if (NFS_STALE(inode)) 1071 if (NFS_STALE(inode))
1072 goto out_bad; 1072 goto out_bad;
1073 1073
1074 error = -ENOMEM; 1074 error = -ENOMEM;
1075 fhandle = nfs_alloc_fhandle(); 1075 fhandle = nfs_alloc_fhandle();
1076 fattr = nfs_alloc_fattr(); 1076 fattr = nfs_alloc_fattr();
1077 if (fhandle == NULL || fattr == NULL) 1077 if (fhandle == NULL || fattr == NULL)
1078 goto out_error; 1078 goto out_error;
1079 1079
1080 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1080 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
1081 if (error) 1081 if (error)
1082 goto out_bad; 1082 goto out_bad;
1083 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1083 if (nfs_compare_fh(NFS_FH(inode), fhandle))
1084 goto out_bad; 1084 goto out_bad;
1085 if ((error = nfs_refresh_inode(inode, fattr)) != 0) 1085 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
1086 goto out_bad; 1086 goto out_bad;
1087 1087
1088 nfs_free_fattr(fattr); 1088 nfs_free_fattr(fattr);
1089 nfs_free_fhandle(fhandle); 1089 nfs_free_fhandle(fhandle);
1090 out_set_verifier: 1090 out_set_verifier:
1091 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1091 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1092 out_valid: 1092 out_valid:
1093 /* Success: notify readdir to use READDIRPLUS */ 1093 /* Success: notify readdir to use READDIRPLUS */
1094 nfs_advise_use_readdirplus(dir); 1094 nfs_advise_use_readdirplus(dir);
1095 out_valid_noent: 1095 out_valid_noent:
1096 dput(parent); 1096 dput(parent);
1097 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", 1097 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
1098 __func__, dentry->d_parent->d_name.name, 1098 __func__, dentry->d_parent->d_name.name,
1099 dentry->d_name.name); 1099 dentry->d_name.name);
1100 return 1; 1100 return 1;
1101 out_zap_parent: 1101 out_zap_parent:
1102 nfs_zap_caches(dir); 1102 nfs_zap_caches(dir);
1103 out_bad: 1103 out_bad:
1104 nfs_free_fattr(fattr); 1104 nfs_free_fattr(fattr);
1105 nfs_free_fhandle(fhandle); 1105 nfs_free_fhandle(fhandle);
1106 nfs_mark_for_revalidate(dir); 1106 nfs_mark_for_revalidate(dir);
1107 if (inode && S_ISDIR(inode->i_mode)) { 1107 if (inode && S_ISDIR(inode->i_mode)) {
1108 /* Purge readdir caches. */ 1108 /* Purge readdir caches. */
1109 nfs_zap_caches(inode); 1109 nfs_zap_caches(inode);
1110 /* If we have submounts, don't unhash! */ 1110 /* If we have submounts, don't unhash! */
1111 if (have_submounts(dentry)) 1111 if (have_submounts(dentry))
1112 goto out_valid; 1112 goto out_valid;
1113 if (dentry->d_flags & DCACHE_DISCONNECTED) 1113 if (dentry->d_flags & DCACHE_DISCONNECTED)
1114 goto out_valid; 1114 goto out_valid;
1115 shrink_dcache_parent(dentry); 1115 shrink_dcache_parent(dentry);
1116 } 1116 }
1117 d_drop(dentry); 1117 d_drop(dentry);
1118 dput(parent); 1118 dput(parent);
1119 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 1119 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
1120 __func__, dentry->d_parent->d_name.name, 1120 __func__, dentry->d_parent->d_name.name,
1121 dentry->d_name.name); 1121 dentry->d_name.name);
1122 return 0; 1122 return 0;
1123 out_error: 1123 out_error:
1124 nfs_free_fattr(fattr); 1124 nfs_free_fattr(fattr);
1125 nfs_free_fhandle(fhandle); 1125 nfs_free_fhandle(fhandle);
1126 dput(parent); 1126 dput(parent);
1127 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", 1127 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
1128 __func__, dentry->d_parent->d_name.name, 1128 __func__, dentry->d_parent->d_name.name,
1129 dentry->d_name.name, error); 1129 dentry->d_name.name, error);
1130 return error; 1130 return error;
1131 } 1131 }
1132 1132
1133 /* 1133 /*
1134 * This is called from dput() when d_count is going to 0. 1134 * This is called from dput() when d_count is going to 0.
1135 */ 1135 */
1136 static int nfs_dentry_delete(const struct dentry *dentry) 1136 static int nfs_dentry_delete(const struct dentry *dentry)
1137 { 1137 {
1138 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", 1138 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
1139 dentry->d_parent->d_name.name, dentry->d_name.name, 1139 dentry->d_parent->d_name.name, dentry->d_name.name,
1140 dentry->d_flags); 1140 dentry->d_flags);
1141 1141
1142 /* Unhash any dentry with a stale inode */ 1142 /* Unhash any dentry with a stale inode */
1143 if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) 1143 if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
1144 return 1; 1144 return 1;
1145 1145
1146 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1146 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1147 /* Unhash it, so that ->d_iput() would be called */ 1147 /* Unhash it, so that ->d_iput() would be called */
1148 return 1; 1148 return 1;
1149 } 1149 }
1150 if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { 1150 if (!(dentry->d_sb->s_flags & MS_ACTIVE)) {
1151 /* Unhash it, so that ancestors of killed async unlink 1151 /* Unhash it, so that ancestors of killed async unlink
1152 * files will be cleaned up during umount */ 1152 * files will be cleaned up during umount */
1153 return 1; 1153 return 1;
1154 } 1154 }
1155 return 0; 1155 return 0;
1156 1156
1157 } 1157 }
1158 1158
1159 static void nfs_drop_nlink(struct inode *inode) 1159 static void nfs_drop_nlink(struct inode *inode)
1160 { 1160 {
1161 spin_lock(&inode->i_lock); 1161 spin_lock(&inode->i_lock);
1162 if (inode->i_nlink > 0) 1162 if (inode->i_nlink > 0)
1163 drop_nlink(inode); 1163 drop_nlink(inode);
1164 spin_unlock(&inode->i_lock); 1164 spin_unlock(&inode->i_lock);
1165 } 1165 }
1166 1166
1167 /* 1167 /*
1168 * Called when the dentry loses its inode. 1168 * Called when the dentry loses its inode.
1169 * We use it to clean up silly-renamed files. 1169 * We use it to clean up silly-renamed files.
1170 */ 1170 */
1171 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) 1171 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1172 { 1172 {
1173 if (S_ISDIR(inode->i_mode)) 1173 if (S_ISDIR(inode->i_mode))
1174 /* drop any readdir cache as it could easily be old */ 1174 /* drop any readdir cache as it could easily be old */
1175 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 1175 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
1176 1176
1177 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1177 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1178 drop_nlink(inode); 1178 drop_nlink(inode);
1179 nfs_complete_unlink(dentry, inode); 1179 nfs_complete_unlink(dentry, inode);
1180 } 1180 }
1181 iput(inode); 1181 iput(inode);
1182 } 1182 }
1183 1183
1184 static void nfs_d_release(struct dentry *dentry) 1184 static void nfs_d_release(struct dentry *dentry)
1185 { 1185 {
1186 /* free cached devname value, if it survived that far */ 1186 /* free cached devname value, if it survived that far */
1187 if (unlikely(dentry->d_fsdata)) { 1187 if (unlikely(dentry->d_fsdata)) {
1188 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 1188 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1189 WARN_ON(1); 1189 WARN_ON(1);
1190 else 1190 else
1191 kfree(dentry->d_fsdata); 1191 kfree(dentry->d_fsdata);
1192 } 1192 }
1193 } 1193 }
1194 1194
1195 const struct dentry_operations nfs_dentry_operations = { 1195 const struct dentry_operations nfs_dentry_operations = {
1196 .d_revalidate = nfs_lookup_revalidate, 1196 .d_revalidate = nfs_lookup_revalidate,
1197 .d_delete = nfs_dentry_delete, 1197 .d_delete = nfs_dentry_delete,
1198 .d_iput = nfs_dentry_iput, 1198 .d_iput = nfs_dentry_iput,
1199 .d_automount = nfs_d_automount, 1199 .d_automount = nfs_d_automount,
1200 .d_release = nfs_d_release, 1200 .d_release = nfs_d_release,
1201 }; 1201 };
1202 EXPORT_SYMBOL_GPL(nfs_dentry_operations); 1202 EXPORT_SYMBOL_GPL(nfs_dentry_operations);
1203 1203
1204 struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 1204 struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
1205 { 1205 {
1206 struct dentry *res; 1206 struct dentry *res;
1207 struct dentry *parent; 1207 struct dentry *parent;
1208 struct inode *inode = NULL; 1208 struct inode *inode = NULL;
1209 struct nfs_fh *fhandle = NULL; 1209 struct nfs_fh *fhandle = NULL;
1210 struct nfs_fattr *fattr = NULL; 1210 struct nfs_fattr *fattr = NULL;
1211 int error; 1211 int error;
1212 1212
1213 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 1213 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
1214 dentry->d_parent->d_name.name, dentry->d_name.name); 1214 dentry->d_parent->d_name.name, dentry->d_name.name);
1215 nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); 1215 nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
1216 1216
1217 res = ERR_PTR(-ENAMETOOLONG); 1217 res = ERR_PTR(-ENAMETOOLONG);
1218 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1218 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
1219 goto out; 1219 goto out;
1220 1220
1221 /* 1221 /*
1222 * If we're doing an exclusive create, optimize away the lookup 1222 * If we're doing an exclusive create, optimize away the lookup
1223 * but don't hash the dentry. 1223 * but don't hash the dentry.
1224 */ 1224 */
1225 if (nfs_is_exclusive_create(dir, flags)) { 1225 if (nfs_is_exclusive_create(dir, flags)) {
1226 d_instantiate(dentry, NULL); 1226 d_instantiate(dentry, NULL);
1227 res = NULL; 1227 res = NULL;
1228 goto out; 1228 goto out;
1229 } 1229 }
1230 1230
1231 res = ERR_PTR(-ENOMEM); 1231 res = ERR_PTR(-ENOMEM);
1232 fhandle = nfs_alloc_fhandle(); 1232 fhandle = nfs_alloc_fhandle();
1233 fattr = nfs_alloc_fattr(); 1233 fattr = nfs_alloc_fattr();
1234 if (fhandle == NULL || fattr == NULL) 1234 if (fhandle == NULL || fattr == NULL)
1235 goto out; 1235 goto out;
1236 1236
1237 parent = dentry->d_parent; 1237 parent = dentry->d_parent;
1238 /* Protect against concurrent sillydeletes */ 1238 /* Protect against concurrent sillydeletes */
1239 nfs_block_sillyrename(parent); 1239 nfs_block_sillyrename(parent);
1240 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1240 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
1241 if (error == -ENOENT) 1241 if (error == -ENOENT)
1242 goto no_entry; 1242 goto no_entry;
1243 if (error < 0) { 1243 if (error < 0) {
1244 res = ERR_PTR(error); 1244 res = ERR_PTR(error);
1245 goto out_unblock_sillyrename; 1245 goto out_unblock_sillyrename;
1246 } 1246 }
1247 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1247 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1248 res = ERR_CAST(inode); 1248 res = ERR_CAST(inode);
1249 if (IS_ERR(res)) 1249 if (IS_ERR(res))
1250 goto out_unblock_sillyrename; 1250 goto out_unblock_sillyrename;
1251 1251
1252 /* Success: notify readdir to use READDIRPLUS */ 1252 /* Success: notify readdir to use READDIRPLUS */
1253 nfs_advise_use_readdirplus(dir); 1253 nfs_advise_use_readdirplus(dir);
1254 1254
1255 no_entry: 1255 no_entry:
1256 res = d_materialise_unique(dentry, inode); 1256 res = d_materialise_unique(dentry, inode);
1257 if (res != NULL) { 1257 if (res != NULL) {
1258 if (IS_ERR(res)) 1258 if (IS_ERR(res))
1259 goto out_unblock_sillyrename; 1259 goto out_unblock_sillyrename;
1260 dentry = res; 1260 dentry = res;
1261 } 1261 }
1262 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1262 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1263 out_unblock_sillyrename: 1263 out_unblock_sillyrename:
1264 nfs_unblock_sillyrename(parent); 1264 nfs_unblock_sillyrename(parent);
1265 out: 1265 out:
1266 nfs_free_fattr(fattr); 1266 nfs_free_fattr(fattr);
1267 nfs_free_fhandle(fhandle); 1267 nfs_free_fhandle(fhandle);
1268 return res; 1268 return res;
1269 } 1269 }
1270 EXPORT_SYMBOL_GPL(nfs_lookup); 1270 EXPORT_SYMBOL_GPL(nfs_lookup);
1271 1271
1272 #if IS_ENABLED(CONFIG_NFS_V4) 1272 #if IS_ENABLED(CONFIG_NFS_V4)
1273 static int nfs4_lookup_revalidate(struct dentry *, unsigned int); 1273 static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
1274 1274
1275 const struct dentry_operations nfs4_dentry_operations = { 1275 const struct dentry_operations nfs4_dentry_operations = {
1276 .d_revalidate = nfs4_lookup_revalidate, 1276 .d_revalidate = nfs4_lookup_revalidate,
1277 .d_delete = nfs_dentry_delete, 1277 .d_delete = nfs_dentry_delete,
1278 .d_iput = nfs_dentry_iput, 1278 .d_iput = nfs_dentry_iput,
1279 .d_automount = nfs_d_automount, 1279 .d_automount = nfs_d_automount,
1280 .d_release = nfs_d_release, 1280 .d_release = nfs_d_release,
1281 }; 1281 };
1282 EXPORT_SYMBOL_GPL(nfs4_dentry_operations); 1282 EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
1283 1283
1284 static fmode_t flags_to_mode(int flags) 1284 static fmode_t flags_to_mode(int flags)
1285 { 1285 {
1286 fmode_t res = (__force fmode_t)flags & FMODE_EXEC; 1286 fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
1287 if ((flags & O_ACCMODE) != O_WRONLY) 1287 if ((flags & O_ACCMODE) != O_WRONLY)
1288 res |= FMODE_READ; 1288 res |= FMODE_READ;
1289 if ((flags & O_ACCMODE) != O_RDONLY) 1289 if ((flags & O_ACCMODE) != O_RDONLY)
1290 res |= FMODE_WRITE; 1290 res |= FMODE_WRITE;
1291 return res; 1291 return res;
1292 } 1292 }
1293 1293
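flags_to_mode() above derives the fmode_t access bits from the open(2) flags: any access mode other than O_WRONLY grants FMODE_READ, any mode other than O_RDONLY grants FMODE_WRITE, so O_RDWR ends up with both. A minimal userspace sketch of the same mapping (MODE_READ/MODE_WRITE and mode_of() are illustrative stand-ins for the kernel's FMODE_* bits and flags_to_mode(), not kernel API):

/*
 * Userspace sketch, not kernel code: mirrors the flags_to_mode()
 * logic above to show how the three O_ACCMODE values map to
 * read/write permission bits.
 */
#include <fcntl.h>
#include <stdio.h>

#define MODE_READ  0x1
#define MODE_WRITE 0x2

static int mode_of(int flags)
{
	int res = 0;

	if ((flags & O_ACCMODE) != O_WRONLY)	/* O_RDONLY or O_RDWR */
		res |= MODE_READ;
	if ((flags & O_ACCMODE) != O_RDONLY)	/* O_WRONLY or O_RDWR */
		res |= MODE_WRITE;
	return res;
}

int main(void)
{
	printf("O_RDONLY -> %d\n", mode_of(O_RDONLY));	/* 1: read only  */
	printf("O_WRONLY -> %d\n", mode_of(O_WRONLY));	/* 2: write only */
	printf("O_RDWR   -> %d\n", mode_of(O_RDWR));	/* 3: read+write */
	return 0;
}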
1294 static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) 1294 static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)
1295 { 1295 {
1296 return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); 1296 return alloc_nfs_open_context(dentry, flags_to_mode(open_flags));
1297 } 1297 }
1298 1298
1299 static int do_open(struct inode *inode, struct file *filp) 1299 static int do_open(struct inode *inode, struct file *filp)
1300 { 1300 {
1301 nfs_fscache_set_inode_cookie(inode, filp); 1301 nfs_fscache_set_inode_cookie(inode, filp);
1302 return 0; 1302 return 0;
1303 } 1303 }
1304 1304
1305 static int nfs_finish_open(struct nfs_open_context *ctx, 1305 static int nfs_finish_open(struct nfs_open_context *ctx,
1306 struct dentry *dentry, 1306 struct dentry *dentry,
1307 struct file *file, unsigned open_flags, 1307 struct file *file, unsigned open_flags,
1308 int *opened) 1308 int *opened)
1309 { 1309 {
1310 int err; 1310 int err;
1311 1311
1312 if (ctx->dentry != dentry) { 1312 if (ctx->dentry != dentry) {
1313 dput(ctx->dentry); 1313 dput(ctx->dentry);
1314 ctx->dentry = dget(dentry); 1314 ctx->dentry = dget(dentry);
1315 } 1315 }
1316 1316
1317 /* If the open_intent is for execute, we have an extra check to make */ 1317 /* If the open_intent is for execute, we have an extra check to make */
1318 if (ctx->mode & FMODE_EXEC) { 1318 if (ctx->mode & FMODE_EXEC) {
1319 err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags); 1319 err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
1320 if (err < 0) 1320 if (err < 0)
1321 goto out; 1321 goto out;
1322 } 1322 }
1323 1323
1324 err = finish_open(file, dentry, do_open, opened); 1324 err = finish_open(file, dentry, do_open, opened);
1325 if (err) 1325 if (err)
1326 goto out; 1326 goto out;
1327 nfs_file_set_open_context(file, ctx); 1327 nfs_file_set_open_context(file, ctx);
1328 1328
1329 out: 1329 out:
1330 put_nfs_open_context(ctx); 1330 put_nfs_open_context(ctx);
1331 return err; 1331 return err;
1332 } 1332 }
1333 1333
1334 int nfs_atomic_open(struct inode *dir, struct dentry *dentry, 1334 int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1335 struct file *file, unsigned open_flags, 1335 struct file *file, unsigned open_flags,
1336 umode_t mode, int *opened) 1336 umode_t mode, int *opened)
1337 { 1337 {
1338 struct nfs_open_context *ctx; 1338 struct nfs_open_context *ctx;
1339 struct dentry *res; 1339 struct dentry *res;
1340 struct iattr attr = { .ia_valid = ATTR_OPEN }; 1340 struct iattr attr = { .ia_valid = ATTR_OPEN };
1341 struct inode *inode; 1341 struct inode *inode;
1342 int err; 1342 int err;
1343 1343
1344 /* Expect a negative dentry */ 1344 /* Expect a negative dentry */
1345 BUG_ON(dentry->d_inode); 1345 BUG_ON(dentry->d_inode);
1346 1346
1347 dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n", 1347 dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n",
1348 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1348 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1349 1349
1350 /* NFS only supports OPEN on regular files */ 1350 /* NFS only supports OPEN on regular files */
1351 if ((open_flags & O_DIRECTORY)) { 1351 if ((open_flags & O_DIRECTORY)) {
1352 if (!d_unhashed(dentry)) { 1352 if (!d_unhashed(dentry)) {
1353 /* 1353 /*
1354 * Hashed negative dentry with O_DIRECTORY: dentry was 1354 * Hashed negative dentry with O_DIRECTORY: dentry was
1355 * revalidated and is fine, no need to perform lookup 1355 * revalidated and is fine, no need to perform lookup
1356 * again 1356 * again
1357 */ 1357 */
1358 return -ENOENT; 1358 return -ENOENT;
1359 } 1359 }
1360 goto no_open; 1360 goto no_open;
1361 } 1361 }
1362 1362
1363 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1363 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
1364 return -ENAMETOOLONG; 1364 return -ENAMETOOLONG;
1365 1365
1366 if (open_flags & O_CREAT) { 1366 if (open_flags & O_CREAT) {
1367 attr.ia_valid |= ATTR_MODE; 1367 attr.ia_valid |= ATTR_MODE;
1368 attr.ia_mode = mode & ~current_umask(); 1368 attr.ia_mode = mode & ~current_umask();
1369 } 1369 }
1370 if (open_flags & O_TRUNC) { 1370 if (open_flags & O_TRUNC) {
1371 attr.ia_valid |= ATTR_SIZE; 1371 attr.ia_valid |= ATTR_SIZE;
1372 attr.ia_size = 0; 1372 attr.ia_size = 0;
1373 } 1373 }
1374 1374
1375 ctx = create_nfs_open_context(dentry, open_flags); 1375 ctx = create_nfs_open_context(dentry, open_flags);
1376 err = PTR_ERR(ctx); 1376 err = PTR_ERR(ctx);
1377 if (IS_ERR(ctx)) 1377 if (IS_ERR(ctx))
1378 goto out; 1378 goto out;
1379 1379
1380 nfs_block_sillyrename(dentry->d_parent); 1380 nfs_block_sillyrename(dentry->d_parent);
1381 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); 1381 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1382 d_drop(dentry); 1382 d_drop(dentry);
1383 if (IS_ERR(inode)) { 1383 if (IS_ERR(inode)) {
1384 nfs_unblock_sillyrename(dentry->d_parent); 1384 nfs_unblock_sillyrename(dentry->d_parent);
1385 put_nfs_open_context(ctx); 1385 put_nfs_open_context(ctx);
1386 err = PTR_ERR(inode); 1386 err = PTR_ERR(inode);
1387 switch (err) { 1387 switch (err) {
1388 case -ENOENT: 1388 case -ENOENT:
1389 d_add(dentry, NULL); 1389 d_add(dentry, NULL);
1390 break; 1390 break;
1391 case -EISDIR: 1391 case -EISDIR:
1392 case -ENOTDIR: 1392 case -ENOTDIR:
1393 goto no_open; 1393 goto no_open;
1394 case -ELOOP: 1394 case -ELOOP:
1395 if (!(open_flags & O_NOFOLLOW)) 1395 if (!(open_flags & O_NOFOLLOW))
1396 goto no_open; 1396 goto no_open;
1397 break; 1397 break;
1398 /* case -EINVAL: */ 1398 /* case -EINVAL: */
1399 default: 1399 default:
1400 break; 1400 break;
1401 } 1401 }
1402 goto out; 1402 goto out;
1403 } 1403 }
1404 res = d_add_unique(dentry, inode); 1404 res = d_add_unique(dentry, inode);
1405 if (res != NULL) 1405 if (res != NULL)
1406 dentry = res; 1406 dentry = res;
1407 1407
1408 nfs_unblock_sillyrename(dentry->d_parent); 1408 nfs_unblock_sillyrename(dentry->d_parent);
1409 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1409 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1410 1410
1411 err = nfs_finish_open(ctx, dentry, file, open_flags, opened); 1411 err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
1412 1412
1413 dput(res); 1413 dput(res);
1414 out: 1414 out:
1415 return err; 1415 return err;
1416 1416
1417 no_open: 1417 no_open:
1418 res = nfs_lookup(dir, dentry, 0); 1418 res = nfs_lookup(dir, dentry, 0);
1419 err = PTR_ERR(res); 1419 err = PTR_ERR(res);
1420 if (IS_ERR(res)) 1420 if (IS_ERR(res))
1421 goto out; 1421 goto out;
1422 1422
1423 return finish_no_open(file, res); 1423 return finish_no_open(file, res);
1424 } 1424 }
1425 EXPORT_SYMBOL_GPL(nfs_atomic_open); 1425 EXPORT_SYMBOL_GPL(nfs_atomic_open);
1426 1426
1427 static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) 1427 static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1428 { 1428 {
1429 struct dentry *parent = NULL; 1429 struct dentry *parent = NULL;
1430 struct inode *inode; 1430 struct inode *inode;
1431 struct inode *dir; 1431 struct inode *dir;
1432 int ret = 0; 1432 int ret = 0;
1433 1433
1434 if (flags & LOOKUP_RCU) 1434 if (flags & LOOKUP_RCU)
1435 return -ECHILD; 1435 return -ECHILD;
1436 1436
1437 if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) 1437 if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
1438 goto no_open; 1438 goto no_open;
1439 if (d_mountpoint(dentry)) 1439 if (d_mountpoint(dentry))
1440 goto no_open; 1440 goto no_open;
1441 1441
1442 inode = dentry->d_inode; 1442 inode = dentry->d_inode;
1443 parent = dget_parent(dentry); 1443 parent = dget_parent(dentry);
1444 dir = parent->d_inode; 1444 dir = parent->d_inode;
1445 1445
1446 /* We can't create new files in nfs_open_revalidate(), so we 1446 /* We can't create new files in nfs_open_revalidate(), so we
1447 * optimize away revalidation of negative dentries. 1447 * optimize away revalidation of negative dentries.
1448 */ 1448 */
1449 if (inode == NULL) { 1449 if (inode == NULL) {
1450 if (!nfs_neg_need_reval(dir, dentry, flags)) 1450 if (!nfs_neg_need_reval(dir, dentry, flags))
1451 ret = 1; 1451 ret = 1;
1452 goto out; 1452 goto out;
1453 } 1453 }
1454 1454
1455 /* NFS only supports OPEN on regular files */ 1455 /* NFS only supports OPEN on regular files */
1456 if (!S_ISREG(inode->i_mode)) 1456 if (!S_ISREG(inode->i_mode))
1457 goto no_open_dput; 1457 goto no_open_dput;
1458 /* We cannot do exclusive creation on a positive dentry */ 1458 /* We cannot do exclusive creation on a positive dentry */
1459 if (flags & LOOKUP_EXCL) 1459 if (flags & LOOKUP_EXCL)
1460 goto no_open_dput; 1460 goto no_open_dput;
1461 1461
1462 /* Let f_op->open() actually open (and revalidate) the file */ 1462 /* Let f_op->open() actually open (and revalidate) the file */
1463 ret = 1; 1463 ret = 1;
1464 1464
1465 out: 1465 out:
1466 dput(parent); 1466 dput(parent);
1467 return ret; 1467 return ret;
1468 1468
1469 no_open_dput: 1469 no_open_dput:
1470 dput(parent); 1470 dput(parent);
1471 no_open: 1471 no_open:
1472 return nfs_lookup_revalidate(dentry, flags); 1472 return nfs_lookup_revalidate(dentry, flags);
1473 } 1473 }
1474 1474
1475 #endif /* CONFIG_NFS_V4 */ 1475 #endif /* CONFIG_NFS_V4 */
1476 1476
1477 /* 1477 /*
1478 * Code common to create, mkdir, and mknod. 1478 * Code common to create, mkdir, and mknod.
1479 */ 1479 */
1480 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, 1480 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1481 struct nfs_fattr *fattr) 1481 struct nfs_fattr *fattr)
1482 { 1482 {
1483 struct dentry *parent = dget_parent(dentry); 1483 struct dentry *parent = dget_parent(dentry);
1484 struct inode *dir = parent->d_inode; 1484 struct inode *dir = parent->d_inode;
1485 struct inode *inode; 1485 struct inode *inode;
1486 int error = -EACCES; 1486 int error = -EACCES;
1487 1487
1488 d_drop(dentry); 1488 d_drop(dentry);
1489 1489
1490 /* We may have been initialized further down */ 1490 /* We may have been initialized further down */
1491 if (dentry->d_inode) 1491 if (dentry->d_inode)
1492 goto out; 1492 goto out;
1493 if (fhandle->size == 0) { 1493 if (fhandle->size == 0) {
1494 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1494 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
1495 if (error) 1495 if (error)
1496 goto out_error; 1496 goto out_error;
1497 } 1497 }
1498 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1498 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1499 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1499 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1500 struct nfs_server *server = NFS_SB(dentry->d_sb); 1500 struct nfs_server *server = NFS_SB(dentry->d_sb);
1501 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); 1501 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
1502 if (error < 0) 1502 if (error < 0)
1503 goto out_error; 1503 goto out_error;
1504 } 1504 }
1505 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1505 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1506 error = PTR_ERR(inode); 1506 error = PTR_ERR(inode);
1507 if (IS_ERR(inode)) 1507 if (IS_ERR(inode))
1508 goto out_error; 1508 goto out_error;
1509 d_add(dentry, inode); 1509 d_add(dentry, inode);
1510 out: 1510 out:
1511 dput(parent); 1511 dput(parent);
1512 return 0; 1512 return 0;
1513 out_error: 1513 out_error:
1514 nfs_mark_for_revalidate(dir); 1514 nfs_mark_for_revalidate(dir);
1515 dput(parent); 1515 dput(parent);
1516 return error; 1516 return error;
1517 } 1517 }
1518 EXPORT_SYMBOL_GPL(nfs_instantiate); 1518 EXPORT_SYMBOL_GPL(nfs_instantiate);
1519 1519
1520 /* 1520 /*
1521 * Following a failed create operation, we drop the dentry rather 1521 * Following a failed create operation, we drop the dentry rather
1522 * than retain a negative dentry. This avoids a problem in the event 1522 * than retain a negative dentry. This avoids a problem in the event
1523 * that the operation succeeded on the server, but an error in the 1523 * that the operation succeeded on the server, but an error in the
1524 * reply path made it appear to have failed. 1524 * reply path made it appear to have failed.
1525 */ 1525 */
1526 int nfs_create(struct inode *dir, struct dentry *dentry, 1526 int nfs_create(struct inode *dir, struct dentry *dentry,
1527 umode_t mode, bool excl) 1527 umode_t mode, bool excl)
1528 { 1528 {
1529 struct iattr attr; 1529 struct iattr attr;
1530 int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; 1530 int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
1531 int error; 1531 int error;
1532 1532
1533 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1533 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1534 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1534 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1535 1535
1536 attr.ia_mode = mode; 1536 attr.ia_mode = mode;
1537 attr.ia_valid = ATTR_MODE; 1537 attr.ia_valid = ATTR_MODE;
1538 1538
1539 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); 1539 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
1540 if (error != 0) 1540 if (error != 0)
1541 goto out_err; 1541 goto out_err;
1542 return 0; 1542 return 0;
1543 out_err: 1543 out_err:
1544 d_drop(dentry); 1544 d_drop(dentry);
1545 return error; 1545 return error;
1546 } 1546 }
1547 EXPORT_SYMBOL_GPL(nfs_create); 1547 EXPORT_SYMBOL_GPL(nfs_create);
1548 1548
1549 /* 1549 /*
1550 * See comments for nfs_proc_create regarding failed operations. 1550 * See comments for nfs_proc_create regarding failed operations.
1551 */ 1551 */
1552 int 1552 int
1553 nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) 1553 nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
1554 { 1554 {
1555 struct iattr attr; 1555 struct iattr attr;
1556 int status; 1556 int status;
1557 1557
1558 dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", 1558 dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n",
1559 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1559 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1560 1560
1561 if (!new_valid_dev(rdev)) 1561 if (!new_valid_dev(rdev))
1562 return -EINVAL; 1562 return -EINVAL;
1563 1563
1564 attr.ia_mode = mode; 1564 attr.ia_mode = mode;
1565 attr.ia_valid = ATTR_MODE; 1565 attr.ia_valid = ATTR_MODE;
1566 1566
1567 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); 1567 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
1568 if (status != 0) 1568 if (status != 0)
1569 goto out_err; 1569 goto out_err;
1570 return 0; 1570 return 0;
1571 out_err: 1571 out_err:
1572 d_drop(dentry); 1572 d_drop(dentry);
1573 return status; 1573 return status;
1574 } 1574 }
1575 EXPORT_SYMBOL_GPL(nfs_mknod); 1575 EXPORT_SYMBOL_GPL(nfs_mknod);
1576 1576
1577 /* 1577 /*
1578 * See comments for nfs_proc_create regarding failed operations. 1578 * See comments for nfs_proc_create regarding failed operations.
1579 */ 1579 */
1580 int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 1580 int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1581 { 1581 {
1582 struct iattr attr; 1582 struct iattr attr;
1583 int error; 1583 int error;
1584 1584
1585 dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", 1585 dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n",
1586 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1586 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1587 1587
1588 attr.ia_valid = ATTR_MODE; 1588 attr.ia_valid = ATTR_MODE;
1589 attr.ia_mode = mode | S_IFDIR; 1589 attr.ia_mode = mode | S_IFDIR;
1590 1590
1591 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); 1591 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
1592 if (error != 0) 1592 if (error != 0)
1593 goto out_err; 1593 goto out_err;
1594 return 0; 1594 return 0;
1595 out_err: 1595 out_err:
1596 d_drop(dentry); 1596 d_drop(dentry);
1597 return error; 1597 return error;
1598 } 1598 }
1599 EXPORT_SYMBOL_GPL(nfs_mkdir); 1599 EXPORT_SYMBOL_GPL(nfs_mkdir);
1600 1600
1601 static void nfs_dentry_handle_enoent(struct dentry *dentry) 1601 static void nfs_dentry_handle_enoent(struct dentry *dentry)
1602 { 1602 {
1603 if (dentry->d_inode != NULL && !d_unhashed(dentry)) 1603 if (dentry->d_inode != NULL && !d_unhashed(dentry))
1604 d_delete(dentry); 1604 d_delete(dentry);
1605 } 1605 }
1606 1606
1607 int nfs_rmdir(struct inode *dir, struct dentry *dentry) 1607 int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1608 { 1608 {
1609 int error; 1609 int error;
1610 1610
1611 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", 1611 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
1612 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1612 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1613 1613
1614 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); 1614 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
1615 /* Ensure the VFS deletes this inode */ 1615 /* Ensure the VFS deletes this inode */
1616 if (error == 0 && dentry->d_inode != NULL) 1616 if (error == 0 && dentry->d_inode != NULL)
1617 clear_nlink(dentry->d_inode); 1617 clear_nlink(dentry->d_inode);
1618 else if (error == -ENOENT) 1618 else if (error == -ENOENT)
1619 nfs_dentry_handle_enoent(dentry); 1619 nfs_dentry_handle_enoent(dentry);
1620 1620
1621 return error; 1621 return error;
1622 } 1622 }
1623 EXPORT_SYMBOL_GPL(nfs_rmdir); 1623 EXPORT_SYMBOL_GPL(nfs_rmdir);
1624 1624
1625 /* 1625 /*
1626 * Remove a file after making sure there are no pending writes, 1626 * Remove a file after making sure there are no pending writes,
1627 * and after checking that the file has only one user. 1627 * and after checking that the file has only one user.
1628 * 1628 *
1629 * We invalidate the attribute cache and free the inode prior to the operation 1629 * We invalidate the attribute cache and free the inode prior to the operation
1630 * to avoid possible races if the server reuses the inode. 1630 * to avoid possible races if the server reuses the inode.
1631 */ 1631 */
1632 static int nfs_safe_remove(struct dentry *dentry) 1632 static int nfs_safe_remove(struct dentry *dentry)
1633 { 1633 {
1634 struct inode *dir = dentry->d_parent->d_inode; 1634 struct inode *dir = dentry->d_parent->d_inode;
1635 struct inode *inode = dentry->d_inode; 1635 struct inode *inode = dentry->d_inode;
1636 int error = -EBUSY; 1636 int error = -EBUSY;
1637 1637
1638 dfprintk(VFS, "NFS: safe_remove(%s/%s)\n", 1638 dfprintk(VFS, "NFS: safe_remove(%s/%s)\n",
1639 dentry->d_parent->d_name.name, dentry->d_name.name); 1639 dentry->d_parent->d_name.name, dentry->d_name.name);
1640 1640
1641 /* If the dentry was sillyrenamed, we simply call d_delete() */ 1641 /* If the dentry was sillyrenamed, we simply call d_delete() */
1642 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1642 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1643 error = 0; 1643 error = 0;
1644 goto out; 1644 goto out;
1645 } 1645 }
1646 1646
1647 if (inode != NULL) { 1647 if (inode != NULL) {
1648 NFS_PROTO(inode)->return_delegation(inode); 1648 NFS_PROTO(inode)->return_delegation(inode);
1649 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1649 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1650 /* The VFS may want to delete this inode */ 1650 /* The VFS may want to delete this inode */
1651 if (error == 0) 1651 if (error == 0)
1652 nfs_drop_nlink(inode); 1652 nfs_drop_nlink(inode);
1653 nfs_mark_for_revalidate(inode); 1653 nfs_mark_for_revalidate(inode);
1654 } else 1654 } else
1655 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1655 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1656 if (error == -ENOENT) 1656 if (error == -ENOENT)
1657 nfs_dentry_handle_enoent(dentry); 1657 nfs_dentry_handle_enoent(dentry);
1658 out: 1658 out:
1659 return error; 1659 return error;
1660 } 1660 }
1661 1661
1662 /* We do silly rename. In case sillyrename() returns -EBUSY, the inode 1662 /* We do silly rename. In case sillyrename() returns -EBUSY, the inode
1663 * belongs to an active ".nfs..." file and we return -EBUSY. 1663 * belongs to an active ".nfs..." file and we return -EBUSY.
1664 * 1664 *
1665 * If sillyrename() returns 0, we do nothing, otherwise we unlink. 1665 * If sillyrename() returns 0, we do nothing, otherwise we unlink.
1666 */ 1666 */
1667 int nfs_unlink(struct inode *dir, struct dentry *dentry) 1667 int nfs_unlink(struct inode *dir, struct dentry *dentry)
1668 { 1668 {
1669 int error; 1669 int error;
1670 int need_rehash = 0; 1670 int need_rehash = 0;
1671 1671
1672 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1672 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1673 dir->i_ino, dentry->d_name.name); 1673 dir->i_ino, dentry->d_name.name);
1674 1674
1675 spin_lock(&dentry->d_lock); 1675 spin_lock(&dentry->d_lock);
1676 if (dentry->d_count > 1) { 1676 if (dentry->d_count > 1) {
1677 spin_unlock(&dentry->d_lock); 1677 spin_unlock(&dentry->d_lock);
1678 /* Start asynchronous writeout of the inode */ 1678 /* Start asynchronous writeout of the inode */
1679 write_inode_now(dentry->d_inode, 0); 1679 write_inode_now(dentry->d_inode, 0);
1680 error = nfs_sillyrename(dir, dentry); 1680 error = nfs_sillyrename(dir, dentry);
1681 return error; 1681 return error;
1682 } 1682 }
1683 if (!d_unhashed(dentry)) { 1683 if (!d_unhashed(dentry)) {
1684 __d_drop(dentry); 1684 __d_drop(dentry);
1685 need_rehash = 1; 1685 need_rehash = 1;
1686 } 1686 }
1687 spin_unlock(&dentry->d_lock); 1687 spin_unlock(&dentry->d_lock);
1688 error = nfs_safe_remove(dentry); 1688 error = nfs_safe_remove(dentry);
1689 if (!error || error == -ENOENT) { 1689 if (!error || error == -ENOENT) {
1690 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1690 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1691 } else if (need_rehash) 1691 } else if (need_rehash)
1692 d_rehash(dentry); 1692 d_rehash(dentry);
1693 return error; 1693 return error;
1694 } 1694 }
1695 EXPORT_SYMBOL_GPL(nfs_unlink); 1695 EXPORT_SYMBOL_GPL(nfs_unlink);
1696 1696
1697 /* 1697 /*
1698 * To create a symbolic link, most file systems instantiate a new inode, 1698 * To create a symbolic link, most file systems instantiate a new inode,
1699 * add a page to it containing the path, then write it out to the disk 1699 * add a page to it containing the path, then write it out to the disk
1700 * using prepare_write/commit_write. 1700 * using prepare_write/commit_write.
1701 * 1701 *
1702 * Unfortunately the NFS client can't create the in-core inode first 1702 * Unfortunately the NFS client can't create the in-core inode first
1703 * because it needs a file handle to create an in-core inode (see 1703 * because it needs a file handle to create an in-core inode (see
1704 * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the 1704 * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the
1705 * symlink request has completed on the server. 1705 * symlink request has completed on the server.
1706 * 1706 *
1707 * So instead we allocate a raw page, copy the symname into it, then do 1707 * So instead we allocate a raw page, copy the symname into it, then do
1708 * the SYMLINK request with the page as the buffer. If it succeeds, we 1708 * the SYMLINK request with the page as the buffer. If it succeeds, we
1709 * now have a new file handle and can instantiate an in-core NFS inode 1709 * now have a new file handle and can instantiate an in-core NFS inode
1710 * and move the raw page into its mapping. 1710 * and move the raw page into its mapping.
1711 */ 1711 */
1712 int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 1712 int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1713 { 1713 {
1714 struct pagevec lru_pvec; 1714 struct pagevec lru_pvec;
1715 struct page *page; 1715 struct page *page;
1716 char *kaddr; 1716 char *kaddr;
1717 struct iattr attr; 1717 struct iattr attr;
1718 unsigned int pathlen = strlen(symname); 1718 unsigned int pathlen = strlen(symname);
1719 int error; 1719 int error;
1720 1720
1721 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, 1721 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id,
1722 dir->i_ino, dentry->d_name.name, symname); 1722 dir->i_ino, dentry->d_name.name, symname);
1723 1723
1724 if (pathlen > PAGE_SIZE) 1724 if (pathlen > PAGE_SIZE)
1725 return -ENAMETOOLONG; 1725 return -ENAMETOOLONG;
1726 1726
1727 attr.ia_mode = S_IFLNK | S_IRWXUGO; 1727 attr.ia_mode = S_IFLNK | S_IRWXUGO;
1728 attr.ia_valid = ATTR_MODE; 1728 attr.ia_valid = ATTR_MODE;
1729 1729
1730 page = alloc_page(GFP_HIGHUSER); 1730 page = alloc_page(GFP_HIGHUSER);
1731 if (!page) 1731 if (!page)
1732 return -ENOMEM; 1732 return -ENOMEM;
1733 1733
1734 kaddr = kmap_atomic(page); 1734 kaddr = kmap_atomic(page);
1735 memcpy(kaddr, symname, pathlen); 1735 memcpy(kaddr, symname, pathlen);
1736 if (pathlen < PAGE_SIZE) 1736 if (pathlen < PAGE_SIZE)
1737 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); 1737 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
1738 kunmap_atomic(kaddr); 1738 kunmap_atomic(kaddr);
1739 1739
1740 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); 1740 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
1741 if (error != 0) { 1741 if (error != 0) {
1742 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", 1742 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
1743 dir->i_sb->s_id, dir->i_ino, 1743 dir->i_sb->s_id, dir->i_ino,
1744 dentry->d_name.name, symname, error); 1744 dentry->d_name.name, symname, error);
1745 d_drop(dentry); 1745 d_drop(dentry);
1746 __free_page(page); 1746 __free_page(page);
1747 return error; 1747 return error;
1748 } 1748 }
1749 1749
1750 /* 1750 /*
1751 * No big deal if we can't add this page to the page cache here. 1751 * No big deal if we can't add this page to the page cache here.
1752 * READLINK will get the missing page from the server if needed. 1752 * READLINK will get the missing page from the server if needed.
1753 */ 1753 */
1754 pagevec_init(&lru_pvec, 0); 1754 pagevec_init(&lru_pvec, 0);
1755 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, 1755 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
1756 GFP_KERNEL)) { 1756 GFP_KERNEL)) {
1757 pagevec_add(&lru_pvec, page); 1757 pagevec_add(&lru_pvec, page);
1758 pagevec_lru_add_file(&lru_pvec); 1758 pagevec_lru_add_file(&lru_pvec);
1759 SetPageUptodate(page); 1759 SetPageUptodate(page);
1760 unlock_page(page); 1760 unlock_page(page);
1761 } else 1761 } else
1762 __free_page(page); 1762 __free_page(page);
1763 1763
1764 return 0; 1764 return 0;
1765 } 1765 }
1766 EXPORT_SYMBOL_GPL(nfs_symlink); 1766 EXPORT_SYMBOL_GPL(nfs_symlink);
1767 1767
1768 int 1768 int
1769 nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 1769 nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1770 { 1770 {
1771 struct inode *inode = old_dentry->d_inode; 1771 struct inode *inode = old_dentry->d_inode;
1772 int error; 1772 int error;
1773 1773
1774 dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n", 1774 dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n",
1775 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1775 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1776 dentry->d_parent->d_name.name, dentry->d_name.name); 1776 dentry->d_parent->d_name.name, dentry->d_name.name);
1777 1777
1778 NFS_PROTO(inode)->return_delegation(inode); 1778 NFS_PROTO(inode)->return_delegation(inode);
1779 1779
1780 d_drop(dentry); 1780 d_drop(dentry);
1781 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1781 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1782 if (error == 0) { 1782 if (error == 0) {
1783 ihold(inode); 1783 ihold(inode);
1784 d_add(dentry, inode); 1784 d_add(dentry, inode);
1785 } 1785 }
1786 return error; 1786 return error;
1787 } 1787 }
1788 EXPORT_SYMBOL_GPL(nfs_link); 1788 EXPORT_SYMBOL_GPL(nfs_link);
1789 1789
1790 /* 1790 /*
1791 * RENAME 1791 * RENAME
1792 * FIXME: Some nfsds, like the Linux user space nfsd, may generate a 1792 * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
1793 * different file handle for the same inode after a rename (e.g. when 1793 * different file handle for the same inode after a rename (e.g. when
1794 * moving to a different directory). A fail-safe method to do so would 1794 * moving to a different directory). A fail-safe method to do so would
1795 * be to look up old_dir/old_name, create a link to new_dir/new_name and 1795 * be to look up old_dir/old_name, create a link to new_dir/new_name and
1796 * rename the old file using the sillyrename stuff. This way, the original 1796 * rename the old file using the sillyrename stuff. This way, the original
1797 * file in old_dir will go away when the last process iput()s the inode. 1797 * file in old_dir will go away when the last process iput()s the inode.
1798 * 1798 *
1799 * FIXED. 1799 * FIXED.
1800 * 1800 *
1801 * It actually works quite well. One needs to have the possibility for 1801 * It actually works quite well. One needs to have the possibility for
1802 * at least one ".nfs..." file in each directory the file ever gets 1802 * at least one ".nfs..." file in each directory the file ever gets
1803 * moved or linked to, which happens automagically with the new 1803 * moved or linked to, which happens automagically with the new
1804 * implementation that depends only on the dcache instead of 1804 * implementation that depends only on the dcache instead of
1805 * the inode layer. 1805 * the inode layer.
1806 * 1806 *
1807 * Unfortunately, things are a little more complicated than indicated 1807 * Unfortunately, things are a little more complicated than indicated
1808 * above. For a cross-directory move, we want to make sure we can get 1808 * above. For a cross-directory move, we want to make sure we can get
1809 * rid of the old inode after the operation. This means there must be 1809 * rid of the old inode after the operation. This means there must be
1810 * no pending writes (if it's a file), and the use count must be 1. 1810 * no pending writes (if it's a file), and the use count must be 1.
1811 * If these conditions are met, we can drop the dentries before doing 1811 * If these conditions are met, we can drop the dentries before doing
1812 * the rename. 1812 * the rename.
1813 */ 1813 */
1814 int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, 1814 int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1815 struct inode *new_dir, struct dentry *new_dentry) 1815 struct inode *new_dir, struct dentry *new_dentry)
1816 { 1816 {
1817 struct inode *old_inode = old_dentry->d_inode; 1817 struct inode *old_inode = old_dentry->d_inode;
1818 struct inode *new_inode = new_dentry->d_inode; 1818 struct inode *new_inode = new_dentry->d_inode;
1819 struct dentry *dentry = NULL, *rehash = NULL; 1819 struct dentry *dentry = NULL, *rehash = NULL;
1820 int error = -EBUSY; 1820 int error = -EBUSY;
1821 1821
1822 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1822 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1823 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1823 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1824 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1824 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1825 new_dentry->d_count); 1825 new_dentry->d_count);
1826 1826
1827 /* 1827 /*
1828 * For non-directories, check whether the target is busy and if so, 1828 * For non-directories, check whether the target is busy and if so,
1829 * make a copy of the dentry and then do a silly-rename. If the 1829 * make a copy of the dentry and then do a silly-rename. If the
1830 * silly-rename succeeds, the copied dentry is hashed and becomes 1830 * silly-rename succeeds, the copied dentry is hashed and becomes
1831 * the new target. 1831 * the new target.
1832 */ 1832 */
1833 if (new_inode && !S_ISDIR(new_inode->i_mode)) { 1833 if (new_inode && !S_ISDIR(new_inode->i_mode)) {
1834 /* 1834 /*
1835 * To prevent any new references to the target during the 1835 * To prevent any new references to the target during the
1836 * rename, we unhash the dentry in advance. 1836 * rename, we unhash the dentry in advance.
1837 */ 1837 */
1838 if (!d_unhashed(new_dentry)) { 1838 if (!d_unhashed(new_dentry)) {
1839 d_drop(new_dentry); 1839 d_drop(new_dentry);
1840 rehash = new_dentry; 1840 rehash = new_dentry;
1841 } 1841 }
1842 1842
1843 if (new_dentry->d_count > 2) { 1843 if (new_dentry->d_count > 2) {
1844 int err; 1844 int err;
1845 1845
1846 /* copy the target dentry's name */ 1846 /* copy the target dentry's name */
1847 dentry = d_alloc(new_dentry->d_parent, 1847 dentry = d_alloc(new_dentry->d_parent,
1848 &new_dentry->d_name); 1848 &new_dentry->d_name);
1849 if (!dentry) 1849 if (!dentry)
1850 goto out; 1850 goto out;
1851 1851
1852 /* silly-rename the existing target ... */ 1852 /* silly-rename the existing target ... */
1853 err = nfs_sillyrename(new_dir, new_dentry); 1853 err = nfs_sillyrename(new_dir, new_dentry);
1854 if (err) 1854 if (err)
1855 goto out; 1855 goto out;
1856 1856
1857 new_dentry = dentry; 1857 new_dentry = dentry;
1858 rehash = NULL; 1858 rehash = NULL;
1859 new_inode = NULL; 1859 new_inode = NULL;
1860 } 1860 }
1861 } 1861 }
1862 1862
1863 NFS_PROTO(old_inode)->return_delegation(old_inode); 1863 NFS_PROTO(old_inode)->return_delegation(old_inode);
1864 if (new_inode != NULL) 1864 if (new_inode != NULL)
1865 NFS_PROTO(new_inode)->return_delegation(new_inode); 1865 NFS_PROTO(new_inode)->return_delegation(new_inode);
1866 1866
1867 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, 1867 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
1868 new_dir, &new_dentry->d_name); 1868 new_dir, &new_dentry->d_name);
1869 nfs_mark_for_revalidate(old_inode); 1869 nfs_mark_for_revalidate(old_inode);
1870 out: 1870 out:
1871 if (rehash) 1871 if (rehash)
1872 d_rehash(rehash); 1872 d_rehash(rehash);
1873 if (!error) { 1873 if (!error) {
1874 if (new_inode != NULL) 1874 if (new_inode != NULL)
1875 nfs_drop_nlink(new_inode); 1875 nfs_drop_nlink(new_inode);
1876 d_move(old_dentry, new_dentry); 1876 d_move(old_dentry, new_dentry);
1877 nfs_set_verifier(new_dentry, 1877 nfs_set_verifier(new_dentry,
1878 nfs_save_change_attribute(new_dir)); 1878 nfs_save_change_attribute(new_dir));
1879 } else if (error == -ENOENT) 1879 } else if (error == -ENOENT)
1880 nfs_dentry_handle_enoent(old_dentry); 1880 nfs_dentry_handle_enoent(old_dentry);
1881 1881
1882 /* new dentry created? */ 1882 /* new dentry created? */
1883 if (dentry) 1883 if (dentry)
1884 dput(dentry); 1884 dput(dentry);
1885 return error; 1885 return error;
1886 } 1886 }
1887 EXPORT_SYMBOL_GPL(nfs_rename); 1887 EXPORT_SYMBOL_GPL(nfs_rename);
1888 1888
1889 static DEFINE_SPINLOCK(nfs_access_lru_lock); 1889 static DEFINE_SPINLOCK(nfs_access_lru_lock);
1890 static LIST_HEAD(nfs_access_lru_list); 1890 static LIST_HEAD(nfs_access_lru_list);
1891 static atomic_long_t nfs_access_nr_entries; 1891 static atomic_long_t nfs_access_nr_entries;
1892 1892
1893 static void nfs_access_free_entry(struct nfs_access_entry *entry) 1893 static void nfs_access_free_entry(struct nfs_access_entry *entry)
1894 { 1894 {
1895 put_rpccred(entry->cred); 1895 put_rpccred(entry->cred);
1896 kfree(entry); 1896 kfree(entry);
1897 smp_mb__before_atomic_dec(); 1897 smp_mb__before_atomic_dec();
1898 atomic_long_dec(&nfs_access_nr_entries); 1898 atomic_long_dec(&nfs_access_nr_entries);
1899 smp_mb__after_atomic_dec(); 1899 smp_mb__after_atomic_dec();
1900 } 1900 }
1901 1901
1902 static void nfs_access_free_list(struct list_head *head) 1902 static void nfs_access_free_list(struct list_head *head)
1903 { 1903 {
1904 struct nfs_access_entry *cache; 1904 struct nfs_access_entry *cache;
1905 1905
1906 while (!list_empty(head)) { 1906 while (!list_empty(head)) {
1907 cache = list_entry(head->next, struct nfs_access_entry, lru); 1907 cache = list_entry(head->next, struct nfs_access_entry, lru);
1908 list_del(&cache->lru); 1908 list_del(&cache->lru);
1909 nfs_access_free_entry(cache); 1909 nfs_access_free_entry(cache);
1910 } 1910 }
1911 } 1911 }
1912 1912
1913 int nfs_access_cache_shrinker(struct shrinker *shrink, 1913 int nfs_access_cache_shrinker(struct shrinker *shrink,
1914 struct shrink_control *sc) 1914 struct shrink_control *sc)
1915 { 1915 {
1916 LIST_HEAD(head); 1916 LIST_HEAD(head);
1917 struct nfs_inode *nfsi, *next; 1917 struct nfs_inode *nfsi, *next;
1918 struct nfs_access_entry *cache; 1918 struct nfs_access_entry *cache;
1919 int nr_to_scan = sc->nr_to_scan; 1919 int nr_to_scan = sc->nr_to_scan;
1920 gfp_t gfp_mask = sc->gfp_mask; 1920 gfp_t gfp_mask = sc->gfp_mask;
1921 1921
1922 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 1922 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1923 return (nr_to_scan == 0) ? 0 : -1; 1923 return (nr_to_scan == 0) ? 0 : -1;
1924 1924
1925 spin_lock(&nfs_access_lru_lock); 1925 spin_lock(&nfs_access_lru_lock);
1926 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { 1926 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
1927 struct inode *inode; 1927 struct inode *inode;
1928 1928
1929 if (nr_to_scan-- == 0) 1929 if (nr_to_scan-- == 0)
1930 break; 1930 break;
1931 inode = &nfsi->vfs_inode; 1931 inode = &nfsi->vfs_inode;
1932 spin_lock(&inode->i_lock); 1932 spin_lock(&inode->i_lock);
1933 if (list_empty(&nfsi->access_cache_entry_lru)) 1933 if (list_empty(&nfsi->access_cache_entry_lru))
1934 goto remove_lru_entry; 1934 goto remove_lru_entry;
1935 cache = list_entry(nfsi->access_cache_entry_lru.next, 1935 cache = list_entry(nfsi->access_cache_entry_lru.next,
1936 struct nfs_access_entry, lru); 1936 struct nfs_access_entry, lru);
1937 list_move(&cache->lru, &head); 1937 list_move(&cache->lru, &head);
1938 rb_erase(&cache->rb_node, &nfsi->access_cache); 1938 rb_erase(&cache->rb_node, &nfsi->access_cache);
1939 if (!list_empty(&nfsi->access_cache_entry_lru)) 1939 if (!list_empty(&nfsi->access_cache_entry_lru))
1940 list_move_tail(&nfsi->access_cache_inode_lru, 1940 list_move_tail(&nfsi->access_cache_inode_lru,
1941 &nfs_access_lru_list); 1941 &nfs_access_lru_list);
1942 else { 1942 else {
1943 remove_lru_entry: 1943 remove_lru_entry:
1944 list_del_init(&nfsi->access_cache_inode_lru); 1944 list_del_init(&nfsi->access_cache_inode_lru);
1945 smp_mb__before_clear_bit(); 1945 smp_mb__before_clear_bit();
1946 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1946 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1947 smp_mb__after_clear_bit(); 1947 smp_mb__after_clear_bit();
1948 } 1948 }
1949 spin_unlock(&inode->i_lock); 1949 spin_unlock(&inode->i_lock);
1950 } 1950 }
1951 spin_unlock(&nfs_access_lru_lock); 1951 spin_unlock(&nfs_access_lru_lock);
1952 nfs_access_free_list(&head); 1952 nfs_access_free_list(&head);
1953 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 1953 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1954 } 1954 }
1955 1955
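The shrinker's return value above is the usual "objects we could free" estimate scaled by sysctl_vfs_cache_pressure: for example, with the default vfs_cache_pressure of 100 and 10,000 cached access entries it reports (10000 / 100) * 100 = 10,000 freeable entries, while lowering the sysctl to 50 would halve that to 5,000 and make the access cache shrink more slowly relative to other caches.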
1956 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) 1956 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
1957 { 1957 {
1958 struct rb_root *root_node = &nfsi->access_cache; 1958 struct rb_root *root_node = &nfsi->access_cache;
1959 struct rb_node *n; 1959 struct rb_node *n;
1960 struct nfs_access_entry *entry; 1960 struct nfs_access_entry *entry;
1961 1961
1962 /* Unhook entries from the cache */ 1962 /* Unhook entries from the cache */
1963 while ((n = rb_first(root_node)) != NULL) { 1963 while ((n = rb_first(root_node)) != NULL) {
1964 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1964 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1965 rb_erase(n, root_node); 1965 rb_erase(n, root_node);
1966 list_move(&entry->lru, head); 1966 list_move(&entry->lru, head);
1967 } 1967 }
1968 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1968 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1969 } 1969 }
1970 1970
1971 void nfs_access_zap_cache(struct inode *inode) 1971 void nfs_access_zap_cache(struct inode *inode)
1972 { 1972 {
1973 LIST_HEAD(head); 1973 LIST_HEAD(head);
1974 1974
1975 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) 1975 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
1976 return; 1976 return;
1977 /* Remove from global LRU list */ 1977 /* Remove from global LRU list */
1978 spin_lock(&nfs_access_lru_lock); 1978 spin_lock(&nfs_access_lru_lock);
1979 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) 1979 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1980 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1980 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1981 1981
1982 spin_lock(&inode->i_lock); 1982 spin_lock(&inode->i_lock);
1983 __nfs_access_zap_cache(NFS_I(inode), &head); 1983 __nfs_access_zap_cache(NFS_I(inode), &head);
1984 spin_unlock(&inode->i_lock); 1984 spin_unlock(&inode->i_lock);
1985 spin_unlock(&nfs_access_lru_lock); 1985 spin_unlock(&nfs_access_lru_lock);
1986 nfs_access_free_list(&head); 1986 nfs_access_free_list(&head);
1987 } 1987 }
1988 EXPORT_SYMBOL_GPL(nfs_access_zap_cache); 1988 EXPORT_SYMBOL_GPL(nfs_access_zap_cache);
1989 1989
1990 static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) 1990 static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
1991 { 1991 {
1992 struct rb_node *n = NFS_I(inode)->access_cache.rb_node; 1992 struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
1993 struct nfs_access_entry *entry; 1993 struct nfs_access_entry *entry;
1994 1994
1995 while (n != NULL) { 1995 while (n != NULL) {
1996 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1996 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1997 1997
1998 if (cred < entry->cred) 1998 if (cred < entry->cred)
1999 n = n->rb_left; 1999 n = n->rb_left;
2000 else if (cred > entry->cred) 2000 else if (cred > entry->cred)
2001 n = n->rb_right; 2001 n = n->rb_right;
2002 else 2002 else
2003 return entry; 2003 return entry;
2004 } 2004 }
2005 return NULL; 2005 return NULL;
2006 } 2006 }
2007 2007
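nfs_access_search_rbtree() above is a textbook rbtree descent keyed on the raw rpc_cred pointer value. A self-contained sketch of the same keyed descent over a plain binary search tree (struct node and bst_search() are illustrative stand-ins, not kernel API; the pointer comparison goes through uintptr_t, which the kernel version does implicitly):

#include <stdint.h>
#include <stddef.h>

struct node {
	const void *key;		/* plays the role of entry->cred */
	struct node *left, *right;
};

/* Descend left or right on pointer order until the key matches. */
static struct node *bst_search(struct node *n, const void *key)
{
	while (n != NULL) {
		if ((uintptr_t)key < (uintptr_t)n->key)
			n = n->left;
		else if ((uintptr_t)key > (uintptr_t)n->key)
			n = n->right;
		else
			return n;
	}
	return NULL;
}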
2008 static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) 2008 static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
2009 { 2009 {
2010 struct nfs_inode *nfsi = NFS_I(inode); 2010 struct nfs_inode *nfsi = NFS_I(inode);
2011 struct nfs_access_entry *cache; 2011 struct nfs_access_entry *cache;
2012 int err = -ENOENT; 2012 int err = -ENOENT;
2013 2013
2014 spin_lock(&inode->i_lock); 2014 spin_lock(&inode->i_lock);
2015 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) 2015 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
2016 goto out_zap; 2016 goto out_zap;
2017 cache = nfs_access_search_rbtree(inode, cred); 2017 cache = nfs_access_search_rbtree(inode, cred);
2018 if (cache == NULL) 2018 if (cache == NULL)
2019 goto out; 2019 goto out;
2020 if (!nfs_have_delegated_attributes(inode) && 2020 if (!nfs_have_delegated_attributes(inode) &&
2021 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 2021 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
2022 goto out_stale; 2022 goto out_stale;
2023 res->jiffies = cache->jiffies; 2023 res->jiffies = cache->jiffies;
2024 res->cred = cache->cred; 2024 res->cred = cache->cred;
2025 res->mask = cache->mask; 2025 res->mask = cache->mask;
2026 list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); 2026 list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
2027 err = 0; 2027 err = 0;
2028 out: 2028 out:
2029 spin_unlock(&inode->i_lock); 2029 spin_unlock(&inode->i_lock);
2030 return err; 2030 return err;
2031 out_stale: 2031 out_stale:
2032 rb_erase(&cache->rb_node, &nfsi->access_cache); 2032 rb_erase(&cache->rb_node, &nfsi->access_cache);
2033 list_del(&cache->lru); 2033 list_del(&cache->lru);
2034 spin_unlock(&inode->i_lock); 2034 spin_unlock(&inode->i_lock);
2035 nfs_access_free_entry(cache); 2035 nfs_access_free_entry(cache);
2036 return -ENOENT; 2036 return -ENOENT;
2037 out_zap: 2037 out_zap:
2038 spin_unlock(&inode->i_lock); 2038 spin_unlock(&inode->i_lock);
2039 nfs_access_zap_cache(inode); 2039 nfs_access_zap_cache(inode);
2040 return -ENOENT; 2040 return -ENOENT;
2041 } 2041 }
2042 2042
2043 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) 2043 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
2044 { 2044 {
2045 struct nfs_inode *nfsi = NFS_I(inode); 2045 struct nfs_inode *nfsi = NFS_I(inode);
2046 struct rb_root *root_node = &nfsi->access_cache; 2046 struct rb_root *root_node = &nfsi->access_cache;
2047 struct rb_node **p = &root_node->rb_node; 2047 struct rb_node **p = &root_node->rb_node;
2048 struct rb_node *parent = NULL; 2048 struct rb_node *parent = NULL;
2049 struct nfs_access_entry *entry; 2049 struct nfs_access_entry *entry;
2050 2050
2051 spin_lock(&inode->i_lock); 2051 spin_lock(&inode->i_lock);
2052 while (*p != NULL) { 2052 while (*p != NULL) {
2053 parent = *p; 2053 parent = *p;
2054 entry = rb_entry(parent, struct nfs_access_entry, rb_node); 2054 entry = rb_entry(parent, struct nfs_access_entry, rb_node);
2055 2055
2056 if (set->cred < entry->cred) 2056 if (set->cred < entry->cred)
2057 p = &parent->rb_left; 2057 p = &parent->rb_left;
2058 else if (set->cred > entry->cred) 2058 else if (set->cred > entry->cred)
2059 p = &parent->rb_right; 2059 p = &parent->rb_right;
2060 else 2060 else
2061 goto found; 2061 goto found;
2062 } 2062 }
2063 rb_link_node(&set->rb_node, parent, p); 2063 rb_link_node(&set->rb_node, parent, p);
2064 rb_insert_color(&set->rb_node, root_node); 2064 rb_insert_color(&set->rb_node, root_node);
2065 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); 2065 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
2066 spin_unlock(&inode->i_lock); 2066 spin_unlock(&inode->i_lock);
2067 return; 2067 return;
2068 found: 2068 found:
2069 rb_replace_node(parent, &set->rb_node, root_node); 2069 rb_replace_node(parent, &set->rb_node, root_node);
2070 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); 2070 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
2071 list_del(&entry->lru); 2071 list_del(&entry->lru);
2072 spin_unlock(&inode->i_lock); 2072 spin_unlock(&inode->i_lock);
2073 nfs_access_free_entry(entry); 2073 nfs_access_free_entry(entry);
2074 } 2074 }
2075 2075
2076 void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 2076 void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2077 { 2077 {
2078 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); 2078 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
2079 if (cache == NULL) 2079 if (cache == NULL)
2080 return; 2080 return;
2081 RB_CLEAR_NODE(&cache->rb_node); 2081 RB_CLEAR_NODE(&cache->rb_node);
2082 cache->jiffies = set->jiffies; 2082 cache->jiffies = set->jiffies;
2083 cache->cred = get_rpccred(set->cred); 2083 cache->cred = get_rpccred(set->cred);
2084 cache->mask = set->mask; 2084 cache->mask = set->mask;
2085 2085
2086 nfs_access_add_rbtree(inode, cache); 2086 nfs_access_add_rbtree(inode, cache);
2087 2087
2088 /* Update accounting */ 2088 /* Update accounting */
2089 smp_mb__before_atomic_inc(); 2089 smp_mb__before_atomic_inc();
2090 atomic_long_inc(&nfs_access_nr_entries); 2090 atomic_long_inc(&nfs_access_nr_entries);
2091 smp_mb__after_atomic_inc(); 2091 smp_mb__after_atomic_inc();
2092 2092
2093 /* Add inode to global LRU list */ 2093 /* Add inode to global LRU list */
2094 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 2094 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
2095 spin_lock(&nfs_access_lru_lock); 2095 spin_lock(&nfs_access_lru_lock);
2096 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) 2096 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
2097 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, 2097 list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
2098 &nfs_access_lru_list); 2098 &nfs_access_lru_list);
2099 spin_unlock(&nfs_access_lru_lock); 2099 spin_unlock(&nfs_access_lru_lock);
2100 } 2100 }
2101 } 2101 }
2102 EXPORT_SYMBOL_GPL(nfs_access_add_cache); 2102 EXPORT_SYMBOL_GPL(nfs_access_add_cache);
2103 2103
2104 void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) 2104 void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
2105 { 2105 {
2106 entry->mask = 0; 2106 entry->mask = 0;
2107 if (access_result & NFS4_ACCESS_READ) 2107 if (access_result & NFS4_ACCESS_READ)
2108 entry->mask |= MAY_READ; 2108 entry->mask |= MAY_READ;
2109 if (access_result & 2109 if (access_result &
2110 (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) 2110 (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
2111 entry->mask |= MAY_WRITE; 2111 entry->mask |= MAY_WRITE;
2112 if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) 2112 if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2113 entry->mask |= MAY_EXEC; 2113 entry->mask |= MAY_EXEC;
2114 } 2114 }
2115 EXPORT_SYMBOL_GPL(nfs_access_set_mask); 2115 EXPORT_SYMBOL_GPL(nfs_access_set_mask);
2116 2116
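nfs_access_set_mask() collapses the server's NFSv4 ACCESS reply bits into the VFS MAY_* mask: any of MODIFY/EXTEND/DELETE counts as writable, and either LOOKUP or EXECUTE counts as executable. A userspace sketch of the same collapse (the ACCESS_* and PERM_* values below are local stand-ins chosen for illustration, not the kernel's NFS4_ACCESS_* or MAY_* definitions):

/* Illustrative bit values; the kernel's constants may differ. */
#define ACCESS_READ    0x01
#define ACCESS_LOOKUP  0x02
#define ACCESS_MODIFY  0x04
#define ACCESS_EXTEND  0x08
#define ACCESS_DELETE  0x10
#define ACCESS_EXECUTE 0x20

#define PERM_READ  0x1
#define PERM_WRITE 0x2
#define PERM_EXEC  0x4

static unsigned int collapse_access(unsigned int access_result)
{
	unsigned int mask = 0;

	if (access_result & ACCESS_READ)
		mask |= PERM_READ;
	if (access_result & (ACCESS_MODIFY | ACCESS_EXTEND | ACCESS_DELETE))
		mask |= PERM_WRITE;
	if (access_result & (ACCESS_LOOKUP | ACCESS_EXECUTE))
		mask |= PERM_EXEC;
	return mask;
}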
2117 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) 2117 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2118 { 2118 {
2119 struct nfs_access_entry cache; 2119 struct nfs_access_entry cache;
2120 int status; 2120 int status;
2121 2121
2122 status = nfs_access_get_cached(inode, cred, &cache); 2122 status = nfs_access_get_cached(inode, cred, &cache);
2123 if (status == 0) 2123 if (status == 0)
2124 goto out; 2124 goto out;
2125 2125
2126 /* Be clever: ask server to check for all possible rights */ 2126 /* Be clever: ask server to check for all possible rights */
2127 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; 2127 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
2128 cache.cred = cred; 2128 cache.cred = cred;
2129 cache.jiffies = jiffies; 2129 cache.jiffies = jiffies;
2130 status = NFS_PROTO(inode)->access(inode, &cache); 2130 status = NFS_PROTO(inode)->access(inode, &cache);
2131 if (status != 0) { 2131 if (status != 0) {
2132 if (status == -ESTALE) { 2132 if (status == -ESTALE) {
2133 nfs_zap_caches(inode); 2133 nfs_zap_caches(inode);
2134 if (!S_ISDIR(inode->i_mode)) 2134 if (!S_ISDIR(inode->i_mode))
2135 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 2135 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
2136 } 2136 }
2137 return status; 2137 return status;
2138 } 2138 }
2139 nfs_access_add_cache(inode, &cache); 2139 nfs_access_add_cache(inode, &cache);
2140 out: 2140 out:
2141 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2141 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
2142 return 0; 2142 return 0;
2143 return -EACCES; 2143 return -EACCES;
2144 } 2144 }
2145 2145
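Note the pattern in nfs_do_access() above: on a cache miss it asks the server for all three rights at once (MAY_EXEC | MAY_WRITE | MAY_READ), so a single ACCESS round trip can satisfy later checks for any subset. The final test then grants the request only if every requested permission bit is present in the cached mask. A sketch of that subset test in isolation (permitted() is a hypothetical helper, reusing the PERM_* stand-ins from the sketch above):

/* Grant iff no requested bit is missing from the cached rights mask. */
static int permitted(unsigned int requested, unsigned int cached)
{
	unsigned int all = PERM_READ | PERM_WRITE | PERM_EXEC;

	return (requested & ~cached & all) == 0;
}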
2146 static int nfs_open_permission_mask(int openflags) 2146 static int nfs_open_permission_mask(int openflags)
2147 { 2147 {
2148 int mask = 0; 2148 int mask = 0;
2149 2149
2150 if ((openflags & O_ACCMODE) != O_WRONLY) 2150 if ((openflags & O_ACCMODE) != O_WRONLY)
2151 mask |= MAY_READ; 2151 mask |= MAY_READ;
2152 if ((openflags & O_ACCMODE) != O_RDONLY) 2152 if ((openflags & O_ACCMODE) != O_RDONLY)
2153 mask |= MAY_WRITE; 2153 mask |= MAY_WRITE;
2154 if (openflags & __FMODE_EXEC) 2154 if (openflags & __FMODE_EXEC)
2155 mask |= MAY_EXEC; 2155 mask |= MAY_EXEC;
2156 return mask; 2156 return mask;
2157 } 2157 }
2158 2158
2159 int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) 2159 int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
2160 { 2160 {
2161 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); 2161 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
2162 } 2162 }
2163 EXPORT_SYMBOL_GPL(nfs_may_open); 2163 EXPORT_SYMBOL_GPL(nfs_may_open);
2164 2164
2165 int nfs_permission(struct inode *inode, int mask) 2165 int nfs_permission(struct inode *inode, int mask)
2166 { 2166 {
2167 struct rpc_cred *cred; 2167 struct rpc_cred *cred;
2168 int res = 0; 2168 int res = 0;
2169 2169
2170 if (mask & MAY_NOT_BLOCK) 2170 if (mask & MAY_NOT_BLOCK)
2171 return -ECHILD; 2171 return -ECHILD;
2172 2172
2173 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2173 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
2174 2174
2175 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2175 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
2176 goto out; 2176 goto out;
2177 /* Is this sys_access()? */ 2177 /* Is this sys_access()? */
2178 if (mask & (MAY_ACCESS | MAY_CHDIR)) 2178 if (mask & (MAY_ACCESS | MAY_CHDIR))
2179 goto force_lookup; 2179 goto force_lookup;
2180 2180
2181 switch (inode->i_mode & S_IFMT) { 2181 switch (inode->i_mode & S_IFMT) {
2182 case S_IFLNK: 2182 case S_IFLNK:
2183 goto out; 2183 goto out;
2184 case S_IFREG: 2184 case S_IFREG:
2185 /* NFSv4 has atomic_open... */ 2185 /* NFSv4 has atomic_open... */
2186 if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) 2186 if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
2187 && (mask & MAY_OPEN) 2187 && (mask & MAY_OPEN)
2188 && !(mask & MAY_EXEC)) 2188 && !(mask & MAY_EXEC))
2189 goto out; 2189 goto out;
2190 break; 2190 break;
2191 case S_IFDIR: 2191 case S_IFDIR:
2192 /* 2192 /*
2193 * Optimize away all write operations, since the server 2193 * Optimize away all write operations, since the server
2194 * will check permissions when we perform the op. 2194 * will check permissions when we perform the op.
2195 */ 2195 */
2196 if ((mask & MAY_WRITE) && !(mask & MAY_READ)) 2196 if ((mask & MAY_WRITE) && !(mask & MAY_READ))
2197 goto out; 2197 goto out;
2198 } 2198 }
2199 2199
2200 force_lookup: 2200 force_lookup:
2201 if (!NFS_PROTO(inode)->access) 2201 if (!NFS_PROTO(inode)->access)
2202 goto out_notsup; 2202 goto out_notsup;
2203 2203
2204 cred = rpc_lookup_cred(); 2204 cred = rpc_lookup_cred();
2205 if (!IS_ERR(cred)) { 2205 if (!IS_ERR(cred)) {
2206 res = nfs_do_access(inode, cred, mask); 2206 res = nfs_do_access(inode, cred, mask);
2207 put_rpccred(cred); 2207 put_rpccred(cred);
2208 } else 2208 } else
2209 res = PTR_ERR(cred); 2209 res = PTR_ERR(cred);
2210 out: 2210 out:
2211 if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) 2211 if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
2212 res = -EACCES; 2212 res = -EACCES;
2213 2213
2214 dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", 2214 dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
2215 inode->i_sb->s_id, inode->i_ino, mask, res); 2215 inode->i_sb->s_id, inode->i_ino, mask, res);
2216 return res; 2216 return res;
2217 out_notsup: 2217 out_notsup:
2218 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2218 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
2219 if (res == 0) 2219 if (res == 0)
2220 res = generic_permission(inode, mask); 2220 res = generic_permission(inode, mask);
2221 goto out; 2221 goto out;
2222 } 2222 }
2223 EXPORT_SYMBOL_GPL(nfs_permission); 2223 EXPORT_SYMBOL_GPL(nfs_permission);
2224 2224
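The O_ACCMODE handling in nfs_open_permission_mask() above is easy to misread: O_RDONLY, O_WRONLY and O_RDWR are values within the O_ACCMODE field, not independent bits, so the mapping to MAY_READ/MAY_WRITE is done with inequality tests rather than bitwise ANDs. A minimal standalone sketch of the same mapping, plain userspace C with illustrative MAY_* values rather than the kernel's headers:

	#include <fcntl.h>
	#include <stdio.h>

	/* Illustrative values only, not the kernel's definitions. */
	#define MAY_EXEC	0x01
	#define MAY_WRITE	0x02
	#define MAY_READ	0x04

	static int open_permission_mask(int openflags)
	{
		int mask = 0;

		if ((openflags & O_ACCMODE) != O_WRONLY)	/* O_RDONLY or O_RDWR */
			mask |= MAY_READ;
		if ((openflags & O_ACCMODE) != O_RDONLY)	/* O_WRONLY or O_RDWR */
			mask |= MAY_WRITE;
		return mask;
	}

	int main(void)
	{
		printf("O_RDONLY -> %#x\n", open_permission_mask(O_RDONLY)); /* MAY_READ */
		printf("O_WRONLY -> %#x\n", open_permission_mask(O_WRONLY)); /* MAY_WRITE */
		printf("O_RDWR   -> %#x\n", open_permission_mask(O_RDWR));   /* both */
		return 0;
	}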
2225 /* 2225 /*
2226 * Local variables: 2226 * Local variables:
2227 * version-control: t 2227 * version-control: t
2228 * kept-new-versions: 5 2228 * kept-new-versions: 5
2229 * End: 2229 * End:
2230 */ 2230 */
2231 2231
fs/nfs/file.c
1 /* 1 /*
2 * linux/fs/nfs/file.c 2 * linux/fs/nfs/file.c
3 * 3 *
4 * Copyright (C) 1992 Rick Sladkey 4 * Copyright (C) 1992 Rick Sladkey
5 * 5 *
6 * Changes Copyright (C) 1994 by Florian La Roche 6 * Changes Copyright (C) 1994 by Florian La Roche
7 * - Do not copy data too often around in the kernel. 7 * - Do not copy data too often around in the kernel.
8 * - In nfs_file_read the return value of kmalloc wasn't checked. 8 * - In nfs_file_read the return value of kmalloc wasn't checked.
9 * - Put in a better version of read look-ahead buffering. Original idea 9 * - Put in a better version of read look-ahead buffering. Original idea
10 * and implementation by Wai S Kok elekokws@ee.nus.sg. 10 * and implementation by Wai S Kok elekokws@ee.nus.sg.
11 * 11 *
12 * Expire cache on write to a file by Wai S Kok (Oct 1994). 12 * Expire cache on write to a file by Wai S Kok (Oct 1994).
13 * 13 *
14 * Total rewrite of read side for new NFS buffer cache.. Linus. 14 * Total rewrite of read side for new NFS buffer cache.. Linus.
15 * 15 *
16 * nfs regular file handling functions 16 * nfs regular file handling functions
17 */ 17 */
18 18
19 #include <linux/module.h> 19 #include <linux/module.h>
20 #include <linux/time.h> 20 #include <linux/time.h>
21 #include <linux/kernel.h> 21 #include <linux/kernel.h>
22 #include <linux/errno.h> 22 #include <linux/errno.h>
23 #include <linux/fcntl.h> 23 #include <linux/fcntl.h>
24 #include <linux/stat.h> 24 #include <linux/stat.h>
25 #include <linux/nfs_fs.h> 25 #include <linux/nfs_fs.h>
26 #include <linux/nfs_mount.h> 26 #include <linux/nfs_mount.h>
27 #include <linux/mm.h> 27 #include <linux/mm.h>
28 #include <linux/pagemap.h> 28 #include <linux/pagemap.h>
29 #include <linux/aio.h> 29 #include <linux/aio.h>
30 #include <linux/gfp.h> 30 #include <linux/gfp.h>
31 #include <linux/swap.h> 31 #include <linux/swap.h>
32 32
33 #include <asm/uaccess.h> 33 #include <asm/uaccess.h>
34 34
35 #include "delegation.h" 35 #include "delegation.h"
36 #include "internal.h" 36 #include "internal.h"
37 #include "iostat.h" 37 #include "iostat.h"
38 #include "fscache.h" 38 #include "fscache.h"
39 39
40 #define NFSDBG_FACILITY NFSDBG_FILE 40 #define NFSDBG_FACILITY NFSDBG_FILE
41 41
42 static const struct vm_operations_struct nfs_file_vm_ops; 42 static const struct vm_operations_struct nfs_file_vm_ops;
43 43
44 /* Hack for future NFS swap support */ 44 /* Hack for future NFS swap support */
45 #ifndef IS_SWAPFILE 45 #ifndef IS_SWAPFILE
46 # define IS_SWAPFILE(inode) (0) 46 # define IS_SWAPFILE(inode) (0)
47 #endif 47 #endif
48 48
49 int nfs_check_flags(int flags) 49 int nfs_check_flags(int flags)
50 { 50 {
51 if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) 51 if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
52 return -EINVAL; 52 return -EINVAL;
53 53
54 return 0; 54 return 0;
55 } 55 }
56 EXPORT_SYMBOL_GPL(nfs_check_flags); 56 EXPORT_SYMBOL_GPL(nfs_check_flags);
57 57
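nfs_check_flags() rejects O_APPEND combined with O_DIRECT, presumably because an append offset derived from the client's cached idea of the file size cannot be trusted once writes bypass the page cache. From userspace the combination simply fails at open time; a quick probe with a hypothetical path (glibc needs _GNU_SOURCE for O_DIRECT):

	#define _GNU_SOURCE
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* On an NFS mount, nfs_file_open -> nfs_check_flags returns -EINVAL. */
		int fd = open("/mnt/nfs/example", O_WRONLY | O_APPEND | O_DIRECT);
		if (fd < 0)
			printf("open: %s\n", strerror(errno));
		return 0;
	}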
58 /* 58 /*
59 * Open file 59 * Open file
60 */ 60 */
61 static int 61 static int
62 nfs_file_open(struct inode *inode, struct file *filp) 62 nfs_file_open(struct inode *inode, struct file *filp)
63 { 63 {
64 int res; 64 int res;
65 65
66 dprintk("NFS: open file(%s/%s)\n", 66 dprintk("NFS: open file(%s/%s)\n",
67 filp->f_path.dentry->d_parent->d_name.name, 67 filp->f_path.dentry->d_parent->d_name.name,
68 filp->f_path.dentry->d_name.name); 68 filp->f_path.dentry->d_name.name);
69 69
70 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 70 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
71 res = nfs_check_flags(filp->f_flags); 71 res = nfs_check_flags(filp->f_flags);
72 if (res) 72 if (res)
73 return res; 73 return res;
74 74
75 res = nfs_open(inode, filp); 75 res = nfs_open(inode, filp);
76 return res; 76 return res;
77 } 77 }
78 78
79 int 79 int
80 nfs_file_release(struct inode *inode, struct file *filp) 80 nfs_file_release(struct inode *inode, struct file *filp)
81 { 81 {
82 dprintk("NFS: release(%s/%s)\n", 82 dprintk("NFS: release(%s/%s)\n",
83 filp->f_path.dentry->d_parent->d_name.name, 83 filp->f_path.dentry->d_parent->d_name.name,
84 filp->f_path.dentry->d_name.name); 84 filp->f_path.dentry->d_name.name);
85 85
86 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 86 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
87 return nfs_release(inode, filp); 87 return nfs_release(inode, filp);
88 } 88 }
89 EXPORT_SYMBOL_GPL(nfs_file_release); 89 EXPORT_SYMBOL_GPL(nfs_file_release);
90 90
91 /** 91 /**
92 * nfs_revalidate_size - Revalidate the file size 92 * nfs_revalidate_size - Revalidate the file size
93 * @inode: pointer to inode struct 93 * @inode: pointer to inode struct
94 * @filp: pointer to struct file 94 * @filp: pointer to struct file
95 * 95 *
96 * Revalidates the file length. This is basically a wrapper around 96 * Revalidates the file length. This is basically a wrapper around
97 * nfs_revalidate_inode() that takes into account the fact that we may 97 * nfs_revalidate_inode() that takes into account the fact that we may
98 * have cached writes (in which case we don't care about the server's 98 * have cached writes (in which case we don't care about the server's
99 * idea of what the file length is), or O_DIRECT (in which case we 99 * idea of what the file length is), or O_DIRECT (in which case we
100 * shouldn't trust the cache). 100 * shouldn't trust the cache).
101 */ 101 */
102 static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) 102 static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
103 { 103 {
104 struct nfs_server *server = NFS_SERVER(inode); 104 struct nfs_server *server = NFS_SERVER(inode);
105 struct nfs_inode *nfsi = NFS_I(inode); 105 struct nfs_inode *nfsi = NFS_I(inode);
106 106
107 if (nfs_have_delegated_attributes(inode)) 107 if (nfs_have_delegated_attributes(inode))
108 goto out_noreval; 108 goto out_noreval;
109 109
110 if (filp->f_flags & O_DIRECT) 110 if (filp->f_flags & O_DIRECT)
111 goto force_reval; 111 goto force_reval;
112 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 112 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
113 goto force_reval; 113 goto force_reval;
114 if (nfs_attribute_timeout(inode)) 114 if (nfs_attribute_timeout(inode))
115 goto force_reval; 115 goto force_reval;
116 out_noreval: 116 out_noreval:
117 return 0; 117 return 0;
118 force_reval: 118 force_reval:
119 return __nfs_revalidate_inode(server, inode); 119 return __nfs_revalidate_inode(server, inode);
120 } 120 }
121 121
122 loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 122 loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
123 { 123 {
124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", 124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
125 filp->f_path.dentry->d_parent->d_name.name, 125 filp->f_path.dentry->d_parent->d_name.name,
126 filp->f_path.dentry->d_name.name, 126 filp->f_path.dentry->d_name.name,
127 offset, origin); 127 offset, whence);
128 128
129 /* 129 /*
130 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 130 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
131 * the cached file length 131 * the cached file length
132 */ 132 */
133 if (origin != SEEK_SET && origin != SEEK_CUR) { 133 if (whence != SEEK_SET && whence != SEEK_CUR) {
134 struct inode *inode = filp->f_mapping->host; 134 struct inode *inode = filp->f_mapping->host;
135 135
136 int retval = nfs_revalidate_file_size(inode, filp); 136 int retval = nfs_revalidate_file_size(inode, filp);
137 if (retval < 0) 137 if (retval < 0)
138 return (loff_t)retval; 138 return (loff_t)retval;
139 } 139 }
140 140
141 return generic_file_llseek(filp, offset, origin); 141 return generic_file_llseek(filp, offset, whence);
142 } 142 }
143 EXPORT_SYMBOL_GPL(nfs_file_llseek); 143 EXPORT_SYMBOL_GPL(nfs_file_llseek);
144 144
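This is the rename at the heart of the commit, and it also shows why the argument matters: whence selects how offset is interpreted, and SEEK_END (like SEEK_DATA and SEEK_HOLE) is relative to the file size, which an NFS client must revalidate before trusting. A userspace refresher, with a hypothetical file path:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/example", O_RDONLY);	/* hypothetical file */
		if (fd < 0)
			return 1;

		off_t a = lseek(fd, 100, SEEK_SET);	/* absolute offset */
		off_t b = lseek(fd, -50, SEEK_CUR);	/* relative to current position */
		off_t c = lseek(fd, 0, SEEK_END);	/* relative to file size: on NFS
							 * the cached size must be fresh */
		printf("%lld %lld %lld\n", (long long)a, (long long)b, (long long)c);
		close(fd);
		return 0;
	}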
145 /* 145 /*
146 * Flush all dirty pages, and check for write errors. 146 * Flush all dirty pages, and check for write errors.
147 */ 147 */
148 int 148 int
149 nfs_file_flush(struct file *file, fl_owner_t id) 149 nfs_file_flush(struct file *file, fl_owner_t id)
150 { 150 {
151 struct dentry *dentry = file->f_path.dentry; 151 struct dentry *dentry = file->f_path.dentry;
152 struct inode *inode = dentry->d_inode; 152 struct inode *inode = dentry->d_inode;
153 153
154 dprintk("NFS: flush(%s/%s)\n", 154 dprintk("NFS: flush(%s/%s)\n",
155 dentry->d_parent->d_name.name, 155 dentry->d_parent->d_name.name,
156 dentry->d_name.name); 156 dentry->d_name.name);
157 157
158 nfs_inc_stats(inode, NFSIOS_VFSFLUSH); 158 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
159 if ((file->f_mode & FMODE_WRITE) == 0) 159 if ((file->f_mode & FMODE_WRITE) == 0)
160 return 0; 160 return 0;
161 161
162 /* 162 /*
163 * If we're holding a write delegation, then just start the i/o 163 * If we're holding a write delegation, then just start the i/o
164 * but don't wait for completion (or send a commit). 164 * but don't wait for completion (or send a commit).
165 */ 165 */
166 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) 166 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
167 return filemap_fdatawrite(file->f_mapping); 167 return filemap_fdatawrite(file->f_mapping);
168 168
169 /* Flush writes to the server and return any errors */ 169 /* Flush writes to the server and return any errors */
170 return vfs_fsync(file, 0); 170 return vfs_fsync(file, 0);
171 } 171 }
172 EXPORT_SYMBOL_GPL(nfs_file_flush); 172 EXPORT_SYMBOL_GPL(nfs_file_flush);
173 173
174 ssize_t 174 ssize_t
175 nfs_file_read(struct kiocb *iocb, const struct iovec *iov, 175 nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
176 unsigned long nr_segs, loff_t pos) 176 unsigned long nr_segs, loff_t pos)
177 { 177 {
178 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 178 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
179 struct inode * inode = dentry->d_inode; 179 struct inode * inode = dentry->d_inode;
180 ssize_t result; 180 ssize_t result;
181 181
182 if (iocb->ki_filp->f_flags & O_DIRECT) 182 if (iocb->ki_filp->f_flags & O_DIRECT)
183 return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); 183 return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
184 184
185 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 185 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
186 dentry->d_parent->d_name.name, dentry->d_name.name, 186 dentry->d_parent->d_name.name, dentry->d_name.name,
187 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); 187 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
188 188
189 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 189 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
190 if (!result) { 190 if (!result) {
191 result = generic_file_aio_read(iocb, iov, nr_segs, pos); 191 result = generic_file_aio_read(iocb, iov, nr_segs, pos);
192 if (result > 0) 192 if (result > 0)
193 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); 193 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
194 } 194 }
195 return result; 195 return result;
196 } 196 }
197 EXPORT_SYMBOL_GPL(nfs_file_read); 197 EXPORT_SYMBOL_GPL(nfs_file_read);
198 198
199 ssize_t 199 ssize_t
200 nfs_file_splice_read(struct file *filp, loff_t *ppos, 200 nfs_file_splice_read(struct file *filp, loff_t *ppos,
201 struct pipe_inode_info *pipe, size_t count, 201 struct pipe_inode_info *pipe, size_t count,
202 unsigned int flags) 202 unsigned int flags)
203 { 203 {
204 struct dentry *dentry = filp->f_path.dentry; 204 struct dentry *dentry = filp->f_path.dentry;
205 struct inode *inode = dentry->d_inode; 205 struct inode *inode = dentry->d_inode;
206 ssize_t res; 206 ssize_t res;
207 207
208 dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", 208 dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
209 dentry->d_parent->d_name.name, dentry->d_name.name, 209 dentry->d_parent->d_name.name, dentry->d_name.name,
210 (unsigned long) count, (unsigned long long) *ppos); 210 (unsigned long) count, (unsigned long long) *ppos);
211 211
212 res = nfs_revalidate_mapping(inode, filp->f_mapping); 212 res = nfs_revalidate_mapping(inode, filp->f_mapping);
213 if (!res) { 213 if (!res) {
214 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 214 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
215 if (res > 0) 215 if (res > 0)
216 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); 216 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
217 } 217 }
218 return res; 218 return res;
219 } 219 }
220 EXPORT_SYMBOL_GPL(nfs_file_splice_read); 220 EXPORT_SYMBOL_GPL(nfs_file_splice_read);
221 221
222 int 222 int
223 nfs_file_mmap(struct file * file, struct vm_area_struct * vma) 223 nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
224 { 224 {
225 struct dentry *dentry = file->f_path.dentry; 225 struct dentry *dentry = file->f_path.dentry;
226 struct inode *inode = dentry->d_inode; 226 struct inode *inode = dentry->d_inode;
227 int status; 227 int status;
228 228
229 dprintk("NFS: mmap(%s/%s)\n", 229 dprintk("NFS: mmap(%s/%s)\n",
230 dentry->d_parent->d_name.name, dentry->d_name.name); 230 dentry->d_parent->d_name.name, dentry->d_name.name);
231 231
232 /* Note: generic_file_mmap() returns ENOSYS on nommu systems 232 /* Note: generic_file_mmap() returns ENOSYS on nommu systems
233 * so we call that before revalidating the mapping 233 * so we call that before revalidating the mapping
234 */ 234 */
235 status = generic_file_mmap(file, vma); 235 status = generic_file_mmap(file, vma);
236 if (!status) { 236 if (!status) {
237 vma->vm_ops = &nfs_file_vm_ops; 237 vma->vm_ops = &nfs_file_vm_ops;
238 status = nfs_revalidate_mapping(inode, file->f_mapping); 238 status = nfs_revalidate_mapping(inode, file->f_mapping);
239 } 239 }
240 return status; 240 return status;
241 } 241 }
242 EXPORT_SYMBOL_GPL(nfs_file_mmap); 242 EXPORT_SYMBOL_GPL(nfs_file_mmap);
243 243
244 /* 244 /*
245 * Flush any dirty pages for this process, and check for write errors. 245 * Flush any dirty pages for this process, and check for write errors.
246 * The return status from this call provides a reliable indication of 246 * The return status from this call provides a reliable indication of
247 * whether any write errors occurred for this process. 247 * whether any write errors occurred for this process.
248 * 248 *
249 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to 249 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
250 * disk, but it retrieves and clears ctx->error after synching, despite 250 * disk, but it retrieves and clears ctx->error after synching, despite
251 * the two being set at the same time in nfs_context_set_write_error(). 251 * the two being set at the same time in nfs_context_set_write_error().
252 * This is because the former is used to notify the _next_ call to 252 * This is because the former is used to notify the _next_ call to
253 * nfs_file_write() that a write error occurred, and hence cause it to 253 * nfs_file_write() that a write error occurred, and hence cause it to
254 * fall back to doing a synchronous write. 254 * fall back to doing a synchronous write.
255 */ 255 */
256 int 256 int
257 nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) 257 nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
258 { 258 {
259 struct dentry *dentry = file->f_path.dentry; 259 struct dentry *dentry = file->f_path.dentry;
260 struct nfs_open_context *ctx = nfs_file_open_context(file); 260 struct nfs_open_context *ctx = nfs_file_open_context(file);
261 struct inode *inode = dentry->d_inode; 261 struct inode *inode = dentry->d_inode;
262 int have_error, do_resend, status; 262 int have_error, do_resend, status;
263 int ret = 0; 263 int ret = 0;
264 264
265 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 265 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
266 dentry->d_parent->d_name.name, dentry->d_name.name, 266 dentry->d_parent->d_name.name, dentry->d_name.name,
267 datasync); 267 datasync);
268 268
269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
270 do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); 270 do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
271 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 271 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
272 status = nfs_commit_inode(inode, FLUSH_SYNC); 272 status = nfs_commit_inode(inode, FLUSH_SYNC);
273 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 273 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
274 if (have_error) { 274 if (have_error) {
275 ret = xchg(&ctx->error, 0); 275 ret = xchg(&ctx->error, 0);
276 if (ret) 276 if (ret)
277 goto out; 277 goto out;
278 } 278 }
279 if (status < 0) { 279 if (status < 0) {
280 ret = status; 280 ret = status;
281 goto out; 281 goto out;
282 } 282 }
283 do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); 283 do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
284 if (do_resend) 284 if (do_resend)
285 ret = -EAGAIN; 285 ret = -EAGAIN;
286 out: 286 out:
287 return ret; 287 return ret;
288 } 288 }
289 EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); 289 EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
290 290
291 static int 291 static int
292 nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) 292 nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
293 { 293 {
294 int ret; 294 int ret;
295 struct inode *inode = file->f_path.dentry->d_inode; 295 struct inode *inode = file->f_path.dentry->d_inode;
296 296
297 do { 297 do {
298 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 298 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
299 if (ret != 0) 299 if (ret != 0)
300 break; 300 break;
301 mutex_lock(&inode->i_mutex); 301 mutex_lock(&inode->i_mutex);
302 ret = nfs_file_fsync_commit(file, start, end, datasync); 302 ret = nfs_file_fsync_commit(file, start, end, datasync);
303 mutex_unlock(&inode->i_mutex); 303 mutex_unlock(&inode->i_mutex);
304 /* 304 /*
305 * If nfs_file_fsync_commit detected a server reboot, then 305 * If nfs_file_fsync_commit detected a server reboot, then
306 * resend all dirty pages that might have been covered by 306 * resend all dirty pages that might have been covered by
307 * the NFS_CONTEXT_RESEND_WRITES flag 307 * the NFS_CONTEXT_RESEND_WRITES flag
308 */ 308 */
309 start = 0; 309 start = 0;
310 end = LLONG_MAX; 310 end = LLONG_MAX;
311 } while (ret == -EAGAIN); 311 } while (ret == -EAGAIN);
312 312
313 return ret; 313 return ret;
314 } 314 }
315 315
316 /* 316 /*
317 * Decide whether a read/modify/write cycle may be more efficient 317 * Decide whether a read/modify/write cycle may be more efficient
318 * than a modify/write/read cycle when writing to a page in the 318 * than a modify/write/read cycle when writing to a page in the
319 * page cache. 319 * page cache.
320 * 320 *
321 * The modify/write/read cycle may occur if a page is read before 321 * The modify/write/read cycle may occur if a page is read before
322 * being completely filled by the writer. In this situation, the 322 * being completely filled by the writer. In this situation, the
323 * page must be completely written to stable storage on the server 323 * page must be completely written to stable storage on the server
324 * before it can be refilled by reading in the page from the server. 324 * before it can be refilled by reading in the page from the server.
325 * This can lead to expensive, small, FILE_SYNC mode writes being 325 * This can lead to expensive, small, FILE_SYNC mode writes being
326 * done. 326 * done.
327 * 327 *
328 * It may be more efficient to read the page first if the file is 328 * It may be more efficient to read the page first if the file is
329 * open for reading in addition to writing, the page is not marked 329 * open for reading in addition to writing, the page is not marked
330 * as Uptodate, it is not dirty or waiting to be committed, 330 * as Uptodate, it is not dirty or waiting to be committed,
331 * indicating that it was previously allocated and then modified, 331 * indicating that it was previously allocated and then modified,
332 * that there were valid bytes of data in that range of the file, 332 * that there were valid bytes of data in that range of the file,
333 * and that the new data won't completely replace the old data in 333 * and that the new data won't completely replace the old data in
334 * that range of the file. 334 * that range of the file.
335 */ 335 */
336 static int nfs_want_read_modify_write(struct file *file, struct page *page, 336 static int nfs_want_read_modify_write(struct file *file, struct page *page,
337 loff_t pos, unsigned len) 337 loff_t pos, unsigned len)
338 { 338 {
339 unsigned int pglen = nfs_page_length(page); 339 unsigned int pglen = nfs_page_length(page);
340 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); 340 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
341 unsigned int end = offset + len; 341 unsigned int end = offset + len;
342 342
343 if ((file->f_mode & FMODE_READ) && /* open for read? */ 343 if ((file->f_mode & FMODE_READ) && /* open for read? */
344 !PageUptodate(page) && /* Uptodate? */ 344 !PageUptodate(page) && /* Uptodate? */
345 !PagePrivate(page) && /* i/o request already? */ 345 !PagePrivate(page) && /* i/o request already? */
346 pglen && /* valid bytes of file? */ 346 pglen && /* valid bytes of file? */
347 (end < pglen || offset)) /* replace all valid bytes? */ 347 (end < pglen || offset)) /* replace all valid bytes? */
348 return 1; 348 return 1;
349 return 0; 349 return 0;
350 } 350 }
351 351
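The predicate above is compact enough to restate standalone. A sketch of just the arithmetic, using a hypothetical helper name, assuming 4 KiB pages, and ignoring the FMODE_READ/PageUptodate/PagePrivate tests, which need a real struct page:

	#include <stdio.h>

	#define PAGE_CACHE_SIZE 4096UL	/* assuming 4 KiB pages */

	/* Read the page first only if the write would leave previously valid
	 * bytes untouched, either before the write (offset != 0) or after it
	 * (end < pglen). pglen is the number of valid bytes in the page. */
	static int want_read_first(unsigned long long pos, unsigned len, unsigned pglen)
	{
		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
		unsigned end = offset + len;

		return pglen && (end < pglen || offset);
	}

	int main(void)
	{
		/* 100 bytes at offset 200 into a fully valid page: read first. */
		printf("%d\n", want_read_first(8192 + 200, 100, 4096));
		/* Whole-page overwrite: nothing old survives, skip the read. */
		printf("%d\n", want_read_first(8192, 4096, 4096));
		return 0;
	}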
352 /* 352 /*
353 * This does the "real" work of the write. We must allocate and lock the 353 * This does the "real" work of the write. We must allocate and lock the
354 * page to be sent back to the generic routine, which then copies the 354 * page to be sent back to the generic routine, which then copies the
355 * data from user space. 355 * data from user space.
356 * 356 *
357 * If the writer ends up delaying the write, the writer needs to 357 * If the writer ends up delaying the write, the writer needs to
358 * increment the page use counts until he is done with the page. 358 * increment the page use counts until he is done with the page.
359 */ 359 */
360 static int nfs_write_begin(struct file *file, struct address_space *mapping, 360 static int nfs_write_begin(struct file *file, struct address_space *mapping,
361 loff_t pos, unsigned len, unsigned flags, 361 loff_t pos, unsigned len, unsigned flags,
362 struct page **pagep, void **fsdata) 362 struct page **pagep, void **fsdata)
363 { 363 {
364 int ret; 364 int ret;
365 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 365 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
366 struct page *page; 366 struct page *page;
367 int once_thru = 0; 367 int once_thru = 0;
368 368
369 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", 369 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
370 file->f_path.dentry->d_parent->d_name.name, 370 file->f_path.dentry->d_parent->d_name.name,
371 file->f_path.dentry->d_name.name, 371 file->f_path.dentry->d_name.name,
372 mapping->host->i_ino, len, (long long) pos); 372 mapping->host->i_ino, len, (long long) pos);
373 373
374 start: 374 start:
375 /* 375 /*
376 * Prevent starvation issues if someone is doing a consistency 376 * Prevent starvation issues if someone is doing a consistency
377 * sync-to-disk 377 * sync-to-disk
378 */ 378 */
379 ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, 379 ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
380 nfs_wait_bit_killable, TASK_KILLABLE); 380 nfs_wait_bit_killable, TASK_KILLABLE);
381 if (ret) 381 if (ret)
382 return ret; 382 return ret;
383 383
384 page = grab_cache_page_write_begin(mapping, index, flags); 384 page = grab_cache_page_write_begin(mapping, index, flags);
385 if (!page) 385 if (!page)
386 return -ENOMEM; 386 return -ENOMEM;
387 *pagep = page; 387 *pagep = page;
388 388
389 ret = nfs_flush_incompatible(file, page); 389 ret = nfs_flush_incompatible(file, page);
390 if (ret) { 390 if (ret) {
391 unlock_page(page); 391 unlock_page(page);
392 page_cache_release(page); 392 page_cache_release(page);
393 } else if (!once_thru && 393 } else if (!once_thru &&
394 nfs_want_read_modify_write(file, page, pos, len)) { 394 nfs_want_read_modify_write(file, page, pos, len)) {
395 once_thru = 1; 395 once_thru = 1;
396 ret = nfs_readpage(file, page); 396 ret = nfs_readpage(file, page);
397 page_cache_release(page); 397 page_cache_release(page);
398 if (!ret) 398 if (!ret)
399 goto start; 399 goto start;
400 } 400 }
401 return ret; 401 return ret;
402 } 402 }
403 403
404 static int nfs_write_end(struct file *file, struct address_space *mapping, 404 static int nfs_write_end(struct file *file, struct address_space *mapping,
405 loff_t pos, unsigned len, unsigned copied, 405 loff_t pos, unsigned len, unsigned copied,
406 struct page *page, void *fsdata) 406 struct page *page, void *fsdata)
407 { 407 {
408 unsigned offset = pos & (PAGE_CACHE_SIZE - 1); 408 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
409 int status; 409 int status;
410 410
411 dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", 411 dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
412 file->f_path.dentry->d_parent->d_name.name, 412 file->f_path.dentry->d_parent->d_name.name,
413 file->f_path.dentry->d_name.name, 413 file->f_path.dentry->d_name.name,
414 mapping->host->i_ino, len, (long long) pos); 414 mapping->host->i_ino, len, (long long) pos);
415 415
416 /* 416 /*
417 * Zero any uninitialised parts of the page, and then mark the page 417 * Zero any uninitialised parts of the page, and then mark the page
418 * as up to date if it turns out that we're extending the file. 418 * as up to date if it turns out that we're extending the file.
419 */ 419 */
420 if (!PageUptodate(page)) { 420 if (!PageUptodate(page)) {
421 unsigned pglen = nfs_page_length(page); 421 unsigned pglen = nfs_page_length(page);
422 unsigned end = offset + len; 422 unsigned end = offset + len;
423 423
424 if (pglen == 0) { 424 if (pglen == 0) {
425 zero_user_segments(page, 0, offset, 425 zero_user_segments(page, 0, offset,
426 end, PAGE_CACHE_SIZE); 426 end, PAGE_CACHE_SIZE);
427 SetPageUptodate(page); 427 SetPageUptodate(page);
428 } else if (end >= pglen) { 428 } else if (end >= pglen) {
429 zero_user_segment(page, end, PAGE_CACHE_SIZE); 429 zero_user_segment(page, end, PAGE_CACHE_SIZE);
430 if (offset == 0) 430 if (offset == 0)
431 SetPageUptodate(page); 431 SetPageUptodate(page);
432 } else 432 } else
433 zero_user_segment(page, pglen, PAGE_CACHE_SIZE); 433 zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
434 } 434 }
435 435
436 status = nfs_updatepage(file, page, offset, copied); 436 status = nfs_updatepage(file, page, offset, copied);
437 437
438 unlock_page(page); 438 unlock_page(page);
439 page_cache_release(page); 439 page_cache_release(page);
440 440
441 if (status < 0) 441 if (status < 0)
442 return status; 442 return status;
443 NFS_I(mapping->host)->write_io += copied; 443 NFS_I(mapping->host)->write_io += copied;
444 return copied; 444 return copied;
445 } 445 }
446 446
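The three zeroing cases in nfs_write_end() are worth spelling out. A standalone sketch of just the decision; zero_tail and zero_range are hypothetical stand-ins for the kernel's zero_user_segment{,s}() calls, and the SetPageUptodate bookkeeping is left out:

	#include <stdio.h>

	#define PAGE_CACHE_SIZE 4096U	/* assuming 4 KiB pages */

	static void zero_range(unsigned from, unsigned to)
	{
		if (from < to)
			printf("zero [%u, %u)\n", from, to);
	}

	/* offset/end delimit the bytes just copied; pglen is how many bytes
	 * of the page fall inside i_size. */
	static void zero_tail(unsigned offset, unsigned end, unsigned pglen)
	{
		if (pglen == 0) {
			/* Page lies wholly beyond the old EOF: only the copied
			 * bytes are meaningful, so zero both sides of them. */
			zero_range(0, offset);
			zero_range(end, PAGE_CACHE_SIZE);
		} else if (end >= pglen) {
			/* Write reaches or passes the valid length: zero
			 * everything after the new data. */
			zero_range(end, PAGE_CACHE_SIZE);
		} else {
			/* Write lands inside valid data: only the region
			 * beyond the valid length needs clearing. */
			zero_range(pglen, PAGE_CACHE_SIZE);
		}
	}

	int main(void)
	{
		zero_tail(200, 300, 0);		/* beyond EOF: zero [0,200) and [300,4096) */
		zero_tail(0, 4096, 1000);	/* full-page overwrite: nothing to zero */
		return 0;
	}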
447 /* 447 /*
448 * Partially or wholly invalidate a page 448 * Partially or wholly invalidate a page
449 * - Release the private state associated with a page if undergoing complete 449 * - Release the private state associated with a page if undergoing complete
450 * page invalidation 450 * page invalidation
451 * - Called if either PG_private or PG_fscache is set on the page 451 * - Called if either PG_private or PG_fscache is set on the page
452 * - Caller holds page lock 452 * - Caller holds page lock
453 */ 453 */
454 static void nfs_invalidate_page(struct page *page, unsigned long offset) 454 static void nfs_invalidate_page(struct page *page, unsigned long offset)
455 { 455 {
456 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 456 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
457 457
458 if (offset != 0) 458 if (offset != 0)
459 return; 459 return;
460 /* Cancel any unstarted writes on this page */ 460 /* Cancel any unstarted writes on this page */
461 nfs_wb_page_cancel(page_file_mapping(page)->host, page); 461 nfs_wb_page_cancel(page_file_mapping(page)->host, page);
462 462
463 nfs_fscache_invalidate_page(page, page->mapping->host); 463 nfs_fscache_invalidate_page(page, page->mapping->host);
464 } 464 }
465 465
466 /* 466 /*
467 * Attempt to release the private state associated with a page 467 * Attempt to release the private state associated with a page
468 * - Called if either PG_private or PG_fscache is set on the page 468 * - Called if either PG_private or PG_fscache is set on the page
469 * - Caller holds page lock 469 * - Caller holds page lock
470 * - Return true (may release page) or false (may not) 470 * - Return true (may release page) or false (may not)
471 */ 471 */
472 static int nfs_release_page(struct page *page, gfp_t gfp) 472 static int nfs_release_page(struct page *page, gfp_t gfp)
473 { 473 {
474 struct address_space *mapping = page->mapping; 474 struct address_space *mapping = page->mapping;
475 475
476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
477 477
478 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not 478 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
479 * doing this memory reclaim for a fs-related allocation. 479 * doing this memory reclaim for a fs-related allocation.
480 */ 480 */
481 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && 481 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
482 !(current->flags & PF_FSTRANS)) { 482 !(current->flags & PF_FSTRANS)) {
483 int how = FLUSH_SYNC; 483 int how = FLUSH_SYNC;
484 484
485 /* Don't let kswapd deadlock waiting for OOM RPC calls */ 485 /* Don't let kswapd deadlock waiting for OOM RPC calls */
486 if (current_is_kswapd()) 486 if (current_is_kswapd())
487 how = 0; 487 how = 0;
488 nfs_commit_inode(mapping->host, how); 488 nfs_commit_inode(mapping->host, how);
489 } 489 }
490 /* If PagePrivate() is set, then the page is not freeable */ 490 /* If PagePrivate() is set, then the page is not freeable */
491 if (PagePrivate(page)) 491 if (PagePrivate(page))
492 return 0; 492 return 0;
493 return nfs_fscache_release_page(page, gfp); 493 return nfs_fscache_release_page(page, gfp);
494 } 494 }
495 495
496 /* 496 /*
497 * Attempt to clear the private state associated with a page when an error 497 * Attempt to clear the private state associated with a page when an error
498 * occurs that requires the cached contents of an inode to be written back or 498 * occurs that requires the cached contents of an inode to be written back or
499 * destroyed 499 * destroyed
500 * - Called if either PG_private or fscache is set on the page 500 * - Called if either PG_private or fscache is set on the page
501 * - Caller holds page lock 501 * - Caller holds page lock
502 * - Return 0 if successful, -error otherwise 502 * - Return 0 if successful, -error otherwise
503 */ 503 */
504 static int nfs_launder_page(struct page *page) 504 static int nfs_launder_page(struct page *page)
505 { 505 {
506 struct inode *inode = page_file_mapping(page)->host; 506 struct inode *inode = page_file_mapping(page)->host;
507 struct nfs_inode *nfsi = NFS_I(inode); 507 struct nfs_inode *nfsi = NFS_I(inode);
508 508
509 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", 509 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
510 inode->i_ino, (long long)page_offset(page)); 510 inode->i_ino, (long long)page_offset(page));
511 511
512 nfs_fscache_wait_on_page_write(nfsi, page); 512 nfs_fscache_wait_on_page_write(nfsi, page);
513 return nfs_wb_page(inode, page); 513 return nfs_wb_page(inode, page);
514 } 514 }
515 515
516 #ifdef CONFIG_NFS_SWAP 516 #ifdef CONFIG_NFS_SWAP
517 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, 517 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
518 sector_t *span) 518 sector_t *span)
519 { 519 {
520 *span = sis->pages; 520 *span = sis->pages;
521 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); 521 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
522 } 522 }
523 523
524 static void nfs_swap_deactivate(struct file *file) 524 static void nfs_swap_deactivate(struct file *file)
525 { 525 {
526 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); 526 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
527 } 527 }
528 #endif 528 #endif
529 529
530 const struct address_space_operations nfs_file_aops = { 530 const struct address_space_operations nfs_file_aops = {
531 .readpage = nfs_readpage, 531 .readpage = nfs_readpage,
532 .readpages = nfs_readpages, 532 .readpages = nfs_readpages,
533 .set_page_dirty = __set_page_dirty_nobuffers, 533 .set_page_dirty = __set_page_dirty_nobuffers,
534 .writepage = nfs_writepage, 534 .writepage = nfs_writepage,
535 .writepages = nfs_writepages, 535 .writepages = nfs_writepages,
536 .write_begin = nfs_write_begin, 536 .write_begin = nfs_write_begin,
537 .write_end = nfs_write_end, 537 .write_end = nfs_write_end,
538 .invalidatepage = nfs_invalidate_page, 538 .invalidatepage = nfs_invalidate_page,
539 .releasepage = nfs_release_page, 539 .releasepage = nfs_release_page,
540 .direct_IO = nfs_direct_IO, 540 .direct_IO = nfs_direct_IO,
541 .migratepage = nfs_migrate_page, 541 .migratepage = nfs_migrate_page,
542 .launder_page = nfs_launder_page, 542 .launder_page = nfs_launder_page,
543 .error_remove_page = generic_error_remove_page, 543 .error_remove_page = generic_error_remove_page,
544 #ifdef CONFIG_NFS_SWAP 544 #ifdef CONFIG_NFS_SWAP
545 .swap_activate = nfs_swap_activate, 545 .swap_activate = nfs_swap_activate,
546 .swap_deactivate = nfs_swap_deactivate, 546 .swap_deactivate = nfs_swap_deactivate,
547 #endif 547 #endif
548 }; 548 };
549 549
550 /* 550 /*
551 * Notification that a PTE pointing to an NFS page is about to be made 551 * Notification that a PTE pointing to an NFS page is about to be made
552 * writable, implying that someone is about to modify the page through a 552 * writable, implying that someone is about to modify the page through a
553 * shared-writable mapping 553 * shared-writable mapping
554 */ 554 */
555 static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 555 static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
556 { 556 {
557 struct page *page = vmf->page; 557 struct page *page = vmf->page;
558 struct file *filp = vma->vm_file; 558 struct file *filp = vma->vm_file;
559 struct dentry *dentry = filp->f_path.dentry; 559 struct dentry *dentry = filp->f_path.dentry;
560 unsigned pagelen; 560 unsigned pagelen;
561 int ret = VM_FAULT_NOPAGE; 561 int ret = VM_FAULT_NOPAGE;
562 struct address_space *mapping; 562 struct address_space *mapping;
563 563
564 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", 564 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
565 dentry->d_parent->d_name.name, dentry->d_name.name, 565 dentry->d_parent->d_name.name, dentry->d_name.name,
566 filp->f_mapping->host->i_ino, 566 filp->f_mapping->host->i_ino,
567 (long long)page_offset(page)); 567 (long long)page_offset(page));
568 568
569 /* make sure the cache has finished storing the page */ 569 /* make sure the cache has finished storing the page */
570 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); 570 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
571 571
572 lock_page(page); 572 lock_page(page);
573 mapping = page_file_mapping(page); 573 mapping = page_file_mapping(page);
574 if (mapping != dentry->d_inode->i_mapping) 574 if (mapping != dentry->d_inode->i_mapping)
575 goto out_unlock; 575 goto out_unlock;
576 576
577 wait_on_page_writeback(page); 577 wait_on_page_writeback(page);
578 578
579 pagelen = nfs_page_length(page); 579 pagelen = nfs_page_length(page);
580 if (pagelen == 0) 580 if (pagelen == 0)
581 goto out_unlock; 581 goto out_unlock;
582 582
583 ret = VM_FAULT_LOCKED; 583 ret = VM_FAULT_LOCKED;
584 if (nfs_flush_incompatible(filp, page) == 0 && 584 if (nfs_flush_incompatible(filp, page) == 0 &&
585 nfs_updatepage(filp, page, 0, pagelen) == 0) 585 nfs_updatepage(filp, page, 0, pagelen) == 0)
586 goto out; 586 goto out;
587 587
588 ret = VM_FAULT_SIGBUS; 588 ret = VM_FAULT_SIGBUS;
589 out_unlock: 589 out_unlock:
590 unlock_page(page); 590 unlock_page(page);
591 out: 591 out:
592 return ret; 592 return ret;
593 } 593 }
594 594
595 static const struct vm_operations_struct nfs_file_vm_ops = { 595 static const struct vm_operations_struct nfs_file_vm_ops = {
596 .fault = filemap_fault, 596 .fault = filemap_fault,
597 .page_mkwrite = nfs_vm_page_mkwrite, 597 .page_mkwrite = nfs_vm_page_mkwrite,
598 .remap_pages = generic_file_remap_pages, 598 .remap_pages = generic_file_remap_pages,
599 }; 599 };
600 600
601 static int nfs_need_sync_write(struct file *filp, struct inode *inode) 601 static int nfs_need_sync_write(struct file *filp, struct inode *inode)
602 { 602 {
603 struct nfs_open_context *ctx; 603 struct nfs_open_context *ctx;
604 604
605 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) 605 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
606 return 1; 606 return 1;
607 ctx = nfs_file_open_context(filp); 607 ctx = nfs_file_open_context(filp);
608 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) 608 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
609 return 1; 609 return 1;
610 return 0; 610 return 0;
611 } 611 }
612 612
613 ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, 613 ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
614 unsigned long nr_segs, loff_t pos) 614 unsigned long nr_segs, loff_t pos)
615 { 615 {
616 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 616 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
617 struct inode * inode = dentry->d_inode; 617 struct inode * inode = dentry->d_inode;
618 unsigned long written = 0; 618 unsigned long written = 0;
619 ssize_t result; 619 ssize_t result;
620 size_t count = iov_length(iov, nr_segs); 620 size_t count = iov_length(iov, nr_segs);
621 621
622 if (iocb->ki_filp->f_flags & O_DIRECT) 622 if (iocb->ki_filp->f_flags & O_DIRECT)
623 return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); 623 return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
624 624
625 dprintk("NFS: write(%s/%s, %lu@%Ld)\n", 625 dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
626 dentry->d_parent->d_name.name, dentry->d_name.name, 626 dentry->d_parent->d_name.name, dentry->d_name.name,
627 (unsigned long) count, (long long) pos); 627 (unsigned long) count, (long long) pos);
628 628
629 result = -EBUSY; 629 result = -EBUSY;
630 if (IS_SWAPFILE(inode)) 630 if (IS_SWAPFILE(inode))
631 goto out_swapfile; 631 goto out_swapfile;
632 /* 632 /*
633 * O_APPEND implies that we must revalidate the file length. 633 * O_APPEND implies that we must revalidate the file length.
634 */ 634 */
635 if (iocb->ki_filp->f_flags & O_APPEND) { 635 if (iocb->ki_filp->f_flags & O_APPEND) {
636 result = nfs_revalidate_file_size(inode, iocb->ki_filp); 636 result = nfs_revalidate_file_size(inode, iocb->ki_filp);
637 if (result) 637 if (result)
638 goto out; 638 goto out;
639 } 639 }
640 640
641 result = count; 641 result = count;
642 if (!count) 642 if (!count)
643 goto out; 643 goto out;
644 644
645 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 645 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
646 if (result > 0) 646 if (result > 0)
647 written = result; 647 written = result;
648 648
649 /* Return error values for O_DSYNC and IS_SYNC() */ 649 /* Return error values for O_DSYNC and IS_SYNC() */
650 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 650 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
651 int err = vfs_fsync(iocb->ki_filp, 0); 651 int err = vfs_fsync(iocb->ki_filp, 0);
652 if (err < 0) 652 if (err < 0)
653 result = err; 653 result = err;
654 } 654 }
655 if (result > 0) 655 if (result > 0)
656 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); 656 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
657 out: 657 out:
658 return result; 658 return result;
659 659
660 out_swapfile: 660 out_swapfile:
661 printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); 661 printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
662 goto out; 662 goto out;
663 } 663 }
664 EXPORT_SYMBOL_GPL(nfs_file_write); 664 EXPORT_SYMBOL_GPL(nfs_file_write);
665 665
666 ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, 666 ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
667 struct file *filp, loff_t *ppos, 667 struct file *filp, loff_t *ppos,
668 size_t count, unsigned int flags) 668 size_t count, unsigned int flags)
669 { 669 {
670 struct dentry *dentry = filp->f_path.dentry; 670 struct dentry *dentry = filp->f_path.dentry;
671 struct inode *inode = dentry->d_inode; 671 struct inode *inode = dentry->d_inode;
672 unsigned long written = 0; 672 unsigned long written = 0;
673 ssize_t ret; 673 ssize_t ret;
674 674
675 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", 675 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
676 dentry->d_parent->d_name.name, dentry->d_name.name, 676 dentry->d_parent->d_name.name, dentry->d_name.name,
677 (unsigned long) count, (unsigned long long) *ppos); 677 (unsigned long) count, (unsigned long long) *ppos);
678 678
679 /* 679 /*
680 * The combination of splice and an O_APPEND destination is disallowed. 680 * The combination of splice and an O_APPEND destination is disallowed.
681 */ 681 */
682 682
683 ret = generic_file_splice_write(pipe, filp, ppos, count, flags); 683 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
684 if (ret > 0) 684 if (ret > 0)
685 written = ret; 685 written = ret;
686 686
687 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 687 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
688 int err = vfs_fsync(filp, 0); 688 int err = vfs_fsync(filp, 0);
689 if (err < 0) 689 if (err < 0)
690 ret = err; 690 ret = err;
691 } 691 }
692 if (ret > 0) 692 if (ret > 0)
693 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); 693 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
694 return ret; 694 return ret;
695 } 695 }
696 EXPORT_SYMBOL_GPL(nfs_file_splice_write); 696 EXPORT_SYMBOL_GPL(nfs_file_splice_write);
697 697
698 static int 698 static int
699 do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 699 do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
700 { 700 {
701 struct inode *inode = filp->f_mapping->host; 701 struct inode *inode = filp->f_mapping->host;
702 int status = 0; 702 int status = 0;
703 unsigned int saved_type = fl->fl_type; 703 unsigned int saved_type = fl->fl_type;
704 704
705 /* Try local locking first */ 705 /* Try local locking first */
706 posix_test_lock(filp, fl); 706 posix_test_lock(filp, fl);
707 if (fl->fl_type != F_UNLCK) { 707 if (fl->fl_type != F_UNLCK) {
708 /* found a conflict */ 708 /* found a conflict */
709 goto out; 709 goto out;
710 } 710 }
711 fl->fl_type = saved_type; 711 fl->fl_type = saved_type;
712 712
713 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) 713 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
714 goto out_noconflict; 714 goto out_noconflict;
715 715
716 if (is_local) 716 if (is_local)
717 goto out_noconflict; 717 goto out_noconflict;
718 718
719 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 719 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
720 out: 720 out:
721 return status; 721 return status;
722 out_noconflict: 722 out_noconflict:
723 fl->fl_type = F_UNLCK; 723 fl->fl_type = F_UNLCK;
724 goto out; 724 goto out;
725 } 725 }
726 726
727 static int do_vfs_lock(struct file *file, struct file_lock *fl) 727 static int do_vfs_lock(struct file *file, struct file_lock *fl)
728 { 728 {
729 int res = 0; 729 int res = 0;
730 switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { 730 switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
731 case FL_POSIX: 731 case FL_POSIX:
732 res = posix_lock_file_wait(file, fl); 732 res = posix_lock_file_wait(file, fl);
733 break; 733 break;
734 case FL_FLOCK: 734 case FL_FLOCK:
735 res = flock_lock_file_wait(file, fl); 735 res = flock_lock_file_wait(file, fl);
736 break; 736 break;
737 default: 737 default:
738 BUG(); 738 BUG();
739 } 739 }
740 return res; 740 return res;
741 } 741 }
742 742
743 static int 743 static int
744 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 744 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
745 { 745 {
746 struct inode *inode = filp->f_mapping->host; 746 struct inode *inode = filp->f_mapping->host;
747 int status; 747 int status;
748 748
749 /* 749 /*
750 * Flush all pending writes before doing anything 750 * Flush all pending writes before doing anything
751 * with locks.. 751 * with locks..
752 */ 752 */
753 nfs_sync_mapping(filp->f_mapping); 753 nfs_sync_mapping(filp->f_mapping);
754 754
755 /* NOTE: special case 755 /* NOTE: special case
756 * If we're signalled while cleaning up locks on process exit, we 756 * If we're signalled while cleaning up locks on process exit, we
757 * still need to complete the unlock. 757 * still need to complete the unlock.
758 */ 758 */
759 /* 759 /*
760 * Use local locking if mounted with "-onolock" or with appropriate 760 * Use local locking if mounted with "-onolock" or with appropriate
761 * "-olocal_lock=" 761 * "-olocal_lock="
762 */ 762 */
763 if (!is_local) 763 if (!is_local)
764 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 764 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
765 else 765 else
766 status = do_vfs_lock(filp, fl); 766 status = do_vfs_lock(filp, fl);
767 return status; 767 return status;
768 } 768 }
769 769
770 static int 770 static int
771 is_time_granular(struct timespec *ts) { 771 is_time_granular(struct timespec *ts) {
772 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); 772 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
773 } 773 }
774 774
775 static int 775 static int
776 do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 776 do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
777 { 777 {
778 struct inode *inode = filp->f_mapping->host; 778 struct inode *inode = filp->f_mapping->host;
779 int status; 779 int status;
780 780
781 /* 781 /*
782 * Flush all pending writes before doing anything 782 * Flush all pending writes before doing anything
783 * with locks.. 783 * with locks..
784 */ 784 */
785 status = nfs_sync_mapping(filp->f_mapping); 785 status = nfs_sync_mapping(filp->f_mapping);
786 if (status != 0) 786 if (status != 0)
787 goto out; 787 goto out;
788 788
789 /* 789 /*
790 * Use local locking if mounted with "-onolock" or with appropriate 790 * Use local locking if mounted with "-onolock" or with appropriate
791 * "-olocal_lock=" 791 * "-olocal_lock="
792 */ 792 */
793 if (!is_local) 793 if (!is_local)
794 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 794 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
795 else 795 else
796 status = do_vfs_lock(filp, fl); 796 status = do_vfs_lock(filp, fl);
797 if (status < 0) 797 if (status < 0)
798 goto out; 798 goto out;
799 799
800 /* 800 /*
801 * Revalidate the cache if the server has time stamps granular 801 * Revalidate the cache if the server has time stamps granular
802 * enough to detect subsecond changes. Otherwise, clear the 802 * enough to detect subsecond changes. Otherwise, clear the
803 * cache to prevent missing any changes. 803 * cache to prevent missing any changes.
804 * 804 *
805 * This makes locking act as a cache coherency point. 805 * This makes locking act as a cache coherency point.
806 */ 806 */
807 nfs_sync_mapping(filp->f_mapping); 807 nfs_sync_mapping(filp->f_mapping);
808 if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { 808 if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
809 if (is_time_granular(&NFS_SERVER(inode)->time_delta)) 809 if (is_time_granular(&NFS_SERVER(inode)->time_delta))
810 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 810 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
811 else 811 else
812 nfs_zap_caches(inode); 812 nfs_zap_caches(inode);
813 } 813 }
814 out: 814 out:
815 return status; 815 return status;
816 } 816 }
817 817
818 /* 818 /*
819 * Lock a (portion of) a file 819 * Lock a (portion of) a file
820 */ 820 */
821 int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) 821 int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
822 { 822 {
823 struct inode *inode = filp->f_mapping->host; 823 struct inode *inode = filp->f_mapping->host;
824 int ret = -ENOLCK; 824 int ret = -ENOLCK;
825 int is_local = 0; 825 int is_local = 0;
826 826
827 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", 827 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
828 filp->f_path.dentry->d_parent->d_name.name, 828 filp->f_path.dentry->d_parent->d_name.name,
829 filp->f_path.dentry->d_name.name, 829 filp->f_path.dentry->d_name.name,
830 fl->fl_type, fl->fl_flags, 830 fl->fl_type, fl->fl_flags,
831 (long long)fl->fl_start, (long long)fl->fl_end); 831 (long long)fl->fl_start, (long long)fl->fl_end);
832 832
833 nfs_inc_stats(inode, NFSIOS_VFSLOCK); 833 nfs_inc_stats(inode, NFSIOS_VFSLOCK);
834 834
835 /* No mandatory locks over NFS */ 835 /* No mandatory locks over NFS */
836 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 836 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
837 goto out_err; 837 goto out_err;
838 838
839 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) 839 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
840 is_local = 1; 840 is_local = 1;
841 841
842 if (NFS_PROTO(inode)->lock_check_bounds != NULL) { 842 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
843 ret = NFS_PROTO(inode)->lock_check_bounds(fl); 843 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
844 if (ret < 0) 844 if (ret < 0)
845 goto out_err; 845 goto out_err;
846 } 846 }
847 847
848 if (IS_GETLK(cmd)) 848 if (IS_GETLK(cmd))
849 ret = do_getlk(filp, cmd, fl, is_local); 849 ret = do_getlk(filp, cmd, fl, is_local);
850 else if (fl->fl_type == F_UNLCK) 850 else if (fl->fl_type == F_UNLCK)
851 ret = do_unlk(filp, cmd, fl, is_local); 851 ret = do_unlk(filp, cmd, fl, is_local);
852 else 852 else
853 ret = do_setlk(filp, cmd, fl, is_local); 853 ret = do_setlk(filp, cmd, fl, is_local);
854 out_err: 854 out_err:
855 return ret; 855 return ret;
856 } 856 }
857 EXPORT_SYMBOL_GPL(nfs_lock); 857 EXPORT_SYMBOL_GPL(nfs_lock);
858 858
859 /* 859 /*
860 * Lock a (portion of) a file 860 * Lock a (portion of) a file
861 */ 861 */
862 int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 862 int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
863 { 863 {
864 struct inode *inode = filp->f_mapping->host; 864 struct inode *inode = filp->f_mapping->host;
865 int is_local = 0; 865 int is_local = 0;
866 866
867 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", 867 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
868 filp->f_path.dentry->d_parent->d_name.name, 868 filp->f_path.dentry->d_parent->d_name.name,
869 filp->f_path.dentry->d_name.name, 869 filp->f_path.dentry->d_name.name,
870 fl->fl_type, fl->fl_flags); 870 fl->fl_type, fl->fl_flags);
871 871
872 if (!(fl->fl_flags & FL_FLOCK)) 872 if (!(fl->fl_flags & FL_FLOCK))
873 return -ENOLCK; 873 return -ENOLCK;
874 874
875 /* 875 /*
876 * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of 876 * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
877 * any standard. In principle we might be able to support LOCK_MAND 877 * any standard. In principle we might be able to support LOCK_MAND
878 * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the 878 * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
879 * NFS code is not set up for it. 879 * NFS code is not set up for it.
880 */ 880 */
881 if (fl->fl_type & LOCK_MAND) 881 if (fl->fl_type & LOCK_MAND)
882 return -EINVAL; 882 return -EINVAL;
883 883
884 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) 884 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
885 is_local = 1; 885 is_local = 1;
886 886
887 /* We're simulating flock() locks using posix locks on the server */ 887 /* We're simulating flock() locks using posix locks on the server */
888 fl->fl_owner = (fl_owner_t)filp; 888 fl->fl_owner = (fl_owner_t)filp;
889 fl->fl_start = 0; 889 fl->fl_start = 0;
890 fl->fl_end = OFFSET_MAX; 890 fl->fl_end = OFFSET_MAX;
891 891
892 if (fl->fl_type == F_UNLCK) 892 if (fl->fl_type == F_UNLCK)
893 return do_unlk(filp, cmd, fl, is_local); 893 return do_unlk(filp, cmd, fl, is_local);
894 return do_setlk(filp, cmd, fl, is_local); 894 return do_setlk(filp, cmd, fl, is_local);
895 } 895 }
896 EXPORT_SYMBOL_GPL(nfs_flock); 896 EXPORT_SYMBOL_GPL(nfs_flock);
897 897
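Because nfs_flock() rewrites the request as fl_start = 0, fl_end = OFFSET_MAX with the struct file as owner, an flock() on NFS behaves roughly like the whole-file POSIX lock below (userspace sketch, hypothetical mount path). Note that the userspace lock API's naming agrees with this commit: the field is l_whence, not l_origin.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/nfs/example", O_RDWR);	/* hypothetical mount */
		if (fd < 0)
			return 1;

		struct flock fl = {
			.l_type		= F_WRLCK,
			.l_whence	= SEEK_SET,	/* userspace calls it whence, too */
			.l_start	= 0,
			.l_len		= 0,		/* 0 = to EOF, i.e. the whole file */
		};

		if (fcntl(fd, F_SETLKW, &fl) == 0)
			puts("whole-file lock held");
		close(fd);
		return 0;
	}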
898 /* 898 /*
899 * There is no protocol support for leases, so we have no way to implement 899 * There is no protocol support for leases, so we have no way to implement
900 * them correctly in the face of opens by other clients. 900 * them correctly in the face of opens by other clients.
901 */ 901 */
902 int nfs_setlease(struct file *file, long arg, struct file_lock **fl) 902 int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
903 { 903 {
904 dprintk("NFS: setlease(%s/%s, arg=%ld)\n", 904 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
905 file->f_path.dentry->d_parent->d_name.name, 905 file->f_path.dentry->d_parent->d_name.name,
906 file->f_path.dentry->d_name.name, arg); 906 file->f_path.dentry->d_name.name, arg);
907 return -EINVAL; 907 return -EINVAL;
908 } 908 }
909 EXPORT_SYMBOL_GPL(nfs_setlease); 909 EXPORT_SYMBOL_GPL(nfs_setlease);
910 910
911 const struct file_operations nfs_file_operations = { 911 const struct file_operations nfs_file_operations = {
912 .llseek = nfs_file_llseek, 912 .llseek = nfs_file_llseek,
913 .read = do_sync_read, 913 .read = do_sync_read,
914 .write = do_sync_write, 914 .write = do_sync_write,
915 .aio_read = nfs_file_read, 915 .aio_read = nfs_file_read,
916 .aio_write = nfs_file_write, 916 .aio_write = nfs_file_write,
917 .mmap = nfs_file_mmap, 917 .mmap = nfs_file_mmap,
918 .open = nfs_file_open, 918 .open = nfs_file_open,
919 .flush = nfs_file_flush, 919 .flush = nfs_file_flush,
920 .release = nfs_file_release, 920 .release = nfs_file_release,
921 .fsync = nfs_file_fsync, 921 .fsync = nfs_file_fsync,
922 .lock = nfs_lock, 922 .lock = nfs_lock,
923 .flock = nfs_flock, 923 .flock = nfs_flock,
924 .splice_read = nfs_file_splice_read, 924 .splice_read = nfs_file_splice_read,
925 .splice_write = nfs_file_splice_write, 925 .splice_write = nfs_file_splice_write,
926 .check_flags = nfs_check_flags, 926 .check_flags = nfs_check_flags,
927 .setlease = nfs_setlease, 927 .setlease = nfs_setlease,
928 }; 928 };
929 EXPORT_SYMBOL_GPL(nfs_file_operations); 929 EXPORT_SYMBOL_GPL(nfs_file_operations);
930 930
fs/ocfs2/extent_map.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * extent_map.c 4 * extent_map.c
5 * 5 *
6 * Block/Cluster mapping functions 6 * Block/Cluster mapping functions
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation. 12 * License, version 2, as published by the Free Software Foundation.
13 * 13 *
14 * This program is distributed in the hope that it will be useful, 14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details. 17 * General Public License for more details.
18 * 18 *
19 * You should have received a copy of the GNU General Public 19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the 20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA. 22 * Boston, MA 02111-1307, USA.
23 */ 23 */
24 24
25 #include <linux/fs.h> 25 #include <linux/fs.h>
26 #include <linux/init.h> 26 #include <linux/init.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/fiemap.h> 29 #include <linux/fiemap.h>
30 30
31 #include <cluster/masklog.h> 31 #include <cluster/masklog.h>
32 32
33 #include "ocfs2.h" 33 #include "ocfs2.h"
34 34
35 #include "alloc.h" 35 #include "alloc.h"
36 #include "dlmglue.h" 36 #include "dlmglue.h"
37 #include "extent_map.h" 37 #include "extent_map.h"
38 #include "inode.h" 38 #include "inode.h"
39 #include "super.h" 39 #include "super.h"
40 #include "symlink.h" 40 #include "symlink.h"
41 #include "ocfs2_trace.h" 41 #include "ocfs2_trace.h"
42 42
43 #include "buffer_head_io.h" 43 #include "buffer_head_io.h"
44 44
45 /* 45 /*
46 * The extent caching implementation is intentionally trivial. 46 * The extent caching implementation is intentionally trivial.
47 * 47 *
48 * We only cache a small number of extents stored directly on the 48 * We only cache a small number of extents stored directly on the
49 * inode, so linear order operations are acceptable. If we ever want 49 * inode, so linear order operations are acceptable. If we ever want
50 * to increase the size of the extent map, then these algorithms must 50 * to increase the size of the extent map, then these algorithms must
51 * get smarter. 51 * get smarter.
52 */ 52 */
53 53
54 void ocfs2_extent_map_init(struct inode *inode) 54 void ocfs2_extent_map_init(struct inode *inode)
55 { 55 {
56 struct ocfs2_inode_info *oi = OCFS2_I(inode); 56 struct ocfs2_inode_info *oi = OCFS2_I(inode);
57 57
58 oi->ip_extent_map.em_num_items = 0; 58 oi->ip_extent_map.em_num_items = 0;
59 INIT_LIST_HEAD(&oi->ip_extent_map.em_list); 59 INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
60 } 60 }
61 61
62 static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, 62 static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
63 unsigned int cpos, 63 unsigned int cpos,
64 struct ocfs2_extent_map_item **ret_emi) 64 struct ocfs2_extent_map_item **ret_emi)
65 { 65 {
66 unsigned int range; 66 unsigned int range;
67 struct ocfs2_extent_map_item *emi; 67 struct ocfs2_extent_map_item *emi;
68 68
69 *ret_emi = NULL; 69 *ret_emi = NULL;
70 70
71 list_for_each_entry(emi, &em->em_list, ei_list) { 71 list_for_each_entry(emi, &em->em_list, ei_list) {
72 range = emi->ei_cpos + emi->ei_clusters; 72 range = emi->ei_cpos + emi->ei_clusters;
73 73
74 if (cpos >= emi->ei_cpos && cpos < range) { 74 if (cpos >= emi->ei_cpos && cpos < range) {
75 list_move(&emi->ei_list, &em->em_list); 75 list_move(&emi->ei_list, &em->em_list);
76 76
77 *ret_emi = emi; 77 *ret_emi = emi;
78 break; 78 break;
79 } 79 }
80 } 80 }
81 } 81 }
82 82
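The list_move() in the lookup above is what gives the cache its most-recently-used ordering. A standalone sketch of the same discipline, using a plain array instead of the kernel's list_head (illustrative, simplified types):

#include <stddef.h>

/* Simplified stand-in for an extent map item: logical start and length. */
struct item { unsigned int cpos, clusters; };

static struct item *mru_lookup(struct item *items, int n, unsigned int cpos)
{
	int i, j;

	for (i = 0; i < n; i++) {
		if (cpos >= items[i].cpos &&
		    cpos < items[i].cpos + items[i].clusters) {
			/* Move the hit to the front, mirroring list_move():
			 * hot extents stay cheap to find and the tail becomes
			 * the natural victim when the map is full. */
			struct item hit = items[i];

			for (j = i; j > 0; j--)
				items[j] = items[j - 1];
			items[0] = hit;
			return &items[0];
		}
	}
	return NULL;	/* not cached */
}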
83 static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, 83 static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
84 unsigned int *phys, unsigned int *len, 84 unsigned int *phys, unsigned int *len,
85 unsigned int *flags) 85 unsigned int *flags)
86 { 86 {
87 unsigned int coff; 87 unsigned int coff;
88 struct ocfs2_inode_info *oi = OCFS2_I(inode); 88 struct ocfs2_inode_info *oi = OCFS2_I(inode);
89 struct ocfs2_extent_map_item *emi; 89 struct ocfs2_extent_map_item *emi;
90 90
91 spin_lock(&oi->ip_lock); 91 spin_lock(&oi->ip_lock);
92 92
93 __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); 93 __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
94 if (emi) { 94 if (emi) {
95 coff = cpos - emi->ei_cpos; 95 coff = cpos - emi->ei_cpos;
96 *phys = emi->ei_phys + coff; 96 *phys = emi->ei_phys + coff;
97 if (len) 97 if (len)
98 *len = emi->ei_clusters - coff; 98 *len = emi->ei_clusters - coff;
99 if (flags) 99 if (flags)
100 *flags = emi->ei_flags; 100 *flags = emi->ei_flags;
101 } 101 }
102 102
103 spin_unlock(&oi->ip_lock); 103 spin_unlock(&oi->ip_lock);
104 104
105 if (emi == NULL) 105 if (emi == NULL)
106 return -ENOENT; 106 return -ENOENT;
107 107
108 return 0; 108 return 0;
109 } 109 }
110 110
111 /* 111 /*
112 * Forget about all clusters equal to or greater than cpos. 112 * Forget about all clusters equal to or greater than cpos.
113 */ 113 */
114 void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) 114 void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
115 { 115 {
116 struct ocfs2_extent_map_item *emi, *n; 116 struct ocfs2_extent_map_item *emi, *n;
117 struct ocfs2_inode_info *oi = OCFS2_I(inode); 117 struct ocfs2_inode_info *oi = OCFS2_I(inode);
118 struct ocfs2_extent_map *em = &oi->ip_extent_map; 118 struct ocfs2_extent_map *em = &oi->ip_extent_map;
119 LIST_HEAD(tmp_list); 119 LIST_HEAD(tmp_list);
120 unsigned int range; 120 unsigned int range;
121 121
122 spin_lock(&oi->ip_lock); 122 spin_lock(&oi->ip_lock);
123 list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { 123 list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
124 if (emi->ei_cpos >= cpos) { 124 if (emi->ei_cpos >= cpos) {
125 /* Full truncate of this record. */ 125 /* Full truncate of this record. */
126 list_move(&emi->ei_list, &tmp_list); 126 list_move(&emi->ei_list, &tmp_list);
127 BUG_ON(em->em_num_items == 0); 127 BUG_ON(em->em_num_items == 0);
128 em->em_num_items--; 128 em->em_num_items--;
129 continue; 129 continue;
130 } 130 }
131 131
132 range = emi->ei_cpos + emi->ei_clusters; 132 range = emi->ei_cpos + emi->ei_clusters;
133 if (range > cpos) { 133 if (range > cpos) {
134 /* Partial truncate */ 134 /* Partial truncate */
135 emi->ei_clusters = cpos - emi->ei_cpos; 135 emi->ei_clusters = cpos - emi->ei_cpos;
136 } 136 }
137 } 137 }
138 spin_unlock(&oi->ip_lock); 138 spin_unlock(&oi->ip_lock);
139 139
140 list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { 140 list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
141 list_del(&emi->ei_list); 141 list_del(&emi->ei_list);
142 kfree(emi); 142 kfree(emi);
143 } 143 }
144 } 144 }
145 145
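A worked example of the two truncate cases above (illustrative numbers, not taken from the patch):

/*
 * With cached extents A = [cpos 0, 8) and B = [cpos 8, 16),
 * ocfs2_extent_map_trunc(inode, 4) moves B to tmp_list in full
 * (ei_cpos 8 >= 4) and clips A to [0, 4) in place (partial truncate:
 * ei_clusters = 4 - 0). The items on tmp_list are only kfree()d
 * after ip_lock has been dropped.
 */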
146 /* 146 /*
147 * Is any part of emi2 contained within emi1? 147 * Is any part of emi2 contained within emi1?
148 */ 148 */
149 static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, 149 static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
150 struct ocfs2_extent_map_item *emi2) 150 struct ocfs2_extent_map_item *emi2)
151 { 151 {
152 unsigned int range1, range2; 152 unsigned int range1, range2;
153 153
154 /* 154 /*
155 * Check if logical start of emi2 is inside emi1 155 * Check if logical start of emi2 is inside emi1
156 */ 156 */
157 range1 = emi1->ei_cpos + emi1->ei_clusters; 157 range1 = emi1->ei_cpos + emi1->ei_clusters;
158 if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) 158 if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
159 return 1; 159 return 1;
160 160
161 /* 161 /*
162 * Check if logical end of emi2 is inside emi1 162 * Check if logical end of emi2 is inside emi1
163 */ 163 */
164 range2 = emi2->ei_cpos + emi2->ei_clusters; 164 range2 = emi2->ei_cpos + emi2->ei_clusters;
165 if (range2 > emi1->ei_cpos && range2 <= range1) 165 if (range2 > emi1->ei_cpos && range2 <= range1)
166 return 1; 166 return 1;
167 167
168 return 0; 168 return 0;
169 } 169 }
170 170
171 static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, 171 static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
172 struct ocfs2_extent_map_item *src) 172 struct ocfs2_extent_map_item *src)
173 { 173 {
174 dest->ei_cpos = src->ei_cpos; 174 dest->ei_cpos = src->ei_cpos;
175 dest->ei_phys = src->ei_phys; 175 dest->ei_phys = src->ei_phys;
176 dest->ei_clusters = src->ei_clusters; 176 dest->ei_clusters = src->ei_clusters;
177 dest->ei_flags = src->ei_flags; 177 dest->ei_flags = src->ei_flags;
178 } 178 }
179 179
180 /* 180 /*
181 * Try to merge emi with ins. Returns 1 if merge succeeds, zero 181 * Try to merge emi with ins. Returns 1 if merge succeeds, zero
182 * otherwise. 182 * otherwise.
183 */ 183 */
184 static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, 184 static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
185 struct ocfs2_extent_map_item *ins) 185 struct ocfs2_extent_map_item *ins)
186 { 186 {
187 /* 187 /*
188 * Handle contiguousness 188 * Handle contiguousness
189 */ 189 */
190 if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && 190 if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
191 ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && 191 ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
192 ins->ei_flags == emi->ei_flags) { 192 ins->ei_flags == emi->ei_flags) {
193 emi->ei_clusters += ins->ei_clusters; 193 emi->ei_clusters += ins->ei_clusters;
194 return 1; 194 return 1;
195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos && 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
197 ins->ei_flags == emi->ei_flags) { 197 ins->ei_flags == emi->ei_flags) {
198 emi->ei_phys = ins->ei_phys; 198 emi->ei_phys = ins->ei_phys;
199 emi->ei_cpos = ins->ei_cpos; 199 emi->ei_cpos = ins->ei_cpos;
200 emi->ei_clusters += ins->ei_clusters; 200 emi->ei_clusters += ins->ei_clusters;
201 return 1; 201 return 1;
202 } 202 }
203 203
204 /* 204 /*
205 * Overlapping extents - this shouldn't happen unless we've 205 * Overlapping extents - this shouldn't happen unless we've
206 * split an extent to change its flags. That is exceedingly 206 * split an extent to change its flags. That is exceedingly
207 * rare, so there's no sense in trying to optimize it yet. 207 * rare, so there's no sense in trying to optimize it yet.
208 */ 208 */
209 if (ocfs2_ei_is_contained(emi, ins) || 209 if (ocfs2_ei_is_contained(emi, ins) ||
210 ocfs2_ei_is_contained(ins, emi)) { 210 ocfs2_ei_is_contained(ins, emi)) {
211 ocfs2_copy_emi_fields(emi, ins); 211 ocfs2_copy_emi_fields(emi, ins);
212 return 1; 212 return 1;
213 } 213 }
214 214
215 /* No merge was possible. */ 215 /* No merge was possible. */
216 return 0; 216 return 0;
217 } 217 }
218 218
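To make the contiguity rules above concrete, a minimal standalone sketch (simplified fields, not the kernel structures); the comments walk one merge in each direction:

struct ext { unsigned int cpos, phys, clusters, flags; };

/* emi = {cpos 10, phys 100, len 5} followed immediately by
 * ins = {cpos 15, phys 105, len 3} with equal flags merges to
 * {cpos 10, phys 100, len 8}; the mirror case prepends instead. */
static int try_merge(struct ext *emi, const struct ext *ins)
{
	if (ins->phys == emi->phys + emi->clusters &&
	    ins->cpos == emi->cpos + emi->clusters &&
	    ins->flags == emi->flags) {
		emi->clusters += ins->clusters;		/* append */
		return 1;
	}
	if (ins->phys + ins->clusters == emi->phys &&
	    ins->cpos + ins->clusters == emi->cpos &&
	    ins->flags == emi->flags) {
		emi->cpos = ins->cpos;			/* prepend */
		emi->phys = ins->phys;
		emi->clusters += ins->clusters;
		return 1;
	}
	return 0;
}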
219 /* 219 /*
220 * In order to reduce complexity on the caller, this insert function 220 * In order to reduce complexity on the caller, this insert function
221 * is intentionally liberal in what it will accept. 221 * is intentionally liberal in what it will accept.
222 * 222 *
223 * The only rule is that the truncate call *must* be used whenever 223 * The only rule is that the truncate call *must* be used whenever
224 * records have been deleted. This avoids inserting overlapping 224 * records have been deleted. This avoids inserting overlapping
225 * records with different physical mappings. 225 * records with different physical mappings.
226 */ 226 */
227 void ocfs2_extent_map_insert_rec(struct inode *inode, 227 void ocfs2_extent_map_insert_rec(struct inode *inode,
228 struct ocfs2_extent_rec *rec) 228 struct ocfs2_extent_rec *rec)
229 { 229 {
230 struct ocfs2_inode_info *oi = OCFS2_I(inode); 230 struct ocfs2_inode_info *oi = OCFS2_I(inode);
231 struct ocfs2_extent_map *em = &oi->ip_extent_map; 231 struct ocfs2_extent_map *em = &oi->ip_extent_map;
232 struct ocfs2_extent_map_item *emi, *new_emi = NULL; 232 struct ocfs2_extent_map_item *emi, *new_emi = NULL;
233 struct ocfs2_extent_map_item ins; 233 struct ocfs2_extent_map_item ins;
234 234
235 ins.ei_cpos = le32_to_cpu(rec->e_cpos); 235 ins.ei_cpos = le32_to_cpu(rec->e_cpos);
236 ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, 236 ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
237 le64_to_cpu(rec->e_blkno)); 237 le64_to_cpu(rec->e_blkno));
238 ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); 238 ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
239 ins.ei_flags = rec->e_flags; 239 ins.ei_flags = rec->e_flags;
240 240
241 search: 241 search:
242 spin_lock(&oi->ip_lock); 242 spin_lock(&oi->ip_lock);
243 243
244 list_for_each_entry(emi, &em->em_list, ei_list) { 244 list_for_each_entry(emi, &em->em_list, ei_list) {
245 if (ocfs2_try_to_merge_extent_map(emi, &ins)) { 245 if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
246 list_move(&emi->ei_list, &em->em_list); 246 list_move(&emi->ei_list, &em->em_list);
247 spin_unlock(&oi->ip_lock); 247 spin_unlock(&oi->ip_lock);
248 goto out; 248 goto out;
249 } 249 }
250 } 250 }
251 251
252 /* 252 /*
253 * No item could be merged. 253 * No item could be merged.
254 * 254 *
255 * Either allocate and add a new item, or overwrite the least 255 * Either allocate and add a new item, or overwrite the least
256 * recently used item. 256 * recently used item.
257 */ 257 */
258 258
259 if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { 259 if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
260 if (new_emi == NULL) { 260 if (new_emi == NULL) {
261 spin_unlock(&oi->ip_lock); 261 spin_unlock(&oi->ip_lock);
262 262
263 new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); 263 new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
264 if (new_emi == NULL) 264 if (new_emi == NULL)
265 goto out; 265 goto out;
266 266
267 goto search; 267 goto search;
268 } 268 }
269 269
270 ocfs2_copy_emi_fields(new_emi, &ins); 270 ocfs2_copy_emi_fields(new_emi, &ins);
271 list_add(&new_emi->ei_list, &em->em_list); 271 list_add(&new_emi->ei_list, &em->em_list);
272 em->em_num_items++; 272 em->em_num_items++;
273 new_emi = NULL; 273 new_emi = NULL;
274 } else { 274 } else {
275 BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); 275 BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
276 emi = list_entry(em->em_list.prev, 276 emi = list_entry(em->em_list.prev,
277 struct ocfs2_extent_map_item, ei_list); 277 struct ocfs2_extent_map_item, ei_list);
278 list_move(&emi->ei_list, &em->em_list); 278 list_move(&emi->ei_list, &em->em_list);
279 ocfs2_copy_emi_fields(emi, &ins); 279 ocfs2_copy_emi_fields(emi, &ins);
280 } 280 }
281 281
282 spin_unlock(&oi->ip_lock); 282 spin_unlock(&oi->ip_lock);
283 283
284 out: 284 out:
285 if (new_emi) 285 if (new_emi)
286 kfree(new_emi); 286 kfree(new_emi);
287 } 287 }
288 288
289 static int ocfs2_last_eb_is_empty(struct inode *inode, 289 static int ocfs2_last_eb_is_empty(struct inode *inode,
290 struct ocfs2_dinode *di) 290 struct ocfs2_dinode *di)
291 { 291 {
292 int ret, next_free; 292 int ret, next_free;
293 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk); 293 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
294 struct buffer_head *eb_bh = NULL; 294 struct buffer_head *eb_bh = NULL;
295 struct ocfs2_extent_block *eb; 295 struct ocfs2_extent_block *eb;
296 struct ocfs2_extent_list *el; 296 struct ocfs2_extent_list *el;
297 297
298 ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh); 298 ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
299 if (ret) { 299 if (ret) {
300 mlog_errno(ret); 300 mlog_errno(ret);
301 goto out; 301 goto out;
302 } 302 }
303 303
304 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 304 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
305 el = &eb->h_list; 305 el = &eb->h_list;
306 306
307 if (el->l_tree_depth) { 307 if (el->l_tree_depth) {
308 ocfs2_error(inode->i_sb, 308 ocfs2_error(inode->i_sb,
309 "Inode %lu has non zero tree depth in " 309 "Inode %lu has non zero tree depth in "
310 "leaf block %llu\n", inode->i_ino, 310 "leaf block %llu\n", inode->i_ino,
311 (unsigned long long)eb_bh->b_blocknr); 311 (unsigned long long)eb_bh->b_blocknr);
312 ret = -EROFS; 312 ret = -EROFS;
313 goto out; 313 goto out;
314 } 314 }
315 315
316 next_free = le16_to_cpu(el->l_next_free_rec); 316 next_free = le16_to_cpu(el->l_next_free_rec);
317 317
318 if (next_free == 0 || 318 if (next_free == 0 ||
319 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) 319 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
320 ret = 1; 320 ret = 1;
321 321
322 out: 322 out:
323 brelse(eb_bh); 323 brelse(eb_bh);
324 return ret; 324 return ret;
325 } 325 }
326 326
327 /* 327 /*
328 * Return the 1st index within el which contains an extent start 328 * Return the 1st index within el which contains an extent start
329 * larger than v_cluster. 329 * larger than v_cluster.
330 */ 330 */
331 static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, 331 static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
332 u32 v_cluster) 332 u32 v_cluster)
333 { 333 {
334 int i; 334 int i;
335 struct ocfs2_extent_rec *rec; 335 struct ocfs2_extent_rec *rec;
336 336
337 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 337 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
338 rec = &el->l_recs[i]; 338 rec = &el->l_recs[i];
339 339
340 if (v_cluster < le32_to_cpu(rec->e_cpos)) 340 if (v_cluster < le32_to_cpu(rec->e_cpos))
341 break; 341 break;
342 } 342 }
343 343
344 return i; 344 return i;
345 } 345 }
346 346
347 /* 347 /*
348 * Figure out the size of a hole which starts at v_cluster within the given 348 * Figure out the size of a hole which starts at v_cluster within the given
349 * extent list. 349 * extent list.
350 * 350 *
351 * If there is no more allocation past v_cluster, we return the maximum 351 * If there is no more allocation past v_cluster, we return the maximum
352 * cluster count (UINT_MAX) minus v_cluster. 352 * cluster count (UINT_MAX) minus v_cluster.
353 * 353 *
354 * If we have in-inode extents, then el points to the dinode list and 354 * If we have in-inode extents, then el points to the dinode list and
355 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block 355 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
356 * containing el. 356 * containing el.
357 */ 357 */
358 int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, 358 int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
359 struct ocfs2_extent_list *el, 359 struct ocfs2_extent_list *el,
360 struct buffer_head *eb_bh, 360 struct buffer_head *eb_bh,
361 u32 v_cluster, 361 u32 v_cluster,
362 u32 *num_clusters) 362 u32 *num_clusters)
363 { 363 {
364 int ret, i; 364 int ret, i;
365 struct buffer_head *next_eb_bh = NULL; 365 struct buffer_head *next_eb_bh = NULL;
366 struct ocfs2_extent_block *eb, *next_eb; 366 struct ocfs2_extent_block *eb, *next_eb;
367 367
368 i = ocfs2_search_for_hole_index(el, v_cluster); 368 i = ocfs2_search_for_hole_index(el, v_cluster);
369 369
370 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { 370 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
371 eb = (struct ocfs2_extent_block *)eb_bh->b_data; 371 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
372 372
373 /* 373 /*
374 * Check the next leaf for any extents. 374 * Check the next leaf for any extents.
375 */ 375 */
376 376
377 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 377 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
378 goto no_more_extents; 378 goto no_more_extents;
379 379
380 ret = ocfs2_read_extent_block(ci, 380 ret = ocfs2_read_extent_block(ci,
381 le64_to_cpu(eb->h_next_leaf_blk), 381 le64_to_cpu(eb->h_next_leaf_blk),
382 &next_eb_bh); 382 &next_eb_bh);
383 if (ret) { 383 if (ret) {
384 mlog_errno(ret); 384 mlog_errno(ret);
385 goto out; 385 goto out;
386 } 386 }
387 387
388 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; 388 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
389 el = &next_eb->h_list; 389 el = &next_eb->h_list;
390 i = ocfs2_search_for_hole_index(el, v_cluster); 390 i = ocfs2_search_for_hole_index(el, v_cluster);
391 } 391 }
392 392
393 no_more_extents: 393 no_more_extents:
394 if (i == le16_to_cpu(el->l_next_free_rec)) { 394 if (i == le16_to_cpu(el->l_next_free_rec)) {
395 /* 395 /*
396 * We're at the end of our existing allocation. Just 396 * We're at the end of our existing allocation. Just
397 * return the maximum number of clusters we could 397 * return the maximum number of clusters we could
398 * possibly allocate. 398 * possibly allocate.
399 */ 399 */
400 *num_clusters = UINT_MAX - v_cluster; 400 *num_clusters = UINT_MAX - v_cluster;
401 } else { 401 } else {
402 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; 402 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
403 } 403 }
404 404
405 ret = 0; 405 ret = 0;
406 out: 406 out:
407 brelse(next_eb_bh); 407 brelse(next_eb_bh);
408 return ret; 408 return ret;
409 } 409 }
410 410
411 static int ocfs2_get_clusters_nocache(struct inode *inode, 411 static int ocfs2_get_clusters_nocache(struct inode *inode,
412 struct buffer_head *di_bh, 412 struct buffer_head *di_bh,
413 u32 v_cluster, unsigned int *hole_len, 413 u32 v_cluster, unsigned int *hole_len,
414 struct ocfs2_extent_rec *ret_rec, 414 struct ocfs2_extent_rec *ret_rec,
415 unsigned int *is_last) 415 unsigned int *is_last)
416 { 416 {
417 int i, ret, tree_height, len; 417 int i, ret, tree_height, len;
418 struct ocfs2_dinode *di; 418 struct ocfs2_dinode *di;
419 struct ocfs2_extent_block *uninitialized_var(eb); 419 struct ocfs2_extent_block *uninitialized_var(eb);
420 struct ocfs2_extent_list *el; 420 struct ocfs2_extent_list *el;
421 struct ocfs2_extent_rec *rec; 421 struct ocfs2_extent_rec *rec;
422 struct buffer_head *eb_bh = NULL; 422 struct buffer_head *eb_bh = NULL;
423 423
424 memset(ret_rec, 0, sizeof(*ret_rec)); 424 memset(ret_rec, 0, sizeof(*ret_rec));
425 if (is_last) 425 if (is_last)
426 *is_last = 0; 426 *is_last = 0;
427 427
428 di = (struct ocfs2_dinode *) di_bh->b_data; 428 di = (struct ocfs2_dinode *) di_bh->b_data;
429 el = &di->id2.i_list; 429 el = &di->id2.i_list;
430 tree_height = le16_to_cpu(el->l_tree_depth); 430 tree_height = le16_to_cpu(el->l_tree_depth);
431 431
432 if (tree_height > 0) { 432 if (tree_height > 0) {
433 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, 433 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
434 &eb_bh); 434 &eb_bh);
435 if (ret) { 435 if (ret) {
436 mlog_errno(ret); 436 mlog_errno(ret);
437 goto out; 437 goto out;
438 } 438 }
439 439
440 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 440 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
441 el = &eb->h_list; 441 el = &eb->h_list;
442 442
443 if (el->l_tree_depth) { 443 if (el->l_tree_depth) {
444 ocfs2_error(inode->i_sb, 444 ocfs2_error(inode->i_sb,
445 "Inode %lu has non zero tree depth in " 445 "Inode %lu has non zero tree depth in "
446 "leaf block %llu\n", inode->i_ino, 446 "leaf block %llu\n", inode->i_ino,
447 (unsigned long long)eb_bh->b_blocknr); 447 (unsigned long long)eb_bh->b_blocknr);
448 ret = -EROFS; 448 ret = -EROFS;
449 goto out; 449 goto out;
450 } 450 }
451 } 451 }
452 452
453 i = ocfs2_search_extent_list(el, v_cluster); 453 i = ocfs2_search_extent_list(el, v_cluster);
454 if (i == -1) { 454 if (i == -1) {
455 /* 455 /*
456 * Holes can be larger than the maximum size of an 456 * Holes can be larger than the maximum size of an
457 * extent, so we return their lengths in a separate 457 * extent, so we return their lengths in a separate
458 * field. 458 * field.
459 */ 459 */
460 if (hole_len) { 460 if (hole_len) {
461 ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), 461 ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
462 el, eb_bh, 462 el, eb_bh,
463 v_cluster, &len); 463 v_cluster, &len);
464 if (ret) { 464 if (ret) {
465 mlog_errno(ret); 465 mlog_errno(ret);
466 goto out; 466 goto out;
467 } 467 }
468 468
469 *hole_len = len; 469 *hole_len = len;
470 } 470 }
471 goto out_hole; 471 goto out_hole;
472 } 472 }
473 473
474 rec = &el->l_recs[i]; 474 rec = &el->l_recs[i];
475 475
476 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 476 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
477 477
478 if (!rec->e_blkno) { 478 if (!rec->e_blkno) {
479 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 479 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
480 "record (%u, %u, 0)", inode->i_ino, 480 "record (%u, %u, 0)", inode->i_ino,
481 le32_to_cpu(rec->e_cpos), 481 le32_to_cpu(rec->e_cpos),
482 ocfs2_rec_clusters(el, rec)); 482 ocfs2_rec_clusters(el, rec));
483 ret = -EROFS; 483 ret = -EROFS;
484 goto out; 484 goto out;
485 } 485 }
486 486
487 *ret_rec = *rec; 487 *ret_rec = *rec;
488 488
489 /* 489 /*
490 * Checking for last extent is potentially expensive - we 490 * Checking for last extent is potentially expensive - we
491 * might have to look at the next leaf over to see if it's 491 * might have to look at the next leaf over to see if it's
492 * empty. 492 * empty.
493 * 493 *
494 * The first two checks are to see whether the caller even 494 * The first two checks are to see whether the caller even
495 * cares for this information, and if the extent is at least 495 * cares for this information, and if the extent is at least
496 * the last in its list. 496 * the last in its list.
497 * 497 *
498 * If those hold true, then the extent is last if any of the 498 * If those hold true, then the extent is last if any of the
499 * additional conditions hold true: 499 * additional conditions hold true:
500 * - Extent list is in-inode 500 * - Extent list is in-inode
501 * - Extent list is right-most 501 * - Extent list is right-most
502 * - Extent list is 2nd to rightmost, with empty right-most 502 * - Extent list is 2nd to rightmost, with empty right-most
503 */ 503 */
504 if (is_last) { 504 if (is_last) {
505 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) { 505 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
506 if (tree_height == 0) 506 if (tree_height == 0)
507 *is_last = 1; 507 *is_last = 1;
508 else if (eb->h_blkno == di->i_last_eb_blk) 508 else if (eb->h_blkno == di->i_last_eb_blk)
509 *is_last = 1; 509 *is_last = 1;
510 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) { 510 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
511 ret = ocfs2_last_eb_is_empty(inode, di); 511 ret = ocfs2_last_eb_is_empty(inode, di);
512 if (ret < 0) { 512 if (ret < 0) {
513 mlog_errno(ret); 513 mlog_errno(ret);
514 goto out; 514 goto out;
515 } 515 }
516 if (ret == 1) 516 if (ret == 1)
517 *is_last = 1; 517 *is_last = 1;
518 } 518 }
519 } 519 }
520 } 520 }
521 521
522 out_hole: 522 out_hole:
523 ret = 0; 523 ret = 0;
524 out: 524 out:
525 brelse(eb_bh); 525 brelse(eb_bh);
526 return ret; 526 return ret;
527 } 527 }
528 528
529 static void ocfs2_relative_extent_offsets(struct super_block *sb, 529 static void ocfs2_relative_extent_offsets(struct super_block *sb,
530 u32 v_cluster, 530 u32 v_cluster,
531 struct ocfs2_extent_rec *rec, 531 struct ocfs2_extent_rec *rec,
532 u32 *p_cluster, u32 *num_clusters) 532 u32 *p_cluster, u32 *num_clusters)
533 533
534 { 534 {
535 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos); 535 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
536 536
537 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno)); 537 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
538 *p_cluster = *p_cluster + coff; 538 *p_cluster = *p_cluster + coff;
539 539
540 if (num_clusters) 540 if (num_clusters)
541 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; 541 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
542 } 542 }
543 543
544 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 544 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
545 u32 *p_cluster, u32 *num_clusters, 545 u32 *p_cluster, u32 *num_clusters,
546 struct ocfs2_extent_list *el, 546 struct ocfs2_extent_list *el,
547 unsigned int *extent_flags) 547 unsigned int *extent_flags)
548 { 548 {
549 int ret = 0, i; 549 int ret = 0, i;
550 struct buffer_head *eb_bh = NULL; 550 struct buffer_head *eb_bh = NULL;
551 struct ocfs2_extent_block *eb; 551 struct ocfs2_extent_block *eb;
552 struct ocfs2_extent_rec *rec; 552 struct ocfs2_extent_rec *rec;
553 u32 coff; 553 u32 coff;
554 554
555 if (el->l_tree_depth) { 555 if (el->l_tree_depth) {
556 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, 556 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
557 &eb_bh); 557 &eb_bh);
558 if (ret) { 558 if (ret) {
559 mlog_errno(ret); 559 mlog_errno(ret);
560 goto out; 560 goto out;
561 } 561 }
562 562
563 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 563 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
564 el = &eb->h_list; 564 el = &eb->h_list;
565 565
566 if (el->l_tree_depth) { 566 if (el->l_tree_depth) {
567 ocfs2_error(inode->i_sb, 567 ocfs2_error(inode->i_sb,
568 "Inode %lu has non zero tree depth in " 568 "Inode %lu has non zero tree depth in "
569 "xattr leaf block %llu\n", inode->i_ino, 569 "xattr leaf block %llu\n", inode->i_ino,
570 (unsigned long long)eb_bh->b_blocknr); 570 (unsigned long long)eb_bh->b_blocknr);
571 ret = -EROFS; 571 ret = -EROFS;
572 goto out; 572 goto out;
573 } 573 }
574 } 574 }
575 575
576 i = ocfs2_search_extent_list(el, v_cluster); 576 i = ocfs2_search_extent_list(el, v_cluster);
577 if (i == -1) { 577 if (i == -1) {
578 ret = -EROFS; 578 ret = -EROFS;
579 mlog_errno(ret); 579 mlog_errno(ret);
580 goto out; 580 goto out;
581 } else { 581 } else {
582 rec = &el->l_recs[i]; 582 rec = &el->l_recs[i];
583 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 583 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
584 584
585 if (!rec->e_blkno) { 585 if (!rec->e_blkno) {
586 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 586 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
587 "record (%u, %u, 0) in xattr", inode->i_ino, 587 "record (%u, %u, 0) in xattr", inode->i_ino,
588 le32_to_cpu(rec->e_cpos), 588 le32_to_cpu(rec->e_cpos),
589 ocfs2_rec_clusters(el, rec)); 589 ocfs2_rec_clusters(el, rec));
590 ret = -EROFS; 590 ret = -EROFS;
591 goto out; 591 goto out;
592 } 592 }
593 coff = v_cluster - le32_to_cpu(rec->e_cpos); 593 coff = v_cluster - le32_to_cpu(rec->e_cpos);
594 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, 594 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
595 le64_to_cpu(rec->e_blkno)); 595 le64_to_cpu(rec->e_blkno));
596 *p_cluster = *p_cluster + coff; 596 *p_cluster = *p_cluster + coff;
597 if (num_clusters) 597 if (num_clusters)
598 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 598 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
599 599
600 if (extent_flags) 600 if (extent_flags)
601 *extent_flags = rec->e_flags; 601 *extent_flags = rec->e_flags;
602 } 602 }
603 out: 603 out:
604 if (eb_bh) 604 if (eb_bh)
605 brelse(eb_bh); 605 brelse(eb_bh);
606 return ret; 606 return ret;
607 } 607 }
608 608
609 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 609 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
610 u32 *p_cluster, u32 *num_clusters, 610 u32 *p_cluster, u32 *num_clusters,
611 unsigned int *extent_flags) 611 unsigned int *extent_flags)
612 { 612 {
613 int ret; 613 int ret;
614 unsigned int uninitialized_var(hole_len), flags = 0; 614 unsigned int uninitialized_var(hole_len), flags = 0;
615 struct buffer_head *di_bh = NULL; 615 struct buffer_head *di_bh = NULL;
616 struct ocfs2_extent_rec rec; 616 struct ocfs2_extent_rec rec;
617 617
618 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 618 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
619 ret = -ERANGE; 619 ret = -ERANGE;
620 mlog_errno(ret); 620 mlog_errno(ret);
621 goto out; 621 goto out;
622 } 622 }
623 623
624 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, 624 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
625 num_clusters, extent_flags); 625 num_clusters, extent_flags);
626 if (ret == 0) 626 if (ret == 0)
627 goto out; 627 goto out;
628 628
629 ret = ocfs2_read_inode_block(inode, &di_bh); 629 ret = ocfs2_read_inode_block(inode, &di_bh);
630 if (ret) { 630 if (ret) {
631 mlog_errno(ret); 631 mlog_errno(ret);
632 goto out; 632 goto out;
633 } 633 }
634 634
635 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len, 635 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
636 &rec, NULL); 636 &rec, NULL);
637 if (ret) { 637 if (ret) {
638 mlog_errno(ret); 638 mlog_errno(ret);
639 goto out; 639 goto out;
640 } 640 }
641 641
642 if (rec.e_blkno == 0ULL) { 642 if (rec.e_blkno == 0ULL) {
643 /* 643 /*
644 * A hole was found. Return some canned values that 644 * A hole was found. Return some canned values that
645 * callers can key on. If asked for, num_clusters will 645 * callers can key on. If asked for, num_clusters will
646 * be populated with the size of the hole. 646 * be populated with the size of the hole.
647 */ 647 */
648 *p_cluster = 0; 648 *p_cluster = 0;
649 if (num_clusters) { 649 if (num_clusters) {
650 *num_clusters = hole_len; 650 *num_clusters = hole_len;
651 } 651 }
652 } else { 652 } else {
653 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec, 653 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
654 p_cluster, num_clusters); 654 p_cluster, num_clusters);
655 flags = rec.e_flags; 655 flags = rec.e_flags;
656 656
657 ocfs2_extent_map_insert_rec(inode, &rec); 657 ocfs2_extent_map_insert_rec(inode, &rec);
658 } 658 }
659 659
660 if (extent_flags) 660 if (extent_flags)
661 *extent_flags = flags; 661 *extent_flags = flags;
662 662
663 out: 663 out:
664 brelse(di_bh); 664 brelse(di_bh);
665 return ret; 665 return ret;
666 } 666 }
667 667
668 /* 668 /*
669 * This expects alloc_sem to be held. The allocation cannot change at 669 * This expects alloc_sem to be held. The allocation cannot change at
670 * all while the map is in the process of being updated. 670 * all while the map is in the process of being updated.
671 */ 671 */
672 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 672 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
673 u64 *ret_count, unsigned int *extent_flags) 673 u64 *ret_count, unsigned int *extent_flags)
674 { 674 {
675 int ret; 675 int ret;
676 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 676 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
677 u32 cpos, num_clusters, p_cluster; 677 u32 cpos, num_clusters, p_cluster;
678 u64 boff = 0; 678 u64 boff = 0;
679 679
680 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); 680 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
681 681
682 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, 682 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
683 extent_flags); 683 extent_flags);
684 if (ret) { 684 if (ret) {
685 mlog_errno(ret); 685 mlog_errno(ret);
686 goto out; 686 goto out;
687 } 687 }
688 688
689 /* 689 /*
690 * p_cluster == 0 indicates a hole. 690 * p_cluster == 0 indicates a hole.
691 */ 691 */
692 if (p_cluster) { 692 if (p_cluster) {
693 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 693 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
694 boff += (v_blkno & (u64)(bpc - 1)); 694 boff += (v_blkno & (u64)(bpc - 1));
695 } 695 }
696 696
697 *p_blkno = boff; 697 *p_blkno = boff;
698 698
699 if (ret_count) { 699 if (ret_count) {
700 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); 700 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
701 *ret_count -= v_blkno & (u64)(bpc - 1); 701 *ret_count -= v_blkno & (u64)(bpc - 1);
702 } 702 }
703 703
704 out: 704 out:
705 return ret; 705 return ret;
706 } 706 }
707 707
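The block/cluster arithmetic above is easiest to follow with numbers; an illustrative example assuming 4 KB blocks and 32 KB clusters (values not taken from the patch):

/*
 * With 4 KB blocks and 32 KB clusters, bpc = 8. For v_blkno = 21:
 *   cpos              = 21 / 8       = 2  (ocfs2_blocks_to_clusters)
 *   offset in cluster = 21 & (8 - 1) = 5
 * If virtual cluster 2 maps to physical cluster 100, then
 *   boff      = 100 * 8 + 5 = 805
 * and with num_clusters = 1 the contiguous run is
 *   ret_count = 1 * 8 - 5   = 3 blocks.
 */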
708 /* 708 /*
709 * The name ocfs2_fiemap_inline() may be a little misleading: it 709 * The name ocfs2_fiemap_inline() may be a little misleading: it
710 * handles fiemap not only for inline-data files but also for 710 * handles fiemap not only for inline-data files but also for
711 * fast symlinks, because the two are identical as far as extent 711 * fast symlinks, because the two are identical as far as extent
712 * mapping is concerned. 712 * mapping is concerned.
713 */ 713 */
714 static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, 714 static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
715 struct fiemap_extent_info *fieinfo, 715 struct fiemap_extent_info *fieinfo,
716 u64 map_start) 716 u64 map_start)
717 { 717 {
718 int ret; 718 int ret;
719 unsigned int id_count; 719 unsigned int id_count;
720 struct ocfs2_dinode *di; 720 struct ocfs2_dinode *di;
721 u64 phys; 721 u64 phys;
722 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; 722 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
723 struct ocfs2_inode_info *oi = OCFS2_I(inode); 723 struct ocfs2_inode_info *oi = OCFS2_I(inode);
724 724
725 di = (struct ocfs2_dinode *)di_bh->b_data; 725 di = (struct ocfs2_dinode *)di_bh->b_data;
726 if (ocfs2_inode_is_fast_symlink(inode)) 726 if (ocfs2_inode_is_fast_symlink(inode))
727 id_count = ocfs2_fast_symlink_chars(inode->i_sb); 727 id_count = ocfs2_fast_symlink_chars(inode->i_sb);
728 else 728 else
729 id_count = le16_to_cpu(di->id2.i_data.id_count); 729 id_count = le16_to_cpu(di->id2.i_data.id_count);
730 730
731 if (map_start < id_count) { 731 if (map_start < id_count) {
732 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; 732 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
733 if (ocfs2_inode_is_fast_symlink(inode)) 733 if (ocfs2_inode_is_fast_symlink(inode))
734 phys += offsetof(struct ocfs2_dinode, id2.i_symlink); 734 phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
735 else 735 else
736 phys += offsetof(struct ocfs2_dinode, 736 phys += offsetof(struct ocfs2_dinode,
737 id2.i_data.id_data); 737 id2.i_data.id_data);
738 738
739 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, 739 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
740 flags); 740 flags);
741 if (ret < 0) 741 if (ret < 0)
742 return ret; 742 return ret;
743 } 743 }
744 744
745 return 0; 745 return 0;
746 } 746 }
747 747
748 #define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 748 #define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
749 749
750 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 750 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
751 u64 map_start, u64 map_len) 751 u64 map_start, u64 map_len)
752 { 752 {
753 int ret, is_last; 753 int ret, is_last;
754 u32 mapping_end, cpos; 754 u32 mapping_end, cpos;
755 unsigned int hole_size; 755 unsigned int hole_size;
756 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 756 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
757 u64 len_bytes, phys_bytes, virt_bytes; 757 u64 len_bytes, phys_bytes, virt_bytes;
758 struct buffer_head *di_bh = NULL; 758 struct buffer_head *di_bh = NULL;
759 struct ocfs2_extent_rec rec; 759 struct ocfs2_extent_rec rec;
760 760
761 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); 761 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
762 if (ret) 762 if (ret)
763 return ret; 763 return ret;
764 764
765 ret = ocfs2_inode_lock(inode, &di_bh, 0); 765 ret = ocfs2_inode_lock(inode, &di_bh, 0);
766 if (ret) { 766 if (ret) {
767 mlog_errno(ret); 767 mlog_errno(ret);
768 goto out; 768 goto out;
769 } 769 }
770 770
771 down_read(&OCFS2_I(inode)->ip_alloc_sem); 771 down_read(&OCFS2_I(inode)->ip_alloc_sem);
772 772
773 /* 773 /*
774 * Handle inline-data and fast symlink separately. 774 * Handle inline-data and fast symlink separately.
775 */ 775 */
776 if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || 776 if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
777 ocfs2_inode_is_fast_symlink(inode)) { 777 ocfs2_inode_is_fast_symlink(inode)) {
778 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); 778 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
779 goto out_unlock; 779 goto out_unlock;
780 } 780 }
781 781
782 cpos = map_start >> osb->s_clustersize_bits; 782 cpos = map_start >> osb->s_clustersize_bits;
783 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, 783 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
784 map_start + map_len); 784 map_start + map_len);
785 mapping_end -= cpos; 785 mapping_end -= cpos;
786 is_last = 0; 786 is_last = 0;
787 while (cpos < mapping_end && !is_last) { 787 while (cpos < mapping_end && !is_last) {
788 u32 fe_flags; 788 u32 fe_flags;
789 789
790 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, 790 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
791 &hole_size, &rec, &is_last); 791 &hole_size, &rec, &is_last);
792 if (ret) { 792 if (ret) {
793 mlog_errno(ret); 793 mlog_errno(ret);
794 goto out; 794 goto out;
795 } 795 }
796 796
797 if (rec.e_blkno == 0ULL) { 797 if (rec.e_blkno == 0ULL) {
798 cpos += hole_size; 798 cpos += hole_size;
799 continue; 799 continue;
800 } 800 }
801 801
802 fe_flags = 0; 802 fe_flags = 0;
803 if (rec.e_flags & OCFS2_EXT_UNWRITTEN) 803 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
804 fe_flags |= FIEMAP_EXTENT_UNWRITTEN; 804 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
805 if (rec.e_flags & OCFS2_EXT_REFCOUNTED) 805 if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
806 fe_flags |= FIEMAP_EXTENT_SHARED; 806 fe_flags |= FIEMAP_EXTENT_SHARED;
807 if (is_last) 807 if (is_last)
808 fe_flags |= FIEMAP_EXTENT_LAST; 808 fe_flags |= FIEMAP_EXTENT_LAST;
809 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; 809 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
810 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; 810 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
811 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; 811 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
812 812
813 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, 813 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
814 len_bytes, fe_flags); 814 len_bytes, fe_flags);
815 if (ret) 815 if (ret)
816 break; 816 break;
817 817
818 cpos = le32_to_cpu(rec.e_cpos) + le16_to_cpu(rec.e_leaf_clusters); 818 cpos = le32_to_cpu(rec.e_cpos) + le16_to_cpu(rec.e_leaf_clusters);
819 } 819 }
820 820
821 if (ret > 0) 821 if (ret > 0)
822 ret = 0; 822 ret = 0;
823 823
824 out_unlock: 824 out_unlock:
825 brelse(di_bh); 825 brelse(di_bh);
826 826
827 up_read(&OCFS2_I(inode)->ip_alloc_sem); 827 up_read(&OCFS2_I(inode)->ip_alloc_sem);
828 828
829 ocfs2_inode_unlock(inode, 0); 829 ocfs2_inode_unlock(inode, 0);
830 out: 830 out:
831 831
832 return ret; 832 return ret;
833 } 833 }
834 834
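For reference, the userspace side that drives ocfs2_fiemap() — a hedged sketch using the generic FS_IOC_FIEMAP interface, with minimal error handling:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>

/* Ask the kernel for up to 16 extents covering the whole file. Each
 * returned fiemap_extent carries the fe_flags (UNWRITTEN, SHARED,
 * LAST, ...) that the loop above sets via fiemap_fill_next_extent(). */
int dump_extents(int fd)
{
	struct fiemap *fm;
	int ret;

	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	if (!fm)
		return -1;

	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* the only flag ocfs2 accepts */
	fm->fm_extent_count = 16;

	ret = ioctl(fd, FS_IOC_FIEMAP, fm);
	free(fm);
	return ret;
}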
835 int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) 835 int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
836 { 836 {
837 struct inode *inode = file->f_mapping->host; 837 struct inode *inode = file->f_mapping->host;
838 int ret; 838 int ret;
839 unsigned int is_last = 0, is_data = 0; 839 unsigned int is_last = 0, is_data = 0;
840 u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; 840 u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
841 u32 cpos, cend, clen, hole_size; 841 u32 cpos, cend, clen, hole_size;
842 u64 extoff, extlen; 842 u64 extoff, extlen;
843 struct buffer_head *di_bh = NULL; 843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec; 844 struct ocfs2_extent_rec rec;
845 845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); 846 BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
847 847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0); 848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) { 849 if (ret) {
850 mlog_errno(ret); 850 mlog_errno(ret);
851 goto out; 851 goto out;
852 } 852 }
853 853
854 down_read(&OCFS2_I(inode)->ip_alloc_sem); 854 down_read(&OCFS2_I(inode)->ip_alloc_sem);
855 855
856 if (*offset >= inode->i_size) { 856 if (*offset >= inode->i_size) {
857 ret = -ENXIO; 857 ret = -ENXIO;
858 goto out_unlock; 858 goto out_unlock;
859 } 859 }
860 860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE) 862 if (whence == SEEK_HOLE)
863 *offset = inode->i_size; 863 *offset = inode->i_size;
864 goto out_unlock; 864 goto out_unlock;
865 } 865 }
866 866
867 clen = 0; 867 clen = 0;
868 cpos = *offset >> cs_bits; 868 cpos = *offset >> cs_bits;
869 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); 869 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
870 870
871 while (cpos < cend && !is_last) { 871 while (cpos < cend && !is_last) {
872 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, 872 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
873 &rec, &is_last); 873 &rec, &is_last);
874 if (ret) { 874 if (ret) {
875 mlog_errno(ret); 875 mlog_errno(ret);
876 goto out_unlock; 876 goto out_unlock;
877 } 877 }
878 878
879 extoff = cpos; 879 extoff = cpos;
880 extoff <<= cs_bits; 880 extoff <<= cs_bits;
881 881
882 if (rec.e_blkno == 0ULL) { 882 if (rec.e_blkno == 0ULL) {
883 clen = hole_size; 883 clen = hole_size;
884 is_data = 0; 884 is_data = 0;
885 } else { 885 } else {
886 clen = le16_to_cpu(rec.e_leaf_clusters) - 886 clen = le16_to_cpu(rec.e_leaf_clusters) -
887 (cpos - le32_to_cpu(rec.e_cpos)); 887 (cpos - le32_to_cpu(rec.e_cpos));
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; 888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 } 889 }
890 890
891 if ((!is_data && origin == SEEK_HOLE) || 891 if ((!is_data && whence == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) { 892 (is_data && whence == SEEK_DATA)) {
893 if (extoff > *offset) 893 if (extoff > *offset)
894 *offset = extoff; 894 *offset = extoff;
895 goto out_unlock; 895 goto out_unlock;
896 } 896 }
897 897
898 if (!is_last) 898 if (!is_last)
899 cpos += clen; 899 cpos += clen;
900 } 900 }
901 901
902 if (origin == SEEK_HOLE) { 902 if (whence == SEEK_HOLE) {
903 extoff = cpos; 903 extoff = cpos;
904 extoff <<= cs_bits; 904 extoff <<= cs_bits;
905 extlen = clen; 905 extlen = clen;
906 extlen <<= cs_bits; 906 extlen <<= cs_bits;
907 907
908 if ((extoff + extlen) > inode->i_size) 908 if ((extoff + extlen) > inode->i_size)
909 extlen = inode->i_size - extoff; 909 extlen = inode->i_size - extoff;
910 extoff += extlen; 910 extoff += extlen;
911 if (extoff > *offset) 911 if (extoff > *offset)
912 *offset = extoff; 912 *offset = extoff;
913 goto out_unlock; 913 goto out_unlock;
914 } 914 }
915 915
916 ret = -ENXIO; 916 ret = -ENXIO;
917 917
918 out_unlock: 918 out_unlock:
919 919
920 brelse(di_bh); 920 brelse(di_bh);
921 921
922 up_read(&OCFS2_I(inode)->ip_alloc_sem); 922 up_read(&OCFS2_I(inode)->ip_alloc_sem);
923 923
924 ocfs2_inode_unlock(inode, 0); 924 ocfs2_inode_unlock(inode, 0);
925 out: 925 out:
926 return ret; 926 return ret;
927 } 927 }
928 928
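The semantics implemented above are visible from userspace through lseek(2); a small illustrative example (assumes a kernel and filesystem with SEEK_HOLE/SEEK_DATA support):

#define _GNU_SOURCE	/* SEEK_HOLE / SEEK_DATA with glibc */
#include <unistd.h>

/* SEEK_HOLE: offset of the first hole at or after off (at latest
 * i_size, per the tail of the function above). SEEK_DATA: offset of
 * the first data region, or -1 with errno ENXIO past end of file. */
off_t next_hole(int fd, off_t off)
{
	return lseek(fd, off, SEEK_HOLE);
}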
929 int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 929 int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
930 struct buffer_head *bhs[], int flags, 930 struct buffer_head *bhs[], int flags,
931 int (*validate)(struct super_block *sb, 931 int (*validate)(struct super_block *sb,
932 struct buffer_head *bh)) 932 struct buffer_head *bh))
933 { 933 {
934 int rc = 0; 934 int rc = 0;
935 u64 p_block, p_count; 935 u64 p_block, p_count;
936 int i, count, done = 0; 936 int i, count, done = 0;
937 937
938 trace_ocfs2_read_virt_blocks( 938 trace_ocfs2_read_virt_blocks(
939 inode, (unsigned long long)v_block, nr, bhs, flags, 939 inode, (unsigned long long)v_block, nr, bhs, flags,
940 validate); 940 validate);
941 941
942 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= 942 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
943 i_size_read(inode)) { 943 i_size_read(inode)) {
944 BUG_ON(!(flags & OCFS2_BH_READAHEAD)); 944 BUG_ON(!(flags & OCFS2_BH_READAHEAD));
945 goto out; 945 goto out;
946 } 946 }
947 947
948 while (done < nr) { 948 while (done < nr) {
949 down_read(&OCFS2_I(inode)->ip_alloc_sem); 949 down_read(&OCFS2_I(inode)->ip_alloc_sem);
950 rc = ocfs2_extent_map_get_blocks(inode, v_block + done, 950 rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
951 &p_block, &p_count, NULL); 951 &p_block, &p_count, NULL);
952 up_read(&OCFS2_I(inode)->ip_alloc_sem); 952 up_read(&OCFS2_I(inode)->ip_alloc_sem);
953 if (rc) { 953 if (rc) {
954 mlog_errno(rc); 954 mlog_errno(rc);
955 break; 955 break;
956 } 956 }
957 957
958 if (!p_block) { 958 if (!p_block) {
959 rc = -EIO; 959 rc = -EIO;
960 mlog(ML_ERROR, 960 mlog(ML_ERROR,
961 "Inode #%llu contains a hole at offset %llu\n", 961 "Inode #%llu contains a hole at offset %llu\n",
962 (unsigned long long)OCFS2_I(inode)->ip_blkno, 962 (unsigned long long)OCFS2_I(inode)->ip_blkno,
963 (unsigned long long)(v_block + done) << 963 (unsigned long long)(v_block + done) <<
964 inode->i_sb->s_blocksize_bits); 964 inode->i_sb->s_blocksize_bits);
965 break; 965 break;
966 } 966 }
967 967
968 count = nr - done; 968 count = nr - done;
969 if (p_count < count) 969 if (p_count < count)
970 count = p_count; 970 count = p_count;
971 971
972 /* 972 /*
973 * If the caller passed us bhs, they should have come 973 * If the caller passed us bhs, they should have come
974 * from a previous readahead call to this function. Thus, 974 * from a previous readahead call to this function. Thus,
975 * they should have the right b_blocknr. 975 * they should have the right b_blocknr.
976 */ 976 */
977 for (i = 0; i < count; i++) { 977 for (i = 0; i < count; i++) {
978 if (!bhs[done + i]) 978 if (!bhs[done + i])
979 continue; 979 continue;
980 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); 980 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
981 } 981 }
982 982
983 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count, 983 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
984 bhs + done, flags, validate); 984 bhs + done, flags, validate);
985 if (rc) { 985 if (rc) {
986 mlog_errno(rc); 986 mlog_errno(rc);
987 break; 987 break;
988 } 988 }
989 done += count; 989 done += count;
990 } 990 }
991 991
992 out: 992 out:
993 return rc; 993 return rc;
994 } 994 }
995 995
996 996
997 997
fs/ocfs2/file.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA. 23 * Boston, MA 02111-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 #include <linux/falloc.h> 37 #include <linux/falloc.h>
38 #include <linux/quotaops.h> 38 #include <linux/quotaops.h>
39 #include <linux/blkdev.h> 39 #include <linux/blkdev.h>
40 40
41 #include <cluster/masklog.h> 41 #include <cluster/masklog.h>
42 42
43 #include "ocfs2.h" 43 #include "ocfs2.h"
44 44
45 #include "alloc.h" 45 #include "alloc.h"
46 #include "aops.h" 46 #include "aops.h"
47 #include "dir.h" 47 #include "dir.h"
48 #include "dlmglue.h" 48 #include "dlmglue.h"
49 #include "extent_map.h" 49 #include "extent_map.h"
50 #include "file.h" 50 #include "file.h"
51 #include "sysfile.h" 51 #include "sysfile.h"
52 #include "inode.h" 52 #include "inode.h"
53 #include "ioctl.h" 53 #include "ioctl.h"
54 #include "journal.h" 54 #include "journal.h"
55 #include "locks.h" 55 #include "locks.h"
56 #include "mmap.h" 56 #include "mmap.h"
57 #include "suballoc.h" 57 #include "suballoc.h"
58 #include "super.h" 58 #include "super.h"
59 #include "xattr.h" 59 #include "xattr.h"
60 #include "acl.h" 60 #include "acl.h"
61 #include "quota.h" 61 #include "quota.h"
62 #include "refcounttree.h" 62 #include "refcounttree.h"
63 #include "ocfs2_trace.h" 63 #include "ocfs2_trace.h"
64 64
65 #include "buffer_head_io.h" 65 #include "buffer_head_io.h"
66 66
67 static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68 { 68 {
69 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
70 70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); 71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp) 72 if (!fp)
73 return -ENOMEM; 73 return -ENOMEM;
74 74
75 fp->fp_file = file; 75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex); 76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp); 77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp; 78 file->private_data = fp;
79 79
80 return 0; 80 return 0;
81 } 81 }
82 82
83 static void ocfs2_free_file_private(struct inode *inode, struct file *file) 83 static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84 { 84 {
85 struct ocfs2_file_private *fp = file->private_data; 85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87 87
88 if (fp) { 88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock); 89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock); 90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp); 91 kfree(fp);
92 file->private_data = NULL; 92 file->private_data = NULL;
93 } 93 }
94 } 94 }
95 95
96 static int ocfs2_file_open(struct inode *inode, struct file *file) 96 static int ocfs2_file_open(struct inode *inode, struct file *file)
97 { 97 {
98 int status; 98 int status;
99 int mode = file->f_flags; 99 int mode = file->f_flags;
100 struct ocfs2_inode_info *oi = OCFS2_I(inode); 100 struct ocfs2_inode_info *oi = OCFS2_I(inode);
101 101
102 trace_ocfs2_file_open(inode, file, file->f_path.dentry, 102 trace_ocfs2_file_open(inode, file, file->f_path.dentry,
103 (unsigned long long)OCFS2_I(inode)->ip_blkno, 103 (unsigned long long)OCFS2_I(inode)->ip_blkno,
104 file->f_path.dentry->d_name.len, 104 file->f_path.dentry->d_name.len,
105 file->f_path.dentry->d_name.name, mode); 105 file->f_path.dentry->d_name.name, mode);
106 106
107 if (file->f_mode & FMODE_WRITE) 107 if (file->f_mode & FMODE_WRITE)
108 dquot_initialize(inode); 108 dquot_initialize(inode);
109 109
110 spin_lock(&oi->ip_lock); 110 spin_lock(&oi->ip_lock);
111 111
112 /* Check that the inode hasn't been wiped from disk by another 112 /* Check that the inode hasn't been wiped from disk by another
113 * node. If it hasn't then we're safe as long as we hold the 113 * node. If it hasn't then we're safe as long as we hold the
114 * spin lock until our increment of open count. */ 114 * spin lock until our increment of open count. */
115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
116 spin_unlock(&oi->ip_lock); 116 spin_unlock(&oi->ip_lock);
117 117
118 status = -ENOENT; 118 status = -ENOENT;
119 goto leave; 119 goto leave;
120 } 120 }
121 121
122 if (mode & O_DIRECT) 122 if (mode & O_DIRECT)
123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
124 124
125 oi->ip_open_count++; 125 oi->ip_open_count++;
126 spin_unlock(&oi->ip_lock); 126 spin_unlock(&oi->ip_lock);
127 127
128 status = ocfs2_init_file_private(inode, file); 128 status = ocfs2_init_file_private(inode, file);
129 if (status) { 129 if (status) {
130 /* 130 /*
131 * We want to set open count back if we're failing the 131 * We want to set open count back if we're failing the
132 * open. 132 * open.
133 */ 133 */
134 spin_lock(&oi->ip_lock); 134 spin_lock(&oi->ip_lock);
135 oi->ip_open_count--; 135 oi->ip_open_count--;
136 spin_unlock(&oi->ip_lock); 136 spin_unlock(&oi->ip_lock);
137 } 137 }
138 138
139 leave: 139 leave:
140 return status; 140 return status;
141 } 141 }
142 142
143 static int ocfs2_file_release(struct inode *inode, struct file *file) 143 static int ocfs2_file_release(struct inode *inode, struct file *file)
144 { 144 {
145 struct ocfs2_inode_info *oi = OCFS2_I(inode); 145 struct ocfs2_inode_info *oi = OCFS2_I(inode);
146 146
147 spin_lock(&oi->ip_lock); 147 spin_lock(&oi->ip_lock);
148 if (!--oi->ip_open_count) 148 if (!--oi->ip_open_count)
149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
150 150
151 trace_ocfs2_file_release(inode, file, file->f_path.dentry, 151 trace_ocfs2_file_release(inode, file, file->f_path.dentry,
152 oi->ip_blkno, 152 oi->ip_blkno,
153 file->f_path.dentry->d_name.len, 153 file->f_path.dentry->d_name.len,
154 file->f_path.dentry->d_name.name, 154 file->f_path.dentry->d_name.name,
155 oi->ip_open_count); 155 oi->ip_open_count);
156 spin_unlock(&oi->ip_lock); 156 spin_unlock(&oi->ip_lock);
157 157
158 ocfs2_free_file_private(inode, file); 158 ocfs2_free_file_private(inode, file);
159 159
160 return 0; 160 return 0;
161 } 161 }
162 162
163 static int ocfs2_dir_open(struct inode *inode, struct file *file) 163 static int ocfs2_dir_open(struct inode *inode, struct file *file)
164 { 164 {
165 return ocfs2_init_file_private(inode, file); 165 return ocfs2_init_file_private(inode, file);
166 } 166 }
167 167
168 static int ocfs2_dir_release(struct inode *inode, struct file *file) 168 static int ocfs2_dir_release(struct inode *inode, struct file *file)
169 { 169 {
170 ocfs2_free_file_private(inode, file); 170 ocfs2_free_file_private(inode, file);
171 return 0; 171 return 0;
172 } 172 }
173 173
174 static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, 174 static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
175 int datasync) 175 int datasync)
176 { 176 {
177 int err = 0; 177 int err = 0;
178 journal_t *journal; 178 journal_t *journal;
179 struct inode *inode = file->f_mapping->host; 179 struct inode *inode = file->f_mapping->host;
180 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 180 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
181 181
182 trace_ocfs2_sync_file(inode, file, file->f_path.dentry, 182 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
183 OCFS2_I(inode)->ip_blkno, 183 OCFS2_I(inode)->ip_blkno,
184 file->f_path.dentry->d_name.len, 184 file->f_path.dentry->d_name.len,
185 file->f_path.dentry->d_name.name, 185 file->f_path.dentry->d_name.name,
186 (unsigned long long)datasync); 186 (unsigned long long)datasync);
187 187
188 err = filemap_write_and_wait_range(inode->i_mapping, start, end); 188 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
189 if (err) 189 if (err)
190 return err; 190 return err;
191 191
192 /* 192 /*
193 * Probably don't need the i_mutex at all in here, just putting it here 193 * Probably don't need the i_mutex at all in here, just putting it here
194 * to be consistent with how fsync used to be called; someone more 194 * to be consistent with how fsync used to be called; someone more
195 * familiar with the fs could possibly remove it. 195 * familiar with the fs could possibly remove it.
196 */ 196 */
197 mutex_lock(&inode->i_mutex); 197 mutex_lock(&inode->i_mutex);
198 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 198 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
199 /* 199 /*
200 * We still have to flush the drive's caches to get data to the 200 * We still have to flush the drive's caches to get data to the
201 * platter 201 * platter
202 */ 202 */
203 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 203 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
204 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 204 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
205 goto bail; 205 goto bail;
206 } 206 }
207 207
208 journal = osb->journal->j_journal; 208 journal = osb->journal->j_journal;
209 err = jbd2_journal_force_commit(journal); 209 err = jbd2_journal_force_commit(journal);
210 210
211 bail: 211 bail:
212 if (err) 212 if (err)
213 mlog_errno(err); 213 mlog_errno(err);
214 mutex_unlock(&inode->i_mutex); 214 mutex_unlock(&inode->i_mutex);
215 215
216 return (err < 0) ? -EIO : 0; 216 return (err < 0) ? -EIO : 0;
217 } 217 }
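/*
 * Decision summary for ocfs2_sync_file() above: after the pagecache
 * writeback, a datasync on an inode with no dirty non-timestamp state
 * (I_DIRTY_DATASYNC clear) skips the journal commit entirely and, on
 * barrier mounts, only issues a device cache flush; every other case
 * forces a jbd2 commit of the whole journal.
 */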
218 218
219 int ocfs2_should_update_atime(struct inode *inode, 219 int ocfs2_should_update_atime(struct inode *inode,
220 struct vfsmount *vfsmnt) 220 struct vfsmount *vfsmnt)
221 { 221 {
222 struct timespec now; 222 struct timespec now;
223 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 223 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
224 224
225 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 225 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
226 return 0; 226 return 0;
227 227
228 if ((inode->i_flags & S_NOATIME) || 228 if ((inode->i_flags & S_NOATIME) ||
229 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 229 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
230 return 0; 230 return 0;
231 231
232 /* 232 /*
233 * We can be called with no vfsmnt structure - NFSD will 233 * We can be called with no vfsmnt structure - NFSD will
234 * sometimes do this. 234 * sometimes do this.
235 * 235 *
236 * Note that our action here is different from touch_atime() - 236 * Note that our action here is different from touch_atime() -
237 * if we can't tell whether this is a noatime mount, then we 237 * if we can't tell whether this is a noatime mount, then we
238 * don't know whether to trust the value of s_atime_quantum. 238 * don't know whether to trust the value of s_atime_quantum.
239 */ 239 */
240 if (vfsmnt == NULL) 240 if (vfsmnt == NULL)
241 return 0; 241 return 0;
242 242
243 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 243 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
244 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 244 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
245 return 0; 245 return 0;
246 246
247 if (vfsmnt->mnt_flags & MNT_RELATIME) { 247 if (vfsmnt->mnt_flags & MNT_RELATIME) {
248 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 248 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
249 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 249 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
250 return 1; 250 return 1;
251 251
252 return 0; 252 return 0;
253 } 253 }
254 254
255 now = CURRENT_TIME; 255 now = CURRENT_TIME;
256 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 256 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
257 return 0; 257 return 0;
258 else 258 else
259 return 1; 259 return 1;
260 } 260 }
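/*
 * Worked example for the checks above (illustrative numbers only):
 * on a relatime mount, a read updates atime whenever atime <= mtime
 * or atime <= ctime; on a plain mount with s_atime_quantum = 60, a
 * read 30s after the last atime update returns 0 (skip), while a
 * read 61s after it returns 1 (update).
 */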
261 261
262 int ocfs2_update_inode_atime(struct inode *inode, 262 int ocfs2_update_inode_atime(struct inode *inode,
263 struct buffer_head *bh) 263 struct buffer_head *bh)
264 { 264 {
265 int ret; 265 int ret;
266 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 266 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
267 handle_t *handle; 267 handle_t *handle;
268 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 268 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
269 269
270 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 270 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
271 if (IS_ERR(handle)) { 271 if (IS_ERR(handle)) {
272 ret = PTR_ERR(handle); 272 ret = PTR_ERR(handle);
273 mlog_errno(ret); 273 mlog_errno(ret);
274 goto out; 274 goto out;
275 } 275 }
276 276
277 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 277 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
278 OCFS2_JOURNAL_ACCESS_WRITE); 278 OCFS2_JOURNAL_ACCESS_WRITE);
279 if (ret) { 279 if (ret) {
280 mlog_errno(ret); 280 mlog_errno(ret);
281 goto out_commit; 281 goto out_commit;
282 } 282 }
283 283
284 /* 284 /*
285 * Don't use ocfs2_mark_inode_dirty() here as we don't always 285 * Don't use ocfs2_mark_inode_dirty() here as we don't always
286 * have i_mutex to guard against concurrent changes to other 286 * have i_mutex to guard against concurrent changes to other
287 * inode fields. 287 * inode fields.
288 */ 288 */
289 inode->i_atime = CURRENT_TIME; 289 inode->i_atime = CURRENT_TIME;
290 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 290 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
291 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 291 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
292 ocfs2_journal_dirty(handle, bh); 292 ocfs2_journal_dirty(handle, bh);
293 293
294 out_commit: 294 out_commit:
295 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 295 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
296 out: 296 out:
297 return ret; 297 return ret;
298 } 298 }
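/*
 * A minimal sketch of how the two atime helpers compose in a read
 * path; locking and buffer_head acquisition are elided, and di_bh
 * stands in for whatever dinode buffer the caller already holds:
 *
 *	if (ocfs2_should_update_atime(inode, file->f_path.mnt))
 *		ocfs2_update_inode_atime(inode, di_bh);
 */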
299 299
300 static int ocfs2_set_inode_size(handle_t *handle, 300 static int ocfs2_set_inode_size(handle_t *handle,
301 struct inode *inode, 301 struct inode *inode,
302 struct buffer_head *fe_bh, 302 struct buffer_head *fe_bh,
303 u64 new_i_size) 303 u64 new_i_size)
304 { 304 {
305 int status; 305 int status;
306 306
307 i_size_write(inode, new_i_size); 307 i_size_write(inode, new_i_size);
308 inode->i_blocks = ocfs2_inode_sector_count(inode); 308 inode->i_blocks = ocfs2_inode_sector_count(inode);
309 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 309 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
310 310
311 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 311 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
312 if (status < 0) { 312 if (status < 0) {
313 mlog_errno(status); 313 mlog_errno(status);
314 goto bail; 314 goto bail;
315 } 315 }
316 316
317 bail: 317 bail:
318 return status; 318 return status;
319 } 319 }
320 320
321 int ocfs2_simple_size_update(struct inode *inode, 321 int ocfs2_simple_size_update(struct inode *inode,
322 struct buffer_head *di_bh, 322 struct buffer_head *di_bh,
323 u64 new_i_size) 323 u64 new_i_size)
324 { 324 {
325 int ret; 325 int ret;
326 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 326 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
327 handle_t *handle = NULL; 327 handle_t *handle = NULL;
328 328
329 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 329 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
330 if (IS_ERR(handle)) { 330 if (IS_ERR(handle)) {
331 ret = PTR_ERR(handle); 331 ret = PTR_ERR(handle);
332 mlog_errno(ret); 332 mlog_errno(ret);
333 goto out; 333 goto out;
334 } 334 }
335 335
336 ret = ocfs2_set_inode_size(handle, inode, di_bh, 336 ret = ocfs2_set_inode_size(handle, inode, di_bh,
337 new_i_size); 337 new_i_size);
338 if (ret < 0) 338 if (ret < 0)
339 mlog_errno(ret); 339 mlog_errno(ret);
340 340
341 ocfs2_commit_trans(osb, handle); 341 ocfs2_commit_trans(osb, handle);
342 out: 342 out:
343 return ret; 343 return ret;
344 } 344 }
345 345
346 static int ocfs2_cow_file_pos(struct inode *inode, 346 static int ocfs2_cow_file_pos(struct inode *inode,
347 struct buffer_head *fe_bh, 347 struct buffer_head *fe_bh,
348 u64 offset) 348 u64 offset)
349 { 349 {
350 int status; 350 int status;
351 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 351 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
352 unsigned int num_clusters = 0; 352 unsigned int num_clusters = 0;
353 unsigned int ext_flags = 0; 353 unsigned int ext_flags = 0;
354 354
355 /* 355 /*
356 * If the new offset is aligned to a cluster boundary, there is 356 * If the new offset is aligned to a cluster boundary, there is
357 * no space for ocfs2_zero_range_for_truncate to fill, so no need to 357 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
358 * CoW either. 358 * CoW either.
359 */ 359 */
360 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) 360 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
361 return 0; 361 return 0;
362 362
363 status = ocfs2_get_clusters(inode, cpos, &phys, 363 status = ocfs2_get_clusters(inode, cpos, &phys,
364 &num_clusters, &ext_flags); 364 &num_clusters, &ext_flags);
365 if (status) { 365 if (status) {
366 mlog_errno(status); 366 mlog_errno(status);
367 goto out; 367 goto out;
368 } 368 }
369 369
370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
371 goto out; 371 goto out;
372 372
373 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); 373 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
374 374
375 out: 375 out:
376 return status; 376 return status;
377 } 377 }
378 378
379 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 379 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
380 struct inode *inode, 380 struct inode *inode,
381 struct buffer_head *fe_bh, 381 struct buffer_head *fe_bh,
382 u64 new_i_size) 382 u64 new_i_size)
383 { 383 {
384 int status; 384 int status;
385 handle_t *handle; 385 handle_t *handle;
386 struct ocfs2_dinode *di; 386 struct ocfs2_dinode *di;
387 u64 cluster_bytes; 387 u64 cluster_bytes;
388 388
389 /* 389 /*
390 * We need to CoW the cluster that contains the offset if it is reflinked 390 * We need to CoW the cluster that contains the offset if it is reflinked
391 * since we will call ocfs2_zero_range_for_truncate later, which will 391 * since we will call ocfs2_zero_range_for_truncate later, which will
392 * write zeroes from the offset to the end of the cluster. 392 * write zeroes from the offset to the end of the cluster.
393 */ 393 */
394 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); 394 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
395 if (status) { 395 if (status) {
396 mlog_errno(status); 396 mlog_errno(status);
397 return status; 397 return status;
398 } 398 }
399 399
400 /* TODO: This needs to actually orphan the inode in this 400 /* TODO: This needs to actually orphan the inode in this
401 * transaction. */ 401 * transaction. */
402 402
403 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 403 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
404 if (IS_ERR(handle)) { 404 if (IS_ERR(handle)) {
405 status = PTR_ERR(handle); 405 status = PTR_ERR(handle);
406 mlog_errno(status); 406 mlog_errno(status);
407 goto out; 407 goto out;
408 } 408 }
409 409
410 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 410 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
411 OCFS2_JOURNAL_ACCESS_WRITE); 411 OCFS2_JOURNAL_ACCESS_WRITE);
412 if (status < 0) { 412 if (status < 0) {
413 mlog_errno(status); 413 mlog_errno(status);
414 goto out_commit; 414 goto out_commit;
415 } 415 }
416 416
417 /* 417 /*
418 * Do this before setting i_size. 418 * Do this before setting i_size.
419 */ 419 */
420 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 420 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
421 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 421 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
422 cluster_bytes); 422 cluster_bytes);
423 if (status) { 423 if (status) {
424 mlog_errno(status); 424 mlog_errno(status);
425 goto out_commit; 425 goto out_commit;
426 } 426 }
427 427
428 i_size_write(inode, new_i_size); 428 i_size_write(inode, new_i_size);
429 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 429 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
430 430
431 di = (struct ocfs2_dinode *) fe_bh->b_data; 431 di = (struct ocfs2_dinode *) fe_bh->b_data;
432 di->i_size = cpu_to_le64(new_i_size); 432 di->i_size = cpu_to_le64(new_i_size);
433 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 433 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
434 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 434 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
435 435
436 ocfs2_journal_dirty(handle, fe_bh); 436 ocfs2_journal_dirty(handle, fe_bh);
437 437
438 out_commit: 438 out_commit:
439 ocfs2_commit_trans(osb, handle); 439 ocfs2_commit_trans(osb, handle);
440 out: 440 out:
441 return status; 441 return status;
442 } 442 }
443 443
444 static int ocfs2_truncate_file(struct inode *inode, 444 static int ocfs2_truncate_file(struct inode *inode,
445 struct buffer_head *di_bh, 445 struct buffer_head *di_bh,
446 u64 new_i_size) 446 u64 new_i_size)
447 { 447 {
448 int status = 0; 448 int status = 0;
449 struct ocfs2_dinode *fe = NULL; 449 struct ocfs2_dinode *fe = NULL;
450 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 450 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
451 451
452 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 452 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
453 * already validated it */ 453 * already validated it */
454 fe = (struct ocfs2_dinode *) di_bh->b_data; 454 fe = (struct ocfs2_dinode *) di_bh->b_data;
455 455
456 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno, 456 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
457 (unsigned long long)le64_to_cpu(fe->i_size), 457 (unsigned long long)le64_to_cpu(fe->i_size),
458 (unsigned long long)new_i_size); 458 (unsigned long long)new_i_size);
459 459
460 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 460 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
461 "Inode %llu, inode i_size = %lld != di " 461 "Inode %llu, inode i_size = %lld != di "
462 "i_size = %llu, i_flags = 0x%x\n", 462 "i_size = %llu, i_flags = 0x%x\n",
463 (unsigned long long)OCFS2_I(inode)->ip_blkno, 463 (unsigned long long)OCFS2_I(inode)->ip_blkno,
464 i_size_read(inode), 464 i_size_read(inode),
465 (unsigned long long)le64_to_cpu(fe->i_size), 465 (unsigned long long)le64_to_cpu(fe->i_size),
466 le32_to_cpu(fe->i_flags)); 466 le32_to_cpu(fe->i_flags));
467 467
468 if (new_i_size > le64_to_cpu(fe->i_size)) { 468 if (new_i_size > le64_to_cpu(fe->i_size)) {
469 trace_ocfs2_truncate_file_error( 469 trace_ocfs2_truncate_file_error(
470 (unsigned long long)le64_to_cpu(fe->i_size), 470 (unsigned long long)le64_to_cpu(fe->i_size),
471 (unsigned long long)new_i_size); 471 (unsigned long long)new_i_size);
472 status = -EINVAL; 472 status = -EINVAL;
473 mlog_errno(status); 473 mlog_errno(status);
474 goto bail; 474 goto bail;
475 } 475 }
476 476
477 /* let's handle the simple truncate cases before doing any more 477 /* let's handle the simple truncate cases before doing any more
478 * cluster locking. */ 478 * cluster locking. */
479 if (new_i_size == le64_to_cpu(fe->i_size)) 479 if (new_i_size == le64_to_cpu(fe->i_size))
480 goto bail; 480 goto bail;
481 481
482 down_write(&OCFS2_I(inode)->ip_alloc_sem); 482 down_write(&OCFS2_I(inode)->ip_alloc_sem);
483 483
484 ocfs2_resv_discard(&osb->osb_la_resmap, 484 ocfs2_resv_discard(&osb->osb_la_resmap,
485 &OCFS2_I(inode)->ip_la_data_resv); 485 &OCFS2_I(inode)->ip_la_data_resv);
486 486
487 /* 487 /*
488 * The inode lock forced other nodes to sync and drop their 488 * The inode lock forced other nodes to sync and drop their
489 * pages, which (correctly) happens even if we have a truncate 489 * pages, which (correctly) happens even if we have a truncate
490 * without allocation change - ocfs2 cluster sizes can be much 490 * without allocation change - ocfs2 cluster sizes can be much
491 * greater than page size, so we have to truncate them 491 * greater than page size, so we have to truncate them
492 * anyway. 492 * anyway.
493 */ 493 */
494 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 494 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
495 truncate_inode_pages(inode->i_mapping, new_i_size); 495 truncate_inode_pages(inode->i_mapping, new_i_size);
496 496
497 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 497 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
498 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 498 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
499 i_size_read(inode), 1); 499 i_size_read(inode), 1);
500 if (status) 500 if (status)
501 mlog_errno(status); 501 mlog_errno(status);
502 502
503 goto bail_unlock_sem; 503 goto bail_unlock_sem;
504 } 504 }
505 505
506 /* Alright, we're going to need to do a full-blown alloc size 506 /* Alright, we're going to need to do a full-blown alloc size
507 * change. Orphan the inode so that recovery can complete the 507 * change. Orphan the inode so that recovery can complete the
508 * truncate if necessary. This also takes care of updating 508 * truncate if necessary. This also takes care of updating
509 * i_size. */ 509 * i_size. */
510 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 510 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
511 if (status < 0) { 511 if (status < 0) {
512 mlog_errno(status); 512 mlog_errno(status);
513 goto bail_unlock_sem; 513 goto bail_unlock_sem;
514 } 514 }
515 515
516 status = ocfs2_commit_truncate(osb, inode, di_bh); 516 status = ocfs2_commit_truncate(osb, inode, di_bh);
517 if (status < 0) { 517 if (status < 0) {
518 mlog_errno(status); 518 mlog_errno(status);
519 goto bail_unlock_sem; 519 goto bail_unlock_sem;
520 } 520 }
521 521
522 /* TODO: orphan dir cleanup here. */ 522 /* TODO: orphan dir cleanup here. */
523 bail_unlock_sem: 523 bail_unlock_sem:
524 up_write(&OCFS2_I(inode)->ip_alloc_sem); 524 up_write(&OCFS2_I(inode)->ip_alloc_sem);
525 525
526 bail: 526 bail:
527 if (!status && OCFS2_I(inode)->ip_clusters == 0) 527 if (!status && OCFS2_I(inode)->ip_clusters == 0)
528 status = ocfs2_try_remove_refcount_tree(inode, di_bh); 528 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
529 529
530 return status; 530 return status;
531 } 531 }
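/*
 * For reference, the order of operations in ocfs2_truncate_file()
 * above: unmap and drop pagecache past the new size, then either
 * shrink the inline-data region directly or, for extent-based files,
 * zero the tail and mark i_size via ocfs2_orphan_for_truncate()
 * before ocfs2_commit_truncate() removes the actual extents.
 */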
532 532
533 /* 533 /*
534 * Extend file allocation only here. 534 * Extend file allocation only here.
535 * We'll update all the disk stuff, and oip->alloc_size. 535 * We'll update all the disk stuff, and oip->alloc_size.
536 * 536 *
537 * Expect stuff to be locked, a transaction started and enough data / 537 * Expect stuff to be locked, a transaction started and enough data /
538 * metadata reservations in the contexts. 538 * metadata reservations in the contexts.
539 * 539 *
540 * Will return -EAGAIN, and a reason if a restart is needed. 540 * Will return -EAGAIN, and a reason if a restart is needed.
541 * If passed in, *reason_ret will always be set, even on error. 541 * If passed in, *reason_ret will always be set, even on error.
542 */ 542 */
543 int ocfs2_add_inode_data(struct ocfs2_super *osb, 543 int ocfs2_add_inode_data(struct ocfs2_super *osb,
544 struct inode *inode, 544 struct inode *inode,
545 u32 *logical_offset, 545 u32 *logical_offset,
546 u32 clusters_to_add, 546 u32 clusters_to_add,
547 int mark_unwritten, 547 int mark_unwritten,
548 struct buffer_head *fe_bh, 548 struct buffer_head *fe_bh,
549 handle_t *handle, 549 handle_t *handle,
550 struct ocfs2_alloc_context *data_ac, 550 struct ocfs2_alloc_context *data_ac,
551 struct ocfs2_alloc_context *meta_ac, 551 struct ocfs2_alloc_context *meta_ac,
552 enum ocfs2_alloc_restarted *reason_ret) 552 enum ocfs2_alloc_restarted *reason_ret)
553 { 553 {
554 int ret; 554 int ret;
555 struct ocfs2_extent_tree et; 555 struct ocfs2_extent_tree et;
556 556
557 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); 557 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
558 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, 558 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
559 clusters_to_add, mark_unwritten, 559 clusters_to_add, mark_unwritten,
560 data_ac, meta_ac, reason_ret); 560 data_ac, meta_ac, reason_ret);
561 561
562 return ret; 562 return ret;
563 } 563 }
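/*
 * The restart contract documented above is easiest to see as a caller
 * loop. This is a sketch only - the canonical version, with credit
 * extension and quota handling, is __ocfs2_extend_allocation() below:
 *
 *	enum ocfs2_alloc_restarted why = RESTART_NONE;
 *	int ret;
 *
 *	ret = ocfs2_add_inode_data(osb, inode, &logical_start,
 *				   clusters_to_add, 0, bh, handle,
 *				   data_ac, meta_ac, &why);
 *	if (ret == -EAGAIN && why == RESTART_TRANS)
 *		... extend the transaction, then retry the call ...
 */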
564 564
565 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, 565 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
566 u32 clusters_to_add, int mark_unwritten) 566 u32 clusters_to_add, int mark_unwritten)
567 { 567 {
568 int status = 0; 568 int status = 0;
569 int restart_func = 0; 569 int restart_func = 0;
570 int credits; 570 int credits;
571 u32 prev_clusters; 571 u32 prev_clusters;
572 struct buffer_head *bh = NULL; 572 struct buffer_head *bh = NULL;
573 struct ocfs2_dinode *fe = NULL; 573 struct ocfs2_dinode *fe = NULL;
574 handle_t *handle = NULL; 574 handle_t *handle = NULL;
575 struct ocfs2_alloc_context *data_ac = NULL; 575 struct ocfs2_alloc_context *data_ac = NULL;
576 struct ocfs2_alloc_context *meta_ac = NULL; 576 struct ocfs2_alloc_context *meta_ac = NULL;
577 enum ocfs2_alloc_restarted why; 577 enum ocfs2_alloc_restarted why;
578 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 578 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
579 struct ocfs2_extent_tree et; 579 struct ocfs2_extent_tree et;
580 int did_quota = 0; 580 int did_quota = 0;
581 581
582 /* 582 /*
583 * Unwritten extents only exist on file systems which 583 * Unwritten extents only exist on file systems which
584 * support holes. 584 * support holes.
585 */ 585 */
586 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 586 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
587 587
588 status = ocfs2_read_inode_block(inode, &bh); 588 status = ocfs2_read_inode_block(inode, &bh);
589 if (status < 0) { 589 if (status < 0) {
590 mlog_errno(status); 590 mlog_errno(status);
591 goto leave; 591 goto leave;
592 } 592 }
593 fe = (struct ocfs2_dinode *) bh->b_data; 593 fe = (struct ocfs2_dinode *) bh->b_data;
594 594
595 restart_all: 595 restart_all:
596 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 596 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
597 597
598 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); 598 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
599 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 599 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
600 &data_ac, &meta_ac); 600 &data_ac, &meta_ac);
601 if (status) { 601 if (status) {
602 mlog_errno(status); 602 mlog_errno(status);
603 goto leave; 603 goto leave;
604 } 604 }
605 605
606 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, 606 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
607 clusters_to_add); 607 clusters_to_add);
608 handle = ocfs2_start_trans(osb, credits); 608 handle = ocfs2_start_trans(osb, credits);
609 if (IS_ERR(handle)) { 609 if (IS_ERR(handle)) {
610 status = PTR_ERR(handle); 610 status = PTR_ERR(handle);
611 handle = NULL; 611 handle = NULL;
612 mlog_errno(status); 612 mlog_errno(status);
613 goto leave; 613 goto leave;
614 } 614 }
615 615
616 restarted_transaction: 616 restarted_transaction:
617 trace_ocfs2_extend_allocation( 617 trace_ocfs2_extend_allocation(
618 (unsigned long long)OCFS2_I(inode)->ip_blkno, 618 (unsigned long long)OCFS2_I(inode)->ip_blkno,
619 (unsigned long long)i_size_read(inode), 619 (unsigned long long)i_size_read(inode),
620 le32_to_cpu(fe->i_clusters), clusters_to_add, 620 le32_to_cpu(fe->i_clusters), clusters_to_add,
621 why, restart_func); 621 why, restart_func);
622 622
623 status = dquot_alloc_space_nodirty(inode, 623 status = dquot_alloc_space_nodirty(inode,
624 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 624 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
625 if (status) 625 if (status)
626 goto leave; 626 goto leave;
627 did_quota = 1; 627 did_quota = 1;
628 628
629 /* reserve a write to the file entry early on - so that if we 629 /* reserve a write to the file entry early on - so that if we
630 * run out of credits in the allocation path, we can still 630 * run out of credits in the allocation path, we can still
631 * update i_size. */ 631 * update i_size. */
632 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 632 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
633 OCFS2_JOURNAL_ACCESS_WRITE); 633 OCFS2_JOURNAL_ACCESS_WRITE);
634 if (status < 0) { 634 if (status < 0) {
635 mlog_errno(status); 635 mlog_errno(status);
636 goto leave; 636 goto leave;
637 } 637 }
638 638
639 prev_clusters = OCFS2_I(inode)->ip_clusters; 639 prev_clusters = OCFS2_I(inode)->ip_clusters;
640 640
641 status = ocfs2_add_inode_data(osb, 641 status = ocfs2_add_inode_data(osb,
642 inode, 642 inode,
643 &logical_start, 643 &logical_start,
644 clusters_to_add, 644 clusters_to_add,
645 mark_unwritten, 645 mark_unwritten,
646 bh, 646 bh,
647 handle, 647 handle,
648 data_ac, 648 data_ac,
649 meta_ac, 649 meta_ac,
650 &why); 650 &why);
651 if ((status < 0) && (status != -EAGAIN)) { 651 if ((status < 0) && (status != -EAGAIN)) {
652 if (status != -ENOSPC) 652 if (status != -ENOSPC)
653 mlog_errno(status); 653 mlog_errno(status);
654 goto leave; 654 goto leave;
655 } 655 }
656 656
657 ocfs2_journal_dirty(handle, bh); 657 ocfs2_journal_dirty(handle, bh);
658 658
659 spin_lock(&OCFS2_I(inode)->ip_lock); 659 spin_lock(&OCFS2_I(inode)->ip_lock);
660 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 660 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
661 spin_unlock(&OCFS2_I(inode)->ip_lock); 661 spin_unlock(&OCFS2_I(inode)->ip_lock);
662 /* Release unused quota reservation */ 662 /* Release unused quota reservation */
663 dquot_free_space(inode, 663 dquot_free_space(inode,
664 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 664 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
665 did_quota = 0; 665 did_quota = 0;
666 666
667 if (why != RESTART_NONE && clusters_to_add) { 667 if (why != RESTART_NONE && clusters_to_add) {
668 if (why == RESTART_META) { 668 if (why == RESTART_META) {
669 restart_func = 1; 669 restart_func = 1;
670 status = 0; 670 status = 0;
671 } else { 671 } else {
672 BUG_ON(why != RESTART_TRANS); 672 BUG_ON(why != RESTART_TRANS);
673 673
674 /* TODO: This can be more intelligent. */ 674 /* TODO: This can be more intelligent. */
675 credits = ocfs2_calc_extend_credits(osb->sb, 675 credits = ocfs2_calc_extend_credits(osb->sb,
676 &fe->id2.i_list, 676 &fe->id2.i_list,
677 clusters_to_add); 677 clusters_to_add);
678 status = ocfs2_extend_trans(handle, credits); 678 status = ocfs2_extend_trans(handle, credits);
679 if (status < 0) { 679 if (status < 0) {
680 /* handle still has to be committed at 680 /* handle still has to be committed at
681 * this point. */ 681 * this point. */
682 status = -ENOMEM; 682 status = -ENOMEM;
683 mlog_errno(status); 683 mlog_errno(status);
684 goto leave; 684 goto leave;
685 } 685 }
686 goto restarted_transaction; 686 goto restarted_transaction;
687 } 687 }
688 } 688 }
689 689
690 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno, 690 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
691 le32_to_cpu(fe->i_clusters), 691 le32_to_cpu(fe->i_clusters),
692 (unsigned long long)le64_to_cpu(fe->i_size), 692 (unsigned long long)le64_to_cpu(fe->i_size),
693 OCFS2_I(inode)->ip_clusters, 693 OCFS2_I(inode)->ip_clusters,
694 (unsigned long long)i_size_read(inode)); 694 (unsigned long long)i_size_read(inode));
695 695
696 leave: 696 leave:
697 if (status < 0 && did_quota) 697 if (status < 0 && did_quota)
698 dquot_free_space(inode, 698 dquot_free_space(inode,
699 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 699 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
700 if (handle) { 700 if (handle) {
701 ocfs2_commit_trans(osb, handle); 701 ocfs2_commit_trans(osb, handle);
702 handle = NULL; 702 handle = NULL;
703 } 703 }
704 if (data_ac) { 704 if (data_ac) {
705 ocfs2_free_alloc_context(data_ac); 705 ocfs2_free_alloc_context(data_ac);
706 data_ac = NULL; 706 data_ac = NULL;
707 } 707 }
708 if (meta_ac) { 708 if (meta_ac) {
709 ocfs2_free_alloc_context(meta_ac); 709 ocfs2_free_alloc_context(meta_ac);
710 meta_ac = NULL; 710 meta_ac = NULL;
711 } 711 }
712 if ((!status) && restart_func) { 712 if ((!status) && restart_func) {
713 restart_func = 0; 713 restart_func = 0;
714 goto restart_all; 714 goto restart_all;
715 } 715 }
716 brelse(bh); 716 brelse(bh);
717 bh = NULL; 717 bh = NULL;
718 718
719 return status; 719 return status;
720 } 720 }
721 721
722 /* 722 /*
723 * While a write will already be ordering the data, a truncate will not. 723 * While a write will already be ordering the data, a truncate will not.
724 * Thus, we need to explicitly order the zeroed pages. 724 * Thus, we need to explicitly order the zeroed pages.
725 */ 725 */
726 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) 726 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
727 { 727 {
728 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 728 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
729 handle_t *handle = NULL; 729 handle_t *handle = NULL;
730 int ret = 0; 730 int ret = 0;
731 731
732 if (!ocfs2_should_order_data(inode)) 732 if (!ocfs2_should_order_data(inode))
733 goto out; 733 goto out;
734 734
735 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 735 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
736 if (IS_ERR(handle)) { 736 if (IS_ERR(handle)) {
737 ret = -ENOMEM; 737 ret = -ENOMEM;
738 mlog_errno(ret); 738 mlog_errno(ret);
739 goto out; 739 goto out;
740 } 740 }
741 741
742 ret = ocfs2_jbd2_file_inode(handle, inode); 742 ret = ocfs2_jbd2_file_inode(handle, inode);
743 if (ret < 0) 743 if (ret < 0)
744 mlog_errno(ret); 744 mlog_errno(ret);
745 745
746 out: 746 out:
747 if (ret) { 747 if (ret) {
748 if (!IS_ERR(handle)) 748 if (!IS_ERR(handle))
749 ocfs2_commit_trans(osb, handle); 749 ocfs2_commit_trans(osb, handle);
750 handle = ERR_PTR(ret); 750 handle = ERR_PTR(ret);
751 } 751 }
752 return handle; 752 return handle;
753 } 753 }
754 754
755 /* Some parts of this were taken from generic_cont_expand, which turned out 755 /* Some parts of this were taken from generic_cont_expand, which turned out
756 * to be too fragile to do exactly what we need without us having to 756 * to be too fragile to do exactly what we need without us having to
757 * worry about recursive locking in ->write_begin() and ->write_end(). */ 757 * worry about recursive locking in ->write_begin() and ->write_end(). */
758 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, 758 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
759 u64 abs_to) 759 u64 abs_to)
760 { 760 {
761 struct address_space *mapping = inode->i_mapping; 761 struct address_space *mapping = inode->i_mapping;
762 struct page *page; 762 struct page *page;
763 unsigned long index = abs_from >> PAGE_CACHE_SHIFT; 763 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
764 handle_t *handle = NULL; 764 handle_t *handle = NULL;
765 int ret = 0; 765 int ret = 0;
766 unsigned zero_from, zero_to, block_start, block_end; 766 unsigned zero_from, zero_to, block_start, block_end;
767 767
768 BUG_ON(abs_from >= abs_to); 768 BUG_ON(abs_from >= abs_to);
769 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 769 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
770 BUG_ON(abs_from & (inode->i_blkbits - 1)); 770 BUG_ON(abs_from & (inode->i_blkbits - 1));
771 771
772 page = find_or_create_page(mapping, index, GFP_NOFS); 772 page = find_or_create_page(mapping, index, GFP_NOFS);
773 if (!page) { 773 if (!page) {
774 ret = -ENOMEM; 774 ret = -ENOMEM;
775 mlog_errno(ret); 775 mlog_errno(ret);
776 goto out; 776 goto out;
777 } 777 }
778 778
779 /* Get the offsets within the page that we want to zero */ 779 /* Get the offsets within the page that we want to zero */
780 zero_from = abs_from & (PAGE_CACHE_SIZE - 1); 780 zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
781 zero_to = abs_to & (PAGE_CACHE_SIZE - 1); 781 zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
782 if (!zero_to) 782 if (!zero_to)
783 zero_to = PAGE_CACHE_SIZE; 783 zero_to = PAGE_CACHE_SIZE;
784 784
785 trace_ocfs2_write_zero_page( 785 trace_ocfs2_write_zero_page(
786 (unsigned long long)OCFS2_I(inode)->ip_blkno, 786 (unsigned long long)OCFS2_I(inode)->ip_blkno,
787 (unsigned long long)abs_from, 787 (unsigned long long)abs_from,
788 (unsigned long long)abs_to, 788 (unsigned long long)abs_to,
789 index, zero_from, zero_to); 789 index, zero_from, zero_to);
790 790
791 /* We know that zero_from is block-aligned */ 791 /* We know that zero_from is block-aligned */
792 for (block_start = zero_from; block_start < zero_to; 792 for (block_start = zero_from; block_start < zero_to;
793 block_start = block_end) { 793 block_start = block_end) {
794 block_end = block_start + (1 << inode->i_blkbits); 794 block_end = block_start + (1 << inode->i_blkbits);
795 795
796 /* 796 /*
797 * block_start is block-aligned. Bump it by one to force 797 * block_start is block-aligned. Bump it by one to force
798 * __block_write_begin and block_commit_write to zero the 798 * __block_write_begin and block_commit_write to zero the
799 * whole block. 799 * whole block.
800 */ 800 */
801 ret = __block_write_begin(page, block_start + 1, 0, 801 ret = __block_write_begin(page, block_start + 1, 0,
802 ocfs2_get_block); 802 ocfs2_get_block);
803 if (ret < 0) { 803 if (ret < 0) {
804 mlog_errno(ret); 804 mlog_errno(ret);
805 goto out_unlock; 805 goto out_unlock;
806 } 806 }
807 807
808 if (!handle) { 808 if (!handle) {
809 handle = ocfs2_zero_start_ordered_transaction(inode); 809 handle = ocfs2_zero_start_ordered_transaction(inode);
810 if (IS_ERR(handle)) { 810 if (IS_ERR(handle)) {
811 ret = PTR_ERR(handle); 811 ret = PTR_ERR(handle);
812 handle = NULL; 812 handle = NULL;
813 break; 813 break;
814 } 814 }
815 } 815 }
816 816
817 /* must not update i_size! */ 817 /* must not update i_size! */
818 ret = block_commit_write(page, block_start + 1, 818 ret = block_commit_write(page, block_start + 1,
819 block_start + 1); 819 block_start + 1);
820 if (ret < 0) 820 if (ret < 0)
821 mlog_errno(ret); 821 mlog_errno(ret);
822 else 822 else
823 ret = 0; 823 ret = 0;
824 } 824 }
825 825
826 if (handle) 826 if (handle)
827 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 827 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
828 828
829 out_unlock: 829 out_unlock:
830 unlock_page(page); 830 unlock_page(page);
831 page_cache_release(page); 831 page_cache_release(page);
832 out: 832 out:
833 return ret; 833 return ret;
834 } 834 }
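/*
 * Concrete example of the "bump by one" trick used above, with
 * illustrative sizes and our reading of __block_write_begin(): with
 * 512-byte blocks, zeroing the block at page offset 512 passes
 * pos = 513, len = 0. The empty write range then lands strictly
 * inside the block, so both sides of it - i.e. the whole block - get
 * zeroed, whereas an aligned pos = 512 would skip the block entirely.
 */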
835 835
836 /* 836 /*
837 * Find the next range to zero. We do this in terms of bytes because 837 * Find the next range to zero. We do this in terms of bytes because
838 * that's what ocfs2_zero_extend() wants, and it is dealing with the 838 * that's what ocfs2_zero_extend() wants, and it is dealing with the
839 * pagecache. A single returned range may span multiple extents. 839 * pagecache. A single returned range may span multiple extents.
840 * 840 *
841 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what 841 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
842 * needs to be zeroed. range_start and range_end return the next zeroing 842 * needs to be zeroed. range_start and range_end return the next zeroing
843 * range. A subsequent call should pass the previous range_end as its 843 * range. A subsequent call should pass the previous range_end as its
844 * zero_start. If range_end is 0, there's nothing to do. 844 * zero_start. If range_end is 0, there's nothing to do.
845 * 845 *
846 * Unwritten extents are skipped over. Refcounted extents are CoW'd. 846 * Unwritten extents are skipped over. Refcounted extents are CoW'd.
847 */ 847 */
848 static int ocfs2_zero_extend_get_range(struct inode *inode, 848 static int ocfs2_zero_extend_get_range(struct inode *inode,
849 struct buffer_head *di_bh, 849 struct buffer_head *di_bh,
850 u64 zero_start, u64 zero_end, 850 u64 zero_start, u64 zero_end,
851 u64 *range_start, u64 *range_end) 851 u64 *range_start, u64 *range_end)
852 { 852 {
853 int rc = 0, needs_cow = 0; 853 int rc = 0, needs_cow = 0;
854 u32 p_cpos, zero_clusters = 0; 854 u32 p_cpos, zero_clusters = 0;
855 u32 zero_cpos = 855 u32 zero_cpos =
856 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 856 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
857 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); 857 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
858 unsigned int num_clusters = 0; 858 unsigned int num_clusters = 0;
859 unsigned int ext_flags = 0; 859 unsigned int ext_flags = 0;
860 860
861 while (zero_cpos < last_cpos) { 861 while (zero_cpos < last_cpos) {
862 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, 862 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
863 &num_clusters, &ext_flags); 863 &num_clusters, &ext_flags);
864 if (rc) { 864 if (rc) {
865 mlog_errno(rc); 865 mlog_errno(rc);
866 goto out; 866 goto out;
867 } 867 }
868 868
869 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { 869 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
870 zero_clusters = num_clusters; 870 zero_clusters = num_clusters;
871 if (ext_flags & OCFS2_EXT_REFCOUNTED) 871 if (ext_flags & OCFS2_EXT_REFCOUNTED)
872 needs_cow = 1; 872 needs_cow = 1;
873 break; 873 break;
874 } 874 }
875 875
876 zero_cpos += num_clusters; 876 zero_cpos += num_clusters;
877 } 877 }
878 if (!zero_clusters) { 878 if (!zero_clusters) {
879 *range_end = 0; 879 *range_end = 0;
880 goto out; 880 goto out;
881 } 881 }
882 882
883 while ((zero_cpos + zero_clusters) < last_cpos) { 883 while ((zero_cpos + zero_clusters) < last_cpos) {
884 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, 884 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
885 &p_cpos, &num_clusters, 885 &p_cpos, &num_clusters,
886 &ext_flags); 886 &ext_flags);
887 if (rc) { 887 if (rc) {
888 mlog_errno(rc); 888 mlog_errno(rc);
889 goto out; 889 goto out;
890 } 890 }
891 891
892 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) 892 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
893 break; 893 break;
894 if (ext_flags & OCFS2_EXT_REFCOUNTED) 894 if (ext_flags & OCFS2_EXT_REFCOUNTED)
895 needs_cow = 1; 895 needs_cow = 1;
896 zero_clusters += num_clusters; 896 zero_clusters += num_clusters;
897 } 897 }
898 if ((zero_cpos + zero_clusters) > last_cpos) 898 if ((zero_cpos + zero_clusters) > last_cpos)
899 zero_clusters = last_cpos - zero_cpos; 899 zero_clusters = last_cpos - zero_cpos;
900 900
901 if (needs_cow) { 901 if (needs_cow) {
902 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, 902 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
903 zero_clusters, UINT_MAX); 903 zero_clusters, UINT_MAX);
904 if (rc) { 904 if (rc) {
905 mlog_errno(rc); 905 mlog_errno(rc);
906 goto out; 906 goto out;
907 } 907 }
908 } 908 }
909 909
910 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); 910 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
911 *range_end = ocfs2_clusters_to_bytes(inode->i_sb, 911 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
912 zero_cpos + zero_clusters); 912 zero_cpos + zero_clusters);
913 913
914 out: 914 out:
915 return rc; 915 return rc;
916 } 916 }
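/*
 * Example of the contract above, for a hypothetical layout where the
 * clusters from zero_start onward are [hole][written][refcounted]
 * [unwritten]: the first call skips the hole, CoWs the refcounted run,
 * and returns the written + refcounted span in *range_start and
 * *range_end; a second call starting at that range_end sees only the
 * unwritten extent and sets *range_end = 0 - nothing left to zero.
 */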
917 917
918 /* 918 /*
919 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller 919 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
920 * has made sure that the entire range needs zeroing. 920 * has made sure that the entire range needs zeroing.
921 */ 921 */
922 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, 922 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
923 u64 range_end) 923 u64 range_end)
924 { 924 {
925 int rc = 0; 925 int rc = 0;
926 u64 next_pos; 926 u64 next_pos;
927 u64 zero_pos = range_start; 927 u64 zero_pos = range_start;
928 928
929 trace_ocfs2_zero_extend_range( 929 trace_ocfs2_zero_extend_range(
930 (unsigned long long)OCFS2_I(inode)->ip_blkno, 930 (unsigned long long)OCFS2_I(inode)->ip_blkno,
931 (unsigned long long)range_start, 931 (unsigned long long)range_start,
932 (unsigned long long)range_end); 932 (unsigned long long)range_end);
933 BUG_ON(range_start >= range_end); 933 BUG_ON(range_start >= range_end);
934 934
935 while (zero_pos < range_end) { 935 while (zero_pos < range_end) {
936 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; 936 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
937 if (next_pos > range_end) 937 if (next_pos > range_end)
938 next_pos = range_end; 938 next_pos = range_end;
939 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); 939 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
940 if (rc < 0) { 940 if (rc < 0) {
941 mlog_errno(rc); 941 mlog_errno(rc);
942 break; 942 break;
943 } 943 }
944 zero_pos = next_pos; 944 zero_pos = next_pos;
945 945
946 /* 946 /*
947 * Very large extends have the potential to lock up 947 * Very large extends have the potential to lock up
948 * the cpu for extended periods of time. 948 * the cpu for extended periods of time.
949 */ 949 */
950 cond_resched(); 950 cond_resched();
951 } 951 }
952 952
953 return rc; 953 return rc;
954 } 954 }
955 955
956 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, 956 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
957 loff_t zero_to_size) 957 loff_t zero_to_size)
958 { 958 {
959 int ret = 0; 959 int ret = 0;
960 u64 zero_start, range_start = 0, range_end = 0; 960 u64 zero_start, range_start = 0, range_end = 0;
961 struct super_block *sb = inode->i_sb; 961 struct super_block *sb = inode->i_sb;
962 962
963 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 963 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
964 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno, 964 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
965 (unsigned long long)zero_start, 965 (unsigned long long)zero_start,
966 (unsigned long long)i_size_read(inode)); 966 (unsigned long long)i_size_read(inode));
967 while (zero_start < zero_to_size) { 967 while (zero_start < zero_to_size) {
968 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, 968 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
969 zero_to_size, 969 zero_to_size,
970 &range_start, 970 &range_start,
971 &range_end); 971 &range_end);
972 if (ret) { 972 if (ret) {
973 mlog_errno(ret); 973 mlog_errno(ret);
974 break; 974 break;
975 } 975 }
976 if (!range_end) 976 if (!range_end)
977 break; 977 break;
978 /* Trim the ends */ 978 /* Trim the ends */
979 if (range_start < zero_start) 979 if (range_start < zero_start)
980 range_start = zero_start; 980 range_start = zero_start;
981 if (range_end > zero_to_size) 981 if (range_end > zero_to_size)
982 range_end = zero_to_size; 982 range_end = zero_to_size;
983 983
984 ret = ocfs2_zero_extend_range(inode, range_start, 984 ret = ocfs2_zero_extend_range(inode, range_start,
985 range_end); 985 range_end);
986 if (ret) { 986 if (ret) {
987 mlog_errno(ret); 987 mlog_errno(ret);
988 break; 988 break;
989 } 989 }
990 zero_start = range_end; 990 zero_start = range_end;
991 } 991 }
992 992
993 return ret; 993 return ret;
994 } 994 }
995 995
996 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, 996 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
997 u64 new_i_size, u64 zero_to) 997 u64 new_i_size, u64 zero_to)
998 { 998 {
999 int ret; 999 int ret;
1000 u32 clusters_to_add; 1000 u32 clusters_to_add;
1001 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1001 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1002 1002
1003 /* 1003 /*
1004 * Only quota files call this without a bh, and they can't be 1004 * Only quota files call this without a bh, and they can't be
1005 * refcounted. 1005 * refcounted.
1006 */ 1006 */
1007 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 1007 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
1008 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); 1008 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1009 1009
1010 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 1010 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
1011 if (clusters_to_add < oi->ip_clusters) 1011 if (clusters_to_add < oi->ip_clusters)
1012 clusters_to_add = 0; 1012 clusters_to_add = 0;
1013 else 1013 else
1014 clusters_to_add -= oi->ip_clusters; 1014 clusters_to_add -= oi->ip_clusters;
1015 1015
1016 if (clusters_to_add) { 1016 if (clusters_to_add) {
1017 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, 1017 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
1018 clusters_to_add, 0); 1018 clusters_to_add, 0);
1019 if (ret) { 1019 if (ret) {
1020 mlog_errno(ret); 1020 mlog_errno(ret);
1021 goto out; 1021 goto out;
1022 } 1022 }
1023 } 1023 }
1024 1024
1025 /* 1025 /*
1026 * Call this even if we don't add any clusters to the tree. We 1026 * Call this even if we don't add any clusters to the tree. We
1027 * still need to zero the area between the old i_size and the 1027 * still need to zero the area between the old i_size and the
1028 * new i_size. 1028 * new i_size.
1029 */ 1029 */
1030 ret = ocfs2_zero_extend(inode, di_bh, zero_to); 1030 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1031 if (ret < 0) 1031 if (ret < 0)
1032 mlog_errno(ret); 1032 mlog_errno(ret);
1033 1033
1034 out: 1034 out:
1035 return ret; 1035 return ret;
1036 } 1036 }
1037 1037
1038 static int ocfs2_extend_file(struct inode *inode, 1038 static int ocfs2_extend_file(struct inode *inode,
1039 struct buffer_head *di_bh, 1039 struct buffer_head *di_bh,
1040 u64 new_i_size) 1040 u64 new_i_size)
1041 { 1041 {
1042 int ret = 0; 1042 int ret = 0;
1043 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1043 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1044 1044
1045 BUG_ON(!di_bh); 1045 BUG_ON(!di_bh);
1046 1046
1047 /* setattr sometimes calls us like this. */ 1047 /* setattr sometimes calls us like this. */
1048 if (new_i_size == 0) 1048 if (new_i_size == 0)
1049 goto out; 1049 goto out;
1050 1050
1051 if (i_size_read(inode) == new_i_size) 1051 if (i_size_read(inode) == new_i_size)
1052 goto out; 1052 goto out;
1053 BUG_ON(new_i_size < i_size_read(inode)); 1053 BUG_ON(new_i_size < i_size_read(inode));
1054 1054
1055 /* 1055 /*
1056 * The alloc sem blocks people in read/write from reading our 1056 * The alloc sem blocks people in read/write from reading our
1057 * allocation until we're done changing it. We depend on 1057 * allocation until we're done changing it. We depend on
1058 * i_mutex to block other extend/truncate calls while we're 1058 * i_mutex to block other extend/truncate calls while we're
1059 * here. We even have to hold it for sparse files because there 1059 * here. We even have to hold it for sparse files because there
1060 * might be some tail zeroing. 1060 * might be some tail zeroing.
1061 */ 1061 */
1062 down_write(&oi->ip_alloc_sem); 1062 down_write(&oi->ip_alloc_sem);
1063 1063
1064 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1064 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1065 /* 1065 /*
1066 * We can optimize small extends by keeping the inode's 1066 * We can optimize small extends by keeping the inode's
1067 * inline data. 1067 * inline data.
1068 */ 1068 */
1069 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { 1069 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1070 up_write(&oi->ip_alloc_sem); 1070 up_write(&oi->ip_alloc_sem);
1071 goto out_update_size; 1071 goto out_update_size;
1072 } 1072 }
1073 1073
1074 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1074 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1075 if (ret) { 1075 if (ret) {
1076 up_write(&oi->ip_alloc_sem); 1076 up_write(&oi->ip_alloc_sem);
1077 mlog_errno(ret); 1077 mlog_errno(ret);
1078 goto out; 1078 goto out;
1079 } 1079 }
1080 } 1080 }
1081 1081
1082 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1082 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1083 ret = ocfs2_zero_extend(inode, di_bh, new_i_size); 1083 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1084 else 1084 else
1085 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, 1085 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1086 new_i_size); 1086 new_i_size);
1087 1087
1088 up_write(&oi->ip_alloc_sem); 1088 up_write(&oi->ip_alloc_sem);
1089 1089
1090 if (ret < 0) { 1090 if (ret < 0) {
1091 mlog_errno(ret); 1091 mlog_errno(ret);
1092 goto out; 1092 goto out;
1093 } 1093 }
1094 1094
1095 out_update_size: 1095 out_update_size:
1096 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 1096 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1097 if (ret < 0) 1097 if (ret < 0)
1098 mlog_errno(ret); 1098 mlog_errno(ret);
1099 1099
1100 out: 1100 out:
1101 return ret; 1101 return ret;
1102 } 1102 }
1103 1103
1104 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 1104 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1105 { 1105 {
1106 int status = 0, size_change; 1106 int status = 0, size_change;
1107 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1108 struct super_block *sb = inode->i_sb; 1108 struct super_block *sb = inode->i_sb;
1109 struct ocfs2_super *osb = OCFS2_SB(sb); 1109 struct ocfs2_super *osb = OCFS2_SB(sb);
1110 struct buffer_head *bh = NULL; 1110 struct buffer_head *bh = NULL;
1111 handle_t *handle = NULL; 1111 handle_t *handle = NULL;
1112 struct dquot *transfer_to[MAXQUOTAS] = { }; 1112 struct dquot *transfer_to[MAXQUOTAS] = { };
1113 int qtype; 1113 int qtype;
1114 1114
1115 trace_ocfs2_setattr(inode, dentry, 1115 trace_ocfs2_setattr(inode, dentry,
1116 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1116 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1117 dentry->d_name.len, dentry->d_name.name, 1117 dentry->d_name.len, dentry->d_name.name,
1118 attr->ia_valid, attr->ia_mode, 1118 attr->ia_valid, attr->ia_mode,
1119 attr->ia_uid, attr->ia_gid); 1119 attr->ia_uid, attr->ia_gid);
1120 1120
1121 /* ensuring we don't even attempt to truncate a symlink */ 1121 /* ensuring we don't even attempt to truncate a symlink */
1122 if (S_ISLNK(inode->i_mode)) 1122 if (S_ISLNK(inode->i_mode))
1123 attr->ia_valid &= ~ATTR_SIZE; 1123 attr->ia_valid &= ~ATTR_SIZE;
1124 1124
1125 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 1125 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1126 | ATTR_GID | ATTR_UID | ATTR_MODE) 1126 | ATTR_GID | ATTR_UID | ATTR_MODE)
1127 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) 1127 if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1128 return 0; 1128 return 0;
1129 1129
1130 status = inode_change_ok(inode, attr); 1130 status = inode_change_ok(inode, attr);
1131 if (status) 1131 if (status)
1132 return status; 1132 return status;
1133 1133
1134 if (is_quota_modification(inode, attr)) 1134 if (is_quota_modification(inode, attr))
1135 dquot_initialize(inode); 1135 dquot_initialize(inode);
1136 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 1136 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1137 if (size_change) { 1137 if (size_change) {
1138 status = ocfs2_rw_lock(inode, 1); 1138 status = ocfs2_rw_lock(inode, 1);
1139 if (status < 0) { 1139 if (status < 0) {
1140 mlog_errno(status); 1140 mlog_errno(status);
1141 goto bail; 1141 goto bail;
1142 } 1142 }
1143 } 1143 }
1144 1144
1145 status = ocfs2_inode_lock(inode, &bh, 1); 1145 status = ocfs2_inode_lock(inode, &bh, 1);
1146 if (status < 0) { 1146 if (status < 0) {
1147 if (status != -ENOENT) 1147 if (status != -ENOENT)
1148 mlog_errno(status); 1148 mlog_errno(status);
1149 goto bail_unlock_rw; 1149 goto bail_unlock_rw;
1150 } 1150 }
1151 1151
1152 if (size_change && attr->ia_size != i_size_read(inode)) { 1152 if (size_change && attr->ia_size != i_size_read(inode)) {
1153 status = inode_newsize_ok(inode, attr->ia_size); 1153 status = inode_newsize_ok(inode, attr->ia_size);
1154 if (status) 1154 if (status)
1155 goto bail_unlock; 1155 goto bail_unlock;
1156 1156
1157 inode_dio_wait(inode); 1157 inode_dio_wait(inode);
1158 1158
1159 if (i_size_read(inode) > attr->ia_size) { 1159 if (i_size_read(inode) > attr->ia_size) {
1160 if (ocfs2_should_order_data(inode)) { 1160 if (ocfs2_should_order_data(inode)) {
1161 status = ocfs2_begin_ordered_truncate(inode, 1161 status = ocfs2_begin_ordered_truncate(inode,
1162 attr->ia_size); 1162 attr->ia_size);
1163 if (status) 1163 if (status)
1164 goto bail_unlock; 1164 goto bail_unlock;
1165 } 1165 }
1166 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 1166 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1167 } else 1167 } else
1168 status = ocfs2_extend_file(inode, bh, attr->ia_size); 1168 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1169 if (status < 0) { 1169 if (status < 0) {
1170 if (status != -ENOSPC) 1170 if (status != -ENOSPC)
1171 mlog_errno(status); 1171 mlog_errno(status);
1172 status = -ENOSPC; 1172 status = -ENOSPC;
1173 goto bail_unlock; 1173 goto bail_unlock;
1174 } 1174 }
1175 } 1175 }
1176 1176
1177 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 1177 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
1178 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 1178 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
1179 /* 1179 /*
1180 * Gather pointers to quota structures so that allocation / 1180 * Gather pointers to quota structures so that allocation /
1181 * freeing of quota structures happens here and not inside 1181 * freeing of quota structures happens here and not inside
1182 * dquot_transfer() where we have problems with lock ordering 1182 * dquot_transfer() where we have problems with lock ordering
1183 */ 1183 */
1184 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1184 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1185 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1185 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1186 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1186 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1187 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); 1187 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1188 if (!transfer_to[USRQUOTA]) { 1188 if (!transfer_to[USRQUOTA]) {
1189 status = -ESRCH; 1189 status = -ESRCH;
1190 goto bail_unlock; 1190 goto bail_unlock;
1191 } 1191 }
1192 } 1192 }
1193 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 1193 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
1194 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1194 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1195 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1195 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1196 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); 1196 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1197 if (!transfer_to[GRPQUOTA]) { 1197 if (!transfer_to[GRPQUOTA]) {
1198 status = -ESRCH; 1198 status = -ESRCH;
1199 goto bail_unlock; 1199 goto bail_unlock;
1200 } 1200 }
1201 } 1201 }
1202 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 1202 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1203 2 * ocfs2_quota_trans_credits(sb)); 1203 2 * ocfs2_quota_trans_credits(sb));
1204 if (IS_ERR(handle)) { 1204 if (IS_ERR(handle)) {
1205 status = PTR_ERR(handle); 1205 status = PTR_ERR(handle);
1206 mlog_errno(status); 1206 mlog_errno(status);
1207 goto bail_unlock; 1207 goto bail_unlock;
1208 } 1208 }
1209 status = __dquot_transfer(inode, transfer_to); 1209 status = __dquot_transfer(inode, transfer_to);
1210 if (status < 0) 1210 if (status < 0)
1211 goto bail_commit; 1211 goto bail_commit;
1212 } else { 1212 } else {
1213 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1213 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1214 if (IS_ERR(handle)) { 1214 if (IS_ERR(handle)) {
1215 status = PTR_ERR(handle); 1215 status = PTR_ERR(handle);
1216 mlog_errno(status); 1216 mlog_errno(status);
1217 goto bail_unlock; 1217 goto bail_unlock;
1218 } 1218 }
1219 } 1219 }
1220 1220
1221 /* 1221 /*
1222 * This will intentionally not wind up calling truncate_setsize(), 1222 * This will intentionally not wind up calling truncate_setsize(),
1223 * since all the work for a size change has been done above. 1223 * since all the work for a size change has been done above.
1224 * Otherwise, we could get into problems with truncate as 1224 * Otherwise, we could get into problems with truncate as
1225 * ip_alloc_sem is used there to protect against i_size 1225 * ip_alloc_sem is used there to protect against i_size
1226 * changes. 1226 * changes.
1227 * 1227 *
1228 * XXX: this means the conditional below can probably be removed. 1228 * XXX: this means the conditional below can probably be removed.
1229 */ 1229 */
1230 if ((attr->ia_valid & ATTR_SIZE) && 1230 if ((attr->ia_valid & ATTR_SIZE) &&
1231 attr->ia_size != i_size_read(inode)) { 1231 attr->ia_size != i_size_read(inode)) {
1232 status = vmtruncate(inode, attr->ia_size); 1232 status = vmtruncate(inode, attr->ia_size);
1233 if (status) { 1233 if (status) {
1234 mlog_errno(status); 1234 mlog_errno(status);
1235 goto bail_commit; 1235 goto bail_commit;
1236 } 1236 }
1237 } 1237 }
1238 1238
1239 setattr_copy(inode, attr); 1239 setattr_copy(inode, attr);
1240 mark_inode_dirty(inode); 1240 mark_inode_dirty(inode);
1241 1241
1242 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1242 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1243 if (status < 0) 1243 if (status < 0)
1244 mlog_errno(status); 1244 mlog_errno(status);
1245 1245
1246 bail_commit: 1246 bail_commit:
1247 ocfs2_commit_trans(osb, handle); 1247 ocfs2_commit_trans(osb, handle);
1248 bail_unlock: 1248 bail_unlock:
1249 ocfs2_inode_unlock(inode, 1); 1249 ocfs2_inode_unlock(inode, 1);
1250 bail_unlock_rw: 1250 bail_unlock_rw:
1251 if (size_change) 1251 if (size_change)
1252 ocfs2_rw_unlock(inode, 1); 1252 ocfs2_rw_unlock(inode, 1);
1253 bail: 1253 bail:
1254 brelse(bh); 1254 brelse(bh);
1255 1255
1256 /* Release quota pointers in case we acquired them */ 1256 /* Release quota pointers in case we acquired them */
1257 for (qtype = 0; qtype < MAXQUOTAS; qtype++) 1257 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1258 dqput(transfer_to[qtype]); 1258 dqput(transfer_to[qtype]);
1259 1259
1260 if (!status && attr->ia_valid & ATTR_MODE) { 1260 if (!status && attr->ia_valid & ATTR_MODE) {
1261 status = ocfs2_acl_chmod(inode); 1261 status = ocfs2_acl_chmod(inode);
1262 if (status < 0) 1262 if (status < 0)
1263 mlog_errno(status); 1263 mlog_errno(status);
1264 } 1264 }
1265 1265
1266 return status; 1266 return status;
1267 } 1267 }
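
For context, ocfs2_setattr() is reached through the VFS, and a size change is what the rw/inode cluster locking above protects. A minimal userspace sketch (the mount path is hypothetical) that exercises both the extend and the truncate branch:

        /* Userspace sketch: truncate(2)/ftruncate(2) arrive at
         * ocfs2_setattr() with ATTR_SIZE set.  Path is hypothetical. */
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/mnt/ocfs2/testfile", O_RDWR | O_CREAT, 0644);

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                /* Growing the file takes the extend path... */
                if (ftruncate(fd, 1 << 20) < 0)
                        perror("ftruncate (extend)");
                /* ...shrinking it takes the truncate path. */
                if (ftruncate(fd, 4096) < 0)
                        perror("ftruncate (shrink)");
                close(fd);
                return 0;
        }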
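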
1268 1268
1269 int ocfs2_getattr(struct vfsmount *mnt, 1269 int ocfs2_getattr(struct vfsmount *mnt,
1270 struct dentry *dentry, 1270 struct dentry *dentry,
1271 struct kstat *stat) 1271 struct kstat *stat)
1272 { 1272 {
1273 struct inode *inode = dentry->d_inode; 1273 struct inode *inode = dentry->d_inode;
1274 struct super_block *sb = dentry->d_inode->i_sb; 1274 struct super_block *sb = dentry->d_inode->i_sb;
1275 struct ocfs2_super *osb = sb->s_fs_info; 1275 struct ocfs2_super *osb = sb->s_fs_info;
1276 int err; 1276 int err;
1277 1277
1278 err = ocfs2_inode_revalidate(dentry); 1278 err = ocfs2_inode_revalidate(dentry);
1279 if (err) { 1279 if (err) {
1280 if (err != -ENOENT) 1280 if (err != -ENOENT)
1281 mlog_errno(err); 1281 mlog_errno(err);
1282 goto bail; 1282 goto bail;
1283 } 1283 }
1284 1284
1285 generic_fillattr(inode, stat); 1285 generic_fillattr(inode, stat);
1286 1286
1287 /* We set the blksize from the cluster size for performance */ 1287 /* We set the blksize from the cluster size for performance */
1288 stat->blksize = osb->s_clustersize; 1288 stat->blksize = osb->s_clustersize;
1289 1289
1290 bail: 1290 bail:
1291 return err; 1291 return err;
1292 } 1292 }
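
Because ocfs2_getattr() reports the cluster size as st_blksize, applications that size their buffers from stat() naturally do whole-cluster I/O. A small sketch (hypothetical path) reading that value back:

        #include <stdio.h>
        #include <sys/stat.h>

        int main(void)
        {
                struct stat st;

                if (stat("/mnt/ocfs2/testfile", &st) < 0) {
                        perror("stat");
                        return 1;
                }
                /* On ocfs2 this is the cluster size, not the block size. */
                printf("preferred I/O size: %ld bytes\n", (long)st.st_blksize);
                return 0;
        }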
1293 1293
1294 int ocfs2_permission(struct inode *inode, int mask) 1294 int ocfs2_permission(struct inode *inode, int mask)
1295 { 1295 {
1296 int ret; 1296 int ret;
1297 1297
1298 if (mask & MAY_NOT_BLOCK) 1298 if (mask & MAY_NOT_BLOCK)
1299 return -ECHILD; 1299 return -ECHILD;
1300 1300
1301 ret = ocfs2_inode_lock(inode, NULL, 0); 1301 ret = ocfs2_inode_lock(inode, NULL, 0);
1302 if (ret) { 1302 if (ret) {
1303 if (ret != -ENOENT) 1303 if (ret != -ENOENT)
1304 mlog_errno(ret); 1304 mlog_errno(ret);
1305 goto out; 1305 goto out;
1306 } 1306 }
1307 1307
1308 ret = generic_permission(inode, mask); 1308 ret = generic_permission(inode, mask);
1309 1309
1310 ocfs2_inode_unlock(inode, 0); 1310 ocfs2_inode_unlock(inode, 0);
1311 out: 1311 out:
1312 return ret; 1312 return ret;
1313 } 1313 }
1314 1314
1315 static int __ocfs2_write_remove_suid(struct inode *inode, 1315 static int __ocfs2_write_remove_suid(struct inode *inode,
1316 struct buffer_head *bh) 1316 struct buffer_head *bh)
1317 { 1317 {
1318 int ret; 1318 int ret;
1319 handle_t *handle; 1319 handle_t *handle;
1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1321 struct ocfs2_dinode *di; 1321 struct ocfs2_dinode *di;
1322 1322
1323 trace_ocfs2_write_remove_suid( 1323 trace_ocfs2_write_remove_suid(
1324 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1324 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1325 inode->i_mode); 1325 inode->i_mode);
1326 1326
1327 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1327 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1328 if (IS_ERR(handle)) { 1328 if (IS_ERR(handle)) {
1329 ret = PTR_ERR(handle); 1329 ret = PTR_ERR(handle);
1330 mlog_errno(ret); 1330 mlog_errno(ret);
1331 goto out; 1331 goto out;
1332 } 1332 }
1333 1333
1334 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1334 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1335 OCFS2_JOURNAL_ACCESS_WRITE); 1335 OCFS2_JOURNAL_ACCESS_WRITE);
1336 if (ret < 0) { 1336 if (ret < 0) {
1337 mlog_errno(ret); 1337 mlog_errno(ret);
1338 goto out_trans; 1338 goto out_trans;
1339 } 1339 }
1340 1340
1341 inode->i_mode &= ~S_ISUID; 1341 inode->i_mode &= ~S_ISUID;
1342 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1342 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1343 inode->i_mode &= ~S_ISGID; 1343 inode->i_mode &= ~S_ISGID;
1344 1344
1345 di = (struct ocfs2_dinode *) bh->b_data; 1345 di = (struct ocfs2_dinode *) bh->b_data;
1346 di->i_mode = cpu_to_le16(inode->i_mode); 1346 di->i_mode = cpu_to_le16(inode->i_mode);
1347 1347
1348 ocfs2_journal_dirty(handle, bh); 1348 ocfs2_journal_dirty(handle, bh);
1349 1349
1350 out_trans: 1350 out_trans:
1351 ocfs2_commit_trans(osb, handle); 1351 ocfs2_commit_trans(osb, handle);
1352 out: 1352 out:
1353 return ret; 1353 return ret;
1354 } 1354 }
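
The mode update above follows the usual setuid-clearing rule: S_ISUID is always dropped, while S_ISGID is dropped only when group-execute is set (a setgid bit without S_IXGRP denotes mandatory locking, not a setgid executable). A standalone restatement of just that transformation:

        #include <stdio.h>
        #include <sys/stat.h>

        /* Mirror of the mode change in __ocfs2_write_remove_suid(). */
        static mode_t remove_suid_mode(mode_t mode)
        {
                mode &= ~S_ISUID;
                if ((mode & S_ISGID) && (mode & S_IXGRP))
                        mode &= ~S_ISGID;
                return mode;
        }

        int main(void)
        {
                /* setgid executable: both special bits cleared. */
                printf("%04o -> %04o\n", 06775u, (unsigned)remove_suid_mode(06775));
                /* setgid without group execute: left alone. */
                printf("%04o -> %04o\n", 02664u, (unsigned)remove_suid_mode(02664));
                return 0;
        }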
1355 1355
1356 /* 1356 /*
1357 * Will look for holes and unwritten extents in the range starting at 1357 * Will look for holes and unwritten extents in the range starting at
1358 * pos for count bytes (inclusive). 1358 * pos for count bytes (inclusive).
1359 */ 1359 */
1360 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1360 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1361 size_t count) 1361 size_t count)
1362 { 1362 {
1363 int ret = 0; 1363 int ret = 0;
1364 unsigned int extent_flags; 1364 unsigned int extent_flags;
1365 u32 cpos, clusters, extent_len, phys_cpos; 1365 u32 cpos, clusters, extent_len, phys_cpos;
1366 struct super_block *sb = inode->i_sb; 1366 struct super_block *sb = inode->i_sb;
1367 1367
1368 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1368 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1369 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1369 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1370 1370
1371 while (clusters) { 1371 while (clusters) {
1372 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1372 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1373 &extent_flags); 1373 &extent_flags);
1374 if (ret < 0) { 1374 if (ret < 0) {
1375 mlog_errno(ret); 1375 mlog_errno(ret);
1376 goto out; 1376 goto out;
1377 } 1377 }
1378 1378
1379 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1379 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1380 ret = 1; 1380 ret = 1;
1381 break; 1381 break;
1382 } 1382 }
1383 1383
1384 if (extent_len > clusters) 1384 if (extent_len > clusters)
1385 extent_len = clusters; 1385 extent_len = clusters;
1386 1386
1387 clusters -= extent_len; 1387 clusters -= extent_len;
1388 cpos += extent_len; 1388 cpos += extent_len;
1389 } 1389 }
1390 out: 1390 out:
1391 return ret; 1391 return ret;
1392 } 1392 }
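
The arithmetic above converts a byte range into the cluster range to scan. A self-contained sketch of the same math, assuming a 32KB cluster size (illustrative only; ocfs2 cluster sizes vary per filesystem):

        #include <stdio.h>

        #define CLUSTERSIZE_BITS 15u    /* assumed 32KB clusters */

        /* Round a byte count up to whole clusters, as
         * ocfs2_clusters_for_bytes() does. */
        static unsigned int clusters_for_bytes(unsigned long long bytes)
        {
                return (bytes + (1ull << CLUSTERSIZE_BITS) - 1) >> CLUSTERSIZE_BITS;
        }

        int main(void)
        {
                unsigned long long pos = 40000, count = 100000;
                unsigned int cpos = pos >> CLUSTERSIZE_BITS;
                unsigned int clusters = clusters_for_bytes(pos + count) - cpos;

                /* 40000 falls in cluster 1; 140000 needs 5 clusters, so scan 4. */
                printf("cpos=%u clusters=%u\n", cpos, clusters);
                return 0;
        }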
1393 1393
1394 static int ocfs2_write_remove_suid(struct inode *inode) 1394 static int ocfs2_write_remove_suid(struct inode *inode)
1395 { 1395 {
1396 int ret; 1396 int ret;
1397 struct buffer_head *bh = NULL; 1397 struct buffer_head *bh = NULL;
1398 1398
1399 ret = ocfs2_read_inode_block(inode, &bh); 1399 ret = ocfs2_read_inode_block(inode, &bh);
1400 if (ret < 0) { 1400 if (ret < 0) {
1401 mlog_errno(ret); 1401 mlog_errno(ret);
1402 goto out; 1402 goto out;
1403 } 1403 }
1404 1404
1405 ret = __ocfs2_write_remove_suid(inode, bh); 1405 ret = __ocfs2_write_remove_suid(inode, bh);
1406 out: 1406 out:
1407 brelse(bh); 1407 brelse(bh);
1408 return ret; 1408 return ret;
1409 } 1409 }
1410 1410
1411 /* 1411 /*
1412 * Allocate enough extents to cover the region starting at byte offset 1412 * Allocate enough extents to cover the region starting at byte offset
1413 * start for len bytes. Existing extents are skipped, any extents 1413 * start for len bytes. Existing extents are skipped, any extents
1414 * added are marked as "unwritten". 1414 * added are marked as "unwritten".
1415 */ 1415 */
1416 static int ocfs2_allocate_unwritten_extents(struct inode *inode, 1416 static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1417 u64 start, u64 len) 1417 u64 start, u64 len)
1418 { 1418 {
1419 int ret; 1419 int ret;
1420 u32 cpos, phys_cpos, clusters, alloc_size; 1420 u32 cpos, phys_cpos, clusters, alloc_size;
1421 u64 end = start + len; 1421 u64 end = start + len;
1422 struct buffer_head *di_bh = NULL; 1422 struct buffer_head *di_bh = NULL;
1423 1423
1424 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1424 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1425 ret = ocfs2_read_inode_block(inode, &di_bh); 1425 ret = ocfs2_read_inode_block(inode, &di_bh);
1426 if (ret) { 1426 if (ret) {
1427 mlog_errno(ret); 1427 mlog_errno(ret);
1428 goto out; 1428 goto out;
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * Nothing to do if the requested reservation range 1432 * Nothing to do if the requested reservation range
1433 * fits within the inode's inline data. 1433 * fits within the inode's inline data.
1434 */ 1434 */
1435 if (ocfs2_size_fits_inline_data(di_bh, end)) 1435 if (ocfs2_size_fits_inline_data(di_bh, end))
1436 goto out; 1436 goto out;
1437 1437
1438 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1438 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1439 if (ret) { 1439 if (ret) {
1440 mlog_errno(ret); 1440 mlog_errno(ret);
1441 goto out; 1441 goto out;
1442 } 1442 }
1443 } 1443 }
1444 1444
1445 /* 1445 /*
1446 * We consider both start and len to be inclusive. 1446 * We consider both start and len to be inclusive.
1447 */ 1447 */
1448 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1448 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1449 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); 1449 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1450 clusters -= cpos; 1450 clusters -= cpos;
1451 1451
1452 while (clusters) { 1452 while (clusters) {
1453 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1453 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1454 &alloc_size, NULL); 1454 &alloc_size, NULL);
1455 if (ret) { 1455 if (ret) {
1456 mlog_errno(ret); 1456 mlog_errno(ret);
1457 goto out; 1457 goto out;
1458 } 1458 }
1459 1459
1460 /* 1460 /*
1461 * Hole or existing extent len can be arbitrary, so 1461 * Hole or existing extent len can be arbitrary, so
1462 * cap it to our own allocation request. 1462 * cap it to our own allocation request.
1463 */ 1463 */
1464 if (alloc_size > clusters) 1464 if (alloc_size > clusters)
1465 alloc_size = clusters; 1465 alloc_size = clusters;
1466 1466
1467 if (phys_cpos) { 1467 if (phys_cpos) {
1468 /* 1468 /*
1469 * We already have an allocation at this 1469 * We already have an allocation at this
1470 * region so we can safely skip it. 1470 * region so we can safely skip it.
1471 */ 1471 */
1472 goto next; 1472 goto next;
1473 } 1473 }
1474 1474
1475 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); 1475 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1476 if (ret) { 1476 if (ret) {
1477 if (ret != -ENOSPC) 1477 if (ret != -ENOSPC)
1478 mlog_errno(ret); 1478 mlog_errno(ret);
1479 goto out; 1479 goto out;
1480 } 1480 }
1481 1481
1482 next: 1482 next:
1483 cpos += alloc_size; 1483 cpos += alloc_size;
1484 clusters -= alloc_size; 1484 clusters -= alloc_size;
1485 } 1485 }
1486 1486
1487 ret = 0; 1487 ret = 0;
1488 out: 1488 out:
1489 1489
1490 brelse(di_bh); 1490 brelse(di_bh);
1491 return ret; 1491 return ret;
1492 } 1492 }
1493 1493
1494 /* 1494 /*
1495 * Truncate a byte range, avoiding pages within partial clusters. This 1495 * Truncate a byte range, avoiding pages within partial clusters. This
1496 * preserves those pages for the zeroing code to write to. 1496 * preserves those pages for the zeroing code to write to.
1497 */ 1497 */
1498 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1498 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1499 u64 byte_len) 1499 u64 byte_len)
1500 { 1500 {
1501 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1501 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1502 loff_t start, end; 1502 loff_t start, end;
1503 struct address_space *mapping = inode->i_mapping; 1503 struct address_space *mapping = inode->i_mapping;
1504 1504
1505 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1505 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1506 end = byte_start + byte_len; 1506 end = byte_start + byte_len;
1507 end = end & ~(osb->s_clustersize - 1); 1507 end = end & ~(osb->s_clustersize - 1);
1508 1508
1509 if (start < end) { 1509 if (start < end) {
1510 unmap_mapping_range(mapping, start, end - start, 0); 1510 unmap_mapping_range(mapping, start, end - start, 0);
1511 truncate_inode_pages_range(mapping, start, end - 1); 1511 truncate_inode_pages_range(mapping, start, end - 1);
1512 } 1512 }
1513 } 1513 }
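
The alignment above rounds the start up and the end down to cluster boundaries, so pages in the partial edge clusters survive for the zeroing pass. A sketch with an assumed 32KB cluster:

        #include <stdio.h>

        #define CSIZE 32768ull  /* assumed cluster size */

        int main(void)
        {
                unsigned long long byte_start = 10000, byte_len = 200000;
                /* Round start up, end down, as the function above does. */
                unsigned long long start = (byte_start + CSIZE - 1) & ~(CSIZE - 1);
                unsigned long long end = (byte_start + byte_len) & ~(CSIZE - 1);

                if (start < end)
                        printf("drop pages in [%llu, %llu)\n", start, end);
                else
                        printf("range lies within one cluster; nothing to drop\n");
                return 0;
        }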
1514 1514
1515 static int ocfs2_zero_partial_clusters(struct inode *inode, 1515 static int ocfs2_zero_partial_clusters(struct inode *inode,
1516 u64 start, u64 len) 1516 u64 start, u64 len)
1517 { 1517 {
1518 int ret = 0; 1518 int ret = 0;
1519 u64 tmpend, end = start + len; 1519 u64 tmpend, end = start + len;
1520 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1520 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1521 unsigned int csize = osb->s_clustersize; 1521 unsigned int csize = osb->s_clustersize;
1522 handle_t *handle; 1522 handle_t *handle;
1523 1523
1524 /* 1524 /*
1525 * The "start" and "end" values are NOT necessarily part of 1525 * The "start" and "end" values are NOT necessarily part of
1526 * the range whose allocation is being deleted. Rather, this 1526 * the range whose allocation is being deleted. Rather, this
1527 * is what the user passed in with the request. We must zero 1527 * is what the user passed in with the request. We must zero
1528 * partial clusters here. There's no need to worry about 1528 * partial clusters here. There's no need to worry about
1529 * physical allocation - the zeroing code knows to skip holes. 1529 * physical allocation - the zeroing code knows to skip holes.
1530 */ 1530 */
1531 trace_ocfs2_zero_partial_clusters( 1531 trace_ocfs2_zero_partial_clusters(
1532 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1532 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1533 (unsigned long long)start, (unsigned long long)end); 1533 (unsigned long long)start, (unsigned long long)end);
1534 1534
1535 /* 1535 /*
1536 * If both edges are on a cluster boundary then there's no 1536 * If both edges are on a cluster boundary then there's no
1537 * zeroing required as the region is part of the allocation to 1537 * zeroing required as the region is part of the allocation to
1538 * be truncated. 1538 * be truncated.
1539 */ 1539 */
1540 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1540 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1541 goto out; 1541 goto out;
1542 1542
1543 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1543 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1544 if (IS_ERR(handle)) { 1544 if (IS_ERR(handle)) {
1545 ret = PTR_ERR(handle); 1545 ret = PTR_ERR(handle);
1546 mlog_errno(ret); 1546 mlog_errno(ret);
1547 goto out; 1547 goto out;
1548 } 1548 }
1549 1549
1550 /* 1550 /*
1551 * We want to get the byte offset of the end of the 1st cluster. 1551 * We want to get the byte offset of the end of the 1st cluster.
1552 */ 1552 */
1553 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1553 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1554 if (tmpend > end) 1554 if (tmpend > end)
1555 tmpend = end; 1555 tmpend = end;
1556 1556
1557 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, 1557 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
1558 (unsigned long long)tmpend); 1558 (unsigned long long)tmpend);
1559 1559
1560 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1560 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1561 if (ret) 1561 if (ret)
1562 mlog_errno(ret); 1562 mlog_errno(ret);
1563 1563
1564 if (tmpend < end) { 1564 if (tmpend < end) {
1565 /* 1565 /*
1566 * This may make start and end equal, but the zeroing 1566 * This may make start and end equal, but the zeroing
1567 * code will skip any work in that case so there's no 1567 * code will skip any work in that case so there's no
1568 * need to special-case it here. 1568 * need to special-case it here.
1569 */ 1569 */
1570 start = end & ~(osb->s_clustersize - 1); 1570 start = end & ~(osb->s_clustersize - 1);
1571 1571
1572 trace_ocfs2_zero_partial_clusters_range2( 1572 trace_ocfs2_zero_partial_clusters_range2(
1573 (unsigned long long)start, (unsigned long long)end); 1573 (unsigned long long)start, (unsigned long long)end);
1574 1574
1575 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1575 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1576 if (ret) 1576 if (ret)
1577 mlog_errno(ret); 1577 mlog_errno(ret);
1578 } 1578 }
1579 1579
1580 ocfs2_commit_trans(osb, handle); 1580 ocfs2_commit_trans(osb, handle);
1581 out: 1581 out:
1582 return ret; 1582 return ret;
1583 } 1583 }
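
Concretely, at most two sub-ranges are zeroed: from start to the end of its cluster, then from the last cluster boundary up to end. A worked sketch of those two computations, again assuming 32KB clusters:

        #include <stdio.h>

        #define CSIZE 32768ull  /* assumed cluster size */

        int main(void)
        {
                unsigned long long start = 10000, len = 200000;
                unsigned long long end = start + len;   /* 210000 */
                unsigned long long tmpend;

                if ((start & (CSIZE - 1)) == 0 && (end & (CSIZE - 1)) == 0) {
                        printf("cluster aligned; nothing to zero\n");
                        return 0;
                }
                /* Zero from start to the end of its cluster (capped at end)... */
                tmpend = CSIZE + (start & ~(CSIZE - 1));
                if (tmpend > end)
                        tmpend = end;
                printf("zero [%llu, %llu)\n", start, tmpend);
                /* ...then from the last cluster boundary up to end. */
                if (tmpend < end)
                        printf("zero [%llu, %llu)\n", end & ~(CSIZE - 1), end);
                return 0;
        }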
1584 1584
1585 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) 1585 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1586 { 1586 {
1587 int i; 1587 int i;
1588 struct ocfs2_extent_rec *rec = NULL; 1588 struct ocfs2_extent_rec *rec = NULL;
1589 1589
1590 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { 1590 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1591 1591
1592 rec = &el->l_recs[i]; 1592 rec = &el->l_recs[i];
1593 1593
1594 if (le32_to_cpu(rec->e_cpos) < pos) 1594 if (le32_to_cpu(rec->e_cpos) < pos)
1595 break; 1595 break;
1596 } 1596 }
1597 1597
1598 return i; 1598 return i;
1599 } 1599 }
1600 1600
1601 /* 1601 /*
1602 * Helper to calculate the punching pos and length in one run; we handle the 1602 * Helper to calculate the punching pos and length in one run; we handle the
1603 * following three cases in order: 1603 * following three cases in order:
1604 * 1604 *
1605 * - remove the entire record 1605 * - remove the entire record
1606 * - remove a partial record 1606 * - remove a partial record
1607 * - no record needs to be removed (hole-punching completed) 1607 * - no record needs to be removed (hole-punching completed)
1608 */ 1608 */
1609 static void ocfs2_calc_trunc_pos(struct inode *inode, 1609 static void ocfs2_calc_trunc_pos(struct inode *inode,
1610 struct ocfs2_extent_list *el, 1610 struct ocfs2_extent_list *el,
1611 struct ocfs2_extent_rec *rec, 1611 struct ocfs2_extent_rec *rec,
1612 u32 trunc_start, u32 *trunc_cpos, 1612 u32 trunc_start, u32 *trunc_cpos,
1613 u32 *trunc_len, u32 *trunc_end, 1613 u32 *trunc_len, u32 *trunc_end,
1614 u64 *blkno, int *done) 1614 u64 *blkno, int *done)
1615 { 1615 {
1616 int ret = 0; 1616 int ret = 0;
1617 u32 coff, range; 1617 u32 coff, range;
1618 1618
1619 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 1619 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1620 1620
1621 if (le32_to_cpu(rec->e_cpos) >= trunc_start) { 1621 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1622 /* 1622 /*
1623 * remove an entire extent record. 1623 * remove an entire extent record.
1624 */ 1624 */
1625 *trunc_cpos = le32_to_cpu(rec->e_cpos); 1625 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1626 /* 1626 /*
1627 * Skip holes if any. 1627 * Skip holes if any.
1628 */ 1628 */
1629 if (range < *trunc_end) 1629 if (range < *trunc_end)
1630 *trunc_end = range; 1630 *trunc_end = range;
1631 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); 1631 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1632 *blkno = le64_to_cpu(rec->e_blkno); 1632 *blkno = le64_to_cpu(rec->e_blkno);
1633 *trunc_end = le32_to_cpu(rec->e_cpos); 1633 *trunc_end = le32_to_cpu(rec->e_cpos);
1634 } else if (range > trunc_start) { 1634 } else if (range > trunc_start) {
1635 /* 1635 /*
1636 * remove a partial extent record, which means we're 1636 * remove a partial extent record, which means we're
1637 * removing the last extent record. 1637 * removing the last extent record.
1638 */ 1638 */
1639 *trunc_cpos = trunc_start; 1639 *trunc_cpos = trunc_start;
1640 /* 1640 /*
1641 * skip hole if any. 1641 * skip hole if any.
1642 */ 1642 */
1643 if (range < *trunc_end) 1643 if (range < *trunc_end)
1644 *trunc_end = range; 1644 *trunc_end = range;
1645 *trunc_len = *trunc_end - trunc_start; 1645 *trunc_len = *trunc_end - trunc_start;
1646 coff = trunc_start - le32_to_cpu(rec->e_cpos); 1646 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1647 *blkno = le64_to_cpu(rec->e_blkno) + 1647 *blkno = le64_to_cpu(rec->e_blkno) +
1648 ocfs2_clusters_to_blocks(inode->i_sb, coff); 1648 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1649 *trunc_end = trunc_start; 1649 *trunc_end = trunc_start;
1650 } else { 1650 } else {
1651 /* 1651 /*
1652 * There are two possibilities here: 1652 * There are two possibilities here:
1653 * 1653 *
1654 * - last record has been removed 1654 * - last record has been removed
1655 * - trunc_start was within a hole 1655 * - trunc_start was within a hole
1656 * 1656 *
1657 * either case means hole punching is complete. 1657 * either case means hole punching is complete.
1658 */ 1658 */
1659 ret = 1; 1659 ret = 1;
1660 } 1660 }
1661 1661
1662 *done = ret; 1662 *done = ret;
1663 } 1663 }
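
A plain-integer rendering of the three cases may make them concrete; the record and truncation window below are hypothetical:

        #include <stdio.h>

        /* The record covers clusters [e_cpos, range). */
        int main(void)
        {
                unsigned int e_cpos = 100, rec_clusters = 50;   /* record [100, 150) */
                unsigned int trunc_start = 80, trunc_end = 160;
                unsigned int range = e_cpos + rec_clusters;
                unsigned int trunc_cpos, trunc_len;

                if (e_cpos >= trunc_start) {
                        /* Case 1: the whole record goes; clamp past any hole. */
                        trunc_cpos = e_cpos;
                        if (range < trunc_end)
                                trunc_end = range;
                        trunc_len = trunc_end - e_cpos;
                        trunc_end = e_cpos;
                } else if (range > trunc_start) {
                        /* Case 2: only the record's tail goes (the last record). */
                        trunc_cpos = trunc_start;
                        if (range < trunc_end)
                                trunc_end = range;
                        trunc_len = trunc_end - trunc_start;
                        trunc_end = trunc_start;
                } else {
                        /* Case 3: nothing left to remove; punching is done. */
                        printf("done\n");
                        return 0;
                }
                printf("remove %u clusters at cpos %u, next trunc_end %u\n",
                       trunc_len, trunc_cpos, trunc_end);
                return 0;
        }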
1664 1664
1665 static int ocfs2_remove_inode_range(struct inode *inode, 1665 static int ocfs2_remove_inode_range(struct inode *inode,
1666 struct buffer_head *di_bh, u64 byte_start, 1666 struct buffer_head *di_bh, u64 byte_start,
1667 u64 byte_len) 1667 u64 byte_len)
1668 { 1668 {
1669 int ret = 0, flags = 0, done = 0, i; 1669 int ret = 0, flags = 0, done = 0, i;
1670 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; 1670 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1671 u32 cluster_in_el; 1671 u32 cluster_in_el;
1672 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1672 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1673 struct ocfs2_cached_dealloc_ctxt dealloc; 1673 struct ocfs2_cached_dealloc_ctxt dealloc;
1674 struct address_space *mapping = inode->i_mapping; 1674 struct address_space *mapping = inode->i_mapping;
1675 struct ocfs2_extent_tree et; 1675 struct ocfs2_extent_tree et;
1676 struct ocfs2_path *path = NULL; 1676 struct ocfs2_path *path = NULL;
1677 struct ocfs2_extent_list *el = NULL; 1677 struct ocfs2_extent_list *el = NULL;
1678 struct ocfs2_extent_rec *rec = NULL; 1678 struct ocfs2_extent_rec *rec = NULL;
1679 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1679 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1680 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); 1680 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1681 1681
1682 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1682 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1683 ocfs2_init_dealloc_ctxt(&dealloc); 1683 ocfs2_init_dealloc_ctxt(&dealloc);
1684 1684
1685 trace_ocfs2_remove_inode_range( 1685 trace_ocfs2_remove_inode_range(
1686 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1686 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1687 (unsigned long long)byte_start, 1687 (unsigned long long)byte_start,
1688 (unsigned long long)byte_len); 1688 (unsigned long long)byte_len);
1689 1689
1690 if (byte_len == 0) 1690 if (byte_len == 0)
1691 return 0; 1691 return 0;
1692 1692
1693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1694 ret = ocfs2_truncate_inline(inode, di_bh, byte_start, 1694 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1695 byte_start + byte_len, 0); 1695 byte_start + byte_len, 0);
1696 if (ret) { 1696 if (ret) {
1697 mlog_errno(ret); 1697 mlog_errno(ret);
1698 goto out; 1698 goto out;
1699 } 1699 }
1700 /* 1700 /*
1701 * There's no need to get fancy with the page cache 1701 * There's no need to get fancy with the page cache
1702 * truncate of an inline-data inode. We're talking 1702 * truncate of an inline-data inode. We're talking
1703 * about less than a page here, which will be cached 1703 * about less than a page here, which will be cached
1704 * in the dinode buffer anyway. 1704 * in the dinode buffer anyway.
1705 */ 1705 */
1706 unmap_mapping_range(mapping, 0, 0, 0); 1706 unmap_mapping_range(mapping, 0, 0, 0);
1707 truncate_inode_pages(mapping, 0); 1707 truncate_inode_pages(mapping, 0);
1708 goto out; 1708 goto out;
1709 } 1709 }
1710 1710
1711 /* 1711 /*
1712 * For reflinks, we may need to CoW two clusters which might be 1712 * For reflinks, we may need to CoW two clusters which might be
1713 * partially zeroed later, if the hole's start and end offsets fall 1713 * partially zeroed later, if the hole's start and end offsets fall
1714 * within one cluster (i.e. are not exactly aligned to the cluster size). 1714 * within one cluster (i.e. are not exactly aligned to the cluster size).
1715 */ 1715 */
1716 1716
1717 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { 1717 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1718 1718
1719 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); 1719 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1720 if (ret) { 1720 if (ret) {
1721 mlog_errno(ret); 1721 mlog_errno(ret);
1722 goto out; 1722 goto out;
1723 } 1723 }
1724 1724
1725 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); 1725 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1726 if (ret) { 1726 if (ret) {
1727 mlog_errno(ret); 1727 mlog_errno(ret);
1728 goto out; 1728 goto out;
1729 } 1729 }
1730 } 1730 }
1731 1731
1732 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1732 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1733 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; 1733 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1734 cluster_in_el = trunc_end; 1734 cluster_in_el = trunc_end;
1735 1735
1736 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1736 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1737 if (ret) { 1737 if (ret) {
1738 mlog_errno(ret); 1738 mlog_errno(ret);
1739 goto out; 1739 goto out;
1740 } 1740 }
1741 1741
1742 path = ocfs2_new_path_from_et(&et); 1742 path = ocfs2_new_path_from_et(&et);
1743 if (!path) { 1743 if (!path) {
1744 ret = -ENOMEM; 1744 ret = -ENOMEM;
1745 mlog_errno(ret); 1745 mlog_errno(ret);
1746 goto out; 1746 goto out;
1747 } 1747 }
1748 1748
1749 while (trunc_end > trunc_start) { 1749 while (trunc_end > trunc_start) {
1750 1750
1751 ret = ocfs2_find_path(INODE_CACHE(inode), path, 1751 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1752 cluster_in_el); 1752 cluster_in_el);
1753 if (ret) { 1753 if (ret) {
1754 mlog_errno(ret); 1754 mlog_errno(ret);
1755 goto out; 1755 goto out;
1756 } 1756 }
1757 1757
1758 el = path_leaf_el(path); 1758 el = path_leaf_el(path);
1759 1759
1760 i = ocfs2_find_rec(el, trunc_end); 1760 i = ocfs2_find_rec(el, trunc_end);
1761 /* 1761 /*
1762 * Need to go to previous extent block. 1762 * Need to go to previous extent block.
1763 */ 1763 */
1764 if (i < 0) { 1764 if (i < 0) {
1765 if (path->p_tree_depth == 0) 1765 if (path->p_tree_depth == 0)
1766 break; 1766 break;
1767 1767
1768 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 1768 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1769 path, 1769 path,
1770 &cluster_in_el); 1770 &cluster_in_el);
1771 if (ret) { 1771 if (ret) {
1772 mlog_errno(ret); 1772 mlog_errno(ret);
1773 goto out; 1773 goto out;
1774 } 1774 }
1775 1775
1776 /* 1776 /*
1777 * We've reached the leftmost extent block, 1777 * We've reached the leftmost extent block,
1778 * it's safe to leave. 1778 * it's safe to leave.
1779 */ 1779 */
1780 if (cluster_in_el == 0) 1780 if (cluster_in_el == 0)
1781 break; 1781 break;
1782 1782
1783 /* 1783 /*
1784 * The 'pos' used to search for the previous extent block 1784 * The 'pos' used to search for the previous extent block
1785 * is always one cluster less than the actual trunc_end. 1785 * is always one cluster less than the actual trunc_end.
1786 */ 1786 */
1787 trunc_end = cluster_in_el + 1; 1787 trunc_end = cluster_in_el + 1;
1788 1788
1789 ocfs2_reinit_path(path, 1); 1789 ocfs2_reinit_path(path, 1);
1790 1790
1791 continue; 1791 continue;
1792 1792
1793 } else 1793 } else
1794 rec = &el->l_recs[i]; 1794 rec = &el->l_recs[i];
1795 1795
1796 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, 1796 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1797 &trunc_len, &trunc_end, &blkno, &done); 1797 &trunc_len, &trunc_end, &blkno, &done);
1798 if (done) 1798 if (done)
1799 break; 1799 break;
1800 1800
1801 flags = rec->e_flags; 1801 flags = rec->e_flags;
1802 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 1802 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1803 1803
1804 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, 1804 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1805 phys_cpos, trunc_len, flags, 1805 phys_cpos, trunc_len, flags,
1806 &dealloc, refcount_loc); 1806 &dealloc, refcount_loc);
1807 if (ret < 0) { 1807 if (ret < 0) {
1808 mlog_errno(ret); 1808 mlog_errno(ret);
1809 goto out; 1809 goto out;
1810 } 1810 }
1811 1811
1812 cluster_in_el = trunc_end; 1812 cluster_in_el = trunc_end;
1813 1813
1814 ocfs2_reinit_path(path, 1); 1814 ocfs2_reinit_path(path, 1);
1815 } 1815 }
1816 1816
1817 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1817 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1818 1818
1819 out: 1819 out:
1820 ocfs2_schedule_truncate_log_flush(osb, 1); 1820 ocfs2_schedule_truncate_log_flush(osb, 1);
1821 ocfs2_run_deallocs(osb, &dealloc); 1821 ocfs2_run_deallocs(osb, &dealloc);
1822 1822
1823 return ret; 1823 return ret;
1824 } 1824 }
1825 1825
1826 /* 1826 /*
1827 * Parts of this function taken from xfs_change_file_space() 1827 * Parts of this function taken from xfs_change_file_space()
1828 */ 1828 */
1829 static int __ocfs2_change_file_space(struct file *file, struct inode *inode, 1829 static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1830 loff_t f_pos, unsigned int cmd, 1830 loff_t f_pos, unsigned int cmd,
1831 struct ocfs2_space_resv *sr, 1831 struct ocfs2_space_resv *sr,
1832 int change_size) 1832 int change_size)
1833 { 1833 {
1834 int ret; 1834 int ret;
1835 s64 llen; 1835 s64 llen;
1836 loff_t size; 1836 loff_t size;
1837 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1837 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1838 struct buffer_head *di_bh = NULL; 1838 struct buffer_head *di_bh = NULL;
1839 handle_t *handle; 1839 handle_t *handle;
1840 unsigned long long max_off = inode->i_sb->s_maxbytes; 1840 unsigned long long max_off = inode->i_sb->s_maxbytes;
1841 1841
1842 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 1842 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1843 return -EROFS; 1843 return -EROFS;
1844 1844
1845 mutex_lock(&inode->i_mutex); 1845 mutex_lock(&inode->i_mutex);
1846 1846
1847 /* 1847 /*
1848 * This prevents concurrent writes from other nodes 1848 * This prevents concurrent writes from other nodes
1849 */ 1849 */
1850 ret = ocfs2_rw_lock(inode, 1); 1850 ret = ocfs2_rw_lock(inode, 1);
1851 if (ret) { 1851 if (ret) {
1852 mlog_errno(ret); 1852 mlog_errno(ret);
1853 goto out; 1853 goto out;
1854 } 1854 }
1855 1855
1856 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1856 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1857 if (ret) { 1857 if (ret) {
1858 mlog_errno(ret); 1858 mlog_errno(ret);
1859 goto out_rw_unlock; 1859 goto out_rw_unlock;
1860 } 1860 }
1861 1861
1862 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1862 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1863 ret = -EPERM; 1863 ret = -EPERM;
1864 goto out_inode_unlock; 1864 goto out_inode_unlock;
1865 } 1865 }
1866 1866
1867 switch (sr->l_whence) { 1867 switch (sr->l_whence) {
1868 case 0: /*SEEK_SET*/ 1868 case 0: /*SEEK_SET*/
1869 break; 1869 break;
1870 case 1: /*SEEK_CUR*/ 1870 case 1: /*SEEK_CUR*/
1871 sr->l_start += f_pos; 1871 sr->l_start += f_pos;
1872 break; 1872 break;
1873 case 2: /*SEEK_END*/ 1873 case 2: /*SEEK_END*/
1874 sr->l_start += i_size_read(inode); 1874 sr->l_start += i_size_read(inode);
1875 break; 1875 break;
1876 default: 1876 default:
1877 ret = -EINVAL; 1877 ret = -EINVAL;
1878 goto out_inode_unlock; 1878 goto out_inode_unlock;
1879 } 1879 }
1880 sr->l_whence = 0; 1880 sr->l_whence = 0;
1881 1881
1882 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; 1882 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1883 1883
1884 if (sr->l_start < 0 1884 if (sr->l_start < 0
1885 || sr->l_start > max_off 1885 || sr->l_start > max_off
1886 || (sr->l_start + llen) < 0 1886 || (sr->l_start + llen) < 0
1887 || (sr->l_start + llen) > max_off) { 1887 || (sr->l_start + llen) > max_off) {
1888 ret = -EINVAL; 1888 ret = -EINVAL;
1889 goto out_inode_unlock; 1889 goto out_inode_unlock;
1890 } 1890 }
1891 size = sr->l_start + sr->l_len; 1891 size = sr->l_start + sr->l_len;
1892 1892
1893 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1893 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1894 if (sr->l_len <= 0) { 1894 if (sr->l_len <= 0) {
1895 ret = -EINVAL; 1895 ret = -EINVAL;
1896 goto out_inode_unlock; 1896 goto out_inode_unlock;
1897 } 1897 }
1898 } 1898 }
1899 1899
1900 if (file && should_remove_suid(file->f_path.dentry)) { 1900 if (file && should_remove_suid(file->f_path.dentry)) {
1901 ret = __ocfs2_write_remove_suid(inode, di_bh); 1901 ret = __ocfs2_write_remove_suid(inode, di_bh);
1902 if (ret) { 1902 if (ret) {
1903 mlog_errno(ret); 1903 mlog_errno(ret);
1904 goto out_inode_unlock; 1904 goto out_inode_unlock;
1905 } 1905 }
1906 } 1906 }
1907 1907
1908 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1908 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1909 switch (cmd) { 1909 switch (cmd) {
1910 case OCFS2_IOC_RESVSP: 1910 case OCFS2_IOC_RESVSP:
1911 case OCFS2_IOC_RESVSP64: 1911 case OCFS2_IOC_RESVSP64:
1912 /* 1912 /*
1913 * This takes unsigned offsets, but the signed ones we 1913 * This takes unsigned offsets, but the signed ones we
1914 * pass have been checked against overflow above. 1914 * pass have been checked against overflow above.
1915 */ 1915 */
1916 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, 1916 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1917 sr->l_len); 1917 sr->l_len);
1918 break; 1918 break;
1919 case OCFS2_IOC_UNRESVSP: 1919 case OCFS2_IOC_UNRESVSP:
1920 case OCFS2_IOC_UNRESVSP64: 1920 case OCFS2_IOC_UNRESVSP64:
1921 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, 1921 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1922 sr->l_len); 1922 sr->l_len);
1923 break; 1923 break;
1924 default: 1924 default:
1925 ret = -EINVAL; 1925 ret = -EINVAL;
1926 } 1926 }
1927 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1927 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1928 if (ret) { 1928 if (ret) {
1929 mlog_errno(ret); 1929 mlog_errno(ret);
1930 goto out_inode_unlock; 1930 goto out_inode_unlock;
1931 } 1931 }
1932 1932
1933 /* 1933 /*
1934 * We update c/mtime for these changes 1934 * We update c/mtime for these changes
1935 */ 1935 */
1936 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1936 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1937 if (IS_ERR(handle)) { 1937 if (IS_ERR(handle)) {
1938 ret = PTR_ERR(handle); 1938 ret = PTR_ERR(handle);
1939 mlog_errno(ret); 1939 mlog_errno(ret);
1940 goto out_inode_unlock; 1940 goto out_inode_unlock;
1941 } 1941 }
1942 1942
1943 if (change_size && i_size_read(inode) < size) 1943 if (change_size && i_size_read(inode) < size)
1944 i_size_write(inode, size); 1944 i_size_write(inode, size);
1945 1945
1946 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1946 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1947 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 1947 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1948 if (ret < 0) 1948 if (ret < 0)
1949 mlog_errno(ret); 1949 mlog_errno(ret);
1950 1950
1951 if (file && (file->f_flags & O_SYNC)) 1951 if (file && (file->f_flags & O_SYNC))
1952 handle->h_sync = 1; 1952 handle->h_sync = 1;
1953 1953
1954 ocfs2_commit_trans(osb, handle); 1954 ocfs2_commit_trans(osb, handle);
1955 1955
1956 out_inode_unlock: 1956 out_inode_unlock:
1957 brelse(di_bh); 1957 brelse(di_bh);
1958 ocfs2_inode_unlock(inode, 1); 1958 ocfs2_inode_unlock(inode, 1);
1959 out_rw_unlock: 1959 out_rw_unlock:
1960 ocfs2_rw_unlock(inode, 1); 1960 ocfs2_rw_unlock(inode, 1);
1961 1961
1962 out: 1962 out:
1963 mutex_unlock(&inode->i_mutex); 1963 mutex_unlock(&inode->i_mutex);
1964 return ret; 1964 return ret;
1965 } 1965 }
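
The l_whence normalization above mirrors lseek()'s whence semantics (fitting, given this commit). A standalone restatement of just that switch:

        #include <stdio.h>

        /* Resolve a reservation start to an absolute offset, as the
         * l_whence switch in __ocfs2_change_file_space() does; the
         * values in main() are hypothetical. */
        static long long resolve_start(int whence, long long l_start,
                                       long long f_pos, long long i_size)
        {
                switch (whence) {
                case 0: return l_start;          /* SEEK_SET: absolute */
                case 1: return l_start + f_pos;  /* SEEK_CUR: from file pos */
                case 2: return l_start + i_size; /* SEEK_END: from EOF */
                default: return -1;              /* rejected with -EINVAL */
                }
        }

        int main(void)
        {
                /* 4KB back from a 1MB EOF -> 1044480. */
                printf("%lld\n", resolve_start(2, -4096, 0, 1 << 20));
                return 0;
        }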
1966 1966
1967 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1967 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1968 struct ocfs2_space_resv *sr) 1968 struct ocfs2_space_resv *sr)
1969 { 1969 {
1970 struct inode *inode = file->f_path.dentry->d_inode; 1970 struct inode *inode = file->f_path.dentry->d_inode;
1971 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1971 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1972 int ret; 1972 int ret;
1973 1973
1974 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1974 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1975 !ocfs2_writes_unwritten_extents(osb)) 1975 !ocfs2_writes_unwritten_extents(osb))
1976 return -ENOTTY; 1976 return -ENOTTY;
1977 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1977 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1978 !ocfs2_sparse_alloc(osb)) 1978 !ocfs2_sparse_alloc(osb))
1979 return -ENOTTY; 1979 return -ENOTTY;
1980 1980
1981 if (!S_ISREG(inode->i_mode)) 1981 if (!S_ISREG(inode->i_mode))
1982 return -EINVAL; 1982 return -EINVAL;
1983 1983
1984 if (!(file->f_mode & FMODE_WRITE)) 1984 if (!(file->f_mode & FMODE_WRITE))
1985 return -EBADF; 1985 return -EBADF;
1986 1986
1987 ret = mnt_want_write_file(file); 1987 ret = mnt_want_write_file(file);
1988 if (ret) 1988 if (ret)
1989 return ret; 1989 return ret;
1990 ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1990 ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1991 mnt_drop_write_file(file); 1991 mnt_drop_write_file(file);
1992 return ret; 1992 return ret;
1993 } 1993 }
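
For completeness, a hedged userspace sketch of the UNRESVSP64 path. It assumes struct ocfs2_space_resv and OCFS2_IOC_UNRESVSP64 are available from the ocfs2 headers (in the kernel tree they live in ocfs2_fs.h), and the file path is hypothetical:

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>
        /* Assumes struct ocfs2_space_resv and OCFS2_IOC_UNRESVSP64
         * come from the ocfs2 headers (ocfs2_fs.h in-tree). */

        int main(void)
        {
                struct ocfs2_space_resv sr = {
                        .l_whence = 0,          /* l_start is absolute */
                        .l_start = 1 << 20,
                        .l_len = 1 << 20,       /* deallocate 1MB at 1MB */
                };
                int fd = open("/mnt/ocfs2/testfile", O_WRONLY);

                if (fd < 0 || ioctl(fd, OCFS2_IOC_UNRESVSP64, &sr) < 0)
                        perror("unresvsp64");
                close(fd);
                return 0;
        }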
1994 1994
1995 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, 1995 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1996 loff_t len) 1996 loff_t len)
1997 { 1997 {
1998 struct inode *inode = file->f_path.dentry->d_inode; 1998 struct inode *inode = file->f_path.dentry->d_inode;
1999 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1999 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2000 struct ocfs2_space_resv sr; 2000 struct ocfs2_space_resv sr;
2001 int change_size = 1; 2001 int change_size = 1;
2002 int cmd = OCFS2_IOC_RESVSP64; 2002 int cmd = OCFS2_IOC_RESVSP64;
2003 2003
2004 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2004 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2005 return -EOPNOTSUPP; 2005 return -EOPNOTSUPP;
2006 if (!ocfs2_writes_unwritten_extents(osb)) 2006 if (!ocfs2_writes_unwritten_extents(osb))
2007 return -EOPNOTSUPP; 2007 return -EOPNOTSUPP;
2008 2008
2009 if (mode & FALLOC_FL_KEEP_SIZE) 2009 if (mode & FALLOC_FL_KEEP_SIZE)
2010 change_size = 0; 2010 change_size = 0;
2011 2011
2012 if (mode & FALLOC_FL_PUNCH_HOLE) 2012 if (mode & FALLOC_FL_PUNCH_HOLE)
2013 cmd = OCFS2_IOC_UNRESVSP64; 2013 cmd = OCFS2_IOC_UNRESVSP64;
2014 2014
2015 sr.l_whence = 0; 2015 sr.l_whence = 0;
2016 sr.l_start = (s64)offset; 2016 sr.l_start = (s64)offset;
2017 sr.l_len = (s64)len; 2017 sr.l_len = (s64)len;
2018 2018
2019 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, 2019 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2020 change_size); 2020 change_size);
2021 } 2021 }
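
So on ocfs2 a plain fallocate() reserves unwritten extents, FALLOC_FL_KEEP_SIZE leaves i_size untouched, and FALLOC_FL_PUNCH_HOLE routes to the deallocation path. A userspace sketch (hypothetical path):

        #define _GNU_SOURCE             /* for fallocate() */
        #include <fcntl.h>
        #include <linux/falloc.h>       /* FALLOC_FL_* flags */
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/mnt/ocfs2/testfile", O_WRONLY);

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                /* Reserve 4MB of unwritten extents without moving i_size. */
                if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 << 20) < 0)
                        perror("fallocate");
                /* Punch the first 1MB back out; i_size again stays put. */
                if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                              0, 1 << 20) < 0)
                        perror("punch");
                close(fd);
                return 0;
        }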
2022 2022
2023 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2023 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2024 size_t count) 2024 size_t count)
2025 { 2025 {
2026 int ret = 0; 2026 int ret = 0;
2027 unsigned int extent_flags; 2027 unsigned int extent_flags;
2028 u32 cpos, clusters, extent_len, phys_cpos; 2028 u32 cpos, clusters, extent_len, phys_cpos;
2029 struct super_block *sb = inode->i_sb; 2029 struct super_block *sb = inode->i_sb;
2030 2030
2031 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || 2031 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2032 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || 2032 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
2033 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2033 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2034 return 0; 2034 return 0;
2035 2035
2036 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 2036 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2037 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 2037 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2038 2038
2039 while (clusters) { 2039 while (clusters) {
2040 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 2040 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2041 &extent_flags); 2041 &extent_flags);
2042 if (ret < 0) { 2042 if (ret < 0) {
2043 mlog_errno(ret); 2043 mlog_errno(ret);
2044 goto out; 2044 goto out;
2045 } 2045 }
2046 2046
2047 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { 2047 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2048 ret = 1; 2048 ret = 1;
2049 break; 2049 break;
2050 } 2050 }
2051 2051
2052 if (extent_len > clusters) 2052 if (extent_len > clusters)
2053 extent_len = clusters; 2053 extent_len = clusters;
2054 2054
2055 clusters -= extent_len; 2055 clusters -= extent_len;
2056 cpos += extent_len; 2056 cpos += extent_len;
2057 } 2057 }
2058 out: 2058 out:
2059 return ret; 2059 return ret;
2060 } 2060 }
2061 2061
2062 static void ocfs2_aiodio_wait(struct inode *inode) 2062 static void ocfs2_aiodio_wait(struct inode *inode)
2063 { 2063 {
2064 wait_queue_head_t *wq = ocfs2_ioend_wq(inode); 2064 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2065 2065
2066 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); 2066 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2067 } 2067 }
2068 2068
2069 static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) 2069 static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2070 { 2070 {
2071 int blockmask = inode->i_sb->s_blocksize - 1; 2071 int blockmask = inode->i_sb->s_blocksize - 1;
2072 loff_t final_size = pos + count; 2072 loff_t final_size = pos + count;
2073 2073
2074 if ((pos & blockmask) || (final_size & blockmask)) 2074 if ((pos & blockmask) || (final_size & blockmask))
2075 return 1; 2075 return 1;
2076 return 0; 2076 return 0;
2077 } 2077 }
2078 2078
2079 static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2079 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2080 struct file *file, 2080 struct file *file,
2081 loff_t pos, size_t count, 2081 loff_t pos, size_t count,
2082 int *meta_level) 2082 int *meta_level)
2083 { 2083 {
2084 int ret; 2084 int ret;
2085 struct buffer_head *di_bh = NULL; 2085 struct buffer_head *di_bh = NULL;
2086 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 2086 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2087 u32 clusters = 2087 u32 clusters =
2088 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; 2088 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2089 2089
2090 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2090 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2091 if (ret) { 2091 if (ret) {
2092 mlog_errno(ret); 2092 mlog_errno(ret);
2093 goto out; 2093 goto out;
2094 } 2094 }
2095 2095
2096 *meta_level = 1; 2096 *meta_level = 1;
2097 2097
2098 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); 2098 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2099 if (ret) 2099 if (ret)
2100 mlog_errno(ret); 2100 mlog_errno(ret);
2101 out: 2101 out:
2102 brelse(di_bh); 2102 brelse(di_bh);
2103 return ret; 2103 return ret;
2104 } 2104 }
2105 2105
2106 static int ocfs2_prepare_inode_for_write(struct file *file, 2106 static int ocfs2_prepare_inode_for_write(struct file *file,
2107 loff_t *ppos, 2107 loff_t *ppos,
2108 size_t count, 2108 size_t count,
2109 int appending, 2109 int appending,
2110 int *direct_io, 2110 int *direct_io,
2111 int *has_refcount) 2111 int *has_refcount)
2112 { 2112 {
2113 int ret = 0, meta_level = 0; 2113 int ret = 0, meta_level = 0;
2114 struct dentry *dentry = file->f_path.dentry; 2114 struct dentry *dentry = file->f_path.dentry;
2115 struct inode *inode = dentry->d_inode; 2115 struct inode *inode = dentry->d_inode;
2116 loff_t saved_pos = 0, end; 2116 loff_t saved_pos = 0, end;
2117 2117
2118 /* 2118 /*
2119 * We start with a read-level meta lock and only jump to an 2119 * We start with a read-level meta lock and only jump to an
2120 * exclusive lock if we need to make modifications here. 2120 * exclusive lock if we need to make modifications here.
2121 */ 2121 */
2122 for(;;) { 2122 for(;;) {
2123 ret = ocfs2_inode_lock(inode, NULL, meta_level); 2123 ret = ocfs2_inode_lock(inode, NULL, meta_level);
2124 if (ret < 0) { 2124 if (ret < 0) {
2125 meta_level = -1; 2125 meta_level = -1;
2126 mlog_errno(ret); 2126 mlog_errno(ret);
2127 goto out; 2127 goto out;
2128 } 2128 }
2129 2129
2130 /* Clear suid / sgid if necessary. We do this here 2130 /* Clear suid / sgid if necessary. We do this here
2131 * instead of later in the write path because 2131 * instead of later in the write path because
2132 * remove_suid() calls ->setattr without any hint that 2132 * remove_suid() calls ->setattr without any hint that
2133 * we may have already done our cluster locking. Since 2133 * we may have already done our cluster locking. Since
2134 * ocfs2_setattr() *must* take cluster locks to 2134 * ocfs2_setattr() *must* take cluster locks to
2135 * proceed, this will lead us to recursively lock the 2135 * proceed, this will lead us to recursively lock the
2136 * inode. There's also the dinode i_size state which 2136 * inode. There's also the dinode i_size state which
2137 * can be lost via setattr during extending writes (we 2137 * can be lost via setattr during extending writes (we
2138 * set inode->i_size at the end of a write). */ 2138 * set inode->i_size at the end of a write). */
2139 if (should_remove_suid(dentry)) { 2139 if (should_remove_suid(dentry)) {
2140 if (meta_level == 0) { 2140 if (meta_level == 0) {
2141 ocfs2_inode_unlock(inode, meta_level); 2141 ocfs2_inode_unlock(inode, meta_level);
2142 meta_level = 1; 2142 meta_level = 1;
2143 continue; 2143 continue;
2144 } 2144 }
2145 2145
2146 ret = ocfs2_write_remove_suid(inode); 2146 ret = ocfs2_write_remove_suid(inode);
2147 if (ret < 0) { 2147 if (ret < 0) {
2148 mlog_errno(ret); 2148 mlog_errno(ret);
2149 goto out_unlock; 2149 goto out_unlock;
2150 } 2150 }
2151 } 2151 }
2152 2152
2153 /* work on a copy of ppos until we're sure that we won't have 2153 /* work on a copy of ppos until we're sure that we won't have
2154 * to recalculate it due to relocking. */ 2154 * to recalculate it due to relocking. */
2155 if (appending) 2155 if (appending)
2156 saved_pos = i_size_read(inode); 2156 saved_pos = i_size_read(inode);
2157 else 2157 else
2158 saved_pos = *ppos; 2158 saved_pos = *ppos;
2159 2159
2160 end = saved_pos + count; 2160 end = saved_pos + count;
2161 2161
2162 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); 2162 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
2163 if (ret == 1) { 2163 if (ret == 1) {
2164 ocfs2_inode_unlock(inode, meta_level); 2164 ocfs2_inode_unlock(inode, meta_level);
2165 meta_level = -1; 2165 meta_level = -1;
2166 2166
2167 ret = ocfs2_prepare_inode_for_refcount(inode, 2167 ret = ocfs2_prepare_inode_for_refcount(inode,
2168 file, 2168 file,
2169 saved_pos, 2169 saved_pos,
2170 count, 2170 count,
2171 &meta_level); 2171 &meta_level);
2172 if (has_refcount) 2172 if (has_refcount)
2173 *has_refcount = 1; 2173 *has_refcount = 1;
2174 if (direct_io) 2174 if (direct_io)
2175 *direct_io = 0; 2175 *direct_io = 0;
2176 } 2176 }
2177 2177
2178 if (ret < 0) { 2178 if (ret < 0) {
2179 mlog_errno(ret); 2179 mlog_errno(ret);
2180 goto out_unlock; 2180 goto out_unlock;
2181 } 2181 }
2182 2182
2183 /* 2183 /*
2184 * Skip the O_DIRECT checks if we don't need 2184 * Skip the O_DIRECT checks if we don't need
2185 * them. 2185 * them.
2186 */ 2186 */
2187 if (!direct_io || !(*direct_io)) 2187 if (!direct_io || !(*direct_io))
2188 break; 2188 break;
2189 2189
2190 /* 2190 /*
2191 * There's no sane way to do direct writes to an inode 2191 * There's no sane way to do direct writes to an inode
2192 * with inline data. 2192 * with inline data.
2193 */ 2193 */
2194 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2194 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2195 *direct_io = 0; 2195 *direct_io = 0;
2196 break; 2196 break;
2197 } 2197 }
2198 2198
2199 /* 2199 /*
2200 * Allowing concurrent direct writes means 2200 * Allowing concurrent direct writes means
2201 * i_size changes wouldn't be synchronized, so 2201 * i_size changes wouldn't be synchronized, so
2202 * one node could wind up truncating another 2202 * one node could wind up truncating another
2203 * node's writes. 2203 * node's writes.
2204 */ 2204 */
2205 if (end > i_size_read(inode)) { 2205 if (end > i_size_read(inode)) {
2206 *direct_io = 0; 2206 *direct_io = 0;
2207 break; 2207 break;
2208 } 2208 }
2209 2209
2210 /* 2210 /*
2211 * We don't fill holes during direct io, so 2211 * We don't fill holes during direct io, so
2212 * check for them here. If any are found, the 2212 * check for them here. If any are found, the
2213 * caller will have to retake some cluster 2213 * caller will have to retake some cluster
2214 * locks and initiate the io as buffered. 2214 * locks and initiate the io as buffered.
2215 */ 2215 */
2216 ret = ocfs2_check_range_for_holes(inode, saved_pos, count); 2216 ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
2217 if (ret == 1) { 2217 if (ret == 1) {
2218 *direct_io = 0; 2218 *direct_io = 0;
2219 ret = 0; 2219 ret = 0;
2220 } else if (ret < 0) 2220 } else if (ret < 0)
2221 mlog_errno(ret); 2221 mlog_errno(ret);
2222 break; 2222 break;
2223 } 2223 }
2224 2224
2225 if (appending) 2225 if (appending)
2226 *ppos = saved_pos; 2226 *ppos = saved_pos;
2227 2227
2228 out_unlock: 2228 out_unlock:
2229 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2229 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2230 saved_pos, appending, count, 2230 saved_pos, appending, count,
2231 direct_io, has_refcount); 2231 direct_io, has_refcount);
2232 2232
2233 if (meta_level >= 0) 2233 if (meta_level >= 0)
2234 ocfs2_inode_unlock(inode, meta_level); 2234 ocfs2_inode_unlock(inode, meta_level);
2235 2235
2236 out: 2236 out:
2237 return ret; 2237 return ret;
2238 } 2238 }
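The loop above uses a drop-and-upgrade locking pattern: the checks run under the shared meta lock (meta_level 0), and when removing the SUID bit demands the exclusive level, the lock is dropped and retaken at level 1, after which every check is redone because the inode may have changed while unlocked. A minimal self-contained sketch of that shape, with hypothetical take_lock()/drop_lock()/needs_exclusive() stand-ins rather than any real ocfs2 call:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for cluster lock operations. */
static void take_lock(int level) { printf("lock taken at level %d\n", level); }
static void drop_lock(int level) { printf("lock dropped at level %d\n", level); }
static bool needs_exclusive(void) { return true; }

int main(void)
{
	int level = 0;		/* 0 = shared, 1 = exclusive */

	for (;;) {
		take_lock(level);
		if (level == 0 && needs_exclusive()) {
			/* Lock too weak: drop it, upgrade, and redo all
			 * checks, since state may move while unlocked. */
			drop_lock(level);
			level = 1;
			continue;
		}
		break;		/* all checks passed at this level */
	}
	drop_lock(level);
	return 0;
}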
2239 2239
2240 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 2240 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2241 const struct iovec *iov, 2241 const struct iovec *iov,
2242 unsigned long nr_segs, 2242 unsigned long nr_segs,
2243 loff_t pos) 2243 loff_t pos)
2244 { 2244 {
2245 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 2245 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
2246 int can_do_direct, has_refcount = 0; 2246 int can_do_direct, has_refcount = 0;
2247 ssize_t written = 0; 2247 ssize_t written = 0;
2248 size_t ocount; /* original count */ 2248 size_t ocount; /* original count */
2249 size_t count; /* after file limit checks */ 2249 size_t count; /* after file limit checks */
2250 loff_t old_size, *ppos = &iocb->ki_pos; 2250 loff_t old_size, *ppos = &iocb->ki_pos;
2251 u32 old_clusters; 2251 u32 old_clusters;
2252 struct file *file = iocb->ki_filp; 2252 struct file *file = iocb->ki_filp;
2253 struct inode *inode = file->f_path.dentry->d_inode; 2253 struct inode *inode = file->f_path.dentry->d_inode;
2254 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2254 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2255 int full_coherency = !(osb->s_mount_opt & 2255 int full_coherency = !(osb->s_mount_opt &
2256 OCFS2_MOUNT_COHERENCY_BUFFERED); 2256 OCFS2_MOUNT_COHERENCY_BUFFERED);
2257 int unaligned_dio = 0; 2257 int unaligned_dio = 0;
2258 2258
2259 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2259 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2260 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2260 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2261 file->f_path.dentry->d_name.len, 2261 file->f_path.dentry->d_name.len,
2262 file->f_path.dentry->d_name.name, 2262 file->f_path.dentry->d_name.name,
2263 (unsigned int)nr_segs); 2263 (unsigned int)nr_segs);
2264 2264
2265 if (iocb->ki_left == 0) 2265 if (iocb->ki_left == 0)
2266 return 0; 2266 return 0;
2267 2267
2268 sb_start_write(inode->i_sb); 2268 sb_start_write(inode->i_sb);
2269 2269
2270 appending = file->f_flags & O_APPEND ? 1 : 0; 2270 appending = file->f_flags & O_APPEND ? 1 : 0;
2271 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 2271 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
2272 2272
2273 mutex_lock(&inode->i_mutex); 2273 mutex_lock(&inode->i_mutex);
2274 2274
2275 ocfs2_iocb_clear_sem_locked(iocb); 2275 ocfs2_iocb_clear_sem_locked(iocb);
2276 2276
2277 relock: 2277 relock:
2278 /* to match setattr's i_mutex -> rw_lock ordering */ 2278 /* to match setattr's i_mutex -> rw_lock ordering */
2279 if (direct_io) { 2279 if (direct_io) {
2280 have_alloc_sem = 1; 2280 have_alloc_sem = 1;
2281 /* communicate with ocfs2_dio_end_io */ 2281 /* communicate with ocfs2_dio_end_io */
2282 ocfs2_iocb_set_sem_locked(iocb); 2282 ocfs2_iocb_set_sem_locked(iocb);
2283 } 2283 }
2284 2284
2285 /* 2285 /*
2286 * Concurrent O_DIRECT writes are allowed with 2286 * Concurrent O_DIRECT writes are allowed with
2287 * mount option "coherency=buffered". 2287 * mount option "coherency=buffered".
2288 */ 2288 */
2289 rw_level = (!direct_io || full_coherency); 2289 rw_level = (!direct_io || full_coherency);
2290 2290
2291 ret = ocfs2_rw_lock(inode, rw_level); 2291 ret = ocfs2_rw_lock(inode, rw_level);
2292 if (ret < 0) { 2292 if (ret < 0) {
2293 mlog_errno(ret); 2293 mlog_errno(ret);
2294 goto out_sems; 2294 goto out_sems;
2295 } 2295 }
2296 2296
2297 /* 2297 /*
2298 * O_DIRECT writes with "coherency=full" need to take EX cluster 2298 * O_DIRECT writes with "coherency=full" need to take EX cluster
2299 * inode_lock to guarantee coherency. 2299 * inode_lock to guarantee coherency.
2300 */ 2300 */
2301 if (direct_io && full_coherency) { 2301 if (direct_io && full_coherency) {
2302 /* 2302 /*
2303 * We need to take and drop the inode lock to force 2303 * We need to take and drop the inode lock to force
2304 * other nodes to drop their caches. Buffered I/O 2304 * other nodes to drop their caches. Buffered I/O
2305 * already does this in write_begin(). 2305 * already does this in write_begin().
2306 */ 2306 */
2307 ret = ocfs2_inode_lock(inode, NULL, 1); 2307 ret = ocfs2_inode_lock(inode, NULL, 1);
2308 if (ret < 0) { 2308 if (ret < 0) {
2309 mlog_errno(ret); 2309 mlog_errno(ret);
2310 goto out_sems; 2310 goto out_sems;
2311 } 2311 }
2312 2312
2313 ocfs2_inode_unlock(inode, 1); 2313 ocfs2_inode_unlock(inode, 1);
2314 } 2314 }
2315 2315
2316 can_do_direct = direct_io; 2316 can_do_direct = direct_io;
2317 ret = ocfs2_prepare_inode_for_write(file, ppos, 2317 ret = ocfs2_prepare_inode_for_write(file, ppos,
2318 iocb->ki_left, appending, 2318 iocb->ki_left, appending,
2319 &can_do_direct, &has_refcount); 2319 &can_do_direct, &has_refcount);
2320 if (ret < 0) { 2320 if (ret < 0) {
2321 mlog_errno(ret); 2321 mlog_errno(ret);
2322 goto out; 2322 goto out;
2323 } 2323 }
2324 2324
2325 if (direct_io && !is_sync_kiocb(iocb)) 2325 if (direct_io && !is_sync_kiocb(iocb))
2326 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, 2326 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
2327 *ppos); 2327 *ppos);
2328 2328
2329 /* 2329 /*
2330 * We can't complete the direct I/O as requested, fall back to 2330 * We can't complete the direct I/O as requested, fall back to
2331 * buffered I/O. 2331 * buffered I/O.
2332 */ 2332 */
2333 if (direct_io && !can_do_direct) { 2333 if (direct_io && !can_do_direct) {
2334 ocfs2_rw_unlock(inode, rw_level); 2334 ocfs2_rw_unlock(inode, rw_level);
2335 2335
2336 have_alloc_sem = 0; 2336 have_alloc_sem = 0;
2337 rw_level = -1; 2337 rw_level = -1;
2338 2338
2339 direct_io = 0; 2339 direct_io = 0;
2340 goto relock; 2340 goto relock;
2341 } 2341 }
2342 2342
2343 if (unaligned_dio) { 2343 if (unaligned_dio) {
2344 /* 2344 /*
2345 * Wait on previous unaligned aio to complete before 2345 * Wait on previous unaligned aio to complete before
2346 * proceeding. 2346 * proceeding.
2347 */ 2347 */
2348 ocfs2_aiodio_wait(inode); 2348 ocfs2_aiodio_wait(inode);
2349 2349
2350 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ 2350 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2351 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); 2351 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2352 ocfs2_iocb_set_unaligned_aio(iocb); 2352 ocfs2_iocb_set_unaligned_aio(iocb);
2353 } 2353 }
2354 2354
2355 /* 2355 /*
2356 * To later detect whether a journal commit for sync writes is 2356 * To later detect whether a journal commit for sync writes is
2357 * necessary, we sample i_size and cluster count here. 2357 * necessary, we sample i_size and cluster count here.
2358 */ 2358 */
2359 old_size = i_size_read(inode); 2359 old_size = i_size_read(inode);
2360 old_clusters = OCFS2_I(inode)->ip_clusters; 2360 old_clusters = OCFS2_I(inode)->ip_clusters;
2361 2361
2362 /* communicate with ocfs2_dio_end_io */ 2362 /* communicate with ocfs2_dio_end_io */
2363 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2363 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2364 2364
2365 ret = generic_segment_checks(iov, &nr_segs, &ocount, 2365 ret = generic_segment_checks(iov, &nr_segs, &ocount,
2366 VERIFY_READ); 2366 VERIFY_READ);
2367 if (ret) 2367 if (ret)
2368 goto out_dio; 2368 goto out_dio;
2369 2369
2370 count = ocount; 2370 count = ocount;
2371 ret = generic_write_checks(file, ppos, &count, 2371 ret = generic_write_checks(file, ppos, &count,
2372 S_ISBLK(inode->i_mode)); 2372 S_ISBLK(inode->i_mode));
2373 if (ret) 2373 if (ret)
2374 goto out_dio; 2374 goto out_dio;
2375 2375
2376 if (direct_io) { 2376 if (direct_io) {
2377 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2377 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2378 ppos, count, ocount); 2378 ppos, count, ocount);
2379 if (written < 0) { 2379 if (written < 0) {
2380 ret = written; 2380 ret = written;
2381 goto out_dio; 2381 goto out_dio;
2382 } 2382 }
2383 } else { 2383 } else {
2384 current->backing_dev_info = file->f_mapping->backing_dev_info; 2384 current->backing_dev_info = file->f_mapping->backing_dev_info;
2385 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, 2385 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2386 ppos, count, 0); 2386 ppos, count, 0);
2387 current->backing_dev_info = NULL; 2387 current->backing_dev_info = NULL;
2388 } 2388 }
2389 2389
2390 out_dio: 2390 out_dio:
2391 /* buffered aio wouldn't have proper lock coverage today */ 2391 /* buffered aio wouldn't have proper lock coverage today */
2392 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2392 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2393 2393
2394 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2394 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2395 ((file->f_flags & O_DIRECT) && !direct_io)) { 2395 ((file->f_flags & O_DIRECT) && !direct_io)) {
2396 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2396 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2397 pos + count - 1); 2397 pos + count - 1);
2398 if (ret < 0) 2398 if (ret < 0)
2399 written = ret; 2399 written = ret;
2400 2400
2401 if (!ret && ((old_size != i_size_read(inode)) || 2401 if (!ret && ((old_size != i_size_read(inode)) ||
2402 (old_clusters != OCFS2_I(inode)->ip_clusters) || 2402 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2403 has_refcount)) { 2403 has_refcount)) {
2404 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2404 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2405 if (ret < 0) 2405 if (ret < 0)
2406 written = ret; 2406 written = ret;
2407 } 2407 }
2408 2408
2409 if (!ret) 2409 if (!ret)
2410 ret = filemap_fdatawait_range(file->f_mapping, pos, 2410 ret = filemap_fdatawait_range(file->f_mapping, pos,
2411 pos + count - 1); 2411 pos + count - 1);
2412 } 2412 }
2413 2413
2414 /* 2414 /*
2415 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io 2415 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2416 * function pointer which is called when o_direct io completes so that 2416 * function pointer which is called when o_direct io completes so that
2417 * it can unlock our rw lock. 2417 * it can unlock our rw lock.
2418 * Unfortunately there are error cases which call end_io and others 2418 * Unfortunately there are error cases which call end_io and others
2419 * that don't, so we don't have to unlock the rw_lock if either an 2419 * that don't, so we don't have to unlock the rw_lock if either an
2420 * async dio is going to do it in the future or an end_io after an 2420 * async dio is going to do it in the future or an end_io after an
2421 * error has already done it. 2421 * error has already done it.
2422 */ 2422 */
2423 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2423 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2424 rw_level = -1; 2424 rw_level = -1;
2425 have_alloc_sem = 0; 2425 have_alloc_sem = 0;
2426 unaligned_dio = 0; 2426 unaligned_dio = 0;
2427 } 2427 }
2428 2428
2429 if (unaligned_dio) { 2429 if (unaligned_dio) {
2430 ocfs2_iocb_clear_unaligned_aio(iocb); 2430 ocfs2_iocb_clear_unaligned_aio(iocb);
2431 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); 2431 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
2432 } 2432 }
2433 2433
2434 out: 2434 out:
2435 if (rw_level != -1) 2435 if (rw_level != -1)
2436 ocfs2_rw_unlock(inode, rw_level); 2436 ocfs2_rw_unlock(inode, rw_level);
2437 2437
2438 out_sems: 2438 out_sems:
2439 if (have_alloc_sem) 2439 if (have_alloc_sem)
2440 ocfs2_iocb_clear_sem_locked(iocb); 2440 ocfs2_iocb_clear_sem_locked(iocb);
2441 2441
2442 mutex_unlock(&inode->i_mutex); 2442 mutex_unlock(&inode->i_mutex);
2443 sb_end_write(inode->i_sb); 2443 sb_end_write(inode->i_sb);
2444 2444
2445 if (written) 2445 if (written)
2446 ret = written; 2446 ret = written;
2447 return ret; 2447 return ret;
2448 } 2448 }
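The flush condition in the out_dio block above is worth restating: writeback plus a wait are forced whenever durability was requested (O_DSYNC, an O_DIRECT flag, or a sync inode) but the data actually went through the buffered path, either by design or via the direct-I/O fallback. A hedged restatement of that predicate, with an illustrative must_flush() name that is not part of the kernel:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the condition guarding filemap_fdatawrite_range() above. */
static bool must_flush(bool o_dsync, bool o_direct, bool is_sync,
		       bool did_direct_io)
{
	return (o_dsync && !did_direct_io) ||
	       is_sync ||
	       (o_direct && !did_direct_io);
}

int main(void)
{
	/* O_DIRECT was requested but the write fell back to buffered:
	 * the data sits in the page cache and must be flushed. */
	printf("fallback flush: %d\n", must_flush(false, true, false, false));
	/* A plain buffered write with no sync semantics: no flush here. */
	printf("plain buffered: %d\n", must_flush(false, false, false, false));
	return 0;
}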
2449 2449
2450 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 2450 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2451 struct file *out, 2451 struct file *out,
2452 struct splice_desc *sd) 2452 struct splice_desc *sd)
2453 { 2453 {
2454 int ret; 2454 int ret;
2455 2455
2456 ret = ocfs2_prepare_inode_for_write(out, &sd->pos, 2456 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2457 sd->total_len, 0, NULL, NULL); 2457 sd->total_len, 0, NULL, NULL);
2458 if (ret < 0) { 2458 if (ret < 0) {
2459 mlog_errno(ret); 2459 mlog_errno(ret);
2460 return ret; 2460 return ret;
2461 } 2461 }
2462 2462
2463 return splice_from_pipe_feed(pipe, sd, pipe_to_file); 2463 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
2464 } 2464 }
2465 2465
2466 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2466 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2467 struct file *out, 2467 struct file *out,
2468 loff_t *ppos, 2468 loff_t *ppos,
2469 size_t len, 2469 size_t len,
2470 unsigned int flags) 2470 unsigned int flags)
2471 { 2471 {
2472 int ret; 2472 int ret;
2473 struct address_space *mapping = out->f_mapping; 2473 struct address_space *mapping = out->f_mapping;
2474 struct inode *inode = mapping->host; 2474 struct inode *inode = mapping->host;
2475 struct splice_desc sd = { 2475 struct splice_desc sd = {
2476 .total_len = len, 2476 .total_len = len,
2477 .flags = flags, 2477 .flags = flags,
2478 .pos = *ppos, 2478 .pos = *ppos,
2479 .u.file = out, 2479 .u.file = out,
2480 }; 2480 };
2481 2481
2482 2482
2483 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, 2483 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
2484 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2484 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2485 out->f_path.dentry->d_name.len, 2485 out->f_path.dentry->d_name.len,
2486 out->f_path.dentry->d_name.name, len); 2486 out->f_path.dentry->d_name.name, len);
2487 2487
2488 if (pipe->inode) 2488 if (pipe->inode)
2489 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 2489 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
2490 2490
2491 splice_from_pipe_begin(&sd); 2491 splice_from_pipe_begin(&sd);
2492 do { 2492 do {
2493 ret = splice_from_pipe_next(pipe, &sd); 2493 ret = splice_from_pipe_next(pipe, &sd);
2494 if (ret <= 0) 2494 if (ret <= 0)
2495 break; 2495 break;
2496 2496
2497 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2497 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2498 ret = ocfs2_rw_lock(inode, 1); 2498 ret = ocfs2_rw_lock(inode, 1);
2499 if (ret < 0) 2499 if (ret < 0)
2500 mlog_errno(ret); 2500 mlog_errno(ret);
2501 else { 2501 else {
2502 ret = ocfs2_splice_to_file(pipe, out, &sd); 2502 ret = ocfs2_splice_to_file(pipe, out, &sd);
2503 ocfs2_rw_unlock(inode, 1); 2503 ocfs2_rw_unlock(inode, 1);
2504 } 2504 }
2505 mutex_unlock(&inode->i_mutex); 2505 mutex_unlock(&inode->i_mutex);
2506 } while (ret > 0); 2506 } while (ret > 0);
2507 splice_from_pipe_end(pipe, &sd); 2507 splice_from_pipe_end(pipe, &sd);
2508 2508
2509 if (pipe->inode) 2509 if (pipe->inode)
2510 mutex_unlock(&pipe->inode->i_mutex); 2510 mutex_unlock(&pipe->inode->i_mutex);
2511 2511
2512 if (sd.num_spliced) 2512 if (sd.num_spliced)
2513 ret = sd.num_spliced; 2513 ret = sd.num_spliced;
2514 2514
2515 if (ret > 0) { 2515 if (ret > 0) {
2516 int err; 2516 int err;
2517 2517
2518 err = generic_write_sync(out, *ppos, ret); 2518 err = generic_write_sync(out, *ppos, ret);
2519 if (err) 2519 if (err)
2520 ret = err; 2520 ret = err;
2521 else 2521 else
2522 *ppos += ret; 2522 *ppos += ret;
2523 2523
2524 balance_dirty_pages_ratelimited(mapping); 2524 balance_dirty_pages_ratelimited(mapping);
2525 } 2525 }
2526 2526
2527 return ret; 2527 return ret;
2528 } 2528 }
2529 2529
2530 static ssize_t ocfs2_file_splice_read(struct file *in, 2530 static ssize_t ocfs2_file_splice_read(struct file *in,
2531 loff_t *ppos, 2531 loff_t *ppos,
2532 struct pipe_inode_info *pipe, 2532 struct pipe_inode_info *pipe,
2533 size_t len, 2533 size_t len,
2534 unsigned int flags) 2534 unsigned int flags)
2535 { 2535 {
2536 int ret = 0, lock_level = 0; 2536 int ret = 0, lock_level = 0;
2537 struct inode *inode = in->f_path.dentry->d_inode; 2537 struct inode *inode = in->f_path.dentry->d_inode;
2538 2538
2539 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, 2539 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2540 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2540 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2541 in->f_path.dentry->d_name.len, 2541 in->f_path.dentry->d_name.len,
2542 in->f_path.dentry->d_name.name, len); 2542 in->f_path.dentry->d_name.name, len);
2543 2543
2544 /* 2544 /*
2545 * See the comment in ocfs2_file_aio_read() 2545 * See the comment in ocfs2_file_aio_read()
2546 */ 2546 */
2547 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); 2547 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2548 if (ret < 0) { 2548 if (ret < 0) {
2549 mlog_errno(ret); 2549 mlog_errno(ret);
2550 goto bail; 2550 goto bail;
2551 } 2551 }
2552 ocfs2_inode_unlock(inode, lock_level); 2552 ocfs2_inode_unlock(inode, lock_level);
2553 2553
2554 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2554 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2555 2555
2556 bail: 2556 bail:
2557 return ret; 2557 return ret;
2558 } 2558 }
2559 2559
2560 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 2560 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2561 const struct iovec *iov, 2561 const struct iovec *iov,
2562 unsigned long nr_segs, 2562 unsigned long nr_segs,
2563 loff_t pos) 2563 loff_t pos)
2564 { 2564 {
2565 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2565 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
2566 struct file *filp = iocb->ki_filp; 2566 struct file *filp = iocb->ki_filp;
2567 struct inode *inode = filp->f_path.dentry->d_inode; 2567 struct inode *inode = filp->f_path.dentry->d_inode;
2568 2568
2569 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, 2569 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
2570 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2570 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2571 filp->f_path.dentry->d_name.len, 2571 filp->f_path.dentry->d_name.len,
2572 filp->f_path.dentry->d_name.name, nr_segs); 2572 filp->f_path.dentry->d_name.name, nr_segs);
2573 2573
2574 2574
2575 if (!inode) { 2575 if (!inode) {
2576 ret = -EINVAL; 2576 ret = -EINVAL;
2577 mlog_errno(ret); 2577 mlog_errno(ret);
2578 goto bail; 2578 goto bail;
2579 } 2579 }
2580 2580
2581 ocfs2_iocb_clear_sem_locked(iocb); 2581 ocfs2_iocb_clear_sem_locked(iocb);
2582 2582
2583 /* 2583 /*
2584 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2584 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2585 * need locks to protect pending reads from racing with truncate. 2585 * need locks to protect pending reads from racing with truncate.
2586 */ 2586 */
2587 if (filp->f_flags & O_DIRECT) { 2587 if (filp->f_flags & O_DIRECT) {
2588 have_alloc_sem = 1; 2588 have_alloc_sem = 1;
2589 ocfs2_iocb_set_sem_locked(iocb); 2589 ocfs2_iocb_set_sem_locked(iocb);
2590 2590
2591 ret = ocfs2_rw_lock(inode, 0); 2591 ret = ocfs2_rw_lock(inode, 0);
2592 if (ret < 0) { 2592 if (ret < 0) {
2593 mlog_errno(ret); 2593 mlog_errno(ret);
2594 goto bail; 2594 goto bail;
2595 } 2595 }
2596 rw_level = 0; 2596 rw_level = 0;
2597 /* communicate with ocfs2_dio_end_io */ 2597 /* communicate with ocfs2_dio_end_io */
2598 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2598 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2599 } 2599 }
2600 2600
2601 /* 2601 /*
2602 * We're fine letting folks race truncates and extending 2602 * We're fine letting folks race truncates and extending
2603 * writes with read across the cluster, just like they can 2603 * writes with read across the cluster, just like they can
2604 * locally. Hence no rw_lock during read. 2604 * locally. Hence no rw_lock during read.
2605 * 2605 *
2606 * Take and drop the meta data lock to update inode fields 2606 * Take and drop the meta data lock to update inode fields
2607 * like i_size. This gives the checks below in 2607 * like i_size. This gives the checks below in
2608 * generic_file_aio_read() a chance of actually working. 2608 * generic_file_aio_read() a chance of actually working.
2609 */ 2609 */
2610 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2610 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2611 if (ret < 0) { 2611 if (ret < 0) {
2612 mlog_errno(ret); 2612 mlog_errno(ret);
2613 goto bail; 2613 goto bail;
2614 } 2614 }
2615 ocfs2_inode_unlock(inode, lock_level); 2615 ocfs2_inode_unlock(inode, lock_level);
2616 2616
2617 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2617 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2618 trace_generic_file_aio_read_ret(ret); 2618 trace_generic_file_aio_read_ret(ret);
2619 2619
2620 /* buffered aio wouldn't have proper lock coverage today */ 2620 /* buffered aio wouldn't have proper lock coverage today */
2621 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2621 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
2622 2622
2623 /* see ocfs2_file_aio_write */ 2623 /* see ocfs2_file_aio_write */
2624 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2624 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2625 rw_level = -1; 2625 rw_level = -1;
2626 have_alloc_sem = 0; 2626 have_alloc_sem = 0;
2627 } 2627 }
2628 2628
2629 bail: 2629 bail:
2630 if (have_alloc_sem) 2630 if (have_alloc_sem)
2631 ocfs2_iocb_clear_sem_locked(iocb); 2631 ocfs2_iocb_clear_sem_locked(iocb);
2632 2632
2633 if (rw_level != -1) 2633 if (rw_level != -1)
2634 ocfs2_rw_unlock(inode, rw_level); 2634 ocfs2_rw_unlock(inode, rw_level);
2635 2635
2636 return ret; 2636 return ret;
2637 } 2637 }
2638 2638
2639 /* Refer to generic_file_llseek_unlocked() */ 2639 /* Refer to generic_file_llseek_unlocked() */
2640 static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) 2640 static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2641 { 2641 {
2642 struct inode *inode = file->f_mapping->host; 2642 struct inode *inode = file->f_mapping->host;
2643 int ret = 0; 2643 int ret = 0;
2644 2644
2645 mutex_lock(&inode->i_mutex); 2645 mutex_lock(&inode->i_mutex);
2646 2646
2647 switch (origin) { 2647 switch (whence) {
2648 case SEEK_SET: 2648 case SEEK_SET:
2649 break; 2649 break;
2650 case SEEK_END: 2650 case SEEK_END:
2651 offset += inode->i_size; 2651 offset += inode->i_size;
2652 break; 2652 break;
2653 case SEEK_CUR: 2653 case SEEK_CUR:
2654 if (offset == 0) { 2654 if (offset == 0) {
2655 offset = file->f_pos; 2655 offset = file->f_pos;
2656 goto out; 2656 goto out;
2657 } 2657 }
2658 offset += file->f_pos; 2658 offset += file->f_pos;
2659 break; 2659 break;
2660 case SEEK_DATA: 2660 case SEEK_DATA:
2661 case SEEK_HOLE: 2661 case SEEK_HOLE:
2662 ret = ocfs2_seek_data_hole_offset(file, &offset, origin); 2662 ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2663 if (ret) 2663 if (ret)
2664 goto out; 2664 goto out;
2665 break; 2665 break;
2666 default: 2666 default:
2667 ret = -EINVAL; 2667 ret = -EINVAL;
2668 goto out; 2668 goto out;
2669 } 2669 }
2670 2670
2671 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 2671 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2672 ret = -EINVAL; 2672 ret = -EINVAL;
2673 if (!ret && offset > inode->i_sb->s_maxbytes) 2673 if (!ret && offset > inode->i_sb->s_maxbytes)
2674 ret = -EINVAL; 2674 ret = -EINVAL;
2675 if (ret) 2675 if (ret)
2676 goto out; 2676 goto out;
2677 2677
2678 if (offset != file->f_pos) { 2678 if (offset != file->f_pos) {
2679 file->f_pos = offset; 2679 file->f_pos = offset;
2680 file->f_version = 0; 2680 file->f_version = 0;
2681 } 2681 }
2682 2682
2683 out: 2683 out:
2684 mutex_unlock(&inode->i_mutex); 2684 mutex_unlock(&inode->i_mutex);
2685 if (ret) 2685 if (ret)
2686 return ret; 2686 return ret;
2687 return offset; 2687 return offset;
2688 } 2688 }
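The switch above implements the userspace contract of lseek(2)'s whence argument, including the SEEK_DATA/SEEK_HOLE extensions backed by ocfs2_seek_data_hole_offset(). A small userspace sketch of the corresponding calls; the path is hypothetical, and SEEK_DATA/SEEK_HOLE need _GNU_SOURCE plus a filesystem that supports them:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/file", O_RDONLY);	/* hypothetical path */
	if (fd < 0)
		return 1;

	off_t end  = lseek(fd, 0, SEEK_END);	/* offset += i_size */
	off_t cur  = lseek(fd, 0, SEEK_CUR);	/* fast path: f_pos as-is */
	off_t data = lseek(fd, 0, SEEK_DATA);	/* first data at/after 0 */
	off_t hole = lseek(fd, 0, SEEK_HOLE);	/* first hole at/after 0 */

	printf("end=%lld cur=%lld data=%lld hole=%lld\n",
	       (long long)end, (long long)cur,
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}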
2689 2689
2690 const struct inode_operations ocfs2_file_iops = { 2690 const struct inode_operations ocfs2_file_iops = {
2691 .setattr = ocfs2_setattr, 2691 .setattr = ocfs2_setattr,
2692 .getattr = ocfs2_getattr, 2692 .getattr = ocfs2_getattr,
2693 .permission = ocfs2_permission, 2693 .permission = ocfs2_permission,
2694 .setxattr = generic_setxattr, 2694 .setxattr = generic_setxattr,
2695 .getxattr = generic_getxattr, 2695 .getxattr = generic_getxattr,
2696 .listxattr = ocfs2_listxattr, 2696 .listxattr = ocfs2_listxattr,
2697 .removexattr = generic_removexattr, 2697 .removexattr = generic_removexattr,
2698 .fiemap = ocfs2_fiemap, 2698 .fiemap = ocfs2_fiemap,
2699 .get_acl = ocfs2_iop_get_acl, 2699 .get_acl = ocfs2_iop_get_acl,
2700 }; 2700 };
2701 2701
2702 const struct inode_operations ocfs2_special_file_iops = { 2702 const struct inode_operations ocfs2_special_file_iops = {
2703 .setattr = ocfs2_setattr, 2703 .setattr = ocfs2_setattr,
2704 .getattr = ocfs2_getattr, 2704 .getattr = ocfs2_getattr,
2705 .permission = ocfs2_permission, 2705 .permission = ocfs2_permission,
2706 .get_acl = ocfs2_iop_get_acl, 2706 .get_acl = ocfs2_iop_get_acl,
2707 }; 2707 };
2708 2708
2709 /* 2709 /*
2710 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with 2710 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2711 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2711 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2712 */ 2712 */
2713 const struct file_operations ocfs2_fops = { 2713 const struct file_operations ocfs2_fops = {
2714 .llseek = ocfs2_file_llseek, 2714 .llseek = ocfs2_file_llseek,
2715 .read = do_sync_read, 2715 .read = do_sync_read,
2716 .write = do_sync_write, 2716 .write = do_sync_write,
2717 .mmap = ocfs2_mmap, 2717 .mmap = ocfs2_mmap,
2718 .fsync = ocfs2_sync_file, 2718 .fsync = ocfs2_sync_file,
2719 .release = ocfs2_file_release, 2719 .release = ocfs2_file_release,
2720 .open = ocfs2_file_open, 2720 .open = ocfs2_file_open,
2721 .aio_read = ocfs2_file_aio_read, 2721 .aio_read = ocfs2_file_aio_read,
2722 .aio_write = ocfs2_file_aio_write, 2722 .aio_write = ocfs2_file_aio_write,
2723 .unlocked_ioctl = ocfs2_ioctl, 2723 .unlocked_ioctl = ocfs2_ioctl,
2724 #ifdef CONFIG_COMPAT 2724 #ifdef CONFIG_COMPAT
2725 .compat_ioctl = ocfs2_compat_ioctl, 2725 .compat_ioctl = ocfs2_compat_ioctl,
2726 #endif 2726 #endif
2727 .lock = ocfs2_lock, 2727 .lock = ocfs2_lock,
2728 .flock = ocfs2_flock, 2728 .flock = ocfs2_flock,
2729 .splice_read = ocfs2_file_splice_read, 2729 .splice_read = ocfs2_file_splice_read,
2730 .splice_write = ocfs2_file_splice_write, 2730 .splice_write = ocfs2_file_splice_write,
2731 .fallocate = ocfs2_fallocate, 2731 .fallocate = ocfs2_fallocate,
2732 }; 2732 };
2733 2733
2734 const struct file_operations ocfs2_dops = { 2734 const struct file_operations ocfs2_dops = {
2735 .llseek = generic_file_llseek, 2735 .llseek = generic_file_llseek,
2736 .read = generic_read_dir, 2736 .read = generic_read_dir,
2737 .readdir = ocfs2_readdir, 2737 .readdir = ocfs2_readdir,
2738 .fsync = ocfs2_sync_file, 2738 .fsync = ocfs2_sync_file,
2739 .release = ocfs2_dir_release, 2739 .release = ocfs2_dir_release,
2740 .open = ocfs2_dir_open, 2740 .open = ocfs2_dir_open,
2741 .unlocked_ioctl = ocfs2_ioctl, 2741 .unlocked_ioctl = ocfs2_ioctl,
2742 #ifdef CONFIG_COMPAT 2742 #ifdef CONFIG_COMPAT
2743 .compat_ioctl = ocfs2_compat_ioctl, 2743 .compat_ioctl = ocfs2_compat_ioctl,
2744 #endif 2744 #endif
2745 .lock = ocfs2_lock, 2745 .lock = ocfs2_lock,
2746 .flock = ocfs2_flock, 2746 .flock = ocfs2_flock,
2747 }; 2747 };
2748 2748
2749 /* 2749 /*
2750 * POSIX-lockless variants of our file_operations. 2750 * POSIX-lockless variants of our file_operations.
2751 * 2751 *
2752 * These will be used if the underlying cluster stack does not support 2752 * These will be used if the underlying cluster stack does not support
2753 * posix file locking, if the user passes the "localflocks" mount 2753 * posix file locking, if the user passes the "localflocks" mount
2754 * option, or if we have a local-only fs. 2754 * option, or if we have a local-only fs.
2755 * 2755 *
2756 * ocfs2_flock is in here because all stacks handle UNIX file locks, 2756 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2757 * so we still want it in the case of no stack support for 2757 * so we still want it in the case of no stack support for
2758 * plocks. Internally, it will do the right thing when asked to ignore 2758 * plocks. Internally, it will do the right thing when asked to ignore
2759 * the cluster. 2759 * the cluster.
2760 */ 2760 */
2761 const struct file_operations ocfs2_fops_no_plocks = { 2761 const struct file_operations ocfs2_fops_no_plocks = {
2762 .llseek = ocfs2_file_llseek, 2762 .llseek = ocfs2_file_llseek,
2763 .read = do_sync_read, 2763 .read = do_sync_read,
2764 .write = do_sync_write, 2764 .write = do_sync_write,
2765 .mmap = ocfs2_mmap, 2765 .mmap = ocfs2_mmap,
2766 .fsync = ocfs2_sync_file, 2766 .fsync = ocfs2_sync_file,
2767 .release = ocfs2_file_release, 2767 .release = ocfs2_file_release,
2768 .open = ocfs2_file_open, 2768 .open = ocfs2_file_open,
2769 .aio_read = ocfs2_file_aio_read, 2769 .aio_read = ocfs2_file_aio_read,
2770 .aio_write = ocfs2_file_aio_write, 2770 .aio_write = ocfs2_file_aio_write,
2771 .unlocked_ioctl = ocfs2_ioctl, 2771 .unlocked_ioctl = ocfs2_ioctl,
2772 #ifdef CONFIG_COMPAT 2772 #ifdef CONFIG_COMPAT
2773 .compat_ioctl = ocfs2_compat_ioctl, 2773 .compat_ioctl = ocfs2_compat_ioctl,
2774 #endif 2774 #endif
2775 .flock = ocfs2_flock, 2775 .flock = ocfs2_flock,
2776 .splice_read = ocfs2_file_splice_read, 2776 .splice_read = ocfs2_file_splice_read,
2777 .splice_write = ocfs2_file_splice_write, 2777 .splice_write = ocfs2_file_splice_write,
2778 .fallocate = ocfs2_fallocate, 2778 .fallocate = ocfs2_fallocate,
2779 }; 2779 };
2780 2780
2781 const struct file_operations ocfs2_dops_no_plocks = { 2781 const struct file_operations ocfs2_dops_no_plocks = {
2782 .llseek = generic_file_llseek, 2782 .llseek = generic_file_llseek,
2783 .read = generic_read_dir, 2783 .read = generic_read_dir,
2784 .readdir = ocfs2_readdir, 2784 .readdir = ocfs2_readdir,
2785 .fsync = ocfs2_sync_file, 2785 .fsync = ocfs2_sync_file,
2786 .release = ocfs2_dir_release, 2786 .release = ocfs2_dir_release,
2787 .open = ocfs2_dir_open, 2787 .open = ocfs2_dir_open,
2788 .unlocked_ioctl = ocfs2_ioctl, 2788 .unlocked_ioctl = ocfs2_ioctl,
2789 #ifdef CONFIG_COMPAT 2789 #ifdef CONFIG_COMPAT
2790 .compat_ioctl = ocfs2_compat_ioctl, 2790 .compat_ioctl = ocfs2_compat_ioctl,
2791 #endif 2791 #endif
2792 .flock = ocfs2_flock, 2792 .flock = ocfs2_flock,
2793 }; 2793 };
2794 2794
1 /* 1 /*
2 * Persistent Storage - ramfs parts. 2 * Persistent Storage - ramfs parts.
3 * 3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com> 4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details. 13 * GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20 #include <linux/module.h> 20 #include <linux/module.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/fsnotify.h> 22 #include <linux/fsnotify.h>
23 #include <linux/pagemap.h> 23 #include <linux/pagemap.h>
24 #include <linux/highmem.h> 24 #include <linux/highmem.h>
25 #include <linux/time.h> 25 #include <linux/time.h>
26 #include <linux/init.h> 26 #include <linux/init.h>
27 #include <linux/list.h> 27 #include <linux/list.h>
28 #include <linux/string.h> 28 #include <linux/string.h>
29 #include <linux/mount.h> 29 #include <linux/mount.h>
30 #include <linux/seq_file.h> 30 #include <linux/seq_file.h>
31 #include <linux/ramfs.h> 31 #include <linux/ramfs.h>
32 #include <linux/parser.h> 32 #include <linux/parser.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/magic.h> 34 #include <linux/magic.h>
35 #include <linux/pstore.h> 35 #include <linux/pstore.h>
36 #include <linux/slab.h> 36 #include <linux/slab.h>
37 #include <linux/spinlock.h> 37 #include <linux/spinlock.h>
38 #include <linux/uaccess.h> 38 #include <linux/uaccess.h>
39 39
40 #include "internal.h" 40 #include "internal.h"
41 41
42 #define PSTORE_NAMELEN 64 42 #define PSTORE_NAMELEN 64
43 43
44 static DEFINE_SPINLOCK(allpstore_lock); 44 static DEFINE_SPINLOCK(allpstore_lock);
45 static LIST_HEAD(allpstore); 45 static LIST_HEAD(allpstore);
46 46
47 struct pstore_private { 47 struct pstore_private {
48 struct list_head list; 48 struct list_head list;
49 struct pstore_info *psi; 49 struct pstore_info *psi;
50 enum pstore_type_id type; 50 enum pstore_type_id type;
51 u64 id; 51 u64 id;
52 int count; 52 int count;
53 ssize_t size; 53 ssize_t size;
54 char data[]; 54 char data[];
55 }; 55 };
56 56
57 struct pstore_ftrace_seq_data { 57 struct pstore_ftrace_seq_data {
58 const void *ptr; 58 const void *ptr;
59 size_t off; 59 size_t off;
60 size_t size; 60 size_t size;
61 }; 61 };
62 62
63 #define REC_SIZE sizeof(struct pstore_ftrace_record) 63 #define REC_SIZE sizeof(struct pstore_ftrace_record)
64 64
65 static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) 65 static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
66 { 66 {
67 struct pstore_private *ps = s->private; 67 struct pstore_private *ps = s->private;
68 struct pstore_ftrace_seq_data *data; 68 struct pstore_ftrace_seq_data *data;
69 69
70 data = kzalloc(sizeof(*data), GFP_KERNEL); 70 data = kzalloc(sizeof(*data), GFP_KERNEL);
71 if (!data) 71 if (!data)
72 return NULL; 72 return NULL;
73 73
74 data->off = ps->size % REC_SIZE; 74 data->off = ps->size % REC_SIZE;
75 data->off += *pos * REC_SIZE; 75 data->off += *pos * REC_SIZE;
76 if (data->off + REC_SIZE > ps->size) { 76 if (data->off + REC_SIZE > ps->size) {
77 kfree(data); 77 kfree(data);
78 return NULL; 78 return NULL;
79 } 79 }
80 80
81 return data; 81 return data;
82 82
83 } 83 }
84 84
85 static void pstore_ftrace_seq_stop(struct seq_file *s, void *v) 85 static void pstore_ftrace_seq_stop(struct seq_file *s, void *v)
86 { 86 {
87 kfree(v); 87 kfree(v);
88 } 88 }
89 89
90 static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) 90 static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
91 { 91 {
92 struct pstore_private *ps = s->private; 92 struct pstore_private *ps = s->private;
93 struct pstore_ftrace_seq_data *data = v; 93 struct pstore_ftrace_seq_data *data = v;
94 94
95 data->off += REC_SIZE; 95 data->off += REC_SIZE;
96 if (data->off + REC_SIZE > ps->size) 96 if (data->off + REC_SIZE > ps->size)
97 return NULL; 97 return NULL;
98 98
99 (*pos)++; 99 (*pos)++;
100 return data; 100 return data;
101 } 101 }
102 102
103 static int pstore_ftrace_seq_show(struct seq_file *s, void *v) 103 static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
104 { 104 {
105 struct pstore_private *ps = s->private; 105 struct pstore_private *ps = s->private;
106 struct pstore_ftrace_seq_data *data = v; 106 struct pstore_ftrace_seq_data *data = v;
107 struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); 107 struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
108 108
109 seq_printf(s, "%d %08lx %08lx %pf <- %pF\n", 109 seq_printf(s, "%d %08lx %08lx %pf <- %pF\n",
110 pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip, 110 pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
111 (void *)rec->ip, (void *)rec->parent_ip); 111 (void *)rec->ip, (void *)rec->parent_ip);
112 112
113 return 0; 113 return 0;
114 } 114 }
115 115
116 static const struct seq_operations pstore_ftrace_seq_ops = { 116 static const struct seq_operations pstore_ftrace_seq_ops = {
117 .start = pstore_ftrace_seq_start, 117 .start = pstore_ftrace_seq_start,
118 .next = pstore_ftrace_seq_next, 118 .next = pstore_ftrace_seq_next,
119 .stop = pstore_ftrace_seq_stop, 119 .stop = pstore_ftrace_seq_stop,
120 .show = pstore_ftrace_seq_show, 120 .show = pstore_ftrace_seq_show,
121 }; 121 };
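The ->start/->next hooks above iterate fixed-size ftrace records: ps->size % REC_SIZE skips any partial record left at the front of the buffer, and iteration ends as soon as one more whole record would run past ps->size. The boundary arithmetic, demonstrated with illustrative numbers (not the real record size):

#include <stdio.h>

#define REC_SIZE 16			/* illustrative, not the real size */

int main(void)
{
	size_t size = 1000;		/* stands in for ps->size */
	size_t off = size % REC_SIZE;	/* skip the leading partial record */
	unsigned int n = 0;

	while (off + REC_SIZE <= size) {	/* same bound the seq ops use */
		n++;
		off += REC_SIZE;
	}
	printf("%u whole records, first at offset %zu\n", n, size % REC_SIZE);
	return 0;
}

For these numbers the walk starts at offset 8 and visits 62 whole 16-byte records, matching the data->off + REC_SIZE > ps->size cut-off used in both pstore_ftrace_seq_start() and pstore_ftrace_seq_next().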
122 122
123 static ssize_t pstore_file_read(struct file *file, char __user *userbuf, 123 static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
124 size_t count, loff_t *ppos) 124 size_t count, loff_t *ppos)
125 { 125 {
126 struct seq_file *sf = file->private_data; 126 struct seq_file *sf = file->private_data;
127 struct pstore_private *ps = sf->private; 127 struct pstore_private *ps = sf->private;
128 128
129 if (ps->type == PSTORE_TYPE_FTRACE) 129 if (ps->type == PSTORE_TYPE_FTRACE)
130 return seq_read(file, userbuf, count, ppos); 130 return seq_read(file, userbuf, count, ppos);
131 return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); 131 return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
132 } 132 }
133 133
134 static int pstore_file_open(struct inode *inode, struct file *file) 134 static int pstore_file_open(struct inode *inode, struct file *file)
135 { 135 {
136 struct pstore_private *ps = inode->i_private; 136 struct pstore_private *ps = inode->i_private;
137 struct seq_file *sf; 137 struct seq_file *sf;
138 int err; 138 int err;
139 const struct seq_operations *sops = NULL; 139 const struct seq_operations *sops = NULL;
140 140
141 if (ps->type == PSTORE_TYPE_FTRACE) 141 if (ps->type == PSTORE_TYPE_FTRACE)
142 sops = &pstore_ftrace_seq_ops; 142 sops = &pstore_ftrace_seq_ops;
143 143
144 err = seq_open(file, sops); 144 err = seq_open(file, sops);
145 if (err < 0) 145 if (err < 0)
146 return err; 146 return err;
147 147
148 sf = file->private_data; 148 sf = file->private_data;
149 sf->private = ps; 149 sf->private = ps;
150 150
151 return 0; 151 return 0;
152 } 152 }
153 153
154 static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin) 154 static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
155 { 155 {
156 struct seq_file *sf = file->private_data; 156 struct seq_file *sf = file->private_data;
157 157
158 if (sf->op) 158 if (sf->op)
159 return seq_lseek(file, off, origin); 159 return seq_lseek(file, off, whence);
160 return default_llseek(file, off, origin); 160 return default_llseek(file, off, whence);
161 } 161 }
162 162
163 static const struct file_operations pstore_file_operations = { 163 static const struct file_operations pstore_file_operations = {
164 .open = pstore_file_open, 164 .open = pstore_file_open,
165 .read = pstore_file_read, 165 .read = pstore_file_read,
166 .llseek = pstore_file_llseek, 166 .llseek = pstore_file_llseek,
167 .release = seq_release, 167 .release = seq_release,
168 }; 168 };
169 169
170 /* 170 /*
171 * When a file is unlinked from our file system we call the 171 * When a file is unlinked from our file system we call the
172 * platform driver to erase the record from persistent store. 172 * platform driver to erase the record from persistent store.
173 */ 173 */
174 static int pstore_unlink(struct inode *dir, struct dentry *dentry) 174 static int pstore_unlink(struct inode *dir, struct dentry *dentry)
175 { 175 {
176 struct pstore_private *p = dentry->d_inode->i_private; 176 struct pstore_private *p = dentry->d_inode->i_private;
177 177
178 if (p->psi->erase) 178 if (p->psi->erase)
179 p->psi->erase(p->type, p->id, p->count, 179 p->psi->erase(p->type, p->id, p->count,
180 dentry->d_inode->i_ctime, p->psi); 180 dentry->d_inode->i_ctime, p->psi);
181 181
182 return simple_unlink(dir, dentry); 182 return simple_unlink(dir, dentry);
183 } 183 }
184 184
185 static void pstore_evict_inode(struct inode *inode) 185 static void pstore_evict_inode(struct inode *inode)
186 { 186 {
187 struct pstore_private *p = inode->i_private; 187 struct pstore_private *p = inode->i_private;
188 unsigned long flags; 188 unsigned long flags;
189 189
190 clear_inode(inode); 190 clear_inode(inode);
191 if (p) { 191 if (p) {
192 spin_lock_irqsave(&allpstore_lock, flags); 192 spin_lock_irqsave(&allpstore_lock, flags);
193 list_del(&p->list); 193 list_del(&p->list);
194 spin_unlock_irqrestore(&allpstore_lock, flags); 194 spin_unlock_irqrestore(&allpstore_lock, flags);
195 kfree(p); 195 kfree(p);
196 } 196 }
197 } 197 }
198 198
199 static const struct inode_operations pstore_dir_inode_operations = { 199 static const struct inode_operations pstore_dir_inode_operations = {
200 .lookup = simple_lookup, 200 .lookup = simple_lookup,
201 .unlink = pstore_unlink, 201 .unlink = pstore_unlink,
202 }; 202 };
203 203
204 static struct inode *pstore_get_inode(struct super_block *sb) 204 static struct inode *pstore_get_inode(struct super_block *sb)
205 { 205 {
206 struct inode *inode = new_inode(sb); 206 struct inode *inode = new_inode(sb);
207 if (inode) { 207 if (inode) {
208 inode->i_ino = get_next_ino(); 208 inode->i_ino = get_next_ino();
209 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 209 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
210 } 210 }
211 return inode; 211 return inode;
212 } 212 }
213 213
214 enum { 214 enum {
215 Opt_kmsg_bytes, Opt_err 215 Opt_kmsg_bytes, Opt_err
216 }; 216 };
217 217
218 static const match_table_t tokens = { 218 static const match_table_t tokens = {
219 {Opt_kmsg_bytes, "kmsg_bytes=%u"}, 219 {Opt_kmsg_bytes, "kmsg_bytes=%u"},
220 {Opt_err, NULL} 220 {Opt_err, NULL}
221 }; 221 };
222 222
223 static void parse_options(char *options) 223 static void parse_options(char *options)
224 { 224 {
225 char *p; 225 char *p;
226 substring_t args[MAX_OPT_ARGS]; 226 substring_t args[MAX_OPT_ARGS];
227 int option; 227 int option;
228 228
229 if (!options) 229 if (!options)
230 return; 230 return;
231 231
232 while ((p = strsep(&options, ",")) != NULL) { 232 while ((p = strsep(&options, ",")) != NULL) {
233 int token; 233 int token;
234 234
235 if (!*p) 235 if (!*p)
236 continue; 236 continue;
237 237
238 token = match_token(p, tokens, args); 238 token = match_token(p, tokens, args);
239 switch (token) { 239 switch (token) {
240 case Opt_kmsg_bytes: 240 case Opt_kmsg_bytes:
241 if (!match_int(&args[0], &option)) 241 if (!match_int(&args[0], &option))
242 pstore_set_kmsg_bytes(option); 242 pstore_set_kmsg_bytes(option);
243 break; 243 break;
244 } 244 }
245 } 245 }
246 } 246 }
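parse_options() receives the raw mount data string, so kmsg_bytes can be set at mount time and changed later through pstore_remount(). A hedged userspace sketch that passes the option via mount(2); the target directory is a conventional choice rather than anything mandated by the code above, and the call requires CAP_SYS_ADMIN:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* The data string is what ends up in parse_options() above. */
	if (mount("pstore", "/sys/fs/pstore", "pstore", 0,
		  "kmsg_bytes=16384") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}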
247 247
248 static int pstore_remount(struct super_block *sb, int *flags, char *data) 248 static int pstore_remount(struct super_block *sb, int *flags, char *data)
249 { 249 {
250 parse_options(data); 250 parse_options(data);
251 251
252 return 0; 252 return 0;
253 } 253 }
254 254
255 static const struct super_operations pstore_ops = { 255 static const struct super_operations pstore_ops = {
256 .statfs = simple_statfs, 256 .statfs = simple_statfs,
257 .drop_inode = generic_delete_inode, 257 .drop_inode = generic_delete_inode,
258 .evict_inode = pstore_evict_inode, 258 .evict_inode = pstore_evict_inode,
259 .remount_fs = pstore_remount, 259 .remount_fs = pstore_remount,
260 .show_options = generic_show_options, 260 .show_options = generic_show_options,
261 }; 261 };
262 262
263 static struct super_block *pstore_sb; 263 static struct super_block *pstore_sb;
264 264
265 int pstore_is_mounted(void) 265 int pstore_is_mounted(void)
266 { 266 {
267 return pstore_sb != NULL; 267 return pstore_sb != NULL;
268 } 268 }
269 269
270 /* 270 /*
271 * Make a regular file in the root directory of our file system. 271 * Make a regular file in the root directory of our file system.
272 * Load it up with "size" bytes of data from "buf". 272 * Load it up with "size" bytes of data from "buf".
273 * Set the mtime & ctime to the date that this record was originally stored. 273 * Set the mtime & ctime to the date that this record was originally stored.
274 */ 274 */
275 int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, 275 int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
276 char *data, size_t size, struct timespec time, 276 char *data, size_t size, struct timespec time,
277 struct pstore_info *psi) 277 struct pstore_info *psi)
278 { 278 {
279 struct dentry *root = pstore_sb->s_root; 279 struct dentry *root = pstore_sb->s_root;
280 struct dentry *dentry; 280 struct dentry *dentry;
281 struct inode *inode; 281 struct inode *inode;
282 int rc = 0; 282 int rc = 0;
283 char name[PSTORE_NAMELEN]; 283 char name[PSTORE_NAMELEN];
284 struct pstore_private *private, *pos; 284 struct pstore_private *private, *pos;
285 unsigned long flags; 285 unsigned long flags;
286 286
287 spin_lock_irqsave(&allpstore_lock, flags); 287 spin_lock_irqsave(&allpstore_lock, flags);
288 list_for_each_entry(pos, &allpstore, list) { 288 list_for_each_entry(pos, &allpstore, list) {
289 if (pos->type == type && 289 if (pos->type == type &&
290 pos->id == id && 290 pos->id == id &&
291 pos->psi == psi) { 291 pos->psi == psi) {
292 rc = -EEXIST; 292 rc = -EEXIST;
293 break; 293 break;
294 } 294 }
295 } 295 }
296 spin_unlock_irqrestore(&allpstore_lock, flags); 296 spin_unlock_irqrestore(&allpstore_lock, flags);
297 if (rc) 297 if (rc)
298 return rc; 298 return rc;
299 299
300 rc = -ENOMEM; 300 rc = -ENOMEM;
301 inode = pstore_get_inode(pstore_sb); 301 inode = pstore_get_inode(pstore_sb);
302 if (!inode) 302 if (!inode)
303 goto fail; 303 goto fail;
304 inode->i_mode = S_IFREG | 0444; 304 inode->i_mode = S_IFREG | 0444;
305 inode->i_fop = &pstore_file_operations; 305 inode->i_fop = &pstore_file_operations;
306 private = kmalloc(sizeof *private + size, GFP_KERNEL); 306 private = kmalloc(sizeof *private + size, GFP_KERNEL);
307 if (!private) 307 if (!private)
308 goto fail_alloc; 308 goto fail_alloc;
309 private->type = type; 309 private->type = type;
310 private->id = id; 310 private->id = id;
311 private->count = count; 311 private->count = count;
312 private->psi = psi; 312 private->psi = psi;
313 313
314 switch (type) { 314 switch (type) {
315 case PSTORE_TYPE_DMESG: 315 case PSTORE_TYPE_DMESG:
316 sprintf(name, "dmesg-%s-%lld", psname, id); 316 sprintf(name, "dmesg-%s-%lld", psname, id);
317 break; 317 break;
318 case PSTORE_TYPE_CONSOLE: 318 case PSTORE_TYPE_CONSOLE:
319 sprintf(name, "console-%s", psname); 319 sprintf(name, "console-%s", psname);
320 break; 320 break;
321 case PSTORE_TYPE_FTRACE: 321 case PSTORE_TYPE_FTRACE:
322 sprintf(name, "ftrace-%s", psname); 322 sprintf(name, "ftrace-%s", psname);
323 break; 323 break;
324 case PSTORE_TYPE_MCE: 324 case PSTORE_TYPE_MCE:
325 sprintf(name, "mce-%s-%lld", psname, id); 325 sprintf(name, "mce-%s-%lld", psname, id);
326 break; 326 break;
327 case PSTORE_TYPE_UNKNOWN: 327 case PSTORE_TYPE_UNKNOWN:
328 sprintf(name, "unknown-%s-%lld", psname, id); 328 sprintf(name, "unknown-%s-%lld", psname, id);
329 break; 329 break;
330 default: 330 default:
331 sprintf(name, "type%d-%s-%lld", type, psname, id); 331 sprintf(name, "type%d-%s-%lld", type, psname, id);
332 break; 332 break;
333 } 333 }
334 334
335 mutex_lock(&root->d_inode->i_mutex); 335 mutex_lock(&root->d_inode->i_mutex);
336 336
337 rc = -ENOSPC; 337 rc = -ENOSPC;
338 dentry = d_alloc_name(root, name); 338 dentry = d_alloc_name(root, name);
339 if (IS_ERR(dentry)) 339 if (IS_ERR(dentry))
340 goto fail_lockedalloc; 340 goto fail_lockedalloc;
341 341
342 memcpy(private->data, data, size); 342 memcpy(private->data, data, size);
343 inode->i_size = private->size = size; 343 inode->i_size = private->size = size;
344 344
345 inode->i_private = private; 345 inode->i_private = private;
346 346
347 if (time.tv_sec) 347 if (time.tv_sec)
348 inode->i_mtime = inode->i_ctime = time; 348 inode->i_mtime = inode->i_ctime = time;
349 349
350 d_add(dentry, inode); 350 d_add(dentry, inode);
351 351
352 spin_lock_irqsave(&allpstore_lock, flags); 352 spin_lock_irqsave(&allpstore_lock, flags);
353 list_add(&private->list, &allpstore); 353 list_add(&private->list, &allpstore);
354 spin_unlock_irqrestore(&allpstore_lock, flags); 354 spin_unlock_irqrestore(&allpstore_lock, flags);
355 355
356 mutex_unlock(&root->d_inode->i_mutex); 356 mutex_unlock(&root->d_inode->i_mutex);
357 357
358 return 0; 358 return 0;
359 359
360 fail_lockedalloc: 360 fail_lockedalloc:
361 mutex_unlock(&root->d_inode->i_mutex); 361 mutex_unlock(&root->d_inode->i_mutex);
362 kfree(private); 362 kfree(private);
363 fail_alloc: 363 fail_alloc:
364 iput(inode); 364 iput(inode);
365 365
366 fail: 366 fail:
367 return rc; 367 return rc;
368 } 368 }
369 369
370 static int pstore_fill_super(struct super_block *sb, void *data, int silent) 370 static int pstore_fill_super(struct super_block *sb, void *data, int silent)
371 { 371 {
372 struct inode *inode; 372 struct inode *inode;
373 373
374 save_mount_options(sb, data); 374 save_mount_options(sb, data);
375 375
376 pstore_sb = sb; 376 pstore_sb = sb;
377 377
378 sb->s_maxbytes = MAX_LFS_FILESIZE; 378 sb->s_maxbytes = MAX_LFS_FILESIZE;
379 sb->s_blocksize = PAGE_CACHE_SIZE; 379 sb->s_blocksize = PAGE_CACHE_SIZE;
380 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 380 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
381 sb->s_magic = PSTOREFS_MAGIC; 381 sb->s_magic = PSTOREFS_MAGIC;
382 sb->s_op = &pstore_ops; 382 sb->s_op = &pstore_ops;
383 sb->s_time_gran = 1; 383 sb->s_time_gran = 1;
384 384
385 parse_options(data); 385 parse_options(data);
386 386
387 inode = pstore_get_inode(sb); 387 inode = pstore_get_inode(sb);
388 if (inode) { 388 if (inode) {
389 inode->i_mode = S_IFDIR | 0755; 389 inode->i_mode = S_IFDIR | 0755;
390 inode->i_op = &pstore_dir_inode_operations; 390 inode->i_op = &pstore_dir_inode_operations;
391 inode->i_fop = &simple_dir_operations; 391 inode->i_fop = &simple_dir_operations;
392 inc_nlink(inode); 392 inc_nlink(inode);
393 } 393 }
394 sb->s_root = d_make_root(inode); 394 sb->s_root = d_make_root(inode);
395 if (!sb->s_root) 395 if (!sb->s_root)
396 return -ENOMEM; 396 return -ENOMEM;
397 397
398 pstore_get_records(0); 398 pstore_get_records(0);
399 399
400 return 0; 400 return 0;
401 } 401 }
402 402
403 static struct dentry *pstore_mount(struct file_system_type *fs_type, 403 static struct dentry *pstore_mount(struct file_system_type *fs_type,
404 int flags, const char *dev_name, void *data) 404 int flags, const char *dev_name, void *data)
405 { 405 {
406 return mount_single(fs_type, flags, data, pstore_fill_super); 406 return mount_single(fs_type, flags, data, pstore_fill_super);
407 } 407 }
408 408
409 static void pstore_kill_sb(struct super_block *sb) 409 static void pstore_kill_sb(struct super_block *sb)
410 { 410 {
411 kill_litter_super(sb); 411 kill_litter_super(sb);
412 pstore_sb = NULL; 412 pstore_sb = NULL;
413 } 413 }
414 414
415 static struct file_system_type pstore_fs_type = { 415 static struct file_system_type pstore_fs_type = {
416 .name = "pstore", 416 .name = "pstore",
417 .mount = pstore_mount, 417 .mount = pstore_mount,
418 .kill_sb = pstore_kill_sb, 418 .kill_sb = pstore_kill_sb,
419 }; 419 };
420 420
421 static int __init init_pstore_fs(void) 421 static int __init init_pstore_fs(void)
422 { 422 {
423 return register_filesystem(&pstore_fs_type); 423 return register_filesystem(&pstore_fs_type);
424 } 424 }
425 module_init(init_pstore_fs) 425 module_init(init_pstore_fs)
426 426
427 MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); 427 MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
428 MODULE_LICENSE("GPL"); 428 MODULE_LICENSE("GPL");
429 429
1 /* 1 /*
2 * linux/fs/read_write.c 2 * linux/fs/read_write.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/slab.h> 7 #include <linux/slab.h>
8 #include <linux/stat.h> 8 #include <linux/stat.h>
9 #include <linux/fcntl.h> 9 #include <linux/fcntl.h>
10 #include <linux/file.h> 10 #include <linux/file.h>
11 #include <linux/uio.h> 11 #include <linux/uio.h>
12 #include <linux/fsnotify.h> 12 #include <linux/fsnotify.h>
13 #include <linux/security.h> 13 #include <linux/security.h>
14 #include <linux/export.h> 14 #include <linux/export.h>
15 #include <linux/syscalls.h> 15 #include <linux/syscalls.h>
16 #include <linux/pagemap.h> 16 #include <linux/pagemap.h>
17 #include <linux/splice.h> 17 #include <linux/splice.h>
18 #include "read_write.h" 18 #include "read_write.h"
19 19
20 #include <asm/uaccess.h> 20 #include <asm/uaccess.h>
21 #include <asm/unistd.h> 21 #include <asm/unistd.h>
22 22
23 const struct file_operations generic_ro_fops = { 23 const struct file_operations generic_ro_fops = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = do_sync_read, 25 .read = do_sync_read,
26 .aio_read = generic_file_aio_read, 26 .aio_read = generic_file_aio_read,
27 .mmap = generic_file_readonly_mmap, 27 .mmap = generic_file_readonly_mmap,
28 .splice_read = generic_file_splice_read, 28 .splice_read = generic_file_splice_read,
29 }; 29 };
30 30
31 EXPORT_SYMBOL(generic_ro_fops); 31 EXPORT_SYMBOL(generic_ro_fops);
32 32
33 static inline int unsigned_offsets(struct file *file) 33 static inline int unsigned_offsets(struct file *file)
34 { 34 {
35 return file->f_mode & FMODE_UNSIGNED_OFFSET; 35 return file->f_mode & FMODE_UNSIGNED_OFFSET;
36 } 36 }
37 37
38 static loff_t lseek_execute(struct file *file, struct inode *inode, 38 static loff_t lseek_execute(struct file *file, struct inode *inode,
39 loff_t offset, loff_t maxsize) 39 loff_t offset, loff_t maxsize)
40 { 40 {
41 if (offset < 0 && !unsigned_offsets(file)) 41 if (offset < 0 && !unsigned_offsets(file))
42 return -EINVAL; 42 return -EINVAL;
43 if (offset > maxsize) 43 if (offset > maxsize)
44 return -EINVAL; 44 return -EINVAL;
45 45
46 if (offset != file->f_pos) { 46 if (offset != file->f_pos) {
47 file->f_pos = offset; 47 file->f_pos = offset;
48 file->f_version = 0; 48 file->f_version = 0;
49 } 49 }
50 return offset; 50 return offset;
51 } 51 }
52 52
53 /** 53 /**
54 * generic_file_llseek_size - generic llseek implementation for regular files 54 * generic_file_llseek_size - generic llseek implementation for regular files
55 * @file: file structure to seek on 55 * @file: file structure to seek on
56 * @offset: file offset to seek to 56 * @offset: file offset to seek to
57 * @origin: type of seek 57 * @whence: type of seek
58 * @size: max size of this file in file system 58 * @size: max size of this file in file system
59 * @eof: offset used for SEEK_END position 59 * @eof: offset used for SEEK_END position
60 * 60 *
61 * This is a variant of generic_file_llseek that allows passing in a custom 61 * This is a variant of generic_file_llseek that allows passing in a custom
62 * maximum file size and a custom EOF position, e.g. for hashed directories 62 * maximum file size and a custom EOF position, e.g. for hashed directories
63 * 63 *
64 * Synchronization: 64 * Synchronization:
65 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) 65 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
66 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. 66 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
67 * read/writes behave like SEEK_SET against seeks. 67 * read/writes behave like SEEK_SET against seeks.
68 */ 68 */
69 loff_t 69 loff_t
70 generic_file_llseek_size(struct file *file, loff_t offset, int origin, 70 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
71 loff_t maxsize, loff_t eof) 71 loff_t maxsize, loff_t eof)
72 { 72 {
73 struct inode *inode = file->f_mapping->host; 73 struct inode *inode = file->f_mapping->host;
74 74
75 switch (origin) { 75 switch (whence) {
76 case SEEK_END: 76 case SEEK_END:
77 offset += eof; 77 offset += eof;
78 break; 78 break;
79 case SEEK_CUR: 79 case SEEK_CUR:
80 /* 80 /*
81 * Here we special-case the lseek(fd, 0, SEEK_CUR) 81 * Here we special-case the lseek(fd, 0, SEEK_CUR)
82 * position-querying operation. Avoid rewriting the "same" 82 * position-querying operation. Avoid rewriting the "same"
83 * f_pos value back to the file because a concurrent read(), 83 * f_pos value back to the file because a concurrent read(),
84 * write() or lseek() might have altered it 84 * write() or lseek() might have altered it
85 */ 85 */
86 if (offset == 0) 86 if (offset == 0)
87 return file->f_pos; 87 return file->f_pos;
88 /* 88 /*
89 * f_lock protects against read/modify/write race with other 89 * f_lock protects against read/modify/write race with other
90 * SEEK_CURs. Note that parallel writes and reads behave 90 * SEEK_CURs. Note that parallel writes and reads behave
91 * like SEEK_SET. 91 * like SEEK_SET.
92 */ 92 */
93 spin_lock(&file->f_lock); 93 spin_lock(&file->f_lock);
94 offset = lseek_execute(file, inode, file->f_pos + offset, 94 offset = lseek_execute(file, inode, file->f_pos + offset,
95 maxsize); 95 maxsize);
96 spin_unlock(&file->f_lock); 96 spin_unlock(&file->f_lock);
97 return offset; 97 return offset;
98 case SEEK_DATA: 98 case SEEK_DATA:
99 /* 99 /*
100 * In the generic case the entire file is data, so as long as 100 * In the generic case the entire file is data, so as long as
101 * offset isn't at the end of the file then the offset is data. 101 * offset isn't at the end of the file then the offset is data.
102 */ 102 */
103 if (offset >= eof) 103 if (offset >= eof)
104 return -ENXIO; 104 return -ENXIO;
105 break; 105 break;
106 case SEEK_HOLE: 106 case SEEK_HOLE:
107 /* 107 /*
108 * There is a virtual hole at the end of the file, so as long as 108 * There is a virtual hole at the end of the file, so as long as
109 * offset isn't i_size or larger, return i_size. 109 * offset isn't i_size or larger, return i_size.
110 */ 110 */
111 if (offset >= eof) 111 if (offset >= eof)
112 return -ENXIO; 112 return -ENXIO;
113 offset = eof; 113 offset = eof;
114 break; 114 break;
115 } 115 }
116 116
117 return lseek_execute(file, inode, offset, maxsize); 117 return lseek_execute(file, inode, offset, maxsize);
118 } 118 }
119 EXPORT_SYMBOL(generic_file_llseek_size); 119 EXPORT_SYMBOL(generic_file_llseek_size);
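For context (not part of this commit): a filesystem that enforces its own size limit, such as the hashed-directory case the comment above mentions, can route ->llseek through this helper. A minimal sketch, where example_fs_llseek and EXAMPLE_FS_MAX_BYTES are hypothetical names:

        /* Sketch: clamp seeks to a filesystem-specific limit instead of
         * the superblock's s_maxbytes. EXAMPLE_FS_MAX_BYTES is hypothetical. */
        static loff_t example_fs_llseek(struct file *file, loff_t offset, int whence)
        {
                struct inode *inode = file->f_mapping->host;

                return generic_file_llseek_size(file, offset, whence,
                                                EXAMPLE_FS_MAX_BYTES,
                                                i_size_read(inode));
        }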
120 120
121 /** 121 /**
122 * generic_file_llseek - generic llseek implementation for regular files 122 * generic_file_llseek - generic llseek implementation for regular files
123 * @file: file structure to seek on 123 * @file: file structure to seek on
124 * @offset: file offset to seek to 124 * @offset: file offset to seek to
125 * @origin: type of seek 125 * @whence: type of seek
126 * 126 *
127 * This is a generic implementation of ->llseek usable for all normal local 127 * This is a generic implementation of ->llseek usable for all normal local
128 * filesystems. It just updates the file offset to the value specified by 128 * filesystems. It just updates the file offset to the value specified by
129 * @offset and @origin under i_mutex. 129 * @offset and @whence under i_mutex.
130 */ 130 */
131 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) 131 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
132 { 132 {
133 struct inode *inode = file->f_mapping->host; 133 struct inode *inode = file->f_mapping->host;
134 134
135 return generic_file_llseek_size(file, offset, origin, 135 return generic_file_llseek_size(file, offset, whence,
136 inode->i_sb->s_maxbytes, 136 inode->i_sb->s_maxbytes,
137 i_size_read(inode)); 137 i_size_read(inode));
138 } 138 }
139 EXPORT_SYMBOL(generic_file_llseek); 139 EXPORT_SYMBOL(generic_file_llseek);
140 140
141 /** 141 /**
142 * noop_llseek - No Operation Performed llseek implementation 142 * noop_llseek - No Operation Performed llseek implementation
143 * @file: file structure to seek on 143 * @file: file structure to seek on
144 * @offset: file offset to seek to 144 * @offset: file offset to seek to
145 * @origin: type of seek 145 * @whence: type of seek
146 * 146 *
147 * This is an implementation of ->llseek usable for the rare special case when 147 * This is an implementation of ->llseek usable for the rare special case when
148 * userspace expects the seek to succeed but the (device) file is actually not 148 * userspace expects the seek to succeed but the (device) file is actually not
149 * able to perform the seek. In this case you use noop_llseek() instead of 149 * able to perform the seek. In this case you use noop_llseek() instead of
150 * falling back to the default implementation of ->llseek. 150 * falling back to the default implementation of ->llseek.
151 */ 151 */
152 loff_t noop_llseek(struct file *file, loff_t offset, int origin) 152 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
153 { 153 {
154 return file->f_pos; 154 return file->f_pos;
155 } 155 }
156 EXPORT_SYMBOL(noop_llseek); 156 EXPORT_SYMBOL(noop_llseek);
157 157
158 loff_t no_llseek(struct file *file, loff_t offset, int origin) 158 loff_t no_llseek(struct file *file, loff_t offset, int whence)
159 { 159 {
160 return -ESPIPE; 160 return -ESPIPE;
161 } 161 }
162 EXPORT_SYMBOL(no_llseek); 162 EXPORT_SYMBOL(no_llseek);
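For context (not part of this commit): the practical difference between the two stubs above is that noop_llseek() reports success without moving f_pos, while no_llseek() rejects seeking with -ESPIPE. A sketch; every example_* name is hypothetical:

        /* Hypothetical stub so the ops tables below are self-contained */
        static ssize_t example_read(struct file *filp, char __user *buf,
                                    size_t len, loff_t *ppos)
        {
                return 0;                       /* always EOF */
        }

        static const struct file_operations example_noseek_dev_fops = {
                .read   = example_read,
                .llseek = noop_llseek,          /* seek "succeeds", position unchanged */
        };

        static const struct file_operations example_pipe_like_fops = {
                .read   = example_read,
                .llseek = no_llseek,            /* seek fails with -ESPIPE */
        };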
163 163
164 loff_t default_llseek(struct file *file, loff_t offset, int origin) 164 loff_t default_llseek(struct file *file, loff_t offset, int whence)
165 { 165 {
166 struct inode *inode = file->f_path.dentry->d_inode; 166 struct inode *inode = file->f_path.dentry->d_inode;
167 loff_t retval; 167 loff_t retval;
168 168
169 mutex_lock(&inode->i_mutex); 169 mutex_lock(&inode->i_mutex);
170 switch (origin) { 170 switch (whence) {
171 case SEEK_END: 171 case SEEK_END:
172 offset += i_size_read(inode); 172 offset += i_size_read(inode);
173 break; 173 break;
174 case SEEK_CUR: 174 case SEEK_CUR:
175 if (offset == 0) { 175 if (offset == 0) {
176 retval = file->f_pos; 176 retval = file->f_pos;
177 goto out; 177 goto out;
178 } 178 }
179 offset += file->f_pos; 179 offset += file->f_pos;
180 break; 180 break;
181 case SEEK_DATA: 181 case SEEK_DATA:
182 /* 182 /*
183 * In the generic case the entire file is data, so as 183 * In the generic case the entire file is data, so as
184 * long as offset isn't at the end of the file then the 184 * long as offset isn't at the end of the file then the
185 * offset is data. 185 * offset is data.
186 */ 186 */
187 if (offset >= inode->i_size) { 187 if (offset >= inode->i_size) {
188 retval = -ENXIO; 188 retval = -ENXIO;
189 goto out; 189 goto out;
190 } 190 }
191 break; 191 break;
192 case SEEK_HOLE: 192 case SEEK_HOLE:
193 /* 193 /*
194 * There is a virtual hole at the end of the file, so 194 * There is a virtual hole at the end of the file, so
195 * as long as offset isn't i_size or larger, return 195 * as long as offset isn't i_size or larger, return
196 * i_size. 196 * i_size.
197 */ 197 */
198 if (offset >= inode->i_size) { 198 if (offset >= inode->i_size) {
199 retval = -ENXIO; 199 retval = -ENXIO;
200 goto out; 200 goto out;
201 } 201 }
202 offset = inode->i_size; 202 offset = inode->i_size;
203 break; 203 break;
204 } 204 }
205 retval = -EINVAL; 205 retval = -EINVAL;
206 if (offset >= 0 || unsigned_offsets(file)) { 206 if (offset >= 0 || unsigned_offsets(file)) {
207 if (offset != file->f_pos) { 207 if (offset != file->f_pos) {
208 file->f_pos = offset; 208 file->f_pos = offset;
209 file->f_version = 0; 209 file->f_version = 0;
210 } 210 }
211 retval = offset; 211 retval = offset;
212 } 212 }
213 out: 213 out:
214 mutex_unlock(&inode->i_mutex); 214 mutex_unlock(&inode->i_mutex);
215 return retval; 215 return retval;
216 } 216 }
217 EXPORT_SYMBOL(default_llseek); 217 EXPORT_SYMBOL(default_llseek);
218 218
219 loff_t vfs_llseek(struct file *file, loff_t offset, int origin) 219 loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
220 { 220 {
221 loff_t (*fn)(struct file *, loff_t, int); 221 loff_t (*fn)(struct file *, loff_t, int);
222 222
223 fn = no_llseek; 223 fn = no_llseek;
224 if (file->f_mode & FMODE_LSEEK) { 224 if (file->f_mode & FMODE_LSEEK) {
225 if (file->f_op && file->f_op->llseek) 225 if (file->f_op && file->f_op->llseek)
226 fn = file->f_op->llseek; 226 fn = file->f_op->llseek;
227 } 227 }
228 return fn(file, offset, origin); 228 return fn(file, offset, whence);
229 } 229 }
230 EXPORT_SYMBOL(vfs_llseek); 230 EXPORT_SYMBOL(vfs_llseek);
231 231
232 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) 232 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
233 { 233 {
234 off_t retval; 234 off_t retval;
235 struct fd f = fdget(fd); 235 struct fd f = fdget(fd);
236 if (!f.file) 236 if (!f.file)
237 return -EBADF; 237 return -EBADF;
238 238
239 retval = -EINVAL; 239 retval = -EINVAL;
240 if (origin <= SEEK_MAX) { 240 if (whence <= SEEK_MAX) {
241 loff_t res = vfs_llseek(f.file, offset, origin); 241 loff_t res = vfs_llseek(f.file, offset, whence);
242 retval = res; 242 retval = res;
243 if (res != (loff_t)retval) 243 if (res != (loff_t)retval)
244 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ 244 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
245 } 245 }
246 fdput(f); 246 fdput(f);
247 return retval; 247 return retval;
248 } 248 }
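From userspace the syscall above is lseek(2); lseek(fd, 0, SEEK_CUR) is the position-query fast path that generic_file_llseek_size() special-cases. A self-contained sketch:

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/etc/hostname", O_RDONLY);
                off_t size, pos;

                if (fd < 0)
                        return 1;
                size = lseek(fd, 0, SEEK_END);  /* offset is added to EOF */
                pos = lseek(fd, 0, SEEK_CUR);   /* query position, moves nothing */
                printf("size=%lld pos=%lld\n", (long long)size, (long long)pos);
                close(fd);
                return 0;
        }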
249 249
250 #ifdef __ARCH_WANT_SYS_LLSEEK 250 #ifdef __ARCH_WANT_SYS_LLSEEK
251 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, 251 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
252 unsigned long, offset_low, loff_t __user *, result, 252 unsigned long, offset_low, loff_t __user *, result,
253 unsigned int, origin) 253 unsigned int, whence)
254 { 254 {
255 int retval; 255 int retval;
256 struct fd f = fdget(fd); 256 struct fd f = fdget(fd);
257 loff_t offset; 257 loff_t offset;
258 258
259 if (!f.file) 259 if (!f.file)
260 return -EBADF; 260 return -EBADF;
261 261
262 retval = -EINVAL; 262 retval = -EINVAL;
263 if (origin > SEEK_MAX) 263 if (whence > SEEK_MAX)
264 goto out_putf; 264 goto out_putf;
265 265
266 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, 266 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
267 origin); 267 whence);
268 268
269 retval = (int)offset; 269 retval = (int)offset;
270 if (offset >= 0) { 270 if (offset >= 0) {
271 retval = -EFAULT; 271 retval = -EFAULT;
272 if (!copy_to_user(result, &offset, sizeof(offset))) 272 if (!copy_to_user(result, &offset, sizeof(offset)))
273 retval = 0; 273 retval = 0;
274 } 274 }
275 out_putf: 275 out_putf:
276 fdput(f); 276 fdput(f);
277 return retval; 277 return retval;
278 } 278 }
279 #endif 279 #endif
280 280
281 281
282 /* 282 /*
283 * rw_verify_area doesn't like huge counts. We limit 283 * rw_verify_area doesn't like huge counts. We limit
284 * them to something that fits in "int" so that others 284 * them to something that fits in "int" so that others
285 * won't have to do range checks all the time. 285 * won't have to do range checks all the time.
286 */ 286 */
287 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) 287 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
288 { 288 {
289 struct inode *inode; 289 struct inode *inode;
290 loff_t pos; 290 loff_t pos;
291 int retval = -EINVAL; 291 int retval = -EINVAL;
292 292
293 inode = file->f_path.dentry->d_inode; 293 inode = file->f_path.dentry->d_inode;
294 if (unlikely((ssize_t) count < 0)) 294 if (unlikely((ssize_t) count < 0))
295 return retval; 295 return retval;
296 pos = *ppos; 296 pos = *ppos;
297 if (unlikely(pos < 0)) { 297 if (unlikely(pos < 0)) {
298 if (!unsigned_offsets(file)) 298 if (!unsigned_offsets(file))
299 return retval; 299 return retval;
300 if (count >= -pos) /* both values are in 0..LLONG_MAX */ 300 if (count >= -pos) /* both values are in 0..LLONG_MAX */
301 return -EOVERFLOW; 301 return -EOVERFLOW;
302 } else if (unlikely((loff_t) (pos + count) < 0)) { 302 } else if (unlikely((loff_t) (pos + count) < 0)) {
303 if (!unsigned_offsets(file)) 303 if (!unsigned_offsets(file))
304 return retval; 304 return retval;
305 } 305 }
306 306
307 if (unlikely(inode->i_flock && mandatory_lock(inode))) { 307 if (unlikely(inode->i_flock && mandatory_lock(inode))) {
308 retval = locks_mandatory_area( 308 retval = locks_mandatory_area(
309 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, 309 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
310 inode, file, pos, count); 310 inode, file, pos, count);
311 if (retval < 0) 311 if (retval < 0)
312 return retval; 312 return retval;
313 } 313 }
314 retval = security_file_permission(file, 314 retval = security_file_permission(file,
315 read_write == READ ? MAY_READ : MAY_WRITE); 315 read_write == READ ? MAY_READ : MAY_WRITE);
316 if (retval) 316 if (retval)
317 return retval; 317 return retval;
318 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; 318 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
319 } 319 }
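Note the cap in the return value: MAX_RW_COUNT is INT_MAX & PAGE_MASK (0x7ffff000 with 4 KiB pages), so a single read() or write() can transfer less than requested even on a regular file. A userspace sketch of the loop callers need; read_full is a hypothetical helper:

        #include <unistd.h>

        /* Hypothetical helper: keep reading until count bytes, EOF or error */
        static ssize_t read_full(int fd, char *buf, size_t count)
        {
                size_t done = 0;

                while (done < count) {
                        ssize_t n = read(fd, buf + done, count - done);

                        if (n < 0)
                                return done ? (ssize_t)done : -1;
                        if (n == 0)             /* EOF */
                                break;
                        done += n;              /* short read, e.g. the MAX_RW_COUNT cap */
                }
                return done;
        }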
320 320
321 static void wait_on_retry_sync_kiocb(struct kiocb *iocb) 321 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
322 { 322 {
323 set_current_state(TASK_UNINTERRUPTIBLE); 323 set_current_state(TASK_UNINTERRUPTIBLE);
324 if (!kiocbIsKicked(iocb)) 324 if (!kiocbIsKicked(iocb))
325 schedule(); 325 schedule();
326 else 326 else
327 kiocbClearKicked(iocb); 327 kiocbClearKicked(iocb);
328 __set_current_state(TASK_RUNNING); 328 __set_current_state(TASK_RUNNING);
329 } 329 }
330 330
331 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 331 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
332 { 332 {
333 struct iovec iov = { .iov_base = buf, .iov_len = len }; 333 struct iovec iov = { .iov_base = buf, .iov_len = len };
334 struct kiocb kiocb; 334 struct kiocb kiocb;
335 ssize_t ret; 335 ssize_t ret;
336 336
337 init_sync_kiocb(&kiocb, filp); 337 init_sync_kiocb(&kiocb, filp);
338 kiocb.ki_pos = *ppos; 338 kiocb.ki_pos = *ppos;
339 kiocb.ki_left = len; 339 kiocb.ki_left = len;
340 kiocb.ki_nbytes = len; 340 kiocb.ki_nbytes = len;
341 341
342 for (;;) { 342 for (;;) {
343 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 343 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
344 if (ret != -EIOCBRETRY) 344 if (ret != -EIOCBRETRY)
345 break; 345 break;
346 wait_on_retry_sync_kiocb(&kiocb); 346 wait_on_retry_sync_kiocb(&kiocb);
347 } 347 }
348 348
349 if (-EIOCBQUEUED == ret) 349 if (-EIOCBQUEUED == ret)
350 ret = wait_on_sync_kiocb(&kiocb); 350 ret = wait_on_sync_kiocb(&kiocb);
351 *ppos = kiocb.ki_pos; 351 *ppos = kiocb.ki_pos;
352 return ret; 352 return ret;
353 } 353 }
354 354
355 EXPORT_SYMBOL(do_sync_read); 355 EXPORT_SYMBOL(do_sync_read);
356 356
357 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) 357 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
358 { 358 {
359 ssize_t ret; 359 ssize_t ret;
360 360
361 if (!(file->f_mode & FMODE_READ)) 361 if (!(file->f_mode & FMODE_READ))
362 return -EBADF; 362 return -EBADF;
363 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) 363 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
364 return -EINVAL; 364 return -EINVAL;
365 if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) 365 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
366 return -EFAULT; 366 return -EFAULT;
367 367
368 ret = rw_verify_area(READ, file, pos, count); 368 ret = rw_verify_area(READ, file, pos, count);
369 if (ret >= 0) { 369 if (ret >= 0) {
370 count = ret; 370 count = ret;
371 if (file->f_op->read) 371 if (file->f_op->read)
372 ret = file->f_op->read(file, buf, count, pos); 372 ret = file->f_op->read(file, buf, count, pos);
373 else 373 else
374 ret = do_sync_read(file, buf, count, pos); 374 ret = do_sync_read(file, buf, count, pos);
375 if (ret > 0) { 375 if (ret > 0) {
376 fsnotify_access(file); 376 fsnotify_access(file);
377 add_rchar(current, ret); 377 add_rchar(current, ret);
378 } 378 }
379 inc_syscr(current); 379 inc_syscr(current);
380 } 380 }
381 381
382 return ret; 382 return ret;
383 } 383 }
384 384
385 EXPORT_SYMBOL(vfs_read); 385 EXPORT_SYMBOL(vfs_read);
386 386
387 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) 387 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
388 { 388 {
389 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; 389 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
390 struct kiocb kiocb; 390 struct kiocb kiocb;
391 ssize_t ret; 391 ssize_t ret;
392 392
393 init_sync_kiocb(&kiocb, filp); 393 init_sync_kiocb(&kiocb, filp);
394 kiocb.ki_pos = *ppos; 394 kiocb.ki_pos = *ppos;
395 kiocb.ki_left = len; 395 kiocb.ki_left = len;
396 kiocb.ki_nbytes = len; 396 kiocb.ki_nbytes = len;
397 397
398 for (;;) { 398 for (;;) {
399 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 399 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
400 if (ret != -EIOCBRETRY) 400 if (ret != -EIOCBRETRY)
401 break; 401 break;
402 wait_on_retry_sync_kiocb(&kiocb); 402 wait_on_retry_sync_kiocb(&kiocb);
403 } 403 }
404 404
405 if (-EIOCBQUEUED == ret) 405 if (-EIOCBQUEUED == ret)
406 ret = wait_on_sync_kiocb(&kiocb); 406 ret = wait_on_sync_kiocb(&kiocb);
407 *ppos = kiocb.ki_pos; 407 *ppos = kiocb.ki_pos;
408 return ret; 408 return ret;
409 } 409 }
410 410
411 EXPORT_SYMBOL(do_sync_write); 411 EXPORT_SYMBOL(do_sync_write);
412 412
413 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) 413 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
414 { 414 {
415 ssize_t ret; 415 ssize_t ret;
416 416
417 if (!(file->f_mode & FMODE_WRITE)) 417 if (!(file->f_mode & FMODE_WRITE))
418 return -EBADF; 418 return -EBADF;
419 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) 419 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
420 return -EINVAL; 420 return -EINVAL;
421 if (unlikely(!access_ok(VERIFY_READ, buf, count))) 421 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
422 return -EFAULT; 422 return -EFAULT;
423 423
424 ret = rw_verify_area(WRITE, file, pos, count); 424 ret = rw_verify_area(WRITE, file, pos, count);
425 if (ret >= 0) { 425 if (ret >= 0) {
426 count = ret; 426 count = ret;
427 if (file->f_op->write) 427 if (file->f_op->write)
428 ret = file->f_op->write(file, buf, count, pos); 428 ret = file->f_op->write(file, buf, count, pos);
429 else 429 else
430 ret = do_sync_write(file, buf, count, pos); 430 ret = do_sync_write(file, buf, count, pos);
431 if (ret > 0) { 431 if (ret > 0) {
432 fsnotify_modify(file); 432 fsnotify_modify(file);
433 add_wchar(current, ret); 433 add_wchar(current, ret);
434 } 434 }
435 inc_syscw(current); 435 inc_syscw(current);
436 } 436 }
437 437
438 return ret; 438 return ret;
439 } 439 }
440 440
441 EXPORT_SYMBOL(vfs_write); 441 EXPORT_SYMBOL(vfs_write);
442 442
443 static inline loff_t file_pos_read(struct file *file) 443 static inline loff_t file_pos_read(struct file *file)
444 { 444 {
445 return file->f_pos; 445 return file->f_pos;
446 } 446 }
447 447
448 static inline void file_pos_write(struct file *file, loff_t pos) 448 static inline void file_pos_write(struct file *file, loff_t pos)
449 { 449 {
450 file->f_pos = pos; 450 file->f_pos = pos;
451 } 451 }
452 452
453 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 453 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
454 { 454 {
455 struct fd f = fdget(fd); 455 struct fd f = fdget(fd);
456 ssize_t ret = -EBADF; 456 ssize_t ret = -EBADF;
457 457
458 if (f.file) { 458 if (f.file) {
459 loff_t pos = file_pos_read(f.file); 459 loff_t pos = file_pos_read(f.file);
460 ret = vfs_read(f.file, buf, count, &pos); 460 ret = vfs_read(f.file, buf, count, &pos);
461 file_pos_write(f.file, pos); 461 file_pos_write(f.file, pos);
462 fdput(f); 462 fdput(f);
463 } 463 }
464 return ret; 464 return ret;
465 } 465 }
466 466
467 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, 467 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
468 size_t, count) 468 size_t, count)
469 { 469 {
470 struct fd f = fdget(fd); 470 struct fd f = fdget(fd);
471 ssize_t ret = -EBADF; 471 ssize_t ret = -EBADF;
472 472
473 if (f.file) { 473 if (f.file) {
474 loff_t pos = file_pos_read(f.file); 474 loff_t pos = file_pos_read(f.file);
475 ret = vfs_write(f.file, buf, count, &pos); 475 ret = vfs_write(f.file, buf, count, &pos);
476 file_pos_write(f.file, pos); 476 file_pos_write(f.file, pos);
477 fdput(f); 477 fdput(f);
478 } 478 }
479 479
480 return ret; 480 return ret;
481 } 481 }
482 482
483 SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, 483 SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
484 size_t count, loff_t pos) 484 size_t count, loff_t pos)
485 { 485 {
486 struct fd f; 486 struct fd f;
487 ssize_t ret = -EBADF; 487 ssize_t ret = -EBADF;
488 488
489 if (pos < 0) 489 if (pos < 0)
490 return -EINVAL; 490 return -EINVAL;
491 491
492 f = fdget(fd); 492 f = fdget(fd);
493 if (f.file) { 493 if (f.file) {
494 ret = -ESPIPE; 494 ret = -ESPIPE;
495 if (f.file->f_mode & FMODE_PREAD) 495 if (f.file->f_mode & FMODE_PREAD)
496 ret = vfs_read(f.file, buf, count, &pos); 496 ret = vfs_read(f.file, buf, count, &pos);
497 fdput(f); 497 fdput(f);
498 } 498 }
499 499
500 return ret; 500 return ret;
501 } 501 }
502 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 502 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
503 asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) 503 asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
504 { 504 {
505 return SYSC_pread64((unsigned int) fd, (char __user *) buf, 505 return SYSC_pread64((unsigned int) fd, (char __user *) buf,
506 (size_t) count, pos); 506 (size_t) count, pos);
507 } 507 }
508 SYSCALL_ALIAS(sys_pread64, SyS_pread64); 508 SYSCALL_ALIAS(sys_pread64, SyS_pread64);
509 #endif 509 #endif
510 510
511 SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, 511 SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
512 size_t count, loff_t pos) 512 size_t count, loff_t pos)
513 { 513 {
514 struct fd f; 514 struct fd f;
515 ssize_t ret = -EBADF; 515 ssize_t ret = -EBADF;
516 516
517 if (pos < 0) 517 if (pos < 0)
518 return -EINVAL; 518 return -EINVAL;
519 519
520 f = fdget(fd); 520 f = fdget(fd);
521 if (f.file) { 521 if (f.file) {
522 ret = -ESPIPE; 522 ret = -ESPIPE;
523 if (f.file->f_mode & FMODE_PWRITE) 523 if (f.file->f_mode & FMODE_PWRITE)
524 ret = vfs_write(f.file, buf, count, &pos); 524 ret = vfs_write(f.file, buf, count, &pos);
525 fdput(f); 525 fdput(f);
526 } 526 }
527 527
528 return ret; 528 return ret;
529 } 529 }
530 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 530 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
531 asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) 531 asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
532 { 532 {
533 return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, 533 return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
534 (size_t) count, pos); 534 (size_t) count, pos);
535 } 535 }
536 SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); 536 SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
537 #endif 537 #endif
538 538
539 /* 539 /*
540 * Reduce an iovec's length in-place. Return the resulting number of segments. 540 * Reduce an iovec's length in-place. Return the resulting number of segments.
541 */ 541 */
542 unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) 542 unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
543 { 543 {
544 unsigned long seg = 0; 544 unsigned long seg = 0;
545 size_t len = 0; 545 size_t len = 0;
546 546
547 while (seg < nr_segs) { 547 while (seg < nr_segs) {
548 seg++; 548 seg++;
549 if (len + iov->iov_len >= to) { 549 if (len + iov->iov_len >= to) {
550 iov->iov_len = to - len; 550 iov->iov_len = to - len;
551 break; 551 break;
552 } 552 }
553 len += iov->iov_len; 553 len += iov->iov_len;
554 iov++; 554 iov++;
555 } 555 }
556 return seg; 556 return seg;
557 } 557 }
558 EXPORT_SYMBOL(iov_shorten); 558 EXPORT_SYMBOL(iov_shorten);
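A worked example of the helper's semantics (a reading aid, not part of the commit), with a, b and c standing for arbitrary buffers:

        struct iovec v[3] = {
                { .iov_base = a, .iov_len = 100 },
                { .iov_base = b, .iov_len = 100 },
                { .iov_base = c, .iov_len = 100 },
        };
        unsigned long segs = iov_shorten(v, 3, 150);
        /* segs == 2; v[1].iov_len is truncated to 50, v[2] drops out */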
559 559
560 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, 560 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
561 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) 561 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
562 { 562 {
563 struct kiocb kiocb; 563 struct kiocb kiocb;
564 ssize_t ret; 564 ssize_t ret;
565 565
566 init_sync_kiocb(&kiocb, filp); 566 init_sync_kiocb(&kiocb, filp);
567 kiocb.ki_pos = *ppos; 567 kiocb.ki_pos = *ppos;
568 kiocb.ki_left = len; 568 kiocb.ki_left = len;
569 kiocb.ki_nbytes = len; 569 kiocb.ki_nbytes = len;
570 570
571 for (;;) { 571 for (;;) {
572 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); 572 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
573 if (ret != -EIOCBRETRY) 573 if (ret != -EIOCBRETRY)
574 break; 574 break;
575 wait_on_retry_sync_kiocb(&kiocb); 575 wait_on_retry_sync_kiocb(&kiocb);
576 } 576 }
577 577
578 if (ret == -EIOCBQUEUED) 578 if (ret == -EIOCBQUEUED)
579 ret = wait_on_sync_kiocb(&kiocb); 579 ret = wait_on_sync_kiocb(&kiocb);
580 *ppos = kiocb.ki_pos; 580 *ppos = kiocb.ki_pos;
581 return ret; 581 return ret;
582 } 582 }
583 583
584 /* Do it by hand, with file-ops */ 584 /* Do it by hand, with file-ops */
585 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, 585 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
586 unsigned long nr_segs, loff_t *ppos, io_fn_t fn) 586 unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
587 { 587 {
588 struct iovec *vector = iov; 588 struct iovec *vector = iov;
589 ssize_t ret = 0; 589 ssize_t ret = 0;
590 590
591 while (nr_segs > 0) { 591 while (nr_segs > 0) {
592 void __user *base; 592 void __user *base;
593 size_t len; 593 size_t len;
594 ssize_t nr; 594 ssize_t nr;
595 595
596 base = vector->iov_base; 596 base = vector->iov_base;
597 len = vector->iov_len; 597 len = vector->iov_len;
598 vector++; 598 vector++;
599 nr_segs--; 599 nr_segs--;
600 600
601 nr = fn(filp, base, len, ppos); 601 nr = fn(filp, base, len, ppos);
602 602
603 if (nr < 0) { 603 if (nr < 0) {
604 if (!ret) 604 if (!ret)
605 ret = nr; 605 ret = nr;
606 break; 606 break;
607 } 607 }
608 ret += nr; 608 ret += nr;
609 if (nr != len) 609 if (nr != len)
610 break; 610 break;
611 } 611 }
612 612
613 return ret; 613 return ret;
614 } 614 }
615 615
616 /* A write operation does a read from user space and vice versa */ 616 /* A write operation does a read from user space and vice versa */
617 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) 617 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
618 618
619 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 619 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
620 unsigned long nr_segs, unsigned long fast_segs, 620 unsigned long nr_segs, unsigned long fast_segs,
621 struct iovec *fast_pointer, 621 struct iovec *fast_pointer,
622 struct iovec **ret_pointer) 622 struct iovec **ret_pointer)
623 { 623 {
624 unsigned long seg; 624 unsigned long seg;
625 ssize_t ret; 625 ssize_t ret;
626 struct iovec *iov = fast_pointer; 626 struct iovec *iov = fast_pointer;
627 627
628 /* 628 /*
629 * SuS says "The readv() function *may* fail if the iovcnt argument 629 * SuS says "The readv() function *may* fail if the iovcnt argument
630 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has 630 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has
631 * traditionally returned zero for zero segments, so... 631 * traditionally returned zero for zero segments, so...
632 */ 632 */
633 if (nr_segs == 0) { 633 if (nr_segs == 0) {
634 ret = 0; 634 ret = 0;
635 goto out; 635 goto out;
636 } 636 }
637 637
638 /* 638 /*
639 * First get the "struct iovec" from user memory and 639 * First get the "struct iovec" from user memory and
640 * verify all the pointers 640 * verify all the pointers
641 */ 641 */
642 if (nr_segs > UIO_MAXIOV) { 642 if (nr_segs > UIO_MAXIOV) {
643 ret = -EINVAL; 643 ret = -EINVAL;
644 goto out; 644 goto out;
645 } 645 }
646 if (nr_segs > fast_segs) { 646 if (nr_segs > fast_segs) {
647 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 647 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
648 if (iov == NULL) { 648 if (iov == NULL) {
649 ret = -ENOMEM; 649 ret = -ENOMEM;
650 goto out; 650 goto out;
651 } 651 }
652 } 652 }
653 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { 653 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
654 ret = -EFAULT; 654 ret = -EFAULT;
655 goto out; 655 goto out;
656 } 656 }
657 657
658 /* 658 /*
659 * According to the Single Unix Specification we should return EINVAL 659 * According to the Single Unix Specification we should return EINVAL
660 * if an element length is < 0 when cast to ssize_t or if the 660 * if an element length is < 0 when cast to ssize_t or if the
661 * total length would overflow the ssize_t return value of the 661 * total length would overflow the ssize_t return value of the
662 * system call. 662 * system call.
663 * 663 *
664 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the 664 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
665 * overflow case. 665 * overflow case.
666 */ 666 */
667 ret = 0; 667 ret = 0;
668 for (seg = 0; seg < nr_segs; seg++) { 668 for (seg = 0; seg < nr_segs; seg++) {
669 void __user *buf = iov[seg].iov_base; 669 void __user *buf = iov[seg].iov_base;
670 ssize_t len = (ssize_t)iov[seg].iov_len; 670 ssize_t len = (ssize_t)iov[seg].iov_len;
671 671
672 /* see if we're about to use an invalid len or if 672 /* see if we're about to use an invalid len or if
673 * it's about to overflow ssize_t */ 673 * it's about to overflow ssize_t */
674 if (len < 0) { 674 if (len < 0) {
675 ret = -EINVAL; 675 ret = -EINVAL;
676 goto out; 676 goto out;
677 } 677 }
678 if (type >= 0 678 if (type >= 0
679 && unlikely(!access_ok(vrfy_dir(type), buf, len))) { 679 && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
680 ret = -EFAULT; 680 ret = -EFAULT;
681 goto out; 681 goto out;
682 } 682 }
683 if (len > MAX_RW_COUNT - ret) { 683 if (len > MAX_RW_COUNT - ret) {
684 len = MAX_RW_COUNT - ret; 684 len = MAX_RW_COUNT - ret;
685 iov[seg].iov_len = len; 685 iov[seg].iov_len = len;
686 } 686 }
687 ret += len; 687 ret += len;
688 } 688 }
689 out: 689 out:
690 *ret_pointer = iov; 690 *ret_pointer = iov;
691 return ret; 691 return ret;
692 } 692 }
693 693
694 static ssize_t do_readv_writev(int type, struct file *file, 694 static ssize_t do_readv_writev(int type, struct file *file,
695 const struct iovec __user * uvector, 695 const struct iovec __user * uvector,
696 unsigned long nr_segs, loff_t *pos) 696 unsigned long nr_segs, loff_t *pos)
697 { 697 {
698 size_t tot_len; 698 size_t tot_len;
699 struct iovec iovstack[UIO_FASTIOV]; 699 struct iovec iovstack[UIO_FASTIOV];
700 struct iovec *iov = iovstack; 700 struct iovec *iov = iovstack;
701 ssize_t ret; 701 ssize_t ret;
702 io_fn_t fn; 702 io_fn_t fn;
703 iov_fn_t fnv; 703 iov_fn_t fnv;
704 704
705 if (!file->f_op) { 705 if (!file->f_op) {
706 ret = -EINVAL; 706 ret = -EINVAL;
707 goto out; 707 goto out;
708 } 708 }
709 709
710 ret = rw_copy_check_uvector(type, uvector, nr_segs, 710 ret = rw_copy_check_uvector(type, uvector, nr_segs,
711 ARRAY_SIZE(iovstack), iovstack, &iov); 711 ARRAY_SIZE(iovstack), iovstack, &iov);
712 if (ret <= 0) 712 if (ret <= 0)
713 goto out; 713 goto out;
714 714
715 tot_len = ret; 715 tot_len = ret;
716 ret = rw_verify_area(type, file, pos, tot_len); 716 ret = rw_verify_area(type, file, pos, tot_len);
717 if (ret < 0) 717 if (ret < 0)
718 goto out; 718 goto out;
719 719
720 fnv = NULL; 720 fnv = NULL;
721 if (type == READ) { 721 if (type == READ) {
722 fn = file->f_op->read; 722 fn = file->f_op->read;
723 fnv = file->f_op->aio_read; 723 fnv = file->f_op->aio_read;
724 } else { 724 } else {
725 fn = (io_fn_t)file->f_op->write; 725 fn = (io_fn_t)file->f_op->write;
726 fnv = file->f_op->aio_write; 726 fnv = file->f_op->aio_write;
727 } 727 }
728 728
729 if (fnv) 729 if (fnv)
730 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, 730 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
731 pos, fnv); 731 pos, fnv);
732 else 732 else
733 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); 733 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
734 734
735 out: 735 out:
736 if (iov != iovstack) 736 if (iov != iovstack)
737 kfree(iov); 737 kfree(iov);
738 if ((ret + (type == READ)) > 0) { 738 if ((ret + (type == READ)) > 0) {
739 if (type == READ) 739 if (type == READ)
740 fsnotify_access(file); 740 fsnotify_access(file);
741 else 741 else
742 fsnotify_modify(file); 742 fsnotify_modify(file);
743 } 743 }
744 return ret; 744 return ret;
745 } 745 }
746 746
747 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 747 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
748 unsigned long vlen, loff_t *pos) 748 unsigned long vlen, loff_t *pos)
749 { 749 {
750 if (!(file->f_mode & FMODE_READ)) 750 if (!(file->f_mode & FMODE_READ))
751 return -EBADF; 751 return -EBADF;
752 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) 752 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
753 return -EINVAL; 753 return -EINVAL;
754 754
755 return do_readv_writev(READ, file, vec, vlen, pos); 755 return do_readv_writev(READ, file, vec, vlen, pos);
756 } 756 }
757 757
758 EXPORT_SYMBOL(vfs_readv); 758 EXPORT_SYMBOL(vfs_readv);
759 759
760 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, 760 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
761 unsigned long vlen, loff_t *pos) 761 unsigned long vlen, loff_t *pos)
762 { 762 {
763 if (!(file->f_mode & FMODE_WRITE)) 763 if (!(file->f_mode & FMODE_WRITE))
764 return -EBADF; 764 return -EBADF;
765 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) 765 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
766 return -EINVAL; 766 return -EINVAL;
767 767
768 return do_readv_writev(WRITE, file, vec, vlen, pos); 768 return do_readv_writev(WRITE, file, vec, vlen, pos);
769 } 769 }
770 770
771 EXPORT_SYMBOL(vfs_writev); 771 EXPORT_SYMBOL(vfs_writev);
772 772
773 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 773 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
774 unsigned long, vlen) 774 unsigned long, vlen)
775 { 775 {
776 struct fd f = fdget(fd); 776 struct fd f = fdget(fd);
777 ssize_t ret = -EBADF; 777 ssize_t ret = -EBADF;
778 778
779 if (f.file) { 779 if (f.file) {
780 loff_t pos = file_pos_read(f.file); 780 loff_t pos = file_pos_read(f.file);
781 ret = vfs_readv(f.file, vec, vlen, &pos); 781 ret = vfs_readv(f.file, vec, vlen, &pos);
782 file_pos_write(f.file, pos); 782 file_pos_write(f.file, pos);
783 fdput(f); 783 fdput(f);
784 } 784 }
785 785
786 if (ret > 0) 786 if (ret > 0)
787 add_rchar(current, ret); 787 add_rchar(current, ret);
788 inc_syscr(current); 788 inc_syscr(current);
789 return ret; 789 return ret;
790 } 790 }
791 791
792 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 792 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
793 unsigned long, vlen) 793 unsigned long, vlen)
794 { 794 {
795 struct fd f = fdget(fd); 795 struct fd f = fdget(fd);
796 ssize_t ret = -EBADF; 796 ssize_t ret = -EBADF;
797 797
798 if (f.file) { 798 if (f.file) {
799 loff_t pos = file_pos_read(f.file); 799 loff_t pos = file_pos_read(f.file);
800 ret = vfs_writev(f.file, vec, vlen, &pos); 800 ret = vfs_writev(f.file, vec, vlen, &pos);
801 file_pos_write(f.file, pos); 801 file_pos_write(f.file, pos);
802 fdput(f); 802 fdput(f);
803 } 803 }
804 804
805 if (ret > 0) 805 if (ret > 0)
806 add_wchar(current, ret); 806 add_wchar(current, ret);
807 inc_syscw(current); 807 inc_syscw(current);
808 return ret; 808 return ret;
809 } 809 }
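The two syscalls above are readv(2)/writev(2): one call transfers every segment at the shared file position. A minimal userspace sketch of the write side:

        #include <sys/uio.h>
        #include <unistd.h>

        int main(void)
        {
                struct iovec iov[2] = {
                        { .iov_base = "hello, ", .iov_len = 7 },
                        { .iov_base = "writev\n", .iov_len = 7 },
                };

                /* one syscall, both segments, at the current file position */
                ssize_t n = writev(STDOUT_FILENO, iov, 2);

                return n < 0;
        }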
810 810
811 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) 811 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
812 { 812 {
813 #define HALF_LONG_BITS (BITS_PER_LONG / 2) 813 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
814 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; 814 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
815 } 815 }
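A note on pos_from_hilo() above (reading aid, not part of the commit): shifting twice by HALF_LONG_BITS instead of once by BITS_PER_LONG avoids undefined behaviour on 64-bit, where a single shift by 64 would equal the width of loff_t. The two well-defined 32-bit shifts push the high word out entirely there (64-bit userspace passes the whole offset in pos_l), while on 32-bit they place it in the upper half of the loff_t.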
816 816
817 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, 817 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
818 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 818 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
819 { 819 {
820 loff_t pos = pos_from_hilo(pos_h, pos_l); 820 loff_t pos = pos_from_hilo(pos_h, pos_l);
821 struct fd f; 821 struct fd f;
822 ssize_t ret = -EBADF; 822 ssize_t ret = -EBADF;
823 823
824 if (pos < 0) 824 if (pos < 0)
825 return -EINVAL; 825 return -EINVAL;
826 826
827 f = fdget(fd); 827 f = fdget(fd);
828 if (f.file) { 828 if (f.file) {
829 ret = -ESPIPE; 829 ret = -ESPIPE;
830 if (f.file->f_mode & FMODE_PREAD) 830 if (f.file->f_mode & FMODE_PREAD)
831 ret = vfs_readv(f.file, vec, vlen, &pos); 831 ret = vfs_readv(f.file, vec, vlen, &pos);
832 fdput(f); 832 fdput(f);
833 } 833 }
834 834
835 if (ret > 0) 835 if (ret > 0)
836 add_rchar(current, ret); 836 add_rchar(current, ret);
837 inc_syscr(current); 837 inc_syscr(current);
838 return ret; 838 return ret;
839 } 839 }
840 840
841 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, 841 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
842 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 842 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
843 { 843 {
844 loff_t pos = pos_from_hilo(pos_h, pos_l); 844 loff_t pos = pos_from_hilo(pos_h, pos_l);
845 struct fd f; 845 struct fd f;
846 ssize_t ret = -EBADF; 846 ssize_t ret = -EBADF;
847 847
848 if (pos < 0) 848 if (pos < 0)
849 return -EINVAL; 849 return -EINVAL;
850 850
851 f = fdget(fd); 851 f = fdget(fd);
852 if (f.file) { 852 if (f.file) {
853 ret = -ESPIPE; 853 ret = -ESPIPE;
854 if (f.file->f_mode & FMODE_PWRITE) 854 if (f.file->f_mode & FMODE_PWRITE)
855 ret = vfs_writev(f.file, vec, vlen, &pos); 855 ret = vfs_writev(f.file, vec, vlen, &pos);
856 fdput(f); 856 fdput(f);
857 } 857 }
858 858
859 if (ret > 0) 859 if (ret > 0)
860 add_wchar(current, ret); 860 add_wchar(current, ret);
861 inc_syscw(current); 861 inc_syscw(current);
862 return ret; 862 return ret;
863 } 863 }
864 864
865 ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, 865 ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
866 loff_t max) 866 loff_t max)
867 { 867 {
868 struct fd in, out; 868 struct fd in, out;
869 struct inode *in_inode, *out_inode; 869 struct inode *in_inode, *out_inode;
870 loff_t pos; 870 loff_t pos;
871 ssize_t retval; 871 ssize_t retval;
872 int fl; 872 int fl;
873 873
874 /* 874 /*
875 * Get input file, and verify that it is ok.. 875 * Get input file, and verify that it is ok..
876 */ 876 */
877 retval = -EBADF; 877 retval = -EBADF;
878 in = fdget(in_fd); 878 in = fdget(in_fd);
879 if (!in.file) 879 if (!in.file)
880 goto out; 880 goto out;
881 if (!(in.file->f_mode & FMODE_READ)) 881 if (!(in.file->f_mode & FMODE_READ))
882 goto fput_in; 882 goto fput_in;
883 retval = -ESPIPE; 883 retval = -ESPIPE;
884 if (!ppos) 884 if (!ppos)
885 ppos = &in.file->f_pos; 885 ppos = &in.file->f_pos;
886 else 886 else
887 if (!(in.file->f_mode & FMODE_PREAD)) 887 if (!(in.file->f_mode & FMODE_PREAD))
888 goto fput_in; 888 goto fput_in;
889 retval = rw_verify_area(READ, in.file, ppos, count); 889 retval = rw_verify_area(READ, in.file, ppos, count);
890 if (retval < 0) 890 if (retval < 0)
891 goto fput_in; 891 goto fput_in;
892 count = retval; 892 count = retval;
893 893
894 /* 894 /*
895 * Get output file, and verify that it is ok.. 895 * Get output file, and verify that it is ok..
896 */ 896 */
897 retval = -EBADF; 897 retval = -EBADF;
898 out = fdget(out_fd); 898 out = fdget(out_fd);
899 if (!out.file) 899 if (!out.file)
900 goto fput_in; 900 goto fput_in;
901 if (!(out.file->f_mode & FMODE_WRITE)) 901 if (!(out.file->f_mode & FMODE_WRITE))
902 goto fput_out; 902 goto fput_out;
903 retval = -EINVAL; 903 retval = -EINVAL;
904 in_inode = in.file->f_path.dentry->d_inode; 904 in_inode = in.file->f_path.dentry->d_inode;
905 out_inode = out.file->f_path.dentry->d_inode; 905 out_inode = out.file->f_path.dentry->d_inode;
906 retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); 906 retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
907 if (retval < 0) 907 if (retval < 0)
908 goto fput_out; 908 goto fput_out;
909 count = retval; 909 count = retval;
910 910
911 if (!max) 911 if (!max)
912 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 912 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
913 913
914 pos = *ppos; 914 pos = *ppos;
915 if (unlikely(pos + count > max)) { 915 if (unlikely(pos + count > max)) {
916 retval = -EOVERFLOW; 916 retval = -EOVERFLOW;
917 if (pos >= max) 917 if (pos >= max)
918 goto fput_out; 918 goto fput_out;
919 count = max - pos; 919 count = max - pos;
920 } 920 }
921 921
922 fl = 0; 922 fl = 0;
923 #if 0 923 #if 0
924 /* 924 /*
925 * We need to debate whether we can enable this or not. The 925 * We need to debate whether we can enable this or not. The
926 * man page documents EAGAIN return for the output at least, 926 * man page documents EAGAIN return for the output at least,
927 * and the application is arguably buggy if it doesn't expect 927 * and the application is arguably buggy if it doesn't expect
928 * EAGAIN on a non-blocking file descriptor. 928 * EAGAIN on a non-blocking file descriptor.
929 */ 929 */
930 if (in.file->f_flags & O_NONBLOCK) 930 if (in.file->f_flags & O_NONBLOCK)
931 fl = SPLICE_F_NONBLOCK; 931 fl = SPLICE_F_NONBLOCK;
932 #endif 932 #endif
933 retval = do_splice_direct(in.file, ppos, out.file, count, fl); 933 retval = do_splice_direct(in.file, ppos, out.file, count, fl);
934 934
935 if (retval > 0) { 935 if (retval > 0) {
936 add_rchar(current, retval); 936 add_rchar(current, retval);
937 add_wchar(current, retval); 937 add_wchar(current, retval);
938 } 938 }
939 939
940 inc_syscr(current); 940 inc_syscr(current);
941 inc_syscw(current); 941 inc_syscw(current);
942 if (*ppos > max) 942 if (*ppos > max)
943 retval = -EOVERFLOW; 943 retval = -EOVERFLOW;
944 944
945 fput_out: 945 fput_out:
946 fdput(out); 946 fdput(out);
947 fput_in: 947 fput_in:
948 fdput(in); 948 fdput(in);
949 out: 949 out:
950 return retval; 950 return retval;
951 } 951 }
952 952
953 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) 953 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
954 { 954 {
955 loff_t pos; 955 loff_t pos;
956 off_t off; 956 off_t off;
957 ssize_t ret; 957 ssize_t ret;
958 958
959 if (offset) { 959 if (offset) {
960 if (unlikely(get_user(off, offset))) 960 if (unlikely(get_user(off, offset)))
961 return -EFAULT; 961 return -EFAULT;
962 pos = off; 962 pos = off;
963 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 963 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
964 if (unlikely(put_user(pos, offset))) 964 if (unlikely(put_user(pos, offset)))
965 return -EFAULT; 965 return -EFAULT;
966 return ret; 966 return ret;
967 } 967 }
968 968
969 return do_sendfile(out_fd, in_fd, NULL, count, 0); 969 return do_sendfile(out_fd, in_fd, NULL, count, 0);
970 } 970 }
971 971
972 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) 972 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
973 { 973 {
974 loff_t pos; 974 loff_t pos;
975 ssize_t ret; 975 ssize_t ret;
976 976
977 if (offset) { 977 if (offset) {
978 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 978 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
979 return -EFAULT; 979 return -EFAULT;
980 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 980 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
981 if (unlikely(put_user(pos, offset))) 981 if (unlikely(put_user(pos, offset)))
982 return -EFAULT; 982 return -EFAULT;
983 return ret; 983 return ret;
984 } 984 }
985 985
986 return do_sendfile(out_fd, in_fd, NULL, count, 0); 986 return do_sendfile(out_fd, in_fd, NULL, count, 0);
987 } 987 }
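Userspace reaches do_sendfile() through sendfile(2); as the code above shows, supplying an offset pointer leaves the input file's f_pos untouched. A sketch with error handling trimmed:

        #include <fcntl.h>
        #include <sys/sendfile.h>
        #include <sys/stat.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                struct stat st;
                off_t off = 0;
                int in, out;

                if (argc < 3)
                        return 1;
                in = open(argv[1], O_RDONLY);
                out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
                if (in < 0 || out < 0 || fstat(in, &st) < 0)
                        return 1;
                while (off < st.st_size)        /* &off: in's f_pos stays untouched */
                        if (sendfile(out, in, &off, st.st_size - off) < 0)
                                return 1;
                return 0;
        }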
988 988
1 /* 1 /*
2 * linux/fs/seq_file.c 2 * linux/fs/seq_file.c
3 * 3 *
4 * helper functions for making synthetic files from sequences of records. 4 * helper functions for making synthetic files from sequences of records.
5 * initial implementation -- AV, Oct 2001. 5 * initial implementation -- AV, Oct 2001.
6 */ 6 */
7 7
8 #include <linux/fs.h> 8 #include <linux/fs.h>
9 #include <linux/export.h> 9 #include <linux/export.h>
10 #include <linux/seq_file.h> 10 #include <linux/seq_file.h>
11 #include <linux/slab.h> 11 #include <linux/slab.h>
12 #include <linux/cred.h> 12 #include <linux/cred.h>
13 13
14 #include <asm/uaccess.h> 14 #include <asm/uaccess.h>
15 #include <asm/page.h> 15 #include <asm/page.h>
16 16
17 17
18 /* 18 /*
19 * seq_files have a buffer which may overflow. When this happens a larger 19 * seq_files have a buffer which may overflow. When this happens a larger
20 * buffer is reallocated and all the data will be printed again. 20 * buffer is reallocated and all the data will be printed again.
21 * The overflow state is true when m->count == m->size. 21 * The overflow state is true when m->count == m->size.
22 */ 22 */
23 static bool seq_overflow(struct seq_file *m) 23 static bool seq_overflow(struct seq_file *m)
24 { 24 {
25 return m->count == m->size; 25 return m->count == m->size;
26 } 26 }
27 27
28 static void seq_set_overflow(struct seq_file *m) 28 static void seq_set_overflow(struct seq_file *m)
29 { 29 {
30 m->count = m->size; 30 m->count = m->size;
31 } 31 }
32 32
33 /** 33 /**
34 * seq_open - initialize sequential file 34 * seq_open - initialize sequential file
35 * @file: file we initialize 35 * @file: file we initialize
36 * @op: method table describing the sequence 36 * @op: method table describing the sequence
37 * 37 *
38 * seq_open() sets @file, associating it with a sequence described 38 * seq_open() sets @file, associating it with a sequence described
39 * by @op. @op->start() sets the iterator up and returns the first 39 * by @op. @op->start() sets the iterator up and returns the first
40 * element of sequence. @op->stop() shuts it down. @op->next() 40 * element of sequence. @op->stop() shuts it down. @op->next()
41 * returns the next element of sequence. @op->show() prints element 41 * returns the next element of sequence. @op->show() prints element
42 * into the buffer. In case of error ->start() and ->next() return 42 * into the buffer. In case of error ->start() and ->next() return
43 * ERR_PTR(error). In the end of sequence they return %NULL. ->show() 43 * ERR_PTR(error). In the end of sequence they return %NULL. ->show()
44 * returns 0 in case of success and negative number in case of error. 44 * returns 0 in case of success and negative number in case of error.
45 * Returning SEQ_SKIP means "discard this element and move on". 45 * Returning SEQ_SKIP means "discard this element and move on".
46 */ 46 */
47 int seq_open(struct file *file, const struct seq_operations *op) 47 int seq_open(struct file *file, const struct seq_operations *op)
48 { 48 {
49 struct seq_file *p = file->private_data; 49 struct seq_file *p = file->private_data;
50 50
51 if (!p) { 51 if (!p) {
52 p = kmalloc(sizeof(*p), GFP_KERNEL); 52 p = kmalloc(sizeof(*p), GFP_KERNEL);
53 if (!p) 53 if (!p)
54 return -ENOMEM; 54 return -ENOMEM;
55 file->private_data = p; 55 file->private_data = p;
56 } 56 }
57 memset(p, 0, sizeof(*p)); 57 memset(p, 0, sizeof(*p));
58 mutex_init(&p->lock); 58 mutex_init(&p->lock);
59 p->op = op; 59 p->op = op;
60 #ifdef CONFIG_USER_NS 60 #ifdef CONFIG_USER_NS
61 p->user_ns = file->f_cred->user_ns; 61 p->user_ns = file->f_cred->user_ns;
62 #endif 62 #endif
63 63
64 /* 64 /*
65 * Wrappers around seq_open() (e.g. swaps_open) need to be 65 * Wrappers around seq_open() (e.g. swaps_open) need to be
66 * aware of this. If they set f_version themselves, they 66 * aware of this. If they set f_version themselves, they
67 * should call seq_open first and then set f_version. 67 * should call seq_open first and then set f_version.
68 */ 68 */
69 file->f_version = 0; 69 file->f_version = 0;
70 70
71 /* 71 /*
72 * seq_files support lseek() and pread(). They do not implement 72 * seq_files support lseek() and pread(). They do not implement
73 * write() at all, but we clear FMODE_PWRITE here for historical 73 * write() at all, but we clear FMODE_PWRITE here for historical
74 * reasons. 74 * reasons.
75 * 75 *
76 * If a client of seq_files a) implements file.write() and b) wishes to 76 * If a client of seq_files a) implements file.write() and b) wishes to
77 * support pwrite() then that client will need to implement its own 77 * support pwrite() then that client will need to implement its own
78 * file.open() which calls seq_open() and then sets FMODE_PWRITE. 78 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
79 */ 79 */
80 file->f_mode &= ~FMODE_PWRITE; 80 file->f_mode &= ~FMODE_PWRITE;
81 return 0; 81 return 0;
82 } 82 }
83 EXPORT_SYMBOL(seq_open); 83 EXPORT_SYMBOL(seq_open);
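A minimal sketch of the start/next/stop/show contract documented above, walking a fixed three-record sequence; every example_* name is hypothetical. A real user would also set .read = seq_read, .llseek = seq_lseek and .release = seq_release in its file_operations.

        static void *example_start(struct seq_file *m, loff_t *pos)
        {
                return *pos < 3 ? pos : NULL;   /* NULL ends the sequence */
        }

        static void *example_next(struct seq_file *m, void *v, loff_t *pos)
        {
                ++*pos;
                return *pos < 3 ? pos : NULL;
        }

        static void example_stop(struct seq_file *m, void *v)
        {
        }

        static int example_show(struct seq_file *m, void *v)
        {
                seq_printf(m, "record %lld\n", *(loff_t *)v);
                return 0;                       /* or SEQ_SKIP to drop this record */
        }

        static const struct seq_operations example_seq_ops = {
                .start  = example_start,
                .next   = example_next,
                .stop   = example_stop,
                .show   = example_show,
        };

        static int example_open(struct inode *inode, struct file *file)
        {
                return seq_open(file, &example_seq_ops);
        }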
84 84
85 static int traverse(struct seq_file *m, loff_t offset) 85 static int traverse(struct seq_file *m, loff_t offset)
86 { 86 {
87 loff_t pos = 0, index; 87 loff_t pos = 0, index;
88 int error = 0; 88 int error = 0;
89 void *p; 89 void *p;
90 90
91 m->version = 0; 91 m->version = 0;
92 index = 0; 92 index = 0;
93 m->count = m->from = 0; 93 m->count = m->from = 0;
94 if (!offset) { 94 if (!offset) {
95 m->index = index; 95 m->index = index;
96 return 0; 96 return 0;
97 } 97 }
98 if (!m->buf) { 98 if (!m->buf) {
99 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 99 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
100 if (!m->buf) 100 if (!m->buf)
101 return -ENOMEM; 101 return -ENOMEM;
102 } 102 }
103 p = m->op->start(m, &index); 103 p = m->op->start(m, &index);
104 while (p) { 104 while (p) {
105 error = PTR_ERR(p); 105 error = PTR_ERR(p);
106 if (IS_ERR(p)) 106 if (IS_ERR(p))
107 break; 107 break;
108 error = m->op->show(m, p); 108 error = m->op->show(m, p);
109 if (error < 0) 109 if (error < 0)
110 break; 110 break;
111 if (unlikely(error)) { 111 if (unlikely(error)) {
112 error = 0; 112 error = 0;
113 m->count = 0; 113 m->count = 0;
114 } 114 }
115 if (seq_overflow(m)) 115 if (seq_overflow(m))
116 goto Eoverflow; 116 goto Eoverflow;
117 if (pos + m->count > offset) { 117 if (pos + m->count > offset) {
118 m->from = offset - pos; 118 m->from = offset - pos;
119 m->count -= m->from; 119 m->count -= m->from;
120 m->index = index; 120 m->index = index;
121 break; 121 break;
122 } 122 }
123 pos += m->count; 123 pos += m->count;
124 m->count = 0; 124 m->count = 0;
125 if (pos == offset) { 125 if (pos == offset) {
126 index++; 126 index++;
127 m->index = index; 127 m->index = index;
128 break; 128 break;
129 } 129 }
130 p = m->op->next(m, p, &index); 130 p = m->op->next(m, p, &index);
131 } 131 }
132 m->op->stop(m, p); 132 m->op->stop(m, p);
133 m->index = index; 133 m->index = index;
134 return error; 134 return error;
135 135
136 Eoverflow: 136 Eoverflow:
137 m->op->stop(m, p); 137 m->op->stop(m, p);
138 kfree(m->buf); 138 kfree(m->buf);
139 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); 139 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
140 return !m->buf ? -ENOMEM : -EAGAIN; 140 return !m->buf ? -ENOMEM : -EAGAIN;
141 } 141 }
142 142
143 /** 143 /**
144 * seq_read - ->read() method for sequential files. 144 * seq_read - ->read() method for sequential files.
145 * @file: the file to read from 145 * @file: the file to read from
146 * @buf: the buffer to read to 146 * @buf: the buffer to read to
147 * @size: the maximum number of bytes to read 147 * @size: the maximum number of bytes to read
148 * @ppos: the current position in the file 148 * @ppos: the current position in the file
149 * 149 *
150 * Ready-made ->f_op->read() 150 * Ready-made ->f_op->read()
151 */ 151 */
152 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) 152 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
153 { 153 {
154 struct seq_file *m = file->private_data; 154 struct seq_file *m = file->private_data;
155 size_t copied = 0; 155 size_t copied = 0;
156 loff_t pos; 156 loff_t pos;
157 size_t n; 157 size_t n;
158 void *p; 158 void *p;
159 int err = 0; 159 int err = 0;
160 160
161 mutex_lock(&m->lock); 161 mutex_lock(&m->lock);
162 162
163 /* 163 /*
164 * seq_file->op->..m_start/m_stop/m_next may do special actions 164 * seq_file->op->..m_start/m_stop/m_next may do special actions
165 * or optimisations based on the file->f_version, so we want to 165 * or optimisations based on the file->f_version, so we want to
166 * pass the file->f_version to those methods. 166 * pass the file->f_version to those methods.
167 * 167 *
168 * seq_file->version is just a copy of f_version, and seq_file 168 * seq_file->version is just a copy of f_version, and seq_file
169 * methods can treat it simply as file version. 169 * methods can treat it simply as file version.
170 * It is copied in first and copied out after all operations. 170 * It is copied in first and copied out after all operations.
171 * It is convenient to have it as part of the structure to avoid the 171 * It is convenient to have it as part of the structure to avoid the
172 * need to pass another argument to all the seq_file methods. 172 * need to pass another argument to all the seq_file methods.
173 */ 173 */
174 m->version = file->f_version; 174 m->version = file->f_version;
175 175
176 /* Don't assume *ppos is where we left it */ 176 /* Don't assume *ppos is where we left it */
177 if (unlikely(*ppos != m->read_pos)) { 177 if (unlikely(*ppos != m->read_pos)) {
178 while ((err = traverse(m, *ppos)) == -EAGAIN) 178 while ((err = traverse(m, *ppos)) == -EAGAIN)
179 ; 179 ;
180 if (err) { 180 if (err) {
181 /* With prejudice... */ 181 /* With prejudice... */
182 m->read_pos = 0; 182 m->read_pos = 0;
183 m->version = 0; 183 m->version = 0;
184 m->index = 0; 184 m->index = 0;
185 m->count = 0; 185 m->count = 0;
186 goto Done; 186 goto Done;
187 } else { 187 } else {
188 m->read_pos = *ppos; 188 m->read_pos = *ppos;
189 } 189 }
190 } 190 }
191 191
192 /* grab buffer if we didn't have one */ 192 /* grab buffer if we didn't have one */
193 if (!m->buf) { 193 if (!m->buf) {
194 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 194 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
195 if (!m->buf) 195 if (!m->buf)
196 goto Enomem; 196 goto Enomem;
197 } 197 }
198 /* if not empty - flush it first */ 198 /* if not empty - flush it first */
199 if (m->count) { 199 if (m->count) {
200 n = min(m->count, size); 200 n = min(m->count, size);
201 err = copy_to_user(buf, m->buf + m->from, n); 201 err = copy_to_user(buf, m->buf + m->from, n);
202 if (err) 202 if (err)
203 goto Efault; 203 goto Efault;
204 m->count -= n; 204 m->count -= n;
205 m->from += n; 205 m->from += n;
206 size -= n; 206 size -= n;
207 buf += n; 207 buf += n;
208 copied += n; 208 copied += n;
209 if (!m->count) 209 if (!m->count)
210 m->index++; 210 m->index++;
211 if (!size) 211 if (!size)
212 goto Done; 212 goto Done;
213 } 213 }
214 /* we need at least one record in buffer */ 214 /* we need at least one record in buffer */
215 pos = m->index; 215 pos = m->index;
216 p = m->op->start(m, &pos); 216 p = m->op->start(m, &pos);
217 while (1) { 217 while (1) {
218 err = PTR_ERR(p); 218 err = PTR_ERR(p);
219 if (!p || IS_ERR(p)) 219 if (!p || IS_ERR(p))
220 break; 220 break;
221 err = m->op->show(m, p); 221 err = m->op->show(m, p);
222 if (err < 0) 222 if (err < 0)
223 break; 223 break;
224 if (unlikely(err)) 224 if (unlikely(err))
225 m->count = 0; 225 m->count = 0;
226 if (unlikely(!m->count)) { 226 if (unlikely(!m->count)) {
227 p = m->op->next(m, p, &pos); 227 p = m->op->next(m, p, &pos);
228 m->index = pos; 228 m->index = pos;
229 continue; 229 continue;
230 } 230 }
231 if (m->count < m->size) 231 if (m->count < m->size)
232 goto Fill; 232 goto Fill;
233 m->op->stop(m, p); 233 m->op->stop(m, p);
234 kfree(m->buf); 234 kfree(m->buf);
235 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); 235 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
236 if (!m->buf) 236 if (!m->buf)
237 goto Enomem; 237 goto Enomem;
238 m->count = 0; 238 m->count = 0;
239 m->version = 0; 239 m->version = 0;
240 pos = m->index; 240 pos = m->index;
241 p = m->op->start(m, &pos); 241 p = m->op->start(m, &pos);
242 } 242 }
243 m->op->stop(m, p); 243 m->op->stop(m, p);
244 m->count = 0; 244 m->count = 0;
245 goto Done; 245 goto Done;
246 Fill: 246 Fill:
247 /* they want more? let's try to get some more */ 247 /* they want more? let's try to get some more */
248 while (m->count < size) { 248 while (m->count < size) {
249 size_t offs = m->count; 249 size_t offs = m->count;
250 loff_t next = pos; 250 loff_t next = pos;
251 p = m->op->next(m, p, &next); 251 p = m->op->next(m, p, &next);
252 if (!p || IS_ERR(p)) { 252 if (!p || IS_ERR(p)) {
253 err = PTR_ERR(p); 253 err = PTR_ERR(p);
254 break; 254 break;
255 } 255 }
256 err = m->op->show(m, p); 256 err = m->op->show(m, p);
257 if (seq_overflow(m) || err) { 257 if (seq_overflow(m) || err) {
258 m->count = offs; 258 m->count = offs;
259 if (likely(err <= 0)) 259 if (likely(err <= 0))
260 break; 260 break;
261 } 261 }
262 pos = next; 262 pos = next;
263 } 263 }
264 m->op->stop(m, p); 264 m->op->stop(m, p);
265 n = min(m->count, size); 265 n = min(m->count, size);
266 err = copy_to_user(buf, m->buf, n); 266 err = copy_to_user(buf, m->buf, n);
267 if (err) 267 if (err)
268 goto Efault; 268 goto Efault;
269 copied += n; 269 copied += n;
270 m->count -= n; 270 m->count -= n;
271 if (m->count) 271 if (m->count)
272 m->from = n; 272 m->from = n;
273 else 273 else
274 pos++; 274 pos++;
275 m->index = pos; 275 m->index = pos;
276 Done: 276 Done:
277 if (!copied) 277 if (!copied)
278 copied = err; 278 copied = err;
279 else { 279 else {
280 *ppos += copied; 280 *ppos += copied;
281 m->read_pos += copied; 281 m->read_pos += copied;
282 } 282 }
283 file->f_version = m->version; 283 file->f_version = m->version;
284 mutex_unlock(&m->lock); 284 mutex_unlock(&m->lock);
285 return copied; 285 return copied;
286 Enomem: 286 Enomem:
287 err = -ENOMEM; 287 err = -ENOMEM;
288 goto Done; 288 goto Done;
289 Efault: 289 Efault:
290 err = -EFAULT; 290 err = -EFAULT;
291 goto Done; 291 goto Done;
292 } 292 }
293 EXPORT_SYMBOL(seq_read); 293 EXPORT_SYMBOL(seq_read);
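Since seq_read() is the ready-made ->read(), a virtual file normally just wires it into its file_operations next to seq_lseek() and seq_release(). A minimal sketch, assuming a hypothetical foo_seq_ops iterator:

    static int foo_open(struct inode *inode, struct file *file)
    {
            return seq_open(file, &foo_seq_ops); /* start/next/stop/show */
    }

    static const struct file_operations foo_fops = {
            .owner   = THIS_MODULE,
            .open    = foo_open,
            .read    = seq_read,    /* all the buffering above for free */
            .llseek  = seq_lseek,
            .release = seq_release,
    };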
294 294
295 /** 295 /**
296 * seq_lseek - ->llseek() method for sequential files. 296 * seq_lseek - ->llseek() method for sequential files.
297 * @file: the file in question 297 * @file: the file in question
298 * @offset: new position 298 * @offset: new position
299 * @origin: 0 for absolute, 1 for relative position 299 * @whence: 0 for absolute, 1 for relative position
300 * 300 *
301 * Ready-made ->f_op->llseek() 301 * Ready-made ->f_op->llseek()
302 */ 302 */
303 loff_t seq_lseek(struct file *file, loff_t offset, int origin) 303 loff_t seq_lseek(struct file *file, loff_t offset, int whence)
304 { 304 {
305 struct seq_file *m = file->private_data; 305 struct seq_file *m = file->private_data;
306 loff_t retval = -EINVAL; 306 loff_t retval = -EINVAL;
307 307
308 mutex_lock(&m->lock); 308 mutex_lock(&m->lock);
309 m->version = file->f_version; 309 m->version = file->f_version;
310 switch (origin) { 310 switch (whence) {
311 case 1: 311 case 1:
312 offset += file->f_pos; 312 offset += file->f_pos;
313 case 0: 313 case 0:
314 if (offset < 0) 314 if (offset < 0)
315 break; 315 break;
316 retval = offset; 316 retval = offset;
317 if (offset != m->read_pos) { 317 if (offset != m->read_pos) {
318 while ((retval=traverse(m, offset)) == -EAGAIN) 318 while ((retval=traverse(m, offset)) == -EAGAIN)
319 ; 319 ;
320 if (retval) { 320 if (retval) {
321 /* with extreme prejudice... */ 321 /* with extreme prejudice... */
322 file->f_pos = 0; 322 file->f_pos = 0;
323 m->read_pos = 0; 323 m->read_pos = 0;
324 m->version = 0; 324 m->version = 0;
325 m->index = 0; 325 m->index = 0;
326 m->count = 0; 326 m->count = 0;
327 } else { 327 } else {
328 m->read_pos = offset; 328 m->read_pos = offset;
329 retval = file->f_pos = offset; 329 retval = file->f_pos = offset;
330 } 330 }
331 } 331 }
332 } 332 }
333 file->f_version = m->version; 333 file->f_version = m->version;
334 mutex_unlock(&m->lock); 334 mutex_unlock(&m->lock);
335 return retval; 335 return retval;
336 } 336 }
337 EXPORT_SYMBOL(seq_lseek); 337 EXPORT_SYMBOL(seq_lseek);
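Note that the switch above honours only the numeric whence values 0 (SEEK_SET) and 1 (SEEK_CUR); anything else falls out of the switch and yields -EINVAL. A hedged userspace illustration, assuming fd is an open seq_file-backed descriptor such as /proc/mounts:

    #include <unistd.h>   /* lseek(), SEEK_SET/SEEK_CUR/SEEK_END */

    lseek(fd, 0, SEEK_SET);   /* ok: traverse() re-walks to the new offset */
    lseek(fd, 64, SEEK_CUR);  /* ok: offset += f_pos, then the same check */
    lseek(fd, 0, SEEK_END);   /* fails with EINVAL: no case 2 in the switch */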
338 338
339 /** 339 /**
340 * seq_release - free the structures associated with sequential file. 340 * seq_release - free the structures associated with sequential file.
341 * @file: file in question 341 * @file: file in question
342 * @inode: file->f_path.dentry->d_inode 342 * @inode: file->f_path.dentry->d_inode
343 * 343 *
344 * Frees the structures associated with sequential file; can be used 344 * Frees the structures associated with sequential file; can be used
345 * as ->f_op->release() if you don't have private data to destroy. 345 * as ->f_op->release() if you don't have private data to destroy.
346 */ 346 */
347 int seq_release(struct inode *inode, struct file *file) 347 int seq_release(struct inode *inode, struct file *file)
348 { 348 {
349 struct seq_file *m = file->private_data; 349 struct seq_file *m = file->private_data;
350 kfree(m->buf); 350 kfree(m->buf);
351 kfree(m); 351 kfree(m);
352 return 0; 352 return 0;
353 } 353 }
354 EXPORT_SYMBOL(seq_release); 354 EXPORT_SYMBOL(seq_release);
355 355
356 /** 356 /**
357 * seq_escape - print string into buffer, escaping some characters 357 * seq_escape - print string into buffer, escaping some characters
358 * @m: target buffer 358 * @m: target buffer
359 * @s: string 359 * @s: string
360 * @esc: set of characters that need escaping 360 * @esc: set of characters that need escaping
361 * 361 *
362 * Puts string into buffer, replacing each occurrence of character from 362 * Puts string into buffer, replacing each occurrence of character from
363 * @esc with usual octal escape. Returns 0 in case of success, -1 in 363 * @esc with usual octal escape. Returns 0 in case of success, -1 in
364 * case of overflow. 364 * case of overflow.
365 */ 365 */
366 int seq_escape(struct seq_file *m, const char *s, const char *esc) 366 int seq_escape(struct seq_file *m, const char *s, const char *esc)
367 { 367 {
368 char *end = m->buf + m->size; 368 char *end = m->buf + m->size;
369 char *p; 369 char *p;
370 char c; 370 char c;
371 371
372 for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { 372 for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
373 if (!strchr(esc, c)) { 373 if (!strchr(esc, c)) {
374 *p++ = c; 374 *p++ = c;
375 continue; 375 continue;
376 } 376 }
377 if (p + 3 < end) { 377 if (p + 3 < end) {
378 *p++ = '\\'; 378 *p++ = '\\';
379 *p++ = '0' + ((c & 0300) >> 6); 379 *p++ = '0' + ((c & 0300) >> 6);
380 *p++ = '0' + ((c & 070) >> 3); 380 *p++ = '0' + ((c & 070) >> 3);
381 *p++ = '0' + (c & 07); 381 *p++ = '0' + (c & 07);
382 continue; 382 continue;
383 } 383 }
384 seq_set_overflow(m); 384 seq_set_overflow(m);
385 return -1; 385 return -1;
386 } 386 }
387 m->count = p - m->buf; 387 m->count = p - m->buf;
388 return 0; 388 return 0;
389 } 389 }
390 EXPORT_SYMBOL(seq_escape); 390 EXPORT_SYMBOL(seq_escape);
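seq_escape() is what keeps records parseable when names contain separators, e.g. mount points with spaces in /proc/mounts. A hypothetical call from a ->show() method:

    /* "a b" is emitted as "a\040b"; name is a hypothetical string */
    seq_escape(m, name, " \t\n\\");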
391 391
392 int seq_vprintf(struct seq_file *m, const char *f, va_list args) 392 int seq_vprintf(struct seq_file *m, const char *f, va_list args)
393 { 393 {
394 int len; 394 int len;
395 395
396 if (m->count < m->size) { 396 if (m->count < m->size) {
397 len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); 397 len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
398 if (m->count + len < m->size) { 398 if (m->count + len < m->size) {
399 m->count += len; 399 m->count += len;
400 return 0; 400 return 0;
401 } 401 }
402 } 402 }
403 seq_set_overflow(m); 403 seq_set_overflow(m);
404 return -1; 404 return -1;
405 } 405 }
406 EXPORT_SYMBOL(seq_vprintf); 406 EXPORT_SYMBOL(seq_vprintf);
407 407
408 int seq_printf(struct seq_file *m, const char *f, ...) 408 int seq_printf(struct seq_file *m, const char *f, ...)
409 { 409 {
410 int ret; 410 int ret;
411 va_list args; 411 va_list args;
412 412
413 va_start(args, f); 413 va_start(args, f);
414 ret = seq_vprintf(m, f, args); 414 ret = seq_vprintf(m, f, args);
415 va_end(args); 415 va_end(args);
416 416
417 return ret; 417 return ret;
418 } 418 }
419 EXPORT_SYMBOL(seq_printf); 419 EXPORT_SYMBOL(seq_printf);
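A typical ->show() method builds its record with seq_printf(); on overflow it returns -1, seq_read() notices via seq_overflow(), doubles the buffer and retries the record. A sketch with hypothetical foo_* names:

    static int foo_show(struct seq_file *m, void *v)
    {
            struct foo_entry *e = v; /* hypothetical record type */

            seq_printf(m, "%s %u\n", e->name, e->count);
            return 0; /* overflow is handled by seq_read()'s retry loop */
    }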
420 420
421 /** 421 /**
422 * mangle_path - mangle and copy path to buffer beginning 422 * mangle_path - mangle and copy path to buffer beginning
423 * @s: buffer start 423 * @s: buffer start
424 * @p: beginning of path in above buffer 424 * @p: beginning of path in above buffer
425 * @esc: set of characters that need escaping 425 * @esc: set of characters that need escaping
426 * 426 *
427 * Copy the path from @p to @s, replacing each occurrence of character from 427 * Copy the path from @p to @s, replacing each occurrence of character from
428 * @esc with usual octal escape. 428 * @esc with usual octal escape.
429 * Returns pointer past last written character in @s, or NULL in case of 429 * Returns pointer past last written character in @s, or NULL in case of
430 * failure. 430 * failure.
431 */ 431 */
432 char *mangle_path(char *s, const char *p, const char *esc) 432 char *mangle_path(char *s, const char *p, const char *esc)
433 { 433 {
434 while (s <= p) { 434 while (s <= p) {
435 char c = *p++; 435 char c = *p++;
436 if (!c) { 436 if (!c) {
437 return s; 437 return s;
438 } else if (!strchr(esc, c)) { 438 } else if (!strchr(esc, c)) {
439 *s++ = c; 439 *s++ = c;
440 } else if (s + 4 > p) { 440 } else if (s + 4 > p) {
441 break; 441 break;
442 } else { 442 } else {
443 *s++ = '\\'; 443 *s++ = '\\';
444 *s++ = '0' + ((c & 0300) >> 6); 444 *s++ = '0' + ((c & 0300) >> 6);
445 *s++ = '0' + ((c & 070) >> 3); 445 *s++ = '0' + ((c & 070) >> 3);
446 *s++ = '0' + (c & 07); 446 *s++ = '0' + (c & 07);
447 } 447 }
448 } 448 }
449 return NULL; 449 return NULL;
450 } 450 }
451 EXPORT_SYMBOL(mangle_path); 451 EXPORT_SYMBOL(mangle_path);
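mangle_path() relies on the d_path() convention of composing the name at the *end* of the caller's buffer: it copies the string forward to the buffer start, and the s + 4 > p check guarantees the growing escaped copy never overtakes the source. A contrived sketch:

    char buf[64];
    char *p = buf + 40;                         /* pretend d_path() returned this */
    strcpy(p, "/tmp/a b");
    char *end = mangle_path(buf, p, " \t\n\\");
    /* buf now begins with "/tmp/a\040b"; end points just past the 'b',
       so the caller takes end - buf as the length */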
452 452
453 /** 453 /**
454 * seq_path - seq_file interface to print a pathname 454 * seq_path - seq_file interface to print a pathname
455 * @m: the seq_file handle 455 * @m: the seq_file handle
456 * @path: the struct path to print 456 * @path: the struct path to print
457 * @esc: set of characters to escape in the output 457 * @esc: set of characters to escape in the output
458 * 458 *
459 * Returns the absolute path of 'path', as represented by the 459 * Returns the absolute path of 'path', as represented by the
460 * dentry / mnt pair in the path parameter. 460 * dentry / mnt pair in the path parameter.
461 */ 461 */
462 int seq_path(struct seq_file *m, const struct path *path, const char *esc) 462 int seq_path(struct seq_file *m, const struct path *path, const char *esc)
463 { 463 {
464 char *buf; 464 char *buf;
465 size_t size = seq_get_buf(m, &buf); 465 size_t size = seq_get_buf(m, &buf);
466 int res = -1; 466 int res = -1;
467 467
468 if (size) { 468 if (size) {
469 char *p = d_path(path, buf, size); 469 char *p = d_path(path, buf, size);
470 if (!IS_ERR(p)) { 470 if (!IS_ERR(p)) {
471 char *end = mangle_path(buf, p, esc); 471 char *end = mangle_path(buf, p, esc);
472 if (end) 472 if (end)
473 res = end - buf; 473 res = end - buf;
474 } 474 }
475 } 475 }
476 seq_commit(m, res); 476 seq_commit(m, res);
477 477
478 return res; 478 return res;
479 } 479 }
480 EXPORT_SYMBOL(seq_path); 480 EXPORT_SYMBOL(seq_path);
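So a ->show() method that wants to print the pathname of an open file, escaping whitespace, might do (hypothetical fragment):

    seq_path(m, &file->f_path, " \t\n\\");
    seq_putc(m, '\n');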
481 481
482 /* 482 /*
483 * Same as seq_path, but relative to supplied root. 483 * Same as seq_path, but relative to supplied root.
484 */ 484 */
485 int seq_path_root(struct seq_file *m, const struct path *path, 485 int seq_path_root(struct seq_file *m, const struct path *path,
486 const struct path *root, const char *esc) 486 const struct path *root, const char *esc)
487 { 487 {
488 char *buf; 488 char *buf;
489 size_t size = seq_get_buf(m, &buf); 489 size_t size = seq_get_buf(m, &buf);
490 int res = -ENAMETOOLONG; 490 int res = -ENAMETOOLONG;
491 491
492 if (size) { 492 if (size) {
493 char *p; 493 char *p;
494 494
495 p = __d_path(path, root, buf, size); 495 p = __d_path(path, root, buf, size);
496 if (!p) 496 if (!p)
497 return SEQ_SKIP; 497 return SEQ_SKIP;
498 res = PTR_ERR(p); 498 res = PTR_ERR(p);
499 if (!IS_ERR(p)) { 499 if (!IS_ERR(p)) {
500 char *end = mangle_path(buf, p, esc); 500 char *end = mangle_path(buf, p, esc);
501 if (end) 501 if (end)
502 res = end - buf; 502 res = end - buf;
503 else 503 else
504 res = -ENAMETOOLONG; 504 res = -ENAMETOOLONG;
505 } 505 }
506 } 506 }
507 seq_commit(m, res); 507 seq_commit(m, res);
508 508
509 return res < 0 && res != -ENAMETOOLONG ? res : 0; 509 return res < 0 && res != -ENAMETOOLONG ? res : 0;
510 } 510 }
511 511
512 /* 512 /*
513 * returns the path of the 'dentry' from the root of its filesystem. 513 * returns the path of the 'dentry' from the root of its filesystem.
514 */ 514 */
515 int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) 515 int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
516 { 516 {
517 char *buf; 517 char *buf;
518 size_t size = seq_get_buf(m, &buf); 518 size_t size = seq_get_buf(m, &buf);
519 int res = -1; 519 int res = -1;
520 520
521 if (size) { 521 if (size) {
522 char *p = dentry_path(dentry, buf, size); 522 char *p = dentry_path(dentry, buf, size);
523 if (!IS_ERR(p)) { 523 if (!IS_ERR(p)) {
524 char *end = mangle_path(buf, p, esc); 524 char *end = mangle_path(buf, p, esc);
525 if (end) 525 if (end)
526 res = end - buf; 526 res = end - buf;
527 } 527 }
528 } 528 }
529 seq_commit(m, res); 529 seq_commit(m, res);
530 530
531 return res; 531 return res;
532 } 532 }
533 533
534 int seq_bitmap(struct seq_file *m, const unsigned long *bits, 534 int seq_bitmap(struct seq_file *m, const unsigned long *bits,
535 unsigned int nr_bits) 535 unsigned int nr_bits)
536 { 536 {
537 if (m->count < m->size) { 537 if (m->count < m->size) {
538 int len = bitmap_scnprintf(m->buf + m->count, 538 int len = bitmap_scnprintf(m->buf + m->count,
539 m->size - m->count, bits, nr_bits); 539 m->size - m->count, bits, nr_bits);
540 if (m->count + len < m->size) { 540 if (m->count + len < m->size) {
541 m->count += len; 541 m->count += len;
542 return 0; 542 return 0;
543 } 543 }
544 } 544 }
545 seq_set_overflow(m); 545 seq_set_overflow(m);
546 return -1; 546 return -1;
547 } 547 }
548 EXPORT_SYMBOL(seq_bitmap); 548 EXPORT_SYMBOL(seq_bitmap);
549 549
550 int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, 550 int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
551 unsigned int nr_bits) 551 unsigned int nr_bits)
552 { 552 {
553 if (m->count < m->size) { 553 if (m->count < m->size) {
554 int len = bitmap_scnlistprintf(m->buf + m->count, 554 int len = bitmap_scnlistprintf(m->buf + m->count,
555 m->size - m->count, bits, nr_bits); 555 m->size - m->count, bits, nr_bits);
556 if (m->count + len < m->size) { 556 if (m->count + len < m->size) {
557 m->count += len; 557 m->count += len;
558 return 0; 558 return 0;
559 } 559 }
560 } 560 }
561 seq_set_overflow(m); 561 seq_set_overflow(m);
562 return -1; 562 return -1;
563 } 563 }
564 EXPORT_SYMBOL(seq_bitmap_list); 564 EXPORT_SYMBOL(seq_bitmap_list);
565 565
566 static void *single_start(struct seq_file *p, loff_t *pos) 566 static void *single_start(struct seq_file *p, loff_t *pos)
567 { 567 {
568 return NULL + (*pos == 0); 568 return NULL + (*pos == 0);
569 } 569 }
570 570
571 static void *single_next(struct seq_file *p, void *v, loff_t *pos) 571 static void *single_next(struct seq_file *p, void *v, loff_t *pos)
572 { 572 {
573 ++*pos; 573 ++*pos;
574 return NULL; 574 return NULL;
575 } 575 }
576 576
577 static void single_stop(struct seq_file *p, void *v) 577 static void single_stop(struct seq_file *p, void *v)
578 { 578 {
579 } 579 }
580 580
581 int single_open(struct file *file, int (*show)(struct seq_file *, void *), 581 int single_open(struct file *file, int (*show)(struct seq_file *, void *),
582 void *data) 582 void *data)
583 { 583 {
584 struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); 584 struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
585 int res = -ENOMEM; 585 int res = -ENOMEM;
586 586
587 if (op) { 587 if (op) {
588 op->start = single_start; 588 op->start = single_start;
589 op->next = single_next; 589 op->next = single_next;
590 op->stop = single_stop; 590 op->stop = single_stop;
591 op->show = show; 591 op->show = show;
592 res = seq_open(file, op); 592 res = seq_open(file, op);
593 if (!res) 593 if (!res)
594 ((struct seq_file *)file->private_data)->private = data; 594 ((struct seq_file *)file->private_data)->private = data;
595 else 595 else
596 kfree(op); 596 kfree(op);
597 } 597 }
598 return res; 598 return res;
599 } 599 }
600 EXPORT_SYMBOL(single_open); 600 EXPORT_SYMBOL(single_open);
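single_open() is the degenerate case: one show() call produces the entire file, since single_start() returns a non-NULL token only for pos 0. The canonical pairing, with hypothetical foo_* names:

    static int foo_show(struct seq_file *m, void *v)
    {
            seq_puts(m, "hello\n");
            return 0;
    }

    static int foo_open(struct inode *inode, struct file *file)
    {
            return single_open(file, foo_show, NULL);
    }

    static const struct file_operations foo_fops = {
            .owner   = THIS_MODULE,
            .open    = foo_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release, /* frees the ops single_open() allocated */
    };

Such a file would typically be registered with something like proc_create("foo", 0444, NULL, &foo_fops).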
601 601
602 int single_release(struct inode *inode, struct file *file) 602 int single_release(struct inode *inode, struct file *file)
603 { 603 {
604 const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; 604 const struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
605 int res = seq_release(inode, file); 605 int res = seq_release(inode, file);
606 kfree(op); 606 kfree(op);
607 return res; 607 return res;
608 } 608 }
609 EXPORT_SYMBOL(single_release); 609 EXPORT_SYMBOL(single_release);
610 610
611 int seq_release_private(struct inode *inode, struct file *file) 611 int seq_release_private(struct inode *inode, struct file *file)
612 { 612 {
613 struct seq_file *seq = file->private_data; 613 struct seq_file *seq = file->private_data;
614 614
615 kfree(seq->private); 615 kfree(seq->private);
616 seq->private = NULL; 616 seq->private = NULL;
617 return seq_release(inode, file); 617 return seq_release(inode, file);
618 } 618 }
619 EXPORT_SYMBOL(seq_release_private); 619 EXPORT_SYMBOL(seq_release_private);
620 620
621 void *__seq_open_private(struct file *f, const struct seq_operations *ops, 621 void *__seq_open_private(struct file *f, const struct seq_operations *ops,
622 int psize) 622 int psize)
623 { 623 {
624 int rc; 624 int rc;
625 void *private; 625 void *private;
626 struct seq_file *seq; 626 struct seq_file *seq;
627 627
628 private = kzalloc(psize, GFP_KERNEL); 628 private = kzalloc(psize, GFP_KERNEL);
629 if (private == NULL) 629 if (private == NULL)
630 goto out; 630 goto out;
631 631
632 rc = seq_open(f, ops); 632 rc = seq_open(f, ops);
633 if (rc < 0) 633 if (rc < 0)
634 goto out_free; 634 goto out_free;
635 635
636 seq = f->private_data; 636 seq = f->private_data;
637 seq->private = private; 637 seq->private = private;
638 return private; 638 return private;
639 639
640 out_free: 640 out_free:
641 kfree(private); 641 kfree(private);
642 out: 642 out:
643 return NULL; 643 return NULL;
644 } 644 }
645 EXPORT_SYMBOL(__seq_open_private); 645 EXPORT_SYMBOL(__seq_open_private);
646 646
647 int seq_open_private(struct file *filp, const struct seq_operations *ops, 647 int seq_open_private(struct file *filp, const struct seq_operations *ops,
648 int psize) 648 int psize)
649 { 649 {
650 return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; 650 return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM;
651 } 651 }
652 EXPORT_SYMBOL(seq_open_private); 652 EXPORT_SYMBOL(seq_open_private);
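seq_open_private() rolls the common "iterator needs a scratch structure" pattern into one call: the zeroed allocation lands in seq->private and is freed again by seq_release_private(). A sketch, with struct foo_iter hypothetical:

    static int foo_open(struct inode *inode, struct file *file)
    {
            /* kzalloc()s a struct foo_iter and parks it in seq->private */
            return seq_open_private(file, &foo_seq_ops,
                                    sizeof(struct foo_iter));
    }
    /* pair with .release = seq_release_private in the file_operations */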
653 653
654 int seq_putc(struct seq_file *m, char c) 654 int seq_putc(struct seq_file *m, char c)
655 { 655 {
656 if (m->count < m->size) { 656 if (m->count < m->size) {
657 m->buf[m->count++] = c; 657 m->buf[m->count++] = c;
658 return 0; 658 return 0;
659 } 659 }
660 return -1; 660 return -1;
661 } 661 }
662 EXPORT_SYMBOL(seq_putc); 662 EXPORT_SYMBOL(seq_putc);
663 663
664 int seq_puts(struct seq_file *m, const char *s) 664 int seq_puts(struct seq_file *m, const char *s)
665 { 665 {
666 int len = strlen(s); 666 int len = strlen(s);
667 if (m->count + len < m->size) { 667 if (m->count + len < m->size) {
668 memcpy(m->buf + m->count, s, len); 668 memcpy(m->buf + m->count, s, len);
669 m->count += len; 669 m->count += len;
670 return 0; 670 return 0;
671 } 671 }
672 seq_set_overflow(m); 672 seq_set_overflow(m);
673 return -1; 673 return -1;
674 } 674 }
675 EXPORT_SYMBOL(seq_puts); 675 EXPORT_SYMBOL(seq_puts);
676 676
677 /* 677 /*
678 * A helper routine for putting decimal numbers without the rich format of printf(). 678 * A helper routine for putting decimal numbers without the rich format of printf().
679 * Only 'unsigned long long' is supported. 679 * Only 'unsigned long long' is supported.
680 * This routine will put one byte delimiter + number into seq_file. 680 * This routine will put one byte delimiter + number into seq_file.
681 * This routine is very quick when you show lots of numbers. 681 * This routine is very quick when you show lots of numbers.
682 * In usual cases, it will be better to use seq_printf(). It's easier to read. 682 * In usual cases, it will be better to use seq_printf(). It's easier to read.
683 */ 683 */
684 int seq_put_decimal_ull(struct seq_file *m, char delimiter, 684 int seq_put_decimal_ull(struct seq_file *m, char delimiter,
685 unsigned long long num) 685 unsigned long long num)
686 { 686 {
687 int len; 687 int len;
688 688
689 if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ 689 if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
690 goto overflow; 690 goto overflow;
691 691
692 if (delimiter) 692 if (delimiter)
693 m->buf[m->count++] = delimiter; 693 m->buf[m->count++] = delimiter;
694 694
695 if (num < 10) { 695 if (num < 10) {
696 m->buf[m->count++] = num + '0'; 696 m->buf[m->count++] = num + '0';
697 return 0; 697 return 0;
698 } 698 }
699 699
700 len = num_to_str(m->buf + m->count, m->size - m->count, num); 700 len = num_to_str(m->buf + m->count, m->size - m->count, num);
701 if (!len) 701 if (!len)
702 goto overflow; 702 goto overflow;
703 m->count += len; 703 m->count += len;
704 return 0; 704 return 0;
705 overflow: 705 overflow:
706 seq_set_overflow(m); 706 seq_set_overflow(m);
707 return -1; 707 return -1;
708 } 708 }
709 EXPORT_SYMBOL(seq_put_decimal_ull); 709 EXPORT_SYMBOL(seq_put_decimal_ull);
710 710
711 int seq_put_decimal_ll(struct seq_file *m, char delimiter, 711 int seq_put_decimal_ll(struct seq_file *m, char delimiter,
712 long long num) 712 long long num)
713 { 713 {
714 if (num < 0) { 714 if (num < 0) {
715 if (m->count + 3 >= m->size) { 715 if (m->count + 3 >= m->size) {
716 seq_set_overflow(m); 716 seq_set_overflow(m);
717 return -1; 717 return -1;
718 } 718 }
719 if (delimiter) 719 if (delimiter)
720 m->buf[m->count++] = delimiter; 720 m->buf[m->count++] = delimiter;
721 num = -num; 721 num = -num;
722 delimiter = '-'; 722 delimiter = '-';
723 } 723 }
724 return seq_put_decimal_ull(m, delimiter, num); 724 return seq_put_decimal_ull(m, delimiter, num);
725 725
726 } 726 }
727 EXPORT_SYMBOL(seq_put_decimal_ll); 727 EXPORT_SYMBOL(seq_put_decimal_ll);
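These two are the fast path for hot files such as /proc/<pid>/stat, where a single line carries dozens of numbers. A hedged sketch of that style, with hypothetical counter variables:

    /* emit space-separated fields without going through vsnprintf() */
    seq_put_decimal_ull(m, ' ', min_flt);
    seq_put_decimal_ull(m, ' ', maj_flt);
    seq_put_decimal_ll(m, ' ', nice);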
728 728
729 /** 729 /**
730 * seq_write - write arbitrary data to buffer 730 * seq_write - write arbitrary data to buffer
731 * @seq: seq_file identifying the buffer to which data should be written 731 * @seq: seq_file identifying the buffer to which data should be written
732 * @data: data address 732 * @data: data address
733 * @len: number of bytes 733 * @len: number of bytes
734 * 734 *
735 * Return 0 on success, non-zero otherwise. 735 * Return 0 on success, non-zero otherwise.
736 */ 736 */
737 int seq_write(struct seq_file *seq, const void *data, size_t len) 737 int seq_write(struct seq_file *seq, const void *data, size_t len)
738 { 738 {
739 if (seq->count + len < seq->size) { 739 if (seq->count + len < seq->size) {
740 memcpy(seq->buf + seq->count, data, len); 740 memcpy(seq->buf + seq->count, data, len);
741 seq->count += len; 741 seq->count += len;
742 return 0; 742 return 0;
743 } 743 }
744 seq_set_overflow(seq); 744 seq_set_overflow(seq);
745 return -1; 745 return -1;
746 } 746 }
747 EXPORT_SYMBOL(seq_write); 747 EXPORT_SYMBOL(seq_write);
748 748
749 struct list_head *seq_list_start(struct list_head *head, loff_t pos) 749 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
750 { 750 {
751 struct list_head *lh; 751 struct list_head *lh;
752 752
753 list_for_each(lh, head) 753 list_for_each(lh, head)
754 if (pos-- == 0) 754 if (pos-- == 0)
755 return lh; 755 return lh;
756 756
757 return NULL; 757 return NULL;
758 } 758 }
759 EXPORT_SYMBOL(seq_list_start); 759 EXPORT_SYMBOL(seq_list_start);
760 760
761 struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) 761 struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
762 { 762 {
763 if (!pos) 763 if (!pos)
764 return head; 764 return head;
765 765
766 return seq_list_start(head, pos - 1); 766 return seq_list_start(head, pos - 1);
767 } 767 }
768 EXPORT_SYMBOL(seq_list_start_head); 768 EXPORT_SYMBOL(seq_list_start_head);
769 769
770 struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) 770 struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
771 { 771 {
772 struct list_head *lh; 772 struct list_head *lh;
773 773
774 lh = ((struct list_head *)v)->next; 774 lh = ((struct list_head *)v)->next;
775 ++*ppos; 775 ++*ppos;
776 return lh == head ? NULL : lh; 776 return lh == head ? NULL : lh;
777 } 777 }
778 EXPORT_SYMBOL(seq_list_next); 778 EXPORT_SYMBOL(seq_list_next);
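The seq_list_* helpers make a plain list_head-backed table trivially exportable; start() re-walks the list to *pos on every cycle, which is O(n) but correct as long as a lock is held from start() to stop(). A sketch, assuming hypothetical foo_list/foo_lock:

    static void *foo_start(struct seq_file *m, loff_t *pos)
    {
            mutex_lock(&foo_lock); /* held for the whole start..stop cycle */
            return seq_list_start(&foo_list, *pos);
    }

    static void *foo_next(struct seq_file *m, void *v, loff_t *pos)
    {
            return seq_list_next(v, &foo_list, pos);
    }

    static void foo_stop(struct seq_file *m, void *v)
    {
            mutex_unlock(&foo_lock);
    }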
779 779
780 /** 780 /**
781 * seq_hlist_start - start an iteration of a hlist 781 * seq_hlist_start - start an iteration of a hlist
782 * @head: the head of the hlist 782 * @head: the head of the hlist
783 * @pos: the start position of the sequence 783 * @pos: the start position of the sequence
784 * 784 *
785 * Called at seq_file->op->start(). 785 * Called at seq_file->op->start().
786 */ 786 */
787 struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) 787 struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
788 { 788 {
789 struct hlist_node *node; 789 struct hlist_node *node;
790 790
791 hlist_for_each(node, head) 791 hlist_for_each(node, head)
792 if (pos-- == 0) 792 if (pos-- == 0)
793 return node; 793 return node;
794 return NULL; 794 return NULL;
795 } 795 }
796 EXPORT_SYMBOL(seq_hlist_start); 796 EXPORT_SYMBOL(seq_hlist_start);
797 797
798 /** 798 /**
799 * seq_hlist_start_head - start an iteration of a hlist 799 * seq_hlist_start_head - start an iteration of a hlist
800 * @head: the head of the hlist 800 * @head: the head of the hlist
801 * @pos: the start position of the sequence 801 * @pos: the start position of the sequence
802 * 802 *
803 * Called at seq_file->op->start(). Call this function if you want to 803 * Called at seq_file->op->start(). Call this function if you want to
804 * print a header at the top of the output. 804 * print a header at the top of the output.
805 */ 805 */
806 struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) 806 struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
807 { 807 {
808 if (!pos) 808 if (!pos)
809 return SEQ_START_TOKEN; 809 return SEQ_START_TOKEN;
810 810
811 return seq_hlist_start(head, pos - 1); 811 return seq_hlist_start(head, pos - 1);
812 } 812 }
813 EXPORT_SYMBOL(seq_hlist_start_head); 813 EXPORT_SYMBOL(seq_hlist_start_head);
814 814
815 /** 815 /**
816 * seq_hlist_next - move to the next position of the hlist 816 * seq_hlist_next - move to the next position of the hlist
817 * @v: the current iterator 817 * @v: the current iterator
818 * @head: the head of the hlist 818 * @head: the head of the hlist
819 * @ppos: the current position 819 * @ppos: the current position
820 * 820 *
821 * Called at seq_file->op->next(). 821 * Called at seq_file->op->next().
822 */ 822 */
823 struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, 823 struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
824 loff_t *ppos) 824 loff_t *ppos)
825 { 825 {
826 struct hlist_node *node = v; 826 struct hlist_node *node = v;
827 827
828 ++*ppos; 828 ++*ppos;
829 if (v == SEQ_START_TOKEN) 829 if (v == SEQ_START_TOKEN)
830 return head->first; 830 return head->first;
831 else 831 else
832 return node->next; 832 return node->next;
833 } 833 }
834 EXPORT_SYMBOL(seq_hlist_next); 834 EXPORT_SYMBOL(seq_hlist_next);
835 835
836 /** 836 /**
837 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU 837 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
838 * @head: the head of the hlist 838 * @head: the head of the hlist
839 * @pos: the start position of the sequence 839 * @pos: the start position of the sequence
840 * 840 *
841 * Called at seq_file->op->start(). 841 * Called at seq_file->op->start().
842 * 842 *
843 * This list-traversal primitive may safely run concurrently with 843 * This list-traversal primitive may safely run concurrently with
844 * the _rcu list-mutation primitives such as hlist_add_head_rcu() 844 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
845 * as long as the traversal is guarded by rcu_read_lock(). 845 * as long as the traversal is guarded by rcu_read_lock().
846 */ 846 */
847 struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, 847 struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
848 loff_t pos) 848 loff_t pos)
849 { 849 {
850 struct hlist_node *node; 850 struct hlist_node *node;
851 851
852 __hlist_for_each_rcu(node, head) 852 __hlist_for_each_rcu(node, head)
853 if (pos-- == 0) 853 if (pos-- == 0)
854 return node; 854 return node;
855 return NULL; 855 return NULL;
856 } 856 }
857 EXPORT_SYMBOL(seq_hlist_start_rcu); 857 EXPORT_SYMBOL(seq_hlist_start_rcu);
858 858
859 /** 859 /**
860 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU 860 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
861 * @head: the head of the hlist 861 * @head: the head of the hlist
862 * @pos: the start position of the sequence 862 * @pos: the start position of the sequence
863 * 863 *
864 * Called at seq_file->op->start(). Call this function if you want to 864 * Called at seq_file->op->start(). Call this function if you want to
865 * print a header at the top of the output. 865 * print a header at the top of the output.
866 * 866 *
867 * This list-traversal primitive may safely run concurrently with 867 * This list-traversal primitive may safely run concurrently with
868 * the _rcu list-mutation primitives such as hlist_add_head_rcu() 868 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
869 * as long as the traversal is guarded by rcu_read_lock(). 869 * as long as the traversal is guarded by rcu_read_lock().
870 */ 870 */
871 struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, 871 struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
872 loff_t pos) 872 loff_t pos)
873 { 873 {
874 if (!pos) 874 if (!pos)
875 return SEQ_START_TOKEN; 875 return SEQ_START_TOKEN;
876 876
877 return seq_hlist_start_rcu(head, pos - 1); 877 return seq_hlist_start_rcu(head, pos - 1);
878 } 878 }
879 EXPORT_SYMBOL(seq_hlist_start_head_rcu); 879 EXPORT_SYMBOL(seq_hlist_start_head_rcu);
880 880
881 /** 881 /**
882 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU 882 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
883 * @v: the current iterator 883 * @v: the current iterator
884 * @head: the head of the hlist 884 * @head: the head of the hlist
885 * @ppos: the current position 885 * @ppos: the current position
886 * 886 *
887 * Called at seq_file->op->next(). 887 * Called at seq_file->op->next().
888 * 888 *
889 * This list-traversal primitive may safely run concurrently with 889 * This list-traversal primitive may safely run concurrently with
890 * the _rcu list-mutation primitives such as hlist_add_head_rcu() 890 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
891 * as long as the traversal is guarded by rcu_read_lock(). 891 * as long as the traversal is guarded by rcu_read_lock().
892 */ 892 */
893 struct hlist_node *seq_hlist_next_rcu(void *v, 893 struct hlist_node *seq_hlist_next_rcu(void *v,
894 struct hlist_head *head, 894 struct hlist_head *head,
895 loff_t *ppos) 895 loff_t *ppos)
896 { 896 {
897 struct hlist_node *node = v; 897 struct hlist_node *node = v;
898 898
899 ++*ppos; 899 ++*ppos;
900 if (v == SEQ_START_TOKEN) 900 if (v == SEQ_START_TOKEN)
901 return rcu_dereference(head->first); 901 return rcu_dereference(head->first);
902 else 902 else
903 return rcu_dereference(node->next); 903 return rcu_dereference(node->next);
904 } 904 }
905 EXPORT_SYMBOL(seq_hlist_next_rcu); 905 EXPORT_SYMBOL(seq_hlist_next_rcu);
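The RCU variants follow the same shape, with rcu_read_lock() taken in start() and dropped in stop(); the _head variant's SEQ_START_TOKEN lets show() emit a header line first. A sketch over a hypothetical foo_hash hlist:

    static void *foo_start(struct seq_file *m, loff_t *pos)
    {
            rcu_read_lock();
            return seq_hlist_start_head_rcu(&foo_hash, *pos);
    }

    static void *foo_next(struct seq_file *m, void *v, loff_t *pos)
    {
            return seq_hlist_next_rcu(v, &foo_hash, pos);
    }

    static void foo_stop(struct seq_file *m, void *v)
    {
            rcu_read_unlock();
    }

    static int foo_show(struct seq_file *m, void *v)
    {
            if (v == SEQ_START_TOKEN) {
                    seq_puts(m, "name count\n");
                    return 0;
            }
            /* otherwise v is the hlist_node embedded in a real record */
            return 0;
    }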
906 906
1 /* This file is part of UBIFS. 1 /* This file is part of UBIFS.
2 * 2 *
3 * Copyright (C) 2006-2008 Nokia Corporation. 3 * Copyright (C) 2006-2008 Nokia Corporation.
4 * Copyright (C) 2006, 2007 University of Szeged, Hungary 4 * Copyright (C) 2006, 2007 University of Szeged, Hungary
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by 7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation. 8 * the Free Software Foundation.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT 10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details. 13 * more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License along with 15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51 16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 * 18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём) 19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter 20 * Adrian Hunter
21 * Zoltan Sogor 21 * Zoltan Sogor
22 */ 22 */
23 23
24 /* 24 /*
25 * This file implements directory operations. 25 * This file implements directory operations.
26 * 26 *
27 * All FS operations in this file allocate budget before writing anything to the 27 * All FS operations in this file allocate budget before writing anything to the
28 * media. If they fail to allocate it, the error is returned. The only 28 * media. If they fail to allocate it, the error is returned. The only
29 * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()', which keep working even 29 * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()', which keep working even
30 * if they are unable to allocate the budget, because an %-ENOSPC failure on 30 * if they are unable to allocate the budget, because an %-ENOSPC failure on
31 * deletion is not what users usually expect. The UBIFS budgeting subsystem has some 31 * deletion is not what users usually expect. The UBIFS budgeting subsystem has some
32 * space reserved for these purposes. 32 * space reserved for these purposes.
33 * 33 *
34 * All operations in this file write all inodes which they change straight 34 * All operations in this file write all inodes which they change straight
35 * away, instead of marking them dirty. For example, 'ubifs_link()' changes 35 * away, instead of marking them dirty. For example, 'ubifs_link()' changes
36 * @i_size of the parent inode and writes the parent inode together with the 36 * @i_size of the parent inode and writes the parent inode together with the
37 * target inode. This was done to simplify file-system recovery which would 37 * target inode. This was done to simplify file-system recovery which would
38 * otherwise be very difficult to do. The only exception is rename, which marks 38 * otherwise be very difficult to do. The only exception is rename, which marks
39 * the re-named inode dirty (because its @i_ctime is updated) but does not 39 * the re-named inode dirty (because its @i_ctime is updated) but does not
40 * write it to the media. 40 * write it to the media.
41 */ 41 */
42 42
43 #include "ubifs.h" 43 #include "ubifs.h"
44 44
45 /** 45 /**
46 * inherit_flags - inherit flags of the parent inode. 46 * inherit_flags - inherit flags of the parent inode.
47 * @dir: parent inode 47 * @dir: parent inode
48 * @mode: new inode mode flags 48 * @mode: new inode mode flags
49 * 49 *
50 * This is a helper function for 'ubifs_new_inode()' which inherits flags of the 50 * This is a helper function for 'ubifs_new_inode()' which inherits flags of the
51 * parent directory inode @dir. UBIFS inodes inherit the following flags: 51 * parent directory inode @dir. UBIFS inodes inherit the following flags:
52 * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on a 52 * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on a
53 * sub-directory basis; 53 * sub-directory basis;
54 * o %UBIFS_SYNC_FL - useful for the same reasons; 54 * o %UBIFS_SYNC_FL - useful for the same reasons;
55 * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories. 55 * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories.
56 * 56 *
57 * This function returns the inherited flags. 57 * This function returns the inherited flags.
58 */ 58 */
59 static int inherit_flags(const struct inode *dir, umode_t mode) 59 static int inherit_flags(const struct inode *dir, umode_t mode)
60 { 60 {
61 int flags; 61 int flags;
62 const struct ubifs_inode *ui = ubifs_inode(dir); 62 const struct ubifs_inode *ui = ubifs_inode(dir);
63 63
64 if (!S_ISDIR(dir->i_mode)) 64 if (!S_ISDIR(dir->i_mode))
65 /* 65 /*
66 * The parent is not a directory, which means that an extended 66 * The parent is not a directory, which means that an extended
67 * attribute inode is being created. No flags. 67 * attribute inode is being created. No flags.
68 */ 68 */
69 return 0; 69 return 0;
70 70
71 flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL); 71 flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL);
72 if (!S_ISDIR(mode)) 72 if (!S_ISDIR(mode))
73 /* The "DIRSYNC" flag only applies to directories */ 73 /* The "DIRSYNC" flag only applies to directories */
74 flags &= ~UBIFS_DIRSYNC_FL; 74 flags &= ~UBIFS_DIRSYNC_FL;
75 return flags; 75 return flags;
76 } 76 }
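For instance, a regular file created in a directory that carries both the compression and DIRSYNC flags inherits only the former (hypothetical illustration):

    /* dir's flags include UBIFS_COMPR_FL | UBIFS_DIRSYNC_FL */
    flags = inherit_flags(dir, S_IFREG | 0644);
    /* flags == UBIFS_COMPR_FL: DIRSYNC is masked off for non-directories */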
77 77
78 /** 78 /**
79 * ubifs_new_inode - allocate new UBIFS inode object. 79 * ubifs_new_inode - allocate new UBIFS inode object.
80 * @c: UBIFS file-system description object 80 * @c: UBIFS file-system description object
81 * @dir: parent directory inode 81 * @dir: parent directory inode
82 * @mode: inode mode flags 82 * @mode: inode mode flags
83 * 83 *
84 * This function finds an unused inode number, allocates a new inode and 84 * This function finds an unused inode number, allocates a new inode and
85 * initializes it. Returns the new inode in case of success and an error code in 85 * initializes it. Returns the new inode in case of success and an error code in
86 * case of failure. 86 * case of failure.
87 */ 87 */
88 struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, 88 struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
89 umode_t mode) 89 umode_t mode)
90 { 90 {
91 struct inode *inode; 91 struct inode *inode;
92 struct ubifs_inode *ui; 92 struct ubifs_inode *ui;
93 93
94 inode = new_inode(c->vfs_sb); 94 inode = new_inode(c->vfs_sb);
95 ui = ubifs_inode(inode); 95 ui = ubifs_inode(inode);
96 if (!inode) 96 if (!inode)
97 return ERR_PTR(-ENOMEM); 97 return ERR_PTR(-ENOMEM);
98 98
99 /* 99 /*
100 * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and 100 * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and
101 * marking them dirty in file write path (see 'file_update_time()'). 101 * marking them dirty in file write path (see 'file_update_time()').
102 * UBIFS has to fully control "clean <-> dirty" transitions of inodes 102 * UBIFS has to fully control "clean <-> dirty" transitions of inodes
103 * to make budgeting work. 103 * to make budgeting work.
104 */ 104 */
105 inode->i_flags |= S_NOCMTIME; 105 inode->i_flags |= S_NOCMTIME;
106 106
107 inode_init_owner(inode, dir, mode); 107 inode_init_owner(inode, dir, mode);
108 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
109 ubifs_current_time(inode); 109 ubifs_current_time(inode);
110 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
111 /* Disable readahead */ 111 /* Disable readahead */
112 inode->i_mapping->backing_dev_info = &c->bdi; 112 inode->i_mapping->backing_dev_info = &c->bdi;
113 113
114 switch (mode & S_IFMT) { 114 switch (mode & S_IFMT) {
115 case S_IFREG: 115 case S_IFREG:
116 inode->i_mapping->a_ops = &ubifs_file_address_operations; 116 inode->i_mapping->a_ops = &ubifs_file_address_operations;
117 inode->i_op = &ubifs_file_inode_operations; 117 inode->i_op = &ubifs_file_inode_operations;
118 inode->i_fop = &ubifs_file_operations; 118 inode->i_fop = &ubifs_file_operations;
119 break; 119 break;
120 case S_IFDIR: 120 case S_IFDIR:
121 inode->i_op = &ubifs_dir_inode_operations; 121 inode->i_op = &ubifs_dir_inode_operations;
122 inode->i_fop = &ubifs_dir_operations; 122 inode->i_fop = &ubifs_dir_operations;
123 inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ; 123 inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
124 break; 124 break;
125 case S_IFLNK: 125 case S_IFLNK:
126 inode->i_op = &ubifs_symlink_inode_operations; 126 inode->i_op = &ubifs_symlink_inode_operations;
127 break; 127 break;
128 case S_IFSOCK: 128 case S_IFSOCK:
129 case S_IFIFO: 129 case S_IFIFO:
130 case S_IFBLK: 130 case S_IFBLK:
131 case S_IFCHR: 131 case S_IFCHR:
132 inode->i_op = &ubifs_file_inode_operations; 132 inode->i_op = &ubifs_file_inode_operations;
133 break; 133 break;
134 default: 134 default:
135 BUG(); 135 BUG();
136 } 136 }
137 137
138 ui->flags = inherit_flags(dir, mode); 138 ui->flags = inherit_flags(dir, mode);
139 ubifs_set_inode_flags(inode); 139 ubifs_set_inode_flags(inode);
140 if (S_ISREG(mode)) 140 if (S_ISREG(mode))
141 ui->compr_type = c->default_compr; 141 ui->compr_type = c->default_compr;
142 else 142 else
143 ui->compr_type = UBIFS_COMPR_NONE; 143 ui->compr_type = UBIFS_COMPR_NONE;
144 ui->synced_i_size = 0; 144 ui->synced_i_size = 0;
145 145
146 spin_lock(&c->cnt_lock); 146 spin_lock(&c->cnt_lock);
147 /* Inode number overflow is currently not supported */ 147 /* Inode number overflow is currently not supported */
148 if (c->highest_inum >= INUM_WARN_WATERMARK) { 148 if (c->highest_inum >= INUM_WARN_WATERMARK) {
149 if (c->highest_inum >= INUM_WATERMARK) { 149 if (c->highest_inum >= INUM_WATERMARK) {
150 spin_unlock(&c->cnt_lock); 150 spin_unlock(&c->cnt_lock);
151 ubifs_err("out of inode numbers"); 151 ubifs_err("out of inode numbers");
152 make_bad_inode(inode); 152 make_bad_inode(inode);
153 iput(inode); 153 iput(inode);
154 return ERR_PTR(-EINVAL); 154 return ERR_PTR(-EINVAL);
155 } 155 }
156 ubifs_warn("running out of inode numbers (current %lu, max %d)", 156 ubifs_warn("running out of inode numbers (current %lu, max %d)",
157 (unsigned long)c->highest_inum, INUM_WATERMARK); 157 (unsigned long)c->highest_inum, INUM_WATERMARK);
158 } 158 }
159 159
160 inode->i_ino = ++c->highest_inum; 160 inode->i_ino = ++c->highest_inum;
161 /* 161 /*
162 * The creation sequence number remains with this inode for its 162 * The creation sequence number remains with this inode for its
163 * lifetime. All nodes for this inode have a greater sequence number, 163 * lifetime. All nodes for this inode have a greater sequence number,
164 * and so it is possible to distinguish obsolete nodes belonging to a 164 * and so it is possible to distinguish obsolete nodes belonging to a
165 * previous incarnation of the same inode number - for example, for the 165 * previous incarnation of the same inode number - for example, for the
166 * purpose of rebuilding the index. 166 * purpose of rebuilding the index.
167 */ 167 */
168 ui->creat_sqnum = ++c->max_sqnum; 168 ui->creat_sqnum = ++c->max_sqnum;
169 spin_unlock(&c->cnt_lock); 169 spin_unlock(&c->cnt_lock);
170 return inode; 170 return inode;
171 } 171 }
172 172
173 static int dbg_check_name(const struct ubifs_info *c, 173 static int dbg_check_name(const struct ubifs_info *c,
174 const struct ubifs_dent_node *dent, 174 const struct ubifs_dent_node *dent,
175 const struct qstr *nm) 175 const struct qstr *nm)
176 { 176 {
177 if (!dbg_is_chk_gen(c)) 177 if (!dbg_is_chk_gen(c))
178 return 0; 178 return 0;
179 if (le16_to_cpu(dent->nlen) != nm->len) 179 if (le16_to_cpu(dent->nlen) != nm->len)
180 return -EINVAL; 180 return -EINVAL;
181 if (memcmp(dent->name, nm->name, nm->len)) 181 if (memcmp(dent->name, nm->name, nm->len))
182 return -EINVAL; 182 return -EINVAL;
183 return 0; 183 return 0;
184 } 184 }
185 185
186 static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, 186 static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
187 unsigned int flags) 187 unsigned int flags)
188 { 188 {
189 int err; 189 int err;
190 union ubifs_key key; 190 union ubifs_key key;
191 struct inode *inode = NULL; 191 struct inode *inode = NULL;
192 struct ubifs_dent_node *dent; 192 struct ubifs_dent_node *dent;
193 struct ubifs_info *c = dir->i_sb->s_fs_info; 193 struct ubifs_info *c = dir->i_sb->s_fs_info;
194 194
195 dbg_gen("'%.*s' in dir ino %lu", 195 dbg_gen("'%.*s' in dir ino %lu",
196 dentry->d_name.len, dentry->d_name.name, dir->i_ino); 196 dentry->d_name.len, dentry->d_name.name, dir->i_ino);
197 197
198 if (dentry->d_name.len > UBIFS_MAX_NLEN) 198 if (dentry->d_name.len > UBIFS_MAX_NLEN)
199 return ERR_PTR(-ENAMETOOLONG); 199 return ERR_PTR(-ENAMETOOLONG);
200 200
201 dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); 201 dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
202 if (!dent) 202 if (!dent)
203 return ERR_PTR(-ENOMEM); 203 return ERR_PTR(-ENOMEM);
204 204
205 dent_key_init(c, &key, dir->i_ino, &dentry->d_name); 205 dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
206 206
207 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); 207 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
208 if (err) { 208 if (err) {
209 if (err == -ENOENT) { 209 if (err == -ENOENT) {
210 dbg_gen("not found"); 210 dbg_gen("not found");
211 goto done; 211 goto done;
212 } 212 }
213 goto out; 213 goto out;
214 } 214 }
215 215
216 if (dbg_check_name(c, dent, &dentry->d_name)) { 216 if (dbg_check_name(c, dent, &dentry->d_name)) {
217 err = -EINVAL; 217 err = -EINVAL;
218 goto out; 218 goto out;
219 } 219 }
220 220
221 inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); 221 inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
222 if (IS_ERR(inode)) { 222 if (IS_ERR(inode)) {
223 /* 223 /*
224 * This should not happen. Probably the file-system needs 224 * This should not happen. Probably the file-system needs
225 * checking. 225 * checking.
226 */ 226 */
227 err = PTR_ERR(inode); 227 err = PTR_ERR(inode);
228 ubifs_err("dead directory entry '%.*s', error %d", 228 ubifs_err("dead directory entry '%.*s', error %d",
229 dentry->d_name.len, dentry->d_name.name, err); 229 dentry->d_name.len, dentry->d_name.name, err);
230 ubifs_ro_mode(c, err); 230 ubifs_ro_mode(c, err);
231 goto out; 231 goto out;
232 } 232 }
233 233
234 done: 234 done:
235 kfree(dent); 235 kfree(dent);
236 /* 236 /*
237 * Note, d_splice_alias() would be required instead if we supported 237 * Note, d_splice_alias() would be required instead if we supported
238 * NFS. 238 * NFS.
239 */ 239 */
240 d_add(dentry, inode); 240 d_add(dentry, inode);
241 return NULL; 241 return NULL;
242 242
243 out: 243 out:
244 kfree(dent); 244 kfree(dent);
245 return ERR_PTR(err); 245 return ERR_PTR(err);
246 } 246 }
247 247
248 static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 248 static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
249 bool excl) 249 bool excl)
250 { 250 {
251 struct inode *inode; 251 struct inode *inode;
252 struct ubifs_info *c = dir->i_sb->s_fs_info; 252 struct ubifs_info *c = dir->i_sb->s_fs_info;
253 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 253 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
254 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 254 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
255 .dirtied_ino = 1 }; 255 .dirtied_ino = 1 };
256 struct ubifs_inode *dir_ui = ubifs_inode(dir); 256 struct ubifs_inode *dir_ui = ubifs_inode(dir);
257 257
258 /* 258 /*
259 * Budget request settings: new inode, new direntry, changing the 259 * Budget request settings: new inode, new direntry, changing the
260 * parent directory inode. 260 * parent directory inode.
261 */ 261 */
262 262
263 dbg_gen("dent '%.*s', mode %#hx in dir ino %lu", 263 dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
264 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); 264 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
265 265
266 err = ubifs_budget_space(c, &req); 266 err = ubifs_budget_space(c, &req);
267 if (err) 267 if (err)
268 return err; 268 return err;
269 269
270 inode = ubifs_new_inode(c, dir, mode); 270 inode = ubifs_new_inode(c, dir, mode);
271 if (IS_ERR(inode)) { 271 if (IS_ERR(inode)) {
272 err = PTR_ERR(inode); 272 err = PTR_ERR(inode);
273 goto out_budg; 273 goto out_budg;
274 } 274 }
275 275
276 mutex_lock(&dir_ui->ui_mutex); 276 mutex_lock(&dir_ui->ui_mutex);
277 dir->i_size += sz_change; 277 dir->i_size += sz_change;
278 dir_ui->ui_size = dir->i_size; 278 dir_ui->ui_size = dir->i_size;
279 dir->i_mtime = dir->i_ctime = inode->i_ctime; 279 dir->i_mtime = dir->i_ctime = inode->i_ctime;
280 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 280 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
281 if (err) 281 if (err)
282 goto out_cancel; 282 goto out_cancel;
283 mutex_unlock(&dir_ui->ui_mutex); 283 mutex_unlock(&dir_ui->ui_mutex);
284 284
285 ubifs_release_budget(c, &req); 285 ubifs_release_budget(c, &req);
286 insert_inode_hash(inode); 286 insert_inode_hash(inode);
287 d_instantiate(dentry, inode); 287 d_instantiate(dentry, inode);
288 return 0; 288 return 0;
289 289
290 out_cancel: 290 out_cancel:
291 dir->i_size -= sz_change; 291 dir->i_size -= sz_change;
292 dir_ui->ui_size = dir->i_size; 292 dir_ui->ui_size = dir->i_size;
293 mutex_unlock(&dir_ui->ui_mutex); 293 mutex_unlock(&dir_ui->ui_mutex);
294 make_bad_inode(inode); 294 make_bad_inode(inode);
295 iput(inode); 295 iput(inode);
296 out_budg: 296 out_budg:
297 ubifs_release_budget(c, &req); 297 ubifs_release_budget(c, &req);
298 ubifs_err("cannot create regular file, error %d", err); 298 ubifs_err("cannot create regular file, error %d", err);
299 return err; 299 return err;
300 } 300 }
301 301
302 /** 302 /**
303 * vfs_dent_type - get VFS directory entry type. 303 * vfs_dent_type - get VFS directory entry type.
304 * @type: UBIFS directory entry type 304 * @type: UBIFS directory entry type
305 * 305 *
306 * This function converts UBIFS directory entry type into VFS directory entry 306 * This function converts UBIFS directory entry type into VFS directory entry
307 * type. 307 * type.
308 */ 308 */
309 static unsigned int vfs_dent_type(uint8_t type) 309 static unsigned int vfs_dent_type(uint8_t type)
310 { 310 {
311 switch (type) { 311 switch (type) {
312 case UBIFS_ITYPE_REG: 312 case UBIFS_ITYPE_REG:
313 return DT_REG; 313 return DT_REG;
314 case UBIFS_ITYPE_DIR: 314 case UBIFS_ITYPE_DIR:
315 return DT_DIR; 315 return DT_DIR;
316 case UBIFS_ITYPE_LNK: 316 case UBIFS_ITYPE_LNK:
317 return DT_LNK; 317 return DT_LNK;
318 case UBIFS_ITYPE_BLK: 318 case UBIFS_ITYPE_BLK:
319 return DT_BLK; 319 return DT_BLK;
320 case UBIFS_ITYPE_CHR: 320 case UBIFS_ITYPE_CHR:
321 return DT_CHR; 321 return DT_CHR;
322 case UBIFS_ITYPE_FIFO: 322 case UBIFS_ITYPE_FIFO:
323 return DT_FIFO; 323 return DT_FIFO;
324 case UBIFS_ITYPE_SOCK: 324 case UBIFS_ITYPE_SOCK:
325 return DT_SOCK; 325 return DT_SOCK;
326 default: 326 default:
327 BUG(); 327 BUG();
328 } 328 }
329 return 0; 329 return 0;
330 } 330 }
331 331
332 /* 332 /*
333 * The classical Unix view of a directory is that it is a linear array of 333 * The classical Unix view of a directory is that it is a linear array of
334 * (name, inode number) entries. Linux/VFS assumes this model as well. 334 * (name, inode number) entries. Linux/VFS assumes this model as well.
335 * Particularly, the 'readdir()' call wants us to return a directory entry offset 335 * Particularly, the 'readdir()' call wants us to return a directory entry offset
336 * which later may be used to continue 'readdir()'ing the directory or to 336 * which later may be used to continue 'readdir()'ing the directory or to
337 * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this 337 * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this
338 * model because directory entries are identified by keys, which may collide. 338 * model because directory entries are identified by keys, which may collide.
339 * 339 *
340 * UBIFS uses directory entry hash value for directory offsets, so 340 * UBIFS uses directory entry hash value for directory offsets, so
341 * 'seekdir()'/'telldir()' may not always work because of possible key 341 * 'seekdir()'/'telldir()' may not always work because of possible key
342 * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work 342 * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
343 * properly by means of saving the full directory entry name in the private field 343 * properly by means of saving the full directory entry name in the private field
344 * of the file description object. 344 * of the file description object.
345 * 345 *
346 * This means that UBIFS cannot support NFS which requires full 346 * This means that UBIFS cannot support NFS which requires full
347 * 'seekdir()'/'telldir()' support. 347 * 'seekdir()'/'telldir()' support.
348 */ 348 */
349 static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) 349 static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
350 { 350 {
351 int err, over = 0; 351 int err, over = 0;
352 struct qstr nm; 352 struct qstr nm;
353 union ubifs_key key; 353 union ubifs_key key;
354 struct ubifs_dent_node *dent; 354 struct ubifs_dent_node *dent;
355 struct inode *dir = file->f_path.dentry->d_inode; 355 struct inode *dir = file->f_path.dentry->d_inode;
356 struct ubifs_info *c = dir->i_sb->s_fs_info; 356 struct ubifs_info *c = dir->i_sb->s_fs_info;
357 357
358 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); 358 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
359 359
360 if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) 360 if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
361 /* 361 /*
362 * The directory was seek'ed to a senseless position or there 362 * The directory was seek'ed to a senseless position or there
363 * are no more entries. 363 * are no more entries.
364 */ 364 */
365 return 0; 365 return 0;
366 366
367 /* File positions 0 and 1 correspond to "." and ".." */ 367 /* File positions 0 and 1 correspond to "." and ".." */
368 if (file->f_pos == 0) { 368 if (file->f_pos == 0) {
369 ubifs_assert(!file->private_data); 369 ubifs_assert(!file->private_data);
370 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); 370 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
371 if (over) 371 if (over)
372 return 0; 372 return 0;
373 file->f_pos = 1; 373 file->f_pos = 1;
374 } 374 }
375 375
376 if (file->f_pos == 1) { 376 if (file->f_pos == 1) {
377 ubifs_assert(!file->private_data); 377 ubifs_assert(!file->private_data);
378 over = filldir(dirent, "..", 2, 1, 378 over = filldir(dirent, "..", 2, 1,
379 parent_ino(file->f_path.dentry), DT_DIR); 379 parent_ino(file->f_path.dentry), DT_DIR);
380 if (over) 380 if (over)
381 return 0; 381 return 0;
382 382
383 /* Find the first entry in TNC and save it */ 383 /* Find the first entry in TNC and save it */
384 lowest_dent_key(c, &key, dir->i_ino); 384 lowest_dent_key(c, &key, dir->i_ino);
385 nm.name = NULL; 385 nm.name = NULL;
386 dent = ubifs_tnc_next_ent(c, &key, &nm); 386 dent = ubifs_tnc_next_ent(c, &key, &nm);
387 if (IS_ERR(dent)) { 387 if (IS_ERR(dent)) {
388 err = PTR_ERR(dent); 388 err = PTR_ERR(dent);
389 goto out; 389 goto out;
390 } 390 }
391 391
392 file->f_pos = key_hash_flash(c, &dent->key); 392 file->f_pos = key_hash_flash(c, &dent->key);
393 file->private_data = dent; 393 file->private_data = dent;
394 } 394 }
395 395
396 dent = file->private_data; 396 dent = file->private_data;
397 if (!dent) { 397 if (!dent) {
398 /* 398 /*
399 * The directory was seek'ed to and is now readdir'ed. 399 * The directory was seek'ed to and is now readdir'ed.
400 * Find the entry corresponding to @file->f_pos or the 400 * Find the entry corresponding to @file->f_pos or the
401 * closest one. 401 * closest one.
402 */ 402 */
403 dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); 403 dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
404 nm.name = NULL; 404 nm.name = NULL;
405 dent = ubifs_tnc_next_ent(c, &key, &nm); 405 dent = ubifs_tnc_next_ent(c, &key, &nm);
406 if (IS_ERR(dent)) { 406 if (IS_ERR(dent)) {
407 err = PTR_ERR(dent); 407 err = PTR_ERR(dent);
408 goto out; 408 goto out;
409 } 409 }
410 file->f_pos = key_hash_flash(c, &dent->key); 410 file->f_pos = key_hash_flash(c, &dent->key);
411 file->private_data = dent; 411 file->private_data = dent;
412 } 412 }
413 413
414 while (1) { 414 while (1) {
415 dbg_gen("feed '%s', ino %llu, new f_pos %#x", 415 dbg_gen("feed '%s', ino %llu, new f_pos %#x",
416 dent->name, (unsigned long long)le64_to_cpu(dent->inum), 416 dent->name, (unsigned long long)le64_to_cpu(dent->inum),
417 key_hash_flash(c, &dent->key)); 417 key_hash_flash(c, &dent->key));
418 ubifs_assert(le64_to_cpu(dent->ch.sqnum) > 418 ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
419 ubifs_inode(dir)->creat_sqnum); 419 ubifs_inode(dir)->creat_sqnum);
420 420
421 nm.len = le16_to_cpu(dent->nlen); 421 nm.len = le16_to_cpu(dent->nlen);
422 over = filldir(dirent, dent->name, nm.len, file->f_pos, 422 over = filldir(dirent, dent->name, nm.len, file->f_pos,
423 le64_to_cpu(dent->inum), 423 le64_to_cpu(dent->inum),
424 vfs_dent_type(dent->type)); 424 vfs_dent_type(dent->type));
425 if (over) 425 if (over)
426 return 0; 426 return 0;
427 427
428 /* Switch to the next entry */ 428 /* Switch to the next entry */
429 key_read(c, &dent->key, &key); 429 key_read(c, &dent->key, &key);
430 nm.name = dent->name; 430 nm.name = dent->name;
431 dent = ubifs_tnc_next_ent(c, &key, &nm); 431 dent = ubifs_tnc_next_ent(c, &key, &nm);
432 if (IS_ERR(dent)) { 432 if (IS_ERR(dent)) {
433 err = PTR_ERR(dent); 433 err = PTR_ERR(dent);
434 goto out; 434 goto out;
435 } 435 }
436 436
437 kfree(file->private_data); 437 kfree(file->private_data);
438 file->f_pos = key_hash_flash(c, &dent->key); 438 file->f_pos = key_hash_flash(c, &dent->key);
439 file->private_data = dent; 439 file->private_data = dent;
440 cond_resched(); 440 cond_resched();
441 } 441 }
442 442
443 out: 443 out:
444 if (err != -ENOENT) { 444 if (err != -ENOENT) {
445 ubifs_err("cannot find next direntry, error %d", err); 445 ubifs_err("cannot find next direntry, error %d", err);
446 return err; 446 return err;
447 } 447 }
448 448
449 kfree(file->private_data); 449 kfree(file->private_data);
450 file->private_data = NULL; 450 file->private_data = NULL;
451 file->f_pos = 2; 451 file->f_pos = 2;
452 return 0; 452 return 0;
453 } 453 }
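
A minimal userspace sketch of the scheme the comment above describes — entries kept in (hash, name) order, a deliberately weak hash that collides, and a cursor that optionally carries the last returned full name. All names and the toy hash are hypothetical illustrations, not UBIFS code; the point is that hash + name resumes iteration exactly, while a bare hash (all that 'seekdir()' can hand us) is ambiguous:

	#include <stdio.h>
	#include <string.h>

	/* Toy direntry: a 1-bit "hash" guarantees collisions. */
	struct toy_dent { unsigned hash; const char *name; };

	static unsigned toy_hash(const char *s) { return strlen(s) & 1; }

	/* Entries kept sorted by (hash, name), as UBIFS keys dents by hash. */
	static struct toy_dent dir[] = {
		{ 0, "bb" }, { 0, "dddd" }, { 1, "a" }, { 1, "ccc" },
	};
	#define NDENTS (sizeof(dir) / sizeof(dir[0]))

	/*
	 * Resume iteration from a saved cursor. With only the hash
	 * (name == NULL) we can merely find the first entry whose hash is
	 * not smaller, which is ambiguous under collisions; with the full
	 * saved name the strictly-next entry is found exactly. This mirrors
	 * keeping the last dent in file->private_data.
	 */
	static struct toy_dent *toy_next(unsigned hash, const char *name)
	{
		for (unsigned i = 0; i < NDENTS; i++) {
			if (dir[i].hash < hash)
				continue;
			if (dir[i].hash > hash)
				return &dir[i];
			if (!name || strcmp(dir[i].name, name) > 0)
				return &dir[i];
		}
		return NULL;
	}

	int main(void)
	{
		/* Exact resume after "bb": hash plus name disambiguates. */
		struct toy_dent *d = toy_next(toy_hash("bb"), "bb");
		printf("after bb: %s\n", d->name);   /* dddd */

		/* Hash-only resume replays the colliding entry. */
		d = toy_next(toy_hash("bb"), NULL);
		printf("hash-only: %s\n", d->name);  /* bb again */
		return 0;
	}
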
454 454
455 /* If a directory is seek'ed, we have to free the saved readdir() state */ 455 /* If a directory is seek'ed, we have to free the saved readdir() state */
456 static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) 456 static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
457 { 457 {
458 kfree(file->private_data); 458 kfree(file->private_data);
459 file->private_data = NULL; 459 file->private_data = NULL;
460 return generic_file_llseek(file, offset, origin); 460 return generic_file_llseek(file, offset, whence);
461 } 461 }
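
For reference, this is the naming the patch converges on: in POSIX and in the lseek(2) man page the third argument has always been called "whence", and its three standard values change how the offset is interpreted. A small userspace sketch (the file path is just an example):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/etc/hostname", O_RDONLY);
		if (fd < 0)
			return 1;

		off_t end = lseek(fd, 0, SEEK_END);   /* offset from end of file */
		off_t cur = lseek(fd, -1, SEEK_CUR);  /* relative to current position */
		off_t set = lseek(fd, 0, SEEK_SET);   /* absolute offset */

		printf("end=%lld cur=%lld set=%lld\n",
		       (long long)end, (long long)cur, (long long)set);
		close(fd);
		return 0;
	}
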
462 462
463 /* Free saved readdir() state when the directory is closed */ 463 /* Free saved readdir() state when the directory is closed */
464 static int ubifs_dir_release(struct inode *dir, struct file *file) 464 static int ubifs_dir_release(struct inode *dir, struct file *file)
465 { 465 {
466 kfree(file->private_data); 466 kfree(file->private_data);
467 file->private_data = NULL; 467 file->private_data = NULL;
468 return 0; 468 return 0;
469 } 469 }
470 470
471 /** 471 /**
472 * lock_2_inodes - a wrapper for locking two UBIFS inodes. 472 * lock_2_inodes - a wrapper for locking two UBIFS inodes.
473 * @inode1: first inode 473 * @inode1: first inode
474 * @inode2: second inode 474 * @inode2: second inode
475 * 475 *
476 * We do not implement any tricks to guarantee strict lock ordering, because 476 * We do not implement any tricks to guarantee strict lock ordering, because
477 * VFS has already done it for us on the @i_mutex. So this is just a simple 477 * VFS has already done it for us on the @i_mutex. So this is just a simple
478 * wrapper function. 478 * wrapper function.
479 */ 479 */
480 static void lock_2_inodes(struct inode *inode1, struct inode *inode2) 480 static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
481 { 481 {
482 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); 482 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
483 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); 483 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
484 } 484 }
485 485
486 /** 486 /**
487 * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. 487 * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
488 * @inode1: first inode 488 * @inode1: first inode
489 * @inode2: second inode 489 * @inode2: second inode
490 */ 490 */
491 static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) 491 static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
492 { 492 {
493 mutex_unlock(&ubifs_inode(inode2)->ui_mutex); 493 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
494 mutex_unlock(&ubifs_inode(inode1)->ui_mutex); 494 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
495 } 495 }
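
The two wrappers above encode one discipline: take both locks in the order the caller hands them over, release in the reverse order, and use distinct lockdep subclasses (WB_MUTEX_1/WB_MUTEX_2) only to tell lockdep that holding two locks of the same class is intentional. A userspace analogy with pthreads — not kernel code, and the struct is hypothetical — compiled with -pthread:

	#include <pthread.h>
	#include <stdio.h>

	struct obj { pthread_mutex_t lock; int val; };

	/* Caller (like the VFS via i_mutex) is trusted to pass a safe order. */
	static void lock_2_objs(struct obj *a, struct obj *b)
	{
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	}

	static void unlock_2_objs(struct obj *a, struct obj *b)
	{
		pthread_mutex_unlock(&b->lock);  /* reverse order of acquisition */
		pthread_mutex_unlock(&a->lock);
	}

	int main(void)
	{
		struct obj dir   = { PTHREAD_MUTEX_INITIALIZER, 0 };
		struct obj inode = { PTHREAD_MUTEX_INITIALIZER, 0 };

		lock_2_objs(&dir, &inode);
		dir.val++;        /* both objects mutated under both locks */
		inode.val++;
		unlock_2_objs(&dir, &inode);

		printf("%d %d\n", dir.val, inode.val);
		return 0;
	}
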
496 496
497 static int ubifs_link(struct dentry *old_dentry, struct inode *dir, 497 static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
498 struct dentry *dentry) 498 struct dentry *dentry)
499 { 499 {
500 struct ubifs_info *c = dir->i_sb->s_fs_info; 500 struct ubifs_info *c = dir->i_sb->s_fs_info;
501 struct inode *inode = old_dentry->d_inode; 501 struct inode *inode = old_dentry->d_inode;
502 struct ubifs_inode *ui = ubifs_inode(inode); 502 struct ubifs_inode *ui = ubifs_inode(inode);
503 struct ubifs_inode *dir_ui = ubifs_inode(dir); 503 struct ubifs_inode *dir_ui = ubifs_inode(dir);
504 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 504 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
505 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, 505 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
506 .dirtied_ino_d = ALIGN(ui->data_len, 8) }; 506 .dirtied_ino_d = ALIGN(ui->data_len, 8) };
507 507
508 /* 508 /*
509 * Budget request settings: new direntry, changing the target inode, 509 * Budget request settings: new direntry, changing the target inode,
510 * changing the parent inode. 510 * changing the parent inode.
511 */ 511 */
512 512
513 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", 513 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
514 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 514 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
515 inode->i_nlink, dir->i_ino); 515 inode->i_nlink, dir->i_ino);
516 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 516 ubifs_assert(mutex_is_locked(&dir->i_mutex));
517 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 517 ubifs_assert(mutex_is_locked(&inode->i_mutex));
518 518
519 err = dbg_check_synced_i_size(c, inode); 519 err = dbg_check_synced_i_size(c, inode);
520 if (err) 520 if (err)
521 return err; 521 return err;
522 522
523 err = ubifs_budget_space(c, &req); 523 err = ubifs_budget_space(c, &req);
524 if (err) 524 if (err)
525 return err; 525 return err;
526 526
527 lock_2_inodes(dir, inode); 527 lock_2_inodes(dir, inode);
528 inc_nlink(inode); 528 inc_nlink(inode);
529 ihold(inode); 529 ihold(inode);
530 inode->i_ctime = ubifs_current_time(inode); 530 inode->i_ctime = ubifs_current_time(inode);
531 dir->i_size += sz_change; 531 dir->i_size += sz_change;
532 dir_ui->ui_size = dir->i_size; 532 dir_ui->ui_size = dir->i_size;
533 dir->i_mtime = dir->i_ctime = inode->i_ctime; 533 dir->i_mtime = dir->i_ctime = inode->i_ctime;
534 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 534 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
535 if (err) 535 if (err)
536 goto out_cancel; 536 goto out_cancel;
537 unlock_2_inodes(dir, inode); 537 unlock_2_inodes(dir, inode);
538 538
539 ubifs_release_budget(c, &req); 539 ubifs_release_budget(c, &req);
540 d_instantiate(dentry, inode); 540 d_instantiate(dentry, inode);
541 return 0; 541 return 0;
542 542
543 out_cancel: 543 out_cancel:
544 dir->i_size -= sz_change; 544 dir->i_size -= sz_change;
545 dir_ui->ui_size = dir->i_size; 545 dir_ui->ui_size = dir->i_size;
546 drop_nlink(inode); 546 drop_nlink(inode);
547 unlock_2_inodes(dir, inode); 547 unlock_2_inodes(dir, inode);
548 ubifs_release_budget(c, &req); 548 ubifs_release_budget(c, &req);
549 iput(inode); 549 iput(inode);
550 return err; 550 return err;
551 } 551 }
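
The budget/cancel dance in ubifs_link() (and in the unlink/rmdir/mkdir/mknod/symlink paths below) follows one shape: reserve worst-case space up front, apply the in-memory changes, try the journal update, and on failure undo the in-memory changes before dropping the reservation. A minimal sketch of that shape under hypothetical names — none of these are the UBIFS API:

	#include <stdio.h>

	struct space   { long free, reserved; };
	struct toy_dir { long size; };

	static int budget(struct space *s, long need)
	{
		if (s->free - s->reserved < need)
			return -1;             /* stands in for -ENOSPC */
		s->reserved += need;
		return 0;
	}

	static void release(struct space *s, long need)
	{
		s->reserved -= need;
	}

	static int toy_link(struct space *s, struct toy_dir *dir, long sz, int fail)
	{
		if (budget(s, sz))             /* 1. reserve worst case first */
			return -1;

		dir->size += sz;               /* 2. apply the in-memory change */
		if (fail) {                    /* 3. journal update "failed" */
			dir->size -= sz;       /*    out_cancel: roll it back */
			release(s, sz);
			return -1;
		}
		release(s, sz);                /* 4. success: drop the reservation */
		return 0;
	}

	int main(void)
	{
		struct space   s = { 4096, 0 };
		struct toy_dir d = { 0 };

		int ok = toy_link(&s, &d, 160, 0);
		printf("ok=%d size=%ld\n", ok, d.size);      /* ok=0 size=160 */
		int bad = toy_link(&s, &d, 160, 1);
		printf("fail=%d size=%ld\n", bad, d.size);   /* fail=-1 size=160 */
		return 0;
	}

Note that ubifs_unlink() and ubifs_rmdir() deliberately bend step 1: on -ENOSPC they proceed unbudgeted, because deleting things frees space and extra space is reserved for deletions.
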
552 552
553 static int ubifs_unlink(struct inode *dir, struct dentry *dentry) 553 static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
554 { 554 {
555 struct ubifs_info *c = dir->i_sb->s_fs_info; 555 struct ubifs_info *c = dir->i_sb->s_fs_info;
556 struct inode *inode = dentry->d_inode; 556 struct inode *inode = dentry->d_inode;
557 struct ubifs_inode *dir_ui = ubifs_inode(dir); 557 struct ubifs_inode *dir_ui = ubifs_inode(dir);
558 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 558 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
559 int err, budgeted = 1; 559 int err, budgeted = 1;
560 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 560 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
561 unsigned int saved_nlink = inode->i_nlink; 561 unsigned int saved_nlink = inode->i_nlink;
562 562
563 /* 563 /*
564 * Budget request settings: deletion direntry, deletion inode (+1 for 564 * Budget request settings: deletion direntry, deletion inode (+1 for
565 * @dirtied_ino), changing the parent directory inode. If budgeting 565 * @dirtied_ino), changing the parent directory inode. If budgeting
566 * fails, go ahead anyway because we have extra space reserved for 566 * fails, go ahead anyway because we have extra space reserved for
567 * deletions. 567 * deletions.
568 */ 568 */
569 569
570 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", 570 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
571 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 571 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
572 inode->i_nlink, dir->i_ino); 572 inode->i_nlink, dir->i_ino);
573 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 573 ubifs_assert(mutex_is_locked(&dir->i_mutex));
574 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 574 ubifs_assert(mutex_is_locked(&inode->i_mutex));
575 err = dbg_check_synced_i_size(c, inode); 575 err = dbg_check_synced_i_size(c, inode);
576 if (err) 576 if (err)
577 return err; 577 return err;
578 578
579 err = ubifs_budget_space(c, &req); 579 err = ubifs_budget_space(c, &req);
580 if (err) { 580 if (err) {
581 if (err != -ENOSPC) 581 if (err != -ENOSPC)
582 return err; 582 return err;
583 budgeted = 0; 583 budgeted = 0;
584 } 584 }
585 585
586 lock_2_inodes(dir, inode); 586 lock_2_inodes(dir, inode);
587 inode->i_ctime = ubifs_current_time(dir); 587 inode->i_ctime = ubifs_current_time(dir);
588 drop_nlink(inode); 588 drop_nlink(inode);
589 dir->i_size -= sz_change; 589 dir->i_size -= sz_change;
590 dir_ui->ui_size = dir->i_size; 590 dir_ui->ui_size = dir->i_size;
591 dir->i_mtime = dir->i_ctime = inode->i_ctime; 591 dir->i_mtime = dir->i_ctime = inode->i_ctime;
592 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); 592 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
593 if (err) 593 if (err)
594 goto out_cancel; 594 goto out_cancel;
595 unlock_2_inodes(dir, inode); 595 unlock_2_inodes(dir, inode);
596 596
597 if (budgeted) 597 if (budgeted)
598 ubifs_release_budget(c, &req); 598 ubifs_release_budget(c, &req);
599 else { 599 else {
600 /* We've deleted something - clean the "no space" flags */ 600 /* We've deleted something - clean the "no space" flags */
601 c->bi.nospace = c->bi.nospace_rp = 0; 601 c->bi.nospace = c->bi.nospace_rp = 0;
602 smp_wmb(); 602 smp_wmb();
603 } 603 }
604 return 0; 604 return 0;
605 605
606 out_cancel: 606 out_cancel:
607 dir->i_size += sz_change; 607 dir->i_size += sz_change;
608 dir_ui->ui_size = dir->i_size; 608 dir_ui->ui_size = dir->i_size;
609 set_nlink(inode, saved_nlink); 609 set_nlink(inode, saved_nlink);
610 unlock_2_inodes(dir, inode); 610 unlock_2_inodes(dir, inode);
611 if (budgeted) 611 if (budgeted)
612 ubifs_release_budget(c, &req); 612 ubifs_release_budget(c, &req);
613 return err; 613 return err;
614 } 614 }
615 615
616 /** 616 /**
617 * check_dir_empty - check if a directory is empty or not. 617 * check_dir_empty - check if a directory is empty or not.
618 * @c: UBIFS file-system description object 618 * @c: UBIFS file-system description object
619 * @dir: VFS inode object of the directory to check 619 * @dir: VFS inode object of the directory to check
620 * 620 *
621 * This function checks if directory @dir is empty. Returns zero if the 621 * This function checks if directory @dir is empty. Returns zero if the
622 * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes 622 * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
623 * in case of errors. 623 * in case of errors.
624 */ 624 */
625 static int check_dir_empty(struct ubifs_info *c, struct inode *dir) 625 static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
626 { 626 {
627 struct qstr nm = { .name = NULL }; 627 struct qstr nm = { .name = NULL };
628 struct ubifs_dent_node *dent; 628 struct ubifs_dent_node *dent;
629 union ubifs_key key; 629 union ubifs_key key;
630 int err; 630 int err;
631 631
632 lowest_dent_key(c, &key, dir->i_ino); 632 lowest_dent_key(c, &key, dir->i_ino);
633 dent = ubifs_tnc_next_ent(c, &key, &nm); 633 dent = ubifs_tnc_next_ent(c, &key, &nm);
634 if (IS_ERR(dent)) { 634 if (IS_ERR(dent)) {
635 err = PTR_ERR(dent); 635 err = PTR_ERR(dent);
636 if (err == -ENOENT) 636 if (err == -ENOENT)
637 err = 0; 637 err = 0;
638 } else { 638 } else {
639 kfree(dent); 639 kfree(dent);
640 err = -ENOTEMPTY; 640 err = -ENOTEMPTY;
641 } 641 }
642 return err; 642 return err;
643 } 643 }
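
check_dir_empty() probes for the lowest possible entry key and maps the outcome: "no entry" means empty, "found one" means not empty, anything else is a real error. A toy version of that error mapping over an array — lookup_first() is a hypothetical stand-in for ubifs_tnc_next_ent():

	#include <errno.h>
	#include <stdio.h>

	static const char *lookup_first(const char **entries, int n, int *err)
	{
		if (n < 0) {             /* simulate an I/O failure */
			*err = -EIO;
			return NULL;
		}
		if (n == 0) {
			*err = -ENOENT;  /* no entries at all */
			return NULL;
		}
		*err = 0;
		return entries[0];
	}

	static int dir_is_empty(const char **entries, int n)
	{
		int err;
		const char *first = lookup_first(entries, n, &err);

		if (!first)
			return err == -ENOENT ? 0 : err;  /* empty vs. real error */
		return -ENOTEMPTY;
	}

	int main(void)
	{
		const char *e[] = { "a" };

		printf("%d %d %d\n",
		       dir_is_empty(e, 1),    /* -ENOTEMPTY */
		       dir_is_empty(e, 0),    /*  0: empty  */
		       dir_is_empty(e, -1));  /* -EIO       */
		return 0;
	}
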
644 644
645 static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) 645 static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
646 { 646 {
647 struct ubifs_info *c = dir->i_sb->s_fs_info; 647 struct ubifs_info *c = dir->i_sb->s_fs_info;
648 struct inode *inode = dentry->d_inode; 648 struct inode *inode = dentry->d_inode;
649 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 649 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
650 int err, budgeted = 1; 650 int err, budgeted = 1;
651 struct ubifs_inode *dir_ui = ubifs_inode(dir); 651 struct ubifs_inode *dir_ui = ubifs_inode(dir);
652 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 652 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
653 653
654 /* 654 /*
655 * Budget request settings: deletion direntry, deletion inode and 655 * Budget request settings: deletion direntry, deletion inode and
656 * changing the parent inode. If budgeting fails, go ahead anyway 656 * changing the parent inode. If budgeting fails, go ahead anyway
657 * because we have extra space reserved for deletions. 657 * because we have extra space reserved for deletions.
658 */ 658 */
659 659
660 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, 660 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
661 dentry->d_name.name, inode->i_ino, dir->i_ino); 661 dentry->d_name.name, inode->i_ino, dir->i_ino);
662 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 662 ubifs_assert(mutex_is_locked(&dir->i_mutex));
663 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 663 ubifs_assert(mutex_is_locked(&inode->i_mutex));
664 err = check_dir_empty(c, dentry->d_inode); 664 err = check_dir_empty(c, dentry->d_inode);
665 if (err) 665 if (err)
666 return err; 666 return err;
667 667
668 err = ubifs_budget_space(c, &req); 668 err = ubifs_budget_space(c, &req);
669 if (err) { 669 if (err) {
670 if (err != -ENOSPC) 670 if (err != -ENOSPC)
671 return err; 671 return err;
672 budgeted = 0; 672 budgeted = 0;
673 } 673 }
674 674
675 lock_2_inodes(dir, inode); 675 lock_2_inodes(dir, inode);
676 inode->i_ctime = ubifs_current_time(dir); 676 inode->i_ctime = ubifs_current_time(dir);
677 clear_nlink(inode); 677 clear_nlink(inode);
678 drop_nlink(dir); 678 drop_nlink(dir);
679 dir->i_size -= sz_change; 679 dir->i_size -= sz_change;
680 dir_ui->ui_size = dir->i_size; 680 dir_ui->ui_size = dir->i_size;
681 dir->i_mtime = dir->i_ctime = inode->i_ctime; 681 dir->i_mtime = dir->i_ctime = inode->i_ctime;
682 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); 682 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
683 if (err) 683 if (err)
684 goto out_cancel; 684 goto out_cancel;
685 unlock_2_inodes(dir, inode); 685 unlock_2_inodes(dir, inode);
686 686
687 if (budgeted) 687 if (budgeted)
688 ubifs_release_budget(c, &req); 688 ubifs_release_budget(c, &req);
689 else { 689 else {
690 /* We've deleted something - clean the "no space" flags */ 690 /* We've deleted something - clean the "no space" flags */
691 c->bi.nospace = c->bi.nospace_rp = 0; 691 c->bi.nospace = c->bi.nospace_rp = 0;
692 smp_wmb(); 692 smp_wmb();
693 } 693 }
694 return 0; 694 return 0;
695 695
696 out_cancel: 696 out_cancel:
697 dir->i_size += sz_change; 697 dir->i_size += sz_change;
698 dir_ui->ui_size = dir->i_size; 698 dir_ui->ui_size = dir->i_size;
699 inc_nlink(dir); 699 inc_nlink(dir);
700 set_nlink(inode, 2); 700 set_nlink(inode, 2);
701 unlock_2_inodes(dir, inode); 701 unlock_2_inodes(dir, inode);
702 if (budgeted) 702 if (budgeted)
703 ubifs_release_budget(c, &req); 703 ubifs_release_budget(c, &req);
704 return err; 704 return err;
705 } 705 }
706 706
707 static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 707 static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
708 { 708 {
709 struct inode *inode; 709 struct inode *inode;
710 struct ubifs_inode *dir_ui = ubifs_inode(dir); 710 struct ubifs_inode *dir_ui = ubifs_inode(dir);
711 struct ubifs_info *c = dir->i_sb->s_fs_info; 711 struct ubifs_info *c = dir->i_sb->s_fs_info;
712 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 712 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
713 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; 713 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
714 714
715 /* 715 /*
716 * Budget request settings: new inode, new direntry and changing parent 716 * Budget request settings: new inode, new direntry and changing parent
717 * directory inode. 717 * directory inode.
718 */ 718 */
719 719
720 dbg_gen("dent '%.*s', mode %#hx in dir ino %lu", 720 dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
721 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); 721 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
722 722
723 err = ubifs_budget_space(c, &req); 723 err = ubifs_budget_space(c, &req);
724 if (err) 724 if (err)
725 return err; 725 return err;
726 726
727 inode = ubifs_new_inode(c, dir, S_IFDIR | mode); 727 inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
728 if (IS_ERR(inode)) { 728 if (IS_ERR(inode)) {
729 err = PTR_ERR(inode); 729 err = PTR_ERR(inode);
730 goto out_budg; 730 goto out_budg;
731 } 731 }
732 732
733 mutex_lock(&dir_ui->ui_mutex); 733 mutex_lock(&dir_ui->ui_mutex);
734 insert_inode_hash(inode); 734 insert_inode_hash(inode);
735 inc_nlink(inode); 735 inc_nlink(inode);
736 inc_nlink(dir); 736 inc_nlink(dir);
737 dir->i_size += sz_change; 737 dir->i_size += sz_change;
738 dir_ui->ui_size = dir->i_size; 738 dir_ui->ui_size = dir->i_size;
739 dir->i_mtime = dir->i_ctime = inode->i_ctime; 739 dir->i_mtime = dir->i_ctime = inode->i_ctime;
740 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 740 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
741 if (err) { 741 if (err) {
742 ubifs_err("cannot create directory, error %d", err); 742 ubifs_err("cannot create directory, error %d", err);
743 goto out_cancel; 743 goto out_cancel;
744 } 744 }
745 mutex_unlock(&dir_ui->ui_mutex); 745 mutex_unlock(&dir_ui->ui_mutex);
746 746
747 ubifs_release_budget(c, &req); 747 ubifs_release_budget(c, &req);
748 d_instantiate(dentry, inode); 748 d_instantiate(dentry, inode);
749 return 0; 749 return 0;
750 750
751 out_cancel: 751 out_cancel:
752 dir->i_size -= sz_change; 752 dir->i_size -= sz_change;
753 dir_ui->ui_size = dir->i_size; 753 dir_ui->ui_size = dir->i_size;
754 drop_nlink(dir); 754 drop_nlink(dir);
755 mutex_unlock(&dir_ui->ui_mutex); 755 mutex_unlock(&dir_ui->ui_mutex);
756 make_bad_inode(inode); 756 make_bad_inode(inode);
757 iput(inode); 757 iput(inode);
758 out_budg: 758 out_budg:
759 ubifs_release_budget(c, &req); 759 ubifs_release_budget(c, &req);
760 return err; 760 return err;
761 } 761 }
762 762
763 static int ubifs_mknod(struct inode *dir, struct dentry *dentry, 763 static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
764 umode_t mode, dev_t rdev) 764 umode_t mode, dev_t rdev)
765 { 765 {
766 struct inode *inode; 766 struct inode *inode;
767 struct ubifs_inode *ui; 767 struct ubifs_inode *ui;
768 struct ubifs_inode *dir_ui = ubifs_inode(dir); 768 struct ubifs_inode *dir_ui = ubifs_inode(dir);
769 struct ubifs_info *c = dir->i_sb->s_fs_info; 769 struct ubifs_info *c = dir->i_sb->s_fs_info;
770 union ubifs_dev_desc *dev = NULL; 770 union ubifs_dev_desc *dev = NULL;
771 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 771 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
772 int err, devlen = 0; 772 int err, devlen = 0;
773 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 773 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
774 .new_ino_d = ALIGN(devlen, 8), 774 .new_ino_d = ALIGN(devlen, 8),
775 .dirtied_ino = 1 }; 775 .dirtied_ino = 1 };
776 776
777 /* 777 /*
778 * Budget request settings: new inode, new direntry and changing parent 778 * Budget request settings: new inode, new direntry and changing parent
779 * directory inode. 779 * directory inode.
780 */ 780 */
781 781
782 dbg_gen("dent '%.*s' in dir ino %lu", 782 dbg_gen("dent '%.*s' in dir ino %lu",
783 dentry->d_name.len, dentry->d_name.name, dir->i_ino); 783 dentry->d_name.len, dentry->d_name.name, dir->i_ino);
784 784
785 if (!new_valid_dev(rdev)) 785 if (!new_valid_dev(rdev))
786 return -EINVAL; 786 return -EINVAL;
787 787
788 if (S_ISBLK(mode) || S_ISCHR(mode)) { 788 if (S_ISBLK(mode) || S_ISCHR(mode)) {
789 dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); 789 dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
790 if (!dev) 790 if (!dev)
791 return -ENOMEM; 791 return -ENOMEM;
792 devlen = ubifs_encode_dev(dev, rdev); 792 devlen = ubifs_encode_dev(dev, rdev);
793 } 793 }
794 794
795 err = ubifs_budget_space(c, &req); 795 err = ubifs_budget_space(c, &req);
796 if (err) { 796 if (err) {
797 kfree(dev); 797 kfree(dev);
798 return err; 798 return err;
799 } 799 }
800 800
801 inode = ubifs_new_inode(c, dir, mode); 801 inode = ubifs_new_inode(c, dir, mode);
802 if (IS_ERR(inode)) { 802 if (IS_ERR(inode)) {
803 kfree(dev); 803 kfree(dev);
804 err = PTR_ERR(inode); 804 err = PTR_ERR(inode);
805 goto out_budg; 805 goto out_budg;
806 } 806 }
807 807
808 init_special_inode(inode, inode->i_mode, rdev); 808 init_special_inode(inode, inode->i_mode, rdev);
809 inode->i_size = ubifs_inode(inode)->ui_size = devlen; 809 inode->i_size = ubifs_inode(inode)->ui_size = devlen;
810 ui = ubifs_inode(inode); 810 ui = ubifs_inode(inode);
811 ui->data = dev; 811 ui->data = dev;
812 ui->data_len = devlen; 812 ui->data_len = devlen;
813 813
814 mutex_lock(&dir_ui->ui_mutex); 814 mutex_lock(&dir_ui->ui_mutex);
815 dir->i_size += sz_change; 815 dir->i_size += sz_change;
816 dir_ui->ui_size = dir->i_size; 816 dir_ui->ui_size = dir->i_size;
817 dir->i_mtime = dir->i_ctime = inode->i_ctime; 817 dir->i_mtime = dir->i_ctime = inode->i_ctime;
818 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 818 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
819 if (err) 819 if (err)
820 goto out_cancel; 820 goto out_cancel;
821 mutex_unlock(&dir_ui->ui_mutex); 821 mutex_unlock(&dir_ui->ui_mutex);
822 822
823 ubifs_release_budget(c, &req); 823 ubifs_release_budget(c, &req);
824 insert_inode_hash(inode); 824 insert_inode_hash(inode);
825 d_instantiate(dentry, inode); 825 d_instantiate(dentry, inode);
826 return 0; 826 return 0;
827 827
828 out_cancel: 828 out_cancel:
829 dir->i_size -= sz_change; 829 dir->i_size -= sz_change;
830 dir_ui->ui_size = dir->i_size; 830 dir_ui->ui_size = dir->i_size;
831 mutex_unlock(&dir_ui->ui_mutex); 831 mutex_unlock(&dir_ui->ui_mutex);
832 make_bad_inode(inode); 832 make_bad_inode(inode);
833 iput(inode); 833 iput(inode);
834 out_budg: 834 out_budg:
835 ubifs_release_budget(c, &req); 835 ubifs_release_budget(c, &req);
836 return err; 836 return err;
837 } 837 }
838 838
839 static int ubifs_symlink(struct inode *dir, struct dentry *dentry, 839 static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
840 const char *symname) 840 const char *symname)
841 { 841 {
842 struct inode *inode; 842 struct inode *inode;
843 struct ubifs_inode *ui; 843 struct ubifs_inode *ui;
844 struct ubifs_inode *dir_ui = ubifs_inode(dir); 844 struct ubifs_inode *dir_ui = ubifs_inode(dir);
845 struct ubifs_info *c = dir->i_sb->s_fs_info; 845 struct ubifs_info *c = dir->i_sb->s_fs_info;
846 int err, len = strlen(symname); 846 int err, len = strlen(symname);
847 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 847 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
848 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 848 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
849 .new_ino_d = ALIGN(len, 8), 849 .new_ino_d = ALIGN(len, 8),
850 .dirtied_ino = 1 }; 850 .dirtied_ino = 1 };
851 851
852 /* 852 /*
853 * Budget request settings: new inode, new direntry and changing parent 853 * Budget request settings: new inode, new direntry and changing parent
854 * directory inode. 854 * directory inode.
855 */ 855 */
856 856
857 dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len, 857 dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
858 dentry->d_name.name, symname, dir->i_ino); 858 dentry->d_name.name, symname, dir->i_ino);
859 859
860 if (len > UBIFS_MAX_INO_DATA) 860 if (len > UBIFS_MAX_INO_DATA)
861 return -ENAMETOOLONG; 861 return -ENAMETOOLONG;
862 862
863 err = ubifs_budget_space(c, &req); 863 err = ubifs_budget_space(c, &req);
864 if (err) 864 if (err)
865 return err; 865 return err;
866 866
867 inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); 867 inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
868 if (IS_ERR(inode)) { 868 if (IS_ERR(inode)) {
869 err = PTR_ERR(inode); 869 err = PTR_ERR(inode);
870 goto out_budg; 870 goto out_budg;
871 } 871 }
872 872
873 ui = ubifs_inode(inode); 873 ui = ubifs_inode(inode);
874 ui->data = kmalloc(len + 1, GFP_NOFS); 874 ui->data = kmalloc(len + 1, GFP_NOFS);
875 if (!ui->data) { 875 if (!ui->data) {
876 err = -ENOMEM; 876 err = -ENOMEM;
877 goto out_inode; 877 goto out_inode;
878 } 878 }
879 879
880 memcpy(ui->data, symname, len); 880 memcpy(ui->data, symname, len);
881 ((char *)ui->data)[len] = '\0'; 881 ((char *)ui->data)[len] = '\0';
882 /* 882 /*
883 * The terminating zero byte is not written to the flash media and it 883 * The terminating zero byte is not written to the flash media and it
884 * is put just to make later in-memory string processing simpler. Thus, 884 * is put just to make later in-memory string processing simpler. Thus,
885 * data length is @len, not @len + %1. 885 * data length is @len, not @len + %1.
886 */ 886 */
887 ui->data_len = len; 887 ui->data_len = len;
888 inode->i_size = ubifs_inode(inode)->ui_size = len; 888 inode->i_size = ubifs_inode(inode)->ui_size = len;
889 889
890 mutex_lock(&dir_ui->ui_mutex); 890 mutex_lock(&dir_ui->ui_mutex);
891 dir->i_size += sz_change; 891 dir->i_size += sz_change;
892 dir_ui->ui_size = dir->i_size; 892 dir_ui->ui_size = dir->i_size;
893 dir->i_mtime = dir->i_ctime = inode->i_ctime; 893 dir->i_mtime = dir->i_ctime = inode->i_ctime;
894 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); 894 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
895 if (err) 895 if (err)
896 goto out_cancel; 896 goto out_cancel;
897 mutex_unlock(&dir_ui->ui_mutex); 897 mutex_unlock(&dir_ui->ui_mutex);
898 898
899 ubifs_release_budget(c, &req); 899 ubifs_release_budget(c, &req);
900 insert_inode_hash(inode); 900 insert_inode_hash(inode);
901 d_instantiate(dentry, inode); 901 d_instantiate(dentry, inode);
902 return 0; 902 return 0;
903 903
904 out_cancel: 904 out_cancel:
905 dir->i_size -= sz_change; 905 dir->i_size -= sz_change;
906 dir_ui->ui_size = dir->i_size; 906 dir_ui->ui_size = dir->i_size;
907 mutex_unlock(&dir_ui->ui_mutex); 907 mutex_unlock(&dir_ui->ui_mutex);
908 out_inode: 908 out_inode:
909 make_bad_inode(inode); 909 make_bad_inode(inode);
910 iput(inode); 910 iput(inode);
911 out_budg: 911 out_budg:
912 ubifs_release_budget(c, &req); 912 ubifs_release_budget(c, &req);
913 return err; 913 return err;
914 } 914 }
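
The symlink-target convention above — allocate len + 1 bytes and NUL-terminate so in-memory code can treat the target as a C string, but record only len as the data length because the terminator never reaches the media — can be sketched in isolation (the target string is just an example):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		const char *symname = "/some/target";   /* example target */
		size_t len = strlen(symname);

		char *data = malloc(len + 1);
		if (!data)
			return 1;
		memcpy(data, symname, len);
		data[len] = '\0';        /* convenience byte, not part of the data */

		size_t data_len = len;   /* what would be written to flash */
		printf("in-memory \"%s\", on-media %zu bytes\n", data, data_len);
		free(data);
		return 0;
	}
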
915 915
916 /** 916 /**
917 * lock_3_inodes - a wrapper for locking three UBIFS inodes. 917 * lock_3_inodes - a wrapper for locking three UBIFS inodes.
918 * @inode1: first inode 918 * @inode1: first inode
919 * @inode2: second inode 919 * @inode2: second inode
920 * @inode3: third inode 920 * @inode3: third inode
921 * 921 *
922 * This function is used for 'ubifs_rename()' and @inode1 may be the same as 922 * This function is used for 'ubifs_rename()' and @inode1 may be the same as
923 * @inode2 whereas @inode3 may be %NULL. 923 * @inode2 whereas @inode3 may be %NULL.
924 * 924 *
925 * We do not implement any tricks to guarantee strict lock ordering, because 925 * We do not implement any tricks to guarantee strict lock ordering, because
926 * VFS has already done it for us on the @i_mutex. So this is just a simple 926 * VFS has already done it for us on the @i_mutex. So this is just a simple
927 * wrapper function. 927 * wrapper function.
928 */ 928 */
929 static void lock_3_inodes(struct inode *inode1, struct inode *inode2, 929 static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
930 struct inode *inode3) 930 struct inode *inode3)
931 { 931 {
932 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); 932 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
933 if (inode2 != inode1) 933 if (inode2 != inode1)
934 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); 934 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
935 if (inode3) 935 if (inode3)
936 mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); 936 mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
937 } 937 }
938 938
939 /** 939 /**
940 * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. 940 * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
941 * @inode1: first inode 941 * @inode1: first inode
942 * @inode2: second inode 942 * @inode2: second inode
943 * @inode3: third inode 943 * @inode3: third inode
944 */ 944 */
945 static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, 945 static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
946 struct inode *inode3) 946 struct inode *inode3)
947 { 947 {
948 if (inode3) 948 if (inode3)
949 mutex_unlock(&ubifs_inode(inode3)->ui_mutex); 949 mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
950 if (inode1 != inode2) 950 if (inode1 != inode2)
951 mutex_unlock(&ubifs_inode(inode2)->ui_mutex); 951 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
952 mutex_unlock(&ubifs_inode(inode1)->ui_mutex); 952 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
953 } 953 }
954 954
955 static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, 955 static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
956 struct inode *new_dir, struct dentry *new_dentry) 956 struct inode *new_dir, struct dentry *new_dentry)
957 { 957 {
958 struct ubifs_info *c = old_dir->i_sb->s_fs_info; 958 struct ubifs_info *c = old_dir->i_sb->s_fs_info;
959 struct inode *old_inode = old_dentry->d_inode; 959 struct inode *old_inode = old_dentry->d_inode;
960 struct inode *new_inode = new_dentry->d_inode; 960 struct inode *new_inode = new_dentry->d_inode;
961 struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); 961 struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
962 int err, release, sync = 0, move = (new_dir != old_dir); 962 int err, release, sync = 0, move = (new_dir != old_dir);
963 int is_dir = S_ISDIR(old_inode->i_mode); 963 int is_dir = S_ISDIR(old_inode->i_mode);
964 int unlink = !!new_inode; 964 int unlink = !!new_inode;
965 int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); 965 int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
966 int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); 966 int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
967 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, 967 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
968 .dirtied_ino = 3 }; 968 .dirtied_ino = 3 };
969 struct ubifs_budget_req ino_req = { .dirtied_ino = 1, 969 struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
970 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 970 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
971 struct timespec time; 971 struct timespec time;
972 unsigned int uninitialized_var(saved_nlink); 972 unsigned int uninitialized_var(saved_nlink);
973 973
974 /* 974 /*
975 * Budget request settings: deletion direntry, new direntry, removing 975 * Budget request settings: deletion direntry, new direntry, removing
976 * the old inode, and changing old and new parent directory inodes. 976 * the old inode, and changing old and new parent directory inodes.
977 * 977 *
978 * However, this operation also marks the target inode as dirty and 978 * However, this operation also marks the target inode as dirty and
979 * does not write it, so we allocate budget for the target inode 979 * does not write it, so we allocate budget for the target inode
980 * separately. 980 * separately.
981 */ 981 */
982 982
983 dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu", 983 dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu",
984 old_dentry->d_name.len, old_dentry->d_name.name, 984 old_dentry->d_name.len, old_dentry->d_name.name,
985 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, 985 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
986 new_dentry->d_name.name, new_dir->i_ino); 986 new_dentry->d_name.name, new_dir->i_ino);
987 ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); 987 ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
988 ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); 988 ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
989 if (unlink) 989 if (unlink)
990 ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); 990 ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
991 991
992 992
993 if (unlink && is_dir) { 993 if (unlink && is_dir) {
994 err = check_dir_empty(c, new_inode); 994 err = check_dir_empty(c, new_inode);
995 if (err) 995 if (err)
996 return err; 996 return err;
997 } 997 }
998 998
999 err = ubifs_budget_space(c, &req); 999 err = ubifs_budget_space(c, &req);
1000 if (err) 1000 if (err)
1001 return err; 1001 return err;
1002 err = ubifs_budget_space(c, &ino_req); 1002 err = ubifs_budget_space(c, &ino_req);
1003 if (err) { 1003 if (err) {
1004 ubifs_release_budget(c, &req); 1004 ubifs_release_budget(c, &req);
1005 return err; 1005 return err;
1006 } 1006 }
1007 1007
1008 lock_3_inodes(old_dir, new_dir, new_inode); 1008 lock_3_inodes(old_dir, new_dir, new_inode);
1009 1009
1010 /* 1010 /*
1011 * Like most other Unix systems, set the @i_ctime for inodes on a 1011 * Like most other Unix systems, set the @i_ctime for inodes on a
1012 * rename. 1012 * rename.
1013 */ 1013 */
1014 time = ubifs_current_time(old_dir); 1014 time = ubifs_current_time(old_dir);
1015 old_inode->i_ctime = time; 1015 old_inode->i_ctime = time;
1016 1016
1017 /* We must adjust parent link count when renaming directories */ 1017 /* We must adjust parent link count when renaming directories */
1018 if (is_dir) { 1018 if (is_dir) {
1019 if (move) { 1019 if (move) {
1020 /* 1020 /*
1021 * @old_dir loses a link because we are moving 1021 * @old_dir loses a link because we are moving
1022 * @old_inode to a different directory. 1022 * @old_inode to a different directory.
1023 */ 1023 */
1024 drop_nlink(old_dir); 1024 drop_nlink(old_dir);
1025 /* 1025 /*
1026 * @new_dir only gains a link if we are not also 1026 * @new_dir only gains a link if we are not also
1027 * overwriting an existing directory. 1027 * overwriting an existing directory.
1028 */ 1028 */
1029 if (!unlink) 1029 if (!unlink)
1030 inc_nlink(new_dir); 1030 inc_nlink(new_dir);
1031 } else { 1031 } else {
1032 /* 1032 /*
1033 * @old_inode is not moving to a different directory, 1033 * @old_inode is not moving to a different directory,
1034 * but @old_dir still loses a link if we are 1034 * but @old_dir still loses a link if we are
1035 * overwriting an existing directory. 1035 * overwriting an existing directory.
1036 */ 1036 */
1037 if (unlink) 1037 if (unlink)
1038 drop_nlink(old_dir); 1038 drop_nlink(old_dir);
1039 } 1039 }
1040 } 1040 }
1041 1041
1042 old_dir->i_size -= old_sz; 1042 old_dir->i_size -= old_sz;
1043 ubifs_inode(old_dir)->ui_size = old_dir->i_size; 1043 ubifs_inode(old_dir)->ui_size = old_dir->i_size;
1044 old_dir->i_mtime = old_dir->i_ctime = time; 1044 old_dir->i_mtime = old_dir->i_ctime = time;
1045 new_dir->i_mtime = new_dir->i_ctime = time; 1045 new_dir->i_mtime = new_dir->i_ctime = time;
1046 1046
1047 /* 1047 /*
1048 * And finally, if we unlinked a direntry which happened to have the 1048 * And finally, if we unlinked a direntry which happened to have the
1049 * same name as the moved direntry, we have to decrement @i_nlink of 1049 * same name as the moved direntry, we have to decrement @i_nlink of
1050 * the unlinked inode and change its ctime. 1050 * the unlinked inode and change its ctime.
1051 */ 1051 */
1052 if (unlink) { 1052 if (unlink) {
1053 /* 1053 /*
1054 * Directories cannot have hard-links, so if this is a 1054 * Directories cannot have hard-links, so if this is a
1055 * directory, just clear @i_nlink. 1055 * directory, just clear @i_nlink.
1056 */ 1056 */
1057 saved_nlink = new_inode->i_nlink; 1057 saved_nlink = new_inode->i_nlink;
1058 if (is_dir) 1058 if (is_dir)
1059 clear_nlink(new_inode); 1059 clear_nlink(new_inode);
1060 else 1060 else
1061 drop_nlink(new_inode); 1061 drop_nlink(new_inode);
1062 new_inode->i_ctime = time; 1062 new_inode->i_ctime = time;
1063 } else { 1063 } else {
1064 new_dir->i_size += new_sz; 1064 new_dir->i_size += new_sz;
1065 ubifs_inode(new_dir)->ui_size = new_dir->i_size; 1065 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
1066 } 1066 }
1067 1067
1068 /* 1068 /*
1069 * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode 1069 * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode
1070 * is dirty, because this will be done later on at the end of 1070 * is dirty, because this will be done later on at the end of
1071 * 'ubifs_rename()'. 1071 * 'ubifs_rename()'.
1072 */ 1072 */
1073 if (IS_SYNC(old_inode)) { 1073 if (IS_SYNC(old_inode)) {
1074 sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); 1074 sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
1075 if (unlink && IS_SYNC(new_inode)) 1075 if (unlink && IS_SYNC(new_inode))
1076 sync = 1; 1076 sync = 1;
1077 } 1077 }
1078 err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, 1078 err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry,
1079 sync); 1079 sync);
1080 if (err) 1080 if (err)
1081 goto out_cancel; 1081 goto out_cancel;
1082 1082
1083 unlock_3_inodes(old_dir, new_dir, new_inode); 1083 unlock_3_inodes(old_dir, new_dir, new_inode);
1084 ubifs_release_budget(c, &req); 1084 ubifs_release_budget(c, &req);
1085 1085
1086 mutex_lock(&old_inode_ui->ui_mutex); 1086 mutex_lock(&old_inode_ui->ui_mutex);
1087 release = old_inode_ui->dirty; 1087 release = old_inode_ui->dirty;
1088 mark_inode_dirty_sync(old_inode); 1088 mark_inode_dirty_sync(old_inode);
1089 mutex_unlock(&old_inode_ui->ui_mutex); 1089 mutex_unlock(&old_inode_ui->ui_mutex);
1090 1090
1091 if (release) 1091 if (release)
1092 ubifs_release_budget(c, &ino_req); 1092 ubifs_release_budget(c, &ino_req);
1093 if (IS_SYNC(old_inode)) 1093 if (IS_SYNC(old_inode))
1094 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); 1094 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
1095 return err; 1095 return err;
1096 1096
1097 out_cancel: 1097 out_cancel:
1098 if (unlink) { 1098 if (unlink) {
1099 set_nlink(new_inode, saved_nlink); 1099 set_nlink(new_inode, saved_nlink);
1100 } else { 1100 } else {
1101 new_dir->i_size -= new_sz; 1101 new_dir->i_size -= new_sz;
1102 ubifs_inode(new_dir)->ui_size = new_dir->i_size; 1102 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
1103 } 1103 }
1104 old_dir->i_size += old_sz; 1104 old_dir->i_size += old_sz;
1105 ubifs_inode(old_dir)->ui_size = old_dir->i_size; 1105 ubifs_inode(old_dir)->ui_size = old_dir->i_size;
1106 if (is_dir) { 1106 if (is_dir) {
1107 if (move) { 1107 if (move) {
1108 inc_nlink(old_dir); 1108 inc_nlink(old_dir);
1109 if (!unlink) 1109 if (!unlink)
1110 drop_nlink(new_dir); 1110 drop_nlink(new_dir);
1111 } else { 1111 } else {
1112 if (unlink) 1112 if (unlink)
1113 inc_nlink(old_dir); 1113 inc_nlink(old_dir);
1114 } 1114 }
1115 } 1115 }
1116 unlock_3_inodes(old_dir, new_dir, new_inode); 1116 unlock_3_inodes(old_dir, new_dir, new_inode);
1117 ubifs_release_budget(c, &ino_req); 1117 ubifs_release_budget(c, &ino_req);
1118 ubifs_release_budget(c, &req); 1118 ubifs_release_budget(c, &req);
1119 return err; 1119 return err;
1120 } 1120 }
1121 1121
1122 int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1122 int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1123 struct kstat *stat) 1123 struct kstat *stat)
1124 { 1124 {
1125 loff_t size; 1125 loff_t size;
1126 struct inode *inode = dentry->d_inode; 1126 struct inode *inode = dentry->d_inode;
1127 struct ubifs_inode *ui = ubifs_inode(inode); 1127 struct ubifs_inode *ui = ubifs_inode(inode);
1128 1128
1129 mutex_lock(&ui->ui_mutex); 1129 mutex_lock(&ui->ui_mutex);
1130 generic_fillattr(inode, stat); 1130 generic_fillattr(inode, stat);
1131 stat->blksize = UBIFS_BLOCK_SIZE; 1131 stat->blksize = UBIFS_BLOCK_SIZE;
1132 stat->size = ui->ui_size; 1132 stat->size = ui->ui_size;
1133 1133
1134 /* 1134 /*
1135 * Unfortunately, the 'stat()' system call was designed for block 1135 * Unfortunately, the 'stat()' system call was designed for block
1136 * device based file systems, and it is not appropriate for UBIFS, 1136 * device based file systems, and it is not appropriate for UBIFS,
1137 * because UBIFS does not have a notion of a "block". For example, it is 1137 * because UBIFS does not have a notion of a "block". For example, it is
1138 * difficult to tell how many blocks a directory takes - it actually 1138 * difficult to tell how many blocks a directory takes - it actually
1139 * takes less than 300 bytes, but we have to round it up to the block size, 1139 * takes less than 300 bytes, but we have to round it up to the block size,
1140 * which introduces a large error. This makes utilities like 'du' 1140 * which introduces a large error. This makes utilities like 'du'
1141 * report completely senseless numbers. This is the reason why UBIFS 1141 * report completely senseless numbers. This is the reason why UBIFS
1142 * goes the same way as JFFS2 - it reports zero blocks for everything 1142 * goes the same way as JFFS2 - it reports zero blocks for everything
1143 * but regular files, which makes more sense than reporting completely 1143 * but regular files, which makes more sense than reporting completely
1144 * wrong sizes. 1144 * wrong sizes.
1145 */ 1145 */
1146 if (S_ISREG(inode->i_mode)) { 1146 if (S_ISREG(inode->i_mode)) {
1147 size = ui->xattr_size; 1147 size = ui->xattr_size;
1148 size += stat->size; 1148 size += stat->size;
1149 size = ALIGN(size, UBIFS_BLOCK_SIZE); 1149 size = ALIGN(size, UBIFS_BLOCK_SIZE);
1150 /* 1150 /*
1151 * Note, user-space expects the blocks count in 512-byte units, irrespective 1151 * Note, user-space expects the blocks count in 512-byte units, irrespective
1152 * of what was reported in @stat->size. 1152 * of what was reported in @stat->size.
1153 */ 1153 */
1154 stat->blocks = size >> 9; 1154 stat->blocks = size >> 9;
1155 } else 1155 } else
1156 stat->blocks = 0; 1156 stat->blocks = 0;
1157 mutex_unlock(&ui->ui_mutex); 1157 mutex_unlock(&ui->ui_mutex);
1158 return 0; 1158 return 0;
1159 } 1159 }
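
The block computation above reports st_blocks in 512-byte units regardless of the filesystem block size: round the byte count up to a whole filesystem block, then shift right by 9. A standalone sketch — UBIFS_BLOCK_SIZE is 4096 in UBIFS, but the constant here is an assumption of this example:

	#include <stdio.h>

	#define TOY_BLOCK_SIZE 4096ULL

	static unsigned long long blocks_512(unsigned long long bytes)
	{
		unsigned long long rounded =
			(bytes + TOY_BLOCK_SIZE - 1) & ~(TOY_BLOCK_SIZE - 1);
		return rounded >> 9;     /* 512-byte units */
	}

	int main(void)
	{
		/* A 1-byte file still occupies one 4 KiB block = eight 512 B units. */
		printf("%llu %llu\n", blocks_512(1), blocks_512(8192)); /* 8 16 */
		return 0;
	}
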
1160 1160
1161 const struct inode_operations ubifs_dir_inode_operations = { 1161 const struct inode_operations ubifs_dir_inode_operations = {
1162 .lookup = ubifs_lookup, 1162 .lookup = ubifs_lookup,
1163 .create = ubifs_create, 1163 .create = ubifs_create,
1164 .link = ubifs_link, 1164 .link = ubifs_link,
1165 .symlink = ubifs_symlink, 1165 .symlink = ubifs_symlink,
1166 .unlink = ubifs_unlink, 1166 .unlink = ubifs_unlink,
1167 .mkdir = ubifs_mkdir, 1167 .mkdir = ubifs_mkdir,
1168 .rmdir = ubifs_rmdir, 1168 .rmdir = ubifs_rmdir,
1169 .mknod = ubifs_mknod, 1169 .mknod = ubifs_mknod,
1170 .rename = ubifs_rename, 1170 .rename = ubifs_rename,
1171 .setattr = ubifs_setattr, 1171 .setattr = ubifs_setattr,
1172 .getattr = ubifs_getattr, 1172 .getattr = ubifs_getattr,
1173 .setxattr = ubifs_setxattr, 1173 .setxattr = ubifs_setxattr,
1174 .getxattr = ubifs_getxattr, 1174 .getxattr = ubifs_getxattr,
1175 .listxattr = ubifs_listxattr, 1175 .listxattr = ubifs_listxattr,
1176 .removexattr = ubifs_removexattr, 1176 .removexattr = ubifs_removexattr,
1177 }; 1177 };
1178 1178
1179 const struct file_operations ubifs_dir_operations = { 1179 const struct file_operations ubifs_dir_operations = {
1180 .llseek = ubifs_dir_llseek, 1180 .llseek = ubifs_dir_llseek,
1181 .release = ubifs_dir_release, 1181 .release = ubifs_dir_release,
1182 .read = generic_read_dir, 1182 .read = generic_read_dir,
1183 .readdir = ubifs_readdir, 1183 .readdir = ubifs_readdir,
1184 .fsync = ubifs_fsync, 1184 .fsync = ubifs_fsync,
1185 .unlocked_ioctl = ubifs_ioctl, 1185 .unlocked_ioctl = ubifs_ioctl,
1186 #ifdef CONFIG_COMPAT 1186 #ifdef CONFIG_COMPAT
1187 .compat_ioctl = ubifs_compat_ioctl, 1187 .compat_ioctl = ubifs_compat_ioctl,
1188 #endif 1188 #endif
1189 }; 1189 };
1190 1190
1 #ifndef _LINUX_FS_H 1 #ifndef _LINUX_FS_H
2 #define _LINUX_FS_H 2 #define _LINUX_FS_H
3 3
4 4
5 #include <linux/linkage.h> 5 #include <linux/linkage.h>
6 #include <linux/wait.h> 6 #include <linux/wait.h>
7 #include <linux/kdev_t.h> 7 #include <linux/kdev_t.h>
8 #include <linux/dcache.h> 8 #include <linux/dcache.h>
9 #include <linux/path.h> 9 #include <linux/path.h>
10 #include <linux/stat.h> 10 #include <linux/stat.h>
11 #include <linux/cache.h> 11 #include <linux/cache.h>
12 #include <linux/list.h> 12 #include <linux/list.h>
13 #include <linux/radix-tree.h> 13 #include <linux/radix-tree.h>
14 #include <linux/rbtree.h> 14 #include <linux/rbtree.h>
15 #include <linux/init.h> 15 #include <linux/init.h>
16 #include <linux/pid.h> 16 #include <linux/pid.h>
17 #include <linux/bug.h> 17 #include <linux/bug.h>
18 #include <linux/mutex.h> 18 #include <linux/mutex.h>
19 #include <linux/capability.h> 19 #include <linux/capability.h>
20 #include <linux/semaphore.h> 20 #include <linux/semaphore.h>
21 #include <linux/fiemap.h> 21 #include <linux/fiemap.h>
22 #include <linux/rculist_bl.h> 22 #include <linux/rculist_bl.h>
23 #include <linux/atomic.h> 23 #include <linux/atomic.h>
24 #include <linux/shrinker.h> 24 #include <linux/shrinker.h>
25 #include <linux/migrate_mode.h> 25 #include <linux/migrate_mode.h>
26 #include <linux/uidgid.h> 26 #include <linux/uidgid.h>
27 #include <linux/lockdep.h> 27 #include <linux/lockdep.h>
28 #include <linux/percpu-rwsem.h> 28 #include <linux/percpu-rwsem.h>
29 #include <linux/blk_types.h> 29 #include <linux/blk_types.h>
30 30
31 #include <asm/byteorder.h> 31 #include <asm/byteorder.h>
32 #include <uapi/linux/fs.h> 32 #include <uapi/linux/fs.h>
33 33
34 struct export_operations; 34 struct export_operations;
35 struct hd_geometry; 35 struct hd_geometry;
36 struct iovec; 36 struct iovec;
37 struct nameidata; 37 struct nameidata;
38 struct kiocb; 38 struct kiocb;
39 struct kobject; 39 struct kobject;
40 struct pipe_inode_info; 40 struct pipe_inode_info;
41 struct poll_table_struct; 41 struct poll_table_struct;
42 struct kstatfs; 42 struct kstatfs;
43 struct vm_area_struct; 43 struct vm_area_struct;
44 struct vfsmount; 44 struct vfsmount;
45 struct cred; 45 struct cred;
46 struct swap_info_struct; 46 struct swap_info_struct;
47 47
48 extern void __init inode_init(void); 48 extern void __init inode_init(void);
49 extern void __init inode_init_early(void); 49 extern void __init inode_init_early(void);
50 extern void __init files_init(unsigned long); 50 extern void __init files_init(unsigned long);
51 51
52 extern struct files_stat_struct files_stat; 52 extern struct files_stat_struct files_stat;
53 extern unsigned long get_max_files(void); 53 extern unsigned long get_max_files(void);
54 extern int sysctl_nr_open; 54 extern int sysctl_nr_open;
55 extern struct inodes_stat_t inodes_stat; 55 extern struct inodes_stat_t inodes_stat;
56 extern int leases_enable, lease_break_time; 56 extern int leases_enable, lease_break_time;
57 extern int sysctl_protected_symlinks; 57 extern int sysctl_protected_symlinks;
58 extern int sysctl_protected_hardlinks; 58 extern int sysctl_protected_hardlinks;
59 59
60 struct buffer_head; 60 struct buffer_head;
61 typedef int (get_block_t)(struct inode *inode, sector_t iblock, 61 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
62 struct buffer_head *bh_result, int create); 62 struct buffer_head *bh_result, int create);
63 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, 63 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
64 ssize_t bytes, void *private, int ret, 64 ssize_t bytes, void *private, int ret,
65 bool is_async); 65 bool is_async);
66 66
67 #define MAY_EXEC 0x00000001 67 #define MAY_EXEC 0x00000001
68 #define MAY_WRITE 0x00000002 68 #define MAY_WRITE 0x00000002
69 #define MAY_READ 0x00000004 69 #define MAY_READ 0x00000004
70 #define MAY_APPEND 0x00000008 70 #define MAY_APPEND 0x00000008
71 #define MAY_ACCESS 0x00000010 71 #define MAY_ACCESS 0x00000010
72 #define MAY_OPEN 0x00000020 72 #define MAY_OPEN 0x00000020
73 #define MAY_CHDIR 0x00000040 73 #define MAY_CHDIR 0x00000040
74 /* called from RCU mode, don't block */ 74 /* called from RCU mode, don't block */
75 #define MAY_NOT_BLOCK 0x00000080 75 #define MAY_NOT_BLOCK 0x00000080
76 76
77 /* 77 /*
78 * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond 78 * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond
79 * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() 79 * to O_WRONLY and O_RDWR via the strange trick in __dentry_open()
80 */ 80 */
81 81
82 /* file is open for reading */ 82 /* file is open for reading */
83 #define FMODE_READ ((__force fmode_t)0x1) 83 #define FMODE_READ ((__force fmode_t)0x1)
84 /* file is open for writing */ 84 /* file is open for writing */
85 #define FMODE_WRITE ((__force fmode_t)0x2) 85 #define FMODE_WRITE ((__force fmode_t)0x2)
86 /* file is seekable */ 86 /* file is seekable */
87 #define FMODE_LSEEK ((__force fmode_t)0x4) 87 #define FMODE_LSEEK ((__force fmode_t)0x4)
88 /* file can be accessed using pread */ 88 /* file can be accessed using pread */
89 #define FMODE_PREAD ((__force fmode_t)0x8) 89 #define FMODE_PREAD ((__force fmode_t)0x8)
90 /* file can be accessed using pwrite */ 90 /* file can be accessed using pwrite */
91 #define FMODE_PWRITE ((__force fmode_t)0x10) 91 #define FMODE_PWRITE ((__force fmode_t)0x10)
92 /* File is opened for execution with sys_execve / sys_uselib */ 92 /* File is opened for execution with sys_execve / sys_uselib */
93 #define FMODE_EXEC ((__force fmode_t)0x20) 93 #define FMODE_EXEC ((__force fmode_t)0x20)
94 /* File is opened with O_NDELAY (only set for block devices) */ 94 /* File is opened with O_NDELAY (only set for block devices) */
95 #define FMODE_NDELAY ((__force fmode_t)0x40) 95 #define FMODE_NDELAY ((__force fmode_t)0x40)
96 /* File is opened with O_EXCL (only set for block devices) */ 96 /* File is opened with O_EXCL (only set for block devices) */
97 #define FMODE_EXCL ((__force fmode_t)0x80) 97 #define FMODE_EXCL ((__force fmode_t)0x80)
98 /* File is opened using open(.., 3, ..) and is writable only for ioctls 98 /* File is opened using open(.., 3, ..) and is writable only for ioctls
99 (special hack for floppy.c) */ 99 (special hack for floppy.c) */
100 #define FMODE_WRITE_IOCTL ((__force fmode_t)0x100) 100 #define FMODE_WRITE_IOCTL ((__force fmode_t)0x100)
101 /* 32bit hashes as llseek() offset (for directories) */ 101 /* 32bit hashes as llseek() offset (for directories) */
102 #define FMODE_32BITHASH ((__force fmode_t)0x200) 102 #define FMODE_32BITHASH ((__force fmode_t)0x200)
103 /* 64bit hashes as llseek() offset (for directories) */ 103 /* 64bit hashes as llseek() offset (for directories) */
104 #define FMODE_64BITHASH ((__force fmode_t)0x400) 104 #define FMODE_64BITHASH ((__force fmode_t)0x400)
105 105
106 /* 106 /*
107 * Don't update ctime and mtime. 107 * Don't update ctime and mtime.
108 * 108 *
109 * Currently a special hack for the XFS open_by_handle ioctl, but we'll 109 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
110 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. 110 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
111 */ 111 */
112 #define FMODE_NOCMTIME ((__force fmode_t)0x800) 112 #define FMODE_NOCMTIME ((__force fmode_t)0x800)
113 113
114 /* Expect random access pattern */ 114 /* Expect random access pattern */
115 #define FMODE_RANDOM ((__force fmode_t)0x1000) 115 #define FMODE_RANDOM ((__force fmode_t)0x1000)
116 116
117 /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ 117 /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
118 #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) 118 #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
119 119
120 /* File is opened with O_PATH; almost nothing can be done with it */ 120 /* File is opened with O_PATH; almost nothing can be done with it */
121 #define FMODE_PATH ((__force fmode_t)0x4000) 121 #define FMODE_PATH ((__force fmode_t)0x4000)
122 122
123 /* File was opened by fanotify and shouldn't generate fanotify events */ 123 /* File was opened by fanotify and shouldn't generate fanotify events */
124 #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) 124 #define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
125 125
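A hedged sketch of how these fmode_t bits are consumed (illustration only, not part of this commit; example_write is a made-up method): the generic VFS entry points gate operations on them before calling into the filesystem.

	#include <linux/fs.h>

	/* Sketch only: vfs_write() applies this same FMODE_WRITE gate
	 * before it ever calls a filesystem's ->write() method. */
	static ssize_t example_write(struct file *filp, const char __user *buf,
				     size_t len, loff_t *ppos)
	{
		if (!(filp->f_mode & FMODE_WRITE))
			return -EBADF;	/* not opened for writing */
		/* (sys_pwrite64() similarly refuses files lacking
		 * FMODE_PWRITE before reaching this point) */
		return len;		/* a real method would copy the data */
	}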
126 /* 126 /*
127 * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector 127 * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
128 * that indicates that they should check the contents of the iovec are 128 * that indicates that they should check the contents of the iovec are
129 * valid, but not check the memory that the iovec elements 129 * valid, but not check the memory that the iovec elements
130 * point to. 130 * point to.
131 */ 131 */
132 #define CHECK_IOVEC_ONLY -1 132 #define CHECK_IOVEC_ONLY -1
133 133
134 /* 134 /*
135 * The below are the various read and write types that we support. Some of 135 * The below are the various read and write types that we support. Some of
136 * them include behavioral modifiers that send information down to the 136 * them include behavioral modifiers that send information down to the
137 * block layer and IO scheduler. Terminology: 137 * block layer and IO scheduler. Terminology:
138 * 138 *
139 * The block layer uses device plugging to defer IO a little bit, in 139 * The block layer uses device plugging to defer IO a little bit, in
140 * the hope that we will see more IO very shortly. This increases 140 * the hope that we will see more IO very shortly. This increases
141 * coalescing of adjacent IO and thus reduces the number of IOs we 141 * coalescing of adjacent IO and thus reduces the number of IOs we
142 * have to send to the device. It also allows for better queuing, 142 * have to send to the device. It also allows for better queuing,
143 * if the IO isn't mergeable. If the caller is going to be waiting 143 * if the IO isn't mergeable. If the caller is going to be waiting
144 * for the IO, then it must ensure that the device is unplugged so 144 * for the IO, then it must ensure that the device is unplugged so
145 * that the IO is dispatched to the driver. 145 * that the IO is dispatched to the driver.
146 * 146 *
147 * All IO is handled async in Linux. This is fine for background 147 * All IO is handled async in Linux. This is fine for background
148 * writes, but for reads or writes that someone waits for completion 148 * writes, but for reads or writes that someone waits for completion
149 * on, we want to notify the block layer and IO scheduler so that they 149 * on, we want to notify the block layer and IO scheduler so that they
150 * know about it. That allows them to make better scheduling 150 * know about it. That allows them to make better scheduling
151 * decisions. So when the below references 'sync' and 'async', it 151 * decisions. So when the below references 'sync' and 'async', it
152 * is referencing this priority hint. 152 * is referencing this priority hint.
153 * 153 *
154 * With that in mind, the available types are: 154 * With that in mind, the available types are:
155 * 155 *
156 * READ A normal read operation. Device will be plugged. 156 * READ A normal read operation. Device will be plugged.
157 * READ_SYNC A synchronous read. Device is not plugged, caller can 157 * READ_SYNC A synchronous read. Device is not plugged, caller can
158 * immediately wait on this read without caring about 158 * immediately wait on this read without caring about
159 * unplugging. 159 * unplugging.
160 * READA Used for read-ahead operations. Lower priority, and the 160 * READA Used for read-ahead operations. Lower priority, and the
161 * block layer could (in theory) choose to ignore this 161 * block layer could (in theory) choose to ignore this
162 * request if it runs into resource problems. 162 * request if it runs into resource problems.
163 * WRITE A normal async write. Device will be plugged. 163 * WRITE A normal async write. Device will be plugged.
164 * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down 164 * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down
165 * the hint that someone will be waiting on this IO 165 * the hint that someone will be waiting on this IO
166 * shortly. The write equivalent of READ_SYNC. 166 * shortly. The write equivalent of READ_SYNC.
167 * WRITE_ODIRECT Special case write for O_DIRECT only. 167 * WRITE_ODIRECT Special case write for O_DIRECT only.
168 * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. 168 * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush.
169 * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on 169 * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on
170 * non-volatile media on completion. 170 * non-volatile media on completion.
171 * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded 171 * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
172 * by a cache flush and data is guaranteed to be on 172 * by a cache flush and data is guaranteed to be on
173 * non-volatile media on completion. 173 * non-volatile media on completion.
174 * 174 *
175 */ 175 */
176 #define RW_MASK REQ_WRITE 176 #define RW_MASK REQ_WRITE
177 #define RWA_MASK REQ_RAHEAD 177 #define RWA_MASK REQ_RAHEAD
178 178
179 #define READ 0 179 #define READ 0
180 #define WRITE RW_MASK 180 #define WRITE RW_MASK
181 #define READA RWA_MASK 181 #define READA RWA_MASK
182 #define KERNEL_READ (READ|REQ_KERNEL) 182 #define KERNEL_READ (READ|REQ_KERNEL)
183 #define KERNEL_WRITE (WRITE|REQ_KERNEL) 183 #define KERNEL_WRITE (WRITE|REQ_KERNEL)
184 184
185 #define READ_SYNC (READ | REQ_SYNC) 185 #define READ_SYNC (READ | REQ_SYNC)
186 #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) 186 #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE)
187 #define WRITE_ODIRECT (WRITE | REQ_SYNC) 187 #define WRITE_ODIRECT (WRITE | REQ_SYNC)
188 #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) 188 #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
189 #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) 189 #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
190 #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) 190 #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
191 191
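As a hedged illustration of the sync/async hint described above (not part of this diff; example_submit is hypothetical), a caller that intends to wait for a buffer write passes WRITE_SYNC rather than plain WRITE, in the style of __sync_dirty_buffer():

	#include <linux/buffer_head.h>

	/* Sketch only: choose the request type based on whether the
	 * caller will wait on this IO, per the comment above. */
	static int example_submit(struct buffer_head *bh, bool will_wait)
	{
		lock_buffer(bh);
		bh->b_end_io = end_buffer_write_sync;
		get_bh(bh);			/* reference for the bio */
		return submit_bh(will_wait ? WRITE_SYNC : WRITE, bh);
	}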
192 /* 192 /*
193 * Attribute flags. These should be or-ed together to figure out what 193 * Attribute flags. These should be or-ed together to figure out what
194 * has been changed! 194 * has been changed!
195 */ 195 */
196 #define ATTR_MODE (1 << 0) 196 #define ATTR_MODE (1 << 0)
197 #define ATTR_UID (1 << 1) 197 #define ATTR_UID (1 << 1)
198 #define ATTR_GID (1 << 2) 198 #define ATTR_GID (1 << 2)
199 #define ATTR_SIZE (1 << 3) 199 #define ATTR_SIZE (1 << 3)
200 #define ATTR_ATIME (1 << 4) 200 #define ATTR_ATIME (1 << 4)
201 #define ATTR_MTIME (1 << 5) 201 #define ATTR_MTIME (1 << 5)
202 #define ATTR_CTIME (1 << 6) 202 #define ATTR_CTIME (1 << 6)
203 #define ATTR_ATIME_SET (1 << 7) 203 #define ATTR_ATIME_SET (1 << 7)
204 #define ATTR_MTIME_SET (1 << 8) 204 #define ATTR_MTIME_SET (1 << 8)
205 #define ATTR_FORCE (1 << 9) /* Not a change, but force the change */ 205 #define ATTR_FORCE (1 << 9) /* Not a change, but force the change */
206 #define ATTR_ATTR_FLAG (1 << 10) 206 #define ATTR_ATTR_FLAG (1 << 10)
207 #define ATTR_KILL_SUID (1 << 11) 207 #define ATTR_KILL_SUID (1 << 11)
208 #define ATTR_KILL_SGID (1 << 12) 208 #define ATTR_KILL_SGID (1 << 12)
209 #define ATTR_FILE (1 << 13) 209 #define ATTR_FILE (1 << 13)
210 #define ATTR_KILL_PRIV (1 << 14) 210 #define ATTR_KILL_PRIV (1 << 14)
211 #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ 211 #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */
212 #define ATTR_TIMES_SET (1 << 16) 212 #define ATTR_TIMES_SET (1 << 16)
213 213
214 /* 214 /*
215 * This is the Inode Attributes structure, used for notify_change(). It 215 * This is the Inode Attributes structure, used for notify_change(). It
216 * uses the above definitions as flags, to know which values have changed. 216 * uses the above definitions as flags, to know which values have changed.
217 * Also, in this manner, a Filesystem can look at only the values it cares 217 * Also, in this manner, a Filesystem can look at only the values it cares
218 * about. Basically, these are the attributes that the VFS layer can 218 * about. Basically, these are the attributes that the VFS layer can
219 * request to change from the FS layer. 219 * request to change from the FS layer.
220 * 220 *
221 * Derek Atkins <warlord@MIT.EDU> 94-10-20 221 * Derek Atkins <warlord@MIT.EDU> 94-10-20
222 */ 222 */
223 struct iattr { 223 struct iattr {
224 unsigned int ia_valid; 224 unsigned int ia_valid;
225 umode_t ia_mode; 225 umode_t ia_mode;
226 kuid_t ia_uid; 226 kuid_t ia_uid;
227 kgid_t ia_gid; 227 kgid_t ia_gid;
228 loff_t ia_size; 228 loff_t ia_size;
229 struct timespec ia_atime; 229 struct timespec ia_atime;
230 struct timespec ia_mtime; 230 struct timespec ia_mtime;
231 struct timespec ia_ctime; 231 struct timespec ia_ctime;
232 232
233 /* 233 /*
234 * Not an attribute, but an auxiliary info for filesystems wanting to 234 * Not an attribute, but an auxiliary info for filesystems wanting to
235 * implement an ftruncate() like method. NOTE: filesystem should 235 * implement an ftruncate() like method. NOTE: filesystem should
236 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). 236 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
237 */ 237 */
238 struct file *ia_file; 238 struct file *ia_file;
239 }; 239 };
240 240
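A hedged sketch of a typical iattr use (not part of this commit; example_truncate is hypothetical, modeled loosely on fs/open.c:do_truncate()): set the attributes you changed, mark them in ia_valid, and hand the whole thing to notify_change().

	#include <linux/fs.h>

	/* Sketch only: truncate a file by sending ATTR_SIZE through
	 * notify_change().  The caller must hold the inode's i_mutex;
	 * the unset time fields are filled in from the current time. */
	static int example_truncate(struct dentry *dentry, loff_t length)
	{
		struct iattr newattrs;

		newattrs.ia_size = length;
		newattrs.ia_valid = ATTR_SIZE | ATTR_MTIME | ATTR_CTIME;
		return notify_change(dentry, &newattrs);
	}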
241 /* 241 /*
242 * Includes for diskquotas. 242 * Includes for diskquotas.
243 */ 243 */
244 #include <linux/quota.h> 244 #include <linux/quota.h>
245 245
246 /** 246 /**
247 * enum positive_aop_returns - aop return codes with specific semantics 247 * enum positive_aop_returns - aop return codes with specific semantics
248 * 248 *
249 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has 249 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
250 * completed, that the page is still locked, and 250 * completed, that the page is still locked, and
251 * should be considered active. The VM uses this hint 251 * should be considered active. The VM uses this hint
252 * to return the page to the active list -- it won't 252 * to return the page to the active list -- it won't
253 * be a candidate for writeback again in the near 253 * be a candidate for writeback again in the near
254 * future. Other callers must be careful to unlock 254 * future. Other callers must be careful to unlock
255 * the page if they get this return. Returned by 255 * the page if they get this return. Returned by
256 * writepage(). 256 * writepage().
257 * 257 *
258 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has 258 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
259 * unlocked it and the page might have been truncated. 259 * unlocked it and the page might have been truncated.
260 * The caller should back up to acquiring a new page and 260 * The caller should back up to acquiring a new page and
261 * trying again. The aop will be taking reasonable 261 * trying again. The aop will be taking reasonable
262 * precautions not to livelock. If the caller held a page 262 * precautions not to livelock. If the caller held a page
263 * reference, it should drop it before retrying. Returned 263 * reference, it should drop it before retrying. Returned
264 * by readpage(). 264 * by readpage().
265 * 265 *
266 * address_space_operation functions return these large constants to indicate 266 * address_space_operation functions return these large constants to indicate
267 * special semantics to the caller. These are much larger than the bytes in a 267 * special semantics to the caller. These are much larger than the bytes in a
268 * page to allow for functions that return the number of bytes operated on in a 268 * page to allow for functions that return the number of bytes operated on in a
269 * given page. 269 * given page.
270 */ 270 */
271 271
272 enum positive_aop_returns { 272 enum positive_aop_returns {
273 AOP_WRITEPAGE_ACTIVATE = 0x80000, 273 AOP_WRITEPAGE_ACTIVATE = 0x80000,
274 AOP_TRUNCATED_PAGE = 0x80001, 274 AOP_TRUNCATED_PAGE = 0x80001,
275 }; 275 };
276 276
277 #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ 277 #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */
278 #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ 278 #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */
279 #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct 279 #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct
280 * helper code (eg buffer layer) 280 * helper code (eg buffer layer)
281 * to clear GFP_FS from alloc */ 281 * to clear GFP_FS from alloc */
282 282
283 /* 283 /*
284 * oh the beauties of C type declarations. 284 * oh the beauties of C type declarations.
285 */ 285 */
286 struct page; 286 struct page;
287 struct address_space; 287 struct address_space;
288 struct writeback_control; 288 struct writeback_control;
289 289
290 struct iov_iter { 290 struct iov_iter {
291 const struct iovec *iov; 291 const struct iovec *iov;
292 unsigned long nr_segs; 292 unsigned long nr_segs;
293 size_t iov_offset; 293 size_t iov_offset;
294 size_t count; 294 size_t count;
295 }; 295 };
296 296
297 size_t iov_iter_copy_from_user_atomic(struct page *page, 297 size_t iov_iter_copy_from_user_atomic(struct page *page,
298 struct iov_iter *i, unsigned long offset, size_t bytes); 298 struct iov_iter *i, unsigned long offset, size_t bytes);
299 size_t iov_iter_copy_from_user(struct page *page, 299 size_t iov_iter_copy_from_user(struct page *page,
300 struct iov_iter *i, unsigned long offset, size_t bytes); 300 struct iov_iter *i, unsigned long offset, size_t bytes);
301 void iov_iter_advance(struct iov_iter *i, size_t bytes); 301 void iov_iter_advance(struct iov_iter *i, size_t bytes);
302 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); 302 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
303 size_t iov_iter_single_seg_count(struct iov_iter *i); 303 size_t iov_iter_single_seg_count(struct iov_iter *i);
304 304
305 static inline void iov_iter_init(struct iov_iter *i, 305 static inline void iov_iter_init(struct iov_iter *i,
306 const struct iovec *iov, unsigned long nr_segs, 306 const struct iovec *iov, unsigned long nr_segs,
307 size_t count, size_t written) 307 size_t count, size_t written)
308 { 308 {
309 i->iov = iov; 309 i->iov = iov;
310 i->nr_segs = nr_segs; 310 i->nr_segs = nr_segs;
311 i->iov_offset = 0; 311 i->iov_offset = 0;
312 i->count = count + written; 312 i->count = count + written;
313 313
314 iov_iter_advance(i, written); 314 iov_iter_advance(i, written);
315 } 315 }
316 316
317 static inline size_t iov_iter_count(struct iov_iter *i) 317 static inline size_t iov_iter_count(struct iov_iter *i)
318 { 318 {
319 return i->count; 319 return i->count;
320 } 320 }
321 321
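A hedged sketch of the iov_iter helpers declared above (illustration only; example_fill_page is hypothetical): wrap a single user buffer in an iovec, then copy and advance.

	/* Sketch only: copy one user buffer into a page via an iov_iter. */
	static size_t example_fill_page(struct page *page,
					char __user *buf, size_t len)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct iov_iter i;
		size_t copied;

		iov_iter_init(&i, &iov, 1, len, 0);	/* nothing written yet */
		copied = iov_iter_copy_from_user(page, &i, 0, len);
		iov_iter_advance(&i, copied);
		return iov_iter_count(&i);		/* bytes left uncopied */
	}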
322 /* 322 /*
323 * "descriptor" for what we're up to with a read. 323 * "descriptor" for what we're up to with a read.
324 * This allows us to use the same read code yet 324 * This allows us to use the same read code yet
325 * have multiple different users of the data that 325 * have multiple different users of the data that
326 * we read from a file. 326 * we read from a file.
327 * 327 *
328 * The simplest case just copies the data to user 328 * The simplest case just copies the data to user
329 * mode. 329 * mode.
330 */ 330 */
331 typedef struct { 331 typedef struct {
332 size_t written; 332 size_t written;
333 size_t count; 333 size_t count;
334 union { 334 union {
335 char __user *buf; 335 char __user *buf;
336 void *data; 336 void *data;
337 } arg; 337 } arg;
338 int error; 338 int error;
339 } read_descriptor_t; 339 } read_descriptor_t;
340 340
341 typedef int (*read_actor_t)(read_descriptor_t *, struct page *, 341 typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
342 unsigned long, unsigned long); 342 unsigned long, unsigned long);
343 343
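A hedged sketch of a read_actor_t (not part of this diff; example_read_actor is hypothetical, loosely after mm/filemap.c:file_read_actor()): copy page contents to the user buffer in desc->arg.buf and account for what was transferred.

	#include <linux/highmem.h>
	#include <asm/uaccess.h>

	/* Sketch only: the "simplest case" the comment above mentions,
	 * copying the data to user mode. */
	static int example_read_actor(read_descriptor_t *desc, struct page *page,
				      unsigned long offset, unsigned long size)
	{
		unsigned long left, count = desc->count;
		char *kaddr;

		if (size > count)
			size = count;
		kaddr = kmap(page);
		left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
		kunmap(page);
		if (left) {
			size -= left;
			desc->error = -EFAULT;
		}
		desc->count = count - size;
		desc->written += size;
		desc->arg.buf += size;
		return size;
	}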
344 struct address_space_operations { 344 struct address_space_operations {
345 int (*writepage)(struct page *page, struct writeback_control *wbc); 345 int (*writepage)(struct page *page, struct writeback_control *wbc);
346 int (*readpage)(struct file *, struct page *); 346 int (*readpage)(struct file *, struct page *);
347 347
348 /* Write back some dirty pages from this mapping. */ 348 /* Write back some dirty pages from this mapping. */
349 int (*writepages)(struct address_space *, struct writeback_control *); 349 int (*writepages)(struct address_space *, struct writeback_control *);
350 350
351 /* Set a page dirty. Return true if this dirtied it */ 351 /* Set a page dirty. Return true if this dirtied it */
352 int (*set_page_dirty)(struct page *page); 352 int (*set_page_dirty)(struct page *page);
353 353
354 int (*readpages)(struct file *filp, struct address_space *mapping, 354 int (*readpages)(struct file *filp, struct address_space *mapping,
355 struct list_head *pages, unsigned nr_pages); 355 struct list_head *pages, unsigned nr_pages);
356 356
357 int (*write_begin)(struct file *, struct address_space *mapping, 357 int (*write_begin)(struct file *, struct address_space *mapping,
358 loff_t pos, unsigned len, unsigned flags, 358 loff_t pos, unsigned len, unsigned flags,
359 struct page **pagep, void **fsdata); 359 struct page **pagep, void **fsdata);
360 int (*write_end)(struct file *, struct address_space *mapping, 360 int (*write_end)(struct file *, struct address_space *mapping,
361 loff_t pos, unsigned len, unsigned copied, 361 loff_t pos, unsigned len, unsigned copied,
362 struct page *page, void *fsdata); 362 struct page *page, void *fsdata);
363 363
364 /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ 364 /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
365 sector_t (*bmap)(struct address_space *, sector_t); 365 sector_t (*bmap)(struct address_space *, sector_t);
366 void (*invalidatepage) (struct page *, unsigned long); 366 void (*invalidatepage) (struct page *, unsigned long);
367 int (*releasepage) (struct page *, gfp_t); 367 int (*releasepage) (struct page *, gfp_t);
368 void (*freepage)(struct page *); 368 void (*freepage)(struct page *);
369 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 369 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
370 loff_t offset, unsigned long nr_segs); 370 loff_t offset, unsigned long nr_segs);
371 int (*get_xip_mem)(struct address_space *, pgoff_t, int, 371 int (*get_xip_mem)(struct address_space *, pgoff_t, int,
372 void **, unsigned long *); 372 void **, unsigned long *);
373 /* 373 /*
374 * migrate the contents of a page to the specified target. If sync 374 * migrate the contents of a page to the specified target. If sync
375 * is false, it must not block. 375 * is false, it must not block.
376 */ 376 */
377 int (*migratepage) (struct address_space *, 377 int (*migratepage) (struct address_space *,
378 struct page *, struct page *, enum migrate_mode); 378 struct page *, struct page *, enum migrate_mode);
379 int (*launder_page) (struct page *); 379 int (*launder_page) (struct page *);
380 int (*is_partially_uptodate) (struct page *, read_descriptor_t *, 380 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
381 unsigned long); 381 unsigned long);
382 int (*error_remove_page)(struct address_space *, struct page *); 382 int (*error_remove_page)(struct address_space *, struct page *);
383 383
384 /* swapfile support */ 384 /* swapfile support */
385 int (*swap_activate)(struct swap_info_struct *sis, struct file *file, 385 int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
386 sector_t *span); 386 sector_t *span);
387 void (*swap_deactivate)(struct file *file); 387 void (*swap_deactivate)(struct file *file);
388 }; 388 };
389 389
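A hedged sketch of a minimal address_space_operations table (illustration only; all example_* names are hypothetical): a simple block-backed filesystem can wire the methods to the generic buffer-layer helpers, supplying only its own block mapper.

	#include <linux/buffer_head.h>

	/* example_get_block stands in for the filesystem's real block
	 * mapper; a real one would fill bh_result from on-disk metadata. */
	static int example_get_block(struct inode *inode, sector_t iblock,
				     struct buffer_head *bh_result, int create)
	{
		return -EIO;
	}

	static int example_readpage(struct file *file, struct page *page)
	{
		return block_read_full_page(page, example_get_block);
	}

	static int example_writepage(struct page *page,
				     struct writeback_control *wbc)
	{
		return block_write_full_page(page, example_get_block, wbc);
	}

	static const struct address_space_operations example_aops = {
		.readpage	= example_readpage,
		.writepage	= example_writepage,
	};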
390 extern const struct address_space_operations empty_aops; 390 extern const struct address_space_operations empty_aops;
391 391
392 /* 392 /*
393 * pagecache_write_begin/pagecache_write_end must be used by general code 393 * pagecache_write_begin/pagecache_write_end must be used by general code
394 * to write into the pagecache. 394 * to write into the pagecache.
395 */ 395 */
396 int pagecache_write_begin(struct file *, struct address_space *mapping, 396 int pagecache_write_begin(struct file *, struct address_space *mapping,
397 loff_t pos, unsigned len, unsigned flags, 397 loff_t pos, unsigned len, unsigned flags,
398 struct page **pagep, void **fsdata); 398 struct page **pagep, void **fsdata);
399 399
400 int pagecache_write_end(struct file *, struct address_space *mapping, 400 int pagecache_write_end(struct file *, struct address_space *mapping,
401 loff_t pos, unsigned len, unsigned copied, 401 loff_t pos, unsigned len, unsigned copied,
402 struct page *page, void *fsdata); 402 struct page *page, void *fsdata);
403 403
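A hedged sketch of the begin/copy/end protocol required above (not part of this commit; example_pagecache_write is hypothetical and assumes the run of bytes fits within one page):

	#include <linux/pagemap.h>
	#include <linux/highmem.h>

	/* Sketch only: write one short, page-contained run of bytes
	 * into the pagecache using the mandated helpers. */
	static int example_pagecache_write(struct file *file, loff_t pos,
					   const char *src, unsigned len)
	{
		struct address_space *mapping = file->f_mapping;
		struct page *page;
		void *fsdata;
		char *kaddr;
		int status;

		status = pagecache_write_begin(file, mapping, pos, len,
					       0, &page, &fsdata);
		if (status)
			return status;
		kaddr = kmap(page);
		memcpy(kaddr + (pos & (PAGE_CACHE_SIZE - 1)), src, len);
		kunmap(page);
		status = pagecache_write_end(file, mapping, pos, len, len,
					     page, fsdata);
		return status < 0 ? status : 0;
	}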
404 struct backing_dev_info; 404 struct backing_dev_info;
405 struct address_space { 405 struct address_space {
406 struct inode *host; /* owner: inode, block_device */ 406 struct inode *host; /* owner: inode, block_device */
407 struct radix_tree_root page_tree; /* radix tree of all pages */ 407 struct radix_tree_root page_tree; /* radix tree of all pages */
408 spinlock_t tree_lock; /* and lock protecting it */ 408 spinlock_t tree_lock; /* and lock protecting it */
409 unsigned int i_mmap_writable;/* count VM_SHARED mappings */ 409 unsigned int i_mmap_writable;/* count VM_SHARED mappings */
410 struct rb_root i_mmap; /* tree of private and shared mappings */ 410 struct rb_root i_mmap; /* tree of private and shared mappings */
411 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ 411 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
412 struct mutex i_mmap_mutex; /* protect tree, count, list */ 412 struct mutex i_mmap_mutex; /* protect tree, count, list */
413 /* Protected by tree_lock together with the radix tree */ 413 /* Protected by tree_lock together with the radix tree */
414 unsigned long nrpages; /* number of total pages */ 414 unsigned long nrpages; /* number of total pages */
415 pgoff_t writeback_index;/* writeback starts here */ 415 pgoff_t writeback_index;/* writeback starts here */
416 const struct address_space_operations *a_ops; /* methods */ 416 const struct address_space_operations *a_ops; /* methods */
417 unsigned long flags; /* error bits/gfp mask */ 417 unsigned long flags; /* error bits/gfp mask */
418 struct backing_dev_info *backing_dev_info; /* device readahead, etc */ 418 struct backing_dev_info *backing_dev_info; /* device readahead, etc */
419 spinlock_t private_lock; /* for use by the address_space */ 419 spinlock_t private_lock; /* for use by the address_space */
420 struct list_head private_list; /* ditto */ 420 struct list_head private_list; /* ditto */
421 void *private_data; /* ditto */ 421 void *private_data; /* ditto */
422 } __attribute__((aligned(sizeof(long)))); 422 } __attribute__((aligned(sizeof(long))));
423 /* 423 /*
424 * On most architectures that alignment is already the case; but 424 * On most architectures that alignment is already the case; but
425 * must be enforced here for CRIS, to let the least significant bit 425 * must be enforced here for CRIS, to let the least significant bit
426 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. 426 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
427 */ 427 */
428 struct request_queue; 428 struct request_queue;
429 429
430 struct block_device { 430 struct block_device {
431 dev_t bd_dev; /* not a kdev_t - it's a search key */ 431 dev_t bd_dev; /* not a kdev_t - it's a search key */
432 int bd_openers; 432 int bd_openers;
433 struct inode * bd_inode; /* will die */ 433 struct inode * bd_inode; /* will die */
434 struct super_block * bd_super; 434 struct super_block * bd_super;
435 struct mutex bd_mutex; /* open/close mutex */ 435 struct mutex bd_mutex; /* open/close mutex */
436 struct list_head bd_inodes; 436 struct list_head bd_inodes;
437 void * bd_claiming; 437 void * bd_claiming;
438 void * bd_holder; 438 void * bd_holder;
439 int bd_holders; 439 int bd_holders;
440 bool bd_write_holder; 440 bool bd_write_holder;
441 #ifdef CONFIG_SYSFS 441 #ifdef CONFIG_SYSFS
442 struct list_head bd_holder_disks; 442 struct list_head bd_holder_disks;
443 #endif 443 #endif
444 struct block_device * bd_contains; 444 struct block_device * bd_contains;
445 unsigned bd_block_size; 445 unsigned bd_block_size;
446 struct hd_struct * bd_part; 446 struct hd_struct * bd_part;
447 /* number of times partitions within this device have been opened. */ 447 /* number of times partitions within this device have been opened. */
448 unsigned bd_part_count; 448 unsigned bd_part_count;
449 int bd_invalidated; 449 int bd_invalidated;
450 struct gendisk * bd_disk; 450 struct gendisk * bd_disk;
451 struct request_queue * bd_queue; 451 struct request_queue * bd_queue;
452 struct list_head bd_list; 452 struct list_head bd_list;
453 /* 453 /*
454 * Private data. You must have bd_claim'ed the block_device 454 * Private data. You must have bd_claim'ed the block_device
455 * to use this. NOTE: bd_claim allows an owner to claim 455 * to use this. NOTE: bd_claim allows an owner to claim
456 * the same device multiple times, the owner must take special 456 * the same device multiple times, the owner must take special
457 * care to not mess up bd_private for that case. 457 * care to not mess up bd_private for that case.
458 */ 458 */
459 unsigned long bd_private; 459 unsigned long bd_private;
460 460
461 /* The counter of freeze processes */ 461 /* The counter of freeze processes */
462 int bd_fsfreeze_count; 462 int bd_fsfreeze_count;
463 /* Mutex for freeze */ 463 /* Mutex for freeze */
464 struct mutex bd_fsfreeze_mutex; 464 struct mutex bd_fsfreeze_mutex;
465 }; 465 };
466 466
467 /* 467 /*
468 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache 468 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
469 * radix trees 469 * radix trees
470 */ 470 */
471 #define PAGECACHE_TAG_DIRTY 0 471 #define PAGECACHE_TAG_DIRTY 0
472 #define PAGECACHE_TAG_WRITEBACK 1 472 #define PAGECACHE_TAG_WRITEBACK 1
473 #define PAGECACHE_TAG_TOWRITE 2 473 #define PAGECACHE_TAG_TOWRITE 2
474 474
475 int mapping_tagged(struct address_space *mapping, int tag); 475 int mapping_tagged(struct address_space *mapping, int tag);
476 476
477 /* 477 /*
478 * Might pages of this file be mapped into userspace? 478 * Might pages of this file be mapped into userspace?
479 */ 479 */
480 static inline int mapping_mapped(struct address_space *mapping) 480 static inline int mapping_mapped(struct address_space *mapping)
481 { 481 {
482 return !RB_EMPTY_ROOT(&mapping->i_mmap) || 482 return !RB_EMPTY_ROOT(&mapping->i_mmap) ||
483 !list_empty(&mapping->i_mmap_nonlinear); 483 !list_empty(&mapping->i_mmap_nonlinear);
484 } 484 }
485 485
486 /* 486 /*
487 * Might pages of this file have been modified in userspace? 487 * Might pages of this file have been modified in userspace?
488 * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff 488 * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff
489 * marks vma as VM_SHARED if it is shared, and the file was opened for 489 * marks vma as VM_SHARED if it is shared, and the file was opened for
490 * writing, i.e. the vma may be mprotected writable even if now read-only. 490 * writing, i.e. the vma may be mprotected writable even if now read-only.
491 */ 491 */
492 static inline int mapping_writably_mapped(struct address_space *mapping) 492 static inline int mapping_writably_mapped(struct address_space *mapping)
493 { 493 {
494 return mapping->i_mmap_writable != 0; 494 return mapping->i_mmap_writable != 0;
495 } 495 }
496 496
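A hedged sketch of a typical caller (illustration only; example_prepare_copy is hypothetical, after the pattern used in mm/filemap.c's read path):

	#include <linux/highmem.h>	/* flush_dcache_page() */

	/* Sketch only: if userspace may have dirtied the page through a
	 * shared mapping, flush the d-cache before the kernel reads it. */
	static void example_prepare_copy(struct address_space *mapping,
					 struct page *page)
	{
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);
	}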
497 /* 497 /*
498 * Use sequence counter to get consistent i_size on 32-bit processors. 498 * Use sequence counter to get consistent i_size on 32-bit processors.
499 */ 499 */
500 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 500 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
501 #include <linux/seqlock.h> 501 #include <linux/seqlock.h>
502 #define __NEED_I_SIZE_ORDERED 502 #define __NEED_I_SIZE_ORDERED
503 #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) 503 #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
504 #else 504 #else
505 #define i_size_ordered_init(inode) do { } while (0) 505 #define i_size_ordered_init(inode) do { } while (0)
506 #endif 506 #endif
507 507
508 struct posix_acl; 508 struct posix_acl;
509 #define ACL_NOT_CACHED ((void *)(-1)) 509 #define ACL_NOT_CACHED ((void *)(-1))
510 510
511 #define IOP_FASTPERM 0x0001 511 #define IOP_FASTPERM 0x0001
512 #define IOP_LOOKUP 0x0002 512 #define IOP_LOOKUP 0x0002
513 #define IOP_NOFOLLOW 0x0004 513 #define IOP_NOFOLLOW 0x0004
514 514
515 /* 515 /*
516 * Keep mostly read-only and often accessed (especially for 516 * Keep mostly read-only and often accessed (especially for
517 * the RCU path lookup and 'stat' data) fields at the beginning 517 * the RCU path lookup and 'stat' data) fields at the beginning
518 * of the 'struct inode' 518 * of the 'struct inode'
519 */ 519 */
520 struct inode { 520 struct inode {
521 umode_t i_mode; 521 umode_t i_mode;
522 unsigned short i_opflags; 522 unsigned short i_opflags;
523 kuid_t i_uid; 523 kuid_t i_uid;
524 kgid_t i_gid; 524 kgid_t i_gid;
525 unsigned int i_flags; 525 unsigned int i_flags;
526 526
527 #ifdef CONFIG_FS_POSIX_ACL 527 #ifdef CONFIG_FS_POSIX_ACL
528 struct posix_acl *i_acl; 528 struct posix_acl *i_acl;
529 struct posix_acl *i_default_acl; 529 struct posix_acl *i_default_acl;
530 #endif 530 #endif
531 531
532 const struct inode_operations *i_op; 532 const struct inode_operations *i_op;
533 struct super_block *i_sb; 533 struct super_block *i_sb;
534 struct address_space *i_mapping; 534 struct address_space *i_mapping;
535 535
536 #ifdef CONFIG_SECURITY 536 #ifdef CONFIG_SECURITY
537 void *i_security; 537 void *i_security;
538 #endif 538 #endif
539 539
540 /* Stat data, not accessed from path walking */ 540 /* Stat data, not accessed from path walking */
541 unsigned long i_ino; 541 unsigned long i_ino;
542 /* 542 /*
543 * Filesystems may only read i_nlink directly. They shall use the 543 * Filesystems may only read i_nlink directly. They shall use the
544 * following functions for modification: 544 * following functions for modification:
545 * 545 *
546 * (set|clear|inc|drop)_nlink 546 * (set|clear|inc|drop)_nlink
547 * inode_(inc|dec)_link_count 547 * inode_(inc|dec)_link_count
548 */ 548 */
549 union { 549 union {
550 const unsigned int i_nlink; 550 const unsigned int i_nlink;
551 unsigned int __i_nlink; 551 unsigned int __i_nlink;
552 }; 552 };
553 dev_t i_rdev; 553 dev_t i_rdev;
554 loff_t i_size; 554 loff_t i_size;
555 struct timespec i_atime; 555 struct timespec i_atime;
556 struct timespec i_mtime; 556 struct timespec i_mtime;
557 struct timespec i_ctime; 557 struct timespec i_ctime;
558 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 558 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
559 unsigned short i_bytes; 559 unsigned short i_bytes;
560 unsigned int i_blkbits; 560 unsigned int i_blkbits;
561 blkcnt_t i_blocks; 561 blkcnt_t i_blocks;
562 562
563 #ifdef __NEED_I_SIZE_ORDERED 563 #ifdef __NEED_I_SIZE_ORDERED
564 seqcount_t i_size_seqcount; 564 seqcount_t i_size_seqcount;
565 #endif 565 #endif
566 566
567 /* Misc */ 567 /* Misc */
568 unsigned long i_state; 568 unsigned long i_state;
569 struct mutex i_mutex; 569 struct mutex i_mutex;
570 570
571 unsigned long dirtied_when; /* jiffies of first dirtying */ 571 unsigned long dirtied_when; /* jiffies of first dirtying */
572 572
573 struct hlist_node i_hash; 573 struct hlist_node i_hash;
574 struct list_head i_wb_list; /* backing dev IO list */ 574 struct list_head i_wb_list; /* backing dev IO list */
575 struct list_head i_lru; /* inode LRU list */ 575 struct list_head i_lru; /* inode LRU list */
576 struct list_head i_sb_list; 576 struct list_head i_sb_list;
577 union { 577 union {
578 struct hlist_head i_dentry; 578 struct hlist_head i_dentry;
579 struct rcu_head i_rcu; 579 struct rcu_head i_rcu;
580 }; 580 };
581 u64 i_version; 581 u64 i_version;
582 atomic_t i_count; 582 atomic_t i_count;
583 atomic_t i_dio_count; 583 atomic_t i_dio_count;
584 atomic_t i_writecount; 584 atomic_t i_writecount;
585 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 585 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
586 struct file_lock *i_flock; 586 struct file_lock *i_flock;
587 struct address_space i_data; 587 struct address_space i_data;
588 #ifdef CONFIG_QUOTA 588 #ifdef CONFIG_QUOTA
589 struct dquot *i_dquot[MAXQUOTAS]; 589 struct dquot *i_dquot[MAXQUOTAS];
590 #endif 590 #endif
591 struct list_head i_devices; 591 struct list_head i_devices;
592 union { 592 union {
593 struct pipe_inode_info *i_pipe; 593 struct pipe_inode_info *i_pipe;
594 struct block_device *i_bdev; 594 struct block_device *i_bdev;
595 struct cdev *i_cdev; 595 struct cdev *i_cdev;
596 }; 596 };
597 597
598 __u32 i_generation; 598 __u32 i_generation;
599 599
600 #ifdef CONFIG_FSNOTIFY 600 #ifdef CONFIG_FSNOTIFY
601 __u32 i_fsnotify_mask; /* all events this inode cares about */ 601 __u32 i_fsnotify_mask; /* all events this inode cares about */
602 struct hlist_head i_fsnotify_marks; 602 struct hlist_head i_fsnotify_marks;
603 #endif 603 #endif
604 604
605 #ifdef CONFIG_IMA 605 #ifdef CONFIG_IMA
606 atomic_t i_readcount; /* struct files open RO */ 606 atomic_t i_readcount; /* struct files open RO */
607 #endif 607 #endif
608 void *i_private; /* fs or device private pointer */ 608 void *i_private; /* fs or device private pointer */
609 }; 609 };
610 610
611 static inline int inode_unhashed(struct inode *inode) 611 static inline int inode_unhashed(struct inode *inode)
612 { 612 {
613 return hlist_unhashed(&inode->i_hash); 613 return hlist_unhashed(&inode->i_hash);
614 } 614 }
615 615
616 /* 616 /*
617 * inode->i_mutex nesting subclasses for the lock validator: 617 * inode->i_mutex nesting subclasses for the lock validator:
618 * 618 *
619 * 0: the object of the current VFS operation 619 * 0: the object of the current VFS operation
620 * 1: parent 620 * 1: parent
621 * 2: child/target 621 * 2: child/target
622 * 3: quota file 622 * 3: quota file
623 * 623 *
624 * The locking order between these classes is 624 * The locking order between these classes is
625 * parent -> child -> normal -> xattr -> quota 625 * parent -> child -> normal -> xattr -> quota
626 */ 626 */
627 enum inode_i_mutex_lock_class 627 enum inode_i_mutex_lock_class
628 { 628 {
629 I_MUTEX_NORMAL, 629 I_MUTEX_NORMAL,
630 I_MUTEX_PARENT, 630 I_MUTEX_PARENT,
631 I_MUTEX_CHILD, 631 I_MUTEX_CHILD,
632 I_MUTEX_XATTR, 632 I_MUTEX_XATTR,
633 I_MUTEX_QUOTA 633 I_MUTEX_QUOTA
634 }; 634 };
635 635
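A hedged sketch of these subclasses in use (not part of this diff; example_lock_pair is hypothetical, in the style of fs/namei.c:lock_rename()): when two i_mutexes must be held at once, distinct lockdep subclasses keep the validator satisfied.

	#include <linux/mutex.h>

	/* Sketch only: take parent then child, honouring the locking
	 * order documented above. */
	static void example_lock_pair(struct inode *parent, struct inode *child)
	{
		mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&child->i_mutex, I_MUTEX_CHILD);
	}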
636 /* 636 /*
637 * NOTE: on a 32bit arch with a preemptible kernel and 637 * NOTE: on a 32bit arch with a preemptible kernel and
638 * a UP compile, i_size_read/write must be atomic 638 * a UP compile, i_size_read/write must be atomic
639 * with respect to the local cpu (unlike with preempt disabled), 639 * with respect to the local cpu (unlike with preempt disabled),
640 * but they don't need to be atomic with respect to other cpus like in 640 * but they don't need to be atomic with respect to other cpus like in
641 * true SMP (so they either need to locally disable irqs around 641 * true SMP (so they either need to locally disable irqs around
642 * the read, or, for example on x86, they can still be implemented as a 642 * the read, or, for example on x86, they can still be implemented as a
643 * cmpxchg8b without the need of the lock prefix). For SMP compiles 643 * cmpxchg8b without the need of the lock prefix). For SMP compiles
644 * and 64bit archs it makes no difference if preempt is enabled or not. 644 * and 64bit archs it makes no difference if preempt is enabled or not.
645 */ 645 */
646 static inline loff_t i_size_read(const struct inode *inode) 646 static inline loff_t i_size_read(const struct inode *inode)
647 { 647 {
648 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 648 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
649 loff_t i_size; 649 loff_t i_size;
650 unsigned int seq; 650 unsigned int seq;
651 651
652 do { 652 do {
653 seq = read_seqcount_begin(&inode->i_size_seqcount); 653 seq = read_seqcount_begin(&inode->i_size_seqcount);
654 i_size = inode->i_size; 654 i_size = inode->i_size;
655 } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); 655 } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
656 return i_size; 656 return i_size;
657 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) 657 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
658 loff_t i_size; 658 loff_t i_size;
659 659
660 preempt_disable(); 660 preempt_disable();
661 i_size = inode->i_size; 661 i_size = inode->i_size;
662 preempt_enable(); 662 preempt_enable();
663 return i_size; 663 return i_size;
664 #else 664 #else
665 return inode->i_size; 665 return inode->i_size;
666 #endif 666 #endif
667 } 667 }
668 668
669 /* 669 /*
670 * NOTE: unlike i_size_read(), i_size_write() does need locking around it 670 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
671 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount 671 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
672 * can be lost, resulting in subsequent i_size_read() calls spinning forever. 672 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
673 */ 673 */
674 static inline void i_size_write(struct inode *inode, loff_t i_size) 674 static inline void i_size_write(struct inode *inode, loff_t i_size)
675 { 675 {
676 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 676 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
677 write_seqcount_begin(&inode->i_size_seqcount); 677 write_seqcount_begin(&inode->i_size_seqcount);
678 inode->i_size = i_size; 678 inode->i_size = i_size;
679 write_seqcount_end(&inode->i_size_seqcount); 679 write_seqcount_end(&inode->i_size_seqcount);
680 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) 680 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
681 preempt_disable(); 681 preempt_disable();
682 inode->i_size = i_size; 682 inode->i_size = i_size;
683 preempt_enable(); 683 preempt_enable();
684 #else 684 #else
685 inode->i_size = i_size; 685 inode->i_size = i_size;
686 #endif 686 #endif
687 } 687 }
688 688
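A hedged sketch honouring the locking rule just stated (illustration only; example_extend is hypothetical):

	/* Sketch only: update i_size under i_mutex so the seqcount
	 * update on 32bit/SMP cannot be lost. */
	static void example_extend(struct inode *inode, loff_t new_size)
	{
		mutex_lock(&inode->i_mutex);
		if (new_size > i_size_read(inode))
			i_size_write(inode, new_size);
		mutex_unlock(&inode->i_mutex);
	}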
689 /* Helper functions so that in most cases filesystems will 689 /* Helper functions so that in most cases filesystems will
690 * not need to deal directly with kuid_t and kgid_t and can 690 * not need to deal directly with kuid_t and kgid_t and can
691 * instead deal with the raw numeric values that are stored 691 * instead deal with the raw numeric values that are stored
692 * in the filesystem. 692 * in the filesystem.
693 */ 693 */
694 static inline uid_t i_uid_read(const struct inode *inode) 694 static inline uid_t i_uid_read(const struct inode *inode)
695 { 695 {
696 return from_kuid(&init_user_ns, inode->i_uid); 696 return from_kuid(&init_user_ns, inode->i_uid);
697 } 697 }
698 698
699 static inline gid_t i_gid_read(const struct inode *inode) 699 static inline gid_t i_gid_read(const struct inode *inode)
700 { 700 {
701 return from_kgid(&init_user_ns, inode->i_gid); 701 return from_kgid(&init_user_ns, inode->i_gid);
702 } 702 }
703 703
704 static inline void i_uid_write(struct inode *inode, uid_t uid) 704 static inline void i_uid_write(struct inode *inode, uid_t uid)
705 { 705 {
706 inode->i_uid = make_kuid(&init_user_ns, uid); 706 inode->i_uid = make_kuid(&init_user_ns, uid);
707 } 707 }
708 708
709 static inline void i_gid_write(struct inode *inode, gid_t gid) 709 static inline void i_gid_write(struct inode *inode, gid_t gid)
710 { 710 {
711 inode->i_gid = make_kgid(&init_user_ns, gid); 711 inode->i_gid = make_kgid(&init_user_ns, gid);
712 } 712 }
713 713
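A hedged sketch of the helpers above in a filesystem's inode loader (illustration only; struct example_disk_inode and example_load_ids are hypothetical):

	#include <linux/types.h>

	/* A made-up little-endian on-disk inode with raw numeric ids. */
	struct example_disk_inode {
		__le32 di_uid;
		__le32 di_gid;
	};

	/* Sketch only: convert raw ids through the init_user_ns helpers. */
	static void example_load_ids(struct inode *inode,
				     const struct example_disk_inode *raw)
	{
		i_uid_write(inode, le32_to_cpu(raw->di_uid));
		i_gid_write(inode, le32_to_cpu(raw->di_gid));
	}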
714 static inline unsigned iminor(const struct inode *inode) 714 static inline unsigned iminor(const struct inode *inode)
715 { 715 {
716 return MINOR(inode->i_rdev); 716 return MINOR(inode->i_rdev);
717 } 717 }
718 718
719 static inline unsigned imajor(const struct inode *inode) 719 static inline unsigned imajor(const struct inode *inode)
720 { 720 {
721 return MAJOR(inode->i_rdev); 721 return MAJOR(inode->i_rdev);
722 } 722 }
723 723
724 extern struct block_device *I_BDEV(struct inode *inode); 724 extern struct block_device *I_BDEV(struct inode *inode);
725 725
726 struct fown_struct { 726 struct fown_struct {
727 rwlock_t lock; /* protects pid, uid, euid fields */ 727 rwlock_t lock; /* protects pid, uid, euid fields */
728 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ 728 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
729 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ 729 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
730 kuid_t uid, euid; /* uid/euid of process setting the owner */ 730 kuid_t uid, euid; /* uid/euid of process setting the owner */
731 int signum; /* posix.1b rt signal to be delivered on IO */ 731 int signum; /* posix.1b rt signal to be delivered on IO */
732 }; 732 };
733 733
734 /* 734 /*
735 * Track a single file's readahead state 735 * Track a single file's readahead state
736 */ 736 */
737 struct file_ra_state { 737 struct file_ra_state {
738 pgoff_t start; /* where readahead started */ 738 pgoff_t start; /* where readahead started */
739 unsigned int size; /* # of readahead pages */ 739 unsigned int size; /* # of readahead pages */
740 unsigned int async_size; /* do asynchronous readahead when 740 unsigned int async_size; /* do asynchronous readahead when
741 there are only # of pages ahead */ 741 there are only # of pages ahead */
742 742
743 unsigned int ra_pages; /* Maximum readahead window */ 743 unsigned int ra_pages; /* Maximum readahead window */
744 unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ 744 unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
745 loff_t prev_pos; /* Cache last read() position */ 745 loff_t prev_pos; /* Cache last read() position */
746 }; 746 };
747 747
748 /* 748 /*
749 * Check if @index falls in the readahead windows. 749 * Check if @index falls in the readahead windows.
750 */ 750 */
751 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) 751 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
752 { 752 {
753 return (index >= ra->start && 753 return (index >= ra->start &&
754 index < ra->start + ra->size); 754 index < ra->start + ra->size);
755 } 755 }
756 756
757 #define FILE_MNT_WRITE_TAKEN 1 757 #define FILE_MNT_WRITE_TAKEN 1
758 #define FILE_MNT_WRITE_RELEASED 2 758 #define FILE_MNT_WRITE_RELEASED 2
759 759
760 struct file { 760 struct file {
761 /* 761 /*
762 * fu_list becomes invalid after file_free is called and queued via 762 * fu_list becomes invalid after file_free is called and queued via
763 * fu_rcuhead for RCU freeing 763 * fu_rcuhead for RCU freeing
764 */ 764 */
765 union { 765 union {
766 struct list_head fu_list; 766 struct list_head fu_list;
767 struct rcu_head fu_rcuhead; 767 struct rcu_head fu_rcuhead;
768 } f_u; 768 } f_u;
769 struct path f_path; 769 struct path f_path;
770 #define f_dentry f_path.dentry 770 #define f_dentry f_path.dentry
771 #define f_vfsmnt f_path.mnt 771 #define f_vfsmnt f_path.mnt
772 const struct file_operations *f_op; 772 const struct file_operations *f_op;
773 773
774 /* 774 /*
775 * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. 775 * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
776 * Must not be taken from IRQ context. 776 * Must not be taken from IRQ context.
777 */ 777 */
778 spinlock_t f_lock; 778 spinlock_t f_lock;
779 #ifdef CONFIG_SMP 779 #ifdef CONFIG_SMP
780 int f_sb_list_cpu; 780 int f_sb_list_cpu;
781 #endif 781 #endif
782 atomic_long_t f_count; 782 atomic_long_t f_count;
783 unsigned int f_flags; 783 unsigned int f_flags;
784 fmode_t f_mode; 784 fmode_t f_mode;
785 loff_t f_pos; 785 loff_t f_pos;
786 struct fown_struct f_owner; 786 struct fown_struct f_owner;
787 const struct cred *f_cred; 787 const struct cred *f_cred;
788 struct file_ra_state f_ra; 788 struct file_ra_state f_ra;
789 789
790 u64 f_version; 790 u64 f_version;
791 #ifdef CONFIG_SECURITY 791 #ifdef CONFIG_SECURITY
792 void *f_security; 792 void *f_security;
793 #endif 793 #endif
794 /* needed for tty driver, and maybe others */ 794 /* needed for tty driver, and maybe others */
795 void *private_data; 795 void *private_data;
796 796
797 #ifdef CONFIG_EPOLL 797 #ifdef CONFIG_EPOLL
798 /* Used by fs/eventpoll.c to link all the hooks to this file */ 798 /* Used by fs/eventpoll.c to link all the hooks to this file */
799 struct list_head f_ep_links; 799 struct list_head f_ep_links;
800 struct list_head f_tfile_llink; 800 struct list_head f_tfile_llink;
801 #endif /* #ifdef CONFIG_EPOLL */ 801 #endif /* #ifdef CONFIG_EPOLL */
802 struct address_space *f_mapping; 802 struct address_space *f_mapping;
803 #ifdef CONFIG_DEBUG_WRITECOUNT 803 #ifdef CONFIG_DEBUG_WRITECOUNT
804 unsigned long f_mnt_write_state; 804 unsigned long f_mnt_write_state;
805 #endif 805 #endif
806 }; 806 };
807 807
808 struct file_handle { 808 struct file_handle {
809 __u32 handle_bytes; 809 __u32 handle_bytes;
810 int handle_type; 810 int handle_type;
811 /* file identifier */ 811 /* file identifier */
812 unsigned char f_handle[0]; 812 unsigned char f_handle[0];
813 }; 813 };
814 814
815 static inline struct file *get_file(struct file *f) 815 static inline struct file *get_file(struct file *f)
816 { 816 {
817 atomic_long_inc(&f->f_count); 817 atomic_long_inc(&f->f_count);
818 return f; 818 return f;
819 } 819 }
820 #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) 820 #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1)
821 #define file_count(x) atomic_long_read(&(x)->f_count) 821 #define file_count(x) atomic_long_read(&(x)->f_count)
822 822
823 #ifdef CONFIG_DEBUG_WRITECOUNT 823 #ifdef CONFIG_DEBUG_WRITECOUNT
824 static inline void file_take_write(struct file *f) 824 static inline void file_take_write(struct file *f)
825 { 825 {
826 WARN_ON(f->f_mnt_write_state != 0); 826 WARN_ON(f->f_mnt_write_state != 0);
827 f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; 827 f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
828 } 828 }
829 static inline void file_release_write(struct file *f) 829 static inline void file_release_write(struct file *f)
830 { 830 {
831 f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; 831 f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED;
832 } 832 }
833 static inline void file_reset_write(struct file *f) 833 static inline void file_reset_write(struct file *f)
834 { 834 {
835 f->f_mnt_write_state = 0; 835 f->f_mnt_write_state = 0;
836 } 836 }
837 static inline void file_check_state(struct file *f) 837 static inline void file_check_state(struct file *f)
838 { 838 {
839 /* 839 /*
840 * At this point, either both or neither of these bits 840 * At this point, either both or neither of these bits
841 * should be set. 841 * should be set.
842 */ 842 */
843 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); 843 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN);
844 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); 844 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED);
845 } 845 }
846 static inline int file_check_writeable(struct file *f) 846 static inline int file_check_writeable(struct file *f)
847 { 847 {
848 if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) 848 if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN)
849 return 0; 849 return 0;
850 printk(KERN_WARNING "writeable file with no " 850 printk(KERN_WARNING "writeable file with no "
851 "mnt_want_write()\n"); 851 "mnt_want_write()\n");
852 WARN_ON(1); 852 WARN_ON(1);
853 return -EINVAL; 853 return -EINVAL;
854 } 854 }
855 #else /* !CONFIG_DEBUG_WRITECOUNT */ 855 #else /* !CONFIG_DEBUG_WRITECOUNT */
856 static inline void file_take_write(struct file *filp) {} 856 static inline void file_take_write(struct file *filp) {}
857 static inline void file_release_write(struct file *filp) {} 857 static inline void file_release_write(struct file *filp) {}
858 static inline void file_reset_write(struct file *filp) {} 858 static inline void file_reset_write(struct file *filp) {}
859 static inline void file_check_state(struct file *filp) {} 859 static inline void file_check_state(struct file *filp) {}
860 static inline int file_check_writeable(struct file *filp) 860 static inline int file_check_writeable(struct file *filp)
861 { 861 {
862 return 0; 862 return 0;
863 } 863 }
864 #endif /* CONFIG_DEBUG_WRITECOUNT */ 864 #endif /* CONFIG_DEBUG_WRITECOUNT */
865 865
866 #define MAX_NON_LFS ((1UL<<31) - 1) 866 #define MAX_NON_LFS ((1UL<<31) - 1)
867 867
868 /* Page cache limit. The filesystems should put that into their s_maxbytes 868 /* Page cache limit. The filesystems should put that into their s_maxbytes
869 limits, otherwise bad things can happen in VM. */ 869 limits, otherwise bad things can happen in VM. */
870 #if BITS_PER_LONG==32 870 #if BITS_PER_LONG==32
871 #define MAX_LFS_FILESIZE (((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) 871 #define MAX_LFS_FILESIZE (((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
872 #elif BITS_PER_LONG==64 872 #elif BITS_PER_LONG==64
873 #define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL) 873 #define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL)
874 #endif 874 #endif
875 875
876 #define FL_POSIX 1 876 #define FL_POSIX 1
877 #define FL_FLOCK 2 877 #define FL_FLOCK 2
878 #define FL_ACCESS 8 /* not trying to lock, just looking */ 878 #define FL_ACCESS 8 /* not trying to lock, just looking */
879 #define FL_EXISTS 16 /* when unlocking, test for existence */ 879 #define FL_EXISTS 16 /* when unlocking, test for existence */
880 #define FL_LEASE 32 /* lease held on this file */ 880 #define FL_LEASE 32 /* lease held on this file */
881 #define FL_CLOSE 64 /* unlock on close */ 881 #define FL_CLOSE 64 /* unlock on close */
882 #define FL_SLEEP 128 /* A blocking lock */ 882 #define FL_SLEEP 128 /* A blocking lock */
883 #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ 883 #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */
884 #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ 884 #define FL_UNLOCK_PENDING 512 /* Lease is being broken */
885 885
886 /* 886 /*
887 * Special return value from posix_lock_file() and vfs_lock_file() for 887 * Special return value from posix_lock_file() and vfs_lock_file() for
888 * asynchronous locking. 888 * asynchronous locking.
889 */ 889 */
890 #define FILE_LOCK_DEFERRED 1 890 #define FILE_LOCK_DEFERRED 1
891 891
892 /* 892 /*
893 * The POSIX file lock owner is determined by 893 * The POSIX file lock owner is determined by
894 * the "struct files_struct" in the thread group 894 * the "struct files_struct" in the thread group
895 * (or NULL for no owner - BSD locks). 895 * (or NULL for no owner - BSD locks).
896 * 896 *
897 * Lockd stuffs a "host" pointer into this. 897 * Lockd stuffs a "host" pointer into this.
898 */ 898 */
899 typedef struct files_struct *fl_owner_t; 899 typedef struct files_struct *fl_owner_t;
900 900
901 struct file_lock_operations { 901 struct file_lock_operations {
902 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 902 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
903 void (*fl_release_private)(struct file_lock *); 903 void (*fl_release_private)(struct file_lock *);
904 }; 904 };
905 905
906 struct lock_manager_operations { 906 struct lock_manager_operations {
907 int (*lm_compare_owner)(struct file_lock *, struct file_lock *); 907 int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
908 void (*lm_notify)(struct file_lock *); /* unblock callback */ 908 void (*lm_notify)(struct file_lock *); /* unblock callback */
909 int (*lm_grant)(struct file_lock *, struct file_lock *, int); 909 int (*lm_grant)(struct file_lock *, struct file_lock *, int);
910 void (*lm_break)(struct file_lock *); 910 void (*lm_break)(struct file_lock *);
911 int (*lm_change)(struct file_lock **, int); 911 int (*lm_change)(struct file_lock **, int);
912 }; 912 };
913 913
914 struct lock_manager { 914 struct lock_manager {
915 struct list_head list; 915 struct list_head list;
916 }; 916 };
917 917
918 struct net; 918 struct net;
919 void locks_start_grace(struct net *, struct lock_manager *); 919 void locks_start_grace(struct net *, struct lock_manager *);
920 void locks_end_grace(struct lock_manager *); 920 void locks_end_grace(struct lock_manager *);
921 int locks_in_grace(struct net *); 921 int locks_in_grace(struct net *);
922 922
923 /* that will die - we need it for nfs_lock_info */ 923 /* that will die - we need it for nfs_lock_info */
924 #include <linux/nfs_fs_i.h> 924 #include <linux/nfs_fs_i.h>
925 925
926 struct file_lock { 926 struct file_lock {
927 struct file_lock *fl_next; /* singly linked list for this inode */ 927 struct file_lock *fl_next; /* singly linked list for this inode */
928 struct list_head fl_link; /* doubly linked list of all locks */ 928 struct list_head fl_link; /* doubly linked list of all locks */
929 struct list_head fl_block; /* circular list of blocked processes */ 929 struct list_head fl_block; /* circular list of blocked processes */
930 fl_owner_t fl_owner; 930 fl_owner_t fl_owner;
931 unsigned int fl_flags; 931 unsigned int fl_flags;
932 unsigned char fl_type; 932 unsigned char fl_type;
933 unsigned int fl_pid; 933 unsigned int fl_pid;
934 struct pid *fl_nspid; 934 struct pid *fl_nspid;
935 wait_queue_head_t fl_wait; 935 wait_queue_head_t fl_wait;
936 struct file *fl_file; 936 struct file *fl_file;
937 loff_t fl_start; 937 loff_t fl_start;
938 loff_t fl_end; 938 loff_t fl_end;
939 939
940 struct fasync_struct * fl_fasync; /* for lease break notifications */ 940 struct fasync_struct * fl_fasync; /* for lease break notifications */
941 /* for lease breaks: */ 941 /* for lease breaks: */
942 unsigned long fl_break_time; 942 unsigned long fl_break_time;
943 unsigned long fl_downgrade_time; 943 unsigned long fl_downgrade_time;
944 944
945 const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ 945 const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */
946 const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ 946 const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */
947 union { 947 union {
948 struct nfs_lock_info nfs_fl; 948 struct nfs_lock_info nfs_fl;
949 struct nfs4_lock_info nfs4_fl; 949 struct nfs4_lock_info nfs4_fl;
950 struct { 950 struct {
951 struct list_head link; /* link in AFS vnode's pending_locks list */ 951 struct list_head link; /* link in AFS vnode's pending_locks list */
952 int state; /* state of grant or error if -ve */ 952 int state; /* state of grant or error if -ve */
953 } afs; 953 } afs;
954 } fl_u; 954 } fl_u;
955 }; 955 };
956 956
957 /* The following constant reflects the upper bound of the file/locking space */ 957 /* The following constant reflects the upper bound of the file/locking space */
958 #ifndef OFFSET_MAX 958 #ifndef OFFSET_MAX
959 #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) 959 #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1)))
960 #define OFFSET_MAX INT_LIMIT(loff_t) 960 #define OFFSET_MAX INT_LIMIT(loff_t)
961 #define OFFT_OFFSET_MAX INT_LIMIT(off_t) 961 #define OFFT_OFFSET_MAX INT_LIMIT(off_t)
962 #endif 962 #endif

#include <linux/fcntl.h>

extern void send_sigio(struct fown_struct *fown, int fd, int band);

#ifdef CONFIG_FILE_LOCKING
extern int fcntl_getlk(struct file *, struct flock __user *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
			struct flock __user *);

#if BITS_PER_LONG == 32
extern int fcntl_getlk64(struct file *, struct flock64 __user *);
extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
			struct flock64 __user *);
#endif

extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);

/* fs/locks.c */
void locks_free_lock(struct file_lock *fl);
extern void locks_init_lock(struct file_lock *);
extern struct file_lock * locks_alloc_lock(void);
extern void locks_copy_lock(struct file_lock *, struct file_lock *);
extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
extern void locks_remove_posix(struct file *, fl_owner_t);
extern void locks_remove_flock(struct file *);
extern void locks_release_private(struct file_lock *);
extern void posix_test_lock(struct file *, struct file_lock *);
extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
extern int posix_lock_file_wait(struct file *, struct file_lock *);
extern int posix_unblock_lock(struct file *, struct file_lock *);
extern int vfs_test_lock(struct file *, struct file_lock *);
extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
extern int __break_lease(struct inode *inode, unsigned int flags);
extern void lease_get_mtime(struct inode *, struct timespec *time);
extern int generic_setlease(struct file *, long, struct file_lock **);
extern int vfs_setlease(struct file *, long, struct file_lock **);
extern int lease_modify(struct file_lock **, int);
extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
extern void locks_delete_block(struct file_lock *waiter);
extern void lock_flocks(void);
extern void unlock_flocks(void);
#else /* !CONFIG_FILE_LOCKING */
static inline int fcntl_getlk(struct file *file, struct flock __user *user)
{
	return -EINVAL;
}

static inline int fcntl_setlk(unsigned int fd, struct file *file,
			      unsigned int cmd, struct flock __user *user)
{
	return -EACCES;
}

#if BITS_PER_LONG == 32
static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user)
{
	return -EINVAL;
}

static inline int fcntl_setlk64(unsigned int fd, struct file *file,
				unsigned int cmd, struct flock64 __user *user)
{
	return -EACCES;
}
#endif
static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
	return 0;
}

static inline int fcntl_getlease(struct file *filp)
{
	return 0;
}

static inline void locks_init_lock(struct file_lock *fl)
{
	return;
}

static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
	return;
}

static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
	return;
}

static inline void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
	return;
}

static inline void locks_remove_flock(struct file *filp)
{
	return;
}

static inline void posix_test_lock(struct file *filp, struct file_lock *fl)
{
	return;
}

static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
				  struct file_lock *conflock)
{
	return -ENOLCK;
}

static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
{
	return -ENOLCK;
}

static inline int posix_unblock_lock(struct file *filp,
				     struct file_lock *waiter)
{
	return -ENOENT;
}

static inline int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
	return 0;
}

static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
				struct file_lock *fl, struct file_lock *conf)
{
	return -ENOLCK;
}

static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
	return 0;
}

static inline int flock_lock_file_wait(struct file *filp,
				       struct file_lock *request)
{
	return -ENOLCK;
}

static inline int __break_lease(struct inode *inode, unsigned int mode)
{
	return 0;
}

static inline void lease_get_mtime(struct inode *inode, struct timespec *time)
{
	return;
}

static inline int generic_setlease(struct file *filp, long arg,
				   struct file_lock **flp)
{
	return -EINVAL;
}

static inline int vfs_setlease(struct file *filp, long arg,
			       struct file_lock **lease)
{
	return -EINVAL;
}

static inline int lease_modify(struct file_lock **before, int arg)
{
	return -EINVAL;
}

static inline int lock_may_read(struct inode *inode, loff_t start,
				unsigned long len)
{
	return 1;
}

static inline int lock_may_write(struct inode *inode, loff_t start,
				 unsigned long len)
{
	return 1;
}

static inline void locks_delete_block(struct file_lock *waiter)
{
}

static inline void lock_flocks(void)
{
}

static inline void unlock_flocks(void)
{
}

#endif /* !CONFIG_FILE_LOCKING */
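Together the two branches give callers one locking API whether or not CONFIG_FILE_LOCKING is built in: the real implementations live in fs/locks.c, while the stubs above collapse to cheap constants and no-ops. As a rough illustration of the test-lock pattern (a sketch, not kernel code: filp is assumed to be an open struct file, error handling is elided, and under !CONFIG_FILE_LOCKING posix_test_lock() is a no-op so fl_type is left untouched), a whole-file write-lock probe might look like:

	struct file_lock fl;

	locks_init_lock(&fl);
	fl.fl_owner = current->files;	/* the standard POSIX lock owner */
	fl.fl_pid = current->tgid;
	fl.fl_flags = FL_POSIX;
	fl.fl_type = F_WRLCK;
	fl.fl_start = 0;
	fl.fl_end = OFFSET_MAX;		/* whole file */

	posix_test_lock(filp, &fl);	/* sets fl.fl_type = F_UNLCK if nothing conflicts */
	if (fl.fl_type != F_UNLCK)
		pr_info("range [%lld, %lld] locked by pid %u\n",
			(long long)fl.fl_start, (long long)fl.fl_end, fl.fl_pid);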


struct fasync_struct {
	spinlock_t		fa_lock;
	int			magic;
	int			fa_fd;
	struct fasync_struct	*fa_next; /* singly linked list */
	struct file		*fa_file;
	struct rcu_head		fa_rcu;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers: */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);
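The usual driver pattern keeps a fasync_struct list head per device, registers and unregisters readers through fasync_helper() from the file_operations ->fasync hook, and signals them with kill_fasync() when data arrives. A minimal sketch (mydev and its fasync field are hypothetical, only the two helper calls are real API):

	static int mydev_fasync(int fd, struct file *filp, int on)
	{
		struct mydev *dev = filp->private_data;

		/* add or remove this file on the device's async-notify list */
		return fasync_helper(fd, filp, on, &dev->fasync);
	}

	/* later, e.g. from the interrupt handler, when data is ready: */
	kill_fasync(&dev->fasync, SIGIO, POLL_IN);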
extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, unsigned long arg, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);

struct mm_struct;

/*
 * Umount options
 */
#define MNT_FORCE	0x00000001	/* Attempt to forcibly umount */
#define MNT_DETACH	0x00000002	/* Just detach from the tree */
#define MNT_EXPIRE	0x00000004	/* Mark for expiry */
#define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
#define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */

extern struct list_head super_blocks;
extern spinlock_t sb_lock;

/* Possible states of 'frozen' field */
enum {
	SB_UNFROZEN = 0,		/* FS is unfrozen */
	SB_FREEZE_WRITE	= 1,		/* Writes, dir ops, ioctls frozen */
	SB_FREEZE_PAGEFAULT = 2,	/* Page faults stopped as well */
	SB_FREEZE_FS = 3,		/* For internal FS use (e.g. to stop
					 * internal threads if needed) */
	SB_FREEZE_COMPLETE = 4,		/* ->freeze_fs finished successfully */
};

#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

struct sb_writers {
	/* Counters for counting writers at each level */
	struct percpu_counter	counter[SB_FREEZE_LEVELS];
	wait_queue_head_t	wait;		/* queue for waiting for
						   writers / faults to finish */
	int			frozen;		/* Is sb frozen? */
	wait_queue_head_t	wait_unfrozen;	/* queue for waiting for
						   sb to be thawed */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map	lock_map[SB_FREEZE_LEVELS];
#endif
};
struct super_block {
	struct list_head	s_list;		/* Keep this first */
	dev_t			s_dev;		/* search index; _not_ kdev_t */
	unsigned char		s_blocksize_bits;
	unsigned long		s_blocksize;
	loff_t			s_maxbytes;	/* Max file size */
	struct file_system_type	*s_type;
	const struct super_operations	*s_op;
	const struct dquot_operations	*dq_op;
	const struct quotactl_ops	*s_qcop;
	const struct export_operations *s_export_op;
	unsigned long		s_flags;
	unsigned long		s_magic;
	struct dentry		*s_root;
	struct rw_semaphore	s_umount;
	int			s_count;
	atomic_t		s_active;
#ifdef CONFIG_SECURITY
	void			*s_security;
#endif
	const struct xattr_handler **s_xattr;

	struct list_head	s_inodes;	/* all inodes */
	struct hlist_bl_head	s_anon;		/* anonymous dentries for (nfs) exporting */
#ifdef CONFIG_SMP
	struct list_head __percpu *s_files;
#else
	struct list_head	s_files;
#endif
	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
	/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
	struct list_head	s_dentry_lru;	/* unused dentry lru */
	int			s_nr_dentry_unused;	/* # of dentry on lru */

	/* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
	spinlock_t		s_inode_lru_lock ____cacheline_aligned_in_smp;
	struct list_head	s_inode_lru;		/* unused inode lru */
	int			s_nr_inodes_unused;	/* # of inodes on lru */

	struct block_device	*s_bdev;
	struct backing_dev_info *s_bdi;
	struct mtd_info		*s_mtd;
	struct hlist_node	s_instances;
	struct quota_info	s_dquot;	/* Diskquota specific options */

	struct sb_writers	s_writers;

	char s_id[32];				/* Informational name */
	u8 s_uuid[16];				/* UUID */

	void			*s_fs_info;	/* Filesystem private info */
	unsigned int		s_max_links;
	fmode_t			s_mode;

	/* Granularity of c/m/atime in ns.
	   Cannot be worse than a second */
	u32			s_time_gran;

	/*
	 * The next field is for VFS *only*. No filesystems have any business
	 * even looking at it. You have been warned.
	 */
	struct mutex		s_vfs_rename_mutex;	/* Kludge */

	/*
	 * Filesystem subtype. If non-empty the filesystem type field
	 * in /proc/mounts will be "type.subtype"
	 */
	char *s_subtype;

	/*
	 * Saved mount options for lazy filesystems using
	 * generic_show_options()
	 */
	char __rcu *s_options;
	const struct dentry_operations *s_d_op; /* default d_op for dentries */

	/*
	 * Saved pool identifier for cleancache (-1 means none)
	 */
	int cleancache_poolid;

	struct shrinker s_shrink;	/* per-sb shrinker handle */

	/* Number of inodes with nlink == 0 but still referenced */
	atomic_long_t s_remove_count;

	/* Being remounted read-only */
	int s_readonly_remount;
};
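By way of illustration, a filesystem populates the core fields from its fill_super callback at mount time. This is a hedged sketch only: myfs_fill_super, MYFS_MAGIC, myfs_sops and myfs_make_root_inode are all hypothetical names, while PAGE_CACHE_SIZE, MAX_LFS_FILESIZE and d_make_root() are real kernel symbols of this era:

	static int myfs_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct inode *root;

		sb->s_blocksize = PAGE_CACHE_SIZE;
		sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
		sb->s_maxbytes = MAX_LFS_FILESIZE;
		sb->s_magic = MYFS_MAGIC;
		sb->s_op = &myfs_sops;
		sb->s_time_gran = 1;		/* timestamps good to 1 ns */

		root = myfs_make_root_inode(sb);	/* hypothetical helper */
		sb->s_root = d_make_root(root);
		if (!sb->s_root)
			return -ENOMEM;
		return 0;
	}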

/* superblock cache pruning functions */
extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);

extern struct timespec current_fs_time(struct super_block *sb);

/*
 * Snapshotting support.
 */

void __sb_end_write(struct super_block *sb, int level);
int __sb_start_write(struct super_block *sb, int level, bool wait);

/**
 * sb_end_write - drop write access to a superblock
 * @sb: the super we wrote to
 *
 * Decrement number of writers to the filesystem. Wake up possible waiters
 * wanting to freeze the filesystem.
 */
static inline void sb_end_write(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_WRITE);
}

/**
 * sb_end_pagefault - drop write access to a superblock from a page fault
 * @sb: the super we wrote to
 *
 * Decrement number of processes handling write page fault to the filesystem.
 * Wake up possible waiters wanting to freeze the filesystem.
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_PAGEFAULT);
}

/**
 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
 * @sb: the super we wrote to
 *
 * Decrement fs-internal number of writers to the filesystem. Wake up possible
 * waiters wanting to freeze the filesystem.
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_FS);
}

/**
 * sb_start_write - get write access to a superblock
 * @sb: the super we write to
 *
 * When a process wants to write data or metadata to a file system (i.e. dirty
 * a page or an inode), it should embed the operation in a sb_start_write() -
 * sb_end_write() pair to get exclusion against file system freezing. This
 * function increments number of writers preventing freezing. If the file
 * system is already frozen, the function waits until the file system is
 * thawed.
 *
 * Since freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. Generally,
 * freeze protection should be the outermost lock. In particular, we have:
 *
 * sb_start_write
 *   -> i_mutex			(write path, truncate, directory ops, ...)
 *   -> s_umount		(freeze_super, thaw_super)
 */
static inline void sb_start_write(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_WRITE, true);
}

static inline int sb_start_write_trylock(struct super_block *sb)
{
	return __sb_start_write(sb, SB_FREEZE_WRITE, false);
}

/**
 * sb_start_pagefault - get write access to a superblock from a page fault
 * @sb: the super we write to
 *
 * When a process starts handling write page fault, it should embed the
 * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
 * exclusion against file system freezing. This is needed since the page fault
 * is going to dirty a page. This function increments number of running page
 * faults preventing freezing. If the file system is already frozen, the
 * function waits until the file system is thawed.
 *
 * Since page fault freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. It is advised to
 * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault
 * handling code implies lock dependency:
 *
 * mmap_sem
 *   -> sb_start_pagefault
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_PAGEFAULT, true);
}

/*
 * sb_start_intwrite - get write access to a superblock for internal fs purposes
 * @sb: the super we write to
 *
 * This is the third level of protection against filesystem freezing. It is
 * free for use by a filesystem. The only requirement is that it must rank
 * below sb_start_pagefault.
 *
 * For example, a filesystem can call sb_start_intwrite() when starting a
 * transaction, which somewhat eases handling of freezing for internal sources
 * of filesystem changes (internal fs threads, discarding preallocation on file
 * close, etc.).
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_FS, true);
}


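A quick sketch of the intended pairing (inode is assumed to be in scope; real write paths also take i_mutex inside the freeze protection, per the lock ordering documented above):

	sb_start_write(inode->i_sb);		/* blocks if the fs is frozen */
	mutex_lock(&inode->i_mutex);
	/* ... dirty pages / update metadata ... */
	mutex_unlock(&inode->i_mutex);
	sb_end_write(inode->i_sb);		/* wake any pending freezer */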
extern bool inode_owner_or_capable(const struct inode *inode);

/* not quite ready to be deprecated, but... */
extern void lock_super(struct super_block *);
extern void unlock_super(struct super_block *);

/*
 * VFS helper functions.
 */
extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
extern int vfs_rmdir(struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);

/*
 * VFS dentry helper functions.
 */
extern void dentry_unhash(struct dentry *dentry);

/*
 * VFS file helper functions.
 */
extern void inode_init_owner(struct inode *inode, const struct inode *dir,
			umode_t mode);
/*
 * VFS FS_IOC_FIEMAP helper definitions.
 */
struct fiemap_extent_info {
	unsigned int fi_flags;		/* Flags as passed from user */
	unsigned int fi_extents_mapped;	/* Number of mapped extents */
	unsigned int fi_extents_max;	/* Size of fiemap_extent array */
	struct fiemap_extent __user *fi_extents_start; /* Start of
							fiemap_extent array */
};
int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
			    u64 phys, u64 len, u32 flags);
int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);

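A rough sketch of a ->fiemap implementation built on these helpers. myfs_fiemap and myfs_block_for() are hypothetical; fiemap_check_flags() rejects flags the filesystem does not support, and fiemap_fill_next_extent() returns 0 to continue, 1 once fi_extents_max entries have been filled, or a negative error:

	static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
			       u64 start, u64 len)
	{
		int ret;

		ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
		if (ret)
			return ret;

		/* report one illustrative extent covering [start, start + len) */
		ret = fiemap_fill_next_extent(fieinfo, start,
					      myfs_block_for(inode, start),	/* hypothetical */
					      len, FIEMAP_EXTENT_LAST);
		return ret < 0 ? ret : 0;
	}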
/*
 * File types
 *
 * NOTE! These match bits 12..15 of stat.st_mode
 * (i.e. "(i_mode >> 12) & 15").
 */
#define DT_UNKNOWN	0
#define DT_FIFO		1
#define DT_CHR		2
#define DT_DIR		4
#define DT_BLK		6
#define DT_REG		8
#define DT_LNK		10
#define DT_SOCK		12
#define DT_WHT		14

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 */
typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
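The callback arguments are (buf, name, namelen, offset, ino, d_type), and a negative return conventionally means the caller's buffer is full. A bare-bones ->readdir sketch, emitting only the "." entry for brevity (myfs_readdir is hypothetical):

	static int myfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;

		if (filp->f_pos == 0) {
			if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
				return 0;	/* buffer full; resume here next call */
			filp->f_pos++;
		}
		return 0;
	}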
struct block_device_operations;

/* These macros are for out-of-kernel modules to test that
 * the kernel supports the unlocked_ioctl and compat_ioctl
 * fields in struct file_operations. */
#define HAVE_COMPAT_IOCTL 1
#define HAVE_UNLOCKED_IOCTL 1

struct file_operations {
	struct module *owner;
	loff_t (*llseek) (struct file *, loff_t, int);
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
	int (*readdir) (struct file *, void *, filldir_t);
	unsigned int (*poll) (struct file *, struct poll_table_struct *);
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
	int (*mmap) (struct file *, struct vm_area_struct *);
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *, fl_owner_t id);
	int (*release) (struct inode *, struct file *);
	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
	int (*aio_fsync) (struct kiocb *, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **);
	long (*fallocate)(struct file *file, int mode, loff_t offset,
			  loff_t len);
};

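Note the final int argument of ->llseek: that is the "whence" value this commit renames throughout the tree. A minimal driver-side table might look like this sketch (the mydev_* handlers are hypothetical; no_llseek is a real stock helper that returns -ESPIPE for any whence value, and mydev_fasync is the handler sketched earlier):

	static const struct file_operations mydev_fops = {
		.owner		= THIS_MODULE,
		.llseek		= no_llseek,	/* device is not seekable */
		.read		= mydev_read,
		.unlocked_ioctl	= mydev_ioctl,
		.open		= mydev_open,
		.release	= mydev_release,
		.fasync		= mydev_fasync,
	};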
struct inode_operations {
	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
	void * (*follow_link) (struct dentry *, struct nameidata *);
	int (*permission) (struct inode *, int);
	struct posix_acl * (*get_acl)(struct inode *, int);

	int (*readlink) (struct dentry *, char __user *,int);
	void (*put_link) (struct dentry *, struct nameidata *, void *);

	int (*create) (struct inode *,struct dentry *, umode_t, bool);
	int (*link) (struct dentry *,struct inode *,struct dentry *);
	int (*unlink) (struct inode *,struct dentry *);
	int (*symlink) (struct inode *,struct dentry *,const char *);
	int (*mkdir) (struct inode *,struct dentry *,umode_t);
	int (*rmdir) (struct inode *,struct dentry *);
	int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
	int (*rename) (struct inode *, struct dentry *,
			struct inode *, struct dentry *);
	void (*truncate) (struct inode *);
	int (*setattr) (struct dentry *, struct iattr *);
	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
	ssize_t (*listxattr) (struct dentry *, char *, size_t);
	int (*removexattr) (struct dentry *, const char *);
	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
		      u64 len);
	int (*update_time)(struct inode *, struct timespec *, int);
	int (*atomic_open)(struct inode *, struct dentry *,
			   struct file *, unsigned open_flag,
			   umode_t create_mode, int *opened);
} ____cacheline_aligned;

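For simple in-memory filesystems, many of these hooks can be filled from the stock helpers in fs/libfs.c. A hedged sketch of a directory table (the simple_* helpers are real libfs functions; myfs_create and myfs_mkdir are assumptions):

	static const struct inode_operations myfs_dir_iops = {
		.lookup	= simple_lookup,
		.create	= myfs_create,
		.mkdir	= myfs_mkdir,
		.unlink	= simple_unlink,
		.rmdir	= simple_rmdir,
		.rename	= simple_rename,
	};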
struct seq_file;

ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
				unsigned long nr_segs, unsigned long fast_segs,
				struct iovec *fast_pointer,
				struct iovec **ret_pointer);

extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
		unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
		unsigned long, loff_t *);

struct super_operations {
	struct inode *(*alloc_inode)(struct super_block *sb);
	void (*destroy_inode)(struct inode *);

	void (*dirty_inode) (struct inode *, int flags);
	int (*write_inode) (struct inode *, struct writeback_control *wbc);
	int (*drop_inode) (struct inode *);
	void (*evict_inode) (struct inode *);
	void (*put_super) (struct super_block *);
	int (*sync_fs)(struct super_block *sb, int wait);
	int (*freeze_fs) (struct super_block *);
	int (*unfreeze_fs) (struct super_block *);
	int (*statfs) (struct dentry *, struct kstatfs *);
	int (*remount_fs) (struct super_block *, int *, char *);
	void (*umount_begin) (struct super_block *);

	int (*show_options)(struct seq_file *, struct dentry *);
	int (*show_devname)(struct seq_file *, struct dentry *);
	int (*show_path)(struct seq_file *, struct dentry *);
	int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
#endif
	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
	int (*nr_cached_objects)(struct super_block *);
	void (*free_cached_objects)(struct super_block *, int);
};

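Continuing the hypothetical myfs from the fill_super sketch above, a small table that leans on generic helpers (simple_statfs, generic_delete_inode and generic_show_options are real kernel functions; the myfs_* entries are assumptions):

	static const struct super_operations myfs_sops = {
		.alloc_inode	= myfs_alloc_inode,
		.destroy_inode	= myfs_destroy_inode,
		.evict_inode	= myfs_evict_inode,
		.statfs		= simple_statfs,
		.drop_inode	= generic_delete_inode,
		.show_options	= generic_show_options,
	};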
/*
 * Inode flags - they have no relation to superblock flags now
 */
#define S_SYNC		1	/* Writes are synced at once */
#define S_NOATIME	2	/* Do not update access times */
#define S_APPEND	4	/* Append-only file */
#define S_IMMUTABLE	8	/* Immutable file */
#define S_DEAD		16	/* removed, but still open directory */
#define S_NOQUOTA	32	/* Inode is not counted to quota */
#define S_DIRSYNC	64	/* Directory modifications are synchronous */
#define S_NOCMTIME	128	/* Do not update file c/mtime */
#define S_SWAPFILE	256	/* Do not truncate: swapon got its bmaps */
#define S_PRIVATE	512	/* Inode is fs-internal */
#define S_IMA		1024	/* Inode has an associated IMA struct */
#define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
#define S_NOSEC		4096	/* no suid or xattr security attributes */

/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: MS_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystem's flags while it is
 * mounted and has files in use. This means that not all of the inodes will
 * have their i_flags updated. Hence, i_flags no longer inherit the superblock
 * mount flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
 */
#define __IS_FLG(inode, flg)	((inode)->i_sb->s_flags & (flg))

#define IS_RDONLY(inode)	((inode)->i_sb->s_flags & MS_RDONLY)
#define IS_SYNC(inode)		(__IS_FLG(inode, MS_SYNCHRONOUS) || \
					((inode)->i_flags & S_SYNC))
#define IS_DIRSYNC(inode)	(__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
					((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)	__IS_FLG(inode, MS_MANDLOCK)
#define IS_NOATIME(inode)	__IS_FLG(inode, MS_RDONLY|MS_NOATIME)
#define IS_I_VERSION(inode)	__IS_FLG(inode, MS_I_VERSION)

#define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
#define IS_POSIXACL(inode)	__IS_FLG(inode, MS_POSIXACL)

#define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
#define IS_SWAPFILE(inode)	((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode)	((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode)		((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode)		((inode)->i_flags & S_NOSEC)

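These predicates encode the split described in the note above: per-mount state comes from s_flags, per-inode state from i_flags, and callers must check the right one. A typical permission-style check, sketched from the common pattern in write and setattr paths:

	if (IS_RDONLY(inode))				/* mount-wide: MS_RDONLY */
		return -EROFS;
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))	/* per-inode: i_flags */
		return -EPERM;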
1678 /* 1678 /*
1679 * Inode state bits. Protected by inode->i_lock 1679 * Inode state bits. Protected by inode->i_lock
1680 * 1680 *
1681 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, 1681 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
1682 * I_DIRTY_DATASYNC and I_DIRTY_PAGES. 1682 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
1683 * 1683 *
1684 * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, 1684 * Four bits define the lifetime of an inode. Initially, inodes are I_NEW,
1685 * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at 1685 * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at
1686 * various stages of removing an inode. 1686 * various stages of removing an inode.
1687 * 1687 *
1688 * Two bits are used for locking and completion notification, I_NEW and I_SYNC. 1688 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
1689 * 1689 *
1690 * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on 1690 * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on
1691 * fdatasync(). i_atime is the usual cause. 1691 * fdatasync(). i_atime is the usual cause.
1692 * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of 1692 * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of
1693 * these changes separately from I_DIRTY_SYNC so that we 1693 * these changes separately from I_DIRTY_SYNC so that we
1694 * don't have to write inode on fdatasync() when only 1694 * don't have to write inode on fdatasync() when only
1695 * mtime has changed in it. 1695 * mtime has changed in it.
1696 * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. 1696 * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean.
1697 * I_NEW Serves as both a mutex and completion notification. 1697 * I_NEW Serves as both a mutex and completion notification.
1698 * New inodes set I_NEW. If two processes both create 1698 * New inodes set I_NEW. If two processes both create
1699 * the same inode, one of them will release its inode and 1699 * the same inode, one of them will release its inode and
1700 * wait for I_NEW to be released before returning. 1700 * wait for I_NEW to be released before returning.
1701 * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can 1701 * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
1702 * also cause waiting on I_NEW, without I_NEW actually 1702 * also cause waiting on I_NEW, without I_NEW actually
1703 * being set. find_inode() uses this to prevent returning 1703 * being set. find_inode() uses this to prevent returning
1704 * nearly-dead inodes. 1704 * nearly-dead inodes.
1705 * I_WILL_FREE Must be set when calling write_inode_now() if i_count 1705 * I_WILL_FREE Must be set when calling write_inode_now() if i_count
1706 * is zero. I_FREEING must be set when I_WILL_FREE is 1706 * is zero. I_FREEING must be set when I_WILL_FREE is
1707 * cleared. 1707 * cleared.
1708 * I_FREEING Set when inode is about to be freed but still has dirty 1708 * I_FREEING Set when inode is about to be freed but still has dirty
1709 * pages or buffers attached or the inode itself is still 1709 * pages or buffers attached or the inode itself is still
1710 * dirty. 1710 * dirty.
1711 * I_CLEAR Added by clear_inode(). In this state the inode is 1711 * I_CLEAR Added by clear_inode(). In this state the inode is
1712 * clean and can be destroyed. Inode keeps I_FREEING. 1712 * clean and can be destroyed. Inode keeps I_FREEING.
1713 * 1713 *
1714 * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are 1714 * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
1715 * prohibited for many purposes. iget() must wait for 1715 * prohibited for many purposes. iget() must wait for
1716 * the inode to be completely released, then create it 1716 * the inode to be completely released, then create it
1717 * anew. Other functions will just ignore such inodes, 1717 * anew. Other functions will just ignore such inodes,
1718 * if appropriate. I_NEW is used for waiting. 1718 * if appropriate. I_NEW is used for waiting.
1719 * 1719 *
1720 * I_SYNC Writeback of inode is running. The bit is set during 1720 * I_SYNC Writeback of inode is running. The bit is set during
1721 * data writeback, and cleared with a wakeup on the bit 1721 * data writeback, and cleared with a wakeup on the bit
1722 * address once it is done. The bit is also used to pin 1722 * address once it is done. The bit is also used to pin
1723 * the inode in memory for the flusher thread. 1723 * the inode in memory for the flusher thread.
1724 * 1724 *
1725 * I_REFERENCED Marks the inode as recently referenced on the LRU list. 1725 * I_REFERENCED Marks the inode as recently referenced on the LRU list.
1726 * 1726 *
1727 * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). 1727 * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
1728 * 1728 *
1729 * Q: What is the difference between I_WILL_FREE and I_FREEING? 1729 * Q: What is the difference between I_WILL_FREE and I_FREEING?
1730 */ 1730 */
1731 #define I_DIRTY_SYNC (1 << 0) 1731 #define I_DIRTY_SYNC (1 << 0)
1732 #define I_DIRTY_DATASYNC (1 << 1) 1732 #define I_DIRTY_DATASYNC (1 << 1)
1733 #define I_DIRTY_PAGES (1 << 2) 1733 #define I_DIRTY_PAGES (1 << 2)
1734 #define __I_NEW 3 1734 #define __I_NEW 3
1735 #define I_NEW (1 << __I_NEW) 1735 #define I_NEW (1 << __I_NEW)
1736 #define I_WILL_FREE (1 << 4) 1736 #define I_WILL_FREE (1 << 4)
1737 #define I_FREEING (1 << 5) 1737 #define I_FREEING (1 << 5)
1738 #define I_CLEAR (1 << 6) 1738 #define I_CLEAR (1 << 6)
1739 #define __I_SYNC 7 1739 #define __I_SYNC 7
1740 #define I_SYNC (1 << __I_SYNC) 1740 #define I_SYNC (1 << __I_SYNC)
1741 #define I_REFERENCED (1 << 8) 1741 #define I_REFERENCED (1 << 8)
1742 #define __I_DIO_WAKEUP 9 1742 #define __I_DIO_WAKEUP 9
1743 #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) 1743 #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP)
1744 1744
1745 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) 1745 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
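
The block above describes a small state machine; as a minimal illustrative sketch (not part of this commit), a filesystem could test the combined dirty mask under i_lock, per the locking rule stated at the top of the comment:

static bool example_inode_is_dirty(struct inode *inode)
{
	bool dirty;

	spin_lock(&inode->i_lock);		/* i_state is protected by i_lock */
	dirty = inode->i_state & I_DIRTY;	/* any of the three dirty bits */
	spin_unlock(&inode->i_lock);
	return dirty;
}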
1746 1746
1747 extern void __mark_inode_dirty(struct inode *, int); 1747 extern void __mark_inode_dirty(struct inode *, int);
1748 static inline void mark_inode_dirty(struct inode *inode) 1748 static inline void mark_inode_dirty(struct inode *inode)
1749 { 1749 {
1750 __mark_inode_dirty(inode, I_DIRTY); 1750 __mark_inode_dirty(inode, I_DIRTY);
1751 } 1751 }
1752 1752
1753 static inline void mark_inode_dirty_sync(struct inode *inode) 1753 static inline void mark_inode_dirty_sync(struct inode *inode)
1754 { 1754 {
1755 __mark_inode_dirty(inode, I_DIRTY_SYNC); 1755 __mark_inode_dirty(inode, I_DIRTY_SYNC);
1756 } 1756 }
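
As a hypothetical example of choosing between the two helpers: a timestamp-only change matches the I_DIRTY_SYNC description above (no fdatasync() writeout needed), so:

static void example_touch_ctime(struct inode *inode)
{
	inode->i_ctime = CURRENT_TIME;	/* metadata-only change */
	mark_inode_dirty_sync(inode);	/* sets I_DIRTY_SYNC, not I_DIRTY_PAGES */
}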
1757 1757
1758 extern void inc_nlink(struct inode *inode); 1758 extern void inc_nlink(struct inode *inode);
1759 extern void drop_nlink(struct inode *inode); 1759 extern void drop_nlink(struct inode *inode);
1760 extern void clear_nlink(struct inode *inode); 1760 extern void clear_nlink(struct inode *inode);
1761 extern void set_nlink(struct inode *inode, unsigned int nlink); 1761 extern void set_nlink(struct inode *inode, unsigned int nlink);
1762 1762
1763 static inline void inode_inc_link_count(struct inode *inode) 1763 static inline void inode_inc_link_count(struct inode *inode)
1764 { 1764 {
1765 inc_nlink(inode); 1765 inc_nlink(inode);
1766 mark_inode_dirty(inode); 1766 mark_inode_dirty(inode);
1767 } 1767 }
1768 1768
1769 static inline void inode_dec_link_count(struct inode *inode) 1769 static inline void inode_dec_link_count(struct inode *inode)
1770 { 1770 {
1771 drop_nlink(inode); 1771 drop_nlink(inode);
1772 mark_inode_dirty(inode); 1772 mark_inode_dirty(inode);
1773 } 1773 }
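
These wrappers pair the nlink change with dirtying the inode; a minimal ->unlink() sketch, where example_delete_entry() is a hypothetical fs-specific helper:

static int example_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	int err = example_delete_entry(dir, dentry);	/* hypothetical */

	if (err)
		return err;
	inode->i_ctime = dir->i_ctime;
	inode_dec_link_count(inode);	/* drop_nlink() + mark dirty */
	return 0;
}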
1774 1774
1775 /** 1775 /**
1776 * inode_inc_iversion - increments i_version 1776 * inode_inc_iversion - increments i_version
1777 * @inode: inode that needs to be updated 1777 * @inode: inode that needs to be updated
1778 * 1778 *
1779 * Every time the inode is modified, the i_version field will be incremented. 1779 * Every time the inode is modified, the i_version field will be incremented.
1780 * The filesystem has to be mounted with the i_version mount option. 1780 * The filesystem has to be mounted with the i_version mount option.
1781 */ 1781 */
1782 1782
1783 static inline void inode_inc_iversion(struct inode *inode) 1783 static inline void inode_inc_iversion(struct inode *inode)
1784 { 1784 {
1785 spin_lock(&inode->i_lock); 1785 spin_lock(&inode->i_lock);
1786 inode->i_version++; 1786 inode->i_version++;
1787 spin_unlock(&inode->i_lock); 1787 spin_unlock(&inode->i_lock);
1788 } 1788 }
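
Callers normally gate the increment on the mount option; a hedged sketch:

static void example_bump_version(struct inode *inode)
{
	if (IS_I_VERSION(inode))	/* sb mounted with MS_I_VERSION */
		inode_inc_iversion(inode);
}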
1789 1789
1790 enum file_time_flags { 1790 enum file_time_flags {
1791 S_ATIME = 1, 1791 S_ATIME = 1,
1792 S_MTIME = 2, 1792 S_MTIME = 2,
1793 S_CTIME = 4, 1793 S_CTIME = 4,
1794 S_VERSION = 8, 1794 S_VERSION = 8,
1795 }; 1795 };
1796 1796
1797 extern void touch_atime(struct path *); 1797 extern void touch_atime(struct path *);
1798 static inline void file_accessed(struct file *file) 1798 static inline void file_accessed(struct file *file)
1799 { 1799 {
1800 if (!(file->f_flags & O_NOATIME)) 1800 if (!(file->f_flags & O_NOATIME))
1801 touch_atime(&file->f_path); 1801 touch_atime(&file->f_path);
1802 } 1802 }
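
A hypothetical read path showing where file_accessed() slots in (do_example_copy() stands in for the real work):

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t len, loff_t *ppos)
{
	ssize_t ret = do_example_copy(file, buf, len, ppos);	/* hypothetical */

	if (ret > 0)
		file_accessed(file);	/* honours O_NOATIME via the check above */
	return ret;
}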
1803 1803
1804 int sync_inode(struct inode *inode, struct writeback_control *wbc); 1804 int sync_inode(struct inode *inode, struct writeback_control *wbc);
1805 int sync_inode_metadata(struct inode *inode, int wait); 1805 int sync_inode_metadata(struct inode *inode, int wait);
1806 1806
1807 struct file_system_type { 1807 struct file_system_type {
1808 const char *name; 1808 const char *name;
1809 int fs_flags; 1809 int fs_flags;
1810 #define FS_REQUIRES_DEV 1 1810 #define FS_REQUIRES_DEV 1
1811 #define FS_BINARY_MOUNTDATA 2 1811 #define FS_BINARY_MOUNTDATA 2
1812 #define FS_HAS_SUBTYPE 4 1812 #define FS_HAS_SUBTYPE 4
1813 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ 1813 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
1814 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ 1814 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
1815 struct dentry *(*mount) (struct file_system_type *, int, 1815 struct dentry *(*mount) (struct file_system_type *, int,
1816 const char *, void *); 1816 const char *, void *);
1817 void (*kill_sb) (struct super_block *); 1817 void (*kill_sb) (struct super_block *);
1818 struct module *owner; 1818 struct module *owner;
1819 struct file_system_type * next; 1819 struct file_system_type * next;
1820 struct hlist_head fs_supers; 1820 struct hlist_head fs_supers;
1821 1821
1822 struct lock_class_key s_lock_key; 1822 struct lock_class_key s_lock_key;
1823 struct lock_class_key s_umount_key; 1823 struct lock_class_key s_umount_key;
1824 struct lock_class_key s_vfs_rename_key; 1824 struct lock_class_key s_vfs_rename_key;
1825 struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; 1825 struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
1826 1826
1827 struct lock_class_key i_lock_key; 1827 struct lock_class_key i_lock_key;
1828 struct lock_class_key i_mutex_key; 1828 struct lock_class_key i_mutex_key;
1829 struct lock_class_key i_mutex_dir_key; 1829 struct lock_class_key i_mutex_dir_key;
1830 }; 1830 };
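
To show how the pieces of file_system_type fit together, a skeletal in-memory filesystem might look like this (example_fill_super() is assumed, and registration uses register_filesystem() declared further down):

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
	.owner   = THIS_MODULE,
	.name    = "examplefs",
	.mount   = example_mount,
	.kill_sb = kill_anon_super,
};

/* at module init: err = register_filesystem(&example_fs_type); */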
1831 1831
1832 extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, 1832 extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
1833 void *data, int (*fill_super)(struct super_block *, void *, int)); 1833 void *data, int (*fill_super)(struct super_block *, void *, int));
1834 extern struct dentry *mount_bdev(struct file_system_type *fs_type, 1834 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
1835 int flags, const char *dev_name, void *data, 1835 int flags, const char *dev_name, void *data,
1836 int (*fill_super)(struct super_block *, void *, int)); 1836 int (*fill_super)(struct super_block *, void *, int));
1837 extern struct dentry *mount_single(struct file_system_type *fs_type, 1837 extern struct dentry *mount_single(struct file_system_type *fs_type,
1838 int flags, void *data, 1838 int flags, void *data,
1839 int (*fill_super)(struct super_block *, void *, int)); 1839 int (*fill_super)(struct super_block *, void *, int));
1840 extern struct dentry *mount_nodev(struct file_system_type *fs_type, 1840 extern struct dentry *mount_nodev(struct file_system_type *fs_type,
1841 int flags, void *data, 1841 int flags, void *data,
1842 int (*fill_super)(struct super_block *, void *, int)); 1842 int (*fill_super)(struct super_block *, void *, int));
1843 extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); 1843 extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
1844 void generic_shutdown_super(struct super_block *sb); 1844 void generic_shutdown_super(struct super_block *sb);
1845 void kill_block_super(struct super_block *sb); 1845 void kill_block_super(struct super_block *sb);
1846 void kill_anon_super(struct super_block *sb); 1846 void kill_anon_super(struct super_block *sb);
1847 void kill_litter_super(struct super_block *sb); 1847 void kill_litter_super(struct super_block *sb);
1848 void deactivate_super(struct super_block *sb); 1848 void deactivate_super(struct super_block *sb);
1849 void deactivate_locked_super(struct super_block *sb); 1849 void deactivate_locked_super(struct super_block *sb);
1850 int set_anon_super(struct super_block *s, void *data); 1850 int set_anon_super(struct super_block *s, void *data);
1851 int get_anon_bdev(dev_t *); 1851 int get_anon_bdev(dev_t *);
1852 void free_anon_bdev(dev_t); 1852 void free_anon_bdev(dev_t);
1853 struct super_block *sget(struct file_system_type *type, 1853 struct super_block *sget(struct file_system_type *type,
1854 int (*test)(struct super_block *,void *), 1854 int (*test)(struct super_block *,void *),
1855 int (*set)(struct super_block *,void *), 1855 int (*set)(struct super_block *,void *),
1856 int flags, void *data); 1856 int flags, void *data);
1857 extern struct dentry *mount_pseudo(struct file_system_type *, char *, 1857 extern struct dentry *mount_pseudo(struct file_system_type *, char *,
1858 const struct super_operations *ops, 1858 const struct super_operations *ops,
1859 const struct dentry_operations *dops, 1859 const struct dentry_operations *dops,
1860 unsigned long); 1860 unsigned long);
1861 1861
1862 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ 1862 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
1863 #define fops_get(fops) \ 1863 #define fops_get(fops) \
1864 (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) 1864 (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
1865 #define fops_put(fops) \ 1865 #define fops_put(fops) \
1866 do { if (fops) module_put((fops)->owner); } while(0) 1866 do { if (fops) module_put((fops)->owner); } while(0)
1867 1867
1868 extern int register_filesystem(struct file_system_type *); 1868 extern int register_filesystem(struct file_system_type *);
1869 extern int unregister_filesystem(struct file_system_type *); 1869 extern int unregister_filesystem(struct file_system_type *);
1870 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); 1870 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
1871 #define kern_mount(type) kern_mount_data(type, NULL) 1871 #define kern_mount(type) kern_mount_data(type, NULL)
1872 extern void kern_unmount(struct vfsmount *mnt); 1872 extern void kern_unmount(struct vfsmount *mnt);
1873 extern int may_umount_tree(struct vfsmount *); 1873 extern int may_umount_tree(struct vfsmount *);
1874 extern int may_umount(struct vfsmount *); 1874 extern int may_umount(struct vfsmount *);
1875 extern long do_mount(const char *, const char *, const char *, unsigned long, void *); 1875 extern long do_mount(const char *, const char *, const char *, unsigned long, void *);
1876 extern struct vfsmount *collect_mounts(struct path *); 1876 extern struct vfsmount *collect_mounts(struct path *);
1877 extern void drop_collected_mounts(struct vfsmount *); 1877 extern void drop_collected_mounts(struct vfsmount *);
1878 extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, 1878 extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
1879 struct vfsmount *); 1879 struct vfsmount *);
1880 extern int vfs_statfs(struct path *, struct kstatfs *); 1880 extern int vfs_statfs(struct path *, struct kstatfs *);
1881 extern int user_statfs(const char __user *, struct kstatfs *); 1881 extern int user_statfs(const char __user *, struct kstatfs *);
1882 extern int fd_statfs(int, struct kstatfs *); 1882 extern int fd_statfs(int, struct kstatfs *);
1883 extern int vfs_ustat(dev_t, struct kstatfs *); 1883 extern int vfs_ustat(dev_t, struct kstatfs *);
1884 extern int freeze_super(struct super_block *super); 1884 extern int freeze_super(struct super_block *super);
1885 extern int thaw_super(struct super_block *super); 1885 extern int thaw_super(struct super_block *super);
1886 extern bool our_mnt(struct vfsmount *mnt); 1886 extern bool our_mnt(struct vfsmount *mnt);
1887 1887
1888 extern int current_umask(void); 1888 extern int current_umask(void);
1889 1889
1890 /* /sys/fs */ 1890 /* /sys/fs */
1891 extern struct kobject *fs_kobj; 1891 extern struct kobject *fs_kobj;
1892 1892
1893 #define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK) 1893 #define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
1894 extern int rw_verify_area(int, struct file *, loff_t *, size_t); 1894 extern int rw_verify_area(int, struct file *, loff_t *, size_t);
1895 1895
1896 #define FLOCK_VERIFY_READ 1 1896 #define FLOCK_VERIFY_READ 1
1897 #define FLOCK_VERIFY_WRITE 2 1897 #define FLOCK_VERIFY_WRITE 2
1898 1898
1899 #ifdef CONFIG_FILE_LOCKING 1899 #ifdef CONFIG_FILE_LOCKING
1900 extern int locks_mandatory_locked(struct inode *); 1900 extern int locks_mandatory_locked(struct inode *);
1901 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); 1901 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
1902 1902
1903 /* 1903 /*
1904 * Candidates for mandatory locking have the setgid bit set 1904 * Candidates for mandatory locking have the setgid bit set
1905 * but no group execute bit - an otherwise meaningless combination. 1905 * but no group execute bit - an otherwise meaningless combination.
1906 */ 1906 */
1907 1907
1908 static inline int __mandatory_lock(struct inode *ino) 1908 static inline int __mandatory_lock(struct inode *ino)
1909 { 1909 {
1910 return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; 1910 return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
1911 } 1911 }
1912 1912
1913 /* 1913 /*
1914 * ... and these candidates should be on MS_MANDLOCK mounted fs, 1914 * ... and these candidates should be on MS_MANDLOCK mounted fs,
1915 * otherwise these will be advisory locks 1915 * otherwise these will be advisory locks
1916 */ 1916 */
1917 1917
1918 static inline int mandatory_lock(struct inode *ino) 1918 static inline int mandatory_lock(struct inode *ino)
1919 { 1919 {
1920 return IS_MANDLOCK(ino) && __mandatory_lock(ino); 1920 return IS_MANDLOCK(ino) && __mandatory_lock(ino);
1921 } 1921 }
1922 1922
1923 static inline int locks_verify_locked(struct inode *inode) 1923 static inline int locks_verify_locked(struct inode *inode)
1924 { 1924 {
1925 if (mandatory_lock(inode)) 1925 if (mandatory_lock(inode))
1926 return locks_mandatory_locked(inode); 1926 return locks_mandatory_locked(inode);
1927 return 0; 1927 return 0;
1928 } 1928 }
1929 1929
1930 static inline int locks_verify_truncate(struct inode *inode, 1930 static inline int locks_verify_truncate(struct inode *inode,
1931 struct file *filp, 1931 struct file *filp,
1932 loff_t size) 1932 loff_t size)
1933 { 1933 {
1934 if (inode->i_flock && mandatory_lock(inode)) 1934 if (inode->i_flock && mandatory_lock(inode))
1935 return locks_mandatory_area( 1935 return locks_mandatory_area(
1936 FLOCK_VERIFY_WRITE, inode, filp, 1936 FLOCK_VERIFY_WRITE, inode, filp,
1937 size < inode->i_size ? size : inode->i_size, 1937 size < inode->i_size ? size : inode->i_size,
1938 (size < inode->i_size ? inode->i_size - size 1938 (size < inode->i_size ? inode->i_size - size
1939 : size - inode->i_size) 1939 : size - inode->i_size)
1940 ); 1940 );
1941 return 0; 1941 return 0;
1942 } 1942 }
1943 1943
1944 static inline int break_lease(struct inode *inode, unsigned int mode) 1944 static inline int break_lease(struct inode *inode, unsigned int mode)
1945 { 1945 {
1946 if (inode->i_flock) 1946 if (inode->i_flock)
1947 return __break_lease(inode, mode); 1947 return __break_lease(inode, mode);
1948 return 0; 1948 return 0;
1949 } 1949 }
1950 #else /* !CONFIG_FILE_LOCKING */ 1950 #else /* !CONFIG_FILE_LOCKING */
1951 static inline int locks_mandatory_locked(struct inode *inode) 1951 static inline int locks_mandatory_locked(struct inode *inode)
1952 { 1952 {
1953 return 0; 1953 return 0;
1954 } 1954 }
1955 1955
1956 static inline int locks_mandatory_area(int rw, struct inode *inode, 1956 static inline int locks_mandatory_area(int rw, struct inode *inode,
1957 struct file *filp, loff_t offset, 1957 struct file *filp, loff_t offset,
1958 size_t count) 1958 size_t count)
1959 { 1959 {
1960 return 0; 1960 return 0;
1961 } 1961 }
1962 1962
1963 static inline int __mandatory_lock(struct inode *inode) 1963 static inline int __mandatory_lock(struct inode *inode)
1964 { 1964 {
1965 return 0; 1965 return 0;
1966 } 1966 }
1967 1967
1968 static inline int mandatory_lock(struct inode *inode) 1968 static inline int mandatory_lock(struct inode *inode)
1969 { 1969 {
1970 return 0; 1970 return 0;
1971 } 1971 }
1972 1972
1973 static inline int locks_verify_locked(struct inode *inode) 1973 static inline int locks_verify_locked(struct inode *inode)
1974 { 1974 {
1975 return 0; 1975 return 0;
1976 } 1976 }
1977 1977
1978 static inline int locks_verify_truncate(struct inode *inode, struct file *filp, 1978 static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
1979 size_t size) 1979 size_t size)
1980 { 1980 {
1981 return 0; 1981 return 0;
1982 } 1982 }
1983 1983
1984 static inline int break_lease(struct inode *inode, unsigned int mode) 1984 static inline int break_lease(struct inode *inode, unsigned int mode)
1985 { 1985 {
1986 return 0; 1986 return 0;
1987 } 1987 }
1988 1988
1989 #endif /* CONFIG_FILE_LOCKING */ 1989 #endif /* CONFIG_FILE_LOCKING */
1990 1990
1991 /* fs/open.c */ 1991 /* fs/open.c */
1992 struct audit_names; 1992 struct audit_names;
1993 struct filename { 1993 struct filename {
1994 const char *name; /* pointer to actual string */ 1994 const char *name; /* pointer to actual string */
1995 const __user char *uptr; /* original userland pointer */ 1995 const __user char *uptr; /* original userland pointer */
1996 struct audit_names *aname; 1996 struct audit_names *aname;
1997 bool separate; /* should "name" be freed? */ 1997 bool separate; /* should "name" be freed? */
1998 }; 1998 };
1999 1999
2000 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, 2000 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
2001 struct file *filp); 2001 struct file *filp);
2002 extern int do_fallocate(struct file *file, int mode, loff_t offset, 2002 extern int do_fallocate(struct file *file, int mode, loff_t offset,
2003 loff_t len); 2003 loff_t len);
2004 extern long do_sys_open(int dfd, const char __user *filename, int flags, 2004 extern long do_sys_open(int dfd, const char __user *filename, int flags,
2005 umode_t mode); 2005 umode_t mode);
2006 extern struct file *file_open_name(struct filename *, int, umode_t); 2006 extern struct file *file_open_name(struct filename *, int, umode_t);
2007 extern struct file *filp_open(const char *, int, umode_t); 2007 extern struct file *filp_open(const char *, int, umode_t);
2008 extern struct file *file_open_root(struct dentry *, struct vfsmount *, 2008 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
2009 const char *, int); 2009 const char *, int);
2010 extern struct file * dentry_open(const struct path *, int, const struct cred *); 2010 extern struct file * dentry_open(const struct path *, int, const struct cred *);
2011 extern int filp_close(struct file *, fl_owner_t id); 2011 extern int filp_close(struct file *, fl_owner_t id);
2012 2012
2013 extern struct filename *getname(const char __user *); 2013 extern struct filename *getname(const char __user *);
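
getname() hands back a struct filename as described above; the usual syscall pattern, sketched with a hypothetical do_example():

SYSCALL_DEFINE1(example, const char __user *, pathname)
{
	struct filename *name = getname(pathname);
	long err;

	if (IS_ERR(name))
		return PTR_ERR(name);
	err = do_example(name->name);	/* kernel-space copy of the path */
	putname(name);
	return err;
}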
2014 2014
2015 enum { 2015 enum {
2016 FILE_CREATED = 1, 2016 FILE_CREATED = 1,
2017 FILE_OPENED = 2 2017 FILE_OPENED = 2
2018 }; 2018 };
2019 extern int finish_open(struct file *file, struct dentry *dentry, 2019 extern int finish_open(struct file *file, struct dentry *dentry,
2020 int (*open)(struct inode *, struct file *), 2020 int (*open)(struct inode *, struct file *),
2021 int *opened); 2021 int *opened);
2022 extern int finish_no_open(struct file *file, struct dentry *dentry); 2022 extern int finish_no_open(struct file *file, struct dentry *dentry);
2023 2023
2024 /* fs/ioctl.c */ 2024 /* fs/ioctl.c */
2025 2025
2026 extern int ioctl_preallocate(struct file *filp, void __user *argp); 2026 extern int ioctl_preallocate(struct file *filp, void __user *argp);
2027 2027
2028 /* fs/dcache.c */ 2028 /* fs/dcache.c */
2029 extern void __init vfs_caches_init_early(void); 2029 extern void __init vfs_caches_init_early(void);
2030 extern void __init vfs_caches_init(unsigned long); 2030 extern void __init vfs_caches_init(unsigned long);
2031 2031
2032 extern struct kmem_cache *names_cachep; 2032 extern struct kmem_cache *names_cachep;
2033 2033
2034 extern void final_putname(struct filename *name); 2034 extern void final_putname(struct filename *name);
2035 2035
2036 #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) 2036 #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL)
2037 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) 2037 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
2038 #ifndef CONFIG_AUDITSYSCALL 2038 #ifndef CONFIG_AUDITSYSCALL
2039 #define putname(name) final_putname(name) 2039 #define putname(name) final_putname(name)
2040 #else 2040 #else
2041 extern void putname(struct filename *name); 2041 extern void putname(struct filename *name);
2042 #endif 2042 #endif
2043 2043
2044 #ifdef CONFIG_BLOCK 2044 #ifdef CONFIG_BLOCK
2045 extern int register_blkdev(unsigned int, const char *); 2045 extern int register_blkdev(unsigned int, const char *);
2046 extern void unregister_blkdev(unsigned int, const char *); 2046 extern void unregister_blkdev(unsigned int, const char *);
2047 extern struct block_device *bdget(dev_t); 2047 extern struct block_device *bdget(dev_t);
2048 extern struct block_device *bdgrab(struct block_device *bdev); 2048 extern struct block_device *bdgrab(struct block_device *bdev);
2049 extern void bd_set_size(struct block_device *, loff_t size); 2049 extern void bd_set_size(struct block_device *, loff_t size);
2050 extern void bd_forget(struct inode *inode); 2050 extern void bd_forget(struct inode *inode);
2051 extern void bdput(struct block_device *); 2051 extern void bdput(struct block_device *);
2052 extern void invalidate_bdev(struct block_device *); 2052 extern void invalidate_bdev(struct block_device *);
2053 extern void iterate_bdevs(void (*)(struct block_device *, void *), void *); 2053 extern void iterate_bdevs(void (*)(struct block_device *, void *), void *);
2054 extern int sync_blockdev(struct block_device *bdev); 2054 extern int sync_blockdev(struct block_device *bdev);
2055 extern void kill_bdev(struct block_device *); 2055 extern void kill_bdev(struct block_device *);
2056 extern struct super_block *freeze_bdev(struct block_device *); 2056 extern struct super_block *freeze_bdev(struct block_device *);
2057 extern void emergency_thaw_all(void); 2057 extern void emergency_thaw_all(void);
2058 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); 2058 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
2059 extern int fsync_bdev(struct block_device *); 2059 extern int fsync_bdev(struct block_device *);
2060 #else 2060 #else
2061 static inline void bd_forget(struct inode *inode) {} 2061 static inline void bd_forget(struct inode *inode) {}
2062 static inline int sync_blockdev(struct block_device *bdev) { return 0; } 2062 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
2063 static inline void kill_bdev(struct block_device *bdev) {} 2063 static inline void kill_bdev(struct block_device *bdev) {}
2064 static inline void invalidate_bdev(struct block_device *bdev) {} 2064 static inline void invalidate_bdev(struct block_device *bdev) {}
2065 2065
2066 static inline struct super_block *freeze_bdev(struct block_device *sb) 2066 static inline struct super_block *freeze_bdev(struct block_device *sb)
2067 { 2067 {
2068 return NULL; 2068 return NULL;
2069 } 2069 }
2070 2070
2071 static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) 2071 static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb)
2072 { 2072 {
2073 return 0; 2073 return 0;
2074 } 2074 }
2075 2075
2076 static inline void iterate_bdevs(void (*f)(struct block_device *, void *), void *arg) 2076 static inline void iterate_bdevs(void (*f)(struct block_device *, void *), void *arg)
2077 { 2077 {
2078 } 2078 }
2079 #endif 2079 #endif
2080 extern int sync_filesystem(struct super_block *); 2080 extern int sync_filesystem(struct super_block *);
2081 extern const struct file_operations def_blk_fops; 2081 extern const struct file_operations def_blk_fops;
2082 extern const struct file_operations def_chr_fops; 2082 extern const struct file_operations def_chr_fops;
2083 extern const struct file_operations bad_sock_fops; 2083 extern const struct file_operations bad_sock_fops;
2084 extern const struct file_operations def_fifo_fops; 2084 extern const struct file_operations def_fifo_fops;
2085 #ifdef CONFIG_BLOCK 2085 #ifdef CONFIG_BLOCK
2086 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); 2086 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
2087 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); 2087 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
2088 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); 2088 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
2089 extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); 2089 extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
2090 extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, 2090 extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
2091 void *holder); 2091 void *holder);
2092 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, 2092 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
2093 void *holder); 2093 void *holder);
2094 extern int blkdev_put(struct block_device *bdev, fmode_t mode); 2094 extern int blkdev_put(struct block_device *bdev, fmode_t mode);
2095 #ifdef CONFIG_SYSFS 2095 #ifdef CONFIG_SYSFS
2096 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); 2096 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
2097 extern void bd_unlink_disk_holder(struct block_device *bdev, 2097 extern void bd_unlink_disk_holder(struct block_device *bdev,
2098 struct gendisk *disk); 2098 struct gendisk *disk);
2099 #else 2099 #else
2100 static inline int bd_link_disk_holder(struct block_device *bdev, 2100 static inline int bd_link_disk_holder(struct block_device *bdev,
2101 struct gendisk *disk) 2101 struct gendisk *disk)
2102 { 2102 {
2103 return 0; 2103 return 0;
2104 } 2104 }
2105 static inline void bd_unlink_disk_holder(struct block_device *bdev, 2105 static inline void bd_unlink_disk_holder(struct block_device *bdev,
2106 struct gendisk *disk) 2106 struct gendisk *disk)
2107 { 2107 {
2108 } 2108 }
2109 #endif 2109 #endif
2110 #endif 2110 #endif
2111 2111
2112 /* fs/char_dev.c */ 2112 /* fs/char_dev.c */
2113 #define CHRDEV_MAJOR_HASH_SIZE 255 2113 #define CHRDEV_MAJOR_HASH_SIZE 255
2114 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); 2114 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
2115 extern int register_chrdev_region(dev_t, unsigned, const char *); 2115 extern int register_chrdev_region(dev_t, unsigned, const char *);
2116 extern int __register_chrdev(unsigned int major, unsigned int baseminor, 2116 extern int __register_chrdev(unsigned int major, unsigned int baseminor,
2117 unsigned int count, const char *name, 2117 unsigned int count, const char *name,
2118 const struct file_operations *fops); 2118 const struct file_operations *fops);
2119 extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, 2119 extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
2120 unsigned int count, const char *name); 2120 unsigned int count, const char *name);
2121 extern void unregister_chrdev_region(dev_t, unsigned); 2121 extern void unregister_chrdev_region(dev_t, unsigned);
2122 extern void chrdev_show(struct seq_file *,off_t); 2122 extern void chrdev_show(struct seq_file *,off_t);
2123 2123
2124 static inline int register_chrdev(unsigned int major, const char *name, 2124 static inline int register_chrdev(unsigned int major, const char *name,
2125 const struct file_operations *fops) 2125 const struct file_operations *fops)
2126 { 2126 {
2127 return __register_chrdev(major, 0, 256, name, fops); 2127 return __register_chrdev(major, 0, 256, name, fops);
2128 } 2128 }
2129 2129
2130 static inline void unregister_chrdev(unsigned int major, const char *name) 2130 static inline void unregister_chrdev(unsigned int major, const char *name)
2131 { 2131 {
2132 __unregister_chrdev(major, 0, 256, name); 2132 __unregister_chrdev(major, 0, 256, name);
2133 } 2133 }
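
Passing 0 as the major asks for dynamic allocation, in which case the positive return value is the major that was assigned; a hedged module sketch (example_fops assumed):

static int example_major;

static int __init example_init(void)
{
	example_major = register_chrdev(0, "example", &example_fops);
	return example_major < 0 ? example_major : 0;
}

static void __exit example_exit(void)
{
	unregister_chrdev(example_major, "example");
}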
2134 2134
2135 /* fs/block_dev.c */ 2135 /* fs/block_dev.c */
2136 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ 2136 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */
2137 #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ 2137 #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */
2138 2138
2139 #ifdef CONFIG_BLOCK 2139 #ifdef CONFIG_BLOCK
2140 #define BLKDEV_MAJOR_HASH_SIZE 255 2140 #define BLKDEV_MAJOR_HASH_SIZE 255
2141 extern const char *__bdevname(dev_t, char *buffer); 2141 extern const char *__bdevname(dev_t, char *buffer);
2142 extern const char *bdevname(struct block_device *bdev, char *buffer); 2142 extern const char *bdevname(struct block_device *bdev, char *buffer);
2143 extern struct block_device *lookup_bdev(const char *); 2143 extern struct block_device *lookup_bdev(const char *);
2144 extern void blkdev_show(struct seq_file *,off_t); 2144 extern void blkdev_show(struct seq_file *,off_t);
2145 2145
2146 #else 2146 #else
2147 #define BLKDEV_MAJOR_HASH_SIZE 0 2147 #define BLKDEV_MAJOR_HASH_SIZE 0
2148 #endif 2148 #endif
2149 2149
2150 extern void init_special_inode(struct inode *, umode_t, dev_t); 2150 extern void init_special_inode(struct inode *, umode_t, dev_t);
2151 2151
2152 /* Invalid inode operations -- fs/bad_inode.c */ 2152 /* Invalid inode operations -- fs/bad_inode.c */
2153 extern void make_bad_inode(struct inode *); 2153 extern void make_bad_inode(struct inode *);
2154 extern int is_bad_inode(struct inode *); 2154 extern int is_bad_inode(struct inode *);
2155 2155
2156 extern const struct file_operations read_pipefifo_fops; 2156 extern const struct file_operations read_pipefifo_fops;
2157 extern const struct file_operations write_pipefifo_fops; 2157 extern const struct file_operations write_pipefifo_fops;
2158 extern const struct file_operations rdwr_pipefifo_fops; 2158 extern const struct file_operations rdwr_pipefifo_fops;
2159 2159
2160 #ifdef CONFIG_BLOCK 2160 #ifdef CONFIG_BLOCK
2161 /* 2161 /*
2162 * return READ, READA, or WRITE 2162 * return READ, READA, or WRITE
2163 */ 2163 */
2164 #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) 2164 #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK))
2165 2165
2166 /* 2166 /*
2167 * return data direction, READ or WRITE 2167 * return data direction, READ or WRITE
2168 */ 2168 */
2169 #define bio_data_dir(bio) ((bio)->bi_rw & 1) 2169 #define bio_data_dir(bio) ((bio)->bi_rw & 1)
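
A small illustration of the two macros in a completion path (purely a sketch):

static void example_trace_bio(struct bio *bio)
{
	if (bio_data_dir(bio) == WRITE)
		pr_debug("write at sector %llu\n",
			 (unsigned long long)bio->bi_sector);
	else		/* READ and READA both count as reads here */
		pr_debug("read at sector %llu\n",
			 (unsigned long long)bio->bi_sector);
}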
2170 2170
2171 extern void check_disk_size_change(struct gendisk *disk, 2171 extern void check_disk_size_change(struct gendisk *disk,
2172 struct block_device *bdev); 2172 struct block_device *bdev);
2173 extern int revalidate_disk(struct gendisk *); 2173 extern int revalidate_disk(struct gendisk *);
2174 extern int check_disk_change(struct block_device *); 2174 extern int check_disk_change(struct block_device *);
2175 extern int __invalidate_device(struct block_device *, bool); 2175 extern int __invalidate_device(struct block_device *, bool);
2176 extern int invalidate_partition(struct gendisk *, int); 2176 extern int invalidate_partition(struct gendisk *, int);
2177 #endif 2177 #endif
2178 unsigned long invalidate_mapping_pages(struct address_space *mapping, 2178 unsigned long invalidate_mapping_pages(struct address_space *mapping,
2179 pgoff_t start, pgoff_t end); 2179 pgoff_t start, pgoff_t end);
2180 2180
2181 static inline void invalidate_remote_inode(struct inode *inode) 2181 static inline void invalidate_remote_inode(struct inode *inode)
2182 { 2182 {
2183 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2183 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2184 S_ISLNK(inode->i_mode)) 2184 S_ISLNK(inode->i_mode))
2185 invalidate_mapping_pages(inode->i_mapping, 0, -1); 2185 invalidate_mapping_pages(inode->i_mapping, 0, -1);
2186 } 2186 }
2187 extern int invalidate_inode_pages2(struct address_space *mapping); 2187 extern int invalidate_inode_pages2(struct address_space *mapping);
2188 extern int invalidate_inode_pages2_range(struct address_space *mapping, 2188 extern int invalidate_inode_pages2_range(struct address_space *mapping,
2189 pgoff_t start, pgoff_t end); 2189 pgoff_t start, pgoff_t end);
2190 extern int write_inode_now(struct inode *, int); 2190 extern int write_inode_now(struct inode *, int);
2191 extern int filemap_fdatawrite(struct address_space *); 2191 extern int filemap_fdatawrite(struct address_space *);
2192 extern int filemap_flush(struct address_space *); 2192 extern int filemap_flush(struct address_space *);
2193 extern int filemap_fdatawait(struct address_space *); 2193 extern int filemap_fdatawait(struct address_space *);
2194 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, 2194 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
2195 loff_t lend); 2195 loff_t lend);
2196 extern int filemap_write_and_wait(struct address_space *mapping); 2196 extern int filemap_write_and_wait(struct address_space *mapping);
2197 extern int filemap_write_and_wait_range(struct address_space *mapping, 2197 extern int filemap_write_and_wait_range(struct address_space *mapping,
2198 loff_t lstart, loff_t lend); 2198 loff_t lstart, loff_t lend);
2199 extern int __filemap_fdatawrite_range(struct address_space *mapping, 2199 extern int __filemap_fdatawrite_range(struct address_space *mapping,
2200 loff_t start, loff_t end, int sync_mode); 2200 loff_t start, loff_t end, int sync_mode);
2201 extern int filemap_fdatawrite_range(struct address_space *mapping, 2201 extern int filemap_fdatawrite_range(struct address_space *mapping,
2202 loff_t start, loff_t end); 2202 loff_t start, loff_t end);
2203 2203
2204 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, 2204 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
2205 int datasync); 2205 int datasync);
2206 extern int vfs_fsync(struct file *file, int datasync); 2206 extern int vfs_fsync(struct file *file, int datasync);
2207 extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); 2207 extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
2208 extern void emergency_sync(void); 2208 extern void emergency_sync(void);
2209 extern void emergency_remount(void); 2209 extern void emergency_remount(void);
2210 #ifdef CONFIG_BLOCK 2210 #ifdef CONFIG_BLOCK
2211 extern sector_t bmap(struct inode *, sector_t); 2211 extern sector_t bmap(struct inode *, sector_t);
2212 #endif 2212 #endif
2213 extern int notify_change(struct dentry *, struct iattr *); 2213 extern int notify_change(struct dentry *, struct iattr *);
2214 extern int inode_permission(struct inode *, int); 2214 extern int inode_permission(struct inode *, int);
2215 extern int generic_permission(struct inode *, int); 2215 extern int generic_permission(struct inode *, int);
2216 2216
2217 static inline bool execute_ok(struct inode *inode) 2217 static inline bool execute_ok(struct inode *inode)
2218 { 2218 {
2219 return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); 2219 return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
2220 } 2220 }
2221 2221
2222 /* 2222 /*
2223 * get_write_access() gets write permission for a file. 2223 * get_write_access() gets write permission for a file.
2224 * put_write_access() releases this write permission. 2224 * put_write_access() releases this write permission.
2225 * This is used for regular files. 2225 * This is used for regular files.
2226 * We cannot support write (and maybe mmap read-write shared) accesses and 2226 * We cannot support write (and maybe mmap read-write shared) accesses and
2227 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode 2227 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
2228 * can have the following values: 2228 * can have the following values:
2229 * 0: no writers, no VM_DENYWRITE mappings 2229 * 0: no writers, no VM_DENYWRITE mappings
2230 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist 2230 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
2231 * > 0: (i_writecount) users are writing to the file. 2231 * > 0: (i_writecount) users are writing to the file.
2232 * 2232 *
2233 * Normally we operate on that counter with atomic_{inc,dec} and it's safe 2233 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
2234 * except for the cases where we don't hold i_writecount yet. Then we need to 2234 * except for the cases where we don't hold i_writecount yet. Then we need to
2235 * use {get,deny}_write_access() - these functions check the sign and refuse 2235 * use {get,deny}_write_access() - these functions check the sign and refuse
2236 * to do the change if sign is wrong. 2236 * to do the change if sign is wrong.
2237 */ 2237 */
2238 static inline int get_write_access(struct inode *inode) 2238 static inline int get_write_access(struct inode *inode)
2239 { 2239 {
2240 return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; 2240 return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY;
2241 } 2241 }
2242 static inline int deny_write_access(struct file *file) 2242 static inline int deny_write_access(struct file *file)
2243 { 2243 {
2244 struct inode *inode = file->f_path.dentry->d_inode; 2244 struct inode *inode = file->f_path.dentry->d_inode;
2245 return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; 2245 return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
2246 } 2246 }
2247 static inline void put_write_access(struct inode * inode) 2247 static inline void put_write_access(struct inode * inode)
2248 { 2248 {
2249 atomic_dec(&inode->i_writecount); 2249 atomic_dec(&inode->i_writecount);
2250 } 2250 }
2251 static inline void allow_write_access(struct file *file) 2251 static inline void allow_write_access(struct file *file)
2252 { 2252 {
2253 if (file) 2253 if (file)
2254 atomic_inc(&file->f_path.dentry->d_inode->i_writecount); 2254 atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
2255 } 2255 }
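
The sign-based protocol above is what makes the classic ETXTBSY dance work; a hypothetical sketch around an exec-style mapping (do_example_mapping() assumed):

static int example_map_for_exec(struct file *file)
{
	int err = deny_write_access(file);	/* -ETXTBSY if writers exist */

	if (err)
		return err;
	err = do_example_mapping(file);		/* hypothetical */
	allow_write_access(file);		/* pairs with the deny above */
	return err;
}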
2256 #ifdef CONFIG_IMA 2256 #ifdef CONFIG_IMA
2257 static inline void i_readcount_dec(struct inode *inode) 2257 static inline void i_readcount_dec(struct inode *inode)
2258 { 2258 {
2259 BUG_ON(!atomic_read(&inode->i_readcount)); 2259 BUG_ON(!atomic_read(&inode->i_readcount));
2260 atomic_dec(&inode->i_readcount); 2260 atomic_dec(&inode->i_readcount);
2261 } 2261 }
2262 static inline void i_readcount_inc(struct inode *inode) 2262 static inline void i_readcount_inc(struct inode *inode)
2263 { 2263 {
2264 atomic_inc(&inode->i_readcount); 2264 atomic_inc(&inode->i_readcount);
2265 } 2265 }
2266 #else 2266 #else
2267 static inline void i_readcount_dec(struct inode *inode) 2267 static inline void i_readcount_dec(struct inode *inode)
2268 { 2268 {
2269 return; 2269 return;
2270 } 2270 }
2271 static inline void i_readcount_inc(struct inode *inode) 2271 static inline void i_readcount_inc(struct inode *inode)
2272 { 2272 {
2273 return; 2273 return;
2274 } 2274 }
2275 #endif 2275 #endif
2276 extern int do_pipe_flags(int *, int); 2276 extern int do_pipe_flags(int *, int);
2277 2277
2278 extern int kernel_read(struct file *, loff_t, char *, unsigned long); 2278 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
2279 extern struct file * open_exec(const char *); 2279 extern struct file * open_exec(const char *);
2280 2280
2281 /* fs/dcache.c -- generic fs support functions */ 2281 /* fs/dcache.c -- generic fs support functions */
2282 extern int is_subdir(struct dentry *, struct dentry *); 2282 extern int is_subdir(struct dentry *, struct dentry *);
2283 extern int path_is_under(struct path *, struct path *); 2283 extern int path_is_under(struct path *, struct path *);
2284 extern ino_t find_inode_number(struct dentry *, struct qstr *); 2284 extern ino_t find_inode_number(struct dentry *, struct qstr *);
2285 2285
2286 #include <linux/err.h> 2286 #include <linux/err.h>
2287 2287
2288 /* needed for stackable file system support */ 2288 /* needed for stackable file system support */
2289 extern loff_t default_llseek(struct file *file, loff_t offset, int origin); 2289 extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
2290 2290
2291 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); 2291 extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
2292 2292
2293 extern int inode_init_always(struct super_block *, struct inode *); 2293 extern int inode_init_always(struct super_block *, struct inode *);
2294 extern void inode_init_once(struct inode *); 2294 extern void inode_init_once(struct inode *);
2295 extern void address_space_init_once(struct address_space *mapping); 2295 extern void address_space_init_once(struct address_space *mapping);
2296 extern void ihold(struct inode * inode); 2296 extern void ihold(struct inode * inode);
2297 extern void iput(struct inode *); 2297 extern void iput(struct inode *);
2298 extern struct inode * igrab(struct inode *); 2298 extern struct inode * igrab(struct inode *);
2299 extern ino_t iunique(struct super_block *, ino_t); 2299 extern ino_t iunique(struct super_block *, ino_t);
2300 extern int inode_needs_sync(struct inode *inode); 2300 extern int inode_needs_sync(struct inode *inode);
2301 extern int generic_delete_inode(struct inode *inode); 2301 extern int generic_delete_inode(struct inode *inode);
2302 static inline int generic_drop_inode(struct inode *inode) 2302 static inline int generic_drop_inode(struct inode *inode)
2303 { 2303 {
2304 return !inode->i_nlink || inode_unhashed(inode); 2304 return !inode->i_nlink || inode_unhashed(inode);
2305 } 2305 }
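
A filesystem that wants the inode evicted on the final iput() regardless of the hash/nlink test can point ->drop_inode at generic_delete_inode(); sketched:

static const struct super_operations example_sops = {
	.drop_inode = generic_delete_inode,	/* always returns 1: evict now */
};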
2306 2306
2307 extern struct inode *ilookup5_nowait(struct super_block *sb, 2307 extern struct inode *ilookup5_nowait(struct super_block *sb,
2308 unsigned long hashval, int (*test)(struct inode *, void *), 2308 unsigned long hashval, int (*test)(struct inode *, void *),
2309 void *data); 2309 void *data);
2310 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 2310 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
2311 int (*test)(struct inode *, void *), void *data); 2311 int (*test)(struct inode *, void *), void *data);
2312 extern struct inode *ilookup(struct super_block *sb, unsigned long ino); 2312 extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
2313 2313
2314 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); 2314 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
2315 extern struct inode * iget_locked(struct super_block *, unsigned long); 2315 extern struct inode * iget_locked(struct super_block *, unsigned long);
2316 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); 2316 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
2317 extern int insert_inode_locked(struct inode *); 2317 extern int insert_inode_locked(struct inode *);
2318 #ifdef CONFIG_DEBUG_LOCK_ALLOC 2318 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2319 extern void lockdep_annotate_inode_mutex_key(struct inode *inode); 2319 extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
2320 #else 2320 #else
2321 static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; 2321 static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { };
2322 #endif 2322 #endif
2323 extern void unlock_new_inode(struct inode *); 2323 extern void unlock_new_inode(struct inode *);
2324 extern unsigned int get_next_ino(void); 2324 extern unsigned int get_next_ino(void);
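
The I_NEW handshake documented earlier drives the standard lookup pattern; a sketch with a hypothetical example_read_inode():

static struct inode *example_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit, already initialized */

	example_read_inode(inode);	/* hypothetical: fill from disk */
	unlock_new_inode(inode);	/* clears I_NEW and wakes waiters */
	return inode;
}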
2325 2325
2326 extern void __iget(struct inode * inode); 2326 extern void __iget(struct inode * inode);
2327 extern void iget_failed(struct inode *); 2327 extern void iget_failed(struct inode *);
2328 extern void clear_inode(struct inode *); 2328 extern void clear_inode(struct inode *);
2329 extern void __destroy_inode(struct inode *); 2329 extern void __destroy_inode(struct inode *);
2330 extern struct inode *new_inode_pseudo(struct super_block *sb); 2330 extern struct inode *new_inode_pseudo(struct super_block *sb);
2331 extern struct inode *new_inode(struct super_block *sb); 2331 extern struct inode *new_inode(struct super_block *sb);
2332 extern void free_inode_nonrcu(struct inode *inode); 2332 extern void free_inode_nonrcu(struct inode *inode);
2333 extern int should_remove_suid(struct dentry *); 2333 extern int should_remove_suid(struct dentry *);
2334 extern int file_remove_suid(struct file *); 2334 extern int file_remove_suid(struct file *);
2335 2335
2336 extern void __insert_inode_hash(struct inode *, unsigned long hashval); 2336 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
2337 static inline void insert_inode_hash(struct inode *inode) 2337 static inline void insert_inode_hash(struct inode *inode)
2338 { 2338 {
2339 __insert_inode_hash(inode, inode->i_ino); 2339 __insert_inode_hash(inode, inode->i_ino);
2340 } 2340 }
2341 2341
2342 extern void __remove_inode_hash(struct inode *); 2342 extern void __remove_inode_hash(struct inode *);
2343 static inline void remove_inode_hash(struct inode *inode) 2343 static inline void remove_inode_hash(struct inode *inode)
2344 { 2344 {
2345 if (!inode_unhashed(inode)) 2345 if (!inode_unhashed(inode))
2346 __remove_inode_hash(inode); 2346 __remove_inode_hash(inode);
2347 } 2347 }
2348 2348
2349 extern void inode_sb_list_add(struct inode *inode); 2349 extern void inode_sb_list_add(struct inode *inode);
2350 2350
2351 #ifdef CONFIG_BLOCK 2351 #ifdef CONFIG_BLOCK
2352 extern void submit_bio(int, struct bio *); 2352 extern void submit_bio(int, struct bio *);
2353 extern int bdev_read_only(struct block_device *); 2353 extern int bdev_read_only(struct block_device *);
2354 #endif 2354 #endif
2355 extern int set_blocksize(struct block_device *, int); 2355 extern int set_blocksize(struct block_device *, int);
2356 extern int sb_set_blocksize(struct super_block *, int); 2356 extern int sb_set_blocksize(struct super_block *, int);
2357 extern int sb_min_blocksize(struct super_block *, int); 2357 extern int sb_min_blocksize(struct super_block *, int);
2358 2358
2359 extern int generic_file_mmap(struct file *, struct vm_area_struct *); 2359 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
2360 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); 2360 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
2361 extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, 2361 extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
2362 unsigned long size, pgoff_t pgoff); 2362 unsigned long size, pgoff_t pgoff);
2363 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); 2363 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
2364 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); 2364 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
2365 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); 2365 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
2366 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, 2366 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
2367 loff_t *); 2367 loff_t *);
2368 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); 2368 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
2369 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, 2369 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
2370 unsigned long *, loff_t, loff_t *, size_t, size_t); 2370 unsigned long *, loff_t, loff_t *, size_t, size_t);
2371 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, 2371 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
2372 unsigned long, loff_t, loff_t *, size_t, ssize_t); 2372 unsigned long, loff_t, loff_t *, size_t, ssize_t);
2373 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); 2373 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
2374 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); 2374 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
2375 extern int generic_segment_checks(const struct iovec *iov, 2375 extern int generic_segment_checks(const struct iovec *iov,
2376 unsigned long *nr_segs, size_t *count, int access_flags); 2376 unsigned long *nr_segs, size_t *count, int access_flags);
2377 2377
2378 /* fs/block_dev.c */ 2378 /* fs/block_dev.c */
2379 extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, 2379 extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
2380 unsigned long nr_segs, loff_t pos); 2380 unsigned long nr_segs, loff_t pos);
2381 extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, 2381 extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
2382 int datasync); 2382 int datasync);
2383 extern void block_sync_page(struct page *page); 2383 extern void block_sync_page(struct page *page);
2384 2384
2385 /* fs/splice.c */ 2385 /* fs/splice.c */
2386 extern ssize_t generic_file_splice_read(struct file *, loff_t *, 2386 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
2387 struct pipe_inode_info *, size_t, unsigned int); 2387 struct pipe_inode_info *, size_t, unsigned int);
2388 extern ssize_t default_file_splice_read(struct file *, loff_t *, 2388 extern ssize_t default_file_splice_read(struct file *, loff_t *,
2389 struct pipe_inode_info *, size_t, unsigned int); 2389 struct pipe_inode_info *, size_t, unsigned int);
2390 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, 2390 extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
2391 struct file *, loff_t *, size_t, unsigned int); 2391 struct file *, loff_t *, size_t, unsigned int);
2392 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 2392 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
2393 struct file *out, loff_t *, size_t len, unsigned int flags); 2393 struct file *out, loff_t *, size_t len, unsigned int flags);
2394 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 2394 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
2395 size_t len, unsigned int flags); 2395 size_t len, unsigned int flags);
2396 2396
2397 extern void 2397 extern void
2398 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); 2398 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
2399 extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); 2399 extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
2400 extern loff_t no_llseek(struct file *file, loff_t offset, int origin); 2400 extern loff_t no_llseek(struct file *file, loff_t offset, int whence);
2401 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); 2401 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
2402 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, 2402 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
2403 int origin, loff_t maxsize, loff_t eof); 2403 int whence, loff_t maxsize, loff_t eof);
2404 extern int generic_file_open(struct inode * inode, struct file * filp); 2404 extern int generic_file_open(struct inode * inode, struct file * filp);
2405 extern int nonseekable_open(struct inode * inode, struct file * filp); 2405 extern int nonseekable_open(struct inode * inode, struct file * filp);
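
Fitting this commit's theme, a filesystem ->llseek() typically just forwards the whence argument to the size-aware generic helper; a minimal sketch:

static loff_t example_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	return generic_file_llseek_size(file, offset, whence,
					inode->i_sb->s_maxbytes,
					i_size_read(inode));
}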
2406 2406
2407 #ifdef CONFIG_FS_XIP 2407 #ifdef CONFIG_FS_XIP
2408 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, 2408 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
2409 loff_t *ppos); 2409 loff_t *ppos);
2410 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); 2410 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
2411 extern ssize_t xip_file_write(struct file *filp, const char __user *buf, 2411 extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
2412 size_t len, loff_t *ppos); 2412 size_t len, loff_t *ppos);
2413 extern int xip_truncate_page(struct address_space *mapping, loff_t from); 2413 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
2414 #else 2414 #else
2415 static inline int xip_truncate_page(struct address_space *mapping, loff_t from) 2415 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
2416 { 2416 {
2417 return 0; 2417 return 0;
2418 } 2418 }
2419 #endif 2419 #endif
2420 2420
2421 #ifdef CONFIG_BLOCK 2421 #ifdef CONFIG_BLOCK
2422 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, 2422 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
2423 loff_t file_offset); 2423 loff_t file_offset);
2424 2424
2425 enum { 2425 enum {
2426 /* need locking between buffered and direct access */ 2426 /* need locking between buffered and direct access */
2427 DIO_LOCKING = 0x01, 2427 DIO_LOCKING = 0x01,
2428 2428
2429 /* filesystem does not support filling holes */ 2429 /* filesystem does not support filling holes */
2430 DIO_SKIP_HOLES = 0x02, 2430 DIO_SKIP_HOLES = 0x02,
2431 }; 2431 };
2432 2432
2433 void dio_end_io(struct bio *bio, int error); 2433 void dio_end_io(struct bio *bio, int error);
2434 2434
2435 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 2435 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
2436 struct block_device *bdev, const struct iovec *iov, loff_t offset, 2436 struct block_device *bdev, const struct iovec *iov, loff_t offset,
2437 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 2437 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
2438 dio_submit_t submit_io, int flags); 2438 dio_submit_t submit_io, int flags);
2439 2439
2440 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, 2440 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
2441 struct inode *inode, const struct iovec *iov, loff_t offset, 2441 struct inode *inode, const struct iovec *iov, loff_t offset,
2442 unsigned long nr_segs, get_block_t get_block) 2442 unsigned long nr_segs, get_block_t get_block)
2443 { 2443 {
2444 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 2444 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2445 offset, nr_segs, get_block, NULL, NULL, 2445 offset, nr_segs, get_block, NULL, NULL,
2446 DIO_LOCKING | DIO_SKIP_HOLES); 2446 DIO_LOCKING | DIO_SKIP_HOLES);
2447 } 2447 }
2448 #endif 2448 #endif
2449 2449
2450 void inode_dio_wait(struct inode *inode); 2450 void inode_dio_wait(struct inode *inode);
2451 void inode_dio_done(struct inode *inode); 2451 void inode_dio_done(struct inode *inode);
2452 2452
2453 extern const struct file_operations generic_ro_fops; 2453 extern const struct file_operations generic_ro_fops;
2454 2454
2455 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) 2455 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
2456 2456
2457 extern int vfs_readlink(struct dentry *, char __user *, int, const char *); 2457 extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
2458 extern int vfs_follow_link(struct nameidata *, const char *); 2458 extern int vfs_follow_link(struct nameidata *, const char *);
2459 extern int page_readlink(struct dentry *, char __user *, int); 2459 extern int page_readlink(struct dentry *, char __user *, int);
2460 extern void *page_follow_link_light(struct dentry *, struct nameidata *); 2460 extern void *page_follow_link_light(struct dentry *, struct nameidata *);
2461 extern void page_put_link(struct dentry *, struct nameidata *, void *); 2461 extern void page_put_link(struct dentry *, struct nameidata *, void *);
2462 extern int __page_symlink(struct inode *inode, const char *symname, int len, 2462 extern int __page_symlink(struct inode *inode, const char *symname, int len,
2463 int nofs); 2463 int nofs);
2464 extern int page_symlink(struct inode *inode, const char *symname, int len); 2464 extern int page_symlink(struct inode *inode, const char *symname, int len);
2465 extern const struct inode_operations page_symlink_inode_operations; 2465 extern const struct inode_operations page_symlink_inode_operations;
2466 extern int generic_readlink(struct dentry *, char __user *, int); 2466 extern int generic_readlink(struct dentry *, char __user *, int);
2467 extern void generic_fillattr(struct inode *, struct kstat *); 2467 extern void generic_fillattr(struct inode *, struct kstat *);
2468 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 2468 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
2469 void __inode_add_bytes(struct inode *inode, loff_t bytes); 2469 void __inode_add_bytes(struct inode *inode, loff_t bytes);
2470 void inode_add_bytes(struct inode *inode, loff_t bytes); 2470 void inode_add_bytes(struct inode *inode, loff_t bytes);
2471 void inode_sub_bytes(struct inode *inode, loff_t bytes); 2471 void inode_sub_bytes(struct inode *inode, loff_t bytes);
2472 loff_t inode_get_bytes(struct inode *inode); 2472 loff_t inode_get_bytes(struct inode *inode);
2473 void inode_set_bytes(struct inode *inode, loff_t bytes); 2473 void inode_set_bytes(struct inode *inode, loff_t bytes);
2474 2474
2475 extern int vfs_readdir(struct file *, filldir_t, void *); 2475 extern int vfs_readdir(struct file *, filldir_t, void *);
2476 2476
2477 extern int vfs_stat(const char __user *, struct kstat *); 2477 extern int vfs_stat(const char __user *, struct kstat *);
2478 extern int vfs_lstat(const char __user *, struct kstat *); 2478 extern int vfs_lstat(const char __user *, struct kstat *);
2479 extern int vfs_fstat(unsigned int, struct kstat *); 2479 extern int vfs_fstat(unsigned int, struct kstat *);
2480 extern int vfs_fstatat(int, const char __user *, struct kstat *, int); 2480 extern int vfs_fstatat(int, const char __user *, struct kstat *, int);
2481 2481
2482 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, 2482 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
2483 unsigned long arg); 2483 unsigned long arg);
2484 extern int __generic_block_fiemap(struct inode *inode, 2484 extern int __generic_block_fiemap(struct inode *inode,
2485 struct fiemap_extent_info *fieinfo, 2485 struct fiemap_extent_info *fieinfo,
2486 loff_t start, loff_t len, 2486 loff_t start, loff_t len,
2487 get_block_t *get_block); 2487 get_block_t *get_block);
2488 extern int generic_block_fiemap(struct inode *inode, 2488 extern int generic_block_fiemap(struct inode *inode,
2489 struct fiemap_extent_info *fieinfo, u64 start, 2489 struct fiemap_extent_info *fieinfo, u64 start,
2490 u64 len, get_block_t *get_block); 2490 u64 len, get_block_t *get_block);
2491 2491
2492 extern void get_filesystem(struct file_system_type *fs); 2492 extern void get_filesystem(struct file_system_type *fs);
2493 extern void put_filesystem(struct file_system_type *fs); 2493 extern void put_filesystem(struct file_system_type *fs);
2494 extern struct file_system_type *get_fs_type(const char *name); 2494 extern struct file_system_type *get_fs_type(const char *name);
2495 extern struct super_block *get_super(struct block_device *); 2495 extern struct super_block *get_super(struct block_device *);
2496 extern struct super_block *get_super_thawed(struct block_device *); 2496 extern struct super_block *get_super_thawed(struct block_device *);
2497 extern struct super_block *get_active_super(struct block_device *bdev); 2497 extern struct super_block *get_active_super(struct block_device *bdev);
2498 extern void drop_super(struct super_block *sb); 2498 extern void drop_super(struct super_block *sb);
2499 extern void iterate_supers(void (*)(struct super_block *, void *), void *); 2499 extern void iterate_supers(void (*)(struct super_block *, void *), void *);
2500 extern void iterate_supers_type(struct file_system_type *, 2500 extern void iterate_supers_type(struct file_system_type *,
2501 void (*)(struct super_block *, void *), void *); 2501 void (*)(struct super_block *, void *), void *);
2502 2502
2503 extern int dcache_dir_open(struct inode *, struct file *); 2503 extern int dcache_dir_open(struct inode *, struct file *);
2504 extern int dcache_dir_close(struct inode *, struct file *); 2504 extern int dcache_dir_close(struct inode *, struct file *);
2505 extern loff_t dcache_dir_lseek(struct file *, loff_t, int); 2505 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
2506 extern int dcache_readdir(struct file *, void *, filldir_t); 2506 extern int dcache_readdir(struct file *, void *, filldir_t);
2507 extern int simple_setattr(struct dentry *, struct iattr *); 2507 extern int simple_setattr(struct dentry *, struct iattr *);
2508 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); 2508 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
2509 extern int simple_statfs(struct dentry *, struct kstatfs *); 2509 extern int simple_statfs(struct dentry *, struct kstatfs *);
2510 extern int simple_open(struct inode *inode, struct file *file); 2510 extern int simple_open(struct inode *inode, struct file *file);
2511 extern int simple_link(struct dentry *, struct inode *, struct dentry *); 2511 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
2512 extern int simple_unlink(struct inode *, struct dentry *); 2512 extern int simple_unlink(struct inode *, struct dentry *);
2513 extern int simple_rmdir(struct inode *, struct dentry *); 2513 extern int simple_rmdir(struct inode *, struct dentry *);
2514 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); 2514 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
2515 extern int noop_fsync(struct file *, loff_t, loff_t, int); 2515 extern int noop_fsync(struct file *, loff_t, loff_t, int);
2516 extern int simple_empty(struct dentry *); 2516 extern int simple_empty(struct dentry *);
2517 extern int simple_readpage(struct file *file, struct page *page); 2517 extern int simple_readpage(struct file *file, struct page *page);
2518 extern int simple_write_begin(struct file *file, struct address_space *mapping, 2518 extern int simple_write_begin(struct file *file, struct address_space *mapping,
2519 loff_t pos, unsigned len, unsigned flags, 2519 loff_t pos, unsigned len, unsigned flags,
2520 struct page **pagep, void **fsdata); 2520 struct page **pagep, void **fsdata);
2521 extern int simple_write_end(struct file *file, struct address_space *mapping, 2521 extern int simple_write_end(struct file *file, struct address_space *mapping,
2522 loff_t pos, unsigned len, unsigned copied, 2522 loff_t pos, unsigned len, unsigned copied,
2523 struct page *page, void *fsdata); 2523 struct page *page, void *fsdata);
2524 2524
2525 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); 2525 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
2526 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); 2526 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
2527 extern const struct file_operations simple_dir_operations; 2527 extern const struct file_operations simple_dir_operations;
2528 extern const struct inode_operations simple_dir_inode_operations; 2528 extern const struct inode_operations simple_dir_inode_operations;
2529 struct tree_descr { char *name; const struct file_operations *ops; int mode; }; 2529 struct tree_descr { char *name; const struct file_operations *ops; int mode; };
2530 struct dentry *d_alloc_name(struct dentry *, const char *); 2530 struct dentry *d_alloc_name(struct dentry *, const char *);
2531 extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *); 2531 extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *);
2532 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); 2532 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
2533 extern void simple_release_fs(struct vfsmount **mount, int *count); 2533 extern void simple_release_fs(struct vfsmount **mount, int *count);
2534 2534
2535 extern ssize_t simple_read_from_buffer(void __user *to, size_t count, 2535 extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
2536 loff_t *ppos, const void *from, size_t available); 2536 loff_t *ppos, const void *from, size_t available);
2537 extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, 2537 extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
2538 const void __user *from, size_t count); 2538 const void __user *from, size_t count);
2539 2539
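simple_read_from_buffer() copies from a kernel buffer to user space starting at *ppos, advancing *ppos and clamping to the available size, which is exactly what a trivial read file operation needs. A minimal sketch (hypothetical hello_read):

static ssize_t hello_read(struct file *file, char __user *buf,
			  size_t count, loff_t *ppos)
{
	static const char msg[] = "hello\n";

	/* copies at most count bytes from msg + *ppos, advances *ppos */
	return simple_read_from_buffer(buf, count, ppos, msg, sizeof(msg) - 1);
}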
2540 extern int generic_file_fsync(struct file *, loff_t, loff_t, int); 2540 extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
2541 2541
2542 extern int generic_check_addressable(unsigned, u64); 2542 extern int generic_check_addressable(unsigned, u64);
2543 2543
2544 #ifdef CONFIG_MIGRATION 2544 #ifdef CONFIG_MIGRATION
2545 extern int buffer_migrate_page(struct address_space *, 2545 extern int buffer_migrate_page(struct address_space *,
2546 struct page *, struct page *, 2546 struct page *, struct page *,
2547 enum migrate_mode); 2547 enum migrate_mode);
2548 #else 2548 #else
2549 #define buffer_migrate_page NULL 2549 #define buffer_migrate_page NULL
2550 #endif 2550 #endif
2551 2551
2552 extern int inode_change_ok(const struct inode *, struct iattr *); 2552 extern int inode_change_ok(const struct inode *, struct iattr *);
2553 extern int inode_newsize_ok(const struct inode *, loff_t offset); 2553 extern int inode_newsize_ok(const struct inode *, loff_t offset);
2554 extern void setattr_copy(struct inode *inode, const struct iattr *attr); 2554 extern void setattr_copy(struct inode *inode, const struct iattr *attr);
2555 2555
2556 extern int file_update_time(struct file *file); 2556 extern int file_update_time(struct file *file);
2557 2557
2558 extern int generic_show_options(struct seq_file *m, struct dentry *root); 2558 extern int generic_show_options(struct seq_file *m, struct dentry *root);
2559 extern void save_mount_options(struct super_block *sb, char *options); 2559 extern void save_mount_options(struct super_block *sb, char *options);
2560 extern void replace_mount_options(struct super_block *sb, char *options); 2560 extern void replace_mount_options(struct super_block *sb, char *options);
2561 2561
2562 static inline ino_t parent_ino(struct dentry *dentry) 2562 static inline ino_t parent_ino(struct dentry *dentry)
2563 { 2563 {
2564 ino_t res; 2564 ino_t res;
2565 2565
2566 /* 2566 /*
2567 * Don't strictly need d_lock here? If the parent ino could change 2567 * Don't strictly need d_lock here? If the parent ino could change
2568 * then surely we'd have a deeper race in the caller? 2568 * then surely we'd have a deeper race in the caller?
2569 */ 2569 */
2570 spin_lock(&dentry->d_lock); 2570 spin_lock(&dentry->d_lock);
2571 res = dentry->d_parent->d_inode->i_ino; 2571 res = dentry->d_parent->d_inode->i_ino;
2572 spin_unlock(&dentry->d_lock); 2572 spin_unlock(&dentry->d_lock);
2573 return res; 2573 return res;
2574 } 2574 }
2575 2575
2576 /* Transaction based IO helpers */ 2576 /* Transaction based IO helpers */
2577 2577
2578 /* 2578 /*
2579 * An argresp is stored in an allocated page and holds the 2579 * An argresp is stored in an allocated page and holds the
2580 * size of the argument or response, along with its content 2580 * size of the argument or response, along with its content
2581 */ 2581 */
2582 struct simple_transaction_argresp { 2582 struct simple_transaction_argresp {
2583 ssize_t size; 2583 ssize_t size;
2584 char data[0]; 2584 char data[0];
2585 }; 2585 };
2586 2586
2587 #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) 2587 #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))
2588 2588
2589 char *simple_transaction_get(struct file *file, const char __user *buf, 2589 char *simple_transaction_get(struct file *file, const char __user *buf,
2590 size_t size); 2590 size_t size);
2591 ssize_t simple_transaction_read(struct file *file, char __user *buf, 2591 ssize_t simple_transaction_read(struct file *file, char __user *buf,
2592 size_t size, loff_t *pos); 2592 size_t size, loff_t *pos);
2593 int simple_transaction_release(struct inode *inode, struct file *file); 2593 int simple_transaction_release(struct inode *inode, struct file *file);
2594 2594
2595 void simple_transaction_set(struct file *file, size_t n); 2595 void simple_transaction_set(struct file *file, size_t n);
2596 2596
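A write handler typically calls simple_transaction_get() to copy the user buffer into the page-sized argresp, processes the request in place, and records the response size with simple_transaction_set(); a later read then returns the response through simple_transaction_read(). A minimal sketch (hypothetical trans_write, replying "ok" to every request):

static ssize_t trans_write(struct file *file, const char __user *buf,
			   size_t size, loff_t *pos)
{
	char *data = simple_transaction_get(file, buf, size);

	if (IS_ERR(data))
		return PTR_ERR(data);

	/* the request is now in data[]; build the reply in place */
	strcpy(data, "ok");
	simple_transaction_set(file, strlen(data));
	return size;
}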
2597 /* 2597 /*
2598 * simple attribute files 2598 * simple attribute files
2599 * 2599 *
2600 * These attributes behave similarly to those in sysfs: 2600 * These attributes behave similarly to those in sysfs:
2601 * 2601 *
2602 * Writing to an attribute immediately sets a value; an open file can be 2602 * Writing to an attribute immediately sets a value; an open file can be
2603 * written to multiple times. 2603 * written to multiple times.
2604 * 2604 *
2605 * Reading from an attribute creates a buffer from the value that might get 2605 * Reading from an attribute creates a buffer from the value that might get
2606 * read with multiple read calls. When the attribute has been read 2606 * read with multiple read calls. When the attribute has been read
2607 * completely, no further read calls are possible until the file is opened 2607 * completely, no further read calls are possible until the file is opened
2608 * again. 2608 * again.
2609 * 2609 *
2610 * All attributes contain a text representation of a numeric value 2610 * All attributes contain a text representation of a numeric value
2611 * that is accessed with the get() and set() functions. 2611 * that is accessed with the get() and set() functions.
2612 */ 2612 */
2613 #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ 2613 #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \
2614 static int __fops ## _open(struct inode *inode, struct file *file) \ 2614 static int __fops ## _open(struct inode *inode, struct file *file) \
2615 { \ 2615 { \
2616 __simple_attr_check_format(__fmt, 0ull); \ 2616 __simple_attr_check_format(__fmt, 0ull); \
2617 return simple_attr_open(inode, file, __get, __set, __fmt); \ 2617 return simple_attr_open(inode, file, __get, __set, __fmt); \
2618 } \ 2618 } \
2619 static const struct file_operations __fops = { \ 2619 static const struct file_operations __fops = { \
2620 .owner = THIS_MODULE, \ 2620 .owner = THIS_MODULE, \
2621 .open = __fops ## _open, \ 2621 .open = __fops ## _open, \
2622 .release = simple_attr_release, \ 2622 .release = simple_attr_release, \
2623 .read = simple_attr_read, \ 2623 .read = simple_attr_read, \
2624 .write = simple_attr_write, \ 2624 .write = simple_attr_write, \
2625 .llseek = generic_file_llseek, \ 2625 .llseek = generic_file_llseek, \
2626 }; 2626 };
2627 2627
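In practice a user supplies only the get() and set() callbacks and lets the macro generate the file_operations, which is then usually handed to debugfs_create_file(). A minimal sketch (hypothetical threshold attribute):

static u64 threshold;

static int threshold_get(void *data, u64 *val)
{
	*val = threshold;
	return 0;
}

static int threshold_set(void *data, u64 val)
{
	threshold = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(threshold_fops, threshold_get, threshold_set, "%llu\n");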
2628 static inline __printf(1, 2) 2628 static inline __printf(1, 2)
2629 void __simple_attr_check_format(const char *fmt, ...) 2629 void __simple_attr_check_format(const char *fmt, ...)
2630 { 2630 {
2631 /* don't do anything, just let the compiler check the arguments; */ 2631 /* don't do anything, just let the compiler check the arguments; */
2632 } 2632 }
2633 2633
2634 int simple_attr_open(struct inode *inode, struct file *file, 2634 int simple_attr_open(struct inode *inode, struct file *file,
2635 int (*get)(void *, u64 *), int (*set)(void *, u64), 2635 int (*get)(void *, u64 *), int (*set)(void *, u64),
2636 const char *fmt); 2636 const char *fmt);
2637 int simple_attr_release(struct inode *inode, struct file *file); 2637 int simple_attr_release(struct inode *inode, struct file *file);
2638 ssize_t simple_attr_read(struct file *file, char __user *buf, 2638 ssize_t simple_attr_read(struct file *file, char __user *buf,
2639 size_t len, loff_t *ppos); 2639 size_t len, loff_t *ppos);
2640 ssize_t simple_attr_write(struct file *file, const char __user *buf, 2640 ssize_t simple_attr_write(struct file *file, const char __user *buf,
2641 size_t len, loff_t *ppos); 2641 size_t len, loff_t *ppos);
2642 2642
2643 struct ctl_table; 2643 struct ctl_table;
2644 int proc_nr_files(struct ctl_table *table, int write, 2644 int proc_nr_files(struct ctl_table *table, int write,
2645 void __user *buffer, size_t *lenp, loff_t *ppos); 2645 void __user *buffer, size_t *lenp, loff_t *ppos);
2646 int proc_nr_dentry(struct ctl_table *table, int write, 2646 int proc_nr_dentry(struct ctl_table *table, int write,
2647 void __user *buffer, size_t *lenp, loff_t *ppos); 2647 void __user *buffer, size_t *lenp, loff_t *ppos);
2648 int proc_nr_inodes(struct ctl_table *table, int write, 2648 int proc_nr_inodes(struct ctl_table *table, int write,
2649 void __user *buffer, size_t *lenp, loff_t *ppos); 2649 void __user *buffer, size_t *lenp, loff_t *ppos);
2650 int __init get_filesystem_list(char *buf); 2650 int __init get_filesystem_list(char *buf);
2651 2651
2652 #define __FMODE_EXEC ((__force int) FMODE_EXEC) 2652 #define __FMODE_EXEC ((__force int) FMODE_EXEC)
2653 #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) 2653 #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY)
2654 2654
2655 #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) 2655 #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
2656 #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ 2656 #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
2657 (flag & __FMODE_NONOTIFY))) 2657 (flag & __FMODE_NONOTIFY)))
2658 2658
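ACC_MODE() treats the octal string as a four-entry lookup table indexed by the O_ACCMODE bits of the open flags:

/*
 * ACC_MODE(O_RDONLY) == 04  (MAY_READ)
 * ACC_MODE(O_WRONLY) == 02  (MAY_WRITE)
 * ACC_MODE(O_RDWR)   == 06  (MAY_READ | MAY_WRITE)
 * index 3 is an invalid open mode but maps to 06 as well
 */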
2659 static inline int is_sxid(umode_t mode) 2659 static inline int is_sxid(umode_t mode)
2660 { 2660 {
2661 return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); 2661 return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
2662 } 2662 }
2663 2663
2664 static inline void inode_has_no_xattr(struct inode *inode) 2664 static inline void inode_has_no_xattr(struct inode *inode)
2665 { 2665 {
2666 if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) 2666 if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC))
2667 inode->i_flags |= S_NOSEC; 2667 inode->i_flags |= S_NOSEC;
2668 } 2668 }
2669 2669
2670 #endif /* _LINUX_FS_H */ 2670 #endif /* _LINUX_FS_H */
2671 2671
include/linux/ftrace.h
1 /* 1 /*
2 * Ftrace header. For implementation details beyond the random comments 2 * Ftrace header. For implementation details beyond the random comments
3 * scattered below, see: Documentation/trace/ftrace-design.txt 3 * scattered below, see: Documentation/trace/ftrace-design.txt
4 */ 4 */
5 5
6 #ifndef _LINUX_FTRACE_H 6 #ifndef _LINUX_FTRACE_H
7 #define _LINUX_FTRACE_H 7 #define _LINUX_FTRACE_H
8 8
9 #include <linux/trace_clock.h> 9 #include <linux/trace_clock.h>
10 #include <linux/kallsyms.h> 10 #include <linux/kallsyms.h>
11 #include <linux/linkage.h> 11 #include <linux/linkage.h>
12 #include <linux/bitops.h> 12 #include <linux/bitops.h>
13 #include <linux/ptrace.h> 13 #include <linux/ptrace.h>
14 #include <linux/ktime.h> 14 #include <linux/ktime.h>
15 #include <linux/sched.h> 15 #include <linux/sched.h>
16 #include <linux/types.h> 16 #include <linux/types.h>
17 #include <linux/init.h> 17 #include <linux/init.h>
18 #include <linux/fs.h> 18 #include <linux/fs.h>
19 19
20 #include <asm/ftrace.h> 20 #include <asm/ftrace.h>
21 21
22 /* 22 /*
23 * If the arch supports passing the variable contents of 23 * If the arch supports passing the variable contents of
24 * function_trace_op as the third parameter back from the 24 * function_trace_op as the third parameter back from the
25 * mcount call, then the arch should define this as 1. 25 * mcount call, then the arch should define this as 1.
26 */ 26 */
27 #ifndef ARCH_SUPPORTS_FTRACE_OPS 27 #ifndef ARCH_SUPPORTS_FTRACE_OPS
28 #define ARCH_SUPPORTS_FTRACE_OPS 0 28 #define ARCH_SUPPORTS_FTRACE_OPS 0
29 #endif 29 #endif
30 30
31 /* 31 /*
32 * If the arch's mcount caller does not support all of ftrace's 32 * If the arch's mcount caller does not support all of ftrace's
33 * features, then it must call an indirect function that 33 * features, then it must call an indirect function that
34 * does, or at least does enough to prevent any unwelcome side effects. 34 * does, or at least does enough to prevent any unwelcome side effects.
35 */ 35 */
36 #if !defined(CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST) || \ 36 #if !defined(CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST) || \
37 !ARCH_SUPPORTS_FTRACE_OPS 37 !ARCH_SUPPORTS_FTRACE_OPS
38 # define FTRACE_FORCE_LIST_FUNC 1 38 # define FTRACE_FORCE_LIST_FUNC 1
39 #else 39 #else
40 # define FTRACE_FORCE_LIST_FUNC 0 40 # define FTRACE_FORCE_LIST_FUNC 0
41 #endif 41 #endif
42 42
43 43
44 struct module; 44 struct module;
45 struct ftrace_hash; 45 struct ftrace_hash;
46 46
47 #ifdef CONFIG_FUNCTION_TRACER 47 #ifdef CONFIG_FUNCTION_TRACER
48 48
49 extern int ftrace_enabled; 49 extern int ftrace_enabled;
50 extern int 50 extern int
51 ftrace_enable_sysctl(struct ctl_table *table, int write, 51 ftrace_enable_sysctl(struct ctl_table *table, int write,
52 void __user *buffer, size_t *lenp, 52 void __user *buffer, size_t *lenp,
53 loff_t *ppos); 53 loff_t *ppos);
54 54
55 struct ftrace_ops; 55 struct ftrace_ops;
56 56
57 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, 57 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
58 struct ftrace_ops *op, struct pt_regs *regs); 58 struct ftrace_ops *op, struct pt_regs *regs);
59 59
60 /* 60 /*
61 * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are 61 * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are
62 * set in the flags member. 62 * set in the flags member.
63 * 63 *
64 * ENABLED - set/unset when ftrace_ops is registered/unregistered 64 * ENABLED - set/unset when ftrace_ops is registered/unregistered
65 * GLOBAL - set manually by ftrace_ops user to denote the ftrace_ops 65 * GLOBAL - set manually by ftrace_ops user to denote the ftrace_ops
66 * is part of the global tracers sharing the same filter 66 * is part of the global tracers sharing the same filter
67 * via set_ftrace_* debugfs files. 67 * via set_ftrace_* debugfs files.
68 * DYNAMIC - set when ftrace_ops is registered to denote dynamically 68 * DYNAMIC - set when ftrace_ops is registered to denote dynamically
69 * allocated ftrace_ops which need special care 69 * allocated ftrace_ops which need special care
70 * CONTROL - set manually by ftrace_ops user to denote the ftrace_ops 70 * CONTROL - set manually by ftrace_ops user to denote the ftrace_ops
71 * could be controlled by the following calls: 71 * could be controlled by the following calls:
72 * ftrace_function_local_enable 72 * ftrace_function_local_enable
73 * ftrace_function_local_disable 73 * ftrace_function_local_disable
74 * SAVE_REGS - The ftrace_ops wants regs saved at each function called 74 * SAVE_REGS - The ftrace_ops wants regs saved at each function called
75 * and passed to the callback. If this flag is set, but the 75 * and passed to the callback. If this flag is set, but the
76 * architecture does not support passing regs 76 * architecture does not support passing regs
77 * (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the 77 * (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the
78 * ftrace_ops will fail to register, unless the next flag 78 * ftrace_ops will fail to register, unless the next flag
79 * is set. 79 * is set.
80 * SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the 80 * SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the
81 * handler can handle an arch that does not save regs 81 * handler can handle an arch that does not save regs
82 * (the handler tests if regs == NULL), then it can set 82 * (the handler tests if regs == NULL), then it can set
83 * this flag instead. It will not fail registering the ftrace_ops, 83 * this flag instead. It will not fail registering the ftrace_ops,
84 * but the regs field will be NULL if the arch does not support 84 * but the regs field will be NULL if the arch does not support
85 * passing regs to the handler. 85 * passing regs to the handler.
86 * Note, if this flag is set, the SAVE_REGS flag will automatically 86 * Note, if this flag is set, the SAVE_REGS flag will automatically
87 * get set upon registering the ftrace_ops, if the arch supports it. 87 * get set upon registering the ftrace_ops, if the arch supports it.
88 * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure 88 * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure
89 * that the callback has its own recursion protection. If it does 89 * that the callback has its own recursion protection. If it does
90 * not set this, then the ftrace infrastructure will add recursion 90 * not set this, then the ftrace infrastructure will add recursion
91 * protection for the caller. 91 * protection for the caller.
92 */ 92 */
93 enum { 93 enum {
94 FTRACE_OPS_FL_ENABLED = 1 << 0, 94 FTRACE_OPS_FL_ENABLED = 1 << 0,
95 FTRACE_OPS_FL_GLOBAL = 1 << 1, 95 FTRACE_OPS_FL_GLOBAL = 1 << 1,
96 FTRACE_OPS_FL_DYNAMIC = 1 << 2, 96 FTRACE_OPS_FL_DYNAMIC = 1 << 2,
97 FTRACE_OPS_FL_CONTROL = 1 << 3, 97 FTRACE_OPS_FL_CONTROL = 1 << 3,
98 FTRACE_OPS_FL_SAVE_REGS = 1 << 4, 98 FTRACE_OPS_FL_SAVE_REGS = 1 << 4,
99 FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, 99 FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5,
100 FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, 100 FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6,
101 }; 101 };
102 102
103 struct ftrace_ops { 103 struct ftrace_ops {
104 ftrace_func_t func; 104 ftrace_func_t func;
105 struct ftrace_ops *next; 105 struct ftrace_ops *next;
106 unsigned long flags; 106 unsigned long flags;
107 int __percpu *disabled; 107 int __percpu *disabled;
108 #ifdef CONFIG_DYNAMIC_FTRACE 108 #ifdef CONFIG_DYNAMIC_FTRACE
109 struct ftrace_hash *notrace_hash; 109 struct ftrace_hash *notrace_hash;
110 struct ftrace_hash *filter_hash; 110 struct ftrace_hash *filter_hash;
111 #endif 111 #endif
112 }; 112 };
113 113
114 extern int function_trace_stop; 114 extern int function_trace_stop;
115 115
116 /* 116 /*
117 * Type of the current tracing. 117 * Type of the current tracing.
118 */ 118 */
119 enum ftrace_tracing_type_t { 119 enum ftrace_tracing_type_t {
120 FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */ 120 FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */
121 FTRACE_TYPE_RETURN, /* Hook the return of the function */ 121 FTRACE_TYPE_RETURN, /* Hook the return of the function */
122 }; 122 };
123 123
124 /* Current tracing type, default is FTRACE_TYPE_ENTER */ 124 /* Current tracing type, default is FTRACE_TYPE_ENTER */
125 extern enum ftrace_tracing_type_t ftrace_tracing_type; 125 extern enum ftrace_tracing_type_t ftrace_tracing_type;
126 126
127 /** 127 /**
128 * ftrace_stop - stop function tracer. 128 * ftrace_stop - stop function tracer.
129 * 129 *
130 * A quick way to stop the function tracer. Note this is an on/off switch; 130 * A quick way to stop the function tracer. Note this is an on/off switch;
131 * it is not something that is recursive like preempt_disable. 131 * it is not something that is recursive like preempt_disable.
132 * This does not disable the calling of mcount, it only stops the 132 * This does not disable the calling of mcount, it only stops the
133 * calling of functions from mcount. 133 * calling of functions from mcount.
134 */ 134 */
135 static inline void ftrace_stop(void) 135 static inline void ftrace_stop(void)
136 { 136 {
137 function_trace_stop = 1; 137 function_trace_stop = 1;
138 } 138 }
139 139
140 /** 140 /**
141 * ftrace_start - start the function tracer. 141 * ftrace_start - start the function tracer.
142 * 142 *
143 * This function is the inverse of ftrace_stop. This does not enable 143 * This function is the inverse of ftrace_stop. This does not enable
144 * function tracing if the function tracer is disabled. This only 144 * function tracing if the function tracer is disabled. This only
145 * sets the function tracer flag to continue calling the functions 145 * sets the function tracer flag to continue calling the functions
146 * from mcount. 146 * from mcount.
147 */ 147 */
148 static inline void ftrace_start(void) 148 static inline void ftrace_start(void)
149 { 149 {
150 function_trace_stop = 0; 150 function_trace_stop = 0;
151 } 151 }
152 152
153 /* 153 /*
154 * The ftrace_ops must be static and should also 154 * The ftrace_ops must be static and should also
155 * be read_mostly. These functions do modify read_mostly variables 155 * be read_mostly. These functions do modify read_mostly variables
156 * so use them sparingly. Never free an ftrace_ops or modify the 156 * so use them sparingly. Never free an ftrace_ops or modify the
157 * next pointer after it has been registered. Even after unregistering 157 * next pointer after it has been registered. Even after unregistering
158 * it, the next pointer may still be used internally. 158 * it, the next pointer may still be used internally.
159 */ 159 */
160 int register_ftrace_function(struct ftrace_ops *ops); 160 int register_ftrace_function(struct ftrace_ops *ops);
161 int unregister_ftrace_function(struct ftrace_ops *ops); 161 int unregister_ftrace_function(struct ftrace_ops *ops);
162 void clear_ftrace_function(void); 162 void clear_ftrace_function(void);
163 163
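Registering a callback therefore looks like the sketch below, with the ftrace_ops in static storage and marked __read_mostly (hypothetical sample_ops; the callback signature matches ftrace_func_t above):

static void sample_trace_func(unsigned long ip, unsigned long parent_ip,
			      struct ftrace_ops *op, struct pt_regs *regs)
{
	/* called on entry to every traced function */
}

static struct ftrace_ops sample_ops __read_mostly = {
	.func	= sample_trace_func,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};

/* in init code: register_ftrace_function(&sample_ops); */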
164 /** 164 /**
165 * ftrace_function_local_enable - enable controlled ftrace_ops on current cpu 165 * ftrace_function_local_enable - enable controlled ftrace_ops on current cpu
166 * 166 *
167 * This function enables tracing on current cpu by decreasing 167 * This function enables tracing on current cpu by decreasing
168 * the per cpu control variable. 168 * the per cpu control variable.
169 * It must be called with preemption disabled and only on ftrace_ops 169 * It must be called with preemption disabled and only on ftrace_ops
170 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption 170 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption
171 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. 171 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
172 */ 172 */
173 static inline void ftrace_function_local_enable(struct ftrace_ops *ops) 173 static inline void ftrace_function_local_enable(struct ftrace_ops *ops)
174 { 174 {
175 if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) 175 if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)))
176 return; 176 return;
177 177
178 (*this_cpu_ptr(ops->disabled))--; 178 (*this_cpu_ptr(ops->disabled))--;
179 } 179 }
180 180
181 /** 181 /**
182 * ftrace_function_local_disable - disable controlled ftrace_ops on current cpu 182 * ftrace_function_local_disable - disable controlled ftrace_ops on current cpu
183 * 183 *
184 * This function disables tracing on current cpu by increasing 184 * This function disables tracing on current cpu by increasing
185 * the per cpu control variable. 185 * the per cpu control variable.
186 * It must be called with preemption disabled and only on ftrace_ops 186 * It must be called with preemption disabled and only on ftrace_ops
187 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption 187 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption
188 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. 188 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
189 */ 189 */
190 static inline void ftrace_function_local_disable(struct ftrace_ops *ops) 190 static inline void ftrace_function_local_disable(struct ftrace_ops *ops)
191 { 191 {
192 if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) 192 if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)))
193 return; 193 return;
194 194
195 (*this_cpu_ptr(ops->disabled))++; 195 (*this_cpu_ptr(ops->disabled))++;
196 } 196 }
197 197
198 /** 198 /**
199 * ftrace_function_local_disabled - returns ftrace_ops disabled value 199 * ftrace_function_local_disabled - returns ftrace_ops disabled value
200 * on current cpu 200 * on current cpu
201 * 201 *
202 * This function returns value of ftrace_ops::disabled on current cpu. 202 * This function returns value of ftrace_ops::disabled on current cpu.
203 * It must be called with preemption disabled and only on ftrace_ops 203 * It must be called with preemption disabled and only on ftrace_ops
204 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption 204 * registered with FTRACE_OPS_FL_CONTROL. If called without preemption
205 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. 205 * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
206 */ 206 */
207 static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) 207 static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
208 { 208 {
209 WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)); 209 WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL));
210 return *this_cpu_ptr(ops->disabled); 210 return *this_cpu_ptr(ops->disabled);
211 } 211 }
212 212
213 extern void ftrace_stub(unsigned long a0, unsigned long a1, 213 extern void ftrace_stub(unsigned long a0, unsigned long a1,
214 struct ftrace_ops *op, struct pt_regs *regs); 214 struct ftrace_ops *op, struct pt_regs *regs);
215 215
216 #else /* !CONFIG_FUNCTION_TRACER */ 216 #else /* !CONFIG_FUNCTION_TRACER */
217 /* 217 /*
218 * (un)register_ftrace_function must be a macro since the ops parameter 218 * (un)register_ftrace_function must be a macro since the ops parameter
219 * must not be evaluated. 219 * must not be evaluated.
220 */ 220 */
221 #define register_ftrace_function(ops) ({ 0; }) 221 #define register_ftrace_function(ops) ({ 0; })
222 #define unregister_ftrace_function(ops) ({ 0; }) 222 #define unregister_ftrace_function(ops) ({ 0; })
223 static inline int ftrace_nr_registered_ops(void) 223 static inline int ftrace_nr_registered_ops(void)
224 { 224 {
225 return 0; 225 return 0;
226 } 226 }
227 static inline void clear_ftrace_function(void) { } 227 static inline void clear_ftrace_function(void) { }
228 static inline void ftrace_kill(void) { } 228 static inline void ftrace_kill(void) { }
229 static inline void ftrace_stop(void) { } 229 static inline void ftrace_stop(void) { }
230 static inline void ftrace_start(void) { } 230 static inline void ftrace_start(void) { }
231 #endif /* CONFIG_FUNCTION_TRACER */ 231 #endif /* CONFIG_FUNCTION_TRACER */
232 232
233 #ifdef CONFIG_STACK_TRACER 233 #ifdef CONFIG_STACK_TRACER
234 extern int stack_tracer_enabled; 234 extern int stack_tracer_enabled;
235 int 235 int
236 stack_trace_sysctl(struct ctl_table *table, int write, 236 stack_trace_sysctl(struct ctl_table *table, int write,
237 void __user *buffer, size_t *lenp, 237 void __user *buffer, size_t *lenp,
238 loff_t *ppos); 238 loff_t *ppos);
239 #endif 239 #endif
240 240
241 struct ftrace_func_command { 241 struct ftrace_func_command {
242 struct list_head list; 242 struct list_head list;
243 char *name; 243 char *name;
244 int (*func)(struct ftrace_hash *hash, 244 int (*func)(struct ftrace_hash *hash,
245 char *func, char *cmd, 245 char *func, char *cmd,
246 char *params, int enable); 246 char *params, int enable);
247 }; 247 };
248 248
249 #ifdef CONFIG_DYNAMIC_FTRACE 249 #ifdef CONFIG_DYNAMIC_FTRACE
250 250
251 int ftrace_arch_code_modify_prepare(void); 251 int ftrace_arch_code_modify_prepare(void);
252 int ftrace_arch_code_modify_post_process(void); 252 int ftrace_arch_code_modify_post_process(void);
253 253
254 void ftrace_bug(int err, unsigned long ip); 254 void ftrace_bug(int err, unsigned long ip);
255 255
256 struct seq_file; 256 struct seq_file;
257 257
258 struct ftrace_probe_ops { 258 struct ftrace_probe_ops {
259 void (*func)(unsigned long ip, 259 void (*func)(unsigned long ip,
260 unsigned long parent_ip, 260 unsigned long parent_ip,
261 void **data); 261 void **data);
262 int (*callback)(unsigned long ip, void **data); 262 int (*callback)(unsigned long ip, void **data);
263 void (*free)(void **data); 263 void (*free)(void **data);
264 int (*print)(struct seq_file *m, 264 int (*print)(struct seq_file *m,
265 unsigned long ip, 265 unsigned long ip,
266 struct ftrace_probe_ops *ops, 266 struct ftrace_probe_ops *ops,
267 void *data); 267 void *data);
268 }; 268 };
269 269
270 extern int 270 extern int
271 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 271 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
272 void *data); 272 void *data);
273 extern void 273 extern void
274 unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 274 unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
275 void *data); 275 void *data);
276 extern void 276 extern void
277 unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); 277 unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops);
278 extern void unregister_ftrace_function_probe_all(char *glob); 278 extern void unregister_ftrace_function_probe_all(char *glob);
279 279
280 extern int ftrace_text_reserved(void *start, void *end); 280 extern int ftrace_text_reserved(void *start, void *end);
281 281
282 extern int ftrace_nr_registered_ops(void); 282 extern int ftrace_nr_registered_ops(void);
283 283
284 /* 284 /*
285 * The dyn_ftrace record's flags field is split into two parts. 285 * The dyn_ftrace record's flags field is split into two parts.
286 * The first part, which is '0-FTRACE_REF_MAX', is a counter of 286 * The first part, which is '0-FTRACE_REF_MAX', is a counter of
287 * the number of callbacks that have registered the function that 287 * the number of callbacks that have registered the function that
288 * the dyn_ftrace descriptor represents. 288 * the dyn_ftrace descriptor represents.
289 * 289 *
290 * The second part is a mask: 290 * The second part is a mask:
291 * ENABLED - the function is being traced 291 * ENABLED - the function is being traced
292 * REGS - the record wants the function to save regs 292 * REGS - the record wants the function to save regs
293 * REGS_EN - the function is set up to save regs. 293 * REGS_EN - the function is set up to save regs.
294 * 294 *
295 * When a new ftrace_ops is registered and wants a function to save 295 * When a new ftrace_ops is registered and wants a function to save
296 * pt_regs, the rec->flag REGS is set. When the function has been 296 * pt_regs, the rec->flag REGS is set. When the function has been
297 * set up to save regs, the REGS_EN flag is set. Once a function 297 * set up to save regs, the REGS_EN flag is set. Once a function
298 * starts saving regs it will do so until all ftrace_ops are removed 298 * starts saving regs it will do so until all ftrace_ops are removed
299 * from tracing that function. 299 * from tracing that function.
300 */ 300 */
301 enum { 301 enum {
302 FTRACE_FL_ENABLED = (1UL << 29), 302 FTRACE_FL_ENABLED = (1UL << 29),
303 FTRACE_FL_REGS = (1UL << 30), 303 FTRACE_FL_REGS = (1UL << 30),
304 FTRACE_FL_REGS_EN = (1UL << 31) 304 FTRACE_FL_REGS_EN = (1UL << 31)
305 }; 305 };
306 306
307 #define FTRACE_FL_MASK (0x7UL << 29) 307 #define FTRACE_FL_MASK (0x7UL << 29)
308 #define FTRACE_REF_MAX ((1UL << 29) - 1) 308 #define FTRACE_REF_MAX ((1UL << 29) - 1)
309 309
310 struct dyn_ftrace { 310 struct dyn_ftrace {
311 union { 311 union {
312 unsigned long ip; /* address of mcount call-site */ 312 unsigned long ip; /* address of mcount call-site */
313 struct dyn_ftrace *freelist; 313 struct dyn_ftrace *freelist;
314 }; 314 };
315 unsigned long flags; 315 unsigned long flags;
316 struct dyn_arch_ftrace arch; 316 struct dyn_arch_ftrace arch;
317 }; 317 };
318 318
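The split means the low 29 bits and the high three bits are read with different masks; hypothetical helpers illustrating the layout described above:

static inline unsigned long sample_rec_count(struct dyn_ftrace *rec)
{
	return rec->flags & FTRACE_REF_MAX;	/* bits 0-28: callback count */
}

static inline bool sample_rec_enabled(struct dyn_ftrace *rec)
{
	return rec->flags & FTRACE_FL_ENABLED;	/* bit 29: being traced */
}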
319 int ftrace_force_update(void); 319 int ftrace_force_update(void);
320 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, 320 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
321 int remove, int reset); 321 int remove, int reset);
322 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 322 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
323 int len, int reset); 323 int len, int reset);
324 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 324 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
325 int len, int reset); 325 int len, int reset);
326 void ftrace_set_global_filter(unsigned char *buf, int len, int reset); 326 void ftrace_set_global_filter(unsigned char *buf, int len, int reset);
327 void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); 327 void ftrace_set_global_notrace(unsigned char *buf, int len, int reset);
328 void ftrace_free_filter(struct ftrace_ops *ops); 328 void ftrace_free_filter(struct ftrace_ops *ops);
329 329
330 int register_ftrace_command(struct ftrace_func_command *cmd); 330 int register_ftrace_command(struct ftrace_func_command *cmd);
331 int unregister_ftrace_command(struct ftrace_func_command *cmd); 331 int unregister_ftrace_command(struct ftrace_func_command *cmd);
332 332
333 enum { 333 enum {
334 FTRACE_UPDATE_CALLS = (1 << 0), 334 FTRACE_UPDATE_CALLS = (1 << 0),
335 FTRACE_DISABLE_CALLS = (1 << 1), 335 FTRACE_DISABLE_CALLS = (1 << 1),
336 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 336 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
337 FTRACE_START_FUNC_RET = (1 << 3), 337 FTRACE_START_FUNC_RET = (1 << 3),
338 FTRACE_STOP_FUNC_RET = (1 << 4), 338 FTRACE_STOP_FUNC_RET = (1 << 4),
339 }; 339 };
340 340
341 /* 341 /*
342 * The FTRACE_UPDATE_* enum is used to pass information back 342 * The FTRACE_UPDATE_* enum is used to pass information back
343 * from the ftrace_update_record() and ftrace_test_record() 343 * from the ftrace_update_record() and ftrace_test_record()
344 * functions. These are called by the code update routines 344 * functions. These are called by the code update routines
345 * to find out what is to be done for a given function. 345 * to find out what is to be done for a given function.
346 * 346 *
347 * IGNORE - The function is already what we want it to be 347 * IGNORE - The function is already what we want it to be
348 * MAKE_CALL - Start tracing the function 348 * MAKE_CALL - Start tracing the function
349 * MODIFY_CALL - Stop saving regs for the function 349 * MODIFY_CALL - Stop saving regs for the function
350 * MODIFY_CALL_REGS - Start saving regs for the function 350 * MODIFY_CALL_REGS - Start saving regs for the function
351 * MAKE_NOP - Stop tracing the function 351 * MAKE_NOP - Stop tracing the function
352 */ 352 */
353 enum { 353 enum {
354 FTRACE_UPDATE_IGNORE, 354 FTRACE_UPDATE_IGNORE,
355 FTRACE_UPDATE_MAKE_CALL, 355 FTRACE_UPDATE_MAKE_CALL,
356 FTRACE_UPDATE_MODIFY_CALL, 356 FTRACE_UPDATE_MODIFY_CALL,
357 FTRACE_UPDATE_MODIFY_CALL_REGS, 357 FTRACE_UPDATE_MODIFY_CALL_REGS,
358 FTRACE_UPDATE_MAKE_NOP, 358 FTRACE_UPDATE_MAKE_NOP,
359 }; 359 };
360 360
361 enum { 361 enum {
362 FTRACE_ITER_FILTER = (1 << 0), 362 FTRACE_ITER_FILTER = (1 << 0),
363 FTRACE_ITER_NOTRACE = (1 << 1), 363 FTRACE_ITER_NOTRACE = (1 << 1),
364 FTRACE_ITER_PRINTALL = (1 << 2), 364 FTRACE_ITER_PRINTALL = (1 << 2),
365 FTRACE_ITER_DO_HASH = (1 << 3), 365 FTRACE_ITER_DO_HASH = (1 << 3),
366 FTRACE_ITER_HASH = (1 << 4), 366 FTRACE_ITER_HASH = (1 << 4),
367 FTRACE_ITER_ENABLED = (1 << 5), 367 FTRACE_ITER_ENABLED = (1 << 5),
368 }; 368 };
369 369
370 void arch_ftrace_update_code(int command); 370 void arch_ftrace_update_code(int command);
371 371
372 struct ftrace_rec_iter; 372 struct ftrace_rec_iter;
373 373
374 struct ftrace_rec_iter *ftrace_rec_iter_start(void); 374 struct ftrace_rec_iter *ftrace_rec_iter_start(void);
375 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter); 375 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter);
376 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); 376 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
377 377
378 #define for_ftrace_rec_iter(iter) \ 378 #define for_ftrace_rec_iter(iter) \
379 for (iter = ftrace_rec_iter_start(); \ 379 for (iter = ftrace_rec_iter_start(); \
380 iter; \ 380 iter; \
381 iter = ftrace_rec_iter_next(iter)) 381 iter = ftrace_rec_iter_next(iter))
382 382
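Typical use of the iterator walks every dyn_ftrace record; a minimal sketch:

struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;

for_ftrace_rec_iter(iter) {
	rec = ftrace_rec_iter_record(iter);
	/* inspect rec->ip, the address of the mcount call site */
}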
383 383
384 int ftrace_update_record(struct dyn_ftrace *rec, int enable); 384 int ftrace_update_record(struct dyn_ftrace *rec, int enable);
385 int ftrace_test_record(struct dyn_ftrace *rec, int enable); 385 int ftrace_test_record(struct dyn_ftrace *rec, int enable);
386 void ftrace_run_stop_machine(int command); 386 void ftrace_run_stop_machine(int command);
387 unsigned long ftrace_location(unsigned long ip); 387 unsigned long ftrace_location(unsigned long ip);
388 388
389 extern ftrace_func_t ftrace_trace_function; 389 extern ftrace_func_t ftrace_trace_function;
390 390
391 int ftrace_regex_open(struct ftrace_ops *ops, int flag, 391 int ftrace_regex_open(struct ftrace_ops *ops, int flag,
392 struct inode *inode, struct file *file); 392 struct inode *inode, struct file *file);
393 ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, 393 ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
394 size_t cnt, loff_t *ppos); 394 size_t cnt, loff_t *ppos);
395 ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, 395 ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
396 size_t cnt, loff_t *ppos); 396 size_t cnt, loff_t *ppos);
397 loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); 397 loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence);
398 int ftrace_regex_release(struct inode *inode, struct file *file); 398 int ftrace_regex_release(struct inode *inode, struct file *file);
399 399
400 void __init 400 void __init
401 ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); 401 ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable);
402 402
403 /* defined in arch */ 403 /* defined in arch */
404 extern int ftrace_ip_converted(unsigned long ip); 404 extern int ftrace_ip_converted(unsigned long ip);
405 extern int ftrace_dyn_arch_init(void *data); 405 extern int ftrace_dyn_arch_init(void *data);
406 extern void ftrace_replace_code(int enable); 406 extern void ftrace_replace_code(int enable);
407 extern int ftrace_update_ftrace_func(ftrace_func_t func); 407 extern int ftrace_update_ftrace_func(ftrace_func_t func);
408 extern void ftrace_caller(void); 408 extern void ftrace_caller(void);
409 extern void ftrace_regs_caller(void); 409 extern void ftrace_regs_caller(void);
410 extern void ftrace_call(void); 410 extern void ftrace_call(void);
411 extern void ftrace_regs_call(void); 411 extern void ftrace_regs_call(void);
412 extern void mcount_call(void); 412 extern void mcount_call(void);
413 413
414 void ftrace_modify_all_code(int command); 414 void ftrace_modify_all_code(int command);
415 415
416 #ifndef FTRACE_ADDR 416 #ifndef FTRACE_ADDR
417 #define FTRACE_ADDR ((unsigned long)ftrace_caller) 417 #define FTRACE_ADDR ((unsigned long)ftrace_caller)
418 #endif 418 #endif
419 419
420 #ifndef FTRACE_REGS_ADDR 420 #ifndef FTRACE_REGS_ADDR
421 #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 421 #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
422 # define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller) 422 # define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller)
423 #else 423 #else
424 # define FTRACE_REGS_ADDR FTRACE_ADDR 424 # define FTRACE_REGS_ADDR FTRACE_ADDR
425 #endif 425 #endif
426 #endif 426 #endif
427 427
428 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 428 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
429 extern void ftrace_graph_caller(void); 429 extern void ftrace_graph_caller(void);
430 extern int ftrace_enable_ftrace_graph_caller(void); 430 extern int ftrace_enable_ftrace_graph_caller(void);
431 extern int ftrace_disable_ftrace_graph_caller(void); 431 extern int ftrace_disable_ftrace_graph_caller(void);
432 #else 432 #else
433 static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; } 433 static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; }
434 static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } 434 static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
435 #endif 435 #endif
436 436
437 /** 437 /**
438 * ftrace_make_nop - convert code into nop 438 * ftrace_make_nop - convert code into nop
439 * @mod: module structure if called by module load initialization 439 * @mod: module structure if called by module load initialization
440 * @rec: the mcount call site record 440 * @rec: the mcount call site record
441 * @addr: the address that the call site should be calling 441 * @addr: the address that the call site should be calling
442 * 442 *
443 * This is a very sensitive operation and great care needs 443 * This is a very sensitive operation and great care needs
444 * to be taken by the arch. The operation should carefully 444 * to be taken by the arch. The operation should carefully
445 * read the location, check to see if what is read is indeed 445 * read the location, check to see if what is read is indeed
446 * what we expect it to be, and then on success of the compare, 446 * what we expect it to be, and then on success of the compare,
447 * it should write to the location. 447 * it should write to the location.
448 * 448 *
449 * The code segment at @rec->ip should be a caller to @addr 449 * The code segment at @rec->ip should be a caller to @addr
450 * 450 *
451 * Return must be: 451 * Return must be:
452 * 0 on success 452 * 0 on success
453 * -EFAULT on error reading the location 453 * -EFAULT on error reading the location
454 * -EINVAL on a failed compare of the contents 454 * -EINVAL on a failed compare of the contents
455 * -EPERM on error writing to the location 455 * -EPERM on error writing to the location
456 * Any other value will be considered a failure. 456 * Any other value will be considered a failure.
457 */ 457 */
458 extern int ftrace_make_nop(struct module *mod, 458 extern int ftrace_make_nop(struct module *mod,
459 struct dyn_ftrace *rec, unsigned long addr); 459 struct dyn_ftrace *rec, unsigned long addr);
460 460
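The read/compare/write discipline the comment above prescribes, and the matching error codes, can be sketched as an arch-neutral helper (hypothetical sample_modify_code, assuming the instruction fits in 16 bytes; real arches use their fixed MCOUNT_INSN_SIZE and their own text-patching primitives):

static int sample_modify_code(unsigned long ip, const void *expected,
			      const void *replacement, size_t size)
{
	unsigned char cur[16];

	if (size > sizeof(cur))
		return -EINVAL;
	if (probe_kernel_read(cur, (void *)ip, size))
		return -EFAULT;		/* error reading the location */
	if (memcmp(cur, expected, size) != 0)
		return -EINVAL;		/* failed compare of the contents */
	if (probe_kernel_write((void *)ip, replacement, size))
		return -EPERM;		/* error writing to the location */
	return 0;
}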
461 /** 461 /**
462 * ftrace_make_call - convert a nop call site into a call to addr 462 * ftrace_make_call - convert a nop call site into a call to addr
463 * @rec: the mcount call site record 463 * @rec: the mcount call site record
464 * @addr: the address that the call site should call 464 * @addr: the address that the call site should call
465 * 465 *
466 * This is a very sensitive operation and great care needs 466 * This is a very sensitive operation and great care needs
467 * to be taken by the arch. The operation should carefully 467 * to be taken by the arch. The operation should carefully
468 * read the location, check to see if what is read is indeed 468 * read the location, check to see if what is read is indeed
469 * what we expect it to be, and then on success of the compare, 469 * what we expect it to be, and then on success of the compare,
470 * it should write to the location. 470 * it should write to the location.
471 * 471 *
472 * The code segment at @rec->ip should be a nop 472 * The code segment at @rec->ip should be a nop
473 * 473 *
474 * Return must be: 474 * Return must be:
475 * 0 on success 475 * 0 on success
476 * -EFAULT on error reading the location 476 * -EFAULT on error reading the location
477 * -EINVAL on a failed compare of the contents 477 * -EINVAL on a failed compare of the contents
478 * -EPERM on error writing to the location 478 * -EPERM on error writing to the location
479 * Any other value will be considered a failure. 479 * Any other value will be considered a failure.
480 */ 480 */
481 extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); 481 extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
482 482
483 #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 483 #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
484 /** 484 /**
485 * ftrace_modify_call - convert from one addr to another (no nop) 485 * ftrace_modify_call - convert from one addr to another (no nop)
486 * @rec: the mcount call site record 486 * @rec: the mcount call site record
487 * @old_addr: the address expected to be currently called to 487 * @old_addr: the address expected to be currently called to
488 * @addr: the address to change to 488 * @addr: the address to change to
489 * 489 *
490 * This is a very sensitive operation and great care needs 490 * This is a very sensitive operation and great care needs
491 * to be taken by the arch. The operation should carefully 491 * to be taken by the arch. The operation should carefully
492 * read the location, check to see if what is read is indeed 492 * read the location, check to see if what is read is indeed
493 * what we expect it to be, and then on success of the compare, 493 * what we expect it to be, and then on success of the compare,
494 * it should write to the location. 494 * it should write to the location.
495 * 495 *
496 * The code segment at @rec->ip should be a caller to @old_addr 496 * The code segment at @rec->ip should be a caller to @old_addr
497 * 497 *
498 * Return must be: 498 * Return must be:
499 * 0 on success 499 * 0 on success
500 * -EFAULT on error reading the location 500 * -EFAULT on error reading the location
501 * -EINVAL on a failed compare of the contents 501 * -EINVAL on a failed compare of the contents
502 * -EPERM on error writing to the location 502 * -EPERM on error writing to the location
503 * Any other value will be considered a failure. 503 * Any other value will be considered a failure.
504 */ 504 */
505 extern int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, 505 extern int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
506 unsigned long addr); 506 unsigned long addr);
507 #else 507 #else
508 /* Should never be called */ 508 /* Should never be called */
509 static inline int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, 509 static inline int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
510 unsigned long addr) 510 unsigned long addr)
511 { 511 {
512 return -EINVAL; 512 return -EINVAL;
513 } 513 }
514 #endif 514 #endif
515 515
516 /* May be defined in arch */ 516 /* May be defined in arch */
517 extern int ftrace_arch_read_dyn_info(char *buf, int size); 517 extern int ftrace_arch_read_dyn_info(char *buf, int size);
518 518
519 extern int skip_trace(unsigned long ip); 519 extern int skip_trace(unsigned long ip);
520 520
521 extern void ftrace_disable_daemon(void); 521 extern void ftrace_disable_daemon(void);
522 extern void ftrace_enable_daemon(void); 522 extern void ftrace_enable_daemon(void);
523 #else /* CONFIG_DYNAMIC_FTRACE */ 523 #else /* CONFIG_DYNAMIC_FTRACE */
524 static inline int skip_trace(unsigned long ip) { return 0; } 524 static inline int skip_trace(unsigned long ip) { return 0; }
525 static inline int ftrace_force_update(void) { return 0; } 525 static inline int ftrace_force_update(void) { return 0; }
526 static inline void ftrace_disable_daemon(void) { } 526 static inline void ftrace_disable_daemon(void) { }
527 static inline void ftrace_enable_daemon(void) { } 527 static inline void ftrace_enable_daemon(void) { }
528 static inline void ftrace_release_mod(struct module *mod) {} 528 static inline void ftrace_release_mod(struct module *mod) {}
529 static inline int register_ftrace_command(struct ftrace_func_command *cmd) 529 static inline int register_ftrace_command(struct ftrace_func_command *cmd)
530 { 530 {
531 return -EINVAL; 531 return -EINVAL;
532 } 532 }
533 static inline int unregister_ftrace_command(char *cmd_name) 533 static inline int unregister_ftrace_command(char *cmd_name)
534 { 534 {
535 return -EINVAL; 535 return -EINVAL;
536 } 536 }
537 static inline int ftrace_text_reserved(void *start, void *end) 537 static inline int ftrace_text_reserved(void *start, void *end)
538 { 538 {
539 return 0; 539 return 0;
540 } 540 }
541 static inline unsigned long ftrace_location(unsigned long ip) 541 static inline unsigned long ftrace_location(unsigned long ip)
542 { 542 {
543 return 0; 543 return 0;
544 } 544 }
545 545
546 /* 546 /*
547 * Again, users of functions that take an ftrace_ops may not 547 * Again, users of functions that take an ftrace_ops may not
548 * have one defined when ftrace is not enabled, but these 548 * have one defined when ftrace is not enabled, but these
549 * functions may still be called. Use macros instead of inlines. 549 * functions may still be called. Use macros instead of inlines.
550 */ 550 */
551 #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) 551 #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; })
552 #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) 552 #define ftrace_set_early_filter(ops, buf, enable) do { } while (0)
553 #define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; }) 553 #define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; })
554 #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) 554 #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; })
555 #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) 555 #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; })
556 #define ftrace_free_filter(ops) do { } while (0) 556 #define ftrace_free_filter(ops) do { } while (0)
557 557
558 static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, 558 static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
559 size_t cnt, loff_t *ppos) { return -ENODEV; } 559 size_t cnt, loff_t *ppos) { return -ENODEV; }
560 static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, 560 static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
561 size_t cnt, loff_t *ppos) { return -ENODEV; } 561 size_t cnt, loff_t *ppos) { return -ENODEV; }
562 static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 562 static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
563 { 563 {
564 return -ENODEV; 564 return -ENODEV;
565 } 565 }
566 static inline int 566 static inline int
567 ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } 567 ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; }
568 #endif /* CONFIG_DYNAMIC_FTRACE */ 568 #endif /* CONFIG_DYNAMIC_FTRACE */
569 569
570 /* totally disable ftrace - cannot be re-enabled after this */ 570 /* totally disable ftrace - cannot be re-enabled after this */
571 void ftrace_kill(void); 571 void ftrace_kill(void);
572 572
573 static inline void tracer_disable(void) 573 static inline void tracer_disable(void)
574 { 574 {
575 #ifdef CONFIG_FUNCTION_TRACER 575 #ifdef CONFIG_FUNCTION_TRACER
576 ftrace_enabled = 0; 576 ftrace_enabled = 0;
577 #endif 577 #endif
578 } 578 }
579 579
580 /* 580 /*
581 * Ftrace disable/restore without lock. Some synchronization mechanism 581 * Ftrace disable/restore without lock. Some synchronization mechanism
582 * must be used to prevent ftrace_enabled from being changed between 582 * must be used to prevent ftrace_enabled from being changed between
583 * disable/restore. 583 * disable/restore.
584 */ 584 */
585 static inline int __ftrace_enabled_save(void) 585 static inline int __ftrace_enabled_save(void)
586 { 586 {
587 #ifdef CONFIG_FUNCTION_TRACER 587 #ifdef CONFIG_FUNCTION_TRACER
588 int saved_ftrace_enabled = ftrace_enabled; 588 int saved_ftrace_enabled = ftrace_enabled;
589 ftrace_enabled = 0; 589 ftrace_enabled = 0;
590 return saved_ftrace_enabled; 590 return saved_ftrace_enabled;
591 #else 591 #else
592 return 0; 592 return 0;
593 #endif 593 #endif
594 } 594 }
595 595
596 static inline void __ftrace_enabled_restore(int enabled) 596 static inline void __ftrace_enabled_restore(int enabled)
597 { 597 {
598 #ifdef CONFIG_FUNCTION_TRACER 598 #ifdef CONFIG_FUNCTION_TRACER
599 ftrace_enabled = enabled; 599 ftrace_enabled = enabled;
600 #endif 600 #endif
601 } 601 }
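A usage sketch (not part of this header): since there is no locking, a caller brackets the critical region itself and supplies whatever serialization the comment above demands. do_sensitive_work() is a hypothetical caller:

static void do_sensitive_work(void)
{
	int saved;

	/* caller provides its own serialization around the pair */
	saved = __ftrace_enabled_save();

	/* ... work that must run with the function tracer disabled ... */

	__ftrace_enabled_restore(saved);
}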
602 602
603 #ifndef HAVE_ARCH_CALLER_ADDR 603 #ifndef HAVE_ARCH_CALLER_ADDR
604 # ifdef CONFIG_FRAME_POINTER 604 # ifdef CONFIG_FRAME_POINTER
605 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 605 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
606 # define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) 606 # define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
607 # define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) 607 # define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
608 # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) 608 # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
609 # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) 609 # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
610 # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) 610 # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
611 # define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) 611 # define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6))
612 # else 612 # else
613 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 613 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
614 # define CALLER_ADDR1 0UL 614 # define CALLER_ADDR1 0UL
615 # define CALLER_ADDR2 0UL 615 # define CALLER_ADDR2 0UL
616 # define CALLER_ADDR3 0UL 616 # define CALLER_ADDR3 0UL
617 # define CALLER_ADDR4 0UL 617 # define CALLER_ADDR4 0UL
618 # define CALLER_ADDR5 0UL 618 # define CALLER_ADDR5 0UL
619 # define CALLER_ADDR6 0UL 619 # define CALLER_ADDR6 0UL
620 # endif 620 # endif
621 #endif /* ifndef HAVE_ARCH_CALLER_ADDR */ 621 #endif /* ifndef HAVE_ARCH_CALLER_ADDR */
622 622
623 #ifdef CONFIG_IRQSOFF_TRACER 623 #ifdef CONFIG_IRQSOFF_TRACER
624 extern void time_hardirqs_on(unsigned long a0, unsigned long a1); 624 extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
625 extern void time_hardirqs_off(unsigned long a0, unsigned long a1); 625 extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
626 #else 626 #else
627 static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { } 627 static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { }
628 static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { } 628 static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
629 #endif 629 #endif
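For illustration, a hypothetical call site: the (a0, a1) pair is conventionally the current and parent return addresses, which is where the CALLER_ADDR macros above come in. traced_irq_enable() is not a real kernel function:

/* Hypothetical wrapper, for illustration only. */
static inline void traced_irq_enable(void)
{
	time_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
	raw_local_irq_enable();
}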
630 630
631 #ifdef CONFIG_PREEMPT_TRACER 631 #ifdef CONFIG_PREEMPT_TRACER
632 extern void trace_preempt_on(unsigned long a0, unsigned long a1); 632 extern void trace_preempt_on(unsigned long a0, unsigned long a1);
633 extern void trace_preempt_off(unsigned long a0, unsigned long a1); 633 extern void trace_preempt_off(unsigned long a0, unsigned long a1);
634 #else 634 #else
635 /* 635 /*
636 * Use defines instead of static inlines because some arches will generate code 636 * Use defines instead of static inlines because some arches will generate code
637 * for the CALLER_ADDR macros, when we really want these to be real nops. 637 * for the CALLER_ADDR macros, when we really want these to be real nops.
638 */ 638 */
639 # define trace_preempt_on(a0, a1) do { } while (0) 639 # define trace_preempt_on(a0, a1) do { } while (0)
640 # define trace_preempt_off(a0, a1) do { } while (0) 640 # define trace_preempt_off(a0, a1) do { } while (0)
641 #endif 641 #endif
642 642
643 #ifdef CONFIG_FTRACE_MCOUNT_RECORD 643 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
644 extern void ftrace_init(void); 644 extern void ftrace_init(void);
645 #else 645 #else
646 static inline void ftrace_init(void) { } 646 static inline void ftrace_init(void) { }
647 #endif 647 #endif
648 648
649 /* 649 /*
650 * Structure that defines an entry function trace. 650 * Structure that defines an entry function trace.
651 */ 651 */
652 struct ftrace_graph_ent { 652 struct ftrace_graph_ent {
653 unsigned long func; /* Current function */ 653 unsigned long func; /* Current function */
654 int depth; 654 int depth;
655 }; 655 };
656 656
657 /* 657 /*
658 * Structure that defines a return function trace. 658 * Structure that defines a return function trace.
659 */ 659 */
660 struct ftrace_graph_ret { 660 struct ftrace_graph_ret {
661 unsigned long func; /* Current function */ 661 unsigned long func; /* Current function */
662 unsigned long long calltime; 662 unsigned long long calltime;
663 unsigned long long rettime; 663 unsigned long long rettime;
664 /* Number of functions that overran the depth limit for current task */ 664 /* Number of functions that overran the depth limit for current task */
665 unsigned long overrun; 665 unsigned long overrun;
666 int depth; 666 int depth;
667 }; 667 };
668 668
669 /* Type of the callback handlers for tracing function graph */ 669 /* Type of the callback handlers for tracing function graph */
670 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ 670 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
671 typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ 671 typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
672 672
673 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 673 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
674 674
675 /* for init task */ 675 /* for init task */
676 #define INIT_FTRACE_GRAPH .ret_stack = NULL, 676 #define INIT_FTRACE_GRAPH .ret_stack = NULL,
677 677
678 /* 678 /*
679 * Stack of return addresses for functions 679 * Stack of return addresses for functions
680 * of a thread. 680 * of a thread.
681 * Used in struct thread_info 681 * Used in struct thread_info
682 */ 682 */
683 struct ftrace_ret_stack { 683 struct ftrace_ret_stack {
684 unsigned long ret; 684 unsigned long ret;
685 unsigned long func; 685 unsigned long func;
686 unsigned long long calltime; 686 unsigned long long calltime;
687 unsigned long long subtime; 687 unsigned long long subtime;
688 unsigned long fp; 688 unsigned long fp;
689 }; 689 };
690 690
691 /* 691 /*
692 * Primary handler of a function return. 692 * Primary handler of a function return.
693 * It relies on ftrace_return_to_handler. 693 * It relies on ftrace_return_to_handler.
694 * Defined in entry_32/64.S 694 * Defined in entry_32/64.S
695 */ 695 */
696 extern void return_to_handler(void); 696 extern void return_to_handler(void);
697 697
698 extern int 698 extern int
699 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, 699 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
700 unsigned long frame_pointer); 700 unsigned long frame_pointer);
701 701
702 /* 702 /*
703 * Sometimes we don't want to trace a function with the function 703 * Sometimes we don't want to trace a function with the function
704 * graph tracer but we still want it to be traced by the usual function 704 * graph tracer but we still want it to be traced by the usual function
705 * tracer if the function graph tracer is not configured. 705 * tracer if the function graph tracer is not configured.
706 */ 706 */
707 #define __notrace_funcgraph notrace 707 #define __notrace_funcgraph notrace
708 708
709 /* 709 /*
710 * We want to know which function is an entrypoint of a hardirq. 710 * We want to know which function is an entrypoint of a hardirq.
711 * That will help us mark it in the trace output. 711 * That will help us mark it in the trace output.
712 */ 712 */
713 #define __irq_entry __attribute__((__section__(".irqentry.text"))) 713 #define __irq_entry __attribute__((__section__(".irqentry.text")))
714 714
715 /* Limits of hardirq entrypoints */ 715 /* Limits of hardirq entrypoints */
716 extern char __irqentry_text_start[]; 716 extern char __irqentry_text_start[];
717 extern char __irqentry_text_end[]; 717 extern char __irqentry_text_end[];
718 718
719 #define FTRACE_RETFUNC_DEPTH 50 719 #define FTRACE_RETFUNC_DEPTH 50
720 #define FTRACE_RETSTACK_ALLOC_SIZE 32 720 #define FTRACE_RETSTACK_ALLOC_SIZE 32
721 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, 721 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
722 trace_func_graph_ent_t entryfunc); 722 trace_func_graph_ent_t entryfunc);
723 723
724 extern void ftrace_graph_stop(void); 724 extern void ftrace_graph_stop(void);
725 725
726 /* The current handlers in use */ 726 /* The current handlers in use */
727 extern trace_func_graph_ret_t ftrace_graph_return; 727 extern trace_func_graph_ret_t ftrace_graph_return;
728 extern trace_func_graph_ent_t ftrace_graph_entry; 728 extern trace_func_graph_ent_t ftrace_graph_entry;
729 729
730 extern void unregister_ftrace_graph(void); 730 extern void unregister_ftrace_graph(void);
731 731
732 extern void ftrace_graph_init_task(struct task_struct *t); 732 extern void ftrace_graph_init_task(struct task_struct *t);
733 extern void ftrace_graph_exit_task(struct task_struct *t); 733 extern void ftrace_graph_exit_task(struct task_struct *t);
734 extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu); 734 extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu);
735 735
736 static inline int task_curr_ret_stack(struct task_struct *t) 736 static inline int task_curr_ret_stack(struct task_struct *t)
737 { 737 {
738 return t->curr_ret_stack; 738 return t->curr_ret_stack;
739 } 739 }
740 740
741 static inline void pause_graph_tracing(void) 741 static inline void pause_graph_tracing(void)
742 { 742 {
743 atomic_inc(&current->tracing_graph_pause); 743 atomic_inc(&current->tracing_graph_pause);
744 } 744 }
745 745
746 static inline void unpause_graph_tracing(void) 746 static inline void unpause_graph_tracing(void)
747 { 747 {
748 atomic_dec(&current->tracing_graph_pause); 748 atomic_dec(&current->tracing_graph_pause);
749 } 749 }
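Because tracing_graph_pause is an atomic counter on the task, the pair nests safely. A sketch of the usual pattern (emit_trace_text() is hypothetical):

static void emit_trace_text(void)
{
	/* keep the graph tracer from recursing into the output path */
	pause_graph_tracing();

	/* ... formatting/printing that must not itself be graph-traced ... */

	unpause_graph_tracing();
}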
750 #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ 750 #else /* !CONFIG_FUNCTION_GRAPH_TRACER */
751 751
752 #define __notrace_funcgraph 752 #define __notrace_funcgraph
753 #define __irq_entry 753 #define __irq_entry
754 #define INIT_FTRACE_GRAPH 754 #define INIT_FTRACE_GRAPH
755 755
756 static inline void ftrace_graph_init_task(struct task_struct *t) { } 756 static inline void ftrace_graph_init_task(struct task_struct *t) { }
757 static inline void ftrace_graph_exit_task(struct task_struct *t) { } 757 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
758 static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { } 758 static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { }
759 759
760 static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc, 760 static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
761 trace_func_graph_ent_t entryfunc) 761 trace_func_graph_ent_t entryfunc)
762 { 762 {
763 return -1; 763 return -1;
764 } 764 }
765 static inline void unregister_ftrace_graph(void) { } 765 static inline void unregister_ftrace_graph(void) { }
766 766
767 static inline int task_curr_ret_stack(struct task_struct *tsk) 767 static inline int task_curr_ret_stack(struct task_struct *tsk)
768 { 768 {
769 return -1; 769 return -1;
770 } 770 }
771 771
772 static inline void pause_graph_tracing(void) { } 772 static inline void pause_graph_tracing(void) { }
773 static inline void unpause_graph_tracing(void) { } 773 static inline void unpause_graph_tracing(void) { }
774 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 774 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
775 775
776 #ifdef CONFIG_TRACING 776 #ifdef CONFIG_TRACING
777 777
778 /* flags for current->trace */ 778 /* flags for current->trace */
779 enum { 779 enum {
780 TSK_TRACE_FL_TRACE_BIT = 0, 780 TSK_TRACE_FL_TRACE_BIT = 0,
781 TSK_TRACE_FL_GRAPH_BIT = 1, 781 TSK_TRACE_FL_GRAPH_BIT = 1,
782 }; 782 };
783 enum { 783 enum {
784 TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT, 784 TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT,
785 TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT, 785 TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT,
786 }; 786 };
787 787
788 static inline void set_tsk_trace_trace(struct task_struct *tsk) 788 static inline void set_tsk_trace_trace(struct task_struct *tsk)
789 { 789 {
790 set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); 790 set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
791 } 791 }
792 792
793 static inline void clear_tsk_trace_trace(struct task_struct *tsk) 793 static inline void clear_tsk_trace_trace(struct task_struct *tsk)
794 { 794 {
795 clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); 795 clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
796 } 796 }
797 797
798 static inline int test_tsk_trace_trace(struct task_struct *tsk) 798 static inline int test_tsk_trace_trace(struct task_struct *tsk)
799 { 799 {
800 return tsk->trace & TSK_TRACE_FL_TRACE; 800 return tsk->trace & TSK_TRACE_FL_TRACE;
801 } 801 }
802 802
803 static inline void set_tsk_trace_graph(struct task_struct *tsk) 803 static inline void set_tsk_trace_graph(struct task_struct *tsk)
804 { 804 {
805 set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); 805 set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
806 } 806 }
807 807
808 static inline void clear_tsk_trace_graph(struct task_struct *tsk) 808 static inline void clear_tsk_trace_graph(struct task_struct *tsk)
809 { 809 {
810 clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); 810 clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
811 } 811 }
812 812
813 static inline int test_tsk_trace_graph(struct task_struct *tsk) 813 static inline int test_tsk_trace_graph(struct task_struct *tsk)
814 { 814 {
815 return tsk->trace & TSK_TRACE_FL_GRAPH; 815 return tsk->trace & TSK_TRACE_FL_GRAPH;
816 } 816 }
817 817
818 enum ftrace_dump_mode; 818 enum ftrace_dump_mode;
819 819
820 extern enum ftrace_dump_mode ftrace_dump_on_oops; 820 extern enum ftrace_dump_mode ftrace_dump_on_oops;
821 821
822 #ifdef CONFIG_PREEMPT 822 #ifdef CONFIG_PREEMPT
823 #define INIT_TRACE_RECURSION .trace_recursion = 0, 823 #define INIT_TRACE_RECURSION .trace_recursion = 0,
824 #endif 824 #endif
825 825
826 #endif /* CONFIG_TRACING */ 826 #endif /* CONFIG_TRACING */
827 827
828 #ifndef INIT_TRACE_RECURSION 828 #ifndef INIT_TRACE_RECURSION
829 #define INIT_TRACE_RECURSION 829 #define INIT_TRACE_RECURSION
830 #endif 830 #endif
831 831
832 #ifdef CONFIG_FTRACE_SYSCALLS 832 #ifdef CONFIG_FTRACE_SYSCALLS
833 833
834 unsigned long arch_syscall_addr(int nr); 834 unsigned long arch_syscall_addr(int nr);
835 835
836 #endif /* CONFIG_FTRACE_SYSCALLS */ 836 #endif /* CONFIG_FTRACE_SYSCALLS */
837 837
838 #endif /* _LINUX_FTRACE_H */ 838 #endif /* _LINUX_FTRACE_H */
839 839
include/linux/syscalls.h
1 /* 1 /*
2 * syscalls.h - Linux syscall interfaces (non-arch-specific) 2 * syscalls.h - Linux syscall interfaces (non-arch-specific)
3 * 3 *
4 * Copyright (c) 2004 Randy Dunlap 4 * Copyright (c) 2004 Randy Dunlap
5 * Copyright (c) 2004 Open Source Development Labs 5 * Copyright (c) 2004 Open Source Development Labs
6 * 6 *
7 * This file is released under the GPLv2. 7 * This file is released under the GPLv2.
8 * See the file COPYING for more details. 8 * See the file COPYING for more details.
9 */ 9 */
10 10
11 #ifndef _LINUX_SYSCALLS_H 11 #ifndef _LINUX_SYSCALLS_H
12 #define _LINUX_SYSCALLS_H 12 #define _LINUX_SYSCALLS_H
13 13
14 struct epoll_event; 14 struct epoll_event;
15 struct iattr; 15 struct iattr;
16 struct inode; 16 struct inode;
17 struct iocb; 17 struct iocb;
18 struct io_event; 18 struct io_event;
19 struct iovec; 19 struct iovec;
20 struct itimerspec; 20 struct itimerspec;
21 struct itimerval; 21 struct itimerval;
22 struct kexec_segment; 22 struct kexec_segment;
23 struct linux_dirent; 23 struct linux_dirent;
24 struct linux_dirent64; 24 struct linux_dirent64;
25 struct list_head; 25 struct list_head;
26 struct mmap_arg_struct; 26 struct mmap_arg_struct;
27 struct msgbuf; 27 struct msgbuf;
28 struct msghdr; 28 struct msghdr;
29 struct mmsghdr; 29 struct mmsghdr;
30 struct msqid_ds; 30 struct msqid_ds;
31 struct new_utsname; 31 struct new_utsname;
32 struct nfsctl_arg; 32 struct nfsctl_arg;
33 struct __old_kernel_stat; 33 struct __old_kernel_stat;
34 struct oldold_utsname; 34 struct oldold_utsname;
35 struct old_utsname; 35 struct old_utsname;
36 struct pollfd; 36 struct pollfd;
37 struct rlimit; 37 struct rlimit;
38 struct rlimit64; 38 struct rlimit64;
39 struct rusage; 39 struct rusage;
40 struct sched_param; 40 struct sched_param;
41 struct sel_arg_struct; 41 struct sel_arg_struct;
42 struct semaphore; 42 struct semaphore;
43 struct sembuf; 43 struct sembuf;
44 struct shmid_ds; 44 struct shmid_ds;
45 struct sockaddr; 45 struct sockaddr;
46 struct stat; 46 struct stat;
47 struct stat64; 47 struct stat64;
48 struct statfs; 48 struct statfs;
49 struct statfs64; 49 struct statfs64;
50 struct __sysctl_args; 50 struct __sysctl_args;
51 struct sysinfo; 51 struct sysinfo;
52 struct timespec; 52 struct timespec;
53 struct timeval; 53 struct timeval;
54 struct timex; 54 struct timex;
55 struct timezone; 55 struct timezone;
56 struct tms; 56 struct tms;
57 struct utimbuf; 57 struct utimbuf;
58 struct mq_attr; 58 struct mq_attr;
59 struct compat_stat; 59 struct compat_stat;
60 struct compat_timeval; 60 struct compat_timeval;
61 struct robust_list_head; 61 struct robust_list_head;
62 struct getcpu_cache; 62 struct getcpu_cache;
63 struct old_linux_dirent; 63 struct old_linux_dirent;
64 struct perf_event_attr; 64 struct perf_event_attr;
65 struct file_handle; 65 struct file_handle;
66 66
67 #include <linux/types.h> 67 #include <linux/types.h>
68 #include <linux/aio_abi.h> 68 #include <linux/aio_abi.h>
69 #include <linux/capability.h> 69 #include <linux/capability.h>
70 #include <linux/list.h> 70 #include <linux/list.h>
71 #include <linux/bug.h> 71 #include <linux/bug.h>
72 #include <linux/sem.h> 72 #include <linux/sem.h>
73 #include <asm/siginfo.h> 73 #include <asm/siginfo.h>
74 #include <asm/signal.h> 74 #include <asm/signal.h>
75 #include <linux/unistd.h> 75 #include <linux/unistd.h>
76 #include <linux/quota.h> 76 #include <linux/quota.h>
77 #include <linux/key.h> 77 #include <linux/key.h>
78 #include <trace/syscall.h> 78 #include <trace/syscall.h>
79 79
80 #define __SC_DECL1(t1, a1) t1 a1 80 #define __SC_DECL1(t1, a1) t1 a1
81 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) 81 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
82 #define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__) 82 #define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__)
83 #define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__) 83 #define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__)
84 #define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__) 84 #define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__)
85 #define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__) 85 #define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__)
86 86
87 #define __SC_LONG1(t1, a1) long a1 87 #define __SC_LONG1(t1, a1) long a1
88 #define __SC_LONG2(t2, a2, ...) long a2, __SC_LONG1(__VA_ARGS__) 88 #define __SC_LONG2(t2, a2, ...) long a2, __SC_LONG1(__VA_ARGS__)
89 #define __SC_LONG3(t3, a3, ...) long a3, __SC_LONG2(__VA_ARGS__) 89 #define __SC_LONG3(t3, a3, ...) long a3, __SC_LONG2(__VA_ARGS__)
90 #define __SC_LONG4(t4, a4, ...) long a4, __SC_LONG3(__VA_ARGS__) 90 #define __SC_LONG4(t4, a4, ...) long a4, __SC_LONG3(__VA_ARGS__)
91 #define __SC_LONG5(t5, a5, ...) long a5, __SC_LONG4(__VA_ARGS__) 91 #define __SC_LONG5(t5, a5, ...) long a5, __SC_LONG4(__VA_ARGS__)
92 #define __SC_LONG6(t6, a6, ...) long a6, __SC_LONG5(__VA_ARGS__) 92 #define __SC_LONG6(t6, a6, ...) long a6, __SC_LONG5(__VA_ARGS__)
93 93
94 #define __SC_CAST1(t1, a1) (t1) a1 94 #define __SC_CAST1(t1, a1) (t1) a1
95 #define __SC_CAST2(t2, a2, ...) (t2) a2, __SC_CAST1(__VA_ARGS__) 95 #define __SC_CAST2(t2, a2, ...) (t2) a2, __SC_CAST1(__VA_ARGS__)
96 #define __SC_CAST3(t3, a3, ...) (t3) a3, __SC_CAST2(__VA_ARGS__) 96 #define __SC_CAST3(t3, a3, ...) (t3) a3, __SC_CAST2(__VA_ARGS__)
97 #define __SC_CAST4(t4, a4, ...) (t4) a4, __SC_CAST3(__VA_ARGS__) 97 #define __SC_CAST4(t4, a4, ...) (t4) a4, __SC_CAST3(__VA_ARGS__)
98 #define __SC_CAST5(t5, a5, ...) (t5) a5, __SC_CAST4(__VA_ARGS__) 98 #define __SC_CAST5(t5, a5, ...) (t5) a5, __SC_CAST4(__VA_ARGS__)
99 #define __SC_CAST6(t6, a6, ...) (t6) a6, __SC_CAST5(__VA_ARGS__) 99 #define __SC_CAST6(t6, a6, ...) (t6) a6, __SC_CAST5(__VA_ARGS__)
100 100
101 #define __SC_TEST(type) BUILD_BUG_ON(sizeof(type) > sizeof(long)) 101 #define __SC_TEST(type) BUILD_BUG_ON(sizeof(type) > sizeof(long))
102 #define __SC_TEST1(t1, a1) __SC_TEST(t1) 102 #define __SC_TEST1(t1, a1) __SC_TEST(t1)
103 #define __SC_TEST2(t2, a2, ...) __SC_TEST(t2); __SC_TEST1(__VA_ARGS__) 103 #define __SC_TEST2(t2, a2, ...) __SC_TEST(t2); __SC_TEST1(__VA_ARGS__)
104 #define __SC_TEST3(t3, a3, ...) __SC_TEST(t3); __SC_TEST2(__VA_ARGS__) 104 #define __SC_TEST3(t3, a3, ...) __SC_TEST(t3); __SC_TEST2(__VA_ARGS__)
105 #define __SC_TEST4(t4, a4, ...) __SC_TEST(t4); __SC_TEST3(__VA_ARGS__) 105 #define __SC_TEST4(t4, a4, ...) __SC_TEST(t4); __SC_TEST3(__VA_ARGS__)
106 #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) 106 #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
107 #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) 107 #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
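These families unwind pair-wise, one (type, name) pair per step. A hand expansion for a two-argument case, written out here for illustration:

/*
 * Hand expansion for two (type, name) pairs, for illustration:
 *
 *   __SC_DECL2(unsigned int, fd, char __user *, buf)
 *	=> unsigned int fd, char __user * buf
 *
 *   __SC_LONG2(unsigned int, fd, char __user *, buf)
 *	=> long fd, long buf
 *
 *   __SC_CAST2(unsigned int, fd, char __user *, buf)
 *	=> (unsigned int) fd, (char __user *) buf
 *
 *   __SC_TEST2(unsigned int, fd, char __user *, buf)
 *	=> BUILD_BUG_ON(sizeof(unsigned int) > sizeof(long));
 *	   BUILD_BUG_ON(sizeof(char __user *) > sizeof(long))
 */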
108 108
109 #ifdef CONFIG_FTRACE_SYSCALLS 109 #ifdef CONFIG_FTRACE_SYSCALLS
110 #define __SC_STR_ADECL1(t, a) #a 110 #define __SC_STR_ADECL1(t, a) #a
111 #define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) 111 #define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
112 #define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__) 112 #define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__)
113 #define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__) 113 #define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__)
114 #define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__) 114 #define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__)
115 #define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__) 115 #define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__)
116 116
117 #define __SC_STR_TDECL1(t, a) #t 117 #define __SC_STR_TDECL1(t, a) #t
118 #define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__) 118 #define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__)
119 #define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__) 119 #define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__)
120 #define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__) 120 #define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__)
121 #define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) 121 #define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
122 #define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) 122 #define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
123 123
124 extern struct ftrace_event_class event_class_syscall_enter; 124 extern struct ftrace_event_class event_class_syscall_enter;
125 extern struct ftrace_event_class event_class_syscall_exit; 125 extern struct ftrace_event_class event_class_syscall_exit;
126 extern struct trace_event_functions enter_syscall_print_funcs; 126 extern struct trace_event_functions enter_syscall_print_funcs;
127 extern struct trace_event_functions exit_syscall_print_funcs; 127 extern struct trace_event_functions exit_syscall_print_funcs;
128 128
129 #define SYSCALL_TRACE_ENTER_EVENT(sname) \ 129 #define SYSCALL_TRACE_ENTER_EVENT(sname) \
130 static struct syscall_metadata __syscall_meta_##sname; \ 130 static struct syscall_metadata __syscall_meta_##sname; \
131 static struct ftrace_event_call __used \ 131 static struct ftrace_event_call __used \
132 event_enter_##sname = { \ 132 event_enter_##sname = { \
133 .name = "sys_enter"#sname, \ 133 .name = "sys_enter"#sname, \
134 .class = &event_class_syscall_enter, \ 134 .class = &event_class_syscall_enter, \
135 .event.funcs = &enter_syscall_print_funcs, \ 135 .event.funcs = &enter_syscall_print_funcs, \
136 .data = (void *)&__syscall_meta_##sname,\ 136 .data = (void *)&__syscall_meta_##sname,\
137 .flags = TRACE_EVENT_FL_CAP_ANY, \ 137 .flags = TRACE_EVENT_FL_CAP_ANY, \
138 }; \ 138 }; \
139 static struct ftrace_event_call __used \ 139 static struct ftrace_event_call __used \
140 __attribute__((section("_ftrace_events"))) \ 140 __attribute__((section("_ftrace_events"))) \
141 *__event_enter_##sname = &event_enter_##sname; 141 *__event_enter_##sname = &event_enter_##sname;
142 142
143 #define SYSCALL_TRACE_EXIT_EVENT(sname) \ 143 #define SYSCALL_TRACE_EXIT_EVENT(sname) \
144 static struct syscall_metadata __syscall_meta_##sname; \ 144 static struct syscall_metadata __syscall_meta_##sname; \
145 static struct ftrace_event_call __used \ 145 static struct ftrace_event_call __used \
146 event_exit_##sname = { \ 146 event_exit_##sname = { \
147 .name = "sys_exit"#sname, \ 147 .name = "sys_exit"#sname, \
148 .class = &event_class_syscall_exit, \ 148 .class = &event_class_syscall_exit, \
149 .event.funcs = &exit_syscall_print_funcs, \ 149 .event.funcs = &exit_syscall_print_funcs, \
150 .data = (void *)&__syscall_meta_##sname,\ 150 .data = (void *)&__syscall_meta_##sname,\
151 .flags = TRACE_EVENT_FL_CAP_ANY, \ 151 .flags = TRACE_EVENT_FL_CAP_ANY, \
152 }; \ 152 }; \
153 static struct ftrace_event_call __used \ 153 static struct ftrace_event_call __used \
154 __attribute__((section("_ftrace_events"))) \ 154 __attribute__((section("_ftrace_events"))) \
155 *__event_exit_##sname = &event_exit_##sname; 155 *__event_exit_##sname = &event_exit_##sname;
156 156
157 #define SYSCALL_METADATA(sname, nb) \ 157 #define SYSCALL_METADATA(sname, nb) \
158 SYSCALL_TRACE_ENTER_EVENT(sname); \ 158 SYSCALL_TRACE_ENTER_EVENT(sname); \
159 SYSCALL_TRACE_EXIT_EVENT(sname); \ 159 SYSCALL_TRACE_EXIT_EVENT(sname); \
160 static struct syscall_metadata __used \ 160 static struct syscall_metadata __used \
161 __syscall_meta_##sname = { \ 161 __syscall_meta_##sname = { \
162 .name = "sys"#sname, \ 162 .name = "sys"#sname, \
163 .syscall_nr = -1, /* Filled in at boot */ \ 163 .syscall_nr = -1, /* Filled in at boot */ \
164 .nb_args = nb, \ 164 .nb_args = nb, \
165 .types = types_##sname, \ 165 .types = types_##sname, \
166 .args = args_##sname, \ 166 .args = args_##sname, \
167 .enter_event = &event_enter_##sname, \ 167 .enter_event = &event_enter_##sname, \
168 .exit_event = &event_exit_##sname, \ 168 .exit_event = &event_exit_##sname, \
169 .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \ 169 .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
170 }; \ 170 }; \
171 static struct syscall_metadata __used \ 171 static struct syscall_metadata __used \
172 __attribute__((section("__syscalls_metadata"))) \ 172 __attribute__((section("__syscalls_metadata"))) \
173 *__p_syscall_meta_##sname = &__syscall_meta_##sname; 173 *__p_syscall_meta_##sname = &__syscall_meta_##sname;
174 174
175 #define SYSCALL_DEFINE0(sname) \ 175 #define SYSCALL_DEFINE0(sname) \
176 SYSCALL_TRACE_ENTER_EVENT(_##sname); \ 176 SYSCALL_TRACE_ENTER_EVENT(_##sname); \
177 SYSCALL_TRACE_EXIT_EVENT(_##sname); \ 177 SYSCALL_TRACE_EXIT_EVENT(_##sname); \
178 static struct syscall_metadata __used \ 178 static struct syscall_metadata __used \
179 __syscall_meta__##sname = { \ 179 __syscall_meta__##sname = { \
180 .name = "sys_"#sname, \ 180 .name = "sys_"#sname, \
181 .syscall_nr = -1, /* Filled in at boot */ \ 181 .syscall_nr = -1, /* Filled in at boot */ \
182 .nb_args = 0, \ 182 .nb_args = 0, \
183 .enter_event = &event_enter__##sname, \ 183 .enter_event = &event_enter__##sname, \
184 .exit_event = &event_exit__##sname, \ 184 .exit_event = &event_exit__##sname, \
185 .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \ 185 .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
186 }; \ 186 }; \
187 static struct syscall_metadata __used \ 187 static struct syscall_metadata __used \
188 __attribute__((section("__syscalls_metadata"))) \ 188 __attribute__((section("__syscalls_metadata"))) \
189 *__p_syscall_meta_##sname = &__syscall_meta__##sname; \ 189 *__p_syscall_meta_##sname = &__syscall_meta__##sname; \
190 asmlinkage long sys_##sname(void) 190 asmlinkage long sys_##sname(void)
191 #else 191 #else
192 #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) 192 #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void)
193 #endif 193 #endif
194 194
195 #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) 195 #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
196 #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) 196 #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
197 #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) 197 #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
198 #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) 198 #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
199 #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) 199 #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
200 #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) 200 #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
201 201
202 #ifdef CONFIG_PPC64 202 #ifdef CONFIG_PPC64
203 #define SYSCALL_ALIAS(alias, name) \ 203 #define SYSCALL_ALIAS(alias, name) \
204 asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \ 204 asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \
205 "\t.globl ." #alias "\n\t.set ." #alias ", ." #name) 205 "\t.globl ." #alias "\n\t.set ." #alias ", ." #name)
206 #else 206 #else
207 #if defined(CONFIG_ALPHA) || defined(CONFIG_MIPS) 207 #if defined(CONFIG_ALPHA) || defined(CONFIG_MIPS)
208 #define SYSCALL_ALIAS(alias, name) \ 208 #define SYSCALL_ALIAS(alias, name) \
209 asm ( #alias " = " #name "\n\t.globl " #alias) 209 asm ( #alias " = " #name "\n\t.globl " #alias)
210 #else 210 #else
211 #define SYSCALL_ALIAS(alias, name) \ 211 #define SYSCALL_ALIAS(alias, name) \
212 asm ("\t.globl " #alias "\n\t.set " #alias ", " #name) 212 asm ("\t.globl " #alias "\n\t.set " #alias ", " #name)
213 #endif 213 #endif
214 #endif 214 #endif
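Concretely, on the default (non-PPC64, non-Alpha/MIPS) branch the alias is a pure assembler-level rename. A hand expansion, for illustration:

/*
 * SYSCALL_ALIAS(sys_dup2, SyS_dup2) expands (default branch) to:
 *
 *   asm("\t.globl sys_dup2\n\t.set sys_dup2, SyS_dup2");
 *
 * i.e. sys_dup2 becomes an assembler alias for the SyS_ wrapper
 * emitted by __SYSCALL_DEFINEx() below.
 */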
215 215
216 #ifdef CONFIG_FTRACE_SYSCALLS 216 #ifdef CONFIG_FTRACE_SYSCALLS
217 #define SYSCALL_DEFINEx(x, sname, ...) \ 217 #define SYSCALL_DEFINEx(x, sname, ...) \
218 static const char *types_##sname[] = { \ 218 static const char *types_##sname[] = { \
219 __SC_STR_TDECL##x(__VA_ARGS__) \ 219 __SC_STR_TDECL##x(__VA_ARGS__) \
220 }; \ 220 }; \
221 static const char *args_##sname[] = { \ 221 static const char *args_##sname[] = { \
222 __SC_STR_ADECL##x(__VA_ARGS__) \ 222 __SC_STR_ADECL##x(__VA_ARGS__) \
223 }; \ 223 }; \
224 SYSCALL_METADATA(sname, x); \ 224 SYSCALL_METADATA(sname, x); \
225 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) 225 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
226 #else 226 #else
227 #define SYSCALL_DEFINEx(x, sname, ...) \ 227 #define SYSCALL_DEFINEx(x, sname, ...) \
228 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) 228 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
229 #endif 229 #endif
230 230
231 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 231 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
232 232
233 #define SYSCALL_DEFINE(name) static inline long SYSC_##name 233 #define SYSCALL_DEFINE(name) static inline long SYSC_##name
234 234
235 #define __SYSCALL_DEFINEx(x, name, ...) \ 235 #define __SYSCALL_DEFINEx(x, name, ...) \
236 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ 236 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \
237 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ 237 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \
238 asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ 238 asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \
239 { \ 239 { \
240 __SC_TEST##x(__VA_ARGS__); \ 240 __SC_TEST##x(__VA_ARGS__); \
241 return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \ 241 return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \
242 } \ 242 } \
243 SYSCALL_ALIAS(sys##name, SyS##name); \ 243 SYSCALL_ALIAS(sys##name, SyS##name); \
244 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)) 244 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
245 245
246 #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ 246 #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
247 247
248 #define SYSCALL_DEFINE(name) asmlinkage long sys_##name 248 #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
249 #define __SYSCALL_DEFINEx(x, name, ...) \ 249 #define __SYSCALL_DEFINEx(x, name, ...) \
250 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) 250 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
251 251
252 #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ 252 #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
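Putting the pieces together: with CONFIG_HAVE_SYSCALL_WRAPPERS=y and CONFIG_FTRACE_SYSCALLS=n, a hand expansion (illustrative, using fs/fcntl.c's SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)) comes out as:

/*
 * Hand expansion, for illustration:
 *
 *   asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd);
 *   static inline long SYSC_dup2(unsigned int oldfd, unsigned int newfd);
 *   asmlinkage long SyS_dup2(long oldfd, long newfd)
 *   {
 *	BUILD_BUG_ON(sizeof(unsigned int) > sizeof(long));
 *	BUILD_BUG_ON(sizeof(unsigned int) > sizeof(long));
 *	return (long) SYSC_dup2((unsigned int) oldfd, (unsigned int) newfd);
 *   }
 *   SYSCALL_ALIAS(sys_dup2, SyS_dup2);
 *   static inline long SYSC_dup2(unsigned int oldfd, unsigned int newfd)
 *   { ... the body written at the SYSCALL_DEFINE2() site ... }
 *
 * The SyS_ wrapper takes each argument as a register-width long and
 * casts down, so 32-bit arguments are narrowed in one well-defined
 * place on 64-bit architectures.
 */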
253 253
254 asmlinkage long sys_time(time_t __user *tloc); 254 asmlinkage long sys_time(time_t __user *tloc);
255 asmlinkage long sys_stime(time_t __user *tptr); 255 asmlinkage long sys_stime(time_t __user *tptr);
256 asmlinkage long sys_gettimeofday(struct timeval __user *tv, 256 asmlinkage long sys_gettimeofday(struct timeval __user *tv,
257 struct timezone __user *tz); 257 struct timezone __user *tz);
258 asmlinkage long sys_settimeofday(struct timeval __user *tv, 258 asmlinkage long sys_settimeofday(struct timeval __user *tv,
259 struct timezone __user *tz); 259 struct timezone __user *tz);
260 asmlinkage long sys_adjtimex(struct timex __user *txc_p); 260 asmlinkage long sys_adjtimex(struct timex __user *txc_p);
261 261
262 asmlinkage long sys_times(struct tms __user *tbuf); 262 asmlinkage long sys_times(struct tms __user *tbuf);
263 263
264 asmlinkage long sys_gettid(void); 264 asmlinkage long sys_gettid(void);
265 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp); 265 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp);
266 asmlinkage long sys_alarm(unsigned int seconds); 266 asmlinkage long sys_alarm(unsigned int seconds);
267 asmlinkage long sys_getpid(void); 267 asmlinkage long sys_getpid(void);
268 asmlinkage long sys_getppid(void); 268 asmlinkage long sys_getppid(void);
269 asmlinkage long sys_getuid(void); 269 asmlinkage long sys_getuid(void);
270 asmlinkage long sys_geteuid(void); 270 asmlinkage long sys_geteuid(void);
271 asmlinkage long sys_getgid(void); 271 asmlinkage long sys_getgid(void);
272 asmlinkage long sys_getegid(void); 272 asmlinkage long sys_getegid(void);
273 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid); 273 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid);
274 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid); 274 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid);
275 asmlinkage long sys_getpgid(pid_t pid); 275 asmlinkage long sys_getpgid(pid_t pid);
276 asmlinkage long sys_getpgrp(void); 276 asmlinkage long sys_getpgrp(void);
277 asmlinkage long sys_getsid(pid_t pid); 277 asmlinkage long sys_getsid(pid_t pid);
278 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist); 278 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist);
279 279
280 asmlinkage long sys_setregid(gid_t rgid, gid_t egid); 280 asmlinkage long sys_setregid(gid_t rgid, gid_t egid);
281 asmlinkage long sys_setgid(gid_t gid); 281 asmlinkage long sys_setgid(gid_t gid);
282 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid); 282 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid);
283 asmlinkage long sys_setuid(uid_t uid); 283 asmlinkage long sys_setuid(uid_t uid);
284 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid); 284 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid);
285 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid); 285 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid);
286 asmlinkage long sys_setfsuid(uid_t uid); 286 asmlinkage long sys_setfsuid(uid_t uid);
287 asmlinkage long sys_setfsgid(gid_t gid); 287 asmlinkage long sys_setfsgid(gid_t gid);
288 asmlinkage long sys_setpgid(pid_t pid, pid_t pgid); 288 asmlinkage long sys_setpgid(pid_t pid, pid_t pgid);
289 asmlinkage long sys_setsid(void); 289 asmlinkage long sys_setsid(void);
290 asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist); 290 asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist);
291 291
292 asmlinkage long sys_acct(const char __user *name); 292 asmlinkage long sys_acct(const char __user *name);
293 asmlinkage long sys_capget(cap_user_header_t header, 293 asmlinkage long sys_capget(cap_user_header_t header,
294 cap_user_data_t dataptr); 294 cap_user_data_t dataptr);
295 asmlinkage long sys_capset(cap_user_header_t header, 295 asmlinkage long sys_capset(cap_user_header_t header,
296 const cap_user_data_t data); 296 const cap_user_data_t data);
297 asmlinkage long sys_personality(unsigned int personality); 297 asmlinkage long sys_personality(unsigned int personality);
298 298
299 asmlinkage long sys_sigpending(old_sigset_t __user *set); 299 asmlinkage long sys_sigpending(old_sigset_t __user *set);
300 asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set, 300 asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set,
301 old_sigset_t __user *oset); 301 old_sigset_t __user *oset);
302 asmlinkage long sys_getitimer(int which, struct itimerval __user *value); 302 asmlinkage long sys_getitimer(int which, struct itimerval __user *value);
303 asmlinkage long sys_setitimer(int which, 303 asmlinkage long sys_setitimer(int which,
304 struct itimerval __user *value, 304 struct itimerval __user *value,
305 struct itimerval __user *ovalue); 305 struct itimerval __user *ovalue);
306 asmlinkage long sys_timer_create(clockid_t which_clock, 306 asmlinkage long sys_timer_create(clockid_t which_clock,
307 struct sigevent __user *timer_event_spec, 307 struct sigevent __user *timer_event_spec,
308 timer_t __user * created_timer_id); 308 timer_t __user * created_timer_id);
309 asmlinkage long sys_timer_gettime(timer_t timer_id, 309 asmlinkage long sys_timer_gettime(timer_t timer_id,
310 struct itimerspec __user *setting); 310 struct itimerspec __user *setting);
311 asmlinkage long sys_timer_getoverrun(timer_t timer_id); 311 asmlinkage long sys_timer_getoverrun(timer_t timer_id);
312 asmlinkage long sys_timer_settime(timer_t timer_id, int flags, 312 asmlinkage long sys_timer_settime(timer_t timer_id, int flags,
313 const struct itimerspec __user *new_setting, 313 const struct itimerspec __user *new_setting,
314 struct itimerspec __user *old_setting); 314 struct itimerspec __user *old_setting);
315 asmlinkage long sys_timer_delete(timer_t timer_id); 315 asmlinkage long sys_timer_delete(timer_t timer_id);
316 asmlinkage long sys_clock_settime(clockid_t which_clock, 316 asmlinkage long sys_clock_settime(clockid_t which_clock,
317 const struct timespec __user *tp); 317 const struct timespec __user *tp);
318 asmlinkage long sys_clock_gettime(clockid_t which_clock, 318 asmlinkage long sys_clock_gettime(clockid_t which_clock,
319 struct timespec __user *tp); 319 struct timespec __user *tp);
320 asmlinkage long sys_clock_adjtime(clockid_t which_clock, 320 asmlinkage long sys_clock_adjtime(clockid_t which_clock,
321 struct timex __user *tx); 321 struct timex __user *tx);
322 asmlinkage long sys_clock_getres(clockid_t which_clock, 322 asmlinkage long sys_clock_getres(clockid_t which_clock,
323 struct timespec __user *tp); 323 struct timespec __user *tp);
324 asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, 324 asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
325 const struct timespec __user *rqtp, 325 const struct timespec __user *rqtp,
326 struct timespec __user *rmtp); 326 struct timespec __user *rmtp);
327 327
328 asmlinkage long sys_nice(int increment); 328 asmlinkage long sys_nice(int increment);
329 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 329 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
330 struct sched_param __user *param); 330 struct sched_param __user *param);
331 asmlinkage long sys_sched_setparam(pid_t pid, 331 asmlinkage long sys_sched_setparam(pid_t pid,
332 struct sched_param __user *param); 332 struct sched_param __user *param);
333 asmlinkage long sys_sched_getscheduler(pid_t pid); 333 asmlinkage long sys_sched_getscheduler(pid_t pid);
334 asmlinkage long sys_sched_getparam(pid_t pid, 334 asmlinkage long sys_sched_getparam(pid_t pid,
335 struct sched_param __user *param); 335 struct sched_param __user *param);
336 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 336 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
337 unsigned long __user *user_mask_ptr); 337 unsigned long __user *user_mask_ptr);
338 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 338 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
339 unsigned long __user *user_mask_ptr); 339 unsigned long __user *user_mask_ptr);
340 asmlinkage long sys_sched_yield(void); 340 asmlinkage long sys_sched_yield(void);
341 asmlinkage long sys_sched_get_priority_max(int policy); 341 asmlinkage long sys_sched_get_priority_max(int policy);
342 asmlinkage long sys_sched_get_priority_min(int policy); 342 asmlinkage long sys_sched_get_priority_min(int policy);
343 asmlinkage long sys_sched_rr_get_interval(pid_t pid, 343 asmlinkage long sys_sched_rr_get_interval(pid_t pid,
344 struct timespec __user *interval); 344 struct timespec __user *interval);
345 asmlinkage long sys_setpriority(int which, int who, int niceval); 345 asmlinkage long sys_setpriority(int which, int who, int niceval);
346 asmlinkage long sys_getpriority(int which, int who); 346 asmlinkage long sys_getpriority(int which, int who);
347 347
348 asmlinkage long sys_shutdown(int, int); 348 asmlinkage long sys_shutdown(int, int);
349 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, 349 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
350 void __user *arg); 350 void __user *arg);
351 asmlinkage long sys_restart_syscall(void); 351 asmlinkage long sys_restart_syscall(void);
352 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 352 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
353 struct kexec_segment __user *segments, 353 struct kexec_segment __user *segments,
354 unsigned long flags); 354 unsigned long flags);
355 355
356 asmlinkage long sys_exit(int error_code); 356 asmlinkage long sys_exit(int error_code);
357 asmlinkage long sys_exit_group(int error_code); 357 asmlinkage long sys_exit_group(int error_code);
358 asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, 358 asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
359 int options, struct rusage __user *ru); 359 int options, struct rusage __user *ru);
360 asmlinkage long sys_waitid(int which, pid_t pid, 360 asmlinkage long sys_waitid(int which, pid_t pid,
361 struct siginfo __user *infop, 361 struct siginfo __user *infop,
362 int options, struct rusage __user *ru); 362 int options, struct rusage __user *ru);
363 asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options); 363 asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options);
364 asmlinkage long sys_set_tid_address(int __user *tidptr); 364 asmlinkage long sys_set_tid_address(int __user *tidptr);
365 asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, 365 asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
366 struct timespec __user *utime, u32 __user *uaddr2, 366 struct timespec __user *utime, u32 __user *uaddr2,
367 u32 val3); 367 u32 val3);
368 368
369 asmlinkage long sys_init_module(void __user *umod, unsigned long len, 369 asmlinkage long sys_init_module(void __user *umod, unsigned long len,
370 const char __user *uargs); 370 const char __user *uargs);
371 asmlinkage long sys_delete_module(const char __user *name_user, 371 asmlinkage long sys_delete_module(const char __user *name_user,
372 unsigned int flags); 372 unsigned int flags);
373 373
374 asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set, 374 asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set,
375 sigset_t __user *oset, size_t sigsetsize); 375 sigset_t __user *oset, size_t sigsetsize);
376 asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize); 376 asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize);
377 asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, 377 asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
378 siginfo_t __user *uinfo, 378 siginfo_t __user *uinfo,
379 const struct timespec __user *uts, 379 const struct timespec __user *uts,
380 size_t sigsetsize); 380 size_t sigsetsize);
381 asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, 381 asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
382 siginfo_t __user *uinfo); 382 siginfo_t __user *uinfo);
383 asmlinkage long sys_kill(int pid, int sig); 383 asmlinkage long sys_kill(int pid, int sig);
384 asmlinkage long sys_tgkill(int tgid, int pid, int sig); 384 asmlinkage long sys_tgkill(int tgid, int pid, int sig);
385 asmlinkage long sys_tkill(int pid, int sig); 385 asmlinkage long sys_tkill(int pid, int sig);
386 asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); 386 asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo);
387 asmlinkage long sys_sgetmask(void); 387 asmlinkage long sys_sgetmask(void);
388 asmlinkage long sys_ssetmask(int newmask); 388 asmlinkage long sys_ssetmask(int newmask);
389 asmlinkage long sys_signal(int sig, __sighandler_t handler); 389 asmlinkage long sys_signal(int sig, __sighandler_t handler);
390 asmlinkage long sys_pause(void); 390 asmlinkage long sys_pause(void);
391 391
392 asmlinkage long sys_sync(void); 392 asmlinkage long sys_sync(void);
393 asmlinkage long sys_fsync(unsigned int fd); 393 asmlinkage long sys_fsync(unsigned int fd);
394 asmlinkage long sys_fdatasync(unsigned int fd); 394 asmlinkage long sys_fdatasync(unsigned int fd);
395 asmlinkage long sys_bdflush(int func, long data); 395 asmlinkage long sys_bdflush(int func, long data);
396 asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name, 396 asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name,
397 char __user *type, unsigned long flags, 397 char __user *type, unsigned long flags,
398 void __user *data); 398 void __user *data);
399 asmlinkage long sys_umount(char __user *name, int flags); 399 asmlinkage long sys_umount(char __user *name, int flags);
400 asmlinkage long sys_oldumount(char __user *name); 400 asmlinkage long sys_oldumount(char __user *name);
401 asmlinkage long sys_truncate(const char __user *path, long length); 401 asmlinkage long sys_truncate(const char __user *path, long length);
402 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); 402 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
403 asmlinkage long sys_stat(const char __user *filename, 403 asmlinkage long sys_stat(const char __user *filename,
404 struct __old_kernel_stat __user *statbuf); 404 struct __old_kernel_stat __user *statbuf);
405 asmlinkage long sys_statfs(const char __user * path, 405 asmlinkage long sys_statfs(const char __user * path,
406 struct statfs __user *buf); 406 struct statfs __user *buf);
407 asmlinkage long sys_statfs64(const char __user *path, size_t sz, 407 asmlinkage long sys_statfs64(const char __user *path, size_t sz,
408 struct statfs64 __user *buf); 408 struct statfs64 __user *buf);
409 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf); 409 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
410 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, 410 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
411 struct statfs64 __user *buf); 411 struct statfs64 __user *buf);
412 asmlinkage long sys_lstat(const char __user *filename, 412 asmlinkage long sys_lstat(const char __user *filename,
413 struct __old_kernel_stat __user *statbuf); 413 struct __old_kernel_stat __user *statbuf);
414 asmlinkage long sys_fstat(unsigned int fd, 414 asmlinkage long sys_fstat(unsigned int fd,
415 struct __old_kernel_stat __user *statbuf); 415 struct __old_kernel_stat __user *statbuf);
416 asmlinkage long sys_newstat(const char __user *filename, 416 asmlinkage long sys_newstat(const char __user *filename,
417 struct stat __user *statbuf); 417 struct stat __user *statbuf);
418 asmlinkage long sys_newlstat(const char __user *filename, 418 asmlinkage long sys_newlstat(const char __user *filename,
419 struct stat __user *statbuf); 419 struct stat __user *statbuf);
420 asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf); 420 asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
421 asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf); 421 asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf);
422 #if BITS_PER_LONG == 32 422 #if BITS_PER_LONG == 32
423 asmlinkage long sys_stat64(const char __user *filename, 423 asmlinkage long sys_stat64(const char __user *filename,
424 struct stat64 __user *statbuf); 424 struct stat64 __user *statbuf);
425 asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf); 425 asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
426 asmlinkage long sys_lstat64(const char __user *filename, 426 asmlinkage long sys_lstat64(const char __user *filename,
427 struct stat64 __user *statbuf); 427 struct stat64 __user *statbuf);
428 asmlinkage long sys_truncate64(const char __user *path, loff_t length); 428 asmlinkage long sys_truncate64(const char __user *path, loff_t length);
429 asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); 429 asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length);
430 #endif 430 #endif
431 431
432 asmlinkage long sys_setxattr(const char __user *path, const char __user *name, 432 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
433 const void __user *value, size_t size, int flags); 433 const void __user *value, size_t size, int flags);
434 asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name, 434 asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name,
435 const void __user *value, size_t size, int flags); 435 const void __user *value, size_t size, int flags);
436 asmlinkage long sys_fsetxattr(int fd, const char __user *name, 436 asmlinkage long sys_fsetxattr(int fd, const char __user *name,
437 const void __user *value, size_t size, int flags); 437 const void __user *value, size_t size, int flags);
438 asmlinkage long sys_getxattr(const char __user *path, const char __user *name, 438 asmlinkage long sys_getxattr(const char __user *path, const char __user *name,
439 void __user *value, size_t size); 439 void __user *value, size_t size);
440 asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name, 440 asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name,
441 void __user *value, size_t size); 441 void __user *value, size_t size);
442 asmlinkage long sys_fgetxattr(int fd, const char __user *name, 442 asmlinkage long sys_fgetxattr(int fd, const char __user *name,
443 void __user *value, size_t size); 443 void __user *value, size_t size);
444 asmlinkage long sys_listxattr(const char __user *path, char __user *list, 444 asmlinkage long sys_listxattr(const char __user *path, char __user *list,
445 size_t size); 445 size_t size);
446 asmlinkage long sys_llistxattr(const char __user *path, char __user *list, 446 asmlinkage long sys_llistxattr(const char __user *path, char __user *list,
447 size_t size); 447 size_t size);
448 asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size); 448 asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size);
449 asmlinkage long sys_removexattr(const char __user *path, 449 asmlinkage long sys_removexattr(const char __user *path,
450 const char __user *name); 450 const char __user *name);
451 asmlinkage long sys_lremovexattr(const char __user *path, 451 asmlinkage long sys_lremovexattr(const char __user *path,
452 const char __user *name); 452 const char __user *name);
453 asmlinkage long sys_fremovexattr(int fd, const char __user *name); 453 asmlinkage long sys_fremovexattr(int fd, const char __user *name);
454 454
455 asmlinkage long sys_brk(unsigned long brk); 455 asmlinkage long sys_brk(unsigned long brk);
456 asmlinkage long sys_mprotect(unsigned long start, size_t len, 456 asmlinkage long sys_mprotect(unsigned long start, size_t len,
457 unsigned long prot); 457 unsigned long prot);
458 asmlinkage long sys_mremap(unsigned long addr, 458 asmlinkage long sys_mremap(unsigned long addr,
459 unsigned long old_len, unsigned long new_len, 459 unsigned long old_len, unsigned long new_len,
460 unsigned long flags, unsigned long new_addr); 460 unsigned long flags, unsigned long new_addr);
461 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, 461 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
462 unsigned long prot, unsigned long pgoff, 462 unsigned long prot, unsigned long pgoff,
463 unsigned long flags); 463 unsigned long flags);
464 asmlinkage long sys_msync(unsigned long start, size_t len, int flags); 464 asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
465 asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice); 465 asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice);
466 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); 466 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);
467 asmlinkage long sys_munmap(unsigned long addr, size_t len); 467 asmlinkage long sys_munmap(unsigned long addr, size_t len);
468 asmlinkage long sys_mlock(unsigned long start, size_t len); 468 asmlinkage long sys_mlock(unsigned long start, size_t len);
469 asmlinkage long sys_munlock(unsigned long start, size_t len); 469 asmlinkage long sys_munlock(unsigned long start, size_t len);
470 asmlinkage long sys_mlockall(int flags); 470 asmlinkage long sys_mlockall(int flags);
471 asmlinkage long sys_munlockall(void); 471 asmlinkage long sys_munlockall(void);
472 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); 472 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
473 asmlinkage long sys_mincore(unsigned long start, size_t len, 473 asmlinkage long sys_mincore(unsigned long start, size_t len,
474 unsigned char __user * vec); 474 unsigned char __user * vec);
475 475
476 asmlinkage long sys_pivot_root(const char __user *new_root, 476 asmlinkage long sys_pivot_root(const char __user *new_root,
477 const char __user *put_old); 477 const char __user *put_old);
478 asmlinkage long sys_chroot(const char __user *filename); 478 asmlinkage long sys_chroot(const char __user *filename);
479 asmlinkage long sys_mknod(const char __user *filename, umode_t mode, 479 asmlinkage long sys_mknod(const char __user *filename, umode_t mode,
480 unsigned dev); 480 unsigned dev);
481 asmlinkage long sys_link(const char __user *oldname, 481 asmlinkage long sys_link(const char __user *oldname,
482 const char __user *newname); 482 const char __user *newname);
483 asmlinkage long sys_symlink(const char __user *old, const char __user *new); 483 asmlinkage long sys_symlink(const char __user *old, const char __user *new);
484 asmlinkage long sys_unlink(const char __user *pathname); 484 asmlinkage long sys_unlink(const char __user *pathname);
485 asmlinkage long sys_rename(const char __user *oldname, 485 asmlinkage long sys_rename(const char __user *oldname,
486 const char __user *newname); 486 const char __user *newname);
487 asmlinkage long sys_chmod(const char __user *filename, umode_t mode); 487 asmlinkage long sys_chmod(const char __user *filename, umode_t mode);
488 asmlinkage long sys_fchmod(unsigned int fd, umode_t mode); 488 asmlinkage long sys_fchmod(unsigned int fd, umode_t mode);
489 489
490 asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg); 490 asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg);
491 #if BITS_PER_LONG == 32 491 #if BITS_PER_LONG == 32
492 asmlinkage long sys_fcntl64(unsigned int fd, 492 asmlinkage long sys_fcntl64(unsigned int fd,
493 unsigned int cmd, unsigned long arg); 493 unsigned int cmd, unsigned long arg);
494 #endif 494 #endif
495 asmlinkage long sys_pipe(int __user *fildes); 495 asmlinkage long sys_pipe(int __user *fildes);
496 asmlinkage long sys_pipe2(int __user *fildes, int flags); 496 asmlinkage long sys_pipe2(int __user *fildes, int flags);
497 asmlinkage long sys_dup(unsigned int fildes); 497 asmlinkage long sys_dup(unsigned int fildes);
498 asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); 498 asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd);
499 asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags); 499 asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags);
500 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); 500 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
501 asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, 501 asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd,
502 unsigned long arg); 502 unsigned long arg);
503 asmlinkage long sys_flock(unsigned int fd, unsigned int cmd); 503 asmlinkage long sys_flock(unsigned int fd, unsigned int cmd);
504 asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx); 504 asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx);
505 asmlinkage long sys_io_destroy(aio_context_t ctx); 505 asmlinkage long sys_io_destroy(aio_context_t ctx);
506 asmlinkage long sys_io_getevents(aio_context_t ctx_id, 506 asmlinkage long sys_io_getevents(aio_context_t ctx_id,
507 long min_nr, 507 long min_nr,
508 long nr, 508 long nr,
509 struct io_event __user *events, 509 struct io_event __user *events,
510 struct timespec __user *timeout); 510 struct timespec __user *timeout);
511 asmlinkage long sys_io_submit(aio_context_t, long, 511 asmlinkage long sys_io_submit(aio_context_t, long,
512 struct iocb __user * __user *); 512 struct iocb __user * __user *);
513 asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, 513 asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
514 struct io_event __user *result); 514 struct io_event __user *result);
515 asmlinkage long sys_sendfile(int out_fd, int in_fd, 515 asmlinkage long sys_sendfile(int out_fd, int in_fd,
516 off_t __user *offset, size_t count); 516 off_t __user *offset, size_t count);
517 asmlinkage long sys_sendfile64(int out_fd, int in_fd, 517 asmlinkage long sys_sendfile64(int out_fd, int in_fd,
518 loff_t __user *offset, size_t count); 518 loff_t __user *offset, size_t count);
519 asmlinkage long sys_readlink(const char __user *path, 519 asmlinkage long sys_readlink(const char __user *path,
520 char __user *buf, int bufsiz); 520 char __user *buf, int bufsiz);
521 asmlinkage long sys_creat(const char __user *pathname, umode_t mode); 521 asmlinkage long sys_creat(const char __user *pathname, umode_t mode);
522 asmlinkage long sys_open(const char __user *filename, 522 asmlinkage long sys_open(const char __user *filename,
523 int flags, umode_t mode); 523 int flags, umode_t mode);
524 asmlinkage long sys_close(unsigned int fd); 524 asmlinkage long sys_close(unsigned int fd);
525 asmlinkage long sys_access(const char __user *filename, int mode); 525 asmlinkage long sys_access(const char __user *filename, int mode);
526 asmlinkage long sys_vhangup(void); 526 asmlinkage long sys_vhangup(void);
527 asmlinkage long sys_chown(const char __user *filename, 527 asmlinkage long sys_chown(const char __user *filename,
528 uid_t user, gid_t group); 528 uid_t user, gid_t group);
529 asmlinkage long sys_lchown(const char __user *filename, 529 asmlinkage long sys_lchown(const char __user *filename,
530 uid_t user, gid_t group); 530 uid_t user, gid_t group);
531 asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); 531 asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
532 #ifdef CONFIG_UID16 532 #ifdef CONFIG_UID16
533 asmlinkage long sys_chown16(const char __user *filename, 533 asmlinkage long sys_chown16(const char __user *filename,
534 old_uid_t user, old_gid_t group); 534 old_uid_t user, old_gid_t group);
535 asmlinkage long sys_lchown16(const char __user *filename, 535 asmlinkage long sys_lchown16(const char __user *filename,
536 old_uid_t user, old_gid_t group); 536 old_uid_t user, old_gid_t group);
537 asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group); 537 asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group);
538 asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid); 538 asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid);
539 asmlinkage long sys_setgid16(old_gid_t gid); 539 asmlinkage long sys_setgid16(old_gid_t gid);
540 asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid); 540 asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid);
541 asmlinkage long sys_setuid16(old_uid_t uid); 541 asmlinkage long sys_setuid16(old_uid_t uid);
542 asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid); 542 asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid);
543 asmlinkage long sys_getresuid16(old_uid_t __user *ruid, 543 asmlinkage long sys_getresuid16(old_uid_t __user *ruid,
544 old_uid_t __user *euid, old_uid_t __user *suid); 544 old_uid_t __user *euid, old_uid_t __user *suid);
545 asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid); 545 asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid);
546 asmlinkage long sys_getresgid16(old_gid_t __user *rgid, 546 asmlinkage long sys_getresgid16(old_gid_t __user *rgid,
547 old_gid_t __user *egid, old_gid_t __user *sgid); 547 old_gid_t __user *egid, old_gid_t __user *sgid);
548 asmlinkage long sys_setfsuid16(old_uid_t uid); 548 asmlinkage long sys_setfsuid16(old_uid_t uid);
549 asmlinkage long sys_setfsgid16(old_gid_t gid); 549 asmlinkage long sys_setfsgid16(old_gid_t gid);
550 asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist); 550 asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist);
551 asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist); 551 asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist);
552 asmlinkage long sys_getuid16(void); 552 asmlinkage long sys_getuid16(void);
553 asmlinkage long sys_geteuid16(void); 553 asmlinkage long sys_geteuid16(void);
554 asmlinkage long sys_getgid16(void); 554 asmlinkage long sys_getgid16(void);
555 asmlinkage long sys_getegid16(void); 555 asmlinkage long sys_getegid16(void);
556 #endif 556 #endif
557 557
558 asmlinkage long sys_utime(char __user *filename, 558 asmlinkage long sys_utime(char __user *filename,
559 struct utimbuf __user *times); 559 struct utimbuf __user *times);
560 asmlinkage long sys_utimes(char __user *filename, 560 asmlinkage long sys_utimes(char __user *filename,
561 struct timeval __user *utimes); 561 struct timeval __user *utimes);
562 asmlinkage long sys_lseek(unsigned int fd, off_t offset, 562 asmlinkage long sys_lseek(unsigned int fd, off_t offset,
563 unsigned int origin); 563 unsigned int whence);
564 asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, 564 asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
565 unsigned long offset_low, loff_t __user *result, 565 unsigned long offset_low, loff_t __user *result,
566 unsigned int origin); 566 unsigned int whence);
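The two declarations above are the heart of this commit in syscalls.h: the third argument of sys_lseek and the final argument of sys_llseek are renamed from "origin" to "whence". A minimal userspace sketch, assuming only standard <unistd.h>/<fcntl.h> and a hypothetical file name, shows why "whence" is the conventional name: the argument selects the reference point the offset is measured from (SEEK_SET, SEEK_CUR, or SEEK_END). Note that sys_llseek is the 32-bit-ABI variant: it rebuilds the 64-bit offset from offset_high/offset_low and writes the resulting position through *result.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("example.dat", O_RDONLY);	/* hypothetical file */
		if (fd < 0)
			return 1;

		off_t beg = lseek(fd, 0, SEEK_SET);	/* whence = SEEK_SET: from the start */
		off_t cur = lseek(fd, 8, SEEK_CUR);	/* whence = SEEK_CUR: from the current position */
		off_t end = lseek(fd, 0, SEEK_END);	/* whence = SEEK_END: from the end, i.e. the file size */

		printf("beg=%lld cur=%lld size=%lld\n",
		       (long long)beg, (long long)cur, (long long)end);
		close(fd);
		return 0;
	}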
567 asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); 567 asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);
568 asmlinkage long sys_readahead(int fd, loff_t offset, size_t count); 568 asmlinkage long sys_readahead(int fd, loff_t offset, size_t count);
569 asmlinkage long sys_readv(unsigned long fd, 569 asmlinkage long sys_readv(unsigned long fd,
570 const struct iovec __user *vec, 570 const struct iovec __user *vec,
571 unsigned long vlen); 571 unsigned long vlen);
572 asmlinkage long sys_write(unsigned int fd, const char __user *buf, 572 asmlinkage long sys_write(unsigned int fd, const char __user *buf,
573 size_t count); 573 size_t count);
574 asmlinkage long sys_writev(unsigned long fd, 574 asmlinkage long sys_writev(unsigned long fd,
575 const struct iovec __user *vec, 575 const struct iovec __user *vec,
576 unsigned long vlen); 576 unsigned long vlen);
577 asmlinkage long sys_pread64(unsigned int fd, char __user *buf, 577 asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
578 size_t count, loff_t pos); 578 size_t count, loff_t pos);
579 asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, 579 asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
580 size_t count, loff_t pos); 580 size_t count, loff_t pos);
581 asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, 581 asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
582 unsigned long vlen, unsigned long pos_l, unsigned long pos_h); 582 unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
583 asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, 583 asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
584 unsigned long vlen, unsigned long pos_l, unsigned long pos_h); 584 unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
585 asmlinkage long sys_getcwd(char __user *buf, unsigned long size); 585 asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
586 asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode); 586 asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode);
587 asmlinkage long sys_chdir(const char __user *filename); 587 asmlinkage long sys_chdir(const char __user *filename);
588 asmlinkage long sys_fchdir(unsigned int fd); 588 asmlinkage long sys_fchdir(unsigned int fd);
589 asmlinkage long sys_rmdir(const char __user *pathname); 589 asmlinkage long sys_rmdir(const char __user *pathname);
590 asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len); 590 asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len);
591 asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, 591 asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special,
592 qid_t id, void __user *addr); 592 qid_t id, void __user *addr);
593 asmlinkage long sys_getdents(unsigned int fd, 593 asmlinkage long sys_getdents(unsigned int fd,
594 struct linux_dirent __user *dirent, 594 struct linux_dirent __user *dirent,
595 unsigned int count); 595 unsigned int count);
596 asmlinkage long sys_getdents64(unsigned int fd, 596 asmlinkage long sys_getdents64(unsigned int fd,
597 struct linux_dirent64 __user *dirent, 597 struct linux_dirent64 __user *dirent,
598 unsigned int count); 598 unsigned int count);
599 599
600 asmlinkage long sys_setsockopt(int fd, int level, int optname, 600 asmlinkage long sys_setsockopt(int fd, int level, int optname,
601 char __user *optval, int optlen); 601 char __user *optval, int optlen);
602 asmlinkage long sys_getsockopt(int fd, int level, int optname, 602 asmlinkage long sys_getsockopt(int fd, int level, int optname,
603 char __user *optval, int __user *optlen); 603 char __user *optval, int __user *optlen);
604 asmlinkage long sys_bind(int, struct sockaddr __user *, int); 604 asmlinkage long sys_bind(int, struct sockaddr __user *, int);
605 asmlinkage long sys_connect(int, struct sockaddr __user *, int); 605 asmlinkage long sys_connect(int, struct sockaddr __user *, int);
606 asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *); 606 asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *);
607 asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int); 607 asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
608 asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *); 608 asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *);
609 asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *); 609 asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *);
610 asmlinkage long sys_send(int, void __user *, size_t, unsigned); 610 asmlinkage long sys_send(int, void __user *, size_t, unsigned);
611 asmlinkage long sys_sendto(int, void __user *, size_t, unsigned, 611 asmlinkage long sys_sendto(int, void __user *, size_t, unsigned,
612 struct sockaddr __user *, int); 612 struct sockaddr __user *, int);
613 asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags); 613 asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
614 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg, 614 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
615 unsigned int vlen, unsigned flags); 615 unsigned int vlen, unsigned flags);
616 asmlinkage long sys_recv(int, void __user *, size_t, unsigned); 616 asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
617 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned, 617 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
618 struct sockaddr __user *, int __user *); 618 struct sockaddr __user *, int __user *);
619 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags); 619 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
620 asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg, 620 asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
621 unsigned int vlen, unsigned flags, 621 unsigned int vlen, unsigned flags,
622 struct timespec __user *timeout); 622 struct timespec __user *timeout);
623 asmlinkage long sys_socket(int, int, int); 623 asmlinkage long sys_socket(int, int, int);
624 asmlinkage long sys_socketpair(int, int, int, int __user *); 624 asmlinkage long sys_socketpair(int, int, int, int __user *);
625 asmlinkage long sys_socketcall(int call, unsigned long __user *args); 625 asmlinkage long sys_socketcall(int call, unsigned long __user *args);
626 asmlinkage long sys_listen(int, int); 626 asmlinkage long sys_listen(int, int);
627 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, 627 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
628 int timeout); 628 int timeout);
629 asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, 629 asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
630 fd_set __user *exp, struct timeval __user *tvp); 630 fd_set __user *exp, struct timeval __user *tvp);
631 asmlinkage long sys_old_select(struct sel_arg_struct __user *arg); 631 asmlinkage long sys_old_select(struct sel_arg_struct __user *arg);
632 asmlinkage long sys_epoll_create(int size); 632 asmlinkage long sys_epoll_create(int size);
633 asmlinkage long sys_epoll_create1(int flags); 633 asmlinkage long sys_epoll_create1(int flags);
634 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, 634 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
635 struct epoll_event __user *event); 635 struct epoll_event __user *event);
636 asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, 636 asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
637 int maxevents, int timeout); 637 int maxevents, int timeout);
638 asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, 638 asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
639 int maxevents, int timeout, 639 int maxevents, int timeout,
640 const sigset_t __user *sigmask, 640 const sigset_t __user *sigmask,
641 size_t sigsetsize); 641 size_t sigsetsize);
642 asmlinkage long sys_gethostname(char __user *name, int len); 642 asmlinkage long sys_gethostname(char __user *name, int len);
643 asmlinkage long sys_sethostname(char __user *name, int len); 643 asmlinkage long sys_sethostname(char __user *name, int len);
644 asmlinkage long sys_setdomainname(char __user *name, int len); 644 asmlinkage long sys_setdomainname(char __user *name, int len);
645 asmlinkage long sys_newuname(struct new_utsname __user *name); 645 asmlinkage long sys_newuname(struct new_utsname __user *name);
646 asmlinkage long sys_uname(struct old_utsname __user *); 646 asmlinkage long sys_uname(struct old_utsname __user *);
647 asmlinkage long sys_olduname(struct oldold_utsname __user *); 647 asmlinkage long sys_olduname(struct oldold_utsname __user *);
648 648
649 asmlinkage long sys_getrlimit(unsigned int resource, 649 asmlinkage long sys_getrlimit(unsigned int resource,
650 struct rlimit __user *rlim); 650 struct rlimit __user *rlim);
651 #if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64)) 651 #if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64))
652 asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim); 652 asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim);
653 #endif 653 #endif
654 asmlinkage long sys_setrlimit(unsigned int resource, 654 asmlinkage long sys_setrlimit(unsigned int resource,
655 struct rlimit __user *rlim); 655 struct rlimit __user *rlim);
656 asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource, 656 asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
657 const struct rlimit64 __user *new_rlim, 657 const struct rlimit64 __user *new_rlim,
658 struct rlimit64 __user *old_rlim); 658 struct rlimit64 __user *old_rlim);
659 asmlinkage long sys_getrusage(int who, struct rusage __user *ru); 659 asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
660 asmlinkage long sys_umask(int mask); 660 asmlinkage long sys_umask(int mask);
661 661
662 asmlinkage long sys_msgget(key_t key, int msgflg); 662 asmlinkage long sys_msgget(key_t key, int msgflg);
663 asmlinkage long sys_msgsnd(int msqid, struct msgbuf __user *msgp, 663 asmlinkage long sys_msgsnd(int msqid, struct msgbuf __user *msgp,
664 size_t msgsz, int msgflg); 664 size_t msgsz, int msgflg);
665 asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp, 665 asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp,
666 size_t msgsz, long msgtyp, int msgflg); 666 size_t msgsz, long msgtyp, int msgflg);
667 asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf); 667 asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf);
668 668
669 asmlinkage long sys_semget(key_t key, int nsems, int semflg); 669 asmlinkage long sys_semget(key_t key, int nsems, int semflg);
670 asmlinkage long sys_semop(int semid, struct sembuf __user *sops, 670 asmlinkage long sys_semop(int semid, struct sembuf __user *sops,
671 unsigned nsops); 671 unsigned nsops);
672 asmlinkage long sys_semctl(int semid, int semnum, int cmd, union semun arg); 672 asmlinkage long sys_semctl(int semid, int semnum, int cmd, union semun arg);
673 asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops, 673 asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops,
674 unsigned nsops, 674 unsigned nsops,
675 const struct timespec __user *timeout); 675 const struct timespec __user *timeout);
676 asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg); 676 asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
677 asmlinkage long sys_shmget(key_t key, size_t size, int flag); 677 asmlinkage long sys_shmget(key_t key, size_t size, int flag);
678 asmlinkage long sys_shmdt(char __user *shmaddr); 678 asmlinkage long sys_shmdt(char __user *shmaddr);
679 asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); 679 asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
680 asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second, 680 asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second,
681 unsigned long third, void __user *ptr, long fifth); 681 unsigned long third, void __user *ptr, long fifth);
682 682
683 asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr); 683 asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr);
684 asmlinkage long sys_mq_unlink(const char __user *name); 684 asmlinkage long sys_mq_unlink(const char __user *name);
685 asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); 685 asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
686 asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); 686 asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
687 asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); 687 asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
688 asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); 688 asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
689 689
690 asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn); 690 asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn);
691 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, 691 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
692 unsigned long off, unsigned long len, 692 unsigned long off, unsigned long len,
693 void __user *buf); 693 void __user *buf);
694 asmlinkage long sys_pciconfig_write(unsigned long bus, unsigned long dfn, 694 asmlinkage long sys_pciconfig_write(unsigned long bus, unsigned long dfn,
695 unsigned long off, unsigned long len, 695 unsigned long off, unsigned long len,
696 void __user *buf); 696 void __user *buf);
697 697
698 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 698 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
699 unsigned long arg4, unsigned long arg5); 699 unsigned long arg4, unsigned long arg5);
700 asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags); 700 asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
701 asmlinkage long sys_swapoff(const char __user *specialfile); 701 asmlinkage long sys_swapoff(const char __user *specialfile);
702 asmlinkage long sys_sysctl(struct __sysctl_args __user *args); 702 asmlinkage long sys_sysctl(struct __sysctl_args __user *args);
703 asmlinkage long sys_sysinfo(struct sysinfo __user *info); 703 asmlinkage long sys_sysinfo(struct sysinfo __user *info);
704 asmlinkage long sys_sysfs(int option, 704 asmlinkage long sys_sysfs(int option,
705 unsigned long arg1, unsigned long arg2); 705 unsigned long arg1, unsigned long arg2);
706 asmlinkage long sys_syslog(int type, char __user *buf, int len); 706 asmlinkage long sys_syslog(int type, char __user *buf, int len);
707 asmlinkage long sys_uselib(const char __user *library); 707 asmlinkage long sys_uselib(const char __user *library);
708 asmlinkage long sys_ni_syscall(void); 708 asmlinkage long sys_ni_syscall(void);
709 asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, 709 asmlinkage long sys_ptrace(long request, long pid, unsigned long addr,
710 unsigned long data); 710 unsigned long data);
711 711
712 asmlinkage long sys_add_key(const char __user *_type, 712 asmlinkage long sys_add_key(const char __user *_type,
713 const char __user *_description, 713 const char __user *_description,
714 const void __user *_payload, 714 const void __user *_payload,
715 size_t plen, 715 size_t plen,
716 key_serial_t destringid); 716 key_serial_t destringid);
717 717
718 asmlinkage long sys_request_key(const char __user *_type, 718 asmlinkage long sys_request_key(const char __user *_type,
719 const char __user *_description, 719 const char __user *_description,
720 const char __user *_callout_info, 720 const char __user *_callout_info,
721 key_serial_t destringid); 721 key_serial_t destringid);
722 722
723 asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3, 723 asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
724 unsigned long arg4, unsigned long arg5); 724 unsigned long arg4, unsigned long arg5);
725 725
726 asmlinkage long sys_ioprio_set(int which, int who, int ioprio); 726 asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
727 asmlinkage long sys_ioprio_get(int which, int who); 727 asmlinkage long sys_ioprio_get(int which, int who);
728 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 728 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
729 unsigned long maxnode); 729 unsigned long maxnode);
730 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, 730 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
731 const unsigned long __user *from, 731 const unsigned long __user *from,
732 const unsigned long __user *to); 732 const unsigned long __user *to);
733 asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, 733 asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
734 const void __user * __user *pages, 734 const void __user * __user *pages,
735 const int __user *nodes, 735 const int __user *nodes,
736 int __user *status, 736 int __user *status,
737 int flags); 737 int flags);
738 asmlinkage long sys_mbind(unsigned long start, unsigned long len, 738 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
739 unsigned long mode, 739 unsigned long mode,
740 unsigned long __user *nmask, 740 unsigned long __user *nmask,
741 unsigned long maxnode, 741 unsigned long maxnode,
742 unsigned flags); 742 unsigned flags);
743 asmlinkage long sys_get_mempolicy(int __user *policy, 743 asmlinkage long sys_get_mempolicy(int __user *policy,
744 unsigned long __user *nmask, 744 unsigned long __user *nmask,
745 unsigned long maxnode, 745 unsigned long maxnode,
746 unsigned long addr, unsigned long flags); 746 unsigned long addr, unsigned long flags);
747 747
748 asmlinkage long sys_inotify_init(void); 748 asmlinkage long sys_inotify_init(void);
749 asmlinkage long sys_inotify_init1(int flags); 749 asmlinkage long sys_inotify_init1(int flags);
750 asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, 750 asmlinkage long sys_inotify_add_watch(int fd, const char __user *path,
751 u32 mask); 751 u32 mask);
752 asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd); 752 asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd);
753 753
754 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, 754 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
755 __u32 __user *ustatus); 755 __u32 __user *ustatus);
756 asmlinkage long sys_spu_create(const char __user *name, 756 asmlinkage long sys_spu_create(const char __user *name,
757 unsigned int flags, umode_t mode, int fd); 757 unsigned int flags, umode_t mode, int fd);
758 758
759 asmlinkage long sys_mknodat(int dfd, const char __user * filename, umode_t mode, 759 asmlinkage long sys_mknodat(int dfd, const char __user * filename, umode_t mode,
760 unsigned dev); 760 unsigned dev);
761 asmlinkage long sys_mkdirat(int dfd, const char __user * pathname, umode_t mode); 761 asmlinkage long sys_mkdirat(int dfd, const char __user * pathname, umode_t mode);
762 asmlinkage long sys_unlinkat(int dfd, const char __user * pathname, int flag); 762 asmlinkage long sys_unlinkat(int dfd, const char __user * pathname, int flag);
763 asmlinkage long sys_symlinkat(const char __user * oldname, 763 asmlinkage long sys_symlinkat(const char __user * oldname,
764 int newdfd, const char __user * newname); 764 int newdfd, const char __user * newname);
765 asmlinkage long sys_linkat(int olddfd, const char __user *oldname, 765 asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
766 int newdfd, const char __user *newname, int flags); 766 int newdfd, const char __user *newname, int flags);
767 asmlinkage long sys_renameat(int olddfd, const char __user * oldname, 767 asmlinkage long sys_renameat(int olddfd, const char __user * oldname,
768 int newdfd, const char __user * newname); 768 int newdfd, const char __user * newname);
769 asmlinkage long sys_futimesat(int dfd, const char __user *filename, 769 asmlinkage long sys_futimesat(int dfd, const char __user *filename,
770 struct timeval __user *utimes); 770 struct timeval __user *utimes);
771 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode); 771 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
772 asmlinkage long sys_fchmodat(int dfd, const char __user * filename, 772 asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
773 umode_t mode); 773 umode_t mode);
774 asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, 774 asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
775 gid_t group, int flag); 775 gid_t group, int flag);
776 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, 776 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
777 umode_t mode); 777 umode_t mode);
778 asmlinkage long sys_newfstatat(int dfd, const char __user *filename, 778 asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
779 struct stat __user *statbuf, int flag); 779 struct stat __user *statbuf, int flag);
780 asmlinkage long sys_fstatat64(int dfd, const char __user *filename, 780 asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
781 struct stat64 __user *statbuf, int flag); 781 struct stat64 __user *statbuf, int flag);
782 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, 782 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
783 int bufsiz); 783 int bufsiz);
784 asmlinkage long sys_utimensat(int dfd, const char __user *filename, 784 asmlinkage long sys_utimensat(int dfd, const char __user *filename,
785 struct timespec __user *utimes, int flags); 785 struct timespec __user *utimes, int flags);
786 asmlinkage long sys_unshare(unsigned long unshare_flags); 786 asmlinkage long sys_unshare(unsigned long unshare_flags);
787 787
788 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 788 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
789 int fd_out, loff_t __user *off_out, 789 int fd_out, loff_t __user *off_out,
790 size_t len, unsigned int flags); 790 size_t len, unsigned int flags);
791 791
792 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 792 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
793 unsigned long nr_segs, unsigned int flags); 793 unsigned long nr_segs, unsigned int flags);
794 794
795 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags); 795 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
796 796
797 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, 797 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
798 unsigned int flags); 798 unsigned int flags);
799 asmlinkage long sys_sync_file_range2(int fd, unsigned int flags, 799 asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
800 loff_t offset, loff_t nbytes); 800 loff_t offset, loff_t nbytes);
801 asmlinkage long sys_get_robust_list(int pid, 801 asmlinkage long sys_get_robust_list(int pid,
802 struct robust_list_head __user * __user *head_ptr, 802 struct robust_list_head __user * __user *head_ptr,
803 size_t __user *len_ptr); 803 size_t __user *len_ptr);
804 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, 804 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
805 size_t len); 805 size_t len);
806 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache); 806 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
807 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask); 807 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask);
808 asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags); 808 asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags);
809 asmlinkage long sys_timerfd_create(int clockid, int flags); 809 asmlinkage long sys_timerfd_create(int clockid, int flags);
810 asmlinkage long sys_timerfd_settime(int ufd, int flags, 810 asmlinkage long sys_timerfd_settime(int ufd, int flags,
811 const struct itimerspec __user *utmr, 811 const struct itimerspec __user *utmr,
812 struct itimerspec __user *otmr); 812 struct itimerspec __user *otmr);
813 asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); 813 asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
814 asmlinkage long sys_eventfd(unsigned int count); 814 asmlinkage long sys_eventfd(unsigned int count);
815 asmlinkage long sys_eventfd2(unsigned int count, int flags); 815 asmlinkage long sys_eventfd2(unsigned int count, int flags);
816 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); 816 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
817 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); 817 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
818 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, 818 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
819 fd_set __user *, struct timespec __user *, 819 fd_set __user *, struct timespec __user *,
820 void __user *); 820 void __user *);
821 asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, 821 asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
822 struct timespec __user *, const sigset_t __user *, 822 struct timespec __user *, const sigset_t __user *,
823 size_t); 823 size_t);
824 asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags); 824 asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
825 asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, 825 asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
826 u64 mask, int fd, 826 u64 mask, int fd,
827 const char __user *pathname); 827 const char __user *pathname);
828 asmlinkage long sys_syncfs(int fd); 828 asmlinkage long sys_syncfs(int fd);
829 829
830 #ifndef CONFIG_GENERIC_KERNEL_EXECVE 830 #ifndef CONFIG_GENERIC_KERNEL_EXECVE
831 int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]); 831 int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]);
832 #else 832 #else
833 #define kernel_execve(filename, argv, envp) \ 833 #define kernel_execve(filename, argv, envp) \
834 do_execve(filename, \ 834 do_execve(filename, \
835 (const char __user *const __user *)argv, \ 835 (const char __user *const __user *)argv, \
836 (const char __user *const __user *)envp) 836 (const char __user *const __user *)envp)
837 #endif 837 #endif
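When CONFIG_GENERIC_KERNEL_EXECVE is set, the macro above forwards kernel-resident argv/envp arrays to do_execve() by force-casting them to __user pointers; otherwise an arch-provided kernel_execve() is declared. A hedged in-kernel sketch of a caller follows (the helper name and the argument strings are illustrative, not from this commit):

	static int run_helper(void)
	{
		static const char *const argv[] = { "/sbin/modprobe", "-q", "dummy", NULL };
		static const char *const envp[] = { "HOME=/", "PATH=/sbin:/bin", NULL };

		/* Expands to do_execve(...) under CONFIG_GENERIC_KERNEL_EXECVE,
		 * otherwise calls the arch implementation of kernel_execve(). */
		return kernel_execve(argv[0], argv, envp);
	}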
838 838
839 asmlinkage long sys_fork(void); 839 asmlinkage long sys_fork(void);
840 asmlinkage long sys_vfork(void); 840 asmlinkage long sys_vfork(void);
841 #ifdef CONFIG_CLONE_BACKWARDS 841 #ifdef CONFIG_CLONE_BACKWARDS
842 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, 842 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
843 int __user *); 843 int __user *);
844 #else 844 #else
845 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, 845 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
846 int __user *, int); 846 int __user *, int);
847 #endif 847 #endif
848 848
849 asmlinkage long sys_execve(const char __user *filename, 849 asmlinkage long sys_execve(const char __user *filename,
850 const char __user *const __user *argv, 850 const char __user *const __user *argv,
851 const char __user *const __user *envp); 851 const char __user *const __user *envp);
852 852
853 asmlinkage long sys_perf_event_open( 853 asmlinkage long sys_perf_event_open(
854 struct perf_event_attr __user *attr_uptr, 854 struct perf_event_attr __user *attr_uptr,
855 pid_t pid, int cpu, int group_fd, unsigned long flags); 855 pid_t pid, int cpu, int group_fd, unsigned long flags);
856 856
857 asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len, 857 asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
858 unsigned long prot, unsigned long flags, 858 unsigned long prot, unsigned long flags,
859 unsigned long fd, unsigned long pgoff); 859 unsigned long fd, unsigned long pgoff);
860 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); 860 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
861 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, 861 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
862 struct file_handle __user *handle, 862 struct file_handle __user *handle,
863 int __user *mnt_id, int flag); 863 int __user *mnt_id, int flag);
864 asmlinkage long sys_open_by_handle_at(int mountdirfd, 864 asmlinkage long sys_open_by_handle_at(int mountdirfd,
865 struct file_handle __user *handle, 865 struct file_handle __user *handle,
866 int flags); 866 int flags);
867 asmlinkage long sys_setns(int fd, int nstype); 867 asmlinkage long sys_setns(int fd, int nstype);
868 asmlinkage long sys_process_vm_readv(pid_t pid, 868 asmlinkage long sys_process_vm_readv(pid_t pid,
869 const struct iovec __user *lvec, 869 const struct iovec __user *lvec,
870 unsigned long liovcnt, 870 unsigned long liovcnt,
871 const struct iovec __user *rvec, 871 const struct iovec __user *rvec,
872 unsigned long riovcnt, 872 unsigned long riovcnt,
873 unsigned long flags); 873 unsigned long flags);
874 asmlinkage long sys_process_vm_writev(pid_t pid, 874 asmlinkage long sys_process_vm_writev(pid_t pid,
875 const struct iovec __user *lvec, 875 const struct iovec __user *lvec,
876 unsigned long liovcnt, 876 unsigned long liovcnt,
877 const struct iovec __user *rvec, 877 const struct iovec __user *rvec,
878 unsigned long riovcnt, 878 unsigned long riovcnt,
879 unsigned long flags); 879 unsigned long flags);
880 880
881 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, 881 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
882 unsigned long idx1, unsigned long idx2); 882 unsigned long idx1, unsigned long idx2);
883 #endif 883 #endif
884 884
kernel/trace/ftrace.c
1 /* 1 /*
2 * Infrastructure for profiling code inserted by 'gcc -pg'. 2 * Infrastructure for profiling code inserted by 'gcc -pg'.
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
6 * 6 *
7 * Originally ported from the -rt patch by: 7 * Originally ported from the -rt patch by:
8 * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> 8 * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
9 * 9 *
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 Nadia Yvette Chambers 13 * Copyright (C) 2004 Nadia Yvette Chambers
14 */ 14 */
15 15
16 #include <linux/stop_machine.h> 16 #include <linux/stop_machine.h>
17 #include <linux/clocksource.h> 17 #include <linux/clocksource.h>
18 #include <linux/kallsyms.h> 18 #include <linux/kallsyms.h>
19 #include <linux/seq_file.h> 19 #include <linux/seq_file.h>
20 #include <linux/suspend.h> 20 #include <linux/suspend.h>
21 #include <linux/debugfs.h> 21 #include <linux/debugfs.h>
22 #include <linux/hardirq.h> 22 #include <linux/hardirq.h>
23 #include <linux/kthread.h> 23 #include <linux/kthread.h>
24 #include <linux/uaccess.h> 24 #include <linux/uaccess.h>
25 #include <linux/bsearch.h> 25 #include <linux/bsearch.h>
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/ftrace.h> 27 #include <linux/ftrace.h>
28 #include <linux/sysctl.h> 28 #include <linux/sysctl.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/ctype.h> 30 #include <linux/ctype.h>
31 #include <linux/sort.h> 31 #include <linux/sort.h>
32 #include <linux/list.h> 32 #include <linux/list.h>
33 #include <linux/hash.h> 33 #include <linux/hash.h>
34 #include <linux/rcupdate.h> 34 #include <linux/rcupdate.h>
35 35
36 #include <trace/events/sched.h> 36 #include <trace/events/sched.h>
37 37
38 #include <asm/setup.h> 38 #include <asm/setup.h>
39 39
40 #include "trace_output.h" 40 #include "trace_output.h"
41 #include "trace_stat.h" 41 #include "trace_stat.h"
42 42
43 #define FTRACE_WARN_ON(cond) \ 43 #define FTRACE_WARN_ON(cond) \
44 ({ \ 44 ({ \
45 int ___r = cond; \ 45 int ___r = cond; \
46 if (WARN_ON(___r)) \ 46 if (WARN_ON(___r)) \
47 ftrace_kill(); \ 47 ftrace_kill(); \
48 ___r; \ 48 ___r; \
49 }) 49 })
50 50
51 #define FTRACE_WARN_ON_ONCE(cond) \ 51 #define FTRACE_WARN_ON_ONCE(cond) \
52 ({ \ 52 ({ \
53 int ___r = cond; \ 53 int ___r = cond; \
54 if (WARN_ON_ONCE(___r)) \ 54 if (WARN_ON_ONCE(___r)) \
55 ftrace_kill(); \ 55 ftrace_kill(); \
56 ___r; \ 56 ___r; \
57 }) 57 })
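Both macros above are GNU statement expressions: they evaluate the condition once, shut tracing down via ftrace_kill() if the warning fires, and still yield the condition's value, so callers can branch on them directly. A hedged usage sketch (the checking function is hypothetical):

	static int check_rec(struct dyn_ftrace *rec)
	{
		/* WARN_ON_ONCE fires and ftrace_kill() runs inside the macro;
		 * the condition's value is still returned for the branch. */
		if (FTRACE_WARN_ON_ONCE(!rec))
			return -EINVAL;
		return 0;
	}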
58 58
59 /* hash bits for specific function selection */ 59 /* hash bits for specific function selection */
60 #define FTRACE_HASH_BITS 7 60 #define FTRACE_HASH_BITS 7
61 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 61 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
62 #define FTRACE_HASH_DEFAULT_BITS 10 62 #define FTRACE_HASH_DEFAULT_BITS 10
63 #define FTRACE_HASH_MAX_BITS 12 63 #define FTRACE_HASH_MAX_BITS 12
64 64
65 #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) 65 #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66 66
67 static struct ftrace_ops ftrace_list_end __read_mostly = { 67 static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub, 68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 69 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
70 }; 70 };
71 71
72 /* ftrace_enabled is a method to turn ftrace on or off */ 72 /* ftrace_enabled is a method to turn ftrace on or off */
73 int ftrace_enabled __read_mostly; 73 int ftrace_enabled __read_mostly;
74 static int last_ftrace_enabled; 74 static int last_ftrace_enabled;
75 75
76 /* Quick disabling of function tracer. */ 76 /* Quick disabling of function tracer. */
77 int function_trace_stop __read_mostly; 77 int function_trace_stop __read_mostly;
78 78
79 /* Current function tracing op */ 79 /* Current function tracing op */
80 struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; 80 struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
81 81
82 /* List for set_ftrace_pid's pids. */ 82 /* List for set_ftrace_pid's pids. */
83 LIST_HEAD(ftrace_pids); 83 LIST_HEAD(ftrace_pids);
84 struct ftrace_pid { 84 struct ftrace_pid {
85 struct list_head list; 85 struct list_head list;
86 struct pid *pid; 86 struct pid *pid;
87 }; 87 };
88 88
89 /* 89 /*
90 * ftrace_disabled is set when an anomaly is discovered. 90 * ftrace_disabled is set when an anomaly is discovered.
91 * ftrace_disabled is much stronger than ftrace_enabled. 91 * ftrace_disabled is much stronger than ftrace_enabled.
92 */ 92 */
93 static int ftrace_disabled __read_mostly; 93 static int ftrace_disabled __read_mostly;
94 94
95 static DEFINE_MUTEX(ftrace_lock); 95 static DEFINE_MUTEX(ftrace_lock);
96 96
97 static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 97 static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
98 static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 98 static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
99 static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 99 static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
100 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 100 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
101 ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 101 ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
102 static struct ftrace_ops global_ops; 102 static struct ftrace_ops global_ops;
103 static struct ftrace_ops control_ops; 103 static struct ftrace_ops control_ops;
104 104
105 #if ARCH_SUPPORTS_FTRACE_OPS 105 #if ARCH_SUPPORTS_FTRACE_OPS
106 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 106 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
107 struct ftrace_ops *op, struct pt_regs *regs); 107 struct ftrace_ops *op, struct pt_regs *regs);
108 #else 108 #else
109 /* See comment below, where ftrace_ops_list_func is defined */ 109 /* See comment below, where ftrace_ops_list_func is defined */
110 static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); 110 static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111 #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) 111 #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112 #endif 112 #endif
113 113
114 /** 114 /**
115 * ftrace_nr_registered_ops - return number of ops registered 115 * ftrace_nr_registered_ops - return number of ops registered
116 * 116 *
117 * Returns the number of ftrace_ops registered and tracing functions 117 * Returns the number of ftrace_ops registered and tracing functions
118 */ 118 */
119 int ftrace_nr_registered_ops(void) 119 int ftrace_nr_registered_ops(void)
120 { 120 {
121 struct ftrace_ops *ops; 121 struct ftrace_ops *ops;
122 int cnt = 0; 122 int cnt = 0;
123 123
124 mutex_lock(&ftrace_lock); 124 mutex_lock(&ftrace_lock);
125 125
126 for (ops = ftrace_ops_list; 126 for (ops = ftrace_ops_list;
127 ops != &ftrace_list_end; ops = ops->next) 127 ops != &ftrace_list_end; ops = ops->next)
128 cnt++; 128 cnt++;
129 129
130 mutex_unlock(&ftrace_lock); 130 mutex_unlock(&ftrace_lock);
131 131
132 return cnt; 132 return cnt;
133 } 133 }
134 134
135 /* 135 /*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we 136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
137 * can use rcu_dereference_raw() is that elements removed from this list 137 * can use rcu_dereference_raw() is that elements removed from this list
138 * are simply leaked, so there is no need to interact with a grace-period 138 * are simply leaked, so there is no need to interact with a grace-period
139 * mechanism. The rcu_dereference_raw() calls are needed to handle 139 * mechanism. The rcu_dereference_raw() calls are needed to handle
140 * concurrent insertions into the ftrace_global_list. 140 * concurrent insertions into the ftrace_global_list.
141 * 141 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations! 142 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */ 143 */
144 static void 144 static void
145 ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 145 ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs) 146 struct ftrace_ops *op, struct pt_regs *regs)
147 { 147 {
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
149 return; 149 return;
150 150
151 trace_recursion_set(TRACE_GLOBAL_BIT); 151 trace_recursion_set(TRACE_GLOBAL_BIT);
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/ 152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) { 153 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 154 op->func(ip, parent_ip, op, regs);
155 op = rcu_dereference_raw(op->next); /*see above*/ 155 op = rcu_dereference_raw(op->next); /*see above*/
156 }; 156 };
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 157 trace_recursion_clear(TRACE_GLOBAL_BIT);
158 } 158 }
159 159
160 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 160 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
161 struct ftrace_ops *op, struct pt_regs *regs) 161 struct ftrace_ops *op, struct pt_regs *regs)
162 { 162 {
163 if (!test_tsk_trace_trace(current)) 163 if (!test_tsk_trace_trace(current))
164 return; 164 return;
165 165
166 ftrace_pid_function(ip, parent_ip, op, regs); 166 ftrace_pid_function(ip, parent_ip, op, regs);
167 } 167 }
168 168
169 static void set_ftrace_pid_function(ftrace_func_t func) 169 static void set_ftrace_pid_function(ftrace_func_t func)
170 { 170 {
171 /* do not set ftrace_pid_function to itself! */ 171 /* do not set ftrace_pid_function to itself! */
172 if (func != ftrace_pid_func) 172 if (func != ftrace_pid_func)
173 ftrace_pid_function = func; 173 ftrace_pid_function = func;
174 } 174 }
175 175
176 /** 176 /**
177 * clear_ftrace_function - reset the ftrace function 177 * clear_ftrace_function - reset the ftrace function
178 * 178 *
179 * This NULLs the ftrace function and in essence stops 179 * This NULLs the ftrace function and in essence stops
180 * tracing. There may be a lag before all CPUs see the stub. 180 * tracing. There may be a lag before all CPUs see the stub.
181 */ 181 */
182 void clear_ftrace_function(void) 182 void clear_ftrace_function(void)
183 { 183 {
184 ftrace_trace_function = ftrace_stub; 184 ftrace_trace_function = ftrace_stub;
185 ftrace_pid_function = ftrace_stub; 185 ftrace_pid_function = ftrace_stub;
186 } 186 }
187 187
188 static void control_ops_disable_all(struct ftrace_ops *ops) 188 static void control_ops_disable_all(struct ftrace_ops *ops)
189 { 189 {
190 int cpu; 190 int cpu;
191 191
192 for_each_possible_cpu(cpu) 192 for_each_possible_cpu(cpu)
193 *per_cpu_ptr(ops->disabled, cpu) = 1; 193 *per_cpu_ptr(ops->disabled, cpu) = 1;
194 } 194 }
195 195
196 static int control_ops_alloc(struct ftrace_ops *ops) 196 static int control_ops_alloc(struct ftrace_ops *ops)
197 { 197 {
198 int __percpu *disabled; 198 int __percpu *disabled;
199 199
200 disabled = alloc_percpu(int); 200 disabled = alloc_percpu(int);
201 if (!disabled) 201 if (!disabled)
202 return -ENOMEM; 202 return -ENOMEM;
203 203
204 ops->disabled = disabled; 204 ops->disabled = disabled;
205 control_ops_disable_all(ops); 205 control_ops_disable_all(ops);
206 return 0; 206 return 0;
207 } 207 }
208 208
209 static void control_ops_free(struct ftrace_ops *ops) 209 static void control_ops_free(struct ftrace_ops *ops)
210 { 210 {
211 free_percpu(ops->disabled); 211 free_percpu(ops->disabled);
212 } 212 }
213 213
214 static void update_global_ops(void) 214 static void update_global_ops(void)
215 { 215 {
216 ftrace_func_t func; 216 ftrace_func_t func;
217 217
218 /* 218 /*
219 * If there's only one function registered, then call that 219 * If there's only one function registered, then call that
220 * function directly. Otherwise, we need to iterate over the 220 * function directly. Otherwise, we need to iterate over the
221 * registered callers. 221 * registered callers.
222 */ 222 */
223 if (ftrace_global_list == &ftrace_list_end || 223 if (ftrace_global_list == &ftrace_list_end ||
224 ftrace_global_list->next == &ftrace_list_end) 224 ftrace_global_list->next == &ftrace_list_end)
225 func = ftrace_global_list->func; 225 func = ftrace_global_list->func;
226 else 226 else
227 func = ftrace_global_list_func; 227 func = ftrace_global_list_func;
228 228
229 /* If we filter on pids, update to use the pid function */ 229 /* If we filter on pids, update to use the pid function */
230 if (!list_empty(&ftrace_pids)) { 230 if (!list_empty(&ftrace_pids)) {
231 set_ftrace_pid_function(func); 231 set_ftrace_pid_function(func);
232 func = ftrace_pid_func; 232 func = ftrace_pid_func;
233 } 233 }
234 234
235 global_ops.func = func; 235 global_ops.func = func;
236 } 236 }
237 237
238 static void update_ftrace_function(void) 238 static void update_ftrace_function(void)
239 { 239 {
240 ftrace_func_t func; 240 ftrace_func_t func;
241 241
242 update_global_ops(); 242 update_global_ops();
243 243
244 /* 244 /*
245 * If we are at the end of the list and this ops is 245 * If we are at the end of the list and this ops is
246 * recursion safe and not dynamic and the arch supports passing ops, 246 * recursion safe and not dynamic and the arch supports passing ops,
247 * then have the mcount trampoline call the function directly. 247 * then have the mcount trampoline call the function directly.
248 */ 248 */
249 if (ftrace_ops_list == &ftrace_list_end || 249 if (ftrace_ops_list == &ftrace_list_end ||
250 (ftrace_ops_list->next == &ftrace_list_end && 250 (ftrace_ops_list->next == &ftrace_list_end &&
251 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && 251 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
252 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && 252 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
253 !FTRACE_FORCE_LIST_FUNC)) { 253 !FTRACE_FORCE_LIST_FUNC)) {
254 /* Set the ftrace_ops that the arch callback uses */ 254 /* Set the ftrace_ops that the arch callback uses */
255 if (ftrace_ops_list == &global_ops) 255 if (ftrace_ops_list == &global_ops)
256 function_trace_op = ftrace_global_list; 256 function_trace_op = ftrace_global_list;
257 else 257 else
258 function_trace_op = ftrace_ops_list; 258 function_trace_op = ftrace_ops_list;
259 func = ftrace_ops_list->func; 259 func = ftrace_ops_list->func;
260 } else { 260 } else {
261 /* Just use the default ftrace_ops */ 261 /* Just use the default ftrace_ops */
262 function_trace_op = &ftrace_list_end; 262 function_trace_op = &ftrace_list_end;
263 func = ftrace_ops_list_func; 263 func = ftrace_ops_list_func;
264 } 264 }
265 265
266 ftrace_trace_function = func; 266 ftrace_trace_function = func;
267 } 267 }
268 268
269 static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 269 static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
270 { 270 {
271 ops->next = *list; 271 ops->next = *list;
272 /* 272 /*
273 * We are entering ops into the list but another 273 * We are entering ops into the list but another
274 * CPU might be walking that list. We need to make sure 274 * CPU might be walking that list. We need to make sure
275 * the ops->next pointer is valid before another CPU sees 275 * the ops->next pointer is valid before another CPU sees
276 * the ops pointer included into the list. 276 * the ops pointer included into the list.
277 */ 277 */
278 rcu_assign_pointer(*list, ops); 278 rcu_assign_pointer(*list, ops);
279 } 279 }
280 280
281 static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 281 static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
282 { 282 {
283 struct ftrace_ops **p; 283 struct ftrace_ops **p;
284 284
285 /* 285 /*
286 * If we are removing the last function, then simply point 286 * If we are removing the last function, then simply point
287 * to the ftrace_stub. 287 * to the ftrace_stub.
288 */ 288 */
289 if (*list == ops && ops->next == &ftrace_list_end) { 289 if (*list == ops && ops->next == &ftrace_list_end) {
290 *list = &ftrace_list_end; 290 *list = &ftrace_list_end;
291 return 0; 291 return 0;
292 } 292 }
293 293
294 for (p = list; *p != &ftrace_list_end; p = &(*p)->next) 294 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
295 if (*p == ops) 295 if (*p == ops)
296 break; 296 break;
297 297
298 if (*p != ops) 298 if (*p != ops)
299 return -1; 299 return -1;
300 300
301 *p = (*p)->next; 301 *p = (*p)->next;
302 return 0; 302 return 0;
303 } 303 }
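The search-and-unlink loop above is the indirect-pointer idiom: p walks the link fields themselves (first the list head, then each ->next), so the matching node is spliced out with a single store and no "prev" bookkeeping. A standalone sketch with illustrative names:

        struct node { struct node *next; };

        static int list_unlink(struct node **head, struct node *victim)
        {
                struct node **p;

                for (p = head; *p; p = &(*p)->next) {
                        if (*p == victim) {
                                *p = victim->next;      /* one store splices it out */
                                return 0;
                        }
                }
                return -1;                              /* not found */
        }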
304 304
305 static void add_ftrace_list_ops(struct ftrace_ops **list, 305 static void add_ftrace_list_ops(struct ftrace_ops **list,
306 struct ftrace_ops *main_ops, 306 struct ftrace_ops *main_ops,
307 struct ftrace_ops *ops) 307 struct ftrace_ops *ops)
308 { 308 {
309 int first = *list == &ftrace_list_end; 309 int first = *list == &ftrace_list_end;
310 add_ftrace_ops(list, ops); 310 add_ftrace_ops(list, ops);
311 if (first) 311 if (first)
312 add_ftrace_ops(&ftrace_ops_list, main_ops); 312 add_ftrace_ops(&ftrace_ops_list, main_ops);
313 } 313 }
314 314
315 static int remove_ftrace_list_ops(struct ftrace_ops **list, 315 static int remove_ftrace_list_ops(struct ftrace_ops **list,
316 struct ftrace_ops *main_ops, 316 struct ftrace_ops *main_ops,
317 struct ftrace_ops *ops) 317 struct ftrace_ops *ops)
318 { 318 {
319 int ret = remove_ftrace_ops(list, ops); 319 int ret = remove_ftrace_ops(list, ops);
320 if (!ret && *list == &ftrace_list_end) 320 if (!ret && *list == &ftrace_list_end)
321 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); 321 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
322 return ret; 322 return ret;
323 } 323 }
324 324
325 static int __register_ftrace_function(struct ftrace_ops *ops) 325 static int __register_ftrace_function(struct ftrace_ops *ops)
326 { 326 {
327 if (unlikely(ftrace_disabled)) 327 if (unlikely(ftrace_disabled))
328 return -ENODEV; 328 return -ENODEV;
329 329
330 if (FTRACE_WARN_ON(ops == &global_ops)) 330 if (FTRACE_WARN_ON(ops == &global_ops))
331 return -EINVAL; 331 return -EINVAL;
332 332
333 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 333 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
334 return -EBUSY; 334 return -EBUSY;
335 335
336 /* We don't support both control and global flags set. */ 336 /* We don't support both control and global flags set. */
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL; 338 return -EINVAL;
339 339
340 #ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS 340 #ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
341 /* 341 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
344 * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. 344 * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
345 */ 345 */
346 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && 346 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
347 !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) 347 !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
348 return -EINVAL; 348 return -EINVAL;
349 349
350 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) 350 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
351 ops->flags |= FTRACE_OPS_FL_SAVE_REGS; 351 ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
352 #endif 352 #endif
353 353
354 if (!core_kernel_data((unsigned long)ops)) 354 if (!core_kernel_data((unsigned long)ops))
355 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 355 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
356 356
357 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 357 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
358 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); 358 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
359 ops->flags |= FTRACE_OPS_FL_ENABLED; 359 ops->flags |= FTRACE_OPS_FL_ENABLED;
360 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 360 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
361 if (control_ops_alloc(ops)) 361 if (control_ops_alloc(ops))
362 return -ENOMEM; 362 return -ENOMEM;
363 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 363 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
364 } else 364 } else
365 add_ftrace_ops(&ftrace_ops_list, ops); 365 add_ftrace_ops(&ftrace_ops_list, ops);
366 366
367 if (ftrace_enabled) 367 if (ftrace_enabled)
368 update_ftrace_function(); 368 update_ftrace_function();
369 369
370 return 0; 370 return 0;
371 } 371 }
372 372
373 static int __unregister_ftrace_function(struct ftrace_ops *ops) 373 static int __unregister_ftrace_function(struct ftrace_ops *ops)
374 { 374 {
375 int ret; 375 int ret;
376 376
377 if (ftrace_disabled) 377 if (ftrace_disabled)
378 return -ENODEV; 378 return -ENODEV;
379 379
380 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 380 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
381 return -EBUSY; 381 return -EBUSY;
382 382
383 if (FTRACE_WARN_ON(ops == &global_ops)) 383 if (FTRACE_WARN_ON(ops == &global_ops))
384 return -EINVAL; 384 return -EINVAL;
385 385
386 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 386 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
387 ret = remove_ftrace_list_ops(&ftrace_global_list, 387 ret = remove_ftrace_list_ops(&ftrace_global_list,
388 &global_ops, ops); 388 &global_ops, ops);
389 if (!ret) 389 if (!ret)
390 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 390 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
391 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 391 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
392 ret = remove_ftrace_list_ops(&ftrace_control_list, 392 ret = remove_ftrace_list_ops(&ftrace_control_list,
393 &control_ops, ops); 393 &control_ops, ops);
394 if (!ret) { 394 if (!ret) {
395 /* 395 /*
396 * The ftrace_ops is now removed from the list, 396 * The ftrace_ops is now removed from the list,
397 * so there'll be no new users. We must ensure 397 * so there'll be no new users. We must ensure
398 * all current users are done before we free 398 * all current users are done before we free
399 * the control data. 399 * the control data.
400 */ 400 */
401 synchronize_sched(); 401 synchronize_sched();
402 control_ops_free(ops); 402 control_ops_free(ops);
403 } 403 }
404 } else 404 } else
405 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 405 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
406 406
407 if (ret < 0) 407 if (ret < 0)
408 return ret; 408 return ret;
409 409
410 if (ftrace_enabled) 410 if (ftrace_enabled)
411 update_ftrace_function(); 411 update_ftrace_function();
412 412
413 /* 413 /*
414 * Dynamic ops may be freed; we must make sure that all 414 * Dynamic ops may be freed; we must make sure that all
415 * callers are done before leaving this function. 415 * callers are done before leaving this function.
416 */ 416 */
417 if (ops->flags & FTRACE_OPS_FL_DYNAMIC) 417 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
418 synchronize_sched(); 418 synchronize_sched();
419 419
420 return 0; 420 return 0;
421 } 421 }
422 422
423 static void ftrace_update_pid_func(void) 423 static void ftrace_update_pid_func(void)
424 { 424 {
425 /* Only do something if we are tracing something */ 425 /* Only do something if we are tracing something */
426 if (ftrace_trace_function == ftrace_stub) 426 if (ftrace_trace_function == ftrace_stub)
427 return; 427 return;
428 428
429 update_ftrace_function(); 429 update_ftrace_function();
430 } 430 }
431 431
432 #ifdef CONFIG_FUNCTION_PROFILER 432 #ifdef CONFIG_FUNCTION_PROFILER
433 struct ftrace_profile { 433 struct ftrace_profile {
434 struct hlist_node node; 434 struct hlist_node node;
435 unsigned long ip; 435 unsigned long ip;
436 unsigned long counter; 436 unsigned long counter;
437 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 437 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
438 unsigned long long time; 438 unsigned long long time;
439 unsigned long long time_squared; 439 unsigned long long time_squared;
440 #endif 440 #endif
441 }; 441 };
442 442
443 struct ftrace_profile_page { 443 struct ftrace_profile_page {
444 struct ftrace_profile_page *next; 444 struct ftrace_profile_page *next;
445 unsigned long index; 445 unsigned long index;
446 struct ftrace_profile records[]; 446 struct ftrace_profile records[];
447 }; 447 };
448 448
449 struct ftrace_profile_stat { 449 struct ftrace_profile_stat {
450 atomic_t disabled; 450 atomic_t disabled;
451 struct hlist_head *hash; 451 struct hlist_head *hash;
452 struct ftrace_profile_page *pages; 452 struct ftrace_profile_page *pages;
453 struct ftrace_profile_page *start; 453 struct ftrace_profile_page *start;
454 struct tracer_stat stat; 454 struct tracer_stat stat;
455 }; 455 };
456 456
457 #define PROFILE_RECORDS_SIZE \ 457 #define PROFILE_RECORDS_SIZE \
458 (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) 458 (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
459 459
460 #define PROFILES_PER_PAGE \ 460 #define PROFILES_PER_PAGE \
461 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) 461 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
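To make the sizing concrete, assuming 4 KiB pages on a 64-bit build with CONFIG_FUNCTION_GRAPH_TRACER (struct ftrace_profile is then 48 bytes: a 16-byte hlist_node plus four 8-byte fields, and the page header, next plus index, is 16 bytes):

        PROFILE_RECORDS_SIZE = 4096 - 16 = 4080        /* bytes left for records[]  */
        PROFILES_PER_PAGE    = 4080 / 48 =   85        /* whole records in one page */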
462 462
463 static int ftrace_profile_bits __read_mostly; 463 static int ftrace_profile_bits __read_mostly;
464 static int ftrace_profile_enabled __read_mostly; 464 static int ftrace_profile_enabled __read_mostly;
465 465
466 /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ 466 /* ftrace_profile_lock - synchronize the enable and disable of the profiler */
467 static DEFINE_MUTEX(ftrace_profile_lock); 467 static DEFINE_MUTEX(ftrace_profile_lock);
468 468
469 static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); 469 static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
470 470
471 #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ 471 #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
472 472
473 static void * 473 static void *
474 function_stat_next(void *v, int idx) 474 function_stat_next(void *v, int idx)
475 { 475 {
476 struct ftrace_profile *rec = v; 476 struct ftrace_profile *rec = v;
477 struct ftrace_profile_page *pg; 477 struct ftrace_profile_page *pg;
478 478
479 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 479 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
480 480
481 again: 481 again:
482 if (idx != 0) 482 if (idx != 0)
483 rec++; 483 rec++;
484 484
485 if ((void *)rec >= (void *)&pg->records[pg->index]) { 485 if ((void *)rec >= (void *)&pg->records[pg->index]) {
486 pg = pg->next; 486 pg = pg->next;
487 if (!pg) 487 if (!pg)
488 return NULL; 488 return NULL;
489 rec = &pg->records[0]; 489 rec = &pg->records[0];
490 if (!rec->counter) 490 if (!rec->counter)
491 goto again; 491 goto again;
492 } 492 }
493 493
494 return rec; 494 return rec;
495 } 495 }
496 496
497 static void *function_stat_start(struct tracer_stat *trace) 497 static void *function_stat_start(struct tracer_stat *trace)
498 { 498 {
499 struct ftrace_profile_stat *stat = 499 struct ftrace_profile_stat *stat =
500 container_of(trace, struct ftrace_profile_stat, stat); 500 container_of(trace, struct ftrace_profile_stat, stat);
501 501
502 if (!stat || !stat->start) 502 if (!stat || !stat->start)
503 return NULL; 503 return NULL;
504 504
505 return function_stat_next(&stat->start->records[0], 0); 505 return function_stat_next(&stat->start->records[0], 0);
506 } 506 }
507 507
508 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 508 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
509 /* function graph compares on total time */ 509 /* function graph compares on total time */
510 static int function_stat_cmp(void *p1, void *p2) 510 static int function_stat_cmp(void *p1, void *p2)
511 { 511 {
512 struct ftrace_profile *a = p1; 512 struct ftrace_profile *a = p1;
513 struct ftrace_profile *b = p2; 513 struct ftrace_profile *b = p2;
514 514
515 if (a->time < b->time) 515 if (a->time < b->time)
516 return -1; 516 return -1;
517 if (a->time > b->time) 517 if (a->time > b->time)
518 return 1; 518 return 1;
519 else 519 else
520 return 0; 520 return 0;
521 } 521 }
522 #else 522 #else
523 /* without function graph, compare against hits */ 523 /* without function graph, compare against hits */
524 static int function_stat_cmp(void *p1, void *p2) 524 static int function_stat_cmp(void *p1, void *p2)
525 { 525 {
526 struct ftrace_profile *a = p1; 526 struct ftrace_profile *a = p1;
527 struct ftrace_profile *b = p2; 527 struct ftrace_profile *b = p2;
528 528
529 if (a->counter < b->counter) 529 if (a->counter < b->counter)
530 return -1; 530 return -1;
531 if (a->counter > b->counter) 531 if (a->counter > b->counter)
532 return 1; 532 return 1;
533 else 533 else
534 return 0; 534 return 0;
535 } 535 }
536 #endif 536 #endif
537 537
538 static int function_stat_headers(struct seq_file *m) 538 static int function_stat_headers(struct seq_file *m)
539 { 539 {
540 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 540 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
541 seq_printf(m, " Function " 541 seq_printf(m, " Function "
542 "Hit Time Avg s^2\n" 542 "Hit Time Avg s^2\n"
543 " -------- " 543 " -------- "
544 "--- ---- --- ---\n"); 544 "--- ---- --- ---\n");
545 #else 545 #else
546 seq_printf(m, " Function Hit\n" 546 seq_printf(m, " Function Hit\n"
547 " -------- ---\n"); 547 " -------- ---\n");
548 #endif 548 #endif
549 return 0; 549 return 0;
550 } 550 }
551 551
552 static int function_stat_show(struct seq_file *m, void *v) 552 static int function_stat_show(struct seq_file *m, void *v)
553 { 553 {
554 struct ftrace_profile *rec = v; 554 struct ftrace_profile *rec = v;
555 char str[KSYM_SYMBOL_LEN]; 555 char str[KSYM_SYMBOL_LEN];
556 int ret = 0; 556 int ret = 0;
557 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 557 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
558 static struct trace_seq s; 558 static struct trace_seq s;
559 unsigned long long avg; 559 unsigned long long avg;
560 unsigned long long stddev; 560 unsigned long long stddev;
561 #endif 561 #endif
562 mutex_lock(&ftrace_profile_lock); 562 mutex_lock(&ftrace_profile_lock);
563 563
564 /* we raced with function_profile_reset() */ 564 /* we raced with function_profile_reset() */
565 if (unlikely(rec->counter == 0)) { 565 if (unlikely(rec->counter == 0)) {
566 ret = -EBUSY; 566 ret = -EBUSY;
567 goto out; 567 goto out;
568 } 568 }
569 569
570 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 570 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
571 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 571 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
572 572
573 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 573 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
574 seq_printf(m, " "); 574 seq_printf(m, " ");
575 avg = rec->time; 575 avg = rec->time;
576 do_div(avg, rec->counter); 576 do_div(avg, rec->counter);
577 577
578 /* Sample variance (s^2) */ 578 /* Sample variance (s^2) */
579 if (rec->counter <= 1) 579 if (rec->counter <= 1)
580 stddev = 0; 580 stddev = 0;
581 else { 581 else {
582 stddev = rec->time_squared - rec->counter * avg * avg; 582 stddev = rec->time_squared - rec->counter * avg * avg;
583 /* 583 /*
584 * Divide by only 1000 here for the ns^2 -> us^2 conversion; 584 * Divide by only 1000 here for the ns^2 -> us^2 conversion;
585 * trace_print_graph_duration will divide by 1000 again. 585 * trace_print_graph_duration will divide by 1000 again.
586 */ 586 */
587 do_div(stddev, (rec->counter - 1) * 1000); 587 do_div(stddev, (rec->counter - 1) * 1000);
588 } 588 }
589 589
590 trace_seq_init(&s); 590 trace_seq_init(&s);
591 trace_print_graph_duration(rec->time, &s); 591 trace_print_graph_duration(rec->time, &s);
592 trace_seq_puts(&s, " "); 592 trace_seq_puts(&s, " ");
593 trace_print_graph_duration(avg, &s); 593 trace_print_graph_duration(avg, &s);
594 trace_seq_puts(&s, " "); 594 trace_seq_puts(&s, " ");
595 trace_print_graph_duration(stddev, &s); 595 trace_print_graph_duration(stddev, &s);
596 trace_print_seq(m, &s); 596 trace_print_seq(m, &s);
597 #endif 597 #endif
598 seq_putc(m, '\n'); 598 seq_putc(m, '\n');
599 out: 599 out:
600 mutex_unlock(&ftrace_profile_lock); 600 mutex_unlock(&ftrace_profile_lock);
601 601
602 return ret; 602 return ret;
603 } 603 }
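For reference, the "stddev" computed above is the single-pass sample variance (up to the integer truncation of avg):

        s^2 = (sum(t_i^2) - n * avg^2) / (n - 1)       /* in ns^2 */

The extra factor of 1000 in the do_div() pre-scales the result so that trace_print_graph_duration()'s own division by 1000 completes the ns^2 -> us^2 conversion.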
604 604
605 static void ftrace_profile_reset(struct ftrace_profile_stat *stat) 605 static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
606 { 606 {
607 struct ftrace_profile_page *pg; 607 struct ftrace_profile_page *pg;
608 608
609 pg = stat->pages = stat->start; 609 pg = stat->pages = stat->start;
610 610
611 while (pg) { 611 while (pg) {
612 memset(pg->records, 0, PROFILE_RECORDS_SIZE); 612 memset(pg->records, 0, PROFILE_RECORDS_SIZE);
613 pg->index = 0; 613 pg->index = 0;
614 pg = pg->next; 614 pg = pg->next;
615 } 615 }
616 616
617 memset(stat->hash, 0, 617 memset(stat->hash, 0,
618 FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); 618 FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
619 } 619 }
620 620
621 int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) 621 int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
622 { 622 {
623 struct ftrace_profile_page *pg; 623 struct ftrace_profile_page *pg;
624 int functions; 624 int functions;
625 int pages; 625 int pages;
626 int i; 626 int i;
627 627
628 /* If we already allocated, do nothing */ 628 /* If we already allocated, do nothing */
629 if (stat->pages) 629 if (stat->pages)
630 return 0; 630 return 0;
631 631
632 stat->pages = (void *)get_zeroed_page(GFP_KERNEL); 632 stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
633 if (!stat->pages) 633 if (!stat->pages)
634 return -ENOMEM; 634 return -ENOMEM;
635 635
636 #ifdef CONFIG_DYNAMIC_FTRACE 636 #ifdef CONFIG_DYNAMIC_FTRACE
637 functions = ftrace_update_tot_cnt; 637 functions = ftrace_update_tot_cnt;
638 #else 638 #else
639 /* 639 /*
640 * We do not know the number of functions that exist because 640 * We do not know the number of functions that exist because
641 * dynamic tracing is what counts them. From past experience 641 * dynamic tracing is what counts them. From past experience
642 * we see around 20K functions. That should be more than enough. 642 * we see around 20K functions. That should be more than enough.
643 * It is highly unlikely we will execute every function in 643 * It is highly unlikely we will execute every function in
644 * the kernel. 644 * the kernel.
645 */ 645 */
646 functions = 20000; 646 functions = 20000;
647 #endif 647 #endif
648 648
649 pg = stat->start = stat->pages; 649 pg = stat->start = stat->pages;
650 650
651 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); 651 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
652 652
653 for (i = 0; i < pages; i++) { 653 for (i = 0; i < pages; i++) {
654 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 654 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
655 if (!pg->next) 655 if (!pg->next)
656 goto out_free; 656 goto out_free;
657 pg = pg->next; 657 pg = pg->next;
658 } 658 }
659 659
660 return 0; 660 return 0;
661 661
662 out_free: 662 out_free:
663 pg = stat->start; 663 pg = stat->start;
664 while (pg) { 664 while (pg) {
665 unsigned long tmp = (unsigned long)pg; 665 unsigned long tmp = (unsigned long)pg;
666 666
667 pg = pg->next; 667 pg = pg->next;
668 free_page(tmp); 668 free_page(tmp);
669 } 669 }
670 670
671 free_page((unsigned long)stat->pages); 671 free_page((unsigned long)stat->pages);
672 stat->pages = NULL; 672 stat->pages = NULL;
673 stat->start = NULL; 673 stat->start = NULL;
674 674
675 return -ENOMEM; 675 return -ENOMEM;
676 } 676 }
677 677
678 static int ftrace_profile_init_cpu(int cpu) 678 static int ftrace_profile_init_cpu(int cpu)
679 { 679 {
680 struct ftrace_profile_stat *stat; 680 struct ftrace_profile_stat *stat;
681 int size; 681 int size;
682 682
683 stat = &per_cpu(ftrace_profile_stats, cpu); 683 stat = &per_cpu(ftrace_profile_stats, cpu);
684 684
685 if (stat->hash) { 685 if (stat->hash) {
686 /* If the profile is already created, simply reset it */ 686 /* If the profile is already created, simply reset it */
687 ftrace_profile_reset(stat); 687 ftrace_profile_reset(stat);
688 return 0; 688 return 0;
689 } 689 }
690 690
691 /* 691 /*
692 * We are profiling all functions, but usually only a few thousand 692 * We are profiling all functions, but usually only a few thousand
693 * functions are hit. We'll make a hash of 1024 items. 693 * functions are hit. We'll make a hash of 1024 items.
694 */ 694 */
695 size = FTRACE_PROFILE_HASH_SIZE; 695 size = FTRACE_PROFILE_HASH_SIZE;
696 696
697 stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); 697 stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
698 698
699 if (!stat->hash) 699 if (!stat->hash)
700 return -ENOMEM; 700 return -ENOMEM;
701 701
702 if (!ftrace_profile_bits) { 702 if (!ftrace_profile_bits) {
703 size--; 703 size--;
704 704
705 for (; size; size >>= 1) 705 for (; size; size >>= 1)
706 ftrace_profile_bits++; 706 ftrace_profile_bits++;
707 } 707 }
708 708
709 /* Preallocate the function profiling pages */ 709 /* Preallocate the function profiling pages */
710 if (ftrace_profile_pages_init(stat) < 0) { 710 if (ftrace_profile_pages_init(stat) < 0) {
711 kfree(stat->hash); 711 kfree(stat->hash);
712 stat->hash = NULL; 712 stat->hash = NULL;
713 return -ENOMEM; 713 return -ENOMEM;
714 } 714 }
715 715
716 return 0; 716 return 0;
717 } 717 }
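The decrement-and-shift loop above is just a hand-rolled ilog2() of the power-of-two table size. With FTRACE_PROFILE_HASH_SIZE = 1024:

        size = 1024; size--;    /* 1023 == 0b1111111111                      */
                                /* ten non-zero shifts: 1023, 511, ..., 3, 1 */
                                /* so ftrace_profile_bits ends up as 10      */

hash_long(ip, 10) then folds an instruction pointer into one of the 2^10 buckets.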
718 718
719 static int ftrace_profile_init(void) 719 static int ftrace_profile_init(void)
720 { 720 {
721 int cpu; 721 int cpu;
722 int ret = 0; 722 int ret = 0;
723 723
724 for_each_online_cpu(cpu) { 724 for_each_online_cpu(cpu) {
725 ret = ftrace_profile_init_cpu(cpu); 725 ret = ftrace_profile_init_cpu(cpu);
726 if (ret) 726 if (ret)
727 break; 727 break;
728 } 728 }
729 729
730 return ret; 730 return ret;
731 } 731 }
732 732
733 /* interrupts must be disabled */ 733 /* interrupts must be disabled */
734 static struct ftrace_profile * 734 static struct ftrace_profile *
735 ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) 735 ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
736 { 736 {
737 struct ftrace_profile *rec; 737 struct ftrace_profile *rec;
738 struct hlist_head *hhd; 738 struct hlist_head *hhd;
739 struct hlist_node *n; 739 struct hlist_node *n;
740 unsigned long key; 740 unsigned long key;
741 741
742 key = hash_long(ip, ftrace_profile_bits); 742 key = hash_long(ip, ftrace_profile_bits);
743 hhd = &stat->hash[key]; 743 hhd = &stat->hash[key];
744 744
745 if (hlist_empty(hhd)) 745 if (hlist_empty(hhd))
746 return NULL; 746 return NULL;
747 747
748 hlist_for_each_entry_rcu(rec, n, hhd, node) { 748 hlist_for_each_entry_rcu(rec, n, hhd, node) {
749 if (rec->ip == ip) 749 if (rec->ip == ip)
750 return rec; 750 return rec;
751 } 751 }
752 752
753 return NULL; 753 return NULL;
754 } 754 }
755 755
756 static void ftrace_add_profile(struct ftrace_profile_stat *stat, 756 static void ftrace_add_profile(struct ftrace_profile_stat *stat,
757 struct ftrace_profile *rec) 757 struct ftrace_profile *rec)
758 { 758 {
759 unsigned long key; 759 unsigned long key;
760 760
761 key = hash_long(rec->ip, ftrace_profile_bits); 761 key = hash_long(rec->ip, ftrace_profile_bits);
762 hlist_add_head_rcu(&rec->node, &stat->hash[key]); 762 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
763 } 763 }
764 764
765 /* 765 /*
766 * The memory is already allocated; this simply finds a new record to use. 766 * The memory is already allocated; this simply finds a new record to use.
767 */ 767 */
768 static struct ftrace_profile * 768 static struct ftrace_profile *
769 ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) 769 ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
770 { 770 {
771 struct ftrace_profile *rec = NULL; 771 struct ftrace_profile *rec = NULL;
772 772
773 /* prevent recursion (from NMIs) */ 773 /* prevent recursion (from NMIs) */
774 if (atomic_inc_return(&stat->disabled) != 1) 774 if (atomic_inc_return(&stat->disabled) != 1)
775 goto out; 775 goto out;
776 776
777 /* 777 /*
778 * Try to find the function again since an NMI 778 * Try to find the function again since an NMI
779 * could have added it 779 * could have added it
780 */ 780 */
781 rec = ftrace_find_profiled_func(stat, ip); 781 rec = ftrace_find_profiled_func(stat, ip);
782 if (rec) 782 if (rec)
783 goto out; 783 goto out;
784 784
785 if (stat->pages->index == PROFILES_PER_PAGE) { 785 if (stat->pages->index == PROFILES_PER_PAGE) {
786 if (!stat->pages->next) 786 if (!stat->pages->next)
787 goto out; 787 goto out;
788 stat->pages = stat->pages->next; 788 stat->pages = stat->pages->next;
789 } 789 }
790 790
791 rec = &stat->pages->records[stat->pages->index++]; 791 rec = &stat->pages->records[stat->pages->index++];
792 rec->ip = ip; 792 rec->ip = ip;
793 ftrace_add_profile(stat, rec); 793 ftrace_add_profile(stat, rec);
794 794
795 out: 795 out:
796 atomic_dec(&stat->disabled); 796 atomic_dec(&stat->disabled);
797 797
798 return rec; 798 return rec;
799 } 799 }
800 800
801 static void 801 static void
802 function_profile_call(unsigned long ip, unsigned long parent_ip, 802 function_profile_call(unsigned long ip, unsigned long parent_ip,
803 struct ftrace_ops *ops, struct pt_regs *regs) 803 struct ftrace_ops *ops, struct pt_regs *regs)
804 { 804 {
805 struct ftrace_profile_stat *stat; 805 struct ftrace_profile_stat *stat;
806 struct ftrace_profile *rec; 806 struct ftrace_profile *rec;
807 unsigned long flags; 807 unsigned long flags;
808 808
809 if (!ftrace_profile_enabled) 809 if (!ftrace_profile_enabled)
810 return; 810 return;
811 811
812 local_irq_save(flags); 812 local_irq_save(flags);
813 813
814 stat = &__get_cpu_var(ftrace_profile_stats); 814 stat = &__get_cpu_var(ftrace_profile_stats);
815 if (!stat->hash || !ftrace_profile_enabled) 815 if (!stat->hash || !ftrace_profile_enabled)
816 goto out; 816 goto out;
817 817
818 rec = ftrace_find_profiled_func(stat, ip); 818 rec = ftrace_find_profiled_func(stat, ip);
819 if (!rec) { 819 if (!rec) {
820 rec = ftrace_profile_alloc(stat, ip); 820 rec = ftrace_profile_alloc(stat, ip);
821 if (!rec) 821 if (!rec)
822 goto out; 822 goto out;
823 } 823 }
824 824
825 rec->counter++; 825 rec->counter++;
826 out: 826 out:
827 local_irq_restore(flags); 827 local_irq_restore(flags);
828 } 828 }
829 829
830 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 830 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
831 static int profile_graph_entry(struct ftrace_graph_ent *trace) 831 static int profile_graph_entry(struct ftrace_graph_ent *trace)
832 { 832 {
833 function_profile_call(trace->func, 0, NULL, NULL); 833 function_profile_call(trace->func, 0, NULL, NULL);
834 return 1; 834 return 1;
835 } 835 }
836 836
837 static void profile_graph_return(struct ftrace_graph_ret *trace) 837 static void profile_graph_return(struct ftrace_graph_ret *trace)
838 { 838 {
839 struct ftrace_profile_stat *stat; 839 struct ftrace_profile_stat *stat;
840 unsigned long long calltime; 840 unsigned long long calltime;
841 struct ftrace_profile *rec; 841 struct ftrace_profile *rec;
842 unsigned long flags; 842 unsigned long flags;
843 843
844 local_irq_save(flags); 844 local_irq_save(flags);
845 stat = &__get_cpu_var(ftrace_profile_stats); 845 stat = &__get_cpu_var(ftrace_profile_stats);
846 if (!stat->hash || !ftrace_profile_enabled) 846 if (!stat->hash || !ftrace_profile_enabled)
847 goto out; 847 goto out;
848 848
849 /* If the calltime was zero'd, ignore it */ 849 /* If the calltime was zero'd, ignore it */
850 if (!trace->calltime) 850 if (!trace->calltime)
851 goto out; 851 goto out;
852 852
853 calltime = trace->rettime - trace->calltime; 853 calltime = trace->rettime - trace->calltime;
854 854
855 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 855 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
856 int index; 856 int index;
857 857
858 index = trace->depth; 858 index = trace->depth;
859 859
860 /* Append this call time to the parent time to subtract */ 860 /* Append this call time to the parent time to subtract */
861 if (index) 861 if (index)
862 current->ret_stack[index - 1].subtime += calltime; 862 current->ret_stack[index - 1].subtime += calltime;
863 863
864 if (current->ret_stack[index].subtime < calltime) 864 if (current->ret_stack[index].subtime < calltime)
865 calltime -= current->ret_stack[index].subtime; 865 calltime -= current->ret_stack[index].subtime;
866 else 866 else
867 calltime = 0; 867 calltime = 0;
868 } 868 }
869 869
870 rec = ftrace_find_profiled_func(stat, trace->func); 870 rec = ftrace_find_profiled_func(stat, trace->func);
871 if (rec) { 871 if (rec) {
872 rec->time += calltime; 872 rec->time += calltime;
873 rec->time_squared += calltime * calltime; 873 rec->time_squared += calltime * calltime;
874 } 874 }
875 875
876 out: 876 out:
877 local_irq_restore(flags); 877 local_irq_restore(flags);
878 } 878 }
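A worked example of the subtime bookkeeping above, with made-up numbers (graph time disabled, i.e. TRACE_ITER_GRAPH_TIME clear):

        /* P calls C:
         *   C returns: rettime - calltime = 4000ns
         *              ret_stack[depth of P].subtime += 4000ns
         *   P returns: rettime - calltime = 10000ns
         *              10000ns - 4000ns subtime = 6000ns of self time for P
         */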
879 879
880 static int register_ftrace_profiler(void) 880 static int register_ftrace_profiler(void)
881 { 881 {
882 return register_ftrace_graph(&profile_graph_return, 882 return register_ftrace_graph(&profile_graph_return,
883 &profile_graph_entry); 883 &profile_graph_entry);
884 } 884 }
885 885
886 static void unregister_ftrace_profiler(void) 886 static void unregister_ftrace_profiler(void)
887 { 887 {
888 unregister_ftrace_graph(); 888 unregister_ftrace_graph();
889 } 889 }
890 #else 890 #else
891 static struct ftrace_ops ftrace_profile_ops __read_mostly = { 891 static struct ftrace_ops ftrace_profile_ops __read_mostly = {
892 .func = function_profile_call, 892 .func = function_profile_call,
893 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 893 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
894 }; 894 };
895 895
896 static int register_ftrace_profiler(void) 896 static int register_ftrace_profiler(void)
897 { 897 {
898 return register_ftrace_function(&ftrace_profile_ops); 898 return register_ftrace_function(&ftrace_profile_ops);
899 } 899 }
900 900
901 static void unregister_ftrace_profiler(void) 901 static void unregister_ftrace_profiler(void)
902 { 902 {
903 unregister_ftrace_function(&ftrace_profile_ops); 903 unregister_ftrace_function(&ftrace_profile_ops);
904 } 904 }
905 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 905 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
906 906
907 static ssize_t 907 static ssize_t
908 ftrace_profile_write(struct file *filp, const char __user *ubuf, 908 ftrace_profile_write(struct file *filp, const char __user *ubuf,
909 size_t cnt, loff_t *ppos) 909 size_t cnt, loff_t *ppos)
910 { 910 {
911 unsigned long val; 911 unsigned long val;
912 int ret; 912 int ret;
913 913
914 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 914 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
915 if (ret) 915 if (ret)
916 return ret; 916 return ret;
917 917
918 val = !!val; 918 val = !!val;
919 919
920 mutex_lock(&ftrace_profile_lock); 920 mutex_lock(&ftrace_profile_lock);
921 if (ftrace_profile_enabled ^ val) { 921 if (ftrace_profile_enabled ^ val) {
922 if (val) { 922 if (val) {
923 ret = ftrace_profile_init(); 923 ret = ftrace_profile_init();
924 if (ret < 0) { 924 if (ret < 0) {
925 cnt = ret; 925 cnt = ret;
926 goto out; 926 goto out;
927 } 927 }
928 928
929 ret = register_ftrace_profiler(); 929 ret = register_ftrace_profiler();
930 if (ret < 0) { 930 if (ret < 0) {
931 cnt = ret; 931 cnt = ret;
932 goto out; 932 goto out;
933 } 933 }
934 ftrace_profile_enabled = 1; 934 ftrace_profile_enabled = 1;
935 } else { 935 } else {
936 ftrace_profile_enabled = 0; 936 ftrace_profile_enabled = 0;
937 /* 937 /*
938 * unregister_ftrace_profiler calls stop_machine 938 * unregister_ftrace_profiler calls stop_machine
939 * so this acts like a synchronize_sched(). 939 * so this acts like a synchronize_sched().
940 */ 940 */
941 unregister_ftrace_profiler(); 941 unregister_ftrace_profiler();
942 } 942 }
943 } 943 }
944 out: 944 out:
945 mutex_unlock(&ftrace_profile_lock); 945 mutex_unlock(&ftrace_profile_lock);
946 946
947 *ppos += cnt; 947 *ppos += cnt;
948 948
949 return cnt; 949 return cnt;
950 } 950 }
951 951
952 static ssize_t 952 static ssize_t
953 ftrace_profile_read(struct file *filp, char __user *ubuf, 953 ftrace_profile_read(struct file *filp, char __user *ubuf,
954 size_t cnt, loff_t *ppos) 954 size_t cnt, loff_t *ppos)
955 { 955 {
956 char buf[64]; /* big enough to hold a number */ 956 char buf[64]; /* big enough to hold a number */
957 int r; 957 int r;
958 958
959 r = sprintf(buf, "%u\n", ftrace_profile_enabled); 959 r = sprintf(buf, "%u\n", ftrace_profile_enabled);
960 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 960 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
961 } 961 }
962 962
963 static const struct file_operations ftrace_profile_fops = { 963 static const struct file_operations ftrace_profile_fops = {
964 .open = tracing_open_generic, 964 .open = tracing_open_generic,
965 .read = ftrace_profile_read, 965 .read = ftrace_profile_read,
966 .write = ftrace_profile_write, 966 .write = ftrace_profile_write,
967 .llseek = default_llseek, 967 .llseek = default_llseek,
968 }; 968 };
969 969
970 /* used to initialize the real stat files */ 970 /* used to initialize the real stat files */
971 static struct tracer_stat function_stats __initdata = { 971 static struct tracer_stat function_stats __initdata = {
972 .name = "functions", 972 .name = "functions",
973 .stat_start = function_stat_start, 973 .stat_start = function_stat_start,
974 .stat_next = function_stat_next, 974 .stat_next = function_stat_next,
975 .stat_cmp = function_stat_cmp, 975 .stat_cmp = function_stat_cmp,
976 .stat_headers = function_stat_headers, 976 .stat_headers = function_stat_headers,
977 .stat_show = function_stat_show 977 .stat_show = function_stat_show
978 }; 978 };
979 979
980 static __init void ftrace_profile_debugfs(struct dentry *d_tracer) 980 static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
981 { 981 {
982 struct ftrace_profile_stat *stat; 982 struct ftrace_profile_stat *stat;
983 struct dentry *entry; 983 struct dentry *entry;
984 char *name; 984 char *name;
985 int ret; 985 int ret;
986 int cpu; 986 int cpu;
987 987
988 for_each_possible_cpu(cpu) { 988 for_each_possible_cpu(cpu) {
989 stat = &per_cpu(ftrace_profile_stats, cpu); 989 stat = &per_cpu(ftrace_profile_stats, cpu);
990 990
991 /* allocate enough for function name + cpu number */ 991 /* allocate enough for function name + cpu number */
992 name = kmalloc(32, GFP_KERNEL); 992 name = kmalloc(32, GFP_KERNEL);
993 if (!name) { 993 if (!name) {
994 /* 994 /*
995 * The files created are permanent; if something goes wrong 995 * The files created are permanent; if something goes wrong
996 * here, we still do not free the memory. 996 * here, we still do not free the memory.
997 */ 997 */
998 WARN(1, 998 WARN(1,
999 "Could not allocate stat file for cpu %d\n", 999 "Could not allocate stat file for cpu %d\n",
1000 cpu); 1000 cpu);
1001 return; 1001 return;
1002 } 1002 }
1003 stat->stat = function_stats; 1003 stat->stat = function_stats;
1004 snprintf(name, 32, "function%d", cpu); 1004 snprintf(name, 32, "function%d", cpu);
1005 stat->stat.name = name; 1005 stat->stat.name = name;
1006 ret = register_stat_tracer(&stat->stat); 1006 ret = register_stat_tracer(&stat->stat);
1007 if (ret) { 1007 if (ret) {
1008 WARN(1, 1008 WARN(1,
1009 "Could not register function stat for cpu %d\n", 1009 "Could not register function stat for cpu %d\n",
1010 cpu); 1010 cpu);
1011 kfree(name); 1011 kfree(name);
1012 return; 1012 return;
1013 } 1013 }
1014 } 1014 }
1015 1015
1016 entry = debugfs_create_file("function_profile_enabled", 0644, 1016 entry = debugfs_create_file("function_profile_enabled", 0644,
1017 d_tracer, NULL, &ftrace_profile_fops); 1017 d_tracer, NULL, &ftrace_profile_fops);
1018 if (!entry) 1018 if (!entry)
1019 pr_warning("Could not create debugfs " 1019 pr_warning("Could not create debugfs "
1020 "'function_profile_enabled' entry\n"); 1020 "'function_profile_enabled' entry\n");
1021 } 1021 }
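Putting the pieces together, a typical session (assuming debugfs is mounted at the usual /sys/kernel/debug) looks like:

        # echo 1 > /sys/kernel/debug/tracing/function_profile_enabled
        # cat /sys/kernel/debug/tracing/trace_stat/function0    # per-cpu stats
        # echo 0 > /sys/kernel/debug/tracing/function_profile_enabled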
1022 1022
1023 #else /* CONFIG_FUNCTION_PROFILER */ 1023 #else /* CONFIG_FUNCTION_PROFILER */
1024 static __init void ftrace_profile_debugfs(struct dentry *d_tracer) 1024 static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1025 { 1025 {
1026 } 1026 }
1027 #endif /* CONFIG_FUNCTION_PROFILER */ 1027 #endif /* CONFIG_FUNCTION_PROFILER */
1028 1028
1029 static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1029 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1030 1030
1031 #ifdef CONFIG_DYNAMIC_FTRACE 1031 #ifdef CONFIG_DYNAMIC_FTRACE
1032 1032
1033 #ifndef CONFIG_FTRACE_MCOUNT_RECORD 1033 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
1034 # error Dynamic ftrace depends on MCOUNT_RECORD 1034 # error Dynamic ftrace depends on MCOUNT_RECORD
1035 #endif 1035 #endif
1036 1036
1037 static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; 1037 static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
1038 1038
1039 struct ftrace_func_probe { 1039 struct ftrace_func_probe {
1040 struct hlist_node node; 1040 struct hlist_node node;
1041 struct ftrace_probe_ops *ops; 1041 struct ftrace_probe_ops *ops;
1042 unsigned long flags; 1042 unsigned long flags;
1043 unsigned long ip; 1043 unsigned long ip;
1044 void *data; 1044 void *data;
1045 struct rcu_head rcu; 1045 struct rcu_head rcu;
1046 }; 1046 };
1047 1047
1048 struct ftrace_func_entry { 1048 struct ftrace_func_entry {
1049 struct hlist_node hlist; 1049 struct hlist_node hlist;
1050 unsigned long ip; 1050 unsigned long ip;
1051 }; 1051 };
1052 1052
1053 struct ftrace_hash { 1053 struct ftrace_hash {
1054 unsigned long size_bits; 1054 unsigned long size_bits;
1055 struct hlist_head *buckets; 1055 struct hlist_head *buckets;
1056 unsigned long count; 1056 unsigned long count;
1057 struct rcu_head rcu; 1057 struct rcu_head rcu;
1058 }; 1058 };
1059 1059
1060 /* 1060 /*
1061 * We make these constant because no one should touch them; 1061 * We make these constant because no one should touch them;
1062 * they are used as the default "empty hash" to avoid allocating 1062 * they are used as the default "empty hash" to avoid allocating
1063 * it all the time. They live in a read-only section so that if 1063 * it all the time. They live in a read-only section so that if
1064 * anyone does try to modify them, it will cause an exception. 1064 * anyone does try to modify them, it will cause an exception.
1065 */ 1065 */
1066 static const struct hlist_head empty_buckets[1]; 1066 static const struct hlist_head empty_buckets[1];
1067 static const struct ftrace_hash empty_hash = { 1067 static const struct ftrace_hash empty_hash = {
1068 .buckets = (struct hlist_head *)empty_buckets, 1068 .buckets = (struct hlist_head *)empty_buckets,
1069 }; 1069 };
1070 #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) 1070 #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
1071 1071
1072 static struct ftrace_ops global_ops = { 1072 static struct ftrace_ops global_ops = {
1073 .func = ftrace_stub, 1073 .func = ftrace_stub,
1074 .notrace_hash = EMPTY_HASH, 1074 .notrace_hash = EMPTY_HASH,
1075 .filter_hash = EMPTY_HASH, 1075 .filter_hash = EMPTY_HASH,
1076 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 1076 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
1077 }; 1077 };
1078 1078
1079 static DEFINE_MUTEX(ftrace_regex_lock); 1079 static DEFINE_MUTEX(ftrace_regex_lock);
1080 1080
1081 struct ftrace_page { 1081 struct ftrace_page {
1082 struct ftrace_page *next; 1082 struct ftrace_page *next;
1083 struct dyn_ftrace *records; 1083 struct dyn_ftrace *records;
1084 int index; 1084 int index;
1085 int size; 1085 int size;
1086 }; 1086 };
1087 1087
1088 static struct ftrace_page *ftrace_new_pgs; 1088 static struct ftrace_page *ftrace_new_pgs;
1089 1089
1090 #define ENTRY_SIZE sizeof(struct dyn_ftrace) 1090 #define ENTRY_SIZE sizeof(struct dyn_ftrace)
1091 #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) 1091 #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
1092 1092
1093 /* estimate from running different kernels */ 1093 /* estimate from running different kernels */
1094 #define NR_TO_INIT 10000 1094 #define NR_TO_INIT 10000
1095 1095
1096 static struct ftrace_page *ftrace_pages_start; 1096 static struct ftrace_page *ftrace_pages_start;
1097 static struct ftrace_page *ftrace_pages; 1097 static struct ftrace_page *ftrace_pages;
1098 1098
1099 static bool ftrace_hash_empty(struct ftrace_hash *hash) 1099 static bool ftrace_hash_empty(struct ftrace_hash *hash)
1100 { 1100 {
1101 return !hash || !hash->count; 1101 return !hash || !hash->count;
1102 } 1102 }
1103 1103
1104 static struct ftrace_func_entry * 1104 static struct ftrace_func_entry *
1105 ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) 1105 ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1106 { 1106 {
1107 unsigned long key; 1107 unsigned long key;
1108 struct ftrace_func_entry *entry; 1108 struct ftrace_func_entry *entry;
1109 struct hlist_head *hhd; 1109 struct hlist_head *hhd;
1110 struct hlist_node *n; 1110 struct hlist_node *n;
1111 1111
1112 if (ftrace_hash_empty(hash)) 1112 if (ftrace_hash_empty(hash))
1113 return NULL; 1113 return NULL;
1114 1114
1115 if (hash->size_bits > 0) 1115 if (hash->size_bits > 0)
1116 key = hash_long(ip, hash->size_bits); 1116 key = hash_long(ip, hash->size_bits);
1117 else 1117 else
1118 key = 0; 1118 key = 0;
1119 1119
1120 hhd = &hash->buckets[key]; 1120 hhd = &hash->buckets[key];
1121 1121
1122 hlist_for_each_entry_rcu(entry, n, hhd, hlist) { 1122 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1123 if (entry->ip == ip) 1123 if (entry->ip == ip)
1124 return entry; 1124 return entry;
1125 } 1125 }
1126 return NULL; 1126 return NULL;
1127 } 1127 }
1128 1128
1129 static void __add_hash_entry(struct ftrace_hash *hash, 1129 static void __add_hash_entry(struct ftrace_hash *hash,
1130 struct ftrace_func_entry *entry) 1130 struct ftrace_func_entry *entry)
1131 { 1131 {
1132 struct hlist_head *hhd; 1132 struct hlist_head *hhd;
1133 unsigned long key; 1133 unsigned long key;
1134 1134
1135 if (hash->size_bits) 1135 if (hash->size_bits)
1136 key = hash_long(entry->ip, hash->size_bits); 1136 key = hash_long(entry->ip, hash->size_bits);
1137 else 1137 else
1138 key = 0; 1138 key = 0;
1139 1139
1140 hhd = &hash->buckets[key]; 1140 hhd = &hash->buckets[key];
1141 hlist_add_head(&entry->hlist, hhd); 1141 hlist_add_head(&entry->hlist, hhd);
1142 hash->count++; 1142 hash->count++;
1143 } 1143 }
1144 1144
1145 static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) 1145 static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1146 { 1146 {
1147 struct ftrace_func_entry *entry; 1147 struct ftrace_func_entry *entry;
1148 1148
1149 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 1149 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1150 if (!entry) 1150 if (!entry)
1151 return -ENOMEM; 1151 return -ENOMEM;
1152 1152
1153 entry->ip = ip; 1153 entry->ip = ip;
1154 __add_hash_entry(hash, entry); 1154 __add_hash_entry(hash, entry);
1155 1155
1156 return 0; 1156 return 0;
1157 } 1157 }
1158 1158
1159 static void 1159 static void
1160 free_hash_entry(struct ftrace_hash *hash, 1160 free_hash_entry(struct ftrace_hash *hash,
1161 struct ftrace_func_entry *entry) 1161 struct ftrace_func_entry *entry)
1162 { 1162 {
1163 hlist_del(&entry->hlist); 1163 hlist_del(&entry->hlist);
1164 kfree(entry); 1164 kfree(entry);
1165 hash->count--; 1165 hash->count--;
1166 } 1166 }
1167 1167
1168 static void 1168 static void
1169 remove_hash_entry(struct ftrace_hash *hash, 1169 remove_hash_entry(struct ftrace_hash *hash,
1170 struct ftrace_func_entry *entry) 1170 struct ftrace_func_entry *entry)
1171 { 1171 {
1172 hlist_del(&entry->hlist); 1172 hlist_del(&entry->hlist);
1173 hash->count--; 1173 hash->count--;
1174 } 1174 }
1175 1175
1176 static void ftrace_hash_clear(struct ftrace_hash *hash) 1176 static void ftrace_hash_clear(struct ftrace_hash *hash)
1177 { 1177 {
1178 struct hlist_head *hhd; 1178 struct hlist_head *hhd;
1179 struct hlist_node *tp, *tn; 1179 struct hlist_node *tp, *tn;
1180 struct ftrace_func_entry *entry; 1180 struct ftrace_func_entry *entry;
1181 int size = 1 << hash->size_bits; 1181 int size = 1 << hash->size_bits;
1182 int i; 1182 int i;
1183 1183
1184 if (!hash->count) 1184 if (!hash->count)
1185 return; 1185 return;
1186 1186
1187 for (i = 0; i < size; i++) { 1187 for (i = 0; i < size; i++) {
1188 hhd = &hash->buckets[i]; 1188 hhd = &hash->buckets[i];
1189 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) 1189 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1190 free_hash_entry(hash, entry); 1190 free_hash_entry(hash, entry);
1191 } 1191 }
1192 FTRACE_WARN_ON(hash->count); 1192 FTRACE_WARN_ON(hash->count);
1193 } 1193 }
1194 1194
1195 static void free_ftrace_hash(struct ftrace_hash *hash) 1195 static void free_ftrace_hash(struct ftrace_hash *hash)
1196 { 1196 {
1197 if (!hash || hash == EMPTY_HASH) 1197 if (!hash || hash == EMPTY_HASH)
1198 return; 1198 return;
1199 ftrace_hash_clear(hash); 1199 ftrace_hash_clear(hash);
1200 kfree(hash->buckets); 1200 kfree(hash->buckets);
1201 kfree(hash); 1201 kfree(hash);
1202 } 1202 }
1203 1203
1204 static void __free_ftrace_hash_rcu(struct rcu_head *rcu) 1204 static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1205 { 1205 {
1206 struct ftrace_hash *hash; 1206 struct ftrace_hash *hash;
1207 1207
1208 hash = container_of(rcu, struct ftrace_hash, rcu); 1208 hash = container_of(rcu, struct ftrace_hash, rcu);
1209 free_ftrace_hash(hash); 1209 free_ftrace_hash(hash);
1210 } 1210 }
1211 1211
1212 static void free_ftrace_hash_rcu(struct ftrace_hash *hash) 1212 static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1213 { 1213 {
1214 if (!hash || hash == EMPTY_HASH) 1214 if (!hash || hash == EMPTY_HASH)
1215 return; 1215 return;
1216 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); 1216 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1217 } 1217 }
1218 1218
1219 void ftrace_free_filter(struct ftrace_ops *ops) 1219 void ftrace_free_filter(struct ftrace_ops *ops)
1220 { 1220 {
1221 free_ftrace_hash(ops->filter_hash); 1221 free_ftrace_hash(ops->filter_hash);
1222 free_ftrace_hash(ops->notrace_hash); 1222 free_ftrace_hash(ops->notrace_hash);
1223 } 1223 }
1224 1224
1225 static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1225 static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1226 { 1226 {
1227 struct ftrace_hash *hash; 1227 struct ftrace_hash *hash;
1228 int size; 1228 int size;
1229 1229
1230 hash = kzalloc(sizeof(*hash), GFP_KERNEL); 1230 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1231 if (!hash) 1231 if (!hash)
1232 return NULL; 1232 return NULL;
1233 1233
1234 size = 1 << size_bits; 1234 size = 1 << size_bits;
1235 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); 1235 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
1236 1236
1237 if (!hash->buckets) { 1237 if (!hash->buckets) {
1238 kfree(hash); 1238 kfree(hash);
1239 return NULL; 1239 return NULL;
1240 } 1240 }
1241 1241
1242 hash->size_bits = size_bits; 1242 hash->size_bits = size_bits;
1243 1243
1244 return hash; 1244 return hash;
1245 } 1245 }
1246 1246
1247 static struct ftrace_hash * 1247 static struct ftrace_hash *
1248 alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) 1248 alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1249 { 1249 {
1250 struct ftrace_func_entry *entry; 1250 struct ftrace_func_entry *entry;
1251 struct ftrace_hash *new_hash; 1251 struct ftrace_hash *new_hash;
1252 struct hlist_node *tp; 1252 struct hlist_node *tp;
1253 int size; 1253 int size;
1254 int ret; 1254 int ret;
1255 int i; 1255 int i;
1256 1256
1257 new_hash = alloc_ftrace_hash(size_bits); 1257 new_hash = alloc_ftrace_hash(size_bits);
1258 if (!new_hash) 1258 if (!new_hash)
1259 return NULL; 1259 return NULL;
1260 1260
1261 /* Empty hash? */ 1261 /* Empty hash? */
1262 if (ftrace_hash_empty(hash)) 1262 if (ftrace_hash_empty(hash))
1263 return new_hash; 1263 return new_hash;
1264 1264
1265 size = 1 << hash->size_bits; 1265 size = 1 << hash->size_bits;
1266 for (i = 0; i < size; i++) { 1266 for (i = 0; i < size; i++) {
1267 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { 1267 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1268 ret = add_hash_entry(new_hash, entry->ip); 1268 ret = add_hash_entry(new_hash, entry->ip);
1269 if (ret < 0) 1269 if (ret < 0)
1270 goto free_hash; 1270 goto free_hash;
1271 } 1271 }
1272 } 1272 }
1273 1273
1274 FTRACE_WARN_ON(new_hash->count != hash->count); 1274 FTRACE_WARN_ON(new_hash->count != hash->count);
1275 1275
1276 return new_hash; 1276 return new_hash;
1277 1277
1278 free_hash: 1278 free_hash:
1279 free_ftrace_hash(new_hash); 1279 free_ftrace_hash(new_hash);
1280 return NULL; 1280 return NULL;
1281 } 1281 }
1282 1282
1283 static void 1283 static void
1284 ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); 1284 ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1285 static void 1285 static void
1286 ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); 1286 ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1287 1287
1288 static int 1288 static int
1289 ftrace_hash_move(struct ftrace_ops *ops, int enable, 1289 ftrace_hash_move(struct ftrace_ops *ops, int enable,
1290 struct ftrace_hash **dst, struct ftrace_hash *src) 1290 struct ftrace_hash **dst, struct ftrace_hash *src)
1291 { 1291 {
1292 struct ftrace_func_entry *entry; 1292 struct ftrace_func_entry *entry;
1293 struct hlist_node *tp, *tn; 1293 struct hlist_node *tp, *tn;
1294 struct hlist_head *hhd; 1294 struct hlist_head *hhd;
1295 struct ftrace_hash *old_hash; 1295 struct ftrace_hash *old_hash;
1296 struct ftrace_hash *new_hash; 1296 struct ftrace_hash *new_hash;
1297 unsigned long key; 1297 unsigned long key;
1298 int size = src->count; 1298 int size = src->count;
1299 int bits = 0; 1299 int bits = 0;
1300 int ret; 1300 int ret;
1301 int i; 1301 int i;
1302 1302
1303 /* 1303 /*
1304 * Remove the current set, update the hash and add 1304 * Remove the current set, update the hash and add
1305 * them back. 1305 * them back.
1306 */ 1306 */
1307 ftrace_hash_rec_disable(ops, enable); 1307 ftrace_hash_rec_disable(ops, enable);
1308 1308
1309 /* 1309 /*
1310 * If the new source is empty, just free dst and assign it 1310 * If the new source is empty, just free dst and assign it
1311 * the empty_hash. 1311 * the empty_hash.
1312 */ 1312 */
1313 if (!src->count) { 1313 if (!src->count) {
1314 free_ftrace_hash_rcu(*dst); 1314 free_ftrace_hash_rcu(*dst);
1315 rcu_assign_pointer(*dst, EMPTY_HASH); 1315 rcu_assign_pointer(*dst, EMPTY_HASH);
1316 /* still need to update the function records */ 1316 /* still need to update the function records */
1317 ret = 0; 1317 ret = 0;
1318 goto out; 1318 goto out;
1319 } 1319 }
1320 1320
1321 /* 1321 /*
1322 * Make the hash size about half the number of entries found. 1322 * Make the hash size about half the number of entries found.
1323 */ 1323 */
1324 for (size /= 2; size; size >>= 1) 1324 for (size /= 2; size; size >>= 1)
1325 bits++; 1325 bits++;
1326 1326
1327 /* Don't allocate too much */ 1327 /* Don't allocate too much */
1328 if (bits > FTRACE_HASH_MAX_BITS) 1328 if (bits > FTRACE_HASH_MAX_BITS)
1329 bits = FTRACE_HASH_MAX_BITS; 1329 bits = FTRACE_HASH_MAX_BITS;
1330 1330
1331 ret = -ENOMEM; 1331 ret = -ENOMEM;
1332 new_hash = alloc_ftrace_hash(bits); 1332 new_hash = alloc_ftrace_hash(bits);
1333 if (!new_hash) 1333 if (!new_hash)
1334 goto out; 1334 goto out;
1335 1335
1336 size = 1 << src->size_bits; 1336 size = 1 << src->size_bits;
1337 for (i = 0; i < size; i++) { 1337 for (i = 0; i < size; i++) {
1338 hhd = &src->buckets[i]; 1338 hhd = &src->buckets[i];
1339 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { 1339 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1340 if (bits > 0) 1340 if (bits > 0)
1341 key = hash_long(entry->ip, bits); 1341 key = hash_long(entry->ip, bits);
1342 else 1342 else
1343 key = 0; 1343 key = 0;
1344 remove_hash_entry(src, entry); 1344 remove_hash_entry(src, entry);
1345 __add_hash_entry(new_hash, entry); 1345 __add_hash_entry(new_hash, entry);
1346 } 1346 }
1347 } 1347 }
1348 1348
1349 old_hash = *dst; 1349 old_hash = *dst;
1350 rcu_assign_pointer(*dst, new_hash); 1350 rcu_assign_pointer(*dst, new_hash);
1351 free_ftrace_hash_rcu(old_hash); 1351 free_ftrace_hash_rcu(old_hash);
1352 1352
1353 ret = 0; 1353 ret = 0;
1354 out: 1354 out:
1355 /* 1355 /*
1356 * Enable regardless of ret: 1356 * Enable regardless of ret:
1357 * On success, we enable the new hash. 1357 * On success, we enable the new hash.
1358 * On failure, we re-enable the original hash. 1358 * On failure, we re-enable the original hash.
1359 */ 1359 */
1360 ftrace_hash_rec_enable(ops, enable); 1360 ftrace_hash_rec_enable(ops, enable);
1361 1361
1362 return ret; 1362 return ret;
1363 } 1363 }
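
The sizing loop above ("Make the hash size about 1/2 the # found") computes bits as roughly log2(count / 2), capped by FTRACE_HASH_MAX_BITS, so the rebuilt hash gets about half as many buckets as it has entries. A minimal stand-alone sketch of that arithmetic (user-space C; the 12-bit cap is an assumption standing in for the kernel's FTRACE_HASH_MAX_BITS):

        #include <stdio.h>

        #define MAX_BITS 12     /* assumed stand-in for FTRACE_HASH_MAX_BITS */

        /* Mirror of the sizing loop in ftrace_hash_move() */
        static int hash_bits(int count)
        {
                int size, bits = 0;

                for (size = count / 2; size; size >>= 1)
                        bits++;
                return bits > MAX_BITS ? MAX_BITS : bits;
        }

        int main(void)
        {
                int counts[] = { 1, 8, 100, 100000 };
                unsigned int i;

                for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
                        printf("count=%6d -> bits=%d (%d buckets)\n", counts[i],
                               hash_bits(counts[i]), 1 << hash_bits(counts[i]));
                return 0;
        }

For count=100 this yields bits=6, i.e. 64 buckets for 100 entries; very large counts saturate at the cap.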
1364 1364
1365 /* 1365 /*
1366 * Test the hashes for this ops to see if we want to call 1366 * Test the hashes for this ops to see if we want to call
1367 * the ops->func or not. 1367 * the ops->func or not.
1368 * 1368 *
1369 * It's a match if the ip is in the ops->filter_hash or 1369 * It's a match if the ip is in the ops->filter_hash or
1370 * the filter_hash does not exist or is empty, 1370 * the filter_hash does not exist or is empty,
1371 * AND 1371 * AND
1372 * the ip is not in the ops->notrace_hash. 1372 * the ip is not in the ops->notrace_hash.
1373 * 1373 *
1374 * This needs to be called with preemption disabled as 1374 * This needs to be called with preemption disabled as
1375 * the hashes are freed with call_rcu_sched(). 1375 * the hashes are freed with call_rcu_sched().
1376 */ 1376 */
1377 static int 1377 static int
1378 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 1378 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1379 { 1379 {
1380 struct ftrace_hash *filter_hash; 1380 struct ftrace_hash *filter_hash;
1381 struct ftrace_hash *notrace_hash; 1381 struct ftrace_hash *notrace_hash;
1382 int ret; 1382 int ret;
1383 1383
1384 filter_hash = rcu_dereference_raw(ops->filter_hash); 1384 filter_hash = rcu_dereference_raw(ops->filter_hash);
1385 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1385 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1386 1386
1387 if ((ftrace_hash_empty(filter_hash) || 1387 if ((ftrace_hash_empty(filter_hash) ||
1388 ftrace_lookup_ip(filter_hash, ip)) && 1388 ftrace_lookup_ip(filter_hash, ip)) &&
1389 (ftrace_hash_empty(notrace_hash) || 1389 (ftrace_hash_empty(notrace_hash) ||
1390 !ftrace_lookup_ip(notrace_hash, ip))) 1390 !ftrace_lookup_ip(notrace_hash, ip)))
1391 ret = 1; 1391 ret = 1;
1392 else 1392 else
1393 ret = 0; 1393 ret = 0;
1394 1394
1395 return ret; 1395 return ret;
1396 } 1396 }
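
The predicate above reads as: traced by the filter hash (or the filter is empty, which means trace everything), AND not blocked by the notrace hash. A tiny stand-alone model of just that boolean logic, using hypothetical helper names rather than the kernel API:

        #include <stdio.h>
        #include <stdbool.h>

        /* Model of the ftrace_ops_test() predicate over two hash lookups */
        static bool ops_match(bool filter_empty, bool in_filter,
                              bool notrace_empty, bool in_notrace)
        {
                return (filter_empty || in_filter) && (notrace_empty || !in_notrace);
        }

        int main(void)
        {
                /* An empty filter hash traces everything; notrace always wins. */
                printf("%d\n", ops_match(true,  false, true,  false)); /* 1: trace all */
                printf("%d\n", ops_match(false, true,  true,  false)); /* 1: in filter */
                printf("%d\n", ops_match(false, true,  false, true));  /* 0: notrace blocks */
                printf("%d\n", ops_match(false, false, true,  false)); /* 0: not in filter */
                return 0;
        }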
1397 1397
1398 /* 1398 /*
1399  * This is a double for loop. Do not use 'break' to break out of the loop; 1399  * This is a double for loop. Do not use 'break' to break out of the loop;
1400  * you must use a goto. 1400  * you must use a goto.
1401 */ 1401 */
1402 #define do_for_each_ftrace_rec(pg, rec) \ 1402 #define do_for_each_ftrace_rec(pg, rec) \
1403 for (pg = ftrace_pages_start; pg; pg = pg->next) { \ 1403 for (pg = ftrace_pages_start; pg; pg = pg->next) { \
1404 int _____i; \ 1404 int _____i; \
1405 for (_____i = 0; _____i < pg->index; _____i++) { \ 1405 for (_____i = 0; _____i < pg->index; _____i++) { \
1406 rec = &pg->records[_____i]; 1406 rec = &pg->records[_____i];
1407 1407
1408 #define while_for_each_ftrace_rec() \ 1408 #define while_for_each_ftrace_rec() \
1409 } \ 1409 } \
1410 } 1410 }
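
Because the macro pair opens two nested for loops, a 'break' in the body would only leave the inner loop over a single page's records; escaping early requires a goto past while_for_each_ftrace_rec(), which is exactly how __ftrace_hash_rec_update() below bails out. A self-contained toy with mock types showing the same shape (not kernel code):

        #include <stdio.h>

        struct rec { int ip; };
        struct page { struct rec recs[4]; int index; struct page *next; };

        /* Same double-loop shape as do_for_each_ftrace_rec()/while_for_each_ftrace_rec() */
        #define do_for_each_rec(pg, r, start)                   \
                for (pg = (start); pg; pg = pg->next) {         \
                        int _i;                                 \
                        for (_i = 0; _i < pg->index; _i++) {    \
                                r = &pg->recs[_i];

        #define while_for_each_rec()                            \
                        }                                       \
                }

        int main(void)
        {
                struct page p2 = { { {30}, {40} }, 2, NULL };
                struct page p1 = { { {10}, {20} }, 2, &p2 };
                struct page *pg;
                struct rec *r;

                do_for_each_rec(pg, r, &p1) {
                        if (r->ip == 30)
                                goto found;     /* 'break' would only leave the inner loop */
                        printf("visit %d\n", r->ip);
                } while_for_each_rec();
                printf("not found\n");
                return 1;
         found:
                printf("found %d\n", r->ip);
                return 0;
        }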
1411 1411
1412 1412
1413 static int ftrace_cmp_recs(const void *a, const void *b) 1413 static int ftrace_cmp_recs(const void *a, const void *b)
1414 { 1414 {
1415 const struct dyn_ftrace *key = a; 1415 const struct dyn_ftrace *key = a;
1416 const struct dyn_ftrace *rec = b; 1416 const struct dyn_ftrace *rec = b;
1417 1417
1418 if (key->flags < rec->ip) 1418 if (key->flags < rec->ip)
1419 return -1; 1419 return -1;
1420 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) 1420 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
1421 return 1; 1421 return 1;
1422 return 0; 1422 return 0;
1423 } 1423 }
1424 1424
1425 static unsigned long ftrace_location_range(unsigned long start, unsigned long end) 1425 static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
1426 { 1426 {
1427 struct ftrace_page *pg; 1427 struct ftrace_page *pg;
1428 struct dyn_ftrace *rec; 1428 struct dyn_ftrace *rec;
1429 struct dyn_ftrace key; 1429 struct dyn_ftrace key;
1430 1430
1431 key.ip = start; 1431 key.ip = start;
1432 key.flags = end; /* overload flags, as it is unsigned long */ 1432 key.flags = end; /* overload flags, as it is unsigned long */
1433 1433
1434 for (pg = ftrace_pages_start; pg; pg = pg->next) { 1434 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1435 if (end < pg->records[0].ip || 1435 if (end < pg->records[0].ip ||
1436 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) 1436 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
1437 continue; 1437 continue;
1438 rec = bsearch(&key, pg->records, pg->index, 1438 rec = bsearch(&key, pg->records, pg->index,
1439 sizeof(struct dyn_ftrace), 1439 sizeof(struct dyn_ftrace),
1440 ftrace_cmp_recs); 1440 ftrace_cmp_recs);
1441 if (rec) 1441 if (rec)
1442 return rec->ip; 1442 return rec->ip;
1443 } 1443 }
1444 1444
1445 return 0; 1445 return 0;
1446 } 1446 }
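
ftrace_cmp_recs() makes bsearch() answer a range query: the key's flags field is overloaded as the end address, so a record matches when its [ip, ip + MCOUNT_INSN_SIZE) window intersects [start, end]. The same trick, demonstrated with libc bsearch() in user space (the instruction size and addresses here are made up):

        #include <stdio.h>
        #include <stdlib.h>

        #define INSN_SIZE 5     /* stand-in for MCOUNT_INSN_SIZE */

        struct rec { unsigned long ip; unsigned long flags; };

        /* a is the key: ip = start of range, flags overloaded as end of range */
        static int cmp_recs(const void *a, const void *b)
        {
                const struct rec *key = a;
                const struct rec *rec = b;

                if (key->flags < rec->ip)
                        return -1;
                if (key->ip >= rec->ip + INSN_SIZE)
                        return 1;
                return 0;
        }

        int main(void)
        {
                struct rec recs[] = { {100}, {200}, {300} };    /* sorted by ip */
                struct rec key = { .ip = 203, .flags = 203 };   /* single-address probe */
                struct rec *hit;

                hit = bsearch(&key, recs, 3, sizeof(recs[0]), cmp_recs);
                printf("203 -> %lu\n", hit ? hit->ip : 0UL);    /* inside [200,205): prints 200 */

                key.ip = 150; key.flags = 180;                  /* range probe, no overlap */
                hit = bsearch(&key, recs, 3, sizeof(recs[0]), cmp_recs);
                printf("[150,180] -> %lu\n", hit ? hit->ip : 0UL); /* prints 0 */
                return 0;
        }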
1447 1447
1448 /** 1448 /**
1449  * ftrace_location - return true if the ip given is a traced location 1449  * ftrace_location - return true if the ip given is a traced location
1450 * @ip: the instruction pointer to check 1450 * @ip: the instruction pointer to check
1451 * 1451 *
1452  * Returns rec->ip if the @ip given is a pointer to an ftrace location. 1452  * Returns rec->ip if the @ip given is a pointer to an ftrace location.
1453 * That is, the instruction that is either a NOP or call to 1453 * That is, the instruction that is either a NOP or call to
1454 * the function tracer. It checks the ftrace internal tables to 1454 * the function tracer. It checks the ftrace internal tables to
1455 * determine if the address belongs or not. 1455 * determine if the address belongs or not.
1456 */ 1456 */
1457 unsigned long ftrace_location(unsigned long ip) 1457 unsigned long ftrace_location(unsigned long ip)
1458 { 1458 {
1459 return ftrace_location_range(ip, ip); 1459 return ftrace_location_range(ip, ip);
1460 } 1460 }
1461 1461
1462 /** 1462 /**
1463 * ftrace_text_reserved - return true if range contains an ftrace location 1463 * ftrace_text_reserved - return true if range contains an ftrace location
1464 * @start: start of range to search 1464 * @start: start of range to search
1465 * @end: end of range to search (inclusive). @end points to the last byte to check. 1465 * @end: end of range to search (inclusive). @end points to the last byte to check.
1466 * 1466 *
1467  * Returns 1 if the range from @start to @end contains an ftrace location. 1467  * Returns 1 if the range from @start to @end contains an ftrace location.
1468 * That is, the instruction that is either a NOP or call to 1468 * That is, the instruction that is either a NOP or call to
1469 * the function tracer. It checks the ftrace internal tables to 1469 * the function tracer. It checks the ftrace internal tables to
1470 * determine if the address belongs or not. 1470 * determine if the address belongs or not.
1471 */ 1471 */
1472 int ftrace_text_reserved(void *start, void *end) 1472 int ftrace_text_reserved(void *start, void *end)
1473 { 1473 {
1474 unsigned long ret; 1474 unsigned long ret;
1475 1475
1476 ret = ftrace_location_range((unsigned long)start, 1476 ret = ftrace_location_range((unsigned long)start,
1477 (unsigned long)end); 1477 (unsigned long)end);
1478 1478
1479 return (int)!!ret; 1479 return (int)!!ret;
1480 } 1480 }
1481 1481
1482 static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1482 static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1483 int filter_hash, 1483 int filter_hash,
1484 bool inc) 1484 bool inc)
1485 { 1485 {
1486 struct ftrace_hash *hash; 1486 struct ftrace_hash *hash;
1487 struct ftrace_hash *other_hash; 1487 struct ftrace_hash *other_hash;
1488 struct ftrace_page *pg; 1488 struct ftrace_page *pg;
1489 struct dyn_ftrace *rec; 1489 struct dyn_ftrace *rec;
1490 int count = 0; 1490 int count = 0;
1491 int all = 0; 1491 int all = 0;
1492 1492
1493 /* Only update if the ops has been registered */ 1493 /* Only update if the ops has been registered */
1494 if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) 1494 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1495 return; 1495 return;
1496 1496
1497 /* 1497 /*
1498 * In the filter_hash case: 1498 * In the filter_hash case:
1499 * If the count is zero, we update all records. 1499 * If the count is zero, we update all records.
1500 * Otherwise we just update the items in the hash. 1500 * Otherwise we just update the items in the hash.
1501 * 1501 *
1502 * In the notrace_hash case: 1502 * In the notrace_hash case:
1503 * We enable the update in the hash. 1503 * We enable the update in the hash.
1504 * As disabling notrace means enabling the tracing, 1504 * As disabling notrace means enabling the tracing,
1505 * and enabling notrace means disabling, the inc variable 1505 * and enabling notrace means disabling, the inc variable
1506  * gets inverted. 1506  * gets inverted.
1507 */ 1507 */
1508 if (filter_hash) { 1508 if (filter_hash) {
1509 hash = ops->filter_hash; 1509 hash = ops->filter_hash;
1510 other_hash = ops->notrace_hash; 1510 other_hash = ops->notrace_hash;
1511 if (ftrace_hash_empty(hash)) 1511 if (ftrace_hash_empty(hash))
1512 all = 1; 1512 all = 1;
1513 } else { 1513 } else {
1514 inc = !inc; 1514 inc = !inc;
1515 hash = ops->notrace_hash; 1515 hash = ops->notrace_hash;
1516 other_hash = ops->filter_hash; 1516 other_hash = ops->filter_hash;
1517 /* 1517 /*
1518 * If the notrace hash has no items, 1518 * If the notrace hash has no items,
1519 * then there's nothing to do. 1519 * then there's nothing to do.
1520 */ 1520 */
1521 if (ftrace_hash_empty(hash)) 1521 if (ftrace_hash_empty(hash))
1522 return; 1522 return;
1523 } 1523 }
1524 1524
1525 do_for_each_ftrace_rec(pg, rec) { 1525 do_for_each_ftrace_rec(pg, rec) {
1526 int in_other_hash = 0; 1526 int in_other_hash = 0;
1527 int in_hash = 0; 1527 int in_hash = 0;
1528 int match = 0; 1528 int match = 0;
1529 1529
1530 if (all) { 1530 if (all) {
1531 /* 1531 /*
1532 * Only the filter_hash affects all records. 1532 * Only the filter_hash affects all records.
1533 * Update if the record is not in the notrace hash. 1533 * Update if the record is not in the notrace hash.
1534 */ 1534 */
1535 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) 1535 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1536 match = 1; 1536 match = 1;
1537 } else { 1537 } else {
1538 in_hash = !!ftrace_lookup_ip(hash, rec->ip); 1538 in_hash = !!ftrace_lookup_ip(hash, rec->ip);
1539 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); 1539 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1540 1540
1541 /* 1541 /*
1542  * With filter_hash set: match records that are in this hash but not in the other (notrace) hash. Otherwise (a notrace update): match records in this hash that the filter hash also traces, i.e. that are in the other hash or the other hash is empty. 1542  * With filter_hash set: match records that are in this hash but not in the other (notrace) hash. Otherwise (a notrace update): match records in this hash that the filter hash also traces, i.e. that are in the other hash or the other hash is empty.
1543  */ 1543  */
1544 if (filter_hash && in_hash && !in_other_hash) 1544 if (filter_hash && in_hash && !in_other_hash)
1545 match = 1; 1545 match = 1;
1546 else if (!filter_hash && in_hash && 1546 else if (!filter_hash && in_hash &&
1547 (in_other_hash || ftrace_hash_empty(other_hash))) 1547 (in_other_hash || ftrace_hash_empty(other_hash)))
1548 match = 1; 1548 match = 1;
1549 } 1549 }
1550 if (!match) 1550 if (!match)
1551 continue; 1551 continue;
1552 1552
1553 if (inc) { 1553 if (inc) {
1554 rec->flags++; 1554 rec->flags++;
1555 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) 1555 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1556 return; 1556 return;
1557 /* 1557 /*
1558 * If any ops wants regs saved for this function 1558 * If any ops wants regs saved for this function
1559 * then all ops will get saved regs. 1559 * then all ops will get saved regs.
1560 */ 1560 */
1561 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) 1561 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
1562 rec->flags |= FTRACE_FL_REGS; 1562 rec->flags |= FTRACE_FL_REGS;
1563 } else { 1563 } else {
1564 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) 1564 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1565 return; 1565 return;
1566 rec->flags--; 1566 rec->flags--;
1567 } 1567 }
1568 count++; 1568 count++;
1569 /* Shortcut, if we handled all records, we are done. */ 1569 /* Shortcut, if we handled all records, we are done. */
1570 if (!all && count == hash->count) 1570 if (!all && count == hash->count)
1571 return; 1571 return;
1572 } while_for_each_ftrace_rec(); 1572 } while_for_each_ftrace_rec();
1573 } 1573 }
1574 1574
1575 static void ftrace_hash_rec_disable(struct ftrace_ops *ops, 1575 static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1576 int filter_hash) 1576 int filter_hash)
1577 { 1577 {
1578 __ftrace_hash_rec_update(ops, filter_hash, 0); 1578 __ftrace_hash_rec_update(ops, filter_hash, 0);
1579 } 1579 }
1580 1580
1581 static void ftrace_hash_rec_enable(struct ftrace_ops *ops, 1581 static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1582 int filter_hash) 1582 int filter_hash)
1583 { 1583 {
1584 __ftrace_hash_rec_update(ops, filter_hash, 1); 1584 __ftrace_hash_rec_update(ops, filter_hash, 1);
1585 } 1585 }
1586 1586
1587 static void print_ip_ins(const char *fmt, unsigned char *p) 1587 static void print_ip_ins(const char *fmt, unsigned char *p)
1588 { 1588 {
1589 int i; 1589 int i;
1590 1590
1591 printk(KERN_CONT "%s", fmt); 1591 printk(KERN_CONT "%s", fmt);
1592 1592
1593 for (i = 0; i < MCOUNT_INSN_SIZE; i++) 1593 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
1594 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1594 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1595 } 1595 }
1596 1596
1597 /** 1597 /**
1598 * ftrace_bug - report and shutdown function tracer 1598 * ftrace_bug - report and shutdown function tracer
1599 * @failed: The failed type (EFAULT, EINVAL, EPERM) 1599 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1600 * @ip: The address that failed 1600 * @ip: The address that failed
1601 * 1601 *
1602 * The arch code that enables or disables the function tracing 1602 * The arch code that enables or disables the function tracing
1603 * can call ftrace_bug() when it has detected a problem in 1603 * can call ftrace_bug() when it has detected a problem in
1604 * modifying the code. @failed should be one of either: 1604 * modifying the code. @failed should be one of either:
1605 * EFAULT - if the problem happens on reading the @ip address 1605 * EFAULT - if the problem happens on reading the @ip address
1606 * EINVAL - if what is read at @ip is not what was expected 1606 * EINVAL - if what is read at @ip is not what was expected
1607  *  EPERM - if the problem happens on writing to the @ip address 1607  *  EPERM - if the problem happens on writing to the @ip address
1608 */ 1608 */
1609 void ftrace_bug(int failed, unsigned long ip) 1609 void ftrace_bug(int failed, unsigned long ip)
1610 { 1610 {
1611 switch (failed) { 1611 switch (failed) {
1612 case -EFAULT: 1612 case -EFAULT:
1613 FTRACE_WARN_ON_ONCE(1); 1613 FTRACE_WARN_ON_ONCE(1);
1614 pr_info("ftrace faulted on modifying "); 1614 pr_info("ftrace faulted on modifying ");
1615 print_ip_sym(ip); 1615 print_ip_sym(ip);
1616 break; 1616 break;
1617 case -EINVAL: 1617 case -EINVAL:
1618 FTRACE_WARN_ON_ONCE(1); 1618 FTRACE_WARN_ON_ONCE(1);
1619 pr_info("ftrace failed to modify "); 1619 pr_info("ftrace failed to modify ");
1620 print_ip_sym(ip); 1620 print_ip_sym(ip);
1621 print_ip_ins(" actual: ", (unsigned char *)ip); 1621 print_ip_ins(" actual: ", (unsigned char *)ip);
1622 printk(KERN_CONT "\n"); 1622 printk(KERN_CONT "\n");
1623 break; 1623 break;
1624 case -EPERM: 1624 case -EPERM:
1625 FTRACE_WARN_ON_ONCE(1); 1625 FTRACE_WARN_ON_ONCE(1);
1626 pr_info("ftrace faulted on writing "); 1626 pr_info("ftrace faulted on writing ");
1627 print_ip_sym(ip); 1627 print_ip_sym(ip);
1628 break; 1628 break;
1629 default: 1629 default:
1630 FTRACE_WARN_ON_ONCE(1); 1630 FTRACE_WARN_ON_ONCE(1);
1631 pr_info("ftrace faulted on unknown error "); 1631 pr_info("ftrace faulted on unknown error ");
1632 print_ip_sym(ip); 1632 print_ip_sym(ip);
1633 } 1633 }
1634 } 1634 }
1635 1635
1636 static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1636 static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1637 { 1637 {
1638 unsigned long flag = 0UL; 1638 unsigned long flag = 0UL;
1639 1639
1640 /* 1640 /*
1641 * If we are updating calls: 1641 * If we are updating calls:
1642 * 1642 *
1643 * If the record has a ref count, then we need to enable it 1643 * If the record has a ref count, then we need to enable it
1644 * because someone is using it. 1644 * because someone is using it.
1645 * 1645 *
1646  * Otherwise we make sure it's disabled. 1646  * Otherwise we make sure it's disabled.
1647 * 1647 *
1648 * If we are disabling calls, then disable all records that 1648 * If we are disabling calls, then disable all records that
1649 * are enabled. 1649 * are enabled.
1650 */ 1650 */
1651 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1651 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1652 flag = FTRACE_FL_ENABLED; 1652 flag = FTRACE_FL_ENABLED;
1653 1653
1654 /* 1654 /*
1655 * If enabling and the REGS flag does not match the REGS_EN, then 1655 * If enabling and the REGS flag does not match the REGS_EN, then
1656 * do not ignore this record. Set flags to fail the compare against 1656 * do not ignore this record. Set flags to fail the compare against
1657 * ENABLED. 1657 * ENABLED.
1658 */ 1658 */
1659 if (flag && 1659 if (flag &&
1660 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) 1660 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
1661 flag |= FTRACE_FL_REGS; 1661 flag |= FTRACE_FL_REGS;
1662 1662
1663 /* If the state of this record hasn't changed, then do nothing */ 1663 /* If the state of this record hasn't changed, then do nothing */
1664 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1664 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1665 return FTRACE_UPDATE_IGNORE; 1665 return FTRACE_UPDATE_IGNORE;
1666 1666
1667 if (flag) { 1667 if (flag) {
1668 /* Save off if rec is being enabled (for return value) */ 1668 /* Save off if rec is being enabled (for return value) */
1669 flag ^= rec->flags & FTRACE_FL_ENABLED; 1669 flag ^= rec->flags & FTRACE_FL_ENABLED;
1670 1670
1671 if (update) { 1671 if (update) {
1672 rec->flags |= FTRACE_FL_ENABLED; 1672 rec->flags |= FTRACE_FL_ENABLED;
1673 if (flag & FTRACE_FL_REGS) { 1673 if (flag & FTRACE_FL_REGS) {
1674 if (rec->flags & FTRACE_FL_REGS) 1674 if (rec->flags & FTRACE_FL_REGS)
1675 rec->flags |= FTRACE_FL_REGS_EN; 1675 rec->flags |= FTRACE_FL_REGS_EN;
1676 else 1676 else
1677 rec->flags &= ~FTRACE_FL_REGS_EN; 1677 rec->flags &= ~FTRACE_FL_REGS_EN;
1678 } 1678 }
1679 } 1679 }
1680 1680
1681 /* 1681 /*
1682 * If this record is being updated from a nop, then 1682 * If this record is being updated from a nop, then
1683 * return UPDATE_MAKE_CALL. 1683 * return UPDATE_MAKE_CALL.
1684 * Otherwise, if the EN flag is set, then return 1684 * Otherwise, if the EN flag is set, then return
1685 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert 1685 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1686 * from the non-save regs, to a save regs function. 1686 * from the non-save regs, to a save regs function.
1687 * Otherwise, 1687 * Otherwise,
1688 * return UPDATE_MODIFY_CALL to tell the caller to convert 1688 * return UPDATE_MODIFY_CALL to tell the caller to convert
1689 * from the save regs, to a non-save regs function. 1689 * from the save regs, to a non-save regs function.
1690 */ 1690 */
1691 if (flag & FTRACE_FL_ENABLED) 1691 if (flag & FTRACE_FL_ENABLED)
1692 return FTRACE_UPDATE_MAKE_CALL; 1692 return FTRACE_UPDATE_MAKE_CALL;
1693 else if (rec->flags & FTRACE_FL_REGS_EN) 1693 else if (rec->flags & FTRACE_FL_REGS_EN)
1694 return FTRACE_UPDATE_MODIFY_CALL_REGS; 1694 return FTRACE_UPDATE_MODIFY_CALL_REGS;
1695 else 1695 else
1696 return FTRACE_UPDATE_MODIFY_CALL; 1696 return FTRACE_UPDATE_MODIFY_CALL;
1697 } 1697 }
1698 1698
1699 if (update) { 1699 if (update) {
1700 /* If there's no more users, clear all flags */ 1700 /* If there's no more users, clear all flags */
1701 if (!(rec->flags & ~FTRACE_FL_MASK)) 1701 if (!(rec->flags & ~FTRACE_FL_MASK))
1702 rec->flags = 0; 1702 rec->flags = 0;
1703 else 1703 else
1704 /* Just disable the record (keep REGS state) */ 1704 /* Just disable the record (keep REGS state) */
1705 rec->flags &= ~FTRACE_FL_ENABLED; 1705 rec->flags &= ~FTRACE_FL_ENABLED;
1706 } 1706 }
1707 1707
1708 return FTRACE_UPDATE_MAKE_NOP; 1708 return FTRACE_UPDATE_MAKE_NOP;
1709 } 1709 }
1710 1710
1711 /** 1711 /**
1712  * ftrace_update_record, set a record that is now tracing or not 1712  * ftrace_update_record, set a record that is now tracing or not
1713 * @rec: the record to update 1713 * @rec: the record to update
1714 * @enable: set to 1 if the record is tracing, zero to force disable 1714 * @enable: set to 1 if the record is tracing, zero to force disable
1715 * 1715 *
1716 * The records that represent all functions that can be traced need 1716 * The records that represent all functions that can be traced need
1717 * to be updated when tracing has been enabled. 1717 * to be updated when tracing has been enabled.
1718 */ 1718 */
1719 int ftrace_update_record(struct dyn_ftrace *rec, int enable) 1719 int ftrace_update_record(struct dyn_ftrace *rec, int enable)
1720 { 1720 {
1721 return ftrace_check_record(rec, enable, 1); 1721 return ftrace_check_record(rec, enable, 1);
1722 } 1722 }
1723 1723
1724 /** 1724 /**
1725 * ftrace_test_record, check if the record has been enabled or not 1725 * ftrace_test_record, check if the record has been enabled or not
1726 * @rec: the record to test 1726 * @rec: the record to test
1727 * @enable: set to 1 to check if enabled, 0 if it is disabled 1727 * @enable: set to 1 to check if enabled, 0 if it is disabled
1728 * 1728 *
1729 * The arch code may need to test if a record is already set to 1729 * The arch code may need to test if a record is already set to
1730 * tracing to determine how to modify the function code that it 1730 * tracing to determine how to modify the function code that it
1731 * represents. 1731 * represents.
1732 */ 1732 */
1733 int ftrace_test_record(struct dyn_ftrace *rec, int enable) 1733 int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1734 { 1734 {
1735 return ftrace_check_record(rec, enable, 0); 1735 return ftrace_check_record(rec, enable, 0);
1736 } 1736 }
1737 1737
1738 static int 1738 static int
1739 __ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1739 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1740 { 1740 {
1741 unsigned long ftrace_old_addr; 1741 unsigned long ftrace_old_addr;
1742 unsigned long ftrace_addr; 1742 unsigned long ftrace_addr;
1743 int ret; 1743 int ret;
1744 1744
1745 ret = ftrace_update_record(rec, enable); 1745 ret = ftrace_update_record(rec, enable);
1746 1746
1747 if (rec->flags & FTRACE_FL_REGS) 1747 if (rec->flags & FTRACE_FL_REGS)
1748 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; 1748 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
1749 else 1749 else
1750 ftrace_addr = (unsigned long)FTRACE_ADDR; 1750 ftrace_addr = (unsigned long)FTRACE_ADDR;
1751 1751
1752 switch (ret) { 1752 switch (ret) {
1753 case FTRACE_UPDATE_IGNORE: 1753 case FTRACE_UPDATE_IGNORE:
1754 return 0; 1754 return 0;
1755 1755
1756 case FTRACE_UPDATE_MAKE_CALL: 1756 case FTRACE_UPDATE_MAKE_CALL:
1757 return ftrace_make_call(rec, ftrace_addr); 1757 return ftrace_make_call(rec, ftrace_addr);
1758 1758
1759 case FTRACE_UPDATE_MAKE_NOP: 1759 case FTRACE_UPDATE_MAKE_NOP:
1760 return ftrace_make_nop(NULL, rec, ftrace_addr); 1760 return ftrace_make_nop(NULL, rec, ftrace_addr);
1761 1761
1762 case FTRACE_UPDATE_MODIFY_CALL_REGS: 1762 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1763 case FTRACE_UPDATE_MODIFY_CALL: 1763 case FTRACE_UPDATE_MODIFY_CALL:
1764 if (rec->flags & FTRACE_FL_REGS) 1764 if (rec->flags & FTRACE_FL_REGS)
1765 ftrace_old_addr = (unsigned long)FTRACE_ADDR; 1765 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1766 else 1766 else
1767 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; 1767 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1768 1768
1769 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 1769 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1770 } 1770 }
1771 1771
1772 	return -1; /* unknown ftrace bug */ 1772 	return -1; /* unknown ftrace bug */
1773 } 1773 }
1774 1774
1775 void __weak ftrace_replace_code(int enable) 1775 void __weak ftrace_replace_code(int enable)
1776 { 1776 {
1777 struct dyn_ftrace *rec; 1777 struct dyn_ftrace *rec;
1778 struct ftrace_page *pg; 1778 struct ftrace_page *pg;
1779 int failed; 1779 int failed;
1780 1780
1781 if (unlikely(ftrace_disabled)) 1781 if (unlikely(ftrace_disabled))
1782 return; 1782 return;
1783 1783
1784 do_for_each_ftrace_rec(pg, rec) { 1784 do_for_each_ftrace_rec(pg, rec) {
1785 failed = __ftrace_replace_code(rec, enable); 1785 failed = __ftrace_replace_code(rec, enable);
1786 if (failed) { 1786 if (failed) {
1787 ftrace_bug(failed, rec->ip); 1787 ftrace_bug(failed, rec->ip);
1788 /* Stop processing */ 1788 /* Stop processing */
1789 return; 1789 return;
1790 } 1790 }
1791 } while_for_each_ftrace_rec(); 1791 } while_for_each_ftrace_rec();
1792 } 1792 }
1793 1793
1794 struct ftrace_rec_iter { 1794 struct ftrace_rec_iter {
1795 struct ftrace_page *pg; 1795 struct ftrace_page *pg;
1796 int index; 1796 int index;
1797 }; 1797 };
1798 1798
1799 /** 1799 /**
1800 * ftrace_rec_iter_start, start up iterating over traced functions 1800 * ftrace_rec_iter_start, start up iterating over traced functions
1801 * 1801 *
1802 * Returns an iterator handle that is used to iterate over all 1802 * Returns an iterator handle that is used to iterate over all
1803 * the records that represent address locations where functions 1803 * the records that represent address locations where functions
1804 * are traced. 1804 * are traced.
1805 * 1805 *
1806 * May return NULL if no records are available. 1806 * May return NULL if no records are available.
1807 */ 1807 */
1808 struct ftrace_rec_iter *ftrace_rec_iter_start(void) 1808 struct ftrace_rec_iter *ftrace_rec_iter_start(void)
1809 { 1809 {
1810 /* 1810 /*
1811 * We only use a single iterator. 1811 * We only use a single iterator.
1812 * Protected by the ftrace_lock mutex. 1812 * Protected by the ftrace_lock mutex.
1813 */ 1813 */
1814 static struct ftrace_rec_iter ftrace_rec_iter; 1814 static struct ftrace_rec_iter ftrace_rec_iter;
1815 struct ftrace_rec_iter *iter = &ftrace_rec_iter; 1815 struct ftrace_rec_iter *iter = &ftrace_rec_iter;
1816 1816
1817 iter->pg = ftrace_pages_start; 1817 iter->pg = ftrace_pages_start;
1818 iter->index = 0; 1818 iter->index = 0;
1819 1819
1820 /* Could have empty pages */ 1820 /* Could have empty pages */
1821 while (iter->pg && !iter->pg->index) 1821 while (iter->pg && !iter->pg->index)
1822 iter->pg = iter->pg->next; 1822 iter->pg = iter->pg->next;
1823 1823
1824 if (!iter->pg) 1824 if (!iter->pg)
1825 return NULL; 1825 return NULL;
1826 1826
1827 return iter; 1827 return iter;
1828 } 1828 }
1829 1829
1830 /** 1830 /**
1831 * ftrace_rec_iter_next, get the next record to process. 1831 * ftrace_rec_iter_next, get the next record to process.
1832 * @iter: The handle to the iterator. 1832 * @iter: The handle to the iterator.
1833 * 1833 *
1834 * Returns the next iterator after the given iterator @iter. 1834 * Returns the next iterator after the given iterator @iter.
1835 */ 1835 */
1836 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) 1836 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
1837 { 1837 {
1838 iter->index++; 1838 iter->index++;
1839 1839
1840 if (iter->index >= iter->pg->index) { 1840 if (iter->index >= iter->pg->index) {
1841 iter->pg = iter->pg->next; 1841 iter->pg = iter->pg->next;
1842 iter->index = 0; 1842 iter->index = 0;
1843 1843
1844 /* Could have empty pages */ 1844 /* Could have empty pages */
1845 while (iter->pg && !iter->pg->index) 1845 while (iter->pg && !iter->pg->index)
1846 iter->pg = iter->pg->next; 1846 iter->pg = iter->pg->next;
1847 } 1847 }
1848 1848
1849 if (!iter->pg) 1849 if (!iter->pg)
1850 return NULL; 1850 return NULL;
1851 1851
1852 return iter; 1852 return iter;
1853 } 1853 }
1854 1854
1855 /** 1855 /**
1856 * ftrace_rec_iter_record, get the record at the iterator location 1856 * ftrace_rec_iter_record, get the record at the iterator location
1857 * @iter: The current iterator location 1857 * @iter: The current iterator location
1858 * 1858 *
1859 * Returns the record that the current @iter is at. 1859 * Returns the record that the current @iter is at.
1860 */ 1860 */
1861 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) 1861 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
1862 { 1862 {
1863 return &iter->pg->records[iter->index]; 1863 return &iter->pg->records[iter->index];
1864 } 1864 }
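
Together the three helpers form a simple cursor over every traced record. A sketch of how arch code might walk it (a kernel-context fragment, not standalone; it assumes the caller holds ftrace_lock, which the iterator's single static instance requires):

        /* Sketch only: visit every record under ftrace_lock */
        struct ftrace_rec_iter *iter;
        struct dyn_ftrace *rec;

        for (iter = ftrace_rec_iter_start(); iter;
             iter = ftrace_rec_iter_next(iter)) {
                rec = ftrace_rec_iter_record(iter);
                /* inspect or patch the call site at rec->ip here */
        }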
1865 1865
1866 static int 1866 static int
1867 ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 1867 ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1868 { 1868 {
1869 unsigned long ip; 1869 unsigned long ip;
1870 int ret; 1870 int ret;
1871 1871
1872 ip = rec->ip; 1872 ip = rec->ip;
1873 1873
1874 if (unlikely(ftrace_disabled)) 1874 if (unlikely(ftrace_disabled))
1875 return 0; 1875 return 0;
1876 1876
1877 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1877 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1878 if (ret) { 1878 if (ret) {
1879 ftrace_bug(ret, ip); 1879 ftrace_bug(ret, ip);
1880 return 0; 1880 return 0;
1881 } 1881 }
1882 return 1; 1882 return 1;
1883 } 1883 }
1884 1884
1885 /* 1885 /*
1886 * archs can override this function if they must do something 1886 * archs can override this function if they must do something
1887 * before the modifying code is performed. 1887 * before the modifying code is performed.
1888 */ 1888 */
1889 int __weak ftrace_arch_code_modify_prepare(void) 1889 int __weak ftrace_arch_code_modify_prepare(void)
1890 { 1890 {
1891 return 0; 1891 return 0;
1892 } 1892 }
1893 1893
1894 /* 1894 /*
1895 * archs can override this function if they must do something 1895 * archs can override this function if they must do something
1896 * after the modifying code is performed. 1896 * after the modifying code is performed.
1897 */ 1897 */
1898 int __weak ftrace_arch_code_modify_post_process(void) 1898 int __weak ftrace_arch_code_modify_post_process(void)
1899 { 1899 {
1900 return 0; 1900 return 0;
1901 } 1901 }
1902 1902
1903 void ftrace_modify_all_code(int command) 1903 void ftrace_modify_all_code(int command)
1904 { 1904 {
1905 if (command & FTRACE_UPDATE_CALLS) 1905 if (command & FTRACE_UPDATE_CALLS)
1906 ftrace_replace_code(1); 1906 ftrace_replace_code(1);
1907 else if (command & FTRACE_DISABLE_CALLS) 1907 else if (command & FTRACE_DISABLE_CALLS)
1908 ftrace_replace_code(0); 1908 ftrace_replace_code(0);
1909 1909
1910 if (command & FTRACE_UPDATE_TRACE_FUNC) 1910 if (command & FTRACE_UPDATE_TRACE_FUNC)
1911 ftrace_update_ftrace_func(ftrace_trace_function); 1911 ftrace_update_ftrace_func(ftrace_trace_function);
1912 1912
1913 if (command & FTRACE_START_FUNC_RET) 1913 if (command & FTRACE_START_FUNC_RET)
1914 ftrace_enable_ftrace_graph_caller(); 1914 ftrace_enable_ftrace_graph_caller();
1915 else if (command & FTRACE_STOP_FUNC_RET) 1915 else if (command & FTRACE_STOP_FUNC_RET)
1916 ftrace_disable_ftrace_graph_caller(); 1916 ftrace_disable_ftrace_graph_caller();
1917 } 1917 }
1918 1918
1919 static int __ftrace_modify_code(void *data) 1919 static int __ftrace_modify_code(void *data)
1920 { 1920 {
1921 int *command = data; 1921 int *command = data;
1922 1922
1923 ftrace_modify_all_code(*command); 1923 ftrace_modify_all_code(*command);
1924 1924
1925 return 0; 1925 return 0;
1926 } 1926 }
1927 1927
1928 /** 1928 /**
1929 * ftrace_run_stop_machine, go back to the stop machine method 1929 * ftrace_run_stop_machine, go back to the stop machine method
1930 * @command: The command to tell ftrace what to do 1930 * @command: The command to tell ftrace what to do
1931 * 1931 *
1932  * If an arch needs to fall back to the stop machine method, then 1932  * If an arch needs to fall back to the stop machine method, then
1933 * it can call this function. 1933 * it can call this function.
1934 */ 1934 */
1935 void ftrace_run_stop_machine(int command) 1935 void ftrace_run_stop_machine(int command)
1936 { 1936 {
1937 stop_machine(__ftrace_modify_code, &command, NULL); 1937 stop_machine(__ftrace_modify_code, &command, NULL);
1938 } 1938 }
1939 1939
1940 /** 1940 /**
1941 * arch_ftrace_update_code, modify the code to trace or not trace 1941 * arch_ftrace_update_code, modify the code to trace or not trace
1942 * @command: The command that needs to be done 1942 * @command: The command that needs to be done
1943 * 1943 *
1944  * Archs can override this function if they do not need to 1944  * Archs can override this function if they do not need to
1945 * run stop_machine() to modify code. 1945 * run stop_machine() to modify code.
1946 */ 1946 */
1947 void __weak arch_ftrace_update_code(int command) 1947 void __weak arch_ftrace_update_code(int command)
1948 { 1948 {
1949 ftrace_run_stop_machine(command); 1949 ftrace_run_stop_machine(command);
1950 } 1950 }
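
An arch that can patch its text safely while other CPUs run would override this weak hook and drive ftrace_modify_all_code() directly instead of going through stop_machine(). A hypothetical minimal override (a sketch, not a real port; the preparation comment is a placeholder for whatever synchronization the arch needs):

        /* Hypothetical arch override: modify code without stop_machine() */
        void arch_ftrace_update_code(int command)
        {
                /* arch-specific synchronization/prep would go here */
                ftrace_modify_all_code(command);
        }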
1951 1951
1952 static void ftrace_run_update_code(int command) 1952 static void ftrace_run_update_code(int command)
1953 { 1953 {
1954 int ret; 1954 int ret;
1955 1955
1956 ret = ftrace_arch_code_modify_prepare(); 1956 ret = ftrace_arch_code_modify_prepare();
1957 FTRACE_WARN_ON(ret); 1957 FTRACE_WARN_ON(ret);
1958 if (ret) 1958 if (ret)
1959 return; 1959 return;
1960 /* 1960 /*
1961 * Do not call function tracer while we update the code. 1961 * Do not call function tracer while we update the code.
1962 * We are in stop machine. 1962 * We are in stop machine.
1963 */ 1963 */
1964 function_trace_stop++; 1964 function_trace_stop++;
1965 1965
1966 /* 1966 /*
1967 * By default we use stop_machine() to modify the code. 1967 * By default we use stop_machine() to modify the code.
1968 	 * But archs can do whatever they want as long as it 1968 	 * But archs can do whatever they want as long as it
1969 	 * is safe. stop_machine() is the safest, but also 1969 	 * is safe. stop_machine() is the safest, but also
1970 * produces the most overhead. 1970 * produces the most overhead.
1971 */ 1971 */
1972 arch_ftrace_update_code(command); 1972 arch_ftrace_update_code(command);
1973 1973
1974 function_trace_stop--; 1974 function_trace_stop--;
1975 1975
1976 ret = ftrace_arch_code_modify_post_process(); 1976 ret = ftrace_arch_code_modify_post_process();
1977 FTRACE_WARN_ON(ret); 1977 FTRACE_WARN_ON(ret);
1978 } 1978 }
1979 1979
1980 static ftrace_func_t saved_ftrace_func; 1980 static ftrace_func_t saved_ftrace_func;
1981 static int ftrace_start_up; 1981 static int ftrace_start_up;
1982 static int global_start_up; 1982 static int global_start_up;
1983 1983
1984 static void ftrace_startup_enable(int command) 1984 static void ftrace_startup_enable(int command)
1985 { 1985 {
1986 if (saved_ftrace_func != ftrace_trace_function) { 1986 if (saved_ftrace_func != ftrace_trace_function) {
1987 saved_ftrace_func = ftrace_trace_function; 1987 saved_ftrace_func = ftrace_trace_function;
1988 command |= FTRACE_UPDATE_TRACE_FUNC; 1988 command |= FTRACE_UPDATE_TRACE_FUNC;
1989 } 1989 }
1990 1990
1991 if (!command || !ftrace_enabled) 1991 if (!command || !ftrace_enabled)
1992 return; 1992 return;
1993 1993
1994 ftrace_run_update_code(command); 1994 ftrace_run_update_code(command);
1995 } 1995 }
1996 1996
1997 static int ftrace_startup(struct ftrace_ops *ops, int command) 1997 static int ftrace_startup(struct ftrace_ops *ops, int command)
1998 { 1998 {
1999 bool hash_enable = true; 1999 bool hash_enable = true;
2000 2000
2001 if (unlikely(ftrace_disabled)) 2001 if (unlikely(ftrace_disabled))
2002 return -ENODEV; 2002 return -ENODEV;
2003 2003
2004 ftrace_start_up++; 2004 ftrace_start_up++;
2005 command |= FTRACE_UPDATE_CALLS; 2005 command |= FTRACE_UPDATE_CALLS;
2006 2006
2007 /* ops marked global share the filter hashes */ 2007 /* ops marked global share the filter hashes */
2008 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 2008 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
2009 ops = &global_ops; 2009 ops = &global_ops;
2010 /* Don't update hash if global is already set */ 2010 /* Don't update hash if global is already set */
2011 if (global_start_up) 2011 if (global_start_up)
2012 hash_enable = false; 2012 hash_enable = false;
2013 global_start_up++; 2013 global_start_up++;
2014 } 2014 }
2015 2015
2016 ops->flags |= FTRACE_OPS_FL_ENABLED; 2016 ops->flags |= FTRACE_OPS_FL_ENABLED;
2017 if (hash_enable) 2017 if (hash_enable)
2018 ftrace_hash_rec_enable(ops, 1); 2018 ftrace_hash_rec_enable(ops, 1);
2019 2019
2020 ftrace_startup_enable(command); 2020 ftrace_startup_enable(command);
2021 2021
2022 return 0; 2022 return 0;
2023 } 2023 }
2024 2024
2025 static void ftrace_shutdown(struct ftrace_ops *ops, int command) 2025 static void ftrace_shutdown(struct ftrace_ops *ops, int command)
2026 { 2026 {
2027 bool hash_disable = true; 2027 bool hash_disable = true;
2028 2028
2029 if (unlikely(ftrace_disabled)) 2029 if (unlikely(ftrace_disabled))
2030 return; 2030 return;
2031 2031
2032 ftrace_start_up--; 2032 ftrace_start_up--;
2033 /* 2033 /*
2034 	 * Just warn in case of imbalance; no need to kill ftrace, it's not 2034 	 * Just warn in case of imbalance; no need to kill ftrace, it's not
2035 	 * critical, but the ftrace_call callers may never be nopped again after 2035 	 * critical, but the ftrace_call callers may never be nopped again after
2036 * further ftrace uses. 2036 * further ftrace uses.
2037 */ 2037 */
2038 WARN_ON_ONCE(ftrace_start_up < 0); 2038 WARN_ON_ONCE(ftrace_start_up < 0);
2039 2039
2040 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 2040 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
2041 ops = &global_ops; 2041 ops = &global_ops;
2042 global_start_up--; 2042 global_start_up--;
2043 WARN_ON_ONCE(global_start_up < 0); 2043 WARN_ON_ONCE(global_start_up < 0);
2044 /* Don't update hash if global still has users */ 2044 /* Don't update hash if global still has users */
2045 if (global_start_up) { 2045 if (global_start_up) {
2046 WARN_ON_ONCE(!ftrace_start_up); 2046 WARN_ON_ONCE(!ftrace_start_up);
2047 hash_disable = false; 2047 hash_disable = false;
2048 } 2048 }
2049 } 2049 }
2050 2050
2051 if (hash_disable) 2051 if (hash_disable)
2052 ftrace_hash_rec_disable(ops, 1); 2052 ftrace_hash_rec_disable(ops, 1);
2053 2053
2054 if (ops != &global_ops || !global_start_up) 2054 if (ops != &global_ops || !global_start_up)
2055 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2055 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2056 2056
2057 command |= FTRACE_UPDATE_CALLS; 2057 command |= FTRACE_UPDATE_CALLS;
2058 2058
2059 if (saved_ftrace_func != ftrace_trace_function) { 2059 if (saved_ftrace_func != ftrace_trace_function) {
2060 saved_ftrace_func = ftrace_trace_function; 2060 saved_ftrace_func = ftrace_trace_function;
2061 command |= FTRACE_UPDATE_TRACE_FUNC; 2061 command |= FTRACE_UPDATE_TRACE_FUNC;
2062 } 2062 }
2063 2063
2064 if (!command || !ftrace_enabled) 2064 if (!command || !ftrace_enabled)
2065 return; 2065 return;
2066 2066
2067 ftrace_run_update_code(command); 2067 ftrace_run_update_code(command);
2068 } 2068 }
2069 2069
2070 static void ftrace_startup_sysctl(void) 2070 static void ftrace_startup_sysctl(void)
2071 { 2071 {
2072 if (unlikely(ftrace_disabled)) 2072 if (unlikely(ftrace_disabled))
2073 return; 2073 return;
2074 2074
2075 /* Force update next time */ 2075 /* Force update next time */
2076 saved_ftrace_func = NULL; 2076 saved_ftrace_func = NULL;
2077 /* ftrace_start_up is true if we want ftrace running */ 2077 /* ftrace_start_up is true if we want ftrace running */
2078 if (ftrace_start_up) 2078 if (ftrace_start_up)
2079 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 2079 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2080 } 2080 }
2081 2081
2082 static void ftrace_shutdown_sysctl(void) 2082 static void ftrace_shutdown_sysctl(void)
2083 { 2083 {
2084 if (unlikely(ftrace_disabled)) 2084 if (unlikely(ftrace_disabled))
2085 return; 2085 return;
2086 2086
2087 /* ftrace_start_up is true if ftrace is running */ 2087 /* ftrace_start_up is true if ftrace is running */
2088 if (ftrace_start_up) 2088 if (ftrace_start_up)
2089 ftrace_run_update_code(FTRACE_DISABLE_CALLS); 2089 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
2090 } 2090 }
2091 2091
2092 static cycle_t ftrace_update_time; 2092 static cycle_t ftrace_update_time;
2093 static unsigned long ftrace_update_cnt; 2093 static unsigned long ftrace_update_cnt;
2094 unsigned long ftrace_update_tot_cnt; 2094 unsigned long ftrace_update_tot_cnt;
2095 2095
2096 static int ops_traces_mod(struct ftrace_ops *ops) 2096 static int ops_traces_mod(struct ftrace_ops *ops)
2097 { 2097 {
2098 struct ftrace_hash *hash; 2098 struct ftrace_hash *hash;
2099 2099
2100 hash = ops->filter_hash; 2100 hash = ops->filter_hash;
2101 return ftrace_hash_empty(hash); 2101 return ftrace_hash_empty(hash);
2102 } 2102 }
2103 2103
2104 static int ftrace_update_code(struct module *mod) 2104 static int ftrace_update_code(struct module *mod)
2105 { 2105 {
2106 struct ftrace_page *pg; 2106 struct ftrace_page *pg;
2107 struct dyn_ftrace *p; 2107 struct dyn_ftrace *p;
2108 cycle_t start, stop; 2108 cycle_t start, stop;
2109 unsigned long ref = 0; 2109 unsigned long ref = 0;
2110 int i; 2110 int i;
2111 2111
2112 /* 2112 /*
2113 * When adding a module, we need to check if tracers are 2113 * When adding a module, we need to check if tracers are
2114 * currently enabled and if they are set to trace all functions. 2114 * currently enabled and if they are set to trace all functions.
2115 * If they are, we need to enable the module functions as well 2115 * If they are, we need to enable the module functions as well
2116 * as update the reference counts for those function records. 2116 * as update the reference counts for those function records.
2117 */ 2117 */
2118 if (mod) { 2118 if (mod) {
2119 struct ftrace_ops *ops; 2119 struct ftrace_ops *ops;
2120 2120
2121 for (ops = ftrace_ops_list; 2121 for (ops = ftrace_ops_list;
2122 ops != &ftrace_list_end; ops = ops->next) { 2122 ops != &ftrace_list_end; ops = ops->next) {
2123 if (ops->flags & FTRACE_OPS_FL_ENABLED && 2123 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
2124 ops_traces_mod(ops)) 2124 ops_traces_mod(ops))
2125 ref++; 2125 ref++;
2126 } 2126 }
2127 } 2127 }
2128 2128
2129 start = ftrace_now(raw_smp_processor_id()); 2129 start = ftrace_now(raw_smp_processor_id());
2130 ftrace_update_cnt = 0; 2130 ftrace_update_cnt = 0;
2131 2131
2132 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 2132 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
2133 2133
2134 for (i = 0; i < pg->index; i++) { 2134 for (i = 0; i < pg->index; i++) {
2135 /* If something went wrong, bail without enabling anything */ 2135 /* If something went wrong, bail without enabling anything */
2136 if (unlikely(ftrace_disabled)) 2136 if (unlikely(ftrace_disabled))
2137 return -1; 2137 return -1;
2138 2138
2139 p = &pg->records[i]; 2139 p = &pg->records[i];
2140 p->flags = ref; 2140 p->flags = ref;
2141 2141
2142 /* 2142 /*
2143 * Do the initial record conversion from mcount jump 2143 * Do the initial record conversion from mcount jump
2144 * to the NOP instructions. 2144 * to the NOP instructions.
2145 */ 2145 */
2146 if (!ftrace_code_disable(mod, p)) 2146 if (!ftrace_code_disable(mod, p))
2147 break; 2147 break;
2148 2148
2149 ftrace_update_cnt++; 2149 ftrace_update_cnt++;
2150 2150
2151 /* 2151 /*
2152 * If the tracing is enabled, go ahead and enable the record. 2152 * If the tracing is enabled, go ahead and enable the record.
2153 * 2153 *
2154 			 * The reason not to enable the record immediately is the 2154 			 * The reason not to enable the record immediately is the
2155 			 * inherent check of ftrace_make_nop/ftrace_make_call for 2155 			 * inherent check of ftrace_make_nop/ftrace_make_call for
2156 			 * correct previous instructions. Doing the NOP 2156 			 * correct previous instructions. Doing the NOP
2157 			 * conversion first puts the module into the correct state, thus 2157 			 * conversion first puts the module into the correct state, thus
2158 * passing the ftrace_make_call check. 2158 * passing the ftrace_make_call check.
2159 */ 2159 */
2160 if (ftrace_start_up && ref) { 2160 if (ftrace_start_up && ref) {
2161 int failed = __ftrace_replace_code(p, 1); 2161 int failed = __ftrace_replace_code(p, 1);
2162 if (failed) 2162 if (failed)
2163 ftrace_bug(failed, p->ip); 2163 ftrace_bug(failed, p->ip);
2164 } 2164 }
2165 } 2165 }
2166 } 2166 }
2167 2167
2168 ftrace_new_pgs = NULL; 2168 ftrace_new_pgs = NULL;
2169 2169
2170 stop = ftrace_now(raw_smp_processor_id()); 2170 stop = ftrace_now(raw_smp_processor_id());
2171 ftrace_update_time = stop - start; 2171 ftrace_update_time = stop - start;
2172 ftrace_update_tot_cnt += ftrace_update_cnt; 2172 ftrace_update_tot_cnt += ftrace_update_cnt;
2173 2173
2174 return 0; 2174 return 0;
2175 } 2175 }
2176 2176
2177 static int ftrace_allocate_records(struct ftrace_page *pg, int count) 2177 static int ftrace_allocate_records(struct ftrace_page *pg, int count)
2178 { 2178 {
2179 int order; 2179 int order;
2180 int cnt; 2180 int cnt;
2181 2181
2182 if (WARN_ON(!count)) 2182 if (WARN_ON(!count))
2183 return -EINVAL; 2183 return -EINVAL;
2184 2184
2185 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); 2185 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
2186 2186
2187 /* 2187 /*
2188 * We want to fill as much as possible. No more than a page 2188 * We want to fill as much as possible. No more than a page
2189 * may be empty. 2189 * may be empty.
2190 */ 2190 */
2191 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) 2191 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
2192 order--; 2192 order--;
2193 2193
2194 again: 2194 again:
2195 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); 2195 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
2196 2196
2197 if (!pg->records) { 2197 if (!pg->records) {
2198 /* if we can't allocate this size, try something smaller */ 2198 /* if we can't allocate this size, try something smaller */
2199 if (!order) 2199 if (!order)
2200 return -ENOMEM; 2200 return -ENOMEM;
2201 order >>= 1; 2201 order >>= 1;
2202 goto again; 2202 goto again;
2203 } 2203 }
2204 2204
2205 cnt = (PAGE_SIZE << order) / ENTRY_SIZE; 2205 cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
2206 pg->size = cnt; 2206 pg->size = cnt;
2207 2207
2208 if (cnt > count) 2208 if (cnt > count)
2209 cnt = count; 2209 cnt = count;
2210 2210
2211 return cnt; 2211 return cnt;
2212 } 2212 }
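
The order computation starts from the smallest power-of-two block of pages that can hold count records, then shrinks it while the block would leave at least a full page of unused slots; if the final block still holds fewer than count, the function simply returns how many fit and the caller chains another ftrace_page. A runnable user-space rendition (ENTRY_SIZE here is an assumed sizeof(struct dyn_ftrace) on 64-bit):

        #include <stdio.h>

        #define PAGE_SIZE       4096UL
        #define ENTRY_SIZE      16UL    /* assumed sizeof(struct dyn_ftrace) */
        #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)

        /* ceil(log2(n)) for n > 0, like the kernel's get_count_order() */
        static int count_order(unsigned long n)
        {
                int order = 0;

                while ((1UL << order) < n)
                        order++;
                return order;
        }

        int main(void)
        {
                unsigned long count = 1000;
                int order = count_order((count + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE);
                unsigned long slots;

                /* Shrink while the block would leave >= one full page of slack. */
                while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
                        order--;

                slots = (PAGE_SIZE << order) / ENTRY_SIZE;
                printf("count=%lu -> order=%d, %lu slots\n", count, order, slots);
                /* If slots < count, the caller loops and allocates another page group. */
                return 0;
        }

With these assumed sizes, count=1000 gives order=2: four pages holding 1024 records, wasting less than a page.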
2213 2213
2214 static struct ftrace_page * 2214 static struct ftrace_page *
2215 ftrace_allocate_pages(unsigned long num_to_init) 2215 ftrace_allocate_pages(unsigned long num_to_init)
2216 { 2216 {
2217 struct ftrace_page *start_pg; 2217 struct ftrace_page *start_pg;
2218 struct ftrace_page *pg; 2218 struct ftrace_page *pg;
2219 int order; 2219 int order;
2220 int cnt; 2220 int cnt;
2221 2221
2222 if (!num_to_init) 2222 if (!num_to_init)
2223 return 0; 2223 return 0;
2224 2224
2225 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); 2225 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
2226 if (!pg) 2226 if (!pg)
2227 return NULL; 2227 return NULL;
2228 2228
2229 /* 2229 /*
2230 	 * Try to allocate as much as possible in one contiguous 2230 	 * Try to allocate as much as possible in one contiguous
2231 * location that fills in all of the space. We want to 2231 * location that fills in all of the space. We want to
2232 * waste as little space as possible. 2232 * waste as little space as possible.
2233 */ 2233 */
2234 for (;;) { 2234 for (;;) {
2235 cnt = ftrace_allocate_records(pg, num_to_init); 2235 cnt = ftrace_allocate_records(pg, num_to_init);
2236 if (cnt < 0) 2236 if (cnt < 0)
2237 goto free_pages; 2237 goto free_pages;
2238 2238
2239 num_to_init -= cnt; 2239 num_to_init -= cnt;
2240 if (!num_to_init) 2240 if (!num_to_init)
2241 break; 2241 break;
2242 2242
2243 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); 2243 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
2244 if (!pg->next) 2244 if (!pg->next)
2245 goto free_pages; 2245 goto free_pages;
2246 2246
2247 pg = pg->next; 2247 pg = pg->next;
2248 } 2248 }
2249 2249
2250 return start_pg; 2250 return start_pg;
2251 2251
2252 free_pages: 2252 free_pages:
2253 while (start_pg) { 2253 while (start_pg) {
2254 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 2254 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2255 free_pages((unsigned long)pg->records, order); 2255 free_pages((unsigned long)pg->records, order);
2256 start_pg = pg->next; 2256 start_pg = pg->next;
2257 kfree(pg); 2257 kfree(pg);
2258 pg = start_pg; 2258 pg = start_pg;
2259 } 2259 }
2260 pr_info("ftrace: FAILED to allocate memory for functions\n"); 2260 pr_info("ftrace: FAILED to allocate memory for functions\n");
2261 return NULL; 2261 return NULL;
2262 } 2262 }
2263 2263
2264 static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) 2264 static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2265 { 2265 {
2266 int cnt; 2266 int cnt;
2267 2267
2268 if (!num_to_init) { 2268 if (!num_to_init) {
2269 pr_info("ftrace: No functions to be traced?\n"); 2269 pr_info("ftrace: No functions to be traced?\n");
2270 return -1; 2270 return -1;
2271 } 2271 }
2272 2272
2273 cnt = num_to_init / ENTRIES_PER_PAGE; 2273 cnt = num_to_init / ENTRIES_PER_PAGE;
2274 pr_info("ftrace: allocating %ld entries in %d pages\n", 2274 pr_info("ftrace: allocating %ld entries in %d pages\n",
2275 num_to_init, cnt + 1); 2275 num_to_init, cnt + 1);
2276 2276
2277 return 0; 2277 return 0;
2278 } 2278 }
2279 2279
2280 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2280 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
2281 2281
2282 struct ftrace_iterator { 2282 struct ftrace_iterator {
2283 loff_t pos; 2283 loff_t pos;
2284 loff_t func_pos; 2284 loff_t func_pos;
2285 struct ftrace_page *pg; 2285 struct ftrace_page *pg;
2286 struct dyn_ftrace *func; 2286 struct dyn_ftrace *func;
2287 struct ftrace_func_probe *probe; 2287 struct ftrace_func_probe *probe;
2288 struct trace_parser parser; 2288 struct trace_parser parser;
2289 struct ftrace_hash *hash; 2289 struct ftrace_hash *hash;
2290 struct ftrace_ops *ops; 2290 struct ftrace_ops *ops;
2291 int hidx; 2291 int hidx;
2292 int idx; 2292 int idx;
2293 unsigned flags; 2293 unsigned flags;
2294 }; 2294 };
2295 2295
2296 static void * 2296 static void *
2297 t_hash_next(struct seq_file *m, loff_t *pos) 2297 t_hash_next(struct seq_file *m, loff_t *pos)
2298 { 2298 {
2299 struct ftrace_iterator *iter = m->private; 2299 struct ftrace_iterator *iter = m->private;
2300 struct hlist_node *hnd = NULL; 2300 struct hlist_node *hnd = NULL;
2301 struct hlist_head *hhd; 2301 struct hlist_head *hhd;
2302 2302
2303 (*pos)++; 2303 (*pos)++;
2304 iter->pos = *pos; 2304 iter->pos = *pos;
2305 2305
2306 if (iter->probe) 2306 if (iter->probe)
2307 hnd = &iter->probe->node; 2307 hnd = &iter->probe->node;
2308 retry: 2308 retry:
2309 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 2309 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
2310 return NULL; 2310 return NULL;
2311 2311
2312 hhd = &ftrace_func_hash[iter->hidx]; 2312 hhd = &ftrace_func_hash[iter->hidx];
2313 2313
2314 if (hlist_empty(hhd)) { 2314 if (hlist_empty(hhd)) {
2315 iter->hidx++; 2315 iter->hidx++;
2316 hnd = NULL; 2316 hnd = NULL;
2317 goto retry; 2317 goto retry;
2318 } 2318 }
2319 2319
2320 if (!hnd) 2320 if (!hnd)
2321 hnd = hhd->first; 2321 hnd = hhd->first;
2322 else { 2322 else {
2323 hnd = hnd->next; 2323 hnd = hnd->next;
2324 if (!hnd) { 2324 if (!hnd) {
2325 iter->hidx++; 2325 iter->hidx++;
2326 goto retry; 2326 goto retry;
2327 } 2327 }
2328 } 2328 }
2329 2329
2330 if (WARN_ON_ONCE(!hnd)) 2330 if (WARN_ON_ONCE(!hnd))
2331 return NULL; 2331 return NULL;
2332 2332
2333 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); 2333 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
2334 2334
2335 return iter; 2335 return iter;
2336 } 2336 }
2337 2337
2338 static void *t_hash_start(struct seq_file *m, loff_t *pos) 2338 static void *t_hash_start(struct seq_file *m, loff_t *pos)
2339 { 2339 {
2340 struct ftrace_iterator *iter = m->private; 2340 struct ftrace_iterator *iter = m->private;
2341 void *p = NULL; 2341 void *p = NULL;
2342 loff_t l; 2342 loff_t l;
2343 2343
2344 if (!(iter->flags & FTRACE_ITER_DO_HASH)) 2344 if (!(iter->flags & FTRACE_ITER_DO_HASH))
2345 return NULL; 2345 return NULL;
2346 2346
2347 if (iter->func_pos > *pos) 2347 if (iter->func_pos > *pos)
2348 return NULL; 2348 return NULL;
2349 2349
2350 iter->hidx = 0; 2350 iter->hidx = 0;
2351 for (l = 0; l <= (*pos - iter->func_pos); ) { 2351 for (l = 0; l <= (*pos - iter->func_pos); ) {
2352 p = t_hash_next(m, &l); 2352 p = t_hash_next(m, &l);
2353 if (!p) 2353 if (!p)
2354 break; 2354 break;
2355 } 2355 }
2356 if (!p) 2356 if (!p)
2357 return NULL; 2357 return NULL;
2358 2358
2359 /* Only set this if we have an item */ 2359 /* Only set this if we have an item */
2360 iter->flags |= FTRACE_ITER_HASH; 2360 iter->flags |= FTRACE_ITER_HASH;
2361 2361
2362 return iter; 2362 return iter;
2363 } 2363 }
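
t_hash_start() follows the seq_file contract: the start() callback must return the element at *pos, and it does so by rewinding and replaying t_hash_next() until that position is reached. A stand-alone model of the replay pattern (plain long stands in for loff_t, and a flat array stands in for the hash buckets):

        #include <stdio.h>

        static int data[] = { 10, 20, 30, 40 };
        static int cursor;

        /* next(): advance the cursor, return NULL at the end */
        static int *demo_next(long *pos)
        {
                (*pos)++;
                cursor++;
                return cursor < 4 ? &data[cursor] : NULL;
        }

        /* start(): position at *pos by replaying next() from the beginning */
        static int *demo_start(long *pos)
        {
                long l;
                int *p = &data[0];

                cursor = 0;
                for (l = 0; l < *pos && p; )
                        p = demo_next(&l);
                return p;
        }

        int main(void)
        {
                long pos = 2;   /* resume mid-stream, as after an lseek on the file */
                int *p = demo_start(&pos);

                while (p) {
                        printf("%d\n", *p);     /* prints 30, then 40 */
                        p = demo_next(&pos);
                }
                return 0;
        }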
2364 2364
2365 static int 2365 static int
2366 t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) 2366 t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
2367 { 2367 {
2368 struct ftrace_func_probe *rec; 2368 struct ftrace_func_probe *rec;
2369 2369
2370 rec = iter->probe; 2370 rec = iter->probe;
2371 if (WARN_ON_ONCE(!rec)) 2371 if (WARN_ON_ONCE(!rec))
2372 return -EIO; 2372 return -EIO;
2373 2373
2374 if (rec->ops->print) 2374 if (rec->ops->print)
2375 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 2375 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
2376 2376
2377 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); 2377 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
2378 2378
2379 if (rec->data) 2379 if (rec->data)
2380 seq_printf(m, ":%p", rec->data); 2380 seq_printf(m, ":%p", rec->data);
2381 seq_putc(m, '\n'); 2381 seq_putc(m, '\n');
2382 2382
2383 return 0; 2383 return 0;
2384 } 2384 }
2385 2385
2386 static void * 2386 static void *
2387 t_next(struct seq_file *m, void *v, loff_t *pos) 2387 t_next(struct seq_file *m, void *v, loff_t *pos)
2388 { 2388 {
2389 struct ftrace_iterator *iter = m->private; 2389 struct ftrace_iterator *iter = m->private;
2390 struct ftrace_ops *ops = iter->ops; 2390 struct ftrace_ops *ops = iter->ops;
2391 struct dyn_ftrace *rec = NULL; 2391 struct dyn_ftrace *rec = NULL;
2392 2392
2393 if (unlikely(ftrace_disabled)) 2393 if (unlikely(ftrace_disabled))
2394 return NULL; 2394 return NULL;
2395 2395
2396 if (iter->flags & FTRACE_ITER_HASH) 2396 if (iter->flags & FTRACE_ITER_HASH)
2397 return t_hash_next(m, pos); 2397 return t_hash_next(m, pos);
2398 2398
2399 (*pos)++; 2399 (*pos)++;
2400 iter->pos = iter->func_pos = *pos; 2400 iter->pos = iter->func_pos = *pos;
2401 2401
2402 if (iter->flags & FTRACE_ITER_PRINTALL) 2402 if (iter->flags & FTRACE_ITER_PRINTALL)
2403 return t_hash_start(m, pos); 2403 return t_hash_start(m, pos);
2404 2404
2405 retry: 2405 retry:
2406 if (iter->idx >= iter->pg->index) { 2406 if (iter->idx >= iter->pg->index) {
2407 if (iter->pg->next) { 2407 if (iter->pg->next) {
2408 iter->pg = iter->pg->next; 2408 iter->pg = iter->pg->next;
2409 iter->idx = 0; 2409 iter->idx = 0;
2410 goto retry; 2410 goto retry;
2411 } 2411 }
2412 } else { 2412 } else {
2413 rec = &iter->pg->records[iter->idx++]; 2413 rec = &iter->pg->records[iter->idx++];
2414 if (((iter->flags & FTRACE_ITER_FILTER) && 2414 if (((iter->flags & FTRACE_ITER_FILTER) &&
2415 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2415 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
2416 2416
2417 ((iter->flags & FTRACE_ITER_NOTRACE) && 2417 ((iter->flags & FTRACE_ITER_NOTRACE) &&
2418 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || 2418 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
2419 2419
2420 ((iter->flags & FTRACE_ITER_ENABLED) && 2420 ((iter->flags & FTRACE_ITER_ENABLED) &&
2421 !(rec->flags & ~FTRACE_FL_MASK))) { 2421 !(rec->flags & ~FTRACE_FL_MASK))) {
2422 2422
2423 rec = NULL; 2423 rec = NULL;
2424 goto retry; 2424 goto retry;
2425 } 2425 }
2426 } 2426 }
2427 2427
2428 if (!rec) 2428 if (!rec)
2429 return t_hash_start(m, pos); 2429 return t_hash_start(m, pos);
2430 2430
2431 iter->func = rec; 2431 iter->func = rec;
2432 2432
2433 return iter; 2433 return iter;
2434 } 2434 }
2435 2435
2436 static void reset_iter_read(struct ftrace_iterator *iter) 2436 static void reset_iter_read(struct ftrace_iterator *iter)
2437 { 2437 {
2438 iter->pos = 0; 2438 iter->pos = 0;
2439 iter->func_pos = 0; 2439 iter->func_pos = 0;
2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); 2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
2441 } 2441 }
2442 2442
2443 static void *t_start(struct seq_file *m, loff_t *pos) 2443 static void *t_start(struct seq_file *m, loff_t *pos)
2444 { 2444 {
2445 struct ftrace_iterator *iter = m->private; 2445 struct ftrace_iterator *iter = m->private;
2446 struct ftrace_ops *ops = iter->ops; 2446 struct ftrace_ops *ops = iter->ops;
2447 void *p = NULL; 2447 void *p = NULL;
2448 loff_t l; 2448 loff_t l;
2449 2449
2450 mutex_lock(&ftrace_lock); 2450 mutex_lock(&ftrace_lock);
2451 2451
2452 if (unlikely(ftrace_disabled)) 2452 if (unlikely(ftrace_disabled))
2453 return NULL; 2453 return NULL;
2454 2454
2455 /* 2455 /*
2456 * If an lseek was done, then reset and start from beginning. 2456 * If an lseek was done, then reset and start from beginning.
2457 */ 2457 */
2458 if (*pos < iter->pos) 2458 if (*pos < iter->pos)
2459 reset_iter_read(iter); 2459 reset_iter_read(iter);
2460 2460
2461 /* 2461 /*
2462 * For set_ftrace_filter reading, if we have the filter 2462 * For set_ftrace_filter reading, if we have the filter
2463 	 * off, we can short-cut and just print out that all 2463 	 * off, we can short-cut and just print out that all
2464 * functions are enabled. 2464 * functions are enabled.
2465 */ 2465 */
2466 if (iter->flags & FTRACE_ITER_FILTER && 2466 if (iter->flags & FTRACE_ITER_FILTER &&
2467 ftrace_hash_empty(ops->filter_hash)) { 2467 ftrace_hash_empty(ops->filter_hash)) {
2468 if (*pos > 0) 2468 if (*pos > 0)
2469 return t_hash_start(m, pos); 2469 return t_hash_start(m, pos);
2470 iter->flags |= FTRACE_ITER_PRINTALL; 2470 iter->flags |= FTRACE_ITER_PRINTALL;
2471 /* reset in case of seek/pread */ 2471 /* reset in case of seek/pread */
2472 iter->flags &= ~FTRACE_ITER_HASH; 2472 iter->flags &= ~FTRACE_ITER_HASH;
2473 return iter; 2473 return iter;
2474 } 2474 }
2475 2475
2476 if (iter->flags & FTRACE_ITER_HASH) 2476 if (iter->flags & FTRACE_ITER_HASH)
2477 return t_hash_start(m, pos); 2477 return t_hash_start(m, pos);
2478 2478
2479 /* 2479 /*
2480 * Unfortunately, we need to restart at ftrace_pages_start 2480 * Unfortunately, we need to restart at ftrace_pages_start
2481 * every time we let go of the ftrace_mutex. This is because 2481 * every time we let go of the ftrace_mutex. This is because
2482 * those pointers can change without the lock. 2482 * those pointers can change without the lock.
2483 */ 2483 */
2484 iter->pg = ftrace_pages_start; 2484 iter->pg = ftrace_pages_start;
2485 iter->idx = 0; 2485 iter->idx = 0;
2486 for (l = 0; l <= *pos; ) { 2486 for (l = 0; l <= *pos; ) {
2487 p = t_next(m, p, &l); 2487 p = t_next(m, p, &l);
2488 if (!p) 2488 if (!p)
2489 break; 2489 break;
2490 } 2490 }
2491 2491
2492 if (!p) 2492 if (!p)
2493 return t_hash_start(m, pos); 2493 return t_hash_start(m, pos);
2494 2494
2495 return iter; 2495 return iter;
2496 } 2496 }
2497 2497
2498 static void t_stop(struct seq_file *m, void *p) 2498 static void t_stop(struct seq_file *m, void *p)
2499 { 2499 {
2500 mutex_unlock(&ftrace_lock); 2500 mutex_unlock(&ftrace_lock);
2501 } 2501 }
2502 2502
2503 static int t_show(struct seq_file *m, void *v) 2503 static int t_show(struct seq_file *m, void *v)
2504 { 2504 {
2505 struct ftrace_iterator *iter = m->private; 2505 struct ftrace_iterator *iter = m->private;
2506 struct dyn_ftrace *rec; 2506 struct dyn_ftrace *rec;
2507 2507
2508 if (iter->flags & FTRACE_ITER_HASH) 2508 if (iter->flags & FTRACE_ITER_HASH)
2509 return t_hash_show(m, iter); 2509 return t_hash_show(m, iter);
2510 2510
2511 if (iter->flags & FTRACE_ITER_PRINTALL) { 2511 if (iter->flags & FTRACE_ITER_PRINTALL) {
2512 seq_printf(m, "#### all functions enabled ####\n"); 2512 seq_printf(m, "#### all functions enabled ####\n");
2513 return 0; 2513 return 0;
2514 } 2514 }
2515 2515
2516 rec = iter->func; 2516 rec = iter->func;
2517 2517
2518 if (!rec) 2518 if (!rec)
2519 return 0; 2519 return 0;
2520 2520
2521 seq_printf(m, "%ps", (void *)rec->ip); 2521 seq_printf(m, "%ps", (void *)rec->ip);
2522 if (iter->flags & FTRACE_ITER_ENABLED) 2522 if (iter->flags & FTRACE_ITER_ENABLED)
2523 seq_printf(m, " (%ld)%s", 2523 seq_printf(m, " (%ld)%s",
2524 rec->flags & ~FTRACE_FL_MASK, 2524 rec->flags & ~FTRACE_FL_MASK,
2525 rec->flags & FTRACE_FL_REGS ? " R" : ""); 2525 rec->flags & FTRACE_FL_REGS ? " R" : "");
2526 seq_printf(m, "\n"); 2526 seq_printf(m, "\n");
2527 2527
2528 return 0; 2528 return 0;
2529 } 2529 }
2530 2530
2531 static const struct seq_operations show_ftrace_seq_ops = { 2531 static const struct seq_operations show_ftrace_seq_ops = {
2532 .start = t_start, 2532 .start = t_start,
2533 .next = t_next, 2533 .next = t_next,
2534 .stop = t_stop, 2534 .stop = t_stop,
2535 .show = t_show, 2535 .show = t_show,
2536 }; 2536 };
2537 2537
2538 static int 2538 static int
2539 ftrace_avail_open(struct inode *inode, struct file *file) 2539 ftrace_avail_open(struct inode *inode, struct file *file)
2540 { 2540 {
2541 struct ftrace_iterator *iter; 2541 struct ftrace_iterator *iter;
2542 2542
2543 if (unlikely(ftrace_disabled)) 2543 if (unlikely(ftrace_disabled))
2544 return -ENODEV; 2544 return -ENODEV;
2545 2545
2546 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 2546 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2547 if (iter) { 2547 if (iter) {
2548 iter->pg = ftrace_pages_start; 2548 iter->pg = ftrace_pages_start;
2549 iter->ops = &global_ops; 2549 iter->ops = &global_ops;
2550 } 2550 }
2551 2551
2552 return iter ? 0 : -ENOMEM; 2552 return iter ? 0 : -ENOMEM;
2553 } 2553 }
2554 2554
2555 static int 2555 static int
2556 ftrace_enabled_open(struct inode *inode, struct file *file) 2556 ftrace_enabled_open(struct inode *inode, struct file *file)
2557 { 2557 {
2558 struct ftrace_iterator *iter; 2558 struct ftrace_iterator *iter;
2559 2559
2560 if (unlikely(ftrace_disabled)) 2560 if (unlikely(ftrace_disabled))
2561 return -ENODEV; 2561 return -ENODEV;
2562 2562
2563 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 2563 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2564 if (iter) { 2564 if (iter) {
2565 iter->pg = ftrace_pages_start; 2565 iter->pg = ftrace_pages_start;
2566 iter->flags = FTRACE_ITER_ENABLED; 2566 iter->flags = FTRACE_ITER_ENABLED;
2567 iter->ops = &global_ops; 2567 iter->ops = &global_ops;
2568 } 2568 }
2569 2569
2570 return iter ? 0 : -ENOMEM; 2570 return iter ? 0 : -ENOMEM;
2571 } 2571 }
2572 2572
2573 static void ftrace_filter_reset(struct ftrace_hash *hash) 2573 static void ftrace_filter_reset(struct ftrace_hash *hash)
2574 { 2574 {
2575 mutex_lock(&ftrace_lock); 2575 mutex_lock(&ftrace_lock);
2576 ftrace_hash_clear(hash); 2576 ftrace_hash_clear(hash);
2577 mutex_unlock(&ftrace_lock); 2577 mutex_unlock(&ftrace_lock);
2578 } 2578 }
2579 2579
2580 /** 2580 /**
2581 * ftrace_regex_open - initialize function tracer filter files 2581 * ftrace_regex_open - initialize function tracer filter files
2582 * @ops: The ftrace_ops that hold the hash filters 2582 * @ops: The ftrace_ops that hold the hash filters
2583 * @flag: The type of filter to process 2583 * @flag: The type of filter to process
2584 * @inode: The inode, usually passed in to your open routine 2584 * @inode: The inode, usually passed in to your open routine
2585 * @file: The file, usually passed in to your open routine 2585 * @file: The file, usually passed in to your open routine
2586 * 2586 *
2587 * ftrace_regex_open() initializes the filter files for the 2587 * ftrace_regex_open() initializes the filter files for the
2588 * @ops. Depending on @flag it may process the filter hash or 2588 * @ops. Depending on @flag it may process the filter hash or
2589 * the notrace hash of @ops. With this called from the open 2589 * the notrace hash of @ops. With this called from the open
2590 * routine, you can use ftrace_filter_write() for the write 2590 * routine, you can use ftrace_filter_write() for the write
2591 * routine if @flag has FTRACE_ITER_FILTER set, or 2591 * routine if @flag has FTRACE_ITER_FILTER set, or
2592 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2592 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2593 * ftrace_regex_lseek() should be used as the lseek routine, and 2593 * ftrace_regex_lseek() should be used as the lseek routine, and
2594 * release must call ftrace_regex_release(). 2594 * release must call ftrace_regex_release().
2595 */ 2595 */
2596 int 2596 int
2597 ftrace_regex_open(struct ftrace_ops *ops, int flag, 2597 ftrace_regex_open(struct ftrace_ops *ops, int flag,
2598 struct inode *inode, struct file *file) 2598 struct inode *inode, struct file *file)
2599 { 2599 {
2600 struct ftrace_iterator *iter; 2600 struct ftrace_iterator *iter;
2601 struct ftrace_hash *hash; 2601 struct ftrace_hash *hash;
2602 int ret = 0; 2602 int ret = 0;
2603 2603
2604 if (unlikely(ftrace_disabled)) 2604 if (unlikely(ftrace_disabled))
2605 return -ENODEV; 2605 return -ENODEV;
2606 2606
2607 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2607 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2608 if (!iter) 2608 if (!iter)
2609 return -ENOMEM; 2609 return -ENOMEM;
2610 2610
2611 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { 2611 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
2612 kfree(iter); 2612 kfree(iter);
2613 return -ENOMEM; 2613 return -ENOMEM;
2614 } 2614 }
2615 2615
2616 if (flag & FTRACE_ITER_NOTRACE) 2616 if (flag & FTRACE_ITER_NOTRACE)
2617 hash = ops->notrace_hash; 2617 hash = ops->notrace_hash;
2618 else 2618 else
2619 hash = ops->filter_hash; 2619 hash = ops->filter_hash;
2620 2620
2621 iter->ops = ops; 2621 iter->ops = ops;
2622 iter->flags = flag; 2622 iter->flags = flag;
2623 2623
2624 if (file->f_mode & FMODE_WRITE) { 2624 if (file->f_mode & FMODE_WRITE) {
2625 mutex_lock(&ftrace_lock); 2625 mutex_lock(&ftrace_lock);
2626 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); 2626 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2627 mutex_unlock(&ftrace_lock); 2627 mutex_unlock(&ftrace_lock);
2628 2628
2629 if (!iter->hash) { 2629 if (!iter->hash) {
2630 trace_parser_put(&iter->parser); 2630 trace_parser_put(&iter->parser);
2631 kfree(iter); 2631 kfree(iter);
2632 return -ENOMEM; 2632 return -ENOMEM;
2633 } 2633 }
2634 } 2634 }
2635 2635
2636 mutex_lock(&ftrace_regex_lock); 2636 mutex_lock(&ftrace_regex_lock);
2637 2637
2638 if ((file->f_mode & FMODE_WRITE) && 2638 if ((file->f_mode & FMODE_WRITE) &&
2639 (file->f_flags & O_TRUNC)) 2639 (file->f_flags & O_TRUNC))
2640 ftrace_filter_reset(iter->hash); 2640 ftrace_filter_reset(iter->hash);
2641 2641
2642 if (file->f_mode & FMODE_READ) { 2642 if (file->f_mode & FMODE_READ) {
2643 iter->pg = ftrace_pages_start; 2643 iter->pg = ftrace_pages_start;
2644 2644
2645 ret = seq_open(file, &show_ftrace_seq_ops); 2645 ret = seq_open(file, &show_ftrace_seq_ops);
2646 if (!ret) { 2646 if (!ret) {
2647 struct seq_file *m = file->private_data; 2647 struct seq_file *m = file->private_data;
2648 m->private = iter; 2648 m->private = iter;
2649 } else { 2649 } else {
2650 /* Failed */ 2650 /* Failed */
2651 free_ftrace_hash(iter->hash); 2651 free_ftrace_hash(iter->hash);
2652 trace_parser_put(&iter->parser); 2652 trace_parser_put(&iter->parser);
2653 kfree(iter); 2653 kfree(iter);
2654 } 2654 }
2655 } else 2655 } else
2656 file->private_data = iter; 2656 file->private_data = iter;
2657 mutex_unlock(&ftrace_regex_lock); 2657 mutex_unlock(&ftrace_regex_lock);
2658 2658
2659 return ret; 2659 return ret;
2660 } 2660 }
2661 2661
2662 static int 2662 static int
2663 ftrace_filter_open(struct inode *inode, struct file *file) 2663 ftrace_filter_open(struct inode *inode, struct file *file)
2664 { 2664 {
2665 return ftrace_regex_open(&global_ops, 2665 return ftrace_regex_open(&global_ops,
2666 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 2666 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2667 inode, file); 2667 inode, file);
2668 } 2668 }
2669 2669
2670 static int 2670 static int
2671 ftrace_notrace_open(struct inode *inode, struct file *file) 2671 ftrace_notrace_open(struct inode *inode, struct file *file)
2672 { 2672 {
2673 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, 2673 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2674 inode, file); 2674 inode, file);
2675 } 2675 }
2676 2676
2677 loff_t 2677 loff_t
2678 ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2678 ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2679 { 2679 {
2680 loff_t ret; 2680 loff_t ret;
2681 2681
2682 if (file->f_mode & FMODE_READ) 2682 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, origin); 2683 ret = seq_lseek(file, offset, whence);
2684 else 2684 else
2685 file->f_pos = ret = 1; 2685 file->f_pos = ret = 1;
2686 2686
2687 return ret; 2687 return ret;
2689 2689
2690 static int ftrace_match(char *str, char *regex, int len, int type) 2690 static int ftrace_match(char *str, char *regex, int len, int type)
2691 { 2691 {
2692 int matched = 0; 2692 int matched = 0;
2693 int slen; 2693 int slen;
2694 2694
2695 switch (type) { 2695 switch (type) {
2696 case MATCH_FULL: 2696 case MATCH_FULL:
2697 if (strcmp(str, regex) == 0) 2697 if (strcmp(str, regex) == 0)
2698 matched = 1; 2698 matched = 1;
2699 break; 2699 break;
2700 case MATCH_FRONT_ONLY: 2700 case MATCH_FRONT_ONLY:
2701 if (strncmp(str, regex, len) == 0) 2701 if (strncmp(str, regex, len) == 0)
2702 matched = 1; 2702 matched = 1;
2703 break; 2703 break;
2704 case MATCH_MIDDLE_ONLY: 2704 case MATCH_MIDDLE_ONLY:
2705 if (strstr(str, regex)) 2705 if (strstr(str, regex))
2706 matched = 1; 2706 matched = 1;
2707 break; 2707 break;
2708 case MATCH_END_ONLY: 2708 case MATCH_END_ONLY:
2709 slen = strlen(str); 2709 slen = strlen(str);
2710 if (slen >= len && memcmp(str + slen - len, regex, len) == 0) 2710 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
2711 matched = 1; 2711 matched = 1;
2712 break; 2712 break;
2713 } 2713 }
2714 2714
2715 return matched; 2715 return matched;
2716 } 2716 }
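The type handled here comes from filter_parse_regex(), which strips the '*' wildcards from the user's glob before the compare. A sketch of the mapping, with illustrative patterns:

/*
 *	"sched_switch"	-> MATCH_FULL		strcmp() on the whole name
 *	"sched_*"	-> MATCH_FRONT_ONLY	strncmp() on the first len bytes
 *	"*_switch"	-> MATCH_END_ONLY	memcmp() on the trailing len bytes
 *	"*sched*"	-> MATCH_MIDDLE_ONLY	strstr() anywhere in the name
 */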
2717 2717
2718 static int 2718 static int
2719 enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) 2719 enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2720 { 2720 {
2721 struct ftrace_func_entry *entry; 2721 struct ftrace_func_entry *entry;
2722 int ret = 0; 2722 int ret = 0;
2723 2723
2724 entry = ftrace_lookup_ip(hash, rec->ip); 2724 entry = ftrace_lookup_ip(hash, rec->ip);
2725 if (not) { 2725 if (not) {
2726 /* Do nothing if it doesn't exist */ 2726 /* Do nothing if it doesn't exist */
2727 if (!entry) 2727 if (!entry)
2728 return 0; 2728 return 0;
2729 2729
2730 free_hash_entry(hash, entry); 2730 free_hash_entry(hash, entry);
2731 } else { 2731 } else {
2732 /* Do nothing if it exists */ 2732 /* Do nothing if it exists */
2733 if (entry) 2733 if (entry)
2734 return 0; 2734 return 0;
2735 2735
2736 ret = add_hash_entry(hash, rec->ip); 2736 ret = add_hash_entry(hash, rec->ip);
2737 } 2737 }
2738 return ret; 2738 return ret;
2739 } 2739 }
2740 2740
2741 static int 2741 static int
2742 ftrace_match_record(struct dyn_ftrace *rec, char *mod, 2742 ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2743 char *regex, int len, int type) 2743 char *regex, int len, int type)
2744 { 2744 {
2745 char str[KSYM_SYMBOL_LEN]; 2745 char str[KSYM_SYMBOL_LEN];
2746 char *modname; 2746 char *modname;
2747 2747
2748 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); 2748 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2749 2749
2750 if (mod) { 2750 if (mod) {
2751 /* module lookup requires matching the module */ 2751 /* module lookup requires matching the module */
2752 if (!modname || strcmp(modname, mod)) 2752 if (!modname || strcmp(modname, mod))
2753 return 0; 2753 return 0;
2754 2754
2755 /* blank search means to match all funcs in the mod */ 2755 /* blank search means to match all funcs in the mod */
2756 if (!len) 2756 if (!len)
2757 return 1; 2757 return 1;
2758 } 2758 }
2759 2759
2760 return ftrace_match(str, regex, len, type); 2760 return ftrace_match(str, regex, len, type);
2761 } 2761 }
2762 2762
2763 static int 2763 static int
2764 match_records(struct ftrace_hash *hash, char *buff, 2764 match_records(struct ftrace_hash *hash, char *buff,
2765 int len, char *mod, int not) 2765 int len, char *mod, int not)
2766 { 2766 {
2767 unsigned search_len = 0; 2767 unsigned search_len = 0;
2768 struct ftrace_page *pg; 2768 struct ftrace_page *pg;
2769 struct dyn_ftrace *rec; 2769 struct dyn_ftrace *rec;
2770 int type = MATCH_FULL; 2770 int type = MATCH_FULL;
2771 char *search = buff; 2771 char *search = buff;
2772 int found = 0; 2772 int found = 0;
2773 int ret; 2773 int ret;
2774 2774
2775 if (len) { 2775 if (len) {
2776 type = filter_parse_regex(buff, len, &search, &not); 2776 type = filter_parse_regex(buff, len, &search, &not);
2777 search_len = strlen(search); 2777 search_len = strlen(search);
2778 } 2778 }
2779 2779
2780 mutex_lock(&ftrace_lock); 2780 mutex_lock(&ftrace_lock);
2781 2781
2782 if (unlikely(ftrace_disabled)) 2782 if (unlikely(ftrace_disabled))
2783 goto out_unlock; 2783 goto out_unlock;
2784 2784
2785 do_for_each_ftrace_rec(pg, rec) { 2785 do_for_each_ftrace_rec(pg, rec) {
2786 if (ftrace_match_record(rec, mod, search, search_len, type)) { 2786 if (ftrace_match_record(rec, mod, search, search_len, type)) {
2787 ret = enter_record(hash, rec, not); 2787 ret = enter_record(hash, rec, not);
2788 if (ret < 0) { 2788 if (ret < 0) {
2789 found = ret; 2789 found = ret;
2790 goto out_unlock; 2790 goto out_unlock;
2791 } 2791 }
2792 found = 1; 2792 found = 1;
2793 } 2793 }
2794 } while_for_each_ftrace_rec(); 2794 } while_for_each_ftrace_rec();
2795 out_unlock: 2795 out_unlock:
2796 mutex_unlock(&ftrace_lock); 2796 mutex_unlock(&ftrace_lock);
2797 2797
2798 return found; 2798 return found;
2799 } 2799 }
2800 2800
2801 static int 2801 static int
2802 ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) 2802 ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
2803 { 2803 {
2804 return match_records(hash, buff, len, NULL, 0); 2804 return match_records(hash, buff, len, NULL, 0);
2805 } 2805 }
2806 2806
2807 static int 2807 static int
2808 ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) 2808 ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
2809 { 2809 {
2810 int not = 0; 2810 int not = 0;
2811 2811
2812 /* blank or '*' mean the same */ 2812 /* blank or '*' mean the same */
2813 if (strcmp(buff, "*") == 0) 2813 if (strcmp(buff, "*") == 0)
2814 buff[0] = 0; 2814 buff[0] = 0;
2815 2815
2816 	/* handle the case of 'don't filter this module' */ 2816 	/* handle the case of 'don't filter this module' */
2817 if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { 2817 if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
2818 buff[0] = 0; 2818 buff[0] = 0;
2819 not = 1; 2819 not = 1;
2820 } 2820 }
2821 2821
2822 return match_records(hash, buff, strlen(buff), mod, not); 2822 return match_records(hash, buff, strlen(buff), mod, not);
2823 } 2823 }
2824 2824
2825 /* 2825 /*
2826 * We register the module command as a template to show others how 2826 * We register the module command as a template to show others how
2827 * to register a command as well. 2827 * to register a command as well.
2828 */ 2828 */
2829 2829
2830 static int 2830 static int
2831 ftrace_mod_callback(struct ftrace_hash *hash, 2831 ftrace_mod_callback(struct ftrace_hash *hash,
2832 char *func, char *cmd, char *param, int enable) 2832 char *func, char *cmd, char *param, int enable)
2833 { 2833 {
2834 char *mod; 2834 char *mod;
2835 int ret = -EINVAL; 2835 int ret = -EINVAL;
2836 2836
2837 /* 2837 /*
2838 * cmd == 'mod' because we only registered this func 2838 * cmd == 'mod' because we only registered this func
2839 * for the 'mod' ftrace_func_command. 2839 * for the 'mod' ftrace_func_command.
2840 * But if you register one func with multiple commands, 2840 * But if you register one func with multiple commands,
2841 * you can tell which command was used by the cmd 2841 * you can tell which command was used by the cmd
2842 * parameter. 2842 * parameter.
2843 */ 2843 */
2844 2844
2845 /* we must have a module name */ 2845 /* we must have a module name */
2846 if (!param) 2846 if (!param)
2847 return ret; 2847 return ret;
2848 2848
2849 mod = strsep(&param, ":"); 2849 mod = strsep(&param, ":");
2850 if (!strlen(mod)) 2850 if (!strlen(mod))
2851 return ret; 2851 return ret;
2852 2852
2853 ret = ftrace_match_module_records(hash, func, mod); 2853 ret = ftrace_match_module_records(hash, func, mod);
2854 if (!ret) 2854 if (!ret)
2855 ret = -EINVAL; 2855 ret = -EINVAL;
2856 if (ret < 0) 2856 if (ret < 0)
2857 return ret; 2857 return ret;
2858 2858
2859 return 0; 2859 return 0;
2860 } 2860 }
2861 2861
2862 static struct ftrace_func_command ftrace_mod_cmd = { 2862 static struct ftrace_func_command ftrace_mod_cmd = {
2863 .name = "mod", 2863 .name = "mod",
2864 .func = ftrace_mod_callback, 2864 .func = ftrace_mod_callback,
2865 }; 2865 };
2866 2866
2867 static int __init ftrace_mod_cmd_init(void) 2867 static int __init ftrace_mod_cmd_init(void)
2868 { 2868 {
2869 return register_ftrace_command(&ftrace_mod_cmd); 2869 return register_ftrace_command(&ftrace_mod_cmd);
2870 } 2870 }
2871 core_initcall(ftrace_mod_cmd_init); 2871 core_initcall(ftrace_mod_cmd_init);
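Following that template, a new command needs only a callback with the same signature plus a registration call. A hypothetical sketch, not part of this commit:

static int noop_cmd_callback(struct ftrace_hash *hash,
			     char *func, char *cmd, char *param, int enable)
{
	/* invoked for writes of the form "<func>:noop[:param]" */
	return 0;
}

static struct ftrace_func_command noop_cmd = {
	.name	= "noop",
	.func	= noop_cmd_callback,
};

static int __init noop_cmd_init(void)
{
	return register_ftrace_command(&noop_cmd);
}
core_initcall(noop_cmd_init);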
2872 2872
2873 static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 2873 static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2874 struct ftrace_ops *op, struct pt_regs *pt_regs) 2874 struct ftrace_ops *op, struct pt_regs *pt_regs)
2875 { 2875 {
2876 struct ftrace_func_probe *entry; 2876 struct ftrace_func_probe *entry;
2877 struct hlist_head *hhd; 2877 struct hlist_head *hhd;
2878 struct hlist_node *n; 2878 struct hlist_node *n;
2879 unsigned long key; 2879 unsigned long key;
2880 2880
2881 key = hash_long(ip, FTRACE_HASH_BITS); 2881 key = hash_long(ip, FTRACE_HASH_BITS);
2882 2882
2883 hhd = &ftrace_func_hash[key]; 2883 hhd = &ftrace_func_hash[key];
2884 2884
2885 if (hlist_empty(hhd)) 2885 if (hlist_empty(hhd))
2886 return; 2886 return;
2887 2887
2888 /* 2888 /*
2889 	 * Disable preemption for these calls to prevent an RCU grace 2889 	 * Disable preemption for these calls to prevent an RCU grace
2890 * period. This syncs the hash iteration and freeing of items 2890 * period. This syncs the hash iteration and freeing of items
2891 * on the hash. rcu_read_lock is too dangerous here. 2891 * on the hash. rcu_read_lock is too dangerous here.
2892 */ 2892 */
2893 preempt_disable_notrace(); 2893 preempt_disable_notrace();
2894 hlist_for_each_entry_rcu(entry, n, hhd, node) { 2894 hlist_for_each_entry_rcu(entry, n, hhd, node) {
2895 if (entry->ip == ip) 2895 if (entry->ip == ip)
2896 entry->ops->func(ip, parent_ip, &entry->data); 2896 entry->ops->func(ip, parent_ip, &entry->data);
2897 } 2897 }
2898 preempt_enable_notrace(); 2898 preempt_enable_notrace();
2899 } 2899 }
2900 2900
2901 static struct ftrace_ops trace_probe_ops __read_mostly = 2901 static struct ftrace_ops trace_probe_ops __read_mostly =
2902 { 2902 {
2903 .func = function_trace_probe_call, 2903 .func = function_trace_probe_call,
2904 }; 2904 };
2905 2905
2906 static int ftrace_probe_registered; 2906 static int ftrace_probe_registered;
2907 2907
2908 static void __enable_ftrace_function_probe(void) 2908 static void __enable_ftrace_function_probe(void)
2909 { 2909 {
2910 int ret; 2910 int ret;
2911 int i; 2911 int i;
2912 2912
2913 if (ftrace_probe_registered) 2913 if (ftrace_probe_registered)
2914 return; 2914 return;
2915 2915
2916 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 2916 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
2917 struct hlist_head *hhd = &ftrace_func_hash[i]; 2917 struct hlist_head *hhd = &ftrace_func_hash[i];
2918 if (hhd->first) 2918 if (hhd->first)
2919 break; 2919 break;
2920 } 2920 }
2921 /* Nothing registered? */ 2921 /* Nothing registered? */
2922 if (i == FTRACE_FUNC_HASHSIZE) 2922 if (i == FTRACE_FUNC_HASHSIZE)
2923 return; 2923 return;
2924 2924
2925 ret = __register_ftrace_function(&trace_probe_ops); 2925 ret = __register_ftrace_function(&trace_probe_ops);
2926 if (!ret) 2926 if (!ret)
2927 ret = ftrace_startup(&trace_probe_ops, 0); 2927 ret = ftrace_startup(&trace_probe_ops, 0);
2928 2928
2929 ftrace_probe_registered = 1; 2929 ftrace_probe_registered = 1;
2930 } 2930 }
2931 2931
2932 static void __disable_ftrace_function_probe(void) 2932 static void __disable_ftrace_function_probe(void)
2933 { 2933 {
2934 int ret; 2934 int ret;
2935 int i; 2935 int i;
2936 2936
2937 if (!ftrace_probe_registered) 2937 if (!ftrace_probe_registered)
2938 return; 2938 return;
2939 2939
2940 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 2940 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
2941 struct hlist_head *hhd = &ftrace_func_hash[i]; 2941 struct hlist_head *hhd = &ftrace_func_hash[i];
2942 if (hhd->first) 2942 if (hhd->first)
2943 return; 2943 return;
2944 } 2944 }
2945 2945
2946 /* no more funcs left */ 2946 /* no more funcs left */
2947 ret = __unregister_ftrace_function(&trace_probe_ops); 2947 ret = __unregister_ftrace_function(&trace_probe_ops);
2948 if (!ret) 2948 if (!ret)
2949 ftrace_shutdown(&trace_probe_ops, 0); 2949 ftrace_shutdown(&trace_probe_ops, 0);
2950 2950
2951 ftrace_probe_registered = 0; 2951 ftrace_probe_registered = 0;
2952 } 2952 }
2953 2953
2954 2954
2955 static void ftrace_free_entry_rcu(struct rcu_head *rhp) 2955 static void ftrace_free_entry_rcu(struct rcu_head *rhp)
2956 { 2956 {
2957 struct ftrace_func_probe *entry = 2957 struct ftrace_func_probe *entry =
2958 container_of(rhp, struct ftrace_func_probe, rcu); 2958 container_of(rhp, struct ftrace_func_probe, rcu);
2959 2959
2960 if (entry->ops->free) 2960 if (entry->ops->free)
2961 entry->ops->free(&entry->data); 2961 entry->ops->free(&entry->data);
2962 kfree(entry); 2962 kfree(entry);
2963 } 2963 }
2964 2964
2965 2965
2966 int 2966 int
2967 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 2967 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2968 void *data) 2968 void *data)
2969 { 2969 {
2970 struct ftrace_func_probe *entry; 2970 struct ftrace_func_probe *entry;
2971 struct ftrace_page *pg; 2971 struct ftrace_page *pg;
2972 struct dyn_ftrace *rec; 2972 struct dyn_ftrace *rec;
2973 int type, len, not; 2973 int type, len, not;
2974 unsigned long key; 2974 unsigned long key;
2975 int count = 0; 2975 int count = 0;
2976 char *search; 2976 char *search;
2977 2977
2978 type = filter_parse_regex(glob, strlen(glob), &search, &not); 2978 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2979 len = strlen(search); 2979 len = strlen(search);
2980 2980
2981 /* we do not support '!' for function probes */ 2981 /* we do not support '!' for function probes */
2982 if (WARN_ON(not)) 2982 if (WARN_ON(not))
2983 return -EINVAL; 2983 return -EINVAL;
2984 2984
2985 mutex_lock(&ftrace_lock); 2985 mutex_lock(&ftrace_lock);
2986 2986
2987 if (unlikely(ftrace_disabled)) 2987 if (unlikely(ftrace_disabled))
2988 goto out_unlock; 2988 goto out_unlock;
2989 2989
2990 do_for_each_ftrace_rec(pg, rec) { 2990 do_for_each_ftrace_rec(pg, rec) {
2991 2991
2992 if (!ftrace_match_record(rec, NULL, search, len, type)) 2992 if (!ftrace_match_record(rec, NULL, search, len, type))
2993 continue; 2993 continue;
2994 2994
2995 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2995 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
2996 if (!entry) { 2996 if (!entry) {
2997 /* If we did not process any, then return error */ 2997 /* If we did not process any, then return error */
2998 if (!count) 2998 if (!count)
2999 count = -ENOMEM; 2999 count = -ENOMEM;
3000 goto out_unlock; 3000 goto out_unlock;
3001 } 3001 }
3002 3002
3003 count++; 3003 count++;
3004 3004
3005 entry->data = data; 3005 entry->data = data;
3006 3006
3007 /* 3007 /*
3008 * The caller might want to do something special 3008 * The caller might want to do something special
3009 * for each function we find. We call the callback 3009 * for each function we find. We call the callback
3010 * to give the caller an opportunity to do so. 3010 * to give the caller an opportunity to do so.
3011 */ 3011 */
3012 if (ops->callback) { 3012 if (ops->callback) {
3013 if (ops->callback(rec->ip, &entry->data) < 0) { 3013 if (ops->callback(rec->ip, &entry->data) < 0) {
3014 /* caller does not like this func */ 3014 /* caller does not like this func */
3015 kfree(entry); 3015 kfree(entry);
3016 continue; 3016 continue;
3017 } 3017 }
3018 } 3018 }
3019 3019
3020 entry->ops = ops; 3020 entry->ops = ops;
3021 entry->ip = rec->ip; 3021 entry->ip = rec->ip;
3022 3022
3023 key = hash_long(entry->ip, FTRACE_HASH_BITS); 3023 key = hash_long(entry->ip, FTRACE_HASH_BITS);
3024 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); 3024 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3025 3025
3026 } while_for_each_ftrace_rec(); 3026 } while_for_each_ftrace_rec();
3027 __enable_ftrace_function_probe(); 3027 __enable_ftrace_function_probe();
3028 3028
3029 out_unlock: 3029 out_unlock:
3030 mutex_unlock(&ftrace_lock); 3030 mutex_unlock(&ftrace_lock);
3031 3031
3032 return count; 3032 return count;
3033 } 3033 }
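For reference, a hedged sketch of what a caller might register; the hook signatures follow the calls made in this file (ops->func from function_trace_probe_call(), ops->callback in this function, ops->free and ops->print in the helpers above), and the names are hypothetical:

static void my_probe_func(unsigned long ip, unsigned long parent_ip,
			  void **data)
{
	/* runs each time a matched function is hit */
}

static struct ftrace_probe_ops my_probe_ops = {
	.func	= my_probe_func,
	/* .callback, .free and .print are optional hooks */
};

static int __init my_probe_init(void)
{
	/* attach the probe to every function matching the glob */
	int count = register_ftrace_function_probe("vfs_*", &my_probe_ops, NULL);

	return count < 0 ? count : 0;
}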
3034 3034
3035 enum { 3035 enum {
3036 PROBE_TEST_FUNC = 1, 3036 PROBE_TEST_FUNC = 1,
3037 PROBE_TEST_DATA = 2 3037 PROBE_TEST_DATA = 2
3038 }; 3038 };
3039 3039
3040 static void 3040 static void
3041 __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3041 __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3042 void *data, int flags) 3042 void *data, int flags)
3043 { 3043 {
3044 struct ftrace_func_probe *entry; 3044 struct ftrace_func_probe *entry;
3045 struct hlist_node *n, *tmp; 3045 struct hlist_node *n, *tmp;
3046 char str[KSYM_SYMBOL_LEN]; 3046 char str[KSYM_SYMBOL_LEN];
3047 int type = MATCH_FULL; 3047 int type = MATCH_FULL;
3048 int i, len = 0; 3048 int i, len = 0;
3049 char *search; 3049 char *search;
3050 3050
3051 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) 3051 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
3052 glob = NULL; 3052 glob = NULL;
3053 else if (glob) { 3053 else if (glob) {
3054 int not; 3054 int not;
3055 3055
3056 type = filter_parse_regex(glob, strlen(glob), &search, &not); 3056 type = filter_parse_regex(glob, strlen(glob), &search, &not);
3057 len = strlen(search); 3057 len = strlen(search);
3058 3058
3059 /* we do not support '!' for function probes */ 3059 /* we do not support '!' for function probes */
3060 if (WARN_ON(not)) 3060 if (WARN_ON(not))
3061 return; 3061 return;
3062 } 3062 }
3063 3063
3064 mutex_lock(&ftrace_lock); 3064 mutex_lock(&ftrace_lock);
3065 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3065 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3066 struct hlist_head *hhd = &ftrace_func_hash[i]; 3066 struct hlist_head *hhd = &ftrace_func_hash[i];
3067 3067
3068 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { 3068 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
3069 3069
3070 /* break up if statements for readability */ 3070 /* break up if statements for readability */
3071 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) 3071 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
3072 continue; 3072 continue;
3073 3073
3074 if ((flags & PROBE_TEST_DATA) && entry->data != data) 3074 if ((flags & PROBE_TEST_DATA) && entry->data != data)
3075 continue; 3075 continue;
3076 3076
3077 /* do this last, since it is the most expensive */ 3077 /* do this last, since it is the most expensive */
3078 if (glob) { 3078 if (glob) {
3079 kallsyms_lookup(entry->ip, NULL, NULL, 3079 kallsyms_lookup(entry->ip, NULL, NULL,
3080 NULL, str); 3080 NULL, str);
3081 if (!ftrace_match(str, glob, len, type)) 3081 if (!ftrace_match(str, glob, len, type))
3082 continue; 3082 continue;
3083 } 3083 }
3084 3084
3085 hlist_del(&entry->node); 3085 hlist_del(&entry->node);
3086 call_rcu(&entry->rcu, ftrace_free_entry_rcu); 3086 call_rcu(&entry->rcu, ftrace_free_entry_rcu);
3087 } 3087 }
3088 } 3088 }
3089 __disable_ftrace_function_probe(); 3089 __disable_ftrace_function_probe();
3090 mutex_unlock(&ftrace_lock); 3090 mutex_unlock(&ftrace_lock);
3091 } 3091 }
3092 3092
3093 void 3093 void
3094 unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3094 unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3095 void *data) 3095 void *data)
3096 { 3096 {
3097 __unregister_ftrace_function_probe(glob, ops, data, 3097 __unregister_ftrace_function_probe(glob, ops, data,
3098 PROBE_TEST_FUNC | PROBE_TEST_DATA); 3098 PROBE_TEST_FUNC | PROBE_TEST_DATA);
3099 } 3099 }
3100 3100
3101 void 3101 void
3102 unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) 3102 unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
3103 { 3103 {
3104 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); 3104 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
3105 } 3105 }
3106 3106
3107 void unregister_ftrace_function_probe_all(char *glob) 3107 void unregister_ftrace_function_probe_all(char *glob)
3108 { 3108 {
3109 __unregister_ftrace_function_probe(glob, NULL, NULL, 0); 3109 __unregister_ftrace_function_probe(glob, NULL, NULL, 0);
3110 } 3110 }
3111 3111
3112 static LIST_HEAD(ftrace_commands); 3112 static LIST_HEAD(ftrace_commands);
3113 static DEFINE_MUTEX(ftrace_cmd_mutex); 3113 static DEFINE_MUTEX(ftrace_cmd_mutex);
3114 3114
3115 int register_ftrace_command(struct ftrace_func_command *cmd) 3115 int register_ftrace_command(struct ftrace_func_command *cmd)
3116 { 3116 {
3117 struct ftrace_func_command *p; 3117 struct ftrace_func_command *p;
3118 int ret = 0; 3118 int ret = 0;
3119 3119
3120 mutex_lock(&ftrace_cmd_mutex); 3120 mutex_lock(&ftrace_cmd_mutex);
3121 list_for_each_entry(p, &ftrace_commands, list) { 3121 list_for_each_entry(p, &ftrace_commands, list) {
3122 if (strcmp(cmd->name, p->name) == 0) { 3122 if (strcmp(cmd->name, p->name) == 0) {
3123 ret = -EBUSY; 3123 ret = -EBUSY;
3124 goto out_unlock; 3124 goto out_unlock;
3125 } 3125 }
3126 } 3126 }
3127 list_add(&cmd->list, &ftrace_commands); 3127 list_add(&cmd->list, &ftrace_commands);
3128 out_unlock: 3128 out_unlock:
3129 mutex_unlock(&ftrace_cmd_mutex); 3129 mutex_unlock(&ftrace_cmd_mutex);
3130 3130
3131 return ret; 3131 return ret;
3132 } 3132 }
3133 3133
3134 int unregister_ftrace_command(struct ftrace_func_command *cmd) 3134 int unregister_ftrace_command(struct ftrace_func_command *cmd)
3135 { 3135 {
3136 struct ftrace_func_command *p, *n; 3136 struct ftrace_func_command *p, *n;
3137 int ret = -ENODEV; 3137 int ret = -ENODEV;
3138 3138
3139 mutex_lock(&ftrace_cmd_mutex); 3139 mutex_lock(&ftrace_cmd_mutex);
3140 list_for_each_entry_safe(p, n, &ftrace_commands, list) { 3140 list_for_each_entry_safe(p, n, &ftrace_commands, list) {
3141 if (strcmp(cmd->name, p->name) == 0) { 3141 if (strcmp(cmd->name, p->name) == 0) {
3142 ret = 0; 3142 ret = 0;
3143 list_del_init(&p->list); 3143 list_del_init(&p->list);
3144 goto out_unlock; 3144 goto out_unlock;
3145 } 3145 }
3146 } 3146 }
3147 out_unlock: 3147 out_unlock:
3148 mutex_unlock(&ftrace_cmd_mutex); 3148 mutex_unlock(&ftrace_cmd_mutex);
3149 3149
3150 return ret; 3150 return ret;
3151 } 3151 }
3152 3152
3153 static int ftrace_process_regex(struct ftrace_hash *hash, 3153 static int ftrace_process_regex(struct ftrace_hash *hash,
3154 char *buff, int len, int enable) 3154 char *buff, int len, int enable)
3155 { 3155 {
3156 char *func, *command, *next = buff; 3156 char *func, *command, *next = buff;
3157 struct ftrace_func_command *p; 3157 struct ftrace_func_command *p;
3158 int ret = -EINVAL; 3158 int ret = -EINVAL;
3159 3159
3160 func = strsep(&next, ":"); 3160 func = strsep(&next, ":");
3161 3161
3162 if (!next) { 3162 if (!next) {
3163 ret = ftrace_match_records(hash, func, len); 3163 ret = ftrace_match_records(hash, func, len);
3164 if (!ret) 3164 if (!ret)
3165 ret = -EINVAL; 3165 ret = -EINVAL;
3166 if (ret < 0) 3166 if (ret < 0)
3167 return ret; 3167 return ret;
3168 return 0; 3168 return 0;
3169 } 3169 }
3170 3170
3171 /* command found */ 3171 /* command found */
3172 3172
3173 command = strsep(&next, ":"); 3173 command = strsep(&next, ":");
3174 3174
3175 mutex_lock(&ftrace_cmd_mutex); 3175 mutex_lock(&ftrace_cmd_mutex);
3176 list_for_each_entry(p, &ftrace_commands, list) { 3176 list_for_each_entry(p, &ftrace_commands, list) {
3177 if (strcmp(p->name, command) == 0) { 3177 if (strcmp(p->name, command) == 0) {
3178 ret = p->func(hash, func, command, next, enable); 3178 ret = p->func(hash, func, command, next, enable);
3179 goto out_unlock; 3179 goto out_unlock;
3180 } 3180 }
3181 } 3181 }
3182 out_unlock: 3182 out_unlock:
3183 mutex_unlock(&ftrace_cmd_mutex); 3183 mutex_unlock(&ftrace_cmd_mutex);
3184 3184
3185 return ret; 3185 return ret;
3186 } 3186 }
3187 3187
3188 static ssize_t 3188 static ssize_t
3189 ftrace_regex_write(struct file *file, const char __user *ubuf, 3189 ftrace_regex_write(struct file *file, const char __user *ubuf,
3190 size_t cnt, loff_t *ppos, int enable) 3190 size_t cnt, loff_t *ppos, int enable)
3191 { 3191 {
3192 struct ftrace_iterator *iter; 3192 struct ftrace_iterator *iter;
3193 struct trace_parser *parser; 3193 struct trace_parser *parser;
3194 ssize_t ret, read; 3194 ssize_t ret, read;
3195 3195
3196 if (!cnt) 3196 if (!cnt)
3197 return 0; 3197 return 0;
3198 3198
3199 mutex_lock(&ftrace_regex_lock); 3199 mutex_lock(&ftrace_regex_lock);
3200 3200
3201 ret = -ENODEV; 3201 ret = -ENODEV;
3202 if (unlikely(ftrace_disabled)) 3202 if (unlikely(ftrace_disabled))
3203 goto out_unlock; 3203 goto out_unlock;
3204 3204
3205 if (file->f_mode & FMODE_READ) { 3205 if (file->f_mode & FMODE_READ) {
3206 struct seq_file *m = file->private_data; 3206 struct seq_file *m = file->private_data;
3207 iter = m->private; 3207 iter = m->private;
3208 } else 3208 } else
3209 iter = file->private_data; 3209 iter = file->private_data;
3210 3210
3211 parser = &iter->parser; 3211 parser = &iter->parser;
3212 read = trace_get_user(parser, ubuf, cnt, ppos); 3212 read = trace_get_user(parser, ubuf, cnt, ppos);
3213 3213
3214 if (read >= 0 && trace_parser_loaded(parser) && 3214 if (read >= 0 && trace_parser_loaded(parser) &&
3215 !trace_parser_cont(parser)) { 3215 !trace_parser_cont(parser)) {
3216 ret = ftrace_process_regex(iter->hash, parser->buffer, 3216 ret = ftrace_process_regex(iter->hash, parser->buffer,
3217 parser->idx, enable); 3217 parser->idx, enable);
3218 trace_parser_clear(parser); 3218 trace_parser_clear(parser);
3219 if (ret) 3219 if (ret)
3220 goto out_unlock; 3220 goto out_unlock;
3221 } 3221 }
3222 3222
3223 ret = read; 3223 ret = read;
3224 out_unlock: 3224 out_unlock:
3225 mutex_unlock(&ftrace_regex_lock); 3225 mutex_unlock(&ftrace_regex_lock);
3226 3226
3227 return ret; 3227 return ret;
3228 } 3228 }
3229 3229
3230 ssize_t 3230 ssize_t
3231 ftrace_filter_write(struct file *file, const char __user *ubuf, 3231 ftrace_filter_write(struct file *file, const char __user *ubuf,
3232 size_t cnt, loff_t *ppos) 3232 size_t cnt, loff_t *ppos)
3233 { 3233 {
3234 return ftrace_regex_write(file, ubuf, cnt, ppos, 1); 3234 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
3235 } 3235 }
3236 3236
3237 ssize_t 3237 ssize_t
3238 ftrace_notrace_write(struct file *file, const char __user *ubuf, 3238 ftrace_notrace_write(struct file *file, const char __user *ubuf,
3239 size_t cnt, loff_t *ppos) 3239 size_t cnt, loff_t *ppos)
3240 { 3240 {
3241 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 3241 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
3242 } 3242 }
3243 3243
3244 static int 3244 static int
3245 ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) 3245 ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3246 { 3246 {
3247 struct ftrace_func_entry *entry; 3247 struct ftrace_func_entry *entry;
3248 3248
3249 if (!ftrace_location(ip)) 3249 if (!ftrace_location(ip))
3250 return -EINVAL; 3250 return -EINVAL;
3251 3251
3252 if (remove) { 3252 if (remove) {
3253 entry = ftrace_lookup_ip(hash, ip); 3253 entry = ftrace_lookup_ip(hash, ip);
3254 if (!entry) 3254 if (!entry)
3255 return -ENOENT; 3255 return -ENOENT;
3256 free_hash_entry(hash, entry); 3256 free_hash_entry(hash, entry);
3257 return 0; 3257 return 0;
3258 } 3258 }
3259 3259
3260 return add_hash_entry(hash, ip); 3260 return add_hash_entry(hash, ip);
3261 } 3261 }
3262 3262
3263 static int 3263 static int
3264 ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, 3264 ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3265 unsigned long ip, int remove, int reset, int enable) 3265 unsigned long ip, int remove, int reset, int enable)
3266 { 3266 {
3267 struct ftrace_hash **orig_hash; 3267 struct ftrace_hash **orig_hash;
3268 struct ftrace_hash *hash; 3268 struct ftrace_hash *hash;
3269 int ret; 3269 int ret;
3270 3270
3271 	/* All global ops use the global ops filters */ 3271 	/* All global ops use the global ops filters */
3272 if (ops->flags & FTRACE_OPS_FL_GLOBAL) 3272 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
3273 ops = &global_ops; 3273 ops = &global_ops;
3274 3274
3275 if (unlikely(ftrace_disabled)) 3275 if (unlikely(ftrace_disabled))
3276 return -ENODEV; 3276 return -ENODEV;
3277 3277
3278 if (enable) 3278 if (enable)
3279 orig_hash = &ops->filter_hash; 3279 orig_hash = &ops->filter_hash;
3280 else 3280 else
3281 orig_hash = &ops->notrace_hash; 3281 orig_hash = &ops->notrace_hash;
3282 3282
3283 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3283 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3284 if (!hash) 3284 if (!hash)
3285 return -ENOMEM; 3285 return -ENOMEM;
3286 3286
3287 mutex_lock(&ftrace_regex_lock); 3287 mutex_lock(&ftrace_regex_lock);
3288 if (reset) 3288 if (reset)
3289 ftrace_filter_reset(hash); 3289 ftrace_filter_reset(hash);
3290 if (buf && !ftrace_match_records(hash, buf, len)) { 3290 if (buf && !ftrace_match_records(hash, buf, len)) {
3291 ret = -EINVAL; 3291 ret = -EINVAL;
3292 goto out_regex_unlock; 3292 goto out_regex_unlock;
3293 } 3293 }
3294 if (ip) { 3294 if (ip) {
3295 ret = ftrace_match_addr(hash, ip, remove); 3295 ret = ftrace_match_addr(hash, ip, remove);
3296 if (ret < 0) 3296 if (ret < 0)
3297 goto out_regex_unlock; 3297 goto out_regex_unlock;
3298 } 3298 }
3299 3299
3300 mutex_lock(&ftrace_lock); 3300 mutex_lock(&ftrace_lock);
3301 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3301 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3302 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3302 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
3303 && ftrace_enabled) 3303 && ftrace_enabled)
3304 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3304 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3305 3305
3306 mutex_unlock(&ftrace_lock); 3306 mutex_unlock(&ftrace_lock);
3307 3307
3308 out_regex_unlock: 3308 out_regex_unlock:
3309 mutex_unlock(&ftrace_regex_lock); 3309 mutex_unlock(&ftrace_regex_lock);
3310 3310
3311 free_ftrace_hash(hash); 3311 free_ftrace_hash(hash);
3312 return ret; 3312 return ret;
3313 } 3313 }
3314 3314
3315 static int 3315 static int
3316 ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, 3316 ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
3317 int reset, int enable) 3317 int reset, int enable)
3318 { 3318 {
3319 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); 3319 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
3320 } 3320 }
3321 3321
3322 /** 3322 /**
3323 * ftrace_set_filter_ip - set a function to filter on in ftrace by address 3323 * ftrace_set_filter_ip - set a function to filter on in ftrace by address
3324 * @ops - the ops to set the filter with 3324 * @ops - the ops to set the filter with
3325 * @ip - the address to add to or remove from the filter. 3325 * @ip - the address to add to or remove from the filter.
3326 * @remove - non zero to remove the ip from the filter 3326 * @remove - non zero to remove the ip from the filter
3327 * @reset - non zero to reset all filters before applying this filter. 3327 * @reset - non zero to reset all filters before applying this filter.
3328 * 3328 *
3329 * Filters denote which functions should be enabled when tracing is enabled 3329 * Filters denote which functions should be enabled when tracing is enabled
3330 * If @ip is zero, it fails to update the filter. 3330 * If @ip is zero, it fails to update the filter.
3331 */ 3331 */
3332 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, 3332 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
3333 int remove, int reset) 3333 int remove, int reset)
3334 { 3334 {
3335 return ftrace_set_addr(ops, ip, remove, reset, 1); 3335 return ftrace_set_addr(ops, ip, remove, reset, 1);
3336 } 3336 }
3337 EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); 3337 EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
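A hypothetical usage sketch: my_ops is an illustrative ftrace_ops defined elsewhere, and kallsyms_lookup_name() is just one way a caller could obtain the address:

static int filter_one_function(void)
{
	unsigned long ip = kallsyms_lookup_name("do_fork");

	if (!ip)
		return -ENODEV;
	/* reset any existing filter, then trace only this address */
	return ftrace_set_filter_ip(&my_ops, ip, 0, 1);
}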
3338 3338
3339 static int 3339 static int
3340 ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, 3340 ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3341 int reset, int enable) 3341 int reset, int enable)
3342 { 3342 {
3343 return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); 3343 return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
3344 } 3344 }
3345 3345
3346 /** 3346 /**
3347 * ftrace_set_filter - set a function to filter on in ftrace 3347 * ftrace_set_filter - set a function to filter on in ftrace
3348 * @ops - the ops to set the filter with 3348 * @ops - the ops to set the filter with
3349 * @buf - the string that holds the function filter text. 3349 * @buf - the string that holds the function filter text.
3350 * @len - the length of the string. 3350 * @len - the length of the string.
3351 * @reset - non zero to reset all filters before applying this filter. 3351 * @reset - non zero to reset all filters before applying this filter.
3352 * 3352 *
3353 * Filters denote which functions should be enabled when tracing is enabled. 3353 * Filters denote which functions should be enabled when tracing is enabled.
3354 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 3354 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
3355 */ 3355 */
3356 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 3356 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3357 int len, int reset) 3357 int len, int reset)
3358 { 3358 {
3359 return ftrace_set_regex(ops, buf, len, reset, 1); 3359 return ftrace_set_regex(ops, buf, len, reset, 1);
3360 } 3360 }
3361 EXPORT_SYMBOL_GPL(ftrace_set_filter); 3361 EXPORT_SYMBOL_GPL(ftrace_set_filter);
3362 3362
3363 /** 3363 /**
3364 * ftrace_set_notrace - set a function to not trace in ftrace 3364 * ftrace_set_notrace - set a function to not trace in ftrace
3365 * @ops - the ops to set the notrace filter with 3365 * @ops - the ops to set the notrace filter with
3366 * @buf - the string that holds the function notrace text. 3366 * @buf - the string that holds the function notrace text.
3367 * @len - the length of the string. 3367 * @len - the length of the string.
3368 * @reset - non zero to reset all filters before applying this filter. 3368 * @reset - non zero to reset all filters before applying this filter.
3369 * 3369 *
3370 * Notrace Filters denote which functions should not be enabled when tracing 3370 * Notrace Filters denote which functions should not be enabled when tracing
3371 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 3371 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
3372 * for tracing. 3372 * for tracing.
3373 */ 3373 */
3374 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 3374 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3375 int len, int reset) 3375 int len, int reset)
3376 { 3376 {
3377 return ftrace_set_regex(ops, buf, len, reset, 0); 3377 return ftrace_set_regex(ops, buf, len, reset, 0);
3378 } 3378 }
3379 EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3379 EXPORT_SYMBOL_GPL(ftrace_set_notrace);
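A short sketch of the two calls together, again with an illustrative my_ops: trace every vfs_* function except vfs_write. Note that both take the buffer length explicitly:

static void setup_vfs_tracing(void)
{
	/* reset, then enable everything matching the glob... */
	ftrace_set_filter(&my_ops, "vfs_*", strlen("vfs_*"), 1);
	/* ...but never trace vfs_write */
	ftrace_set_notrace(&my_ops, "vfs_write", strlen("vfs_write"), 0);
}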
3380 /** 3380 /**
3381 * ftrace_set_global_filter - set a function to filter on with global tracers 3381 * ftrace_set_global_filter - set a function to filter on with global tracers
3382 * @buf - the string that holds the function filter text. 3382 * @buf - the string that holds the function filter text.
3383 * @len - the length of the string. 3383 * @len - the length of the string.
3384 * @reset - non zero to reset all filters before applying this filter. 3384 * @reset - non zero to reset all filters before applying this filter.
3385 * 3385 *
3386 * Filters denote which functions should be enabled when tracing is 3386 * Filters denote which functions should be enabled when tracing is
3387 * enabled with the global set of ftrace_ops. If @buf is NULL and reset 3387 * enabled with the global set of ftrace_ops. If @buf is NULL and reset
3388 * is set, all functions will be enabled for tracing. 3388 * is set, all functions will be enabled for tracing.
3389 */ 3389 */
3390 void ftrace_set_global_filter(unsigned char *buf, int len, int reset) 3390 void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
3391 { 3391 {
3392 ftrace_set_regex(&global_ops, buf, len, reset, 1); 3392 ftrace_set_regex(&global_ops, buf, len, reset, 1);
3393 } 3393 }
3394 EXPORT_SYMBOL_GPL(ftrace_set_global_filter); 3394 EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
3395 3395
3396 /** 3396 /**
3397 * ftrace_set_global_notrace - set a function to not trace with global tracers 3397 * ftrace_set_global_notrace - set a function to not trace with global tracers
3398 * @buf - the string that holds the function notrace text. 3398 * @buf - the string that holds the function notrace text.
3399 * @len - the length of the string. 3399 * @len - the length of the string.
3400 * @reset - non zero to reset all filters before applying this filter. 3400 * @reset - non zero to reset all filters before applying this filter.
3401 * 3401 *
3402 * Notrace Filters denote which functions should not be enabled 3402 * Notrace Filters denote which functions should not be enabled
3403 * when tracing is enabled with the global set of ftrace_ops. 3403 * when tracing is enabled with the global set of ftrace_ops.
3404 * If @buf is NULL and reset is set, all functions will be 3404 * If @buf is NULL and reset is set, all functions will be
3405 * enabled for tracing. 3405 * enabled for tracing.
3406 */ 3406 */
3407 void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) 3407 void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
3408 { 3408 {
3409 ftrace_set_regex(&global_ops, buf, len, reset, 0); 3409 ftrace_set_regex(&global_ops, buf, len, reset, 0);
3410 } 3410 }
3411 EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); 3411 EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
3412 3412
3413 /* 3413 /*
3414 * command line interface to allow users to set filters on boot up. 3414 * command line interface to allow users to set filters on boot up.
3415 */ 3415 */
3416 #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE 3416 #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
3417 static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 3417 static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3418 static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; 3418 static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3419 3419
3420 static int __init set_ftrace_notrace(char *str) 3420 static int __init set_ftrace_notrace(char *str)
3421 { 3421 {
3422 	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3422 	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3423 return 1; 3423 return 1;
3424 } 3424 }
3425 __setup("ftrace_notrace=", set_ftrace_notrace); 3425 __setup("ftrace_notrace=", set_ftrace_notrace);
3426 3426
3427 static int __init set_ftrace_filter(char *str) 3427 static int __init set_ftrace_filter(char *str)
3428 { 3428 {
3429 	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3429 	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3430 return 1; 3430 return 1;
3431 } 3431 }
3432 __setup("ftrace_filter=", set_ftrace_filter); 3432 __setup("ftrace_filter=", set_ftrace_filter);
3433 3433
3434 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 3434 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
3435 static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 3435 static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3436 static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 3436 static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
3437 3437
3438 static int __init set_graph_function(char *str) 3438 static int __init set_graph_function(char *str)
3439 { 3439 {
3440 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 3440 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
3441 return 1; 3441 return 1;
3442 } 3442 }
3443 __setup("ftrace_graph_filter=", set_graph_function); 3443 __setup("ftrace_graph_filter=", set_graph_function);
3444 3444
3445 static void __init set_ftrace_early_graph(char *buf) 3445 static void __init set_ftrace_early_graph(char *buf)
3446 { 3446 {
3447 int ret; 3447 int ret;
3448 char *func; 3448 char *func;
3449 3449
3450 while (buf) { 3450 while (buf) {
3451 func = strsep(&buf, ","); 3451 func = strsep(&buf, ",");
3452 /* we allow only one expression at a time */ 3452 /* we allow only one expression at a time */
3453 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3453 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
3454 func); 3454 func);
3455 if (ret) 3455 if (ret)
3456 printk(KERN_DEBUG "ftrace: function %s not " 3456 printk(KERN_DEBUG "ftrace: function %s not "
3457 "traceable\n", func); 3457 "traceable\n", func);
3458 } 3458 }
3459 } 3459 }
3460 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3460 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3461 3461
3462 void __init 3462 void __init
3463 ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) 3463 ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3464 { 3464 {
3465 char *func; 3465 char *func;
3466 3466
3467 while (buf) { 3467 while (buf) {
3468 func = strsep(&buf, ","); 3468 func = strsep(&buf, ",");
3469 ftrace_set_regex(ops, func, strlen(func), 0, enable); 3469 ftrace_set_regex(ops, func, strlen(func), 0, enable);
3470 } 3470 }
3471 } 3471 }
3472 3472
3473 static void __init set_ftrace_early_filters(void) 3473 static void __init set_ftrace_early_filters(void)
3474 { 3474 {
3475 if (ftrace_filter_buf[0]) 3475 if (ftrace_filter_buf[0])
3476 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); 3476 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
3477 if (ftrace_notrace_buf[0]) 3477 if (ftrace_notrace_buf[0])
3478 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); 3478 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3479 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 3479 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
3480 if (ftrace_graph_buf[0]) 3480 if (ftrace_graph_buf[0])
3481 set_ftrace_early_graph(ftrace_graph_buf); 3481 set_ftrace_early_graph(ftrace_graph_buf);
3482 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3482 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3483 } 3483 }
3484 3484
3485 int ftrace_regex_release(struct inode *inode, struct file *file) 3485 int ftrace_regex_release(struct inode *inode, struct file *file)
3486 { 3486 {
3487 struct seq_file *m = (struct seq_file *)file->private_data; 3487 struct seq_file *m = (struct seq_file *)file->private_data;
3488 struct ftrace_iterator *iter; 3488 struct ftrace_iterator *iter;
3489 struct ftrace_hash **orig_hash; 3489 struct ftrace_hash **orig_hash;
3490 struct trace_parser *parser; 3490 struct trace_parser *parser;
3491 int filter_hash; 3491 int filter_hash;
3492 int ret; 3492 int ret;
3493 3493
3494 mutex_lock(&ftrace_regex_lock); 3494 mutex_lock(&ftrace_regex_lock);
3495 if (file->f_mode & FMODE_READ) { 3495 if (file->f_mode & FMODE_READ) {
3496 iter = m->private; 3496 iter = m->private;
3497 3497
3498 seq_release(inode, file); 3498 seq_release(inode, file);
3499 } else 3499 } else
3500 iter = file->private_data; 3500 iter = file->private_data;
3501 3501
3502 parser = &iter->parser; 3502 parser = &iter->parser;
3503 if (trace_parser_loaded(parser)) { 3503 if (trace_parser_loaded(parser)) {
3504 parser->buffer[parser->idx] = 0; 3504 parser->buffer[parser->idx] = 0;
3505 ftrace_match_records(iter->hash, parser->buffer, parser->idx); 3505 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
3506 } 3506 }
3507 3507
3508 trace_parser_put(parser); 3508 trace_parser_put(parser);
3509 3509
3510 if (file->f_mode & FMODE_WRITE) { 3510 if (file->f_mode & FMODE_WRITE) {
3511 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); 3511 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3512 3512
3513 if (filter_hash) 3513 if (filter_hash)
3514 orig_hash = &iter->ops->filter_hash; 3514 orig_hash = &iter->ops->filter_hash;
3515 else 3515 else
3516 orig_hash = &iter->ops->notrace_hash; 3516 orig_hash = &iter->ops->notrace_hash;
3517 3517
3518 mutex_lock(&ftrace_lock); 3518 mutex_lock(&ftrace_lock);
3519 ret = ftrace_hash_move(iter->ops, filter_hash, 3519 ret = ftrace_hash_move(iter->ops, filter_hash,
3520 orig_hash, iter->hash); 3520 orig_hash, iter->hash);
3521 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3521 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3522 && ftrace_enabled) 3522 && ftrace_enabled)
3523 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3523 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3524 3524
3525 mutex_unlock(&ftrace_lock); 3525 mutex_unlock(&ftrace_lock);
3526 } 3526 }
3527 free_ftrace_hash(iter->hash); 3527 free_ftrace_hash(iter->hash);
3528 kfree(iter); 3528 kfree(iter);
3529 3529
3530 mutex_unlock(&ftrace_regex_lock); 3530 mutex_unlock(&ftrace_regex_lock);
3531 return 0; 3531 return 0;
3532 } 3532 }
3533 3533
3534 static const struct file_operations ftrace_avail_fops = { 3534 static const struct file_operations ftrace_avail_fops = {
3535 .open = ftrace_avail_open, 3535 .open = ftrace_avail_open,
3536 .read = seq_read, 3536 .read = seq_read,
3537 .llseek = seq_lseek, 3537 .llseek = seq_lseek,
3538 .release = seq_release_private, 3538 .release = seq_release_private,
3539 }; 3539 };
3540 3540
3541 static const struct file_operations ftrace_enabled_fops = { 3541 static const struct file_operations ftrace_enabled_fops = {
3542 .open = ftrace_enabled_open, 3542 .open = ftrace_enabled_open,
3543 .read = seq_read, 3543 .read = seq_read,
3544 .llseek = seq_lseek, 3544 .llseek = seq_lseek,
3545 .release = seq_release_private, 3545 .release = seq_release_private,
3546 }; 3546 };
3547 3547
3548 static const struct file_operations ftrace_filter_fops = { 3548 static const struct file_operations ftrace_filter_fops = {
3549 .open = ftrace_filter_open, 3549 .open = ftrace_filter_open,
3550 .read = seq_read, 3550 .read = seq_read,
3551 .write = ftrace_filter_write, 3551 .write = ftrace_filter_write,
3552 .llseek = ftrace_regex_lseek, 3552 .llseek = ftrace_regex_lseek,
3553 .release = ftrace_regex_release, 3553 .release = ftrace_regex_release,
3554 }; 3554 };
3555 3555
3556 static const struct file_operations ftrace_notrace_fops = { 3556 static const struct file_operations ftrace_notrace_fops = {
3557 .open = ftrace_notrace_open, 3557 .open = ftrace_notrace_open,
3558 .read = seq_read, 3558 .read = seq_read,
3559 .write = ftrace_notrace_write, 3559 .write = ftrace_notrace_write,
3560 .llseek = ftrace_regex_lseek, 3560 .llseek = ftrace_regex_lseek,
3561 .release = ftrace_regex_release, 3561 .release = ftrace_regex_release,
3562 }; 3562 };
3563 3563
3564 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 3564 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
3565 3565
3566 static DEFINE_MUTEX(graph_lock); 3566 static DEFINE_MUTEX(graph_lock);
3567 3567
3568 int ftrace_graph_count; 3568 int ftrace_graph_count;
3569 int ftrace_graph_filter_enabled; 3569 int ftrace_graph_filter_enabled;
3570 unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 3570 unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
3571 3571
3572 static void * 3572 static void *
3573 __g_next(struct seq_file *m, loff_t *pos) 3573 __g_next(struct seq_file *m, loff_t *pos)
3574 { 3574 {
3575 if (*pos >= ftrace_graph_count) 3575 if (*pos >= ftrace_graph_count)
3576 return NULL; 3576 return NULL;
3577 return &ftrace_graph_funcs[*pos]; 3577 return &ftrace_graph_funcs[*pos];
3578 } 3578 }
3579 3579
3580 static void * 3580 static void *
3581 g_next(struct seq_file *m, void *v, loff_t *pos) 3581 g_next(struct seq_file *m, void *v, loff_t *pos)
3582 { 3582 {
3583 (*pos)++; 3583 (*pos)++;
3584 return __g_next(m, pos); 3584 return __g_next(m, pos);
3585 } 3585 }
3586 3586
3587 static void *g_start(struct seq_file *m, loff_t *pos) 3587 static void *g_start(struct seq_file *m, loff_t *pos)
3588 { 3588 {
3589 mutex_lock(&graph_lock); 3589 mutex_lock(&graph_lock);
3590 3590
3591 /* Nothing set; tell g_show to print that all functions are enabled */ 3591 /* Nothing set; tell g_show to print that all functions are enabled */
3592 if (!ftrace_graph_filter_enabled && !*pos) 3592 if (!ftrace_graph_filter_enabled && !*pos)
3593 return (void *)1; 3593 return (void *)1;
3594 3594
3595 return __g_next(m, pos); 3595 return __g_next(m, pos);
3596 } 3596 }
3597 3597
3598 static void g_stop(struct seq_file *m, void *p) 3598 static void g_stop(struct seq_file *m, void *p)
3599 { 3599 {
3600 mutex_unlock(&graph_lock); 3600 mutex_unlock(&graph_lock);
3601 } 3601 }
3602 3602
3603 static int g_show(struct seq_file *m, void *v) 3603 static int g_show(struct seq_file *m, void *v)
3604 { 3604 {
3605 unsigned long *ptr = v; 3605 unsigned long *ptr = v;
3606 3606
3607 if (!ptr) 3607 if (!ptr)
3608 return 0; 3608 return 0;
3609 3609
3610 if (ptr == (unsigned long *)1) { 3610 if (ptr == (unsigned long *)1) {
3611 seq_printf(m, "#### all functions enabled ####\n"); 3611 seq_printf(m, "#### all functions enabled ####\n");
3612 return 0; 3612 return 0;
3613 } 3613 }
3614 3614
3615 seq_printf(m, "%ps\n", (void *)*ptr); 3615 seq_printf(m, "%ps\n", (void *)*ptr);
3616 3616
3617 return 0; 3617 return 0;
3618 } 3618 }
3619 3619
3620 static const struct seq_operations ftrace_graph_seq_ops = { 3620 static const struct seq_operations ftrace_graph_seq_ops = {
3621 .start = g_start, 3621 .start = g_start,
3622 .next = g_next, 3622 .next = g_next,
3623 .stop = g_stop, 3623 .stop = g_stop,
3624 .show = g_show, 3624 .show = g_show,
3625 }; 3625 };
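These seq_operations follow the standard seq_file contract: g_start() positions the cursor (returning the (void *)1 sentinel when no filter is set, so g_show() prints the "all functions enabled" banner), g_next() advances, g_show() emits one record, and g_stop() drops graph_lock. A stripped-down userspace sketch of the loop the seq_file core drives (no locking or sentinel; names mirror the kernel ones but are local to the example):

    #include <stdio.h>

    static unsigned long funcs[] = { 0x1000, 0x2000, 0x3000 };

    static void *g_start(long *pos)
    {
            return *pos < 3 ? (void *)&funcs[*pos] : NULL;
    }

    static void *g_next(long *pos)
    {
            (*pos)++;
            return g_start(pos);
    }

    static void g_show(void *v)
    {
            printf("%#lx\n", *(unsigned long *)v);
    }

    int main(void)
    {
            long pos = 0;
            void *v;

            for (v = g_start(&pos); v; v = g_next(&pos))
                    g_show(v);
            return 0;
    }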
3626 3626
3627 static int 3627 static int
3628 ftrace_graph_open(struct inode *inode, struct file *file) 3628 ftrace_graph_open(struct inode *inode, struct file *file)
3629 { 3629 {
3630 int ret = 0; 3630 int ret = 0;
3631 3631
3632 if (unlikely(ftrace_disabled)) 3632 if (unlikely(ftrace_disabled))
3633 return -ENODEV; 3633 return -ENODEV;
3634 3634
3635 mutex_lock(&graph_lock); 3635 mutex_lock(&graph_lock);
3636 if ((file->f_mode & FMODE_WRITE) && 3636 if ((file->f_mode & FMODE_WRITE) &&
3637 (file->f_flags & O_TRUNC)) { 3637 (file->f_flags & O_TRUNC)) {
3638 ftrace_graph_filter_enabled = 0; 3638 ftrace_graph_filter_enabled = 0;
3639 ftrace_graph_count = 0; 3639 ftrace_graph_count = 0;
3640 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 3640 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
3641 } 3641 }
3642 mutex_unlock(&graph_lock); 3642 mutex_unlock(&graph_lock);
3643 3643
3644 if (file->f_mode & FMODE_READ) 3644 if (file->f_mode & FMODE_READ)
3645 ret = seq_open(file, &ftrace_graph_seq_ops); 3645 ret = seq_open(file, &ftrace_graph_seq_ops);
3646 3646
3647 return ret; 3647 return ret;
3648 } 3648 }
3649 3649
3650 static int 3650 static int
3651 ftrace_graph_release(struct inode *inode, struct file *file) 3651 ftrace_graph_release(struct inode *inode, struct file *file)
3652 { 3652 {
3653 if (file->f_mode & FMODE_READ) 3653 if (file->f_mode & FMODE_READ)
3654 seq_release(inode, file); 3654 seq_release(inode, file);
3655 return 0; 3655 return 0;
3656 } 3656 }
3657 3657
3658 static int 3658 static int
3659 ftrace_set_func(unsigned long *array, int *idx, char *buffer) 3659 ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3660 { 3660 {
3661 struct dyn_ftrace *rec; 3661 struct dyn_ftrace *rec;
3662 struct ftrace_page *pg; 3662 struct ftrace_page *pg;
3663 int search_len; 3663 int search_len;
3664 int fail = 1; 3664 int fail = 1;
3665 int type, not; 3665 int type, not;
3666 char *search; 3666 char *search;
3667 bool exists; 3667 bool exists;
3668 int i; 3668 int i;
3669 3669
3670 /* decode regex */ 3670 /* decode regex */
3671 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3671 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
3672 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3672 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
3673 return -EBUSY; 3673 return -EBUSY;
3674 3674
3675 search_len = strlen(search); 3675 search_len = strlen(search);
3676 3676
3677 mutex_lock(&ftrace_lock); 3677 mutex_lock(&ftrace_lock);
3678 3678
3679 if (unlikely(ftrace_disabled)) { 3679 if (unlikely(ftrace_disabled)) {
3680 mutex_unlock(&ftrace_lock); 3680 mutex_unlock(&ftrace_lock);
3681 return -ENODEV; 3681 return -ENODEV;
3682 } 3682 }
3683 3683
3684 do_for_each_ftrace_rec(pg, rec) { 3684 do_for_each_ftrace_rec(pg, rec) {
3685 3685
3686 if (ftrace_match_record(rec, NULL, search, search_len, type)) { 3686 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
3687 /* if it is in the array */ 3687 /* if it is in the array */
3688 exists = false; 3688 exists = false;
3689 for (i = 0; i < *idx; i++) { 3689 for (i = 0; i < *idx; i++) {
3690 if (array[i] == rec->ip) { 3690 if (array[i] == rec->ip) {
3691 exists = true; 3691 exists = true;
3692 break; 3692 break;
3693 } 3693 }
3694 } 3694 }
3695 3695
3696 if (!not) { 3696 if (!not) {
3697 fail = 0; 3697 fail = 0;
3698 if (!exists) { 3698 if (!exists) {
3699 array[(*idx)++] = rec->ip; 3699 array[(*idx)++] = rec->ip;
3700 if (*idx >= FTRACE_GRAPH_MAX_FUNCS) 3700 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
3701 goto out; 3701 goto out;
3702 } 3702 }
3703 } else { 3703 } else {
3704 if (exists) { 3704 if (exists) {
3705 array[i] = array[--(*idx)]; 3705 array[i] = array[--(*idx)];
3706 array[*idx] = 0; 3706 array[*idx] = 0;
3707 fail = 0; 3707 fail = 0;
3708 } 3708 }
3709 } 3709 }
3710 } 3710 }
3711 } while_for_each_ftrace_rec(); 3711 } while_for_each_ftrace_rec();
3712 out: 3712 out:
3713 mutex_unlock(&ftrace_lock); 3713 mutex_unlock(&ftrace_lock);
3714 3714
3715 if (fail) 3715 if (fail)
3716 return -EINVAL; 3716 return -EINVAL;
3717 3717
3718 ftrace_graph_filter_enabled = 1; 3718 ftrace_graph_filter_enabled = 1;
3719 return 0; 3719 return 0;
3720 } 3720 }
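The removal branch above, array[i] = array[--(*idx)], is the classic O(1) delete from an unordered array: the last entry is moved into the vacated slot and the count shrinks, at the cost of ordering. In isolation (values made up):

    #include <stdio.h>

    static void remove_at(unsigned long *array, int *idx, int i)
    {
            array[i] = array[--(*idx)];     /* last entry fills the hole */
            array[*idx] = 0;
    }

    int main(void)
    {
            unsigned long funcs[] = { 0x100, 0x200, 0x300, 0x400 };
            int count = 4;
            int i;

            remove_at(funcs, &count, 1);    /* drop 0x200; 0x400 takes its slot */
            for (i = 0; i < count; i++)
                    printf("%#lx\n", funcs[i]);
            return 0;
    }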
3721 3721
3722 static ssize_t 3722 static ssize_t
3723 ftrace_graph_write(struct file *file, const char __user *ubuf, 3723 ftrace_graph_write(struct file *file, const char __user *ubuf,
3724 size_t cnt, loff_t *ppos) 3724 size_t cnt, loff_t *ppos)
3725 { 3725 {
3726 struct trace_parser parser; 3726 struct trace_parser parser;
3727 ssize_t read, ret; 3727 ssize_t read, ret;
3728 3728
3729 if (!cnt) 3729 if (!cnt)
3730 return 0; 3730 return 0;
3731 3731
3732 mutex_lock(&graph_lock); 3732 mutex_lock(&graph_lock);
3733 3733
3734 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 3734 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
3735 ret = -ENOMEM; 3735 ret = -ENOMEM;
3736 goto out_unlock; 3736 goto out_unlock;
3737 } 3737 }
3738 3738
3739 read = trace_get_user(&parser, ubuf, cnt, ppos); 3739 read = trace_get_user(&parser, ubuf, cnt, ppos);
3740 3740
3741 if (read >= 0 && trace_parser_loaded(&parser)) { 3741 if (read >= 0 && trace_parser_loaded(&parser)) {
3742 parser.buffer[parser.idx] = 0; 3742 parser.buffer[parser.idx] = 0;
3743 3743
3744 /* we allow only one expression at a time */ 3744 /* we allow only one expression at a time */
3745 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3745 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
3746 parser.buffer); 3746 parser.buffer);
3747 if (ret) 3747 if (ret)
3748 goto out_free; 3748 goto out_free;
3749 } 3749 }
3750 3750
3751 ret = read; 3751 ret = read;
3752 3752
3753 out_free: 3753 out_free:
3754 trace_parser_put(&parser); 3754 trace_parser_put(&parser);
3755 out_unlock: 3755 out_unlock:
3756 mutex_unlock(&graph_lock); 3756 mutex_unlock(&graph_lock);
3757 3757
3758 return ret; 3758 return ret;
3759 } 3759 }
3760 3760
3761 static const struct file_operations ftrace_graph_fops = { 3761 static const struct file_operations ftrace_graph_fops = {
3762 .open = ftrace_graph_open, 3762 .open = ftrace_graph_open,
3763 .read = seq_read, 3763 .read = seq_read,
3764 .write = ftrace_graph_write, 3764 .write = ftrace_graph_write,
3765 .release = ftrace_graph_release, 3765 .release = ftrace_graph_release,
3766 .llseek = seq_lseek, 3766 .llseek = seq_lseek,
3767 }; 3767 };
3768 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3768 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3769 3769
3770 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 3770 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3771 { 3771 {
3772 3772
3773 trace_create_file("available_filter_functions", 0444, 3773 trace_create_file("available_filter_functions", 0444,
3774 d_tracer, NULL, &ftrace_avail_fops); 3774 d_tracer, NULL, &ftrace_avail_fops);
3775 3775
3776 trace_create_file("enabled_functions", 0444, 3776 trace_create_file("enabled_functions", 0444,
3777 d_tracer, NULL, &ftrace_enabled_fops); 3777 d_tracer, NULL, &ftrace_enabled_fops);
3778 3778
3779 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3779 trace_create_file("set_ftrace_filter", 0644, d_tracer,
3780 NULL, &ftrace_filter_fops); 3780 NULL, &ftrace_filter_fops);
3781 3781
3782 trace_create_file("set_ftrace_notrace", 0644, d_tracer, 3782 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
3783 NULL, &ftrace_notrace_fops); 3783 NULL, &ftrace_notrace_fops);
3784 3784
3785 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 3785 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
3786 trace_create_file("set_graph_function", 0444, d_tracer, 3786 trace_create_file("set_graph_function", 0444, d_tracer,
3787 NULL, 3787 NULL,
3788 &ftrace_graph_fops); 3788 &ftrace_graph_fops);
3789 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3789 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3790 3790
3791 return 0; 3791 return 0;
3792 } 3792 }
3793 3793
3794 static int ftrace_cmp_ips(const void *a, const void *b) 3794 static int ftrace_cmp_ips(const void *a, const void *b)
3795 { 3795 {
3796 const unsigned long *ipa = a; 3796 const unsigned long *ipa = a;
3797 const unsigned long *ipb = b; 3797 const unsigned long *ipb = b;
3798 3798
3799 if (*ipa > *ipb) 3799 if (*ipa > *ipb)
3800 return 1; 3800 return 1;
3801 if (*ipa < *ipb) 3801 if (*ipa < *ipb)
3802 return -1; 3802 return -1;
3803 return 0; 3803 return 0;
3804 } 3804 }
3805 3805
3806 static void ftrace_swap_ips(void *a, void *b, int size) 3806 static void ftrace_swap_ips(void *a, void *b, int size)
3807 { 3807 {
3808 unsigned long *ipa = a; 3808 unsigned long *ipa = a;
3809 unsigned long *ipb = b; 3809 unsigned long *ipb = b;
3810 unsigned long t; 3810 unsigned long t;
3811 3811
3812 t = *ipa; 3812 t = *ipa;
3813 *ipa = *ipb; 3813 *ipa = *ipb;
3814 *ipb = t; 3814 *ipb = t;
3815 } 3815 }
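ftrace_cmp_ips() and ftrace_swap_ips() exist because the kernel's sort() wants an explicit swap callback; the same three-way comparison drops straight into userspace qsort(), which supplies its own swap. A sketch (sample addresses invented; the (a > b) - (a < b) form is equivalent to the if/else chain above):

    #include <stdio.h>
    #include <stdlib.h>

    static int cmp_ips(const void *a, const void *b)
    {
            const unsigned long *ipa = a;
            const unsigned long *ipb = b;

            return (*ipa > *ipb) - (*ipa < *ipb);
    }

    int main(void)
    {
            unsigned long ips[] = { 0xc0de, 0xbeef, 0xcafe };
            int i;

            qsort(ips, 3, sizeof(ips[0]), cmp_ips);
            for (i = 0; i < 3; i++)
                    printf("%#lx\n", ips[i]);
            return 0;
    }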
3816 3816
3817 static int ftrace_process_locs(struct module *mod, 3817 static int ftrace_process_locs(struct module *mod,
3818 unsigned long *start, 3818 unsigned long *start,
3819 unsigned long *end) 3819 unsigned long *end)
3820 { 3820 {
3821 struct ftrace_page *start_pg; 3821 struct ftrace_page *start_pg;
3822 struct ftrace_page *pg; 3822 struct ftrace_page *pg;
3823 struct dyn_ftrace *rec; 3823 struct dyn_ftrace *rec;
3824 unsigned long count; 3824 unsigned long count;
3825 unsigned long *p; 3825 unsigned long *p;
3826 unsigned long addr; 3826 unsigned long addr;
3827 unsigned long flags = 0; /* Shut up gcc */ 3827 unsigned long flags = 0; /* Shut up gcc */
3828 int ret = -ENOMEM; 3828 int ret = -ENOMEM;
3829 3829
3830 count = end - start; 3830 count = end - start;
3831 3831
3832 if (!count) 3832 if (!count)
3833 return 0; 3833 return 0;
3834 3834
3835 sort(start, count, sizeof(*start), 3835 sort(start, count, sizeof(*start),
3836 ftrace_cmp_ips, ftrace_swap_ips); 3836 ftrace_cmp_ips, ftrace_swap_ips);
3837 3837
3838 start_pg = ftrace_allocate_pages(count); 3838 start_pg = ftrace_allocate_pages(count);
3839 if (!start_pg) 3839 if (!start_pg)
3840 return -ENOMEM; 3840 return -ENOMEM;
3841 3841
3842 mutex_lock(&ftrace_lock); 3842 mutex_lock(&ftrace_lock);
3843 3843
3844 /* 3844 /*
3845 * Core and each module need their own pages, as 3845 * Core and each module need their own pages, as
3846 * modules will free them when they are removed. 3846 * modules will free them when they are removed.
3847 * Force a new page to be allocated for modules. 3847 * Force a new page to be allocated for modules.
3848 */ 3848 */
3849 if (!mod) { 3849 if (!mod) {
3850 WARN_ON(ftrace_pages || ftrace_pages_start); 3850 WARN_ON(ftrace_pages || ftrace_pages_start);
3851 /* First initialization */ 3851 /* First initialization */
3852 ftrace_pages = ftrace_pages_start = start_pg; 3852 ftrace_pages = ftrace_pages_start = start_pg;
3853 } else { 3853 } else {
3854 if (!ftrace_pages) 3854 if (!ftrace_pages)
3855 goto out; 3855 goto out;
3856 3856
3857 if (WARN_ON(ftrace_pages->next)) { 3857 if (WARN_ON(ftrace_pages->next)) {
3858 /* Hmm, we have free pages? */ 3858 /* Hmm, we have free pages? */
3859 while (ftrace_pages->next) 3859 while (ftrace_pages->next)
3860 ftrace_pages = ftrace_pages->next; 3860 ftrace_pages = ftrace_pages->next;
3861 } 3861 }
3862 3862
3863 ftrace_pages->next = start_pg; 3863 ftrace_pages->next = start_pg;
3864 } 3864 }
3865 3865
3866 p = start; 3866 p = start;
3867 pg = start_pg; 3867 pg = start_pg;
3868 while (p < end) { 3868 while (p < end) {
3869 addr = ftrace_call_adjust(*p++); 3869 addr = ftrace_call_adjust(*p++);
3870 /* 3870 /*
3871 * Some architecture linkers will pad between 3871 * Some architecture linkers will pad between
3872 * the different mcount_loc sections of different 3872 * the different mcount_loc sections of different
3873 * object files to satisfy alignments. 3873 * object files to satisfy alignments.
3874 * Skip any NULL pointers. 3874 * Skip any NULL pointers.
3875 */ 3875 */
3876 if (!addr) 3876 if (!addr)
3877 continue; 3877 continue;
3878 3878
3879 if (pg->index == pg->size) { 3879 if (pg->index == pg->size) {
3880 /* We should have allocated enough */ 3880 /* We should have allocated enough */
3881 if (WARN_ON(!pg->next)) 3881 if (WARN_ON(!pg->next))
3882 break; 3882 break;
3883 pg = pg->next; 3883 pg = pg->next;
3884 } 3884 }
3885 3885
3886 rec = &pg->records[pg->index++]; 3886 rec = &pg->records[pg->index++];
3887 rec->ip = addr; 3887 rec->ip = addr;
3888 } 3888 }
3889 3889
3890 /* We should have used all pages */ 3890 /* We should have used all pages */
3891 WARN_ON(pg->next); 3891 WARN_ON(pg->next);
3892 3892
3893 /* Assign the last page to ftrace_pages */ 3893 /* Assign the last page to ftrace_pages */
3894 ftrace_pages = pg; 3894 ftrace_pages = pg;
3895 3895
3896 /* These new locations need to be initialized */ 3896 /* These new locations need to be initialized */
3897 ftrace_new_pgs = start_pg; 3897 ftrace_new_pgs = start_pg;
3898 3898
3899 /* 3899 /*
3900 * We only need to disable interrupts on start up 3900 * We only need to disable interrupts on start up
3901 * because we are modifying code that an interrupt 3901 * because we are modifying code that an interrupt
3902 * may execute, and the modification is not atomic. 3902 * may execute, and the modification is not atomic.
3903 * But for modules, nothing runs the code we modify 3903 * But for modules, nothing runs the code we modify
3904 * until we are finished with it, and there's no 3904 * until we are finished with it, and there's no
3905 * reason to cause large interrupt latencies while we do it. 3905 * reason to cause large interrupt latencies while we do it.
3906 */ 3906 */
3907 if (!mod) 3907 if (!mod)
3908 local_irq_save(flags); 3908 local_irq_save(flags);
3909 ftrace_update_code(mod); 3909 ftrace_update_code(mod);
3910 if (!mod) 3910 if (!mod)
3911 local_irq_restore(flags); 3911 local_irq_restore(flags);
3912 ret = 0; 3912 ret = 0;
3913 out: 3913 out:
3914 mutex_unlock(&ftrace_lock); 3914 mutex_unlock(&ftrace_lock);
3915 3915
3916 return ret; 3916 return ret;
3917 } 3917 }
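The fill loop above tolerates zero entries because linkers may pad between the mcount_loc sections of different objects; packing the surviving addresses into fixed-size pages looks like this in isolation (toy sizes, invented values):

    #include <stdio.h>

    #define PER_PAGE 2

    int main(void)
    {
            unsigned long locs[] = { 0x10, 0, 0x20, 0, 0, 0x30 };   /* 0 = pad */
            unsigned long pages[3][PER_PAGE];
            int pg = 0, idx = 0, i;

            for (i = 0; i < 6; i++) {
                    if (!locs[i])
                            continue;               /* skip linker padding */
                    if (idx == PER_PAGE) {          /* page full: move on */
                            pg++;
                            idx = 0;
                    }
                    pages[pg][idx++] = locs[i];
            }
            printf("%d page(s); last record %#lx\n", pg + 1, pages[pg][idx - 1]);
            return 0;
    }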
3918 3918
3919 #ifdef CONFIG_MODULES 3919 #ifdef CONFIG_MODULES
3920 3920
3921 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) 3921 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
3922 3922
3923 void ftrace_release_mod(struct module *mod) 3923 void ftrace_release_mod(struct module *mod)
3924 { 3924 {
3925 struct dyn_ftrace *rec; 3925 struct dyn_ftrace *rec;
3926 struct ftrace_page **last_pg; 3926 struct ftrace_page **last_pg;
3927 struct ftrace_page *pg; 3927 struct ftrace_page *pg;
3928 int order; 3928 int order;
3929 3929
3930 mutex_lock(&ftrace_lock); 3930 mutex_lock(&ftrace_lock);
3931 3931
3932 if (ftrace_disabled) 3932 if (ftrace_disabled)
3933 goto out_unlock; 3933 goto out_unlock;
3934 3934
3935 /* 3935 /*
3936 * Each module has its own ftrace_pages, remove 3936 * Each module has its own ftrace_pages, remove
3937 * them from the list. 3937 * them from the list.
3938 */ 3938 */
3939 last_pg = &ftrace_pages_start; 3939 last_pg = &ftrace_pages_start;
3940 for (pg = ftrace_pages_start; pg; pg = *last_pg) { 3940 for (pg = ftrace_pages_start; pg; pg = *last_pg) {
3941 rec = &pg->records[0]; 3941 rec = &pg->records[0];
3942 if (within_module_core(rec->ip, mod)) { 3942 if (within_module_core(rec->ip, mod)) {
3943 /* 3943 /*
3944 * As core pages are first, the first 3944 * As core pages are first, the first
3945 * page should never be a module page. 3945 * page should never be a module page.
3946 */ 3946 */
3947 if (WARN_ON(pg == ftrace_pages_start)) 3947 if (WARN_ON(pg == ftrace_pages_start))
3948 goto out_unlock; 3948 goto out_unlock;
3949 3949
3950 /* Check if we are deleting the last page */ 3950 /* Check if we are deleting the last page */
3951 if (pg == ftrace_pages) 3951 if (pg == ftrace_pages)
3952 ftrace_pages = next_to_ftrace_page(last_pg); 3952 ftrace_pages = next_to_ftrace_page(last_pg);
3953 3953
3954 *last_pg = pg->next; 3954 *last_pg = pg->next;
3955 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 3955 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
3956 free_pages((unsigned long)pg->records, order); 3956 free_pages((unsigned long)pg->records, order);
3957 kfree(pg); 3957 kfree(pg);
3958 } else 3958 } else
3959 last_pg = &pg->next; 3959 last_pg = &pg->next;
3960 } 3960 }
3961 out_unlock: 3961 out_unlock:
3962 mutex_unlock(&ftrace_lock); 3962 mutex_unlock(&ftrace_lock);
3963 } 3963 }
3964 3964
3965 static void ftrace_init_module(struct module *mod, 3965 static void ftrace_init_module(struct module *mod,
3966 unsigned long *start, unsigned long *end) 3966 unsigned long *start, unsigned long *end)
3967 { 3967 {
3968 if (ftrace_disabled || start == end) 3968 if (ftrace_disabled || start == end)
3969 return; 3969 return;
3970 ftrace_process_locs(mod, start, end); 3970 ftrace_process_locs(mod, start, end);
3971 } 3971 }
3972 3972
3973 static int ftrace_module_notify(struct notifier_block *self, 3973 static int ftrace_module_notify(struct notifier_block *self,
3974 unsigned long val, void *data) 3974 unsigned long val, void *data)
3975 { 3975 {
3976 struct module *mod = data; 3976 struct module *mod = data;
3977 3977
3978 switch (val) { 3978 switch (val) {
3979 case MODULE_STATE_COMING: 3979 case MODULE_STATE_COMING:
3980 ftrace_init_module(mod, mod->ftrace_callsites, 3980 ftrace_init_module(mod, mod->ftrace_callsites,
3981 mod->ftrace_callsites + 3981 mod->ftrace_callsites +
3982 mod->num_ftrace_callsites); 3982 mod->num_ftrace_callsites);
3983 break; 3983 break;
3984 case MODULE_STATE_GOING: 3984 case MODULE_STATE_GOING:
3985 ftrace_release_mod(mod); 3985 ftrace_release_mod(mod);
3986 break; 3986 break;
3987 } 3987 }
3988 3988
3989 return 0; 3989 return 0;
3990 } 3990 }
3991 #else 3991 #else
3992 static int ftrace_module_notify(struct notifier_block *self, 3992 static int ftrace_module_notify(struct notifier_block *self,
3993 unsigned long val, void *data) 3993 unsigned long val, void *data)
3994 { 3994 {
3995 return 0; 3995 return 0;
3996 } 3996 }
3997 #endif /* CONFIG_MODULES */ 3997 #endif /* CONFIG_MODULES */
3998 3998
3999 struct notifier_block ftrace_module_nb = { 3999 struct notifier_block ftrace_module_nb = {
4000 .notifier_call = ftrace_module_notify, 4000 .notifier_call = ftrace_module_notify,
4001 .priority = 0, 4001 .priority = 0,
4002 }; 4002 };
4003 4003
4004 extern unsigned long __start_mcount_loc[]; 4004 extern unsigned long __start_mcount_loc[];
4005 extern unsigned long __stop_mcount_loc[]; 4005 extern unsigned long __stop_mcount_loc[];
4006 4006
4007 void __init ftrace_init(void) 4007 void __init ftrace_init(void)
4008 { 4008 {
4009 unsigned long count, addr, flags; 4009 unsigned long count, addr, flags;
4010 int ret; 4010 int ret;
4011 4011
4012 /* Keep the ftrace pointer to the stub */ 4012 /* Keep the ftrace pointer to the stub */
4013 addr = (unsigned long)ftrace_stub; 4013 addr = (unsigned long)ftrace_stub;
4014 4014
4015 local_irq_save(flags); 4015 local_irq_save(flags);
4016 ftrace_dyn_arch_init(&addr); 4016 ftrace_dyn_arch_init(&addr);
4017 local_irq_restore(flags); 4017 local_irq_restore(flags);
4018 4018
4019 /* ftrace_dyn_arch_init places the return code in addr */ 4019 /* ftrace_dyn_arch_init places the return code in addr */
4020 if (addr) 4020 if (addr)
4021 goto failed; 4021 goto failed;
4022 4022
4023 count = __stop_mcount_loc - __start_mcount_loc; 4023 count = __stop_mcount_loc - __start_mcount_loc;
4024 4024
4025 ret = ftrace_dyn_table_alloc(count); 4025 ret = ftrace_dyn_table_alloc(count);
4026 if (ret) 4026 if (ret)
4027 goto failed; 4027 goto failed;
4028 4028
4029 last_ftrace_enabled = ftrace_enabled = 1; 4029 last_ftrace_enabled = ftrace_enabled = 1;
4030 4030
4031 ret = ftrace_process_locs(NULL, 4031 ret = ftrace_process_locs(NULL,
4032 __start_mcount_loc, 4032 __start_mcount_loc,
4033 __stop_mcount_loc); 4033 __stop_mcount_loc);
4034 4034
4035 ret = register_module_notifier(&ftrace_module_nb); 4035 ret = register_module_notifier(&ftrace_module_nb);
4036 if (ret) 4036 if (ret)
4037 pr_warning("Failed to register trace ftrace module notifier\n"); 4037 pr_warning("Failed to register trace ftrace module notifier\n");
4038 4038
4039 set_ftrace_early_filters(); 4039 set_ftrace_early_filters();
4040 4040
4041 return; 4041 return;
4042 failed: 4042 failed:
4043 ftrace_disabled = 1; 4043 ftrace_disabled = 1;
4044 } 4044 }
4045 4045
4046 #else 4046 #else
4047 4047
4048 static struct ftrace_ops global_ops = { 4048 static struct ftrace_ops global_ops = {
4049 .func = ftrace_stub, 4049 .func = ftrace_stub,
4050 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 4050 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
4051 }; 4051 };
4052 4052
4053 static int __init ftrace_nodyn_init(void) 4053 static int __init ftrace_nodyn_init(void)
4054 { 4054 {
4055 ftrace_enabled = 1; 4055 ftrace_enabled = 1;
4056 return 0; 4056 return 0;
4057 } 4057 }
4058 core_initcall(ftrace_nodyn_init); 4058 core_initcall(ftrace_nodyn_init);
4059 4059
4060 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4060 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4061 static inline void ftrace_startup_enable(int command) { } 4061 static inline void ftrace_startup_enable(int command) { }
4062 /* Keep as macros so we do not need to define the commands */ 4062 /* Keep as macros so we do not need to define the commands */
4063 # define ftrace_startup(ops, command) \ 4063 # define ftrace_startup(ops, command) \
4064 ({ \ 4064 ({ \
4065 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4065 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4066 0; \ 4066 0; \
4067 }) 4067 })
4068 # define ftrace_shutdown(ops, command) do { } while (0) 4068 # define ftrace_shutdown(ops, command) do { } while (0)
4069 # define ftrace_startup_sysctl() do { } while (0) 4069 # define ftrace_startup_sysctl() do { } while (0)
4070 # define ftrace_shutdown_sysctl() do { } while (0) 4070 # define ftrace_shutdown_sysctl() do { } while (0)
4071 4071
4072 static inline int 4072 static inline int
4073 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 4073 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
4074 { 4074 {
4075 return 1; 4075 return 1;
4076 } 4076 }
4077 4077
4078 #endif /* CONFIG_DYNAMIC_FTRACE */ 4078 #endif /* CONFIG_DYNAMIC_FTRACE */
4079 4079
4080 static void 4080 static void
4081 ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, 4081 ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4082 struct ftrace_ops *op, struct pt_regs *regs) 4082 struct ftrace_ops *op, struct pt_regs *regs)
4083 { 4083 {
4084 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) 4084 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
4085 return; 4085 return;
4086 4086
4087 /* 4087 /*
4088 * Some of the ops may be dynamically allocated, 4088 * Some of the ops may be dynamically allocated,
4089 * they must be freed after a synchronize_sched(). 4089 * they must be freed after a synchronize_sched().
4090 */ 4090 */
4091 preempt_disable_notrace(); 4091 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT); 4092 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list); 4093 op = rcu_dereference_raw(ftrace_control_list);
4094 while (op != &ftrace_list_end) { 4094 while (op != &ftrace_list_end) {
4095 if (!ftrace_function_local_disabled(op) && 4095 if (!ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip)) 4096 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs); 4097 op->func(ip, parent_ip, op, regs);
4098 4098
4099 op = rcu_dereference_raw(op->next); 4099 op = rcu_dereference_raw(op->next);
4100 } 4100 }
4101 trace_recursion_clear(TRACE_CONTROL_BIT); 4101 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace(); 4102 preempt_enable_notrace();
4103 } 4103 }
4104 4104
4105 static struct ftrace_ops control_ops = { 4105 static struct ftrace_ops control_ops = {
4106 .func = ftrace_ops_control_func, 4106 .func = ftrace_ops_control_func,
4107 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 4107 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
4108 }; 4108 };
4109 4109
4110 static inline void 4110 static inline void
4111 __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 4111 __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs) 4112 struct ftrace_ops *ignored, struct pt_regs *regs)
4113 { 4113 {
4114 struct ftrace_ops *op; 4114 struct ftrace_ops *op;
4115 4115
4116 if (function_trace_stop) 4116 if (function_trace_stop)
4117 return; 4117 return;
4118 4118
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
4120 return; 4120 return;
4121 4121
4122 trace_recursion_set(TRACE_INTERNAL_BIT); 4122 trace_recursion_set(TRACE_INTERNAL_BIT);
4123 /* 4123 /*
4124 * Some of the ops may be dynamically allocated, 4124 * Some of the ops may be dynamically allocated,
4125 * they must be freed after a synchronize_sched(). 4125 * they must be freed after a synchronize_sched().
4126 */ 4126 */
4127 preempt_disable_notrace(); 4127 preempt_disable_notrace();
4128 op = rcu_dereference_raw(ftrace_ops_list); 4128 op = rcu_dereference_raw(ftrace_ops_list);
4129 while (op != &ftrace_list_end) { 4129 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 4130 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 4131 op->func(ip, parent_ip, op, regs);
4132 op = rcu_dereference_raw(op->next); 4132 op = rcu_dereference_raw(op->next);
4133 } 4133 }
4134 preempt_enable_notrace(); 4134 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 4135 trace_recursion_clear(TRACE_INTERNAL_BIT);
4136 } 4136 }
4137 4137
4138 /* 4138 /*
4139 * Some archs only support passing ip and parent_ip. Even though 4139 * Some archs only support passing ip and parent_ip. Even though
4140 * the list function ignores the op parameter, we do not want any 4140 * the list function ignores the op parameter, we do not want any
4141 * C side effects, where a function is called without the caller 4141 * C side effects, where a function is called without the caller
4142 * sending a third parameter. 4142 * sending a third parameter.
4143 * Archs are to support both the regs and ftrace_ops at the same time. 4143 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs. 4144 * If they support ftrace_ops, it is assumed they support regs.
4145 * If callbacks want to use regs, they must either check for regs 4145 * If callbacks want to use regs, they must either check for regs
4146 * being NULL, or check ARCH_SUPPORTS_FTRACE_SAVE_REGS. 4146 * being NULL, or check ARCH_SUPPORTS_FTRACE_SAVE_REGS.
4147 * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full regs to be saved. 4147 * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still 4148 * An architecture can pass partial regs with ftrace_ops and still
4149 * set ARCH_SUPPORTS_FTRACE_OPS. 4149 * set ARCH_SUPPORTS_FTRACE_OPS.
4150 */ 4150 */
4151 #if ARCH_SUPPORTS_FTRACE_OPS 4151 #if ARCH_SUPPORTS_FTRACE_OPS
4152 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 4152 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4153 struct ftrace_ops *op, struct pt_regs *regs) 4153 struct ftrace_ops *op, struct pt_regs *regs)
4154 { 4154 {
4155 __ftrace_ops_list_func(ip, parent_ip, NULL, regs); 4155 __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
4156 } 4156 }
4157 #else 4157 #else
4158 static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) 4158 static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
4159 { 4159 {
4160 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); 4160 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
4161 } 4161 }
4162 #endif 4162 #endif
4163 4163
4164 static void clear_ftrace_swapper(void) 4164 static void clear_ftrace_swapper(void)
4165 { 4165 {
4166 struct task_struct *p; 4166 struct task_struct *p;
4167 int cpu; 4167 int cpu;
4168 4168
4169 get_online_cpus(); 4169 get_online_cpus();
4170 for_each_online_cpu(cpu) { 4170 for_each_online_cpu(cpu) {
4171 p = idle_task(cpu); 4171 p = idle_task(cpu);
4172 clear_tsk_trace_trace(p); 4172 clear_tsk_trace_trace(p);
4173 } 4173 }
4174 put_online_cpus(); 4174 put_online_cpus();
4175 } 4175 }
4176 4176
4177 static void set_ftrace_swapper(void) 4177 static void set_ftrace_swapper(void)
4178 { 4178 {
4179 struct task_struct *p; 4179 struct task_struct *p;
4180 int cpu; 4180 int cpu;
4181 4181
4182 get_online_cpus(); 4182 get_online_cpus();
4183 for_each_online_cpu(cpu) { 4183 for_each_online_cpu(cpu) {
4184 p = idle_task(cpu); 4184 p = idle_task(cpu);
4185 set_tsk_trace_trace(p); 4185 set_tsk_trace_trace(p);
4186 } 4186 }
4187 put_online_cpus(); 4187 put_online_cpus();
4188 } 4188 }
4189 4189
4190 static void clear_ftrace_pid(struct pid *pid) 4190 static void clear_ftrace_pid(struct pid *pid)
4191 { 4191 {
4192 struct task_struct *p; 4192 struct task_struct *p;
4193 4193
4194 rcu_read_lock(); 4194 rcu_read_lock();
4195 do_each_pid_task(pid, PIDTYPE_PID, p) { 4195 do_each_pid_task(pid, PIDTYPE_PID, p) {
4196 clear_tsk_trace_trace(p); 4196 clear_tsk_trace_trace(p);
4197 } while_each_pid_task(pid, PIDTYPE_PID, p); 4197 } while_each_pid_task(pid, PIDTYPE_PID, p);
4198 rcu_read_unlock(); 4198 rcu_read_unlock();
4199 4199
4200 put_pid(pid); 4200 put_pid(pid);
4201 } 4201 }
4202 4202
4203 static void set_ftrace_pid(struct pid *pid) 4203 static void set_ftrace_pid(struct pid *pid)
4204 { 4204 {
4205 struct task_struct *p; 4205 struct task_struct *p;
4206 4206
4207 rcu_read_lock(); 4207 rcu_read_lock();
4208 do_each_pid_task(pid, PIDTYPE_PID, p) { 4208 do_each_pid_task(pid, PIDTYPE_PID, p) {
4209 set_tsk_trace_trace(p); 4209 set_tsk_trace_trace(p);
4210 } while_each_pid_task(pid, PIDTYPE_PID, p); 4210 } while_each_pid_task(pid, PIDTYPE_PID, p);
4211 rcu_read_unlock(); 4211 rcu_read_unlock();
4212 } 4212 }
4213 4213
4214 static void clear_ftrace_pid_task(struct pid *pid) 4214 static void clear_ftrace_pid_task(struct pid *pid)
4215 { 4215 {
4216 if (pid == ftrace_swapper_pid) 4216 if (pid == ftrace_swapper_pid)
4217 clear_ftrace_swapper(); 4217 clear_ftrace_swapper();
4218 else 4218 else
4219 clear_ftrace_pid(pid); 4219 clear_ftrace_pid(pid);
4220 } 4220 }
4221 4221
4222 static void set_ftrace_pid_task(struct pid *pid) 4222 static void set_ftrace_pid_task(struct pid *pid)
4223 { 4223 {
4224 if (pid == ftrace_swapper_pid) 4224 if (pid == ftrace_swapper_pid)
4225 set_ftrace_swapper(); 4225 set_ftrace_swapper();
4226 else 4226 else
4227 set_ftrace_pid(pid); 4227 set_ftrace_pid(pid);
4228 } 4228 }
4229 4229
4230 static int ftrace_pid_add(int p) 4230 static int ftrace_pid_add(int p)
4231 { 4231 {
4232 struct pid *pid; 4232 struct pid *pid;
4233 struct ftrace_pid *fpid; 4233 struct ftrace_pid *fpid;
4234 int ret = -EINVAL; 4234 int ret = -EINVAL;
4235 4235
4236 mutex_lock(&ftrace_lock); 4236 mutex_lock(&ftrace_lock);
4237 4237
4238 if (!p) 4238 if (!p)
4239 pid = ftrace_swapper_pid; 4239 pid = ftrace_swapper_pid;
4240 else 4240 else
4241 pid = find_get_pid(p); 4241 pid = find_get_pid(p);
4242 4242
4243 if (!pid) 4243 if (!pid)
4244 goto out; 4244 goto out;
4245 4245
4246 ret = 0; 4246 ret = 0;
4247 4247
4248 list_for_each_entry(fpid, &ftrace_pids, list) 4248 list_for_each_entry(fpid, &ftrace_pids, list)
4249 if (fpid->pid == pid) 4249 if (fpid->pid == pid)
4250 goto out_put; 4250 goto out_put;
4251 4251
4252 ret = -ENOMEM; 4252 ret = -ENOMEM;
4253 4253
4254 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); 4254 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
4255 if (!fpid) 4255 if (!fpid)
4256 goto out_put; 4256 goto out_put;
4257 4257
4258 list_add(&fpid->list, &ftrace_pids); 4258 list_add(&fpid->list, &ftrace_pids);
4259 fpid->pid = pid; 4259 fpid->pid = pid;
4260 4260
4261 set_ftrace_pid_task(pid); 4261 set_ftrace_pid_task(pid);
4262 4262
4263 ftrace_update_pid_func(); 4263 ftrace_update_pid_func();
4264 ftrace_startup_enable(0); 4264 ftrace_startup_enable(0);
4265 4265
4266 mutex_unlock(&ftrace_lock); 4266 mutex_unlock(&ftrace_lock);
4267 return 0; 4267 return 0;
4268 4268
4269 out_put: 4269 out_put:
4270 if (pid != ftrace_swapper_pid) 4270 if (pid != ftrace_swapper_pid)
4271 put_pid(pid); 4271 put_pid(pid);
4272 4272
4273 out: 4273 out:
4274 mutex_unlock(&ftrace_lock); 4274 mutex_unlock(&ftrace_lock);
4275 return ret; 4275 return ret;
4276 } 4276 }
4277 4277
4278 static void ftrace_pid_reset(void) 4278 static void ftrace_pid_reset(void)
4279 { 4279 {
4280 struct ftrace_pid *fpid, *safe; 4280 struct ftrace_pid *fpid, *safe;
4281 4281
4282 mutex_lock(&ftrace_lock); 4282 mutex_lock(&ftrace_lock);
4283 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { 4283 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
4284 struct pid *pid = fpid->pid; 4284 struct pid *pid = fpid->pid;
4285 4285
4286 clear_ftrace_pid_task(pid); 4286 clear_ftrace_pid_task(pid);
4287 4287
4288 list_del(&fpid->list); 4288 list_del(&fpid->list);
4289 kfree(fpid); 4289 kfree(fpid);
4290 } 4290 }
4291 4291
4292 ftrace_update_pid_func(); 4292 ftrace_update_pid_func();
4293 ftrace_startup_enable(0); 4293 ftrace_startup_enable(0);
4294 4294
4295 mutex_unlock(&ftrace_lock); 4295 mutex_unlock(&ftrace_lock);
4296 } 4296 }
4297 4297
4298 static void *fpid_start(struct seq_file *m, loff_t *pos) 4298 static void *fpid_start(struct seq_file *m, loff_t *pos)
4299 { 4299 {
4300 mutex_lock(&ftrace_lock); 4300 mutex_lock(&ftrace_lock);
4301 4301
4302 if (list_empty(&ftrace_pids) && (!*pos)) 4302 if (list_empty(&ftrace_pids) && (!*pos))
4303 return (void *) 1; 4303 return (void *) 1;
4304 4304
4305 return seq_list_start(&ftrace_pids, *pos); 4305 return seq_list_start(&ftrace_pids, *pos);
4306 } 4306 }
4307 4307
4308 static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) 4308 static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
4309 { 4309 {
4310 if (v == (void *)1) 4310 if (v == (void *)1)
4311 return NULL; 4311 return NULL;
4312 4312
4313 return seq_list_next(v, &ftrace_pids, pos); 4313 return seq_list_next(v, &ftrace_pids, pos);
4314 } 4314 }
4315 4315
4316 static void fpid_stop(struct seq_file *m, void *p) 4316 static void fpid_stop(struct seq_file *m, void *p)
4317 { 4317 {
4318 mutex_unlock(&ftrace_lock); 4318 mutex_unlock(&ftrace_lock);
4319 } 4319 }
4320 4320
4321 static int fpid_show(struct seq_file *m, void *v) 4321 static int fpid_show(struct seq_file *m, void *v)
4322 { 4322 {
4323 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 4323 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
4324 4324
4325 if (v == (void *)1) { 4325 if (v == (void *)1) {
4326 seq_printf(m, "no pid\n"); 4326 seq_printf(m, "no pid\n");
4327 return 0; 4327 return 0;
4328 } 4328 }
4329 4329
4330 if (fpid->pid == ftrace_swapper_pid) 4330 if (fpid->pid == ftrace_swapper_pid)
4331 seq_printf(m, "swapper tasks\n"); 4331 seq_printf(m, "swapper tasks\n");
4332 else 4332 else
4333 seq_printf(m, "%u\n", pid_vnr(fpid->pid)); 4333 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
4334 4334
4335 return 0; 4335 return 0;
4336 } 4336 }
4337 4337
4338 static const struct seq_operations ftrace_pid_sops = { 4338 static const struct seq_operations ftrace_pid_sops = {
4339 .start = fpid_start, 4339 .start = fpid_start,
4340 .next = fpid_next, 4340 .next = fpid_next,
4341 .stop = fpid_stop, 4341 .stop = fpid_stop,
4342 .show = fpid_show, 4342 .show = fpid_show,
4343 }; 4343 };
4344 4344
4345 static int 4345 static int
4346 ftrace_pid_open(struct inode *inode, struct file *file) 4346 ftrace_pid_open(struct inode *inode, struct file *file)
4347 { 4347 {
4348 int ret = 0; 4348 int ret = 0;
4349 4349
4350 if ((file->f_mode & FMODE_WRITE) && 4350 if ((file->f_mode & FMODE_WRITE) &&
4351 (file->f_flags & O_TRUNC)) 4351 (file->f_flags & O_TRUNC))
4352 ftrace_pid_reset(); 4352 ftrace_pid_reset();
4353 4353
4354 if (file->f_mode & FMODE_READ) 4354 if (file->f_mode & FMODE_READ)
4355 ret = seq_open(file, &ftrace_pid_sops); 4355 ret = seq_open(file, &ftrace_pid_sops);
4356 4356
4357 return ret; 4357 return ret;
4358 } 4358 }
4359 4359
4360 static ssize_t 4360 static ssize_t
4361 ftrace_pid_write(struct file *filp, const char __user *ubuf, 4361 ftrace_pid_write(struct file *filp, const char __user *ubuf,
4362 size_t cnt, loff_t *ppos) 4362 size_t cnt, loff_t *ppos)
4363 { 4363 {
4364 char buf[64], *tmp; 4364 char buf[64], *tmp;
4365 long val; 4365 long val;
4366 int ret; 4366 int ret;
4367 4367
4368 if (cnt >= sizeof(buf)) 4368 if (cnt >= sizeof(buf))
4369 return -EINVAL; 4369 return -EINVAL;
4370 4370
4371 if (copy_from_user(&buf, ubuf, cnt)) 4371 if (copy_from_user(&buf, ubuf, cnt))
4372 return -EFAULT; 4372 return -EFAULT;
4373 4373
4374 buf[cnt] = 0; 4374 buf[cnt] = 0;
4375 4375
4376 /* 4376 /*
4377 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" 4377 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
4378 * to clear the filter quietly. 4378 * to clear the filter quietly.
4379 */ 4379 */
4380 tmp = strstrip(buf); 4380 tmp = strstrip(buf);
4381 if (strlen(tmp) == 0) 4381 if (strlen(tmp) == 0)
4382 return 1; 4382 return 1;
4383 4383
4384 ret = kstrtol(tmp, 10, &val); 4384 ret = kstrtol(tmp, 10, &val);
4385 if (ret < 0) 4385 if (ret < 0)
4386 return ret; 4386 return ret;
4387 4387
4388 ret = ftrace_pid_add(val); 4388 ret = ftrace_pid_add(val);
4389 4389
4390 return ret ? ret : cnt; 4390 return ret ? ret : cnt;
4391 } 4391 }
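ftrace_pid_write() is the usual small-debugfs-write pattern: bound the count, copy in, NUL-terminate, strip whitespace (an empty write clears the filter quietly), then parse. A userspace approximation; kstrtol(), unlike bare strtol(), rejects trailing junk, so the sketch checks the end pointer (parse_pid is an invented helper):

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int parse_pid(const char *ubuf, size_t cnt, long *val)
    {
            char buf[64], *tmp, *end;

            if (cnt >= sizeof(buf))
                    return -1;
            memcpy(buf, ubuf, cnt);
            buf[cnt] = 0;

            tmp = buf;                              /* strstrip() by hand */
            while (isspace((unsigned char)*tmp))
                    tmp++;
            end = tmp + strlen(tmp);
            while (end > tmp && isspace((unsigned char)end[-1]))
                    *--end = 0;
            if (!*tmp)
                    return 1;                       /* empty: clear filter */

            *val = strtol(tmp, &end, 10);
            return *end ? -1 : 0;                   /* kstrtol rejects junk */
    }

    int main(void)
    {
            long pid;

            if (parse_pid(" 1234\n", 6, &pid) == 0)
                    printf("pid %ld\n", pid);
            return 0;
    }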
4392 4392
4393 static int 4393 static int
4394 ftrace_pid_release(struct inode *inode, struct file *file) 4394 ftrace_pid_release(struct inode *inode, struct file *file)
4395 { 4395 {
4396 if (file->f_mode & FMODE_READ) 4396 if (file->f_mode & FMODE_READ)
4397 seq_release(inode, file); 4397 seq_release(inode, file);
4398 4398
4399 return 0; 4399 return 0;
4400 } 4400 }
4401 4401
4402 static const struct file_operations ftrace_pid_fops = { 4402 static const struct file_operations ftrace_pid_fops = {
4403 .open = ftrace_pid_open, 4403 .open = ftrace_pid_open,
4404 .write = ftrace_pid_write, 4404 .write = ftrace_pid_write,
4405 .read = seq_read, 4405 .read = seq_read,
4406 .llseek = seq_lseek, 4406 .llseek = seq_lseek,
4407 .release = ftrace_pid_release, 4407 .release = ftrace_pid_release,
4408 }; 4408 };
4409 4409
4410 static __init int ftrace_init_debugfs(void) 4410 static __init int ftrace_init_debugfs(void)
4411 { 4411 {
4412 struct dentry *d_tracer; 4412 struct dentry *d_tracer;
4413 4413
4414 d_tracer = tracing_init_dentry(); 4414 d_tracer = tracing_init_dentry();
4415 if (!d_tracer) 4415 if (!d_tracer)
4416 return 0; 4416 return 0;
4417 4417
4418 ftrace_init_dyn_debugfs(d_tracer); 4418 ftrace_init_dyn_debugfs(d_tracer);
4419 4419
4420 trace_create_file("set_ftrace_pid", 0644, d_tracer, 4420 trace_create_file("set_ftrace_pid", 0644, d_tracer,
4421 NULL, &ftrace_pid_fops); 4421 NULL, &ftrace_pid_fops);
4422 4422
4423 ftrace_profile_debugfs(d_tracer); 4423 ftrace_profile_debugfs(d_tracer);
4424 4424
4425 return 0; 4425 return 0;
4426 } 4426 }
4427 fs_initcall(ftrace_init_debugfs); 4427 fs_initcall(ftrace_init_debugfs);
4428 4428
4429 /** 4429 /**
4430 * ftrace_kill - kill ftrace 4430 * ftrace_kill - kill ftrace
4431 * 4431 *
4432 * This function should be used by panic code. It stops ftrace 4432 * This function should be used by panic code. It stops ftrace
4433 * but in a not-so-nice way. If you simply need to kill ftrace 4433 * but in a not-so-nice way. If you simply need to kill ftrace
4434 * from a non-atomic section, this is the function to use. 4434 * from a non-atomic section, this is the function to use.
4435 */ 4435 */
4436 void ftrace_kill(void) 4436 void ftrace_kill(void)
4437 { 4437 {
4438 ftrace_disabled = 1; 4438 ftrace_disabled = 1;
4439 ftrace_enabled = 0; 4439 ftrace_enabled = 0;
4440 clear_ftrace_function(); 4440 clear_ftrace_function();
4441 } 4441 }
4442 4442
4443 /** 4443 /**
4444 * ftrace_is_dead - Test if ftrace is dead or not. 4444 * ftrace_is_dead - Test if ftrace is dead or not.
4445 */ 4445 */
4446 int ftrace_is_dead(void) 4446 int ftrace_is_dead(void)
4447 { 4447 {
4448 return ftrace_disabled; 4448 return ftrace_disabled;
4449 } 4449 }
4450 4450
4451 /** 4451 /**
4452 * register_ftrace_function - register a function for profiling 4452 * register_ftrace_function - register a function for profiling
4453 * @ops: ops structure that holds the function for profiling. 4453 * @ops: ops structure that holds the function for profiling.
4454 * 4454 *
4455 * Register a function to be called by all functions in the 4455 * Register a function to be called by all functions in the
4456 * kernel. 4456 * kernel.
4457 * 4457 *
4458 * Note: @ops->func and all the functions it calls must be labeled 4458 * Note: @ops->func and all the functions it calls must be labeled
4459 * with "notrace", otherwise it will go into a 4459 * with "notrace", otherwise it will go into a
4460 * recursive loop. 4460 * recursive loop.
4461 */ 4461 */
4462 int register_ftrace_function(struct ftrace_ops *ops) 4462 int register_ftrace_function(struct ftrace_ops *ops)
4463 { 4463 {
4464 int ret = -1; 4464 int ret = -1;
4465 4465
4466 mutex_lock(&ftrace_lock); 4466 mutex_lock(&ftrace_lock);
4467 4467
4468 ret = __register_ftrace_function(ops); 4468 ret = __register_ftrace_function(ops);
4469 if (!ret) 4469 if (!ret)
4470 ret = ftrace_startup(ops, 0); 4470 ret = ftrace_startup(ops, 0);
4471 4471
4472 mutex_unlock(&ftrace_lock); 4472 mutex_unlock(&ftrace_lock);
4473 4473
4474 return ret; 4474 return ret;
4475 } 4475 }
4476 EXPORT_SYMBOL_GPL(register_ftrace_function); 4476 EXPORT_SYMBOL_GPL(register_ftrace_function);
4477 4477
4478 /** 4478 /**
4479 * unregister_ftrace_function - unregister a function for profiling. 4479 * unregister_ftrace_function - unregister a function for profiling.
4480 * @ops: ops structure that holds the function to unregister 4480 * @ops: ops structure that holds the function to unregister
4481 * 4481 *
4482 * Unregister a function that was added to be called by ftrace profiling. 4482 * Unregister a function that was added to be called by ftrace profiling.
4483 */ 4483 */
4484 int unregister_ftrace_function(struct ftrace_ops *ops) 4484 int unregister_ftrace_function(struct ftrace_ops *ops)
4485 { 4485 {
4486 int ret; 4486 int ret;
4487 4487
4488 mutex_lock(&ftrace_lock); 4488 mutex_lock(&ftrace_lock);
4489 ret = __unregister_ftrace_function(ops); 4489 ret = __unregister_ftrace_function(ops);
4490 if (!ret) 4490 if (!ret)
4491 ftrace_shutdown(ops, 0); 4491 ftrace_shutdown(ops, 0);
4492 mutex_unlock(&ftrace_lock); 4492 mutex_unlock(&ftrace_lock);
4493 4493
4494 return ret; 4494 return ret;
4495 } 4495 }
4496 EXPORT_SYMBOL_GPL(unregister_ftrace_function); 4496 EXPORT_SYMBOL_GPL(unregister_ftrace_function);
4497 4497
4498 int 4498 int
4499 ftrace_enable_sysctl(struct ctl_table *table, int write, 4499 ftrace_enable_sysctl(struct ctl_table *table, int write,
4500 void __user *buffer, size_t *lenp, 4500 void __user *buffer, size_t *lenp,
4501 loff_t *ppos) 4501 loff_t *ppos)
4502 { 4502 {
4503 int ret = -ENODEV; 4503 int ret = -ENODEV;
4504 4504
4505 mutex_lock(&ftrace_lock); 4505 mutex_lock(&ftrace_lock);
4506 4506
4507 if (unlikely(ftrace_disabled)) 4507 if (unlikely(ftrace_disabled))
4508 goto out; 4508 goto out;
4509 4509
4510 ret = proc_dointvec(table, write, buffer, lenp, ppos); 4510 ret = proc_dointvec(table, write, buffer, lenp, ppos);
4511 4511
4512 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 4512 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
4513 goto out; 4513 goto out;
4514 4514
4515 last_ftrace_enabled = !!ftrace_enabled; 4515 last_ftrace_enabled = !!ftrace_enabled;
4516 4516
4517 if (ftrace_enabled) { 4517 if (ftrace_enabled) {
4518 4518
4519 ftrace_startup_sysctl(); 4519 ftrace_startup_sysctl();
4520 4520
4521 /* we are starting ftrace again */ 4521 /* we are starting ftrace again */
4522 if (ftrace_ops_list != &ftrace_list_end) { 4522 if (ftrace_ops_list != &ftrace_list_end) {
4523 if (ftrace_ops_list->next == &ftrace_list_end) 4523 if (ftrace_ops_list->next == &ftrace_list_end)
4524 ftrace_trace_function = ftrace_ops_list->func; 4524 ftrace_trace_function = ftrace_ops_list->func;
4525 else 4525 else
4526 ftrace_trace_function = ftrace_ops_list_func; 4526 ftrace_trace_function = ftrace_ops_list_func;
4527 } 4527 }
4528 4528
4529 } else { 4529 } else {
4530 /* stopping ftrace calls (just send to ftrace_stub) */ 4530 /* stopping ftrace calls (just send to ftrace_stub) */
4531 ftrace_trace_function = ftrace_stub; 4531 ftrace_trace_function = ftrace_stub;
4532 4532
4533 ftrace_shutdown_sysctl(); 4533 ftrace_shutdown_sysctl();
4534 } 4534 }
4535 4535
4536 out: 4536 out:
4537 mutex_unlock(&ftrace_lock); 4537 mutex_unlock(&ftrace_lock);
4538 return ret; 4538 return ret;
4539 } 4539 }
4540 4540
4541 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 4541 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
4542 4542
4543 static int ftrace_graph_active; 4543 static int ftrace_graph_active;
4544 static struct notifier_block ftrace_suspend_notifier; 4544 static struct notifier_block ftrace_suspend_notifier;
4545 4545
4546 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 4546 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4547 { 4547 {
4548 return 0; 4548 return 0;
4549 } 4549 }
4550 4550
4551 /* The callbacks that hook a function */ 4551 /* The callbacks that hook a function */
4552 trace_func_graph_ret_t ftrace_graph_return = 4552 trace_func_graph_ret_t ftrace_graph_return =
4553 (trace_func_graph_ret_t)ftrace_stub; 4553 (trace_func_graph_ret_t)ftrace_stub;
4554 trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; 4554 trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
4555 4555
4556 /* Try to assign a return stack to each of FTRACE_RETSTACK_ALLOC_SIZE tasks. */ 4556 /* Try to assign a return stack to each of FTRACE_RETSTACK_ALLOC_SIZE tasks. */
4557 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) 4557 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
4558 { 4558 {
4559 int i; 4559 int i;
4560 int ret = 0; 4560 int ret = 0;
4561 unsigned long flags; 4561 unsigned long flags;
4562 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; 4562 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
4563 struct task_struct *g, *t; 4563 struct task_struct *g, *t;
4564 4564
4565 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { 4565 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
4566 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH 4566 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
4567 * sizeof(struct ftrace_ret_stack), 4567 * sizeof(struct ftrace_ret_stack),
4568 GFP_KERNEL); 4568 GFP_KERNEL);
4569 if (!ret_stack_list[i]) { 4569 if (!ret_stack_list[i]) {
4570 start = 0; 4570 start = 0;
4571 end = i; 4571 end = i;
4572 ret = -ENOMEM; 4572 ret = -ENOMEM;
4573 goto free; 4573 goto free;
4574 } 4574 }
4575 } 4575 }
4576 4576
4577 read_lock_irqsave(&tasklist_lock, flags); 4577 read_lock_irqsave(&tasklist_lock, flags);
4578 do_each_thread(g, t) { 4578 do_each_thread(g, t) {
4579 if (start == end) { 4579 if (start == end) {
4580 ret = -EAGAIN; 4580 ret = -EAGAIN;
4581 goto unlock; 4581 goto unlock;
4582 } 4582 }
4583 4583
4584 if (t->ret_stack == NULL) { 4584 if (t->ret_stack == NULL) {
4585 atomic_set(&t->tracing_graph_pause, 0); 4585 atomic_set(&t->tracing_graph_pause, 0);
4586 atomic_set(&t->trace_overrun, 0); 4586 atomic_set(&t->trace_overrun, 0);
4587 t->curr_ret_stack = -1; 4587 t->curr_ret_stack = -1;
4588 /* Make sure the tasks see the -1 first: */ 4588 /* Make sure the tasks see the -1 first: */
4589 smp_wmb(); 4589 smp_wmb();
4590 t->ret_stack = ret_stack_list[start++]; 4590 t->ret_stack = ret_stack_list[start++];
4591 } 4591 }
4592 } while_each_thread(g, t); 4592 } while_each_thread(g, t);
4593 4593
4594 unlock: 4594 unlock:
4595 read_unlock_irqrestore(&tasklist_lock, flags); 4595 read_unlock_irqrestore(&tasklist_lock, flags);
4596 free: 4596 free:
4597 for (i = start; i < end; i++) 4597 for (i = start; i < end; i++)
4598 kfree(ret_stack_list[i]); 4598 kfree(ret_stack_list[i]);
4599 return ret; 4599 return ret;
4600 } 4600 }
4601 4601
4602 static void 4602 static void
4603 ftrace_graph_probe_sched_switch(void *ignore, 4603 ftrace_graph_probe_sched_switch(void *ignore,
4604 struct task_struct *prev, struct task_struct *next) 4604 struct task_struct *prev, struct task_struct *next)
4605 { 4605 {
4606 unsigned long long timestamp; 4606 unsigned long long timestamp;
4607 int index; 4607 int index;
4608 4608
4609 /* 4609 /*
4610 * Does the user want to count the time a function was asleep? 4610 * Does the user want to count the time a function was asleep?
4611 * If so, do not update the time stamps. 4611 * If so, do not update the time stamps.
4612 */ 4612 */
4613 if (trace_flags & TRACE_ITER_SLEEP_TIME) 4613 if (trace_flags & TRACE_ITER_SLEEP_TIME)
4614 return; 4614 return;
4615 4615
4616 timestamp = trace_clock_local(); 4616 timestamp = trace_clock_local();
4617 4617
4618 prev->ftrace_timestamp = timestamp; 4618 prev->ftrace_timestamp = timestamp;
4619 4619
4620 /* only process tasks that we timestamped */ 4620 /* only process tasks that we timestamped */
4621 if (!next->ftrace_timestamp) 4621 if (!next->ftrace_timestamp)
4622 return; 4622 return;
4623 4623
4624 /* 4624 /*
4625 * Update all the counters in next to make up for the 4625 * Update all the counters in next to make up for the
4626 * time next was sleeping. 4626 * time next was sleeping.
4627 */ 4627 */
4628 timestamp -= next->ftrace_timestamp; 4628 timestamp -= next->ftrace_timestamp;
4629 4629
4630 for (index = next->curr_ret_stack; index >= 0; index--) 4630 for (index = next->curr_ret_stack; index >= 0; index--)
4631 next->ret_stack[index].calltime += timestamp; 4631 next->ret_stack[index].calltime += timestamp;
4632 } 4632 }
4633 4633
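The adjustment at the bottom of ftrace_graph_probe_sched_switch() is easiest to see with numbers: adding the slept interval to calltime pushes the recorded start later, so the eventual rettime - calltime reports on-CPU time only. A tiny sketch with made-up timestamps:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long calltime = 100;  /* function entered           */
            unsigned long long slept = 30;      /* switched out 120, back 150 */
            unsigned long long rettime = 160;   /* function returned          */

            printf("wall-clock: %llu\n", rettime - calltime);   /* 60 */
            calltime += slept;  /* ret_stack[index].calltime += timestamp */
            printf("on-CPU:     %llu\n", rettime - calltime);   /* 30 */
            return 0;
    }
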
4634 /* Allocate a return stack for each task */ 4634 /* Allocate a return stack for each task */
4635 static int start_graph_tracing(void) 4635 static int start_graph_tracing(void)
4636 { 4636 {
4637 struct ftrace_ret_stack **ret_stack_list; 4637 struct ftrace_ret_stack **ret_stack_list;
4638 int ret, cpu; 4638 int ret, cpu;
4639 4639
4640 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * 4640 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
4641 sizeof(struct ftrace_ret_stack *), 4641 sizeof(struct ftrace_ret_stack *),
4642 GFP_KERNEL); 4642 GFP_KERNEL);
4643 4643
4644 if (!ret_stack_list) 4644 if (!ret_stack_list)
4645 return -ENOMEM; 4645 return -ENOMEM;
4646 4646
4647 /* The boot CPU init_task->ret_stack will never be freed */ 4647 /* The boot CPU init_task->ret_stack will never be freed */
4648 for_each_online_cpu(cpu) { 4648 for_each_online_cpu(cpu) {
4649 if (!idle_task(cpu)->ret_stack) 4649 if (!idle_task(cpu)->ret_stack)
4650 ftrace_graph_init_idle_task(idle_task(cpu), cpu); 4650 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
4651 } 4651 }
4652 4652
4653 do { 4653 do {
4654 ret = alloc_retstack_tasklist(ret_stack_list); 4654 ret = alloc_retstack_tasklist(ret_stack_list);
4655 } while (ret == -EAGAIN); 4655 } while (ret == -EAGAIN);
4656 4656
4657 if (!ret) { 4657 if (!ret) {
4658 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4658 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
4659 if (ret) 4659 if (ret)
4660 pr_info("ftrace_graph: Couldn't activate tracepoint" 4660 pr_info("ftrace_graph: Couldn't activate tracepoint"
4661 " probe to kernel_sched_switch\n"); 4661 " probe to kernel_sched_switch\n");
4662 } 4662 }
4663 4663
4664 kfree(ret_stack_list); 4664 kfree(ret_stack_list);
4665 return ret; 4665 return ret;
4666 } 4666 }
4667 4667
4668 /* 4668 /*
4669 * Hibernation protection. 4669 * Hibernation protection.
4670 * The state of the current task is too unstable during 4670 * The state of the current task is too unstable during
4671 * suspend/restore to disk. We want to protect against that. 4671 * suspend/restore to disk. We want to protect against that.
4672 */ 4672 */
4673 static int 4673 static int
4674 ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, 4674 ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
4675 void *unused) 4675 void *unused)
4676 { 4676 {
4677 switch (state) { 4677 switch (state) {
4678 case PM_HIBERNATION_PREPARE: 4678 case PM_HIBERNATION_PREPARE:
4679 pause_graph_tracing(); 4679 pause_graph_tracing();
4680 break; 4680 break;
4681 4681
4682 case PM_POST_HIBERNATION: 4682 case PM_POST_HIBERNATION:
4683 unpause_graph_tracing(); 4683 unpause_graph_tracing();
4684 break; 4684 break;
4685 } 4685 }
4686 return NOTIFY_DONE; 4686 return NOTIFY_DONE;
4687 } 4687 }
4688 4688
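Stripped of the PM machinery, the notifier above is just a callback switched on an event code that brackets the unsafe window. A stand-alone model of the dispatch; constants and state here are illustrative:

    #include <stdio.h>

    #define NOTIFY_DONE 0
    enum { PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION };

    static int tracing_paused;

    static int suspend_call(unsigned long state)
    {
            switch (state) {
            case PM_HIBERNATION_PREPARE:
                    tracing_paused = 1;     /* pause_graph_tracing()   */
                    break;
            case PM_POST_HIBERNATION:
                    tracing_paused = 0;     /* unpause_graph_tracing() */
                    break;
            }
            return NOTIFY_DONE;
    }

    int main(void)
    {
            suspend_call(PM_HIBERNATION_PREPARE);
            printf("paused=%d\n", tracing_paused);  /* 1 */
            suspend_call(PM_POST_HIBERNATION);
            printf("paused=%d\n", tracing_paused);  /* 0 */
            return 0;
    }
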
4689 int register_ftrace_graph(trace_func_graph_ret_t retfunc, 4689 int register_ftrace_graph(trace_func_graph_ret_t retfunc,
4690 trace_func_graph_ent_t entryfunc) 4690 trace_func_graph_ent_t entryfunc)
4691 { 4691 {
4692 int ret = 0; 4692 int ret = 0;
4693 4693
4694 mutex_lock(&ftrace_lock); 4694 mutex_lock(&ftrace_lock);
4695 4695
4696 /* we currently allow only one tracer registered at a time */ 4696 /* we currently allow only one tracer registered at a time */
4697 if (ftrace_graph_active) { 4697 if (ftrace_graph_active) {
4698 ret = -EBUSY; 4698 ret = -EBUSY;
4699 goto out; 4699 goto out;
4700 } 4700 }
4701 4701
4702 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; 4702 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
4703 register_pm_notifier(&ftrace_suspend_notifier); 4703 register_pm_notifier(&ftrace_suspend_notifier);
4704 4704
4705 ftrace_graph_active++; 4705 ftrace_graph_active++;
4706 ret = start_graph_tracing(); 4706 ret = start_graph_tracing();
4707 if (ret) { 4707 if (ret) {
4708 ftrace_graph_active--; 4708 ftrace_graph_active--;
4709 goto out; 4709 goto out;
4710 } 4710 }
4711 4711
4712 ftrace_graph_return = retfunc; 4712 ftrace_graph_return = retfunc;
4713 ftrace_graph_entry = entryfunc; 4713 ftrace_graph_entry = entryfunc;
4714 4714
4715 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 4715 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
4716 4716
4717 out: 4717 out:
4718 mutex_unlock(&ftrace_lock); 4718 mutex_unlock(&ftrace_lock);
4719 return ret; 4719 return ret;
4720 } 4720 }
4721 4721
4722 void unregister_ftrace_graph(void) 4722 void unregister_ftrace_graph(void)
4723 { 4723 {
4724 mutex_lock(&ftrace_lock); 4724 mutex_lock(&ftrace_lock);
4725 4725
4726 if (unlikely(!ftrace_graph_active)) 4726 if (unlikely(!ftrace_graph_active))
4727 goto out; 4727 goto out;
4728 4728
4729 ftrace_graph_active--; 4729 ftrace_graph_active--;
4730 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4730 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
4731 ftrace_graph_entry = ftrace_graph_entry_stub; 4731 ftrace_graph_entry = ftrace_graph_entry_stub;
4732 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 4732 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
4733 unregister_pm_notifier(&ftrace_suspend_notifier); 4733 unregister_pm_notifier(&ftrace_suspend_notifier);
4734 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4734 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
4735 4735
4736 out: 4736 out:
4737 mutex_unlock(&ftrace_lock); 4737 mutex_unlock(&ftrace_lock);
4738 } 4738 }
4739 4739
4740 static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); 4740 static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
4741 4741
4742 static void 4742 static void
4743 graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) 4743 graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
4744 { 4744 {
4745 atomic_set(&t->tracing_graph_pause, 0); 4745 atomic_set(&t->tracing_graph_pause, 0);
4746 atomic_set(&t->trace_overrun, 0); 4746 atomic_set(&t->trace_overrun, 0);
4747 t->ftrace_timestamp = 0; 4747 t->ftrace_timestamp = 0;
4748 /* make curr_ret_stack visible before we add the ret_stack */ 4748 /* make curr_ret_stack visible before we add the ret_stack */
4749 smp_wmb(); 4749 smp_wmb();
4750 t->ret_stack = ret_stack; 4750 t->ret_stack = ret_stack;
4751 } 4751 }
4752 4752
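graph_init_task() and the assignment loop in alloc_retstack_tasklist() both follow the initialize-then-publish idiom: every field is written before the barrier, and only then does ret_stack become non-NULL. A user-space sketch of the same guarantee, with C11 release/acquire standing in for smp_wmb() and the matching read barrier:

    #include <stdatomic.h>
    #include <stddef.h>

    struct task {
            int curr_ret_stack;
            _Atomic(void *) ret_stack;
    };

    static void publish(struct task *t, void *stack)
    {
            t->curr_ret_stack = -1;         /* initialize first...    */
            atomic_store_explicit(&t->ret_stack, stack,
                                  memory_order_release);  /* ...then publish */
    }

    static int reader(const struct task *t)
    {
            void *s = atomic_load_explicit(&t->ret_stack,
                                           memory_order_acquire);

            if (!s)
                    return 0;
            /* A reader that saw s != NULL also sees curr_ret_stack == -1. */
            return t->curr_ret_stack;
    }
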
4753 /* 4753 /*
4754 * Allocate a return stack for the idle task. May be the first 4754 * Allocate a return stack for the idle task. May be the first
4755 * time through, or it may be done when a CPU comes online via hotplug. 4755 * time through, or it may be done when a CPU comes online via hotplug.
4756 */ 4756 */
4757 void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) 4757 void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
4758 { 4758 {
4759 t->curr_ret_stack = -1; 4759 t->curr_ret_stack = -1;
4760 /* 4760 /*
4761 * The idle task has no parent, it either has its own 4761 * The idle task has no parent, it either has its own
4762 * stack or no stack at all. 4762 * stack or no stack at all.
4763 */ 4763 */
4764 if (t->ret_stack) 4764 if (t->ret_stack)
4765 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); 4765 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
4766 4766
4767 if (ftrace_graph_active) { 4767 if (ftrace_graph_active) {
4768 struct ftrace_ret_stack *ret_stack; 4768 struct ftrace_ret_stack *ret_stack;
4769 4769
4770 ret_stack = per_cpu(idle_ret_stack, cpu); 4770 ret_stack = per_cpu(idle_ret_stack, cpu);
4771 if (!ret_stack) { 4771 if (!ret_stack) {
4772 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 4772 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
4773 * sizeof(struct ftrace_ret_stack), 4773 * sizeof(struct ftrace_ret_stack),
4774 GFP_KERNEL); 4774 GFP_KERNEL);
4775 if (!ret_stack) 4775 if (!ret_stack)
4776 return; 4776 return;
4777 per_cpu(idle_ret_stack, cpu) = ret_stack; 4777 per_cpu(idle_ret_stack, cpu) = ret_stack;
4778 } 4778 }
4779 graph_init_task(t, ret_stack); 4779 graph_init_task(t, ret_stack);
4780 } 4780 }
4781 } 4781 }
4782 4782
4783 /* Allocate a return stack for newly created task */ 4783 /* Allocate a return stack for newly created task */
4784 void ftrace_graph_init_task(struct task_struct *t) 4784 void ftrace_graph_init_task(struct task_struct *t)
4785 { 4785 {
4786 /* Make sure we do not use the parent ret_stack */ 4786 /* Make sure we do not use the parent ret_stack */
4787 t->ret_stack = NULL; 4787 t->ret_stack = NULL;
4788 t->curr_ret_stack = -1; 4788 t->curr_ret_stack = -1;
4789 4789
4790 if (ftrace_graph_active) { 4790 if (ftrace_graph_active) {
4791 struct ftrace_ret_stack *ret_stack; 4791 struct ftrace_ret_stack *ret_stack;
4792 4792
4793 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 4793 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
4794 * sizeof(struct ftrace_ret_stack), 4794 * sizeof(struct ftrace_ret_stack),
4795 GFP_KERNEL); 4795 GFP_KERNEL);
4796 if (!ret_stack) 4796 if (!ret_stack)
4797 return; 4797 return;
4798 graph_init_task(t, ret_stack); 4798 graph_init_task(t, ret_stack);
4799 } 4799 }
4800 } 4800 }
4801 4801
4802 void ftrace_graph_exit_task(struct task_struct *t) 4802 void ftrace_graph_exit_task(struct task_struct *t)
4803 { 4803 {
4804 struct ftrace_ret_stack *ret_stack = t->ret_stack; 4804 struct ftrace_ret_stack *ret_stack = t->ret_stack;
4805 4805
4806 t->ret_stack = NULL; 4806 t->ret_stack = NULL;
4807 /* NULL must become visible to IRQs before we free it: */ 4807 /* NULL must become visible to IRQs before we free it: */
4808 barrier(); 4808 barrier();
4809 4809
4810 kfree(ret_stack); 4810 kfree(ret_stack);
4811 } 4811 }
4812 4812
4813 void ftrace_graph_stop(void) 4813 void ftrace_graph_stop(void)
4814 { 4814 {
4815 ftrace_stop(); 4815 ftrace_stop();
4816 } 4816 }
4817 #endif 4817 #endif
4818 4818
1 /* 1 /*
2 * Resizable virtual memory filesystem for Linux. 2 * Resizable virtual memory filesystem for Linux.
3 * 3 *
4 * Copyright (C) 2000 Linus Torvalds. 4 * Copyright (C) 2000 Linus Torvalds.
5 * 2000 Transmeta Corp. 5 * 2000 Transmeta Corp.
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2011 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc. 10 * Copyright (C) 2011 Google Inc.
11 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
12 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
13 * 13 *
14 * Extended attribute support for tmpfs: 14 * Extended attribute support for tmpfs:
15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17 * 17 *
18 * tiny-shmem: 18 * tiny-shmem:
19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> 19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20 * 20 *
21 * This file is released under the GPL. 21 * This file is released under the GPL.
22 */ 22 */
23 23
24 #include <linux/fs.h> 24 #include <linux/fs.h>
25 #include <linux/init.h> 25 #include <linux/init.h>
26 #include <linux/vfs.h> 26 #include <linux/vfs.h>
27 #include <linux/mount.h> 27 #include <linux/mount.h>
28 #include <linux/pagemap.h> 28 #include <linux/pagemap.h>
29 #include <linux/file.h> 29 #include <linux/file.h>
30 #include <linux/mm.h> 30 #include <linux/mm.h>
31 #include <linux/export.h> 31 #include <linux/export.h>
32 #include <linux/swap.h> 32 #include <linux/swap.h>
33 33
34 static struct vfsmount *shm_mnt; 34 static struct vfsmount *shm_mnt;
35 35
36 #ifdef CONFIG_SHMEM 36 #ifdef CONFIG_SHMEM
37 /* 37 /*
38 * This virtual memory filesystem is heavily based on the ramfs. It 38 * This virtual memory filesystem is heavily based on the ramfs. It
39 * extends ramfs by the ability to use swap and honor resource limits 39 * extends ramfs by the ability to use swap and honor resource limits
40 * which makes it a completely usable filesystem. 40 * which makes it a completely usable filesystem.
41 */ 41 */
42 42
43 #include <linux/xattr.h> 43 #include <linux/xattr.h>
44 #include <linux/exportfs.h> 44 #include <linux/exportfs.h>
45 #include <linux/posix_acl.h> 45 #include <linux/posix_acl.h>
46 #include <linux/generic_acl.h> 46 #include <linux/generic_acl.h>
47 #include <linux/mman.h> 47 #include <linux/mman.h>
48 #include <linux/string.h> 48 #include <linux/string.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 #include <linux/backing-dev.h> 50 #include <linux/backing-dev.h>
51 #include <linux/shmem_fs.h> 51 #include <linux/shmem_fs.h>
52 #include <linux/writeback.h> 52 #include <linux/writeback.h>
53 #include <linux/blkdev.h> 53 #include <linux/blkdev.h>
54 #include <linux/pagevec.h> 54 #include <linux/pagevec.h>
55 #include <linux/percpu_counter.h> 55 #include <linux/percpu_counter.h>
56 #include <linux/falloc.h> 56 #include <linux/falloc.h>
57 #include <linux/splice.h> 57 #include <linux/splice.h>
58 #include <linux/security.h> 58 #include <linux/security.h>
59 #include <linux/swapops.h> 59 #include <linux/swapops.h>
60 #include <linux/mempolicy.h> 60 #include <linux/mempolicy.h>
61 #include <linux/namei.h> 61 #include <linux/namei.h>
62 #include <linux/ctype.h> 62 #include <linux/ctype.h>
63 #include <linux/migrate.h> 63 #include <linux/migrate.h>
64 #include <linux/highmem.h> 64 #include <linux/highmem.h>
65 #include <linux/seq_file.h> 65 #include <linux/seq_file.h>
66 #include <linux/magic.h> 66 #include <linux/magic.h>
67 67
68 #include <asm/uaccess.h> 68 #include <asm/uaccess.h>
69 #include <asm/pgtable.h> 69 #include <asm/pgtable.h>
70 70
71 #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 71 #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
72 #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 72 #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
73 73
74 /* Pretend that each entry is of this size in directory's i_size */ 74 /* Pretend that each entry is of this size in directory's i_size */
75 #define BOGO_DIRENT_SIZE 20 75 #define BOGO_DIRENT_SIZE 20
76 76
77 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ 77 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
78 #define SHORT_SYMLINK_LEN 128 78 #define SHORT_SYMLINK_LEN 128
79 79
80 /* 80 /*
81 * shmem_fallocate and shmem_writepage communicate via inode->i_private 81 * shmem_fallocate and shmem_writepage communicate via inode->i_private
82 * (with i_mutex making sure that it has only one user at a time): 82 * (with i_mutex making sure that it has only one user at a time):
83 * we would prefer not to enlarge the shmem inode just for that. 83 * we would prefer not to enlarge the shmem inode just for that.
84 */ 84 */
85 struct shmem_falloc { 85 struct shmem_falloc {
86 pgoff_t start; /* start of range currently being fallocated */ 86 pgoff_t start; /* start of range currently being fallocated */
87 pgoff_t next; /* the next page offset to be fallocated */ 87 pgoff_t next; /* the next page offset to be fallocated */
88 pgoff_t nr_falloced; /* how many new pages have been fallocated */ 88 pgoff_t nr_falloced; /* how many new pages have been fallocated */
89 pgoff_t nr_unswapped; /* how often writepage refused to swap out */ 89 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
90 }; 90 };
91 91
92 /* Flag allocation requirements to shmem_getpage */ 92 /* Flag allocation requirements to shmem_getpage */
93 enum sgp_type { 93 enum sgp_type {
94 SGP_READ, /* don't exceed i_size, don't allocate page */ 94 SGP_READ, /* don't exceed i_size, don't allocate page */
95 SGP_CACHE, /* don't exceed i_size, may allocate page */ 95 SGP_CACHE, /* don't exceed i_size, may allocate page */
96 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 96 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
97 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ 97 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
98 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ 98 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
99 }; 99 };
100 100
101 #ifdef CONFIG_TMPFS 101 #ifdef CONFIG_TMPFS
102 static unsigned long shmem_default_max_blocks(void) 102 static unsigned long shmem_default_max_blocks(void)
103 { 103 {
104 return totalram_pages / 2; 104 return totalram_pages / 2;
105 } 105 }
106 106
107 static unsigned long shmem_default_max_inodes(void) 107 static unsigned long shmem_default_max_inodes(void)
108 { 108 {
109 return min(totalram_pages - totalhigh_pages, totalram_pages / 2); 109 return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
110 } 110 }
111 #endif 111 #endif
112 112
113 static bool shmem_should_replace_page(struct page *page, gfp_t gfp); 113 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
114 static int shmem_replace_page(struct page **pagep, gfp_t gfp, 114 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
115 struct shmem_inode_info *info, pgoff_t index); 115 struct shmem_inode_info *info, pgoff_t index);
116 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 116 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
117 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 117 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
118 118
119 static inline int shmem_getpage(struct inode *inode, pgoff_t index, 119 static inline int shmem_getpage(struct inode *inode, pgoff_t index,
120 struct page **pagep, enum sgp_type sgp, int *fault_type) 120 struct page **pagep, enum sgp_type sgp, int *fault_type)
121 { 121 {
122 return shmem_getpage_gfp(inode, index, pagep, sgp, 122 return shmem_getpage_gfp(inode, index, pagep, sgp,
123 mapping_gfp_mask(inode->i_mapping), fault_type); 123 mapping_gfp_mask(inode->i_mapping), fault_type);
124 } 124 }
125 125
126 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 126 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
127 { 127 {
128 return sb->s_fs_info; 128 return sb->s_fs_info;
129 } 129 }
130 130
131 /* 131 /*
132 * shmem_file_setup pre-accounts the whole fixed size of a VM object, 132 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
133 * for shared memory and for shared anonymous (/dev/zero) mappings 133 * for shared memory and for shared anonymous (/dev/zero) mappings
134 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), 134 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
135 * consistent with the pre-accounting of private mappings ... 135 * consistent with the pre-accounting of private mappings ...
136 */ 136 */
137 static inline int shmem_acct_size(unsigned long flags, loff_t size) 137 static inline int shmem_acct_size(unsigned long flags, loff_t size)
138 { 138 {
139 return (flags & VM_NORESERVE) ? 139 return (flags & VM_NORESERVE) ?
140 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); 140 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
141 } 141 }
142 142
143 static inline void shmem_unacct_size(unsigned long flags, loff_t size) 143 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
144 { 144 {
145 if (!(flags & VM_NORESERVE)) 145 if (!(flags & VM_NORESERVE))
146 vm_unacct_memory(VM_ACCT(size)); 146 vm_unacct_memory(VM_ACCT(size));
147 } 147 }
148 148
149 /* 149 /*
150 * ... whereas tmpfs objects are accounted incrementally as 150 * ... whereas tmpfs objects are accounted incrementally as
151 * pages are allocated, in order to allow huge sparse files. 151 * pages are allocated, in order to allow huge sparse files.
152 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, 152 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
153 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 153 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
154 */ 154 */
155 static inline int shmem_acct_block(unsigned long flags) 155 static inline int shmem_acct_block(unsigned long flags)
156 { 156 {
157 return (flags & VM_NORESERVE) ? 157 return (flags & VM_NORESERVE) ?
158 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; 158 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
159 } 159 }
160 160
161 static inline void shmem_unacct_blocks(unsigned long flags, long pages) 161 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
162 { 162 {
163 if (flags & VM_NORESERVE) 163 if (flags & VM_NORESERVE)
164 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); 164 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
165 } 165 }
166 166
167 static const struct super_operations shmem_ops; 167 static const struct super_operations shmem_ops;
168 static const struct address_space_operations shmem_aops; 168 static const struct address_space_operations shmem_aops;
169 static const struct file_operations shmem_file_operations; 169 static const struct file_operations shmem_file_operations;
170 static const struct inode_operations shmem_inode_operations; 170 static const struct inode_operations shmem_inode_operations;
171 static const struct inode_operations shmem_dir_inode_operations; 171 static const struct inode_operations shmem_dir_inode_operations;
172 static const struct inode_operations shmem_special_inode_operations; 172 static const struct inode_operations shmem_special_inode_operations;
173 static const struct vm_operations_struct shmem_vm_ops; 173 static const struct vm_operations_struct shmem_vm_ops;
174 174
175 static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 175 static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
176 .ra_pages = 0, /* No readahead */ 176 .ra_pages = 0, /* No readahead */
177 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 177 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
178 }; 178 };
179 179
180 static LIST_HEAD(shmem_swaplist); 180 static LIST_HEAD(shmem_swaplist);
181 static DEFINE_MUTEX(shmem_swaplist_mutex); 181 static DEFINE_MUTEX(shmem_swaplist_mutex);
182 182
183 static int shmem_reserve_inode(struct super_block *sb) 183 static int shmem_reserve_inode(struct super_block *sb)
184 { 184 {
185 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 185 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
186 if (sbinfo->max_inodes) { 186 if (sbinfo->max_inodes) {
187 spin_lock(&sbinfo->stat_lock); 187 spin_lock(&sbinfo->stat_lock);
188 if (!sbinfo->free_inodes) { 188 if (!sbinfo->free_inodes) {
189 spin_unlock(&sbinfo->stat_lock); 189 spin_unlock(&sbinfo->stat_lock);
190 return -ENOSPC; 190 return -ENOSPC;
191 } 191 }
192 sbinfo->free_inodes--; 192 sbinfo->free_inodes--;
193 spin_unlock(&sbinfo->stat_lock); 193 spin_unlock(&sbinfo->stat_lock);
194 } 194 }
195 return 0; 195 return 0;
196 } 196 }
197 197
198 static void shmem_free_inode(struct super_block *sb) 198 static void shmem_free_inode(struct super_block *sb)
199 { 199 {
200 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 200 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
201 if (sbinfo->max_inodes) { 201 if (sbinfo->max_inodes) {
202 spin_lock(&sbinfo->stat_lock); 202 spin_lock(&sbinfo->stat_lock);
203 sbinfo->free_inodes++; 203 sbinfo->free_inodes++;
204 spin_unlock(&sbinfo->stat_lock); 204 spin_unlock(&sbinfo->stat_lock);
205 } 205 }
206 } 206 }
207 207
208 /** 208 /**
209 * shmem_recalc_inode - recalculate the block usage of an inode 209 * shmem_recalc_inode - recalculate the block usage of an inode
210 * @inode: inode to recalc 210 * @inode: inode to recalc
211 * 211 *
212 * We have to calculate the free blocks since the mm can drop 212 * We have to calculate the free blocks since the mm can drop
213 * undirtied hole pages behind our back. 213 * undirtied hole pages behind our back.
214 * 214 *
215 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped 215 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
216 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) 216 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
217 * 217 *
218 * It has to be called with the spinlock held. 218 * It has to be called with the spinlock held.
219 */ 219 */
220 static void shmem_recalc_inode(struct inode *inode) 220 static void shmem_recalc_inode(struct inode *inode)
221 { 221 {
222 struct shmem_inode_info *info = SHMEM_I(inode); 222 struct shmem_inode_info *info = SHMEM_I(inode);
223 long freed; 223 long freed;
224 224
225 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 225 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
226 if (freed > 0) { 226 if (freed > 0) {
227 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 227 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
228 if (sbinfo->max_blocks) 228 if (sbinfo->max_blocks)
229 percpu_counter_add(&sbinfo->used_blocks, -freed); 229 percpu_counter_add(&sbinfo->used_blocks, -freed);
230 info->alloced -= freed; 230 info->alloced -= freed;
231 inode->i_blocks -= freed * BLOCKS_PER_PAGE; 231 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
232 shmem_unacct_blocks(info->flags, freed); 232 shmem_unacct_blocks(info->flags, freed);
233 } 233 }
234 } 234 }
235 235
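A worked example of that subtraction, with illustrative numbers: whatever was accounted as allocated but is no longer resident or in swap must be hole pages the mm dropped.

    #include <stdio.h>

    int main(void)
    {
            long alloced = 10;  /* info->alloced             */
            long swapped = 3;   /* info->swapped             */
            long nrpages = 5;   /* inode->i_mapping->nrpages */

            long freed = alloced - swapped - nrpages;
            if (freed > 0) {    /* 2 undirtied hole pages were dropped */
                    alloced -= freed;
                    printf("freed=%ld alloced=%ld\n", freed, alloced); /* 2 8 */
            }
            return 0;
    }
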
236 /* 236 /*
237 * Replace item expected in radix tree by a new item, while holding tree lock. 237 * Replace item expected in radix tree by a new item, while holding tree lock.
238 */ 238 */
239 static int shmem_radix_tree_replace(struct address_space *mapping, 239 static int shmem_radix_tree_replace(struct address_space *mapping,
240 pgoff_t index, void *expected, void *replacement) 240 pgoff_t index, void *expected, void *replacement)
241 { 241 {
242 void **pslot; 242 void **pslot;
243 void *item = NULL; 243 void *item = NULL;
244 244
245 VM_BUG_ON(!expected); 245 VM_BUG_ON(!expected);
246 pslot = radix_tree_lookup_slot(&mapping->page_tree, index); 246 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
247 if (pslot) 247 if (pslot)
248 item = radix_tree_deref_slot_protected(pslot, 248 item = radix_tree_deref_slot_protected(pslot,
249 &mapping->tree_lock); 249 &mapping->tree_lock);
250 if (item != expected) 250 if (item != expected)
251 return -ENOENT; 251 return -ENOENT;
252 if (replacement) 252 if (replacement)
253 radix_tree_replace_slot(pslot, replacement); 253 radix_tree_replace_slot(pslot, replacement);
254 else 254 else
255 radix_tree_delete(&mapping->page_tree, index); 255 radix_tree_delete(&mapping->page_tree, index);
256 return 0; 256 return 0;
257 } 257 }
258 258
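The function above is a compare-and-replace: under the tree lock, the slot must still hold exactly the expected item, otherwise the caller learns it raced and gets -ENOENT. The contract in miniature, with a plain array standing in for the radix tree (illustrative only):

    #include <errno.h>
    #include <stddef.h>

    static void *slots[64];

    static int replace_expected(size_t index, void *expected, void *replacement)
    {
            /* Caller is assumed to hold whatever lock protects slots[]. */
            if (slots[index] != expected)
                    return -ENOENT;         /* somebody changed the slot  */
            slots[index] = replacement;     /* NULL replacement == delete */
            return 0;
    }
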
259 /* 259 /*
260 * Sometimes, before we decide whether to proceed or to fail, we must check 260 * Sometimes, before we decide whether to proceed or to fail, we must check
261 * that an entry was not already brought back from swap by a racing thread. 261 * that an entry was not already brought back from swap by a racing thread.
262 * 262 *
263 * Checking page is not enough: by the time a SwapCache page is locked, it 263 * Checking page is not enough: by the time a SwapCache page is locked, it
264 * might be reused, and again be SwapCache, using the same swap as before. 264 * might be reused, and again be SwapCache, using the same swap as before.
265 */ 265 */
266 static bool shmem_confirm_swap(struct address_space *mapping, 266 static bool shmem_confirm_swap(struct address_space *mapping,
267 pgoff_t index, swp_entry_t swap) 267 pgoff_t index, swp_entry_t swap)
268 { 268 {
269 void *item; 269 void *item;
270 270
271 rcu_read_lock(); 271 rcu_read_lock();
272 item = radix_tree_lookup(&mapping->page_tree, index); 272 item = radix_tree_lookup(&mapping->page_tree, index);
273 rcu_read_unlock(); 273 rcu_read_unlock();
274 return item == swp_to_radix_entry(swap); 274 return item == swp_to_radix_entry(swap);
275 } 275 }
276 276
277 /* 277 /*
278 * Like add_to_page_cache_locked, but error if expected item has gone. 278 * Like add_to_page_cache_locked, but error if expected item has gone.
279 */ 279 */
280 static int shmem_add_to_page_cache(struct page *page, 280 static int shmem_add_to_page_cache(struct page *page,
281 struct address_space *mapping, 281 struct address_space *mapping,
282 pgoff_t index, gfp_t gfp, void *expected) 282 pgoff_t index, gfp_t gfp, void *expected)
283 { 283 {
284 int error; 284 int error;
285 285
286 VM_BUG_ON(!PageLocked(page)); 286 VM_BUG_ON(!PageLocked(page));
287 VM_BUG_ON(!PageSwapBacked(page)); 287 VM_BUG_ON(!PageSwapBacked(page));
288 288
289 page_cache_get(page); 289 page_cache_get(page);
290 page->mapping = mapping; 290 page->mapping = mapping;
291 page->index = index; 291 page->index = index;
292 292
293 spin_lock_irq(&mapping->tree_lock); 293 spin_lock_irq(&mapping->tree_lock);
294 if (!expected) 294 if (!expected)
295 error = radix_tree_insert(&mapping->page_tree, index, page); 295 error = radix_tree_insert(&mapping->page_tree, index, page);
296 else 296 else
297 error = shmem_radix_tree_replace(mapping, index, expected, 297 error = shmem_radix_tree_replace(mapping, index, expected,
298 page); 298 page);
299 if (!error) { 299 if (!error) {
300 mapping->nrpages++; 300 mapping->nrpages++;
301 __inc_zone_page_state(page, NR_FILE_PAGES); 301 __inc_zone_page_state(page, NR_FILE_PAGES);
302 __inc_zone_page_state(page, NR_SHMEM); 302 __inc_zone_page_state(page, NR_SHMEM);
303 spin_unlock_irq(&mapping->tree_lock); 303 spin_unlock_irq(&mapping->tree_lock);
304 } else { 304 } else {
305 page->mapping = NULL; 305 page->mapping = NULL;
306 spin_unlock_irq(&mapping->tree_lock); 306 spin_unlock_irq(&mapping->tree_lock);
307 page_cache_release(page); 307 page_cache_release(page);
308 } 308 }
309 return error; 309 return error;
310 } 310 }
311 311
312 /* 312 /*
313 * Like delete_from_page_cache, but substitutes swap for page. 313 * Like delete_from_page_cache, but substitutes swap for page.
314 */ 314 */
315 static void shmem_delete_from_page_cache(struct page *page, void *radswap) 315 static void shmem_delete_from_page_cache(struct page *page, void *radswap)
316 { 316 {
317 struct address_space *mapping = page->mapping; 317 struct address_space *mapping = page->mapping;
318 int error; 318 int error;
319 319
320 spin_lock_irq(&mapping->tree_lock); 320 spin_lock_irq(&mapping->tree_lock);
321 error = shmem_radix_tree_replace(mapping, page->index, page, radswap); 321 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
322 page->mapping = NULL; 322 page->mapping = NULL;
323 mapping->nrpages--; 323 mapping->nrpages--;
324 __dec_zone_page_state(page, NR_FILE_PAGES); 324 __dec_zone_page_state(page, NR_FILE_PAGES);
325 __dec_zone_page_state(page, NR_SHMEM); 325 __dec_zone_page_state(page, NR_SHMEM);
326 spin_unlock_irq(&mapping->tree_lock); 326 spin_unlock_irq(&mapping->tree_lock);
327 page_cache_release(page); 327 page_cache_release(page);
328 BUG_ON(error); 328 BUG_ON(error);
329 } 329 }
330 330
331 /* 331 /*
332 * Like find_get_pages, but collecting swap entries as well as pages. 332 * Like find_get_pages, but collecting swap entries as well as pages.
333 */ 333 */
334 static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, 334 static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
335 pgoff_t start, unsigned int nr_pages, 335 pgoff_t start, unsigned int nr_pages,
336 struct page **pages, pgoff_t *indices) 336 struct page **pages, pgoff_t *indices)
337 { 337 {
338 unsigned int i; 338 unsigned int i;
339 unsigned int ret; 339 unsigned int ret;
340 unsigned int nr_found; 340 unsigned int nr_found;
341 341
342 rcu_read_lock(); 342 rcu_read_lock();
343 restart: 343 restart:
344 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 344 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
345 (void ***)pages, indices, start, nr_pages); 345 (void ***)pages, indices, start, nr_pages);
346 ret = 0; 346 ret = 0;
347 for (i = 0; i < nr_found; i++) { 347 for (i = 0; i < nr_found; i++) {
348 struct page *page; 348 struct page *page;
349 repeat: 349 repeat:
350 page = radix_tree_deref_slot((void **)pages[i]); 350 page = radix_tree_deref_slot((void **)pages[i]);
351 if (unlikely(!page)) 351 if (unlikely(!page))
352 continue; 352 continue;
353 if (radix_tree_exception(page)) { 353 if (radix_tree_exception(page)) {
354 if (radix_tree_deref_retry(page)) 354 if (radix_tree_deref_retry(page))
355 goto restart; 355 goto restart;
356 /* 356 /*
357 * Otherwise, we must be storing a swap entry 357 * Otherwise, we must be storing a swap entry
358 * here as an exceptional entry: so return it 358 * here as an exceptional entry: so return it
359 * without attempting to raise page count. 359 * without attempting to raise page count.
360 */ 360 */
361 goto export; 361 goto export;
362 } 362 }
363 if (!page_cache_get_speculative(page)) 363 if (!page_cache_get_speculative(page))
364 goto repeat; 364 goto repeat;
365 365
366 /* Has the page moved? */ 366 /* Has the page moved? */
367 if (unlikely(page != *((void **)pages[i]))) { 367 if (unlikely(page != *((void **)pages[i]))) {
368 page_cache_release(page); 368 page_cache_release(page);
369 goto repeat; 369 goto repeat;
370 } 370 }
371 export: 371 export:
372 indices[ret] = indices[i]; 372 indices[ret] = indices[i];
373 pages[ret] = page; 373 pages[ret] = page;
374 ret++; 374 ret++;
375 } 375 }
376 if (unlikely(!ret && nr_found)) 376 if (unlikely(!ret && nr_found))
377 goto restart; 377 goto restart;
378 rcu_read_unlock(); 378 rcu_read_unlock();
379 return ret; 379 return ret;
380 } 380 }
381 381
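The exceptional entries that loop exports are tagged pointers: a swap entry is stored with a low tag bit set, which a real struct page pointer (always word-aligned) can never carry. A sketch of such an encoding; the exact bit layout here is illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define EXCEPTIONAL 0x2UL

    static void *swp_to_entry(unsigned long swp)
    {
            return (void *)((swp << 2) | EXCEPTIONAL);
    }

    static int entry_is_swap(const void *item)
    {
            return ((uintptr_t)item & EXCEPTIONAL) != 0;
    }

    static unsigned long entry_to_swp(const void *item)
    {
            return (uintptr_t)item >> 2;
    }

    int main(void)
    {
            void *item = swp_to_entry(42);

            if (entry_is_swap(item))
                    printf("swap offset %lu\n", entry_to_swp(item)); /* 42 */
            return 0;
    }
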
382 /* 382 /*
383 * Remove swap entry from radix tree, free the swap and its page cache. 383 * Remove swap entry from radix tree, free the swap and its page cache.
384 */ 384 */
385 static int shmem_free_swap(struct address_space *mapping, 385 static int shmem_free_swap(struct address_space *mapping,
386 pgoff_t index, void *radswap) 386 pgoff_t index, void *radswap)
387 { 387 {
388 int error; 388 int error;
389 389
390 spin_lock_irq(&mapping->tree_lock); 390 spin_lock_irq(&mapping->tree_lock);
391 error = shmem_radix_tree_replace(mapping, index, radswap, NULL); 391 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
392 spin_unlock_irq(&mapping->tree_lock); 392 spin_unlock_irq(&mapping->tree_lock);
393 if (!error) 393 if (!error)
394 free_swap_and_cache(radix_to_swp_entry(radswap)); 394 free_swap_and_cache(radix_to_swp_entry(radswap));
395 return error; 395 return error;
396 } 396 }
397 397
398 /* 398 /*
399 * Pagevec may contain swap entries, so shuffle up pages before releasing. 399 * Pagevec may contain swap entries, so shuffle up pages before releasing.
400 */ 400 */
401 static void shmem_deswap_pagevec(struct pagevec *pvec) 401 static void shmem_deswap_pagevec(struct pagevec *pvec)
402 { 402 {
403 int i, j; 403 int i, j;
404 404
405 for (i = 0, j = 0; i < pagevec_count(pvec); i++) { 405 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
406 struct page *page = pvec->pages[i]; 406 struct page *page = pvec->pages[i];
407 if (!radix_tree_exceptional_entry(page)) 407 if (!radix_tree_exceptional_entry(page))
408 pvec->pages[j++] = page; 408 pvec->pages[j++] = page;
409 } 409 }
410 pvec->nr = j; 410 pvec->nr = j;
411 } 411 }
412 412
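That is the classic two-index in-place filter: i scans every slot, j compacts the survivors to the front, and nr shrinks to the kept count. The same shape stand-alone:

    #include <stdio.h>

    static int is_page(int v)
    {
            return v >= 0;  /* stand-in: negatives play "swap entries" */
    }

    int main(void)
    {
            int entries[] = { 1, -7, 2, -9, 3 };
            int nr = 5, i, j;

            for (i = 0, j = 0; i < nr; i++) {
                    if (is_page(entries[i]))
                            entries[j++] = entries[i];
            }
            nr = j;

            for (i = 0; i < nr; i++)
                    printf("%d ", entries[i]);      /* 1 2 3 */
            printf("\n");
            return 0;
    }
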
413 /* 413 /*
414 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists. 414 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
415 */ 415 */
416 void shmem_unlock_mapping(struct address_space *mapping) 416 void shmem_unlock_mapping(struct address_space *mapping)
417 { 417 {
418 struct pagevec pvec; 418 struct pagevec pvec;
419 pgoff_t indices[PAGEVEC_SIZE]; 419 pgoff_t indices[PAGEVEC_SIZE];
420 pgoff_t index = 0; 420 pgoff_t index = 0;
421 421
422 pagevec_init(&pvec, 0); 422 pagevec_init(&pvec, 0);
423 /* 423 /*
424 * Minor point, but we might as well stop if someone else SHM_LOCKs it. 424 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
425 */ 425 */
426 while (!mapping_unevictable(mapping)) { 426 while (!mapping_unevictable(mapping)) {
427 /* 427 /*
428 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it 428 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
429 * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 429 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
430 */ 430 */
431 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 431 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
432 PAGEVEC_SIZE, pvec.pages, indices); 432 PAGEVEC_SIZE, pvec.pages, indices);
433 if (!pvec.nr) 433 if (!pvec.nr)
434 break; 434 break;
435 index = indices[pvec.nr - 1] + 1; 435 index = indices[pvec.nr - 1] + 1;
436 shmem_deswap_pagevec(&pvec); 436 shmem_deswap_pagevec(&pvec);
437 check_move_unevictable_pages(pvec.pages, pvec.nr); 437 check_move_unevictable_pages(pvec.pages, pvec.nr);
438 pagevec_release(&pvec); 438 pagevec_release(&pvec);
439 cond_resched(); 439 cond_resched();
440 } 440 }
441 } 441 }
442 442
443 /* 443 /*
444 * Remove range of pages and swap entries from radix tree, and free them. 444 * Remove range of pages and swap entries from radix tree, and free them.
445 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 445 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
446 */ 446 */
447 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, 447 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
448 bool unfalloc) 448 bool unfalloc)
449 { 449 {
450 struct address_space *mapping = inode->i_mapping; 450 struct address_space *mapping = inode->i_mapping;
451 struct shmem_inode_info *info = SHMEM_I(inode); 451 struct shmem_inode_info *info = SHMEM_I(inode);
452 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 452 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
453 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; 453 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
454 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); 454 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
455 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); 455 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
456 struct pagevec pvec; 456 struct pagevec pvec;
457 pgoff_t indices[PAGEVEC_SIZE]; 457 pgoff_t indices[PAGEVEC_SIZE];
458 long nr_swaps_freed = 0; 458 long nr_swaps_freed = 0;
459 pgoff_t index; 459 pgoff_t index;
460 int i; 460 int i;
461 461
462 if (lend == -1) 462 if (lend == -1)
463 end = -1; /* unsigned, so actually very big */ 463 end = -1; /* unsigned, so actually very big */
464 464
465 pagevec_init(&pvec, 0); 465 pagevec_init(&pvec, 0);
466 index = start; 466 index = start;
467 while (index < end) { 467 while (index < end) {
468 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 468 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
469 min(end - index, (pgoff_t)PAGEVEC_SIZE), 469 min(end - index, (pgoff_t)PAGEVEC_SIZE),
470 pvec.pages, indices); 470 pvec.pages, indices);
471 if (!pvec.nr) 471 if (!pvec.nr)
472 break; 472 break;
473 mem_cgroup_uncharge_start(); 473 mem_cgroup_uncharge_start();
474 for (i = 0; i < pagevec_count(&pvec); i++) { 474 for (i = 0; i < pagevec_count(&pvec); i++) {
475 struct page *page = pvec.pages[i]; 475 struct page *page = pvec.pages[i];
476 476
477 index = indices[i]; 477 index = indices[i];
478 if (index >= end) 478 if (index >= end)
479 break; 479 break;
480 480
481 if (radix_tree_exceptional_entry(page)) { 481 if (radix_tree_exceptional_entry(page)) {
482 if (unfalloc) 482 if (unfalloc)
483 continue; 483 continue;
484 nr_swaps_freed += !shmem_free_swap(mapping, 484 nr_swaps_freed += !shmem_free_swap(mapping,
485 index, page); 485 index, page);
486 continue; 486 continue;
487 } 487 }
488 488
489 if (!trylock_page(page)) 489 if (!trylock_page(page))
490 continue; 490 continue;
491 if (!unfalloc || !PageUptodate(page)) { 491 if (!unfalloc || !PageUptodate(page)) {
492 if (page->mapping == mapping) { 492 if (page->mapping == mapping) {
493 VM_BUG_ON(PageWriteback(page)); 493 VM_BUG_ON(PageWriteback(page));
494 truncate_inode_page(mapping, page); 494 truncate_inode_page(mapping, page);
495 } 495 }
496 } 496 }
497 unlock_page(page); 497 unlock_page(page);
498 } 498 }
499 shmem_deswap_pagevec(&pvec); 499 shmem_deswap_pagevec(&pvec);
500 pagevec_release(&pvec); 500 pagevec_release(&pvec);
501 mem_cgroup_uncharge_end(); 501 mem_cgroup_uncharge_end();
502 cond_resched(); 502 cond_resched();
503 index++; 503 index++;
504 } 504 }
505 505
506 if (partial_start) { 506 if (partial_start) {
507 struct page *page = NULL; 507 struct page *page = NULL;
508 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); 508 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
509 if (page) { 509 if (page) {
510 unsigned int top = PAGE_CACHE_SIZE; 510 unsigned int top = PAGE_CACHE_SIZE;
511 if (start > end) { 511 if (start > end) {
512 top = partial_end; 512 top = partial_end;
513 partial_end = 0; 513 partial_end = 0;
514 } 514 }
515 zero_user_segment(page, partial_start, top); 515 zero_user_segment(page, partial_start, top);
516 set_page_dirty(page); 516 set_page_dirty(page);
517 unlock_page(page); 517 unlock_page(page);
518 page_cache_release(page); 518 page_cache_release(page);
519 } 519 }
520 } 520 }
521 if (partial_end) { 521 if (partial_end) {
522 struct page *page = NULL; 522 struct page *page = NULL;
523 shmem_getpage(inode, end, &page, SGP_READ, NULL); 523 shmem_getpage(inode, end, &page, SGP_READ, NULL);
524 if (page) { 524 if (page) {
525 zero_user_segment(page, 0, partial_end); 525 zero_user_segment(page, 0, partial_end);
526 set_page_dirty(page); 526 set_page_dirty(page);
527 unlock_page(page); 527 unlock_page(page);
528 page_cache_release(page); 528 page_cache_release(page);
529 } 529 }
530 } 530 }
531 if (start >= end) 531 if (start >= end)
532 return; 532 return;
533 533
534 index = start; 534 index = start;
535 for ( ; ; ) { 535 for ( ; ; ) {
536 cond_resched(); 536 cond_resched();
537 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 537 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
538 min(end - index, (pgoff_t)PAGEVEC_SIZE), 538 min(end - index, (pgoff_t)PAGEVEC_SIZE),
539 pvec.pages, indices); 539 pvec.pages, indices);
540 if (!pvec.nr) { 540 if (!pvec.nr) {
541 if (index == start || unfalloc) 541 if (index == start || unfalloc)
542 break; 542 break;
543 index = start; 543 index = start;
544 continue; 544 continue;
545 } 545 }
546 if ((index == start || unfalloc) && indices[0] >= end) { 546 if ((index == start || unfalloc) && indices[0] >= end) {
547 shmem_deswap_pagevec(&pvec); 547 shmem_deswap_pagevec(&pvec);
548 pagevec_release(&pvec); 548 pagevec_release(&pvec);
549 break; 549 break;
550 } 550 }
551 mem_cgroup_uncharge_start(); 551 mem_cgroup_uncharge_start();
552 for (i = 0; i < pagevec_count(&pvec); i++) { 552 for (i = 0; i < pagevec_count(&pvec); i++) {
553 struct page *page = pvec.pages[i]; 553 struct page *page = pvec.pages[i];
554 554
555 index = indices[i]; 555 index = indices[i];
556 if (index >= end) 556 if (index >= end)
557 break; 557 break;
558 558
559 if (radix_tree_exceptional_entry(page)) { 559 if (radix_tree_exceptional_entry(page)) {
560 if (unfalloc) 560 if (unfalloc)
561 continue; 561 continue;
562 nr_swaps_freed += !shmem_free_swap(mapping, 562 nr_swaps_freed += !shmem_free_swap(mapping,
563 index, page); 563 index, page);
564 continue; 564 continue;
565 } 565 }
566 566
567 lock_page(page); 567 lock_page(page);
568 if (!unfalloc || !PageUptodate(page)) { 568 if (!unfalloc || !PageUptodate(page)) {
569 if (page->mapping == mapping) { 569 if (page->mapping == mapping) {
570 VM_BUG_ON(PageWriteback(page)); 570 VM_BUG_ON(PageWriteback(page));
571 truncate_inode_page(mapping, page); 571 truncate_inode_page(mapping, page);
572 } 572 }
573 } 573 }
574 unlock_page(page); 574 unlock_page(page);
575 } 575 }
576 shmem_deswap_pagevec(&pvec); 576 shmem_deswap_pagevec(&pvec);
577 pagevec_release(&pvec); 577 pagevec_release(&pvec);
578 mem_cgroup_uncharge_end(); 578 mem_cgroup_uncharge_end();
579 index++; 579 index++;
580 } 580 }
581 581
582 spin_lock(&info->lock); 582 spin_lock(&info->lock);
583 info->swapped -= nr_swaps_freed; 583 info->swapped -= nr_swaps_freed;
584 shmem_recalc_inode(inode); 584 shmem_recalc_inode(inode);
585 spin_unlock(&info->lock); 585 spin_unlock(&info->lock);
586 } 586 }
587 587
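The partial-page arithmetic at the top of shmem_undo_range() is easiest to verify with numbers. A worked example assuming 4096-byte pages and a hole punched over bytes 1000..9999 of the file:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
            unsigned long lstart = 1000, lend = 9999;

            unsigned long start = (lstart + PAGE_SIZE - 1) / PAGE_SIZE; /* 1    */
            unsigned long end = (lend + 1) / PAGE_SIZE;                 /* 2    */
            unsigned long partial_start = lstart % PAGE_SIZE;           /* 1000 */
            unsigned long partial_end = (lend + 1) % PAGE_SIZE;         /* 1808 */

            /* Whole pages [1, 2) are truncated; page 0 has bytes 1000..4095
             * zeroed; page 2 has bytes 0..1807 zeroed. */
            printf("whole pages [%lu,%lu) partial %lu/%lu\n",
                   start, end, partial_start, partial_end);
            return 0;
    }
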
588 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 588 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
589 { 589 {
590 shmem_undo_range(inode, lstart, lend, false); 590 shmem_undo_range(inode, lstart, lend, false);
591 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 591 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
592 } 592 }
593 EXPORT_SYMBOL_GPL(shmem_truncate_range); 593 EXPORT_SYMBOL_GPL(shmem_truncate_range);
594 594
595 static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 595 static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
596 { 596 {
597 struct inode *inode = dentry->d_inode; 597 struct inode *inode = dentry->d_inode;
598 int error; 598 int error;
599 599
600 error = inode_change_ok(inode, attr); 600 error = inode_change_ok(inode, attr);
601 if (error) 601 if (error)
602 return error; 602 return error;
603 603
604 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 604 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
605 loff_t oldsize = inode->i_size; 605 loff_t oldsize = inode->i_size;
606 loff_t newsize = attr->ia_size; 606 loff_t newsize = attr->ia_size;
607 607
608 if (newsize != oldsize) { 608 if (newsize != oldsize) {
609 i_size_write(inode, newsize); 609 i_size_write(inode, newsize);
610 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 610 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
611 } 611 }
612 if (newsize < oldsize) { 612 if (newsize < oldsize) {
613 loff_t holebegin = round_up(newsize, PAGE_SIZE); 613 loff_t holebegin = round_up(newsize, PAGE_SIZE);
614 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 614 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
615 shmem_truncate_range(inode, newsize, (loff_t)-1); 615 shmem_truncate_range(inode, newsize, (loff_t)-1);
616 /* unmap again to remove racily COWed private pages */ 616 /* unmap again to remove racily COWed private pages */
617 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 617 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
618 } 618 }
619 } 619 }
620 620
621 setattr_copy(inode, attr); 621 setattr_copy(inode, attr);
622 #ifdef CONFIG_TMPFS_POSIX_ACL 622 #ifdef CONFIG_TMPFS_POSIX_ACL
623 if (attr->ia_valid & ATTR_MODE) 623 if (attr->ia_valid & ATTR_MODE)
624 error = generic_acl_chmod(inode); 624 error = generic_acl_chmod(inode);
625 #endif 625 #endif
626 return error; 626 return error;
627 } 627 }
628 628
629 static void shmem_evict_inode(struct inode *inode) 629 static void shmem_evict_inode(struct inode *inode)
630 { 630 {
631 struct shmem_inode_info *info = SHMEM_I(inode); 631 struct shmem_inode_info *info = SHMEM_I(inode);
632 632
633 if (inode->i_mapping->a_ops == &shmem_aops) { 633 if (inode->i_mapping->a_ops == &shmem_aops) {
634 shmem_unacct_size(info->flags, inode->i_size); 634 shmem_unacct_size(info->flags, inode->i_size);
635 inode->i_size = 0; 635 inode->i_size = 0;
636 shmem_truncate_range(inode, 0, (loff_t)-1); 636 shmem_truncate_range(inode, 0, (loff_t)-1);
637 if (!list_empty(&info->swaplist)) { 637 if (!list_empty(&info->swaplist)) {
638 mutex_lock(&shmem_swaplist_mutex); 638 mutex_lock(&shmem_swaplist_mutex);
639 list_del_init(&info->swaplist); 639 list_del_init(&info->swaplist);
640 mutex_unlock(&shmem_swaplist_mutex); 640 mutex_unlock(&shmem_swaplist_mutex);
641 } 641 }
642 } else 642 } else
643 kfree(info->symlink); 643 kfree(info->symlink);
644 644
645 simple_xattrs_free(&info->xattrs); 645 simple_xattrs_free(&info->xattrs);
646 WARN_ON(inode->i_blocks); 646 WARN_ON(inode->i_blocks);
647 shmem_free_inode(inode->i_sb); 647 shmem_free_inode(inode->i_sb);
648 clear_inode(inode); 648 clear_inode(inode);
649 } 649 }
650 650
651 /* 651 /*
652 * If swap is found in the inode, free it and move the page from swapcache to filecache. 652 * If swap is found in the inode, free it and move the page from swapcache to filecache.
653 */ 653 */
654 static int shmem_unuse_inode(struct shmem_inode_info *info, 654 static int shmem_unuse_inode(struct shmem_inode_info *info,
655 swp_entry_t swap, struct page **pagep) 655 swp_entry_t swap, struct page **pagep)
656 { 656 {
657 struct address_space *mapping = info->vfs_inode.i_mapping; 657 struct address_space *mapping = info->vfs_inode.i_mapping;
658 void *radswap; 658 void *radswap;
659 pgoff_t index; 659 pgoff_t index;
660 gfp_t gfp; 660 gfp_t gfp;
661 int error = 0; 661 int error = 0;
662 662
663 radswap = swp_to_radix_entry(swap); 663 radswap = swp_to_radix_entry(swap);
664 index = radix_tree_locate_item(&mapping->page_tree, radswap); 664 index = radix_tree_locate_item(&mapping->page_tree, radswap);
665 if (index == -1) 665 if (index == -1)
666 return 0; 666 return 0;
667 667
668 /* 668 /*
669 * Move _head_ to start search for next from here. 669 * Move _head_ to start search for next from here.
670 * But be careful: shmem_evict_inode checks list_empty without taking 670 * But be careful: shmem_evict_inode checks list_empty without taking
671 * mutex, and there's an instant in list_move_tail when info->swaplist 671 * mutex, and there's an instant in list_move_tail when info->swaplist
672 * would appear empty, if it were the only one on shmem_swaplist. 672 * would appear empty, if it were the only one on shmem_swaplist.
673 */ 673 */
674 if (shmem_swaplist.next != &info->swaplist) 674 if (shmem_swaplist.next != &info->swaplist)
675 list_move_tail(&shmem_swaplist, &info->swaplist); 675 list_move_tail(&shmem_swaplist, &info->swaplist);
676 676
677 gfp = mapping_gfp_mask(mapping); 677 gfp = mapping_gfp_mask(mapping);
678 if (shmem_should_replace_page(*pagep, gfp)) { 678 if (shmem_should_replace_page(*pagep, gfp)) {
679 mutex_unlock(&shmem_swaplist_mutex); 679 mutex_unlock(&shmem_swaplist_mutex);
680 error = shmem_replace_page(pagep, gfp, info, index); 680 error = shmem_replace_page(pagep, gfp, info, index);
681 mutex_lock(&shmem_swaplist_mutex); 681 mutex_lock(&shmem_swaplist_mutex);
682 /* 682 /*
683 * We needed to drop mutex to make that restrictive page 683 * We needed to drop mutex to make that restrictive page
684 * allocation, but the inode might have been freed while we 684 * allocation, but the inode might have been freed while we
685 * dropped it: although a racing shmem_evict_inode() cannot 685 * dropped it: although a racing shmem_evict_inode() cannot
686 * complete without emptying the radix_tree, our page lock 686 * complete without emptying the radix_tree, our page lock
687 * on this swapcache page is not enough to prevent that - 687 * on this swapcache page is not enough to prevent that -
688 * free_swap_and_cache() of our swap entry will only 688 * free_swap_and_cache() of our swap entry will only
689 * trylock_page(), removing swap from radix_tree whatever. 689 * trylock_page(), removing swap from radix_tree whatever.
690 * 690 *
691 * We must not proceed to shmem_add_to_page_cache() if the 691 * We must not proceed to shmem_add_to_page_cache() if the
692 * inode has been freed, but of course we cannot rely on 692 * inode has been freed, but of course we cannot rely on
693 * inode or mapping or info to check that. However, we can 693 * inode or mapping or info to check that. However, we can
694 * safely check if our swap entry is still in use (and here 694 * safely check if our swap entry is still in use (and here
695 * it can't have got reused for another page): if it's still 695 * it can't have got reused for another page): if it's still
696 * in use, then the inode cannot have been freed yet, and we 696 * in use, then the inode cannot have been freed yet, and we
697 * can safely proceed (if it's no longer in use, that tells 697 * can safely proceed (if it's no longer in use, that tells
698 * nothing about the inode, but we don't need to unuse swap). 698 * nothing about the inode, but we don't need to unuse swap).
699 */ 699 */
700 if (!page_swapcount(*pagep)) 700 if (!page_swapcount(*pagep))
701 error = -ENOENT; 701 error = -ENOENT;
702 } 702 }
703 703
704 /* 704 /*
705 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 705 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
706 * but also to hold up shmem_evict_inode(): so inode cannot be freed 706 * but also to hold up shmem_evict_inode(): so inode cannot be freed
707 * beneath us (pagelock doesn't help until the page is in pagecache). 707 * beneath us (pagelock doesn't help until the page is in pagecache).
708 */ 708 */
709 if (!error) 709 if (!error)
710 error = shmem_add_to_page_cache(*pagep, mapping, index, 710 error = shmem_add_to_page_cache(*pagep, mapping, index,
711 GFP_NOWAIT, radswap); 711 GFP_NOWAIT, radswap);
712 if (error != -ENOMEM) { 712 if (error != -ENOMEM) {
713 /* 713 /*
714 * Truncation and eviction use free_swap_and_cache(), which 714 * Truncation and eviction use free_swap_and_cache(), which
715 * only does trylock page: if we raced, best clean up here. 715 * only does trylock page: if we raced, best clean up here.
716 */ 716 */
717 delete_from_swap_cache(*pagep); 717 delete_from_swap_cache(*pagep);
718 set_page_dirty(*pagep); 718 set_page_dirty(*pagep);
719 if (!error) { 719 if (!error) {
720 spin_lock(&info->lock); 720 spin_lock(&info->lock);
721 info->swapped--; 721 info->swapped--;
722 spin_unlock(&info->lock); 722 spin_unlock(&info->lock);
723 swap_free(swap); 723 swap_free(swap);
724 } 724 }
725 error = 1; /* not an error, but entry was found */ 725 error = 1; /* not an error, but entry was found */
726 } 726 }
727 return error; 727 return error;
728 } 728 }
729 729
730 /* 730 /*
731 * Search through swapped inodes to find and replace swap by page. 731 * Search through swapped inodes to find and replace swap by page.
732 */ 732 */
733 int shmem_unuse(swp_entry_t swap, struct page *page) 733 int shmem_unuse(swp_entry_t swap, struct page *page)
734 { 734 {
735 struct list_head *this, *next; 735 struct list_head *this, *next;
736 struct shmem_inode_info *info; 736 struct shmem_inode_info *info;
737 int found = 0; 737 int found = 0;
738 int error = 0; 738 int error = 0;
739 739
740 /* 740 /*
741 * There's a faint possibility that swap page was replaced before 741 * There's a faint possibility that swap page was replaced before
742 * caller locked it: caller will come back later with the right page. 742 * caller locked it: caller will come back later with the right page.
743 */ 743 */
744 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) 744 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
745 goto out; 745 goto out;
746 746
747 /* 747 /*
748 * Charge page using GFP_KERNEL while we can wait, before taking 748 * Charge page using GFP_KERNEL while we can wait, before taking
749 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 749 * the shmem_swaplist_mutex which might hold up shmem_writepage().
750 * Charged back to the user (not to caller) when swap account is used. 750 * Charged back to the user (not to caller) when swap account is used.
751 */ 751 */
752 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 752 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
753 if (error) 753 if (error)
754 goto out; 754 goto out;
755 /* No radix_tree_preload: swap entry keeps a place for page in tree */ 755 /* No radix_tree_preload: swap entry keeps a place for page in tree */
756 756
757 mutex_lock(&shmem_swaplist_mutex); 757 mutex_lock(&shmem_swaplist_mutex);
758 list_for_each_safe(this, next, &shmem_swaplist) { 758 list_for_each_safe(this, next, &shmem_swaplist) {
759 info = list_entry(this, struct shmem_inode_info, swaplist); 759 info = list_entry(this, struct shmem_inode_info, swaplist);
760 if (info->swapped) 760 if (info->swapped)
761 found = shmem_unuse_inode(info, swap, &page); 761 found = shmem_unuse_inode(info, swap, &page);
762 else 762 else
763 list_del_init(&info->swaplist); 763 list_del_init(&info->swaplist);
764 cond_resched(); 764 cond_resched();
765 if (found) 765 if (found)
766 break; 766 break;
767 } 767 }
768 mutex_unlock(&shmem_swaplist_mutex); 768 mutex_unlock(&shmem_swaplist_mutex);
769 769
770 if (found < 0) 770 if (found < 0)
771 error = found; 771 error = found;
772 out: 772 out:
773 unlock_page(page); 773 unlock_page(page);
774 page_cache_release(page); 774 page_cache_release(page);
775 return error; 775 return error;
776 } 776 }
777 777
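shmem_unuse() deletes entries from shmem_swaplist while walking it, which is why it uses list_for_each_safe(): the next pointer is cached before the loop body runs, so list_del_init() on the current entry cannot derail the walk. Below is a minimal userspace sketch of the same delete-during-traversal idiom; the hand-rolled singly linked list is an assumption for self-containment, since the kernel's list.h is not available here.

#include <stdio.h>
#include <stdlib.h>

struct node {
	int swapped;            /* toy stand-in for info->swapped */
	struct node *next;
};

int main(void)
{
	int counts[] = { 0, 2, 0, 1 };
	struct node *head = NULL, **tail = &head;
	for (unsigned i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
		struct node *n = malloc(sizeof(*n));
		n->swapped = counts[i];
		n->next = NULL;
		*tail = n;
		tail = &n->next;
	}

	/*
	 * Safe traversal: cache "next" before visiting, so the current
	 * node may be unlinked and freed -- the same reason shmem_unuse()
	 * wraps list_del_init() in list_for_each_safe().
	 */
	struct node **link = &head;
	for (struct node *this = head, *next; this; this = next) {
		next = this->next;
		if (this->swapped) {
			printf("keep node with swapped=%d\n", this->swapped);
			link = &this->next;
		} else {
			*link = next;   /* prune, like list_del_init() */
			free(this);
		}
	}
	return 0;
}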
778 /* 778 /*
779 * Move the page from the page cache to the swap cache. 779 * Move the page from the page cache to the swap cache.
780 */ 780 */
781 static int shmem_writepage(struct page *page, struct writeback_control *wbc) 781 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
782 { 782 {
783 struct shmem_inode_info *info; 783 struct shmem_inode_info *info;
784 struct address_space *mapping; 784 struct address_space *mapping;
785 struct inode *inode; 785 struct inode *inode;
786 swp_entry_t swap; 786 swp_entry_t swap;
787 pgoff_t index; 787 pgoff_t index;
788 788
789 BUG_ON(!PageLocked(page)); 789 BUG_ON(!PageLocked(page));
790 mapping = page->mapping; 790 mapping = page->mapping;
791 index = page->index; 791 index = page->index;
792 inode = mapping->host; 792 inode = mapping->host;
793 info = SHMEM_I(inode); 793 info = SHMEM_I(inode);
794 if (info->flags & VM_LOCKED) 794 if (info->flags & VM_LOCKED)
795 goto redirty; 795 goto redirty;
796 if (!total_swap_pages) 796 if (!total_swap_pages)
797 goto redirty; 797 goto redirty;
798 798
799 /* 799 /*
800 * shmem_backing_dev_info's capabilities prevent regular writeback or 800 * shmem_backing_dev_info's capabilities prevent regular writeback or
801 * sync from ever calling shmem_writepage; but a stacking filesystem 801 * sync from ever calling shmem_writepage; but a stacking filesystem
802 * might use ->writepage of its underlying filesystem, in which case 802 * might use ->writepage of its underlying filesystem, in which case
803 * tmpfs should write out to swap only in response to memory pressure, 803 * tmpfs should write out to swap only in response to memory pressure,
804 * and not for the writeback threads or sync. 804 * and not for the writeback threads or sync.
805 */ 805 */
806 if (!wbc->for_reclaim) { 806 if (!wbc->for_reclaim) {
807 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 807 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
808 goto redirty; 808 goto redirty;
809 } 809 }
810 810
811 /* 811 /*
812 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 812 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
813 * value into swapfile.c, the only way we can correctly account for a 813 * value into swapfile.c, the only way we can correctly account for a
814 * fallocated page arriving here is now to initialize it and write it. 814 * fallocated page arriving here is now to initialize it and write it.
815 * 815 *
816 * That's okay for a page already fallocated earlier, but if we have 816 * That's okay for a page already fallocated earlier, but if we have
817 * not yet completed the fallocation, then (a) we want to keep track 817 * not yet completed the fallocation, then (a) we want to keep track
818 * of this page in case we have to undo it, and (b) it may not be a 818 * of this page in case we have to undo it, and (b) it may not be a
819 * good idea to continue anyway, once we're pushing into swap. So 819 * good idea to continue anyway, once we're pushing into swap. So
820 * reactivate the page, and let shmem_fallocate() quit when too many. 820 * reactivate the page, and let shmem_fallocate() quit when too many.
821 */ 821 */
822 if (!PageUptodate(page)) { 822 if (!PageUptodate(page)) {
823 if (inode->i_private) { 823 if (inode->i_private) {
824 struct shmem_falloc *shmem_falloc; 824 struct shmem_falloc *shmem_falloc;
825 spin_lock(&inode->i_lock); 825 spin_lock(&inode->i_lock);
826 shmem_falloc = inode->i_private; 826 shmem_falloc = inode->i_private;
827 if (shmem_falloc && 827 if (shmem_falloc &&
828 index >= shmem_falloc->start && 828 index >= shmem_falloc->start &&
829 index < shmem_falloc->next) 829 index < shmem_falloc->next)
830 shmem_falloc->nr_unswapped++; 830 shmem_falloc->nr_unswapped++;
831 else 831 else
832 shmem_falloc = NULL; 832 shmem_falloc = NULL;
833 spin_unlock(&inode->i_lock); 833 spin_unlock(&inode->i_lock);
834 if (shmem_falloc) 834 if (shmem_falloc)
835 goto redirty; 835 goto redirty;
836 } 836 }
837 clear_highpage(page); 837 clear_highpage(page);
838 flush_dcache_page(page); 838 flush_dcache_page(page);
839 SetPageUptodate(page); 839 SetPageUptodate(page);
840 } 840 }
841 841
842 swap = get_swap_page(); 842 swap = get_swap_page();
843 if (!swap.val) 843 if (!swap.val)
844 goto redirty; 844 goto redirty;
845 845
846 /* 846 /*
847 * Add inode to shmem_unuse()'s list of swapped-out inodes, 847 * Add inode to shmem_unuse()'s list of swapped-out inodes,
848 * if it's not already there. Do it now before the page is 848 * if it's not already there. Do it now before the page is
849 * moved to swap cache, when its pagelock no longer protects 849 * moved to swap cache, when its pagelock no longer protects
850 * the inode from eviction. But don't unlock the mutex until 850 * the inode from eviction. But don't unlock the mutex until
851 * we've incremented swapped, because shmem_unuse_inode() will 851 * we've incremented swapped, because shmem_unuse_inode() will
852 * prune a !swapped inode from the swaplist under this mutex. 852 * prune a !swapped inode from the swaplist under this mutex.
853 */ 853 */
854 mutex_lock(&shmem_swaplist_mutex); 854 mutex_lock(&shmem_swaplist_mutex);
855 if (list_empty(&info->swaplist)) 855 if (list_empty(&info->swaplist))
856 list_add_tail(&info->swaplist, &shmem_swaplist); 856 list_add_tail(&info->swaplist, &shmem_swaplist);
857 857
858 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 858 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
859 swap_shmem_alloc(swap); 859 swap_shmem_alloc(swap);
860 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); 860 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
861 861
862 spin_lock(&info->lock); 862 spin_lock(&info->lock);
863 info->swapped++; 863 info->swapped++;
864 shmem_recalc_inode(inode); 864 shmem_recalc_inode(inode);
865 spin_unlock(&info->lock); 865 spin_unlock(&info->lock);
866 866
867 mutex_unlock(&shmem_swaplist_mutex); 867 mutex_unlock(&shmem_swaplist_mutex);
868 BUG_ON(page_mapped(page)); 868 BUG_ON(page_mapped(page));
869 swap_writepage(page, wbc); 869 swap_writepage(page, wbc);
870 return 0; 870 return 0;
871 } 871 }
872 872
873 mutex_unlock(&shmem_swaplist_mutex); 873 mutex_unlock(&shmem_swaplist_mutex);
874 swapcache_free(swap, NULL); 874 swapcache_free(swap, NULL);
875 redirty: 875 redirty:
876 set_page_dirty(page); 876 set_page_dirty(page);
877 if (wbc->for_reclaim) 877 if (wbc->for_reclaim)
878 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ 878 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
879 unlock_page(page); 879 unlock_page(page);
880 return 0; 880 return 0;
881 } 881 }
882 882
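The early exits in shmem_writepage() amount to a small triage: mlocked shmem (VM_LOCKED), a system with no swap configured, or a caller that is not reclaiming all end in redirty; only memory pressure may push a page to swap. A toy model of just that decision, with hypothetical boolean parameters standing in for the inode flags and wbc state:

#include <stdbool.h>
#include <stdio.h>

enum action { WRITE_TO_SWAP, REDIRTY };

/* Simplified model of shmem_writepage()'s early checks. */
static enum action triage(bool vm_locked, bool have_swap, bool for_reclaim)
{
	if (vm_locked)          /* mlocked shmem never goes to swap */
		return REDIRTY;
	if (!have_swap)         /* no swap space configured at all */
		return REDIRTY;
	if (!for_reclaim)       /* writeback threads and sync must not */
		return REDIRTY;
	return WRITE_TO_SWAP;
}

int main(void)
{
	printf("%d\n", triage(false, true, true));   /* 0: write to swap */
	printf("%d\n", triage(true,  true, true));   /* 1: redirty */
	printf("%d\n", triage(false, true, false));  /* 1: redirty */
	return 0;
}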
883 #ifdef CONFIG_NUMA 883 #ifdef CONFIG_NUMA
884 #ifdef CONFIG_TMPFS 884 #ifdef CONFIG_TMPFS
885 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 885 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
886 { 886 {
887 char buffer[64]; 887 char buffer[64];
888 888
889 if (!mpol || mpol->mode == MPOL_DEFAULT) 889 if (!mpol || mpol->mode == MPOL_DEFAULT)
890 return; /* show nothing */ 890 return; /* show nothing */
891 891
892 mpol_to_str(buffer, sizeof(buffer), mpol, 1); 892 mpol_to_str(buffer, sizeof(buffer), mpol, 1);
893 893
894 seq_printf(seq, ",mpol=%s", buffer); 894 seq_printf(seq, ",mpol=%s", buffer);
895 } 895 }
896 896
897 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 897 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
898 { 898 {
899 struct mempolicy *mpol = NULL; 899 struct mempolicy *mpol = NULL;
900 if (sbinfo->mpol) { 900 if (sbinfo->mpol) {
901 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 901 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
902 mpol = sbinfo->mpol; 902 mpol = sbinfo->mpol;
903 mpol_get(mpol); 903 mpol_get(mpol);
904 spin_unlock(&sbinfo->stat_lock); 904 spin_unlock(&sbinfo->stat_lock);
905 } 905 }
906 return mpol; 906 return mpol;
907 } 907 }
908 #endif /* CONFIG_TMPFS */ 908 #endif /* CONFIG_TMPFS */
909 909
910 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 910 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
911 struct shmem_inode_info *info, pgoff_t index) 911 struct shmem_inode_info *info, pgoff_t index)
912 { 912 {
913 struct vm_area_struct pvma; 913 struct vm_area_struct pvma;
914 struct page *page; 914 struct page *page;
915 915
916 /* Create a pseudo vma that just contains the policy */ 916 /* Create a pseudo vma that just contains the policy */
917 pvma.vm_start = 0; 917 pvma.vm_start = 0;
918 /* Bias interleave by inode number to distribute better across nodes */ 918 /* Bias interleave by inode number to distribute better across nodes */
919 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 919 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
920 pvma.vm_ops = NULL; 920 pvma.vm_ops = NULL;
921 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 921 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
922 922
923 page = swapin_readahead(swap, gfp, &pvma, 0); 923 page = swapin_readahead(swap, gfp, &pvma, 0);
924 924
925 /* Drop reference taken by mpol_shared_policy_lookup() */ 925 /* Drop reference taken by mpol_shared_policy_lookup() */
926 mpol_cond_put(pvma.vm_policy); 926 mpol_cond_put(pvma.vm_policy);
927 927
928 return page; 928 return page;
929 } 929 }
930 930
931 static struct page *shmem_alloc_page(gfp_t gfp, 931 static struct page *shmem_alloc_page(gfp_t gfp,
932 struct shmem_inode_info *info, pgoff_t index) 932 struct shmem_inode_info *info, pgoff_t index)
933 { 933 {
934 struct vm_area_struct pvma; 934 struct vm_area_struct pvma;
935 struct page *page; 935 struct page *page;
936 936
937 /* Create a pseudo vma that just contains the policy */ 937 /* Create a pseudo vma that just contains the policy */
938 pvma.vm_start = 0; 938 pvma.vm_start = 0;
939 /* Bias interleave by inode number to distribute better across nodes */ 939 /* Bias interleave by inode number to distribute better across nodes */
940 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 940 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
941 pvma.vm_ops = NULL; 941 pvma.vm_ops = NULL;
942 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 942 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
943 943
944 page = alloc_page_vma(gfp, &pvma, 0); 944 page = alloc_page_vma(gfp, &pvma, 0);
945 945
946 /* Drop reference taken by mpol_shared_policy_lookup() */ 946 /* Drop reference taken by mpol_shared_policy_lookup() */
947 mpol_cond_put(pvma.vm_policy); 947 mpol_cond_put(pvma.vm_policy);
948 948
949 return page; 949 return page;
950 } 950 }
951 #else /* !CONFIG_NUMA */ 951 #else /* !CONFIG_NUMA */
952 #ifdef CONFIG_TMPFS 952 #ifdef CONFIG_TMPFS
953 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 953 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
954 { 954 {
955 } 955 }
956 #endif /* CONFIG_TMPFS */ 956 #endif /* CONFIG_TMPFS */
957 957
958 static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 958 static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
959 struct shmem_inode_info *info, pgoff_t index) 959 struct shmem_inode_info *info, pgoff_t index)
960 { 960 {
961 return swapin_readahead(swap, gfp, NULL, 0); 961 return swapin_readahead(swap, gfp, NULL, 0);
962 } 962 }
963 963
964 static inline struct page *shmem_alloc_page(gfp_t gfp, 964 static inline struct page *shmem_alloc_page(gfp_t gfp,
965 struct shmem_inode_info *info, pgoff_t index) 965 struct shmem_inode_info *info, pgoff_t index)
966 { 966 {
967 return alloc_page(gfp); 967 return alloc_page(gfp);
968 } 968 }
969 #endif /* CONFIG_NUMA */ 969 #endif /* CONFIG_NUMA */
970 970
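Both NUMA allocators above bias the pseudo-vma's vm_pgoff by i_ino so that MPOL_INTERLEAVE does not start every inode's page 0 on the same node. Under the simplified assumption that interleave picks node (pgoff % nr_nodes), a stand-in for the real policy code, the spreading effect is easy to see:

#include <stdio.h>

int main(void)
{
	unsigned long nr_nodes = 4;
	unsigned long inodes[] = { 100, 101 };  /* hypothetical i_ino values */

	for (int f = 0; f < 2; f++) {
		printf("ino %lu:", inodes[f]);
		for (unsigned long index = 0; index < 6; index++) {
			/* pvma.vm_pgoff = index + info->vfs_inode.i_ino */
			unsigned long pgoff = index + inodes[f];
			printf(" %lu", pgoff % nr_nodes);
		}
		printf("\n");
	}
	/* Without the i_ino bias, every file's page 0 would land on node 0. */
	return 0;
}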
971 #if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) 971 #if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
972 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 972 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
973 { 973 {
974 return NULL; 974 return NULL;
975 } 975 }
976 #endif 976 #endif
977 977
978 /* 978 /*
979 * When a page is moved from swapcache to shmem filecache (either by the 979 * When a page is moved from swapcache to shmem filecache (either by the
980 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of 980 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
981 * shmem_unuse_inode()), it may have been read in earlier from swap, in 981 * shmem_unuse_inode()), it may have been read in earlier from swap, in
982 * ignorance of the mapping it belongs to. If that mapping has special 982 * ignorance of the mapping it belongs to. If that mapping has special
983 * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 983 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
984 * we may need to copy to a suitable page before moving to filecache. 984 * we may need to copy to a suitable page before moving to filecache.
985 * 985 *
986 * In a future release, this may well be extended to respect cpuset and 986 * In a future release, this may well be extended to respect cpuset and
987 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 987 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
988 * but for now it is a simple matter of zone. 988 * but for now it is a simple matter of zone.
989 */ 989 */
990 static bool shmem_should_replace_page(struct page *page, gfp_t gfp) 990 static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
991 { 991 {
992 return page_zonenum(page) > gfp_zone(gfp); 992 return page_zonenum(page) > gfp_zone(gfp);
993 } 993 }
994 994
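shmem_should_replace_page() is a single comparison: a page sitting in a higher zone than the gfp mask permits must be copied down. A toy model with a stub zone ordering mirroring the kernel's ascending zone numbers; both the enum and should_replace() here are simplified stand-ins, not kernel API:

#include <stdbool.h>
#include <stdio.h>

enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL };

/* Toy: the zone a page is in vs the highest zone a gfp mask allows. */
static bool should_replace(enum zone page_zone, enum zone gfp_highest)
{
	return page_zone > gfp_highest;   /* same test as the kernel's */
}

int main(void)
{
	/* gma500-style constraint: allocations must come from below 4GB */
	enum zone gfp_highest = ZONE_DMA32;

	printf("%d\n", should_replace(ZONE_NORMAL, gfp_highest)); /* 1: copy */
	printf("%d\n", should_replace(ZONE_DMA32,  gfp_highest)); /* 0: keep */
	return 0;
}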
995 static int shmem_replace_page(struct page **pagep, gfp_t gfp, 995 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
996 struct shmem_inode_info *info, pgoff_t index) 996 struct shmem_inode_info *info, pgoff_t index)
997 { 997 {
998 struct page *oldpage, *newpage; 998 struct page *oldpage, *newpage;
999 struct address_space *swap_mapping; 999 struct address_space *swap_mapping;
1000 pgoff_t swap_index; 1000 pgoff_t swap_index;
1001 int error; 1001 int error;
1002 1002
1003 oldpage = *pagep; 1003 oldpage = *pagep;
1004 swap_index = page_private(oldpage); 1004 swap_index = page_private(oldpage);
1005 swap_mapping = page_mapping(oldpage); 1005 swap_mapping = page_mapping(oldpage);
1006 1006
1007 /* 1007 /*
1008 * We have arrived here because our zones are constrained, so don't 1008 * We have arrived here because our zones are constrained, so don't
1009 * limit chance of success by further cpuset and node constraints. 1009 * limit chance of success by further cpuset and node constraints.
1010 */ 1010 */
1011 gfp &= ~GFP_CONSTRAINT_MASK; 1011 gfp &= ~GFP_CONSTRAINT_MASK;
1012 newpage = shmem_alloc_page(gfp, info, index); 1012 newpage = shmem_alloc_page(gfp, info, index);
1013 if (!newpage) 1013 if (!newpage)
1014 return -ENOMEM; 1014 return -ENOMEM;
1015 1015
1016 page_cache_get(newpage); 1016 page_cache_get(newpage);
1017 copy_highpage(newpage, oldpage); 1017 copy_highpage(newpage, oldpage);
1018 flush_dcache_page(newpage); 1018 flush_dcache_page(newpage);
1019 1019
1020 __set_page_locked(newpage); 1020 __set_page_locked(newpage);
1021 SetPageUptodate(newpage); 1021 SetPageUptodate(newpage);
1022 SetPageSwapBacked(newpage); 1022 SetPageSwapBacked(newpage);
1023 set_page_private(newpage, swap_index); 1023 set_page_private(newpage, swap_index);
1024 SetPageSwapCache(newpage); 1024 SetPageSwapCache(newpage);
1025 1025
1026 /* 1026 /*
1027 * Our caller will very soon move newpage out of swapcache, but it's 1027 * Our caller will very soon move newpage out of swapcache, but it's
1028 * a nice clean interface for us to replace oldpage by newpage there. 1028 * a nice clean interface for us to replace oldpage by newpage there.
1029 */ 1029 */
1030 spin_lock_irq(&swap_mapping->tree_lock); 1030 spin_lock_irq(&swap_mapping->tree_lock);
1031 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, 1031 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1032 newpage); 1032 newpage);
1033 if (!error) { 1033 if (!error) {
1034 __inc_zone_page_state(newpage, NR_FILE_PAGES); 1034 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1035 __dec_zone_page_state(oldpage, NR_FILE_PAGES); 1035 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1036 } 1036 }
1037 spin_unlock_irq(&swap_mapping->tree_lock); 1037 spin_unlock_irq(&swap_mapping->tree_lock);
1038 1038
1039 if (unlikely(error)) { 1039 if (unlikely(error)) {
1040 /* 1040 /*
1041 * Is this possible? I think not, now that our callers check 1041 * Is this possible? I think not, now that our callers check
1042 * both PageSwapCache and page_private after getting page lock; 1042 * both PageSwapCache and page_private after getting page lock;
1043 * but be defensive. Reverse old to newpage for clear and free. 1043 * but be defensive. Reverse old to newpage for clear and free.
1044 */ 1044 */
1045 oldpage = newpage; 1045 oldpage = newpage;
1046 } else { 1046 } else {
1047 mem_cgroup_replace_page_cache(oldpage, newpage); 1047 mem_cgroup_replace_page_cache(oldpage, newpage);
1048 lru_cache_add_anon(newpage); 1048 lru_cache_add_anon(newpage);
1049 *pagep = newpage; 1049 *pagep = newpage;
1050 } 1050 }
1051 1051
1052 ClearPageSwapCache(oldpage); 1052 ClearPageSwapCache(oldpage);
1053 set_page_private(oldpage, 0); 1053 set_page_private(oldpage, 0);
1054 1054
1055 unlock_page(oldpage); 1055 unlock_page(oldpage);
1056 page_cache_release(oldpage); 1056 page_cache_release(oldpage);
1057 page_cache_release(oldpage); 1057 page_cache_release(oldpage);
1058 return error; 1058 return error;
1059 } 1059 }
1060 1060
1061 /* 1061 /*
1062 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1062 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1063 * 1063 *
1064 * If we allocate a new one we do not mark it dirty. That's up to the 1064 * If we allocate a new one we do not mark it dirty. That's up to the
1065 * vm. If we swap it in we mark it dirty, since we also free the swap 1065 * vm. If we swap it in we mark it dirty, since we also free the swap
1066 * entry: a page cannot live in both the swap cache and page cache. 1066 * entry: a page cannot live in both the swap cache and page cache.
1067 */ 1067 */
1068 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 1068 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1069 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) 1069 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1070 { 1070 {
1071 struct address_space *mapping = inode->i_mapping; 1071 struct address_space *mapping = inode->i_mapping;
1072 struct shmem_inode_info *info; 1072 struct shmem_inode_info *info;
1073 struct shmem_sb_info *sbinfo; 1073 struct shmem_sb_info *sbinfo;
1074 struct page *page; 1074 struct page *page;
1075 swp_entry_t swap; 1075 swp_entry_t swap;
1076 int error; 1076 int error;
1077 int once = 0; 1077 int once = 0;
1078 int alloced = 0; 1078 int alloced = 0;
1079 1079
1080 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 1080 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1081 return -EFBIG; 1081 return -EFBIG;
1082 repeat: 1082 repeat:
1083 swap.val = 0; 1083 swap.val = 0;
1084 page = find_lock_page(mapping, index); 1084 page = find_lock_page(mapping, index);
1085 if (radix_tree_exceptional_entry(page)) { 1085 if (radix_tree_exceptional_entry(page)) {
1086 swap = radix_to_swp_entry(page); 1086 swap = radix_to_swp_entry(page);
1087 page = NULL; 1087 page = NULL;
1088 } 1088 }
1089 1089
1090 if (sgp != SGP_WRITE && sgp != SGP_FALLOC && 1090 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1091 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1091 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1092 error = -EINVAL; 1092 error = -EINVAL;
1093 goto failed; 1093 goto failed;
1094 } 1094 }
1095 1095
1096 /* fallocated page? */ 1096 /* fallocated page? */
1097 if (page && !PageUptodate(page)) { 1097 if (page && !PageUptodate(page)) {
1098 if (sgp != SGP_READ) 1098 if (sgp != SGP_READ)
1099 goto clear; 1099 goto clear;
1100 unlock_page(page); 1100 unlock_page(page);
1101 page_cache_release(page); 1101 page_cache_release(page);
1102 page = NULL; 1102 page = NULL;
1103 } 1103 }
1104 if (page || (sgp == SGP_READ && !swap.val)) { 1104 if (page || (sgp == SGP_READ && !swap.val)) {
1105 *pagep = page; 1105 *pagep = page;
1106 return 0; 1106 return 0;
1107 } 1107 }
1108 1108
1109 /* 1109 /*
1110 * Fast cache lookup did not find it: 1110 * Fast cache lookup did not find it:
1111 * bring it back from swap or allocate. 1111 * bring it back from swap or allocate.
1112 */ 1112 */
1113 info = SHMEM_I(inode); 1113 info = SHMEM_I(inode);
1114 sbinfo = SHMEM_SB(inode->i_sb); 1114 sbinfo = SHMEM_SB(inode->i_sb);
1115 1115
1116 if (swap.val) { 1116 if (swap.val) {
1117 /* Look it up and read it in... */ 1117 /* Look it up and read it in... */
1118 page = lookup_swap_cache(swap); 1118 page = lookup_swap_cache(swap);
1119 if (!page) { 1119 if (!page) {
1120 /* here we actually do the io */ 1120 /* here we actually do the io */
1121 if (fault_type) 1121 if (fault_type)
1122 *fault_type |= VM_FAULT_MAJOR; 1122 *fault_type |= VM_FAULT_MAJOR;
1123 page = shmem_swapin(swap, gfp, info, index); 1123 page = shmem_swapin(swap, gfp, info, index);
1124 if (!page) { 1124 if (!page) {
1125 error = -ENOMEM; 1125 error = -ENOMEM;
1126 goto failed; 1126 goto failed;
1127 } 1127 }
1128 } 1128 }
1129 1129
1130 /* We have to do this with page locked to prevent races */ 1130 /* We have to do this with page locked to prevent races */
1131 lock_page(page); 1131 lock_page(page);
1132 if (!PageSwapCache(page) || page_private(page) != swap.val || 1132 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1133 !shmem_confirm_swap(mapping, index, swap)) { 1133 !shmem_confirm_swap(mapping, index, swap)) {
1134 error = -EEXIST; /* try again */ 1134 error = -EEXIST; /* try again */
1135 goto unlock; 1135 goto unlock;
1136 } 1136 }
1137 if (!PageUptodate(page)) { 1137 if (!PageUptodate(page)) {
1138 error = -EIO; 1138 error = -EIO;
1139 goto failed; 1139 goto failed;
1140 } 1140 }
1141 wait_on_page_writeback(page); 1141 wait_on_page_writeback(page);
1142 1142
1143 if (shmem_should_replace_page(page, gfp)) { 1143 if (shmem_should_replace_page(page, gfp)) {
1144 error = shmem_replace_page(&page, gfp, info, index); 1144 error = shmem_replace_page(&page, gfp, info, index);
1145 if (error) 1145 if (error)
1146 goto failed; 1146 goto failed;
1147 } 1147 }
1148 1148
1149 error = mem_cgroup_cache_charge(page, current->mm, 1149 error = mem_cgroup_cache_charge(page, current->mm,
1150 gfp & GFP_RECLAIM_MASK); 1150 gfp & GFP_RECLAIM_MASK);
1151 if (!error) { 1151 if (!error) {
1152 error = shmem_add_to_page_cache(page, mapping, index, 1152 error = shmem_add_to_page_cache(page, mapping, index,
1153 gfp, swp_to_radix_entry(swap)); 1153 gfp, swp_to_radix_entry(swap));
1154 /* 1154 /*
1155 * We already confirmed swap under page lock, and make 1155 * We already confirmed swap under page lock, and make
1156 * no memory allocation here, so usually no possibility 1156 * no memory allocation here, so usually no possibility
1157 * of error; but free_swap_and_cache() only trylocks a 1157 * of error; but free_swap_and_cache() only trylocks a
1158 * page, so it is just possible that the entry has been 1158 * page, so it is just possible that the entry has been
1159 * truncated or holepunched since swap was confirmed. 1159 * truncated or holepunched since swap was confirmed.
1160 * shmem_undo_range() will have done some of the 1160 * shmem_undo_range() will have done some of the
1161 * unaccounting, now delete_from_swap_cache() will do 1161 * unaccounting, now delete_from_swap_cache() will do
1162 * the rest (including mem_cgroup_uncharge_swapcache). 1162 * the rest (including mem_cgroup_uncharge_swapcache).
1163 * Reset swap.val? No, leave it so "failed" goes back to 1163 * Reset swap.val? No, leave it so "failed" goes back to
1164 * "repeat": reading a hole and writing should succeed. 1164 * "repeat": reading a hole and writing should succeed.
1165 */ 1165 */
1166 if (error) 1166 if (error)
1167 delete_from_swap_cache(page); 1167 delete_from_swap_cache(page);
1168 } 1168 }
1169 if (error) 1169 if (error)
1170 goto failed; 1170 goto failed;
1171 1171
1172 spin_lock(&info->lock); 1172 spin_lock(&info->lock);
1173 info->swapped--; 1173 info->swapped--;
1174 shmem_recalc_inode(inode); 1174 shmem_recalc_inode(inode);
1175 spin_unlock(&info->lock); 1175 spin_unlock(&info->lock);
1176 1176
1177 delete_from_swap_cache(page); 1177 delete_from_swap_cache(page);
1178 set_page_dirty(page); 1178 set_page_dirty(page);
1179 swap_free(swap); 1179 swap_free(swap);
1180 1180
1181 } else { 1181 } else {
1182 if (shmem_acct_block(info->flags)) { 1182 if (shmem_acct_block(info->flags)) {
1183 error = -ENOSPC; 1183 error = -ENOSPC;
1184 goto failed; 1184 goto failed;
1185 } 1185 }
1186 if (sbinfo->max_blocks) { 1186 if (sbinfo->max_blocks) {
1187 if (percpu_counter_compare(&sbinfo->used_blocks, 1187 if (percpu_counter_compare(&sbinfo->used_blocks,
1188 sbinfo->max_blocks) >= 0) { 1188 sbinfo->max_blocks) >= 0) {
1189 error = -ENOSPC; 1189 error = -ENOSPC;
1190 goto unacct; 1190 goto unacct;
1191 } 1191 }
1192 percpu_counter_inc(&sbinfo->used_blocks); 1192 percpu_counter_inc(&sbinfo->used_blocks);
1193 } 1193 }
1194 1194
1195 page = shmem_alloc_page(gfp, info, index); 1195 page = shmem_alloc_page(gfp, info, index);
1196 if (!page) { 1196 if (!page) {
1197 error = -ENOMEM; 1197 error = -ENOMEM;
1198 goto decused; 1198 goto decused;
1199 } 1199 }
1200 1200
1201 SetPageSwapBacked(page); 1201 SetPageSwapBacked(page);
1202 __set_page_locked(page); 1202 __set_page_locked(page);
1203 error = mem_cgroup_cache_charge(page, current->mm, 1203 error = mem_cgroup_cache_charge(page, current->mm,
1204 gfp & GFP_RECLAIM_MASK); 1204 gfp & GFP_RECLAIM_MASK);
1205 if (error) 1205 if (error)
1206 goto decused; 1206 goto decused;
1207 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 1207 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1208 if (!error) { 1208 if (!error) {
1209 error = shmem_add_to_page_cache(page, mapping, index, 1209 error = shmem_add_to_page_cache(page, mapping, index,
1210 gfp, NULL); 1210 gfp, NULL);
1211 radix_tree_preload_end(); 1211 radix_tree_preload_end();
1212 } 1212 }
1213 if (error) { 1213 if (error) {
1214 mem_cgroup_uncharge_cache_page(page); 1214 mem_cgroup_uncharge_cache_page(page);
1215 goto decused; 1215 goto decused;
1216 } 1216 }
1217 lru_cache_add_anon(page); 1217 lru_cache_add_anon(page);
1218 1218
1219 spin_lock(&info->lock); 1219 spin_lock(&info->lock);
1220 info->alloced++; 1220 info->alloced++;
1221 inode->i_blocks += BLOCKS_PER_PAGE; 1221 inode->i_blocks += BLOCKS_PER_PAGE;
1222 shmem_recalc_inode(inode); 1222 shmem_recalc_inode(inode);
1223 spin_unlock(&info->lock); 1223 spin_unlock(&info->lock);
1224 alloced = true; 1224 alloced = true;
1225 1225
1226 /* 1226 /*
1227 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1227 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1228 */ 1228 */
1229 if (sgp == SGP_FALLOC) 1229 if (sgp == SGP_FALLOC)
1230 sgp = SGP_WRITE; 1230 sgp = SGP_WRITE;
1231 clear: 1231 clear:
1232 /* 1232 /*
1233 * Let SGP_WRITE caller clear ends if write does not fill page; 1233 * Let SGP_WRITE caller clear ends if write does not fill page;
1234 * but SGP_FALLOC on a page fallocated earlier must initialize 1234 * but SGP_FALLOC on a page fallocated earlier must initialize
1235 * it now, lest undo on failure cancel our earlier guarantee. 1235 * it now, lest undo on failure cancel our earlier guarantee.
1236 */ 1236 */
1237 if (sgp != SGP_WRITE) { 1237 if (sgp != SGP_WRITE) {
1238 clear_highpage(page); 1238 clear_highpage(page);
1239 flush_dcache_page(page); 1239 flush_dcache_page(page);
1240 SetPageUptodate(page); 1240 SetPageUptodate(page);
1241 } 1241 }
1242 if (sgp == SGP_DIRTY) 1242 if (sgp == SGP_DIRTY)
1243 set_page_dirty(page); 1243 set_page_dirty(page);
1244 } 1244 }
1245 1245
1246 /* Perhaps the file has been truncated since we checked */ 1246 /* Perhaps the file has been truncated since we checked */
1247 if (sgp != SGP_WRITE && sgp != SGP_FALLOC && 1247 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1248 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1248 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1249 error = -EINVAL; 1249 error = -EINVAL;
1250 if (alloced) 1250 if (alloced)
1251 goto trunc; 1251 goto trunc;
1252 else 1252 else
1253 goto failed; 1253 goto failed;
1254 } 1254 }
1255 *pagep = page; 1255 *pagep = page;
1256 return 0; 1256 return 0;
1257 1257
1258 /* 1258 /*
1259 * Error recovery. 1259 * Error recovery.
1260 */ 1260 */
1261 trunc: 1261 trunc:
1262 info = SHMEM_I(inode); 1262 info = SHMEM_I(inode);
1263 ClearPageDirty(page); 1263 ClearPageDirty(page);
1264 delete_from_page_cache(page); 1264 delete_from_page_cache(page);
1265 spin_lock(&info->lock); 1265 spin_lock(&info->lock);
1266 info->alloced--; 1266 info->alloced--;
1267 inode->i_blocks -= BLOCKS_PER_PAGE; 1267 inode->i_blocks -= BLOCKS_PER_PAGE;
1268 spin_unlock(&info->lock); 1268 spin_unlock(&info->lock);
1269 decused: 1269 decused:
1270 sbinfo = SHMEM_SB(inode->i_sb); 1270 sbinfo = SHMEM_SB(inode->i_sb);
1271 if (sbinfo->max_blocks) 1271 if (sbinfo->max_blocks)
1272 percpu_counter_add(&sbinfo->used_blocks, -1); 1272 percpu_counter_add(&sbinfo->used_blocks, -1);
1273 unacct: 1273 unacct:
1274 shmem_unacct_blocks(info->flags, 1); 1274 shmem_unacct_blocks(info->flags, 1);
1275 failed: 1275 failed:
1276 if (swap.val && error != -EINVAL && 1276 if (swap.val && error != -EINVAL &&
1277 !shmem_confirm_swap(mapping, index, swap)) 1277 !shmem_confirm_swap(mapping, index, swap))
1278 error = -EEXIST; 1278 error = -EEXIST;
1279 unlock: 1279 unlock:
1280 if (page) { 1280 if (page) {
1281 unlock_page(page); 1281 unlock_page(page);
1282 page_cache_release(page); 1282 page_cache_release(page);
1283 } 1283 }
1284 if (error == -ENOSPC && !once++) { 1284 if (error == -ENOSPC && !once++) {
1285 info = SHMEM_I(inode); 1285 info = SHMEM_I(inode);
1286 spin_lock(&info->lock); 1286 spin_lock(&info->lock);
1287 shmem_recalc_inode(inode); 1287 shmem_recalc_inode(inode);
1288 spin_unlock(&info->lock); 1288 spin_unlock(&info->lock);
1289 goto repeat; 1289 goto repeat;
1290 } 1290 }
1291 if (error == -EEXIST) /* from above or from radix_tree_insert */ 1291 if (error == -EEXIST) /* from above or from radix_tree_insert */
1292 goto repeat; 1292 goto repeat;
1293 return error; 1293 return error;
1294 } 1294 }
1295 1295
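The tail of shmem_getpage_gfp() loops back through "repeat" in two cases: once only after -ENOSPC, hoping shmem_recalc_inode() released blocks, and unconditionally on -EEXIST, which means the radix-tree slot changed under us. A compact sketch of that retry shape; get_page_once() is a hypothetical stand-in for a single lookup pass:

#include <errno.h>
#include <stdio.h>

/* Hypothetical single attempt: fails twice with -EEXIST, then succeeds. */
static int get_page_once(void)
{
	static int calls;
	return (calls++ < 2) ? -EEXIST : 0;
}

int main(void)
{
	int once = 0;
	int error;
repeat:
	error = get_page_once();
	if (error == -ENOSPC && !once++)
		goto repeat;    /* one retry after recalculating usage */
	if (error == -EEXIST)
		goto repeat;    /* raced with truncate/swap: look up again */
	printf("done: error=%d\n", error);
	return 0;
}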
1296 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1296 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1297 { 1297 {
1298 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1298 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1299 int error; 1299 int error;
1300 int ret = VM_FAULT_LOCKED; 1300 int ret = VM_FAULT_LOCKED;
1301 1301
1302 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1302 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1303 if (error) 1303 if (error)
1304 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1304 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1305 1305
1306 if (ret & VM_FAULT_MAJOR) { 1306 if (ret & VM_FAULT_MAJOR) {
1307 count_vm_event(PGMAJFAULT); 1307 count_vm_event(PGMAJFAULT);
1308 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1308 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1309 } 1309 }
1310 return ret; 1310 return ret;
1311 } 1311 }
1312 1312
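Seen from userspace, shmem_fault() runs on the first touch of a mapped tmpfs page. A minimal demo, assuming a tmpfs mount at /dev/shm (standard on most Linux systems) and a hypothetical file name:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/demo", O_RDWR | O_CREAT, 0600);
	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* First touch of the page faults into shmem_fault() in the kernel */
	strcpy(p, "hello tmpfs");
	printf("%s\n", p);

	munmap(p, 4096);
	close(fd);
	unlink("/dev/shm/demo");
	return 0;
}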
1313 #ifdef CONFIG_NUMA 1313 #ifdef CONFIG_NUMA
1314 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 1314 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1315 { 1315 {
1316 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1316 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1317 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 1317 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1318 } 1318 }
1319 1319
1320 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1320 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1321 unsigned long addr) 1321 unsigned long addr)
1322 { 1322 {
1323 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1323 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1324 pgoff_t index; 1324 pgoff_t index;
1325 1325
1326 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1326 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1327 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 1327 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1328 } 1328 }
1329 #endif 1329 #endif
1330 1330
1331 int shmem_lock(struct file *file, int lock, struct user_struct *user) 1331 int shmem_lock(struct file *file, int lock, struct user_struct *user)
1332 { 1332 {
1333 struct inode *inode = file->f_path.dentry->d_inode; 1333 struct inode *inode = file->f_path.dentry->d_inode;
1334 struct shmem_inode_info *info = SHMEM_I(inode); 1334 struct shmem_inode_info *info = SHMEM_I(inode);
1335 int retval = -ENOMEM; 1335 int retval = -ENOMEM;
1336 1336
1337 spin_lock(&info->lock); 1337 spin_lock(&info->lock);
1338 if (lock && !(info->flags & VM_LOCKED)) { 1338 if (lock && !(info->flags & VM_LOCKED)) {
1339 if (!user_shm_lock(inode->i_size, user)) 1339 if (!user_shm_lock(inode->i_size, user))
1340 goto out_nomem; 1340 goto out_nomem;
1341 info->flags |= VM_LOCKED; 1341 info->flags |= VM_LOCKED;
1342 mapping_set_unevictable(file->f_mapping); 1342 mapping_set_unevictable(file->f_mapping);
1343 } 1343 }
1344 if (!lock && (info->flags & VM_LOCKED) && user) { 1344 if (!lock && (info->flags & VM_LOCKED) && user) {
1345 user_shm_unlock(inode->i_size, user); 1345 user_shm_unlock(inode->i_size, user);
1346 info->flags &= ~VM_LOCKED; 1346 info->flags &= ~VM_LOCKED;
1347 mapping_clear_unevictable(file->f_mapping); 1347 mapping_clear_unevictable(file->f_mapping);
1348 } 1348 }
1349 retval = 0; 1349 retval = 0;
1350 1350
1351 out_nomem: 1351 out_nomem:
1352 spin_unlock(&info->lock); 1352 spin_unlock(&info->lock);
1353 return retval; 1353 return retval;
1354 } 1354 }
1355 1355
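shmem_lock() is reached from the SysV IPC path when userspace calls shmctl(SHM_LOCK), pinning the segment's pages (subject to RLIMIT_MEMLOCK) and marking the mapping unevictable. A minimal caller:

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	/* Private 64KB SysV shared memory segment */
	int id = shmget(IPC_PRIVATE, 64 * 1024, IPC_CREAT | 0600);
	if (id < 0) {
		perror("shmget");
		return 1;
	}

	/* Ends up in shmem_lock(file, 1, user): pages become unevictable */
	if (shmctl(id, SHM_LOCK, NULL) < 0)
		perror("shmctl(SHM_LOCK)");   /* may fail on RLIMIT_MEMLOCK */
	else if (shmctl(id, SHM_UNLOCK, NULL) < 0)
		perror("shmctl(SHM_UNLOCK)");

	shmctl(id, IPC_RMID, NULL);
	return 0;
}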
1356 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1356 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1357 { 1357 {
1358 file_accessed(file); 1358 file_accessed(file);
1359 vma->vm_ops = &shmem_vm_ops; 1359 vma->vm_ops = &shmem_vm_ops;
1360 return 0; 1360 return 0;
1361 } 1361 }
1362 1362
1363 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, 1363 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
1364 umode_t mode, dev_t dev, unsigned long flags) 1364 umode_t mode, dev_t dev, unsigned long flags)
1365 { 1365 {
1366 struct inode *inode; 1366 struct inode *inode;
1367 struct shmem_inode_info *info; 1367 struct shmem_inode_info *info;
1368 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1368 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1369 1369
1370 if (shmem_reserve_inode(sb)) 1370 if (shmem_reserve_inode(sb))
1371 return NULL; 1371 return NULL;
1372 1372
1373 inode = new_inode(sb); 1373 inode = new_inode(sb);
1374 if (inode) { 1374 if (inode) {
1375 inode->i_ino = get_next_ino(); 1375 inode->i_ino = get_next_ino();
1376 inode_init_owner(inode, dir, mode); 1376 inode_init_owner(inode, dir, mode);
1377 inode->i_blocks = 0; 1377 inode->i_blocks = 0;
1378 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1378 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1379 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1379 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1380 inode->i_generation = get_seconds(); 1380 inode->i_generation = get_seconds();
1381 info = SHMEM_I(inode); 1381 info = SHMEM_I(inode);
1382 memset(info, 0, (char *)inode - (char *)info); 1382 memset(info, 0, (char *)inode - (char *)info);
1383 spin_lock_init(&info->lock); 1383 spin_lock_init(&info->lock);
1384 info->flags = flags & VM_NORESERVE; 1384 info->flags = flags & VM_NORESERVE;
1385 INIT_LIST_HEAD(&info->swaplist); 1385 INIT_LIST_HEAD(&info->swaplist);
1386 simple_xattrs_init(&info->xattrs); 1386 simple_xattrs_init(&info->xattrs);
1387 cache_no_acl(inode); 1387 cache_no_acl(inode);
1388 1388
1389 switch (mode & S_IFMT) { 1389 switch (mode & S_IFMT) {
1390 default: 1390 default:
1391 inode->i_op = &shmem_special_inode_operations; 1391 inode->i_op = &shmem_special_inode_operations;
1392 init_special_inode(inode, mode, dev); 1392 init_special_inode(inode, mode, dev);
1393 break; 1393 break;
1394 case S_IFREG: 1394 case S_IFREG:
1395 inode->i_mapping->a_ops = &shmem_aops; 1395 inode->i_mapping->a_ops = &shmem_aops;
1396 inode->i_op = &shmem_inode_operations; 1396 inode->i_op = &shmem_inode_operations;
1397 inode->i_fop = &shmem_file_operations; 1397 inode->i_fop = &shmem_file_operations;
1398 mpol_shared_policy_init(&info->policy, 1398 mpol_shared_policy_init(&info->policy,
1399 shmem_get_sbmpol(sbinfo)); 1399 shmem_get_sbmpol(sbinfo));
1400 break; 1400 break;
1401 case S_IFDIR: 1401 case S_IFDIR:
1402 inc_nlink(inode); 1402 inc_nlink(inode);
1403 /* Some things misbehave if size == 0 on a directory */ 1403 /* Some things misbehave if size == 0 on a directory */
1404 inode->i_size = 2 * BOGO_DIRENT_SIZE; 1404 inode->i_size = 2 * BOGO_DIRENT_SIZE;
1405 inode->i_op = &shmem_dir_inode_operations; 1405 inode->i_op = &shmem_dir_inode_operations;
1406 inode->i_fop = &simple_dir_operations; 1406 inode->i_fop = &simple_dir_operations;
1407 break; 1407 break;
1408 case S_IFLNK: 1408 case S_IFLNK:
1409 /* 1409 /*
1410 * Must not load anything in the rbtree, 1410 * Must not load anything in the rbtree,
1411 * mpol_free_shared_policy will not be called. 1411 * mpol_free_shared_policy will not be called.
1412 */ 1412 */
1413 mpol_shared_policy_init(&info->policy, NULL); 1413 mpol_shared_policy_init(&info->policy, NULL);
1414 break; 1414 break;
1415 } 1415 }
1416 } else 1416 } else
1417 shmem_free_inode(sb); 1417 shmem_free_inode(sb);
1418 return inode; 1418 return inode;
1419 } 1419 }
1420 1420
1421 #ifdef CONFIG_TMPFS 1421 #ifdef CONFIG_TMPFS
1422 static const struct inode_operations shmem_symlink_inode_operations; 1422 static const struct inode_operations shmem_symlink_inode_operations;
1423 static const struct inode_operations shmem_short_symlink_operations; 1423 static const struct inode_operations shmem_short_symlink_operations;
1424 1424
1425 #ifdef CONFIG_TMPFS_XATTR 1425 #ifdef CONFIG_TMPFS_XATTR
1426 static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 1426 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1427 #else 1427 #else
1428 #define shmem_initxattrs NULL 1428 #define shmem_initxattrs NULL
1429 #endif 1429 #endif
1430 1430
1431 static int 1431 static int
1432 shmem_write_begin(struct file *file, struct address_space *mapping, 1432 shmem_write_begin(struct file *file, struct address_space *mapping,
1433 loff_t pos, unsigned len, unsigned flags, 1433 loff_t pos, unsigned len, unsigned flags,
1434 struct page **pagep, void **fsdata) 1434 struct page **pagep, void **fsdata)
1435 { 1435 {
1436 struct inode *inode = mapping->host; 1436 struct inode *inode = mapping->host;
1437 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1437 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1438 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1438 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1439 } 1439 }
1440 1440
1441 static int 1441 static int
1442 shmem_write_end(struct file *file, struct address_space *mapping, 1442 shmem_write_end(struct file *file, struct address_space *mapping,
1443 loff_t pos, unsigned len, unsigned copied, 1443 loff_t pos, unsigned len, unsigned copied,
1444 struct page *page, void *fsdata) 1444 struct page *page, void *fsdata)
1445 { 1445 {
1446 struct inode *inode = mapping->host; 1446 struct inode *inode = mapping->host;
1447 1447
1448 if (pos + copied > inode->i_size) 1448 if (pos + copied > inode->i_size)
1449 i_size_write(inode, pos + copied); 1449 i_size_write(inode, pos + copied);
1450 1450
1451 if (!PageUptodate(page)) { 1451 if (!PageUptodate(page)) {
1452 if (copied < PAGE_CACHE_SIZE) { 1452 if (copied < PAGE_CACHE_SIZE) {
1453 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1453 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1454 zero_user_segments(page, 0, from, 1454 zero_user_segments(page, 0, from,
1455 from + copied, PAGE_CACHE_SIZE); 1455 from + copied, PAGE_CACHE_SIZE);
1456 } 1456 }
1457 SetPageUptodate(page); 1457 SetPageUptodate(page);
1458 } 1458 }
1459 set_page_dirty(page); 1459 set_page_dirty(page);
1460 unlock_page(page); 1460 unlock_page(page);
1461 page_cache_release(page); 1461 page_cache_release(page);
1462 1462
1463 return copied; 1463 return copied;
1464 } 1464 }
1465 1465
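In the !PageUptodate branch of shmem_write_end(), a short write must zero the two slices of the page it did not cover: [0, from) and [from + copied, PAGE_CACHE_SIZE). A quick standalone check of that arithmetic, assuming 4096-byte pages and hypothetical pos/copied values:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long pos = 8300, copied = 100;   /* hypothetical write */

	unsigned long from = pos & (PAGE_SIZE - 1);
	printf("zero [0, %lu) and [%lu, %lu)\n",
	       from, from + copied, PAGE_SIZE);
	/* pos 8300 -> from 108: zero [0,108) and [208,4096) */
	return 0;
}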
1466 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1466 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1467 { 1467 {
1468 struct inode *inode = filp->f_path.dentry->d_inode; 1468 struct inode *inode = filp->f_path.dentry->d_inode;
1469 struct address_space *mapping = inode->i_mapping; 1469 struct address_space *mapping = inode->i_mapping;
1470 pgoff_t index; 1470 pgoff_t index;
1471 unsigned long offset; 1471 unsigned long offset;
1472 enum sgp_type sgp = SGP_READ; 1472 enum sgp_type sgp = SGP_READ;
1473 1473
1474 /* 1474 /*
1475 * Might this read be for a stacking filesystem? Then when reading 1475 * Might this read be for a stacking filesystem? Then when reading
1476 * holes of a sparse file, we actually need to allocate those pages, 1476 * holes of a sparse file, we actually need to allocate those pages,
1477 * and even mark them dirty, so it cannot exceed the max_blocks limit. 1477 * and even mark them dirty, so it cannot exceed the max_blocks limit.
1478 */ 1478 */
1479 if (segment_eq(get_fs(), KERNEL_DS)) 1479 if (segment_eq(get_fs(), KERNEL_DS))
1480 sgp = SGP_DIRTY; 1480 sgp = SGP_DIRTY;
1481 1481
1482 index = *ppos >> PAGE_CACHE_SHIFT; 1482 index = *ppos >> PAGE_CACHE_SHIFT;
1483 offset = *ppos & ~PAGE_CACHE_MASK; 1483 offset = *ppos & ~PAGE_CACHE_MASK;
1484 1484
1485 for (;;) { 1485 for (;;) {
1486 struct page *page = NULL; 1486 struct page *page = NULL;
1487 pgoff_t end_index; 1487 pgoff_t end_index;
1488 unsigned long nr, ret; 1488 unsigned long nr, ret;
1489 loff_t i_size = i_size_read(inode); 1489 loff_t i_size = i_size_read(inode);
1490 1490
1491 end_index = i_size >> PAGE_CACHE_SHIFT; 1491 end_index = i_size >> PAGE_CACHE_SHIFT;
1492 if (index > end_index) 1492 if (index > end_index)
1493 break; 1493 break;
1494 if (index == end_index) { 1494 if (index == end_index) {
1495 nr = i_size & ~PAGE_CACHE_MASK; 1495 nr = i_size & ~PAGE_CACHE_MASK;
1496 if (nr <= offset) 1496 if (nr <= offset)
1497 break; 1497 break;
1498 } 1498 }
1499 1499
1500 desc->error = shmem_getpage(inode, index, &page, sgp, NULL); 1500 desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
1501 if (desc->error) { 1501 if (desc->error) {
1502 if (desc->error == -EINVAL) 1502 if (desc->error == -EINVAL)
1503 desc->error = 0; 1503 desc->error = 0;
1504 break; 1504 break;
1505 } 1505 }
1506 if (page) 1506 if (page)
1507 unlock_page(page); 1507 unlock_page(page);
1508 1508
1509 /* 1509 /*
1510 * We must evaluate i_size afterwards, since reads (unlike writes) 1510 * We must evaluate i_size afterwards, since reads (unlike writes)
1511 * are called without i_mutex protection against truncate. 1511 * are called without i_mutex protection against truncate.
1512 */ 1512 */
1513 nr = PAGE_CACHE_SIZE; 1513 nr = PAGE_CACHE_SIZE;
1514 i_size = i_size_read(inode); 1514 i_size = i_size_read(inode);
1515 end_index = i_size >> PAGE_CACHE_SHIFT; 1515 end_index = i_size >> PAGE_CACHE_SHIFT;
1516 if (index == end_index) { 1516 if (index == end_index) {
1517 nr = i_size & ~PAGE_CACHE_MASK; 1517 nr = i_size & ~PAGE_CACHE_MASK;
1518 if (nr <= offset) { 1518 if (nr <= offset) {
1519 if (page) 1519 if (page)
1520 page_cache_release(page); 1520 page_cache_release(page);
1521 break; 1521 break;
1522 } 1522 }
1523 } 1523 }
1524 nr -= offset; 1524 nr -= offset;
1525 1525
1526 if (page) { 1526 if (page) {
1527 /* 1527 /*
1528 * If users can be writing to this page using arbitrary 1528 * If users can be writing to this page using arbitrary
1529 * virtual addresses, take care about potential aliasing 1529 * virtual addresses, take care about potential aliasing
1530 * before reading the page on the kernel side. 1530 * before reading the page on the kernel side.
1531 */ 1531 */
1532 if (mapping_writably_mapped(mapping)) 1532 if (mapping_writably_mapped(mapping))
1533 flush_dcache_page(page); 1533 flush_dcache_page(page);
1534 /* 1534 /*
1535 * Mark the page accessed if we read the beginning. 1535 * Mark the page accessed if we read the beginning.
1536 */ 1536 */
1537 if (!offset) 1537 if (!offset)
1538 mark_page_accessed(page); 1538 mark_page_accessed(page);
1539 } else { 1539 } else {
1540 page = ZERO_PAGE(0); 1540 page = ZERO_PAGE(0);
1541 page_cache_get(page); 1541 page_cache_get(page);
1542 } 1542 }
1543 1543
1544 /* 1544 /*
1545 * Ok, we have the page, and it's up-to-date, so 1545 * Ok, we have the page, and it's up-to-date, so
1546 * now we can copy it to user space... 1546 * now we can copy it to user space...
1547 * 1547 *
1548 * The actor routine returns how many bytes were actually used. 1548 * The actor routine returns how many bytes were actually used.
1549 * NOTE! This may not be the same as how much of a user buffer 1549 * NOTE! This may not be the same as how much of a user buffer
1550 * we filled up (we may be padding etc), so we can only update 1550 * we filled up (we may be padding etc), so we can only update
1551 * "pos" here (the actor routine has to update the user buffer 1551 * "pos" here (the actor routine has to update the user buffer
1552 * pointers and the remaining count). 1552 * pointers and the remaining count).
1553 */ 1553 */
1554 ret = actor(desc, page, offset, nr); 1554 ret = actor(desc, page, offset, nr);
1555 offset += ret; 1555 offset += ret;
1556 index += offset >> PAGE_CACHE_SHIFT; 1556 index += offset >> PAGE_CACHE_SHIFT;
1557 offset &= ~PAGE_CACHE_MASK; 1557 offset &= ~PAGE_CACHE_MASK;
1558 1558
1559 page_cache_release(page); 1559 page_cache_release(page);
1560 if (ret != nr || !desc->count) 1560 if (ret != nr || !desc->count)
1561 break; 1561 break;
1562 1562
1563 cond_resched(); 1563 cond_resched();
1564 } 1564 }
1565 1565
1566 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1566 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1567 file_accessed(filp); 1567 file_accessed(filp);
1568 } 1568 }
1569 1569
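All the position bookkeeping in do_shmem_file_read() is shift-and-mask: index is *ppos's page number, offset the byte within that page, and after the actor consumes ret bytes any overflow past a page boundary is carried into index. A standalone check of those three lines, taking PAGE_CACHE_SHIFT as 12:

#include <stdio.h>

#define SHIFT 12
#define MASK  (~((1UL << SHIFT) - 1))

int main(void)
{
	unsigned long long ppos = 12000;   /* hypothetical file position */
	unsigned long index  = ppos >> SHIFT;    /* page 2 */
	unsigned long offset = ppos & ~MASK;     /* byte 3808 within it */

	unsigned long ret = 500;           /* bytes the actor consumed */
	offset += ret;
	index  += offset >> SHIFT;         /* carry whole pages */
	offset &= ~MASK;

	printf("index=%lu offset=%lu\n", index, offset);
	/* 12000 + 500 = 12500 -> index 3, offset 212 */
	return 0;
}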
1570 static ssize_t shmem_file_aio_read(struct kiocb *iocb, 1570 static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1571 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 1571 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1572 { 1572 {
1573 struct file *filp = iocb->ki_filp; 1573 struct file *filp = iocb->ki_filp;
1574 ssize_t retval; 1574 ssize_t retval;
1575 unsigned long seg; 1575 unsigned long seg;
1576 size_t count; 1576 size_t count;
1577 loff_t *ppos = &iocb->ki_pos; 1577 loff_t *ppos = &iocb->ki_pos;
1578 1578
1579 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1579 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1580 if (retval) 1580 if (retval)
1581 return retval; 1581 return retval;
1582 1582
1583 for (seg = 0; seg < nr_segs; seg++) { 1583 for (seg = 0; seg < nr_segs; seg++) {
1584 read_descriptor_t desc; 1584 read_descriptor_t desc;
1585 1585
1586 desc.written = 0; 1586 desc.written = 0;
1587 desc.arg.buf = iov[seg].iov_base; 1587 desc.arg.buf = iov[seg].iov_base;
1588 desc.count = iov[seg].iov_len; 1588 desc.count = iov[seg].iov_len;
1589 if (desc.count == 0) 1589 if (desc.count == 0)
1590 continue; 1590 continue;
1591 desc.error = 0; 1591 desc.error = 0;
1592 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1592 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1593 retval += desc.written; 1593 retval += desc.written;
1594 if (desc.error) { 1594 if (desc.error) {
1595 retval = retval ?: desc.error; 1595 retval = retval ?: desc.error;
1596 break; 1596 break;
1597 } 1597 }
1598 if (desc.count > 0) 1598 if (desc.count > 0)
1599 break; 1599 break;
1600 } 1600 }
1601 return retval; 1601 return retval;
1602 } 1602 }
1603 1603
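Because shmem_file_aio_read() walks the iovec one segment at a time, a vectored read maps directly onto it: each non-empty segment becomes one do_shmem_file_read() pass. A minimal readv() caller against a hypothetical tmpfs file:

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/demo", O_RDONLY);
	if (fd < 0)
		return 1;

	char a[8], b[16];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};

	/* Each segment becomes one do_shmem_file_read() pass */
	ssize_t n = readv(fd, iov, 2);
	printf("read %zd bytes across 2 segments\n", n);

	close(fd);
	return 0;
}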
1604 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 1604 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1605 struct pipe_inode_info *pipe, size_t len, 1605 struct pipe_inode_info *pipe, size_t len,
1606 unsigned int flags) 1606 unsigned int flags)
1607 { 1607 {
1608 struct address_space *mapping = in->f_mapping; 1608 struct address_space *mapping = in->f_mapping;
1609 struct inode *inode = mapping->host; 1609 struct inode *inode = mapping->host;
1610 unsigned int loff, nr_pages, req_pages; 1610 unsigned int loff, nr_pages, req_pages;
1611 struct page *pages[PIPE_DEF_BUFFERS]; 1611 struct page *pages[PIPE_DEF_BUFFERS];
1612 struct partial_page partial[PIPE_DEF_BUFFERS]; 1612 struct partial_page partial[PIPE_DEF_BUFFERS];
1613 struct page *page; 1613 struct page *page;
1614 pgoff_t index, end_index; 1614 pgoff_t index, end_index;
1615 loff_t isize, left; 1615 loff_t isize, left;
1616 int error, page_nr; 1616 int error, page_nr;
1617 struct splice_pipe_desc spd = { 1617 struct splice_pipe_desc spd = {
1618 .pages = pages, 1618 .pages = pages,
1619 .partial = partial, 1619 .partial = partial,
1620 .nr_pages_max = PIPE_DEF_BUFFERS, 1620 .nr_pages_max = PIPE_DEF_BUFFERS,
1621 .flags = flags, 1621 .flags = flags,
1622 .ops = &page_cache_pipe_buf_ops, 1622 .ops = &page_cache_pipe_buf_ops,
1623 .spd_release = spd_release_page, 1623 .spd_release = spd_release_page,
1624 }; 1624 };
1625 1625
1626 isize = i_size_read(inode); 1626 isize = i_size_read(inode);
1627 if (unlikely(*ppos >= isize)) 1627 if (unlikely(*ppos >= isize))
1628 return 0; 1628 return 0;
1629 1629
1630 left = isize - *ppos; 1630 left = isize - *ppos;
1631 if (unlikely(left < len)) 1631 if (unlikely(left < len))
1632 len = left; 1632 len = left;
1633 1633
1634 if (splice_grow_spd(pipe, &spd)) 1634 if (splice_grow_spd(pipe, &spd))
1635 return -ENOMEM; 1635 return -ENOMEM;
1636 1636
1637 index = *ppos >> PAGE_CACHE_SHIFT; 1637 index = *ppos >> PAGE_CACHE_SHIFT;
1638 loff = *ppos & ~PAGE_CACHE_MASK; 1638 loff = *ppos & ~PAGE_CACHE_MASK;
1639 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1639 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1640 nr_pages = min(req_pages, pipe->buffers); 1640 nr_pages = min(req_pages, pipe->buffers);
1641 1641
1642 spd.nr_pages = find_get_pages_contig(mapping, index, 1642 spd.nr_pages = find_get_pages_contig(mapping, index,
1643 nr_pages, spd.pages); 1643 nr_pages, spd.pages);
1644 index += spd.nr_pages; 1644 index += spd.nr_pages;
1645 error = 0; 1645 error = 0;
1646 1646
1647 while (spd.nr_pages < nr_pages) { 1647 while (spd.nr_pages < nr_pages) {
1648 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); 1648 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1649 if (error) 1649 if (error)
1650 break; 1650 break;
1651 unlock_page(page); 1651 unlock_page(page);
1652 spd.pages[spd.nr_pages++] = page; 1652 spd.pages[spd.nr_pages++] = page;
1653 index++; 1653 index++;
1654 } 1654 }
1655 1655
1656 index = *ppos >> PAGE_CACHE_SHIFT; 1656 index = *ppos >> PAGE_CACHE_SHIFT;
1657 nr_pages = spd.nr_pages; 1657 nr_pages = spd.nr_pages;
1658 spd.nr_pages = 0; 1658 spd.nr_pages = 0;
1659 1659
1660 for (page_nr = 0; page_nr < nr_pages; page_nr++) { 1660 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1661 unsigned int this_len; 1661 unsigned int this_len;
1662 1662
1663 if (!len) 1663 if (!len)
1664 break; 1664 break;
1665 1665
1666 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 1666 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1667 page = spd.pages[page_nr]; 1667 page = spd.pages[page_nr];
1668 1668
1669 if (!PageUptodate(page) || page->mapping != mapping) { 1669 if (!PageUptodate(page) || page->mapping != mapping) {
1670 error = shmem_getpage(inode, index, &page, 1670 error = shmem_getpage(inode, index, &page,
1671 SGP_CACHE, NULL); 1671 SGP_CACHE, NULL);
1672 if (error) 1672 if (error)
1673 break; 1673 break;
1674 unlock_page(page); 1674 unlock_page(page);
1675 page_cache_release(spd.pages[page_nr]); 1675 page_cache_release(spd.pages[page_nr]);
1676 spd.pages[page_nr] = page; 1676 spd.pages[page_nr] = page;
1677 } 1677 }
1678 1678
1679 isize = i_size_read(inode); 1679 isize = i_size_read(inode);
1680 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 1680 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1681 if (unlikely(!isize || index > end_index)) 1681 if (unlikely(!isize || index > end_index))
1682 break; 1682 break;
1683 1683
1684 if (end_index == index) { 1684 if (end_index == index) {
1685 unsigned int plen; 1685 unsigned int plen;
1686 1686
1687 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 1687 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1688 if (plen <= loff) 1688 if (plen <= loff)
1689 break; 1689 break;
1690 1690
1691 this_len = min(this_len, plen - loff); 1691 this_len = min(this_len, plen - loff);
1692 len = this_len; 1692 len = this_len;
1693 } 1693 }
1694 1694
1695 spd.partial[page_nr].offset = loff; 1695 spd.partial[page_nr].offset = loff;
1696 spd.partial[page_nr].len = this_len; 1696 spd.partial[page_nr].len = this_len;
1697 len -= this_len; 1697 len -= this_len;
1698 loff = 0; 1698 loff = 0;
1699 spd.nr_pages++; 1699 spd.nr_pages++;
1700 index++; 1700 index++;
1701 } 1701 }
1702 1702
1703 while (page_nr < nr_pages) 1703 while (page_nr < nr_pages)
1704 page_cache_release(spd.pages[page_nr++]); 1704 page_cache_release(spd.pages[page_nr++]);
1705 1705
1706 if (spd.nr_pages) 1706 if (spd.nr_pages)
1707 error = splice_to_pipe(pipe, &spd); 1707 error = splice_to_pipe(pipe, &spd);
1708 1708
1709 splice_shrink_spd(&spd); 1709 splice_shrink_spd(&spd);
1710 1710
1711 if (error > 0) { 1711 if (error > 0) {
1712 *ppos += error; 1712 *ppos += error;
1713 file_accessed(in); 1713 file_accessed(in);
1714 } 1714 }
1715 return error; 1715 return error;
1716 } 1716 }
1717 1717
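shmem_file_splice_read() backs splice() when the source is a tmpfs file and the sink is a pipe, handing page-cache pages to pipe buffers without a copy through userspace. A minimal mover, file path hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/demo", O_RDONLY);
	int pipefd[2];
	if (fd < 0 || pipe(pipefd) < 0)
		return 1;

	/* Pages go from shmem's page cache straight into pipe buffers */
	ssize_t n = splice(fd, NULL, pipefd[1], NULL, 4096, 0);
	printf("spliced %zd bytes\n", n);

	close(pipefd[0]);
	close(pipefd[1]);
	close(fd);
	return 0;
}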
1718 /* 1718 /*
1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1720 */ 1720 */
1721 static pgoff_t shmem_seek_hole_data(struct address_space *mapping, 1721 static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1722 pgoff_t index, pgoff_t end, int origin) 1722 pgoff_t index, pgoff_t end, int whence)
1723 { 1723 {
1724 struct page *page; 1724 struct page *page;
1725 struct pagevec pvec; 1725 struct pagevec pvec;
1726 pgoff_t indices[PAGEVEC_SIZE]; 1726 pgoff_t indices[PAGEVEC_SIZE];
1727 bool done = false; 1727 bool done = false;
1728 int i; 1728 int i;
1729 1729
1730 pagevec_init(&pvec, 0); 1730 pagevec_init(&pvec, 0);
1731 pvec.nr = 1; /* start small: we may be there already */ 1731 pvec.nr = 1; /* start small: we may be there already */
1732 while (!done) { 1732 while (!done) {
1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1734 pvec.nr, pvec.pages, indices); 1734 pvec.nr, pvec.pages, indices);
1735 if (!pvec.nr) { 1735 if (!pvec.nr) {
1736 if (origin == SEEK_DATA) 1736 if (whence == SEEK_DATA)
1737 index = end; 1737 index = end;
1738 break; 1738 break;
1739 } 1739 }
1740 for (i = 0; i < pvec.nr; i++, index++) { 1740 for (i = 0; i < pvec.nr; i++, index++) {
1741 if (index < indices[i]) { 1741 if (index < indices[i]) {
1742 if (origin == SEEK_HOLE) { 1742 if (whence == SEEK_HOLE) {
1743 done = true; 1743 done = true;
1744 break; 1744 break;
1745 } 1745 }
1746 index = indices[i]; 1746 index = indices[i];
1747 } 1747 }
1748 page = pvec.pages[i]; 1748 page = pvec.pages[i];
1749 if (page && !radix_tree_exceptional_entry(page)) { 1749 if (page && !radix_tree_exceptional_entry(page)) {
1750 if (!PageUptodate(page)) 1750 if (!PageUptodate(page))
1751 page = NULL; 1751 page = NULL;
1752 } 1752 }
1753 if (index >= end || 1753 if (index >= end ||
1754 (page && origin == SEEK_DATA) || 1754 (page && whence == SEEK_DATA) ||
1755 (!page && origin == SEEK_HOLE)) { 1755 (!page && whence == SEEK_HOLE)) {
1756 done = true; 1756 done = true;
1757 break; 1757 break;
1758 } 1758 }
1759 } 1759 }
1760 shmem_deswap_pagevec(&pvec); 1760 shmem_deswap_pagevec(&pvec);
1761 pagevec_release(&pvec); 1761 pagevec_release(&pvec);
1762 pvec.nr = PAGEVEC_SIZE; 1762 pvec.nr = PAGEVEC_SIZE;
1763 cond_resched(); 1763 cond_resched();
1764 } 1764 }
1765 return index; 1765 return index;
1766 } 1766 }
1767 1767
1768 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) 1768 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1769 { 1769 {
1770 struct address_space *mapping = file->f_mapping; 1770 struct address_space *mapping = file->f_mapping;
1771 struct inode *inode = mapping->host; 1771 struct inode *inode = mapping->host;
1772 pgoff_t start, end; 1772 pgoff_t start, end;
1773 loff_t new_offset; 1773 loff_t new_offset;
1774 1774
1775 if (origin != SEEK_DATA && origin != SEEK_HOLE) 1775 if (whence != SEEK_DATA && whence != SEEK_HOLE)
1776 return generic_file_llseek_size(file, offset, origin, 1776 return generic_file_llseek_size(file, offset, whence,
1777 MAX_LFS_FILESIZE, i_size_read(inode)); 1777 MAX_LFS_FILESIZE, i_size_read(inode));
1778 mutex_lock(&inode->i_mutex); 1778 mutex_lock(&inode->i_mutex);
1779 /* We're holding i_mutex so we can access i_size directly */ 1779 /* We're holding i_mutex so we can access i_size directly */
1780 1780
1781 if (offset < 0) 1781 if (offset < 0)
1782 offset = -EINVAL; 1782 offset = -EINVAL;
1783 else if (offset >= inode->i_size) 1783 else if (offset >= inode->i_size)
1784 offset = -ENXIO; 1784 offset = -ENXIO;
1785 else { 1785 else {
1786 start = offset >> PAGE_CACHE_SHIFT; 1786 start = offset >> PAGE_CACHE_SHIFT;
1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1788 new_offset = shmem_seek_hole_data(mapping, start, end, origin); 1788 new_offset = shmem_seek_hole_data(mapping, start, end, whence);
1789 new_offset <<= PAGE_CACHE_SHIFT; 1789 new_offset <<= PAGE_CACHE_SHIFT;
1790 if (new_offset > offset) { 1790 if (new_offset > offset) {
1791 if (new_offset < inode->i_size) 1791 if (new_offset < inode->i_size)
1792 offset = new_offset; 1792 offset = new_offset;
1793 else if (origin == SEEK_DATA) 1793 else if (whence == SEEK_DATA)
1794 offset = -ENXIO; 1794 offset = -ENXIO;
1795 else 1795 else
1796 offset = inode->i_size; 1796 offset = inode->i_size;
1797 } 1797 }
1798 } 1798 }
1799 1799
1800 if (offset >= 0 && offset != file->f_pos) { 1800 if (offset >= 0 && offset != file->f_pos) {
1801 file->f_pos = offset; 1801 file->f_pos = offset;
1802 file->f_version = 0; 1802 file->f_version = 0;
1803 } 1803 }
1804 mutex_unlock(&inode->i_mutex); 1804 mutex_unlock(&inode->i_mutex);
1805 return offset; 1805 return offset;
1806 } 1806 }
1807 1807
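A minimal sketch of the llseek behaviour above, assuming a tmpfs mount at /dev/shm: SEEK_DATA/SEEK_HOLE results land on page boundaries because shmem_seek_hole_data() scans whole radix-tree pages.

#define _GNU_SOURCE            /* SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/seek-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0) { perror("open"); return 1; }

	/* Sparse file: 1MiB long, one byte of data at 64KiB. */
	if (ftruncate(fd, 1 << 20) < 0) { perror("ftruncate"); return 1; }
	if (pwrite(fd, "x", 1, 64 * 1024) != 1) { perror("pwrite"); return 1; }

	off_t data = lseek(fd, 0, SEEK_DATA);    /* -> 64KiB, page-aligned */
	off_t hole = lseek(fd, data, SEEK_HOLE); /* -> end of that data page */
	printf("data at %lld, hole at %lld\n", (long long)data, (long long)hole);

	close(fd);
	return 0;
}

Seeking SEEK_DATA at or past i_size fails with ENXIO, matching the error path in shmem_file_llseek().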
1808 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1808 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1809 loff_t len) 1809 loff_t len)
1810 { 1810 {
1811 struct inode *inode = file->f_path.dentry->d_inode; 1811 struct inode *inode = file->f_path.dentry->d_inode;
1812 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1812 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1813 struct shmem_falloc shmem_falloc; 1813 struct shmem_falloc shmem_falloc;
1814 pgoff_t start, index, end; 1814 pgoff_t start, index, end;
1815 int error; 1815 int error;
1816 1816
1817 mutex_lock(&inode->i_mutex); 1817 mutex_lock(&inode->i_mutex);
1818 1818
1819 if (mode & FALLOC_FL_PUNCH_HOLE) { 1819 if (mode & FALLOC_FL_PUNCH_HOLE) {
1820 struct address_space *mapping = file->f_mapping; 1820 struct address_space *mapping = file->f_mapping;
1821 loff_t unmap_start = round_up(offset, PAGE_SIZE); 1821 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1822 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 1822 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1823 1823
1824 if ((u64)unmap_end > (u64)unmap_start) 1824 if ((u64)unmap_end > (u64)unmap_start)
1825 unmap_mapping_range(mapping, unmap_start, 1825 unmap_mapping_range(mapping, unmap_start,
1826 1 + unmap_end - unmap_start, 0); 1826 1 + unmap_end - unmap_start, 0);
1827 shmem_truncate_range(inode, offset, offset + len - 1); 1827 shmem_truncate_range(inode, offset, offset + len - 1);
1828 /* No need to unmap again: hole-punching leaves COWed pages */ 1828 /* No need to unmap again: hole-punching leaves COWed pages */
1829 error = 0; 1829 error = 0;
1830 goto out; 1830 goto out;
1831 } 1831 }
1832 1832
1833 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 1833 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1834 error = inode_newsize_ok(inode, offset + len); 1834 error = inode_newsize_ok(inode, offset + len);
1835 if (error) 1835 if (error)
1836 goto out; 1836 goto out;
1837 1837
1838 start = offset >> PAGE_CACHE_SHIFT; 1838 start = offset >> PAGE_CACHE_SHIFT;
1839 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1839 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1840 /* Try to avoid a swapstorm if len is impossible to satisfy */ 1840 /* Try to avoid a swapstorm if len is impossible to satisfy */
1841 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 1841 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1842 error = -ENOSPC; 1842 error = -ENOSPC;
1843 goto out; 1843 goto out;
1844 } 1844 }
1845 1845
1846 shmem_falloc.start = start; 1846 shmem_falloc.start = start;
1847 shmem_falloc.next = start; 1847 shmem_falloc.next = start;
1848 shmem_falloc.nr_falloced = 0; 1848 shmem_falloc.nr_falloced = 0;
1849 shmem_falloc.nr_unswapped = 0; 1849 shmem_falloc.nr_unswapped = 0;
1850 spin_lock(&inode->i_lock); 1850 spin_lock(&inode->i_lock);
1851 inode->i_private = &shmem_falloc; 1851 inode->i_private = &shmem_falloc;
1852 spin_unlock(&inode->i_lock); 1852 spin_unlock(&inode->i_lock);
1853 1853
1854 for (index = start; index < end; index++) { 1854 for (index = start; index < end; index++) {
1855 struct page *page; 1855 struct page *page;
1856 1856
1857 /* 1857 /*
1858 * Good, the fallocate(2) manpage permits EINTR: we may have 1858 * Good, the fallocate(2) manpage permits EINTR: we may have
1859 * been interrupted because we are using up too much memory. 1859 * been interrupted because we are using up too much memory.
1860 */ 1860 */
1861 if (signal_pending(current)) 1861 if (signal_pending(current))
1862 error = -EINTR; 1862 error = -EINTR;
1863 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 1863 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
1864 error = -ENOMEM; 1864 error = -ENOMEM;
1865 else 1865 else
1866 error = shmem_getpage(inode, index, &page, SGP_FALLOC, 1866 error = shmem_getpage(inode, index, &page, SGP_FALLOC,
1867 NULL); 1867 NULL);
1868 if (error) { 1868 if (error) {
1869 /* Remove the !PageUptodate pages we added */ 1869 /* Remove the !PageUptodate pages we added */
1870 shmem_undo_range(inode, 1870 shmem_undo_range(inode,
1871 (loff_t)start << PAGE_CACHE_SHIFT, 1871 (loff_t)start << PAGE_CACHE_SHIFT,
1872 (loff_t)index << PAGE_CACHE_SHIFT, true); 1872 (loff_t)index << PAGE_CACHE_SHIFT, true);
1873 goto undone; 1873 goto undone;
1874 } 1874 }
1875 1875
1876 /* 1876 /*
1877 * Inform shmem_writepage() how far we have reached. 1877 * Inform shmem_writepage() how far we have reached.
1878 * No need for lock or barrier: we have the page lock. 1878 * No need for lock or barrier: we have the page lock.
1879 */ 1879 */
1880 shmem_falloc.next++; 1880 shmem_falloc.next++;
1881 if (!PageUptodate(page)) 1881 if (!PageUptodate(page))
1882 shmem_falloc.nr_falloced++; 1882 shmem_falloc.nr_falloced++;
1883 1883
1884 /* 1884 /*
1885 * If !PageUptodate, leave it that way so that freeable pages 1885 * If !PageUptodate, leave it that way so that freeable pages
1886 * can be recognized if we need to rollback on error later. 1886 * can be recognized if we need to rollback on error later.
1887 * But set_page_dirty so that memory pressure will swap rather 1887 * But set_page_dirty so that memory pressure will swap rather
1888 * than free the pages we are allocating (and SGP_CACHE pages 1888 * than free the pages we are allocating (and SGP_CACHE pages
1889 * might still be clean: we now need to mark those dirty too). 1889 * might still be clean: we now need to mark those dirty too).
1890 */ 1890 */
1891 set_page_dirty(page); 1891 set_page_dirty(page);
1892 unlock_page(page); 1892 unlock_page(page);
1893 page_cache_release(page); 1893 page_cache_release(page);
1894 cond_resched(); 1894 cond_resched();
1895 } 1895 }
1896 1896
1897 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 1897 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1898 i_size_write(inode, offset + len); 1898 i_size_write(inode, offset + len);
1899 inode->i_ctime = CURRENT_TIME; 1899 inode->i_ctime = CURRENT_TIME;
1900 undone: 1900 undone:
1901 spin_lock(&inode->i_lock); 1901 spin_lock(&inode->i_lock);
1902 inode->i_private = NULL; 1902 inode->i_private = NULL;
1903 spin_unlock(&inode->i_lock); 1903 spin_unlock(&inode->i_lock);
1904 out: 1904 out:
1905 mutex_unlock(&inode->i_mutex); 1905 mutex_unlock(&inode->i_mutex);
1906 return error; 1906 return error;
1907 } 1907 }
1908 1908
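How the two fallocate branches above look from userspace; a sketch with the path and sizes invented for illustration. Note that the VFS requires FALLOC_FL_KEEP_SIZE alongside FALLOC_FL_PUNCH_HOLE before this handler ever runs.

#define _GNU_SOURCE            /* fallocate() */
#include <fcntl.h>
#include <linux/falloc.h>      /* FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0) { perror("open"); return 1; }

	/* Preallocation: walks the SGP_FALLOC loop above page by page. */
	if (fallocate(fd, 0, 0, 1 << 20) < 0)
		perror("fallocate");

	/* Hole punch: takes the FALLOC_FL_PUNCH_HOLE branch, unmapping
	 * and then truncating the 512KiB range starting at 256KiB. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      256 * 1024, 512 * 1024) < 0)
		perror("punch hole");

	close(fd);
	return 0;
}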
1909 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1909 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1910 { 1910 {
1911 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1911 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1912 1912
1913 buf->f_type = TMPFS_MAGIC; 1913 buf->f_type = TMPFS_MAGIC;
1914 buf->f_bsize = PAGE_CACHE_SIZE; 1914 buf->f_bsize = PAGE_CACHE_SIZE;
1915 buf->f_namelen = NAME_MAX; 1915 buf->f_namelen = NAME_MAX;
1916 if (sbinfo->max_blocks) { 1916 if (sbinfo->max_blocks) {
1917 buf->f_blocks = sbinfo->max_blocks; 1917 buf->f_blocks = sbinfo->max_blocks;
1918 buf->f_bavail = 1918 buf->f_bavail =
1919 buf->f_bfree = sbinfo->max_blocks - 1919 buf->f_bfree = sbinfo->max_blocks -
1920 percpu_counter_sum(&sbinfo->used_blocks); 1920 percpu_counter_sum(&sbinfo->used_blocks);
1921 } 1921 }
1922 if (sbinfo->max_inodes) { 1922 if (sbinfo->max_inodes) {
1923 buf->f_files = sbinfo->max_inodes; 1923 buf->f_files = sbinfo->max_inodes;
1924 buf->f_ffree = sbinfo->free_inodes; 1924 buf->f_ffree = sbinfo->free_inodes;
1925 } 1925 }
1926 /* else leave those fields 0 like simple_statfs */ 1926 /* else leave those fields 0 like simple_statfs */
1927 return 0; 1927 return 0;
1928 } 1928 }
1929 1929
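The fields shmem_statfs() fills in surface through statvfs(3); a small sketch, assuming /dev/shm is a size-limited tmpfs (unlimited mounts leave the block and inode fields 0, like simple_statfs).

#include <stdio.h>
#include <sys/statvfs.h>

int main(void)
{
	struct statvfs sv;

	if (statvfs("/dev/shm", &sv) < 0) {
		perror("statvfs");
		return 1;
	}
	/* f_blocks mirrors max_blocks; f_bfree subtracts the summed
	 * used_blocks percpu counter, as in shmem_statfs() above. */
	printf("bsize=%lu blocks=%llu bfree=%llu files=%llu ffree=%llu\n",
	       sv.f_bsize,
	       (unsigned long long)sv.f_blocks,
	       (unsigned long long)sv.f_bfree,
	       (unsigned long long)sv.f_files,
	       (unsigned long long)sv.f_ffree);
	return 0;
}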
1930 /* 1930 /*
1931 * File creation. Allocate an inode, and we're done.. 1931 * File creation. Allocate an inode, and we're done..
1932 */ 1932 */
1933 static int 1933 static int
1934 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 1934 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1935 { 1935 {
1936 struct inode *inode; 1936 struct inode *inode;
1937 int error = -ENOSPC; 1937 int error = -ENOSPC;
1938 1938
1939 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1939 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1940 if (inode) { 1940 if (inode) {
1941 error = security_inode_init_security(inode, dir, 1941 error = security_inode_init_security(inode, dir,
1942 &dentry->d_name, 1942 &dentry->d_name,
1943 shmem_initxattrs, NULL); 1943 shmem_initxattrs, NULL);
1944 if (error) { 1944 if (error) {
1945 if (error != -EOPNOTSUPP) { 1945 if (error != -EOPNOTSUPP) {
1946 iput(inode); 1946 iput(inode);
1947 return error; 1947 return error;
1948 } 1948 }
1949 } 1949 }
1950 #ifdef CONFIG_TMPFS_POSIX_ACL 1950 #ifdef CONFIG_TMPFS_POSIX_ACL
1951 error = generic_acl_init(inode, dir); 1951 error = generic_acl_init(inode, dir);
1952 if (error) { 1952 if (error) {
1953 iput(inode); 1953 iput(inode);
1954 return error; 1954 return error;
1955 } 1955 }
1956 #else 1956 #else
1957 error = 0; 1957 error = 0;
1958 #endif 1958 #endif
1959 dir->i_size += BOGO_DIRENT_SIZE; 1959 dir->i_size += BOGO_DIRENT_SIZE;
1960 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1960 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1961 d_instantiate(dentry, inode); 1961 d_instantiate(dentry, inode);
1962 dget(dentry); /* Extra count - pin the dentry in core */ 1962 dget(dentry); /* Extra count - pin the dentry in core */
1963 } 1963 }
1964 return error; 1964 return error;
1965 } 1965 }
1966 1966
1967 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 1967 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1968 { 1968 {
1969 int error; 1969 int error;
1970 1970
1971 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 1971 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1972 return error; 1972 return error;
1973 inc_nlink(dir); 1973 inc_nlink(dir);
1974 return 0; 1974 return 0;
1975 } 1975 }
1976 1976
1977 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 1977 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1978 bool excl) 1978 bool excl)
1979 { 1979 {
1980 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 1980 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1981 } 1981 }
1982 1982
1983 /* 1983 /*
1984 * Link a file.. 1984 * Link a file..
1985 */ 1985 */
1986 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 1986 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1987 { 1987 {
1988 struct inode *inode = old_dentry->d_inode; 1988 struct inode *inode = old_dentry->d_inode;
1989 int ret; 1989 int ret;
1990 1990
1991 /* 1991 /*
1992 * No ordinary (disk based) filesystem counts links as inodes; 1992 * No ordinary (disk based) filesystem counts links as inodes;
1993 * but each new link needs a new dentry, pinning lowmem, and 1993 * but each new link needs a new dentry, pinning lowmem, and
1994 * tmpfs dentries cannot be pruned until they are unlinked. 1994 * tmpfs dentries cannot be pruned until they are unlinked.
1995 */ 1995 */
1996 ret = shmem_reserve_inode(inode->i_sb); 1996 ret = shmem_reserve_inode(inode->i_sb);
1997 if (ret) 1997 if (ret)
1998 goto out; 1998 goto out;
1999 1999
2000 dir->i_size += BOGO_DIRENT_SIZE; 2000 dir->i_size += BOGO_DIRENT_SIZE;
2001 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2001 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2002 inc_nlink(inode); 2002 inc_nlink(inode);
2003 ihold(inode); /* New dentry reference */ 2003 ihold(inode); /* New dentry reference */
2004 dget(dentry); /* Extra pinning count for the created dentry */ 2004 dget(dentry); /* Extra pinning count for the created dentry */
2005 d_instantiate(dentry, inode); 2005 d_instantiate(dentry, inode);
2006 out: 2006 out:
2007 return ret; 2007 return ret;
2008 } 2008 }
2009 2009
2010 static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2010 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2011 { 2011 {
2012 struct inode *inode = dentry->d_inode; 2012 struct inode *inode = dentry->d_inode;
2013 2013
2014 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2014 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2015 shmem_free_inode(inode->i_sb); 2015 shmem_free_inode(inode->i_sb);
2016 2016
2017 dir->i_size -= BOGO_DIRENT_SIZE; 2017 dir->i_size -= BOGO_DIRENT_SIZE;
2018 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2018 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2019 drop_nlink(inode); 2019 drop_nlink(inode);
2020 dput(dentry); /* Undo the count from "create" - this does all the work */ 2020 dput(dentry); /* Undo the count from "create" - this does all the work */
2021 return 0; 2021 return 0;
2022 } 2022 }
2023 2023
2024 static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 2024 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2025 { 2025 {
2026 if (!simple_empty(dentry)) 2026 if (!simple_empty(dentry))
2027 return -ENOTEMPTY; 2027 return -ENOTEMPTY;
2028 2028
2029 drop_nlink(dentry->d_inode); 2029 drop_nlink(dentry->d_inode);
2030 drop_nlink(dir); 2030 drop_nlink(dir);
2031 return shmem_unlink(dir, dentry); 2031 return shmem_unlink(dir, dentry);
2032 } 2032 }
2033 2033
2034 /* 2034 /*
2035 * The VFS layer already does all the dentry stuff for rename, 2035 * The VFS layer already does all the dentry stuff for rename,
2036 * we just have to decrement the usage count for the target if 2036 * we just have to decrement the usage count for the target if
2037 * it exists so that the VFS layer correctly frees it when it 2037 * it exists so that the VFS layer correctly frees it when it
2038 * gets overwritten. 2038 * gets overwritten.
2039 */ 2039 */
2040 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2040 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2041 { 2041 {
2042 struct inode *inode = old_dentry->d_inode; 2042 struct inode *inode = old_dentry->d_inode;
2043 int they_are_dirs = S_ISDIR(inode->i_mode); 2043 int they_are_dirs = S_ISDIR(inode->i_mode);
2044 2044
2045 if (!simple_empty(new_dentry)) 2045 if (!simple_empty(new_dentry))
2046 return -ENOTEMPTY; 2046 return -ENOTEMPTY;
2047 2047
2048 if (new_dentry->d_inode) { 2048 if (new_dentry->d_inode) {
2049 (void) shmem_unlink(new_dir, new_dentry); 2049 (void) shmem_unlink(new_dir, new_dentry);
2050 if (they_are_dirs) 2050 if (they_are_dirs)
2051 drop_nlink(old_dir); 2051 drop_nlink(old_dir);
2052 } else if (they_are_dirs) { 2052 } else if (they_are_dirs) {
2053 drop_nlink(old_dir); 2053 drop_nlink(old_dir);
2054 inc_nlink(new_dir); 2054 inc_nlink(new_dir);
2055 } 2055 }
2056 2056
2057 old_dir->i_size -= BOGO_DIRENT_SIZE; 2057 old_dir->i_size -= BOGO_DIRENT_SIZE;
2058 new_dir->i_size += BOGO_DIRENT_SIZE; 2058 new_dir->i_size += BOGO_DIRENT_SIZE;
2059 old_dir->i_ctime = old_dir->i_mtime = 2059 old_dir->i_ctime = old_dir->i_mtime =
2060 new_dir->i_ctime = new_dir->i_mtime = 2060 new_dir->i_ctime = new_dir->i_mtime =
2061 inode->i_ctime = CURRENT_TIME; 2061 inode->i_ctime = CURRENT_TIME;
2062 return 0; 2062 return 0;
2063 } 2063 }
2064 2064
2065 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 2065 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
2066 { 2066 {
2067 int error; 2067 int error;
2068 int len; 2068 int len;
2069 struct inode *inode; 2069 struct inode *inode;
2070 struct page *page; 2070 struct page *page;
2071 char *kaddr; 2071 char *kaddr;
2072 struct shmem_inode_info *info; 2072 struct shmem_inode_info *info;
2073 2073
2074 len = strlen(symname) + 1; 2074 len = strlen(symname) + 1;
2075 if (len > PAGE_CACHE_SIZE) 2075 if (len > PAGE_CACHE_SIZE)
2076 return -ENAMETOOLONG; 2076 return -ENAMETOOLONG;
2077 2077
2078 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 2078 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
2079 if (!inode) 2079 if (!inode)
2080 return -ENOSPC; 2080 return -ENOSPC;
2081 2081
2082 error = security_inode_init_security(inode, dir, &dentry->d_name, 2082 error = security_inode_init_security(inode, dir, &dentry->d_name,
2083 shmem_initxattrs, NULL); 2083 shmem_initxattrs, NULL);
2084 if (error) { 2084 if (error) {
2085 if (error != -EOPNOTSUPP) { 2085 if (error != -EOPNOTSUPP) {
2086 iput(inode); 2086 iput(inode);
2087 return error; 2087 return error;
2088 } 2088 }
2089 error = 0; 2089 error = 0;
2090 } 2090 }
2091 2091
2092 info = SHMEM_I(inode); 2092 info = SHMEM_I(inode);
2093 inode->i_size = len-1; 2093 inode->i_size = len-1;
2094 if (len <= SHORT_SYMLINK_LEN) { 2094 if (len <= SHORT_SYMLINK_LEN) {
2095 info->symlink = kmemdup(symname, len, GFP_KERNEL); 2095 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2096 if (!info->symlink) { 2096 if (!info->symlink) {
2097 iput(inode); 2097 iput(inode);
2098 return -ENOMEM; 2098 return -ENOMEM;
2099 } 2099 }
2100 inode->i_op = &shmem_short_symlink_operations; 2100 inode->i_op = &shmem_short_symlink_operations;
2101 } else { 2101 } else {
2102 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2102 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2103 if (error) { 2103 if (error) {
2104 iput(inode); 2104 iput(inode);
2105 return error; 2105 return error;
2106 } 2106 }
2107 inode->i_mapping->a_ops = &shmem_aops; 2107 inode->i_mapping->a_ops = &shmem_aops;
2108 inode->i_op = &shmem_symlink_inode_operations; 2108 inode->i_op = &shmem_symlink_inode_operations;
2109 kaddr = kmap_atomic(page); 2109 kaddr = kmap_atomic(page);
2110 memcpy(kaddr, symname, len); 2110 memcpy(kaddr, symname, len);
2111 kunmap_atomic(kaddr); 2111 kunmap_atomic(kaddr);
2112 SetPageUptodate(page); 2112 SetPageUptodate(page);
2113 set_page_dirty(page); 2113 set_page_dirty(page);
2114 unlock_page(page); 2114 unlock_page(page);
2115 page_cache_release(page); 2115 page_cache_release(page);
2116 } 2116 }
2117 dir->i_size += BOGO_DIRENT_SIZE; 2117 dir->i_size += BOGO_DIRENT_SIZE;
2118 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2118 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2119 d_instantiate(dentry, inode); 2119 d_instantiate(dentry, inode);
2120 dget(dentry); 2120 dget(dentry);
2121 return 0; 2121 return 0;
2122 } 2122 }
2123 2123
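From userspace the two symlink representations are indistinguishable; a sketch (paths assumed) whose target is well under SHORT_SYMLINK_LEN, so it lands in the kmemdup'ed short-symlink case above rather than the page-backed one:

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unlink("/dev/shm/link-demo");   /* ignore errors: may not exist */
	if (symlink("target-file", "/dev/shm/link-demo") < 0) {
		perror("symlink");
		return 1;
	}
	char buf[PATH_MAX];
	ssize_t n = readlink("/dev/shm/link-demo", buf, sizeof(buf) - 1);
	if (n < 0) { perror("readlink"); return 1; }
	buf[n] = '\0';
	printf("link-demo -> %s\n", buf);
	return 0;
}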
2124 static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) 2124 static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2125 { 2125 {
2126 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); 2126 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2127 return NULL; 2127 return NULL;
2128 } 2128 }
2129 2129
2130 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 2130 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2131 { 2131 {
2132 struct page *page = NULL; 2132 struct page *page = NULL;
2133 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 2133 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2134 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); 2134 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2135 if (page) 2135 if (page)
2136 unlock_page(page); 2136 unlock_page(page);
2137 return page; 2137 return page;
2138 } 2138 }
2139 2139
2140 static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 2140 static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2141 { 2141 {
2142 if (!IS_ERR(nd_get_link(nd))) { 2142 if (!IS_ERR(nd_get_link(nd))) {
2143 struct page *page = cookie; 2143 struct page *page = cookie;
2144 kunmap(page); 2144 kunmap(page);
2145 mark_page_accessed(page); 2145 mark_page_accessed(page);
2146 page_cache_release(page); 2146 page_cache_release(page);
2147 } 2147 }
2148 } 2148 }
2149 2149
2150 #ifdef CONFIG_TMPFS_XATTR 2150 #ifdef CONFIG_TMPFS_XATTR
2151 /* 2151 /*
2152 * Superblocks without xattr inode operations may get some security.* xattr 2152 * Superblocks without xattr inode operations may get some security.* xattr
2153 * support from the LSM "for free". As soon as we have any other xattrs 2153 * support from the LSM "for free". As soon as we have any other xattrs
2154 * like ACLs, we also need to implement the security.* handlers at 2154 * like ACLs, we also need to implement the security.* handlers at
2155 * filesystem level, though. 2155 * filesystem level, though.
2156 */ 2156 */
2157 2157
2158 /* 2158 /*
2159 * Callback for security_inode_init_security() for acquiring xattrs. 2159 * Callback for security_inode_init_security() for acquiring xattrs.
2160 */ 2160 */
2161 static int shmem_initxattrs(struct inode *inode, 2161 static int shmem_initxattrs(struct inode *inode,
2162 const struct xattr *xattr_array, 2162 const struct xattr *xattr_array,
2163 void *fs_info) 2163 void *fs_info)
2164 { 2164 {
2165 struct shmem_inode_info *info = SHMEM_I(inode); 2165 struct shmem_inode_info *info = SHMEM_I(inode);
2166 const struct xattr *xattr; 2166 const struct xattr *xattr;
2167 struct simple_xattr *new_xattr; 2167 struct simple_xattr *new_xattr;
2168 size_t len; 2168 size_t len;
2169 2169
2170 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 2170 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
2171 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 2171 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
2172 if (!new_xattr) 2172 if (!new_xattr)
2173 return -ENOMEM; 2173 return -ENOMEM;
2174 2174
2175 len = strlen(xattr->name) + 1; 2175 len = strlen(xattr->name) + 1;
2176 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 2176 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
2177 GFP_KERNEL); 2177 GFP_KERNEL);
2178 if (!new_xattr->name) { 2178 if (!new_xattr->name) {
2179 kfree(new_xattr); 2179 kfree(new_xattr);
2180 return -ENOMEM; 2180 return -ENOMEM;
2181 } 2181 }
2182 2182
2183 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 2183 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
2184 XATTR_SECURITY_PREFIX_LEN); 2184 XATTR_SECURITY_PREFIX_LEN);
2185 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 2185 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
2186 xattr->name, len); 2186 xattr->name, len);
2187 2187
2188 simple_xattr_list_add(&info->xattrs, new_xattr); 2188 simple_xattr_list_add(&info->xattrs, new_xattr);
2189 } 2189 }
2190 2190
2191 return 0; 2191 return 0;
2192 } 2192 }
2193 2193
2194 static const struct xattr_handler *shmem_xattr_handlers[] = { 2194 static const struct xattr_handler *shmem_xattr_handlers[] = {
2195 #ifdef CONFIG_TMPFS_POSIX_ACL 2195 #ifdef CONFIG_TMPFS_POSIX_ACL
2196 &generic_acl_access_handler, 2196 &generic_acl_access_handler,
2197 &generic_acl_default_handler, 2197 &generic_acl_default_handler,
2198 #endif 2198 #endif
2199 NULL 2199 NULL
2200 }; 2200 };
2201 2201
2202 static int shmem_xattr_validate(const char *name) 2202 static int shmem_xattr_validate(const char *name)
2203 { 2203 {
2204 struct { const char *prefix; size_t len; } arr[] = { 2204 struct { const char *prefix; size_t len; } arr[] = {
2205 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, 2205 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2206 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } 2206 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2207 }; 2207 };
2208 int i; 2208 int i;
2209 2209
2210 for (i = 0; i < ARRAY_SIZE(arr); i++) { 2210 for (i = 0; i < ARRAY_SIZE(arr); i++) {
2211 size_t preflen = arr[i].len; 2211 size_t preflen = arr[i].len;
2212 if (strncmp(name, arr[i].prefix, preflen) == 0) { 2212 if (strncmp(name, arr[i].prefix, preflen) == 0) {
2213 if (!name[preflen]) 2213 if (!name[preflen])
2214 return -EINVAL; 2214 return -EINVAL;
2215 return 0; 2215 return 0;
2216 } 2216 }
2217 } 2217 }
2218 return -EOPNOTSUPP; 2218 return -EOPNOTSUPP;
2219 } 2219 }
2220 2220
2221 static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2221 static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2222 void *buffer, size_t size) 2222 void *buffer, size_t size)
2223 { 2223 {
2224 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2224 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2225 int err; 2225 int err;
2226 2226
2227 /* 2227 /*
2228 * If this is a request for a synthetic attribute in the system.* 2228 * If this is a request for a synthetic attribute in the system.*
2229 * namespace use the generic infrastructure to resolve a handler 2229 * namespace use the generic infrastructure to resolve a handler
2230 * for it via sb->s_xattr. 2230 * for it via sb->s_xattr.
2231 */ 2231 */
2232 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2232 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2233 return generic_getxattr(dentry, name, buffer, size); 2233 return generic_getxattr(dentry, name, buffer, size);
2234 2234
2235 err = shmem_xattr_validate(name); 2235 err = shmem_xattr_validate(name);
2236 if (err) 2236 if (err)
2237 return err; 2237 return err;
2238 2238
2239 return simple_xattr_get(&info->xattrs, name, buffer, size); 2239 return simple_xattr_get(&info->xattrs, name, buffer, size);
2240 } 2240 }
2241 2241
2242 static int shmem_setxattr(struct dentry *dentry, const char *name, 2242 static int shmem_setxattr(struct dentry *dentry, const char *name,
2243 const void *value, size_t size, int flags) 2243 const void *value, size_t size, int flags)
2244 { 2244 {
2245 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2245 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2246 int err; 2246 int err;
2247 2247
2248 /* 2248 /*
2249 * If this is a request for a synthetic attribute in the system.* 2249 * If this is a request for a synthetic attribute in the system.*
2250 * namespace use the generic infrastructure to resolve a handler 2250 * namespace use the generic infrastructure to resolve a handler
2251 * for it via sb->s_xattr. 2251 * for it via sb->s_xattr.
2252 */ 2252 */
2253 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2253 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2254 return generic_setxattr(dentry, name, value, size, flags); 2254 return generic_setxattr(dentry, name, value, size, flags);
2255 2255
2256 err = shmem_xattr_validate(name); 2256 err = shmem_xattr_validate(name);
2257 if (err) 2257 if (err)
2258 return err; 2258 return err;
2259 2259
2260 return simple_xattr_set(&info->xattrs, name, value, size, flags); 2260 return simple_xattr_set(&info->xattrs, name, value, size, flags);
2261 } 2261 }
2262 2262
2263 static int shmem_removexattr(struct dentry *dentry, const char *name) 2263 static int shmem_removexattr(struct dentry *dentry, const char *name)
2264 { 2264 {
2265 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2265 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2266 int err; 2266 int err;
2267 2267
2268 /* 2268 /*
2269 * If this is a request for a synthetic attribute in the system.* 2269 * If this is a request for a synthetic attribute in the system.*
2270 * namespace use the generic infrastructure to resolve a handler 2270 * namespace use the generic infrastructure to resolve a handler
2271 * for it via sb->s_xattr. 2271 * for it via sb->s_xattr.
2272 */ 2272 */
2273 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2273 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2274 return generic_removexattr(dentry, name); 2274 return generic_removexattr(dentry, name);
2275 2275
2276 err = shmem_xattr_validate(name); 2276 err = shmem_xattr_validate(name);
2277 if (err) 2277 if (err)
2278 return err; 2278 return err;
2279 2279
2280 return simple_xattr_remove(&info->xattrs, name); 2280 return simple_xattr_remove(&info->xattrs, name);
2281 } 2281 }
2282 2282
2283 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2283 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2284 { 2284 {
2285 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2285 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2286 return simple_xattr_list(&info->xattrs, buffer, size); 2286 return simple_xattr_list(&info->xattrs, buffer, size);
2287 } 2287 }
2288 #endif /* CONFIG_TMPFS_XATTR */ 2288 #endif /* CONFIG_TMPFS_XATTR */
2289 2289
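A sketch of the prefix routing enforced by shmem_xattr_validate(): user.* has no handler here and is refused, while trusted.* validates but needs CAP_SYS_ADMIN. The path is an assumption; the errno values are what the code above suggests, not guarantees.

#include <fcntl.h>
#include <stdio.h>
#include <sys/xattr.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/dev/shm/xattr-demo";
	int fd = open(path, O_RDWR | O_CREAT, 0600);
	if (fd < 0) { perror("open"); return 1; }
	close(fd);

	/* Falls through shmem_xattr_validate() to -EOPNOTSUPP. */
	if (setxattr(path, "user.demo", "1", 1, 0) < 0)
		perror("user.demo");

	/* Matches XATTR_TRUSTED_PREFIX; stored via simple_xattr_set(). */
	if (setxattr(path, "trusted.demo", "1", 1, 0) < 0)
		perror("trusted.demo");      /* EPERM without CAP_SYS_ADMIN */

	char buf[64];
	ssize_t n = getxattr(path, "trusted.demo", buf, sizeof(buf));
	if (n >= 0)
		printf("trusted.demo = %.*s\n", (int)n, buf);
	return 0;
}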
2290 static const struct inode_operations shmem_short_symlink_operations = { 2290 static const struct inode_operations shmem_short_symlink_operations = {
2291 .readlink = generic_readlink, 2291 .readlink = generic_readlink,
2292 .follow_link = shmem_follow_short_symlink, 2292 .follow_link = shmem_follow_short_symlink,
2293 #ifdef CONFIG_TMPFS_XATTR 2293 #ifdef CONFIG_TMPFS_XATTR
2294 .setxattr = shmem_setxattr, 2294 .setxattr = shmem_setxattr,
2295 .getxattr = shmem_getxattr, 2295 .getxattr = shmem_getxattr,
2296 .listxattr = shmem_listxattr, 2296 .listxattr = shmem_listxattr,
2297 .removexattr = shmem_removexattr, 2297 .removexattr = shmem_removexattr,
2298 #endif 2298 #endif
2299 }; 2299 };
2300 2300
2301 static const struct inode_operations shmem_symlink_inode_operations = { 2301 static const struct inode_operations shmem_symlink_inode_operations = {
2302 .readlink = generic_readlink, 2302 .readlink = generic_readlink,
2303 .follow_link = shmem_follow_link, 2303 .follow_link = shmem_follow_link,
2304 .put_link = shmem_put_link, 2304 .put_link = shmem_put_link,
2305 #ifdef CONFIG_TMPFS_XATTR 2305 #ifdef CONFIG_TMPFS_XATTR
2306 .setxattr = shmem_setxattr, 2306 .setxattr = shmem_setxattr,
2307 .getxattr = shmem_getxattr, 2307 .getxattr = shmem_getxattr,
2308 .listxattr = shmem_listxattr, 2308 .listxattr = shmem_listxattr,
2309 .removexattr = shmem_removexattr, 2309 .removexattr = shmem_removexattr,
2310 #endif 2310 #endif
2311 }; 2311 };
2312 2312
2313 static struct dentry *shmem_get_parent(struct dentry *child) 2313 static struct dentry *shmem_get_parent(struct dentry *child)
2314 { 2314 {
2315 return ERR_PTR(-ESTALE); 2315 return ERR_PTR(-ESTALE);
2316 } 2316 }
2317 2317
2318 static int shmem_match(struct inode *ino, void *vfh) 2318 static int shmem_match(struct inode *ino, void *vfh)
2319 { 2319 {
2320 __u32 *fh = vfh; 2320 __u32 *fh = vfh;
2321 __u64 inum = fh[2]; 2321 __u64 inum = fh[2];
2322 inum = (inum << 32) | fh[1]; 2322 inum = (inum << 32) | fh[1];
2323 return ino->i_ino == inum && fh[0] == ino->i_generation; 2323 return ino->i_ino == inum && fh[0] == ino->i_generation;
2324 } 2324 }
2325 2325
2326 static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 2326 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2327 struct fid *fid, int fh_len, int fh_type) 2327 struct fid *fid, int fh_len, int fh_type)
2328 { 2328 {
2329 struct inode *inode; 2329 struct inode *inode;
2330 struct dentry *dentry = NULL; 2330 struct dentry *dentry = NULL;
2331 u64 inum; 2331 u64 inum;
2332 2332
2333 if (fh_len < 3) 2333 if (fh_len < 3)
2334 return NULL; 2334 return NULL;
2335 2335
2336 inum = fid->raw[2]; 2336 inum = fid->raw[2];
2337 inum = (inum << 32) | fid->raw[1]; 2337 inum = (inum << 32) | fid->raw[1];
2338 2338
2339 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2339 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2340 shmem_match, fid->raw); 2340 shmem_match, fid->raw);
2341 if (inode) { 2341 if (inode) {
2342 dentry = d_find_alias(inode); 2342 dentry = d_find_alias(inode);
2343 iput(inode); 2343 iput(inode);
2344 } 2344 }
2345 2345
2346 return dentry; 2346 return dentry;
2347 } 2347 }
2348 2348
2349 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 2349 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2350 struct inode *parent) 2350 struct inode *parent)
2351 { 2351 {
2352 if (*len < 3) { 2352 if (*len < 3) {
2353 *len = 3; 2353 *len = 3;
2354 return 255; 2354 return 255;
2355 } 2355 }
2356 2356
2357 if (inode_unhashed(inode)) { 2357 if (inode_unhashed(inode)) {
2358 /* Unfortunately insert_inode_hash is not idempotent, 2358 /* Unfortunately insert_inode_hash is not idempotent,
2359 * so as we hash inodes here rather than at creation 2359 * so as we hash inodes here rather than at creation
2360 * time, we need a lock to ensure we only try 2360 * time, we need a lock to ensure we only try
2361 * to do it once 2361 * to do it once
2362 */ 2362 */
2363 static DEFINE_SPINLOCK(lock); 2363 static DEFINE_SPINLOCK(lock);
2364 spin_lock(&lock); 2364 spin_lock(&lock);
2365 if (inode_unhashed(inode)) 2365 if (inode_unhashed(inode))
2366 __insert_inode_hash(inode, 2366 __insert_inode_hash(inode,
2367 inode->i_ino + inode->i_generation); 2367 inode->i_ino + inode->i_generation);
2368 spin_unlock(&lock); 2368 spin_unlock(&lock);
2369 } 2369 }
2370 2370
2371 fh[0] = inode->i_generation; 2371 fh[0] = inode->i_generation;
2372 fh[1] = inode->i_ino; 2372 fh[1] = inode->i_ino;
2373 fh[2] = ((__u64)inode->i_ino) >> 32; 2373 fh[2] = ((__u64)inode->i_ino) >> 32;
2374 2374
2375 *len = 3; 2375 *len = 3;
2376 return 1; 2376 return 1;
2377 } 2377 }
2378 2378
2379 static const struct export_operations shmem_export_ops = { 2379 static const struct export_operations shmem_export_ops = {
2380 .get_parent = shmem_get_parent, 2380 .get_parent = shmem_get_parent,
2381 .encode_fh = shmem_encode_fh, 2381 .encode_fh = shmem_encode_fh,
2382 .fh_to_dentry = shmem_fh_to_dentry, 2382 .fh_to_dentry = shmem_fh_to_dentry,
2383 }; 2383 };
2384 2384
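These export ops also back the open-by-handle syscalls; a sketch assuming a tmpfs at /dev/shm. shmem_encode_fh() emits three 32-bit words (generation plus a 64-bit inode number), reopening needs CAP_DAC_READ_SEARCH, and a cold inode cache can still yield ESTALE since shmem_fh_to_dentry() only does an ilookup5().

#define _GNU_SOURCE      /* name_to_handle_at(), open_by_handle_at() */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/dev/shm/handle-demo";
	int fd = open(path, O_RDWR | O_CREAT, 0600);
	if (fd < 0) { perror("open"); return 1; }
	close(fd);

	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh) return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;

	int mount_id;
	if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}
	printf("handle: %u bytes, type %d\n", fh->handle_bytes, fh->handle_type);

	/* mount_fd may be any fd on the same tmpfs. */
	int mfd = open("/dev/shm", O_RDONLY);
	int fd2 = open_by_handle_at(mfd, fh, O_RDWR);
	if (fd2 < 0)
		perror("open_by_handle_at"); /* EPERM without the capability */
	free(fh);
	return 0;
}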
2385 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 2385 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2386 bool remount) 2386 bool remount)
2387 { 2387 {
2388 char *this_char, *value, *rest; 2388 char *this_char, *value, *rest;
2389 uid_t uid; 2389 uid_t uid;
2390 gid_t gid; 2390 gid_t gid;
2391 2391
2392 while (options != NULL) { 2392 while (options != NULL) {
2393 this_char = options; 2393 this_char = options;
2394 for (;;) { 2394 for (;;) {
2395 /* 2395 /*
2396 * NUL-terminate this option: unfortunately, 2396 * NUL-terminate this option: unfortunately,
2397 * mount options form a comma-separated list, 2397 * mount options form a comma-separated list,
2398 * but mpol's nodelist may also contain commas. 2398 * but mpol's nodelist may also contain commas.
2399 */ 2399 */
2400 options = strchr(options, ','); 2400 options = strchr(options, ',');
2401 if (options == NULL) 2401 if (options == NULL)
2402 break; 2402 break;
2403 options++; 2403 options++;
2404 if (!isdigit(*options)) { 2404 if (!isdigit(*options)) {
2405 options[-1] = '\0'; 2405 options[-1] = '\0';
2406 break; 2406 break;
2407 } 2407 }
2408 } 2408 }
2409 if (!*this_char) 2409 if (!*this_char)
2410 continue; 2410 continue;
2411 if ((value = strchr(this_char,'=')) != NULL) { 2411 if ((value = strchr(this_char,'=')) != NULL) {
2412 *value++ = 0; 2412 *value++ = 0;
2413 } else { 2413 } else {
2414 printk(KERN_ERR 2414 printk(KERN_ERR
2415 "tmpfs: No value for mount option '%s'\n", 2415 "tmpfs: No value for mount option '%s'\n",
2416 this_char); 2416 this_char);
2417 return 1; 2417 return 1;
2418 } 2418 }
2419 2419
2420 if (!strcmp(this_char,"size")) { 2420 if (!strcmp(this_char,"size")) {
2421 unsigned long long size; 2421 unsigned long long size;
2422 size = memparse(value,&rest); 2422 size = memparse(value,&rest);
2423 if (*rest == '%') { 2423 if (*rest == '%') {
2424 size <<= PAGE_SHIFT; 2424 size <<= PAGE_SHIFT;
2425 size *= totalram_pages; 2425 size *= totalram_pages;
2426 do_div(size, 100); 2426 do_div(size, 100);
2427 rest++; 2427 rest++;
2428 } 2428 }
2429 if (*rest) 2429 if (*rest)
2430 goto bad_val; 2430 goto bad_val;
2431 sbinfo->max_blocks = 2431 sbinfo->max_blocks =
2432 DIV_ROUND_UP(size, PAGE_CACHE_SIZE); 2432 DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2433 } else if (!strcmp(this_char,"nr_blocks")) { 2433 } else if (!strcmp(this_char,"nr_blocks")) {
2434 sbinfo->max_blocks = memparse(value, &rest); 2434 sbinfo->max_blocks = memparse(value, &rest);
2435 if (*rest) 2435 if (*rest)
2436 goto bad_val; 2436 goto bad_val;
2437 } else if (!strcmp(this_char,"nr_inodes")) { 2437 } else if (!strcmp(this_char,"nr_inodes")) {
2438 sbinfo->max_inodes = memparse(value, &rest); 2438 sbinfo->max_inodes = memparse(value, &rest);
2439 if (*rest) 2439 if (*rest)
2440 goto bad_val; 2440 goto bad_val;
2441 } else if (!strcmp(this_char,"mode")) { 2441 } else if (!strcmp(this_char,"mode")) {
2442 if (remount) 2442 if (remount)
2443 continue; 2443 continue;
2444 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 2444 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
2445 if (*rest) 2445 if (*rest)
2446 goto bad_val; 2446 goto bad_val;
2447 } else if (!strcmp(this_char,"uid")) { 2447 } else if (!strcmp(this_char,"uid")) {
2448 if (remount) 2448 if (remount)
2449 continue; 2449 continue;
2450 uid = simple_strtoul(value, &rest, 0); 2450 uid = simple_strtoul(value, &rest, 0);
2451 if (*rest) 2451 if (*rest)
2452 goto bad_val; 2452 goto bad_val;
2453 sbinfo->uid = make_kuid(current_user_ns(), uid); 2453 sbinfo->uid = make_kuid(current_user_ns(), uid);
2454 if (!uid_valid(sbinfo->uid)) 2454 if (!uid_valid(sbinfo->uid))
2455 goto bad_val; 2455 goto bad_val;
2456 } else if (!strcmp(this_char,"gid")) { 2456 } else if (!strcmp(this_char,"gid")) {
2457 if (remount) 2457 if (remount)
2458 continue; 2458 continue;
2459 gid = simple_strtoul(value, &rest, 0); 2459 gid = simple_strtoul(value, &rest, 0);
2460 if (*rest) 2460 if (*rest)
2461 goto bad_val; 2461 goto bad_val;
2462 sbinfo->gid = make_kgid(current_user_ns(), gid); 2462 sbinfo->gid = make_kgid(current_user_ns(), gid);
2463 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2464 goto bad_val; 2464 goto bad_val;
2465 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2466 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2466 if (mpol_parse_str(value, &sbinfo->mpol, 1))
2467 goto bad_val; 2467 goto bad_val;
2468 } else { 2468 } else {
2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2470 this_char); 2470 this_char);
2471 return 1; 2471 return 1;
2472 } 2472 }
2473 } 2473 }
2474 return 0; 2474 return 0;
2475 2475
2476 bad_val: 2476 bad_val:
2477 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2477 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2478 value, this_char); 2478 value, this_char);
2479 return 1; 2479 return 1;
2480 2480
2481 } 2481 }
2482 2482
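The option string parsed above is whatever arrives as the data argument of mount(2); a sketch, with the mount point and sizes assumed:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Comma-separated options in the form shmem_parse_options()
	 * expects; "size" takes k/m/g suffixes or a trailing '%'
	 * meaning a percentage of total RAM. */
	const char *opts = "size=64m,nr_inodes=4096,mode=1777";

	if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0, opts) < 0) {
		perror("mount");     /* needs CAP_SYS_ADMIN */
		return 1;
	}
	return 0;
}

Passing MS_REMOUNT in the flags reaches shmem_remount_fs() instead, which re-parses the options with remount=true so that mode/uid/gid changes are skipped.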
2483 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 2483 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2484 { 2484 {
2485 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2485 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2486 struct shmem_sb_info config = *sbinfo; 2486 struct shmem_sb_info config = *sbinfo;
2487 unsigned long inodes; 2487 unsigned long inodes;
2488 int error = -EINVAL; 2488 int error = -EINVAL;
2489 2489
2490 if (shmem_parse_options(data, &config, true)) 2490 if (shmem_parse_options(data, &config, true))
2491 return error; 2491 return error;
2492 2492
2493 spin_lock(&sbinfo->stat_lock); 2493 spin_lock(&sbinfo->stat_lock);
2494 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2494 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2495 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 2495 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2496 goto out; 2496 goto out;
2497 if (config.max_inodes < inodes) 2497 if (config.max_inodes < inodes)
2498 goto out; 2498 goto out;
2499 /* 2499 /*
2500 * Those tests disallow limited->unlimited while any are in use; 2500 * Those tests disallow limited->unlimited while any are in use;
2501 * but we must separately disallow unlimited->limited, because 2501 * but we must separately disallow unlimited->limited, because
2502 * in that case we have no record of how much is already in use. 2502 * in that case we have no record of how much is already in use.
2503 */ 2503 */
2504 if (config.max_blocks && !sbinfo->max_blocks) 2504 if (config.max_blocks && !sbinfo->max_blocks)
2505 goto out; 2505 goto out;
2506 if (config.max_inodes && !sbinfo->max_inodes) 2506 if (config.max_inodes && !sbinfo->max_inodes)
2507 goto out; 2507 goto out;
2508 2508
2509 error = 0; 2509 error = 0;
2510 sbinfo->max_blocks = config.max_blocks; 2510 sbinfo->max_blocks = config.max_blocks;
2511 sbinfo->max_inodes = config.max_inodes; 2511 sbinfo->max_inodes = config.max_inodes;
2512 sbinfo->free_inodes = config.max_inodes - inodes; 2512 sbinfo->free_inodes = config.max_inodes - inodes;
2513 2513
2514 mpol_put(sbinfo->mpol); 2514 mpol_put(sbinfo->mpol);
2515 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2515 sbinfo->mpol = config.mpol; /* transfers initial ref */
2516 out: 2516 out:
2517 spin_unlock(&sbinfo->stat_lock); 2517 spin_unlock(&sbinfo->stat_lock);
2518 return error; 2518 return error;
2519 } 2519 }
2520 2520
2521 static int shmem_show_options(struct seq_file *seq, struct dentry *root) 2521 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2522 { 2522 {
2523 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 2523 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
2524 2524
2525 if (sbinfo->max_blocks != shmem_default_max_blocks()) 2525 if (sbinfo->max_blocks != shmem_default_max_blocks())
2526 seq_printf(seq, ",size=%luk", 2526 seq_printf(seq, ",size=%luk",
2527 sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); 2527 sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
2528 if (sbinfo->max_inodes != shmem_default_max_inodes()) 2528 if (sbinfo->max_inodes != shmem_default_max_inodes())
2529 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 2529 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
2530 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 2530 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
2531 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 2531 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
2532 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 2532 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
2533 seq_printf(seq, ",uid=%u", 2533 seq_printf(seq, ",uid=%u",
2534 from_kuid_munged(&init_user_ns, sbinfo->uid)); 2534 from_kuid_munged(&init_user_ns, sbinfo->uid));
2535 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 2535 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
2536 seq_printf(seq, ",gid=%u", 2536 seq_printf(seq, ",gid=%u",
2537 from_kgid_munged(&init_user_ns, sbinfo->gid)); 2537 from_kgid_munged(&init_user_ns, sbinfo->gid));
2538 shmem_show_mpol(seq, sbinfo->mpol); 2538 shmem_show_mpol(seq, sbinfo->mpol);
2539 return 0; 2539 return 0;
2540 } 2540 }
2541 #endif /* CONFIG_TMPFS */ 2541 #endif /* CONFIG_TMPFS */
2542 2542
2543 static void shmem_put_super(struct super_block *sb) 2543 static void shmem_put_super(struct super_block *sb)
2544 { 2544 {
2545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2546 2546
2547 percpu_counter_destroy(&sbinfo->used_blocks); 2547 percpu_counter_destroy(&sbinfo->used_blocks);
2548 kfree(sbinfo); 2548 kfree(sbinfo);
2549 sb->s_fs_info = NULL; 2549 sb->s_fs_info = NULL;
2550 } 2550 }
2551 2551
2552 int shmem_fill_super(struct super_block *sb, void *data, int silent) 2552 int shmem_fill_super(struct super_block *sb, void *data, int silent)
2553 { 2553 {
2554 struct inode *inode; 2554 struct inode *inode;
2555 struct shmem_sb_info *sbinfo; 2555 struct shmem_sb_info *sbinfo;
2556 int err = -ENOMEM; 2556 int err = -ENOMEM;
2557 2557
2558 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2558 /* Round up to L1_CACHE_BYTES to resist false sharing */
2559 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 2559 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2560 L1_CACHE_BYTES), GFP_KERNEL); 2560 L1_CACHE_BYTES), GFP_KERNEL);
2561 if (!sbinfo) 2561 if (!sbinfo)
2562 return -ENOMEM; 2562 return -ENOMEM;
2563 2563
2564 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2564 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2565 sbinfo->uid = current_fsuid(); 2565 sbinfo->uid = current_fsuid();
2566 sbinfo->gid = current_fsgid(); 2566 sbinfo->gid = current_fsgid();
2567 sb->s_fs_info = sbinfo; 2567 sb->s_fs_info = sbinfo;
2568 2568
2569 #ifdef CONFIG_TMPFS 2569 #ifdef CONFIG_TMPFS
2570 /* 2570 /*
2571 * Per default we only allow half of the physical ram per 2571 * Per default we only allow half of the physical ram per
2572 * tmpfs instance, limiting inodes to one per page of lowmem; 2572 * tmpfs instance, limiting inodes to one per page of lowmem;
2573 * but the internal instance is left unlimited. 2573 * but the internal instance is left unlimited.
2574 */ 2574 */
2575 if (!(sb->s_flags & MS_NOUSER)) { 2575 if (!(sb->s_flags & MS_NOUSER)) {
2576 sbinfo->max_blocks = shmem_default_max_blocks(); 2576 sbinfo->max_blocks = shmem_default_max_blocks();
2577 sbinfo->max_inodes = shmem_default_max_inodes(); 2577 sbinfo->max_inodes = shmem_default_max_inodes();
2578 if (shmem_parse_options(data, sbinfo, false)) { 2578 if (shmem_parse_options(data, sbinfo, false)) {
2579 err = -EINVAL; 2579 err = -EINVAL;
2580 goto failed; 2580 goto failed;
2581 } 2581 }
2582 } 2582 }
2583 sb->s_export_op = &shmem_export_ops; 2583 sb->s_export_op = &shmem_export_ops;
2584 sb->s_flags |= MS_NOSEC; 2584 sb->s_flags |= MS_NOSEC;
2585 #else 2585 #else
2586 sb->s_flags |= MS_NOUSER; 2586 sb->s_flags |= MS_NOUSER;
2587 #endif 2587 #endif
2588 2588
2589 spin_lock_init(&sbinfo->stat_lock); 2589 spin_lock_init(&sbinfo->stat_lock);
2590 if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2590 if (percpu_counter_init(&sbinfo->used_blocks, 0))
2591 goto failed; 2591 goto failed;
2592 sbinfo->free_inodes = sbinfo->max_inodes; 2592 sbinfo->free_inodes = sbinfo->max_inodes;
2593 2593
2594 sb->s_maxbytes = MAX_LFS_FILESIZE; 2594 sb->s_maxbytes = MAX_LFS_FILESIZE;
2595 sb->s_blocksize = PAGE_CACHE_SIZE; 2595 sb->s_blocksize = PAGE_CACHE_SIZE;
2596 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2596 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2597 sb->s_magic = TMPFS_MAGIC; 2597 sb->s_magic = TMPFS_MAGIC;
2598 sb->s_op = &shmem_ops; 2598 sb->s_op = &shmem_ops;
2599 sb->s_time_gran = 1; 2599 sb->s_time_gran = 1;
2600 #ifdef CONFIG_TMPFS_XATTR 2600 #ifdef CONFIG_TMPFS_XATTR
2601 sb->s_xattr = shmem_xattr_handlers; 2601 sb->s_xattr = shmem_xattr_handlers;
2602 #endif 2602 #endif
2603 #ifdef CONFIG_TMPFS_POSIX_ACL 2603 #ifdef CONFIG_TMPFS_POSIX_ACL
2604 sb->s_flags |= MS_POSIXACL; 2604 sb->s_flags |= MS_POSIXACL;
2605 #endif 2605 #endif
2606 2606
2607 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 2607 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
2608 if (!inode) 2608 if (!inode)
2609 goto failed; 2609 goto failed;
2610 inode->i_uid = sbinfo->uid; 2610 inode->i_uid = sbinfo->uid;
2611 inode->i_gid = sbinfo->gid; 2611 inode->i_gid = sbinfo->gid;
2612 sb->s_root = d_make_root(inode); 2612 sb->s_root = d_make_root(inode);
2613 if (!sb->s_root) 2613 if (!sb->s_root)
2614 goto failed; 2614 goto failed;
2615 return 0; 2615 return 0;
2616 2616
2617 failed: 2617 failed:
2618 shmem_put_super(sb); 2618 shmem_put_super(sb);
2619 return err; 2619 return err;
2620 } 2620 }
2621 2621
2622 static struct kmem_cache *shmem_inode_cachep; 2622 static struct kmem_cache *shmem_inode_cachep;
2623 2623
2624 static struct inode *shmem_alloc_inode(struct super_block *sb) 2624 static struct inode *shmem_alloc_inode(struct super_block *sb)
2625 { 2625 {
2626 struct shmem_inode_info *info; 2626 struct shmem_inode_info *info;
2627 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2627 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2628 if (!info) 2628 if (!info)
2629 return NULL; 2629 return NULL;
2630 return &info->vfs_inode; 2630 return &info->vfs_inode;
2631 } 2631 }
2632 2632
2633 static void shmem_destroy_callback(struct rcu_head *head) 2633 static void shmem_destroy_callback(struct rcu_head *head)
2634 { 2634 {
2635 struct inode *inode = container_of(head, struct inode, i_rcu); 2635 struct inode *inode = container_of(head, struct inode, i_rcu);
2636 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2636 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2637 } 2637 }
2638 2638
2639 static void shmem_destroy_inode(struct inode *inode) 2639 static void shmem_destroy_inode(struct inode *inode)
2640 { 2640 {
2641 if (S_ISREG(inode->i_mode)) 2641 if (S_ISREG(inode->i_mode))
2642 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2642 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2643 call_rcu(&inode->i_rcu, shmem_destroy_callback); 2643 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2644 } 2644 }
2645 2645
2646 static void shmem_init_inode(void *foo) 2646 static void shmem_init_inode(void *foo)
2647 { 2647 {
2648 struct shmem_inode_info *info = foo; 2648 struct shmem_inode_info *info = foo;
2649 inode_init_once(&info->vfs_inode); 2649 inode_init_once(&info->vfs_inode);
2650 } 2650 }
2651 2651
2652 static int shmem_init_inodecache(void) 2652 static int shmem_init_inodecache(void)
2653 { 2653 {
2654 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2654 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2655 sizeof(struct shmem_inode_info), 2655 sizeof(struct shmem_inode_info),
2656 0, SLAB_PANIC, shmem_init_inode); 2656 0, SLAB_PANIC, shmem_init_inode);
2657 return 0; 2657 return 0;
2658 } 2658 }
2659 2659
2660 static void shmem_destroy_inodecache(void) 2660 static void shmem_destroy_inodecache(void)
2661 { 2661 {
2662 kmem_cache_destroy(shmem_inode_cachep); 2662 kmem_cache_destroy(shmem_inode_cachep);
2663 } 2663 }
2664 2664
2665 static const struct address_space_operations shmem_aops = { 2665 static const struct address_space_operations shmem_aops = {
2666 .writepage = shmem_writepage, 2666 .writepage = shmem_writepage,
2667 .set_page_dirty = __set_page_dirty_no_writeback, 2667 .set_page_dirty = __set_page_dirty_no_writeback,
2668 #ifdef CONFIG_TMPFS 2668 #ifdef CONFIG_TMPFS
2669 .write_begin = shmem_write_begin, 2669 .write_begin = shmem_write_begin,
2670 .write_end = shmem_write_end, 2670 .write_end = shmem_write_end,
2671 #endif 2671 #endif
2672 .migratepage = migrate_page, 2672 .migratepage = migrate_page,
2673 .error_remove_page = generic_error_remove_page, 2673 .error_remove_page = generic_error_remove_page,
2674 }; 2674 };
2675 2675
2676 static const struct file_operations shmem_file_operations = { 2676 static const struct file_operations shmem_file_operations = {
2677 .mmap = shmem_mmap, 2677 .mmap = shmem_mmap,
2678 #ifdef CONFIG_TMPFS 2678 #ifdef CONFIG_TMPFS
2679 .llseek = shmem_file_llseek, 2679 .llseek = shmem_file_llseek,
2680 .read = do_sync_read, 2680 .read = do_sync_read,
2681 .write = do_sync_write, 2681 .write = do_sync_write,
2682 .aio_read = shmem_file_aio_read, 2682 .aio_read = shmem_file_aio_read,
2683 .aio_write = generic_file_aio_write, 2683 .aio_write = generic_file_aio_write,
2684 .fsync = noop_fsync, 2684 .fsync = noop_fsync,
2685 .splice_read = shmem_file_splice_read, 2685 .splice_read = shmem_file_splice_read,
2686 .splice_write = generic_file_splice_write, 2686 .splice_write = generic_file_splice_write,
2687 .fallocate = shmem_fallocate, 2687 .fallocate = shmem_fallocate,
2688 #endif 2688 #endif
2689 }; 2689 };
2690 2690
2691 static const struct inode_operations shmem_inode_operations = { 2691 static const struct inode_operations shmem_inode_operations = {
2692 .setattr = shmem_setattr, 2692 .setattr = shmem_setattr,
2693 #ifdef CONFIG_TMPFS_XATTR 2693 #ifdef CONFIG_TMPFS_XATTR
2694 .setxattr = shmem_setxattr, 2694 .setxattr = shmem_setxattr,
2695 .getxattr = shmem_getxattr, 2695 .getxattr = shmem_getxattr,
2696 .listxattr = shmem_listxattr, 2696 .listxattr = shmem_listxattr,
2697 .removexattr = shmem_removexattr, 2697 .removexattr = shmem_removexattr,
2698 #endif 2698 #endif
2699 }; 2699 };
2700 2700
2701 static const struct inode_operations shmem_dir_inode_operations = { 2701 static const struct inode_operations shmem_dir_inode_operations = {
2702 #ifdef CONFIG_TMPFS 2702 #ifdef CONFIG_TMPFS
2703 .create = shmem_create, 2703 .create = shmem_create,
2704 .lookup = simple_lookup, 2704 .lookup = simple_lookup,
2705 .link = shmem_link, 2705 .link = shmem_link,
2706 .unlink = shmem_unlink, 2706 .unlink = shmem_unlink,
2707 .symlink = shmem_symlink, 2707 .symlink = shmem_symlink,
2708 .mkdir = shmem_mkdir, 2708 .mkdir = shmem_mkdir,
2709 .rmdir = shmem_rmdir, 2709 .rmdir = shmem_rmdir,
2710 .mknod = shmem_mknod, 2710 .mknod = shmem_mknod,
2711 .rename = shmem_rename, 2711 .rename = shmem_rename,
2712 #endif 2712 #endif
2713 #ifdef CONFIG_TMPFS_XATTR 2713 #ifdef CONFIG_TMPFS_XATTR
2714 .setxattr = shmem_setxattr, 2714 .setxattr = shmem_setxattr,
2715 .getxattr = shmem_getxattr, 2715 .getxattr = shmem_getxattr,
2716 .listxattr = shmem_listxattr, 2716 .listxattr = shmem_listxattr,
2717 .removexattr = shmem_removexattr, 2717 .removexattr = shmem_removexattr,
2718 #endif 2718 #endif
2719 #ifdef CONFIG_TMPFS_POSIX_ACL 2719 #ifdef CONFIG_TMPFS_POSIX_ACL
2720 .setattr = shmem_setattr, 2720 .setattr = shmem_setattr,
2721 #endif 2721 #endif
2722 }; 2722 };
2723 2723
2724 static const struct inode_operations shmem_special_inode_operations = { 2724 static const struct inode_operations shmem_special_inode_operations = {
2725 #ifdef CONFIG_TMPFS_XATTR 2725 #ifdef CONFIG_TMPFS_XATTR
2726 .setxattr = shmem_setxattr, 2726 .setxattr = shmem_setxattr,
2727 .getxattr = shmem_getxattr, 2727 .getxattr = shmem_getxattr,
2728 .listxattr = shmem_listxattr, 2728 .listxattr = shmem_listxattr,
2729 .removexattr = shmem_removexattr, 2729 .removexattr = shmem_removexattr,
2730 #endif 2730 #endif
2731 #ifdef CONFIG_TMPFS_POSIX_ACL 2731 #ifdef CONFIG_TMPFS_POSIX_ACL
2732 .setattr = shmem_setattr, 2732 .setattr = shmem_setattr,
2733 #endif 2733 #endif
2734 }; 2734 };
2735 2735
2736 static const struct super_operations shmem_ops = { 2736 static const struct super_operations shmem_ops = {
2737 .alloc_inode = shmem_alloc_inode, 2737 .alloc_inode = shmem_alloc_inode,
2738 .destroy_inode = shmem_destroy_inode, 2738 .destroy_inode = shmem_destroy_inode,
2739 #ifdef CONFIG_TMPFS 2739 #ifdef CONFIG_TMPFS
2740 .statfs = shmem_statfs, 2740 .statfs = shmem_statfs,
2741 .remount_fs = shmem_remount_fs, 2741 .remount_fs = shmem_remount_fs,
2742 .show_options = shmem_show_options, 2742 .show_options = shmem_show_options,
2743 #endif 2743 #endif
2744 .evict_inode = shmem_evict_inode, 2744 .evict_inode = shmem_evict_inode,
2745 .drop_inode = generic_delete_inode, 2745 .drop_inode = generic_delete_inode,
2746 .put_super = shmem_put_super, 2746 .put_super = shmem_put_super,
2747 }; 2747 };
2748 2748
2749 static const struct vm_operations_struct shmem_vm_ops = { 2749 static const struct vm_operations_struct shmem_vm_ops = {
2750 .fault = shmem_fault, 2750 .fault = shmem_fault,
2751 #ifdef CONFIG_NUMA 2751 #ifdef CONFIG_NUMA
2752 .set_policy = shmem_set_policy, 2752 .set_policy = shmem_set_policy,
2753 .get_policy = shmem_get_policy, 2753 .get_policy = shmem_get_policy,
2754 #endif 2754 #endif
2755 .remap_pages = generic_file_remap_pages, 2755 .remap_pages = generic_file_remap_pages,
2756 }; 2756 };
2757 2757
2758 static struct dentry *shmem_mount(struct file_system_type *fs_type, 2758 static struct dentry *shmem_mount(struct file_system_type *fs_type,
2759 int flags, const char *dev_name, void *data) 2759 int flags, const char *dev_name, void *data)
2760 { 2760 {
2761 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2761 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2762 } 2762 }
2763 2763
2764 static struct file_system_type shmem_fs_type = { 2764 static struct file_system_type shmem_fs_type = {
2765 .owner = THIS_MODULE, 2765 .owner = THIS_MODULE,
2766 .name = "tmpfs", 2766 .name = "tmpfs",
2767 .mount = shmem_mount, 2767 .mount = shmem_mount,
2768 .kill_sb = kill_litter_super, 2768 .kill_sb = kill_litter_super,
2769 }; 2769 };
2770 2770
2771 int __init shmem_init(void) 2771 int __init shmem_init(void)
2772 { 2772 {
2773 int error; 2773 int error;
2774 2774
2775 error = bdi_init(&shmem_backing_dev_info); 2775 error = bdi_init(&shmem_backing_dev_info);
2776 if (error) 2776 if (error)
2777 goto out4; 2777 goto out4;
2778 2778
2779 error = shmem_init_inodecache(); 2779 error = shmem_init_inodecache();
2780 if (error) 2780 if (error)
2781 goto out3; 2781 goto out3;
2782 2782
2783 error = register_filesystem(&shmem_fs_type); 2783 error = register_filesystem(&shmem_fs_type);
2784 if (error) { 2784 if (error) {
2785 printk(KERN_ERR "Could not register tmpfs\n"); 2785 printk(KERN_ERR "Could not register tmpfs\n");
2786 goto out2; 2786 goto out2;
2787 } 2787 }
2788 2788
2789 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, 2789 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2790 shmem_fs_type.name, NULL); 2790 shmem_fs_type.name, NULL);
2791 if (IS_ERR(shm_mnt)) { 2791 if (IS_ERR(shm_mnt)) {
2792 error = PTR_ERR(shm_mnt); 2792 error = PTR_ERR(shm_mnt);
2793 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2793 printk(KERN_ERR "Could not kern_mount tmpfs\n");
2794 goto out1; 2794 goto out1;
2795 } 2795 }
2796 return 0; 2796 return 0;
2797 2797
2798 out1: 2798 out1:
2799 unregister_filesystem(&shmem_fs_type); 2799 unregister_filesystem(&shmem_fs_type);
2800 out2: 2800 out2:
2801 shmem_destroy_inodecache(); 2801 shmem_destroy_inodecache();
2802 out3: 2802 out3:
2803 bdi_destroy(&shmem_backing_dev_info); 2803 bdi_destroy(&shmem_backing_dev_info);
2804 out4: 2804 out4:
2805 shm_mnt = ERR_PTR(error); 2805 shm_mnt = ERR_PTR(error);
2806 return error; 2806 return error;
2807 } 2807 }
2808 2808
2809 #else /* !CONFIG_SHMEM */ 2809 #else /* !CONFIG_SHMEM */
2810 2810
2811 /* 2811 /*
2812 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 2812 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2813 * 2813 *
2814 * This is intended for small systems where the benefits of the full 2814 * This is intended for small systems where the benefits of the full
2815 * shmem code (swap-backed and resource-limited) are outweighed by 2815 * shmem code (swap-backed and resource-limited) are outweighed by
2816 * its complexity. On systems without swap this code should be 2816 * its complexity. On systems without swap this code should be
2817 * effectively equivalent, but much lighter weight. 2817 * effectively equivalent, but much lighter weight.
2818 */ 2818 */
2819 2819
2820 #include <linux/ramfs.h> 2820 #include <linux/ramfs.h>
2821 2821
2822 static struct file_system_type shmem_fs_type = { 2822 static struct file_system_type shmem_fs_type = {
2823 .name = "tmpfs", 2823 .name = "tmpfs",
2824 .mount = ramfs_mount, 2824 .mount = ramfs_mount,
2825 .kill_sb = kill_litter_super, 2825 .kill_sb = kill_litter_super,
2826 }; 2826 };
2827 2827
2828 int __init shmem_init(void) 2828 int __init shmem_init(void)
2829 { 2829 {
2830 BUG_ON(register_filesystem(&shmem_fs_type) != 0); 2830 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2831 2831
2832 shm_mnt = kern_mount(&shmem_fs_type); 2832 shm_mnt = kern_mount(&shmem_fs_type);
2833 BUG_ON(IS_ERR(shm_mnt)); 2833 BUG_ON(IS_ERR(shm_mnt));
2834 2834
2835 return 0; 2835 return 0;
2836 } 2836 }
2837 2837
2838 int shmem_unuse(swp_entry_t swap, struct page *page) 2838 int shmem_unuse(swp_entry_t swap, struct page *page)
2839 { 2839 {
2840 return 0; 2840 return 0;
2841 } 2841 }
2842 2842
2843 int shmem_lock(struct file *file, int lock, struct user_struct *user) 2843 int shmem_lock(struct file *file, int lock, struct user_struct *user)
2844 { 2844 {
2845 return 0; 2845 return 0;
2846 } 2846 }
2847 2847
2848 void shmem_unlock_mapping(struct address_space *mapping) 2848 void shmem_unlock_mapping(struct address_space *mapping)
2849 { 2849 {
2850 } 2850 }
2851 2851
2852 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 2852 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2853 { 2853 {
2854 truncate_inode_pages_range(inode->i_mapping, lstart, lend); 2854 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2855 } 2855 }
2856 EXPORT_SYMBOL_GPL(shmem_truncate_range); 2856 EXPORT_SYMBOL_GPL(shmem_truncate_range);
2857 2857
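/*
 * With !CONFIG_SHMEM the shmem entry points used by the common code below
 * are plain aliases: faults and file operations fall through to the generic
 * and ramfs implementations, and size accounting becomes a no-op.
 */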
2858 #define shmem_vm_ops generic_file_vm_ops 2858 #define shmem_vm_ops generic_file_vm_ops
2859 #define shmem_file_operations ramfs_file_operations 2859 #define shmem_file_operations ramfs_file_operations
2860 #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2860 #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2861 #define shmem_acct_size(flags, size) 0 2861 #define shmem_acct_size(flags, size) 0
2862 #define shmem_unacct_size(flags, size) do {} while (0) 2862 #define shmem_unacct_size(flags, size) do {} while (0)
2863 2863
2864 #endif /* CONFIG_SHMEM */ 2864 #endif /* CONFIG_SHMEM */
2865 2865
2866 /* common code */ 2866 /* common code */
2867 2867
2868 /** 2868 /**
2869 * shmem_file_setup - get an unlinked file living in tmpfs 2869 * shmem_file_setup - get an unlinked file living in tmpfs
2870 * @name: name for dentry (to be seen in /proc/<pid>/maps) 2870 * @name: name for dentry (to be seen in /proc/<pid>/maps)
2871 * @size: size to be set for the file 2871 * @size: size to be set for the file
2872 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2872 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2873 */ 2873 */
2874 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 2874 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2875 { 2875 {
2876 int error; 2876 int error;
2877 struct file *file; 2877 struct file *file;
2878 struct inode *inode; 2878 struct inode *inode;
2879 struct path path; 2879 struct path path;
2880 struct dentry *root; 2880 struct dentry *root;
2881 struct qstr this; 2881 struct qstr this;
2882 2882
2883 if (IS_ERR(shm_mnt)) 2883 if (IS_ERR(shm_mnt))
2884 return (void *)shm_mnt; 2884 return (void *)shm_mnt;
2885 2885
2886 if (size < 0 || size > MAX_LFS_FILESIZE) 2886 if (size < 0 || size > MAX_LFS_FILESIZE)
2887 return ERR_PTR(-EINVAL); 2887 return ERR_PTR(-EINVAL);
2888 2888
2889 if (shmem_acct_size(flags, size)) 2889 if (shmem_acct_size(flags, size))
2890 return ERR_PTR(-ENOMEM); 2890 return ERR_PTR(-ENOMEM);
2891 2891
2892 error = -ENOMEM; 2892 error = -ENOMEM;
2893 this.name = name; 2893 this.name = name;
2894 this.len = strlen(name); 2894 this.len = strlen(name);
2895 this.hash = 0; /* will go */ 2895 this.hash = 0; /* will go */
2896 root = shm_mnt->mnt_root; 2896 root = shm_mnt->mnt_root;
2897 path.dentry = d_alloc(root, &this); 2897 path.dentry = d_alloc(root, &this);
2898 if (!path.dentry) 2898 if (!path.dentry)
2899 goto put_memory; 2899 goto put_memory;
2900 path.mnt = mntget(shm_mnt); 2900 path.mnt = mntget(shm_mnt);
2901 2901
2902 error = -ENOSPC; 2902 error = -ENOSPC;
2903 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 2903 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2904 if (!inode) 2904 if (!inode)
2905 goto put_dentry; 2905 goto put_dentry;
2906 2906
2907 d_instantiate(path.dentry, inode); 2907 d_instantiate(path.dentry, inode);
2908 inode->i_size = size; 2908 inode->i_size = size;
2909 clear_nlink(inode); /* It is unlinked */ 2909 clear_nlink(inode); /* It is unlinked */
2910 #ifndef CONFIG_MMU 2910 #ifndef CONFIG_MMU
2911 error = ramfs_nommu_expand_for_mapping(inode, size); 2911 error = ramfs_nommu_expand_for_mapping(inode, size);
2912 if (error) 2912 if (error)
2913 goto put_dentry; 2913 goto put_dentry;
2914 #endif 2914 #endif
2915 2915
2916 error = -ENFILE; 2916 error = -ENFILE;
2917 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 2917 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2918 &shmem_file_operations); 2918 &shmem_file_operations);
2919 if (!file) 2919 if (!file)
2920 goto put_dentry; 2920 goto put_dentry;
2921 2921
2922 return file; 2922 return file;
2923 2923
2924 put_dentry: 2924 put_dentry:
2925 path_put(&path); 2925 path_put(&path);
2926 put_memory: 2926 put_memory:
2927 shmem_unacct_size(flags, size); 2927 shmem_unacct_size(flags, size);
2928 return ERR_PTR(error); 2928 return ERR_PTR(error);
2929 } 2929 }
2930 EXPORT_SYMBOL_GPL(shmem_file_setup); 2930 EXPORT_SYMBOL_GPL(shmem_file_setup);
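A minimal sketch of a hypothetical in-kernel caller (illustrative only, not part of this commit) shows the contract documented above: the returned file is already unlinked and sized, errors come back as ERR_PTR(), and the only cleanup needed is fput().

/* Illustrative only: create an unlinked tmpfs file, use it, drop it. */
static int shmem_file_setup_example(void)
{
	struct file *file;

	/* name appears in /proc/<pid>/maps; VM_NORESERVE skips pre-accounting */
	file = shmem_file_setup("example", PAGE_SIZE, VM_NORESERVE);
	if (IS_ERR(file))
		return PTR_ERR(file);	/* -EINVAL, -ENOMEM, -ENOSPC or -ENFILE */

	/* ... read, write or mmap through file->f_mapping ... */

	fput(file);	/* last reference gone: the unlinked inode is freed */
	return 0;
}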
2931 2931
2932 /** 2932 /**
2933 * shmem_zero_setup - setup a shared anonymous mapping 2933 * shmem_zero_setup - setup a shared anonymous mapping
2934 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff 2934 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff
2935 */ 2935 */
2936 int shmem_zero_setup(struct vm_area_struct *vma) 2936 int shmem_zero_setup(struct vm_area_struct *vma)
2937 { 2937 {
2938 struct file *file; 2938 struct file *file;
2939 loff_t size = vma->vm_end - vma->vm_start; 2939 loff_t size = vma->vm_end - vma->vm_start;
2940 2940
2941 file = shmem_file_setup("dev/zero", size, vma->vm_flags); 2941 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2942 if (IS_ERR(file)) 2942 if (IS_ERR(file))
2943 return PTR_ERR(file); 2943 return PTR_ERR(file);
2944 2944
2945 if (vma->vm_file) 2945 if (vma->vm_file)
2946 fput(vma->vm_file); 2946 fput(vma->vm_file);
2947 vma->vm_file = file; 2947 vma->vm_file = file;
2948 vma->vm_ops = &shmem_vm_ops; 2948 vma->vm_ops = &shmem_vm_ops;
2949 return 0; 2949 return 0;
2950 } 2950 }
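For orientation, a rough sketch of the caller's side (variable and label names invented here; the real logic lives in mm/mmap.c): a MAP_SHARED anonymous mapping arrives with no backing file, so the vma is given its tmpfs file before being inserted into the address space.

	/* hypothetical excerpt from the mmap path */
	if (!file && (vm_flags & VM_SHARED)) {
		error = shmem_zero_setup(vma);	/* attaches "dev/zero" file and shmem_vm_ops */
		if (error)
			goto free_vma;		/* invented cleanup label */
	}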
2951 2951
2952 /** 2952 /**
2953 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 2953 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
2954 * @mapping: the page's address_space 2954 * @mapping: the page's address_space
2955 * @index: the page index 2955 * @index: the page index
2956 * @gfp: the page allocator flags to use if allocating 2956 * @gfp: the page allocator flags to use if allocating
2957 * 2957 *
2958 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 2958 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
2959 * with any new page allocations done using the specified allocation flags. 2959 * with any new page allocations done using the specified allocation flags.
2960 * But read_cache_page_gfp() uses the ->readpage() method, which does not 2960 * But read_cache_page_gfp() uses the ->readpage() method, which does not
2961 * suit tmpfs, since it may have pages in swapcache, and needs to find those 2961 * suit tmpfs, since it may have pages in swapcache, and needs to find those
2962 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 2962 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
2963 * 2963 *
2964 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 2964 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
2965 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 2965 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
2966 */ 2966 */
2967 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 2967 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
2968 pgoff_t index, gfp_t gfp) 2968 pgoff_t index, gfp_t gfp)
2969 { 2969 {
2970 #ifdef CONFIG_SHMEM 2970 #ifdef CONFIG_SHMEM
2971 struct inode *inode = mapping->host; 2971 struct inode *inode = mapping->host;
2972 struct page *page; 2972 struct page *page;
2973 int error; 2973 int error;
2974 2974
2975 BUG_ON(mapping->a_ops != &shmem_aops); 2975 BUG_ON(mapping->a_ops != &shmem_aops);
2976 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); 2976 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
2977 if (error) 2977 if (error)
2978 page = ERR_PTR(error); 2978 page = ERR_PTR(error);
2979 else 2979 else
2980 unlock_page(page); 2980 unlock_page(page);
2981 return page; 2981 return page;
2982 #else 2982 #else
2983 /* 2983 /*
2984 * The tiny !SHMEM case uses ramfs without swap 2984 * The tiny !SHMEM case uses ramfs without swap
2985 */ 2985 */
2986 return read_cache_page_gfp(mapping, index, gfp); 2986 return read_cache_page_gfp(mapping, index, gfp);
2987 #endif 2987 #endif
2988 } 2988 }
2989 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 2989 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
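The driver-side pattern mentioned in the comment above can be sketched as follows (hypothetical helper name; mapping_gfp_mask() and the gfp flags are existing kernel interfaces): soften the mapping's allocation mask so that memory pressure produces a soft failure the driver can handle, rather than retries or an OOM kill.

/* Illustrative only: read one object page through shmem, failing softly. */
static struct page *example_read_shmem_page(struct address_space *mapping,
					    pgoff_t index)
{
	/* no retries, no OOM killer, no allocation-failure warning */
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	/* returns the page unlocked with a reference held, or ERR_PTR() */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}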
2990 2990