Commit 12debc4248a4a7f1873e47cda2cdd7faca80b099

Authored by David Howells
Committed by Linus Torvalds
1 parent 755aedc159

iget: remove iget() and the read_inode() super op as being obsolete

Remove the old iget() call and the read_inode() superblock operation it uses,
as these are now obsolete: read_inode() does not allow for proper error
handling (there is no way to distinguish ENOMEM from EIO when marking an
inode bad).

Furthermore, this removes the temptation for code outside a filesystem to use
iget() to look up an inode in that filesystem by number.

iget_locked() should be used instead.  A new function, iget_failed(), was
added in an earlier patch; it should be called to mark an inode as bad,
unlock it and release it should the get routine fail.  iget() and
read_inode() are marked as being obsolete, and references to them are
removed from the documentation.
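
For reference, the iget_failed() helper introduced by that earlier patch
behaves roughly as sketched below (the real implementation lives in
fs/inode.c):

	/* Mark an under-construction inode as bad, unlock it and drop the
	 * reference - roughly what the iget_failed() helper does. */
	void iget_failed(struct inode *inode)
	{
		make_bad_inode(inode);
		unlock_new_inode(inode);
		iput(inode);
	}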

Typically a filesystem will be modified such that the read_inode function
becomes an internal iget function, for example the following:

	void thingyfs_read_inode(struct inode *inode)
	{
		...
	}

would be changed into something like:

	struct inode *thingyfs_iget(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode;
		int ret;

		inode = iget_locked(sb, ino);
		if (!inode)
			return ERR_PTR(-ENOMEM);
		if (!(inode->i_state & I_NEW))
			return inode;

		/* ... read and check the on-disk inode here; on failure,
		 * set ret appropriately and goto error ... */
		unlock_new_inode(inode);
		return inode;
	error:
		iget_failed(inode);
		return ERR_PTR(ret);
	}

and then thingyfs_iget() would be called rather than iget(), for example:

	ret = -EINVAL;
	inode = iget(sb, ino);
	if (!inode || is_bad_inode(inode))
		goto error;

becomes:

	inode = thingyfs_iget(sb, ino);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto error;
	}

Note that is_bad_inode() does not need to be called.  The error returned by
thingyfs_iget() should render it unnecessary.
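
As a further illustration, a directory lookup built on the new helper might
look like the following minimal sketch.  thingyfs_find_entry() is a made-up
helper that returns the inode number for a name (or 0 if the name is absent)
and is not part of this patch:

	static struct dentry *thingyfs_lookup(struct inode *dir,
					      struct dentry *dentry,
					      struct nameidata *nd)
	{
		struct inode *inode = NULL;
		unsigned long ino;

		/* hypothetical directory search; 0 means "no such entry" */
		ino = thingyfs_find_entry(dir, &dentry->d_name);
		if (ino) {
			inode = thingyfs_iget(dir->i_sb, ino);
			if (IS_ERR(inode))
				return ERR_PTR(PTR_ERR(inode));
			/* no is_bad_inode() check is needed here */
		}
		return d_splice_alias(inode, dentry);
	}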

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 9 additions and 41 deletions

Documentation/filesystems/Locking
@@ -90,7 +90,6 @@
 prototypes:
 	struct inode *(*alloc_inode)(struct super_block *sb);
 	void (*destroy_inode)(struct inode *);
-	void (*read_inode) (struct inode *);
 	void (*dirty_inode) (struct inode *);
 	int (*write_inode) (struct inode *, int);
 	void (*put_inode) (struct inode *);
@@ -114,7 +113,6 @@
 		BKL	s_lock	s_umount
 alloc_inode:		no	no	no
 destroy_inode:		no
-read_inode:		no				(see below)
 dirty_inode:		no				(must not sleep)
 write_inode:		no
 put_inode:		no
@@ -133,7 +131,6 @@
 quota_read:		no	no	no	(see below)
 quota_write:		no	no	no	(see below)
 
-	->read_inode() is not a method - it's a callback used in iget().
 	->remount_fs() will have the s_umount lock if it's already mounted.
 When called from get_sb_single, it does NOT have the s_umount lock.
 	->quota_read() and ->quota_write() functions are both guaranteed to

Documentation/filesystems/porting
1 Changes since 2.5.0: 1 Changes since 2.5.0:
2 2
3 --- 3 ---
4 [recommended] 4 [recommended]
5 5
6 New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(), 6 New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(),
7 sb_set_blocksize() and sb_min_blocksize(). 7 sb_set_blocksize() and sb_min_blocksize().
8 8
9 Use them. 9 Use them.
10 10
11 (sb_find_get_block() replaces 2.4's get_hash_table()) 11 (sb_find_get_block() replaces 2.4's get_hash_table())
12 12
13 --- 13 ---
14 [recommended] 14 [recommended]
15 15
16 New methods: ->alloc_inode() and ->destroy_inode(). 16 New methods: ->alloc_inode() and ->destroy_inode().
17 17
18 Remove inode->u.foo_inode_i 18 Remove inode->u.foo_inode_i
19 Declare 19 Declare
20 struct foo_inode_info { 20 struct foo_inode_info {
21 /* fs-private stuff */ 21 /* fs-private stuff */
22 struct inode vfs_inode; 22 struct inode vfs_inode;
23 }; 23 };
24 static inline struct foo_inode_info *FOO_I(struct inode *inode) 24 static inline struct foo_inode_info *FOO_I(struct inode *inode)
25 { 25 {
26 return list_entry(inode, struct foo_inode_info, vfs_inode); 26 return list_entry(inode, struct foo_inode_info, vfs_inode);
27 } 27 }
28 28
29 Use FOO_I(inode) instead of &inode->u.foo_inode_i; 29 Use FOO_I(inode) instead of &inode->u.foo_inode_i;
30 30
31 Add foo_alloc_inode() and foo_destroy_inode() - the former should allocate 31 Add foo_alloc_inode() and foo_destroy_inode() - the former should allocate
32 foo_inode_info and return the address of ->vfs_inode, the latter should free 32 foo_inode_info and return the address of ->vfs_inode, the latter should free
33 FOO_I(inode) (see in-tree filesystems for examples). 33 FOO_I(inode) (see in-tree filesystems for examples).
34 34
35 Make them ->alloc_inode and ->destroy_inode in your super_operations. 35 Make them ->alloc_inode and ->destroy_inode in your super_operations.
36 36
37 Keep in mind that now you need explicit initialization of private data - 37 Keep in mind that now you need explicit initialization of private data
38 typically in ->read_inode() and after getting an inode from new_inode(). 38 typically between calling iget_locked() and unlocking the inode.
39 39
40 At some point that will become mandatory. 40 At some point that will become mandatory.
41 41
42 --- 42 ---
43 [mandatory] 43 [mandatory]
44 44
45 Change of file_system_type method (->read_super to ->get_sb) 45 Change of file_system_type method (->read_super to ->get_sb)
46 46
47 ->read_super() is no more. Ditto for DECLARE_FSTYPE and DECLARE_FSTYPE_DEV. 47 ->read_super() is no more. Ditto for DECLARE_FSTYPE and DECLARE_FSTYPE_DEV.
48 48
49 Turn your foo_read_super() into a function that would return 0 in case of 49 Turn your foo_read_super() into a function that would return 0 in case of
50 success and negative number in case of error (-EINVAL unless you have more 50 success and negative number in case of error (-EINVAL unless you have more
51 informative error value to report). Call it foo_fill_super(). Now declare 51 informative error value to report). Call it foo_fill_super(). Now declare
52 52
53 int foo_get_sb(struct file_system_type *fs_type, 53 int foo_get_sb(struct file_system_type *fs_type,
54 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 54 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
55 { 55 {
56 return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super, 56 return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super,
57 mnt); 57 mnt);
58 } 58 }
59 59
60 (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of 60 (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of
61 filesystem). 61 filesystem).
62 62
63 Replace DECLARE_FSTYPE... with explicit initializer and have ->get_sb set as 63 Replace DECLARE_FSTYPE... with explicit initializer and have ->get_sb set as
64 foo_get_sb. 64 foo_get_sb.
65 65
66 --- 66 ---
67 [mandatory] 67 [mandatory]
68 68
69 Locking change: ->s_vfs_rename_sem is taken only by cross-directory renames. 69 Locking change: ->s_vfs_rename_sem is taken only by cross-directory renames.
70 Most likely there is no need to change anything, but if you relied on 70 Most likely there is no need to change anything, but if you relied on
71 global exclusion between renames for some internal purpose - you need to 71 global exclusion between renames for some internal purpose - you need to
72 change your internal locking. Otherwise exclusion warranties remain the 72 change your internal locking. Otherwise exclusion warranties remain the
73 same (i.e. parents and victim are locked, etc.). 73 same (i.e. parents and victim are locked, etc.).
74 74
75 --- 75 ---
76 [informational] 76 [informational]
77 77
78 Now we have the exclusion between ->lookup() and directory removal (by 78 Now we have the exclusion between ->lookup() and directory removal (by
79 ->rmdir() and ->rename()). If you used to need that exclusion and do 79 ->rmdir() and ->rename()). If you used to need that exclusion and do
80 it by internal locking (most of filesystems couldn't care less) - you 80 it by internal locking (most of filesystems couldn't care less) - you
81 can relax your locking. 81 can relax your locking.
82 82
83 --- 83 ---
84 [mandatory] 84 [mandatory]
85 85
86 ->lookup(), ->truncate(), ->create(), ->unlink(), ->mknod(), ->mkdir(), 86 ->lookup(), ->truncate(), ->create(), ->unlink(), ->mknod(), ->mkdir(),
87 ->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename() 87 ->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename()
88 and ->readdir() are called without BKL now. Grab it on entry, drop upon return 88 and ->readdir() are called without BKL now. Grab it on entry, drop upon return
89 - that will guarantee the same locking you used to have. If your method or its 89 - that will guarantee the same locking you used to have. If your method or its
90 parts do not need BKL - better yet, now you can shift lock_kernel() and 90 parts do not need BKL - better yet, now you can shift lock_kernel() and
91 unlock_kernel() so that they would protect exactly what needs to be 91 unlock_kernel() so that they would protect exactly what needs to be
92 protected. 92 protected.
93 93
94 --- 94 ---
95 [mandatory] 95 [mandatory]
96 96
97 BKL is also moved from around sb operations. ->write_super() Is now called 97 BKL is also moved from around sb operations. ->write_super() Is now called
98 without BKL held. BKL should have been shifted into individual fs sb_op 98 without BKL held. BKL should have been shifted into individual fs sb_op
99 functions. If you don't need it, remove it. 99 functions. If you don't need it, remove it.
100 100
101 --- 101 ---
102 [informational] 102 [informational]
103 103
104 check for ->link() target not being a directory is done by callers. Feel 104 check for ->link() target not being a directory is done by callers. Feel
105 free to drop it... 105 free to drop it...
106 106
107 --- 107 ---
108 [informational] 108 [informational]
109 109
110 ->link() callers hold ->i_mutex on the object we are linking to. Some of your 110 ->link() callers hold ->i_mutex on the object we are linking to. Some of your
111 problems might be over... 111 problems might be over...
112 112
113 --- 113 ---
114 [mandatory] 114 [mandatory]
115 115
116 new file_system_type method - kill_sb(superblock). If you are converting 116 new file_system_type method - kill_sb(superblock). If you are converting
117 an existing filesystem, set it according to ->fs_flags: 117 an existing filesystem, set it according to ->fs_flags:
118 FS_REQUIRES_DEV - kill_block_super 118 FS_REQUIRES_DEV - kill_block_super
119 FS_LITTER - kill_litter_super 119 FS_LITTER - kill_litter_super
120 neither - kill_anon_super 120 neither - kill_anon_super
121 FS_LITTER is gone - just remove it from fs_flags. 121 FS_LITTER is gone - just remove it from fs_flags.
122 122
123 --- 123 ---
124 [mandatory] 124 [mandatory]
125 125
126 FS_SINGLE is gone (actually, that had happened back when ->get_sb() 126 FS_SINGLE is gone (actually, that had happened back when ->get_sb()
127 went in - and hadn't been documented ;-/). Just remove it from fs_flags 127 went in - and hadn't been documented ;-/). Just remove it from fs_flags
128 (and see ->get_sb() entry for other actions). 128 (and see ->get_sb() entry for other actions).
129 129
130 --- 130 ---
131 [mandatory] 131 [mandatory]
132 132
133 ->setattr() is called without BKL now. Caller _always_ holds ->i_mutex, so 133 ->setattr() is called without BKL now. Caller _always_ holds ->i_mutex, so
134 watch for ->i_mutex-grabbing code that might be used by your ->setattr(). 134 watch for ->i_mutex-grabbing code that might be used by your ->setattr().
135 Callers of notify_change() need ->i_mutex now. 135 Callers of notify_change() need ->i_mutex now.
136 136
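For example, a caller of notify_change() now looks something like the
following sketch (inode, dentry, error and newattrs being whatever the caller
already has in hand):

	mutex_lock(&inode->i_mutex);
	error = notify_change(dentry, &newattrs);
	mutex_unlock(&inode->i_mutex);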
137 --- 137 ---
138 [recommended] 138 [recommended]
139 139
140 New super_block field "struct export_operations *s_export_op" for 140 New super_block field "struct export_operations *s_export_op" for
141 explicit support for exporting, e.g. via NFS. The structure is fully 141 explicit support for exporting, e.g. via NFS. The structure is fully
142 documented at its declaration in include/linux/fs.h, and in 142 documented at its declaration in include/linux/fs.h, and in
143 Documentation/filesystems/Exporting. 143 Documentation/filesystems/Exporting.
144 144
145 Briefly it allows for the definition of decode_fh and encode_fh operations 145 Briefly it allows for the definition of decode_fh and encode_fh operations
146 to encode and decode filehandles, and allows the filesystem to use 146 to encode and decode filehandles, and allows the filesystem to use
147 a standard helper function for decode_fh, and provide file-system specific 147 a standard helper function for decode_fh, and provide file-system specific
148 support for this helper, particularly get_parent. 148 support for this helper, particularly get_parent.
149 149
150 It is planned that this will be required for exporting once the code 150 It is planned that this will be required for exporting once the code
151 settles down a bit. 151 settles down a bit.
152 152
153 [mandatory] 153 [mandatory]
154 154
155 s_export_op is now required for exporting a filesystem. 155 s_export_op is now required for exporting a filesystem.
156 isofs, ext2, ext3, reiserfs, fat 156 isofs, ext2, ext3, reiserfs, fat
157 can be used as examples of very different filesystems. 157 can be used as examples of very different filesystems.
158 158
159 --- 159 ---
160 [mandatory] 160 [mandatory]
161 161
162 iget4() and the read_inode2 callback have been superseded by iget5_locked() 162 iget4() and the read_inode2 callback have been superseded by iget5_locked()
163 which has the following prototype, 163 which has the following prototype,
164 164
165 struct inode *iget5_locked(struct super_block *sb, unsigned long ino, 165 struct inode *iget5_locked(struct super_block *sb, unsigned long ino,
166 int (*test)(struct inode *, void *), 166 int (*test)(struct inode *, void *),
167 int (*set)(struct inode *, void *), 167 int (*set)(struct inode *, void *),
168 void *data); 168 void *data);
169 169
170 'test' is an additional function that can be used when the inode 170 'test' is an additional function that can be used when the inode
171 number is not sufficient to identify the actual file object. 'set' 171 number is not sufficient to identify the actual file object. 'set'
172 should be a non-blocking function that initializes those parts of a 172 should be a non-blocking function that initializes those parts of a
173 newly created inode to allow the test function to succeed. 'data' is 173 newly created inode to allow the test function to succeed. 'data' is
174 passed as an opaque value to both test and set functions. 174 passed as an opaque value to both test and set functions.
175 175
176 When the inode has been created by iget5_locked(), it will be returned with 176 When the inode has been created by iget5_locked(), it will be returned with the
177 the I_NEW flag set and will still be locked. read_inode has not been 177 I_NEW flag set and will still be locked. The filesystem then needs to finalize
178 called so the file system still has to finalize the initialization. Once 178 the initialization. Once the inode is initialized it must be unlocked by
179 the inode is initialized it must be unlocked by calling unlock_new_inode(). 179 calling unlock_new_inode().
180 180
181 The filesystem is responsible for setting (and possibly testing) i_ino 181 The filesystem is responsible for setting (and possibly testing) i_ino
182 when appropriate. There is also a simpler iget_locked function that 182 when appropriate. There is also a simpler iget_locked function that
183 just takes the superblock and inode number as arguments and does the 183 just takes the superblock and inode number as arguments and does the
184 test and set for you. 184 test and set for you.
185 185
186 e.g. 186 e.g.
187 inode = iget_locked(sb, ino); 187 inode = iget_locked(sb, ino);
188 if (inode->i_state & I_NEW) { 188 if (inode->i_state & I_NEW) {
189 err = read_inode_from_disk(inode); 189 err = read_inode_from_disk(inode);
190 if (err < 0) { 190 if (err < 0) {
191 iget_failed(inode); 191 iget_failed(inode);
192 return err; 192 return err;
193 } 193 }
194 unlock_new_inode(inode); 194 unlock_new_inode(inode);
195 } 195 }
196 196
197 Note that if the process of setting up a new inode fails, then iget_failed() 197 Note that if the process of setting up a new inode fails, then iget_failed()
198 should be called on the inode to render it dead, and an appropriate error 198 should be called on the inode to render it dead, and an appropriate error
199 should be passed back to the caller. 199 should be passed back to the caller.
200 200
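As a complement to the iget_locked() example above, a hedged sketch of the
iget5_locked() form; the thingyfs_* names and the args structure are made up,
and note the NULL check for allocation failure, which the shorter example
above omits:

	struct thingyfs_args {
		unsigned long	ino;
		__u32		generation;
	};

	static int thingyfs_inode_test(struct inode *inode, void *opaque)
	{
		struct thingyfs_args *args = opaque;

		return inode->i_ino == args->ino &&
			inode->i_generation == args->generation;
	}

	/* 'set' must not block */
	static int thingyfs_inode_set(struct inode *inode, void *opaque)
	{
		struct thingyfs_args *args = opaque;

		inode->i_ino = args->ino;
		inode->i_generation = args->generation;
		return 0;
	}

	...
	inode = iget5_locked(sb, args.ino, thingyfs_inode_test,
			     thingyfs_inode_set, &args);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		/* read the rest of the inode from disc here */
		unlock_new_inode(inode);
	}
	return inode;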
201 --- 201 ---
202 [recommended] 202 [recommended]
203 203
204 ->getattr() finally getting used. See instances in nfs, minix, etc. 204 ->getattr() finally getting used. See instances in nfs, minix, etc.
205 205
206 --- 206 ---
207 [mandatory] 207 [mandatory]
208 208
209 ->revalidate() is gone. If your filesystem had it - provide ->getattr() 209 ->revalidate() is gone. If your filesystem had it - provide ->getattr()
210 and let it call whatever you had as ->revalidate() + (for symlinks that 210 and let it call whatever you had as ->revalidate() + (for symlinks that
211 had ->revalidate()) add calls in ->follow_link()/->readlink(). 211 had ->revalidate()) add calls in ->follow_link()/->readlink().
212 212
213 --- 213 ---
214 [mandatory] 214 [mandatory]
215 215
216 ->d_parent changes are not protected by BKL anymore. Read access is safe 216 ->d_parent changes are not protected by BKL anymore. Read access is safe
217 if at least one of the following is true: 217 if at least one of the following is true:
218 * filesystem has no cross-directory rename() 218 * filesystem has no cross-directory rename()
219 * dcache_lock is held 219 * dcache_lock is held
220 * we know that parent had been locked (e.g. we are looking at 220 * we know that parent had been locked (e.g. we are looking at
221 ->d_parent of ->lookup() argument). 221 ->d_parent of ->lookup() argument).
222 * we are called from ->rename(). 222 * we are called from ->rename().
223 * the child's ->d_lock is held 223 * the child's ->d_lock is held
224 Audit your code and add locking if needed. Notice that any place that is 224 Audit your code and add locking if needed. Notice that any place that is
225 not protected by the conditions above is risky even in the old tree - you 225 not protected by the conditions above is risky even in the old tree - you
226 had been relying on BKL and that's prone to screwups. Old tree had quite 226 had been relying on BKL and that's prone to screwups. Old tree had quite
227 a few holes of that kind - unprotected access to ->d_parent leading to 227 a few holes of that kind - unprotected access to ->d_parent leading to
228 anything from oops to silent memory corruption. 228 anything from oops to silent memory corruption.
229 229
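As one example of such locking, reading ->d_parent under the child's ->d_lock
might look like this sketch:

	struct dentry *parent;

	spin_lock(&dentry->d_lock);
	parent = dentry->d_parent;
	/* use parent only while ->d_lock is held */
	spin_unlock(&dentry->d_lock);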
230 --- 230 ---
231 [mandatory] 231 [mandatory]
232 232
233 FS_NOMOUNT is gone. If you use it - just set MS_NOUSER in flags 233 FS_NOMOUNT is gone. If you use it - just set MS_NOUSER in flags
234 (see rootfs for one kind of solution and bdev/socket/pipe for another). 234 (see rootfs for one kind of solution and bdev/socket/pipe for another).
235 235
236 --- 236 ---
237 [recommended] 237 [recommended]
238 238
239 Use bdev_read_only(bdev) instead of is_read_only(kdev). The latter 239 Use bdev_read_only(bdev) instead of is_read_only(kdev). The latter
240 is still alive, but only because of the mess in drivers/s390/block/dasd.c. 240 is still alive, but only because of the mess in drivers/s390/block/dasd.c.
241 As soon as it gets fixed is_read_only() will die. 241 As soon as it gets fixed is_read_only() will die.
242 242
243 --- 243 ---
244 [mandatory] 244 [mandatory]
245 245
246 ->permission() is called without BKL now. Grab it on entry, drop upon 246 ->permission() is called without BKL now. Grab it on entry, drop upon
247 return - that will guarantee the same locking you used to have. If 247 return - that will guarantee the same locking you used to have. If
248 your method or its parts do not need BKL - better yet, now you can 248 your method or its parts do not need BKL - better yet, now you can
249 shift lock_kernel() and unlock_kernel() so that they would protect 249 shift lock_kernel() and unlock_kernel() so that they would protect
250 exactly what needs to be protected. 250 exactly what needs to be protected.
251 251
252 --- 252 ---
253 [mandatory] 253 [mandatory]
254 254
255 ->statfs() is now called without BKL held. BKL should have been 255 ->statfs() is now called without BKL held. BKL should have been
256 shifted into individual fs sb_op functions where it's not clear that 256 shifted into individual fs sb_op functions where it's not clear that
257 it's safe to remove it. If you don't need it, remove it. 257 it's safe to remove it. If you don't need it, remove it.
258 258
259 --- 259 ---
260 [mandatory] 260 [mandatory]
261 261
262 is_read_only() is gone; use bdev_read_only() instead. 262 is_read_only() is gone; use bdev_read_only() instead.
263 263
264 --- 264 ---
265 [mandatory] 265 [mandatory]
266 266
267 destroy_buffers() is gone; use invalidate_bdev(). 267 destroy_buffers() is gone; use invalidate_bdev().
268 268
269 --- 269 ---
270 [mandatory] 270 [mandatory]
271 271
272 fsync_dev() is gone; use fsync_bdev(). NOTE: lvm breakage is 272 fsync_dev() is gone; use fsync_bdev(). NOTE: lvm breakage is
273 deliberate; as soon as struct block_device * is propagated in a reasonable 273 deliberate; as soon as struct block_device * is propagated in a reasonable
274 way by that code, fixing will become trivial; until then nothing can be 274 way by that code, fixing will become trivial; until then nothing can be
275 done. 275 done.
276 276
Documentation/filesystems/vfs.txt
1 1
2 Overview of the Linux Virtual File System 2 Overview of the Linux Virtual File System
3 3
4 Original author: Richard Gooch <rgooch@atnf.csiro.au> 4 Original author: Richard Gooch <rgooch@atnf.csiro.au>
5 5
6 Last updated on June 24, 2007. 6 Last updated on June 24, 2007.
7 7
8 Copyright (C) 1999 Richard Gooch 8 Copyright (C) 1999 Richard Gooch
9 Copyright (C) 2005 Pekka Enberg 9 Copyright (C) 2005 Pekka Enberg
10 10
11 This file is released under the GPLv2. 11 This file is released under the GPLv2.
12 12
13 13
14 Introduction 14 Introduction
15 ============ 15 ============
16 16
17 The Virtual File System (also known as the Virtual Filesystem Switch) 17 The Virtual File System (also known as the Virtual Filesystem Switch)
18 is the software layer in the kernel that provides the filesystem 18 is the software layer in the kernel that provides the filesystem
19 interface to userspace programs. It also provides an abstraction 19 interface to userspace programs. It also provides an abstraction
20 within the kernel which allows different filesystem implementations to 20 within the kernel which allows different filesystem implementations to
21 coexist. 21 coexist.
22 22
23 VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so 23 VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so
24 on are called from a process context. Filesystem locking is described 24 on are called from a process context. Filesystem locking is described
25 in the document Documentation/filesystems/Locking. 25 in the document Documentation/filesystems/Locking.
26 26
27 27
28 Directory Entry Cache (dcache) 28 Directory Entry Cache (dcache)
29 ------------------------------ 29 ------------------------------
30 30
31 The VFS implements the open(2), stat(2), chmod(2), and similar system 31 The VFS implements the open(2), stat(2), chmod(2), and similar system
32 calls. The pathname argument that is passed to them is used by the VFS 32 calls. The pathname argument that is passed to them is used by the VFS
33 to search through the directory entry cache (also known as the dentry 33 to search through the directory entry cache (also known as the dentry
34 cache or dcache). This provides a very fast look-up mechanism to 34 cache or dcache). This provides a very fast look-up mechanism to
35 translate a pathname (filename) into a specific dentry. Dentries live 35 translate a pathname (filename) into a specific dentry. Dentries live
36 in RAM and are never saved to disc: they exist only for performance. 36 in RAM and are never saved to disc: they exist only for performance.
37 37
38 The dentry cache is meant to be a view into your entire filespace. As 38 The dentry cache is meant to be a view into your entire filespace. As
39 most computers cannot fit all dentries in the RAM at the same time, 39 most computers cannot fit all dentries in the RAM at the same time,
40 some bits of the cache are missing. In order to resolve your pathname 40 some bits of the cache are missing. In order to resolve your pathname
41 into a dentry, the VFS may have to resort to creating dentries along 41 into a dentry, the VFS may have to resort to creating dentries along
42 the way, and then loading the inode. This is done by looking up the 42 the way, and then loading the inode. This is done by looking up the
43 inode. 43 inode.
44 44
45 45
46 The Inode Object 46 The Inode Object
47 ---------------- 47 ----------------
48 48
49 An individual dentry usually has a pointer to an inode. Inodes are 49 An individual dentry usually has a pointer to an inode. Inodes are
50 filesystem objects such as regular files, directories, FIFOs and other 50 filesystem objects such as regular files, directories, FIFOs and other
51 beasts. They live either on the disc (for block device filesystems) 51 beasts. They live either on the disc (for block device filesystems)
52 or in the memory (for pseudo filesystems). Inodes that live on the 52 or in the memory (for pseudo filesystems). Inodes that live on the
53 disc are copied into the memory when required and changes to the inode 53 disc are copied into the memory when required and changes to the inode
54 are written back to disc. A single inode can be pointed to by multiple 54 are written back to disc. A single inode can be pointed to by multiple
55 dentries (hard links, for example, do this). 55 dentries (hard links, for example, do this).
56 56
57 To look up an inode, the VFS calls the lookup() method of 57 To look up an inode, the VFS calls the lookup() method of
58 the parent directory inode. This method is installed by the specific 58 the parent directory inode. This method is installed by the specific
59 filesystem implementation that the inode lives in. Once the VFS has 59 filesystem implementation that the inode lives in. Once the VFS has
60 the required dentry (and hence the inode), we can do all those boring 60 the required dentry (and hence the inode), we can do all those boring
61 things like open(2) the file, or stat(2) it to peek at the inode 61 things like open(2) the file, or stat(2) it to peek at the inode
62 data. The stat(2) operation is fairly simple: once the VFS has the 62 data. The stat(2) operation is fairly simple: once the VFS has the
63 dentry, it peeks at the inode data and passes some of it back to 63 dentry, it peeks at the inode data and passes some of it back to
64 userspace. 64 userspace.
65 65
66 66
67 The File Object 67 The File Object
68 --------------- 68 ---------------
69 69
70 Opening a file requires another operation: allocation of a file 70 Opening a file requires another operation: allocation of a file
71 structure (this is the kernel-side implementation of file 71 structure (this is the kernel-side implementation of file
72 descriptors). The freshly allocated file structure is initialized with 72 descriptors). The freshly allocated file structure is initialized with
73 a pointer to the dentry and a set of file operation member functions. 73 a pointer to the dentry and a set of file operation member functions.
74 These are taken from the inode data. The open() file method is then 74 These are taken from the inode data. The open() file method is then
75 called so the specific filesystem implementation can do its work. You 75 called so the specific filesystem implementation can do its work. You
76 can see that this is another switch performed by the VFS. The file 76 can see that this is another switch performed by the VFS. The file
77 structure is placed into the file descriptor table for the process. 77 structure is placed into the file descriptor table for the process.
78 78
79 Reading, writing and closing files (and other assorted VFS operations) 79 Reading, writing and closing files (and other assorted VFS operations)
80 is done by using the userspace file descriptor to grab the appropriate 80 is done by using the userspace file descriptor to grab the appropriate
81 file structure, and then calling the required file structure method to 81 file structure, and then calling the required file structure method to
82 do whatever is required. For as long as the file is open, it keeps the 82 do whatever is required. For as long as the file is open, it keeps the
83 dentry in use, which in turn means that the VFS inode is still in use. 83 dentry in use, which in turn means that the VFS inode is still in use.
84 84
85 85
86 Registering and Mounting a Filesystem 86 Registering and Mounting a Filesystem
87 ===================================== 87 =====================================
88 88
89 To register and unregister a filesystem, use the following API 89 To register and unregister a filesystem, use the following API
90 functions: 90 functions:
91 91
92 #include <linux/fs.h> 92 #include <linux/fs.h>
93 93
94 extern int register_filesystem(struct file_system_type *); 94 extern int register_filesystem(struct file_system_type *);
95 extern int unregister_filesystem(struct file_system_type *); 95 extern int unregister_filesystem(struct file_system_type *);
96 96
97 The passed struct file_system_type describes your filesystem. When a 97 The passed struct file_system_type describes your filesystem. When a
98 request is made to mount a device onto a directory in your filespace, 98 request is made to mount a device onto a directory in your filespace,
99 the VFS will call the appropriate get_sb() method for the specific 99 the VFS will call the appropriate get_sb() method for the specific
100 filesystem. The dentry for the mount point will then be updated to 100 filesystem. The dentry for the mount point will then be updated to
101 point to the root inode for the new filesystem. 101 point to the root inode for the new filesystem.
102 102
103 You can see all filesystems that are registered to the kernel in the 103 You can see all filesystems that are registered to the kernel in the
104 file /proc/filesystems. 104 file /proc/filesystems.
105 105
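A minimal sketch of module init/exit for a hypothetical thingyfs
(thingyfs_fs_type being its struct file_system_type, described in the next
section):

	static int __init init_thingyfs(void)
	{
		return register_filesystem(&thingyfs_fs_type);
	}

	static void __exit exit_thingyfs(void)
	{
		unregister_filesystem(&thingyfs_fs_type);
	}

	module_init(init_thingyfs);
	module_exit(exit_thingyfs);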
106 106
107 struct file_system_type 107 struct file_system_type
108 ----------------------- 108 -----------------------
109 109
110 This describes the filesystem. As of kernel 2.6.22, the following 110 This describes the filesystem. As of kernel 2.6.22, the following
111 members are defined: 111 members are defined:
112 112
113 struct file_system_type { 113 struct file_system_type {
114 const char *name; 114 const char *name;
115 int fs_flags; 115 int fs_flags;
116 int (*get_sb) (struct file_system_type *, int, 116 int (*get_sb) (struct file_system_type *, int,
117 const char *, void *, struct vfsmount *); 117 const char *, void *, struct vfsmount *);
118 void (*kill_sb) (struct super_block *); 118 void (*kill_sb) (struct super_block *);
119 struct module *owner; 119 struct module *owner;
120 struct file_system_type * next; 120 struct file_system_type * next;
121 struct list_head fs_supers; 121 struct list_head fs_supers;
122 struct lock_class_key s_lock_key; 122 struct lock_class_key s_lock_key;
123 struct lock_class_key s_umount_key; 123 struct lock_class_key s_umount_key;
124 }; 124 };
125 125
126 name: the name of the filesystem type, such as "ext2", "iso9660", 126 name: the name of the filesystem type, such as "ext2", "iso9660",
127 "msdos" and so on 127 "msdos" and so on
128 128
129 fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) 129 fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.)
130 130
131 get_sb: the method to call when a new instance of this 131 get_sb: the method to call when a new instance of this
132 filesystem should be mounted 132 filesystem should be mounted
133 133
134 kill_sb: the method to call when an instance of this filesystem 134 kill_sb: the method to call when an instance of this filesystem
135 should be unmounted 135 should be unmounted
136 136
137 owner: for internal VFS use: you should initialize this to THIS_MODULE in 137 owner: for internal VFS use: you should initialize this to THIS_MODULE in
138 most cases. 138 most cases.
139 139
140 next: for internal VFS use: you should initialize this to NULL 140 next: for internal VFS use: you should initialize this to NULL
141 141
142 s_lock_key, s_umount_key: lockdep-specific 142 s_lock_key, s_umount_key: lockdep-specific
143 143
144 The get_sb() method has the following arguments: 144 The get_sb() method has the following arguments:
145 145
146 struct file_system_type *fs_type: describes the filesystem, partly initialized 146 struct file_system_type *fs_type: describes the filesystem, partly initialized
147 by the specific filesystem code 147 by the specific filesystem code
148 148
149 int flags: mount flags 149 int flags: mount flags
150 150
151 const char *dev_name: the device name we are mounting. 151 const char *dev_name: the device name we are mounting.
152 152
153 void *data: arbitrary mount options, usually comes as an ASCII 153 void *data: arbitrary mount options, usually comes as an ASCII
154 string 154 string
155 155
156 struct vfsmount *mnt: a vfs-internal representation of a mount point 156 struct vfsmount *mnt: a vfs-internal representation of a mount point
157 157
158 The get_sb() method must determine if the block device specified 158 The get_sb() method must determine if the block device specified
159 in the dev_name and fs_type contains a filesystem of the type the method 159 in the dev_name and fs_type contains a filesystem of the type the method
160 supports. If it succeeds in opening the named block device, it initializes a 160 supports. If it succeeds in opening the named block device, it initializes a
161 struct super_block descriptor for the filesystem contained by the block device. 161 struct super_block descriptor for the filesystem contained by the block device.
162 On failure it returns an error. 162 On failure it returns an error.
163 163
164 The most interesting member of the superblock structure that the 164 The most interesting member of the superblock structure that the
165 get_sb() method fills in is the "s_op" field. This is a pointer to 165 get_sb() method fills in is the "s_op" field. This is a pointer to
166 a "struct super_operations" which describes the next level of the 166 a "struct super_operations" which describes the next level of the
167 filesystem implementation. 167 filesystem implementation.
168 168
169 Usually, a filesystem uses one of the generic get_sb() implementations 169 Usually, a filesystem uses one of the generic get_sb() implementations
170 and provides a fill_super() method instead. The generic methods are: 170 and provides a fill_super() method instead. The generic methods are:
171 171
172 get_sb_bdev: mount a filesystem residing on a block device 172 get_sb_bdev: mount a filesystem residing on a block device
173 173
174 get_sb_nodev: mount a filesystem that is not backed by a device 174 get_sb_nodev: mount a filesystem that is not backed by a device
175 175
176 get_sb_single: mount a filesystem which shares the instance between 176 get_sb_single: mount a filesystem which shares the instance between
177 all mounts 177 all mounts
178 178
179 A fill_super() method implementation has the following arguments: 179 A fill_super() method implementation has the following arguments:
180 180
181 struct super_block *sb: the superblock structure. The method fill_super() 181 struct super_block *sb: the superblock structure. The method fill_super()
182 must initialize this properly. 182 must initialize this properly.
183 183
184 void *data: arbitrary mount options, usually comes as an ASCII 184 void *data: arbitrary mount options, usually comes as an ASCII
185 string 185 string
186 186
187 int silent: whether or not to be silent on error 187 int silent: whether or not to be silent on error
188 188
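For illustration, a sketch of a get_sb() that delegates to get_sb_bdev(); the
thingyfs names and thingyfs_super_ops are hypothetical:

	static int thingyfs_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* read the on-disc superblock, set s_blocksize, s_root, ... */
		sb->s_op = &thingyfs_super_ops;		/* hypothetical */
		return 0;
	}

	static int thingyfs_get_sb(struct file_system_type *fs_type,
				   int flags, const char *dev_name, void *data,
				   struct vfsmount *mnt)
	{
		return get_sb_bdev(fs_type, flags, dev_name, data,
				   thingyfs_fill_super, mnt);
	}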
189 189
190 The Superblock Object 190 The Superblock Object
191 ===================== 191 =====================
192 192
193 A superblock object represents a mounted filesystem. 193 A superblock object represents a mounted filesystem.
194 194
195 195
196 struct super_operations 196 struct super_operations
197 ----------------------- 197 -----------------------
198 198
199 This describes how the VFS can manipulate the superblock of your 199 This describes how the VFS can manipulate the superblock of your
200 filesystem. As of kernel 2.6.22, the following members are defined: 200 filesystem. As of kernel 2.6.22, the following members are defined:
201 201
202 struct super_operations { 202 struct super_operations {
203 struct inode *(*alloc_inode)(struct super_block *sb); 203 struct inode *(*alloc_inode)(struct super_block *sb);
204 void (*destroy_inode)(struct inode *); 204 void (*destroy_inode)(struct inode *);
205 205
206 void (*read_inode) (struct inode *);
207
208 void (*dirty_inode) (struct inode *); 206 void (*dirty_inode) (struct inode *);
209 int (*write_inode) (struct inode *, int); 207 int (*write_inode) (struct inode *, int);
210 void (*put_inode) (struct inode *); 208 void (*put_inode) (struct inode *);
211 void (*drop_inode) (struct inode *); 209 void (*drop_inode) (struct inode *);
212 void (*delete_inode) (struct inode *); 210 void (*delete_inode) (struct inode *);
213 void (*put_super) (struct super_block *); 211 void (*put_super) (struct super_block *);
214 void (*write_super) (struct super_block *); 212 void (*write_super) (struct super_block *);
215 int (*sync_fs)(struct super_block *sb, int wait); 213 int (*sync_fs)(struct super_block *sb, int wait);
216 void (*write_super_lockfs) (struct super_block *); 214 void (*write_super_lockfs) (struct super_block *);
217 void (*unlockfs) (struct super_block *); 215 void (*unlockfs) (struct super_block *);
218 int (*statfs) (struct dentry *, struct kstatfs *); 216 int (*statfs) (struct dentry *, struct kstatfs *);
219 int (*remount_fs) (struct super_block *, int *, char *); 217 int (*remount_fs) (struct super_block *, int *, char *);
220 void (*clear_inode) (struct inode *); 218 void (*clear_inode) (struct inode *);
221 void (*umount_begin) (struct super_block *); 219 void (*umount_begin) (struct super_block *);
222 220
223 int (*show_options)(struct seq_file *, struct vfsmount *); 221 int (*show_options)(struct seq_file *, struct vfsmount *);
224 222
225 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 223 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
226 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 224 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
227 }; 225 };
228 226
229 All methods are called without any locks being held, unless otherwise 227 All methods are called without any locks being held, unless otherwise
230 noted. This means that most methods can block safely. All methods are 228 noted. This means that most methods can block safely. All methods are
231 only called from a process context (i.e. not from an interrupt handler 229 only called from a process context (i.e. not from an interrupt handler
232 or bottom half). 230 or bottom half).
233 231
234 alloc_inode: this method is called by alloc_inode() to allocate memory 232 alloc_inode: this method is called by alloc_inode() to allocate memory
235 for struct inode and initialize it. If this function is not 233 for struct inode and initialize it. If this function is not
236 defined, a simple 'struct inode' is allocated. Normally 234 defined, a simple 'struct inode' is allocated. Normally
237 alloc_inode will be used to allocate a larger structure which 235 alloc_inode will be used to allocate a larger structure which
238 contains a 'struct inode' embedded within it. 236 contains a 'struct inode' embedded within it.
239 237
240 destroy_inode: this method is called by destroy_inode() to release 238 destroy_inode: this method is called by destroy_inode() to release
241 resources allocated for struct inode. It is only required if 239 resources allocated for struct inode. It is only required if
242 ->alloc_inode was defined and simply undoes anything done by 240 ->alloc_inode was defined and simply undoes anything done by
243 ->alloc_inode. 241 ->alloc_inode.
244 242
245 read_inode: this method is called to read a specific inode from the
246 mounted filesystem. The i_ino member in the struct inode is
247 initialized by the VFS to indicate which inode to read. Other
248 members are filled in by this method.
249
250 You can set this to NULL and use iget5_locked() instead of iget()
251 to read inodes. This is necessary for filesystems for which the
252 inode number is not sufficient to identify an inode.
253
254 dirty_inode: this method is called by the VFS to mark an inode dirty. 243 dirty_inode: this method is called by the VFS to mark an inode dirty.
255 244
256 write_inode: this method is called when the VFS needs to write an 245 write_inode: this method is called when the VFS needs to write an
257 inode to disc. The second parameter indicates whether the write 246 inode to disc. The second parameter indicates whether the write
258 should be synchronous or not, not all filesystems check this flag. 247 should be synchronous or not, not all filesystems check this flag.
259 248
260 put_inode: called when the VFS inode is removed from the inode 249 put_inode: called when the VFS inode is removed from the inode
261 cache. 250 cache.
262 251
263 drop_inode: called when the last access to the inode is dropped, 252 drop_inode: called when the last access to the inode is dropped,
264 with the inode_lock spinlock held. 253 with the inode_lock spinlock held.
265 254
266 This method should be either NULL (normal UNIX filesystem 255 This method should be either NULL (normal UNIX filesystem
267 semantics) or "generic_delete_inode" (for filesystems that do not 256 semantics) or "generic_delete_inode" (for filesystems that do not
268 want to cache inodes - causing "delete_inode" to always be 257 want to cache inodes - causing "delete_inode" to always be
269 called regardless of the value of i_nlink) 258 called regardless of the value of i_nlink)
270 259
271 The "generic_delete_inode()" behavior is equivalent to the 260 The "generic_delete_inode()" behavior is equivalent to the
272 old practice of using "force_delete" in the put_inode() case, 261 old practice of using "force_delete" in the put_inode() case,
273 but does not have the races that the "force_delete()" approach 262 but does not have the races that the "force_delete()" approach
274 had. 263 had.
275 264
276 delete_inode: called when the VFS wants to delete an inode 265 delete_inode: called when the VFS wants to delete an inode
277 266
278 put_super: called when the VFS wishes to free the superblock 267 put_super: called when the VFS wishes to free the superblock
279 (i.e. unmount). This is called with the superblock lock held 268 (i.e. unmount). This is called with the superblock lock held
280 269
281 write_super: called when the VFS superblock needs to be written to 270 write_super: called when the VFS superblock needs to be written to
282 disc. This method is optional 271 disc. This method is optional
283 272
284 sync_fs: called when VFS is writing out all dirty data associated with 273 sync_fs: called when VFS is writing out all dirty data associated with
285 a superblock. The second parameter indicates whether the method 274 a superblock. The second parameter indicates whether the method
286 should wait until the write out has been completed. Optional. 275 should wait until the write out has been completed. Optional.
287 276
288 write_super_lockfs: called when VFS is locking a filesystem and 277 write_super_lockfs: called when VFS is locking a filesystem and
289 forcing it into a consistent state. This method is currently 278 forcing it into a consistent state. This method is currently
290 used by the Logical Volume Manager (LVM). 279 used by the Logical Volume Manager (LVM).
291 280
292 unlockfs: called when VFS is unlocking a filesystem and making it writable 281 unlockfs: called when VFS is unlocking a filesystem and making it writable
293 again. 282 again.
294 283
295 statfs: called when the VFS needs to get filesystem statistics. This 284 statfs: called when the VFS needs to get filesystem statistics. This
296 is called with the kernel lock held 285 is called with the kernel lock held
297 286
298 remount_fs: called when the filesystem is remounted. This is called 287 remount_fs: called when the filesystem is remounted. This is called
299 with the kernel lock held 288 with the kernel lock held
300 289
301 clear_inode: called when the VFS clears the inode. Optional 290 clear_inode: called when the VFS clears the inode. Optional
302 291
303 umount_begin: called when the VFS is unmounting a filesystem. 292 umount_begin: called when the VFS is unmounting a filesystem.
304 293
305 show_options: called by the VFS to show mount options for /proc/<pid>/mounts. 294 show_options: called by the VFS to show mount options for /proc/<pid>/mounts.
306 295
307 quota_read: called by the VFS to read from filesystem quota file. 296 quota_read: called by the VFS to read from filesystem quota file.
308 297
309 quota_write: called by the VFS to write to filesystem quota file. 298 quota_write: called by the VFS to write to filesystem quota file.
310 299
311 The read_inode() method is responsible for filling in the "i_op" 300 Whoever sets up the inode is responsible for filling in the "i_op" field. This
312 field. This is a pointer to a "struct inode_operations" which 301 is a pointer to a "struct inode_operations" which describes the methods that
313 describes the methods that can be performed on individual inodes. 302 can be performed on individual inodes.
314 303
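To illustrate the alloc_inode()/destroy_inode() pairing described above, a
sketch of the usual embedded-inode pattern; the thingyfs structure and inode
cache are hypothetical:

	struct thingyfs_inode_info {
		unsigned long	i_flags;	/* fs-private state */
		struct inode	vfs_inode;	/* embedded VFS inode */
	};

	static struct inode *thingyfs_alloc_inode(struct super_block *sb)
	{
		struct thingyfs_inode_info *ei;

		ei = kmem_cache_alloc(thingyfs_inode_cachep, GFP_KERNEL);
		if (!ei)
			return NULL;
		return &ei->vfs_inode;
	}

	static void thingyfs_destroy_inode(struct inode *inode)
	{
		struct thingyfs_inode_info *ei =
			container_of(inode, struct thingyfs_inode_info, vfs_inode);

		kmem_cache_free(thingyfs_inode_cachep, ei);
	}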
315 304
316 The Inode Object 305 The Inode Object
317 ================ 306 ================
318 307
319 An inode object represents an object within the filesystem. 308 An inode object represents an object within the filesystem.
320 309
321 310
322 struct inode_operations 311 struct inode_operations
323 ----------------------- 312 -----------------------
324 313
325 This describes how the VFS can manipulate an inode in your 314 This describes how the VFS can manipulate an inode in your
326 filesystem. As of kernel 2.6.22, the following members are defined: 315 filesystem. As of kernel 2.6.22, the following members are defined:
327 316
328 struct inode_operations { 317 struct inode_operations {
329 int (*create) (struct inode *,struct dentry *,int, struct nameidata *); 318 int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
330 struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); 319 struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
331 int (*link) (struct dentry *,struct inode *,struct dentry *); 320 int (*link) (struct dentry *,struct inode *,struct dentry *);
332 int (*unlink) (struct inode *,struct dentry *); 321 int (*unlink) (struct inode *,struct dentry *);
333 int (*symlink) (struct inode *,struct dentry *,const char *); 322 int (*symlink) (struct inode *,struct dentry *,const char *);
334 int (*mkdir) (struct inode *,struct dentry *,int); 323 int (*mkdir) (struct inode *,struct dentry *,int);
335 int (*rmdir) (struct inode *,struct dentry *); 324 int (*rmdir) (struct inode *,struct dentry *);
336 int (*mknod) (struct inode *,struct dentry *,int,dev_t); 325 int (*mknod) (struct inode *,struct dentry *,int,dev_t);
337 int (*rename) (struct inode *, struct dentry *, 326 int (*rename) (struct inode *, struct dentry *,
338 struct inode *, struct dentry *); 327 struct inode *, struct dentry *);
339 int (*readlink) (struct dentry *, char __user *,int); 328 int (*readlink) (struct dentry *, char __user *,int);
340 void * (*follow_link) (struct dentry *, struct nameidata *); 329 void * (*follow_link) (struct dentry *, struct nameidata *);
341 void (*put_link) (struct dentry *, struct nameidata *, void *); 330 void (*put_link) (struct dentry *, struct nameidata *, void *);
342 void (*truncate) (struct inode *); 331 void (*truncate) (struct inode *);
343 int (*permission) (struct inode *, int, struct nameidata *); 332 int (*permission) (struct inode *, int, struct nameidata *);
344 int (*setattr) (struct dentry *, struct iattr *); 333 int (*setattr) (struct dentry *, struct iattr *);
345 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); 334 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
346 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 335 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
347 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); 336 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
348 ssize_t (*listxattr) (struct dentry *, char *, size_t); 337 ssize_t (*listxattr) (struct dentry *, char *, size_t);
349 int (*removexattr) (struct dentry *, const char *); 338 int (*removexattr) (struct dentry *, const char *);
350 void (*truncate_range)(struct inode *, loff_t, loff_t); 339 void (*truncate_range)(struct inode *, loff_t, loff_t);
351 }; 340 };
352 341
353 Again, all methods are called without any locks being held, unless 342 Again, all methods are called without any locks being held, unless
354 otherwise noted. 343 otherwise noted.
355 344
356 create: called by the open(2) and creat(2) system calls. Only 345 create: called by the open(2) and creat(2) system calls. Only
357 required if you want to support regular files. The dentry you 346 required if you want to support regular files. The dentry you
358 get should not have an inode (i.e. it should be a negative 347 get should not have an inode (i.e. it should be a negative
359 dentry). Here you will probably call d_instantiate() with the 348 dentry). Here you will probably call d_instantiate() with the
360 dentry and the newly created inode 349 dentry and the newly created inode
361 350
362 lookup: called when the VFS needs to look up an inode in a parent 351 lookup: called when the VFS needs to look up an inode in a parent
363 directory. The name to look for is found in the dentry. This 352 directory. The name to look for is found in the dentry. This
364 method must call d_add() to insert the found inode into the 353 method must call d_add() to insert the found inode into the
365 dentry. The "i_count" field in the inode structure should be 354 dentry. The "i_count" field in the inode structure should be
366 incremented. If the named inode does not exist a NULL inode 355 incremented. If the named inode does not exist a NULL inode
367 should be inserted into the dentry (this is called a negative 356 should be inserted into the dentry (this is called a negative
368 dentry). Returning an error code from this routine must only 357 dentry). Returning an error code from this routine must only
369 be done on a real error, otherwise creating inodes with system 358 be done on a real error, otherwise creating inodes with system
370 calls like create(2), mknod(2), mkdir(2) and so on will fail. 359 calls like create(2), mknod(2), mkdir(2) and so on will fail.
371 If you wish to overload the dentry methods then you should 360 If you wish to overload the dentry methods then you should
372 initialise the "d_op" field in the dentry; this is a pointer 361 initialise the "d_op" field in the dentry; this is a pointer
373 to a struct "dentry_operations". 362 to a struct "dentry_operations".
374 This method is called with the directory inode semaphore held 363 This method is called with the directory inode semaphore held
375 364
376 link: called by the link(2) system call. Only required if you want 365 link: called by the link(2) system call. Only required if you want
377 to support hard links. You will probably need to call 366 to support hard links. You will probably need to call
378 d_instantiate() just as you would in the create() method 367 d_instantiate() just as you would in the create() method
379 368
380 unlink: called by the unlink(2) system call. Only required if you 369 unlink: called by the unlink(2) system call. Only required if you
381 want to support deleting inodes 370 want to support deleting inodes
382 371
383 symlink: called by the symlink(2) system call. Only required if you 372 symlink: called by the symlink(2) system call. Only required if you
384 want to support symlinks. You will probably need to call 373 want to support symlinks. You will probably need to call
385 d_instantiate() just as you would in the create() method 374 d_instantiate() just as you would in the create() method
386 375
387 mkdir: called by the mkdir(2) system call. Only required if you want 376 mkdir: called by the mkdir(2) system call. Only required if you want
388 to support creating subdirectories. You will probably need to 377 to support creating subdirectories. You will probably need to
389 call d_instantiate() just as you would in the create() method 378 call d_instantiate() just as you would in the create() method
390 379
391 rmdir: called by the rmdir(2) system call. Only required if you want 380 rmdir: called by the rmdir(2) system call. Only required if you want
392 to support deleting subdirectories 381 to support deleting subdirectories
393 382
394 mknod: called by the mknod(2) system call to create a device (char, 383 mknod: called by the mknod(2) system call to create a device (char,
395 block) inode or a named pipe (FIFO) or socket. Only required 384 block) inode or a named pipe (FIFO) or socket. Only required
396 if you want to support creating these types of inodes. You 385 if you want to support creating these types of inodes. You
397 will probably need to call d_instantiate() just as you would 386 will probably need to call d_instantiate() just as you would
398 in the create() method 387 in the create() method
399 388
400 rename: called by the rename(2) system call to rename the object to 389 rename: called by the rename(2) system call to rename the object to
401 have the parent and name given by the second inode and dentry. 390 have the parent and name given by the second inode and dentry.
402 391
403 readlink: called by the readlink(2) system call. Only required if 392 readlink: called by the readlink(2) system call. Only required if
404 you want to support reading symbolic links 393 you want to support reading symbolic links
405 394
406 follow_link: called by the VFS to follow a symbolic link to the 395 follow_link: called by the VFS to follow a symbolic link to the
407 inode it points to. Only required if you want to support 396 inode it points to. Only required if you want to support
408 symbolic links. This method returns a void pointer cookie 397 symbolic links. This method returns a void pointer cookie
409 that is passed to put_link(). 398 that is passed to put_link().
410 399
411 put_link: called by the VFS to release resources allocated by 400 put_link: called by the VFS to release resources allocated by
412 follow_link(). The cookie returned by follow_link() is passed 401 follow_link(). The cookie returned by follow_link() is passed
413 to this method as the last parameter. It is used by 402 to this method as the last parameter. It is used by
414 filesystems such as NFS where page cache is not stable 403 filesystems such as NFS where page cache is not stable
415 (i.e. page that was installed when the symbolic link walk 404 (i.e. page that was installed when the symbolic link walk
416 started might not be in the page cache at the end of the 405 started might not be in the page cache at the end of the
417 walk). 406 walk).
418 407
419 truncate: called by the VFS to change the size of a file. The 408 truncate: called by the VFS to change the size of a file. The
420 i_size field of the inode is set to the desired size by the 409 i_size field of the inode is set to the desired size by the
421 VFS before this method is called. This method is called by 410 VFS before this method is called. This method is called by
422 the truncate(2) system call and related functionality. 411 the truncate(2) system call and related functionality.
423 412
424 permission: called by the VFS to check for access rights on a POSIX-like 413 permission: called by the VFS to check for access rights on a POSIX-like
425 filesystem. 414 filesystem.
426 415
427 setattr: called by the VFS to set attributes for a file. This method 416 setattr: called by the VFS to set attributes for a file. This method
428 is called by chmod(2) and related system calls. 417 is called by chmod(2) and related system calls.
429 418
430 getattr: called by the VFS to get attributes of a file. This method 419 getattr: called by the VFS to get attributes of a file. This method
431 is called by stat(2) and related system calls. 420 is called by stat(2) and related system calls.
432 421
433 setxattr: called by the VFS to set an extended attribute for a file. 422 setxattr: called by the VFS to set an extended attribute for a file.
434 Extended attribute is a name:value pair associated with an 423 Extended attribute is a name:value pair associated with an
435 inode. This method is called by setxattr(2) system call. 424 inode. This method is called by setxattr(2) system call.
436 425
437 getxattr: called by the VFS to retrieve the value of an extended 426 getxattr: called by the VFS to retrieve the value of an extended
438 attribute name. This method is called by getxattr(2) function 427 attribute name. This method is called by getxattr(2) function
439 call. 428 call.
440 429
441 listxattr: called by the VFS to list all extended attributes for a 430 listxattr: called by the VFS to list all extended attributes for a
442 given file. This method is called by listxattr(2) system call. 431 given file. This method is called by listxattr(2) system call.
443 432
444 removexattr: called by the VFS to remove an extended attribute from 433 removexattr: called by the VFS to remove an extended attribute from
445 a file. This method is called by removexattr(2) system call. 434 a file. This method is called by removexattr(2) system call.
446 435
447 truncate_range: a method provided by the underlying filesystem to truncate a 436 truncate_range: a method provided by the underlying filesystem to truncate a
448 range of blocks, i.e. punch a hole somewhere in a file. 437 range of blocks, i.e. punch a hole somewhere in a file.
449 438
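As a sketch of the lookup() behaviour described above (the thingyfs helpers
are hypothetical; thingyfs_iget() is assumed to return an already-referenced
inode or an ERR_PTR):

	static struct dentry *thingyfs_lookup(struct inode *dir,
					      struct dentry *dentry,
					      struct nameidata *nd)
	{
		struct inode *inode = NULL;
		unsigned long ino;

		ino = thingyfs_find_entry(dir, &dentry->d_name);	/* hypothetical */
		if (ino) {
			inode = thingyfs_iget(dir->i_sb, ino);		/* hypothetical */
			if (IS_ERR(inode))
				return ERR_PTR(PTR_ERR(inode));
		}
		d_add(dentry, inode);	/* a NULL inode makes this a negative dentry */
		return NULL;
	}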
450 439
451 The Address Space Object 440 The Address Space Object
452 ======================== 441 ========================
453 442
454 The address space object is used to group and manage pages in the page 443 The address space object is used to group and manage pages in the page
455 cache. It can be used to keep track of the pages in a file (or 444 cache. It can be used to keep track of the pages in a file (or
456 anything else) and also track the mapping of sections of the file into 445 anything else) and also track the mapping of sections of the file into
457 process address spaces. 446 process address spaces.
458 447
459 There are a number of distinct yet related services that an 448 There are a number of distinct yet related services that an
460 address-space can provide. These include communicating memory 449 address-space can provide. These include communicating memory
461 pressure, page lookup by address, and keeping track of pages tagged as 450 pressure, page lookup by address, and keeping track of pages tagged as
462 Dirty or Writeback. 451 Dirty or Writeback.
463 452
464 The first can be used independently of the others. The VM can try to 453 The first can be used independently of the others. The VM can try to
465 either write dirty pages in order to clean them, or release clean 454 either write dirty pages in order to clean them, or release clean
466 pages in order to reuse them. To do this it can call the ->writepage 455 pages in order to reuse them. To do this it can call the ->writepage
467 method on dirty pages, and ->releasepage on clean pages with 456 method on dirty pages, and ->releasepage on clean pages with
468 PagePrivate set. Clean pages without PagePrivate and with no external 457 PagePrivate set. Clean pages without PagePrivate and with no external
469 references will be released without notice being given to the 458 references will be released without notice being given to the
470 address_space. 459 address_space.
471 460
472 To achieve this functionality, pages need to be placed on an LRU with 461 To achieve this functionality, pages need to be placed on an LRU with
473 lru_cache_add and mark_page_accessed needs to be called whenever the 462 lru_cache_add and mark_page_accessed needs to be called whenever the
474 page is used. 463 page is used.
475 464
476 Pages are normally kept in a radix tree indexed by ->index. This tree 465 Pages are normally kept in a radix tree indexed by ->index. This tree
477 maintains information about the PG_Dirty and PG_Writeback status of 466 maintains information about the PG_Dirty and PG_Writeback status of
478 each page, so that pages with either of these flags can be found 467 each page, so that pages with either of these flags can be found
479 quickly. 468 quickly.
480 469
481 The Dirty tag is primarily used by mpage_writepages - the default 470 The Dirty tag is primarily used by mpage_writepages - the default
482 ->writepages method. It uses the tag to find dirty pages to call 471 ->writepages method. It uses the tag to find dirty pages to call
483 ->writepage on. If mpage_writepages is not used (i.e. the address_space 472 ->writepage on. If mpage_writepages is not used (i.e. the address_space
484 provides its own ->writepages), the PAGECACHE_TAG_DIRTY tag is 473 provides its own ->writepages), the PAGECACHE_TAG_DIRTY tag is
485 almost unused. write_inode_now and sync_inode do use it (through 474 almost unused. write_inode_now and sync_inode do use it (through
486 __sync_single_inode) to check if ->writepages has been successful in 475 __sync_single_inode) to check if ->writepages has been successful in
487 writing out the whole address_space. 476 writing out the whole address_space.
488 477
489 The Writeback tag is used by filemap*wait* and sync_page* functions, 478 The Writeback tag is used by filemap*wait* and sync_page* functions,
490 via wait_on_page_writeback_range, to wait for all writeback to 479 via wait_on_page_writeback_range, to wait for all writeback to
491 complete. While waiting, ->sync_page (if defined) will be called on 480 complete. While waiting, ->sync_page (if defined) will be called on
492 each page that is found to require writeback. 481 each page that is found to require writeback.
493 482
494 An address_space handler may attach extra information to a page, 483 An address_space handler may attach extra information to a page,
495 typically using the 'private' field in the 'struct page'. If such 484 typically using the 'private' field in the 'struct page'. If such
496 information is attached, the PG_Private flag should be set. This will 485 information is attached, the PG_Private flag should be set. This will
497 cause various VM routines to make extra calls into the address_space 486 cause various VM routines to make extra calls into the address_space
498 handler to deal with that data. 487 handler to deal with that data.
499 488
500 An address space acts as an intermediary between storage and 489 An address space acts as an intermediary between storage and
501 application. Data is read into the address space a whole page at a 490 application. Data is read into the address space a whole page at a
502 time, and provided to the application either by copying of the page, 491 time, and provided to the application either by copying of the page,
503 or by memory-mapping the page. 492 or by memory-mapping the page.
504 Data is written into the address space by the application, and then 493 Data is written into the address space by the application, and then
505 written back to storage, typically in whole pages; however, the 494 written back to storage, typically in whole pages; however, the
506 address_space has finer control of write sizes. 495 address_space has finer control of write sizes.
507 496
508 The read process essentially only requires 'readpage'. The write 497 The read process essentially only requires 'readpage'. The write
509 process is more complicated and uses prepare_write/commit_write or 498 process is more complicated and uses prepare_write/commit_write or
510 set_page_dirty to write data into the address_space, and writepage, 499 set_page_dirty to write data into the address_space, and writepage,
511 sync_page, and writepages to write back data to storage. 500 sync_page, and writepages to write back data to storage.
512 501
513 Adding and removing pages to/from an address_space is protected by the 502 Adding and removing pages to/from an address_space is protected by the
514 inode's i_mutex. 503 inode's i_mutex.
515 504
516 When data is written to a page, the PG_Dirty flag should be set. It 505 When data is written to a page, the PG_Dirty flag should be set. It
517 typically remains set until writepage asks for it to be written. This 506 typically remains set until writepage asks for it to be written. This
518 should clear PG_Dirty and set PG_Writeback. It can be actually 507 should clear PG_Dirty and set PG_Writeback. It can be actually
519 written at any point after PG_Dirty is clear. Once it is known to be 508 written at any point after PG_Dirty is clear. Once it is known to be
520 safe, PG_Writeback is cleared. 509 safe, PG_Writeback is cleared.
521 510
522 Writeback makes use of a writeback_control structure... 511 Writeback makes use of a writeback_control structure...
523 512
524 struct address_space_operations 513 struct address_space_operations
525 ------------------------------- 514 -------------------------------
526 515
527 This describes how the VFS can manipulate mapping of a file to page cache in 516 This describes how the VFS can manipulate mapping of a file to page cache in
528 your filesystem. As of kernel 2.6.22, the following members are defined: 517 your filesystem. As of kernel 2.6.22, the following members are defined:
529 518
530 struct address_space_operations { 519 struct address_space_operations {
531 int (*writepage)(struct page *page, struct writeback_control *wbc); 520 int (*writepage)(struct page *page, struct writeback_control *wbc);
532 int (*readpage)(struct file *, struct page *); 521 int (*readpage)(struct file *, struct page *);
533 int (*sync_page)(struct page *); 522 int (*sync_page)(struct page *);
534 int (*writepages)(struct address_space *, struct writeback_control *); 523 int (*writepages)(struct address_space *, struct writeback_control *);
535 int (*set_page_dirty)(struct page *page); 524 int (*set_page_dirty)(struct page *page);
536 int (*readpages)(struct file *filp, struct address_space *mapping, 525 int (*readpages)(struct file *filp, struct address_space *mapping,
537 struct list_head *pages, unsigned nr_pages); 526 struct list_head *pages, unsigned nr_pages);
538 int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); 527 int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
539 int (*commit_write)(struct file *, struct page *, unsigned, unsigned); 528 int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
540 int (*write_begin)(struct file *, struct address_space *mapping, 529 int (*write_begin)(struct file *, struct address_space *mapping,
541 loff_t pos, unsigned len, unsigned flags, 530 loff_t pos, unsigned len, unsigned flags,
542 struct page **pagep, void **fsdata); 531 struct page **pagep, void **fsdata);
543 int (*write_end)(struct file *, struct address_space *mapping, 532 int (*write_end)(struct file *, struct address_space *mapping,
544 loff_t pos, unsigned len, unsigned copied, 533 loff_t pos, unsigned len, unsigned copied,
545 struct page *page, void *fsdata); 534 struct page *page, void *fsdata);
546 sector_t (*bmap)(struct address_space *, sector_t); 535 sector_t (*bmap)(struct address_space *, sector_t);
547 int (*invalidatepage) (struct page *, unsigned long); 536 int (*invalidatepage) (struct page *, unsigned long);
548 int (*releasepage) (struct page *, int); 537 int (*releasepage) (struct page *, int);
549 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 538 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
550 loff_t offset, unsigned long nr_segs); 539 loff_t offset, unsigned long nr_segs);
551 struct page* (*get_xip_page)(struct address_space *, sector_t, 540 struct page* (*get_xip_page)(struct address_space *, sector_t,
552 int); 541 int);
553 /* migrate the contents of a page to the specified target */ 542 /* migrate the contents of a page to the specified target */
554 int (*migratepage) (struct page *, struct page *); 543 int (*migratepage) (struct page *, struct page *);
555 int (*launder_page) (struct page *); 544 int (*launder_page) (struct page *);
556 }; 545 };
557 546
558 writepage: called by the VM to write a dirty page to backing store. 547 writepage: called by the VM to write a dirty page to backing store.
559 This may happen for data integrity reasons (i.e. 'sync'), or 548 This may happen for data integrity reasons (i.e. 'sync'), or
560 to free up memory (flush). The difference can be seen in 549 to free up memory (flush). The difference can be seen in
561 wbc->sync_mode. 550 wbc->sync_mode.
562 The PG_Dirty flag has been cleared and PageLocked is true. 551 The PG_Dirty flag has been cleared and PageLocked is true.
563 writepage should start writeout, should set PG_Writeback, 552 writepage should start writeout, should set PG_Writeback,
564 and should make sure the page is unlocked, either synchronously 553 and should make sure the page is unlocked, either synchronously
565 or asynchronously when the write operation completes. 554 or asynchronously when the write operation completes.
566 555
567 If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to 556 If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to
568 try too hard if there are problems, and may choose to write out 557 try too hard if there are problems, and may choose to write out
569 other pages from the mapping if that is easier (e.g. due to 558 other pages from the mapping if that is easier (e.g. due to
570 internal dependencies). If it chooses not to start writeout, it 559 internal dependencies). If it chooses not to start writeout, it
571 should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep 560 should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep
572 calling ->writepage on that page. 561 calling ->writepage on that page.
573 562
574 See the file "Locking" for more details. 563 See the file "Locking" for more details.
575 564
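For a simple block-based filesystem, ->writepage can often be a thin wrapper around the generic buffer-layer helper. A minimal sketch, assuming a hypothetical "foofs" with its own foofs_get_block() block-mapping routine and <linux/buffer_head.h> included:

        static int foofs_writepage(struct page *page, struct writeback_control *wbc)
        {
                /* block_write_full_page() maps the page with foofs_get_block(),
                 * sets PG_Writeback, submits the I/O and unlocks the page. */
                return block_write_full_page(page, foofs_get_block, wbc);
        }
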
576 readpage: called by the VM to read a page from backing store. 565 readpage: called by the VM to read a page from backing store.
577 The page will be Locked when readpage is called, and should be 566 The page will be Locked when readpage is called, and should be
578 unlocked and marked uptodate once the read completes. 567 unlocked and marked uptodate once the read completes.
579 If ->readpage discovers that it needs to unlock the page for 568 If ->readpage discovers that it needs to unlock the page for
580 some reason, it can do so, and then return AOP_TRUNCATED_PAGE. 569 some reason, it can do so, and then return AOP_TRUNCATED_PAGE.
581 In this case, the page will be relocated, relocked and if 570 In this case, the page will be relocated, relocked and if
582 that all succeeds, ->readpage will be called again. 571 that all succeeds, ->readpage will be called again.
583 572
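Block-based filesystems can usually lean on the mpage helpers here. A sketch under the same hypothetical foofs_get_block() assumption (needs <linux/mpage.h>):

        static int foofs_readpage(struct file *file, struct page *page)
        {
                /* mpage_readpage() builds and submits a bio for the page; the
                 * completion path marks the page uptodate and unlocks it. */
                return mpage_readpage(page, foofs_get_block);
        }
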
584 sync_page: called by the VM to notify the backing store to perform all 573 sync_page: called by the VM to notify the backing store to perform all
585 queued I/O operations for a page. I/O operations for other pages 574 queued I/O operations for a page. I/O operations for other pages
586 associated with this address_space object may also be performed. 575 associated with this address_space object may also be performed.
587 576
588 This function is optional and is called only for pages with 577 This function is optional and is called only for pages with
589 PG_Writeback set while waiting for the writeback to complete. 578 PG_Writeback set while waiting for the writeback to complete.
590 579
591 writepages: called by the VM to write out pages associated with the 580 writepages: called by the VM to write out pages associated with the
592 address_space object. If wbc->sync_mode is WB_SYNC_ALL, then 581 address_space object. If wbc->sync_mode is WB_SYNC_ALL, then
593 the writeback_control will specify a range of pages that must be 582 the writeback_control will specify a range of pages that must be
594 written out. If it is WB_SYNC_NONE, then an nr_to_write is given 583 written out. If it is WB_SYNC_NONE, then an nr_to_write is given
595 and that many pages should be written if possible. 584 and that many pages should be written if possible.
596 If no ->writepages is given, then mpage_writepages is used 585 If no ->writepages is given, then mpage_writepages is used
597 instead. This will choose pages from the address space that are 586 instead. This will choose pages from the address space that are
598 tagged as DIRTY and will pass them to ->writepage. 587 tagged as DIRTY and will pass them to ->writepage.
599 588
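Again, a block-based filesystem will often hand the whole job to the mpage helper; a sketch with the hypothetical foofs_get_block():

        static int foofs_writepages(struct address_space *mapping,
                                    struct writeback_control *wbc)
        {
                /* Walks the dirty pages selected by wbc and writes them out
                 * in as few bios as possible. */
                return mpage_writepages(mapping, wbc, foofs_get_block);
        }
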
600 set_page_dirty: called by the VM to set a page dirty. 589 set_page_dirty: called by the VM to set a page dirty.
601 This is particularly needed if an address space attaches 590 This is particularly needed if an address space attaches
602 private data to a page, and that data needs to be updated when 591 private data to a page, and that data needs to be updated when
603 a page is dirtied. This is called, for example, when a memory 592 a page is dirtied. This is called, for example, when a memory
604 mapped page gets modified. 593 mapped page gets modified.
605 If defined, it should set the PageDirty flag, and the 594 If defined, it should set the PageDirty flag, and the
606 PAGECACHE_TAG_DIRTY tag in the radix tree. 595 PAGECACHE_TAG_DIRTY tag in the radix tree.
607 596
608 readpages: called by the VM to read pages associated with the address_space 597 readpages: called by the VM to read pages associated with the address_space
609 object. This is essentially just a vector version of 598 object. This is essentially just a vector version of
610 readpage. Instead of just one page, several pages are 599 readpage. Instead of just one page, several pages are
611 requested. 600 requested.
612 readpages is only used for read-ahead, so read errors are 601 readpages is only used for read-ahead, so read errors are
613 ignored. If anything goes wrong, feel free to give up. 602 ignored. If anything goes wrong, feel free to give up.
614 603
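A typical block-based implementation, again hypothetical, simply forwards to mpage_readpages():

        static int foofs_readpages(struct file *file, struct address_space *mapping,
                                   struct list_head *pages, unsigned nr_pages)
        {
                /* Read-ahead only: any error here is silently ignored. */
                return mpage_readpages(mapping, pages, nr_pages, foofs_get_block);
        }
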
615 prepare_write: called by the generic write path in VM to set up a write 604 prepare_write: called by the generic write path in VM to set up a write
616 request for a page. This indicates to the address space that 605 request for a page. This indicates to the address space that
617 the given range of bytes is about to be written. The 606 the given range of bytes is about to be written. The
618 address_space should check that the write will be able to 607 address_space should check that the write will be able to
619 complete, by allocating space if necessary and doing any other 608 complete, by allocating space if necessary and doing any other
620 internal housekeeping. If the write will update parts of 609 internal housekeeping. If the write will update parts of
621 any basic-blocks on storage, then those blocks should be 610 any basic-blocks on storage, then those blocks should be
622 pre-read (if they haven't been read already) so that the 611 pre-read (if they haven't been read already) so that the
623 updated blocks can be written out properly. 612 updated blocks can be written out properly.
624 The page will be locked. 613 The page will be locked.
625 614
626 Note: the page _must not_ be marked uptodate in this function 615 Note: the page _must not_ be marked uptodate in this function
627 (or anywhere else) unless it actually is uptodate right now. As 616 (or anywhere else) unless it actually is uptodate right now. As
628 soon as a page is marked uptodate, it is possible for a concurrent 617 soon as a page is marked uptodate, it is possible for a concurrent
629 read(2) to copy it to userspace. 618 read(2) to copy it to userspace.
630 619
631 commit_write: If prepare_write succeeds, new data will be copied 620 commit_write: If prepare_write succeeds, new data will be copied
632 into the page and then commit_write will be called. It will 621 into the page and then commit_write will be called. It will
633 typically update the size of the file (if appropriate) and 622 typically update the size of the file (if appropriate) and
634 mark the inode as dirty, and do any other related housekeeping 623 mark the inode as dirty, and do any other related housekeeping
635 operations. It should avoid returning an error if possible - 624 operations. It should avoid returning an error if possible -
636 errors should have been handled by prepare_write. 625 errors should have been handled by prepare_write.
637 626
638 write_begin: This is intended as a replacement for prepare_write. The 627 write_begin: This is intended as a replacement for prepare_write. The
639 key differences are that: 628 key differences are that:
640 - it returns a locked page (in *pagep) rather than being 629 - it returns a locked page (in *pagep) rather than being
641 given a pre-locked page; 630 given a pre-locked page;
642 - it must be able to cope with short writes (where the 631 - it must be able to cope with short writes (where the
643 length passed to write_begin is greater than the number 632 length passed to write_begin is greater than the number
644 of bytes copied into the page). 633 of bytes copied into the page).
645 634
646 Called by the generic buffered write code to ask the filesystem to 635 Called by the generic buffered write code to ask the filesystem to
647 prepare to write len bytes at the given offset in the file. The 636 prepare to write len bytes at the given offset in the file. The
648 address_space should check that the write will be able to complete, 637 address_space should check that the write will be able to complete,
649 by allocating space if necessary and doing any other internal 638 by allocating space if necessary and doing any other internal
650 housekeeping. If the write will update parts of any basic-blocks on 639 housekeeping. If the write will update parts of any basic-blocks on
651 storage, then those blocks should be pre-read (if they haven't been 640 storage, then those blocks should be pre-read (if they haven't been
652 read already) so that the updated blocks can be written out properly. 641 read already) so that the updated blocks can be written out properly.
653 642
654 The filesystem must return the locked pagecache page for the specified 643 The filesystem must return the locked pagecache page for the specified
655 offset, in *pagep, for the caller to write into. 644 offset, in *pagep, for the caller to write into.
656 645
657 flags is a field for AOP_FLAG_xxx flags, described in 646 flags is a field for AOP_FLAG_xxx flags, described in
658 include/linux/fs.h. 647 include/linux/fs.h.
659 648
660 A void * may be returned in fsdata, which then gets passed into 649 A void * may be returned in fsdata, which then gets passed into
661 write_end. 650 write_end.
662 651
663 Returns 0 on success; < 0 on failure (which is the error code), in 652 Returns 0 on success; < 0 on failure (which is the error code), in
664 which case write_end is not called. 653 which case write_end is not called.
665 654
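For filesystems built on the buffer layer, write_begin is commonly a wrapper around block_write_begin(), which finds and locks the page, instantiates buffers via the get_block callback and pre-reads any partially overwritten blocks. A sketch, again assuming the hypothetical foofs_get_block():

        static int foofs_write_begin(struct file *file, struct address_space *mapping,
                                     loff_t pos, unsigned len, unsigned flags,
                                     struct page **pagep, void **fsdata)
        {
                *pagep = NULL;  /* let block_write_begin() find and lock the page */
                return block_write_begin(file, mapping, pos, len, flags,
                                         pagep, fsdata, foofs_get_block);
        }
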
666 write_end: After a successful write_begin, and data copy, write_end must 655 write_end: After a successful write_begin, and data copy, write_end must
667 be called. len is the original len passed to write_begin, and copied 656 be called. len is the original len passed to write_begin, and copied
668 is the amount that was able to be copied (copied == len is always true 657 is the amount that was able to be copied (copied == len is always true
669 if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag). 658 if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).
670 659
671 The filesystem must take care of unlocking the page and releasing its 660 The filesystem must take care of unlocking the page and releasing its
672 refcount, and updating i_size. 661 refcount, and updating i_size.
673 662
674 Returns < 0 on failure, otherwise the number of bytes (<= 'copied') 663 Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
675 that were able to be copied into pagecache. 664 that were able to be copied into pagecache.
676 665
677 bmap: called by the VFS to map a logical block offset within object to 666 bmap: called by the VFS to map a logical block offset within object to
678 physical block number. This method is used by the FIBMAP 667 physical block number. This method is used by the FIBMAP
679 ioctl and for working with swap-files. To be able to swap to 668 ioctl and for working with swap-files. To be able to swap to
680 a file, the file must have a stable mapping to a block 669 a file, the file must have a stable mapping to a block
681 device. The swap system does not go through the filesystem 670 device. The swap system does not go through the filesystem
682 but instead uses bmap to find out where the blocks in the file 671 but instead uses bmap to find out where the blocks in the file
683 are and uses those addresses directly. 672 are and uses those addresses directly.
684 673
685 674
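A filesystem using the buffer-layer helpers throughout can wire most of these methods straight to the generic implementations. A hedged sketch of how the hypothetical foofs might assemble its table, using generic_write_end() for write_end and generic_block_bmap() for bmap:

        static sector_t foofs_bmap(struct address_space *mapping, sector_t block)
        {
                return generic_block_bmap(mapping, block, foofs_get_block);
        }

        static const struct address_space_operations foofs_aops = {
                .readpage       = foofs_readpage,
                .readpages      = foofs_readpages,
                .writepage      = foofs_writepage,
                .writepages     = foofs_writepages,
                .write_begin    = foofs_write_begin,
                .write_end      = generic_write_end,
                .bmap           = foofs_bmap,
        };
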
686 invalidatepage: If a page has PagePrivate set, then invalidatepage 675 invalidatepage: If a page has PagePrivate set, then invalidatepage
687 will be called when part or all of the page is to be removed 676 will be called when part or all of the page is to be removed
688 from the address space. This generally corresponds to either a 677 from the address space. This generally corresponds to either a
689 truncation or a complete invalidation of the address space 678 truncation or a complete invalidation of the address space
690 (in the latter case 'offset' will always be 0). 679 (in the latter case 'offset' will always be 0).
691 Any private data associated with the page should be updated 680 Any private data associated with the page should be updated
692 to reflect this truncation. If offset is 0, then 681 to reflect this truncation. If offset is 0, then
693 the private data should be released, because the page 682 the private data should be released, because the page
694 must be able to be completely discarded. This may be done by 683 must be able to be completely discarded. This may be done by
695 calling the ->releasepage function, but in this case the 684 calling the ->releasepage function, but in this case the
696 release MUST succeed. 685 release MUST succeed.
697 686
698 releasepage: releasepage is called on PagePrivate pages to indicate 687 releasepage: releasepage is called on PagePrivate pages to indicate
699 that the page should be freed if possible. ->releasepage 688 that the page should be freed if possible. ->releasepage
700 should remove any private data from the page and clear the 689 should remove any private data from the page and clear the
701 PagePrivate flag. It may also remove the page from the 690 PagePrivate flag. It may also remove the page from the
702 address_space. If this fails for some reason, it may indicate 691 address_space. If this fails for some reason, it may indicate
703 failure with a 0 return value. 692 failure with a 0 return value.
704 This is used in two distinct though related cases. The first 693 This is used in two distinct though related cases. The first
705 is when the VM finds a clean page with no active users and 694 is when the VM finds a clean page with no active users and
706 wants to make it a free page. If ->releasepage succeeds, the 695 wants to make it a free page. If ->releasepage succeeds, the
707 page will be removed from the address_space and become free. 696 page will be removed from the address_space and become free.
708 697
709 The second case is when a request has been made to invalidate 698 The second case is when a request has been made to invalidate
710 some or all pages in an address_space. This can happen 699 some or all pages in an address_space. This can happen
711 through the fadvise(POSIX_FADV_DONTNEED) system call or by the 700 through the fadvise(POSIX_FADV_DONTNEED) system call or by the
712 filesystem explicitly requesting it as nfs and 9p do (when 701 filesystem explicitly requesting it as nfs and 9p do (when
713 they believe the cache may be out of date with storage) by 702 they believe the cache may be out of date with storage) by
714 calling invalidate_inode_pages2(). 703 calling invalidate_inode_pages2().
715 If the filesystem makes such a call, and needs to be certain 704 If the filesystem makes such a call, and needs to be certain
716 that all pages are invalidated, then its releasepage will 705 that all pages are invalidated, then its releasepage will
717 need to ensure this. Possibly it can clear the PageUptodate 706 need to ensure this. Possibly it can clear the PageUptodate
718 bit if it cannot free private data yet. 707 bit if it cannot free private data yet.
719 708
720 direct_IO: called by the generic read/write routines to perform 709 direct_IO: called by the generic read/write routines to perform
721 direct_IO - that is IO requests which bypass the page cache 710 direct_IO - that is IO requests which bypass the page cache
722 and transfer data directly between the storage and the 711 and transfer data directly between the storage and the
723 application's address space. 712 application's address space.
724 713
725 get_xip_page: called by the VM to translate a block number to a page. 714 get_xip_page: called by the VM to translate a block number to a page.
726 The page is valid until the corresponding filesystem is unmounted. 715 The page is valid until the corresponding filesystem is unmounted.
727 Filesystems that want to use execute-in-place (XIP) need to implement 716 Filesystems that want to use execute-in-place (XIP) need to implement
728 it. An example implementation can be found in fs/ext2/xip.c. 717 it. An example implementation can be found in fs/ext2/xip.c.
729 718
730 migrate_page: This is used to compact the physical memory usage. 719 migrate_page: This is used to compact the physical memory usage.
731 If the VM wants to relocate a page (maybe off a memory card 720 If the VM wants to relocate a page (maybe off a memory card
732 that is signalling imminent failure) it will pass a new page 721 that is signalling imminent failure) it will pass a new page
733 and an old page to this function. migrate_page should 722 and an old page to this function. migrate_page should
734 transfer any private data across and update any references 723 transfer any private data across and update any references
735 that it has to the page. 724 that it has to the page.
736 725
737 launder_page: Called before freeing a page - it writes back the dirty page. To 726 launder_page: Called before freeing a page - it writes back the dirty page. To
738 prevent redirtying the page, it is kept locked during the whole 727 prevent redirtying the page, it is kept locked during the whole
739 operation. 728 operation.
740 729
741 The File Object 730 The File Object
742 =============== 731 ===============
743 732
744 A file object represents a file opened by a process. 733 A file object represents a file opened by a process.
745 734
746 735
747 struct file_operations 736 struct file_operations
748 ---------------------- 737 ----------------------
749 738
750 This describes how the VFS can manipulate an open file. As of kernel 739 This describes how the VFS can manipulate an open file. As of kernel
751 2.6.22, the following members are defined: 740 2.6.22, the following members are defined:
752 741
753 struct file_operations { 742 struct file_operations {
754 struct module *owner; 743 struct module *owner;
755 loff_t (*llseek) (struct file *, loff_t, int); 744 loff_t (*llseek) (struct file *, loff_t, int);
756 ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); 745 ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
757 ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); 746 ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
758 ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); 747 ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
759 ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); 748 ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
760 int (*readdir) (struct file *, void *, filldir_t); 749 int (*readdir) (struct file *, void *, filldir_t);
761 unsigned int (*poll) (struct file *, struct poll_table_struct *); 750 unsigned int (*poll) (struct file *, struct poll_table_struct *);
762 int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); 751 int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
763 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 752 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
764 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 753 long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
765 int (*mmap) (struct file *, struct vm_area_struct *); 754 int (*mmap) (struct file *, struct vm_area_struct *);
766 int (*open) (struct inode *, struct file *); 755 int (*open) (struct inode *, struct file *);
767 int (*flush) (struct file *); 756 int (*flush) (struct file *);
768 int (*release) (struct inode *, struct file *); 757 int (*release) (struct inode *, struct file *);
769 int (*fsync) (struct file *, struct dentry *, int datasync); 758 int (*fsync) (struct file *, struct dentry *, int datasync);
770 int (*aio_fsync) (struct kiocb *, int datasync); 759 int (*aio_fsync) (struct kiocb *, int datasync);
771 int (*fasync) (int, struct file *, int); 760 int (*fasync) (int, struct file *, int);
772 int (*lock) (struct file *, int, struct file_lock *); 761 int (*lock) (struct file *, int, struct file_lock *);
773 ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); 762 ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
774 ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); 763 ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
775 ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *); 764 ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
776 ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); 765 ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
777 unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 766 unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
778 int (*check_flags)(int); 767 int (*check_flags)(int);
779 int (*dir_notify)(struct file *filp, unsigned long arg); 768 int (*dir_notify)(struct file *filp, unsigned long arg);
780 int (*flock) (struct file *, int, struct file_lock *); 769 int (*flock) (struct file *, int, struct file_lock *);
781 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int); 770 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
782 ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); 771 ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
783 }; 772 };
784 773
785 Again, all methods are called without any locks being held, unless 774 Again, all methods are called without any locks being held, unless
786 otherwise noted. 775 otherwise noted.
787 776
788 llseek: called when the VFS needs to move the file position index 777 llseek: called when the VFS needs to move the file position index
789 778
790 read: called by read(2) and related system calls 779 read: called by read(2) and related system calls
791 780
792 aio_read: called by io_submit(2) and other asynchronous I/O operations 781 aio_read: called by io_submit(2) and other asynchronous I/O operations
793 782
794 write: called by write(2) and related system calls 783 write: called by write(2) and related system calls
795 784
796 aio_write: called by io_submit(2) and other asynchronous I/O operations 785 aio_write: called by io_submit(2) and other asynchronous I/O operations
797 786
798 readdir: called when the VFS needs to read the directory contents 787 readdir: called when the VFS needs to read the directory contents
799 788
800 poll: called by the VFS when a process wants to check if there is 789 poll: called by the VFS when a process wants to check if there is
801 activity on this file and (optionally) go to sleep until there 790 activity on this file and (optionally) go to sleep until there
802 is activity. Called by the select(2) and poll(2) system calls 791 is activity. Called by the select(2) and poll(2) system calls
803 792
804 ioctl: called by the ioctl(2) system call 793 ioctl: called by the ioctl(2) system call
805 794
806 unlocked_ioctl: called by the ioctl(2) system call. Filesystems that do not 795 unlocked_ioctl: called by the ioctl(2) system call. Filesystems that do not
807 require the BKL should use this method instead of the ioctl() above. 796 require the BKL should use this method instead of the ioctl() above.
808 797
809 compat_ioctl: called by the ioctl(2) system call when 32 bit system calls 798 compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
810 are used on 64 bit kernels. 799 are used on 64 bit kernels.
811 800
812 mmap: called by the mmap(2) system call 801 mmap: called by the mmap(2) system call
813 802
814 open: called by the VFS when an inode should be opened. When the VFS 803 open: called by the VFS when an inode should be opened. When the VFS
815 opens a file, it creates a new "struct file". It then calls the 804 opens a file, it creates a new "struct file". It then calls the
816 open method for the newly allocated file structure. You might 805 open method for the newly allocated file structure. You might
817 think that the open method really belongs in 806 think that the open method really belongs in
818 "struct inode_operations", and you may be right. I think it's 807 "struct inode_operations", and you may be right. I think it's
819 done the way it is because it makes filesystems simpler to 808 done the way it is because it makes filesystems simpler to
820 implement. The open() method is a good place to initialize the 809 implement. The open() method is a good place to initialize the
821 "private_data" member in the file structure if you want to point 810 "private_data" member in the file structure if you want to point
822 to a device structure. 811 to a device structure.
823 812
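A common pattern for a character device driver, sketched here with hypothetical names, is to recover the driver's per-device structure from the inode and stash it in private_data for the later methods to use:

        static int foo_open(struct inode *inode, struct file *file)
        {
                /* assumes a hypothetical foo_device that embeds a struct cdev
                 * member called 'cdev' */
                struct foo_device *dev;

                dev = container_of(inode->i_cdev, struct foo_device, cdev);
                file->private_data = dev;   /* read/write/ioctl can now find it */
                return 0;
        }
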
824 flush: called by the close(2) system call to flush a file 813 flush: called by the close(2) system call to flush a file
825 814
826 release: called when the last reference to an open file is closed 815 release: called when the last reference to an open file is closed
827 816
828 fsync: called by the fsync(2) system call 817 fsync: called by the fsync(2) system call
829 818
830 fasync: called by the fcntl(2) system call when asynchronous 819 fasync: called by the fcntl(2) system call when asynchronous
831 (non-blocking) mode is enabled for a file 820 (non-blocking) mode is enabled for a file
832 821
833 lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW 822 lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW
834 commands 823 commands
835 824
836 readv: called by the readv(2) system call 825 readv: called by the readv(2) system call
837 826
838 writev: called by the writev(2) system call 827 writev: called by the writev(2) system call
839 828
840 sendfile: called by the sendfile(2) system call 829 sendfile: called by the sendfile(2) system call
841 830
842 get_unmapped_area: called by the mmap(2) system call 831 get_unmapped_area: called by the mmap(2) system call
843 832
844 check_flags: called by the fcntl(2) system call for F_SETFL command 833 check_flags: called by the fcntl(2) system call for F_SETFL command
845 834
846 dir_notify: called by the fcntl(2) system call for F_NOTIFY command 835 dir_notify: called by the fcntl(2) system call for F_NOTIFY command
847 836
848 flock: called by the flock(2) system call 837 flock: called by the flock(2) system call
849 838
850 splice_write: called by the VFS to splice data from a pipe to a file. This 839 splice_write: called by the VFS to splice data from a pipe to a file. This
851 method is used by the splice(2) system call 840 method is used by the splice(2) system call
852 841
853 splice_read: called by the VFS to splice data from file to a pipe. This 842 splice_read: called by the VFS to splice data from file to a pipe. This
854 method is used by the splice(2) system call 843 method is used by the splice(2) system call
855 844
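For a filesystem whose regular files go through the page cache, most of these methods can be the generic ones. A sketch of what the hypothetical foofs might use, with helper names as of this kernel era (foofs_fsync is assumed to be filesystem-specific):

        const struct file_operations foofs_file_operations = {
                .llseek         = generic_file_llseek,
                .read           = do_sync_read,
                .write          = do_sync_write,
                .aio_read       = generic_file_aio_read,
                .aio_write      = generic_file_aio_write,
                .mmap           = generic_file_mmap,
                .fsync          = foofs_fsync,
                .splice_read    = generic_file_splice_read,
        };
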
856 Note that the file operations are implemented by the specific 845 Note that the file operations are implemented by the specific
857 filesystem in which the inode resides. When opening a device node 846 filesystem in which the inode resides. When opening a device node
858 (character or block special) most filesystems will call special 847 (character or block special) most filesystems will call special
859 support routines in the VFS which will locate the required device 848 support routines in the VFS which will locate the required device
860 driver information. These support routines replace the filesystem file 849 driver information. These support routines replace the filesystem file
861 operations with those for the device driver, and then proceed to call 850 operations with those for the device driver, and then proceed to call
862 the new open() method for the file. This is how opening a device file 851 the new open() method for the file. This is how opening a device file
863 in the filesystem eventually ends up calling the device driver open() 852 in the filesystem eventually ends up calling the device driver open()
864 method. 853 method.
865 854
866 855
867 Directory Entry Cache (dcache) 856 Directory Entry Cache (dcache)
868 ============================== 857 ==============================
869 858
870 859
871 struct dentry_operations 860 struct dentry_operations
872 ------------------------ 861 ------------------------
873 862
874 This describes how a filesystem can overload the standard dentry 863 This describes how a filesystem can overload the standard dentry
875 operations. Dentries and the dcache are the domain of the VFS and the 864 operations. Dentries and the dcache are the domain of the VFS and the
876 individual filesystem implementations. Device drivers have no business 865 individual filesystem implementations. Device drivers have no business
877 here. These methods may be set to NULL, as they are either optional or 866 here. These methods may be set to NULL, as they are either optional or
878 the VFS uses a default. As of kernel 2.6.22, the following members are 867 the VFS uses a default. As of kernel 2.6.22, the following members are
879 defined: 868 defined:
880 869
881 struct dentry_operations { 870 struct dentry_operations {
882 int (*d_revalidate)(struct dentry *, struct nameidata *); 871 int (*d_revalidate)(struct dentry *, struct nameidata *);
883 int (*d_hash) (struct dentry *, struct qstr *); 872 int (*d_hash) (struct dentry *, struct qstr *);
884 int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); 873 int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
885 int (*d_delete)(struct dentry *); 874 int (*d_delete)(struct dentry *);
886 void (*d_release)(struct dentry *); 875 void (*d_release)(struct dentry *);
887 void (*d_iput)(struct dentry *, struct inode *); 876 void (*d_iput)(struct dentry *, struct inode *);
888 char *(*d_dname)(struct dentry *, char *, int); 877 char *(*d_dname)(struct dentry *, char *, int);
889 }; 878 };
890 879
891 d_revalidate: called when the VFS needs to revalidate a dentry. This 880 d_revalidate: called when the VFS needs to revalidate a dentry. This
892 is called whenever a name look-up finds a dentry in the 881 is called whenever a name look-up finds a dentry in the
893 dcache. Most filesystems leave this as NULL, because all their 882 dcache. Most filesystems leave this as NULL, because all their
894 dentries in the dcache are valid 883 dentries in the dcache are valid
895 884
896 d_hash: called when the VFS adds a dentry to the hash table 885 d_hash: called when the VFS adds a dentry to the hash table
897 886
898 d_compare: called when a dentry should be compared with another 887 d_compare: called when a dentry should be compared with another
899 888
900 d_delete: called when the last reference to a dentry is 889 d_delete: called when the last reference to a dentry is
901 deleted. This means no-one is using the dentry, however it is 890 deleted. This means no-one is using the dentry, however it is
902 still valid and in the dcache 891 still valid and in the dcache
903 892
904 d_release: called when a dentry is really deallocated 893 d_release: called when a dentry is really deallocated
905 894
906 d_iput: called when a dentry loses its inode (just prior to its 895 d_iput: called when a dentry loses its inode (just prior to its
907 being deallocated). The default when this is NULL is that the 896 being deallocated). The default when this is NULL is that the
908 VFS calls iput(). If you define this method, you must call 897 VFS calls iput(). If you define this method, you must call
909 iput() yourself 898 iput() yourself
910 899
911 d_dname: called when the pathname of a dentry should be generated. 900 d_dname: called when the pathname of a dentry should be generated.
912 Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay 901 Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay
913 pathname generation. (Instead of doing it when the dentry is created, 902 pathname generation. (Instead of doing it when the dentry is created,
914 it's done only when the path is needed.) Real filesystems probably 903 it's done only when the path is needed.) Real filesystems probably
915 don't want to use it, because their dentries are present in the global 904 don't want to use it, because their dentries are present in the global
916 dcache hash, so their hash should be an invariant. As no lock is 905 dcache hash, so their hash should be an invariant. As no lock is
917 held, d_dname() should not try to modify the dentry itself, unless 906 held, d_dname() should not try to modify the dentry itself, unless
918 appropriate SMP safety is used. CAUTION: d_path() logic is quite 907 appropriate SMP safety is used. CAUTION: d_path() logic is quite
919 tricky. The correct way to return, for example, "Hello" is to put it 908 tricky. The correct way to return, for example, "Hello" is to put it
920 at the end of the buffer and return a pointer to the first char. 909 at the end of the buffer and return a pointer to the first char.
921 The dynamic_dname() helper function is provided to take care of this. 910 The dynamic_dname() helper function is provided to take care of this.
922 911
923 Example : 912 Example :
924 913
925 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) 914 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
926 { 915 {
927 return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", 916 return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
928 dentry->d_inode->i_ino); 917 dentry->d_inode->i_ino);
929 } 918 }
930 919
931 Each dentry has a pointer to its parent dentry, as well as a hash list 920 Each dentry has a pointer to its parent dentry, as well as a hash list
932 of child dentries. Child dentries are basically like files in a 921 of child dentries. Child dentries are basically like files in a
933 directory. 922 directory.
934 923
935 924
936 Directory Entry Cache API 925 Directory Entry Cache API
937 -------------------------- 926 --------------------------
938 927
939 There are a number of functions defined which permit a filesystem to 928 There are a number of functions defined which permit a filesystem to
940 manipulate dentries: 929 manipulate dentries:
941 930
942 dget: open a new handle for an existing dentry (this just increments 931 dget: open a new handle for an existing dentry (this just increments
943 the usage count) 932 the usage count)
944 933
945 dput: close a handle for a dentry (decrements the usage count). If 934 dput: close a handle for a dentry (decrements the usage count). If
946 the usage count drops to 0, the "d_delete" method is called 935 the usage count drops to 0, the "d_delete" method is called
947 and the dentry is placed on the unused list if the dentry is 936 and the dentry is placed on the unused list if the dentry is
948 still in its parent's hash list. Putting the dentry on the 937 still in its parent's hash list. Putting the dentry on the
949 unused list just means that if the system needs some RAM, it 938 unused list just means that if the system needs some RAM, it
950 goes through the unused list of dentries and deallocates them. 939 goes through the unused list of dentries and deallocates them.
951 If the dentry has already been unhashed and the usage count 940 If the dentry has already been unhashed and the usage count
952 drops to 0, the dentry is deallocated after the 941 drops to 0, the dentry is deallocated after the
953 "d_delete" method is called 942 "d_delete" method is called
954 943
955 d_drop: this unhashes a dentry from its parent's hash list. A 944 d_drop: this unhashes a dentry from its parent's hash list. A
956 subsequent call to dput() will deallocate the dentry if its 945 subsequent call to dput() will deallocate the dentry if its
957 usage count drops to 0 946 usage count drops to 0
958 947
959 d_delete: delete a dentry. If there are no other open references to 948 d_delete: delete a dentry. If there are no other open references to
960 the dentry then the dentry is turned into a negative dentry 949 the dentry then the dentry is turned into a negative dentry
961 (the d_iput() method is called). If there are other 950 (the d_iput() method is called). If there are other
962 references, then d_drop() is called instead 951 references, then d_drop() is called instead
963 952
964 d_add: add a dentry to its parent's hash list and then calls 953 d_add: add a dentry to its parent's hash list and then calls
965 d_instantiate() 954 d_instantiate()
966 955
967 d_instantiate: add a dentry to the alias hash list for the inode and 956 d_instantiate: add a dentry to the alias hash list for the inode and
968 updates the "d_inode" member. The "i_count" member in the 957 updates the "d_inode" member. The "i_count" member in the
969 inode structure should be set/incremented. If the inode 958 inode structure should be set/incremented. If the inode
970 pointer is NULL, the dentry is called a "negative 959 pointer is NULL, the dentry is called a "negative
971 dentry". This function is commonly called when an inode is 960 dentry". This function is commonly called when an inode is
972 created for an existing negative dentry 961 created for an existing negative dentry
973 962
974 d_lookup: look up a dentry given its parent and path name component. 963 d_lookup: look up a dentry given its parent and path name component.
975 It looks up the child of that given name from the dcache 964 It looks up the child of that given name from the dcache
976 hash table. If it is found, the reference count is incremented 965 hash table. If it is found, the reference count is incremented
977 and the dentry is returned. The caller must use dput() 966 and the dentry is returned. The caller must use dput()
978 to free the dentry when it finishes using it. 967 to free the dentry when it finishes using it.
979 968
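To illustrate how d_add() and negative dentries fit together, here is a hedged sketch of a lookup method for the hypothetical foofs; foofs_find_entry() and foofs_iget() stand in for whatever the filesystem actually uses to map a name to an inode number and to read that inode:

        static struct dentry *foofs_lookup(struct inode *dir, struct dentry *dentry,
                                           struct nameidata *nd)
        {
                struct inode *inode = NULL;
                ino_t ino;

                ino = foofs_find_entry(dir, &dentry->d_name);   /* hypothetical */
                if (ino) {
                        inode = foofs_iget(dir->i_sb, ino);     /* hypothetical */
                        if (IS_ERR(inode))
                                return ERR_PTR(PTR_ERR(inode));
                }
                d_add(dentry, inode);   /* a NULL inode makes this a negative dentry */
                return NULL;
        }
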
980 For further information on dentry locking, please refer to the document 969 For further information on dentry locking, please refer to the document
981 Documentation/filesystems/dentry-locking.txt. 970 Documentation/filesystems/dentry-locking.txt.
982 971
983 972
984 Resources 973 Resources
985 ========= 974 =========
986 975
987 (Note some of these resources are not up-to-date with the latest kernel 976 (Note some of these resources are not up-to-date with the latest kernel
988 version.) 977 version.)
989 978
990 Creating Linux virtual filesystems. 2002 979 Creating Linux virtual filesystems. 2002
991 <http://lwn.net/Articles/13325/> 980 <http://lwn.net/Articles/13325/>
992 981
993 The Linux Virtual File-system Layer by Neil Brown. 1999 982 The Linux Virtual File-system Layer by Neil Brown. 1999
994 <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html> 983 <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html>
995 984
996 A tour of the Linux VFS by Michael K. Johnson. 1996 985 A tour of the Linux VFS by Michael K. Johnson. 1996
997 <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html> 986 <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html>
998 987
999 A small trail through the Linux kernel by Andries Brouwer. 2001 988 A small trail through the Linux kernel by Andries Brouwer. 2001
1000 <http://www.win.tue.nl/~aeb/linux/vfs/trail.html> 989 <http://www.win.tue.nl/~aeb/linux/vfs/trail.html>
1001 990
1 /* 1 /*
2 * linux/fs/inode.c 2 * linux/fs/inode.c
3 * 3 *
4 * (C) 1997 Linus Torvalds 4 * (C) 1997 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/fs.h> 7 #include <linux/fs.h>
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/dcache.h> 9 #include <linux/dcache.h>
10 #include <linux/init.h> 10 #include <linux/init.h>
11 #include <linux/quotaops.h> 11 #include <linux/quotaops.h>
12 #include <linux/slab.h> 12 #include <linux/slab.h>
13 #include <linux/writeback.h> 13 #include <linux/writeback.h>
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/backing-dev.h> 15 #include <linux/backing-dev.h>
16 #include <linux/wait.h> 16 #include <linux/wait.h>
17 #include <linux/hash.h> 17 #include <linux/hash.h>
18 #include <linux/swap.h> 18 #include <linux/swap.h>
19 #include <linux/security.h> 19 #include <linux/security.h>
20 #include <linux/pagemap.h> 20 #include <linux/pagemap.h>
21 #include <linux/cdev.h> 21 #include <linux/cdev.h>
22 #include <linux/bootmem.h> 22 #include <linux/bootmem.h>
23 #include <linux/inotify.h> 23 #include <linux/inotify.h>
24 #include <linux/mount.h> 24 #include <linux/mount.h>
25 25
26 /* 26 /*
27 * This is needed for the following functions: 27 * This is needed for the following functions:
28 * - inode_has_buffers 28 * - inode_has_buffers
29 * - invalidate_inode_buffers 29 * - invalidate_inode_buffers
30 * - invalidate_bdev 30 * - invalidate_bdev
31 * 31 *
32 * FIXME: remove all knowledge of the buffer layer from this file 32 * FIXME: remove all knowledge of the buffer layer from this file
33 */ 33 */
34 #include <linux/buffer_head.h> 34 #include <linux/buffer_head.h>
35 35
36 /* 36 /*
37 * New inode.c implementation. 37 * New inode.c implementation.
38 * 38 *
39 * This implementation has the basic premise of trying 39 * This implementation has the basic premise of trying
40 * to be extremely low-overhead and SMP-safe, yet be 40 * to be extremely low-overhead and SMP-safe, yet be
41 * simple enough to be "obviously correct". 41 * simple enough to be "obviously correct".
42 * 42 *
43 * Famous last words. 43 * Famous last words.
44 */ 44 */
45 45
46 /* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */ 46 /* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
47 47
48 /* #define INODE_PARANOIA 1 */ 48 /* #define INODE_PARANOIA 1 */
49 /* #define INODE_DEBUG 1 */ 49 /* #define INODE_DEBUG 1 */
50 50
51 /* 51 /*
52 * Inode lookup is no longer as critical as it used to be: 52 * Inode lookup is no longer as critical as it used to be:
53 * most of the lookups are going to be through the dcache. 53 * most of the lookups are going to be through the dcache.
54 */ 54 */
55 #define I_HASHBITS i_hash_shift 55 #define I_HASHBITS i_hash_shift
56 #define I_HASHMASK i_hash_mask 56 #define I_HASHMASK i_hash_mask
57 57
58 static unsigned int i_hash_mask __read_mostly; 58 static unsigned int i_hash_mask __read_mostly;
59 static unsigned int i_hash_shift __read_mostly; 59 static unsigned int i_hash_shift __read_mostly;
60 60
61 /* 61 /*
62 * Each inode can be on two separate lists. One is 62 * Each inode can be on two separate lists. One is
63 * the hash list of the inode, used for lookups. The 63 * the hash list of the inode, used for lookups. The
64 * other linked list is the "type" list: 64 * other linked list is the "type" list:
65 * "in_use" - valid inode, i_count > 0, i_nlink > 0 65 * "in_use" - valid inode, i_count > 0, i_nlink > 0
66 * "dirty" - as "in_use" but also dirty 66 * "dirty" - as "in_use" but also dirty
67 * "unused" - valid inode, i_count = 0 67 * "unused" - valid inode, i_count = 0
68 * 68 *
69 * A "dirty" list is maintained for each super block, 69 * A "dirty" list is maintained for each super block,
70 * allowing for low-overhead inode sync() operations. 70 * allowing for low-overhead inode sync() operations.
71 */ 71 */
72 72
73 LIST_HEAD(inode_in_use); 73 LIST_HEAD(inode_in_use);
74 LIST_HEAD(inode_unused); 74 LIST_HEAD(inode_unused);
75 static struct hlist_head *inode_hashtable __read_mostly; 75 static struct hlist_head *inode_hashtable __read_mostly;
76 76
77 /* 77 /*
78 * A simple spinlock to protect the list manipulations. 78 * A simple spinlock to protect the list manipulations.
79 * 79 *
80 * NOTE! You also have to own the lock if you change 80 * NOTE! You also have to own the lock if you change
81 * the i_state of an inode while it is in use.. 81 * the i_state of an inode while it is in use..
82 */ 82 */
83 DEFINE_SPINLOCK(inode_lock); 83 DEFINE_SPINLOCK(inode_lock);
84 84
85 /* 85 /*
86 * iprune_mutex provides exclusion between the kswapd or try_to_free_pages 86 * iprune_mutex provides exclusion between the kswapd or try_to_free_pages
87 * icache shrinking path, and the umount path. Without this exclusion, 87 * icache shrinking path, and the umount path. Without this exclusion,
88 * by the time prune_icache calls iput for the inode whose pages it has 88 * by the time prune_icache calls iput for the inode whose pages it has
89 * been invalidating, or by the time it calls clear_inode & destroy_inode 89 * been invalidating, or by the time it calls clear_inode & destroy_inode
90 * from its final dispose_list, the struct super_block they refer to 90 * from its final dispose_list, the struct super_block they refer to
91 * (for inode->i_sb->s_op) may already have been freed and reused. 91 * (for inode->i_sb->s_op) may already have been freed and reused.
92 */ 92 */
93 static DEFINE_MUTEX(iprune_mutex); 93 static DEFINE_MUTEX(iprune_mutex);
94 94
95 /* 95 /*
96 * Statistics gathering.. 96 * Statistics gathering..
97 */ 97 */
98 struct inodes_stat_t inodes_stat; 98 struct inodes_stat_t inodes_stat;
99 99
100 static struct kmem_cache * inode_cachep __read_mostly; 100 static struct kmem_cache * inode_cachep __read_mostly;
101 101
102 static void wake_up_inode(struct inode *inode) 102 static void wake_up_inode(struct inode *inode)
103 { 103 {
104 /* 104 /*
105 * Prevent speculative execution through spin_unlock(&inode_lock); 105 * Prevent speculative execution through spin_unlock(&inode_lock);
106 */ 106 */
107 smp_mb(); 107 smp_mb();
108 wake_up_bit(&inode->i_state, __I_LOCK); 108 wake_up_bit(&inode->i_state, __I_LOCK);
109 } 109 }
110 110
111 static struct inode *alloc_inode(struct super_block *sb) 111 static struct inode *alloc_inode(struct super_block *sb)
112 { 112 {
113 static const struct address_space_operations empty_aops; 113 static const struct address_space_operations empty_aops;
114 static struct inode_operations empty_iops; 114 static struct inode_operations empty_iops;
115 static const struct file_operations empty_fops; 115 static const struct file_operations empty_fops;
116 struct inode *inode; 116 struct inode *inode;
117 117
118 if (sb->s_op->alloc_inode) 118 if (sb->s_op->alloc_inode)
119 inode = sb->s_op->alloc_inode(sb); 119 inode = sb->s_op->alloc_inode(sb);
120 else 120 else
121 inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); 121 inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
122 122
123 if (inode) { 123 if (inode) {
124 struct address_space * const mapping = &inode->i_data; 124 struct address_space * const mapping = &inode->i_data;
125 125
126 inode->i_sb = sb; 126 inode->i_sb = sb;
127 inode->i_blkbits = sb->s_blocksize_bits; 127 inode->i_blkbits = sb->s_blocksize_bits;
128 inode->i_flags = 0; 128 inode->i_flags = 0;
129 atomic_set(&inode->i_count, 1); 129 atomic_set(&inode->i_count, 1);
130 inode->i_op = &empty_iops; 130 inode->i_op = &empty_iops;
131 inode->i_fop = &empty_fops; 131 inode->i_fop = &empty_fops;
132 inode->i_nlink = 1; 132 inode->i_nlink = 1;
133 atomic_set(&inode->i_writecount, 0); 133 atomic_set(&inode->i_writecount, 0);
134 inode->i_size = 0; 134 inode->i_size = 0;
135 inode->i_blocks = 0; 135 inode->i_blocks = 0;
136 inode->i_bytes = 0; 136 inode->i_bytes = 0;
137 inode->i_generation = 0; 137 inode->i_generation = 0;
138 #ifdef CONFIG_QUOTA 138 #ifdef CONFIG_QUOTA
139 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); 139 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
140 #endif 140 #endif
141 inode->i_pipe = NULL; 141 inode->i_pipe = NULL;
142 inode->i_bdev = NULL; 142 inode->i_bdev = NULL;
143 inode->i_cdev = NULL; 143 inode->i_cdev = NULL;
144 inode->i_rdev = 0; 144 inode->i_rdev = 0;
145 inode->dirtied_when = 0; 145 inode->dirtied_when = 0;
146 if (security_inode_alloc(inode)) { 146 if (security_inode_alloc(inode)) {
147 if (inode->i_sb->s_op->destroy_inode) 147 if (inode->i_sb->s_op->destroy_inode)
148 inode->i_sb->s_op->destroy_inode(inode); 148 inode->i_sb->s_op->destroy_inode(inode);
149 else 149 else
150 kmem_cache_free(inode_cachep, (inode)); 150 kmem_cache_free(inode_cachep, (inode));
151 return NULL; 151 return NULL;
152 } 152 }
153 153
154 spin_lock_init(&inode->i_lock); 154 spin_lock_init(&inode->i_lock);
155 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 155 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
156 156
157 mutex_init(&inode->i_mutex); 157 mutex_init(&inode->i_mutex);
158 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 158 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
159 159
160 init_rwsem(&inode->i_alloc_sem); 160 init_rwsem(&inode->i_alloc_sem);
161 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); 161 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
162 162
163 mapping->a_ops = &empty_aops; 163 mapping->a_ops = &empty_aops;
164 mapping->host = inode; 164 mapping->host = inode;
165 mapping->flags = 0; 165 mapping->flags = 0;
166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
167 mapping->assoc_mapping = NULL; 167 mapping->assoc_mapping = NULL;
168 mapping->backing_dev_info = &default_backing_dev_info; 168 mapping->backing_dev_info = &default_backing_dev_info;
169 169
170 /* 170 /*
171 * If the block_device provides a backing_dev_info for client 171 * If the block_device provides a backing_dev_info for client
172 * inodes then use that. Otherwise the inode share the bdev's 172 * inodes then use that. Otherwise the inode share the bdev's
173 * backing_dev_info. 173 * backing_dev_info.
174 */ 174 */
175 if (sb->s_bdev) { 175 if (sb->s_bdev) {
176 struct backing_dev_info *bdi; 176 struct backing_dev_info *bdi;
177 177
178 bdi = sb->s_bdev->bd_inode_backing_dev_info; 178 bdi = sb->s_bdev->bd_inode_backing_dev_info;
179 if (!bdi) 179 if (!bdi)
180 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 180 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
181 mapping->backing_dev_info = bdi; 181 mapping->backing_dev_info = bdi;
182 } 182 }
183 inode->i_private = NULL; 183 inode->i_private = NULL;
184 inode->i_mapping = mapping; 184 inode->i_mapping = mapping;
185 } 185 }
186 return inode; 186 return inode;
187 } 187 }
188 188
189 void destroy_inode(struct inode *inode) 189 void destroy_inode(struct inode *inode)
190 { 190 {
191 BUG_ON(inode_has_buffers(inode)); 191 BUG_ON(inode_has_buffers(inode));
192 security_inode_free(inode); 192 security_inode_free(inode);
193 if (inode->i_sb->s_op->destroy_inode) 193 if (inode->i_sb->s_op->destroy_inode)
194 inode->i_sb->s_op->destroy_inode(inode); 194 inode->i_sb->s_op->destroy_inode(inode);
195 else 195 else
196 kmem_cache_free(inode_cachep, (inode)); 196 kmem_cache_free(inode_cachep, (inode));
197 } 197 }
198 198
199 199
200 /* 200 /*
201 * These are initializations that only need to be done 201 * These are initializations that only need to be done
202 * once, because the fields are idempotent across use 202 * once, because the fields are idempotent across use
203 * of the inode, so let the slab aware of that. 203 * of the inode, so let the slab aware of that.
204 */ 204 */
205 void inode_init_once(struct inode *inode) 205 void inode_init_once(struct inode *inode)
206 { 206 {
207 memset(inode, 0, sizeof(*inode)); 207 memset(inode, 0, sizeof(*inode));
208 INIT_HLIST_NODE(&inode->i_hash); 208 INIT_HLIST_NODE(&inode->i_hash);
209 INIT_LIST_HEAD(&inode->i_dentry); 209 INIT_LIST_HEAD(&inode->i_dentry);
210 INIT_LIST_HEAD(&inode->i_devices); 210 INIT_LIST_HEAD(&inode->i_devices);
211 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 211 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
212 rwlock_init(&inode->i_data.tree_lock); 212 rwlock_init(&inode->i_data.tree_lock);
213 spin_lock_init(&inode->i_data.i_mmap_lock); 213 spin_lock_init(&inode->i_data.i_mmap_lock);
214 INIT_LIST_HEAD(&inode->i_data.private_list); 214 INIT_LIST_HEAD(&inode->i_data.private_list);
215 spin_lock_init(&inode->i_data.private_lock); 215 spin_lock_init(&inode->i_data.private_lock);
216 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); 216 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
217 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); 217 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
218 i_size_ordered_init(inode); 218 i_size_ordered_init(inode);
219 #ifdef CONFIG_INOTIFY 219 #ifdef CONFIG_INOTIFY
220 INIT_LIST_HEAD(&inode->inotify_watches); 220 INIT_LIST_HEAD(&inode->inotify_watches);
221 mutex_init(&inode->inotify_mutex); 221 mutex_init(&inode->inotify_mutex);
222 #endif 222 #endif
223 } 223 }
224 224
225 EXPORT_SYMBOL(inode_init_once); 225 EXPORT_SYMBOL(inode_init_once);
226 226
227 static void init_once(struct kmem_cache * cachep, void *foo) 227 static void init_once(struct kmem_cache * cachep, void *foo)
228 { 228 {
229 struct inode * inode = (struct inode *) foo; 229 struct inode * inode = (struct inode *) foo;
230 230
231 inode_init_once(inode); 231 inode_init_once(inode);
232 } 232 }
233 233
234 /* 234 /*
235 * inode_lock must be held 235 * inode_lock must be held
236 */ 236 */
237 void __iget(struct inode * inode) 237 void __iget(struct inode * inode)
238 { 238 {
239 if (atomic_read(&inode->i_count)) { 239 if (atomic_read(&inode->i_count)) {
240 atomic_inc(&inode->i_count); 240 atomic_inc(&inode->i_count);
241 return; 241 return;
242 } 242 }
243 atomic_inc(&inode->i_count); 243 atomic_inc(&inode->i_count);
244 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 244 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
245 list_move(&inode->i_list, &inode_in_use); 245 list_move(&inode->i_list, &inode_in_use);
246 inodes_stat.nr_unused--; 246 inodes_stat.nr_unused--;
247 } 247 }
248 248
249 /** 249 /**
250 * clear_inode - clear an inode 250 * clear_inode - clear an inode
251 * @inode: inode to clear 251 * @inode: inode to clear
252 * 252 *
253 * This is called by the filesystem to tell us 253 * This is called by the filesystem to tell us
254 * that the inode is no longer useful. We just 254 * that the inode is no longer useful. We just
255 * terminate it with extreme prejudice. 255 * terminate it with extreme prejudice.
256 */ 256 */
257 void clear_inode(struct inode *inode) 257 void clear_inode(struct inode *inode)
258 { 258 {
259 might_sleep(); 259 might_sleep();
260 invalidate_inode_buffers(inode); 260 invalidate_inode_buffers(inode);
261 261
262 BUG_ON(inode->i_data.nrpages); 262 BUG_ON(inode->i_data.nrpages);
263 BUG_ON(!(inode->i_state & I_FREEING)); 263 BUG_ON(!(inode->i_state & I_FREEING));
264 BUG_ON(inode->i_state & I_CLEAR); 264 BUG_ON(inode->i_state & I_CLEAR);
265 inode_sync_wait(inode); 265 inode_sync_wait(inode);
266 DQUOT_DROP(inode); 266 DQUOT_DROP(inode);
267 if (inode->i_sb->s_op->clear_inode) 267 if (inode->i_sb->s_op->clear_inode)
268 inode->i_sb->s_op->clear_inode(inode); 268 inode->i_sb->s_op->clear_inode(inode);
269 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 269 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
270 bd_forget(inode); 270 bd_forget(inode);
271 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 271 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
272 cd_forget(inode); 272 cd_forget(inode);
273 inode->i_state = I_CLEAR; 273 inode->i_state = I_CLEAR;
274 } 274 }
275 275
276 EXPORT_SYMBOL(clear_inode); 276 EXPORT_SYMBOL(clear_inode);
277 277
278 /* 278 /*
279 * dispose_list - dispose of the contents of a local list 279 * dispose_list - dispose of the contents of a local list
280 * @head: the head of the list to free 280 * @head: the head of the list to free
281 * 281 *
282 * Dispose-list gets a local list with local inodes in it, so it doesn't 282 * Dispose-list gets a local list with local inodes in it, so it doesn't
283 * need to worry about list corruption and SMP locks. 283 * need to worry about list corruption and SMP locks.
284 */ 284 */
285 static void dispose_list(struct list_head *head) 285 static void dispose_list(struct list_head *head)
286 { 286 {
287 int nr_disposed = 0; 287 int nr_disposed = 0;
288 288
289 while (!list_empty(head)) { 289 while (!list_empty(head)) {
290 struct inode *inode; 290 struct inode *inode;
291 291
292 inode = list_first_entry(head, struct inode, i_list); 292 inode = list_first_entry(head, struct inode, i_list);
293 list_del(&inode->i_list); 293 list_del(&inode->i_list);
294 294
295 if (inode->i_data.nrpages) 295 if (inode->i_data.nrpages)
296 truncate_inode_pages(&inode->i_data, 0); 296 truncate_inode_pages(&inode->i_data, 0);
297 clear_inode(inode); 297 clear_inode(inode);
298 298
299 spin_lock(&inode_lock); 299 spin_lock(&inode_lock);
300 hlist_del_init(&inode->i_hash); 300 hlist_del_init(&inode->i_hash);
301 list_del_init(&inode->i_sb_list); 301 list_del_init(&inode->i_sb_list);
302 spin_unlock(&inode_lock); 302 spin_unlock(&inode_lock);
303 303
304 wake_up_inode(inode); 304 wake_up_inode(inode);
305 destroy_inode(inode); 305 destroy_inode(inode);
306 nr_disposed++; 306 nr_disposed++;
307 } 307 }
308 spin_lock(&inode_lock); 308 spin_lock(&inode_lock);
309 inodes_stat.nr_inodes -= nr_disposed; 309 inodes_stat.nr_inodes -= nr_disposed;
310 spin_unlock(&inode_lock); 310 spin_unlock(&inode_lock);
311 } 311 }
312 312
313 /* 313 /*
314 * Invalidate all inodes for a device. 314 * Invalidate all inodes for a device.
315 */ 315 */
316 static int invalidate_list(struct list_head *head, struct list_head *dispose) 316 static int invalidate_list(struct list_head *head, struct list_head *dispose)
317 { 317 {
318 struct list_head *next; 318 struct list_head *next;
319 int busy = 0, count = 0; 319 int busy = 0, count = 0;
320 320
321 next = head->next; 321 next = head->next;
322 for (;;) { 322 for (;;) {
323 struct list_head * tmp = next; 323 struct list_head * tmp = next;
324 struct inode * inode; 324 struct inode * inode;
325 325
326 /* 326 /*
327 * We can reschedule here without worrying about the list's 327 * We can reschedule here without worrying about the list's
328 * consistency because the per-sb list of inodes must not 328 * consistency because the per-sb list of inodes must not
329 * change during umount anymore, and because iprune_mutex keeps 329 * change during umount anymore, and because iprune_mutex keeps
330 * shrink_icache_memory() away. 330 * shrink_icache_memory() away.
331 */ 331 */
332 cond_resched_lock(&inode_lock); 332 cond_resched_lock(&inode_lock);
333 333
334 next = next->next; 334 next = next->next;
335 if (tmp == head) 335 if (tmp == head)
336 break; 336 break;
337 inode = list_entry(tmp, struct inode, i_sb_list); 337 inode = list_entry(tmp, struct inode, i_sb_list);
338 invalidate_inode_buffers(inode); 338 invalidate_inode_buffers(inode);
339 if (!atomic_read(&inode->i_count)) { 339 if (!atomic_read(&inode->i_count)) {
340 list_move(&inode->i_list, dispose); 340 list_move(&inode->i_list, dispose);
341 inode->i_state |= I_FREEING; 341 inode->i_state |= I_FREEING;
342 count++; 342 count++;
343 continue; 343 continue;
344 } 344 }
345 busy = 1; 345 busy = 1;
346 } 346 }
347 /* only unused inodes may be cached with i_count zero */ 347 /* only unused inodes may be cached with i_count zero */
348 inodes_stat.nr_unused -= count; 348 inodes_stat.nr_unused -= count;
349 return busy; 349 return busy;
350 } 350 }
351 351
352 /** 352 /**
353 * invalidate_inodes - discard the inodes on a device 353 * invalidate_inodes - discard the inodes on a device
354 * @sb: superblock 354 * @sb: superblock
355 * 355 *
356 * Discard all of the inodes for a given superblock. If the discard 356 * Discard all of the inodes for a given superblock. If the discard
357 * fails because there are busy inodes then a non-zero value is returned. 357 * fails because there are busy inodes then a non-zero value is returned.
358 * If the discard is successful all the inodes have been discarded. 358 * If the discard is successful all the inodes have been discarded.
359 */ 359 */
360 int invalidate_inodes(struct super_block * sb) 360 int invalidate_inodes(struct super_block * sb)
361 { 361 {
362 int busy; 362 int busy;
363 LIST_HEAD(throw_away); 363 LIST_HEAD(throw_away);
364 364
365 mutex_lock(&iprune_mutex); 365 mutex_lock(&iprune_mutex);
366 spin_lock(&inode_lock); 366 spin_lock(&inode_lock);
367 inotify_unmount_inodes(&sb->s_inodes); 367 inotify_unmount_inodes(&sb->s_inodes);
368 busy = invalidate_list(&sb->s_inodes, &throw_away); 368 busy = invalidate_list(&sb->s_inodes, &throw_away);
369 spin_unlock(&inode_lock); 369 spin_unlock(&inode_lock);
370 370
371 dispose_list(&throw_away); 371 dispose_list(&throw_away);
372 mutex_unlock(&iprune_mutex); 372 mutex_unlock(&iprune_mutex);
373 373
374 return busy; 374 return busy;
375 } 375 }
376 376
377 EXPORT_SYMBOL(invalidate_inodes); 377 EXPORT_SYMBOL(invalidate_inodes);
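For illustration, a hedged sketch of how a caller typically treats the return value; the foofs name and message text are made up:

	if (invalidate_inodes(sb))
		printk(KERN_WARNING "foofs: busy inodes on %s after discard\n",
		       sb->s_id);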
378 378
379 static int can_unuse(struct inode *inode) 379 static int can_unuse(struct inode *inode)
380 { 380 {
381 if (inode->i_state) 381 if (inode->i_state)
382 return 0; 382 return 0;
383 if (inode_has_buffers(inode)) 383 if (inode_has_buffers(inode))
384 return 0; 384 return 0;
385 if (atomic_read(&inode->i_count)) 385 if (atomic_read(&inode->i_count))
386 return 0; 386 return 0;
387 if (inode->i_data.nrpages) 387 if (inode->i_data.nrpages)
388 return 0; 388 return 0;
389 return 1; 389 return 1;
390 } 390 }
391 391
392 /* 392 /*
393 * Scan `goal' inodes on the unused list for freeable ones. They are moved to 393 * Scan `goal' inodes on the unused list for freeable ones. They are moved to
394 * a temporary list and then are freed outside inode_lock by dispose_list(). 394 * a temporary list and then are freed outside inode_lock by dispose_list().
395 * 395 *
396 * Any inodes which are pinned purely because of attached pagecache have their 396 * Any inodes which are pinned purely because of attached pagecache have their
397 * pagecache removed. We expect the final iput() on that inode to add it to 397 * pagecache removed. We expect the final iput() on that inode to add it to
398 * the front of the inode_unused list. So look for it there and if the 398 * the front of the inode_unused list. So look for it there and if the
399 * inode is still freeable, proceed. The right inode is found 99.9% of the 399 * inode is still freeable, proceed. The right inode is found 99.9% of the
400 * time in testing on a 4-way. 400 * time in testing on a 4-way.
401 * 401 *
402 * If the inode has metadata buffers attached to mapping->private_list then 402 * If the inode has metadata buffers attached to mapping->private_list then
403 * try to remove them. 403 * try to remove them.
404 */ 404 */
405 static void prune_icache(int nr_to_scan) 405 static void prune_icache(int nr_to_scan)
406 { 406 {
407 LIST_HEAD(freeable); 407 LIST_HEAD(freeable);
408 int nr_pruned = 0; 408 int nr_pruned = 0;
409 int nr_scanned; 409 int nr_scanned;
410 unsigned long reap = 0; 410 unsigned long reap = 0;
411 411
412 mutex_lock(&iprune_mutex); 412 mutex_lock(&iprune_mutex);
413 spin_lock(&inode_lock); 413 spin_lock(&inode_lock);
414 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 414 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
415 struct inode *inode; 415 struct inode *inode;
416 416
417 if (list_empty(&inode_unused)) 417 if (list_empty(&inode_unused))
418 break; 418 break;
419 419
420 inode = list_entry(inode_unused.prev, struct inode, i_list); 420 inode = list_entry(inode_unused.prev, struct inode, i_list);
421 421
422 if (inode->i_state || atomic_read(&inode->i_count)) { 422 if (inode->i_state || atomic_read(&inode->i_count)) {
423 list_move(&inode->i_list, &inode_unused); 423 list_move(&inode->i_list, &inode_unused);
424 continue; 424 continue;
425 } 425 }
426 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 426 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
427 __iget(inode); 427 __iget(inode);
428 spin_unlock(&inode_lock); 428 spin_unlock(&inode_lock);
429 if (remove_inode_buffers(inode)) 429 if (remove_inode_buffers(inode))
430 reap += invalidate_mapping_pages(&inode->i_data, 430 reap += invalidate_mapping_pages(&inode->i_data,
431 0, -1); 431 0, -1);
432 iput(inode); 432 iput(inode);
433 spin_lock(&inode_lock); 433 spin_lock(&inode_lock);
434 434
435 if (inode != list_entry(inode_unused.next, 435 if (inode != list_entry(inode_unused.next,
436 struct inode, i_list)) 436 struct inode, i_list))
437 continue; /* wrong inode or list_empty */ 437 continue; /* wrong inode or list_empty */
438 if (!can_unuse(inode)) 438 if (!can_unuse(inode))
439 continue; 439 continue;
440 } 440 }
441 list_move(&inode->i_list, &freeable); 441 list_move(&inode->i_list, &freeable);
442 inode->i_state |= I_FREEING; 442 inode->i_state |= I_FREEING;
443 nr_pruned++; 443 nr_pruned++;
444 } 444 }
445 inodes_stat.nr_unused -= nr_pruned; 445 inodes_stat.nr_unused -= nr_pruned;
446 if (current_is_kswapd()) 446 if (current_is_kswapd())
447 __count_vm_events(KSWAPD_INODESTEAL, reap); 447 __count_vm_events(KSWAPD_INODESTEAL, reap);
448 else 448 else
449 __count_vm_events(PGINODESTEAL, reap); 449 __count_vm_events(PGINODESTEAL, reap);
450 spin_unlock(&inode_lock); 450 spin_unlock(&inode_lock);
451 451
452 dispose_list(&freeable); 452 dispose_list(&freeable);
453 mutex_unlock(&iprune_mutex); 453 mutex_unlock(&iprune_mutex);
454 } 454 }
455 455
456 /* 456 /*
457 * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, 457 * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
458 * "unused" means that no dentries are referring to the inodes: the files are 458 * "unused" means that no dentries are referring to the inodes: the files are
459 * not open and the dcache references to those inodes have already been 459 * not open and the dcache references to those inodes have already been
460 * reclaimed. 460 * reclaimed.
461 * 461 *
462 * This function is passed the number of inodes to scan, and it returns the 462 * This function is passed the number of inodes to scan, and it returns the
463 * total number of remaining possibly-reclaimable inodes. 463 * total number of remaining possibly-reclaimable inodes.
464 */ 464 */
465 static int shrink_icache_memory(int nr, gfp_t gfp_mask) 465 static int shrink_icache_memory(int nr, gfp_t gfp_mask)
466 { 466 {
467 if (nr) { 467 if (nr) {
468 /* 468 /*
469 * Nasty deadlock avoidance. We may hold various FS locks, 469 * Nasty deadlock avoidance. We may hold various FS locks,
470 * and we don't want to recurse into the FS that called us 470 * and we don't want to recurse into the FS that called us
471 * in clear_inode() and friends.. 471 * in clear_inode() and friends..
472 */ 472 */
473 if (!(gfp_mask & __GFP_FS)) 473 if (!(gfp_mask & __GFP_FS))
474 return -1; 474 return -1;
475 prune_icache(nr); 475 prune_icache(nr);
476 } 476 }
477 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 477 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
478 } 478 }
479 479
480 static struct shrinker icache_shrinker = { 480 static struct shrinker icache_shrinker = {
481 .shrink = shrink_icache_memory, 481 .shrink = shrink_icache_memory,
482 .seeks = DEFAULT_SEEKS, 482 .seeks = DEFAULT_SEEKS,
483 }; 483 };
484 484
485 static void __wait_on_freeing_inode(struct inode *inode); 485 static void __wait_on_freeing_inode(struct inode *inode);
486 /* 486 /*
487 * Called with the inode lock held. 487 * Called with the inode lock held.
488 * NOTE: we are not increasing the inode refcount; you must call __iget() 488 * NOTE: we are not increasing the inode refcount; you must call __iget()
489 * by hand after calling find_inode now! This simplifies iunique and won't 489 * by hand after calling find_inode now! This simplifies iunique and won't
490 * add any additional branch in the common code. 490 * add any additional branch in the common code.
491 */ 491 */
492 static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) 492 static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
493 { 493 {
494 struct hlist_node *node; 494 struct hlist_node *node;
495 struct inode * inode = NULL; 495 struct inode * inode = NULL;
496 496
497 repeat: 497 repeat:
498 hlist_for_each (node, head) { 498 hlist_for_each (node, head) {
499 inode = hlist_entry(node, struct inode, i_hash); 499 inode = hlist_entry(node, struct inode, i_hash);
500 if (inode->i_sb != sb) 500 if (inode->i_sb != sb)
501 continue; 501 continue;
502 if (!test(inode, data)) 502 if (!test(inode, data))
503 continue; 503 continue;
504 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { 504 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
505 __wait_on_freeing_inode(inode); 505 __wait_on_freeing_inode(inode);
506 goto repeat; 506 goto repeat;
507 } 507 }
508 break; 508 break;
509 } 509 }
510 return node ? inode : NULL; 510 return node ? inode : NULL;
511 } 511 }
512 512
513 /* 513 /*
514 * find_inode_fast is the fast path version of find_inode, see the comment at 514 * find_inode_fast is the fast path version of find_inode, see the comment at
515 * iget_locked for details. 515 * iget_locked for details.
516 */ 516 */
517 static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) 517 static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
518 { 518 {
519 struct hlist_node *node; 519 struct hlist_node *node;
520 struct inode * inode = NULL; 520 struct inode * inode = NULL;
521 521
522 repeat: 522 repeat:
523 hlist_for_each (node, head) { 523 hlist_for_each (node, head) {
524 inode = hlist_entry(node, struct inode, i_hash); 524 inode = hlist_entry(node, struct inode, i_hash);
525 if (inode->i_ino != ino) 525 if (inode->i_ino != ino)
526 continue; 526 continue;
527 if (inode->i_sb != sb) 527 if (inode->i_sb != sb)
528 continue; 528 continue;
529 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { 529 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
530 __wait_on_freeing_inode(inode); 530 __wait_on_freeing_inode(inode);
531 goto repeat; 531 goto repeat;
532 } 532 }
533 break; 533 break;
534 } 534 }
535 return node ? inode : NULL; 535 return node ? inode : NULL;
536 } 536 }
537 537
538 /** 538 /**
539 * new_inode - obtain an inode 539 * new_inode - obtain an inode
540 * @sb: superblock 540 * @sb: superblock
541 * 541 *
542 * Allocates a new inode for the given superblock. The default gfp_mask 542 * Allocates a new inode for the given superblock. The default gfp_mask
543 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. 543 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
544 * If HIGHMEM pages are unsuitable or it is known that pages allocated 544 * If HIGHMEM pages are unsuitable or it is known that pages allocated
545 * for the page cache are not reclaimable or migratable, 545 * for the page cache are not reclaimable or migratable,
546 * mapping_set_gfp_mask() must be called with suitable flags on the 546 * mapping_set_gfp_mask() must be called with suitable flags on the
547 * newly created inode's mapping 547 * newly created inode's mapping
548 * 548 *
549 */ 549 */
550 struct inode *new_inode(struct super_block *sb) 550 struct inode *new_inode(struct super_block *sb)
551 { 551 {
552 /* 552 /*
553 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW 553 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
554 * error if st_ino won't fit in target struct field. Use 32bit counter 554 * error if st_ino won't fit in target struct field. Use 32bit counter
555 * here to attempt to avoid that. 555 * here to attempt to avoid that.
556 */ 556 */
557 static unsigned int last_ino; 557 static unsigned int last_ino;
558 struct inode * inode; 558 struct inode * inode;
559 559
560 spin_lock_prefetch(&inode_lock); 560 spin_lock_prefetch(&inode_lock);
561 561
562 inode = alloc_inode(sb); 562 inode = alloc_inode(sb);
563 if (inode) { 563 if (inode) {
564 spin_lock(&inode_lock); 564 spin_lock(&inode_lock);
565 inodes_stat.nr_inodes++; 565 inodes_stat.nr_inodes++;
566 list_add(&inode->i_list, &inode_in_use); 566 list_add(&inode->i_list, &inode_in_use);
567 list_add(&inode->i_sb_list, &sb->s_inodes); 567 list_add(&inode->i_sb_list, &sb->s_inodes);
568 inode->i_ino = ++last_ino; 568 inode->i_ino = ++last_ino;
569 inode->i_state = 0; 569 inode->i_state = 0;
570 spin_unlock(&inode_lock); 570 spin_unlock(&inode_lock);
571 } 571 }
572 return inode; 572 return inode;
573 } 573 }
574 574
575 EXPORT_SYMBOL(new_inode); 575 EXPORT_SYMBOL(new_inode);
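To illustrate the note above about mapping_set_gfp_mask(), a rough sketch of a pseudo-filesystem creating an in-memory inode; the GFP_USER choice and the field values are only plausible examples, not requirements:

	struct inode *inode = new_inode(sb);

	if (!inode)
		return -ENOMEM;

	/* Assume this filesystem cannot tolerate highmem pagecache pages;
	 * drop __GFP_HIGHMEM from the mapping's allocation mask. */
	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);

	inode->i_mode = S_IFREG | 0644;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;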
576 576
577 void unlock_new_inode(struct inode *inode) 577 void unlock_new_inode(struct inode *inode)
578 { 578 {
579 #ifdef CONFIG_DEBUG_LOCK_ALLOC 579 #ifdef CONFIG_DEBUG_LOCK_ALLOC
580 if (inode->i_mode & S_IFDIR) { 580 if (inode->i_mode & S_IFDIR) {
581 struct file_system_type *type = inode->i_sb->s_type; 581 struct file_system_type *type = inode->i_sb->s_type;
582 582
583 /* 583 /*
584 * ensure nobody is actually holding i_mutex 584 * ensure nobody is actually holding i_mutex
585 */ 585 */
586 mutex_destroy(&inode->i_mutex); 586 mutex_destroy(&inode->i_mutex);
587 mutex_init(&inode->i_mutex); 587 mutex_init(&inode->i_mutex);
588 lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key); 588 lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key);
589 } 589 }
590 #endif 590 #endif
591 /* 591 /*
592 * This is special! We do not need the spinlock 592 * This is special! We do not need the spinlock
593 * when clearing I_LOCK, because we're guaranteed 593 * when clearing I_LOCK, because we're guaranteed
594 * that nobody else tries to do anything about the 594 * that nobody else tries to do anything about the
595 * state of the inode when it is locked, as we 595 * state of the inode when it is locked, as we
596 * just created it (so there can be no old holders 596 * just created it (so there can be no old holders
597 * that haven't tested I_LOCK). 597 * that haven't tested I_LOCK).
598 */ 598 */
599 inode->i_state &= ~(I_LOCK|I_NEW); 599 inode->i_state &= ~(I_LOCK|I_NEW);
600 wake_up_inode(inode); 600 wake_up_inode(inode);
601 } 601 }
602 602
603 EXPORT_SYMBOL(unlock_new_inode); 603 EXPORT_SYMBOL(unlock_new_inode);
604 604
605 /* 605 /*
606 * This is called without the inode lock held. Be careful. 606 * This is called without the inode lock held. Be careful.
607 * 607 *
608 * We no longer cache the sb_flags in i_flags - see fs.h 608 * We no longer cache the sb_flags in i_flags - see fs.h
609 * -- rmk@arm.uk.linux.org 609 * -- rmk@arm.uk.linux.org
610 */ 610 */
611 static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) 611 static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
612 { 612 {
613 struct inode * inode; 613 struct inode * inode;
614 614
615 inode = alloc_inode(sb); 615 inode = alloc_inode(sb);
616 if (inode) { 616 if (inode) {
617 struct inode * old; 617 struct inode * old;
618 618
619 spin_lock(&inode_lock); 619 spin_lock(&inode_lock);
620 /* We released the lock, so.. */ 620 /* We released the lock, so.. */
621 old = find_inode(sb, head, test, data); 621 old = find_inode(sb, head, test, data);
622 if (!old) { 622 if (!old) {
623 if (set(inode, data)) 623 if (set(inode, data))
624 goto set_failed; 624 goto set_failed;
625 625
626 inodes_stat.nr_inodes++; 626 inodes_stat.nr_inodes++;
627 list_add(&inode->i_list, &inode_in_use); 627 list_add(&inode->i_list, &inode_in_use);
628 list_add(&inode->i_sb_list, &sb->s_inodes); 628 list_add(&inode->i_sb_list, &sb->s_inodes);
629 hlist_add_head(&inode->i_hash, head); 629 hlist_add_head(&inode->i_hash, head);
630 inode->i_state = I_LOCK|I_NEW; 630 inode->i_state = I_LOCK|I_NEW;
631 spin_unlock(&inode_lock); 631 spin_unlock(&inode_lock);
632 632
633 /* Return the locked inode with I_NEW set, the 633 /* Return the locked inode with I_NEW set, the
634 * caller is responsible for filling in the contents 634 * caller is responsible for filling in the contents
635 */ 635 */
636 return inode; 636 return inode;
637 } 637 }
638 638
639 /* 639 /*
640 * Uhhuh, somebody else created the same inode under 640 * Uhhuh, somebody else created the same inode under
641 * us. Use the old inode instead of the one we just 641 * us. Use the old inode instead of the one we just
642 * allocated. 642 * allocated.
643 */ 643 */
644 __iget(old); 644 __iget(old);
645 spin_unlock(&inode_lock); 645 spin_unlock(&inode_lock);
646 destroy_inode(inode); 646 destroy_inode(inode);
647 inode = old; 647 inode = old;
648 wait_on_inode(inode); 648 wait_on_inode(inode);
649 } 649 }
650 return inode; 650 return inode;
651 651
652 set_failed: 652 set_failed:
653 spin_unlock(&inode_lock); 653 spin_unlock(&inode_lock);
654 destroy_inode(inode); 654 destroy_inode(inode);
655 return NULL; 655 return NULL;
656 } 656 }
657 657
658 /* 658 /*
659 * get_new_inode_fast is the fast path version of get_new_inode, see the 659 * get_new_inode_fast is the fast path version of get_new_inode, see the
660 * comment at iget_locked for details. 660 * comment at iget_locked for details.
661 */ 661 */
662 static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) 662 static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
663 { 663 {
664 struct inode * inode; 664 struct inode * inode;
665 665
666 inode = alloc_inode(sb); 666 inode = alloc_inode(sb);
667 if (inode) { 667 if (inode) {
668 struct inode * old; 668 struct inode * old;
669 669
670 spin_lock(&inode_lock); 670 spin_lock(&inode_lock);
671 /* We released the lock, so.. */ 671 /* We released the lock, so.. */
672 old = find_inode_fast(sb, head, ino); 672 old = find_inode_fast(sb, head, ino);
673 if (!old) { 673 if (!old) {
674 inode->i_ino = ino; 674 inode->i_ino = ino;
675 inodes_stat.nr_inodes++; 675 inodes_stat.nr_inodes++;
676 list_add(&inode->i_list, &inode_in_use); 676 list_add(&inode->i_list, &inode_in_use);
677 list_add(&inode->i_sb_list, &sb->s_inodes); 677 list_add(&inode->i_sb_list, &sb->s_inodes);
678 hlist_add_head(&inode->i_hash, head); 678 hlist_add_head(&inode->i_hash, head);
679 inode->i_state = I_LOCK|I_NEW; 679 inode->i_state = I_LOCK|I_NEW;
680 spin_unlock(&inode_lock); 680 spin_unlock(&inode_lock);
681 681
682 /* Return the locked inode with I_NEW set, the 682 /* Return the locked inode with I_NEW set, the
683 * caller is responsible for filling in the contents 683 * caller is responsible for filling in the contents
684 */ 684 */
685 return inode; 685 return inode;
686 } 686 }
687 687
688 /* 688 /*
689 * Uhhuh, somebody else created the same inode under 689 * Uhhuh, somebody else created the same inode under
690 * us. Use the old inode instead of the one we just 690 * us. Use the old inode instead of the one we just
691 * allocated. 691 * allocated.
692 */ 692 */
693 __iget(old); 693 __iget(old);
694 spin_unlock(&inode_lock); 694 spin_unlock(&inode_lock);
695 destroy_inode(inode); 695 destroy_inode(inode);
696 inode = old; 696 inode = old;
697 wait_on_inode(inode); 697 wait_on_inode(inode);
698 } 698 }
699 return inode; 699 return inode;
700 } 700 }
701 701
702 static unsigned long hash(struct super_block *sb, unsigned long hashval) 702 static unsigned long hash(struct super_block *sb, unsigned long hashval)
703 { 703 {
704 unsigned long tmp; 704 unsigned long tmp;
705 705
706 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / 706 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
707 L1_CACHE_BYTES; 707 L1_CACHE_BYTES;
708 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); 708 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
709 return tmp & I_HASHMASK; 709 return tmp & I_HASHMASK;
710 } 710 }
711 711
712 /** 712 /**
713 * iunique - get a unique inode number 713 * iunique - get a unique inode number
714 * @sb: superblock 714 * @sb: superblock
715 * @max_reserved: highest reserved inode number 715 * @max_reserved: highest reserved inode number
716 * 716 *
717 * Obtain an inode number that is unique on the system for a given 717 * Obtain an inode number that is unique on the system for a given
718 * superblock. This is used by file systems that have no natural 718 * superblock. This is used by file systems that have no natural
719 * permanent inode numbering system. An inode number is returned that 719 * permanent inode numbering system. An inode number is returned that
720 * is higher than the reserved limit but unique. 720 * is higher than the reserved limit but unique.
721 * 721 *
722 * BUGS: 722 * BUGS:
723 * With a large number of inodes live on the file system this function 723 * With a large number of inodes live on the file system this function
724 * currently becomes quite slow. 724 * currently becomes quite slow.
725 */ 725 */
726 ino_t iunique(struct super_block *sb, ino_t max_reserved) 726 ino_t iunique(struct super_block *sb, ino_t max_reserved)
727 { 727 {
728 /* 728 /*
729 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW 729 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
730 * error if st_ino won't fit in target struct field. Use 32bit counter 730 * error if st_ino won't fit in target struct field. Use 32bit counter
731 * here to attempt to avoid that. 731 * here to attempt to avoid that.
732 */ 732 */
733 static unsigned int counter; 733 static unsigned int counter;
734 struct inode *inode; 734 struct inode *inode;
735 struct hlist_head *head; 735 struct hlist_head *head;
736 ino_t res; 736 ino_t res;
737 737
738 spin_lock(&inode_lock); 738 spin_lock(&inode_lock);
739 do { 739 do {
740 if (counter <= max_reserved) 740 if (counter <= max_reserved)
741 counter = max_reserved + 1; 741 counter = max_reserved + 1;
742 res = counter++; 742 res = counter++;
743 head = inode_hashtable + hash(sb, res); 743 head = inode_hashtable + hash(sb, res);
744 inode = find_inode_fast(sb, head, res); 744 inode = find_inode_fast(sb, head, res);
745 } while (inode != NULL); 745 } while (inode != NULL);
746 spin_unlock(&inode_lock); 746 spin_unlock(&inode_lock);
747 747
748 return res; 748 return res;
749 } 749 }
750 EXPORT_SYMBOL(iunique); 750 EXPORT_SYMBOL(iunique);
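A hedged sketch of the intended use, for a filesystem with no permanent inode numbering; FOOFS_MAX_RESERVED_INO and foofs_new_inode() are hypothetical:

	static struct inode *foofs_new_inode(struct super_block *sb)
	{
		struct inode *inode = new_inode(sb);

		if (!inode)
			return NULL;
		/* Pick a number above the reserved range, then hash the
		 * inode so later iunique() calls see the number as taken. */
		inode->i_ino = iunique(sb, FOOFS_MAX_RESERVED_INO);
		insert_inode_hash(inode);
		return inode;
	}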
751 751
752 struct inode *igrab(struct inode *inode) 752 struct inode *igrab(struct inode *inode)
753 { 753 {
754 spin_lock(&inode_lock); 754 spin_lock(&inode_lock);
755 if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) 755 if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)))
756 __iget(inode); 756 __iget(inode);
757 else 757 else
758 /* 758 /*
759 * Handle the case where s_op->clear_inode has not been 759 * Handle the case where s_op->clear_inode has not been
760 * called yet, and somebody is calling igrab 760 * called yet, and somebody is calling igrab
761 * while the inode is getting freed. 761 * while the inode is getting freed.
762 */ 762 */
763 inode = NULL; 763 inode = NULL;
764 spin_unlock(&inode_lock); 764 spin_unlock(&inode_lock);
765 return inode; 765 return inode;
766 } 766 }
767 767
768 EXPORT_SYMBOL(igrab); 768 EXPORT_SYMBOL(igrab);
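A small sketch of typical use: pinning an inode found through fs-private bookkeeping before sleeping. The -ESTALE return is an arbitrary illustrative choice:

	struct inode *pinned;

	/* igrab() returns NULL if the inode is already on its way out
	 * (I_FREEING/I_WILL_FREE/I_CLEAR); it must not be touched then. */
	pinned = igrab(inode);
	if (!pinned)
		return -ESTALE;
	/* ... work that may sleep or drop other locks ... */
	iput(pinned);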
769 769
770 /** 770 /**
771 * ifind - internal function, you want ilookup5() or iget5(). 771 * ifind - internal function, you want ilookup5() or iget5().
772 * @sb: super block of file system to search 772 * @sb: super block of file system to search
773 * @head: the head of the list to search 773 * @head: the head of the list to search
774 * @test: callback used for comparisons between inodes 774 * @test: callback used for comparisons between inodes
775 * @data: opaque data pointer to pass to @test 775 * @data: opaque data pointer to pass to @test
776 * @wait: if true wait for the inode to be unlocked, if false do not 776 * @wait: if true wait for the inode to be unlocked, if false do not
777 * 777 *
778 * ifind() searches for the inode specified by @data in the inode 778 * ifind() searches for the inode specified by @data in the inode
779 * cache. This is a generalized version of ifind_fast() for file systems where 779 * cache. This is a generalized version of ifind_fast() for file systems where
780 * the inode number is not sufficient for unique identification of an inode. 780 * the inode number is not sufficient for unique identification of an inode.
781 * 781 *
782 * If the inode is in the cache, the inode is returned with an incremented 782 * If the inode is in the cache, the inode is returned with an incremented
783 * reference count. 783 * reference count.
784 * 784 *
785 * Otherwise NULL is returned. 785 * Otherwise NULL is returned.
786 * 786 *
787 * Note, @test is called with the inode_lock held, so can't sleep. 787 * Note, @test is called with the inode_lock held, so can't sleep.
788 */ 788 */
789 static struct inode *ifind(struct super_block *sb, 789 static struct inode *ifind(struct super_block *sb,
790 struct hlist_head *head, int (*test)(struct inode *, void *), 790 struct hlist_head *head, int (*test)(struct inode *, void *),
791 void *data, const int wait) 791 void *data, const int wait)
792 { 792 {
793 struct inode *inode; 793 struct inode *inode;
794 794
795 spin_lock(&inode_lock); 795 spin_lock(&inode_lock);
796 inode = find_inode(sb, head, test, data); 796 inode = find_inode(sb, head, test, data);
797 if (inode) { 797 if (inode) {
798 __iget(inode); 798 __iget(inode);
799 spin_unlock(&inode_lock); 799 spin_unlock(&inode_lock);
800 if (likely(wait)) 800 if (likely(wait))
801 wait_on_inode(inode); 801 wait_on_inode(inode);
802 return inode; 802 return inode;
803 } 803 }
804 spin_unlock(&inode_lock); 804 spin_unlock(&inode_lock);
805 return NULL; 805 return NULL;
806 } 806 }
807 807
808 /** 808 /**
809 * ifind_fast - internal function, you want ilookup() or iget(). 809 * ifind_fast - internal function, you want ilookup() or iget().
810 * @sb: super block of file system to search 810 * @sb: super block of file system to search
811 * @head: head of the list to search 811 * @head: head of the list to search
812 * @ino: inode number to search for 812 * @ino: inode number to search for
813 * 813 *
814 * ifind_fast() searches for the inode @ino in the inode cache. This is for 814 * ifind_fast() searches for the inode @ino in the inode cache. This is for
815 * file systems where the inode number is sufficient for unique identification 815 * file systems where the inode number is sufficient for unique identification
816 * of an inode. 816 * of an inode.
817 * 817 *
818 * If the inode is in the cache, the inode is returned with an incremented 818 * If the inode is in the cache, the inode is returned with an incremented
819 * reference count. 819 * reference count.
820 * 820 *
821 * Otherwise NULL is returned. 821 * Otherwise NULL is returned.
822 */ 822 */
823 static struct inode *ifind_fast(struct super_block *sb, 823 static struct inode *ifind_fast(struct super_block *sb,
824 struct hlist_head *head, unsigned long ino) 824 struct hlist_head *head, unsigned long ino)
825 { 825 {
826 struct inode *inode; 826 struct inode *inode;
827 827
828 spin_lock(&inode_lock); 828 spin_lock(&inode_lock);
829 inode = find_inode_fast(sb, head, ino); 829 inode = find_inode_fast(sb, head, ino);
830 if (inode) { 830 if (inode) {
831 __iget(inode); 831 __iget(inode);
832 spin_unlock(&inode_lock); 832 spin_unlock(&inode_lock);
833 wait_on_inode(inode); 833 wait_on_inode(inode);
834 return inode; 834 return inode;
835 } 835 }
836 spin_unlock(&inode_lock); 836 spin_unlock(&inode_lock);
837 return NULL; 837 return NULL;
838 } 838 }
839 839
840 /** 840 /**
841 * ilookup5_nowait - search for an inode in the inode cache 841 * ilookup5_nowait - search for an inode in the inode cache
842 * @sb: super block of file system to search 842 * @sb: super block of file system to search
843 * @hashval: hash value (usually inode number) to search for 843 * @hashval: hash value (usually inode number) to search for
844 * @test: callback used for comparisons between inodes 844 * @test: callback used for comparisons between inodes
845 * @data: opaque data pointer to pass to @test 845 * @data: opaque data pointer to pass to @test
846 * 846 *
847 * ilookup5_nowait() uses ifind() to search for the inode specified by @hashval and 847 * ilookup5_nowait() uses ifind() to search for the inode specified by @hashval and
848 * @data in the inode cache. This is a generalized version of ilookup() for 848 * @data in the inode cache. This is a generalized version of ilookup() for
849 * file systems where the inode number is not sufficient for unique 849 * file systems where the inode number is not sufficient for unique
850 * identification of an inode. 850 * identification of an inode.
851 * 851 *
852 * If the inode is in the cache, the inode is returned with an incremented 852 * If the inode is in the cache, the inode is returned with an incremented
853 * reference count. Note, the inode lock is not waited upon so you have to be 853 * reference count. Note, the inode lock is not waited upon so you have to be
854 * very careful what you do with the returned inode. You probably should be 854 * very careful what you do with the returned inode. You probably should be
855 * using ilookup5() instead. 855 * using ilookup5() instead.
856 * 856 *
857 * Otherwise NULL is returned. 857 * Otherwise NULL is returned.
858 * 858 *
859 * Note, @test is called with the inode_lock held, so can't sleep. 859 * Note, @test is called with the inode_lock held, so can't sleep.
860 */ 860 */
861 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 861 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
862 int (*test)(struct inode *, void *), void *data) 862 int (*test)(struct inode *, void *), void *data)
863 { 863 {
864 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 864 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
865 865
866 return ifind(sb, head, test, data, 0); 866 return ifind(sb, head, test, data, 0);
867 } 867 }
868 868
869 EXPORT_SYMBOL(ilookup5_nowait); 869 EXPORT_SYMBOL(ilookup5_nowait);
870 870
871 /** 871 /**
872 * ilookup5 - search for an inode in the inode cache 872 * ilookup5 - search for an inode in the inode cache
873 * @sb: super block of file system to search 873 * @sb: super block of file system to search
874 * @hashval: hash value (usually inode number) to search for 874 * @hashval: hash value (usually inode number) to search for
875 * @test: callback used for comparisons between inodes 875 * @test: callback used for comparisons between inodes
876 * @data: opaque data pointer to pass to @test 876 * @data: opaque data pointer to pass to @test
877 * 877 *
878 * ilookup5() uses ifind() to search for the inode specified by @hashval and 878 * ilookup5() uses ifind() to search for the inode specified by @hashval and
879 * @data in the inode cache. This is a generalized version of ilookup() for 879 * @data in the inode cache. This is a generalized version of ilookup() for
880 * file systems where the inode number is not sufficient for unique 880 * file systems where the inode number is not sufficient for unique
881 * identification of an inode. 881 * identification of an inode.
882 * 882 *
883 * If the inode is in the cache, the inode lock is waited upon and the inode is 883 * If the inode is in the cache, the inode lock is waited upon and the inode is
884 * returned with an incremented reference count. 884 * returned with an incremented reference count.
885 * 885 *
886 * Otherwise NULL is returned. 886 * Otherwise NULL is returned.
887 * 887 *
888 * Note, @test is called with the inode_lock held, so can't sleep. 888 * Note, @test is called with the inode_lock held, so can't sleep.
889 */ 889 */
890 struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 890 struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
891 int (*test)(struct inode *, void *), void *data) 891 int (*test)(struct inode *, void *), void *data)
892 { 892 {
893 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 893 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
894 894
895 return ifind(sb, head, test, data, 1); 895 return ifind(sb, head, test, data, 1);
896 } 896 }
897 897
898 EXPORT_SYMBOL(ilookup5); 898 EXPORT_SYMBOL(ilookup5);
899 899
900 /** 900 /**
901 * ilookup - search for an inode in the inode cache 901 * ilookup - search for an inode in the inode cache
902 * @sb: super block of file system to search 902 * @sb: super block of file system to search
903 * @ino: inode number to search for 903 * @ino: inode number to search for
904 * 904 *
905 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. 905 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
906 * This is for file systems where the inode number is sufficient for unique 906 * This is for file systems where the inode number is sufficient for unique
907 * identification of an inode. 907 * identification of an inode.
908 * 908 *
909 * If the inode is in the cache, the inode is returned with an incremented 909 * If the inode is in the cache, the inode is returned with an incremented
910 * reference count. 910 * reference count.
911 * 911 *
912 * Otherwise NULL is returned. 912 * Otherwise NULL is returned.
913 */ 913 */
914 struct inode *ilookup(struct super_block *sb, unsigned long ino) 914 struct inode *ilookup(struct super_block *sb, unsigned long ino)
915 { 915 {
916 struct hlist_head *head = inode_hashtable + hash(sb, ino); 916 struct hlist_head *head = inode_hashtable + hash(sb, ino);
917 917
918 return ifind_fast(sb, head, ino); 918 return ifind_fast(sb, head, ino);
919 } 919 }
920 920
921 EXPORT_SYMBOL(ilookup); 921 EXPORT_SYMBOL(ilookup);
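A sketch of a cache-only probe, e.g. dropping cached pages for an inode only if it is already in core; the invalidation step is just one plausible use:

	struct inode *inode = ilookup(sb, ino);

	if (inode) {
		/* Hit: we own an extra reference and must iput() it. */
		invalidate_mapping_pages(inode->i_mapping, 0, -1);
		iput(inode);
	}
	/* A miss (NULL) is not an error; nothing was read from disk. */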
922 922
923 /** 923 /**
924 * iget5_locked - obtain an inode from a mounted file system 924 * iget5_locked - obtain an inode from a mounted file system
925 * @sb: super block of file system 925 * @sb: super block of file system
926 * @hashval: hash value (usually inode number) to get 926 * @hashval: hash value (usually inode number) to get
927 * @test: callback used for comparisons between inodes 927 * @test: callback used for comparisons between inodes
928 * @set: callback used to initialize a new struct inode 928 * @set: callback used to initialize a new struct inode
929 * @data: opaque data pointer to pass to @test and @set 929 * @data: opaque data pointer to pass to @test and @set
930 * 930 *
931 * This is iget() without the read_inode() portion of get_new_inode().
932 *
933 * iget5_locked() uses ifind() to search for the inode specified by @hashval 931 * iget5_locked() uses ifind() to search for the inode specified by @hashval
934 * and @data in the inode cache and if present it is returned with an increased 932 * and @data in the inode cache and if present it is returned with an increased
935 * reference count. This is a generalized version of iget_locked() for file 933 * reference count. This is a generalized version of iget_locked() for file
936 * systems where the inode number is not sufficient for unique identification 934 * systems where the inode number is not sufficient for unique identification
937 * of an inode. 935 * of an inode.
938 * 936 *
939 * If the inode is not in cache, get_new_inode() is called to allocate a new 937 * If the inode is not in cache, get_new_inode() is called to allocate a new
940 * inode and this is returned locked, hashed, and with the I_NEW flag set. The 938 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
941 * file system gets to fill it in before unlocking it via unlock_new_inode(). 939 * file system gets to fill it in before unlocking it via unlock_new_inode().
942 * 940 *
943 * Note both @test and @set are called with the inode_lock held, so can't sleep. 941 * Note both @test and @set are called with the inode_lock held, so can't sleep.
944 */ 942 */
945 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, 943 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
946 int (*test)(struct inode *, void *), 944 int (*test)(struct inode *, void *),
947 int (*set)(struct inode *, void *), void *data) 945 int (*set)(struct inode *, void *), void *data)
948 { 946 {
949 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 947 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
950 struct inode *inode; 948 struct inode *inode;
951 949
952 inode = ifind(sb, head, test, data, 1); 950 inode = ifind(sb, head, test, data, 1);
953 if (inode) 951 if (inode)
954 return inode; 952 return inode;
955 /* 953 /*
956 * get_new_inode() will do the right thing, re-trying the search 954 * get_new_inode() will do the right thing, re-trying the search
957 * in case it had to block at any point. 955 * in case it had to block at any point.
958 */ 956 */
959 return get_new_inode(sb, head, test, set, data); 957 return get_new_inode(sb, head, test, set, data);
960 } 958 }
961 959
962 EXPORT_SYMBOL(iget5_locked); 960 EXPORT_SYMBOL(iget5_locked);
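A hedged sketch of the test/set pair for a filesystem whose inodes are identified by a key wider than i_ino; FOOFS_I() (a container_of() accessor) and the objectid field are hypothetical:

	struct foofs_iget_args {
		u64 objectid;
	};

	static int foofs_test(struct inode *inode, void *p)
	{
		struct foofs_iget_args *args = p;

		return FOOFS_I(inode)->objectid == args->objectid;
	}

	static int foofs_set(struct inode *inode, void *p)
	{
		struct foofs_iget_args *args = p;

		FOOFS_I(inode)->objectid = args->objectid;
		return 0;
	}

	/* ... in the lookup path; neither callback may sleep ... */
	struct foofs_iget_args args = { .objectid = objectid };
	struct inode *inode;

	inode = iget5_locked(sb, (unsigned long)objectid,
			     foofs_test, foofs_set, &args);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		/* fill the inode in from backing store, then: */
		unlock_new_inode(inode);
	}
	return inode;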
963 961
964 /** 962 /**
965 * iget_locked - obtain an inode from a mounted file system 963 * iget_locked - obtain an inode from a mounted file system
966 * @sb: super block of file system 964 * @sb: super block of file system
967 * @ino: inode number to get 965 * @ino: inode number to get
968 *
969 * This is iget() without the read_inode() portion of get_new_inode_fast().
970 * 966 *
971 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in 967 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
972 * the inode cache and if present it is returned with an increased reference 968 * the inode cache and if present it is returned with an increased reference
973 * count. This is for file systems where the inode number is sufficient for 969 * count. This is for file systems where the inode number is sufficient for
974 * unique identification of an inode. 970 * unique identification of an inode.
975 * 971 *
976 * If the inode is not in cache, get_new_inode_fast() is called to allocate a 972 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
977 * new inode and this is returned locked, hashed, and with the I_NEW flag set. 973 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
978 * The file system gets to fill it in before unlocking it via 974 * The file system gets to fill it in before unlocking it via
979 * unlock_new_inode(). 975 * unlock_new_inode().
980 */ 976 */
981 struct inode *iget_locked(struct super_block *sb, unsigned long ino) 977 struct inode *iget_locked(struct super_block *sb, unsigned long ino)
982 { 978 {
983 struct hlist_head *head = inode_hashtable + hash(sb, ino); 979 struct hlist_head *head = inode_hashtable + hash(sb, ino);
984 struct inode *inode; 980 struct inode *inode;
985 981
986 inode = ifind_fast(sb, head, ino); 982 inode = ifind_fast(sb, head, ino);
987 if (inode) 983 if (inode)
988 return inode; 984 return inode;
989 /* 985 /*
990 * get_new_inode_fast() will do the right thing, re-trying the search 986 * get_new_inode_fast() will do the right thing, re-trying the search
991 * in case it had to block at any point. 987 * in case it had to block at any point.
992 */ 988 */
993 return get_new_inode_fast(sb, head, ino); 989 return get_new_inode_fast(sb, head, ino);
994 } 990 }
995 991
996 EXPORT_SYMBOL(iget_locked); 992 EXPORT_SYMBOL(iget_locked);
997 993
998 /** 994 /**
999 * __insert_inode_hash - hash an inode 995 * __insert_inode_hash - hash an inode
1000 * @inode: unhashed inode 996 * @inode: unhashed inode
1001 * @hashval: unsigned long value used to locate this object in the 997 * @hashval: unsigned long value used to locate this object in the
1002 * inode_hashtable. 998 * inode_hashtable.
1003 * 999 *
1004 * Add an inode to the inode hash for this superblock. 1000 * Add an inode to the inode hash for this superblock.
1005 */ 1001 */
1006 void __insert_inode_hash(struct inode *inode, unsigned long hashval) 1002 void __insert_inode_hash(struct inode *inode, unsigned long hashval)
1007 { 1003 {
1008 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); 1004 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
1009 spin_lock(&inode_lock); 1005 spin_lock(&inode_lock);
1010 hlist_add_head(&inode->i_hash, head); 1006 hlist_add_head(&inode->i_hash, head);
1011 spin_unlock(&inode_lock); 1007 spin_unlock(&inode_lock);
1012 } 1008 }
1013 1009
1014 EXPORT_SYMBOL(__insert_inode_hash); 1010 EXPORT_SYMBOL(__insert_inode_hash);
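A brief sketch of why the hash value is a separate argument: an inode built by hand can be hashed under an fs-chosen key rather than i_ino, provided later lookups use ilookup5()/iget5_locked() with the same value. FOOFS_I() and objectid are hypothetical:

	/* i_ino may be a non-unique, user-visible number; the hash key is
	 * the (hypothetical) 64-bit object id truncated to unsigned long. */
	__insert_inode_hash(inode, (unsigned long)FOOFS_I(inode)->objectid);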
1015 1011
1016 /** 1012 /**
1017 * remove_inode_hash - remove an inode from the hash 1013 * remove_inode_hash - remove an inode from the hash
1018 * @inode: inode to unhash 1014 * @inode: inode to unhash
1019 * 1015 *
1020 * Remove an inode from the inode hash table. 1016 * Remove an inode from the inode hash table.
1021 */ 1017 */
1022 void remove_inode_hash(struct inode *inode) 1018 void remove_inode_hash(struct inode *inode)
1023 { 1019 {
1024 spin_lock(&inode_lock); 1020 spin_lock(&inode_lock);
1025 hlist_del_init(&inode->i_hash); 1021 hlist_del_init(&inode->i_hash);
1026 spin_unlock(&inode_lock); 1022 spin_unlock(&inode_lock);
1027 } 1023 }
1028 1024
1029 EXPORT_SYMBOL(remove_inode_hash); 1025 EXPORT_SYMBOL(remove_inode_hash);
1030 1026
1031 /* 1027 /*
1032 * Tell the filesystem that this inode is no longer of any interest and should 1028 * Tell the filesystem that this inode is no longer of any interest and should
1033 * be completely destroyed. 1029 * be completely destroyed.
1034 * 1030 *
1035 * We leave the inode in the inode hash table until *after* the filesystem's 1031 * We leave the inode in the inode hash table until *after* the filesystem's
1036 * ->delete_inode completes. This ensures that an iget (such as nfsd might 1032 * ->delete_inode completes. This ensures that an iget (such as nfsd might
1037 * instigate) will always find up-to-date information either in the hash or on 1033 * instigate) will always find up-to-date information either in the hash or on
1038 * disk. 1034 * disk.
1039 * 1035 *
1040 * I_FREEING is set so that no-one will take a new reference to the inode while 1036 * I_FREEING is set so that no-one will take a new reference to the inode while
1041 * it is being deleted. 1037 * it is being deleted.
1042 */ 1038 */
1043 void generic_delete_inode(struct inode *inode) 1039 void generic_delete_inode(struct inode *inode)
1044 { 1040 {
1045 const struct super_operations *op = inode->i_sb->s_op; 1041 const struct super_operations *op = inode->i_sb->s_op;
1046 1042
1047 list_del_init(&inode->i_list); 1043 list_del_init(&inode->i_list);
1048 list_del_init(&inode->i_sb_list); 1044 list_del_init(&inode->i_sb_list);
1049 inode->i_state |= I_FREEING; 1045 inode->i_state |= I_FREEING;
1050 inodes_stat.nr_inodes--; 1046 inodes_stat.nr_inodes--;
1051 spin_unlock(&inode_lock); 1047 spin_unlock(&inode_lock);
1052 1048
1053 security_inode_delete(inode); 1049 security_inode_delete(inode);
1054 1050
1055 if (op->delete_inode) { 1051 if (op->delete_inode) {
1056 void (*delete)(struct inode *) = op->delete_inode; 1052 void (*delete)(struct inode *) = op->delete_inode;
1057 if (!is_bad_inode(inode)) 1053 if (!is_bad_inode(inode))
1058 DQUOT_INIT(inode); 1054 DQUOT_INIT(inode);
1059 /* Filesystems implementing their own 1055 /* Filesystems implementing their own
1060 * s_op->delete_inode are required to call 1056 * s_op->delete_inode are required to call
1061 * truncate_inode_pages and clear_inode() 1057 * truncate_inode_pages and clear_inode()
1062 * internally */ 1058 * internally */
1063 delete(inode); 1059 delete(inode);
1064 } else { 1060 } else {
1065 truncate_inode_pages(&inode->i_data, 0); 1061 truncate_inode_pages(&inode->i_data, 0);
1066 clear_inode(inode); 1062 clear_inode(inode);
1067 } 1063 }
1068 spin_lock(&inode_lock); 1064 spin_lock(&inode_lock);
1069 hlist_del_init(&inode->i_hash); 1065 hlist_del_init(&inode->i_hash);
1070 spin_unlock(&inode_lock); 1066 spin_unlock(&inode_lock);
1071 wake_up_inode(inode); 1067 wake_up_inode(inode);
1072 BUG_ON(inode->i_state != I_CLEAR); 1068 BUG_ON(inode->i_state != I_CLEAR);
1073 destroy_inode(inode); 1069 destroy_inode(inode);
1074 } 1070 }
1075 1071
1076 EXPORT_SYMBOL(generic_delete_inode); 1072 EXPORT_SYMBOL(generic_delete_inode);
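As the comment inside generic_delete_inode() says, a filesystem supplying its own ->delete_inode must truncate the pagecache and call clear_inode() itself. A rough sketch; foofs_free_blocks() stands in for whatever on-disk cleanup the filesystem actually needs:

	static void foofs_delete_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);

		/* I_FREEING is already set, so no new references can appear
		 * while the on-disk object is being released. */
		foofs_free_blocks(inode);

		clear_inode(inode);
	}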
1077 1073
1078 static void generic_forget_inode(struct inode *inode) 1074 static void generic_forget_inode(struct inode *inode)
1079 { 1075 {
1080 struct super_block *sb = inode->i_sb; 1076 struct super_block *sb = inode->i_sb;
1081 1077
1082 if (!hlist_unhashed(&inode->i_hash)) { 1078 if (!hlist_unhashed(&inode->i_hash)) {
1083 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1079 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1084 list_move(&inode->i_list, &inode_unused); 1080 list_move(&inode->i_list, &inode_unused);
1085 inodes_stat.nr_unused++; 1081 inodes_stat.nr_unused++;
1086 if (sb->s_flags & MS_ACTIVE) { 1082 if (sb->s_flags & MS_ACTIVE) {
1087 spin_unlock(&inode_lock); 1083 spin_unlock(&inode_lock);
1088 return; 1084 return;
1089 } 1085 }
1090 inode->i_state |= I_WILL_FREE; 1086 inode->i_state |= I_WILL_FREE;
1091 spin_unlock(&inode_lock); 1087 spin_unlock(&inode_lock);
1092 write_inode_now(inode, 1); 1088 write_inode_now(inode, 1);
1093 spin_lock(&inode_lock); 1089 spin_lock(&inode_lock);
1094 inode->i_state &= ~I_WILL_FREE; 1090 inode->i_state &= ~I_WILL_FREE;
1095 inodes_stat.nr_unused--; 1091 inodes_stat.nr_unused--;
1096 hlist_del_init(&inode->i_hash); 1092 hlist_del_init(&inode->i_hash);
1097 } 1093 }
1098 list_del_init(&inode->i_list); 1094 list_del_init(&inode->i_list);
1099 list_del_init(&inode->i_sb_list); 1095 list_del_init(&inode->i_sb_list);
1100 inode->i_state |= I_FREEING; 1096 inode->i_state |= I_FREEING;
1101 inodes_stat.nr_inodes--; 1097 inodes_stat.nr_inodes--;
1102 spin_unlock(&inode_lock); 1098 spin_unlock(&inode_lock);
1103 if (inode->i_data.nrpages) 1099 if (inode->i_data.nrpages)
1104 truncate_inode_pages(&inode->i_data, 0); 1100 truncate_inode_pages(&inode->i_data, 0);
1105 clear_inode(inode); 1101 clear_inode(inode);
1106 wake_up_inode(inode); 1102 wake_up_inode(inode);
1107 destroy_inode(inode); 1103 destroy_inode(inode);
1108 } 1104 }
1109 1105
1110 /* 1106 /*
1111 * Normal UNIX filesystem behaviour: delete the 1107 * Normal UNIX filesystem behaviour: delete the
1112 * inode when the usage count drops to zero, and 1108 * inode when the usage count drops to zero, and
1113 * i_nlink is zero. 1109 * i_nlink is zero.
1114 */ 1110 */
1115 void generic_drop_inode(struct inode *inode) 1111 void generic_drop_inode(struct inode *inode)
1116 { 1112 {
1117 if (!inode->i_nlink) 1113 if (!inode->i_nlink)
1118 generic_delete_inode(inode); 1114 generic_delete_inode(inode);
1119 else 1115 else
1120 generic_forget_inode(inode); 1116 generic_forget_inode(inode);
1121 } 1117 }
1122 1118
1123 EXPORT_SYMBOL_GPL(generic_drop_inode); 1119 EXPORT_SYMBOL_GPL(generic_drop_inode);
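A filesystem that never wants unreferenced inodes cached can point ->drop_inode at generic_delete_inode() instead; a minimal sketch (the foofs name is hypothetical and the policy is only an example):

	static const struct super_operations foofs_super_ops = {
		/* Delete on final iput() rather than parking the inode on
		 * inode_unused.  Like any drop_inode hook, this is entered
		 * with inode_lock held and is responsible for releasing it,
		 * which generic_delete_inode() does. */
		.drop_inode	= generic_delete_inode,
	};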
1124 1120
1125 /* 1121 /*
1126 * Called when we're dropping the last reference 1122 * Called when we're dropping the last reference
1127 * to an inode. 1123 * to an inode.
1128 * 1124 *
1129 * Call the FS "drop()" function, defaulting to 1125 * Call the FS "drop()" function, defaulting to
1130 * the legacy UNIX filesystem behaviour.. 1126 * the legacy UNIX filesystem behaviour..
1131 * 1127 *
1132 * NOTE! NOTE! NOTE! We're called with the inode lock 1128 * NOTE! NOTE! NOTE! We're called with the inode lock
1133 * held, and the drop function is supposed to release 1129 * held, and the drop function is supposed to release
1134 * the lock! 1130 * the lock!
1135 */ 1131 */
1136 static inline void iput_final(struct inode *inode) 1132 static inline void iput_final(struct inode *inode)
1137 { 1133 {
1138 const struct super_operations *op = inode->i_sb->s_op; 1134 const struct super_operations *op = inode->i_sb->s_op;
1139 void (*drop)(struct inode *) = generic_drop_inode; 1135 void (*drop)(struct inode *) = generic_drop_inode;
1140 1136
1141 if (op && op->drop_inode) 1137 if (op && op->drop_inode)
1142 drop = op->drop_inode; 1138 drop = op->drop_inode;
1143 drop(inode); 1139 drop(inode);
1144 } 1140 }
1145 1141
1146 /** 1142 /**
1147 * iput - put an inode 1143 * iput - put an inode
1148 * @inode: inode to put 1144 * @inode: inode to put
1149 * 1145 *
1150 * Puts an inode, dropping its usage count. If the inode use count hits 1146 * Puts an inode, dropping its usage count. If the inode use count hits
1151 * zero, the inode is then freed and may also be destroyed. 1147 * zero, the inode is then freed and may also be destroyed.
1152 * 1148 *
1153 * Consequently, iput() can sleep. 1149 * Consequently, iput() can sleep.
1154 */ 1150 */
1155 void iput(struct inode *inode) 1151 void iput(struct inode *inode)
1156 { 1152 {
1157 if (inode) { 1153 if (inode) {
1158 const struct super_operations *op = inode->i_sb->s_op; 1154 const struct super_operations *op = inode->i_sb->s_op;
1159 1155
1160 BUG_ON(inode->i_state == I_CLEAR); 1156 BUG_ON(inode->i_state == I_CLEAR);
1161 1157
1162 if (op && op->put_inode) 1158 if (op && op->put_inode)
1163 op->put_inode(inode); 1159 op->put_inode(inode);
1164 1160
1165 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1161 if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
1166 iput_final(inode); 1162 iput_final(inode);
1167 } 1163 }
1168 } 1164 }
1169 1165
1170 EXPORT_SYMBOL(iput); 1166 EXPORT_SYMBOL(iput);
1171 1167
1172 /** 1168 /**
1173 * bmap - find a block number in a file 1169 * bmap - find a block number in a file
1174 * @inode: inode of file 1170 * @inode: inode of file
1175 * @block: block to find 1171 * @block: block to find
1176 * 1172 *
1177 * Returns the block number on the device holding the inode that 1173 * Returns the block number on the device holding the inode that
1178 * is the disk block number for the block of the file requested. 1174 * is the disk block number for the block of the file requested.
1179 * That is, if asked for block 4 of inode 1, the function will return the 1175 * That is, if asked for block 4 of inode 1, the function will return the
1180 * disk block relative to the disk start that holds that block of the 1176 * disk block relative to the disk start that holds that block of the
1181 * file. 1177 * file.
1182 */ 1178 */
1183 sector_t bmap(struct inode * inode, sector_t block) 1179 sector_t bmap(struct inode * inode, sector_t block)
1184 { 1180 {
1185 sector_t res = 0; 1181 sector_t res = 0;
1186 if (inode->i_mapping->a_ops->bmap) 1182 if (inode->i_mapping->a_ops->bmap)
1187 res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); 1183 res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
1188 return res; 1184 return res;
1189 } 1185 }
1190 EXPORT_SYMBOL(bmap); 1186 EXPORT_SYMBOL(bmap);
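A hedged example of a caller, roughly what the FIBMAP ioctl path boils down to; a result of 0 means the block is a hole or the filesystem has no bmap method:

	sector_t blkno;

	blkno = bmap(inode, 4);		/* file block 4 -> device-relative block */
	if (!blkno)
		printk(KERN_DEBUG "block 4 is unmapped or bmap unsupported\n");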
1191 1187
1192 /** 1188 /**
1193 * touch_atime - update the access time 1189 * touch_atime - update the access time
1194 * @mnt: mount the inode is accessed on 1190 * @mnt: mount the inode is accessed on
1195 * @dentry: dentry accessed 1191 * @dentry: dentry accessed
1196 * 1192 *
1197 * Update the accessed time on an inode and mark it for writeback. 1193 * Update the accessed time on an inode and mark it for writeback.
1198 * This function automatically handles read only file systems and media, 1194 * This function automatically handles read only file systems and media,
1199 * as well as the "noatime" flag and inode specific "noatime" markers. 1195 * as well as the "noatime" flag and inode specific "noatime" markers.
1200 */ 1196 */
1201 void touch_atime(struct vfsmount *mnt, struct dentry *dentry) 1197 void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1202 { 1198 {
1203 struct inode *inode = dentry->d_inode; 1199 struct inode *inode = dentry->d_inode;
1204 struct timespec now; 1200 struct timespec now;
1205 1201
1206 if (inode->i_flags & S_NOATIME) 1202 if (inode->i_flags & S_NOATIME)
1207 return; 1203 return;
1208 if (IS_NOATIME(inode)) 1204 if (IS_NOATIME(inode))
1209 return; 1205 return;
1210 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) 1206 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1211 return; 1207 return;
1212 1208
1213 /* 1209 /*
1214 * We may have a NULL vfsmount when coming from NFSD 1210 * We may have a NULL vfsmount when coming from NFSD
1215 */ 1211 */
1216 if (mnt) { 1212 if (mnt) {
1217 if (mnt->mnt_flags & MNT_NOATIME) 1213 if (mnt->mnt_flags & MNT_NOATIME)
1218 return; 1214 return;
1219 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1215 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1220 return; 1216 return;
1221 1217
1222 if (mnt->mnt_flags & MNT_RELATIME) { 1218 if (mnt->mnt_flags & MNT_RELATIME) {
1223 /* 1219 /*
1224 * With relative atime, only update atime if the 1220 * With relative atime, only update atime if the
1225 * previous atime is earlier than either the ctime or 1221 * previous atime is earlier than either the ctime or
1226 * mtime. 1222 * mtime.
1227 */ 1223 */
1228 if (timespec_compare(&inode->i_mtime, 1224 if (timespec_compare(&inode->i_mtime,
1229 &inode->i_atime) < 0 && 1225 &inode->i_atime) < 0 &&
1230 timespec_compare(&inode->i_ctime, 1226 timespec_compare(&inode->i_ctime,
1231 &inode->i_atime) < 0) 1227 &inode->i_atime) < 0)
1232 return; 1228 return;
1233 } 1229 }
1234 } 1230 }
1235 1231
1236 now = current_fs_time(inode->i_sb); 1232 now = current_fs_time(inode->i_sb);
1237 if (timespec_equal(&inode->i_atime, &now)) 1233 if (timespec_equal(&inode->i_atime, &now))
1238 return; 1234 return;
1239 1235
1240 inode->i_atime = now; 1236 inode->i_atime = now;
1241 mark_inode_dirty_sync(inode); 1237 mark_inode_dirty_sync(inode);
1242 } 1238 }
1243 EXPORT_SYMBOL(touch_atime); 1239 EXPORT_SYMBOL(touch_atime);
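For illustration, a read path would typically call this once the data has been copied out, along the lines of the following (file is assumed to be the struct file being read):

	/* record the access on the file's inode and mount */
	touch_atime(file->f_path.mnt, file->f_path.dentry);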
1244 1240
1245 /** 1241 /**
1246 * file_update_time - update mtime and ctime time 1242 * file_update_time - update mtime and ctime time
1247 * @file: file accessed 1243 * @file: file accessed
1248 * 1244 *
1249 * Update the mtime and ctime members of an inode and mark the inode 1245 * Update the mtime and ctime members of an inode and mark the inode
1250 * for writeback. Note that this function is meant exclusively for 1246 * for writeback. Note that this function is meant exclusively for
1251 * usage in the file write path of filesystems, and filesystems may 1247 * usage in the file write path of filesystems, and filesystems may
1252 * choose to explicitly ignore updates via this function with the 1248 * choose to explicitly ignore updates via this function with the
1253 * S_NOCMTIME inode flag, e.g. for network filesystems where these 1249 * S_NOCMTIME inode flag, e.g. for network filesystems where these
1254 * timestamps are handled by the server. 1250 * timestamps are handled by the server.
1255 */ 1251 */
1256 1252
1257 void file_update_time(struct file *file) 1253 void file_update_time(struct file *file)
1258 { 1254 {
1259 struct inode *inode = file->f_path.dentry->d_inode; 1255 struct inode *inode = file->f_path.dentry->d_inode;
1260 struct timespec now; 1256 struct timespec now;
1261 int sync_it = 0; 1257 int sync_it = 0;
1262 1258
1263 if (IS_NOCMTIME(inode)) 1259 if (IS_NOCMTIME(inode))
1264 return; 1260 return;
1265 if (IS_RDONLY(inode)) 1261 if (IS_RDONLY(inode))
1266 return; 1262 return;
1267 1263
1268 now = current_fs_time(inode->i_sb); 1264 now = current_fs_time(inode->i_sb);
1269 if (!timespec_equal(&inode->i_mtime, &now)) { 1265 if (!timespec_equal(&inode->i_mtime, &now)) {
1270 inode->i_mtime = now; 1266 inode->i_mtime = now;
1271 sync_it = 1; 1267 sync_it = 1;
1272 } 1268 }
1273 1269
1274 if (!timespec_equal(&inode->i_ctime, &now)) { 1270 if (!timespec_equal(&inode->i_ctime, &now)) {
1275 inode->i_ctime = now; 1271 inode->i_ctime = now;
1276 sync_it = 1; 1272 sync_it = 1;
1277 } 1273 }
1278 1274
1279 if (IS_I_VERSION(inode)) { 1275 if (IS_I_VERSION(inode)) {
1280 inode_inc_iversion(inode); 1276 inode_inc_iversion(inode);
1281 sync_it = 1; 1277 sync_it = 1;
1282 } 1278 }
1283 1279
1284 if (sync_it) 1280 if (sync_it)
1285 mark_inode_dirty_sync(inode); 1281 mark_inode_dirty_sync(inode);
1286 } 1282 }
1287 1283
1288 EXPORT_SYMBOL(file_update_time); 1284 EXPORT_SYMBOL(file_update_time);
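A sketch of the intended call site, with thingyfs_write() as a hypothetical ->write implementation; the point is to bump the timestamps once, before the data copy:

	static ssize_t thingyfs_write(struct file *file, const char __user *buf,
				      size_t len, loff_t *ppos)
	{
		file_update_time(file);		/* mtime/ctime (and i_version) */
		/* ... copy the data and advance *ppos ... */
		return len;
	}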
1289 1285
1290 int inode_needs_sync(struct inode *inode) 1286 int inode_needs_sync(struct inode *inode)
1291 { 1287 {
1292 if (IS_SYNC(inode)) 1288 if (IS_SYNC(inode))
1293 return 1; 1289 return 1;
1294 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 1290 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
1295 return 1; 1291 return 1;
1296 return 0; 1292 return 0;
1297 } 1293 }
1298 1294
1299 EXPORT_SYMBOL(inode_needs_sync); 1295 EXPORT_SYMBOL(inode_needs_sync);
1300 1296
1301 int inode_wait(void *word) 1297 int inode_wait(void *word)
1302 { 1298 {
1303 schedule(); 1299 schedule();
1304 return 0; 1300 return 0;
1305 } 1301 }
1306 1302
1307 /* 1303 /*
1308 * If we try to find an inode in the inode hash while it is being 1304 * If we try to find an inode in the inode hash while it is being
1309 * deleted, we have to wait until the filesystem completes its 1305 * deleted, we have to wait until the filesystem completes its
1310 * deletion before reporting that it isn't found. This function waits 1306 * deletion before reporting that it isn't found. This function waits
1311 * until the deletion _might_ have completed. Callers are responsible 1307 * until the deletion _might_ have completed. Callers are responsible
1312 * to recheck inode state. 1308 * to recheck inode state.
1313 * 1309 *
1314 * It doesn't matter if I_LOCK is not set initially, a call to 1310 * It doesn't matter if I_LOCK is not set initially, a call to
1315 * wake_up_inode() after removing from the hash list will DTRT. 1311 * wake_up_inode() after removing from the hash list will DTRT.
1316 * 1312 *
1317 * This is called with inode_lock held. 1313 * This is called with inode_lock held.
1318 */ 1314 */
1319 static void __wait_on_freeing_inode(struct inode *inode) 1315 static void __wait_on_freeing_inode(struct inode *inode)
1320 { 1316 {
1321 wait_queue_head_t *wq; 1317 wait_queue_head_t *wq;
1322 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); 1318 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK);
1323 wq = bit_waitqueue(&inode->i_state, __I_LOCK); 1319 wq = bit_waitqueue(&inode->i_state, __I_LOCK);
1324 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1320 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1325 spin_unlock(&inode_lock); 1321 spin_unlock(&inode_lock);
1326 schedule(); 1322 schedule();
1327 finish_wait(wq, &wait.wait); 1323 finish_wait(wq, &wait.wait);
1328 spin_lock(&inode_lock); 1324 spin_lock(&inode_lock);
1329 } 1325 }
1330 1326
1331 /* 1327 /*
1332 * We rarely want to lock two inodes that do not have a parent/child 1328 * We rarely want to lock two inodes that do not have a parent/child
1333 * relationship (such as directory, child inode) simultaneously. The 1329 * relationship (such as directory, child inode) simultaneously. The
1334 * vast majority of file systems should be able to get along fine 1330 * vast majority of file systems should be able to get along fine
1335 * without this. Do not use these functions except as a last resort. 1331 * without this. Do not use these functions except as a last resort.
1336 */ 1332 */
1337 void inode_double_lock(struct inode *inode1, struct inode *inode2) 1333 void inode_double_lock(struct inode *inode1, struct inode *inode2)
1338 { 1334 {
1339 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { 1335 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
1340 if (inode1) 1336 if (inode1)
1341 mutex_lock(&inode1->i_mutex); 1337 mutex_lock(&inode1->i_mutex);
1342 else if (inode2) 1338 else if (inode2)
1343 mutex_lock(&inode2->i_mutex); 1339 mutex_lock(&inode2->i_mutex);
1344 return; 1340 return;
1345 } 1341 }
1346 1342
1347 if (inode1 < inode2) { 1343 if (inode1 < inode2) {
1348 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); 1344 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
1349 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); 1345 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
1350 } else { 1346 } else {
1351 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1347 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1352 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1348 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1353 } 1349 }
1354 } 1350 }
1355 EXPORT_SYMBOL(inode_double_lock); 1351 EXPORT_SYMBOL(inode_double_lock);
1356 1352
1357 void inode_double_unlock(struct inode *inode1, struct inode *inode2) 1353 void inode_double_unlock(struct inode *inode1, struct inode *inode2)
1358 { 1354 {
1359 if (inode1) 1355 if (inode1)
1360 mutex_unlock(&inode1->i_mutex); 1356 mutex_unlock(&inode1->i_mutex);
1361 1357
1362 if (inode2 && inode2 != inode1) 1358 if (inode2 && inode2 != inode1)
1363 mutex_unlock(&inode2->i_mutex); 1359 mutex_unlock(&inode2->i_mutex);
1364 } 1360 }
1365 EXPORT_SYMBOL(inode_double_unlock); 1361 EXPORT_SYMBOL(inode_double_unlock);
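Used as a pair, and with the ordering handled internally, a caller only has to bracket its critical section. A minimal sketch with two arbitrary inodes:

	inode_double_lock(inode1, inode2);	/* ordered by address, avoids ABBA deadlock */
	/* ... operate on both inodes ... */
	inode_double_unlock(inode1, inode2);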
1366 1362
1367 static __initdata unsigned long ihash_entries; 1363 static __initdata unsigned long ihash_entries;
1368 static int __init set_ihash_entries(char *str) 1364 static int __init set_ihash_entries(char *str)
1369 { 1365 {
1370 if (!str) 1366 if (!str)
1371 return 0; 1367 return 0;
1372 ihash_entries = simple_strtoul(str, &str, 0); 1368 ihash_entries = simple_strtoul(str, &str, 0);
1373 return 1; 1369 return 1;
1374 } 1370 }
1375 __setup("ihash_entries=", set_ihash_entries); 1371 __setup("ihash_entries=", set_ihash_entries);
1376 1372
1377 /* 1373 /*
1378 * Initialize the waitqueues and inode hash table. 1374 * Initialize the waitqueues and inode hash table.
1379 */ 1375 */
1380 void __init inode_init_early(void) 1376 void __init inode_init_early(void)
1381 { 1377 {
1382 int loop; 1378 int loop;
1383 1379
1384 /* If hashes are distributed across NUMA nodes, defer 1380 /* If hashes are distributed across NUMA nodes, defer
1385 * hash allocation until vmalloc space is available. 1381 * hash allocation until vmalloc space is available.
1386 */ 1382 */
1387 if (hashdist) 1383 if (hashdist)
1388 return; 1384 return;
1389 1385
1390 inode_hashtable = 1386 inode_hashtable =
1391 alloc_large_system_hash("Inode-cache", 1387 alloc_large_system_hash("Inode-cache",
1392 sizeof(struct hlist_head), 1388 sizeof(struct hlist_head),
1393 ihash_entries, 1389 ihash_entries,
1394 14, 1390 14,
1395 HASH_EARLY, 1391 HASH_EARLY,
1396 &i_hash_shift, 1392 &i_hash_shift,
1397 &i_hash_mask, 1393 &i_hash_mask,
1398 0); 1394 0);
1399 1395
1400 for (loop = 0; loop < (1 << i_hash_shift); loop++) 1396 for (loop = 0; loop < (1 << i_hash_shift); loop++)
1401 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1397 INIT_HLIST_HEAD(&inode_hashtable[loop]);
1402 } 1398 }
1403 1399
1404 void __init inode_init(void) 1400 void __init inode_init(void)
1405 { 1401 {
1406 int loop; 1402 int loop;
1407 1403
1408 /* inode slab cache */ 1404 /* inode slab cache */
1409 inode_cachep = kmem_cache_create("inode_cache", 1405 inode_cachep = kmem_cache_create("inode_cache",
1410 sizeof(struct inode), 1406 sizeof(struct inode),
1411 0, 1407 0,
1412 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 1408 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
1413 SLAB_MEM_SPREAD), 1409 SLAB_MEM_SPREAD),
1414 init_once); 1410 init_once);
1415 register_shrinker(&icache_shrinker); 1411 register_shrinker(&icache_shrinker);
1416 1412
1417 /* Hash may have been set up in inode_init_early */ 1413 /* Hash may have been set up in inode_init_early */
1418 if (!hashdist) 1414 if (!hashdist)
1419 return; 1415 return;
1420 1416
1421 inode_hashtable = 1417 inode_hashtable =
1422 alloc_large_system_hash("Inode-cache", 1418 alloc_large_system_hash("Inode-cache",
1423 sizeof(struct hlist_head), 1419 sizeof(struct hlist_head),
1424 ihash_entries, 1420 ihash_entries,
1425 14, 1421 14,
1426 0, 1422 0,
1427 &i_hash_shift, 1423 &i_hash_shift,
1428 &i_hash_mask, 1424 &i_hash_mask,
1429 0); 1425 0);
1430 1426
1431 for (loop = 0; loop < (1 << i_hash_shift); loop++) 1427 for (loop = 0; loop < (1 << i_hash_shift); loop++)
1432 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1428 INIT_HLIST_HEAD(&inode_hashtable[loop]);
1433 } 1429 }
1434 1430
1435 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) 1431 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1436 { 1432 {
1437 inode->i_mode = mode; 1433 inode->i_mode = mode;
1438 if (S_ISCHR(mode)) { 1434 if (S_ISCHR(mode)) {
1439 inode->i_fop = &def_chr_fops; 1435 inode->i_fop = &def_chr_fops;
1440 inode->i_rdev = rdev; 1436 inode->i_rdev = rdev;
1441 } else if (S_ISBLK(mode)) { 1437 } else if (S_ISBLK(mode)) {
1442 inode->i_fop = &def_blk_fops; 1438 inode->i_fop = &def_blk_fops;
1443 inode->i_rdev = rdev; 1439 inode->i_rdev = rdev;
1444 } else if (S_ISFIFO(mode)) 1440 } else if (S_ISFIFO(mode))
1445 inode->i_fop = &def_fifo_fops; 1441 inode->i_fop = &def_fifo_fops;
1446 else if (S_ISSOCK(mode)) 1442 else if (S_ISSOCK(mode))
1447 inode->i_fop = &bad_sock_fops; 1443 inode->i_fop = &bad_sock_fops;
1448 else 1444 else
1449 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", 1445 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
1450 mode); 1446 mode);
1451 } 1447 }
1452 EXPORT_SYMBOL(init_special_inode); 1448 EXPORT_SYMBOL(init_special_inode);
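A hedged sketch of the usual caller, a filesystem's mknod method; thingyfs_mknod and thingyfs_new_inode are hypothetical names:

	static int thingyfs_mknod(struct inode *dir, struct dentry *dentry,
				  int mode, dev_t rdev)
	{
		struct inode *inode = thingyfs_new_inode(dir, mode);	/* hypothetical allocator */

		if (!inode)
			return -ENOMEM;
		init_special_inode(inode, mode, rdev);	/* pick chr/blk/fifo/sock fops */
		d_instantiate(dentry, inode);
		return 0;
	}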
1453 1449
1 #ifndef _LINUX_FS_H 1 #ifndef _LINUX_FS_H
2 #define _LINUX_FS_H 2 #define _LINUX_FS_H
3 3
4 /* 4 /*
5 * This file has definitions for some important file table 5 * This file has definitions for some important file table
6 * structures etc. 6 * structures etc.
7 */ 7 */
8 8
9 #include <linux/limits.h> 9 #include <linux/limits.h>
10 #include <linux/ioctl.h> 10 #include <linux/ioctl.h>
11 11
12 /* 12 /*
13 * It's silly to have NR_OPEN bigger than NR_FILE, but you can change 13 * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
14 * the file limit at runtime and only root can increase the per-process 14 * the file limit at runtime and only root can increase the per-process
15 * nr_file rlimit, so it's safe to set up a ridiculously high absolute 15 * nr_file rlimit, so it's safe to set up a ridiculously high absolute
16 * upper limit on files-per-process. 16 * upper limit on files-per-process.
17 * 17 *
18 * Some programs (notably those using select()) may have to be 18 * Some programs (notably those using select()) may have to be
19 * recompiled to take full advantage of the new limits.. 19 * recompiled to take full advantage of the new limits..
20 */ 20 */
21 21
22 /* Fixed constants first: */ 22 /* Fixed constants first: */
23 #undef NR_OPEN 23 #undef NR_OPEN
24 extern int sysctl_nr_open; 24 extern int sysctl_nr_open;
25 #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ 25 #define INR_OPEN 1024 /* Initial setting for nfile rlimits */
26 26
27 #define BLOCK_SIZE_BITS 10 27 #define BLOCK_SIZE_BITS 10
28 #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) 28 #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
29 29
30 #define SEEK_SET 0 /* seek relative to beginning of file */ 30 #define SEEK_SET 0 /* seek relative to beginning of file */
31 #define SEEK_CUR 1 /* seek relative to current file position */ 31 #define SEEK_CUR 1 /* seek relative to current file position */
32 #define SEEK_END 2 /* seek relative to end of file */ 32 #define SEEK_END 2 /* seek relative to end of file */
33 #define SEEK_MAX SEEK_END 33 #define SEEK_MAX SEEK_END
34 34
35 /* And dynamically-tunable limits and defaults: */ 35 /* And dynamically-tunable limits and defaults: */
36 struct files_stat_struct { 36 struct files_stat_struct {
37 int nr_files; /* read only */ 37 int nr_files; /* read only */
38 int nr_free_files; /* read only */ 38 int nr_free_files; /* read only */
39 int max_files; /* tunable */ 39 int max_files; /* tunable */
40 }; 40 };
41 extern struct files_stat_struct files_stat; 41 extern struct files_stat_struct files_stat;
42 extern int get_max_files(void); 42 extern int get_max_files(void);
43 43
44 struct inodes_stat_t { 44 struct inodes_stat_t {
45 int nr_inodes; 45 int nr_inodes;
46 int nr_unused; 46 int nr_unused;
47 int dummy[5]; /* padding for sysctl ABI compatibility */ 47 int dummy[5]; /* padding for sysctl ABI compatibility */
48 }; 48 };
49 extern struct inodes_stat_t inodes_stat; 49 extern struct inodes_stat_t inodes_stat;
50 50
51 extern int leases_enable, lease_break_time; 51 extern int leases_enable, lease_break_time;
52 52
53 #ifdef CONFIG_DNOTIFY 53 #ifdef CONFIG_DNOTIFY
54 extern int dir_notify_enable; 54 extern int dir_notify_enable;
55 #endif 55 #endif
56 56
57 #define NR_FILE 8192 /* this can well be larger on a larger system */ 57 #define NR_FILE 8192 /* this can well be larger on a larger system */
58 58
59 #define MAY_EXEC 1 59 #define MAY_EXEC 1
60 #define MAY_WRITE 2 60 #define MAY_WRITE 2
61 #define MAY_READ 4 61 #define MAY_READ 4
62 #define MAY_APPEND 8 62 #define MAY_APPEND 8
63 63
64 #define FMODE_READ 1 64 #define FMODE_READ 1
65 #define FMODE_WRITE 2 65 #define FMODE_WRITE 2
66 66
67 /* Internal kernel extensions */ 67 /* Internal kernel extensions */
68 #define FMODE_LSEEK 4 68 #define FMODE_LSEEK 4
69 #define FMODE_PREAD 8 69 #define FMODE_PREAD 8
70 #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ 70 #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */
71 71
72 /* File is being opened for execution. Primary users of this flag are 72 /* File is being opened for execution. Primary users of this flag are
73 distributed filesystems that can use it to achieve correct ETXTBUSY 73 distributed filesystems that can use it to achieve correct ETXTBUSY
74 behavior for cross-node execution/opening_for_writing of files */ 74 behavior for cross-node execution/opening_for_writing of files */
75 #define FMODE_EXEC 16 75 #define FMODE_EXEC 16
76 76
77 #define RW_MASK 1 77 #define RW_MASK 1
78 #define RWA_MASK 2 78 #define RWA_MASK 2
79 #define READ 0 79 #define READ 0
80 #define WRITE 1 80 #define WRITE 1
81 #define READA 2 /* read-ahead - don't block if no resources */ 81 #define READA 2 /* read-ahead - don't block if no resources */
82 #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ 82 #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */
83 #define READ_SYNC (READ | (1 << BIO_RW_SYNC)) 83 #define READ_SYNC (READ | (1 << BIO_RW_SYNC))
84 #define READ_META (READ | (1 << BIO_RW_META)) 84 #define READ_META (READ | (1 << BIO_RW_META))
85 #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) 85 #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC))
86 #define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER)) 86 #define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
87 87
88 #define SEL_IN 1 88 #define SEL_IN 1
89 #define SEL_OUT 2 89 #define SEL_OUT 2
90 #define SEL_EX 4 90 #define SEL_EX 4
91 91
92 /* public flags for file_system_type */ 92 /* public flags for file_system_type */
93 #define FS_REQUIRES_DEV 1 93 #define FS_REQUIRES_DEV 1
94 #define FS_BINARY_MOUNTDATA 2 94 #define FS_BINARY_MOUNTDATA 2
95 #define FS_HAS_SUBTYPE 4 95 #define FS_HAS_SUBTYPE 4
96 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ 96 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
97 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() 97 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move()
98 * during rename() internally. 98 * during rename() internally.
99 */ 99 */
100 100
101 /* 101 /*
102 * These are the fs-independent mount-flags: up to 32 flags are supported 102 * These are the fs-independent mount-flags: up to 32 flags are supported
103 */ 103 */
104 #define MS_RDONLY 1 /* Mount read-only */ 104 #define MS_RDONLY 1 /* Mount read-only */
105 #define MS_NOSUID 2 /* Ignore suid and sgid bits */ 105 #define MS_NOSUID 2 /* Ignore suid and sgid bits */
106 #define MS_NODEV 4 /* Disallow access to device special files */ 106 #define MS_NODEV 4 /* Disallow access to device special files */
107 #define MS_NOEXEC 8 /* Disallow program execution */ 107 #define MS_NOEXEC 8 /* Disallow program execution */
108 #define MS_SYNCHRONOUS 16 /* Writes are synced at once */ 108 #define MS_SYNCHRONOUS 16 /* Writes are synced at once */
109 #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ 109 #define MS_REMOUNT 32 /* Alter flags of a mounted FS */
110 #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ 110 #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
111 #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ 111 #define MS_DIRSYNC 128 /* Directory modifications are synchronous */
112 #define MS_NOATIME 1024 /* Do not update access times. */ 112 #define MS_NOATIME 1024 /* Do not update access times. */
113 #define MS_NODIRATIME 2048 /* Do not update directory access times */ 113 #define MS_NODIRATIME 2048 /* Do not update directory access times */
114 #define MS_BIND 4096 114 #define MS_BIND 4096
115 #define MS_MOVE 8192 115 #define MS_MOVE 8192
116 #define MS_REC 16384 116 #define MS_REC 16384
117 #define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. 117 #define MS_VERBOSE 32768 /* War is peace. Verbosity is silence.
118 MS_VERBOSE is deprecated. */ 118 MS_VERBOSE is deprecated. */
119 #define MS_SILENT 32768 119 #define MS_SILENT 32768
120 #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ 120 #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */
121 #define MS_UNBINDABLE (1<<17) /* change to unbindable */ 121 #define MS_UNBINDABLE (1<<17) /* change to unbindable */
122 #define MS_PRIVATE (1<<18) /* change to private */ 122 #define MS_PRIVATE (1<<18) /* change to private */
123 #define MS_SLAVE (1<<19) /* change to slave */ 123 #define MS_SLAVE (1<<19) /* change to slave */
124 #define MS_SHARED (1<<20) /* change to shared */ 124 #define MS_SHARED (1<<20) /* change to shared */
125 #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ 125 #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */
126 #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ 126 #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
127 #define MS_I_VERSION (1<<23) /* Update inode I_version field */ 127 #define MS_I_VERSION (1<<23) /* Update inode I_version field */
128 #define MS_ACTIVE (1<<30) 128 #define MS_ACTIVE (1<<30)
129 #define MS_NOUSER (1<<31) 129 #define MS_NOUSER (1<<31)
130 130
131 /* 131 /*
132 * Superblock flags that can be altered by MS_REMOUNT 132 * Superblock flags that can be altered by MS_REMOUNT
133 */ 133 */
134 #define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK) 134 #define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK)
135 135
136 /* 136 /*
137 * Old magic mount flag and mask 137 * Old magic mount flag and mask
138 */ 138 */
139 #define MS_MGC_VAL 0xC0ED0000 139 #define MS_MGC_VAL 0xC0ED0000
140 #define MS_MGC_MSK 0xffff0000 140 #define MS_MGC_MSK 0xffff0000
141 141
142 /* Inode flags - they have nothing to do with superblock flags now */ 142 /* Inode flags - they have nothing to do with superblock flags now */
143 143
144 #define S_SYNC 1 /* Writes are synced at once */ 144 #define S_SYNC 1 /* Writes are synced at once */
145 #define S_NOATIME 2 /* Do not update access times */ 145 #define S_NOATIME 2 /* Do not update access times */
146 #define S_APPEND 4 /* Append-only file */ 146 #define S_APPEND 4 /* Append-only file */
147 #define S_IMMUTABLE 8 /* Immutable file */ 147 #define S_IMMUTABLE 8 /* Immutable file */
148 #define S_DEAD 16 /* removed, but still open directory */ 148 #define S_DEAD 16 /* removed, but still open directory */
149 #define S_NOQUOTA 32 /* Inode is not counted to quota */ 149 #define S_NOQUOTA 32 /* Inode is not counted to quota */
150 #define S_DIRSYNC 64 /* Directory modifications are synchronous */ 150 #define S_DIRSYNC 64 /* Directory modifications are synchronous */
151 #define S_NOCMTIME 128 /* Do not update file c/mtime */ 151 #define S_NOCMTIME 128 /* Do not update file c/mtime */
152 #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ 152 #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
153 #define S_PRIVATE 512 /* Inode is fs-internal */ 153 #define S_PRIVATE 512 /* Inode is fs-internal */
154 154
155 /* 155 /*
156 * Note that nosuid etc flags are inode-specific: setting some file-system 156 * Note that nosuid etc flags are inode-specific: setting some file-system
157 * flags just means all the inodes inherit those flags by default. It might be 157 * flags just means all the inodes inherit those flags by default. It might be
158 * possible to override it selectively if you really wanted to with some 158 * possible to override it selectively if you really wanted to with some
159 * ioctl() that is not currently implemented. 159 * ioctl() that is not currently implemented.
160 * 160 *
161 * Exception: MS_RDONLY is always applied to the entire file system. 161 * Exception: MS_RDONLY is always applied to the entire file system.
162 * 162 *
163 * Unfortunately, it is possible to change a filesystem's flags while it is mounted 163 * Unfortunately, it is possible to change a filesystem's flags while it is mounted
164 * with files in use. This means that the existing inodes will not have their 164 * with files in use. This means that the existing inodes will not have their
165 * i_flags updated. Hence, i_flags no longer inherit the superblock mount 165 * i_flags updated. Hence, i_flags no longer inherit the superblock mount
166 * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org 166 * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
167 */ 167 */
168 #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) 168 #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg))
169 169
170 #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) 170 #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
171 #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ 171 #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \
172 ((inode)->i_flags & S_SYNC)) 172 ((inode)->i_flags & S_SYNC))
173 #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ 173 #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
174 ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) 174 ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
175 #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) 175 #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)
176 #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) 176 #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME)
177 #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) 177 #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION)
178 178
179 #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) 179 #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
180 #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) 180 #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND)
181 #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) 181 #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE)
182 #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) 182 #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL)
183 183
184 #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) 184 #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
185 #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) 185 #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
186 #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) 186 #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
187 #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) 187 #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
188 188
189 /* the read-only stuff doesn't really belong here, but any other place is 189 /* the read-only stuff doesn't really belong here, but any other place is
190 probably as bad and I don't want to create yet another include file. */ 190 probably as bad and I don't want to create yet another include file. */
191 191
192 #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ 192 #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */
193 #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ 193 #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */
194 #define BLKRRPART _IO(0x12,95) /* re-read partition table */ 194 #define BLKRRPART _IO(0x12,95) /* re-read partition table */
195 #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ 195 #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */
196 #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ 196 #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */
197 #define BLKRASET _IO(0x12,98) /* set read ahead for block device */ 197 #define BLKRASET _IO(0x12,98) /* set read ahead for block device */
198 #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ 198 #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */
199 #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ 199 #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
200 #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ 200 #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
201 #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ 201 #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
202 #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ 202 #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
203 #define BLKSSZGET _IO(0x12,104)/* get block device sector size */ 203 #define BLKSSZGET _IO(0x12,104)/* get block device sector size */
204 #if 0 204 #if 0
205 #define BLKPG _IO(0x12,105)/* See blkpg.h */ 205 #define BLKPG _IO(0x12,105)/* See blkpg.h */
206 206
207 /* Some people are morons. Do not use sizeof! */ 207 /* Some people are morons. Do not use sizeof! */
208 208
209 #define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ 209 #define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */
210 #define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ 210 #define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */
211 /* This was here just to show that the number is taken - 211 /* This was here just to show that the number is taken -
212 probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ 212 probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
213 #endif 213 #endif
214 /* A jump here: 108-111 have been used for various private purposes. */ 214 /* A jump here: 108-111 have been used for various private purposes. */
215 #define BLKBSZGET _IOR(0x12,112,size_t) 215 #define BLKBSZGET _IOR(0x12,112,size_t)
216 #define BLKBSZSET _IOW(0x12,113,size_t) 216 #define BLKBSZSET _IOW(0x12,113,size_t)
217 #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ 217 #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */
218 #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) 218 #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
219 #define BLKTRACESTART _IO(0x12,116) 219 #define BLKTRACESTART _IO(0x12,116)
220 #define BLKTRACESTOP _IO(0x12,117) 220 #define BLKTRACESTOP _IO(0x12,117)
221 #define BLKTRACETEARDOWN _IO(0x12,118) 221 #define BLKTRACETEARDOWN _IO(0x12,118)
222 222
223 #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ 223 #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
224 #define FIBMAP _IO(0x00,1) /* bmap access */ 224 #define FIBMAP _IO(0x00,1) /* bmap access */
225 #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ 225 #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
226 226
227 #define FS_IOC_GETFLAGS _IOR('f', 1, long) 227 #define FS_IOC_GETFLAGS _IOR('f', 1, long)
228 #define FS_IOC_SETFLAGS _IOW('f', 2, long) 228 #define FS_IOC_SETFLAGS _IOW('f', 2, long)
229 #define FS_IOC_GETVERSION _IOR('v', 1, long) 229 #define FS_IOC_GETVERSION _IOR('v', 1, long)
230 #define FS_IOC_SETVERSION _IOW('v', 2, long) 230 #define FS_IOC_SETVERSION _IOW('v', 2, long)
231 #define FS_IOC32_GETFLAGS _IOR('f', 1, int) 231 #define FS_IOC32_GETFLAGS _IOR('f', 1, int)
232 #define FS_IOC32_SETFLAGS _IOW('f', 2, int) 232 #define FS_IOC32_SETFLAGS _IOW('f', 2, int)
233 #define FS_IOC32_GETVERSION _IOR('v', 1, int) 233 #define FS_IOC32_GETVERSION _IOR('v', 1, int)
234 #define FS_IOC32_SETVERSION _IOW('v', 2, int) 234 #define FS_IOC32_SETVERSION _IOW('v', 2, int)
235 235
236 /* 236 /*
237 * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) 237 * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
238 */ 238 */
239 #define FS_SECRM_FL 0x00000001 /* Secure deletion */ 239 #define FS_SECRM_FL 0x00000001 /* Secure deletion */
240 #define FS_UNRM_FL 0x00000002 /* Undelete */ 240 #define FS_UNRM_FL 0x00000002 /* Undelete */
241 #define FS_COMPR_FL 0x00000004 /* Compress file */ 241 #define FS_COMPR_FL 0x00000004 /* Compress file */
242 #define FS_SYNC_FL 0x00000008 /* Synchronous updates */ 242 #define FS_SYNC_FL 0x00000008 /* Synchronous updates */
243 #define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ 243 #define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */
244 #define FS_APPEND_FL 0x00000020 /* writes to file may only append */ 244 #define FS_APPEND_FL 0x00000020 /* writes to file may only append */
245 #define FS_NODUMP_FL 0x00000040 /* do not dump file */ 245 #define FS_NODUMP_FL 0x00000040 /* do not dump file */
246 #define FS_NOATIME_FL 0x00000080 /* do not update atime */ 246 #define FS_NOATIME_FL 0x00000080 /* do not update atime */
247 /* Reserved for compression usage... */ 247 /* Reserved for compression usage... */
248 #define FS_DIRTY_FL 0x00000100 248 #define FS_DIRTY_FL 0x00000100
249 #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ 249 #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
250 #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ 250 #define FS_NOCOMP_FL 0x00000400 /* Don't compress */
251 #define FS_ECOMPR_FL 0x00000800 /* Compression error */ 251 #define FS_ECOMPR_FL 0x00000800 /* Compression error */
252 /* End compression flags --- maybe not all used */ 252 /* End compression flags --- maybe not all used */
253 #define FS_BTREE_FL 0x00001000 /* btree format dir */ 253 #define FS_BTREE_FL 0x00001000 /* btree format dir */
254 #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ 254 #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */
255 #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ 255 #define FS_IMAGIC_FL 0x00002000 /* AFS directory */
256 #define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */ 256 #define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */
257 #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ 257 #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */
258 #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ 258 #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
259 #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 259 #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
260 #define FS_EXTENT_FL 0x00080000 /* Extents */ 260 #define FS_EXTENT_FL 0x00080000 /* Extents */
261 #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ 261 #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */
262 #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ 262 #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */
263 263
264 #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ 264 #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
265 #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ 265 #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
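These are the flags that lsattr(1) and chattr(1) manipulate from userspace. A minimal userspace sketch, assuming an already-open fd on the target file:

	#include <sys/ioctl.h>
	#include <linux/fs.h>

	static int set_noatime_flag(int fd)
	{
		long flags;

		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
			return -1;
		flags |= FS_NOATIME_FL;			/* per-inode "no atime updates" */
		return ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}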
266 266
267 267
268 #define SYNC_FILE_RANGE_WAIT_BEFORE 1 268 #define SYNC_FILE_RANGE_WAIT_BEFORE 1
269 #define SYNC_FILE_RANGE_WRITE 2 269 #define SYNC_FILE_RANGE_WRITE 2
270 #define SYNC_FILE_RANGE_WAIT_AFTER 4 270 #define SYNC_FILE_RANGE_WAIT_AFTER 4
271 271
272 #ifdef __KERNEL__ 272 #ifdef __KERNEL__
273 273
274 #include <linux/linkage.h> 274 #include <linux/linkage.h>
275 #include <linux/wait.h> 275 #include <linux/wait.h>
276 #include <linux/types.h> 276 #include <linux/types.h>
277 #include <linux/kdev_t.h> 277 #include <linux/kdev_t.h>
278 #include <linux/dcache.h> 278 #include <linux/dcache.h>
279 #include <linux/namei.h> 279 #include <linux/namei.h>
280 #include <linux/stat.h> 280 #include <linux/stat.h>
281 #include <linux/cache.h> 281 #include <linux/cache.h>
282 #include <linux/kobject.h> 282 #include <linux/kobject.h>
283 #include <linux/list.h> 283 #include <linux/list.h>
284 #include <linux/radix-tree.h> 284 #include <linux/radix-tree.h>
285 #include <linux/prio_tree.h> 285 #include <linux/prio_tree.h>
286 #include <linux/init.h> 286 #include <linux/init.h>
287 #include <linux/pid.h> 287 #include <linux/pid.h>
288 #include <linux/mutex.h> 288 #include <linux/mutex.h>
289 #include <linux/capability.h> 289 #include <linux/capability.h>
290 290
291 #include <asm/atomic.h> 291 #include <asm/atomic.h>
292 #include <asm/semaphore.h> 292 #include <asm/semaphore.h>
293 #include <asm/byteorder.h> 293 #include <asm/byteorder.h>
294 294
295 struct export_operations; 295 struct export_operations;
296 struct hd_geometry; 296 struct hd_geometry;
297 struct iovec; 297 struct iovec;
298 struct nameidata; 298 struct nameidata;
299 struct kiocb; 299 struct kiocb;
300 struct pipe_inode_info; 300 struct pipe_inode_info;
301 struct poll_table_struct; 301 struct poll_table_struct;
302 struct kstatfs; 302 struct kstatfs;
303 struct vm_area_struct; 303 struct vm_area_struct;
304 struct vfsmount; 304 struct vfsmount;
305 305
306 extern void __init inode_init(void); 306 extern void __init inode_init(void);
307 extern void __init inode_init_early(void); 307 extern void __init inode_init_early(void);
308 extern void __init mnt_init(void); 308 extern void __init mnt_init(void);
309 extern void __init files_init(unsigned long); 309 extern void __init files_init(unsigned long);
310 310
311 struct buffer_head; 311 struct buffer_head;
312 typedef int (get_block_t)(struct inode *inode, sector_t iblock, 312 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
313 struct buffer_head *bh_result, int create); 313 struct buffer_head *bh_result, int create);
314 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, 314 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
315 ssize_t bytes, void *private); 315 ssize_t bytes, void *private);
316 316
317 /* 317 /*
318 * Attribute flags. These should be or-ed together to figure out what 318 * Attribute flags. These should be or-ed together to figure out what
319 * has been changed! 319 * has been changed!
320 */ 320 */
321 #define ATTR_MODE 1 321 #define ATTR_MODE 1
322 #define ATTR_UID 2 322 #define ATTR_UID 2
323 #define ATTR_GID 4 323 #define ATTR_GID 4
324 #define ATTR_SIZE 8 324 #define ATTR_SIZE 8
325 #define ATTR_ATIME 16 325 #define ATTR_ATIME 16
326 #define ATTR_MTIME 32 326 #define ATTR_MTIME 32
327 #define ATTR_CTIME 64 327 #define ATTR_CTIME 64
328 #define ATTR_ATIME_SET 128 328 #define ATTR_ATIME_SET 128
329 #define ATTR_MTIME_SET 256 329 #define ATTR_MTIME_SET 256
330 #define ATTR_FORCE 512 /* Not a change in itself, but force the update */ 330 #define ATTR_FORCE 512 /* Not a change in itself, but force the update */
331 #define ATTR_ATTR_FLAG 1024 331 #define ATTR_ATTR_FLAG 1024
332 #define ATTR_KILL_SUID 2048 332 #define ATTR_KILL_SUID 2048
333 #define ATTR_KILL_SGID 4096 333 #define ATTR_KILL_SGID 4096
334 #define ATTR_FILE 8192 334 #define ATTR_FILE 8192
335 #define ATTR_KILL_PRIV 16384 335 #define ATTR_KILL_PRIV 16384
336 #define ATTR_OPEN 32768 /* Truncating from open(O_TRUNC) */ 336 #define ATTR_OPEN 32768 /* Truncating from open(O_TRUNC) */
337 337
338 /* 338 /*
339 * This is the Inode Attributes structure, used for notify_change(). It 339 * This is the Inode Attributes structure, used for notify_change(). It
340 * uses the above definitions as flags, to know which values have changed. 340 * uses the above definitions as flags, to know which values have changed.
341 * Also, in this manner, a Filesystem can look at only the values it cares 341 * Also, in this manner, a Filesystem can look at only the values it cares
342 * about. Basically, these are the attributes that the VFS layer can 342 * about. Basically, these are the attributes that the VFS layer can
343 * request to change from the FS layer. 343 * request to change from the FS layer.
344 * 344 *
345 * Derek Atkins <warlord@MIT.EDU> 94-10-20 345 * Derek Atkins <warlord@MIT.EDU> 94-10-20
346 */ 346 */
347 struct iattr { 347 struct iattr {
348 unsigned int ia_valid; 348 unsigned int ia_valid;
349 umode_t ia_mode; 349 umode_t ia_mode;
350 uid_t ia_uid; 350 uid_t ia_uid;
351 gid_t ia_gid; 351 gid_t ia_gid;
352 loff_t ia_size; 352 loff_t ia_size;
353 struct timespec ia_atime; 353 struct timespec ia_atime;
354 struct timespec ia_mtime; 354 struct timespec ia_mtime;
355 struct timespec ia_ctime; 355 struct timespec ia_ctime;
356 356
357 /* 357 /*
358 * Not an attribute, but auxiliary info for filesystems wanting to 358 * Not an attribute, but auxiliary info for filesystems wanting to
359 * implement an ftruncate() like method. NOTE: filesystem should 359 * implement an ftruncate() like method. NOTE: filesystem should
360 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). 360 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
361 */ 361 */
362 struct file *ia_file; 362 struct file *ia_file;
363 }; 363 };
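A hedged example of the VFS-facing usage: to truncate a file to zero length, a caller fills in only the fields selected by ia_valid and hands the result to notify_change() with i_mutex held (roughly what do_truncate() does):

	struct iattr attr = {
		.ia_valid = ATTR_SIZE,
		.ia_size  = 0,
	};
	int err;

	mutex_lock(&dentry->d_inode->i_mutex);
	err = notify_change(dentry, &attr);	/* checks permissions, then calls ->setattr() */
	mutex_unlock(&dentry->d_inode->i_mutex);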
364 364
365 /* 365 /*
366 * Includes for diskquotas. 366 * Includes for diskquotas.
367 */ 367 */
368 #include <linux/quota.h> 368 #include <linux/quota.h>
369 369
370 /** 370 /**
371 * enum positive_aop_returns - aop return codes with specific semantics 371 * enum positive_aop_returns - aop return codes with specific semantics
372 * 372 *
373 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has 373 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
374 * completed, that the page is still locked, and 374 * completed, that the page is still locked, and
375 * should be considered active. The VM uses this hint 375 * should be considered active. The VM uses this hint
376 * to return the page to the active list -- it won't 376 * to return the page to the active list -- it won't
377 * be a candidate for writeback again in the near 377 * be a candidate for writeback again in the near
378 * future. Other callers must be careful to unlock 378 * future. Other callers must be careful to unlock
379 * the page if they get this return. Returned by 379 * the page if they get this return. Returned by
380 * writepage(); 380 * writepage();
381 * 381 *
382 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has 382 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
383 * unlocked it and the page might have been truncated. 383 * unlocked it and the page might have been truncated.
384 * The caller should back up to acquiring a new page and 384 * The caller should back up to acquiring a new page and
385 * trying again. The aop will be taking reasonable 385 * trying again. The aop will be taking reasonable
386 * precautions not to livelock. If the caller held a page 386 * precautions not to livelock. If the caller held a page
387 * reference, it should drop it before retrying. Returned 387 * reference, it should drop it before retrying. Returned
388 * by readpage(). 388 * by readpage().
389 * 389 *
390 * address_space_operation functions return these large constants to indicate 390 * address_space_operation functions return these large constants to indicate
391 * special semantics to the caller. These are much larger than the bytes in a 391 * special semantics to the caller. These are much larger than the bytes in a
392 * page to allow for functions that return the number of bytes operated on in a 392 * page to allow for functions that return the number of bytes operated on in a
393 * given page. 393 * given page.
394 */ 394 */
395 395
396 enum positive_aop_returns { 396 enum positive_aop_returns {
397 AOP_WRITEPAGE_ACTIVATE = 0x80000, 397 AOP_WRITEPAGE_ACTIVATE = 0x80000,
398 AOP_TRUNCATED_PAGE = 0x80001, 398 AOP_TRUNCATED_PAGE = 0x80001,
399 }; 399 };
400 400
401 #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ 401 #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */
402 #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ 402 #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */
403 403
404 /* 404 /*
405 * oh the beauties of C type declarations. 405 * oh the beauties of C type declarations.
406 */ 406 */
407 struct page; 407 struct page;
408 struct address_space; 408 struct address_space;
409 struct writeback_control; 409 struct writeback_control;
410 410
411 struct iov_iter { 411 struct iov_iter {
412 const struct iovec *iov; 412 const struct iovec *iov;
413 unsigned long nr_segs; 413 unsigned long nr_segs;
414 size_t iov_offset; 414 size_t iov_offset;
415 size_t count; 415 size_t count;
416 }; 416 };
417 417
418 size_t iov_iter_copy_from_user_atomic(struct page *page, 418 size_t iov_iter_copy_from_user_atomic(struct page *page,
419 struct iov_iter *i, unsigned long offset, size_t bytes); 419 struct iov_iter *i, unsigned long offset, size_t bytes);
420 size_t iov_iter_copy_from_user(struct page *page, 420 size_t iov_iter_copy_from_user(struct page *page,
421 struct iov_iter *i, unsigned long offset, size_t bytes); 421 struct iov_iter *i, unsigned long offset, size_t bytes);
422 void iov_iter_advance(struct iov_iter *i, size_t bytes); 422 void iov_iter_advance(struct iov_iter *i, size_t bytes);
423 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); 423 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
424 size_t iov_iter_single_seg_count(struct iov_iter *i); 424 size_t iov_iter_single_seg_count(struct iov_iter *i);
425 425
426 static inline void iov_iter_init(struct iov_iter *i, 426 static inline void iov_iter_init(struct iov_iter *i,
427 const struct iovec *iov, unsigned long nr_segs, 427 const struct iovec *iov, unsigned long nr_segs,
428 size_t count, size_t written) 428 size_t count, size_t written)
429 { 429 {
430 i->iov = iov; 430 i->iov = iov;
431 i->nr_segs = nr_segs; 431 i->nr_segs = nr_segs;
432 i->iov_offset = 0; 432 i->iov_offset = 0;
433 i->count = count + written; 433 i->count = count + written;
434 434
435 iov_iter_advance(i, written); 435 iov_iter_advance(i, written);
436 } 436 }
437 437
438 static inline size_t iov_iter_count(struct iov_iter *i) 438 static inline size_t iov_iter_count(struct iov_iter *i)
439 { 439 {
440 return i->count; 440 return i->count;
441 } 441 }
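A sketch of how these helpers are meant to be driven; page, offset and bytes stand in for the pagecache page currently being filled and are not defined here:

	struct iov_iter i;
	size_t copied;

	iov_iter_init(&i, iov, nr_segs, count, 0);	/* nothing written yet */
	while (iov_iter_count(&i)) {
		copied = iov_iter_copy_from_user(page, &i, offset, bytes);
		iov_iter_advance(&i, copied);
		/* ... move on to the next page/offset/bytes chunk ... */
	}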
442 442
443 443
444 struct address_space_operations { 444 struct address_space_operations {
445 int (*writepage)(struct page *page, struct writeback_control *wbc); 445 int (*writepage)(struct page *page, struct writeback_control *wbc);
446 int (*readpage)(struct file *, struct page *); 446 int (*readpage)(struct file *, struct page *);
447 void (*sync_page)(struct page *); 447 void (*sync_page)(struct page *);
448 448
449 /* Write back some dirty pages from this mapping. */ 449 /* Write back some dirty pages from this mapping. */
450 int (*writepages)(struct address_space *, struct writeback_control *); 450 int (*writepages)(struct address_space *, struct writeback_control *);
451 451
452 /* Set a page dirty. Return true if this dirtied it */ 452 /* Set a page dirty. Return true if this dirtied it */
453 int (*set_page_dirty)(struct page *page); 453 int (*set_page_dirty)(struct page *page);
454 454
455 int (*readpages)(struct file *filp, struct address_space *mapping, 455 int (*readpages)(struct file *filp, struct address_space *mapping,
456 struct list_head *pages, unsigned nr_pages); 456 struct list_head *pages, unsigned nr_pages);
457 457
458 /* 458 /*
459 * ext3 requires that a successful prepare_write() call be followed 459 * ext3 requires that a successful prepare_write() call be followed
460 * by a commit_write() call - they must be balanced 460 * by a commit_write() call - they must be balanced
461 */ 461 */
462 int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); 462 int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
463 int (*commit_write)(struct file *, struct page *, unsigned, unsigned); 463 int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
464 464
465 int (*write_begin)(struct file *, struct address_space *mapping, 465 int (*write_begin)(struct file *, struct address_space *mapping,
466 loff_t pos, unsigned len, unsigned flags, 466 loff_t pos, unsigned len, unsigned flags,
467 struct page **pagep, void **fsdata); 467 struct page **pagep, void **fsdata);
468 int (*write_end)(struct file *, struct address_space *mapping, 468 int (*write_end)(struct file *, struct address_space *mapping,
469 loff_t pos, unsigned len, unsigned copied, 469 loff_t pos, unsigned len, unsigned copied,
470 struct page *page, void *fsdata); 470 struct page *page, void *fsdata);
471 471
472 /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ 472 /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
473 sector_t (*bmap)(struct address_space *, sector_t); 473 sector_t (*bmap)(struct address_space *, sector_t);
474 void (*invalidatepage) (struct page *, unsigned long); 474 void (*invalidatepage) (struct page *, unsigned long);
475 int (*releasepage) (struct page *, gfp_t); 475 int (*releasepage) (struct page *, gfp_t);
476 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 476 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
477 loff_t offset, unsigned long nr_segs); 477 loff_t offset, unsigned long nr_segs);
478 struct page* (*get_xip_page)(struct address_space *, sector_t, 478 struct page* (*get_xip_page)(struct address_space *, sector_t,
479 int); 479 int);
480 /* migrate the contents of a page to the specified target */ 480 /* migrate the contents of a page to the specified target */
481 int (*migratepage) (struct address_space *, 481 int (*migratepage) (struct address_space *,
482 struct page *, struct page *); 482 struct page *, struct page *);
483 int (*launder_page) (struct page *); 483 int (*launder_page) (struct page *);
484 }; 484 };
485 485
486 /* 486 /*
487 * pagecache_write_begin/pagecache_write_end must be used by general code 487 * pagecache_write_begin/pagecache_write_end must be used by general code
488 * to write into the pagecache. 488 * to write into the pagecache.
489 */ 489 */
490 int pagecache_write_begin(struct file *, struct address_space *mapping, 490 int pagecache_write_begin(struct file *, struct address_space *mapping,
491 loff_t pos, unsigned len, unsigned flags, 491 loff_t pos, unsigned len, unsigned flags,
492 struct page **pagep, void **fsdata); 492 struct page **pagep, void **fsdata);
493 493
494 int pagecache_write_end(struct file *, struct address_space *mapping, 494 int pagecache_write_end(struct file *, struct address_space *mapping,
495 loff_t pos, unsigned len, unsigned copied, 495 loff_t pos, unsigned len, unsigned copied,
496 struct page *page, void *fsdata); 496 struct page *page, void *fsdata);
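A hedged sketch of the begin/copy/end contract these helpers enforce; mapping, pos and len are assumed to describe the write being performed:

	struct page *page;
	void *fsdata;
	int ret;

	ret = pagecache_write_begin(file, mapping, pos, len,
				    AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (ret)
		return ret;
	/* ... copy len bytes into the locked page at pos ... */
	ret = pagecache_write_end(file, mapping, pos, len, len, page, fsdata);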
497 497
498 struct backing_dev_info; 498 struct backing_dev_info;
499 struct address_space { 499 struct address_space {
500 struct inode *host; /* owner: inode, block_device */ 500 struct inode *host; /* owner: inode, block_device */
501 struct radix_tree_root page_tree; /* radix tree of all pages */ 501 struct radix_tree_root page_tree; /* radix tree of all pages */
502 rwlock_t tree_lock; /* and rwlock protecting it */ 502 rwlock_t tree_lock; /* and rwlock protecting it */
503 unsigned int i_mmap_writable;/* count VM_SHARED mappings */ 503 unsigned int i_mmap_writable;/* count VM_SHARED mappings */
504 struct prio_tree_root i_mmap; /* tree of private and shared mappings */ 504 struct prio_tree_root i_mmap; /* tree of private and shared mappings */
505 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ 505 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
506 spinlock_t i_mmap_lock; /* protect tree, count, list */ 506 spinlock_t i_mmap_lock; /* protect tree, count, list */
507 unsigned int truncate_count; /* Cover race condition with truncate */ 507 unsigned int truncate_count; /* Cover race condition with truncate */
508 unsigned long nrpages; /* number of total pages */ 508 unsigned long nrpages; /* number of total pages */
509 pgoff_t writeback_index;/* writeback starts here */ 509 pgoff_t writeback_index;/* writeback starts here */
510 const struct address_space_operations *a_ops; /* methods */ 510 const struct address_space_operations *a_ops; /* methods */
511 unsigned long flags; /* error bits/gfp mask */ 511 unsigned long flags; /* error bits/gfp mask */
512 struct backing_dev_info *backing_dev_info; /* device readahead, etc */ 512 struct backing_dev_info *backing_dev_info; /* device readahead, etc */
513 spinlock_t private_lock; /* for use by the address_space */ 513 spinlock_t private_lock; /* for use by the address_space */
514 struct list_head private_list; /* ditto */ 514 struct list_head private_list; /* ditto */
515 struct address_space *assoc_mapping; /* ditto */ 515 struct address_space *assoc_mapping; /* ditto */
516 } __attribute__((aligned(sizeof(long)))); 516 } __attribute__((aligned(sizeof(long))));
517 /* 517 /*
518 * On most architectures that alignment is already the case; but 518 * On most architectures that alignment is already the case; but
519 * must be enforced here for CRIS, to let the least significant bit 519 * must be enforced here for CRIS, to let the least significant bit
520 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. 520 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
521 */ 521 */
522 522
523 struct block_device { 523 struct block_device {
524 dev_t bd_dev; /* not a kdev_t - it's a search key */ 524 dev_t bd_dev; /* not a kdev_t - it's a search key */
525 struct inode * bd_inode; /* will die */ 525 struct inode * bd_inode; /* will die */
526 int bd_openers; 526 int bd_openers;
527 struct mutex bd_mutex; /* open/close mutex */ 527 struct mutex bd_mutex; /* open/close mutex */
528 struct semaphore bd_mount_sem; 528 struct semaphore bd_mount_sem;
529 struct list_head bd_inodes; 529 struct list_head bd_inodes;
530 void * bd_holder; 530 void * bd_holder;
531 int bd_holders; 531 int bd_holders;
532 #ifdef CONFIG_SYSFS 532 #ifdef CONFIG_SYSFS
533 struct list_head bd_holder_list; 533 struct list_head bd_holder_list;
534 #endif 534 #endif
535 struct block_device * bd_contains; 535 struct block_device * bd_contains;
536 unsigned bd_block_size; 536 unsigned bd_block_size;
537 struct hd_struct * bd_part; 537 struct hd_struct * bd_part;
538 /* number of times partitions within this device have been opened. */ 538 /* number of times partitions within this device have been opened. */
539 unsigned bd_part_count; 539 unsigned bd_part_count;
540 int bd_invalidated; 540 int bd_invalidated;
541 struct gendisk * bd_disk; 541 struct gendisk * bd_disk;
542 struct list_head bd_list; 542 struct list_head bd_list;
543 struct backing_dev_info *bd_inode_backing_dev_info; 543 struct backing_dev_info *bd_inode_backing_dev_info;
544 /* 544 /*
545 * Private data. You must have bd_claim'ed the block_device 545 * Private data. You must have bd_claim'ed the block_device
546 * to use this. NOTE: bd_claim allows an owner to claim 546 * to use this. NOTE: bd_claim allows an owner to claim
547 * the same device multiple times, the owner must take special 547 * the same device multiple times, the owner must take special
548 * care to not mess up bd_private for that case. 548 * care to not mess up bd_private for that case.
549 */ 549 */
550 unsigned long bd_private; 550 unsigned long bd_private;
551 }; 551 };
552 552
553 /* 553 /*
554 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache 554 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
555 * radix trees 555 * radix trees
556 */ 556 */
557 #define PAGECACHE_TAG_DIRTY 0 557 #define PAGECACHE_TAG_DIRTY 0
558 #define PAGECACHE_TAG_WRITEBACK 1 558 #define PAGECACHE_TAG_WRITEBACK 1
559 559
560 int mapping_tagged(struct address_space *mapping, int tag); 560 int mapping_tagged(struct address_space *mapping, int tag);
561 561
562 /* 562 /*
563 * Might pages of this file be mapped into userspace? 563 * Might pages of this file be mapped into userspace?
564 */ 564 */
565 static inline int mapping_mapped(struct address_space *mapping) 565 static inline int mapping_mapped(struct address_space *mapping)
566 { 566 {
567 return !prio_tree_empty(&mapping->i_mmap) || 567 return !prio_tree_empty(&mapping->i_mmap) ||
568 !list_empty(&mapping->i_mmap_nonlinear); 568 !list_empty(&mapping->i_mmap_nonlinear);
569 } 569 }
570 570
571 /* 571 /*
572 * Might pages of this file have been modified in userspace? 572 * Might pages of this file have been modified in userspace?
573 * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff 573 * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff
574 * marks vma as VM_SHARED if it is shared, and the file was opened for 574 * marks vma as VM_SHARED if it is shared, and the file was opened for
575 * writing i.e. vma may be mprotected writable even if now readonly. 575 * writing i.e. vma may be mprotected writable even if now readonly.
576 */ 576 */
577 static inline int mapping_writably_mapped(struct address_space *mapping) 577 static inline int mapping_writably_mapped(struct address_space *mapping)
578 { 578 {
579 return mapping->i_mmap_writable != 0; 579 return mapping->i_mmap_writable != 0;
580 } 580 }
581 581
582 /* 582 /*
583 * Use sequence counter to get consistent i_size on 32-bit processors. 583 * Use sequence counter to get consistent i_size on 32-bit processors.
584 */ 584 */
585 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 585 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
586 #include <linux/seqlock.h> 586 #include <linux/seqlock.h>
587 #define __NEED_I_SIZE_ORDERED 587 #define __NEED_I_SIZE_ORDERED
588 #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) 588 #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
589 #else 589 #else
590 #define i_size_ordered_init(inode) do { } while (0) 590 #define i_size_ordered_init(inode) do { } while (0)
591 #endif 591 #endif
592 592
593 struct inode { 593 struct inode {
594 struct hlist_node i_hash; 594 struct hlist_node i_hash;
595 struct list_head i_list; 595 struct list_head i_list;
596 struct list_head i_sb_list; 596 struct list_head i_sb_list;
597 struct list_head i_dentry; 597 struct list_head i_dentry;
598 unsigned long i_ino; 598 unsigned long i_ino;
599 atomic_t i_count; 599 atomic_t i_count;
600 unsigned int i_nlink; 600 unsigned int i_nlink;
601 uid_t i_uid; 601 uid_t i_uid;
602 gid_t i_gid; 602 gid_t i_gid;
603 dev_t i_rdev; 603 dev_t i_rdev;
604 u64 i_version; 604 u64 i_version;
605 loff_t i_size; 605 loff_t i_size;
606 #ifdef __NEED_I_SIZE_ORDERED 606 #ifdef __NEED_I_SIZE_ORDERED
607 seqcount_t i_size_seqcount; 607 seqcount_t i_size_seqcount;
608 #endif 608 #endif
609 struct timespec i_atime; 609 struct timespec i_atime;
610 struct timespec i_mtime; 610 struct timespec i_mtime;
611 struct timespec i_ctime; 611 struct timespec i_ctime;
612 unsigned int i_blkbits; 612 unsigned int i_blkbits;
613 blkcnt_t i_blocks; 613 blkcnt_t i_blocks;
614 unsigned short i_bytes; 614 unsigned short i_bytes;
615 umode_t i_mode; 615 umode_t i_mode;
616 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 616 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
617 struct mutex i_mutex; 617 struct mutex i_mutex;
618 struct rw_semaphore i_alloc_sem; 618 struct rw_semaphore i_alloc_sem;
619 const struct inode_operations *i_op; 619 const struct inode_operations *i_op;
620 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 620 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
621 struct super_block *i_sb; 621 struct super_block *i_sb;
622 struct file_lock *i_flock; 622 struct file_lock *i_flock;
623 struct address_space *i_mapping; 623 struct address_space *i_mapping;
624 struct address_space i_data; 624 struct address_space i_data;
625 #ifdef CONFIG_QUOTA 625 #ifdef CONFIG_QUOTA
626 struct dquot *i_dquot[MAXQUOTAS]; 626 struct dquot *i_dquot[MAXQUOTAS];
627 #endif 627 #endif
628 struct list_head i_devices; 628 struct list_head i_devices;
629 union { 629 union {
630 struct pipe_inode_info *i_pipe; 630 struct pipe_inode_info *i_pipe;
631 struct block_device *i_bdev; 631 struct block_device *i_bdev;
632 struct cdev *i_cdev; 632 struct cdev *i_cdev;
633 }; 633 };
634 int i_cindex; 634 int i_cindex;
635 635
636 __u32 i_generation; 636 __u32 i_generation;
637 637
638 #ifdef CONFIG_DNOTIFY 638 #ifdef CONFIG_DNOTIFY
639 unsigned long i_dnotify_mask; /* Directory notify events */ 639 unsigned long i_dnotify_mask; /* Directory notify events */
640 struct dnotify_struct *i_dnotify; /* for directory notifications */ 640 struct dnotify_struct *i_dnotify; /* for directory notifications */
641 #endif 641 #endif
642 642
643 #ifdef CONFIG_INOTIFY 643 #ifdef CONFIG_INOTIFY
644 struct list_head inotify_watches; /* watches on this inode */ 644 struct list_head inotify_watches; /* watches on this inode */
645 struct mutex inotify_mutex; /* protects the watches list */ 645 struct mutex inotify_mutex; /* protects the watches list */
646 #endif 646 #endif
647 647
648 unsigned long i_state; 648 unsigned long i_state;
649 unsigned long dirtied_when; /* jiffies of first dirtying */ 649 unsigned long dirtied_when; /* jiffies of first dirtying */
650 650
651 unsigned int i_flags; 651 unsigned int i_flags;
652 652
653 atomic_t i_writecount; 653 atomic_t i_writecount;
654 #ifdef CONFIG_SECURITY 654 #ifdef CONFIG_SECURITY
655 void *i_security; 655 void *i_security;
656 #endif 656 #endif
657 void *i_private; /* fs or device private pointer */ 657 void *i_private; /* fs or device private pointer */
658 }; 658 };
659 659
660 /* 660 /*
661 * inode->i_mutex nesting subclasses for the lock validator: 661 * inode->i_mutex nesting subclasses for the lock validator:
662 * 662 *
663 * 0: the object of the current VFS operation 663 * 0: the object of the current VFS operation
664 * 1: parent 664 * 1: parent
665 * 2: child/target 665 * 2: child/target
666 * 3: quota file 666 * 3: quota file
667 * 667 *
668 * The locking order between these classes is 668 * The locking order between these classes is
669 * parent -> child -> normal -> xattr -> quota 669 * parent -> child -> normal -> xattr -> quota
670 */ 670 */
671 enum inode_i_mutex_lock_class 671 enum inode_i_mutex_lock_class
672 { 672 {
673 I_MUTEX_NORMAL, 673 I_MUTEX_NORMAL,
674 I_MUTEX_PARENT, 674 I_MUTEX_PARENT,
675 I_MUTEX_CHILD, 675 I_MUTEX_CHILD,
676 I_MUTEX_XATTR, 676 I_MUTEX_XATTR,
677 I_MUTEX_QUOTA 677 I_MUTEX_QUOTA
678 }; 678 };
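The lock classes above exist so that lockdep can validate code that takes two different inodes' i_mutex at once. As a rough, illustrative sketch (assuming the generic mutex_lock_nested() annotation API and hypothetical dir/inode locals), the documented parent -> child order might be expressed as:

	/* take the parent directory first, then the child, and tell
	 * lockdep which nesting class each acquisition belongs to */
	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
	/* ... operate on both inodes ... */
	mutex_unlock(&inode->i_mutex);
	mutex_unlock(&dir->i_mutex);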
679 679
680 extern void inode_double_lock(struct inode *inode1, struct inode *inode2); 680 extern void inode_double_lock(struct inode *inode1, struct inode *inode2);
681 extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); 681 extern void inode_double_unlock(struct inode *inode1, struct inode *inode2);
682 682
683 /* 683 /*
684 * NOTE: in a 32bit arch with a preemptable kernel and 684 * NOTE: in a 32bit arch with a preemptable kernel and
685 * an UP compile the i_size_read/write must be atomic 685 * an UP compile the i_size_read/write must be atomic
686 * with respect to the local cpu (unlike with preempt disabled), 686 * with respect to the local cpu (unlike with preempt disabled),
687 * but they don't need to be atomic with respect to other cpus like in 687 * but they don't need to be atomic with respect to other cpus like in
688 * true SMP (so they either need to locally disable irq around 688 * true SMP (so they either need to locally disable irq around
689 * the read or, for example on x86, they can still be implemented as a 689 * the read or, for example on x86, they can still be implemented as a
690 * cmpxchg8b without the need of the lock prefix). For SMP compiles 690 * cmpxchg8b without the need of the lock prefix). For SMP compiles
691 * and 64bit archs it makes no difference if preempt is enabled or not. 691 * and 64bit archs it makes no difference if preempt is enabled or not.
692 */ 692 */
693 static inline loff_t i_size_read(const struct inode *inode) 693 static inline loff_t i_size_read(const struct inode *inode)
694 { 694 {
695 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 695 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
696 loff_t i_size; 696 loff_t i_size;
697 unsigned int seq; 697 unsigned int seq;
698 698
699 do { 699 do {
700 seq = read_seqcount_begin(&inode->i_size_seqcount); 700 seq = read_seqcount_begin(&inode->i_size_seqcount);
701 i_size = inode->i_size; 701 i_size = inode->i_size;
702 } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); 702 } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
703 return i_size; 703 return i_size;
704 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) 704 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
705 loff_t i_size; 705 loff_t i_size;
706 706
707 preempt_disable(); 707 preempt_disable();
708 i_size = inode->i_size; 708 i_size = inode->i_size;
709 preempt_enable(); 709 preempt_enable();
710 return i_size; 710 return i_size;
711 #else 711 #else
712 return inode->i_size; 712 return inode->i_size;
713 #endif 713 #endif
714 } 714 }
715 715
716 /* 716 /*
717 * NOTE: unlike i_size_read(), i_size_write() does need locking around it 717 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
718 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount 718 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
719 * can be lost, resulting in subsequent i_size_read() calls spinning forever. 719 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
720 */ 720 */
721 static inline void i_size_write(struct inode *inode, loff_t i_size) 721 static inline void i_size_write(struct inode *inode, loff_t i_size)
722 { 722 {
723 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 723 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
724 write_seqcount_begin(&inode->i_size_seqcount); 724 write_seqcount_begin(&inode->i_size_seqcount);
725 inode->i_size = i_size; 725 inode->i_size = i_size;
726 write_seqcount_end(&inode->i_size_seqcount); 726 write_seqcount_end(&inode->i_size_seqcount);
727 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) 727 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
728 preempt_disable(); 728 preempt_disable();
729 inode->i_size = i_size; 729 inode->i_size = i_size;
730 preempt_enable(); 730 preempt_enable();
731 #else 731 #else
732 inode->i_size = i_size; 732 inode->i_size = i_size;
733 #endif 733 #endif
734 } 734 }
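Putting the two helpers together: updates to i_size need external serialization (normally i_mutex), while readers can stay lockless. A minimal sketch, assuming a hypothetical foofs_extend() that is called with i_mutex already held:

	static void foofs_extend(struct inode *inode, loff_t new_size)
	{
		/* caller holds inode->i_mutex (or an equivalent lock) */
		i_size_write(inode, new_size);
		mark_inode_dirty(inode);
	}

	static loff_t foofs_size(struct inode *inode)
	{
		/* lockless: i_size_read() copes with 32-bit tearing itself */
		return i_size_read(inode);
	}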
735 735
736 static inline unsigned iminor(const struct inode *inode) 736 static inline unsigned iminor(const struct inode *inode)
737 { 737 {
738 return MINOR(inode->i_rdev); 738 return MINOR(inode->i_rdev);
739 } 739 }
740 740
741 static inline unsigned imajor(const struct inode *inode) 741 static inline unsigned imajor(const struct inode *inode)
742 { 742 {
743 return MAJOR(inode->i_rdev); 743 return MAJOR(inode->i_rdev);
744 } 744 }
745 745
746 extern struct block_device *I_BDEV(struct inode *inode); 746 extern struct block_device *I_BDEV(struct inode *inode);
747 747
748 struct fown_struct { 748 struct fown_struct {
749 rwlock_t lock; /* protects pid, uid, euid fields */ 749 rwlock_t lock; /* protects pid, uid, euid fields */
750 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ 750 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
751 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ 751 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
752 uid_t uid, euid; /* uid/euid of process setting the owner */ 752 uid_t uid, euid; /* uid/euid of process setting the owner */
753 int signum; /* posix.1b rt signal to be delivered on IO */ 753 int signum; /* posix.1b rt signal to be delivered on IO */
754 }; 754 };
755 755
756 /* 756 /*
757 * Track a single file's readahead state 757 * Track a single file's readahead state
758 */ 758 */
759 struct file_ra_state { 759 struct file_ra_state {
760 pgoff_t start; /* where readahead started */ 760 pgoff_t start; /* where readahead started */
761 unsigned int size; /* # of readahead pages */ 761 unsigned int size; /* # of readahead pages */
762 unsigned int async_size; /* do asynchronous readahead when 762 unsigned int async_size; /* do asynchronous readahead when
763 there are only # of pages ahead */ 763 there are only # of pages ahead */
764 764
765 unsigned int ra_pages; /* Maximum readahead window */ 765 unsigned int ra_pages; /* Maximum readahead window */
766 int mmap_miss; /* Cache miss stat for mmap accesses */ 766 int mmap_miss; /* Cache miss stat for mmap accesses */
767 loff_t prev_pos; /* Cache last read() position */ 767 loff_t prev_pos; /* Cache last read() position */
768 }; 768 };
769 769
770 /* 770 /*
771 * Check if @index falls in the readahead windows. 771 * Check if @index falls in the readahead windows.
772 */ 772 */
773 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) 773 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
774 { 774 {
775 return (index >= ra->start && 775 return (index >= ra->start &&
776 index < ra->start + ra->size); 776 index < ra->start + ra->size);
777 } 777 }
778 778
779 struct file { 779 struct file {
780 /* 780 /*
781 * fu_list becomes invalid after file_free is called and queued via 781 * fu_list becomes invalid after file_free is called and queued via
782 * fu_rcuhead for RCU freeing 782 * fu_rcuhead for RCU freeing
783 */ 783 */
784 union { 784 union {
785 struct list_head fu_list; 785 struct list_head fu_list;
786 struct rcu_head fu_rcuhead; 786 struct rcu_head fu_rcuhead;
787 } f_u; 787 } f_u;
788 struct path f_path; 788 struct path f_path;
789 #define f_dentry f_path.dentry 789 #define f_dentry f_path.dentry
790 #define f_vfsmnt f_path.mnt 790 #define f_vfsmnt f_path.mnt
791 const struct file_operations *f_op; 791 const struct file_operations *f_op;
792 atomic_t f_count; 792 atomic_t f_count;
793 unsigned int f_flags; 793 unsigned int f_flags;
794 mode_t f_mode; 794 mode_t f_mode;
795 loff_t f_pos; 795 loff_t f_pos;
796 struct fown_struct f_owner; 796 struct fown_struct f_owner;
797 unsigned int f_uid, f_gid; 797 unsigned int f_uid, f_gid;
798 struct file_ra_state f_ra; 798 struct file_ra_state f_ra;
799 799
800 u64 f_version; 800 u64 f_version;
801 #ifdef CONFIG_SECURITY 801 #ifdef CONFIG_SECURITY
802 void *f_security; 802 void *f_security;
803 #endif 803 #endif
804 /* needed for tty driver, and maybe others */ 804 /* needed for tty driver, and maybe others */
805 void *private_data; 805 void *private_data;
806 806
807 #ifdef CONFIG_EPOLL 807 #ifdef CONFIG_EPOLL
808 /* Used by fs/eventpoll.c to link all the hooks to this file */ 808 /* Used by fs/eventpoll.c to link all the hooks to this file */
809 struct list_head f_ep_links; 809 struct list_head f_ep_links;
810 spinlock_t f_ep_lock; 810 spinlock_t f_ep_lock;
811 #endif /* #ifdef CONFIG_EPOLL */ 811 #endif /* #ifdef CONFIG_EPOLL */
812 struct address_space *f_mapping; 812 struct address_space *f_mapping;
813 }; 813 };
814 extern spinlock_t files_lock; 814 extern spinlock_t files_lock;
815 #define file_list_lock() spin_lock(&files_lock); 815 #define file_list_lock() spin_lock(&files_lock);
816 #define file_list_unlock() spin_unlock(&files_lock); 816 #define file_list_unlock() spin_unlock(&files_lock);
817 817
818 #define get_file(x) atomic_inc(&(x)->f_count) 818 #define get_file(x) atomic_inc(&(x)->f_count)
819 #define file_count(x) atomic_read(&(x)->f_count) 819 #define file_count(x) atomic_read(&(x)->f_count)
820 820
821 #define MAX_NON_LFS ((1UL<<31) - 1) 821 #define MAX_NON_LFS ((1UL<<31) - 1)
822 822
823 /* Page cache limit. The filesystems should put that into their s_maxbytes 823 /* Page cache limit. The filesystems should put that into their s_maxbytes
824 limits, otherwise bad things can happen in VM. */ 824 limits, otherwise bad things can happen in VM. */
825 #if BITS_PER_LONG==32 825 #if BITS_PER_LONG==32
826 #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) 826 #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
827 #elif BITS_PER_LONG==64 827 #elif BITS_PER_LONG==64
828 #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL 828 #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL
829 #endif 829 #endif
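As the comment above says, a filesystem is expected to clamp its own s_maxbytes to this page cache limit. A hedged sketch of what that looks like in a fill_super routine (foofs_fill_super and the block size choice are hypothetical):

	static int foofs_fill_super(struct super_block *sb, void *data, int silent)
	{
		sb->s_maxbytes = MAX_LFS_FILESIZE;	/* page cache imposed cap */
		sb->s_blocksize = PAGE_CACHE_SIZE;
		sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
		/* ... read the on-disk superblock, set sb->s_op,
		 *     look up the root inode and d_alloc_root() it ... */
		return 0;
	}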
830 830
831 #define FL_POSIX 1 831 #define FL_POSIX 1
832 #define FL_FLOCK 2 832 #define FL_FLOCK 2
833 #define FL_ACCESS 8 /* not trying to lock, just looking */ 833 #define FL_ACCESS 8 /* not trying to lock, just looking */
834 #define FL_EXISTS 16 /* when unlocking, test for existence */ 834 #define FL_EXISTS 16 /* when unlocking, test for existence */
835 #define FL_LEASE 32 /* lease held on this file */ 835 #define FL_LEASE 32 /* lease held on this file */
836 #define FL_CLOSE 64 /* unlock on close */ 836 #define FL_CLOSE 64 /* unlock on close */
837 #define FL_SLEEP 128 /* A blocking lock */ 837 #define FL_SLEEP 128 /* A blocking lock */
838 838
839 /* 839 /*
840 * The POSIX file lock owner is determined by 840 * The POSIX file lock owner is determined by
841 * the "struct files_struct" in the thread group 841 * the "struct files_struct" in the thread group
842 * (or NULL for no owner - BSD locks). 842 * (or NULL for no owner - BSD locks).
843 * 843 *
844 * Lockd stuffs a "host" pointer into this. 844 * Lockd stuffs a "host" pointer into this.
845 */ 845 */
846 typedef struct files_struct *fl_owner_t; 846 typedef struct files_struct *fl_owner_t;
847 847
848 struct file_lock_operations { 848 struct file_lock_operations {
849 void (*fl_insert)(struct file_lock *); /* lock insertion callback */ 849 void (*fl_insert)(struct file_lock *); /* lock insertion callback */
850 void (*fl_remove)(struct file_lock *); /* lock removal callback */ 850 void (*fl_remove)(struct file_lock *); /* lock removal callback */
851 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 851 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
852 void (*fl_release_private)(struct file_lock *); 852 void (*fl_release_private)(struct file_lock *);
853 }; 853 };
854 854
855 struct lock_manager_operations { 855 struct lock_manager_operations {
856 int (*fl_compare_owner)(struct file_lock *, struct file_lock *); 856 int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
857 void (*fl_notify)(struct file_lock *); /* unblock callback */ 857 void (*fl_notify)(struct file_lock *); /* unblock callback */
858 int (*fl_grant)(struct file_lock *, struct file_lock *, int); 858 int (*fl_grant)(struct file_lock *, struct file_lock *, int);
859 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 859 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
860 void (*fl_release_private)(struct file_lock *); 860 void (*fl_release_private)(struct file_lock *);
861 void (*fl_break)(struct file_lock *); 861 void (*fl_break)(struct file_lock *);
862 int (*fl_mylease)(struct file_lock *, struct file_lock *); 862 int (*fl_mylease)(struct file_lock *, struct file_lock *);
863 int (*fl_change)(struct file_lock **, int); 863 int (*fl_change)(struct file_lock **, int);
864 }; 864 };
865 865
866 /* that will die - we need it for nfs_lock_info */ 866 /* that will die - we need it for nfs_lock_info */
867 #include <linux/nfs_fs_i.h> 867 #include <linux/nfs_fs_i.h>
868 868
869 struct file_lock { 869 struct file_lock {
870 struct file_lock *fl_next; /* singly linked list for this inode */ 870 struct file_lock *fl_next; /* singly linked list for this inode */
871 struct list_head fl_link; /* doubly linked list of all locks */ 871 struct list_head fl_link; /* doubly linked list of all locks */
872 struct list_head fl_block; /* circular list of blocked processes */ 872 struct list_head fl_block; /* circular list of blocked processes */
873 fl_owner_t fl_owner; 873 fl_owner_t fl_owner;
874 unsigned int fl_pid; 874 unsigned int fl_pid;
875 struct pid *fl_nspid; 875 struct pid *fl_nspid;
876 wait_queue_head_t fl_wait; 876 wait_queue_head_t fl_wait;
877 struct file *fl_file; 877 struct file *fl_file;
878 unsigned char fl_flags; 878 unsigned char fl_flags;
879 unsigned char fl_type; 879 unsigned char fl_type;
880 loff_t fl_start; 880 loff_t fl_start;
881 loff_t fl_end; 881 loff_t fl_end;
882 882
883 struct fasync_struct * fl_fasync; /* for lease break notifications */ 883 struct fasync_struct * fl_fasync; /* for lease break notifications */
884 unsigned long fl_break_time; /* for nonblocking lease breaks */ 884 unsigned long fl_break_time; /* for nonblocking lease breaks */
885 885
886 struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ 886 struct file_lock_operations *fl_ops; /* Callbacks for filesystems */
887 struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ 887 struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */
888 union { 888 union {
889 struct nfs_lock_info nfs_fl; 889 struct nfs_lock_info nfs_fl;
890 struct nfs4_lock_info nfs4_fl; 890 struct nfs4_lock_info nfs4_fl;
891 struct { 891 struct {
892 struct list_head link; /* link in AFS vnode's pending_locks list */ 892 struct list_head link; /* link in AFS vnode's pending_locks list */
893 int state; /* state of grant or error if -ve */ 893 int state; /* state of grant or error if -ve */
894 } afs; 894 } afs;
895 } fl_u; 895 } fl_u;
896 }; 896 };
897 897
898 /* The following constant reflects the upper bound of the file/locking space */ 898 /* The following constant reflects the upper bound of the file/locking space */
899 #ifndef OFFSET_MAX 899 #ifndef OFFSET_MAX
900 #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) 900 #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1)))
901 #define OFFSET_MAX INT_LIMIT(loff_t) 901 #define OFFSET_MAX INT_LIMIT(loff_t)
902 #define OFFT_OFFSET_MAX INT_LIMIT(off_t) 902 #define OFFT_OFFSET_MAX INT_LIMIT(off_t)
903 #endif 903 #endif
904 904
905 #include <linux/fcntl.h> 905 #include <linux/fcntl.h>
906 906
907 extern int fcntl_getlk(struct file *, struct flock __user *); 907 extern int fcntl_getlk(struct file *, struct flock __user *);
908 extern int fcntl_setlk(unsigned int, struct file *, unsigned int, 908 extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
909 struct flock __user *); 909 struct flock __user *);
910 910
911 #if BITS_PER_LONG == 32 911 #if BITS_PER_LONG == 32
912 extern int fcntl_getlk64(struct file *, struct flock64 __user *); 912 extern int fcntl_getlk64(struct file *, struct flock64 __user *);
913 extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, 913 extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
914 struct flock64 __user *); 914 struct flock64 __user *);
915 #endif 915 #endif
916 916
917 extern void send_sigio(struct fown_struct *fown, int fd, int band); 917 extern void send_sigio(struct fown_struct *fown, int fd, int band);
918 extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); 918 extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
919 extern int fcntl_getlease(struct file *filp); 919 extern int fcntl_getlease(struct file *filp);
920 920
921 /* fs/sync.c */ 921 /* fs/sync.c */
922 extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, 922 extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
923 loff_t endbyte, unsigned int flags); 923 loff_t endbyte, unsigned int flags);
924 924
925 /* fs/locks.c */ 925 /* fs/locks.c */
926 extern void locks_init_lock(struct file_lock *); 926 extern void locks_init_lock(struct file_lock *);
927 extern void locks_copy_lock(struct file_lock *, struct file_lock *); 927 extern void locks_copy_lock(struct file_lock *, struct file_lock *);
928 extern void locks_remove_posix(struct file *, fl_owner_t); 928 extern void locks_remove_posix(struct file *, fl_owner_t);
929 extern void locks_remove_flock(struct file *); 929 extern void locks_remove_flock(struct file *);
930 extern void posix_test_lock(struct file *, struct file_lock *); 930 extern void posix_test_lock(struct file *, struct file_lock *);
931 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); 931 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
932 extern int posix_lock_file_wait(struct file *, struct file_lock *); 932 extern int posix_lock_file_wait(struct file *, struct file_lock *);
933 extern int posix_unblock_lock(struct file *, struct file_lock *); 933 extern int posix_unblock_lock(struct file *, struct file_lock *);
934 extern int vfs_test_lock(struct file *, struct file_lock *); 934 extern int vfs_test_lock(struct file *, struct file_lock *);
935 extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); 935 extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
936 extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); 936 extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
937 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); 937 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
938 extern int __break_lease(struct inode *inode, unsigned int flags); 938 extern int __break_lease(struct inode *inode, unsigned int flags);
939 extern void lease_get_mtime(struct inode *, struct timespec *time); 939 extern void lease_get_mtime(struct inode *, struct timespec *time);
940 extern int generic_setlease(struct file *, long, struct file_lock **); 940 extern int generic_setlease(struct file *, long, struct file_lock **);
941 extern int vfs_setlease(struct file *, long, struct file_lock **); 941 extern int vfs_setlease(struct file *, long, struct file_lock **);
942 extern int lease_modify(struct file_lock **, int); 942 extern int lease_modify(struct file_lock **, int);
943 extern int lock_may_read(struct inode *, loff_t start, unsigned long count); 943 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
944 extern int lock_may_write(struct inode *, loff_t start, unsigned long count); 944 extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
945 extern struct seq_operations locks_seq_operations; 945 extern struct seq_operations locks_seq_operations;
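These exports let a filesystem's ->lock method fall back to purely local POSIX locking when there is no remote lock server involved. A rough sketch (foofs_lock is hypothetical; IS_GETLK is the usual fcntl command test):

	static int foofs_lock(struct file *filp, int cmd, struct file_lock *fl)
	{
		if (IS_GETLK(cmd)) {
			posix_test_lock(filp, fl);	/* reports any conflict in *fl */
			return 0;
		}
		/* F_SETLK/F_SETLKW: take, or wait for, the lock locally */
		return posix_lock_file_wait(filp, fl);
	}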
946 946
947 struct fasync_struct { 947 struct fasync_struct {
948 int magic; 948 int magic;
949 int fa_fd; 949 int fa_fd;
950 struct fasync_struct *fa_next; /* singly linked list */ 950 struct fasync_struct *fa_next; /* singly linked list */
951 struct file *fa_file; 951 struct file *fa_file;
952 }; 952 };
953 953
954 #define FASYNC_MAGIC 0x4601 954 #define FASYNC_MAGIC 0x4601
955 955
956 /* SMP safe fasync helpers: */ 956 /* SMP safe fasync helpers: */
957 extern int fasync_helper(int, struct file *, int, struct fasync_struct **); 957 extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
958 /* can be called from interrupts */ 958 /* can be called from interrupts */
959 extern void kill_fasync(struct fasync_struct **, int, int); 959 extern void kill_fasync(struct fasync_struct **, int, int);
960 /* only for net: no internal synchronization */ 960 /* only for net: no internal synchronization */
961 extern void __kill_fasync(struct fasync_struct *, int, int); 961 extern void __kill_fasync(struct fasync_struct *, int, int);
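A typical user of these helpers is a driver that keeps its own fasync list: the ->fasync file operation registers or unregisters the caller, and kill_fasync() is fired when new data shows up. A sketch with hypothetical foo_* names:

	static struct fasync_struct *foo_fasync_queue;

	static int foo_fasync(int fd, struct file *filp, int on)
	{
		return fasync_helper(fd, filp, on, &foo_fasync_queue);
	}

	static void foo_data_ready(void)
	{
		/* safe from interrupt context, as noted above */
		kill_fasync(&foo_fasync_queue, SIGIO, POLL_IN);
	}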
962 962
963 extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); 963 extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
964 extern int f_setown(struct file *filp, unsigned long arg, int force); 964 extern int f_setown(struct file *filp, unsigned long arg, int force);
965 extern void f_delown(struct file *filp); 965 extern void f_delown(struct file *filp);
966 extern pid_t f_getown(struct file *filp); 966 extern pid_t f_getown(struct file *filp);
967 extern int send_sigurg(struct fown_struct *fown); 967 extern int send_sigurg(struct fown_struct *fown);
968 968
969 /* 969 /*
970 * Umount options 970 * Umount options
971 */ 971 */
972 972
973 #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */ 973 #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */
974 #define MNT_DETACH 0x00000002 /* Just detach from the tree */ 974 #define MNT_DETACH 0x00000002 /* Just detach from the tree */
975 #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ 975 #define MNT_EXPIRE 0x00000004 /* Mark for expiry */
976 976
977 extern struct list_head super_blocks; 977 extern struct list_head super_blocks;
978 extern spinlock_t sb_lock; 978 extern spinlock_t sb_lock;
979 979
980 #define S_BIAS (1<<30) 980 #define S_BIAS (1<<30)
981 struct super_block { 981 struct super_block {
982 struct list_head s_list; /* Keep this first */ 982 struct list_head s_list; /* Keep this first */
983 dev_t s_dev; /* search index; _not_ kdev_t */ 983 dev_t s_dev; /* search index; _not_ kdev_t */
984 unsigned long s_blocksize; 984 unsigned long s_blocksize;
985 unsigned char s_blocksize_bits; 985 unsigned char s_blocksize_bits;
986 unsigned char s_dirt; 986 unsigned char s_dirt;
987 unsigned long long s_maxbytes; /* Max file size */ 987 unsigned long long s_maxbytes; /* Max file size */
988 struct file_system_type *s_type; 988 struct file_system_type *s_type;
989 const struct super_operations *s_op; 989 const struct super_operations *s_op;
990 struct dquot_operations *dq_op; 990 struct dquot_operations *dq_op;
991 struct quotactl_ops *s_qcop; 991 struct quotactl_ops *s_qcop;
992 const struct export_operations *s_export_op; 992 const struct export_operations *s_export_op;
993 unsigned long s_flags; 993 unsigned long s_flags;
994 unsigned long s_magic; 994 unsigned long s_magic;
995 struct dentry *s_root; 995 struct dentry *s_root;
996 struct rw_semaphore s_umount; 996 struct rw_semaphore s_umount;
997 struct mutex s_lock; 997 struct mutex s_lock;
998 int s_count; 998 int s_count;
999 int s_syncing; 999 int s_syncing;
1000 int s_need_sync_fs; 1000 int s_need_sync_fs;
1001 atomic_t s_active; 1001 atomic_t s_active;
1002 #ifdef CONFIG_SECURITY 1002 #ifdef CONFIG_SECURITY
1003 void *s_security; 1003 void *s_security;
1004 #endif 1004 #endif
1005 struct xattr_handler **s_xattr; 1005 struct xattr_handler **s_xattr;
1006 1006
1007 struct list_head s_inodes; /* all inodes */ 1007 struct list_head s_inodes; /* all inodes */
1008 struct list_head s_dirty; /* dirty inodes */ 1008 struct list_head s_dirty; /* dirty inodes */
1009 struct list_head s_io; /* parked for writeback */ 1009 struct list_head s_io; /* parked for writeback */
1010 struct list_head s_more_io; /* parked for more writeback */ 1010 struct list_head s_more_io; /* parked for more writeback */
1011 struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ 1011 struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
1012 struct list_head s_files; 1012 struct list_head s_files;
1013 1013
1014 struct block_device *s_bdev; 1014 struct block_device *s_bdev;
1015 struct mtd_info *s_mtd; 1015 struct mtd_info *s_mtd;
1016 struct list_head s_instances; 1016 struct list_head s_instances;
1017 struct quota_info s_dquot; /* Diskquota specific options */ 1017 struct quota_info s_dquot; /* Diskquota specific options */
1018 1018
1019 int s_frozen; 1019 int s_frozen;
1020 wait_queue_head_t s_wait_unfrozen; 1020 wait_queue_head_t s_wait_unfrozen;
1021 1021
1022 char s_id[32]; /* Informational name */ 1022 char s_id[32]; /* Informational name */
1023 1023
1024 void *s_fs_info; /* Filesystem private info */ 1024 void *s_fs_info; /* Filesystem private info */
1025 1025
1026 /* 1026 /*
1027 * The next field is for VFS *only*. No filesystems have any business 1027 * The next field is for VFS *only*. No filesystems have any business
1028 * even looking at it. You have been warned. 1028 * even looking at it. You have been warned.
1029 */ 1029 */
1030 struct mutex s_vfs_rename_mutex; /* Kludge */ 1030 struct mutex s_vfs_rename_mutex; /* Kludge */
1031 1031
1032 /* Granularity of c/m/atime in ns. 1032 /* Granularity of c/m/atime in ns.
1033 Cannot be worse than a second */ 1033 Cannot be worse than a second */
1034 u32 s_time_gran; 1034 u32 s_time_gran;
1035 1035
1036 /* 1036 /*
1037 * Filesystem subtype. If non-empty the filesystem type field 1037 * Filesystem subtype. If non-empty the filesystem type field
1038 * in /proc/mounts will be "type.subtype" 1038 * in /proc/mounts will be "type.subtype"
1039 */ 1039 */
1040 char *s_subtype; 1040 char *s_subtype;
1041 }; 1041 };
1042 1042
1043 extern struct timespec current_fs_time(struct super_block *sb); 1043 extern struct timespec current_fs_time(struct super_block *sb);
1044 1044
1045 /* 1045 /*
1046 * Snapshotting support. 1046 * Snapshotting support.
1047 */ 1047 */
1048 enum { 1048 enum {
1049 SB_UNFROZEN = 0, 1049 SB_UNFROZEN = 0,
1050 SB_FREEZE_WRITE = 1, 1050 SB_FREEZE_WRITE = 1,
1051 SB_FREEZE_TRANS = 2, 1051 SB_FREEZE_TRANS = 2,
1052 }; 1052 };
1053 1053
1054 #define vfs_check_frozen(sb, level) \ 1054 #define vfs_check_frozen(sb, level) \
1055 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) 1055 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
1056 1056
1057 #define get_fs_excl() atomic_inc(&current->fs_excl) 1057 #define get_fs_excl() atomic_inc(&current->fs_excl)
1058 #define put_fs_excl() atomic_dec(&current->fs_excl) 1058 #define put_fs_excl() atomic_dec(&current->fs_excl)
1059 #define has_fs_excl() atomic_read(&current->fs_excl) 1059 #define has_fs_excl() atomic_read(&current->fs_excl)
1060 1060
1061 #define is_owner_or_cap(inode) \ 1061 #define is_owner_or_cap(inode) \
1062 ((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER)) 1062 ((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER))
1063 1063
1064 /* not quite ready to be deprecated, but... */ 1064 /* not quite ready to be deprecated, but... */
1065 extern void lock_super(struct super_block *); 1065 extern void lock_super(struct super_block *);
1066 extern void unlock_super(struct super_block *); 1066 extern void unlock_super(struct super_block *);
1067 1067
1068 /* 1068 /*
1069 * VFS helper functions.. 1069 * VFS helper functions..
1070 */ 1070 */
1071 extern int vfs_permission(struct nameidata *, int); 1071 extern int vfs_permission(struct nameidata *, int);
1072 extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); 1072 extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *);
1073 extern int vfs_mkdir(struct inode *, struct dentry *, int); 1073 extern int vfs_mkdir(struct inode *, struct dentry *, int);
1074 extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); 1074 extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
1075 extern int vfs_symlink(struct inode *, struct dentry *, const char *, int); 1075 extern int vfs_symlink(struct inode *, struct dentry *, const char *, int);
1076 extern int vfs_link(struct dentry *, struct inode *, struct dentry *); 1076 extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
1077 extern int vfs_rmdir(struct inode *, struct dentry *); 1077 extern int vfs_rmdir(struct inode *, struct dentry *);
1078 extern int vfs_unlink(struct inode *, struct dentry *); 1078 extern int vfs_unlink(struct inode *, struct dentry *);
1079 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); 1079 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
1080 1080
1081 /* 1081 /*
1082 * VFS dentry helper functions. 1082 * VFS dentry helper functions.
1083 */ 1083 */
1084 extern void dentry_unhash(struct dentry *dentry); 1084 extern void dentry_unhash(struct dentry *dentry);
1085 1085
1086 /* 1086 /*
1087 * VFS file helper functions. 1087 * VFS file helper functions.
1088 */ 1088 */
1089 extern int file_permission(struct file *, int); 1089 extern int file_permission(struct file *, int);
1090 1090
1091 /* 1091 /*
1092 * File types 1092 * File types
1093 * 1093 *
1094 * NOTE! These match bits 12..15 of stat.st_mode 1094 * NOTE! These match bits 12..15 of stat.st_mode
1095 * (ie "(i_mode >> 12) & 15"). 1095 * (ie "(i_mode >> 12) & 15").
1096 */ 1096 */
1097 #define DT_UNKNOWN 0 1097 #define DT_UNKNOWN 0
1098 #define DT_FIFO 1 1098 #define DT_FIFO 1
1099 #define DT_CHR 2 1099 #define DT_CHR 2
1100 #define DT_DIR 4 1100 #define DT_DIR 4
1101 #define DT_BLK 6 1101 #define DT_BLK 6
1102 #define DT_REG 8 1102 #define DT_REG 8
1103 #define DT_LNK 10 1103 #define DT_LNK 10
1104 #define DT_SOCK 12 1104 #define DT_SOCK 12
1105 #define DT_WHT 14 1105 #define DT_WHT 14
1106 1106
1107 #define OSYNC_METADATA (1<<0) 1107 #define OSYNC_METADATA (1<<0)
1108 #define OSYNC_DATA (1<<1) 1108 #define OSYNC_DATA (1<<1)
1109 #define OSYNC_INODE (1<<2) 1109 #define OSYNC_INODE (1<<2)
1110 int generic_osync_inode(struct inode *, struct address_space *, int); 1110 int generic_osync_inode(struct inode *, struct address_space *, int);
1111 1111
1112 /* 1112 /*
1113 * This is the "filldir" function type, used by readdir() to let 1113 * This is the "filldir" function type, used by readdir() to let
1114 * the kernel specify what kind of dirent layout it wants to have. 1114 * the kernel specify what kind of dirent layout it wants to have.
1115 * This allows the kernel to read directories into kernel space or 1115 * This allows the kernel to read directories into kernel space or
1116 * to have different dirent layouts depending on the binary type. 1116 * to have different dirent layouts depending on the binary type.
1117 */ 1117 */
1118 typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); 1118 typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
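A ->readdir method calls the filldir callback once per entry and stops when it returns a negative value (the destination buffer is full). A hedged sketch of the usual opening moves, with a hypothetical foofs_readdir and assuming the parent_ino() helper is available:

	static int foofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
	{
		struct dentry *dentry = filp->f_path.dentry;
		struct inode *inode = dentry->d_inode;

		if (filp->f_pos == 0) {
			if (filldir(dirent, ".", 1, filp->f_pos,
				    inode->i_ino, DT_DIR) < 0)
				return 0;
			filp->f_pos++;
		}
		if (filp->f_pos == 1) {
			if (filldir(dirent, "..", 2, filp->f_pos,
				    parent_ino(dentry), DT_DIR) < 0)
				return 0;
			filp->f_pos++;
		}
		/* ... real entries follow; d_type is (i_mode >> 12) & 15 ... */
		return 0;
	}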
1119 1119
1120 struct block_device_operations { 1120 struct block_device_operations {
1121 int (*open) (struct inode *, struct file *); 1121 int (*open) (struct inode *, struct file *);
1122 int (*release) (struct inode *, struct file *); 1122 int (*release) (struct inode *, struct file *);
1123 int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); 1123 int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
1124 long (*unlocked_ioctl) (struct file *, unsigned, unsigned long); 1124 long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
1125 long (*compat_ioctl) (struct file *, unsigned, unsigned long); 1125 long (*compat_ioctl) (struct file *, unsigned, unsigned long);
1126 int (*direct_access) (struct block_device *, sector_t, unsigned long *); 1126 int (*direct_access) (struct block_device *, sector_t, unsigned long *);
1127 int (*media_changed) (struct gendisk *); 1127 int (*media_changed) (struct gendisk *);
1128 int (*revalidate_disk) (struct gendisk *); 1128 int (*revalidate_disk) (struct gendisk *);
1129 int (*getgeo)(struct block_device *, struct hd_geometry *); 1129 int (*getgeo)(struct block_device *, struct hd_geometry *);
1130 struct module *owner; 1130 struct module *owner;
1131 }; 1131 };
1132 1132
1133 /* 1133 /*
1134 * "descriptor" for what we're up to with a read. 1134 * "descriptor" for what we're up to with a read.
1135 * This allows us to use the same read code yet 1135 * This allows us to use the same read code yet
1136 * have multiple different users of the data that 1136 * have multiple different users of the data that
1137 * we read from a file. 1137 * we read from a file.
1138 * 1138 *
1139 * The simplest case just copies the data to user 1139 * The simplest case just copies the data to user
1140 * mode. 1140 * mode.
1141 */ 1141 */
1142 typedef struct { 1142 typedef struct {
1143 size_t written; 1143 size_t written;
1144 size_t count; 1144 size_t count;
1145 union { 1145 union {
1146 char __user * buf; 1146 char __user * buf;
1147 void *data; 1147 void *data;
1148 } arg; 1148 } arg;
1149 int error; 1149 int error;
1150 } read_descriptor_t; 1150 } read_descriptor_t;
1151 1151
1152 typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); 1152 typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long);
1153 1153
1154 /* These macros are for out of kernel modules to test that 1154 /* These macros are for out of kernel modules to test that
1155 * the kernel supports the unlocked_ioctl and compat_ioctl 1155 * the kernel supports the unlocked_ioctl and compat_ioctl
1156 * fields in struct file_operations. */ 1156 * fields in struct file_operations. */
1157 #define HAVE_COMPAT_IOCTL 1 1157 #define HAVE_COMPAT_IOCTL 1
1158 #define HAVE_UNLOCKED_IOCTL 1 1158 #define HAVE_UNLOCKED_IOCTL 1
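Out-of-tree code typically keys its file_operations initializer on these macros, roughly like the fragment below (the foo_* handlers are hypothetical):

	#ifdef HAVE_UNLOCKED_IOCTL
		.unlocked_ioctl	= foo_unlocked_ioctl,
	#else
		.ioctl		= foo_ioctl,
	#endif
	#ifdef HAVE_COMPAT_IOCTL
		.compat_ioctl	= foo_compat_ioctl,
	#endif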
1159 1159
1160 /* 1160 /*
1161 * NOTE: 1161 * NOTE:
1162 * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl 1162 * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl
1163 * can be called without the big kernel lock held in all filesystems. 1163 * can be called without the big kernel lock held in all filesystems.
1164 */ 1164 */
1165 struct file_operations { 1165 struct file_operations {
1166 struct module *owner; 1166 struct module *owner;
1167 loff_t (*llseek) (struct file *, loff_t, int); 1167 loff_t (*llseek) (struct file *, loff_t, int);
1168 ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); 1168 ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
1169 ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); 1169 ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
1170 ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); 1170 ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
1171 ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); 1171 ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
1172 int (*readdir) (struct file *, void *, filldir_t); 1172 int (*readdir) (struct file *, void *, filldir_t);
1173 unsigned int (*poll) (struct file *, struct poll_table_struct *); 1173 unsigned int (*poll) (struct file *, struct poll_table_struct *);
1174 int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); 1174 int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
1175 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 1175 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1176 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 1176 long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1177 int (*mmap) (struct file *, struct vm_area_struct *); 1177 int (*mmap) (struct file *, struct vm_area_struct *);
1178 int (*open) (struct inode *, struct file *); 1178 int (*open) (struct inode *, struct file *);
1179 int (*flush) (struct file *, fl_owner_t id); 1179 int (*flush) (struct file *, fl_owner_t id);
1180 int (*release) (struct inode *, struct file *); 1180 int (*release) (struct inode *, struct file *);
1181 int (*fsync) (struct file *, struct dentry *, int datasync); 1181 int (*fsync) (struct file *, struct dentry *, int datasync);
1182 int (*aio_fsync) (struct kiocb *, int datasync); 1182 int (*aio_fsync) (struct kiocb *, int datasync);
1183 int (*fasync) (int, struct file *, int); 1183 int (*fasync) (int, struct file *, int);
1184 int (*lock) (struct file *, int, struct file_lock *); 1184 int (*lock) (struct file *, int, struct file_lock *);
1185 ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); 1185 ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
1186 unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 1186 unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1187 int (*check_flags)(int); 1187 int (*check_flags)(int);
1188 int (*dir_notify)(struct file *filp, unsigned long arg); 1188 int (*dir_notify)(struct file *filp, unsigned long arg);
1189 int (*flock) (struct file *, int, struct file_lock *); 1189 int (*flock) (struct file *, int, struct file_lock *);
1190 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); 1190 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
1191 ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); 1191 ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
1192 int (*setlease)(struct file *, long, struct file_lock **); 1192 int (*setlease)(struct file *, long, struct file_lock **);
1193 }; 1193 };
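A filesystem fills in only the methods it implements; everything left NULL is simply skipped by the VFS. A minimal, illustrative initializer for a regular file, mixing common generic helpers with a hypothetical foofs_fsync:

	static const struct file_operations foofs_file_operations = {
		.owner		= THIS_MODULE,
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,
		.aio_read	= generic_file_aio_read,
		.write		= do_sync_write,
		.aio_write	= generic_file_aio_write,
		.mmap		= generic_file_mmap,
		.open		= generic_file_open,
		.fsync		= foofs_fsync,		/* hypothetical */
		.splice_read	= generic_file_splice_read,
	};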
1194 1194
1195 struct inode_operations { 1195 struct inode_operations {
1196 int (*create) (struct inode *,struct dentry *,int, struct nameidata *); 1196 int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
1197 struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); 1197 struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
1198 int (*link) (struct dentry *,struct inode *,struct dentry *); 1198 int (*link) (struct dentry *,struct inode *,struct dentry *);
1199 int (*unlink) (struct inode *,struct dentry *); 1199 int (*unlink) (struct inode *,struct dentry *);
1200 int (*symlink) (struct inode *,struct dentry *,const char *); 1200 int (*symlink) (struct inode *,struct dentry *,const char *);
1201 int (*mkdir) (struct inode *,struct dentry *,int); 1201 int (*mkdir) (struct inode *,struct dentry *,int);
1202 int (*rmdir) (struct inode *,struct dentry *); 1202 int (*rmdir) (struct inode *,struct dentry *);
1203 int (*mknod) (struct inode *,struct dentry *,int,dev_t); 1203 int (*mknod) (struct inode *,struct dentry *,int,dev_t);
1204 int (*rename) (struct inode *, struct dentry *, 1204 int (*rename) (struct inode *, struct dentry *,
1205 struct inode *, struct dentry *); 1205 struct inode *, struct dentry *);
1206 int (*readlink) (struct dentry *, char __user *,int); 1206 int (*readlink) (struct dentry *, char __user *,int);
1207 void * (*follow_link) (struct dentry *, struct nameidata *); 1207 void * (*follow_link) (struct dentry *, struct nameidata *);
1208 void (*put_link) (struct dentry *, struct nameidata *, void *); 1208 void (*put_link) (struct dentry *, struct nameidata *, void *);
1209 void (*truncate) (struct inode *); 1209 void (*truncate) (struct inode *);
1210 int (*permission) (struct inode *, int, struct nameidata *); 1210 int (*permission) (struct inode *, int, struct nameidata *);
1211 int (*setattr) (struct dentry *, struct iattr *); 1211 int (*setattr) (struct dentry *, struct iattr *);
1212 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); 1212 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
1213 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 1213 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
1214 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); 1214 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
1215 ssize_t (*listxattr) (struct dentry *, char *, size_t); 1215 ssize_t (*listxattr) (struct dentry *, char *, size_t);
1216 int (*removexattr) (struct dentry *, const char *); 1216 int (*removexattr) (struct dentry *, const char *);
1217 void (*truncate_range)(struct inode *, loff_t, loff_t); 1217 void (*truncate_range)(struct inode *, loff_t, loff_t);
1218 long (*fallocate)(struct inode *inode, int mode, loff_t offset, 1218 long (*fallocate)(struct inode *inode, int mode, loff_t offset,
1219 loff_t len); 1219 loff_t len);
1220 }; 1220 };
1221 1221
1222 struct seq_file; 1222 struct seq_file;
1223 1223
1224 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 1224 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
1225 unsigned long nr_segs, unsigned long fast_segs, 1225 unsigned long nr_segs, unsigned long fast_segs,
1226 struct iovec *fast_pointer, 1226 struct iovec *fast_pointer,
1227 struct iovec **ret_pointer); 1227 struct iovec **ret_pointer);
1228 1228
1229 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); 1229 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
1230 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); 1230 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
1231 extern ssize_t vfs_readv(struct file *, const struct iovec __user *, 1231 extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
1232 unsigned long, loff_t *); 1232 unsigned long, loff_t *);
1233 extern ssize_t vfs_writev(struct file *, const struct iovec __user *, 1233 extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
1234 unsigned long, loff_t *); 1234 unsigned long, loff_t *);
1235 1235
1236 /* 1236 /*
1237 * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called 1237 * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
1238 * without the big kernel lock held in all filesystems. 1238 * without the big kernel lock held in all filesystems.
1239 */ 1239 */
1240 struct super_operations { 1240 struct super_operations {
1241 struct inode *(*alloc_inode)(struct super_block *sb); 1241 struct inode *(*alloc_inode)(struct super_block *sb);
1242 void (*destroy_inode)(struct inode *); 1242 void (*destroy_inode)(struct inode *);
1243 1243
1244 void (*read_inode) (struct inode *);
1245
1246 void (*dirty_inode) (struct inode *); 1244 void (*dirty_inode) (struct inode *);
1247 int (*write_inode) (struct inode *, int); 1245 int (*write_inode) (struct inode *, int);
1248 void (*put_inode) (struct inode *); 1246 void (*put_inode) (struct inode *);
1249 void (*drop_inode) (struct inode *); 1247 void (*drop_inode) (struct inode *);
1250 void (*delete_inode) (struct inode *); 1248 void (*delete_inode) (struct inode *);
1251 void (*put_super) (struct super_block *); 1249 void (*put_super) (struct super_block *);
1252 void (*write_super) (struct super_block *); 1250 void (*write_super) (struct super_block *);
1253 int (*sync_fs)(struct super_block *sb, int wait); 1251 int (*sync_fs)(struct super_block *sb, int wait);
1254 void (*write_super_lockfs) (struct super_block *); 1252 void (*write_super_lockfs) (struct super_block *);
1255 void (*unlockfs) (struct super_block *); 1253 void (*unlockfs) (struct super_block *);
1256 int (*statfs) (struct dentry *, struct kstatfs *); 1254 int (*statfs) (struct dentry *, struct kstatfs *);
1257 int (*remount_fs) (struct super_block *, int *, char *); 1255 int (*remount_fs) (struct super_block *, int *, char *);
1258 void (*clear_inode) (struct inode *); 1256 void (*clear_inode) (struct inode *);
1259 void (*umount_begin) (struct vfsmount *, int); 1257 void (*umount_begin) (struct vfsmount *, int);
1260 1258
1261 int (*show_options)(struct seq_file *, struct vfsmount *); 1259 int (*show_options)(struct seq_file *, struct vfsmount *);
1262 int (*show_stats)(struct seq_file *, struct vfsmount *); 1260 int (*show_stats)(struct seq_file *, struct vfsmount *);
1263 #ifdef CONFIG_QUOTA 1261 #ifdef CONFIG_QUOTA
1264 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 1262 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
1265 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 1263 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
1266 #endif 1264 #endif
1267 }; 1265 };
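Note that the hunk above removes ->read_inode from this table, so a super_operations initializer now carries only the methods the filesystem actually provides, for example (all foofs_* functions hypothetical):

	static const struct super_operations foofs_super_operations = {
		.alloc_inode	= foofs_alloc_inode,
		.destroy_inode	= foofs_destroy_inode,
		.write_inode	= foofs_write_inode,
		.delete_inode	= foofs_delete_inode,
		.put_super	= foofs_put_super,
		.statfs		= foofs_statfs,
		.remount_fs	= foofs_remount,
		.show_options	= foofs_show_options,
	};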
1268 1266
1269 /* 1267 /*
1270 * Inode state bits. Protected by inode_lock. 1268 * Inode state bits. Protected by inode_lock.
1271 * 1269 *
1272 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, 1270 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
1273 * I_DIRTY_DATASYNC and I_DIRTY_PAGES. 1271 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
1274 * 1272 *
1275 * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, 1273 * Four bits define the lifetime of an inode. Initially, inodes are I_NEW,
1276 * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at 1274 * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at
1277 * various stages of removing an inode. 1275 * various stages of removing an inode.
1278 * 1276 *
1279 * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. 1277 * Two bits are used for locking and completion notification, I_LOCK and I_SYNC.
1280 * 1278 *
1281 * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on 1279 * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on
1282 * fdatasync(). i_atime is the usual cause. 1280 * fdatasync(). i_atime is the usual cause.
1283 * I_DIRTY_DATASYNC Inode is dirty and must be written on fdatasync(), f.e. 1281 * I_DIRTY_DATASYNC Inode is dirty and must be written on fdatasync(), f.e.
1284 * because i_size changed. 1282 * because i_size changed.
1285 * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. 1283 * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean.
1286 * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both 1284 * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both
1287 * are cleared by unlock_new_inode(), called from iget(). 1285 * are cleared by unlock_new_inode(), called from iget().
1288 * I_WILL_FREE Must be set when calling write_inode_now() if i_count 1286 * I_WILL_FREE Must be set when calling write_inode_now() if i_count
1289 * is zero. I_FREEING must be set when I_WILL_FREE is 1287 * is zero. I_FREEING must be set when I_WILL_FREE is
1290 * cleared. 1288 * cleared.
1291 * I_FREEING Set when inode is about to be freed but still has dirty 1289 * I_FREEING Set when inode is about to be freed but still has dirty
1292 * pages or buffers attached or the inode itself is still 1290 * pages or buffers attached or the inode itself is still
1293 * dirty. 1291 * dirty.
1294 * I_CLEAR Set by clear_inode(). In this state the inode is clean 1292 * I_CLEAR Set by clear_inode(). In this state the inode is clean
1295 * and can be destroyed. 1293 * and can be destroyed.
1296 * 1294 *
1297 * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are 1295 * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
1298 * prohibited for many purposes. iget() must wait for 1296 * prohibited for many purposes. iget() must wait for
1299 * the inode to be completely released, then create it 1297 * the inode to be completely released, then create it
1300 * anew. Other functions will just ignore such inodes, 1298 * anew. Other functions will just ignore such inodes,
1301 * if appropriate. I_LOCK is used for waiting. 1299 * if appropriate. I_LOCK is used for waiting.
1302 * 1300 *
1303 * I_LOCK Serves as both a mutex and completion notification. 1301 * I_LOCK Serves as both a mutex and completion notification.
1304 * New inodes set I_LOCK. If two processes both create 1302 * New inodes set I_LOCK. If two processes both create
1305 * the same inode, one of them will release its inode and 1303 * the same inode, one of them will release its inode and
1306 * wait for I_LOCK to be released before returning. 1304 * wait for I_LOCK to be released before returning.
1307 * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can 1305 * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
1308 * also cause waiting on I_LOCK, without I_LOCK actually 1306 * also cause waiting on I_LOCK, without I_LOCK actually
1309 * being set. find_inode() uses this to prevent returning 1307 * being set. find_inode() uses this to prevent returning
1310 * nearly-dead inodes. 1308 * nearly-dead inodes.
1311 * I_SYNC Similar to I_LOCK, but limited in scope to writeback 1309 * I_SYNC Similar to I_LOCK, but limited in scope to writeback
1312 * of inode dirty data. Having a separate lock for this 1310 * of inode dirty data. Having a separate lock for this
1313 * purpose reduces latency and prevents some filesystem- 1311 * purpose reduces latency and prevents some filesystem-
1314 * specific deadlocks. 1312 * specific deadlocks.
1315 * 1313 *
1316 * Q: What is the difference between I_WILL_FREE and I_FREEING? 1314 * Q: What is the difference between I_WILL_FREE and I_FREEING?
1317 * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on 1315 * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on
1318 * I_CLEAR? If not, why? 1316 * I_CLEAR? If not, why?
1319 */ 1317 */
1320 #define I_DIRTY_SYNC 1 1318 #define I_DIRTY_SYNC 1
1321 #define I_DIRTY_DATASYNC 2 1319 #define I_DIRTY_DATASYNC 2
1322 #define I_DIRTY_PAGES 4 1320 #define I_DIRTY_PAGES 4
1323 #define I_NEW 8 1321 #define I_NEW 8
1324 #define I_WILL_FREE 16 1322 #define I_WILL_FREE 16
1325 #define I_FREEING 32 1323 #define I_FREEING 32
1326 #define I_CLEAR 64 1324 #define I_CLEAR 64
1327 #define __I_LOCK 7 1325 #define __I_LOCK 7
1328 #define I_LOCK (1 << __I_LOCK) 1326 #define I_LOCK (1 << __I_LOCK)
1329 #define __I_SYNC 8 1327 #define __I_SYNC 8
1330 #define I_SYNC (1 << __I_SYNC) 1328 #define I_SYNC (1 << __I_SYNC)
1331 1329
1332 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) 1330 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
1333 1331
1334 extern void __mark_inode_dirty(struct inode *, int); 1332 extern void __mark_inode_dirty(struct inode *, int);
1335 static inline void mark_inode_dirty(struct inode *inode) 1333 static inline void mark_inode_dirty(struct inode *inode)
1336 { 1334 {
1337 __mark_inode_dirty(inode, I_DIRTY); 1335 __mark_inode_dirty(inode, I_DIRTY);
1338 } 1336 }
1339 1337
1340 static inline void mark_inode_dirty_sync(struct inode *inode) 1338 static inline void mark_inode_dirty_sync(struct inode *inode)
1341 { 1339 {
1342 __mark_inode_dirty(inode, I_DIRTY_SYNC); 1340 __mark_inode_dirty(inode, I_DIRTY_SYNC);
1343 } 1341 }
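
For illustration, a minimal sketch of how a filesystem might drive these helpers (examplefs_touch() is hypothetical and not part of this patch): an atime-only update only needs mark_inode_dirty_sync(), whereas a size change must use mark_inode_dirty() so that I_DIRTY_DATASYNC is set and fdatasync() writes the inode back.

	#include <linux/fs.h>
	#include <linux/time.h>

	/* Sketch only; the caller is assumed to hold i_mutex. */
	static void examplefs_touch(struct inode *inode, loff_t new_size)
	{
		inode->i_atime = current_fs_time(inode->i_sb);
		if (new_size == i_size_read(inode)) {
			/* atime-only change: no fdatasync() obligation */
			mark_inode_dirty_sync(inode);		/* I_DIRTY_SYNC */
			return;
		}
		i_size_write(inode, new_size);
		inode->i_mtime = inode->i_ctime = inode->i_atime;
		/* i_size changed, so fdatasync() must write the inode */
		mark_inode_dirty(inode);			/* I_DIRTY */
	}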
1344 1342
1345 /** 1343 /**
1346 * inc_nlink - directly increment an inode's link count 1344 * inc_nlink - directly increment an inode's link count
1347 * @inode: inode 1345 * @inode: inode
1348 * 1346 *
1349 * This is a low-level filesystem helper to replace any 1347 * This is a low-level filesystem helper to replace any
1350 * direct filesystem manipulation of i_nlink. Currently, 1348 * direct filesystem manipulation of i_nlink. Currently,
1351 * it is only here for parity with dec_nlink(). 1349 * it is only here for parity with dec_nlink().
1352 */ 1350 */
1353 static inline void inc_nlink(struct inode *inode) 1351 static inline void inc_nlink(struct inode *inode)
1354 { 1352 {
1355 inode->i_nlink++; 1353 inode->i_nlink++;
1356 } 1354 }
1357 1355
1358 static inline void inode_inc_link_count(struct inode *inode) 1356 static inline void inode_inc_link_count(struct inode *inode)
1359 { 1357 {
1360 inc_nlink(inode); 1358 inc_nlink(inode);
1361 mark_inode_dirty(inode); 1359 mark_inode_dirty(inode);
1362 } 1360 }
1363 1361
1364 /** 1362 /**
1365 * drop_nlink - directly drop an inode's link count 1363 * drop_nlink - directly drop an inode's link count
1366 * @inode: inode 1364 * @inode: inode
1367 * 1365 *
1368 * This is a low-level filesystem helper to replace any 1366 * This is a low-level filesystem helper to replace any
1369 * direct filesystem manipulation of i_nlink. In cases 1367 * direct filesystem manipulation of i_nlink. In cases
1370 * where we are attempting to track writes to the 1368 * where we are attempting to track writes to the
1371 * filesystem, a decrement to zero means an imminent 1369 * filesystem, a decrement to zero means an imminent
1372 * write when the file is truncated and actually unlinked 1370 * write when the file is truncated and actually unlinked
1373 * on the filesystem. 1371 * on the filesystem.
1374 */ 1372 */
1375 static inline void drop_nlink(struct inode *inode) 1373 static inline void drop_nlink(struct inode *inode)
1376 { 1374 {
1377 inode->i_nlink--; 1375 inode->i_nlink--;
1378 } 1376 }
1379 1377
1380 /** 1378 /**
1381 * clear_nlink - directly zero an inode's link count 1379 * clear_nlink - directly zero an inode's link count
1382 * @inode: inode 1380 * @inode: inode
1383 * 1381 *
1384 * This is a low-level filesystem helper to replace any 1382 * This is a low-level filesystem helper to replace any
1385 * direct filesystem manipulation of i_nlink. See 1383 * direct filesystem manipulation of i_nlink. See
1386 * drop_nlink() for why we care about i_nlink hitting zero. 1384 * drop_nlink() for why we care about i_nlink hitting zero.
1387 */ 1385 */
1388 static inline void clear_nlink(struct inode *inode) 1386 static inline void clear_nlink(struct inode *inode)
1389 { 1387 {
1390 inode->i_nlink = 0; 1388 inode->i_nlink = 0;
1391 } 1389 }
1392 1390
1393 static inline void inode_dec_link_count(struct inode *inode) 1391 static inline void inode_dec_link_count(struct inode *inode)
1394 { 1392 {
1395 drop_nlink(inode); 1393 drop_nlink(inode);
1396 mark_inode_dirty(inode); 1394 mark_inode_dirty(inode);
1397 } 1395 }
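
As a usage illustration (examplefs_link()/examplefs_unlink() are hypothetical and the on-disk directory updates are elided), link and unlink operations are expected to go through these helpers instead of poking i_nlink directly:

	#include <linux/fs.h>
	#include <linux/dcache.h>

	static int examplefs_link(struct dentry *old_dentry, struct inode *dir,
				  struct dentry *dentry)
	{
		struct inode *inode = old_dentry->d_inode;

		/* ... insert the new directory entry in @dir ... */
		inode_inc_link_count(inode);	/* inc_nlink() + mark_inode_dirty() */
		atomic_inc(&inode->i_count);	/* the new dentry pins the inode */
		d_instantiate(dentry, inode);
		return 0;
	}

	static int examplefs_unlink(struct inode *dir, struct dentry *dentry)
	{
		/* ... remove the directory entry from @dir ... */
		inode_dec_link_count(dentry->d_inode);	/* drop_nlink() + dirty */
		return 0;
	}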
1398 1396
1399 /** 1397 /**
1400 * inode_inc_iversion - increments i_version 1398 * inode_inc_iversion - increments i_version
1401 * @inode: inode that needs to be updated 1399 * @inode: inode that needs to be updated
1402 * 1400 *
1403 * Every time the inode is modified, the i_version field will be incremented. 1401 * Every time the inode is modified, the i_version field will be incremented.
1404 * The filesystem has to be mounted with the i_version mount option. 1402 * The filesystem has to be mounted with the i_version mount option.
1405 */ 1403 */
1406 1404
1407 static inline void inode_inc_iversion(struct inode *inode) 1405 static inline void inode_inc_iversion(struct inode *inode)
1408 { 1406 {
1409 spin_lock(&inode->i_lock); 1407 spin_lock(&inode->i_lock);
1410 inode->i_version++; 1408 inode->i_version++;
1411 spin_unlock(&inode->i_lock); 1409 spin_unlock(&inode->i_lock);
1412 } 1410 }
1413 1411
1414 extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); 1412 extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry);
1415 static inline void file_accessed(struct file *file) 1413 static inline void file_accessed(struct file *file)
1416 { 1414 {
1417 if (!(file->f_flags & O_NOATIME)) 1415 if (!(file->f_flags & O_NOATIME))
1418 touch_atime(file->f_path.mnt, file->f_path.dentry); 1416 touch_atime(file->f_path.mnt, file->f_path.dentry);
1419 } 1417 }
1420 1418
1421 int sync_inode(struct inode *inode, struct writeback_control *wbc); 1419 int sync_inode(struct inode *inode, struct writeback_control *wbc);
1422 1420
1423 struct file_system_type { 1421 struct file_system_type {
1424 const char *name; 1422 const char *name;
1425 int fs_flags; 1423 int fs_flags;
1426 int (*get_sb) (struct file_system_type *, int, 1424 int (*get_sb) (struct file_system_type *, int,
1427 const char *, void *, struct vfsmount *); 1425 const char *, void *, struct vfsmount *);
1428 void (*kill_sb) (struct super_block *); 1426 void (*kill_sb) (struct super_block *);
1429 struct module *owner; 1427 struct module *owner;
1430 struct file_system_type * next; 1428 struct file_system_type * next;
1431 struct list_head fs_supers; 1429 struct list_head fs_supers;
1432 1430
1433 struct lock_class_key s_lock_key; 1431 struct lock_class_key s_lock_key;
1434 struct lock_class_key s_umount_key; 1432 struct lock_class_key s_umount_key;
1435 1433
1436 struct lock_class_key i_lock_key; 1434 struct lock_class_key i_lock_key;
1437 struct lock_class_key i_mutex_key; 1435 struct lock_class_key i_mutex_key;
1438 struct lock_class_key i_mutex_dir_key; 1436 struct lock_class_key i_mutex_dir_key;
1439 struct lock_class_key i_alloc_sem_key; 1437 struct lock_class_key i_alloc_sem_key;
1440 }; 1438 };
1441 1439
1442 extern int get_sb_bdev(struct file_system_type *fs_type, 1440 extern int get_sb_bdev(struct file_system_type *fs_type,
1443 int flags, const char *dev_name, void *data, 1441 int flags, const char *dev_name, void *data,
1444 int (*fill_super)(struct super_block *, void *, int), 1442 int (*fill_super)(struct super_block *, void *, int),
1445 struct vfsmount *mnt); 1443 struct vfsmount *mnt);
1446 extern int get_sb_single(struct file_system_type *fs_type, 1444 extern int get_sb_single(struct file_system_type *fs_type,
1447 int flags, void *data, 1445 int flags, void *data,
1448 int (*fill_super)(struct super_block *, void *, int), 1446 int (*fill_super)(struct super_block *, void *, int),
1449 struct vfsmount *mnt); 1447 struct vfsmount *mnt);
1450 extern int get_sb_nodev(struct file_system_type *fs_type, 1448 extern int get_sb_nodev(struct file_system_type *fs_type,
1451 int flags, void *data, 1449 int flags, void *data,
1452 int (*fill_super)(struct super_block *, void *, int), 1450 int (*fill_super)(struct super_block *, void *, int),
1453 struct vfsmount *mnt); 1451 struct vfsmount *mnt);
1454 void generic_shutdown_super(struct super_block *sb); 1452 void generic_shutdown_super(struct super_block *sb);
1455 void kill_block_super(struct super_block *sb); 1453 void kill_block_super(struct super_block *sb);
1456 void kill_anon_super(struct super_block *sb); 1454 void kill_anon_super(struct super_block *sb);
1457 void kill_litter_super(struct super_block *sb); 1455 void kill_litter_super(struct super_block *sb);
1458 void deactivate_super(struct super_block *sb); 1456 void deactivate_super(struct super_block *sb);
1459 int set_anon_super(struct super_block *s, void *data); 1457 int set_anon_super(struct super_block *s, void *data);
1460 struct super_block *sget(struct file_system_type *type, 1458 struct super_block *sget(struct file_system_type *type,
1461 int (*test)(struct super_block *,void *), 1459 int (*test)(struct super_block *,void *),
1462 int (*set)(struct super_block *,void *), 1460 int (*set)(struct super_block *,void *),
1463 void *data); 1461 void *data);
1464 extern int get_sb_pseudo(struct file_system_type *, char *, 1462 extern int get_sb_pseudo(struct file_system_type *, char *,
1465 const struct super_operations *ops, unsigned long, 1463 const struct super_operations *ops, unsigned long,
1466 struct vfsmount *mnt); 1464 struct vfsmount *mnt);
1467 extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); 1465 extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
1468 int __put_super(struct super_block *sb); 1466 int __put_super(struct super_block *sb);
1469 int __put_super_and_need_restart(struct super_block *sb); 1467 int __put_super_and_need_restart(struct super_block *sb);
1470 void unnamed_dev_init(void); 1468 void unnamed_dev_init(void);
1471 1469
1472 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ 1470 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
1473 #define fops_get(fops) \ 1471 #define fops_get(fops) \
1474 (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) 1472 (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
1475 #define fops_put(fops) \ 1473 #define fops_put(fops) \
1476 do { if (fops) module_put((fops)->owner); } while(0) 1474 do { if (fops) module_put((fops)->owner); } while(0)
1477 1475
1478 extern int register_filesystem(struct file_system_type *); 1476 extern int register_filesystem(struct file_system_type *);
1479 extern int unregister_filesystem(struct file_system_type *); 1477 extern int unregister_filesystem(struct file_system_type *);
1480 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); 1478 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
1481 #define kern_mount(type) kern_mount_data(type, NULL) 1479 #define kern_mount(type) kern_mount_data(type, NULL)
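
To show how file_system_type, get_sb_nodev() and register_filesystem() fit together, here is a hypothetical module skeleton; examplefs_fill_super() is assumed to be defined elsewhere (one possible body built on libfs appears further down):

	#include <linux/fs.h>
	#include <linux/module.h>

	/* Assumed to exist; see the simple_fill_super() sketch below. */
	extern int examplefs_fill_super(struct super_block *sb, void *data, int silent);

	static int examplefs_get_sb(struct file_system_type *fs_type, int flags,
				    const char *dev_name, void *data,
				    struct vfsmount *mnt)
	{
		return get_sb_nodev(fs_type, flags, data, examplefs_fill_super, mnt);
	}

	static struct file_system_type examplefs_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "examplefs",
		.get_sb		= examplefs_get_sb,
		.kill_sb	= kill_anon_super,
	};

	static int __init examplefs_init(void)
	{
		return register_filesystem(&examplefs_fs_type);
	}

	static void __exit examplefs_exit(void)
	{
		unregister_filesystem(&examplefs_fs_type);
	}

	module_init(examplefs_init);
	module_exit(examplefs_exit);
	MODULE_LICENSE("GPL");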
1482 extern int may_umount_tree(struct vfsmount *); 1480 extern int may_umount_tree(struct vfsmount *);
1483 extern int may_umount(struct vfsmount *); 1481 extern int may_umount(struct vfsmount *);
1484 extern void umount_tree(struct vfsmount *, int, struct list_head *); 1482 extern void umount_tree(struct vfsmount *, int, struct list_head *);
1485 extern void release_mounts(struct list_head *); 1483 extern void release_mounts(struct list_head *);
1486 extern long do_mount(char *, char *, char *, unsigned long, void *); 1484 extern long do_mount(char *, char *, char *, unsigned long, void *);
1487 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 1485 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
1488 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 1486 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
1489 struct vfsmount *); 1487 struct vfsmount *);
1490 extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *); 1488 extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *);
1491 extern void drop_collected_mounts(struct vfsmount *); 1489 extern void drop_collected_mounts(struct vfsmount *);
1492 1490
1493 extern int vfs_statfs(struct dentry *, struct kstatfs *); 1491 extern int vfs_statfs(struct dentry *, struct kstatfs *);
1494 1492
1495 /* /sys/fs */ 1493 /* /sys/fs */
1496 extern struct kobject *fs_kobj; 1494 extern struct kobject *fs_kobj;
1497 1495
1498 #define FLOCK_VERIFY_READ 1 1496 #define FLOCK_VERIFY_READ 1
1499 #define FLOCK_VERIFY_WRITE 2 1497 #define FLOCK_VERIFY_WRITE 2
1500 1498
1501 extern int locks_mandatory_locked(struct inode *); 1499 extern int locks_mandatory_locked(struct inode *);
1502 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); 1500 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
1503 1501
1504 /* 1502 /*
1505 * Candidates for mandatory locking have the setgid bit set 1503 * Candidates for mandatory locking have the setgid bit set
1506 * but no group execute bit - an otherwise meaningless combination. 1504 * but no group execute bit - an otherwise meaningless combination.
1507 */ 1505 */
1508 1506
1509 static inline int __mandatory_lock(struct inode *ino) 1507 static inline int __mandatory_lock(struct inode *ino)
1510 { 1508 {
1511 return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; 1509 return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
1512 } 1510 }
1513 1511
1514 /* 1512 /*
1515 * ... and these candidates should be on an MS_MANDLOCK-mounted fs, 1513 * ... and these candidates should be on an MS_MANDLOCK-mounted fs,
1516 * otherwise they will be advisory locks 1514 * otherwise they will be advisory locks
1517 */ 1515 */
1518 1516
1519 static inline int mandatory_lock(struct inode *ino) 1517 static inline int mandatory_lock(struct inode *ino)
1520 { 1518 {
1521 return IS_MANDLOCK(ino) && __mandatory_lock(ino); 1519 return IS_MANDLOCK(ino) && __mandatory_lock(ino);
1522 } 1520 }
1523 1521
1524 static inline int locks_verify_locked(struct inode *inode) 1522 static inline int locks_verify_locked(struct inode *inode)
1525 { 1523 {
1526 if (mandatory_lock(inode)) 1524 if (mandatory_lock(inode))
1527 return locks_mandatory_locked(inode); 1525 return locks_mandatory_locked(inode);
1528 return 0; 1526 return 0;
1529 } 1527 }
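
A short worked example of the rule above (a hypothetical helper mirroring what rw_verify_area() does in the VFS): mode 02644 (setgid set, group-execute clear) on an MS_MANDLOCK mount makes mandatory_lock() true, whereas 02755 is just an ordinary setgid file.

	#include <linux/fs.h>

	/* Sketch: refuse a read that would cross someone else's mandatory lock. */
	static int examplefs_check_read(struct file *filp, loff_t pos, size_t count)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;

		if (mandatory_lock(inode))
			return locks_mandatory_area(FLOCK_VERIFY_READ, inode,
						    filp, pos, count);
		return 0;
	}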
1530 1528
1531 extern int rw_verify_area(int, struct file *, loff_t *, size_t); 1529 extern int rw_verify_area(int, struct file *, loff_t *, size_t);
1532 1530
1533 static inline int locks_verify_truncate(struct inode *inode, 1531 static inline int locks_verify_truncate(struct inode *inode,
1534 struct file *filp, 1532 struct file *filp,
1535 loff_t size) 1533 loff_t size)
1536 { 1534 {
1537 if (inode->i_flock && mandatory_lock(inode)) 1535 if (inode->i_flock && mandatory_lock(inode))
1538 return locks_mandatory_area( 1536 return locks_mandatory_area(
1539 FLOCK_VERIFY_WRITE, inode, filp, 1537 FLOCK_VERIFY_WRITE, inode, filp,
1540 size < inode->i_size ? size : inode->i_size, 1538 size < inode->i_size ? size : inode->i_size,
1541 (size < inode->i_size ? inode->i_size - size 1539 (size < inode->i_size ? inode->i_size - size
1542 : size - inode->i_size) 1540 : size - inode->i_size)
1543 ); 1541 );
1544 return 0; 1542 return 0;
1545 } 1543 }
1546 1544
1547 static inline int break_lease(struct inode *inode, unsigned int mode) 1545 static inline int break_lease(struct inode *inode, unsigned int mode)
1548 { 1546 {
1549 if (inode->i_flock) 1547 if (inode->i_flock)
1550 return __break_lease(inode, mode); 1548 return __break_lease(inode, mode);
1551 return 0; 1549 return 0;
1552 } 1550 }
1553 1551
1554 /* fs/open.c */ 1552 /* fs/open.c */
1555 1553
1556 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, 1554 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
1557 struct file *filp); 1555 struct file *filp);
1558 extern long do_sys_open(int dfd, const char __user *filename, int flags, 1556 extern long do_sys_open(int dfd, const char __user *filename, int flags,
1559 int mode); 1557 int mode);
1560 extern struct file *filp_open(const char *, int, int); 1558 extern struct file *filp_open(const char *, int, int);
1561 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); 1559 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
1562 extern int filp_close(struct file *, fl_owner_t id); 1560 extern int filp_close(struct file *, fl_owner_t id);
1563 extern char * getname(const char __user *); 1561 extern char * getname(const char __user *);
1564 1562
1565 /* fs/dcache.c */ 1563 /* fs/dcache.c */
1566 extern void __init vfs_caches_init_early(void); 1564 extern void __init vfs_caches_init_early(void);
1567 extern void __init vfs_caches_init(unsigned long); 1565 extern void __init vfs_caches_init(unsigned long);
1568 1566
1569 extern struct kmem_cache *names_cachep; 1567 extern struct kmem_cache *names_cachep;
1570 1568
1571 #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) 1569 #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL)
1572 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) 1570 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
1573 #ifndef CONFIG_AUDITSYSCALL 1571 #ifndef CONFIG_AUDITSYSCALL
1574 #define putname(name) __putname(name) 1572 #define putname(name) __putname(name)
1575 #else 1573 #else
1576 extern void putname(const char *name); 1574 extern void putname(const char *name);
1577 #endif 1575 #endif
1578 1576
1579 #ifdef CONFIG_BLOCK 1577 #ifdef CONFIG_BLOCK
1580 extern int register_blkdev(unsigned int, const char *); 1578 extern int register_blkdev(unsigned int, const char *);
1581 extern void unregister_blkdev(unsigned int, const char *); 1579 extern void unregister_blkdev(unsigned int, const char *);
1582 extern struct block_device *bdget(dev_t); 1580 extern struct block_device *bdget(dev_t);
1583 extern void bd_set_size(struct block_device *, loff_t size); 1581 extern void bd_set_size(struct block_device *, loff_t size);
1584 extern void bd_forget(struct inode *inode); 1582 extern void bd_forget(struct inode *inode);
1585 extern void bdput(struct block_device *); 1583 extern void bdput(struct block_device *);
1586 extern struct block_device *open_by_devnum(dev_t, unsigned); 1584 extern struct block_device *open_by_devnum(dev_t, unsigned);
1587 extern const struct address_space_operations def_blk_aops; 1585 extern const struct address_space_operations def_blk_aops;
1588 #else 1586 #else
1589 static inline void bd_forget(struct inode *inode) {} 1587 static inline void bd_forget(struct inode *inode) {}
1590 #endif 1588 #endif
1591 extern const struct file_operations def_blk_fops; 1589 extern const struct file_operations def_blk_fops;
1592 extern const struct file_operations def_chr_fops; 1590 extern const struct file_operations def_chr_fops;
1593 extern const struct file_operations bad_sock_fops; 1591 extern const struct file_operations bad_sock_fops;
1594 extern const struct file_operations def_fifo_fops; 1592 extern const struct file_operations def_fifo_fops;
1595 #ifdef CONFIG_BLOCK 1593 #ifdef CONFIG_BLOCK
1596 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); 1594 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
1597 extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long); 1595 extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long);
1598 extern int blkdev_driver_ioctl(struct inode *inode, struct file *file, 1596 extern int blkdev_driver_ioctl(struct inode *inode, struct file *file,
1599 struct gendisk *disk, unsigned cmd, 1597 struct gendisk *disk, unsigned cmd,
1600 unsigned long arg); 1598 unsigned long arg);
1601 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); 1599 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
1602 extern int blkdev_get(struct block_device *, mode_t, unsigned); 1600 extern int blkdev_get(struct block_device *, mode_t, unsigned);
1603 extern int blkdev_put(struct block_device *); 1601 extern int blkdev_put(struct block_device *);
1604 extern int bd_claim(struct block_device *, void *); 1602 extern int bd_claim(struct block_device *, void *);
1605 extern void bd_release(struct block_device *); 1603 extern void bd_release(struct block_device *);
1606 #ifdef CONFIG_SYSFS 1604 #ifdef CONFIG_SYSFS
1607 extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); 1605 extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *);
1608 extern void bd_release_from_disk(struct block_device *, struct gendisk *); 1606 extern void bd_release_from_disk(struct block_device *, struct gendisk *);
1609 #else 1607 #else
1610 #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) 1608 #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder)
1611 #define bd_release_from_disk(bdev, disk) bd_release(bdev) 1609 #define bd_release_from_disk(bdev, disk) bd_release(bdev)
1612 #endif 1610 #endif
1613 #endif 1611 #endif
1614 1612
1615 /* fs/char_dev.c */ 1613 /* fs/char_dev.c */
1616 #define CHRDEV_MAJOR_HASH_SIZE 255 1614 #define CHRDEV_MAJOR_HASH_SIZE 255
1617 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); 1615 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
1618 extern int register_chrdev_region(dev_t, unsigned, const char *); 1616 extern int register_chrdev_region(dev_t, unsigned, const char *);
1619 extern int register_chrdev(unsigned int, const char *, 1617 extern int register_chrdev(unsigned int, const char *,
1620 const struct file_operations *); 1618 const struct file_operations *);
1621 extern void unregister_chrdev(unsigned int, const char *); 1619 extern void unregister_chrdev(unsigned int, const char *);
1622 extern void unregister_chrdev_region(dev_t, unsigned); 1620 extern void unregister_chrdev_region(dev_t, unsigned);
1623 extern int chrdev_open(struct inode *, struct file *); 1621 extern int chrdev_open(struct inode *, struct file *);
1624 extern void chrdev_show(struct seq_file *,off_t); 1622 extern void chrdev_show(struct seq_file *,off_t);
1625 1623
1626 /* fs/block_dev.c */ 1624 /* fs/block_dev.c */
1627 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ 1625 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */
1628 1626
1629 #ifdef CONFIG_BLOCK 1627 #ifdef CONFIG_BLOCK
1630 #define BLKDEV_MAJOR_HASH_SIZE 255 1628 #define BLKDEV_MAJOR_HASH_SIZE 255
1631 extern const char *__bdevname(dev_t, char *buffer); 1629 extern const char *__bdevname(dev_t, char *buffer);
1632 extern const char *bdevname(struct block_device *bdev, char *buffer); 1630 extern const char *bdevname(struct block_device *bdev, char *buffer);
1633 extern struct block_device *lookup_bdev(const char *); 1631 extern struct block_device *lookup_bdev(const char *);
1634 extern struct block_device *open_bdev_excl(const char *, int, void *); 1632 extern struct block_device *open_bdev_excl(const char *, int, void *);
1635 extern void close_bdev_excl(struct block_device *); 1633 extern void close_bdev_excl(struct block_device *);
1636 extern void blkdev_show(struct seq_file *,off_t); 1634 extern void blkdev_show(struct seq_file *,off_t);
1637 #else 1635 #else
1638 #define BLKDEV_MAJOR_HASH_SIZE 0 1636 #define BLKDEV_MAJOR_HASH_SIZE 0
1639 #endif 1637 #endif
1640 1638
1641 extern void init_special_inode(struct inode *, umode_t, dev_t); 1639 extern void init_special_inode(struct inode *, umode_t, dev_t);
1642 1640
1643 /* Invalid inode operations -- fs/bad_inode.c */ 1641 /* Invalid inode operations -- fs/bad_inode.c */
1644 extern void make_bad_inode(struct inode *); 1642 extern void make_bad_inode(struct inode *);
1645 extern int is_bad_inode(struct inode *); 1643 extern int is_bad_inode(struct inode *);
1646 1644
1647 extern const struct file_operations read_fifo_fops; 1645 extern const struct file_operations read_fifo_fops;
1648 extern const struct file_operations write_fifo_fops; 1646 extern const struct file_operations write_fifo_fops;
1649 extern const struct file_operations rdwr_fifo_fops; 1647 extern const struct file_operations rdwr_fifo_fops;
1650 1648
1651 extern int fs_may_remount_ro(struct super_block *); 1649 extern int fs_may_remount_ro(struct super_block *);
1652 1650
1653 #ifdef CONFIG_BLOCK 1651 #ifdef CONFIG_BLOCK
1654 /* 1652 /*
1655 * return READ, READA, or WRITE 1653 * return READ, READA, or WRITE
1656 */ 1654 */
1657 #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) 1655 #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK))
1658 1656
1659 /* 1657 /*
1660 * return data direction, READ or WRITE 1658 * return data direction, READ or WRITE
1661 */ 1659 */
1662 #define bio_data_dir(bio) ((bio)->bi_rw & 1) 1660 #define bio_data_dir(bio) ((bio)->bi_rw & 1)
1663 1661
1664 extern int check_disk_change(struct block_device *); 1662 extern int check_disk_change(struct block_device *);
1665 extern int __invalidate_device(struct block_device *); 1663 extern int __invalidate_device(struct block_device *);
1666 extern int invalidate_partition(struct gendisk *, int); 1664 extern int invalidate_partition(struct gendisk *, int);
1667 #endif 1665 #endif
1668 extern int invalidate_inodes(struct super_block *); 1666 extern int invalidate_inodes(struct super_block *);
1669 unsigned long __invalidate_mapping_pages(struct address_space *mapping, 1667 unsigned long __invalidate_mapping_pages(struct address_space *mapping,
1670 pgoff_t start, pgoff_t end, 1668 pgoff_t start, pgoff_t end,
1671 bool be_atomic); 1669 bool be_atomic);
1672 unsigned long invalidate_mapping_pages(struct address_space *mapping, 1670 unsigned long invalidate_mapping_pages(struct address_space *mapping,
1673 pgoff_t start, pgoff_t end); 1671 pgoff_t start, pgoff_t end);
1674 1672
1675 static inline unsigned long __deprecated 1673 static inline unsigned long __deprecated
1676 invalidate_inode_pages(struct address_space *mapping) 1674 invalidate_inode_pages(struct address_space *mapping)
1677 { 1675 {
1678 return invalidate_mapping_pages(mapping, 0, ~0UL); 1676 return invalidate_mapping_pages(mapping, 0, ~0UL);
1679 } 1677 }
1680 1678
1681 static inline void invalidate_remote_inode(struct inode *inode) 1679 static inline void invalidate_remote_inode(struct inode *inode)
1682 { 1680 {
1683 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1681 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1684 S_ISLNK(inode->i_mode)) 1682 S_ISLNK(inode->i_mode))
1685 invalidate_mapping_pages(inode->i_mapping, 0, -1); 1683 invalidate_mapping_pages(inode->i_mapping, 0, -1);
1686 } 1684 }
1687 extern int invalidate_inode_pages2(struct address_space *mapping); 1685 extern int invalidate_inode_pages2(struct address_space *mapping);
1688 extern int invalidate_inode_pages2_range(struct address_space *mapping, 1686 extern int invalidate_inode_pages2_range(struct address_space *mapping,
1689 pgoff_t start, pgoff_t end); 1687 pgoff_t start, pgoff_t end);
1690 extern int write_inode_now(struct inode *, int); 1688 extern int write_inode_now(struct inode *, int);
1691 extern int filemap_fdatawrite(struct address_space *); 1689 extern int filemap_fdatawrite(struct address_space *);
1692 extern int filemap_flush(struct address_space *); 1690 extern int filemap_flush(struct address_space *);
1693 extern int filemap_fdatawait(struct address_space *); 1691 extern int filemap_fdatawait(struct address_space *);
1694 extern int filemap_write_and_wait(struct address_space *mapping); 1692 extern int filemap_write_and_wait(struct address_space *mapping);
1695 extern int filemap_write_and_wait_range(struct address_space *mapping, 1693 extern int filemap_write_and_wait_range(struct address_space *mapping,
1696 loff_t lstart, loff_t lend); 1694 loff_t lstart, loff_t lend);
1697 extern int wait_on_page_writeback_range(struct address_space *mapping, 1695 extern int wait_on_page_writeback_range(struct address_space *mapping,
1698 pgoff_t start, pgoff_t end); 1696 pgoff_t start, pgoff_t end);
1699 extern int __filemap_fdatawrite_range(struct address_space *mapping, 1697 extern int __filemap_fdatawrite_range(struct address_space *mapping,
1700 loff_t start, loff_t end, int sync_mode); 1698 loff_t start, loff_t end, int sync_mode);
1701 1699
1702 extern long do_fsync(struct file *file, int datasync); 1700 extern long do_fsync(struct file *file, int datasync);
1703 extern void sync_supers(void); 1701 extern void sync_supers(void);
1704 extern void sync_filesystems(int wait); 1702 extern void sync_filesystems(int wait);
1705 extern void __fsync_super(struct super_block *sb); 1703 extern void __fsync_super(struct super_block *sb);
1706 extern void emergency_sync(void); 1704 extern void emergency_sync(void);
1707 extern void emergency_remount(void); 1705 extern void emergency_remount(void);
1708 extern int do_remount_sb(struct super_block *sb, int flags, 1706 extern int do_remount_sb(struct super_block *sb, int flags,
1709 void *data, int force); 1707 void *data, int force);
1710 #ifdef CONFIG_BLOCK 1708 #ifdef CONFIG_BLOCK
1711 extern sector_t bmap(struct inode *, sector_t); 1709 extern sector_t bmap(struct inode *, sector_t);
1712 #endif 1710 #endif
1713 extern int notify_change(struct dentry *, struct iattr *); 1711 extern int notify_change(struct dentry *, struct iattr *);
1714 extern int permission(struct inode *, int, struct nameidata *); 1712 extern int permission(struct inode *, int, struct nameidata *);
1715 extern int generic_permission(struct inode *, int, 1713 extern int generic_permission(struct inode *, int,
1716 int (*check_acl)(struct inode *, int)); 1714 int (*check_acl)(struct inode *, int));
1717 1715
1718 extern int get_write_access(struct inode *); 1716 extern int get_write_access(struct inode *);
1719 extern int deny_write_access(struct file *); 1717 extern int deny_write_access(struct file *);
1720 static inline void put_write_access(struct inode * inode) 1718 static inline void put_write_access(struct inode * inode)
1721 { 1719 {
1722 atomic_dec(&inode->i_writecount); 1720 atomic_dec(&inode->i_writecount);
1723 } 1721 }
1724 static inline void allow_write_access(struct file *file) 1722 static inline void allow_write_access(struct file *file)
1725 { 1723 {
1726 if (file) 1724 if (file)
1727 atomic_inc(&file->f_path.dentry->d_inode->i_writecount); 1725 atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
1728 } 1726 }
1729 extern int do_pipe(int *); 1727 extern int do_pipe(int *);
1730 extern struct file *create_read_pipe(struct file *f); 1728 extern struct file *create_read_pipe(struct file *f);
1731 extern struct file *create_write_pipe(void); 1729 extern struct file *create_write_pipe(void);
1732 extern void free_write_pipe(struct file *); 1730 extern void free_write_pipe(struct file *);
1733 1731
1734 extern int open_namei(int dfd, const char *, int, int, struct nameidata *); 1732 extern int open_namei(int dfd, const char *, int, int, struct nameidata *);
1735 extern int may_open(struct nameidata *, int, int); 1733 extern int may_open(struct nameidata *, int, int);
1736 1734
1737 extern int kernel_read(struct file *, unsigned long, char *, unsigned long); 1735 extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
1738 extern struct file * open_exec(const char *); 1736 extern struct file * open_exec(const char *);
1739 1737
1740 /* fs/dcache.c -- generic fs support functions */ 1738 /* fs/dcache.c -- generic fs support functions */
1741 extern int is_subdir(struct dentry *, struct dentry *); 1739 extern int is_subdir(struct dentry *, struct dentry *);
1742 extern ino_t find_inode_number(struct dentry *, struct qstr *); 1740 extern ino_t find_inode_number(struct dentry *, struct qstr *);
1743 1741
1744 #include <linux/err.h> 1742 #include <linux/err.h>
1745 1743
1746 /* needed for stackable file system support */ 1744 /* needed for stackable file system support */
1747 extern loff_t default_llseek(struct file *file, loff_t offset, int origin); 1745 extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
1748 1746
1749 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); 1747 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
1750 1748
1751 extern void inode_init_once(struct inode *); 1749 extern void inode_init_once(struct inode *);
1752 extern void iput(struct inode *); 1750 extern void iput(struct inode *);
1753 extern struct inode * igrab(struct inode *); 1751 extern struct inode * igrab(struct inode *);
1754 extern ino_t iunique(struct super_block *, ino_t); 1752 extern ino_t iunique(struct super_block *, ino_t);
1755 extern int inode_needs_sync(struct inode *inode); 1753 extern int inode_needs_sync(struct inode *inode);
1756 extern void generic_delete_inode(struct inode *inode); 1754 extern void generic_delete_inode(struct inode *inode);
1757 extern void generic_drop_inode(struct inode *inode); 1755 extern void generic_drop_inode(struct inode *inode);
1758 1756
1759 extern struct inode *ilookup5_nowait(struct super_block *sb, 1757 extern struct inode *ilookup5_nowait(struct super_block *sb,
1760 unsigned long hashval, int (*test)(struct inode *, void *), 1758 unsigned long hashval, int (*test)(struct inode *, void *),
1761 void *data); 1759 void *data);
1762 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1760 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1763 int (*test)(struct inode *, void *), void *data); 1761 int (*test)(struct inode *, void *), void *data);
1764 extern struct inode *ilookup(struct super_block *sb, unsigned long ino); 1762 extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
1765 1763
1766 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); 1764 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
1767 extern struct inode * iget_locked(struct super_block *, unsigned long); 1765 extern struct inode * iget_locked(struct super_block *, unsigned long);
1768 extern void unlock_new_inode(struct inode *); 1766 extern void unlock_new_inode(struct inode *);
1769
1770 static inline struct inode *iget(struct super_block *sb, unsigned long ino)
1771 {
1772 struct inode *inode = iget_locked(sb, ino);
1773
1774 if (inode && (inode->i_state & I_NEW)) {
1775 sb->s_op->read_inode(inode);
1776 unlock_new_inode(inode);
1777 }
1778
1779 return inode;
1780 }
1781 1767
1782 extern void __iget(struct inode * inode); 1768 extern void __iget(struct inode * inode);
1783 extern void iget_failed(struct inode *); 1769 extern void iget_failed(struct inode *);
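
The inline helper removed above always called ->read_inode() and could not report why a lookup failed. A filesystem-local replacement is built from iget_locked(), unlock_new_inode() and iget_failed(); the sketch below assumes a hypothetical examplefs_fill_inode() that reads the on-disk inode and returns 0 or a negative error:

	#include <linux/fs.h>
	#include <linux/err.h>

	/* Hypothetical; reads the on-disk inode, defined elsewhere. */
	extern int examplefs_fill_inode(struct inode *inode);

	static struct inode *examplefs_iget(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode;
		int err;

		inode = iget_locked(sb, ino);
		if (!inode)
			return ERR_PTR(-ENOMEM);
		if (!(inode->i_state & I_NEW))
			return inode;		/* already in the inode cache */

		err = examplefs_fill_inode(inode);
		if (err) {
			iget_failed(inode);	/* mark bad, unlock and release */
			return ERR_PTR(err);
		}
		unlock_new_inode(inode);
		return inode;
	}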
1784 extern void clear_inode(struct inode *); 1770 extern void clear_inode(struct inode *);
1785 extern void destroy_inode(struct inode *); 1771 extern void destroy_inode(struct inode *);
1786 extern struct inode *new_inode(struct super_block *); 1772 extern struct inode *new_inode(struct super_block *);
1787 extern int __remove_suid(struct dentry *, int); 1773 extern int __remove_suid(struct dentry *, int);
1788 extern int should_remove_suid(struct dentry *); 1774 extern int should_remove_suid(struct dentry *);
1789 extern int remove_suid(struct dentry *); 1775 extern int remove_suid(struct dentry *);
1790 1776
1791 extern void __insert_inode_hash(struct inode *, unsigned long hashval); 1777 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
1792 extern void remove_inode_hash(struct inode *); 1778 extern void remove_inode_hash(struct inode *);
1793 static inline void insert_inode_hash(struct inode *inode) { 1779 static inline void insert_inode_hash(struct inode *inode) {
1794 __insert_inode_hash(inode, inode->i_ino); 1780 __insert_inode_hash(inode, inode->i_ino);
1795 } 1781 }
1796 1782
1797 extern struct file * get_empty_filp(void); 1783 extern struct file * get_empty_filp(void);
1798 extern void file_move(struct file *f, struct list_head *list); 1784 extern void file_move(struct file *f, struct list_head *list);
1799 extern void file_kill(struct file *f); 1785 extern void file_kill(struct file *f);
1800 #ifdef CONFIG_BLOCK 1786 #ifdef CONFIG_BLOCK
1801 struct bio; 1787 struct bio;
1802 extern void submit_bio(int, struct bio *); 1788 extern void submit_bio(int, struct bio *);
1803 extern int bdev_read_only(struct block_device *); 1789 extern int bdev_read_only(struct block_device *);
1804 #endif 1790 #endif
1805 extern int set_blocksize(struct block_device *, int); 1791 extern int set_blocksize(struct block_device *, int);
1806 extern int sb_set_blocksize(struct super_block *, int); 1792 extern int sb_set_blocksize(struct super_block *, int);
1807 extern int sb_min_blocksize(struct super_block *, int); 1793 extern int sb_min_blocksize(struct super_block *, int);
1808 extern int sb_has_dirty_inodes(struct super_block *); 1794 extern int sb_has_dirty_inodes(struct super_block *);
1809 1795
1810 extern int generic_file_mmap(struct file *, struct vm_area_struct *); 1796 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
1811 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); 1797 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
1812 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); 1798 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
1813 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); 1799 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
1814 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); 1800 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
1815 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); 1801 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
1816 extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, 1802 extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *,
1817 unsigned long, loff_t); 1803 unsigned long, loff_t);
1818 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, 1804 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
1819 unsigned long *, loff_t, loff_t *, size_t, size_t); 1805 unsigned long *, loff_t, loff_t *, size_t, size_t);
1820 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, 1806 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
1821 unsigned long, loff_t, loff_t *, size_t, ssize_t); 1807 unsigned long, loff_t, loff_t *, size_t, ssize_t);
1822 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); 1808 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
1823 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); 1809 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
1824 extern void do_generic_mapping_read(struct address_space *mapping, 1810 extern void do_generic_mapping_read(struct address_space *mapping,
1825 struct file_ra_state *, struct file *, 1811 struct file_ra_state *, struct file *,
1826 loff_t *, read_descriptor_t *, read_actor_t); 1812 loff_t *, read_descriptor_t *, read_actor_t);
1827 extern int generic_segment_checks(const struct iovec *iov, 1813 extern int generic_segment_checks(const struct iovec *iov,
1828 unsigned long *nr_segs, size_t *count, int access_flags); 1814 unsigned long *nr_segs, size_t *count, int access_flags);
1829 1815
1830 /* fs/splice.c */ 1816 /* fs/splice.c */
1831 extern ssize_t generic_file_splice_read(struct file *, loff_t *, 1817 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
1832 struct pipe_inode_info *, size_t, unsigned int); 1818 struct pipe_inode_info *, size_t, unsigned int);
1833 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, 1819 extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
1834 struct file *, loff_t *, size_t, unsigned int); 1820 struct file *, loff_t *, size_t, unsigned int);
1835 extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *, 1821 extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *,
1836 struct file *, loff_t *, size_t, unsigned int); 1822 struct file *, loff_t *, size_t, unsigned int);
1837 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 1823 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
1838 struct file *out, loff_t *, size_t len, unsigned int flags); 1824 struct file *out, loff_t *, size_t len, unsigned int flags);
1839 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1825 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1840 size_t len, unsigned int flags); 1826 size_t len, unsigned int flags);
1841 1827
1842 extern void 1828 extern void
1843 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); 1829 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
1844 extern loff_t no_llseek(struct file *file, loff_t offset, int origin); 1830 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
1845 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); 1831 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
1846 extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); 1832 extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
1847 extern int generic_file_open(struct inode * inode, struct file * filp); 1833 extern int generic_file_open(struct inode * inode, struct file * filp);
1848 extern int nonseekable_open(struct inode * inode, struct file * filp); 1834 extern int nonseekable_open(struct inode * inode, struct file * filp);
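
As a usage note, these generic routines are designed to be plugged directly into a regular file's file_operations; a typical table (examplefs is hypothetical) built only from helpers declared above would be:

	#include <linux/fs.h>

	static const struct file_operations examplefs_file_operations = {
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,
		.write		= do_sync_write,
		.aio_read	= generic_file_aio_read,
		.aio_write	= generic_file_aio_write,
		.mmap		= generic_file_mmap,
		.open		= generic_file_open,
		.splice_read	= generic_file_splice_read,
		.splice_write	= generic_file_splice_write,
	};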
1849 1835
1850 #ifdef CONFIG_FS_XIP 1836 #ifdef CONFIG_FS_XIP
1851 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, 1837 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
1852 loff_t *ppos); 1838 loff_t *ppos);
1853 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); 1839 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
1854 extern ssize_t xip_file_write(struct file *filp, const char __user *buf, 1840 extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
1855 size_t len, loff_t *ppos); 1841 size_t len, loff_t *ppos);
1856 extern int xip_truncate_page(struct address_space *mapping, loff_t from); 1842 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
1857 #else 1843 #else
1858 static inline int xip_truncate_page(struct address_space *mapping, loff_t from) 1844 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
1859 { 1845 {
1860 return 0; 1846 return 0;
1861 } 1847 }
1862 #endif 1848 #endif
1863 1849
1864 static inline void do_generic_file_read(struct file * filp, loff_t *ppos, 1850 static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
1865 read_descriptor_t * desc, 1851 read_descriptor_t * desc,
1866 read_actor_t actor) 1852 read_actor_t actor)
1867 { 1853 {
1868 do_generic_mapping_read(filp->f_mapping, 1854 do_generic_mapping_read(filp->f_mapping,
1869 &filp->f_ra, 1855 &filp->f_ra,
1870 filp, 1856 filp,
1871 ppos, 1857 ppos,
1872 desc, 1858 desc,
1873 actor); 1859 actor);
1874 } 1860 }
1875 1861
1876 #ifdef CONFIG_BLOCK 1862 #ifdef CONFIG_BLOCK
1877 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1863 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1878 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1864 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1879 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1865 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1880 int lock_type); 1866 int lock_type);
1881 1867
1882 enum { 1868 enum {
1883 DIO_LOCKING = 1, /* need locking between buffered and direct access */ 1869 DIO_LOCKING = 1, /* need locking between buffered and direct access */
1884 DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ 1870 DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
1885 DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ 1871 DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
1886 }; 1872 };
1887 1873
1888 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, 1874 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
1889 struct inode *inode, struct block_device *bdev, const struct iovec *iov, 1875 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
1890 loff_t offset, unsigned long nr_segs, get_block_t get_block, 1876 loff_t offset, unsigned long nr_segs, get_block_t get_block,
1891 dio_iodone_t end_io) 1877 dio_iodone_t end_io)
1892 { 1878 {
1893 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 1879 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1894 nr_segs, get_block, end_io, DIO_LOCKING); 1880 nr_segs, get_block, end_io, DIO_LOCKING);
1895 } 1881 }
1896 1882
1897 static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, 1883 static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
1898 struct inode *inode, struct block_device *bdev, const struct iovec *iov, 1884 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
1899 loff_t offset, unsigned long nr_segs, get_block_t get_block, 1885 loff_t offset, unsigned long nr_segs, get_block_t get_block,
1900 dio_iodone_t end_io) 1886 dio_iodone_t end_io)
1901 { 1887 {
1902 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 1888 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1903 nr_segs, get_block, end_io, DIO_NO_LOCKING); 1889 nr_segs, get_block, end_io, DIO_NO_LOCKING);
1904 } 1890 }
1905 1891
1906 static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, 1892 static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
1907 struct inode *inode, struct block_device *bdev, const struct iovec *iov, 1893 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
1908 loff_t offset, unsigned long nr_segs, get_block_t get_block, 1894 loff_t offset, unsigned long nr_segs, get_block_t get_block,
1909 dio_iodone_t end_io) 1895 dio_iodone_t end_io)
1910 { 1896 {
1911 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 1897 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1912 nr_segs, get_block, end_io, DIO_OWN_LOCKING); 1898 nr_segs, get_block, end_io, DIO_OWN_LOCKING);
1913 } 1899 }
1914 #endif 1900 #endif
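
For illustration, a block-based filesystem's ->direct_IO address_space operation (under CONFIG_BLOCK) normally just forwards to blockdev_direct_IO() with its own block-mapping routine; examplefs_get_block() below is hypothetical:

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	/* Hypothetical get_block_t, defined elsewhere. */
	extern int examplefs_get_block(struct inode *inode, sector_t iblock,
				       struct buffer_head *bh_result, int create);

	static ssize_t examplefs_direct_IO(int rw, struct kiocb *iocb,
					   const struct iovec *iov, loff_t offset,
					   unsigned long nr_segs)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;

		/* DIO_LOCKING: the VFS serialises against buffered I/O */
		return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
					  iov, offset, nr_segs,
					  examplefs_get_block, NULL);
	}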
1915 1901
1916 extern const struct file_operations generic_ro_fops; 1902 extern const struct file_operations generic_ro_fops;
1917 1903
1918 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) 1904 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
1919 1905
1920 extern int vfs_readlink(struct dentry *, char __user *, int, const char *); 1906 extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
1921 extern int vfs_follow_link(struct nameidata *, const char *); 1907 extern int vfs_follow_link(struct nameidata *, const char *);
1922 extern int page_readlink(struct dentry *, char __user *, int); 1908 extern int page_readlink(struct dentry *, char __user *, int);
1923 extern void *page_follow_link_light(struct dentry *, struct nameidata *); 1909 extern void *page_follow_link_light(struct dentry *, struct nameidata *);
1924 extern void page_put_link(struct dentry *, struct nameidata *, void *); 1910 extern void page_put_link(struct dentry *, struct nameidata *, void *);
1925 extern int __page_symlink(struct inode *inode, const char *symname, int len, 1911 extern int __page_symlink(struct inode *inode, const char *symname, int len,
1926 gfp_t gfp_mask); 1912 gfp_t gfp_mask);
1927 extern int page_symlink(struct inode *inode, const char *symname, int len); 1913 extern int page_symlink(struct inode *inode, const char *symname, int len);
1928 extern const struct inode_operations page_symlink_inode_operations; 1914 extern const struct inode_operations page_symlink_inode_operations;
1929 extern int generic_readlink(struct dentry *, char __user *, int); 1915 extern int generic_readlink(struct dentry *, char __user *, int);
1930 extern void generic_fillattr(struct inode *, struct kstat *); 1916 extern void generic_fillattr(struct inode *, struct kstat *);
1931 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1917 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
1932 void inode_add_bytes(struct inode *inode, loff_t bytes); 1918 void inode_add_bytes(struct inode *inode, loff_t bytes);
1933 void inode_sub_bytes(struct inode *inode, loff_t bytes); 1919 void inode_sub_bytes(struct inode *inode, loff_t bytes);
1934 loff_t inode_get_bytes(struct inode *inode); 1920 loff_t inode_get_bytes(struct inode *inode);
1935 void inode_set_bytes(struct inode *inode, loff_t bytes); 1921 void inode_set_bytes(struct inode *inode, loff_t bytes);
1936 1922
1937 extern int vfs_readdir(struct file *, filldir_t, void *); 1923 extern int vfs_readdir(struct file *, filldir_t, void *);
1938 1924
1939 extern int vfs_stat(char __user *, struct kstat *); 1925 extern int vfs_stat(char __user *, struct kstat *);
1940 extern int vfs_lstat(char __user *, struct kstat *); 1926 extern int vfs_lstat(char __user *, struct kstat *);
1941 extern int vfs_stat_fd(int dfd, char __user *, struct kstat *); 1927 extern int vfs_stat_fd(int dfd, char __user *, struct kstat *);
1942 extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *); 1928 extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *);
1943 extern int vfs_fstat(unsigned int, struct kstat *); 1929 extern int vfs_fstat(unsigned int, struct kstat *);
1944 1930
1945 extern long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 1931 extern long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
1946 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, 1932 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
1947 unsigned long arg); 1933 unsigned long arg);
1948 1934
1949 extern void get_filesystem(struct file_system_type *fs); 1935 extern void get_filesystem(struct file_system_type *fs);
1950 extern void put_filesystem(struct file_system_type *fs); 1936 extern void put_filesystem(struct file_system_type *fs);
1951 extern struct file_system_type *get_fs_type(const char *name); 1937 extern struct file_system_type *get_fs_type(const char *name);
1952 extern struct super_block *get_super(struct block_device *); 1938 extern struct super_block *get_super(struct block_device *);
1953 extern struct super_block *user_get_super(dev_t); 1939 extern struct super_block *user_get_super(dev_t);
1954 extern void drop_super(struct super_block *sb); 1940 extern void drop_super(struct super_block *sb);
1955 1941
1956 extern int dcache_dir_open(struct inode *, struct file *); 1942 extern int dcache_dir_open(struct inode *, struct file *);
1957 extern int dcache_dir_close(struct inode *, struct file *); 1943 extern int dcache_dir_close(struct inode *, struct file *);
1958 extern loff_t dcache_dir_lseek(struct file *, loff_t, int); 1944 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
1959 extern int dcache_readdir(struct file *, void *, filldir_t); 1945 extern int dcache_readdir(struct file *, void *, filldir_t);
1960 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1946 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
1961 extern int simple_statfs(struct dentry *, struct kstatfs *); 1947 extern int simple_statfs(struct dentry *, struct kstatfs *);
1962 extern int simple_link(struct dentry *, struct inode *, struct dentry *); 1948 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
1963 extern int simple_unlink(struct inode *, struct dentry *); 1949 extern int simple_unlink(struct inode *, struct dentry *);
1964 extern int simple_rmdir(struct inode *, struct dentry *); 1950 extern int simple_rmdir(struct inode *, struct dentry *);
1965 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); 1951 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
1966 extern int simple_sync_file(struct file *, struct dentry *, int); 1952 extern int simple_sync_file(struct file *, struct dentry *, int);
1967 extern int simple_empty(struct dentry *); 1953 extern int simple_empty(struct dentry *);
1968 extern int simple_readpage(struct file *file, struct page *page); 1954 extern int simple_readpage(struct file *file, struct page *page);
1969 extern int simple_prepare_write(struct file *file, struct page *page, 1955 extern int simple_prepare_write(struct file *file, struct page *page,
1970 unsigned offset, unsigned to); 1956 unsigned offset, unsigned to);
1971 extern int simple_write_begin(struct file *file, struct address_space *mapping, 1957 extern int simple_write_begin(struct file *file, struct address_space *mapping,
1972 loff_t pos, unsigned len, unsigned flags, 1958 loff_t pos, unsigned len, unsigned flags,
1973 struct page **pagep, void **fsdata); 1959 struct page **pagep, void **fsdata);
1974 extern int simple_write_end(struct file *file, struct address_space *mapping, 1960 extern int simple_write_end(struct file *file, struct address_space *mapping,
1975 loff_t pos, unsigned len, unsigned copied, 1961 loff_t pos, unsigned len, unsigned copied,
1976 struct page *page, void *fsdata); 1962 struct page *page, void *fsdata);
1977 1963
1978 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); 1964 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
1979 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); 1965 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
1980 extern const struct file_operations simple_dir_operations; 1966 extern const struct file_operations simple_dir_operations;
1981 extern const struct inode_operations simple_dir_inode_operations; 1967 extern const struct inode_operations simple_dir_inode_operations;
1982 struct tree_descr { char *name; const struct file_operations *ops; int mode; }; 1968 struct tree_descr { char *name; const struct file_operations *ops; int mode; };
1983 struct dentry *d_alloc_name(struct dentry *, const char *); 1969 struct dentry *d_alloc_name(struct dentry *, const char *);
1984 extern int simple_fill_super(struct super_block *, int, struct tree_descr *); 1970 extern int simple_fill_super(struct super_block *, int, struct tree_descr *);
1985 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); 1971 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
1986 extern void simple_release_fs(struct vfsmount **mount, int *count); 1972 extern void simple_release_fs(struct vfsmount **mount, int *count);
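For context, the tree_descr table and simple_fill_super() declared above are the usual way a pseudo-filesystem describes a fixed file tree.  The sketch below is illustrative only and is not part of this patch; the thingyfs_* names, the file_operations they point at and THINGYFS_MAGIC are all hypothetical:

	/*
	 * Hypothetical example: describe the files of a pseudo-filesystem
	 * with a tree_descr table and let simple_fill_super() create them.
	 * By convention the table starts at index [2], leaving the low
	 * inode numbers for the root directory, and an entry with an
	 * empty name terminates it.
	 */
	static struct tree_descr thingyfs_files[] = {
		[2] = { "status",  &thingyfs_status_fops,  S_IRUGO },
		[3] = { "control", &thingyfs_control_fops, S_IRUGO | S_IWUSR },
		{ "" }
	};

	static int thingyfs_fill_super(struct super_block *sb, void *data, int silent)
	{
		return simple_fill_super(sb, THINGYFS_MAGIC, thingyfs_files);
	}

Such a fill_super routine would then typically be handed to get_sb_single() or get_sb_nodev() from the filesystem's get_sb method.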
1987 1973
1988 extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t); 1974 extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t);
1989 1975
1990 #ifdef CONFIG_MIGRATION 1976 #ifdef CONFIG_MIGRATION
1991 extern int buffer_migrate_page(struct address_space *, 1977 extern int buffer_migrate_page(struct address_space *,
1992 struct page *, struct page *); 1978 struct page *, struct page *);
1993 #else 1979 #else
1994 #define buffer_migrate_page NULL 1980 #define buffer_migrate_page NULL
1995 #endif 1981 #endif
1996 1982
1997 extern int inode_change_ok(struct inode *, struct iattr *); 1983 extern int inode_change_ok(struct inode *, struct iattr *);
1998 extern int __must_check inode_setattr(struct inode *, struct iattr *); 1984 extern int __must_check inode_setattr(struct inode *, struct iattr *);
1999 1985
2000 extern void file_update_time(struct file *file); 1986 extern void file_update_time(struct file *file);
2001 1987
2002 static inline ino_t parent_ino(struct dentry *dentry) 1988 static inline ino_t parent_ino(struct dentry *dentry)
2003 { 1989 {
2004 ino_t res; 1990 ino_t res;
2005 1991
2006 spin_lock(&dentry->d_lock); 1992 spin_lock(&dentry->d_lock);
2007 res = dentry->d_parent->d_inode->i_ino; 1993 res = dentry->d_parent->d_inode->i_ino;
2008 spin_unlock(&dentry->d_lock); 1994 spin_unlock(&dentry->d_lock);
2009 return res; 1995 return res;
2010 } 1996 }
2011 1997
2012 /* kernel/fork.c */ 1998 /* kernel/fork.c */
2013 extern int unshare_files(void); 1999 extern int unshare_files(void);
2014 2000
2015 /* Transaction based IO helpers */ 2001 /* Transaction based IO helpers */
2016 2002
2017 /* 2003 /*
2018 * An argresp is stored in an allocated page and holds the 2004 * An argresp is stored in an allocated page and holds the
2019 * size of the argument or response, along with its content 2005 * size of the argument or response, along with its content
2020 */ 2006 */
2021 struct simple_transaction_argresp { 2007 struct simple_transaction_argresp {
2022 ssize_t size; 2008 ssize_t size;
2023 char data[0]; 2009 char data[0];
2024 }; 2010 };
2025 2011
2026 #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) 2012 #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))
2027 2013
2028 char *simple_transaction_get(struct file *file, const char __user *buf, 2014 char *simple_transaction_get(struct file *file, const char __user *buf,
2029 size_t size); 2015 size_t size);
2030 ssize_t simple_transaction_read(struct file *file, char __user *buf, 2016 ssize_t simple_transaction_read(struct file *file, char __user *buf,
2031 size_t size, loff_t *pos); 2017 size_t size, loff_t *pos);
2032 int simple_transaction_release(struct inode *inode, struct file *file); 2018 int simple_transaction_release(struct inode *inode, struct file *file);
2033 2019
2034 static inline void simple_transaction_set(struct file *file, size_t n) 2020 static inline void simple_transaction_set(struct file *file, size_t n)
2035 { 2021 {
2036 struct simple_transaction_argresp *ar = file->private_data; 2022 struct simple_transaction_argresp *ar = file->private_data;
2037 2023
2038 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); 2024 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
2039 2025
2040 /* 2026 /*
2041 * The barrier ensures that ar->size will really remain zero until 2027 * The barrier ensures that ar->size will really remain zero until
2042 * ar->data is ready for reading. 2028 * ar->data is ready for reading.
2043 */ 2029 */
2044 smp_mb(); 2030 smp_mb();
2045 ar->size = n; 2031 ar->size = n;
2046 } 2032 }
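As an aside, the transaction helpers above implement a write-then-read pattern: the writer hands its request to simple_transaction_get(), builds a response in the same page, and only then publishes the response size with simple_transaction_set() (the barrier above keeps a concurrent reader from seeing a non-zero size before the data is complete).  A minimal, hypothetical user might look like this; the thingy_* names and the do_thingy() handler are made up:

	static ssize_t thingy_file_write(struct file *file, const char __user *buf,
					 size_t size, loff_t *pos)
	{
		char *data;
		ssize_t rv;

		/* copy the request into the per-file transaction page */
		data = simple_transaction_get(file, buf, size);
		if (IS_ERR(data))
			return PTR_ERR(data);

		/*
		 * Hypothetical handler: overwrites data with a response no
		 * larger than SIMPLE_TRANSACTION_LIMIT and returns its
		 * length, or a negative errno on failure.
		 */
		rv = do_thingy(data, size);
		if (rv < 0)
			return rv;

		/* make the response visible to simple_transaction_read() */
		simple_transaction_set(file, rv);
		return size;
	}

	static const struct file_operations thingy_transaction_fops = {
		.write		= thingy_file_write,
		.read		= simple_transaction_read,
		.release	= simple_transaction_release,
	};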
2047 2033
2048 /* 2034 /*
2049 * simple attribute files 2035 * simple attribute files
2050 * 2036 *
2051 * These attributes behave similar to those in sysfs: 2037 * These attributes behave similar to those in sysfs:
2052 * 2038 *
2053 * Writing to an attribute immediately sets a value, an open file can be 2039 * Writing to an attribute immediately sets a value, an open file can be
2054 * written to multiple times. 2040 * written to multiple times.
2055 * 2041 *
2056 * Reading from an attribute creates a buffer from the value that might get 2042 * Reading from an attribute creates a buffer from the value that might get
2057 * read with multiple read calls. When the attribute has been read 2043 * read with multiple read calls. When the attribute has been read
2058 * completely, no further read calls are possible until the file is opened 2044 * completely, no further read calls are possible until the file is opened
2059 * again. 2045 * again.
2060 * 2046 *
2061 * All attributes contain a text representation of a numeric value 2047 * All attributes contain a text representation of a numeric value
2062 * that are accessed with the get() and set() functions. 2048 * that are accessed with the get() and set() functions.
2063 */ 2049 */
2064 #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ 2050 #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \
2065 static int __fops ## _open(struct inode *inode, struct file *file) \ 2051 static int __fops ## _open(struct inode *inode, struct file *file) \
2066 { \ 2052 { \
2067 __simple_attr_check_format(__fmt, 0ull); \ 2053 __simple_attr_check_format(__fmt, 0ull); \
2068 return simple_attr_open(inode, file, __get, __set, __fmt); \ 2054 return simple_attr_open(inode, file, __get, __set, __fmt); \
2069 } \ 2055 } \
2070 static struct file_operations __fops = { \ 2056 static struct file_operations __fops = { \
2071 .owner = THIS_MODULE, \ 2057 .owner = THIS_MODULE, \
2072 .open = __fops ## _open, \ 2058 .open = __fops ## _open, \
2073 .release = simple_attr_close, \ 2059 .release = simple_attr_close, \
2074 .read = simple_attr_read, \ 2060 .read = simple_attr_read, \
2075 .write = simple_attr_write, \ 2061 .write = simple_attr_write, \
2076 }; 2062 };
2077 2063
2078 static inline void __attribute__((format(printf, 1, 2))) 2064 static inline void __attribute__((format(printf, 1, 2)))
2079 __simple_attr_check_format(const char *fmt, ...) 2065 __simple_attr_check_format(const char *fmt, ...)
2080 { 2066 {
2081 /* don't do anything, just let the compiler check the arguments; */ 2067 /* don't do anything, just let the compiler check the arguments; */
2082 } 2068 }
2083 2069
2084 int simple_attr_open(struct inode *inode, struct file *file, 2070 int simple_attr_open(struct inode *inode, struct file *file,
2085 u64 (*get)(void *), void (*set)(void *, u64), 2071 u64 (*get)(void *), void (*set)(void *, u64),
2086 const char *fmt); 2072 const char *fmt);
2087 int simple_attr_close(struct inode *inode, struct file *file); 2073 int simple_attr_close(struct inode *inode, struct file *file);
2088 ssize_t simple_attr_read(struct file *file, char __user *buf, 2074 ssize_t simple_attr_read(struct file *file, char __user *buf,
2089 size_t len, loff_t *ppos); 2075 size_t len, loff_t *ppos);
2090 ssize_t simple_attr_write(struct file *file, const char __user *buf, 2076 ssize_t simple_attr_write(struct file *file, const char __user *buf,
2091 size_t len, loff_t *ppos); 2077 size_t len, loff_t *ppos);
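The DEFINE_SIMPLE_ATTRIBUTE() machinery above is what debugfs-style single-value files are usually built from: the macro generates an open routine that binds the given get/set callbacks and printf format, while simple_attr_read() and simple_attr_write() handle the text conversion, and __simple_attr_check_format() exists only so the compiler can type-check the format string.  A hypothetical user, with made-up thingy_* names:

	/* hypothetical value exported as a text attribute file */
	static u64 thingy_threshold;

	static u64 thingy_threshold_get(void *data)
	{
		/* data is what the creator stashed in the inode (typically i_private) */
		return thingy_threshold;
	}

	static void thingy_threshold_set(void *data, u64 val)
	{
		thingy_threshold = val;
	}

	DEFINE_SIMPLE_ATTRIBUTE(thingy_threshold_fops, thingy_threshold_get,
				thingy_threshold_set, "%llu\n");

The resulting thingy_threshold_fops would then typically be passed to something like debugfs_create_file() to expose the value.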
2092 2078
2093 2079
2094 #ifdef CONFIG_SECURITY 2080 #ifdef CONFIG_SECURITY
2095 static inline char *alloc_secdata(void) 2081 static inline char *alloc_secdata(void)
2096 { 2082 {
2097 return (char *)get_zeroed_page(GFP_KERNEL); 2083 return (char *)get_zeroed_page(GFP_KERNEL);
2098 } 2084 }
2099 2085
2100 static inline void free_secdata(void *secdata) 2086 static inline void free_secdata(void *secdata)
2101 { 2087 {
2102 free_page((unsigned long)secdata); 2088 free_page((unsigned long)secdata);
2103 } 2089 }
2104 #else 2090 #else
2105 static inline char *alloc_secdata(void) 2091 static inline char *alloc_secdata(void)
2106 { 2092 {
2107 return (char *)1; 2093 return (char *)1;
2108 } 2094 }
2109 2095
2110 static inline void free_secdata(void *secdata) 2096 static inline void free_secdata(void *secdata)
2111 { } 2097 { }
2112 #endif /* CONFIG_SECURITY */ 2098 #endif /* CONFIG_SECURITY */
2113 2099
2114 struct ctl_table; 2100 struct ctl_table;
2115 int proc_nr_files(struct ctl_table *table, int write, struct file *filp, 2101 int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
2116 void __user *buffer, size_t *lenp, loff_t *ppos); 2102 void __user *buffer, size_t *lenp, loff_t *ppos);
2117 2103
2118 int get_filesystem_list(char * buf); 2104 int get_filesystem_list(char * buf);
2119 2105
2120 #endif /* __KERNEL__ */ 2106 #endif /* __KERNEL__ */
2121 #endif /* _LINUX_FS_H */ 2107 #endif /* _LINUX_FS_H */
2122 2108