Commit 12debc4248a4a7f1873e47cda2cdd7faca80b099
Committed by
Linus Torvalds
1 parent
755aedc159
Exists in
master
and in
4 other branches
iget: remove iget() and the read_inode() super op as being obsolete
Remove the old iget() call and the read_inode() superblock operation it uses as these are really obsolete, and the use of read_inode() does not produce proper error handling (no distinction between ENOMEM and EIO when marking an inode bad). Furthermore, this removes the temptation to use iget() to find an inode by number in a filesystem from code outside that filesystem. iget_locked() should be used instead. A new function is added in an earlier patch (iget_failed) that is to be called to mark an inode as bad, unlock it and release it should the get routine fail. Mark iget() and read_inode() as being obsolete and remove references to them from the documentation. Typically a filesystem will be modified such that the read_inode function becomes an internal iget function, for example the following: void thingyfs_read_inode(struct inode *inode) { ... } would be changed into something like: struct inode *thingyfs_iget(struct super_block *sb, unsigned long ino) { struct inode *inode; int ret; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; ... unlock_new_inode(inode); return inode; error: iget_failed(inode); return ERR_PTR(ret); } and then thingyfs_iget() would be called rather than iget(), for example: ret = -EINVAL; inode = iget(sb, ino); if (!inode || is_bad_inode(inode)) goto error; becomes: inode = thingyfs_iget(sb, ino); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto error; } Note that is_bad_inode() does not need to be called. The error returned by thingyfs_iget() should render it unnecessary. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 5 changed files with 9 additions and 41 deletions Inline Diff
Documentation/filesystems/Locking
1 | The text below describes the locking rules for VFS-related methods. | 1 | The text below describes the locking rules for VFS-related methods. |
2 | It is (believed to be) up-to-date. *Please*, if you change anything in | 2 | It is (believed to be) up-to-date. *Please*, if you change anything in |
3 | prototypes or locking protocols - update this file. And update the relevant | 3 | prototypes or locking protocols - update this file. And update the relevant |
4 | instances in the tree, don't leave that to maintainers of filesystems/devices/ | 4 | instances in the tree, don't leave that to maintainers of filesystems/devices/ |
5 | etc. At the very least, put the list of dubious cases in the end of this file. | 5 | etc. At the very least, put the list of dubious cases in the end of this file. |
6 | Don't turn it into a log - maintainers of out-of-the-tree code are supposed to | 6 | Don't turn it into a log - maintainers of out-of-the-tree code are supposed to |
7 | be able to use diff(1). | 7 | be able to use diff(1). |
8 | Things currently missing here: socket operations. Alexey? | 8 | Things currently missing here: socket operations. Alexey? |
9 | 9 | ||
10 | --------------------------- dentry_operations -------------------------- | 10 | --------------------------- dentry_operations -------------------------- |
11 | prototypes: | 11 | prototypes: |
12 | int (*d_revalidate)(struct dentry *, int); | 12 | int (*d_revalidate)(struct dentry *, int); |
13 | int (*d_hash) (struct dentry *, struct qstr *); | 13 | int (*d_hash) (struct dentry *, struct qstr *); |
14 | int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); | 14 | int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); |
15 | int (*d_delete)(struct dentry *); | 15 | int (*d_delete)(struct dentry *); |
16 | void (*d_release)(struct dentry *); | 16 | void (*d_release)(struct dentry *); |
17 | void (*d_iput)(struct dentry *, struct inode *); | 17 | void (*d_iput)(struct dentry *, struct inode *); |
18 | char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); | 18 | char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); |
19 | 19 | ||
20 | locking rules: | 20 | locking rules: |
21 | none have BKL | 21 | none have BKL |
22 | dcache_lock rename_lock ->d_lock may block | 22 | dcache_lock rename_lock ->d_lock may block |
23 | d_revalidate: no no no yes | 23 | d_revalidate: no no no yes |
24 | d_hash: no no no yes | 24 | d_hash: no no no yes |
25 | d_compare: no yes no no | 25 | d_compare: no yes no no |
26 | d_delete: yes no yes no | 26 | d_delete: yes no yes no |
27 | d_release: no no no yes | 27 | d_release: no no no yes |
28 | d_iput: no no no yes | 28 | d_iput: no no no yes |
29 | d_dname: no no no no | 29 | d_dname: no no no no |
30 | 30 | ||
31 | --------------------------- inode_operations --------------------------- | 31 | --------------------------- inode_operations --------------------------- |
32 | prototypes: | 32 | prototypes: |
33 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); | 33 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); |
34 | struct dentry * (*lookup) (struct inode *,struct dentry *, | 34 | struct dentry * (*lookup) (struct inode *,struct dentry *, |
35 | struct nameidata *); | 35 | struct nameidata *); |
36 | int (*link) (struct dentry *,struct inode *,struct dentry *); | 36 | int (*link) (struct dentry *,struct inode *,struct dentry *); |
37 | int (*unlink) (struct inode *,struct dentry *); | 37 | int (*unlink) (struct inode *,struct dentry *); |
38 | int (*symlink) (struct inode *,struct dentry *,const char *); | 38 | int (*symlink) (struct inode *,struct dentry *,const char *); |
39 | int (*mkdir) (struct inode *,struct dentry *,int); | 39 | int (*mkdir) (struct inode *,struct dentry *,int); |
40 | int (*rmdir) (struct inode *,struct dentry *); | 40 | int (*rmdir) (struct inode *,struct dentry *); |
41 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); | 41 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); |
42 | int (*rename) (struct inode *, struct dentry *, | 42 | int (*rename) (struct inode *, struct dentry *, |
43 | struct inode *, struct dentry *); | 43 | struct inode *, struct dentry *); |
44 | int (*readlink) (struct dentry *, char __user *,int); | 44 | int (*readlink) (struct dentry *, char __user *,int); |
45 | int (*follow_link) (struct dentry *, struct nameidata *); | 45 | int (*follow_link) (struct dentry *, struct nameidata *); |
46 | void (*truncate) (struct inode *); | 46 | void (*truncate) (struct inode *); |
47 | int (*permission) (struct inode *, int, struct nameidata *); | 47 | int (*permission) (struct inode *, int, struct nameidata *); |
48 | int (*setattr) (struct dentry *, struct iattr *); | 48 | int (*setattr) (struct dentry *, struct iattr *); |
49 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); | 49 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); |
50 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 50 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
51 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); | 51 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); |
52 | ssize_t (*listxattr) (struct dentry *, char *, size_t); | 52 | ssize_t (*listxattr) (struct dentry *, char *, size_t); |
53 | int (*removexattr) (struct dentry *, const char *); | 53 | int (*removexattr) (struct dentry *, const char *); |
54 | 54 | ||
55 | locking rules: | 55 | locking rules: |
56 | all may block, none have BKL | 56 | all may block, none have BKL |
57 | i_mutex(inode) | 57 | i_mutex(inode) |
58 | lookup: yes | 58 | lookup: yes |
59 | create: yes | 59 | create: yes |
60 | link: yes (both) | 60 | link: yes (both) |
61 | mknod: yes | 61 | mknod: yes |
62 | symlink: yes | 62 | symlink: yes |
63 | mkdir: yes | 63 | mkdir: yes |
64 | unlink: yes (both) | 64 | unlink: yes (both) |
65 | rmdir: yes (both) (see below) | 65 | rmdir: yes (both) (see below) |
66 | rename: yes (all) (see below) | 66 | rename: yes (all) (see below) |
67 | readlink: no | 67 | readlink: no |
68 | follow_link: no | 68 | follow_link: no |
69 | truncate: yes (see below) | 69 | truncate: yes (see below) |
70 | setattr: yes | 70 | setattr: yes |
71 | permission: no | 71 | permission: no |
72 | getattr: no | 72 | getattr: no |
73 | setxattr: yes | 73 | setxattr: yes |
74 | getxattr: no | 74 | getxattr: no |
75 | listxattr: no | 75 | listxattr: no |
76 | removexattr: yes | 76 | removexattr: yes |
77 | Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on | 77 | Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on |
78 | victim. | 78 | victim. |
79 | cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. | 79 | cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. |
80 | ->truncate() is never called directly - it's a callback, not a | 80 | ->truncate() is never called directly - it's a callback, not a |
81 | method. It's called by vmtruncate() - library function normally used by | 81 | method. It's called by vmtruncate() - library function normally used by |
82 | ->setattr(). Locking information above applies to that call (i.e. is | 82 | ->setattr(). Locking information above applies to that call (i.e. is |
83 | inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been | 83 | inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been |
84 | passed). | 84 | passed). |
85 | 85 | ||
86 | See Documentation/filesystems/directory-locking for more detailed discussion | 86 | See Documentation/filesystems/directory-locking for more detailed discussion |
87 | of the locking scheme for directory operations. | 87 | of the locking scheme for directory operations. |
88 | 88 | ||
89 | --------------------------- super_operations --------------------------- | 89 | --------------------------- super_operations --------------------------- |
90 | prototypes: | 90 | prototypes: |
91 | struct inode *(*alloc_inode)(struct super_block *sb); | 91 | struct inode *(*alloc_inode)(struct super_block *sb); |
92 | void (*destroy_inode)(struct inode *); | 92 | void (*destroy_inode)(struct inode *); |
93 | void (*read_inode) (struct inode *); | ||
94 | void (*dirty_inode) (struct inode *); | 93 | void (*dirty_inode) (struct inode *); |
95 | int (*write_inode) (struct inode *, int); | 94 | int (*write_inode) (struct inode *, int); |
96 | void (*put_inode) (struct inode *); | 95 | void (*put_inode) (struct inode *); |
97 | void (*drop_inode) (struct inode *); | 96 | void (*drop_inode) (struct inode *); |
98 | void (*delete_inode) (struct inode *); | 97 | void (*delete_inode) (struct inode *); |
99 | void (*put_super) (struct super_block *); | 98 | void (*put_super) (struct super_block *); |
100 | void (*write_super) (struct super_block *); | 99 | void (*write_super) (struct super_block *); |
101 | int (*sync_fs)(struct super_block *sb, int wait); | 100 | int (*sync_fs)(struct super_block *sb, int wait); |
102 | void (*write_super_lockfs) (struct super_block *); | 101 | void (*write_super_lockfs) (struct super_block *); |
103 | void (*unlockfs) (struct super_block *); | 102 | void (*unlockfs) (struct super_block *); |
104 | int (*statfs) (struct dentry *, struct kstatfs *); | 103 | int (*statfs) (struct dentry *, struct kstatfs *); |
105 | int (*remount_fs) (struct super_block *, int *, char *); | 104 | int (*remount_fs) (struct super_block *, int *, char *); |
106 | void (*clear_inode) (struct inode *); | 105 | void (*clear_inode) (struct inode *); |
107 | void (*umount_begin) (struct super_block *); | 106 | void (*umount_begin) (struct super_block *); |
108 | int (*show_options)(struct seq_file *, struct vfsmount *); | 107 | int (*show_options)(struct seq_file *, struct vfsmount *); |
109 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); | 108 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); |
110 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); | 109 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); |
111 | 110 | ||
112 | locking rules: | 111 | locking rules: |
113 | All may block. | 112 | All may block. |
114 | BKL s_lock s_umount | 113 | BKL s_lock s_umount |
115 | alloc_inode: no no no | 114 | alloc_inode: no no no |
116 | destroy_inode: no | 115 | destroy_inode: no |
117 | read_inode: no (see below) | ||
118 | dirty_inode: no (must not sleep) | 116 | dirty_inode: no (must not sleep) |
119 | write_inode: no | 117 | write_inode: no |
120 | put_inode: no | 118 | put_inode: no |
121 | drop_inode: no !!!inode_lock!!! | 119 | drop_inode: no !!!inode_lock!!! |
122 | delete_inode: no | 120 | delete_inode: no |
123 | put_super: yes yes no | 121 | put_super: yes yes no |
124 | write_super: no yes read | 122 | write_super: no yes read |
125 | sync_fs: no no read | 123 | sync_fs: no no read |
126 | write_super_lockfs: ? | 124 | write_super_lockfs: ? |
127 | unlockfs: ? | 125 | unlockfs: ? |
128 | statfs: no no no | 126 | statfs: no no no |
129 | remount_fs: yes yes maybe (see below) | 127 | remount_fs: yes yes maybe (see below) |
130 | clear_inode: no | 128 | clear_inode: no |
131 | umount_begin: yes no no | 129 | umount_begin: yes no no |
132 | show_options: no (vfsmount->sem) | 130 | show_options: no (vfsmount->sem) |
133 | quota_read: no no no (see below) | 131 | quota_read: no no no (see below) |
134 | quota_write: no no no (see below) | 132 | quota_write: no no no (see below) |
135 | 133 | ||
136 | ->read_inode() is not a method - it's a callback used in iget(). | ||
137 | ->remount_fs() will have the s_umount lock if it's already mounted. | 134 | ->remount_fs() will have the s_umount lock if it's already mounted. |
138 | When called from get_sb_single, it does NOT have the s_umount lock. | 135 | When called from get_sb_single, it does NOT have the s_umount lock. |
139 | ->quota_read() and ->quota_write() functions are both guaranteed to | 136 | ->quota_read() and ->quota_write() functions are both guaranteed to |
140 | be the only ones operating on the quota file by the quota code (via | 137 | be the only ones operating on the quota file by the quota code (via |
141 | dqio_sem) (unless an admin really wants to screw up something and | 138 | dqio_sem) (unless an admin really wants to screw up something and |
142 | writes to quota files with quotas on). For other details about locking | 139 | writes to quota files with quotas on). For other details about locking |
143 | see also dquot_operations section. | 140 | see also dquot_operations section. |
144 | 141 | ||
145 | --------------------------- file_system_type --------------------------- | 142 | --------------------------- file_system_type --------------------------- |
146 | prototypes: | 143 | prototypes: |
147 | int (*get_sb) (struct file_system_type *, int, | 144 | int (*get_sb) (struct file_system_type *, int, |
148 | const char *, void *, struct vfsmount *); | 145 | const char *, void *, struct vfsmount *); |
149 | void (*kill_sb) (struct super_block *); | 146 | void (*kill_sb) (struct super_block *); |
150 | locking rules: | 147 | locking rules: |
151 | may block BKL | 148 | may block BKL |
152 | get_sb yes yes | 149 | get_sb yes yes |
153 | kill_sb yes yes | 150 | kill_sb yes yes |
154 | 151 | ||
155 | ->get_sb() returns error or 0 with locked superblock attached to the vfsmount | 152 | ->get_sb() returns error or 0 with locked superblock attached to the vfsmount |
156 | (exclusive on ->s_umount). | 153 | (exclusive on ->s_umount). |
157 | ->kill_sb() takes a write-locked superblock, does all shutdown work on it, | 154 | ->kill_sb() takes a write-locked superblock, does all shutdown work on it, |
158 | unlocks and drops the reference. | 155 | unlocks and drops the reference. |
159 | 156 | ||
160 | --------------------------- address_space_operations -------------------------- | 157 | --------------------------- address_space_operations -------------------------- |
161 | prototypes: | 158 | prototypes: |
162 | int (*writepage)(struct page *page, struct writeback_control *wbc); | 159 | int (*writepage)(struct page *page, struct writeback_control *wbc); |
163 | int (*readpage)(struct file *, struct page *); | 160 | int (*readpage)(struct file *, struct page *); |
164 | int (*sync_page)(struct page *); | 161 | int (*sync_page)(struct page *); |
165 | int (*writepages)(struct address_space *, struct writeback_control *); | 162 | int (*writepages)(struct address_space *, struct writeback_control *); |
166 | int (*set_page_dirty)(struct page *page); | 163 | int (*set_page_dirty)(struct page *page); |
167 | int (*readpages)(struct file *filp, struct address_space *mapping, | 164 | int (*readpages)(struct file *filp, struct address_space *mapping, |
168 | struct list_head *pages, unsigned nr_pages); | 165 | struct list_head *pages, unsigned nr_pages); |
169 | int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); | 166 | int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); |
170 | int (*commit_write)(struct file *, struct page *, unsigned, unsigned); | 167 | int (*commit_write)(struct file *, struct page *, unsigned, unsigned); |
171 | sector_t (*bmap)(struct address_space *, sector_t); | 168 | sector_t (*bmap)(struct address_space *, sector_t); |
172 | int (*invalidatepage) (struct page *, unsigned long); | 169 | int (*invalidatepage) (struct page *, unsigned long); |
173 | int (*releasepage) (struct page *, int); | 170 | int (*releasepage) (struct page *, int); |
174 | int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 171 | int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
175 | loff_t offset, unsigned long nr_segs); | 172 | loff_t offset, unsigned long nr_segs); |
176 | int (*launder_page) (struct page *); | 173 | int (*launder_page) (struct page *); |
177 | 174 | ||
178 | locking rules: | 175 | locking rules: |
179 | All except set_page_dirty may block | 176 | All except set_page_dirty may block |
180 | 177 | ||
181 | BKL PageLocked(page) i_sem | 178 | BKL PageLocked(page) i_sem |
182 | writepage: no yes, unlocks (see below) | 179 | writepage: no yes, unlocks (see below) |
183 | readpage: no yes, unlocks | 180 | readpage: no yes, unlocks |
184 | sync_page: no maybe | 181 | sync_page: no maybe |
185 | writepages: no | 182 | writepages: no |
186 | set_page_dirty: no no | 183 | set_page_dirty: no no |
187 | readpages: no | 184 | readpages: no |
188 | prepare_write: no yes yes | 185 | prepare_write: no yes yes |
189 | commit_write: no yes yes | 186 | commit_write: no yes yes |
190 | write_begin: no locks the page yes | 187 | write_begin: no locks the page yes |
191 | write_end: no yes, unlocks yes | 188 | write_end: no yes, unlocks yes |
192 | perform_write: no n/a yes | 189 | perform_write: no n/a yes |
193 | bmap: yes | 190 | bmap: yes |
194 | invalidatepage: no yes | 191 | invalidatepage: no yes |
195 | releasepage: no yes | 192 | releasepage: no yes |
196 | direct_IO: no | 193 | direct_IO: no |
197 | launder_page: no yes | 194 | launder_page: no yes |
198 | 195 | ||
199 | ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() | 196 | ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() |
200 | may be called from the request handler (/dev/loop). | 197 | may be called from the request handler (/dev/loop). |
201 | 198 | ||
202 | ->readpage() unlocks the page, either synchronously or via I/O | 199 | ->readpage() unlocks the page, either synchronously or via I/O |
203 | completion. | 200 | completion. |
204 | 201 | ||
205 | ->readpages() populates the pagecache with the passed pages and starts | 202 | ->readpages() populates the pagecache with the passed pages and starts |
206 | I/O against them. They are unlocked upon I/O completion. | 203 | I/O against them. They are unlocked upon I/O completion. |
207 | 204 | ||
208 | ->writepage() is used for two purposes: for "memory cleansing" and for | 205 | ->writepage() is used for two purposes: for "memory cleansing" and for |
209 | "sync". These are quite different operations and the behaviour may differ | 206 | "sync". These are quite different operations and the behaviour may differ |
210 | depending upon the mode. | 207 | depending upon the mode. |
211 | 208 | ||
212 | If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then | 209 | If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then |
213 | it *must* start I/O against the page, even if that would involve | 210 | it *must* start I/O against the page, even if that would involve |
214 | blocking on in-progress I/O. | 211 | blocking on in-progress I/O. |
215 | 212 | ||
216 | If writepage is called for memory cleansing (sync_mode == | 213 | If writepage is called for memory cleansing (sync_mode == |
217 | WBC_SYNC_NONE) then its role is to get as much writeout underway as | 214 | WBC_SYNC_NONE) then its role is to get as much writeout underway as |
218 | possible. So writepage should try to avoid blocking against | 215 | possible. So writepage should try to avoid blocking against |
219 | currently-in-progress I/O. | 216 | currently-in-progress I/O. |
220 | 217 | ||
221 | If the filesystem is not called for "sync" and it determines that it | 218 | If the filesystem is not called for "sync" and it determines that it |
222 | would need to block against in-progress I/O to be able to start new I/O | 219 | would need to block against in-progress I/O to be able to start new I/O |
223 | against the page the filesystem should redirty the page with | 220 | against the page the filesystem should redirty the page with |
224 | redirty_page_for_writepage(), then unlock the page and return zero. | 221 | redirty_page_for_writepage(), then unlock the page and return zero. |
225 | This may also be done to avoid internal deadlocks, but rarely. | 222 | This may also be done to avoid internal deadlocks, but rarely. |
226 | 223 | ||
227 | If the filesystem is called for sync then it must wait on any | 224 | If the filesystem is called for sync then it must wait on any |
228 | in-progress I/O and then start new I/O. | 225 | in-progress I/O and then start new I/O. |
229 | 226 | ||
230 | The filesystem should unlock the page synchronously, before returning to the | 227 | The filesystem should unlock the page synchronously, before returning to the |
231 | caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE | 228 | caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE |
232 | value. WRITEPAGE_ACTIVATE means that page cannot really be written out | 229 | value. WRITEPAGE_ACTIVATE means that page cannot really be written out |
233 | currently, and VM should stop calling ->writepage() on this page for some | 230 | currently, and VM should stop calling ->writepage() on this page for some |
234 | time. VM does this by moving page to the head of the active list, hence the | 231 | time. VM does this by moving page to the head of the active list, hence the |
235 | name. | 232 | name. |
236 | 233 | ||
237 | Unless the filesystem is going to redirty_page_for_writepage(), unlock the page | 234 | Unless the filesystem is going to redirty_page_for_writepage(), unlock the page |
238 | and return zero, writepage *must* run set_page_writeback() against the page, | 235 | and return zero, writepage *must* run set_page_writeback() against the page, |
239 | followed by unlocking it. Once set_page_writeback() has been run against the | 236 | followed by unlocking it. Once set_page_writeback() has been run against the |
240 | page, write I/O can be submitted and the write I/O completion handler must run | 237 | page, write I/O can be submitted and the write I/O completion handler must run |
241 | end_page_writeback() once the I/O is complete. If no I/O is submitted, the | 238 | end_page_writeback() once the I/O is complete. If no I/O is submitted, the |
242 | filesystem must run end_page_writeback() against the page before returning from | 239 | filesystem must run end_page_writeback() against the page before returning from |
243 | writepage. | 240 | writepage. |
244 | 241 | ||
245 | That is: after 2.5.12, pages which are under writeout are *not* locked. Note, | 242 | That is: after 2.5.12, pages which are under writeout are *not* locked. Note, |
246 | if the filesystem needs the page to be locked during writeout, that is ok, too, | 243 | if the filesystem needs the page to be locked during writeout, that is ok, too, |
247 | the page is allowed to be unlocked at any point in time between the calls to | 244 | the page is allowed to be unlocked at any point in time between the calls to |
248 | set_page_writeback() and end_page_writeback(). | 245 | set_page_writeback() and end_page_writeback(). |
249 | 246 | ||
250 | Note, failure to run either redirty_page_for_writepage() or the combination of | 247 | Note, failure to run either redirty_page_for_writepage() or the combination of |
251 | set_page_writeback()/end_page_writeback() on a page submitted to writepage | 248 | set_page_writeback()/end_page_writeback() on a page submitted to writepage |
252 | will leave the page itself marked clean but it will be tagged as dirty in the | 249 | will leave the page itself marked clean but it will be tagged as dirty in the |
253 | radix tree. This incoherency can lead to all sorts of hard-to-debug problems | 250 | radix tree. This incoherency can lead to all sorts of hard-to-debug problems |
254 | in the filesystem like having dirty inodes at umount and losing written data. | 251 | in the filesystem like having dirty inodes at umount and losing written data. |
255 | 252 | ||
256 | ->sync_page() locking rules are not well-defined - usually it is called | 253 | ->sync_page() locking rules are not well-defined - usually it is called |
257 | with lock on page, but that is not guaranteed. Considering the currently | 254 | with lock on page, but that is not guaranteed. Considering the currently |
258 | existing instances of this method ->sync_page() itself doesn't look | 255 | existing instances of this method ->sync_page() itself doesn't look |
259 | well-defined... | 256 | well-defined... |
260 | 257 | ||
261 | ->writepages() is used for periodic writeback and for syscall-initiated | 258 | ->writepages() is used for periodic writeback and for syscall-initiated |
262 | sync operations. The address_space should start I/O against at least | 259 | sync operations. The address_space should start I/O against at least |
263 | *nr_to_write pages. *nr_to_write must be decremented for each page which is | 260 | *nr_to_write pages. *nr_to_write must be decremented for each page which is |
264 | written. The address_space implementation may write more (or fewer) pages | 261 | written. The address_space implementation may write more (or fewer) pages |
265 | than *nr_to_write asks for, but it should try to be reasonably close. If | 262 | than *nr_to_write asks for, but it should try to be reasonably close. If |
266 | nr_to_write is NULL, all dirty pages must be written. | 263 | nr_to_write is NULL, all dirty pages must be written. |
267 | 264 | ||
268 | writepages should _only_ write pages which are present on | 265 | writepages should _only_ write pages which are present on |
269 | mapping->io_pages. | 266 | mapping->io_pages. |
270 | 267 | ||
271 | ->set_page_dirty() is called from various places in the kernel | 268 | ->set_page_dirty() is called from various places in the kernel |
272 | when the target page is marked as needing writeback. It may be called | 269 | when the target page is marked as needing writeback. It may be called |
273 | under spinlock (it cannot block) and is sometimes called with the page | 270 | under spinlock (it cannot block) and is sometimes called with the page |
274 | not locked. | 271 | not locked. |
275 | 272 | ||
276 | ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some | 273 | ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some |
277 | filesystems and by the swapper. The latter will eventually go away. No | 274 | filesystems and by the swapper. The latter will eventually go away. No |
278 | instance actually needs the BKL. Please, keep it that way and don't | 275 | instance actually needs the BKL. Please, keep it that way and don't |
279 | breed new callers. | 276 | breed new callers. |
280 | 277 | ||
281 | ->invalidatepage() is called when the filesystem must attempt to drop | 278 | ->invalidatepage() is called when the filesystem must attempt to drop |
282 | some or all of the buffers from the page when it is being truncated. It | 279 | some or all of the buffers from the page when it is being truncated. It |
283 | returns zero on success. If ->invalidatepage is zero, the kernel uses | 280 | returns zero on success. If ->invalidatepage is zero, the kernel uses |
284 | block_invalidatepage() instead. | 281 | block_invalidatepage() instead. |
285 | 282 | ||
286 | ->releasepage() is called when the kernel is about to try to drop the | 283 | ->releasepage() is called when the kernel is about to try to drop the |
287 | buffers from the page in preparation for freeing it. It returns zero to | 284 | buffers from the page in preparation for freeing it. It returns zero to |
288 | indicate that the buffers are (or may be) freeable. If ->releasepage is zero, | 285 | indicate that the buffers are (or may be) freeable. If ->releasepage is zero, |
289 | the kernel assumes that the fs has no private interest in the buffers. | 286 | the kernel assumes that the fs has no private interest in the buffers. |
290 | 287 | ||
291 | ->launder_page() may be called prior to releasing a page if | 288 | ->launder_page() may be called prior to releasing a page if |
292 | it is still found to be dirty. It returns zero if the page was successfully | 289 | it is still found to be dirty. It returns zero if the page was successfully |
293 | cleaned, or an error value if not. Note that in order to prevent the page | 290 | cleaned, or an error value if not. Note that in order to prevent the page |
294 | getting mapped back in and redirtied, it needs to be kept locked | 291 | getting mapped back in and redirtied, it needs to be kept locked |
295 | across the entire operation. | 292 | across the entire operation. |
296 | 293 | ||
297 | Note: currently almost all instances of address_space methods are | 294 | Note: currently almost all instances of address_space methods are |
298 | using BKL for internal serialization and that's one of the worst sources | 295 | using BKL for internal serialization and that's one of the worst sources |
299 | of contention. Normally they are calling library functions (in fs/buffer.c) | 296 | of contention. Normally they are calling library functions (in fs/buffer.c) |
300 | and pass foo_get_block() as a callback (on local block-based filesystems, | 297 | and pass foo_get_block() as a callback (on local block-based filesystems, |
301 | indeed). BKL is not needed for library stuff and is usually taken by | 298 | indeed). BKL is not needed for library stuff and is usually taken by |
302 | foo_get_block(). It's overkill, since block bitmaps can be protected by | 299 | foo_get_block(). It's overkill, since block bitmaps can be protected by |
303 | internal fs locking and real critical areas are much smaller than the areas | 300 | internal fs locking and real critical areas are much smaller than the areas |
304 | filesystems protect now. | 301 | filesystems protect now. |
305 | 302 | ||
306 | ----------------------- file_lock_operations ------------------------------ | 303 | ----------------------- file_lock_operations ------------------------------ |
307 | prototypes: | 304 | prototypes: |
308 | void (*fl_insert)(struct file_lock *); /* lock insertion callback */ | 305 | void (*fl_insert)(struct file_lock *); /* lock insertion callback */ |
309 | void (*fl_remove)(struct file_lock *); /* lock removal callback */ | 306 | void (*fl_remove)(struct file_lock *); /* lock removal callback */ |
310 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 307 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
311 | void (*fl_release_private)(struct file_lock *); | 308 | void (*fl_release_private)(struct file_lock *); |
312 | 309 | ||
313 | 310 | ||
314 | locking rules: | 311 | locking rules: |
315 | BKL may block | 312 | BKL may block |
316 | fl_insert: yes no | 313 | fl_insert: yes no |
317 | fl_remove: yes no | 314 | fl_remove: yes no |
318 | fl_copy_lock: yes no | 315 | fl_copy_lock: yes no |
319 | fl_release_private: yes yes | 316 | fl_release_private: yes yes |
320 | 317 | ||
321 | ----------------------- lock_manager_operations --------------------------- | 318 | ----------------------- lock_manager_operations --------------------------- |
322 | prototypes: | 319 | prototypes: |
323 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); | 320 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); |
324 | void (*fl_notify)(struct file_lock *); /* unblock callback */ | 321 | void (*fl_notify)(struct file_lock *); /* unblock callback */ |
325 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 322 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
326 | void (*fl_release_private)(struct file_lock *); | 323 | void (*fl_release_private)(struct file_lock *); |
327 | void (*fl_break)(struct file_lock *); /* break_lease callback */ | 324 | void (*fl_break)(struct file_lock *); /* break_lease callback */ |
328 | 325 | ||
329 | locking rules: | 326 | locking rules: |
330 | BKL may block | 327 | BKL may block |
331 | fl_compare_owner: yes no | 328 | fl_compare_owner: yes no |
332 | fl_notify: yes no | 329 | fl_notify: yes no |
333 | fl_copy_lock: yes no | 330 | fl_copy_lock: yes no |
334 | fl_release_private: yes yes | 331 | fl_release_private: yes yes |
335 | fl_break: yes no | 332 | fl_break: yes no |
336 | 333 | ||
337 | Currently only NFSD and NLM provide instances of this class. None of | 334 | Currently only NFSD and NLM provide instances of this class. None of |
338 | them block. If you have out-of-tree instances - please, show up. Locking | 335 | them block. If you have out-of-tree instances - please, show up. Locking |
339 | in that area will change. | 336 | in that area will change. |
340 | --------------------------- buffer_head ----------------------------------- | 337 | --------------------------- buffer_head ----------------------------------- |
341 | prototypes: | 338 | prototypes: |
342 | void (*b_end_io)(struct buffer_head *bh, int uptodate); | 339 | void (*b_end_io)(struct buffer_head *bh, int uptodate); |
343 | 340 | ||
344 | locking rules: | 341 | locking rules: |
345 | called from interrupts. In other words, extreme care is needed here. | 342 | called from interrupts. In other words, extreme care is needed here. |
346 | bh is locked, but that's all warranties we have here. Currently only RAID1, | 343 | bh is locked, but that's all warranties we have here. Currently only RAID1, |
347 | highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices | 344 | highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices |
348 | call this method upon the IO completion. | 345 | call this method upon the IO completion. |
349 | 346 | ||
350 | --------------------------- block_device_operations ----------------------- | 347 | --------------------------- block_device_operations ----------------------- |
351 | prototypes: | 348 | prototypes: |
352 | int (*open) (struct inode *, struct file *); | 349 | int (*open) (struct inode *, struct file *); |
353 | int (*release) (struct inode *, struct file *); | 350 | int (*release) (struct inode *, struct file *); |
354 | int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); | 351 | int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); |
355 | int (*media_changed) (struct gendisk *); | 352 | int (*media_changed) (struct gendisk *); |
356 | int (*revalidate_disk) (struct gendisk *); | 353 | int (*revalidate_disk) (struct gendisk *); |
357 | 354 | ||
358 | locking rules: | 355 | locking rules: |
359 | BKL bd_sem | 356 | BKL bd_sem |
360 | open: yes yes | 357 | open: yes yes |
361 | release: yes yes | 358 | release: yes yes |
362 | ioctl: yes no | 359 | ioctl: yes no |
363 | media_changed: no no | 360 | media_changed: no no |
364 | revalidate_disk: no no | 361 | revalidate_disk: no no |
365 | 362 | ||
366 | The last two are called only from check_disk_change(). | 363 | The last two are called only from check_disk_change(). |
367 | 364 | ||
368 | --------------------------- file_operations ------------------------------- | 365 | --------------------------- file_operations ------------------------------- |
369 | prototypes: | 366 | prototypes: |
370 | loff_t (*llseek) (struct file *, loff_t, int); | 367 | loff_t (*llseek) (struct file *, loff_t, int); |
371 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); | 368 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); |
372 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); | 369 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); |
373 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 370 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
374 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 371 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
375 | int (*readdir) (struct file *, void *, filldir_t); | 372 | int (*readdir) (struct file *, void *, filldir_t); |
376 | unsigned int (*poll) (struct file *, struct poll_table_struct *); | 373 | unsigned int (*poll) (struct file *, struct poll_table_struct *); |
377 | int (*ioctl) (struct inode *, struct file *, unsigned int, | 374 | int (*ioctl) (struct inode *, struct file *, unsigned int, |
378 | unsigned long); | 375 | unsigned long); |
379 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); | 376 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); |
380 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); | 377 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); |
381 | int (*mmap) (struct file *, struct vm_area_struct *); | 378 | int (*mmap) (struct file *, struct vm_area_struct *); |
382 | int (*open) (struct inode *, struct file *); | 379 | int (*open) (struct inode *, struct file *); |
383 | int (*flush) (struct file *); | 380 | int (*flush) (struct file *); |
384 | int (*release) (struct inode *, struct file *); | 381 | int (*release) (struct inode *, struct file *); |
385 | int (*fsync) (struct file *, struct dentry *, int datasync); | 382 | int (*fsync) (struct file *, struct dentry *, int datasync); |
386 | int (*aio_fsync) (struct kiocb *, int datasync); | 383 | int (*aio_fsync) (struct kiocb *, int datasync); |
387 | int (*fasync) (int, struct file *, int); | 384 | int (*fasync) (int, struct file *, int); |
388 | int (*lock) (struct file *, int, struct file_lock *); | 385 | int (*lock) (struct file *, int, struct file_lock *); |
389 | ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, | 386 | ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, |
390 | loff_t *); | 387 | loff_t *); |
391 | ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, | 388 | ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, |
392 | loff_t *); | 389 | loff_t *); |
393 | ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, | 390 | ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, |
394 | void __user *); | 391 | void __user *); |
395 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, | 392 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, |
396 | loff_t *, int); | 393 | loff_t *, int); |
397 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, | 394 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, |
398 | unsigned long, unsigned long, unsigned long); | 395 | unsigned long, unsigned long, unsigned long); |
399 | int (*check_flags)(int); | 396 | int (*check_flags)(int); |
400 | int (*dir_notify)(struct file *, unsigned long); | 397 | int (*dir_notify)(struct file *, unsigned long); |
401 | }; | 398 | }; |
402 | 399 | ||
403 | locking rules: | 400 | locking rules: |
404 | All except ->poll() may block. | 401 | All except ->poll() may block. |
405 | BKL | 402 | BKL |
406 | llseek: no (see below) | 403 | llseek: no (see below) |
407 | read: no | 404 | read: no |
408 | aio_read: no | 405 | aio_read: no |
409 | write: no | 406 | write: no |
410 | aio_write: no | 407 | aio_write: no |
411 | readdir: no | 408 | readdir: no |
412 | poll: no | 409 | poll: no |
413 | ioctl: yes (see below) | 410 | ioctl: yes (see below) |
414 | unlocked_ioctl: no (see below) | 411 | unlocked_ioctl: no (see below) |
415 | compat_ioctl: no | 412 | compat_ioctl: no |
416 | mmap: no | 413 | mmap: no |
417 | open: maybe (see below) | 414 | open: maybe (see below) |
418 | flush: no | 415 | flush: no |
419 | release: no | 416 | release: no |
420 | fsync: no (see below) | 417 | fsync: no (see below) |
421 | aio_fsync: no | 418 | aio_fsync: no |
422 | fasync: yes (see below) | 419 | fasync: yes (see below) |
423 | lock: yes | 420 | lock: yes |
424 | readv: no | 421 | readv: no |
425 | writev: no | 422 | writev: no |
426 | sendfile: no | 423 | sendfile: no |
427 | sendpage: no | 424 | sendpage: no |
428 | get_unmapped_area: no | 425 | get_unmapped_area: no |
429 | check_flags: no | 426 | check_flags: no |
430 | dir_notify: no | 427 | dir_notify: no |
431 | 428 | ||
432 | ->llseek() locking has moved from llseek to the individual llseek | 429 | ->llseek() locking has moved from llseek to the individual llseek |
433 | implementations. If your fs is not using generic_file_llseek, you | 430 | implementations. If your fs is not using generic_file_llseek, you |
434 | need to acquire and release the appropriate locks in your ->llseek(). | 431 | need to acquire and release the appropriate locks in your ->llseek(). |
435 | For many filesystems, it is probably safe to acquire the inode | 432 | For many filesystems, it is probably safe to acquire the inode |
436 | semaphore. Note some filesystems (i.e. remote ones) provide no | 433 | semaphore. Note some filesystems (i.e. remote ones) provide no |
437 | protection for i_size so you will need to use the BKL. | 434 | protection for i_size so you will need to use the BKL. |
438 | 435 | ||
439 | ->open() locking is in-transit: big lock partially moved into the methods. | 436 | ->open() locking is in-transit: big lock partially moved into the methods. |
440 | The only exception is ->open() in the instances of file_operations that never | 437 | The only exception is ->open() in the instances of file_operations that never |
441 | end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices | 438 | end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices |
442 | (chrdev_open() takes lock before replacing ->f_op and calling the secondary | 439 | (chrdev_open() takes lock before replacing ->f_op and calling the secondary |
443 | method). As soon as we fix the handling of module reference counters all | 440 | method). As soon as we fix the handling of module reference counters all |
444 | instances of ->open() will be called without the BKL. | 441 | instances of ->open() will be called without the BKL. |
445 | 442 | ||
446 | Note: ext2_release() was *the* source of contention on fs-intensive | 443 | Note: ext2_release() was *the* source of contention on fs-intensive |
447 | loads and dropping BKL on ->release() helps to get rid of that (we still | 444 | loads and dropping BKL on ->release() helps to get rid of that (we still |
448 | grab BKL for cases when we close a file that had been opened r/w, but that | 445 | grab BKL for cases when we close a file that had been opened r/w, but that |
449 | can and should be done using the internal locking with smaller critical areas). | 446 | can and should be done using the internal locking with smaller critical areas). |
450 | Current worst offender is ext2_get_block()... | 447 | Current worst offender is ext2_get_block()... |
451 | 448 | ||
452 | ->fasync() is a mess. This area needs a big cleanup and that will probably | 449 | ->fasync() is a mess. This area needs a big cleanup and that will probably |
453 | affect locking. | 450 | affect locking. |
454 | 451 | ||
455 | ->readdir() and ->ioctl() on directories must be changed. Ideally we would | 452 | ->readdir() and ->ioctl() on directories must be changed. Ideally we would |
456 | move ->readdir() to inode_operations and use a separate method for directory | 453 | move ->readdir() to inode_operations and use a separate method for directory |
457 | ->ioctl() or kill the latter completely. One of the problems is that for | 454 | ->ioctl() or kill the latter completely. One of the problems is that for |
458 | anything that resembles union-mount we won't have a struct file for all | 455 | anything that resembles union-mount we won't have a struct file for all |
459 | components. And there are other reasons why the current interface is a mess... | 456 | components. And there are other reasons why the current interface is a mess... |
460 | 457 | ||
461 | ->ioctl() on regular files is superseded by the ->unlocked_ioctl() that | 458 | ->ioctl() on regular files is superseded by the ->unlocked_ioctl() that |
462 | doesn't take the BKL. | 459 | doesn't take the BKL. |
463 | 460 | ||
464 | ->read on directories probably must go away - we should just enforce -EISDIR | 461 | ->read on directories probably must go away - we should just enforce -EISDIR |
465 | in sys_read() and friends. | 462 | in sys_read() and friends. |
466 | 463 | ||
467 | ->fsync() has i_mutex on inode. | 464 | ->fsync() has i_mutex on inode. |
468 | 465 | ||
469 | --------------------------- dquot_operations ------------------------------- | 466 | --------------------------- dquot_operations ------------------------------- |
470 | prototypes: | 467 | prototypes: |
471 | int (*initialize) (struct inode *, int); | 468 | int (*initialize) (struct inode *, int); |
472 | int (*drop) (struct inode *); | 469 | int (*drop) (struct inode *); |
473 | int (*alloc_space) (struct inode *, qsize_t, int); | 470 | int (*alloc_space) (struct inode *, qsize_t, int); |
474 | int (*alloc_inode) (const struct inode *, unsigned long); | 471 | int (*alloc_inode) (const struct inode *, unsigned long); |
475 | int (*free_space) (struct inode *, qsize_t); | 472 | int (*free_space) (struct inode *, qsize_t); |
476 | int (*free_inode) (const struct inode *, unsigned long); | 473 | int (*free_inode) (const struct inode *, unsigned long); |
477 | int (*transfer) (struct inode *, struct iattr *); | 474 | int (*transfer) (struct inode *, struct iattr *); |
478 | int (*write_dquot) (struct dquot *); | 475 | int (*write_dquot) (struct dquot *); |
479 | int (*acquire_dquot) (struct dquot *); | 476 | int (*acquire_dquot) (struct dquot *); |
480 | int (*release_dquot) (struct dquot *); | 477 | int (*release_dquot) (struct dquot *); |
481 | int (*mark_dirty) (struct dquot *); | 478 | int (*mark_dirty) (struct dquot *); |
482 | int (*write_info) (struct super_block *, int); | 479 | int (*write_info) (struct super_block *, int); |
483 | 480 | ||
484 | These operations are intended to be more or less wrapping functions that ensure | 481 | These operations are intended to be more or less wrapping functions that ensure |
485 | a proper locking wrt the filesystem and call the generic quota operations. | 482 | a proper locking wrt the filesystem and call the generic quota operations. |
486 | 483 | ||
487 | What filesystem should expect from the generic quota functions: | 484 | What filesystem should expect from the generic quota functions: |
488 | 485 | ||
489 | FS recursion Held locks when called | 486 | FS recursion Held locks when called |
490 | initialize: yes maybe dqonoff_sem | 487 | initialize: yes maybe dqonoff_sem |
491 | drop: yes - | 488 | drop: yes - |
492 | alloc_space: ->mark_dirty() - | 489 | alloc_space: ->mark_dirty() - |
493 | alloc_inode: ->mark_dirty() - | 490 | alloc_inode: ->mark_dirty() - |
494 | free_space: ->mark_dirty() - | 491 | free_space: ->mark_dirty() - |
495 | free_inode: ->mark_dirty() - | 492 | free_inode: ->mark_dirty() - |
496 | transfer: yes - | 493 | transfer: yes - |
497 | write_dquot: yes dqonoff_sem or dqptr_sem | 494 | write_dquot: yes dqonoff_sem or dqptr_sem |
498 | acquire_dquot: yes dqonoff_sem or dqptr_sem | 495 | acquire_dquot: yes dqonoff_sem or dqptr_sem |
499 | release_dquot: yes dqonoff_sem or dqptr_sem | 496 | release_dquot: yes dqonoff_sem or dqptr_sem |
500 | mark_dirty: no - | 497 | mark_dirty: no - |
501 | write_info: yes dqonoff_sem | 498 | write_info: yes dqonoff_sem |
502 | 499 | ||
503 | FS recursion means calling ->quota_read() and ->quota_write() from superblock | 500 | FS recursion means calling ->quota_read() and ->quota_write() from superblock |
504 | operations. | 501 | operations. |
505 | 502 | ||
506 | ->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called | 503 | ->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called |
507 | only directly by the filesystem and do not call any fs functions except | 504 | only directly by the filesystem and do not call any fs functions except |
508 | the ->mark_dirty() operation. | 505 | the ->mark_dirty() operation. |
509 | 506 | ||
510 | More details about quota locking can be found in fs/dquot.c. | 507 | More details about quota locking can be found in fs/dquot.c. |
511 | 508 | ||
512 | --------------------------- vm_operations_struct ----------------------------- | 509 | --------------------------- vm_operations_struct ----------------------------- |
513 | prototypes: | 510 | prototypes: |
514 | void (*open)(struct vm_area_struct*); | 511 | void (*open)(struct vm_area_struct*); |
515 | void (*close)(struct vm_area_struct*); | 512 | void (*close)(struct vm_area_struct*); |
516 | int (*fault)(struct vm_area_struct*, struct vm_fault *); | 513 | int (*fault)(struct vm_area_struct*, struct vm_fault *); |
517 | struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *); | 514 | struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *); |
518 | int (*page_mkwrite)(struct vm_area_struct *, struct page *); | 515 | int (*page_mkwrite)(struct vm_area_struct *, struct page *); |
519 | 516 | ||
520 | locking rules: | 517 | locking rules: |
521 | BKL mmap_sem PageLocked(page) | 518 | BKL mmap_sem PageLocked(page) |
522 | open: no yes | 519 | open: no yes |
523 | close: no yes | 520 | close: no yes |
524 | fault: no yes | 521 | fault: no yes |
525 | nopage: no yes | 522 | nopage: no yes |
526 | page_mkwrite: no yes no | 523 | page_mkwrite: no yes no |
527 | 524 | ||
528 | ->page_mkwrite() is called when a previously read-only page is | 525 | ->page_mkwrite() is called when a previously read-only page is |
529 | about to become writeable. The file system is responsible for | 526 | about to become writeable. The file system is responsible for |
530 | protecting against truncate races. Once appropriate action has been | 527 | protecting against truncate races. Once appropriate action has been |
531 | taken to lock out truncate, the page range should be verified to be | 528 | taken to lock out truncate, the page range should be verified to be |
532 | within i_size. The page mapping should also be checked that it is not | 529 | within i_size. The page mapping should also be checked that it is not |
533 | NULL. | 530 | NULL. |
534 | 531 | ||
535 | ================================================================================ | 532 | ================================================================================ |
536 | Dubious stuff | 533 | Dubious stuff |
537 | 534 | ||
538 | (if you break something or notice that it is broken and do not fix it yourself | 535 | (if you break something or notice that it is broken and do not fix it yourself |
539 | - at least put it here) | 536 | - at least put it here) |
540 | 537 | ||
541 | ipc/shm.c::shm_delete() - may need BKL. | 538 | ipc/shm.c::shm_delete() - may need BKL. |
542 | ->read() and ->write() in many drivers are (probably) missing BKL. | 539 | ->read() and ->write() in many drivers are (probably) missing BKL. |
543 | drivers/sgi/char/graphics.c::sgi_graphics_nopage() - may need BKL. | 540 | drivers/sgi/char/graphics.c::sgi_graphics_nopage() - may need BKL. |
544 | 541 |
Documentation/filesystems/porting
1 | Changes since 2.5.0: | 1 | Changes since 2.5.0: |
2 | 2 | ||
3 | --- | 3 | --- |
4 | [recommended] | 4 | [recommended] |
5 | 5 | ||
6 | New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(), | 6 | New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(), |
7 | sb_set_blocksize() and sb_min_blocksize(). | 7 | sb_set_blocksize() and sb_min_blocksize(). |
8 | 8 | ||
9 | Use them. | 9 | Use them. |
10 | 10 | ||
11 | (sb_find_get_block() replaces 2.4's get_hash_table()) | 11 | (sb_find_get_block() replaces 2.4's get_hash_table()) |
12 | 12 | ||
13 | --- | 13 | --- |
14 | [recommended] | 14 | [recommended] |
15 | 15 | ||
16 | New methods: ->alloc_inode() and ->destroy_inode(). | 16 | New methods: ->alloc_inode() and ->destroy_inode(). |
17 | 17 | ||
18 | Remove inode->u.foo_inode_i | 18 | Remove inode->u.foo_inode_i |
19 | Declare | 19 | Declare |
20 | struct foo_inode_info { | 20 | struct foo_inode_info { |
21 | /* fs-private stuff */ | 21 | /* fs-private stuff */ |
22 | struct inode vfs_inode; | 22 | struct inode vfs_inode; |
23 | }; | 23 | }; |
24 | static inline struct foo_inode_info *FOO_I(struct inode *inode) | 24 | static inline struct foo_inode_info *FOO_I(struct inode *inode) |
25 | { | 25 | { |
26 | return list_entry(inode, struct foo_inode_info, vfs_inode); | 26 | return list_entry(inode, struct foo_inode_info, vfs_inode); |
27 | } | 27 | } |
28 | 28 | ||
29 | Use FOO_I(inode) instead of &inode->u.foo_inode_i; | 29 | Use FOO_I(inode) instead of &inode->u.foo_inode_i; |
30 | 30 | ||
31 | Add foo_alloc_inode() and foo_destroy_inode() - the former should allocate | 31 | Add foo_alloc_inode() and foo_destroy_inode() - the former should allocate |
32 | foo_inode_info and return the address of ->vfs_inode, the latter should free | 32 | foo_inode_info and return the address of ->vfs_inode, the latter should free |
33 | FOO_I(inode) (see in-tree filesystems for examples). | 33 | FOO_I(inode) (see in-tree filesystems for examples). |
34 | 34 | ||
35 | Make them ->alloc_inode and ->destroy_inode in your super_operations. | 35 | Make them ->alloc_inode and ->destroy_inode in your super_operations. |
36 | 36 | ||
37 | Keep in mind that now you need explicit initialization of private data - | 37 | Keep in mind that now you need explicit initialization of private data |
38 | typically in ->read_inode() and after getting an inode from new_inode(). | 38 | typically between calling iget_locked() and unlocking the inode. |
39 | 39 | ||
40 | At some point that will become mandatory. | 40 | At some point that will become mandatory. |
41 | 41 | ||
42 | --- | 42 | --- |
43 | [mandatory] | 43 | [mandatory] |
44 | 44 | ||
45 | Change of file_system_type method (->read_super to ->get_sb) | 45 | Change of file_system_type method (->read_super to ->get_sb) |
46 | 46 | ||
47 | ->read_super() is no more. Ditto for DECLARE_FSTYPE and DECLARE_FSTYPE_DEV. | 47 | ->read_super() is no more. Ditto for DECLARE_FSTYPE and DECLARE_FSTYPE_DEV. |
48 | 48 | ||
49 | Turn your foo_read_super() into a function that would return 0 in case of | 49 | Turn your foo_read_super() into a function that would return 0 in case of |
50 | success and negative number in case of error (-EINVAL unless you have more | 50 | success and negative number in case of error (-EINVAL unless you have more |
51 | informative error value to report). Call it foo_fill_super(). Now declare | 51 | informative error value to report). Call it foo_fill_super(). Now declare |
52 | 52 | ||
53 | int foo_get_sb(struct file_system_type *fs_type, | 53 | int foo_get_sb(struct file_system_type *fs_type, |
54 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) | 54 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) |
55 | { | 55 | { |
56 | return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super, | 56 | return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super, |
57 | mnt); | 57 | mnt); |
58 | } | 58 | } |
59 | 59 | ||
60 | (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of | 60 | (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of |
61 | filesystem). | 61 | filesystem). |
62 | 62 | ||
63 | Replace DECLARE_FSTYPE... with explicit initializer and have ->get_sb set as | 63 | Replace DECLARE_FSTYPE... with explicit initializer and have ->get_sb set as |
64 | foo_get_sb. | 64 | foo_get_sb. |
65 | 65 | ||
66 | --- | 66 | --- |
67 | [mandatory] | 67 | [mandatory] |
68 | 68 | ||
69 | Locking change: ->s_vfs_rename_sem is taken only by cross-directory renames. | 69 | Locking change: ->s_vfs_rename_sem is taken only by cross-directory renames. |
70 | Most likely there is no need to change anything, but if you relied on | 70 | Most likely there is no need to change anything, but if you relied on |
71 | global exclusion between renames for some internal purpose - you need to | 71 | global exclusion between renames for some internal purpose - you need to |
72 | change your internal locking. Otherwise exclusion warranties remain the | 72 | change your internal locking. Otherwise exclusion warranties remain the |
73 | same (i.e. parents and victim are locked, etc.). | 73 | same (i.e. parents and victim are locked, etc.). |
74 | 74 | ||
75 | --- | 75 | --- |
76 | [informational] | 76 | [informational] |
77 | 77 | ||
78 | Now we have the exclusion between ->lookup() and directory removal (by | 78 | Now we have the exclusion between ->lookup() and directory removal (by |
79 | ->rmdir() and ->rename()). If you used to need that exclusion and do | 79 | ->rmdir() and ->rename()). If you used to need that exclusion and do |
80 | it by internal locking (most of filesystems couldn't care less) - you | 80 | it by internal locking (most of filesystems couldn't care less) - you |
81 | can relax your locking. | 81 | can relax your locking. |
82 | 82 | ||
83 | --- | 83 | --- |
84 | [mandatory] | 84 | [mandatory] |
85 | 85 | ||
86 | ->lookup(), ->truncate(), ->create(), ->unlink(), ->mknod(), ->mkdir(), | 86 | ->lookup(), ->truncate(), ->create(), ->unlink(), ->mknod(), ->mkdir(), |
87 | ->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename() | 87 | ->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename() |
88 | and ->readdir() are called without BKL now. Grab it on entry, drop upon return | 88 | and ->readdir() are called without BKL now. Grab it on entry, drop upon return |
89 | - that will guarantee the same locking you used to have. If your method or its | 89 | - that will guarantee the same locking you used to have. If your method or its |
90 | parts do not need BKL - better yet, now you can shift lock_kernel() and | 90 | parts do not need BKL - better yet, now you can shift lock_kernel() and |
91 | unlock_kernel() so that they would protect exactly what needs to be | 91 | unlock_kernel() so that they would protect exactly what needs to be |
92 | protected. | 92 | protected. |
93 | 93 | ||
94 | --- | 94 | --- |
95 | [mandatory] | 95 | [mandatory] |
96 | 96 | ||
97 | BKL is also moved from around sb operations. ->write_super() is now called | 97 | BKL is also moved from around sb operations. ->write_super() is now called |
98 | without BKL held. BKL should have been shifted into individual fs sb_op | 98 | without BKL held. BKL should have been shifted into individual fs sb_op |
99 | functions. If you don't need it, remove it. | 99 | functions. If you don't need it, remove it. |
100 | 100 | ||
101 | --- | 101 | --- |
102 | [informational] | 102 | [informational] |
103 | 103 | ||
104 | check for ->link() target not being a directory is done by callers. Feel | 104 | check for ->link() target not being a directory is done by callers. Feel |
105 | free to drop it... | 105 | free to drop it... |
106 | 106 | ||
107 | --- | 107 | --- |
108 | [informational] | 108 | [informational] |
109 | 109 | ||
110 | ->link() callers hold ->i_mutex on the object we are linking to. Some of your | 110 | ->link() callers hold ->i_mutex on the object we are linking to. Some of your |
111 | problems might be over... | 111 | problems might be over... |
112 | 112 | ||
113 | --- | 113 | --- |
114 | [mandatory] | 114 | [mandatory] |
115 | 115 | ||
116 | new file_system_type method - kill_sb(superblock). If you are converting | 116 | new file_system_type method - kill_sb(superblock). If you are converting |
117 | an existing filesystem, set it according to ->fs_flags: | 117 | an existing filesystem, set it according to ->fs_flags: |
118 | FS_REQUIRES_DEV - kill_block_super | 118 | FS_REQUIRES_DEV - kill_block_super |
119 | FS_LITTER - kill_litter_super | 119 | FS_LITTER - kill_litter_super |
120 | neither - kill_anon_super | 120 | neither - kill_anon_super |
121 | FS_LITTER is gone - just remove it from fs_flags. | 121 | FS_LITTER is gone - just remove it from fs_flags. |
122 | 122 | ||
123 | --- | 123 | --- |
124 | [mandatory] | 124 | [mandatory] |
125 | 125 | ||
126 | FS_SINGLE is gone (actually, that had happened back when ->get_sb() | 126 | FS_SINGLE is gone (actually, that had happened back when ->get_sb() |
127 | went in - and hadn't been documented ;-/). Just remove it from fs_flags | 127 | went in - and hadn't been documented ;-/). Just remove it from fs_flags |
128 | (and see ->get_sb() entry for other actions). | 128 | (and see ->get_sb() entry for other actions). |
129 | 129 | ||
130 | --- | 130 | --- |
131 | [mandatory] | 131 | [mandatory] |
132 | 132 | ||
133 | ->setattr() is called without BKL now. Caller _always_ holds ->i_mutex, so | 133 | ->setattr() is called without BKL now. Caller _always_ holds ->i_mutex, so |
134 | watch for ->i_mutex-grabbing code that might be used by your ->setattr(). | 134 | watch for ->i_mutex-grabbing code that might be used by your ->setattr(). |
135 | Callers of notify_change() need ->i_mutex now. | 135 | Callers of notify_change() need ->i_mutex now. |
136 | 136 | ||
137 | --- | 137 | --- |
138 | [recommended] | 138 | [recommended] |
139 | 139 | ||
140 | New super_block field "struct export_operations *s_export_op" for | 140 | New super_block field "struct export_operations *s_export_op" for |
141 | explicit support for exporting, e.g. via NFS. The structure is fully | 141 | explicit support for exporting, e.g. via NFS. The structure is fully |
142 | documented at its declaration in include/linux/fs.h, and in | 142 | documented at its declaration in include/linux/fs.h, and in |
143 | Documentation/filesystems/Exporting. | 143 | Documentation/filesystems/Exporting. |
144 | 144 | ||
145 | Briefly it allows for the definition of decode_fh and encode_fh operations | 145 | Briefly it allows for the definition of decode_fh and encode_fh operations |
146 | to encode and decode filehandles, and allows the filesystem to use | 146 | to encode and decode filehandles, and allows the filesystem to use |
147 | a standard helper function for decode_fh, and provide file-system specific | 147 | a standard helper function for decode_fh, and provide file-system specific |
148 | support for this helper, particularly get_parent. | 148 | support for this helper, particularly get_parent. |
149 | 149 | ||
150 | It is planned that this will be required for exporting once the code | 150 | It is planned that this will be required for exporting once the code |
151 | settles down a bit. | 151 | settles down a bit. |
152 | 152 | ||
153 | [mandatory] | 153 | [mandatory] |
154 | 154 | ||
155 | s_export_op is now required for exporting a filesystem. | 155 | s_export_op is now required for exporting a filesystem. |
156 | isofs, ext2, ext3, reiserfs, fat | 156 | isofs, ext2, ext3, reiserfs, fat |
157 | can be used as examples of very different filesystems. | 157 | can be used as examples of very different filesystems. |
158 | 158 | ||
159 | --- | 159 | --- |
160 | [mandatory] | 160 | [mandatory] |
161 | 161 | ||
162 | iget4() and the read_inode2 callback have been superseded by iget5_locked() | 162 | iget4() and the read_inode2 callback have been superseded by iget5_locked() |
163 | which has the following prototype, | 163 | which has the following prototype, |
164 | 164 | ||
165 | struct inode *iget5_locked(struct super_block *sb, unsigned long ino, | 165 | struct inode *iget5_locked(struct super_block *sb, unsigned long ino, |
166 | int (*test)(struct inode *, void *), | 166 | int (*test)(struct inode *, void *), |
167 | int (*set)(struct inode *, void *), | 167 | int (*set)(struct inode *, void *), |
168 | void *data); | 168 | void *data); |
169 | 169 | ||
170 | 'test' is an additional function that can be used when the inode | 170 | 'test' is an additional function that can be used when the inode |
171 | number is not sufficient to identify the actual file object. 'set' | 171 | number is not sufficient to identify the actual file object. 'set' |
172 | should be a non-blocking function that initializes those parts of a | 172 | should be a non-blocking function that initializes those parts of a |
173 | newly created inode to allow the test function to succeed. 'data' is | 173 | newly created inode to allow the test function to succeed. 'data' is |
174 | passed as an opaque value to both test and set functions. | 174 | passed as an opaque value to both test and set functions. |
175 | 175 | ||
176 | When the inode has been created by iget5_locked(), it will be returned with | 176 | When the inode has been created by iget5_locked(), it will be returned with the |
177 | the I_NEW flag set and will still be locked. read_inode has not been | 177 | I_NEW flag set and will still be locked. The filesystem then needs to finalize |
178 | called so the file system still has to finalize the initialization. Once | 178 | the initialization. Once the inode is initialized it must be unlocked by |
179 | the inode is initialized it must be unlocked by calling unlock_new_inode(). | 179 | calling unlock_new_inode(). |
180 | 180 | ||
181 | The filesystem is responsible for setting (and possibly testing) i_ino | 181 | The filesystem is responsible for setting (and possibly testing) i_ino |
182 | when appropriate. There is also a simpler iget_locked function that | 182 | when appropriate. There is also a simpler iget_locked function that |
183 | just takes the superblock and inode number as arguments and does the | 183 | just takes the superblock and inode number as arguments and does the |
184 | test and set for you. | 184 | test and set for you. |
185 | 185 | ||
186 | e.g. | 186 | e.g. |
187 | inode = iget_locked(sb, ino); | 187 | inode = iget_locked(sb, ino); |
188 | if (inode->i_state & I_NEW) { | 188 | if (inode->i_state & I_NEW) { |
189 | err = read_inode_from_disk(inode); | 189 | err = read_inode_from_disk(inode); |
190 | if (err < 0) { | 190 | if (err < 0) { |
191 | iget_failed(inode); | 191 | iget_failed(inode); |
192 | return err; | 192 | return err; |
193 | } | 193 | } |
194 | unlock_new_inode(inode); | 194 | unlock_new_inode(inode); |
195 | } | 195 | } |
196 | 196 | ||
197 | Note that if the process of setting up a new inode fails, then iget_failed() | 197 | Note that if the process of setting up a new inode fails, then iget_failed() |
198 | should be called on the inode to render it dead, and an appropriate error | 198 | should be called on the inode to render it dead, and an appropriate error |
199 | should be passed back to the caller. | 199 | should be passed back to the caller. |
200 | 200 | ||
201 | --- | 201 | --- |
202 | [recommended] | 202 | [recommended] |
203 | 203 | ||
204 | ->getattr() finally getting used. See instances in nfs, minix, etc. | 204 | ->getattr() finally getting used. See instances in nfs, minix, etc. |
205 | 205 | ||
206 | --- | 206 | --- |
207 | [mandatory] | 207 | [mandatory] |
208 | 208 | ||
209 | ->revalidate() is gone. If your filesystem had it - provide ->getattr() | 209 | ->revalidate() is gone. If your filesystem had it - provide ->getattr() |
210 | and let it call whatever you had as ->revalidate() + (for symlinks that | 210 | and let it call whatever you had as ->revalidate() + (for symlinks that |
211 | had ->revalidate()) add calls in ->follow_link()/->readlink(). | 211 | had ->revalidate()) add calls in ->follow_link()/->readlink(). |
212 | 212 | ||
213 | --- | 213 | --- |
214 | [mandatory] | 214 | [mandatory] |
215 | 215 | ||
216 | ->d_parent changes are not protected by BKL anymore. Read access is safe | 216 | ->d_parent changes are not protected by BKL anymore. Read access is safe |
217 | if at least one of the following is true: | 217 | if at least one of the following is true: |
218 | * filesystem has no cross-directory rename() | 218 | * filesystem has no cross-directory rename() |
219 | * dcache_lock is held | 219 | * dcache_lock is held |
220 | * we know that parent had been locked (e.g. we are looking at | 220 | * we know that parent had been locked (e.g. we are looking at |
221 | ->d_parent of ->lookup() argument). | 221 | ->d_parent of ->lookup() argument). |
222 | * we are called from ->rename(). | 222 | * we are called from ->rename(). |
223 | * the child's ->d_lock is held | 223 | * the child's ->d_lock is held |
224 | Audit your code and add locking if needed. Notice that any place that is | 224 | Audit your code and add locking if needed. Notice that any place that is |
225 | not protected by the conditions above is risky even in the old tree - you | 225 | not protected by the conditions above is risky even in the old tree - you |
226 | had been relying on BKL and that's prone to screwups. Old tree had quite | 226 | had been relying on BKL and that's prone to screwups. Old tree had quite |
227 | a few holes of that kind - unprotected access to ->d_parent leading to | 227 | a few holes of that kind - unprotected access to ->d_parent leading to |
228 | anything from oops to silent memory corruption. | 228 | anything from oops to silent memory corruption. |
229 | 229 | ||
230 | --- | 230 | --- |
231 | [mandatory] | 231 | [mandatory] |
232 | 232 | ||
233 | FS_NOMOUNT is gone. If you use it - just set MS_NOUSER in flags | 233 | FS_NOMOUNT is gone. If you use it - just set MS_NOUSER in flags |
234 | (see rootfs for one kind of solution and bdev/socket/pipe for another). | 234 | (see rootfs for one kind of solution and bdev/socket/pipe for another). |
235 | 235 | ||
236 | --- | 236 | --- |
237 | [recommended] | 237 | [recommended] |
238 | 238 | ||
239 | Use bdev_read_only(bdev) instead of is_read_only(kdev). The latter | 239 | Use bdev_read_only(bdev) instead of is_read_only(kdev). The latter |
240 | is still alive, but only because of the mess in drivers/s390/block/dasd.c. | 240 | is still alive, but only because of the mess in drivers/s390/block/dasd.c. |
241 | As soon as it gets fixed is_read_only() will die. | 241 | As soon as it gets fixed is_read_only() will die. |
242 | 242 | ||
243 | --- | 243 | --- |
244 | [mandatory] | 244 | [mandatory] |
245 | 245 | ||
246 | ->permission() is called without BKL now. Grab it on entry, drop upon | 246 | ->permission() is called without BKL now. Grab it on entry, drop upon |
247 | return - that will guarantee the same locking you used to have. If | 247 | return - that will guarantee the same locking you used to have. If |
248 | your method or its parts do not need BKL - better yet, now you can | 248 | your method or its parts do not need BKL - better yet, now you can |
249 | shift lock_kernel() and unlock_kernel() so that they would protect | 249 | shift lock_kernel() and unlock_kernel() so that they would protect |
250 | exactly what needs to be protected. | 250 | exactly what needs to be protected. |
251 | 251 | ||
252 | --- | 252 | --- |
253 | [mandatory] | 253 | [mandatory] |
254 | 254 | ||
255 | ->statfs() is now called without BKL held. BKL should have been | 255 | ->statfs() is now called without BKL held. BKL should have been |
256 | shifted into individual fs sb_op functions where it's not clear that | 256 | shifted into individual fs sb_op functions where it's not clear that |
257 | it's safe to remove it. If you don't need it, remove it. | 257 | it's safe to remove it. If you don't need it, remove it. |
258 | 258 | ||
259 | --- | 259 | --- |
260 | [mandatory] | 260 | [mandatory] |
261 | 261 | ||
262 | is_read_only() is gone; use bdev_read_only() instead. | 262 | is_read_only() is gone; use bdev_read_only() instead. |
263 | 263 | ||
264 | --- | 264 | --- |
265 | [mandatory] | 265 | [mandatory] |
266 | 266 | ||
267 | destroy_buffers() is gone; use invalidate_bdev(). | 267 | destroy_buffers() is gone; use invalidate_bdev(). |
268 | 268 | ||
269 | --- | 269 | --- |
270 | [mandatory] | 270 | [mandatory] |
271 | 271 | ||
272 | fsync_dev() is gone; use fsync_bdev(). NOTE: lvm breakage is | 272 | fsync_dev() is gone; use fsync_bdev(). NOTE: lvm breakage is |
273 | deliberate; as soon as struct block_device * is propagated in a reasonable | 273 | deliberate; as soon as struct block_device * is propagated in a reasonable |
274 | way by that code fixing will become trivial; until then nothing can be | 274 | way by that code fixing will become trivial; until then nothing can be |
275 | done. | 275 | done. |
276 | 276 |
Documentation/filesystems/vfs.txt
1 | 1 | ||
2 | Overview of the Linux Virtual File System | 2 | Overview of the Linux Virtual File System |
3 | 3 | ||
4 | Original author: Richard Gooch <rgooch@atnf.csiro.au> | 4 | Original author: Richard Gooch <rgooch@atnf.csiro.au> |
5 | 5 | ||
6 | Last updated on June 24, 2007. | 6 | Last updated on June 24, 2007. |
7 | 7 | ||
8 | Copyright (C) 1999 Richard Gooch | 8 | Copyright (C) 1999 Richard Gooch |
9 | Copyright (C) 2005 Pekka Enberg | 9 | Copyright (C) 2005 Pekka Enberg |
10 | 10 | ||
11 | This file is released under the GPLv2. | 11 | This file is released under the GPLv2. |
12 | 12 | ||
13 | 13 | ||
14 | Introduction | 14 | Introduction |
15 | ============ | 15 | ============ |
16 | 16 | ||
17 | The Virtual File System (also known as the Virtual Filesystem Switch) | 17 | The Virtual File System (also known as the Virtual Filesystem Switch) |
18 | is the software layer in the kernel that provides the filesystem | 18 | is the software layer in the kernel that provides the filesystem |
19 | interface to userspace programs. It also provides an abstraction | 19 | interface to userspace programs. It also provides an abstraction |
20 | within the kernel which allows different filesystem implementations to | 20 | within the kernel which allows different filesystem implementations to |
21 | coexist. | 21 | coexist. |
22 | 22 | ||
23 | VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so | 23 | VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so |
24 | on are called from a process context. Filesystem locking is described | 24 | on are called from a process context. Filesystem locking is described |
25 | in the document Documentation/filesystems/Locking. | 25 | in the document Documentation/filesystems/Locking. |
26 | 26 | ||
27 | 27 | ||
28 | Directory Entry Cache (dcache) | 28 | Directory Entry Cache (dcache) |
29 | ------------------------------ | 29 | ------------------------------ |
30 | 30 | ||
31 | The VFS implements the open(2), stat(2), chmod(2), and similar system | 31 | The VFS implements the open(2), stat(2), chmod(2), and similar system |
32 | calls. The pathname argument that is passed to them is used by the VFS | 32 | calls. The pathname argument that is passed to them is used by the VFS |
33 | to search through the directory entry cache (also known as the dentry | 33 | to search through the directory entry cache (also known as the dentry |
34 | cache or dcache). This provides a very fast look-up mechanism to | 34 | cache or dcache). This provides a very fast look-up mechanism to |
35 | translate a pathname (filename) into a specific dentry. Dentries live | 35 | translate a pathname (filename) into a specific dentry. Dentries live |
36 | in RAM and are never saved to disc: they exist only for performance. | 36 | in RAM and are never saved to disc: they exist only for performance. |
37 | 37 | ||
38 | The dentry cache is meant to be a view into your entire filespace. As | 38 | The dentry cache is meant to be a view into your entire filespace. As |
39 | most computers cannot fit all dentries in the RAM at the same time, | 39 | most computers cannot fit all dentries in the RAM at the same time, |
40 | some bits of the cache are missing. In order to resolve your pathname | 40 | some bits of the cache are missing. In order to resolve your pathname |
41 | into a dentry, the VFS may have to resort to creating dentries along | 41 | into a dentry, the VFS may have to resort to creating dentries along |
42 | the way, and then loading the inode. This is done by looking up the | 42 | the way, and then loading the inode. This is done by looking up the |
43 | inode. | 43 | inode. |
44 | 44 | ||
45 | 45 | ||
46 | The Inode Object | 46 | The Inode Object |
47 | ---------------- | 47 | ---------------- |
48 | 48 | ||
49 | An individual dentry usually has a pointer to an inode. Inodes are | 49 | An individual dentry usually has a pointer to an inode. Inodes are |
50 | filesystem objects such as regular files, directories, FIFOs and other | 50 | filesystem objects such as regular files, directories, FIFOs and other |
51 | beasts. They live either on the disc (for block device filesystems) | 51 | beasts. They live either on the disc (for block device filesystems) |
52 | or in the memory (for pseudo filesystems). Inodes that live on the | 52 | or in the memory (for pseudo filesystems). Inodes that live on the |
53 | disc are copied into the memory when required and changes to the inode | 53 | disc are copied into the memory when required and changes to the inode |
54 | are written back to disc. A single inode can be pointed to by multiple | 54 | are written back to disc. A single inode can be pointed to by multiple |
55 | dentries (hard links, for example, do this). | 55 | dentries (hard links, for example, do this). |
56 | 56 | ||
57 | To look up an inode requires that the VFS calls the lookup() method of | 57 | To look up an inode requires that the VFS calls the lookup() method of |
58 | the parent directory inode. This method is installed by the specific | 58 | the parent directory inode. This method is installed by the specific |
59 | filesystem implementation that the inode lives in. Once the VFS has | 59 | filesystem implementation that the inode lives in. Once the VFS has |
60 | the required dentry (and hence the inode), we can do all those boring | 60 | the required dentry (and hence the inode), we can do all those boring |
61 | things like open(2) the file, or stat(2) it to peek at the inode | 61 | things like open(2) the file, or stat(2) it to peek at the inode |
62 | data. The stat(2) operation is fairly simple: once the VFS has the | 62 | data. The stat(2) operation is fairly simple: once the VFS has the |
63 | dentry, it peeks at the inode data and passes some of it back to | 63 | dentry, it peeks at the inode data and passes some of it back to |
64 | userspace. | 64 | userspace. |
65 | 65 | ||
66 | 66 | ||
67 | The File Object | 67 | The File Object |
68 | --------------- | 68 | --------------- |
69 | 69 | ||
70 | Opening a file requires another operation: allocation of a file | 70 | Opening a file requires another operation: allocation of a file |
71 | structure (this is the kernel-side implementation of file | 71 | structure (this is the kernel-side implementation of file |
72 | descriptors). The freshly allocated file structure is initialized with | 72 | descriptors). The freshly allocated file structure is initialized with |
73 | a pointer to the dentry and a set of file operation member functions. | 73 | a pointer to the dentry and a set of file operation member functions. |
74 | These are taken from the inode data. The open() file method is then | 74 | These are taken from the inode data. The open() file method is then |
75 | called so the specific filesystem implementation can do its work. You | 75 | called so the specific filesystem implementation can do its work. You |
76 | can see that this is another switch performed by the VFS. The file | 76 | can see that this is another switch performed by the VFS. The file |
77 | structure is placed into the file descriptor table for the process. | 77 | structure is placed into the file descriptor table for the process. |
78 | 78 | ||
79 | Reading, writing and closing files (and other assorted VFS operations) | 79 | Reading, writing and closing files (and other assorted VFS operations) |
80 | are done by using the userspace file descriptor to grab the appropriate | 80 | are done by using the userspace file descriptor to grab the appropriate |
81 | file structure, and then calling the required file structure method to | 81 | file structure, and then calling the required file structure method to |
82 | do whatever is required. For as long as the file is open, it keeps the | 82 | do whatever is required. For as long as the file is open, it keeps the |
83 | dentry in use, which in turn means that the VFS inode is still in use. | 83 | dentry in use, which in turn means that the VFS inode is still in use. |
84 | 84 | ||
85 | 85 | ||
86 | Registering and Mounting a Filesystem | 86 | Registering and Mounting a Filesystem |
87 | ===================================== | 87 | ===================================== |
88 | 88 | ||
89 | To register and unregister a filesystem, use the following API | 89 | To register and unregister a filesystem, use the following API |
90 | functions: | 90 | functions: |
91 | 91 | ||
92 | #include <linux/fs.h> | 92 | #include <linux/fs.h> |
93 | 93 | ||
94 | extern int register_filesystem(struct file_system_type *); | 94 | extern int register_filesystem(struct file_system_type *); |
95 | extern int unregister_filesystem(struct file_system_type *); | 95 | extern int unregister_filesystem(struct file_system_type *); |
96 | 96 | ||
97 | The passed struct file_system_type describes your filesystem. When a | 97 | The passed struct file_system_type describes your filesystem. When a |
98 | request is made to mount a device onto a directory in your filespace, | 98 | request is made to mount a device onto a directory in your filespace, |
99 | the VFS will call the appropriate get_sb() method for the specific | 99 | the VFS will call the appropriate get_sb() method for the specific |
100 | filesystem. The dentry for the mount point will then be updated to | 100 | filesystem. The dentry for the mount point will then be updated to |
101 | point to the root inode for the new filesystem. | 101 | point to the root inode for the new filesystem. |
102 | 102 | ||
103 | You can see all filesystems that are registered to the kernel in the | 103 | You can see all filesystems that are registered to the kernel in the |
104 | file /proc/filesystems. | 104 | file /proc/filesystems. |
105 | 105 | ||
106 | 106 | ||
107 | struct file_system_type | 107 | struct file_system_type |
108 | ----------------------- | 108 | ----------------------- |
109 | 109 | ||
110 | This describes the filesystem. As of kernel 2.6.22, the following | 110 | This describes the filesystem. As of kernel 2.6.22, the following |
111 | members are defined: | 111 | members are defined: |
112 | 112 | ||
113 | struct file_system_type { | 113 | struct file_system_type { |
114 | const char *name; | 114 | const char *name; |
115 | int fs_flags; | 115 | int fs_flags; |
116 | int (*get_sb) (struct file_system_type *, int, | 116 | int (*get_sb) (struct file_system_type *, int, |
117 | const char *, void *, struct vfsmount *); | 117 | const char *, void *, struct vfsmount *); |
118 | void (*kill_sb) (struct super_block *); | 118 | void (*kill_sb) (struct super_block *); |
119 | struct module *owner; | 119 | struct module *owner; |
120 | struct file_system_type * next; | 120 | struct file_system_type * next; |
121 | struct list_head fs_supers; | 121 | struct list_head fs_supers; |
122 | struct lock_class_key s_lock_key; | 122 | struct lock_class_key s_lock_key; |
123 | struct lock_class_key s_umount_key; | 123 | struct lock_class_key s_umount_key; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | name: the name of the filesystem type, such as "ext2", "iso9660", | 126 | name: the name of the filesystem type, such as "ext2", "iso9660", |
127 | "msdos" and so on | 127 | "msdos" and so on |
128 | 128 | ||
129 | fs_flags: various flags (e.g. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) | 129 | fs_flags: various flags (e.g. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) |
130 | 130 | ||
131 | get_sb: the method to call when a new instance of this | 131 | get_sb: the method to call when a new instance of this |
132 | filesystem should be mounted | 132 | filesystem should be mounted |
133 | 133 | ||
134 | kill_sb: the method to call when an instance of this filesystem | 134 | kill_sb: the method to call when an instance of this filesystem |
135 | should be unmounted | 135 | should be unmounted |
136 | 136 | ||
137 | owner: for internal VFS use: you should initialize this to THIS_MODULE in | 137 | owner: for internal VFS use: you should initialize this to THIS_MODULE in |
138 | most cases. | 138 | most cases. |
139 | 139 | ||
140 | next: for internal VFS use: you should initialize this to NULL | 140 | next: for internal VFS use: you should initialize this to NULL |
141 | 141 | ||
142 | s_lock_key, s_umount_key: lockdep-specific | 142 | s_lock_key, s_umount_key: lockdep-specific |
143 | 143 | ||
144 | The get_sb() method has the following arguments: | 144 | The get_sb() method has the following arguments: |
145 | 145 | ||
146 | struct file_system_type *fs_type: describes the filesystem, partly initialized | 146 | struct file_system_type *fs_type: describes the filesystem, partly initialized |
147 | by the specific filesystem code | 147 | by the specific filesystem code |
148 | 148 | ||
149 | int flags: mount flags | 149 | int flags: mount flags |
150 | 150 | ||
151 | const char *dev_name: the device name we are mounting. | 151 | const char *dev_name: the device name we are mounting. |
152 | 152 | ||
153 | void *data: arbitrary mount options, usually comes as an ASCII | 153 | void *data: arbitrary mount options, usually comes as an ASCII |
154 | string | 154 | string |
155 | 155 | ||
156 | struct vfsmount *mnt: a vfs-internal representation of a mount point | 156 | struct vfsmount *mnt: a vfs-internal representation of a mount point |
157 | 157 | ||
158 | The get_sb() method must determine if the block device specified | 158 | The get_sb() method must determine if the block device specified |
159 | in the dev_name and fs_type contains a filesystem of the type the method | 159 | in the dev_name and fs_type contains a filesystem of the type the method |
160 | supports. If it succeeds in opening the named block device, it initializes a | 160 | supports. If it succeeds in opening the named block device, it initializes a |
161 | struct super_block descriptor for the filesystem contained by the block device. | 161 | struct super_block descriptor for the filesystem contained by the block device. |
162 | On failure it returns an error. | 162 | On failure it returns an error. |
163 | 163 | ||
164 | The most interesting member of the superblock structure that the | 164 | The most interesting member of the superblock structure that the |
165 | get_sb() method fills in is the "s_op" field. This is a pointer to | 165 | get_sb() method fills in is the "s_op" field. This is a pointer to |
166 | a "struct super_operations" which describes the next level of the | 166 | a "struct super_operations" which describes the next level of the |
167 | filesystem implementation. | 167 | filesystem implementation. |
168 | 168 | ||
169 | Usually, a filesystem uses one of the generic get_sb() implementations | 169 | Usually, a filesystem uses one of the generic get_sb() implementations |
170 | and provides a fill_super() method instead. The generic methods are: | 170 | and provides a fill_super() method instead. The generic methods are: |
171 | 171 | ||
172 | get_sb_bdev: mount a filesystem residing on a block device | 172 | get_sb_bdev: mount a filesystem residing on a block device |
173 | 173 | ||
174 | get_sb_nodev: mount a filesystem that is not backed by a device | 174 | get_sb_nodev: mount a filesystem that is not backed by a device |
175 | 175 | ||
176 | get_sb_single: mount a filesystem which shares the instance between | 176 | get_sb_single: mount a filesystem which shares the instance between |
177 | all mounts | 177 | all mounts |
178 | 178 | ||
179 | A fill_super() method implementation has the following arguments: | 179 | A fill_super() method implementation has the following arguments: |
180 | 180 | ||
181 | struct super_block *sb: the superblock structure. The method fill_super() | 181 | struct super_block *sb: the superblock structure. The method fill_super() |
182 | must initialize this properly. | 182 | must initialize this properly. |
183 | 183 | ||
184 | void *data: arbitrary mount options, usually comes as an ASCII | 184 | void *data: arbitrary mount options, usually comes as an ASCII |
185 | string | 185 | string |
186 | 186 | ||
187 | int silent: whether or not to be silent on error | 187 | int silent: whether or not to be silent on error |
188 | 188 | ||
189 | 189 | ||
190 | The Superblock Object | 190 | The Superblock Object |
191 | ===================== | 191 | ===================== |
192 | 192 | ||
193 | A superblock object represents a mounted filesystem. | 193 | A superblock object represents a mounted filesystem. |
194 | 194 | ||
195 | 195 | ||
196 | struct super_operations | 196 | struct super_operations |
197 | ----------------------- | 197 | ----------------------- |
198 | 198 | ||
199 | This describes how the VFS can manipulate the superblock of your | 199 | This describes how the VFS can manipulate the superblock of your |
200 | filesystem. As of kernel 2.6.22, the following members are defined: | 200 | filesystem. As of kernel 2.6.22, the following members are defined: |
201 | 201 | ||
202 | struct super_operations { | 202 | struct super_operations { |
203 | struct inode *(*alloc_inode)(struct super_block *sb); | 203 | struct inode *(*alloc_inode)(struct super_block *sb); |
204 | void (*destroy_inode)(struct inode *); | 204 | void (*destroy_inode)(struct inode *); |
205 | 205 | ||
206 | void (*read_inode) (struct inode *); | ||
207 | |||
208 | void (*dirty_inode) (struct inode *); | 206 | void (*dirty_inode) (struct inode *); |
209 | int (*write_inode) (struct inode *, int); | 207 | int (*write_inode) (struct inode *, int); |
210 | void (*put_inode) (struct inode *); | 208 | void (*put_inode) (struct inode *); |
211 | void (*drop_inode) (struct inode *); | 209 | void (*drop_inode) (struct inode *); |
212 | void (*delete_inode) (struct inode *); | 210 | void (*delete_inode) (struct inode *); |
213 | void (*put_super) (struct super_block *); | 211 | void (*put_super) (struct super_block *); |
214 | void (*write_super) (struct super_block *); | 212 | void (*write_super) (struct super_block *); |
215 | int (*sync_fs)(struct super_block *sb, int wait); | 213 | int (*sync_fs)(struct super_block *sb, int wait); |
216 | void (*write_super_lockfs) (struct super_block *); | 214 | void (*write_super_lockfs) (struct super_block *); |
217 | void (*unlockfs) (struct super_block *); | 215 | void (*unlockfs) (struct super_block *); |
218 | int (*statfs) (struct dentry *, struct kstatfs *); | 216 | int (*statfs) (struct dentry *, struct kstatfs *); |
219 | int (*remount_fs) (struct super_block *, int *, char *); | 217 | int (*remount_fs) (struct super_block *, int *, char *); |
220 | void (*clear_inode) (struct inode *); | 218 | void (*clear_inode) (struct inode *); |
221 | void (*umount_begin) (struct super_block *); | 219 | void (*umount_begin) (struct super_block *); |
222 | 220 | ||
223 | int (*show_options)(struct seq_file *, struct vfsmount *); | 221 | int (*show_options)(struct seq_file *, struct vfsmount *); |
224 | 222 | ||
225 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); | 223 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); |
226 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); | 224 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); |
227 | }; | 225 | }; |
228 | 226 | ||
229 | All methods are called without any locks being held, unless otherwise | 227 | All methods are called without any locks being held, unless otherwise |
230 | noted. This means that most methods can block safely. All methods are | 228 | noted. This means that most methods can block safely. All methods are |
231 | only called from a process context (i.e. not from an interrupt handler | 229 | only called from a process context (i.e. not from an interrupt handler |
232 | or bottom half). | 230 | or bottom half). |
233 | 231 | ||
234 | alloc_inode: this method is called by inode_alloc() to allocate memory | 232 | alloc_inode: this method is called by inode_alloc() to allocate memory |
235 | for struct inode and initialize it. If this function is not | 233 | for struct inode and initialize it. If this function is not |
236 | defined, a simple 'struct inode' is allocated. Normally | 234 | defined, a simple 'struct inode' is allocated. Normally |
237 | alloc_inode will be used to allocate a larger structure which | 235 | alloc_inode will be used to allocate a larger structure which |
238 | contains a 'struct inode' embedded within it. | 236 | contains a 'struct inode' embedded within it. |
239 | 237 | ||
240 | destroy_inode: this method is called by destroy_inode() to release | 238 | destroy_inode: this method is called by destroy_inode() to release |
241 | resources allocated for struct inode. It is only required if | 239 | resources allocated for struct inode. It is only required if |
242 | ->alloc_inode was defined and simply undoes anything done by | 240 | ->alloc_inode was defined and simply undoes anything done by |
243 | ->alloc_inode. | 241 | ->alloc_inode. |
244 | 242 | ||
245 | read_inode: this method is called to read a specific inode from the | ||
246 | mounted filesystem. The i_ino member in the struct inode is | ||
247 | initialized by the VFS to indicate which inode to read. Other | ||
248 | members are filled in by this method. | ||
249 | |||
250 | You can set this to NULL and use iget5_locked() instead of iget() | ||
251 | to read inodes. This is necessary for filesystems for which the | ||
252 | inode number is not sufficient to identify an inode. | ||
253 | |||
254 | dirty_inode: this method is called by the VFS to mark an inode dirty. | 243 | dirty_inode: this method is called by the VFS to mark an inode dirty. |
255 | 244 | ||
256 | write_inode: this method is called when the VFS needs to write an | 245 | write_inode: this method is called when the VFS needs to write an |
257 | inode to disc. The second parameter indicates whether the write | 246 | inode to disc. The second parameter indicates whether the write |
258 | should be synchronous or not; not all filesystems check this flag. | 247 | should be synchronous or not; not all filesystems check this flag. |
259 | 248 | ||
260 | put_inode: called when the VFS inode is removed from the inode | 249 | put_inode: called when the VFS inode is removed from the inode |
261 | cache. | 250 | cache. |
262 | 251 | ||
263 | drop_inode: called when the last access to the inode is dropped, | 252 | drop_inode: called when the last access to the inode is dropped, |
264 | with the inode_lock spinlock held. | 253 | with the inode_lock spinlock held. |
265 | 254 | ||
266 | This method should be either NULL (normal UNIX filesystem | 255 | This method should be either NULL (normal UNIX filesystem |
267 | semantics) or "generic_delete_inode" (for filesystems that do not | 256 | semantics) or "generic_delete_inode" (for filesystems that do not |
268 | want to cache inodes - causing "delete_inode" to always be | 257 | want to cache inodes - causing "delete_inode" to always be |
269 | called regardless of the value of i_nlink) | 258 | called regardless of the value of i_nlink) |
270 | 259 | ||
271 | The "generic_delete_inode()" behavior is equivalent to the | 260 | The "generic_delete_inode()" behavior is equivalent to the |
272 | old practice of using "force_delete" in the put_inode() case, | 261 | old practice of using "force_delete" in the put_inode() case, |
273 | but does not have the races that the "force_delete()" approach | 262 | but does not have the races that the "force_delete()" approach |
274 | had. | 263 | had. |
275 | 264 | ||
276 | delete_inode: called when the VFS wants to delete an inode | 265 | delete_inode: called when the VFS wants to delete an inode |
277 | 266 | ||
278 | put_super: called when the VFS wishes to free the superblock | 267 | put_super: called when the VFS wishes to free the superblock |
279 | (i.e. unmount). This is called with the superblock lock held | 268 | (i.e. unmount). This is called with the superblock lock held |
280 | 269 | ||
281 | write_super: called when the VFS superblock needs to be written to | 270 | write_super: called when the VFS superblock needs to be written to |
282 | disc. This method is optional | 271 | disc. This method is optional |
283 | 272 | ||
284 | sync_fs: called when VFS is writing out all dirty data associated with | 273 | sync_fs: called when VFS is writing out all dirty data associated with |
285 | a superblock. The second parameter indicates whether the method | 274 | a superblock. The second parameter indicates whether the method |
286 | should wait until the write out has been completed. Optional. | 275 | should wait until the write out has been completed. Optional. |
287 | 276 | ||
288 | write_super_lockfs: called when VFS is locking a filesystem and | 277 | write_super_lockfs: called when VFS is locking a filesystem and |
289 | forcing it into a consistent state. This method is currently | 278 | forcing it into a consistent state. This method is currently |
290 | used by the Logical Volume Manager (LVM). | 279 | used by the Logical Volume Manager (LVM). |
291 | 280 | ||
292 | unlockfs: called when VFS is unlocking a filesystem and making it writable | 281 | unlockfs: called when VFS is unlocking a filesystem and making it writable |
293 | again. | 282 | again. |
294 | 283 | ||
295 | statfs: called when the VFS needs to get filesystem statistics. This | 284 | statfs: called when the VFS needs to get filesystem statistics. This |
296 | is called with the kernel lock held | 285 | is called with the kernel lock held |
297 | 286 | ||
298 | remount_fs: called when the filesystem is remounted. This is called | 287 | remount_fs: called when the filesystem is remounted. This is called |
299 | with the kernel lock held | 288 | with the kernel lock held |
300 | 289 | ||
301 | clear_inode: called when the VFS clears the inode. Optional | 290 | clear_inode: called when the VFS clears the inode. Optional |
302 | 291 | ||
303 | umount_begin: called when the VFS is unmounting a filesystem. | 292 | umount_begin: called when the VFS is unmounting a filesystem. |
304 | 293 | ||
305 | show_options: called by the VFS to show mount options for /proc/<pid>/mounts. | 294 | show_options: called by the VFS to show mount options for /proc/<pid>/mounts. |
306 | 295 | ||
307 | quota_read: called by the VFS to read from filesystem quota file. | 296 | quota_read: called by the VFS to read from filesystem quota file. |
308 | 297 | ||
309 | quota_write: called by the VFS to write to filesystem quota file. | 298 | quota_write: called by the VFS to write to filesystem quota file. |
310 | 299 | ||
311 | The read_inode() method is responsible for filling in the "i_op" | 300 | Whoever sets up the inode is responsible for filling in the "i_op" field. This |
312 | field. This is a pointer to a "struct inode_operations" which | 301 | is a pointer to a "struct inode_operations" which describes the methods that |
313 | describes the methods that can be performed on individual inodes. | 302 | can be performed on individual inodes. |
314 | 303 | ||
315 | 304 | ||
316 | The Inode Object | 305 | The Inode Object |
317 | ================ | 306 | ================ |
318 | 307 | ||
319 | An inode object represents an object within the filesystem. | 308 | An inode object represents an object within the filesystem. |
320 | 309 | ||
321 | 310 | ||
322 | struct inode_operations | 311 | struct inode_operations |
323 | ----------------------- | 312 | ----------------------- |
324 | 313 | ||
325 | This describes how the VFS can manipulate an inode in your | 314 | This describes how the VFS can manipulate an inode in your |
326 | filesystem. As of kernel 2.6.22, the following members are defined: | 315 | filesystem. As of kernel 2.6.22, the following members are defined: |
327 | 316 | ||
328 | struct inode_operations { | 317 | struct inode_operations { |
329 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); | 318 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); |
330 | struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); | 319 | struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); |
331 | int (*link) (struct dentry *,struct inode *,struct dentry *); | 320 | int (*link) (struct dentry *,struct inode *,struct dentry *); |
332 | int (*unlink) (struct inode *,struct dentry *); | 321 | int (*unlink) (struct inode *,struct dentry *); |
333 | int (*symlink) (struct inode *,struct dentry *,const char *); | 322 | int (*symlink) (struct inode *,struct dentry *,const char *); |
334 | int (*mkdir) (struct inode *,struct dentry *,int); | 323 | int (*mkdir) (struct inode *,struct dentry *,int); |
335 | int (*rmdir) (struct inode *,struct dentry *); | 324 | int (*rmdir) (struct inode *,struct dentry *); |
336 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); | 325 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); |
337 | int (*rename) (struct inode *, struct dentry *, | 326 | int (*rename) (struct inode *, struct dentry *, |
338 | struct inode *, struct dentry *); | 327 | struct inode *, struct dentry *); |
339 | int (*readlink) (struct dentry *, char __user *,int); | 328 | int (*readlink) (struct dentry *, char __user *,int); |
340 | void * (*follow_link) (struct dentry *, struct nameidata *); | 329 | void * (*follow_link) (struct dentry *, struct nameidata *); |
341 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 330 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
342 | void (*truncate) (struct inode *); | 331 | void (*truncate) (struct inode *); |
343 | int (*permission) (struct inode *, int, struct nameidata *); | 332 | int (*permission) (struct inode *, int, struct nameidata *); |
344 | int (*setattr) (struct dentry *, struct iattr *); | 333 | int (*setattr) (struct dentry *, struct iattr *); |
345 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); | 334 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); |
346 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 335 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
347 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); | 336 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); |
348 | ssize_t (*listxattr) (struct dentry *, char *, size_t); | 337 | ssize_t (*listxattr) (struct dentry *, char *, size_t); |
349 | int (*removexattr) (struct dentry *, const char *); | 338 | int (*removexattr) (struct dentry *, const char *); |
350 | void (*truncate_range)(struct inode *, loff_t, loff_t); | 339 | void (*truncate_range)(struct inode *, loff_t, loff_t); |
351 | }; | 340 | }; |
352 | 341 | ||
353 | Again, all methods are called without any locks being held, unless | 342 | Again, all methods are called without any locks being held, unless |
354 | otherwise noted. | 343 | otherwise noted. |
355 | 344 | ||
356 | create: called by the open(2) and creat(2) system calls. Only | 345 | create: called by the open(2) and creat(2) system calls. Only |
357 | required if you want to support regular files. The dentry you | 346 | required if you want to support regular files. The dentry you |
358 | get should not have an inode (i.e. it should be a negative | 347 | get should not have an inode (i.e. it should be a negative |
359 | dentry). Here you will probably call d_instantiate() with the | 348 | dentry). Here you will probably call d_instantiate() with the |
360 | dentry and the newly created inode | 349 | dentry and the newly created inode |
361 | 350 | ||
362 | lookup: called when the VFS needs to look up an inode in a parent | 351 | lookup: called when the VFS needs to look up an inode in a parent |
363 | directory. The name to look for is found in the dentry. This | 352 | directory. The name to look for is found in the dentry. This |
364 | method must call d_add() to insert the found inode into the | 353 | method must call d_add() to insert the found inode into the |
365 | dentry. The "i_count" field in the inode structure should be | 354 | dentry. The "i_count" field in the inode structure should be |
366 | incremented. If the named inode does not exist a NULL inode | 355 | incremented. If the named inode does not exist a NULL inode |
367 | should be inserted into the dentry (this is called a negative | 356 | should be inserted into the dentry (this is called a negative |
368 | dentry). Returning an error code from this routine must only | 357 | dentry). Returning an error code from this routine must only |
369 | be done on a real error, otherwise creating inodes with system | 358 | be done on a real error, otherwise creating inodes with system |
370 | calls like creat(2), mknod(2), mkdir(2) and so on will fail. | 359 | calls like creat(2), mknod(2), mkdir(2) and so on will fail. |
371 | If you wish to overload the dentry methods then you should | 360 | If you wish to overload the dentry methods then you should |
372 | initialise the "d_dop" field in the dentry; this is a pointer | 361 | initialise the "d_dop" field in the dentry; this is a pointer |
373 | to a struct "dentry_operations". | 362 | to a struct "dentry_operations". |
374 | This method is called with the directory inode semaphore held | 363 | This method is called with the directory inode semaphore held |
375 | 364 | ||
376 | link: called by the link(2) system call. Only required if you want | 365 | link: called by the link(2) system call. Only required if you want |
377 | to support hard links. You will probably need to call | 366 | to support hard links. You will probably need to call |
378 | d_instantiate() just as you would in the create() method | 367 | d_instantiate() just as you would in the create() method |
379 | 368 | ||
380 | unlink: called by the unlink(2) system call. Only required if you | 369 | unlink: called by the unlink(2) system call. Only required if you |
381 | want to support deleting inodes | 370 | want to support deleting inodes |
382 | 371 | ||
383 | symlink: called by the symlink(2) system call. Only required if you | 372 | symlink: called by the symlink(2) system call. Only required if you |
384 | want to support symlinks. You will probably need to call | 373 | want to support symlinks. You will probably need to call |
385 | d_instantiate() just as you would in the create() method | 374 | d_instantiate() just as you would in the create() method |
386 | 375 | ||
387 | mkdir: called by the mkdir(2) system call. Only required if you want | 376 | mkdir: called by the mkdir(2) system call. Only required if you want |
388 | to support creating subdirectories. You will probably need to | 377 | to support creating subdirectories. You will probably need to |
389 | call d_instantiate() just as you would in the create() method | 378 | call d_instantiate() just as you would in the create() method |
390 | 379 | ||
391 | rmdir: called by the rmdir(2) system call. Only required if you want | 380 | rmdir: called by the rmdir(2) system call. Only required if you want |
392 | to support deleting subdirectories | 381 | to support deleting subdirectories |
393 | 382 | ||
394 | mknod: called by the mknod(2) system call to create a device (char, | 383 | mknod: called by the mknod(2) system call to create a device (char, |
395 | block) inode or a named pipe (FIFO) or socket. Only required | 384 | block) inode or a named pipe (FIFO) or socket. Only required |
396 | if you want to support creating these types of inodes. You | 385 | if you want to support creating these types of inodes. You |
397 | will probably need to call d_instantiate() just as you would | 386 | will probably need to call d_instantiate() just as you would |
398 | in the create() method | 387 | in the create() method |
399 | 388 | ||
400 | rename: called by the rename(2) system call to rename the object to | 389 | rename: called by the rename(2) system call to rename the object to |
401 | have the parent and name given by the second inode and dentry. | 390 | have the parent and name given by the second inode and dentry. |
402 | 391 | ||
403 | readlink: called by the readlink(2) system call. Only required if | 392 | readlink: called by the readlink(2) system call. Only required if |
404 | you want to support reading symbolic links | 393 | you want to support reading symbolic links |
405 | 394 | ||
406 | follow_link: called by the VFS to follow a symbolic link to the | 395 | follow_link: called by the VFS to follow a symbolic link to the |
407 | inode it points to. Only required if you want to support | 396 | inode it points to. Only required if you want to support |
408 | symbolic links. This method returns a void pointer cookie | 397 | symbolic links. This method returns a void pointer cookie |
409 | that is passed to put_link(). | 398 | that is passed to put_link(). |
410 | 399 | ||
411 | put_link: called by the VFS to release resources allocated by | 400 | put_link: called by the VFS to release resources allocated by |
412 | follow_link(). The cookie returned by follow_link() is passed | 401 | follow_link(). The cookie returned by follow_link() is passed |
413 | to this method as the last parameter. It is used by | 402 | to this method as the last parameter. It is used by |
414 | filesystems such as NFS where page cache is not stable | 403 | filesystems such as NFS where page cache is not stable |
415 | (i.e. page that was installed when the symbolic link walk | 404 | (i.e. page that was installed when the symbolic link walk |
416 | started might not be in the page cache at the end of the | 405 | started might not be in the page cache at the end of the |
417 | walk). | 406 | walk). |
418 | 407 | ||
419 | truncate: called by the VFS to change the size of a file. The | 408 | truncate: called by the VFS to change the size of a file. The |
420 | i_size field of the inode is set to the desired size by the | 409 | i_size field of the inode is set to the desired size by the |
421 | VFS before this method is called. This method is called by | 410 | VFS before this method is called. This method is called by |
422 | the truncate(2) system call and related functionality. | 411 | the truncate(2) system call and related functionality. |
423 | 412 | ||
424 | permission: called by the VFS to check for access rights on a POSIX-like | 413 | permission: called by the VFS to check for access rights on a POSIX-like |
425 | filesystem. | 414 | filesystem. |
426 | 415 | ||
427 | setattr: called by the VFS to set attributes for a file. This method | 416 | setattr: called by the VFS to set attributes for a file. This method |
428 | is called by chmod(2) and related system calls. | 417 | is called by chmod(2) and related system calls. |
429 | 418 | ||
430 | getattr: called by the VFS to get attributes of a file. This method | 419 | getattr: called by the VFS to get attributes of a file. This method |
431 | is called by stat(2) and related system calls. | 420 | is called by stat(2) and related system calls. |
432 | 421 | ||
433 | setxattr: called by the VFS to set an extended attribute for a file. | 422 | setxattr: called by the VFS to set an extended attribute for a file. |
434 | Extended attribute is a name:value pair associated with an | 423 | Extended attribute is a name:value pair associated with an |
435 | inode. This method is called by setxattr(2) system call. | 424 | inode. This method is called by setxattr(2) system call. |
436 | 425 | ||
437 | getxattr: called by the VFS to retrieve the value of an extended | 426 | getxattr: called by the VFS to retrieve the value of an extended |
438 | attribute name. This method is called by getxattr(2) system | 427 | attribute name. This method is called by getxattr(2) system |
439 | call. | 428 | call. |
440 | 429 | ||
441 | listxattr: called by the VFS to list all extended attributes for a | 430 | listxattr: called by the VFS to list all extended attributes for a |
442 | given file. This method is called by listxattr(2) system call. | 431 | given file. This method is called by listxattr(2) system call. |
443 | 432 | ||
444 | removexattr: called by the VFS to remove an extended attribute from | 433 | removexattr: called by the VFS to remove an extended attribute from |
445 | a file. This method is called by removexattr(2) system call. | 434 | a file. This method is called by removexattr(2) system call. |
446 | 435 | ||
447 | truncate_range: a method provided by the underlying filesystem to truncate a | 436 | truncate_range: a method provided by the underlying filesystem to truncate a |
448 | range of blocks, i.e. punch a hole somewhere in a file. | 437 | range of blocks, i.e. punch a hole somewhere in a file. |
449 | 438 | ||
450 | 439 | ||
451 | The Address Space Object | 440 | The Address Space Object |
452 | ======================== | 441 | ======================== |
453 | 442 | ||
454 | The address space object is used to group and manage pages in the page | 443 | The address space object is used to group and manage pages in the page |
455 | cache. It can be used to keep track of the pages in a file (or | 444 | cache. It can be used to keep track of the pages in a file (or |
456 | anything else) and also track the mapping of sections of the file into | 445 | anything else) and also track the mapping of sections of the file into |
457 | process address spaces. | 446 | process address spaces. |
458 | 447 | ||
459 | There are a number of distinct yet related services that an | 448 | There are a number of distinct yet related services that an |
460 | address-space can provide. These include communicating memory | 449 | address-space can provide. These include communicating memory |
461 | pressure, page lookup by address, and keeping track of pages tagged as | 450 | pressure, page lookup by address, and keeping track of pages tagged as |
462 | Dirty or Writeback. | 451 | Dirty or Writeback. |
463 | 452 | ||
464 | The first can be used independently of the others. The VM can try to | 453 | The first can be used independently of the others. The VM can try to |
465 | either write dirty pages in order to clean them, or release clean | 454 | either write dirty pages in order to clean them, or release clean |
466 | pages in order to reuse them. To do this it can call the ->writepage | 455 | pages in order to reuse them. To do this it can call the ->writepage |
467 | method on dirty pages, and ->releasepage on clean pages with | 456 | method on dirty pages, and ->releasepage on clean pages with |
468 | PagePrivate set. Clean pages without PagePrivate and with no external | 457 | PagePrivate set. Clean pages without PagePrivate and with no external |
469 | references will be released without notice being given to the | 458 | references will be released without notice being given to the |
470 | address_space. | 459 | address_space. |
471 | 460 | ||
472 | To achieve this functionality, pages need to be placed on an LRU with | 461 | To achieve this functionality, pages need to be placed on an LRU with |
473 | lru_cache_add and mark_page_active needs to be called whenever the | 462 | lru_cache_add and mark_page_active needs to be called whenever the |
474 | page is used. | 463 | page is used. |
475 | 464 | ||
476 | Pages are normally kept in a radix tree indexed by ->index. This tree | 465 | Pages are normally kept in a radix tree indexed by ->index. This tree |
477 | maintains information about the PG_Dirty and PG_Writeback status of | 466 | maintains information about the PG_Dirty and PG_Writeback status of |
478 | each page, so that pages with either of these flags can be found | 467 | each page, so that pages with either of these flags can be found |
479 | quickly. | 468 | quickly. |
480 | 469 | ||
481 | The Dirty tag is primarily used by mpage_writepages - the default | 470 | The Dirty tag is primarily used by mpage_writepages - the default |
482 | ->writepages method. It uses the tag to find dirty pages to call | 471 | ->writepages method. It uses the tag to find dirty pages to call |
483 | ->writepage on. If mpage_writepages is not used (i.e. the address | 472 | ->writepage on. If mpage_writepages is not used (i.e. the address |
484 | provides its own ->writepages), the PAGECACHE_TAG_DIRTY tag is | 473 | provides its own ->writepages), the PAGECACHE_TAG_DIRTY tag is |
485 | almost unused. write_inode_now and sync_inode do use it (through | 474 | almost unused. write_inode_now and sync_inode do use it (through |
486 | __sync_single_inode) to check if ->writepages has been successful in | 475 | __sync_single_inode) to check if ->writepages has been successful in |
487 | writing out the whole address_space. | 476 | writing out the whole address_space. |
488 | 477 | ||
489 | The Writeback tag is used by filemap*wait* and sync_page* functions, | 478 | The Writeback tag is used by filemap*wait* and sync_page* functions, |
490 | via wait_on_page_writeback_range, to wait for all writeback to | 479 | via wait_on_page_writeback_range, to wait for all writeback to |
491 | complete. While waiting ->sync_page (if defined) will be called on | 480 | complete. While waiting ->sync_page (if defined) will be called on |
492 | each page that is found to require writeback. | 481 | each page that is found to require writeback. |
493 | 482 | ||
494 | An address_space handler may attach extra information to a page, | 483 | An address_space handler may attach extra information to a page, |
495 | typically using the 'private' field in the 'struct page'. If such | 484 | typically using the 'private' field in the 'struct page'. If such |
496 | information is attached, the PG_Private flag should be set. This will | 485 | information is attached, the PG_Private flag should be set. This will |
497 | cause various VM routines to make extra calls into the address_space | 486 | cause various VM routines to make extra calls into the address_space |
498 | handler to deal with that data. | 487 | handler to deal with that data. |
499 | 488 | ||
500 | An address space acts as an intermediate between storage and | 489 | An address space acts as an intermediate between storage and |
501 | application. Data is read into the address space a whole page at a | 490 | application. Data is read into the address space a whole page at a |
502 | time, and provided to the application either by copying of the page, | 491 | time, and provided to the application either by copying of the page, |
503 | or by memory-mapping the page. | 492 | or by memory-mapping the page. |
504 | Data is written into the address space by the application, and then | 493 | Data is written into the address space by the application, and then |
505 | written-back to storage typically in whole pages, however the | 494 | written-back to storage typically in whole pages, however the |
506 | address_space has finer control of write sizes. | 495 | address_space has finer control of write sizes. |
507 | 496 | ||
508 | The read process essentially only requires 'readpage'. The write | 497 | The read process essentially only requires 'readpage'. The write |
509 | process is more complicated and uses prepare_write/commit_write or | 498 | process is more complicated and uses prepare_write/commit_write or |
510 | set_page_dirty to write data into the address_space, and writepage, | 499 | set_page_dirty to write data into the address_space, and writepage, |
511 | sync_page, and writepages to writeback data to storage. | 500 | sync_page, and writepages to writeback data to storage. |
512 | 501 | ||
513 | Adding and removing pages to/from an address_space is protected by the | 502 | Adding and removing pages to/from an address_space is protected by the |
514 | inode's i_mutex. | 503 | inode's i_mutex. |
515 | 504 | ||
516 | When data is written to a page, the PG_Dirty flag should be set. It | 505 | When data is written to a page, the PG_Dirty flag should be set. It |
517 | typically remains set until writepage asks for it to be written. This | 506 | typically remains set until writepage asks for it to be written. This |
518 | should clear PG_Dirty and set PG_Writeback. It can be actually | 507 | should clear PG_Dirty and set PG_Writeback. It can be actually |
519 | written at any point after PG_Dirty is clear. Once it is known to be | 508 | written at any point after PG_Dirty is clear. Once it is known to be |
520 | safe, PG_Writeback is cleared. | 509 | safe, PG_Writeback is cleared. |
521 | 510 | ||
522 | Writeback makes use of a writeback_control structure... | 511 | Writeback makes use of a writeback_control structure... |
523 | 512 | ||
524 | struct address_space_operations | 513 | struct address_space_operations |
525 | ------------------------------- | 514 | ------------------------------- |
526 | 515 | ||
527 | This describes how the VFS can manipulate mapping of a file to page cache in | 516 | This describes how the VFS can manipulate mapping of a file to page cache in |
528 | your filesystem. As of kernel 2.6.22, the following members are defined: | 517 | your filesystem. As of kernel 2.6.22, the following members are defined: |
529 | 518 | ||
530 | struct address_space_operations { | 519 | struct address_space_operations { |
531 | int (*writepage)(struct page *page, struct writeback_control *wbc); | 520 | int (*writepage)(struct page *page, struct writeback_control *wbc); |
532 | int (*readpage)(struct file *, struct page *); | 521 | int (*readpage)(struct file *, struct page *); |
533 | int (*sync_page)(struct page *); | 522 | int (*sync_page)(struct page *); |
534 | int (*writepages)(struct address_space *, struct writeback_control *); | 523 | int (*writepages)(struct address_space *, struct writeback_control *); |
535 | int (*set_page_dirty)(struct page *page); | 524 | int (*set_page_dirty)(struct page *page); |
536 | int (*readpages)(struct file *filp, struct address_space *mapping, | 525 | int (*readpages)(struct file *filp, struct address_space *mapping, |
537 | struct list_head *pages, unsigned nr_pages); | 526 | struct list_head *pages, unsigned nr_pages); |
538 | int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); | 527 | int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); |
539 | int (*commit_write)(struct file *, struct page *, unsigned, unsigned); | 528 | int (*commit_write)(struct file *, struct page *, unsigned, unsigned); |
540 | int (*write_begin)(struct file *, struct address_space *mapping, | 529 | int (*write_begin)(struct file *, struct address_space *mapping, |
541 | loff_t pos, unsigned len, unsigned flags, | 530 | loff_t pos, unsigned len, unsigned flags, |
542 | struct page **pagep, void **fsdata); | 531 | struct page **pagep, void **fsdata); |
543 | int (*write_end)(struct file *, struct address_space *mapping, | 532 | int (*write_end)(struct file *, struct address_space *mapping, |
544 | loff_t pos, unsigned len, unsigned copied, | 533 | loff_t pos, unsigned len, unsigned copied, |
545 | struct page *page, void *fsdata); | 534 | struct page *page, void *fsdata); |
546 | sector_t (*bmap)(struct address_space *, sector_t); | 535 | sector_t (*bmap)(struct address_space *, sector_t); |
547 | int (*invalidatepage) (struct page *, unsigned long); | 536 | int (*invalidatepage) (struct page *, unsigned long); |
548 | int (*releasepage) (struct page *, int); | 537 | int (*releasepage) (struct page *, int); |
549 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 538 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
550 | loff_t offset, unsigned long nr_segs); | 539 | loff_t offset, unsigned long nr_segs); |
551 | struct page* (*get_xip_page)(struct address_space *, sector_t, | 540 | struct page* (*get_xip_page)(struct address_space *, sector_t, |
552 | int); | 541 | int); |
553 | /* migrate the contents of a page to the specified target */ | 542 | /* migrate the contents of a page to the specified target */ |
554 | int (*migratepage) (struct page *, struct page *); | 543 | int (*migratepage) (struct page *, struct page *); |
555 | int (*launder_page) (struct page *); | 544 | int (*launder_page) (struct page *); |
556 | }; | 545 | }; |
557 | 546 | ||
558 | writepage: called by the VM to write a dirty page to backing store. | 547 | writepage: called by the VM to write a dirty page to backing store. |
559 | This may happen for data integrity reasons (i.e. 'sync'), or | 548 | This may happen for data integrity reasons (i.e. 'sync'), or |
560 | to free up memory (flush). The difference can be seen in | 549 | to free up memory (flush). The difference can be seen in |
561 | wbc->sync_mode. | 550 | wbc->sync_mode. |
562 | The PG_Dirty flag has been cleared and PageLocked is true. | 551 | The PG_Dirty flag has been cleared and PageLocked is true. |
563 | writepage should start writeout, should set PG_Writeback, | 552 | writepage should start writeout, should set PG_Writeback, |
564 | and should make sure the page is unlocked, either synchronously | 553 | and should make sure the page is unlocked, either synchronously |
565 | or asynchronously when the write operation completes. | 554 | or asynchronously when the write operation completes. |
566 | 555 | ||
567 | If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to | 556 | If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to |
568 | try too hard if there are problems, and may choose to write out | 557 | try too hard if there are problems, and may choose to write out |
569 | other pages from the mapping if that is easier (e.g. due to | 558 | other pages from the mapping if that is easier (e.g. due to |
570 | internal dependencies). If it chooses not to start writeout, it | 559 | internal dependencies). If it chooses not to start writeout, it |
571 | should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep | 560 | should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep |
572 | calling ->writepage on that page. | 561 | calling ->writepage on that page. |
573 | 562 | ||
574 | See the file "Locking" for more details. | 563 | See the file "Locking" for more details. |
575 | 564 | ||
576 | readpage: called by the VM to read a page from backing store. | 565 | readpage: called by the VM to read a page from backing store. |
577 | The page will be Locked when readpage is called, and should be | 566 | The page will be Locked when readpage is called, and should be |
578 | unlocked and marked uptodate once the read completes. | 567 | unlocked and marked uptodate once the read completes. |
579 | If ->readpage discovers that it needs to unlock the page for | 568 | If ->readpage discovers that it needs to unlock the page for |
580 | some reason, it can do so, and then return AOP_TRUNCATED_PAGE. | 569 | some reason, it can do so, and then return AOP_TRUNCATED_PAGE. |
581 | In this case, the page will be looked up again, relocked and if | 570 | In this case, the page will be looked up again, relocked and if |
582 | that all succeeds, ->readpage will be called again. | 571 | that all succeeds, ->readpage will be called again. |
583 | 572 | ||
584 | sync_page: called by the VM to notify the backing store to perform all | 573 | sync_page: called by the VM to notify the backing store to perform all |
585 | queued I/O operations for a page. I/O operations for other pages | 574 | queued I/O operations for a page. I/O operations for other pages |
586 | associated with this address_space object may also be performed. | 575 | associated with this address_space object may also be performed. |
587 | 576 | ||
588 | This function is optional and is called only for pages with | 577 | This function is optional and is called only for pages with |
589 | PG_Writeback set while waiting for the writeback to complete. | 578 | PG_Writeback set while waiting for the writeback to complete. |
590 | 579 | ||
591 | writepages: called by the VM to write out pages associated with the | 580 | writepages: called by the VM to write out pages associated with the |
592 | address_space object. If wbc->sync_mode is WB_SYNC_ALL, then | 581 | address_space object. If wbc->sync_mode is WB_SYNC_ALL, then |
593 | the writeback_control will specify a range of pages that must be | 582 | the writeback_control will specify a range of pages that must be |
594 | written out. If it is WB_SYNC_NONE, then a nr_to_write is given | 583 | written out. If it is WB_SYNC_NONE, then a nr_to_write is given |
595 | and that many pages should be written if possible. | 584 | and that many pages should be written if possible. |
596 | If no ->writepages is given, then mpage_writepages is used | 585 | If no ->writepages is given, then mpage_writepages is used |
597 | instead. This will choose pages from the address space that are | 586 | instead. This will choose pages from the address space that are |
598 | tagged as DIRTY and will pass them to ->writepage. | 587 | tagged as DIRTY and will pass them to ->writepage. |
599 | 588 | ||
600 | set_page_dirty: called by the VM to set a page dirty. | 589 | set_page_dirty: called by the VM to set a page dirty. |
601 | This is particularly needed if an address space attaches | 590 | This is particularly needed if an address space attaches |
602 | private data to a page, and that data needs to be updated when | 591 | private data to a page, and that data needs to be updated when |
603 | a page is dirtied. This is called, for example, when a memory | 592 | a page is dirtied. This is called, for example, when a memory |
604 | mapped page gets modified. | 593 | mapped page gets modified. |
605 | If defined, it should set the PageDirty flag, and the | 594 | If defined, it should set the PageDirty flag, and the |
606 | PAGECACHE_TAG_DIRTY tag in the radix tree. | 595 | PAGECACHE_TAG_DIRTY tag in the radix tree. |
607 | 596 | ||
608 | readpages: called by the VM to read pages associated with the address_space | 597 | readpages: called by the VM to read pages associated with the address_space |
609 | object. This is essentially just a vector version of | 598 | object. This is essentially just a vector version of |
610 | readpage. Instead of just one page, several pages are | 599 | readpage. Instead of just one page, several pages are |
611 | requested. | 600 | requested. |
612 | readpages is only used for read-ahead, so read errors are | 601 | readpages is only used for read-ahead, so read errors are |
613 | ignored. If anything goes wrong, feel free to give up. | 602 | ignored. If anything goes wrong, feel free to give up. |
614 | 603 | ||
615 | prepare_write: called by the generic write path in VM to set up a write | 604 | prepare_write: called by the generic write path in VM to set up a write |
616 | request for a page. This indicates to the address space that | 605 | request for a page. This indicates to the address space that |
617 | the given range of bytes is about to be written. The | 606 | the given range of bytes is about to be written. The |
618 | address_space should check that the write will be able to | 607 | address_space should check that the write will be able to |
619 | complete, by allocating space if necessary and doing any other | 608 | complete, by allocating space if necessary and doing any other |
620 | internal housekeeping. If the write will update parts of | 609 | internal housekeeping. If the write will update parts of |
621 | any basic-blocks on storage, then those blocks should be | 610 | any basic-blocks on storage, then those blocks should be |
622 | pre-read (if they haven't been read already) so that the | 611 | pre-read (if they haven't been read already) so that the |
623 | updated blocks can be written out properly. | 612 | updated blocks can be written out properly. |
624 | The page will be locked. | 613 | The page will be locked. |
625 | 614 | ||
626 | Note: the page _must not_ be marked uptodate in this function | 615 | Note: the page _must not_ be marked uptodate in this function |
627 | (or anywhere else) unless it actually is uptodate right now. As | 616 | (or anywhere else) unless it actually is uptodate right now. As |
628 | soon as a page is marked uptodate, it is possible for a concurrent | 617 | soon as a page is marked uptodate, it is possible for a concurrent |
629 | read(2) to copy it to userspace. | 618 | read(2) to copy it to userspace. |
630 | 619 | ||
631 | commit_write: If prepare_write succeeds, new data will be copied | 620 | commit_write: If prepare_write succeeds, new data will be copied |
632 | into the page and then commit_write will be called. It will | 621 | into the page and then commit_write will be called. It will |
633 | typically update the size of the file (if appropriate) and | 622 | typically update the size of the file (if appropriate) and |
634 | mark the inode as dirty, and do any other related housekeeping | 623 | mark the inode as dirty, and do any other related housekeeping |
635 | operations. It should avoid returning an error if possible - | 624 | operations. It should avoid returning an error if possible - |
636 | errors should have been handled by prepare_write. | 625 | errors should have been handled by prepare_write. |
637 | 626 | ||
638 | write_begin: This is intended as a replacement for prepare_write. The | 627 | write_begin: This is intended as a replacement for prepare_write. The |
639 | key differences being that: | 628 | key differences being that: |
640 | - it returns a locked page (in *pagep) rather than being | 629 | - it returns a locked page (in *pagep) rather than being |
641 | given a pre-locked page; | 630 | given a pre-locked page; |
642 | - it must be able to cope with short writes (where the | 631 | - it must be able to cope with short writes (where the |
643 | length passed to write_begin is greater than the number | 632 | length passed to write_begin is greater than the number |
644 | of bytes copied into the page). | 633 | of bytes copied into the page). |
645 | 634 | ||
646 | Called by the generic buffered write code to ask the filesystem to | 635 | Called by the generic buffered write code to ask the filesystem to |
647 | prepare to write len bytes at the given offset in the file. The | 636 | prepare to write len bytes at the given offset in the file. The |
648 | address_space should check that the write will be able to complete, | 637 | address_space should check that the write will be able to complete, |
649 | by allocating space if necessary and doing any other internal | 638 | by allocating space if necessary and doing any other internal |
650 | housekeeping. If the write will update parts of any basic-blocks on | 639 | housekeeping. If the write will update parts of any basic-blocks on |
651 | storage, then those blocks should be pre-read (if they haven't been | 640 | storage, then those blocks should be pre-read (if they haven't been |
652 | read already) so that the updated blocks can be written out properly. | 641 | read already) so that the updated blocks can be written out properly. |
653 | 642 | ||
654 | The filesystem must return the locked pagecache page for the specified | 643 | The filesystem must return the locked pagecache page for the specified |
655 | offset, in *pagep, for the caller to write into. | 644 | offset, in *pagep, for the caller to write into. |
656 | 645 | ||
657 | flags is a field for AOP_FLAG_xxx flags, described in | 646 | flags is a field for AOP_FLAG_xxx flags, described in |
658 | include/linux/fs.h. | 647 | include/linux/fs.h. |
659 | 648 | ||
660 | A void * may be returned in fsdata, which then gets passed into | 649 | A void * may be returned in fsdata, which then gets passed into |
661 | write_end. | 650 | write_end. |
662 | 651 | ||
663 | Returns 0 on success; < 0 on failure (which is the error code), in | 652 | Returns 0 on success; < 0 on failure (which is the error code), in |
664 | which case write_end is not called. | 653 | which case write_end is not called. |
665 | 654 | ||
666 | write_end: After a successful write_begin, and data copy, write_end must | 655 | write_end: After a successful write_begin, and data copy, write_end must |
667 | be called. len is the original len passed to write_begin, and copied | 656 | be called. len is the original len passed to write_begin, and copied |
668 | is the amount that was able to be copied (copied == len is always true | 657 | is the amount that was able to be copied (copied == len is always true |
669 | if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag). | 658 | if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag). |
670 | 659 | ||
671 | The filesystem must take care of unlocking the page and releasing its | 660 | The filesystem must take care of unlocking the page and releasing its |
672 | refcount, and updating i_size. | 661 | refcount, and updating i_size. |
673 | 662 | ||
674 | Returns < 0 on failure, otherwise the number of bytes (<= 'copied') | 663 | Returns < 0 on failure, otherwise the number of bytes (<= 'copied') |
675 | that were able to be copied into pagecache. | 664 | that were able to be copied into pagecache. |
676 | 665 | ||
677 | bmap: called by the VFS to map a logical block offset within an object to | 666 | bmap: called by the VFS to map a logical block offset within an object to |
678 | physical block number. This method is used by the FIBMAP | 667 | physical block number. This method is used by the FIBMAP |
679 | ioctl and for working with swap-files. To be able to swap to | 668 | ioctl and for working with swap-files. To be able to swap to |
680 | a file, the file must have a stable mapping to a block | 669 | a file, the file must have a stable mapping to a block |
681 | device. The swap system does not go through the filesystem | 670 | device. The swap system does not go through the filesystem |
682 | but instead uses bmap to find out where the blocks in the file | 671 | but instead uses bmap to find out where the blocks in the file |
683 | are and uses those addresses directly. | 672 | are and uses those addresses directly. |
684 | 673 | ||
685 | 674 | ||
686 | invalidatepage: If a page has PagePrivate set, then invalidatepage | 675 | invalidatepage: If a page has PagePrivate set, then invalidatepage |
687 | will be called when part or all of the page is to be removed | 676 | will be called when part or all of the page is to be removed |
688 | from the address space. This generally corresponds to either a | 677 | from the address space. This generally corresponds to either a |
689 | truncation or a complete invalidation of the address space | 678 | truncation or a complete invalidation of the address space |
690 | (in the latter case 'offset' will always be 0). | 679 | (in the latter case 'offset' will always be 0). |
691 | Any private data associated with the page should be updated | 680 | Any private data associated with the page should be updated |
692 | to reflect this truncation. If offset is 0, then | 681 | to reflect this truncation. If offset is 0, then |
693 | the private data should be released, because the page | 682 | the private data should be released, because the page |
694 | must be able to be completely discarded. This may be done by | 683 | must be able to be completely discarded. This may be done by |
695 | calling the ->releasepage function, but in this case the | 684 | calling the ->releasepage function, but in this case the |
696 | release MUST succeed. | 685 | release MUST succeed. |
697 | 686 | ||
698 | releasepage: releasepage is called on PagePrivate pages to indicate | 687 | releasepage: releasepage is called on PagePrivate pages to indicate |
699 | that the page should be freed if possible. ->releasepage | 688 | that the page should be freed if possible. ->releasepage |
700 | should remove any private data from the page and clear the | 689 | should remove any private data from the page and clear the |
701 | PagePrivate flag. It may also remove the page from the | 690 | PagePrivate flag. It may also remove the page from the |
702 | address_space. If this fails for some reason, it may indicate | 691 | address_space. If this fails for some reason, it may indicate |
703 | failure with a 0 return value. | 692 | failure with a 0 return value. |
704 | This is used in two distinct though related cases. The first | 693 | This is used in two distinct though related cases. The first |
705 | is when the VM finds a clean page with no active users and | 694 | is when the VM finds a clean page with no active users and |
706 | wants to make it a free page. If ->releasepage succeeds, the | 695 | wants to make it a free page. If ->releasepage succeeds, the |
707 | page will be removed from the address_space and become free. | 696 | page will be removed from the address_space and become free. |
708 | 697 | ||
709 | The second case is when a request has been made to invalidate | 698 | The second case is when a request has been made to invalidate |
710 | some or all pages in an address_space. This can happen | 699 | some or all pages in an address_space. This can happen |
711 | through the fadvise(POSIX_FADV_DONTNEED) system call or by the | 700 | through the fadvise(POSIX_FADV_DONTNEED) system call or by the |
712 | filesystem explicitly requesting it as nfs and 9fs do (when | 701 | filesystem explicitly requesting it as nfs and 9fs do (when |
713 | they believe the cache may be out of date with storage) by | 702 | they believe the cache may be out of date with storage) by |
714 | calling invalidate_inode_pages2(). | 703 | calling invalidate_inode_pages2(). |
715 | If the filesystem makes such a call, and needs to be certain | 704 | If the filesystem makes such a call, and needs to be certain |
716 | that all pages are invalidated, then its releasepage will | 705 | that all pages are invalidated, then its releasepage will |
717 | need to ensure this. Possibly it can clear the PageUptodate | 706 | need to ensure this. Possibly it can clear the PageUptodate |
718 | bit if it cannot free private data yet. | 707 | bit if it cannot free private data yet. |
719 | 708 | ||
720 | direct_IO: called by the generic read/write routines to perform | 709 | direct_IO: called by the generic read/write routines to perform |
721 | direct_IO - that is IO requests which bypass the page cache | 710 | direct_IO - that is IO requests which bypass the page cache |
722 | and transfer data directly between the storage and the | 711 | and transfer data directly between the storage and the |
723 | application's address space. | 712 | application's address space. |
724 | 713 | ||
725 | get_xip_page: called by the VM to translate a block number to a page. | 714 | get_xip_page: called by the VM to translate a block number to a page. |
726 | The page is valid until the corresponding filesystem is unmounted. | 715 | The page is valid until the corresponding filesystem is unmounted. |
727 | Filesystems that want to use execute-in-place (XIP) need to implement | 716 | Filesystems that want to use execute-in-place (XIP) need to implement |
728 | it. An example implementation can be found in fs/ext2/xip.c. | 717 | it. An example implementation can be found in fs/ext2/xip.c. |
729 | 718 | ||
730 | migratepage: This is used to compact the physical memory usage. | 719 | migratepage: This is used to compact the physical memory usage. |
731 | If the VM wants to relocate a page (maybe off a memory card | 720 | If the VM wants to relocate a page (maybe off a memory card |
732 | that is signalling imminent failure) it will pass a new page | 721 | that is signalling imminent failure) it will pass a new page |
733 | and an old page to this function. migratepage should | 722 | and an old page to this function. migratepage should |
734 | transfer any private data across and update any references | 723 | transfer any private data across and update any references |
735 | that it has to the page. | 724 | that it has to the page. |
736 | 725 | ||
737 | launder_page: Called before freeing a page - it writes back the dirty page. To | 726 | launder_page: Called before freeing a page - it writes back the dirty page. To |
738 | prevent redirtying the page, it is kept locked during the whole | 727 | prevent redirtying the page, it is kept locked during the whole |
739 | operation. | 728 | operation. |
740 | 729 | ||
741 | The File Object | 730 | The File Object |
742 | =============== | 731 | =============== |
743 | 732 | ||
744 | A file object represents a file opened by a process. | 733 | A file object represents a file opened by a process. |
745 | 734 | ||
746 | 735 | ||
747 | struct file_operations | 736 | struct file_operations |
748 | ---------------------- | 737 | ---------------------- |
749 | 738 | ||
750 | This describes how the VFS can manipulate an open file. As of kernel | 739 | This describes how the VFS can manipulate an open file. As of kernel |
751 | 2.6.22, the following members are defined: | 740 | 2.6.22, the following members are defined: |
752 | 741 | ||
753 | struct file_operations { | 742 | struct file_operations { |
754 | struct module *owner; | 743 | struct module *owner; |
755 | loff_t (*llseek) (struct file *, loff_t, int); | 744 | loff_t (*llseek) (struct file *, loff_t, int); |
756 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); | 745 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); |
757 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); | 746 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); |
758 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 747 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
759 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 748 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
760 | int (*readdir) (struct file *, void *, filldir_t); | 749 | int (*readdir) (struct file *, void *, filldir_t); |
761 | unsigned int (*poll) (struct file *, struct poll_table_struct *); | 750 | unsigned int (*poll) (struct file *, struct poll_table_struct *); |
762 | int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); | 751 | int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); |
763 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); | 752 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); |
764 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); | 753 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); |
765 | int (*mmap) (struct file *, struct vm_area_struct *); | 754 | int (*mmap) (struct file *, struct vm_area_struct *); |
766 | int (*open) (struct inode *, struct file *); | 755 | int (*open) (struct inode *, struct file *); |
767 | int (*flush) (struct file *); | 756 | int (*flush) (struct file *); |
768 | int (*release) (struct inode *, struct file *); | 757 | int (*release) (struct inode *, struct file *); |
769 | int (*fsync) (struct file *, struct dentry *, int datasync); | 758 | int (*fsync) (struct file *, struct dentry *, int datasync); |
770 | int (*aio_fsync) (struct kiocb *, int datasync); | 759 | int (*aio_fsync) (struct kiocb *, int datasync); |
771 | int (*fasync) (int, struct file *, int); | 760 | int (*fasync) (int, struct file *, int); |
772 | int (*lock) (struct file *, int, struct file_lock *); | 761 | int (*lock) (struct file *, int, struct file_lock *); |
773 | ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); | 762 | ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); |
774 | ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); | 763 | ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); |
775 | ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *); | 764 | ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *); |
776 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); | 765 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); |
777 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 766 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
778 | int (*check_flags)(int); | 767 | int (*check_flags)(int); |
779 | int (*dir_notify)(struct file *filp, unsigned long arg); | 768 | int (*dir_notify)(struct file *filp, unsigned long arg); |
780 | int (*flock) (struct file *, int, struct file_lock *); | 769 | int (*flock) (struct file *, int, struct file_lock *); |
781 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int); | 770 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int); |
782 | ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); | 771 | ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); |
783 | }; | 772 | }; |
784 | 773 | ||
785 | Again, all methods are called without any locks being held, unless | 774 | Again, all methods are called without any locks being held, unless |
786 | otherwise noted. | 775 | otherwise noted. |
787 | 776 | ||
788 | llseek: called when the VFS needs to move the file position index | 777 | llseek: called when the VFS needs to move the file position index |
789 | 778 | ||
790 | read: called by read(2) and related system calls | 779 | read: called by read(2) and related system calls |
791 | 780 | ||
792 | aio_read: called by io_submit(2) and other asynchronous I/O operations | 781 | aio_read: called by io_submit(2) and other asynchronous I/O operations |
793 | 782 | ||
794 | write: called by write(2) and related system calls | 783 | write: called by write(2) and related system calls |
795 | 784 | ||
796 | aio_write: called by io_submit(2) and other asynchronous I/O operations | 785 | aio_write: called by io_submit(2) and other asynchronous I/O operations |
797 | 786 | ||
798 | readdir: called when the VFS needs to read the directory contents | 787 | readdir: called when the VFS needs to read the directory contents |
799 | 788 | ||
800 | poll: called by the VFS when a process wants to check if there is | 789 | poll: called by the VFS when a process wants to check if there is |
801 | activity on this file and (optionally) go to sleep until there | 790 | activity on this file and (optionally) go to sleep until there |
802 | is activity. Called by the select(2) and poll(2) system calls | 791 | is activity. Called by the select(2) and poll(2) system calls |
803 | 792 | ||
804 | ioctl: called by the ioctl(2) system call | 793 | ioctl: called by the ioctl(2) system call |
805 | 794 | ||
806 | unlocked_ioctl: called by the ioctl(2) system call. Filesystems that do not | 795 | unlocked_ioctl: called by the ioctl(2) system call. Filesystems that do not |
807 | require the BKL should use this method instead of the ioctl() above. | 796 | require the BKL should use this method instead of the ioctl() above. |
808 | 797 | ||
809 | compat_ioctl: called by the ioctl(2) system call when 32 bit system calls | 798 | compat_ioctl: called by the ioctl(2) system call when 32 bit system calls |
810 | are used on 64 bit kernels. | 799 | are used on 64 bit kernels. |
811 | 800 | ||
812 | mmap: called by the mmap(2) system call | 801 | mmap: called by the mmap(2) system call |
813 | 802 | ||
814 | open: called by the VFS when an inode should be opened. When the VFS | 803 | open: called by the VFS when an inode should be opened. When the VFS |
815 | opens a file, it creates a new "struct file". It then calls the | 804 | opens a file, it creates a new "struct file". It then calls the |
816 | open method for the newly allocated file structure. You might | 805 | open method for the newly allocated file structure. You might |
817 | think that the open method really belongs in | 806 | think that the open method really belongs in |
818 | "struct inode_operations", and you may be right. I think it's | 807 | "struct inode_operations", and you may be right. I think it's |
819 | done the way it is because it makes filesystems simpler to | 808 | done the way it is because it makes filesystems simpler to |
820 | implement. The open() method is a good place to initialize the | 809 | implement. The open() method is a good place to initialize the |
821 | "private_data" member in the file structure if you want to point | 810 | "private_data" member in the file structure if you want to point |
822 | to a device structure | 811 | to a device structure |
823 | 812 | ||
824 | flush: called by the close(2) system call to flush a file | 813 | flush: called by the close(2) system call to flush a file |
825 | 814 | ||
826 | release: called when the last reference to an open file is closed | 815 | release: called when the last reference to an open file is closed |
827 | 816 | ||
828 | fsync: called by the fsync(2) system call | 817 | fsync: called by the fsync(2) system call |
829 | 818 | ||
830 | fasync: called by the fcntl(2) system call when asynchronous | 819 | fasync: called by the fcntl(2) system call when asynchronous |
831 | (non-blocking) mode is enabled for a file | 820 | (non-blocking) mode is enabled for a file |
832 | 821 | ||
833 | lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW | 822 | lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW |
834 | commands | 823 | commands |
835 | 824 | ||
836 | readv: called by the readv(2) system call | 825 | readv: called by the readv(2) system call |
837 | 826 | ||
838 | writev: called by the writev(2) system call | 827 | writev: called by the writev(2) system call |
839 | 828 | ||
840 | sendfile: called by the sendfile(2) system call | 829 | sendfile: called by the sendfile(2) system call |
841 | 830 | ||
842 | get_unmapped_area: called by the mmap(2) system call | 831 | get_unmapped_area: called by the mmap(2) system call |
843 | 832 | ||
844 | check_flags: called by the fcntl(2) system call for F_SETFL command | 833 | check_flags: called by the fcntl(2) system call for F_SETFL command |
845 | 834 | ||
846 | dir_notify: called by the fcntl(2) system call for F_NOTIFY command | 835 | dir_notify: called by the fcntl(2) system call for F_NOTIFY command |
847 | 836 | ||
848 | flock: called by the flock(2) system call | 837 | flock: called by the flock(2) system call |
849 | 838 | ||
850 | splice_write: called by the VFS to splice data from a pipe to a file. This | 839 | splice_write: called by the VFS to splice data from a pipe to a file. This |
851 | method is used by the splice(2) system call | 840 | method is used by the splice(2) system call |
852 | 841 | ||
853 | splice_read: called by the VFS to splice data from a file to a pipe. This | 842 | splice_read: called by the VFS to splice data from a file to a pipe. This |
854 | method is used by the splice(2) system call | 843 | method is used by the splice(2) system call |
855 | 844 | ||
856 | Note that the file operations are implemented by the specific | 845 | Note that the file operations are implemented by the specific |
857 | filesystem in which the inode resides. When opening a device node | 846 | filesystem in which the inode resides. When opening a device node |
858 | (character or block special) most filesystems will call special | 847 | (character or block special) most filesystems will call special |
859 | support routines in the VFS which will locate the required device | 848 | support routines in the VFS which will locate the required device |
860 | driver information. These support routines replace the filesystem file | 849 | driver information. These support routines replace the filesystem file |
861 | operations with those for the device driver, and then proceed to call | 850 | operations with those for the device driver, and then proceed to call |
862 | the new open() method for the file. This is how opening a device file | 851 | the new open() method for the file. This is how opening a device file |
863 | in the filesystem eventually ends up calling the device driver open() | 852 | in the filesystem eventually ends up calling the device driver open() |
864 | method. | 853 | method. |
865 | 854 | ||
866 | 855 | ||
867 | Directory Entry Cache (dcache) | 856 | Directory Entry Cache (dcache) |
868 | ============================== | 857 | ============================== |
869 | 858 | ||
870 | 859 | ||
871 | struct dentry_operations | 860 | struct dentry_operations |
872 | ------------------------ | 861 | ------------------------ |
873 | 862 | ||
874 | This describes how a filesystem can overload the standard dentry | 863 | This describes how a filesystem can overload the standard dentry |
875 | operations. Dentries and the dcache are the domain of the VFS and the | 864 | operations. Dentries and the dcache are the domain of the VFS and the |
876 | individual filesystem implementations. Device drivers have no business | 865 | individual filesystem implementations. Device drivers have no business |
877 | here. These methods may be set to NULL, as they are either optional or | 866 | here. These methods may be set to NULL, as they are either optional or |
878 | the VFS uses a default. As of kernel 2.6.22, the following members are | 867 | the VFS uses a default. As of kernel 2.6.22, the following members are |
879 | defined: | 868 | defined: |
880 | 869 | ||
881 | struct dentry_operations { | 870 | struct dentry_operations { |
882 | int (*d_revalidate)(struct dentry *, struct nameidata *); | 871 | int (*d_revalidate)(struct dentry *, struct nameidata *); |
883 | int (*d_hash) (struct dentry *, struct qstr *); | 872 | int (*d_hash) (struct dentry *, struct qstr *); |
884 | int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); | 873 | int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); |
885 | int (*d_delete)(struct dentry *); | 874 | int (*d_delete)(struct dentry *); |
886 | void (*d_release)(struct dentry *); | 875 | void (*d_release)(struct dentry *); |
887 | void (*d_iput)(struct dentry *, struct inode *); | 876 | void (*d_iput)(struct dentry *, struct inode *); |
888 | char *(*d_dname)(struct dentry *, char *, int); | 877 | char *(*d_dname)(struct dentry *, char *, int); |
889 | }; | 878 | }; |
890 | 879 | ||
891 | d_revalidate: called when the VFS needs to revalidate a dentry. This | 880 | d_revalidate: called when the VFS needs to revalidate a dentry. This |
892 | is called whenever a name look-up finds a dentry in the | 881 | is called whenever a name look-up finds a dentry in the |
893 | dcache. Most filesystems leave this as NULL, because all their | 882 | dcache. Most filesystems leave this as NULL, because all their |
894 | dentries in the dcache are valid | 883 | dentries in the dcache are valid |
895 | 884 | ||
896 | d_hash: called when the VFS adds a dentry to the hash table | 885 | d_hash: called when the VFS adds a dentry to the hash table |
897 | 886 | ||
898 | d_compare: called when a dentry should be compared with another | 887 | d_compare: called when a dentry should be compared with another |
899 | 888 | ||
900 | d_delete: called when the last reference to a dentry is | 889 | d_delete: called when the last reference to a dentry is |
901 | deleted. This means no-one is using the dentry, however it is | 890 | deleted. This means no-one is using the dentry, however it is |
902 | still valid and in the dcache | 891 | still valid and in the dcache |
903 | 892 | ||
904 | d_release: called when a dentry is really deallocated | 893 | d_release: called when a dentry is really deallocated |
905 | 894 | ||
906 | d_iput: called when a dentry loses its inode (just prior to its | 895 | d_iput: called when a dentry loses its inode (just prior to its |
907 | being deallocated). The default when this is NULL is that the | 896 | being deallocated). The default when this is NULL is that the |
908 | VFS calls iput(). If you define this method, you must call | 897 | VFS calls iput(). If you define this method, you must call |
909 | iput() yourself | 898 | iput() yourself |
910 | 899 | ||
911 | d_dname: called when the pathname of a dentry should be generated. | 900 | d_dname: called when the pathname of a dentry should be generated. |
912 | Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay | 901 | Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay |
913 | pathname generation. (Instead of doing it when dentry is created, | 902 | pathname generation. (Instead of doing it when dentry is created, |
914 | it's done only when the path is needed.) Real filesystems probably | 903 | it's done only when the path is needed.) Real filesystems probably |
915 | don't want to use it, because their dentries are present in global | 904 | don't want to use it, because their dentries are present in global |
916 | dcache hash, so their hash should be an invariant. As no lock is | 905 | dcache hash, so their hash should be an invariant. As no lock is |
917 | held, d_dname() should not try to modify the dentry itself, unless | 906 | held, d_dname() should not try to modify the dentry itself, unless |
918 | appropriate SMP safety is used. CAUTION : d_path() logic is quite | 907 | appropriate SMP safety is used. CAUTION : d_path() logic is quite |
919 | tricky. The correct way to return for example "Hello" is to put it | 908 | tricky. The correct way to return for example "Hello" is to put it |
920 | at the end of the buffer, and return a pointer to the first char. | 909 | at the end of the buffer, and return a pointer to the first char. |
921 | dynamic_dname() helper function is provided to take care of this. | 910 | dynamic_dname() helper function is provided to take care of this. |
922 | 911 | ||
923 | Example : | 912 | Example : |
924 | 913 | ||
925 | static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) | 914 | static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) |
926 | { | 915 | { |
927 | return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", | 916 | return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", |
928 | dentry->d_inode->i_ino); | 917 | dentry->d_inode->i_ino); |
929 | } | 918 | } |
930 | 919 | ||
931 | Each dentry has a pointer to its parent dentry, as well as a hash list | 920 | Each dentry has a pointer to its parent dentry, as well as a hash list |
932 | of child dentries. Child dentries are basically like files in a | 921 | of child dentries. Child dentries are basically like files in a |
933 | directory. | 922 | directory. |
934 | 923 | ||
935 | 924 | ||
936 | Directory Entry Cache API | 925 | Directory Entry Cache API |
937 | -------------------------- | 926 | -------------------------- |
938 | 927 | ||
939 | There are a number of functions defined which permit a filesystem to | 928 | There are a number of functions defined which permit a filesystem to |
940 | manipulate dentries: | 929 | manipulate dentries: |
941 | 930 | ||
942 | dget: open a new handle for an existing dentry (this just increments | 931 | dget: open a new handle for an existing dentry (this just increments |
943 | the usage count) | 932 | the usage count) |
944 | 933 | ||
945 | dput: close a handle for a dentry (decrements the usage count). If | 934 | dput: close a handle for a dentry (decrements the usage count). If |
946 | the usage count drops to 0, the "d_delete" method is called | 935 | the usage count drops to 0, the "d_delete" method is called |
947 | and the dentry is placed on the unused list if the dentry is | 936 | and the dentry is placed on the unused list if the dentry is |
948 | still in its parent's hash list. Putting the dentry on the | 937 | still in its parent's hash list. Putting the dentry on the |
949 | unused list just means that if the system needs some RAM, it | 938 | unused list just means that if the system needs some RAM, it |
950 | goes through the unused list of dentries and deallocates them. | 939 | goes through the unused list of dentries and deallocates them. |
951 | If the dentry has already been unhashed and the usage count | 940 | If the dentry has already been unhashed and the usage count |
952 | drops to 0, the dentry is deallocated after the | 941 | drops to 0, the dentry is deallocated after the |
953 | "d_delete" method is called | 942 | "d_delete" method is called |
954 | 943 | ||
955 | d_drop: this unhashes a dentry from its parent's hash list. A | 944 | d_drop: this unhashes a dentry from its parent's hash list. A |
956 | subsequent call to dput() will deallocate the dentry if its | 945 | subsequent call to dput() will deallocate the dentry if its |
957 | usage count drops to 0 | 946 | usage count drops to 0 |
958 | 947 | ||
959 | d_delete: delete a dentry. If there are no other open references to | 948 | d_delete: delete a dentry. If there are no other open references to |
960 | the dentry then the dentry is turned into a negative dentry | 949 | the dentry then the dentry is turned into a negative dentry |
961 | (the d_iput() method is called). If there are other | 950 | (the d_iput() method is called). If there are other |
962 | references, then d_drop() is called instead | 951 | references, then d_drop() is called instead |
963 | 952 | ||
964 | d_add: add a dentry to its parent's hash list and then calls | 953 | d_add: add a dentry to its parent's hash list and then calls |
965 | d_instantiate() | 954 | d_instantiate() |
966 | 955 | ||
967 | d_instantiate: add a dentry to the alias hash list for the inode and | 956 | d_instantiate: add a dentry to the alias hash list for the inode and |
968 | updates the "d_inode" member. The "i_count" member in the | 957 | updates the "d_inode" member. The "i_count" member in the |
969 | inode structure should be set/incremented. If the inode | 958 | inode structure should be set/incremented. If the inode |
970 | pointer is NULL, the dentry is called a "negative | 959 | pointer is NULL, the dentry is called a "negative |
971 | dentry". This function is commonly called when an inode is | 960 | dentry". This function is commonly called when an inode is |
972 | created for an existing negative dentry | 961 | created for an existing negative dentry |
973 | 962 | ||
974 | d_lookup: look up a dentry given its parent and path name component. | 963 | d_lookup: look up a dentry given its parent and path name component. |
975 | It looks up the child of that given name from the dcache | 964 | It looks up the child of that given name from the dcache |
976 | hash table. If it is found, the reference count is incremented | 965 | hash table. If it is found, the reference count is incremented |
977 | and the dentry is returned. The caller must use dput() | 966 | and the dentry is returned. The caller must use dput() |
978 | to free the dentry when it finishes using it. | 967 | to free the dentry when it finishes using it. |
979 | 968 | ||
980 | For further information on dentry locking, please refer to the document | 969 | For further information on dentry locking, please refer to the document |
981 | Documentation/filesystems/dentry-locking.txt. | 970 | Documentation/filesystems/dentry-locking.txt. |
982 | 971 | ||
983 | 972 | ||
984 | Resources | 973 | Resources |
985 | ========= | 974 | ========= |
986 | 975 | ||
987 | (Note some of these resources are not up-to-date with the latest kernel | 976 | (Note some of these resources are not up-to-date with the latest kernel |
988 | version.) | 977 | version.) |
989 | 978 | ||
990 | Creating Linux virtual filesystems. 2002 | 979 | Creating Linux virtual filesystems. 2002 |
991 | <http://lwn.net/Articles/13325/> | 980 | <http://lwn.net/Articles/13325/> |
992 | 981 | ||
993 | The Linux Virtual File-system Layer by Neil Brown. 1999 | 982 | The Linux Virtual File-system Layer by Neil Brown. 1999 |
994 | <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html> | 983 | <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html> |
995 | 984 | ||
996 | A tour of the Linux VFS by Michael K. Johnson. 1996 | 985 | A tour of the Linux VFS by Michael K. Johnson. 1996 |
997 | <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html> | 986 | <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html> |
998 | 987 | ||
999 | A small trail through the Linux kernel by Andries Brouwer. 2001 | 988 | A small trail through the Linux kernel by Andries Brouwer. 2001 |
1000 | <http://www.win.tue.nl/~aeb/linux/vfs/trail.html> | 989 | <http://www.win.tue.nl/~aeb/linux/vfs/trail.html> |
1001 | 990 |
fs/inode.c
1 | /* | 1 | /* |
2 | * linux/fs/inode.c | 2 | * linux/fs/inode.c |
3 | * | 3 | * |
4 | * (C) 1997 Linus Torvalds | 4 | * (C) 1997 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/fs.h> | 7 | #include <linux/fs.h> |
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/dcache.h> | 9 | #include <linux/dcache.h> |
10 | #include <linux/init.h> | 10 | #include <linux/init.h> |
11 | #include <linux/quotaops.h> | 11 | #include <linux/quotaops.h> |
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/writeback.h> | 13 | #include <linux/writeback.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <linux/wait.h> | 16 | #include <linux/wait.h> |
17 | #include <linux/hash.h> | 17 | #include <linux/hash.h> |
18 | #include <linux/swap.h> | 18 | #include <linux/swap.h> |
19 | #include <linux/security.h> | 19 | #include <linux/security.h> |
20 | #include <linux/pagemap.h> | 20 | #include <linux/pagemap.h> |
21 | #include <linux/cdev.h> | 21 | #include <linux/cdev.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/inotify.h> | 23 | #include <linux/inotify.h> |
24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
25 | 25 | ||
26 | /* | 26 | /* |
27 | * This is needed for the following functions: | 27 | * This is needed for the following functions: |
28 | * - inode_has_buffers | 28 | * - inode_has_buffers |
29 | * - invalidate_inode_buffers | 29 | * - invalidate_inode_buffers |
30 | * - invalidate_bdev | 30 | * - invalidate_bdev |
31 | * | 31 | * |
32 | * FIXME: remove all knowledge of the buffer layer from this file | 32 | * FIXME: remove all knowledge of the buffer layer from this file |
33 | */ | 33 | */ |
34 | #include <linux/buffer_head.h> | 34 | #include <linux/buffer_head.h> |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * New inode.c implementation. | 37 | * New inode.c implementation. |
38 | * | 38 | * |
39 | * This implementation has the basic premise of trying | 39 | * This implementation has the basic premise of trying |
40 | * to be extremely low-overhead and SMP-safe, yet be | 40 | * to be extremely low-overhead and SMP-safe, yet be |
41 | * simple enough to be "obviously correct". | 41 | * simple enough to be "obviously correct". |
42 | * | 42 | * |
43 | * Famous last words. | 43 | * Famous last words. |
44 | */ | 44 | */ |
45 | 45 | ||
46 | /* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */ | 46 | /* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */ |
47 | 47 | ||
48 | /* #define INODE_PARANOIA 1 */ | 48 | /* #define INODE_PARANOIA 1 */ |
49 | /* #define INODE_DEBUG 1 */ | 49 | /* #define INODE_DEBUG 1 */ |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Inode lookup is no longer as critical as it used to be: | 52 | * Inode lookup is no longer as critical as it used to be: |
53 | * most of the lookups are going to be through the dcache. | 53 | * most of the lookups are going to be through the dcache. |
54 | */ | 54 | */ |
55 | #define I_HASHBITS i_hash_shift | 55 | #define I_HASHBITS i_hash_shift |
56 | #define I_HASHMASK i_hash_mask | 56 | #define I_HASHMASK i_hash_mask |
57 | 57 | ||
58 | static unsigned int i_hash_mask __read_mostly; | 58 | static unsigned int i_hash_mask __read_mostly; |
59 | static unsigned int i_hash_shift __read_mostly; | 59 | static unsigned int i_hash_shift __read_mostly; |
60 | 60 | ||
61 | /* | 61 | /* |
62 | * Each inode can be on two separate lists. One is | 62 | * Each inode can be on two separate lists. One is |
63 | * the hash list of the inode, used for lookups. The | 63 | * the hash list of the inode, used for lookups. The |
64 | * other linked list is the "type" list: | 64 | * other linked list is the "type" list: |
65 | * "in_use" - valid inode, i_count > 0, i_nlink > 0 | 65 | * "in_use" - valid inode, i_count > 0, i_nlink > 0 |
66 | * "dirty" - as "in_use" but also dirty | 66 | * "dirty" - as "in_use" but also dirty |
67 | * "unused" - valid inode, i_count = 0 | 67 | * "unused" - valid inode, i_count = 0 |
68 | * | 68 | * |
69 | * A "dirty" list is maintained for each super block, | 69 | * A "dirty" list is maintained for each super block, |
70 | * allowing for low-overhead inode sync() operations. | 70 | * allowing for low-overhead inode sync() operations. |
71 | */ | 71 | */ |
72 | 72 | ||
73 | LIST_HEAD(inode_in_use); | 73 | LIST_HEAD(inode_in_use); |
74 | LIST_HEAD(inode_unused); | 74 | LIST_HEAD(inode_unused); |
75 | static struct hlist_head *inode_hashtable __read_mostly; | 75 | static struct hlist_head *inode_hashtable __read_mostly; |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * A simple spinlock to protect the list manipulations. | 78 | * A simple spinlock to protect the list manipulations. |
79 | * | 79 | * |
80 | * NOTE! You also have to own the lock if you change | 80 | * NOTE! You also have to own the lock if you change |
81 | * the i_state of an inode while it is in use.. | 81 | * the i_state of an inode while it is in use.. |
82 | */ | 82 | */ |
83 | DEFINE_SPINLOCK(inode_lock); | 83 | DEFINE_SPINLOCK(inode_lock); |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * iprune_mutex provides exclusion between the kswapd or try_to_free_pages | 86 | * iprune_mutex provides exclusion between the kswapd or try_to_free_pages |
87 | * icache shrinking path, and the umount path. Without this exclusion, | 87 | * icache shrinking path, and the umount path. Without this exclusion, |
88 | * by the time prune_icache calls iput for the inode whose pages it has | 88 | * by the time prune_icache calls iput for the inode whose pages it has |
89 | * been invalidating, or by the time it calls clear_inode & destroy_inode | 89 | * been invalidating, or by the time it calls clear_inode & destroy_inode |
90 | * from its final dispose_list, the struct super_block they refer to | 90 | * from its final dispose_list, the struct super_block they refer to |
91 | * (for inode->i_sb->s_op) may already have been freed and reused. | 91 | * (for inode->i_sb->s_op) may already have been freed and reused. |
92 | */ | 92 | */ |
93 | static DEFINE_MUTEX(iprune_mutex); | 93 | static DEFINE_MUTEX(iprune_mutex); |
94 | 94 | ||
95 | /* | 95 | /* |
96 | * Statistics gathering.. | 96 | * Statistics gathering.. |
97 | */ | 97 | */ |
98 | struct inodes_stat_t inodes_stat; | 98 | struct inodes_stat_t inodes_stat; |
99 | 99 | ||
100 | static struct kmem_cache * inode_cachep __read_mostly; | 100 | static struct kmem_cache * inode_cachep __read_mostly; |
101 | 101 | ||
102 | static void wake_up_inode(struct inode *inode) | 102 | static void wake_up_inode(struct inode *inode) |
103 | { | 103 | { |
104 | /* | 104 | /* |
105 | * Prevent speculative execution through spin_unlock(&inode_lock); | 105 | * Prevent speculative execution through spin_unlock(&inode_lock); |
106 | */ | 106 | */ |
107 | smp_mb(); | 107 | smp_mb(); |
108 | wake_up_bit(&inode->i_state, __I_LOCK); | 108 | wake_up_bit(&inode->i_state, __I_LOCK); |
109 | } | 109 | } |
110 | 110 | ||
111 | static struct inode *alloc_inode(struct super_block *sb) | 111 | static struct inode *alloc_inode(struct super_block *sb) |
112 | { | 112 | { |
113 | static const struct address_space_operations empty_aops; | 113 | static const struct address_space_operations empty_aops; |
114 | static struct inode_operations empty_iops; | 114 | static struct inode_operations empty_iops; |
115 | static const struct file_operations empty_fops; | 115 | static const struct file_operations empty_fops; |
116 | struct inode *inode; | 116 | struct inode *inode; |
117 | 117 | ||
118 | if (sb->s_op->alloc_inode) | 118 | if (sb->s_op->alloc_inode) |
119 | inode = sb->s_op->alloc_inode(sb); | 119 | inode = sb->s_op->alloc_inode(sb); |
120 | else | 120 | else |
121 | inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); | 121 | inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); |
122 | 122 | ||
123 | if (inode) { | 123 | if (inode) { |
124 | struct address_space * const mapping = &inode->i_data; | 124 | struct address_space * const mapping = &inode->i_data; |
125 | 125 | ||
126 | inode->i_sb = sb; | 126 | inode->i_sb = sb; |
127 | inode->i_blkbits = sb->s_blocksize_bits; | 127 | inode->i_blkbits = sb->s_blocksize_bits; |
128 | inode->i_flags = 0; | 128 | inode->i_flags = 0; |
129 | atomic_set(&inode->i_count, 1); | 129 | atomic_set(&inode->i_count, 1); |
130 | inode->i_op = &empty_iops; | 130 | inode->i_op = &empty_iops; |
131 | inode->i_fop = &empty_fops; | 131 | inode->i_fop = &empty_fops; |
132 | inode->i_nlink = 1; | 132 | inode->i_nlink = 1; |
133 | atomic_set(&inode->i_writecount, 0); | 133 | atomic_set(&inode->i_writecount, 0); |
134 | inode->i_size = 0; | 134 | inode->i_size = 0; |
135 | inode->i_blocks = 0; | 135 | inode->i_blocks = 0; |
136 | inode->i_bytes = 0; | 136 | inode->i_bytes = 0; |
137 | inode->i_generation = 0; | 137 | inode->i_generation = 0; |
138 | #ifdef CONFIG_QUOTA | 138 | #ifdef CONFIG_QUOTA |
139 | memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); | 139 | memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); |
140 | #endif | 140 | #endif |
141 | inode->i_pipe = NULL; | 141 | inode->i_pipe = NULL; |
142 | inode->i_bdev = NULL; | 142 | inode->i_bdev = NULL; |
143 | inode->i_cdev = NULL; | 143 | inode->i_cdev = NULL; |
144 | inode->i_rdev = 0; | 144 | inode->i_rdev = 0; |
145 | inode->dirtied_when = 0; | 145 | inode->dirtied_when = 0; |
146 | if (security_inode_alloc(inode)) { | 146 | if (security_inode_alloc(inode)) { |
147 | if (inode->i_sb->s_op->destroy_inode) | 147 | if (inode->i_sb->s_op->destroy_inode) |
148 | inode->i_sb->s_op->destroy_inode(inode); | 148 | inode->i_sb->s_op->destroy_inode(inode); |
149 | else | 149 | else |
150 | kmem_cache_free(inode_cachep, (inode)); | 150 | kmem_cache_free(inode_cachep, (inode)); |
151 | return NULL; | 151 | return NULL; |
152 | } | 152 | } |
153 | 153 | ||
154 | spin_lock_init(&inode->i_lock); | 154 | spin_lock_init(&inode->i_lock); |
155 | lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); | 155 | lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); |
156 | 156 | ||
157 | mutex_init(&inode->i_mutex); | 157 | mutex_init(&inode->i_mutex); |
158 | lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); | 158 | lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); |
159 | 159 | ||
160 | init_rwsem(&inode->i_alloc_sem); | 160 | init_rwsem(&inode->i_alloc_sem); |
161 | lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); | 161 | lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); |
162 | 162 | ||
163 | mapping->a_ops = &empty_aops; | 163 | mapping->a_ops = &empty_aops; |
164 | mapping->host = inode; | 164 | mapping->host = inode; |
165 | mapping->flags = 0; | 165 | mapping->flags = 0; |
166 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); | 166 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); |
167 | mapping->assoc_mapping = NULL; | 167 | mapping->assoc_mapping = NULL; |
168 | mapping->backing_dev_info = &default_backing_dev_info; | 168 | mapping->backing_dev_info = &default_backing_dev_info; |
169 | 169 | ||
170 | /* | 170 | /* |
171 | * If the block_device provides a backing_dev_info for client | 171 | * If the block_device provides a backing_dev_info for client |
172 | * inodes then use that. Otherwise the inode share the bdev's | 172 | * inodes then use that. Otherwise the inode share the bdev's |
173 | * backing_dev_info. | 173 | * backing_dev_info. |
174 | */ | 174 | */ |
175 | if (sb->s_bdev) { | 175 | if (sb->s_bdev) { |
176 | struct backing_dev_info *bdi; | 176 | struct backing_dev_info *bdi; |
177 | 177 | ||
178 | bdi = sb->s_bdev->bd_inode_backing_dev_info; | 178 | bdi = sb->s_bdev->bd_inode_backing_dev_info; |
179 | if (!bdi) | 179 | if (!bdi) |
180 | bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; | 180 | bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; |
181 | mapping->backing_dev_info = bdi; | 181 | mapping->backing_dev_info = bdi; |
182 | } | 182 | } |
183 | inode->i_private = NULL; | 183 | inode->i_private = NULL; |
184 | inode->i_mapping = mapping; | 184 | inode->i_mapping = mapping; |
185 | } | 185 | } |
186 | return inode; | 186 | return inode; |
187 | } | 187 | } |
188 | 188 | ||
189 | void destroy_inode(struct inode *inode) | 189 | void destroy_inode(struct inode *inode) |
190 | { | 190 | { |
191 | BUG_ON(inode_has_buffers(inode)); | 191 | BUG_ON(inode_has_buffers(inode)); |
192 | security_inode_free(inode); | 192 | security_inode_free(inode); |
193 | if (inode->i_sb->s_op->destroy_inode) | 193 | if (inode->i_sb->s_op->destroy_inode) |
194 | inode->i_sb->s_op->destroy_inode(inode); | 194 | inode->i_sb->s_op->destroy_inode(inode); |
195 | else | 195 | else |
196 | kmem_cache_free(inode_cachep, (inode)); | 196 | kmem_cache_free(inode_cachep, (inode)); |
197 | } | 197 | } |
198 | 198 | ||
199 | 199 | ||
200 | /* | 200 | /* |
201 | * These are initializations that only need to be done | 201 | * These are initializations that only need to be done |
202 | * once, because the fields are idempotent across use | 202 | * once, because the fields are idempotent across use |
203 | * of the inode, so let the slab aware of that. | 203 | * of the inode, so let the slab aware of that. |
204 | */ | 204 | */ |
205 | void inode_init_once(struct inode *inode) | 205 | void inode_init_once(struct inode *inode) |
206 | { | 206 | { |
207 | memset(inode, 0, sizeof(*inode)); | 207 | memset(inode, 0, sizeof(*inode)); |
208 | INIT_HLIST_NODE(&inode->i_hash); | 208 | INIT_HLIST_NODE(&inode->i_hash); |
209 | INIT_LIST_HEAD(&inode->i_dentry); | 209 | INIT_LIST_HEAD(&inode->i_dentry); |
210 | INIT_LIST_HEAD(&inode->i_devices); | 210 | INIT_LIST_HEAD(&inode->i_devices); |
211 | INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); | 211 | INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); |
212 | rwlock_init(&inode->i_data.tree_lock); | 212 | rwlock_init(&inode->i_data.tree_lock); |
213 | spin_lock_init(&inode->i_data.i_mmap_lock); | 213 | spin_lock_init(&inode->i_data.i_mmap_lock); |
214 | INIT_LIST_HEAD(&inode->i_data.private_list); | 214 | INIT_LIST_HEAD(&inode->i_data.private_list); |
215 | spin_lock_init(&inode->i_data.private_lock); | 215 | spin_lock_init(&inode->i_data.private_lock); |
216 | INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); | 216 | INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); |
217 | INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); | 217 | INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); |
218 | i_size_ordered_init(inode); | 218 | i_size_ordered_init(inode); |
219 | #ifdef CONFIG_INOTIFY | 219 | #ifdef CONFIG_INOTIFY |
220 | INIT_LIST_HEAD(&inode->inotify_watches); | 220 | INIT_LIST_HEAD(&inode->inotify_watches); |
221 | mutex_init(&inode->inotify_mutex); | 221 | mutex_init(&inode->inotify_mutex); |
222 | #endif | 222 | #endif |
223 | } | 223 | } |
224 | 224 | ||
225 | EXPORT_SYMBOL(inode_init_once); | 225 | EXPORT_SYMBOL(inode_init_once); |
226 | 226 | ||
/*
 * Slab constructor: @foo is the object being constructed, which is
 * treated as a struct inode.  @cachep is not used.
 */
static void init_once(struct kmem_cache * cachep, void *foo)
{
	inode_init_once((struct inode *) foo);
}
233 | 233 | ||
234 | /* | 234 | /* |
235 | * inode_lock must be held | 235 | * inode_lock must be held |
236 | */ | 236 | */ |
237 | void __iget(struct inode * inode) | 237 | void __iget(struct inode * inode) |
238 | { | 238 | { |
239 | if (atomic_read(&inode->i_count)) { | 239 | if (atomic_read(&inode->i_count)) { |
240 | atomic_inc(&inode->i_count); | 240 | atomic_inc(&inode->i_count); |
241 | return; | 241 | return; |
242 | } | 242 | } |
243 | atomic_inc(&inode->i_count); | 243 | atomic_inc(&inode->i_count); |
244 | if (!(inode->i_state & (I_DIRTY|I_SYNC))) | 244 | if (!(inode->i_state & (I_DIRTY|I_SYNC))) |
245 | list_move(&inode->i_list, &inode_in_use); | 245 | list_move(&inode->i_list, &inode_in_use); |
246 | inodes_stat.nr_unused--; | 246 | inodes_stat.nr_unused--; |
247 | } | 247 | } |
248 | 248 | ||
249 | /** | 249 | /** |
250 | * clear_inode - clear an inode | 250 | * clear_inode - clear an inode |
251 | * @inode: inode to clear | 251 | * @inode: inode to clear |
252 | * | 252 | * |
253 | * This is called by the filesystem to tell us | 253 | * This is called by the filesystem to tell us |
254 | * that the inode is no longer useful. We just | 254 | * that the inode is no longer useful. We just |
255 | * terminate it with extreme prejudice. | 255 | * terminate it with extreme prejudice. |
256 | */ | 256 | */ |
257 | void clear_inode(struct inode *inode) | 257 | void clear_inode(struct inode *inode) |
258 | { | 258 | { |
259 | might_sleep(); | 259 | might_sleep(); |
260 | invalidate_inode_buffers(inode); | 260 | invalidate_inode_buffers(inode); |
261 | 261 | ||
262 | BUG_ON(inode->i_data.nrpages); | 262 | BUG_ON(inode->i_data.nrpages); |
263 | BUG_ON(!(inode->i_state & I_FREEING)); | 263 | BUG_ON(!(inode->i_state & I_FREEING)); |
264 | BUG_ON(inode->i_state & I_CLEAR); | 264 | BUG_ON(inode->i_state & I_CLEAR); |
265 | inode_sync_wait(inode); | 265 | inode_sync_wait(inode); |
266 | DQUOT_DROP(inode); | 266 | DQUOT_DROP(inode); |
267 | if (inode->i_sb->s_op->clear_inode) | 267 | if (inode->i_sb->s_op->clear_inode) |
268 | inode->i_sb->s_op->clear_inode(inode); | 268 | inode->i_sb->s_op->clear_inode(inode); |
269 | if (S_ISBLK(inode->i_mode) && inode->i_bdev) | 269 | if (S_ISBLK(inode->i_mode) && inode->i_bdev) |
270 | bd_forget(inode); | 270 | bd_forget(inode); |
271 | if (S_ISCHR(inode->i_mode) && inode->i_cdev) | 271 | if (S_ISCHR(inode->i_mode) && inode->i_cdev) |
272 | cd_forget(inode); | 272 | cd_forget(inode); |
273 | inode->i_state = I_CLEAR; | 273 | inode->i_state = I_CLEAR; |
274 | } | 274 | } |
275 | 275 | ||
276 | EXPORT_SYMBOL(clear_inode); | 276 | EXPORT_SYMBOL(clear_inode); |
277 | 277 | ||
278 | /* | 278 | /* |
279 | * dispose_list - dispose of the contents of a local list | 279 | * dispose_list - dispose of the contents of a local list |
280 | * @head: the head of the list to free | 280 | * @head: the head of the list to free |
281 | * | 281 | * |
282 | * Dispose-list gets a local list with local inodes in it, so it doesn't | 282 | * Dispose-list gets a local list with local inodes in it, so it doesn't |
283 | * need to worry about list corruption and SMP locks. | 283 | * need to worry about list corruption and SMP locks. |
284 | */ | 284 | */ |
285 | static void dispose_list(struct list_head *head) | 285 | static void dispose_list(struct list_head *head) |
286 | { | 286 | { |
287 | int nr_disposed = 0; | 287 | int nr_disposed = 0; |
288 | 288 | ||
289 | while (!list_empty(head)) { | 289 | while (!list_empty(head)) { |
290 | struct inode *inode; | 290 | struct inode *inode; |
291 | 291 | ||
292 | inode = list_first_entry(head, struct inode, i_list); | 292 | inode = list_first_entry(head, struct inode, i_list); |
293 | list_del(&inode->i_list); | 293 | list_del(&inode->i_list); |
294 | 294 | ||
295 | if (inode->i_data.nrpages) | 295 | if (inode->i_data.nrpages) |
296 | truncate_inode_pages(&inode->i_data, 0); | 296 | truncate_inode_pages(&inode->i_data, 0); |
297 | clear_inode(inode); | 297 | clear_inode(inode); |
298 | 298 | ||
299 | spin_lock(&inode_lock); | 299 | spin_lock(&inode_lock); |
300 | hlist_del_init(&inode->i_hash); | 300 | hlist_del_init(&inode->i_hash); |
301 | list_del_init(&inode->i_sb_list); | 301 | list_del_init(&inode->i_sb_list); |
302 | spin_unlock(&inode_lock); | 302 | spin_unlock(&inode_lock); |
303 | 303 | ||
304 | wake_up_inode(inode); | 304 | wake_up_inode(inode); |
305 | destroy_inode(inode); | 305 | destroy_inode(inode); |
306 | nr_disposed++; | 306 | nr_disposed++; |
307 | } | 307 | } |
308 | spin_lock(&inode_lock); | 308 | spin_lock(&inode_lock); |
309 | inodes_stat.nr_inodes -= nr_disposed; | 309 | inodes_stat.nr_inodes -= nr_disposed; |
310 | spin_unlock(&inode_lock); | 310 | spin_unlock(&inode_lock); |
311 | } | 311 | } |
312 | 312 | ||
313 | /* | 313 | /* |
314 | * Invalidate all inodes for a device. | 314 | * Invalidate all inodes for a device. |
315 | */ | 315 | */ |
316 | static int invalidate_list(struct list_head *head, struct list_head *dispose) | 316 | static int invalidate_list(struct list_head *head, struct list_head *dispose) |
317 | { | 317 | { |
318 | struct list_head *next; | 318 | struct list_head *next; |
319 | int busy = 0, count = 0; | 319 | int busy = 0, count = 0; |
320 | 320 | ||
321 | next = head->next; | 321 | next = head->next; |
322 | for (;;) { | 322 | for (;;) { |
323 | struct list_head * tmp = next; | 323 | struct list_head * tmp = next; |
324 | struct inode * inode; | 324 | struct inode * inode; |
325 | 325 | ||
326 | /* | 326 | /* |
327 | * We can reschedule here without worrying about the list's | 327 | * We can reschedule here without worrying about the list's |
328 | * consistency because the per-sb list of inodes must not | 328 | * consistency because the per-sb list of inodes must not |
329 | * change during umount anymore, and because iprune_mutex keeps | 329 | * change during umount anymore, and because iprune_mutex keeps |
330 | * shrink_icache_memory() away. | 330 | * shrink_icache_memory() away. |
331 | */ | 331 | */ |
332 | cond_resched_lock(&inode_lock); | 332 | cond_resched_lock(&inode_lock); |
333 | 333 | ||
334 | next = next->next; | 334 | next = next->next; |
335 | if (tmp == head) | 335 | if (tmp == head) |
336 | break; | 336 | break; |
337 | inode = list_entry(tmp, struct inode, i_sb_list); | 337 | inode = list_entry(tmp, struct inode, i_sb_list); |
338 | invalidate_inode_buffers(inode); | 338 | invalidate_inode_buffers(inode); |
339 | if (!atomic_read(&inode->i_count)) { | 339 | if (!atomic_read(&inode->i_count)) { |
340 | list_move(&inode->i_list, dispose); | 340 | list_move(&inode->i_list, dispose); |
341 | inode->i_state |= I_FREEING; | 341 | inode->i_state |= I_FREEING; |
342 | count++; | 342 | count++; |
343 | continue; | 343 | continue; |
344 | } | 344 | } |
345 | busy = 1; | 345 | busy = 1; |
346 | } | 346 | } |
347 | /* only unused inodes may be cached with i_count zero */ | 347 | /* only unused inodes may be cached with i_count zero */ |
348 | inodes_stat.nr_unused -= count; | 348 | inodes_stat.nr_unused -= count; |
349 | return busy; | 349 | return busy; |
350 | } | 350 | } |
351 | 351 | ||
352 | /** | 352 | /** |
353 | * invalidate_inodes - discard the inodes on a device | 353 | * invalidate_inodes - discard the inodes on a device |
354 | * @sb: superblock | 354 | * @sb: superblock |
355 | * | 355 | * |
356 | * Discard all of the inodes for a given superblock. If the discard | 356 | * Discard all of the inodes for a given superblock. If the discard |
357 | * fails because there are busy inodes then a non zero value is returned. | 357 | * fails because there are busy inodes then a non zero value is returned. |
358 | * If the discard is successful all the inodes have been discarded. | 358 | * If the discard is successful all the inodes have been discarded. |
359 | */ | 359 | */ |
360 | int invalidate_inodes(struct super_block * sb) | 360 | int invalidate_inodes(struct super_block * sb) |
361 | { | 361 | { |
362 | int busy; | 362 | int busy; |
363 | LIST_HEAD(throw_away); | 363 | LIST_HEAD(throw_away); |
364 | 364 | ||
365 | mutex_lock(&iprune_mutex); | 365 | mutex_lock(&iprune_mutex); |
366 | spin_lock(&inode_lock); | 366 | spin_lock(&inode_lock); |
367 | inotify_unmount_inodes(&sb->s_inodes); | 367 | inotify_unmount_inodes(&sb->s_inodes); |
368 | busy = invalidate_list(&sb->s_inodes, &throw_away); | 368 | busy = invalidate_list(&sb->s_inodes, &throw_away); |
369 | spin_unlock(&inode_lock); | 369 | spin_unlock(&inode_lock); |
370 | 370 | ||
371 | dispose_list(&throw_away); | 371 | dispose_list(&throw_away); |
372 | mutex_unlock(&iprune_mutex); | 372 | mutex_unlock(&iprune_mutex); |
373 | 373 | ||
374 | return busy; | 374 | return busy; |
375 | } | 375 | } |
376 | 376 | ||
377 | EXPORT_SYMBOL(invalidate_inodes); | 377 | EXPORT_SYMBOL(invalidate_inodes); |
378 | 378 | ||
379 | static int can_unuse(struct inode *inode) | 379 | static int can_unuse(struct inode *inode) |
380 | { | 380 | { |
381 | if (inode->i_state) | 381 | if (inode->i_state) |
382 | return 0; | 382 | return 0; |
383 | if (inode_has_buffers(inode)) | 383 | if (inode_has_buffers(inode)) |
384 | return 0; | 384 | return 0; |
385 | if (atomic_read(&inode->i_count)) | 385 | if (atomic_read(&inode->i_count)) |
386 | return 0; | 386 | return 0; |
387 | if (inode->i_data.nrpages) | 387 | if (inode->i_data.nrpages) |
388 | return 0; | 388 | return 0; |
389 | return 1; | 389 | return 1; |
390 | } | 390 | } |
391 | 391 | ||
392 | /* | 392 | /* |
393 | * Scan `goal' inodes on the unused list for freeable ones. They are moved to | 393 | * Scan `goal' inodes on the unused list for freeable ones. They are moved to |
394 | * a temporary list and then are freed outside inode_lock by dispose_list(). | 394 | * a temporary list and then are freed outside inode_lock by dispose_list(). |
395 | * | 395 | * |
396 | * Any inodes which are pinned purely because of attached pagecache have their | 396 | * Any inodes which are pinned purely because of attached pagecache have their |
397 | * pagecache removed. We expect the final iput() on that inode to add it to | 397 | * pagecache removed. We expect the final iput() on that inode to add it to |
398 | * the front of the inode_unused list. So look for it there and if the | 398 | * the front of the inode_unused list. So look for it there and if the |
399 | * inode is still freeable, proceed. The right inode is found 99.9% of the | 399 | * inode is still freeable, proceed. The right inode is found 99.9% of the |
400 | * time in testing on a 4-way. | 400 | * time in testing on a 4-way. |
401 | * | 401 | * |
402 | * If the inode has metadata buffers attached to mapping->private_list then | 402 | * If the inode has metadata buffers attached to mapping->private_list then |
403 | * try to remove them. | 403 | * try to remove them. |
404 | */ | 404 | */ |
405 | static void prune_icache(int nr_to_scan) | 405 | static void prune_icache(int nr_to_scan) |
406 | { | 406 | { |
407 | LIST_HEAD(freeable); | 407 | LIST_HEAD(freeable); |
408 | int nr_pruned = 0; | 408 | int nr_pruned = 0; |
409 | int nr_scanned; | 409 | int nr_scanned; |
410 | unsigned long reap = 0; | 410 | unsigned long reap = 0; |
411 | 411 | ||
412 | mutex_lock(&iprune_mutex); | 412 | mutex_lock(&iprune_mutex); |
413 | spin_lock(&inode_lock); | 413 | spin_lock(&inode_lock); |
414 | for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { | 414 | for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { |
415 | struct inode *inode; | 415 | struct inode *inode; |
416 | 416 | ||
417 | if (list_empty(&inode_unused)) | 417 | if (list_empty(&inode_unused)) |
418 | break; | 418 | break; |
419 | 419 | ||
420 | inode = list_entry(inode_unused.prev, struct inode, i_list); | 420 | inode = list_entry(inode_unused.prev, struct inode, i_list); |
421 | 421 | ||
422 | if (inode->i_state || atomic_read(&inode->i_count)) { | 422 | if (inode->i_state || atomic_read(&inode->i_count)) { |
423 | list_move(&inode->i_list, &inode_unused); | 423 | list_move(&inode->i_list, &inode_unused); |
424 | continue; | 424 | continue; |
425 | } | 425 | } |
426 | if (inode_has_buffers(inode) || inode->i_data.nrpages) { | 426 | if (inode_has_buffers(inode) || inode->i_data.nrpages) { |
427 | __iget(inode); | 427 | __iget(inode); |
428 | spin_unlock(&inode_lock); | 428 | spin_unlock(&inode_lock); |
429 | if (remove_inode_buffers(inode)) | 429 | if (remove_inode_buffers(inode)) |
430 | reap += invalidate_mapping_pages(&inode->i_data, | 430 | reap += invalidate_mapping_pages(&inode->i_data, |
431 | 0, -1); | 431 | 0, -1); |
432 | iput(inode); | 432 | iput(inode); |
433 | spin_lock(&inode_lock); | 433 | spin_lock(&inode_lock); |
434 | 434 | ||
435 | if (inode != list_entry(inode_unused.next, | 435 | if (inode != list_entry(inode_unused.next, |
436 | struct inode, i_list)) | 436 | struct inode, i_list)) |
437 | continue; /* wrong inode or list_empty */ | 437 | continue; /* wrong inode or list_empty */ |
438 | if (!can_unuse(inode)) | 438 | if (!can_unuse(inode)) |
439 | continue; | 439 | continue; |
440 | } | 440 | } |
441 | list_move(&inode->i_list, &freeable); | 441 | list_move(&inode->i_list, &freeable); |
442 | inode->i_state |= I_FREEING; | 442 | inode->i_state |= I_FREEING; |
443 | nr_pruned++; | 443 | nr_pruned++; |
444 | } | 444 | } |
445 | inodes_stat.nr_unused -= nr_pruned; | 445 | inodes_stat.nr_unused -= nr_pruned; |
446 | if (current_is_kswapd()) | 446 | if (current_is_kswapd()) |
447 | __count_vm_events(KSWAPD_INODESTEAL, reap); | 447 | __count_vm_events(KSWAPD_INODESTEAL, reap); |
448 | else | 448 | else |
449 | __count_vm_events(PGINODESTEAL, reap); | 449 | __count_vm_events(PGINODESTEAL, reap); |
450 | spin_unlock(&inode_lock); | 450 | spin_unlock(&inode_lock); |
451 | 451 | ||
452 | dispose_list(&freeable); | 452 | dispose_list(&freeable); |
453 | mutex_unlock(&iprune_mutex); | 453 | mutex_unlock(&iprune_mutex); |
454 | } | 454 | } |
455 | 455 | ||
456 | /* | 456 | /* |
457 | * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, | 457 | * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, |
458 | * "unused" means that no dentries are referring to the inodes: the files are | 458 | * "unused" means that no dentries are referring to the inodes: the files are |
459 | * not open and the dcache references to those inodes have already been | 459 | * not open and the dcache references to those inodes have already been |
460 | * reclaimed. | 460 | * reclaimed. |
461 | * | 461 | * |
462 | * This function is passed the number of inodes to scan, and it returns the | 462 | * This function is passed the number of inodes to scan, and it returns the |
463 | * total number of remaining possibly-reclaimable inodes. | 463 | * total number of remaining possibly-reclaimable inodes. |
464 | */ | 464 | */ |
465 | static int shrink_icache_memory(int nr, gfp_t gfp_mask) | 465 | static int shrink_icache_memory(int nr, gfp_t gfp_mask) |
466 | { | 466 | { |
467 | if (nr) { | 467 | if (nr) { |
468 | /* | 468 | /* |
469 | * Nasty deadlock avoidance. We may hold various FS locks, | 469 | * Nasty deadlock avoidance. We may hold various FS locks, |
470 | * and we don't want to recurse into the FS that called us | 470 | * and we don't want to recurse into the FS that called us |
471 | * in clear_inode() and friends.. | 471 | * in clear_inode() and friends.. |
472 | */ | 472 | */ |
473 | if (!(gfp_mask & __GFP_FS)) | 473 | if (!(gfp_mask & __GFP_FS)) |
474 | return -1; | 474 | return -1; |
475 | prune_icache(nr); | 475 | prune_icache(nr); |
476 | } | 476 | } |
477 | return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; | 477 | return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; |
478 | } | 478 | } |
479 | 479 | ||
480 | static struct shrinker icache_shrinker = { | 480 | static struct shrinker icache_shrinker = { |
481 | .shrink = shrink_icache_memory, | 481 | .shrink = shrink_icache_memory, |
482 | .seeks = DEFAULT_SEEKS, | 482 | .seeks = DEFAULT_SEEKS, |
483 | }; | 483 | }; |
484 | 484 | ||
485 | static void __wait_on_freeing_inode(struct inode *inode); | 485 | static void __wait_on_freeing_inode(struct inode *inode); |
486 | /* | 486 | /* |
487 | * Called with the inode lock held. | 487 | * Called with the inode lock held. |
488 | * NOTE: we are not increasing the inode-refcount, you must call __iget() | 488 | * NOTE: we are not increasing the inode-refcount, you must call __iget() |
489 | * by hand after calling find_inode now! This simplifies iunique and won't | 489 | * by hand after calling find_inode now! This simplifies iunique and won't |
490 | * add any additional branch in the common code. | 490 | * add any additional branch in the common code. |
491 | */ | 491 | */ |
492 | static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) | 492 | static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) |
493 | { | 493 | { |
494 | struct hlist_node *node; | 494 | struct hlist_node *node; |
495 | struct inode * inode = NULL; | 495 | struct inode * inode = NULL; |
496 | 496 | ||
497 | repeat: | 497 | repeat: |
498 | hlist_for_each (node, head) { | 498 | hlist_for_each (node, head) { |
499 | inode = hlist_entry(node, struct inode, i_hash); | 499 | inode = hlist_entry(node, struct inode, i_hash); |
500 | if (inode->i_sb != sb) | 500 | if (inode->i_sb != sb) |
501 | continue; | 501 | continue; |
502 | if (!test(inode, data)) | 502 | if (!test(inode, data)) |
503 | continue; | 503 | continue; |
504 | if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { | 504 | if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { |
505 | __wait_on_freeing_inode(inode); | 505 | __wait_on_freeing_inode(inode); |
506 | goto repeat; | 506 | goto repeat; |
507 | } | 507 | } |
508 | break; | 508 | break; |
509 | } | 509 | } |
510 | return node ? inode : NULL; | 510 | return node ? inode : NULL; |
511 | } | 511 | } |
512 | 512 | ||
513 | /* | 513 | /* |
514 | * find_inode_fast is the fast path version of find_inode, see the comment at | 514 | * find_inode_fast is the fast path version of find_inode, see the comment at |
515 | * iget_locked for details. | 515 | * iget_locked for details. |
516 | */ | 516 | */ |
517 | static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) | 517 | static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) |
518 | { | 518 | { |
519 | struct hlist_node *node; | 519 | struct hlist_node *node; |
520 | struct inode * inode = NULL; | 520 | struct inode * inode = NULL; |
521 | 521 | ||
522 | repeat: | 522 | repeat: |
523 | hlist_for_each (node, head) { | 523 | hlist_for_each (node, head) { |
524 | inode = hlist_entry(node, struct inode, i_hash); | 524 | inode = hlist_entry(node, struct inode, i_hash); |
525 | if (inode->i_ino != ino) | 525 | if (inode->i_ino != ino) |
526 | continue; | 526 | continue; |
527 | if (inode->i_sb != sb) | 527 | if (inode->i_sb != sb) |
528 | continue; | 528 | continue; |
529 | if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { | 529 | if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { |
530 | __wait_on_freeing_inode(inode); | 530 | __wait_on_freeing_inode(inode); |
531 | goto repeat; | 531 | goto repeat; |
532 | } | 532 | } |
533 | break; | 533 | break; |
534 | } | 534 | } |
535 | return node ? inode : NULL; | 535 | return node ? inode : NULL; |
536 | } | 536 | } |
537 | 537 | ||
538 | /** | 538 | /** |
539 | * new_inode - obtain an inode | 539 | * new_inode - obtain an inode |
540 | * @sb: superblock | 540 | * @sb: superblock |
541 | * | 541 | * |
542 | * Allocates a new inode for given superblock. The default gfp_mask | 542 | * Allocates a new inode for given superblock. The default gfp_mask |
543 | * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. | 543 | * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. |
544 | * If HIGHMEM pages are unsuitable or it is known that pages allocated | 544 | * If HIGHMEM pages are unsuitable or it is known that pages allocated |
545 | * for the page cache are not reclaimable or migratable, | 545 | * for the page cache are not reclaimable or migratable, |
546 | * mapping_set_gfp_mask() must be called with suitable flags on the | 546 | * mapping_set_gfp_mask() must be called with suitable flags on the |
547 | * newly created inode's mapping | 547 | * newly created inode's mapping |
548 | * | 548 | * |
549 | */ | 549 | */ |
550 | struct inode *new_inode(struct super_block *sb) | 550 | struct inode *new_inode(struct super_block *sb) |
551 | { | 551 | { |
552 | /* | 552 | /* |
553 | * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW | 553 | * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW |
554 | * error if st_ino won't fit in target struct field. Use 32bit counter | 554 | * error if st_ino won't fit in target struct field. Use 32bit counter |
555 | * here to attempt to avoid that. | 555 | * here to attempt to avoid that. |
556 | */ | 556 | */ |
557 | static unsigned int last_ino; | 557 | static unsigned int last_ino; |
558 | struct inode * inode; | 558 | struct inode * inode; |
559 | 559 | ||
560 | spin_lock_prefetch(&inode_lock); | 560 | spin_lock_prefetch(&inode_lock); |
561 | 561 | ||
562 | inode = alloc_inode(sb); | 562 | inode = alloc_inode(sb); |
563 | if (inode) { | 563 | if (inode) { |
564 | spin_lock(&inode_lock); | 564 | spin_lock(&inode_lock); |
565 | inodes_stat.nr_inodes++; | 565 | inodes_stat.nr_inodes++; |
566 | list_add(&inode->i_list, &inode_in_use); | 566 | list_add(&inode->i_list, &inode_in_use); |
567 | list_add(&inode->i_sb_list, &sb->s_inodes); | 567 | list_add(&inode->i_sb_list, &sb->s_inodes); |
568 | inode->i_ino = ++last_ino; | 568 | inode->i_ino = ++last_ino; |
569 | inode->i_state = 0; | 569 | inode->i_state = 0; |
570 | spin_unlock(&inode_lock); | 570 | spin_unlock(&inode_lock); |
571 | } | 571 | } |
572 | return inode; | 572 | return inode; |
573 | } | 573 | } |
574 | 574 | ||
575 | EXPORT_SYMBOL(new_inode); | 575 | EXPORT_SYMBOL(new_inode); |
576 | 576 | ||
577 | void unlock_new_inode(struct inode *inode) | 577 | void unlock_new_inode(struct inode *inode) |
578 | { | 578 | { |
579 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 579 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
580 | if (inode->i_mode & S_IFDIR) { | 580 | if (inode->i_mode & S_IFDIR) { |
581 | struct file_system_type *type = inode->i_sb->s_type; | 581 | struct file_system_type *type = inode->i_sb->s_type; |
582 | 582 | ||
583 | /* | 583 | /* |
584 | * ensure nobody is actually holding i_mutex | 584 | * ensure nobody is actually holding i_mutex |
585 | */ | 585 | */ |
586 | mutex_destroy(&inode->i_mutex); | 586 | mutex_destroy(&inode->i_mutex); |
587 | mutex_init(&inode->i_mutex); | 587 | mutex_init(&inode->i_mutex); |
588 | lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key); | 588 | lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key); |
589 | } | 589 | } |
590 | #endif | 590 | #endif |
591 | /* | 591 | /* |
592 | * This is special! We do not need the spinlock | 592 | * This is special! We do not need the spinlock |
593 | * when clearing I_LOCK, because we're guaranteed | 593 | * when clearing I_LOCK, because we're guaranteed |
594 | * that nobody else tries to do anything about the | 594 | * that nobody else tries to do anything about the |
595 | * state of the inode when it is locked, as we | 595 | * state of the inode when it is locked, as we |
596 | * just created it (so there can be no old holders | 596 | * just created it (so there can be no old holders |
597 | * that haven't tested I_LOCK). | 597 | * that haven't tested I_LOCK). |
598 | */ | 598 | */ |
599 | inode->i_state &= ~(I_LOCK|I_NEW); | 599 | inode->i_state &= ~(I_LOCK|I_NEW); |
600 | wake_up_inode(inode); | 600 | wake_up_inode(inode); |
601 | } | 601 | } |
602 | 602 | ||
603 | EXPORT_SYMBOL(unlock_new_inode); | 603 | EXPORT_SYMBOL(unlock_new_inode); |
604 | 604 | ||
605 | /* | 605 | /* |
606 | * This is called without the inode lock held.. Be careful. | 606 | * This is called without the inode lock held.. Be careful. |
607 | * | 607 | * |
608 | * We no longer cache the sb_flags in i_flags - see fs.h | 608 | * We no longer cache the sb_flags in i_flags - see fs.h |
609 | * -- rmk@arm.uk.linux.org | 609 | * -- rmk@arm.uk.linux.org |
610 | */ | 610 | */ |
611 | static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) | 611 | static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) |
612 | { | 612 | { |
613 | struct inode * inode; | 613 | struct inode * inode; |
614 | 614 | ||
615 | inode = alloc_inode(sb); | 615 | inode = alloc_inode(sb); |
616 | if (inode) { | 616 | if (inode) { |
617 | struct inode * old; | 617 | struct inode * old; |
618 | 618 | ||
619 | spin_lock(&inode_lock); | 619 | spin_lock(&inode_lock); |
620 | /* We released the lock, so.. */ | 620 | /* We released the lock, so.. */ |
621 | old = find_inode(sb, head, test, data); | 621 | old = find_inode(sb, head, test, data); |
622 | if (!old) { | 622 | if (!old) { |
623 | if (set(inode, data)) | 623 | if (set(inode, data)) |
624 | goto set_failed; | 624 | goto set_failed; |
625 | 625 | ||
626 | inodes_stat.nr_inodes++; | 626 | inodes_stat.nr_inodes++; |
627 | list_add(&inode->i_list, &inode_in_use); | 627 | list_add(&inode->i_list, &inode_in_use); |
628 | list_add(&inode->i_sb_list, &sb->s_inodes); | 628 | list_add(&inode->i_sb_list, &sb->s_inodes); |
629 | hlist_add_head(&inode->i_hash, head); | 629 | hlist_add_head(&inode->i_hash, head); |
630 | inode->i_state = I_LOCK|I_NEW; | 630 | inode->i_state = I_LOCK|I_NEW; |
631 | spin_unlock(&inode_lock); | 631 | spin_unlock(&inode_lock); |
632 | 632 | ||
633 | /* Return the locked inode with I_NEW set, the | 633 | /* Return the locked inode with I_NEW set, the |
634 | * caller is responsible for filling in the contents | 634 | * caller is responsible for filling in the contents |
635 | */ | 635 | */ |
636 | return inode; | 636 | return inode; |
637 | } | 637 | } |
638 | 638 | ||
639 | /* | 639 | /* |
640 | * Uhhuh, somebody else created the same inode under | 640 | * Uhhuh, somebody else created the same inode under |
641 | * us. Use the old inode instead of the one we just | 641 | * us. Use the old inode instead of the one we just |
642 | * allocated. | 642 | * allocated. |
643 | */ | 643 | */ |
644 | __iget(old); | 644 | __iget(old); |
645 | spin_unlock(&inode_lock); | 645 | spin_unlock(&inode_lock); |
646 | destroy_inode(inode); | 646 | destroy_inode(inode); |
647 | inode = old; | 647 | inode = old; |
648 | wait_on_inode(inode); | 648 | wait_on_inode(inode); |
649 | } | 649 | } |
650 | return inode; | 650 | return inode; |
651 | 651 | ||
652 | set_failed: | 652 | set_failed: |
653 | spin_unlock(&inode_lock); | 653 | spin_unlock(&inode_lock); |
654 | destroy_inode(inode); | 654 | destroy_inode(inode); |
655 | return NULL; | 655 | return NULL; |
656 | } | 656 | } |
657 | 657 | ||
658 | /* | 658 | /* |
659 | * get_new_inode_fast is the fast path version of get_new_inode, see the | 659 | * get_new_inode_fast is the fast path version of get_new_inode, see the |
660 | * comment at iget_locked for details. | 660 | * comment at iget_locked for details. |
661 | */ | 661 | */ |
662 | static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) | 662 | static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) |
663 | { | 663 | { |
664 | struct inode * inode; | 664 | struct inode * inode; |
665 | 665 | ||
666 | inode = alloc_inode(sb); | 666 | inode = alloc_inode(sb); |
667 | if (inode) { | 667 | if (inode) { |
668 | struct inode * old; | 668 | struct inode * old; |
669 | 669 | ||
670 | spin_lock(&inode_lock); | 670 | spin_lock(&inode_lock); |
671 | /* We released the lock, so.. */ | 671 | /* We released the lock, so.. */ |
672 | old = find_inode_fast(sb, head, ino); | 672 | old = find_inode_fast(sb, head, ino); |
673 | if (!old) { | 673 | if (!old) { |
674 | inode->i_ino = ino; | 674 | inode->i_ino = ino; |
675 | inodes_stat.nr_inodes++; | 675 | inodes_stat.nr_inodes++; |
676 | list_add(&inode->i_list, &inode_in_use); | 676 | list_add(&inode->i_list, &inode_in_use); |
677 | list_add(&inode->i_sb_list, &sb->s_inodes); | 677 | list_add(&inode->i_sb_list, &sb->s_inodes); |
678 | hlist_add_head(&inode->i_hash, head); | 678 | hlist_add_head(&inode->i_hash, head); |
679 | inode->i_state = I_LOCK|I_NEW; | 679 | inode->i_state = I_LOCK|I_NEW; |
680 | spin_unlock(&inode_lock); | 680 | spin_unlock(&inode_lock); |
681 | 681 | ||
682 | /* Return the locked inode with I_NEW set, the | 682 | /* Return the locked inode with I_NEW set, the |
683 | * caller is responsible for filling in the contents | 683 | * caller is responsible for filling in the contents |
684 | */ | 684 | */ |
685 | return inode; | 685 | return inode; |
686 | } | 686 | } |
687 | 687 | ||
688 | /* | 688 | /* |
689 | * Uhhuh, somebody else created the same inode under | 689 | * Uhhuh, somebody else created the same inode under |
690 | * us. Use the old inode instead of the one we just | 690 | * us. Use the old inode instead of the one we just |
691 | * allocated. | 691 | * allocated. |
692 | */ | 692 | */ |
693 | __iget(old); | 693 | __iget(old); |
694 | spin_unlock(&inode_lock); | 694 | spin_unlock(&inode_lock); |
695 | destroy_inode(inode); | 695 | destroy_inode(inode); |
696 | inode = old; | 696 | inode = old; |
697 | wait_on_inode(inode); | 697 | wait_on_inode(inode); |
698 | } | 698 | } |
699 | return inode; | 699 | return inode; |
700 | } | 700 | } |
701 | 701 | ||
702 | static unsigned long hash(struct super_block *sb, unsigned long hashval) | 702 | static unsigned long hash(struct super_block *sb, unsigned long hashval) |
703 | { | 703 | { |
704 | unsigned long tmp; | 704 | unsigned long tmp; |
705 | 705 | ||
706 | tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / | 706 | tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / |
707 | L1_CACHE_BYTES; | 707 | L1_CACHE_BYTES; |
708 | tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); | 708 | tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); |
709 | return tmp & I_HASHMASK; | 709 | return tmp & I_HASHMASK; |
710 | } | 710 | } |
711 | 711 | ||
712 | /** | 712 | /** |
713 | * iunique - get a unique inode number | 713 | * iunique - get a unique inode number |
714 | * @sb: superblock | 714 | * @sb: superblock |
715 | * @max_reserved: highest reserved inode number | 715 | * @max_reserved: highest reserved inode number |
716 | * | 716 | * |
717 | * Obtain an inode number that is unique on the system for a given | 717 | * Obtain an inode number that is unique on the system for a given |
718 | * superblock. This is used by file systems that have no natural | 718 | * superblock. This is used by file systems that have no natural |
719 | * permanent inode numbering system. An inode number is returned that | 719 | * permanent inode numbering system. An inode number is returned that |
720 | * is higher than the reserved limit but unique. | 720 | * is higher than the reserved limit but unique. |
721 | * | 721 | * |
722 | * BUGS: | 722 | * BUGS: |
723 | * With a large number of inodes live on the file system this function | 723 | * With a large number of inodes live on the file system this function |
724 | * currently becomes quite slow. | 724 | * currently becomes quite slow. |
725 | */ | 725 | */ |
726 | ino_t iunique(struct super_block *sb, ino_t max_reserved) | 726 | ino_t iunique(struct super_block *sb, ino_t max_reserved) |
727 | { | 727 | { |
728 | /* | 728 | /* |
729 | * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW | 729 | * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW |
730 | * error if st_ino won't fit in target struct field. Use 32bit counter | 730 | * error if st_ino won't fit in target struct field. Use 32bit counter |
731 | * here to attempt to avoid that. | 731 | * here to attempt to avoid that. |
732 | */ | 732 | */ |
733 | static unsigned int counter; | 733 | static unsigned int counter; |
734 | struct inode *inode; | 734 | struct inode *inode; |
735 | struct hlist_head *head; | 735 | struct hlist_head *head; |
736 | ino_t res; | 736 | ino_t res; |
737 | 737 | ||
738 | spin_lock(&inode_lock); | 738 | spin_lock(&inode_lock); |
739 | do { | 739 | do { |
740 | if (counter <= max_reserved) | 740 | if (counter <= max_reserved) |
741 | counter = max_reserved + 1; | 741 | counter = max_reserved + 1; |
742 | res = counter++; | 742 | res = counter++; |
743 | head = inode_hashtable + hash(sb, res); | 743 | head = inode_hashtable + hash(sb, res); |
744 | inode = find_inode_fast(sb, head, res); | 744 | inode = find_inode_fast(sb, head, res); |
745 | } while (inode != NULL); | 745 | } while (inode != NULL); |
746 | spin_unlock(&inode_lock); | 746 | spin_unlock(&inode_lock); |
747 | 747 | ||
748 | return res; | 748 | return res; |
749 | } | 749 | } |
750 | EXPORT_SYMBOL(iunique); | 750 | EXPORT_SYMBOL(iunique); |
751 | 751 | ||
752 | struct inode *igrab(struct inode *inode) | 752 | struct inode *igrab(struct inode *inode) |
753 | { | 753 | { |
754 | spin_lock(&inode_lock); | 754 | spin_lock(&inode_lock); |
755 | if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) | 755 | if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) |
756 | __iget(inode); | 756 | __iget(inode); |
757 | else | 757 | else |
758 | /* | 758 | /* |
759 | * Handle the case where s_op->clear_inode is not been | 759 | * Handle the case where s_op->clear_inode is not been |
760 | * called yet, and somebody is calling igrab | 760 | * called yet, and somebody is calling igrab |
761 | * while the inode is getting freed. | 761 | * while the inode is getting freed. |
762 | */ | 762 | */ |
763 | inode = NULL; | 763 | inode = NULL; |
764 | spin_unlock(&inode_lock); | 764 | spin_unlock(&inode_lock); |
765 | return inode; | 765 | return inode; |
766 | } | 766 | } |
767 | 767 | ||
768 | EXPORT_SYMBOL(igrab); | 768 | EXPORT_SYMBOL(igrab); |
769 | 769 | ||
770 | /** | 770 | /** |
771 | * ifind - internal function, you want ilookup5() or iget5(). | 771 | * ifind - internal function, you want ilookup5() or iget5(). |
772 | * @sb: super block of file system to search | 772 | * @sb: super block of file system to search |
773 | * @head: the head of the list to search | 773 | * @head: the head of the list to search |
774 | * @test: callback used for comparisons between inodes | 774 | * @test: callback used for comparisons between inodes |
775 | * @data: opaque data pointer to pass to @test | 775 | * @data: opaque data pointer to pass to @test |
776 | * @wait: if true wait for the inode to be unlocked, if false do not | 776 | * @wait: if true wait for the inode to be unlocked, if false do not |
777 | * | 777 | * |
778 | * ifind() searches for the inode specified by @data in the inode | 778 | * ifind() searches for the inode specified by @data in the inode |
779 | * cache. This is a generalized version of ifind_fast() for file systems where | 779 | * cache. This is a generalized version of ifind_fast() for file systems where |
780 | * the inode number is not sufficient for unique identification of an inode. | 780 | * the inode number is not sufficient for unique identification of an inode. |
781 | * | 781 | * |
782 | * If the inode is in the cache, the inode is returned with an incremented | 782 | * If the inode is in the cache, the inode is returned with an incremented |
783 | * reference count. | 783 | * reference count. |
784 | * | 784 | * |
785 | * Otherwise NULL is returned. | 785 | * Otherwise NULL is returned. |
786 | * | 786 | * |
787 | * Note, @test is called with the inode_lock held, so can't sleep. | 787 | * Note, @test is called with the inode_lock held, so can't sleep. |
788 | */ | 788 | */ |
789 | static struct inode *ifind(struct super_block *sb, | 789 | static struct inode *ifind(struct super_block *sb, |
790 | struct hlist_head *head, int (*test)(struct inode *, void *), | 790 | struct hlist_head *head, int (*test)(struct inode *, void *), |
791 | void *data, const int wait) | 791 | void *data, const int wait) |
792 | { | 792 | { |
793 | struct inode *inode; | 793 | struct inode *inode; |
794 | 794 | ||
795 | spin_lock(&inode_lock); | 795 | spin_lock(&inode_lock); |
796 | inode = find_inode(sb, head, test, data); | 796 | inode = find_inode(sb, head, test, data); |
797 | if (inode) { | 797 | if (inode) { |
798 | __iget(inode); | 798 | __iget(inode); |
799 | spin_unlock(&inode_lock); | 799 | spin_unlock(&inode_lock); |
800 | if (likely(wait)) | 800 | if (likely(wait)) |
801 | wait_on_inode(inode); | 801 | wait_on_inode(inode); |
802 | return inode; | 802 | return inode; |
803 | } | 803 | } |
804 | spin_unlock(&inode_lock); | 804 | spin_unlock(&inode_lock); |
805 | return NULL; | 805 | return NULL; |
806 | } | 806 | } |
807 | 807 | ||
808 | /** | 808 | /** |
809 | * ifind_fast - internal function, you want ilookup() or iget(). | 809 | * ifind_fast - internal function, you want ilookup() or iget(). |
810 | * @sb: super block of file system to search | 810 | * @sb: super block of file system to search |
811 | * @head: head of the list to search | 811 | * @head: head of the list to search |
812 | * @ino: inode number to search for | 812 | * @ino: inode number to search for |
813 | * | 813 | * |
814 | * ifind_fast() searches for the inode @ino in the inode cache. This is for | 814 | * ifind_fast() searches for the inode @ino in the inode cache. This is for |
815 | * file systems where the inode number is sufficient for unique identification | 815 | * file systems where the inode number is sufficient for unique identification |
816 | * of an inode. | 816 | * of an inode. |
817 | * | 817 | * |
818 | * If the inode is in the cache, the inode is returned with an incremented | 818 | * If the inode is in the cache, the inode is returned with an incremented |
819 | * reference count. | 819 | * reference count. |
820 | * | 820 | * |
821 | * Otherwise NULL is returned. | 821 | * Otherwise NULL is returned. |
822 | */ | 822 | */ |
823 | static struct inode *ifind_fast(struct super_block *sb, | 823 | static struct inode *ifind_fast(struct super_block *sb, |
824 | struct hlist_head *head, unsigned long ino) | 824 | struct hlist_head *head, unsigned long ino) |
825 | { | 825 | { |
826 | struct inode *inode; | 826 | struct inode *inode; |
827 | 827 | ||
828 | spin_lock(&inode_lock); | 828 | spin_lock(&inode_lock); |
829 | inode = find_inode_fast(sb, head, ino); | 829 | inode = find_inode_fast(sb, head, ino); |
830 | if (inode) { | 830 | if (inode) { |
831 | __iget(inode); | 831 | __iget(inode); |
832 | spin_unlock(&inode_lock); | 832 | spin_unlock(&inode_lock); |
833 | wait_on_inode(inode); | 833 | wait_on_inode(inode); |
834 | return inode; | 834 | return inode; |
835 | } | 835 | } |
836 | spin_unlock(&inode_lock); | 836 | spin_unlock(&inode_lock); |
837 | return NULL; | 837 | return NULL; |
838 | } | 838 | } |
839 | 839 | ||
840 | /** | 840 | /** |
841 | * ilookup5_nowait - search for an inode in the inode cache | 841 | * ilookup5_nowait - search for an inode in the inode cache |
842 | * @sb: super block of file system to search | 842 | * @sb: super block of file system to search |
843 | * @hashval: hash value (usually inode number) to search for | 843 | * @hashval: hash value (usually inode number) to search for |
844 | * @test: callback used for comparisons between inodes | 844 | * @test: callback used for comparisons between inodes |
845 | * @data: opaque data pointer to pass to @test | 845 | * @data: opaque data pointer to pass to @test |
846 | * | 846 | * |
847 | * ilookup5() uses ifind() to search for the inode specified by @hashval and | 847 | * ilookup5() uses ifind() to search for the inode specified by @hashval and |
848 | * @data in the inode cache. This is a generalized version of ilookup() for | 848 | * @data in the inode cache. This is a generalized version of ilookup() for |
849 | * file systems where the inode number is not sufficient for unique | 849 | * file systems where the inode number is not sufficient for unique |
850 | * identification of an inode. | 850 | * identification of an inode. |
851 | * | 851 | * |
852 | * If the inode is in the cache, the inode is returned with an incremented | 852 | * If the inode is in the cache, the inode is returned with an incremented |
853 | * reference count. Note, the inode lock is not waited upon so you have to be | 853 | * reference count. Note, the inode lock is not waited upon so you have to be |
854 | * very careful what you do with the returned inode. You probably should be | 854 | * very careful what you do with the returned inode. You probably should be |
855 | * using ilookup5() instead. | 855 | * using ilookup5() instead. |
856 | * | 856 | * |
857 | * Otherwise NULL is returned. | 857 | * Otherwise NULL is returned. |
858 | * | 858 | * |
859 | * Note, @test is called with the inode_lock held, so can't sleep. | 859 | * Note, @test is called with the inode_lock held, so can't sleep. |
860 | */ | 860 | */ |
861 | struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, | 861 | struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, |
862 | int (*test)(struct inode *, void *), void *data) | 862 | int (*test)(struct inode *, void *), void *data) |
863 | { | 863 | { |
864 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); | 864 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
865 | 865 | ||
866 | return ifind(sb, head, test, data, 0); | 866 | return ifind(sb, head, test, data, 0); |
867 | } | 867 | } |
868 | 868 | ||
869 | EXPORT_SYMBOL(ilookup5_nowait); | 869 | EXPORT_SYMBOL(ilookup5_nowait); |
870 | 870 | ||
871 | /** | 871 | /** |
872 | * ilookup5 - search for an inode in the inode cache | 872 | * ilookup5 - search for an inode in the inode cache |
873 | * @sb: super block of file system to search | 873 | * @sb: super block of file system to search |
874 | * @hashval: hash value (usually inode number) to search for | 874 | * @hashval: hash value (usually inode number) to search for |
875 | * @test: callback used for comparisons between inodes | 875 | * @test: callback used for comparisons between inodes |
876 | * @data: opaque data pointer to pass to @test | 876 | * @data: opaque data pointer to pass to @test |
877 | * | 877 | * |
878 | * ilookup5() uses ifind() to search for the inode specified by @hashval and | 878 | * ilookup5() uses ifind() to search for the inode specified by @hashval and |
879 | * @data in the inode cache. This is a generalized version of ilookup() for | 879 | * @data in the inode cache. This is a generalized version of ilookup() for |
880 | * file systems where the inode number is not sufficient for unique | 880 | * file systems where the inode number is not sufficient for unique |
881 | * identification of an inode. | 881 | * identification of an inode. |
882 | * | 882 | * |
883 | * If the inode is in the cache, the inode lock is waited upon and the inode is | 883 | * If the inode is in the cache, the inode lock is waited upon and the inode is |
884 | * returned with an incremented reference count. | 884 | * returned with an incremented reference count. |
885 | * | 885 | * |
886 | * Otherwise NULL is returned. | 886 | * Otherwise NULL is returned. |
887 | * | 887 | * |
888 | * Note, @test is called with the inode_lock held, so can't sleep. | 888 | * Note, @test is called with the inode_lock held, so can't sleep. |
889 | */ | 889 | */ |
890 | struct inode *ilookup5(struct super_block *sb, unsigned long hashval, | 890 | struct inode *ilookup5(struct super_block *sb, unsigned long hashval, |
891 | int (*test)(struct inode *, void *), void *data) | 891 | int (*test)(struct inode *, void *), void *data) |
892 | { | 892 | { |
893 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); | 893 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
894 | 894 | ||
895 | return ifind(sb, head, test, data, 1); | 895 | return ifind(sb, head, test, data, 1); |
896 | } | 896 | } |
897 | 897 | ||
898 | EXPORT_SYMBOL(ilookup5); | 898 | EXPORT_SYMBOL(ilookup5); |
899 | 899 | ||
900 | /** | 900 | /** |
901 | * ilookup - search for an inode in the inode cache | 901 | * ilookup - search for an inode in the inode cache |
902 | * @sb: super block of file system to search | 902 | * @sb: super block of file system to search |
903 | * @ino: inode number to search for | 903 | * @ino: inode number to search for |
904 | * | 904 | * |
905 | * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. | 905 | * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. |
906 | * This is for file systems where the inode number is sufficient for unique | 906 | * This is for file systems where the inode number is sufficient for unique |
907 | * identification of an inode. | 907 | * identification of an inode. |
908 | * | 908 | * |
909 | * If the inode is in the cache, the inode is returned with an incremented | 909 | * If the inode is in the cache, the inode is returned with an incremented |
910 | * reference count. | 910 | * reference count. |
911 | * | 911 | * |
912 | * Otherwise NULL is returned. | 912 | * Otherwise NULL is returned. |
913 | */ | 913 | */ |
914 | struct inode *ilookup(struct super_block *sb, unsigned long ino) | 914 | struct inode *ilookup(struct super_block *sb, unsigned long ino) |
915 | { | 915 | { |
916 | struct hlist_head *head = inode_hashtable + hash(sb, ino); | 916 | struct hlist_head *head = inode_hashtable + hash(sb, ino); |
917 | 917 | ||
918 | return ifind_fast(sb, head, ino); | 918 | return ifind_fast(sb, head, ino); |
919 | } | 919 | } |
920 | 920 | ||
921 | EXPORT_SYMBOL(ilookup); | 921 | EXPORT_SYMBOL(ilookup); |
922 | 922 | ||
923 | /** | 923 | /** |
924 | * iget5_locked - obtain an inode from a mounted file system | 924 | * iget5_locked - obtain an inode from a mounted file system |
925 | * @sb: super block of file system | 925 | * @sb: super block of file system |
926 | * @hashval: hash value (usually inode number) to get | 926 | * @hashval: hash value (usually inode number) to get |
927 | * @test: callback used for comparisons between inodes | 927 | * @test: callback used for comparisons between inodes |
928 | * @set: callback used to initialize a new struct inode | 928 | * @set: callback used to initialize a new struct inode |
929 | * @data: opaque data pointer to pass to @test and @set | 929 | * @data: opaque data pointer to pass to @test and @set |
930 | * | 930 | * |
931 | * This is iget() without the read_inode() portion of get_new_inode(). | ||
932 | * | ||
933 | * iget5_locked() uses ifind() to search for the inode specified by @hashval | 931 | * iget5_locked() uses ifind() to search for the inode specified by @hashval |
934 | * and @data in the inode cache and if present it is returned with an increased | 932 | * and @data in the inode cache and if present it is returned with an increased |
935 | * reference count. This is a generalized version of iget_locked() for file | 933 | * reference count. This is a generalized version of iget_locked() for file |
936 | * systems where the inode number is not sufficient for unique identification | 934 | * systems where the inode number is not sufficient for unique identification |
937 | * of an inode. | 935 | * of an inode. |
938 | * | 936 | * |
939 | * If the inode is not in cache, get_new_inode() is called to allocate a new | 937 | * If the inode is not in cache, get_new_inode() is called to allocate a new |
940 | * inode and this is returned locked, hashed, and with the I_NEW flag set. The | 938 | * inode and this is returned locked, hashed, and with the I_NEW flag set. The |
941 | * file system gets to fill it in before unlocking it via unlock_new_inode(). | 939 | * file system gets to fill it in before unlocking it via unlock_new_inode(). |
942 | * | 940 | * |
943 | * Note both @test and @set are called with the inode_lock held, so can't sleep. | 941 | * Note both @test and @set are called with the inode_lock held, so can't sleep. |
944 | */ | 942 | */ |
945 | struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, | 943 | struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, |
946 | int (*test)(struct inode *, void *), | 944 | int (*test)(struct inode *, void *), |
947 | int (*set)(struct inode *, void *), void *data) | 945 | int (*set)(struct inode *, void *), void *data) |
948 | { | 946 | { |
949 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); | 947 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
950 | struct inode *inode; | 948 | struct inode *inode; |
951 | 949 | ||
952 | inode = ifind(sb, head, test, data, 1); | 950 | inode = ifind(sb, head, test, data, 1); |
953 | if (inode) | 951 | if (inode) |
954 | return inode; | 952 | return inode; |
955 | /* | 953 | /* |
956 | * get_new_inode() will do the right thing, re-trying the search | 954 | * get_new_inode() will do the right thing, re-trying the search |
957 | * in case it had to block at any point. | 955 | * in case it had to block at any point. |
958 | */ | 956 | */ |
959 | return get_new_inode(sb, head, test, set, data); | 957 | return get_new_inode(sb, head, test, set, data); |
960 | } | 958 | } |
961 | 959 | ||
962 | EXPORT_SYMBOL(iget5_locked); | 960 | EXPORT_SYMBOL(iget5_locked); |
963 | 961 | ||
964 | /** | 962 | /** |
965 | * iget_locked - obtain an inode from a mounted file system | 963 | * iget_locked - obtain an inode from a mounted file system |
966 | * @sb: super block of file system | 964 | * @sb: super block of file system |
967 | * @ino: inode number to get | 965 | * @ino: inode number to get |
968 | * | ||
969 | * This is iget() without the read_inode() portion of get_new_inode_fast(). | ||
970 | * | 966 | * |
971 | * iget_locked() uses ifind_fast() to search for the inode specified by @ino in | 967 | * iget_locked() uses ifind_fast() to search for the inode specified by @ino in |
972 | * the inode cache and if present it is returned with an increased reference | 968 | * the inode cache and if present it is returned with an increased reference |
973 | * count. This is for file systems where the inode number is sufficient for | 969 | * count. This is for file systems where the inode number is sufficient for |
974 | * unique identification of an inode. | 970 | * unique identification of an inode. |
975 | * | 971 | * |
976 | * If the inode is not in cache, get_new_inode_fast() is called to allocate a | 972 | * If the inode is not in cache, get_new_inode_fast() is called to allocate a |
977 | * new inode and this is returned locked, hashed, and with the I_NEW flag set. | 973 | * new inode and this is returned locked, hashed, and with the I_NEW flag set. |
978 | * The file system gets to fill it in before unlocking it via | 974 | * The file system gets to fill it in before unlocking it via |
979 | * unlock_new_inode(). | 975 | * unlock_new_inode(). |
980 | */ | 976 | */ |
981 | struct inode *iget_locked(struct super_block *sb, unsigned long ino) | 977 | struct inode *iget_locked(struct super_block *sb, unsigned long ino) |
982 | { | 978 | { |
983 | struct hlist_head *head = inode_hashtable + hash(sb, ino); | 979 | struct hlist_head *head = inode_hashtable + hash(sb, ino); |
984 | struct inode *inode; | 980 | struct inode *inode; |
985 | 981 | ||
986 | inode = ifind_fast(sb, head, ino); | 982 | inode = ifind_fast(sb, head, ino); |
987 | if (inode) | 983 | if (inode) |
988 | return inode; | 984 | return inode; |
989 | /* | 985 | /* |
990 | * get_new_inode_fast() will do the right thing, re-trying the search | 986 | * get_new_inode_fast() will do the right thing, re-trying the search |
991 | * in case it had to block at any point. | 987 | * in case it had to block at any point. |
992 | */ | 988 | */ |
993 | return get_new_inode_fast(sb, head, ino); | 989 | return get_new_inode_fast(sb, head, ino); |
994 | } | 990 | } |
995 | 991 | ||
996 | EXPORT_SYMBOL(iget_locked); | 992 | EXPORT_SYMBOL(iget_locked); |
997 | 993 | ||
998 | /** | 994 | /** |
999 | * __insert_inode_hash - hash an inode | 995 | * __insert_inode_hash - hash an inode |
1000 | * @inode: unhashed inode | 996 | * @inode: unhashed inode |
1001 | * @hashval: unsigned long value used to locate this object in the | 997 | * @hashval: unsigned long value used to locate this object in the |
1002 | * inode_hashtable. | 998 | * inode_hashtable. |
1003 | * | 999 | * |
1004 | * Add an inode to the inode hash for this superblock. | 1000 | * Add an inode to the inode hash for this superblock. |
1005 | */ | 1001 | */ |
1006 | void __insert_inode_hash(struct inode *inode, unsigned long hashval) | 1002 | void __insert_inode_hash(struct inode *inode, unsigned long hashval) |
1007 | { | 1003 | { |
1008 | struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); | 1004 | struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); |
1009 | spin_lock(&inode_lock); | 1005 | spin_lock(&inode_lock); |
1010 | hlist_add_head(&inode->i_hash, head); | 1006 | hlist_add_head(&inode->i_hash, head); |
1011 | spin_unlock(&inode_lock); | 1007 | spin_unlock(&inode_lock); |
1012 | } | 1008 | } |
1013 | 1009 | ||
1014 | EXPORT_SYMBOL(__insert_inode_hash); | 1010 | EXPORT_SYMBOL(__insert_inode_hash); |
1015 | 1011 | ||
1016 | /** | 1012 | /** |
1017 | * remove_inode_hash - remove an inode from the hash | 1013 | * remove_inode_hash - remove an inode from the hash |
1018 | * @inode: inode to unhash | 1014 | * @inode: inode to unhash |
1019 | * | 1015 | * |
1020 | * Remove an inode from the superblock. | 1016 | * Remove an inode from the superblock. |
1021 | */ | 1017 | */ |
1022 | void remove_inode_hash(struct inode *inode) | 1018 | void remove_inode_hash(struct inode *inode) |
1023 | { | 1019 | { |
1024 | spin_lock(&inode_lock); | 1020 | spin_lock(&inode_lock); |
1025 | hlist_del_init(&inode->i_hash); | 1021 | hlist_del_init(&inode->i_hash); |
1026 | spin_unlock(&inode_lock); | 1022 | spin_unlock(&inode_lock); |
1027 | } | 1023 | } |
1028 | 1024 | ||
1029 | EXPORT_SYMBOL(remove_inode_hash); | 1025 | EXPORT_SYMBOL(remove_inode_hash); |
1030 | 1026 | ||
1031 | /* | 1027 | /* |
1032 | * Tell the filesystem that this inode is no longer of any interest and should | 1028 | * Tell the filesystem that this inode is no longer of any interest and should |
1033 | * be completely destroyed. | 1029 | * be completely destroyed. |
1034 | * | 1030 | * |
1035 | * We leave the inode in the inode hash table until *after* the filesystem's | 1031 | * We leave the inode in the inode hash table until *after* the filesystem's |
1036 | * ->delete_inode completes. This ensures that an iget (such as nfsd might | 1032 | * ->delete_inode completes. This ensures that an iget (such as nfsd might |
1037 | * instigate) will always find up-to-date information either in the hash or on | 1033 | * instigate) will always find up-to-date information either in the hash or on |
1038 | * disk. | 1034 | * disk. |
1039 | * | 1035 | * |
1040 | * I_FREEING is set so that no-one will take a new reference to the inode while | 1036 | * I_FREEING is set so that no-one will take a new reference to the inode while |
1041 | * it is being deleted. | 1037 | * it is being deleted. |
1042 | */ | 1038 | */ |
1043 | void generic_delete_inode(struct inode *inode) | 1039 | void generic_delete_inode(struct inode *inode) |
1044 | { | 1040 | { |
1045 | const struct super_operations *op = inode->i_sb->s_op; | 1041 | const struct super_operations *op = inode->i_sb->s_op; |
1046 | 1042 | ||
1047 | list_del_init(&inode->i_list); | 1043 | list_del_init(&inode->i_list); |
1048 | list_del_init(&inode->i_sb_list); | 1044 | list_del_init(&inode->i_sb_list); |
1049 | inode->i_state |= I_FREEING; | 1045 | inode->i_state |= I_FREEING; |
1050 | inodes_stat.nr_inodes--; | 1046 | inodes_stat.nr_inodes--; |
1051 | spin_unlock(&inode_lock); | 1047 | spin_unlock(&inode_lock); |
1052 | 1048 | ||
1053 | security_inode_delete(inode); | 1049 | security_inode_delete(inode); |
1054 | 1050 | ||
1055 | if (op->delete_inode) { | 1051 | if (op->delete_inode) { |
1056 | void (*delete)(struct inode *) = op->delete_inode; | 1052 | void (*delete)(struct inode *) = op->delete_inode; |
1057 | if (!is_bad_inode(inode)) | 1053 | if (!is_bad_inode(inode)) |
1058 | DQUOT_INIT(inode); | 1054 | DQUOT_INIT(inode); |
1059 | /* Filesystems implementing their own | 1055 | /* Filesystems implementing their own |
1060 | * s_op->delete_inode are required to call | 1056 | * s_op->delete_inode are required to call |
1061 | * truncate_inode_pages and clear_inode() | 1057 | * truncate_inode_pages and clear_inode() |
1062 | * internally */ | 1058 | * internally */ |
1063 | delete(inode); | 1059 | delete(inode); |
1064 | } else { | 1060 | } else { |
1065 | truncate_inode_pages(&inode->i_data, 0); | 1061 | truncate_inode_pages(&inode->i_data, 0); |
1066 | clear_inode(inode); | 1062 | clear_inode(inode); |
1067 | } | 1063 | } |
1068 | spin_lock(&inode_lock); | 1064 | spin_lock(&inode_lock); |
1069 | hlist_del_init(&inode->i_hash); | 1065 | hlist_del_init(&inode->i_hash); |
1070 | spin_unlock(&inode_lock); | 1066 | spin_unlock(&inode_lock); |
1071 | wake_up_inode(inode); | 1067 | wake_up_inode(inode); |
1072 | BUG_ON(inode->i_state != I_CLEAR); | 1068 | BUG_ON(inode->i_state != I_CLEAR); |
1073 | destroy_inode(inode); | 1069 | destroy_inode(inode); |
1074 | } | 1070 | } |
1075 | 1071 | ||
1076 | EXPORT_SYMBOL(generic_delete_inode); | 1072 | EXPORT_SYMBOL(generic_delete_inode); |
1077 | 1073 | ||
1078 | static void generic_forget_inode(struct inode *inode) | 1074 | static void generic_forget_inode(struct inode *inode) |
1079 | { | 1075 | { |
1080 | struct super_block *sb = inode->i_sb; | 1076 | struct super_block *sb = inode->i_sb; |
1081 | 1077 | ||
1082 | if (!hlist_unhashed(&inode->i_hash)) { | 1078 | if (!hlist_unhashed(&inode->i_hash)) { |
1083 | if (!(inode->i_state & (I_DIRTY|I_SYNC))) | 1079 | if (!(inode->i_state & (I_DIRTY|I_SYNC))) |
1084 | list_move(&inode->i_list, &inode_unused); | 1080 | list_move(&inode->i_list, &inode_unused); |
1085 | inodes_stat.nr_unused++; | 1081 | inodes_stat.nr_unused++; |
1086 | if (sb->s_flags & MS_ACTIVE) { | 1082 | if (sb->s_flags & MS_ACTIVE) { |
1087 | spin_unlock(&inode_lock); | 1083 | spin_unlock(&inode_lock); |
1088 | return; | 1084 | return; |
1089 | } | 1085 | } |
1090 | inode->i_state |= I_WILL_FREE; | 1086 | inode->i_state |= I_WILL_FREE; |
1091 | spin_unlock(&inode_lock); | 1087 | spin_unlock(&inode_lock); |
1092 | write_inode_now(inode, 1); | 1088 | write_inode_now(inode, 1); |
1093 | spin_lock(&inode_lock); | 1089 | spin_lock(&inode_lock); |
1094 | inode->i_state &= ~I_WILL_FREE; | 1090 | inode->i_state &= ~I_WILL_FREE; |
1095 | inodes_stat.nr_unused--; | 1091 | inodes_stat.nr_unused--; |
1096 | hlist_del_init(&inode->i_hash); | 1092 | hlist_del_init(&inode->i_hash); |
1097 | } | 1093 | } |
1098 | list_del_init(&inode->i_list); | 1094 | list_del_init(&inode->i_list); |
1099 | list_del_init(&inode->i_sb_list); | 1095 | list_del_init(&inode->i_sb_list); |
1100 | inode->i_state |= I_FREEING; | 1096 | inode->i_state |= I_FREEING; |
1101 | inodes_stat.nr_inodes--; | 1097 | inodes_stat.nr_inodes--; |
1102 | spin_unlock(&inode_lock); | 1098 | spin_unlock(&inode_lock); |
1103 | if (inode->i_data.nrpages) | 1099 | if (inode->i_data.nrpages) |
1104 | truncate_inode_pages(&inode->i_data, 0); | 1100 | truncate_inode_pages(&inode->i_data, 0); |
1105 | clear_inode(inode); | 1101 | clear_inode(inode); |
1106 | wake_up_inode(inode); | 1102 | wake_up_inode(inode); |
1107 | destroy_inode(inode); | 1103 | destroy_inode(inode); |
1108 | } | 1104 | } |
1109 | 1105 | ||
1110 | /* | 1106 | /* |
1111 | * Normal UNIX filesystem behaviour: delete the | 1107 | * Normal UNIX filesystem behaviour: delete the |
1112 | * inode when the usage count drops to zero, and | 1108 | * inode when the usage count drops to zero, and |
1113 | * i_nlink is zero. | 1109 | * i_nlink is zero. |
1114 | */ | 1110 | */ |
1115 | void generic_drop_inode(struct inode *inode) | 1111 | void generic_drop_inode(struct inode *inode) |
1116 | { | 1112 | { |
1117 | if (!inode->i_nlink) | 1113 | if (!inode->i_nlink) |
1118 | generic_delete_inode(inode); | 1114 | generic_delete_inode(inode); |
1119 | else | 1115 | else |
1120 | generic_forget_inode(inode); | 1116 | generic_forget_inode(inode); |
1121 | } | 1117 | } |
1122 | 1118 | ||
1123 | EXPORT_SYMBOL_GPL(generic_drop_inode); | 1119 | EXPORT_SYMBOL_GPL(generic_drop_inode); |
1124 | 1120 | ||
1125 | /* | 1121 | /* |
1126 | * Called when we're dropping the last reference | 1122 | * Called when we're dropping the last reference |
1127 | * to an inode. | 1123 | * to an inode. |
1128 | * | 1124 | * |
1129 | * Call the FS "drop()" function, defaulting to | 1125 | * Call the FS "drop()" function, defaulting to |
1130 | * the legacy UNIX filesystem behaviour.. | 1126 | * the legacy UNIX filesystem behaviour.. |
1131 | * | 1127 | * |
1132 | * NOTE! NOTE! NOTE! We're called with the inode lock | 1128 | * NOTE! NOTE! NOTE! We're called with the inode lock |
1133 | * held, and the drop function is supposed to release | 1129 | * held, and the drop function is supposed to release |
1134 | * the lock! | 1130 | * the lock! |
1135 | */ | 1131 | */ |
1136 | static inline void iput_final(struct inode *inode) | 1132 | static inline void iput_final(struct inode *inode) |
1137 | { | 1133 | { |
1138 | const struct super_operations *op = inode->i_sb->s_op; | 1134 | const struct super_operations *op = inode->i_sb->s_op; |
1139 | void (*drop)(struct inode *) = generic_drop_inode; | 1135 | void (*drop)(struct inode *) = generic_drop_inode; |
1140 | 1136 | ||
1141 | if (op && op->drop_inode) | 1137 | if (op && op->drop_inode) |
1142 | drop = op->drop_inode; | 1138 | drop = op->drop_inode; |
1143 | drop(inode); | 1139 | drop(inode); |
1144 | } | 1140 | } |
1145 | 1141 | ||
1146 | /** | 1142 | /** |
1147 | * iput - put an inode | 1143 | * iput - put an inode |
1148 | * @inode: inode to put | 1144 | * @inode: inode to put |
1149 | * | 1145 | * |
1150 | * Puts an inode, dropping its usage count. If the inode use count hits | 1146 | * Puts an inode, dropping its usage count. If the inode use count hits |
1151 | * zero, the inode is then freed and may also be destroyed. | 1147 | * zero, the inode is then freed and may also be destroyed. |
1152 | * | 1148 | * |
1153 | * Consequently, iput() can sleep. | 1149 | * Consequently, iput() can sleep. |
1154 | */ | 1150 | */ |
1155 | void iput(struct inode *inode) | 1151 | void iput(struct inode *inode) |
1156 | { | 1152 | { |
1157 | if (inode) { | 1153 | if (inode) { |
1158 | const struct super_operations *op = inode->i_sb->s_op; | 1154 | const struct super_operations *op = inode->i_sb->s_op; |
1159 | 1155 | ||
1160 | BUG_ON(inode->i_state == I_CLEAR); | 1156 | BUG_ON(inode->i_state == I_CLEAR); |
1161 | 1157 | ||
1162 | if (op && op->put_inode) | 1158 | if (op && op->put_inode) |
1163 | op->put_inode(inode); | 1159 | op->put_inode(inode); |
1164 | 1160 | ||
1165 | if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) | 1161 | if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) |
1166 | iput_final(inode); | 1162 | iput_final(inode); |
1167 | } | 1163 | } |
1168 | } | 1164 | } |
1169 | 1165 | ||
1170 | EXPORT_SYMBOL(iput); | 1166 | EXPORT_SYMBOL(iput); |
1171 | 1167 | ||
1172 | /** | 1168 | /** |
1173 | * bmap - find a block number in a file | 1169 | * bmap - find a block number in a file |
1174 | * @inode: inode of file | 1170 | * @inode: inode of file |
1175 | * @block: block to find | 1171 | * @block: block to find |
1176 | * | 1172 | * |
1177 | * Returns the block number on the device holding the inode that | 1173 | * Returns the block number on the device holding the inode that |
1178 | * is the disk block number for the block of the file requested. | 1174 | * is the disk block number for the block of the file requested. |
1179 | * That is, asked for block 4 of inode 1 the function will return the | 1175 | * That is, asked for block 4 of inode 1 the function will return the |
1180 | * disk block relative to the disk start that holds that block of the | 1176 | * disk block relative to the disk start that holds that block of the |
1181 | * file. | 1177 | * file. |
1182 | */ | 1178 | */ |
1183 | sector_t bmap(struct inode * inode, sector_t block) | 1179 | sector_t bmap(struct inode * inode, sector_t block) |
1184 | { | 1180 | { |
1185 | sector_t res = 0; | 1181 | sector_t res = 0; |
1186 | if (inode->i_mapping->a_ops->bmap) | 1182 | if (inode->i_mapping->a_ops->bmap) |
1187 | res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); | 1183 | res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); |
1188 | return res; | 1184 | return res; |
1189 | } | 1185 | } |
1190 | EXPORT_SYMBOL(bmap); | 1186 | EXPORT_SYMBOL(bmap); |
1191 | 1187 | ||
1192 | /** | 1188 | /** |
1193 | * touch_atime - update the access time | 1189 | * touch_atime - update the access time |
1194 | * @mnt: mount the inode is accessed on | 1190 | * @mnt: mount the inode is accessed on |
1195 | * @dentry: dentry accessed | 1191 | * @dentry: dentry accessed |
1196 | * | 1192 | * |
1197 | * Update the accessed time on an inode and mark it for writeback. | 1193 | * Update the accessed time on an inode and mark it for writeback. |
1198 | * This function automatically handles read only file systems and media, | 1194 | * This function automatically handles read only file systems and media, |
1199 | * as well as the "noatime" flag and inode specific "noatime" markers. | 1195 | * as well as the "noatime" flag and inode specific "noatime" markers. |
1200 | */ | 1196 | */ |
1201 | void touch_atime(struct vfsmount *mnt, struct dentry *dentry) | 1197 | void touch_atime(struct vfsmount *mnt, struct dentry *dentry) |
1202 | { | 1198 | { |
1203 | struct inode *inode = dentry->d_inode; | 1199 | struct inode *inode = dentry->d_inode; |
1204 | struct timespec now; | 1200 | struct timespec now; |
1205 | 1201 | ||
1206 | if (inode->i_flags & S_NOATIME) | 1202 | if (inode->i_flags & S_NOATIME) |
1207 | return; | 1203 | return; |
1208 | if (IS_NOATIME(inode)) | 1204 | if (IS_NOATIME(inode)) |
1209 | return; | 1205 | return; |
1210 | if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) | 1206 | if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) |
1211 | return; | 1207 | return; |
1212 | 1208 | ||
1213 | /* | 1209 | /* |
1214 | * We may have a NULL vfsmount when coming from NFSD | 1210 | * We may have a NULL vfsmount when coming from NFSD |
1215 | */ | 1211 | */ |
1216 | if (mnt) { | 1212 | if (mnt) { |
1217 | if (mnt->mnt_flags & MNT_NOATIME) | 1213 | if (mnt->mnt_flags & MNT_NOATIME) |
1218 | return; | 1214 | return; |
1219 | if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) | 1215 | if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) |
1220 | return; | 1216 | return; |
1221 | 1217 | ||
1222 | if (mnt->mnt_flags & MNT_RELATIME) { | 1218 | if (mnt->mnt_flags & MNT_RELATIME) { |
1223 | /* | 1219 | /* |
1224 | * With relative atime, only update atime if the | 1220 | * With relative atime, only update atime if the |
1225 | * previous atime is earlier than either the ctime or | 1221 | * previous atime is earlier than either the ctime or |
1226 | * mtime. | 1222 | * mtime. |
1227 | */ | 1223 | */ |
1228 | if (timespec_compare(&inode->i_mtime, | 1224 | if (timespec_compare(&inode->i_mtime, |
1229 | &inode->i_atime) < 0 && | 1225 | &inode->i_atime) < 0 && |
1230 | timespec_compare(&inode->i_ctime, | 1226 | timespec_compare(&inode->i_ctime, |
1231 | &inode->i_atime) < 0) | 1227 | &inode->i_atime) < 0) |
1232 | return; | 1228 | return; |
1233 | } | 1229 | } |
1234 | } | 1230 | } |
1235 | 1231 | ||
1236 | now = current_fs_time(inode->i_sb); | 1232 | now = current_fs_time(inode->i_sb); |
1237 | if (timespec_equal(&inode->i_atime, &now)) | 1233 | if (timespec_equal(&inode->i_atime, &now)) |
1238 | return; | 1234 | return; |
1239 | 1235 | ||
1240 | inode->i_atime = now; | 1236 | inode->i_atime = now; |
1241 | mark_inode_dirty_sync(inode); | 1237 | mark_inode_dirty_sync(inode); |
1242 | } | 1238 | } |
1243 | EXPORT_SYMBOL(touch_atime); | 1239 | EXPORT_SYMBOL(touch_atime); |
1244 | 1240 | ||
1245 | /** | 1241 | /** |
1246 | * file_update_time - update mtime and ctime time | 1242 | * file_update_time - update mtime and ctime time |
1247 | * @file: file accessed | 1243 | * @file: file accessed |
1248 | * | 1244 | * |
1249 | * Update the mtime and ctime members of an inode and mark the inode | 1245 | * Update the mtime and ctime members of an inode and mark the inode |
1250 | * for writeback. Note that this function is meant exclusively for | 1246 | * for writeback. Note that this function is meant exclusively for |
1251 | * usage in the file write path of filesystems, and filesystems may | 1247 | * usage in the file write path of filesystems, and filesystems may |
1252 | * choose to explicitly ignore update via this function with the | 1248 | * choose to explicitly ignore update via this function with the |
1253 | * S_NOCTIME inode flag, e.g. for network filesystem where these | 1249 | * S_NOCTIME inode flag, e.g. for network filesystem where these |
1254 | * timestamps are handled by the server. | 1250 | * timestamps are handled by the server. |
1255 | */ | 1251 | */ |
1256 | 1252 | ||
1257 | void file_update_time(struct file *file) | 1253 | void file_update_time(struct file *file) |
1258 | { | 1254 | { |
1259 | struct inode *inode = file->f_path.dentry->d_inode; | 1255 | struct inode *inode = file->f_path.dentry->d_inode; |
1260 | struct timespec now; | 1256 | struct timespec now; |
1261 | int sync_it = 0; | 1257 | int sync_it = 0; |
1262 | 1258 | ||
1263 | if (IS_NOCMTIME(inode)) | 1259 | if (IS_NOCMTIME(inode)) |
1264 | return; | 1260 | return; |
1265 | if (IS_RDONLY(inode)) | 1261 | if (IS_RDONLY(inode)) |
1266 | return; | 1262 | return; |
1267 | 1263 | ||
1268 | now = current_fs_time(inode->i_sb); | 1264 | now = current_fs_time(inode->i_sb); |
1269 | if (!timespec_equal(&inode->i_mtime, &now)) { | 1265 | if (!timespec_equal(&inode->i_mtime, &now)) { |
1270 | inode->i_mtime = now; | 1266 | inode->i_mtime = now; |
1271 | sync_it = 1; | 1267 | sync_it = 1; |
1272 | } | 1268 | } |
1273 | 1269 | ||
1274 | if (!timespec_equal(&inode->i_ctime, &now)) { | 1270 | if (!timespec_equal(&inode->i_ctime, &now)) { |
1275 | inode->i_ctime = now; | 1271 | inode->i_ctime = now; |
1276 | sync_it = 1; | 1272 | sync_it = 1; |
1277 | } | 1273 | } |
1278 | 1274 | ||
1279 | if (IS_I_VERSION(inode)) { | 1275 | if (IS_I_VERSION(inode)) { |
1280 | inode_inc_iversion(inode); | 1276 | inode_inc_iversion(inode); |
1281 | sync_it = 1; | 1277 | sync_it = 1; |
1282 | } | 1278 | } |
1283 | 1279 | ||
1284 | if (sync_it) | 1280 | if (sync_it) |
1285 | mark_inode_dirty_sync(inode); | 1281 | mark_inode_dirty_sync(inode); |
1286 | } | 1282 | } |
1287 | 1283 | ||
1288 | EXPORT_SYMBOL(file_update_time); | 1284 | EXPORT_SYMBOL(file_update_time); |
1289 | 1285 | ||
1290 | int inode_needs_sync(struct inode *inode) | 1286 | int inode_needs_sync(struct inode *inode) |
1291 | { | 1287 | { |
1292 | if (IS_SYNC(inode)) | 1288 | if (IS_SYNC(inode)) |
1293 | return 1; | 1289 | return 1; |
1294 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) | 1290 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) |
1295 | return 1; | 1291 | return 1; |
1296 | return 0; | 1292 | return 0; |
1297 | } | 1293 | } |
1298 | 1294 | ||
1299 | EXPORT_SYMBOL(inode_needs_sync); | 1295 | EXPORT_SYMBOL(inode_needs_sync); |
1300 | 1296 | ||
/*
 * Yield the CPU once via schedule() and report success (0).
 * @word is not used here.
 * NOTE(review): presumably serves as the action callback for the
 * wait-on-bit helpers waiting on inode->i_state - confirm at callers.
 */
int inode_wait(void *word)
{
	schedule();

	return 0;
}
1306 | 1302 | ||
1307 | /* | 1303 | /* |
1308 | * If we try to find an inode in the inode hash while it is being | 1304 | * If we try to find an inode in the inode hash while it is being |
1309 | * deleted, we have to wait until the filesystem completes its | 1305 | * deleted, we have to wait until the filesystem completes its |
1310 | * deletion before reporting that it isn't found. This function waits | 1306 | * deletion before reporting that it isn't found. This function waits |
1311 | * until the deletion _might_ have completed. Callers are responsible | 1307 | * until the deletion _might_ have completed. Callers are responsible |
1312 | * to recheck inode state. | 1308 | * to recheck inode state. |
1313 | * | 1309 | * |
1314 | * It doesn't matter if I_LOCK is not set initially, a call to | 1310 | * It doesn't matter if I_LOCK is not set initially, a call to |
1315 | * wake_up_inode() after removing from the hash list will DTRT. | 1311 | * wake_up_inode() after removing from the hash list will DTRT. |
1316 | * | 1312 | * |
1317 | * This is called with inode_lock held. | 1313 | * This is called with inode_lock held. |
1318 | */ | 1314 | */ |
1319 | static void __wait_on_freeing_inode(struct inode *inode) | 1315 | static void __wait_on_freeing_inode(struct inode *inode) |
1320 | { | 1316 | { |
1321 | wait_queue_head_t *wq; | 1317 | wait_queue_head_t *wq; |
1322 | DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); | 1318 | DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); |
1323 | wq = bit_waitqueue(&inode->i_state, __I_LOCK); | 1319 | wq = bit_waitqueue(&inode->i_state, __I_LOCK); |
1324 | prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | 1320 | prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); |
1325 | spin_unlock(&inode_lock); | 1321 | spin_unlock(&inode_lock); |
1326 | schedule(); | 1322 | schedule(); |
1327 | finish_wait(wq, &wait.wait); | 1323 | finish_wait(wq, &wait.wait); |
1328 | spin_lock(&inode_lock); | 1324 | spin_lock(&inode_lock); |
1329 | } | 1325 | } |
1330 | 1326 | ||
1331 | /* | 1327 | /* |
1332 | * We rarely want to lock two inodes that do not have a parent/child | 1328 | * We rarely want to lock two inodes that do not have a parent/child |
1333 | * relationship (such as directory, child inode) simultaneously. The | 1329 | * relationship (such as directory, child inode) simultaneously. The |
1334 | * vast majority of file systems should be able to get along fine | 1330 | * vast majority of file systems should be able to get along fine |
1335 | * without this. Do not use these functions except as a last resort. | 1331 | * without this. Do not use these functions except as a last resort. |
1336 | */ | 1332 | */ |
1337 | void inode_double_lock(struct inode *inode1, struct inode *inode2) | 1333 | void inode_double_lock(struct inode *inode1, struct inode *inode2) |
1338 | { | 1334 | { |
1339 | if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { | 1335 | if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { |
1340 | if (inode1) | 1336 | if (inode1) |
1341 | mutex_lock(&inode1->i_mutex); | 1337 | mutex_lock(&inode1->i_mutex); |
1342 | else if (inode2) | 1338 | else if (inode2) |
1343 | mutex_lock(&inode2->i_mutex); | 1339 | mutex_lock(&inode2->i_mutex); |
1344 | return; | 1340 | return; |
1345 | } | 1341 | } |
1346 | 1342 | ||
1347 | if (inode1 < inode2) { | 1343 | if (inode1 < inode2) { |
1348 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); | 1344 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); |
1349 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); | 1345 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); |
1350 | } else { | 1346 | } else { |
1351 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); | 1347 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); |
1352 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); | 1348 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); |
1353 | } | 1349 | } |
1354 | } | 1350 | } |
1355 | EXPORT_SYMBOL(inode_double_lock); | 1351 | EXPORT_SYMBOL(inode_double_lock); |
1356 | 1352 | ||
1357 | void inode_double_unlock(struct inode *inode1, struct inode *inode2) | 1353 | void inode_double_unlock(struct inode *inode1, struct inode *inode2) |
1358 | { | 1354 | { |
1359 | if (inode1) | 1355 | if (inode1) |
1360 | mutex_unlock(&inode1->i_mutex); | 1356 | mutex_unlock(&inode1->i_mutex); |
1361 | 1357 | ||
1362 | if (inode2 && inode2 != inode1) | 1358 | if (inode2 && inode2 != inode1) |
1363 | mutex_unlock(&inode2->i_mutex); | 1359 | mutex_unlock(&inode2->i_mutex); |
1364 | } | 1360 | } |
1365 | EXPORT_SYMBOL(inode_double_unlock); | 1361 | EXPORT_SYMBOL(inode_double_unlock); |
1366 | 1362 | ||
1367 | static __initdata unsigned long ihash_entries; | 1363 | static __initdata unsigned long ihash_entries; |
1368 | static int __init set_ihash_entries(char *str) | 1364 | static int __init set_ihash_entries(char *str) |
1369 | { | 1365 | { |
1370 | if (!str) | 1366 | if (!str) |
1371 | return 0; | 1367 | return 0; |
1372 | ihash_entries = simple_strtoul(str, &str, 0); | 1368 | ihash_entries = simple_strtoul(str, &str, 0); |
1373 | return 1; | 1369 | return 1; |
1374 | } | 1370 | } |
1375 | __setup("ihash_entries=", set_ihash_entries); | 1371 | __setup("ihash_entries=", set_ihash_entries); |
1376 | 1372 | ||
1377 | /* | 1373 | /* |
1378 | * Initialize the waitqueues and inode hash table. | 1374 | * Initialize the waitqueues and inode hash table. |
1379 | */ | 1375 | */ |
1380 | void __init inode_init_early(void) | 1376 | void __init inode_init_early(void) |
1381 | { | 1377 | { |
1382 | int loop; | 1378 | int loop; |
1383 | 1379 | ||
1384 | /* If hashes are distributed across NUMA nodes, defer | 1380 | /* If hashes are distributed across NUMA nodes, defer |
1385 | * hash allocation until vmalloc space is available. | 1381 | * hash allocation until vmalloc space is available. |
1386 | */ | 1382 | */ |
1387 | if (hashdist) | 1383 | if (hashdist) |
1388 | return; | 1384 | return; |
1389 | 1385 | ||
1390 | inode_hashtable = | 1386 | inode_hashtable = |
1391 | alloc_large_system_hash("Inode-cache", | 1387 | alloc_large_system_hash("Inode-cache", |
1392 | sizeof(struct hlist_head), | 1388 | sizeof(struct hlist_head), |
1393 | ihash_entries, | 1389 | ihash_entries, |
1394 | 14, | 1390 | 14, |
1395 | HASH_EARLY, | 1391 | HASH_EARLY, |
1396 | &i_hash_shift, | 1392 | &i_hash_shift, |
1397 | &i_hash_mask, | 1393 | &i_hash_mask, |
1398 | 0); | 1394 | 0); |
1399 | 1395 | ||
1400 | for (loop = 0; loop < (1 << i_hash_shift); loop++) | 1396 | for (loop = 0; loop < (1 << i_hash_shift); loop++) |
1401 | INIT_HLIST_HEAD(&inode_hashtable[loop]); | 1397 | INIT_HLIST_HEAD(&inode_hashtable[loop]); |
1402 | } | 1398 | } |
1403 | 1399 | ||
1404 | void __init inode_init(void) | 1400 | void __init inode_init(void) |
1405 | { | 1401 | { |
1406 | int loop; | 1402 | int loop; |
1407 | 1403 | ||
1408 | /* inode slab cache */ | 1404 | /* inode slab cache */ |
1409 | inode_cachep = kmem_cache_create("inode_cache", | 1405 | inode_cachep = kmem_cache_create("inode_cache", |
1410 | sizeof(struct inode), | 1406 | sizeof(struct inode), |
1411 | 0, | 1407 | 0, |
1412 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| | 1408 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| |
1413 | SLAB_MEM_SPREAD), | 1409 | SLAB_MEM_SPREAD), |
1414 | init_once); | 1410 | init_once); |
1415 | register_shrinker(&icache_shrinker); | 1411 | register_shrinker(&icache_shrinker); |
1416 | 1412 | ||
1417 | /* Hash may have been set up in inode_init_early */ | 1413 | /* Hash may have been set up in inode_init_early */ |
1418 | if (!hashdist) | 1414 | if (!hashdist) |
1419 | return; | 1415 | return; |
1420 | 1416 | ||
1421 | inode_hashtable = | 1417 | inode_hashtable = |
1422 | alloc_large_system_hash("Inode-cache", | 1418 | alloc_large_system_hash("Inode-cache", |
1423 | sizeof(struct hlist_head), | 1419 | sizeof(struct hlist_head), |
1424 | ihash_entries, | 1420 | ihash_entries, |
1425 | 14, | 1421 | 14, |
1426 | 0, | 1422 | 0, |
1427 | &i_hash_shift, | 1423 | &i_hash_shift, |
1428 | &i_hash_mask, | 1424 | &i_hash_mask, |
1429 | 0); | 1425 | 0); |
1430 | 1426 | ||
1431 | for (loop = 0; loop < (1 << i_hash_shift); loop++) | 1427 | for (loop = 0; loop < (1 << i_hash_shift); loop++) |
1432 | INIT_HLIST_HEAD(&inode_hashtable[loop]); | 1428 | INIT_HLIST_HEAD(&inode_hashtable[loop]); |
1433 | } | 1429 | } |
1434 | 1430 | ||
1435 | void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) | 1431 | void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) |
1436 | { | 1432 | { |
1437 | inode->i_mode = mode; | 1433 | inode->i_mode = mode; |
1438 | if (S_ISCHR(mode)) { | 1434 | if (S_ISCHR(mode)) { |
1439 | inode->i_fop = &def_chr_fops; | 1435 | inode->i_fop = &def_chr_fops; |
1440 | inode->i_rdev = rdev; | 1436 | inode->i_rdev = rdev; |
1441 | } else if (S_ISBLK(mode)) { | 1437 | } else if (S_ISBLK(mode)) { |
1442 | inode->i_fop = &def_blk_fops; | 1438 | inode->i_fop = &def_blk_fops; |
1443 | inode->i_rdev = rdev; | 1439 | inode->i_rdev = rdev; |
1444 | } else if (S_ISFIFO(mode)) | 1440 | } else if (S_ISFIFO(mode)) |
1445 | inode->i_fop = &def_fifo_fops; | 1441 | inode->i_fop = &def_fifo_fops; |
1446 | else if (S_ISSOCK(mode)) | 1442 | else if (S_ISSOCK(mode)) |
1447 | inode->i_fop = &bad_sock_fops; | 1443 | inode->i_fop = &bad_sock_fops; |
1448 | else | 1444 | else |
1449 | printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", | 1445 | printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", |
1450 | mode); | 1446 | mode); |
1451 | } | 1447 | } |
1452 | EXPORT_SYMBOL(init_special_inode); | 1448 | EXPORT_SYMBOL(init_special_inode); |
1453 | 1449 |
include/linux/fs.h
1 | #ifndef _LINUX_FS_H | 1 | #ifndef _LINUX_FS_H |
2 | #define _LINUX_FS_H | 2 | #define _LINUX_FS_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * This file has definitions for some important file table | 5 | * This file has definitions for some important file table |
6 | * structures etc. | 6 | * structures etc. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/limits.h> | 9 | #include <linux/limits.h> |
10 | #include <linux/ioctl.h> | 10 | #include <linux/ioctl.h> |
11 | 11 | ||
/*
 * It's silly to have NR_OPEN bigger than NR_FILE, but the file limit
 * can be changed at runtime and only root can raise the per-process
 * nr_file rlimit, so a ridiculously high absolute upper limit on
 * files-per-process is safe.
 *
 * Some programs (notably those using select()) may have to be
 * recompiled to take full advantage of the new limits.
 */

/* Fixed constants first: */
#undef NR_OPEN
extern int sysctl_nr_open;
#define INR_OPEN	1024	/* Initial setting for nfile rlimits */

#define BLOCK_SIZE_BITS	10
#define BLOCK_SIZE	(1<<BLOCK_SIZE_BITS)

#define SEEK_SET	0	/* seek relative to beginning of file */
#define SEEK_CUR	1	/* seek relative to current file position */
#define SEEK_END	2	/* seek relative to end of file */
#define SEEK_MAX	SEEK_END
34 | 34 | ||
/* And dynamically-tunable limits and defaults: */
struct files_stat_struct {
	/* read only */
	int nr_files;
	/* read only */
	int nr_free_files;
	/* tunable */
	int max_files;
};
extern struct files_stat_struct files_stat;
extern int get_max_files(void);
43 | 43 | ||
/* Inode counters; layout is fixed because it is exposed through sysctl. */
struct inodes_stat_t {
	int nr_inodes;
	int nr_unused;
	/* padding for sysctl ABI compatibility */
	int dummy[5];
};
extern struct inodes_stat_t inodes_stat;
50 | 50 | ||
/* File-lease tunables. */
extern int leases_enable, lease_break_time;

/* Only declared when directory notification is configured in. */
#ifdef CONFIG_DNOTIFY
extern int dir_notify_enable;
#endif

#define NR_FILE	8192	/* this can well be larger on a larger system */
58 | 58 | ||
/* Access-permission request bits. */
#define MAY_EXEC	0x01
#define MAY_WRITE	0x02
#define MAY_READ	0x04
#define MAY_APPEND	0x08

/* File mode bits. */
#define FMODE_READ	0x01
#define FMODE_WRITE	0x02

/* Internal kernel extensions */
#define FMODE_LSEEK	0x04
#define FMODE_PREAD	0x08
#define FMODE_PWRITE	FMODE_PREAD	/* These go hand in hand */

/*
 * File is being opened for execution.  Primary users of this flag are
 * distributed filesystems that can use it to achieve correct ETXTBUSY
 * behavior for cross-node execution/opening_for_writing of files.
 */
#define FMODE_EXEC	0x10
76 | 76 | ||
/* I/O direction / request-type codes. */
#define RW_MASK		1
#define RWA_MASK	2
#define READ		0
#define WRITE		1
#define READA		2	/* read-ahead - don't block if no resources */
#define SWRITE		3	/* for ll_rw_block() - wait for buffer lock */
#define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
#define READ_META	(READ | (1 << BIO_RW_META))
#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
#define WRITE_BARRIER	((1 << BIO_RW) | (1 << BIO_RW_BARRIER))

/* select() event classes. */
#define SEL_IN		1
#define SEL_OUT		2
#define SEL_EX		4
91 | 91 | ||
/* public flags for file_system_type */
#define FS_REQUIRES_DEV		0x0001
#define FS_BINARY_MOUNTDATA	0x0002
#define FS_HAS_SUBTYPE		0x0004
/* Check the paths ".", ".." for staleness */
#define FS_REVAL_DOT		0x4000
/* FS will handle d_move() during rename() internally. */
#define FS_RENAME_DOES_D_MOVE	0x8000
100 | 100 | ||
/*
 * These are the fs-independent mount-flags: up to 32 flags are supported
 */
#define MS_RDONLY	(1<<0)	/* Mount read-only */
#define MS_NOSUID	(1<<1)	/* Ignore suid and sgid bits */
#define MS_NODEV	(1<<2)	/* Disallow access to device special files */
#define MS_NOEXEC	(1<<3)	/* Disallow program execution */
#define MS_SYNCHRONOUS	(1<<4)	/* Writes are synced at once */
#define MS_REMOUNT	(1<<5)	/* Alter flags of a mounted FS */
#define MS_MANDLOCK	(1<<6)	/* Allow mandatory locks on an FS */
#define MS_DIRSYNC	(1<<7)	/* Directory modifications are synchronous */
#define MS_NOATIME	(1<<10)	/* Do not update access times. */
#define MS_NODIRATIME	(1<<11)	/* Do not update directory access times */
#define MS_BIND		(1<<12)
#define MS_MOVE		(1<<13)
#define MS_REC		(1<<14)
#define MS_VERBOSE	(1<<15)	/* War is peace. Verbosity is silence.
				   MS_VERBOSE is deprecated. */
#define MS_SILENT	(1<<15)
#define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
#define MS_UNBINDABLE	(1<<17)	/* change to unbindable */
#define MS_PRIVATE	(1<<18)	/* change to private */
#define MS_SLAVE	(1<<19)	/* change to slave */
#define MS_SHARED	(1<<20)	/* change to shared */
#define MS_RELATIME	(1<<21)	/* Update atime relative to mtime/ctime. */
#define MS_KERNMOUNT	(1<<22)	/* this is a kern_mount call */
#define MS_I_VERSION	(1<<23)	/* Update inode I_version field */
#define MS_ACTIVE	(1<<30)
#define MS_NOUSER	(1<<31)
130 | 130 | ||
/*
 * Superblock flags that can be altered by MS_REMOUNT
 */
#define MS_RMT_MASK	(MS_RDONLY | MS_SYNCHRONOUS | MS_MANDLOCK)

/*
 * Old magic mount flag and mask
 */
#define MS_MGC_VAL	0xC0ED0000
#define MS_MGC_MSK	0xffff0000
141 | 141 | ||
/* Inode flags - they have nothing to do with superblock flags now */

#define S_SYNC		(1 << 0)	/* Writes are synced at once */
#define S_NOATIME	(1 << 1)	/* Do not update access times */
#define S_APPEND	(1 << 2)	/* Append-only file */
#define S_IMMUTABLE	(1 << 3)	/* Immutable file */
#define S_DEAD		(1 << 4)	/* removed, but still open directory */
#define S_NOQUOTA	(1 << 5)	/* Inode is not counted to quota */
#define S_DIRSYNC	(1 << 6)	/* Directory modifications are synchronous */
#define S_NOCMTIME	(1 << 7)	/* Do not update file c/mtime */
#define S_SWAPFILE	(1 << 8)	/* Do not truncate: swapon got its bmaps */
#define S_PRIVATE	(1 << 9)	/* Inode is fs-internal */
154 | 154 | ||
/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: MS_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystem's flags while it is
 * mounted with files in use.  This means that not all of the inodes will
 * have their i_flags updated.  Hence, i_flags no longer inherit the
 * superblock mount flags, so these have to be checked separately.
 * -- rmk@arm.uk.linux.org
 */

/* Test superblock mount flag(s) of the filesystem an inode lives on. */
#define __IS_FLG(inode,flg)	((inode)->i_sb->s_flags & (flg))

/* Predicates decided by superblock flags (some also consult i_flags). */
#define IS_RDONLY(inode)	__IS_FLG(inode, MS_RDONLY)
#define IS_SYNC(inode)		(__IS_FLG(inode, MS_SYNCHRONOUS) || \
				 ((inode)->i_flags & S_SYNC))
#define IS_DIRSYNC(inode)	(__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
				 ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)	__IS_FLG(inode, MS_MANDLOCK)
#define IS_NOATIME(inode)	__IS_FLG(inode, MS_RDONLY|MS_NOATIME)
#define IS_I_VERSION(inode)	__IS_FLG(inode, MS_I_VERSION)

/* Predicates decided by per-inode flags (IS_POSIXACL is per-superblock). */
#define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
#define IS_POSIXACL(inode)	__IS_FLG(inode, MS_POSIXACL)

#define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
#define IS_SWAPFILE(inode)	((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode)	((inode)->i_flags & S_PRIVATE)
188 | 188 | ||
/*
 * The read-only stuff doesn't really belong here, but any other place is
 * probably as bad and I don't want to create yet another include file.
 */

#define BLKROSET	_IO(0x12,93)	/* set device read-only (0 = read-write) */
#define BLKROGET	_IO(0x12,94)	/* get read-only status (0 = read_write) */
#define BLKRRPART	_IO(0x12,95)	/* re-read partition table */
#define BLKGETSIZE	_IO(0x12,96)	/* return device size /512 (long *arg) */
#define BLKFLSBUF	_IO(0x12,97)	/* flush buffer cache */
#define BLKRASET	_IO(0x12,98)	/* set read ahead for block device */
#define BLKRAGET	_IO(0x12,99)	/* get current read ahead setting */
#define BLKFRASET	_IO(0x12,100)	/* set filesystem (mm/filemap.c) read-ahead */
#define BLKFRAGET	_IO(0x12,101)	/* get filesystem (mm/filemap.c) read-ahead */
#define BLKSECTSET	_IO(0x12,102)	/* set max sectors per request (ll_rw_blk.c) */
#define BLKSECTGET	_IO(0x12,103)	/* get max sectors per request (ll_rw_blk.c) */
#define BLKSSZGET	_IO(0x12,104)	/* get block device sector size */
#if 0
#define BLKPG		_IO(0x12,105)	/* See blkpg.h */

/* Some people are morons.  Do not use sizeof! */

#define BLKELVGET	_IOR(0x12,106,size_t)	/* elevator get */
#define BLKELVSET	_IOW(0x12,107,size_t)	/* elevator set */
/*
 * This was here just to show that the number is taken -
 * probably all these _IO(0x12,*) ioctls should be moved to blkpg.h.
 */
#endif
/* A jump here: 108-111 have been used for various private purposes. */
#define BLKBSZGET	_IOR(0x12,112,size_t)
#define BLKBSZSET	_IOW(0x12,113,size_t)
#define BLKGETSIZE64	_IOR(0x12,114,size_t)	/* return device size in bytes (u64 *arg) */
#define BLKTRACESETUP	_IOWR(0x12,115,struct blk_user_trace_setup)
#define BLKTRACESTART	_IO(0x12,116)
#define BLKTRACESTOP	_IO(0x12,117)
#define BLKTRACETEARDOWN	_IO(0x12,118)
222 | 222 | ||
/* obsolete - kept for compatibility */
#define BMAP_IOCTL	1
/* bmap access */
#define FIBMAP		_IO(0x00,1)
/* get the block size used for bmap */
#define FIGETBSZ	_IO(0x00,2)
226 | 226 | ||
/*
 * Per-file flag and version ioctls.  The FS_IOC32_* variants reuse the
 * same ('f' / 'v', number) pairs but encode an int argument instead of
 * a long.
 */
#define FS_IOC_GETFLAGS		_IOR('f', 1, long)
#define FS_IOC_SETFLAGS		_IOW('f', 2, long)
#define FS_IOC_GETVERSION	_IOR('v', 1, long)
#define FS_IOC_SETVERSION	_IOW('v', 2, long)

#define FS_IOC32_GETFLAGS	_IOR('f', 1, int)
#define FS_IOC32_SETFLAGS	_IOW('f', 2, int)
#define FS_IOC32_GETVERSION	_IOR('v', 1, int)
#define FS_IOC32_SETVERSION	_IOW('v', 2, int)
235 | 235 | ||
/*
 * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
 */
#define	FS_SECRM_FL		0x00000001	/* Secure deletion */
#define	FS_UNRM_FL		0x00000002	/* Undelete */
#define	FS_COMPR_FL		0x00000004	/* Compress file */
#define	FS_SYNC_FL		0x00000008	/* Synchronous updates */
#define	FS_IMMUTABLE_FL		0x00000010	/* Immutable file */
#define	FS_APPEND_FL		0x00000020	/* writes to file may only append */
#define	FS_NODUMP_FL		0x00000040	/* do not dump file */
#define	FS_NOATIME_FL		0x00000080	/* do not update atime */

/* Reserved for compression usage... */
#define	FS_DIRTY_FL		0x00000100
#define	FS_COMPRBLK_FL		0x00000200	/* One or more compressed clusters */
#define	FS_NOCOMP_FL		0x00000400	/* Don't compress */
#define	FS_ECOMPR_FL		0x00000800	/* Compression error */
/* End compression flags --- maybe not all used */

/* FS_BTREE_FL and FS_INDEX_FL share the same bit. */
#define	FS_BTREE_FL		0x00001000	/* btree format dir */
#define	FS_INDEX_FL		0x00001000	/* hash-indexed directory */
#define	FS_IMAGIC_FL		0x00002000	/* AFS directory */
#define	FS_JOURNAL_DATA_FL	0x00004000	/* Reserved for ext3 */
#define	FS_NOTAIL_FL		0x00008000	/* file tail should not be merged */
#define	FS_DIRSYNC_FL		0x00010000	/* dirsync behaviour (directories only) */
#define	FS_TOPDIR_FL		0x00020000	/* Top of directory hierarchies*/
#define	FS_EXTENT_FL		0x00080000	/* Extents */
#define	FS_DIRECTIO_FL		0x00100000	/* Use direct i/o */
#define	FS_RESERVED_FL		0x80000000	/* reserved for ext2 lib */

/* Masks over the flag bits above. */
#define	FS_FL_USER_VISIBLE	0x0003DFFF	/* User visible flags */
#define	FS_FL_USER_MODIFIABLE	0x000380FF	/* User modifiable flags */
266 | 266 | ||
267 | 267 | ||
/*
 * Distinct single-bit values, so they can be OR-ed together.
 * NOTE(review): presumably the flags argument of sync_file_range() - confirm.
 */
#define SYNC_FILE_RANGE_WAIT_BEFORE	1
#define SYNC_FILE_RANGE_WRITE		2
#define SYNC_FILE_RANGE_WAIT_AFTER	4
271 | 271 | ||
272 | #ifdef __KERNEL__ | 272 | #ifdef __KERNEL__ |
273 | 273 | ||
274 | #include <linux/linkage.h> | 274 | #include <linux/linkage.h> |
275 | #include <linux/wait.h> | 275 | #include <linux/wait.h> |
276 | #include <linux/types.h> | 276 | #include <linux/types.h> |
277 | #include <linux/kdev_t.h> | 277 | #include <linux/kdev_t.h> |
278 | #include <linux/dcache.h> | 278 | #include <linux/dcache.h> |
279 | #include <linux/namei.h> | 279 | #include <linux/namei.h> |
280 | #include <linux/stat.h> | 280 | #include <linux/stat.h> |
281 | #include <linux/cache.h> | 281 | #include <linux/cache.h> |
282 | #include <linux/kobject.h> | 282 | #include <linux/kobject.h> |
283 | #include <linux/list.h> | 283 | #include <linux/list.h> |
284 | #include <linux/radix-tree.h> | 284 | #include <linux/radix-tree.h> |
285 | #include <linux/prio_tree.h> | 285 | #include <linux/prio_tree.h> |
286 | #include <linux/init.h> | 286 | #include <linux/init.h> |
287 | #include <linux/pid.h> | 287 | #include <linux/pid.h> |
288 | #include <linux/mutex.h> | 288 | #include <linux/mutex.h> |
289 | #include <linux/capability.h> | 289 | #include <linux/capability.h> |
290 | 290 | ||
291 | #include <asm/atomic.h> | 291 | #include <asm/atomic.h> |
292 | #include <asm/semaphore.h> | 292 | #include <asm/semaphore.h> |
293 | #include <asm/byteorder.h> | 293 | #include <asm/byteorder.h> |
294 | 294 | ||
/*
 * Forward declarations: these structures are only named here, never
 * defined, so they can be used through pointers.
 */
struct export_operations;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kstatfs;
struct nameidata;
struct pipe_inode_info;
struct poll_table_struct;
struct vfsmount;
struct vm_area_struct;
305 | 305 | ||
306 | extern void __init inode_init(void); | 306 | extern void __init inode_init(void); |
307 | extern void __init inode_init_early(void); | 307 | extern void __init inode_init_early(void); |
308 | extern void __init mnt_init(void); | 308 | extern void __init mnt_init(void); |
309 | extern void __init files_init(unsigned long); | 309 | extern void __init files_init(unsigned long); |
310 | 310 | ||
311 | struct buffer_head; | 311 | struct buffer_head; |
312 | typedef int (get_block_t)(struct inode *inode, sector_t iblock, | 312 | typedef int (get_block_t)(struct inode *inode, sector_t iblock, |
313 | struct buffer_head *bh_result, int create); | 313 | struct buffer_head *bh_result, int create); |
314 | typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, | 314 | typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, |
315 | ssize_t bytes, void *private); | 315 | ssize_t bytes, void *private); |
316 | 316 | ||
/*
 * Attribute flags.  Each one is a single bit; OR them together to
 * describe which attributes have been changed.
 */
#define ATTR_MODE	(1 << 0)
#define ATTR_UID	(1 << 1)
#define ATTR_GID	(1 << 2)
#define ATTR_SIZE	(1 << 3)
#define ATTR_ATIME	(1 << 4)
#define ATTR_MTIME	(1 << 5)
#define ATTR_CTIME	(1 << 6)
#define ATTR_ATIME_SET	(1 << 7)
#define ATTR_MTIME_SET	(1 << 8)
#define ATTR_FORCE	(1 << 9)	/* Not a change, but a change it */
#define ATTR_ATTR_FLAG	(1 << 10)
#define ATTR_KILL_SUID	(1 << 11)
#define ATTR_KILL_SGID	(1 << 12)
#define ATTR_FILE	(1 << 13)
#define ATTR_KILL_PRIV	(1 << 14)
#define ATTR_OPEN	(1 << 15)	/* Truncating from open(O_TRUNC) */
337 | 337 | ||
338 | /* | 338 | /* |
339 | * This is the Inode Attributes structure, used for notify_change(). It | 339 | * This is the Inode Attributes structure, used for notify_change(). It |
340 | * uses the above definitions as flags, to know which values have changed. | 340 | * uses the above definitions as flags, to know which values have changed. |
341 | * Also, in this manner, a Filesystem can look at only the values it cares | 341 | * Also, in this manner, a Filesystem can look at only the values it cares |
342 | * about. Basically, these are the attributes that the VFS layer can | 342 | * about. Basically, these are the attributes that the VFS layer can |
343 | * request to change from the FS layer. | 343 | * request to change from the FS layer. |
344 | * | 344 | * |
345 | * Derek Atkins <warlord@MIT.EDU> 94-10-20 | 345 | * Derek Atkins <warlord@MIT.EDU> 94-10-20 |
346 | */ | 346 | */ |
347 | struct iattr { | 347 | struct iattr { |
348 | unsigned int ia_valid; | 348 | unsigned int ia_valid; |
349 | umode_t ia_mode; | 349 | umode_t ia_mode; |
350 | uid_t ia_uid; | 350 | uid_t ia_uid; |
351 | gid_t ia_gid; | 351 | gid_t ia_gid; |
352 | loff_t ia_size; | 352 | loff_t ia_size; |
353 | struct timespec ia_atime; | 353 | struct timespec ia_atime; |
354 | struct timespec ia_mtime; | 354 | struct timespec ia_mtime; |
355 | struct timespec ia_ctime; | 355 | struct timespec ia_ctime; |
356 | 356 | ||
357 | /* | 357 | /* |
358 | * Not an attribute, but an auxilary info for filesystems wanting to | 358 | * Not an attribute, but an auxilary info for filesystems wanting to |
359 | * implement an ftruncate() like method. NOTE: filesystem should | 359 | * implement an ftruncate() like method. NOTE: filesystem should |
360 | * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). | 360 | * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). |
361 | */ | 361 | */ |
362 | struct file *ia_file; | 362 | struct file *ia_file; |
363 | }; | 363 | }; |
364 | 364 | ||
365 | /* | 365 | /* |
366 | * Includes for diskquotas. | 366 | * Includes for diskquotas. |
367 | */ | 367 | */ |
368 | #include <linux/quota.h> | 368 | #include <linux/quota.h> |
369 | 369 | ||
/**
 * enum positive_aop_returns - aop return codes with specific semantics
 *
 * @AOP_WRITEPAGE_ACTIVATE: Returned by writepage().  Tells the caller
 *			    that page writeback has completed, that the
 *			    page is still locked, and that it should be
 *			    considered active.  The VM uses this hint to
 *			    return the page to the active list -- it won't
 *			    be a candidate for writeback again in the near
 *			    future.  Other callers must be careful to
 *			    unlock the page if they get this return.
 *
 * @AOP_TRUNCATED_PAGE: Returned by readpage().  The AOP method that was
 *			handed a locked page has unlocked it and the page
 *			might have been truncated.  The caller should back
 *			up to acquiring a new page and try again; if it
 *			held a page reference, it should drop it before
 *			retrying.  The aop will be taking reasonable
 *			precautions not to livelock.
 *
 * address_space_operation functions return these large constants to
 * indicate special semantics to the caller.  They are much larger than the
 * number of bytes in a page, which leaves room for functions that return
 * the number of bytes operated on in a given page.
 */

enum positive_aop_returns {
	AOP_WRITEPAGE_ACTIVATE	= 0x80000,
	AOP_TRUNCATED_PAGE	= 0x80001,
};
400 | 400 | ||
/* will not do a short write */
#define AOP_FLAG_UNINTERRUPTIBLE	0x0001
/* called from cont_expand */
#define AOP_FLAG_CONT_EXPAND		0x0002
403 | 403 | ||
/*
 * oh the beauties of C type declarations.
 *
 * These are named before use so that the declarations below can refer
 * to them through pointers.
 */
struct address_space;
struct page;
struct writeback_control;
410 | 410 | ||
/*
 * Cursor over an iovec array.  iov_iter_init() fills in all four members
 * and iov_iter_count() reports ->count.
 */
struct iov_iter {
	const struct iovec	*iov;		/* segment array */
	unsigned long		nr_segs;	/* number of segments */
	size_t			iov_offset;	/* set to 0 by iov_iter_init() */
	size_t			count;		/* value returned by iov_iter_count() */
};
417 | 417 | ||
/* iov_iter helpers; only declared here, the definitions are not in this header. */
size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i,
				      unsigned long offset, size_t bytes);
size_t iov_iter_copy_from_user(struct page *page, struct iov_iter *i,
			       unsigned long offset, size_t bytes);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(struct iov_iter *i);
425 | 425 | ||
426 | static inline void iov_iter_init(struct iov_iter *i, | 426 | static inline void iov_iter_init(struct iov_iter *i, |
427 | const struct iovec *iov, unsigned long nr_segs, | 427 | const struct iovec *iov, unsigned long nr_segs, |
428 | size_t count, size_t written) | 428 | size_t count, size_t written) |
429 | { | 429 | { |
430 | i->iov = iov; | 430 | i->iov = iov; |
431 | i->nr_segs = nr_segs; | 431 | i->nr_segs = nr_segs; |
432 | i->iov_offset = 0; | 432 | i->iov_offset = 0; |
433 | i->count = count + written; | 433 | i->count = count + written; |
434 | 434 | ||
435 | iov_iter_advance(i, written); | 435 | iov_iter_advance(i, written); |
436 | } | 436 | } |
437 | 437 | ||
438 | static inline size_t iov_iter_count(struct iov_iter *i) | 438 | static inline size_t iov_iter_count(struct iov_iter *i) |
439 | { | 439 | { |
440 | return i->count; | 440 | return i->count; |
441 | } | 441 | } |
442 | 442 | ||
443 | 443 | ||
444 | struct address_space_operations { | 444 | struct address_space_operations { |
445 | int (*writepage)(struct page *page, struct writeback_control *wbc); | 445 | int (*writepage)(struct page *page, struct writeback_control *wbc); |
446 | int (*readpage)(struct file *, struct page *); | 446 | int (*readpage)(struct file *, struct page *); |
447 | void (*sync_page)(struct page *); | 447 | void (*sync_page)(struct page *); |
448 | 448 | ||
449 | /* Write back some dirty pages from this mapping. */ | 449 | /* Write back some dirty pages from this mapping. */ |
450 | int (*writepages)(struct address_space *, struct writeback_control *); | 450 | int (*writepages)(struct address_space *, struct writeback_control *); |
451 | 451 | ||
452 | /* Set a page dirty. Return true if this dirtied it */ | 452 | /* Set a page dirty. Return true if this dirtied it */ |
453 | int (*set_page_dirty)(struct page *page); | 453 | int (*set_page_dirty)(struct page *page); |
454 | 454 | ||
455 | int (*readpages)(struct file *filp, struct address_space *mapping, | 455 | int (*readpages)(struct file *filp, struct address_space *mapping, |
456 | struct list_head *pages, unsigned nr_pages); | 456 | struct list_head *pages, unsigned nr_pages); |
457 | 457 | ||
458 | /* | 458 | /* |
459 | * ext3 requires that a successful prepare_write() call be followed | 459 | * ext3 requires that a successful prepare_write() call be followed |
460 | * by a commit_write() call - they must be balanced | 460 | * by a commit_write() call - they must be balanced |
461 | */ | 461 | */ |
462 | int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); | 462 | int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); |
463 | int (*commit_write)(struct file *, struct page *, unsigned, unsigned); | 463 | int (*commit_write)(struct file *, struct page *, unsigned, unsigned); |
464 | 464 | ||
465 | int (*write_begin)(struct file *, struct address_space *mapping, | 465 | int (*write_begin)(struct file *, struct address_space *mapping, |
466 | loff_t pos, unsigned len, unsigned flags, | 466 | loff_t pos, unsigned len, unsigned flags, |
467 | struct page **pagep, void **fsdata); | 467 | struct page **pagep, void **fsdata); |
468 | int (*write_end)(struct file *, struct address_space *mapping, | 468 | int (*write_end)(struct file *, struct address_space *mapping, |
469 | loff_t pos, unsigned len, unsigned copied, | 469 | loff_t pos, unsigned len, unsigned copied, |
470 | struct page *page, void *fsdata); | 470 | struct page *page, void *fsdata); |
471 | 471 | ||
472 | /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ | 472 | /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ |
473 | sector_t (*bmap)(struct address_space *, sector_t); | 473 | sector_t (*bmap)(struct address_space *, sector_t); |
474 | void (*invalidatepage) (struct page *, unsigned long); | 474 | void (*invalidatepage) (struct page *, unsigned long); |
475 | int (*releasepage) (struct page *, gfp_t); | 475 | int (*releasepage) (struct page *, gfp_t); |
476 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 476 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
477 | loff_t offset, unsigned long nr_segs); | 477 | loff_t offset, unsigned long nr_segs); |
478 | struct page* (*get_xip_page)(struct address_space *, sector_t, | 478 | struct page* (*get_xip_page)(struct address_space *, sector_t, |
479 | int); | 479 | int); |
480 | /* migrate the contents of a page to the specified target */ | 480 | /* migrate the contents of a page to the specified target */ |
481 | int (*migratepage) (struct address_space *, | 481 | int (*migratepage) (struct address_space *, |
482 | struct page *, struct page *); | 482 | struct page *, struct page *); |
483 | int (*launder_page) (struct page *); | 483 | int (*launder_page) (struct page *); |
484 | }; | 484 | }; |
485 | 485 | ||
486 | /* | 486 | /* |
487 | * pagecache_write_begin/pagecache_write_end must be used by general code | 487 | * pagecache_write_begin/pagecache_write_end must be used by general code |
488 | * to write into the pagecache. | 488 | * to write into the pagecache. |
489 | */ | 489 | */ |
490 | int pagecache_write_begin(struct file *, struct address_space *mapping, | 490 | int pagecache_write_begin(struct file *, struct address_space *mapping, |
491 | loff_t pos, unsigned len, unsigned flags, | 491 | loff_t pos, unsigned len, unsigned flags, |
492 | struct page **pagep, void **fsdata); | 492 | struct page **pagep, void **fsdata); |
493 | 493 | ||
494 | int pagecache_write_end(struct file *, struct address_space *mapping, | 494 | int pagecache_write_end(struct file *, struct address_space *mapping, |
495 | loff_t pos, unsigned len, unsigned copied, | 495 | loff_t pos, unsigned len, unsigned copied, |
496 | struct page *page, void *fsdata); | 496 | struct page *page, void *fsdata); |
497 | 497 | ||
498 | struct backing_dev_info; | 498 | struct backing_dev_info; |
499 | struct address_space { | 499 | struct address_space { |
500 | struct inode *host; /* owner: inode, block_device */ | 500 | struct inode *host; /* owner: inode, block_device */ |
501 | struct radix_tree_root page_tree; /* radix tree of all pages */ | 501 | struct radix_tree_root page_tree; /* radix tree of all pages */ |
502 | rwlock_t tree_lock; /* and rwlock protecting it */ | 502 | rwlock_t tree_lock; /* and rwlock protecting it */ |
503 | unsigned int i_mmap_writable;/* count VM_SHARED mappings */ | 503 | unsigned int i_mmap_writable;/* count VM_SHARED mappings */ |
504 | struct prio_tree_root i_mmap; /* tree of private and shared mappings */ | 504 | struct prio_tree_root i_mmap; /* tree of private and shared mappings */ |
505 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ | 505 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ |
506 | spinlock_t i_mmap_lock; /* protect tree, count, list */ | 506 | spinlock_t i_mmap_lock; /* protect tree, count, list */ |
507 | unsigned int truncate_count; /* Cover race condition with truncate */ | 507 | unsigned int truncate_count; /* Cover race condition with truncate */ |
508 | unsigned long nrpages; /* number of total pages */ | 508 | unsigned long nrpages; /* number of total pages */ |
509 | pgoff_t writeback_index;/* writeback starts here */ | 509 | pgoff_t writeback_index;/* writeback starts here */ |
510 | const struct address_space_operations *a_ops; /* methods */ | 510 | const struct address_space_operations *a_ops; /* methods */ |
511 | unsigned long flags; /* error bits/gfp mask */ | 511 | unsigned long flags; /* error bits/gfp mask */ |
512 | struct backing_dev_info *backing_dev_info; /* device readahead, etc */ | 512 | struct backing_dev_info *backing_dev_info; /* device readahead, etc */ |
513 | spinlock_t private_lock; /* for use by the address_space */ | 513 | spinlock_t private_lock; /* for use by the address_space */ |
514 | struct list_head private_list; /* ditto */ | 514 | struct list_head private_list; /* ditto */ |
515 | struct address_space *assoc_mapping; /* ditto */ | 515 | struct address_space *assoc_mapping; /* ditto */ |
516 | } __attribute__((aligned(sizeof(long)))); | 516 | } __attribute__((aligned(sizeof(long)))); |
517 | /* | 517 | /* |
518 | * On most architectures that alignment is already the case; but | 518 | * On most architectures that alignment is already the case; but |
519 | * must be enforced here for CRIS, to let the least signficant bit | 519 | * must be enforced here for CRIS, to let the least signficant bit |
520 | * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. | 520 | * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. |
521 | */ | 521 | */ |
522 | 522 | ||
523 | struct block_device { | 523 | struct block_device { |
524 | dev_t bd_dev; /* not a kdev_t - it's a search key */ | 524 | dev_t bd_dev; /* not a kdev_t - it's a search key */ |
525 | struct inode * bd_inode; /* will die */ | 525 | struct inode * bd_inode; /* will die */ |
526 | int bd_openers; | 526 | int bd_openers; |
527 | struct mutex bd_mutex; /* open/close mutex */ | 527 | struct mutex bd_mutex; /* open/close mutex */ |
528 | struct semaphore bd_mount_sem; | 528 | struct semaphore bd_mount_sem; |
529 | struct list_head bd_inodes; | 529 | struct list_head bd_inodes; |
530 | void * bd_holder; | 530 | void * bd_holder; |
531 | int bd_holders; | 531 | int bd_holders; |
532 | #ifdef CONFIG_SYSFS | 532 | #ifdef CONFIG_SYSFS |
533 | struct list_head bd_holder_list; | 533 | struct list_head bd_holder_list; |
534 | #endif | 534 | #endif |
535 | struct block_device * bd_contains; | 535 | struct block_device * bd_contains; |
536 | unsigned bd_block_size; | 536 | unsigned bd_block_size; |
537 | struct hd_struct * bd_part; | 537 | struct hd_struct * bd_part; |
538 | /* number of times partitions within this device have been opened. */ | 538 | /* number of times partitions within this device have been opened. */ |
539 | unsigned bd_part_count; | 539 | unsigned bd_part_count; |
540 | int bd_invalidated; | 540 | int bd_invalidated; |
541 | struct gendisk * bd_disk; | 541 | struct gendisk * bd_disk; |
542 | struct list_head bd_list; | 542 | struct list_head bd_list; |
543 | struct backing_dev_info *bd_inode_backing_dev_info; | 543 | struct backing_dev_info *bd_inode_backing_dev_info; |
544 | /* | 544 | /* |
545 | * Private data. You must have bd_claim'ed the block_device | 545 | * Private data. You must have bd_claim'ed the block_device |
546 | * to use this. NOTE: bd_claim allows an owner to claim | 546 | * to use this. NOTE: bd_claim allows an owner to claim |
547 | * the same device multiple times, the owner must take special | 547 | * the same device multiple times, the owner must take special |
548 | * care to not mess up bd_private for that case. | 548 | * care to not mess up bd_private for that case. |
549 | */ | 549 | */ |
550 | unsigned long bd_private; | 550 | unsigned long bd_private; |
551 | }; | 551 | }; |
552 | 552 | ||
/*
 * Radix-tree tags used within the pagecache radix trees: one marks
 * dirty pages, the other marks pages under writeback.
 */
#define PAGECACHE_TAG_DIRTY		0
#define PAGECACHE_TAG_WRITEBACK		1
559 | 559 | ||
/* NOTE(review): @tag is presumably one of the PAGECACHE_TAG_* values - confirm at the definition. */
int mapping_tagged(struct address_space *mapping, int tag);
561 | 561 | ||
562 | /* | 562 | /* |
563 | * Might pages of this file be mapped into userspace? | 563 | * Might pages of this file be mapped into userspace? |
564 | */ | 564 | */ |
565 | static inline int mapping_mapped(struct address_space *mapping) | 565 | static inline int mapping_mapped(struct address_space *mapping) |
566 | { | 566 | { |
567 | return !prio_tree_empty(&mapping->i_mmap) || | 567 | return !prio_tree_empty(&mapping->i_mmap) || |
568 | !list_empty(&mapping->i_mmap_nonlinear); | 568 | !list_empty(&mapping->i_mmap_nonlinear); |
569 | } | 569 | } |
570 | 570 | ||
571 | /* | 571 | /* |
572 | * Might pages of this file have been modified in userspace? | 572 | * Might pages of this file have been modified in userspace? |
573 | * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff | 573 | * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff |
574 | * marks vma as VM_SHARED if it is shared, and the file was opened for | 574 | * marks vma as VM_SHARED if it is shared, and the file was opened for |
575 | * writing i.e. vma may be mprotected writable even if now readonly. | 575 | * writing i.e. vma may be mprotected writable even if now readonly. |
576 | */ | 576 | */ |
577 | static inline int mapping_writably_mapped(struct address_space *mapping) | 577 | static inline int mapping_writably_mapped(struct address_space *mapping) |
578 | { | 578 | { |
579 | return mapping->i_mmap_writable != 0; | 579 | return mapping->i_mmap_writable != 0; |
580 | } | 580 | } |
581 | 581 | ||
/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 *
 * On 32-bit SMP builds __NEED_I_SIZE_ORDERED is defined and
 * i_size_ordered_init(inode) initialises inode->i_size_seqcount via
 * seqcount_init().  On every other configuration the macro expands to an
 * empty statement and does not evaluate its argument.
 *
 * The argument is parenthesised so that any pointer expression, not just
 * a plain identifier, binds correctly before the "->".
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&(inode)->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif
592 | 592 | ||
593 | struct inode { | 593 | struct inode { |
594 | struct hlist_node i_hash; | 594 | struct hlist_node i_hash; |
595 | struct list_head i_list; | 595 | struct list_head i_list; |
596 | struct list_head i_sb_list; | 596 | struct list_head i_sb_list; |
597 | struct list_head i_dentry; | 597 | struct list_head i_dentry; |
598 | unsigned long i_ino; | 598 | unsigned long i_ino; |
599 | atomic_t i_count; | 599 | atomic_t i_count; |
600 | unsigned int i_nlink; | 600 | unsigned int i_nlink; |
601 | uid_t i_uid; | 601 | uid_t i_uid; |
602 | gid_t i_gid; | 602 | gid_t i_gid; |
603 | dev_t i_rdev; | 603 | dev_t i_rdev; |
604 | u64 i_version; | 604 | u64 i_version; |
605 | loff_t i_size; | 605 | loff_t i_size; |
606 | #ifdef __NEED_I_SIZE_ORDERED | 606 | #ifdef __NEED_I_SIZE_ORDERED |
607 | seqcount_t i_size_seqcount; | 607 | seqcount_t i_size_seqcount; |
608 | #endif | 608 | #endif |
609 | struct timespec i_atime; | 609 | struct timespec i_atime; |
610 | struct timespec i_mtime; | 610 | struct timespec i_mtime; |
611 | struct timespec i_ctime; | 611 | struct timespec i_ctime; |
612 | unsigned int i_blkbits; | 612 | unsigned int i_blkbits; |
613 | blkcnt_t i_blocks; | 613 | blkcnt_t i_blocks; |
614 | unsigned short i_bytes; | 614 | unsigned short i_bytes; |
615 | umode_t i_mode; | 615 | umode_t i_mode; |
616 | spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ | 616 | spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ |
617 | struct mutex i_mutex; | 617 | struct mutex i_mutex; |
618 | struct rw_semaphore i_alloc_sem; | 618 | struct rw_semaphore i_alloc_sem; |
619 | const struct inode_operations *i_op; | 619 | const struct inode_operations *i_op; |
620 | const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ | 620 | const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ |
621 | struct super_block *i_sb; | 621 | struct super_block *i_sb; |
622 | struct file_lock *i_flock; | 622 | struct file_lock *i_flock; |
623 | struct address_space *i_mapping; | 623 | struct address_space *i_mapping; |
624 | struct address_space i_data; | 624 | struct address_space i_data; |
625 | #ifdef CONFIG_QUOTA | 625 | #ifdef CONFIG_QUOTA |
626 | struct dquot *i_dquot[MAXQUOTAS]; | 626 | struct dquot *i_dquot[MAXQUOTAS]; |
627 | #endif | 627 | #endif |
628 | struct list_head i_devices; | 628 | struct list_head i_devices; |
629 | union { | 629 | union { |
630 | struct pipe_inode_info *i_pipe; | 630 | struct pipe_inode_info *i_pipe; |
631 | struct block_device *i_bdev; | 631 | struct block_device *i_bdev; |
632 | struct cdev *i_cdev; | 632 | struct cdev *i_cdev; |
633 | }; | 633 | }; |
634 | int i_cindex; | 634 | int i_cindex; |
635 | 635 | ||
636 | __u32 i_generation; | 636 | __u32 i_generation; |
637 | 637 | ||
638 | #ifdef CONFIG_DNOTIFY | 638 | #ifdef CONFIG_DNOTIFY |
639 | unsigned long i_dnotify_mask; /* Directory notify events */ | 639 | unsigned long i_dnotify_mask; /* Directory notify events */ |
640 | struct dnotify_struct *i_dnotify; /* for directory notifications */ | 640 | struct dnotify_struct *i_dnotify; /* for directory notifications */ |
641 | #endif | 641 | #endif |
642 | 642 | ||
643 | #ifdef CONFIG_INOTIFY | 643 | #ifdef CONFIG_INOTIFY |
644 | struct list_head inotify_watches; /* watches on this inode */ | 644 | struct list_head inotify_watches; /* watches on this inode */ |
645 | struct mutex inotify_mutex; /* protects the watches list */ | 645 | struct mutex inotify_mutex; /* protects the watches list */ |
646 | #endif | 646 | #endif |
647 | 647 | ||
648 | unsigned long i_state; | 648 | unsigned long i_state; |
649 | unsigned long dirtied_when; /* jiffies of first dirtying */ | 649 | unsigned long dirtied_when; /* jiffies of first dirtying */ |
650 | 650 | ||
651 | unsigned int i_flags; | 651 | unsigned int i_flags; |
652 | 652 | ||
653 | atomic_t i_writecount; | 653 | atomic_t i_writecount; |
654 | #ifdef CONFIG_SECURITY | 654 | #ifdef CONFIG_SECURITY |
655 | void *i_security; | 655 | void *i_security; |
656 | #endif | 656 | #endif |
657 | void *i_private; /* fs or device private pointer */ | 657 | void *i_private; /* fs or device private pointer */ |
658 | }; | 658 | }; |
659 | 659 | ||
660 | /* | 660 | /* |
661 | * inode->i_mutex nesting subclasses for the lock validator: | 661 | * inode->i_mutex nesting subclasses for the lock validator: |
662 | * | 662 | * |
663 | * 0: the object of the current VFS operation | 663 | * 0: the object of the current VFS operation |
664 | * 1: parent | 664 | * 1: parent |
665 | * 2: child/target | 665 | * 2: child/target |
666 | * 3: quota file | 666 | * 3: quota file |
667 | * | 667 | * |
668 | * The locking order between these classes is | 668 | * The locking order between these classes is |
669 | * parent -> child -> normal -> xattr -> quota | 669 | * parent -> child -> normal -> xattr -> quota |
670 | */ | 670 | */ |
671 | enum inode_i_mutex_lock_class | 671 | enum inode_i_mutex_lock_class |
672 | { | 672 | { |
673 | I_MUTEX_NORMAL, | 673 | I_MUTEX_NORMAL, |
674 | I_MUTEX_PARENT, | 674 | I_MUTEX_PARENT, |
675 | I_MUTEX_CHILD, | 675 | I_MUTEX_CHILD, |
676 | I_MUTEX_XATTR, | 676 | I_MUTEX_XATTR, |
677 | I_MUTEX_QUOTA | 677 | I_MUTEX_QUOTA |
678 | }; | 678 | }; |
679 | 679 | ||
680 | extern void inode_double_lock(struct inode *inode1, struct inode *inode2); | 680 | extern void inode_double_lock(struct inode *inode1, struct inode *inode2); |
681 | extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); | 681 | extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); |
682 | 682 | ||
683 | /* | 683 | /* |
684 | * NOTE: in a 32bit arch with a preemptable kernel and | 684 | * NOTE: in a 32bit arch with a preemptable kernel and |
685 | * an UP compile the i_size_read/write must be atomic | 685 | * an UP compile the i_size_read/write must be atomic |
686 | * with respect to the local cpu (unlike with preempt disabled), | 686 | * with respect to the local cpu (unlike with preempt disabled), |
687 | * but they don't need to be atomic with respect to other cpus like in | 687 | * but they don't need to be atomic with respect to other cpus like in |
688 | * true SMP (so they need to either locally disable irq around | 688 | * true SMP (so they need to either locally disable irq around |
689 | * the read or, for example on x86, they can still be implemented as a | 689 | * the read or, for example on x86, they can still be implemented as a |
690 | * cmpxchg8b without the need of the lock prefix). For SMP compiles | 690 | * cmpxchg8b without the need of the lock prefix). For SMP compiles |
691 | * and 64bit archs it makes no difference if preempt is enabled or not. | 691 | * and 64bit archs it makes no difference if preempt is enabled or not. |
692 | */ | 692 | */ |
693 | static inline loff_t i_size_read(const struct inode *inode) | 693 | static inline loff_t i_size_read(const struct inode *inode) |
694 | { | 694 | { |
695 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) | 695 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) |
696 | loff_t i_size; | 696 | loff_t i_size; |
697 | unsigned int seq; | 697 | unsigned int seq; |
698 | 698 | ||
699 | do { | 699 | do { |
700 | seq = read_seqcount_begin(&inode->i_size_seqcount); | 700 | seq = read_seqcount_begin(&inode->i_size_seqcount); |
701 | i_size = inode->i_size; | 701 | i_size = inode->i_size; |
702 | } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); | 702 | } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); |
703 | return i_size; | 703 | return i_size; |
704 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) | 704 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) |
705 | loff_t i_size; | 705 | loff_t i_size; |
706 | 706 | ||
707 | preempt_disable(); | 707 | preempt_disable(); |
708 | i_size = inode->i_size; | 708 | i_size = inode->i_size; |
709 | preempt_enable(); | 709 | preempt_enable(); |
710 | return i_size; | 710 | return i_size; |
711 | #else | 711 | #else |
712 | return inode->i_size; | 712 | return inode->i_size; |
713 | #endif | 713 | #endif |
714 | } | 714 | } |
715 | 715 | ||
716 | /* | 716 | /* |
717 | * NOTE: unlike i_size_read(), i_size_write() does need locking around it | 717 | * NOTE: unlike i_size_read(), i_size_write() does need locking around it |
718 | * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount | 718 | * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount |
719 | * can be lost, resulting in subsequent i_size_read() calls spinning forever. | 719 | * can be lost, resulting in subsequent i_size_read() calls spinning forever. |
720 | */ | 720 | */ |
721 | static inline void i_size_write(struct inode *inode, loff_t i_size) | 721 | static inline void i_size_write(struct inode *inode, loff_t i_size) |
722 | { | 722 | { |
723 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) | 723 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) |
724 | write_seqcount_begin(&inode->i_size_seqcount); | 724 | write_seqcount_begin(&inode->i_size_seqcount); |
725 | inode->i_size = i_size; | 725 | inode->i_size = i_size; |
726 | write_seqcount_end(&inode->i_size_seqcount); | 726 | write_seqcount_end(&inode->i_size_seqcount); |
727 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) | 727 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) |
728 | preempt_disable(); | 728 | preempt_disable(); |
729 | inode->i_size = i_size; | 729 | inode->i_size = i_size; |
730 | preempt_enable(); | 730 | preempt_enable(); |
731 | #else | 731 | #else |
732 | inode->i_size = i_size; | 732 | inode->i_size = i_size; |
733 | #endif | 733 | #endif |
734 | } | 734 | } |
735 | 735 | ||
736 | static inline unsigned iminor(const struct inode *inode) | 736 | static inline unsigned iminor(const struct inode *inode) |
737 | { | 737 | { |
738 | return MINOR(inode->i_rdev); | 738 | return MINOR(inode->i_rdev); |
739 | } | 739 | } |
740 | 740 | ||
741 | static inline unsigned imajor(const struct inode *inode) | 741 | static inline unsigned imajor(const struct inode *inode) |
742 | { | 742 | { |
743 | return MAJOR(inode->i_rdev); | 743 | return MAJOR(inode->i_rdev); |
744 | } | 744 | } |
745 | 745 | ||
746 | extern struct block_device *I_BDEV(struct inode *inode); | 746 | extern struct block_device *I_BDEV(struct inode *inode); |
747 | 747 | ||
748 | struct fown_struct { | 748 | struct fown_struct { |
749 | rwlock_t lock; /* protects pid, uid, euid fields */ | 749 | rwlock_t lock; /* protects pid, uid, euid fields */ |
750 | struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ | 750 | struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ |
751 | enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ | 751 | enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ |
752 | uid_t uid, euid; /* uid/euid of process setting the owner */ | 752 | uid_t uid, euid; /* uid/euid of process setting the owner */ |
753 | int signum; /* posix.1b rt signal to be delivered on IO */ | 753 | int signum; /* posix.1b rt signal to be delivered on IO */ |
754 | }; | 754 | }; |
755 | 755 | ||
756 | /* | 756 | /* |
757 | * Track a single file's readahead state | 757 | * Track a single file's readahead state |
758 | */ | 758 | */ |
759 | struct file_ra_state { | 759 | struct file_ra_state { |
760 | pgoff_t start; /* where readahead started */ | 760 | pgoff_t start; /* where readahead started */ |
761 | unsigned int size; /* # of readahead pages */ | 761 | unsigned int size; /* # of readahead pages */ |
762 | unsigned int async_size; /* do asynchronous readahead when | 762 | unsigned int async_size; /* do asynchronous readahead when |
763 | there are only # of pages ahead */ | 763 | there are only # of pages ahead */ |
764 | 764 | ||
765 | unsigned int ra_pages; /* Maximum readahead window */ | 765 | unsigned int ra_pages; /* Maximum readahead window */ |
766 | int mmap_miss; /* Cache miss stat for mmap accesses */ | 766 | int mmap_miss; /* Cache miss stat for mmap accesses */ |
767 | loff_t prev_pos; /* Cache last read() position */ | 767 | loff_t prev_pos; /* Cache last read() position */ |
768 | }; | 768 | }; |
769 | 769 | ||
770 | /* | 770 | /* |
771 | * Check if @index falls in the readahead window. | 771 | * Check if @index falls in the readahead window. |
772 | */ | 772 | */ |
773 | static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) | 773 | static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) |
774 | { | 774 | { |
775 | return (index >= ra->start && | 775 | return (index >= ra->start && |
776 | index < ra->start + ra->size); | 776 | index < ra->start + ra->size); |
777 | } | 777 | } |
778 | 778 | ||
779 | struct file { | 779 | struct file { |
780 | /* | 780 | /* |
781 | * fu_list becomes invalid after file_free is called and queued via | 781 | * fu_list becomes invalid after file_free is called and queued via |
782 | * fu_rcuhead for RCU freeing | 782 | * fu_rcuhead for RCU freeing |
783 | */ | 783 | */ |
784 | union { | 784 | union { |
785 | struct list_head fu_list; | 785 | struct list_head fu_list; |
786 | struct rcu_head fu_rcuhead; | 786 | struct rcu_head fu_rcuhead; |
787 | } f_u; | 787 | } f_u; |
788 | struct path f_path; | 788 | struct path f_path; |
789 | #define f_dentry f_path.dentry | 789 | #define f_dentry f_path.dentry |
790 | #define f_vfsmnt f_path.mnt | 790 | #define f_vfsmnt f_path.mnt |
791 | const struct file_operations *f_op; | 791 | const struct file_operations *f_op; |
792 | atomic_t f_count; | 792 | atomic_t f_count; |
793 | unsigned int f_flags; | 793 | unsigned int f_flags; |
794 | mode_t f_mode; | 794 | mode_t f_mode; |
795 | loff_t f_pos; | 795 | loff_t f_pos; |
796 | struct fown_struct f_owner; | 796 | struct fown_struct f_owner; |
797 | unsigned int f_uid, f_gid; | 797 | unsigned int f_uid, f_gid; |
798 | struct file_ra_state f_ra; | 798 | struct file_ra_state f_ra; |
799 | 799 | ||
800 | u64 f_version; | 800 | u64 f_version; |
801 | #ifdef CONFIG_SECURITY | 801 | #ifdef CONFIG_SECURITY |
802 | void *f_security; | 802 | void *f_security; |
803 | #endif | 803 | #endif |
804 | /* needed for tty driver, and maybe others */ | 804 | /* needed for tty driver, and maybe others */ |
805 | void *private_data; | 805 | void *private_data; |
806 | 806 | ||
807 | #ifdef CONFIG_EPOLL | 807 | #ifdef CONFIG_EPOLL |
808 | /* Used by fs/eventpoll.c to link all the hooks to this file */ | 808 | /* Used by fs/eventpoll.c to link all the hooks to this file */ |
809 | struct list_head f_ep_links; | 809 | struct list_head f_ep_links; |
810 | spinlock_t f_ep_lock; | 810 | spinlock_t f_ep_lock; |
811 | #endif /* #ifdef CONFIG_EPOLL */ | 811 | #endif /* #ifdef CONFIG_EPOLL */ |
812 | struct address_space *f_mapping; | 812 | struct address_space *f_mapping; |
813 | }; | 813 | }; |
814 | extern spinlock_t files_lock; | 814 | extern spinlock_t files_lock; |
815 | #define file_list_lock() spin_lock(&files_lock); | 815 | #define file_list_lock() spin_lock(&files_lock); |
816 | #define file_list_unlock() spin_unlock(&files_lock); | 816 | #define file_list_unlock() spin_unlock(&files_lock); |
817 | 817 | ||
818 | #define get_file(x) atomic_inc(&(x)->f_count) | 818 | #define get_file(x) atomic_inc(&(x)->f_count) |
819 | #define file_count(x) atomic_read(&(x)->f_count) | 819 | #define file_count(x) atomic_read(&(x)->f_count) |
820 | 820 | ||
821 | #define MAX_NON_LFS ((1UL<<31) - 1) | 821 | #define MAX_NON_LFS ((1UL<<31) - 1) |
822 | 822 | ||
823 | /* Page cache limit. The filesystems should put that into their s_maxbytes | 823 | /* Page cache limit. The filesystems should put that into their s_maxbytes |
824 | limits, otherwise bad things can happen in VM. */ | 824 | limits, otherwise bad things can happen in VM. */ |
825 | #if BITS_PER_LONG==32 | 825 | #if BITS_PER_LONG==32 |
826 | #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) | 826 | #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) |
827 | #elif BITS_PER_LONG==64 | 827 | #elif BITS_PER_LONG==64 |
828 | #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL | 828 | #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL |
829 | #endif | 829 | #endif |
830 | 830 | ||
831 | #define FL_POSIX 1 | 831 | #define FL_POSIX 1 |
832 | #define FL_FLOCK 2 | 832 | #define FL_FLOCK 2 |
833 | #define FL_ACCESS 8 /* not trying to lock, just looking */ | 833 | #define FL_ACCESS 8 /* not trying to lock, just looking */ |
834 | #define FL_EXISTS 16 /* when unlocking, test for existence */ | 834 | #define FL_EXISTS 16 /* when unlocking, test for existence */ |
835 | #define FL_LEASE 32 /* lease held on this file */ | 835 | #define FL_LEASE 32 /* lease held on this file */ |
836 | #define FL_CLOSE 64 /* unlock on close */ | 836 | #define FL_CLOSE 64 /* unlock on close */ |
837 | #define FL_SLEEP 128 /* A blocking lock */ | 837 | #define FL_SLEEP 128 /* A blocking lock */ |
838 | 838 | ||
839 | /* | 839 | /* |
840 | * The POSIX file lock owner is determined by | 840 | * The POSIX file lock owner is determined by |
841 | * the "struct files_struct" in the thread group | 841 | * the "struct files_struct" in the thread group |
842 | * (or NULL for no owner - BSD locks). | 842 | * (or NULL for no owner - BSD locks). |
843 | * | 843 | * |
844 | * Lockd stuffs a "host" pointer into this. | 844 | * Lockd stuffs a "host" pointer into this. |
845 | */ | 845 | */ |
846 | typedef struct files_struct *fl_owner_t; | 846 | typedef struct files_struct *fl_owner_t; |
847 | 847 | ||
848 | struct file_lock_operations { | 848 | struct file_lock_operations { |
849 | void (*fl_insert)(struct file_lock *); /* lock insertion callback */ | 849 | void (*fl_insert)(struct file_lock *); /* lock insertion callback */ |
850 | void (*fl_remove)(struct file_lock *); /* lock removal callback */ | 850 | void (*fl_remove)(struct file_lock *); /* lock removal callback */ |
851 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 851 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
852 | void (*fl_release_private)(struct file_lock *); | 852 | void (*fl_release_private)(struct file_lock *); |
853 | }; | 853 | }; |
854 | 854 | ||
855 | struct lock_manager_operations { | 855 | struct lock_manager_operations { |
856 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); | 856 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); |
857 | void (*fl_notify)(struct file_lock *); /* unblock callback */ | 857 | void (*fl_notify)(struct file_lock *); /* unblock callback */ |
858 | int (*fl_grant)(struct file_lock *, struct file_lock *, int); | 858 | int (*fl_grant)(struct file_lock *, struct file_lock *, int); |
859 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 859 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
860 | void (*fl_release_private)(struct file_lock *); | 860 | void (*fl_release_private)(struct file_lock *); |
861 | void (*fl_break)(struct file_lock *); | 861 | void (*fl_break)(struct file_lock *); |
862 | int (*fl_mylease)(struct file_lock *, struct file_lock *); | 862 | int (*fl_mylease)(struct file_lock *, struct file_lock *); |
863 | int (*fl_change)(struct file_lock **, int); | 863 | int (*fl_change)(struct file_lock **, int); |
864 | }; | 864 | }; |
865 | 865 | ||
866 | /* that will die - we need it for nfs_lock_info */ | 866 | /* that will die - we need it for nfs_lock_info */ |
867 | #include <linux/nfs_fs_i.h> | 867 | #include <linux/nfs_fs_i.h> |
868 | 868 | ||
869 | struct file_lock { | 869 | struct file_lock { |
870 | struct file_lock *fl_next; /* singly linked list for this inode */ | 870 | struct file_lock *fl_next; /* singly linked list for this inode */ |
871 | struct list_head fl_link; /* doubly linked list of all locks */ | 871 | struct list_head fl_link; /* doubly linked list of all locks */ |
872 | struct list_head fl_block; /* circular list of blocked processes */ | 872 | struct list_head fl_block; /* circular list of blocked processes */ |
873 | fl_owner_t fl_owner; | 873 | fl_owner_t fl_owner; |
874 | unsigned int fl_pid; | 874 | unsigned int fl_pid; |
875 | struct pid *fl_nspid; | 875 | struct pid *fl_nspid; |
876 | wait_queue_head_t fl_wait; | 876 | wait_queue_head_t fl_wait; |
877 | struct file *fl_file; | 877 | struct file *fl_file; |
878 | unsigned char fl_flags; | 878 | unsigned char fl_flags; |
879 | unsigned char fl_type; | 879 | unsigned char fl_type; |
880 | loff_t fl_start; | 880 | loff_t fl_start; |
881 | loff_t fl_end; | 881 | loff_t fl_end; |
882 | 882 | ||
883 | struct fasync_struct * fl_fasync; /* for lease break notifications */ | 883 | struct fasync_struct * fl_fasync; /* for lease break notifications */ |
884 | unsigned long fl_break_time; /* for nonblocking lease breaks */ | 884 | unsigned long fl_break_time; /* for nonblocking lease breaks */ |
885 | 885 | ||
886 | struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ | 886 | struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ |
887 | struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ | 887 | struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ |
888 | union { | 888 | union { |
889 | struct nfs_lock_info nfs_fl; | 889 | struct nfs_lock_info nfs_fl; |
890 | struct nfs4_lock_info nfs4_fl; | 890 | struct nfs4_lock_info nfs4_fl; |
891 | struct { | 891 | struct { |
892 | struct list_head link; /* link in AFS vnode's pending_locks list */ | 892 | struct list_head link; /* link in AFS vnode's pending_locks list */ |
893 | int state; /* state of grant or error if -ve */ | 893 | int state; /* state of grant or error if -ve */ |
894 | } afs; | 894 | } afs; |
895 | } fl_u; | 895 | } fl_u; |
896 | }; | 896 | }; |
897 | 897 | ||
898 | /* The following constant reflects the upper bound of the file/locking space */ | 898 | /* The following constant reflects the upper bound of the file/locking space */ |
899 | #ifndef OFFSET_MAX | 899 | #ifndef OFFSET_MAX |
900 | #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) | 900 | #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) |
901 | #define OFFSET_MAX INT_LIMIT(loff_t) | 901 | #define OFFSET_MAX INT_LIMIT(loff_t) |
902 | #define OFFT_OFFSET_MAX INT_LIMIT(off_t) | 902 | #define OFFT_OFFSET_MAX INT_LIMIT(off_t) |
903 | #endif | 903 | #endif |
904 | 904 | ||
905 | #include <linux/fcntl.h> | 905 | #include <linux/fcntl.h> |
906 | 906 | ||
907 | extern int fcntl_getlk(struct file *, struct flock __user *); | 907 | extern int fcntl_getlk(struct file *, struct flock __user *); |
908 | extern int fcntl_setlk(unsigned int, struct file *, unsigned int, | 908 | extern int fcntl_setlk(unsigned int, struct file *, unsigned int, |
909 | struct flock __user *); | 909 | struct flock __user *); |
910 | 910 | ||
911 | #if BITS_PER_LONG == 32 | 911 | #if BITS_PER_LONG == 32 |
912 | extern int fcntl_getlk64(struct file *, struct flock64 __user *); | 912 | extern int fcntl_getlk64(struct file *, struct flock64 __user *); |
913 | extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, | 913 | extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, |
914 | struct flock64 __user *); | 914 | struct flock64 __user *); |
915 | #endif | 915 | #endif |
916 | 916 | ||
917 | extern void send_sigio(struct fown_struct *fown, int fd, int band); | 917 | extern void send_sigio(struct fown_struct *fown, int fd, int band); |
918 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); | 918 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); |
919 | extern int fcntl_getlease(struct file *filp); | 919 | extern int fcntl_getlease(struct file *filp); |
920 | 920 | ||
921 | /* fs/sync.c */ | 921 | /* fs/sync.c */ |
922 | extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, | 922 | extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, |
923 | loff_t endbyte, unsigned int flags); | 923 | loff_t endbyte, unsigned int flags); |
924 | 924 | ||
925 | /* fs/locks.c */ | 925 | /* fs/locks.c */ |
926 | extern void locks_init_lock(struct file_lock *); | 926 | extern void locks_init_lock(struct file_lock *); |
927 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); | 927 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); |
928 | extern void locks_remove_posix(struct file *, fl_owner_t); | 928 | extern void locks_remove_posix(struct file *, fl_owner_t); |
929 | extern void locks_remove_flock(struct file *); | 929 | extern void locks_remove_flock(struct file *); |
930 | extern void posix_test_lock(struct file *, struct file_lock *); | 930 | extern void posix_test_lock(struct file *, struct file_lock *); |
931 | extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); | 931 | extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); |
932 | extern int posix_lock_file_wait(struct file *, struct file_lock *); | 932 | extern int posix_lock_file_wait(struct file *, struct file_lock *); |
933 | extern int posix_unblock_lock(struct file *, struct file_lock *); | 933 | extern int posix_unblock_lock(struct file *, struct file_lock *); |
934 | extern int vfs_test_lock(struct file *, struct file_lock *); | 934 | extern int vfs_test_lock(struct file *, struct file_lock *); |
935 | extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); | 935 | extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); |
936 | extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); | 936 | extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); |
937 | extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); | 937 | extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); |
938 | extern int __break_lease(struct inode *inode, unsigned int flags); | 938 | extern int __break_lease(struct inode *inode, unsigned int flags); |
939 | extern void lease_get_mtime(struct inode *, struct timespec *time); | 939 | extern void lease_get_mtime(struct inode *, struct timespec *time); |
940 | extern int generic_setlease(struct file *, long, struct file_lock **); | 940 | extern int generic_setlease(struct file *, long, struct file_lock **); |
941 | extern int vfs_setlease(struct file *, long, struct file_lock **); | 941 | extern int vfs_setlease(struct file *, long, struct file_lock **); |
942 | extern int lease_modify(struct file_lock **, int); | 942 | extern int lease_modify(struct file_lock **, int); |
943 | extern int lock_may_read(struct inode *, loff_t start, unsigned long count); | 943 | extern int lock_may_read(struct inode *, loff_t start, unsigned long count); |
944 | extern int lock_may_write(struct inode *, loff_t start, unsigned long count); | 944 | extern int lock_may_write(struct inode *, loff_t start, unsigned long count); |
945 | extern struct seq_operations locks_seq_operations; | 945 | extern struct seq_operations locks_seq_operations; |
946 | 946 | ||
947 | struct fasync_struct { | 947 | struct fasync_struct { |
948 | int magic; | 948 | int magic; |
949 | int fa_fd; | 949 | int fa_fd; |
950 | struct fasync_struct *fa_next; /* singly linked list */ | 950 | struct fasync_struct *fa_next; /* singly linked list */ |
951 | struct file *fa_file; | 951 | struct file *fa_file; |
952 | }; | 952 | }; |
953 | 953 | ||
954 | #define FASYNC_MAGIC 0x4601 | 954 | #define FASYNC_MAGIC 0x4601 |
955 | 955 | ||
956 | /* SMP safe fasync helpers: */ | 956 | /* SMP safe fasync helpers: */ |
957 | extern int fasync_helper(int, struct file *, int, struct fasync_struct **); | 957 | extern int fasync_helper(int, struct file *, int, struct fasync_struct **); |
958 | /* can be called from interrupts */ | 958 | /* can be called from interrupts */ |
959 | extern void kill_fasync(struct fasync_struct **, int, int); | 959 | extern void kill_fasync(struct fasync_struct **, int, int); |
960 | /* only for net: no internal synchronization */ | 960 | /* only for net: no internal synchronization */ |
961 | extern void __kill_fasync(struct fasync_struct *, int, int); | 961 | extern void __kill_fasync(struct fasync_struct *, int, int); |
962 | 962 | ||
963 | extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); | 963 | extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); |
964 | extern int f_setown(struct file *filp, unsigned long arg, int force); | 964 | extern int f_setown(struct file *filp, unsigned long arg, int force); |
965 | extern void f_delown(struct file *filp); | 965 | extern void f_delown(struct file *filp); |
966 | extern pid_t f_getown(struct file *filp); | 966 | extern pid_t f_getown(struct file *filp); |
967 | extern int send_sigurg(struct fown_struct *fown); | 967 | extern int send_sigurg(struct fown_struct *fown); |
968 | 968 | ||
969 | /* | 969 | /* |
970 | * Umount options | 970 | * Umount options |
971 | */ | 971 | */ |
972 | 972 | ||
973 | #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */ | 973 | #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */ |
974 | #define MNT_DETACH 0x00000002 /* Just detach from the tree */ | 974 | #define MNT_DETACH 0x00000002 /* Just detach from the tree */ |
975 | #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ | 975 | #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ |
976 | 976 | ||
977 | extern struct list_head super_blocks; | 977 | extern struct list_head super_blocks; |
978 | extern spinlock_t sb_lock; | 978 | extern spinlock_t sb_lock; |
979 | 979 | ||
980 | #define S_BIAS (1<<30) | 980 | #define S_BIAS (1<<30) |
981 | struct super_block { | 981 | struct super_block { |
982 | struct list_head s_list; /* Keep this first */ | 982 | struct list_head s_list; /* Keep this first */ |
983 | dev_t s_dev; /* search index; _not_ kdev_t */ | 983 | dev_t s_dev; /* search index; _not_ kdev_t */ |
984 | unsigned long s_blocksize; | 984 | unsigned long s_blocksize; |
985 | unsigned char s_blocksize_bits; | 985 | unsigned char s_blocksize_bits; |
986 | unsigned char s_dirt; | 986 | unsigned char s_dirt; |
987 | unsigned long long s_maxbytes; /* Max file size */ | 987 | unsigned long long s_maxbytes; /* Max file size */ |
988 | struct file_system_type *s_type; | 988 | struct file_system_type *s_type; |
989 | const struct super_operations *s_op; | 989 | const struct super_operations *s_op; |
990 | struct dquot_operations *dq_op; | 990 | struct dquot_operations *dq_op; |
991 | struct quotactl_ops *s_qcop; | 991 | struct quotactl_ops *s_qcop; |
992 | const struct export_operations *s_export_op; | 992 | const struct export_operations *s_export_op; |
993 | unsigned long s_flags; | 993 | unsigned long s_flags; |
994 | unsigned long s_magic; | 994 | unsigned long s_magic; |
995 | struct dentry *s_root; | 995 | struct dentry *s_root; |
996 | struct rw_semaphore s_umount; | 996 | struct rw_semaphore s_umount; |
997 | struct mutex s_lock; | 997 | struct mutex s_lock; |
998 | int s_count; | 998 | int s_count; |
999 | int s_syncing; | 999 | int s_syncing; |
1000 | int s_need_sync_fs; | 1000 | int s_need_sync_fs; |
1001 | atomic_t s_active; | 1001 | atomic_t s_active; |
1002 | #ifdef CONFIG_SECURITY | 1002 | #ifdef CONFIG_SECURITY |
1003 | void *s_security; | 1003 | void *s_security; |
1004 | #endif | 1004 | #endif |
1005 | struct xattr_handler **s_xattr; | 1005 | struct xattr_handler **s_xattr; |
1006 | 1006 | ||
1007 | struct list_head s_inodes; /* all inodes */ | 1007 | struct list_head s_inodes; /* all inodes */ |
1008 | struct list_head s_dirty; /* dirty inodes */ | 1008 | struct list_head s_dirty; /* dirty inodes */ |
1009 | struct list_head s_io; /* parked for writeback */ | 1009 | struct list_head s_io; /* parked for writeback */ |
1010 | struct list_head s_more_io; /* parked for more writeback */ | 1010 | struct list_head s_more_io; /* parked for more writeback */ |
1011 | struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ | 1011 | struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ |
1012 | struct list_head s_files; | 1012 | struct list_head s_files; |
1013 | 1013 | ||
1014 | struct block_device *s_bdev; | 1014 | struct block_device *s_bdev; |
1015 | struct mtd_info *s_mtd; | 1015 | struct mtd_info *s_mtd; |
1016 | struct list_head s_instances; | 1016 | struct list_head s_instances; |
1017 | struct quota_info s_dquot; /* Diskquota specific options */ | 1017 | struct quota_info s_dquot; /* Diskquota specific options */ |
1018 | 1018 | ||
1019 | int s_frozen; | 1019 | int s_frozen; |
1020 | wait_queue_head_t s_wait_unfrozen; | 1020 | wait_queue_head_t s_wait_unfrozen; |
1021 | 1021 | ||
1022 | char s_id[32]; /* Informational name */ | 1022 | char s_id[32]; /* Informational name */ |
1023 | 1023 | ||
1024 | void *s_fs_info; /* Filesystem private info */ | 1024 | void *s_fs_info; /* Filesystem private info */ |
1025 | 1025 | ||
1026 | /* | 1026 | /* |
1027 | * The next field is for VFS *only*. No filesystems have any business | 1027 | * The next field is for VFS *only*. No filesystems have any business |
1028 | * even looking at it. You have been warned. | 1028 | * even looking at it. You have been warned. |
1029 | */ | 1029 | */ |
1030 | struct mutex s_vfs_rename_mutex; /* Kludge */ | 1030 | struct mutex s_vfs_rename_mutex; /* Kludge */ |
1031 | 1031 | ||
1032 | /* Granularity of c/m/atime in ns. | 1032 | /* Granularity of c/m/atime in ns. |
1033 | Cannot be worse than a second */ | 1033 | Cannot be worse than a second */ |
1034 | u32 s_time_gran; | 1034 | u32 s_time_gran; |
1035 | 1035 | ||
1036 | /* | 1036 | /* |
1037 | * Filesystem subtype. If non-empty the filesystem type field | 1037 | * Filesystem subtype. If non-empty the filesystem type field |
1038 | * in /proc/mounts will be "type.subtype" | 1038 | * in /proc/mounts will be "type.subtype" |
1039 | */ | 1039 | */ |
1040 | char *s_subtype; | 1040 | char *s_subtype; |
1041 | }; | 1041 | }; |
1042 | 1042 | ||
1043 | extern struct timespec current_fs_time(struct super_block *sb); | 1043 | extern struct timespec current_fs_time(struct super_block *sb); |
1044 | 1044 | ||
1045 | /* | 1045 | /* |
1046 | * Snapshotting support. | 1046 | * Snapshotting support. |
1047 | */ | 1047 | */ |
1048 | enum { | 1048 | enum { |
1049 | SB_UNFROZEN = 0, | 1049 | SB_UNFROZEN = 0, |
1050 | SB_FREEZE_WRITE = 1, | 1050 | SB_FREEZE_WRITE = 1, |
1051 | SB_FREEZE_TRANS = 2, | 1051 | SB_FREEZE_TRANS = 2, |
1052 | }; | 1052 | }; |
1053 | 1053 | ||
1054 | #define vfs_check_frozen(sb, level) \ | 1054 | #define vfs_check_frozen(sb, level) \ |
1055 | wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) | 1055 | wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) |
1056 | 1056 | ||
1057 | #define get_fs_excl() atomic_inc(&current->fs_excl) | 1057 | #define get_fs_excl() atomic_inc(&current->fs_excl) |
1058 | #define put_fs_excl() atomic_dec(&current->fs_excl) | 1058 | #define put_fs_excl() atomic_dec(&current->fs_excl) |
1059 | #define has_fs_excl() atomic_read(&current->fs_excl) | 1059 | #define has_fs_excl() atomic_read(&current->fs_excl) |
1060 | 1060 | ||
1061 | #define is_owner_or_cap(inode) \ | 1061 | #define is_owner_or_cap(inode) \ |
1062 | ((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER)) | 1062 | ((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER)) |
1063 | 1063 | ||
1064 | /* not quite ready to be deprecated, but... */ | 1064 | /* not quite ready to be deprecated, but... */ |
1065 | extern void lock_super(struct super_block *); | 1065 | extern void lock_super(struct super_block *); |
1066 | extern void unlock_super(struct super_block *); | 1066 | extern void unlock_super(struct super_block *); |
1067 | 1067 | ||
1068 | /* | 1068 | /* |
1069 | * VFS helper functions.. | 1069 | * VFS helper functions.. |
1070 | */ | 1070 | */ |
1071 | extern int vfs_permission(struct nameidata *, int); | 1071 | extern int vfs_permission(struct nameidata *, int); |
1072 | extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); | 1072 | extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); |
1073 | extern int vfs_mkdir(struct inode *, struct dentry *, int); | 1073 | extern int vfs_mkdir(struct inode *, struct dentry *, int); |
1074 | extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); | 1074 | extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); |
1075 | extern int vfs_symlink(struct inode *, struct dentry *, const char *, int); | 1075 | extern int vfs_symlink(struct inode *, struct dentry *, const char *, int); |
1076 | extern int vfs_link(struct dentry *, struct inode *, struct dentry *); | 1076 | extern int vfs_link(struct dentry *, struct inode *, struct dentry *); |
1077 | extern int vfs_rmdir(struct inode *, struct dentry *); | 1077 | extern int vfs_rmdir(struct inode *, struct dentry *); |
1078 | extern int vfs_unlink(struct inode *, struct dentry *); | 1078 | extern int vfs_unlink(struct inode *, struct dentry *); |
1079 | extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); | 1079 | extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); |
1080 | 1080 | ||
1081 | /* | 1081 | /* |
1082 | * VFS dentry helper functions. | 1082 | * VFS dentry helper functions. |
1083 | */ | 1083 | */ |
1084 | extern void dentry_unhash(struct dentry *dentry); | 1084 | extern void dentry_unhash(struct dentry *dentry); |
1085 | 1085 | ||
1086 | /* | 1086 | /* |
1087 | * VFS file helper functions. | 1087 | * VFS file helper functions. |
1088 | */ | 1088 | */ |
1089 | extern int file_permission(struct file *, int); | 1089 | extern int file_permission(struct file *, int); |
1090 | 1090 | ||
1091 | /* | 1091 | /* |
1092 | * File types | 1092 | * File types |
1093 | * | 1093 | * |
1094 | * NOTE! These match bits 12..15 of stat.st_mode | 1094 | * NOTE! These match bits 12..15 of stat.st_mode |
1095 | * (ie "(i_mode >> 12) & 15"). | 1095 | * (ie "(i_mode >> 12) & 15"). |
1096 | */ | 1096 | */ |
1097 | #define DT_UNKNOWN 0 | 1097 | #define DT_UNKNOWN 0 |
1098 | #define DT_FIFO 1 | 1098 | #define DT_FIFO 1 |
1099 | #define DT_CHR 2 | 1099 | #define DT_CHR 2 |
1100 | #define DT_DIR 4 | 1100 | #define DT_DIR 4 |
1101 | #define DT_BLK 6 | 1101 | #define DT_BLK 6 |
1102 | #define DT_REG 8 | 1102 | #define DT_REG 8 |
1103 | #define DT_LNK 10 | 1103 | #define DT_LNK 10 |
1104 | #define DT_SOCK 12 | 1104 | #define DT_SOCK 12 |
1105 | #define DT_WHT 14 | 1105 | #define DT_WHT 14 |
1106 | 1106 | ||
1107 | #define OSYNC_METADATA (1<<0) | 1107 | #define OSYNC_METADATA (1<<0) |
1108 | #define OSYNC_DATA (1<<1) | 1108 | #define OSYNC_DATA (1<<1) |
1109 | #define OSYNC_INODE (1<<2) | 1109 | #define OSYNC_INODE (1<<2) |
1110 | int generic_osync_inode(struct inode *, struct address_space *, int); | 1110 | int generic_osync_inode(struct inode *, struct address_space *, int); |
1111 | 1111 | ||
1112 | /* | 1112 | /* |
1113 | * This is the "filldir" function type, used by readdir() to let | 1113 | * This is the "filldir" function type, used by readdir() to let |
1114 | * the kernel specify what kind of dirent layout it wants to have. | 1114 | * the kernel specify what kind of dirent layout it wants to have. |
1115 | * This allows the kernel to read directories into kernel space or | 1115 | * This allows the kernel to read directories into kernel space or |
1116 | * to have different dirent layouts depending on the binary type. | 1116 | * to have different dirent layouts depending on the binary type. |
1117 | */ | 1117 | */ |
1118 | typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); | 1118 | typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); |
1119 | 1119 | ||
1120 | struct block_device_operations { | 1120 | struct block_device_operations { |
1121 | int (*open) (struct inode *, struct file *); | 1121 | int (*open) (struct inode *, struct file *); |
1122 | int (*release) (struct inode *, struct file *); | 1122 | int (*release) (struct inode *, struct file *); |
1123 | int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); | 1123 | int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); |
1124 | long (*unlocked_ioctl) (struct file *, unsigned, unsigned long); | 1124 | long (*unlocked_ioctl) (struct file *, unsigned, unsigned long); |
1125 | long (*compat_ioctl) (struct file *, unsigned, unsigned long); | 1125 | long (*compat_ioctl) (struct file *, unsigned, unsigned long); |
1126 | int (*direct_access) (struct block_device *, sector_t, unsigned long *); | 1126 | int (*direct_access) (struct block_device *, sector_t, unsigned long *); |
1127 | int (*media_changed) (struct gendisk *); | 1127 | int (*media_changed) (struct gendisk *); |
1128 | int (*revalidate_disk) (struct gendisk *); | 1128 | int (*revalidate_disk) (struct gendisk *); |
1129 | int (*getgeo)(struct block_device *, struct hd_geometry *); | 1129 | int (*getgeo)(struct block_device *, struct hd_geometry *); |
1130 | struct module *owner; | 1130 | struct module *owner; |
1131 | }; | 1131 | }; |
1132 | 1132 | ||
1133 | /* | 1133 | /* |
1134 | * "descriptor" for what we're up to with a read. | 1134 | * "descriptor" for what we're up to with a read. |
1135 | * This allows us to use the same read code yet | 1135 | * This allows us to use the same read code yet |
1136 | * have multiple different users of the data that | 1136 | * have multiple different users of the data that |
1137 | * we read from a file. | 1137 | * we read from a file. |
1138 | * | 1138 | * |
1139 | * The simplest case just copies the data to user | 1139 | * The simplest case just copies the data to user |
1140 | * mode. | 1140 | * mode. |
1141 | */ | 1141 | */ |
1142 | typedef struct { | 1142 | typedef struct { |
1143 | size_t written; | 1143 | size_t written; |
1144 | size_t count; | 1144 | size_t count; |
1145 | union { | 1145 | union { |
1146 | char __user * buf; | 1146 | char __user * buf; |
1147 | void *data; | 1147 | void *data; |
1148 | } arg; | 1148 | } arg; |
1149 | int error; | 1149 | int error; |
1150 | } read_descriptor_t; | 1150 | } read_descriptor_t; |
1151 | 1151 | ||
1152 | typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); | 1152 | typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); |
1153 | 1153 | ||
1154 | /* These macros are for out of kernel modules to test that | 1154 | /* These macros are for out of kernel modules to test that |
1155 | * the kernel supports the unlocked_ioctl and compat_ioctl | 1155 | * the kernel supports the unlocked_ioctl and compat_ioctl |
1156 | * fields in struct file_operations. */ | 1156 | * fields in struct file_operations. */ |
1157 | #define HAVE_COMPAT_IOCTL 1 | 1157 | #define HAVE_COMPAT_IOCTL 1 |
1158 | #define HAVE_UNLOCKED_IOCTL 1 | 1158 | #define HAVE_UNLOCKED_IOCTL 1 |
1159 | 1159 | ||
1160 | /* | 1160 | /* |
1161 | * NOTE: | 1161 | * NOTE: |
1162 | * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl | 1162 | * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl |
1163 | * can be called without the big kernel lock held in all filesystems. | 1163 | * can be called without the big kernel lock held in all filesystems. |
1164 | */ | 1164 | */ |
1165 | struct file_operations { | 1165 | struct file_operations { |
1166 | struct module *owner; | 1166 | struct module *owner; |
1167 | loff_t (*llseek) (struct file *, loff_t, int); | 1167 | loff_t (*llseek) (struct file *, loff_t, int); |
1168 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); | 1168 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); |
1169 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); | 1169 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); |
1170 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 1170 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
1171 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 1171 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
1172 | int (*readdir) (struct file *, void *, filldir_t); | 1172 | int (*readdir) (struct file *, void *, filldir_t); |
1173 | unsigned int (*poll) (struct file *, struct poll_table_struct *); | 1173 | unsigned int (*poll) (struct file *, struct poll_table_struct *); |
1174 | int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); | 1174 | int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); |
1175 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); | 1175 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); |
1176 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); | 1176 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); |
1177 | int (*mmap) (struct file *, struct vm_area_struct *); | 1177 | int (*mmap) (struct file *, struct vm_area_struct *); |
1178 | int (*open) (struct inode *, struct file *); | 1178 | int (*open) (struct inode *, struct file *); |
1179 | int (*flush) (struct file *, fl_owner_t id); | 1179 | int (*flush) (struct file *, fl_owner_t id); |
1180 | int (*release) (struct inode *, struct file *); | 1180 | int (*release) (struct inode *, struct file *); |
1181 | int (*fsync) (struct file *, struct dentry *, int datasync); | 1181 | int (*fsync) (struct file *, struct dentry *, int datasync); |
1182 | int (*aio_fsync) (struct kiocb *, int datasync); | 1182 | int (*aio_fsync) (struct kiocb *, int datasync); |
1183 | int (*fasync) (int, struct file *, int); | 1183 | int (*fasync) (int, struct file *, int); |
1184 | int (*lock) (struct file *, int, struct file_lock *); | 1184 | int (*lock) (struct file *, int, struct file_lock *); |
1185 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); | 1185 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); |
1186 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 1186 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
1187 | int (*check_flags)(int); | 1187 | int (*check_flags)(int); |
1188 | int (*dir_notify)(struct file *filp, unsigned long arg); | 1188 | int (*dir_notify)(struct file *filp, unsigned long arg); |
1189 | int (*flock) (struct file *, int, struct file_lock *); | 1189 | int (*flock) (struct file *, int, struct file_lock *); |
1190 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); | 1190 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); |
1191 | ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); | 1191 | ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); |
1192 | int (*setlease)(struct file *, long, struct file_lock **); | 1192 | int (*setlease)(struct file *, long, struct file_lock **); |
1193 | }; | 1193 | }; |
1194 | 1194 | ||
1195 | struct inode_operations { | 1195 | struct inode_operations { |
1196 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); | 1196 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); |
1197 | struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); | 1197 | struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); |
1198 | int (*link) (struct dentry *,struct inode *,struct dentry *); | 1198 | int (*link) (struct dentry *,struct inode *,struct dentry *); |
1199 | int (*unlink) (struct inode *,struct dentry *); | 1199 | int (*unlink) (struct inode *,struct dentry *); |
1200 | int (*symlink) (struct inode *,struct dentry *,const char *); | 1200 | int (*symlink) (struct inode *,struct dentry *,const char *); |
1201 | int (*mkdir) (struct inode *,struct dentry *,int); | 1201 | int (*mkdir) (struct inode *,struct dentry *,int); |
1202 | int (*rmdir) (struct inode *,struct dentry *); | 1202 | int (*rmdir) (struct inode *,struct dentry *); |
1203 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); | 1203 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); |
1204 | int (*rename) (struct inode *, struct dentry *, | 1204 | int (*rename) (struct inode *, struct dentry *, |
1205 | struct inode *, struct dentry *); | 1205 | struct inode *, struct dentry *); |
1206 | int (*readlink) (struct dentry *, char __user *,int); | 1206 | int (*readlink) (struct dentry *, char __user *,int); |
1207 | void * (*follow_link) (struct dentry *, struct nameidata *); | 1207 | void * (*follow_link) (struct dentry *, struct nameidata *); |
1208 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 1208 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
1209 | void (*truncate) (struct inode *); | 1209 | void (*truncate) (struct inode *); |
1210 | int (*permission) (struct inode *, int, struct nameidata *); | 1210 | int (*permission) (struct inode *, int, struct nameidata *); |
1211 | int (*setattr) (struct dentry *, struct iattr *); | 1211 | int (*setattr) (struct dentry *, struct iattr *); |
1212 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); | 1212 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); |
1213 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 1213 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
1214 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); | 1214 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); |
1215 | ssize_t (*listxattr) (struct dentry *, char *, size_t); | 1215 | ssize_t (*listxattr) (struct dentry *, char *, size_t); |
1216 | int (*removexattr) (struct dentry *, const char *); | 1216 | int (*removexattr) (struct dentry *, const char *); |
1217 | void (*truncate_range)(struct inode *, loff_t, loff_t); | 1217 | void (*truncate_range)(struct inode *, loff_t, loff_t); |
1218 | long (*fallocate)(struct inode *inode, int mode, loff_t offset, | 1218 | long (*fallocate)(struct inode *inode, int mode, loff_t offset, |
1219 | loff_t len); | 1219 | loff_t len); |
1220 | }; | 1220 | }; |
1221 | 1221 | ||
1222 | struct seq_file; | 1222 | struct seq_file; |
1223 | 1223 | ||
1224 | ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, | 1224 | ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, |
1225 | unsigned long nr_segs, unsigned long fast_segs, | 1225 | unsigned long nr_segs, unsigned long fast_segs, |
1226 | struct iovec *fast_pointer, | 1226 | struct iovec *fast_pointer, |
1227 | struct iovec **ret_pointer); | 1227 | struct iovec **ret_pointer); |
1228 | 1228 | ||
1229 | extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); | 1229 | extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); |
1230 | extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); | 1230 | extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); |
1231 | extern ssize_t vfs_readv(struct file *, const struct iovec __user *, | 1231 | extern ssize_t vfs_readv(struct file *, const struct iovec __user *, |
1232 | unsigned long, loff_t *); | 1232 | unsigned long, loff_t *); |
1233 | extern ssize_t vfs_writev(struct file *, const struct iovec __user *, | 1233 | extern ssize_t vfs_writev(struct file *, const struct iovec __user *, |
1234 | unsigned long, loff_t *); | 1234 | unsigned long, loff_t *); |
1235 | 1235 | ||
1236 | /* | 1236 | /* |
1237 | * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called | 1237 | * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called |
1238 | * without the big kernel lock held in all filesystems. | 1238 | * without the big kernel lock held in all filesystems. |
1239 | */ | 1239 | */ |
1240 | struct super_operations { | 1240 | struct super_operations { |
1241 | struct inode *(*alloc_inode)(struct super_block *sb); | 1241 | struct inode *(*alloc_inode)(struct super_block *sb); |
1242 | void (*destroy_inode)(struct inode *); | 1242 | void (*destroy_inode)(struct inode *); |
1243 | 1243 | ||
1244 | void (*read_inode) (struct inode *); | ||
1245 | |||
1246 | void (*dirty_inode) (struct inode *); | 1244 | void (*dirty_inode) (struct inode *); |
1247 | int (*write_inode) (struct inode *, int); | 1245 | int (*write_inode) (struct inode *, int); |
1248 | void (*put_inode) (struct inode *); | 1246 | void (*put_inode) (struct inode *); |
1249 | void (*drop_inode) (struct inode *); | 1247 | void (*drop_inode) (struct inode *); |
1250 | void (*delete_inode) (struct inode *); | 1248 | void (*delete_inode) (struct inode *); |
1251 | void (*put_super) (struct super_block *); | 1249 | void (*put_super) (struct super_block *); |
1252 | void (*write_super) (struct super_block *); | 1250 | void (*write_super) (struct super_block *); |
1253 | int (*sync_fs)(struct super_block *sb, int wait); | 1251 | int (*sync_fs)(struct super_block *sb, int wait); |
1254 | void (*write_super_lockfs) (struct super_block *); | 1252 | void (*write_super_lockfs) (struct super_block *); |
1255 | void (*unlockfs) (struct super_block *); | 1253 | void (*unlockfs) (struct super_block *); |
1256 | int (*statfs) (struct dentry *, struct kstatfs *); | 1254 | int (*statfs) (struct dentry *, struct kstatfs *); |
1257 | int (*remount_fs) (struct super_block *, int *, char *); | 1255 | int (*remount_fs) (struct super_block *, int *, char *); |
1258 | void (*clear_inode) (struct inode *); | 1256 | void (*clear_inode) (struct inode *); |
1259 | void (*umount_begin) (struct vfsmount *, int); | 1257 | void (*umount_begin) (struct vfsmount *, int); |
1260 | 1258 | ||
1261 | int (*show_options)(struct seq_file *, struct vfsmount *); | 1259 | int (*show_options)(struct seq_file *, struct vfsmount *); |
1262 | int (*show_stats)(struct seq_file *, struct vfsmount *); | 1260 | int (*show_stats)(struct seq_file *, struct vfsmount *); |
1263 | #ifdef CONFIG_QUOTA | 1261 | #ifdef CONFIG_QUOTA |
1264 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); | 1262 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); |
1265 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); | 1263 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); |
1266 | #endif | 1264 | #endif |
1267 | }; | 1265 | }; |
1268 | 1266 | ||
1269 | /* | 1267 | /* |
1270 | * Inode state bits. Protected by inode_lock. | 1268 | * Inode state bits. Protected by inode_lock. |
1271 | * | 1269 | * |
1272 | * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, | 1270 | * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, |
1273 | * I_DIRTY_DATASYNC and I_DIRTY_PAGES. | 1271 | * I_DIRTY_DATASYNC and I_DIRTY_PAGES. |
1274 | * | 1272 | * |
1275 | * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, | 1273 | * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, |
1276 | * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at | 1274 | * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at |
1277 | * various stages of removing an inode. | 1275 | * various stages of removing an inode. |
1278 | * | 1276 | * |
1279 | * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. | 1277 | * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. |
1280 | * | 1278 | * |
1281 | * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on | 1279 | * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on |
1282 | * fdatasync(). i_atime is the usual cause. | 1280 | * fdatasync(). i_atime is the usual cause. |
1283 | * I_DIRTY_DATASYNC Inode is dirty and must be written on fdatasync(), f.e. | 1281 | * I_DIRTY_DATASYNC Inode is dirty and must be written on fdatasync(), f.e. |
1284 | * because i_size changed. | 1282 | * because i_size changed. |
1285 | * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. | 1283 | * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. |
1286 | * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both | 1284 | * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both |
1287 | * are cleared by unlock_new_inode(), called from iget(). | 1285 | * are cleared by unlock_new_inode(), called from iget(). |
1288 | * I_WILL_FREE Must be set when calling write_inode_now() if i_count | 1286 | * I_WILL_FREE Must be set when calling write_inode_now() if i_count |
1289 | * is zero. I_FREEING must be set when I_WILL_FREE is | 1287 | * is zero. I_FREEING must be set when I_WILL_FREE is |
1290 | * cleared. | 1288 | * cleared. |
1291 | * I_FREEING Set when inode is about to be freed but still has dirty | 1289 | * I_FREEING Set when inode is about to be freed but still has dirty |
1292 | * pages or buffers attached or the inode itself is still | 1290 | * pages or buffers attached or the inode itself is still |
1293 | * dirty. | 1291 | * dirty. |
1294 | * I_CLEAR Set by clear_inode(). In this state the inode is clean | 1292 | * I_CLEAR Set by clear_inode(). In this state the inode is clean |
1295 | * and can be destroyed. | 1293 | * and can be destroyed. |
1296 | * | 1294 | * |
1297 | * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are | 1295 | * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are |
1298 | * prohibited for many purposes. iget() must wait for | 1296 | * prohibited for many purposes. iget() must wait for |
1299 | * the inode to be completely released, then create it | 1297 | * the inode to be completely released, then create it |
1300 | * anew. Other functions will just ignore such inodes, | 1298 | * anew. Other functions will just ignore such inodes, |
1301 | * if appropriate. I_LOCK is used for waiting. | 1299 | * if appropriate. I_LOCK is used for waiting. |
1302 | * | 1300 | * |
1303 | * I_LOCK Serves as both a mutex and completion notification. | 1301 | * I_LOCK Serves as both a mutex and completion notification. |
1304 | * New inodes set I_LOCK. If two processes both create | 1302 | * New inodes set I_LOCK. If two processes both create |
1305 | * the same inode, one of them will release its inode and | 1303 | * the same inode, one of them will release its inode and |
1306 | * wait for I_LOCK to be released before returning. | 1304 | * wait for I_LOCK to be released before returning. |
1307 | * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can | 1305 | * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can |
1308 | * also cause waiting on I_LOCK, without I_LOCK actually | 1306 | * also cause waiting on I_LOCK, without I_LOCK actually |
1309 | * being set. find_inode() uses this to prevent returning | 1307 | * being set. find_inode() uses this to prevent returning |
1310 | * nearly-dead inodes. | 1308 | * nearly-dead inodes. |
1311 | * I_SYNC Similar to I_LOCK, but limited in scope to writeback | 1309 | * I_SYNC Similar to I_LOCK, but limited in scope to writeback |
1312 | * of inode dirty data. Having a separate lock for this | 1310 | * of inode dirty data. Having a separate lock for this |
1313 | * purpose reduces latency and prevents some filesystem- | 1311 | * purpose reduces latency and prevents some filesystem- |
1314 | * specific deadlocks. | 1312 | * specific deadlocks. |
1315 | * | 1313 | * |
1316 | * Q: What is the difference between I_WILL_FREE and I_FREEING? | 1314 | * Q: What is the difference between I_WILL_FREE and I_FREEING? |
1317 | * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on | 1315 | * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on |
1318 | * I_CLEAR? If not, why? | 1316 | * I_CLEAR? If not, why? |
1319 | */ | 1317 | */ |
1320 | #define I_DIRTY_SYNC 1 | 1318 | #define I_DIRTY_SYNC 1 |
1321 | #define I_DIRTY_DATASYNC 2 | 1319 | #define I_DIRTY_DATASYNC 2 |
1322 | #define I_DIRTY_PAGES 4 | 1320 | #define I_DIRTY_PAGES 4 |
1323 | #define I_NEW 8 | 1321 | #define I_NEW 8 |
1324 | #define I_WILL_FREE 16 | 1322 | #define I_WILL_FREE 16 |
1325 | #define I_FREEING 32 | 1323 | #define I_FREEING 32 |
1326 | #define I_CLEAR 64 | 1324 | #define I_CLEAR 64 |
1327 | #define __I_LOCK 7 | 1325 | #define __I_LOCK 7 |
1328 | #define I_LOCK (1 << __I_LOCK) | 1326 | #define I_LOCK (1 << __I_LOCK) |
1329 | #define __I_SYNC 8 | 1327 | #define __I_SYNC 8 |
1330 | #define I_SYNC (1 << __I_SYNC) | 1328 | #define I_SYNC (1 << __I_SYNC) |
1331 | 1329 | ||
1332 | #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) | 1330 | #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) |
1333 | 1331 | ||
1334 | extern void __mark_inode_dirty(struct inode *, int); | 1332 | extern void __mark_inode_dirty(struct inode *, int); |
1335 | static inline void mark_inode_dirty(struct inode *inode) | 1333 | static inline void mark_inode_dirty(struct inode *inode) |
1336 | { | 1334 | { |
1337 | __mark_inode_dirty(inode, I_DIRTY); | 1335 | __mark_inode_dirty(inode, I_DIRTY); |
1338 | } | 1336 | } |
1339 | 1337 | ||
1340 | static inline void mark_inode_dirty_sync(struct inode *inode) | 1338 | static inline void mark_inode_dirty_sync(struct inode *inode) |
1341 | { | 1339 | { |
1342 | __mark_inode_dirty(inode, I_DIRTY_SYNC); | 1340 | __mark_inode_dirty(inode, I_DIRTY_SYNC); |
1343 | } | 1341 | } |
1344 | 1342 | ||
1345 | /** | 1343 | /** |
1346 | * inc_nlink - directly increment an inode's link count | 1344 | * inc_nlink - directly increment an inode's link count |
1347 | * @inode: inode | 1345 | * @inode: inode |
1348 | * | 1346 | * |
1349 | * This is a low-level filesystem helper to replace any | 1347 | * This is a low-level filesystem helper to replace any |
1350 | * direct filesystem manipulation of i_nlink. Currently, | 1348 | * direct filesystem manipulation of i_nlink. Currently, |
1351 | * it is only here for parity with dec_nlink(). | 1349 | * it is only here for parity with dec_nlink(). |
1352 | */ | 1350 | */ |
1353 | static inline void inc_nlink(struct inode *inode) | 1351 | static inline void inc_nlink(struct inode *inode) |
1354 | { | 1352 | { |
1355 | inode->i_nlink++; | 1353 | inode->i_nlink++; |
1356 | } | 1354 | } |
1357 | 1355 | ||
1358 | static inline void inode_inc_link_count(struct inode *inode) | 1356 | static inline void inode_inc_link_count(struct inode *inode) |
1359 | { | 1357 | { |
1360 | inc_nlink(inode); | 1358 | inc_nlink(inode); |
1361 | mark_inode_dirty(inode); | 1359 | mark_inode_dirty(inode); |
1362 | } | 1360 | } |
1363 | 1361 | ||
1364 | /** | 1362 | /** |
1365 | * drop_nlink - directly drop an inode's link count | 1363 | * drop_nlink - directly drop an inode's link count |
1366 | * @inode: inode | 1364 | * @inode: inode |
1367 | * | 1365 | * |
1368 | * This is a low-level filesystem helper to replace any | 1366 | * This is a low-level filesystem helper to replace any |
1369 | * direct filesystem manipulation of i_nlink. In cases | 1367 | * direct filesystem manipulation of i_nlink. In cases |
1370 | * where we are attempting to track writes to the | 1368 | * where we are attempting to track writes to the |
1371 | * filesystem, a decrement to zero means an imminent | 1369 | * filesystem, a decrement to zero means an imminent |
1372 | * write when the file is truncated and actually unlinked | 1370 | * write when the file is truncated and actually unlinked |
1373 | * on the filesystem. | 1371 | * on the filesystem. |
1374 | */ | 1372 | */ |
1375 | static inline void drop_nlink(struct inode *inode) | 1373 | static inline void drop_nlink(struct inode *inode) |
1376 | { | 1374 | { |
1377 | inode->i_nlink--; | 1375 | inode->i_nlink--; |
1378 | } | 1376 | } |
1379 | 1377 | ||
1380 | /** | 1378 | /** |
1381 | * clear_nlink - directly zero an inode's link count | 1379 | * clear_nlink - directly zero an inode's link count |
1382 | * @inode: inode | 1380 | * @inode: inode |
1383 | * | 1381 | * |
1384 | * This is a low-level filesystem helper to replace any | 1382 | * This is a low-level filesystem helper to replace any |
1385 | * direct filesystem manipulation of i_nlink. See | 1383 | * direct filesystem manipulation of i_nlink. See |
1386 | * drop_nlink() for why we care about i_nlink hitting zero. | 1384 | * drop_nlink() for why we care about i_nlink hitting zero. |
1387 | */ | 1385 | */ |
1388 | static inline void clear_nlink(struct inode *inode) | 1386 | static inline void clear_nlink(struct inode *inode) |
1389 | { | 1387 | { |
1390 | inode->i_nlink = 0; | 1388 | inode->i_nlink = 0; |
1391 | } | 1389 | } |
1392 | 1390 | ||
1393 | static inline void inode_dec_link_count(struct inode *inode) | 1391 | static inline void inode_dec_link_count(struct inode *inode) |
1394 | { | 1392 | { |
1395 | drop_nlink(inode); | 1393 | drop_nlink(inode); |
1396 | mark_inode_dirty(inode); | 1394 | mark_inode_dirty(inode); |
1397 | } | 1395 | } |
1398 | 1396 | ||
1399 | /** | 1397 | /** |
1400 | * inode_inc_iversion - increments i_version | 1398 | * inode_inc_iversion - increments i_version |
1401 | * @inode: inode that need to be updated | 1399 | * @inode: inode that need to be updated |
1402 | * | 1400 | * |
1403 | * Every time the inode is modified, the i_version field will be incremented. | 1401 | * Every time the inode is modified, the i_version field will be incremented. |
1404 | * The filesystem has to be mounted with i_version flag | 1402 | * The filesystem has to be mounted with i_version flag |
1405 | */ | 1403 | */ |
1406 | 1404 | ||
1407 | static inline void inode_inc_iversion(struct inode *inode) | 1405 | static inline void inode_inc_iversion(struct inode *inode) |
1408 | { | 1406 | { |
1409 | spin_lock(&inode->i_lock); | 1407 | spin_lock(&inode->i_lock); |
1410 | inode->i_version++; | 1408 | inode->i_version++; |
1411 | spin_unlock(&inode->i_lock); | 1409 | spin_unlock(&inode->i_lock); |
1412 | } | 1410 | } |
1413 | 1411 | ||
1414 | extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); | 1412 | extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); |
1415 | static inline void file_accessed(struct file *file) | 1413 | static inline void file_accessed(struct file *file) |
1416 | { | 1414 | { |
1417 | if (!(file->f_flags & O_NOATIME)) | 1415 | if (!(file->f_flags & O_NOATIME)) |
1418 | touch_atime(file->f_path.mnt, file->f_path.dentry); | 1416 | touch_atime(file->f_path.mnt, file->f_path.dentry); |
1419 | } | 1417 | } |
1420 | 1418 | ||
1421 | int sync_inode(struct inode *inode, struct writeback_control *wbc); | 1419 | int sync_inode(struct inode *inode, struct writeback_control *wbc); |
1422 | 1420 | ||
1423 | struct file_system_type { | 1421 | struct file_system_type { |
1424 | const char *name; | 1422 | const char *name; |
1425 | int fs_flags; | 1423 | int fs_flags; |
1426 | int (*get_sb) (struct file_system_type *, int, | 1424 | int (*get_sb) (struct file_system_type *, int, |
1427 | const char *, void *, struct vfsmount *); | 1425 | const char *, void *, struct vfsmount *); |
1428 | void (*kill_sb) (struct super_block *); | 1426 | void (*kill_sb) (struct super_block *); |
1429 | struct module *owner; | 1427 | struct module *owner; |
1430 | struct file_system_type * next; | 1428 | struct file_system_type * next; |
1431 | struct list_head fs_supers; | 1429 | struct list_head fs_supers; |
1432 | 1430 | ||
1433 | struct lock_class_key s_lock_key; | 1431 | struct lock_class_key s_lock_key; |
1434 | struct lock_class_key s_umount_key; | 1432 | struct lock_class_key s_umount_key; |
1435 | 1433 | ||
1436 | struct lock_class_key i_lock_key; | 1434 | struct lock_class_key i_lock_key; |
1437 | struct lock_class_key i_mutex_key; | 1435 | struct lock_class_key i_mutex_key; |
1438 | struct lock_class_key i_mutex_dir_key; | 1436 | struct lock_class_key i_mutex_dir_key; |
1439 | struct lock_class_key i_alloc_sem_key; | 1437 | struct lock_class_key i_alloc_sem_key; |
1440 | }; | 1438 | }; |
1441 | 1439 | ||
1442 | extern int get_sb_bdev(struct file_system_type *fs_type, | 1440 | extern int get_sb_bdev(struct file_system_type *fs_type, |
1443 | int flags, const char *dev_name, void *data, | 1441 | int flags, const char *dev_name, void *data, |
1444 | int (*fill_super)(struct super_block *, void *, int), | 1442 | int (*fill_super)(struct super_block *, void *, int), |
1445 | struct vfsmount *mnt); | 1443 | struct vfsmount *mnt); |
1446 | extern int get_sb_single(struct file_system_type *fs_type, | 1444 | extern int get_sb_single(struct file_system_type *fs_type, |
1447 | int flags, void *data, | 1445 | int flags, void *data, |
1448 | int (*fill_super)(struct super_block *, void *, int), | 1446 | int (*fill_super)(struct super_block *, void *, int), |
1449 | struct vfsmount *mnt); | 1447 | struct vfsmount *mnt); |
1450 | extern int get_sb_nodev(struct file_system_type *fs_type, | 1448 | extern int get_sb_nodev(struct file_system_type *fs_type, |
1451 | int flags, void *data, | 1449 | int flags, void *data, |
1452 | int (*fill_super)(struct super_block *, void *, int), | 1450 | int (*fill_super)(struct super_block *, void *, int), |
1453 | struct vfsmount *mnt); | 1451 | struct vfsmount *mnt); |
1454 | void generic_shutdown_super(struct super_block *sb); | 1452 | void generic_shutdown_super(struct super_block *sb); |
1455 | void kill_block_super(struct super_block *sb); | 1453 | void kill_block_super(struct super_block *sb); |
1456 | void kill_anon_super(struct super_block *sb); | 1454 | void kill_anon_super(struct super_block *sb); |
1457 | void kill_litter_super(struct super_block *sb); | 1455 | void kill_litter_super(struct super_block *sb); |
1458 | void deactivate_super(struct super_block *sb); | 1456 | void deactivate_super(struct super_block *sb); |
1459 | int set_anon_super(struct super_block *s, void *data); | 1457 | int set_anon_super(struct super_block *s, void *data); |
1460 | struct super_block *sget(struct file_system_type *type, | 1458 | struct super_block *sget(struct file_system_type *type, |
1461 | int (*test)(struct super_block *,void *), | 1459 | int (*test)(struct super_block *,void *), |
1462 | int (*set)(struct super_block *,void *), | 1460 | int (*set)(struct super_block *,void *), |
1463 | void *data); | 1461 | void *data); |
1464 | extern int get_sb_pseudo(struct file_system_type *, char *, | 1462 | extern int get_sb_pseudo(struct file_system_type *, char *, |
1465 | const struct super_operations *ops, unsigned long, | 1463 | const struct super_operations *ops, unsigned long, |
1466 | struct vfsmount *mnt); | 1464 | struct vfsmount *mnt); |
1467 | extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); | 1465 | extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); |
1468 | int __put_super(struct super_block *sb); | 1466 | int __put_super(struct super_block *sb); |
1469 | int __put_super_and_need_restart(struct super_block *sb); | 1467 | int __put_super_and_need_restart(struct super_block *sb); |
1470 | void unnamed_dev_init(void); | 1468 | void unnamed_dev_init(void); |
1471 | 1469 | ||
1472 | /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ | 1470 | /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ |
1473 | #define fops_get(fops) \ | 1471 | #define fops_get(fops) \ |
1474 | (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) | 1472 | (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) |
1475 | #define fops_put(fops) \ | 1473 | #define fops_put(fops) \ |
1476 | do { if (fops) module_put((fops)->owner); } while(0) | 1474 | do { if (fops) module_put((fops)->owner); } while(0) |
1477 | 1475 | ||
1478 | extern int register_filesystem(struct file_system_type *); | 1476 | extern int register_filesystem(struct file_system_type *); |
1479 | extern int unregister_filesystem(struct file_system_type *); | 1477 | extern int unregister_filesystem(struct file_system_type *); |
1480 | extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); | 1478 | extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); |
1481 | #define kern_mount(type) kern_mount_data(type, NULL) | 1479 | #define kern_mount(type) kern_mount_data(type, NULL) |
1482 | extern int may_umount_tree(struct vfsmount *); | 1480 | extern int may_umount_tree(struct vfsmount *); |
1483 | extern int may_umount(struct vfsmount *); | 1481 | extern int may_umount(struct vfsmount *); |
1484 | extern void umount_tree(struct vfsmount *, int, struct list_head *); | 1482 | extern void umount_tree(struct vfsmount *, int, struct list_head *); |
1485 | extern void release_mounts(struct list_head *); | 1483 | extern void release_mounts(struct list_head *); |
1486 | extern long do_mount(char *, char *, char *, unsigned long, void *); | 1484 | extern long do_mount(char *, char *, char *, unsigned long, void *); |
1487 | extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); | 1485 | extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); |
1488 | extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, | 1486 | extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, |
1489 | struct vfsmount *); | 1487 | struct vfsmount *); |
1490 | extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *); | 1488 | extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *); |
1491 | extern void drop_collected_mounts(struct vfsmount *); | 1489 | extern void drop_collected_mounts(struct vfsmount *); |
1492 | 1490 | ||
1493 | extern int vfs_statfs(struct dentry *, struct kstatfs *); | 1491 | extern int vfs_statfs(struct dentry *, struct kstatfs *); |
1494 | 1492 | ||
1495 | /* /sys/fs */ | 1493 | /* /sys/fs */ |
1496 | extern struct kobject *fs_kobj; | 1494 | extern struct kobject *fs_kobj; |
1497 | 1495 | ||
1498 | #define FLOCK_VERIFY_READ 1 | 1496 | #define FLOCK_VERIFY_READ 1 |
1499 | #define FLOCK_VERIFY_WRITE 2 | 1497 | #define FLOCK_VERIFY_WRITE 2 |
1500 | 1498 | ||
1501 | extern int locks_mandatory_locked(struct inode *); | 1499 | extern int locks_mandatory_locked(struct inode *); |
1502 | extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); | 1500 | extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); |
1503 | 1501 | ||
1504 | /* | 1502 | /* |
1505 | * Candidates for mandatory locking have the setgid bit set | 1503 | * Candidates for mandatory locking have the setgid bit set |
1506 | * but no group execute bit - an otherwise meaningless combination. | 1504 | * but no group execute bit - an otherwise meaningless combination. |
1507 | */ | 1505 | */ |
1508 | 1506 | ||
1509 | static inline int __mandatory_lock(struct inode *ino) | 1507 | static inline int __mandatory_lock(struct inode *ino) |
1510 | { | 1508 | { |
1511 | return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; | 1509 | return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; |
1512 | } | 1510 | } |
1513 | 1511 | ||
1514 | /* | 1512 | /* |
1515 | * ... and these candidates should be on MS_MANDLOCK mounted fs, | 1513 | * ... and these candidates should be on MS_MANDLOCK mounted fs, |
1516 | * otherwise these will be advisory locks | 1514 | * otherwise these will be advisory locks |
1517 | */ | 1515 | */ |
1518 | 1516 | ||
1519 | static inline int mandatory_lock(struct inode *ino) | 1517 | static inline int mandatory_lock(struct inode *ino) |
1520 | { | 1518 | { |
1521 | return IS_MANDLOCK(ino) && __mandatory_lock(ino); | 1519 | return IS_MANDLOCK(ino) && __mandatory_lock(ino); |
1522 | } | 1520 | } |
1523 | 1521 | ||
1524 | static inline int locks_verify_locked(struct inode *inode) | 1522 | static inline int locks_verify_locked(struct inode *inode) |
1525 | { | 1523 | { |
1526 | if (mandatory_lock(inode)) | 1524 | if (mandatory_lock(inode)) |
1527 | return locks_mandatory_locked(inode); | 1525 | return locks_mandatory_locked(inode); |
1528 | return 0; | 1526 | return 0; |
1529 | } | 1527 | } |
1530 | 1528 | ||
1531 | extern int rw_verify_area(int, struct file *, loff_t *, size_t); | 1529 | extern int rw_verify_area(int, struct file *, loff_t *, size_t); |
1532 | 1530 | ||
1533 | static inline int locks_verify_truncate(struct inode *inode, | 1531 | static inline int locks_verify_truncate(struct inode *inode, |
1534 | struct file *filp, | 1532 | struct file *filp, |
1535 | loff_t size) | 1533 | loff_t size) |
1536 | { | 1534 | { |
1537 | if (inode->i_flock && mandatory_lock(inode)) | 1535 | if (inode->i_flock && mandatory_lock(inode)) |
1538 | return locks_mandatory_area( | 1536 | return locks_mandatory_area( |
1539 | FLOCK_VERIFY_WRITE, inode, filp, | 1537 | FLOCK_VERIFY_WRITE, inode, filp, |
1540 | size < inode->i_size ? size : inode->i_size, | 1538 | size < inode->i_size ? size : inode->i_size, |
1541 | (size < inode->i_size ? inode->i_size - size | 1539 | (size < inode->i_size ? inode->i_size - size |
1542 | : size - inode->i_size) | 1540 | : size - inode->i_size) |
1543 | ); | 1541 | ); |
1544 | return 0; | 1542 | return 0; |
1545 | } | 1543 | } |
1546 | 1544 | ||
1547 | static inline int break_lease(struct inode *inode, unsigned int mode) | 1545 | static inline int break_lease(struct inode *inode, unsigned int mode) |
1548 | { | 1546 | { |
1549 | if (inode->i_flock) | 1547 | if (inode->i_flock) |
1550 | return __break_lease(inode, mode); | 1548 | return __break_lease(inode, mode); |
1551 | return 0; | 1549 | return 0; |
1552 | } | 1550 | } |
1553 | 1551 | ||
1554 | /* fs/open.c */ | 1552 | /* fs/open.c */ |
1555 | 1553 | ||
1556 | extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, | 1554 | extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, |
1557 | struct file *filp); | 1555 | struct file *filp); |
1558 | extern long do_sys_open(int dfd, const char __user *filename, int flags, | 1556 | extern long do_sys_open(int dfd, const char __user *filename, int flags, |
1559 | int mode); | 1557 | int mode); |
1560 | extern struct file *filp_open(const char *, int, int); | 1558 | extern struct file *filp_open(const char *, int, int); |
1561 | extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); | 1559 | extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); |
1562 | extern int filp_close(struct file *, fl_owner_t id); | 1560 | extern int filp_close(struct file *, fl_owner_t id); |
1563 | extern char * getname(const char __user *); | 1561 | extern char * getname(const char __user *); |
1564 | 1562 | ||
1565 | /* fs/dcache.c */ | 1563 | /* fs/dcache.c */ |
1566 | extern void __init vfs_caches_init_early(void); | 1564 | extern void __init vfs_caches_init_early(void); |
1567 | extern void __init vfs_caches_init(unsigned long); | 1565 | extern void __init vfs_caches_init(unsigned long); |
1568 | 1566 | ||
1569 | extern struct kmem_cache *names_cachep; | 1567 | extern struct kmem_cache *names_cachep; |
1570 | 1568 | ||
1571 | #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) | 1569 | #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) |
1572 | #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) | 1570 | #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) |
1573 | #ifndef CONFIG_AUDITSYSCALL | 1571 | #ifndef CONFIG_AUDITSYSCALL |
1574 | #define putname(name) __putname(name) | 1572 | #define putname(name) __putname(name) |
1575 | #else | 1573 | #else |
1576 | extern void putname(const char *name); | 1574 | extern void putname(const char *name); |
1577 | #endif | 1575 | #endif |
1578 | 1576 | ||
1579 | #ifdef CONFIG_BLOCK | 1577 | #ifdef CONFIG_BLOCK |
1580 | extern int register_blkdev(unsigned int, const char *); | 1578 | extern int register_blkdev(unsigned int, const char *); |
1581 | extern void unregister_blkdev(unsigned int, const char *); | 1579 | extern void unregister_blkdev(unsigned int, const char *); |
1582 | extern struct block_device *bdget(dev_t); | 1580 | extern struct block_device *bdget(dev_t); |
1583 | extern void bd_set_size(struct block_device *, loff_t size); | 1581 | extern void bd_set_size(struct block_device *, loff_t size); |
1584 | extern void bd_forget(struct inode *inode); | 1582 | extern void bd_forget(struct inode *inode); |
1585 | extern void bdput(struct block_device *); | 1583 | extern void bdput(struct block_device *); |
1586 | extern struct block_device *open_by_devnum(dev_t, unsigned); | 1584 | extern struct block_device *open_by_devnum(dev_t, unsigned); |
1587 | extern const struct address_space_operations def_blk_aops; | 1585 | extern const struct address_space_operations def_blk_aops; |
1588 | #else | 1586 | #else |
1589 | static inline void bd_forget(struct inode *inode) {} | 1587 | static inline void bd_forget(struct inode *inode) {} |
1590 | #endif | 1588 | #endif |
1591 | extern const struct file_operations def_blk_fops; | 1589 | extern const struct file_operations def_blk_fops; |
1592 | extern const struct file_operations def_chr_fops; | 1590 | extern const struct file_operations def_chr_fops; |
1593 | extern const struct file_operations bad_sock_fops; | 1591 | extern const struct file_operations bad_sock_fops; |
1594 | extern const struct file_operations def_fifo_fops; | 1592 | extern const struct file_operations def_fifo_fops; |
1595 | #ifdef CONFIG_BLOCK | 1593 | #ifdef CONFIG_BLOCK |
1596 | extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); | 1594 | extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); |
1597 | extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long); | 1595 | extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long); |
1598 | extern int blkdev_driver_ioctl(struct inode *inode, struct file *file, | 1596 | extern int blkdev_driver_ioctl(struct inode *inode, struct file *file, |
1599 | struct gendisk *disk, unsigned cmd, | 1597 | struct gendisk *disk, unsigned cmd, |
1600 | unsigned long arg); | 1598 | unsigned long arg); |
1601 | extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); | 1599 | extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); |
1602 | extern int blkdev_get(struct block_device *, mode_t, unsigned); | 1600 | extern int blkdev_get(struct block_device *, mode_t, unsigned); |
1603 | extern int blkdev_put(struct block_device *); | 1601 | extern int blkdev_put(struct block_device *); |
1604 | extern int bd_claim(struct block_device *, void *); | 1602 | extern int bd_claim(struct block_device *, void *); |
1605 | extern void bd_release(struct block_device *); | 1603 | extern void bd_release(struct block_device *); |
1606 | #ifdef CONFIG_SYSFS | 1604 | #ifdef CONFIG_SYSFS |
1607 | extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); | 1605 | extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); |
1608 | extern void bd_release_from_disk(struct block_device *, struct gendisk *); | 1606 | extern void bd_release_from_disk(struct block_device *, struct gendisk *); |
1609 | #else | 1607 | #else |
1610 | #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) | 1608 | #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) |
1611 | #define bd_release_from_disk(bdev, disk) bd_release(bdev) | 1609 | #define bd_release_from_disk(bdev, disk) bd_release(bdev) |
1612 | #endif | 1610 | #endif |
1613 | #endif | 1611 | #endif |
1614 | 1612 | ||
1615 | /* fs/char_dev.c */ | 1613 | /* fs/char_dev.c */ |
1616 | #define CHRDEV_MAJOR_HASH_SIZE 255 | 1614 | #define CHRDEV_MAJOR_HASH_SIZE 255 |
1617 | extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); | 1615 | extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); |
1618 | extern int register_chrdev_region(dev_t, unsigned, const char *); | 1616 | extern int register_chrdev_region(dev_t, unsigned, const char *); |
1619 | extern int register_chrdev(unsigned int, const char *, | 1617 | extern int register_chrdev(unsigned int, const char *, |
1620 | const struct file_operations *); | 1618 | const struct file_operations *); |
1621 | extern void unregister_chrdev(unsigned int, const char *); | 1619 | extern void unregister_chrdev(unsigned int, const char *); |
1622 | extern void unregister_chrdev_region(dev_t, unsigned); | 1620 | extern void unregister_chrdev_region(dev_t, unsigned); |
1623 | extern int chrdev_open(struct inode *, struct file *); | 1621 | extern int chrdev_open(struct inode *, struct file *); |
1624 | extern void chrdev_show(struct seq_file *,off_t); | 1622 | extern void chrdev_show(struct seq_file *,off_t); |
1625 | 1623 | ||
1626 | /* fs/block_dev.c */ | 1624 | /* fs/block_dev.c */ |
1627 | #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ | 1625 | #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ |
1628 | 1626 | ||
1629 | #ifdef CONFIG_BLOCK | 1627 | #ifdef CONFIG_BLOCK |
1630 | #define BLKDEV_MAJOR_HASH_SIZE 255 | 1628 | #define BLKDEV_MAJOR_HASH_SIZE 255 |
1631 | extern const char *__bdevname(dev_t, char *buffer); | 1629 | extern const char *__bdevname(dev_t, char *buffer); |
1632 | extern const char *bdevname(struct block_device *bdev, char *buffer); | 1630 | extern const char *bdevname(struct block_device *bdev, char *buffer); |
1633 | extern struct block_device *lookup_bdev(const char *); | 1631 | extern struct block_device *lookup_bdev(const char *); |
1634 | extern struct block_device *open_bdev_excl(const char *, int, void *); | 1632 | extern struct block_device *open_bdev_excl(const char *, int, void *); |
1635 | extern void close_bdev_excl(struct block_device *); | 1633 | extern void close_bdev_excl(struct block_device *); |
1636 | extern void blkdev_show(struct seq_file *,off_t); | 1634 | extern void blkdev_show(struct seq_file *,off_t); |
1637 | #else | 1635 | #else |
1638 | #define BLKDEV_MAJOR_HASH_SIZE 0 | 1636 | #define BLKDEV_MAJOR_HASH_SIZE 0 |
1639 | #endif | 1637 | #endif |
1640 | 1638 | ||
1641 | extern void init_special_inode(struct inode *, umode_t, dev_t); | 1639 | extern void init_special_inode(struct inode *, umode_t, dev_t); |
1642 | 1640 | ||
1643 | /* Invalid inode operations -- fs/bad_inode.c */ | 1641 | /* Invalid inode operations -- fs/bad_inode.c */ |
1644 | extern void make_bad_inode(struct inode *); | 1642 | extern void make_bad_inode(struct inode *); |
1645 | extern int is_bad_inode(struct inode *); | 1643 | extern int is_bad_inode(struct inode *); |
1646 | 1644 | ||
1647 | extern const struct file_operations read_fifo_fops; | 1645 | extern const struct file_operations read_fifo_fops; |
1648 | extern const struct file_operations write_fifo_fops; | 1646 | extern const struct file_operations write_fifo_fops; |
1649 | extern const struct file_operations rdwr_fifo_fops; | 1647 | extern const struct file_operations rdwr_fifo_fops; |
1650 | 1648 | ||
1651 | extern int fs_may_remount_ro(struct super_block *); | 1649 | extern int fs_may_remount_ro(struct super_block *); |
1652 | 1650 | ||
1653 | #ifdef CONFIG_BLOCK | 1651 | #ifdef CONFIG_BLOCK |
1654 | /* | 1652 | /* |
1655 | * return READ, READA, or WRITE | 1653 | * return READ, READA, or WRITE |
1656 | */ | 1654 | */ |
1657 | #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) | 1655 | #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) |
1658 | 1656 | ||
1659 | /* | 1657 | /* |
1660 | * return data direction, READ or WRITE | 1658 | * return data direction, READ or WRITE |
1661 | */ | 1659 | */ |
1662 | #define bio_data_dir(bio) ((bio)->bi_rw & 1) | 1660 | #define bio_data_dir(bio) ((bio)->bi_rw & 1) |
1663 | 1661 | ||
1664 | extern int check_disk_change(struct block_device *); | 1662 | extern int check_disk_change(struct block_device *); |
1665 | extern int __invalidate_device(struct block_device *); | 1663 | extern int __invalidate_device(struct block_device *); |
1666 | extern int invalidate_partition(struct gendisk *, int); | 1664 | extern int invalidate_partition(struct gendisk *, int); |
1667 | #endif | 1665 | #endif |
1668 | extern int invalidate_inodes(struct super_block *); | 1666 | extern int invalidate_inodes(struct super_block *); |
1669 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, | 1667 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, |
1670 | pgoff_t start, pgoff_t end, | 1668 | pgoff_t start, pgoff_t end, |
1671 | bool be_atomic); | 1669 | bool be_atomic); |
1672 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | 1670 | unsigned long invalidate_mapping_pages(struct address_space *mapping, |
1673 | pgoff_t start, pgoff_t end); | 1671 | pgoff_t start, pgoff_t end); |
1674 | 1672 | ||
1675 | static inline unsigned long __deprecated | 1673 | static inline unsigned long __deprecated |
1676 | invalidate_inode_pages(struct address_space *mapping) | 1674 | invalidate_inode_pages(struct address_space *mapping) |
1677 | { | 1675 | { |
1678 | return invalidate_mapping_pages(mapping, 0, ~0UL); | 1676 | return invalidate_mapping_pages(mapping, 0, ~0UL); |
1679 | } | 1677 | } |
1680 | 1678 | ||
1681 | static inline void invalidate_remote_inode(struct inode *inode) | 1679 | static inline void invalidate_remote_inode(struct inode *inode) |
1682 | { | 1680 | { |
1683 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 1681 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
1684 | S_ISLNK(inode->i_mode)) | 1682 | S_ISLNK(inode->i_mode)) |
1685 | invalidate_mapping_pages(inode->i_mapping, 0, -1); | 1683 | invalidate_mapping_pages(inode->i_mapping, 0, -1); |
1686 | } | 1684 | } |
1687 | extern int invalidate_inode_pages2(struct address_space *mapping); | 1685 | extern int invalidate_inode_pages2(struct address_space *mapping); |
1688 | extern int invalidate_inode_pages2_range(struct address_space *mapping, | 1686 | extern int invalidate_inode_pages2_range(struct address_space *mapping, |
1689 | pgoff_t start, pgoff_t end); | 1687 | pgoff_t start, pgoff_t end); |
1690 | extern int write_inode_now(struct inode *, int); | 1688 | extern int write_inode_now(struct inode *, int); |
1691 | extern int filemap_fdatawrite(struct address_space *); | 1689 | extern int filemap_fdatawrite(struct address_space *); |
1692 | extern int filemap_flush(struct address_space *); | 1690 | extern int filemap_flush(struct address_space *); |
1693 | extern int filemap_fdatawait(struct address_space *); | 1691 | extern int filemap_fdatawait(struct address_space *); |
1694 | extern int filemap_write_and_wait(struct address_space *mapping); | 1692 | extern int filemap_write_and_wait(struct address_space *mapping); |
1695 | extern int filemap_write_and_wait_range(struct address_space *mapping, | 1693 | extern int filemap_write_and_wait_range(struct address_space *mapping, |
1696 | loff_t lstart, loff_t lend); | 1694 | loff_t lstart, loff_t lend); |
1697 | extern int wait_on_page_writeback_range(struct address_space *mapping, | 1695 | extern int wait_on_page_writeback_range(struct address_space *mapping, |
1698 | pgoff_t start, pgoff_t end); | 1696 | pgoff_t start, pgoff_t end); |
1699 | extern int __filemap_fdatawrite_range(struct address_space *mapping, | 1697 | extern int __filemap_fdatawrite_range(struct address_space *mapping, |
1700 | loff_t start, loff_t end, int sync_mode); | 1698 | loff_t start, loff_t end, int sync_mode); |
1701 | 1699 | ||
1702 | extern long do_fsync(struct file *file, int datasync); | 1700 | extern long do_fsync(struct file *file, int datasync); |
1703 | extern void sync_supers(void); | 1701 | extern void sync_supers(void); |
1704 | extern void sync_filesystems(int wait); | 1702 | extern void sync_filesystems(int wait); |
1705 | extern void __fsync_super(struct super_block *sb); | 1703 | extern void __fsync_super(struct super_block *sb); |
1706 | extern void emergency_sync(void); | 1704 | extern void emergency_sync(void); |
1707 | extern void emergency_remount(void); | 1705 | extern void emergency_remount(void); |
1708 | extern int do_remount_sb(struct super_block *sb, int flags, | 1706 | extern int do_remount_sb(struct super_block *sb, int flags, |
1709 | void *data, int force); | 1707 | void *data, int force); |
1710 | #ifdef CONFIG_BLOCK | 1708 | #ifdef CONFIG_BLOCK |
1711 | extern sector_t bmap(struct inode *, sector_t); | 1709 | extern sector_t bmap(struct inode *, sector_t); |
1712 | #endif | 1710 | #endif |
1713 | extern int notify_change(struct dentry *, struct iattr *); | 1711 | extern int notify_change(struct dentry *, struct iattr *); |
1714 | extern int permission(struct inode *, int, struct nameidata *); | 1712 | extern int permission(struct inode *, int, struct nameidata *); |
1715 | extern int generic_permission(struct inode *, int, | 1713 | extern int generic_permission(struct inode *, int, |
1716 | int (*check_acl)(struct inode *, int)); | 1714 | int (*check_acl)(struct inode *, int)); |
1717 | 1715 | ||
1718 | extern int get_write_access(struct inode *); | 1716 | extern int get_write_access(struct inode *); |
1719 | extern int deny_write_access(struct file *); | 1717 | extern int deny_write_access(struct file *); |
1720 | static inline void put_write_access(struct inode * inode) | 1718 | static inline void put_write_access(struct inode * inode) |
1721 | { | 1719 | { |
1722 | atomic_dec(&inode->i_writecount); | 1720 | atomic_dec(&inode->i_writecount); |
1723 | } | 1721 | } |
1724 | static inline void allow_write_access(struct file *file) | 1722 | static inline void allow_write_access(struct file *file) |
1725 | { | 1723 | { |
1726 | if (file) | 1724 | if (file) |
1727 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); | 1725 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); |
1728 | } | 1726 | } |
1729 | extern int do_pipe(int *); | 1727 | extern int do_pipe(int *); |
1730 | extern struct file *create_read_pipe(struct file *f); | 1728 | extern struct file *create_read_pipe(struct file *f); |
1731 | extern struct file *create_write_pipe(void); | 1729 | extern struct file *create_write_pipe(void); |
1732 | extern void free_write_pipe(struct file *); | 1730 | extern void free_write_pipe(struct file *); |
1733 | 1731 | ||
1734 | extern int open_namei(int dfd, const char *, int, int, struct nameidata *); | 1732 | extern int open_namei(int dfd, const char *, int, int, struct nameidata *); |
1735 | extern int may_open(struct nameidata *, int, int); | 1733 | extern int may_open(struct nameidata *, int, int); |
1736 | 1734 | ||
1737 | extern int kernel_read(struct file *, unsigned long, char *, unsigned long); | 1735 | extern int kernel_read(struct file *, unsigned long, char *, unsigned long); |
1738 | extern struct file * open_exec(const char *); | 1736 | extern struct file * open_exec(const char *); |
1739 | 1737 | ||
1740 | /* fs/dcache.c -- generic fs support functions */ | 1738 | /* fs/dcache.c -- generic fs support functions */ |
1741 | extern int is_subdir(struct dentry *, struct dentry *); | 1739 | extern int is_subdir(struct dentry *, struct dentry *); |
1742 | extern ino_t find_inode_number(struct dentry *, struct qstr *); | 1740 | extern ino_t find_inode_number(struct dentry *, struct qstr *); |
1743 | 1741 | ||
1744 | #include <linux/err.h> | 1742 | #include <linux/err.h> |
1745 | 1743 | ||
1746 | /* needed for stackable file system support */ | 1744 | /* needed for stackable file system support */ |
1747 | extern loff_t default_llseek(struct file *file, loff_t offset, int origin); | 1745 | extern loff_t default_llseek(struct file *file, loff_t offset, int origin); |
1748 | 1746 | ||
1749 | extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); | 1747 | extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); |
1750 | 1748 | ||
1751 | extern void inode_init_once(struct inode *); | 1749 | extern void inode_init_once(struct inode *); |
1752 | extern void iput(struct inode *); | 1750 | extern void iput(struct inode *); |
1753 | extern struct inode * igrab(struct inode *); | 1751 | extern struct inode * igrab(struct inode *); |
1754 | extern ino_t iunique(struct super_block *, ino_t); | 1752 | extern ino_t iunique(struct super_block *, ino_t); |
1755 | extern int inode_needs_sync(struct inode *inode); | 1753 | extern int inode_needs_sync(struct inode *inode); |
1756 | extern void generic_delete_inode(struct inode *inode); | 1754 | extern void generic_delete_inode(struct inode *inode); |
1757 | extern void generic_drop_inode(struct inode *inode); | 1755 | extern void generic_drop_inode(struct inode *inode); |
1758 | 1756 | ||
1759 | extern struct inode *ilookup5_nowait(struct super_block *sb, | 1757 | extern struct inode *ilookup5_nowait(struct super_block *sb, |
1760 | unsigned long hashval, int (*test)(struct inode *, void *), | 1758 | unsigned long hashval, int (*test)(struct inode *, void *), |
1761 | void *data); | 1759 | void *data); |
1762 | extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, | 1760 | extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, |
1763 | int (*test)(struct inode *, void *), void *data); | 1761 | int (*test)(struct inode *, void *), void *data); |
1764 | extern struct inode *ilookup(struct super_block *sb, unsigned long ino); | 1762 | extern struct inode *ilookup(struct super_block *sb, unsigned long ino); |
1765 | 1763 | ||
1766 | extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); | 1764 | extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); |
1767 | extern struct inode * iget_locked(struct super_block *, unsigned long); | 1765 | extern struct inode * iget_locked(struct super_block *, unsigned long); |
1768 | extern void unlock_new_inode(struct inode *); | 1766 | extern void unlock_new_inode(struct inode *); |
1769 | |||
1770 | static inline struct inode *iget(struct super_block *sb, unsigned long ino) | ||
1771 | { | ||
1772 | struct inode *inode = iget_locked(sb, ino); | ||
1773 | |||
1774 | if (inode && (inode->i_state & I_NEW)) { | ||
1775 | sb->s_op->read_inode(inode); | ||
1776 | unlock_new_inode(inode); | ||
1777 | } | ||
1778 | |||
1779 | return inode; | ||
1780 | } | ||
1781 | 1767 | ||
1782 | extern void __iget(struct inode * inode); | 1768 | extern void __iget(struct inode * inode); |
1783 | extern void iget_failed(struct inode *); | 1769 | extern void iget_failed(struct inode *); |
1784 | extern void clear_inode(struct inode *); | 1770 | extern void clear_inode(struct inode *); |
1785 | extern void destroy_inode(struct inode *); | 1771 | extern void destroy_inode(struct inode *); |
1786 | extern struct inode *new_inode(struct super_block *); | 1772 | extern struct inode *new_inode(struct super_block *); |
1787 | extern int __remove_suid(struct dentry *, int); | 1773 | extern int __remove_suid(struct dentry *, int); |
1788 | extern int should_remove_suid(struct dentry *); | 1774 | extern int should_remove_suid(struct dentry *); |
1789 | extern int remove_suid(struct dentry *); | 1775 | extern int remove_suid(struct dentry *); |
1790 | 1776 | ||
1791 | extern void __insert_inode_hash(struct inode *, unsigned long hashval); | 1777 | extern void __insert_inode_hash(struct inode *, unsigned long hashval); |
1792 | extern void remove_inode_hash(struct inode *); | 1778 | extern void remove_inode_hash(struct inode *); |
1793 | static inline void insert_inode_hash(struct inode *inode) { | 1779 | static inline void insert_inode_hash(struct inode *inode) { |
1794 | __insert_inode_hash(inode, inode->i_ino); | 1780 | __insert_inode_hash(inode, inode->i_ino); |
1795 | } | 1781 | } |
1796 | 1782 | ||
1797 | extern struct file * get_empty_filp(void); | 1783 | extern struct file * get_empty_filp(void); |
1798 | extern void file_move(struct file *f, struct list_head *list); | 1784 | extern void file_move(struct file *f, struct list_head *list); |
1799 | extern void file_kill(struct file *f); | 1785 | extern void file_kill(struct file *f); |
1800 | #ifdef CONFIG_BLOCK | 1786 | #ifdef CONFIG_BLOCK |
1801 | struct bio; | 1787 | struct bio; |
1802 | extern void submit_bio(int, struct bio *); | 1788 | extern void submit_bio(int, struct bio *); |
1803 | extern int bdev_read_only(struct block_device *); | 1789 | extern int bdev_read_only(struct block_device *); |
1804 | #endif | 1790 | #endif |
1805 | extern int set_blocksize(struct block_device *, int); | 1791 | extern int set_blocksize(struct block_device *, int); |
1806 | extern int sb_set_blocksize(struct super_block *, int); | 1792 | extern int sb_set_blocksize(struct super_block *, int); |
1807 | extern int sb_min_blocksize(struct super_block *, int); | 1793 | extern int sb_min_blocksize(struct super_block *, int); |
1808 | extern int sb_has_dirty_inodes(struct super_block *); | 1794 | extern int sb_has_dirty_inodes(struct super_block *); |
1809 | 1795 | ||
1810 | extern int generic_file_mmap(struct file *, struct vm_area_struct *); | 1796 | extern int generic_file_mmap(struct file *, struct vm_area_struct *); |
1811 | extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); | 1797 | extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); |
1812 | extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); | 1798 | extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); |
1813 | int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); | 1799 | int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); |
1814 | extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); | 1800 | extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); |
1815 | extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); | 1801 | extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); |
1816 | extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, | 1802 | extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, |
1817 | unsigned long, loff_t); | 1803 | unsigned long, loff_t); |
1818 | extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, | 1804 | extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, |
1819 | unsigned long *, loff_t, loff_t *, size_t, size_t); | 1805 | unsigned long *, loff_t, loff_t *, size_t, size_t); |
1820 | extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, | 1806 | extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, |
1821 | unsigned long, loff_t, loff_t *, size_t, ssize_t); | 1807 | unsigned long, loff_t, loff_t *, size_t, ssize_t); |
1822 | extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); | 1808 | extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); |
1823 | extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); | 1809 | extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); |
1824 | extern void do_generic_mapping_read(struct address_space *mapping, | 1810 | extern void do_generic_mapping_read(struct address_space *mapping, |
1825 | struct file_ra_state *, struct file *, | 1811 | struct file_ra_state *, struct file *, |
1826 | loff_t *, read_descriptor_t *, read_actor_t); | 1812 | loff_t *, read_descriptor_t *, read_actor_t); |
1827 | extern int generic_segment_checks(const struct iovec *iov, | 1813 | extern int generic_segment_checks(const struct iovec *iov, |
1828 | unsigned long *nr_segs, size_t *count, int access_flags); | 1814 | unsigned long *nr_segs, size_t *count, int access_flags); |
1829 | 1815 | ||
1830 | /* fs/splice.c */ | 1816 | /* fs/splice.c */ |
1831 | extern ssize_t generic_file_splice_read(struct file *, loff_t *, | 1817 | extern ssize_t generic_file_splice_read(struct file *, loff_t *, |
1832 | struct pipe_inode_info *, size_t, unsigned int); | 1818 | struct pipe_inode_info *, size_t, unsigned int); |
1833 | extern ssize_t generic_file_splice_write(struct pipe_inode_info *, | 1819 | extern ssize_t generic_file_splice_write(struct pipe_inode_info *, |
1834 | struct file *, loff_t *, size_t, unsigned int); | 1820 | struct file *, loff_t *, size_t, unsigned int); |
1835 | extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *, | 1821 | extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *, |
1836 | struct file *, loff_t *, size_t, unsigned int); | 1822 | struct file *, loff_t *, size_t, unsigned int); |
1837 | extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, | 1823 | extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, |
1838 | struct file *out, loff_t *, size_t len, unsigned int flags); | 1824 | struct file *out, loff_t *, size_t len, unsigned int flags); |
1839 | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | 1825 | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, |
1840 | size_t len, unsigned int flags); | 1826 | size_t len, unsigned int flags); |
1841 | 1827 | ||
1842 | extern void | 1828 | extern void |
1843 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); | 1829 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); |
1844 | extern loff_t no_llseek(struct file *file, loff_t offset, int origin); | 1830 | extern loff_t no_llseek(struct file *file, loff_t offset, int origin); |
1845 | extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); | 1831 | extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); |
1846 | extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); | 1832 | extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); |
1847 | extern int generic_file_open(struct inode * inode, struct file * filp); | 1833 | extern int generic_file_open(struct inode * inode, struct file * filp); |
1848 | extern int nonseekable_open(struct inode * inode, struct file * filp); | 1834 | extern int nonseekable_open(struct inode * inode, struct file * filp); |
1849 | 1835 | ||
1850 | #ifdef CONFIG_FS_XIP | 1836 | #ifdef CONFIG_FS_XIP |
1851 | extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, | 1837 | extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, |
1852 | loff_t *ppos); | 1838 | loff_t *ppos); |
1853 | extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); | 1839 | extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); |
1854 | extern ssize_t xip_file_write(struct file *filp, const char __user *buf, | 1840 | extern ssize_t xip_file_write(struct file *filp, const char __user *buf, |
1855 | size_t len, loff_t *ppos); | 1841 | size_t len, loff_t *ppos); |
1856 | extern int xip_truncate_page(struct address_space *mapping, loff_t from); | 1842 | extern int xip_truncate_page(struct address_space *mapping, loff_t from); |
1857 | #else | 1843 | #else |
1858 | static inline int xip_truncate_page(struct address_space *mapping, loff_t from) | 1844 | static inline int xip_truncate_page(struct address_space *mapping, loff_t from) |
1859 | { | 1845 | { |
1860 | return 0; | 1846 | return 0; |
1861 | } | 1847 | } |
1862 | #endif | 1848 | #endif |
1863 | 1849 | ||
1864 | static inline void do_generic_file_read(struct file * filp, loff_t *ppos, | 1850 | static inline void do_generic_file_read(struct file * filp, loff_t *ppos, |
1865 | read_descriptor_t * desc, | 1851 | read_descriptor_t * desc, |
1866 | read_actor_t actor) | 1852 | read_actor_t actor) |
1867 | { | 1853 | { |
1868 | do_generic_mapping_read(filp->f_mapping, | 1854 | do_generic_mapping_read(filp->f_mapping, |
1869 | &filp->f_ra, | 1855 | &filp->f_ra, |
1870 | filp, | 1856 | filp, |
1871 | ppos, | 1857 | ppos, |
1872 | desc, | 1858 | desc, |
1873 | actor); | 1859 | actor); |
1874 | } | 1860 | } |
1875 | 1861 | ||
1876 | #ifdef CONFIG_BLOCK | 1862 | #ifdef CONFIG_BLOCK |
1877 | ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 1863 | ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
1878 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 1864 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
1879 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 1865 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
1880 | int lock_type); | 1866 | int lock_type); |
1881 | 1867 | ||
1882 | enum { | 1868 | enum { |
1883 | DIO_LOCKING = 1, /* need locking between buffered and direct access */ | 1869 | DIO_LOCKING = 1, /* need locking between buffered and direct access */ |
1884 | DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ | 1870 | DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ |
1885 | DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ | 1871 | DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ |
1886 | }; | 1872 | }; |
1887 | 1873 | ||
1888 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, | 1874 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, |
1889 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 1875 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
1890 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 1876 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
1891 | dio_iodone_t end_io) | 1877 | dio_iodone_t end_io) |
1892 | { | 1878 | { |
1893 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 1879 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
1894 | nr_segs, get_block, end_io, DIO_LOCKING); | 1880 | nr_segs, get_block, end_io, DIO_LOCKING); |
1895 | } | 1881 | } |
1896 | 1882 | ||
1897 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, | 1883 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, |
1898 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 1884 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
1899 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 1885 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
1900 | dio_iodone_t end_io) | 1886 | dio_iodone_t end_io) |
1901 | { | 1887 | { |
1902 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 1888 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
1903 | nr_segs, get_block, end_io, DIO_NO_LOCKING); | 1889 | nr_segs, get_block, end_io, DIO_NO_LOCKING); |
1904 | } | 1890 | } |
1905 | 1891 | ||
1906 | static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, | 1892 | static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, |
1907 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 1893 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
1908 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 1894 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
1909 | dio_iodone_t end_io) | 1895 | dio_iodone_t end_io) |
1910 | { | 1896 | { |
1911 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 1897 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
1912 | nr_segs, get_block, end_io, DIO_OWN_LOCKING); | 1898 | nr_segs, get_block, end_io, DIO_OWN_LOCKING); |
1913 | } | 1899 | } |
1914 | #endif | 1900 | #endif |
1915 | 1901 | ||
1916 | extern const struct file_operations generic_ro_fops; | 1902 | extern const struct file_operations generic_ro_fops; |
1917 | 1903 | ||
1918 | #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) | 1904 | #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) |
1919 | 1905 | ||
1920 | extern int vfs_readlink(struct dentry *, char __user *, int, const char *); | 1906 | extern int vfs_readlink(struct dentry *, char __user *, int, const char *); |
1921 | extern int vfs_follow_link(struct nameidata *, const char *); | 1907 | extern int vfs_follow_link(struct nameidata *, const char *); |
1922 | extern int page_readlink(struct dentry *, char __user *, int); | 1908 | extern int page_readlink(struct dentry *, char __user *, int); |
1923 | extern void *page_follow_link_light(struct dentry *, struct nameidata *); | 1909 | extern void *page_follow_link_light(struct dentry *, struct nameidata *); |
1924 | extern void page_put_link(struct dentry *, struct nameidata *, void *); | 1910 | extern void page_put_link(struct dentry *, struct nameidata *, void *); |
1925 | extern int __page_symlink(struct inode *inode, const char *symname, int len, | 1911 | extern int __page_symlink(struct inode *inode, const char *symname, int len, |
1926 | gfp_t gfp_mask); | 1912 | gfp_t gfp_mask); |
1927 | extern int page_symlink(struct inode *inode, const char *symname, int len); | 1913 | extern int page_symlink(struct inode *inode, const char *symname, int len); |
1928 | extern const struct inode_operations page_symlink_inode_operations; | 1914 | extern const struct inode_operations page_symlink_inode_operations; |
1929 | extern int generic_readlink(struct dentry *, char __user *, int); | 1915 | extern int generic_readlink(struct dentry *, char __user *, int); |
1930 | extern void generic_fillattr(struct inode *, struct kstat *); | 1916 | extern void generic_fillattr(struct inode *, struct kstat *); |
1931 | extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 1917 | extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
1932 | void inode_add_bytes(struct inode *inode, loff_t bytes); | 1918 | void inode_add_bytes(struct inode *inode, loff_t bytes); |
1933 | void inode_sub_bytes(struct inode *inode, loff_t bytes); | 1919 | void inode_sub_bytes(struct inode *inode, loff_t bytes); |
1934 | loff_t inode_get_bytes(struct inode *inode); | 1920 | loff_t inode_get_bytes(struct inode *inode); |
1935 | void inode_set_bytes(struct inode *inode, loff_t bytes); | 1921 | void inode_set_bytes(struct inode *inode, loff_t bytes); |
1936 | 1922 | ||
1937 | extern int vfs_readdir(struct file *, filldir_t, void *); | 1923 | extern int vfs_readdir(struct file *, filldir_t, void *); |
1938 | 1924 | ||
1939 | extern int vfs_stat(char __user *, struct kstat *); | 1925 | extern int vfs_stat(char __user *, struct kstat *); |
1940 | extern int vfs_lstat(char __user *, struct kstat *); | 1926 | extern int vfs_lstat(char __user *, struct kstat *); |
1941 | extern int vfs_stat_fd(int dfd, char __user *, struct kstat *); | 1927 | extern int vfs_stat_fd(int dfd, char __user *, struct kstat *); |
1942 | extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *); | 1928 | extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *); |
1943 | extern int vfs_fstat(unsigned int, struct kstat *); | 1929 | extern int vfs_fstat(unsigned int, struct kstat *); |
1944 | 1930 | ||
1945 | extern long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); | 1931 | extern long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); |
1946 | extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, | 1932 | extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, |
1947 | unsigned long arg); | 1933 | unsigned long arg); |
1948 | 1934 | ||
1949 | extern void get_filesystem(struct file_system_type *fs); | 1935 | extern void get_filesystem(struct file_system_type *fs); |
1950 | extern void put_filesystem(struct file_system_type *fs); | 1936 | extern void put_filesystem(struct file_system_type *fs); |
1951 | extern struct file_system_type *get_fs_type(const char *name); | 1937 | extern struct file_system_type *get_fs_type(const char *name); |
1952 | extern struct super_block *get_super(struct block_device *); | 1938 | extern struct super_block *get_super(struct block_device *); |
1953 | extern struct super_block *user_get_super(dev_t); | 1939 | extern struct super_block *user_get_super(dev_t); |
1954 | extern void drop_super(struct super_block *sb); | 1940 | extern void drop_super(struct super_block *sb); |
1955 | 1941 | ||
1956 | extern int dcache_dir_open(struct inode *, struct file *); | 1942 | extern int dcache_dir_open(struct inode *, struct file *); |
1957 | extern int dcache_dir_close(struct inode *, struct file *); | 1943 | extern int dcache_dir_close(struct inode *, struct file *); |
1958 | extern loff_t dcache_dir_lseek(struct file *, loff_t, int); | 1944 | extern loff_t dcache_dir_lseek(struct file *, loff_t, int); |
1959 | extern int dcache_readdir(struct file *, void *, filldir_t); | 1945 | extern int dcache_readdir(struct file *, void *, filldir_t); |
1960 | extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 1946 | extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
1961 | extern int simple_statfs(struct dentry *, struct kstatfs *); | 1947 | extern int simple_statfs(struct dentry *, struct kstatfs *); |
1962 | extern int simple_link(struct dentry *, struct inode *, struct dentry *); | 1948 | extern int simple_link(struct dentry *, struct inode *, struct dentry *); |
1963 | extern int simple_unlink(struct inode *, struct dentry *); | 1949 | extern int simple_unlink(struct inode *, struct dentry *); |
1964 | extern int simple_rmdir(struct inode *, struct dentry *); | 1950 | extern int simple_rmdir(struct inode *, struct dentry *); |
1965 | extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); | 1951 | extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); |
1966 | extern int simple_sync_file(struct file *, struct dentry *, int); | 1952 | extern int simple_sync_file(struct file *, struct dentry *, int); |
1967 | extern int simple_empty(struct dentry *); | 1953 | extern int simple_empty(struct dentry *); |
1968 | extern int simple_readpage(struct file *file, struct page *page); | 1954 | extern int simple_readpage(struct file *file, struct page *page); |
1969 | extern int simple_prepare_write(struct file *file, struct page *page, | 1955 | extern int simple_prepare_write(struct file *file, struct page *page, |
1970 | unsigned offset, unsigned to); | 1956 | unsigned offset, unsigned to); |
1971 | extern int simple_write_begin(struct file *file, struct address_space *mapping, | 1957 | extern int simple_write_begin(struct file *file, struct address_space *mapping, |
1972 | loff_t pos, unsigned len, unsigned flags, | 1958 | loff_t pos, unsigned len, unsigned flags, |
1973 | struct page **pagep, void **fsdata); | 1959 | struct page **pagep, void **fsdata); |
1974 | extern int simple_write_end(struct file *file, struct address_space *mapping, | 1960 | extern int simple_write_end(struct file *file, struct address_space *mapping, |
1975 | loff_t pos, unsigned len, unsigned copied, | 1961 | loff_t pos, unsigned len, unsigned copied, |
1976 | struct page *page, void *fsdata); | 1962 | struct page *page, void *fsdata); |
1977 | 1963 | ||
1978 | extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); | 1964 | extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); |
1979 | extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); | 1965 | extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); |
1980 | extern const struct file_operations simple_dir_operations; | 1966 | extern const struct file_operations simple_dir_operations; |
1981 | extern const struct inode_operations simple_dir_inode_operations; | 1967 | extern const struct inode_operations simple_dir_inode_operations; |
1982 | struct tree_descr { char *name; const struct file_operations *ops; int mode; }; | 1968 | struct tree_descr { char *name; const struct file_operations *ops; int mode; }; |
1983 | struct dentry *d_alloc_name(struct dentry *, const char *); | 1969 | struct dentry *d_alloc_name(struct dentry *, const char *); |
1984 | extern int simple_fill_super(struct super_block *, int, struct tree_descr *); | 1970 | extern int simple_fill_super(struct super_block *, int, struct tree_descr *); |
1985 | extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); | 1971 | extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); |
1986 | extern void simple_release_fs(struct vfsmount **mount, int *count); | 1972 | extern void simple_release_fs(struct vfsmount **mount, int *count); |
1987 | 1973 | ||
1988 | extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t); | 1974 | extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t); |
1989 | 1975 | ||
1990 | #ifdef CONFIG_MIGRATION | 1976 | #ifdef CONFIG_MIGRATION |
1991 | extern int buffer_migrate_page(struct address_space *, | 1977 | extern int buffer_migrate_page(struct address_space *, |
1992 | struct page *, struct page *); | 1978 | struct page *, struct page *); |
1993 | #else | 1979 | #else |
1994 | #define buffer_migrate_page NULL | 1980 | #define buffer_migrate_page NULL |
1995 | #endif | 1981 | #endif |
1996 | 1982 | ||
1997 | extern int inode_change_ok(struct inode *, struct iattr *); | 1983 | extern int inode_change_ok(struct inode *, struct iattr *); |
1998 | extern int __must_check inode_setattr(struct inode *, struct iattr *); | 1984 | extern int __must_check inode_setattr(struct inode *, struct iattr *); |
1999 | 1985 | ||
2000 | extern void file_update_time(struct file *file); | 1986 | extern void file_update_time(struct file *file); |
2001 | 1987 | ||
2002 | static inline ino_t parent_ino(struct dentry *dentry) | 1988 | static inline ino_t parent_ino(struct dentry *dentry) |
2003 | { | 1989 | { |
2004 | ino_t res; | 1990 | ino_t res; |
2005 | 1991 | ||
2006 | spin_lock(&dentry->d_lock); | 1992 | spin_lock(&dentry->d_lock); |
2007 | res = dentry->d_parent->d_inode->i_ino; | 1993 | res = dentry->d_parent->d_inode->i_ino; |
2008 | spin_unlock(&dentry->d_lock); | 1994 | spin_unlock(&dentry->d_lock); |
2009 | return res; | 1995 | return res; |
2010 | } | 1996 | } |
2011 | 1997 | ||
2012 | /* kernel/fork.c */ | 1998 | /* kernel/fork.c */ |
2013 | extern int unshare_files(void); | 1999 | extern int unshare_files(void); |
2014 | 2000 | ||
2015 | /* Transaction based IO helpers */ | 2001 | /* Transaction based IO helpers */ |
2016 | 2002 | ||
2017 | /* | 2003 | /* |
2018 | * An argresp is stored in an allocated page and holds the | 2004 | * An argresp is stored in an allocated page and holds the |
2019 | * size of the argument or response, along with its content | 2005 | * size of the argument or response, along with its content |
2020 | */ | 2006 | */ |
2021 | struct simple_transaction_argresp { | 2007 | struct simple_transaction_argresp { |
2022 | ssize_t size; | 2008 | ssize_t size; |
2023 | char data[0]; | 2009 | char data[0]; |
2024 | }; | 2010 | }; |
2025 | 2011 | ||
2026 | #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) | 2012 | #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) |
2027 | 2013 | ||
2028 | char *simple_transaction_get(struct file *file, const char __user *buf, | 2014 | char *simple_transaction_get(struct file *file, const char __user *buf, |
2029 | size_t size); | 2015 | size_t size); |
2030 | ssize_t simple_transaction_read(struct file *file, char __user *buf, | 2016 | ssize_t simple_transaction_read(struct file *file, char __user *buf, |
2031 | size_t size, loff_t *pos); | 2017 | size_t size, loff_t *pos); |
2032 | int simple_transaction_release(struct inode *inode, struct file *file); | 2018 | int simple_transaction_release(struct inode *inode, struct file *file); |
2033 | 2019 | ||
2034 | static inline void simple_transaction_set(struct file *file, size_t n) | 2020 | static inline void simple_transaction_set(struct file *file, size_t n) |
2035 | { | 2021 | { |
2036 | struct simple_transaction_argresp *ar = file->private_data; | 2022 | struct simple_transaction_argresp *ar = file->private_data; |
2037 | 2023 | ||
2038 | BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); | 2024 | BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); |
2039 | 2025 | ||
2040 | /* | 2026 | /* |
2041 | * The barrier ensures that ar->size will really remain zero until | 2027 | * The barrier ensures that ar->size will really remain zero until |
2042 | * ar->data is ready for reading. | 2028 | * ar->data is ready for reading. |
2043 | */ | 2029 | */ |
2044 | smp_mb(); | 2030 | smp_mb(); |
2045 | ar->size = n; | 2031 | ar->size = n; |
2046 | } | 2032 | } |
2047 | 2033 | ||
2048 | /* | 2034 | /* |
2049 | * simple attribute files | 2035 | * simple attribute files |
2050 | * | 2036 | * |
2051 | * These attributes behave similar to those in sysfs: | 2037 | * These attributes behave similar to those in sysfs: |
2052 | * | 2038 | * |
2053 | * Writing to an attribute immediately sets a value, an open file can be | 2039 | * Writing to an attribute immediately sets a value, an open file can be |
2054 | * written to multiple times. | 2040 | * written to multiple times. |
2055 | * | 2041 | * |
2056 | * Reading from an attribute creates a buffer from the value that might get | 2042 | * Reading from an attribute creates a buffer from the value that might get |
2057 | * read with multiple read calls. When the attribute has been read | 2043 | * read with multiple read calls. When the attribute has been read |
2058 | * completely, no further read calls are possible until the file is opened | 2044 | * completely, no further read calls are possible until the file is opened |
2059 | * again. | 2045 | * again. |
2060 | * | 2046 | * |
2061 | * All attributes contain a text representation of a numeric value | 2047 | * All attributes contain a text representation of a numeric value |
2062 | * that are accessed with the get() and set() functions. | 2048 | * that are accessed with the get() and set() functions. |
2063 | */ | 2049 | */ |
2064 | #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ | 2050 | #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ |
2065 | static int __fops ## _open(struct inode *inode, struct file *file) \ | 2051 | static int __fops ## _open(struct inode *inode, struct file *file) \ |
2066 | { \ | 2052 | { \ |
2067 | __simple_attr_check_format(__fmt, 0ull); \ | 2053 | __simple_attr_check_format(__fmt, 0ull); \ |
2068 | return simple_attr_open(inode, file, __get, __set, __fmt); \ | 2054 | return simple_attr_open(inode, file, __get, __set, __fmt); \ |
2069 | } \ | 2055 | } \ |
2070 | static struct file_operations __fops = { \ | 2056 | static struct file_operations __fops = { \ |
2071 | .owner = THIS_MODULE, \ | 2057 | .owner = THIS_MODULE, \ |
2072 | .open = __fops ## _open, \ | 2058 | .open = __fops ## _open, \ |
2073 | .release = simple_attr_close, \ | 2059 | .release = simple_attr_close, \ |
2074 | .read = simple_attr_read, \ | 2060 | .read = simple_attr_read, \ |
2075 | .write = simple_attr_write, \ | 2061 | .write = simple_attr_write, \ |
2076 | }; | 2062 | }; |
2077 | 2063 | ||
2078 | static inline void __attribute__((format(printf, 1, 2))) | 2064 | static inline void __attribute__((format(printf, 1, 2))) |
2079 | __simple_attr_check_format(const char *fmt, ...) | 2065 | __simple_attr_check_format(const char *fmt, ...) |
2080 | { | 2066 | { |
2081 | /* don't do anything, just let the compiler check the arguments; */ | 2067 | /* don't do anything, just let the compiler check the arguments; */ |
2082 | } | 2068 | } |
2083 | 2069 | ||
2084 | int simple_attr_open(struct inode *inode, struct file *file, | 2070 | int simple_attr_open(struct inode *inode, struct file *file, |
2085 | u64 (*get)(void *), void (*set)(void *, u64), | 2071 | u64 (*get)(void *), void (*set)(void *, u64), |
2086 | const char *fmt); | 2072 | const char *fmt); |
2087 | int simple_attr_close(struct inode *inode, struct file *file); | 2073 | int simple_attr_close(struct inode *inode, struct file *file); |
2088 | ssize_t simple_attr_read(struct file *file, char __user *buf, | 2074 | ssize_t simple_attr_read(struct file *file, char __user *buf, |
2089 | size_t len, loff_t *ppos); | 2075 | size_t len, loff_t *ppos); |
2090 | ssize_t simple_attr_write(struct file *file, const char __user *buf, | 2076 | ssize_t simple_attr_write(struct file *file, const char __user *buf, |
2091 | size_t len, loff_t *ppos); | 2077 | size_t len, loff_t *ppos); |
2092 | 2078 | ||
2093 | 2079 | ||
2094 | #ifdef CONFIG_SECURITY | 2080 | #ifdef CONFIG_SECURITY |
2095 | static inline char *alloc_secdata(void) | 2081 | static inline char *alloc_secdata(void) |
2096 | { | 2082 | { |
2097 | return (char *)get_zeroed_page(GFP_KERNEL); | 2083 | return (char *)get_zeroed_page(GFP_KERNEL); |
2098 | } | 2084 | } |
2099 | 2085 | ||
2100 | static inline void free_secdata(void *secdata) | 2086 | static inline void free_secdata(void *secdata) |
2101 | { | 2087 | { |
2102 | free_page((unsigned long)secdata); | 2088 | free_page((unsigned long)secdata); |
2103 | } | 2089 | } |
2104 | #else | 2090 | #else |
2105 | static inline char *alloc_secdata(void) | 2091 | static inline char *alloc_secdata(void) |
2106 | { | 2092 | { |
2107 | return (char *)1; | 2093 | return (char *)1; |
2108 | } | 2094 | } |
2109 | 2095 | ||
2110 | static inline void free_secdata(void *secdata) | 2096 | static inline void free_secdata(void *secdata) |
2111 | { } | 2097 | { } |
2112 | #endif /* CONFIG_SECURITY */ | 2098 | #endif /* CONFIG_SECURITY */ |
2113 | 2099 | ||
2114 | struct ctl_table; | 2100 | struct ctl_table; |
2115 | int proc_nr_files(struct ctl_table *table, int write, struct file *filp, | 2101 | int proc_nr_files(struct ctl_table *table, int write, struct file *filp, |
2116 | void __user *buffer, size_t *lenp, loff_t *ppos); | 2102 | void __user *buffer, size_t *lenp, loff_t *ppos); |
2117 | 2103 | ||
2118 | int get_filesystem_list(char * buf); | 2104 | int get_filesystem_list(char * buf); |
2119 | 2105 | ||
2120 | #endif /* __KERNEL__ */ | 2106 | #endif /* __KERNEL__ */ |
2121 | #endif /* _LINUX_FS_H */ | 2107 | #endif /* _LINUX_FS_H */ |
2122 | 2108 |