Commit dddac6a7b445de95515f64fdf82fe5dc36c02f26
Committed by Rafael J. Wysocki
1 parent ec79be2687
Exists in master and in 7 other branches
PM / Hibernate: Replace bdget call with simple atomic_inc of i_count
Create bdgrab(). This function copies an existing reference to a block_device. It is safe to call from any context.

Hibernation code wishes to copy a reference to the active swap device. Right now it calls bdget() under a spinlock, but this is wrong because bdget() can sleep. It doesn't need a full bdget() because we already hold a reference to active swap devices (and the spinlock protects against swapoff).

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=13827

Signed-off-by: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
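The changed caller is not shown in this truncated diff; the following is a sketch only, illustrating the pattern the message describes (the lock and variable names are illustrative, not quoted from the patch):

	/* Old pattern: bdget() may sleep (it can allocate a new inode via
	 * iget5_locked()), so calling it with a spinlock held is a
	 * sleep-in-atomic bug. */
	spin_lock(&swap_lock);
	bdev = bdget(sis->bdev->bd_dev);	/* WRONG: may sleep */
	spin_unlock(&swap_lock);

	/* New pattern: an active swap device already holds a reference, and
	 * the spinlock keeps swapoff from dropping it, so copying that
	 * reference is a single atomic_inc() and never sleeps. */
	spin_lock(&swap_lock);
	bdev = bdgrab(sis->bdev);		/* safe in atomic context */
	spin_unlock(&swap_lock);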
Showing 3 changed files with 13 additions and 2 deletions
fs/block_dev.c
1 | /* | 1 | /* |
2 | * linux/fs/block_dev.c | 2 | * linux/fs/block_dev.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | 5 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/init.h> | 8 | #include <linux/init.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/fcntl.h> | 10 | #include <linux/fcntl.h> |
11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
12 | #include <linux/kmod.h> | 12 | #include <linux/kmod.h> |
13 | #include <linux/major.h> | 13 | #include <linux/major.h> |
14 | #include <linux/smp_lock.h> | 14 | #include <linux/smp_lock.h> |
15 | #include <linux/device_cgroup.h> | 15 | #include <linux/device_cgroup.h> |
16 | #include <linux/highmem.h> | 16 | #include <linux/highmem.h> |
17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/blkpg.h> | 19 | #include <linux/blkpg.h> |
20 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
21 | #include <linux/pagevec.h> | 21 | #include <linux/pagevec.h> |
22 | #include <linux/writeback.h> | 22 | #include <linux/writeback.h> |
23 | #include <linux/mpage.h> | 23 | #include <linux/mpage.h> |
24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
25 | #include <linux/uio.h> | 25 | #include <linux/uio.h> |
26 | #include <linux/namei.h> | 26 | #include <linux/namei.h> |
27 | #include <linux/log2.h> | 27 | #include <linux/log2.h> |
28 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
30 | #include "internal.h" | 30 | #include "internal.h" |
31 | 31 | ||
32 | struct bdev_inode { | 32 | struct bdev_inode { |
33 | struct block_device bdev; | 33 | struct block_device bdev; |
34 | struct inode vfs_inode; | 34 | struct inode vfs_inode; |
35 | }; | 35 | }; |
36 | 36 | ||
37 | static const struct address_space_operations def_blk_aops; | 37 | static const struct address_space_operations def_blk_aops; |
38 | 38 | ||
39 | static inline struct bdev_inode *BDEV_I(struct inode *inode) | 39 | static inline struct bdev_inode *BDEV_I(struct inode *inode) |
40 | { | 40 | { |
41 | return container_of(inode, struct bdev_inode, vfs_inode); | 41 | return container_of(inode, struct bdev_inode, vfs_inode); |
42 | } | 42 | } |
43 | 43 | ||
44 | inline struct block_device *I_BDEV(struct inode *inode) | 44 | inline struct block_device *I_BDEV(struct inode *inode) |
45 | { | 45 | { |
46 | return &BDEV_I(inode)->bdev; | 46 | return &BDEV_I(inode)->bdev; |
47 | } | 47 | } |
48 | 48 | ||
49 | EXPORT_SYMBOL(I_BDEV); | 49 | EXPORT_SYMBOL(I_BDEV); |
50 | 50 | ||
51 | static sector_t max_block(struct block_device *bdev) | 51 | static sector_t max_block(struct block_device *bdev) |
52 | { | 52 | { |
53 | sector_t retval = ~((sector_t)0); | 53 | sector_t retval = ~((sector_t)0); |
54 | loff_t sz = i_size_read(bdev->bd_inode); | 54 | loff_t sz = i_size_read(bdev->bd_inode); |
55 | 55 | ||
56 | if (sz) { | 56 | if (sz) { |
57 | unsigned int size = block_size(bdev); | 57 | unsigned int size = block_size(bdev); |
58 | unsigned int sizebits = blksize_bits(size); | 58 | unsigned int sizebits = blksize_bits(size); |
59 | retval = (sz >> sizebits); | 59 | retval = (sz >> sizebits); |
60 | } | 60 | } |
61 | return retval; | 61 | return retval; |
62 | } | 62 | } |
63 | 63 | ||
64 | /* Kill _all_ buffers and pagecache , dirty or not.. */ | 64 | /* Kill _all_ buffers and pagecache , dirty or not.. */ |
65 | static void kill_bdev(struct block_device *bdev) | 65 | static void kill_bdev(struct block_device *bdev) |
66 | { | 66 | { |
67 | if (bdev->bd_inode->i_mapping->nrpages == 0) | 67 | if (bdev->bd_inode->i_mapping->nrpages == 0) |
68 | return; | 68 | return; |
69 | invalidate_bh_lrus(); | 69 | invalidate_bh_lrus(); |
70 | truncate_inode_pages(bdev->bd_inode->i_mapping, 0); | 70 | truncate_inode_pages(bdev->bd_inode->i_mapping, 0); |
71 | } | 71 | } |
72 | 72 | ||
73 | int set_blocksize(struct block_device *bdev, int size) | 73 | int set_blocksize(struct block_device *bdev, int size) |
74 | { | 74 | { |
75 | /* Size must be a power of two, and between 512 and PAGE_SIZE */ | 75 | /* Size must be a power of two, and between 512 and PAGE_SIZE */ |
76 | if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) | 76 | if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) |
77 | return -EINVAL; | 77 | return -EINVAL; |
78 | 78 | ||
79 | /* Size cannot be smaller than the size supported by the device */ | 79 | /* Size cannot be smaller than the size supported by the device */ |
80 | if (size < bdev_logical_block_size(bdev)) | 80 | if (size < bdev_logical_block_size(bdev)) |
81 | return -EINVAL; | 81 | return -EINVAL; |
82 | 82 | ||
83 | /* Don't change the size if it is same as current */ | 83 | /* Don't change the size if it is same as current */ |
84 | if (bdev->bd_block_size != size) { | 84 | if (bdev->bd_block_size != size) { |
85 | sync_blockdev(bdev); | 85 | sync_blockdev(bdev); |
86 | bdev->bd_block_size = size; | 86 | bdev->bd_block_size = size; |
87 | bdev->bd_inode->i_blkbits = blksize_bits(size); | 87 | bdev->bd_inode->i_blkbits = blksize_bits(size); |
88 | kill_bdev(bdev); | 88 | kill_bdev(bdev); |
89 | } | 89 | } |
90 | return 0; | 90 | return 0; |
91 | } | 91 | } |
92 | 92 | ||
93 | EXPORT_SYMBOL(set_blocksize); | 93 | EXPORT_SYMBOL(set_blocksize); |
94 | 94 | ||
95 | int sb_set_blocksize(struct super_block *sb, int size) | 95 | int sb_set_blocksize(struct super_block *sb, int size) |
96 | { | 96 | { |
97 | if (set_blocksize(sb->s_bdev, size)) | 97 | if (set_blocksize(sb->s_bdev, size)) |
98 | return 0; | 98 | return 0; |
99 | /* If we get here, we know size is power of two | 99 | /* If we get here, we know size is power of two |
100 | * and it's value is between 512 and PAGE_SIZE */ | 100 | * and it's value is between 512 and PAGE_SIZE */ |
101 | sb->s_blocksize = size; | 101 | sb->s_blocksize = size; |
102 | sb->s_blocksize_bits = blksize_bits(size); | 102 | sb->s_blocksize_bits = blksize_bits(size); |
103 | return sb->s_blocksize; | 103 | return sb->s_blocksize; |
104 | } | 104 | } |
105 | 105 | ||
106 | EXPORT_SYMBOL(sb_set_blocksize); | 106 | EXPORT_SYMBOL(sb_set_blocksize); |
107 | 107 | ||
108 | int sb_min_blocksize(struct super_block *sb, int size) | 108 | int sb_min_blocksize(struct super_block *sb, int size) |
109 | { | 109 | { |
110 | int minsize = bdev_logical_block_size(sb->s_bdev); | 110 | int minsize = bdev_logical_block_size(sb->s_bdev); |
111 | if (size < minsize) | 111 | if (size < minsize) |
112 | size = minsize; | 112 | size = minsize; |
113 | return sb_set_blocksize(sb, size); | 113 | return sb_set_blocksize(sb, size); |
114 | } | 114 | } |
115 | 115 | ||
116 | EXPORT_SYMBOL(sb_min_blocksize); | 116 | EXPORT_SYMBOL(sb_min_blocksize); |
117 | 117 | ||
118 | static int | 118 | static int |
119 | blkdev_get_block(struct inode *inode, sector_t iblock, | 119 | blkdev_get_block(struct inode *inode, sector_t iblock, |
120 | struct buffer_head *bh, int create) | 120 | struct buffer_head *bh, int create) |
121 | { | 121 | { |
122 | if (iblock >= max_block(I_BDEV(inode))) { | 122 | if (iblock >= max_block(I_BDEV(inode))) { |
123 | if (create) | 123 | if (create) |
124 | return -EIO; | 124 | return -EIO; |
125 | 125 | ||
126 | /* | 126 | /* |
127 | * for reads, we're just trying to fill a partial page. | 127 | * for reads, we're just trying to fill a partial page. |
128 | * return a hole, they will have to call get_block again | 128 | * return a hole, they will have to call get_block again |
129 | * before they can fill it, and they will get -EIO at that | 129 | * before they can fill it, and they will get -EIO at that |
130 | * time | 130 | * time |
131 | */ | 131 | */ |
132 | return 0; | 132 | return 0; |
133 | } | 133 | } |
134 | bh->b_bdev = I_BDEV(inode); | 134 | bh->b_bdev = I_BDEV(inode); |
135 | bh->b_blocknr = iblock; | 135 | bh->b_blocknr = iblock; |
136 | set_buffer_mapped(bh); | 136 | set_buffer_mapped(bh); |
137 | return 0; | 137 | return 0; |
138 | } | 138 | } |
139 | 139 | ||
140 | static int | 140 | static int |
141 | blkdev_get_blocks(struct inode *inode, sector_t iblock, | 141 | blkdev_get_blocks(struct inode *inode, sector_t iblock, |
142 | struct buffer_head *bh, int create) | 142 | struct buffer_head *bh, int create) |
143 | { | 143 | { |
144 | sector_t end_block = max_block(I_BDEV(inode)); | 144 | sector_t end_block = max_block(I_BDEV(inode)); |
145 | unsigned long max_blocks = bh->b_size >> inode->i_blkbits; | 145 | unsigned long max_blocks = bh->b_size >> inode->i_blkbits; |
146 | 146 | ||
147 | if ((iblock + max_blocks) > end_block) { | 147 | if ((iblock + max_blocks) > end_block) { |
148 | max_blocks = end_block - iblock; | 148 | max_blocks = end_block - iblock; |
149 | if ((long)max_blocks <= 0) { | 149 | if ((long)max_blocks <= 0) { |
150 | if (create) | 150 | if (create) |
151 | return -EIO; /* write fully beyond EOF */ | 151 | return -EIO; /* write fully beyond EOF */ |
152 | /* | 152 | /* |
153 | * It is a read which is fully beyond EOF. We return | 153 | * It is a read which is fully beyond EOF. We return |
154 | * a !buffer_mapped buffer | 154 | * a !buffer_mapped buffer |
155 | */ | 155 | */ |
156 | max_blocks = 0; | 156 | max_blocks = 0; |
157 | } | 157 | } |
158 | } | 158 | } |
159 | 159 | ||
160 | bh->b_bdev = I_BDEV(inode); | 160 | bh->b_bdev = I_BDEV(inode); |
161 | bh->b_blocknr = iblock; | 161 | bh->b_blocknr = iblock; |
162 | bh->b_size = max_blocks << inode->i_blkbits; | 162 | bh->b_size = max_blocks << inode->i_blkbits; |
163 | if (max_blocks) | 163 | if (max_blocks) |
164 | set_buffer_mapped(bh); | 164 | set_buffer_mapped(bh); |
165 | return 0; | 165 | return 0; |
166 | } | 166 | } |
167 | 167 | ||
168 | static ssize_t | 168 | static ssize_t |
169 | blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | 169 | blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, |
170 | loff_t offset, unsigned long nr_segs) | 170 | loff_t offset, unsigned long nr_segs) |
171 | { | 171 | { |
172 | struct file *file = iocb->ki_filp; | 172 | struct file *file = iocb->ki_filp; |
173 | struct inode *inode = file->f_mapping->host; | 173 | struct inode *inode = file->f_mapping->host; |
174 | 174 | ||
175 | return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), | 175 | return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), |
176 | iov, offset, nr_segs, blkdev_get_blocks, NULL); | 176 | iov, offset, nr_segs, blkdev_get_blocks, NULL); |
177 | } | 177 | } |
178 | 178 | ||
179 | int __sync_blockdev(struct block_device *bdev, int wait) | 179 | int __sync_blockdev(struct block_device *bdev, int wait) |
180 | { | 180 | { |
181 | if (!bdev) | 181 | if (!bdev) |
182 | return 0; | 182 | return 0; |
183 | if (!wait) | 183 | if (!wait) |
184 | return filemap_flush(bdev->bd_inode->i_mapping); | 184 | return filemap_flush(bdev->bd_inode->i_mapping); |
185 | return filemap_write_and_wait(bdev->bd_inode->i_mapping); | 185 | return filemap_write_and_wait(bdev->bd_inode->i_mapping); |
186 | } | 186 | } |
187 | 187 | ||
188 | /* | 188 | /* |
189 | * Write out and wait upon all the dirty data associated with a block | 189 | * Write out and wait upon all the dirty data associated with a block |
190 | * device via its mapping. Does not take the superblock lock. | 190 | * device via its mapping. Does not take the superblock lock. |
191 | */ | 191 | */ |
192 | int sync_blockdev(struct block_device *bdev) | 192 | int sync_blockdev(struct block_device *bdev) |
193 | { | 193 | { |
194 | return __sync_blockdev(bdev, 1); | 194 | return __sync_blockdev(bdev, 1); |
195 | } | 195 | } |
196 | EXPORT_SYMBOL(sync_blockdev); | 196 | EXPORT_SYMBOL(sync_blockdev); |
197 | 197 | ||
198 | /* | 198 | /* |
199 | * Write out and wait upon all dirty data associated with this | 199 | * Write out and wait upon all dirty data associated with this |
200 | * device. Filesystem data as well as the underlying block | 200 | * device. Filesystem data as well as the underlying block |
201 | * device. Takes the superblock lock. | 201 | * device. Takes the superblock lock. |
202 | */ | 202 | */ |
203 | int fsync_bdev(struct block_device *bdev) | 203 | int fsync_bdev(struct block_device *bdev) |
204 | { | 204 | { |
205 | struct super_block *sb = get_super(bdev); | 205 | struct super_block *sb = get_super(bdev); |
206 | if (sb) { | 206 | if (sb) { |
207 | int res = sync_filesystem(sb); | 207 | int res = sync_filesystem(sb); |
208 | drop_super(sb); | 208 | drop_super(sb); |
209 | return res; | 209 | return res; |
210 | } | 210 | } |
211 | return sync_blockdev(bdev); | 211 | return sync_blockdev(bdev); |
212 | } | 212 | } |
213 | EXPORT_SYMBOL(fsync_bdev); | 213 | EXPORT_SYMBOL(fsync_bdev); |
214 | 214 | ||
215 | /** | 215 | /** |
216 | * freeze_bdev -- lock a filesystem and force it into a consistent state | 216 | * freeze_bdev -- lock a filesystem and force it into a consistent state |
217 | * @bdev: blockdevice to lock | 217 | * @bdev: blockdevice to lock |
218 | * | 218 | * |
219 | * This takes the block device bd_mount_sem to make sure no new mounts | 219 | * This takes the block device bd_mount_sem to make sure no new mounts |
220 | * happen on bdev until thaw_bdev() is called. | 220 | * happen on bdev until thaw_bdev() is called. |
221 | * If a superblock is found on this device, we take the s_umount semaphore | 221 | * If a superblock is found on this device, we take the s_umount semaphore |
222 | * on it to make sure nobody unmounts until the snapshot creation is done. | 222 | * on it to make sure nobody unmounts until the snapshot creation is done. |
223 | * The reference counter (bd_fsfreeze_count) guarantees that only the last | 223 | * The reference counter (bd_fsfreeze_count) guarantees that only the last |
224 | * unfreeze process can unfreeze the frozen filesystem actually when multiple | 224 | * unfreeze process can unfreeze the frozen filesystem actually when multiple |
225 | * freeze requests arrive simultaneously. It counts up in freeze_bdev() and | 225 | * freeze requests arrive simultaneously. It counts up in freeze_bdev() and |
226 | * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze | 226 | * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze |
227 | * actually. | 227 | * actually. |
228 | */ | 228 | */ |
229 | struct super_block *freeze_bdev(struct block_device *bdev) | 229 | struct super_block *freeze_bdev(struct block_device *bdev) |
230 | { | 230 | { |
231 | struct super_block *sb; | 231 | struct super_block *sb; |
232 | int error = 0; | 232 | int error = 0; |
233 | 233 | ||
234 | mutex_lock(&bdev->bd_fsfreeze_mutex); | 234 | mutex_lock(&bdev->bd_fsfreeze_mutex); |
235 | if (bdev->bd_fsfreeze_count > 0) { | 235 | if (bdev->bd_fsfreeze_count > 0) { |
236 | bdev->bd_fsfreeze_count++; | 236 | bdev->bd_fsfreeze_count++; |
237 | sb = get_super(bdev); | 237 | sb = get_super(bdev); |
238 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 238 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
239 | return sb; | 239 | return sb; |
240 | } | 240 | } |
241 | bdev->bd_fsfreeze_count++; | 241 | bdev->bd_fsfreeze_count++; |
242 | 242 | ||
243 | down(&bdev->bd_mount_sem); | 243 | down(&bdev->bd_mount_sem); |
244 | sb = get_super(bdev); | 244 | sb = get_super(bdev); |
245 | if (sb && !(sb->s_flags & MS_RDONLY)) { | 245 | if (sb && !(sb->s_flags & MS_RDONLY)) { |
246 | sb->s_frozen = SB_FREEZE_WRITE; | 246 | sb->s_frozen = SB_FREEZE_WRITE; |
247 | smp_wmb(); | 247 | smp_wmb(); |
248 | 248 | ||
249 | sync_filesystem(sb); | 249 | sync_filesystem(sb); |
250 | 250 | ||
251 | sb->s_frozen = SB_FREEZE_TRANS; | 251 | sb->s_frozen = SB_FREEZE_TRANS; |
252 | smp_wmb(); | 252 | smp_wmb(); |
253 | 253 | ||
254 | sync_blockdev(sb->s_bdev); | 254 | sync_blockdev(sb->s_bdev); |
255 | 255 | ||
256 | if (sb->s_op->freeze_fs) { | 256 | if (sb->s_op->freeze_fs) { |
257 | error = sb->s_op->freeze_fs(sb); | 257 | error = sb->s_op->freeze_fs(sb); |
258 | if (error) { | 258 | if (error) { |
259 | printk(KERN_ERR | 259 | printk(KERN_ERR |
260 | "VFS:Filesystem freeze failed\n"); | 260 | "VFS:Filesystem freeze failed\n"); |
261 | sb->s_frozen = SB_UNFROZEN; | 261 | sb->s_frozen = SB_UNFROZEN; |
262 | drop_super(sb); | 262 | drop_super(sb); |
263 | up(&bdev->bd_mount_sem); | 263 | up(&bdev->bd_mount_sem); |
264 | bdev->bd_fsfreeze_count--; | 264 | bdev->bd_fsfreeze_count--; |
265 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 265 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
266 | return ERR_PTR(error); | 266 | return ERR_PTR(error); |
267 | } | 267 | } |
268 | } | 268 | } |
269 | } | 269 | } |
270 | 270 | ||
271 | sync_blockdev(bdev); | 271 | sync_blockdev(bdev); |
272 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 272 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
273 | 273 | ||
274 | return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ | 274 | return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ |
275 | } | 275 | } |
276 | EXPORT_SYMBOL(freeze_bdev); | 276 | EXPORT_SYMBOL(freeze_bdev); |
277 | 277 | ||
278 | /** | 278 | /** |
279 | * thaw_bdev -- unlock filesystem | 279 | * thaw_bdev -- unlock filesystem |
280 | * @bdev: blockdevice to unlock | 280 | * @bdev: blockdevice to unlock |
281 | * @sb: associated superblock | 281 | * @sb: associated superblock |
282 | * | 282 | * |
283 | * Unlocks the filesystem and marks it writeable again after freeze_bdev(). | 283 | * Unlocks the filesystem and marks it writeable again after freeze_bdev(). |
284 | */ | 284 | */ |
285 | int thaw_bdev(struct block_device *bdev, struct super_block *sb) | 285 | int thaw_bdev(struct block_device *bdev, struct super_block *sb) |
286 | { | 286 | { |
287 | int error = 0; | 287 | int error = 0; |
288 | 288 | ||
289 | mutex_lock(&bdev->bd_fsfreeze_mutex); | 289 | mutex_lock(&bdev->bd_fsfreeze_mutex); |
290 | if (!bdev->bd_fsfreeze_count) { | 290 | if (!bdev->bd_fsfreeze_count) { |
291 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 291 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
292 | return -EINVAL; | 292 | return -EINVAL; |
293 | } | 293 | } |
294 | 294 | ||
295 | bdev->bd_fsfreeze_count--; | 295 | bdev->bd_fsfreeze_count--; |
296 | if (bdev->bd_fsfreeze_count > 0) { | 296 | if (bdev->bd_fsfreeze_count > 0) { |
297 | if (sb) | 297 | if (sb) |
298 | drop_super(sb); | 298 | drop_super(sb); |
299 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 299 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
300 | return 0; | 300 | return 0; |
301 | } | 301 | } |
302 | 302 | ||
303 | if (sb) { | 303 | if (sb) { |
304 | BUG_ON(sb->s_bdev != bdev); | 304 | BUG_ON(sb->s_bdev != bdev); |
305 | if (!(sb->s_flags & MS_RDONLY)) { | 305 | if (!(sb->s_flags & MS_RDONLY)) { |
306 | if (sb->s_op->unfreeze_fs) { | 306 | if (sb->s_op->unfreeze_fs) { |
307 | error = sb->s_op->unfreeze_fs(sb); | 307 | error = sb->s_op->unfreeze_fs(sb); |
308 | if (error) { | 308 | if (error) { |
309 | printk(KERN_ERR | 309 | printk(KERN_ERR |
310 | "VFS:Filesystem thaw failed\n"); | 310 | "VFS:Filesystem thaw failed\n"); |
311 | sb->s_frozen = SB_FREEZE_TRANS; | 311 | sb->s_frozen = SB_FREEZE_TRANS; |
312 | bdev->bd_fsfreeze_count++; | 312 | bdev->bd_fsfreeze_count++; |
313 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 313 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
314 | return error; | 314 | return error; |
315 | } | 315 | } |
316 | } | 316 | } |
317 | sb->s_frozen = SB_UNFROZEN; | 317 | sb->s_frozen = SB_UNFROZEN; |
318 | smp_wmb(); | 318 | smp_wmb(); |
319 | wake_up(&sb->s_wait_unfrozen); | 319 | wake_up(&sb->s_wait_unfrozen); |
320 | } | 320 | } |
321 | drop_super(sb); | 321 | drop_super(sb); |
322 | } | 322 | } |
323 | 323 | ||
324 | up(&bdev->bd_mount_sem); | 324 | up(&bdev->bd_mount_sem); |
325 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 325 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
326 | return 0; | 326 | return 0; |
327 | } | 327 | } |
328 | EXPORT_SYMBOL(thaw_bdev); | 328 | EXPORT_SYMBOL(thaw_bdev); |
329 | 329 | ||
330 | static int blkdev_writepage(struct page *page, struct writeback_control *wbc) | 330 | static int blkdev_writepage(struct page *page, struct writeback_control *wbc) |
331 | { | 331 | { |
332 | return block_write_full_page(page, blkdev_get_block, wbc); | 332 | return block_write_full_page(page, blkdev_get_block, wbc); |
333 | } | 333 | } |
334 | 334 | ||
335 | static int blkdev_readpage(struct file * file, struct page * page) | 335 | static int blkdev_readpage(struct file * file, struct page * page) |
336 | { | 336 | { |
337 | return block_read_full_page(page, blkdev_get_block); | 337 | return block_read_full_page(page, blkdev_get_block); |
338 | } | 338 | } |
339 | 339 | ||
340 | static int blkdev_write_begin(struct file *file, struct address_space *mapping, | 340 | static int blkdev_write_begin(struct file *file, struct address_space *mapping, |
341 | loff_t pos, unsigned len, unsigned flags, | 341 | loff_t pos, unsigned len, unsigned flags, |
342 | struct page **pagep, void **fsdata) | 342 | struct page **pagep, void **fsdata) |
343 | { | 343 | { |
344 | *pagep = NULL; | 344 | *pagep = NULL; |
345 | return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 345 | return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, |
346 | blkdev_get_block); | 346 | blkdev_get_block); |
347 | } | 347 | } |
348 | 348 | ||
349 | static int blkdev_write_end(struct file *file, struct address_space *mapping, | 349 | static int blkdev_write_end(struct file *file, struct address_space *mapping, |
350 | loff_t pos, unsigned len, unsigned copied, | 350 | loff_t pos, unsigned len, unsigned copied, |
351 | struct page *page, void *fsdata) | 351 | struct page *page, void *fsdata) |
352 | { | 352 | { |
353 | int ret; | 353 | int ret; |
354 | ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); | 354 | ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); |
355 | 355 | ||
356 | unlock_page(page); | 356 | unlock_page(page); |
357 | page_cache_release(page); | 357 | page_cache_release(page); |
358 | 358 | ||
359 | return ret; | 359 | return ret; |
360 | } | 360 | } |
361 | 361 | ||
362 | /* | 362 | /* |
363 | * private llseek: | 363 | * private llseek: |
364 | * for a block special file file->f_path.dentry->d_inode->i_size is zero | 364 | * for a block special file file->f_path.dentry->d_inode->i_size is zero |
365 | * so we compute the size by hand (just as in block_read/write above) | 365 | * so we compute the size by hand (just as in block_read/write above) |
366 | */ | 366 | */ |
367 | static loff_t block_llseek(struct file *file, loff_t offset, int origin) | 367 | static loff_t block_llseek(struct file *file, loff_t offset, int origin) |
368 | { | 368 | { |
369 | struct inode *bd_inode = file->f_mapping->host; | 369 | struct inode *bd_inode = file->f_mapping->host; |
370 | loff_t size; | 370 | loff_t size; |
371 | loff_t retval; | 371 | loff_t retval; |
372 | 372 | ||
373 | mutex_lock(&bd_inode->i_mutex); | 373 | mutex_lock(&bd_inode->i_mutex); |
374 | size = i_size_read(bd_inode); | 374 | size = i_size_read(bd_inode); |
375 | 375 | ||
376 | switch (origin) { | 376 | switch (origin) { |
377 | case 2: | 377 | case 2: |
378 | offset += size; | 378 | offset += size; |
379 | break; | 379 | break; |
380 | case 1: | 380 | case 1: |
381 | offset += file->f_pos; | 381 | offset += file->f_pos; |
382 | } | 382 | } |
383 | retval = -EINVAL; | 383 | retval = -EINVAL; |
384 | if (offset >= 0 && offset <= size) { | 384 | if (offset >= 0 && offset <= size) { |
385 | if (offset != file->f_pos) { | 385 | if (offset != file->f_pos) { |
386 | file->f_pos = offset; | 386 | file->f_pos = offset; |
387 | } | 387 | } |
388 | retval = offset; | 388 | retval = offset; |
389 | } | 389 | } |
390 | mutex_unlock(&bd_inode->i_mutex); | 390 | mutex_unlock(&bd_inode->i_mutex); |
391 | return retval; | 391 | return retval; |
392 | } | 392 | } |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * Filp is never NULL; the only case when ->fsync() is called with | 395 | * Filp is never NULL; the only case when ->fsync() is called with |
396 | * NULL first argument is nfsd_sync_dir() and that's not a directory. | 396 | * NULL first argument is nfsd_sync_dir() and that's not a directory. |
397 | */ | 397 | */ |
398 | 398 | ||
399 | static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) | 399 | static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) |
400 | { | 400 | { |
401 | return sync_blockdev(I_BDEV(filp->f_mapping->host)); | 401 | return sync_blockdev(I_BDEV(filp->f_mapping->host)); |
402 | } | 402 | } |
403 | 403 | ||
404 | /* | 404 | /* |
405 | * pseudo-fs | 405 | * pseudo-fs |
406 | */ | 406 | */ |
407 | 407 | ||
408 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); | 408 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); |
409 | static struct kmem_cache * bdev_cachep __read_mostly; | 409 | static struct kmem_cache * bdev_cachep __read_mostly; |
410 | 410 | ||
411 | static struct inode *bdev_alloc_inode(struct super_block *sb) | 411 | static struct inode *bdev_alloc_inode(struct super_block *sb) |
412 | { | 412 | { |
413 | struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); | 413 | struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); |
414 | if (!ei) | 414 | if (!ei) |
415 | return NULL; | 415 | return NULL; |
416 | return &ei->vfs_inode; | 416 | return &ei->vfs_inode; |
417 | } | 417 | } |
418 | 418 | ||
419 | static void bdev_destroy_inode(struct inode *inode) | 419 | static void bdev_destroy_inode(struct inode *inode) |
420 | { | 420 | { |
421 | struct bdev_inode *bdi = BDEV_I(inode); | 421 | struct bdev_inode *bdi = BDEV_I(inode); |
422 | 422 | ||
423 | bdi->bdev.bd_inode_backing_dev_info = NULL; | 423 | bdi->bdev.bd_inode_backing_dev_info = NULL; |
424 | kmem_cache_free(bdev_cachep, bdi); | 424 | kmem_cache_free(bdev_cachep, bdi); |
425 | } | 425 | } |
426 | 426 | ||
427 | static void init_once(void *foo) | 427 | static void init_once(void *foo) |
428 | { | 428 | { |
429 | struct bdev_inode *ei = (struct bdev_inode *) foo; | 429 | struct bdev_inode *ei = (struct bdev_inode *) foo; |
430 | struct block_device *bdev = &ei->bdev; | 430 | struct block_device *bdev = &ei->bdev; |
431 | 431 | ||
432 | memset(bdev, 0, sizeof(*bdev)); | 432 | memset(bdev, 0, sizeof(*bdev)); |
433 | mutex_init(&bdev->bd_mutex); | 433 | mutex_init(&bdev->bd_mutex); |
434 | sema_init(&bdev->bd_mount_sem, 1); | 434 | sema_init(&bdev->bd_mount_sem, 1); |
435 | INIT_LIST_HEAD(&bdev->bd_inodes); | 435 | INIT_LIST_HEAD(&bdev->bd_inodes); |
436 | INIT_LIST_HEAD(&bdev->bd_list); | 436 | INIT_LIST_HEAD(&bdev->bd_list); |
437 | #ifdef CONFIG_SYSFS | 437 | #ifdef CONFIG_SYSFS |
438 | INIT_LIST_HEAD(&bdev->bd_holder_list); | 438 | INIT_LIST_HEAD(&bdev->bd_holder_list); |
439 | #endif | 439 | #endif |
440 | inode_init_once(&ei->vfs_inode); | 440 | inode_init_once(&ei->vfs_inode); |
441 | /* Initialize mutex for freeze. */ | 441 | /* Initialize mutex for freeze. */ |
442 | mutex_init(&bdev->bd_fsfreeze_mutex); | 442 | mutex_init(&bdev->bd_fsfreeze_mutex); |
443 | } | 443 | } |
444 | 444 | ||
445 | static inline void __bd_forget(struct inode *inode) | 445 | static inline void __bd_forget(struct inode *inode) |
446 | { | 446 | { |
447 | list_del_init(&inode->i_devices); | 447 | list_del_init(&inode->i_devices); |
448 | inode->i_bdev = NULL; | 448 | inode->i_bdev = NULL; |
449 | inode->i_mapping = &inode->i_data; | 449 | inode->i_mapping = &inode->i_data; |
450 | } | 450 | } |
451 | 451 | ||
452 | static void bdev_clear_inode(struct inode *inode) | 452 | static void bdev_clear_inode(struct inode *inode) |
453 | { | 453 | { |
454 | struct block_device *bdev = &BDEV_I(inode)->bdev; | 454 | struct block_device *bdev = &BDEV_I(inode)->bdev; |
455 | struct list_head *p; | 455 | struct list_head *p; |
456 | spin_lock(&bdev_lock); | 456 | spin_lock(&bdev_lock); |
457 | while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { | 457 | while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { |
458 | __bd_forget(list_entry(p, struct inode, i_devices)); | 458 | __bd_forget(list_entry(p, struct inode, i_devices)); |
459 | } | 459 | } |
460 | list_del_init(&bdev->bd_list); | 460 | list_del_init(&bdev->bd_list); |
461 | spin_unlock(&bdev_lock); | 461 | spin_unlock(&bdev_lock); |
462 | } | 462 | } |
463 | 463 | ||
464 | static const struct super_operations bdev_sops = { | 464 | static const struct super_operations bdev_sops = { |
465 | .statfs = simple_statfs, | 465 | .statfs = simple_statfs, |
466 | .alloc_inode = bdev_alloc_inode, | 466 | .alloc_inode = bdev_alloc_inode, |
467 | .destroy_inode = bdev_destroy_inode, | 467 | .destroy_inode = bdev_destroy_inode, |
468 | .drop_inode = generic_delete_inode, | 468 | .drop_inode = generic_delete_inode, |
469 | .clear_inode = bdev_clear_inode, | 469 | .clear_inode = bdev_clear_inode, |
470 | }; | 470 | }; |
471 | 471 | ||
472 | static int bd_get_sb(struct file_system_type *fs_type, | 472 | static int bd_get_sb(struct file_system_type *fs_type, |
473 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) | 473 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) |
474 | { | 474 | { |
475 | return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); | 475 | return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); |
476 | } | 476 | } |
477 | 477 | ||
478 | static struct file_system_type bd_type = { | 478 | static struct file_system_type bd_type = { |
479 | .name = "bdev", | 479 | .name = "bdev", |
480 | .get_sb = bd_get_sb, | 480 | .get_sb = bd_get_sb, |
481 | .kill_sb = kill_anon_super, | 481 | .kill_sb = kill_anon_super, |
482 | }; | 482 | }; |
483 | 483 | ||
484 | struct super_block *blockdev_superblock __read_mostly; | 484 | struct super_block *blockdev_superblock __read_mostly; |
485 | 485 | ||
486 | void __init bdev_cache_init(void) | 486 | void __init bdev_cache_init(void) |
487 | { | 487 | { |
488 | int err; | 488 | int err; |
489 | struct vfsmount *bd_mnt; | 489 | struct vfsmount *bd_mnt; |
490 | 490 | ||
491 | bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), | 491 | bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), |
492 | 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | 492 | 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| |
493 | SLAB_MEM_SPREAD|SLAB_PANIC), | 493 | SLAB_MEM_SPREAD|SLAB_PANIC), |
494 | init_once); | 494 | init_once); |
495 | err = register_filesystem(&bd_type); | 495 | err = register_filesystem(&bd_type); |
496 | if (err) | 496 | if (err) |
497 | panic("Cannot register bdev pseudo-fs"); | 497 | panic("Cannot register bdev pseudo-fs"); |
498 | bd_mnt = kern_mount(&bd_type); | 498 | bd_mnt = kern_mount(&bd_type); |
499 | if (IS_ERR(bd_mnt)) | 499 | if (IS_ERR(bd_mnt)) |
500 | panic("Cannot create bdev pseudo-fs"); | 500 | panic("Cannot create bdev pseudo-fs"); |
501 | /* | 501 | /* |
502 | * This vfsmount structure is only used to obtain the | 502 | * This vfsmount structure is only used to obtain the |
503 | * blockdev_superblock, so tell kmemleak not to report it. | 503 | * blockdev_superblock, so tell kmemleak not to report it. |
504 | */ | 504 | */ |
505 | kmemleak_not_leak(bd_mnt); | 505 | kmemleak_not_leak(bd_mnt); |
506 | blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ | 506 | blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ |
507 | } | 507 | } |
508 | 508 | ||
509 | /* | 509 | /* |
510 | * Most likely _very_ bad one - but then it's hardly critical for small | 510 | * Most likely _very_ bad one - but then it's hardly critical for small |
511 | * /dev and can be fixed when somebody will need really large one. | 511 | * /dev and can be fixed when somebody will need really large one. |
512 | * Keep in mind that it will be fed through icache hash function too. | 512 | * Keep in mind that it will be fed through icache hash function too. |
513 | */ | 513 | */ |
514 | static inline unsigned long hash(dev_t dev) | 514 | static inline unsigned long hash(dev_t dev) |
515 | { | 515 | { |
516 | return MAJOR(dev)+MINOR(dev); | 516 | return MAJOR(dev)+MINOR(dev); |
517 | } | 517 | } |
518 | 518 | ||
519 | static int bdev_test(struct inode *inode, void *data) | 519 | static int bdev_test(struct inode *inode, void *data) |
520 | { | 520 | { |
521 | return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; | 521 | return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; |
522 | } | 522 | } |
523 | 523 | ||
524 | static int bdev_set(struct inode *inode, void *data) | 524 | static int bdev_set(struct inode *inode, void *data) |
525 | { | 525 | { |
526 | BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; | 526 | BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; |
527 | return 0; | 527 | return 0; |
528 | } | 528 | } |
529 | 529 | ||
530 | static LIST_HEAD(all_bdevs); | 530 | static LIST_HEAD(all_bdevs); |
531 | 531 | ||
532 | struct block_device *bdget(dev_t dev) | 532 | struct block_device *bdget(dev_t dev) |
533 | { | 533 | { |
534 | struct block_device *bdev; | 534 | struct block_device *bdev; |
535 | struct inode *inode; | 535 | struct inode *inode; |
536 | 536 | ||
537 | inode = iget5_locked(blockdev_superblock, hash(dev), | 537 | inode = iget5_locked(blockdev_superblock, hash(dev), |
538 | bdev_test, bdev_set, &dev); | 538 | bdev_test, bdev_set, &dev); |
539 | 539 | ||
540 | if (!inode) | 540 | if (!inode) |
541 | return NULL; | 541 | return NULL; |
542 | 542 | ||
543 | bdev = &BDEV_I(inode)->bdev; | 543 | bdev = &BDEV_I(inode)->bdev; |
544 | 544 | ||
545 | if (inode->i_state & I_NEW) { | 545 | if (inode->i_state & I_NEW) { |
546 | bdev->bd_contains = NULL; | 546 | bdev->bd_contains = NULL; |
547 | bdev->bd_inode = inode; | 547 | bdev->bd_inode = inode; |
548 | bdev->bd_block_size = (1 << inode->i_blkbits); | 548 | bdev->bd_block_size = (1 << inode->i_blkbits); |
549 | bdev->bd_part_count = 0; | 549 | bdev->bd_part_count = 0; |
550 | bdev->bd_invalidated = 0; | 550 | bdev->bd_invalidated = 0; |
551 | inode->i_mode = S_IFBLK; | 551 | inode->i_mode = S_IFBLK; |
552 | inode->i_rdev = dev; | 552 | inode->i_rdev = dev; |
553 | inode->i_bdev = bdev; | 553 | inode->i_bdev = bdev; |
554 | inode->i_data.a_ops = &def_blk_aops; | 554 | inode->i_data.a_ops = &def_blk_aops; |
555 | mapping_set_gfp_mask(&inode->i_data, GFP_USER); | 555 | mapping_set_gfp_mask(&inode->i_data, GFP_USER); |
556 | inode->i_data.backing_dev_info = &default_backing_dev_info; | 556 | inode->i_data.backing_dev_info = &default_backing_dev_info; |
557 | spin_lock(&bdev_lock); | 557 | spin_lock(&bdev_lock); |
558 | list_add(&bdev->bd_list, &all_bdevs); | 558 | list_add(&bdev->bd_list, &all_bdevs); |
559 | spin_unlock(&bdev_lock); | 559 | spin_unlock(&bdev_lock); |
560 | unlock_new_inode(inode); | 560 | unlock_new_inode(inode); |
561 | } | 561 | } |
562 | return bdev; | 562 | return bdev; |
563 | } | 563 | } |
564 | 564 | ||
565 | EXPORT_SYMBOL(bdget); | 565 | EXPORT_SYMBOL(bdget); |
566 | 566 | ||
567 | /** | ||
568 | * bdgrab -- Grab a reference to an already referenced block device | ||
569 | * @bdev: Block device to grab a reference to. | ||
570 | */ | ||
571 | struct block_device *bdgrab(struct block_device *bdev) | ||
572 | { | ||
573 | atomic_inc(&bdev->bd_inode->i_count); | ||
574 | return bdev; | ||
575 | } | ||
576 | |||
567 | long nr_blockdev_pages(void) | 577 | long nr_blockdev_pages(void) |
568 | { | 578 | { |
569 | struct block_device *bdev; | 579 | struct block_device *bdev; |
570 | long ret = 0; | 580 | long ret = 0; |
571 | spin_lock(&bdev_lock); | 581 | spin_lock(&bdev_lock); |
572 | list_for_each_entry(bdev, &all_bdevs, bd_list) { | 582 | list_for_each_entry(bdev, &all_bdevs, bd_list) { |
573 | ret += bdev->bd_inode->i_mapping->nrpages; | 583 | ret += bdev->bd_inode->i_mapping->nrpages; |
574 | } | 584 | } |
575 | spin_unlock(&bdev_lock); | 585 | spin_unlock(&bdev_lock); |
576 | return ret; | 586 | return ret; |
577 | } | 587 | } |
578 | 588 | ||
579 | void bdput(struct block_device *bdev) | 589 | void bdput(struct block_device *bdev) |
580 | { | 590 | { |
581 | iput(bdev->bd_inode); | 591 | iput(bdev->bd_inode); |
582 | } | 592 | } |
583 | 593 | ||
584 | EXPORT_SYMBOL(bdput); | 594 | EXPORT_SYMBOL(bdput); |
585 | 595 | ||
586 | static struct block_device *bd_acquire(struct inode *inode) | 596 | static struct block_device *bd_acquire(struct inode *inode) |
587 | { | 597 | { |
588 | struct block_device *bdev; | 598 | struct block_device *bdev; |
589 | 599 | ||
590 | spin_lock(&bdev_lock); | 600 | spin_lock(&bdev_lock); |
591 | bdev = inode->i_bdev; | 601 | bdev = inode->i_bdev; |
592 | if (bdev) { | 602 | if (bdev) { |
593 | atomic_inc(&bdev->bd_inode->i_count); | 603 | atomic_inc(&bdev->bd_inode->i_count); |
594 | spin_unlock(&bdev_lock); | 604 | spin_unlock(&bdev_lock); |
595 | return bdev; | 605 | return bdev; |
596 | } | 606 | } |
597 | spin_unlock(&bdev_lock); | 607 | spin_unlock(&bdev_lock); |
598 | 608 | ||
599 | bdev = bdget(inode->i_rdev); | 609 | bdev = bdget(inode->i_rdev); |
600 | if (bdev) { | 610 | if (bdev) { |
601 | spin_lock(&bdev_lock); | 611 | spin_lock(&bdev_lock); |
602 | if (!inode->i_bdev) { | 612 | if (!inode->i_bdev) { |
603 | /* | 613 | /* |
604 | * We take an additional bd_inode->i_count for inode, | 614 | * We take an additional bd_inode->i_count for inode, |
605 | * and it's released in clear_inode() of inode. | 615 | * and it's released in clear_inode() of inode. |
606 | * So, we can access it via ->i_mapping always | 616 | * So, we can access it via ->i_mapping always |
607 | * without igrab(). | 617 | * without igrab(). |
608 | */ | 618 | */ |
609 | atomic_inc(&bdev->bd_inode->i_count); | 619 | atomic_inc(&bdev->bd_inode->i_count); |
610 | inode->i_bdev = bdev; | 620 | inode->i_bdev = bdev; |
611 | inode->i_mapping = bdev->bd_inode->i_mapping; | 621 | inode->i_mapping = bdev->bd_inode->i_mapping; |
612 | list_add(&inode->i_devices, &bdev->bd_inodes); | 622 | list_add(&inode->i_devices, &bdev->bd_inodes); |
613 | } | 623 | } |
614 | spin_unlock(&bdev_lock); | 624 | spin_unlock(&bdev_lock); |
615 | } | 625 | } |
616 | return bdev; | 626 | return bdev; |
617 | } | 627 | } |
618 | 628 | ||
619 | /* Call when you free inode */ | 629 | /* Call when you free inode */ |
620 | 630 | ||
621 | void bd_forget(struct inode *inode) | 631 | void bd_forget(struct inode *inode) |
622 | { | 632 | { |
623 | struct block_device *bdev = NULL; | 633 | struct block_device *bdev = NULL; |
624 | 634 | ||
625 | spin_lock(&bdev_lock); | 635 | spin_lock(&bdev_lock); |
626 | if (inode->i_bdev) { | 636 | if (inode->i_bdev) { |
627 | if (!sb_is_blkdev_sb(inode->i_sb)) | 637 | if (!sb_is_blkdev_sb(inode->i_sb)) |
628 | bdev = inode->i_bdev; | 638 | bdev = inode->i_bdev; |
629 | __bd_forget(inode); | 639 | __bd_forget(inode); |
630 | } | 640 | } |
631 | spin_unlock(&bdev_lock); | 641 | spin_unlock(&bdev_lock); |
632 | 642 | ||
633 | if (bdev) | 643 | if (bdev) |
634 | iput(bdev->bd_inode); | 644 | iput(bdev->bd_inode); |
635 | } | 645 | } |
636 | 646 | ||
637 | int bd_claim(struct block_device *bdev, void *holder) | 647 | int bd_claim(struct block_device *bdev, void *holder) |
638 | { | 648 | { |
639 | int res; | 649 | int res; |
640 | spin_lock(&bdev_lock); | 650 | spin_lock(&bdev_lock); |
641 | 651 | ||
642 | /* first decide result */ | 652 | /* first decide result */ |
643 | if (bdev->bd_holder == holder) | 653 | if (bdev->bd_holder == holder) |
644 | res = 0; /* already a holder */ | 654 | res = 0; /* already a holder */ |
645 | else if (bdev->bd_holder != NULL) | 655 | else if (bdev->bd_holder != NULL) |
646 | res = -EBUSY; /* held by someone else */ | 656 | res = -EBUSY; /* held by someone else */ |
647 | else if (bdev->bd_contains == bdev) | 657 | else if (bdev->bd_contains == bdev) |
648 | res = 0; /* is a whole device which isn't held */ | 658 | res = 0; /* is a whole device which isn't held */ |
649 | 659 | ||
650 | else if (bdev->bd_contains->bd_holder == bd_claim) | 660 | else if (bdev->bd_contains->bd_holder == bd_claim) |
651 | res = 0; /* is a partition of a device that is being partitioned */ | 661 | res = 0; /* is a partition of a device that is being partitioned */ |
652 | else if (bdev->bd_contains->bd_holder != NULL) | 662 | else if (bdev->bd_contains->bd_holder != NULL) |
653 | res = -EBUSY; /* is a partition of a held device */ | 663 | res = -EBUSY; /* is a partition of a held device */ |
654 | else | 664 | else |
655 | res = 0; /* is a partition of an un-held device */ | 665 | res = 0; /* is a partition of an un-held device */ |
656 | 666 | ||
657 | /* now impose change */ | 667 | /* now impose change */ |
658 | if (res==0) { | 668 | if (res==0) { |
659 | /* note that for a whole device bd_holders | 669 | /* note that for a whole device bd_holders |
660 | * will be incremented twice, and bd_holder will | 670 | * will be incremented twice, and bd_holder will |
661 | * be set to bd_claim before being set to holder | 671 | * be set to bd_claim before being set to holder |
662 | */ | 672 | */ |
663 | bdev->bd_contains->bd_holders ++; | 673 | bdev->bd_contains->bd_holders ++; |
664 | bdev->bd_contains->bd_holder = bd_claim; | 674 | bdev->bd_contains->bd_holder = bd_claim; |
665 | bdev->bd_holders++; | 675 | bdev->bd_holders++; |
666 | bdev->bd_holder = holder; | 676 | bdev->bd_holder = holder; |
667 | } | 677 | } |
668 | spin_unlock(&bdev_lock); | 678 | spin_unlock(&bdev_lock); |
669 | return res; | 679 | return res; |
670 | } | 680 | } |
671 | 681 | ||
672 | EXPORT_SYMBOL(bd_claim); | 682 | EXPORT_SYMBOL(bd_claim); |
673 | 683 | ||
674 | void bd_release(struct block_device *bdev) | 684 | void bd_release(struct block_device *bdev) |
675 | { | 685 | { |
676 | spin_lock(&bdev_lock); | 686 | spin_lock(&bdev_lock); |
677 | if (!--bdev->bd_contains->bd_holders) | 687 | if (!--bdev->bd_contains->bd_holders) |
678 | bdev->bd_contains->bd_holder = NULL; | 688 | bdev->bd_contains->bd_holder = NULL; |
679 | if (!--bdev->bd_holders) | 689 | if (!--bdev->bd_holders) |
680 | bdev->bd_holder = NULL; | 690 | bdev->bd_holder = NULL; |
681 | spin_unlock(&bdev_lock); | 691 | spin_unlock(&bdev_lock); |
682 | } | 692 | } |
683 | 693 | ||
684 | EXPORT_SYMBOL(bd_release); | 694 | EXPORT_SYMBOL(bd_release); |
685 | 695 | ||
686 | #ifdef CONFIG_SYSFS | 696 | #ifdef CONFIG_SYSFS |
687 | /* | 697 | /* |
688 | * Functions for bd_claim_by_kobject / bd_release_from_kobject | 698 | * Functions for bd_claim_by_kobject / bd_release_from_kobject |
689 | * | 699 | * |
690 | * If a kobject is passed to bd_claim_by_kobject() | 700 | * If a kobject is passed to bd_claim_by_kobject() |
691 | * and the kobject has a parent directory, | 701 | * and the kobject has a parent directory, |
692 | * following symlinks are created: | 702 | * following symlinks are created: |
693 | * o from the kobject to the claimed bdev | 703 | * o from the kobject to the claimed bdev |
694 | * o from "holders" directory of the bdev to the parent of the kobject | 704 | * o from "holders" directory of the bdev to the parent of the kobject |
695 | * bd_release_from_kobject() removes these symlinks. | 705 | * bd_release_from_kobject() removes these symlinks. |
696 | * | 706 | * |
697 | * Example: | 707 | * Example: |
698 | * If /dev/dm-0 maps to /dev/sda, kobject corresponding to | 708 | * If /dev/dm-0 maps to /dev/sda, kobject corresponding to |
699 | * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: | 709 | * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: |
700 | * /sys/block/dm-0/slaves/sda --> /sys/block/sda | 710 | * /sys/block/dm-0/slaves/sda --> /sys/block/sda |
701 | * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 | 711 | * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 |
702 | */ | 712 | */ |
703 | 713 | ||
704 | static int add_symlink(struct kobject *from, struct kobject *to) | 714 | static int add_symlink(struct kobject *from, struct kobject *to) |
705 | { | 715 | { |
706 | if (!from || !to) | 716 | if (!from || !to) |
707 | return 0; | 717 | return 0; |
708 | return sysfs_create_link(from, to, kobject_name(to)); | 718 | return sysfs_create_link(from, to, kobject_name(to)); |
709 | } | 719 | } |
710 | 720 | ||
711 | static void del_symlink(struct kobject *from, struct kobject *to) | 721 | static void del_symlink(struct kobject *from, struct kobject *to) |
712 | { | 722 | { |
713 | if (!from || !to) | 723 | if (!from || !to) |
714 | return; | 724 | return; |
715 | sysfs_remove_link(from, kobject_name(to)); | 725 | sysfs_remove_link(from, kobject_name(to)); |
716 | } | 726 | } |
717 | 727 | ||
718 | /* | 728 | /* |
719 | * 'struct bd_holder' contains pointers to kobjects symlinked by | 729 | * 'struct bd_holder' contains pointers to kobjects symlinked by |
720 | * bd_claim_by_kobject. | 730 | * bd_claim_by_kobject. |
721 | * It's connected to bd_holder_list which is protected by bdev->bd_sem. | 731 | * It's connected to bd_holder_list which is protected by bdev->bd_sem. |
722 | */ | 732 | */ |
723 | struct bd_holder { | 733 | struct bd_holder { |
724 | struct list_head list; /* chain of holders of the bdev */ | 734 | struct list_head list; /* chain of holders of the bdev */ |
725 | int count; /* references from the holder */ | 735 | int count; /* references from the holder */ |
726 | struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ | 736 | struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ |
727 | struct kobject *hdev; /* e.g. "/block/dm-0" */ | 737 | struct kobject *hdev; /* e.g. "/block/dm-0" */ |
728 | struct kobject *hdir; /* e.g. "/block/sda/holders" */ | 738 | struct kobject *hdir; /* e.g. "/block/sda/holders" */ |
729 | struct kobject *sdev; /* e.g. "/block/sda" */ | 739 | struct kobject *sdev; /* e.g. "/block/sda" */ |
730 | }; | 740 | }; |
731 | 741 | ||
732 | /* | 742 | /* |
733 | * Get references of related kobjects at once. | 743 | * Get references of related kobjects at once. |
734 | * Returns 1 on success. 0 on failure. | 744 | * Returns 1 on success. 0 on failure. |
735 | * | 745 | * |
736 | * Should call bd_holder_release_dirs() after successful use. | 746 | * Should call bd_holder_release_dirs() after successful use. |
737 | */ | 747 | */ |
738 | static int bd_holder_grab_dirs(struct block_device *bdev, | 748 | static int bd_holder_grab_dirs(struct block_device *bdev, |
739 | struct bd_holder *bo) | 749 | struct bd_holder *bo) |
740 | { | 750 | { |
741 | if (!bdev || !bo) | 751 | if (!bdev || !bo) |
742 | return 0; | 752 | return 0; |
743 | 753 | ||
744 | bo->sdir = kobject_get(bo->sdir); | 754 | bo->sdir = kobject_get(bo->sdir); |
745 | if (!bo->sdir) | 755 | if (!bo->sdir) |
746 | return 0; | 756 | return 0; |
747 | 757 | ||
748 | bo->hdev = kobject_get(bo->sdir->parent); | 758 | bo->hdev = kobject_get(bo->sdir->parent); |
749 | if (!bo->hdev) | 759 | if (!bo->hdev) |
750 | goto fail_put_sdir; | 760 | goto fail_put_sdir; |
751 | 761 | ||
752 | bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); | 762 | bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); |
753 | if (!bo->sdev) | 763 | if (!bo->sdev) |
754 | goto fail_put_hdev; | 764 | goto fail_put_hdev; |
755 | 765 | ||
756 | bo->hdir = kobject_get(bdev->bd_part->holder_dir); | 766 | bo->hdir = kobject_get(bdev->bd_part->holder_dir); |
757 | if (!bo->hdir) | 767 | if (!bo->hdir) |
758 | goto fail_put_sdev; | 768 | goto fail_put_sdev; |
759 | 769 | ||
760 | return 1; | 770 | return 1; |
761 | 771 | ||
762 | fail_put_sdev: | 772 | fail_put_sdev: |
763 | kobject_put(bo->sdev); | 773 | kobject_put(bo->sdev); |
764 | fail_put_hdev: | 774 | fail_put_hdev: |
765 | kobject_put(bo->hdev); | 775 | kobject_put(bo->hdev); |
766 | fail_put_sdir: | 776 | fail_put_sdir: |
767 | kobject_put(bo->sdir); | 777 | kobject_put(bo->sdir); |
768 | 778 | ||
769 | return 0; | 779 | return 0; |
770 | } | 780 | } |
771 | 781 | ||
772 | /* Put references of related kobjects at once. */ | 782 | /* Put references of related kobjects at once. */ |
773 | static void bd_holder_release_dirs(struct bd_holder *bo) | 783 | static void bd_holder_release_dirs(struct bd_holder *bo) |
774 | { | 784 | { |
775 | kobject_put(bo->hdir); | 785 | kobject_put(bo->hdir); |
776 | kobject_put(bo->sdev); | 786 | kobject_put(bo->sdev); |
777 | kobject_put(bo->hdev); | 787 | kobject_put(bo->hdev); |
778 | kobject_put(bo->sdir); | 788 | kobject_put(bo->sdir); |
779 | } | 789 | } |
780 | 790 | ||
781 | static struct bd_holder *alloc_bd_holder(struct kobject *kobj) | 791 | static struct bd_holder *alloc_bd_holder(struct kobject *kobj) |
782 | { | 792 | { |
783 | struct bd_holder *bo; | 793 | struct bd_holder *bo; |
784 | 794 | ||
785 | bo = kzalloc(sizeof(*bo), GFP_KERNEL); | 795 | bo = kzalloc(sizeof(*bo), GFP_KERNEL); |
786 | if (!bo) | 796 | if (!bo) |
787 | return NULL; | 797 | return NULL; |
788 | 798 | ||
789 | bo->count = 1; | 799 | bo->count = 1; |
790 | bo->sdir = kobj; | 800 | bo->sdir = kobj; |
791 | 801 | ||
792 | return bo; | 802 | return bo; |
793 | } | 803 | } |
794 | 804 | ||
795 | static void free_bd_holder(struct bd_holder *bo) | 805 | static void free_bd_holder(struct bd_holder *bo) |
796 | { | 806 | { |
797 | kfree(bo); | 807 | kfree(bo); |
798 | } | 808 | } |
799 | 809 | ||
800 | /** | 810 | /** |
801 | * find_bd_holder - find matching struct bd_holder from the block device | 811 | * find_bd_holder - find matching struct bd_holder from the block device |
802 | * | 812 | * |
803 | * @bdev: struct block device to be searched | 813 | * @bdev: struct block device to be searched |
804 | * @bo: target struct bd_holder | 814 | * @bo: target struct bd_holder |
805 | * | 815 | * |
806 | * Returns matching entry with @bo in @bdev->bd_holder_list. | 816 | * Returns matching entry with @bo in @bdev->bd_holder_list. |
807 | * If found, increment the reference count and return the pointer. | 817 | * If found, increment the reference count and return the pointer. |
808 | * If not found, returns NULL. | 818 | * If not found, returns NULL. |
809 | */ | 819 | */ |
810 | static struct bd_holder *find_bd_holder(struct block_device *bdev, | 820 | static struct bd_holder *find_bd_holder(struct block_device *bdev, |
811 | struct bd_holder *bo) | 821 | struct bd_holder *bo) |
812 | { | 822 | { |
813 | struct bd_holder *tmp; | 823 | struct bd_holder *tmp; |
814 | 824 | ||
815 | list_for_each_entry(tmp, &bdev->bd_holder_list, list) | 825 | list_for_each_entry(tmp, &bdev->bd_holder_list, list) |
816 | if (tmp->sdir == bo->sdir) { | 826 | if (tmp->sdir == bo->sdir) { |
817 | tmp->count++; | 827 | tmp->count++; |
818 | return tmp; | 828 | return tmp; |
819 | } | 829 | } |
820 | 830 | ||
821 | return NULL; | 831 | return NULL; |
822 | } | 832 | } |
823 | 833 | ||
824 | /** | 834 | /** |
825 | * add_bd_holder - create sysfs symlinks for bd_claim() relationship | 835 | * add_bd_holder - create sysfs symlinks for bd_claim() relationship |
826 | * | 836 | * |
827 | * @bdev: block device to be bd_claimed | 837 | * @bdev: block device to be bd_claimed |
828 | * @bo: preallocated and initialized by alloc_bd_holder() | 838 | * @bo: preallocated and initialized by alloc_bd_holder() |
829 | * | 839 | * |
830 | * Add @bo to @bdev->bd_holder_list, create symlinks. | 840 | * Add @bo to @bdev->bd_holder_list, create symlinks. |
831 | * | 841 | * |
832 | * Returns 0 if symlinks are created. | 842 | * Returns 0 if symlinks are created. |
833 | * Returns -ve if something fails. | 843 | * Returns -ve if something fails. |
834 | */ | 844 | */ |
835 | static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) | 845 | static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) |
836 | { | 846 | { |
837 | int err; | 847 | int err; |
838 | 848 | ||
839 | if (!bo) | 849 | if (!bo) |
840 | return -EINVAL; | 850 | return -EINVAL; |
841 | 851 | ||
842 | if (!bd_holder_grab_dirs(bdev, bo)) | 852 | if (!bd_holder_grab_dirs(bdev, bo)) |
843 | return -EBUSY; | 853 | return -EBUSY; |
844 | 854 | ||
845 | err = add_symlink(bo->sdir, bo->sdev); | 855 | err = add_symlink(bo->sdir, bo->sdev); |
846 | if (err) | 856 | if (err) |
847 | return err; | 857 | return err; |
848 | 858 | ||
849 | err = add_symlink(bo->hdir, bo->hdev); | 859 | err = add_symlink(bo->hdir, bo->hdev); |
850 | if (err) { | 860 | if (err) { |
851 | del_symlink(bo->sdir, bo->sdev); | 861 | del_symlink(bo->sdir, bo->sdev); |
852 | return err; | 862 | return err; |
853 | } | 863 | } |
854 | 864 | ||
855 | list_add_tail(&bo->list, &bdev->bd_holder_list); | 865 | list_add_tail(&bo->list, &bdev->bd_holder_list); |
856 | return 0; | 866 | return 0; |
857 | } | 867 | } |
858 | 868 | ||
859 | /** | 869 | /** |
860 | * del_bd_holder - delete sysfs symlinks for bd_claim() relationship | 870 | * del_bd_holder - delete sysfs symlinks for bd_claim() relationship |
861 | * | 871 | * |
862 | * @bdev: block device to be bd_claimed | 872 | * @bdev: block device to be bd_claimed |
863 | * @kobj: holder's kobject | 873 | * @kobj: holder's kobject |
864 | * | 874 | * |
865 | * If there is matching entry with @kobj in @bdev->bd_holder_list | 875 | * If there is matching entry with @kobj in @bdev->bd_holder_list |
866 | * and no other bd_claim() from the same kobject, | 876 | * and no other bd_claim() from the same kobject, |
867 | * remove the struct bd_holder from the list, delete symlinks for it. | 877 | * remove the struct bd_holder from the list, delete symlinks for it. |
868 | * | 878 | * |
869 | * Returns a pointer to the struct bd_holder when it's removed from the list | 879 | * Returns a pointer to the struct bd_holder when it's removed from the list |
870 | * and ready to be freed. | 880 | * and ready to be freed. |
871 | * Returns NULL if matching claim isn't found or there is other bd_claim() | 881 | * Returns NULL if matching claim isn't found or there is other bd_claim() |
872 | * by the same kobject. | 882 | * by the same kobject. |
873 | */ | 883 | */ |
874 | static struct bd_holder *del_bd_holder(struct block_device *bdev, | 884 | static struct bd_holder *del_bd_holder(struct block_device *bdev, |
875 | struct kobject *kobj) | 885 | struct kobject *kobj) |
876 | { | 886 | { |
877 | struct bd_holder *bo; | 887 | struct bd_holder *bo; |
878 | 888 | ||
879 | list_for_each_entry(bo, &bdev->bd_holder_list, list) { | 889 | list_for_each_entry(bo, &bdev->bd_holder_list, list) { |
880 | if (bo->sdir == kobj) { | 890 | if (bo->sdir == kobj) { |
881 | bo->count--; | 891 | bo->count--; |
882 | BUG_ON(bo->count < 0); | 892 | BUG_ON(bo->count < 0); |
883 | if (!bo->count) { | 893 | if (!bo->count) { |
884 | list_del(&bo->list); | 894 | list_del(&bo->list); |
885 | del_symlink(bo->sdir, bo->sdev); | 895 | del_symlink(bo->sdir, bo->sdev); |
886 | del_symlink(bo->hdir, bo->hdev); | 896 | del_symlink(bo->hdir, bo->hdev); |
887 | bd_holder_release_dirs(bo); | 897 | bd_holder_release_dirs(bo); |
888 | return bo; | 898 | return bo; |
889 | } | 899 | } |
890 | break; | 900 | break; |
891 | } | 901 | } |
892 | } | 902 | } |
893 | 903 | ||
894 | return NULL; | 904 | return NULL; |
895 | } | 905 | } |
896 | 906 | ||
897 | /** | 907 | /** |
898 | * bd_claim_by_kobject - bd_claim() with additional kobject signature | 908 | * bd_claim_by_kobject - bd_claim() with additional kobject signature |
899 | * | 909 | * |
900 | * @bdev: block device to be claimed | 910 | * @bdev: block device to be claimed |
901 | * @holder: holder's signature | 911 | * @holder: holder's signature |
902 | * @kobj: holder's kobject | 912 | * @kobj: holder's kobject |
903 | * | 913 | * |
904 | * Do bd_claim() and if it succeeds, create sysfs symlinks between | 914 | * Do bd_claim() and if it succeeds, create sysfs symlinks between |
905 | * the bdev and the holder's kobject. | 915 | * the bdev and the holder's kobject. |
906 | * Use bd_release_from_kobject() when releasing the claimed bdev. | 916 | * Use bd_release_from_kobject() when releasing the claimed bdev. |
907 | * | 917 | * |
908 | * Returns 0 on success. (same as bd_claim()) | 918 | * Returns 0 on success. (same as bd_claim()) |
909 | * Returns errno on failure. | 919 | * Returns errno on failure. |
910 | */ | 920 | */ |
911 | static int bd_claim_by_kobject(struct block_device *bdev, void *holder, | 921 | static int bd_claim_by_kobject(struct block_device *bdev, void *holder, |
912 | struct kobject *kobj) | 922 | struct kobject *kobj) |
913 | { | 923 | { |
914 | int err; | 924 | int err; |
915 | struct bd_holder *bo, *found; | 925 | struct bd_holder *bo, *found; |
916 | 926 | ||
917 | if (!kobj) | 927 | if (!kobj) |
918 | return -EINVAL; | 928 | return -EINVAL; |
919 | 929 | ||
920 | bo = alloc_bd_holder(kobj); | 930 | bo = alloc_bd_holder(kobj); |
921 | if (!bo) | 931 | if (!bo) |
922 | return -ENOMEM; | 932 | return -ENOMEM; |
923 | 933 | ||
924 | mutex_lock(&bdev->bd_mutex); | 934 | mutex_lock(&bdev->bd_mutex); |
925 | 935 | ||
926 | err = bd_claim(bdev, holder); | 936 | err = bd_claim(bdev, holder); |
927 | if (err) | 937 | if (err) |
928 | goto fail; | 938 | goto fail; |
929 | 939 | ||
930 | found = find_bd_holder(bdev, bo); | 940 | found = find_bd_holder(bdev, bo); |
931 | if (found) | 941 | if (found) |
932 | goto fail; | 942 | goto fail; |
933 | 943 | ||
934 | err = add_bd_holder(bdev, bo); | 944 | err = add_bd_holder(bdev, bo); |
935 | if (err) | 945 | if (err) |
936 | bd_release(bdev); | 946 | bd_release(bdev); |
937 | else | 947 | else |
938 | bo = NULL; | 948 | bo = NULL; |
939 | fail: | 949 | fail: |
940 | mutex_unlock(&bdev->bd_mutex); | 950 | mutex_unlock(&bdev->bd_mutex); |
941 | free_bd_holder(bo); | 951 | free_bd_holder(bo); |
942 | return err; | 952 | return err; |
943 | } | 953 | } |
944 | 954 | ||
945 | /** | 955 | /** |
946 | * bd_release_from_kobject - bd_release() with additional kobject signature | 956 | * bd_release_from_kobject - bd_release() with additional kobject signature |
947 | * | 957 | * |
948 | * @bdev: block device to be released | 958 | * @bdev: block device to be released |
949 | * @kobj: holder's kobject | 959 | * @kobj: holder's kobject |
950 | * | 960 | * |
951 | * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). | 961 | * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). |
952 | */ | 962 | */ |
953 | static void bd_release_from_kobject(struct block_device *bdev, | 963 | static void bd_release_from_kobject(struct block_device *bdev, |
954 | struct kobject *kobj) | 964 | struct kobject *kobj) |
955 | { | 965 | { |
956 | if (!kobj) | 966 | if (!kobj) |
957 | return; | 967 | return; |
958 | 968 | ||
959 | mutex_lock(&bdev->bd_mutex); | 969 | mutex_lock(&bdev->bd_mutex); |
960 | bd_release(bdev); | 970 | bd_release(bdev); |
961 | free_bd_holder(del_bd_holder(bdev, kobj)); | 971 | free_bd_holder(del_bd_holder(bdev, kobj)); |
962 | mutex_unlock(&bdev->bd_mutex); | 972 | mutex_unlock(&bdev->bd_mutex); |
963 | } | 973 | } |
964 | 974 | ||
965 | /** | 975 | /** |
966 | * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() | 976 | * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() |
967 | * | 977 | * |
968 | * @bdev: block device to be claimed | 978 | * @bdev: block device to be claimed |
969 | * @holder: holder's signature | 979 | * @holder: holder's signature |
970 | * @disk: holder's gendisk | 980 | * @disk: holder's gendisk |
971 | * | 981 | * |
972 | * Call bd_claim_by_kobject() with a reference to @disk->slave_dir. | 982 | * Call bd_claim_by_kobject() with a reference to @disk->slave_dir. |
973 | */ | 983 | */ |
974 | int bd_claim_by_disk(struct block_device *bdev, void *holder, | 984 | int bd_claim_by_disk(struct block_device *bdev, void *holder, |
975 | struct gendisk *disk) | 985 | struct gendisk *disk) |
976 | { | 986 | { |
977 | return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); | 987 | return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); |
978 | } | 988 | } |
979 | EXPORT_SYMBOL_GPL(bd_claim_by_disk); | 989 | EXPORT_SYMBOL_GPL(bd_claim_by_disk); |
980 | 990 | ||
981 | /** | 991 | /** |
982 | * bd_release_from_disk - wrapper function for bd_release_from_kobject() | 992 | * bd_release_from_disk - wrapper function for bd_release_from_kobject() |
983 | * | 993 | * |
984 | * @bdev: block device to be released | 994 | * @bdev: block device to be released |
985 | * @disk: holder's gendisk | 995 | * @disk: holder's gendisk |
986 | * | 996 | * |
987 | * Call bd_release_from_kobject() and put @disk->slave_dir. | 997 | * Call bd_release_from_kobject() and put @disk->slave_dir. |
988 | */ | 998 | */ |
989 | void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) | 999 | void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) |
990 | { | 1000 | { |
991 | bd_release_from_kobject(bdev, disk->slave_dir); | 1001 | bd_release_from_kobject(bdev, disk->slave_dir); |
992 | kobject_put(disk->slave_dir); | 1002 | kobject_put(disk->slave_dir); |
993 | } | 1003 | } |
994 | EXPORT_SYMBOL_GPL(bd_release_from_disk); | 1004 | EXPORT_SYMBOL_GPL(bd_release_from_disk); |
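
To make the claim/release pairing concrete, here is a minimal sketch of how a stacking driver (md-style) might hold a component device and publish the holders/slaves symlinks; the mydrv_* names and the choice of the master gendisk as holder cookie are illustrative, not part of this commit:

	static int mydrv_add_component(struct gendisk *master,
				       struct block_device *bdev)
	{
		/* claims bdev and creates the sysfs holders/slaves links */
		return bd_claim_by_disk(bdev, master, master);
	}

	static void mydrv_del_component(struct gendisk *master,
					struct block_device *bdev)
	{
		/* drops the claim and removes the symlinks again */
		bd_release_from_disk(bdev, master);
	}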
995 | #endif | 1005 | #endif |
996 | 1006 | ||
997 | /* | 1007 | /* |
998 | * Tries to open block device by device number. Use it ONLY if you | 1008 | * Tries to open block device by device number. Use it ONLY if you |
999 | * really do not have anything better - i.e. when you are behind a | 1009 | * really do not have anything better - i.e. when you are behind a |
1000 | * truly sucky interface and all you are given is a device number. _Never_ | 1010 | * truly sucky interface and all you are given is a device number. _Never_ |
1001 | * to be used for internal purposes. If you ever need it - reconsider | 1011 | * to be used for internal purposes. If you ever need it - reconsider |
1002 | * your API. | 1012 | * your API. |
1003 | */ | 1013 | */ |
1004 | struct block_device *open_by_devnum(dev_t dev, fmode_t mode) | 1014 | struct block_device *open_by_devnum(dev_t dev, fmode_t mode) |
1005 | { | 1015 | { |
1006 | struct block_device *bdev = bdget(dev); | 1016 | struct block_device *bdev = bdget(dev); |
1007 | int err = -ENOMEM; | 1017 | int err = -ENOMEM; |
1008 | if (bdev) | 1018 | if (bdev) |
1009 | err = blkdev_get(bdev, mode); | 1019 | err = blkdev_get(bdev, mode); |
1010 | return err ? ERR_PTR(err) : bdev; | 1020 | return err ? ERR_PTR(err) : bdev; |
1011 | } | 1021 | } |
1012 | 1022 | ||
1013 | EXPORT_SYMBOL(open_by_devnum); | 1023 | EXPORT_SYMBOL(open_by_devnum); |
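
A hedged usage sketch (the device number is illustrative): open by number, check the ERR_PTR result, and balance with blkdev_put():

	dev_t devt = MKDEV(8, 0);		/* illustrative device number */
	struct block_device *bdev;

	bdev = open_by_devnum(devt, FMODE_READ);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... read from the device ... */
	blkdev_put(bdev, FMODE_READ);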
1014 | 1024 | ||
1015 | /** | 1025 | /** |
1016 | * flush_disk - invalidates all buffer-cache entries on a disk | 1026 | * flush_disk - invalidates all buffer-cache entries on a disk |
1017 | * | 1027 | * |
1018 | * @bdev: struct block device to be flushed | 1028 | * @bdev: struct block device to be flushed |
1019 | * | 1029 | * |
1020 | * Invalidates all buffer-cache entries on a disk. It should be called | 1030 | * Invalidates all buffer-cache entries on a disk. It should be called |
1021 | * when a disk has been changed -- either by a media change or online | 1031 | * when a disk has been changed -- either by a media change or online |
1022 | * resize. | 1032 | * resize. |
1023 | */ | 1033 | */ |
1024 | static void flush_disk(struct block_device *bdev) | 1034 | static void flush_disk(struct block_device *bdev) |
1025 | { | 1035 | { |
1026 | if (__invalidate_device(bdev)) { | 1036 | if (__invalidate_device(bdev)) { |
1027 | char name[BDEVNAME_SIZE] = ""; | 1037 | char name[BDEVNAME_SIZE] = ""; |
1028 | 1038 | ||
1029 | if (bdev->bd_disk) | 1039 | if (bdev->bd_disk) |
1030 | disk_name(bdev->bd_disk, 0, name); | 1040 | disk_name(bdev->bd_disk, 0, name); |
1031 | printk(KERN_WARNING "VFS: busy inodes on changed media or " | 1041 | printk(KERN_WARNING "VFS: busy inodes on changed media or " |
1032 | "resized disk %s\n", name); | 1042 | "resized disk %s\n", name); |
1033 | } | 1043 | } |
1034 | 1044 | ||
1035 | if (!bdev->bd_disk) | 1045 | if (!bdev->bd_disk) |
1036 | return; | 1046 | return; |
1037 | if (disk_partitionable(bdev->bd_disk)) | 1047 | if (disk_partitionable(bdev->bd_disk)) |
1038 | bdev->bd_invalidated = 1; | 1048 | bdev->bd_invalidated = 1; |
1039 | } | 1049 | } |
1040 | 1050 | ||
1041 | /** | 1051 | /** |
1042 | * check_disk_size_change - checks for disk size change and adjusts bdev size. | 1052 | * check_disk_size_change - checks for disk size change and adjusts bdev size. |
1043 | * @disk: struct gendisk to check | 1053 | * @disk: struct gendisk to check |
1044 | * @bdev: struct bdev to adjust. | 1054 | * @bdev: struct bdev to adjust. |
1045 | * | 1055 | * |
1046 | * This routine checks whether the bdev size matches the disk size | 1056 | * This routine checks whether the bdev size matches the disk size |
1047 | * and adjusts the bdev size if it differs. | 1057 | * and adjusts the bdev size if it differs. |
1048 | */ | 1058 | */ |
1049 | void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) | 1059 | void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) |
1050 | { | 1060 | { |
1051 | loff_t disk_size, bdev_size; | 1061 | loff_t disk_size, bdev_size; |
1052 | 1062 | ||
1053 | disk_size = (loff_t)get_capacity(disk) << 9; | 1063 | disk_size = (loff_t)get_capacity(disk) << 9; |
1054 | bdev_size = i_size_read(bdev->bd_inode); | 1064 | bdev_size = i_size_read(bdev->bd_inode); |
1055 | if (disk_size != bdev_size) { | 1065 | if (disk_size != bdev_size) { |
1056 | char name[BDEVNAME_SIZE]; | 1066 | char name[BDEVNAME_SIZE]; |
1057 | 1067 | ||
1058 | disk_name(disk, 0, name); | 1068 | disk_name(disk, 0, name); |
1059 | printk(KERN_INFO | 1069 | printk(KERN_INFO |
1060 | "%s: detected capacity change from %lld to %lld\n", | 1070 | "%s: detected capacity change from %lld to %lld\n", |
1061 | name, bdev_size, disk_size); | 1071 | name, bdev_size, disk_size); |
1062 | i_size_write(bdev->bd_inode, disk_size); | 1072 | i_size_write(bdev->bd_inode, disk_size); |
1063 | flush_disk(bdev); | 1073 | flush_disk(bdev); |
1064 | } | 1074 | } |
1065 | } | 1075 | } |
1066 | EXPORT_SYMBOL(check_disk_size_change); | 1076 | EXPORT_SYMBOL(check_disk_size_change); |
1067 | 1077 | ||
1068 | /** | 1078 | /** |
1069 | * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back | 1079 | * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back |
1070 | * @disk: struct gendisk to be revalidated | 1080 | * @disk: struct gendisk to be revalidated |
1071 | * | 1081 | * |
1072 | * This routine is a wrapper for lower-level driver's revalidate_disk | 1082 | * This routine is a wrapper for lower-level driver's revalidate_disk |
1073 | * call-backs. It is used to do common pre and post operations needed | 1083 | * call-backs. It is used to do common pre and post operations needed |
1074 | * for all revalidate_disk operations. | 1084 | * for all revalidate_disk operations. |
1075 | */ | 1085 | */ |
1076 | int revalidate_disk(struct gendisk *disk) | 1086 | int revalidate_disk(struct gendisk *disk) |
1077 | { | 1087 | { |
1078 | struct block_device *bdev; | 1088 | struct block_device *bdev; |
1079 | int ret = 0; | 1089 | int ret = 0; |
1080 | 1090 | ||
1081 | if (disk->fops->revalidate_disk) | 1091 | if (disk->fops->revalidate_disk) |
1082 | ret = disk->fops->revalidate_disk(disk); | 1092 | ret = disk->fops->revalidate_disk(disk); |
1083 | 1093 | ||
1084 | bdev = bdget_disk(disk, 0); | 1094 | bdev = bdget_disk(disk, 0); |
1085 | if (!bdev) | 1095 | if (!bdev) |
1086 | return ret; | 1096 | return ret; |
1087 | 1097 | ||
1088 | mutex_lock(&bdev->bd_mutex); | 1098 | mutex_lock(&bdev->bd_mutex); |
1089 | check_disk_size_change(disk, bdev); | 1099 | check_disk_size_change(disk, bdev); |
1090 | mutex_unlock(&bdev->bd_mutex); | 1100 | mutex_unlock(&bdev->bd_mutex); |
1091 | bdput(bdev); | 1101 | bdput(bdev); |
1092 | return ret; | 1102 | return ret; |
1093 | } | 1103 | } |
1094 | EXPORT_SYMBOL(revalidate_disk); | 1104 | EXPORT_SYMBOL(revalidate_disk); |
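
For illustration (mydrv_resize is hypothetical): a driver that learns of a new capacity out of band would update the gendisk and let this wrapper resync the bdev inode size:

	static void mydrv_resize(struct gendisk *disk, sector_t new_sectors)
	{
		set_capacity(disk, new_sectors);	/* update gendisk capacity */
		revalidate_disk(disk);			/* runs check_disk_size_change() */
	}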
1095 | 1105 | ||
1096 | /* | 1106 | /* |
1097 | * This routine checks whether a removable media has been changed, | 1107 | * This routine checks whether a removable media has been changed, |
1098 | * and invalidates all buffer-cache entries in that case. This | 1108 | * and invalidates all buffer-cache entries in that case. This |
1099 | * is a relatively slow routine, so we have to try to minimize using | 1109 | * is a relatively slow routine, so we have to try to minimize using |
1100 | * it. Thus it is called only upon a 'mount' or 'open'. This | 1110 | * it. Thus it is called only upon a 'mount' or 'open'. This |
1101 | * is the best way of combining speed and utility, I think. | 1111 | * is the best way of combining speed and utility, I think. |
1102 | * People changing diskettes in the middle of an operation deserve | 1112 | * People changing diskettes in the middle of an operation deserve |
1103 | * to lose :-) | 1113 | * to lose :-) |
1104 | */ | 1114 | */ |
1105 | int check_disk_change(struct block_device *bdev) | 1115 | int check_disk_change(struct block_device *bdev) |
1106 | { | 1116 | { |
1107 | struct gendisk *disk = bdev->bd_disk; | 1117 | struct gendisk *disk = bdev->bd_disk; |
1108 | struct block_device_operations * bdops = disk->fops; | 1118 | struct block_device_operations * bdops = disk->fops; |
1109 | 1119 | ||
1110 | if (!bdops->media_changed) | 1120 | if (!bdops->media_changed) |
1111 | return 0; | 1121 | return 0; |
1112 | if (!bdops->media_changed(bdev->bd_disk)) | 1122 | if (!bdops->media_changed(bdev->bd_disk)) |
1113 | return 0; | 1123 | return 0; |
1114 | 1124 | ||
1115 | flush_disk(bdev); | 1125 | flush_disk(bdev); |
1116 | if (bdops->revalidate_disk) | 1126 | if (bdops->revalidate_disk) |
1117 | bdops->revalidate_disk(bdev->bd_disk); | 1127 | bdops->revalidate_disk(bdev->bd_disk); |
1118 | return 1; | 1128 | return 1; |
1119 | } | 1129 | } |
1120 | 1130 | ||
1121 | EXPORT_SYMBOL(check_disk_change); | 1131 | EXPORT_SYMBOL(check_disk_change); |
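
A removable-media driver typically calls this from its open method; a minimal sketch with a hypothetical mydrv_open():

	static int mydrv_open(struct block_device *bdev, fmode_t mode)
	{
		/* If the medium changed, flush_disk() and the driver's
		 * revalidate_disk callback already ran inside. */
		check_disk_change(bdev);
		return 0;
	}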
1122 | 1132 | ||
1123 | void bd_set_size(struct block_device *bdev, loff_t size) | 1133 | void bd_set_size(struct block_device *bdev, loff_t size) |
1124 | { | 1134 | { |
1125 | unsigned bsize = bdev_logical_block_size(bdev); | 1135 | unsigned bsize = bdev_logical_block_size(bdev); |
1126 | 1136 | ||
1127 | bdev->bd_inode->i_size = size; | 1137 | bdev->bd_inode->i_size = size; |
1128 | while (bsize < PAGE_CACHE_SIZE) { | 1138 | while (bsize < PAGE_CACHE_SIZE) { |
1129 | if (size & bsize) | 1139 | if (size & bsize) |
1130 | break; | 1140 | break; |
1131 | bsize <<= 1; | 1141 | bsize <<= 1; |
1132 | } | 1142 | } |
1133 | bdev->bd_block_size = bsize; | 1143 | bdev->bd_block_size = bsize; |
1134 | bdev->bd_inode->i_blkbits = blksize_bits(bsize); | 1144 | bdev->bd_inode->i_blkbits = blksize_bits(bsize); |
1135 | } | 1145 | } |
1136 | EXPORT_SYMBOL(bd_set_size); | 1146 | EXPORT_SYMBOL(bd_set_size); |
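
The loop selects the largest power-of-two block size, capped at PAGE_CACHE_SIZE, that still divides the device size; a worked example assuming PAGE_CACHE_SIZE is 4096:

	/* size = 1 MiB, bsize starts at 512:
	 *   512 -> 1024 -> 2048 -> 4096; the loop stops at PAGE_CACHE_SIZE,
	 *   so bd_block_size = 4096 and i_blkbits = 12.
	 * size = 1 MiB + 512: (size & 512) is non-zero on the first pass,
	 *   so bd_block_size stays 512 and i_blkbits = 9. */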
1137 | 1147 | ||
1138 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); | 1148 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); |
1139 | 1149 | ||
1140 | /* | 1150 | /* |
1141 | * bd_mutex locking: | 1151 | * bd_mutex locking: |
1142 | * | 1152 | * |
1143 | * mutex_lock(part->bd_mutex) | 1153 | * mutex_lock(part->bd_mutex) |
1144 | * mutex_lock_nested(whole->bd_mutex, 1) | 1154 | * mutex_lock_nested(whole->bd_mutex, 1) |
1145 | */ | 1155 | */ |
1146 | 1156 | ||
1147 | static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) | 1157 | static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) |
1148 | { | 1158 | { |
1149 | struct gendisk *disk; | 1159 | struct gendisk *disk; |
1150 | int ret; | 1160 | int ret; |
1151 | int partno; | 1161 | int partno; |
1152 | int perm = 0; | 1162 | int perm = 0; |
1153 | 1163 | ||
1154 | if (mode & FMODE_READ) | 1164 | if (mode & FMODE_READ) |
1155 | perm |= MAY_READ; | 1165 | perm |= MAY_READ; |
1156 | if (mode & FMODE_WRITE) | 1166 | if (mode & FMODE_WRITE) |
1157 | perm |= MAY_WRITE; | 1167 | perm |= MAY_WRITE; |
1158 | /* | 1168 | /* |
1159 | * hooks: /n/, see "layering violations". | 1169 | * hooks: /n/, see "layering violations". |
1160 | */ | 1170 | */ |
1161 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); | 1171 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); |
1162 | if (ret != 0) { | 1172 | if (ret != 0) { |
1163 | bdput(bdev); | 1173 | bdput(bdev); |
1164 | return ret; | 1174 | return ret; |
1165 | } | 1175 | } |
1166 | 1176 | ||
1167 | lock_kernel(); | 1177 | lock_kernel(); |
1168 | restart: | 1178 | restart: |
1169 | 1179 | ||
1170 | ret = -ENXIO; | 1180 | ret = -ENXIO; |
1171 | disk = get_gendisk(bdev->bd_dev, &partno); | 1181 | disk = get_gendisk(bdev->bd_dev, &partno); |
1172 | if (!disk) | 1182 | if (!disk) |
1173 | goto out_unlock_kernel; | 1183 | goto out_unlock_kernel; |
1174 | 1184 | ||
1175 | mutex_lock_nested(&bdev->bd_mutex, for_part); | 1185 | mutex_lock_nested(&bdev->bd_mutex, for_part); |
1176 | if (!bdev->bd_openers) { | 1186 | if (!bdev->bd_openers) { |
1177 | bdev->bd_disk = disk; | 1187 | bdev->bd_disk = disk; |
1178 | bdev->bd_contains = bdev; | 1188 | bdev->bd_contains = bdev; |
1179 | if (!partno) { | 1189 | if (!partno) { |
1180 | struct backing_dev_info *bdi; | 1190 | struct backing_dev_info *bdi; |
1181 | 1191 | ||
1182 | ret = -ENXIO; | 1192 | ret = -ENXIO; |
1183 | bdev->bd_part = disk_get_part(disk, partno); | 1193 | bdev->bd_part = disk_get_part(disk, partno); |
1184 | if (!bdev->bd_part) | 1194 | if (!bdev->bd_part) |
1185 | goto out_clear; | 1195 | goto out_clear; |
1186 | 1196 | ||
1187 | if (disk->fops->open) { | 1197 | if (disk->fops->open) { |
1188 | ret = disk->fops->open(bdev, mode); | 1198 | ret = disk->fops->open(bdev, mode); |
1189 | if (ret == -ERESTARTSYS) { | 1199 | if (ret == -ERESTARTSYS) { |
1190 | /* Lost a race with 'disk' being | 1200 | /* Lost a race with 'disk' being |
1191 | * deleted, try again. | 1201 | * deleted, try again. |
1192 | * See md.c | 1202 | * See md.c |
1193 | */ | 1203 | */ |
1194 | disk_put_part(bdev->bd_part); | 1204 | disk_put_part(bdev->bd_part); |
1195 | bdev->bd_part = NULL; | 1205 | bdev->bd_part = NULL; |
1196 | module_put(disk->fops->owner); | 1206 | module_put(disk->fops->owner); |
1197 | put_disk(disk); | 1207 | put_disk(disk); |
1198 | bdev->bd_disk = NULL; | 1208 | bdev->bd_disk = NULL; |
1199 | mutex_unlock(&bdev->bd_mutex); | 1209 | mutex_unlock(&bdev->bd_mutex); |
1200 | goto restart; | 1210 | goto restart; |
1201 | } | 1211 | } |
1202 | if (ret) | 1212 | if (ret) |
1203 | goto out_clear; | 1213 | goto out_clear; |
1204 | } | 1214 | } |
1205 | if (!bdev->bd_openers) { | 1215 | if (!bdev->bd_openers) { |
1206 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); | 1216 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); |
1207 | bdi = blk_get_backing_dev_info(bdev); | 1217 | bdi = blk_get_backing_dev_info(bdev); |
1208 | if (bdi == NULL) | 1218 | if (bdi == NULL) |
1209 | bdi = &default_backing_dev_info; | 1219 | bdi = &default_backing_dev_info; |
1210 | bdev->bd_inode->i_data.backing_dev_info = bdi; | 1220 | bdev->bd_inode->i_data.backing_dev_info = bdi; |
1211 | } | 1221 | } |
1212 | if (bdev->bd_invalidated) | 1222 | if (bdev->bd_invalidated) |
1213 | rescan_partitions(disk, bdev); | 1223 | rescan_partitions(disk, bdev); |
1214 | } else { | 1224 | } else { |
1215 | struct block_device *whole; | 1225 | struct block_device *whole; |
1216 | whole = bdget_disk(disk, 0); | 1226 | whole = bdget_disk(disk, 0); |
1217 | ret = -ENOMEM; | 1227 | ret = -ENOMEM; |
1218 | if (!whole) | 1228 | if (!whole) |
1219 | goto out_clear; | 1229 | goto out_clear; |
1220 | BUG_ON(for_part); | 1230 | BUG_ON(for_part); |
1221 | ret = __blkdev_get(whole, mode, 1); | 1231 | ret = __blkdev_get(whole, mode, 1); |
1222 | if (ret) | 1232 | if (ret) |
1223 | goto out_clear; | 1233 | goto out_clear; |
1224 | bdev->bd_contains = whole; | 1234 | bdev->bd_contains = whole; |
1225 | bdev->bd_inode->i_data.backing_dev_info = | 1235 | bdev->bd_inode->i_data.backing_dev_info = |
1226 | whole->bd_inode->i_data.backing_dev_info; | 1236 | whole->bd_inode->i_data.backing_dev_info; |
1227 | bdev->bd_part = disk_get_part(disk, partno); | 1237 | bdev->bd_part = disk_get_part(disk, partno); |
1228 | if (!(disk->flags & GENHD_FL_UP) || | 1238 | if (!(disk->flags & GENHD_FL_UP) || |
1229 | !bdev->bd_part || !bdev->bd_part->nr_sects) { | 1239 | !bdev->bd_part || !bdev->bd_part->nr_sects) { |
1230 | ret = -ENXIO; | 1240 | ret = -ENXIO; |
1231 | goto out_clear; | 1241 | goto out_clear; |
1232 | } | 1242 | } |
1233 | bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); | 1243 | bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); |
1234 | } | 1244 | } |
1235 | } else { | 1245 | } else { |
1236 | put_disk(disk); | 1246 | put_disk(disk); |
1237 | module_put(disk->fops->owner); | 1247 | module_put(disk->fops->owner); |
1238 | disk = NULL; | 1248 | disk = NULL; |
1239 | if (bdev->bd_contains == bdev) { | 1249 | if (bdev->bd_contains == bdev) { |
1240 | if (bdev->bd_disk->fops->open) { | 1250 | if (bdev->bd_disk->fops->open) { |
1241 | ret = bdev->bd_disk->fops->open(bdev, mode); | 1251 | ret = bdev->bd_disk->fops->open(bdev, mode); |
1242 | if (ret) | 1252 | if (ret) |
1243 | goto out_unlock_bdev; | 1253 | goto out_unlock_bdev; |
1244 | } | 1254 | } |
1245 | if (bdev->bd_invalidated) | 1255 | if (bdev->bd_invalidated) |
1246 | rescan_partitions(bdev->bd_disk, bdev); | 1256 | rescan_partitions(bdev->bd_disk, bdev); |
1247 | } | 1257 | } |
1248 | } | 1258 | } |
1249 | bdev->bd_openers++; | 1259 | bdev->bd_openers++; |
1250 | if (for_part) | 1260 | if (for_part) |
1251 | bdev->bd_part_count++; | 1261 | bdev->bd_part_count++; |
1252 | mutex_unlock(&bdev->bd_mutex); | 1262 | mutex_unlock(&bdev->bd_mutex); |
1253 | unlock_kernel(); | 1263 | unlock_kernel(); |
1254 | return 0; | 1264 | return 0; |
1255 | 1265 | ||
1256 | out_clear: | 1266 | out_clear: |
1257 | disk_put_part(bdev->bd_part); | 1267 | disk_put_part(bdev->bd_part); |
1258 | bdev->bd_disk = NULL; | 1268 | bdev->bd_disk = NULL; |
1259 | bdev->bd_part = NULL; | 1269 | bdev->bd_part = NULL; |
1260 | bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; | 1270 | bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; |
1261 | if (bdev != bdev->bd_contains) | 1271 | if (bdev != bdev->bd_contains) |
1262 | __blkdev_put(bdev->bd_contains, mode, 1); | 1272 | __blkdev_put(bdev->bd_contains, mode, 1); |
1263 | bdev->bd_contains = NULL; | 1273 | bdev->bd_contains = NULL; |
1264 | out_unlock_bdev: | 1274 | out_unlock_bdev: |
1265 | mutex_unlock(&bdev->bd_mutex); | 1275 | mutex_unlock(&bdev->bd_mutex); |
1266 | out_unlock_kernel: | 1276 | out_unlock_kernel: |
1267 | unlock_kernel(); | 1277 | unlock_kernel(); |
1268 | 1278 | ||
1269 | if (disk) | 1279 | if (disk) |
1270 | module_put(disk->fops->owner); | 1280 | module_put(disk->fops->owner); |
1271 | put_disk(disk); | 1281 | put_disk(disk); |
1272 | bdput(bdev); | 1282 | bdput(bdev); |
1273 | 1283 | ||
1274 | return ret; | 1284 | return ret; |
1275 | } | 1285 | } |
1276 | 1286 | ||
1277 | int blkdev_get(struct block_device *bdev, fmode_t mode) | 1287 | int blkdev_get(struct block_device *bdev, fmode_t mode) |
1278 | { | 1288 | { |
1279 | return __blkdev_get(bdev, mode, 0); | 1289 | return __blkdev_get(bdev, mode, 0); |
1280 | } | 1290 | } |
1281 | EXPORT_SYMBOL(blkdev_get); | 1291 | EXPORT_SYMBOL(blkdev_get); |
1282 | 1292 | ||
1283 | static int blkdev_open(struct inode * inode, struct file * filp) | 1293 | static int blkdev_open(struct inode * inode, struct file * filp) |
1284 | { | 1294 | { |
1285 | struct block_device *bdev; | 1295 | struct block_device *bdev; |
1286 | int res; | 1296 | int res; |
1287 | 1297 | ||
1288 | /* | 1298 | /* |
1289 | * Preserve backwards compatibility and allow large file access | 1299 | * Preserve backwards compatibility and allow large file access |
1290 | * even if userspace doesn't ask for it explicitly. Some mkfs | 1300 | * even if userspace doesn't ask for it explicitly. Some mkfs |
1291 | * binaries need it. We might want to drop this workaround | 1301 | * binaries need it. We might want to drop this workaround |
1292 | * during an unstable branch. | 1302 | * during an unstable branch. |
1293 | */ | 1303 | */ |
1294 | filp->f_flags |= O_LARGEFILE; | 1304 | filp->f_flags |= O_LARGEFILE; |
1295 | 1305 | ||
1296 | if (filp->f_flags & O_NDELAY) | 1306 | if (filp->f_flags & O_NDELAY) |
1297 | filp->f_mode |= FMODE_NDELAY; | 1307 | filp->f_mode |= FMODE_NDELAY; |
1298 | if (filp->f_flags & O_EXCL) | 1308 | if (filp->f_flags & O_EXCL) |
1299 | filp->f_mode |= FMODE_EXCL; | 1309 | filp->f_mode |= FMODE_EXCL; |
1300 | if ((filp->f_flags & O_ACCMODE) == 3) | 1310 | if ((filp->f_flags & O_ACCMODE) == 3) |
1301 | filp->f_mode |= FMODE_WRITE_IOCTL; | 1311 | filp->f_mode |= FMODE_WRITE_IOCTL; |
1302 | 1312 | ||
1303 | bdev = bd_acquire(inode); | 1313 | bdev = bd_acquire(inode); |
1304 | if (bdev == NULL) | 1314 | if (bdev == NULL) |
1305 | return -ENOMEM; | 1315 | return -ENOMEM; |
1306 | 1316 | ||
1307 | filp->f_mapping = bdev->bd_inode->i_mapping; | 1317 | filp->f_mapping = bdev->bd_inode->i_mapping; |
1308 | 1318 | ||
1309 | res = blkdev_get(bdev, filp->f_mode); | 1319 | res = blkdev_get(bdev, filp->f_mode); |
1310 | if (res) | 1320 | if (res) |
1311 | return res; | 1321 | return res; |
1312 | 1322 | ||
1313 | if (filp->f_mode & FMODE_EXCL) { | 1323 | if (filp->f_mode & FMODE_EXCL) { |
1314 | res = bd_claim(bdev, filp); | 1324 | res = bd_claim(bdev, filp); |
1315 | if (res) | 1325 | if (res) |
1316 | goto out_blkdev_put; | 1326 | goto out_blkdev_put; |
1317 | } | 1327 | } |
1318 | 1328 | ||
1319 | return 0; | 1329 | return 0; |
1320 | 1330 | ||
1321 | out_blkdev_put: | 1331 | out_blkdev_put: |
1322 | blkdev_put(bdev, filp->f_mode); | 1332 | blkdev_put(bdev, filp->f_mode); |
1323 | return res; | 1333 | return res; |
1324 | } | 1334 | } |
1325 | 1335 | ||
1326 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) | 1336 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) |
1327 | { | 1337 | { |
1328 | int ret = 0; | 1338 | int ret = 0; |
1329 | struct gendisk *disk = bdev->bd_disk; | 1339 | struct gendisk *disk = bdev->bd_disk; |
1330 | struct block_device *victim = NULL; | 1340 | struct block_device *victim = NULL; |
1331 | 1341 | ||
1332 | mutex_lock_nested(&bdev->bd_mutex, for_part); | 1342 | mutex_lock_nested(&bdev->bd_mutex, for_part); |
1333 | lock_kernel(); | 1343 | lock_kernel(); |
1334 | if (for_part) | 1344 | if (for_part) |
1335 | bdev->bd_part_count--; | 1345 | bdev->bd_part_count--; |
1336 | 1346 | ||
1337 | if (!--bdev->bd_openers) { | 1347 | if (!--bdev->bd_openers) { |
1338 | sync_blockdev(bdev); | 1348 | sync_blockdev(bdev); |
1339 | kill_bdev(bdev); | 1349 | kill_bdev(bdev); |
1340 | } | 1350 | } |
1341 | if (bdev->bd_contains == bdev) { | 1351 | if (bdev->bd_contains == bdev) { |
1342 | if (disk->fops->release) | 1352 | if (disk->fops->release) |
1343 | ret = disk->fops->release(disk, mode); | 1353 | ret = disk->fops->release(disk, mode); |
1344 | } | 1354 | } |
1345 | if (!bdev->bd_openers) { | 1355 | if (!bdev->bd_openers) { |
1346 | struct module *owner = disk->fops->owner; | 1356 | struct module *owner = disk->fops->owner; |
1347 | 1357 | ||
1348 | put_disk(disk); | 1358 | put_disk(disk); |
1349 | module_put(owner); | 1359 | module_put(owner); |
1350 | disk_put_part(bdev->bd_part); | 1360 | disk_put_part(bdev->bd_part); |
1351 | bdev->bd_part = NULL; | 1361 | bdev->bd_part = NULL; |
1352 | bdev->bd_disk = NULL; | 1362 | bdev->bd_disk = NULL; |
1353 | bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; | 1363 | bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; |
1354 | if (bdev != bdev->bd_contains) | 1364 | if (bdev != bdev->bd_contains) |
1355 | victim = bdev->bd_contains; | 1365 | victim = bdev->bd_contains; |
1356 | bdev->bd_contains = NULL; | 1366 | bdev->bd_contains = NULL; |
1357 | } | 1367 | } |
1358 | unlock_kernel(); | 1368 | unlock_kernel(); |
1359 | mutex_unlock(&bdev->bd_mutex); | 1369 | mutex_unlock(&bdev->bd_mutex); |
1360 | bdput(bdev); | 1370 | bdput(bdev); |
1361 | if (victim) | 1371 | if (victim) |
1362 | __blkdev_put(victim, mode, 1); | 1372 | __blkdev_put(victim, mode, 1); |
1363 | return ret; | 1373 | return ret; |
1364 | } | 1374 | } |
1365 | 1375 | ||
1366 | int blkdev_put(struct block_device *bdev, fmode_t mode) | 1376 | int blkdev_put(struct block_device *bdev, fmode_t mode) |
1367 | { | 1377 | { |
1368 | return __blkdev_put(bdev, mode, 0); | 1378 | return __blkdev_put(bdev, mode, 0); |
1369 | } | 1379 | } |
1370 | EXPORT_SYMBOL(blkdev_put); | 1380 | EXPORT_SYMBOL(blkdev_put); |
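
For reference, a sketch of the canonical pairing: each successful blkdev_get() is balanced by one blkdev_put() with the same mode, and blkdev_get() consumes the bdget() reference itself on failure (see the bdput() calls in the error paths above):

	struct block_device *bdev = bdget(devt);	/* devt assumed known */
	int err;

	if (!bdev)
		return -ENOMEM;
	err = blkdev_get(bdev, FMODE_READ);	/* drops the ref on error */
	if (err)
		return err;
	/* ... perform I/O ... */
	blkdev_put(bdev, FMODE_READ);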
1371 | 1381 | ||
1372 | static int blkdev_close(struct inode * inode, struct file * filp) | 1382 | static int blkdev_close(struct inode * inode, struct file * filp) |
1373 | { | 1383 | { |
1374 | struct block_device *bdev = I_BDEV(filp->f_mapping->host); | 1384 | struct block_device *bdev = I_BDEV(filp->f_mapping->host); |
1375 | if (bdev->bd_holder == filp) | 1385 | if (bdev->bd_holder == filp) |
1376 | bd_release(bdev); | 1386 | bd_release(bdev); |
1377 | return blkdev_put(bdev, filp->f_mode); | 1387 | return blkdev_put(bdev, filp->f_mode); |
1378 | } | 1388 | } |
1379 | 1389 | ||
1380 | static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) | 1390 | static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) |
1381 | { | 1391 | { |
1382 | struct block_device *bdev = I_BDEV(file->f_mapping->host); | 1392 | struct block_device *bdev = I_BDEV(file->f_mapping->host); |
1383 | fmode_t mode = file->f_mode; | 1393 | fmode_t mode = file->f_mode; |
1384 | 1394 | ||
1385 | /* | 1395 | /* |
1386 | * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have | 1396 | * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have |
1387 | * to update it before every ioctl. | 1397 | * to update it before every ioctl. |
1388 | */ | 1398 | */ |
1389 | if (file->f_flags & O_NDELAY) | 1399 | if (file->f_flags & O_NDELAY) |
1390 | mode |= FMODE_NDELAY; | 1400 | mode |= FMODE_NDELAY; |
1391 | else | 1401 | else |
1392 | mode &= ~FMODE_NDELAY; | 1402 | mode &= ~FMODE_NDELAY; |
1393 | 1403 | ||
1394 | return blkdev_ioctl(bdev, mode, cmd, arg); | 1404 | return blkdev_ioctl(bdev, mode, cmd, arg); |
1395 | } | 1405 | } |
1396 | 1406 | ||
1397 | /* | 1407 | /* |
1398 | * Try to release a page associated with block device when the system | 1408 | * Try to release a page associated with block device when the system |
1399 | * is under memory pressure. | 1409 | * is under memory pressure. |
1400 | */ | 1410 | */ |
1401 | static int blkdev_releasepage(struct page *page, gfp_t wait) | 1411 | static int blkdev_releasepage(struct page *page, gfp_t wait) |
1402 | { | 1412 | { |
1403 | struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; | 1413 | struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; |
1404 | 1414 | ||
1405 | if (super && super->s_op->bdev_try_to_free_page) | 1415 | if (super && super->s_op->bdev_try_to_free_page) |
1406 | return super->s_op->bdev_try_to_free_page(super, page, wait); | 1416 | return super->s_op->bdev_try_to_free_page(super, page, wait); |
1407 | 1417 | ||
1408 | return try_to_free_buffers(page); | 1418 | return try_to_free_buffers(page); |
1409 | } | 1419 | } |
1410 | 1420 | ||
1411 | static const struct address_space_operations def_blk_aops = { | 1421 | static const struct address_space_operations def_blk_aops = { |
1412 | .readpage = blkdev_readpage, | 1422 | .readpage = blkdev_readpage, |
1413 | .writepage = blkdev_writepage, | 1423 | .writepage = blkdev_writepage, |
1414 | .sync_page = block_sync_page, | 1424 | .sync_page = block_sync_page, |
1415 | .write_begin = blkdev_write_begin, | 1425 | .write_begin = blkdev_write_begin, |
1416 | .write_end = blkdev_write_end, | 1426 | .write_end = blkdev_write_end, |
1417 | .writepages = generic_writepages, | 1427 | .writepages = generic_writepages, |
1418 | .releasepage = blkdev_releasepage, | 1428 | .releasepage = blkdev_releasepage, |
1419 | .direct_IO = blkdev_direct_IO, | 1429 | .direct_IO = blkdev_direct_IO, |
1420 | }; | 1430 | }; |
1421 | 1431 | ||
1422 | const struct file_operations def_blk_fops = { | 1432 | const struct file_operations def_blk_fops = { |
1423 | .open = blkdev_open, | 1433 | .open = blkdev_open, |
1424 | .release = blkdev_close, | 1434 | .release = blkdev_close, |
1425 | .llseek = block_llseek, | 1435 | .llseek = block_llseek, |
1426 | .read = do_sync_read, | 1436 | .read = do_sync_read, |
1427 | .write = do_sync_write, | 1437 | .write = do_sync_write, |
1428 | .aio_read = generic_file_aio_read, | 1438 | .aio_read = generic_file_aio_read, |
1429 | .aio_write = generic_file_aio_write_nolock, | 1439 | .aio_write = generic_file_aio_write_nolock, |
1430 | .mmap = generic_file_mmap, | 1440 | .mmap = generic_file_mmap, |
1431 | .fsync = block_fsync, | 1441 | .fsync = block_fsync, |
1432 | .unlocked_ioctl = block_ioctl, | 1442 | .unlocked_ioctl = block_ioctl, |
1433 | #ifdef CONFIG_COMPAT | 1443 | #ifdef CONFIG_COMPAT |
1434 | .compat_ioctl = compat_blkdev_ioctl, | 1444 | .compat_ioctl = compat_blkdev_ioctl, |
1435 | #endif | 1445 | #endif |
1436 | .splice_read = generic_file_splice_read, | 1446 | .splice_read = generic_file_splice_read, |
1437 | .splice_write = generic_file_splice_write, | 1447 | .splice_write = generic_file_splice_write, |
1438 | }; | 1448 | }; |
1439 | 1449 | ||
1440 | int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) | 1450 | int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) |
1441 | { | 1451 | { |
1442 | int res; | 1452 | int res; |
1443 | mm_segment_t old_fs = get_fs(); | 1453 | mm_segment_t old_fs = get_fs(); |
1444 | set_fs(KERNEL_DS); | 1454 | set_fs(KERNEL_DS); |
1445 | res = blkdev_ioctl(bdev, 0, cmd, arg); | 1455 | res = blkdev_ioctl(bdev, 0, cmd, arg); |
1446 | set_fs(old_fs); | 1456 | set_fs(old_fs); |
1447 | return res; | 1457 | return res; |
1448 | } | 1458 | } |
1449 | 1459 | ||
1450 | EXPORT_SYMBOL(ioctl_by_bdev); | 1460 | EXPORT_SYMBOL(ioctl_by_bdev); |
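
Because the call runs under KERNEL_DS, a kernel buffer may be passed where the ioctl normally expects a user pointer; a sketch using HDIO_GETGEO as an illustrative command:

	struct hd_geometry geo;		/* from <linux/hdreg.h> */
	int err;

	err = ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)&geo);
	if (!err)
		printk(KERN_DEBUG "partition starts at sector %lu\n", geo.start);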
1451 | 1461 | ||
1452 | /** | 1462 | /** |
1453 | * lookup_bdev - lookup a struct block_device by name | 1463 | * lookup_bdev - lookup a struct block_device by name |
1454 | * @pathname: special file representing the block device | 1464 | * @pathname: special file representing the block device |
1455 | * | 1465 | * |
1456 | * Get a reference to the blockdevice at @pathname in the current | 1466 | * Get a reference to the blockdevice at @pathname in the current |
1457 | * namespace if possible and return it. Return ERR_PTR(error) | 1467 | * namespace if possible and return it. Return ERR_PTR(error) |
1458 | * otherwise. | 1468 | * otherwise. |
1459 | */ | 1469 | */ |
1460 | struct block_device *lookup_bdev(const char *pathname) | 1470 | struct block_device *lookup_bdev(const char *pathname) |
1461 | { | 1471 | { |
1462 | struct block_device *bdev; | 1472 | struct block_device *bdev; |
1463 | struct inode *inode; | 1473 | struct inode *inode; |
1464 | struct path path; | 1474 | struct path path; |
1465 | int error; | 1475 | int error; |
1466 | 1476 | ||
1467 | if (!pathname || !*pathname) | 1477 | if (!pathname || !*pathname) |
1468 | return ERR_PTR(-EINVAL); | 1478 | return ERR_PTR(-EINVAL); |
1469 | 1479 | ||
1470 | error = kern_path(pathname, LOOKUP_FOLLOW, &path); | 1480 | error = kern_path(pathname, LOOKUP_FOLLOW, &path); |
1471 | if (error) | 1481 | if (error) |
1472 | return ERR_PTR(error); | 1482 | return ERR_PTR(error); |
1473 | 1483 | ||
1474 | inode = path.dentry->d_inode; | 1484 | inode = path.dentry->d_inode; |
1475 | error = -ENOTBLK; | 1485 | error = -ENOTBLK; |
1476 | if (!S_ISBLK(inode->i_mode)) | 1486 | if (!S_ISBLK(inode->i_mode)) |
1477 | goto fail; | 1487 | goto fail; |
1478 | error = -EACCES; | 1488 | error = -EACCES; |
1479 | if (path.mnt->mnt_flags & MNT_NODEV) | 1489 | if (path.mnt->mnt_flags & MNT_NODEV) |
1480 | goto fail; | 1490 | goto fail; |
1481 | error = -ENOMEM; | 1491 | error = -ENOMEM; |
1482 | bdev = bd_acquire(inode); | 1492 | bdev = bd_acquire(inode); |
1483 | if (!bdev) | 1493 | if (!bdev) |
1484 | goto fail; | 1494 | goto fail; |
1485 | out: | 1495 | out: |
1486 | path_put(&path); | 1496 | path_put(&path); |
1487 | return bdev; | 1497 | return bdev; |
1488 | fail: | 1498 | fail: |
1489 | bdev = ERR_PTR(error); | 1499 | bdev = ERR_PTR(error); |
1490 | goto out; | 1500 | goto out; |
1491 | } | 1501 | } |
1492 | EXPORT_SYMBOL(lookup_bdev); | 1502 | EXPORT_SYMBOL(lookup_bdev); |
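
A usage sketch (the path is illustrative); the reference obtained here is dropped with bdput():

	struct block_device *bdev = lookup_bdev("/dev/sda1");

	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... inspect bdev, e.g. bdev->bd_dev ... */
	bdput(bdev);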
1493 | 1503 | ||
1494 | /** | 1504 | /** |
1495 | * open_bdev_exclusive - open a block device by name and set it up for use | 1505 | * open_bdev_exclusive - open a block device by name and set it up for use |
1496 | * | 1506 | * |
1497 | * @path: special file representing the block device | 1507 | * @path: special file representing the block device |
1498 | * @mode: FMODE_... combination to be used | 1508 | * @mode: FMODE_... combination to be used |
1499 | * @holder: owner for exclusion | 1509 | * @holder: owner for exclusion |
1500 | * | 1510 | * |
1501 | * Open the blockdevice described by the special file at @path, claim it | 1511 | * Open the blockdevice described by the special file at @path, claim it |
1502 | * for @holder. | 1512 | * for @holder. |
1503 | */ | 1513 | */ |
1504 | struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) | 1514 | struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) |
1505 | { | 1515 | { |
1506 | struct block_device *bdev; | 1516 | struct block_device *bdev; |
1507 | int error = 0; | 1517 | int error = 0; |
1508 | 1518 | ||
1509 | bdev = lookup_bdev(path); | 1519 | bdev = lookup_bdev(path); |
1510 | if (IS_ERR(bdev)) | 1520 | if (IS_ERR(bdev)) |
1511 | return bdev; | 1521 | return bdev; |
1512 | 1522 | ||
1513 | error = blkdev_get(bdev, mode); | 1523 | error = blkdev_get(bdev, mode); |
1514 | if (error) | 1524 | if (error) |
1515 | return ERR_PTR(error); | 1525 | return ERR_PTR(error); |
1516 | error = -EACCES; | 1526 | error = -EACCES; |
1517 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) | 1527 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) |
1518 | goto blkdev_put; | 1528 | goto blkdev_put; |
1519 | error = bd_claim(bdev, holder); | 1529 | error = bd_claim(bdev, holder); |
1520 | if (error) | 1530 | if (error) |
1521 | goto blkdev_put; | 1531 | goto blkdev_put; |
1522 | 1532 | ||
1523 | return bdev; | 1533 | return bdev; |
1524 | 1534 | ||
1525 | blkdev_put: | 1535 | blkdev_put: |
1526 | blkdev_put(bdev, mode); | 1536 | blkdev_put(bdev, mode); |
1527 | return ERR_PTR(error); | 1537 | return ERR_PTR(error); |
1528 | } | 1538 | } |
1529 | 1539 | ||
1530 | EXPORT_SYMBOL(open_bdev_exclusive); | 1540 | EXPORT_SYMBOL(open_bdev_exclusive); |
1531 | 1541 | ||
1532 | /** | 1542 | /** |
1533 | * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() | 1543 | * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() |
1534 | * | 1544 | * |
1535 | * @bdev: blockdevice to close | 1545 | * @bdev: blockdevice to close |
1536 | * @mode: mode, must match that used to open. | 1546 | * @mode: mode, must match that used to open. |
1537 | * | 1547 | * |
1538 | * This is the counterpart to open_bdev_exclusive(). | 1548 | * This is the counterpart to open_bdev_exclusive(). |
1539 | */ | 1549 | */ |
1540 | void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) | 1550 | void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) |
1541 | { | 1551 | { |
1542 | bd_release(bdev); | 1552 | bd_release(bdev); |
1543 | blkdev_put(bdev, mode); | 1553 | blkdev_put(bdev, mode); |
1544 | } | 1554 | } |
1545 | 1555 | ||
1546 | EXPORT_SYMBOL(close_bdev_exclusive); | 1556 | EXPORT_SYMBOL(close_bdev_exclusive); |
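
A sketch of the typical pairing, in the style of a filesystem opening its backing device; the holder only needs to be a unique cookie (a file_system_type pointer is assumed here, and the path is illustrative):

	struct block_device *bdev;

	bdev = open_bdev_exclusive("/dev/loop0", FMODE_READ | FMODE_WRITE,
				   fs_type);	/* fs_type: assumed holder cookie */
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... set up the superblock against bdev ... */
	close_bdev_exclusive(bdev, FMODE_READ | FMODE_WRITE);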
1547 | 1557 | ||
1548 | int __invalidate_device(struct block_device *bdev) | 1558 | int __invalidate_device(struct block_device *bdev) |
1549 | { | 1559 | { |
1550 | struct super_block *sb = get_super(bdev); | 1560 | struct super_block *sb = get_super(bdev); |
1551 | int res = 0; | 1561 | int res = 0; |
1552 | 1562 | ||
1553 | if (sb) { | 1563 | if (sb) { |
1554 | /* | 1564 | /* |
1555 | * no need to lock the super, get_super holds the | 1565 | * no need to lock the super, get_super holds the |
1556 | * read mutex so the filesystem cannot go away | 1566 | * read mutex so the filesystem cannot go away |
1557 | * under us (->put_super runs with the write lock | 1567 | * under us (->put_super runs with the write lock |
1558 | * held). | 1568 | * held). |
1559 | */ | 1569 | */ |
1560 | shrink_dcache_sb(sb); | 1570 | shrink_dcache_sb(sb); |
1561 | res = invalidate_inodes(sb); | 1571 | res = invalidate_inodes(sb); |
1562 | drop_super(sb); | 1572 | drop_super(sb); |
1563 | } | 1573 | } |
1564 | invalidate_bdev(bdev); | 1574 | invalidate_bdev(bdev); |
1565 | return res; | 1575 | return res; |
1566 | } | 1576 | } |
1567 | EXPORT_SYMBOL(__invalidate_device); | 1577 | EXPORT_SYMBOL(__invalidate_device); |
1568 | 1578 |
include/linux/fs.h
1 | #ifndef _LINUX_FS_H | 1 | #ifndef _LINUX_FS_H |
2 | #define _LINUX_FS_H | 2 | #define _LINUX_FS_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * This file has definitions for some important file table | 5 | * This file has definitions for some important file table |
6 | * structures etc. | 6 | * structures etc. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/limits.h> | 9 | #include <linux/limits.h> |
10 | #include <linux/ioctl.h> | 10 | #include <linux/ioctl.h> |
11 | 11 | ||
12 | /* | 12 | /* |
13 | * It's silly to have NR_OPEN bigger than NR_FILE, but you can change | 13 | * It's silly to have NR_OPEN bigger than NR_FILE, but you can change |
14 | * the file limit at runtime and only root can increase the per-process | 14 | * the file limit at runtime and only root can increase the per-process |
15 | * nr_file rlimit, so it's safe to set up a ridiculously high absolute | 15 | * nr_file rlimit, so it's safe to set up a ridiculously high absolute |
16 | * upper limit on files-per-process. | 16 | * upper limit on files-per-process. |
17 | * | 17 | * |
18 | * Some programs (notably those using select()) may have to be | 18 | * Some programs (notably those using select()) may have to be |
19 | * recompiled to take full advantage of the new limits.. | 19 | * recompiled to take full advantage of the new limits.. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | /* Fixed constants first: */ | 22 | /* Fixed constants first: */ |
23 | #undef NR_OPEN | 23 | #undef NR_OPEN |
24 | #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ | 24 | #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ |
25 | 25 | ||
26 | #define BLOCK_SIZE_BITS 10 | 26 | #define BLOCK_SIZE_BITS 10 |
27 | #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) | 27 | #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) |
28 | 28 | ||
29 | #define SEEK_SET 0 /* seek relative to beginning of file */ | 29 | #define SEEK_SET 0 /* seek relative to beginning of file */ |
30 | #define SEEK_CUR 1 /* seek relative to current file position */ | 30 | #define SEEK_CUR 1 /* seek relative to current file position */ |
31 | #define SEEK_END 2 /* seek relative to end of file */ | 31 | #define SEEK_END 2 /* seek relative to end of file */ |
32 | #define SEEK_MAX SEEK_END | 32 | #define SEEK_MAX SEEK_END |
33 | 33 | ||
34 | /* And dynamically-tunable limits and defaults: */ | 34 | /* And dynamically-tunable limits and defaults: */ |
35 | struct files_stat_struct { | 35 | struct files_stat_struct { |
36 | int nr_files; /* read only */ | 36 | int nr_files; /* read only */ |
37 | int nr_free_files; /* read only */ | 37 | int nr_free_files; /* read only */ |
38 | int max_files; /* tunable */ | 38 | int max_files; /* tunable */ |
39 | }; | 39 | }; |
40 | 40 | ||
41 | struct inodes_stat_t { | 41 | struct inodes_stat_t { |
42 | int nr_inodes; | 42 | int nr_inodes; |
43 | int nr_unused; | 43 | int nr_unused; |
44 | int dummy[5]; /* padding for sysctl ABI compatibility */ | 44 | int dummy[5]; /* padding for sysctl ABI compatibility */ |
45 | }; | 45 | }; |
46 | 46 | ||
47 | 47 | ||
48 | #define NR_FILE 8192 /* this can well be larger on a larger system */ | 48 | #define NR_FILE 8192 /* this can well be larger on a larger system */ |
49 | 49 | ||
50 | #define MAY_EXEC 1 | 50 | #define MAY_EXEC 1 |
51 | #define MAY_WRITE 2 | 51 | #define MAY_WRITE 2 |
52 | #define MAY_READ 4 | 52 | #define MAY_READ 4 |
53 | #define MAY_APPEND 8 | 53 | #define MAY_APPEND 8 |
54 | #define MAY_ACCESS 16 | 54 | #define MAY_ACCESS 16 |
55 | #define MAY_OPEN 32 | 55 | #define MAY_OPEN 32 |
56 | 56 | ||
57 | /* | 57 | /* |
58 | * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond | 58 | * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond |
59 | * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() | 59 | * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() |
60 | */ | 60 | */ |
61 | 61 | ||
62 | /* file is open for reading */ | 62 | /* file is open for reading */ |
63 | #define FMODE_READ ((__force fmode_t)1) | 63 | #define FMODE_READ ((__force fmode_t)1) |
64 | /* file is open for writing */ | 64 | /* file is open for writing */ |
65 | #define FMODE_WRITE ((__force fmode_t)2) | 65 | #define FMODE_WRITE ((__force fmode_t)2) |
66 | /* file is seekable */ | 66 | /* file is seekable */ |
67 | #define FMODE_LSEEK ((__force fmode_t)4) | 67 | #define FMODE_LSEEK ((__force fmode_t)4) |
68 | /* file can be accessed using pread */ | 68 | /* file can be accessed using pread */ |
69 | #define FMODE_PREAD ((__force fmode_t)8) | 69 | #define FMODE_PREAD ((__force fmode_t)8) |
70 | /* file can be accessed using pwrite */ | 70 | /* file can be accessed using pwrite */ |
71 | #define FMODE_PWRITE ((__force fmode_t)16) | 71 | #define FMODE_PWRITE ((__force fmode_t)16) |
72 | /* File is opened for execution with sys_execve / sys_uselib */ | 72 | /* File is opened for execution with sys_execve / sys_uselib */ |
73 | #define FMODE_EXEC ((__force fmode_t)32) | 73 | #define FMODE_EXEC ((__force fmode_t)32) |
74 | /* File is opened with O_NDELAY (only set for block devices) */ | 74 | /* File is opened with O_NDELAY (only set for block devices) */ |
75 | #define FMODE_NDELAY ((__force fmode_t)64) | 75 | #define FMODE_NDELAY ((__force fmode_t)64) |
76 | /* File is opened with O_EXCL (only set for block devices) */ | 76 | /* File is opened with O_EXCL (only set for block devices) */ |
77 | #define FMODE_EXCL ((__force fmode_t)128) | 77 | #define FMODE_EXCL ((__force fmode_t)128) |
78 | /* File is opened using open(.., 3, ..) and is writeable only for ioctls | 78 | /* File is opened using open(.., 3, ..) and is writeable only for ioctls |
79 | (special hack for floppy.c) */ | 79 | (special hack for floppy.c) */ |
80 | #define FMODE_WRITE_IOCTL ((__force fmode_t)256) | 80 | #define FMODE_WRITE_IOCTL ((__force fmode_t)256) |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * Don't update ctime and mtime. | 83 | * Don't update ctime and mtime. |
84 | * | 84 | * |
85 | * Currently a special hack for the XFS open_by_handle ioctl, but we'll | 85 | * Currently a special hack for the XFS open_by_handle ioctl, but we'll |
86 | * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. | 86 | * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. |
87 | */ | 87 | */ |
88 | #define FMODE_NOCMTIME ((__force fmode_t)2048) | 88 | #define FMODE_NOCMTIME ((__force fmode_t)2048) |
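
Since these are bit flags in file->f_mode, they are tested and combined with bitwise operators; a small sketch (mydrv_may_write is hypothetical):

	static bool mydrv_may_write(struct file *filp)
	{
		/* ordinary write access; FMODE_WRITE_IOCTL alone
		 * permits ioctls but not write(2) */
		return (filp->f_mode & FMODE_WRITE) != 0;
	}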
89 | 89 | ||
90 | /* | 90 | /* |
91 | * The below are the various read and write types that we support. Some of | 91 | * The below are the various read and write types that we support. Some of |
92 | * them include behavioral modifiers that send information down to the | 92 | * them include behavioral modifiers that send information down to the |
93 | * block layer and IO scheduler. Terminology: | 93 | * block layer and IO scheduler. Terminology: |
94 | * | 94 | * |
95 | * The block layer uses device plugging to defer IO a little bit, in | 95 | * The block layer uses device plugging to defer IO a little bit, in |
96 | * the hope that we will see more IO very shortly. This increases | 96 | * the hope that we will see more IO very shortly. This increases |
97 | * coalescing of adjacent IO and thus reduces the number of IOs we | 97 | * coalescing of adjacent IO and thus reduces the number of IOs we |
98 | * have to send to the device. It also allows for better queuing, | 98 | * have to send to the device. It also allows for better queuing, |
99 | * if the IO isn't mergeable. If the caller is going to be waiting | 99 | * if the IO isn't mergeable. If the caller is going to be waiting |
100 | * for the IO, then he must ensure that the device is unplugged so | 100 | * for the IO, then he must ensure that the device is unplugged so |
101 | * that the IO is dispatched to the driver. | 101 | * that the IO is dispatched to the driver. |
102 | * | 102 | * |
103 | * All IO is handled async in Linux. This is fine for background | 103 | * All IO is handled async in Linux. This is fine for background |
104 | * writes, but for reads or writes that someone waits for completion | 104 | * writes, but for reads or writes that someone waits for completion |
105 | * on, we want to notify the block layer and IO scheduler so that they | 105 | * on, we want to notify the block layer and IO scheduler so that they |
106 | * know about it. That allows them to make better scheduling | 106 | * know about it. That allows them to make better scheduling |
107 | * decisions. So when the below references 'sync' and 'async', it | 107 | * decisions. So when the below references 'sync' and 'async', it |
108 | * is referencing this priority hint. | 108 | * is referencing this priority hint. |
109 | * | 109 | * |
110 | * With that in mind, the available types are: | 110 | * With that in mind, the available types are: |
111 | * | 111 | * |
112 | * READ A normal read operation. Device will be plugged. | 112 | * READ A normal read operation. Device will be plugged. |
113 | * READ_SYNC A synchronous read. Device is not plugged, caller can | 113 | * READ_SYNC A synchronous read. Device is not plugged, caller can |
114 | * immediately wait on this read without caring about | 114 | * immediately wait on this read without caring about |
115 | * unplugging. | 115 | * unplugging. |
116 | * READA Used for read-ahead operations. Lower priority, and the | 116 | * READA Used for read-ahead operations. Lower priority, and the |
117 | * block layer could (in theory) choose to ignore this | 117 | * block layer could (in theory) choose to ignore this |
118 | * request if it runs into resource problems. | 118 | * request if it runs into resource problems. |
119 | * WRITE A normal async write. Device will be plugged. | 119 | * WRITE A normal async write. Device will be plugged. |
120 | * SWRITE Like WRITE, but a special case for ll_rw_block() that | 120 | * SWRITE Like WRITE, but a special case for ll_rw_block() that |
121 | * tells it to lock the buffer first. Normally a buffer | 121 | * tells it to lock the buffer first. Normally a buffer |
122 | * must be locked before doing IO. | 122 | * must be locked before doing IO. |
123 | * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down | 123 | * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down |
124 | * the hint that someone will be waiting on this IO | 124 | * the hint that someone will be waiting on this IO |
125 | * shortly. The device must still be unplugged explicitly, | 125 | * shortly. The device must still be unplugged explicitly, |
126 | * WRITE_SYNC_PLUG does not do this as we could be | 126 | * WRITE_SYNC_PLUG does not do this as we could be |
127 | * submitting more writes before we actually wait on any | 127 | * submitting more writes before we actually wait on any |
128 | * of them. | 128 | * of them. |
129 | * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device | 129 | * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device |
130 | * immediately after submission. The write equivalent | 130 | * immediately after submission. The write equivalent |
131 | * of READ_SYNC. | 131 | * of READ_SYNC. |
132 | * WRITE_ODIRECT Special case write for O_DIRECT only. | 132 | * WRITE_ODIRECT Special case write for O_DIRECT only. |
133 | * SWRITE_SYNC | 133 | * SWRITE_SYNC |
134 | * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. | 134 | * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. |
135 | * See SWRITE. | 135 | * See SWRITE. |
136 | * WRITE_BARRIER Like WRITE, but tells the block layer that all | 136 | * WRITE_BARRIER Like WRITE, but tells the block layer that all |
137 | * previously submitted writes must be safely on storage | 137 | * previously submitted writes must be safely on storage |
138 | * before this one is started. Also guarantees that when | 138 | * before this one is started. Also guarantees that when |
139 | * this write is complete, it itself is also safely on | 139 | * this write is complete, it itself is also safely on |
140 | * storage. Prevents reordering of writes on both sides | 140 | * storage. Prevents reordering of writes on both sides |
141 | * of this IO. | 141 | * of this IO. |
142 | * | 142 | * |
143 | */ | 143 | */ |
144 | #define RW_MASK 1 | 144 | #define RW_MASK 1 |
145 | #define RWA_MASK 2 | 145 | #define RWA_MASK 2 |
146 | #define READ 0 | 146 | #define READ 0 |
147 | #define WRITE 1 | 147 | #define WRITE 1 |
148 | #define READA 2 /* read-ahead - don't block if no resources */ | 148 | #define READA 2 /* read-ahead - don't block if no resources */ |
149 | #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ | 149 | #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ |
150 | #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) | 150 | #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) |
151 | #define READ_META (READ | (1 << BIO_RW_META)) | 151 | #define READ_META (READ | (1 << BIO_RW_META)) |
152 | #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) | 152 | #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) |
153 | #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) | 153 | #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) |
154 | #define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) | 154 | #define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) |
155 | #define SWRITE_SYNC_PLUG \ | 155 | #define SWRITE_SYNC_PLUG \ |
156 | (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) | 156 | (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) |
157 | #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) | 157 | #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) |
158 | #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) | 158 | #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) |
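
To illustrate (a sketch, not part of this header): the type is passed as the rw argument of the submission helpers, so a caller about to wait picks a _SYNC variant:

	/* background write: may sit behind the queue plug */
	submit_bh(WRITE, bh);

	/* write we wait on immediately: passes the sync hint and
	 * unplugs the queue right after submission */
	submit_bh(WRITE_SYNC, bh);
	wait_on_buffer(bh);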
159 | 159 | ||
160 | /* | 160 | /* |
161 | * These aren't really reads or writes, they pass down information about | 161 | * These aren't really reads or writes, they pass down information about |
162 | * parts of device that are now unused by the file system. | 162 | * parts of device that are now unused by the file system. |
163 | */ | 163 | */ |
164 | #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) | 164 | #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) |
165 | #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) | 165 | #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) |
166 | 166 | ||
167 | #define SEL_IN 1 | 167 | #define SEL_IN 1 |
168 | #define SEL_OUT 2 | 168 | #define SEL_OUT 2 |
169 | #define SEL_EX 4 | 169 | #define SEL_EX 4 |
170 | 170 | ||
171 | /* public flags for file_system_type */ | 171 | /* public flags for file_system_type */ |
172 | #define FS_REQUIRES_DEV 1 | 172 | #define FS_REQUIRES_DEV 1 |
173 | #define FS_BINARY_MOUNTDATA 2 | 173 | #define FS_BINARY_MOUNTDATA 2 |
174 | #define FS_HAS_SUBTYPE 4 | 174 | #define FS_HAS_SUBTYPE 4 |
175 | #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ | 175 | #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ |
176 | #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() | 176 | #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() |
177 | * during rename() internally. | 177 | * during rename() internally. |
178 | */ | 178 | */ |
179 | 179 | ||
180 | /* | 180 | /* |
181 | * These are the fs-independent mount-flags: up to 32 flags are supported | 181 | * These are the fs-independent mount-flags: up to 32 flags are supported |
182 | */ | 182 | */ |
183 | #define MS_RDONLY 1 /* Mount read-only */ | 183 | #define MS_RDONLY 1 /* Mount read-only */ |
184 | #define MS_NOSUID 2 /* Ignore suid and sgid bits */ | 184 | #define MS_NOSUID 2 /* Ignore suid and sgid bits */ |
185 | #define MS_NODEV 4 /* Disallow access to device special files */ | 185 | #define MS_NODEV 4 /* Disallow access to device special files */ |
186 | #define MS_NOEXEC 8 /* Disallow program execution */ | 186 | #define MS_NOEXEC 8 /* Disallow program execution */ |
187 | #define MS_SYNCHRONOUS 16 /* Writes are synced at once */ | 187 | #define MS_SYNCHRONOUS 16 /* Writes are synced at once */ |
188 | #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ | 188 | #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ |
189 | #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ | 189 | #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ |
190 | #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ | 190 | #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ |
191 | #define MS_NOATIME 1024 /* Do not update access times. */ | 191 | #define MS_NOATIME 1024 /* Do not update access times. */ |
192 | #define MS_NODIRATIME 2048 /* Do not update directory access times */ | 192 | #define MS_NODIRATIME 2048 /* Do not update directory access times */ |
193 | #define MS_BIND 4096 | 193 | #define MS_BIND 4096 |
194 | #define MS_MOVE 8192 | 194 | #define MS_MOVE 8192 |
195 | #define MS_REC 16384 | 195 | #define MS_REC 16384 |
196 | #define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. | 196 | #define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. |
197 | MS_VERBOSE is deprecated. */ | 197 | MS_VERBOSE is deprecated. */ |
198 | #define MS_SILENT 32768 | 198 | #define MS_SILENT 32768 |
199 | #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ | 199 | #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ |
200 | #define MS_UNBINDABLE (1<<17) /* change to unbindable */ | 200 | #define MS_UNBINDABLE (1<<17) /* change to unbindable */ |
201 | #define MS_PRIVATE (1<<18) /* change to private */ | 201 | #define MS_PRIVATE (1<<18) /* change to private */ |
202 | #define MS_SLAVE (1<<19) /* change to slave */ | 202 | #define MS_SLAVE (1<<19) /* change to slave */ |
203 | #define MS_SHARED (1<<20) /* change to shared */ | 203 | #define MS_SHARED (1<<20) /* change to shared */ |
204 | #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ | 204 | #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ |
205 | #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ | 205 | #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ |
206 | #define MS_I_VERSION (1<<23) /* Update inode I_version field */ | 206 | #define MS_I_VERSION (1<<23) /* Update inode I_version field */ |
207 | #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ | 207 | #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ |
208 | #define MS_ACTIVE (1<<30) | 208 | #define MS_ACTIVE (1<<30) |
209 | #define MS_NOUSER (1<<31) | 209 | #define MS_NOUSER (1<<31) |
210 | 210 | ||
211 | /* | 211 | /* |
212 | * Superblock flags that can be altered by MS_REMOUNT | 212 | * Superblock flags that can be altered by MS_REMOUNT |
213 | */ | 213 | */ |
214 | #define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) | 214 | #define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) |
215 | 215 | ||
216 | /* | 216 | /* |
217 | * Old magic mount flag and mask | 217 | * Old magic mount flag and mask |
218 | */ | 218 | */ |
219 | #define MS_MGC_VAL 0xC0ED0000 | 219 | #define MS_MGC_VAL 0xC0ED0000 |
220 | #define MS_MGC_MSK 0xffff0000 | 220 | #define MS_MGC_MSK 0xffff0000 |
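
For reference, these MS_* values are exactly the flag bits userspace hands to mount(2). A minimal example (the /mnt path is just an assumption) remounting an existing filesystem read-only:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* MS_REMOUNT | MS_RDONLY alters the flags of an existing mount
	 * rather than creating a new one. */
	if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL) != 0)
		perror("mount");
	return 0;
}
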
221 | 221 | ||
222 | /* Inode flags - they have nothing to do with superblock flags now */ | 222 | /* Inode flags - they have nothing to do with superblock flags now */ |
223 | 223 | ||
224 | #define S_SYNC 1 /* Writes are synced at once */ | 224 | #define S_SYNC 1 /* Writes are synced at once */ |
225 | #define S_NOATIME 2 /* Do not update access times */ | 225 | #define S_NOATIME 2 /* Do not update access times */ |
226 | #define S_APPEND 4 /* Append-only file */ | 226 | #define S_APPEND 4 /* Append-only file */ |
227 | #define S_IMMUTABLE 8 /* Immutable file */ | 227 | #define S_IMMUTABLE 8 /* Immutable file */ |
228 | #define S_DEAD 16 /* removed, but still open directory */ | 228 | #define S_DEAD 16 /* removed, but still open directory */ |
229 | #define S_NOQUOTA 32 /* Inode is not counted to quota */ | 229 | #define S_NOQUOTA 32 /* Inode is not counted to quota */ |
230 | #define S_DIRSYNC 64 /* Directory modifications are synchronous */ | 230 | #define S_DIRSYNC 64 /* Directory modifications are synchronous */ |
231 | #define S_NOCMTIME 128 /* Do not update file c/mtime */ | 231 | #define S_NOCMTIME 128 /* Do not update file c/mtime */ |
232 | #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ | 232 | #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ |
233 | #define S_PRIVATE 512 /* Inode is fs-internal */ | 233 | #define S_PRIVATE 512 /* Inode is fs-internal */ |
234 | 234 | ||
235 | /* | 235 | /* |
236 | * Note that nosuid etc flags are inode-specific: setting some file-system | 236 | * Note that nosuid etc flags are inode-specific: setting some file-system |
237 | * flags just means all the inodes inherit those flags by default. It might be | 237 | * flags just means all the inodes inherit those flags by default. It might be |
238 | * possible to override it selectively if you really wanted to with some | 238 | * possible to override it selectively if you really wanted to with some |
239 | * ioctl() that is not currently implemented. | 239 | * ioctl() that is not currently implemented. |
240 | * | 240 | * |
241 | * Exception: MS_RDONLY is always applied to the entire file system. | 241 | * Exception: MS_RDONLY is always applied to the entire file system. |
242 | * | 242 | * |
243 | * Unfortunately, it is possible to change a filesystem's flags while it is | 243 | * Unfortunately, it is possible to change a filesystem's flags while it is |
244 | * mounted and has files in use. This means that the inodes will not have their | 244 | * mounted and has files in use. This means that the inodes will not have their |
245 | * i_flags updated. Hence, i_flags no longer inherits the superblock mount | 245 | * i_flags updated. Hence, i_flags no longer inherits the superblock mount |
246 | * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org | 246 | * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org |
247 | */ | 247 | */ |
248 | #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) | 248 | #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) |
249 | 249 | ||
250 | #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) | 250 | #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) |
251 | #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ | 251 | #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ |
252 | ((inode)->i_flags & S_SYNC)) | 252 | ((inode)->i_flags & S_SYNC)) |
253 | #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ | 253 | #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ |
254 | ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) | 254 | ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) |
255 | #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) | 255 | #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) |
256 | #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) | 256 | #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) |
257 | #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) | 257 | #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) |
258 | 258 | ||
259 | #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) | 259 | #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) |
260 | #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) | 260 | #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) |
261 | #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) | 261 | #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) |
262 | #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) | 262 | #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) |
263 | 263 | ||
264 | #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) | 264 | #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) |
265 | #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) | 265 | #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) |
266 | #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) | 266 | #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) |
267 | #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) | 267 | #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) |
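
These predicates are what filesystem code tests instead of poking at s_flags or i_flags directly (the whole point of the note above being that the two must be checked together). A hypothetical permission check in a write path might read:

/* Sketch: refuse modifications the per-mount or per-inode flags forbid.
 * my_may_modify is an illustrative name, not a real VFS helper. */
static int my_may_modify(struct inode *inode)
{
	if (IS_RDONLY(inode))
		return -EROFS;		/* read-only superblock */
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
		return -EPERM;		/* flag set on this inode */
	return 0;
}
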
268 | 268 | ||
269 | /* the read-only stuff doesn't really belong here, but any other place is | 269 | /* the read-only stuff doesn't really belong here, but any other place is |
270 | probably as bad and I don't want to create yet another include file. */ | 270 | probably as bad and I don't want to create yet another include file. */ |
271 | 271 | ||
272 | #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ | 272 | #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ |
273 | #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ | 273 | #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ |
274 | #define BLKRRPART _IO(0x12,95) /* re-read partition table */ | 274 | #define BLKRRPART _IO(0x12,95) /* re-read partition table */ |
275 | #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ | 275 | #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ |
276 | #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ | 276 | #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ |
277 | #define BLKRASET _IO(0x12,98) /* set read ahead for block device */ | 277 | #define BLKRASET _IO(0x12,98) /* set read ahead for block device */ |
278 | #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ | 278 | #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ |
279 | #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ | 279 | #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ |
280 | #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ | 280 | #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ |
281 | #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ | 281 | #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ |
282 | #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ | 282 | #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ |
283 | #define BLKSSZGET _IO(0x12,104)/* get block device sector size */ | 283 | #define BLKSSZGET _IO(0x12,104)/* get block device sector size */ |
284 | #if 0 | 284 | #if 0 |
285 | #define BLKPG _IO(0x12,105)/* See blkpg.h */ | 285 | #define BLKPG _IO(0x12,105)/* See blkpg.h */ |
286 | 286 | ||
287 | /* Some people are morons. Do not use sizeof! */ | 287 | /* Some people are morons. Do not use sizeof! */ |
288 | 288 | ||
289 | #define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ | 289 | #define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ |
290 | #define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ | 290 | #define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ |
291 | /* This was here just to show that the number is taken - | 291 | /* This was here just to show that the number is taken - |
292 | probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ | 292 | probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ |
293 | #endif | 293 | #endif |
294 | /* A jump here: 108-111 have been used for various private purposes. */ | 294 | /* A jump here: 108-111 have been used for various private purposes. */ |
295 | #define BLKBSZGET _IOR(0x12,112,size_t) | 295 | #define BLKBSZGET _IOR(0x12,112,size_t) |
296 | #define BLKBSZSET _IOW(0x12,113,size_t) | 296 | #define BLKBSZSET _IOW(0x12,113,size_t) |
297 | #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ | 297 | #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ |
298 | #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) | 298 | #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) |
299 | #define BLKTRACESTART _IO(0x12,116) | 299 | #define BLKTRACESTART _IO(0x12,116) |
300 | #define BLKTRACESTOP _IO(0x12,117) | 300 | #define BLKTRACESTOP _IO(0x12,117) |
301 | #define BLKTRACETEARDOWN _IO(0x12,118) | 301 | #define BLKTRACETEARDOWN _IO(0x12,118) |
302 | #define BLKDISCARD _IO(0x12,119) | 302 | #define BLKDISCARD _IO(0x12,119) |
303 | 303 | ||
304 | #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ | 304 | #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ |
305 | #define FIBMAP _IO(0x00,1) /* bmap access */ | 305 | #define FIBMAP _IO(0x00,1) /* bmap access */ |
306 | #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ | 306 | #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ |
307 | #define FIFREEZE _IOWR('X', 119, int) /* Freeze */ | 307 | #define FIFREEZE _IOWR('X', 119, int) /* Freeze */ |
308 | #define FITHAW _IOWR('X', 120, int) /* Thaw */ | 308 | #define FITHAW _IOWR('X', 120, int) /* Thaw */ |
309 | 309 | ||
310 | #define FS_IOC_GETFLAGS _IOR('f', 1, long) | 310 | #define FS_IOC_GETFLAGS _IOR('f', 1, long) |
311 | #define FS_IOC_SETFLAGS _IOW('f', 2, long) | 311 | #define FS_IOC_SETFLAGS _IOW('f', 2, long) |
312 | #define FS_IOC_GETVERSION _IOR('v', 1, long) | 312 | #define FS_IOC_GETVERSION _IOR('v', 1, long) |
313 | #define FS_IOC_SETVERSION _IOW('v', 2, long) | 313 | #define FS_IOC_SETVERSION _IOW('v', 2, long) |
314 | #define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap) | 314 | #define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap) |
315 | #define FS_IOC32_GETFLAGS _IOR('f', 1, int) | 315 | #define FS_IOC32_GETFLAGS _IOR('f', 1, int) |
316 | #define FS_IOC32_SETFLAGS _IOW('f', 2, int) | 316 | #define FS_IOC32_SETFLAGS _IOW('f', 2, int) |
317 | #define FS_IOC32_GETVERSION _IOR('v', 1, int) | 317 | #define FS_IOC32_GETVERSION _IOR('v', 1, int) |
318 | #define FS_IOC32_SETVERSION _IOW('v', 2, int) | 318 | #define FS_IOC32_SETVERSION _IOW('v', 2, int) |
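
A small userspace probe shows how these ioctl numbers are consumed; per the comment above, BLKGETSIZE64 fills a u64 with the device size in bytes (the device node is assumed to be passed as argv[1]):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	uint64_t bytes;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, BLKGETSIZE64, &bytes) != 0) {
		perror("BLKGETSIZE64");
		return 1;
	}
	printf("%s: %llu bytes\n", argv[1], (unsigned long long)bytes);
	close(fd);
	return 0;
}
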
319 | 319 | ||
320 | /* | 320 | /* |
321 | * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) | 321 | * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) |
322 | */ | 322 | */ |
323 | #define FS_SECRM_FL 0x00000001 /* Secure deletion */ | 323 | #define FS_SECRM_FL 0x00000001 /* Secure deletion */ |
324 | #define FS_UNRM_FL 0x00000002 /* Undelete */ | 324 | #define FS_UNRM_FL 0x00000002 /* Undelete */ |
325 | #define FS_COMPR_FL 0x00000004 /* Compress file */ | 325 | #define FS_COMPR_FL 0x00000004 /* Compress file */ |
326 | #define FS_SYNC_FL 0x00000008 /* Synchronous updates */ | 326 | #define FS_SYNC_FL 0x00000008 /* Synchronous updates */ |
327 | #define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ | 327 | #define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ |
328 | #define FS_APPEND_FL 0x00000020 /* writes to file may only append */ | 328 | #define FS_APPEND_FL 0x00000020 /* writes to file may only append */ |
329 | #define FS_NODUMP_FL 0x00000040 /* do not dump file */ | 329 | #define FS_NODUMP_FL 0x00000040 /* do not dump file */ |
330 | #define FS_NOATIME_FL 0x00000080 /* do not update atime */ | 330 | #define FS_NOATIME_FL 0x00000080 /* do not update atime */ |
331 | /* Reserved for compression usage... */ | 331 | /* Reserved for compression usage... */ |
332 | #define FS_DIRTY_FL 0x00000100 | 332 | #define FS_DIRTY_FL 0x00000100 |
333 | #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ | 333 | #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ |
334 | #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ | 334 | #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ |
335 | #define FS_ECOMPR_FL 0x00000800 /* Compression error */ | 335 | #define FS_ECOMPR_FL 0x00000800 /* Compression error */ |
336 | /* End compression flags --- maybe not all used */ | 336 | /* End compression flags --- maybe not all used */ |
337 | #define FS_BTREE_FL 0x00001000 /* btree format dir */ | 337 | #define FS_BTREE_FL 0x00001000 /* btree format dir */ |
338 | #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ | 338 | #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ |
339 | #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ | 339 | #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ |
340 | #define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */ | 340 | #define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */ |
341 | #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ | 341 | #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ |
342 | #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ | 342 | #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ |
343 | #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ | 343 | #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ |
344 | #define FS_EXTENT_FL 0x00080000 /* Extents */ | 344 | #define FS_EXTENT_FL 0x00080000 /* Extents */ |
345 | #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ | 345 | #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ |
346 | #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ | 346 | #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ |
347 | 347 | ||
348 | #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ | 348 | #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ |
349 | #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ | 349 | #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ |
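
These are the bits behind chattr/lsattr. A sketch of what chattr +i amounts to, using the FS_IOC_GETFLAGS/SETFLAGS pair declared above (set_immutable is an illustrative name; the flag word is a long to match the _IOR/_IOW definitions in this header):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

/* Read-modify-write the inode flag word to mark a file immutable. */
static int set_immutable(const char *path)
{
	long flags;
	int ret = -1;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
		flags |= FS_IMMUTABLE_FL;
		ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}
	close(fd);
	return ret;
}
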
350 | 350 | ||
351 | 351 | ||
352 | #define SYNC_FILE_RANGE_WAIT_BEFORE 1 | 352 | #define SYNC_FILE_RANGE_WAIT_BEFORE 1 |
353 | #define SYNC_FILE_RANGE_WRITE 2 | 353 | #define SYNC_FILE_RANGE_WRITE 2 |
354 | #define SYNC_FILE_RANGE_WAIT_AFTER 4 | 354 | #define SYNC_FILE_RANGE_WAIT_AFTER 4 |
355 | 355 | ||
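
These three bits are the flags argument of sync_file_range(2). A sketch of the common all-three combination, which behaves like a ranged data writeback for already-allocated blocks (flush_first_mb is an illustrative name):

#define _GNU_SOURCE
#include <fcntl.h>

/* Wait for any in-flight writeback of the first megabyte, start new
 * writeback, then wait for it to complete. */
static int flush_first_mb(int fd)
{
	return sync_file_range(fd, 0, 1 << 20,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}
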
356 | #ifdef __KERNEL__ | 356 | #ifdef __KERNEL__ |
357 | 357 | ||
358 | #include <linux/linkage.h> | 358 | #include <linux/linkage.h> |
359 | #include <linux/wait.h> | 359 | #include <linux/wait.h> |
360 | #include <linux/types.h> | 360 | #include <linux/types.h> |
361 | #include <linux/kdev_t.h> | 361 | #include <linux/kdev_t.h> |
362 | #include <linux/dcache.h> | 362 | #include <linux/dcache.h> |
363 | #include <linux/path.h> | 363 | #include <linux/path.h> |
364 | #include <linux/stat.h> | 364 | #include <linux/stat.h> |
365 | #include <linux/cache.h> | 365 | #include <linux/cache.h> |
366 | #include <linux/kobject.h> | 366 | #include <linux/kobject.h> |
367 | #include <linux/list.h> | 367 | #include <linux/list.h> |
368 | #include <linux/radix-tree.h> | 368 | #include <linux/radix-tree.h> |
369 | #include <linux/prio_tree.h> | 369 | #include <linux/prio_tree.h> |
370 | #include <linux/init.h> | 370 | #include <linux/init.h> |
371 | #include <linux/pid.h> | 371 | #include <linux/pid.h> |
372 | #include <linux/mutex.h> | 372 | #include <linux/mutex.h> |
373 | #include <linux/capability.h> | 373 | #include <linux/capability.h> |
374 | #include <linux/semaphore.h> | 374 | #include <linux/semaphore.h> |
375 | #include <linux/fiemap.h> | 375 | #include <linux/fiemap.h> |
376 | 376 | ||
377 | #include <asm/atomic.h> | 377 | #include <asm/atomic.h> |
378 | #include <asm/byteorder.h> | 378 | #include <asm/byteorder.h> |
379 | 379 | ||
380 | struct export_operations; | 380 | struct export_operations; |
381 | struct hd_geometry; | 381 | struct hd_geometry; |
382 | struct iovec; | 382 | struct iovec; |
383 | struct nameidata; | 383 | struct nameidata; |
384 | struct kiocb; | 384 | struct kiocb; |
385 | struct pipe_inode_info; | 385 | struct pipe_inode_info; |
386 | struct poll_table_struct; | 386 | struct poll_table_struct; |
387 | struct kstatfs; | 387 | struct kstatfs; |
388 | struct vm_area_struct; | 388 | struct vm_area_struct; |
389 | struct vfsmount; | 389 | struct vfsmount; |
390 | struct cred; | 390 | struct cred; |
391 | 391 | ||
392 | extern void __init inode_init(void); | 392 | extern void __init inode_init(void); |
393 | extern void __init inode_init_early(void); | 393 | extern void __init inode_init_early(void); |
394 | extern void __init files_init(unsigned long); | 394 | extern void __init files_init(unsigned long); |
395 | 395 | ||
396 | extern struct files_stat_struct files_stat; | 396 | extern struct files_stat_struct files_stat; |
397 | extern int get_max_files(void); | 397 | extern int get_max_files(void); |
398 | extern int sysctl_nr_open; | 398 | extern int sysctl_nr_open; |
399 | extern struct inodes_stat_t inodes_stat; | 399 | extern struct inodes_stat_t inodes_stat; |
400 | extern int leases_enable, lease_break_time; | 400 | extern int leases_enable, lease_break_time; |
401 | #ifdef CONFIG_DNOTIFY | 401 | #ifdef CONFIG_DNOTIFY |
402 | extern int dir_notify_enable; | 402 | extern int dir_notify_enable; |
403 | #endif | 403 | #endif |
404 | 404 | ||
405 | struct buffer_head; | 405 | struct buffer_head; |
406 | typedef int (get_block_t)(struct inode *inode, sector_t iblock, | 406 | typedef int (get_block_t)(struct inode *inode, sector_t iblock, |
407 | struct buffer_head *bh_result, int create); | 407 | struct buffer_head *bh_result, int create); |
408 | typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, | 408 | typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, |
409 | ssize_t bytes, void *private); | 409 | ssize_t bytes, void *private); |
410 | 410 | ||
411 | /* | 411 | /* |
412 | * Attribute flags. These should be or-ed together to figure out what | 412 | * Attribute flags. These should be or-ed together to figure out what |
413 | * has been changed! | 413 | * has been changed! |
414 | */ | 414 | */ |
415 | #define ATTR_MODE (1 << 0) | 415 | #define ATTR_MODE (1 << 0) |
416 | #define ATTR_UID (1 << 1) | 416 | #define ATTR_UID (1 << 1) |
417 | #define ATTR_GID (1 << 2) | 417 | #define ATTR_GID (1 << 2) |
418 | #define ATTR_SIZE (1 << 3) | 418 | #define ATTR_SIZE (1 << 3) |
419 | #define ATTR_ATIME (1 << 4) | 419 | #define ATTR_ATIME (1 << 4) |
420 | #define ATTR_MTIME (1 << 5) | 420 | #define ATTR_MTIME (1 << 5) |
421 | #define ATTR_CTIME (1 << 6) | 421 | #define ATTR_CTIME (1 << 6) |
422 | #define ATTR_ATIME_SET (1 << 7) | 422 | #define ATTR_ATIME_SET (1 << 7) |
423 | #define ATTR_MTIME_SET (1 << 8) | 423 | #define ATTR_MTIME_SET (1 << 8) |
424 | #define ATTR_FORCE (1 << 9) /* Not a change, but force the change */ | 424 | #define ATTR_FORCE (1 << 9) /* Not a change, but force the change */ |
425 | #define ATTR_ATTR_FLAG (1 << 10) | 425 | #define ATTR_ATTR_FLAG (1 << 10) |
426 | #define ATTR_KILL_SUID (1 << 11) | 426 | #define ATTR_KILL_SUID (1 << 11) |
427 | #define ATTR_KILL_SGID (1 << 12) | 427 | #define ATTR_KILL_SGID (1 << 12) |
428 | #define ATTR_FILE (1 << 13) | 428 | #define ATTR_FILE (1 << 13) |
429 | #define ATTR_KILL_PRIV (1 << 14) | 429 | #define ATTR_KILL_PRIV (1 << 14) |
430 | #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ | 430 | #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ |
431 | #define ATTR_TIMES_SET (1 << 16) | 431 | #define ATTR_TIMES_SET (1 << 16) |
432 | 432 | ||
433 | /* | 433 | /* |
434 | * This is the Inode Attributes structure, used for notify_change(). It | 434 | * This is the Inode Attributes structure, used for notify_change(). It |
435 | * uses the above definitions as flags, to know which values have changed. | 435 | * uses the above definitions as flags, to know which values have changed. |
436 | * Also, in this manner, a Filesystem can look at only the values it cares | 436 | * Also, in this manner, a Filesystem can look at only the values it cares |
437 | * about. Basically, these are the attributes that the VFS layer can | 437 | * about. Basically, these are the attributes that the VFS layer can |
438 | * request to change from the FS layer. | 438 | * request to change from the FS layer. |
439 | * | 439 | * |
440 | * Derek Atkins <warlord@MIT.EDU> 94-10-20 | 440 | * Derek Atkins <warlord@MIT.EDU> 94-10-20 |
441 | */ | 441 | */ |
442 | struct iattr { | 442 | struct iattr { |
443 | unsigned int ia_valid; | 443 | unsigned int ia_valid; |
444 | umode_t ia_mode; | 444 | umode_t ia_mode; |
445 | uid_t ia_uid; | 445 | uid_t ia_uid; |
446 | gid_t ia_gid; | 446 | gid_t ia_gid; |
447 | loff_t ia_size; | 447 | loff_t ia_size; |
448 | struct timespec ia_atime; | 448 | struct timespec ia_atime; |
449 | struct timespec ia_mtime; | 449 | struct timespec ia_mtime; |
450 | struct timespec ia_ctime; | 450 | struct timespec ia_ctime; |
451 | 451 | ||
452 | /* | 452 | /* |
453 | * Not an attribute, but auxiliary info for filesystems wanting to | 453 | * Not an attribute, but auxiliary info for filesystems wanting to |
454 | * implement an ftruncate()-like method. NOTE: the filesystem should | 454 | * implement an ftruncate()-like method. NOTE: the filesystem should |
455 | * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). | 455 | * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). |
456 | */ | 456 | */ |
457 | struct file *ia_file; | 457 | struct file *ia_file; |
458 | }; | 458 | }; |
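
As a sketch of how this structure travels, a truncate-to-zero request sets ATTR_SIZE (plus the time flags) in ia_valid and hands the iattr to the filesystem through notify_change(); notify_change() fills in the actual timestamps when ATTR_CTIME/ATTR_MTIME are set without the *_SET variants. The helper name below is hypothetical, modeled on do_truncate():

/* Sketch, kernel side: ask a filesystem to truncate a file to zero.
 * Assumes the caller holds i_mutex, as notify_change() requires. */
static int my_truncate_to_zero(struct dentry *dentry)
{
	struct iattr newattrs;

	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME;
	newattrs.ia_size = 0;
	return notify_change(dentry, &newattrs);
}
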
459 | 459 | ||
460 | /* | 460 | /* |
461 | * Includes for diskquotas. | 461 | * Includes for diskquotas. |
462 | */ | 462 | */ |
463 | #include <linux/quota.h> | 463 | #include <linux/quota.h> |
464 | 464 | ||
465 | /** | 465 | /** |
466 | * enum positive_aop_returns - aop return codes with specific semantics | 466 | * enum positive_aop_returns - aop return codes with specific semantics |
467 | * | 467 | * |
468 | * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has | 468 | * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has |
469 | * completed, that the page is still locked, and | 469 | * completed, that the page is still locked, and |
470 | * should be considered active. The VM uses this hint | 470 | * should be considered active. The VM uses this hint |
471 | * to return the page to the active list -- it won't | 471 | * to return the page to the active list -- it won't |
472 | * be a candidate for writeback again in the near | 472 | * be a candidate for writeback again in the near |
473 | * future. Other callers must be careful to unlock | 473 | * future. Other callers must be careful to unlock |
474 | * the page if they get this return. Returned by | 474 | * the page if they get this return. Returned by |
475 | * writepage(). | 475 | * writepage(). |
476 | * | 476 | * |
477 | * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has | 477 | * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has |
478 | * unlocked it and the page might have been truncated. | 478 | * unlocked it and the page might have been truncated. |
479 | * The caller should back up to acquiring a new page and | 479 | * The caller should back up to acquiring a new page and |
480 | * trying again. The aop will be taking reasonable | 480 | * trying again. The aop will be taking reasonable |
481 | * precautions not to livelock. If the caller held a page | 481 | * precautions not to livelock. If the caller held a page |
482 | * reference, it should drop it before retrying. Returned | 482 | * reference, it should drop it before retrying. Returned |
483 | * by readpage(). | 483 | * by readpage(). |
484 | * | 484 | * |
485 | * address_space_operation functions return these large constants to indicate | 485 | * address_space_operation functions return these large constants to indicate |
486 | * special semantics to the caller. These are much larger than the bytes in a | 486 | * special semantics to the caller. These are much larger than the bytes in a |
487 | * page to allow for functions that return the number of bytes operated on in a | 487 | * page to allow for functions that return the number of bytes operated on in a |
488 | * given page. | 488 | * given page. |
489 | */ | 489 | */ |
490 | 490 | ||
491 | enum positive_aop_returns { | 491 | enum positive_aop_returns { |
492 | AOP_WRITEPAGE_ACTIVATE = 0x80000, | 492 | AOP_WRITEPAGE_ACTIVATE = 0x80000, |
493 | AOP_TRUNCATED_PAGE = 0x80001, | 493 | AOP_TRUNCATED_PAGE = 0x80001, |
494 | }; | 494 | }; |
495 | 495 | ||
496 | #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ | 496 | #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ |
497 | #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ | 497 | #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ |
498 | #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct | 498 | #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct |
499 | * helper code (eg buffer layer) | 499 | * helper code (eg buffer layer) |
500 | * to clear GFP_FS from alloc */ | 500 | * to clear GFP_FS from alloc */ |
501 | 501 | ||
502 | /* | 502 | /* |
503 | * oh the beauties of C type declarations. | 503 | * oh the beauties of C type declarations. |
504 | */ | 504 | */ |
505 | struct page; | 505 | struct page; |
506 | struct address_space; | 506 | struct address_space; |
507 | struct writeback_control; | 507 | struct writeback_control; |
508 | 508 | ||
509 | struct iov_iter { | 509 | struct iov_iter { |
510 | const struct iovec *iov; | 510 | const struct iovec *iov; |
511 | unsigned long nr_segs; | 511 | unsigned long nr_segs; |
512 | size_t iov_offset; | 512 | size_t iov_offset; |
513 | size_t count; | 513 | size_t count; |
514 | }; | 514 | }; |
515 | 515 | ||
516 | size_t iov_iter_copy_from_user_atomic(struct page *page, | 516 | size_t iov_iter_copy_from_user_atomic(struct page *page, |
517 | struct iov_iter *i, unsigned long offset, size_t bytes); | 517 | struct iov_iter *i, unsigned long offset, size_t bytes); |
518 | size_t iov_iter_copy_from_user(struct page *page, | 518 | size_t iov_iter_copy_from_user(struct page *page, |
519 | struct iov_iter *i, unsigned long offset, size_t bytes); | 519 | struct iov_iter *i, unsigned long offset, size_t bytes); |
520 | void iov_iter_advance(struct iov_iter *i, size_t bytes); | 520 | void iov_iter_advance(struct iov_iter *i, size_t bytes); |
521 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); | 521 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); |
522 | size_t iov_iter_single_seg_count(struct iov_iter *i); | 522 | size_t iov_iter_single_seg_count(struct iov_iter *i); |
523 | 523 | ||
524 | static inline void iov_iter_init(struct iov_iter *i, | 524 | static inline void iov_iter_init(struct iov_iter *i, |
525 | const struct iovec *iov, unsigned long nr_segs, | 525 | const struct iovec *iov, unsigned long nr_segs, |
526 | size_t count, size_t written) | 526 | size_t count, size_t written) |
527 | { | 527 | { |
528 | i->iov = iov; | 528 | i->iov = iov; |
529 | i->nr_segs = nr_segs; | 529 | i->nr_segs = nr_segs; |
530 | i->iov_offset = 0; | 530 | i->iov_offset = 0; |
531 | i->count = count + written; | 531 | i->count = count + written; |
532 | 532 | ||
533 | iov_iter_advance(i, written); | 533 | iov_iter_advance(i, written); |
534 | } | 534 | } |
535 | 535 | ||
536 | static inline size_t iov_iter_count(struct iov_iter *i) | 536 | static inline size_t iov_iter_count(struct iov_iter *i) |
537 | { | 537 | { |
538 | return i->count; | 538 | return i->count; |
539 | } | 539 | } |
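
Tying these together, a caller wraps a user iovec array in an iov_iter and then pulls bytes out of it page by page, which is the pattern the generic write path uses. A minimal single-segment sketch under those assumptions (copy_one_iovec is an illustrative name):

/* Sketch: copy one iovec's worth of user data into the start of a page. */
static size_t copy_one_iovec(struct page *page, const struct iovec *iov)
{
	struct iov_iter i;

	iov_iter_init(&i, iov, 1, iov->iov_len, 0);	/* nothing written yet */
	return iov_iter_copy_from_user(page, &i, 0, iov_iter_count(&i));
}
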
540 | 540 | ||
541 | /* | 541 | /* |
542 | * "descriptor" for what we're up to with a read. | 542 | * "descriptor" for what we're up to with a read. |
543 | * This allows us to use the same read code yet | 543 | * This allows us to use the same read code yet |
544 | * have multiple different users of the data that | 544 | * have multiple different users of the data that |
545 | * we read from a file. | 545 | * we read from a file. |
546 | * | 546 | * |
547 | * The simplest case just copies the data to user | 547 | * The simplest case just copies the data to user |
548 | * mode. | 548 | * mode. |
549 | */ | 549 | */ |
550 | typedef struct { | 550 | typedef struct { |
551 | size_t written; | 551 | size_t written; |
552 | size_t count; | 552 | size_t count; |
553 | union { | 553 | union { |
554 | char __user *buf; | 554 | char __user *buf; |
555 | void *data; | 555 | void *data; |
556 | } arg; | 556 | } arg; |
557 | int error; | 557 | int error; |
558 | } read_descriptor_t; | 558 | } read_descriptor_t; |
559 | 559 | ||
560 | typedef int (*read_actor_t)(read_descriptor_t *, struct page *, | 560 | typedef int (*read_actor_t)(read_descriptor_t *, struct page *, |
561 | unsigned long, unsigned long); | 561 | unsigned long, unsigned long); |
562 | 562 | ||
563 | struct address_space_operations { | 563 | struct address_space_operations { |
564 | int (*writepage)(struct page *page, struct writeback_control *wbc); | 564 | int (*writepage)(struct page *page, struct writeback_control *wbc); |
565 | int (*readpage)(struct file *, struct page *); | 565 | int (*readpage)(struct file *, struct page *); |
566 | void (*sync_page)(struct page *); | 566 | void (*sync_page)(struct page *); |
567 | 567 | ||
568 | /* Write back some dirty pages from this mapping. */ | 568 | /* Write back some dirty pages from this mapping. */ |
569 | int (*writepages)(struct address_space *, struct writeback_control *); | 569 | int (*writepages)(struct address_space *, struct writeback_control *); |
570 | 570 | ||
571 | /* Set a page dirty. Return true if this dirtied it */ | 571 | /* Set a page dirty. Return true if this dirtied it */ |
572 | int (*set_page_dirty)(struct page *page); | 572 | int (*set_page_dirty)(struct page *page); |
573 | 573 | ||
574 | int (*readpages)(struct file *filp, struct address_space *mapping, | 574 | int (*readpages)(struct file *filp, struct address_space *mapping, |
575 | struct list_head *pages, unsigned nr_pages); | 575 | struct list_head *pages, unsigned nr_pages); |
576 | 576 | ||
577 | int (*write_begin)(struct file *, struct address_space *mapping, | 577 | int (*write_begin)(struct file *, struct address_space *mapping, |
578 | loff_t pos, unsigned len, unsigned flags, | 578 | loff_t pos, unsigned len, unsigned flags, |
579 | struct page **pagep, void **fsdata); | 579 | struct page **pagep, void **fsdata); |
580 | int (*write_end)(struct file *, struct address_space *mapping, | 580 | int (*write_end)(struct file *, struct address_space *mapping, |
581 | loff_t pos, unsigned len, unsigned copied, | 581 | loff_t pos, unsigned len, unsigned copied, |
582 | struct page *page, void *fsdata); | 582 | struct page *page, void *fsdata); |
583 | 583 | ||
584 | /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ | 584 | /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ |
585 | sector_t (*bmap)(struct address_space *, sector_t); | 585 | sector_t (*bmap)(struct address_space *, sector_t); |
586 | void (*invalidatepage) (struct page *, unsigned long); | 586 | void (*invalidatepage) (struct page *, unsigned long); |
587 | int (*releasepage) (struct page *, gfp_t); | 587 | int (*releasepage) (struct page *, gfp_t); |
588 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 588 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
589 | loff_t offset, unsigned long nr_segs); | 589 | loff_t offset, unsigned long nr_segs); |
590 | int (*get_xip_mem)(struct address_space *, pgoff_t, int, | 590 | int (*get_xip_mem)(struct address_space *, pgoff_t, int, |
591 | void **, unsigned long *); | 591 | void **, unsigned long *); |
592 | /* migrate the contents of a page to the specified target */ | 592 | /* migrate the contents of a page to the specified target */ |
593 | int (*migratepage) (struct address_space *, | 593 | int (*migratepage) (struct address_space *, |
594 | struct page *, struct page *); | 594 | struct page *, struct page *); |
595 | int (*launder_page) (struct page *); | 595 | int (*launder_page) (struct page *); |
596 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, | 596 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, |
597 | unsigned long); | 597 | unsigned long); |
598 | }; | 598 | }; |
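
Most filesystems fill in only a handful of these methods. A sketch of about the smallest useful aops, for a read-only filesystem that routes readpage through the generic mpage helper; myfs_* and my_get_block (the filesystem's get_block_t block-mapping callback) are assumed names:

/* Sketch: minimal address_space_operations for a read-only filesystem. */
static int myfs_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, my_get_block);
}

static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,
};
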
599 | 599 | ||
600 | /* | 600 | /* |
601 | * pagecache_write_begin/pagecache_write_end must be used by general code | 601 | * pagecache_write_begin/pagecache_write_end must be used by general code |
602 | * to write into the pagecache. | 602 | * to write into the pagecache. |
603 | */ | 603 | */ |
604 | int pagecache_write_begin(struct file *, struct address_space *mapping, | 604 | int pagecache_write_begin(struct file *, struct address_space *mapping, |
605 | loff_t pos, unsigned len, unsigned flags, | 605 | loff_t pos, unsigned len, unsigned flags, |
606 | struct page **pagep, void **fsdata); | 606 | struct page **pagep, void **fsdata); |
607 | 607 | ||
608 | int pagecache_write_end(struct file *, struct address_space *mapping, | 608 | int pagecache_write_end(struct file *, struct address_space *mapping, |
609 | loff_t pos, unsigned len, unsigned copied, | 609 | loff_t pos, unsigned len, unsigned copied, |
610 | struct page *page, void *fsdata); | 610 | struct page *page, void *fsdata); |
611 | 611 | ||
612 | struct backing_dev_info; | 612 | struct backing_dev_info; |
613 | struct address_space { | 613 | struct address_space { |
614 | struct inode *host; /* owner: inode, block_device */ | 614 | struct inode *host; /* owner: inode, block_device */ |
615 | struct radix_tree_root page_tree; /* radix tree of all pages */ | 615 | struct radix_tree_root page_tree; /* radix tree of all pages */ |
616 | spinlock_t tree_lock; /* and lock protecting it */ | 616 | spinlock_t tree_lock; /* and lock protecting it */ |
617 | unsigned int i_mmap_writable;/* count VM_SHARED mappings */ | 617 | unsigned int i_mmap_writable;/* count VM_SHARED mappings */ |
618 | struct prio_tree_root i_mmap; /* tree of private and shared mappings */ | 618 | struct prio_tree_root i_mmap; /* tree of private and shared mappings */ |
619 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ | 619 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ |
620 | spinlock_t i_mmap_lock; /* protect tree, count, list */ | 620 | spinlock_t i_mmap_lock; /* protect tree, count, list */ |
621 | unsigned int truncate_count; /* Cover race condition with truncate */ | 621 | unsigned int truncate_count; /* Cover race condition with truncate */ |
622 | unsigned long nrpages; /* number of total pages */ | 622 | unsigned long nrpages; /* number of total pages */ |
623 | pgoff_t writeback_index;/* writeback starts here */ | 623 | pgoff_t writeback_index;/* writeback starts here */ |
624 | const struct address_space_operations *a_ops; /* methods */ | 624 | const struct address_space_operations *a_ops; /* methods */ |
625 | unsigned long flags; /* error bits/gfp mask */ | 625 | unsigned long flags; /* error bits/gfp mask */ |
626 | struct backing_dev_info *backing_dev_info; /* device readahead, etc */ | 626 | struct backing_dev_info *backing_dev_info; /* device readahead, etc */ |
627 | spinlock_t private_lock; /* for use by the address_space */ | 627 | spinlock_t private_lock; /* for use by the address_space */ |
628 | struct list_head private_list; /* ditto */ | 628 | struct list_head private_list; /* ditto */ |
629 | struct address_space *assoc_mapping; /* ditto */ | 629 | struct address_space *assoc_mapping; /* ditto */ |
630 | } __attribute__((aligned(sizeof(long)))); | 630 | } __attribute__((aligned(sizeof(long)))); |
631 | /* | 631 | /* |
632 | * On most architectures that alignment is already the case; but | 632 | * On most architectures that alignment is already the case; but |
633 | * must be enforced here for CRIS, to let the least significant bit | 633 | * must be enforced here for CRIS, to let the least significant bit |
634 | * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. | 634 | * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. |
635 | */ | 635 | */ |
636 | 636 | ||
637 | struct block_device { | 637 | struct block_device { |
638 | dev_t bd_dev; /* not a kdev_t - it's a search key */ | 638 | dev_t bd_dev; /* not a kdev_t - it's a search key */ |
639 | struct inode * bd_inode; /* will die */ | 639 | struct inode * bd_inode; /* will die */ |
640 | struct super_block * bd_super; | 640 | struct super_block * bd_super; |
641 | int bd_openers; | 641 | int bd_openers; |
642 | struct mutex bd_mutex; /* open/close mutex */ | 642 | struct mutex bd_mutex; /* open/close mutex */ |
643 | struct semaphore bd_mount_sem; | 643 | struct semaphore bd_mount_sem; |
644 | struct list_head bd_inodes; | 644 | struct list_head bd_inodes; |
645 | void * bd_holder; | 645 | void * bd_holder; |
646 | int bd_holders; | 646 | int bd_holders; |
647 | #ifdef CONFIG_SYSFS | 647 | #ifdef CONFIG_SYSFS |
648 | struct list_head bd_holder_list; | 648 | struct list_head bd_holder_list; |
649 | #endif | 649 | #endif |
650 | struct block_device * bd_contains; | 650 | struct block_device * bd_contains; |
651 | unsigned bd_block_size; | 651 | unsigned bd_block_size; |
652 | struct hd_struct * bd_part; | 652 | struct hd_struct * bd_part; |
653 | /* number of times partitions within this device have been opened. */ | 653 | /* number of times partitions within this device have been opened. */ |
654 | unsigned bd_part_count; | 654 | unsigned bd_part_count; |
655 | int bd_invalidated; | 655 | int bd_invalidated; |
656 | struct gendisk * bd_disk; | 656 | struct gendisk * bd_disk; |
657 | struct list_head bd_list; | 657 | struct list_head bd_list; |
658 | struct backing_dev_info *bd_inode_backing_dev_info; | 658 | struct backing_dev_info *bd_inode_backing_dev_info; |
659 | /* | 659 | /* |
660 | * Private data. You must have bd_claim'ed the block_device | 660 | * Private data. You must have bd_claim'ed the block_device |
661 | * to use this. NOTE: bd_claim allows an owner to claim | 661 | * to use this. NOTE: bd_claim allows an owner to claim |
662 | * the same device multiple times, the owner must take special | 662 | * the same device multiple times, the owner must take special |
663 | * care to not mess up bd_private for that case. | 663 | * care to not mess up bd_private for that case. |
664 | */ | 664 | */ |
665 | unsigned long bd_private; | 665 | unsigned long bd_private; |
666 | 666 | ||
667 | /* The counter of freeze processes */ | 667 | /* The counter of freeze processes */ |
668 | int bd_fsfreeze_count; | 668 | int bd_fsfreeze_count; |
669 | /* Mutex for freeze */ | 669 | /* Mutex for freeze */ |
670 | struct mutex bd_fsfreeze_mutex; | 670 | struct mutex bd_fsfreeze_mutex; |
671 | }; | 671 | }; |
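
Note that bd_inode points at the device's backing inode, whose i_count is an atomic_t. That is what lets a reference to a block_device be duplicated without sleeping, which is the point of the bdgrab() helper this commit introduces: a plain atomic_inc, valid only while the caller already holds a reference (or a lock that excludes teardown, e.g. swap_lock versus swapoff). A sketch of the pattern:

/* Sketch of the non-sleeping reference copy (cf. the new bdgrab()). */
static struct block_device *my_bdgrab(struct block_device *bdev)
{
	atomic_inc(&bdev->bd_inode->i_count);
	return bdev;
}
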
672 | 672 | ||
673 | /* | 673 | /* |
674 | * Radix-tree tags, for tagging dirty and writeback pages within the pagecache | 674 | * Radix-tree tags, for tagging dirty and writeback pages within the pagecache |
675 | * radix trees | 675 | * radix trees |
676 | */ | 676 | */ |
677 | #define PAGECACHE_TAG_DIRTY 0 | 677 | #define PAGECACHE_TAG_DIRTY 0 |
678 | #define PAGECACHE_TAG_WRITEBACK 1 | 678 | #define PAGECACHE_TAG_WRITEBACK 1 |
679 | 679 | ||
680 | int mapping_tagged(struct address_space *mapping, int tag); | 680 | int mapping_tagged(struct address_space *mapping, int tag); |
681 | 681 | ||
682 | /* | 682 | /* |
683 | * Might pages of this file be mapped into userspace? | 683 | * Might pages of this file be mapped into userspace? |
684 | */ | 684 | */ |
685 | static inline int mapping_mapped(struct address_space *mapping) | 685 | static inline int mapping_mapped(struct address_space *mapping) |
686 | { | 686 | { |
687 | return !prio_tree_empty(&mapping->i_mmap) || | 687 | return !prio_tree_empty(&mapping->i_mmap) || |
688 | !list_empty(&mapping->i_mmap_nonlinear); | 688 | !list_empty(&mapping->i_mmap_nonlinear); |
689 | } | 689 | } |
690 | 690 | ||
691 | /* | 691 | /* |
692 | * Might pages of this file have been modified in userspace? | 692 | * Might pages of this file have been modified in userspace? |
693 | * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff | 693 | * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff |
694 | * marks vma as VM_SHARED if it is shared, and the file was opened for | 694 | * marks vma as VM_SHARED if it is shared, and the file was opened for |
695 | * writing, i.e. the vma may be mprotected writable even if it is now readonly. | 695 | * writing, i.e. the vma may be mprotected writable even if it is now readonly. |
696 | */ | 696 | */ |
697 | static inline int mapping_writably_mapped(struct address_space *mapping) | 697 | static inline int mapping_writably_mapped(struct address_space *mapping) |
698 | { | 698 | { |
699 | return mapping->i_mmap_writable != 0; | 699 | return mapping->i_mmap_writable != 0; |
700 | } | 700 | } |
701 | 701 | ||
702 | /* | 702 | /* |
703 | * Use sequence counter to get consistent i_size on 32-bit processors. | 703 | * Use sequence counter to get consistent i_size on 32-bit processors. |
704 | */ | 704 | */ |
705 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) | 705 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) |
706 | #include <linux/seqlock.h> | 706 | #include <linux/seqlock.h> |
707 | #define __NEED_I_SIZE_ORDERED | 707 | #define __NEED_I_SIZE_ORDERED |
708 | #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) | 708 | #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) |
709 | #else | 709 | #else |
710 | #define i_size_ordered_init(inode) do { } while (0) | 710 | #define i_size_ordered_init(inode) do { } while (0) |
711 | #endif | 711 | #endif |
712 | 712 | ||
713 | struct posix_acl; | 713 | struct posix_acl; |
714 | #define ACL_NOT_CACHED ((void *)(-1)) | 714 | #define ACL_NOT_CACHED ((void *)(-1)) |
715 | 715 | ||
716 | struct inode { | 716 | struct inode { |
717 | struct hlist_node i_hash; | 717 | struct hlist_node i_hash; |
718 | struct list_head i_list; | 718 | struct list_head i_list; |
719 | struct list_head i_sb_list; | 719 | struct list_head i_sb_list; |
720 | struct list_head i_dentry; | 720 | struct list_head i_dentry; |
721 | unsigned long i_ino; | 721 | unsigned long i_ino; |
722 | atomic_t i_count; | 722 | atomic_t i_count; |
723 | unsigned int i_nlink; | 723 | unsigned int i_nlink; |
724 | uid_t i_uid; | 724 | uid_t i_uid; |
725 | gid_t i_gid; | 725 | gid_t i_gid; |
726 | dev_t i_rdev; | 726 | dev_t i_rdev; |
727 | u64 i_version; | 727 | u64 i_version; |
728 | loff_t i_size; | 728 | loff_t i_size; |
729 | #ifdef __NEED_I_SIZE_ORDERED | 729 | #ifdef __NEED_I_SIZE_ORDERED |
730 | seqcount_t i_size_seqcount; | 730 | seqcount_t i_size_seqcount; |
731 | #endif | 731 | #endif |
732 | struct timespec i_atime; | 732 | struct timespec i_atime; |
733 | struct timespec i_mtime; | 733 | struct timespec i_mtime; |
734 | struct timespec i_ctime; | 734 | struct timespec i_ctime; |
735 | blkcnt_t i_blocks; | 735 | blkcnt_t i_blocks; |
736 | unsigned int i_blkbits; | 736 | unsigned int i_blkbits; |
737 | unsigned short i_bytes; | 737 | unsigned short i_bytes; |
738 | umode_t i_mode; | 738 | umode_t i_mode; |
739 | spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ | 739 | spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ |
740 | struct mutex i_mutex; | 740 | struct mutex i_mutex; |
741 | struct rw_semaphore i_alloc_sem; | 741 | struct rw_semaphore i_alloc_sem; |
742 | const struct inode_operations *i_op; | 742 | const struct inode_operations *i_op; |
743 | const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ | 743 | const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ |
744 | struct super_block *i_sb; | 744 | struct super_block *i_sb; |
745 | struct file_lock *i_flock; | 745 | struct file_lock *i_flock; |
746 | struct address_space *i_mapping; | 746 | struct address_space *i_mapping; |
747 | struct address_space i_data; | 747 | struct address_space i_data; |
748 | #ifdef CONFIG_QUOTA | 748 | #ifdef CONFIG_QUOTA |
749 | struct dquot *i_dquot[MAXQUOTAS]; | 749 | struct dquot *i_dquot[MAXQUOTAS]; |
750 | #endif | 750 | #endif |
751 | struct list_head i_devices; | 751 | struct list_head i_devices; |
752 | union { | 752 | union { |
753 | struct pipe_inode_info *i_pipe; | 753 | struct pipe_inode_info *i_pipe; |
754 | struct block_device *i_bdev; | 754 | struct block_device *i_bdev; |
755 | struct cdev *i_cdev; | 755 | struct cdev *i_cdev; |
756 | }; | 756 | }; |
757 | 757 | ||
758 | __u32 i_generation; | 758 | __u32 i_generation; |
759 | 759 | ||
760 | #ifdef CONFIG_FSNOTIFY | 760 | #ifdef CONFIG_FSNOTIFY |
761 | __u32 i_fsnotify_mask; /* all events this inode cares about */ | 761 | __u32 i_fsnotify_mask; /* all events this inode cares about */ |
762 | struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ | 762 | struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ |
763 | #endif | 763 | #endif |
764 | 764 | ||
765 | #ifdef CONFIG_INOTIFY | 765 | #ifdef CONFIG_INOTIFY |
766 | struct list_head inotify_watches; /* watches on this inode */ | 766 | struct list_head inotify_watches; /* watches on this inode */ |
767 | struct mutex inotify_mutex; /* protects the watches list */ | 767 | struct mutex inotify_mutex; /* protects the watches list */ |
768 | #endif | 768 | #endif |
769 | 769 | ||
770 | unsigned long i_state; | 770 | unsigned long i_state; |
771 | unsigned long dirtied_when; /* jiffies of first dirtying */ | 771 | unsigned long dirtied_when; /* jiffies of first dirtying */ |
772 | 772 | ||
773 | unsigned int i_flags; | 773 | unsigned int i_flags; |
774 | 774 | ||
775 | atomic_t i_writecount; | 775 | atomic_t i_writecount; |
776 | #ifdef CONFIG_SECURITY | 776 | #ifdef CONFIG_SECURITY |
777 | void *i_security; | 777 | void *i_security; |
778 | #endif | 778 | #endif |
779 | #ifdef CONFIG_FS_POSIX_ACL | 779 | #ifdef CONFIG_FS_POSIX_ACL |
780 | struct posix_acl *i_acl; | 780 | struct posix_acl *i_acl; |
781 | struct posix_acl *i_default_acl; | 781 | struct posix_acl *i_default_acl; |
782 | #endif | 782 | #endif |
783 | void *i_private; /* fs or device private pointer */ | 783 | void *i_private; /* fs or device private pointer */ |
784 | }; | 784 | }; |
785 | 785 | ||
786 | /* | 786 | /* |
787 | * inode->i_mutex nesting subclasses for the lock validator: | 787 | * inode->i_mutex nesting subclasses for the lock validator: |
788 | * | 788 | * |
789 | * 0: the object of the current VFS operation | 789 | * 0: the object of the current VFS operation |
790 | * 1: parent | 790 | * 1: parent |
791 | * 2: child/target | 791 | * 2: child/target |
792 | * 3: quota file | 792 | * 3: quota file |
793 | * | 793 | * |
794 | * The locking order between these classes is | 794 | * The locking order between these classes is |
795 | * parent -> child -> normal -> xattr -> quota | 795 | * parent -> child -> normal -> xattr -> quota |
796 | */ | 796 | */ |
797 | enum inode_i_mutex_lock_class | 797 | enum inode_i_mutex_lock_class |
798 | { | 798 | { |
799 | I_MUTEX_NORMAL, | 799 | I_MUTEX_NORMAL, |
800 | I_MUTEX_PARENT, | 800 | I_MUTEX_PARENT, |
801 | I_MUTEX_CHILD, | 801 | I_MUTEX_CHILD, |
802 | I_MUTEX_XATTR, | 802 | I_MUTEX_XATTR, |
803 | I_MUTEX_QUOTA | 803 | I_MUTEX_QUOTA |
804 | }; | 804 | }; |
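
These values are lockdep subclasses: code that legitimately holds two i_mutexes at once passes them to mutex_lock_nested() so the validator can tell the nesting apart from a real deadlock. A hypothetical sketch of the parent/child case:

/* Sketch: lock a directory and then a child inode with distinct
 * subclasses so lockdep accepts the parent -> child ordering. */
static void lock_parent_child(struct inode *dir, struct inode *inode)
{
	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
}
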
805 | 805 | ||
806 | /* | 806 | /* |
807 | * NOTE: on a 32bit arch with a preemptible kernel and | 807 | * NOTE: on a 32bit arch with a preemptible kernel and |
808 | * a UP compile, the i_size_read/write must be atomic | 808 | * a UP compile, the i_size_read/write must be atomic |
809 | * with respect to the local cpu (unlike with preempt disabled), | 809 | * with respect to the local cpu (unlike with preempt disabled), |
810 | * but they don't need to be atomic with respect to other cpus like in | 810 | * but they don't need to be atomic with respect to other cpus like in |
811 | * true SMP (so they need to either locally disable irq around | 811 | * true SMP (so they need to either locally disable irq around |
812 | * the read or, for example on x86, they can still be implemented as a | 812 | * the read or, for example on x86, they can still be implemented as a |
813 | * cmpxchg8b without the need of the lock prefix). For SMP compiles | 813 | * cmpxchg8b without the need of the lock prefix). For SMP compiles |
814 | * and 64bit archs it makes no difference if preempt is enabled or not. | 814 | * and 64bit archs it makes no difference if preempt is enabled or not. |
815 | */ | 815 | */ |
816 | static inline loff_t i_size_read(const struct inode *inode) | 816 | static inline loff_t i_size_read(const struct inode *inode) |
817 | { | 817 | { |
818 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) | 818 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) |
819 | loff_t i_size; | 819 | loff_t i_size; |
820 | unsigned int seq; | 820 | unsigned int seq; |
821 | 821 | ||
822 | do { | 822 | do { |
823 | seq = read_seqcount_begin(&inode->i_size_seqcount); | 823 | seq = read_seqcount_begin(&inode->i_size_seqcount); |
824 | i_size = inode->i_size; | 824 | i_size = inode->i_size; |
825 | } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); | 825 | } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); |
826 | return i_size; | 826 | return i_size; |
827 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) | 827 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) |
828 | loff_t i_size; | 828 | loff_t i_size; |
829 | 829 | ||
830 | preempt_disable(); | 830 | preempt_disable(); |
831 | i_size = inode->i_size; | 831 | i_size = inode->i_size; |
832 | preempt_enable(); | 832 | preempt_enable(); |
833 | return i_size; | 833 | return i_size; |
834 | #else | 834 | #else |
835 | return inode->i_size; | 835 | return inode->i_size; |
836 | #endif | 836 | #endif |
837 | } | 837 | } |
838 | 838 | ||
839 | /* | 839 | /* |
840 | * NOTE: unlike i_size_read(), i_size_write() does need locking around it | 840 | * NOTE: unlike i_size_read(), i_size_write() does need locking around it |
841 | * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount | 841 | * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount |
842 | * can be lost, resulting in subsequent i_size_read() calls spinning forever. | 842 | * can be lost, resulting in subsequent i_size_read() calls spinning forever. |
843 | */ | 843 | */ |
844 | static inline void i_size_write(struct inode *inode, loff_t i_size) | 844 | static inline void i_size_write(struct inode *inode, loff_t i_size) |
845 | { | 845 | { |
846 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) | 846 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) |
847 | write_seqcount_begin(&inode->i_size_seqcount); | 847 | write_seqcount_begin(&inode->i_size_seqcount); |
848 | inode->i_size = i_size; | 848 | inode->i_size = i_size; |
849 | write_seqcount_end(&inode->i_size_seqcount); | 849 | write_seqcount_end(&inode->i_size_seqcount); |
850 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) | 850 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) |
851 | preempt_disable(); | 851 | preempt_disable(); |
852 | inode->i_size = i_size; | 852 | inode->i_size = i_size; |
853 | preempt_enable(); | 853 | preempt_enable(); |
854 | #else | 854 | #else |
855 | inode->i_size = i_size; | 855 | inode->i_size = i_size; |
856 | #endif | 856 | #endif |
857 | } | 857 | } |
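
Putting the two halves together: readers may call i_size_read() locklessly, but every writer must serialize (normally on i_mutex) so the seqcount update above cannot be lost. A sketch of the correct pairing (my_extend is an illustrative name):

/* Sketch: extend i_size under i_mutex while readers stay lockless. */
static void my_extend(struct inode *inode, loff_t new_size)
{
	mutex_lock(&inode->i_mutex);
	if (new_size > i_size_read(inode))
		i_size_write(inode, new_size);
	mutex_unlock(&inode->i_mutex);
}
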
858 | 858 | ||
859 | static inline unsigned iminor(const struct inode *inode) | 859 | static inline unsigned iminor(const struct inode *inode) |
860 | { | 860 | { |
861 | return MINOR(inode->i_rdev); | 861 | return MINOR(inode->i_rdev); |
862 | } | 862 | } |
863 | 863 | ||
864 | static inline unsigned imajor(const struct inode *inode) | 864 | static inline unsigned imajor(const struct inode *inode) |
865 | { | 865 | { |
866 | return MAJOR(inode->i_rdev); | 866 | return MAJOR(inode->i_rdev); |
867 | } | 867 | } |
868 | 868 | ||
869 | extern struct block_device *I_BDEV(struct inode *inode); | 869 | extern struct block_device *I_BDEV(struct inode *inode); |
870 | 870 | ||
871 | struct fown_struct { | 871 | struct fown_struct { |
872 | rwlock_t lock; /* protects pid, uid, euid fields */ | 872 | rwlock_t lock; /* protects pid, uid, euid fields */ |
873 | struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ | 873 | struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ |
874 | enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ | 874 | enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ |
875 | uid_t uid, euid; /* uid/euid of process setting the owner */ | 875 | uid_t uid, euid; /* uid/euid of process setting the owner */ |
876 | int signum; /* posix.1b rt signal to be delivered on IO */ | 876 | int signum; /* posix.1b rt signal to be delivered on IO */ |
877 | }; | 877 | }; |
878 | 878 | ||
879 | /* | 879 | /* |
880 | * Track a single file's readahead state | 880 | * Track a single file's readahead state |
881 | */ | 881 | */ |
882 | struct file_ra_state { | 882 | struct file_ra_state { |
883 | pgoff_t start; /* where readahead started */ | 883 | pgoff_t start; /* where readahead started */ |
884 | unsigned int size; /* # of readahead pages */ | 884 | unsigned int size; /* # of readahead pages */ |
885 | unsigned int async_size; /* do asynchronous readahead when | 885 | unsigned int async_size; /* do asynchronous readahead when |
886 | there are only # of pages ahead */ | 886 | there are only # of pages ahead */ |
887 | 887 | ||
888 | unsigned int ra_pages; /* Maximum readahead window */ | 888 | unsigned int ra_pages; /* Maximum readahead window */ |
889 | unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ | 889 | unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ |
890 | loff_t prev_pos; /* Cache last read() position */ | 890 | loff_t prev_pos; /* Cache last read() position */ |
891 | }; | 891 | }; |
892 | 892 | ||
893 | /* | 893 | /* |
894 | * Check if @index falls in the readahead window. | 894 | * Check if @index falls in the readahead window. |
895 | */ | 895 | */ |
896 | static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) | 896 | static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) |
897 | { | 897 | { |
898 | return (index >= ra->start && | 898 | return (index >= ra->start && |
899 | index < ra->start + ra->size); | 899 | index < ra->start + ra->size); |
900 | } | 900 | } |
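/*
 * Illustrative sketch, not part of this commit: how the file_ra_state
 * fields relate. The current window covers pages [start, start + size);
 * once a reader enters the final async_size pages of it, the next
 * asynchronous readahead should be submitted.
 */
static int example_ra_in_async_tail(struct file_ra_state *ra, pgoff_t index)
{
	return ra_has_index(ra, index) &&
	       index >= ra->start + ra->size - ra->async_size;
}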
901 | 901 | ||
902 | #define FILE_MNT_WRITE_TAKEN 1 | 902 | #define FILE_MNT_WRITE_TAKEN 1 |
903 | #define FILE_MNT_WRITE_RELEASED 2 | 903 | #define FILE_MNT_WRITE_RELEASED 2 |
904 | 904 | ||
905 | struct file { | 905 | struct file { |
906 | /* | 906 | /* |
907 | * fu_list becomes invalid after file_free is called and queued via | 907 | * fu_list becomes invalid after file_free is called and queued via |
908 | * fu_rcuhead for RCU freeing | 908 | * fu_rcuhead for RCU freeing |
909 | */ | 909 | */ |
910 | union { | 910 | union { |
911 | struct list_head fu_list; | 911 | struct list_head fu_list; |
912 | struct rcu_head fu_rcuhead; | 912 | struct rcu_head fu_rcuhead; |
913 | } f_u; | 913 | } f_u; |
914 | struct path f_path; | 914 | struct path f_path; |
915 | #define f_dentry f_path.dentry | 915 | #define f_dentry f_path.dentry |
916 | #define f_vfsmnt f_path.mnt | 916 | #define f_vfsmnt f_path.mnt |
917 | const struct file_operations *f_op; | 917 | const struct file_operations *f_op; |
918 | spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ | 918 | spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ |
919 | atomic_long_t f_count; | 919 | atomic_long_t f_count; |
920 | unsigned int f_flags; | 920 | unsigned int f_flags; |
921 | fmode_t f_mode; | 921 | fmode_t f_mode; |
922 | loff_t f_pos; | 922 | loff_t f_pos; |
923 | struct fown_struct f_owner; | 923 | struct fown_struct f_owner; |
924 | const struct cred *f_cred; | 924 | const struct cred *f_cred; |
925 | struct file_ra_state f_ra; | 925 | struct file_ra_state f_ra; |
926 | 926 | ||
927 | u64 f_version; | 927 | u64 f_version; |
928 | #ifdef CONFIG_SECURITY | 928 | #ifdef CONFIG_SECURITY |
929 | void *f_security; | 929 | void *f_security; |
930 | #endif | 930 | #endif |
931 | /* needed for tty driver, and maybe others */ | 931 | /* needed for tty driver, and maybe others */ |
932 | void *private_data; | 932 | void *private_data; |
933 | 933 | ||
934 | #ifdef CONFIG_EPOLL | 934 | #ifdef CONFIG_EPOLL |
935 | /* Used by fs/eventpoll.c to link all the hooks to this file */ | 935 | /* Used by fs/eventpoll.c to link all the hooks to this file */ |
936 | struct list_head f_ep_links; | 936 | struct list_head f_ep_links; |
937 | #endif /* #ifdef CONFIG_EPOLL */ | 937 | #endif /* #ifdef CONFIG_EPOLL */ |
938 | struct address_space *f_mapping; | 938 | struct address_space *f_mapping; |
939 | #ifdef CONFIG_DEBUG_WRITECOUNT | 939 | #ifdef CONFIG_DEBUG_WRITECOUNT |
940 | unsigned long f_mnt_write_state; | 940 | unsigned long f_mnt_write_state; |
941 | #endif | 941 | #endif |
942 | }; | 942 | }; |
943 | extern spinlock_t files_lock; | 943 | extern spinlock_t files_lock; |
944 | #define file_list_lock() spin_lock(&files_lock); | 944 | #define file_list_lock() spin_lock(&files_lock); |
945 | #define file_list_unlock() spin_unlock(&files_lock); | 945 | #define file_list_unlock() spin_unlock(&files_lock); |
946 | 946 | ||
947 | #define get_file(x) atomic_long_inc(&(x)->f_count) | 947 | #define get_file(x) atomic_long_inc(&(x)->f_count) |
948 | #define file_count(x) atomic_long_read(&(x)->f_count) | 948 | #define file_count(x) atomic_long_read(&(x)->f_count) |
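/*
 * Illustrative sketch, not part of this commit: get_file() only bumps an
 * existing, stable reference; the matching release is fput(). Code that
 * hands a struct file to another context takes its own reference first.
 */
static struct file *example_grab_file(struct file *filp)
{
	get_file(filp);		/* caller must already hold a reference */
	return filp;		/* the new owner is responsible for fput() */
}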
949 | 949 | ||
950 | #ifdef CONFIG_DEBUG_WRITECOUNT | 950 | #ifdef CONFIG_DEBUG_WRITECOUNT |
951 | static inline void file_take_write(struct file *f) | 951 | static inline void file_take_write(struct file *f) |
952 | { | 952 | { |
953 | WARN_ON(f->f_mnt_write_state != 0); | 953 | WARN_ON(f->f_mnt_write_state != 0); |
954 | f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; | 954 | f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; |
955 | } | 955 | } |
956 | static inline void file_release_write(struct file *f) | 956 | static inline void file_release_write(struct file *f) |
957 | { | 957 | { |
958 | f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; | 958 | f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; |
959 | } | 959 | } |
960 | static inline void file_reset_write(struct file *f) | 960 | static inline void file_reset_write(struct file *f) |
961 | { | 961 | { |
962 | f->f_mnt_write_state = 0; | 962 | f->f_mnt_write_state = 0; |
963 | } | 963 | } |
964 | static inline void file_check_state(struct file *f) | 964 | static inline void file_check_state(struct file *f) |
965 | { | 965 | { |
966 | /* | 966 | /* |
967 | * At this point, either both or neither of these bits | 967 | * At this point, either both or neither of these bits |
968 | * should be set. | 968 | * should be set. |
969 | */ | 969 | */ |
970 | WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); | 970 | WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); |
971 | WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); | 971 | WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); |
972 | } | 972 | } |
973 | static inline int file_check_writeable(struct file *f) | 973 | static inline int file_check_writeable(struct file *f) |
974 | { | 974 | { |
975 | if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) | 975 | if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) |
976 | return 0; | 976 | return 0; |
977 | printk(KERN_WARNING "writeable file with no " | 977 | printk(KERN_WARNING "writeable file with no " |
978 | "mnt_want_write()\n"); | 978 | "mnt_want_write()\n"); |
979 | WARN_ON(1); | 979 | WARN_ON(1); |
980 | return -EINVAL; | 980 | return -EINVAL; |
981 | } | 981 | } |
982 | #else /* !CONFIG_DEBUG_WRITECOUNT */ | 982 | #else /* !CONFIG_DEBUG_WRITECOUNT */ |
983 | static inline void file_take_write(struct file *filp) {} | 983 | static inline void file_take_write(struct file *filp) {} |
984 | static inline void file_release_write(struct file *filp) {} | 984 | static inline void file_release_write(struct file *filp) {} |
985 | static inline void file_reset_write(struct file *filp) {} | 985 | static inline void file_reset_write(struct file *filp) {} |
986 | static inline void file_check_state(struct file *filp) {} | 986 | static inline void file_check_state(struct file *filp) {} |
987 | static inline int file_check_writeable(struct file *filp) | 987 | static inline int file_check_writeable(struct file *filp) |
988 | { | 988 | { |
989 | return 0; | 989 | return 0; |
990 | } | 990 | } |
991 | #endif /* CONFIG_DEBUG_WRITECOUNT */ | 991 | #endif /* CONFIG_DEBUG_WRITECOUNT */ |
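/*
 * Illustrative sketch, not part of this commit: the intended lifecycle of
 * the debug state above. A writable open pairs mnt_want_write() (from
 * linux/mount.h) with file_take_write(); __fput() later drops both, and
 * file_check_state() verifies the two bits were set and cleared in
 * matched pairs.
 */
static int example_open_for_write(struct file *filp, struct vfsmount *mnt)
{
	int error = mnt_want_write(mnt);

	if (error)
		return error;
	file_take_write(filp);		/* records FILE_MNT_WRITE_TAKEN */
	return 0;
}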
992 | 992 | ||
993 | #define MAX_NON_LFS ((1UL<<31) - 1) | 993 | #define MAX_NON_LFS ((1UL<<31) - 1) |
994 | 994 | ||
995 | /* Page cache limit. The filesystems should put that into their s_maxbytes | 995 | /* Page cache limit. The filesystems should put that into their s_maxbytes |
996 | limits, otherwise bad things can happen in VM. */ | 996 | limits, otherwise bad things can happen in VM. */ |
997 | #if BITS_PER_LONG==32 | 997 | #if BITS_PER_LONG==32 |
998 | #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) | 998 | #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) |
999 | #elif BITS_PER_LONG==64 | 999 | #elif BITS_PER_LONG==64 |
1000 | #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL | 1000 | #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL |
1001 | #endif | 1001 | #endif |
1002 | 1002 | ||
1003 | #define FL_POSIX 1 | 1003 | #define FL_POSIX 1 |
1004 | #define FL_FLOCK 2 | 1004 | #define FL_FLOCK 2 |
1005 | #define FL_ACCESS 8 /* not trying to lock, just looking */ | 1005 | #define FL_ACCESS 8 /* not trying to lock, just looking */ |
1006 | #define FL_EXISTS 16 /* when unlocking, test for existence */ | 1006 | #define FL_EXISTS 16 /* when unlocking, test for existence */ |
1007 | #define FL_LEASE 32 /* lease held on this file */ | 1007 | #define FL_LEASE 32 /* lease held on this file */ |
1008 | #define FL_CLOSE 64 /* unlock on close */ | 1008 | #define FL_CLOSE 64 /* unlock on close */ |
1009 | #define FL_SLEEP 128 /* A blocking lock */ | 1009 | #define FL_SLEEP 128 /* A blocking lock */ |
1010 | 1010 | ||
1011 | /* | 1011 | /* |
1012 | * Special return value from posix_lock_file() and vfs_lock_file() for | 1012 | * Special return value from posix_lock_file() and vfs_lock_file() for |
1013 | * asynchronous locking. | 1013 | * asynchronous locking. |
1014 | */ | 1014 | */ |
1015 | #define FILE_LOCK_DEFERRED 1 | 1015 | #define FILE_LOCK_DEFERRED 1 |
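/*
 * Illustrative sketch, not part of this commit: a caller that supports
 * asynchronous locking treats FILE_LOCK_DEFERRED as "the grant will
 * arrive later via the lock manager's fl_notify()/fl_grant() callbacks";
 * a synchronous caller would instead sleep and retry.
 */
static int example_try_lock(struct file *filp, struct file_lock *fl)
{
	int error = vfs_lock_file(filp, F_SETLK, fl, NULL);

	if (error == FILE_LOCK_DEFERRED)
		return -EINPROGRESS;	/* completion comes via callback */
	return error;
}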
1016 | 1016 | ||
1017 | /* | 1017 | /* |
1018 | * The POSIX file lock owner is determined by | 1018 | * The POSIX file lock owner is determined by |
1019 | * the "struct files_struct" in the thread group | 1019 | * the "struct files_struct" in the thread group |
1020 | * (or NULL for no owner - BSD locks). | 1020 | * (or NULL for no owner - BSD locks). |
1021 | * | 1021 | * |
1022 | * Lockd stuffs a "host" pointer into this. | 1022 | * Lockd stuffs a "host" pointer into this. |
1023 | */ | 1023 | */ |
1024 | typedef struct files_struct *fl_owner_t; | 1024 | typedef struct files_struct *fl_owner_t; |
1025 | 1025 | ||
1026 | struct file_lock_operations { | 1026 | struct file_lock_operations { |
1027 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 1027 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
1028 | void (*fl_release_private)(struct file_lock *); | 1028 | void (*fl_release_private)(struct file_lock *); |
1029 | }; | 1029 | }; |
1030 | 1030 | ||
1031 | struct lock_manager_operations { | 1031 | struct lock_manager_operations { |
1032 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); | 1032 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); |
1033 | void (*fl_notify)(struct file_lock *); /* unblock callback */ | 1033 | void (*fl_notify)(struct file_lock *); /* unblock callback */ |
1034 | int (*fl_grant)(struct file_lock *, struct file_lock *, int); | 1034 | int (*fl_grant)(struct file_lock *, struct file_lock *, int); |
1035 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 1035 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
1036 | void (*fl_release_private)(struct file_lock *); | 1036 | void (*fl_release_private)(struct file_lock *); |
1037 | void (*fl_break)(struct file_lock *); | 1037 | void (*fl_break)(struct file_lock *); |
1038 | int (*fl_mylease)(struct file_lock *, struct file_lock *); | 1038 | int (*fl_mylease)(struct file_lock *, struct file_lock *); |
1039 | int (*fl_change)(struct file_lock **, int); | 1039 | int (*fl_change)(struct file_lock **, int); |
1040 | }; | 1040 | }; |
1041 | 1041 | ||
1042 | struct lock_manager { | 1042 | struct lock_manager { |
1043 | struct list_head list; | 1043 | struct list_head list; |
1044 | }; | 1044 | }; |
1045 | 1045 | ||
1046 | void locks_start_grace(struct lock_manager *); | 1046 | void locks_start_grace(struct lock_manager *); |
1047 | void locks_end_grace(struct lock_manager *); | 1047 | void locks_end_grace(struct lock_manager *); |
1048 | int locks_in_grace(void); | 1048 | int locks_in_grace(void); |
1049 | 1049 | ||
1050 | /* that will die - we need it for nfs_lock_info */ | 1050 | /* that will die - we need it for nfs_lock_info */ |
1051 | #include <linux/nfs_fs_i.h> | 1051 | #include <linux/nfs_fs_i.h> |
1052 | 1052 | ||
1053 | struct file_lock { | 1053 | struct file_lock { |
1054 | struct file_lock *fl_next; /* singly linked list for this inode */ | 1054 | struct file_lock *fl_next; /* singly linked list for this inode */ |
1055 | struct list_head fl_link; /* doubly linked list of all locks */ | 1055 | struct list_head fl_link; /* doubly linked list of all locks */ |
1056 | struct list_head fl_block; /* circular list of blocked processes */ | 1056 | struct list_head fl_block; /* circular list of blocked processes */ |
1057 | fl_owner_t fl_owner; | 1057 | fl_owner_t fl_owner; |
1058 | unsigned char fl_flags; | 1058 | unsigned char fl_flags; |
1059 | unsigned char fl_type; | 1059 | unsigned char fl_type; |
1060 | unsigned int fl_pid; | 1060 | unsigned int fl_pid; |
1061 | struct pid *fl_nspid; | 1061 | struct pid *fl_nspid; |
1062 | wait_queue_head_t fl_wait; | 1062 | wait_queue_head_t fl_wait; |
1063 | struct file *fl_file; | 1063 | struct file *fl_file; |
1064 | loff_t fl_start; | 1064 | loff_t fl_start; |
1065 | loff_t fl_end; | 1065 | loff_t fl_end; |
1066 | 1066 | ||
1067 | struct fasync_struct * fl_fasync; /* for lease break notifications */ | 1067 | struct fasync_struct * fl_fasync; /* for lease break notifications */ |
1068 | unsigned long fl_break_time; /* for nonblocking lease breaks */ | 1068 | unsigned long fl_break_time; /* for nonblocking lease breaks */ |
1069 | 1069 | ||
1070 | struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ | 1070 | struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ |
1071 | struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ | 1071 | struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ |
1072 | union { | 1072 | union { |
1073 | struct nfs_lock_info nfs_fl; | 1073 | struct nfs_lock_info nfs_fl; |
1074 | struct nfs4_lock_info nfs4_fl; | 1074 | struct nfs4_lock_info nfs4_fl; |
1075 | struct { | 1075 | struct { |
1076 | struct list_head link; /* link in AFS vnode's pending_locks list */ | 1076 | struct list_head link; /* link in AFS vnode's pending_locks list */ |
1077 | int state; /* state of grant or error if -ve */ | 1077 | int state; /* state of grant or error if -ve */ |
1078 | } afs; | 1078 | } afs; |
1079 | } fl_u; | 1079 | } fl_u; |
1080 | }; | 1080 | }; |
1081 | 1081 | ||
1082 | /* The following constant reflects the upper bound of the file/locking space */ | 1082 | /* The following constant reflects the upper bound of the file/locking space */ |
1083 | #ifndef OFFSET_MAX | 1083 | #ifndef OFFSET_MAX |
1084 | #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) | 1084 | #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) |
1085 | #define OFFSET_MAX INT_LIMIT(loff_t) | 1085 | #define OFFSET_MAX INT_LIMIT(loff_t) |
1086 | #define OFFT_OFFSET_MAX INT_LIMIT(off_t) | 1086 | #define OFFT_OFFSET_MAX INT_LIMIT(off_t) |
1087 | #endif | 1087 | #endif |
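/*
 * Illustrative note, not part of this commit: INT_LIMIT(x) complements a
 * value with only the sign bit set, yielding the largest positive value
 * of the signed type x. For the 64-bit loff_t that is
 * 0x7fffffffffffffff; a lock whose fl_end equals OFFSET_MAX extends to
 * end of file.
 */
static inline void example_offset_max_sanity(void)
{
	/* BUILD_BUG_ON() is from linux/kernel.h */
	BUILD_BUG_ON(OFFSET_MAX != (loff_t)0x7fffffffffffffffLL);
}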
1088 | 1088 | ||
1089 | #include <linux/fcntl.h> | 1089 | #include <linux/fcntl.h> |
1090 | 1090 | ||
1091 | extern void send_sigio(struct fown_struct *fown, int fd, int band); | 1091 | extern void send_sigio(struct fown_struct *fown, int fd, int band); |
1092 | 1092 | ||
1093 | /* fs/sync.c */ | 1093 | /* fs/sync.c */ |
1094 | extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, | 1094 | extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, |
1095 | loff_t endbyte, unsigned int flags); | 1095 | loff_t endbyte, unsigned int flags); |
1096 | 1096 | ||
1097 | #ifdef CONFIG_FILE_LOCKING | 1097 | #ifdef CONFIG_FILE_LOCKING |
1098 | extern int fcntl_getlk(struct file *, struct flock __user *); | 1098 | extern int fcntl_getlk(struct file *, struct flock __user *); |
1099 | extern int fcntl_setlk(unsigned int, struct file *, unsigned int, | 1099 | extern int fcntl_setlk(unsigned int, struct file *, unsigned int, |
1100 | struct flock __user *); | 1100 | struct flock __user *); |
1101 | 1101 | ||
1102 | #if BITS_PER_LONG == 32 | 1102 | #if BITS_PER_LONG == 32 |
1103 | extern int fcntl_getlk64(struct file *, struct flock64 __user *); | 1103 | extern int fcntl_getlk64(struct file *, struct flock64 __user *); |
1104 | extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, | 1104 | extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, |
1105 | struct flock64 __user *); | 1105 | struct flock64 __user *); |
1106 | #endif | 1106 | #endif |
1107 | 1107 | ||
1108 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); | 1108 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); |
1109 | extern int fcntl_getlease(struct file *filp); | 1109 | extern int fcntl_getlease(struct file *filp); |
1110 | 1110 | ||
1111 | /* fs/locks.c */ | 1111 | /* fs/locks.c */ |
1112 | extern void locks_init_lock(struct file_lock *); | 1112 | extern void locks_init_lock(struct file_lock *); |
1113 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); | 1113 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); |
1114 | extern void __locks_copy_lock(struct file_lock *, const struct file_lock *); | 1114 | extern void __locks_copy_lock(struct file_lock *, const struct file_lock *); |
1115 | extern void locks_remove_posix(struct file *, fl_owner_t); | 1115 | extern void locks_remove_posix(struct file *, fl_owner_t); |
1116 | extern void locks_remove_flock(struct file *); | 1116 | extern void locks_remove_flock(struct file *); |
1117 | extern void locks_release_private(struct file_lock *); | 1117 | extern void locks_release_private(struct file_lock *); |
1118 | extern void posix_test_lock(struct file *, struct file_lock *); | 1118 | extern void posix_test_lock(struct file *, struct file_lock *); |
1119 | extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); | 1119 | extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); |
1120 | extern int posix_lock_file_wait(struct file *, struct file_lock *); | 1120 | extern int posix_lock_file_wait(struct file *, struct file_lock *); |
1121 | extern int posix_unblock_lock(struct file *, struct file_lock *); | 1121 | extern int posix_unblock_lock(struct file *, struct file_lock *); |
1122 | extern int vfs_test_lock(struct file *, struct file_lock *); | 1122 | extern int vfs_test_lock(struct file *, struct file_lock *); |
1123 | extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); | 1123 | extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); |
1124 | extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); | 1124 | extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); |
1125 | extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); | 1125 | extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); |
1126 | extern int __break_lease(struct inode *inode, unsigned int flags); | 1126 | extern int __break_lease(struct inode *inode, unsigned int flags); |
1127 | extern void lease_get_mtime(struct inode *, struct timespec *time); | 1127 | extern void lease_get_mtime(struct inode *, struct timespec *time); |
1128 | extern int generic_setlease(struct file *, long, struct file_lock **); | 1128 | extern int generic_setlease(struct file *, long, struct file_lock **); |
1129 | extern int vfs_setlease(struct file *, long, struct file_lock **); | 1129 | extern int vfs_setlease(struct file *, long, struct file_lock **); |
1130 | extern int lease_modify(struct file_lock **, int); | 1130 | extern int lease_modify(struct file_lock **, int); |
1131 | extern int lock_may_read(struct inode *, loff_t start, unsigned long count); | 1131 | extern int lock_may_read(struct inode *, loff_t start, unsigned long count); |
1132 | extern int lock_may_write(struct inode *, loff_t start, unsigned long count); | 1132 | extern int lock_may_write(struct inode *, loff_t start, unsigned long count); |
1133 | #else /* !CONFIG_FILE_LOCKING */ | 1133 | #else /* !CONFIG_FILE_LOCKING */ |
1134 | static inline int fcntl_getlk(struct file *file, struct flock __user *user) | 1134 | static inline int fcntl_getlk(struct file *file, struct flock __user *user) |
1135 | { | 1135 | { |
1136 | return -EINVAL; | 1136 | return -EINVAL; |
1137 | } | 1137 | } |
1138 | 1138 | ||
1139 | static inline int fcntl_setlk(unsigned int fd, struct file *file, | 1139 | static inline int fcntl_setlk(unsigned int fd, struct file *file, |
1140 | unsigned int cmd, struct flock __user *user) | 1140 | unsigned int cmd, struct flock __user *user) |
1141 | { | 1141 | { |
1142 | return -EACCES; | 1142 | return -EACCES; |
1143 | } | 1143 | } |
1144 | 1144 | ||
1145 | #if BITS_PER_LONG == 32 | 1145 | #if BITS_PER_LONG == 32 |
1146 | static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user) | 1146 | static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user) |
1147 | { | 1147 | { |
1148 | return -EINVAL; | 1148 | return -EINVAL; |
1149 | } | 1149 | } |
1150 | 1150 | ||
1151 | static inline int fcntl_setlk64(unsigned int fd, struct file *file, | 1151 | static inline int fcntl_setlk64(unsigned int fd, struct file *file, |
1152 | unsigned int cmd, struct flock64 __user *user) | 1152 | unsigned int cmd, struct flock64 __user *user) |
1153 | { | 1153 | { |
1154 | return -EACCES; | 1154 | return -EACCES; |
1155 | } | 1155 | } |
1156 | #endif | 1156 | #endif |
1157 | static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) | 1157 | static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) |
1158 | { | 1158 | { |
1159 | return 0; | 1159 | return 0; |
1160 | } | 1160 | } |
1161 | 1161 | ||
1162 | static inline int fcntl_getlease(struct file *filp) | 1162 | static inline int fcntl_getlease(struct file *filp) |
1163 | { | 1163 | { |
1164 | return 0; | 1164 | return 0; |
1165 | } | 1165 | } |
1166 | 1166 | ||
1167 | static inline void locks_init_lock(struct file_lock *fl) | 1167 | static inline void locks_init_lock(struct file_lock *fl) |
1168 | { | 1168 | { |
1169 | return; | 1169 | return; |
1170 | } | 1170 | } |
1171 | 1171 | ||
1172 | static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl) | 1172 | static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl) |
1173 | { | 1173 | { |
1174 | return; | 1174 | return; |
1175 | } | 1175 | } |
1176 | 1176 | ||
1177 | static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) | 1177 | static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) |
1178 | { | 1178 | { |
1179 | return; | 1179 | return; |
1180 | } | 1180 | } |
1181 | 1181 | ||
1182 | static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) | 1182 | static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) |
1183 | { | 1183 | { |
1184 | return; | 1184 | return; |
1185 | } | 1185 | } |
1186 | 1186 | ||
1187 | static inline void locks_remove_flock(struct file *filp) | 1187 | static inline void locks_remove_flock(struct file *filp) |
1188 | { | 1188 | { |
1189 | return; | 1189 | return; |
1190 | } | 1190 | } |
1191 | 1191 | ||
1192 | static inline void posix_test_lock(struct file *filp, struct file_lock *fl) | 1192 | static inline void posix_test_lock(struct file *filp, struct file_lock *fl) |
1193 | { | 1193 | { |
1194 | return; | 1194 | return; |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | static inline int posix_lock_file(struct file *filp, struct file_lock *fl, | 1197 | static inline int posix_lock_file(struct file *filp, struct file_lock *fl, |
1198 | struct file_lock *conflock) | 1198 | struct file_lock *conflock) |
1199 | { | 1199 | { |
1200 | return -ENOLCK; | 1200 | return -ENOLCK; |
1201 | } | 1201 | } |
1202 | 1202 | ||
1203 | static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl) | 1203 | static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl) |
1204 | { | 1204 | { |
1205 | return -ENOLCK; | 1205 | return -ENOLCK; |
1206 | } | 1206 | } |
1207 | 1207 | ||
1208 | static inline int posix_unblock_lock(struct file *filp, | 1208 | static inline int posix_unblock_lock(struct file *filp, |
1209 | struct file_lock *waiter) | 1209 | struct file_lock *waiter) |
1210 | { | 1210 | { |
1211 | return -ENOENT; | 1211 | return -ENOENT; |
1212 | } | 1212 | } |
1213 | 1213 | ||
1214 | static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) | 1214 | static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) |
1215 | { | 1215 | { |
1216 | return 0; | 1216 | return 0; |
1217 | } | 1217 | } |
1218 | 1218 | ||
1219 | static inline int vfs_lock_file(struct file *filp, unsigned int cmd, | 1219 | static inline int vfs_lock_file(struct file *filp, unsigned int cmd, |
1220 | struct file_lock *fl, struct file_lock *conf) | 1220 | struct file_lock *fl, struct file_lock *conf) |
1221 | { | 1221 | { |
1222 | return -ENOLCK; | 1222 | return -ENOLCK; |
1223 | } | 1223 | } |
1224 | 1224 | ||
1225 | static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) | 1225 | static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) |
1226 | { | 1226 | { |
1227 | return 0; | 1227 | return 0; |
1228 | } | 1228 | } |
1229 | 1229 | ||
1230 | static inline int flock_lock_file_wait(struct file *filp, | 1230 | static inline int flock_lock_file_wait(struct file *filp, |
1231 | struct file_lock *request) | 1231 | struct file_lock *request) |
1232 | { | 1232 | { |
1233 | return -ENOLCK; | 1233 | return -ENOLCK; |
1234 | } | 1234 | } |
1235 | 1235 | ||
1236 | static inline int __break_lease(struct inode *inode, unsigned int mode) | 1236 | static inline int __break_lease(struct inode *inode, unsigned int mode) |
1237 | { | 1237 | { |
1238 | return 0; | 1238 | return 0; |
1239 | } | 1239 | } |
1240 | 1240 | ||
1241 | static inline void lease_get_mtime(struct inode *inode, struct timespec *time) | 1241 | static inline void lease_get_mtime(struct inode *inode, struct timespec *time) |
1242 | { | 1242 | { |
1243 | return; | 1243 | return; |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | static inline int generic_setlease(struct file *filp, long arg, | 1246 | static inline int generic_setlease(struct file *filp, long arg, |
1247 | struct file_lock **flp) | 1247 | struct file_lock **flp) |
1248 | { | 1248 | { |
1249 | return -EINVAL; | 1249 | return -EINVAL; |
1250 | } | 1250 | } |
1251 | 1251 | ||
1252 | static inline int vfs_setlease(struct file *filp, long arg, | 1252 | static inline int vfs_setlease(struct file *filp, long arg, |
1253 | struct file_lock **lease) | 1253 | struct file_lock **lease) |
1254 | { | 1254 | { |
1255 | return -EINVAL; | 1255 | return -EINVAL; |
1256 | } | 1256 | } |
1257 | 1257 | ||
1258 | static inline int lease_modify(struct file_lock **before, int arg) | 1258 | static inline int lease_modify(struct file_lock **before, int arg) |
1259 | { | 1259 | { |
1260 | return -EINVAL; | 1260 | return -EINVAL; |
1261 | } | 1261 | } |
1262 | 1262 | ||
1263 | static inline int lock_may_read(struct inode *inode, loff_t start, | 1263 | static inline int lock_may_read(struct inode *inode, loff_t start, |
1264 | unsigned long len) | 1264 | unsigned long len) |
1265 | { | 1265 | { |
1266 | return 1; | 1266 | return 1; |
1267 | } | 1267 | } |
1268 | 1268 | ||
1269 | static inline int lock_may_write(struct inode *inode, loff_t start, | 1269 | static inline int lock_may_write(struct inode *inode, loff_t start, |
1270 | unsigned long len) | 1270 | unsigned long len) |
1271 | { | 1271 | { |
1272 | return 1; | 1272 | return 1; |
1273 | } | 1273 | } |
1274 | 1274 | ||
1275 | #endif /* !CONFIG_FILE_LOCKING */ | 1275 | #endif /* !CONFIG_FILE_LOCKING */ |
1276 | 1276 | ||
1277 | 1277 | ||
1278 | struct fasync_struct { | 1278 | struct fasync_struct { |
1279 | int magic; | 1279 | int magic; |
1280 | int fa_fd; | 1280 | int fa_fd; |
1281 | struct fasync_struct *fa_next; /* singly linked list */ | 1281 | struct fasync_struct *fa_next; /* singly linked list */ |
1282 | struct file *fa_file; | 1282 | struct file *fa_file; |
1283 | }; | 1283 | }; |
1284 | 1284 | ||
1285 | #define FASYNC_MAGIC 0x4601 | 1285 | #define FASYNC_MAGIC 0x4601 |
1286 | 1286 | ||
1287 | /* SMP safe fasync helpers: */ | 1287 | /* SMP safe fasync helpers: */ |
1288 | extern int fasync_helper(int, struct file *, int, struct fasync_struct **); | 1288 | extern int fasync_helper(int, struct file *, int, struct fasync_struct **); |
1289 | /* can be called from interrupts */ | 1289 | /* can be called from interrupts */ |
1290 | extern void kill_fasync(struct fasync_struct **, int, int); | 1290 | extern void kill_fasync(struct fasync_struct **, int, int); |
1291 | /* only for net: no internal synchronization */ | 1291 | /* only for net: no internal synchronization */ |
1292 | extern void __kill_fasync(struct fasync_struct *, int, int); | 1292 | extern void __kill_fasync(struct fasync_struct *, int, int); |
1293 | 1293 | ||
1294 | extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); | 1294 | extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); |
1295 | extern int f_setown(struct file *filp, unsigned long arg, int force); | 1295 | extern int f_setown(struct file *filp, unsigned long arg, int force); |
1296 | extern void f_delown(struct file *filp); | 1296 | extern void f_delown(struct file *filp); |
1297 | extern pid_t f_getown(struct file *filp); | 1297 | extern pid_t f_getown(struct file *filp); |
1298 | extern int send_sigurg(struct fown_struct *fown); | 1298 | extern int send_sigurg(struct fown_struct *fown); |
1299 | 1299 | ||
1300 | /* | 1300 | /* |
1301 | * Umount options | 1301 | * Umount options |
1302 | */ | 1302 | */ |
1303 | 1303 | ||
1304 | #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */ | 1304 | #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */ |
1305 | #define MNT_DETACH 0x00000002 /* Just detach from the tree */ | 1305 | #define MNT_DETACH 0x00000002 /* Just detach from the tree */ |
1306 | #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ | 1306 | #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ |
1307 | 1307 | ||
1308 | extern struct list_head super_blocks; | 1308 | extern struct list_head super_blocks; |
1309 | extern spinlock_t sb_lock; | 1309 | extern spinlock_t sb_lock; |
1310 | 1310 | ||
1311 | #define sb_entry(list) list_entry((list), struct super_block, s_list) | 1311 | #define sb_entry(list) list_entry((list), struct super_block, s_list) |
1312 | #define S_BIAS (1<<30) | 1312 | #define S_BIAS (1<<30) |
1313 | struct super_block { | 1313 | struct super_block { |
1314 | struct list_head s_list; /* Keep this first */ | 1314 | struct list_head s_list; /* Keep this first */ |
1315 | dev_t s_dev; /* search index; _not_ kdev_t */ | 1315 | dev_t s_dev; /* search index; _not_ kdev_t */ |
1316 | unsigned long s_blocksize; | 1316 | unsigned long s_blocksize; |
1317 | unsigned char s_blocksize_bits; | 1317 | unsigned char s_blocksize_bits; |
1318 | unsigned char s_dirt; | 1318 | unsigned char s_dirt; |
1319 | unsigned long long s_maxbytes; /* Max file size */ | 1319 | unsigned long long s_maxbytes; /* Max file size */ |
1320 | struct file_system_type *s_type; | 1320 | struct file_system_type *s_type; |
1321 | const struct super_operations *s_op; | 1321 | const struct super_operations *s_op; |
1322 | struct dquot_operations *dq_op; | 1322 | struct dquot_operations *dq_op; |
1323 | struct quotactl_ops *s_qcop; | 1323 | struct quotactl_ops *s_qcop; |
1324 | const struct export_operations *s_export_op; | 1324 | const struct export_operations *s_export_op; |
1325 | unsigned long s_flags; | 1325 | unsigned long s_flags; |
1326 | unsigned long s_magic; | 1326 | unsigned long s_magic; |
1327 | struct dentry *s_root; | 1327 | struct dentry *s_root; |
1328 | struct rw_semaphore s_umount; | 1328 | struct rw_semaphore s_umount; |
1329 | struct mutex s_lock; | 1329 | struct mutex s_lock; |
1330 | int s_count; | 1330 | int s_count; |
1331 | int s_need_sync; | 1331 | int s_need_sync; |
1332 | atomic_t s_active; | 1332 | atomic_t s_active; |
1333 | #ifdef CONFIG_SECURITY | 1333 | #ifdef CONFIG_SECURITY |
1334 | void *s_security; | 1334 | void *s_security; |
1335 | #endif | 1335 | #endif |
1336 | struct xattr_handler **s_xattr; | 1336 | struct xattr_handler **s_xattr; |
1337 | 1337 | ||
1338 | struct list_head s_inodes; /* all inodes */ | 1338 | struct list_head s_inodes; /* all inodes */ |
1339 | struct list_head s_dirty; /* dirty inodes */ | 1339 | struct list_head s_dirty; /* dirty inodes */ |
1340 | struct list_head s_io; /* parked for writeback */ | 1340 | struct list_head s_io; /* parked for writeback */ |
1341 | struct list_head s_more_io; /* parked for more writeback */ | 1341 | struct list_head s_more_io; /* parked for more writeback */ |
1342 | struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ | 1342 | struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ |
1343 | struct list_head s_files; | 1343 | struct list_head s_files; |
1344 | /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ | 1344 | /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ |
1345 | struct list_head s_dentry_lru; /* unused dentry lru */ | 1345 | struct list_head s_dentry_lru; /* unused dentry lru */ |
1346 | int s_nr_dentry_unused; /* # of dentry on lru */ | 1346 | int s_nr_dentry_unused; /* # of dentry on lru */ |
1347 | 1347 | ||
1348 | struct block_device *s_bdev; | 1348 | struct block_device *s_bdev; |
1349 | struct mtd_info *s_mtd; | 1349 | struct mtd_info *s_mtd; |
1350 | struct list_head s_instances; | 1350 | struct list_head s_instances; |
1351 | struct quota_info s_dquot; /* Diskquota specific options */ | 1351 | struct quota_info s_dquot; /* Diskquota specific options */ |
1352 | 1352 | ||
1353 | int s_frozen; | 1353 | int s_frozen; |
1354 | wait_queue_head_t s_wait_unfrozen; | 1354 | wait_queue_head_t s_wait_unfrozen; |
1355 | 1355 | ||
1356 | char s_id[32]; /* Informational name */ | 1356 | char s_id[32]; /* Informational name */ |
1357 | 1357 | ||
1358 | void *s_fs_info; /* Filesystem private info */ | 1358 | void *s_fs_info; /* Filesystem private info */ |
1359 | fmode_t s_mode; | 1359 | fmode_t s_mode; |
1360 | 1360 | ||
1361 | /* | 1361 | /* |
1362 | * The next field is for VFS *only*. No filesystems have any business | 1362 | * The next field is for VFS *only*. No filesystems have any business |
1363 | * even looking at it. You had been warned. | 1363 | * even looking at it. You had been warned. |
1364 | */ | 1364 | */ |
1365 | struct mutex s_vfs_rename_mutex; /* Kludge */ | 1365 | struct mutex s_vfs_rename_mutex; /* Kludge */ |
1366 | 1366 | ||
1367 | /* Granularity of c/m/atime in ns. | 1367 | /* Granularity of c/m/atime in ns. |
1368 | Cannot be worse than a second */ | 1368 | Cannot be worse than a second */ |
1369 | u32 s_time_gran; | 1369 | u32 s_time_gran; |
1370 | 1370 | ||
1371 | /* | 1371 | /* |
1372 | * Filesystem subtype. If non-empty the filesystem type field | 1372 | * Filesystem subtype. If non-empty the filesystem type field |
1373 | * in /proc/mounts will be "type.subtype" | 1373 | * in /proc/mounts will be "type.subtype" |
1374 | */ | 1374 | */ |
1375 | char *s_subtype; | 1375 | char *s_subtype; |
1376 | 1376 | ||
1377 | /* | 1377 | /* |
1378 | * Saved mount options for lazy filesystems using | 1378 | * Saved mount options for lazy filesystems using |
1379 | * generic_show_options() | 1379 | * generic_show_options() |
1380 | */ | 1380 | */ |
1381 | char *s_options; | 1381 | char *s_options; |
1382 | }; | 1382 | }; |
1383 | 1383 | ||
1384 | extern struct timespec current_fs_time(struct super_block *sb); | 1384 | extern struct timespec current_fs_time(struct super_block *sb); |
1385 | 1385 | ||
1386 | /* | 1386 | /* |
1387 | * Snapshotting support. | 1387 | * Snapshotting support. |
1388 | */ | 1388 | */ |
1389 | enum { | 1389 | enum { |
1390 | SB_UNFROZEN = 0, | 1390 | SB_UNFROZEN = 0, |
1391 | SB_FREEZE_WRITE = 1, | 1391 | SB_FREEZE_WRITE = 1, |
1392 | SB_FREEZE_TRANS = 2, | 1392 | SB_FREEZE_TRANS = 2, |
1393 | }; | 1393 | }; |
1394 | 1394 | ||
1395 | #define vfs_check_frozen(sb, level) \ | 1395 | #define vfs_check_frozen(sb, level) \ |
1396 | wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) | 1396 | wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) |
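/*
 * Illustrative sketch, not part of this commit: a write path blocks while
 * the superblock is frozen at or above the given level and is woken
 * through s_wait_unfrozen when the filesystem is thawed.
 */
static void example_wait_for_thaw(struct super_block *sb)
{
	vfs_check_frozen(sb, SB_FREEZE_WRITE);	/* sleeps while s_frozen >= level */
}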
1397 | 1397 | ||
1398 | #define get_fs_excl() atomic_inc(&current->fs_excl) | 1398 | #define get_fs_excl() atomic_inc(&current->fs_excl) |
1399 | #define put_fs_excl() atomic_dec(&current->fs_excl) | 1399 | #define put_fs_excl() atomic_dec(&current->fs_excl) |
1400 | #define has_fs_excl() atomic_read(&current->fs_excl) | 1400 | #define has_fs_excl() atomic_read(&current->fs_excl) |
1401 | 1401 | ||
1402 | #define is_owner_or_cap(inode) \ | 1402 | #define is_owner_or_cap(inode) \ |
1403 | ((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER)) | 1403 | ((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER)) |
1404 | 1404 | ||
1405 | /* not quite ready to be deprecated, but... */ | 1405 | /* not quite ready to be deprecated, but... */ |
1406 | extern void lock_super(struct super_block *); | 1406 | extern void lock_super(struct super_block *); |
1407 | extern void unlock_super(struct super_block *); | 1407 | extern void unlock_super(struct super_block *); |
1408 | 1408 | ||
1409 | /* | 1409 | /* |
1410 | * VFS helper functions.. | 1410 | * VFS helper functions.. |
1411 | */ | 1411 | */ |
1412 | extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); | 1412 | extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); |
1413 | extern int vfs_mkdir(struct inode *, struct dentry *, int); | 1413 | extern int vfs_mkdir(struct inode *, struct dentry *, int); |
1414 | extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); | 1414 | extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); |
1415 | extern int vfs_symlink(struct inode *, struct dentry *, const char *); | 1415 | extern int vfs_symlink(struct inode *, struct dentry *, const char *); |
1416 | extern int vfs_link(struct dentry *, struct inode *, struct dentry *); | 1416 | extern int vfs_link(struct dentry *, struct inode *, struct dentry *); |
1417 | extern int vfs_rmdir(struct inode *, struct dentry *); | 1417 | extern int vfs_rmdir(struct inode *, struct dentry *); |
1418 | extern int vfs_unlink(struct inode *, struct dentry *); | 1418 | extern int vfs_unlink(struct inode *, struct dentry *); |
1419 | extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); | 1419 | extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); |
1420 | 1420 | ||
1421 | /* | 1421 | /* |
1422 | * VFS dentry helper functions. | 1422 | * VFS dentry helper functions. |
1423 | */ | 1423 | */ |
1424 | extern void dentry_unhash(struct dentry *dentry); | 1424 | extern void dentry_unhash(struct dentry *dentry); |
1425 | 1425 | ||
1426 | /* | 1426 | /* |
1427 | * VFS file helper functions. | 1427 | * VFS file helper functions. |
1428 | */ | 1428 | */ |
1429 | extern int file_permission(struct file *, int); | 1429 | extern int file_permission(struct file *, int); |
1430 | 1430 | ||
1431 | /* | 1431 | /* |
1432 | * VFS FS_IOC_FIEMAP helper definitions. | 1432 | * VFS FS_IOC_FIEMAP helper definitions. |
1433 | */ | 1433 | */ |
1434 | struct fiemap_extent_info { | 1434 | struct fiemap_extent_info { |
1435 | unsigned int fi_flags; /* Flags as passed from user */ | 1435 | unsigned int fi_flags; /* Flags as passed from user */ |
1436 | unsigned int fi_extents_mapped; /* Number of mapped extents */ | 1436 | unsigned int fi_extents_mapped; /* Number of mapped extents */ |
1437 | unsigned int fi_extents_max; /* Size of fiemap_extent array */ | 1437 | unsigned int fi_extents_max; /* Size of fiemap_extent array */ |
1438 | struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent | 1438 | struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent |
1439 | * array */ | 1439 | * array */ |
1440 | }; | 1440 | }; |
1441 | int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, | 1441 | int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, |
1442 | u64 phys, u64 len, u32 flags); | 1442 | u64 phys, u64 len, u32 flags); |
1443 | int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); | 1443 | int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); |
1444 | 1444 | ||
1445 | /* | 1445 | /* |
1446 | * File types | 1446 | * File types |
1447 | * | 1447 | * |
1448 | * NOTE! These match bits 12..15 of stat.st_mode | 1448 | * NOTE! These match bits 12..15 of stat.st_mode |
1449 | * (ie "(i_mode >> 12) & 15"). | 1449 | * (ie "(i_mode >> 12) & 15"). |
1450 | */ | 1450 | */ |
1451 | #define DT_UNKNOWN 0 | 1451 | #define DT_UNKNOWN 0 |
1452 | #define DT_FIFO 1 | 1452 | #define DT_FIFO 1 |
1453 | #define DT_CHR 2 | 1453 | #define DT_CHR 2 |
1454 | #define DT_DIR 4 | 1454 | #define DT_DIR 4 |
1455 | #define DT_BLK 6 | 1455 | #define DT_BLK 6 |
1456 | #define DT_REG 8 | 1456 | #define DT_REG 8 |
1457 | #define DT_LNK 10 | 1457 | #define DT_LNK 10 |
1458 | #define DT_SOCK 12 | 1458 | #define DT_SOCK 12 |
1459 | #define DT_WHT 14 | 1459 | #define DT_WHT 14 |
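/*
 * Illustrative sketch, not part of this commit: converting an i_mode to
 * its DT_* value using the bit layout noted above. For example S_IFREG
 * is 0100000, so (S_IFREG >> 12) == 8 == DT_REG.
 */
static inline unsigned char example_mode_to_dtype(umode_t mode)
{
	return (mode >> 12) & 15;
}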
1460 | 1460 | ||
1461 | #define OSYNC_METADATA (1<<0) | 1461 | #define OSYNC_METADATA (1<<0) |
1462 | #define OSYNC_DATA (1<<1) | 1462 | #define OSYNC_DATA (1<<1) |
1463 | #define OSYNC_INODE (1<<2) | 1463 | #define OSYNC_INODE (1<<2) |
1464 | int generic_osync_inode(struct inode *, struct address_space *, int); | 1464 | int generic_osync_inode(struct inode *, struct address_space *, int); |
1465 | 1465 | ||
1466 | /* | 1466 | /* |
1467 | * This is the "filldir" function type, used by readdir() to let | 1467 | * This is the "filldir" function type, used by readdir() to let |
1468 | * the kernel specify what kind of dirent layout it wants to have. | 1468 | * the kernel specify what kind of dirent layout it wants to have. |
1469 | * This allows the kernel to read directories into kernel space or | 1469 | * This allows the kernel to read directories into kernel space or |
1470 | * to have different dirent layouts depending on the binary type. | 1470 | * to have different dirent layouts depending on the binary type. |
1471 | */ | 1471 | */ |
1472 | typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); | 1472 | typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); |
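/*
 * Illustrative sketch, not part of this commit: a minimal filldir_t
 * callback that merely counts entries. A filesystem's ->readdir() calls
 * it once per directory entry; a nonzero return value stops the walk.
 */
static int example_count_filldir(void *buf, const char *name, int namlen,
				 loff_t offset, u64 ino, unsigned d_type)
{
	(*(unsigned long *)buf)++;	/* caller passed &count as buf */
	return 0;			/* keep iterating */
}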
1473 | struct block_device_operations; | 1473 | struct block_device_operations; |
1474 | 1474 | ||
1475 | /* These macros are for out-of-kernel modules to test that | 1475 | /* These macros are for out-of-kernel modules to test that |
1476 | * the kernel supports the unlocked_ioctl and compat_ioctl | 1476 | * the kernel supports the unlocked_ioctl and compat_ioctl |
1477 | * fields in struct file_operations. */ | 1477 | * fields in struct file_operations. */ |
1478 | #define HAVE_COMPAT_IOCTL 1 | 1478 | #define HAVE_COMPAT_IOCTL 1 |
1479 | #define HAVE_UNLOCKED_IOCTL 1 | 1479 | #define HAVE_UNLOCKED_IOCTL 1 |
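/*
 * Illustrative sketch, not part of this commit: an out-of-kernel module
 * keys off these macros so one source tree builds both on kernels that
 * have the unlocked_ioctl field and on older ones that only offer the
 * BKL-protected ->ioctl().
 */
#ifdef HAVE_UNLOCKED_IOCTL
static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	return -ENOTTY;		/* runs without the big kernel lock */
}
#endif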
1480 | 1480 | ||
1481 | /* | 1481 | /* |
1482 | * NOTE: | 1482 | * NOTE: |
1483 | * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl | 1483 | * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl |
1484 | * can be called without the big kernel lock held in all filesystems. | 1484 | * can be called without the big kernel lock held in all filesystems. |
1485 | */ | 1485 | */ |
1486 | struct file_operations { | 1486 | struct file_operations { |
1487 | struct module *owner; | 1487 | struct module *owner; |
1488 | loff_t (*llseek) (struct file *, loff_t, int); | 1488 | loff_t (*llseek) (struct file *, loff_t, int); |
1489 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); | 1489 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); |
1490 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); | 1490 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); |
1491 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 1491 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
1492 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 1492 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
1493 | int (*readdir) (struct file *, void *, filldir_t); | 1493 | int (*readdir) (struct file *, void *, filldir_t); |
1494 | unsigned int (*poll) (struct file *, struct poll_table_struct *); | 1494 | unsigned int (*poll) (struct file *, struct poll_table_struct *); |
1495 | int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); | 1495 | int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); |
1496 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); | 1496 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); |
1497 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); | 1497 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); |
1498 | int (*mmap) (struct file *, struct vm_area_struct *); | 1498 | int (*mmap) (struct file *, struct vm_area_struct *); |
1499 | int (*open) (struct inode *, struct file *); | 1499 | int (*open) (struct inode *, struct file *); |
1500 | int (*flush) (struct file *, fl_owner_t id); | 1500 | int (*flush) (struct file *, fl_owner_t id); |
1501 | int (*release) (struct inode *, struct file *); | 1501 | int (*release) (struct inode *, struct file *); |
1502 | int (*fsync) (struct file *, struct dentry *, int datasync); | 1502 | int (*fsync) (struct file *, struct dentry *, int datasync); |
1503 | int (*aio_fsync) (struct kiocb *, int datasync); | 1503 | int (*aio_fsync) (struct kiocb *, int datasync); |
1504 | int (*fasync) (int, struct file *, int); | 1504 | int (*fasync) (int, struct file *, int); |
1505 | int (*lock) (struct file *, int, struct file_lock *); | 1505 | int (*lock) (struct file *, int, struct file_lock *); |
1506 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); | 1506 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); |
1507 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 1507 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
1508 | int (*check_flags)(int); | 1508 | int (*check_flags)(int); |
1509 | int (*flock) (struct file *, int, struct file_lock *); | 1509 | int (*flock) (struct file *, int, struct file_lock *); |
1510 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); | 1510 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); |
1511 | ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); | 1511 | ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); |
1512 | int (*setlease)(struct file *, long, struct file_lock **); | 1512 | int (*setlease)(struct file *, long, struct file_lock **); |
1513 | }; | 1513 | }; |
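/*
 * Illustrative sketch, not part of this commit: a minimal read-only
 * instance of the structure above. Unset methods stay NULL and the VFS
 * falls back to its defaults or fails the corresponding syscall.
 */
static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,		/* from linux/module.h */
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_readonly_mmap,
};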
1514 | 1514 | ||
1515 | struct inode_operations { | 1515 | struct inode_operations { |
1516 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); | 1516 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); |
1517 | struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); | 1517 | struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); |
1518 | int (*link) (struct dentry *,struct inode *,struct dentry *); | 1518 | int (*link) (struct dentry *,struct inode *,struct dentry *); |
1519 | int (*unlink) (struct inode *,struct dentry *); | 1519 | int (*unlink) (struct inode *,struct dentry *); |
1520 | int (*symlink) (struct inode *,struct dentry *,const char *); | 1520 | int (*symlink) (struct inode *,struct dentry *,const char *); |
1521 | int (*mkdir) (struct inode *,struct dentry *,int); | 1521 | int (*mkdir) (struct inode *,struct dentry *,int); |
1522 | int (*rmdir) (struct inode *,struct dentry *); | 1522 | int (*rmdir) (struct inode *,struct dentry *); |
1523 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); | 1523 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); |
1524 | int (*rename) (struct inode *, struct dentry *, | 1524 | int (*rename) (struct inode *, struct dentry *, |
1525 | struct inode *, struct dentry *); | 1525 | struct inode *, struct dentry *); |
1526 | int (*readlink) (struct dentry *, char __user *,int); | 1526 | int (*readlink) (struct dentry *, char __user *,int); |
1527 | void * (*follow_link) (struct dentry *, struct nameidata *); | 1527 | void * (*follow_link) (struct dentry *, struct nameidata *); |
1528 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 1528 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
1529 | void (*truncate) (struct inode *); | 1529 | void (*truncate) (struct inode *); |
1530 | int (*permission) (struct inode *, int); | 1530 | int (*permission) (struct inode *, int); |
1531 | int (*setattr) (struct dentry *, struct iattr *); | 1531 | int (*setattr) (struct dentry *, struct iattr *); |
1532 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); | 1532 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); |
1533 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 1533 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
1534 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); | 1534 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); |
1535 | ssize_t (*listxattr) (struct dentry *, char *, size_t); | 1535 | ssize_t (*listxattr) (struct dentry *, char *, size_t); |
1536 | int (*removexattr) (struct dentry *, const char *); | 1536 | int (*removexattr) (struct dentry *, const char *); |
1537 | void (*truncate_range)(struct inode *, loff_t, loff_t); | 1537 | void (*truncate_range)(struct inode *, loff_t, loff_t); |
1538 | long (*fallocate)(struct inode *inode, int mode, loff_t offset, | 1538 | long (*fallocate)(struct inode *inode, int mode, loff_t offset, |
1539 | loff_t len); | 1539 | loff_t len); |
1540 | int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, | 1540 | int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, |
1541 | u64 len); | 1541 | u64 len); |
1542 | }; | 1542 | }; |
1543 | 1543 | ||
1544 | struct seq_file; | 1544 | struct seq_file; |
1545 | 1545 | ||
1546 | ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, | 1546 | ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, |
1547 | unsigned long nr_segs, unsigned long fast_segs, | 1547 | unsigned long nr_segs, unsigned long fast_segs, |
1548 | struct iovec *fast_pointer, | 1548 | struct iovec *fast_pointer, |
1549 | struct iovec **ret_pointer); | 1549 | struct iovec **ret_pointer); |
1550 | 1550 | ||
1551 | extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); | 1551 | extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); |
1552 | extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); | 1552 | extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); |
1553 | extern ssize_t vfs_readv(struct file *, const struct iovec __user *, | 1553 | extern ssize_t vfs_readv(struct file *, const struct iovec __user *, |
1554 | unsigned long, loff_t *); | 1554 | unsigned long, loff_t *); |
1555 | extern ssize_t vfs_writev(struct file *, const struct iovec __user *, | 1555 | extern ssize_t vfs_writev(struct file *, const struct iovec __user *, |
1556 | unsigned long, loff_t *); | 1556 | unsigned long, loff_t *); |
1557 | 1557 | ||
1558 | struct super_operations { | 1558 | struct super_operations { |
1559 | struct inode *(*alloc_inode)(struct super_block *sb); | 1559 | struct inode *(*alloc_inode)(struct super_block *sb); |
1560 | void (*destroy_inode)(struct inode *); | 1560 | void (*destroy_inode)(struct inode *); |
1561 | 1561 | ||
1562 | void (*dirty_inode) (struct inode *); | 1562 | void (*dirty_inode) (struct inode *); |
1563 | int (*write_inode) (struct inode *, int); | 1563 | int (*write_inode) (struct inode *, int); |
1564 | void (*drop_inode) (struct inode *); | 1564 | void (*drop_inode) (struct inode *); |
1565 | void (*delete_inode) (struct inode *); | 1565 | void (*delete_inode) (struct inode *); |
1566 | void (*put_super) (struct super_block *); | 1566 | void (*put_super) (struct super_block *); |
1567 | void (*write_super) (struct super_block *); | 1567 | void (*write_super) (struct super_block *); |
1568 | int (*sync_fs)(struct super_block *sb, int wait); | 1568 | int (*sync_fs)(struct super_block *sb, int wait); |
1569 | int (*freeze_fs) (struct super_block *); | 1569 | int (*freeze_fs) (struct super_block *); |
1570 | int (*unfreeze_fs) (struct super_block *); | 1570 | int (*unfreeze_fs) (struct super_block *); |
1571 | int (*statfs) (struct dentry *, struct kstatfs *); | 1571 | int (*statfs) (struct dentry *, struct kstatfs *); |
1572 | int (*remount_fs) (struct super_block *, int *, char *); | 1572 | int (*remount_fs) (struct super_block *, int *, char *); |
1573 | void (*clear_inode) (struct inode *); | 1573 | void (*clear_inode) (struct inode *); |
1574 | void (*umount_begin) (struct super_block *); | 1574 | void (*umount_begin) (struct super_block *); |
1575 | 1575 | ||
1576 | int (*show_options)(struct seq_file *, struct vfsmount *); | 1576 | int (*show_options)(struct seq_file *, struct vfsmount *); |
1577 | int (*show_stats)(struct seq_file *, struct vfsmount *); | 1577 | int (*show_stats)(struct seq_file *, struct vfsmount *); |
1578 | #ifdef CONFIG_QUOTA | 1578 | #ifdef CONFIG_QUOTA |
1579 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); | 1579 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); |
1580 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); | 1580 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); |
1581 | #endif | 1581 | #endif |
1582 | int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); | 1582 | int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); |
1583 | }; | 1583 | }; |
1584 | 1584 | ||
1585 | /* | 1585 | /* |
1586 | * Inode state bits. Protected by inode_lock. | 1586 | * Inode state bits. Protected by inode_lock. |
1587 | * | 1587 | * |
1588 | * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, | 1588 | * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, |
1589 | * I_DIRTY_DATASYNC and I_DIRTY_PAGES. | 1589 | * I_DIRTY_DATASYNC and I_DIRTY_PAGES. |
1590 | * | 1590 | * |
1591 | * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, | 1591 | * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, |
1592 | * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at | 1592 | * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at |
1593 | * various stages of removing an inode. | 1593 | * various stages of removing an inode. |
1594 | * | 1594 | * |
1595 | * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. | 1595 | * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. |
1596 | * | 1596 | * |
1597 | * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on | 1597 | * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on |
1598 | * fdatasync(). i_atime is the usual cause. | 1598 | * fdatasync(). i_atime is the usual cause. |
1599 | * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of | 1599 | * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of |
1600 | * these changes separately from I_DIRTY_SYNC so that we | 1600 | * these changes separately from I_DIRTY_SYNC so that we |
1601 | * don't have to write inode on fdatasync() when only | 1601 | * don't have to write inode on fdatasync() when only |
1602 | * mtime has changed in it. | 1602 | * mtime has changed in it. |
1603 | * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. | 1603 | * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. |
1604 | * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both | 1604 | * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both |
1605 | * are cleared by unlock_new_inode(), called from iget(). | 1605 | * are cleared by unlock_new_inode(), called from iget(). |
1606 | * I_WILL_FREE Must be set when calling write_inode_now() if i_count | 1606 | * I_WILL_FREE Must be set when calling write_inode_now() if i_count |
1607 | * is zero. I_FREEING must be set when I_WILL_FREE is | 1607 | * is zero. I_FREEING must be set when I_WILL_FREE is |
1608 | * cleared. | 1608 | * cleared. |
1609 | * I_FREEING Set when inode is about to be freed but still has dirty | 1609 | * I_FREEING Set when inode is about to be freed but still has dirty |
1610 | * pages or buffers attached or the inode itself is still | 1610 | * pages or buffers attached or the inode itself is still |
1611 | * dirty. | 1611 | * dirty. |
1612 | * I_CLEAR Set by clear_inode(). In this state the inode is clean | 1612 | * I_CLEAR Set by clear_inode(). In this state the inode is clean |
1613 | * and can be destroyed. | 1613 | * and can be destroyed. |
1614 | * | 1614 | * |
1615 | * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are | 1615 | * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are |
1616 | * prohibited for many purposes. iget() must wait for | 1616 | * prohibited for many purposes. iget() must wait for |
1617 | * the inode to be completely released, then create it | 1617 | * the inode to be completely released, then create it |
1618 | * anew. Other functions will just ignore such inodes, | 1618 | * anew. Other functions will just ignore such inodes, |
1619 | * if appropriate. I_LOCK is used for waiting. | 1619 | * if appropriate. I_LOCK is used for waiting. |
1620 | * | 1620 | * |
1621 | * I_LOCK Serves as both a mutex and completion notification. | 1621 | * I_LOCK Serves as both a mutex and completion notification. |
1622 | * New inodes set I_LOCK. If two processes both create | 1622 | * New inodes set I_LOCK. If two processes both create |
1623 | * the same inode, one of them will release its inode and | 1623 | * the same inode, one of them will release its inode and |
1624 | * wait for I_LOCK to be released before returning. | 1624 | * wait for I_LOCK to be released before returning. |
1625 | * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can | 1625 | * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can |
1626 | * also cause waiting on I_LOCK, without I_LOCK actually | 1626 | * also cause waiting on I_LOCK, without I_LOCK actually |
1627 | * being set. find_inode() uses this to prevent returning | 1627 | * being set. find_inode() uses this to prevent returning |
1628 | * nearly-dead inodes. | 1628 | * nearly-dead inodes. |
1629 | * I_SYNC Similar to I_LOCK, but limited in scope to writeback | 1629 | * I_SYNC Similar to I_LOCK, but limited in scope to writeback |
1630 | * of inode dirty data. Having a separate lock for this | 1630 | * of inode dirty data. Having a separate lock for this |
1631 | * purpose reduces latency and prevents some filesystem- | 1631 | * purpose reduces latency and prevents some filesystem- |
1632 | * specific deadlocks. | 1632 | * specific deadlocks. |
1633 | * | 1633 | * |
1634 | * Q: What is the difference between I_WILL_FREE and I_FREEING? | 1634 | * Q: What is the difference between I_WILL_FREE and I_FREEING? |
1635 | * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on | 1635 | * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on |
1636 | * I_CLEAR? If not, why? | 1636 | * I_CLEAR? If not, why? |
1637 | */ | 1637 | */ |
1638 | #define I_DIRTY_SYNC 1 | 1638 | #define I_DIRTY_SYNC 1 |
1639 | #define I_DIRTY_DATASYNC 2 | 1639 | #define I_DIRTY_DATASYNC 2 |
1640 | #define I_DIRTY_PAGES 4 | 1640 | #define I_DIRTY_PAGES 4 |
1641 | #define I_NEW 8 | 1641 | #define I_NEW 8 |
1642 | #define I_WILL_FREE 16 | 1642 | #define I_WILL_FREE 16 |
1643 | #define I_FREEING 32 | 1643 | #define I_FREEING 32 |
1644 | #define I_CLEAR 64 | 1644 | #define I_CLEAR 64 |
1645 | #define __I_LOCK 7 | 1645 | #define __I_LOCK 7 |
1646 | #define I_LOCK (1 << __I_LOCK) | 1646 | #define I_LOCK (1 << __I_LOCK) |
1647 | #define __I_SYNC 8 | 1647 | #define __I_SYNC 8 |
1648 | #define I_SYNC (1 << __I_SYNC) | 1648 | #define I_SYNC (1 << __I_SYNC) |
1649 | 1649 | ||
1650 | #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) | 1650 | #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) |
1651 | 1651 | ||
1652 | extern void __mark_inode_dirty(struct inode *, int); | 1652 | extern void __mark_inode_dirty(struct inode *, int); |
1653 | static inline void mark_inode_dirty(struct inode *inode) | 1653 | static inline void mark_inode_dirty(struct inode *inode) |
1654 | { | 1654 | { |
1655 | __mark_inode_dirty(inode, I_DIRTY); | 1655 | __mark_inode_dirty(inode, I_DIRTY); |
1656 | } | 1656 | } |
1657 | 1657 | ||
1658 | static inline void mark_inode_dirty_sync(struct inode *inode) | 1658 | static inline void mark_inode_dirty_sync(struct inode *inode) |
1659 | { | 1659 | { |
1660 | __mark_inode_dirty(inode, I_DIRTY_SYNC); | 1660 | __mark_inode_dirty(inode, I_DIRTY_SYNC); |
1661 | } | 1661 | } |
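In practice the distinction between the dirty flags matters mainly for fdatasync(). A minimal sketch of the two helper calls a filesystem would make (inode is assumed to be a valid, referenced inode):

    /* data and mtime changed: must be written even by fdatasync() */
    mark_inode_dirty(inode);	/* I_DIRTY_SYNC|I_DIRTY_DATASYNC|I_DIRTY_PAGES */

    /* atime-only update: fdatasync() may legitimately skip it */
    mark_inode_dirty_sync(inode);	/* I_DIRTY_SYNC only */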
1662 | 1662 | ||
1663 | /** | 1663 | /** |
1664 | * inc_nlink - directly increment an inode's link count | 1664 | * inc_nlink - directly increment an inode's link count |
1665 | * @inode: inode | 1665 | * @inode: inode |
1666 | * | 1666 | * |
1667 | * This is a low-level filesystem helper to replace any | 1667 | * This is a low-level filesystem helper to replace any |
1668 | * direct filesystem manipulation of i_nlink. Currently, | 1668 | * direct filesystem manipulation of i_nlink. Currently, |
1669 | * it is only here for parity with dec_nlink(). | 1669 | * it is only here for parity with dec_nlink(). |
1670 | */ | 1670 | */ |
1671 | static inline void inc_nlink(struct inode *inode) | 1671 | static inline void inc_nlink(struct inode *inode) |
1672 | { | 1672 | { |
1673 | inode->i_nlink++; | 1673 | inode->i_nlink++; |
1674 | } | 1674 | } |
1675 | 1675 | ||
1676 | static inline void inode_inc_link_count(struct inode *inode) | 1676 | static inline void inode_inc_link_count(struct inode *inode) |
1677 | { | 1677 | { |
1678 | inc_nlink(inode); | 1678 | inc_nlink(inode); |
1679 | mark_inode_dirty(inode); | 1679 | mark_inode_dirty(inode); |
1680 | } | 1680 | } |
1681 | 1681 | ||
1682 | /** | 1682 | /** |
1683 | * drop_nlink - directly drop an inode's link count | 1683 | * drop_nlink - directly drop an inode's link count |
1684 | * @inode: inode | 1684 | * @inode: inode |
1685 | * | 1685 | * |
1686 | * This is a low-level filesystem helper to replace any | 1686 | * This is a low-level filesystem helper to replace any |
1687 | * direct filesystem manipulation of i_nlink. In cases | 1687 | * direct filesystem manipulation of i_nlink. In cases |
1688 | * where we are attempting to track writes to the | 1688 | * where we are attempting to track writes to the |
1689 | * filesystem, a decrement to zero means an imminent | 1689 | * filesystem, a decrement to zero means an imminent |
1690 | * write when the file is truncated and actually unlinked | 1690 | * write when the file is truncated and actually unlinked |
1691 | * on the filesystem. | 1691 | * on the filesystem. |
1692 | */ | 1692 | */ |
1693 | static inline void drop_nlink(struct inode *inode) | 1693 | static inline void drop_nlink(struct inode *inode) |
1694 | { | 1694 | { |
1695 | inode->i_nlink--; | 1695 | inode->i_nlink--; |
1696 | } | 1696 | } |
1697 | 1697 | ||
1698 | /** | 1698 | /** |
1699 | * clear_nlink - directly zero an inode's link count | 1699 | * clear_nlink - directly zero an inode's link count |
1700 | * @inode: inode | 1700 | * @inode: inode |
1701 | * | 1701 | * |
1702 | * This is a low-level filesystem helper to replace any | 1702 | * This is a low-level filesystem helper to replace any |
1703 | * direct filesystem manipulation of i_nlink. See | 1703 | * direct filesystem manipulation of i_nlink. See |
1704 | * drop_nlink() for why we care about i_nlink hitting zero. | 1704 | * drop_nlink() for why we care about i_nlink hitting zero. |
1705 | */ | 1705 | */ |
1706 | static inline void clear_nlink(struct inode *inode) | 1706 | static inline void clear_nlink(struct inode *inode) |
1707 | { | 1707 | { |
1708 | inode->i_nlink = 0; | 1708 | inode->i_nlink = 0; |
1709 | } | 1709 | } |
1710 | 1710 | ||
1711 | static inline void inode_dec_link_count(struct inode *inode) | 1711 | static inline void inode_dec_link_count(struct inode *inode) |
1712 | { | 1712 | { |
1713 | drop_nlink(inode); | 1713 | drop_nlink(inode); |
1714 | mark_inode_dirty(inode); | 1714 | mark_inode_dirty(inode); |
1715 | } | 1715 | } |
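As a sketch of why these helpers exist, here is how a simple filesystem's link() and unlink() methods of this era would use them instead of poking i_nlink directly; the examplefs_* names are hypothetical:

    static int examplefs_link(struct dentry *old_dentry, struct inode *dir,
    			      struct dentry *dentry)
    {
    	struct inode *inode = old_dentry->d_inode;

    	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
    	inode_inc_link_count(inode);	/* i_nlink++ and mark the inode dirty */
    	atomic_inc(&inode->i_count);	/* the new dentry holds a reference */
    	d_instantiate(dentry, inode);
    	return 0;
    }

    static int examplefs_unlink(struct inode *dir, struct dentry *dentry)
    {
    	inode_dec_link_count(dentry->d_inode);	/* may drop i_nlink to zero */
    	return 0;
    }

A real unlink() would also remove the directory entry first; dropping the link count to zero is what later triggers the on-disk truncate described in the comment above.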
1716 | 1716 | ||
1717 | /** | 1717 | /** |
1718 | * inode_inc_iversion - increments i_version | 1718 | * inode_inc_iversion - increments i_version |
1719 | * @inode: inode that needs to be updated | 1719 | * @inode: inode that needs to be updated |
1720 | * | 1720 | * |
1721 | * Every time the inode is modified, the i_version field will be incremented. | 1721 | * Every time the inode is modified, the i_version field will be incremented. |
1722 | * The filesystem has to be mounted with the i_version flag. | 1722 | * The filesystem has to be mounted with the i_version flag. |
1723 | */ | 1723 | */ |
1724 | 1724 | ||
1725 | static inline void inode_inc_iversion(struct inode *inode) | 1725 | static inline void inode_inc_iversion(struct inode *inode) |
1726 | { | 1726 | { |
1727 | spin_lock(&inode->i_lock); | 1727 | spin_lock(&inode->i_lock); |
1728 | inode->i_version++; | 1728 | inode->i_version++; |
1729 | spin_unlock(&inode->i_lock); | 1729 | spin_unlock(&inode->i_lock); |
1730 | } | 1730 | } |
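A hedged usage sketch: a directory-modifying operation bumps i_version so NFS-style clients can detect the change, then dirties the inode so the bump reaches disk:

    inode_inc_iversion(dir);	/* only meaningful on an i_version mount */
    mark_inode_dirty_sync(dir);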
1731 | 1731 | ||
1732 | extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); | 1732 | extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); |
1733 | static inline void file_accessed(struct file *file) | 1733 | static inline void file_accessed(struct file *file) |
1734 | { | 1734 | { |
1735 | if (!(file->f_flags & O_NOATIME)) | 1735 | if (!(file->f_flags & O_NOATIME)) |
1736 | touch_atime(file->f_path.mnt, file->f_path.dentry); | 1736 | touch_atime(file->f_path.mnt, file->f_path.dentry); |
1737 | } | 1737 | } |
1738 | 1738 | ||
1739 | int sync_inode(struct inode *inode, struct writeback_control *wbc); | 1739 | int sync_inode(struct inode *inode, struct writeback_control *wbc); |
1740 | 1740 | ||
1741 | struct file_system_type { | 1741 | struct file_system_type { |
1742 | const char *name; | 1742 | const char *name; |
1743 | int fs_flags; | 1743 | int fs_flags; |
1744 | int (*get_sb) (struct file_system_type *, int, | 1744 | int (*get_sb) (struct file_system_type *, int, |
1745 | const char *, void *, struct vfsmount *); | 1745 | const char *, void *, struct vfsmount *); |
1746 | void (*kill_sb) (struct super_block *); | 1746 | void (*kill_sb) (struct super_block *); |
1747 | struct module *owner; | 1747 | struct module *owner; |
1748 | struct file_system_type * next; | 1748 | struct file_system_type * next; |
1749 | struct list_head fs_supers; | 1749 | struct list_head fs_supers; |
1750 | 1750 | ||
1751 | struct lock_class_key s_lock_key; | 1751 | struct lock_class_key s_lock_key; |
1752 | struct lock_class_key s_umount_key; | 1752 | struct lock_class_key s_umount_key; |
1753 | 1753 | ||
1754 | struct lock_class_key i_lock_key; | 1754 | struct lock_class_key i_lock_key; |
1755 | struct lock_class_key i_mutex_key; | 1755 | struct lock_class_key i_mutex_key; |
1756 | struct lock_class_key i_mutex_dir_key; | 1756 | struct lock_class_key i_mutex_dir_key; |
1757 | struct lock_class_key i_alloc_sem_key; | 1757 | struct lock_class_key i_alloc_sem_key; |
1758 | }; | 1758 | }; |
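Tying the pieces together, a block-device-backed filesystem of this era wires get_sb to get_sb_bdev() (declared just below) and registers itself at module init. A minimal sketch; examplefs_fill_super is an assumed helper that reads the superblock off disk:

    static int examplefs_get_sb(struct file_system_type *fs_type, int flags,
    			    const char *dev_name, void *data,
    			    struct vfsmount *mnt)
    {
    	return get_sb_bdev(fs_type, flags, dev_name, data,
    			   examplefs_fill_super, mnt);
    }

    static struct file_system_type examplefs_fs_type = {
    	.owner		= THIS_MODULE,
    	.name		= "examplefs",
    	.get_sb		= examplefs_get_sb,
    	.kill_sb	= kill_block_super,	/* undoes get_sb_bdev() */
    	.fs_flags	= FS_REQUIRES_DEV,
    };

    static int __init examplefs_init(void)
    {
    	return register_filesystem(&examplefs_fs_type);
    }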
1759 | 1759 | ||
1760 | extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, | 1760 | extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, |
1761 | int (*fill_super)(struct super_block *, void *, int), | 1761 | int (*fill_super)(struct super_block *, void *, int), |
1762 | struct vfsmount *mnt); | 1762 | struct vfsmount *mnt); |
1763 | extern int get_sb_bdev(struct file_system_type *fs_type, | 1763 | extern int get_sb_bdev(struct file_system_type *fs_type, |
1764 | int flags, const char *dev_name, void *data, | 1764 | int flags, const char *dev_name, void *data, |
1765 | int (*fill_super)(struct super_block *, void *, int), | 1765 | int (*fill_super)(struct super_block *, void *, int), |
1766 | struct vfsmount *mnt); | 1766 | struct vfsmount *mnt); |
1767 | extern int get_sb_single(struct file_system_type *fs_type, | 1767 | extern int get_sb_single(struct file_system_type *fs_type, |
1768 | int flags, void *data, | 1768 | int flags, void *data, |
1769 | int (*fill_super)(struct super_block *, void *, int), | 1769 | int (*fill_super)(struct super_block *, void *, int), |
1770 | struct vfsmount *mnt); | 1770 | struct vfsmount *mnt); |
1771 | extern int get_sb_nodev(struct file_system_type *fs_type, | 1771 | extern int get_sb_nodev(struct file_system_type *fs_type, |
1772 | int flags, void *data, | 1772 | int flags, void *data, |
1773 | int (*fill_super)(struct super_block *, void *, int), | 1773 | int (*fill_super)(struct super_block *, void *, int), |
1774 | struct vfsmount *mnt); | 1774 | struct vfsmount *mnt); |
1775 | void generic_shutdown_super(struct super_block *sb); | 1775 | void generic_shutdown_super(struct super_block *sb); |
1776 | void kill_block_super(struct super_block *sb); | 1776 | void kill_block_super(struct super_block *sb); |
1777 | void kill_anon_super(struct super_block *sb); | 1777 | void kill_anon_super(struct super_block *sb); |
1778 | void kill_litter_super(struct super_block *sb); | 1778 | void kill_litter_super(struct super_block *sb); |
1779 | void deactivate_super(struct super_block *sb); | 1779 | void deactivate_super(struct super_block *sb); |
1780 | void deactivate_locked_super(struct super_block *sb); | 1780 | void deactivate_locked_super(struct super_block *sb); |
1781 | int set_anon_super(struct super_block *s, void *data); | 1781 | int set_anon_super(struct super_block *s, void *data); |
1782 | struct super_block *sget(struct file_system_type *type, | 1782 | struct super_block *sget(struct file_system_type *type, |
1783 | int (*test)(struct super_block *,void *), | 1783 | int (*test)(struct super_block *,void *), |
1784 | int (*set)(struct super_block *,void *), | 1784 | int (*set)(struct super_block *,void *), |
1785 | void *data); | 1785 | void *data); |
1786 | extern int get_sb_pseudo(struct file_system_type *, char *, | 1786 | extern int get_sb_pseudo(struct file_system_type *, char *, |
1787 | const struct super_operations *ops, unsigned long, | 1787 | const struct super_operations *ops, unsigned long, |
1788 | struct vfsmount *mnt); | 1788 | struct vfsmount *mnt); |
1789 | extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); | 1789 | extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); |
1790 | int __put_super_and_need_restart(struct super_block *sb); | 1790 | int __put_super_and_need_restart(struct super_block *sb); |
1791 | 1791 | ||
1792 | /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ | 1792 | /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ |
1793 | #define fops_get(fops) \ | 1793 | #define fops_get(fops) \ |
1794 | (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) | 1794 | (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) |
1795 | #define fops_put(fops) \ | 1795 | #define fops_put(fops) \ |
1796 | do { if (fops) module_put((fops)->owner); } while(0) | 1796 | do { if (fops) module_put((fops)->owner); } while(0) |
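The fops_get()/fops_put() pair pins the module that owns a file_operations table for as long as a file uses it; character-device open paths use exactly this pattern. A hedged sketch, where cdev_fops stands in for whatever table was looked up:

    filp->f_op = fops_get(cdev_fops);	/* NULL if the owning module is gone */
    if (!filp->f_op)
    	return -ENXIO;
    /* ... file is in use ... */
    fops_put(filp->f_op);		/* on release: drop the module reference */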
1797 | 1797 | ||
1798 | extern int register_filesystem(struct file_system_type *); | 1798 | extern int register_filesystem(struct file_system_type *); |
1799 | extern int unregister_filesystem(struct file_system_type *); | 1799 | extern int unregister_filesystem(struct file_system_type *); |
1800 | extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); | 1800 | extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); |
1801 | #define kern_mount(type) kern_mount_data(type, NULL) | 1801 | #define kern_mount(type) kern_mount_data(type, NULL) |
1802 | extern int may_umount_tree(struct vfsmount *); | 1802 | extern int may_umount_tree(struct vfsmount *); |
1803 | extern int may_umount(struct vfsmount *); | 1803 | extern int may_umount(struct vfsmount *); |
1804 | extern long do_mount(char *, char *, char *, unsigned long, void *); | 1804 | extern long do_mount(char *, char *, char *, unsigned long, void *); |
1805 | extern struct vfsmount *collect_mounts(struct path *); | 1805 | extern struct vfsmount *collect_mounts(struct path *); |
1806 | extern void drop_collected_mounts(struct vfsmount *); | 1806 | extern void drop_collected_mounts(struct vfsmount *); |
1807 | 1807 | ||
1808 | extern int vfs_statfs(struct dentry *, struct kstatfs *); | 1808 | extern int vfs_statfs(struct dentry *, struct kstatfs *); |
1809 | 1809 | ||
1810 | extern int current_umask(void); | 1810 | extern int current_umask(void); |
1811 | 1811 | ||
1812 | /* /sys/fs */ | 1812 | /* /sys/fs */ |
1813 | extern struct kobject *fs_kobj; | 1813 | extern struct kobject *fs_kobj; |
1814 | 1814 | ||
1815 | extern int rw_verify_area(int, struct file *, loff_t *, size_t); | 1815 | extern int rw_verify_area(int, struct file *, loff_t *, size_t); |
1816 | 1816 | ||
1817 | #define FLOCK_VERIFY_READ 1 | 1817 | #define FLOCK_VERIFY_READ 1 |
1818 | #define FLOCK_VERIFY_WRITE 2 | 1818 | #define FLOCK_VERIFY_WRITE 2 |
1819 | 1819 | ||
1820 | #ifdef CONFIG_FILE_LOCKING | 1820 | #ifdef CONFIG_FILE_LOCKING |
1821 | extern int locks_mandatory_locked(struct inode *); | 1821 | extern int locks_mandatory_locked(struct inode *); |
1822 | extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); | 1822 | extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); |
1823 | 1823 | ||
1824 | /* | 1824 | /* |
1825 | * Candidates for mandatory locking have the setgid bit set | 1825 | * Candidates for mandatory locking have the setgid bit set |
1826 | * but no group execute bit - an otherwise meaningless combination. | 1826 | * but no group execute bit - an otherwise meaningless combination. |
1827 | */ | 1827 | */ |
1828 | 1828 | ||
1829 | static inline int __mandatory_lock(struct inode *ino) | 1829 | static inline int __mandatory_lock(struct inode *ino) |
1830 | { | 1830 | { |
1831 | return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; | 1831 | return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; |
1832 | } | 1832 | } |
1833 | 1833 | ||
1834 | /* | 1834 | /* |
1835 | * ... and these candidates should be on MS_MANDLOCK mounted fs, | 1835 | * ... and these candidates should be on MS_MANDLOCK mounted fs, |
1836 | * otherwise these will be advisory locks | 1836 | * otherwise these will be advisory locks |
1837 | */ | 1837 | */ |
1838 | 1838 | ||
1839 | static inline int mandatory_lock(struct inode *ino) | 1839 | static inline int mandatory_lock(struct inode *ino) |
1840 | { | 1840 | { |
1841 | return IS_MANDLOCK(ino) && __mandatory_lock(ino); | 1841 | return IS_MANDLOCK(ino) && __mandatory_lock(ino); |
1842 | } | 1842 | } |
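Concretely, the candidate encoding means a mode like 02644 on a regular file: setgid on, group execute off. A small sketch of the bit test:

    inode->i_mode = S_IFREG | S_ISGID | 0644;	/* S_IXGRP deliberately clear */
    /* (i_mode & (S_ISGID | S_IXGRP)) == S_ISGID, so __mandatory_lock() is   */
    /* true; mandatory_lock() additionally requires an MS_MANDLOCK mount.    */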
1843 | 1843 | ||
1844 | static inline int locks_verify_locked(struct inode *inode) | 1844 | static inline int locks_verify_locked(struct inode *inode) |
1845 | { | 1845 | { |
1846 | if (mandatory_lock(inode)) | 1846 | if (mandatory_lock(inode)) |
1847 | return locks_mandatory_locked(inode); | 1847 | return locks_mandatory_locked(inode); |
1848 | return 0; | 1848 | return 0; |
1849 | } | 1849 | } |
1850 | 1850 | ||
1851 | static inline int locks_verify_truncate(struct inode *inode, | 1851 | static inline int locks_verify_truncate(struct inode *inode, |
1852 | struct file *filp, | 1852 | struct file *filp, |
1853 | loff_t size) | 1853 | loff_t size) |
1854 | { | 1854 | { |
1855 | if (inode->i_flock && mandatory_lock(inode)) | 1855 | if (inode->i_flock && mandatory_lock(inode)) |
1856 | return locks_mandatory_area( | 1856 | return locks_mandatory_area( |
1857 | FLOCK_VERIFY_WRITE, inode, filp, | 1857 | FLOCK_VERIFY_WRITE, inode, filp, |
1858 | size < inode->i_size ? size : inode->i_size, | 1858 | size < inode->i_size ? size : inode->i_size, |
1859 | (size < inode->i_size ? inode->i_size - size | 1859 | (size < inode->i_size ? inode->i_size - size |
1860 | : size - inode->i_size) | 1860 | : size - inode->i_size) |
1861 | ); | 1861 | ); |
1862 | return 0; | 1862 | return 0; |
1863 | } | 1863 | } |
1864 | 1864 | ||
1865 | static inline int break_lease(struct inode *inode, unsigned int mode) | 1865 | static inline int break_lease(struct inode *inode, unsigned int mode) |
1866 | { | 1866 | { |
1867 | if (inode->i_flock) | 1867 | if (inode->i_flock) |
1868 | return __break_lease(inode, mode); | 1868 | return __break_lease(inode, mode); |
1869 | return 0; | 1869 | return 0; |
1870 | } | 1870 | } |
1871 | #else /* !CONFIG_FILE_LOCKING */ | 1871 | #else /* !CONFIG_FILE_LOCKING */ |
1872 | static inline int locks_mandatory_locked(struct inode *inode) | 1872 | static inline int locks_mandatory_locked(struct inode *inode) |
1873 | { | 1873 | { |
1874 | return 0; | 1874 | return 0; |
1875 | } | 1875 | } |
1876 | 1876 | ||
1877 | static inline int locks_mandatory_area(int rw, struct inode *inode, | 1877 | static inline int locks_mandatory_area(int rw, struct inode *inode, |
1878 | struct file *filp, loff_t offset, | 1878 | struct file *filp, loff_t offset, |
1879 | size_t count) | 1879 | size_t count) |
1880 | { | 1880 | { |
1881 | return 0; | 1881 | return 0; |
1882 | } | 1882 | } |
1883 | 1883 | ||
1884 | static inline int __mandatory_lock(struct inode *inode) | 1884 | static inline int __mandatory_lock(struct inode *inode) |
1885 | { | 1885 | { |
1886 | return 0; | 1886 | return 0; |
1887 | } | 1887 | } |
1888 | 1888 | ||
1889 | static inline int mandatory_lock(struct inode *inode) | 1889 | static inline int mandatory_lock(struct inode *inode) |
1890 | { | 1890 | { |
1891 | return 0; | 1891 | return 0; |
1892 | } | 1892 | } |
1893 | 1893 | ||
1894 | static inline int locks_verify_locked(struct inode *inode) | 1894 | static inline int locks_verify_locked(struct inode *inode) |
1895 | { | 1895 | { |
1896 | return 0; | 1896 | return 0; |
1897 | } | 1897 | } |
1898 | 1898 | ||
1899 | static inline int locks_verify_truncate(struct inode *inode, struct file *filp, | 1899 | static inline int locks_verify_truncate(struct inode *inode, struct file *filp, |
1900 | size_t size) | 1900 | size_t size) |
1901 | { | 1901 | { |
1902 | return 0; | 1902 | return 0; |
1903 | } | 1903 | } |
1904 | 1904 | ||
1905 | static inline int break_lease(struct inode *inode, unsigned int mode) | 1905 | static inline int break_lease(struct inode *inode, unsigned int mode) |
1906 | { | 1906 | { |
1907 | return 0; | 1907 | return 0; |
1908 | } | 1908 | } |
1909 | 1909 | ||
1910 | #endif /* CONFIG_FILE_LOCKING */ | 1910 | #endif /* CONFIG_FILE_LOCKING */ |
1911 | 1911 | ||
1912 | /* fs/open.c */ | 1912 | /* fs/open.c */ |
1913 | 1913 | ||
1914 | extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, | 1914 | extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, |
1915 | struct file *filp); | 1915 | struct file *filp); |
1916 | extern int do_fallocate(struct file *file, int mode, loff_t offset, | 1916 | extern int do_fallocate(struct file *file, int mode, loff_t offset, |
1917 | loff_t len); | 1917 | loff_t len); |
1918 | extern long do_sys_open(int dfd, const char __user *filename, int flags, | 1918 | extern long do_sys_open(int dfd, const char __user *filename, int flags, |
1919 | int mode); | 1919 | int mode); |
1920 | extern struct file *filp_open(const char *, int, int); | 1920 | extern struct file *filp_open(const char *, int, int); |
1921 | extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, | 1921 | extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, |
1922 | const struct cred *); | 1922 | const struct cred *); |
1923 | extern int filp_close(struct file *, fl_owner_t id); | 1923 | extern int filp_close(struct file *, fl_owner_t id); |
1924 | extern char * getname(const char __user *); | 1924 | extern char * getname(const char __user *); |
1925 | 1925 | ||
1926 | /* fs/ioctl.c */ | 1926 | /* fs/ioctl.c */ |
1927 | 1927 | ||
1928 | extern int ioctl_preallocate(struct file *filp, void __user *argp); | 1928 | extern int ioctl_preallocate(struct file *filp, void __user *argp); |
1929 | 1929 | ||
1930 | /* fs/dcache.c */ | 1930 | /* fs/dcache.c */ |
1931 | extern void __init vfs_caches_init_early(void); | 1931 | extern void __init vfs_caches_init_early(void); |
1932 | extern void __init vfs_caches_init(unsigned long); | 1932 | extern void __init vfs_caches_init(unsigned long); |
1933 | 1933 | ||
1934 | extern struct kmem_cache *names_cachep; | 1934 | extern struct kmem_cache *names_cachep; |
1935 | 1935 | ||
1936 | #define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp)) | 1936 | #define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp)) |
1937 | #define __getname() __getname_gfp(GFP_KERNEL) | 1937 | #define __getname() __getname_gfp(GFP_KERNEL) |
1938 | #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) | 1938 | #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) |
1939 | #ifndef CONFIG_AUDITSYSCALL | 1939 | #ifndef CONFIG_AUDITSYSCALL |
1940 | #define putname(name) __putname(name) | 1940 | #define putname(name) __putname(name) |
1941 | #else | 1941 | #else |
1942 | extern void putname(const char *name); | 1942 | extern void putname(const char *name); |
1943 | #endif | 1943 | #endif |
1944 | 1944 | ||
1945 | #ifdef CONFIG_BLOCK | 1945 | #ifdef CONFIG_BLOCK |
1946 | extern int register_blkdev(unsigned int, const char *); | 1946 | extern int register_blkdev(unsigned int, const char *); |
1947 | extern void unregister_blkdev(unsigned int, const char *); | 1947 | extern void unregister_blkdev(unsigned int, const char *); |
1948 | extern struct block_device *bdget(dev_t); | 1948 | extern struct block_device *bdget(dev_t); |
1949 | extern struct block_device *bdgrab(struct block_device *bdev); | ||
1949 | extern void bd_set_size(struct block_device *, loff_t size); | 1950 | extern void bd_set_size(struct block_device *, loff_t size); |
1950 | extern void bd_forget(struct inode *inode); | 1951 | extern void bd_forget(struct inode *inode); |
1951 | extern void bdput(struct block_device *); | 1952 | extern void bdput(struct block_device *); |
1952 | extern struct block_device *open_by_devnum(dev_t, fmode_t); | 1953 | extern struct block_device *open_by_devnum(dev_t, fmode_t); |
1953 | extern void invalidate_bdev(struct block_device *); | 1954 | extern void invalidate_bdev(struct block_device *); |
1954 | extern int sync_blockdev(struct block_device *bdev); | 1955 | extern int sync_blockdev(struct block_device *bdev); |
1955 | extern struct super_block *freeze_bdev(struct block_device *); | 1956 | extern struct super_block *freeze_bdev(struct block_device *); |
1956 | extern void emergency_thaw_all(void); | 1957 | extern void emergency_thaw_all(void); |
1957 | extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); | 1958 | extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); |
1958 | extern int fsync_bdev(struct block_device *); | 1959 | extern int fsync_bdev(struct block_device *); |
1959 | #else | 1960 | #else |
1960 | static inline void bd_forget(struct inode *inode) {} | 1961 | static inline void bd_forget(struct inode *inode) {} |
1961 | static inline int sync_blockdev(struct block_device *bdev) { return 0; } | 1962 | static inline int sync_blockdev(struct block_device *bdev) { return 0; } |
1962 | static inline void invalidate_bdev(struct block_device *bdev) {} | 1963 | static inline void invalidate_bdev(struct block_device *bdev) {} |
1963 | 1964 | ||
1964 | static inline struct super_block *freeze_bdev(struct block_device *sb) | 1965 | static inline struct super_block *freeze_bdev(struct block_device *sb) |
1965 | { | 1966 | { |
1966 | return NULL; | 1967 | return NULL; |
1967 | } | 1968 | } |
1968 | 1969 | ||
1969 | static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) | 1970 | static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) |
1970 | { | 1971 | { |
1971 | return 0; | 1972 | return 0; |
1972 | } | 1973 | } |
1973 | #endif | 1974 | #endif |
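The newly declared bdgrab() only bumps the reference count on a block_device the caller already holds, so unlike bdget() it cannot sleep and is usable under a spinlock. A hedged sketch of the intended pattern (some_lock and held_bdev are hypothetical):

    struct block_device *bdev;

    spin_lock(&some_lock);		/* bdget() would be unsafe here */
    bdev = bdgrab(held_bdev);		/* copy a reference we already own */
    spin_unlock(&some_lock);

    /* ... use bdev from any context ... */
    bdput(bdev);			/* drop the copied reference */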
1974 | extern int sync_filesystem(struct super_block *); | 1975 | extern int sync_filesystem(struct super_block *); |
1975 | extern const struct file_operations def_blk_fops; | 1976 | extern const struct file_operations def_blk_fops; |
1976 | extern const struct file_operations def_chr_fops; | 1977 | extern const struct file_operations def_chr_fops; |
1977 | extern const struct file_operations bad_sock_fops; | 1978 | extern const struct file_operations bad_sock_fops; |
1978 | extern const struct file_operations def_fifo_fops; | 1979 | extern const struct file_operations def_fifo_fops; |
1979 | #ifdef CONFIG_BLOCK | 1980 | #ifdef CONFIG_BLOCK |
1980 | extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); | 1981 | extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); |
1981 | extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); | 1982 | extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); |
1982 | extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); | 1983 | extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); |
1983 | extern int blkdev_get(struct block_device *, fmode_t); | 1984 | extern int blkdev_get(struct block_device *, fmode_t); |
1984 | extern int blkdev_put(struct block_device *, fmode_t); | 1985 | extern int blkdev_put(struct block_device *, fmode_t); |
1985 | extern int bd_claim(struct block_device *, void *); | 1986 | extern int bd_claim(struct block_device *, void *); |
1986 | extern void bd_release(struct block_device *); | 1987 | extern void bd_release(struct block_device *); |
1987 | #ifdef CONFIG_SYSFS | 1988 | #ifdef CONFIG_SYSFS |
1988 | extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); | 1989 | extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); |
1989 | extern void bd_release_from_disk(struct block_device *, struct gendisk *); | 1990 | extern void bd_release_from_disk(struct block_device *, struct gendisk *); |
1990 | #else | 1991 | #else |
1991 | #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) | 1992 | #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) |
1992 | #define bd_release_from_disk(bdev, disk) bd_release(bdev) | 1993 | #define bd_release_from_disk(bdev, disk) bd_release(bdev) |
1993 | #endif | 1994 | #endif |
1994 | #endif | 1995 | #endif |
1995 | 1996 | ||
1996 | /* fs/char_dev.c */ | 1997 | /* fs/char_dev.c */ |
1997 | #define CHRDEV_MAJOR_HASH_SIZE 255 | 1998 | #define CHRDEV_MAJOR_HASH_SIZE 255 |
1998 | extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); | 1999 | extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); |
1999 | extern int register_chrdev_region(dev_t, unsigned, const char *); | 2000 | extern int register_chrdev_region(dev_t, unsigned, const char *); |
2000 | extern int register_chrdev(unsigned int, const char *, | 2001 | extern int register_chrdev(unsigned int, const char *, |
2001 | const struct file_operations *); | 2002 | const struct file_operations *); |
2002 | extern void unregister_chrdev(unsigned int, const char *); | 2003 | extern void unregister_chrdev(unsigned int, const char *); |
2003 | extern void unregister_chrdev_region(dev_t, unsigned); | 2004 | extern void unregister_chrdev_region(dev_t, unsigned); |
2004 | extern void chrdev_show(struct seq_file *,off_t); | 2005 | extern void chrdev_show(struct seq_file *,off_t); |
2005 | 2006 | ||
2006 | /* fs/block_dev.c */ | 2007 | /* fs/block_dev.c */ |
2007 | #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ | 2008 | #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ |
2008 | #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ | 2009 | #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ |
2009 | 2010 | ||
2010 | #ifdef CONFIG_BLOCK | 2011 | #ifdef CONFIG_BLOCK |
2011 | #define BLKDEV_MAJOR_HASH_SIZE 255 | 2012 | #define BLKDEV_MAJOR_HASH_SIZE 255 |
2012 | extern const char *__bdevname(dev_t, char *buffer); | 2013 | extern const char *__bdevname(dev_t, char *buffer); |
2013 | extern const char *bdevname(struct block_device *bdev, char *buffer); | 2014 | extern const char *bdevname(struct block_device *bdev, char *buffer); |
2014 | extern struct block_device *lookup_bdev(const char *); | 2015 | extern struct block_device *lookup_bdev(const char *); |
2015 | extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); | 2016 | extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); |
2016 | extern void close_bdev_exclusive(struct block_device *, fmode_t); | 2017 | extern void close_bdev_exclusive(struct block_device *, fmode_t); |
2017 | extern void blkdev_show(struct seq_file *,off_t); | 2018 | extern void blkdev_show(struct seq_file *,off_t); |
2018 | 2019 | ||
2019 | #else | 2020 | #else |
2020 | #define BLKDEV_MAJOR_HASH_SIZE 0 | 2021 | #define BLKDEV_MAJOR_HASH_SIZE 0 |
2021 | #endif | 2022 | #endif |
2022 | 2023 | ||
2023 | extern void init_special_inode(struct inode *, umode_t, dev_t); | 2024 | extern void init_special_inode(struct inode *, umode_t, dev_t); |
2024 | 2025 | ||
2025 | /* Invalid inode operations -- fs/bad_inode.c */ | 2026 | /* Invalid inode operations -- fs/bad_inode.c */ |
2026 | extern void make_bad_inode(struct inode *); | 2027 | extern void make_bad_inode(struct inode *); |
2027 | extern int is_bad_inode(struct inode *); | 2028 | extern int is_bad_inode(struct inode *); |
2028 | 2029 | ||
2029 | extern const struct file_operations read_pipefifo_fops; | 2030 | extern const struct file_operations read_pipefifo_fops; |
2030 | extern const struct file_operations write_pipefifo_fops; | 2031 | extern const struct file_operations write_pipefifo_fops; |
2031 | extern const struct file_operations rdwr_pipefifo_fops; | 2032 | extern const struct file_operations rdwr_pipefifo_fops; |
2032 | 2033 | ||
2033 | extern int fs_may_remount_ro(struct super_block *); | 2034 | extern int fs_may_remount_ro(struct super_block *); |
2034 | 2035 | ||
2035 | #ifdef CONFIG_BLOCK | 2036 | #ifdef CONFIG_BLOCK |
2036 | /* | 2037 | /* |
2037 | * return READ, READA, or WRITE | 2038 | * return READ, READA, or WRITE |
2038 | */ | 2039 | */ |
2039 | #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) | 2040 | #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) |
2040 | 2041 | ||
2041 | /* | 2042 | /* |
2042 | * return data direction, READ or WRITE | 2043 | * return data direction, READ or WRITE |
2043 | */ | 2044 | */ |
2044 | #define bio_data_dir(bio) ((bio)->bi_rw & 1) | 2045 | #define bio_data_dir(bio) ((bio)->bi_rw & 1) |
2045 | 2046 | ||
2046 | extern void check_disk_size_change(struct gendisk *disk, | 2047 | extern void check_disk_size_change(struct gendisk *disk, |
2047 | struct block_device *bdev); | 2048 | struct block_device *bdev); |
2048 | extern int revalidate_disk(struct gendisk *); | 2049 | extern int revalidate_disk(struct gendisk *); |
2049 | extern int check_disk_change(struct block_device *); | 2050 | extern int check_disk_change(struct block_device *); |
2050 | extern int __invalidate_device(struct block_device *); | 2051 | extern int __invalidate_device(struct block_device *); |
2051 | extern int invalidate_partition(struct gendisk *, int); | 2052 | extern int invalidate_partition(struct gendisk *, int); |
2052 | #endif | 2053 | #endif |
2053 | extern int invalidate_inodes(struct super_block *); | 2054 | extern int invalidate_inodes(struct super_block *); |
2054 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | 2055 | unsigned long invalidate_mapping_pages(struct address_space *mapping, |
2055 | pgoff_t start, pgoff_t end); | 2056 | pgoff_t start, pgoff_t end); |
2056 | 2057 | ||
2057 | static inline unsigned long __deprecated | 2058 | static inline unsigned long __deprecated |
2058 | invalidate_inode_pages(struct address_space *mapping) | 2059 | invalidate_inode_pages(struct address_space *mapping) |
2059 | { | 2060 | { |
2060 | return invalidate_mapping_pages(mapping, 0, ~0UL); | 2061 | return invalidate_mapping_pages(mapping, 0, ~0UL); |
2061 | } | 2062 | } |
2062 | 2063 | ||
2063 | static inline void invalidate_remote_inode(struct inode *inode) | 2064 | static inline void invalidate_remote_inode(struct inode *inode) |
2064 | { | 2065 | { |
2065 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 2066 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
2066 | S_ISLNK(inode->i_mode)) | 2067 | S_ISLNK(inode->i_mode)) |
2067 | invalidate_mapping_pages(inode->i_mapping, 0, -1); | 2068 | invalidate_mapping_pages(inode->i_mapping, 0, -1); |
2068 | } | 2069 | } |
2069 | extern int invalidate_inode_pages2(struct address_space *mapping); | 2070 | extern int invalidate_inode_pages2(struct address_space *mapping); |
2070 | extern int invalidate_inode_pages2_range(struct address_space *mapping, | 2071 | extern int invalidate_inode_pages2_range(struct address_space *mapping, |
2071 | pgoff_t start, pgoff_t end); | 2072 | pgoff_t start, pgoff_t end); |
2072 | extern void generic_sync_sb_inodes(struct super_block *sb, | 2073 | extern void generic_sync_sb_inodes(struct super_block *sb, |
2073 | struct writeback_control *wbc); | 2074 | struct writeback_control *wbc); |
2074 | extern int write_inode_now(struct inode *, int); | 2075 | extern int write_inode_now(struct inode *, int); |
2075 | extern int filemap_fdatawrite(struct address_space *); | 2076 | extern int filemap_fdatawrite(struct address_space *); |
2076 | extern int filemap_flush(struct address_space *); | 2077 | extern int filemap_flush(struct address_space *); |
2077 | extern int filemap_fdatawait(struct address_space *); | 2078 | extern int filemap_fdatawait(struct address_space *); |
2078 | extern int filemap_write_and_wait(struct address_space *mapping); | 2079 | extern int filemap_write_and_wait(struct address_space *mapping); |
2079 | extern int filemap_write_and_wait_range(struct address_space *mapping, | 2080 | extern int filemap_write_and_wait_range(struct address_space *mapping, |
2080 | loff_t lstart, loff_t lend); | 2081 | loff_t lstart, loff_t lend); |
2081 | extern int wait_on_page_writeback_range(struct address_space *mapping, | 2082 | extern int wait_on_page_writeback_range(struct address_space *mapping, |
2082 | pgoff_t start, pgoff_t end); | 2083 | pgoff_t start, pgoff_t end); |
2083 | extern int __filemap_fdatawrite_range(struct address_space *mapping, | 2084 | extern int __filemap_fdatawrite_range(struct address_space *mapping, |
2084 | loff_t start, loff_t end, int sync_mode); | 2085 | loff_t start, loff_t end, int sync_mode); |
2085 | extern int filemap_fdatawrite_range(struct address_space *mapping, | 2086 | extern int filemap_fdatawrite_range(struct address_space *mapping, |
2086 | loff_t start, loff_t end); | 2087 | loff_t start, loff_t end); |
2087 | 2088 | ||
2088 | extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); | 2089 | extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); |
2089 | extern void sync_supers(void); | 2090 | extern void sync_supers(void); |
2090 | extern void emergency_sync(void); | 2091 | extern void emergency_sync(void); |
2091 | extern void emergency_remount(void); | 2092 | extern void emergency_remount(void); |
2092 | #ifdef CONFIG_BLOCK | 2093 | #ifdef CONFIG_BLOCK |
2093 | extern sector_t bmap(struct inode *, sector_t); | 2094 | extern sector_t bmap(struct inode *, sector_t); |
2094 | #endif | 2095 | #endif |
2095 | extern int notify_change(struct dentry *, struct iattr *); | 2096 | extern int notify_change(struct dentry *, struct iattr *); |
2096 | extern int inode_permission(struct inode *, int); | 2097 | extern int inode_permission(struct inode *, int); |
2097 | extern int generic_permission(struct inode *, int, | 2098 | extern int generic_permission(struct inode *, int, |
2098 | int (*check_acl)(struct inode *, int)); | 2099 | int (*check_acl)(struct inode *, int)); |
2099 | 2100 | ||
2100 | static inline bool execute_ok(struct inode *inode) | 2101 | static inline bool execute_ok(struct inode *inode) |
2101 | { | 2102 | { |
2102 | return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); | 2103 | return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); |
2103 | } | 2104 | } |
2104 | 2105 | ||
2105 | extern int get_write_access(struct inode *); | 2106 | extern int get_write_access(struct inode *); |
2106 | extern int deny_write_access(struct file *); | 2107 | extern int deny_write_access(struct file *); |
2107 | static inline void put_write_access(struct inode * inode) | 2108 | static inline void put_write_access(struct inode * inode) |
2108 | { | 2109 | { |
2109 | atomic_dec(&inode->i_writecount); | 2110 | atomic_dec(&inode->i_writecount); |
2110 | } | 2111 | } |
2111 | static inline void allow_write_access(struct file *file) | 2112 | static inline void allow_write_access(struct file *file) |
2112 | { | 2113 | { |
2113 | if (file) | 2114 | if (file) |
2114 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); | 2115 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); |
2115 | } | 2116 | } |
2116 | extern int do_pipe_flags(int *, int); | 2117 | extern int do_pipe_flags(int *, int); |
2117 | extern struct file *create_read_pipe(struct file *f, int flags); | 2118 | extern struct file *create_read_pipe(struct file *f, int flags); |
2118 | extern struct file *create_write_pipe(int flags); | 2119 | extern struct file *create_write_pipe(int flags); |
2119 | extern void free_write_pipe(struct file *); | 2120 | extern void free_write_pipe(struct file *); |
2120 | 2121 | ||
2121 | extern struct file *do_filp_open(int dfd, const char *pathname, | 2122 | extern struct file *do_filp_open(int dfd, const char *pathname, |
2122 | int open_flag, int mode, int acc_mode); | 2123 | int open_flag, int mode, int acc_mode); |
2123 | extern int may_open(struct path *, int, int); | 2124 | extern int may_open(struct path *, int, int); |
2124 | 2125 | ||
2125 | extern int kernel_read(struct file *, unsigned long, char *, unsigned long); | 2126 | extern int kernel_read(struct file *, unsigned long, char *, unsigned long); |
2126 | extern struct file * open_exec(const char *); | 2127 | extern struct file * open_exec(const char *); |
2127 | 2128 | ||
2128 | /* fs/dcache.c -- generic fs support functions */ | 2129 | /* fs/dcache.c -- generic fs support functions */ |
2129 | extern int is_subdir(struct dentry *, struct dentry *); | 2130 | extern int is_subdir(struct dentry *, struct dentry *); |
2130 | extern ino_t find_inode_number(struct dentry *, struct qstr *); | 2131 | extern ino_t find_inode_number(struct dentry *, struct qstr *); |
2131 | 2132 | ||
2132 | #include <linux/err.h> | 2133 | #include <linux/err.h> |
2133 | 2134 | ||
2134 | /* needed for stackable file system support */ | 2135 | /* needed for stackable file system support */ |
2135 | extern loff_t default_llseek(struct file *file, loff_t offset, int origin); | 2136 | extern loff_t default_llseek(struct file *file, loff_t offset, int origin); |
2136 | 2137 | ||
2137 | extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); | 2138 | extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); |
2138 | 2139 | ||
2139 | extern struct inode * inode_init_always(struct super_block *, struct inode *); | 2140 | extern struct inode * inode_init_always(struct super_block *, struct inode *); |
2140 | extern void inode_init_once(struct inode *); | 2141 | extern void inode_init_once(struct inode *); |
2141 | extern void inode_add_to_lists(struct super_block *, struct inode *); | 2142 | extern void inode_add_to_lists(struct super_block *, struct inode *); |
2142 | extern void iput(struct inode *); | 2143 | extern void iput(struct inode *); |
2143 | extern struct inode * igrab(struct inode *); | 2144 | extern struct inode * igrab(struct inode *); |
2144 | extern ino_t iunique(struct super_block *, ino_t); | 2145 | extern ino_t iunique(struct super_block *, ino_t); |
2145 | extern int inode_needs_sync(struct inode *inode); | 2146 | extern int inode_needs_sync(struct inode *inode); |
2146 | extern void generic_delete_inode(struct inode *inode); | 2147 | extern void generic_delete_inode(struct inode *inode); |
2147 | extern void generic_drop_inode(struct inode *inode); | 2148 | extern void generic_drop_inode(struct inode *inode); |
2148 | 2149 | ||
2149 | extern struct inode *ilookup5_nowait(struct super_block *sb, | 2150 | extern struct inode *ilookup5_nowait(struct super_block *sb, |
2150 | unsigned long hashval, int (*test)(struct inode *, void *), | 2151 | unsigned long hashval, int (*test)(struct inode *, void *), |
2151 | void *data); | 2152 | void *data); |
2152 | extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, | 2153 | extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, |
2153 | int (*test)(struct inode *, void *), void *data); | 2154 | int (*test)(struct inode *, void *), void *data); |
2154 | extern struct inode *ilookup(struct super_block *sb, unsigned long ino); | 2155 | extern struct inode *ilookup(struct super_block *sb, unsigned long ino); |
2155 | 2156 | ||
2156 | extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); | 2157 | extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); |
2157 | extern struct inode * iget_locked(struct super_block *, unsigned long); | 2158 | extern struct inode * iget_locked(struct super_block *, unsigned long); |
2158 | extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); | 2159 | extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); |
2159 | extern int insert_inode_locked(struct inode *); | 2160 | extern int insert_inode_locked(struct inode *); |
2160 | extern void unlock_new_inode(struct inode *); | 2161 | extern void unlock_new_inode(struct inode *); |
2161 | 2162 | ||
2162 | extern void __iget(struct inode * inode); | 2163 | extern void __iget(struct inode * inode); |
2163 | extern void iget_failed(struct inode *); | 2164 | extern void iget_failed(struct inode *); |
2164 | extern void clear_inode(struct inode *); | 2165 | extern void clear_inode(struct inode *); |
2165 | extern void destroy_inode(struct inode *); | 2166 | extern void destroy_inode(struct inode *); |
2166 | extern struct inode *new_inode(struct super_block *); | 2167 | extern struct inode *new_inode(struct super_block *); |
2167 | extern int should_remove_suid(struct dentry *); | 2168 | extern int should_remove_suid(struct dentry *); |
2168 | extern int file_remove_suid(struct file *); | 2169 | extern int file_remove_suid(struct file *); |
2169 | 2170 | ||
2170 | extern void __insert_inode_hash(struct inode *, unsigned long hashval); | 2171 | extern void __insert_inode_hash(struct inode *, unsigned long hashval); |
2171 | extern void remove_inode_hash(struct inode *); | 2172 | extern void remove_inode_hash(struct inode *); |
2172 | static inline void insert_inode_hash(struct inode *inode) { | 2173 | static inline void insert_inode_hash(struct inode *inode) { |
2173 | __insert_inode_hash(inode, inode->i_ino); | 2174 | __insert_inode_hash(inode, inode->i_ino); |
2174 | } | 2175 | } |
2175 | 2176 | ||
2176 | extern struct file * get_empty_filp(void); | 2177 | extern struct file * get_empty_filp(void); |
2177 | extern void file_move(struct file *f, struct list_head *list); | 2178 | extern void file_move(struct file *f, struct list_head *list); |
2178 | extern void file_kill(struct file *f); | 2179 | extern void file_kill(struct file *f); |
2179 | #ifdef CONFIG_BLOCK | 2180 | #ifdef CONFIG_BLOCK |
2180 | struct bio; | 2181 | struct bio; |
2181 | extern void submit_bio(int, struct bio *); | 2182 | extern void submit_bio(int, struct bio *); |
2182 | extern int bdev_read_only(struct block_device *); | 2183 | extern int bdev_read_only(struct block_device *); |
2183 | #endif | 2184 | #endif |
2184 | extern int set_blocksize(struct block_device *, int); | 2185 | extern int set_blocksize(struct block_device *, int); |
2185 | extern int sb_set_blocksize(struct super_block *, int); | 2186 | extern int sb_set_blocksize(struct super_block *, int); |
2186 | extern int sb_min_blocksize(struct super_block *, int); | 2187 | extern int sb_min_blocksize(struct super_block *, int); |
2187 | extern int sb_has_dirty_inodes(struct super_block *); | 2188 | extern int sb_has_dirty_inodes(struct super_block *); |
2188 | 2189 | ||
2189 | extern int generic_file_mmap(struct file *, struct vm_area_struct *); | 2190 | extern int generic_file_mmap(struct file *, struct vm_area_struct *); |
2190 | extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); | 2191 | extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); |
2191 | extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); | 2192 | extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); |
2192 | int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); | 2193 | int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); |
2193 | extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); | 2194 | extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); |
2194 | extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); | 2195 | extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); |
2195 | extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, | 2196 | extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, |
2196 | unsigned long, loff_t); | 2197 | unsigned long, loff_t); |
2197 | extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, | 2198 | extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, |
2198 | unsigned long *, loff_t, loff_t *, size_t, size_t); | 2199 | unsigned long *, loff_t, loff_t *, size_t, size_t); |
2199 | extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, | 2200 | extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, |
2200 | unsigned long, loff_t, loff_t *, size_t, ssize_t); | 2201 | unsigned long, loff_t, loff_t *, size_t, ssize_t); |
2201 | extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); | 2202 | extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); |
2202 | extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); | 2203 | extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); |
2203 | extern int generic_segment_checks(const struct iovec *iov, | 2204 | extern int generic_segment_checks(const struct iovec *iov, |
2204 | unsigned long *nr_segs, size_t *count, int access_flags); | 2205 | unsigned long *nr_segs, size_t *count, int access_flags); |
2205 | 2206 | ||
2206 | /* fs/splice.c */ | 2207 | /* fs/splice.c */ |
2207 | extern ssize_t generic_file_splice_read(struct file *, loff_t *, | 2208 | extern ssize_t generic_file_splice_read(struct file *, loff_t *, |
2208 | struct pipe_inode_info *, size_t, unsigned int); | 2209 | struct pipe_inode_info *, size_t, unsigned int); |
2209 | extern ssize_t default_file_splice_read(struct file *, loff_t *, | 2210 | extern ssize_t default_file_splice_read(struct file *, loff_t *, |
2210 | struct pipe_inode_info *, size_t, unsigned int); | 2211 | struct pipe_inode_info *, size_t, unsigned int); |
2211 | extern ssize_t generic_file_splice_write(struct pipe_inode_info *, | 2212 | extern ssize_t generic_file_splice_write(struct pipe_inode_info *, |
2212 | struct file *, loff_t *, size_t, unsigned int); | 2213 | struct file *, loff_t *, size_t, unsigned int); |
2213 | extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, | 2214 | extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, |
2214 | struct file *out, loff_t *, size_t len, unsigned int flags); | 2215 | struct file *out, loff_t *, size_t len, unsigned int flags); |
2215 | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | 2216 | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, |
2216 | size_t len, unsigned int flags); | 2217 | size_t len, unsigned int flags); |
2217 | 2218 | ||
2218 | extern void | 2219 | extern void |
2219 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); | 2220 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); |
2220 | extern loff_t no_llseek(struct file *file, loff_t offset, int origin); | 2221 | extern loff_t no_llseek(struct file *file, loff_t offset, int origin); |
2221 | extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); | 2222 | extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); |
2222 | extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, | 2223 | extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, |
2223 | int origin); | 2224 | int origin); |
2224 | extern int generic_file_open(struct inode * inode, struct file * filp); | 2225 | extern int generic_file_open(struct inode * inode, struct file * filp); |
2225 | extern int nonseekable_open(struct inode * inode, struct file * filp); | 2226 | extern int nonseekable_open(struct inode * inode, struct file * filp); |
2226 | 2227 | ||
2227 | #ifdef CONFIG_FS_XIP | 2228 | #ifdef CONFIG_FS_XIP |
2228 | extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, | 2229 | extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, |
2229 | loff_t *ppos); | 2230 | loff_t *ppos); |
2230 | extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); | 2231 | extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); |
2231 | extern ssize_t xip_file_write(struct file *filp, const char __user *buf, | 2232 | extern ssize_t xip_file_write(struct file *filp, const char __user *buf, |
2232 | size_t len, loff_t *ppos); | 2233 | size_t len, loff_t *ppos); |
2233 | extern int xip_truncate_page(struct address_space *mapping, loff_t from); | 2234 | extern int xip_truncate_page(struct address_space *mapping, loff_t from); |
2234 | #else | 2235 | #else |
2235 | static inline int xip_truncate_page(struct address_space *mapping, loff_t from) | 2236 | static inline int xip_truncate_page(struct address_space *mapping, loff_t from) |
2236 | { | 2237 | { |
2237 | return 0; | 2238 | return 0; |
2238 | } | 2239 | } |
2239 | #endif | 2240 | #endif |
2240 | 2241 | ||
2241 | #ifdef CONFIG_BLOCK | 2242 | #ifdef CONFIG_BLOCK |
2242 | ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 2243 | ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
2243 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 2244 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
2244 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 2245 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
2245 | int lock_type); | 2246 | int lock_type); |
2246 | 2247 | ||
2247 | enum { | 2248 | enum { |
2248 | DIO_LOCKING = 1, /* need locking between buffered and direct access */ | 2249 | DIO_LOCKING = 1, /* need locking between buffered and direct access */ |
2249 | DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ | 2250 | DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ |
2250 | DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ | 2251 | DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ |
2251 | }; | 2252 | }; |
2252 | 2253 | ||
2253 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, | 2254 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, |
2254 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 2255 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
2255 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 2256 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
2256 | dio_iodone_t end_io) | 2257 | dio_iodone_t end_io) |
2257 | { | 2258 | { |
2258 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2259 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2259 | nr_segs, get_block, end_io, DIO_LOCKING); | 2260 | nr_segs, get_block, end_io, DIO_LOCKING); |
2260 | } | 2261 | } |
2261 | 2262 | ||
2262 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, | 2263 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, |
2263 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 2264 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
2264 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 2265 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
2265 | dio_iodone_t end_io) | 2266 | dio_iodone_t end_io) |
2266 | { | 2267 | { |
2267 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2268 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2268 | nr_segs, get_block, end_io, DIO_NO_LOCKING); | 2269 | nr_segs, get_block, end_io, DIO_NO_LOCKING); |
2269 | } | 2270 | } |
2270 | 2271 | ||
2271 | static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, | 2272 | static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, |
2272 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 2273 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
2273 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 2274 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
2274 | dio_iodone_t end_io) | 2275 | dio_iodone_t end_io) |
2275 | { | 2276 | { |
2276 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2277 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2277 | nr_segs, get_block, end_io, DIO_OWN_LOCKING); | 2278 | nr_segs, get_block, end_io, DIO_OWN_LOCKING); |
2278 | } | 2279 | } |
2279 | #endif | 2280 | #endif |
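The three inline wrappers above only differ in which DIO_* locking mode they pass to __blockdev_direct_IO(). As a minimal sketch of how a filesystem of this era would use the default-locking variant from its ->direct_IO address_space operation (demo_direct_IO and demo_get_block are hypothetical names, not from this diff):

    static ssize_t demo_direct_IO(int rw, struct kiocb *iocb,
                    const struct iovec *iov, loff_t offset,
                    unsigned long nr_segs)
    {
            struct inode *inode = iocb->ki_filp->f_mapping->host;

            /* DIO_LOCKING: the DIO core serializes against buffered I/O */
            return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
                                      iov, offset, nr_segs, demo_get_block,
                                      NULL /* no end_io completion callback */);
    }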
2280 | 2281 | ||
2281 | extern const struct file_operations generic_ro_fops; | 2282 | extern const struct file_operations generic_ro_fops; |
2282 | 2283 | ||
2283 | #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) | 2284 | #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) |
2284 | 2285 | ||
2285 | extern int vfs_readlink(struct dentry *, char __user *, int, const char *); | 2286 | extern int vfs_readlink(struct dentry *, char __user *, int, const char *); |
2286 | extern int vfs_follow_link(struct nameidata *, const char *); | 2287 | extern int vfs_follow_link(struct nameidata *, const char *); |
2287 | extern int page_readlink(struct dentry *, char __user *, int); | 2288 | extern int page_readlink(struct dentry *, char __user *, int); |
2288 | extern void *page_follow_link_light(struct dentry *, struct nameidata *); | 2289 | extern void *page_follow_link_light(struct dentry *, struct nameidata *); |
2289 | extern void page_put_link(struct dentry *, struct nameidata *, void *); | 2290 | extern void page_put_link(struct dentry *, struct nameidata *, void *); |
2290 | extern int __page_symlink(struct inode *inode, const char *symname, int len, | 2291 | extern int __page_symlink(struct inode *inode, const char *symname, int len, |
2291 | int nofs); | 2292 | int nofs); |
2292 | extern int page_symlink(struct inode *inode, const char *symname, int len); | 2293 | extern int page_symlink(struct inode *inode, const char *symname, int len); |
2293 | extern const struct inode_operations page_symlink_inode_operations; | 2294 | extern const struct inode_operations page_symlink_inode_operations; |
2294 | extern int generic_readlink(struct dentry *, char __user *, int); | 2295 | extern int generic_readlink(struct dentry *, char __user *, int); |
2295 | extern void generic_fillattr(struct inode *, struct kstat *); | 2296 | extern void generic_fillattr(struct inode *, struct kstat *); |
2296 | extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 2297 | extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
2297 | void inode_add_bytes(struct inode *inode, loff_t bytes); | 2298 | void inode_add_bytes(struct inode *inode, loff_t bytes); |
2298 | void inode_sub_bytes(struct inode *inode, loff_t bytes); | 2299 | void inode_sub_bytes(struct inode *inode, loff_t bytes); |
2299 | loff_t inode_get_bytes(struct inode *inode); | 2300 | loff_t inode_get_bytes(struct inode *inode); |
2300 | void inode_set_bytes(struct inode *inode, loff_t bytes); | 2301 | void inode_set_bytes(struct inode *inode, loff_t bytes); |
2301 | 2302 | ||
2302 | extern int vfs_readdir(struct file *, filldir_t, void *); | 2303 | extern int vfs_readdir(struct file *, filldir_t, void *); |
2303 | 2304 | ||
2304 | extern int vfs_stat(char __user *, struct kstat *); | 2305 | extern int vfs_stat(char __user *, struct kstat *); |
2305 | extern int vfs_lstat(char __user *, struct kstat *); | 2306 | extern int vfs_lstat(char __user *, struct kstat *); |
2306 | extern int vfs_fstat(unsigned int, struct kstat *); | 2307 | extern int vfs_fstat(unsigned int, struct kstat *); |
2307 | extern int vfs_fstatat(int, char __user *, struct kstat *, int); | 2308 | extern int vfs_fstatat(int, char __user *, struct kstat *, int); |
2308 | 2309 | ||
2309 | extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, | 2310 | extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, |
2310 | unsigned long arg); | 2311 | unsigned long arg); |
2311 | extern int __generic_block_fiemap(struct inode *inode, | 2312 | extern int __generic_block_fiemap(struct inode *inode, |
2312 | struct fiemap_extent_info *fieinfo, u64 start, | 2313 | struct fiemap_extent_info *fieinfo, u64 start, |
2313 | u64 len, get_block_t *get_block); | 2314 | u64 len, get_block_t *get_block); |
2314 | extern int generic_block_fiemap(struct inode *inode, | 2315 | extern int generic_block_fiemap(struct inode *inode, |
2315 | struct fiemap_extent_info *fieinfo, u64 start, | 2316 | struct fiemap_extent_info *fieinfo, u64 start, |
2316 | u64 len, get_block_t *get_block); | 2317 | u64 len, get_block_t *get_block); |
2317 | 2318 | ||
2318 | extern void get_filesystem(struct file_system_type *fs); | 2319 | extern void get_filesystem(struct file_system_type *fs); |
2319 | extern void put_filesystem(struct file_system_type *fs); | 2320 | extern void put_filesystem(struct file_system_type *fs); |
2320 | extern struct file_system_type *get_fs_type(const char *name); | 2321 | extern struct file_system_type *get_fs_type(const char *name); |
2321 | extern struct super_block *get_super(struct block_device *); | 2322 | extern struct super_block *get_super(struct block_device *); |
2322 | extern struct super_block *user_get_super(dev_t); | 2323 | extern struct super_block *user_get_super(dev_t); |
2323 | extern void drop_super(struct super_block *sb); | 2324 | extern void drop_super(struct super_block *sb); |
2324 | 2325 | ||
2325 | extern int dcache_dir_open(struct inode *, struct file *); | 2326 | extern int dcache_dir_open(struct inode *, struct file *); |
2326 | extern int dcache_dir_close(struct inode *, struct file *); | 2327 | extern int dcache_dir_close(struct inode *, struct file *); |
2327 | extern loff_t dcache_dir_lseek(struct file *, loff_t, int); | 2328 | extern loff_t dcache_dir_lseek(struct file *, loff_t, int); |
2328 | extern int dcache_readdir(struct file *, void *, filldir_t); | 2329 | extern int dcache_readdir(struct file *, void *, filldir_t); |
2329 | extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 2330 | extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
2330 | extern int simple_statfs(struct dentry *, struct kstatfs *); | 2331 | extern int simple_statfs(struct dentry *, struct kstatfs *); |
2331 | extern int simple_link(struct dentry *, struct inode *, struct dentry *); | 2332 | extern int simple_link(struct dentry *, struct inode *, struct dentry *); |
2332 | extern int simple_unlink(struct inode *, struct dentry *); | 2333 | extern int simple_unlink(struct inode *, struct dentry *); |
2333 | extern int simple_rmdir(struct inode *, struct dentry *); | 2334 | extern int simple_rmdir(struct inode *, struct dentry *); |
2334 | extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); | 2335 | extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); |
2335 | extern int simple_sync_file(struct file *, struct dentry *, int); | 2336 | extern int simple_sync_file(struct file *, struct dentry *, int); |
2336 | extern int simple_empty(struct dentry *); | 2337 | extern int simple_empty(struct dentry *); |
2337 | extern int simple_readpage(struct file *file, struct page *page); | 2338 | extern int simple_readpage(struct file *file, struct page *page); |
2338 | extern int simple_prepare_write(struct file *file, struct page *page, | 2339 | extern int simple_prepare_write(struct file *file, struct page *page, |
2339 | unsigned offset, unsigned to); | 2340 | unsigned offset, unsigned to); |
2340 | extern int simple_write_begin(struct file *file, struct address_space *mapping, | 2341 | extern int simple_write_begin(struct file *file, struct address_space *mapping, |
2341 | loff_t pos, unsigned len, unsigned flags, | 2342 | loff_t pos, unsigned len, unsigned flags, |
2342 | struct page **pagep, void **fsdata); | 2343 | struct page **pagep, void **fsdata); |
2343 | extern int simple_write_end(struct file *file, struct address_space *mapping, | 2344 | extern int simple_write_end(struct file *file, struct address_space *mapping, |
2344 | loff_t pos, unsigned len, unsigned copied, | 2345 | loff_t pos, unsigned len, unsigned copied, |
2345 | struct page *page, void *fsdata); | 2346 | struct page *page, void *fsdata); |
2346 | 2347 | ||
2347 | extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); | 2348 | extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); |
2348 | extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); | 2349 | extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); |
2349 | extern const struct file_operations simple_dir_operations; | 2350 | extern const struct file_operations simple_dir_operations; |
2350 | extern const struct inode_operations simple_dir_inode_operations; | 2351 | extern const struct inode_operations simple_dir_inode_operations; |
2351 | struct tree_descr { char *name; const struct file_operations *ops; int mode; }; | 2352 | struct tree_descr { char *name; const struct file_operations *ops; int mode; }; |
2352 | struct dentry *d_alloc_name(struct dentry *, const char *); | 2353 | struct dentry *d_alloc_name(struct dentry *, const char *); |
2353 | extern int simple_fill_super(struct super_block *, int, struct tree_descr *); | 2354 | extern int simple_fill_super(struct super_block *, int, struct tree_descr *); |
2354 | extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); | 2355 | extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); |
2355 | extern void simple_release_fs(struct vfsmount **mount, int *count); | 2356 | extern void simple_release_fs(struct vfsmount **mount, int *count); |
2356 | 2357 | ||
2357 | extern ssize_t simple_read_from_buffer(void __user *to, size_t count, | 2358 | extern ssize_t simple_read_from_buffer(void __user *to, size_t count, |
2358 | loff_t *ppos, const void *from, size_t available); | 2359 | loff_t *ppos, const void *from, size_t available); |
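A minimal sketch of the usual calling pattern for simple_read_from_buffer(): back a read handler with a fixed kernel buffer and let the helper handle the offset and copy_to_user bookkeeping (demo_read and msg are hypothetical):

    static ssize_t demo_read(struct file *file, char __user *buf,
                             size_t count, loff_t *ppos)
    {
            static const char msg[] = "hello\n";

            /* copies at most count bytes starting at *ppos, advances *ppos */
            return simple_read_from_buffer(buf, count, ppos,
                                           msg, sizeof(msg) - 1);
    }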
2359 | 2360 | ||
2360 | extern int simple_fsync(struct file *, struct dentry *, int); | 2361 | extern int simple_fsync(struct file *, struct dentry *, int); |
2361 | 2362 | ||
2362 | #ifdef CONFIG_MIGRATION | 2363 | #ifdef CONFIG_MIGRATION |
2363 | extern int buffer_migrate_page(struct address_space *, | 2364 | extern int buffer_migrate_page(struct address_space *, |
2364 | struct page *, struct page *); | 2365 | struct page *, struct page *); |
2365 | #else | 2366 | #else |
2366 | #define buffer_migrate_page NULL | 2367 | #define buffer_migrate_page NULL |
2367 | #endif | 2368 | #endif |
2368 | 2369 | ||
2369 | extern int inode_change_ok(struct inode *, struct iattr *); | 2370 | extern int inode_change_ok(struct inode *, struct iattr *); |
2370 | extern int __must_check inode_setattr(struct inode *, struct iattr *); | 2371 | extern int __must_check inode_setattr(struct inode *, struct iattr *); |
2371 | 2372 | ||
2372 | extern void file_update_time(struct file *file); | 2373 | extern void file_update_time(struct file *file); |
2373 | 2374 | ||
2374 | extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt); | 2375 | extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt); |
2375 | extern void save_mount_options(struct super_block *sb, char *options); | 2376 | extern void save_mount_options(struct super_block *sb, char *options); |
2376 | extern void replace_mount_options(struct super_block *sb, char *options); | 2377 | extern void replace_mount_options(struct super_block *sb, char *options); |
2377 | 2378 | ||
2378 | static inline ino_t parent_ino(struct dentry *dentry) | 2379 | static inline ino_t parent_ino(struct dentry *dentry) |
2379 | { | 2380 | { |
2380 | ino_t res; | 2381 | ino_t res; |
2381 | 2382 | ||
2382 | spin_lock(&dentry->d_lock); | 2383 | spin_lock(&dentry->d_lock); |
2383 | res = dentry->d_parent->d_inode->i_ino; | 2384 | res = dentry->d_parent->d_inode->i_ino; |
2384 | spin_unlock(&dentry->d_lock); | 2385 | spin_unlock(&dentry->d_lock); |
2385 | return res; | 2386 | return res; |
2386 | } | 2387 | } |
2387 | 2388 | ||
2388 | /* Transaction based IO helpers */ | 2389 | /* Transaction based IO helpers */ |
2389 | 2390 | ||
2390 | /* | 2391 | /* |
2391 | * An argresp is stored in an allocated page and holds the | 2392 | * An argresp is stored in an allocated page and holds the |
2392 | * size of the argument or response, along with its content | 2393 | * size of the argument or response, along with its content |
2393 | */ | 2394 | */ |
2394 | struct simple_transaction_argresp { | 2395 | struct simple_transaction_argresp { |
2395 | ssize_t size; | 2396 | ssize_t size; |
2396 | char data[0]; | 2397 | char data[0]; |
2397 | }; | 2398 | }; |
2398 | 2399 | ||
2399 | #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) | 2400 | #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) |
2400 | 2401 | ||
2401 | char *simple_transaction_get(struct file *file, const char __user *buf, | 2402 | char *simple_transaction_get(struct file *file, const char __user *buf, |
2402 | size_t size); | 2403 | size_t size); |
2403 | ssize_t simple_transaction_read(struct file *file, char __user *buf, | 2404 | ssize_t simple_transaction_read(struct file *file, char __user *buf, |
2404 | size_t size, loff_t *pos); | 2405 | size_t size, loff_t *pos); |
2405 | int simple_transaction_release(struct inode *inode, struct file *file); | 2406 | int simple_transaction_release(struct inode *inode, struct file *file); |
2406 | 2407 | ||
2407 | void simple_transaction_set(struct file *file, size_t n); | 2408 | void simple_transaction_set(struct file *file, size_t n); |
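A hedged sketch of the intended transaction pattern (demo_write and the request handling are hypothetical): a write handler grabs the per-open transaction page, overwrites it with a reply, and records the reply length for a later simple_transaction_read():

    static ssize_t demo_write(struct file *file, const char __user *buf,
                              size_t size, loff_t *pos)
    {
            /* copies the user data into the per-open transaction page */
            char *data = simple_transaction_get(file, buf, size);

            if (IS_ERR(data))
                    return PTR_ERR(data);

            /* ... parse the request in data, overwrite it with a reply ... */
            simple_transaction_set(file, size);     /* length of the reply */
            return size;
    }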
2408 | 2409 | ||
2409 | /* | 2410 | /* |
2410 | * simple attribute files | 2411 | * simple attribute files |
2411 | * | 2412 | * |
2412 | * These attributes behave similarly to those in sysfs: | 2413 | * These attributes behave similarly to those in sysfs: |
2413 | * | 2414 | * |
2414 | * Writing to an attribute immediately sets a value; an open file can be | 2415 | * Writing to an attribute immediately sets a value; an open file can be |
2415 | * written to multiple times. | 2416 | * written to multiple times. |
2416 | * | 2417 | * |
2417 | * Reading from an attribute creates a buffer from the value that might get | 2418 | * Reading from an attribute creates a buffer from the value that might get |
2418 | * read with multiple read calls. When the attribute has been read | 2419 | * read with multiple read calls. When the attribute has been read |
2419 | * completely, no further read calls are possible until the file is opened | 2420 | * completely, no further read calls are possible until the file is opened |
2420 | * again. | 2421 | * again. |
2421 | * | 2422 | * |
2422 | * All attributes contain a text representation of a numeric value | 2423 | * All attributes contain a text representation of a numeric value |
2423 | * that is accessed with the get() and set() functions. | 2424 | * that is accessed with the get() and set() functions. |
2424 | */ | 2425 | */ |
2425 | #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ | 2426 | #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ |
2426 | static int __fops ## _open(struct inode *inode, struct file *file) \ | 2427 | static int __fops ## _open(struct inode *inode, struct file *file) \ |
2427 | { \ | 2428 | { \ |
2428 | __simple_attr_check_format(__fmt, 0ull); \ | 2429 | __simple_attr_check_format(__fmt, 0ull); \ |
2429 | return simple_attr_open(inode, file, __get, __set, __fmt); \ | 2430 | return simple_attr_open(inode, file, __get, __set, __fmt); \ |
2430 | } \ | 2431 | } \ |
2431 | static struct file_operations __fops = { \ | 2432 | static struct file_operations __fops = { \ |
2432 | .owner = THIS_MODULE, \ | 2433 | .owner = THIS_MODULE, \ |
2433 | .open = __fops ## _open, \ | 2434 | .open = __fops ## _open, \ |
2434 | .release = simple_attr_release, \ | 2435 | .release = simple_attr_release, \ |
2435 | .read = simple_attr_read, \ | 2436 | .read = simple_attr_read, \ |
2436 | .write = simple_attr_write, \ | 2437 | .write = simple_attr_write, \ |
2437 | }; | 2438 | }; |
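A minimal usage sketch of the macro above (demo_value, demo_get, demo_set, and demo_fops are hypothetical names): expose one u64 in the "%llu\n" text format the format-check expects:

    static u64 demo_value;

    static int demo_get(void *data, u64 *val)
    {
            *val = demo_value;
            return 0;
    }

    static int demo_set(void *data, u64 val)
    {
            demo_value = val;
            return 0;
    }

    /* generates demo_fops_open() and the demo_fops file_operations */
    DEFINE_SIMPLE_ATTRIBUTE(demo_fops, demo_get, demo_set, "%llu\n");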
2438 | 2439 | ||
2439 | static inline void __attribute__((format(printf, 1, 2))) | 2440 | static inline void __attribute__((format(printf, 1, 2))) |
2440 | __simple_attr_check_format(const char *fmt, ...) | 2441 | __simple_attr_check_format(const char *fmt, ...) |
2441 | { | 2442 | { |
2442 | /* don't do anything, just let the compiler check the arguments */ | 2443 | /* don't do anything, just let the compiler check the arguments */ |
2443 | } | 2444 | } |
2444 | 2445 | ||
2445 | int simple_attr_open(struct inode *inode, struct file *file, | 2446 | int simple_attr_open(struct inode *inode, struct file *file, |
2446 | int (*get)(void *, u64 *), int (*set)(void *, u64), | 2447 | int (*get)(void *, u64 *), int (*set)(void *, u64), |
2447 | const char *fmt); | 2448 | const char *fmt); |
2448 | int simple_attr_release(struct inode *inode, struct file *file); | 2449 | int simple_attr_release(struct inode *inode, struct file *file); |
2449 | ssize_t simple_attr_read(struct file *file, char __user *buf, | 2450 | ssize_t simple_attr_read(struct file *file, char __user *buf, |
2450 | size_t len, loff_t *ppos); | 2451 | size_t len, loff_t *ppos); |
2451 | ssize_t simple_attr_write(struct file *file, const char __user *buf, | 2452 | ssize_t simple_attr_write(struct file *file, const char __user *buf, |
2452 | size_t len, loff_t *ppos); | 2453 | size_t len, loff_t *ppos); |
2453 | 2454 | ||
2454 | struct ctl_table; | 2455 | struct ctl_table; |
2455 | int proc_nr_files(struct ctl_table *table, int write, struct file *filp, | 2456 | int proc_nr_files(struct ctl_table *table, int write, struct file *filp, |
2456 | void __user *buffer, size_t *lenp, loff_t *ppos); | 2457 | void __user *buffer, size_t *lenp, loff_t *ppos); |
2457 | 2458 | ||
2458 | int __init get_filesystem_list(char *buf); | 2459 | int __init get_filesystem_list(char *buf); |
2459 | 2460 | ||
2460 | #endif /* __KERNEL__ */ | 2461 | #endif /* __KERNEL__ */ |
2461 | #endif /* _LINUX_FS_H */ | 2462 | #endif /* _LINUX_FS_H */ |
2462 | 2463 |
mm/swapfile.c
1 | /* | 1 | /* |
2 | * linux/mm/swapfile.c | 2 | * linux/mm/swapfile.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
5 | * Swap reorganised 29.12.95, Stephen Tweedie | 5 | * Swap reorganised 29.12.95, Stephen Tweedie |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/hugetlb.h> | 9 | #include <linux/hugetlb.h> |
10 | #include <linux/mman.h> | 10 | #include <linux/mman.h> |
11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
12 | #include <linux/kernel_stat.h> | 12 | #include <linux/kernel_stat.h> |
13 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
14 | #include <linux/vmalloc.h> | 14 | #include <linux/vmalloc.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/namei.h> | 16 | #include <linux/namei.h> |
17 | #include <linux/shm.h> | 17 | #include <linux/shm.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/random.h> | 19 | #include <linux/random.h> |
20 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
21 | #include <linux/proc_fs.h> | 21 | #include <linux/proc_fs.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
26 | #include <linux/security.h> | 26 | #include <linux/security.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/mutex.h> | 28 | #include <linux/mutex.h> |
29 | #include <linux/capability.h> | 29 | #include <linux/capability.h> |
30 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | 32 | ||
33 | #include <asm/pgtable.h> | 33 | #include <asm/pgtable.h> |
34 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
35 | #include <linux/swapops.h> | 35 | #include <linux/swapops.h> |
36 | #include <linux/page_cgroup.h> | 36 | #include <linux/page_cgroup.h> |
37 | 37 | ||
38 | static DEFINE_SPINLOCK(swap_lock); | 38 | static DEFINE_SPINLOCK(swap_lock); |
39 | static unsigned int nr_swapfiles; | 39 | static unsigned int nr_swapfiles; |
40 | long nr_swap_pages; | 40 | long nr_swap_pages; |
41 | long total_swap_pages; | 41 | long total_swap_pages; |
42 | static int swap_overflow; | 42 | static int swap_overflow; |
43 | static int least_priority; | 43 | static int least_priority; |
44 | 44 | ||
45 | static const char Bad_file[] = "Bad swap file entry "; | 45 | static const char Bad_file[] = "Bad swap file entry "; |
46 | static const char Unused_file[] = "Unused swap file entry "; | 46 | static const char Unused_file[] = "Unused swap file entry "; |
47 | static const char Bad_offset[] = "Bad swap offset entry "; | 47 | static const char Bad_offset[] = "Bad swap offset entry "; |
48 | static const char Unused_offset[] = "Unused swap offset entry "; | 48 | static const char Unused_offset[] = "Unused swap offset entry "; |
49 | 49 | ||
50 | static struct swap_list_t swap_list = {-1, -1}; | 50 | static struct swap_list_t swap_list = {-1, -1}; |
51 | 51 | ||
52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; |
53 | 53 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 54 | static DEFINE_MUTEX(swapon_mutex); |
55 | 55 | ||
56 | /* For reference count accounting in swap_map */ | 56 | /* For reference count accounting in swap_map */ |
57 | /* enum for swap_map[] handling. internal use only */ | 57 | /* enum for swap_map[] handling. internal use only */ |
58 | enum { | 58 | enum { |
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | 59 | SWAP_MAP = 0, /* ops for reference from swap users */ |
60 | SWAP_CACHE, /* ops for reference from swap cache */ | 60 | SWAP_CACHE, /* ops for reference from swap cache */ |
61 | }; | 61 | }; |
62 | 62 | ||
63 | static inline int swap_count(unsigned short ent) | 63 | static inline int swap_count(unsigned short ent) |
64 | { | 64 | { |
65 | return ent & SWAP_COUNT_MASK; | 65 | return ent & SWAP_COUNT_MASK; |
66 | } | 66 | } |
67 | 67 | ||
68 | static inline bool swap_has_cache(unsigned short ent) | 68 | static inline bool swap_has_cache(unsigned short ent) |
69 | { | 69 | { |
70 | return !!(ent & SWAP_HAS_CACHE); | 70 | return !!(ent & SWAP_HAS_CACHE); |
71 | } | 71 | } |
72 | 72 | ||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | 73 | static inline unsigned short encode_swapmap(int count, bool has_cache) |
74 | { | 74 | { |
75 | unsigned short ret = count; | 75 | unsigned short ret = count; |
76 | 76 | ||
77 | if (has_cache) | 77 | if (has_cache) |
78 | return SWAP_HAS_CACHE | ret; | 78 | return SWAP_HAS_CACHE | ret; |
79 | return ret; | 79 | return ret; |
80 | } | 80 | } |
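The three helpers above pack a reference count and the SWAP_HAS_CACHE flag into one unsigned short swap_map entry. An illustrative round trip (not from the source):

    /* count of 3 with the page present in the swap cache */
    unsigned short ent = encode_swapmap(3, true);

    WARN_ON(swap_count(ent) != 3);  /* low SWAP_COUNT_MASK bits: map count */
    WARN_ON(!swap_has_cache(ent));  /* high bit: SWAP_HAS_CACHE is set */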
81 | 81 | ||
82 | /* returns 1 if the swap entry is freed */ | 82 | /* returns 1 if the swap entry is freed */ |
83 | static int | 83 | static int |
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | 84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
85 | { | 85 | { |
86 | int type = si - swap_info; | 86 | int type = si - swap_info; |
87 | swp_entry_t entry = swp_entry(type, offset); | 87 | swp_entry_t entry = swp_entry(type, offset); |
88 | struct page *page; | 88 | struct page *page; |
89 | int ret = 0; | 89 | int ret = 0; |
90 | 90 | ||
91 | page = find_get_page(&swapper_space, entry.val); | 91 | page = find_get_page(&swapper_space, entry.val); |
92 | if (!page) | 92 | if (!page) |
93 | return 0; | 93 | return 0; |
94 | /* | 94 | /* |
95 | * This function is called from scan_swap_map(), which is called | 95 | * This function is called from scan_swap_map(), which is called |
96 | * by vmscan.c when reclaiming pages. So we hold a lock on a page here. | 96 | * by vmscan.c when reclaiming pages. So we hold a lock on a page here. |
97 | * We have to use trylock to avoid deadlock. This is a special | 97 | * We have to use trylock to avoid deadlock. This is a special |
98 | * case and you should use try_to_free_swap() with explicit lock_page() | 98 | * case and you should use try_to_free_swap() with explicit lock_page() |
99 | * in usual operations. | 99 | * in usual operations. |
100 | */ | 100 | */ |
101 | if (trylock_page(page)) { | 101 | if (trylock_page(page)) { |
102 | ret = try_to_free_swap(page); | 102 | ret = try_to_free_swap(page); |
103 | unlock_page(page); | 103 | unlock_page(page); |
104 | } | 104 | } |
105 | page_cache_release(page); | 105 | page_cache_release(page); |
106 | return ret; | 106 | return ret; |
107 | } | 107 | } |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * We need this because the bdev->unplug_fn can sleep and we cannot | 110 | * We need this because the bdev->unplug_fn can sleep and we cannot |
111 | * hold swap_lock while calling the unplug_fn. And swap_lock | 111 | * hold swap_lock while calling the unplug_fn. And swap_lock |
112 | * cannot be turned into a mutex. | 112 | * cannot be turned into a mutex. |
113 | */ | 113 | */ |
114 | static DECLARE_RWSEM(swap_unplug_sem); | 114 | static DECLARE_RWSEM(swap_unplug_sem); |
115 | 115 | ||
116 | void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | 116 | void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) |
117 | { | 117 | { |
118 | swp_entry_t entry; | 118 | swp_entry_t entry; |
119 | 119 | ||
120 | down_read(&swap_unplug_sem); | 120 | down_read(&swap_unplug_sem); |
121 | entry.val = page_private(page); | 121 | entry.val = page_private(page); |
122 | if (PageSwapCache(page)) { | 122 | if (PageSwapCache(page)) { |
123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; |
124 | struct backing_dev_info *bdi; | 124 | struct backing_dev_info *bdi; |
125 | 125 | ||
126 | /* | 126 | /* |
127 | * If the page is removed from swapcache from under us (with a | 127 | * If the page is removed from swapcache from under us (with a |
128 | * racy try_to_unuse/swapoff) we need an additional reference | 128 | * racy try_to_unuse/swapoff) we need an additional reference |
129 | * count to avoid reading garbage from page_private(page) above. | 129 | * count to avoid reading garbage from page_private(page) above. |
130 | * If the WARN_ON triggers during a swapoff it may be this race | 130 | * If the WARN_ON triggers during a swapoff it may be this race |
131 | * condition and is harmless. However, if it triggers without | 131 | * condition and is harmless. However, if it triggers without |
132 | * swapoff it signals a problem. | 132 | * swapoff it signals a problem. |
133 | */ | 133 | */ |
134 | WARN_ON(page_count(page) <= 1); | 134 | WARN_ON(page_count(page) <= 1); |
135 | 135 | ||
136 | bdi = bdev->bd_inode->i_mapping->backing_dev_info; | 136 | bdi = bdev->bd_inode->i_mapping->backing_dev_info; |
137 | blk_run_backing_dev(bdi, page); | 137 | blk_run_backing_dev(bdi, page); |
138 | } | 138 | } |
139 | up_read(&swap_unplug_sem); | 139 | up_read(&swap_unplug_sem); |
140 | } | 140 | } |
141 | 141 | ||
142 | /* | 142 | /* |
143 | * swapon tells the device that all the old swap contents can be discarded, | 143 | * swapon tells the device that all the old swap contents can be discarded, |
144 | * to allow the swap device to optimize its wear-levelling. | 144 | * to allow the swap device to optimize its wear-levelling. |
145 | */ | 145 | */ |
146 | static int discard_swap(struct swap_info_struct *si) | 146 | static int discard_swap(struct swap_info_struct *si) |
147 | { | 147 | { |
148 | struct swap_extent *se; | 148 | struct swap_extent *se; |
149 | int err = 0; | 149 | int err = 0; |
150 | 150 | ||
151 | list_for_each_entry(se, &si->extent_list, list) { | 151 | list_for_each_entry(se, &si->extent_list, list) { |
152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | 152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); |
153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
154 | 154 | ||
155 | if (se->start_page == 0) { | 155 | if (se->start_page == 0) { |
156 | /* Do not discard the swap header page! */ | 156 | /* Do not discard the swap header page! */ |
157 | start_block += 1 << (PAGE_SHIFT - 9); | 157 | start_block += 1 << (PAGE_SHIFT - 9); |
158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | 158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); |
159 | if (!nr_blocks) | 159 | if (!nr_blocks) |
160 | continue; | 160 | continue; |
161 | } | 161 | } |
162 | 162 | ||
163 | err = blkdev_issue_discard(si->bdev, start_block, | 163 | err = blkdev_issue_discard(si->bdev, start_block, |
164 | nr_blocks, GFP_KERNEL); | 164 | nr_blocks, GFP_KERNEL); |
165 | if (err) | 165 | if (err) |
166 | break; | 166 | break; |
167 | 167 | ||
168 | cond_resched(); | 168 | cond_resched(); |
169 | } | 169 | } |
170 | return err; /* That will often be -EOPNOTSUPP */ | 170 | return err; /* That will often be -EOPNOTSUPP */ |
171 | } | 171 | } |
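For orientation, the << (PAGE_SHIFT - 9) shifts above convert page numbers into 512-byte sector numbers before calling blkdev_issue_discard(). An illustrative calculation, assuming 4 KiB pages (PAGE_SHIFT == 12):

    /* one 4 KiB page spans 1 << (12 - 9) == 8 sectors of 512 bytes */
    sector_t start_block = (sector_t)5 << (PAGE_SHIFT - 9);   /* page 5 -> sector 40 */
    sector_t nr_blocks = (sector_t)100 << (PAGE_SHIFT - 9);   /* 100 pages -> 800 sectors */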
172 | 172 | ||
173 | /* | 173 | /* |
174 | * swap allocation tells the device that a cluster of swap can now be discarded, | 174 | * swap allocation tells the device that a cluster of swap can now be discarded, |
175 | * to allow the swap device to optimize its wear-levelling. | 175 | * to allow the swap device to optimize its wear-levelling. |
176 | */ | 176 | */ |
177 | static void discard_swap_cluster(struct swap_info_struct *si, | 177 | static void discard_swap_cluster(struct swap_info_struct *si, |
178 | pgoff_t start_page, pgoff_t nr_pages) | 178 | pgoff_t start_page, pgoff_t nr_pages) |
179 | { | 179 | { |
180 | struct swap_extent *se = si->curr_swap_extent; | 180 | struct swap_extent *se = si->curr_swap_extent; |
181 | int found_extent = 0; | 181 | int found_extent = 0; |
182 | 182 | ||
183 | while (nr_pages) { | 183 | while (nr_pages) { |
184 | struct list_head *lh; | 184 | struct list_head *lh; |
185 | 185 | ||
186 | if (se->start_page <= start_page && | 186 | if (se->start_page <= start_page && |
187 | start_page < se->start_page + se->nr_pages) { | 187 | start_page < se->start_page + se->nr_pages) { |
188 | pgoff_t offset = start_page - se->start_page; | 188 | pgoff_t offset = start_page - se->start_page; |
189 | sector_t start_block = se->start_block + offset; | 189 | sector_t start_block = se->start_block + offset; |
190 | sector_t nr_blocks = se->nr_pages - offset; | 190 | sector_t nr_blocks = se->nr_pages - offset; |
191 | 191 | ||
192 | if (nr_blocks > nr_pages) | 192 | if (nr_blocks > nr_pages) |
193 | nr_blocks = nr_pages; | 193 | nr_blocks = nr_pages; |
194 | start_page += nr_blocks; | 194 | start_page += nr_blocks; |
195 | nr_pages -= nr_blocks; | 195 | nr_pages -= nr_blocks; |
196 | 196 | ||
197 | if (!found_extent++) | 197 | if (!found_extent++) |
198 | si->curr_swap_extent = se; | 198 | si->curr_swap_extent = se; |
199 | 199 | ||
200 | start_block <<= PAGE_SHIFT - 9; | 200 | start_block <<= PAGE_SHIFT - 9; |
201 | nr_blocks <<= PAGE_SHIFT - 9; | 201 | nr_blocks <<= PAGE_SHIFT - 9; |
202 | if (blkdev_issue_discard(si->bdev, start_block, | 202 | if (blkdev_issue_discard(si->bdev, start_block, |
203 | nr_blocks, GFP_NOIO)) | 203 | nr_blocks, GFP_NOIO)) |
204 | break; | 204 | break; |
205 | } | 205 | } |
206 | 206 | ||
207 | lh = se->list.next; | 207 | lh = se->list.next; |
208 | if (lh == &si->extent_list) | 208 | if (lh == &si->extent_list) |
209 | lh = lh->next; | 209 | lh = lh->next; |
210 | se = list_entry(lh, struct swap_extent, list); | 210 | se = list_entry(lh, struct swap_extent, list); |
211 | } | 211 | } |
212 | } | 212 | } |
213 | 213 | ||
214 | static int wait_for_discard(void *word) | 214 | static int wait_for_discard(void *word) |
215 | { | 215 | { |
216 | schedule(); | 216 | schedule(); |
217 | return 0; | 217 | return 0; |
218 | } | 218 | } |
219 | 219 | ||
220 | #define SWAPFILE_CLUSTER 256 | 220 | #define SWAPFILE_CLUSTER 256 |
221 | #define LATENCY_LIMIT 256 | 221 | #define LATENCY_LIMIT 256 |
222 | 222 | ||
223 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, | 223 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
224 | int cache) | 224 | int cache) |
225 | { | 225 | { |
226 | unsigned long offset; | 226 | unsigned long offset; |
227 | unsigned long scan_base; | 227 | unsigned long scan_base; |
228 | unsigned long last_in_cluster = 0; | 228 | unsigned long last_in_cluster = 0; |
229 | int latency_ration = LATENCY_LIMIT; | 229 | int latency_ration = LATENCY_LIMIT; |
230 | int found_free_cluster = 0; | 230 | int found_free_cluster = 0; |
231 | 231 | ||
232 | /* | 232 | /* |
233 | * We try to cluster swap pages by allocating them sequentially | 233 | * We try to cluster swap pages by allocating them sequentially |
234 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this | 234 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this |
235 | * way, however, we resort to first-free allocation, starting | 235 | * way, however, we resort to first-free allocation, starting |
236 | * a new cluster. This prevents us from scattering swap pages | 236 | * a new cluster. This prevents us from scattering swap pages |
237 | * all over the entire swap partition, so that we reduce | 237 | * all over the entire swap partition, so that we reduce |
238 | * overall disk seek times between swap pages. -- sct | 238 | * overall disk seek times between swap pages. -- sct |
239 | * But we do now try to find an empty cluster. -Andrea | 239 | * But we do now try to find an empty cluster. -Andrea |
240 | * And we let swap pages go all over an SSD partition. Hugh | 240 | * And we let swap pages go all over an SSD partition. Hugh |
241 | */ | 241 | */ |
242 | 242 | ||
243 | si->flags += SWP_SCANNING; | 243 | si->flags += SWP_SCANNING; |
244 | scan_base = offset = si->cluster_next; | 244 | scan_base = offset = si->cluster_next; |
245 | 245 | ||
246 | if (unlikely(!si->cluster_nr--)) { | 246 | if (unlikely(!si->cluster_nr--)) { |
247 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { | 247 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
248 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 248 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
249 | goto checks; | 249 | goto checks; |
250 | } | 250 | } |
251 | if (si->flags & SWP_DISCARDABLE) { | 251 | if (si->flags & SWP_DISCARDABLE) { |
252 | /* | 252 | /* |
253 | * Start range check on racing allocations, in case | 253 | * Start range check on racing allocations, in case |
254 | * they overlap the cluster we eventually decide on | 254 | * they overlap the cluster we eventually decide on |
255 | * (we scan without swap_lock to allow preemption). | 255 | * (we scan without swap_lock to allow preemption). |
256 | * It's hardly conceivable that cluster_nr could be | 256 | * It's hardly conceivable that cluster_nr could be |
257 | * wrapped during our scan, but don't depend on it. | 257 | * wrapped during our scan, but don't depend on it. |
258 | */ | 258 | */ |
259 | if (si->lowest_alloc) | 259 | if (si->lowest_alloc) |
260 | goto checks; | 260 | goto checks; |
261 | si->lowest_alloc = si->max; | 261 | si->lowest_alloc = si->max; |
262 | si->highest_alloc = 0; | 262 | si->highest_alloc = 0; |
263 | } | 263 | } |
264 | spin_unlock(&swap_lock); | 264 | spin_unlock(&swap_lock); |
265 | 265 | ||
266 | /* | 266 | /* |
267 | * If seek is expensive, start searching for new cluster from | 267 | * If seek is expensive, start searching for new cluster from |
268 | * start of partition, to minimize the span of allocated swap. | 268 | * start of partition, to minimize the span of allocated swap. |
269 | * But if seek is cheap, search from our current position, so | 269 | * But if seek is cheap, search from our current position, so |
270 | * that swap is allocated from all over the partition: if the | 270 | * that swap is allocated from all over the partition: if the |
271 | * Flash Translation Layer only remaps within limited zones, | 271 | * Flash Translation Layer only remaps within limited zones, |
272 | * we don't want to wear out the first zone too quickly. | 272 | * we don't want to wear out the first zone too quickly. |
273 | */ | 273 | */ |
274 | if (!(si->flags & SWP_SOLIDSTATE)) | 274 | if (!(si->flags & SWP_SOLIDSTATE)) |
275 | scan_base = offset = si->lowest_bit; | 275 | scan_base = offset = si->lowest_bit; |
276 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 276 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
277 | 277 | ||
278 | /* Locate the first empty (unaligned) cluster */ | 278 | /* Locate the first empty (unaligned) cluster */ |
279 | for (; last_in_cluster <= si->highest_bit; offset++) { | 279 | for (; last_in_cluster <= si->highest_bit; offset++) { |
280 | if (si->swap_map[offset]) | 280 | if (si->swap_map[offset]) |
281 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 281 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
282 | else if (offset == last_in_cluster) { | 282 | else if (offset == last_in_cluster) { |
283 | spin_lock(&swap_lock); | 283 | spin_lock(&swap_lock); |
284 | offset -= SWAPFILE_CLUSTER - 1; | 284 | offset -= SWAPFILE_CLUSTER - 1; |
285 | si->cluster_next = offset; | 285 | si->cluster_next = offset; |
286 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 286 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
287 | found_free_cluster = 1; | 287 | found_free_cluster = 1; |
288 | goto checks; | 288 | goto checks; |
289 | } | 289 | } |
290 | if (unlikely(--latency_ration < 0)) { | 290 | if (unlikely(--latency_ration < 0)) { |
291 | cond_resched(); | 291 | cond_resched(); |
292 | latency_ration = LATENCY_LIMIT; | 292 | latency_ration = LATENCY_LIMIT; |
293 | } | 293 | } |
294 | } | 294 | } |
295 | 295 | ||
296 | offset = si->lowest_bit; | 296 | offset = si->lowest_bit; |
297 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 297 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
298 | 298 | ||
299 | /* Locate the first empty (unaligned) cluster */ | 299 | /* Locate the first empty (unaligned) cluster */ |
300 | for (; last_in_cluster < scan_base; offset++) { | 300 | for (; last_in_cluster < scan_base; offset++) { |
301 | if (si->swap_map[offset]) | 301 | if (si->swap_map[offset]) |
302 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 302 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
303 | else if (offset == last_in_cluster) { | 303 | else if (offset == last_in_cluster) { |
304 | spin_lock(&swap_lock); | 304 | spin_lock(&swap_lock); |
305 | offset -= SWAPFILE_CLUSTER - 1; | 305 | offset -= SWAPFILE_CLUSTER - 1; |
306 | si->cluster_next = offset; | 306 | si->cluster_next = offset; |
307 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 307 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
308 | found_free_cluster = 1; | 308 | found_free_cluster = 1; |
309 | goto checks; | 309 | goto checks; |
310 | } | 310 | } |
311 | if (unlikely(--latency_ration < 0)) { | 311 | if (unlikely(--latency_ration < 0)) { |
312 | cond_resched(); | 312 | cond_resched(); |
313 | latency_ration = LATENCY_LIMIT; | 313 | latency_ration = LATENCY_LIMIT; |
314 | } | 314 | } |
315 | } | 315 | } |
316 | 316 | ||
317 | offset = scan_base; | 317 | offset = scan_base; |
318 | spin_lock(&swap_lock); | 318 | spin_lock(&swap_lock); |
319 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 319 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
320 | si->lowest_alloc = 0; | 320 | si->lowest_alloc = 0; |
321 | } | 321 | } |
322 | 322 | ||
323 | checks: | 323 | checks: |
324 | if (!(si->flags & SWP_WRITEOK)) | 324 | if (!(si->flags & SWP_WRITEOK)) |
325 | goto no_page; | 325 | goto no_page; |
326 | if (!si->highest_bit) | 326 | if (!si->highest_bit) |
327 | goto no_page; | 327 | goto no_page; |
328 | if (offset > si->highest_bit) | 328 | if (offset > si->highest_bit) |
329 | scan_base = offset = si->lowest_bit; | 329 | scan_base = offset = si->lowest_bit; |
330 | 330 | ||
331 | /* reuse swap entry of cache-only swap if not busy. */ | 331 | /* reuse swap entry of cache-only swap if not busy. */ |
332 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 332 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
333 | int swap_was_freed; | 333 | int swap_was_freed; |
334 | spin_unlock(&swap_lock); | 334 | spin_unlock(&swap_lock); |
335 | swap_was_freed = __try_to_reclaim_swap(si, offset); | 335 | swap_was_freed = __try_to_reclaim_swap(si, offset); |
336 | spin_lock(&swap_lock); | 336 | spin_lock(&swap_lock); |
337 | /* entry was freed successfully, try to use this again */ | 337 | /* entry was freed successfully, try to use this again */ |
338 | if (swap_was_freed) | 338 | if (swap_was_freed) |
339 | goto checks; | 339 | goto checks; |
340 | goto scan; /* check next one */ | 340 | goto scan; /* check next one */ |
341 | } | 341 | } |
342 | 342 | ||
343 | if (si->swap_map[offset]) | 343 | if (si->swap_map[offset]) |
344 | goto scan; | 344 | goto scan; |
345 | 345 | ||
346 | if (offset == si->lowest_bit) | 346 | if (offset == si->lowest_bit) |
347 | si->lowest_bit++; | 347 | si->lowest_bit++; |
348 | if (offset == si->highest_bit) | 348 | if (offset == si->highest_bit) |
349 | si->highest_bit--; | 349 | si->highest_bit--; |
350 | si->inuse_pages++; | 350 | si->inuse_pages++; |
351 | if (si->inuse_pages == si->pages) { | 351 | if (si->inuse_pages == si->pages) { |
352 | si->lowest_bit = si->max; | 352 | si->lowest_bit = si->max; |
353 | si->highest_bit = 0; | 353 | si->highest_bit = 0; |
354 | } | 354 | } |
355 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ | 355 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ |
356 | si->swap_map[offset] = encode_swapmap(0, true); | 356 | si->swap_map[offset] = encode_swapmap(0, true); |
357 | else /* at suspend */ | 357 | else /* at suspend */ |
358 | si->swap_map[offset] = encode_swapmap(1, false); | 358 | si->swap_map[offset] = encode_swapmap(1, false); |
359 | si->cluster_next = offset + 1; | 359 | si->cluster_next = offset + 1; |
360 | si->flags -= SWP_SCANNING; | 360 | si->flags -= SWP_SCANNING; |
361 | 361 | ||
362 | if (si->lowest_alloc) { | 362 | if (si->lowest_alloc) { |
363 | /* | 363 | /* |
364 | * Only set when SWP_DISCARDABLE, and there's a scan | 364 | * Only set when SWP_DISCARDABLE, and there's a scan |
365 | * for a free cluster in progress or just completed. | 365 | * for a free cluster in progress or just completed. |
366 | */ | 366 | */ |
367 | if (found_free_cluster) { | 367 | if (found_free_cluster) { |
368 | /* | 368 | /* |
369 | * To optimize wear-levelling, discard the | 369 | * To optimize wear-levelling, discard the |
370 | * old data of the cluster, taking care not to | 370 | * old data of the cluster, taking care not to |
371 | * discard any of its pages that have already | 371 | * discard any of its pages that have already |
372 | * been allocated by racing tasks (offset has | 372 | * been allocated by racing tasks (offset has |
373 | * already stepped over any at the beginning). | 373 | * already stepped over any at the beginning). |
374 | */ | 374 | */ |
375 | if (offset < si->highest_alloc && | 375 | if (offset < si->highest_alloc && |
376 | si->lowest_alloc <= last_in_cluster) | 376 | si->lowest_alloc <= last_in_cluster) |
377 | last_in_cluster = si->lowest_alloc - 1; | 377 | last_in_cluster = si->lowest_alloc - 1; |
378 | si->flags |= SWP_DISCARDING; | 378 | si->flags |= SWP_DISCARDING; |
379 | spin_unlock(&swap_lock); | 379 | spin_unlock(&swap_lock); |
380 | 380 | ||
381 | if (offset < last_in_cluster) | 381 | if (offset < last_in_cluster) |
382 | discard_swap_cluster(si, offset, | 382 | discard_swap_cluster(si, offset, |
383 | last_in_cluster - offset + 1); | 383 | last_in_cluster - offset + 1); |
384 | 384 | ||
385 | spin_lock(&swap_lock); | 385 | spin_lock(&swap_lock); |
386 | si->lowest_alloc = 0; | 386 | si->lowest_alloc = 0; |
387 | si->flags &= ~SWP_DISCARDING; | 387 | si->flags &= ~SWP_DISCARDING; |
388 | 388 | ||
389 | smp_mb(); /* wake_up_bit advises this */ | 389 | smp_mb(); /* wake_up_bit advises this */ |
390 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | 390 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); |
391 | 391 | ||
392 | } else if (si->flags & SWP_DISCARDING) { | 392 | } else if (si->flags & SWP_DISCARDING) { |
393 | /* | 393 | /* |
394 | * Delay using pages allocated by racing tasks | 394 | * Delay using pages allocated by racing tasks |
395 | * until the whole discard has been issued. We | 395 | * until the whole discard has been issued. We |
396 | * could defer that delay until swap_writepage, | 396 | * could defer that delay until swap_writepage, |
397 | * but it's easier to keep this self-contained. | 397 | * but it's easier to keep this self-contained. |
398 | */ | 398 | */ |
399 | spin_unlock(&swap_lock); | 399 | spin_unlock(&swap_lock); |
400 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | 400 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), |
401 | wait_for_discard, TASK_UNINTERRUPTIBLE); | 401 | wait_for_discard, TASK_UNINTERRUPTIBLE); |
402 | spin_lock(&swap_lock); | 402 | spin_lock(&swap_lock); |
403 | } else { | 403 | } else { |
404 | /* | 404 | /* |
405 | * Note pages allocated by racing tasks while | 405 | * Note pages allocated by racing tasks while |
406 | * scan for a free cluster is in progress, so | 406 | * scan for a free cluster is in progress, so |
407 | * that its final discard can exclude them. | 407 | * that its final discard can exclude them. |
408 | */ | 408 | */ |
409 | if (offset < si->lowest_alloc) | 409 | if (offset < si->lowest_alloc) |
410 | si->lowest_alloc = offset; | 410 | si->lowest_alloc = offset; |
411 | if (offset > si->highest_alloc) | 411 | if (offset > si->highest_alloc) |
412 | si->highest_alloc = offset; | 412 | si->highest_alloc = offset; |
413 | } | 413 | } |
414 | } | 414 | } |
415 | return offset; | 415 | return offset; |
416 | 416 | ||
417 | scan: | 417 | scan: |
418 | spin_unlock(&swap_lock); | 418 | spin_unlock(&swap_lock); |
419 | while (++offset <= si->highest_bit) { | 419 | while (++offset <= si->highest_bit) { |
420 | if (!si->swap_map[offset]) { | 420 | if (!si->swap_map[offset]) { |
421 | spin_lock(&swap_lock); | 421 | spin_lock(&swap_lock); |
422 | goto checks; | 422 | goto checks; |
423 | } | 423 | } |
424 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 424 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
425 | spin_lock(&swap_lock); | 425 | spin_lock(&swap_lock); |
426 | goto checks; | 426 | goto checks; |
427 | } | 427 | } |
428 | if (unlikely(--latency_ration < 0)) { | 428 | if (unlikely(--latency_ration < 0)) { |
429 | cond_resched(); | 429 | cond_resched(); |
430 | latency_ration = LATENCY_LIMIT; | 430 | latency_ration = LATENCY_LIMIT; |
431 | } | 431 | } |
432 | } | 432 | } |
433 | offset = si->lowest_bit; | 433 | offset = si->lowest_bit; |
434 | while (++offset < scan_base) { | 434 | while (++offset < scan_base) { |
435 | if (!si->swap_map[offset]) { | 435 | if (!si->swap_map[offset]) { |
436 | spin_lock(&swap_lock); | 436 | spin_lock(&swap_lock); |
437 | goto checks; | 437 | goto checks; |
438 | } | 438 | } |
439 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 439 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
440 | spin_lock(&swap_lock); | 440 | spin_lock(&swap_lock); |
441 | goto checks; | 441 | goto checks; |
442 | } | 442 | } |
443 | if (unlikely(--latency_ration < 0)) { | 443 | if (unlikely(--latency_ration < 0)) { |
444 | cond_resched(); | 444 | cond_resched(); |
445 | latency_ration = LATENCY_LIMIT; | 445 | latency_ration = LATENCY_LIMIT; |
446 | } | 446 | } |
447 | } | 447 | } |
448 | spin_lock(&swap_lock); | 448 | spin_lock(&swap_lock); |
449 | 449 | ||
450 | no_page: | 450 | no_page: |
451 | si->flags -= SWP_SCANNING; | 451 | si->flags -= SWP_SCANNING; |
452 | return 0; | 452 | return 0; |
453 | } | 453 | } |
454 | 454 | ||
455 | swp_entry_t get_swap_page(void) | 455 | swp_entry_t get_swap_page(void) |
456 | { | 456 | { |
457 | struct swap_info_struct *si; | 457 | struct swap_info_struct *si; |
458 | pgoff_t offset; | 458 | pgoff_t offset; |
459 | int type, next; | 459 | int type, next; |
460 | int wrapped = 0; | 460 | int wrapped = 0; |
461 | 461 | ||
462 | spin_lock(&swap_lock); | 462 | spin_lock(&swap_lock); |
463 | if (nr_swap_pages <= 0) | 463 | if (nr_swap_pages <= 0) |
464 | goto noswap; | 464 | goto noswap; |
465 | nr_swap_pages--; | 465 | nr_swap_pages--; |
466 | 466 | ||
467 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 467 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
468 | si = swap_info + type; | 468 | si = swap_info + type; |
469 | next = si->next; | 469 | next = si->next; |
470 | if (next < 0 || | 470 | if (next < 0 || |
471 | (!wrapped && si->prio != swap_info[next].prio)) { | 471 | (!wrapped && si->prio != swap_info[next].prio)) { |
472 | next = swap_list.head; | 472 | next = swap_list.head; |
473 | wrapped++; | 473 | wrapped++; |
474 | } | 474 | } |
475 | 475 | ||
476 | if (!si->highest_bit) | 476 | if (!si->highest_bit) |
477 | continue; | 477 | continue; |
478 | if (!(si->flags & SWP_WRITEOK)) | 478 | if (!(si->flags & SWP_WRITEOK)) |
479 | continue; | 479 | continue; |
480 | 480 | ||
481 | swap_list.next = next; | 481 | swap_list.next = next; |
482 | /* This is called for allocating swap entry for cache */ | 482 | /* This is called for allocating swap entry for cache */ |
483 | offset = scan_swap_map(si, SWAP_CACHE); | 483 | offset = scan_swap_map(si, SWAP_CACHE); |
484 | if (offset) { | 484 | if (offset) { |
485 | spin_unlock(&swap_lock); | 485 | spin_unlock(&swap_lock); |
486 | return swp_entry(type, offset); | 486 | return swp_entry(type, offset); |
487 | } | 487 | } |
488 | next = swap_list.next; | 488 | next = swap_list.next; |
489 | } | 489 | } |
490 | 490 | ||
491 | nr_swap_pages++; | 491 | nr_swap_pages++; |
492 | noswap: | 492 | noswap: |
493 | spin_unlock(&swap_lock); | 493 | spin_unlock(&swap_lock); |
494 | return (swp_entry_t) {0}; | 494 | return (swp_entry_t) {0}; |
495 | } | 495 | } |
496 | 496 | ||
497 | /* The only caller of this function is now the suspend routine */ | 497 | /* The only caller of this function is now the suspend routine */ |
498 | swp_entry_t get_swap_page_of_type(int type) | 498 | swp_entry_t get_swap_page_of_type(int type) |
499 | { | 499 | { |
500 | struct swap_info_struct *si; | 500 | struct swap_info_struct *si; |
501 | pgoff_t offset; | 501 | pgoff_t offset; |
502 | 502 | ||
503 | spin_lock(&swap_lock); | 503 | spin_lock(&swap_lock); |
504 | si = swap_info + type; | 504 | si = swap_info + type; |
505 | if (si->flags & SWP_WRITEOK) { | 505 | if (si->flags & SWP_WRITEOK) { |
506 | nr_swap_pages--; | 506 | nr_swap_pages--; |
507 | /* This is called for allocating swap entry, not cache */ | 507 | /* This is called for allocating swap entry, not cache */ |
508 | offset = scan_swap_map(si, SWAP_MAP); | 508 | offset = scan_swap_map(si, SWAP_MAP); |
509 | if (offset) { | 509 | if (offset) { |
510 | spin_unlock(&swap_lock); | 510 | spin_unlock(&swap_lock); |
511 | return swp_entry(type, offset); | 511 | return swp_entry(type, offset); |
512 | } | 512 | } |
513 | nr_swap_pages++; | 513 | nr_swap_pages++; |
514 | } | 514 | } |
515 | spin_unlock(&swap_lock); | 515 | spin_unlock(&swap_lock); |
516 | return (swp_entry_t) {0}; | 516 | return (swp_entry_t) {0}; |
517 | } | 517 | } |
518 | 518 | ||
519 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 519 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) |
520 | { | 520 | { |
521 | struct swap_info_struct * p; | 521 | struct swap_info_struct * p; |
522 | unsigned long offset, type; | 522 | unsigned long offset, type; |
523 | 523 | ||
524 | if (!entry.val) | 524 | if (!entry.val) |
525 | goto out; | 525 | goto out; |
526 | type = swp_type(entry); | 526 | type = swp_type(entry); |
527 | if (type >= nr_swapfiles) | 527 | if (type >= nr_swapfiles) |
528 | goto bad_nofile; | 528 | goto bad_nofile; |
529 | p = & swap_info[type]; | 529 | p = & swap_info[type]; |
530 | if (!(p->flags & SWP_USED)) | 530 | if (!(p->flags & SWP_USED)) |
531 | goto bad_device; | 531 | goto bad_device; |
532 | offset = swp_offset(entry); | 532 | offset = swp_offset(entry); |
533 | if (offset >= p->max) | 533 | if (offset >= p->max) |
534 | goto bad_offset; | 534 | goto bad_offset; |
535 | if (!p->swap_map[offset]) | 535 | if (!p->swap_map[offset]) |
536 | goto bad_free; | 536 | goto bad_free; |
537 | spin_lock(&swap_lock); | 537 | spin_lock(&swap_lock); |
538 | return p; | 538 | return p; |
539 | 539 | ||
540 | bad_free: | 540 | bad_free: |
541 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); | 541 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); |
542 | goto out; | 542 | goto out; |
543 | bad_offset: | 543 | bad_offset: |
544 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); | 544 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); |
545 | goto out; | 545 | goto out; |
546 | bad_device: | 546 | bad_device: |
547 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); | 547 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); |
548 | goto out; | 548 | goto out; |
549 | bad_nofile: | 549 | bad_nofile: |
550 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); | 550 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); |
551 | out: | 551 | out: |
552 | return NULL; | 552 | return NULL; |
553 | } | 553 | } |
554 | 554 | ||
555 | static int swap_entry_free(struct swap_info_struct *p, | 555 | static int swap_entry_free(struct swap_info_struct *p, |
556 | swp_entry_t ent, int cache) | 556 | swp_entry_t ent, int cache) |
557 | { | 557 | { |
558 | unsigned long offset = swp_offset(ent); | 558 | unsigned long offset = swp_offset(ent); |
559 | int count = swap_count(p->swap_map[offset]); | 559 | int count = swap_count(p->swap_map[offset]); |
560 | bool has_cache; | 560 | bool has_cache; |
561 | 561 | ||
562 | has_cache = swap_has_cache(p->swap_map[offset]); | 562 | has_cache = swap_has_cache(p->swap_map[offset]); |
563 | 563 | ||
564 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ | 564 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ |
565 | if (count < SWAP_MAP_MAX) { | 565 | if (count < SWAP_MAP_MAX) { |
566 | count--; | 566 | count--; |
567 | p->swap_map[offset] = encode_swapmap(count, has_cache); | 567 | p->swap_map[offset] = encode_swapmap(count, has_cache); |
568 | } | 568 | } |
569 | } else { /* dropping swap cache flag */ | 569 | } else { /* dropping swap cache flag */ |
570 | VM_BUG_ON(!has_cache); | 570 | VM_BUG_ON(!has_cache); |
571 | p->swap_map[offset] = encode_swapmap(count, false); | 571 | p->swap_map[offset] = encode_swapmap(count, false); |
572 | 572 | ||
573 | } | 573 | } |
574 | /* return code. */ | 574 | /* return code. */ |
575 | count = p->swap_map[offset]; | 575 | count = p->swap_map[offset]; |
576 | /* free if no reference */ | 576 | /* free if no reference */ |
577 | if (!count) { | 577 | if (!count) { |
578 | if (offset < p->lowest_bit) | 578 | if (offset < p->lowest_bit) |
579 | p->lowest_bit = offset; | 579 | p->lowest_bit = offset; |
580 | if (offset > p->highest_bit) | 580 | if (offset > p->highest_bit) |
581 | p->highest_bit = offset; | 581 | p->highest_bit = offset; |
582 | if (p->prio > swap_info[swap_list.next].prio) | 582 | if (p->prio > swap_info[swap_list.next].prio) |
583 | swap_list.next = p - swap_info; | 583 | swap_list.next = p - swap_info; |
584 | nr_swap_pages++; | 584 | nr_swap_pages++; |
585 | p->inuse_pages--; | 585 | p->inuse_pages--; |
586 | } | 586 | } |
587 | if (!swap_count(count)) | 587 | if (!swap_count(count)) |
588 | mem_cgroup_uncharge_swap(ent); | 588 | mem_cgroup_uncharge_swap(ent); |
589 | return count; | 589 | return count; |
590 | } | 590 | } |
591 | 591 | ||
592 | /* | 592 | /* |
593 | * Caller has made sure that the swapdevice corresponding to entry | 593 | * Caller has made sure that the swapdevice corresponding to entry |
594 | * is still around or has not been recycled. | 594 | * is still around or has not been recycled. |
595 | */ | 595 | */ |
596 | void swap_free(swp_entry_t entry) | 596 | void swap_free(swp_entry_t entry) |
597 | { | 597 | { |
598 | struct swap_info_struct *p; | 598 | struct swap_info_struct *p; |
599 | 599 | ||
600 | p = swap_info_get(entry); | 600 | p = swap_info_get(entry); |
601 | if (p) { | 601 | if (p) { |
602 | swap_entry_free(p, entry, SWAP_MAP); | 602 | swap_entry_free(p, entry, SWAP_MAP); |
603 | spin_unlock(&swap_lock); | 603 | spin_unlock(&swap_lock); |
604 | } | 604 | } |
605 | } | 605 | } |
606 | 606 | ||
607 | /* | 607 | /* |
608 | * Called after dropping swapcache to decrease refcnt to swap entries. | 608 | * Called after dropping swapcache to decrease refcnt to swap entries. |
609 | */ | 609 | */ |
610 | void swapcache_free(swp_entry_t entry, struct page *page) | 610 | void swapcache_free(swp_entry_t entry, struct page *page) |
611 | { | 611 | { |
612 | struct swap_info_struct *p; | 612 | struct swap_info_struct *p; |
613 | int ret; | 613 | int ret; |
614 | 614 | ||
615 | p = swap_info_get(entry); | 615 | p = swap_info_get(entry); |
616 | if (p) { | 616 | if (p) { |
617 | ret = swap_entry_free(p, entry, SWAP_CACHE); | 617 | ret = swap_entry_free(p, entry, SWAP_CACHE); |
618 | if (page) { | 618 | if (page) { |
619 | bool swapout; | 619 | bool swapout; |
620 | if (ret) | 620 | if (ret) |
621 | swapout = true; /* the end of swap out */ | 621 | swapout = true; /* the end of swap out */ |
622 | else | 622 | else |
623 | swapout = false; /* no more swap users! */ | 623 | swapout = false; /* no more swap users! */ |
624 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | 624 | mem_cgroup_uncharge_swapcache(page, entry, swapout); |
625 | } | 625 | } |
626 | spin_unlock(&swap_lock); | 626 | spin_unlock(&swap_lock); |
627 | } | 627 | } |
628 | return; | 628 | return; |
629 | } | 629 | } |
630 | 630 | ||
631 | /* | 631 | /* |
632 | * How many references to page are currently swapped out? | 632 | * How many references to page are currently swapped out? |
633 | */ | 633 | */ |
634 | static inline int page_swapcount(struct page *page) | 634 | static inline int page_swapcount(struct page *page) |
635 | { | 635 | { |
636 | int count = 0; | 636 | int count = 0; |
637 | struct swap_info_struct *p; | 637 | struct swap_info_struct *p; |
638 | swp_entry_t entry; | 638 | swp_entry_t entry; |
639 | 639 | ||
640 | entry.val = page_private(page); | 640 | entry.val = page_private(page); |
641 | p = swap_info_get(entry); | 641 | p = swap_info_get(entry); |
642 | if (p) { | 642 | if (p) { |
643 | count = swap_count(p->swap_map[swp_offset(entry)]); | 643 | count = swap_count(p->swap_map[swp_offset(entry)]); |
644 | spin_unlock(&swap_lock); | 644 | spin_unlock(&swap_lock); |
645 | } | 645 | } |
646 | return count; | 646 | return count; |
647 | } | 647 | } |
648 | 648 | ||
649 | /* | 649 | /* |
650 | * We can write to an anon page without COW if there are no other references | 650 | * We can write to an anon page without COW if there are no other references |
651 | * to it. And as a side-effect, free up its swap: because the old content | 651 | * to it. And as a side-effect, free up its swap: because the old content |
652 | * on disk will never be read, and seeking back there to write new content | 652 | * on disk will never be read, and seeking back there to write new content |
653 | * later would only waste time away from clustering. | 653 | * later would only waste time away from clustering. |
654 | */ | 654 | */ |
655 | int reuse_swap_page(struct page *page) | 655 | int reuse_swap_page(struct page *page) |
656 | { | 656 | { |
657 | int count; | 657 | int count; |
658 | 658 | ||
659 | VM_BUG_ON(!PageLocked(page)); | 659 | VM_BUG_ON(!PageLocked(page)); |
660 | count = page_mapcount(page); | 660 | count = page_mapcount(page); |
661 | if (count <= 1 && PageSwapCache(page)) { | 661 | if (count <= 1 && PageSwapCache(page)) { |
662 | count += page_swapcount(page); | 662 | count += page_swapcount(page); |
663 | if (count == 1 && !PageWriteback(page)) { | 663 | if (count == 1 && !PageWriteback(page)) { |
664 | delete_from_swap_cache(page); | 664 | delete_from_swap_cache(page); |
665 | SetPageDirty(page); | 665 | SetPageDirty(page); |
666 | } | 666 | } |
667 | } | 667 | } |
668 | return count == 1; | 668 | return count == 1; |
669 | } | 669 | } |
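
To make the count arithmetic above concrete, a hypothetical case:

	/*
	 * Example: a page mapped by exactly one pte (page_mapcount == 1)
	 * whose swap entry has no other users (page_swapcount == 0) gives
	 * count == 1, so its swap slot is freed and the caller may write
	 * in place without COW.
	 */
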
670 | 670 | ||
671 | /* | 671 | /* |
672 | * If swap is getting full, or if there are no more mappings of this page, | 672 | * If swap is getting full, or if there are no more mappings of this page, |
673 | * then try_to_free_swap is called to free its swap space. | 673 | * then try_to_free_swap is called to free its swap space. |
674 | */ | 674 | */ |
675 | int try_to_free_swap(struct page *page) | 675 | int try_to_free_swap(struct page *page) |
676 | { | 676 | { |
677 | VM_BUG_ON(!PageLocked(page)); | 677 | VM_BUG_ON(!PageLocked(page)); |
678 | 678 | ||
679 | if (!PageSwapCache(page)) | 679 | if (!PageSwapCache(page)) |
680 | return 0; | 680 | return 0; |
681 | if (PageWriteback(page)) | 681 | if (PageWriteback(page)) |
682 | return 0; | 682 | return 0; |
683 | if (page_swapcount(page)) | 683 | if (page_swapcount(page)) |
684 | return 0; | 684 | return 0; |
685 | 685 | ||
686 | delete_from_swap_cache(page); | 686 | delete_from_swap_cache(page); |
687 | SetPageDirty(page); | 687 | SetPageDirty(page); |
688 | return 1; | 688 | return 1; |
689 | } | 689 | } |
690 | 690 | ||
691 | /* | 691 | /* |
692 | * Free the swap entry like above, but also try to | 692 | * Free the swap entry like above, but also try to |
693 | * free the page cache entry if it is the last user. | 693 | * free the page cache entry if it is the last user. |
694 | */ | 694 | */ |
695 | int free_swap_and_cache(swp_entry_t entry) | 695 | int free_swap_and_cache(swp_entry_t entry) |
696 | { | 696 | { |
697 | struct swap_info_struct *p; | 697 | struct swap_info_struct *p; |
698 | struct page *page = NULL; | 698 | struct page *page = NULL; |
699 | 699 | ||
700 | if (is_migration_entry(entry)) | 700 | if (is_migration_entry(entry)) |
701 | return 1; | 701 | return 1; |
702 | 702 | ||
703 | p = swap_info_get(entry); | 703 | p = swap_info_get(entry); |
704 | if (p) { | 704 | if (p) { |
705 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { | 705 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { |
706 | page = find_get_page(&swapper_space, entry.val); | 706 | page = find_get_page(&swapper_space, entry.val); |
707 | if (page && !trylock_page(page)) { | 707 | if (page && !trylock_page(page)) { |
708 | page_cache_release(page); | 708 | page_cache_release(page); |
709 | page = NULL; | 709 | page = NULL; |
710 | } | 710 | } |
711 | } | 711 | } |
712 | spin_unlock(&swap_lock); | 712 | spin_unlock(&swap_lock); |
713 | } | 713 | } |
714 | if (page) { | 714 | if (page) { |
715 | /* | 715 | /* |
716 | * Not mapped elsewhere, or swap space full? Free it! | 716 | * Not mapped elsewhere, or swap space full? Free it! |
717 | * Also recheck PageSwapCache now page is locked (above). | 717 | * Also recheck PageSwapCache now page is locked (above). |
718 | */ | 718 | */ |
719 | if (PageSwapCache(page) && !PageWriteback(page) && | 719 | if (PageSwapCache(page) && !PageWriteback(page) && |
720 | (!page_mapped(page) || vm_swap_full())) { | 720 | (!page_mapped(page) || vm_swap_full())) { |
721 | delete_from_swap_cache(page); | 721 | delete_from_swap_cache(page); |
722 | SetPageDirty(page); | 722 | SetPageDirty(page); |
723 | } | 723 | } |
724 | unlock_page(page); | 724 | unlock_page(page); |
725 | page_cache_release(page); | 725 | page_cache_release(page); |
726 | } | 726 | } |
727 | return p != NULL; | 727 | return p != NULL; |
728 | } | 728 | } |
729 | 729 | ||
730 | #ifdef CONFIG_HIBERNATION | 730 | #ifdef CONFIG_HIBERNATION |
731 | /* | 731 | /* |
732 | * Find the swap type that corresponds to given device (if any). | 732 | * Find the swap type that corresponds to given device (if any). |
733 | * | 733 | * |
734 | * @offset - number of the PAGE_SIZE-sized block of the device, starting | 734 | * @offset - number of the PAGE_SIZE-sized block of the device, starting |
735 | * from 0, in which the swap header is expected to be located. | 735 | * from 0, in which the swap header is expected to be located. |
736 | * | 736 | * |
737 | * This is needed for the suspend to disk (aka swsusp). | 737 | * This is needed for the suspend to disk (aka swsusp). |
738 | */ | 738 | */ |
739 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | 739 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
740 | { | 740 | { |
741 | struct block_device *bdev = NULL; | 741 | struct block_device *bdev = NULL; |
742 | int i; | 742 | int i; |
743 | 743 | ||
744 | if (device) | 744 | if (device) |
745 | bdev = bdget(device); | 745 | bdev = bdget(device); |
746 | 746 | ||
747 | spin_lock(&swap_lock); | 747 | spin_lock(&swap_lock); |
748 | for (i = 0; i < nr_swapfiles; i++) { | 748 | for (i = 0; i < nr_swapfiles; i++) { |
749 | struct swap_info_struct *sis = swap_info + i; | 749 | struct swap_info_struct *sis = swap_info + i; |
750 | 750 | ||
751 | if (!(sis->flags & SWP_WRITEOK)) | 751 | if (!(sis->flags & SWP_WRITEOK)) |
752 | continue; | 752 | continue; |
753 | 753 | ||
754 | if (!bdev) { | 754 | if (!bdev) { |
755 | if (bdev_p) | 755 | if (bdev_p) |
756 | *bdev_p = bdget(sis->bdev->bd_dev); | 756 | *bdev_p = bdgrab(sis->bdev); |
757 | 757 | ||
758 | spin_unlock(&swap_lock); | 758 | spin_unlock(&swap_lock); |
759 | return i; | 759 | return i; |
760 | } | 760 | } |
761 | if (bdev == sis->bdev) { | 761 | if (bdev == sis->bdev) { |
762 | struct swap_extent *se; | 762 | struct swap_extent *se; |
763 | 763 | ||
764 | se = list_entry(sis->extent_list.next, | 764 | se = list_entry(sis->extent_list.next, |
765 | struct swap_extent, list); | 765 | struct swap_extent, list); |
766 | if (se->start_block == offset) { | 766 | if (se->start_block == offset) { |
767 | if (bdev_p) | 767 | if (bdev_p) |
768 | *bdev_p = bdget(sis->bdev->bd_dev); | 768 | *bdev_p = bdgrab(sis->bdev); |
769 | 769 | ||
770 | spin_unlock(&swap_lock); | 770 | spin_unlock(&swap_lock); |
771 | bdput(bdev); | 771 | bdput(bdev); |
772 | return i; | 772 | return i; |
773 | } | 773 | } |
774 | } | 774 | } |
775 | } | 775 | } |
776 | spin_unlock(&swap_lock); | 776 | spin_unlock(&swap_lock); |
777 | if (bdev) | 777 | if (bdev) |
778 | bdput(bdev); | 778 | bdput(bdev); |
779 | 779 | ||
780 | return -ENODEV; | 780 | return -ENODEV; |
781 | } | 781 | } |
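
The two hunks above (lines 756 and 768) are the point of this patch: under
swap_lock the code now calls bdgrab(sis->bdev) instead of
bdget(sis->bdev->bd_dev). A sketch of the helper's assumed shape (its
definition belongs to fs/block_dev.c): it only copies a reference that is
already held, so it cannot sleep under the spinlock:

	/* Sketch: grab an extra reference to an already-referenced bdev. */
	struct block_device *bdgrab(struct block_device *bdev)
	{
		atomic_inc(&bdev->bd_inode->i_count);
		return bdev;
	}
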
782 | 782 | ||
783 | /* | 783 | /* |
784 | * Return either the total number of swap pages of given type, or the number | 784 | * Return either the total number of swap pages of given type, or the number |
785 | * of free pages of that type (depending on @free) | 785 | * of free pages of that type (depending on @free) |
786 | * | 786 | * |
787 | * This is needed for software suspend | 787 | * This is needed for software suspend |
788 | */ | 788 | */ |
789 | unsigned int count_swap_pages(int type, int free) | 789 | unsigned int count_swap_pages(int type, int free) |
790 | { | 790 | { |
791 | unsigned int n = 0; | 791 | unsigned int n = 0; |
792 | 792 | ||
793 | if (type < nr_swapfiles) { | 793 | if (type < nr_swapfiles) { |
794 | spin_lock(&swap_lock); | 794 | spin_lock(&swap_lock); |
795 | if (swap_info[type].flags & SWP_WRITEOK) { | 795 | if (swap_info[type].flags & SWP_WRITEOK) { |
796 | n = swap_info[type].pages; | 796 | n = swap_info[type].pages; |
797 | if (free) | 797 | if (free) |
798 | n -= swap_info[type].inuse_pages; | 798 | n -= swap_info[type].inuse_pages; |
799 | } | 799 | } |
800 | spin_unlock(&swap_lock); | 800 | spin_unlock(&swap_lock); |
801 | } | 801 | } |
802 | return n; | 802 | return n; |
803 | } | 803 | } |
804 | #endif | 804 | #endif |
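
A hypothetical use of count_swap_pages(), in the spirit of the hibernation
code that sizes its image against available swap:

	/* Sketch: does swap type 'type' still have room for nr_pages pages? */
	static int enough_swap_sketch(int type, unsigned long nr_pages)
	{
		return count_swap_pages(type, 1 /* free pages only */) >= nr_pages;
	}
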
805 | 805 | ||
806 | /* | 806 | /* |
807 | * No need to decide whether this PTE shares the swap entry with others, | 807 | * No need to decide whether this PTE shares the swap entry with others, |
808 | * just let do_wp_page work it out if a write is requested later - to | 808 | * just let do_wp_page work it out if a write is requested later - to |
809 | * force COW, vm_page_prot omits write permission from any private vma. | 809 | * force COW, vm_page_prot omits write permission from any private vma. |
810 | */ | 810 | */ |
811 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 811 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
812 | unsigned long addr, swp_entry_t entry, struct page *page) | 812 | unsigned long addr, swp_entry_t entry, struct page *page) |
813 | { | 813 | { |
814 | struct mem_cgroup *ptr = NULL; | 814 | struct mem_cgroup *ptr = NULL; |
815 | spinlock_t *ptl; | 815 | spinlock_t *ptl; |
816 | pte_t *pte; | 816 | pte_t *pte; |
817 | int ret = 1; | 817 | int ret = 1; |
818 | 818 | ||
819 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { | 819 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { |
820 | ret = -ENOMEM; | 820 | ret = -ENOMEM; |
821 | goto out_nolock; | 821 | goto out_nolock; |
822 | } | 822 | } |
823 | 823 | ||
824 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 824 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
825 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 825 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
826 | if (ret > 0) | 826 | if (ret > 0) |
827 | mem_cgroup_cancel_charge_swapin(ptr); | 827 | mem_cgroup_cancel_charge_swapin(ptr); |
828 | ret = 0; | 828 | ret = 0; |
829 | goto out; | 829 | goto out; |
830 | } | 830 | } |
831 | 831 | ||
832 | inc_mm_counter(vma->vm_mm, anon_rss); | 832 | inc_mm_counter(vma->vm_mm, anon_rss); |
833 | get_page(page); | 833 | get_page(page); |
834 | set_pte_at(vma->vm_mm, addr, pte, | 834 | set_pte_at(vma->vm_mm, addr, pte, |
835 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 835 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
836 | page_add_anon_rmap(page, vma, addr); | 836 | page_add_anon_rmap(page, vma, addr); |
837 | mem_cgroup_commit_charge_swapin(page, ptr); | 837 | mem_cgroup_commit_charge_swapin(page, ptr); |
838 | swap_free(entry); | 838 | swap_free(entry); |
839 | /* | 839 | /* |
840 | * Move the page to the active list so it is not | 840 | * Move the page to the active list so it is not |
841 | * immediately swapped out again after swapon. | 841 | * immediately swapped out again after swapon. |
842 | */ | 842 | */ |
843 | activate_page(page); | 843 | activate_page(page); |
844 | out: | 844 | out: |
845 | pte_unmap_unlock(pte, ptl); | 845 | pte_unmap_unlock(pte, ptl); |
846 | out_nolock: | 846 | out_nolock: |
847 | return ret; | 847 | return ret; |
848 | } | 848 | } |
849 | 849 | ||
850 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 850 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
851 | unsigned long addr, unsigned long end, | 851 | unsigned long addr, unsigned long end, |
852 | swp_entry_t entry, struct page *page) | 852 | swp_entry_t entry, struct page *page) |
853 | { | 853 | { |
854 | pte_t swp_pte = swp_entry_to_pte(entry); | 854 | pte_t swp_pte = swp_entry_to_pte(entry); |
855 | pte_t *pte; | 855 | pte_t *pte; |
856 | int ret = 0; | 856 | int ret = 0; |
857 | 857 | ||
858 | /* | 858 | /* |
859 | * We don't actually need pte lock while scanning for swp_pte: since | 859 | * We don't actually need pte lock while scanning for swp_pte: since |
860 | * we hold page lock and mmap_sem, swp_pte cannot be inserted into the | 860 | * we hold page lock and mmap_sem, swp_pte cannot be inserted into the |
861 | * page table while we're scanning; though it could get zapped, and on | 861 | * page table while we're scanning; though it could get zapped, and on |
862 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse | 862 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse |
863 | * of unmatched parts which look like swp_pte, so unuse_pte must | 863 | * of unmatched parts which look like swp_pte, so unuse_pte must |
864 | * recheck under pte lock. Scanning without pte lock lets it be | 864 | * recheck under pte lock. Scanning without pte lock lets it be |
865 | * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. | 865 | * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. |
866 | */ | 866 | */ |
867 | pte = pte_offset_map(pmd, addr); | 867 | pte = pte_offset_map(pmd, addr); |
868 | do { | 868 | do { |
869 | /* | 869 | /* |
870 | * swapoff spends a _lot_ of time in this loop! | 870 | * swapoff spends a _lot_ of time in this loop! |
871 | * Test inline before going to call unuse_pte. | 871 | * Test inline before going to call unuse_pte. |
872 | */ | 872 | */ |
873 | if (unlikely(pte_same(*pte, swp_pte))) { | 873 | if (unlikely(pte_same(*pte, swp_pte))) { |
874 | pte_unmap(pte); | 874 | pte_unmap(pte); |
875 | ret = unuse_pte(vma, pmd, addr, entry, page); | 875 | ret = unuse_pte(vma, pmd, addr, entry, page); |
876 | if (ret) | 876 | if (ret) |
877 | goto out; | 877 | goto out; |
878 | pte = pte_offset_map(pmd, addr); | 878 | pte = pte_offset_map(pmd, addr); |
879 | } | 879 | } |
880 | } while (pte++, addr += PAGE_SIZE, addr != end); | 880 | } while (pte++, addr += PAGE_SIZE, addr != end); |
881 | pte_unmap(pte - 1); | 881 | pte_unmap(pte - 1); |
882 | out: | 882 | out: |
883 | return ret; | 883 | return ret; |
884 | } | 884 | } |
885 | 885 | ||
886 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 886 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
887 | unsigned long addr, unsigned long end, | 887 | unsigned long addr, unsigned long end, |
888 | swp_entry_t entry, struct page *page) | 888 | swp_entry_t entry, struct page *page) |
889 | { | 889 | { |
890 | pmd_t *pmd; | 890 | pmd_t *pmd; |
891 | unsigned long next; | 891 | unsigned long next; |
892 | int ret; | 892 | int ret; |
893 | 893 | ||
894 | pmd = pmd_offset(pud, addr); | 894 | pmd = pmd_offset(pud, addr); |
895 | do { | 895 | do { |
896 | next = pmd_addr_end(addr, end); | 896 | next = pmd_addr_end(addr, end); |
897 | if (pmd_none_or_clear_bad(pmd)) | 897 | if (pmd_none_or_clear_bad(pmd)) |
898 | continue; | 898 | continue; |
899 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 899 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
900 | if (ret) | 900 | if (ret) |
901 | return ret; | 901 | return ret; |
902 | } while (pmd++, addr = next, addr != end); | 902 | } while (pmd++, addr = next, addr != end); |
903 | return 0; | 903 | return 0; |
904 | } | 904 | } |
905 | 905 | ||
906 | static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 906 | static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
907 | unsigned long addr, unsigned long end, | 907 | unsigned long addr, unsigned long end, |
908 | swp_entry_t entry, struct page *page) | 908 | swp_entry_t entry, struct page *page) |
909 | { | 909 | { |
910 | pud_t *pud; | 910 | pud_t *pud; |
911 | unsigned long next; | 911 | unsigned long next; |
912 | int ret; | 912 | int ret; |
913 | 913 | ||
914 | pud = pud_offset(pgd, addr); | 914 | pud = pud_offset(pgd, addr); |
915 | do { | 915 | do { |
916 | next = pud_addr_end(addr, end); | 916 | next = pud_addr_end(addr, end); |
917 | if (pud_none_or_clear_bad(pud)) | 917 | if (pud_none_or_clear_bad(pud)) |
918 | continue; | 918 | continue; |
919 | ret = unuse_pmd_range(vma, pud, addr, next, entry, page); | 919 | ret = unuse_pmd_range(vma, pud, addr, next, entry, page); |
920 | if (ret) | 920 | if (ret) |
921 | return ret; | 921 | return ret; |
922 | } while (pud++, addr = next, addr != end); | 922 | } while (pud++, addr = next, addr != end); |
923 | return 0; | 923 | return 0; |
924 | } | 924 | } |
925 | 925 | ||
926 | static int unuse_vma(struct vm_area_struct *vma, | 926 | static int unuse_vma(struct vm_area_struct *vma, |
927 | swp_entry_t entry, struct page *page) | 927 | swp_entry_t entry, struct page *page) |
928 | { | 928 | { |
929 | pgd_t *pgd; | 929 | pgd_t *pgd; |
930 | unsigned long addr, end, next; | 930 | unsigned long addr, end, next; |
931 | int ret; | 931 | int ret; |
932 | 932 | ||
933 | if (page->mapping) { | 933 | if (page->mapping) { |
934 | addr = page_address_in_vma(page, vma); | 934 | addr = page_address_in_vma(page, vma); |
935 | if (addr == -EFAULT) | 935 | if (addr == -EFAULT) |
936 | return 0; | 936 | return 0; |
937 | else | 937 | else |
938 | end = addr + PAGE_SIZE; | 938 | end = addr + PAGE_SIZE; |
939 | } else { | 939 | } else { |
940 | addr = vma->vm_start; | 940 | addr = vma->vm_start; |
941 | end = vma->vm_end; | 941 | end = vma->vm_end; |
942 | } | 942 | } |
943 | 943 | ||
944 | pgd = pgd_offset(vma->vm_mm, addr); | 944 | pgd = pgd_offset(vma->vm_mm, addr); |
945 | do { | 945 | do { |
946 | next = pgd_addr_end(addr, end); | 946 | next = pgd_addr_end(addr, end); |
947 | if (pgd_none_or_clear_bad(pgd)) | 947 | if (pgd_none_or_clear_bad(pgd)) |
948 | continue; | 948 | continue; |
949 | ret = unuse_pud_range(vma, pgd, addr, next, entry, page); | 949 | ret = unuse_pud_range(vma, pgd, addr, next, entry, page); |
950 | if (ret) | 950 | if (ret) |
951 | return ret; | 951 | return ret; |
952 | } while (pgd++, addr = next, addr != end); | 952 | } while (pgd++, addr = next, addr != end); |
953 | return 0; | 953 | return 0; |
954 | } | 954 | } |
955 | 955 | ||
956 | static int unuse_mm(struct mm_struct *mm, | 956 | static int unuse_mm(struct mm_struct *mm, |
957 | swp_entry_t entry, struct page *page) | 957 | swp_entry_t entry, struct page *page) |
958 | { | 958 | { |
959 | struct vm_area_struct *vma; | 959 | struct vm_area_struct *vma; |
960 | int ret = 0; | 960 | int ret = 0; |
961 | 961 | ||
962 | if (!down_read_trylock(&mm->mmap_sem)) { | 962 | if (!down_read_trylock(&mm->mmap_sem)) { |
963 | /* | 963 | /* |
964 | * Activate page so shrink_inactive_list is unlikely to unmap | 964 | * Activate page so shrink_inactive_list is unlikely to unmap |
965 | * its ptes while lock is dropped, so swapoff can make progress. | 965 | * its ptes while lock is dropped, so swapoff can make progress. |
966 | */ | 966 | */ |
967 | activate_page(page); | 967 | activate_page(page); |
968 | unlock_page(page); | 968 | unlock_page(page); |
969 | down_read(&mm->mmap_sem); | 969 | down_read(&mm->mmap_sem); |
970 | lock_page(page); | 970 | lock_page(page); |
971 | } | 971 | } |
972 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 972 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
973 | if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) | 973 | if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) |
974 | break; | 974 | break; |
975 | } | 975 | } |
976 | up_read(&mm->mmap_sem); | 976 | up_read(&mm->mmap_sem); |
977 | return (ret < 0) ? ret : 0; | 977 | return (ret < 0) ? ret : 0; |
978 | } | 978 | } |
979 | 979 | ||
980 | /* | 980 | /* |
981 | * Scan swap_map from current position to next entry still in use. | 981 | * Scan swap_map from current position to next entry still in use. |
982 | * Recycle to start on reaching the end, returning 0 when empty. | 982 | * Recycle to start on reaching the end, returning 0 when empty. |
983 | */ | 983 | */ |
984 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 984 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
985 | unsigned int prev) | 985 | unsigned int prev) |
986 | { | 986 | { |
987 | unsigned int max = si->max; | 987 | unsigned int max = si->max; |
988 | unsigned int i = prev; | 988 | unsigned int i = prev; |
989 | int count; | 989 | int count; |
990 | 990 | ||
991 | /* | 991 | /* |
992 | * No need for swap_lock here: we're just looking | 992 | * No need for swap_lock here: we're just looking |
993 | * for whether an entry is in use, not modifying it; false | 993 | * for whether an entry is in use, not modifying it; false |
994 | * hits are okay, and sys_swapoff() has already prevented new | 994 | * hits are okay, and sys_swapoff() has already prevented new |
995 | * allocations from this area (while holding swap_lock). | 995 | * allocations from this area (while holding swap_lock). |
996 | */ | 996 | */ |
997 | for (;;) { | 997 | for (;;) { |
998 | if (++i >= max) { | 998 | if (++i >= max) { |
999 | if (!prev) { | 999 | if (!prev) { |
1000 | i = 0; | 1000 | i = 0; |
1001 | break; | 1001 | break; |
1002 | } | 1002 | } |
1003 | /* | 1003 | /* |
1004 | * No entries in use at top of swap_map, | 1004 | * No entries in use at top of swap_map, |
1005 | * loop back to start and recheck there. | 1005 | * loop back to start and recheck there. |
1006 | */ | 1006 | */ |
1007 | max = prev + 1; | 1007 | max = prev + 1; |
1008 | prev = 0; | 1008 | prev = 0; |
1009 | i = 1; | 1009 | i = 1; |
1010 | } | 1010 | } |
1011 | count = si->swap_map[i]; | 1011 | count = si->swap_map[i]; |
1012 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1012 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1013 | break; | 1013 | break; |
1014 | } | 1014 | } |
1015 | return i; | 1015 | return i; |
1016 | } | 1016 | } |
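
The wrap-around logic is easier to see stripped of the swap_map encoding; a
standalone sketch with the same traversal order (slot 0 holds the swap header
and is never returned; the real code additionally skips SWAP_MAP_BAD slots):

	static unsigned int next_in_use_sketch(const unsigned short *map,
					       unsigned int max, unsigned int prev)
	{
		unsigned int i;

		for (i = prev + 1; i < max; i++)	/* prev + 1 .. top */
			if (map[i])
				return i;
		for (i = 1; i <= prev; i++)		/* wrap: 1 .. prev */
			if (map[i])
				return i;
		return 0;				/* nothing in use */
	}
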
1017 | 1017 | ||
1018 | /* | 1018 | /* |
1019 | * We completely avoid races by reading each swap page in advance, | 1019 | * We completely avoid races by reading each swap page in advance, |
1020 | * and then search for the process using it. All the necessary | 1020 | * and then search for the process using it. All the necessary |
1021 | * page table adjustments can then be made atomically. | 1021 | * page table adjustments can then be made atomically. |
1022 | */ | 1022 | */ |
1023 | static int try_to_unuse(unsigned int type) | 1023 | static int try_to_unuse(unsigned int type) |
1024 | { | 1024 | { |
1025 | struct swap_info_struct *si = &swap_info[type]; | 1025 | struct swap_info_struct *si = &swap_info[type]; |
1026 | struct mm_struct *start_mm; | 1026 | struct mm_struct *start_mm; |
1027 | unsigned short *swap_map; | 1027 | unsigned short *swap_map; |
1028 | unsigned short swcount; | 1028 | unsigned short swcount; |
1029 | struct page *page; | 1029 | struct page *page; |
1030 | swp_entry_t entry; | 1030 | swp_entry_t entry; |
1031 | unsigned int i = 0; | 1031 | unsigned int i = 0; |
1032 | int retval = 0; | 1032 | int retval = 0; |
1033 | int reset_overflow = 0; | 1033 | int reset_overflow = 0; |
1034 | int shmem; | 1034 | int shmem; |
1035 | 1035 | ||
1036 | /* | 1036 | /* |
1037 | * When searching mms for an entry, a good strategy is to | 1037 | * When searching mms for an entry, a good strategy is to |
1038 | * start at the first mm we freed the previous entry from | 1038 | * start at the first mm we freed the previous entry from |
1039 | * (though actually we don't notice whether we or coincidence | 1039 | * (though actually we don't notice whether we or coincidence |
1040 | * freed the entry). Initialize this start_mm with a hold. | 1040 | * freed the entry). Initialize this start_mm with a hold. |
1041 | * | 1041 | * |
1042 | * A simpler strategy would be to start at the last mm we | 1042 | * A simpler strategy would be to start at the last mm we |
1043 | * freed the previous entry from; but that would take less | 1043 | * freed the previous entry from; but that would take less |
1044 | * advantage of mmlist ordering, which clusters forked mms | 1044 | * advantage of mmlist ordering, which clusters forked mms |
1045 | * together, child after parent. If we race with dup_mmap(), we | 1045 | * together, child after parent. If we race with dup_mmap(), we |
1046 | * prefer to resolve parent before child, lest we miss entries | 1046 | * prefer to resolve parent before child, lest we miss entries |
1047 | * duplicated after we scanned child: using last mm would invert | 1047 | * duplicated after we scanned child: using last mm would invert |
1048 | * that. Though it's only a serious concern when an overflowed | 1048 | * that. Though it's only a serious concern when an overflowed |
1049 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. | 1049 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. |
1050 | */ | 1050 | */ |
1051 | start_mm = &init_mm; | 1051 | start_mm = &init_mm; |
1052 | atomic_inc(&init_mm.mm_users); | 1052 | atomic_inc(&init_mm.mm_users); |
1053 | 1053 | ||
1054 | /* | 1054 | /* |
1055 | * Keep on scanning until all entries have gone. Usually, | 1055 | * Keep on scanning until all entries have gone. Usually, |
1056 | * one pass through swap_map is enough, but not necessarily: | 1056 | * one pass through swap_map is enough, but not necessarily: |
1057 | * there are races when an instance of an entry might be missed. | 1057 | * there are races when an instance of an entry might be missed. |
1058 | */ | 1058 | */ |
1059 | while ((i = find_next_to_unuse(si, i)) != 0) { | 1059 | while ((i = find_next_to_unuse(si, i)) != 0) { |
1060 | if (signal_pending(current)) { | 1060 | if (signal_pending(current)) { |
1061 | retval = -EINTR; | 1061 | retval = -EINTR; |
1062 | break; | 1062 | break; |
1063 | } | 1063 | } |
1064 | 1064 | ||
1065 | /* | 1065 | /* |
1066 | * Get a page for the entry, using the existing swap | 1066 | * Get a page for the entry, using the existing swap |
1067 | * cache page if there is one. Otherwise, get a clean | 1067 | * cache page if there is one. Otherwise, get a clean |
1068 | * page and read the swap into it. | 1068 | * page and read the swap into it. |
1069 | */ | 1069 | */ |
1070 | swap_map = &si->swap_map[i]; | 1070 | swap_map = &si->swap_map[i]; |
1071 | entry = swp_entry(type, i); | 1071 | entry = swp_entry(type, i); |
1072 | page = read_swap_cache_async(entry, | 1072 | page = read_swap_cache_async(entry, |
1073 | GFP_HIGHUSER_MOVABLE, NULL, 0); | 1073 | GFP_HIGHUSER_MOVABLE, NULL, 0); |
1074 | if (!page) { | 1074 | if (!page) { |
1075 | /* | 1075 | /* |
1076 | * Either swap_duplicate() failed because entry | 1076 | * Either swap_duplicate() failed because entry |
1077 | * has been freed independently, and will not be | 1077 | * has been freed independently, and will not be |
1078 | * reused since sys_swapoff() already disabled | 1078 | * reused since sys_swapoff() already disabled |
1079 | * allocation from here, or alloc_page() failed. | 1079 | * allocation from here, or alloc_page() failed. |
1080 | */ | 1080 | */ |
1081 | if (!*swap_map) | 1081 | if (!*swap_map) |
1082 | continue; | 1082 | continue; |
1083 | retval = -ENOMEM; | 1083 | retval = -ENOMEM; |
1084 | break; | 1084 | break; |
1085 | } | 1085 | } |
1086 | 1086 | ||
1087 | /* | 1087 | /* |
1088 | * Don't hold on to start_mm if it looks like exiting. | 1088 | * Don't hold on to start_mm if it looks like exiting. |
1089 | */ | 1089 | */ |
1090 | if (atomic_read(&start_mm->mm_users) == 1) { | 1090 | if (atomic_read(&start_mm->mm_users) == 1) { |
1091 | mmput(start_mm); | 1091 | mmput(start_mm); |
1092 | start_mm = &init_mm; | 1092 | start_mm = &init_mm; |
1093 | atomic_inc(&init_mm.mm_users); | 1093 | atomic_inc(&init_mm.mm_users); |
1094 | } | 1094 | } |
1095 | 1095 | ||
1096 | /* | 1096 | /* |
1097 | * Wait for and lock page. When do_swap_page races with | 1097 | * Wait for and lock page. When do_swap_page races with |
1098 | * try_to_unuse, do_swap_page can handle the fault much | 1098 | * try_to_unuse, do_swap_page can handle the fault much |
1099 | * faster than try_to_unuse can locate the entry. This | 1099 | * faster than try_to_unuse can locate the entry. This |
1100 | * apparently redundant "wait_on_page_locked" lets try_to_unuse | 1100 | * apparently redundant "wait_on_page_locked" lets try_to_unuse |
1101 | * defer to do_swap_page in such a case - in some tests, | 1101 | * defer to do_swap_page in such a case - in some tests, |
1102 | * do_swap_page and try_to_unuse repeatedly compete. | 1102 | * do_swap_page and try_to_unuse repeatedly compete. |
1103 | */ | 1103 | */ |
1104 | wait_on_page_locked(page); | 1104 | wait_on_page_locked(page); |
1105 | wait_on_page_writeback(page); | 1105 | wait_on_page_writeback(page); |
1106 | lock_page(page); | 1106 | lock_page(page); |
1107 | wait_on_page_writeback(page); | 1107 | wait_on_page_writeback(page); |
1108 | 1108 | ||
1109 | /* | 1109 | /* |
1110 | * Remove all references to entry. | 1110 | * Remove all references to entry. |
1111 | * Whenever we reach init_mm, there's no address space | 1111 | * Whenever we reach init_mm, there's no address space |
1112 | * to search, but use it as a reminder to search shmem. | 1112 | * to search, but use it as a reminder to search shmem. |
1113 | */ | 1113 | */ |
1114 | shmem = 0; | 1114 | shmem = 0; |
1115 | swcount = *swap_map; | 1115 | swcount = *swap_map; |
1116 | if (swap_count(swcount)) { | 1116 | if (swap_count(swcount)) { |
1117 | if (start_mm == &init_mm) | 1117 | if (start_mm == &init_mm) |
1118 | shmem = shmem_unuse(entry, page); | 1118 | shmem = shmem_unuse(entry, page); |
1119 | else | 1119 | else |
1120 | retval = unuse_mm(start_mm, entry, page); | 1120 | retval = unuse_mm(start_mm, entry, page); |
1121 | } | 1121 | } |
1122 | if (swap_count(*swap_map)) { | 1122 | if (swap_count(*swap_map)) { |
1123 | int set_start_mm = (*swap_map >= swcount); | 1123 | int set_start_mm = (*swap_map >= swcount); |
1124 | struct list_head *p = &start_mm->mmlist; | 1124 | struct list_head *p = &start_mm->mmlist; |
1125 | struct mm_struct *new_start_mm = start_mm; | 1125 | struct mm_struct *new_start_mm = start_mm; |
1126 | struct mm_struct *prev_mm = start_mm; | 1126 | struct mm_struct *prev_mm = start_mm; |
1127 | struct mm_struct *mm; | 1127 | struct mm_struct *mm; |
1128 | 1128 | ||
1129 | atomic_inc(&new_start_mm->mm_users); | 1129 | atomic_inc(&new_start_mm->mm_users); |
1130 | atomic_inc(&prev_mm->mm_users); | 1130 | atomic_inc(&prev_mm->mm_users); |
1131 | spin_lock(&mmlist_lock); | 1131 | spin_lock(&mmlist_lock); |
1132 | while (swap_count(*swap_map) && !retval && !shmem && | 1132 | while (swap_count(*swap_map) && !retval && !shmem && |
1133 | (p = p->next) != &start_mm->mmlist) { | 1133 | (p = p->next) != &start_mm->mmlist) { |
1134 | mm = list_entry(p, struct mm_struct, mmlist); | 1134 | mm = list_entry(p, struct mm_struct, mmlist); |
1135 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1135 | if (!atomic_inc_not_zero(&mm->mm_users)) |
1136 | continue; | 1136 | continue; |
1137 | spin_unlock(&mmlist_lock); | 1137 | spin_unlock(&mmlist_lock); |
1138 | mmput(prev_mm); | 1138 | mmput(prev_mm); |
1139 | prev_mm = mm; | 1139 | prev_mm = mm; |
1140 | 1140 | ||
1141 | cond_resched(); | 1141 | cond_resched(); |
1142 | 1142 | ||
1143 | swcount = *swap_map; | 1143 | swcount = *swap_map; |
1144 | if (!swap_count(swcount)) /* any usage ? */ | 1144 | if (!swap_count(swcount)) /* any usage ? */ |
1145 | ; | 1145 | ; |
1146 | else if (mm == &init_mm) { | 1146 | else if (mm == &init_mm) { |
1147 | set_start_mm = 1; | 1147 | set_start_mm = 1; |
1148 | shmem = shmem_unuse(entry, page); | 1148 | shmem = shmem_unuse(entry, page); |
1149 | } else | 1149 | } else |
1150 | retval = unuse_mm(mm, entry, page); | 1150 | retval = unuse_mm(mm, entry, page); |
1151 | 1151 | ||
1152 | if (set_start_mm && | 1152 | if (set_start_mm && |
1153 | swap_count(*swap_map) < swcount) { | 1153 | swap_count(*swap_map) < swcount) { |
1154 | mmput(new_start_mm); | 1154 | mmput(new_start_mm); |
1155 | atomic_inc(&mm->mm_users); | 1155 | atomic_inc(&mm->mm_users); |
1156 | new_start_mm = mm; | 1156 | new_start_mm = mm; |
1157 | set_start_mm = 0; | 1157 | set_start_mm = 0; |
1158 | } | 1158 | } |
1159 | spin_lock(&mmlist_lock); | 1159 | spin_lock(&mmlist_lock); |
1160 | } | 1160 | } |
1161 | spin_unlock(&mmlist_lock); | 1161 | spin_unlock(&mmlist_lock); |
1162 | mmput(prev_mm); | 1162 | mmput(prev_mm); |
1163 | mmput(start_mm); | 1163 | mmput(start_mm); |
1164 | start_mm = new_start_mm; | 1164 | start_mm = new_start_mm; |
1165 | } | 1165 | } |
1166 | if (shmem) { | 1166 | if (shmem) { |
1167 | /* page has already been unlocked and released */ | 1167 | /* page has already been unlocked and released */ |
1168 | if (shmem > 0) | 1168 | if (shmem > 0) |
1169 | continue; | 1169 | continue; |
1170 | retval = shmem; | 1170 | retval = shmem; |
1171 | break; | 1171 | break; |
1172 | } | 1172 | } |
1173 | if (retval) { | 1173 | if (retval) { |
1174 | unlock_page(page); | 1174 | unlock_page(page); |
1175 | page_cache_release(page); | 1175 | page_cache_release(page); |
1176 | break; | 1176 | break; |
1177 | } | 1177 | } |
1178 | 1178 | ||
1179 | /* | 1179 | /* |
1180 | * How could the swap count reach 0x7ffe? | 1180 | * How could the swap count reach 0x7ffe? |
1181 | * There's no way to repeat a swap page within an mm | 1181 | * There's no way to repeat a swap page within an mm |
1182 | * (except in shmem, where it's the shared object which takes | 1182 | * (except in shmem, where it's the shared object which takes |
1183 | * the reference count). | 1183 | * the reference count). |
1184 | * We believe SWAP_MAP_MAX cannot occur. (If it could, an unsigned | 1184 | * We believe SWAP_MAP_MAX cannot occur. (If it could, an unsigned |
1185 | * short would be too small....) | 1185 | * short would be too small....) |
1186 | * If that's wrong, then we should worry more about | 1186 | * If that's wrong, then we should worry more about |
1187 | * exit_mmap() and do_munmap() cases described above: | 1187 | * exit_mmap() and do_munmap() cases described above: |
1188 | * we might be resetting SWAP_MAP_MAX too early here. | 1188 | * we might be resetting SWAP_MAP_MAX too early here. |
1189 | * We know "Undead"s can happen, they're okay, so don't | 1189 | * We know "Undead"s can happen, they're okay, so don't |
1190 | * report them; but do report if we reset SWAP_MAP_MAX. | 1190 | * report them; but do report if we reset SWAP_MAP_MAX. |
1191 | */ | 1191 | */ |
1192 | /* We might release the lock_page() in unuse_mm(). */ | 1192 | /* We might release the lock_page() in unuse_mm(). */ |
1193 | if (!PageSwapCache(page) || page_private(page) != entry.val) | 1193 | if (!PageSwapCache(page) || page_private(page) != entry.val) |
1194 | goto retry; | 1194 | goto retry; |
1195 | 1195 | ||
1196 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | 1196 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { |
1197 | spin_lock(&swap_lock); | 1197 | spin_lock(&swap_lock); |
1198 | *swap_map = encode_swapmap(0, true); | 1198 | *swap_map = encode_swapmap(0, true); |
1199 | spin_unlock(&swap_lock); | 1199 | spin_unlock(&swap_lock); |
1200 | reset_overflow = 1; | 1200 | reset_overflow = 1; |
1201 | } | 1201 | } |
1202 | 1202 | ||
1203 | /* | 1203 | /* |
1204 | * If a reference remains (rare), we would like to leave | 1204 | * If a reference remains (rare), we would like to leave |
1205 | * the page in the swap cache; but try_to_unmap could | 1205 | * the page in the swap cache; but try_to_unmap could |
1206 | * then re-duplicate the entry once we drop page lock, | 1206 | * then re-duplicate the entry once we drop page lock, |
1207 | * so we might loop indefinitely; also, that page could | 1207 | * so we might loop indefinitely; also, that page could |
1208 | * not be swapped out to other storage meanwhile. So: | 1208 | * not be swapped out to other storage meanwhile. So: |
1209 | * delete from cache even if there's another reference, | 1209 | * delete from cache even if there's another reference, |
1210 | * after ensuring that the data has been saved to disk - | 1210 | * after ensuring that the data has been saved to disk - |
1211 | * since if the reference remains (rarer), it will be | 1211 | * since if the reference remains (rarer), it will be |
1212 | * read from disk into another page. Splitting into two | 1212 | * read from disk into another page. Splitting into two |
1213 | * pages would be incorrect if swap supported "shared | 1213 | * pages would be incorrect if swap supported "shared |
1214 | * private" pages, but they are handled by tmpfs files. | 1214 | * private" pages, but they are handled by tmpfs files. |
1215 | */ | 1215 | */ |
1216 | if (swap_count(*swap_map) && | 1216 | if (swap_count(*swap_map) && |
1217 | PageDirty(page) && PageSwapCache(page)) { | 1217 | PageDirty(page) && PageSwapCache(page)) { |
1218 | struct writeback_control wbc = { | 1218 | struct writeback_control wbc = { |
1219 | .sync_mode = WB_SYNC_NONE, | 1219 | .sync_mode = WB_SYNC_NONE, |
1220 | }; | 1220 | }; |
1221 | 1221 | ||
1222 | swap_writepage(page, &wbc); | 1222 | swap_writepage(page, &wbc); |
1223 | lock_page(page); | 1223 | lock_page(page); |
1224 | wait_on_page_writeback(page); | 1224 | wait_on_page_writeback(page); |
1225 | } | 1225 | } |
1226 | 1226 | ||
1227 | /* | 1227 | /* |
1228 | * It is conceivable that a racing task removed this page from | 1228 | * It is conceivable that a racing task removed this page from |
1229 | * swap cache just before we acquired the page lock at the top, | 1229 | * swap cache just before we acquired the page lock at the top, |
1230 | * or while we dropped it in unuse_mm(). The page might even | 1230 | * or while we dropped it in unuse_mm(). The page might even |
1231 | * be back in swap cache on another swap area: that we must not | 1231 | * be back in swap cache on another swap area: that we must not |
1232 | * delete, since it may not have been written out to swap yet. | 1232 | * delete, since it may not have been written out to swap yet. |
1233 | */ | 1233 | */ |
1234 | if (PageSwapCache(page) && | 1234 | if (PageSwapCache(page) && |
1235 | likely(page_private(page) == entry.val)) | 1235 | likely(page_private(page) == entry.val)) |
1236 | delete_from_swap_cache(page); | 1236 | delete_from_swap_cache(page); |
1237 | 1237 | ||
1238 | /* | 1238 | /* |
1239 | * So that we could skip searching mms once the swap count went | 1239 | * So that we could skip searching mms once the swap count went |
1240 | * to 1, we did not mark any present ptes as dirty: we must | 1240 | * to 1, we did not mark any present ptes as dirty: we must |
1241 | * mark the page dirty so shrink_page_list will preserve it. | 1241 | * mark the page dirty so shrink_page_list will preserve it. |
1242 | */ | 1242 | */ |
1243 | SetPageDirty(page); | 1243 | SetPageDirty(page); |
1244 | retry: | 1244 | retry: |
1245 | unlock_page(page); | 1245 | unlock_page(page); |
1246 | page_cache_release(page); | 1246 | page_cache_release(page); |
1247 | 1247 | ||
1248 | /* | 1248 | /* |
1249 | * Make sure that we aren't completely killing | 1249 | * Make sure that we aren't completely killing |
1250 | * interactive performance. | 1250 | * interactive performance. |
1251 | */ | 1251 | */ |
1252 | cond_resched(); | 1252 | cond_resched(); |
1253 | } | 1253 | } |
1254 | 1254 | ||
1255 | mmput(start_mm); | 1255 | mmput(start_mm); |
1256 | if (reset_overflow) { | 1256 | if (reset_overflow) { |
1257 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); | 1257 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); |
1258 | swap_overflow = 0; | 1258 | swap_overflow = 0; |
1259 | } | 1259 | } |
1260 | return retval; | 1260 | return retval; |
1261 | } | 1261 | } |
1262 | 1262 | ||
1263 | /* | 1263 | /* |
1264 | * After a successful try_to_unuse, if no swap is now in use, we know | 1264 | * After a successful try_to_unuse, if no swap is now in use, we know |
1265 | * we can empty the mmlist. swap_lock must be held on entry and exit. | 1265 | * we can empty the mmlist. swap_lock must be held on entry and exit. |
1266 | * Note that mmlist_lock nests inside swap_lock, and an mm must be | 1266 | * Note that mmlist_lock nests inside swap_lock, and an mm must be |
1267 | * added to the mmlist just after page_duplicate - before would be racy. | 1267 | * added to the mmlist just after page_duplicate - before would be racy. |
1268 | */ | 1268 | */ |
1269 | static void drain_mmlist(void) | 1269 | static void drain_mmlist(void) |
1270 | { | 1270 | { |
1271 | struct list_head *p, *next; | 1271 | struct list_head *p, *next; |
1272 | unsigned int i; | 1272 | unsigned int i; |
1273 | 1273 | ||
1274 | for (i = 0; i < nr_swapfiles; i++) | 1274 | for (i = 0; i < nr_swapfiles; i++) |
1275 | if (swap_info[i].inuse_pages) | 1275 | if (swap_info[i].inuse_pages) |
1276 | return; | 1276 | return; |
1277 | spin_lock(&mmlist_lock); | 1277 | spin_lock(&mmlist_lock); |
1278 | list_for_each_safe(p, next, &init_mm.mmlist) | 1278 | list_for_each_safe(p, next, &init_mm.mmlist) |
1279 | list_del_init(p); | 1279 | list_del_init(p); |
1280 | spin_unlock(&mmlist_lock); | 1280 | spin_unlock(&mmlist_lock); |
1281 | } | 1281 | } |
1282 | 1282 | ||
1283 | /* | 1283 | /* |
1284 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which | 1284 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which |
1285 | * corresponds to page offset `offset'. | 1285 | * corresponds to page offset `offset'. |
1286 | */ | 1286 | */ |
1287 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | 1287 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) |
1288 | { | 1288 | { |
1289 | struct swap_extent *se = sis->curr_swap_extent; | 1289 | struct swap_extent *se = sis->curr_swap_extent; |
1290 | struct swap_extent *start_se = se; | 1290 | struct swap_extent *start_se = se; |
1291 | 1291 | ||
1292 | for ( ; ; ) { | 1292 | for ( ; ; ) { |
1293 | struct list_head *lh; | 1293 | struct list_head *lh; |
1294 | 1294 | ||
1295 | if (se->start_page <= offset && | 1295 | if (se->start_page <= offset && |
1296 | offset < (se->start_page + se->nr_pages)) { | 1296 | offset < (se->start_page + se->nr_pages)) { |
1297 | return se->start_block + (offset - se->start_page); | 1297 | return se->start_block + (offset - se->start_page); |
1298 | } | 1298 | } |
1299 | lh = se->list.next; | 1299 | lh = se->list.next; |
1300 | if (lh == &sis->extent_list) | 1300 | if (lh == &sis->extent_list) |
1301 | lh = lh->next; | 1301 | lh = lh->next; |
1302 | se = list_entry(lh, struct swap_extent, list); | 1302 | se = list_entry(lh, struct swap_extent, list); |
1303 | sis->curr_swap_extent = se; | 1303 | sis->curr_swap_extent = se; |
1304 | BUG_ON(se == start_se); /* It *must* be present */ | 1304 | BUG_ON(se == start_se); /* It *must* be present */ |
1305 | } | 1305 | } |
1306 | } | 1306 | } |
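
A worked example of the extent arithmetic above, with hypothetical numbers:

	/*
	 * An extent with start_page = 100, nr_pages = 50, start_block = 2000
	 * covers page offsets [100, 150) and maps offset 120 to disk block
	 * 2000 + (120 - 100) = 2020.
	 */
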
1307 | 1307 | ||
1308 | #ifdef CONFIG_HIBERNATION | 1308 | #ifdef CONFIG_HIBERNATION |
1309 | /* | 1309 | /* |
1310 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | 1310 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev |
1311 | * corresponding to given index in swap_info (swap type). | 1311 | * corresponding to given index in swap_info (swap type). |
1312 | */ | 1312 | */ |
1313 | sector_t swapdev_block(int swap_type, pgoff_t offset) | 1313 | sector_t swapdev_block(int swap_type, pgoff_t offset) |
1314 | { | 1314 | { |
1315 | struct swap_info_struct *sis; | 1315 | struct swap_info_struct *sis; |
1316 | 1316 | ||
1317 | if (swap_type >= nr_swapfiles) | 1317 | if (swap_type >= nr_swapfiles) |
1318 | return 0; | 1318 | return 0; |
1319 | 1319 | ||
1320 | sis = swap_info + swap_type; | 1320 | sis = swap_info + swap_type; |
1321 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | 1321 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; |
1322 | } | 1322 | } |
1323 | #endif /* CONFIG_HIBERNATION */ | 1323 | #endif /* CONFIG_HIBERNATION */ |
1324 | 1324 | ||
1325 | /* | 1325 | /* |
1326 | * Free all of a swapdev's extent information | 1326 | * Free all of a swapdev's extent information |
1327 | */ | 1327 | */ |
1328 | static void destroy_swap_extents(struct swap_info_struct *sis) | 1328 | static void destroy_swap_extents(struct swap_info_struct *sis) |
1329 | { | 1329 | { |
1330 | while (!list_empty(&sis->extent_list)) { | 1330 | while (!list_empty(&sis->extent_list)) { |
1331 | struct swap_extent *se; | 1331 | struct swap_extent *se; |
1332 | 1332 | ||
1333 | se = list_entry(sis->extent_list.next, | 1333 | se = list_entry(sis->extent_list.next, |
1334 | struct swap_extent, list); | 1334 | struct swap_extent, list); |
1335 | list_del(&se->list); | 1335 | list_del(&se->list); |
1336 | kfree(se); | 1336 | kfree(se); |
1337 | } | 1337 | } |
1338 | } | 1338 | } |
1339 | 1339 | ||
1340 | /* | 1340 | /* |
1341 | * Add a block range (and the corresponding page range) into this swapdev's | 1341 | * Add a block range (and the corresponding page range) into this swapdev's |
1342 | * extent list. The extent list is kept sorted in page order. | 1342 | * extent list. The extent list is kept sorted in page order. |
1343 | * | 1343 | * |
1344 | * This function rather assumes that it is called in ascending page order. | 1344 | * This function rather assumes that it is called in ascending page order. |
1345 | */ | 1345 | */ |
1346 | static int | 1346 | static int |
1347 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | 1347 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
1348 | unsigned long nr_pages, sector_t start_block) | 1348 | unsigned long nr_pages, sector_t start_block) |
1349 | { | 1349 | { |
1350 | struct swap_extent *se; | 1350 | struct swap_extent *se; |
1351 | struct swap_extent *new_se; | 1351 | struct swap_extent *new_se; |
1352 | struct list_head *lh; | 1352 | struct list_head *lh; |
1353 | 1353 | ||
1354 | lh = sis->extent_list.prev; /* The highest page extent */ | 1354 | lh = sis->extent_list.prev; /* The highest page extent */ |
1355 | if (lh != &sis->extent_list) { | 1355 | if (lh != &sis->extent_list) { |
1356 | se = list_entry(lh, struct swap_extent, list); | 1356 | se = list_entry(lh, struct swap_extent, list); |
1357 | BUG_ON(se->start_page + se->nr_pages != start_page); | 1357 | BUG_ON(se->start_page + se->nr_pages != start_page); |
1358 | if (se->start_block + se->nr_pages == start_block) { | 1358 | if (se->start_block + se->nr_pages == start_block) { |
1359 | /* Merge it */ | 1359 | /* Merge it */ |
1360 | se->nr_pages += nr_pages; | 1360 | se->nr_pages += nr_pages; |
1361 | return 0; | 1361 | return 0; |
1362 | } | 1362 | } |
1363 | } | 1363 | } |
1364 | 1364 | ||
1365 | /* | 1365 | /* |
1366 | * No merge. Insert a new extent, preserving ordering. | 1366 | * No merge. Insert a new extent, preserving ordering. |
1367 | */ | 1367 | */ |
1368 | new_se = kmalloc(sizeof(*se), GFP_KERNEL); | 1368 | new_se = kmalloc(sizeof(*se), GFP_KERNEL); |
1369 | if (new_se == NULL) | 1369 | if (new_se == NULL) |
1370 | return -ENOMEM; | 1370 | return -ENOMEM; |
1371 | new_se->start_page = start_page; | 1371 | new_se->start_page = start_page; |
1372 | new_se->nr_pages = nr_pages; | 1372 | new_se->nr_pages = nr_pages; |
1373 | new_se->start_block = start_block; | 1373 | new_se->start_block = start_block; |
1374 | 1374 | ||
1375 | list_add_tail(&new_se->list, &sis->extent_list); | 1375 | list_add_tail(&new_se->list, &sis->extent_list); |
1376 | return 1; | 1376 | return 1; |
1377 | } | 1377 | } |
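
The merge path above, traced with hypothetical numbers:

	/*
	 * add_swap_extent(sis, 0, 10, 100);    pages 0-9   -> blocks 100-109
	 * add_swap_extent(sis, 10, 5, 110);    pages 10-14 -> blocks 110-114
	 *
	 * The second call sees se->start_block + se->nr_pages == 110, which
	 * matches its start_block, so it grows the first extent to
	 * nr_pages = 15 and the list still holds a single extent.
	 */
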
1378 | 1378 | ||
1379 | /* | 1379 | /* |
1380 | * A `swap extent' is a simple thing which maps a contiguous range of pages | 1380 | * A `swap extent' is a simple thing which maps a contiguous range of pages |
1381 | * onto a contiguous range of disk blocks. An ordered list of swap extents | 1381 | * onto a contiguous range of disk blocks. An ordered list of swap extents |
1382 | * is built at swapon time and is then used at swap_writepage/swap_readpage | 1382 | * is built at swapon time and is then used at swap_writepage/swap_readpage |
1383 | * time for locating where on disk a page belongs. | 1383 | * time for locating where on disk a page belongs. |
1384 | * | 1384 | * |
1385 | * If the swapfile is an S_ISBLK block device, a single extent is installed. | 1385 | * If the swapfile is an S_ISBLK block device, a single extent is installed. |
1386 | * This is done so that the main operating code can treat S_ISBLK and S_ISREG | 1386 | * This is done so that the main operating code can treat S_ISBLK and S_ISREG |
1387 | * swap files identically. | 1387 | * swap files identically. |
1388 | * | 1388 | * |
1389 | * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap | 1389 | * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap |
1390 | * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK | 1390 | * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK |
1391 | * swapfiles are handled *identically* after swapon time. | 1391 | * swapfiles are handled *identically* after swapon time. |
1392 | * | 1392 | * |
1393 | * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks | 1393 | * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks |
1394 | * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If | 1394 | * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If |
1395 | * some stray blocks are found which do not fall within the PAGE_SIZE alignment | 1395 | * some stray blocks are found which do not fall within the PAGE_SIZE alignment |
1396 | * requirements, they are simply tossed out - we will never use those blocks | 1396 | * requirements, they are simply tossed out - we will never use those blocks |
1397 | * for swapping. | 1397 | * for swapping. |
1398 | * | 1398 | * |
1399 | * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This | 1399 | * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This |
1400 | * prevents root from shooting her foot off by ftruncating an in-use swapfile, | 1400 | * prevents root from shooting her foot off by ftruncating an in-use swapfile, |
1401 | * which will scribble on the fs. | 1401 | * which will scribble on the fs. |
1402 | * | 1402 | * |
1403 | * The amount of disk space which a single swap extent represents varies. | 1403 | * The amount of disk space which a single swap extent represents varies. |
1404 | * Typically it is in the 1-4 megabyte range. So we can have hundreds of | 1404 | * Typically it is in the 1-4 megabyte range. So we can have hundreds of |
1405 | * extents in the list. To avoid much list walking, we cache the previous | 1405 | * extents in the list. To avoid much list walking, we cache the previous |
1406 | * search location in `curr_swap_extent', and start new searches from there. | 1406 | * search location in `curr_swap_extent', and start new searches from there. |
1407 | * This is extremely effective. The average number of iterations in | 1407 | * This is extremely effective. The average number of iterations in |
1408 | * map_swap_page() has been measured at about 0.3 per page. - akpm. | 1408 | * map_swap_page() has been measured at about 0.3 per page. - akpm. |
1409 | */ | 1409 | */ |
1410 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | 1410 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) |
1411 | { | 1411 | { |
1412 | struct inode *inode; | 1412 | struct inode *inode; |
1413 | unsigned blocks_per_page; | 1413 | unsigned blocks_per_page; |
1414 | unsigned long page_no; | 1414 | unsigned long page_no; |
1415 | unsigned blkbits; | 1415 | unsigned blkbits; |
1416 | sector_t probe_block; | 1416 | sector_t probe_block; |
1417 | sector_t last_block; | 1417 | sector_t last_block; |
1418 | sector_t lowest_block = -1; | 1418 | sector_t lowest_block = -1; |
1419 | sector_t highest_block = 0; | 1419 | sector_t highest_block = 0; |
1420 | int nr_extents = 0; | 1420 | int nr_extents = 0; |
1421 | int ret; | 1421 | int ret; |
1422 | 1422 | ||
1423 | inode = sis->swap_file->f_mapping->host; | 1423 | inode = sis->swap_file->f_mapping->host; |
1424 | if (S_ISBLK(inode->i_mode)) { | 1424 | if (S_ISBLK(inode->i_mode)) { |
1425 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1425 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1426 | *span = sis->pages; | 1426 | *span = sis->pages; |
1427 | goto done; | 1427 | goto done; |
1428 | } | 1428 | } |
1429 | 1429 | ||
1430 | blkbits = inode->i_blkbits; | 1430 | blkbits = inode->i_blkbits; |
1431 | blocks_per_page = PAGE_SIZE >> blkbits; | 1431 | blocks_per_page = PAGE_SIZE >> blkbits; |
1432 | 1432 | ||
1433 | /* | 1433 | /* |
1434 | * Map all the blocks into the extent list. This code doesn't try | 1434 | * Map all the blocks into the extent list. This code doesn't try |
1435 | * to be very smart. | 1435 | * to be very smart. |
1436 | */ | 1436 | */ |
1437 | probe_block = 0; | 1437 | probe_block = 0; |
1438 | page_no = 0; | 1438 | page_no = 0; |
1439 | last_block = i_size_read(inode) >> blkbits; | 1439 | last_block = i_size_read(inode) >> blkbits; |
1440 | while ((probe_block + blocks_per_page) <= last_block && | 1440 | while ((probe_block + blocks_per_page) <= last_block && |
1441 | page_no < sis->max) { | 1441 | page_no < sis->max) { |
1442 | unsigned block_in_page; | 1442 | unsigned block_in_page; |
1443 | sector_t first_block; | 1443 | sector_t first_block; |
1444 | 1444 | ||
1445 | first_block = bmap(inode, probe_block); | 1445 | first_block = bmap(inode, probe_block); |
1446 | if (first_block == 0) | 1446 | if (first_block == 0) |
1447 | goto bad_bmap; | 1447 | goto bad_bmap; |
1448 | 1448 | ||
1449 | /* | 1449 | /* |
1450 | * It must be PAGE_SIZE aligned on-disk | 1450 | * It must be PAGE_SIZE aligned on-disk |
1451 | */ | 1451 | */ |
1452 | if (first_block & (blocks_per_page - 1)) { | 1452 | if (first_block & (blocks_per_page - 1)) { |
1453 | probe_block++; | 1453 | probe_block++; |
1454 | goto reprobe; | 1454 | goto reprobe; |
1455 | } | 1455 | } |
1456 | 1456 | ||
1457 | for (block_in_page = 1; block_in_page < blocks_per_page; | 1457 | for (block_in_page = 1; block_in_page < blocks_per_page; |
1458 | block_in_page++) { | 1458 | block_in_page++) { |
1459 | sector_t block; | 1459 | sector_t block; |
1460 | 1460 | ||
1461 | block = bmap(inode, probe_block + block_in_page); | 1461 | block = bmap(inode, probe_block + block_in_page); |
1462 | if (block == 0) | 1462 | if (block == 0) |
1463 | goto bad_bmap; | 1463 | goto bad_bmap; |
1464 | if (block != first_block + block_in_page) { | 1464 | if (block != first_block + block_in_page) { |
1465 | /* Discontiguity */ | 1465 | /* Discontiguity */ |
1466 | probe_block++; | 1466 | probe_block++; |
1467 | goto reprobe; | 1467 | goto reprobe; |
1468 | } | 1468 | } |
1469 | } | 1469 | } |
1470 | 1470 | ||
1471 | first_block >>= (PAGE_SHIFT - blkbits); | 1471 | first_block >>= (PAGE_SHIFT - blkbits); |
1472 | if (page_no) { /* exclude the header page */ | 1472 | if (page_no) { /* exclude the header page */ |
1473 | if (first_block < lowest_block) | 1473 | if (first_block < lowest_block) |
1474 | lowest_block = first_block; | 1474 | lowest_block = first_block; |
1475 | if (first_block > highest_block) | 1475 | if (first_block > highest_block) |
1476 | highest_block = first_block; | 1476 | highest_block = first_block; |
1477 | } | 1477 | } |
1478 | 1478 | ||
1479 | /* | 1479 | /* |
1480 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | 1480 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks |
1481 | */ | 1481 | */ |
1482 | ret = add_swap_extent(sis, page_no, 1, first_block); | 1482 | ret = add_swap_extent(sis, page_no, 1, first_block); |
1483 | if (ret < 0) | 1483 | if (ret < 0) |
1484 | goto out; | 1484 | goto out; |
1485 | nr_extents += ret; | 1485 | nr_extents += ret; |
1486 | page_no++; | 1486 | page_no++; |
1487 | probe_block += blocks_per_page; | 1487 | probe_block += blocks_per_page; |
1488 | reprobe: | 1488 | reprobe: |
1489 | continue; | 1489 | continue; |
1490 | } | 1490 | } |
1491 | ret = nr_extents; | 1491 | ret = nr_extents; |
1492 | *span = 1 + highest_block - lowest_block; | 1492 | *span = 1 + highest_block - lowest_block; |
1493 | if (page_no == 0) | 1493 | if (page_no == 0) |
1494 | page_no = 1; /* force Empty message */ | 1494 | page_no = 1; /* force Empty message */ |
1495 | sis->max = page_no; | 1495 | sis->max = page_no; |
1496 | sis->pages = page_no - 1; | 1496 | sis->pages = page_no - 1; |
1497 | sis->highest_bit = page_no - 1; | 1497 | sis->highest_bit = page_no - 1; |
1498 | done: | 1498 | done: |
1499 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, | 1499 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, |
1500 | struct swap_extent, list); | 1500 | struct swap_extent, list); |
1501 | goto out; | 1501 | goto out; |
1502 | bad_bmap: | 1502 | bad_bmap: |
1503 | printk(KERN_ERR "swapon: swapfile has holes\n"); | 1503 | printk(KERN_ERR "swapon: swapfile has holes\n"); |
1504 | ret = -EINVAL; | 1504 | ret = -EINVAL; |
1505 | out: | 1505 | out: |
1506 | return ret; | 1506 | return ret; |
1507 | } | 1507 | } |
1508 | 1508 | ||
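With extents typically spanning 1-4 MB each, hundreds can pile up per device; the curr_swap_extent cache set on the done: path is what keeps map_swap_page() near the quoted 0.3 iterations per lookup. A hedged sketch of that cached search (simplified types and names; the real walk lives in map_swap_page(), which iterates a circular list_head list):

struct extent {
	unsigned long start_page;
	unsigned long nr_pages;
	unsigned long start_block;
	struct extent *next;		/* circular, like a list_head ring */
};

/*
 * Illustrative sketch: map a swap page to its disk block, resuming
 * from the cached extent. Assumes, as the swap code does, that every
 * page the caller asks about is covered by some extent, so the walk
 * always terminates.
 */
static unsigned long page_to_block(struct extent **cached, unsigned long page)
{
	struct extent *se = *cached;

	for (;;) {
		if (page >= se->start_page &&
		    page - se->start_page < se->nr_pages) {
			*cached = se;	/* next lookup starts here */
			return se->start_block + (page - se->start_page);
		}
		se = se->next;
	}
}

The cache pays off because swap I/O is bursty: adjacent pages usually live in the same extent, so consecutive lookups rarely move.
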
1509 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1509 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1510 | { | 1510 | { |
1511 | struct swap_info_struct * p = NULL; | 1511 | struct swap_info_struct * p = NULL; |
1512 | unsigned short *swap_map; | 1512 | unsigned short *swap_map; |
1513 | struct file *swap_file, *victim; | 1513 | struct file *swap_file, *victim; |
1514 | struct address_space *mapping; | 1514 | struct address_space *mapping; |
1515 | struct inode *inode; | 1515 | struct inode *inode; |
1516 | char * pathname; | 1516 | char * pathname; |
1517 | int i, type, prev; | 1517 | int i, type, prev; |
1518 | int err; | 1518 | int err; |
1519 | 1519 | ||
1520 | if (!capable(CAP_SYS_ADMIN)) | 1520 | if (!capable(CAP_SYS_ADMIN)) |
1521 | return -EPERM; | 1521 | return -EPERM; |
1522 | 1522 | ||
1523 | pathname = getname(specialfile); | 1523 | pathname = getname(specialfile); |
1524 | err = PTR_ERR(pathname); | 1524 | err = PTR_ERR(pathname); |
1525 | if (IS_ERR(pathname)) | 1525 | if (IS_ERR(pathname)) |
1526 | goto out; | 1526 | goto out; |
1527 | 1527 | ||
1528 | victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); | 1528 | victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); |
1529 | putname(pathname); | 1529 | putname(pathname); |
1530 | err = PTR_ERR(victim); | 1530 | err = PTR_ERR(victim); |
1531 | if (IS_ERR(victim)) | 1531 | if (IS_ERR(victim)) |
1532 | goto out; | 1532 | goto out; |
1533 | 1533 | ||
1534 | mapping = victim->f_mapping; | 1534 | mapping = victim->f_mapping; |
1535 | prev = -1; | 1535 | prev = -1; |
1536 | spin_lock(&swap_lock); | 1536 | spin_lock(&swap_lock); |
1537 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1537 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { |
1538 | p = swap_info + type; | 1538 | p = swap_info + type; |
1539 | if (p->flags & SWP_WRITEOK) { | 1539 | if (p->flags & SWP_WRITEOK) { |
1540 | if (p->swap_file->f_mapping == mapping) | 1540 | if (p->swap_file->f_mapping == mapping) |
1541 | break; | 1541 | break; |
1542 | } | 1542 | } |
1543 | prev = type; | 1543 | prev = type; |
1544 | } | 1544 | } |
1545 | if (type < 0) { | 1545 | if (type < 0) { |
1546 | err = -EINVAL; | 1546 | err = -EINVAL; |
1547 | spin_unlock(&swap_lock); | 1547 | spin_unlock(&swap_lock); |
1548 | goto out_dput; | 1548 | goto out_dput; |
1549 | } | 1549 | } |
1550 | if (!security_vm_enough_memory(p->pages)) | 1550 | if (!security_vm_enough_memory(p->pages)) |
1551 | vm_unacct_memory(p->pages); | 1551 | vm_unacct_memory(p->pages); |
1552 | else { | 1552 | else { |
1553 | err = -ENOMEM; | 1553 | err = -ENOMEM; |
1554 | spin_unlock(&swap_lock); | 1554 | spin_unlock(&swap_lock); |
1555 | goto out_dput; | 1555 | goto out_dput; |
1556 | } | 1556 | } |
1557 | if (prev < 0) { | 1557 | if (prev < 0) { |
1558 | swap_list.head = p->next; | 1558 | swap_list.head = p->next; |
1559 | } else { | 1559 | } else { |
1560 | swap_info[prev].next = p->next; | 1560 | swap_info[prev].next = p->next; |
1561 | } | 1561 | } |
1562 | if (type == swap_list.next) { | 1562 | if (type == swap_list.next) { |
1563 | /* just pick something that's safe... */ | 1563 | /* just pick something that's safe... */ |
1564 | swap_list.next = swap_list.head; | 1564 | swap_list.next = swap_list.head; |
1565 | } | 1565 | } |
1566 | if (p->prio < 0) { | 1566 | if (p->prio < 0) { |
1567 | for (i = p->next; i >= 0; i = swap_info[i].next) | 1567 | for (i = p->next; i >= 0; i = swap_info[i].next) |
1568 | swap_info[i].prio = p->prio--; | 1568 | swap_info[i].prio = p->prio--; |
1569 | least_priority++; | 1569 | least_priority++; |
1570 | } | 1570 | } |
1571 | nr_swap_pages -= p->pages; | 1571 | nr_swap_pages -= p->pages; |
1572 | total_swap_pages -= p->pages; | 1572 | total_swap_pages -= p->pages; |
1573 | p->flags &= ~SWP_WRITEOK; | 1573 | p->flags &= ~SWP_WRITEOK; |
1574 | spin_unlock(&swap_lock); | 1574 | spin_unlock(&swap_lock); |
1575 | 1575 | ||
1576 | current->flags |= PF_SWAPOFF; | 1576 | current->flags |= PF_SWAPOFF; |
1577 | err = try_to_unuse(type); | 1577 | err = try_to_unuse(type); |
1578 | current->flags &= ~PF_SWAPOFF; | 1578 | current->flags &= ~PF_SWAPOFF; |
1579 | 1579 | ||
1580 | if (err) { | 1580 | if (err) { |
1581 | /* re-insert swap space back into swap_list */ | 1581 | /* re-insert swap space back into swap_list */ |
1582 | spin_lock(&swap_lock); | 1582 | spin_lock(&swap_lock); |
1583 | if (p->prio < 0) | 1583 | if (p->prio < 0) |
1584 | p->prio = --least_priority; | 1584 | p->prio = --least_priority; |
1585 | prev = -1; | 1585 | prev = -1; |
1586 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 1586 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { |
1587 | if (p->prio >= swap_info[i].prio) | 1587 | if (p->prio >= swap_info[i].prio) |
1588 | break; | 1588 | break; |
1589 | prev = i; | 1589 | prev = i; |
1590 | } | 1590 | } |
1591 | p->next = i; | 1591 | p->next = i; |
1592 | if (prev < 0) | 1592 | if (prev < 0) |
1593 | swap_list.head = swap_list.next = p - swap_info; | 1593 | swap_list.head = swap_list.next = p - swap_info; |
1594 | else | 1594 | else |
1595 | swap_info[prev].next = p - swap_info; | 1595 | swap_info[prev].next = p - swap_info; |
1596 | nr_swap_pages += p->pages; | 1596 | nr_swap_pages += p->pages; |
1597 | total_swap_pages += p->pages; | 1597 | total_swap_pages += p->pages; |
1598 | p->flags |= SWP_WRITEOK; | 1598 | p->flags |= SWP_WRITEOK; |
1599 | spin_unlock(&swap_lock); | 1599 | spin_unlock(&swap_lock); |
1600 | goto out_dput; | 1600 | goto out_dput; |
1601 | } | 1601 | } |
1602 | 1602 | ||
1603 | /* wait for any unplug function to finish */ | 1603 | /* wait for any unplug function to finish */ |
1604 | down_write(&swap_unplug_sem); | 1604 | down_write(&swap_unplug_sem); |
1605 | up_write(&swap_unplug_sem); | 1605 | up_write(&swap_unplug_sem); |
1606 | 1606 | ||
1607 | destroy_swap_extents(p); | 1607 | destroy_swap_extents(p); |
1608 | mutex_lock(&swapon_mutex); | 1608 | mutex_lock(&swapon_mutex); |
1609 | spin_lock(&swap_lock); | 1609 | spin_lock(&swap_lock); |
1610 | drain_mmlist(); | 1610 | drain_mmlist(); |
1611 | 1611 | ||
1612 | /* wait for anyone still in scan_swap_map */ | 1612 | /* wait for anyone still in scan_swap_map */ |
1613 | p->highest_bit = 0; /* cuts scans short */ | 1613 | p->highest_bit = 0; /* cuts scans short */ |
1614 | while (p->flags >= SWP_SCANNING) { | 1614 | while (p->flags >= SWP_SCANNING) { |
1615 | spin_unlock(&swap_lock); | 1615 | spin_unlock(&swap_lock); |
1616 | schedule_timeout_uninterruptible(1); | 1616 | schedule_timeout_uninterruptible(1); |
1617 | spin_lock(&swap_lock); | 1617 | spin_lock(&swap_lock); |
1618 | } | 1618 | } |
1619 | 1619 | ||
1620 | swap_file = p->swap_file; | 1620 | swap_file = p->swap_file; |
1621 | p->swap_file = NULL; | 1621 | p->swap_file = NULL; |
1622 | p->max = 0; | 1622 | p->max = 0; |
1623 | swap_map = p->swap_map; | 1623 | swap_map = p->swap_map; |
1624 | p->swap_map = NULL; | 1624 | p->swap_map = NULL; |
1625 | p->flags = 0; | 1625 | p->flags = 0; |
1626 | spin_unlock(&swap_lock); | 1626 | spin_unlock(&swap_lock); |
1627 | mutex_unlock(&swapon_mutex); | 1627 | mutex_unlock(&swapon_mutex); |
1628 | vfree(swap_map); | 1628 | vfree(swap_map); |
1629 | /* Destroy swap account information */ | 1629 | /* Destroy swap account information */ |
1630 | swap_cgroup_swapoff(type); | 1630 | swap_cgroup_swapoff(type); |
1631 | 1631 | ||
1632 | inode = mapping->host; | 1632 | inode = mapping->host; |
1633 | if (S_ISBLK(inode->i_mode)) { | 1633 | if (S_ISBLK(inode->i_mode)) { |
1634 | struct block_device *bdev = I_BDEV(inode); | 1634 | struct block_device *bdev = I_BDEV(inode); |
1635 | set_blocksize(bdev, p->old_block_size); | 1635 | set_blocksize(bdev, p->old_block_size); |
1636 | bd_release(bdev); | 1636 | bd_release(bdev); |
1637 | } else { | 1637 | } else { |
1638 | mutex_lock(&inode->i_mutex); | 1638 | mutex_lock(&inode->i_mutex); |
1639 | inode->i_flags &= ~S_SWAPFILE; | 1639 | inode->i_flags &= ~S_SWAPFILE; |
1640 | mutex_unlock(&inode->i_mutex); | 1640 | mutex_unlock(&inode->i_mutex); |
1641 | } | 1641 | } |
1642 | filp_close(swap_file, NULL); | 1642 | filp_close(swap_file, NULL); |
1643 | err = 0; | 1643 | err = 0; |
1644 | 1644 | ||
1645 | out_dput: | 1645 | out_dput: |
1646 | filp_close(victim, NULL); | 1646 | filp_close(victim, NULL); |
1647 | out: | 1647 | out: |
1648 | return err; | 1648 | return err; |
1649 | } | 1649 | } |
1650 | 1650 | ||
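When try_to_unuse() fails, the error path above threads the area back into swap_list at its priority position. Note that swap_list is index-linked rather than pointer-linked: head and next hold indices into the swap_info[] array, with -1 as the terminator. An illustrative standalone version of that ordered insert (swapon() further down runs the same loop):

#define NIL	(-1)

struct ent {
	int prio;
	int next;			/* index into tab[], NIL ends list */
};

/*
 * Illustrative sketch: insert tab[n] into the list rooted at *head,
 * keeping descending priority order. The >= test means a new entry
 * is placed in front of existing entries of equal priority, matching
 * the loops in swapoff() and swapon().
 */
static void prio_insert(struct ent tab[], int *head, int n)
{
	int i, prev = NIL;

	for (i = *head; i != NIL; i = tab[i].next) {
		if (tab[n].prio >= tab[i].prio)
			break;
		prev = i;
	}
	tab[n].next = i;
	if (prev == NIL)
		*head = n;
	else
		tab[prev].next = n;
}
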
1651 | #ifdef CONFIG_PROC_FS | 1651 | #ifdef CONFIG_PROC_FS |
1652 | /* iterator */ | 1652 | /* iterator */ |
1653 | static void *swap_start(struct seq_file *swap, loff_t *pos) | 1653 | static void *swap_start(struct seq_file *swap, loff_t *pos) |
1654 | { | 1654 | { |
1655 | struct swap_info_struct *ptr = swap_info; | 1655 | struct swap_info_struct *ptr = swap_info; |
1656 | int i; | 1656 | int i; |
1657 | loff_t l = *pos; | 1657 | loff_t l = *pos; |
1658 | 1658 | ||
1659 | mutex_lock(&swapon_mutex); | 1659 | mutex_lock(&swapon_mutex); |
1660 | 1660 | ||
1661 | if (!l) | 1661 | if (!l) |
1662 | return SEQ_START_TOKEN; | 1662 | return SEQ_START_TOKEN; |
1663 | 1663 | ||
1664 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1664 | for (i = 0; i < nr_swapfiles; i++, ptr++) { |
1665 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1665 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1666 | continue; | 1666 | continue; |
1667 | if (!--l) | 1667 | if (!--l) |
1668 | return ptr; | 1668 | return ptr; |
1669 | } | 1669 | } |
1670 | 1670 | ||
1671 | return NULL; | 1671 | return NULL; |
1672 | } | 1672 | } |
1673 | 1673 | ||
1674 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1674 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1675 | { | 1675 | { |
1676 | struct swap_info_struct *ptr; | 1676 | struct swap_info_struct *ptr; |
1677 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1677 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; |
1678 | 1678 | ||
1679 | if (v == SEQ_START_TOKEN) | 1679 | if (v == SEQ_START_TOKEN) |
1680 | ptr = swap_info; | 1680 | ptr = swap_info; |
1681 | else { | 1681 | else { |
1682 | ptr = v; | 1682 | ptr = v; |
1683 | ptr++; | 1683 | ptr++; |
1684 | } | 1684 | } |
1685 | 1685 | ||
1686 | for (; ptr < endptr; ptr++) { | 1686 | for (; ptr < endptr; ptr++) { |
1687 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1687 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1688 | continue; | 1688 | continue; |
1689 | ++*pos; | 1689 | ++*pos; |
1690 | return ptr; | 1690 | return ptr; |
1691 | } | 1691 | } |
1692 | 1692 | ||
1693 | return NULL; | 1693 | return NULL; |
1694 | } | 1694 | } |
1695 | 1695 | ||
1696 | static void swap_stop(struct seq_file *swap, void *v) | 1696 | static void swap_stop(struct seq_file *swap, void *v) |
1697 | { | 1697 | { |
1698 | mutex_unlock(&swapon_mutex); | 1698 | mutex_unlock(&swapon_mutex); |
1699 | } | 1699 | } |
1700 | 1700 | ||
1701 | static int swap_show(struct seq_file *swap, void *v) | 1701 | static int swap_show(struct seq_file *swap, void *v) |
1702 | { | 1702 | { |
1703 | struct swap_info_struct *ptr = v; | 1703 | struct swap_info_struct *ptr = v; |
1704 | struct file *file; | 1704 | struct file *file; |
1705 | int len; | 1705 | int len; |
1706 | 1706 | ||
1707 | if (ptr == SEQ_START_TOKEN) { | 1707 | if (ptr == SEQ_START_TOKEN) { |
1708 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1708 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1709 | return 0; | 1709 | return 0; |
1710 | } | 1710 | } |
1711 | 1711 | ||
1712 | file = ptr->swap_file; | 1712 | file = ptr->swap_file; |
1713 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1713 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1714 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1714 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1715 | len < 40 ? 40 - len : 1, " ", | 1715 | len < 40 ? 40 - len : 1, " ", |
1716 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1716 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1717 | "partition" : "file\t", | 1717 | "partition" : "file\t", |
1718 | ptr->pages << (PAGE_SHIFT - 10), | 1718 | ptr->pages << (PAGE_SHIFT - 10), |
1719 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1719 | ptr->inuse_pages << (PAGE_SHIFT - 10), |
1720 | ptr->prio); | 1720 | ptr->prio); |
1721 | return 0; | 1721 | return 0; |
1722 | } | 1722 | } |
1723 | 1723 | ||
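For reference, the format string above renders /proc/swaps roughly as follows: the pathname is padded out to column 40 by the %*s, the type is "partition" or "file", and the Size/Used columns are in 1 KB units because of the << (PAGE_SHIFT - 10) shifts. The entry shown here is made up:

Filename                                Type            Size    Used    Priority
/dev/sda2                               partition       524280  1036    -1
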
1724 | static const struct seq_operations swaps_op = { | 1724 | static const struct seq_operations swaps_op = { |
1725 | .start = swap_start, | 1725 | .start = swap_start, |
1726 | .next = swap_next, | 1726 | .next = swap_next, |
1727 | .stop = swap_stop, | 1727 | .stop = swap_stop, |
1728 | .show = swap_show | 1728 | .show = swap_show |
1729 | }; | 1729 | }; |
1730 | 1730 | ||
1731 | static int swaps_open(struct inode *inode, struct file *file) | 1731 | static int swaps_open(struct inode *inode, struct file *file) |
1732 | { | 1732 | { |
1733 | return seq_open(file, &swaps_op); | 1733 | return seq_open(file, &swaps_op); |
1734 | } | 1734 | } |
1735 | 1735 | ||
1736 | static const struct file_operations proc_swaps_operations = { | 1736 | static const struct file_operations proc_swaps_operations = { |
1737 | .open = swaps_open, | 1737 | .open = swaps_open, |
1738 | .read = seq_read, | 1738 | .read = seq_read, |
1739 | .llseek = seq_lseek, | 1739 | .llseek = seq_lseek, |
1740 | .release = seq_release, | 1740 | .release = seq_release, |
1741 | }; | 1741 | }; |
1742 | 1742 | ||
1743 | static int __init procswaps_init(void) | 1743 | static int __init procswaps_init(void) |
1744 | { | 1744 | { |
1745 | proc_create("swaps", 0, NULL, &proc_swaps_operations); | 1745 | proc_create("swaps", 0, NULL, &proc_swaps_operations); |
1746 | return 0; | 1746 | return 0; |
1747 | } | 1747 | } |
1748 | __initcall(procswaps_init); | 1748 | __initcall(procswaps_init); |
1749 | #endif /* CONFIG_PROC_FS */ | 1749 | #endif /* CONFIG_PROC_FS */ |
1750 | 1750 | ||
1751 | #ifdef MAX_SWAPFILES_CHECK | 1751 | #ifdef MAX_SWAPFILES_CHECK |
1752 | static int __init max_swapfiles_check(void) | 1752 | static int __init max_swapfiles_check(void) |
1753 | { | 1753 | { |
1754 | MAX_SWAPFILES_CHECK(); | 1754 | MAX_SWAPFILES_CHECK(); |
1755 | return 0; | 1755 | return 0; |
1756 | } | 1756 | } |
1757 | late_initcall(max_swapfiles_check); | 1757 | late_initcall(max_swapfiles_check); |
1758 | #endif | 1758 | #endif |
1759 | 1759 | ||
1760 | /* | 1760 | /* |
1761 | * Written 01/25/92 by Simmule Turner, heavily changed by Linus. | 1761 | * Written 01/25/92 by Simmule Turner, heavily changed by Linus. |
1762 | * | 1762 | * |
1763 | * The swapon system call | 1763 | * The swapon system call |
1764 | */ | 1764 | */ |
1765 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 1765 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
1766 | { | 1766 | { |
1767 | struct swap_info_struct * p; | 1767 | struct swap_info_struct * p; |
1768 | char *name = NULL; | 1768 | char *name = NULL; |
1769 | struct block_device *bdev = NULL; | 1769 | struct block_device *bdev = NULL; |
1770 | struct file *swap_file = NULL; | 1770 | struct file *swap_file = NULL; |
1771 | struct address_space *mapping; | 1771 | struct address_space *mapping; |
1772 | unsigned int type; | 1772 | unsigned int type; |
1773 | int i, prev; | 1773 | int i, prev; |
1774 | int error; | 1774 | int error; |
1775 | union swap_header *swap_header = NULL; | 1775 | union swap_header *swap_header = NULL; |
1776 | unsigned int nr_good_pages = 0; | 1776 | unsigned int nr_good_pages = 0; |
1777 | int nr_extents = 0; | 1777 | int nr_extents = 0; |
1778 | sector_t span; | 1778 | sector_t span; |
1779 | unsigned long maxpages = 1; | 1779 | unsigned long maxpages = 1; |
1780 | unsigned long swapfilepages; | 1780 | unsigned long swapfilepages; |
1781 | unsigned short *swap_map = NULL; | 1781 | unsigned short *swap_map = NULL; |
1782 | struct page *page = NULL; | 1782 | struct page *page = NULL; |
1783 | struct inode *inode = NULL; | 1783 | struct inode *inode = NULL; |
1784 | int did_down = 0; | 1784 | int did_down = 0; |
1785 | 1785 | ||
1786 | if (!capable(CAP_SYS_ADMIN)) | 1786 | if (!capable(CAP_SYS_ADMIN)) |
1787 | return -EPERM; | 1787 | return -EPERM; |
1788 | spin_lock(&swap_lock); | 1788 | spin_lock(&swap_lock); |
1789 | p = swap_info; | 1789 | p = swap_info; |
1790 | for (type = 0 ; type < nr_swapfiles ; type++,p++) | 1790 | for (type = 0 ; type < nr_swapfiles ; type++,p++) |
1791 | if (!(p->flags & SWP_USED)) | 1791 | if (!(p->flags & SWP_USED)) |
1792 | break; | 1792 | break; |
1793 | error = -EPERM; | 1793 | error = -EPERM; |
1794 | if (type >= MAX_SWAPFILES) { | 1794 | if (type >= MAX_SWAPFILES) { |
1795 | spin_unlock(&swap_lock); | 1795 | spin_unlock(&swap_lock); |
1796 | goto out; | 1796 | goto out; |
1797 | } | 1797 | } |
1798 | if (type >= nr_swapfiles) | 1798 | if (type >= nr_swapfiles) |
1799 | nr_swapfiles = type+1; | 1799 | nr_swapfiles = type+1; |
1800 | memset(p, 0, sizeof(*p)); | 1800 | memset(p, 0, sizeof(*p)); |
1801 | INIT_LIST_HEAD(&p->extent_list); | 1801 | INIT_LIST_HEAD(&p->extent_list); |
1802 | p->flags = SWP_USED; | 1802 | p->flags = SWP_USED; |
1803 | p->next = -1; | 1803 | p->next = -1; |
1804 | spin_unlock(&swap_lock); | 1804 | spin_unlock(&swap_lock); |
1805 | name = getname(specialfile); | 1805 | name = getname(specialfile); |
1806 | error = PTR_ERR(name); | 1806 | error = PTR_ERR(name); |
1807 | if (IS_ERR(name)) { | 1807 | if (IS_ERR(name)) { |
1808 | name = NULL; | 1808 | name = NULL; |
1809 | goto bad_swap_2; | 1809 | goto bad_swap_2; |
1810 | } | 1810 | } |
1811 | swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); | 1811 | swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); |
1812 | error = PTR_ERR(swap_file); | 1812 | error = PTR_ERR(swap_file); |
1813 | if (IS_ERR(swap_file)) { | 1813 | if (IS_ERR(swap_file)) { |
1814 | swap_file = NULL; | 1814 | swap_file = NULL; |
1815 | goto bad_swap_2; | 1815 | goto bad_swap_2; |
1816 | } | 1816 | } |
1817 | 1817 | ||
1818 | p->swap_file = swap_file; | 1818 | p->swap_file = swap_file; |
1819 | mapping = swap_file->f_mapping; | 1819 | mapping = swap_file->f_mapping; |
1820 | inode = mapping->host; | 1820 | inode = mapping->host; |
1821 | 1821 | ||
1822 | error = -EBUSY; | 1822 | error = -EBUSY; |
1823 | for (i = 0; i < nr_swapfiles; i++) { | 1823 | for (i = 0; i < nr_swapfiles; i++) { |
1824 | struct swap_info_struct *q = &swap_info[i]; | 1824 | struct swap_info_struct *q = &swap_info[i]; |
1825 | 1825 | ||
1826 | if (i == type || !q->swap_file) | 1826 | if (i == type || !q->swap_file) |
1827 | continue; | 1827 | continue; |
1828 | if (mapping == q->swap_file->f_mapping) | 1828 | if (mapping == q->swap_file->f_mapping) |
1829 | goto bad_swap; | 1829 | goto bad_swap; |
1830 | } | 1830 | } |
1831 | 1831 | ||
1832 | error = -EINVAL; | 1832 | error = -EINVAL; |
1833 | if (S_ISBLK(inode->i_mode)) { | 1833 | if (S_ISBLK(inode->i_mode)) { |
1834 | bdev = I_BDEV(inode); | 1834 | bdev = I_BDEV(inode); |
1835 | error = bd_claim(bdev, sys_swapon); | 1835 | error = bd_claim(bdev, sys_swapon); |
1836 | if (error < 0) { | 1836 | if (error < 0) { |
1837 | bdev = NULL; | 1837 | bdev = NULL; |
1838 | error = -EINVAL; | 1838 | error = -EINVAL; |
1839 | goto bad_swap; | 1839 | goto bad_swap; |
1840 | } | 1840 | } |
1841 | p->old_block_size = block_size(bdev); | 1841 | p->old_block_size = block_size(bdev); |
1842 | error = set_blocksize(bdev, PAGE_SIZE); | 1842 | error = set_blocksize(bdev, PAGE_SIZE); |
1843 | if (error < 0) | 1843 | if (error < 0) |
1844 | goto bad_swap; | 1844 | goto bad_swap; |
1845 | p->bdev = bdev; | 1845 | p->bdev = bdev; |
1846 | } else if (S_ISREG(inode->i_mode)) { | 1846 | } else if (S_ISREG(inode->i_mode)) { |
1847 | p->bdev = inode->i_sb->s_bdev; | 1847 | p->bdev = inode->i_sb->s_bdev; |
1848 | mutex_lock(&inode->i_mutex); | 1848 | mutex_lock(&inode->i_mutex); |
1849 | did_down = 1; | 1849 | did_down = 1; |
1850 | if (IS_SWAPFILE(inode)) { | 1850 | if (IS_SWAPFILE(inode)) { |
1851 | error = -EBUSY; | 1851 | error = -EBUSY; |
1852 | goto bad_swap; | 1852 | goto bad_swap; |
1853 | } | 1853 | } |
1854 | } else { | 1854 | } else { |
1855 | goto bad_swap; | 1855 | goto bad_swap; |
1856 | } | 1856 | } |
1857 | 1857 | ||
1858 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; | 1858 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; |
1859 | 1859 | ||
1860 | /* | 1860 | /* |
1861 | * Read the swap header. | 1861 | * Read the swap header. |
1862 | */ | 1862 | */ |
1863 | if (!mapping->a_ops->readpage) { | 1863 | if (!mapping->a_ops->readpage) { |
1864 | error = -EINVAL; | 1864 | error = -EINVAL; |
1865 | goto bad_swap; | 1865 | goto bad_swap; |
1866 | } | 1866 | } |
1867 | page = read_mapping_page(mapping, 0, swap_file); | 1867 | page = read_mapping_page(mapping, 0, swap_file); |
1868 | if (IS_ERR(page)) { | 1868 | if (IS_ERR(page)) { |
1869 | error = PTR_ERR(page); | 1869 | error = PTR_ERR(page); |
1870 | goto bad_swap; | 1870 | goto bad_swap; |
1871 | } | 1871 | } |
1872 | swap_header = kmap(page); | 1872 | swap_header = kmap(page); |
1873 | 1873 | ||
1874 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { | 1874 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { |
1875 | printk(KERN_ERR "Unable to find swap-space signature\n"); | 1875 | printk(KERN_ERR "Unable to find swap-space signature\n"); |
1876 | error = -EINVAL; | 1876 | error = -EINVAL; |
1877 | goto bad_swap; | 1877 | goto bad_swap; |
1878 | } | 1878 | } |
1879 | 1879 | ||
1880 | /* swap partition endianness hack... */ | 1880 | /* swap partition endianness hack... */ |
1881 | if (swab32(swap_header->info.version) == 1) { | 1881 | if (swab32(swap_header->info.version) == 1) { |
1882 | swab32s(&swap_header->info.version); | 1882 | swab32s(&swap_header->info.version); |
1883 | swab32s(&swap_header->info.last_page); | 1883 | swab32s(&swap_header->info.last_page); |
1884 | swab32s(&swap_header->info.nr_badpages); | 1884 | swab32s(&swap_header->info.nr_badpages); |
1885 | for (i = 0; i < swap_header->info.nr_badpages; i++) | 1885 | for (i = 0; i < swap_header->info.nr_badpages; i++) |
1886 | swab32s(&swap_header->info.badpages[i]); | 1886 | swab32s(&swap_header->info.badpages[i]); |
1887 | } | 1887 | } |
1888 | /* Check the swap header's sub-version */ | 1888 | /* Check the swap header's sub-version */ |
1889 | if (swap_header->info.version != 1) { | 1889 | if (swap_header->info.version != 1) { |
1890 | printk(KERN_WARNING | 1890 | printk(KERN_WARNING |
1891 | "Unable to handle swap header version %d\n", | 1891 | "Unable to handle swap header version %d\n", |
1892 | swap_header->info.version); | 1892 | swap_header->info.version); |
1893 | error = -EINVAL; | 1893 | error = -EINVAL; |
1894 | goto bad_swap; | 1894 | goto bad_swap; |
1895 | } | 1895 | } |
1896 | 1896 | ||
1897 | p->lowest_bit = 1; | 1897 | p->lowest_bit = 1; |
1898 | p->cluster_next = 1; | 1898 | p->cluster_next = 1; |
1899 | 1899 | ||
1900 | /* | 1900 | /* |
1901 | * Find out how many pages are allowed for a single swap | 1901 | * Find out how many pages are allowed for a single swap |
1902 | * device. There are two limiting factors: 1) the number of | 1902 | * device. There are two limiting factors: 1) the number of |
1903 | * bits for the swap offset in the swp_entry_t type and | 1903 | * bits for the swap offset in the swp_entry_t type and |
1904 | * 2) the number of bits in a swap pte as defined by | 1904 | * 2) the number of bits in a swap pte as defined by |
1905 | * the different architectures. In order to find the | 1905 | * the different architectures. In order to find the |
1906 | * largest possible bit mask a swap entry with swap type 0 | 1906 | * largest possible bit mask a swap entry with swap type 0 |
1907 | * and swap offset ~0UL is created, encoded to a swap pte, | 1907 | * and swap offset ~0UL is created, encoded to a swap pte, |
1908 | * decoded to a swp_entry_t again and finally the swap | 1908 | * decoded to a swp_entry_t again and finally the swap |
1909 | * offset is extracted. This will mask all the bits from | 1909 | * offset is extracted. This will mask all the bits from |
1910 | * the initial ~0UL mask that can't be encoded in either | 1910 | * the initial ~0UL mask that can't be encoded in either |
1911 | * the swp_entry_t or the architecture definition of a | 1911 | * the swp_entry_t or the architecture definition of a |
1912 | * swap pte. | 1912 | * swap pte. |
1913 | */ | 1913 | */ |
1914 | maxpages = swp_offset(pte_to_swp_entry( | 1914 | maxpages = swp_offset(pte_to_swp_entry( |
1915 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; | 1915 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; |
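/*
 * Illustrative numbers, not from this code: if the narrower of
 * swp_entry_t and the architecture's swap pte leaves 27 usable
 * offset bits, the round trip above yields maxpages = 2^27 - 1,
 * i.e. up to ~512 GB of swap with 4 KB pages. The real width is
 * architecture-specific.
 */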
1916 | if (maxpages > swap_header->info.last_page) | 1916 | if (maxpages > swap_header->info.last_page) |
1917 | maxpages = swap_header->info.last_page; | 1917 | maxpages = swap_header->info.last_page; |
1918 | p->highest_bit = maxpages - 1; | 1918 | p->highest_bit = maxpages - 1; |
1919 | 1919 | ||
1920 | error = -EINVAL; | 1920 | error = -EINVAL; |
1921 | if (!maxpages) | 1921 | if (!maxpages) |
1922 | goto bad_swap; | 1922 | goto bad_swap; |
1923 | if (swapfilepages && maxpages > swapfilepages) { | 1923 | if (swapfilepages && maxpages > swapfilepages) { |
1924 | printk(KERN_WARNING | 1924 | printk(KERN_WARNING |
1925 | "Swap area shorter than signature indicates\n"); | 1925 | "Swap area shorter than signature indicates\n"); |
1926 | goto bad_swap; | 1926 | goto bad_swap; |
1927 | } | 1927 | } |
1928 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1928 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
1929 | goto bad_swap; | 1929 | goto bad_swap; |
1930 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1930 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
1931 | goto bad_swap; | 1931 | goto bad_swap; |
1932 | 1932 | ||
1933 | /* OK, set up the swap map and apply the bad block list */ | 1933 | /* OK, set up the swap map and apply the bad block list */ |
1934 | swap_map = vmalloc(maxpages * sizeof(short)); | 1934 | swap_map = vmalloc(maxpages * sizeof(short)); |
1935 | if (!swap_map) { | 1935 | if (!swap_map) { |
1936 | error = -ENOMEM; | 1936 | error = -ENOMEM; |
1937 | goto bad_swap; | 1937 | goto bad_swap; |
1938 | } | 1938 | } |
1939 | 1939 | ||
1940 | memset(swap_map, 0, maxpages * sizeof(short)); | 1940 | memset(swap_map, 0, maxpages * sizeof(short)); |
1941 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1941 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1942 | int page_nr = swap_header->info.badpages[i]; | 1942 | int page_nr = swap_header->info.badpages[i]; |
1943 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1943 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { |
1944 | error = -EINVAL; | 1944 | error = -EINVAL; |
1945 | goto bad_swap; | 1945 | goto bad_swap; |
1946 | } | 1946 | } |
1947 | swap_map[page_nr] = SWAP_MAP_BAD; | 1947 | swap_map[page_nr] = SWAP_MAP_BAD; |
1948 | } | 1948 | } |
1949 | 1949 | ||
1950 | error = swap_cgroup_swapon(type, maxpages); | 1950 | error = swap_cgroup_swapon(type, maxpages); |
1951 | if (error) | 1951 | if (error) |
1952 | goto bad_swap; | 1952 | goto bad_swap; |
1953 | 1953 | ||
1954 | nr_good_pages = swap_header->info.last_page - | 1954 | nr_good_pages = swap_header->info.last_page - |
1955 | swap_header->info.nr_badpages - | 1955 | swap_header->info.nr_badpages - |
1956 | 1 /* header page */; | 1956 | 1 /* header page */; |
1957 | 1957 | ||
1958 | if (nr_good_pages) { | 1958 | if (nr_good_pages) { |
1959 | swap_map[0] = SWAP_MAP_BAD; | 1959 | swap_map[0] = SWAP_MAP_BAD; |
1960 | p->max = maxpages; | 1960 | p->max = maxpages; |
1961 | p->pages = nr_good_pages; | 1961 | p->pages = nr_good_pages; |
1962 | nr_extents = setup_swap_extents(p, &span); | 1962 | nr_extents = setup_swap_extents(p, &span); |
1963 | if (nr_extents < 0) { | 1963 | if (nr_extents < 0) { |
1964 | error = nr_extents; | 1964 | error = nr_extents; |
1965 | goto bad_swap; | 1965 | goto bad_swap; |
1966 | } | 1966 | } |
1967 | nr_good_pages = p->pages; | 1967 | nr_good_pages = p->pages; |
1968 | } | 1968 | } |
1969 | if (!nr_good_pages) { | 1969 | if (!nr_good_pages) { |
1970 | printk(KERN_WARNING "Empty swap-file\n"); | 1970 | printk(KERN_WARNING "Empty swap-file\n"); |
1971 | error = -EINVAL; | 1971 | error = -EINVAL; |
1972 | goto bad_swap; | 1972 | goto bad_swap; |
1973 | } | 1973 | } |
1974 | 1974 | ||
1975 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 1975 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
1976 | p->flags |= SWP_SOLIDSTATE; | 1976 | p->flags |= SWP_SOLIDSTATE; |
1977 | p->cluster_next = 1 + (random32() % p->highest_bit); | 1977 | p->cluster_next = 1 + (random32() % p->highest_bit); |
1978 | } | 1978 | } |
1979 | if (discard_swap(p) == 0) | 1979 | if (discard_swap(p) == 0) |
1980 | p->flags |= SWP_DISCARDABLE; | 1980 | p->flags |= SWP_DISCARDABLE; |
1981 | 1981 | ||
1982 | mutex_lock(&swapon_mutex); | 1982 | mutex_lock(&swapon_mutex); |
1983 | spin_lock(&swap_lock); | 1983 | spin_lock(&swap_lock); |
1984 | if (swap_flags & SWAP_FLAG_PREFER) | 1984 | if (swap_flags & SWAP_FLAG_PREFER) |
1985 | p->prio = | 1985 | p->prio = |
1986 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 1986 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
1987 | else | 1987 | else |
1988 | p->prio = --least_priority; | 1988 | p->prio = --least_priority; |
1989 | p->swap_map = swap_map; | 1989 | p->swap_map = swap_map; |
1990 | p->flags |= SWP_WRITEOK; | 1990 | p->flags |= SWP_WRITEOK; |
1991 | nr_swap_pages += nr_good_pages; | 1991 | nr_swap_pages += nr_good_pages; |
1992 | total_swap_pages += nr_good_pages; | 1992 | total_swap_pages += nr_good_pages; |
1993 | 1993 | ||
1994 | printk(KERN_INFO "Adding %uk swap on %s. " | 1994 | printk(KERN_INFO "Adding %uk swap on %s. " |
1995 | "Priority:%d extents:%d across:%lluk %s%s\n", | 1995 | "Priority:%d extents:%d across:%lluk %s%s\n", |
1996 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, | 1996 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, |
1997 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 1997 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
1998 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 1998 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
1999 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 1999 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); |
2000 | 2000 | ||
2001 | /* insert swap space into swap_list: */ | 2001 | /* insert swap space into swap_list: */ |
2002 | prev = -1; | 2002 | prev = -1; |
2003 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 2003 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { |
2004 | if (p->prio >= swap_info[i].prio) { | 2004 | if (p->prio >= swap_info[i].prio) { |
2005 | break; | 2005 | break; |
2006 | } | 2006 | } |
2007 | prev = i; | 2007 | prev = i; |
2008 | } | 2008 | } |
2009 | p->next = i; | 2009 | p->next = i; |
2010 | if (prev < 0) { | 2010 | if (prev < 0) { |
2011 | swap_list.head = swap_list.next = p - swap_info; | 2011 | swap_list.head = swap_list.next = p - swap_info; |
2012 | } else { | 2012 | } else { |
2013 | swap_info[prev].next = p - swap_info; | 2013 | swap_info[prev].next = p - swap_info; |
2014 | } | 2014 | } |
2015 | spin_unlock(&swap_lock); | 2015 | spin_unlock(&swap_lock); |
2016 | mutex_unlock(&swapon_mutex); | 2016 | mutex_unlock(&swapon_mutex); |
2017 | error = 0; | 2017 | error = 0; |
2018 | goto out; | 2018 | goto out; |
2019 | bad_swap: | 2019 | bad_swap: |
2020 | if (bdev) { | 2020 | if (bdev) { |
2021 | set_blocksize(bdev, p->old_block_size); | 2021 | set_blocksize(bdev, p->old_block_size); |
2022 | bd_release(bdev); | 2022 | bd_release(bdev); |
2023 | } | 2023 | } |
2024 | destroy_swap_extents(p); | 2024 | destroy_swap_extents(p); |
2025 | swap_cgroup_swapoff(type); | 2025 | swap_cgroup_swapoff(type); |
2026 | bad_swap_2: | 2026 | bad_swap_2: |
2027 | spin_lock(&swap_lock); | 2027 | spin_lock(&swap_lock); |
2028 | p->swap_file = NULL; | 2028 | p->swap_file = NULL; |
2029 | p->flags = 0; | 2029 | p->flags = 0; |
2030 | spin_unlock(&swap_lock); | 2030 | spin_unlock(&swap_lock); |
2031 | vfree(swap_map); | 2031 | vfree(swap_map); |
2032 | if (swap_file) | 2032 | if (swap_file) |
2033 | filp_close(swap_file, NULL); | 2033 | filp_close(swap_file, NULL); |
2034 | out: | 2034 | out: |
2035 | if (page && !IS_ERR(page)) { | 2035 | if (page && !IS_ERR(page)) { |
2036 | kunmap(page); | 2036 | kunmap(page); |
2037 | page_cache_release(page); | 2037 | page_cache_release(page); |
2038 | } | 2038 | } |
2039 | if (name) | 2039 | if (name) |
2040 | putname(name); | 2040 | putname(name); |
2041 | if (did_down) { | 2041 | if (did_down) { |
2042 | if (!error) | 2042 | if (!error) |
2043 | inode->i_flags |= S_SWAPFILE; | 2043 | inode->i_flags |= S_SWAPFILE; |
2044 | mutex_unlock(&inode->i_mutex); | 2044 | mutex_unlock(&inode->i_mutex); |
2045 | } | 2045 | } |
2046 | return error; | 2046 | return error; |
2047 | } | 2047 | } |
2048 | 2048 | ||
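The SWAP_FLAG_PREFER decoding near the end of swapon() has a mirror image in userspace. A usage sketch with the glibc wrapper from <sys/swap.h> (needs CAP_SYS_ADMIN; the device path is illustrative):

#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	/*
	 * Request priority 5: set PREFER and place the value in the
	 * low bits, which is exactly what the kernel recovers above
	 * with (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT.
	 */
	int flags = SWAP_FLAG_PREFER |
		    ((5 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (swapon("/dev/sdb1", flags) != 0)
		perror("swapon");
	return 0;
}
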
2049 | void si_swapinfo(struct sysinfo *val) | 2049 | void si_swapinfo(struct sysinfo *val) |
2050 | { | 2050 | { |
2051 | unsigned int i; | 2051 | unsigned int i; |
2052 | unsigned long nr_to_be_unused = 0; | 2052 | unsigned long nr_to_be_unused = 0; |
2053 | 2053 | ||
2054 | spin_lock(&swap_lock); | 2054 | spin_lock(&swap_lock); |
2055 | for (i = 0; i < nr_swapfiles; i++) { | 2055 | for (i = 0; i < nr_swapfiles; i++) { |
2056 | if (!(swap_info[i].flags & SWP_USED) || | 2056 | if (!(swap_info[i].flags & SWP_USED) || |
2057 | (swap_info[i].flags & SWP_WRITEOK)) | 2057 | (swap_info[i].flags & SWP_WRITEOK)) |
2058 | continue; | 2058 | continue; |
2059 | nr_to_be_unused += swap_info[i].inuse_pages; | 2059 | nr_to_be_unused += swap_info[i].inuse_pages; |
2060 | } | 2060 | } |
2061 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2061 | val->freeswap = nr_swap_pages + nr_to_be_unused; |
2062 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2062 | val->totalswap = total_swap_pages + nr_to_be_unused; |
2063 | spin_unlock(&swap_lock); | 2063 | spin_unlock(&swap_lock); |
2064 | } | 2064 | } |
2065 | 2065 | ||
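The freeswap/totalswap values filled in here are what userspace reads back through sysinfo(2); note how pages held by an in-flight swapoff are added back, so the totals stay stable while try_to_unuse() runs. A small consumer:

#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
	struct sysinfo si;

	if (sysinfo(&si) == 0)
		printf("swap free: %lu of %lu bytes\n",
		       si.freeswap * si.mem_unit,
		       si.totalswap * si.mem_unit);
	return 0;
}
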
2066 | /* | 2066 | /* |
2067 | * Verify that a swap entry is valid and increment its swap map count. | 2067 | * Verify that a swap entry is valid and increment its swap map count. |
2068 | * | 2068 | * |
2069 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | 2069 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as |
2070 | * "permanent", but will be reclaimed by the next swapoff. | 2070 | * "permanent", but will be reclaimed by the next swapoff. |
2071 | * Returns one of the following codes: | 2071 | * Returns one of the following codes: |
2072 | * - success -> 0 | 2072 | * - success -> 0 |
2073 | * - swp_entry is invalid -> EINVAL | 2073 | * - swp_entry is invalid -> EINVAL |
2074 | * - swp_entry is migration entry -> EINVAL | 2074 | * - swp_entry is migration entry -> EINVAL |
2075 | * - swap-cache reference is requested but there is already one. -> EEXIST | 2075 | * - swap-cache reference is requested but there is already one. -> EEXIST |
2076 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | 2076 | * - swap-cache reference is requested but the entry is not used. -> ENOENT |
2077 | */ | 2077 | */ |
2078 | static int __swap_duplicate(swp_entry_t entry, bool cache) | 2078 | static int __swap_duplicate(swp_entry_t entry, bool cache) |
2079 | { | 2079 | { |
2080 | struct swap_info_struct * p; | 2080 | struct swap_info_struct * p; |
2081 | unsigned long offset, type; | 2081 | unsigned long offset, type; |
2082 | int result = -EINVAL; | 2082 | int result = -EINVAL; |
2083 | int count; | 2083 | int count; |
2084 | bool has_cache; | 2084 | bool has_cache; |
2085 | 2085 | ||
2086 | if (is_migration_entry(entry)) | 2086 | if (is_migration_entry(entry)) |
2087 | return -EINVAL; | 2087 | return -EINVAL; |
2088 | 2088 | ||
2089 | type = swp_type(entry); | 2089 | type = swp_type(entry); |
2090 | if (type >= nr_swapfiles) | 2090 | if (type >= nr_swapfiles) |
2091 | goto bad_file; | 2091 | goto bad_file; |
2092 | p = type + swap_info; | 2092 | p = type + swap_info; |
2093 | offset = swp_offset(entry); | 2093 | offset = swp_offset(entry); |
2094 | 2094 | ||
2095 | spin_lock(&swap_lock); | 2095 | spin_lock(&swap_lock); |
2096 | 2096 | ||
2097 | if (unlikely(offset >= p->max)) | 2097 | if (unlikely(offset >= p->max)) |
2098 | goto unlock_out; | 2098 | goto unlock_out; |
2099 | 2099 | ||
2100 | count = swap_count(p->swap_map[offset]); | 2100 | count = swap_count(p->swap_map[offset]); |
2101 | has_cache = swap_has_cache(p->swap_map[offset]); | 2101 | has_cache = swap_has_cache(p->swap_map[offset]); |
2102 | 2102 | ||
2103 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | 2103 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ |
2104 | 2104 | ||
2105 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | 2105 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ |
2106 | if (!has_cache && count) { | 2106 | if (!has_cache && count) { |
2107 | p->swap_map[offset] = encode_swapmap(count, true); | 2107 | p->swap_map[offset] = encode_swapmap(count, true); |
2108 | result = 0; | 2108 | result = 0; |
2109 | } else if (has_cache) /* someone added cache */ | 2109 | } else if (has_cache) /* someone added cache */ |
2110 | result = -EEXIST; | 2110 | result = -EEXIST; |
2111 | else if (!count) /* no users */ | 2111 | else if (!count) /* no users */ |
2112 | result = -ENOENT; | 2112 | result = -ENOENT; |
2113 | 2113 | ||
2114 | } else if (count || has_cache) { | 2114 | } else if (count || has_cache) { |
2115 | if (count < SWAP_MAP_MAX - 1) { | 2115 | if (count < SWAP_MAP_MAX - 1) { |
2116 | p->swap_map[offset] = encode_swapmap(count + 1, | 2116 | p->swap_map[offset] = encode_swapmap(count + 1, |
2117 | has_cache); | 2117 | has_cache); |
2118 | result = 0; | 2118 | result = 0; |
2119 | } else if (count <= SWAP_MAP_MAX) { | 2119 | } else if (count <= SWAP_MAP_MAX) { |
2120 | if (swap_overflow++ < 5) | 2120 | if (swap_overflow++ < 5) |
2121 | printk(KERN_WARNING | 2121 | printk(KERN_WARNING |
2122 | "swap_dup: swap entry overflow\n"); | 2122 | "swap_dup: swap entry overflow\n"); |
2123 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, | 2123 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, |
2124 | has_cache); | 2124 | has_cache); |
2125 | result = 0; | 2125 | result = 0; |
2126 | } | 2126 | } |
2127 | } else | 2127 | } else |
2128 | result = -ENOENT; /* unused swap entry */ | 2128 | result = -ENOENT; /* unused swap entry */ |
2129 | unlock_out: | 2129 | unlock_out: |
2130 | spin_unlock(&swap_lock); | 2130 | spin_unlock(&swap_lock); |
2131 | out: | 2131 | out: |
2132 | return result; | 2132 | return result; |
2133 | 2133 | ||
2134 | bad_file: | 2134 | bad_file: |
2135 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2135 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
2136 | goto out; | 2136 | goto out; |
2137 | } | 2137 | } |
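
swap_count(), swap_has_cache() and encode_swapmap() pack two things into each unsigned short swap_map slot: a reference count plus a single has-cache flag. Those helpers are defined earlier in this file; the sketch below assumes the flag occupies the top bit, which is only an assumption here, not something this hunk shows:

/*
 * Illustrative sketch of the assumed layout: count in the low bits,
 * one flag bit on top. The real constants and helpers live earlier
 * in this file.
 */
#define HAS_CACHE	0x8000u

static unsigned short encode(unsigned int count, int has_cache)
{
	return (unsigned short)(count | (has_cache ? HAS_CACHE : 0));
}

static unsigned int count_of(unsigned short ent)
{
	return ent & ~HAS_CACHE;
}

static int has_cache_of(unsigned short ent)
{
	return (ent & HAS_CACHE) != 0;
}
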
2138 | /* | 2138 | /* |
2139 | * increase reference count of swap entry by 1. | 2139 | * increase reference count of swap entry by 1. |
2140 | */ | 2140 | */ |
2141 | void swap_duplicate(swp_entry_t entry) | 2141 | void swap_duplicate(swp_entry_t entry) |
2142 | { | 2142 | { |
2143 | __swap_duplicate(entry, SWAP_MAP); | 2143 | __swap_duplicate(entry, SWAP_MAP); |
2144 | } | 2144 | } |
2145 | 2145 | ||
2146 | /* | 2146 | /* |
2147 | * @entry: swap entry for which we allocate swap cache. | 2147 | * @entry: swap entry for which we allocate swap cache. |
2148 | * | 2148 | * |
2149 | * Called when allocating swap cache for an existing swap entry. | 2149 | * Called when allocating swap cache for an existing swap entry. |
2150 | * This can return error codes. Returns 0 on success. | 2150 | * This can return error codes. Returns 0 on success. |
2151 | * -EBUSY means there is a swap cache. | 2151 | * -EBUSY means there is a swap cache. |
2152 | * Note: return code is different from swap_duplicate(). | 2152 | * Note: return code is different from swap_duplicate(). |
2153 | */ | 2153 | */ |
2154 | int swapcache_prepare(swp_entry_t entry) | 2154 | int swapcache_prepare(swp_entry_t entry) |
2155 | { | 2155 | { |
2156 | return __swap_duplicate(entry, SWAP_CACHE); | 2156 | return __swap_duplicate(entry, SWAP_CACHE); |
2157 | } | 2157 | } |
2158 | 2158 | ||
2159 | 2159 | ||
2160 | struct swap_info_struct * | 2160 | struct swap_info_struct * |
2161 | get_swap_info_struct(unsigned type) | 2161 | get_swap_info_struct(unsigned type) |
2162 | { | 2162 | { |
2163 | return &swap_info[type]; | 2163 | return &swap_info[type]; |
2164 | } | 2164 | } |
2165 | 2165 | ||
2166 | /* | 2166 | /* |
2167 | * swap_lock prevents swap_map being freed. Don't grab an extra | 2167 | * swap_lock prevents swap_map being freed. Don't grab an extra |
2168 | * reference on the swaphandle, it doesn't matter if it becomes unused. | 2168 | * reference on the swaphandle, it doesn't matter if it becomes unused. |
2169 | */ | 2169 | */ |
2170 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | 2170 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) |
2171 | { | 2171 | { |
2172 | struct swap_info_struct *si; | 2172 | struct swap_info_struct *si; |
2173 | int our_page_cluster = page_cluster; | 2173 | int our_page_cluster = page_cluster; |
2174 | pgoff_t target, toff; | 2174 | pgoff_t target, toff; |
2175 | pgoff_t base, end; | 2175 | pgoff_t base, end; |
2176 | int nr_pages = 0; | 2176 | int nr_pages = 0; |
2177 | 2177 | ||
2178 | if (!our_page_cluster) /* no readahead */ | 2178 | if (!our_page_cluster) /* no readahead */ |
2179 | return 0; | 2179 | return 0; |
2180 | 2180 | ||
2181 | si = &swap_info[swp_type(entry)]; | 2181 | si = &swap_info[swp_type(entry)]; |
2182 | target = swp_offset(entry); | 2182 | target = swp_offset(entry); |
2183 | base = (target >> our_page_cluster) << our_page_cluster; | 2183 | base = (target >> our_page_cluster) << our_page_cluster; |
2184 | end = base + (1 << our_page_cluster); | 2184 | end = base + (1 << our_page_cluster); |
2185 | if (!base) /* first page is swap header */ | 2185 | if (!base) /* first page is swap header */ |
2186 | base++; | 2186 | base++; |
2187 | 2187 | ||
2188 | spin_lock(&swap_lock); | 2188 | spin_lock(&swap_lock); |
2189 | if (end > si->max) /* don't go beyond end of map */ | 2189 | if (end > si->max) /* don't go beyond end of map */ |
2190 | end = si->max; | 2190 | end = si->max; |
2191 | 2191 | ||
2192 | /* Count contiguous allocated slots above our target */ | 2192 | /* Count contiguous allocated slots above our target */ |
2193 | for (toff = target; ++toff < end; nr_pages++) { | 2193 | for (toff = target; ++toff < end; nr_pages++) { |
2194 | /* Don't read in free or bad pages */ | 2194 | /* Don't read in free or bad pages */ |
2195 | if (!si->swap_map[toff]) | 2195 | if (!si->swap_map[toff]) |
2196 | break; | 2196 | break; |
2197 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | 2197 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2198 | break; | 2198 | break; |
2199 | } | 2199 | } |
2200 | /* Count contiguous allocated slots below our target */ | 2200 | /* Count contiguous allocated slots below our target */ |
2201 | for (toff = target; --toff >= base; nr_pages++) { | 2201 | for (toff = target; --toff >= base; nr_pages++) { |
2202 | /* Don't read in free or bad pages */ | 2202 | /* Don't read in free or bad pages */ |
2203 | if (!si->swap_map[toff]) | 2203 | if (!si->swap_map[toff]) |
2204 | break; | 2204 | break; |
2205 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | 2205 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2206 | break; | 2206 | break; |
2207 | } | 2207 | } |
2208 | spin_unlock(&swap_lock); | 2208 | spin_unlock(&swap_lock); |
2209 | 2209 | ||
2210 | /* | 2210 | /* |
2211 | * Indicate starting offset, and return number of pages to get: | 2211 | * Indicate starting offset, and return number of pages to get: |
2212 | * if only 1, say 0, since there's then no readahead to be done. | 2212 | * if only 1, say 0, since there's then no readahead to be done. |
2213 | */ | 2213 | */ |
2214 | *offset = ++toff; | 2214 | *offset = ++toff; |
2215 | return nr_pages ? ++nr_pages : 0; | 2215 | return nr_pages ? ++nr_pages : 0; |
2216 | } | 2216 | } |
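
/*
 * Worked example of the window arithmetic above (illustrative
 * numbers): with page_cluster == 3 and a fault at offset 37,
 *
 *	base = (37 >> 3) << 3 = 32,	end = 32 + (1 << 3) = 40
 *
 * so readahead may cover slots 32..39, clipped to the map size and
 * trimmed at the first free or SWAP_MAP_BAD slot on either side of
 * the target.
 */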
2217 | 2217 |