Commit dddac6a7b445de95515f64fdf82fe5dc36c02f26

Authored by Alan Jenkins
Committed by Rafael J. Wysocki
1 parent ec79be2687

PM / Hibernate: Replace bdget call with simple atomic_inc of i_count

Create bdgrab().  This function copies an existing reference to a
block_device.  It is safe to call from any context.

Hibernation code wishes to copy a reference to the active swap device.
Right now it calls bdget() under a spinlock, but this is wrong because
bdget() can sleep.  It doesn't need a full bdget() because we already
hold a reference to active swap devices (and the spinlock protects
against swapoff).
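
A minimal sketch of the resulting pattern (the helper name and the
swap_info_struct/swap_lock context are assumptions for illustration; the
commit's actual call site is in one of the changed files not shown in this
excerpt):

static struct block_device *copy_active_swap_bdev(struct swap_info_struct *sis)
{
	struct block_device *bdev;

	spin_lock(&swap_lock);
	/*
	 * Calling bdget(sis->bdev->bd_dev) here would be the bug: bdget()
	 * can sleep, which is forbidden under a spinlock.  The active swap
	 * device already holds a reference, so copying it only needs an
	 * atomic_inc of the backing inode's i_count, i.e. bdgrab().
	 */
	bdev = bdgrab(sis->bdev);
	spin_unlock(&swap_lock);

	return bdev;	/* the caller drops the reference with bdput() */
}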

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=13827

Signed-off-by: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>

Showing 3 changed files with 13 additions and 2 deletions

fs/block_dev.c (lines added by this commit are prefixed with "+"; unchanged context is shown as-is):

1 /* 1 /*
2 * linux/fs/block_dev.c 2 * linux/fs/block_dev.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
6 */ 6 */
7 7
8 #include <linux/init.h> 8 #include <linux/init.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/fcntl.h> 10 #include <linux/fcntl.h>
11 #include <linux/slab.h> 11 #include <linux/slab.h>
12 #include <linux/kmod.h> 12 #include <linux/kmod.h>
13 #include <linux/major.h> 13 #include <linux/major.h>
14 #include <linux/smp_lock.h> 14 #include <linux/smp_lock.h>
15 #include <linux/device_cgroup.h> 15 #include <linux/device_cgroup.h>
16 #include <linux/highmem.h> 16 #include <linux/highmem.h>
17 #include <linux/blkdev.h> 17 #include <linux/blkdev.h>
18 #include <linux/module.h> 18 #include <linux/module.h>
19 #include <linux/blkpg.h> 19 #include <linux/blkpg.h>
20 #include <linux/buffer_head.h> 20 #include <linux/buffer_head.h>
21 #include <linux/pagevec.h> 21 #include <linux/pagevec.h>
22 #include <linux/writeback.h> 22 #include <linux/writeback.h>
23 #include <linux/mpage.h> 23 #include <linux/mpage.h>
24 #include <linux/mount.h> 24 #include <linux/mount.h>
25 #include <linux/uio.h> 25 #include <linux/uio.h>
26 #include <linux/namei.h> 26 #include <linux/namei.h>
27 #include <linux/log2.h> 27 #include <linux/log2.h>
28 #include <linux/kmemleak.h> 28 #include <linux/kmemleak.h>
29 #include <asm/uaccess.h> 29 #include <asm/uaccess.h>
30 #include "internal.h" 30 #include "internal.h"
31 31
32 struct bdev_inode { 32 struct bdev_inode {
33 struct block_device bdev; 33 struct block_device bdev;
34 struct inode vfs_inode; 34 struct inode vfs_inode;
35 }; 35 };
36 36
37 static const struct address_space_operations def_blk_aops; 37 static const struct address_space_operations def_blk_aops;
38 38
39 static inline struct bdev_inode *BDEV_I(struct inode *inode) 39 static inline struct bdev_inode *BDEV_I(struct inode *inode)
40 { 40 {
41 return container_of(inode, struct bdev_inode, vfs_inode); 41 return container_of(inode, struct bdev_inode, vfs_inode);
42 } 42 }
43 43
44 inline struct block_device *I_BDEV(struct inode *inode) 44 inline struct block_device *I_BDEV(struct inode *inode)
45 { 45 {
46 return &BDEV_I(inode)->bdev; 46 return &BDEV_I(inode)->bdev;
47 } 47 }
48 48
49 EXPORT_SYMBOL(I_BDEV); 49 EXPORT_SYMBOL(I_BDEV);
50 50
51 static sector_t max_block(struct block_device *bdev) 51 static sector_t max_block(struct block_device *bdev)
52 { 52 {
53 sector_t retval = ~((sector_t)0); 53 sector_t retval = ~((sector_t)0);
54 loff_t sz = i_size_read(bdev->bd_inode); 54 loff_t sz = i_size_read(bdev->bd_inode);
55 55
56 if (sz) { 56 if (sz) {
57 unsigned int size = block_size(bdev); 57 unsigned int size = block_size(bdev);
58 unsigned int sizebits = blksize_bits(size); 58 unsigned int sizebits = blksize_bits(size);
59 retval = (sz >> sizebits); 59 retval = (sz >> sizebits);
60 } 60 }
61 return retval; 61 return retval;
62 } 62 }
63 63
64 /* Kill _all_ buffers and pagecache , dirty or not.. */ 64 /* Kill _all_ buffers and pagecache , dirty or not.. */
65 static void kill_bdev(struct block_device *bdev) 65 static void kill_bdev(struct block_device *bdev)
66 { 66 {
67 if (bdev->bd_inode->i_mapping->nrpages == 0) 67 if (bdev->bd_inode->i_mapping->nrpages == 0)
68 return; 68 return;
69 invalidate_bh_lrus(); 69 invalidate_bh_lrus();
70 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 70 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
71 } 71 }
72 72
73 int set_blocksize(struct block_device *bdev, int size) 73 int set_blocksize(struct block_device *bdev, int size)
74 { 74 {
75 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 75 /* Size must be a power of two, and between 512 and PAGE_SIZE */
76 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) 76 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
77 return -EINVAL; 77 return -EINVAL;
78 78
79 /* Size cannot be smaller than the size supported by the device */ 79 /* Size cannot be smaller than the size supported by the device */
80 if (size < bdev_logical_block_size(bdev)) 80 if (size < bdev_logical_block_size(bdev))
81 return -EINVAL; 81 return -EINVAL;
82 82
83 /* Don't change the size if it is same as current */ 83 /* Don't change the size if it is same as current */
84 if (bdev->bd_block_size != size) { 84 if (bdev->bd_block_size != size) {
85 sync_blockdev(bdev); 85 sync_blockdev(bdev);
86 bdev->bd_block_size = size; 86 bdev->bd_block_size = size;
87 bdev->bd_inode->i_blkbits = blksize_bits(size); 87 bdev->bd_inode->i_blkbits = blksize_bits(size);
88 kill_bdev(bdev); 88 kill_bdev(bdev);
89 } 89 }
90 return 0; 90 return 0;
91 } 91 }
92 92
93 EXPORT_SYMBOL(set_blocksize); 93 EXPORT_SYMBOL(set_blocksize);
94 94
95 int sb_set_blocksize(struct super_block *sb, int size) 95 int sb_set_blocksize(struct super_block *sb, int size)
96 { 96 {
97 if (set_blocksize(sb->s_bdev, size)) 97 if (set_blocksize(sb->s_bdev, size))
98 return 0; 98 return 0;
99 /* If we get here, we know size is power of two 99 /* If we get here, we know size is power of two
100 * and it's value is between 512 and PAGE_SIZE */ 100 * and it's value is between 512 and PAGE_SIZE */
101 sb->s_blocksize = size; 101 sb->s_blocksize = size;
102 sb->s_blocksize_bits = blksize_bits(size); 102 sb->s_blocksize_bits = blksize_bits(size);
103 return sb->s_blocksize; 103 return sb->s_blocksize;
104 } 104 }
105 105
106 EXPORT_SYMBOL(sb_set_blocksize); 106 EXPORT_SYMBOL(sb_set_blocksize);
107 107
108 int sb_min_blocksize(struct super_block *sb, int size) 108 int sb_min_blocksize(struct super_block *sb, int size)
109 { 109 {
110 int minsize = bdev_logical_block_size(sb->s_bdev); 110 int minsize = bdev_logical_block_size(sb->s_bdev);
111 if (size < minsize) 111 if (size < minsize)
112 size = minsize; 112 size = minsize;
113 return sb_set_blocksize(sb, size); 113 return sb_set_blocksize(sb, size);
114 } 114 }
115 115
116 EXPORT_SYMBOL(sb_min_blocksize); 116 EXPORT_SYMBOL(sb_min_blocksize);
117 117
118 static int 118 static int
119 blkdev_get_block(struct inode *inode, sector_t iblock, 119 blkdev_get_block(struct inode *inode, sector_t iblock,
120 struct buffer_head *bh, int create) 120 struct buffer_head *bh, int create)
121 { 121 {
122 if (iblock >= max_block(I_BDEV(inode))) { 122 if (iblock >= max_block(I_BDEV(inode))) {
123 if (create) 123 if (create)
124 return -EIO; 124 return -EIO;
125 125
126 /* 126 /*
127 * for reads, we're just trying to fill a partial page. 127 * for reads, we're just trying to fill a partial page.
128 * return a hole, they will have to call get_block again 128 * return a hole, they will have to call get_block again
129 * before they can fill it, and they will get -EIO at that 129 * before they can fill it, and they will get -EIO at that
130 * time 130 * time
131 */ 131 */
132 return 0; 132 return 0;
133 } 133 }
134 bh->b_bdev = I_BDEV(inode); 134 bh->b_bdev = I_BDEV(inode);
135 bh->b_blocknr = iblock; 135 bh->b_blocknr = iblock;
136 set_buffer_mapped(bh); 136 set_buffer_mapped(bh);
137 return 0; 137 return 0;
138 } 138 }
139 139
140 static int 140 static int
141 blkdev_get_blocks(struct inode *inode, sector_t iblock, 141 blkdev_get_blocks(struct inode *inode, sector_t iblock,
142 struct buffer_head *bh, int create) 142 struct buffer_head *bh, int create)
143 { 143 {
144 sector_t end_block = max_block(I_BDEV(inode)); 144 sector_t end_block = max_block(I_BDEV(inode));
145 unsigned long max_blocks = bh->b_size >> inode->i_blkbits; 145 unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
146 146
147 if ((iblock + max_blocks) > end_block) { 147 if ((iblock + max_blocks) > end_block) {
148 max_blocks = end_block - iblock; 148 max_blocks = end_block - iblock;
149 if ((long)max_blocks <= 0) { 149 if ((long)max_blocks <= 0) {
150 if (create) 150 if (create)
151 return -EIO; /* write fully beyond EOF */ 151 return -EIO; /* write fully beyond EOF */
152 /* 152 /*
153 * It is a read which is fully beyond EOF. We return 153 * It is a read which is fully beyond EOF. We return
154 * a !buffer_mapped buffer 154 * a !buffer_mapped buffer
155 */ 155 */
156 max_blocks = 0; 156 max_blocks = 0;
157 } 157 }
158 } 158 }
159 159
160 bh->b_bdev = I_BDEV(inode); 160 bh->b_bdev = I_BDEV(inode);
161 bh->b_blocknr = iblock; 161 bh->b_blocknr = iblock;
162 bh->b_size = max_blocks << inode->i_blkbits; 162 bh->b_size = max_blocks << inode->i_blkbits;
163 if (max_blocks) 163 if (max_blocks)
164 set_buffer_mapped(bh); 164 set_buffer_mapped(bh);
165 return 0; 165 return 0;
166 } 166 }
167 167
168 static ssize_t 168 static ssize_t
169 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 169 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
170 loff_t offset, unsigned long nr_segs) 170 loff_t offset, unsigned long nr_segs)
171 { 171 {
172 struct file *file = iocb->ki_filp; 172 struct file *file = iocb->ki_filp;
173 struct inode *inode = file->f_mapping->host; 173 struct inode *inode = file->f_mapping->host;
174 174
175 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 175 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
176 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 iov, offset, nr_segs, blkdev_get_blocks, NULL);
177 } 177 }
178 178
179 int __sync_blockdev(struct block_device *bdev, int wait) 179 int __sync_blockdev(struct block_device *bdev, int wait)
180 { 180 {
181 if (!bdev) 181 if (!bdev)
182 return 0; 182 return 0;
183 if (!wait) 183 if (!wait)
184 return filemap_flush(bdev->bd_inode->i_mapping); 184 return filemap_flush(bdev->bd_inode->i_mapping);
185 return filemap_write_and_wait(bdev->bd_inode->i_mapping); 185 return filemap_write_and_wait(bdev->bd_inode->i_mapping);
186 } 186 }
187 187
188 /* 188 /*
189 * Write out and wait upon all the dirty data associated with a block 189 * Write out and wait upon all the dirty data associated with a block
190 * device via its mapping. Does not take the superblock lock. 190 * device via its mapping. Does not take the superblock lock.
191 */ 191 */
192 int sync_blockdev(struct block_device *bdev) 192 int sync_blockdev(struct block_device *bdev)
193 { 193 {
194 return __sync_blockdev(bdev, 1); 194 return __sync_blockdev(bdev, 1);
195 } 195 }
196 EXPORT_SYMBOL(sync_blockdev); 196 EXPORT_SYMBOL(sync_blockdev);
197 197
198 /* 198 /*
199 * Write out and wait upon all dirty data associated with this 199 * Write out and wait upon all dirty data associated with this
200 * device. Filesystem data as well as the underlying block 200 * device. Filesystem data as well as the underlying block
201 * device. Takes the superblock lock. 201 * device. Takes the superblock lock.
202 */ 202 */
203 int fsync_bdev(struct block_device *bdev) 203 int fsync_bdev(struct block_device *bdev)
204 { 204 {
205 struct super_block *sb = get_super(bdev); 205 struct super_block *sb = get_super(bdev);
206 if (sb) { 206 if (sb) {
207 int res = sync_filesystem(sb); 207 int res = sync_filesystem(sb);
208 drop_super(sb); 208 drop_super(sb);
209 return res; 209 return res;
210 } 210 }
211 return sync_blockdev(bdev); 211 return sync_blockdev(bdev);
212 } 212 }
213 EXPORT_SYMBOL(fsync_bdev); 213 EXPORT_SYMBOL(fsync_bdev);
214 214
215 /** 215 /**
216 * freeze_bdev -- lock a filesystem and force it into a consistent state 216 * freeze_bdev -- lock a filesystem and force it into a consistent state
217 * @bdev: blockdevice to lock 217 * @bdev: blockdevice to lock
218 * 218 *
219 * This takes the block device bd_mount_sem to make sure no new mounts 219 * This takes the block device bd_mount_sem to make sure no new mounts
220 * happen on bdev until thaw_bdev() is called. 220 * happen on bdev until thaw_bdev() is called.
221 * If a superblock is found on this device, we take the s_umount semaphore 221 * If a superblock is found on this device, we take the s_umount semaphore
222 * on it to make sure nobody unmounts until the snapshot creation is done. 222 * on it to make sure nobody unmounts until the snapshot creation is done.
223 * The reference counter (bd_fsfreeze_count) guarantees that only the last 223 * The reference counter (bd_fsfreeze_count) guarantees that only the last
224 * unfreeze process can unfreeze the frozen filesystem actually when multiple 224 * unfreeze process can unfreeze the frozen filesystem actually when multiple
225 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and 225 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
226 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze 226 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
227 * actually. 227 * actually.
228 */ 228 */
229 struct super_block *freeze_bdev(struct block_device *bdev) 229 struct super_block *freeze_bdev(struct block_device *bdev)
230 { 230 {
231 struct super_block *sb; 231 struct super_block *sb;
232 int error = 0; 232 int error = 0;
233 233
234 mutex_lock(&bdev->bd_fsfreeze_mutex); 234 mutex_lock(&bdev->bd_fsfreeze_mutex);
235 if (bdev->bd_fsfreeze_count > 0) { 235 if (bdev->bd_fsfreeze_count > 0) {
236 bdev->bd_fsfreeze_count++; 236 bdev->bd_fsfreeze_count++;
237 sb = get_super(bdev); 237 sb = get_super(bdev);
238 mutex_unlock(&bdev->bd_fsfreeze_mutex); 238 mutex_unlock(&bdev->bd_fsfreeze_mutex);
239 return sb; 239 return sb;
240 } 240 }
241 bdev->bd_fsfreeze_count++; 241 bdev->bd_fsfreeze_count++;
242 242
243 down(&bdev->bd_mount_sem); 243 down(&bdev->bd_mount_sem);
244 sb = get_super(bdev); 244 sb = get_super(bdev);
245 if (sb && !(sb->s_flags & MS_RDONLY)) { 245 if (sb && !(sb->s_flags & MS_RDONLY)) {
246 sb->s_frozen = SB_FREEZE_WRITE; 246 sb->s_frozen = SB_FREEZE_WRITE;
247 smp_wmb(); 247 smp_wmb();
248 248
249 sync_filesystem(sb); 249 sync_filesystem(sb);
250 250
251 sb->s_frozen = SB_FREEZE_TRANS; 251 sb->s_frozen = SB_FREEZE_TRANS;
252 smp_wmb(); 252 smp_wmb();
253 253
254 sync_blockdev(sb->s_bdev); 254 sync_blockdev(sb->s_bdev);
255 255
256 if (sb->s_op->freeze_fs) { 256 if (sb->s_op->freeze_fs) {
257 error = sb->s_op->freeze_fs(sb); 257 error = sb->s_op->freeze_fs(sb);
258 if (error) { 258 if (error) {
259 printk(KERN_ERR 259 printk(KERN_ERR
260 "VFS:Filesystem freeze failed\n"); 260 "VFS:Filesystem freeze failed\n");
261 sb->s_frozen = SB_UNFROZEN; 261 sb->s_frozen = SB_UNFROZEN;
262 drop_super(sb); 262 drop_super(sb);
263 up(&bdev->bd_mount_sem); 263 up(&bdev->bd_mount_sem);
264 bdev->bd_fsfreeze_count--; 264 bdev->bd_fsfreeze_count--;
265 mutex_unlock(&bdev->bd_fsfreeze_mutex); 265 mutex_unlock(&bdev->bd_fsfreeze_mutex);
266 return ERR_PTR(error); 266 return ERR_PTR(error);
267 } 267 }
268 } 268 }
269 } 269 }
270 270
271 sync_blockdev(bdev); 271 sync_blockdev(bdev);
272 mutex_unlock(&bdev->bd_fsfreeze_mutex); 272 mutex_unlock(&bdev->bd_fsfreeze_mutex);
273 273
274 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ 274 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
275 } 275 }
276 EXPORT_SYMBOL(freeze_bdev); 276 EXPORT_SYMBOL(freeze_bdev);
277 277
278 /** 278 /**
279 * thaw_bdev -- unlock filesystem 279 * thaw_bdev -- unlock filesystem
280 * @bdev: blockdevice to unlock 280 * @bdev: blockdevice to unlock
281 * @sb: associated superblock 281 * @sb: associated superblock
282 * 282 *
283 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 283 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
284 */ 284 */
285 int thaw_bdev(struct block_device *bdev, struct super_block *sb) 285 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
286 { 286 {
287 int error = 0; 287 int error = 0;
288 288
289 mutex_lock(&bdev->bd_fsfreeze_mutex); 289 mutex_lock(&bdev->bd_fsfreeze_mutex);
290 if (!bdev->bd_fsfreeze_count) { 290 if (!bdev->bd_fsfreeze_count) {
291 mutex_unlock(&bdev->bd_fsfreeze_mutex); 291 mutex_unlock(&bdev->bd_fsfreeze_mutex);
292 return -EINVAL; 292 return -EINVAL;
293 } 293 }
294 294
295 bdev->bd_fsfreeze_count--; 295 bdev->bd_fsfreeze_count--;
296 if (bdev->bd_fsfreeze_count > 0) { 296 if (bdev->bd_fsfreeze_count > 0) {
297 if (sb) 297 if (sb)
298 drop_super(sb); 298 drop_super(sb);
299 mutex_unlock(&bdev->bd_fsfreeze_mutex); 299 mutex_unlock(&bdev->bd_fsfreeze_mutex);
300 return 0; 300 return 0;
301 } 301 }
302 302
303 if (sb) { 303 if (sb) {
304 BUG_ON(sb->s_bdev != bdev); 304 BUG_ON(sb->s_bdev != bdev);
305 if (!(sb->s_flags & MS_RDONLY)) { 305 if (!(sb->s_flags & MS_RDONLY)) {
306 if (sb->s_op->unfreeze_fs) { 306 if (sb->s_op->unfreeze_fs) {
307 error = sb->s_op->unfreeze_fs(sb); 307 error = sb->s_op->unfreeze_fs(sb);
308 if (error) { 308 if (error) {
309 printk(KERN_ERR 309 printk(KERN_ERR
310 "VFS:Filesystem thaw failed\n"); 310 "VFS:Filesystem thaw failed\n");
311 sb->s_frozen = SB_FREEZE_TRANS; 311 sb->s_frozen = SB_FREEZE_TRANS;
312 bdev->bd_fsfreeze_count++; 312 bdev->bd_fsfreeze_count++;
313 mutex_unlock(&bdev->bd_fsfreeze_mutex); 313 mutex_unlock(&bdev->bd_fsfreeze_mutex);
314 return error; 314 return error;
315 } 315 }
316 } 316 }
317 sb->s_frozen = SB_UNFROZEN; 317 sb->s_frozen = SB_UNFROZEN;
318 smp_wmb(); 318 smp_wmb();
319 wake_up(&sb->s_wait_unfrozen); 319 wake_up(&sb->s_wait_unfrozen);
320 } 320 }
321 drop_super(sb); 321 drop_super(sb);
322 } 322 }
323 323
324 up(&bdev->bd_mount_sem); 324 up(&bdev->bd_mount_sem);
325 mutex_unlock(&bdev->bd_fsfreeze_mutex); 325 mutex_unlock(&bdev->bd_fsfreeze_mutex);
326 return 0; 326 return 0;
327 } 327 }
328 EXPORT_SYMBOL(thaw_bdev); 328 EXPORT_SYMBOL(thaw_bdev);
329 329
330 static int blkdev_writepage(struct page *page, struct writeback_control *wbc) 330 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
331 { 331 {
332 return block_write_full_page(page, blkdev_get_block, wbc); 332 return block_write_full_page(page, blkdev_get_block, wbc);
333 } 333 }
334 334
335 static int blkdev_readpage(struct file * file, struct page * page) 335 static int blkdev_readpage(struct file * file, struct page * page)
336 { 336 {
337 return block_read_full_page(page, blkdev_get_block); 337 return block_read_full_page(page, blkdev_get_block);
338 } 338 }
339 339
340 static int blkdev_write_begin(struct file *file, struct address_space *mapping, 340 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
341 loff_t pos, unsigned len, unsigned flags, 341 loff_t pos, unsigned len, unsigned flags,
342 struct page **pagep, void **fsdata) 342 struct page **pagep, void **fsdata)
343 { 343 {
344 *pagep = NULL; 344 *pagep = NULL;
345 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 345 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
346 blkdev_get_block); 346 blkdev_get_block);
347 } 347 }
348 348
349 static int blkdev_write_end(struct file *file, struct address_space *mapping, 349 static int blkdev_write_end(struct file *file, struct address_space *mapping,
350 loff_t pos, unsigned len, unsigned copied, 350 loff_t pos, unsigned len, unsigned copied,
351 struct page *page, void *fsdata) 351 struct page *page, void *fsdata)
352 { 352 {
353 int ret; 353 int ret;
354 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); 354 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
355 355
356 unlock_page(page); 356 unlock_page(page);
357 page_cache_release(page); 357 page_cache_release(page);
358 358
359 return ret; 359 return ret;
360 } 360 }
361 361
362 /* 362 /*
363 * private llseek: 363 * private llseek:
364 * for a block special file file->f_path.dentry->d_inode->i_size is zero 364 * for a block special file file->f_path.dentry->d_inode->i_size is zero
365 * so we compute the size by hand (just as in block_read/write above) 365 * so we compute the size by hand (just as in block_read/write above)
366 */ 366 */
367 static loff_t block_llseek(struct file *file, loff_t offset, int origin) 367 static loff_t block_llseek(struct file *file, loff_t offset, int origin)
368 { 368 {
369 struct inode *bd_inode = file->f_mapping->host; 369 struct inode *bd_inode = file->f_mapping->host;
370 loff_t size; 370 loff_t size;
371 loff_t retval; 371 loff_t retval;
372 372
373 mutex_lock(&bd_inode->i_mutex); 373 mutex_lock(&bd_inode->i_mutex);
374 size = i_size_read(bd_inode); 374 size = i_size_read(bd_inode);
375 375
376 switch (origin) { 376 switch (origin) {
377 case 2: 377 case 2:
378 offset += size; 378 offset += size;
379 break; 379 break;
380 case 1: 380 case 1:
381 offset += file->f_pos; 381 offset += file->f_pos;
382 } 382 }
383 retval = -EINVAL; 383 retval = -EINVAL;
384 if (offset >= 0 && offset <= size) { 384 if (offset >= 0 && offset <= size) {
385 if (offset != file->f_pos) { 385 if (offset != file->f_pos) {
386 file->f_pos = offset; 386 file->f_pos = offset;
387 } 387 }
388 retval = offset; 388 retval = offset;
389 } 389 }
390 mutex_unlock(&bd_inode->i_mutex); 390 mutex_unlock(&bd_inode->i_mutex);
391 return retval; 391 return retval;
392 } 392 }
393 393
394 /* 394 /*
395 * Filp is never NULL; the only case when ->fsync() is called with 395 * Filp is never NULL; the only case when ->fsync() is called with
396 * NULL first argument is nfsd_sync_dir() and that's not a directory. 396 * NULL first argument is nfsd_sync_dir() and that's not a directory.
397 */ 397 */
398 398
399 static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) 399 static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
400 { 400 {
401 return sync_blockdev(I_BDEV(filp->f_mapping->host)); 401 return sync_blockdev(I_BDEV(filp->f_mapping->host));
402 } 402 }
403 403
404 /* 404 /*
405 * pseudo-fs 405 * pseudo-fs
406 */ 406 */
407 407
408 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); 408 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
409 static struct kmem_cache * bdev_cachep __read_mostly; 409 static struct kmem_cache * bdev_cachep __read_mostly;
410 410
411 static struct inode *bdev_alloc_inode(struct super_block *sb) 411 static struct inode *bdev_alloc_inode(struct super_block *sb)
412 { 412 {
413 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); 413 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
414 if (!ei) 414 if (!ei)
415 return NULL; 415 return NULL;
416 return &ei->vfs_inode; 416 return &ei->vfs_inode;
417 } 417 }
418 418
419 static void bdev_destroy_inode(struct inode *inode) 419 static void bdev_destroy_inode(struct inode *inode)
420 { 420 {
421 struct bdev_inode *bdi = BDEV_I(inode); 421 struct bdev_inode *bdi = BDEV_I(inode);
422 422
423 bdi->bdev.bd_inode_backing_dev_info = NULL; 423 bdi->bdev.bd_inode_backing_dev_info = NULL;
424 kmem_cache_free(bdev_cachep, bdi); 424 kmem_cache_free(bdev_cachep, bdi);
425 } 425 }
426 426
427 static void init_once(void *foo) 427 static void init_once(void *foo)
428 { 428 {
429 struct bdev_inode *ei = (struct bdev_inode *) foo; 429 struct bdev_inode *ei = (struct bdev_inode *) foo;
430 struct block_device *bdev = &ei->bdev; 430 struct block_device *bdev = &ei->bdev;
431 431
432 memset(bdev, 0, sizeof(*bdev)); 432 memset(bdev, 0, sizeof(*bdev));
433 mutex_init(&bdev->bd_mutex); 433 mutex_init(&bdev->bd_mutex);
434 sema_init(&bdev->bd_mount_sem, 1); 434 sema_init(&bdev->bd_mount_sem, 1);
435 INIT_LIST_HEAD(&bdev->bd_inodes); 435 INIT_LIST_HEAD(&bdev->bd_inodes);
436 INIT_LIST_HEAD(&bdev->bd_list); 436 INIT_LIST_HEAD(&bdev->bd_list);
437 #ifdef CONFIG_SYSFS 437 #ifdef CONFIG_SYSFS
438 INIT_LIST_HEAD(&bdev->bd_holder_list); 438 INIT_LIST_HEAD(&bdev->bd_holder_list);
439 #endif 439 #endif
440 inode_init_once(&ei->vfs_inode); 440 inode_init_once(&ei->vfs_inode);
441 /* Initialize mutex for freeze. */ 441 /* Initialize mutex for freeze. */
442 mutex_init(&bdev->bd_fsfreeze_mutex); 442 mutex_init(&bdev->bd_fsfreeze_mutex);
443 } 443 }
444 444
445 static inline void __bd_forget(struct inode *inode) 445 static inline void __bd_forget(struct inode *inode)
446 { 446 {
447 list_del_init(&inode->i_devices); 447 list_del_init(&inode->i_devices);
448 inode->i_bdev = NULL; 448 inode->i_bdev = NULL;
449 inode->i_mapping = &inode->i_data; 449 inode->i_mapping = &inode->i_data;
450 } 450 }
451 451
452 static void bdev_clear_inode(struct inode *inode) 452 static void bdev_clear_inode(struct inode *inode)
453 { 453 {
454 struct block_device *bdev = &BDEV_I(inode)->bdev; 454 struct block_device *bdev = &BDEV_I(inode)->bdev;
455 struct list_head *p; 455 struct list_head *p;
456 spin_lock(&bdev_lock); 456 spin_lock(&bdev_lock);
457 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { 457 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
458 __bd_forget(list_entry(p, struct inode, i_devices)); 458 __bd_forget(list_entry(p, struct inode, i_devices));
459 } 459 }
460 list_del_init(&bdev->bd_list); 460 list_del_init(&bdev->bd_list);
461 spin_unlock(&bdev_lock); 461 spin_unlock(&bdev_lock);
462 } 462 }
463 463
464 static const struct super_operations bdev_sops = { 464 static const struct super_operations bdev_sops = {
465 .statfs = simple_statfs, 465 .statfs = simple_statfs,
466 .alloc_inode = bdev_alloc_inode, 466 .alloc_inode = bdev_alloc_inode,
467 .destroy_inode = bdev_destroy_inode, 467 .destroy_inode = bdev_destroy_inode,
468 .drop_inode = generic_delete_inode, 468 .drop_inode = generic_delete_inode,
469 .clear_inode = bdev_clear_inode, 469 .clear_inode = bdev_clear_inode,
470 }; 470 };
471 471
472 static int bd_get_sb(struct file_system_type *fs_type, 472 static int bd_get_sb(struct file_system_type *fs_type,
473 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 473 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
474 { 474 {
475 return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); 475 return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
476 } 476 }
477 477
478 static struct file_system_type bd_type = { 478 static struct file_system_type bd_type = {
479 .name = "bdev", 479 .name = "bdev",
480 .get_sb = bd_get_sb, 480 .get_sb = bd_get_sb,
481 .kill_sb = kill_anon_super, 481 .kill_sb = kill_anon_super,
482 }; 482 };
483 483
484 struct super_block *blockdev_superblock __read_mostly; 484 struct super_block *blockdev_superblock __read_mostly;
485 485
486 void __init bdev_cache_init(void) 486 void __init bdev_cache_init(void)
487 { 487 {
488 int err; 488 int err;
489 struct vfsmount *bd_mnt; 489 struct vfsmount *bd_mnt;
490 490
491 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 491 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
492 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 492 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
493 SLAB_MEM_SPREAD|SLAB_PANIC), 493 SLAB_MEM_SPREAD|SLAB_PANIC),
494 init_once); 494 init_once);
495 err = register_filesystem(&bd_type); 495 err = register_filesystem(&bd_type);
496 if (err) 496 if (err)
497 panic("Cannot register bdev pseudo-fs"); 497 panic("Cannot register bdev pseudo-fs");
498 bd_mnt = kern_mount(&bd_type); 498 bd_mnt = kern_mount(&bd_type);
499 if (IS_ERR(bd_mnt)) 499 if (IS_ERR(bd_mnt))
500 panic("Cannot create bdev pseudo-fs"); 500 panic("Cannot create bdev pseudo-fs");
501 /* 501 /*
502 * This vfsmount structure is only used to obtain the 502 * This vfsmount structure is only used to obtain the
503 * blockdev_superblock, so tell kmemleak not to report it. 503 * blockdev_superblock, so tell kmemleak not to report it.
504 */ 504 */
505 kmemleak_not_leak(bd_mnt); 505 kmemleak_not_leak(bd_mnt);
506 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 506 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
507 } 507 }
508 508
509 /* 509 /*
510 * Most likely _very_ bad one - but then it's hardly critical for small 510 * Most likely _very_ bad one - but then it's hardly critical for small
511 * /dev and can be fixed when somebody will need really large one. 511 * /dev and can be fixed when somebody will need really large one.
512 * Keep in mind that it will be fed through icache hash function too. 512 * Keep in mind that it will be fed through icache hash function too.
513 */ 513 */
514 static inline unsigned long hash(dev_t dev) 514 static inline unsigned long hash(dev_t dev)
515 { 515 {
516 return MAJOR(dev)+MINOR(dev); 516 return MAJOR(dev)+MINOR(dev);
517 } 517 }
518 518
519 static int bdev_test(struct inode *inode, void *data) 519 static int bdev_test(struct inode *inode, void *data)
520 { 520 {
521 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; 521 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
522 } 522 }
523 523
524 static int bdev_set(struct inode *inode, void *data) 524 static int bdev_set(struct inode *inode, void *data)
525 { 525 {
526 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; 526 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
527 return 0; 527 return 0;
528 } 528 }
529 529
530 static LIST_HEAD(all_bdevs); 530 static LIST_HEAD(all_bdevs);
531 531
532 struct block_device *bdget(dev_t dev) 532 struct block_device *bdget(dev_t dev)
533 { 533 {
534 struct block_device *bdev; 534 struct block_device *bdev;
535 struct inode *inode; 535 struct inode *inode;
536 536
537 inode = iget5_locked(blockdev_superblock, hash(dev), 537 inode = iget5_locked(blockdev_superblock, hash(dev),
538 bdev_test, bdev_set, &dev); 538 bdev_test, bdev_set, &dev);
539 539
540 if (!inode) 540 if (!inode)
541 return NULL; 541 return NULL;
542 542
543 bdev = &BDEV_I(inode)->bdev; 543 bdev = &BDEV_I(inode)->bdev;
544 544
545 if (inode->i_state & I_NEW) { 545 if (inode->i_state & I_NEW) {
546 bdev->bd_contains = NULL; 546 bdev->bd_contains = NULL;
547 bdev->bd_inode = inode; 547 bdev->bd_inode = inode;
548 bdev->bd_block_size = (1 << inode->i_blkbits); 548 bdev->bd_block_size = (1 << inode->i_blkbits);
549 bdev->bd_part_count = 0; 549 bdev->bd_part_count = 0;
550 bdev->bd_invalidated = 0; 550 bdev->bd_invalidated = 0;
551 inode->i_mode = S_IFBLK; 551 inode->i_mode = S_IFBLK;
552 inode->i_rdev = dev; 552 inode->i_rdev = dev;
553 inode->i_bdev = bdev; 553 inode->i_bdev = bdev;
554 inode->i_data.a_ops = &def_blk_aops; 554 inode->i_data.a_ops = &def_blk_aops;
555 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 555 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
556 inode->i_data.backing_dev_info = &default_backing_dev_info; 556 inode->i_data.backing_dev_info = &default_backing_dev_info;
557 spin_lock(&bdev_lock); 557 spin_lock(&bdev_lock);
558 list_add(&bdev->bd_list, &all_bdevs); 558 list_add(&bdev->bd_list, &all_bdevs);
559 spin_unlock(&bdev_lock); 559 spin_unlock(&bdev_lock);
560 unlock_new_inode(inode); 560 unlock_new_inode(inode);
561 } 561 }
562 return bdev; 562 return bdev;
563 } 563 }
564 564
565 EXPORT_SYMBOL(bdget); 565 EXPORT_SYMBOL(bdget);
566 566
567 /**
568 * bdgrab -- Grab a reference to an already referenced block device
569 * @bdev: Block device to grab a reference to.
570 */
571 struct block_device *bdgrab(struct block_device *bdev)
572 {
573 atomic_inc(&bdev->bd_inode->i_count);
574 return bdev;
575 }
576
567 long nr_blockdev_pages(void) 577 long nr_blockdev_pages(void)
568 { 578 {
569 struct block_device *bdev; 579 struct block_device *bdev;
570 long ret = 0; 580 long ret = 0;
571 spin_lock(&bdev_lock); 581 spin_lock(&bdev_lock);
572 list_for_each_entry(bdev, &all_bdevs, bd_list) { 582 list_for_each_entry(bdev, &all_bdevs, bd_list) {
573 ret += bdev->bd_inode->i_mapping->nrpages; 583 ret += bdev->bd_inode->i_mapping->nrpages;
574 } 584 }
575 spin_unlock(&bdev_lock); 585 spin_unlock(&bdev_lock);
576 return ret; 586 return ret;
577 } 587 }
578 588
579 void bdput(struct block_device *bdev) 589 void bdput(struct block_device *bdev)
580 { 590 {
581 iput(bdev->bd_inode); 591 iput(bdev->bd_inode);
582 } 592 }
583 593
584 EXPORT_SYMBOL(bdput); 594 EXPORT_SYMBOL(bdput);
585 595
586 static struct block_device *bd_acquire(struct inode *inode) 596 static struct block_device *bd_acquire(struct inode *inode)
587 { 597 {
588 struct block_device *bdev; 598 struct block_device *bdev;
589 599
590 spin_lock(&bdev_lock); 600 spin_lock(&bdev_lock);
591 bdev = inode->i_bdev; 601 bdev = inode->i_bdev;
592 if (bdev) { 602 if (bdev) {
593 atomic_inc(&bdev->bd_inode->i_count); 603 atomic_inc(&bdev->bd_inode->i_count);
594 spin_unlock(&bdev_lock); 604 spin_unlock(&bdev_lock);
595 return bdev; 605 return bdev;
596 } 606 }
597 spin_unlock(&bdev_lock); 607 spin_unlock(&bdev_lock);
598 608
599 bdev = bdget(inode->i_rdev); 609 bdev = bdget(inode->i_rdev);
600 if (bdev) { 610 if (bdev) {
601 spin_lock(&bdev_lock); 611 spin_lock(&bdev_lock);
602 if (!inode->i_bdev) { 612 if (!inode->i_bdev) {
603 /* 613 /*
604 * We take an additional bd_inode->i_count for inode, 614 * We take an additional bd_inode->i_count for inode,
605 * and it's released in clear_inode() of inode. 615 * and it's released in clear_inode() of inode.
606 * So, we can access it via ->i_mapping always 616 * So, we can access it via ->i_mapping always
607 * without igrab(). 617 * without igrab().
608 */ 618 */
609 atomic_inc(&bdev->bd_inode->i_count); 619 atomic_inc(&bdev->bd_inode->i_count);
610 inode->i_bdev = bdev; 620 inode->i_bdev = bdev;
611 inode->i_mapping = bdev->bd_inode->i_mapping; 621 inode->i_mapping = bdev->bd_inode->i_mapping;
612 list_add(&inode->i_devices, &bdev->bd_inodes); 622 list_add(&inode->i_devices, &bdev->bd_inodes);
613 } 623 }
614 spin_unlock(&bdev_lock); 624 spin_unlock(&bdev_lock);
615 } 625 }
616 return bdev; 626 return bdev;
617 } 627 }
618 628
619 /* Call when you free inode */ 629 /* Call when you free inode */
620 630
621 void bd_forget(struct inode *inode) 631 void bd_forget(struct inode *inode)
622 { 632 {
623 struct block_device *bdev = NULL; 633 struct block_device *bdev = NULL;
624 634
625 spin_lock(&bdev_lock); 635 spin_lock(&bdev_lock);
626 if (inode->i_bdev) { 636 if (inode->i_bdev) {
627 if (!sb_is_blkdev_sb(inode->i_sb)) 637 if (!sb_is_blkdev_sb(inode->i_sb))
628 bdev = inode->i_bdev; 638 bdev = inode->i_bdev;
629 __bd_forget(inode); 639 __bd_forget(inode);
630 } 640 }
631 spin_unlock(&bdev_lock); 641 spin_unlock(&bdev_lock);
632 642
633 if (bdev) 643 if (bdev)
634 iput(bdev->bd_inode); 644 iput(bdev->bd_inode);
635 } 645 }
636 646
637 int bd_claim(struct block_device *bdev, void *holder) 647 int bd_claim(struct block_device *bdev, void *holder)
638 { 648 {
639 int res; 649 int res;
640 spin_lock(&bdev_lock); 650 spin_lock(&bdev_lock);
641 651
642 /* first decide result */ 652 /* first decide result */
643 if (bdev->bd_holder == holder) 653 if (bdev->bd_holder == holder)
644 res = 0; /* already a holder */ 654 res = 0; /* already a holder */
645 else if (bdev->bd_holder != NULL) 655 else if (bdev->bd_holder != NULL)
646 res = -EBUSY; /* held by someone else */ 656 res = -EBUSY; /* held by someone else */
647 else if (bdev->bd_contains == bdev) 657 else if (bdev->bd_contains == bdev)
648 res = 0; /* is a whole device which isn't held */ 658 res = 0; /* is a whole device which isn't held */
649 659
650 else if (bdev->bd_contains->bd_holder == bd_claim) 660 else if (bdev->bd_contains->bd_holder == bd_claim)
651 res = 0; /* is a partition of a device that is being partitioned */ 661 res = 0; /* is a partition of a device that is being partitioned */
652 else if (bdev->bd_contains->bd_holder != NULL) 662 else if (bdev->bd_contains->bd_holder != NULL)
653 res = -EBUSY; /* is a partition of a held device */ 663 res = -EBUSY; /* is a partition of a held device */
654 else 664 else
655 res = 0; /* is a partition of an un-held device */ 665 res = 0; /* is a partition of an un-held device */
656 666
657 /* now impose change */ 667 /* now impose change */
658 if (res==0) { 668 if (res==0) {
659 /* note that for a whole device bd_holders 669 /* note that for a whole device bd_holders
660 * will be incremented twice, and bd_holder will 670 * will be incremented twice, and bd_holder will
661 * be set to bd_claim before being set to holder 671 * be set to bd_claim before being set to holder
662 */ 672 */
663 bdev->bd_contains->bd_holders ++; 673 bdev->bd_contains->bd_holders ++;
664 bdev->bd_contains->bd_holder = bd_claim; 674 bdev->bd_contains->bd_holder = bd_claim;
665 bdev->bd_holders++; 675 bdev->bd_holders++;
666 bdev->bd_holder = holder; 676 bdev->bd_holder = holder;
667 } 677 }
668 spin_unlock(&bdev_lock); 678 spin_unlock(&bdev_lock);
669 return res; 679 return res;
670 } 680 }
671 681
672 EXPORT_SYMBOL(bd_claim); 682 EXPORT_SYMBOL(bd_claim);
673 683
674 void bd_release(struct block_device *bdev) 684 void bd_release(struct block_device *bdev)
675 { 685 {
676 spin_lock(&bdev_lock); 686 spin_lock(&bdev_lock);
677 if (!--bdev->bd_contains->bd_holders) 687 if (!--bdev->bd_contains->bd_holders)
678 bdev->bd_contains->bd_holder = NULL; 688 bdev->bd_contains->bd_holder = NULL;
679 if (!--bdev->bd_holders) 689 if (!--bdev->bd_holders)
680 bdev->bd_holder = NULL; 690 bdev->bd_holder = NULL;
681 spin_unlock(&bdev_lock); 691 spin_unlock(&bdev_lock);
682 } 692 }
683 693
684 EXPORT_SYMBOL(bd_release); 694 EXPORT_SYMBOL(bd_release);
685 695
686 #ifdef CONFIG_SYSFS 696 #ifdef CONFIG_SYSFS
687 /* 697 /*
688 * Functions for bd_claim_by_kobject / bd_release_from_kobject 698 * Functions for bd_claim_by_kobject / bd_release_from_kobject
689 * 699 *
690 * If a kobject is passed to bd_claim_by_kobject() 700 * If a kobject is passed to bd_claim_by_kobject()
691 * and the kobject has a parent directory, 701 * and the kobject has a parent directory,
692 * following symlinks are created: 702 * following symlinks are created:
693 * o from the kobject to the claimed bdev 703 * o from the kobject to the claimed bdev
694 * o from "holders" directory of the bdev to the parent of the kobject 704 * o from "holders" directory of the bdev to the parent of the kobject
695 * bd_release_from_kobject() removes these symlinks. 705 * bd_release_from_kobject() removes these symlinks.
696 * 706 *
697 * Example: 707 * Example:
698 * If /dev/dm-0 maps to /dev/sda, kobject corresponding to 708 * If /dev/dm-0 maps to /dev/sda, kobject corresponding to
699 * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: 709 * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
700 * /sys/block/dm-0/slaves/sda --> /sys/block/sda 710 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
701 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 711 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
702 */ 712 */
703 713
704 static int add_symlink(struct kobject *from, struct kobject *to) 714 static int add_symlink(struct kobject *from, struct kobject *to)
705 { 715 {
706 if (!from || !to) 716 if (!from || !to)
707 return 0; 717 return 0;
708 return sysfs_create_link(from, to, kobject_name(to)); 718 return sysfs_create_link(from, to, kobject_name(to));
709 } 719 }
710 720
711 static void del_symlink(struct kobject *from, struct kobject *to) 721 static void del_symlink(struct kobject *from, struct kobject *to)
712 { 722 {
713 if (!from || !to) 723 if (!from || !to)
714 return; 724 return;
715 sysfs_remove_link(from, kobject_name(to)); 725 sysfs_remove_link(from, kobject_name(to));
716 } 726 }
717 727
718 /* 728 /*
719 * 'struct bd_holder' contains pointers to kobjects symlinked by 729 * 'struct bd_holder' contains pointers to kobjects symlinked by
720 * bd_claim_by_kobject. 730 * bd_claim_by_kobject.
721 * It's connected to bd_holder_list which is protected by bdev->bd_sem. 731 * It's connected to bd_holder_list which is protected by bdev->bd_sem.
722 */ 732 */
723 struct bd_holder { 733 struct bd_holder {
724 struct list_head list; /* chain of holders of the bdev */ 734 struct list_head list; /* chain of holders of the bdev */
725 int count; /* references from the holder */ 735 int count; /* references from the holder */
726 struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ 736 struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */
727 struct kobject *hdev; /* e.g. "/block/dm-0" */ 737 struct kobject *hdev; /* e.g. "/block/dm-0" */
728 struct kobject *hdir; /* e.g. "/block/sda/holders" */ 738 struct kobject *hdir; /* e.g. "/block/sda/holders" */
729 struct kobject *sdev; /* e.g. "/block/sda" */ 739 struct kobject *sdev; /* e.g. "/block/sda" */
730 }; 740 };
731 741
732 /* 742 /*
733 * Get references of related kobjects at once. 743 * Get references of related kobjects at once.
734 * Returns 1 on success. 0 on failure. 744 * Returns 1 on success. 0 on failure.
735 * 745 *
736 * Should call bd_holder_release_dirs() after successful use. 746 * Should call bd_holder_release_dirs() after successful use.
737 */ 747 */
738 static int bd_holder_grab_dirs(struct block_device *bdev, 748 static int bd_holder_grab_dirs(struct block_device *bdev,
739 struct bd_holder *bo) 749 struct bd_holder *bo)
740 { 750 {
741 if (!bdev || !bo) 751 if (!bdev || !bo)
742 return 0; 752 return 0;
743 753
744 bo->sdir = kobject_get(bo->sdir); 754 bo->sdir = kobject_get(bo->sdir);
745 if (!bo->sdir) 755 if (!bo->sdir)
746 return 0; 756 return 0;
747 757
748 bo->hdev = kobject_get(bo->sdir->parent); 758 bo->hdev = kobject_get(bo->sdir->parent);
749 if (!bo->hdev) 759 if (!bo->hdev)
750 goto fail_put_sdir; 760 goto fail_put_sdir;
751 761
752 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); 762 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
753 if (!bo->sdev) 763 if (!bo->sdev)
754 goto fail_put_hdev; 764 goto fail_put_hdev;
755 765
756 bo->hdir = kobject_get(bdev->bd_part->holder_dir); 766 bo->hdir = kobject_get(bdev->bd_part->holder_dir);
757 if (!bo->hdir) 767 if (!bo->hdir)
758 goto fail_put_sdev; 768 goto fail_put_sdev;
759 769
760 return 1; 770 return 1;
761 771
762 fail_put_sdev: 772 fail_put_sdev:
763 kobject_put(bo->sdev); 773 kobject_put(bo->sdev);
764 fail_put_hdev: 774 fail_put_hdev:
765 kobject_put(bo->hdev); 775 kobject_put(bo->hdev);
766 fail_put_sdir: 776 fail_put_sdir:
767 kobject_put(bo->sdir); 777 kobject_put(bo->sdir);
768 778
769 return 0; 779 return 0;
770 } 780 }
771 781
772 /* Put references of related kobjects at once. */ 782 /* Put references of related kobjects at once. */
773 static void bd_holder_release_dirs(struct bd_holder *bo) 783 static void bd_holder_release_dirs(struct bd_holder *bo)
774 { 784 {
775 kobject_put(bo->hdir); 785 kobject_put(bo->hdir);
776 kobject_put(bo->sdev); 786 kobject_put(bo->sdev);
777 kobject_put(bo->hdev); 787 kobject_put(bo->hdev);
778 kobject_put(bo->sdir); 788 kobject_put(bo->sdir);
779 } 789 }
780 790
781 static struct bd_holder *alloc_bd_holder(struct kobject *kobj) 791 static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
782 { 792 {
783 struct bd_holder *bo; 793 struct bd_holder *bo;
784 794
785 bo = kzalloc(sizeof(*bo), GFP_KERNEL); 795 bo = kzalloc(sizeof(*bo), GFP_KERNEL);
786 if (!bo) 796 if (!bo)
787 return NULL; 797 return NULL;
788 798
789 bo->count = 1; 799 bo->count = 1;
790 bo->sdir = kobj; 800 bo->sdir = kobj;
791 801
792 return bo; 802 return bo;
793 } 803 }
794 804
795 static void free_bd_holder(struct bd_holder *bo) 805 static void free_bd_holder(struct bd_holder *bo)
796 { 806 {
797 kfree(bo); 807 kfree(bo);
798 } 808 }
799 809
800 /** 810 /**
801 * find_bd_holder - find matching struct bd_holder from the block device 811 * find_bd_holder - find matching struct bd_holder from the block device
802 * 812 *
803 * @bdev: struct block device to be searched 813 * @bdev: struct block device to be searched
804 * @bo: target struct bd_holder 814 * @bo: target struct bd_holder
805 * 815 *
806 * Returns matching entry with @bo in @bdev->bd_holder_list. 816 * Returns matching entry with @bo in @bdev->bd_holder_list.
807 * If found, increment the reference count and return the pointer. 817 * If found, increment the reference count and return the pointer.
808 * If not found, returns NULL. 818 * If not found, returns NULL.
809 */ 819 */
810 static struct bd_holder *find_bd_holder(struct block_device *bdev, 820 static struct bd_holder *find_bd_holder(struct block_device *bdev,
811 struct bd_holder *bo) 821 struct bd_holder *bo)
812 { 822 {
813 struct bd_holder *tmp; 823 struct bd_holder *tmp;
814 824
815 list_for_each_entry(tmp, &bdev->bd_holder_list, list) 825 list_for_each_entry(tmp, &bdev->bd_holder_list, list)
816 if (tmp->sdir == bo->sdir) { 826 if (tmp->sdir == bo->sdir) {
817 tmp->count++; 827 tmp->count++;
818 return tmp; 828 return tmp;
819 } 829 }
820 830
821 return NULL; 831 return NULL;
822 } 832 }
823 833
824 /** 834 /**
825 * add_bd_holder - create sysfs symlinks for bd_claim() relationship 835 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
826 * 836 *
827 * @bdev: block device to be bd_claimed 837 * @bdev: block device to be bd_claimed
828 * @bo: preallocated and initialized by alloc_bd_holder() 838 * @bo: preallocated and initialized by alloc_bd_holder()
829 * 839 *
830 * Add @bo to @bdev->bd_holder_list, create symlinks. 840 * Add @bo to @bdev->bd_holder_list, create symlinks.
831 * 841 *
832 * Returns 0 if symlinks are created. 842 * Returns 0 if symlinks are created.
833 * Returns -ve if something fails. 843 * Returns -ve if something fails.
834 */ 844 */
835 static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 845 static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
836 { 846 {
837 int err; 847 int err;
838 848
839 if (!bo) 849 if (!bo)
840 return -EINVAL; 850 return -EINVAL;
841 851
842 if (!bd_holder_grab_dirs(bdev, bo)) 852 if (!bd_holder_grab_dirs(bdev, bo))
843 return -EBUSY; 853 return -EBUSY;
844 854
845 err = add_symlink(bo->sdir, bo->sdev); 855 err = add_symlink(bo->sdir, bo->sdev);
846 if (err) 856 if (err)
847 return err; 857 return err;
848 858
849 err = add_symlink(bo->hdir, bo->hdev); 859 err = add_symlink(bo->hdir, bo->hdev);
850 if (err) { 860 if (err) {
851 del_symlink(bo->sdir, bo->sdev); 861 del_symlink(bo->sdir, bo->sdev);
852 return err; 862 return err;
853 } 863 }
854 864
855 list_add_tail(&bo->list, &bdev->bd_holder_list); 865 list_add_tail(&bo->list, &bdev->bd_holder_list);
856 return 0; 866 return 0;
857 } 867 }
858 868
859 /** 869 /**
860 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship 870 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
861 * 871 *
862 * @bdev: block device to be bd_claimed 872 * @bdev: block device to be bd_claimed
863 * @kobj: holder's kobject 873 * @kobj: holder's kobject
864 * 874 *
865 * If there is matching entry with @kobj in @bdev->bd_holder_list 875 * If there is matching entry with @kobj in @bdev->bd_holder_list
866 * and no other bd_claim() from the same kobject, 876 * and no other bd_claim() from the same kobject,
867 * remove the struct bd_holder from the list, delete symlinks for it. 877 * remove the struct bd_holder from the list, delete symlinks for it.
868 * 878 *
869 * Returns a pointer to the struct bd_holder when it's removed from the list 879 * Returns a pointer to the struct bd_holder when it's removed from the list
870 * and ready to be freed. 880 * and ready to be freed.
871 * Returns NULL if matching claim isn't found or there is other bd_claim() 881 * Returns NULL if matching claim isn't found or there is other bd_claim()
872 * by the same kobject. 882 * by the same kobject.
873 */ 883 */
874 static struct bd_holder *del_bd_holder(struct block_device *bdev, 884 static struct bd_holder *del_bd_holder(struct block_device *bdev,
875 struct kobject *kobj) 885 struct kobject *kobj)
876 { 886 {
877 struct bd_holder *bo; 887 struct bd_holder *bo;
878 888
879 list_for_each_entry(bo, &bdev->bd_holder_list, list) { 889 list_for_each_entry(bo, &bdev->bd_holder_list, list) {
880 if (bo->sdir == kobj) { 890 if (bo->sdir == kobj) {
881 bo->count--; 891 bo->count--;
882 BUG_ON(bo->count < 0); 892 BUG_ON(bo->count < 0);
883 if (!bo->count) { 893 if (!bo->count) {
884 list_del(&bo->list); 894 list_del(&bo->list);
885 del_symlink(bo->sdir, bo->sdev); 895 del_symlink(bo->sdir, bo->sdev);
886 del_symlink(bo->hdir, bo->hdev); 896 del_symlink(bo->hdir, bo->hdev);
887 bd_holder_release_dirs(bo); 897 bd_holder_release_dirs(bo);
888 return bo; 898 return bo;
889 } 899 }
890 break; 900 break;
891 } 901 }
892 } 902 }
893 903
894 return NULL; 904 return NULL;
895 } 905 }
896 906
897 /** 907 /**
898 * bd_claim_by_kobject - bd_claim() with additional kobject signature 908 * bd_claim_by_kobject - bd_claim() with additional kobject signature
899 * 909 *
900 * @bdev: block device to be claimed 910 * @bdev: block device to be claimed
901 * @holder: holder's signature 911 * @holder: holder's signature
902 * @kobj: holder's kobject 912 * @kobj: holder's kobject
903 * 913 *
904 * Do bd_claim() and if it succeeds, create sysfs symlinks between 914 * Do bd_claim() and if it succeeds, create sysfs symlinks between
905 * the bdev and the holder's kobject. 915 * the bdev and the holder's kobject.
906 * Use bd_release_from_kobject() when relesing the claimed bdev. 916 * Use bd_release_from_kobject() when relesing the claimed bdev.
907 * 917 *
908 * Returns 0 on success. (same as bd_claim()) 918 * Returns 0 on success. (same as bd_claim())
909 * Returns errno on failure. 919 * Returns errno on failure.
910 */ 920 */
911 static int bd_claim_by_kobject(struct block_device *bdev, void *holder, 921 static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
912 struct kobject *kobj) 922 struct kobject *kobj)
913 { 923 {
914 int err; 924 int err;
915 struct bd_holder *bo, *found; 925 struct bd_holder *bo, *found;
916 926
917 if (!kobj) 927 if (!kobj)
918 return -EINVAL; 928 return -EINVAL;
919 929
920 bo = alloc_bd_holder(kobj); 930 bo = alloc_bd_holder(kobj);
921 if (!bo) 931 if (!bo)
922 return -ENOMEM; 932 return -ENOMEM;
923 933
924 mutex_lock(&bdev->bd_mutex); 934 mutex_lock(&bdev->bd_mutex);
925 935
926 err = bd_claim(bdev, holder); 936 err = bd_claim(bdev, holder);
927 if (err) 937 if (err)
928 goto fail; 938 goto fail;
929 939
930 found = find_bd_holder(bdev, bo); 940 found = find_bd_holder(bdev, bo);
931 if (found) 941 if (found)
932 goto fail; 942 goto fail;
933 943
934 err = add_bd_holder(bdev, bo); 944 err = add_bd_holder(bdev, bo);
935 if (err) 945 if (err)
936 bd_release(bdev); 946 bd_release(bdev);
937 else 947 else
938 bo = NULL; 948 bo = NULL;
939 fail: 949 fail:
940 mutex_unlock(&bdev->bd_mutex); 950 mutex_unlock(&bdev->bd_mutex);
941 free_bd_holder(bo); 951 free_bd_holder(bo);
942 return err; 952 return err;
943 } 953 }
944 954
945 /** 955 /**
946 * bd_release_from_kobject - bd_release() with additional kobject signature 956 * bd_release_from_kobject - bd_release() with additional kobject signature
947 * 957 *
948 * @bdev: block device to be released 958 * @bdev: block device to be released
949 * @kobj: holder's kobject 959 * @kobj: holder's kobject
950 * 960 *
951 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). 961 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
952 */ 962 */
953 static void bd_release_from_kobject(struct block_device *bdev, 963 static void bd_release_from_kobject(struct block_device *bdev,
954 struct kobject *kobj) 964 struct kobject *kobj)
955 { 965 {
956 if (!kobj) 966 if (!kobj)
957 return; 967 return;
958 968
959 mutex_lock(&bdev->bd_mutex); 969 mutex_lock(&bdev->bd_mutex);
960 bd_release(bdev); 970 bd_release(bdev);
961 free_bd_holder(del_bd_holder(bdev, kobj)); 971 free_bd_holder(del_bd_holder(bdev, kobj));
962 mutex_unlock(&bdev->bd_mutex); 972 mutex_unlock(&bdev->bd_mutex);
963 } 973 }
964 974
965 /** 975 /**
966 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() 976 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
967 * 977 *
968 * @bdev: block device to be claimed 978 * @bdev: block device to be claimed
969 * @holder: holder's signature 979 * @holder: holder's signature
970 * @disk: holder's gendisk 980 * @disk: holder's gendisk
971 * 981 *
972 * Call bd_claim_by_kobject() with getting @disk->slave_dir. 982 * Call bd_claim_by_kobject() with getting @disk->slave_dir.
973 */ 983 */
974 int bd_claim_by_disk(struct block_device *bdev, void *holder, 984 int bd_claim_by_disk(struct block_device *bdev, void *holder,
975 struct gendisk *disk) 985 struct gendisk *disk)
976 { 986 {
977 return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); 987 return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
978 } 988 }
979 EXPORT_SYMBOL_GPL(bd_claim_by_disk); 989 EXPORT_SYMBOL_GPL(bd_claim_by_disk);
980 990
981 /** 991 /**
982 * bd_release_from_disk - wrapper function for bd_release_from_kobject() 992 * bd_release_from_disk - wrapper function for bd_release_from_kobject()
983 * 993 *
984 * @bdev: block device to be claimed 994 * @bdev: block device to be claimed
985 * @disk: holder's gendisk 995 * @disk: holder's gendisk
986 * 996 *
987 * Call bd_release_from_kobject() and put @disk->slave_dir. 997 * Call bd_release_from_kobject() and put @disk->slave_dir.
988 */ 998 */
989 void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) 999 void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
990 { 1000 {
991 bd_release_from_kobject(bdev, disk->slave_dir); 1001 bd_release_from_kobject(bdev, disk->slave_dir);
992 kobject_put(disk->slave_dir); 1002 kobject_put(disk->slave_dir);
993 } 1003 }
994 EXPORT_SYMBOL_GPL(bd_release_from_disk); 1004 EXPORT_SYMBOL_GPL(bd_release_from_disk);
995 #endif 1005 #endif
996 1006
997 /* 1007 /*
998 * Tries to open block device by device number. Use it ONLY if you 1008 * Tries to open block device by device number. Use it ONLY if you
999 * really do not have anything better - i.e. when you are behind a 1009 * really do not have anything better - i.e. when you are behind a
1000 * truly sucky interface and all you are given is a device number. _Never_ 1010 * truly sucky interface and all you are given is a device number. _Never_
1001 * to be used for internal purposes. If you ever need it - reconsider 1011 * to be used for internal purposes. If you ever need it - reconsider
1002 * your API. 1012 * your API.
1003 */ 1013 */
1004 struct block_device *open_by_devnum(dev_t dev, fmode_t mode) 1014 struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
1005 { 1015 {
1006 struct block_device *bdev = bdget(dev); 1016 struct block_device *bdev = bdget(dev);
1007 int err = -ENOMEM; 1017 int err = -ENOMEM;
1008 if (bdev) 1018 if (bdev)
1009 err = blkdev_get(bdev, mode); 1019 err = blkdev_get(bdev, mode);
1010 return err ? ERR_PTR(err) : bdev; 1020 return err ? ERR_PTR(err) : bdev;
1011 } 1021 }
1012 1022
1013 EXPORT_SYMBOL(open_by_devnum); 1023 EXPORT_SYMBOL(open_by_devnum);
1014 1024
1015 /** 1025 /**
1016 * flush_disk - invalidates all buffer-cache entries on a disk 1026 * flush_disk - invalidates all buffer-cache entries on a disk
1017 * 1027 *
1018 * @bdev: struct block device to be flushed 1028 * @bdev: struct block device to be flushed
1019 * 1029 *
1020 * Invalidates all buffer-cache entries on a disk. It should be called 1030 * Invalidates all buffer-cache entries on a disk. It should be called
1021 * when a disk has been changed -- either by a media change or online 1031 * when a disk has been changed -- either by a media change or online
1022 * resize. 1032 * resize.
1023 */ 1033 */
1024 static void flush_disk(struct block_device *bdev) 1034 static void flush_disk(struct block_device *bdev)
1025 { 1035 {
1026 if (__invalidate_device(bdev)) { 1036 if (__invalidate_device(bdev)) {
1027 char name[BDEVNAME_SIZE] = ""; 1037 char name[BDEVNAME_SIZE] = "";
1028 1038
1029 if (bdev->bd_disk) 1039 if (bdev->bd_disk)
1030 disk_name(bdev->bd_disk, 0, name); 1040 disk_name(bdev->bd_disk, 0, name);
1031 printk(KERN_WARNING "VFS: busy inodes on changed media or " 1041 printk(KERN_WARNING "VFS: busy inodes on changed media or "
1032 "resized disk %s\n", name); 1042 "resized disk %s\n", name);
1033 } 1043 }
1034 1044
1035 if (!bdev->bd_disk) 1045 if (!bdev->bd_disk)
1036 return; 1046 return;
1037 if (disk_partitionable(bdev->bd_disk)) 1047 if (disk_partitionable(bdev->bd_disk))
1038 bdev->bd_invalidated = 1; 1048 bdev->bd_invalidated = 1;
1039 } 1049 }
1040 1050
1041 /** 1051 /**
1042 * check_disk_size_change - checks for disk size change and adjusts bdev size. 1052 * check_disk_size_change - checks for disk size change and adjusts bdev size.
1043 * @disk: struct gendisk to check 1053 * @disk: struct gendisk to check
1044 * @bdev: struct bdev to adjust. 1054 * @bdev: struct bdev to adjust.
1045 * 1055 *
1046 * This routine checks whether the bdev size matches the disk size and 1056 * This routine checks whether the bdev size matches the disk size and
1047 * adjusts the bdev size if it differs. 1057 * adjusts the bdev size if it differs.
1048 */ 1058 */
1049 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) 1059 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
1050 { 1060 {
1051 loff_t disk_size, bdev_size; 1061 loff_t disk_size, bdev_size;
1052 1062
1053 disk_size = (loff_t)get_capacity(disk) << 9; 1063 disk_size = (loff_t)get_capacity(disk) << 9;
1054 bdev_size = i_size_read(bdev->bd_inode); 1064 bdev_size = i_size_read(bdev->bd_inode);
1055 if (disk_size != bdev_size) { 1065 if (disk_size != bdev_size) {
1056 char name[BDEVNAME_SIZE]; 1066 char name[BDEVNAME_SIZE];
1057 1067
1058 disk_name(disk, 0, name); 1068 disk_name(disk, 0, name);
1059 printk(KERN_INFO 1069 printk(KERN_INFO
1060 "%s: detected capacity change from %lld to %lld\n", 1070 "%s: detected capacity change from %lld to %lld\n",
1061 name, bdev_size, disk_size); 1071 name, bdev_size, disk_size);
1062 i_size_write(bdev->bd_inode, disk_size); 1072 i_size_write(bdev->bd_inode, disk_size);
1063 flush_disk(bdev); 1073 flush_disk(bdev);
1064 } 1074 }
1065 } 1075 }
1066 EXPORT_SYMBOL(check_disk_size_change); 1076 EXPORT_SYMBOL(check_disk_size_change);
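/*
 * Worked example of the shift above: get_capacity() counts 512-byte
 * sectors, so a disk reporting 2097152 sectors gives
 * disk_size = 2097152 << 9 = 1073741824 bytes (1 GiB), which is then
 * compared against i_size_read(bdev->bd_inode).
 */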
1067 1077
1068 /** 1078 /**
1069 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back 1079 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
1070 * @disk: struct gendisk to be revalidated 1080 * @disk: struct gendisk to be revalidated
1071 * 1081 *
1072 * This routine is a wrapper for the lower-level driver's revalidate_disk 1082 * This routine is a wrapper for the lower-level driver's revalidate_disk
1073 * call-back. It is used to do the common pre- and post-operations needed 1083 * call-back. It is used to do the common pre- and post-operations needed
1074 * for all revalidate_disk operations. 1084 * for all revalidate_disk operations.
1075 */ 1085 */
1076 int revalidate_disk(struct gendisk *disk) 1086 int revalidate_disk(struct gendisk *disk)
1077 { 1087 {
1078 struct block_device *bdev; 1088 struct block_device *bdev;
1079 int ret = 0; 1089 int ret = 0;
1080 1090
1081 if (disk->fops->revalidate_disk) 1091 if (disk->fops->revalidate_disk)
1082 ret = disk->fops->revalidate_disk(disk); 1092 ret = disk->fops->revalidate_disk(disk);
1083 1093
1084 bdev = bdget_disk(disk, 0); 1094 bdev = bdget_disk(disk, 0);
1085 if (!bdev) 1095 if (!bdev)
1086 return ret; 1096 return ret;
1087 1097
1088 mutex_lock(&bdev->bd_mutex); 1098 mutex_lock(&bdev->bd_mutex);
1089 check_disk_size_change(disk, bdev); 1099 check_disk_size_change(disk, bdev);
1090 mutex_unlock(&bdev->bd_mutex); 1100 mutex_unlock(&bdev->bd_mutex);
1091 bdput(bdev); 1101 bdput(bdev);
1092 return ret; 1102 return ret;
1093 } 1103 }
1094 EXPORT_SYMBOL(revalidate_disk); 1104 EXPORT_SYMBOL(revalidate_disk);
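/*
 * A hedged sketch of the expected calling pattern (driver and names
 * hypothetical): after learning of a capacity change, a driver updates
 * the gendisk and lets revalidate_disk() propagate the new size.
 */
static void example_on_resize(struct gendisk *disk, sector_t new_sectors)
{
        set_capacity(disk, new_sectors);        /* update the gendisk */
        revalidate_disk(disk);                  /* resync bdev size via check_disk_size_change() */
}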
1095 1105
1096 /* 1106 /*
1097 * This routine checks whether a removable medium has been changed, 1107 * This routine checks whether a removable medium has been changed,
1098 * and invalidates all buffer-cache entries in that case. This 1108 * and invalidates all buffer-cache entries in that case. This
1099 * is a relatively slow routine, so we have to try to minimize using 1109 * is a relatively slow routine, so we have to try to minimize using
1100 * it. Thus it is called only upon a 'mount' or 'open'. This 1110 * it. Thus it is called only upon a 'mount' or 'open'. This
1101 * is the best way of combining speed and utility, I think. 1111 * is the best way of combining speed and utility, I think.
1102 * People changing diskettes in the middle of an operation deserve 1112 * People changing diskettes in the middle of an operation deserve
1103 * to lose :-) 1113 * to lose :-)
1104 */ 1114 */
1105 int check_disk_change(struct block_device *bdev) 1115 int check_disk_change(struct block_device *bdev)
1106 { 1116 {
1107 struct gendisk *disk = bdev->bd_disk; 1117 struct gendisk *disk = bdev->bd_disk;
1108 struct block_device_operations * bdops = disk->fops; 1118 struct block_device_operations * bdops = disk->fops;
1109 1119
1110 if (!bdops->media_changed) 1120 if (!bdops->media_changed)
1111 return 0; 1121 return 0;
1112 if (!bdops->media_changed(bdev->bd_disk)) 1122 if (!bdops->media_changed(bdev->bd_disk))
1113 return 0; 1123 return 0;
1114 1124
1115 flush_disk(bdev); 1125 flush_disk(bdev);
1116 if (bdops->revalidate_disk) 1126 if (bdops->revalidate_disk)
1117 bdops->revalidate_disk(bdev->bd_disk); 1127 bdops->revalidate_disk(bdev->bd_disk);
1118 return 1; 1128 return 1;
1119 } 1129 }
1120 1130
1121 EXPORT_SYMBOL(check_disk_change); 1131 EXPORT_SYMBOL(check_disk_change);
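/*
 * Hedged sketch (hypothetical driver): removable-media drivers usually
 * call check_disk_change() from their open method so stale cached data
 * is dropped before any new I/O is issued.
 */
static int example_open(struct block_device *bdev, fmode_t mode)
{
        check_disk_change(bdev);        /* flushes caches if media changed */
        return 0;
}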
1122 1132
1123 void bd_set_size(struct block_device *bdev, loff_t size) 1133 void bd_set_size(struct block_device *bdev, loff_t size)
1124 { 1134 {
1125 unsigned bsize = bdev_logical_block_size(bdev); 1135 unsigned bsize = bdev_logical_block_size(bdev);
1126 1136
1127 bdev->bd_inode->i_size = size; 1137 bdev->bd_inode->i_size = size;
1128 while (bsize < PAGE_CACHE_SIZE) { 1138 while (bsize < PAGE_CACHE_SIZE) {
1129 if (size & bsize) 1139 if (size & bsize)
1130 break; 1140 break;
1131 bsize <<= 1; 1141 bsize <<= 1;
1132 } 1142 }
1133 bdev->bd_block_size = bsize; 1143 bdev->bd_block_size = bsize;
1134 bdev->bd_inode->i_blkbits = blksize_bits(bsize); 1144 bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1135 } 1145 }
1136 EXPORT_SYMBOL(bd_set_size); 1146 EXPORT_SYMBOL(bd_set_size);
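/*
 * Worked example of the loop above: for size = 6144 with a 512-byte
 * logical block size, 6144 & 512 == 0 and 6144 & 1024 == 0, but
 * 6144 & 2048 != 0, so the loop breaks with bsize = 2048 -- the largest
 * power of two (capped at PAGE_CACHE_SIZE) that divides the size.
 */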
1137 1147
1138 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); 1148 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1139 1149
1140 /* 1150 /*
1141 * bd_mutex locking: 1151 * bd_mutex locking:
1142 * 1152 *
1143 * mutex_lock(part->bd_mutex) 1153 * mutex_lock(part->bd_mutex)
1144 * mutex_lock_nested(whole->bd_mutex, 1) 1154 * mutex_lock_nested(whole->bd_mutex, 1)
1145 */ 1155 */
1146 1156
1147 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) 1157 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1148 { 1158 {
1149 struct gendisk *disk; 1159 struct gendisk *disk;
1150 int ret; 1160 int ret;
1151 int partno; 1161 int partno;
1152 int perm = 0; 1162 int perm = 0;
1153 1163
1154 if (mode & FMODE_READ) 1164 if (mode & FMODE_READ)
1155 perm |= MAY_READ; 1165 perm |= MAY_READ;
1156 if (mode & FMODE_WRITE) 1166 if (mode & FMODE_WRITE)
1157 perm |= MAY_WRITE; 1167 perm |= MAY_WRITE;
1158 /* 1168 /*
1159 * hooks: /n/, see "layering violations". 1169 * hooks: /n/, see "layering violations".
1160 */ 1170 */
1161 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1171 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1162 if (ret != 0) { 1172 if (ret != 0) {
1163 bdput(bdev); 1173 bdput(bdev);
1164 return ret; 1174 return ret;
1165 } 1175 }
1166 1176
1167 lock_kernel(); 1177 lock_kernel();
1168 restart: 1178 restart:
1169 1179
1170 ret = -ENXIO; 1180 ret = -ENXIO;
1171 disk = get_gendisk(bdev->bd_dev, &partno); 1181 disk = get_gendisk(bdev->bd_dev, &partno);
1172 if (!disk) 1182 if (!disk)
1173 goto out_unlock_kernel; 1183 goto out_unlock_kernel;
1174 1184
1175 mutex_lock_nested(&bdev->bd_mutex, for_part); 1185 mutex_lock_nested(&bdev->bd_mutex, for_part);
1176 if (!bdev->bd_openers) { 1186 if (!bdev->bd_openers) {
1177 bdev->bd_disk = disk; 1187 bdev->bd_disk = disk;
1178 bdev->bd_contains = bdev; 1188 bdev->bd_contains = bdev;
1179 if (!partno) { 1189 if (!partno) {
1180 struct backing_dev_info *bdi; 1190 struct backing_dev_info *bdi;
1181 1191
1182 ret = -ENXIO; 1192 ret = -ENXIO;
1183 bdev->bd_part = disk_get_part(disk, partno); 1193 bdev->bd_part = disk_get_part(disk, partno);
1184 if (!bdev->bd_part) 1194 if (!bdev->bd_part)
1185 goto out_clear; 1195 goto out_clear;
1186 1196
1187 if (disk->fops->open) { 1197 if (disk->fops->open) {
1188 ret = disk->fops->open(bdev, mode); 1198 ret = disk->fops->open(bdev, mode);
1189 if (ret == -ERESTARTSYS) { 1199 if (ret == -ERESTARTSYS) {
1190 /* Lost a race with 'disk' being 1200 /* Lost a race with 'disk' being
1191 * deleted, try again. 1201 * deleted, try again.
1192 * See md.c 1202 * See md.c
1193 */ 1203 */
1194 disk_put_part(bdev->bd_part); 1204 disk_put_part(bdev->bd_part);
1195 bdev->bd_part = NULL; 1205 bdev->bd_part = NULL;
1196 module_put(disk->fops->owner); 1206 module_put(disk->fops->owner);
1197 put_disk(disk); 1207 put_disk(disk);
1198 bdev->bd_disk = NULL; 1208 bdev->bd_disk = NULL;
1199 mutex_unlock(&bdev->bd_mutex); 1209 mutex_unlock(&bdev->bd_mutex);
1200 goto restart; 1210 goto restart;
1201 } 1211 }
1202 if (ret) 1212 if (ret)
1203 goto out_clear; 1213 goto out_clear;
1204 } 1214 }
1205 if (!bdev->bd_openers) { 1215 if (!bdev->bd_openers) {
1206 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1216 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1207 bdi = blk_get_backing_dev_info(bdev); 1217 bdi = blk_get_backing_dev_info(bdev);
1208 if (bdi == NULL) 1218 if (bdi == NULL)
1209 bdi = &default_backing_dev_info; 1219 bdi = &default_backing_dev_info;
1210 bdev->bd_inode->i_data.backing_dev_info = bdi; 1220 bdev->bd_inode->i_data.backing_dev_info = bdi;
1211 } 1221 }
1212 if (bdev->bd_invalidated) 1222 if (bdev->bd_invalidated)
1213 rescan_partitions(disk, bdev); 1223 rescan_partitions(disk, bdev);
1214 } else { 1224 } else {
1215 struct block_device *whole; 1225 struct block_device *whole;
1216 whole = bdget_disk(disk, 0); 1226 whole = bdget_disk(disk, 0);
1217 ret = -ENOMEM; 1227 ret = -ENOMEM;
1218 if (!whole) 1228 if (!whole)
1219 goto out_clear; 1229 goto out_clear;
1220 BUG_ON(for_part); 1230 BUG_ON(for_part);
1221 ret = __blkdev_get(whole, mode, 1); 1231 ret = __blkdev_get(whole, mode, 1);
1222 if (ret) 1232 if (ret)
1223 goto out_clear; 1233 goto out_clear;
1224 bdev->bd_contains = whole; 1234 bdev->bd_contains = whole;
1225 bdev->bd_inode->i_data.backing_dev_info = 1235 bdev->bd_inode->i_data.backing_dev_info =
1226 whole->bd_inode->i_data.backing_dev_info; 1236 whole->bd_inode->i_data.backing_dev_info;
1227 bdev->bd_part = disk_get_part(disk, partno); 1237 bdev->bd_part = disk_get_part(disk, partno);
1228 if (!(disk->flags & GENHD_FL_UP) || 1238 if (!(disk->flags & GENHD_FL_UP) ||
1229 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1239 !bdev->bd_part || !bdev->bd_part->nr_sects) {
1230 ret = -ENXIO; 1240 ret = -ENXIO;
1231 goto out_clear; 1241 goto out_clear;
1232 } 1242 }
1233 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1243 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1234 } 1244 }
1235 } else { 1245 } else {
1236 put_disk(disk); 1246 put_disk(disk);
1237 module_put(disk->fops->owner); 1247 module_put(disk->fops->owner);
1238 disk = NULL; 1248 disk = NULL;
1239 if (bdev->bd_contains == bdev) { 1249 if (bdev->bd_contains == bdev) {
1240 if (bdev->bd_disk->fops->open) { 1250 if (bdev->bd_disk->fops->open) {
1241 ret = bdev->bd_disk->fops->open(bdev, mode); 1251 ret = bdev->bd_disk->fops->open(bdev, mode);
1242 if (ret) 1252 if (ret)
1243 goto out_unlock_bdev; 1253 goto out_unlock_bdev;
1244 } 1254 }
1245 if (bdev->bd_invalidated) 1255 if (bdev->bd_invalidated)
1246 rescan_partitions(bdev->bd_disk, bdev); 1256 rescan_partitions(bdev->bd_disk, bdev);
1247 } 1257 }
1248 } 1258 }
1249 bdev->bd_openers++; 1259 bdev->bd_openers++;
1250 if (for_part) 1260 if (for_part)
1251 bdev->bd_part_count++; 1261 bdev->bd_part_count++;
1252 mutex_unlock(&bdev->bd_mutex); 1262 mutex_unlock(&bdev->bd_mutex);
1253 unlock_kernel(); 1263 unlock_kernel();
1254 return 0; 1264 return 0;
1255 1265
1256 out_clear: 1266 out_clear:
1257 disk_put_part(bdev->bd_part); 1267 disk_put_part(bdev->bd_part);
1258 bdev->bd_disk = NULL; 1268 bdev->bd_disk = NULL;
1259 bdev->bd_part = NULL; 1269 bdev->bd_part = NULL;
1260 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1270 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1261 if (bdev != bdev->bd_contains) 1271 if (bdev != bdev->bd_contains)
1262 __blkdev_put(bdev->bd_contains, mode, 1); 1272 __blkdev_put(bdev->bd_contains, mode, 1);
1263 bdev->bd_contains = NULL; 1273 bdev->bd_contains = NULL;
1264 out_unlock_bdev: 1274 out_unlock_bdev:
1265 mutex_unlock(&bdev->bd_mutex); 1275 mutex_unlock(&bdev->bd_mutex);
1266 out_unlock_kernel: 1276 out_unlock_kernel:
1267 unlock_kernel(); 1277 unlock_kernel();
1268 1278
1269 if (disk) 1279 if (disk)
1270 module_put(disk->fops->owner); 1280 module_put(disk->fops->owner);
1271 put_disk(disk); 1281 put_disk(disk);
1272 bdput(bdev); 1282 bdput(bdev);
1273 1283
1274 return ret; 1284 return ret;
1275 } 1285 }
1276 1286
1277 int blkdev_get(struct block_device *bdev, fmode_t mode) 1287 int blkdev_get(struct block_device *bdev, fmode_t mode)
1278 { 1288 {
1279 return __blkdev_get(bdev, mode, 0); 1289 return __blkdev_get(bdev, mode, 0);
1280 } 1290 }
1281 EXPORT_SYMBOL(blkdev_get); 1291 EXPORT_SYMBOL(blkdev_get);
1282 1292
1283 static int blkdev_open(struct inode * inode, struct file * filp) 1293 static int blkdev_open(struct inode * inode, struct file * filp)
1284 { 1294 {
1285 struct block_device *bdev; 1295 struct block_device *bdev;
1286 int res; 1296 int res;
1287 1297
1288 /* 1298 /*
1289 * Preserve backwards compatibility and allow large file access 1299 * Preserve backwards compatibility and allow large file access
1290 * even if userspace doesn't ask for it explicitly. Some mkfs 1300 * even if userspace doesn't ask for it explicitly. Some mkfs
1291 * binaries need it. We might want to drop this workaround 1301 * binaries need it. We might want to drop this workaround
1292 * during an unstable branch. 1302 * during an unstable branch.
1293 */ 1303 */
1294 filp->f_flags |= O_LARGEFILE; 1304 filp->f_flags |= O_LARGEFILE;
1295 1305
1296 if (filp->f_flags & O_NDELAY) 1306 if (filp->f_flags & O_NDELAY)
1297 filp->f_mode |= FMODE_NDELAY; 1307 filp->f_mode |= FMODE_NDELAY;
1298 if (filp->f_flags & O_EXCL) 1308 if (filp->f_flags & O_EXCL)
1299 filp->f_mode |= FMODE_EXCL; 1309 filp->f_mode |= FMODE_EXCL;
1300 if ((filp->f_flags & O_ACCMODE) == 3) 1310 if ((filp->f_flags & O_ACCMODE) == 3)
1301 filp->f_mode |= FMODE_WRITE_IOCTL; 1311 filp->f_mode |= FMODE_WRITE_IOCTL;
1302 1312
1303 bdev = bd_acquire(inode); 1313 bdev = bd_acquire(inode);
1304 if (bdev == NULL) 1314 if (bdev == NULL)
1305 return -ENOMEM; 1315 return -ENOMEM;
1306 1316
1307 filp->f_mapping = bdev->bd_inode->i_mapping; 1317 filp->f_mapping = bdev->bd_inode->i_mapping;
1308 1318
1309 res = blkdev_get(bdev, filp->f_mode); 1319 res = blkdev_get(bdev, filp->f_mode);
1310 if (res) 1320 if (res)
1311 return res; 1321 return res;
1312 1322
1313 if (filp->f_mode & FMODE_EXCL) { 1323 if (filp->f_mode & FMODE_EXCL) {
1314 res = bd_claim(bdev, filp); 1324 res = bd_claim(bdev, filp);
1315 if (res) 1325 if (res)
1316 goto out_blkdev_put; 1326 goto out_blkdev_put;
1317 } 1327 }
1318 1328
1319 return 0; 1329 return 0;
1320 1330
1321 out_blkdev_put: 1331 out_blkdev_put:
1322 blkdev_put(bdev, filp->f_mode); 1332 blkdev_put(bdev, filp->f_mode);
1323 return res; 1333 return res;
1324 } 1334 }
1325 1335
1326 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1336 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1327 { 1337 {
1328 int ret = 0; 1338 int ret = 0;
1329 struct gendisk *disk = bdev->bd_disk; 1339 struct gendisk *disk = bdev->bd_disk;
1330 struct block_device *victim = NULL; 1340 struct block_device *victim = NULL;
1331 1341
1332 mutex_lock_nested(&bdev->bd_mutex, for_part); 1342 mutex_lock_nested(&bdev->bd_mutex, for_part);
1333 lock_kernel(); 1343 lock_kernel();
1334 if (for_part) 1344 if (for_part)
1335 bdev->bd_part_count--; 1345 bdev->bd_part_count--;
1336 1346
1337 if (!--bdev->bd_openers) { 1347 if (!--bdev->bd_openers) {
1338 sync_blockdev(bdev); 1348 sync_blockdev(bdev);
1339 kill_bdev(bdev); 1349 kill_bdev(bdev);
1340 } 1350 }
1341 if (bdev->bd_contains == bdev) { 1351 if (bdev->bd_contains == bdev) {
1342 if (disk->fops->release) 1352 if (disk->fops->release)
1343 ret = disk->fops->release(disk, mode); 1353 ret = disk->fops->release(disk, mode);
1344 } 1354 }
1345 if (!bdev->bd_openers) { 1355 if (!bdev->bd_openers) {
1346 struct module *owner = disk->fops->owner; 1356 struct module *owner = disk->fops->owner;
1347 1357
1348 put_disk(disk); 1358 put_disk(disk);
1349 module_put(owner); 1359 module_put(owner);
1350 disk_put_part(bdev->bd_part); 1360 disk_put_part(bdev->bd_part);
1351 bdev->bd_part = NULL; 1361 bdev->bd_part = NULL;
1352 bdev->bd_disk = NULL; 1362 bdev->bd_disk = NULL;
1353 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1363 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1354 if (bdev != bdev->bd_contains) 1364 if (bdev != bdev->bd_contains)
1355 victim = bdev->bd_contains; 1365 victim = bdev->bd_contains;
1356 bdev->bd_contains = NULL; 1366 bdev->bd_contains = NULL;
1357 } 1367 }
1358 unlock_kernel(); 1368 unlock_kernel();
1359 mutex_unlock(&bdev->bd_mutex); 1369 mutex_unlock(&bdev->bd_mutex);
1360 bdput(bdev); 1370 bdput(bdev);
1361 if (victim) 1371 if (victim)
1362 __blkdev_put(victim, mode, 1); 1372 __blkdev_put(victim, mode, 1);
1363 return ret; 1373 return ret;
1364 } 1374 }
1365 1375
1366 int blkdev_put(struct block_device *bdev, fmode_t mode) 1376 int blkdev_put(struct block_device *bdev, fmode_t mode)
1367 { 1377 {
1368 return __blkdev_put(bdev, mode, 0); 1378 return __blkdev_put(bdev, mode, 0);
1369 } 1379 }
1370 EXPORT_SYMBOL(blkdev_put); 1380 EXPORT_SYMBOL(blkdev_put);
1371 1381
1372 static int blkdev_close(struct inode * inode, struct file * filp) 1382 static int blkdev_close(struct inode * inode, struct file * filp)
1373 { 1383 {
1374 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1384 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1375 if (bdev->bd_holder == filp) 1385 if (bdev->bd_holder == filp)
1376 bd_release(bdev); 1386 bd_release(bdev);
1377 return blkdev_put(bdev, filp->f_mode); 1387 return blkdev_put(bdev, filp->f_mode);
1378 } 1388 }
1379 1389
1380 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) 1390 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1381 { 1391 {
1382 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1392 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1383 fmode_t mode = file->f_mode; 1393 fmode_t mode = file->f_mode;
1384 1394
1385 /* 1395 /*
1386 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have 1396 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1387 * to update it before every ioctl. 1397 * to update it before every ioctl.
1388 */ 1398 */
1389 if (file->f_flags & O_NDELAY) 1399 if (file->f_flags & O_NDELAY)
1390 mode |= FMODE_NDELAY; 1400 mode |= FMODE_NDELAY;
1391 else 1401 else
1392 mode &= ~FMODE_NDELAY; 1402 mode &= ~FMODE_NDELAY;
1393 1403
1394 return blkdev_ioctl(bdev, mode, cmd, arg); 1404 return blkdev_ioctl(bdev, mode, cmd, arg);
1395 } 1405 }
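/*
 * Userspace sketch of why mode is re-derived above (illustrative only;
 * includes and error handling omitted): O_NDELAY can change between
 * ioctls on the same open file.
 */
static void example_userspace_toggle(void)
{
        int ro;
        int fd = open("/dev/sda", O_RDONLY);

        fcntl(fd, F_SETFL, O_NDELAY);   /* flips O_NDELAY after the open */
        ioctl(fd, BLKROGET, &ro);       /* kernel re-derives FMODE_NDELAY here */
        close(fd);
}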
1396 1406
1397 /* 1407 /*
1398 * Try to release a page associated with block device when the system 1408 * Try to release a page associated with block device when the system
1399 * is under memory pressure. 1409 * is under memory pressure.
1400 */ 1410 */
1401 static int blkdev_releasepage(struct page *page, gfp_t wait) 1411 static int blkdev_releasepage(struct page *page, gfp_t wait)
1402 { 1412 {
1403 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; 1413 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1404 1414
1405 if (super && super->s_op->bdev_try_to_free_page) 1415 if (super && super->s_op->bdev_try_to_free_page)
1406 return super->s_op->bdev_try_to_free_page(super, page, wait); 1416 return super->s_op->bdev_try_to_free_page(super, page, wait);
1407 1417
1408 return try_to_free_buffers(page); 1418 return try_to_free_buffers(page);
1409 } 1419 }
1410 1420
1411 static const struct address_space_operations def_blk_aops = { 1421 static const struct address_space_operations def_blk_aops = {
1412 .readpage = blkdev_readpage, 1422 .readpage = blkdev_readpage,
1413 .writepage = blkdev_writepage, 1423 .writepage = blkdev_writepage,
1414 .sync_page = block_sync_page, 1424 .sync_page = block_sync_page,
1415 .write_begin = blkdev_write_begin, 1425 .write_begin = blkdev_write_begin,
1416 .write_end = blkdev_write_end, 1426 .write_end = blkdev_write_end,
1417 .writepages = generic_writepages, 1427 .writepages = generic_writepages,
1418 .releasepage = blkdev_releasepage, 1428 .releasepage = blkdev_releasepage,
1419 .direct_IO = blkdev_direct_IO, 1429 .direct_IO = blkdev_direct_IO,
1420 }; 1430 };
1421 1431
1422 const struct file_operations def_blk_fops = { 1432 const struct file_operations def_blk_fops = {
1423 .open = blkdev_open, 1433 .open = blkdev_open,
1424 .release = blkdev_close, 1434 .release = blkdev_close,
1425 .llseek = block_llseek, 1435 .llseek = block_llseek,
1426 .read = do_sync_read, 1436 .read = do_sync_read,
1427 .write = do_sync_write, 1437 .write = do_sync_write,
1428 .aio_read = generic_file_aio_read, 1438 .aio_read = generic_file_aio_read,
1429 .aio_write = generic_file_aio_write_nolock, 1439 .aio_write = generic_file_aio_write_nolock,
1430 .mmap = generic_file_mmap, 1440 .mmap = generic_file_mmap,
1431 .fsync = block_fsync, 1441 .fsync = block_fsync,
1432 .unlocked_ioctl = block_ioctl, 1442 .unlocked_ioctl = block_ioctl,
1433 #ifdef CONFIG_COMPAT 1443 #ifdef CONFIG_COMPAT
1434 .compat_ioctl = compat_blkdev_ioctl, 1444 .compat_ioctl = compat_blkdev_ioctl,
1435 #endif 1445 #endif
1436 .splice_read = generic_file_splice_read, 1446 .splice_read = generic_file_splice_read,
1437 .splice_write = generic_file_splice_write, 1447 .splice_write = generic_file_splice_write,
1438 }; 1448 };
1439 1449
1440 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1450 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1441 { 1451 {
1442 int res; 1452 int res;
1443 mm_segment_t old_fs = get_fs(); 1453 mm_segment_t old_fs = get_fs();
1444 set_fs(KERNEL_DS); 1454 set_fs(KERNEL_DS);
1445 res = blkdev_ioctl(bdev, 0, cmd, arg); 1455 res = blkdev_ioctl(bdev, 0, cmd, arg);
1446 set_fs(old_fs); 1456 set_fs(old_fs);
1447 return res; 1457 return res;
1448 } 1458 }
1449 1459
1450 EXPORT_SYMBOL(ioctl_by_bdev); 1460 EXPORT_SYMBOL(ioctl_by_bdev);
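/*
 * Hedged in-kernel sketch: the set_fs(KERNEL_DS) dance above lets the
 * ioctl handler accept kernel pointers, so a caller may pass a stack
 * variable directly (bdev assumed valid and open):
 *
 *	u64 bytes;
 *	int err = ioctl_by_bdev(bdev, BLKGETSIZE64, (unsigned long)&bytes);
 */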
1451 1461
1452 /** 1462 /**
1453 * lookup_bdev - lookup a struct block_device by name 1463 * lookup_bdev - lookup a struct block_device by name
1454 * @pathname: special file representing the block device 1464 * @pathname: special file representing the block device
1455 * 1465 *
1456 * Get a reference to the blockdevice at @pathname in the current 1466 * Get a reference to the blockdevice at @pathname in the current
1457 * namespace if possible and return it. Return ERR_PTR(error) 1467 * namespace if possible and return it. Return ERR_PTR(error)
1458 * otherwise. 1468 * otherwise.
1459 */ 1469 */
1460 struct block_device *lookup_bdev(const char *pathname) 1470 struct block_device *lookup_bdev(const char *pathname)
1461 { 1471 {
1462 struct block_device *bdev; 1472 struct block_device *bdev;
1463 struct inode *inode; 1473 struct inode *inode;
1464 struct path path; 1474 struct path path;
1465 int error; 1475 int error;
1466 1476
1467 if (!pathname || !*pathname) 1477 if (!pathname || !*pathname)
1468 return ERR_PTR(-EINVAL); 1478 return ERR_PTR(-EINVAL);
1469 1479
1470 error = kern_path(pathname, LOOKUP_FOLLOW, &path); 1480 error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1471 if (error) 1481 if (error)
1472 return ERR_PTR(error); 1482 return ERR_PTR(error);
1473 1483
1474 inode = path.dentry->d_inode; 1484 inode = path.dentry->d_inode;
1475 error = -ENOTBLK; 1485 error = -ENOTBLK;
1476 if (!S_ISBLK(inode->i_mode)) 1486 if (!S_ISBLK(inode->i_mode))
1477 goto fail; 1487 goto fail;
1478 error = -EACCES; 1488 error = -EACCES;
1479 if (path.mnt->mnt_flags & MNT_NODEV) 1489 if (path.mnt->mnt_flags & MNT_NODEV)
1480 goto fail; 1490 goto fail;
1481 error = -ENOMEM; 1491 error = -ENOMEM;
1482 bdev = bd_acquire(inode); 1492 bdev = bd_acquire(inode);
1483 if (!bdev) 1493 if (!bdev)
1484 goto fail; 1494 goto fail;
1485 out: 1495 out:
1486 path_put(&path); 1496 path_put(&path);
1487 return bdev; 1497 return bdev;
1488 fail: 1498 fail:
1489 bdev = ERR_PTR(error); 1499 bdev = ERR_PTR(error);
1490 goto out; 1500 goto out;
1491 } 1501 }
1492 EXPORT_SYMBOL(lookup_bdev); 1502 EXPORT_SYMBOL(lookup_bdev);
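/*
 * Hedged sketch: lookup_bdev() returns a reference but does not open
 * the device; pair it with bdput(), or with blkdev_get()/blkdev_put()
 * if I/O is needed.
 */
static int example_lookup(const char *path)
{
        struct block_device *bdev = lookup_bdev(path);

        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
        /* ... inspect bdev->bd_dev etc. ... */
        bdput(bdev);
        return 0;
}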
1493 1503
1494 /** 1504 /**
1495 * open_bdev_exclusive - open a block device by name and set it up for use 1505 * open_bdev_exclusive - open a block device by name and set it up for use
1496 * 1506 *
1497 * @path: special file representing the block device 1507 * @path: special file representing the block device
1498 * @mode: FMODE_... combination to be used 1508 * @mode: FMODE_... combination to be used
1499 * @holder: owner for exclusion 1509 * @holder: owner for exclusion
1500 * 1510 *
1501 * Open the blockdevice described by the special file at @path, claim it 1511 * Open the blockdevice described by the special file at @path, claim it
1502 * for the @holder. 1512 * for the @holder.
1503 */ 1513 */
1504 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1514 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1505 { 1515 {
1506 struct block_device *bdev; 1516 struct block_device *bdev;
1507 int error = 0; 1517 int error = 0;
1508 1518
1509 bdev = lookup_bdev(path); 1519 bdev = lookup_bdev(path);
1510 if (IS_ERR(bdev)) 1520 if (IS_ERR(bdev))
1511 return bdev; 1521 return bdev;
1512 1522
1513 error = blkdev_get(bdev, mode); 1523 error = blkdev_get(bdev, mode);
1514 if (error) 1524 if (error)
1515 return ERR_PTR(error); 1525 return ERR_PTR(error);
1516 error = -EACCES; 1526 error = -EACCES;
1517 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1527 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1518 goto blkdev_put; 1528 goto blkdev_put;
1519 error = bd_claim(bdev, holder); 1529 error = bd_claim(bdev, holder);
1520 if (error) 1530 if (error)
1521 goto blkdev_put; 1531 goto blkdev_put;
1522 1532
1523 return bdev; 1533 return bdev;
1524 1534
1525 blkdev_put: 1535 blkdev_put:
1526 blkdev_put(bdev, mode); 1536 blkdev_put(bdev, mode);
1527 return ERR_PTR(error); 1537 return ERR_PTR(error);
1528 } 1538 }
1529 1539
1530 EXPORT_SYMBOL(open_bdev_exclusive); 1540 EXPORT_SYMBOL(open_bdev_exclusive);
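/*
 * A hedged sketch of the paired usage (names hypothetical); note that
 * the mode passed to close_bdev_exclusive() below must match the one
 * used here.
 */
static struct block_device *example_claim(const char *path, void *holder)
{
        struct block_device *bdev;

        bdev = open_bdev_exclusive(path, FMODE_READ | FMODE_WRITE, holder);
        if (IS_ERR(bdev))
                return bdev;
        /* ... exclusive I/O ... */
        close_bdev_exclusive(bdev, FMODE_READ | FMODE_WRITE);
        return NULL;
}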
1531 1541
1532 /** 1542 /**
1533 * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() 1543 * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive()
1534 * 1544 *
1535 * @bdev: blockdevice to close 1545 * @bdev: blockdevice to close
1536 * @mode: mode, must match that used to open. 1546 * @mode: mode, must match that used to open.
1537 * 1547 *
1538 * This is the counterpart to open_bdev_exclusive(). 1548 * This is the counterpart to open_bdev_exclusive().
1539 */ 1549 */
1540 void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) 1550 void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
1541 { 1551 {
1542 bd_release(bdev); 1552 bd_release(bdev);
1543 blkdev_put(bdev, mode); 1553 blkdev_put(bdev, mode);
1544 } 1554 }
1545 1555
1546 EXPORT_SYMBOL(close_bdev_exclusive); 1556 EXPORT_SYMBOL(close_bdev_exclusive);
1547 1557
1548 int __invalidate_device(struct block_device *bdev) 1558 int __invalidate_device(struct block_device *bdev)
1549 { 1559 {
1550 struct super_block *sb = get_super(bdev); 1560 struct super_block *sb = get_super(bdev);
1551 int res = 0; 1561 int res = 0;
1552 1562
1553 if (sb) { 1563 if (sb) {
1554 /* 1564 /*
1555 * no need to lock the super, get_super holds the 1565 * no need to lock the super, get_super holds the
1556 * read mutex so the filesystem cannot go away 1566 * read mutex so the filesystem cannot go away
1557 * under us (->put_super runs with the write lock 1567 * under us (->put_super runs with the write lock
1558 * held). 1568 * held).
1559 */ 1569 */
1560 shrink_dcache_sb(sb); 1570 shrink_dcache_sb(sb);
1561 res = invalidate_inodes(sb); 1571 res = invalidate_inodes(sb);
1562 drop_super(sb); 1572 drop_super(sb);
1563 } 1573 }
1564 invalidate_bdev(bdev); 1574 invalidate_bdev(bdev);
1565 return res; 1575 return res;
1566 } 1576 }
1567 EXPORT_SYMBOL(__invalidate_device); 1577 EXPORT_SYMBOL(__invalidate_device);
1568 1578
1 #ifndef _LINUX_FS_H 1 #ifndef _LINUX_FS_H
2 #define _LINUX_FS_H 2 #define _LINUX_FS_H
3 3
4 /* 4 /*
5 * This file has definitions for some important file table 5 * This file has definitions for some important file table
6 * structures etc. 6 * structures etc.
7 */ 7 */
8 8
9 #include <linux/limits.h> 9 #include <linux/limits.h>
10 #include <linux/ioctl.h> 10 #include <linux/ioctl.h>
11 11
12 /* 12 /*
13 * It's silly to have NR_OPEN bigger than NR_FILE, but you can change 13 * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
14 * the file limit at runtime and only root can increase the per-process 14 * the file limit at runtime and only root can increase the per-process
15 * nr_file rlimit, so it's safe to set up a ridiculously high absolute 15 * nr_file rlimit, so it's safe to set up a ridiculously high absolute
16 * upper limit on files-per-process. 16 * upper limit on files-per-process.
17 * 17 *
18 * Some programs (notably those using select()) may have to be 18 * Some programs (notably those using select()) may have to be
19 * recompiled to take full advantage of the new limits.. 19 * recompiled to take full advantage of the new limits..
20 */ 20 */
21 21
22 /* Fixed constants first: */ 22 /* Fixed constants first: */
23 #undef NR_OPEN 23 #undef NR_OPEN
24 #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ 24 #define INR_OPEN 1024 /* Initial setting for nfile rlimits */
25 25
26 #define BLOCK_SIZE_BITS 10 26 #define BLOCK_SIZE_BITS 10
27 #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) 27 #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
28 28
29 #define SEEK_SET 0 /* seek relative to beginning of file */ 29 #define SEEK_SET 0 /* seek relative to beginning of file */
30 #define SEEK_CUR 1 /* seek relative to current file position */ 30 #define SEEK_CUR 1 /* seek relative to current file position */
31 #define SEEK_END 2 /* seek relative to end of file */ 31 #define SEEK_END 2 /* seek relative to end of file */
32 #define SEEK_MAX SEEK_END 32 #define SEEK_MAX SEEK_END
33 33
34 /* And dynamically-tunable limits and defaults: */ 34 /* And dynamically-tunable limits and defaults: */
35 struct files_stat_struct { 35 struct files_stat_struct {
36 int nr_files; /* read only */ 36 int nr_files; /* read only */
37 int nr_free_files; /* read only */ 37 int nr_free_files; /* read only */
38 int max_files; /* tunable */ 38 int max_files; /* tunable */
39 }; 39 };
40 40
41 struct inodes_stat_t { 41 struct inodes_stat_t {
42 int nr_inodes; 42 int nr_inodes;
43 int nr_unused; 43 int nr_unused;
44 int dummy[5]; /* padding for sysctl ABI compatibility */ 44 int dummy[5]; /* padding for sysctl ABI compatibility */
45 }; 45 };
46 46
47 47
48 #define NR_FILE 8192 /* this can well be larger on a larger system */ 48 #define NR_FILE 8192 /* this can well be larger on a larger system */
49 49
50 #define MAY_EXEC 1 50 #define MAY_EXEC 1
51 #define MAY_WRITE 2 51 #define MAY_WRITE 2
52 #define MAY_READ 4 52 #define MAY_READ 4
53 #define MAY_APPEND 8 53 #define MAY_APPEND 8
54 #define MAY_ACCESS 16 54 #define MAY_ACCESS 16
55 #define MAY_OPEN 32 55 #define MAY_OPEN 32
56 56
57 /* 57 /*
58 * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond 58 * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond
59 * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() 59 * to O_WRONLY and O_RDWR via the strange trick in __dentry_open()
60 */ 60 */
61 61
62 /* file is open for reading */ 62 /* file is open for reading */
63 #define FMODE_READ ((__force fmode_t)1) 63 #define FMODE_READ ((__force fmode_t)1)
64 /* file is open for writing */ 64 /* file is open for writing */
65 #define FMODE_WRITE ((__force fmode_t)2) 65 #define FMODE_WRITE ((__force fmode_t)2)
66 /* file is seekable */ 66 /* file is seekable */
67 #define FMODE_LSEEK ((__force fmode_t)4) 67 #define FMODE_LSEEK ((__force fmode_t)4)
68 /* file can be accessed using pread */ 68 /* file can be accessed using pread */
69 #define FMODE_PREAD ((__force fmode_t)8) 69 #define FMODE_PREAD ((__force fmode_t)8)
70 /* file can be accessed using pwrite */ 70 /* file can be accessed using pwrite */
71 #define FMODE_PWRITE ((__force fmode_t)16) 71 #define FMODE_PWRITE ((__force fmode_t)16)
72 /* File is opened for execution with sys_execve / sys_uselib */ 72 /* File is opened for execution with sys_execve / sys_uselib */
73 #define FMODE_EXEC ((__force fmode_t)32) 73 #define FMODE_EXEC ((__force fmode_t)32)
74 /* File is opened with O_NDELAY (only set for block devices) */ 74 /* File is opened with O_NDELAY (only set for block devices) */
75 #define FMODE_NDELAY ((__force fmode_t)64) 75 #define FMODE_NDELAY ((__force fmode_t)64)
76 /* File is opened with O_EXCL (only set for block devices) */ 76 /* File is opened with O_EXCL (only set for block devices) */
77 #define FMODE_EXCL ((__force fmode_t)128) 77 #define FMODE_EXCL ((__force fmode_t)128)
78 /* File is opened using open(.., 3, ..) and is writeable only for ioctls 78 /* File is opened using open(.., 3, ..) and is writeable only for ioctls
79 (a special hack for floppy.c) */ 79 (a special hack for floppy.c) */
80 #define FMODE_WRITE_IOCTL ((__force fmode_t)256) 80 #define FMODE_WRITE_IOCTL ((__force fmode_t)256)
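/*
 * A sketch of the "strange trick" referenced above, as we understand
 * __dentry_open() to compute it: f_mode = (flags + 1) & O_ACCMODE, so
 *
 *	O_RDONLY (0) + 1 -> FMODE_READ
 *	O_WRONLY (1) + 1 -> FMODE_WRITE
 *	O_RDWR   (2) + 1 -> FMODE_READ | FMODE_WRITE
 *	open(.., 3, ..)  -> 0, later given FMODE_WRITE_IOCTL by blkdev_open()
 */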
81 81
82 /* 82 /*
83 * Don't update ctime and mtime. 83 * Don't update ctime and mtime.
84 * 84 *
85 * Currently a special hack for the XFS open_by_handle ioctl, but we'll 85 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
86 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. 86 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
87 */ 87 */
88 #define FMODE_NOCMTIME ((__force fmode_t)2048) 88 #define FMODE_NOCMTIME ((__force fmode_t)2048)
89 89
90 /* 90 /*
91 * The below are the various read and write types that we support. Some of 91 * The below are the various read and write types that we support. Some of
92 * them include behavioral modifiers that send information down to the 92 * them include behavioral modifiers that send information down to the
93 * block layer and IO scheduler. Terminology: 93 * block layer and IO scheduler. Terminology:
94 * 94 *
95 * The block layer uses device plugging to defer IO a little bit, in 95 * The block layer uses device plugging to defer IO a little bit, in
96 * the hope that we will see more IO very shortly. This increases 96 * the hope that we will see more IO very shortly. This increases
97 * coalescing of adjacent IO and thus reduces the number of IOs we 97 * coalescing of adjacent IO and thus reduces the number of IOs we
98 * have to send to the device. It also allows for better queuing, 98 * have to send to the device. It also allows for better queuing,
99 * if the IO isn't mergeable. If the caller is going to be waiting 99 * if the IO isn't mergeable. If the caller is going to be waiting
100 * for the IO, then he must ensure that the device is unplugged so 100 * for the IO, then he must ensure that the device is unplugged so
101 * that the IO is dispatched to the driver. 101 * that the IO is dispatched to the driver.
102 * 102 *
103 * All IO is handled async in Linux. This is fine for background 103 * All IO is handled async in Linux. This is fine for background
104 * writes, but for reads or writes that someone waits for completion 104 * writes, but for reads or writes that someone waits for completion
105 * on, we want to notify the block layer and IO scheduler so that they 105 * on, we want to notify the block layer and IO scheduler so that they
106 * know about it. That allows them to make better scheduling 106 * know about it. That allows them to make better scheduling
107 * decisions. So when the below references 'sync' and 'async', it 107 * decisions. So when the below references 'sync' and 'async', it
108 * is referencing this priority hint. 108 * is referencing this priority hint.
109 * 109 *
110 * With that in mind, the available types are: 110 * With that in mind, the available types are:
111 * 111 *
112 * READ A normal read operation. Device will be plugged. 112 * READ A normal read operation. Device will be plugged.
113 * READ_SYNC A synchronous read. Device is not plugged, caller can 113 * READ_SYNC A synchronous read. Device is not plugged, caller can
114 * immediately wait on this read without caring about 114 * immediately wait on this read without caring about
115 * unplugging. 115 * unplugging.
116 * READA Used for read-ahead operations. Lower priority, and the 116 * READA Used for read-ahead operations. Lower priority, and the
117 * block layer could (in theory) choose to ignore this 117 * block layer could (in theory) choose to ignore this
118 * request if it runs into resource problems. 118 * request if it runs into resource problems.
119 * WRITE A normal async write. Device will be plugged. 119 * WRITE A normal async write. Device will be plugged.
120 * SWRITE Like WRITE, but a special case for ll_rw_block() that 120 * SWRITE Like WRITE, but a special case for ll_rw_block() that
121 * tells it to lock the buffer first. Normally a buffer 121 * tells it to lock the buffer first. Normally a buffer
122 * must be locked before doing IO. 122 * must be locked before doing IO.
123 * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down 123 * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down
124 * the hint that someone will be waiting on this IO 124 * the hint that someone will be waiting on this IO
125 * shortly. The device must still be unplugged explicitly, 125 * shortly. The device must still be unplugged explicitly,
126 * WRITE_SYNC_PLUG does not do this as we could be 126 * WRITE_SYNC_PLUG does not do this as we could be
127 * submitting more writes before we actually wait on any 127 * submitting more writes before we actually wait on any
128 * of them. 128 * of them.
129 * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device 129 * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device
130 * immediately after submission. The write equivalent 130 * immediately after submission. The write equivalent
131 * of READ_SYNC. 131 * of READ_SYNC.
132 * WRITE_ODIRECT Special case write for O_DIRECT only. 132 * WRITE_ODIRECT Special case write for O_DIRECT only.
133 * SWRITE_SYNC 133 * SWRITE_SYNC
134 * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. 134 * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
135 * See SWRITE. 135 * See SWRITE.
136 * WRITE_BARRIER Like WRITE, but tells the block layer that all 136 * WRITE_BARRIER Like WRITE, but tells the block layer that all
137 * previously submitted writes must be safely on storage 137 * previously submitted writes must be safely on storage
138 * before this one is started. Also guarantees that when 138 * before this one is started. Also guarantees that when
139 * this write is complete, it itself is also safely on 139 * this write is complete, it itself is also safely on
140 * storage. Prevents reordering of writes on both sides 140 * storage. Prevents reordering of writes on both sides
141 * of this IO. 141 * of this IO.
142 * 142 *
143 */ 143 */
144 #define RW_MASK 1 144 #define RW_MASK 1
145 #define RWA_MASK 2 145 #define RWA_MASK 2
146 #define READ 0 146 #define READ 0
147 #define WRITE 1 147 #define WRITE 1
148 #define READA 2 /* read-ahead - don't block if no resources */ 148 #define READA 2 /* read-ahead - don't block if no resources */
149 #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ 149 #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */
150 #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) 150 #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
151 #define READ_META (READ | (1 << BIO_RW_META)) 151 #define READ_META (READ | (1 << BIO_RW_META))
152 #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 152 #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
153 #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 153 #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
154 #define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) 154 #define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
155 #define SWRITE_SYNC_PLUG \ 155 #define SWRITE_SYNC_PLUG \
156 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 156 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
157 #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 157 #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
158 #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) 158 #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER))
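/*
 * Hedged sketch of choosing one of these hints at submission time (the
 * bio and buffer_head are assumed to be set up elsewhere):
 *
 *	submit_bio(WRITE_SYNC, bio);		waited-on write, unplug at once
 *	ll_rw_block(SWRITE_SYNC, 1, &bh);	lock the buffer, then sync write
 */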
159 159
160 /* 160 /*
161 * These aren't really reads or writes, they pass down information about 161 * These aren't really reads or writes, they pass down information about
162 * parts of device that are now unused by the file system. 162 * parts of device that are now unused by the file system.
163 */ 163 */
164 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) 164 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
165 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) 165 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
166 166
167 #define SEL_IN 1 167 #define SEL_IN 1
168 #define SEL_OUT 2 168 #define SEL_OUT 2
169 #define SEL_EX 4 169 #define SEL_EX 4
170 170
171 /* public flags for file_system_type */ 171 /* public flags for file_system_type */
172 #define FS_REQUIRES_DEV 1 172 #define FS_REQUIRES_DEV 1
173 #define FS_BINARY_MOUNTDATA 2 173 #define FS_BINARY_MOUNTDATA 2
174 #define FS_HAS_SUBTYPE 4 174 #define FS_HAS_SUBTYPE 4
175 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ 175 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
176 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() 176 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move()
177 * during rename() internally. 177 * during rename() internally.
178 */ 178 */
179 179
180 /* 180 /*
181 * These are the fs-independent mount-flags: up to 32 flags are supported 181 * These are the fs-independent mount-flags: up to 32 flags are supported
182 */ 182 */
183 #define MS_RDONLY 1 /* Mount read-only */ 183 #define MS_RDONLY 1 /* Mount read-only */
184 #define MS_NOSUID 2 /* Ignore suid and sgid bits */ 184 #define MS_NOSUID 2 /* Ignore suid and sgid bits */
185 #define MS_NODEV 4 /* Disallow access to device special files */ 185 #define MS_NODEV 4 /* Disallow access to device special files */
186 #define MS_NOEXEC 8 /* Disallow program execution */ 186 #define MS_NOEXEC 8 /* Disallow program execution */
187 #define MS_SYNCHRONOUS 16 /* Writes are synced at once */ 187 #define MS_SYNCHRONOUS 16 /* Writes are synced at once */
188 #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ 188 #define MS_REMOUNT 32 /* Alter flags of a mounted FS */
189 #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ 189 #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
190 #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ 190 #define MS_DIRSYNC 128 /* Directory modifications are synchronous */
191 #define MS_NOATIME 1024 /* Do not update access times. */ 191 #define MS_NOATIME 1024 /* Do not update access times. */
192 #define MS_NODIRATIME 2048 /* Do not update directory access times */ 192 #define MS_NODIRATIME 2048 /* Do not update directory access times */
193 #define MS_BIND 4096 193 #define MS_BIND 4096
194 #define MS_MOVE 8192 194 #define MS_MOVE 8192
195 #define MS_REC 16384 195 #define MS_REC 16384
196 #define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. 196 #define MS_VERBOSE 32768 /* War is peace. Verbosity is silence.
197 MS_VERBOSE is deprecated. */ 197 MS_VERBOSE is deprecated. */
198 #define MS_SILENT 32768 198 #define MS_SILENT 32768
199 #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ 199 #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */
200 #define MS_UNBINDABLE (1<<17) /* change to unbindable */ 200 #define MS_UNBINDABLE (1<<17) /* change to unbindable */
201 #define MS_PRIVATE (1<<18) /* change to private */ 201 #define MS_PRIVATE (1<<18) /* change to private */
202 #define MS_SLAVE (1<<19) /* change to slave */ 202 #define MS_SLAVE (1<<19) /* change to slave */
203 #define MS_SHARED (1<<20) /* change to shared */ 203 #define MS_SHARED (1<<20) /* change to shared */
204 #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ 204 #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */
205 #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ 205 #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
206 #define MS_I_VERSION (1<<23) /* Update inode I_version field */ 206 #define MS_I_VERSION (1<<23) /* Update inode I_version field */
207 #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ 207 #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
208 #define MS_ACTIVE (1<<30) 208 #define MS_ACTIVE (1<<30)
209 #define MS_NOUSER (1<<31) 209 #define MS_NOUSER (1<<31)
210 210
211 /* 211 /*
212 * Superblock flags that can be altered by MS_REMOUNT 212 * Superblock flags that can be altered by MS_REMOUNT
213 */ 213 */
214 #define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) 214 #define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
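/*
 * Userspace sketch (error handling omitted): a remount may only alter
 * the MS_RMT_MASK flags, e.g. flipping a mount read-only:
 *
 *	mount("/dev/sda1", "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL);
 */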
215 215
216 /* 216 /*
217 * Old magic mount flag and mask 217 * Old magic mount flag and mask
218 */ 218 */
219 #define MS_MGC_VAL 0xC0ED0000 219 #define MS_MGC_VAL 0xC0ED0000
220 #define MS_MGC_MSK 0xffff0000 220 #define MS_MGC_MSK 0xffff0000
221 221
222 /* Inode flags - they have nothing to do with superblock flags now */ 222 /* Inode flags - they have nothing to do with superblock flags now */
223 223
224 #define S_SYNC 1 /* Writes are synced at once */ 224 #define S_SYNC 1 /* Writes are synced at once */
225 #define S_NOATIME 2 /* Do not update access times */ 225 #define S_NOATIME 2 /* Do not update access times */
226 #define S_APPEND 4 /* Append-only file */ 226 #define S_APPEND 4 /* Append-only file */
227 #define S_IMMUTABLE 8 /* Immutable file */ 227 #define S_IMMUTABLE 8 /* Immutable file */
228 #define S_DEAD 16 /* removed, but still open directory */ 228 #define S_DEAD 16 /* removed, but still open directory */
229 #define S_NOQUOTA 32 /* Inode is not counted to quota */ 229 #define S_NOQUOTA 32 /* Inode is not counted to quota */
230 #define S_DIRSYNC 64 /* Directory modifications are synchronous */ 230 #define S_DIRSYNC 64 /* Directory modifications are synchronous */
231 #define S_NOCMTIME 128 /* Do not update file c/mtime */ 231 #define S_NOCMTIME 128 /* Do not update file c/mtime */
232 #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ 232 #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
233 #define S_PRIVATE 512 /* Inode is fs-internal */ 233 #define S_PRIVATE 512 /* Inode is fs-internal */
234 234
235 /* 235 /*
236 * Note that nosuid etc flags are inode-specific: setting some file-system 236 * Note that nosuid etc flags are inode-specific: setting some file-system
237 * flags just means all the inodes inherit those flags by default. It might be 237 * flags just means all the inodes inherit those flags by default. It might be
238 * possible to override it selectively if you really wanted to with some 238 * possible to override it selectively if you really wanted to with some
239 * ioctl() that is not currently implemented. 239 * ioctl() that is not currently implemented.
240 * 240 *
241 * Exception: MS_RDONLY is always applied to the entire file system. 241 * Exception: MS_RDONLY is always applied to the entire file system.
242 * 242 *
243 * Unfortunately, it is possible to change a filesystem's flags while it is 243 * Unfortunately, it is possible to change a filesystem's flags while it is
244 * mounted with files in use. This means that the inodes will not all have their 244 * mounted with files in use. This means that the inodes will not all have their
245 * i_flags updated. Hence, i_flags no longer inherit the superblock mount 245 * i_flags updated. Hence, i_flags no longer inherit the superblock mount
246 * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org 246 * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
247 */ 247 */
248 #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) 248 #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg))
249 249
250 #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) 250 #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
251 #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ 251 #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \
252 ((inode)->i_flags & S_SYNC)) 252 ((inode)->i_flags & S_SYNC))
253 #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ 253 #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
254 ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) 254 ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
255 #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) 255 #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)
256 #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) 256 #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME)
257 #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) 257 #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION)
258 258
259 #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) 259 #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
260 #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) 260 #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND)
261 #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) 261 #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE)
262 #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) 262 #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL)
263 263
264 #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) 264 #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
265 #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) 265 #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
266 #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) 266 #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
267 #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) 267 #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
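/*
 * Hedged sketch of a typical guard built on these predicates, similar
 * to the check on the truncate path: refuse to shrink an active swap
 * file.
 *
 *	if (IS_SWAPFILE(inode))
 *		return -ETXTBSY;
 */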
268 268
269 /* the read-only stuff doesn't really belong here, but any other place is 269 /* the read-only stuff doesn't really belong here, but any other place is
270 probably as bad and I don't want to create yet another include file. */ 270 probably as bad and I don't want to create yet another include file. */
271 271
272 #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ 272 #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */
273 #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ 273 #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */
274 #define BLKRRPART _IO(0x12,95) /* re-read partition table */ 274 #define BLKRRPART _IO(0x12,95) /* re-read partition table */
275 #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ 275 #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */
276 #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ 276 #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */
277 #define BLKRASET _IO(0x12,98) /* set read ahead for block device */ 277 #define BLKRASET _IO(0x12,98) /* set read ahead for block device */
278 #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ 278 #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */
279 #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ 279 #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
280 #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ 280 #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
281 #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ 281 #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
282 #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ 282 #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
283 #define BLKSSZGET _IO(0x12,104)/* get block device sector size */ 283 #define BLKSSZGET _IO(0x12,104)/* get block device sector size */
284 #if 0 284 #if 0
285 #define BLKPG _IO(0x12,105)/* See blkpg.h */ 285 #define BLKPG _IO(0x12,105)/* See blkpg.h */
286 286
287 /* Some people are morons. Do not use sizeof! */ 287 /* Some people are morons. Do not use sizeof! */
288 288
289 #define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ 289 #define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */
290 #define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ 290 #define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */
291 /* This was here just to show that the number is taken - 291 /* This was here just to show that the number is taken -
292 probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ 292 probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
293 #endif 293 #endif
294 /* A jump here: 108-111 have been used for various private purposes. */ 294 /* A jump here: 108-111 have been used for various private purposes. */
295 #define BLKBSZGET _IOR(0x12,112,size_t) 295 #define BLKBSZGET _IOR(0x12,112,size_t)
296 #define BLKBSZSET _IOW(0x12,113,size_t) 296 #define BLKBSZSET _IOW(0x12,113,size_t)
297 #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ 297 #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */
298 #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) 298 #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
299 #define BLKTRACESTART _IO(0x12,116) 299 #define BLKTRACESTART _IO(0x12,116)
300 #define BLKTRACESTOP _IO(0x12,117) 300 #define BLKTRACESTOP _IO(0x12,117)
301 #define BLKTRACETEARDOWN _IO(0x12,118) 301 #define BLKTRACETEARDOWN _IO(0x12,118)
302 #define BLKDISCARD _IO(0x12,119) 302 #define BLKDISCARD _IO(0x12,119)
303 303
304 #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ 304 #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
305 #define FIBMAP _IO(0x00,1) /* bmap access */ 305 #define FIBMAP _IO(0x00,1) /* bmap access */
306 #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ 306 #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
307 #define FIFREEZE _IOWR('X', 119, int) /* Freeze */ 307 #define FIFREEZE _IOWR('X', 119, int) /* Freeze */
308 #define FITHAW _IOWR('X', 120, int) /* Thaw */ 308 #define FITHAW _IOWR('X', 120, int) /* Thaw */
309 309
310 #define FS_IOC_GETFLAGS _IOR('f', 1, long) 310 #define FS_IOC_GETFLAGS _IOR('f', 1, long)
311 #define FS_IOC_SETFLAGS _IOW('f', 2, long) 311 #define FS_IOC_SETFLAGS _IOW('f', 2, long)
312 #define FS_IOC_GETVERSION _IOR('v', 1, long) 312 #define FS_IOC_GETVERSION _IOR('v', 1, long)
313 #define FS_IOC_SETVERSION _IOW('v', 2, long) 313 #define FS_IOC_SETVERSION _IOW('v', 2, long)
314 #define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap) 314 #define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap)
315 #define FS_IOC32_GETFLAGS _IOR('f', 1, int) 315 #define FS_IOC32_GETFLAGS _IOR('f', 1, int)
316 #define FS_IOC32_SETFLAGS _IOW('f', 2, int) 316 #define FS_IOC32_SETFLAGS _IOW('f', 2, int)
317 #define FS_IOC32_GETVERSION _IOR('v', 1, int) 317 #define FS_IOC32_GETVERSION _IOR('v', 1, int)
318 #define FS_IOC32_SETVERSION _IOW('v', 2, int) 318 #define FS_IOC32_SETVERSION _IOW('v', 2, int)
319 319
320 /* 320 /*
321 * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) 321 * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
322 */ 322 */
323 #define FS_SECRM_FL 0x00000001 /* Secure deletion */ 323 #define FS_SECRM_FL 0x00000001 /* Secure deletion */
324 #define FS_UNRM_FL 0x00000002 /* Undelete */ 324 #define FS_UNRM_FL 0x00000002 /* Undelete */
325 #define FS_COMPR_FL 0x00000004 /* Compress file */ 325 #define FS_COMPR_FL 0x00000004 /* Compress file */
326 #define FS_SYNC_FL 0x00000008 /* Synchronous updates */ 326 #define FS_SYNC_FL 0x00000008 /* Synchronous updates */
327 #define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ 327 #define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */
328 #define FS_APPEND_FL 0x00000020 /* writes to file may only append */ 328 #define FS_APPEND_FL 0x00000020 /* writes to file may only append */
329 #define FS_NODUMP_FL 0x00000040 /* do not dump file */ 329 #define FS_NODUMP_FL 0x00000040 /* do not dump file */
330 #define FS_NOATIME_FL 0x00000080 /* do not update atime */ 330 #define FS_NOATIME_FL 0x00000080 /* do not update atime */
331 /* Reserved for compression usage... */ 331 /* Reserved for compression usage... */
332 #define FS_DIRTY_FL 0x00000100 332 #define FS_DIRTY_FL 0x00000100
333 #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ 333 #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
334 #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ 334 #define FS_NOCOMP_FL 0x00000400 /* Don't compress */
335 #define FS_ECOMPR_FL 0x00000800 /* Compression error */ 335 #define FS_ECOMPR_FL 0x00000800 /* Compression error */
336 /* End compression flags --- maybe not all used */ 336 /* End compression flags --- maybe not all used */
337 #define FS_BTREE_FL 0x00001000 /* btree format dir */ 337 #define FS_BTREE_FL 0x00001000 /* btree format dir */
338 #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ 338 #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */
339 #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ 339 #define FS_IMAGIC_FL 0x00002000 /* AFS directory */
340 #define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */ 340 #define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */
341 #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ 341 #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */
342 #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ 342 #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
343 #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 343 #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
344 #define FS_EXTENT_FL 0x00080000 /* Extents */ 344 #define FS_EXTENT_FL 0x00080000 /* Extents */
345 #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ 345 #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */
346 #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ 346 #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */
347 347
348 #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ 348 #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
349 #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ 349 #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
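A hedged userspace sketch of the FS_IOC_GETFLAGS/FS_IOC_SETFLAGS round trip that tools like chattr(1) perform; set_noatime() is a hypothetical helper, and note that despite the long in the ioctl definitions above, the kernel reads and writes an int:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/fs.h>

	static int set_noatime(const char *path)
	{
		int flags;	/* kernel treats the argument as an int */
		int ret = -1;
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			return -1;
		/* read-modify-write: only FS_FL_USER_MODIFIABLE bits may change */
		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
			flags |= FS_NOATIME_FL;
			ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
		}
		close(fd);
		return ret;
	}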
350 350
351 351
352 #define SYNC_FILE_RANGE_WAIT_BEFORE 1 352 #define SYNC_FILE_RANGE_WAIT_BEFORE 1
353 #define SYNC_FILE_RANGE_WRITE 2 353 #define SYNC_FILE_RANGE_WRITE 2
354 #define SYNC_FILE_RANGE_WAIT_AFTER 4 354 #define SYNC_FILE_RANGE_WAIT_AFTER 4
355 355
356 #ifdef __KERNEL__ 356 #ifdef __KERNEL__
357 357
358 #include <linux/linkage.h> 358 #include <linux/linkage.h>
359 #include <linux/wait.h> 359 #include <linux/wait.h>
360 #include <linux/types.h> 360 #include <linux/types.h>
361 #include <linux/kdev_t.h> 361 #include <linux/kdev_t.h>
362 #include <linux/dcache.h> 362 #include <linux/dcache.h>
363 #include <linux/path.h> 363 #include <linux/path.h>
364 #include <linux/stat.h> 364 #include <linux/stat.h>
365 #include <linux/cache.h> 365 #include <linux/cache.h>
366 #include <linux/kobject.h> 366 #include <linux/kobject.h>
367 #include <linux/list.h> 367 #include <linux/list.h>
368 #include <linux/radix-tree.h> 368 #include <linux/radix-tree.h>
369 #include <linux/prio_tree.h> 369 #include <linux/prio_tree.h>
370 #include <linux/init.h> 370 #include <linux/init.h>
371 #include <linux/pid.h> 371 #include <linux/pid.h>
372 #include <linux/mutex.h> 372 #include <linux/mutex.h>
373 #include <linux/capability.h> 373 #include <linux/capability.h>
374 #include <linux/semaphore.h> 374 #include <linux/semaphore.h>
375 #include <linux/fiemap.h> 375 #include <linux/fiemap.h>
376 376
377 #include <asm/atomic.h> 377 #include <asm/atomic.h>
378 #include <asm/byteorder.h> 378 #include <asm/byteorder.h>
379 379
380 struct export_operations; 380 struct export_operations;
381 struct hd_geometry; 381 struct hd_geometry;
382 struct iovec; 382 struct iovec;
383 struct nameidata; 383 struct nameidata;
384 struct kiocb; 384 struct kiocb;
385 struct pipe_inode_info; 385 struct pipe_inode_info;
386 struct poll_table_struct; 386 struct poll_table_struct;
387 struct kstatfs; 387 struct kstatfs;
388 struct vm_area_struct; 388 struct vm_area_struct;
389 struct vfsmount; 389 struct vfsmount;
390 struct cred; 390 struct cred;
391 391
392 extern void __init inode_init(void); 392 extern void __init inode_init(void);
393 extern void __init inode_init_early(void); 393 extern void __init inode_init_early(void);
394 extern void __init files_init(unsigned long); 394 extern void __init files_init(unsigned long);
395 395
396 extern struct files_stat_struct files_stat; 396 extern struct files_stat_struct files_stat;
397 extern int get_max_files(void); 397 extern int get_max_files(void);
398 extern int sysctl_nr_open; 398 extern int sysctl_nr_open;
399 extern struct inodes_stat_t inodes_stat; 399 extern struct inodes_stat_t inodes_stat;
400 extern int leases_enable, lease_break_time; 400 extern int leases_enable, lease_break_time;
401 #ifdef CONFIG_DNOTIFY 401 #ifdef CONFIG_DNOTIFY
402 extern int dir_notify_enable; 402 extern int dir_notify_enable;
403 #endif 403 #endif
404 404
405 struct buffer_head; 405 struct buffer_head;
406 typedef int (get_block_t)(struct inode *inode, sector_t iblock, 406 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
407 struct buffer_head *bh_result, int create); 407 struct buffer_head *bh_result, int create);
408 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, 408 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
409 ssize_t bytes, void *private); 409 ssize_t bytes, void *private);
410 410
411 /* 411 /*
412 * Attribute flags. These should be or-ed together to figure out what 412 * Attribute flags. These should be or-ed together to figure out what
413 * has been changed! 413 * has been changed!
414 */ 414 */
415 #define ATTR_MODE (1 << 0) 415 #define ATTR_MODE (1 << 0)
416 #define ATTR_UID (1 << 1) 416 #define ATTR_UID (1 << 1)
417 #define ATTR_GID (1 << 2) 417 #define ATTR_GID (1 << 2)
418 #define ATTR_SIZE (1 << 3) 418 #define ATTR_SIZE (1 << 3)
419 #define ATTR_ATIME (1 << 4) 419 #define ATTR_ATIME (1 << 4)
420 #define ATTR_MTIME (1 << 5) 420 #define ATTR_MTIME (1 << 5)
421 #define ATTR_CTIME (1 << 6) 421 #define ATTR_CTIME (1 << 6)
422 #define ATTR_ATIME_SET (1 << 7) 422 #define ATTR_ATIME_SET (1 << 7)
423 #define ATTR_MTIME_SET (1 << 8) 423 #define ATTR_MTIME_SET (1 << 8)
424 #define ATTR_FORCE	(1 << 9) /* Not a change, but force the change through */ 424 #define ATTR_FORCE	(1 << 9) /* Not a change, but force the change through */
425 #define ATTR_ATTR_FLAG (1 << 10) 425 #define ATTR_ATTR_FLAG (1 << 10)
426 #define ATTR_KILL_SUID (1 << 11) 426 #define ATTR_KILL_SUID (1 << 11)
427 #define ATTR_KILL_SGID (1 << 12) 427 #define ATTR_KILL_SGID (1 << 12)
428 #define ATTR_FILE (1 << 13) 428 #define ATTR_FILE (1 << 13)
429 #define ATTR_KILL_PRIV (1 << 14) 429 #define ATTR_KILL_PRIV (1 << 14)
430 #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ 430 #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */
431 #define ATTR_TIMES_SET (1 << 16) 431 #define ATTR_TIMES_SET (1 << 16)
432 432
433 /* 433 /*
434 * This is the Inode Attributes structure, used for notify_change(). It 434 * This is the Inode Attributes structure, used for notify_change(). It
435 * uses the above definitions as flags, to know which values have changed. 435 * uses the above definitions as flags, to know which values have changed.
436 * Also, in this manner, a filesystem can look at only the values it cares 436 * Also, in this manner, a filesystem can look at only the values it cares
437 * about. Basically, these are the attributes that the VFS layer can 437 * about. Basically, these are the attributes that the VFS layer can
438 * request to change from the FS layer. 438 * request to change from the FS layer.
439 * 439 *
440 * Derek Atkins <warlord@MIT.EDU> 94-10-20 440 * Derek Atkins <warlord@MIT.EDU> 94-10-20
441 */ 441 */
442 struct iattr { 442 struct iattr {
443 unsigned int ia_valid; 443 unsigned int ia_valid;
444 umode_t ia_mode; 444 umode_t ia_mode;
445 uid_t ia_uid; 445 uid_t ia_uid;
446 gid_t ia_gid; 446 gid_t ia_gid;
447 loff_t ia_size; 447 loff_t ia_size;
448 struct timespec ia_atime; 448 struct timespec ia_atime;
449 struct timespec ia_mtime; 449 struct timespec ia_mtime;
450 struct timespec ia_ctime; 450 struct timespec ia_ctime;
451 451
452 /* 452 /*
453 	 * Not an attribute, but auxiliary info for filesystems wanting to 453 	 * Not an attribute, but auxiliary info for filesystems wanting to
454 	 * implement an ftruncate()-like method.  NOTE: the filesystem should 454 	 * implement an ftruncate()-like method.  NOTE: the filesystem should
455 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). 455 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
456 */ 456 */
457 struct file *ia_file; 457 struct file *ia_file;
458 }; 458 };
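A minimal in-kernel sketch of how the ATTR_* bits and struct iattr combine in a notify_change() call; truncate_to_zero() is a hypothetical helper, and the i_mutex locking follows the usual notify_change() convention:

	static int truncate_to_zero(struct dentry *dentry)
	{
		struct inode *inode = dentry->d_inode;
		struct iattr attr = {
			.ia_valid = ATTR_SIZE | ATTR_MTIME | ATTR_CTIME,
			.ia_size  = 0,
			.ia_mtime = current_fs_time(inode->i_sb),
			.ia_ctime = current_fs_time(inode->i_sb),
		};
		int error;

		mutex_lock(&inode->i_mutex);
		error = notify_change(dentry, &attr);	/* VFS -> FS attribute change */
		mutex_unlock(&inode->i_mutex);
		return error;
	}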
459 459
460 /* 460 /*
461 * Includes for diskquotas. 461 * Includes for diskquotas.
462 */ 462 */
463 #include <linux/quota.h> 463 #include <linux/quota.h>
464 464
465 /** 465 /**
466 * enum positive_aop_returns - aop return codes with specific semantics 466 * enum positive_aop_returns - aop return codes with specific semantics
467 * 467 *
468 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has 468 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
469 * completed, that the page is still locked, and 469 * completed, that the page is still locked, and
470 * should be considered active. The VM uses this hint 470 * should be considered active. The VM uses this hint
471 * to return the page to the active list -- it won't 471 * to return the page to the active list -- it won't
472 * be a candidate for writeback again in the near 472 * be a candidate for writeback again in the near
473 * future. Other callers must be careful to unlock 473 * future. Other callers must be careful to unlock
474 * the page if they get this return. Returned by 474 * the page if they get this return. Returned by
475 * writepage(); 475 * writepage();
476 * 476 *
477 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has 477 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
478 * unlocked it and the page might have been truncated. 478 * unlocked it and the page might have been truncated.
479 * The caller should back up to acquiring a new page and 479 * The caller should back up to acquiring a new page and
480 * trying again. The aop will be taking reasonable 480 * trying again. The aop will be taking reasonable
481 * precautions not to livelock. If the caller held a page 481 * precautions not to livelock. If the caller held a page
482 * reference, it should drop it before retrying. Returned 482 * reference, it should drop it before retrying. Returned
483 * by readpage(). 483 * by readpage().
484 * 484 *
485 * address_space_operation functions return these large constants to indicate 485 * address_space_operation functions return these large constants to indicate
486 * special semantics to the caller. These are much larger than the bytes in a 486 * special semantics to the caller. These are much larger than the bytes in a
487 * page to allow for functions that return the number of bytes operated on in a 487 * page to allow for functions that return the number of bytes operated on in a
488 * given page. 488 * given page.
489 */ 489 */
490 490
491 enum positive_aop_returns { 491 enum positive_aop_returns {
492 AOP_WRITEPAGE_ACTIVATE = 0x80000, 492 AOP_WRITEPAGE_ACTIVATE = 0x80000,
493 AOP_TRUNCATED_PAGE = 0x80001, 493 AOP_TRUNCATED_PAGE = 0x80001,
494 }; 494 };
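A hedged caller-side sketch of the AOP_TRUNCATED_PAGE protocol described above, modelled loosely on callers such as fs/splice.c; read_one_page() is a hypothetical helper, and it skips the wait-for-uptodate handling a real caller needs after ->readpage() returns 0:

	static struct page *read_one_page(struct file *file,
					  struct address_space *mapping,
					  pgoff_t index)
	{
		struct page *page;
		int error;
	retry:
		page = find_or_create_page(mapping, index, GFP_KERNEL);
		if (!page)
			return ERR_PTR(-ENOMEM);
		error = mapping->a_ops->readpage(file, page);
		if (error == AOP_TRUNCATED_PAGE) {
			/* the aop already unlocked the page: drop our
			 * reference and start again with a fresh page */
			page_cache_release(page);
			goto retry;
		}
		if (error) {
			page_cache_release(page);
			return ERR_PTR(error);
		}
		return page;
	}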
495 495
496 #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ 496 #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */
497 #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ 497 #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */
498 #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct 498 #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct
499 					    * helper code (e.g. buffer layer) 499 					    * helper code (e.g. buffer layer)
500 * to clear GFP_FS from alloc */ 500 * to clear GFP_FS from alloc */
501 501
502 /* 502 /*
503 * oh the beauties of C type declarations. 503 * oh the beauties of C type declarations.
504 */ 504 */
505 struct page; 505 struct page;
506 struct address_space; 506 struct address_space;
507 struct writeback_control; 507 struct writeback_control;
508 508
509 struct iov_iter { 509 struct iov_iter {
510 const struct iovec *iov; 510 const struct iovec *iov;
511 unsigned long nr_segs; 511 unsigned long nr_segs;
512 size_t iov_offset; 512 size_t iov_offset;
513 size_t count; 513 size_t count;
514 }; 514 };
515 515
516 size_t iov_iter_copy_from_user_atomic(struct page *page, 516 size_t iov_iter_copy_from_user_atomic(struct page *page,
517 struct iov_iter *i, unsigned long offset, size_t bytes); 517 struct iov_iter *i, unsigned long offset, size_t bytes);
518 size_t iov_iter_copy_from_user(struct page *page, 518 size_t iov_iter_copy_from_user(struct page *page,
519 struct iov_iter *i, unsigned long offset, size_t bytes); 519 struct iov_iter *i, unsigned long offset, size_t bytes);
520 void iov_iter_advance(struct iov_iter *i, size_t bytes); 520 void iov_iter_advance(struct iov_iter *i, size_t bytes);
521 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); 521 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
522 size_t iov_iter_single_seg_count(struct iov_iter *i); 522 size_t iov_iter_single_seg_count(struct iov_iter *i);
523 523
524 static inline void iov_iter_init(struct iov_iter *i, 524 static inline void iov_iter_init(struct iov_iter *i,
525 const struct iovec *iov, unsigned long nr_segs, 525 const struct iovec *iov, unsigned long nr_segs,
526 size_t count, size_t written) 526 size_t count, size_t written)
527 { 527 {
528 i->iov = iov; 528 i->iov = iov;
529 i->nr_segs = nr_segs; 529 i->nr_segs = nr_segs;
530 i->iov_offset = 0; 530 i->iov_offset = 0;
531 i->count = count + written; 531 i->count = count + written;
532 532
533 iov_iter_advance(i, written); 533 iov_iter_advance(i, written);
534 } 534 }
535 535
536 static inline size_t iov_iter_count(struct iov_iter *i) 536 static inline size_t iov_iter_count(struct iov_iter *i)
537 { 537 {
538 return i->count; 538 return i->count;
539 } 539 }
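A hedged fragment showing the intended iov_iter life cycle; page, offset and bytes are assumptions standing in for real write-path state, as in generic_file_buffered_write():

	struct iov_iter i;
	size_t copied;

	iov_iter_init(&i, iov, nr_segs, count, 0);
	while (iov_iter_count(&i)) {
		/* copy as much of the current segment as fits in the page */
		copied = iov_iter_copy_from_user(page, &i, offset, bytes);
		iov_iter_advance(&i, copied);	/* may step to the next iovec */
	}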
540 540
541 /* 541 /*
542 * "descriptor" for what we're up to with a read. 542 * "descriptor" for what we're up to with a read.
543 * This allows us to use the same read code yet 543 * This allows us to use the same read code yet
544 * have multiple different users of the data that 544 * have multiple different users of the data that
545 * we read from a file. 545 * we read from a file.
546 * 546 *
547 * The simplest case just copies the data to user 547 * The simplest case just copies the data to user
548 * mode. 548 * mode.
549 */ 549 */
550 typedef struct { 550 typedef struct {
551 size_t written; 551 size_t written;
552 size_t count; 552 size_t count;
553 union { 553 union {
554 char __user *buf; 554 char __user *buf;
555 void *data; 555 void *data;
556 } arg; 556 } arg;
557 int error; 557 int error;
558 } read_descriptor_t; 558 } read_descriptor_t;
559 559
560 typedef int (*read_actor_t)(read_descriptor_t *, struct page *, 560 typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
561 unsigned long, unsigned long); 561 unsigned long, unsigned long);
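A hedged sketch of a read_actor_t, modelled on the kernel's own file_read_actor(): it copies page contents to the user buffer carried in the descriptor and accounts for short copies (my_read_actor is a hypothetical name):

	static int my_read_actor(read_descriptor_t *desc, struct page *page,
				 unsigned long offset, unsigned long size)
	{
		unsigned long left, count = desc->count;
		char *kaddr;

		if (size > count)
			size = count;
		kaddr = kmap(page);
		left = copy_to_user(desc->arg.buf, kaddr + offset, size);
		kunmap(page);
		if (left) {
			size -= left;
			desc->error = -EFAULT;	/* partial copy: report and stop */
		}
		desc->count = count - size;
		desc->written += size;
		desc->arg.buf += size;
		return size;
	}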
562 562
563 struct address_space_operations { 563 struct address_space_operations {
564 int (*writepage)(struct page *page, struct writeback_control *wbc); 564 int (*writepage)(struct page *page, struct writeback_control *wbc);
565 int (*readpage)(struct file *, struct page *); 565 int (*readpage)(struct file *, struct page *);
566 void (*sync_page)(struct page *); 566 void (*sync_page)(struct page *);
567 567
568 /* Write back some dirty pages from this mapping. */ 568 /* Write back some dirty pages from this mapping. */
569 int (*writepages)(struct address_space *, struct writeback_control *); 569 int (*writepages)(struct address_space *, struct writeback_control *);
570 570
571 /* Set a page dirty. Return true if this dirtied it */ 571 /* Set a page dirty. Return true if this dirtied it */
572 int (*set_page_dirty)(struct page *page); 572 int (*set_page_dirty)(struct page *page);
573 573
574 int (*readpages)(struct file *filp, struct address_space *mapping, 574 int (*readpages)(struct file *filp, struct address_space *mapping,
575 struct list_head *pages, unsigned nr_pages); 575 struct list_head *pages, unsigned nr_pages);
576 576
577 int (*write_begin)(struct file *, struct address_space *mapping, 577 int (*write_begin)(struct file *, struct address_space *mapping,
578 loff_t pos, unsigned len, unsigned flags, 578 loff_t pos, unsigned len, unsigned flags,
579 struct page **pagep, void **fsdata); 579 struct page **pagep, void **fsdata);
580 int (*write_end)(struct file *, struct address_space *mapping, 580 int (*write_end)(struct file *, struct address_space *mapping,
581 loff_t pos, unsigned len, unsigned copied, 581 loff_t pos, unsigned len, unsigned copied,
582 struct page *page, void *fsdata); 582 struct page *page, void *fsdata);
583 583
584 /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ 584 /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
585 sector_t (*bmap)(struct address_space *, sector_t); 585 sector_t (*bmap)(struct address_space *, sector_t);
586 void (*invalidatepage) (struct page *, unsigned long); 586 void (*invalidatepage) (struct page *, unsigned long);
587 int (*releasepage) (struct page *, gfp_t); 587 int (*releasepage) (struct page *, gfp_t);
588 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 588 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
589 loff_t offset, unsigned long nr_segs); 589 loff_t offset, unsigned long nr_segs);
590 int (*get_xip_mem)(struct address_space *, pgoff_t, int, 590 int (*get_xip_mem)(struct address_space *, pgoff_t, int,
591 void **, unsigned long *); 591 void **, unsigned long *);
592 /* migrate the contents of a page to the specified target */ 592 /* migrate the contents of a page to the specified target */
593 int (*migratepage) (struct address_space *, 593 int (*migratepage) (struct address_space *,
594 struct page *, struct page *); 594 struct page *, struct page *);
595 int (*launder_page) (struct page *); 595 int (*launder_page) (struct page *);
596 int (*is_partially_uptodate) (struct page *, read_descriptor_t *, 596 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
597 unsigned long); 597 unsigned long);
598 }; 598 };
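A minimal sketch of wiring these operations up for a simple block-backed filesystem, assuming a hypothetical myfs_get_block() that maps file blocks to disk blocks; the buffer-layer helpers shown (block_read_full_page() and friends) are the stock implementations such filesystems usually delegate to:

	static int myfs_get_block(struct inode *inode, sector_t iblock,
				  struct buffer_head *bh_result, int create);

	static int myfs_readpage(struct file *file, struct page *page)
	{
		return block_read_full_page(page, myfs_get_block);
	}

	static int myfs_writepage(struct page *page, struct writeback_control *wbc)
	{
		return block_write_full_page(page, myfs_get_block, wbc);
	}

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned flags,
				    struct page **pagep, void **fsdata)
	{
		*pagep = NULL;
		return block_write_begin(file, mapping, pos, len, flags,
					 pagep, fsdata, myfs_get_block);
	}

	static const struct address_space_operations myfs_aops = {
		.readpage	= myfs_readpage,
		.writepage	= myfs_writepage,
		.sync_page	= block_sync_page,
		.write_begin	= myfs_write_begin,
		.write_end	= generic_write_end,
	};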
599 599
600 /* 600 /*
601 * pagecache_write_begin/pagecache_write_end must be used by general code 601 * pagecache_write_begin/pagecache_write_end must be used by general code
602 * to write into the pagecache. 602 * to write into the pagecache.
603 */ 603 */
604 int pagecache_write_begin(struct file *, struct address_space *mapping, 604 int pagecache_write_begin(struct file *, struct address_space *mapping,
605 loff_t pos, unsigned len, unsigned flags, 605 loff_t pos, unsigned len, unsigned flags,
606 struct page **pagep, void **fsdata); 606 struct page **pagep, void **fsdata);
607 607
608 int pagecache_write_end(struct file *, struct address_space *mapping, 608 int pagecache_write_end(struct file *, struct address_space *mapping,
609 loff_t pos, unsigned len, unsigned copied, 609 loff_t pos, unsigned len, unsigned copied,
610 struct page *page, void *fsdata); 610 struct page *page, void *fsdata);
611 611
612 struct backing_dev_info; 612 struct backing_dev_info;
613 struct address_space { 613 struct address_space {
614 struct inode *host; /* owner: inode, block_device */ 614 struct inode *host; /* owner: inode, block_device */
615 struct radix_tree_root page_tree; /* radix tree of all pages */ 615 struct radix_tree_root page_tree; /* radix tree of all pages */
616 spinlock_t tree_lock; /* and lock protecting it */ 616 spinlock_t tree_lock; /* and lock protecting it */
617 unsigned int i_mmap_writable;/* count VM_SHARED mappings */ 617 unsigned int i_mmap_writable;/* count VM_SHARED mappings */
618 struct prio_tree_root i_mmap; /* tree of private and shared mappings */ 618 struct prio_tree_root i_mmap; /* tree of private and shared mappings */
619 	struct list_head	i_mmap_nonlinear;/* list VM_NONLINEAR mappings */ 619 	struct list_head	i_mmap_nonlinear;/* list VM_NONLINEAR mappings */
620 spinlock_t i_mmap_lock; /* protect tree, count, list */ 620 spinlock_t i_mmap_lock; /* protect tree, count, list */
621 unsigned int truncate_count; /* Cover race condition with truncate */ 621 unsigned int truncate_count; /* Cover race condition with truncate */
622 unsigned long nrpages; /* number of total pages */ 622 unsigned long nrpages; /* number of total pages */
623 pgoff_t writeback_index;/* writeback starts here */ 623 pgoff_t writeback_index;/* writeback starts here */
624 const struct address_space_operations *a_ops; /* methods */ 624 const struct address_space_operations *a_ops; /* methods */
625 unsigned long flags; /* error bits/gfp mask */ 625 unsigned long flags; /* error bits/gfp mask */
626 struct backing_dev_info *backing_dev_info; /* device readahead, etc */ 626 struct backing_dev_info *backing_dev_info; /* device readahead, etc */
627 spinlock_t private_lock; /* for use by the address_space */ 627 spinlock_t private_lock; /* for use by the address_space */
628 struct list_head private_list; /* ditto */ 628 struct list_head private_list; /* ditto */
629 struct address_space *assoc_mapping; /* ditto */ 629 struct address_space *assoc_mapping; /* ditto */
630 } __attribute__((aligned(sizeof(long)))); 630 } __attribute__((aligned(sizeof(long))));
631 /* 631 /*
632 * On most architectures that alignment is already the case; but 632 * On most architectures that alignment is already the case; but
633  * must be enforced here for CRIS, to let the least significant bit 633  * must be enforced here for CRIS, to let the least significant bit
634 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. 634 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
635 */ 635 */
636 636
637 struct block_device { 637 struct block_device {
638 dev_t bd_dev; /* not a kdev_t - it's a search key */ 638 dev_t bd_dev; /* not a kdev_t - it's a search key */
639 struct inode * bd_inode; /* will die */ 639 struct inode * bd_inode; /* will die */
640 struct super_block * bd_super; 640 struct super_block * bd_super;
641 int bd_openers; 641 int bd_openers;
642 struct mutex bd_mutex; /* open/close mutex */ 642 struct mutex bd_mutex; /* open/close mutex */
643 struct semaphore bd_mount_sem; 643 struct semaphore bd_mount_sem;
644 struct list_head bd_inodes; 644 struct list_head bd_inodes;
645 void * bd_holder; 645 void * bd_holder;
646 int bd_holders; 646 int bd_holders;
647 #ifdef CONFIG_SYSFS 647 #ifdef CONFIG_SYSFS
648 struct list_head bd_holder_list; 648 struct list_head bd_holder_list;
649 #endif 649 #endif
650 struct block_device * bd_contains; 650 struct block_device * bd_contains;
651 unsigned bd_block_size; 651 unsigned bd_block_size;
652 struct hd_struct * bd_part; 652 struct hd_struct * bd_part;
653 /* number of times partitions within this device have been opened. */ 653 /* number of times partitions within this device have been opened. */
654 unsigned bd_part_count; 654 unsigned bd_part_count;
655 int bd_invalidated; 655 int bd_invalidated;
656 struct gendisk * bd_disk; 656 struct gendisk * bd_disk;
657 struct list_head bd_list; 657 struct list_head bd_list;
658 struct backing_dev_info *bd_inode_backing_dev_info; 658 struct backing_dev_info *bd_inode_backing_dev_info;
659 /* 659 /*
660 * Private data. You must have bd_claim'ed the block_device 660 * Private data. You must have bd_claim'ed the block_device
661 * to use this. NOTE: bd_claim allows an owner to claim 661 * to use this. NOTE: bd_claim allows an owner to claim
662 	 * the same device multiple times; the owner must take special 662 	 * the same device multiple times; the owner must take special
663 * care to not mess up bd_private for that case. 663 * care to not mess up bd_private for that case.
664 */ 664 */
665 unsigned long bd_private; 665 unsigned long bd_private;
666 666
667 /* The counter of freeze processes */ 667 /* The counter of freeze processes */
668 int bd_fsfreeze_count; 668 int bd_fsfreeze_count;
669 /* Mutex for freeze */ 669 /* Mutex for freeze */
670 struct mutex bd_fsfreeze_mutex; 670 struct mutex bd_fsfreeze_mutex;
671 }; 671 };
672 672
673 /* 673 /*
674 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache 674 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
675 * radix trees 675 * radix trees
676 */ 676 */
677 #define PAGECACHE_TAG_DIRTY 0 677 #define PAGECACHE_TAG_DIRTY 0
678 #define PAGECACHE_TAG_WRITEBACK 1 678 #define PAGECACHE_TAG_WRITEBACK 1
679 679
680 int mapping_tagged(struct address_space *mapping, int tag); 680 int mapping_tagged(struct address_space *mapping, int tag);
681 681
682 /* 682 /*
683 * Might pages of this file be mapped into userspace? 683 * Might pages of this file be mapped into userspace?
684 */ 684 */
685 static inline int mapping_mapped(struct address_space *mapping) 685 static inline int mapping_mapped(struct address_space *mapping)
686 { 686 {
687 return !prio_tree_empty(&mapping->i_mmap) || 687 return !prio_tree_empty(&mapping->i_mmap) ||
688 !list_empty(&mapping->i_mmap_nonlinear); 688 !list_empty(&mapping->i_mmap_nonlinear);
689 } 689 }
690 690
691 /* 691 /*
692 * Might pages of this file have been modified in userspace? 692 * Might pages of this file have been modified in userspace?
693 * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff 693 * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff
694 * marks vma as VM_SHARED if it is shared, and the file was opened for 694 * marks vma as VM_SHARED if it is shared, and the file was opened for
695  * writing, i.e. the vma may be mprotect()ed writable even if now read-only. 695  * writing, i.e. the vma may be mprotect()ed writable even if now read-only.
696 */ 696 */
697 static inline int mapping_writably_mapped(struct address_space *mapping) 697 static inline int mapping_writably_mapped(struct address_space *mapping)
698 { 698 {
699 return mapping->i_mmap_writable != 0; 699 return mapping->i_mmap_writable != 0;
700 } 700 }
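A hedged fragment showing the read-side use of this helper, mirroring do_generic_file_read(): if the file may be writably mapped, flush the page before copying so stores done through userspace mappings are visible:

	if (mapping_writably_mapped(mapping))
		flush_dcache_page(page);	/* see stores made via mmap */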
701 701
702 /* 702 /*
703 * Use sequence counter to get consistent i_size on 32-bit processors. 703 * Use sequence counter to get consistent i_size on 32-bit processors.
704 */ 704 */
705 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 705 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
706 #include <linux/seqlock.h> 706 #include <linux/seqlock.h>
707 #define __NEED_I_SIZE_ORDERED 707 #define __NEED_I_SIZE_ORDERED
708 #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) 708 #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
709 #else 709 #else
710 #define i_size_ordered_init(inode) do { } while (0) 710 #define i_size_ordered_init(inode) do { } while (0)
711 #endif 711 #endif
712 712
713 struct posix_acl; 713 struct posix_acl;
714 #define ACL_NOT_CACHED ((void *)(-1)) 714 #define ACL_NOT_CACHED ((void *)(-1))
715 715
716 struct inode { 716 struct inode {
717 struct hlist_node i_hash; 717 struct hlist_node i_hash;
718 struct list_head i_list; 718 struct list_head i_list;
719 struct list_head i_sb_list; 719 struct list_head i_sb_list;
720 struct list_head i_dentry; 720 struct list_head i_dentry;
721 unsigned long i_ino; 721 unsigned long i_ino;
722 atomic_t i_count; 722 atomic_t i_count;
723 unsigned int i_nlink; 723 unsigned int i_nlink;
724 uid_t i_uid; 724 uid_t i_uid;
725 gid_t i_gid; 725 gid_t i_gid;
726 dev_t i_rdev; 726 dev_t i_rdev;
727 u64 i_version; 727 u64 i_version;
728 loff_t i_size; 728 loff_t i_size;
729 #ifdef __NEED_I_SIZE_ORDERED 729 #ifdef __NEED_I_SIZE_ORDERED
730 seqcount_t i_size_seqcount; 730 seqcount_t i_size_seqcount;
731 #endif 731 #endif
732 struct timespec i_atime; 732 struct timespec i_atime;
733 struct timespec i_mtime; 733 struct timespec i_mtime;
734 struct timespec i_ctime; 734 struct timespec i_ctime;
735 blkcnt_t i_blocks; 735 blkcnt_t i_blocks;
736 unsigned int i_blkbits; 736 unsigned int i_blkbits;
737 unsigned short i_bytes; 737 unsigned short i_bytes;
738 umode_t i_mode; 738 umode_t i_mode;
739 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 739 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
740 struct mutex i_mutex; 740 struct mutex i_mutex;
741 struct rw_semaphore i_alloc_sem; 741 struct rw_semaphore i_alloc_sem;
742 const struct inode_operations *i_op; 742 const struct inode_operations *i_op;
743 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 743 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
744 struct super_block *i_sb; 744 struct super_block *i_sb;
745 struct file_lock *i_flock; 745 struct file_lock *i_flock;
746 struct address_space *i_mapping; 746 struct address_space *i_mapping;
747 struct address_space i_data; 747 struct address_space i_data;
748 #ifdef CONFIG_QUOTA 748 #ifdef CONFIG_QUOTA
749 struct dquot *i_dquot[MAXQUOTAS]; 749 struct dquot *i_dquot[MAXQUOTAS];
750 #endif 750 #endif
751 struct list_head i_devices; 751 struct list_head i_devices;
752 union { 752 union {
753 struct pipe_inode_info *i_pipe; 753 struct pipe_inode_info *i_pipe;
754 struct block_device *i_bdev; 754 struct block_device *i_bdev;
755 struct cdev *i_cdev; 755 struct cdev *i_cdev;
756 }; 756 };
757 757
758 __u32 i_generation; 758 __u32 i_generation;
759 759
760 #ifdef CONFIG_FSNOTIFY 760 #ifdef CONFIG_FSNOTIFY
761 __u32 i_fsnotify_mask; /* all events this inode cares about */ 761 __u32 i_fsnotify_mask; /* all events this inode cares about */
762 struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ 762 struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */
763 #endif 763 #endif
764 764
765 #ifdef CONFIG_INOTIFY 765 #ifdef CONFIG_INOTIFY
766 struct list_head inotify_watches; /* watches on this inode */ 766 struct list_head inotify_watches; /* watches on this inode */
767 struct mutex inotify_mutex; /* protects the watches list */ 767 struct mutex inotify_mutex; /* protects the watches list */
768 #endif 768 #endif
769 769
770 unsigned long i_state; 770 unsigned long i_state;
771 unsigned long dirtied_when; /* jiffies of first dirtying */ 771 unsigned long dirtied_when; /* jiffies of first dirtying */
772 772
773 unsigned int i_flags; 773 unsigned int i_flags;
774 774
775 atomic_t i_writecount; 775 atomic_t i_writecount;
776 #ifdef CONFIG_SECURITY 776 #ifdef CONFIG_SECURITY
777 void *i_security; 777 void *i_security;
778 #endif 778 #endif
779 #ifdef CONFIG_FS_POSIX_ACL 779 #ifdef CONFIG_FS_POSIX_ACL
780 struct posix_acl *i_acl; 780 struct posix_acl *i_acl;
781 struct posix_acl *i_default_acl; 781 struct posix_acl *i_default_acl;
782 #endif 782 #endif
783 void *i_private; /* fs or device private pointer */ 783 void *i_private; /* fs or device private pointer */
784 }; 784 };
785 785
786 /* 786 /*
787 * inode->i_mutex nesting subclasses for the lock validator: 787 * inode->i_mutex nesting subclasses for the lock validator:
788 * 788 *
789 * 0: the object of the current VFS operation 789 * 0: the object of the current VFS operation
790 * 1: parent 790 * 1: parent
791 * 2: child/target 791 * 2: child/target
792 * 3: quota file 792 * 3: quota file
793 * 793 *
794 * The locking order between these classes is 794 * The locking order between these classes is
795 * parent -> child -> normal -> xattr -> quota 795 * parent -> child -> normal -> xattr -> quota
796 */ 796 */
797 enum inode_i_mutex_lock_class 797 enum inode_i_mutex_lock_class
798 { 798 {
799 I_MUTEX_NORMAL, 799 I_MUTEX_NORMAL,
800 I_MUTEX_PARENT, 800 I_MUTEX_PARENT,
801 I_MUTEX_CHILD, 801 I_MUTEX_CHILD,
802 I_MUTEX_XATTR, 802 I_MUTEX_XATTR,
803 I_MUTEX_QUOTA 803 I_MUTEX_QUOTA
804 }; 804 };
805 805
806 /* 806 /*
807  * NOTE: on a 32bit arch with a preemptible kernel and 807  * NOTE: on a 32bit arch with a preemptible kernel and
808  * a UP compile, i_size_read/write must be atomic 808  * a UP compile, i_size_read/write must be atomic
809  * with respect to the local cpu (unlike with preempt disabled), 809  * with respect to the local cpu (unlike with preempt disabled),
810  * but they don't need to be atomic with respect to other cpus like in 810  * but they don't need to be atomic with respect to other cpus like in
811  * true SMP (so they either need to locally disable irqs around 811  * true SMP (so they either need to locally disable irqs around
812  * the read, or, for example on x86, they can still be implemented as a 812  * the read, or, for example on x86, they can still be implemented as a
813  * cmpxchg8b without needing the lock prefix). For SMP compiles 813  * cmpxchg8b without needing the lock prefix). For SMP compiles
814  * and 64bit archs it makes no difference if preempt is enabled or not. 814  * and 64bit archs it makes no difference if preempt is enabled or not.
815 */ 815 */
816 static inline loff_t i_size_read(const struct inode *inode) 816 static inline loff_t i_size_read(const struct inode *inode)
817 { 817 {
818 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 818 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
819 loff_t i_size; 819 loff_t i_size;
820 unsigned int seq; 820 unsigned int seq;
821 821
822 do { 822 do {
823 seq = read_seqcount_begin(&inode->i_size_seqcount); 823 seq = read_seqcount_begin(&inode->i_size_seqcount);
824 i_size = inode->i_size; 824 i_size = inode->i_size;
825 } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); 825 } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
826 return i_size; 826 return i_size;
827 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) 827 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
828 loff_t i_size; 828 loff_t i_size;
829 829
830 preempt_disable(); 830 preempt_disable();
831 i_size = inode->i_size; 831 i_size = inode->i_size;
832 preempt_enable(); 832 preempt_enable();
833 return i_size; 833 return i_size;
834 #else 834 #else
835 return inode->i_size; 835 return inode->i_size;
836 #endif 836 #endif
837 } 837 }
838 838
839 /* 839 /*
840 * NOTE: unlike i_size_read(), i_size_write() does need locking around it 840 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
841 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount 841 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
842 * can be lost, resulting in subsequent i_size_read() calls spinning forever. 842 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
843 */ 843 */
844 static inline void i_size_write(struct inode *inode, loff_t i_size) 844 static inline void i_size_write(struct inode *inode, loff_t i_size)
845 { 845 {
846 #if BITS_PER_LONG==32 && defined(CONFIG_SMP) 846 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
847 write_seqcount_begin(&inode->i_size_seqcount); 847 write_seqcount_begin(&inode->i_size_seqcount);
848 inode->i_size = i_size; 848 inode->i_size = i_size;
849 write_seqcount_end(&inode->i_size_seqcount); 849 write_seqcount_end(&inode->i_size_seqcount);
850 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) 850 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
851 preempt_disable(); 851 preempt_disable();
852 inode->i_size = i_size; 852 inode->i_size = i_size;
853 preempt_enable(); 853 preempt_enable();
854 #else 854 #else
855 inode->i_size = i_size; 855 inode->i_size = i_size;
856 #endif 856 #endif
857 } 857 }
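A hedged fragment of the locking rule the comment above states; pos and len are assumptions standing in for a write-path position and length:

	mutex_lock(&inode->i_mutex);
	if (pos + len > inode->i_size)		/* extending write */
		i_size_write(inode, pos + len);	/* seqcount update is now safe */
	mutex_unlock(&inode->i_mutex);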
858 858
859 static inline unsigned iminor(const struct inode *inode) 859 static inline unsigned iminor(const struct inode *inode)
860 { 860 {
861 return MINOR(inode->i_rdev); 861 return MINOR(inode->i_rdev);
862 } 862 }
863 863
864 static inline unsigned imajor(const struct inode *inode) 864 static inline unsigned imajor(const struct inode *inode)
865 { 865 {
866 return MAJOR(inode->i_rdev); 866 return MAJOR(inode->i_rdev);
867 } 867 }
868 868
869 extern struct block_device *I_BDEV(struct inode *inode); 869 extern struct block_device *I_BDEV(struct inode *inode);
870 870
871 struct fown_struct { 871 struct fown_struct {
872 rwlock_t lock; /* protects pid, uid, euid fields */ 872 rwlock_t lock; /* protects pid, uid, euid fields */
873 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ 873 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
874 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ 874 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
875 uid_t uid, euid; /* uid/euid of process setting the owner */ 875 uid_t uid, euid; /* uid/euid of process setting the owner */
876 int signum; /* posix.1b rt signal to be delivered on IO */ 876 int signum; /* posix.1b rt signal to be delivered on IO */
877 }; 877 };
878 878
879 /* 879 /*
880 * Track a single file's readahead state 880 * Track a single file's readahead state
881 */ 881 */
882 struct file_ra_state { 882 struct file_ra_state {
883 pgoff_t start; /* where readahead started */ 883 pgoff_t start; /* where readahead started */
884 unsigned int size; /* # of readahead pages */ 884 unsigned int size; /* # of readahead pages */
885 unsigned int async_size; /* do asynchronous readahead when 885 unsigned int async_size; /* do asynchronous readahead when
886 there are only # of pages ahead */ 886 there are only # of pages ahead */
887 887
888 unsigned int ra_pages; /* Maximum readahead window */ 888 unsigned int ra_pages; /* Maximum readahead window */
889 unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ 889 unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
890 loff_t prev_pos; /* Cache last read() position */ 890 loff_t prev_pos; /* Cache last read() position */
891 }; 891 };
892 892
893 /* 893 /*
894  * Check if @index falls in the readahead window. 894  * Check if @index falls in the readahead window.
895 */ 895 */
896 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) 896 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
897 { 897 {
898 return (index >= ra->start && 898 return (index >= ra->start &&
899 index < ra->start + ra->size); 899 index < ra->start + ra->size);
900 } 900 }
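One plausible caller pattern, sketched under the assumption of read-path state (mapping, file, index): only kick off synchronous readahead when @index is outside the current window:

	if (!ra_has_index(&file->f_ra, index))
		page_cache_sync_readahead(mapping, &file->f_ra,
					  file, index, 1);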
901 901
902 #define FILE_MNT_WRITE_TAKEN 1 902 #define FILE_MNT_WRITE_TAKEN 1
903 #define FILE_MNT_WRITE_RELEASED 2 903 #define FILE_MNT_WRITE_RELEASED 2
904 904
905 struct file { 905 struct file {
906 /* 906 /*
907 * fu_list becomes invalid after file_free is called and queued via 907 * fu_list becomes invalid after file_free is called and queued via
908 * fu_rcuhead for RCU freeing 908 * fu_rcuhead for RCU freeing
909 */ 909 */
910 union { 910 union {
911 struct list_head fu_list; 911 struct list_head fu_list;
912 struct rcu_head fu_rcuhead; 912 struct rcu_head fu_rcuhead;
913 } f_u; 913 } f_u;
914 struct path f_path; 914 struct path f_path;
915 #define f_dentry f_path.dentry 915 #define f_dentry f_path.dentry
916 #define f_vfsmnt f_path.mnt 916 #define f_vfsmnt f_path.mnt
917 const struct file_operations *f_op; 917 const struct file_operations *f_op;
918 spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ 918 spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */
919 atomic_long_t f_count; 919 atomic_long_t f_count;
920 unsigned int f_flags; 920 unsigned int f_flags;
921 fmode_t f_mode; 921 fmode_t f_mode;
922 loff_t f_pos; 922 loff_t f_pos;
923 struct fown_struct f_owner; 923 struct fown_struct f_owner;
924 const struct cred *f_cred; 924 const struct cred *f_cred;
925 struct file_ra_state f_ra; 925 struct file_ra_state f_ra;
926 926
927 u64 f_version; 927 u64 f_version;
928 #ifdef CONFIG_SECURITY 928 #ifdef CONFIG_SECURITY
929 void *f_security; 929 void *f_security;
930 #endif 930 #endif
931 /* needed for tty driver, and maybe others */ 931 /* needed for tty driver, and maybe others */
932 void *private_data; 932 void *private_data;
933 933
934 #ifdef CONFIG_EPOLL 934 #ifdef CONFIG_EPOLL
935 /* Used by fs/eventpoll.c to link all the hooks to this file */ 935 /* Used by fs/eventpoll.c to link all the hooks to this file */
936 struct list_head f_ep_links; 936 struct list_head f_ep_links;
937 #endif /* #ifdef CONFIG_EPOLL */ 937 #endif /* #ifdef CONFIG_EPOLL */
938 struct address_space *f_mapping; 938 struct address_space *f_mapping;
939 #ifdef CONFIG_DEBUG_WRITECOUNT 939 #ifdef CONFIG_DEBUG_WRITECOUNT
940 unsigned long f_mnt_write_state; 940 unsigned long f_mnt_write_state;
941 #endif 941 #endif
942 }; 942 };
943 extern spinlock_t files_lock; 943 extern spinlock_t files_lock;
944 #define file_list_lock() spin_lock(&files_lock); 944 #define file_list_lock() spin_lock(&files_lock);
945 #define file_list_unlock() spin_unlock(&files_lock); 945 #define file_list_unlock() spin_unlock(&files_lock);
946 946
947 #define get_file(x) atomic_long_inc(&(x)->f_count) 947 #define get_file(x) atomic_long_inc(&(x)->f_count)
948 #define file_count(x) atomic_long_read(&(x)->f_count) 948 #define file_count(x) atomic_long_read(&(x)->f_count)
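A hedged fragment on reference counting with these macros: get_file() copies an existing reference (a bare atomic increment, so it never sleeps), and the matching release is fput():

	get_file(filp);		/* filp is pinned; safe to use from here */
	/* ... hand filp to some other context ... */
	fput(filp);		/* drop our copy; freed on last reference */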
949 949
950 #ifdef CONFIG_DEBUG_WRITECOUNT 950 #ifdef CONFIG_DEBUG_WRITECOUNT
951 static inline void file_take_write(struct file *f) 951 static inline void file_take_write(struct file *f)
952 { 952 {
953 WARN_ON(f->f_mnt_write_state != 0); 953 WARN_ON(f->f_mnt_write_state != 0);
954 f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; 954 f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
955 } 955 }
956 static inline void file_release_write(struct file *f) 956 static inline void file_release_write(struct file *f)
957 { 957 {
958 f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; 958 f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED;
959 } 959 }
960 static inline void file_reset_write(struct file *f) 960 static inline void file_reset_write(struct file *f)
961 { 961 {
962 f->f_mnt_write_state = 0; 962 f->f_mnt_write_state = 0;
963 } 963 }
964 static inline void file_check_state(struct file *f) 964 static inline void file_check_state(struct file *f)
965 { 965 {
966 /* 966 /*
967 * At this point, either both or neither of these bits 967 * At this point, either both or neither of these bits
968 * should be set. 968 * should be set.
969 */ 969 */
970 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); 970 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN);
971 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); 971 WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED);
972 } 972 }
973 static inline int file_check_writeable(struct file *f) 973 static inline int file_check_writeable(struct file *f)
974 { 974 {
975 if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) 975 if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN)
976 return 0; 976 return 0;
977 printk(KERN_WARNING "writeable file with no " 977 printk(KERN_WARNING "writeable file with no "
978 "mnt_want_write()\n"); 978 "mnt_want_write()\n");
979 WARN_ON(1); 979 WARN_ON(1);
980 return -EINVAL; 980 return -EINVAL;
981 } 981 }
982 #else /* !CONFIG_DEBUG_WRITECOUNT */ 982 #else /* !CONFIG_DEBUG_WRITECOUNT */
983 static inline void file_take_write(struct file *filp) {} 983 static inline void file_take_write(struct file *filp) {}
984 static inline void file_release_write(struct file *filp) {} 984 static inline void file_release_write(struct file *filp) {}
985 static inline void file_reset_write(struct file *filp) {} 985 static inline void file_reset_write(struct file *filp) {}
986 static inline void file_check_state(struct file *filp) {} 986 static inline void file_check_state(struct file *filp) {}
987 static inline int file_check_writeable(struct file *filp) 987 static inline int file_check_writeable(struct file *filp)
988 { 988 {
989 return 0; 989 return 0;
990 } 990 }
991 #endif /* CONFIG_DEBUG_WRITECOUNT */ 991 #endif /* CONFIG_DEBUG_WRITECOUNT */
992 992
993 #define MAX_NON_LFS ((1UL<<31) - 1) 993 #define MAX_NON_LFS ((1UL<<31) - 1)
994 994
995 /* Page cache limit. Filesystems should put this into their s_maxbytes 995 /* Page cache limit. Filesystems should put this into their s_maxbytes
996    limits, otherwise bad things can happen in the VM. */ 996    limits, otherwise bad things can happen in the VM. */
997 #if BITS_PER_LONG==32 997 #if BITS_PER_LONG==32
998 #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) 998 #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
999 #elif BITS_PER_LONG==64 999 #elif BITS_PER_LONG==64
1000 #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL 1000 #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL
1001 #endif 1001 #endif
1002 1002
1003 #define FL_POSIX 1 1003 #define FL_POSIX 1
1004 #define FL_FLOCK 2 1004 #define FL_FLOCK 2
1005 #define FL_ACCESS 8 /* not trying to lock, just looking */ 1005 #define FL_ACCESS 8 /* not trying to lock, just looking */
1006 #define FL_EXISTS 16 /* when unlocking, test for existence */ 1006 #define FL_EXISTS 16 /* when unlocking, test for existence */
1007 #define FL_LEASE 32 /* lease held on this file */ 1007 #define FL_LEASE 32 /* lease held on this file */
1008 #define FL_CLOSE 64 /* unlock on close */ 1008 #define FL_CLOSE 64 /* unlock on close */
1009 #define FL_SLEEP 128 /* A blocking lock */ 1009 #define FL_SLEEP 128 /* A blocking lock */
1010 1010
1011 /* 1011 /*
1012 * Special return value from posix_lock_file() and vfs_lock_file() for 1012 * Special return value from posix_lock_file() and vfs_lock_file() for
1013 * asynchronous locking. 1013 * asynchronous locking.
1014 */ 1014 */
1015 #define FILE_LOCK_DEFERRED 1 1015 #define FILE_LOCK_DEFERRED 1
1016 1016
1017 /* 1017 /*
1018 * The POSIX file lock owner is determined by 1018 * The POSIX file lock owner is determined by
1019 * the "struct files_struct" in the thread group 1019 * the "struct files_struct" in the thread group
1020 * (or NULL for no owner - BSD locks). 1020 * (or NULL for no owner - BSD locks).
1021 * 1021 *
1022 * Lockd stuffs a "host" pointer into this. 1022 * Lockd stuffs a "host" pointer into this.
1023 */ 1023 */
1024 typedef struct files_struct *fl_owner_t; 1024 typedef struct files_struct *fl_owner_t;
1025 1025
1026 struct file_lock_operations { 1026 struct file_lock_operations {
1027 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 1027 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
1028 void (*fl_release_private)(struct file_lock *); 1028 void (*fl_release_private)(struct file_lock *);
1029 }; 1029 };
1030 1030
1031 struct lock_manager_operations { 1031 struct lock_manager_operations {
1032 int (*fl_compare_owner)(struct file_lock *, struct file_lock *); 1032 int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
1033 void (*fl_notify)(struct file_lock *); /* unblock callback */ 1033 void (*fl_notify)(struct file_lock *); /* unblock callback */
1034 int (*fl_grant)(struct file_lock *, struct file_lock *, int); 1034 int (*fl_grant)(struct file_lock *, struct file_lock *, int);
1035 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 1035 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
1036 void (*fl_release_private)(struct file_lock *); 1036 void (*fl_release_private)(struct file_lock *);
1037 void (*fl_break)(struct file_lock *); 1037 void (*fl_break)(struct file_lock *);
1038 int (*fl_mylease)(struct file_lock *, struct file_lock *); 1038 int (*fl_mylease)(struct file_lock *, struct file_lock *);
1039 int (*fl_change)(struct file_lock **, int); 1039 int (*fl_change)(struct file_lock **, int);
1040 }; 1040 };
1041 1041
1042 struct lock_manager { 1042 struct lock_manager {
1043 struct list_head list; 1043 struct list_head list;
1044 }; 1044 };
1045 1045
1046 void locks_start_grace(struct lock_manager *); 1046 void locks_start_grace(struct lock_manager *);
1047 void locks_end_grace(struct lock_manager *); 1047 void locks_end_grace(struct lock_manager *);
1048 int locks_in_grace(void); 1048 int locks_in_grace(void);
1049 1049
1050 /* that will die - we need it for nfs_lock_info */ 1050 /* that will die - we need it for nfs_lock_info */
1051 #include <linux/nfs_fs_i.h> 1051 #include <linux/nfs_fs_i.h>
1052 1052
1053 struct file_lock { 1053 struct file_lock {
1054 struct file_lock *fl_next; /* singly linked list for this inode */ 1054 struct file_lock *fl_next; /* singly linked list for this inode */
1055 struct list_head fl_link; /* doubly linked list of all locks */ 1055 struct list_head fl_link; /* doubly linked list of all locks */
1056 struct list_head fl_block; /* circular list of blocked processes */ 1056 struct list_head fl_block; /* circular list of blocked processes */
1057 fl_owner_t fl_owner; 1057 fl_owner_t fl_owner;
1058 unsigned char fl_flags; 1058 unsigned char fl_flags;
1059 unsigned char fl_type; 1059 unsigned char fl_type;
1060 unsigned int fl_pid; 1060 unsigned int fl_pid;
1061 struct pid *fl_nspid; 1061 struct pid *fl_nspid;
1062 wait_queue_head_t fl_wait; 1062 wait_queue_head_t fl_wait;
1063 struct file *fl_file; 1063 struct file *fl_file;
1064 loff_t fl_start; 1064 loff_t fl_start;
1065 loff_t fl_end; 1065 loff_t fl_end;
1066 1066
1067 struct fasync_struct * fl_fasync; /* for lease break notifications */ 1067 struct fasync_struct * fl_fasync; /* for lease break notifications */
1068 unsigned long fl_break_time; /* for nonblocking lease breaks */ 1068 unsigned long fl_break_time; /* for nonblocking lease breaks */
1069 1069
1070 struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ 1070 struct file_lock_operations *fl_ops; /* Callbacks for filesystems */
1071 struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ 1071 struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */
1072 union { 1072 union {
1073 struct nfs_lock_info nfs_fl; 1073 struct nfs_lock_info nfs_fl;
1074 struct nfs4_lock_info nfs4_fl; 1074 struct nfs4_lock_info nfs4_fl;
1075 struct { 1075 struct {
1076 struct list_head link; /* link in AFS vnode's pending_locks list */ 1076 struct list_head link; /* link in AFS vnode's pending_locks list */
1077 int state; /* state of grant or error if -ve */ 1077 int state; /* state of grant or error if -ve */
1078 } afs; 1078 } afs;
1079 } fl_u; 1079 } fl_u;
1080 }; 1080 };
1081 1081
1082 /* The following constant reflects the upper bound of the file/locking space */ 1082 /* The following constant reflects the upper bound of the file/locking space */
1083 #ifndef OFFSET_MAX 1083 #ifndef OFFSET_MAX
1084 #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) 1084 #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1)))
1085 #define OFFSET_MAX INT_LIMIT(loff_t) 1085 #define OFFSET_MAX INT_LIMIT(loff_t)
1086 #define OFFT_OFFSET_MAX INT_LIMIT(off_t) 1086 #define OFFT_OFFSET_MAX INT_LIMIT(off_t)
1087 #endif 1087 #endif
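A hedged in-kernel sketch of taking a whole-file POSIX write lock with the declarations above; filp is an assumed open struct file, F_WRLCK comes from the fcntl header included just below, and the fields follow the fl_* layout of struct file_lock:

	static int lock_whole_file(struct file *filp)
	{
		struct file_lock fl;

		locks_init_lock(&fl);
		fl.fl_owner = current->files;	/* POSIX owner: the files_struct */
		fl.fl_pid   = current->tgid;
		fl.fl_file  = filp;
		fl.fl_flags = FL_POSIX | FL_SLEEP;	/* blocking lock */
		fl.fl_type  = F_WRLCK;
		fl.fl_start = 0;
		fl.fl_end   = OFFSET_MAX;		/* whole file */
		return posix_lock_file_wait(filp, &fl);
	}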
1088 1088
1089 #include <linux/fcntl.h> 1089 #include <linux/fcntl.h>
1090 1090
1091 extern void send_sigio(struct fown_struct *fown, int fd, int band); 1091 extern void send_sigio(struct fown_struct *fown, int fd, int band);
1092 1092
1093 /* fs/sync.c */ 1093 /* fs/sync.c */
1094 extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, 1094 extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
1095 loff_t endbyte, unsigned int flags); 1095 loff_t endbyte, unsigned int flags);
1096 1096
1097 #ifdef CONFIG_FILE_LOCKING 1097 #ifdef CONFIG_FILE_LOCKING
1098 extern int fcntl_getlk(struct file *, struct flock __user *); 1098 extern int fcntl_getlk(struct file *, struct flock __user *);
1099 extern int fcntl_setlk(unsigned int, struct file *, unsigned int, 1099 extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
1100 struct flock __user *); 1100 struct flock __user *);
1101 1101
1102 #if BITS_PER_LONG == 32 1102 #if BITS_PER_LONG == 32
1103 extern int fcntl_getlk64(struct file *, struct flock64 __user *); 1103 extern int fcntl_getlk64(struct file *, struct flock64 __user *);
extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
			struct flock64 __user *);
#endif

extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);

/* fs/locks.c */
extern void locks_init_lock(struct file_lock *);
extern void locks_copy_lock(struct file_lock *, struct file_lock *);
extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
extern void locks_remove_posix(struct file *, fl_owner_t);
extern void locks_remove_flock(struct file *);
extern void locks_release_private(struct file_lock *);
extern void posix_test_lock(struct file *, struct file_lock *);
extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
extern int posix_lock_file_wait(struct file *, struct file_lock *);
extern int posix_unblock_lock(struct file *, struct file_lock *);
extern int vfs_test_lock(struct file *, struct file_lock *);
extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
extern int __break_lease(struct inode *inode, unsigned int flags);
extern void lease_get_mtime(struct inode *, struct timespec *time);
extern int generic_setlease(struct file *, long, struct file_lock **);
extern int vfs_setlease(struct file *, long, struct file_lock **);
extern int lease_modify(struct file_lock **, int);
extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
#else /* !CONFIG_FILE_LOCKING */
static inline int fcntl_getlk(struct file *file, struct flock __user *user)
{
	return -EINVAL;
}

static inline int fcntl_setlk(unsigned int fd, struct file *file,
			      unsigned int cmd, struct flock __user *user)
{
	return -EACCES;
}

#if BITS_PER_LONG == 32
static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user)
{
	return -EINVAL;
}

static inline int fcntl_setlk64(unsigned int fd, struct file *file,
				unsigned int cmd, struct flock64 __user *user)
{
	return -EACCES;
}
#endif
static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
	return 0;
}

static inline int fcntl_getlease(struct file *filp)
{
	return 0;
}

static inline void locks_init_lock(struct file_lock *fl)
{
	return;
}

static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
	return;
}

static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
	return;
}

static inline void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
	return;
}

static inline void locks_remove_flock(struct file *filp)
{
	return;
}

static inline void posix_test_lock(struct file *filp, struct file_lock *fl)
{
	return;
}

static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
				  struct file_lock *conflock)
{
	return -ENOLCK;
}

static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
{
	return -ENOLCK;
}

static inline int posix_unblock_lock(struct file *filp,
				     struct file_lock *waiter)
{
	return -ENOENT;
}

static inline int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
	return 0;
}

static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
				struct file_lock *fl, struct file_lock *conf)
{
	return -ENOLCK;
}

static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
	return 0;
}

static inline int flock_lock_file_wait(struct file *filp,
				       struct file_lock *request)
{
	return -ENOLCK;
}

static inline int __break_lease(struct inode *inode, unsigned int mode)
{
	return 0;
}

static inline void lease_get_mtime(struct inode *inode, struct timespec *time)
{
	return;
}

static inline int generic_setlease(struct file *filp, long arg,
				   struct file_lock **flp)
{
	return -EINVAL;
}

static inline int vfs_setlease(struct file *filp, long arg,
			       struct file_lock **lease)
{
	return -EINVAL;
}

static inline int lease_modify(struct file_lock **before, int arg)
{
	return -EINVAL;
}

static inline int lock_may_read(struct inode *inode, loff_t start,
				unsigned long len)
{
	return 1;
}

static inline int lock_may_write(struct inode *inode, loff_t start,
				 unsigned long len)
{
	return 1;
}

#endif /* !CONFIG_FILE_LOCKING */
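
/*
 * Example (sketch, not part of this header): probing for a conflicting
 * POSIX lock the way fcntl_getlk() does.  On success, vfs_test_lock()
 * rewrites fl_type to F_UNLCK when no conflict exists.  The byte range
 * and the myfs_would_block() name are illustrative; <linux/sched.h>
 * supplies "current".
 */
static int myfs_would_block(struct file *filp)
{
	struct file_lock fl;
	int err;

	locks_init_lock(&fl);
	fl.fl_type  = F_WRLCK;		/* the access we intend */
	fl.fl_flags = FL_POSIX;
	fl.fl_start = 0;
	fl.fl_end   = 99;		/* inclusive end offset */
	fl.fl_owner = current->files;
	fl.fl_pid   = current->tgid;

	err = vfs_test_lock(filp, &fl);
	if (err)
		return err;
	return fl.fl_type != F_UNLCK;	/* 1: someone else holds a lock */
}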

struct fasync_struct {
	int	magic;
	int	fa_fd;
	struct	fasync_struct	*fa_next; /* singly linked list */
	struct	file		*fa_file;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers: */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);
/* only for net: no internal synchronization */
extern void __kill_fasync(struct fasync_struct *, int, int);
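
/*
 * Example (sketch): the canonical driver pattern for SIGIO support.
 * fasync_helper() maintains the listener list from a driver's ->fasync()
 * method, and kill_fasync() notifies the list -- it is safe from
 * interrupt context, as noted above.  The "mydev" names are illustrative.
 */
static struct fasync_struct *mydev_async_queue;

static int mydev_fasync(int fd, struct file *filp, int on)
{
	return fasync_helper(fd, filp, on, &mydev_async_queue);
}

/* Call when new data arrives, e.g. from the device's interrupt handler. */
static void mydev_notify_readers(void)
{
	kill_fasync(&mydev_async_queue, SIGIO, POLL_IN);
}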

extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, unsigned long arg, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);

/*
 * Umount options
 */

#define MNT_FORCE	0x00000001	/* Attempt to forcibly umount */
#define MNT_DETACH	0x00000002	/* Just detach from the tree */
#define MNT_EXPIRE	0x00000004	/* Mark for expiry */

extern struct list_head super_blocks;
extern spinlock_t sb_lock;

#define sb_entry(list)	list_entry((list), struct super_block, s_list)
#define S_BIAS (1<<30)
struct super_block {
	struct list_head	s_list;		/* Keep this first */
	dev_t			s_dev;		/* search index; _not_ kdev_t */
	unsigned long		s_blocksize;
	unsigned char		s_blocksize_bits;
	unsigned char		s_dirt;
	unsigned long long	s_maxbytes;	/* Max file size */
	struct file_system_type	*s_type;
	const struct super_operations	*s_op;
	struct dquot_operations	*dq_op;
	struct quotactl_ops	*s_qcop;
	const struct export_operations *s_export_op;
	unsigned long		s_flags;
	unsigned long		s_magic;
	struct dentry		*s_root;
	struct rw_semaphore	s_umount;
	struct mutex		s_lock;
	int			s_count;
	int			s_need_sync;
	atomic_t		s_active;
#ifdef CONFIG_SECURITY
	void			*s_security;
#endif
	struct xattr_handler	**s_xattr;

	struct list_head	s_inodes;	/* all inodes */
	struct list_head	s_dirty;	/* dirty inodes */
	struct list_head	s_io;		/* parked for writeback */
	struct list_head	s_more_io;	/* parked for more writeback */
	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
	struct list_head	s_files;
	/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
	struct list_head	s_dentry_lru;	/* unused dentry lru */
	int			s_nr_dentry_unused;	/* # of dentry on lru */

	struct block_device	*s_bdev;
	struct mtd_info		*s_mtd;
	struct list_head	s_instances;
	struct quota_info	s_dquot;	/* Diskquota specific options */

	int			s_frozen;
	wait_queue_head_t	s_wait_unfrozen;

	char s_id[32];				/* Informational name */

	void			*s_fs_info;	/* Filesystem private info */
	fmode_t			s_mode;

	/*
	 * The next field is for VFS *only*. No filesystems have any business
	 * even looking at it. You had been warned.
	 */
	struct mutex s_vfs_rename_mutex;	/* Kludge */

	/* Granularity of c/m/atime in ns.
	   Cannot be worse than a second */
	u32		   s_time_gran;

	/*
	 * Filesystem subtype.  If non-empty the filesystem type field
	 * in /proc/mounts will be "type.subtype"
	 */
	char *s_subtype;

	/*
	 * Saved mount options for lazy filesystems using
	 * generic_show_options()
	 */
	char *s_options;
};

extern struct timespec current_fs_time(struct super_block *sb);

/*
 * Snapshotting support.
 */
enum {
	SB_UNFROZEN = 0,
	SB_FREEZE_WRITE	= 1,
	SB_FREEZE_TRANS = 2,
};

#define vfs_check_frozen(sb, level) \
	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
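
/*
 * Example (sketch): a write path that must not touch a snapshotted
 * filesystem simply waits for the freeze level to drop before going on.
 * The myfs name is illustrative.
 */
static inline void myfs_wait_unfrozen(struct super_block *sb)
{
	/* Sleeps on s_wait_unfrozen until s_frozen < SB_FREEZE_WRITE. */
	vfs_check_frozen(sb, SB_FREEZE_WRITE);
}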

#define get_fs_excl() atomic_inc(&current->fs_excl)
#define put_fs_excl() atomic_dec(&current->fs_excl)
#define has_fs_excl() atomic_read(&current->fs_excl)

#define is_owner_or_cap(inode)	\
	((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER))

/* not quite ready to be deprecated, but... */
extern void lock_super(struct super_block *);
extern void unlock_super(struct super_block *);

/*
 * VFS helper functions.
 */
extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *);
extern int vfs_mkdir(struct inode *, struct dentry *, int);
extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
extern int vfs_rmdir(struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);

/*
 * VFS dentry helper functions.
 */
extern void dentry_unhash(struct dentry *dentry);

/*
 * VFS file helper functions.
 */
extern int file_permission(struct file *, int);

/*
 * VFS FS_IOC_FIEMAP helper definitions.
 */
struct fiemap_extent_info {
	unsigned int fi_flags;		/* Flags as passed from user */
	unsigned int fi_extents_mapped;	/* Number of mapped extents */
	unsigned int fi_extents_max;	/* Size of fiemap_extent array */
	struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent
						 * array */
};
int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
			    u64 phys, u64 len, u32 flags);
int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
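
/*
 * Example (sketch): the shape of a filesystem's ->fiemap() method.  It
 * validates the caller's flags against what it supports, then reports
 * extents one at a time.  The myfs name and the identity logical->phys
 * mapping are illustrative; FIEMAP_* constants come from <linux/fiemap.h>.
 */
static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		       u64 start, u64 len)
{
	int ret;

	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	/* Report one extent covering the request; flag it as the last. */
	ret = fiemap_fill_next_extent(fieinfo, start, start, len,
				      FIEMAP_EXTENT_LAST);
	return ret < 0 ? ret : 0;
}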

/*
 * File types
 *
 * NOTE! These match bits 12..15 of stat.st_mode
 * (ie "(i_mode >> 12) & 15").
 */
#define DT_UNKNOWN	0
#define DT_FIFO		1
#define DT_CHR		2
#define DT_DIR		4
#define DT_BLK		6
#define DT_REG		8
#define DT_LNK		10
#define DT_SOCK		12
#define DT_WHT		14

#define OSYNC_METADATA	(1<<0)
#define OSYNC_DATA	(1<<1)
#define OSYNC_INODE	(1<<2)
int generic_osync_inode(struct inode *, struct address_space *, int);

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 */
typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
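
/*
 * Example (sketch): a ->readdir() method hands each name to the filldir
 * callback, which lays it out in the caller's buffer.  A negative return
 * from filldir means the buffer is full and readdir should stop.  The
 * myfs name is illustrative.
 */
static int myfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct inode *inode = filp->f_path.dentry->d_inode;

	if (filp->f_pos == 0) {
		if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino,
			    DT_DIR) < 0)
			return 0;
		filp->f_pos = 1;
	}
	/* ... emit the real entries here, advancing f_pos as we go ... */
	return 0;
}
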
struct block_device_operations;

/* These macros are for out of kernel modules to test that
 * the kernel supports the unlocked_ioctl and compat_ioctl
 * fields in struct file_operations. */
#define HAVE_COMPAT_IOCTL 1
#define HAVE_UNLOCKED_IOCTL 1

/*
 * NOTE:
 * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl
 * can be called without the big kernel lock held in all filesystems.
 */
struct file_operations {
	struct module *owner;
	loff_t (*llseek) (struct file *, loff_t, int);
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
	int (*readdir) (struct file *, void *, filldir_t);
	unsigned int (*poll) (struct file *, struct poll_table_struct *);
	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
	int (*mmap) (struct file *, struct vm_area_struct *);
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *, fl_owner_t id);
	int (*release) (struct inode *, struct file *);
	int (*fsync) (struct file *, struct dentry *, int datasync);
	int (*aio_fsync) (struct kiocb *, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **);
};
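
/*
 * Example (sketch): a minimal character-device style instance.  Methods
 * left out stay NULL and the VFS substitutes defaults or errors; the
 * mydev_* handlers are illustrative forward declarations and THIS_MODULE
 * comes from <linux/module.h>.
 */
static ssize_t mydev_read(struct file *, char __user *, size_t, loff_t *);
static int mydev_open(struct inode *, struct file *);
static int mydev_release(struct inode *, struct file *);

static const struct file_operations mydev_fops = {
	.owner		= THIS_MODULE,
	.llseek		= no_llseek,
	.read		= mydev_read,
	.open		= mydev_open,
	.release	= mydev_release,
};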

struct inode_operations {
	int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
	struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
	int (*link) (struct dentry *,struct inode *,struct dentry *);
	int (*unlink) (struct inode *,struct dentry *);
	int (*symlink) (struct inode *,struct dentry *,const char *);
	int (*mkdir) (struct inode *,struct dentry *,int);
	int (*rmdir) (struct inode *,struct dentry *);
	int (*mknod) (struct inode *,struct dentry *,int,dev_t);
	int (*rename) (struct inode *, struct dentry *,
			struct inode *, struct dentry *);
	int (*readlink) (struct dentry *, char __user *,int);
	void * (*follow_link) (struct dentry *, struct nameidata *);
	void (*put_link) (struct dentry *, struct nameidata *, void *);
	void (*truncate) (struct inode *);
	int (*permission) (struct inode *, int);
	int (*setattr) (struct dentry *, struct iattr *);
	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
	ssize_t (*listxattr) (struct dentry *, char *, size_t);
	int (*removexattr) (struct dentry *, const char *);
	void (*truncate_range)(struct inode *, loff_t, loff_t);
	long (*fallocate)(struct inode *inode, int mode, loff_t offset,
			  loff_t len);
	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
		      u64 len);
};

struct seq_file;

ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
				unsigned long nr_segs, unsigned long fast_segs,
				struct iovec *fast_pointer,
				struct iovec **ret_pointer);

extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
		unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
		unsigned long, loff_t *);
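
/*
 * Example (sketch): in this era, kernel code reading a file must widen
 * the address limit so a kernel buffer passes the __user checks inside
 * vfs_read().  get_fs()/set_fs()/KERNEL_DS come from <asm/uaccess.h>;
 * error handling is trimmed and the function name is illustrative.
 */
static ssize_t kernel_read_sketch(struct file *file, void *buf,
				  size_t count, loff_t pos)
{
	mm_segment_t old_fs = get_fs();
	ssize_t ret;

	set_fs(KERNEL_DS);
	ret = vfs_read(file, (char __user *)buf, count, &pos);
	set_fs(old_fs);
	return ret;
}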

struct super_operations {
	struct inode *(*alloc_inode)(struct super_block *sb);
	void (*destroy_inode)(struct inode *);

	void (*dirty_inode) (struct inode *);
	int (*write_inode) (struct inode *, int);
	void (*drop_inode) (struct inode *);
	void (*delete_inode) (struct inode *);
	void (*put_super) (struct super_block *);
	void (*write_super) (struct super_block *);
	int (*sync_fs)(struct super_block *sb, int wait);
	int (*freeze_fs) (struct super_block *);
	int (*unfreeze_fs) (struct super_block *);
	int (*statfs) (struct dentry *, struct kstatfs *);
	int (*remount_fs) (struct super_block *, int *, char *);
	void (*clear_inode) (struct inode *);
	void (*umount_begin) (struct super_block *);

	int (*show_options)(struct seq_file *, struct vfsmount *);
	int (*show_stats)(struct seq_file *, struct vfsmount *);
#ifdef CONFIG_QUOTA
	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
#endif
	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
};

/*
 * Inode state bits.  Protected by inode_lock.
 *
 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
 *
 * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
 * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
 * various stages of removing an inode.
 *
 * Two bits are used for locking and completion notification, I_LOCK and I_SYNC.
 *
 * I_DIRTY_SYNC		Inode is dirty, but doesn't have to be written on
 *			fdatasync().  i_atime is the usual cause.
 * I_DIRTY_DATASYNC	Data-related inode changes pending.  We keep track of
 *			these changes separately from I_DIRTY_SYNC so that we
 *			don't have to write inode on fdatasync() when only
 *			mtime has changed in it.
 * I_DIRTY_PAGES	Inode has dirty pages.  Inode itself may be clean.
 * I_NEW		get_new_inode() sets i_state to I_LOCK|I_NEW.  Both
 *			are cleared by unlock_new_inode(), called from iget().
 * I_WILL_FREE		Must be set when calling write_inode_now() if i_count
 *			is zero.  I_FREEING must be set when I_WILL_FREE is
 *			cleared.
 * I_FREEING		Set when inode is about to be freed but still has dirty
 *			pages or buffers attached or the inode itself is still
 *			dirty.
 * I_CLEAR		Set by clear_inode().  In this state the inode is clean
 *			and can be destroyed.
 *
 *			Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
 *			prohibited for many purposes.  iget() must wait for
 *			the inode to be completely released, then create it
 *			anew.  Other functions will just ignore such inodes,
 *			if appropriate.  I_LOCK is used for waiting.
 *
 * I_LOCK		Serves as both a mutex and completion notification.
 *			New inodes set I_LOCK.  If two processes both create
 *			the same inode, one of them will release its inode and
 *			wait for I_LOCK to be released before returning.
 *			Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
 *			also cause waiting on I_LOCK, without I_LOCK actually
 *			being set.  find_inode() uses this to prevent returning
 *			nearly-dead inodes.
 * I_SYNC		Similar to I_LOCK, but limited in scope to writeback
 *			of inode dirty data.  Having a separate lock for this
 *			purpose reduces latency and prevents some filesystem-
 *			specific deadlocks.
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 * Q: igrab() only checks on (I_FREEING|I_WILL_FREE).  Should it also check on
 *    I_CLEAR?  If not, why?
 */
#define I_DIRTY_SYNC		1
#define I_DIRTY_DATASYNC	2
#define I_DIRTY_PAGES		4
#define I_NEW			8
#define I_WILL_FREE		16
#define I_FREEING		32
#define I_CLEAR			64
#define __I_LOCK		7
#define I_LOCK			(1 << __I_LOCK)
#define __I_SYNC		8
#define I_SYNC			(1 << __I_SYNC)

#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)

extern void __mark_inode_dirty(struct inode *, int);
static inline void mark_inode_dirty(struct inode *inode)
{
	__mark_inode_dirty(inode, I_DIRTY);
}

static inline void mark_inode_dirty_sync(struct inode *inode)
{
	__mark_inode_dirty(inode, I_DIRTY_SYNC);
}

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  Currently,
 * it is only here for parity with dec_nlink().
 */
static inline void inc_nlink(struct inode *inode)
{
	inode->i_nlink++;
}

static inline void inode_inc_link_count(struct inode *inode)
{
	inc_nlink(inode);
	mark_inode_dirty(inode);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  In cases
 * where we are attempting to track writes to the
 * filesystem, a decrement to zero means an imminent
 * write when the file is truncated and actually unlinked
 * on the filesystem.
 */
static inline void drop_nlink(struct inode *inode)
{
	inode->i_nlink--;
}

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  See
 * drop_nlink() for why we care about i_nlink hitting zero.
 */
static inline void clear_nlink(struct inode *inode)
{
	inode->i_nlink = 0;
}

static inline void inode_dec_link_count(struct inode *inode)
{
	drop_nlink(inode);
	mark_inode_dirty(inode);
}
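
/*
 * Example (sketch): the tail of a simple filesystem's ->unlink() uses the
 * helper above instead of touching i_nlink directly, so the inode is also
 * marked dirty.  Removing the on-disk directory entry is elided and the
 * myfs name is illustrative.
 */
static int myfs_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	/* ... remove the name from the on-disk directory here ... */
	inode->i_ctime = dir->i_ctime;
	inode_dec_link_count(inode);
	return 0;
}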

/**
 * inode_inc_iversion - increments i_version
 * @inode: inode that needs to be updated
 *
 * Every time the inode is modified, the i_version field will be incremented.
 * The filesystem has to be mounted with the i_version flag for this to take
 * effect.
 */
static inline void inode_inc_iversion(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	inode->i_version++;
	spin_unlock(&inode->i_lock);
}

extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry);
static inline void file_accessed(struct file *file)
{
	if (!(file->f_flags & O_NOATIME))
		touch_atime(file->f_path.mnt, file->f_path.dentry);
}

int sync_inode(struct inode *inode, struct writeback_control *wbc);

struct file_system_type {
	const char *name;
	int fs_flags;
	int (*get_sb) (struct file_system_type *, int,
		       const char *, void *, struct vfsmount *);
	void (*kill_sb) (struct super_block *);
	struct module *owner;
	struct file_system_type * next;
	struct list_head fs_supers;

	struct lock_class_key s_lock_key;
	struct lock_class_key s_umount_key;

	struct lock_class_key i_lock_key;
	struct lock_class_key i_mutex_key;
	struct lock_class_key i_mutex_dir_key;
	struct lock_class_key i_alloc_sem_key;
};

extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int),
	struct vfsmount *mnt);
extern int get_sb_bdev(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data,
	int (*fill_super)(struct super_block *, void *, int),
	struct vfsmount *mnt);
extern int get_sb_single(struct file_system_type *fs_type,
	int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int),
	struct vfsmount *mnt);
extern int get_sb_nodev(struct file_system_type *fs_type,
	int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int),
	struct vfsmount *mnt);
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
void kill_anon_super(struct super_block *sb);
void kill_litter_super(struct super_block *sb);
void deactivate_super(struct super_block *sb);
void deactivate_locked_super(struct super_block *sb);
int set_anon_super(struct super_block *s, void *data);
struct super_block *sget(struct file_system_type *type,
			int (*test)(struct super_block *,void *),
			int (*set)(struct super_block *,void *),
			void *data);
extern int get_sb_pseudo(struct file_system_type *, char *,
	const struct super_operations *ops, unsigned long,
	struct vfsmount *mnt);
extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
int __put_super_and_need_restart(struct super_block *sb);

/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
#define fops_get(fops) \
	(((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
#define fops_put(fops) \
	do { if (fops) module_put((fops)->owner); } while(0)

extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
#define kern_mount(type) kern_mount_data(type, NULL)
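
/*
 * Example (sketch): wiring a filesystem type into the VFS.  A nodev
 * filesystem builds its tree in a fill_super callback (not shown) and is
 * registered at module init time.  All myfs names are illustrative.
 */
static int myfs_fill_super(struct super_block *sb, void *data, int silent);

static int myfs_get_sb(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_nodev(fs_type, flags, data, myfs_fill_super, mnt);
}

static struct file_system_type myfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "myfs",
	.get_sb		= myfs_get_sb,
	.kill_sb	= kill_anon_super,
};

static int __init myfs_init(void)
{
	return register_filesystem(&myfs_fs_type);
}
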
1802 extern int may_umount_tree(struct vfsmount *); 1802 extern int may_umount_tree(struct vfsmount *);
1803 extern int may_umount(struct vfsmount *); 1803 extern int may_umount(struct vfsmount *);
1804 extern long do_mount(char *, char *, char *, unsigned long, void *); 1804 extern long do_mount(char *, char *, char *, unsigned long, void *);
1805 extern struct vfsmount *collect_mounts(struct path *); 1805 extern struct vfsmount *collect_mounts(struct path *);
1806 extern void drop_collected_mounts(struct vfsmount *); 1806 extern void drop_collected_mounts(struct vfsmount *);
1807 1807
1808 extern int vfs_statfs(struct dentry *, struct kstatfs *); 1808 extern int vfs_statfs(struct dentry *, struct kstatfs *);
1809 1809
1810 extern int current_umask(void); 1810 extern int current_umask(void);
1811 1811
1812 /* /sys/fs */ 1812 /* /sys/fs */
1813 extern struct kobject *fs_kobj; 1813 extern struct kobject *fs_kobj;
1814 1814
1815 extern int rw_verify_area(int, struct file *, loff_t *, size_t); 1815 extern int rw_verify_area(int, struct file *, loff_t *, size_t);
1816 1816
1817 #define FLOCK_VERIFY_READ 1 1817 #define FLOCK_VERIFY_READ 1
1818 #define FLOCK_VERIFY_WRITE 2 1818 #define FLOCK_VERIFY_WRITE 2
1819 1819
1820 #ifdef CONFIG_FILE_LOCKING 1820 #ifdef CONFIG_FILE_LOCKING
1821 extern int locks_mandatory_locked(struct inode *); 1821 extern int locks_mandatory_locked(struct inode *);
1822 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); 1822 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
1823 1823
1824 /* 1824 /*
1825 * Candidates for mandatory locking have the setgid bit set 1825 * Candidates for mandatory locking have the setgid bit set
1826 * but no group execute bit - an otherwise meaningless combination. 1826 * but no group execute bit - an otherwise meaningless combination.
1827 */ 1827 */
1828 1828
1829 static inline int __mandatory_lock(struct inode *ino) 1829 static inline int __mandatory_lock(struct inode *ino)
1830 { 1830 {
1831 return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; 1831 return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
1832 } 1832 }
1833 1833
1834 /* 1834 /*
1835 * ... and these candidates should be on MS_MANDLOCK mounted fs, 1835 * ... and these candidates should be on MS_MANDLOCK mounted fs,
1836 * otherwise these will be advisory locks 1836 * otherwise these will be advisory locks
1837 */ 1837 */
1838 1838
1839 static inline int mandatory_lock(struct inode *ino) 1839 static inline int mandatory_lock(struct inode *ino)
1840 { 1840 {
1841 return IS_MANDLOCK(ino) && __mandatory_lock(ino); 1841 return IS_MANDLOCK(ino) && __mandatory_lock(ino);
1842 } 1842 }
1843 1843
1844 static inline int locks_verify_locked(struct inode *inode) 1844 static inline int locks_verify_locked(struct inode *inode)
1845 { 1845 {
1846 if (mandatory_lock(inode)) 1846 if (mandatory_lock(inode))
1847 return locks_mandatory_locked(inode); 1847 return locks_mandatory_locked(inode);
1848 return 0; 1848 return 0;
1849 } 1849 }
1850 1850
1851 static inline int locks_verify_truncate(struct inode *inode, 1851 static inline int locks_verify_truncate(struct inode *inode,
1852 struct file *filp, 1852 struct file *filp,
1853 loff_t size) 1853 loff_t size)
1854 { 1854 {
1855 if (inode->i_flock && mandatory_lock(inode)) 1855 if (inode->i_flock && mandatory_lock(inode))
1856 return locks_mandatory_area( 1856 return locks_mandatory_area(
1857 FLOCK_VERIFY_WRITE, inode, filp, 1857 FLOCK_VERIFY_WRITE, inode, filp,
1858 size < inode->i_size ? size : inode->i_size, 1858 size < inode->i_size ? size : inode->i_size,
1859 (size < inode->i_size ? inode->i_size - size 1859 (size < inode->i_size ? inode->i_size - size
1860 : size - inode->i_size) 1860 : size - inode->i_size)
1861 ); 1861 );
1862 return 0; 1862 return 0;
1863 } 1863 }
1864 1864
1865 static inline int break_lease(struct inode *inode, unsigned int mode) 1865 static inline int break_lease(struct inode *inode, unsigned int mode)
1866 { 1866 {
1867 if (inode->i_flock) 1867 if (inode->i_flock)
1868 return __break_lease(inode, mode); 1868 return __break_lease(inode, mode);
1869 return 0; 1869 return 0;
1870 } 1870 }
1871 #else /* !CONFIG_FILE_LOCKING */ 1871 #else /* !CONFIG_FILE_LOCKING */
1872 static inline int locks_mandatory_locked(struct inode *inode) 1872 static inline int locks_mandatory_locked(struct inode *inode)
1873 { 1873 {
1874 return 0; 1874 return 0;
1875 } 1875 }
1876 1876
1877 static inline int locks_mandatory_area(int rw, struct inode *inode, 1877 static inline int locks_mandatory_area(int rw, struct inode *inode,
1878 struct file *filp, loff_t offset, 1878 struct file *filp, loff_t offset,
1879 size_t count) 1879 size_t count)
1880 { 1880 {
1881 return 0; 1881 return 0;
1882 } 1882 }
1883 1883
1884 static inline int __mandatory_lock(struct inode *inode) 1884 static inline int __mandatory_lock(struct inode *inode)
1885 { 1885 {
1886 return 0; 1886 return 0;
1887 } 1887 }
1888 1888
1889 static inline int mandatory_lock(struct inode *inode) 1889 static inline int mandatory_lock(struct inode *inode)
1890 { 1890 {
1891 return 0; 1891 return 0;
1892 } 1892 }
1893 1893
1894 static inline int locks_verify_locked(struct inode *inode) 1894 static inline int locks_verify_locked(struct inode *inode)
1895 { 1895 {
1896 return 0; 1896 return 0;
1897 } 1897 }
1898 1898
1899 static inline int locks_verify_truncate(struct inode *inode, struct file *filp, 1899 static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
1900 size_t size) 1900 size_t size)
1901 { 1901 {
1902 return 0; 1902 return 0;
1903 } 1903 }
1904 1904
1905 static inline int break_lease(struct inode *inode, unsigned int mode) 1905 static inline int break_lease(struct inode *inode, unsigned int mode)
1906 { 1906 {
1907 return 0; 1907 return 0;
1908 } 1908 }
1909 1909
1910 #endif /* CONFIG_FILE_LOCKING */ 1910 #endif /* CONFIG_FILE_LOCKING */
1911 1911
1912 /* fs/open.c */ 1912 /* fs/open.c */
1913 1913
1914 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, 1914 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
1915 struct file *filp); 1915 struct file *filp);
1916 extern int do_fallocate(struct file *file, int mode, loff_t offset, 1916 extern int do_fallocate(struct file *file, int mode, loff_t offset,
1917 loff_t len); 1917 loff_t len);
1918 extern long do_sys_open(int dfd, const char __user *filename, int flags, 1918 extern long do_sys_open(int dfd, const char __user *filename, int flags,
1919 int mode); 1919 int mode);
1920 extern struct file *filp_open(const char *, int, int); 1920 extern struct file *filp_open(const char *, int, int);
1921 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, 1921 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
1922 const struct cred *); 1922 const struct cred *);
1923 extern int filp_close(struct file *, fl_owner_t id); 1923 extern int filp_close(struct file *, fl_owner_t id);
1924 extern char * getname(const char __user *); 1924 extern char * getname(const char __user *);
1925 1925
1926 /* fs/ioctl.c */ 1926 /* fs/ioctl.c */
1927 1927
1928 extern int ioctl_preallocate(struct file *filp, void __user *argp); 1928 extern int ioctl_preallocate(struct file *filp, void __user *argp);
1929 1929
1930 /* fs/dcache.c */ 1930 /* fs/dcache.c */
1931 extern void __init vfs_caches_init_early(void); 1931 extern void __init vfs_caches_init_early(void);
1932 extern void __init vfs_caches_init(unsigned long); 1932 extern void __init vfs_caches_init(unsigned long);
1933 1933
1934 extern struct kmem_cache *names_cachep; 1934 extern struct kmem_cache *names_cachep;
1935 1935
1936 #define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp)) 1936 #define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp))
1937 #define __getname() __getname_gfp(GFP_KERNEL) 1937 #define __getname() __getname_gfp(GFP_KERNEL)
1938 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) 1938 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
1939 #ifndef CONFIG_AUDITSYSCALL 1939 #ifndef CONFIG_AUDITSYSCALL
1940 #define putname(name) __putname(name) 1940 #define putname(name) __putname(name)
1941 #else 1941 #else
1942 extern void putname(const char *name); 1942 extern void putname(const char *name);
1943 #endif 1943 #endif
1944 1944
1945 #ifdef CONFIG_BLOCK 1945 #ifdef CONFIG_BLOCK
1946 extern int register_blkdev(unsigned int, const char *); 1946 extern int register_blkdev(unsigned int, const char *);
1947 extern void unregister_blkdev(unsigned int, const char *); 1947 extern void unregister_blkdev(unsigned int, const char *);
1948 extern struct block_device *bdget(dev_t); 1948 extern struct block_device *bdget(dev_t);
1949 extern struct block_device *bdgrab(struct block_device *bdev);
1949 extern void bd_set_size(struct block_device *, loff_t size); 1950 extern void bd_set_size(struct block_device *, loff_t size);
1950 extern void bd_forget(struct inode *inode); 1951 extern void bd_forget(struct inode *inode);
1951 extern void bdput(struct block_device *); 1952 extern void bdput(struct block_device *);
1952 extern struct block_device *open_by_devnum(dev_t, fmode_t); 1953 extern struct block_device *open_by_devnum(dev_t, fmode_t);
1953 extern void invalidate_bdev(struct block_device *); 1954 extern void invalidate_bdev(struct block_device *);
1954 extern int sync_blockdev(struct block_device *bdev); 1955 extern int sync_blockdev(struct block_device *bdev);
1955 extern struct super_block *freeze_bdev(struct block_device *); 1956 extern struct super_block *freeze_bdev(struct block_device *);
1956 extern void emergency_thaw_all(void); 1957 extern void emergency_thaw_all(void);
1957 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); 1958 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
1958 extern int fsync_bdev(struct block_device *); 1959 extern int fsync_bdev(struct block_device *);
1959 #else 1960 #else
1960 static inline void bd_forget(struct inode *inode) {} 1961 static inline void bd_forget(struct inode *inode) {}
1961 static inline int sync_blockdev(struct block_device *bdev) { return 0; } 1962 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
1962 static inline void invalidate_bdev(struct block_device *bdev) {} 1963 static inline void invalidate_bdev(struct block_device *bdev) {}
1963 1964
1964 static inline struct super_block *freeze_bdev(struct block_device *sb) 1965 static inline struct super_block *freeze_bdev(struct block_device *sb)
1965 { 1966 {
1966 return NULL; 1967 return NULL;
1967 } 1968 }
1968 1969
1969 static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) 1970 static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb)
1970 { 1971 {
1971 return 0; 1972 return 0;
1972 } 1973 }
1973 #endif 1974 #endif
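The bdgrab() declaration added above is the header half of this commit. Per the commit message it only copies a reference the caller already holds, so, unlike bdget(), it cannot sleep and is safe under a spinlock. A minimal sketch of both sides, assuming the atomic_inc of i_count named in the commit title and the swap_info_struct naming used in mm/swapfile.c later in this diff:

struct block_device *bdgrab(struct block_device *bdev)
{
	/* Caller already holds a reference, so a plain refcount
	 * bump suffices; no locks taken, no sleeping. */
	atomic_inc(&bdev->bd_inode->i_count);
	return bdev;
}

/* Caller side, e.g. looking up the active swap device for
 * hibernation while swap_lock (a spinlock) is held: */
spin_lock(&swap_lock);
*bdev_p = bdgrab(sis->bdev);	/* was: bdget(sis->bdev->bd_dev) */
spin_unlock(&swap_lock);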
1974 extern int sync_filesystem(struct super_block *); 1975 extern int sync_filesystem(struct super_block *);
1975 extern const struct file_operations def_blk_fops; 1976 extern const struct file_operations def_blk_fops;
1976 extern const struct file_operations def_chr_fops; 1977 extern const struct file_operations def_chr_fops;
1977 extern const struct file_operations bad_sock_fops; 1978 extern const struct file_operations bad_sock_fops;
1978 extern const struct file_operations def_fifo_fops; 1979 extern const struct file_operations def_fifo_fops;
1979 #ifdef CONFIG_BLOCK 1980 #ifdef CONFIG_BLOCK
1980 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); 1981 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
1981 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); 1982 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
1982 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); 1983 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
1983 extern int blkdev_get(struct block_device *, fmode_t); 1984 extern int blkdev_get(struct block_device *, fmode_t);
1984 extern int blkdev_put(struct block_device *, fmode_t); 1985 extern int blkdev_put(struct block_device *, fmode_t);
1985 extern int bd_claim(struct block_device *, void *); 1986 extern int bd_claim(struct block_device *, void *);
1986 extern void bd_release(struct block_device *); 1987 extern void bd_release(struct block_device *);
1987 #ifdef CONFIG_SYSFS 1988 #ifdef CONFIG_SYSFS
1988 extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); 1989 extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *);
1989 extern void bd_release_from_disk(struct block_device *, struct gendisk *); 1990 extern void bd_release_from_disk(struct block_device *, struct gendisk *);
1990 #else 1991 #else
1991 #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) 1992 #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder)
1992 #define bd_release_from_disk(bdev, disk) bd_release(bdev) 1993 #define bd_release_from_disk(bdev, disk) bd_release(bdev)
1993 #endif 1994 #endif
1994 #endif 1995 #endif
1995 1996
1996 /* fs/char_dev.c */ 1997 /* fs/char_dev.c */
1997 #define CHRDEV_MAJOR_HASH_SIZE 255 1998 #define CHRDEV_MAJOR_HASH_SIZE 255
1998 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); 1999 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
1999 extern int register_chrdev_region(dev_t, unsigned, const char *); 2000 extern int register_chrdev_region(dev_t, unsigned, const char *);
2000 extern int register_chrdev(unsigned int, const char *, 2001 extern int register_chrdev(unsigned int, const char *,
2001 const struct file_operations *); 2002 const struct file_operations *);
2002 extern void unregister_chrdev(unsigned int, const char *); 2003 extern void unregister_chrdev(unsigned int, const char *);
2003 extern void unregister_chrdev_region(dev_t, unsigned); 2004 extern void unregister_chrdev_region(dev_t, unsigned);
2004 extern void chrdev_show(struct seq_file *,off_t); 2005 extern void chrdev_show(struct seq_file *,off_t);
2005 2006
2006 /* fs/block_dev.c */ 2007 /* fs/block_dev.c */
2007 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ 2008 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */
2008 #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ 2009 #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */
2009 2010
2010 #ifdef CONFIG_BLOCK 2011 #ifdef CONFIG_BLOCK
2011 #define BLKDEV_MAJOR_HASH_SIZE 255 2012 #define BLKDEV_MAJOR_HASH_SIZE 255
2012 extern const char *__bdevname(dev_t, char *buffer); 2013 extern const char *__bdevname(dev_t, char *buffer);
2013 extern const char *bdevname(struct block_device *bdev, char *buffer); 2014 extern const char *bdevname(struct block_device *bdev, char *buffer);
2014 extern struct block_device *lookup_bdev(const char *); 2015 extern struct block_device *lookup_bdev(const char *);
2015 extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); 2016 extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *);
2016 extern void close_bdev_exclusive(struct block_device *, fmode_t); 2017 extern void close_bdev_exclusive(struct block_device *, fmode_t);
2017 extern void blkdev_show(struct seq_file *,off_t); 2018 extern void blkdev_show(struct seq_file *,off_t);
2018 2019
2019 #else 2020 #else
2020 #define BLKDEV_MAJOR_HASH_SIZE 0 2021 #define BLKDEV_MAJOR_HASH_SIZE 0
2021 #endif 2022 #endif
2022 2023
2023 extern void init_special_inode(struct inode *, umode_t, dev_t); 2024 extern void init_special_inode(struct inode *, umode_t, dev_t);
2024 2025
2025 /* Invalid inode operations -- fs/bad_inode.c */ 2026 /* Invalid inode operations -- fs/bad_inode.c */
2026 extern void make_bad_inode(struct inode *); 2027 extern void make_bad_inode(struct inode *);
2027 extern int is_bad_inode(struct inode *); 2028 extern int is_bad_inode(struct inode *);
2028 2029
2029 extern const struct file_operations read_pipefifo_fops; 2030 extern const struct file_operations read_pipefifo_fops;
2030 extern const struct file_operations write_pipefifo_fops; 2031 extern const struct file_operations write_pipefifo_fops;
2031 extern const struct file_operations rdwr_pipefifo_fops; 2032 extern const struct file_operations rdwr_pipefifo_fops;
2032 2033
2033 extern int fs_may_remount_ro(struct super_block *); 2034 extern int fs_may_remount_ro(struct super_block *);
2034 2035
2035 #ifdef CONFIG_BLOCK 2036 #ifdef CONFIG_BLOCK
2036 /* 2037 /*
2037 * return READ, READA, or WRITE 2038 * return READ, READA, or WRITE
2038 */ 2039 */
2039 #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) 2040 #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK))
2040 2041
2041 /* 2042 /*
2042 * return data direction, READ or WRITE 2043 * return data direction, READ or WRITE
2043 */ 2044 */
2044 #define bio_data_dir(bio) ((bio)->bi_rw & 1) 2045 #define bio_data_dir(bio) ((bio)->bi_rw & 1)
2045 2046
2046 extern void check_disk_size_change(struct gendisk *disk, 2047 extern void check_disk_size_change(struct gendisk *disk,
2047 struct block_device *bdev); 2048 struct block_device *bdev);
2048 extern int revalidate_disk(struct gendisk *); 2049 extern int revalidate_disk(struct gendisk *);
2049 extern int check_disk_change(struct block_device *); 2050 extern int check_disk_change(struct block_device *);
2050 extern int __invalidate_device(struct block_device *); 2051 extern int __invalidate_device(struct block_device *);
2051 extern int invalidate_partition(struct gendisk *, int); 2052 extern int invalidate_partition(struct gendisk *, int);
2052 #endif 2053 #endif
2053 extern int invalidate_inodes(struct super_block *); 2054 extern int invalidate_inodes(struct super_block *);
2054 unsigned long invalidate_mapping_pages(struct address_space *mapping, 2055 unsigned long invalidate_mapping_pages(struct address_space *mapping,
2055 pgoff_t start, pgoff_t end); 2056 pgoff_t start, pgoff_t end);
2056 2057
2057 static inline unsigned long __deprecated 2058 static inline unsigned long __deprecated
2058 invalidate_inode_pages(struct address_space *mapping) 2059 invalidate_inode_pages(struct address_space *mapping)
2059 { 2060 {
2060 return invalidate_mapping_pages(mapping, 0, ~0UL); 2061 return invalidate_mapping_pages(mapping, 0, ~0UL);
2061 } 2062 }
2062 2063
2063 static inline void invalidate_remote_inode(struct inode *inode) 2064 static inline void invalidate_remote_inode(struct inode *inode)
2064 { 2065 {
2065 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2066 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2066 S_ISLNK(inode->i_mode)) 2067 S_ISLNK(inode->i_mode))
2067 invalidate_mapping_pages(inode->i_mapping, 0, -1); 2068 invalidate_mapping_pages(inode->i_mapping, 0, -1);
2068 } 2069 }
2069 extern int invalidate_inode_pages2(struct address_space *mapping); 2070 extern int invalidate_inode_pages2(struct address_space *mapping);
2070 extern int invalidate_inode_pages2_range(struct address_space *mapping, 2071 extern int invalidate_inode_pages2_range(struct address_space *mapping,
2071 pgoff_t start, pgoff_t end); 2072 pgoff_t start, pgoff_t end);
2072 extern void generic_sync_sb_inodes(struct super_block *sb, 2073 extern void generic_sync_sb_inodes(struct super_block *sb,
2073 struct writeback_control *wbc); 2074 struct writeback_control *wbc);
2074 extern int write_inode_now(struct inode *, int); 2075 extern int write_inode_now(struct inode *, int);
2075 extern int filemap_fdatawrite(struct address_space *); 2076 extern int filemap_fdatawrite(struct address_space *);
2076 extern int filemap_flush(struct address_space *); 2077 extern int filemap_flush(struct address_space *);
2077 extern int filemap_fdatawait(struct address_space *); 2078 extern int filemap_fdatawait(struct address_space *);
2078 extern int filemap_write_and_wait(struct address_space *mapping); 2079 extern int filemap_write_and_wait(struct address_space *mapping);
2079 extern int filemap_write_and_wait_range(struct address_space *mapping, 2080 extern int filemap_write_and_wait_range(struct address_space *mapping,
2080 loff_t lstart, loff_t lend); 2081 loff_t lstart, loff_t lend);
2081 extern int wait_on_page_writeback_range(struct address_space *mapping, 2082 extern int wait_on_page_writeback_range(struct address_space *mapping,
2082 pgoff_t start, pgoff_t end); 2083 pgoff_t start, pgoff_t end);
2083 extern int __filemap_fdatawrite_range(struct address_space *mapping, 2084 extern int __filemap_fdatawrite_range(struct address_space *mapping,
2084 loff_t start, loff_t end, int sync_mode); 2085 loff_t start, loff_t end, int sync_mode);
2085 extern int filemap_fdatawrite_range(struct address_space *mapping, 2086 extern int filemap_fdatawrite_range(struct address_space *mapping,
2086 loff_t start, loff_t end); 2087 loff_t start, loff_t end);
2087 2088
2088 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); 2089 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
2089 extern void sync_supers(void); 2090 extern void sync_supers(void);
2090 extern void emergency_sync(void); 2091 extern void emergency_sync(void);
2091 extern void emergency_remount(void); 2092 extern void emergency_remount(void);
2092 #ifdef CONFIG_BLOCK 2093 #ifdef CONFIG_BLOCK
2093 extern sector_t bmap(struct inode *, sector_t); 2094 extern sector_t bmap(struct inode *, sector_t);
2094 #endif 2095 #endif
2095 extern int notify_change(struct dentry *, struct iattr *); 2096 extern int notify_change(struct dentry *, struct iattr *);
2096 extern int inode_permission(struct inode *, int); 2097 extern int inode_permission(struct inode *, int);
2097 extern int generic_permission(struct inode *, int, 2098 extern int generic_permission(struct inode *, int,
2098 int (*check_acl)(struct inode *, int)); 2099 int (*check_acl)(struct inode *, int));
2099 2100
2100 static inline bool execute_ok(struct inode *inode) 2101 static inline bool execute_ok(struct inode *inode)
2101 { 2102 {
2102 return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); 2103 return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
2103 } 2104 }
2104 2105
2105 extern int get_write_access(struct inode *); 2106 extern int get_write_access(struct inode *);
2106 extern int deny_write_access(struct file *); 2107 extern int deny_write_access(struct file *);
2107 static inline void put_write_access(struct inode * inode) 2108 static inline void put_write_access(struct inode * inode)
2108 { 2109 {
2109 atomic_dec(&inode->i_writecount); 2110 atomic_dec(&inode->i_writecount);
2110 } 2111 }
2111 static inline void allow_write_access(struct file *file) 2112 static inline void allow_write_access(struct file *file)
2112 { 2113 {
2113 if (file) 2114 if (file)
2114 atomic_inc(&file->f_path.dentry->d_inode->i_writecount); 2115 atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
2115 } 2116 }
2116 extern int do_pipe_flags(int *, int); 2117 extern int do_pipe_flags(int *, int);
2117 extern struct file *create_read_pipe(struct file *f, int flags); 2118 extern struct file *create_read_pipe(struct file *f, int flags);
2118 extern struct file *create_write_pipe(int flags); 2119 extern struct file *create_write_pipe(int flags);
2119 extern void free_write_pipe(struct file *); 2120 extern void free_write_pipe(struct file *);
2120 2121
2121 extern struct file *do_filp_open(int dfd, const char *pathname, 2122 extern struct file *do_filp_open(int dfd, const char *pathname,
2122 int open_flag, int mode, int acc_mode); 2123 int open_flag, int mode, int acc_mode);
2123 extern int may_open(struct path *, int, int); 2124 extern int may_open(struct path *, int, int);
2124 2125
2125 extern int kernel_read(struct file *, unsigned long, char *, unsigned long); 2126 extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
2126 extern struct file * open_exec(const char *); 2127 extern struct file * open_exec(const char *);
2127 2128
2128 /* fs/dcache.c -- generic fs support functions */ 2129 /* fs/dcache.c -- generic fs support functions */
2129 extern int is_subdir(struct dentry *, struct dentry *); 2130 extern int is_subdir(struct dentry *, struct dentry *);
2130 extern ino_t find_inode_number(struct dentry *, struct qstr *); 2131 extern ino_t find_inode_number(struct dentry *, struct qstr *);
2131 2132
2132 #include <linux/err.h> 2133 #include <linux/err.h>
2133 2134
2134 /* needed for stackable file system support */ 2135 /* needed for stackable file system support */
2135 extern loff_t default_llseek(struct file *file, loff_t offset, int origin); 2136 extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
2136 2137
2137 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); 2138 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
2138 2139
2139 extern struct inode * inode_init_always(struct super_block *, struct inode *); 2140 extern struct inode * inode_init_always(struct super_block *, struct inode *);
2140 extern void inode_init_once(struct inode *); 2141 extern void inode_init_once(struct inode *);
2141 extern void inode_add_to_lists(struct super_block *, struct inode *); 2142 extern void inode_add_to_lists(struct super_block *, struct inode *);
2142 extern void iput(struct inode *); 2143 extern void iput(struct inode *);
2143 extern struct inode * igrab(struct inode *); 2144 extern struct inode * igrab(struct inode *);
2144 extern ino_t iunique(struct super_block *, ino_t); 2145 extern ino_t iunique(struct super_block *, ino_t);
2145 extern int inode_needs_sync(struct inode *inode); 2146 extern int inode_needs_sync(struct inode *inode);
2146 extern void generic_delete_inode(struct inode *inode); 2147 extern void generic_delete_inode(struct inode *inode);
2147 extern void generic_drop_inode(struct inode *inode); 2148 extern void generic_drop_inode(struct inode *inode);
2148 2149
2149 extern struct inode *ilookup5_nowait(struct super_block *sb, 2150 extern struct inode *ilookup5_nowait(struct super_block *sb,
2150 unsigned long hashval, int (*test)(struct inode *, void *), 2151 unsigned long hashval, int (*test)(struct inode *, void *),
2151 void *data); 2152 void *data);
2152 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 2153 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
2153 int (*test)(struct inode *, void *), void *data); 2154 int (*test)(struct inode *, void *), void *data);
2154 extern struct inode *ilookup(struct super_block *sb, unsigned long ino); 2155 extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
2155 2156
2156 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); 2157 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
2157 extern struct inode * iget_locked(struct super_block *, unsigned long); 2158 extern struct inode * iget_locked(struct super_block *, unsigned long);
2158 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); 2159 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
2159 extern int insert_inode_locked(struct inode *); 2160 extern int insert_inode_locked(struct inode *);
2160 extern void unlock_new_inode(struct inode *); 2161 extern void unlock_new_inode(struct inode *);
2161 2162
2162 extern void __iget(struct inode * inode); 2163 extern void __iget(struct inode * inode);
2163 extern void iget_failed(struct inode *); 2164 extern void iget_failed(struct inode *);
2164 extern void clear_inode(struct inode *); 2165 extern void clear_inode(struct inode *);
2165 extern void destroy_inode(struct inode *); 2166 extern void destroy_inode(struct inode *);
2166 extern struct inode *new_inode(struct super_block *); 2167 extern struct inode *new_inode(struct super_block *);
2167 extern int should_remove_suid(struct dentry *); 2168 extern int should_remove_suid(struct dentry *);
2168 extern int file_remove_suid(struct file *); 2169 extern int file_remove_suid(struct file *);
2169 2170
2170 extern void __insert_inode_hash(struct inode *, unsigned long hashval); 2171 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
2171 extern void remove_inode_hash(struct inode *); 2172 extern void remove_inode_hash(struct inode *);
2172 static inline void insert_inode_hash(struct inode *inode) { 2173 static inline void insert_inode_hash(struct inode *inode) {
2173 __insert_inode_hash(inode, inode->i_ino); 2174 __insert_inode_hash(inode, inode->i_ino);
2174 } 2175 }
2175 2176
2176 extern struct file * get_empty_filp(void); 2177 extern struct file * get_empty_filp(void);
2177 extern void file_move(struct file *f, struct list_head *list); 2178 extern void file_move(struct file *f, struct list_head *list);
2178 extern void file_kill(struct file *f); 2179 extern void file_kill(struct file *f);
2179 #ifdef CONFIG_BLOCK 2180 #ifdef CONFIG_BLOCK
2180 struct bio; 2181 struct bio;
2181 extern void submit_bio(int, struct bio *); 2182 extern void submit_bio(int, struct bio *);
2182 extern int bdev_read_only(struct block_device *); 2183 extern int bdev_read_only(struct block_device *);
2183 #endif 2184 #endif
2184 extern int set_blocksize(struct block_device *, int); 2185 extern int set_blocksize(struct block_device *, int);
2185 extern int sb_set_blocksize(struct super_block *, int); 2186 extern int sb_set_blocksize(struct super_block *, int);
2186 extern int sb_min_blocksize(struct super_block *, int); 2187 extern int sb_min_blocksize(struct super_block *, int);
2187 extern int sb_has_dirty_inodes(struct super_block *); 2188 extern int sb_has_dirty_inodes(struct super_block *);
2188 2189
2189 extern int generic_file_mmap(struct file *, struct vm_area_struct *); 2190 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
2190 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); 2191 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
2191 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); 2192 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
2192 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); 2193 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
2193 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); 2194 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
2194 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); 2195 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
2195 extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, 2196 extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *,
2196 unsigned long, loff_t); 2197 unsigned long, loff_t);
2197 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, 2198 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
2198 unsigned long *, loff_t, loff_t *, size_t, size_t); 2199 unsigned long *, loff_t, loff_t *, size_t, size_t);
2199 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, 2200 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
2200 unsigned long, loff_t, loff_t *, size_t, ssize_t); 2201 unsigned long, loff_t, loff_t *, size_t, ssize_t);
2201 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); 2202 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
2202 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); 2203 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
2203 extern int generic_segment_checks(const struct iovec *iov, 2204 extern int generic_segment_checks(const struct iovec *iov,
2204 unsigned long *nr_segs, size_t *count, int access_flags); 2205 unsigned long *nr_segs, size_t *count, int access_flags);
2205 2206
2206 /* fs/splice.c */ 2207 /* fs/splice.c */
2207 extern ssize_t generic_file_splice_read(struct file *, loff_t *, 2208 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
2208 struct pipe_inode_info *, size_t, unsigned int); 2209 struct pipe_inode_info *, size_t, unsigned int);
2209 extern ssize_t default_file_splice_read(struct file *, loff_t *, 2210 extern ssize_t default_file_splice_read(struct file *, loff_t *,
2210 struct pipe_inode_info *, size_t, unsigned int); 2211 struct pipe_inode_info *, size_t, unsigned int);
2211 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, 2212 extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
2212 struct file *, loff_t *, size_t, unsigned int); 2213 struct file *, loff_t *, size_t, unsigned int);
2213 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 2214 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
2214 struct file *out, loff_t *, size_t len, unsigned int flags); 2215 struct file *out, loff_t *, size_t len, unsigned int flags);
2215 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 2216 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
2216 size_t len, unsigned int flags); 2217 size_t len, unsigned int flags);
2217 2218
2218 extern void 2219 extern void
2219 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); 2220 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
2220 extern loff_t no_llseek(struct file *file, loff_t offset, int origin); 2221 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
2221 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); 2222 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
2222 extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, 2223 extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset,
2223 int origin); 2224 int origin);
2224 extern int generic_file_open(struct inode * inode, struct file * filp); 2225 extern int generic_file_open(struct inode * inode, struct file * filp);
2225 extern int nonseekable_open(struct inode * inode, struct file * filp); 2226 extern int nonseekable_open(struct inode * inode, struct file * filp);
2226 2227
2227 #ifdef CONFIG_FS_XIP 2228 #ifdef CONFIG_FS_XIP
2228 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, 2229 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
2229 loff_t *ppos); 2230 loff_t *ppos);
2230 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); 2231 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
2231 extern ssize_t xip_file_write(struct file *filp, const char __user *buf, 2232 extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
2232 size_t len, loff_t *ppos); 2233 size_t len, loff_t *ppos);
2233 extern int xip_truncate_page(struct address_space *mapping, loff_t from); 2234 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
2234 #else 2235 #else
2235 static inline int xip_truncate_page(struct address_space *mapping, loff_t from) 2236 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
2236 { 2237 {
2237 return 0; 2238 return 0;
2238 } 2239 }
2239 #endif 2240 #endif
2240 2241
2241 #ifdef CONFIG_BLOCK 2242 #ifdef CONFIG_BLOCK
2242 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 2243 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
2243 struct block_device *bdev, const struct iovec *iov, loff_t offset, 2244 struct block_device *bdev, const struct iovec *iov, loff_t offset,
2244 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 2245 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
2245 int lock_type); 2246 int lock_type);
2246 2247
2247 enum { 2248 enum {
2248 DIO_LOCKING = 1, /* need locking between buffered and direct access */ 2249 DIO_LOCKING = 1, /* need locking between buffered and direct access */
2249 DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ 2250 DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
2250 DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ 2251 DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
2251 }; 2252 };
2252 2253
2253 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, 2254 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
2254 struct inode *inode, struct block_device *bdev, const struct iovec *iov, 2255 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
2255 loff_t offset, unsigned long nr_segs, get_block_t get_block, 2256 loff_t offset, unsigned long nr_segs, get_block_t get_block,
2256 dio_iodone_t end_io) 2257 dio_iodone_t end_io)
2257 { 2258 {
2258 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2259 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2259 nr_segs, get_block, end_io, DIO_LOCKING); 2260 nr_segs, get_block, end_io, DIO_LOCKING);
2260 } 2261 }
2261 2262
2262 static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, 2263 static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
2263 struct inode *inode, struct block_device *bdev, const struct iovec *iov, 2264 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
2264 loff_t offset, unsigned long nr_segs, get_block_t get_block, 2265 loff_t offset, unsigned long nr_segs, get_block_t get_block,
2265 dio_iodone_t end_io) 2266 dio_iodone_t end_io)
2266 { 2267 {
2267 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2268 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2268 nr_segs, get_block, end_io, DIO_NO_LOCKING); 2269 nr_segs, get_block, end_io, DIO_NO_LOCKING);
2269 } 2270 }
2270 2271
2271 static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, 2272 static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
2272 struct inode *inode, struct block_device *bdev, const struct iovec *iov, 2273 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
2273 loff_t offset, unsigned long nr_segs, get_block_t get_block, 2274 loff_t offset, unsigned long nr_segs, get_block_t get_block,
2274 dio_iodone_t end_io) 2275 dio_iodone_t end_io)
2275 { 2276 {
2276 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2277 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2277 nr_segs, get_block, end_io, DIO_OWN_LOCKING); 2278 nr_segs, get_block, end_io, DIO_OWN_LOCKING);
2278 } 2279 }
2279 #endif 2280 #endif
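For context, a filesystem wires one of the wrappers above into its address_space_operations ->direct_IO method, picking the lock_type that matches how it serializes buffered and direct I/O. A hedged sketch with hypothetical names (myfs_direct_IO, myfs_get_block):

static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
		const struct iovec *iov, loff_t offset,
		unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	/* DIO_LOCKING: let the generic code serialize against
	 * buffered I/O on this inode. */
	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
				iov, offset, nr_segs, myfs_get_block, NULL);
}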
2280 2281
2281 extern const struct file_operations generic_ro_fops; 2282 extern const struct file_operations generic_ro_fops;
2282 2283
2283 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) 2284 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
2284 2285
2285 extern int vfs_readlink(struct dentry *, char __user *, int, const char *); 2286 extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
2286 extern int vfs_follow_link(struct nameidata *, const char *); 2287 extern int vfs_follow_link(struct nameidata *, const char *);
2287 extern int page_readlink(struct dentry *, char __user *, int); 2288 extern int page_readlink(struct dentry *, char __user *, int);
2288 extern void *page_follow_link_light(struct dentry *, struct nameidata *); 2289 extern void *page_follow_link_light(struct dentry *, struct nameidata *);
2289 extern void page_put_link(struct dentry *, struct nameidata *, void *); 2290 extern void page_put_link(struct dentry *, struct nameidata *, void *);
2290 extern int __page_symlink(struct inode *inode, const char *symname, int len, 2291 extern int __page_symlink(struct inode *inode, const char *symname, int len,
2291 int nofs); 2292 int nofs);
2292 extern int page_symlink(struct inode *inode, const char *symname, int len); 2293 extern int page_symlink(struct inode *inode, const char *symname, int len);
2293 extern const struct inode_operations page_symlink_inode_operations; 2294 extern const struct inode_operations page_symlink_inode_operations;
2294 extern int generic_readlink(struct dentry *, char __user *, int); 2295 extern int generic_readlink(struct dentry *, char __user *, int);
2295 extern void generic_fillattr(struct inode *, struct kstat *); 2296 extern void generic_fillattr(struct inode *, struct kstat *);
2296 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 2297 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
2297 void inode_add_bytes(struct inode *inode, loff_t bytes); 2298 void inode_add_bytes(struct inode *inode, loff_t bytes);
2298 void inode_sub_bytes(struct inode *inode, loff_t bytes); 2299 void inode_sub_bytes(struct inode *inode, loff_t bytes);
2299 loff_t inode_get_bytes(struct inode *inode); 2300 loff_t inode_get_bytes(struct inode *inode);
2300 void inode_set_bytes(struct inode *inode, loff_t bytes); 2301 void inode_set_bytes(struct inode *inode, loff_t bytes);
2301 2302
2302 extern int vfs_readdir(struct file *, filldir_t, void *); 2303 extern int vfs_readdir(struct file *, filldir_t, void *);
2303 2304
2304 extern int vfs_stat(char __user *, struct kstat *); 2305 extern int vfs_stat(char __user *, struct kstat *);
2305 extern int vfs_lstat(char __user *, struct kstat *); 2306 extern int vfs_lstat(char __user *, struct kstat *);
2306 extern int vfs_fstat(unsigned int, struct kstat *); 2307 extern int vfs_fstat(unsigned int, struct kstat *);
2307 extern int vfs_fstatat(int , char __user *, struct kstat *, int); 2308 extern int vfs_fstatat(int , char __user *, struct kstat *, int);
2308 2309
2309 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, 2310 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
2310 unsigned long arg); 2311 unsigned long arg);
2311 extern int __generic_block_fiemap(struct inode *inode, 2312 extern int __generic_block_fiemap(struct inode *inode,
2312 struct fiemap_extent_info *fieinfo, u64 start, 2313 struct fiemap_extent_info *fieinfo, u64 start,
2313 u64 len, get_block_t *get_block); 2314 u64 len, get_block_t *get_block);
2314 extern int generic_block_fiemap(struct inode *inode, 2315 extern int generic_block_fiemap(struct inode *inode,
2315 struct fiemap_extent_info *fieinfo, u64 start, 2316 struct fiemap_extent_info *fieinfo, u64 start,
2316 u64 len, get_block_t *get_block); 2317 u64 len, get_block_t *get_block);
2317 2318
2318 extern void get_filesystem(struct file_system_type *fs); 2319 extern void get_filesystem(struct file_system_type *fs);
2319 extern void put_filesystem(struct file_system_type *fs); 2320 extern void put_filesystem(struct file_system_type *fs);
2320 extern struct file_system_type *get_fs_type(const char *name); 2321 extern struct file_system_type *get_fs_type(const char *name);
2321 extern struct super_block *get_super(struct block_device *); 2322 extern struct super_block *get_super(struct block_device *);
2322 extern struct super_block *user_get_super(dev_t); 2323 extern struct super_block *user_get_super(dev_t);
2323 extern void drop_super(struct super_block *sb); 2324 extern void drop_super(struct super_block *sb);
2324 2325
2325 extern int dcache_dir_open(struct inode *, struct file *); 2326 extern int dcache_dir_open(struct inode *, struct file *);
2326 extern int dcache_dir_close(struct inode *, struct file *); 2327 extern int dcache_dir_close(struct inode *, struct file *);
2327 extern loff_t dcache_dir_lseek(struct file *, loff_t, int); 2328 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
2328 extern int dcache_readdir(struct file *, void *, filldir_t); 2329 extern int dcache_readdir(struct file *, void *, filldir_t);
2329 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); 2330 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
2330 extern int simple_statfs(struct dentry *, struct kstatfs *); 2331 extern int simple_statfs(struct dentry *, struct kstatfs *);
2331 extern int simple_link(struct dentry *, struct inode *, struct dentry *); 2332 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
2332 extern int simple_unlink(struct inode *, struct dentry *); 2333 extern int simple_unlink(struct inode *, struct dentry *);
2333 extern int simple_rmdir(struct inode *, struct dentry *); 2334 extern int simple_rmdir(struct inode *, struct dentry *);
2334 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); 2335 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
2335 extern int simple_sync_file(struct file *, struct dentry *, int); 2336 extern int simple_sync_file(struct file *, struct dentry *, int);
2336 extern int simple_empty(struct dentry *); 2337 extern int simple_empty(struct dentry *);
2337 extern int simple_readpage(struct file *file, struct page *page); 2338 extern int simple_readpage(struct file *file, struct page *page);
2338 extern int simple_prepare_write(struct file *file, struct page *page, 2339 extern int simple_prepare_write(struct file *file, struct page *page,
2339 unsigned offset, unsigned to); 2340 unsigned offset, unsigned to);
2340 extern int simple_write_begin(struct file *file, struct address_space *mapping, 2341 extern int simple_write_begin(struct file *file, struct address_space *mapping,
2341 loff_t pos, unsigned len, unsigned flags, 2342 loff_t pos, unsigned len, unsigned flags,
2342 struct page **pagep, void **fsdata); 2343 struct page **pagep, void **fsdata);
2343 extern int simple_write_end(struct file *file, struct address_space *mapping, 2344 extern int simple_write_end(struct file *file, struct address_space *mapping,
2344 loff_t pos, unsigned len, unsigned copied, 2345 loff_t pos, unsigned len, unsigned copied,
2345 struct page *page, void *fsdata); 2346 struct page *page, void *fsdata);
2346 2347
2347 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); 2348 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
2348 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); 2349 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
2349 extern const struct file_operations simple_dir_operations; 2350 extern const struct file_operations simple_dir_operations;
2350 extern const struct inode_operations simple_dir_inode_operations; 2351 extern const struct inode_operations simple_dir_inode_operations;
2351 struct tree_descr { char *name; const struct file_operations *ops; int mode; }; 2352 struct tree_descr { char *name; const struct file_operations *ops; int mode; };
2352 struct dentry *d_alloc_name(struct dentry *, const char *); 2353 struct dentry *d_alloc_name(struct dentry *, const char *);
2353 extern int simple_fill_super(struct super_block *, int, struct tree_descr *); 2354 extern int simple_fill_super(struct super_block *, int, struct tree_descr *);
2354 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); 2355 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
2355 extern void simple_release_fs(struct vfsmount **mount, int *count); 2356 extern void simple_release_fs(struct vfsmount **mount, int *count);
2356 2357
2357 extern ssize_t simple_read_from_buffer(void __user *to, size_t count, 2358 extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
2358 loff_t *ppos, const void *from, size_t available); 2359 loff_t *ppos, const void *from, size_t available);
2359 2360
2360 extern int simple_fsync(struct file *, struct dentry *, int); 2361 extern int simple_fsync(struct file *, struct dentry *, int);
2361 2362
2362 #ifdef CONFIG_MIGRATION 2363 #ifdef CONFIG_MIGRATION
2363 extern int buffer_migrate_page(struct address_space *, 2364 extern int buffer_migrate_page(struct address_space *,
2364 struct page *, struct page *); 2365 struct page *, struct page *);
2365 #else 2366 #else
2366 #define buffer_migrate_page NULL 2367 #define buffer_migrate_page NULL
2367 #endif 2368 #endif
2368 2369
2369 extern int inode_change_ok(struct inode *, struct iattr *); 2370 extern int inode_change_ok(struct inode *, struct iattr *);
2370 extern int __must_check inode_setattr(struct inode *, struct iattr *); 2371 extern int __must_check inode_setattr(struct inode *, struct iattr *);
2371 2372
2372 extern void file_update_time(struct file *file); 2373 extern void file_update_time(struct file *file);
2373 2374
2374 extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt); 2375 extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt);
2375 extern void save_mount_options(struct super_block *sb, char *options); 2376 extern void save_mount_options(struct super_block *sb, char *options);
2376 extern void replace_mount_options(struct super_block *sb, char *options); 2377 extern void replace_mount_options(struct super_block *sb, char *options);
2377 2378
2378 static inline ino_t parent_ino(struct dentry *dentry) 2379 static inline ino_t parent_ino(struct dentry *dentry)
2379 { 2380 {
2380 ino_t res; 2381 ino_t res;
2381 2382
2382 spin_lock(&dentry->d_lock); 2383 spin_lock(&dentry->d_lock);
2383 res = dentry->d_parent->d_inode->i_ino; 2384 res = dentry->d_parent->d_inode->i_ino;
2384 spin_unlock(&dentry->d_lock); 2385 spin_unlock(&dentry->d_lock);
2385 return res; 2386 return res;
2386 } 2387 }
2387 2388
2388 /* Transaction based IO helpers */ 2389 /* Transaction based IO helpers */
2389 2390
2390 /* 2391 /*
2391 * An argresp is stored in an allocated page and holds the 2392 * An argresp is stored in an allocated page and holds the
2392 * size of the argument or response, along with its content 2393 * size of the argument or response, along with its content
2393 */ 2394 */
2394 struct simple_transaction_argresp { 2395 struct simple_transaction_argresp {
2395 ssize_t size; 2396 ssize_t size;
2396 char data[0]; 2397 char data[0];
2397 }; 2398 };
2398 2399
2399 #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) 2400 #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))
2400 2401
2401 char *simple_transaction_get(struct file *file, const char __user *buf, 2402 char *simple_transaction_get(struct file *file, const char __user *buf,
2402 size_t size); 2403 size_t size);
2403 ssize_t simple_transaction_read(struct file *file, char __user *buf, 2404 ssize_t simple_transaction_read(struct file *file, char __user *buf,
2404 size_t size, loff_t *pos); 2405 size_t size, loff_t *pos);
2405 int simple_transaction_release(struct inode *inode, struct file *file); 2406 int simple_transaction_release(struct inode *inode, struct file *file);
2406 2407
2407 void simple_transaction_set(struct file *file, size_t n); 2408 void simple_transaction_set(struct file *file, size_t n);
2408 2409
2409 /* 2410 /*
2410 * simple attribute files 2411 * simple attribute files
2411 * 2412 *
2412 * These attributes behave similarly to those in sysfs: 2413 * These attributes behave similarly to those in sysfs:
2413 * 2414 *
2414 * Writing to an attribute immediately sets a value; an open file can be 2415 * Writing to an attribute immediately sets a value; an open file can be
2415 * written to multiple times. 2416 * written to multiple times.
2416 * 2417 *
2417 * Reading from an attribute creates a buffer from the value that might get 2418 * Reading from an attribute creates a buffer from the value that might get
2418 * read with multiple read calls. When the attribute has been read 2419 * read with multiple read calls. When the attribute has been read
2419 * completely, no further read calls are possible until the file is opened 2420 * completely, no further read calls are possible until the file is opened
2420 * again. 2421 * again.
2421 * 2422 *
2422 * All attributes contain a text representation of a numeric value 2423 * All attributes contain a text representation of a numeric value
2423 * that is accessed with the get() and set() functions. 2424 * that is accessed with the get() and set() functions.
2424 */ 2425 */
2425 #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ 2426 #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \
2426 static int __fops ## _open(struct inode *inode, struct file *file) \ 2427 static int __fops ## _open(struct inode *inode, struct file *file) \
2427 { \ 2428 { \
2428 __simple_attr_check_format(__fmt, 0ull); \ 2429 __simple_attr_check_format(__fmt, 0ull); \
2429 return simple_attr_open(inode, file, __get, __set, __fmt); \ 2430 return simple_attr_open(inode, file, __get, __set, __fmt); \
2430 } \ 2431 } \
2431 static struct file_operations __fops = { \ 2432 static struct file_operations __fops = { \
2432 .owner = THIS_MODULE, \ 2433 .owner = THIS_MODULE, \
2433 .open = __fops ## _open, \ 2434 .open = __fops ## _open, \
2434 .release = simple_attr_release, \ 2435 .release = simple_attr_release, \
2435 .read = simple_attr_read, \ 2436 .read = simple_attr_read, \
2436 .write = simple_attr_write, \ 2437 .write = simple_attr_write, \
2437 }; 2438 };
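A hedged usage sketch of the macro above, exporting one u64 through a debugfs-style file (all myval names are hypothetical):

static u64 myval;

static int myval_get(void *data, u64 *val)
{
	*val = myval;
	return 0;
}

static int myval_set(void *data, u64 val)
{
	myval = val;
	return 0;
}

/* Expands to myval_fops_open() plus a file_operations struct
 * whose read/write print and parse "%llu\n". */
DEFINE_SIMPLE_ATTRIBUTE(myval_fops, myval_get, myval_set, "%llu\n");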
2438 2439
2439 static inline void __attribute__((format(printf, 1, 2))) 2440 static inline void __attribute__((format(printf, 1, 2)))
2440 __simple_attr_check_format(const char *fmt, ...) 2441 __simple_attr_check_format(const char *fmt, ...)
2441 { 2442 {
2442 /* don't do anything, just let the compiler check the arguments; */ 2443 /* don't do anything, just let the compiler check the arguments; */
2443 } 2444 }
2444 2445
2445 int simple_attr_open(struct inode *inode, struct file *file, 2446 int simple_attr_open(struct inode *inode, struct file *file,
2446 int (*get)(void *, u64 *), int (*set)(void *, u64), 2447 int (*get)(void *, u64 *), int (*set)(void *, u64),
2447 const char *fmt); 2448 const char *fmt);
2448 int simple_attr_release(struct inode *inode, struct file *file); 2449 int simple_attr_release(struct inode *inode, struct file *file);
2449 ssize_t simple_attr_read(struct file *file, char __user *buf, 2450 ssize_t simple_attr_read(struct file *file, char __user *buf,
2450 size_t len, loff_t *ppos); 2451 size_t len, loff_t *ppos);
2451 ssize_t simple_attr_write(struct file *file, const char __user *buf, 2452 ssize_t simple_attr_write(struct file *file, const char __user *buf,
2452 size_t len, loff_t *ppos); 2453 size_t len, loff_t *ppos);
2453 2454
2454 struct ctl_table; 2455 struct ctl_table;
2455 int proc_nr_files(struct ctl_table *table, int write, struct file *filp, 2456 int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
2456 void __user *buffer, size_t *lenp, loff_t *ppos); 2457 void __user *buffer, size_t *lenp, loff_t *ppos);
2457 2458
2458 int __init get_filesystem_list(char *buf); 2459 int __init get_filesystem_list(char *buf);
2459 2460
2460 #endif /* __KERNEL__ */ 2461 #endif /* __KERNEL__ */
2461 #endif /* _LINUX_FS_H */ 2462 #endif /* _LINUX_FS_H */
2462 2463
1 /* 1 /*
2 * linux/mm/swapfile.c 2 * linux/mm/swapfile.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie 5 * Swap reorganised 29.12.95, Stephen Tweedie
6 */ 6 */
7 7
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/hugetlb.h> 9 #include <linux/hugetlb.h>
10 #include <linux/mman.h> 10 #include <linux/mman.h>
11 #include <linux/slab.h> 11 #include <linux/slab.h>
12 #include <linux/kernel_stat.h> 12 #include <linux/kernel_stat.h>
13 #include <linux/swap.h> 13 #include <linux/swap.h>
14 #include <linux/vmalloc.h> 14 #include <linux/vmalloc.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/namei.h> 16 #include <linux/namei.h>
17 #include <linux/shm.h> 17 #include <linux/shm.h>
18 #include <linux/blkdev.h> 18 #include <linux/blkdev.h>
19 #include <linux/random.h> 19 #include <linux/random.h>
20 #include <linux/writeback.h> 20 #include <linux/writeback.h>
21 #include <linux/proc_fs.h> 21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h> 22 #include <linux/seq_file.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/module.h> 24 #include <linux/module.h>
25 #include <linux/rmap.h> 25 #include <linux/rmap.h>
26 #include <linux/security.h> 26 #include <linux/security.h>
27 #include <linux/backing-dev.h> 27 #include <linux/backing-dev.h>
28 #include <linux/mutex.h> 28 #include <linux/mutex.h>
29 #include <linux/capability.h> 29 #include <linux/capability.h>
30 #include <linux/syscalls.h> 30 #include <linux/syscalls.h>
31 #include <linux/memcontrol.h> 31 #include <linux/memcontrol.h>
32 32
33 #include <asm/pgtable.h> 33 #include <asm/pgtable.h>
34 #include <asm/tlbflush.h> 34 #include <asm/tlbflush.h>
35 #include <linux/swapops.h> 35 #include <linux/swapops.h>
36 #include <linux/page_cgroup.h> 36 #include <linux/page_cgroup.h>
37 37
38 static DEFINE_SPINLOCK(swap_lock); 38 static DEFINE_SPINLOCK(swap_lock);
39 static unsigned int nr_swapfiles; 39 static unsigned int nr_swapfiles;
40 long nr_swap_pages; 40 long nr_swap_pages;
41 long total_swap_pages; 41 long total_swap_pages;
42 static int swap_overflow; 42 static int swap_overflow;
43 static int least_priority; 43 static int least_priority;
44 44
45 static const char Bad_file[] = "Bad swap file entry "; 45 static const char Bad_file[] = "Bad swap file entry ";
46 static const char Unused_file[] = "Unused swap file entry "; 46 static const char Unused_file[] = "Unused swap file entry ";
47 static const char Bad_offset[] = "Bad swap offset entry "; 47 static const char Bad_offset[] = "Bad swap offset entry ";
48 static const char Unused_offset[] = "Unused swap offset entry "; 48 static const char Unused_offset[] = "Unused swap offset entry ";
49 49
50 static struct swap_list_t swap_list = {-1, -1}; 50 static struct swap_list_t swap_list = {-1, -1};
51 51
52 static struct swap_info_struct swap_info[MAX_SWAPFILES]; 52 static struct swap_info_struct swap_info[MAX_SWAPFILES];
53 53
54 static DEFINE_MUTEX(swapon_mutex); 54 static DEFINE_MUTEX(swapon_mutex);
55 55
56 /* For reference count accounting in swap_map */ 56 /* For reference count accounting in swap_map */
57 /* enum for swap_map[] handling. internal use only */ 57 /* enum for swap_map[] handling. internal use only */
58 enum { 58 enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */ 59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */ 60 SWAP_CACHE, /* ops for reference from swap cache */
61 }; 61 };
62 62
63 static inline int swap_count(unsigned short ent) 63 static inline int swap_count(unsigned short ent)
64 { 64 {
65 return ent & SWAP_COUNT_MASK; 65 return ent & SWAP_COUNT_MASK;
66 } 66 }
67 67
68 static inline bool swap_has_cache(unsigned short ent) 68 static inline bool swap_has_cache(unsigned short ent)
69 { 69 {
70 return !!(ent & SWAP_HAS_CACHE); 70 return !!(ent & SWAP_HAS_CACHE);
71 } 71 }
72 72
73 static inline unsigned short encode_swapmap(int count, bool has_cache) 73 static inline unsigned short encode_swapmap(int count, bool has_cache)
74 { 74 {
75 unsigned short ret = count; 75 unsigned short ret = count;
76 76
77 if (has_cache) 77 if (has_cache)
78 return SWAP_HAS_CACHE | ret; 78 return SWAP_HAS_CACHE | ret;
79 return ret; 79 return ret;
80 } 80 }
81 81
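The three helpers above pack a reference count and a swap-cache flag into one unsigned short swap_map entry. A standalone userspace sketch of the round-trip; the two mask values are assumptions standing in for the real definitions in <linux/swap.h>:

#include <assert.h>
#include <stdbool.h>

#define SWAP_HAS_CACHE	0x8000	/* assumed flag bit; see <linux/swap.h> */
#define SWAP_COUNT_MASK	0x7fff	/* assumed count mask; see <linux/swap.h> */

static unsigned short encode_swapmap(int count, bool has_cache)
{
	unsigned short ret = count;

	if (has_cache)
		return SWAP_HAS_CACHE | ret;
	return ret;
}

int main(void)
{
	unsigned short ent = encode_swapmap(3, true);

	assert((ent & SWAP_COUNT_MASK) == 3);	/* swap_count(ent) */
	assert(ent & SWAP_HAS_CACHE);		/* swap_has_cache(ent) */
	return 0;
}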
82 /* returns 1 if the swap entry is freed */ 82 /* returns 1 if the swap entry is freed */
83 static int 83 static int
84 __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) 84 __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85 { 85 {
86 int type = si - swap_info; 86 int type = si - swap_info;
87 swp_entry_t entry = swp_entry(type, offset); 87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page; 88 struct page *page;
89 int ret = 0; 89 int ret = 0;
90 90
91 page = find_get_page(&swapper_space, entry.val); 91 page = find_get_page(&swapper_space, entry.val);
92 if (!page) 92 if (!page)
93 return 0; 93 return 0;
94 /* 94 /*
95 * This function is called from scan_swap_map(), which is called 95 * This function is called from scan_swap_map(), which is called
96 * by vmscan.c when reclaiming pages, so a lock may already be 96 * by vmscan.c when reclaiming pages, so a lock may already be
97 * held on the page here. We have to use trylock to avoid 97 * held on the page here. We have to use trylock to avoid
98 * deadlock; this is a special case, and in normal operation you 98 * deadlock; this is a special case, and in normal operation you
99 * should use try_to_free_swap() with an explicit lock_page(). 99 * should use try_to_free_swap() with an explicit lock_page().
100 */ 100 */
101 if (trylock_page(page)) { 101 if (trylock_page(page)) {
102 ret = try_to_free_swap(page); 102 ret = try_to_free_swap(page);
103 unlock_page(page); 103 unlock_page(page);
104 } 104 }
105 page_cache_release(page); 105 page_cache_release(page);
106 return ret; 106 return ret;
107 } 107 }
108 108
109 /* 109 /*
110 * We need this because the bdev->unplug_fn can sleep and we cannot 110 * We need this because the bdev->unplug_fn can sleep and we cannot
111 * hold swap_lock while calling the unplug_fn. And swap_lock 111 * hold swap_lock while calling the unplug_fn. And swap_lock
112 * cannot be turned into a mutex. 112 * cannot be turned into a mutex.
113 */ 113 */
114 static DECLARE_RWSEM(swap_unplug_sem); 114 static DECLARE_RWSEM(swap_unplug_sem);
115 115
116 void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) 116 void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
117 { 117 {
118 swp_entry_t entry; 118 swp_entry_t entry;
119 119
120 down_read(&swap_unplug_sem); 120 down_read(&swap_unplug_sem);
121 entry.val = page_private(page); 121 entry.val = page_private(page);
122 if (PageSwapCache(page)) { 122 if (PageSwapCache(page)) {
123 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 123 struct block_device *bdev = swap_info[swp_type(entry)].bdev;
124 struct backing_dev_info *bdi; 124 struct backing_dev_info *bdi;
125 125
126 /* 126 /*
127 * If the page is removed from swapcache from under us (with a 127 * If the page is removed from swapcache from under us (with a
128 * racy try_to_unuse/swapoff) we need an additional reference 128 * racy try_to_unuse/swapoff) we need an additional reference
129 * count to avoid reading garbage from page_private(page) above. 129 * count to avoid reading garbage from page_private(page) above.
130 * If the WARN_ON triggers during a swapoff it may be this race 130 * If the WARN_ON triggers during a swapoff it may be this race
131 * condition and it's harmless. However, if it triggers without 131 * condition and it's harmless. However, if it triggers without
132 * swapoff it signals a problem. 132 * swapoff it signals a problem.
133 */ 133 */
134 WARN_ON(page_count(page) <= 1); 134 WARN_ON(page_count(page) <= 1);
135 135
136 bdi = bdev->bd_inode->i_mapping->backing_dev_info; 136 bdi = bdev->bd_inode->i_mapping->backing_dev_info;
137 blk_run_backing_dev(bdi, page); 137 blk_run_backing_dev(bdi, page);
138 } 138 }
139 up_read(&swap_unplug_sem); 139 up_read(&swap_unplug_sem);
140 } 140 }
141 141
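The semaphore pairing, for reference: unplug callers like swap_unplug_io_fn() above take swap_unplug_sem for reading and may sleep inside, while swapoff drains them with an empty write-lock/unlock pair (sys_swapoff() elsewhere in this file uses such a pair). A sketch of the pattern:

/* Reader side: many concurrent unpluggers, may sleep. */
down_read(&swap_unplug_sem);
/* ... look up the swap bdev and run its unplug function ... */
up_read(&swap_unplug_sem);

/* Writer side: swapoff waits for all in-flight readers. */
down_write(&swap_unplug_sem);
up_write(&swap_unplug_sem);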
142 /* 142 /*
143 * swapon tells the device that all the old swap contents can be discarded, 143 * swapon tells the device that all the old swap contents can be discarded,
144 * to allow the swap device to optimize its wear-levelling. 144 * to allow the swap device to optimize its wear-levelling.
145 */ 145 */
146 static int discard_swap(struct swap_info_struct *si) 146 static int discard_swap(struct swap_info_struct *si)
147 { 147 {
148 struct swap_extent *se; 148 struct swap_extent *se;
149 int err = 0; 149 int err = 0;
150 150
151 list_for_each_entry(se, &si->extent_list, list) { 151 list_for_each_entry(se, &si->extent_list, list) {
152 sector_t start_block = se->start_block << (PAGE_SHIFT - 9); 152 sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
153 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 153 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
154 154
155 if (se->start_page == 0) { 155 if (se->start_page == 0) {
156 /* Do not discard the swap header page! */ 156 /* Do not discard the swap header page! */
157 start_block += 1 << (PAGE_SHIFT - 9); 157 start_block += 1 << (PAGE_SHIFT - 9);
158 nr_blocks -= 1 << (PAGE_SHIFT - 9); 158 nr_blocks -= 1 << (PAGE_SHIFT - 9);
159 if (!nr_blocks) 159 if (!nr_blocks)
160 continue; 160 continue;
161 } 161 }
162 162
163 err = blkdev_issue_discard(si->bdev, start_block, 163 err = blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_KERNEL); 164 nr_blocks, GFP_KERNEL);
165 if (err) 165 if (err)
166 break; 166 break;
167 167
168 cond_resched(); 168 cond_resched();
169 } 169 }
170 return err; /* That will often be -EOPNOTSUPP */ 170 return err; /* That will often be -EOPNOTSUPP */
171 } 171 }
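An aside on the << (PAGE_SHIFT - 9) shifts in discard_swap(): swap extents are kept in page units, while blkdev_issue_discard() takes 512-byte sectors. A minimal userspace sketch of the conversion, assuming 4 KiB pages (PAGE_SHIFT == 12):

    #include <stdio.h>

    #define PAGE_SHIFT 12                       /* assumed: 4 KiB pages */

    int main(void)
    {
        unsigned long long start_page = 1, nr_pages = 256;

        /* one 4 KiB page spans eight 512-byte sectors */
        unsigned long long start_block = start_page << (PAGE_SHIFT - 9);
        unsigned long long nr_blocks = nr_pages << (PAGE_SHIFT - 9);

        printf("%llu %llu\n", start_block, nr_blocks);  /* prints: 8 2048 */
        return 0;
    }

The start_page == 0 special case above advances the range by one page's worth of sectors so the swap header is never discarded.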
172 172
173 /* 173 /*
174 * swap allocation tells the device that a cluster of swap can now be discarded, 174 * swap allocation tells the device that a cluster of swap can now be discarded,
175 * to allow the swap device to optimize its wear-levelling. 175 * to allow the swap device to optimize its wear-levelling.
176 */ 176 */
177 static void discard_swap_cluster(struct swap_info_struct *si, 177 static void discard_swap_cluster(struct swap_info_struct *si,
178 pgoff_t start_page, pgoff_t nr_pages) 178 pgoff_t start_page, pgoff_t nr_pages)
179 { 179 {
180 struct swap_extent *se = si->curr_swap_extent; 180 struct swap_extent *se = si->curr_swap_extent;
181 int found_extent = 0; 181 int found_extent = 0;
182 182
183 while (nr_pages) { 183 while (nr_pages) {
184 struct list_head *lh; 184 struct list_head *lh;
185 185
186 if (se->start_page <= start_page && 186 if (se->start_page <= start_page &&
187 start_page < se->start_page + se->nr_pages) { 187 start_page < se->start_page + se->nr_pages) {
188 pgoff_t offset = start_page - se->start_page; 188 pgoff_t offset = start_page - se->start_page;
189 sector_t start_block = se->start_block + offset; 189 sector_t start_block = se->start_block + offset;
190 sector_t nr_blocks = se->nr_pages - offset; 190 sector_t nr_blocks = se->nr_pages - offset;
191 191
192 if (nr_blocks > nr_pages) 192 if (nr_blocks > nr_pages)
193 nr_blocks = nr_pages; 193 nr_blocks = nr_pages;
194 start_page += nr_blocks; 194 start_page += nr_blocks;
195 nr_pages -= nr_blocks; 195 nr_pages -= nr_blocks;
196 196
197 if (!found_extent++) 197 if (!found_extent++)
198 si->curr_swap_extent = se; 198 si->curr_swap_extent = se;
199 199
200 start_block <<= PAGE_SHIFT - 9; 200 start_block <<= PAGE_SHIFT - 9;
201 nr_blocks <<= PAGE_SHIFT - 9; 201 nr_blocks <<= PAGE_SHIFT - 9;
202 if (blkdev_issue_discard(si->bdev, start_block, 202 if (blkdev_issue_discard(si->bdev, start_block,
203 nr_blocks, GFP_NOIO)) 203 nr_blocks, GFP_NOIO))
204 break; 204 break;
205 } 205 }
206 206
207 lh = se->list.next; 207 lh = se->list.next;
208 if (lh == &si->extent_list) 208 if (lh == &si->extent_list)
209 lh = lh->next; 209 lh = lh->next;
210 se = list_entry(lh, struct swap_extent, list); 210 se = list_entry(lh, struct swap_extent, list);
211 } 211 }
212 } 212 }
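The offset arithmetic in discard_swap_cluster() relies on each swap extent mapping a contiguous run of swap pages onto a contiguous run of disk blocks, so a page's block is just start_block plus its offset into the extent. A hedged sketch with made-up names and values:

    /* illustrative analogue of struct swap_extent */
    struct demo_extent {
        unsigned long start_page;
        unsigned long nr_pages;
        unsigned long long start_block;
    };

    /* caller must ensure start_page <= page < start_page + nr_pages */
    static unsigned long long demo_page_to_block(const struct demo_extent *se,
                                                 unsigned long page)
    {
        return se->start_block + (page - se->start_page);
    }

    int main(void)
    {
        struct demo_extent se = { .start_page = 100, .nr_pages = 50,
                                  .start_block = 9000 };

        return demo_page_to_block(&se, 120) == 9020 ? 0 : 1;
    }

The circular walk at the bottom of the loop (skipping the list head) advances to the next extent whenever the requested range spills past the current one.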
213 213
214 static int wait_for_discard(void *word) 214 static int wait_for_discard(void *word)
215 { 215 {
216 schedule(); 216 schedule();
217 return 0; 217 return 0;
218 } 218 }
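wait_for_discard() is the action routine handed to wait_on_bit() in scan_swap_map() below: it just reschedules until the bit is cleared. Note that wait_on_bit()/wake_up_bit() take a bit number, while si->flags stores bit masks, hence the ilog2(SWP_DISCARDING) conversion at the call sites. A sketch of that conversion; the mask value here is assumed purely for illustration:

    #define DEMO_SWP_DISCARDING (1 << 3)        /* assumed mask value */

    /* stand-in for ilog2(): index of the highest set bit */
    static int demo_ilog2(unsigned int x)
    {
        int n = -1;

        while (x) {
            x >>= 1;
            n++;
        }
        return n;
    }

    int main(void)
    {
        /* mask 0x8 maps to bit number 3, what wait_on_bit() expects */
        return demo_ilog2(DEMO_SWP_DISCARDING) == 3 ? 0 : 1;
    }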
219 219
220 #define SWAPFILE_CLUSTER 256 220 #define SWAPFILE_CLUSTER 256
221 #define LATENCY_LIMIT 256 221 #define LATENCY_LIMIT 256
222 222
223 static inline unsigned long scan_swap_map(struct swap_info_struct *si, 223 static inline unsigned long scan_swap_map(struct swap_info_struct *si,
224 int cache) 224 int cache)
225 { 225 {
226 unsigned long offset; 226 unsigned long offset;
227 unsigned long scan_base; 227 unsigned long scan_base;
228 unsigned long last_in_cluster = 0; 228 unsigned long last_in_cluster = 0;
229 int latency_ration = LATENCY_LIMIT; 229 int latency_ration = LATENCY_LIMIT;
230 int found_free_cluster = 0; 230 int found_free_cluster = 0;
231 231
232 /* 232 /*
233 * We try to cluster swap pages by allocating them sequentially 233 * We try to cluster swap pages by allocating them sequentially
234 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 234 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
235 * way, however, we resort to first-free allocation, starting 235 * way, however, we resort to first-free allocation, starting
236 * a new cluster. This prevents us from scattering swap pages 236 * a new cluster. This prevents us from scattering swap pages
237 * all over the entire swap partition, so that we reduce 237 * all over the entire swap partition, so that we reduce
238 * overall disk seek times between swap pages. -- sct 238 * overall disk seek times between swap pages. -- sct
239 * But we do now try to find an empty cluster. -Andrea 239 * But we do now try to find an empty cluster. -Andrea
240 * And we let swap pages go all over an SSD partition. Hugh 240 * And we let swap pages go all over an SSD partition. Hugh
241 */ 241 */
242 242
243 si->flags += SWP_SCANNING; 243 si->flags += SWP_SCANNING;
244 scan_base = offset = si->cluster_next; 244 scan_base = offset = si->cluster_next;
245 245
246 if (unlikely(!si->cluster_nr--)) { 246 if (unlikely(!si->cluster_nr--)) {
247 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 247 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
248 si->cluster_nr = SWAPFILE_CLUSTER - 1; 248 si->cluster_nr = SWAPFILE_CLUSTER - 1;
249 goto checks; 249 goto checks;
250 } 250 }
251 if (si->flags & SWP_DISCARDABLE) { 251 if (si->flags & SWP_DISCARDABLE) {
252 /* 252 /*
253 * Start range check on racing allocations, in case 253 * Start range check on racing allocations, in case
254 * they overlap the cluster we eventually decide on 254 * they overlap the cluster we eventually decide on
255 * (we scan without swap_lock to allow preemption). 255 * (we scan without swap_lock to allow preemption).
256 * It's hardly conceivable that cluster_nr could be 256 * It's hardly conceivable that cluster_nr could be
257 * wrapped during our scan, but don't depend on it. 257 * wrapped during our scan, but don't depend on it.
258 */ 258 */
259 if (si->lowest_alloc) 259 if (si->lowest_alloc)
260 goto checks; 260 goto checks;
261 si->lowest_alloc = si->max; 261 si->lowest_alloc = si->max;
262 si->highest_alloc = 0; 262 si->highest_alloc = 0;
263 } 263 }
264 spin_unlock(&swap_lock); 264 spin_unlock(&swap_lock);
265 265
266 /* 266 /*
267 * If seek is expensive, start searching for new cluster from 267 * If seek is expensive, start searching for new cluster from
268 * start of partition, to minimize the span of allocated swap. 268 * start of partition, to minimize the span of allocated swap.
269 * But if seek is cheap, search from our current position, so 269 * But if seek is cheap, search from our current position, so
270 * that swap is allocated from all over the partition: if the 270 * that swap is allocated from all over the partition: if the
271 * Flash Translation Layer only remaps within limited zones, 271 * Flash Translation Layer only remaps within limited zones,
272 * we don't want to wear out the first zone too quickly. 272 * we don't want to wear out the first zone too quickly.
273 */ 273 */
274 if (!(si->flags & SWP_SOLIDSTATE)) 274 if (!(si->flags & SWP_SOLIDSTATE))
275 scan_base = offset = si->lowest_bit; 275 scan_base = offset = si->lowest_bit;
276 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 276 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
277 277
278 /* Locate the first empty (unaligned) cluster */ 278 /* Locate the first empty (unaligned) cluster */
279 for (; last_in_cluster <= si->highest_bit; offset++) { 279 for (; last_in_cluster <= si->highest_bit; offset++) {
280 if (si->swap_map[offset]) 280 if (si->swap_map[offset])
281 last_in_cluster = offset + SWAPFILE_CLUSTER; 281 last_in_cluster = offset + SWAPFILE_CLUSTER;
282 else if (offset == last_in_cluster) { 282 else if (offset == last_in_cluster) {
283 spin_lock(&swap_lock); 283 spin_lock(&swap_lock);
284 offset -= SWAPFILE_CLUSTER - 1; 284 offset -= SWAPFILE_CLUSTER - 1;
285 si->cluster_next = offset; 285 si->cluster_next = offset;
286 si->cluster_nr = SWAPFILE_CLUSTER - 1; 286 si->cluster_nr = SWAPFILE_CLUSTER - 1;
287 found_free_cluster = 1; 287 found_free_cluster = 1;
288 goto checks; 288 goto checks;
289 } 289 }
290 if (unlikely(--latency_ration < 0)) { 290 if (unlikely(--latency_ration < 0)) {
291 cond_resched(); 291 cond_resched();
292 latency_ration = LATENCY_LIMIT; 292 latency_ration = LATENCY_LIMIT;
293 } 293 }
294 } 294 }
295 295
296 offset = si->lowest_bit; 296 offset = si->lowest_bit;
297 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 297 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
298 298
299 /* Locate the first empty (unaligned) cluster */ 299 /* Locate the first empty (unaligned) cluster */
300 for (; last_in_cluster < scan_base; offset++) { 300 for (; last_in_cluster < scan_base; offset++) {
301 if (si->swap_map[offset]) 301 if (si->swap_map[offset])
302 last_in_cluster = offset + SWAPFILE_CLUSTER; 302 last_in_cluster = offset + SWAPFILE_CLUSTER;
303 else if (offset == last_in_cluster) { 303 else if (offset == last_in_cluster) {
304 spin_lock(&swap_lock); 304 spin_lock(&swap_lock);
305 offset -= SWAPFILE_CLUSTER - 1; 305 offset -= SWAPFILE_CLUSTER - 1;
306 si->cluster_next = offset; 306 si->cluster_next = offset;
307 si->cluster_nr = SWAPFILE_CLUSTER - 1; 307 si->cluster_nr = SWAPFILE_CLUSTER - 1;
308 found_free_cluster = 1; 308 found_free_cluster = 1;
309 goto checks; 309 goto checks;
310 } 310 }
311 if (unlikely(--latency_ration < 0)) { 311 if (unlikely(--latency_ration < 0)) {
312 cond_resched(); 312 cond_resched();
313 latency_ration = LATENCY_LIMIT; 313 latency_ration = LATENCY_LIMIT;
314 } 314 }
315 } 315 }
316 316
317 offset = scan_base; 317 offset = scan_base;
318 spin_lock(&swap_lock); 318 spin_lock(&swap_lock);
319 si->cluster_nr = SWAPFILE_CLUSTER - 1; 319 si->cluster_nr = SWAPFILE_CLUSTER - 1;
320 si->lowest_alloc = 0; 320 si->lowest_alloc = 0;
321 } 321 }
322 322
323 checks: 323 checks:
324 if (!(si->flags & SWP_WRITEOK)) 324 if (!(si->flags & SWP_WRITEOK))
325 goto no_page; 325 goto no_page;
326 if (!si->highest_bit) 326 if (!si->highest_bit)
327 goto no_page; 327 goto no_page;
328 if (offset > si->highest_bit) 328 if (offset > si->highest_bit)
329 scan_base = offset = si->lowest_bit; 329 scan_base = offset = si->lowest_bit;
330 330
331 /* reuse swap entry of cache-only swap if not busy. */ 331 /* reuse swap entry of cache-only swap if not busy. */
332 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 332 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
333 int swap_was_freed; 333 int swap_was_freed;
334 spin_unlock(&swap_lock); 334 spin_unlock(&swap_lock);
335 swap_was_freed = __try_to_reclaim_swap(si, offset); 335 swap_was_freed = __try_to_reclaim_swap(si, offset);
336 spin_lock(&swap_lock); 336 spin_lock(&swap_lock);
337 /* entry was freed successfully, try to use this again */ 337 /* entry was freed successfully, try to use this again */
338 if (swap_was_freed) 338 if (swap_was_freed)
339 goto checks; 339 goto checks;
340 goto scan; /* check next one */ 340 goto scan; /* check next one */
341 } 341 }
342 342
343 if (si->swap_map[offset]) 343 if (si->swap_map[offset])
344 goto scan; 344 goto scan;
345 345
346 if (offset == si->lowest_bit) 346 if (offset == si->lowest_bit)
347 si->lowest_bit++; 347 si->lowest_bit++;
348 if (offset == si->highest_bit) 348 if (offset == si->highest_bit)
349 si->highest_bit--; 349 si->highest_bit--;
350 si->inuse_pages++; 350 si->inuse_pages++;
351 if (si->inuse_pages == si->pages) { 351 if (si->inuse_pages == si->pages) {
352 si->lowest_bit = si->max; 352 si->lowest_bit = si->max;
353 si->highest_bit = 0; 353 si->highest_bit = 0;
354 } 354 }
355 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ 355 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
356 si->swap_map[offset] = encode_swapmap(0, true); 356 si->swap_map[offset] = encode_swapmap(0, true);
357 else /* at suspend */ 357 else /* at suspend */
358 si->swap_map[offset] = encode_swapmap(1, false); 358 si->swap_map[offset] = encode_swapmap(1, false);
359 si->cluster_next = offset + 1; 359 si->cluster_next = offset + 1;
360 si->flags -= SWP_SCANNING; 360 si->flags -= SWP_SCANNING;
361 361
362 if (si->lowest_alloc) { 362 if (si->lowest_alloc) {
363 /* 363 /*
364 * Only set when SWP_DISCARDABLE, and there's a scan 364 * Only set when SWP_DISCARDABLE, and there's a scan
365 * for a free cluster in progress or just completed. 365 * for a free cluster in progress or just completed.
366 */ 366 */
367 if (found_free_cluster) { 367 if (found_free_cluster) {
368 /* 368 /*
369 * To optimize wear-levelling, discard the 369 * To optimize wear-levelling, discard the
370 * old data of the cluster, taking care not to 370 * old data of the cluster, taking care not to
371 * discard any of its pages that have already 371 * discard any of its pages that have already
372 * been allocated by racing tasks (offset has 372 * been allocated by racing tasks (offset has
373 * already stepped over any at the beginning). 373 * already stepped over any at the beginning).
374 */ 374 */
375 if (offset < si->highest_alloc && 375 if (offset < si->highest_alloc &&
376 si->lowest_alloc <= last_in_cluster) 376 si->lowest_alloc <= last_in_cluster)
377 last_in_cluster = si->lowest_alloc - 1; 377 last_in_cluster = si->lowest_alloc - 1;
378 si->flags |= SWP_DISCARDING; 378 si->flags |= SWP_DISCARDING;
379 spin_unlock(&swap_lock); 379 spin_unlock(&swap_lock);
380 380
381 if (offset < last_in_cluster) 381 if (offset < last_in_cluster)
382 discard_swap_cluster(si, offset, 382 discard_swap_cluster(si, offset,
383 last_in_cluster - offset + 1); 383 last_in_cluster - offset + 1);
384 384
385 spin_lock(&swap_lock); 385 spin_lock(&swap_lock);
386 si->lowest_alloc = 0; 386 si->lowest_alloc = 0;
387 si->flags &= ~SWP_DISCARDING; 387 si->flags &= ~SWP_DISCARDING;
388 388
389 smp_mb(); /* wake_up_bit advises this */ 389 smp_mb(); /* wake_up_bit advises this */
390 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); 390 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
391 391
392 } else if (si->flags & SWP_DISCARDING) { 392 } else if (si->flags & SWP_DISCARDING) {
393 /* 393 /*
394 * Delay using pages allocated by racing tasks 394 * Delay using pages allocated by racing tasks
395 * until the whole discard has been issued. We 395 * until the whole discard has been issued. We
396 * could defer that delay until swap_writepage, 396 * could defer that delay until swap_writepage,
397 * but it's easier to keep this self-contained. 397 * but it's easier to keep this self-contained.
398 */ 398 */
399 spin_unlock(&swap_lock); 399 spin_unlock(&swap_lock);
400 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), 400 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
401 wait_for_discard, TASK_UNINTERRUPTIBLE); 401 wait_for_discard, TASK_UNINTERRUPTIBLE);
402 spin_lock(&swap_lock); 402 spin_lock(&swap_lock);
403 } else { 403 } else {
404 /* 404 /*
405 * Note pages allocated by racing tasks while 405 * Note pages allocated by racing tasks while
406 * scan for a free cluster is in progress, so 406 * scan for a free cluster is in progress, so
407 * that its final discard can exclude them. 407 * that its final discard can exclude them.
408 */ 408 */
409 if (offset < si->lowest_alloc) 409 if (offset < si->lowest_alloc)
410 si->lowest_alloc = offset; 410 si->lowest_alloc = offset;
411 if (offset > si->highest_alloc) 411 if (offset > si->highest_alloc)
412 si->highest_alloc = offset; 412 si->highest_alloc = offset;
413 } 413 }
414 } 414 }
415 return offset; 415 return offset;
416 416
417 scan: 417 scan:
418 spin_unlock(&swap_lock); 418 spin_unlock(&swap_lock);
419 while (++offset <= si->highest_bit) { 419 while (++offset <= si->highest_bit) {
420 if (!si->swap_map[offset]) { 420 if (!si->swap_map[offset]) {
421 spin_lock(&swap_lock); 421 spin_lock(&swap_lock);
422 goto checks; 422 goto checks;
423 } 423 }
424 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 424 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
425 spin_lock(&swap_lock); 425 spin_lock(&swap_lock);
426 goto checks; 426 goto checks;
427 } 427 }
428 if (unlikely(--latency_ration < 0)) { 428 if (unlikely(--latency_ration < 0)) {
429 cond_resched(); 429 cond_resched();
430 latency_ration = LATENCY_LIMIT; 430 latency_ration = LATENCY_LIMIT;
431 } 431 }
432 } 432 }
433 offset = si->lowest_bit; 433 offset = si->lowest_bit;
434 while (++offset < scan_base) { 434 while (++offset < scan_base) {
435 if (!si->swap_map[offset]) { 435 if (!si->swap_map[offset]) {
436 spin_lock(&swap_lock); 436 spin_lock(&swap_lock);
437 goto checks; 437 goto checks;
438 } 438 }
439 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 439 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
440 spin_lock(&swap_lock); 440 spin_lock(&swap_lock);
441 goto checks; 441 goto checks;
442 } 442 }
443 if (unlikely(--latency_ration < 0)) { 443 if (unlikely(--latency_ration < 0)) {
444 cond_resched(); 444 cond_resched();
445 latency_ration = LATENCY_LIMIT; 445 latency_ration = LATENCY_LIMIT;
446 } 446 }
447 } 447 }
448 spin_lock(&swap_lock); 448 spin_lock(&swap_lock);
449 449
450 no_page: 450 no_page:
451 si->flags -= SWP_SCANNING; 451 si->flags -= SWP_SCANNING;
452 return 0; 452 return 0;
453 } 453 }
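The empty-cluster search in scan_swap_map() rests on one invariant: if the scan index catches up with last_in_cluster without hitting an in-use slot, then the preceding SWAPFILE_CLUSTER slots are all free. A userspace simulation of that invariant; the map contents are made up and the cluster size is shrunk for the demo:

    #include <stdio.h>

    #define SWAPFILE_CLUSTER 4          /* 256 in the kernel; shrunk here */
    #define MAP_SIZE 16

    int main(void)
    {
        unsigned char map[MAP_SIZE] = { 1, 0, 1, 0, 0, 0, 0, 1 };
        unsigned long offset = 0;
        unsigned long last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

        for (; last_in_cluster < MAP_SIZE; offset++) {
            if (map[offset])
                last_in_cluster = offset + SWAPFILE_CLUSTER;
            else if (offset == last_in_cluster) {
                /* prints: free cluster starts at 3 */
                printf("free cluster starts at %lu\n",
                       offset - (SWAPFILE_CLUSTER - 1));
                return 0;
            }
        }
        printf("no free cluster\n");
        return 0;
    }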
454 454
455 swp_entry_t get_swap_page(void) 455 swp_entry_t get_swap_page(void)
456 { 456 {
457 struct swap_info_struct *si; 457 struct swap_info_struct *si;
458 pgoff_t offset; 458 pgoff_t offset;
459 int type, next; 459 int type, next;
460 int wrapped = 0; 460 int wrapped = 0;
461 461
462 spin_lock(&swap_lock); 462 spin_lock(&swap_lock);
463 if (nr_swap_pages <= 0) 463 if (nr_swap_pages <= 0)
464 goto noswap; 464 goto noswap;
465 nr_swap_pages--; 465 nr_swap_pages--;
466 466
467 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 467 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
468 si = swap_info + type; 468 si = swap_info + type;
469 next = si->next; 469 next = si->next;
470 if (next < 0 || 470 if (next < 0 ||
471 (!wrapped && si->prio != swap_info[next].prio)) { 471 (!wrapped && si->prio != swap_info[next].prio)) {
472 next = swap_list.head; 472 next = swap_list.head;
473 wrapped++; 473 wrapped++;
474 } 474 }
475 475
476 if (!si->highest_bit) 476 if (!si->highest_bit)
477 continue; 477 continue;
478 if (!(si->flags & SWP_WRITEOK)) 478 if (!(si->flags & SWP_WRITEOK))
479 continue; 479 continue;
480 480
481 swap_list.next = next; 481 swap_list.next = next;
482 /* This is called for allocating a swap entry for the swap cache */ 482 /* This is called for allocating a swap entry for the swap cache */
483 offset = scan_swap_map(si, SWAP_CACHE); 483 offset = scan_swap_map(si, SWAP_CACHE);
484 if (offset) { 484 if (offset) {
485 spin_unlock(&swap_lock); 485 spin_unlock(&swap_lock);
486 return swp_entry(type, offset); 486 return swp_entry(type, offset);
487 } 487 }
488 next = swap_list.next; 488 next = swap_list.next;
489 } 489 }
490 490
491 nr_swap_pages++; 491 nr_swap_pages++;
492 noswap: 492 noswap:
493 spin_unlock(&swap_lock); 493 spin_unlock(&swap_lock);
494 return (swp_entry_t) {0}; 494 return (swp_entry_t) {0};
495 } 495 }
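get_swap_page() walks the swap areas in priority order, and the wrapped bookkeeping makes areas of equal priority take turns: swap_list.next is advanced before each scan, so consecutive allocations rotate through the same-priority band. A toy model of that round-robin; the array stands in for the priority-sorted list and all names are illustrative:

    #include <stdio.h>

    #define NAREAS 3

    int main(void)
    {
        int prio[NAREAS] = { 10, 10, 5 };   /* two equal-priority areas, one lower */
        int next = 0;

        for (int alloc = 0; alloc < 4; alloc++) {
            int type = next;

            /* advance within the same priority band, else wrap to its head */
            if (type + 1 < NAREAS && prio[type + 1] == prio[type])
                next = type + 1;
            else
                next = 0;
            printf("allocation %d served by area %d\n", alloc, type);
        }
        return 0;       /* areas 0 and 1 alternate: 0, 1, 0, 1 */
    }

Lower-priority areas are only reached once the higher-priority band stops yielding offsets, which the toy omits.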
496 496
497 /* The only caller of this function is now the suspend routine */ 497 /* The only caller of this function is now the suspend routine */
498 swp_entry_t get_swap_page_of_type(int type) 498 swp_entry_t get_swap_page_of_type(int type)
499 { 499 {
500 struct swap_info_struct *si; 500 struct swap_info_struct *si;
501 pgoff_t offset; 501 pgoff_t offset;
502 502
503 spin_lock(&swap_lock); 503 spin_lock(&swap_lock);
504 si = swap_info + type; 504 si = swap_info + type;
505 if (si->flags & SWP_WRITEOK) { 505 if (si->flags & SWP_WRITEOK) {
506 nr_swap_pages--; 506 nr_swap_pages--;
507 /* This is called for allocating a swap entry, not for the swap cache */ 507 /* This is called for allocating a swap entry, not for the swap cache */
508 offset = scan_swap_map(si, SWAP_MAP); 508 offset = scan_swap_map(si, SWAP_MAP);
509 if (offset) { 509 if (offset) {
510 spin_unlock(&swap_lock); 510 spin_unlock(&swap_lock);
511 return swp_entry(type, offset); 511 return swp_entry(type, offset);
512 } 512 }
513 nr_swap_pages++; 513 nr_swap_pages++;
514 } 514 }
515 spin_unlock(&swap_lock); 515 spin_unlock(&swap_lock);
516 return (swp_entry_t) {0}; 516 return (swp_entry_t) {0};
517 } 517 }
518 518
519 static struct swap_info_struct * swap_info_get(swp_entry_t entry) 519 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
520 { 520 {
521 struct swap_info_struct * p; 521 struct swap_info_struct * p;
522 unsigned long offset, type; 522 unsigned long offset, type;
523 523
524 if (!entry.val) 524 if (!entry.val)
525 goto out; 525 goto out;
526 type = swp_type(entry); 526 type = swp_type(entry);
527 if (type >= nr_swapfiles) 527 if (type >= nr_swapfiles)
528 goto bad_nofile; 528 goto bad_nofile;
529 p = & swap_info[type]; 529 p = & swap_info[type];
530 if (!(p->flags & SWP_USED)) 530 if (!(p->flags & SWP_USED))
531 goto bad_device; 531 goto bad_device;
532 offset = swp_offset(entry); 532 offset = swp_offset(entry);
533 if (offset >= p->max) 533 if (offset >= p->max)
534 goto bad_offset; 534 goto bad_offset;
535 if (!p->swap_map[offset]) 535 if (!p->swap_map[offset])
536 goto bad_free; 536 goto bad_free;
537 spin_lock(&swap_lock); 537 spin_lock(&swap_lock);
538 return p; 538 return p;
539 539
540 bad_free: 540 bad_free:
541 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 541 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
542 goto out; 542 goto out;
543 bad_offset: 543 bad_offset:
544 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 544 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
545 goto out; 545 goto out;
546 bad_device: 546 bad_device:
547 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 547 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
548 goto out; 548 goto out;
549 bad_nofile: 549 bad_nofile:
550 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 550 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
551 out: 551 out:
552 return NULL; 552 return NULL;
553 } 553 }
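swap_info_get() validates both halves of a swp_entry_t before use: swp_type() selects the swap area and swp_offset() the slot within it. A hedged sketch of such a packing; the bit split below is illustrative, the kernel's actual layout differs by architecture and config:

    typedef struct { unsigned long val; } demo_entry_t;

    #define DEMO_TYPE_SHIFT 24                  /* assumed split */

    static demo_entry_t demo_swp_entry(unsigned long type, unsigned long offset)
    {
        return (demo_entry_t){ (type << DEMO_TYPE_SHIFT) | offset };
    }

    static unsigned long demo_swp_type(demo_entry_t e)
    {
        return e.val >> DEMO_TYPE_SHIFT;
    }

    static unsigned long demo_swp_offset(demo_entry_t e)
    {
        return e.val & ((1UL << DEMO_TYPE_SHIFT) - 1);
    }

    int main(void)
    {
        demo_entry_t e = demo_swp_entry(2, 12345);

        return (demo_swp_type(e) == 2 && demo_swp_offset(e) == 12345) ? 0 : 1;
    }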
554 554
555 static int swap_entry_free(struct swap_info_struct *p, 555 static int swap_entry_free(struct swap_info_struct *p,
556 swp_entry_t ent, int cache) 556 swp_entry_t ent, int cache)
557 { 557 {
558 unsigned long offset = swp_offset(ent); 558 unsigned long offset = swp_offset(ent);
559 int count = swap_count(p->swap_map[offset]); 559 int count = swap_count(p->swap_map[offset]);
560 bool has_cache; 560 bool has_cache;
561 561
562 has_cache = swap_has_cache(p->swap_map[offset]); 562 has_cache = swap_has_cache(p->swap_map[offset]);
563 563
564 if (cache == SWAP_MAP) { /* dropping usage count of swap */ 564 if (cache == SWAP_MAP) { /* dropping usage count of swap */
565 if (count < SWAP_MAP_MAX) { 565 if (count < SWAP_MAP_MAX) {
566 count--; 566 count--;
567 p->swap_map[offset] = encode_swapmap(count, has_cache); 567 p->swap_map[offset] = encode_swapmap(count, has_cache);
568 } 568 }
569 } else { /* dropping swap cache flag */ 569 } else { /* dropping swap cache flag */
570 VM_BUG_ON(!has_cache); 570 VM_BUG_ON(!has_cache);
571 p->swap_map[offset] = encode_swapmap(count, false); 571 p->swap_map[offset] = encode_swapmap(count, false);
572 572
573 } 573 }
574 /* return code. */ 574 /* return code. */
575 count = p->swap_map[offset]; 575 count = p->swap_map[offset];
576 /* free if no reference */ 576 /* free if no reference */
577 if (!count) { 577 if (!count) {
578 if (offset < p->lowest_bit) 578 if (offset < p->lowest_bit)
579 p->lowest_bit = offset; 579 p->lowest_bit = offset;
580 if (offset > p->highest_bit) 580 if (offset > p->highest_bit)
581 p->highest_bit = offset; 581 p->highest_bit = offset;
582 if (p->prio > swap_info[swap_list.next].prio) 582 if (p->prio > swap_info[swap_list.next].prio)
583 swap_list.next = p - swap_info; 583 swap_list.next = p - swap_info;
584 nr_swap_pages++; 584 nr_swap_pages++;
585 p->inuse_pages--; 585 p->inuse_pages--;
586 } 586 }
587 if (!swap_count(count)) 587 if (!swap_count(count))
588 mem_cgroup_uncharge_swap(ent); 588 mem_cgroup_uncharge_swap(ent);
589 return count; 589 return count;
590 } 590 }
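swap_entry_free() manipulates the packed swap_map entries: swap_count() reads the reference count, swap_has_cache() the swap-cache flag, and encode_swapmap() rebuilds the pair. A hedged sketch of one plausible encoding, consistent with the SWAP_MAP_MAX == 0x7ffe limit discussed further down, but with the exact masks assumed:

    #define DEMO_HAS_CACHE  0x8000      /* assumed: top bit of the u16 */
    #define DEMO_COUNT_MASK 0x7fff

    static unsigned short demo_encode_swapmap(int count, int has_cache)
    {
        return (unsigned short)(count | (has_cache ? DEMO_HAS_CACHE : 0));
    }

    static int demo_swap_count(unsigned short ent)
    {
        return ent & DEMO_COUNT_MASK;
    }

    static int demo_swap_has_cache(unsigned short ent)
    {
        return !!(ent & DEMO_HAS_CACHE);
    }

    int main(void)
    {
        unsigned short ent = demo_encode_swapmap(3, 1);

        return (demo_swap_count(ent) == 3 && demo_swap_has_cache(ent)) ? 0 : 1;
    }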
591 591
592 /* 592 /*
593 * Caller has made sure that the swapdevice corresponding to entry 593 * Caller has made sure that the swapdevice corresponding to entry
594 * is still around or has not been recycled. 594 * is still around or has not been recycled.
595 */ 595 */
596 void swap_free(swp_entry_t entry) 596 void swap_free(swp_entry_t entry)
597 { 597 {
598 struct swap_info_struct * p; 598 struct swap_info_struct * p;
599 599
600 p = swap_info_get(entry); 600 p = swap_info_get(entry);
601 if (p) { 601 if (p) {
602 swap_entry_free(p, entry, SWAP_MAP); 602 swap_entry_free(p, entry, SWAP_MAP);
603 spin_unlock(&swap_lock); 603 spin_unlock(&swap_lock);
604 } 604 }
605 } 605 }
606 606
607 /* 607 /*
608 * Called after dropping swapcache to decrease the refcount of swap entries. 608 * Called after dropping swapcache to decrease the refcount of swap entries.
609 */ 609 */
610 void swapcache_free(swp_entry_t entry, struct page *page) 610 void swapcache_free(swp_entry_t entry, struct page *page)
611 { 611 {
612 struct swap_info_struct *p; 612 struct swap_info_struct *p;
613 int ret; 613 int ret;
614 614
615 p = swap_info_get(entry); 615 p = swap_info_get(entry);
616 if (p) { 616 if (p) {
617 ret = swap_entry_free(p, entry, SWAP_CACHE); 617 ret = swap_entry_free(p, entry, SWAP_CACHE);
618 if (page) { 618 if (page) {
619 bool swapout; 619 bool swapout;
620 if (ret) 620 if (ret)
621 swapout = true; /* the end of swap out */ 621 swapout = true; /* the end of swap out */
622 else 622 else
623 swapout = false; /* no more swap users! */ 623 swapout = false; /* no more swap users! */
624 mem_cgroup_uncharge_swapcache(page, entry, swapout); 624 mem_cgroup_uncharge_swapcache(page, entry, swapout);
625 } 625 }
626 spin_unlock(&swap_lock); 626 spin_unlock(&swap_lock);
627 } 627 }
628 return; 628 return;
629 } 629 }
630 630
631 /* 631 /*
632 * How many references to page are currently swapped out? 632 * How many references to page are currently swapped out?
633 */ 633 */
634 static inline int page_swapcount(struct page *page) 634 static inline int page_swapcount(struct page *page)
635 { 635 {
636 int count = 0; 636 int count = 0;
637 struct swap_info_struct *p; 637 struct swap_info_struct *p;
638 swp_entry_t entry; 638 swp_entry_t entry;
639 639
640 entry.val = page_private(page); 640 entry.val = page_private(page);
641 p = swap_info_get(entry); 641 p = swap_info_get(entry);
642 if (p) { 642 if (p) {
643 count = swap_count(p->swap_map[swp_offset(entry)]); 643 count = swap_count(p->swap_map[swp_offset(entry)]);
644 spin_unlock(&swap_lock); 644 spin_unlock(&swap_lock);
645 } 645 }
646 return count; 646 return count;
647 } 647 }
648 648
649 /* 649 /*
650 * We can write to an anon page without COW if there are no other references 650 * We can write to an anon page without COW if there are no other references
651 * to it. And as a side-effect, free up its swap: because the old content 651 * to it. And as a side-effect, free up its swap: because the old content
652 * on disk will never be read, and seeking back there to write new content 652 * on disk will never be read, and seeking back there to write new content
653 * later would only waste time away from clustering. 653 * later would only waste time away from clustering.
654 */ 654 */
655 int reuse_swap_page(struct page *page) 655 int reuse_swap_page(struct page *page)
656 { 656 {
657 int count; 657 int count;
658 658
659 VM_BUG_ON(!PageLocked(page)); 659 VM_BUG_ON(!PageLocked(page));
660 count = page_mapcount(page); 660 count = page_mapcount(page);
661 if (count <= 1 && PageSwapCache(page)) { 661 if (count <= 1 && PageSwapCache(page)) {
662 count += page_swapcount(page); 662 count += page_swapcount(page);
663 if (count == 1 && !PageWriteback(page)) { 663 if (count == 1 && !PageWriteback(page)) {
664 delete_from_swap_cache(page); 664 delete_from_swap_cache(page);
665 SetPageDirty(page); 665 SetPageDirty(page);
666 } 666 }
667 } 667 }
668 return count == 1; 668 return count == 1;
669 } 669 }
670 670
671 /* 671 /*
672 * If swap is getting full, or if there are no more mappings of this page, 672 * If swap is getting full, or if there are no more mappings of this page,
673 * then try_to_free_swap is called to free its swap space. 673 * then try_to_free_swap is called to free its swap space.
674 */ 674 */
675 int try_to_free_swap(struct page *page) 675 int try_to_free_swap(struct page *page)
676 { 676 {
677 VM_BUG_ON(!PageLocked(page)); 677 VM_BUG_ON(!PageLocked(page));
678 678
679 if (!PageSwapCache(page)) 679 if (!PageSwapCache(page))
680 return 0; 680 return 0;
681 if (PageWriteback(page)) 681 if (PageWriteback(page))
682 return 0; 682 return 0;
683 if (page_swapcount(page)) 683 if (page_swapcount(page))
684 return 0; 684 return 0;
685 685
686 delete_from_swap_cache(page); 686 delete_from_swap_cache(page);
687 SetPageDirty(page); 687 SetPageDirty(page);
688 return 1; 688 return 1;
689 } 689 }
690 690
691 /* 691 /*
692 * Free the swap entry like above, but also try to 692 * Free the swap entry like above, but also try to
693 * free the page cache entry if it is the last user. 693 * free the page cache entry if it is the last user.
694 */ 694 */
695 int free_swap_and_cache(swp_entry_t entry) 695 int free_swap_and_cache(swp_entry_t entry)
696 { 696 {
697 struct swap_info_struct *p; 697 struct swap_info_struct *p;
698 struct page *page = NULL; 698 struct page *page = NULL;
699 699
700 if (is_migration_entry(entry)) 700 if (is_migration_entry(entry))
701 return 1; 701 return 1;
702 702
703 p = swap_info_get(entry); 703 p = swap_info_get(entry);
704 if (p) { 704 if (p) {
705 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { 705 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
706 page = find_get_page(&swapper_space, entry.val); 706 page = find_get_page(&swapper_space, entry.val);
707 if (page && !trylock_page(page)) { 707 if (page && !trylock_page(page)) {
708 page_cache_release(page); 708 page_cache_release(page);
709 page = NULL; 709 page = NULL;
710 } 710 }
711 } 711 }
712 spin_unlock(&swap_lock); 712 spin_unlock(&swap_lock);
713 } 713 }
714 if (page) { 714 if (page) {
715 /* 715 /*
716 * Not mapped elsewhere, or swap space full? Free it! 716 * Not mapped elsewhere, or swap space full? Free it!
717 * Also recheck PageSwapCache now page is locked (above). 717 * Also recheck PageSwapCache now page is locked (above).
718 */ 718 */
719 if (PageSwapCache(page) && !PageWriteback(page) && 719 if (PageSwapCache(page) && !PageWriteback(page) &&
720 (!page_mapped(page) || vm_swap_full())) { 720 (!page_mapped(page) || vm_swap_full())) {
721 delete_from_swap_cache(page); 721 delete_from_swap_cache(page);
722 SetPageDirty(page); 722 SetPageDirty(page);
723 } 723 }
724 unlock_page(page); 724 unlock_page(page);
725 page_cache_release(page); 725 page_cache_release(page);
726 } 726 }
727 return p != NULL; 727 return p != NULL;
728 } 728 }
729 729
730 #ifdef CONFIG_HIBERNATION 730 #ifdef CONFIG_HIBERNATION
731 /* 731 /*
732 * Find the swap type that corresponds to the given device (if any). 732 * Find the swap type that corresponds to the given device (if any).
733 * 733 *
734 * @offset - number of the PAGE_SIZE-sized block of the device, starting 734 * @offset - number of the PAGE_SIZE-sized block of the device, starting
735 * from 0, in which the swap header is expected to be located. 735 * from 0, in which the swap header is expected to be located.
736 * 736 *
737 * This is needed for the suspend to disk (aka swsusp). 737 * This is needed for the suspend to disk (aka swsusp).
738 */ 738 */
739 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 739 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
740 { 740 {
741 struct block_device *bdev = NULL; 741 struct block_device *bdev = NULL;
742 int i; 742 int i;
743 743
744 if (device) 744 if (device)
745 bdev = bdget(device); 745 bdev = bdget(device);
746 746
747 spin_lock(&swap_lock); 747 spin_lock(&swap_lock);
748 for (i = 0; i < nr_swapfiles; i++) { 748 for (i = 0; i < nr_swapfiles; i++) {
749 struct swap_info_struct *sis = swap_info + i; 749 struct swap_info_struct *sis = swap_info + i;
750 750
751 if (!(sis->flags & SWP_WRITEOK)) 751 if (!(sis->flags & SWP_WRITEOK))
752 continue; 752 continue;
753 753
754 if (!bdev) { 754 if (!bdev) {
755 if (bdev_p) 755 if (bdev_p)
756 *bdev_p = bdget(sis->bdev->bd_dev); 756 *bdev_p = bdgrab(sis->bdev);
757 757
758 spin_unlock(&swap_lock); 758 spin_unlock(&swap_lock);
759 return i; 759 return i;
760 } 760 }
761 if (bdev == sis->bdev) { 761 if (bdev == sis->bdev) {
762 struct swap_extent *se; 762 struct swap_extent *se;
763 763
764 se = list_entry(sis->extent_list.next, 764 se = list_entry(sis->extent_list.next,
765 struct swap_extent, list); 765 struct swap_extent, list);
766 if (se->start_block == offset) { 766 if (se->start_block == offset) {
767 if (bdev_p) 767 if (bdev_p)
768 *bdev_p = bdget(sis->bdev->bd_dev); 768 *bdev_p = bdgrab(sis->bdev);
769 769
770 spin_unlock(&swap_lock); 770 spin_unlock(&swap_lock);
771 bdput(bdev); 771 bdput(bdev);
772 return i; 772 return i;
773 } 773 }
774 } 774 }
775 } 775 }
776 spin_unlock(&swap_lock); 776 spin_unlock(&swap_lock);
777 if (bdev) 777 if (bdev)
778 bdput(bdev); 778 bdput(bdev);
779 779
780 return -ENODEV; 780 return -ENODEV;
781 } 781 }
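The two changed lines above are where this patch lands: inside the spin_lock(&swap_lock) region, the extra reference to sis->bdev is now copied with bdgrab() instead of being looked up with bdget(). Copying an existing reference is a bare atomic increment and therefore safe in atomic context; a fresh lookup may block, which a spinlock holder must never do. A minimal sketch of the pattern, with made-up names that are not kernel APIs:

    #include <stdatomic.h>

    struct demo_obj {
        atomic_int count;
    };

    /* copy an existing reference: a plain atomic increment that can
     * never sleep, so it is safe while holding a spinlock */
    static struct demo_obj *demo_grab(struct demo_obj *obj)
    {
        atomic_fetch_add_explicit(&obj->count, 1, memory_order_relaxed);
        return obj;
    }

    int main(void)
    {
        struct demo_obj obj;

        atomic_init(&obj.count, 1);
        demo_grab(&obj);
        return atomic_load(&obj.count) == 2 ? 0 : 1;
    }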
782 782
783 /* 783 /*
784 * Return either the total number of swap pages of the given type, or the number 784 * Return either the total number of swap pages of the given type, or the number
785 * of free pages of that type (depending on @free) 785 * of free pages of that type (depending on @free)
786 * 786 *
787 * This is needed for software suspend 787 * This is needed for software suspend
788 */ 788 */
789 unsigned int count_swap_pages(int type, int free) 789 unsigned int count_swap_pages(int type, int free)
790 { 790 {
791 unsigned int n = 0; 791 unsigned int n = 0;
792 792
793 if (type < nr_swapfiles) { 793 if (type < nr_swapfiles) {
794 spin_lock(&swap_lock); 794 spin_lock(&swap_lock);
795 if (swap_info[type].flags & SWP_WRITEOK) { 795 if (swap_info[type].flags & SWP_WRITEOK) {
796 n = swap_info[type].pages; 796 n = swap_info[type].pages;
797 if (free) 797 if (free)
798 n -= swap_info[type].inuse_pages; 798 n -= swap_info[type].inuse_pages;
799 } 799 }
800 spin_unlock(&swap_lock); 800 spin_unlock(&swap_lock);
801 } 801 }
802 return n; 802 return n;
803 } 803 }
804 #endif 804 #endif
805 805
806 /* 806 /*
807 * No need to decide whether this PTE shares the swap entry with others, 807 * No need to decide whether this PTE shares the swap entry with others,
808 * just let do_wp_page work it out if a write is requested later - to 808 * just let do_wp_page work it out if a write is requested later - to
809 * force COW, vm_page_prot omits write permission from any private vma. 809 * force COW, vm_page_prot omits write permission from any private vma.
810 */ 810 */
811 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 811 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
812 unsigned long addr, swp_entry_t entry, struct page *page) 812 unsigned long addr, swp_entry_t entry, struct page *page)
813 { 813 {
814 struct mem_cgroup *ptr = NULL; 814 struct mem_cgroup *ptr = NULL;
815 spinlock_t *ptl; 815 spinlock_t *ptl;
816 pte_t *pte; 816 pte_t *pte;
817 int ret = 1; 817 int ret = 1;
818 818
819 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { 819 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
820 ret = -ENOMEM; 820 ret = -ENOMEM;
821 goto out_nolock; 821 goto out_nolock;
822 } 822 }
823 823
824 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 824 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
825 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 825 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
826 if (ret > 0) 826 if (ret > 0)
827 mem_cgroup_cancel_charge_swapin(ptr); 827 mem_cgroup_cancel_charge_swapin(ptr);
828 ret = 0; 828 ret = 0;
829 goto out; 829 goto out;
830 } 830 }
831 831
832 inc_mm_counter(vma->vm_mm, anon_rss); 832 inc_mm_counter(vma->vm_mm, anon_rss);
833 get_page(page); 833 get_page(page);
834 set_pte_at(vma->vm_mm, addr, pte, 834 set_pte_at(vma->vm_mm, addr, pte,
835 pte_mkold(mk_pte(page, vma->vm_page_prot))); 835 pte_mkold(mk_pte(page, vma->vm_page_prot)));
836 page_add_anon_rmap(page, vma, addr); 836 page_add_anon_rmap(page, vma, addr);
837 mem_cgroup_commit_charge_swapin(page, ptr); 837 mem_cgroup_commit_charge_swapin(page, ptr);
838 swap_free(entry); 838 swap_free(entry);
839 /* 839 /*
840 * Move the page to the active list so it is not 840 * Move the page to the active list so it is not
841 * immediately swapped out again after swapon. 841 * immediately swapped out again after swapon.
842 */ 842 */
843 activate_page(page); 843 activate_page(page);
844 out: 844 out:
845 pte_unmap_unlock(pte, ptl); 845 pte_unmap_unlock(pte, ptl);
846 out_nolock: 846 out_nolock:
847 return ret; 847 return ret;
848 } 848 }
849 849
850 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 850 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
851 unsigned long addr, unsigned long end, 851 unsigned long addr, unsigned long end,
852 swp_entry_t entry, struct page *page) 852 swp_entry_t entry, struct page *page)
853 { 853 {
854 pte_t swp_pte = swp_entry_to_pte(entry); 854 pte_t swp_pte = swp_entry_to_pte(entry);
855 pte_t *pte; 855 pte_t *pte;
856 int ret = 0; 856 int ret = 0;
857 857
858 /* 858 /*
859 * We don't actually need pte lock while scanning for swp_pte: since 859 * We don't actually need pte lock while scanning for swp_pte: since
860 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the 860 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
861 * page table while we're scanning; though it could get zapped, and on 861 * page table while we're scanning; though it could get zapped, and on
862 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 862 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
863 * of unmatched parts which look like swp_pte, so unuse_pte must 863 * of unmatched parts which look like swp_pte, so unuse_pte must
864 * recheck under pte lock. Scanning without pte lock lets it be 864 * recheck under pte lock. Scanning without pte lock lets it be
865 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 865 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
866 */ 866 */
867 pte = pte_offset_map(pmd, addr); 867 pte = pte_offset_map(pmd, addr);
868 do { 868 do {
869 /* 869 /*
870 * swapoff spends a _lot_ of time in this loop! 870 * swapoff spends a _lot_ of time in this loop!
871 * Test inline before calling unuse_pte. 871 * Test inline before calling unuse_pte.
872 */ 872 */
873 if (unlikely(pte_same(*pte, swp_pte))) { 873 if (unlikely(pte_same(*pte, swp_pte))) {
874 pte_unmap(pte); 874 pte_unmap(pte);
875 ret = unuse_pte(vma, pmd, addr, entry, page); 875 ret = unuse_pte(vma, pmd, addr, entry, page);
876 if (ret) 876 if (ret)
877 goto out; 877 goto out;
878 pte = pte_offset_map(pmd, addr); 878 pte = pte_offset_map(pmd, addr);
879 } 879 }
880 } while (pte++, addr += PAGE_SIZE, addr != end); 880 } while (pte++, addr += PAGE_SIZE, addr != end);
881 pte_unmap(pte - 1); 881 pte_unmap(pte - 1);
882 out: 882 out:
883 return ret; 883 return ret;
884 } 884 }
885 885
886 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 886 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
887 unsigned long addr, unsigned long end, 887 unsigned long addr, unsigned long end,
888 swp_entry_t entry, struct page *page) 888 swp_entry_t entry, struct page *page)
889 { 889 {
890 pmd_t *pmd; 890 pmd_t *pmd;
891 unsigned long next; 891 unsigned long next;
892 int ret; 892 int ret;
893 893
894 pmd = pmd_offset(pud, addr); 894 pmd = pmd_offset(pud, addr);
895 do { 895 do {
896 next = pmd_addr_end(addr, end); 896 next = pmd_addr_end(addr, end);
897 if (pmd_none_or_clear_bad(pmd)) 897 if (pmd_none_or_clear_bad(pmd))
898 continue; 898 continue;
899 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 899 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
900 if (ret) 900 if (ret)
901 return ret; 901 return ret;
902 } while (pmd++, addr = next, addr != end); 902 } while (pmd++, addr = next, addr != end);
903 return 0; 903 return 0;
904 } 904 }
905 905
906 static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 906 static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
907 unsigned long addr, unsigned long end, 907 unsigned long addr, unsigned long end,
908 swp_entry_t entry, struct page *page) 908 swp_entry_t entry, struct page *page)
909 { 909 {
910 pud_t *pud; 910 pud_t *pud;
911 unsigned long next; 911 unsigned long next;
912 int ret; 912 int ret;
913 913
914 pud = pud_offset(pgd, addr); 914 pud = pud_offset(pgd, addr);
915 do { 915 do {
916 next = pud_addr_end(addr, end); 916 next = pud_addr_end(addr, end);
917 if (pud_none_or_clear_bad(pud)) 917 if (pud_none_or_clear_bad(pud))
918 continue; 918 continue;
919 ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 919 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
920 if (ret) 920 if (ret)
921 return ret; 921 return ret;
922 } while (pud++, addr = next, addr != end); 922 } while (pud++, addr = next, addr != end);
923 return 0; 923 return 0;
924 } 924 }
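unuse_pmd_range() and unuse_pud_range() follow the standard page-table walk shape: iterate the entries covering [addr, end), clamp each entry's sub-range with the level's *_addr_end() helper, skip empty entries, and recurse one level down. A sketch of the clamping helper, assuming an x86-64-like 2 MiB span per pmd entry:

    typedef unsigned long demo_addr_t;

    #define DEMO_PMD_SIZE (1UL << 21)   /* assumed: 2 MiB per pmd entry */
    #define DEMO_PMD_MASK (~(DEMO_PMD_SIZE - 1))

    /* analogue of pmd_addr_end(): the entry's upper boundary, clamped
     * to end; the -1s keep the comparison safe if boundary wraps to 0 */
    static demo_addr_t demo_pmd_addr_end(demo_addr_t addr, demo_addr_t end)
    {
        demo_addr_t boundary = (addr + DEMO_PMD_SIZE) & DEMO_PMD_MASK;

        return boundary - 1 < end - 1 ? boundary : end;
    }

    int main(void)
    {
        /* 0x1f0000 rounds up to the 2 MiB boundary 0x200000, below end */
        return demo_pmd_addr_end(0x1f0000UL, 0x500000UL) == 0x200000UL ? 0 : 1;
    }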
925 925
926 static int unuse_vma(struct vm_area_struct *vma, 926 static int unuse_vma(struct vm_area_struct *vma,
927 swp_entry_t entry, struct page *page) 927 swp_entry_t entry, struct page *page)
928 { 928 {
929 pgd_t *pgd; 929 pgd_t *pgd;
930 unsigned long addr, end, next; 930 unsigned long addr, end, next;
931 int ret; 931 int ret;
932 932
933 if (page->mapping) { 933 if (page->mapping) {
934 addr = page_address_in_vma(page, vma); 934 addr = page_address_in_vma(page, vma);
935 if (addr == -EFAULT) 935 if (addr == -EFAULT)
936 return 0; 936 return 0;
937 else 937 else
938 end = addr + PAGE_SIZE; 938 end = addr + PAGE_SIZE;
939 } else { 939 } else {
940 addr = vma->vm_start; 940 addr = vma->vm_start;
941 end = vma->vm_end; 941 end = vma->vm_end;
942 } 942 }
943 943
944 pgd = pgd_offset(vma->vm_mm, addr); 944 pgd = pgd_offset(vma->vm_mm, addr);
945 do { 945 do {
946 next = pgd_addr_end(addr, end); 946 next = pgd_addr_end(addr, end);
947 if (pgd_none_or_clear_bad(pgd)) 947 if (pgd_none_or_clear_bad(pgd))
948 continue; 948 continue;
949 ret = unuse_pud_range(vma, pgd, addr, next, entry, page); 949 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
950 if (ret) 950 if (ret)
951 return ret; 951 return ret;
952 } while (pgd++, addr = next, addr != end); 952 } while (pgd++, addr = next, addr != end);
953 return 0; 953 return 0;
954 } 954 }
955 955
956 static int unuse_mm(struct mm_struct *mm, 956 static int unuse_mm(struct mm_struct *mm,
957 swp_entry_t entry, struct page *page) 957 swp_entry_t entry, struct page *page)
958 { 958 {
959 struct vm_area_struct *vma; 959 struct vm_area_struct *vma;
960 int ret = 0; 960 int ret = 0;
961 961
962 if (!down_read_trylock(&mm->mmap_sem)) { 962 if (!down_read_trylock(&mm->mmap_sem)) {
963 /* 963 /*
964 * Activate page so shrink_inactive_list is unlikely to unmap 964 * Activate page so shrink_inactive_list is unlikely to unmap
965 * its ptes while lock is dropped, so swapoff can make progress. 965 * its ptes while lock is dropped, so swapoff can make progress.
966 */ 966 */
967 activate_page(page); 967 activate_page(page);
968 unlock_page(page); 968 unlock_page(page);
969 down_read(&mm->mmap_sem); 969 down_read(&mm->mmap_sem);
970 lock_page(page); 970 lock_page(page);
971 } 971 }
972 for (vma = mm->mmap; vma; vma = vma->vm_next) { 972 for (vma = mm->mmap; vma; vma = vma->vm_next) {
973 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 973 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
974 break; 974 break;
975 } 975 }
976 up_read(&mm->mmap_sem); 976 up_read(&mm->mmap_sem);
977 return (ret < 0)? ret: 0; 977 return (ret < 0)? ret: 0;
978 } 978 }
979 979
980 /* 980 /*
981 * Scan swap_map from current position to next entry still in use. 981 * Scan swap_map from current position to next entry still in use.
982 * Recycle to start on reaching the end, returning 0 when empty. 982 * Recycle to start on reaching the end, returning 0 when empty.
983 */ 983 */
984 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 984 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
985 unsigned int prev) 985 unsigned int prev)
986 { 986 {
987 unsigned int max = si->max; 987 unsigned int max = si->max;
988 unsigned int i = prev; 988 unsigned int i = prev;
989 int count; 989 int count;
990 990
991 /* 991 /*
992 * No need for swap_lock here: we're just looking 992 * No need for swap_lock here: we're just looking
993 * for whether an entry is in use, not modifying it; false 993 * for whether an entry is in use, not modifying it; false
994 * hits are okay, and sys_swapoff() has already prevented new 994 * hits are okay, and sys_swapoff() has already prevented new
995 * allocations from this area (while holding swap_lock). 995 * allocations from this area (while holding swap_lock).
996 */ 996 */
997 for (;;) { 997 for (;;) {
998 if (++i >= max) { 998 if (++i >= max) {
999 if (!prev) { 999 if (!prev) {
1000 i = 0; 1000 i = 0;
1001 break; 1001 break;
1002 } 1002 }
1003 /* 1003 /*
1004 * No entries in use at top of swap_map, 1004 * No entries in use at top of swap_map,
1005 * loop back to start and recheck there. 1005 * loop back to start and recheck there.
1006 */ 1006 */
1007 max = prev + 1; 1007 max = prev + 1;
1008 prev = 0; 1008 prev = 0;
1009 i = 1; 1009 i = 1;
1010 } 1010 }
1011 count = si->swap_map[i]; 1011 count = si->swap_map[i];
1012 if (count && swap_count(count) != SWAP_MAP_BAD) 1012 if (count && swap_count(count) != SWAP_MAP_BAD)
1013 break; 1013 break;
1014 } 1014 }
1015 return i; 1015 return i;
1016 } 1016 }
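The wrap-around in find_next_to_unuse() restarts the scan at index 1 (slot 0 holds the swap header) and narrows max to prev + 1 so the low part of the map is rechecked exactly once; 0 is returned only when no in-use entry remains. A userspace rendition with a made-up map:

    #include <stdio.h>

    static unsigned int demo_find_next(const unsigned char *map,
                                       unsigned int max, unsigned int prev)
    {
        unsigned int i = prev;

        for (;;) {
            if (++i >= max) {
                if (!prev) {
                    i = 0;
                    break;
                }
                max = prev + 1;     /* recheck the low part once */
                prev = 0;
                i = 1;
            }
            if (map[i])     /* stands in for "in use, not SWAP_MAP_BAD" */
                break;
        }
        return i;
    }

    int main(void)
    {
        const unsigned char map[8] = { 0, 0, 1, 0, 0, 0, 0, 0 };

        printf("%u\n", demo_find_next(map, 8, 5));  /* wraps, prints 2 */
        return 0;
    }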
1017 1017
1018 /* 1018 /*
1019 * We completely avoid races by reading each swap page in advance, 1019 * We completely avoid races by reading each swap page in advance,
1020 * and then search for the process using it. All the necessary 1020 * and then search for the process using it. All the necessary
1021 * page table adjustments can then be made atomically. 1021 * page table adjustments can then be made atomically.
1022 */ 1022 */
1023 static int try_to_unuse(unsigned int type) 1023 static int try_to_unuse(unsigned int type)
1024 { 1024 {
1025 struct swap_info_struct * si = &swap_info[type]; 1025 struct swap_info_struct * si = &swap_info[type];
1026 struct mm_struct *start_mm; 1026 struct mm_struct *start_mm;
1027 unsigned short *swap_map; 1027 unsigned short *swap_map;
1028 unsigned short swcount; 1028 unsigned short swcount;
1029 struct page *page; 1029 struct page *page;
1030 swp_entry_t entry; 1030 swp_entry_t entry;
1031 unsigned int i = 0; 1031 unsigned int i = 0;
1032 int retval = 0; 1032 int retval = 0;
1033 int reset_overflow = 0; 1033 int reset_overflow = 0;
1034 int shmem; 1034 int shmem;
1035 1035
1036 /* 1036 /*
1037 * When searching mms for an entry, a good strategy is to 1037 * When searching mms for an entry, a good strategy is to
1038 * start at the first mm we freed the previous entry from 1038 * start at the first mm we freed the previous entry from
1039 * (though actually we don't notice whether we or coincidence 1039 * (though actually we don't notice whether we or coincidence
1040 * freed the entry). Initialize this start_mm with a hold. 1040 * freed the entry). Initialize this start_mm with a hold.
1041 * 1041 *
1042 * A simpler strategy would be to start at the last mm we 1042 * A simpler strategy would be to start at the last mm we
1043 * freed the previous entry from; but that would take less 1043 * freed the previous entry from; but that would take less
1044 * advantage of mmlist ordering, which clusters forked mms 1044 * advantage of mmlist ordering, which clusters forked mms
1045 * together, child after parent. If we race with dup_mmap(), we 1045 * together, child after parent. If we race with dup_mmap(), we
1046 * prefer to resolve parent before child, lest we miss entries 1046 * prefer to resolve parent before child, lest we miss entries
1047 * duplicated after we scanned child: using last mm would invert 1047 * duplicated after we scanned child: using last mm would invert
1048 * that. Though it's only a serious concern when an overflowed 1048 * that. Though it's only a serious concern when an overflowed
1049 * swap count is reset from SWAP_MAP_MAX, preventing a rescan. 1049 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
1050 */ 1050 */
1051 start_mm = &init_mm; 1051 start_mm = &init_mm;
1052 atomic_inc(&init_mm.mm_users); 1052 atomic_inc(&init_mm.mm_users);
1053 1053
1054 /* 1054 /*
1055 * Keep on scanning until all entries have gone. Usually, 1055 * Keep on scanning until all entries have gone. Usually,
1056 * one pass through swap_map is enough, but not necessarily: 1056 * one pass through swap_map is enough, but not necessarily:
1057 * there are races when an instance of an entry might be missed. 1057 * there are races when an instance of an entry might be missed.
1058 */ 1058 */
1059 while ((i = find_next_to_unuse(si, i)) != 0) { 1059 while ((i = find_next_to_unuse(si, i)) != 0) {
1060 if (signal_pending(current)) { 1060 if (signal_pending(current)) {
1061 retval = -EINTR; 1061 retval = -EINTR;
1062 break; 1062 break;
1063 } 1063 }
1064 1064
1065 /* 1065 /*
1066 * Get a page for the entry, using the existing swap 1066 * Get a page for the entry, using the existing swap
1067 * cache page if there is one. Otherwise, get a clean 1067 * cache page if there is one. Otherwise, get a clean
1068 * page and read the swap into it. 1068 * page and read the swap into it.
1069 */ 1069 */
1070 swap_map = &si->swap_map[i]; 1070 swap_map = &si->swap_map[i];
1071 entry = swp_entry(type, i); 1071 entry = swp_entry(type, i);
1072 page = read_swap_cache_async(entry, 1072 page = read_swap_cache_async(entry,
1073 GFP_HIGHUSER_MOVABLE, NULL, 0); 1073 GFP_HIGHUSER_MOVABLE, NULL, 0);
1074 if (!page) { 1074 if (!page) {
1075 /* 1075 /*
1076 * Either swap_duplicate() failed because entry 1076 * Either swap_duplicate() failed because entry
1077 * has been freed independently, and will not be 1077 * has been freed independently, and will not be
1078 * reused since sys_swapoff() already disabled 1078 * reused since sys_swapoff() already disabled
1079 * allocation from here, or alloc_page() failed. 1079 * allocation from here, or alloc_page() failed.
1080 */ 1080 */
1081 if (!*swap_map) 1081 if (!*swap_map)
1082 continue; 1082 continue;
1083 retval = -ENOMEM; 1083 retval = -ENOMEM;
1084 break; 1084 break;
1085 } 1085 }
1086 1086
1087 /* 1087 /*
1088 * Don't hold on to start_mm if it looks like exiting. 1088 * Don't hold on to start_mm if it looks like exiting.
1089 */ 1089 */
1090 if (atomic_read(&start_mm->mm_users) == 1) { 1090 if (atomic_read(&start_mm->mm_users) == 1) {
1091 mmput(start_mm); 1091 mmput(start_mm);
1092 start_mm = &init_mm; 1092 start_mm = &init_mm;
1093 atomic_inc(&init_mm.mm_users); 1093 atomic_inc(&init_mm.mm_users);
1094 } 1094 }
1095 1095
1096 /* 1096 /*
1097 * Wait for and lock page. When do_swap_page races with 1097 * Wait for and lock page. When do_swap_page races with
1098 * try_to_unuse, do_swap_page can handle the fault much 1098 * try_to_unuse, do_swap_page can handle the fault much
1099 * faster than try_to_unuse can locate the entry. This 1099 * faster than try_to_unuse can locate the entry. This
1100 * apparently redundant "wait_on_page_locked" lets try_to_unuse 1100 * apparently redundant "wait_on_page_locked" lets try_to_unuse
1101 * defer to do_swap_page in such a case - in some tests, 1101 * defer to do_swap_page in such a case - in some tests,
1102 * do_swap_page and try_to_unuse repeatedly compete. 1102 * do_swap_page and try_to_unuse repeatedly compete.
1103 */ 1103 */
1104 wait_on_page_locked(page); 1104 wait_on_page_locked(page);
1105 wait_on_page_writeback(page); 1105 wait_on_page_writeback(page);
1106 lock_page(page); 1106 lock_page(page);
1107 wait_on_page_writeback(page); 1107 wait_on_page_writeback(page);
1108 1108
1109 /* 1109 /*
1110 * Remove all references to entry. 1110 * Remove all references to entry.
1111 * Whenever we reach init_mm, there's no address space 1111 * Whenever we reach init_mm, there's no address space
1112 * to search, but use it as a reminder to search shmem. 1112 * to search, but use it as a reminder to search shmem.
1113 */ 1113 */
1114 shmem = 0; 1114 shmem = 0;
1115 swcount = *swap_map; 1115 swcount = *swap_map;
1116 if (swap_count(swcount)) { 1116 if (swap_count(swcount)) {
1117 if (start_mm == &init_mm) 1117 if (start_mm == &init_mm)
1118 shmem = shmem_unuse(entry, page); 1118 shmem = shmem_unuse(entry, page);
1119 else 1119 else
1120 retval = unuse_mm(start_mm, entry, page); 1120 retval = unuse_mm(start_mm, entry, page);
1121 } 1121 }
1122 if (swap_count(*swap_map)) { 1122 if (swap_count(*swap_map)) {
1123 int set_start_mm = (*swap_map >= swcount); 1123 int set_start_mm = (*swap_map >= swcount);
1124 struct list_head *p = &start_mm->mmlist; 1124 struct list_head *p = &start_mm->mmlist;
1125 struct mm_struct *new_start_mm = start_mm; 1125 struct mm_struct *new_start_mm = start_mm;
1126 struct mm_struct *prev_mm = start_mm; 1126 struct mm_struct *prev_mm = start_mm;
1127 struct mm_struct *mm; 1127 struct mm_struct *mm;
1128 1128
1129 atomic_inc(&new_start_mm->mm_users); 1129 atomic_inc(&new_start_mm->mm_users);
1130 atomic_inc(&prev_mm->mm_users); 1130 atomic_inc(&prev_mm->mm_users);
1131 spin_lock(&mmlist_lock); 1131 spin_lock(&mmlist_lock);
1132 while (swap_count(*swap_map) && !retval && !shmem && 1132 while (swap_count(*swap_map) && !retval && !shmem &&
1133 (p = p->next) != &start_mm->mmlist) { 1133 (p = p->next) != &start_mm->mmlist) {
1134 mm = list_entry(p, struct mm_struct, mmlist); 1134 mm = list_entry(p, struct mm_struct, mmlist);
1135 if (!atomic_inc_not_zero(&mm->mm_users)) 1135 if (!atomic_inc_not_zero(&mm->mm_users))
1136 continue; 1136 continue;
1137 spin_unlock(&mmlist_lock); 1137 spin_unlock(&mmlist_lock);
1138 mmput(prev_mm); 1138 mmput(prev_mm);
1139 prev_mm = mm; 1139 prev_mm = mm;
1140 1140
1141 cond_resched(); 1141 cond_resched();
1142 1142
1143 swcount = *swap_map; 1143 swcount = *swap_map;
1144 if (!swap_count(swcount)) /* any usage ? */ 1144 if (!swap_count(swcount)) /* any usage ? */
1145 ; 1145 ;
1146 else if (mm == &init_mm) { 1146 else if (mm == &init_mm) {
1147 set_start_mm = 1; 1147 set_start_mm = 1;
1148 shmem = shmem_unuse(entry, page); 1148 shmem = shmem_unuse(entry, page);
1149 } else 1149 } else
1150 retval = unuse_mm(mm, entry, page); 1150 retval = unuse_mm(mm, entry, page);
1151 1151
1152 if (set_start_mm && 1152 if (set_start_mm &&
1153 swap_count(*swap_map) < swcount) { 1153 swap_count(*swap_map) < swcount) {
1154 mmput(new_start_mm); 1154 mmput(new_start_mm);
1155 atomic_inc(&mm->mm_users); 1155 atomic_inc(&mm->mm_users);
1156 new_start_mm = mm; 1156 new_start_mm = mm;
1157 set_start_mm = 0; 1157 set_start_mm = 0;
1158 } 1158 }
1159 spin_lock(&mmlist_lock); 1159 spin_lock(&mmlist_lock);
1160 } 1160 }
1161 spin_unlock(&mmlist_lock); 1161 spin_unlock(&mmlist_lock);
1162 mmput(prev_mm); 1162 mmput(prev_mm);
1163 mmput(start_mm); 1163 mmput(start_mm);
1164 start_mm = new_start_mm; 1164 start_mm = new_start_mm;
1165 } 1165 }
1166 if (shmem) { 1166 if (shmem) {
1167 /* page has already been unlocked and released */ 1167 /* page has already been unlocked and released */
1168 if (shmem > 0) 1168 if (shmem > 0)
1169 continue; 1169 continue;
1170 retval = shmem; 1170 retval = shmem;
1171 break; 1171 break;
1172 } 1172 }
1173 if (retval) { 1173 if (retval) {
1174 unlock_page(page); 1174 unlock_page(page);
1175 page_cache_release(page); 1175 page_cache_release(page);
1176 break; 1176 break;
1177 } 1177 }
1178 1178
1179 /* 1179 /*
1180 * How could the swap count reach 0x7ffe? 1180 * How could the swap count reach 0x7ffe?
1181 * There's no way to repeat a swap page within an mm 1181 * There's no way to repeat a swap page within an mm
1182 * (except in shmem, where it's the shared object which takes 1182 * (except in shmem, where it's the shared object which takes
1183 * the reference count). 1183 * the reference count).
1184 * We believe SWAP_MAP_MAX cannot occur. (If it did, an unsigned 1184 * We believe SWAP_MAP_MAX cannot occur. (If it did, an unsigned
1185 * short would be too small....) 1185 * short would be too small....)
1186 * If that's wrong, then we should worry more about 1186 * If that's wrong, then we should worry more about
1187 * exit_mmap() and do_munmap() cases described above: 1187 * exit_mmap() and do_munmap() cases described above:
1188 * we might be resetting SWAP_MAP_MAX too early here. 1188 * we might be resetting SWAP_MAP_MAX too early here.
1189 * We know "Undead"s can happen, they're okay, so don't 1189 * We know "Undead"s can happen, they're okay, so don't
1190 * report them; but do report if we reset SWAP_MAP_MAX. 1190 * report them; but do report if we reset SWAP_MAP_MAX.
1191 */ 1191 */
		/* We might release the lock_page() in unuse_mm(). */
		if (!PageSwapCache(page) || page_private(page) != entry.val)
			goto retry;

		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
			spin_lock(&swap_lock);
			*swap_map = encode_swapmap(0, true);
			spin_unlock(&swap_lock);
			reset_overflow = 1;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 */
		if (swap_count(*swap_map) &&
		    PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * It is conceivable that a racing task removed this page from
		 * swap cache just before we acquired the page lock at the top,
		 * or while we dropped it in unuse_mm().  The page might even
		 * be back in swap cache on another swap area: that we must not
		 * delete, since it may not have been written out to swap yet.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);
		/*
		 * So that we could skip searching mms once the swap count
		 * went to 1, we did not mark any present ptes as dirty: we
		 * must mark the page dirty so shrink_page_list will
		 * preserve it.
		 */
		SetPageDirty(page);
retry:
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
	}

	mmput(start_mm);
	if (reset_overflow) {
		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
		swap_overflow = 0;
	}
	return retval;
}

/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
	struct list_head *p, *next;
	unsigned int i;

	for (i = 0; i < nr_swapfiles; i++)
		if (swap_info[i].inuse_pages)
			return;
	spin_lock(&mmlist_lock);
	list_for_each_safe(p, next, &init_mm.mmlist)
		list_del_init(p);
	spin_unlock(&mmlist_lock);
}

/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset `offset'.
 */
sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
{
	struct swap_extent *se = sis->curr_swap_extent;
	struct swap_extent *start_se = se;

	for ( ; ; ) {
		struct list_head *lh;

		if (se->start_page <= offset &&
				offset < (se->start_page + se->nr_pages)) {
			return se->start_block + (offset - se->start_page);
		}
		lh = se->list.next;
		if (lh == &sis->extent_list)
			lh = lh->next;
		se = list_entry(lh, struct swap_extent, list);
		sis->curr_swap_extent = se;
		BUG_ON(se == start_se);		/* It *must* be present */
	}
}
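
map_swap_page() above is the hot path for the extent cache described in the big comment further down: it scans a circular list of extents, remembering the last hit in curr_swap_extent. Below is a minimal userspace sketch of the same lookup over a plain array instead of the kernel's list_head ring; the names (struct extent, map_page, curr) are illustrative, not kernel API.

/*
 * Userspace model of map_swap_page(): translate a swap page offset to a
 * disk block by scanning a ring of extents, starting from a cached hint.
 * All names here are illustrative; this is not the kernel implementation.
 */
#include <assert.h>
#include <stdio.h>

struct extent {
	unsigned long start_page;	/* first page covered by this extent */
	unsigned long nr_pages;		/* number of contiguous pages */
	unsigned long start_block;	/* disk block backing start_page */
};

static struct extent extents[] = {
	{ .start_page = 0,  .nr_pages = 10, .start_block = 100 },
	{ .start_page = 10, .nr_pages = 4,  .start_block = 500 },
	{ .start_page = 14, .nr_pages = 20, .start_block = 700 },
};
static unsigned curr;			/* models sis->curr_swap_extent */

static unsigned long map_page(unsigned long offset)
{
	unsigned start = curr, i = curr;
	unsigned n = sizeof(extents) / sizeof(extents[0]);

	for (;;) {
		struct extent *se = &extents[i];

		if (se->start_page <= offset &&
		    offset < se->start_page + se->nr_pages) {
			curr = i;	/* cache the hit for the next lookup */
			return se->start_block + (offset - se->start_page);
		}
		i = (i + 1) % n;	/* wrap around, like the circular list */
		assert(i != start);	/* offset *must* be covered */
	}
}

int main(void)
{
	printf("page 12 -> block %lu\n", map_page(12));	/* 502 */
	printf("page 13 -> block %lu\n", map_page(13));	/* 503, cache hit */
	return 0;
}

Successive lookups usually land in the same extent, which is why caching the last hit makes the average iteration count so low.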

#ifdef CONFIG_HIBERNATION
/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 */
sector_t swapdev_block(int swap_type, pgoff_t offset)
{
	struct swap_info_struct *sis;

	if (swap_type >= nr_swapfiles)
		return 0;

	sis = swap_info + swap_type;
	return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
}
#endif /* CONFIG_HIBERNATION */

/*
 * Free all of a swapdev's extent information
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	while (!list_empty(&sis->extent_list)) {
		struct swap_extent *se;

		se = list_entry(sis->extent_list.next,
				struct swap_extent, list);
		list_del(&se->list);
		kfree(se);
	}
}

/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent list.  The extent list is kept sorted in page order.
 *
 * This function rather assumes that it is called in ascending page order.
 */
static int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct swap_extent *se;
	struct swap_extent *new_se;
	struct list_head *lh;

	lh = sis->extent_list.prev;	/* The highest page extent */
	if (lh != &sis->extent_list) {
		se = list_entry(lh, struct swap_extent, list);
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/*
	 * No merge.  Insert a new extent, preserving ordering.
	 */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	list_add_tail(&new_se->list, &sis->extent_list);
	return 1;
}
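
The merge test above fires only when the new range continues the last extent both in page space (the BUG_ON asserts the caller probes pages in ascending order with no holes) and in block space. A small userspace model of that rule, with illustrative names:

/*
 * Userspace model of add_swap_extent()'s merge rule: a new range is
 * folded into the last extent only when it continues it both in page
 * space (asserted) and in block space (checked).  Illustrative only.
 */
#include <assert.h>
#include <stdio.h>

struct extent {
	unsigned long start_page, nr_pages, start_block;
};

static struct extent list[16];
static unsigned nr;

static int add_extent(unsigned long start_page, unsigned long nr_pages,
		      unsigned long start_block)
{
	if (nr) {
		struct extent *se = &list[nr - 1];

		/* caller feeds pages in ascending order, with no holes */
		assert(se->start_page + se->nr_pages == start_page);
		if (se->start_block + se->nr_pages == start_block) {
			se->nr_pages += nr_pages;	/* merge */
			return 0;
		}
	}
	list[nr++] = (struct extent){ start_page, nr_pages, start_block };
	return 1;	/* one new extent added */
}

int main(void)
{
	int n = 0;

	n += add_extent(0, 1, 100);	/* new extent */
	n += add_extent(1, 1, 101);	/* contiguous on disk: merged */
	n += add_extent(2, 1, 400);	/* discontiguous: new extent */
	printf("%d extents, first covers %lu pages\n", n, list[0].nr_pages);
	/* prints: 2 extents, first covers 2 pages */
	return 0;
}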

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
 * which will scribble on the fs.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the list.  To avoid much list walking, we cache the previous
 * search location in `curr_swap_extent', and start new searches from there.
 * This is extremely effective.  The average number of iterations in
 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct inode *inode;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	inode = sis->swap_file->f_mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		goto done;
	}

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map all the blocks into the extent list.  This code doesn't try
	 * to be very smart.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/*
		 * It must be PAGE_SIZE aligned on-disk
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* exclude the header page */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/*
		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
		 */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* force Empty message */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
done:
	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
					struct swap_extent, list);
	goto out;
bad_bmap:
	printk(KERN_ERR "swapon: swapfile has holes\n");
	ret = -EINVAL;
out:
	return ret;
}
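
For an S_ISREG swapfile, the probe loop above accepts a page only when bmap() reports a PAGE_SIZE-aligned first block and the rest of the page's blocks follow it contiguously on disk. A toy model of that filter, where fake_bmap() stands in for bmap() and the block map is invented purely for illustration:

/*
 * Toy model of setup_swap_extents()'s probe loop for a regular file:
 * accept a page-sized run only if its first block is PAGE_SIZE aligned
 * and all blocks in the page are contiguous.  fake_bmap() stands in for
 * bmap(); everything here is illustrative.
 */
#include <stdio.h>

#define BLOCKS_PER_PAGE 4	/* e.g. 4K page, 1K filesystem blocks */

/* logical block -> physical block; 0 means a hole */
static unsigned long fake_bmap(unsigned long lblock)
{
	static const unsigned long map[] = {
		40, 41, 42, 43,		/* page 0: aligned, contiguous: ok */
		81, 82, 83, 84,		/* page 1: 81 not aligned: skip   */
		120, 121, 99, 123,	/* page 2: discontiguous: skip    */
	};
	return lblock < sizeof(map) / sizeof(map[0]) ? map[lblock] : 0;
}

int main(void)
{
	unsigned long probe = 0, last = 12, page_no = 0;

	while (probe + BLOCKS_PER_PAGE <= last) {
		unsigned long first = fake_bmap(probe), i;

		if (!first)
			break;			/* hole: bad swap file */
		if (first & (BLOCKS_PER_PAGE - 1)) {
			probe++;		/* not aligned: reprobe */
			continue;
		}
		for (i = 1; i < BLOCKS_PER_PAGE; i++)
			if (fake_bmap(probe + i) != first + i)
				break;		/* discontiguity */
		if (i < BLOCKS_PER_PAGE) {
			probe++;
			continue;
		}
		printf("swap page %lu -> blocks %lu..%lu\n",
		       page_no++, first, first + BLOCKS_PER_PAGE - 1);
		probe += BLOCKS_PER_PAGE;
	}
	return 0;
}

Only the first page of this fake file qualifies; the misaligned and discontiguous ones are tossed out, just as the comment above the function describes.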

SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned short *swap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	char *pathname;
	int i, type, prev;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	pathname = getname(specialfile);
	err = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;

	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
	putname(pathname);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	mapping = victim->f_mapping;
	prev = -1;
	spin_lock(&swap_lock);
	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
		p = swap_info + type;
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping)
				break;
		}
		prev = type;
	}
	if (type < 0) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (!security_vm_enough_memory(p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (prev < 0) {
		swap_list.head = p->next;
	} else {
		swap_info[prev].next = p->next;
	}
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
	if (p->prio < 0) {
		for (i = p->next; i >= 0; i = swap_info[i].next)
			swap_info[i].prio = p->prio--;
		least_priority++;
	}
	nr_swap_pages -= p->pages;
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&swap_lock);

	current->flags |= PF_SWAPOFF;
	err = try_to_unuse(type);
	current->flags &= ~PF_SWAPOFF;

	if (err) {
		/* re-insert swap space back into swap_list */
		spin_lock(&swap_lock);
		if (p->prio < 0)
			p->prio = --least_priority;
		prev = -1;
		for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
			if (p->prio >= swap_info[i].prio)
				break;
			prev = i;
		}
		p->next = i;
		if (prev < 0)
			swap_list.head = swap_list.next = p - swap_info;
		else
			swap_info[prev].next = p - swap_info;
		nr_swap_pages += p->pages;
		total_swap_pages += p->pages;
		p->flags |= SWP_WRITEOK;
		spin_unlock(&swap_lock);
		goto out_dput;
	}

	/* wait for any unplug function to finish */
	down_write(&swap_unplug_sem);
	up_write(&swap_unplug_sem);

	destroy_swap_extents(p);
	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	drain_mmlist();

	/* wait for anyone still in scan_swap_map */
	p->highest_bit = 0;		/* cuts scans short */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
	}

	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	mutex_unlock(&swapon_mutex);
	vfree(swap_map);
	/* Destroy swap account information */
	swap_cgroup_swapoff(type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		struct block_device *bdev = I_BDEV(inode);
		set_blocksize(bdev, p->old_block_size);
		bd_release(bdev);
	} else {
		mutex_lock(&inode->i_mutex);
		inode->i_flags &= ~S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	filp_close(swap_file, NULL);
	err = 0;

out_dput:
	filp_close(victim, NULL);
out:
	return err;
}
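
Both the error path above and sys_swapon() further down re-insert an area into swap_list with the same walk: areas are chained by array index in ->next, kept in descending priority order. A compact model of that insertion, using plain arrays in place of the kernel structures (all names illustrative):

/*
 * Model of the swap_list insertion walk used by both swapoff's error
 * path and swapon: entries are linked by array index in 'next', kept in
 * descending priority order.  Illustrative, not the kernel structures.
 */
#include <stdio.h>

static int prio[8] = { 5, 3, 1 };	/* models swap_info[i].prio */
static int next[8] = { 1, 2, -1 };	/* models swap_info[i].next */
static int head = 0;			/* models swap_list.head */

static void insert_by_prio(int type)
{
	int i, prev = -1;

	for (i = head; i >= 0; i = next[i]) {
		if (prio[type] >= prio[i])
			break;
		prev = i;
	}
	next[type] = i;
	if (prev < 0)
		head = type;
	else
		next[prev] = type;
}

int main(void)
{
	int i;

	prio[3] = 2;		/* re-add entry 3 with priority 2 */
	insert_by_prio(3);
	for (i = head; i >= 0; i = next[i])
		printf("type %d prio %d\n", i, prio[i]);
	/* order: 0(5), 1(3), 3(2), 2(1) */
	return 0;
}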

#ifdef CONFIG_PROC_FS
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *ptr = swap_info;
	int i;
	loff_t l = *pos;

	mutex_lock(&swapon_mutex);

	if (!l)
		return SEQ_START_TOKEN;

	for (i = 0; i < nr_swapfiles; i++, ptr++) {
		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
			continue;
		if (!--l)
			return ptr;
	}

	return NULL;
}

static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *ptr;
	struct swap_info_struct *endptr = swap_info + nr_swapfiles;

	if (v == SEQ_START_TOKEN)
		ptr = swap_info;
	else {
		ptr = v;
		ptr++;
	}

	for (; ptr < endptr; ptr++) {
		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
			continue;
		++*pos;
		return ptr;
	}

	return NULL;
}

static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}

static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *ptr = v;
	struct file *file;
	int len;

	if (ptr == SEQ_START_TOKEN) {
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
		return 0;
	}

	file = ptr->swap_file;
	len = seq_path(swap, &file->f_path, " \t\n\\");
	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
			len < 40 ? 40 - len : 1, " ",
			S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
				"partition" : "file\t",
			ptr->pages << (PAGE_SHIFT - 10),
			ptr->inuse_pages << (PAGE_SHIFT - 10),
			ptr->prio);
	return 0;
}

static const struct seq_operations swaps_op = {
	.start = swap_start,
	.next = swap_next,
	.stop = swap_stop,
	.show = swap_show
};

static int swaps_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &swaps_op);
}

static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */
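
For reference, swap_show()'s format string (path padded to 40 columns, sizes in kilobytes via pages << (PAGE_SHIFT - 10)) yields /proc/swaps output of the following shape; the paths, sizes and priorities below are made up purely for illustration:

Filename                                Type            Size    Used    Priority
/dev/sda2                               partition       2097144 53044   -1
/swapfile                               file            524280  0       -2

Note the trailing tab in "file\t": it pads the shorter type name so the remaining columns stay aligned with "partition".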

#ifdef MAX_SWAPFILES_CHECK
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
#endif

/*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
 * The swapon system call
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *p;
	char *name = NULL;
	struct block_device *bdev = NULL;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	unsigned int type;
	int i, prev;
	int error;
	union swap_header *swap_header = NULL;
	unsigned int nr_good_pages = 0;
	int nr_extents = 0;
	sector_t span;
	unsigned long maxpages = 1;
	unsigned long swapfilepages;
	unsigned short *swap_map = NULL;
	struct page *page = NULL;
	struct inode *inode = NULL;
	int did_down = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	spin_lock(&swap_lock);
	p = swap_info;
	for (type = 0; type < nr_swapfiles; type++, p++)
		if (!(p->flags & SWP_USED))
			break;
	error = -EPERM;
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		goto out;
	}
	if (type >= nr_swapfiles)
		nr_swapfiles = type+1;
	memset(p, 0, sizeof(*p));
	INIT_LIST_HEAD(&p->extent_list);
	p->flags = SWP_USED;
	p->next = -1;
	spin_unlock(&swap_lock);
	name = getname(specialfile);
	error = PTR_ERR(name);
	if (IS_ERR(name)) {
		name = NULL;
		goto bad_swap_2;
	}
	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
	error = PTR_ERR(swap_file);
	if (IS_ERR(swap_file)) {
		swap_file = NULL;
		goto bad_swap_2;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	inode = mapping->host;

	error = -EBUSY;
	for (i = 0; i < nr_swapfiles; i++) {
		struct swap_info_struct *q = &swap_info[i];

		if (i == type || !q->swap_file)
			continue;
		if (mapping == q->swap_file->f_mapping)
			goto bad_swap;
	}

	error = -EINVAL;
	if (S_ISBLK(inode->i_mode)) {
		bdev = I_BDEV(inode);
		error = bd_claim(bdev, sys_swapon);
		if (error < 0) {
			bdev = NULL;
			error = -EINVAL;
			goto bad_swap;
		}
		p->old_block_size = block_size(bdev);
		error = set_blocksize(bdev, PAGE_SIZE);
		if (error < 0)
			goto bad_swap;
		p->bdev = bdev;
	} else if (S_ISREG(inode->i_mode)) {
		p->bdev = inode->i_sb->s_bdev;
		mutex_lock(&inode->i_mutex);
		did_down = 1;
		if (IS_SWAPFILE(inode)) {
			error = -EBUSY;
			goto bad_swap;
		}
	} else {
		goto bad_swap;
	}

	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;

	/*
	 * Read the swap header.
	 */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap;
	}
	page = read_mapping_page(mapping, 0, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	swap_header = kmap(page);

	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
		printk(KERN_ERR "Unable to find swap-space signature\n");
		error = -EINVAL;
		goto bad_swap;
	}

	/* swap partition endianness hack... */
	if (swab32(swap_header->info.version) == 1) {
		swab32s(&swap_header->info.version);
		swab32s(&swap_header->info.last_page);
		swab32s(&swap_header->info.nr_badpages);
		for (i = 0; i < swap_header->info.nr_badpages; i++)
			swab32s(&swap_header->info.badpages[i]);
	}
	/* Check the swap header's sub-version */
	if (swap_header->info.version != 1) {
		printk(KERN_WARNING
		       "Unable to handle swap header version %d\n",
		       swap_header->info.version);
		error = -EINVAL;
		goto bad_swap;
	}

	p->lowest_bit = 1;
	p->cluster_next = 1;

	/*
	 * Find out how many pages are allowed for a single swap
	 * device.  There are two limiting factors: 1) the number of
	 * bits for the swap offset in the swp_entry_t type and
	 * 2) the number of bits in a swap pte as defined by
	 * the different architectures.  In order to find the
	 * largest possible bit mask, a swap entry with swap type 0
	 * and swap offset ~0UL is created, encoded to a swap pte,
	 * decoded to a swp_entry_t again, and finally the swap
	 * offset is extracted.  This will mask all the bits from
	 * the initial ~0UL mask that can't be encoded in either
	 * the swp_entry_t or the architecture definition of a
	 * swap pte.
	 */
	maxpages = swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
	if (maxpages > swap_header->info.last_page)
		maxpages = swap_header->info.last_page;
	p->highest_bit = maxpages - 1;

	error = -EINVAL;
	if (!maxpages)
		goto bad_swap;
	if (swapfilepages && maxpages > swapfilepages) {
		printk(KERN_WARNING
		       "Swap area shorter than signature indicates\n");
		goto bad_swap;
	}
	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
		goto bad_swap;
	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
		goto bad_swap;

	/* OK, set up the swap map and apply the bad block list */
	swap_map = vmalloc(maxpages * sizeof(short));
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}

	memset(swap_map, 0, maxpages * sizeof(short));
	for (i = 0; i < swap_header->info.nr_badpages; i++) {
		int page_nr = swap_header->info.badpages[i];
		if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
			error = -EINVAL;
			goto bad_swap;
		}
		swap_map[page_nr] = SWAP_MAP_BAD;
	}

	error = swap_cgroup_swapon(type, maxpages);
	if (error)
		goto bad_swap;

	nr_good_pages = swap_header->info.last_page -
			swap_header->info.nr_badpages -
			1 /* header page */;

	if (nr_good_pages) {
		swap_map[0] = SWAP_MAP_BAD;
		p->max = maxpages;
		p->pages = nr_good_pages;
		nr_extents = setup_swap_extents(p, &span);
		if (nr_extents < 0) {
			error = nr_extents;
			goto bad_swap;
		}
		nr_good_pages = p->pages;
	}
	if (!nr_good_pages) {
		printk(KERN_WARNING "Empty swap-file\n");
		error = -EINVAL;
		goto bad_swap;
	}

	if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
		p->flags |= SWP_SOLIDSTATE;
		p->cluster_next = 1 + (random32() % p->highest_bit);
	}
	if (discard_swap(p) == 0)
		p->flags |= SWP_DISCARDABLE;

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	if (swap_flags & SWAP_FLAG_PREFER)
		p->prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	else
		p->prio = --least_priority;
	p->swap_map = swap_map;
	p->flags |= SWP_WRITEOK;
	nr_swap_pages += nr_good_pages;
	total_swap_pages += nr_good_pages;

	printk(KERN_INFO "Adding %uk swap on %s.  "
			"Priority:%d extents:%d across:%lluk %s%s\n",
		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "");

	/* insert swap space into swap_list: */
	prev = -1;
	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
		if (p->prio >= swap_info[i].prio) {
			break;
		}
		prev = i;
	}
	p->next = i;
	if (prev < 0) {
		swap_list.head = swap_list.next = p - swap_info;
	} else {
		swap_info[prev].next = p - swap_info;
	}
	spin_unlock(&swap_lock);
	mutex_unlock(&swapon_mutex);
	error = 0;
	goto out;
bad_swap:
	if (bdev) {
		set_blocksize(bdev, p->old_block_size);
		bd_release(bdev);
	}
	destroy_swap_extents(p);
	swap_cgroup_swapoff(type);
bad_swap_2:
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		page_cache_release(page);
	}
	if (name)
		putname(name);
	if (did_down) {
		if (!error)
			inode->i_flags |= S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	return error;
}
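
The maxpages computation near the top of sys_swapon() relies on the encode/decode round trip silently dropping any offset bits the architecture's swap pte cannot store. A toy model of the same trick, with a made-up 24-bit offset field (no real architecture implied):

/*
 * Toy model of the maxpages trick in sys_swapon(): encode a swap entry
 * with offset ~0UL into a "pte", decode it back, and whatever offset
 * bits survive give the largest representable offset.  The 24-bit
 * offset field here is a made-up width, not any real architecture.
 */
#include <stdio.h>

#define OFFSET_BITS	24
#define OFFSET_MASK	((1UL << OFFSET_BITS) - 1)

static unsigned long swp_entry_to_pte_model(unsigned long offset)
{
	return offset & OFFSET_MASK;	/* high bits can't be encoded */
}

static unsigned long pte_to_swp_offset_model(unsigned long pte)
{
	return pte & OFFSET_MASK;
}

int main(void)
{
	unsigned long maxpages =
		pte_to_swp_offset_model(swp_entry_to_pte_model(~0UL)) - 1;

	printf("maxpages = %lu\n", maxpages);	/* 2^24 - 2 = 16777214 */
	return 0;
}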

void si_swapinfo(struct sysinfo *val)
{
	unsigned int i;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	for (i = 0; i < nr_swapfiles; i++) {
		if (!(swap_info[i].flags & SWP_USED) ||
		     (swap_info[i].flags & SWP_WRITEOK))
			continue;
		nr_to_be_unused += swap_info[i].inuse_pages;
	}
	val->freeswap = nr_swap_pages + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
 * "permanent", but will be reclaimed by the next swapoff.
 * Returns 0 on success, or an error code as follows:
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is a migration entry -> EINVAL
 * - a swap-cache reference is requested but there is already one -> EEXIST
 * - a swap-cache reference is requested but the entry is not used -> ENOENT
 */
static int __swap_duplicate(swp_entry_t entry, bool cache)
{
	struct swap_info_struct *p;
	unsigned long offset, type;
	int result = -EINVAL;
	int count;
	bool has_cache;

	if (is_migration_entry(entry))
		return -EINVAL;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = type + swap_info;
	offset = swp_offset(entry);

	spin_lock(&swap_lock);

	if (unlikely(offset >= p->max))
		goto unlock_out;

	count = swap_count(p->swap_map[offset]);
	has_cache = swap_has_cache(p->swap_map[offset]);

	if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count) {
			p->swap_map[offset] = encode_swapmap(count, true);
			result = 0;
		} else if (has_cache) /* someone added cache */
			result = -EEXIST;
		else if (!count) /* no users */
			result = -ENOENT;

	} else if (count || has_cache) {
		if (count < SWAP_MAP_MAX - 1) {
			p->swap_map[offset] = encode_swapmap(count + 1,
							     has_cache);
			result = 0;
		} else if (count <= SWAP_MAP_MAX) {
			if (swap_overflow++ < 5)
				printk(KERN_WARNING
				       "swap_dup: swap entry overflow\n");
			p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
							     has_cache);
			result = 0;
		}
	} else
		result = -ENOENT; /* unused swap entry */
unlock_out:
	spin_unlock(&swap_lock);
out:
	return result;

bad_file:
	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}
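
__swap_duplicate() manipulates swap_map entries through swap_count(), swap_has_cache() and encode_swapmap(): each unsigned short packs a use count plus a has-cache flag. A userspace model of that packing; the top-bit layout here is assumed for illustration and need not match the kernel's actual encoding:

/*
 * Model of the swap_map entry encoding used by __swap_duplicate(): an
 * unsigned short holding a use count plus a "has swap cache" flag.
 * The top-bit layout below is assumed, not the kernel's real encoding.
 */
#include <assert.h>
#include <stdio.h>

#define HAS_CACHE	0x8000u
#define COUNT_MASK	0x7fffu

static unsigned short encode(unsigned short count, int has_cache)
{
	return count | (has_cache ? HAS_CACHE : 0);
}

static unsigned short count_of(unsigned short ent) { return ent & COUNT_MASK; }
static int has_cache(unsigned short ent) { return !!(ent & HAS_CACHE); }

int main(void)
{
	unsigned short ent = encode(1, 0);	/* one user, no swap cache */

	/* swapcache_prepare-style: set the flag, once only */
	assert(!has_cache(ent) && count_of(ent));
	ent = encode(count_of(ent), 1);

	/* swap_duplicate-style: bump the count, keep the flag */
	ent = encode(count_of(ent) + 1, has_cache(ent));

	printf("count=%u has_cache=%d\n", (unsigned)count_of(ent),
	       has_cache(ent));
	/* prints: count=2 has_cache=1 */
	return 0;
}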
/*
 * Increase the reference count of a swap entry by 1.
 */
void swap_duplicate(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP);
}

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * Returns 0 on success; otherwise an error code from __swap_duplicate(),
 * where -EEXIST means a swap cache already exists for the entry.
 * Note: the return codes differ from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_CACHE);
}

struct swap_info_struct *
get_swap_info_struct(unsigned type)
{
	return &swap_info[type];
}

/*
 * swap_lock prevents swap_map being freed.  Don't grab an extra
 * reference on the swaphandle, it doesn't matter if it becomes unused.
 */
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
	struct swap_info_struct *si;
	int our_page_cluster = page_cluster;
	pgoff_t target, toff;
	pgoff_t base, end;
	int nr_pages = 0;

	if (!our_page_cluster)	/* no readahead */
		return 0;

	si = &swap_info[swp_type(entry)];
	target = swp_offset(entry);
	base = (target >> our_page_cluster) << our_page_cluster;
	end = base + (1 << our_page_cluster);
	if (!base)		/* first page is swap header */
		base++;

	spin_lock(&swap_lock);
	if (end > si->max)	/* don't go beyond end of map */
		end = si->max;

	/* Count contiguous allocated slots above our target */
	for (toff = target; ++toff < end; nr_pages++) {
		/* Don't read in free or bad pages */
		if (!si->swap_map[toff])
			break;
		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
			break;
	}
	/* Count contiguous allocated slots below our target */
	for (toff = target; --toff >= base; nr_pages++) {
		/* Don't read in free or bad pages */
		if (!si->swap_map[toff])
			break;
		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
			break;
	}
	spin_unlock(&swap_lock);

	/*
	 * Indicate starting offset, and return number of pages to get:
	 * if only 1, say 0, since there's then no readahead to be done.
	 */
	*offset = ++toff;
	return nr_pages ? ++nr_pages : 0;
}
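valid_swaphandles() computes a 2^page_cluster window aligned around the target and then trims it to the contiguous run of in-use slots. A self-contained model of that window arithmetic, with invented swap_map values (0 = free, 1 = in use, 0xff = bad, stand-ins for the kernel's encodings):

/*
 * Model of valid_swaphandles()'s readahead window: align the target
 * down to a 2^page_cluster boundary, then extend over contiguous
 * in-use slots on both sides.
 */
#include <stdio.h>

#define CLUSTER 3			/* window of 2^3 = 8 slots */

static unsigned char swap_map[32];

int main(void)
{
	unsigned long target = 13, base, end, toff;
	int nr_pages = 0;

	for (toff = 10; toff <= 14; toff++)
		swap_map[toff] = 1;	/* slots 10..14 in use */

	base = (target >> CLUSTER) << CLUSTER;	/* 8 */
	end = base + (1 << CLUSTER);		/* 16 */
	if (!base)
		base++;			/* slot 0 is the swap header */

	for (toff = target; ++toff < end; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == 0xff)
			break;
	for (toff = target; --toff >= base; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == 0xff)
			break;

	printf("start=%lu pages=%d\n", ++toff, nr_pages ? ++nr_pages : 0);
	/* window trims to slots 10..14: start=10 pages=5 */
	return 0;
}

The final ++nr_pages accounts for the target slot itself, and the "if only 1, say 0" rule means a lone target produces no readahead at all.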