Commit dddac6a7b445de95515f64fdf82fe5dc36c02f26
Committed by Rafael J. Wysocki
1 parent ec79be2687
Exists in master and in 7 other branches
PM / Hibernate: Replace bdget call with simple atomic_inc of i_count
Create bdgrab(). This function copies an existing reference to a block_device. It is safe to call from any context.

Hibernation code wishes to copy a reference to the active swap device. Right now it calls bdget() under a spinlock, but this is wrong because bdget() can sleep. It doesn't need a full bdget() because we already hold a reference to active swap devices (and the spinlock protects against swapoff).

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=13827

Signed-off-by: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
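The changed caller is not shown in this truncated diff; the following is a sketch only, illustrating the pattern the message describes (the lock and variable names are illustrative, not quoted from the patch):

	/* Old pattern: bdget() may sleep (it can allocate a new inode via
	 * iget5_locked()), so calling it with a spinlock held is a
	 * sleep-in-atomic bug. */
	spin_lock(&swap_lock);
	bdev = bdget(sis->bdev->bd_dev);	/* WRONG: may sleep */
	spin_unlock(&swap_lock);

	/* New pattern: an active swap device already holds a reference, and
	 * the spinlock keeps swapoff from dropping it, so copying that
	 * reference is a single atomic_inc() and never sleeps. */
	spin_lock(&swap_lock);
	bdev = bdgrab(sis->bdev);		/* safe in atomic context */
	spin_unlock(&swap_lock);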
Showing 3 changed files with 13 additions and 2 deletions
fs/block_dev.c
1 | /* | 1 | /* |
2 | * linux/fs/block_dev.c | 2 | * linux/fs/block_dev.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | 5 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/init.h> | 8 | #include <linux/init.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/fcntl.h> | 10 | #include <linux/fcntl.h> |
11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
12 | #include <linux/kmod.h> | 12 | #include <linux/kmod.h> |
13 | #include <linux/major.h> | 13 | #include <linux/major.h> |
14 | #include <linux/smp_lock.h> | 14 | #include <linux/smp_lock.h> |
15 | #include <linux/device_cgroup.h> | 15 | #include <linux/device_cgroup.h> |
16 | #include <linux/highmem.h> | 16 | #include <linux/highmem.h> |
17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/blkpg.h> | 19 | #include <linux/blkpg.h> |
20 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
21 | #include <linux/pagevec.h> | 21 | #include <linux/pagevec.h> |
22 | #include <linux/writeback.h> | 22 | #include <linux/writeback.h> |
23 | #include <linux/mpage.h> | 23 | #include <linux/mpage.h> |
24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
25 | #include <linux/uio.h> | 25 | #include <linux/uio.h> |
26 | #include <linux/namei.h> | 26 | #include <linux/namei.h> |
27 | #include <linux/log2.h> | 27 | #include <linux/log2.h> |
28 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
30 | #include "internal.h" | 30 | #include "internal.h" |
31 | 31 | ||
32 | struct bdev_inode { | 32 | struct bdev_inode { |
33 | struct block_device bdev; | 33 | struct block_device bdev; |
34 | struct inode vfs_inode; | 34 | struct inode vfs_inode; |
35 | }; | 35 | }; |
36 | 36 | ||
37 | static const struct address_space_operations def_blk_aops; | 37 | static const struct address_space_operations def_blk_aops; |
38 | 38 | ||
39 | static inline struct bdev_inode *BDEV_I(struct inode *inode) | 39 | static inline struct bdev_inode *BDEV_I(struct inode *inode) |
40 | { | 40 | { |
41 | return container_of(inode, struct bdev_inode, vfs_inode); | 41 | return container_of(inode, struct bdev_inode, vfs_inode); |
42 | } | 42 | } |
43 | 43 | ||
44 | inline struct block_device *I_BDEV(struct inode *inode) | 44 | inline struct block_device *I_BDEV(struct inode *inode) |
45 | { | 45 | { |
46 | return &BDEV_I(inode)->bdev; | 46 | return &BDEV_I(inode)->bdev; |
47 | } | 47 | } |
48 | 48 | ||
49 | EXPORT_SYMBOL(I_BDEV); | 49 | EXPORT_SYMBOL(I_BDEV); |
50 | 50 | ||
51 | static sector_t max_block(struct block_device *bdev) | 51 | static sector_t max_block(struct block_device *bdev) |
52 | { | 52 | { |
53 | sector_t retval = ~((sector_t)0); | 53 | sector_t retval = ~((sector_t)0); |
54 | loff_t sz = i_size_read(bdev->bd_inode); | 54 | loff_t sz = i_size_read(bdev->bd_inode); |
55 | 55 | ||
56 | if (sz) { | 56 | if (sz) { |
57 | unsigned int size = block_size(bdev); | 57 | unsigned int size = block_size(bdev); |
58 | unsigned int sizebits = blksize_bits(size); | 58 | unsigned int sizebits = blksize_bits(size); |
59 | retval = (sz >> sizebits); | 59 | retval = (sz >> sizebits); |
60 | } | 60 | } |
61 | return retval; | 61 | return retval; |
62 | } | 62 | } |
63 | 63 | ||
64 | /* Kill _all_ buffers and pagecache , dirty or not.. */ | 64 | /* Kill _all_ buffers and pagecache , dirty or not.. */ |
65 | static void kill_bdev(struct block_device *bdev) | 65 | static void kill_bdev(struct block_device *bdev) |
66 | { | 66 | { |
67 | if (bdev->bd_inode->i_mapping->nrpages == 0) | 67 | if (bdev->bd_inode->i_mapping->nrpages == 0) |
68 | return; | 68 | return; |
69 | invalidate_bh_lrus(); | 69 | invalidate_bh_lrus(); |
70 | truncate_inode_pages(bdev->bd_inode->i_mapping, 0); | 70 | truncate_inode_pages(bdev->bd_inode->i_mapping, 0); |
71 | } | 71 | } |
72 | 72 | ||
73 | int set_blocksize(struct block_device *bdev, int size) | 73 | int set_blocksize(struct block_device *bdev, int size) |
74 | { | 74 | { |
75 | /* Size must be a power of two, and between 512 and PAGE_SIZE */ | 75 | /* Size must be a power of two, and between 512 and PAGE_SIZE */ |
76 | if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) | 76 | if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) |
77 | return -EINVAL; | 77 | return -EINVAL; |
78 | 78 | ||
79 | /* Size cannot be smaller than the size supported by the device */ | 79 | /* Size cannot be smaller than the size supported by the device */ |
80 | if (size < bdev_logical_block_size(bdev)) | 80 | if (size < bdev_logical_block_size(bdev)) |
81 | return -EINVAL; | 81 | return -EINVAL; |
82 | 82 | ||
83 | /* Don't change the size if it is same as current */ | 83 | /* Don't change the size if it is same as current */ |
84 | if (bdev->bd_block_size != size) { | 84 | if (bdev->bd_block_size != size) { |
85 | sync_blockdev(bdev); | 85 | sync_blockdev(bdev); |
86 | bdev->bd_block_size = size; | 86 | bdev->bd_block_size = size; |
87 | bdev->bd_inode->i_blkbits = blksize_bits(size); | 87 | bdev->bd_inode->i_blkbits = blksize_bits(size); |
88 | kill_bdev(bdev); | 88 | kill_bdev(bdev); |
89 | } | 89 | } |
90 | return 0; | 90 | return 0; |
91 | } | 91 | } |
92 | 92 | ||
93 | EXPORT_SYMBOL(set_blocksize); | 93 | EXPORT_SYMBOL(set_blocksize); |
94 | 94 | ||
95 | int sb_set_blocksize(struct super_block *sb, int size) | 95 | int sb_set_blocksize(struct super_block *sb, int size) |
96 | { | 96 | { |
97 | if (set_blocksize(sb->s_bdev, size)) | 97 | if (set_blocksize(sb->s_bdev, size)) |
98 | return 0; | 98 | return 0; |
99 | /* If we get here, we know size is power of two | 99 | /* If we get here, we know size is power of two |
100 | * and it's value is between 512 and PAGE_SIZE */ | 100 | * and it's value is between 512 and PAGE_SIZE */ |
101 | sb->s_blocksize = size; | 101 | sb->s_blocksize = size; |
102 | sb->s_blocksize_bits = blksize_bits(size); | 102 | sb->s_blocksize_bits = blksize_bits(size); |
103 | return sb->s_blocksize; | 103 | return sb->s_blocksize; |
104 | } | 104 | } |
105 | 105 | ||
106 | EXPORT_SYMBOL(sb_set_blocksize); | 106 | EXPORT_SYMBOL(sb_set_blocksize); |
107 | 107 | ||
108 | int sb_min_blocksize(struct super_block *sb, int size) | 108 | int sb_min_blocksize(struct super_block *sb, int size) |
109 | { | 109 | { |
110 | int minsize = bdev_logical_block_size(sb->s_bdev); | 110 | int minsize = bdev_logical_block_size(sb->s_bdev); |
111 | if (size < minsize) | 111 | if (size < minsize) |
112 | size = minsize; | 112 | size = minsize; |
113 | return sb_set_blocksize(sb, size); | 113 | return sb_set_blocksize(sb, size); |
114 | } | 114 | } |
115 | 115 | ||
116 | EXPORT_SYMBOL(sb_min_blocksize); | 116 | EXPORT_SYMBOL(sb_min_blocksize); |
117 | 117 | ||
118 | static int | 118 | static int |
119 | blkdev_get_block(struct inode *inode, sector_t iblock, | 119 | blkdev_get_block(struct inode *inode, sector_t iblock, |
120 | struct buffer_head *bh, int create) | 120 | struct buffer_head *bh, int create) |
121 | { | 121 | { |
122 | if (iblock >= max_block(I_BDEV(inode))) { | 122 | if (iblock >= max_block(I_BDEV(inode))) { |
123 | if (create) | 123 | if (create) |
124 | return -EIO; | 124 | return -EIO; |
125 | 125 | ||
126 | /* | 126 | /* |
127 | * for reads, we're just trying to fill a partial page. | 127 | * for reads, we're just trying to fill a partial page. |
128 | * return a hole, they will have to call get_block again | 128 | * return a hole, they will have to call get_block again |
129 | * before they can fill it, and they will get -EIO at that | 129 | * before they can fill it, and they will get -EIO at that |
130 | * time | 130 | * time |
131 | */ | 131 | */ |
132 | return 0; | 132 | return 0; |
133 | } | 133 | } |
134 | bh->b_bdev = I_BDEV(inode); | 134 | bh->b_bdev = I_BDEV(inode); |
135 | bh->b_blocknr = iblock; | 135 | bh->b_blocknr = iblock; |
136 | set_buffer_mapped(bh); | 136 | set_buffer_mapped(bh); |
137 | return 0; | 137 | return 0; |
138 | } | 138 | } |
139 | 139 | ||
140 | static int | 140 | static int |
141 | blkdev_get_blocks(struct inode *inode, sector_t iblock, | 141 | blkdev_get_blocks(struct inode *inode, sector_t iblock, |
142 | struct buffer_head *bh, int create) | 142 | struct buffer_head *bh, int create) |
143 | { | 143 | { |
144 | sector_t end_block = max_block(I_BDEV(inode)); | 144 | sector_t end_block = max_block(I_BDEV(inode)); |
145 | unsigned long max_blocks = bh->b_size >> inode->i_blkbits; | 145 | unsigned long max_blocks = bh->b_size >> inode->i_blkbits; |
146 | 146 | ||
147 | if ((iblock + max_blocks) > end_block) { | 147 | if ((iblock + max_blocks) > end_block) { |
148 | max_blocks = end_block - iblock; | 148 | max_blocks = end_block - iblock; |
149 | if ((long)max_blocks <= 0) { | 149 | if ((long)max_blocks <= 0) { |
150 | if (create) | 150 | if (create) |
151 | return -EIO; /* write fully beyond EOF */ | 151 | return -EIO; /* write fully beyond EOF */ |
152 | /* | 152 | /* |
153 | * It is a read which is fully beyond EOF. We return | 153 | * It is a read which is fully beyond EOF. We return |
154 | * a !buffer_mapped buffer | 154 | * a !buffer_mapped buffer |
155 | */ | 155 | */ |
156 | max_blocks = 0; | 156 | max_blocks = 0; |
157 | } | 157 | } |
158 | } | 158 | } |
159 | 159 | ||
160 | bh->b_bdev = I_BDEV(inode); | 160 | bh->b_bdev = I_BDEV(inode); |
161 | bh->b_blocknr = iblock; | 161 | bh->b_blocknr = iblock; |
162 | bh->b_size = max_blocks << inode->i_blkbits; | 162 | bh->b_size = max_blocks << inode->i_blkbits; |
163 | if (max_blocks) | 163 | if (max_blocks) |
164 | set_buffer_mapped(bh); | 164 | set_buffer_mapped(bh); |
165 | return 0; | 165 | return 0; |
166 | } | 166 | } |
167 | 167 | ||
168 | static ssize_t | 168 | static ssize_t |
169 | blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | 169 | blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, |
170 | loff_t offset, unsigned long nr_segs) | 170 | loff_t offset, unsigned long nr_segs) |
171 | { | 171 | { |
172 | struct file *file = iocb->ki_filp; | 172 | struct file *file = iocb->ki_filp; |
173 | struct inode *inode = file->f_mapping->host; | 173 | struct inode *inode = file->f_mapping->host; |
174 | 174 | ||
175 | return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), | 175 | return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), |
176 | iov, offset, nr_segs, blkdev_get_blocks, NULL); | 176 | iov, offset, nr_segs, blkdev_get_blocks, NULL); |
177 | } | 177 | } |
178 | 178 | ||
179 | int __sync_blockdev(struct block_device *bdev, int wait) | 179 | int __sync_blockdev(struct block_device *bdev, int wait) |
180 | { | 180 | { |
181 | if (!bdev) | 181 | if (!bdev) |
182 | return 0; | 182 | return 0; |
183 | if (!wait) | 183 | if (!wait) |
184 | return filemap_flush(bdev->bd_inode->i_mapping); | 184 | return filemap_flush(bdev->bd_inode->i_mapping); |
185 | return filemap_write_and_wait(bdev->bd_inode->i_mapping); | 185 | return filemap_write_and_wait(bdev->bd_inode->i_mapping); |
186 | } | 186 | } |
187 | 187 | ||
188 | /* | 188 | /* |
189 | * Write out and wait upon all the dirty data associated with a block | 189 | * Write out and wait upon all the dirty data associated with a block |
190 | * device via its mapping. Does not take the superblock lock. | 190 | * device via its mapping. Does not take the superblock lock. |
191 | */ | 191 | */ |
192 | int sync_blockdev(struct block_device *bdev) | 192 | int sync_blockdev(struct block_device *bdev) |
193 | { | 193 | { |
194 | return __sync_blockdev(bdev, 1); | 194 | return __sync_blockdev(bdev, 1); |
195 | } | 195 | } |
196 | EXPORT_SYMBOL(sync_blockdev); | 196 | EXPORT_SYMBOL(sync_blockdev); |
197 | 197 | ||
198 | /* | 198 | /* |
199 | * Write out and wait upon all dirty data associated with this | 199 | * Write out and wait upon all dirty data associated with this |
200 | * device. Filesystem data as well as the underlying block | 200 | * device. Filesystem data as well as the underlying block |
201 | * device. Takes the superblock lock. | 201 | * device. Takes the superblock lock. |
202 | */ | 202 | */ |
203 | int fsync_bdev(struct block_device *bdev) | 203 | int fsync_bdev(struct block_device *bdev) |
204 | { | 204 | { |
205 | struct super_block *sb = get_super(bdev); | 205 | struct super_block *sb = get_super(bdev); |
206 | if (sb) { | 206 | if (sb) { |
207 | int res = sync_filesystem(sb); | 207 | int res = sync_filesystem(sb); |
208 | drop_super(sb); | 208 | drop_super(sb); |
209 | return res; | 209 | return res; |
210 | } | 210 | } |
211 | return sync_blockdev(bdev); | 211 | return sync_blockdev(bdev); |
212 | } | 212 | } |
213 | EXPORT_SYMBOL(fsync_bdev); | 213 | EXPORT_SYMBOL(fsync_bdev); |
214 | 214 | ||
215 | /** | 215 | /** |
216 | * freeze_bdev -- lock a filesystem and force it into a consistent state | 216 | * freeze_bdev -- lock a filesystem and force it into a consistent state |
217 | * @bdev: blockdevice to lock | 217 | * @bdev: blockdevice to lock |
218 | * | 218 | * |
219 | * This takes the block device bd_mount_sem to make sure no new mounts | 219 | * This takes the block device bd_mount_sem to make sure no new mounts |
220 | * happen on bdev until thaw_bdev() is called. | 220 | * happen on bdev until thaw_bdev() is called. |
221 | * If a superblock is found on this device, we take the s_umount semaphore | 221 | * If a superblock is found on this device, we take the s_umount semaphore |
222 | * on it to make sure nobody unmounts until the snapshot creation is done. | 222 | * on it to make sure nobody unmounts until the snapshot creation is done. |
223 | * The reference counter (bd_fsfreeze_count) guarantees that only the last | 223 | * The reference counter (bd_fsfreeze_count) guarantees that only the last |
224 | * unfreeze process can unfreeze the frozen filesystem actually when multiple | 224 | * unfreeze process can unfreeze the frozen filesystem actually when multiple |
225 | * freeze requests arrive simultaneously. It counts up in freeze_bdev() and | 225 | * freeze requests arrive simultaneously. It counts up in freeze_bdev() and |
226 | * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze | 226 | * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze |
227 | * actually. | 227 | * actually. |
228 | */ | 228 | */ |
229 | struct super_block *freeze_bdev(struct block_device *bdev) | 229 | struct super_block *freeze_bdev(struct block_device *bdev) |
230 | { | 230 | { |
231 | struct super_block *sb; | 231 | struct super_block *sb; |
232 | int error = 0; | 232 | int error = 0; |
233 | 233 | ||
234 | mutex_lock(&bdev->bd_fsfreeze_mutex); | 234 | mutex_lock(&bdev->bd_fsfreeze_mutex); |
235 | if (bdev->bd_fsfreeze_count > 0) { | 235 | if (bdev->bd_fsfreeze_count > 0) { |
236 | bdev->bd_fsfreeze_count++; | 236 | bdev->bd_fsfreeze_count++; |
237 | sb = get_super(bdev); | 237 | sb = get_super(bdev); |
238 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 238 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
239 | return sb; | 239 | return sb; |
240 | } | 240 | } |
241 | bdev->bd_fsfreeze_count++; | 241 | bdev->bd_fsfreeze_count++; |
242 | 242 | ||
243 | down(&bdev->bd_mount_sem); | 243 | down(&bdev->bd_mount_sem); |
244 | sb = get_super(bdev); | 244 | sb = get_super(bdev); |
245 | if (sb && !(sb->s_flags & MS_RDONLY)) { | 245 | if (sb && !(sb->s_flags & MS_RDONLY)) { |
246 | sb->s_frozen = SB_FREEZE_WRITE; | 246 | sb->s_frozen = SB_FREEZE_WRITE; |
247 | smp_wmb(); | 247 | smp_wmb(); |
248 | 248 | ||
249 | sync_filesystem(sb); | 249 | sync_filesystem(sb); |
250 | 250 | ||
251 | sb->s_frozen = SB_FREEZE_TRANS; | 251 | sb->s_frozen = SB_FREEZE_TRANS; |
252 | smp_wmb(); | 252 | smp_wmb(); |
253 | 253 | ||
254 | sync_blockdev(sb->s_bdev); | 254 | sync_blockdev(sb->s_bdev); |
255 | 255 | ||
256 | if (sb->s_op->freeze_fs) { | 256 | if (sb->s_op->freeze_fs) { |
257 | error = sb->s_op->freeze_fs(sb); | 257 | error = sb->s_op->freeze_fs(sb); |
258 | if (error) { | 258 | if (error) { |
259 | printk(KERN_ERR | 259 | printk(KERN_ERR |
260 | "VFS:Filesystem freeze failed\n"); | 260 | "VFS:Filesystem freeze failed\n"); |
261 | sb->s_frozen = SB_UNFROZEN; | 261 | sb->s_frozen = SB_UNFROZEN; |
262 | drop_super(sb); | 262 | drop_super(sb); |
263 | up(&bdev->bd_mount_sem); | 263 | up(&bdev->bd_mount_sem); |
264 | bdev->bd_fsfreeze_count--; | 264 | bdev->bd_fsfreeze_count--; |
265 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 265 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
266 | return ERR_PTR(error); | 266 | return ERR_PTR(error); |
267 | } | 267 | } |
268 | } | 268 | } |
269 | } | 269 | } |
270 | 270 | ||
271 | sync_blockdev(bdev); | 271 | sync_blockdev(bdev); |
272 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 272 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
273 | 273 | ||
274 | return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ | 274 | return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ |
275 | } | 275 | } |
276 | EXPORT_SYMBOL(freeze_bdev); | 276 | EXPORT_SYMBOL(freeze_bdev); |
277 | 277 | ||
278 | /** | 278 | /** |
279 | * thaw_bdev -- unlock filesystem | 279 | * thaw_bdev -- unlock filesystem |
280 | * @bdev: blockdevice to unlock | 280 | * @bdev: blockdevice to unlock |
281 | * @sb: associated superblock | 281 | * @sb: associated superblock |
282 | * | 282 | * |
283 | * Unlocks the filesystem and marks it writeable again after freeze_bdev(). | 283 | * Unlocks the filesystem and marks it writeable again after freeze_bdev(). |
284 | */ | 284 | */ |
285 | int thaw_bdev(struct block_device *bdev, struct super_block *sb) | 285 | int thaw_bdev(struct block_device *bdev, struct super_block *sb) |
286 | { | 286 | { |
287 | int error = 0; | 287 | int error = 0; |
288 | 288 | ||
289 | mutex_lock(&bdev->bd_fsfreeze_mutex); | 289 | mutex_lock(&bdev->bd_fsfreeze_mutex); |
290 | if (!bdev->bd_fsfreeze_count) { | 290 | if (!bdev->bd_fsfreeze_count) { |
291 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 291 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
292 | return -EINVAL; | 292 | return -EINVAL; |
293 | } | 293 | } |
294 | 294 | ||
295 | bdev->bd_fsfreeze_count--; | 295 | bdev->bd_fsfreeze_count--; |
296 | if (bdev->bd_fsfreeze_count > 0) { | 296 | if (bdev->bd_fsfreeze_count > 0) { |
297 | if (sb) | 297 | if (sb) |
298 | drop_super(sb); | 298 | drop_super(sb); |
299 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 299 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
300 | return 0; | 300 | return 0; |
301 | } | 301 | } |
302 | 302 | ||
303 | if (sb) { | 303 | if (sb) { |
304 | BUG_ON(sb->s_bdev != bdev); | 304 | BUG_ON(sb->s_bdev != bdev); |
305 | if (!(sb->s_flags & MS_RDONLY)) { | 305 | if (!(sb->s_flags & MS_RDONLY)) { |
306 | if (sb->s_op->unfreeze_fs) { | 306 | if (sb->s_op->unfreeze_fs) { |
307 | error = sb->s_op->unfreeze_fs(sb); | 307 | error = sb->s_op->unfreeze_fs(sb); |
308 | if (error) { | 308 | if (error) { |
309 | printk(KERN_ERR | 309 | printk(KERN_ERR |
310 | "VFS:Filesystem thaw failed\n"); | 310 | "VFS:Filesystem thaw failed\n"); |
311 | sb->s_frozen = SB_FREEZE_TRANS; | 311 | sb->s_frozen = SB_FREEZE_TRANS; |
312 | bdev->bd_fsfreeze_count++; | 312 | bdev->bd_fsfreeze_count++; |
313 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 313 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
314 | return error; | 314 | return error; |
315 | } | 315 | } |
316 | } | 316 | } |
317 | sb->s_frozen = SB_UNFROZEN; | 317 | sb->s_frozen = SB_UNFROZEN; |
318 | smp_wmb(); | 318 | smp_wmb(); |
319 | wake_up(&sb->s_wait_unfrozen); | 319 | wake_up(&sb->s_wait_unfrozen); |
320 | } | 320 | } |
321 | drop_super(sb); | 321 | drop_super(sb); |
322 | } | 322 | } |
323 | 323 | ||
324 | up(&bdev->bd_mount_sem); | 324 | up(&bdev->bd_mount_sem); |
325 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | 325 | mutex_unlock(&bdev->bd_fsfreeze_mutex); |
326 | return 0; | 326 | return 0; |
327 | } | 327 | } |
328 | EXPORT_SYMBOL(thaw_bdev); | 328 | EXPORT_SYMBOL(thaw_bdev); |
329 | 329 | ||
330 | static int blkdev_writepage(struct page *page, struct writeback_control *wbc) | 330 | static int blkdev_writepage(struct page *page, struct writeback_control *wbc) |
331 | { | 331 | { |
332 | return block_write_full_page(page, blkdev_get_block, wbc); | 332 | return block_write_full_page(page, blkdev_get_block, wbc); |
333 | } | 333 | } |
334 | 334 | ||
335 | static int blkdev_readpage(struct file * file, struct page * page) | 335 | static int blkdev_readpage(struct file * file, struct page * page) |
336 | { | 336 | { |
337 | return block_read_full_page(page, blkdev_get_block); | 337 | return block_read_full_page(page, blkdev_get_block); |
338 | } | 338 | } |
339 | 339 | ||
340 | static int blkdev_write_begin(struct file *file, struct address_space *mapping, | 340 | static int blkdev_write_begin(struct file *file, struct address_space *mapping, |
341 | loff_t pos, unsigned len, unsigned flags, | 341 | loff_t pos, unsigned len, unsigned flags, |
342 | struct page **pagep, void **fsdata) | 342 | struct page **pagep, void **fsdata) |
343 | { | 343 | { |
344 | *pagep = NULL; | 344 | *pagep = NULL; |
345 | return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 345 | return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, |
346 | blkdev_get_block); | 346 | blkdev_get_block); |
347 | } | 347 | } |
348 | 348 | ||
349 | static int blkdev_write_end(struct file *file, struct address_space *mapping, | 349 | static int blkdev_write_end(struct file *file, struct address_space *mapping, |
350 | loff_t pos, unsigned len, unsigned copied, | 350 | loff_t pos, unsigned len, unsigned copied, |
351 | struct page *page, void *fsdata) | 351 | struct page *page, void *fsdata) |
352 | { | 352 | { |
353 | int ret; | 353 | int ret; |
354 | ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); | 354 | ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); |
355 | 355 | ||
356 | unlock_page(page); | 356 | unlock_page(page); |
357 | page_cache_release(page); | 357 | page_cache_release(page); |
358 | 358 | ||
359 | return ret; | 359 | return ret; |
360 | } | 360 | } |
361 | 361 | ||
362 | /* | 362 | /* |
363 | * private llseek: | 363 | * private llseek: |
364 | * for a block special file file->f_path.dentry->d_inode->i_size is zero | 364 | * for a block special file file->f_path.dentry->d_inode->i_size is zero |
365 | * so we compute the size by hand (just as in block_read/write above) | 365 | * so we compute the size by hand (just as in block_read/write above) |
366 | */ | 366 | */ |
367 | static loff_t block_llseek(struct file *file, loff_t offset, int origin) | 367 | static loff_t block_llseek(struct file *file, loff_t offset, int origin) |
368 | { | 368 | { |
369 | struct inode *bd_inode = file->f_mapping->host; | 369 | struct inode *bd_inode = file->f_mapping->host; |
370 | loff_t size; | 370 | loff_t size; |
371 | loff_t retval; | 371 | loff_t retval; |
372 | 372 | ||
373 | mutex_lock(&bd_inode->i_mutex); | 373 | mutex_lock(&bd_inode->i_mutex); |
374 | size = i_size_read(bd_inode); | 374 | size = i_size_read(bd_inode); |
375 | 375 | ||
376 | switch (origin) { | 376 | switch (origin) { |
377 | case 2: | 377 | case 2: |
378 | offset += size; | 378 | offset += size; |
379 | break; | 379 | break; |
380 | case 1: | 380 | case 1: |
381 | offset += file->f_pos; | 381 | offset += file->f_pos; |
382 | } | 382 | } |
383 | retval = -EINVAL; | 383 | retval = -EINVAL; |
384 | if (offset >= 0 && offset <= size) { | 384 | if (offset >= 0 && offset <= size) { |
385 | if (offset != file->f_pos) { | 385 | if (offset != file->f_pos) { |
386 | file->f_pos = offset; | 386 | file->f_pos = offset; |
387 | } | 387 | } |
388 | retval = offset; | 388 | retval = offset; |
389 | } | 389 | } |
390 | mutex_unlock(&bd_inode->i_mutex); | 390 | mutex_unlock(&bd_inode->i_mutex); |
391 | return retval; | 391 | return retval; |
392 | } | 392 | } |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * Filp is never NULL; the only case when ->fsync() is called with | 395 | * Filp is never NULL; the only case when ->fsync() is called with |
396 | * NULL first argument is nfsd_sync_dir() and that's not a directory. | 396 | * NULL first argument is nfsd_sync_dir() and that's not a directory. |
397 | */ | 397 | */ |
398 | 398 | ||
399 | static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) | 399 | static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) |
400 | { | 400 | { |
401 | return sync_blockdev(I_BDEV(filp->f_mapping->host)); | 401 | return sync_blockdev(I_BDEV(filp->f_mapping->host)); |
402 | } | 402 | } |
403 | 403 | ||
404 | /* | 404 | /* |
405 | * pseudo-fs | 405 | * pseudo-fs |
406 | */ | 406 | */ |
407 | 407 | ||
408 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); | 408 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); |
409 | static struct kmem_cache * bdev_cachep __read_mostly; | 409 | static struct kmem_cache * bdev_cachep __read_mostly; |
410 | 410 | ||
411 | static struct inode *bdev_alloc_inode(struct super_block *sb) | 411 | static struct inode *bdev_alloc_inode(struct super_block *sb) |
412 | { | 412 | { |
413 | struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); | 413 | struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); |
414 | if (!ei) | 414 | if (!ei) |
415 | return NULL; | 415 | return NULL; |
416 | return &ei->vfs_inode; | 416 | return &ei->vfs_inode; |
417 | } | 417 | } |
418 | 418 | ||
419 | static void bdev_destroy_inode(struct inode *inode) | 419 | static void bdev_destroy_inode(struct inode *inode) |
420 | { | 420 | { |
421 | struct bdev_inode *bdi = BDEV_I(inode); | 421 | struct bdev_inode *bdi = BDEV_I(inode); |
422 | 422 | ||
423 | bdi->bdev.bd_inode_backing_dev_info = NULL; | 423 | bdi->bdev.bd_inode_backing_dev_info = NULL; |
424 | kmem_cache_free(bdev_cachep, bdi); | 424 | kmem_cache_free(bdev_cachep, bdi); |
425 | } | 425 | } |
426 | 426 | ||
427 | static void init_once(void *foo) | 427 | static void init_once(void *foo) |
428 | { | 428 | { |
429 | struct bdev_inode *ei = (struct bdev_inode *) foo; | 429 | struct bdev_inode *ei = (struct bdev_inode *) foo; |
430 | struct block_device *bdev = &ei->bdev; | 430 | struct block_device *bdev = &ei->bdev; |
431 | 431 | ||
432 | memset(bdev, 0, sizeof(*bdev)); | 432 | memset(bdev, 0, sizeof(*bdev)); |
433 | mutex_init(&bdev->bd_mutex); | 433 | mutex_init(&bdev->bd_mutex); |
434 | sema_init(&bdev->bd_mount_sem, 1); | 434 | sema_init(&bdev->bd_mount_sem, 1); |
435 | INIT_LIST_HEAD(&bdev->bd_inodes); | 435 | INIT_LIST_HEAD(&bdev->bd_inodes); |
436 | INIT_LIST_HEAD(&bdev->bd_list); | 436 | INIT_LIST_HEAD(&bdev->bd_list); |
437 | #ifdef CONFIG_SYSFS | 437 | #ifdef CONFIG_SYSFS |
438 | INIT_LIST_HEAD(&bdev->bd_holder_list); | 438 | INIT_LIST_HEAD(&bdev->bd_holder_list); |
439 | #endif | 439 | #endif |
440 | inode_init_once(&ei->vfs_inode); | 440 | inode_init_once(&ei->vfs_inode); |
441 | /* Initialize mutex for freeze. */ | 441 | /* Initialize mutex for freeze. */ |
442 | mutex_init(&bdev->bd_fsfreeze_mutex); | 442 | mutex_init(&bdev->bd_fsfreeze_mutex); |
443 | } | 443 | } |
444 | 444 | ||
445 | static inline void __bd_forget(struct inode *inode) | 445 | static inline void __bd_forget(struct inode *inode) |
446 | { | 446 | { |
447 | list_del_init(&inode->i_devices); | 447 | list_del_init(&inode->i_devices); |
448 | inode->i_bdev = NULL; | 448 | inode->i_bdev = NULL; |
449 | inode->i_mapping = &inode->i_data; | 449 | inode->i_mapping = &inode->i_data; |
450 | } | 450 | } |
451 | 451 | ||
452 | static void bdev_clear_inode(struct inode *inode) | 452 | static void bdev_clear_inode(struct inode *inode) |
453 | { | 453 | { |
454 | struct block_device *bdev = &BDEV_I(inode)->bdev; | 454 | struct block_device *bdev = &BDEV_I(inode)->bdev; |
455 | struct list_head *p; | 455 | struct list_head *p; |
456 | spin_lock(&bdev_lock); | 456 | spin_lock(&bdev_lock); |
457 | while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { | 457 | while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { |
458 | __bd_forget(list_entry(p, struct inode, i_devices)); | 458 | __bd_forget(list_entry(p, struct inode, i_devices)); |
459 | } | 459 | } |
460 | list_del_init(&bdev->bd_list); | 460 | list_del_init(&bdev->bd_list); |
461 | spin_unlock(&bdev_lock); | 461 | spin_unlock(&bdev_lock); |
462 | } | 462 | } |
463 | 463 | ||
464 | static const struct super_operations bdev_sops = { | 464 | static const struct super_operations bdev_sops = { |
465 | .statfs = simple_statfs, | 465 | .statfs = simple_statfs, |
466 | .alloc_inode = bdev_alloc_inode, | 466 | .alloc_inode = bdev_alloc_inode, |
467 | .destroy_inode = bdev_destroy_inode, | 467 | .destroy_inode = bdev_destroy_inode, |
468 | .drop_inode = generic_delete_inode, | 468 | .drop_inode = generic_delete_inode, |
469 | .clear_inode = bdev_clear_inode, | 469 | .clear_inode = bdev_clear_inode, |
470 | }; | 470 | }; |
471 | 471 | ||
472 | static int bd_get_sb(struct file_system_type *fs_type, | 472 | static int bd_get_sb(struct file_system_type *fs_type, |
473 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) | 473 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) |
474 | { | 474 | { |
475 | return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); | 475 | return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); |
476 | } | 476 | } |
477 | 477 | ||
478 | static struct file_system_type bd_type = { | 478 | static struct file_system_type bd_type = { |
479 | .name = "bdev", | 479 | .name = "bdev", |
480 | .get_sb = bd_get_sb, | 480 | .get_sb = bd_get_sb, |
481 | .kill_sb = kill_anon_super, | 481 | .kill_sb = kill_anon_super, |
482 | }; | 482 | }; |
483 | 483 | ||
484 | struct super_block *blockdev_superblock __read_mostly; | 484 | struct super_block *blockdev_superblock __read_mostly; |
485 | 485 | ||
486 | void __init bdev_cache_init(void) | 486 | void __init bdev_cache_init(void) |
487 | { | 487 | { |
488 | int err; | 488 | int err; |
489 | struct vfsmount *bd_mnt; | 489 | struct vfsmount *bd_mnt; |
490 | 490 | ||
491 | bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), | 491 | bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), |
492 | 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | 492 | 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| |
493 | SLAB_MEM_SPREAD|SLAB_PANIC), | 493 | SLAB_MEM_SPREAD|SLAB_PANIC), |
494 | init_once); | 494 | init_once); |
495 | err = register_filesystem(&bd_type); | 495 | err = register_filesystem(&bd_type); |
496 | if (err) | 496 | if (err) |
497 | panic("Cannot register bdev pseudo-fs"); | 497 | panic("Cannot register bdev pseudo-fs"); |
498 | bd_mnt = kern_mount(&bd_type); | 498 | bd_mnt = kern_mount(&bd_type); |
499 | if (IS_ERR(bd_mnt)) | 499 | if (IS_ERR(bd_mnt)) |
500 | panic("Cannot create bdev pseudo-fs"); | 500 | panic("Cannot create bdev pseudo-fs"); |
501 | /* | 501 | /* |
502 | * This vfsmount structure is only used to obtain the | 502 | * This vfsmount structure is only used to obtain the |
503 | * blockdev_superblock, so tell kmemleak not to report it. | 503 | * blockdev_superblock, so tell kmemleak not to report it. |
504 | */ | 504 | */ |
505 | kmemleak_not_leak(bd_mnt); | 505 | kmemleak_not_leak(bd_mnt); |
506 | blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ | 506 | blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ |
507 | } | 507 | } |
508 | 508 | ||
509 | /* | 509 | /* |
510 | * Most likely _very_ bad one - but then it's hardly critical for small | 510 | * Most likely _very_ bad one - but then it's hardly critical for small |
511 | * /dev and can be fixed when somebody will need really large one. | 511 | * /dev and can be fixed when somebody will need really large one. |
512 | * Keep in mind that it will be fed through icache hash function too. | 512 | * Keep in mind that it will be fed through icache hash function too. |
513 | */ | 513 | */ |
514 | static inline unsigned long hash(dev_t dev) | 514 | static inline unsigned long hash(dev_t dev) |
515 | { | 515 | { |
516 | return MAJOR(dev)+MINOR(dev); | 516 | return MAJOR(dev)+MINOR(dev); |
517 | } | 517 | } |
518 | 518 | ||
519 | static int bdev_test(struct inode *inode, void *data) | 519 | static int bdev_test(struct inode *inode, void *data) |
520 | { | 520 | { |
521 | return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; | 521 | return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; |
522 | } | 522 | } |
523 | 523 | ||
524 | static int bdev_set(struct inode *inode, void *data) | 524 | static int bdev_set(struct inode *inode, void *data) |
525 | { | 525 | { |
526 | BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; | 526 | BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; |
527 | return 0; | 527 | return 0; |
528 | } | 528 | } |
529 | 529 | ||
530 | static LIST_HEAD(all_bdevs); | 530 | static LIST_HEAD(all_bdevs); |
531 | 531 | ||
532 | struct block_device *bdget(dev_t dev) | 532 | struct block_device *bdget(dev_t dev) |
533 | { | 533 | { |
534 | struct block_device *bdev; | 534 | struct block_device *bdev; |
535 | struct inode *inode; | 535 | struct inode *inode; |
536 | 536 | ||
537 | inode = iget5_locked(blockdev_superblock, hash(dev), | 537 | inode = iget5_locked(blockdev_superblock, hash(dev), |
538 | bdev_test, bdev_set, &dev); | 538 | bdev_test, bdev_set, &dev); |
539 | 539 | ||
540 | if (!inode) | 540 | if (!inode) |
541 | return NULL; | 541 | return NULL; |
542 | 542 | ||
543 | bdev = &BDEV_I(inode)->bdev; | 543 | bdev = &BDEV_I(inode)->bdev; |
544 | 544 | ||
545 | if (inode->i_state & I_NEW) { | 545 | if (inode->i_state & I_NEW) { |
546 | bdev->bd_contains = NULL; | 546 | bdev->bd_contains = NULL; |
547 | bdev->bd_inode = inode; | 547 | bdev->bd_inode = inode; |
548 | bdev->bd_block_size = (1 << inode->i_blkbits); | 548 | bdev->bd_block_size = (1 << inode->i_blkbits); |
549 | bdev->bd_part_count = 0; | 549 | bdev->bd_part_count = 0; |
550 | bdev->bd_invalidated = 0; | 550 | bdev->bd_invalidated = 0; |
551 | inode->i_mode = S_IFBLK; | 551 | inode->i_mode = S_IFBLK; |
552 | inode->i_rdev = dev; | 552 | inode->i_rdev = dev; |
553 | inode->i_bdev = bdev; | 553 | inode->i_bdev = bdev; |
554 | inode->i_data.a_ops = &def_blk_aops; | 554 | inode->i_data.a_ops = &def_blk_aops; |
555 | mapping_set_gfp_mask(&inode->i_data, GFP_USER); | 555 | mapping_set_gfp_mask(&inode->i_data, GFP_USER); |
556 | inode->i_data.backing_dev_info = &default_backing_dev_info; | 556 | inode->i_data.backing_dev_info = &default_backing_dev_info; |
557 | spin_lock(&bdev_lock); | 557 | spin_lock(&bdev_lock); |
558 | list_add(&bdev->bd_list, &all_bdevs); | 558 | list_add(&bdev->bd_list, &all_bdevs); |
559 | spin_unlock(&bdev_lock); | 559 | spin_unlock(&bdev_lock); |
560 | unlock_new_inode(inode); | 560 | unlock_new_inode(inode); |
561 | } | 561 | } |
562 | return bdev; | 562 | return bdev; |
563 | } | 563 | } |
564 | 564 | ||
565 | EXPORT_SYMBOL(bdget); | 565 | EXPORT_SYMBOL(bdget); |
566 | 566 | ||
567 | /** | ||
568 | * bdgrab -- Grab a reference to an already referenced block device | ||
569 | * @bdev: Block device to grab a reference to. | ||
570 | */ | ||
571 | struct block_device *bdgrab(struct block_device *bdev) | ||
572 | { | ||
573 | atomic_inc(&bdev->bd_inode->i_count); | ||
574 | return bdev; | ||
575 | } | ||
576 | |||
567 | long nr_blockdev_pages(void) | 577 | long nr_blockdev_pages(void) |
568 | { | 578 | { |
569 | struct block_device *bdev; | 579 | struct block_device *bdev; |
570 | long ret = 0; | 580 | long ret = 0; |
571 | spin_lock(&bdev_lock); | 581 | spin_lock(&bdev_lock); |
572 | list_for_each_entry(bdev, &all_bdevs, bd_list) { | 582 | list_for_each_entry(bdev, &all_bdevs, bd_list) { |
573 | ret += bdev->bd_inode->i_mapping->nrpages; | 583 | ret += bdev->bd_inode->i_mapping->nrpages; |
574 | } | 584 | } |
575 | spin_unlock(&bdev_lock); | 585 | spin_unlock(&bdev_lock); |
576 | return ret; | 586 | return ret; |
577 | } | 587 | } |
578 | 588 | ||
579 | void bdput(struct block_device *bdev) | 589 | void bdput(struct block_device *bdev) |
580 | { | 590 | { |
581 | iput(bdev->bd_inode); | 591 | iput(bdev->bd_inode); |
582 | } | 592 | } |
583 | 593 | ||
584 | EXPORT_SYMBOL(bdput); | 594 | EXPORT_SYMBOL(bdput); |
585 | 595 | ||
586 | static struct block_device *bd_acquire(struct inode *inode) | 596 | static struct block_device *bd_acquire(struct inode *inode) |
587 | { | 597 | { |
588 | struct block_device *bdev; | 598 | struct block_device *bdev; |
589 | 599 | ||
590 | spin_lock(&bdev_lock); | 600 | spin_lock(&bdev_lock); |
591 | bdev = inode->i_bdev; | 601 | bdev = inode->i_bdev; |
592 | if (bdev) { | 602 | if (bdev) { |
593 | atomic_inc(&bdev->bd_inode->i_count); | 603 | atomic_inc(&bdev->bd_inode->i_count); |
594 | spin_unlock(&bdev_lock); | 604 | spin_unlock(&bdev_lock); |
595 | return bdev; | 605 | return bdev; |
596 | } | 606 | } |
597 | spin_unlock(&bdev_lock); | 607 | spin_unlock(&bdev_lock); |
598 | 608 | ||
599 | bdev = bdget(inode->i_rdev); | 609 | bdev = bdget(inode->i_rdev); |
600 | if (bdev) { | 610 | if (bdev) { |
601 | spin_lock(&bdev_lock); | 611 | spin_lock(&bdev_lock); |
602 | if (!inode->i_bdev) { | 612 | if (!inode->i_bdev) { |
603 | /* | 613 | /* |
604 | * We take an additional bd_inode->i_count for inode, | 614 | * We take an additional bd_inode->i_count for inode, |
605 | * and it's released in clear_inode() of inode. | 615 | * and it's released in clear_inode() of inode. |
606 | * So, we can access it via ->i_mapping always | 616 | * So, we can access it via ->i_mapping always |
607 | * without igrab(). | 617 | * without igrab(). |
608 | */ | 618 | */ |
609 | atomic_inc(&bdev->bd_inode->i_count); | 619 | atomic_inc(&bdev->bd_inode->i_count); |
610 | inode->i_bdev = bdev; | 620 | inode->i_bdev = bdev; |
611 | inode->i_mapping = bdev->bd_inode->i_mapping; | 621 | inode->i_mapping = bdev->bd_inode->i_mapping; |
612 | list_add(&inode->i_devices, &bdev->bd_inodes); | 622 | list_add(&inode->i_devices, &bdev->bd_inodes); |
613 | } | 623 | } |
614 | spin_unlock(&bdev_lock); | 624 | spin_unlock(&bdev_lock); |
615 | } | 625 | } |
616 | return bdev; | 626 | return bdev; |
617 | } | 627 | } |
618 | 628 | ||
619 | /* Call when you free inode */ | 629 | /* Call when you free inode */ |
620 | 630 | ||
621 | void bd_forget(struct inode *inode) | 631 | void bd_forget(struct inode *inode) |
622 | { | 632 | { |
623 | struct block_device *bdev = NULL; | 633 | struct block_device *bdev = NULL; |
624 | 634 | ||
625 | spin_lock(&bdev_lock); | 635 | spin_lock(&bdev_lock); |
626 | if (inode->i_bdev) { | 636 | if (inode->i_bdev) { |
627 | if (!sb_is_blkdev_sb(inode->i_sb)) | 637 | if (!sb_is_blkdev_sb(inode->i_sb)) |
628 | bdev = inode->i_bdev; | 638 | bdev = inode->i_bdev; |
629 | __bd_forget(inode); | 639 | __bd_forget(inode); |
630 | } | 640 | } |
631 | spin_unlock(&bdev_lock); | 641 | spin_unlock(&bdev_lock); |
632 | 642 | ||
633 | if (bdev) | 643 | if (bdev) |
634 | iput(bdev->bd_inode); | 644 | iput(bdev->bd_inode); |
635 | } | 645 | } |
636 | 646 | ||
637 | int bd_claim(struct block_device *bdev, void *holder) | 647 | int bd_claim(struct block_device *bdev, void *holder) |
638 | { | 648 | { |
639 | int res; | 649 | int res; |
640 | spin_lock(&bdev_lock); | 650 | spin_lock(&bdev_lock); |
641 | 651 | ||
642 | /* first decide result */ | 652 | /* first decide result */ |
643 | if (bdev->bd_holder == holder) | 653 | if (bdev->bd_holder == holder) |
644 | res = 0; /* already a holder */ | 654 | res = 0; /* already a holder */ |
645 | else if (bdev->bd_holder != NULL) | 655 | else if (bdev->bd_holder != NULL) |
646 | res = -EBUSY; /* held by someone else */ | 656 | res = -EBUSY; /* held by someone else */ |
647 | else if (bdev->bd_contains == bdev) | 657 | else if (bdev->bd_contains == bdev) |
648 | res = 0; /* is a whole device which isn't held */ | 658 | res = 0; /* is a whole device which isn't held */ |
649 | 659 | ||
650 | else if (bdev->bd_contains->bd_holder == bd_claim) | 660 | else if (bdev->bd_contains->bd_holder == bd_claim) |
651 | res = 0; /* is a partition of a device that is being partitioned */ | 661 | res = 0; /* is a partition of a device that is being partitioned */ |
652 | else if (bdev->bd_contains->bd_holder != NULL) | 662 | else if (bdev->bd_contains->bd_holder != NULL) |
653 | res = -EBUSY; /* is a partition of a held device */ | 663 | res = -EBUSY; /* is a partition of a held device */ |
654 | else | 664 | else |
655 | res = 0; /* is a partition of an un-held device */ | 665 | res = 0; /* is a partition of an un-held device */ |
656 | 666 | ||
657 | /* now impose change */ | 667 | /* now impose change */ |
658 | if (res==0) { | 668 | if (res==0) { |
659 | /* note that for a whole device bd_holders | 669 | /* note that for a whole device bd_holders |
660 | * will be incremented twice, and bd_holder will | 670 | * will be incremented twice, and bd_holder will |
661 | * be set to bd_claim before being set to holder | 671 | * be set to bd_claim before being set to holder |
662 | */ | 672 | */ |
663 | bdev->bd_contains->bd_holders ++; | 673 | bdev->bd_contains->bd_holders ++; |
664 | bdev->bd_contains->bd_holder = bd_claim; | 674 | bdev->bd_contains->bd_holder = bd_claim; |
665 | bdev->bd_holders++; | 675 | bdev->bd_holders++; |
666 | bdev->bd_holder = holder; | 676 | bdev->bd_holder = holder; |
667 | } | 677 | } |
668 | spin_unlock(&bdev_lock); | 678 | spin_unlock(&bdev_lock); |
669 | return res; | 679 | return res; |
670 | } | 680 | } |
671 | 681 | ||
672 | EXPORT_SYMBOL(bd_claim); | 682 | EXPORT_SYMBOL(bd_claim); |
673 | 683 | ||
674 | void bd_release(struct block_device *bdev) | 684 | void bd_release(struct block_device *bdev) |
675 | { | 685 | { |
676 | spin_lock(&bdev_lock); | 686 | spin_lock(&bdev_lock); |
677 | if (!--bdev->bd_contains->bd_holders) | 687 | if (!--bdev->bd_contains->bd_holders) |
678 | bdev->bd_contains->bd_holder = NULL; | 688 | bdev->bd_contains->bd_holder = NULL; |
679 | if (!--bdev->bd_holders) | 689 | if (!--bdev->bd_holders) |
680 | bdev->bd_holder = NULL; | 690 | bdev->bd_holder = NULL; |
681 | spin_unlock(&bdev_lock); | 691 | spin_unlock(&bdev_lock); |
682 | } | 692 | } |
683 | 693 | ||
684 | EXPORT_SYMBOL(bd_release); | 694 | EXPORT_SYMBOL(bd_release); |
685 | 695 | ||
686 | #ifdef CONFIG_SYSFS | 696 | #ifdef CONFIG_SYSFS |
687 | /* | 697 | /* |
688 | * Functions for bd_claim_by_kobject / bd_release_from_kobject | 698 | * Functions for bd_claim_by_kobject / bd_release_from_kobject |
689 | * | 699 | * |
690 | * If a kobject is passed to bd_claim_by_kobject() | 700 | * If a kobject is passed to bd_claim_by_kobject() |
691 | * and the kobject has a parent directory, | 701 | * and the kobject has a parent directory, |
692 | * following symlinks are created: | 702 | * following symlinks are created: |
693 | * o from the kobject to the claimed bdev | 703 | * o from the kobject to the claimed bdev |
694 | * o from "holders" directory of the bdev to the parent of the kobject | 704 | * o from "holders" directory of the bdev to the parent of the kobject |
695 | * bd_release_from_kobject() removes these symlinks. | 705 | * bd_release_from_kobject() removes these symlinks. |
696 | * | 706 | * |
697 | * Example: | 707 | * Example: |
698 | * If /dev/dm-0 maps to /dev/sda, kobject corresponding to | 708 | * If /dev/dm-0 maps to /dev/sda, kobject corresponding to |
699 | * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: | 709 | * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: |
700 | * /sys/block/dm-0/slaves/sda --> /sys/block/sda | 710 | * /sys/block/dm-0/slaves/sda --> /sys/block/sda |
701 | * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 | 711 | * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 |
702 | */ | 712 | */ |
703 | 713 | ||
704 | static int add_symlink(struct kobject *from, struct kobject *to) | 714 | static int add_symlink(struct kobject *from, struct kobject *to) |
705 | { | 715 | { |
706 | if (!from || !to) | 716 | if (!from || !to) |
707 | return 0; | 717 | return 0; |
708 | return sysfs_create_link(from, to, kobject_name(to)); | 718 | return sysfs_create_link(from, to, kobject_name(to)); |
709 | } | 719 | } |
710 | 720 | ||
711 | static void del_symlink(struct kobject *from, struct kobject *to) | 721 | static void del_symlink(struct kobject *from, struct kobject *to) |
712 | { | 722 | { |
713 | if (!from || !to) | 723 | if (!from || !to) |
714 | return; | 724 | return; |
715 | sysfs_remove_link(from, kobject_name(to)); | 725 | sysfs_remove_link(from, kobject_name(to)); |
716 | } | 726 | } |
717 | 727 | ||
718 | /* | 728 | /* |
719 | * 'struct bd_holder' contains pointers to kobjects symlinked by | 729 | * 'struct bd_holder' contains pointers to kobjects symlinked by |
720 | * bd_claim_by_kobject. | 730 | * bd_claim_by_kobject. |
721 | * It's connected to bd_holder_list which is protected by bdev->bd_sem. | 731 | * It's connected to bd_holder_list which is protected by bdev->bd_sem. |
722 | */ | 732 | */ |
723 | struct bd_holder { | 733 | struct bd_holder { |
724 | struct list_head list; /* chain of holders of the bdev */ | 734 | struct list_head list; /* chain of holders of the bdev */ |
725 | int count; /* references from the holder */ | 735 | int count; /* references from the holder */ |
726 | struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ | 736 | struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ |
727 | struct kobject *hdev; /* e.g. "/block/dm-0" */ | 737 | struct kobject *hdev; /* e.g. "/block/dm-0" */ |
728 | struct kobject *hdir; /* e.g. "/block/sda/holders" */ | 738 | struct kobject *hdir; /* e.g. "/block/sda/holders" */ |
729 | struct kobject *sdev; /* e.g. "/block/sda" */ | 739 | struct kobject *sdev; /* e.g. "/block/sda" */ |
730 | }; | 740 | }; |
731 | 741 | ||
732 | /* | 742 | /* |
733 | * Get references of related kobjects at once. | 743 | * Get references of related kobjects at once. |
734 | * Returns 1 on success. 0 on failure. | 744 | * Returns 1 on success. 0 on failure. |
735 | * | 745 | * |
736 | * Should call bd_holder_release_dirs() after successful use. | 746 | * Should call bd_holder_release_dirs() after successful use. |
737 | */ | 747 | */ |
738 | static int bd_holder_grab_dirs(struct block_device *bdev, | 748 | static int bd_holder_grab_dirs(struct block_device *bdev, |
739 | struct bd_holder *bo) | 749 | struct bd_holder *bo) |
740 | { | 750 | { |
741 | if (!bdev || !bo) | 751 | if (!bdev || !bo) |
742 | return 0; | 752 | return 0; |
743 | 753 | ||
744 | bo->sdir = kobject_get(bo->sdir); | 754 | bo->sdir = kobject_get(bo->sdir); |
745 | if (!bo->sdir) | 755 | if (!bo->sdir) |
746 | return 0; | 756 | return 0; |
747 | 757 | ||
748 | bo->hdev = kobject_get(bo->sdir->parent); | 758 | bo->hdev = kobject_get(bo->sdir->parent); |
749 | if (!bo->hdev) | 759 | if (!bo->hdev) |
750 | goto fail_put_sdir; | 760 | goto fail_put_sdir; |
751 | 761 | ||
752 | bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); | 762 | bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); |
753 | if (!bo->sdev) | 763 | if (!bo->sdev) |
754 | goto fail_put_hdev; | 764 | goto fail_put_hdev; |
755 | 765 | ||
756 | bo->hdir = kobject_get(bdev->bd_part->holder_dir); | 766 | bo->hdir = kobject_get(bdev->bd_part->holder_dir); |
757 | if (!bo->hdir) | 767 | if (!bo->hdir) |
758 | goto fail_put_sdev; | 768 | goto fail_put_sdev; |
759 | 769 | ||
760 | return 1; | 770 | return 1; |
761 | 771 | ||
762 | fail_put_sdev: | 772 | fail_put_sdev: |
763 | kobject_put(bo->sdev); | 773 | kobject_put(bo->sdev); |
764 | fail_put_hdev: | 774 | fail_put_hdev: |
765 | kobject_put(bo->hdev); | 775 | kobject_put(bo->hdev); |
766 | fail_put_sdir: | 776 | fail_put_sdir: |
767 | kobject_put(bo->sdir); | 777 | kobject_put(bo->sdir); |
768 | 778 | ||
769 | return 0; | 779 | return 0; |
770 | } | 780 | } |
771 | 781 | ||
772 | /* Put references of related kobjects at once. */ | 782 | /* Put references of related kobjects at once. */ |
773 | static void bd_holder_release_dirs(struct bd_holder *bo) | 783 | static void bd_holder_release_dirs(struct bd_holder *bo) |
774 | { | 784 | { |
775 | kobject_put(bo->hdir); | 785 | kobject_put(bo->hdir); |
776 | kobject_put(bo->sdev); | 786 | kobject_put(bo->sdev); |
777 | kobject_put(bo->hdev); | 787 | kobject_put(bo->hdev); |
778 | kobject_put(bo->sdir); | 788 | kobject_put(bo->sdir); |
779 | } | 789 | } |
780 | 790 | ||
781 | static struct bd_holder *alloc_bd_holder(struct kobject *kobj) | 791 | static struct bd_holder *alloc_bd_holder(struct kobject *kobj) |
782 | { | 792 | { |
783 | struct bd_holder *bo; | 793 | struct bd_holder *bo; |
784 | 794 | ||
785 | bo = kzalloc(sizeof(*bo), GFP_KERNEL); | 795 | bo = kzalloc(sizeof(*bo), GFP_KERNEL); |
786 | if (!bo) | 796 | if (!bo) |
787 | return NULL; | 797 | return NULL; |
788 | 798 | ||
789 | bo->count = 1; | 799 | bo->count = 1; |
790 | bo->sdir = kobj; | 800 | bo->sdir = kobj; |
791 | 801 | ||
792 | return bo; | 802 | return bo; |
793 | } | 803 | } |
794 | 804 | ||
795 | static void free_bd_holder(struct bd_holder *bo) | 805 | static void free_bd_holder(struct bd_holder *bo) |
796 | { | 806 | { |
797 | kfree(bo); | 807 | kfree(bo); |
798 | } | 808 | } |
799 | 809 | ||
800 | /** | 810 | /** |
801 | * find_bd_holder - find matching struct bd_holder from the block device | 811 | * find_bd_holder - find matching struct bd_holder from the block device |
802 | * | 812 | * |
803 | * @bdev: struct block device to be searched | 813 | * @bdev: struct block device to be searched |
804 | * @bo: target struct bd_holder | 814 | * @bo: target struct bd_holder |
805 | * | 815 | * |
806 | * Returns matching entry with @bo in @bdev->bd_holder_list. | 816 | * Returns matching entry with @bo in @bdev->bd_holder_list. |
807 | * If found, increment the reference count and return the pointer. | 817 | * If found, increment the reference count and return the pointer. |
808 | * If not found, returns NULL. | 818 | * If not found, returns NULL. |
809 | */ | 819 | */ |
810 | static struct bd_holder *find_bd_holder(struct block_device *bdev, | 820 | static struct bd_holder *find_bd_holder(struct block_device *bdev, |
811 | struct bd_holder *bo) | 821 | struct bd_holder *bo) |
812 | { | 822 | { |
813 | struct bd_holder *tmp; | 823 | struct bd_holder *tmp; |
814 | 824 | ||
815 | list_for_each_entry(tmp, &bdev->bd_holder_list, list) | 825 | list_for_each_entry(tmp, &bdev->bd_holder_list, list) |
816 | if (tmp->sdir == bo->sdir) { | 826 | if (tmp->sdir == bo->sdir) { |
817 | tmp->count++; | 827 | tmp->count++; |
818 | return tmp; | 828 | return tmp; |
819 | } | 829 | } |
820 | 830 | ||
821 | return NULL; | 831 | return NULL; |
822 | } | 832 | } |
823 | 833 | ||
824 | /** | 834 | /** |
825 | * add_bd_holder - create sysfs symlinks for bd_claim() relationship | 835 | * add_bd_holder - create sysfs symlinks for bd_claim() relationship |
826 | * | 836 | * |
827 | * @bdev: block device to be bd_claimed | 837 | * @bdev: block device to be bd_claimed |
828 | * @bo: preallocated and initialized by alloc_bd_holder() | 838 | * @bo: preallocated and initialized by alloc_bd_holder() |
829 | * | 839 | * |
830 | * Add @bo to @bdev->bd_holder_list, create symlinks. | 840 | * Add @bo to @bdev->bd_holder_list, create symlinks. |
831 | * | 841 | * |
832 | * Returns 0 if symlinks are created. | 842 | * Returns 0 if symlinks are created. |
833 | * Returns -ve if something fails. | 843 | * Returns -ve if something fails. |
834 | */ | 844 | */ |
835 | static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) | 845 | static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) |
836 | { | 846 | { |
837 | int err; | 847 | int err; |
838 | 848 | ||
839 | if (!bo) | 849 | if (!bo) |
840 | return -EINVAL; | 850 | return -EINVAL; |
841 | 851 | ||
842 | if (!bd_holder_grab_dirs(bdev, bo)) | 852 | if (!bd_holder_grab_dirs(bdev, bo)) |
843 | return -EBUSY; | 853 | return -EBUSY; |
844 | 854 | ||
845 | err = add_symlink(bo->sdir, bo->sdev); | 855 | err = add_symlink(bo->sdir, bo->sdev); |
846 | if (err) | 856 | if (err) |
847 | return err; | 857 | return err; |
848 | 858 | ||
849 | err = add_symlink(bo->hdir, bo->hdev); | 859 | err = add_symlink(bo->hdir, bo->hdev); |
850 | if (err) { | 860 | if (err) { |
851 | del_symlink(bo->sdir, bo->sdev); | 861 | del_symlink(bo->sdir, bo->sdev); |
852 | return err; | 862 | return err; |
853 | } | 863 | } |
854 | 864 | ||
855 | list_add_tail(&bo->list, &bdev->bd_holder_list); | 865 | list_add_tail(&bo->list, &bdev->bd_holder_list); |
856 | return 0; | 866 | return 0; |
857 | } | 867 | } |
858 | 868 | ||
859 | /** | 869 | /** |
860 | * del_bd_holder - delete sysfs symlinks for bd_claim() relationship | 870 | * del_bd_holder - delete sysfs symlinks for bd_claim() relationship |
861 | * | 871 | * |
862 | * @bdev: block device to be bd_claimed | 872 | * @bdev: block device to be bd_claimed |
863 | * @kobj: holder's kobject | 873 | * @kobj: holder's kobject |
864 | * | 874 | * |
865 | * If there is matching entry with @kobj in @bdev->bd_holder_list | 875 | * If there is matching entry with @kobj in @bdev->bd_holder_list |
866 | * and no other bd_claim() from the same kobject, | 876 | * and no other bd_claim() from the same kobject, |
867 | * remove the struct bd_holder from the list, delete symlinks for it. | 877 | * remove the struct bd_holder from the list, delete symlinks for it. |
868 | * | 878 | * |
869 | * Returns a pointer to the struct bd_holder when it's removed from the list | 879 | * Returns a pointer to the struct bd_holder when it's removed from the list |
870 | * and ready to be freed. | 880 | * and ready to be freed. |
871 | * Returns NULL if matching claim isn't found or there is other bd_claim() | 881 | * Returns NULL if matching claim isn't found or there is other bd_claim() |
872 | * by the same kobject. | 882 | * by the same kobject. |
873 | */ | 883 | */ |
874 | static struct bd_holder *del_bd_holder(struct block_device *bdev, | 884 | static struct bd_holder *del_bd_holder(struct block_device *bdev, |
875 | struct kobject *kobj) | 885 | struct kobject *kobj) |
876 | { | 886 | { |
877 | struct bd_holder *bo; | 887 | struct bd_holder *bo; |
878 | 888 | ||
879 | list_for_each_entry(bo, &bdev->bd_holder_list, list) { | 889 | list_for_each_entry(bo, &bdev->bd_holder_list, list) { |
880 | if (bo->sdir == kobj) { | 890 | if (bo->sdir == kobj) { |
881 | bo->count--; | 891 | bo->count--; |
882 | BUG_ON(bo->count < 0); | 892 | BUG_ON(bo->count < 0); |
883 | if (!bo->count) { | 893 | if (!bo->count) { |
884 | list_del(&bo->list); | 894 | list_del(&bo->list); |
885 | del_symlink(bo->sdir, bo->sdev); | 895 | del_symlink(bo->sdir, bo->sdev); |
886 | del_symlink(bo->hdir, bo->hdev); | 896 | del_symlink(bo->hdir, bo->hdev); |
887 | bd_holder_release_dirs(bo); | 897 | bd_holder_release_dirs(bo); |
888 | return bo; | 898 | return bo; |
889 | } | 899 | } |
890 | break; | 900 | break; |
891 | } | 901 | } |
892 | } | 902 | } |
893 | 903 | ||
894 | return NULL; | 904 | return NULL; |
895 | } | 905 | } |
896 | 906 | ||
897 | /** | 907 | /** |
898 | * bd_claim_by_kobject - bd_claim() with additional kobject signature | 908 | * bd_claim_by_kobject - bd_claim() with additional kobject signature |
899 | * | 909 | * |
900 | * @bdev: block device to be claimed | 910 | * @bdev: block device to be claimed |
901 | * @holder: holder's signature | 911 | * @holder: holder's signature |
902 | * @kobj: holder's kobject | 912 | * @kobj: holder's kobject |
903 | * | 913 | * |
904 | * Do bd_claim() and if it succeeds, create sysfs symlinks between | 914 | * Do bd_claim() and if it succeeds, create sysfs symlinks between |
905 | * the bdev and the holder's kobject. | 915 | * the bdev and the holder's kobject. |
906 | * Use bd_release_from_kobject() when releasing the claimed bdev. | 916 | * Use bd_release_from_kobject() when releasing the claimed bdev. |
907 | * | 917 | * |
908 | * Returns 0 on success. (same as bd_claim()) | 918 | * Returns 0 on success. (same as bd_claim()) |
909 | * Returns errno on failure. | 919 | * Returns errno on failure. |
910 | */ | 920 | */ |
911 | static int bd_claim_by_kobject(struct block_device *bdev, void *holder, | 921 | static int bd_claim_by_kobject(struct block_device *bdev, void *holder, |
912 | struct kobject *kobj) | 922 | struct kobject *kobj) |
913 | { | 923 | { |
914 | int err; | 924 | int err; |
915 | struct bd_holder *bo, *found; | 925 | struct bd_holder *bo, *found; |
916 | 926 | ||
917 | if (!kobj) | 927 | if (!kobj) |
918 | return -EINVAL; | 928 | return -EINVAL; |
919 | 929 | ||
920 | bo = alloc_bd_holder(kobj); | 930 | bo = alloc_bd_holder(kobj); |
921 | if (!bo) | 931 | if (!bo) |
922 | return -ENOMEM; | 932 | return -ENOMEM; |
923 | 933 | ||
924 | mutex_lock(&bdev->bd_mutex); | 934 | mutex_lock(&bdev->bd_mutex); |
925 | 935 | ||
926 | err = bd_claim(bdev, holder); | 936 | err = bd_claim(bdev, holder); |
927 | if (err) | 937 | if (err) |
928 | goto fail; | 938 | goto fail; |
929 | 939 | ||
930 | found = find_bd_holder(bdev, bo); | 940 | found = find_bd_holder(bdev, bo); |
931 | if (found) | 941 | if (found) |
932 | goto fail; | 942 | goto fail; |
933 | 943 | ||
934 | err = add_bd_holder(bdev, bo); | 944 | err = add_bd_holder(bdev, bo); |
935 | if (err) | 945 | if (err) |
936 | bd_release(bdev); | 946 | bd_release(bdev); |
937 | else | 947 | else |
938 | bo = NULL; | 948 | bo = NULL; |
939 | fail: | 949 | fail: |
940 | mutex_unlock(&bdev->bd_mutex); | 950 | mutex_unlock(&bdev->bd_mutex); |
941 | free_bd_holder(bo); | 951 | free_bd_holder(bo); |
942 | return err; | 952 | return err; |
943 | } | 953 | } |
944 | 954 | ||
945 | /** | 955 | /** |
946 | * bd_release_from_kobject - bd_release() with additional kobject signature | 956 | * bd_release_from_kobject - bd_release() with additional kobject signature |
947 | * | 957 | * |
948 | * @bdev: block device to be released | 958 | * @bdev: block device to be released |
949 | * @kobj: holder's kobject | 959 | * @kobj: holder's kobject |
950 | * | 960 | * |
951 | * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). | 961 | * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). |
952 | */ | 962 | */ |
953 | static void bd_release_from_kobject(struct block_device *bdev, | 963 | static void bd_release_from_kobject(struct block_device *bdev, |
954 | struct kobject *kobj) | 964 | struct kobject *kobj) |
955 | { | 965 | { |
956 | if (!kobj) | 966 | if (!kobj) |
957 | return; | 967 | return; |
958 | 968 | ||
959 | mutex_lock(&bdev->bd_mutex); | 969 | mutex_lock(&bdev->bd_mutex); |
960 | bd_release(bdev); | 970 | bd_release(bdev); |
961 | free_bd_holder(del_bd_holder(bdev, kobj)); | 971 | free_bd_holder(del_bd_holder(bdev, kobj)); |
962 | mutex_unlock(&bdev->bd_mutex); | 972 | mutex_unlock(&bdev->bd_mutex); |
963 | } | 973 | } |
964 | 974 | ||
965 | /** | 975 | /** |
966 | * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() | 976 | * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() |
967 | * | 977 | * |
968 | * @bdev: block device to be claimed | 978 | * @bdev: block device to be claimed |
969 | * @holder: holder's signature | 979 | * @holder: holder's signature |
970 | * @disk: holder's gendisk | 980 | * @disk: holder's gendisk |
971 | * | 981 | * |
972 | * Call bd_claim_by_kobject() with a reference to @disk->slave_dir. | 982 | * Call bd_claim_by_kobject() with a reference to @disk->slave_dir. |
973 | */ | 983 | */ |
974 | int bd_claim_by_disk(struct block_device *bdev, void *holder, | 984 | int bd_claim_by_disk(struct block_device *bdev, void *holder, |
975 | struct gendisk *disk) | 985 | struct gendisk *disk) |
976 | { | 986 | { |
977 | return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); | 987 | return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); |
978 | } | 988 | } |
979 | EXPORT_SYMBOL_GPL(bd_claim_by_disk); | 989 | EXPORT_SYMBOL_GPL(bd_claim_by_disk); |
980 | 990 | ||
981 | /** | 991 | /** |
982 | * bd_release_from_disk - wrapper function for bd_release_from_kobject() | 992 | * bd_release_from_disk - wrapper function for bd_release_from_kobject() |
983 | * | 993 | * |
984 | * @bdev: block device to be released | 994 | * @bdev: block device to be released |
985 | * @disk: holder's gendisk | 995 | * @disk: holder's gendisk |
986 | * | 996 | * |
987 | * Call bd_release_from_kobject() and put @disk->slave_dir. | 997 | * Call bd_release_from_kobject() and put @disk->slave_dir. |
988 | */ | 998 | */ |
989 | void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) | 999 | void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) |
990 | { | 1000 | { |
991 | bd_release_from_kobject(bdev, disk->slave_dir); | 1001 | bd_release_from_kobject(bdev, disk->slave_dir); |
992 | kobject_put(disk->slave_dir); | 1002 | kobject_put(disk->slave_dir); |
993 | } | 1003 | } |
994 | EXPORT_SYMBOL_GPL(bd_release_from_disk); | 1004 | EXPORT_SYMBOL_GPL(bd_release_from_disk); |
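
To make the claim/release pairing concrete, here is a minimal sketch of how a stacking driver (md-style) might hold a component device and publish the holders/slaves symlinks; the mydrv_* names and the choice of the master gendisk as holder cookie are illustrative, not part of this commit:

	static int mydrv_add_component(struct gendisk *master,
				       struct block_device *bdev)
	{
		/* claims bdev and creates the sysfs holders/slaves links */
		return bd_claim_by_disk(bdev, master, master);
	}

	static void mydrv_del_component(struct gendisk *master,
					struct block_device *bdev)
	{
		/* drops the claim and removes the symlinks again */
		bd_release_from_disk(bdev, master);
	}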
995 | #endif | 1005 | #endif |
996 | 1006 | ||
997 | /* | 1007 | /* |
998 | * Tries to open block device by device number. Use it ONLY if you | 1008 | * Tries to open block device by device number. Use it ONLY if you |
999 | * really do not have anything better - i.e. when you are behind a | 1009 | * really do not have anything better - i.e. when you are behind a |
1000 | * truly sucky interface and all you are given is a device number. _Never_ | 1010 | * truly sucky interface and all you are given is a device number. _Never_ |
1001 | * to be used for internal purposes. If you ever need it - reconsider | 1011 | * to be used for internal purposes. If you ever need it - reconsider |
1002 | * your API. | 1012 | * your API. |
1003 | */ | 1013 | */ |
1004 | struct block_device *open_by_devnum(dev_t dev, fmode_t mode) | 1014 | struct block_device *open_by_devnum(dev_t dev, fmode_t mode) |
1005 | { | 1015 | { |
1006 | struct block_device *bdev = bdget(dev); | 1016 | struct block_device *bdev = bdget(dev); |
1007 | int err = -ENOMEM; | 1017 | int err = -ENOMEM; |
1008 | if (bdev) | 1018 | if (bdev) |
1009 | err = blkdev_get(bdev, mode); | 1019 | err = blkdev_get(bdev, mode); |
1010 | return err ? ERR_PTR(err) : bdev; | 1020 | return err ? ERR_PTR(err) : bdev; |
1011 | } | 1021 | } |
1012 | 1022 | ||
1013 | EXPORT_SYMBOL(open_by_devnum); | 1023 | EXPORT_SYMBOL(open_by_devnum); |
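
A hedged usage sketch (the device number is illustrative): open by number, check the ERR_PTR result, and balance with blkdev_put():

	dev_t devt = MKDEV(8, 0);		/* illustrative device number */
	struct block_device *bdev;

	bdev = open_by_devnum(devt, FMODE_READ);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... read from the device ... */
	blkdev_put(bdev, FMODE_READ);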
1014 | 1024 | ||
1015 | /** | 1025 | /** |
1016 | * flush_disk - invalidates all buffer-cache entries on a disk | 1026 | * flush_disk - invalidates all buffer-cache entries on a disk |
1017 | * | 1027 | * |
1018 | * @bdev: struct block device to be flushed | 1028 | * @bdev: struct block device to be flushed |
1019 | * | 1029 | * |
1020 | * Invalidates all buffer-cache entries on a disk. It should be called | 1030 | * Invalidates all buffer-cache entries on a disk. It should be called |
1021 | * when a disk has been changed -- either by a media change or online | 1031 | * when a disk has been changed -- either by a media change or online |
1022 | * resize. | 1032 | * resize. |
1023 | */ | 1033 | */ |
1024 | static void flush_disk(struct block_device *bdev) | 1034 | static void flush_disk(struct block_device *bdev) |
1025 | { | 1035 | { |
1026 | if (__invalidate_device(bdev)) { | 1036 | if (__invalidate_device(bdev)) { |
1027 | char name[BDEVNAME_SIZE] = ""; | 1037 | char name[BDEVNAME_SIZE] = ""; |
1028 | 1038 | ||
1029 | if (bdev->bd_disk) | 1039 | if (bdev->bd_disk) |
1030 | disk_name(bdev->bd_disk, 0, name); | 1040 | disk_name(bdev->bd_disk, 0, name); |
1031 | printk(KERN_WARNING "VFS: busy inodes on changed media or " | 1041 | printk(KERN_WARNING "VFS: busy inodes on changed media or " |
1032 | "resized disk %s\n", name); | 1042 | "resized disk %s\n", name); |
1033 | } | 1043 | } |
1034 | 1044 | ||
1035 | if (!bdev->bd_disk) | 1045 | if (!bdev->bd_disk) |
1036 | return; | 1046 | return; |
1037 | if (disk_partitionable(bdev->bd_disk)) | 1047 | if (disk_partitionable(bdev->bd_disk)) |
1038 | bdev->bd_invalidated = 1; | 1048 | bdev->bd_invalidated = 1; |
1039 | } | 1049 | } |
1040 | 1050 | ||
1041 | /** | 1051 | /** |
1042 | * check_disk_size_change - checks for disk size change and adjusts bdev size. | 1052 | * check_disk_size_change - checks for disk size change and adjusts bdev size. |
1043 | * @disk: struct gendisk to check | 1053 | * @disk: struct gendisk to check |
1044 | * @bdev: struct bdev to adjust. | 1054 | * @bdev: struct bdev to adjust. |
1045 | * | 1055 | * |
1046 | * This routine checks whether the bdev size matches the disk size | 1056 | * This routine checks whether the bdev size matches the disk size |
1047 | * and adjusts the bdev size if it differs. | 1057 | * and adjusts the bdev size if it differs. |
1048 | */ | 1058 | */ |
1049 | void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) | 1059 | void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) |
1050 | { | 1060 | { |
1051 | loff_t disk_size, bdev_size; | 1061 | loff_t disk_size, bdev_size; |
1052 | 1062 | ||
1053 | disk_size = (loff_t)get_capacity(disk) << 9; | 1063 | disk_size = (loff_t)get_capacity(disk) << 9; |
1054 | bdev_size = i_size_read(bdev->bd_inode); | 1064 | bdev_size = i_size_read(bdev->bd_inode); |
1055 | if (disk_size != bdev_size) { | 1065 | if (disk_size != bdev_size) { |
1056 | char name[BDEVNAME_SIZE]; | 1066 | char name[BDEVNAME_SIZE]; |
1057 | 1067 | ||
1058 | disk_name(disk, 0, name); | 1068 | disk_name(disk, 0, name); |
1059 | printk(KERN_INFO | 1069 | printk(KERN_INFO |
1060 | "%s: detected capacity change from %lld to %lld\n", | 1070 | "%s: detected capacity change from %lld to %lld\n", |
1061 | name, bdev_size, disk_size); | 1071 | name, bdev_size, disk_size); |
1062 | i_size_write(bdev->bd_inode, disk_size); | 1072 | i_size_write(bdev->bd_inode, disk_size); |
1063 | flush_disk(bdev); | 1073 | flush_disk(bdev); |
1064 | } | 1074 | } |
1065 | } | 1075 | } |
1066 | EXPORT_SYMBOL(check_disk_size_change); | 1076 | EXPORT_SYMBOL(check_disk_size_change); |
1067 | 1077 | ||
1068 | /** | 1078 | /** |
1069 | * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back | 1079 | * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back |
1070 | * @disk: struct gendisk to be revalidated | 1080 | * @disk: struct gendisk to be revalidated |
1071 | * | 1081 | * |
1072 | * This routine is a wrapper for lower-level driver's revalidate_disk | 1082 | * This routine is a wrapper for lower-level driver's revalidate_disk |
1073 | * call-backs. It is used to do common pre and post operations needed | 1083 | * call-backs. It is used to do common pre and post operations needed |
1074 | * for all revalidate_disk operations. | 1084 | * for all revalidate_disk operations. |
1075 | */ | 1085 | */ |
1076 | int revalidate_disk(struct gendisk *disk) | 1086 | int revalidate_disk(struct gendisk *disk) |
1077 | { | 1087 | { |
1078 | struct block_device *bdev; | 1088 | struct block_device *bdev; |
1079 | int ret = 0; | 1089 | int ret = 0; |
1080 | 1090 | ||
1081 | if (disk->fops->revalidate_disk) | 1091 | if (disk->fops->revalidate_disk) |
1082 | ret = disk->fops->revalidate_disk(disk); | 1092 | ret = disk->fops->revalidate_disk(disk); |
1083 | 1093 | ||
1084 | bdev = bdget_disk(disk, 0); | 1094 | bdev = bdget_disk(disk, 0); |
1085 | if (!bdev) | 1095 | if (!bdev) |
1086 | return ret; | 1096 | return ret; |
1087 | 1097 | ||
1088 | mutex_lock(&bdev->bd_mutex); | 1098 | mutex_lock(&bdev->bd_mutex); |
1089 | check_disk_size_change(disk, bdev); | 1099 | check_disk_size_change(disk, bdev); |
1090 | mutex_unlock(&bdev->bd_mutex); | 1100 | mutex_unlock(&bdev->bd_mutex); |
1091 | bdput(bdev); | 1101 | bdput(bdev); |
1092 | return ret; | 1102 | return ret; |
1093 | } | 1103 | } |
1094 | EXPORT_SYMBOL(revalidate_disk); | 1104 | EXPORT_SYMBOL(revalidate_disk); |
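
For illustration (mydrv_resize is hypothetical): a driver that learns of a new capacity out of band would update the gendisk and let this wrapper resync the bdev inode size:

	static void mydrv_resize(struct gendisk *disk, sector_t new_sectors)
	{
		set_capacity(disk, new_sectors);	/* update gendisk capacity */
		revalidate_disk(disk);			/* runs check_disk_size_change() */
	}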
1095 | 1105 | ||
1096 | /* | 1106 | /* |
1097 | * This routine checks whether a removable media has been changed, | 1107 | * This routine checks whether a removable media has been changed, |
1098 | * and invalidates all buffer-cache entries in that case. This | 1108 | * and invalidates all buffer-cache entries in that case. This |
1099 | * is a relatively slow routine, so we have to try to minimize using | 1109 | * is a relatively slow routine, so we have to try to minimize using |
1100 | * it. Thus it is called only upon a 'mount' or 'open'. This | 1110 | * it. Thus it is called only upon a 'mount' or 'open'. This |
1101 | * is the best way of combining speed and utility, I think. | 1111 | * is the best way of combining speed and utility, I think. |
1102 | * People changing diskettes in the middle of an operation deserve | 1112 | * People changing diskettes in the middle of an operation deserve |
1103 | * to lose :-) | 1113 | * to lose :-) |
1104 | */ | 1114 | */ |
1105 | int check_disk_change(struct block_device *bdev) | 1115 | int check_disk_change(struct block_device *bdev) |
1106 | { | 1116 | { |
1107 | struct gendisk *disk = bdev->bd_disk; | 1117 | struct gendisk *disk = bdev->bd_disk; |
1108 | struct block_device_operations * bdops = disk->fops; | 1118 | struct block_device_operations * bdops = disk->fops; |
1109 | 1119 | ||
1110 | if (!bdops->media_changed) | 1120 | if (!bdops->media_changed) |
1111 | return 0; | 1121 | return 0; |
1112 | if (!bdops->media_changed(bdev->bd_disk)) | 1122 | if (!bdops->media_changed(bdev->bd_disk)) |
1113 | return 0; | 1123 | return 0; |
1114 | 1124 | ||
1115 | flush_disk(bdev); | 1125 | flush_disk(bdev); |
1116 | if (bdops->revalidate_disk) | 1126 | if (bdops->revalidate_disk) |
1117 | bdops->revalidate_disk(bdev->bd_disk); | 1127 | bdops->revalidate_disk(bdev->bd_disk); |
1118 | return 1; | 1128 | return 1; |
1119 | } | 1129 | } |
1120 | 1130 | ||
1121 | EXPORT_SYMBOL(check_disk_change); | 1131 | EXPORT_SYMBOL(check_disk_change); |
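
A removable-media driver typically calls this from its open method; a minimal sketch with a hypothetical mydrv_open():

	static int mydrv_open(struct block_device *bdev, fmode_t mode)
	{
		/* If the medium changed, flush_disk() and the driver's
		 * revalidate_disk callback already ran inside. */
		check_disk_change(bdev);
		return 0;
	}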
1122 | 1132 | ||
1123 | void bd_set_size(struct block_device *bdev, loff_t size) | 1133 | void bd_set_size(struct block_device *bdev, loff_t size) |
1124 | { | 1134 | { |
1125 | unsigned bsize = bdev_logical_block_size(bdev); | 1135 | unsigned bsize = bdev_logical_block_size(bdev); |
1126 | 1136 | ||
1127 | bdev->bd_inode->i_size = size; | 1137 | bdev->bd_inode->i_size = size; |
1128 | while (bsize < PAGE_CACHE_SIZE) { | 1138 | while (bsize < PAGE_CACHE_SIZE) { |
1129 | if (size & bsize) | 1139 | if (size & bsize) |
1130 | break; | 1140 | break; |
1131 | bsize <<= 1; | 1141 | bsize <<= 1; |
1132 | } | 1142 | } |
1133 | bdev->bd_block_size = bsize; | 1143 | bdev->bd_block_size = bsize; |
1134 | bdev->bd_inode->i_blkbits = blksize_bits(bsize); | 1144 | bdev->bd_inode->i_blkbits = blksize_bits(bsize); |
1135 | } | 1145 | } |
1136 | EXPORT_SYMBOL(bd_set_size); | 1146 | EXPORT_SYMBOL(bd_set_size); |
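
The loop selects the largest power-of-two block size, capped at PAGE_CACHE_SIZE, that still divides the device size; a worked example assuming PAGE_CACHE_SIZE is 4096:

	/* size = 1 MiB, bsize starts at 512:
	 *   512 -> 1024 -> 2048 -> 4096; the loop stops at PAGE_CACHE_SIZE,
	 *   so bd_block_size = 4096 and i_blkbits = 12.
	 * size = 1 MiB + 512: (size & 512) is non-zero on the first pass,
	 *   so bd_block_size stays 512 and i_blkbits = 9. */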
1137 | 1147 | ||
1138 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); | 1148 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); |
1139 | 1149 | ||
1140 | /* | 1150 | /* |
1141 | * bd_mutex locking: | 1151 | * bd_mutex locking: |
1142 | * | 1152 | * |
1143 | * mutex_lock(part->bd_mutex) | 1153 | * mutex_lock(part->bd_mutex) |
1144 | * mutex_lock_nested(whole->bd_mutex, 1) | 1154 | * mutex_lock_nested(whole->bd_mutex, 1) |
1145 | */ | 1155 | */ |
1146 | 1156 | ||
1147 | static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) | 1157 | static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) |
1148 | { | 1158 | { |
1149 | struct gendisk *disk; | 1159 | struct gendisk *disk; |
1150 | int ret; | 1160 | int ret; |
1151 | int partno; | 1161 | int partno; |
1152 | int perm = 0; | 1162 | int perm = 0; |
1153 | 1163 | ||
1154 | if (mode & FMODE_READ) | 1164 | if (mode & FMODE_READ) |
1155 | perm |= MAY_READ; | 1165 | perm |= MAY_READ; |
1156 | if (mode & FMODE_WRITE) | 1166 | if (mode & FMODE_WRITE) |
1157 | perm |= MAY_WRITE; | 1167 | perm |= MAY_WRITE; |
1158 | /* | 1168 | /* |
1159 | * hooks: /n/, see "layering violations". | 1169 | * hooks: /n/, see "layering violations". |
1160 | */ | 1170 | */ |
1161 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); | 1171 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); |
1162 | if (ret != 0) { | 1172 | if (ret != 0) { |
1163 | bdput(bdev); | 1173 | bdput(bdev); |
1164 | return ret; | 1174 | return ret; |
1165 | } | 1175 | } |
1166 | 1176 | ||
1167 | lock_kernel(); | 1177 | lock_kernel(); |
1168 | restart: | 1178 | restart: |
1169 | 1179 | ||
1170 | ret = -ENXIO; | 1180 | ret = -ENXIO; |
1171 | disk = get_gendisk(bdev->bd_dev, &partno); | 1181 | disk = get_gendisk(bdev->bd_dev, &partno); |
1172 | if (!disk) | 1182 | if (!disk) |
1173 | goto out_unlock_kernel; | 1183 | goto out_unlock_kernel; |
1174 | 1184 | ||
1175 | mutex_lock_nested(&bdev->bd_mutex, for_part); | 1185 | mutex_lock_nested(&bdev->bd_mutex, for_part); |
1176 | if (!bdev->bd_openers) { | 1186 | if (!bdev->bd_openers) { |
1177 | bdev->bd_disk = disk; | 1187 | bdev->bd_disk = disk; |
1178 | bdev->bd_contains = bdev; | 1188 | bdev->bd_contains = bdev; |
1179 | if (!partno) { | 1189 | if (!partno) { |
1180 | struct backing_dev_info *bdi; | 1190 | struct backing_dev_info *bdi; |
1181 | 1191 | ||
1182 | ret = -ENXIO; | 1192 | ret = -ENXIO; |
1183 | bdev->bd_part = disk_get_part(disk, partno); | 1193 | bdev->bd_part = disk_get_part(disk, partno); |
1184 | if (!bdev->bd_part) | 1194 | if (!bdev->bd_part) |
1185 | goto out_clear; | 1195 | goto out_clear; |
1186 | 1196 | ||
1187 | if (disk->fops->open) { | 1197 | if (disk->fops->open) { |
1188 | ret = disk->fops->open(bdev, mode); | 1198 | ret = disk->fops->open(bdev, mode); |
1189 | if (ret == -ERESTARTSYS) { | 1199 | if (ret == -ERESTARTSYS) { |
1190 | /* Lost a race with 'disk' being | 1200 | /* Lost a race with 'disk' being |
1191 | * deleted, try again. | 1201 | * deleted, try again. |
1192 | * See md.c | 1202 | * See md.c |
1193 | */ | 1203 | */ |
1194 | disk_put_part(bdev->bd_part); | 1204 | disk_put_part(bdev->bd_part); |
1195 | bdev->bd_part = NULL; | 1205 | bdev->bd_part = NULL; |
1196 | module_put(disk->fops->owner); | 1206 | module_put(disk->fops->owner); |
1197 | put_disk(disk); | 1207 | put_disk(disk); |
1198 | bdev->bd_disk = NULL; | 1208 | bdev->bd_disk = NULL; |
1199 | mutex_unlock(&bdev->bd_mutex); | 1209 | mutex_unlock(&bdev->bd_mutex); |
1200 | goto restart; | 1210 | goto restart; |
1201 | } | 1211 | } |
1202 | if (ret) | 1212 | if (ret) |
1203 | goto out_clear; | 1213 | goto out_clear; |
1204 | } | 1214 | } |
1205 | if (!bdev->bd_openers) { | 1215 | if (!bdev->bd_openers) { |
1206 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); | 1216 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); |
1207 | bdi = blk_get_backing_dev_info(bdev); | 1217 | bdi = blk_get_backing_dev_info(bdev); |
1208 | if (bdi == NULL) | 1218 | if (bdi == NULL) |
1209 | bdi = &default_backing_dev_info; | 1219 | bdi = &default_backing_dev_info; |
1210 | bdev->bd_inode->i_data.backing_dev_info = bdi; | 1220 | bdev->bd_inode->i_data.backing_dev_info = bdi; |
1211 | } | 1221 | } |
1212 | if (bdev->bd_invalidated) | 1222 | if (bdev->bd_invalidated) |
1213 | rescan_partitions(disk, bdev); | 1223 | rescan_partitions(disk, bdev); |
1214 | } else { | 1224 | } else { |
1215 | struct block_device *whole; | 1225 | struct block_device *whole; |
1216 | whole = bdget_disk(disk, 0); | 1226 | whole = bdget_disk(disk, 0); |
1217 | ret = -ENOMEM; | 1227 | ret = -ENOMEM; |
1218 | if (!whole) | 1228 | if (!whole) |
1219 | goto out_clear; | 1229 | goto out_clear; |
1220 | BUG_ON(for_part); | 1230 | BUG_ON(for_part); |
1221 | ret = __blkdev_get(whole, mode, 1); | 1231 | ret = __blkdev_get(whole, mode, 1); |
1222 | if (ret) | 1232 | if (ret) |
1223 | goto out_clear; | 1233 | goto out_clear; |
1224 | bdev->bd_contains = whole; | 1234 | bdev->bd_contains = whole; |
1225 | bdev->bd_inode->i_data.backing_dev_info = | 1235 | bdev->bd_inode->i_data.backing_dev_info = |
1226 | whole->bd_inode->i_data.backing_dev_info; | 1236 | whole->bd_inode->i_data.backing_dev_info; |
1227 | bdev->bd_part = disk_get_part(disk, partno); | 1237 | bdev->bd_part = disk_get_part(disk, partno); |
1228 | if (!(disk->flags & GENHD_FL_UP) || | 1238 | if (!(disk->flags & GENHD_FL_UP) || |
1229 | !bdev->bd_part || !bdev->bd_part->nr_sects) { | 1239 | !bdev->bd_part || !bdev->bd_part->nr_sects) { |
1230 | ret = -ENXIO; | 1240 | ret = -ENXIO; |
1231 | goto out_clear; | 1241 | goto out_clear; |
1232 | } | 1242 | } |
1233 | bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); | 1243 | bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); |
1234 | } | 1244 | } |
1235 | } else { | 1245 | } else { |
1236 | put_disk(disk); | 1246 | put_disk(disk); |
1237 | module_put(disk->fops->owner); | 1247 | module_put(disk->fops->owner); |
1238 | disk = NULL; | 1248 | disk = NULL; |
1239 | if (bdev->bd_contains == bdev) { | 1249 | if (bdev->bd_contains == bdev) { |
1240 | if (bdev->bd_disk->fops->open) { | 1250 | if (bdev->bd_disk->fops->open) { |
1241 | ret = bdev->bd_disk->fops->open(bdev, mode); | 1251 | ret = bdev->bd_disk->fops->open(bdev, mode); |
1242 | if (ret) | 1252 | if (ret) |
1243 | goto out_unlock_bdev; | 1253 | goto out_unlock_bdev; |
1244 | } | 1254 | } |
1245 | if (bdev->bd_invalidated) | 1255 | if (bdev->bd_invalidated) |
1246 | rescan_partitions(bdev->bd_disk, bdev); | 1256 | rescan_partitions(bdev->bd_disk, bdev); |
1247 | } | 1257 | } |
1248 | } | 1258 | } |
1249 | bdev->bd_openers++; | 1259 | bdev->bd_openers++; |
1250 | if (for_part) | 1260 | if (for_part) |
1251 | bdev->bd_part_count++; | 1261 | bdev->bd_part_count++; |
1252 | mutex_unlock(&bdev->bd_mutex); | 1262 | mutex_unlock(&bdev->bd_mutex); |
1253 | unlock_kernel(); | 1263 | unlock_kernel(); |
1254 | return 0; | 1264 | return 0; |
1255 | 1265 | ||
1256 | out_clear: | 1266 | out_clear: |
1257 | disk_put_part(bdev->bd_part); | 1267 | disk_put_part(bdev->bd_part); |
1258 | bdev->bd_disk = NULL; | 1268 | bdev->bd_disk = NULL; |
1259 | bdev->bd_part = NULL; | 1269 | bdev->bd_part = NULL; |
1260 | bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; | 1270 | bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; |
1261 | if (bdev != bdev->bd_contains) | 1271 | if (bdev != bdev->bd_contains) |
1262 | __blkdev_put(bdev->bd_contains, mode, 1); | 1272 | __blkdev_put(bdev->bd_contains, mode, 1); |
1263 | bdev->bd_contains = NULL; | 1273 | bdev->bd_contains = NULL; |
1264 | out_unlock_bdev: | 1274 | out_unlock_bdev: |
1265 | mutex_unlock(&bdev->bd_mutex); | 1275 | mutex_unlock(&bdev->bd_mutex); |
1266 | out_unlock_kernel: | 1276 | out_unlock_kernel: |
1267 | unlock_kernel(); | 1277 | unlock_kernel(); |
1268 | 1278 | ||
1269 | if (disk) | 1279 | if (disk) |
1270 | module_put(disk->fops->owner); | 1280 | module_put(disk->fops->owner); |
1271 | put_disk(disk); | 1281 | put_disk(disk); |
1272 | bdput(bdev); | 1282 | bdput(bdev); |
1273 | 1283 | ||
1274 | return ret; | 1284 | return ret; |
1275 | } | 1285 | } |
1276 | 1286 | ||
1277 | int blkdev_get(struct block_device *bdev, fmode_t mode) | 1287 | int blkdev_get(struct block_device *bdev, fmode_t mode) |
1278 | { | 1288 | { |
1279 | return __blkdev_get(bdev, mode, 0); | 1289 | return __blkdev_get(bdev, mode, 0); |
1280 | } | 1290 | } |
1281 | EXPORT_SYMBOL(blkdev_get); | 1291 | EXPORT_SYMBOL(blkdev_get); |
1282 | 1292 | ||
1283 | static int blkdev_open(struct inode * inode, struct file * filp) | 1293 | static int blkdev_open(struct inode * inode, struct file * filp) |
1284 | { | 1294 | { |
1285 | struct block_device *bdev; | 1295 | struct block_device *bdev; |
1286 | int res; | 1296 | int res; |
1287 | 1297 | ||
1288 | /* | 1298 | /* |
1289 | * Preserve backwards compatibility and allow large file access | 1299 | * Preserve backwards compatibility and allow large file access |
1290 | * even if userspace doesn't ask for it explicitly. Some mkfs | 1300 | * even if userspace doesn't ask for it explicitly. Some mkfs |
1291 | * binaries need it. We might want to drop this workaround | 1301 | * binaries need it. We might want to drop this workaround |
1292 | * during an unstable branch. | 1302 | * during an unstable branch. |
1293 | */ | 1303 | */ |
1294 | filp->f_flags |= O_LARGEFILE; | 1304 | filp->f_flags |= O_LARGEFILE; |
1295 | 1305 | ||
1296 | if (filp->f_flags & O_NDELAY) | 1306 | if (filp->f_flags & O_NDELAY) |
1297 | filp->f_mode |= FMODE_NDELAY; | 1307 | filp->f_mode |= FMODE_NDELAY; |
1298 | if (filp->f_flags & O_EXCL) | 1308 | if (filp->f_flags & O_EXCL) |
1299 | filp->f_mode |= FMODE_EXCL; | 1309 | filp->f_mode |= FMODE_EXCL; |
1300 | if ((filp->f_flags & O_ACCMODE) == 3) | 1310 | if ((filp->f_flags & O_ACCMODE) == 3) |
1301 | filp->f_mode |= FMODE_WRITE_IOCTL; | 1311 | filp->f_mode |= FMODE_WRITE_IOCTL; |
1302 | 1312 | ||
1303 | bdev = bd_acquire(inode); | 1313 | bdev = bd_acquire(inode); |
1304 | if (bdev == NULL) | 1314 | if (bdev == NULL) |
1305 | return -ENOMEM; | 1315 | return -ENOMEM; |
1306 | 1316 | ||
1307 | filp->f_mapping = bdev->bd_inode->i_mapping; | 1317 | filp->f_mapping = bdev->bd_inode->i_mapping; |
1308 | 1318 | ||
1309 | res = blkdev_get(bdev, filp->f_mode); | 1319 | res = blkdev_get(bdev, filp->f_mode); |
1310 | if (res) | 1320 | if (res) |
1311 | return res; | 1321 | return res; |
1312 | 1322 | ||
1313 | if (filp->f_mode & FMODE_EXCL) { | 1323 | if (filp->f_mode & FMODE_EXCL) { |
1314 | res = bd_claim(bdev, filp); | 1324 | res = bd_claim(bdev, filp); |
1315 | if (res) | 1325 | if (res) |
1316 | goto out_blkdev_put; | 1326 | goto out_blkdev_put; |
1317 | } | 1327 | } |
1318 | 1328 | ||
1319 | return 0; | 1329 | return 0; |
1320 | 1330 | ||
1321 | out_blkdev_put: | 1331 | out_blkdev_put: |
1322 | blkdev_put(bdev, filp->f_mode); | 1332 | blkdev_put(bdev, filp->f_mode); |
1323 | return res; | 1333 | return res; |
1324 | } | 1334 | } |
1325 | 1335 | ||
1326 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) | 1336 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) |
1327 | { | 1337 | { |
1328 | int ret = 0; | 1338 | int ret = 0; |
1329 | struct gendisk *disk = bdev->bd_disk; | 1339 | struct gendisk *disk = bdev->bd_disk; |
1330 | struct block_device *victim = NULL; | 1340 | struct block_device *victim = NULL; |
1331 | 1341 | ||
1332 | mutex_lock_nested(&bdev->bd_mutex, for_part); | 1342 | mutex_lock_nested(&bdev->bd_mutex, for_part); |
1333 | lock_kernel(); | 1343 | lock_kernel(); |
1334 | if (for_part) | 1344 | if (for_part) |
1335 | bdev->bd_part_count--; | 1345 | bdev->bd_part_count--; |
1336 | 1346 | ||
1337 | if (!--bdev->bd_openers) { | 1347 | if (!--bdev->bd_openers) { |
1338 | sync_blockdev(bdev); | 1348 | sync_blockdev(bdev); |
1339 | kill_bdev(bdev); | 1349 | kill_bdev(bdev); |
1340 | } | 1350 | } |
1341 | if (bdev->bd_contains == bdev) { | 1351 | if (bdev->bd_contains == bdev) { |
1342 | if (disk->fops->release) | 1352 | if (disk->fops->release) |
1343 | ret = disk->fops->release(disk, mode); | 1353 | ret = disk->fops->release(disk, mode); |
1344 | } | 1354 | } |
1345 | if (!bdev->bd_openers) { | 1355 | if (!bdev->bd_openers) { |
1346 | struct module *owner = disk->fops->owner; | 1356 | struct module *owner = disk->fops->owner; |
1347 | 1357 | ||
1348 | put_disk(disk); | 1358 | put_disk(disk); |
1349 | module_put(owner); | 1359 | module_put(owner); |
1350 | disk_put_part(bdev->bd_part); | 1360 | disk_put_part(bdev->bd_part); |
1351 | bdev->bd_part = NULL; | 1361 | bdev->bd_part = NULL; |
1352 | bdev->bd_disk = NULL; | 1362 | bdev->bd_disk = NULL; |
1353 | bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; | 1363 | bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; |
1354 | if (bdev != bdev->bd_contains) | 1364 | if (bdev != bdev->bd_contains) |
1355 | victim = bdev->bd_contains; | 1365 | victim = bdev->bd_contains; |
1356 | bdev->bd_contains = NULL; | 1366 | bdev->bd_contains = NULL; |
1357 | } | 1367 | } |
1358 | unlock_kernel(); | 1368 | unlock_kernel(); |
1359 | mutex_unlock(&bdev->bd_mutex); | 1369 | mutex_unlock(&bdev->bd_mutex); |
1360 | bdput(bdev); | 1370 | bdput(bdev); |
1361 | if (victim) | 1371 | if (victim) |
1362 | __blkdev_put(victim, mode, 1); | 1372 | __blkdev_put(victim, mode, 1); |
1363 | return ret; | 1373 | return ret; |
1364 | } | 1374 | } |
1365 | 1375 | ||
1366 | int blkdev_put(struct block_device *bdev, fmode_t mode) | 1376 | int blkdev_put(struct block_device *bdev, fmode_t mode) |
1367 | { | 1377 | { |
1368 | return __blkdev_put(bdev, mode, 0); | 1378 | return __blkdev_put(bdev, mode, 0); |
1369 | } | 1379 | } |
1370 | EXPORT_SYMBOL(blkdev_put); | 1380 | EXPORT_SYMBOL(blkdev_put); |
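
For reference, a sketch of the canonical pairing: each successful blkdev_get() is balanced by one blkdev_put() with the same mode, and blkdev_get() consumes the bdget() reference itself on failure (see the bdput() calls in the error paths above):

	struct block_device *bdev = bdget(devt);	/* devt assumed known */
	int err;

	if (!bdev)
		return -ENOMEM;
	err = blkdev_get(bdev, FMODE_READ);	/* drops the ref on error */
	if (err)
		return err;
	/* ... perform I/O ... */
	blkdev_put(bdev, FMODE_READ);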
1371 | 1381 | ||
1372 | static int blkdev_close(struct inode * inode, struct file * filp) | 1382 | static int blkdev_close(struct inode * inode, struct file * filp) |
1373 | { | 1383 | { |
1374 | struct block_device *bdev = I_BDEV(filp->f_mapping->host); | 1384 | struct block_device *bdev = I_BDEV(filp->f_mapping->host); |
1375 | if (bdev->bd_holder == filp) | 1385 | if (bdev->bd_holder == filp) |
1376 | bd_release(bdev); | 1386 | bd_release(bdev); |
1377 | return blkdev_put(bdev, filp->f_mode); | 1387 | return blkdev_put(bdev, filp->f_mode); |
1378 | } | 1388 | } |
1379 | 1389 | ||
1380 | static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) | 1390 | static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) |
1381 | { | 1391 | { |
1382 | struct block_device *bdev = I_BDEV(file->f_mapping->host); | 1392 | struct block_device *bdev = I_BDEV(file->f_mapping->host); |
1383 | fmode_t mode = file->f_mode; | 1393 | fmode_t mode = file->f_mode; |
1384 | 1394 | ||
1385 | /* | 1395 | /* |
1386 | * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have | 1396 | * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have |
1387 | * to update it before every ioctl. | 1397 | * to update it before every ioctl. |
1388 | */ | 1398 | */ |
1389 | if (file->f_flags & O_NDELAY) | 1399 | if (file->f_flags & O_NDELAY) |
1390 | mode |= FMODE_NDELAY; | 1400 | mode |= FMODE_NDELAY; |
1391 | else | 1401 | else |
1392 | mode &= ~FMODE_NDELAY; | 1402 | mode &= ~FMODE_NDELAY; |
1393 | 1403 | ||
1394 | return blkdev_ioctl(bdev, mode, cmd, arg); | 1404 | return blkdev_ioctl(bdev, mode, cmd, arg); |
1395 | } | 1405 | } |
1396 | 1406 | ||
1397 | /* | 1407 | /* |
1398 | * Try to release a page associated with block device when the system | 1408 | * Try to release a page associated with block device when the system |
1399 | * is under memory pressure. | 1409 | * is under memory pressure. |
1400 | */ | 1410 | */ |
1401 | static int blkdev_releasepage(struct page *page, gfp_t wait) | 1411 | static int blkdev_releasepage(struct page *page, gfp_t wait) |
1402 | { | 1412 | { |
1403 | struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; | 1413 | struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; |
1404 | 1414 | ||
1405 | if (super && super->s_op->bdev_try_to_free_page) | 1415 | if (super && super->s_op->bdev_try_to_free_page) |
1406 | return super->s_op->bdev_try_to_free_page(super, page, wait); | 1416 | return super->s_op->bdev_try_to_free_page(super, page, wait); |
1407 | 1417 | ||
1408 | return try_to_free_buffers(page); | 1418 | return try_to_free_buffers(page); |
1409 | } | 1419 | } |
1410 | 1420 | ||
1411 | static const struct address_space_operations def_blk_aops = { | 1421 | static const struct address_space_operations def_blk_aops = { |
1412 | .readpage = blkdev_readpage, | 1422 | .readpage = blkdev_readpage, |
1413 | .writepage = blkdev_writepage, | 1423 | .writepage = blkdev_writepage, |
1414 | .sync_page = block_sync_page, | 1424 | .sync_page = block_sync_page, |
1415 | .write_begin = blkdev_write_begin, | 1425 | .write_begin = blkdev_write_begin, |
1416 | .write_end = blkdev_write_end, | 1426 | .write_end = blkdev_write_end, |
1417 | .writepages = generic_writepages, | 1427 | .writepages = generic_writepages, |
1418 | .releasepage = blkdev_releasepage, | 1428 | .releasepage = blkdev_releasepage, |
1419 | .direct_IO = blkdev_direct_IO, | 1429 | .direct_IO = blkdev_direct_IO, |
1420 | }; | 1430 | }; |
1421 | 1431 | ||
1422 | const struct file_operations def_blk_fops = { | 1432 | const struct file_operations def_blk_fops = { |
1423 | .open = blkdev_open, | 1433 | .open = blkdev_open, |
1424 | .release = blkdev_close, | 1434 | .release = blkdev_close, |
1425 | .llseek = block_llseek, | 1435 | .llseek = block_llseek, |
1426 | .read = do_sync_read, | 1436 | .read = do_sync_read, |
1427 | .write = do_sync_write, | 1437 | .write = do_sync_write, |
1428 | .aio_read = generic_file_aio_read, | 1438 | .aio_read = generic_file_aio_read, |
1429 | .aio_write = generic_file_aio_write_nolock, | 1439 | .aio_write = generic_file_aio_write_nolock, |
1430 | .mmap = generic_file_mmap, | 1440 | .mmap = generic_file_mmap, |
1431 | .fsync = block_fsync, | 1441 | .fsync = block_fsync, |
1432 | .unlocked_ioctl = block_ioctl, | 1442 | .unlocked_ioctl = block_ioctl, |
1433 | #ifdef CONFIG_COMPAT | 1443 | #ifdef CONFIG_COMPAT |
1434 | .compat_ioctl = compat_blkdev_ioctl, | 1444 | .compat_ioctl = compat_blkdev_ioctl, |
1435 | #endif | 1445 | #endif |
1436 | .splice_read = generic_file_splice_read, | 1446 | .splice_read = generic_file_splice_read, |
1437 | .splice_write = generic_file_splice_write, | 1447 | .splice_write = generic_file_splice_write, |
1438 | }; | 1448 | }; |
1439 | 1449 | ||
1440 | int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) | 1450 | int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) |
1441 | { | 1451 | { |
1442 | int res; | 1452 | int res; |
1443 | mm_segment_t old_fs = get_fs(); | 1453 | mm_segment_t old_fs = get_fs(); |
1444 | set_fs(KERNEL_DS); | 1454 | set_fs(KERNEL_DS); |
1445 | res = blkdev_ioctl(bdev, 0, cmd, arg); | 1455 | res = blkdev_ioctl(bdev, 0, cmd, arg); |
1446 | set_fs(old_fs); | 1456 | set_fs(old_fs); |
1447 | return res; | 1457 | return res; |
1448 | } | 1458 | } |
1449 | 1459 | ||
1450 | EXPORT_SYMBOL(ioctl_by_bdev); | 1460 | EXPORT_SYMBOL(ioctl_by_bdev); |
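
Because the call runs under KERNEL_DS, a kernel buffer may be passed where the ioctl normally expects a user pointer; a sketch using HDIO_GETGEO as an illustrative command:

	struct hd_geometry geo;		/* from <linux/hdreg.h> */
	int err;

	err = ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)&geo);
	if (!err)
		printk(KERN_DEBUG "partition starts at sector %lu\n", geo.start);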
1451 | 1461 | ||
1452 | /** | 1462 | /** |
1453 | * lookup_bdev - lookup a struct block_device by name | 1463 | * lookup_bdev - lookup a struct block_device by name |
1454 | * @pathname: special file representing the block device | 1464 | * @pathname: special file representing the block device |
1455 | * | 1465 | * |
1456 | * Get a reference to the blockdevice at @pathname in the current | 1466 | * Get a reference to the blockdevice at @pathname in the current |
1457 | * namespace if possible and return it. Return ERR_PTR(error) | 1467 | * namespace if possible and return it. Return ERR_PTR(error) |
1458 | * otherwise. | 1468 | * otherwise. |
1459 | */ | 1469 | */ |
1460 | struct block_device *lookup_bdev(const char *pathname) | 1470 | struct block_device *lookup_bdev(const char *pathname) |
1461 | { | 1471 | { |
1462 | struct block_device *bdev; | 1472 | struct block_device *bdev; |
1463 | struct inode *inode; | 1473 | struct inode *inode; |
1464 | struct path path; | 1474 | struct path path; |
1465 | int error; | 1475 | int error; |
1466 | 1476 | ||
1467 | if (!pathname || !*pathname) | 1477 | if (!pathname || !*pathname) |
1468 | return ERR_PTR(-EINVAL); | 1478 | return ERR_PTR(-EINVAL); |
1469 | 1479 | ||
1470 | error = kern_path(pathname, LOOKUP_FOLLOW, &path); | 1480 | error = kern_path(pathname, LOOKUP_FOLLOW, &path); |
1471 | if (error) | 1481 | if (error) |
1472 | return ERR_PTR(error); | 1482 | return ERR_PTR(error); |
1473 | 1483 | ||
1474 | inode = path.dentry->d_inode; | 1484 | inode = path.dentry->d_inode; |
1475 | error = -ENOTBLK; | 1485 | error = -ENOTBLK; |
1476 | if (!S_ISBLK(inode->i_mode)) | 1486 | if (!S_ISBLK(inode->i_mode)) |
1477 | goto fail; | 1487 | goto fail; |
1478 | error = -EACCES; | 1488 | error = -EACCES; |
1479 | if (path.mnt->mnt_flags & MNT_NODEV) | 1489 | if (path.mnt->mnt_flags & MNT_NODEV) |
1480 | goto fail; | 1490 | goto fail; |
1481 | error = -ENOMEM; | 1491 | error = -ENOMEM; |
1482 | bdev = bd_acquire(inode); | 1492 | bdev = bd_acquire(inode); |
1483 | if (!bdev) | 1493 | if (!bdev) |
1484 | goto fail; | 1494 | goto fail; |
1485 | out: | 1495 | out: |
1486 | path_put(&path); | 1496 | path_put(&path); |
1487 | return bdev; | 1497 | return bdev; |
1488 | fail: | 1498 | fail: |
1489 | bdev = ERR_PTR(error); | 1499 | bdev = ERR_PTR(error); |
1490 | goto out; | 1500 | goto out; |
1491 | } | 1501 | } |
1492 | EXPORT_SYMBOL(lookup_bdev); | 1502 | EXPORT_SYMBOL(lookup_bdev); |
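
A usage sketch (the path is illustrative); the reference obtained here is dropped with bdput():

	struct block_device *bdev = lookup_bdev("/dev/sda1");

	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... inspect bdev, e.g. bdev->bd_dev ... */
	bdput(bdev);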
1493 | 1503 | ||
1494 | /** | 1504 | /** |
1495 | * open_bdev_exclusive - open a block device by name and set it up for use | 1505 | * open_bdev_exclusive - open a block device by name and set it up for use |
1496 | * | 1506 | * |
1497 | * @path: special file representing the block device | 1507 | * @path: special file representing the block device |
1498 | * @mode: FMODE_... combination to be used | 1508 | * @mode: FMODE_... combination to be used |
1499 | * @holder: owner for exclusion | 1509 | * @holder: owner for exclusion |
1500 | * | 1510 | * |
1501 | * Open the blockdevice described by the special file at @path, claim it | 1511 | * Open the blockdevice described by the special file at @path, claim it |
1502 | * for @holder. | 1512 | * for @holder. |
1503 | */ | 1513 | */ |
1504 | struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) | 1514 | struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) |
1505 | { | 1515 | { |
1506 | struct block_device *bdev; | 1516 | struct block_device *bdev; |
1507 | int error = 0; | 1517 | int error = 0; |
1508 | 1518 | ||
1509 | bdev = lookup_bdev(path); | 1519 | bdev = lookup_bdev(path); |
1510 | if (IS_ERR(bdev)) | 1520 | if (IS_ERR(bdev)) |
1511 | return bdev; | 1521 | return bdev; |
1512 | 1522 | ||
1513 | error = blkdev_get(bdev, mode); | 1523 | error = blkdev_get(bdev, mode); |
1514 | if (error) | 1524 | if (error) |
1515 | return ERR_PTR(error); | 1525 | return ERR_PTR(error); |
1516 | error = -EACCES; | 1526 | error = -EACCES; |
1517 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) | 1527 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) |
1518 | goto blkdev_put; | 1528 | goto blkdev_put; |
1519 | error = bd_claim(bdev, holder); | 1529 | error = bd_claim(bdev, holder); |
1520 | if (error) | 1530 | if (error) |
1521 | goto blkdev_put; | 1531 | goto blkdev_put; |
1522 | 1532 | ||
1523 | return bdev; | 1533 | return bdev; |
1524 | 1534 | ||
1525 | blkdev_put: | 1535 | blkdev_put: |
1526 | blkdev_put(bdev, mode); | 1536 | blkdev_put(bdev, mode); |
1527 | return ERR_PTR(error); | 1537 | return ERR_PTR(error); |
1528 | } | 1538 | } |
1529 | 1539 | ||
1530 | EXPORT_SYMBOL(open_bdev_exclusive); | 1540 | EXPORT_SYMBOL(open_bdev_exclusive); |
1531 | 1541 | ||
1532 | /** | 1542 | /** |
1533 | * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() | 1543 | * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() |
1534 | * | 1544 | * |
1535 | * @bdev: blockdevice to close | 1545 | * @bdev: blockdevice to close |
1536 | * @mode: mode, must match that used to open. | 1546 | * @mode: mode, must match that used to open. |
1537 | * | 1547 | * |
1538 | * This is the counterpart to open_bdev_exclusive(). | 1548 | * This is the counterpart to open_bdev_exclusive(). |
1539 | */ | 1549 | */ |
1540 | void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) | 1550 | void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) |
1541 | { | 1551 | { |
1542 | bd_release(bdev); | 1552 | bd_release(bdev); |
1543 | blkdev_put(bdev, mode); | 1553 | blkdev_put(bdev, mode); |
1544 | } | 1554 | } |
1545 | 1555 | ||
1546 | EXPORT_SYMBOL(close_bdev_exclusive); | 1556 | EXPORT_SYMBOL(close_bdev_exclusive); |
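
A sketch of the typical pairing, in the style of a filesystem opening its backing device; the holder only needs to be a unique cookie (a file_system_type pointer is assumed here, and the path is illustrative):

	struct block_device *bdev;

	bdev = open_bdev_exclusive("/dev/loop0", FMODE_READ | FMODE_WRITE,
				   fs_type);	/* fs_type: assumed holder cookie */
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... set up the superblock against bdev ... */
	close_bdev_exclusive(bdev, FMODE_READ | FMODE_WRITE);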
1547 | 1557 | ||
1548 | int __invalidate_device(struct block_device *bdev) | 1558 | int __invalidate_device(struct block_device *bdev) |
1549 | { | 1559 | { |
1550 | struct super_block *sb = get_super(bdev); | 1560 | struct super_block *sb = get_super(bdev); |
1551 | int res = 0; | 1561 | int res = 0; |
1552 | 1562 | ||
1553 | if (sb) { | 1563 | if (sb) { |
1554 | /* | 1564 | /* |
1555 | * no need to lock the super, get_super holds the | 1565 | * no need to lock the super, get_super holds the |
1556 | * read mutex so the filesystem cannot go away | 1566 | * read mutex so the filesystem cannot go away |
1557 | * under us (->put_super runs with the write lock | 1567 | * under us (->put_super runs with the write lock |
1558 | * held). | 1568 | * held). |
1559 | */ | 1569 | */ |
1560 | shrink_dcache_sb(sb); | 1570 | shrink_dcache_sb(sb); |
1561 | res = invalidate_inodes(sb); | 1571 | res = invalidate_inodes(sb); |
1562 | drop_super(sb); | 1572 | drop_super(sb); |
1563 | } | 1573 | } |
1564 | invalidate_bdev(bdev); | 1574 | invalidate_bdev(bdev); |
1565 | return res; | 1575 | return res; |
1566 | } | 1576 | } |
1567 | EXPORT_SYMBOL(__invalidate_device); | 1577 | EXPORT_SYMBOL(__invalidate_device); |
1568 | 1578 |
include/linux/fs.h
1 | #ifndef _LINUX_FS_H | 1 | #ifndef _LINUX_FS_H |
2 | #define _LINUX_FS_H | 2 | #define _LINUX_FS_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * This file has definitions for some important file table | 5 | * This file has definitions for some important file table |
6 | * structures etc. | 6 | * structures etc. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/limits.h> | 9 | #include <linux/limits.h> |
10 | #include <linux/ioctl.h> | 10 | #include <linux/ioctl.h> |
11 | 11 | ||
12 | /* | 12 | /* |
13 | * It's silly to have NR_OPEN bigger than NR_FILE, but you can change | 13 | * It's silly to have NR_OPEN bigger than NR_FILE, but you can change |
14 | * the file limit at runtime and only root can increase the per-process | 14 | * the file limit at runtime and only root can increase the per-process |
15 | * nr_file rlimit, so it's safe to set up a ridiculously high absolute | 15 | * nr_file rlimit, so it's safe to set up a ridiculously high absolute |
16 | * upper limit on files-per-process. | 16 | * upper limit on files-per-process. |
17 | * | 17 | * |
18 | * Some programs (notably those using select()) may have to be | 18 | * Some programs (notably those using select()) may have to be |
19 | * recompiled to take full advantage of the new limits.. | 19 | * recompiled to take full advantage of the new limits.. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | /* Fixed constants first: */ | 22 | /* Fixed constants first: */ |
23 | #undef NR_OPEN | 23 | #undef NR_OPEN |
24 | #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ | 24 | #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ |
25 | 25 | ||
26 | #define BLOCK_SIZE_BITS 10 | 26 | #define BLOCK_SIZE_BITS 10 |
27 | #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) | 27 | #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) |
28 | 28 | ||
29 | #define SEEK_SET 0 /* seek relative to beginning of file */ | 29 | #define SEEK_SET 0 /* seek relative to beginning of file */ |
30 | #define SEEK_CUR 1 /* seek relative to current file position */ | 30 | #define SEEK_CUR 1 /* seek relative to current file position */ |
31 | #define SEEK_END 2 /* seek relative to end of file */ | 31 | #define SEEK_END 2 /* seek relative to end of file */ |
32 | #define SEEK_MAX SEEK_END | 32 | #define SEEK_MAX SEEK_END |
33 | 33 | ||
34 | /* And dynamically-tunable limits and defaults: */ | 34 | /* And dynamically-tunable limits and defaults: */ |
35 | struct files_stat_struct { | 35 | struct files_stat_struct { |
36 | int nr_files; /* read only */ | 36 | int nr_files; /* read only */ |
37 | int nr_free_files; /* read only */ | 37 | int nr_free_files; /* read only */ |
38 | int max_files; /* tunable */ | 38 | int max_files; /* tunable */ |
39 | }; | 39 | }; |
40 | 40 | ||
41 | struct inodes_stat_t { | 41 | struct inodes_stat_t { |
42 | int nr_inodes; | 42 | int nr_inodes; |
43 | int nr_unused; | 43 | int nr_unused; |
44 | int dummy[5]; /* padding for sysctl ABI compatibility */ | 44 | int dummy[5]; /* padding for sysctl ABI compatibility */ |
45 | }; | 45 | }; |
46 | 46 | ||
47 | 47 | ||
48 | #define NR_FILE 8192 /* this can well be larger on a larger system */ | 48 | #define NR_FILE 8192 /* this can well be larger on a larger system */ |
49 | 49 | ||
50 | #define MAY_EXEC 1 | 50 | #define MAY_EXEC 1 |
51 | #define MAY_WRITE 2 | 51 | #define MAY_WRITE 2 |
52 | #define MAY_READ 4 | 52 | #define MAY_READ 4 |
53 | #define MAY_APPEND 8 | 53 | #define MAY_APPEND 8 |
54 | #define MAY_ACCESS 16 | 54 | #define MAY_ACCESS 16 |
55 | #define MAY_OPEN 32 | 55 | #define MAY_OPEN 32 |
56 | 56 | ||
57 | /* | 57 | /* |
58 | * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond | 58 | * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond |
59 | * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() | 59 | * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() |
60 | */ | 60 | */ |
61 | 61 | ||
62 | /* file is open for reading */ | 62 | /* file is open for reading */ |
63 | #define FMODE_READ ((__force fmode_t)1) | 63 | #define FMODE_READ ((__force fmode_t)1) |
64 | /* file is open for writing */ | 64 | /* file is open for writing */ |
65 | #define FMODE_WRITE ((__force fmode_t)2) | 65 | #define FMODE_WRITE ((__force fmode_t)2) |
66 | /* file is seekable */ | 66 | /* file is seekable */ |
67 | #define FMODE_LSEEK ((__force fmode_t)4) | 67 | #define FMODE_LSEEK ((__force fmode_t)4) |
68 | /* file can be accessed using pread */ | 68 | /* file can be accessed using pread */ |
69 | #define FMODE_PREAD ((__force fmode_t)8) | 69 | #define FMODE_PREAD ((__force fmode_t)8) |
70 | /* file can be accessed using pwrite */ | 70 | /* file can be accessed using pwrite */ |
71 | #define FMODE_PWRITE ((__force fmode_t)16) | 71 | #define FMODE_PWRITE ((__force fmode_t)16) |
72 | /* File is opened for execution with sys_execve / sys_uselib */ | 72 | /* File is opened for execution with sys_execve / sys_uselib */ |
73 | #define FMODE_EXEC ((__force fmode_t)32) | 73 | #define FMODE_EXEC ((__force fmode_t)32) |
74 | /* File is opened with O_NDELAY (only set for block devices) */ | 74 | /* File is opened with O_NDELAY (only set for block devices) */ |
75 | #define FMODE_NDELAY ((__force fmode_t)64) | 75 | #define FMODE_NDELAY ((__force fmode_t)64) |
76 | /* File is opened with O_EXCL (only set for block devices) */ | 76 | /* File is opened with O_EXCL (only set for block devices) */ |
77 | #define FMODE_EXCL ((__force fmode_t)128) | 77 | #define FMODE_EXCL ((__force fmode_t)128) |
78 | /* File is opened using open(.., 3, ..) and is writeable only for ioctls | 78 | /* File is opened using open(.., 3, ..) and is writeable only for ioctls |
79 | (special hack for floppy.c) */ | 79 | (special hack for floppy.c) */ |
80 | #define FMODE_WRITE_IOCTL ((__force fmode_t)256) | 80 | #define FMODE_WRITE_IOCTL ((__force fmode_t)256) |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * Don't update ctime and mtime. | 83 | * Don't update ctime and mtime. |
84 | * | 84 | * |
85 | * Currently a special hack for the XFS open_by_handle ioctl, but we'll | 85 | * Currently a special hack for the XFS open_by_handle ioctl, but we'll |
86 | * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. | 86 | * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. |
87 | */ | 87 | */ |
88 | #define FMODE_NOCMTIME ((__force fmode_t)2048) | 88 | #define FMODE_NOCMTIME ((__force fmode_t)2048) |
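
Since these are bit flags in file->f_mode, they are tested and combined with bitwise operators; a small sketch (mydrv_may_write is hypothetical):

	static bool mydrv_may_write(struct file *filp)
	{
		/* ordinary write access; FMODE_WRITE_IOCTL alone
		 * permits ioctls but not write(2) */
		return (filp->f_mode & FMODE_WRITE) != 0;
	}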
89 | 89 | ||
90 | /* | 90 | /* |
91 | * The below are the various read and write types that we support. Some of | 91 | * The below are the various read and write types that we support. Some of |
92 | * them include behavioral modifiers that send information down to the | 92 | * them include behavioral modifiers that send information down to the |
93 | * block layer and IO scheduler. Terminology: | 93 | * block layer and IO scheduler. Terminology: |
94 | * | 94 | * |
95 | * The block layer uses device plugging to defer IO a little bit, in | 95 | * The block layer uses device plugging to defer IO a little bit, in |
96 | * the hope that we will see more IO very shortly. This increases | 96 | * the hope that we will see more IO very shortly. This increases |
97 | * coalescing of adjacent IO and thus reduces the number of IOs we | 97 | * coalescing of adjacent IO and thus reduces the number of IOs we |
98 | * have to send to the device. It also allows for better queuing, | 98 | * have to send to the device. It also allows for better queuing, |
99 | * if the IO isn't mergeable. If the caller is going to be waiting | 99 | * if the IO isn't mergeable. If the caller is going to be waiting |
100 | * for the IO, then he must ensure that the device is unplugged so | 100 | * for the IO, then he must ensure that the device is unplugged so |
101 | * that the IO is dispatched to the driver. | 101 | * that the IO is dispatched to the driver. |
102 | * | 102 | * |
103 | * All IO is handled async in Linux. This is fine for background | 103 | * All IO is handled async in Linux. This is fine for background |
104 | * writes, but for reads or writes that someone waits for completion | 104 | * writes, but for reads or writes that someone waits for completion |
105 | * on, we want to notify the block layer and IO scheduler so that they | 105 | * on, we want to notify the block layer and IO scheduler so that they |
106 | * know about it. That allows them to make better scheduling | 106 | * know about it. That allows them to make better scheduling |
107 | * decisions. So when the below references 'sync' and 'async', it | 107 | * decisions. So when the below references 'sync' and 'async', it |
108 | * is referencing this priority hint. | 108 | * is referencing this priority hint. |
109 | * | 109 | * |
110 | * With that in mind, the available types are: | 110 | * With that in mind, the available types are: |
111 | * | 111 | * |
112 | * READ A normal read operation. Device will be plugged. | 112 | * READ A normal read operation. Device will be plugged. |
113 | * READ_SYNC A synchronous read. Device is not plugged, caller can | 113 | * READ_SYNC A synchronous read. Device is not plugged, caller can |
114 | * immediately wait on this read without caring about | 114 | * immediately wait on this read without caring about |
115 | * unplugging. | 115 | * unplugging. |
116 | * READA Used for read-ahead operations. Lower priority, and the | 116 | * READA Used for read-ahead operations. Lower priority, and the |
117 | * block layer could (in theory) choose to ignore this | 117 | * block layer could (in theory) choose to ignore this |
118 | * request if it runs into resource problems. | 118 | * request if it runs into resource problems. |
119 | * WRITE A normal async write. Device will be plugged. | 119 | * WRITE A normal async write. Device will be plugged. |
120 | * SWRITE Like WRITE, but a special case for ll_rw_block() that | 120 | * SWRITE Like WRITE, but a special case for ll_rw_block() that |
121 | * tells it to lock the buffer first. Normally a buffer | 121 | * tells it to lock the buffer first. Normally a buffer |
122 | * must be locked before doing IO. | 122 | * must be locked before doing IO. |
123 | * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down | 123 | * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down |
124 | * the hint that someone will be waiting on this IO | 124 | * the hint that someone will be waiting on this IO |
125 | * shortly. The device must still be unplugged explicitly, | 125 | * shortly. The device must still be unplugged explicitly, |
126 | * WRITE_SYNC_PLUG does not do this as we could be | 126 | * WRITE_SYNC_PLUG does not do this as we could be |
127 | * submitting more writes before we actually wait on any | 127 | * submitting more writes before we actually wait on any |
128 | * of them. | 128 | * of them. |
129 | * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device | 129 | * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device |
130 | * immediately after submission. The write equivalent | 130 | * immediately after submission. The write equivalent |
131 | * of READ_SYNC. | 131 | * of READ_SYNC. |
132 | * WRITE_ODIRECT Special case write for O_DIRECT only. | 132 | * WRITE_ODIRECT Special case write for O_DIRECT only. |
133 | * SWRITE_SYNC | 133 | * SWRITE_SYNC |
134 | * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. | 134 | * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. |
135 | * See SWRITE. | 135 | * See SWRITE. |
136 | * WRITE_BARRIER Like WRITE, but tells the block layer that all | 136 | * WRITE_BARRIER Like WRITE, but tells the block layer that all |
137 | * previously submitted writes must be safely on storage | 137 | * previously submitted writes must be safely on storage |
138 | * before this one is started. Also guarantees that when | 138 | * before this one is started. Also guarantees that when |
139 | * this write is complete, it itself is also safely on | 139 | * this write is complete, it itself is also safely on |
140 | * storage. Prevents reordering of writes on both sides | 140 | * storage. Prevents reordering of writes on both sides |
141 | * of this IO. | 141 | * of this IO. |
142 | * | 142 | * |
143 | */ | 143 | */ |
144 | #define RW_MASK 1 | 144 | #define RW_MASK 1 |
145 | #define RWA_MASK 2 | 145 | #define RWA_MASK 2 |
146 | #define READ 0 | 146 | #define READ 0 |
147 | #define WRITE 1 | 147 | #define WRITE 1 |
148 | #define READA 2 /* read-ahead - don't block if no resources */ | 148 | #define READA 2 /* read-ahead - don't block if no resources */ |
149 | #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ | 149 | #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ |
150 | #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) | 150 | #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) |
151 | #define READ_META (READ | (1 << BIO_RW_META)) | 151 | #define READ_META (READ | (1 << BIO_RW_META)) |
152 | #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) | 152 | #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) |
153 | #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) | 153 | #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) |
154 | #define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) | 154 | #define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) |
155 | #define SWRITE_SYNC_PLUG \ | 155 | #define SWRITE_SYNC_PLUG \ |
156 | (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) | 156 | (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) |
157 | #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) | 157 | #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) |
158 | #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) | 158 | #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) |
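
To illustrate (a sketch, not part of this header): the type is passed as the rw argument of the submission helpers, so a caller about to wait picks a _SYNC variant:

	/* background write: may sit behind the queue plug */
	submit_bh(WRITE, bh);

	/* write we wait on immediately: passes the sync hint and
	 * unplugs the queue right after submission */
	submit_bh(WRITE_SYNC, bh);
	wait_on_buffer(bh);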
159 | 159 | ||
160 | /* | 160 | /* |
161 | * These aren't really reads or writes, they pass down information about | 161 | * These aren't really reads or writes, they pass down information about |
162 | * parts of device that are now unused by the file system. | 162 | * parts of device that are now unused by the file system. |
163 | */ | 163 | */ |
164 | #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) | 164 | #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) |
165 | #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) | 165 | #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) |
166 | 166 | ||
167 | #define SEL_IN 1 | 167 | #define SEL_IN 1 |
168 | #define SEL_OUT 2 | 168 | #define SEL_OUT 2 |
169 | #define SEL_EX 4 | 169 | #define SEL_EX 4 |
170 | 170 | ||
171 | /* public flags for file_system_type */ | 171 | /* public flags for file_system_type */ |
172 | #define FS_REQUIRES_DEV 1 | 172 | #define FS_REQUIRES_DEV 1 |
173 | #define FS_BINARY_MOUNTDATA 2 | 173 | #define FS_BINARY_MOUNTDATA 2 |
174 | #define FS_HAS_SUBTYPE 4 | 174 | #define FS_HAS_SUBTYPE 4 |
175 | #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ | 175 | #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ |
176 | #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() | 176 | #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() |
177 | * during rename() internally. | 177 | * during rename() internally. |
178 | */ | 178 | */ |
179 | 179 | ||
180 | /* | 180 | /* |
181 | * These are the fs-independent mount-flags: up to 32 flags are supported | 181 | * These are the fs-independent mount-flags: up to 32 flags are supported |
182 | */ | 182 | */ |
183 | #define MS_RDONLY 1 /* Mount read-only */ | 183 | #define MS_RDONLY 1 /* Mount read-only */ |
184 | #define MS_NOSUID 2 /* Ignore suid and sgid bits */ | 184 | #define MS_NOSUID 2 /* Ignore suid and sgid bits */ |
185 | #define MS_NODEV 4 /* Disallow access to device special files */ | 185 | #define MS_NODEV 4 /* Disallow access to device special files */ |
186 | #define MS_NOEXEC 8 /* Disallow program execution */ | 186 | #define MS_NOEXEC 8 /* Disallow program execution */ |
187 | #define MS_SYNCHRONOUS 16 /* Writes are synced at once */ | 187 | #define MS_SYNCHRONOUS 16 /* Writes are synced at once */ |
188 | #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ | 188 | #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ |
189 | #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ | 189 | #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ |
190 | #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ | 190 | #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ |
191 | #define MS_NOATIME 1024 /* Do not update access times. */ | 191 | #define MS_NOATIME 1024 /* Do not update access times. */ |
192 | #define MS_NODIRATIME 2048 /* Do not update directory access times */ | 192 | #define MS_NODIRATIME 2048 /* Do not update directory access times */ |
193 | #define MS_BIND 4096 | 193 | #define MS_BIND 4096 |
194 | #define MS_MOVE 8192 | 194 | #define MS_MOVE 8192 |
195 | #define MS_REC 16384 | 195 | #define MS_REC 16384 |
196 | #define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. | 196 | #define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. |
197 | MS_VERBOSE is deprecated. */ | 197 | MS_VERBOSE is deprecated. */ |
198 | #define MS_SILENT 32768 | 198 | #define MS_SILENT 32768 |
199 | #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ | 199 | #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ |
200 | #define MS_UNBINDABLE (1<<17) /* change to unbindable */ | 200 | #define MS_UNBINDABLE (1<<17) /* change to unbindable */ |
201 | #define MS_PRIVATE (1<<18) /* change to private */ | 201 | #define MS_PRIVATE (1<<18) /* change to private */ |
202 | #define MS_SLAVE (1<<19) /* change to slave */ | 202 | #define MS_SLAVE (1<<19) /* change to slave */ |
203 | #define MS_SHARED (1<<20) /* change to shared */ | 203 | #define MS_SHARED (1<<20) /* change to shared */ |
204 | #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ | 204 | #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ |
205 | #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ | 205 | #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ |
206 | #define MS_I_VERSION (1<<23) /* Update inode I_version field */ | 206 | #define MS_I_VERSION (1<<23) /* Update inode I_version field */ |
207 | #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ | 207 | #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ |
208 | #define MS_ACTIVE (1<<30) | 208 | #define MS_ACTIVE (1<<30) |
209 | #define MS_NOUSER (1<<31) | 209 | #define MS_NOUSER (1<<31) |
210 | 210 | ||
211 | /* | 211 | /* |
212 | * Superblock flags that can be altered by MS_REMOUNT | 212 | * Superblock flags that can be altered by MS_REMOUNT |
213 | */ | 213 | */ |
214 | #define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) | 214 | #define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) |
215 | 215 | ||
216 | /* | 216 | /* |
217 | * Old magic mount flag and mask | 217 | * Old magic mount flag and mask |
218 | */ | 218 | */ |
219 | #define MS_MGC_VAL 0xC0ED0000 | 219 | #define MS_MGC_VAL 0xC0ED0000 |
220 | #define MS_MGC_MSK 0xffff0000 | 220 | #define MS_MGC_MSK 0xffff0000 |
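
For reference, these MS_* values are exactly the flag bits userspace hands to mount(2). A minimal example (the /mnt path is just an assumption) remounting an existing filesystem read-only:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* MS_REMOUNT | MS_RDONLY alters the flags of an existing mount
	 * rather than creating a new one. */
	if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL) != 0)
		perror("mount");
	return 0;
}
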
221 | 221 | ||
222 | /* Inode flags - they have nothing to do with superblock flags now */ | 222 | /* Inode flags - they have nothing to do with superblock flags now */ |
223 | 223 | ||
224 | #define S_SYNC 1 /* Writes are synced at once */ | 224 | #define S_SYNC 1 /* Writes are synced at once */ |
225 | #define S_NOATIME 2 /* Do not update access times */ | 225 | #define S_NOATIME 2 /* Do not update access times */ |
226 | #define S_APPEND 4 /* Append-only file */ | 226 | #define S_APPEND 4 /* Append-only file */ |
227 | #define S_IMMUTABLE 8 /* Immutable file */ | 227 | #define S_IMMUTABLE 8 /* Immutable file */ |
228 | #define S_DEAD 16 /* removed, but still open directory */ | 228 | #define S_DEAD 16 /* removed, but still open directory */ |
229 | #define S_NOQUOTA 32 /* Inode is not counted to quota */ | 229 | #define S_NOQUOTA 32 /* Inode is not counted to quota */ |
230 | #define S_DIRSYNC 64 /* Directory modifications are synchronous */ | 230 | #define S_DIRSYNC 64 /* Directory modifications are synchronous */ |
231 | #define S_NOCMTIME 128 /* Do not update file c/mtime */ | 231 | #define S_NOCMTIME 128 /* Do not update file c/mtime */ |
232 | #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ | 232 | #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ |
233 | #define S_PRIVATE 512 /* Inode is fs-internal */ | 233 | #define S_PRIVATE 512 /* Inode is fs-internal */ |
234 | 234 | ||
235 | /* | 235 | /* |
236 | * Note that nosuid etc flags are inode-specific: setting some file-system | 236 | * Note that nosuid etc flags are inode-specific: setting some file-system |
237 | * flags just means all the inodes inherit those flags by default. It might be | 237 | * flags just means all the inodes inherit those flags by default. It might be |
238 | * possible to override it selectively if you really wanted to with some | 238 | * possible to override it selectively if you really wanted to with some |
239 | * ioctl() that is not currently implemented. | 239 | * ioctl() that is not currently implemented. |
240 | * | 240 | * |
241 | * Exception: MS_RDONLY is always applied to the entire file system. | 241 | * Exception: MS_RDONLY is always applied to the entire file system. |
242 | * | 242 | * |
243 | * Unfortunately, it is possible to change a filesystem's flags while it is | 243 | * Unfortunately, it is possible to change a filesystem's flags while it is |
244 | * mounted and has files in use. This means that the inodes will not have their | 244 | * mounted and has files in use. This means that the inodes will not have their |
245 | * i_flags updated. Hence, i_flags no longer inherits the superblock mount | 245 | * i_flags updated. Hence, i_flags no longer inherits the superblock mount |
246 | * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org | 246 | * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org |
247 | */ | 247 | */ |
248 | #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) | 248 | #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) |
249 | 249 | ||
250 | #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) | 250 | #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) |
251 | #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ | 251 | #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ |
252 | ((inode)->i_flags & S_SYNC)) | 252 | ((inode)->i_flags & S_SYNC)) |
253 | #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ | 253 | #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ |
254 | ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) | 254 | ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) |
255 | #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) | 255 | #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) |
256 | #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) | 256 | #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) |
257 | #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) | 257 | #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) |
258 | 258 | ||
259 | #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) | 259 | #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) |
260 | #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) | 260 | #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) |
261 | #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) | 261 | #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) |
262 | #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) | 262 | #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) |
263 | 263 | ||
264 | #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) | 264 | #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) |
265 | #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) | 265 | #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) |
266 | #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) | 266 | #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) |
267 | #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) | 267 | #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) |
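
These predicates are what filesystem code tests instead of poking at s_flags or i_flags directly (the whole point of the note above being that the two must be checked together). A hypothetical permission check in a write path might read:

/* Sketch: refuse modifications the per-mount or per-inode flags forbid.
 * my_may_modify is an illustrative name, not a real VFS helper. */
static int my_may_modify(struct inode *inode)
{
	if (IS_RDONLY(inode))
		return -EROFS;		/* read-only superblock */
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
		return -EPERM;		/* flag set on this inode */
	return 0;
}
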
268 | 268 | ||
269 | /* the read-only stuff doesn't really belong here, but any other place is | 269 | /* the read-only stuff doesn't really belong here, but any other place is |
270 | probably as bad and I don't want to create yet another include file. */ | 270 | probably as bad and I don't want to create yet another include file. */ |
271 | 271 | ||
272 | #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ | 272 | #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ |
273 | #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ | 273 | #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ |
274 | #define BLKRRPART _IO(0x12,95) /* re-read partition table */ | 274 | #define BLKRRPART _IO(0x12,95) /* re-read partition table */ |
275 | #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ | 275 | #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ |
276 | #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ | 276 | #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ |
277 | #define BLKRASET _IO(0x12,98) /* set read ahead for block device */ | 277 | #define BLKRASET _IO(0x12,98) /* set read ahead for block device */ |
278 | #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ | 278 | #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ |
279 | #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ | 279 | #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ |
280 | #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ | 280 | #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ |
281 | #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ | 281 | #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ |
282 | #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ | 282 | #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ |
283 | #define BLKSSZGET _IO(0x12,104)/* get block device sector size */ | 283 | #define BLKSSZGET _IO(0x12,104)/* get block device sector size */ |
284 | #if 0 | 284 | #if 0 |
285 | #define BLKPG _IO(0x12,105)/* See blkpg.h */ | 285 | #define BLKPG _IO(0x12,105)/* See blkpg.h */ |
286 | 286 | ||
287 | /* Some people are morons. Do not use sizeof! */ | 287 | /* Some people are morons. Do not use sizeof! */ |
288 | 288 | ||
289 | #define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ | 289 | #define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ |
290 | #define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ | 290 | #define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ |
291 | /* This was here just to show that the number is taken - | 291 | /* This was here just to show that the number is taken - |
292 | probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ | 292 | probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ |
293 | #endif | 293 | #endif |
294 | /* A jump here: 108-111 have been used for various private purposes. */ | 294 | /* A jump here: 108-111 have been used for various private purposes. */ |
295 | #define BLKBSZGET _IOR(0x12,112,size_t) | 295 | #define BLKBSZGET _IOR(0x12,112,size_t) |
296 | #define BLKBSZSET _IOW(0x12,113,size_t) | 296 | #define BLKBSZSET _IOW(0x12,113,size_t) |
297 | #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ | 297 | #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ |
298 | #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) | 298 | #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) |
299 | #define BLKTRACESTART _IO(0x12,116) | 299 | #define BLKTRACESTART _IO(0x12,116) |
300 | #define BLKTRACESTOP _IO(0x12,117) | 300 | #define BLKTRACESTOP _IO(0x12,117) |
301 | #define BLKTRACETEARDOWN _IO(0x12,118) | 301 | #define BLKTRACETEARDOWN _IO(0x12,118) |
302 | #define BLKDISCARD _IO(0x12,119) | 302 | #define BLKDISCARD _IO(0x12,119) |
303 | 303 | ||
304 | #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ | 304 | #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ |
305 | #define FIBMAP _IO(0x00,1) /* bmap access */ | 305 | #define FIBMAP _IO(0x00,1) /* bmap access */ |
306 | #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ | 306 | #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ |
307 | #define FIFREEZE _IOWR('X', 119, int) /* Freeze */ | 307 | #define FIFREEZE _IOWR('X', 119, int) /* Freeze */ |
308 | #define FITHAW _IOWR('X', 120, int) /* Thaw */ | 308 | #define FITHAW _IOWR('X', 120, int) /* Thaw */ |
309 | 309 | ||
310 | #define FS_IOC_GETFLAGS _IOR('f', 1, long) | 310 | #define FS_IOC_GETFLAGS _IOR('f', 1, long) |
311 | #define FS_IOC_SETFLAGS _IOW('f', 2, long) | 311 | #define FS_IOC_SETFLAGS _IOW('f', 2, long) |
312 | #define FS_IOC_GETVERSION _IOR('v', 1, long) | 312 | #define FS_IOC_GETVERSION _IOR('v', 1, long) |
313 | #define FS_IOC_SETVERSION _IOW('v', 2, long) | 313 | #define FS_IOC_SETVERSION _IOW('v', 2, long) |
314 | #define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap) | 314 | #define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap) |
315 | #define FS_IOC32_GETFLAGS _IOR('f', 1, int) | 315 | #define FS_IOC32_GETFLAGS _IOR('f', 1, int) |
316 | #define FS_IOC32_SETFLAGS _IOW('f', 2, int) | 316 | #define FS_IOC32_SETFLAGS _IOW('f', 2, int) |
317 | #define FS_IOC32_GETVERSION _IOR('v', 1, int) | 317 | #define FS_IOC32_GETVERSION _IOR('v', 1, int) |
318 | #define FS_IOC32_SETVERSION _IOW('v', 2, int) | 318 | #define FS_IOC32_SETVERSION _IOW('v', 2, int) |
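
A small userspace probe shows how these ioctl numbers are consumed; per the comment above, BLKGETSIZE64 fills a u64 with the device size in bytes (the device node is assumed to be passed as argv[1]):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	uint64_t bytes;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, BLKGETSIZE64, &bytes) != 0) {
		perror("BLKGETSIZE64");
		return 1;
	}
	printf("%s: %llu bytes\n", argv[1], (unsigned long long)bytes);
	close(fd);
	return 0;
}
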
319 | 319 | ||
320 | /* | 320 | /* |
321 | * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) | 321 | * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) |
322 | */ | 322 | */ |
323 | #define FS_SECRM_FL 0x00000001 /* Secure deletion */ | 323 | #define FS_SECRM_FL 0x00000001 /* Secure deletion */ |
324 | #define FS_UNRM_FL 0x00000002 /* Undelete */ | 324 | #define FS_UNRM_FL 0x00000002 /* Undelete */ |
325 | #define FS_COMPR_FL 0x00000004 /* Compress file */ | 325 | #define FS_COMPR_FL 0x00000004 /* Compress file */ |
326 | #define FS_SYNC_FL 0x00000008 /* Synchronous updates */ | 326 | #define FS_SYNC_FL 0x00000008 /* Synchronous updates */ |
327 | #define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ | 327 | #define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ |
328 | #define FS_APPEND_FL 0x00000020 /* writes to file may only append */ | 328 | #define FS_APPEND_FL 0x00000020 /* writes to file may only append */ |
329 | #define FS_NODUMP_FL 0x00000040 /* do not dump file */ | 329 | #define FS_NODUMP_FL 0x00000040 /* do not dump file */ |
330 | #define FS_NOATIME_FL 0x00000080 /* do not update atime */ | 330 | #define FS_NOATIME_FL 0x00000080 /* do not update atime */ |
331 | /* Reserved for compression usage... */ | 331 | /* Reserved for compression usage... */ |
332 | #define FS_DIRTY_FL 0x00000100 | 332 | #define FS_DIRTY_FL 0x00000100 |
333 | #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ | 333 | #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ |
334 | #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ | 334 | #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ |
335 | #define FS_ECOMPR_FL 0x00000800 /* Compression error */ | 335 | #define FS_ECOMPR_FL 0x00000800 /* Compression error */ |
336 | /* End compression flags --- maybe not all used */ | 336 | /* End compression flags --- maybe not all used */ |
337 | #define FS_BTREE_FL 0x00001000 /* btree format dir */ | 337 | #define FS_BTREE_FL 0x00001000 /* btree format dir */ |
338 | #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ | 338 | #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ |
339 | #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ | 339 | #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ |
340 | #define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */ | 340 | #define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */ |
341 | #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ | 341 | #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ |
342 | #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ | 342 | #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ |
343 | #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ | 343 | #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ |
344 | #define FS_EXTENT_FL 0x00080000 /* Extents */ | 344 | #define FS_EXTENT_FL 0x00080000 /* Extents */ |
345 | #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ | 345 | #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ |
346 | #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ | 346 | #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ |
347 | 347 | ||
348 | #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ | 348 | #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ |
349 | #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ | 349 | #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ |
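
These are the bits behind chattr/lsattr. A sketch of what chattr +i amounts to, using the FS_IOC_GETFLAGS/SETFLAGS pair declared above (set_immutable is an illustrative name; the flag word is a long to match the _IOR/_IOW definitions in this header):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

/* Read-modify-write the inode flag word to mark a file immutable. */
static int set_immutable(const char *path)
{
	long flags;
	int ret = -1;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
		flags |= FS_IMMUTABLE_FL;
		ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}
	close(fd);
	return ret;
}
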
350 | 350 | ||
351 | 351 | ||
352 | #define SYNC_FILE_RANGE_WAIT_BEFORE 1 | 352 | #define SYNC_FILE_RANGE_WAIT_BEFORE 1 |
353 | #define SYNC_FILE_RANGE_WRITE 2 | 353 | #define SYNC_FILE_RANGE_WRITE 2 |
354 | #define SYNC_FILE_RANGE_WAIT_AFTER 4 | 354 | #define SYNC_FILE_RANGE_WAIT_AFTER 4 |
355 | 355 | ||
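
These three bits are the flags argument of sync_file_range(2). A sketch of the common all-three combination, which behaves like a ranged data writeback for already-allocated blocks (flush_first_mb is an illustrative name):

#define _GNU_SOURCE
#include <fcntl.h>

/* Wait for any in-flight writeback of the first megabyte, start new
 * writeback, then wait for it to complete. */
static int flush_first_mb(int fd)
{
	return sync_file_range(fd, 0, 1 << 20,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}
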
356 | #ifdef __KERNEL__ | 356 | #ifdef __KERNEL__ |
357 | 357 | ||
358 | #include <linux/linkage.h> | 358 | #include <linux/linkage.h> |
359 | #include <linux/wait.h> | 359 | #include <linux/wait.h> |
360 | #include <linux/types.h> | 360 | #include <linux/types.h> |
361 | #include <linux/kdev_t.h> | 361 | #include <linux/kdev_t.h> |
362 | #include <linux/dcache.h> | 362 | #include <linux/dcache.h> |
363 | #include <linux/path.h> | 363 | #include <linux/path.h> |
364 | #include <linux/stat.h> | 364 | #include <linux/stat.h> |
365 | #include <linux/cache.h> | 365 | #include <linux/cache.h> |
366 | #include <linux/kobject.h> | 366 | #include <linux/kobject.h> |
367 | #include <linux/list.h> | 367 | #include <linux/list.h> |
368 | #include <linux/radix-tree.h> | 368 | #include <linux/radix-tree.h> |
369 | #include <linux/prio_tree.h> | 369 | #include <linux/prio_tree.h> |
370 | #include <linux/init.h> | 370 | #include <linux/init.h> |
371 | #include <linux/pid.h> | 371 | #include <linux/pid.h> |
372 | #include <linux/mutex.h> | 372 | #include <linux/mutex.h> |
373 | #include <linux/capability.h> | 373 | #include <linux/capability.h> |
374 | #include <linux/semaphore.h> | 374 | #include <linux/semaphore.h> |
375 | #include <linux/fiemap.h> | 375 | #include <linux/fiemap.h> |
376 | 376 | ||
377 | #include <asm/atomic.h> | 377 | #include <asm/atomic.h> |
378 | #include <asm/byteorder.h> | 378 | #include <asm/byteorder.h> |
379 | 379 | ||
380 | struct export_operations; | 380 | struct export_operations; |
381 | struct hd_geometry; | 381 | struct hd_geometry; |
382 | struct iovec; | 382 | struct iovec; |
383 | struct nameidata; | 383 | struct nameidata; |
384 | struct kiocb; | 384 | struct kiocb; |
385 | struct pipe_inode_info; | 385 | struct pipe_inode_info; |
386 | struct poll_table_struct; | 386 | struct poll_table_struct; |
387 | struct kstatfs; | 387 | struct kstatfs; |
388 | struct vm_area_struct; | 388 | struct vm_area_struct; |
389 | struct vfsmount; | 389 | struct vfsmount; |
390 | struct cred; | 390 | struct cred; |
391 | 391 | ||
392 | extern void __init inode_init(void); | 392 | extern void __init inode_init(void); |
393 | extern void __init inode_init_early(void); | 393 | extern void __init inode_init_early(void); |
394 | extern void __init files_init(unsigned long); | 394 | extern void __init files_init(unsigned long); |
395 | 395 | ||
396 | extern struct files_stat_struct files_stat; | 396 | extern struct files_stat_struct files_stat; |
397 | extern int get_max_files(void); | 397 | extern int get_max_files(void); |
398 | extern int sysctl_nr_open; | 398 | extern int sysctl_nr_open; |
399 | extern struct inodes_stat_t inodes_stat; | 399 | extern struct inodes_stat_t inodes_stat; |
400 | extern int leases_enable, lease_break_time; | 400 | extern int leases_enable, lease_break_time; |
401 | #ifdef CONFIG_DNOTIFY | 401 | #ifdef CONFIG_DNOTIFY |
402 | extern int dir_notify_enable; | 402 | extern int dir_notify_enable; |
403 | #endif | 403 | #endif |
404 | 404 | ||
405 | struct buffer_head; | 405 | struct buffer_head; |
406 | typedef int (get_block_t)(struct inode *inode, sector_t iblock, | 406 | typedef int (get_block_t)(struct inode *inode, sector_t iblock, |
407 | struct buffer_head *bh_result, int create); | 407 | struct buffer_head *bh_result, int create); |
408 | typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, | 408 | typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, |
409 | ssize_t bytes, void *private); | 409 | ssize_t bytes, void *private); |
410 | 410 | ||
411 | /* | 411 | /* |
412 | * Attribute flags. These should be or-ed together to figure out what | 412 | * Attribute flags. These should be or-ed together to figure out what |
413 | * has been changed! | 413 | * has been changed! |
414 | */ | 414 | */ |
415 | #define ATTR_MODE (1 << 0) | 415 | #define ATTR_MODE (1 << 0) |
416 | #define ATTR_UID (1 << 1) | 416 | #define ATTR_UID (1 << 1) |
417 | #define ATTR_GID (1 << 2) | 417 | #define ATTR_GID (1 << 2) |
418 | #define ATTR_SIZE (1 << 3) | 418 | #define ATTR_SIZE (1 << 3) |
419 | #define ATTR_ATIME (1 << 4) | 419 | #define ATTR_ATIME (1 << 4) |
420 | #define ATTR_MTIME (1 << 5) | 420 | #define ATTR_MTIME (1 << 5) |
421 | #define ATTR_CTIME (1 << 6) | 421 | #define ATTR_CTIME (1 << 6) |
422 | #define ATTR_ATIME_SET (1 << 7) | 422 | #define ATTR_ATIME_SET (1 << 7) |
423 | #define ATTR_MTIME_SET (1 << 8) | 423 | #define ATTR_MTIME_SET (1 << 8) |
424 | #define ATTR_FORCE (1 << 9) /* Not a change, but force the change */ | 424 | #define ATTR_FORCE (1 << 9) /* Not a change, but force the change */ |
425 | #define ATTR_ATTR_FLAG (1 << 10) | 425 | #define ATTR_ATTR_FLAG (1 << 10) |
426 | #define ATTR_KILL_SUID (1 << 11) | 426 | #define ATTR_KILL_SUID (1 << 11) |
427 | #define ATTR_KILL_SGID (1 << 12) | 427 | #define ATTR_KILL_SGID (1 << 12) |
428 | #define ATTR_FILE (1 << 13) | 428 | #define ATTR_FILE (1 << 13) |
429 | #define ATTR_KILL_PRIV (1 << 14) | 429 | #define ATTR_KILL_PRIV (1 << 14) |
430 | #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ | 430 | #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ |
431 | #define ATTR_TIMES_SET (1 << 16) | 431 | #define ATTR_TIMES_SET (1 << 16) |
432 | 432 | ||
433 | /* | 433 | /* |
434 | * This is the Inode Attributes structure, used for notify_change(). It | 434 | * This is the Inode Attributes structure, used for notify_change(). It |
435 | * uses the above definitions as flags, to know which values have changed. | 435 | * uses the above definitions as flags, to know which values have changed. |
436 | * Also, in this manner, a Filesystem can look at only the values it cares | 436 | * Also, in this manner, a Filesystem can look at only the values it cares |
437 | * about. Basically, these are the attributes that the VFS layer can | 437 | * about. Basically, these are the attributes that the VFS layer can |
438 | * request to change from the FS layer. | 438 | * request to change from the FS layer. |
439 | * | 439 | * |
440 | * Derek Atkins <warlord@MIT.EDU> 94-10-20 | 440 | * Derek Atkins <warlord@MIT.EDU> 94-10-20 |
441 | */ | 441 | */ |
442 | struct iattr { | 442 | struct iattr { |
443 | unsigned int ia_valid; | 443 | unsigned int ia_valid; |
444 | umode_t ia_mode; | 444 | umode_t ia_mode; |
445 | uid_t ia_uid; | 445 | uid_t ia_uid; |
446 | gid_t ia_gid; | 446 | gid_t ia_gid; |
447 | loff_t ia_size; | 447 | loff_t ia_size; |
448 | struct timespec ia_atime; | 448 | struct timespec ia_atime; |
449 | struct timespec ia_mtime; | 449 | struct timespec ia_mtime; |
450 | struct timespec ia_ctime; | 450 | struct timespec ia_ctime; |
451 | 451 | ||
452 | /* | 452 | /* |
453 | * Not an attribute, but auxiliary info for filesystems wanting to | 453 | * Not an attribute, but auxiliary info for filesystems wanting to |
454 | * implement an ftruncate()-like method. NOTE: the filesystem should | 454 | * implement an ftruncate()-like method. NOTE: the filesystem should |
455 | * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). | 455 | * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). |
456 | */ | 456 | */ |
457 | struct file *ia_file; | 457 | struct file *ia_file; |
458 | }; | 458 | }; |
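
As a sketch of how this structure travels, a truncate-to-zero request sets ATTR_SIZE (plus the time flags) in ia_valid and hands the iattr to the filesystem through notify_change(); notify_change() fills in the actual timestamps when ATTR_CTIME/ATTR_MTIME are set without the *_SET variants. The helper name below is hypothetical, modeled on do_truncate():

/* Sketch, kernel side: ask a filesystem to truncate a file to zero.
 * Assumes the caller holds i_mutex, as notify_change() requires. */
static int my_truncate_to_zero(struct dentry *dentry)
{
	struct iattr newattrs;

	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME;
	newattrs.ia_size = 0;
	return notify_change(dentry, &newattrs);
}
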
459 | 459 | ||
460 | /* | 460 | /* |
461 | * Includes for diskquotas. | 461 | * Includes for diskquotas. |
462 | */ | 462 | */ |
463 | #include <linux/quota.h> | 463 | #include <linux/quota.h> |
464 | 464 | ||
465 | /** | 465 | /** |
466 | * enum positive_aop_returns - aop return codes with specific semantics | 466 | * enum positive_aop_returns - aop return codes with specific semantics |
467 | * | 467 | * |
468 | * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has | 468 | * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has |
469 | * completed, that the page is still locked, and | 469 | * completed, that the page is still locked, and |
470 | * should be considered active. The VM uses this hint | 470 | * should be considered active. The VM uses this hint |
471 | * to return the page to the active list -- it won't | 471 | * to return the page to the active list -- it won't |
472 | * be a candidate for writeback again in the near | 472 | * be a candidate for writeback again in the near |
473 | * future. Other callers must be careful to unlock | 473 | * future. Other callers must be careful to unlock |
474 | * the page if they get this return. Returned by | 474 | * the page if they get this return. Returned by |
475 | * writepage(). | 475 | * writepage(). |
476 | * | 476 | * |
477 | * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has | 477 | * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has |
478 | * unlocked it and the page might have been truncated. | 478 | * unlocked it and the page might have been truncated. |
479 | * The caller should back up to acquiring a new page and | 479 | * The caller should back up to acquiring a new page and |
480 | * trying again. The aop will be taking reasonable | 480 | * trying again. The aop will be taking reasonable |
481 | * precautions not to livelock. If the caller held a page | 481 | * precautions not to livelock. If the caller held a page |
482 | * reference, it should drop it before retrying. Returned | 482 | * reference, it should drop it before retrying. Returned |
483 | * by readpage(). | 483 | * by readpage(). |
484 | * | 484 | * |
485 | * address_space_operation functions return these large constants to indicate | 485 | * address_space_operation functions return these large constants to indicate |
486 | * special semantics to the caller. These are much larger than the bytes in a | 486 | * special semantics to the caller. These are much larger than the bytes in a |
487 | * page to allow for functions that return the number of bytes operated on in a | 487 | * page to allow for functions that return the number of bytes operated on in a |
488 | * given page. | 488 | * given page. |
489 | */ | 489 | */ |
490 | 490 | ||
491 | enum positive_aop_returns { | 491 | enum positive_aop_returns { |
492 | AOP_WRITEPAGE_ACTIVATE = 0x80000, | 492 | AOP_WRITEPAGE_ACTIVATE = 0x80000, |
493 | AOP_TRUNCATED_PAGE = 0x80001, | 493 | AOP_TRUNCATED_PAGE = 0x80001, |
494 | }; | 494 | }; |
495 | 495 | ||
496 | #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ | 496 | #define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ |
497 | #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ | 497 | #define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ |
498 | #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct | 498 | #define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct |
499 | * helper code (eg buffer layer) | 499 | * helper code (eg buffer layer) |
500 | * to clear GFP_FS from alloc */ | 500 | * to clear GFP_FS from alloc */ |
501 | 501 | ||
502 | /* | 502 | /* |
503 | * oh the beauties of C type declarations. | 503 | * oh the beauties of C type declarations. |
504 | */ | 504 | */ |
505 | struct page; | 505 | struct page; |
506 | struct address_space; | 506 | struct address_space; |
507 | struct writeback_control; | 507 | struct writeback_control; |
508 | 508 | ||
509 | struct iov_iter { | 509 | struct iov_iter { |
510 | const struct iovec *iov; | 510 | const struct iovec *iov; |
511 | unsigned long nr_segs; | 511 | unsigned long nr_segs; |
512 | size_t iov_offset; | 512 | size_t iov_offset; |
513 | size_t count; | 513 | size_t count; |
514 | }; | 514 | }; |
515 | 515 | ||
516 | size_t iov_iter_copy_from_user_atomic(struct page *page, | 516 | size_t iov_iter_copy_from_user_atomic(struct page *page, |
517 | struct iov_iter *i, unsigned long offset, size_t bytes); | 517 | struct iov_iter *i, unsigned long offset, size_t bytes); |
518 | size_t iov_iter_copy_from_user(struct page *page, | 518 | size_t iov_iter_copy_from_user(struct page *page, |
519 | struct iov_iter *i, unsigned long offset, size_t bytes); | 519 | struct iov_iter *i, unsigned long offset, size_t bytes); |
520 | void iov_iter_advance(struct iov_iter *i, size_t bytes); | 520 | void iov_iter_advance(struct iov_iter *i, size_t bytes); |
521 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); | 521 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); |
522 | size_t iov_iter_single_seg_count(struct iov_iter *i); | 522 | size_t iov_iter_single_seg_count(struct iov_iter *i); |
523 | 523 | ||
524 | static inline void iov_iter_init(struct iov_iter *i, | 524 | static inline void iov_iter_init(struct iov_iter *i, |
525 | const struct iovec *iov, unsigned long nr_segs, | 525 | const struct iovec *iov, unsigned long nr_segs, |
526 | size_t count, size_t written) | 526 | size_t count, size_t written) |
527 | { | 527 | { |
528 | i->iov = iov; | 528 | i->iov = iov; |
529 | i->nr_segs = nr_segs; | 529 | i->nr_segs = nr_segs; |
530 | i->iov_offset = 0; | 530 | i->iov_offset = 0; |
531 | i->count = count + written; | 531 | i->count = count + written; |
532 | 532 | ||
533 | iov_iter_advance(i, written); | 533 | iov_iter_advance(i, written); |
534 | } | 534 | } |
535 | 535 | ||
536 | static inline size_t iov_iter_count(struct iov_iter *i) | 536 | static inline size_t iov_iter_count(struct iov_iter *i) |
537 | { | 537 | { |
538 | return i->count; | 538 | return i->count; |
539 | } | 539 | } |
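
Tying these together, a caller wraps a user iovec array in an iov_iter and then pulls bytes out of it page by page, which is the pattern the generic write path uses. A minimal single-segment sketch under those assumptions (copy_one_iovec is an illustrative name):

/* Sketch: copy one iovec's worth of user data into the start of a page. */
static size_t copy_one_iovec(struct page *page, const struct iovec *iov)
{
	struct iov_iter i;

	iov_iter_init(&i, iov, 1, iov->iov_len, 0);	/* nothing written yet */
	return iov_iter_copy_from_user(page, &i, 0, iov_iter_count(&i));
}
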
540 | 540 | ||
541 | /* | 541 | /* |
542 | * "descriptor" for what we're up to with a read. | 542 | * "descriptor" for what we're up to with a read. |
543 | * This allows us to use the same read code yet | 543 | * This allows us to use the same read code yet |
544 | * have multiple different users of the data that | 544 | * have multiple different users of the data that |
545 | * we read from a file. | 545 | * we read from a file. |
546 | * | 546 | * |
547 | * The simplest case just copies the data to user | 547 | * The simplest case just copies the data to user |
548 | * mode. | 548 | * mode. |
549 | */ | 549 | */ |
550 | typedef struct { | 550 | typedef struct { |
551 | size_t written; | 551 | size_t written; |
552 | size_t count; | 552 | size_t count; |
553 | union { | 553 | union { |
554 | char __user *buf; | 554 | char __user *buf; |
555 | void *data; | 555 | void *data; |
556 | } arg; | 556 | } arg; |
557 | int error; | 557 | int error; |
558 | } read_descriptor_t; | 558 | } read_descriptor_t; |
559 | 559 | ||
560 | typedef int (*read_actor_t)(read_descriptor_t *, struct page *, | 560 | typedef int (*read_actor_t)(read_descriptor_t *, struct page *, |
561 | unsigned long, unsigned long); | 561 | unsigned long, unsigned long); |
562 | 562 | ||
563 | struct address_space_operations { | 563 | struct address_space_operations { |
564 | int (*writepage)(struct page *page, struct writeback_control *wbc); | 564 | int (*writepage)(struct page *page, struct writeback_control *wbc); |
565 | int (*readpage)(struct file *, struct page *); | 565 | int (*readpage)(struct file *, struct page *); |
566 | void (*sync_page)(struct page *); | 566 | void (*sync_page)(struct page *); |
567 | 567 | ||
568 | /* Write back some dirty pages from this mapping. */ | 568 | /* Write back some dirty pages from this mapping. */ |
569 | int (*writepages)(struct address_space *, struct writeback_control *); | 569 | int (*writepages)(struct address_space *, struct writeback_control *); |
570 | 570 | ||
571 | /* Set a page dirty. Return true if this dirtied it */ | 571 | /* Set a page dirty. Return true if this dirtied it */ |
572 | int (*set_page_dirty)(struct page *page); | 572 | int (*set_page_dirty)(struct page *page); |
573 | 573 | ||
574 | int (*readpages)(struct file *filp, struct address_space *mapping, | 574 | int (*readpages)(struct file *filp, struct address_space *mapping, |
575 | struct list_head *pages, unsigned nr_pages); | 575 | struct list_head *pages, unsigned nr_pages); |
576 | 576 | ||
577 | int (*write_begin)(struct file *, struct address_space *mapping, | 577 | int (*write_begin)(struct file *, struct address_space *mapping, |
578 | loff_t pos, unsigned len, unsigned flags, | 578 | loff_t pos, unsigned len, unsigned flags, |
579 | struct page **pagep, void **fsdata); | 579 | struct page **pagep, void **fsdata); |
580 | int (*write_end)(struct file *, struct address_space *mapping, | 580 | int (*write_end)(struct file *, struct address_space *mapping, |
581 | loff_t pos, unsigned len, unsigned copied, | 581 | loff_t pos, unsigned len, unsigned copied, |
582 | struct page *page, void *fsdata); | 582 | struct page *page, void *fsdata); |
583 | 583 | ||
584 | /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ | 584 | /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ |
585 | sector_t (*bmap)(struct address_space *, sector_t); | 585 | sector_t (*bmap)(struct address_space *, sector_t); |
586 | void (*invalidatepage) (struct page *, unsigned long); | 586 | void (*invalidatepage) (struct page *, unsigned long); |
587 | int (*releasepage) (struct page *, gfp_t); | 587 | int (*releasepage) (struct page *, gfp_t); |
588 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 588 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
589 | loff_t offset, unsigned long nr_segs); | 589 | loff_t offset, unsigned long nr_segs); |
590 | int (*get_xip_mem)(struct address_space *, pgoff_t, int, | 590 | int (*get_xip_mem)(struct address_space *, pgoff_t, int, |
591 | void **, unsigned long *); | 591 | void **, unsigned long *); |
592 | /* migrate the contents of a page to the specified target */ | 592 | /* migrate the contents of a page to the specified target */ |
593 | int (*migratepage) (struct address_space *, | 593 | int (*migratepage) (struct address_space *, |
594 | struct page *, struct page *); | 594 | struct page *, struct page *); |
595 | int (*launder_page) (struct page *); | 595 | int (*launder_page) (struct page *); |
596 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, | 596 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, |
597 | unsigned long); | 597 | unsigned long); |
598 | }; | 598 | }; |
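
Most filesystems fill in only a handful of these methods. A sketch of about the smallest useful aops, for a read-only filesystem that routes readpage through the generic mpage helper; myfs_* and my_get_block (the filesystem's get_block_t block-mapping callback) are assumed names:

/* Sketch: minimal address_space_operations for a read-only filesystem. */
static int myfs_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, my_get_block);
}

static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,
};
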
599 | 599 | ||
600 | /* | 600 | /* |
601 | * pagecache_write_begin/pagecache_write_end must be used by general code | 601 | * pagecache_write_begin/pagecache_write_end must be used by general code |
602 | * to write into the pagecache. | 602 | * to write into the pagecache. |
603 | */ | 603 | */ |
604 | int pagecache_write_begin(struct file *, struct address_space *mapping, | 604 | int pagecache_write_begin(struct file *, struct address_space *mapping, |
605 | loff_t pos, unsigned len, unsigned flags, | 605 | loff_t pos, unsigned len, unsigned flags, |
606 | struct page **pagep, void **fsdata); | 606 | struct page **pagep, void **fsdata); |
607 | 607 | ||
608 | int pagecache_write_end(struct file *, struct address_space *mapping, | 608 | int pagecache_write_end(struct file *, struct address_space *mapping, |
609 | loff_t pos, unsigned len, unsigned copied, | 609 | loff_t pos, unsigned len, unsigned copied, |
610 | struct page *page, void *fsdata); | 610 | struct page *page, void *fsdata); |
611 | 611 | ||
612 | struct backing_dev_info; | 612 | struct backing_dev_info; |
613 | struct address_space { | 613 | struct address_space { |
614 | struct inode *host; /* owner: inode, block_device */ | 614 | struct inode *host; /* owner: inode, block_device */ |
615 | struct radix_tree_root page_tree; /* radix tree of all pages */ | 615 | struct radix_tree_root page_tree; /* radix tree of all pages */ |
616 | spinlock_t tree_lock; /* and lock protecting it */ | 616 | spinlock_t tree_lock; /* and lock protecting it */ |
617 | unsigned int i_mmap_writable;/* count VM_SHARED mappings */ | 617 | unsigned int i_mmap_writable;/* count VM_SHARED mappings */ |
618 | struct prio_tree_root i_mmap; /* tree of private and shared mappings */ | 618 | struct prio_tree_root i_mmap; /* tree of private and shared mappings */ |
619 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ | 619 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ |
620 | spinlock_t i_mmap_lock; /* protect tree, count, list */ | 620 | spinlock_t i_mmap_lock; /* protect tree, count, list */ |
621 | unsigned int truncate_count; /* Cover race condition with truncate */ | 621 | unsigned int truncate_count; /* Cover race condition with truncate */ |
622 | unsigned long nrpages; /* number of total pages */ | 622 | unsigned long nrpages; /* number of total pages */ |
623 | pgoff_t writeback_index;/* writeback starts here */ | 623 | pgoff_t writeback_index;/* writeback starts here */ |
624 | const struct address_space_operations *a_ops; /* methods */ | 624 | const struct address_space_operations *a_ops; /* methods */ |
625 | unsigned long flags; /* error bits/gfp mask */ | 625 | unsigned long flags; /* error bits/gfp mask */ |
626 | struct backing_dev_info *backing_dev_info; /* device readahead, etc */ | 626 | struct backing_dev_info *backing_dev_info; /* device readahead, etc */ |
627 | spinlock_t private_lock; /* for use by the address_space */ | 627 | spinlock_t private_lock; /* for use by the address_space */ |
628 | struct list_head private_list; /* ditto */ | 628 | struct list_head private_list; /* ditto */ |
629 | struct address_space *assoc_mapping; /* ditto */ | 629 | struct address_space *assoc_mapping; /* ditto */ |
630 | } __attribute__((aligned(sizeof(long)))); | 630 | } __attribute__((aligned(sizeof(long)))); |
631 | /* | 631 | /* |
632 | * On most architectures that alignment is already the case; but | 632 | * On most architectures that alignment is already the case; but |
633 | * must be enforced here for CRIS, to let the least significant bit | 633 | * must be enforced here for CRIS, to let the least significant bit |
634 | * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. | 634 | * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. |
635 | */ | 635 | */ |
636 | 636 | ||
637 | struct block_device { | 637 | struct block_device { |
638 | dev_t bd_dev; /* not a kdev_t - it's a search key */ | 638 | dev_t bd_dev; /* not a kdev_t - it's a search key */ |
639 | struct inode * bd_inode; /* will die */ | 639 | struct inode * bd_inode; /* will die */ |
640 | struct super_block * bd_super; | 640 | struct super_block * bd_super; |
641 | int bd_openers; | 641 | int bd_openers; |
642 | struct mutex bd_mutex; /* open/close mutex */ | 642 | struct mutex bd_mutex; /* open/close mutex */ |
643 | struct semaphore bd_mount_sem; | 643 | struct semaphore bd_mount_sem; |
644 | struct list_head bd_inodes; | 644 | struct list_head bd_inodes; |
645 | void * bd_holder; | 645 | void * bd_holder; |
646 | int bd_holders; | 646 | int bd_holders; |
647 | #ifdef CONFIG_SYSFS | 647 | #ifdef CONFIG_SYSFS |
648 | struct list_head bd_holder_list; | 648 | struct list_head bd_holder_list; |
649 | #endif | 649 | #endif |
650 | struct block_device * bd_contains; | 650 | struct block_device * bd_contains; |
651 | unsigned bd_block_size; | 651 | unsigned bd_block_size; |
652 | struct hd_struct * bd_part; | 652 | struct hd_struct * bd_part; |
653 | /* number of times partitions within this device have been opened. */ | 653 | /* number of times partitions within this device have been opened. */ |
654 | unsigned bd_part_count; | 654 | unsigned bd_part_count; |
655 | int bd_invalidated; | 655 | int bd_invalidated; |
656 | struct gendisk * bd_disk; | 656 | struct gendisk * bd_disk; |
657 | struct list_head bd_list; | 657 | struct list_head bd_list; |
658 | struct backing_dev_info *bd_inode_backing_dev_info; | 658 | struct backing_dev_info *bd_inode_backing_dev_info; |
659 | /* | 659 | /* |
660 | * Private data. You must have bd_claim'ed the block_device | 660 | * Private data. You must have bd_claim'ed the block_device |
661 | * to use this. NOTE: bd_claim allows an owner to claim | 661 | * to use this. NOTE: bd_claim allows an owner to claim |
662 | * the same device multiple times, the owner must take special | 662 | * the same device multiple times, the owner must take special |
663 | * care to not mess up bd_private for that case. | 663 | * care to not mess up bd_private for that case. |
664 | */ | 664 | */ |
665 | unsigned long bd_private; | 665 | unsigned long bd_private; |
666 | 666 | ||
667 | /* The counter of freeze processes */ | 667 | /* The counter of freeze processes */ |
668 | int bd_fsfreeze_count; | 668 | int bd_fsfreeze_count; |
669 | /* Mutex for freeze */ | 669 | /* Mutex for freeze */ |
670 | struct mutex bd_fsfreeze_mutex; | 670 | struct mutex bd_fsfreeze_mutex; |
671 | }; | 671 | }; |
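
Note that bd_inode points at the device's backing inode, whose i_count is an atomic_t. That is what lets a reference to a block_device be duplicated without sleeping, which is the point of the bdgrab() helper this commit introduces: a plain atomic_inc, valid only while the caller already holds a reference (or a lock that excludes teardown, e.g. swap_lock versus swapoff). A sketch of the pattern:

/* Sketch of the non-sleeping reference copy (cf. the new bdgrab()). */
static struct block_device *my_bdgrab(struct block_device *bdev)
{
	atomic_inc(&bdev->bd_inode->i_count);
	return bdev;
}
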
672 | 672 | ||
673 | /* | 673 | /* |
674 | * Radix-tree tags, for tagging dirty and writeback pages within the pagecache | 674 | * Radix-tree tags, for tagging dirty and writeback pages within the pagecache |
675 | * radix trees | 675 | * radix trees |
676 | */ | 676 | */ |
677 | #define PAGECACHE_TAG_DIRTY 0 | 677 | #define PAGECACHE_TAG_DIRTY 0 |
678 | #define PAGECACHE_TAG_WRITEBACK 1 | 678 | #define PAGECACHE_TAG_WRITEBACK 1 |
679 | 679 | ||
680 | int mapping_tagged(struct address_space *mapping, int tag); | 680 | int mapping_tagged(struct address_space *mapping, int tag); |
681 | 681 | ||
682 | /* | 682 | /* |
683 | * Might pages of this file be mapped into userspace? | 683 | * Might pages of this file be mapped into userspace? |
684 | */ | 684 | */ |
685 | static inline int mapping_mapped(struct address_space *mapping) | 685 | static inline int mapping_mapped(struct address_space *mapping) |
686 | { | 686 | { |
687 | return !prio_tree_empty(&mapping->i_mmap) || | 687 | return !prio_tree_empty(&mapping->i_mmap) || |
688 | !list_empty(&mapping->i_mmap_nonlinear); | 688 | !list_empty(&mapping->i_mmap_nonlinear); |
689 | } | 689 | } |
690 | 690 | ||
691 | /* | 691 | /* |
692 | * Might pages of this file have been modified in userspace? | 692 | * Might pages of this file have been modified in userspace? |
693 | * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff | 693 | * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff |
694 | * marks vma as VM_SHARED if it is shared, and the file was opened for | 694 | * marks vma as VM_SHARED if it is shared, and the file was opened for |
695 | * writing, i.e. the vma may be mprotected writable even if it is now readonly. | 695 | * writing, i.e. the vma may be mprotected writable even if it is now readonly. |
696 | */ | 696 | */ |
697 | static inline int mapping_writably_mapped(struct address_space *mapping) | 697 | static inline int mapping_writably_mapped(struct address_space *mapping) |
698 | { | 698 | { |
699 | return mapping->i_mmap_writable != 0; | 699 | return mapping->i_mmap_writable != 0; |
700 | } | 700 | } |
701 | 701 | ||
702 | /* | 702 | /* |
703 | * Use sequence counter to get consistent i_size on 32-bit processors. | 703 | * Use sequence counter to get consistent i_size on 32-bit processors. |
704 | */ | 704 | */ |
705 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) | 705 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) |
706 | #include <linux/seqlock.h> | 706 | #include <linux/seqlock.h> |
707 | #define __NEED_I_SIZE_ORDERED | 707 | #define __NEED_I_SIZE_ORDERED |
708 | #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) | 708 | #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) |
709 | #else | 709 | #else |
710 | #define i_size_ordered_init(inode) do { } while (0) | 710 | #define i_size_ordered_init(inode) do { } while (0) |
711 | #endif | 711 | #endif |
712 | 712 | ||
713 | struct posix_acl; | 713 | struct posix_acl; |
714 | #define ACL_NOT_CACHED ((void *)(-1)) | 714 | #define ACL_NOT_CACHED ((void *)(-1)) |
715 | 715 | ||
716 | struct inode { | 716 | struct inode { |
717 | struct hlist_node i_hash; | 717 | struct hlist_node i_hash; |
718 | struct list_head i_list; | 718 | struct list_head i_list; |
719 | struct list_head i_sb_list; | 719 | struct list_head i_sb_list; |
720 | struct list_head i_dentry; | 720 | struct list_head i_dentry; |
721 | unsigned long i_ino; | 721 | unsigned long i_ino; |
722 | atomic_t i_count; | 722 | atomic_t i_count; |
723 | unsigned int i_nlink; | 723 | unsigned int i_nlink; |
724 | uid_t i_uid; | 724 | uid_t i_uid; |
725 | gid_t i_gid; | 725 | gid_t i_gid; |
726 | dev_t i_rdev; | 726 | dev_t i_rdev; |
727 | u64 i_version; | 727 | u64 i_version; |
728 | loff_t i_size; | 728 | loff_t i_size; |
729 | #ifdef __NEED_I_SIZE_ORDERED | 729 | #ifdef __NEED_I_SIZE_ORDERED |
730 | seqcount_t i_size_seqcount; | 730 | seqcount_t i_size_seqcount; |
731 | #endif | 731 | #endif |
732 | struct timespec i_atime; | 732 | struct timespec i_atime; |
733 | struct timespec i_mtime; | 733 | struct timespec i_mtime; |
734 | struct timespec i_ctime; | 734 | struct timespec i_ctime; |
735 | blkcnt_t i_blocks; | 735 | blkcnt_t i_blocks; |
736 | unsigned int i_blkbits; | 736 | unsigned int i_blkbits; |
737 | unsigned short i_bytes; | 737 | unsigned short i_bytes; |
738 | umode_t i_mode; | 738 | umode_t i_mode; |
739 | spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ | 739 | spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ |
740 | struct mutex i_mutex; | 740 | struct mutex i_mutex; |
741 | struct rw_semaphore i_alloc_sem; | 741 | struct rw_semaphore i_alloc_sem; |
742 | const struct inode_operations *i_op; | 742 | const struct inode_operations *i_op; |
743 | const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ | 743 | const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ |
744 | struct super_block *i_sb; | 744 | struct super_block *i_sb; |
745 | struct file_lock *i_flock; | 745 | struct file_lock *i_flock; |
746 | struct address_space *i_mapping; | 746 | struct address_space *i_mapping; |
747 | struct address_space i_data; | 747 | struct address_space i_data; |
748 | #ifdef CONFIG_QUOTA | 748 | #ifdef CONFIG_QUOTA |
749 | struct dquot *i_dquot[MAXQUOTAS]; | 749 | struct dquot *i_dquot[MAXQUOTAS]; |
750 | #endif | 750 | #endif |
751 | struct list_head i_devices; | 751 | struct list_head i_devices; |
752 | union { | 752 | union { |
753 | struct pipe_inode_info *i_pipe; | 753 | struct pipe_inode_info *i_pipe; |
754 | struct block_device *i_bdev; | 754 | struct block_device *i_bdev; |
755 | struct cdev *i_cdev; | 755 | struct cdev *i_cdev; |
756 | }; | 756 | }; |
757 | 757 | ||
758 | __u32 i_generation; | 758 | __u32 i_generation; |
759 | 759 | ||
760 | #ifdef CONFIG_FSNOTIFY | 760 | #ifdef CONFIG_FSNOTIFY |
761 | __u32 i_fsnotify_mask; /* all events this inode cares about */ | 761 | __u32 i_fsnotify_mask; /* all events this inode cares about */ |
762 | struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ | 762 | struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ |
763 | #endif | 763 | #endif |
764 | 764 | ||
765 | #ifdef CONFIG_INOTIFY | 765 | #ifdef CONFIG_INOTIFY |
766 | struct list_head inotify_watches; /* watches on this inode */ | 766 | struct list_head inotify_watches; /* watches on this inode */ |
767 | struct mutex inotify_mutex; /* protects the watches list */ | 767 | struct mutex inotify_mutex; /* protects the watches list */ |
768 | #endif | 768 | #endif |
769 | 769 | ||
770 | unsigned long i_state; | 770 | unsigned long i_state; |
771 | unsigned long dirtied_when; /* jiffies of first dirtying */ | 771 | unsigned long dirtied_when; /* jiffies of first dirtying */ |
772 | 772 | ||
773 | unsigned int i_flags; | 773 | unsigned int i_flags; |
774 | 774 | ||
775 | atomic_t i_writecount; | 775 | atomic_t i_writecount; |
776 | #ifdef CONFIG_SECURITY | 776 | #ifdef CONFIG_SECURITY |
777 | void *i_security; | 777 | void *i_security; |
778 | #endif | 778 | #endif |
779 | #ifdef CONFIG_FS_POSIX_ACL | 779 | #ifdef CONFIG_FS_POSIX_ACL |
780 | struct posix_acl *i_acl; | 780 | struct posix_acl *i_acl; |
781 | struct posix_acl *i_default_acl; | 781 | struct posix_acl *i_default_acl; |
782 | #endif | 782 | #endif |
783 | void *i_private; /* fs or device private pointer */ | 783 | void *i_private; /* fs or device private pointer */ |
784 | }; | 784 | }; |
785 | 785 | ||
786 | /* | 786 | /* |
787 | * inode->i_mutex nesting subclasses for the lock validator: | 787 | * inode->i_mutex nesting subclasses for the lock validator: |
788 | * | 788 | * |
789 | * 0: the object of the current VFS operation | 789 | * 0: the object of the current VFS operation |
790 | * 1: parent | 790 | * 1: parent |
791 | * 2: child/target | 791 | * 2: child/target |
792 | * 3: quota file | 792 | * 3: quota file |
793 | * | 793 | * |
794 | * The locking order between these classes is | 794 | * The locking order between these classes is |
795 | * parent -> child -> normal -> xattr -> quota | 795 | * parent -> child -> normal -> xattr -> quota |
796 | */ | 796 | */ |
797 | enum inode_i_mutex_lock_class | 797 | enum inode_i_mutex_lock_class |
798 | { | 798 | { |
799 | I_MUTEX_NORMAL, | 799 | I_MUTEX_NORMAL, |
800 | I_MUTEX_PARENT, | 800 | I_MUTEX_PARENT, |
801 | I_MUTEX_CHILD, | 801 | I_MUTEX_CHILD, |
802 | I_MUTEX_XATTR, | 802 | I_MUTEX_XATTR, |
803 | I_MUTEX_QUOTA | 803 | I_MUTEX_QUOTA |
804 | }; | 804 | }; |
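
These values are lockdep subclasses: code that legitimately holds two i_mutexes at once passes them to mutex_lock_nested() so the validator can tell the nesting apart from a real deadlock. A hypothetical sketch of the parent/child case:

/* Sketch: lock a directory and then a child inode with distinct
 * subclasses so lockdep accepts the parent -> child ordering. */
static void lock_parent_child(struct inode *dir, struct inode *inode)
{
	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
}
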
805 | 805 | ||
806 | /* | 806 | /* |
807 | * NOTE: on a 32bit arch with a preemptible kernel and | 807 | * NOTE: on a 32bit arch with a preemptible kernel and |
808 | * a UP compile, the i_size_read/write must be atomic | 808 | * a UP compile, the i_size_read/write must be atomic |
809 | * with respect to the local cpu (unlike with preempt disabled), | 809 | * with respect to the local cpu (unlike with preempt disabled), |
810 | * but they don't need to be atomic with respect to other cpus like in | 810 | * but they don't need to be atomic with respect to other cpus like in |
811 | * true SMP (so they need to either locally disable irq around | 811 | * true SMP (so they need to either locally disable irq around |
812 | * the read or, for example on x86, they can still be implemented as a | 812 | * the read or, for example on x86, they can still be implemented as a |
813 | * cmpxchg8b without the need of the lock prefix). For SMP compiles | 813 | * cmpxchg8b without the need of the lock prefix). For SMP compiles |
814 | * and 64bit archs it makes no difference if preempt is enabled or not. | 814 | * and 64bit archs it makes no difference if preempt is enabled or not. |
815 | */ | 815 | */ |
816 | static inline loff_t i_size_read(const struct inode *inode) | 816 | static inline loff_t i_size_read(const struct inode *inode) |
817 | { | 817 | { |
818 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) | 818 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) |
819 | loff_t i_size; | 819 | loff_t i_size; |
820 | unsigned int seq; | 820 | unsigned int seq; |
821 | 821 | ||
822 | do { | 822 | do { |
823 | seq = read_seqcount_begin(&inode->i_size_seqcount); | 823 | seq = read_seqcount_begin(&inode->i_size_seqcount); |
824 | i_size = inode->i_size; | 824 | i_size = inode->i_size; |
825 | } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); | 825 | } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); |
826 | return i_size; | 826 | return i_size; |
827 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) | 827 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) |
828 | loff_t i_size; | 828 | loff_t i_size; |
829 | 829 | ||
830 | preempt_disable(); | 830 | preempt_disable(); |
831 | i_size = inode->i_size; | 831 | i_size = inode->i_size; |
832 | preempt_enable(); | 832 | preempt_enable(); |
833 | return i_size; | 833 | return i_size; |
834 | #else | 834 | #else |
835 | return inode->i_size; | 835 | return inode->i_size; |
836 | #endif | 836 | #endif |
837 | } | 837 | } |
838 | 838 | ||
839 | /* | 839 | /* |
840 | * NOTE: unlike i_size_read(), i_size_write() does need locking around it | 840 | * NOTE: unlike i_size_read(), i_size_write() does need locking around it |
841 | * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount | 841 | * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount |
842 | * can be lost, resulting in subsequent i_size_read() calls spinning forever. | 842 | * can be lost, resulting in subsequent i_size_read() calls spinning forever. |
843 | */ | 843 | */ |
844 | static inline void i_size_write(struct inode *inode, loff_t i_size) | 844 | static inline void i_size_write(struct inode *inode, loff_t i_size) |
845 | { | 845 | { |
846 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) | 846 | #if BITS_PER_LONG==32 && defined(CONFIG_SMP) |
847 | write_seqcount_begin(&inode->i_size_seqcount); | 847 | write_seqcount_begin(&inode->i_size_seqcount); |
848 | inode->i_size = i_size; | 848 | inode->i_size = i_size; |
849 | write_seqcount_end(&inode->i_size_seqcount); | 849 | write_seqcount_end(&inode->i_size_seqcount); |
850 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) | 850 | #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) |
851 | preempt_disable(); | 851 | preempt_disable(); |
852 | inode->i_size = i_size; | 852 | inode->i_size = i_size; |
853 | preempt_enable(); | 853 | preempt_enable(); |
854 | #else | 854 | #else |
855 | inode->i_size = i_size; | 855 | inode->i_size = i_size; |
856 | #endif | 856 | #endif |
857 | } | 857 | } |
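
Putting the two halves together: readers may call i_size_read() locklessly, but every writer must serialize (normally on i_mutex) so the seqcount update above cannot be lost. A sketch of the correct pairing (my_extend is an illustrative name):

/* Sketch: extend i_size under i_mutex while readers stay lockless. */
static void my_extend(struct inode *inode, loff_t new_size)
{
	mutex_lock(&inode->i_mutex);
	if (new_size > i_size_read(inode))
		i_size_write(inode, new_size);
	mutex_unlock(&inode->i_mutex);
}
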
858 | 858 | ||
859 | static inline unsigned iminor(const struct inode *inode) | 859 | static inline unsigned iminor(const struct inode *inode) |
860 | { | 860 | { |
861 | return MINOR(inode->i_rdev); | 861 | return MINOR(inode->i_rdev); |
862 | } | 862 | } |
863 | 863 | ||
864 | static inline unsigned imajor(const struct inode *inode) | 864 | static inline unsigned imajor(const struct inode *inode) |
865 | { | 865 | { |
866 | return MAJOR(inode->i_rdev); | 866 | return MAJOR(inode->i_rdev); |
867 | } | 867 | } |
868 | 868 | ||
869 | extern struct block_device *I_BDEV(struct inode *inode); | 869 | extern struct block_device *I_BDEV(struct inode *inode); |
870 | 870 | ||
871 | struct fown_struct { | 871 | struct fown_struct { |
872 | rwlock_t lock; /* protects pid, uid, euid fields */ | 872 | rwlock_t lock; /* protects pid, uid, euid fields */ |
873 | struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ | 873 | struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ |
874 | enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ | 874 | enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ |
875 | uid_t uid, euid; /* uid/euid of process setting the owner */ | 875 | uid_t uid, euid; /* uid/euid of process setting the owner */ |
876 | int signum; /* posix.1b rt signal to be delivered on IO */ | 876 | int signum; /* posix.1b rt signal to be delivered on IO */ |
877 | }; | 877 | }; |
878 | 878 | ||
879 | /* | 879 | /* |
880 | * Track a single file's readahead state | 880 | * Track a single file's readahead state |
881 | */ | 881 | */ |
882 | struct file_ra_state { | 882 | struct file_ra_state { |
883 | pgoff_t start; /* where readahead started */ | 883 | pgoff_t start; /* where readahead started */ |
884 | unsigned int size; /* # of readahead pages */ | 884 | unsigned int size; /* # of readahead pages */ |
885 | unsigned int async_size; /* do asynchronous readahead when | 885 | unsigned int async_size; /* do asynchronous readahead when |
886 | there are only # of pages ahead */ | 886 | there are only # of pages ahead */ |
887 | 887 | ||
888 | unsigned int ra_pages; /* Maximum readahead window */ | 888 | unsigned int ra_pages; /* Maximum readahead window */ |
889 | unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ | 889 | unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ |
890 | loff_t prev_pos; /* Cache last read() position */ | 890 | loff_t prev_pos; /* Cache last read() position */ |
891 | }; | 891 | }; |
892 | 892 | ||
893 | /* | 893 | /* |
894 | * Check if @index falls in the readahead window. | 894 | * Check if @index falls in the readahead window. |
895 | */ | 895 | */ |
896 | static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) | 896 | static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) |
897 | { | 897 | { |
898 | return (index >= ra->start && | 898 | return (index >= ra->start && |
899 | index < ra->start + ra->size); | 899 | index < ra->start + ra->size); |
900 | } | 900 | } |
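/*
 * Illustrative sketch, not part of this commit: how the file_ra_state
 * fields relate. The current window covers pages [start, start + size);
 * once a reader enters the final async_size pages of it, the next
 * asynchronous readahead should be submitted.
 */
static int example_ra_in_async_tail(struct file_ra_state *ra, pgoff_t index)
{
	return ra_has_index(ra, index) &&
	       index >= ra->start + ra->size - ra->async_size;
}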
901 | 901 | ||
902 | #define FILE_MNT_WRITE_TAKEN 1 | 902 | #define FILE_MNT_WRITE_TAKEN 1 |
903 | #define FILE_MNT_WRITE_RELEASED 2 | 903 | #define FILE_MNT_WRITE_RELEASED 2 |
904 | 904 | ||
905 | struct file { | 905 | struct file { |
906 | /* | 906 | /* |
907 | * fu_list becomes invalid after file_free is called and queued via | 907 | * fu_list becomes invalid after file_free is called and queued via |
908 | * fu_rcuhead for RCU freeing | 908 | * fu_rcuhead for RCU freeing |
909 | */ | 909 | */ |
910 | union { | 910 | union { |
911 | struct list_head fu_list; | 911 | struct list_head fu_list; |
912 | struct rcu_head fu_rcuhead; | 912 | struct rcu_head fu_rcuhead; |
913 | } f_u; | 913 | } f_u; |
914 | struct path f_path; | 914 | struct path f_path; |
915 | #define f_dentry f_path.dentry | 915 | #define f_dentry f_path.dentry |
916 | #define f_vfsmnt f_path.mnt | 916 | #define f_vfsmnt f_path.mnt |
917 | const struct file_operations *f_op; | 917 | const struct file_operations *f_op; |
918 | spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ | 918 | spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ |
919 | atomic_long_t f_count; | 919 | atomic_long_t f_count; |
920 | unsigned int f_flags; | 920 | unsigned int f_flags; |
921 | fmode_t f_mode; | 921 | fmode_t f_mode; |
922 | loff_t f_pos; | 922 | loff_t f_pos; |
923 | struct fown_struct f_owner; | 923 | struct fown_struct f_owner; |
924 | const struct cred *f_cred; | 924 | const struct cred *f_cred; |
925 | struct file_ra_state f_ra; | 925 | struct file_ra_state f_ra; |
926 | 926 | ||
927 | u64 f_version; | 927 | u64 f_version; |
928 | #ifdef CONFIG_SECURITY | 928 | #ifdef CONFIG_SECURITY |
929 | void *f_security; | 929 | void *f_security; |
930 | #endif | 930 | #endif |
931 | /* needed for tty driver, and maybe others */ | 931 | /* needed for tty driver, and maybe others */ |
932 | void *private_data; | 932 | void *private_data; |
933 | 933 | ||
934 | #ifdef CONFIG_EPOLL | 934 | #ifdef CONFIG_EPOLL |
935 | /* Used by fs/eventpoll.c to link all the hooks to this file */ | 935 | /* Used by fs/eventpoll.c to link all the hooks to this file */ |
936 | struct list_head f_ep_links; | 936 | struct list_head f_ep_links; |
937 | #endif /* #ifdef CONFIG_EPOLL */ | 937 | #endif /* #ifdef CONFIG_EPOLL */ |
938 | struct address_space *f_mapping; | 938 | struct address_space *f_mapping; |
939 | #ifdef CONFIG_DEBUG_WRITECOUNT | 939 | #ifdef CONFIG_DEBUG_WRITECOUNT |
940 | unsigned long f_mnt_write_state; | 940 | unsigned long f_mnt_write_state; |
941 | #endif | 941 | #endif |
942 | }; | 942 | }; |
943 | extern spinlock_t files_lock; | 943 | extern spinlock_t files_lock; |
944 | #define file_list_lock() spin_lock(&files_lock); | 944 | #define file_list_lock() spin_lock(&files_lock); |
945 | #define file_list_unlock() spin_unlock(&files_lock); | 945 | #define file_list_unlock() spin_unlock(&files_lock); |
946 | 946 | ||
947 | #define get_file(x) atomic_long_inc(&(x)->f_count) | 947 | #define get_file(x) atomic_long_inc(&(x)->f_count) |
948 | #define file_count(x) atomic_long_read(&(x)->f_count) | 948 | #define file_count(x) atomic_long_read(&(x)->f_count) |
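/*
 * Illustrative sketch, not part of this commit: get_file() only bumps an
 * existing, stable reference; the matching release is fput(). Code that
 * hands a struct file to another context takes its own reference first.
 */
static struct file *example_grab_file(struct file *filp)
{
	get_file(filp);		/* caller must already hold a reference */
	return filp;		/* the new owner is responsible for fput() */
}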
949 | 949 | ||
950 | #ifdef CONFIG_DEBUG_WRITECOUNT | 950 | #ifdef CONFIG_DEBUG_WRITECOUNT |
951 | static inline void file_take_write(struct file *f) | 951 | static inline void file_take_write(struct file *f) |
952 | { | 952 | { |
953 | WARN_ON(f->f_mnt_write_state != 0); | 953 | WARN_ON(f->f_mnt_write_state != 0); |
954 | f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; | 954 | f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; |
955 | } | 955 | } |
956 | static inline void file_release_write(struct file *f) | 956 | static inline void file_release_write(struct file *f) |
957 | { | 957 | { |
958 | f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; | 958 | f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; |
959 | } | 959 | } |
960 | static inline void file_reset_write(struct file *f) | 960 | static inline void file_reset_write(struct file *f) |
961 | { | 961 | { |
962 | f->f_mnt_write_state = 0; | 962 | f->f_mnt_write_state = 0; |
963 | } | 963 | } |
964 | static inline void file_check_state(struct file *f) | 964 | static inline void file_check_state(struct file *f) |
965 | { | 965 | { |
966 | /* | 966 | /* |
967 | * At this point, either both or neither of these bits | 967 | * At this point, either both or neither of these bits |
968 | * should be set. | 968 | * should be set. |
969 | */ | 969 | */ |
970 | WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); | 970 | WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); |
971 | WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); | 971 | WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); |
972 | } | 972 | } |
973 | static inline int file_check_writeable(struct file *f) | 973 | static inline int file_check_writeable(struct file *f) |
974 | { | 974 | { |
975 | if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) | 975 | if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) |
976 | return 0; | 976 | return 0; |
977 | printk(KERN_WARNING "writeable file with no " | 977 | printk(KERN_WARNING "writeable file with no " |
978 | "mnt_want_write()\n"); | 978 | "mnt_want_write()\n"); |
979 | WARN_ON(1); | 979 | WARN_ON(1); |
980 | return -EINVAL; | 980 | return -EINVAL; |
981 | } | 981 | } |
982 | #else /* !CONFIG_DEBUG_WRITECOUNT */ | 982 | #else /* !CONFIG_DEBUG_WRITECOUNT */ |
983 | static inline void file_take_write(struct file *filp) {} | 983 | static inline void file_take_write(struct file *filp) {} |
984 | static inline void file_release_write(struct file *filp) {} | 984 | static inline void file_release_write(struct file *filp) {} |
985 | static inline void file_reset_write(struct file *filp) {} | 985 | static inline void file_reset_write(struct file *filp) {} |
986 | static inline void file_check_state(struct file *filp) {} | 986 | static inline void file_check_state(struct file *filp) {} |
987 | static inline int file_check_writeable(struct file *filp) | 987 | static inline int file_check_writeable(struct file *filp) |
988 | { | 988 | { |
989 | return 0; | 989 | return 0; |
990 | } | 990 | } |
991 | #endif /* CONFIG_DEBUG_WRITECOUNT */ | 991 | #endif /* CONFIG_DEBUG_WRITECOUNT */ |
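/*
 * Illustrative sketch, not part of this commit: the intended lifecycle of
 * the debug state above. A writable open pairs mnt_want_write() (from
 * linux/mount.h) with file_take_write(); __fput() later drops both, and
 * file_check_state() verifies the two bits were set and cleared in
 * matched pairs.
 */
static int example_open_for_write(struct file *filp, struct vfsmount *mnt)
{
	int error = mnt_want_write(mnt);

	if (error)
		return error;
	file_take_write(filp);		/* records FILE_MNT_WRITE_TAKEN */
	return 0;
}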
992 | 992 | ||
993 | #define MAX_NON_LFS ((1UL<<31) - 1) | 993 | #define MAX_NON_LFS ((1UL<<31) - 1) |
994 | 994 | ||
995 | /* Page cache limit. The filesystems should put that into their s_maxbytes | 995 | /* Page cache limit. The filesystems should put that into their s_maxbytes |
996 | limits, otherwise bad things can happen in VM. */ | 996 | limits, otherwise bad things can happen in VM. */ |
997 | #if BITS_PER_LONG==32 | 997 | #if BITS_PER_LONG==32 |
998 | #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) | 998 | #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) |
999 | #elif BITS_PER_LONG==64 | 999 | #elif BITS_PER_LONG==64 |
1000 | #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL | 1000 | #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL |
1001 | #endif | 1001 | #endif |
1002 | 1002 | ||
1003 | #define FL_POSIX 1 | 1003 | #define FL_POSIX 1 |
1004 | #define FL_FLOCK 2 | 1004 | #define FL_FLOCK 2 |
1005 | #define FL_ACCESS 8 /* not trying to lock, just looking */ | 1005 | #define FL_ACCESS 8 /* not trying to lock, just looking */ |
1006 | #define FL_EXISTS 16 /* when unlocking, test for existence */ | 1006 | #define FL_EXISTS 16 /* when unlocking, test for existence */ |
1007 | #define FL_LEASE 32 /* lease held on this file */ | 1007 | #define FL_LEASE 32 /* lease held on this file */ |
1008 | #define FL_CLOSE 64 /* unlock on close */ | 1008 | #define FL_CLOSE 64 /* unlock on close */ |
1009 | #define FL_SLEEP 128 /* A blocking lock */ | 1009 | #define FL_SLEEP 128 /* A blocking lock */ |
1010 | 1010 | ||
1011 | /* | 1011 | /* |
1012 | * Special return value from posix_lock_file() and vfs_lock_file() for | 1012 | * Special return value from posix_lock_file() and vfs_lock_file() for |
1013 | * asynchronous locking. | 1013 | * asynchronous locking. |
1014 | */ | 1014 | */ |
1015 | #define FILE_LOCK_DEFERRED 1 | 1015 | #define FILE_LOCK_DEFERRED 1 |
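/*
 * Illustrative sketch, not part of this commit: a caller that supports
 * asynchronous locking treats FILE_LOCK_DEFERRED as "the grant will
 * arrive later via the lock manager's fl_notify()/fl_grant() callbacks";
 * a synchronous caller would instead sleep and retry.
 */
static int example_try_lock(struct file *filp, struct file_lock *fl)
{
	int error = vfs_lock_file(filp, F_SETLK, fl, NULL);

	if (error == FILE_LOCK_DEFERRED)
		return -EINPROGRESS;	/* completion comes via callback */
	return error;
}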
1016 | 1016 | ||
1017 | /* | 1017 | /* |
1018 | * The POSIX file lock owner is determined by | 1018 | * The POSIX file lock owner is determined by |
1019 | * the "struct files_struct" in the thread group | 1019 | * the "struct files_struct" in the thread group |
1020 | * (or NULL for no owner - BSD locks). | 1020 | * (or NULL for no owner - BSD locks). |
1021 | * | 1021 | * |
1022 | * Lockd stuffs a "host" pointer into this. | 1022 | * Lockd stuffs a "host" pointer into this. |
1023 | */ | 1023 | */ |
1024 | typedef struct files_struct *fl_owner_t; | 1024 | typedef struct files_struct *fl_owner_t; |
1025 | 1025 | ||
1026 | struct file_lock_operations { | 1026 | struct file_lock_operations { |
1027 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 1027 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
1028 | void (*fl_release_private)(struct file_lock *); | 1028 | void (*fl_release_private)(struct file_lock *); |
1029 | }; | 1029 | }; |
1030 | 1030 | ||
1031 | struct lock_manager_operations { | 1031 | struct lock_manager_operations { |
1032 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); | 1032 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); |
1033 | void (*fl_notify)(struct file_lock *); /* unblock callback */ | 1033 | void (*fl_notify)(struct file_lock *); /* unblock callback */ |
1034 | int (*fl_grant)(struct file_lock *, struct file_lock *, int); | 1034 | int (*fl_grant)(struct file_lock *, struct file_lock *, int); |
1035 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 1035 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
1036 | void (*fl_release_private)(struct file_lock *); | 1036 | void (*fl_release_private)(struct file_lock *); |
1037 | void (*fl_break)(struct file_lock *); | 1037 | void (*fl_break)(struct file_lock *); |
1038 | int (*fl_mylease)(struct file_lock *, struct file_lock *); | 1038 | int (*fl_mylease)(struct file_lock *, struct file_lock *); |
1039 | int (*fl_change)(struct file_lock **, int); | 1039 | int (*fl_change)(struct file_lock **, int); |
1040 | }; | 1040 | }; |
1041 | 1041 | ||
1042 | struct lock_manager { | 1042 | struct lock_manager { |
1043 | struct list_head list; | 1043 | struct list_head list; |
1044 | }; | 1044 | }; |
1045 | 1045 | ||
1046 | void locks_start_grace(struct lock_manager *); | 1046 | void locks_start_grace(struct lock_manager *); |
1047 | void locks_end_grace(struct lock_manager *); | 1047 | void locks_end_grace(struct lock_manager *); |
1048 | int locks_in_grace(void); | 1048 | int locks_in_grace(void); |
1049 | 1049 | ||
1050 | /* that will die - we need it for nfs_lock_info */ | 1050 | /* that will die - we need it for nfs_lock_info */ |
1051 | #include <linux/nfs_fs_i.h> | 1051 | #include <linux/nfs_fs_i.h> |
1052 | 1052 | ||
1053 | struct file_lock { | 1053 | struct file_lock { |
1054 | struct file_lock *fl_next; /* singly linked list for this inode */ | 1054 | struct file_lock *fl_next; /* singly linked list for this inode */ |
1055 | struct list_head fl_link; /* doubly linked list of all locks */ | 1055 | struct list_head fl_link; /* doubly linked list of all locks */ |
1056 | struct list_head fl_block; /* circular list of blocked processes */ | 1056 | struct list_head fl_block; /* circular list of blocked processes */ |
1057 | fl_owner_t fl_owner; | 1057 | fl_owner_t fl_owner; |
1058 | unsigned char fl_flags; | 1058 | unsigned char fl_flags; |
1059 | unsigned char fl_type; | 1059 | unsigned char fl_type; |
1060 | unsigned int fl_pid; | 1060 | unsigned int fl_pid; |
1061 | struct pid *fl_nspid; | 1061 | struct pid *fl_nspid; |
1062 | wait_queue_head_t fl_wait; | 1062 | wait_queue_head_t fl_wait; |
1063 | struct file *fl_file; | 1063 | struct file *fl_file; |
1064 | loff_t fl_start; | 1064 | loff_t fl_start; |
1065 | loff_t fl_end; | 1065 | loff_t fl_end; |
1066 | 1066 | ||
1067 | struct fasync_struct * fl_fasync; /* for lease break notifications */ | 1067 | struct fasync_struct * fl_fasync; /* for lease break notifications */ |
1068 | unsigned long fl_break_time; /* for nonblocking lease breaks */ | 1068 | unsigned long fl_break_time; /* for nonblocking lease breaks */ |
1069 | 1069 | ||
1070 | struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ | 1070 | struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ |
1071 | struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ | 1071 | struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ |
1072 | union { | 1072 | union { |
1073 | struct nfs_lock_info nfs_fl; | 1073 | struct nfs_lock_info nfs_fl; |
1074 | struct nfs4_lock_info nfs4_fl; | 1074 | struct nfs4_lock_info nfs4_fl; |
1075 | struct { | 1075 | struct { |
1076 | struct list_head link; /* link in AFS vnode's pending_locks list */ | 1076 | struct list_head link; /* link in AFS vnode's pending_locks list */ |
1077 | int state; /* state of grant or error if -ve */ | 1077 | int state; /* state of grant or error if -ve */ |
1078 | } afs; | 1078 | } afs; |
1079 | } fl_u; | 1079 | } fl_u; |
1080 | }; | 1080 | }; |
1081 | 1081 | ||
1082 | /* The following constant reflects the upper bound of the file/locking space */ | 1082 | /* The following constant reflects the upper bound of the file/locking space */ |
1083 | #ifndef OFFSET_MAX | 1083 | #ifndef OFFSET_MAX |
1084 | #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) | 1084 | #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) |
1085 | #define OFFSET_MAX INT_LIMIT(loff_t) | 1085 | #define OFFSET_MAX INT_LIMIT(loff_t) |
1086 | #define OFFT_OFFSET_MAX INT_LIMIT(off_t) | 1086 | #define OFFT_OFFSET_MAX INT_LIMIT(off_t) |
1087 | #endif | 1087 | #endif |
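/*
 * Illustrative note, not part of this commit: INT_LIMIT(x) complements a
 * value with only the sign bit set, yielding the largest positive value
 * of the signed type x. For the 64-bit loff_t that is
 * 0x7fffffffffffffff; a lock whose fl_end equals OFFSET_MAX extends to
 * end of file.
 */
static inline void example_offset_max_sanity(void)
{
	/* BUILD_BUG_ON() is from linux/kernel.h */
	BUILD_BUG_ON(OFFSET_MAX != (loff_t)0x7fffffffffffffffLL);
}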
1088 | 1088 | ||
1089 | #include <linux/fcntl.h> | 1089 | #include <linux/fcntl.h> |
1090 | 1090 | ||
1091 | extern void send_sigio(struct fown_struct *fown, int fd, int band); | 1091 | extern void send_sigio(struct fown_struct *fown, int fd, int band); |
1092 | 1092 | ||
1093 | /* fs/sync.c */ | 1093 | /* fs/sync.c */ |
1094 | extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, | 1094 | extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, |
1095 | loff_t endbyte, unsigned int flags); | 1095 | loff_t endbyte, unsigned int flags); |
1096 | 1096 | ||
1097 | #ifdef CONFIG_FILE_LOCKING | 1097 | #ifdef CONFIG_FILE_LOCKING |
1098 | extern int fcntl_getlk(struct file *, struct flock __user *); | 1098 | extern int fcntl_getlk(struct file *, struct flock __user *); |
1099 | extern int fcntl_setlk(unsigned int, struct file *, unsigned int, | 1099 | extern int fcntl_setlk(unsigned int, struct file *, unsigned int, |
1100 | struct flock __user *); | 1100 | struct flock __user *); |
1101 | 1101 | ||
1102 | #if BITS_PER_LONG == 32 | 1102 | #if BITS_PER_LONG == 32 |
1103 | extern int fcntl_getlk64(struct file *, struct flock64 __user *); | 1103 | extern int fcntl_getlk64(struct file *, struct flock64 __user *); |
1104 | extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, | 1104 | extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, |
1105 | struct flock64 __user *); | 1105 | struct flock64 __user *); |
1106 | #endif | 1106 | #endif |
1107 | 1107 | ||
1108 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); | 1108 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); |
1109 | extern int fcntl_getlease(struct file *filp); | 1109 | extern int fcntl_getlease(struct file *filp); |
1110 | 1110 | ||
1111 | /* fs/locks.c */ | 1111 | /* fs/locks.c */ |
1112 | extern void locks_init_lock(struct file_lock *); | 1112 | extern void locks_init_lock(struct file_lock *); |
1113 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); | 1113 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); |
1114 | extern void __locks_copy_lock(struct file_lock *, const struct file_lock *); | 1114 | extern void __locks_copy_lock(struct file_lock *, const struct file_lock *); |
1115 | extern void locks_remove_posix(struct file *, fl_owner_t); | 1115 | extern void locks_remove_posix(struct file *, fl_owner_t); |
1116 | extern void locks_remove_flock(struct file *); | 1116 | extern void locks_remove_flock(struct file *); |
1117 | extern void locks_release_private(struct file_lock *); | 1117 | extern void locks_release_private(struct file_lock *); |
1118 | extern void posix_test_lock(struct file *, struct file_lock *); | 1118 | extern void posix_test_lock(struct file *, struct file_lock *); |
1119 | extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); | 1119 | extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); |
1120 | extern int posix_lock_file_wait(struct file *, struct file_lock *); | 1120 | extern int posix_lock_file_wait(struct file *, struct file_lock *); |
1121 | extern int posix_unblock_lock(struct file *, struct file_lock *); | 1121 | extern int posix_unblock_lock(struct file *, struct file_lock *); |
1122 | extern int vfs_test_lock(struct file *, struct file_lock *); | 1122 | extern int vfs_test_lock(struct file *, struct file_lock *); |
1123 | extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); | 1123 | extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); |
1124 | extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); | 1124 | extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); |
1125 | extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); | 1125 | extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); |
1126 | extern int __break_lease(struct inode *inode, unsigned int flags); | 1126 | extern int __break_lease(struct inode *inode, unsigned int flags); |
1127 | extern void lease_get_mtime(struct inode *, struct timespec *time); | 1127 | extern void lease_get_mtime(struct inode *, struct timespec *time); |
1128 | extern int generic_setlease(struct file *, long, struct file_lock **); | 1128 | extern int generic_setlease(struct file *, long, struct file_lock **); |
1129 | extern int vfs_setlease(struct file *, long, struct file_lock **); | 1129 | extern int vfs_setlease(struct file *, long, struct file_lock **); |
1130 | extern int lease_modify(struct file_lock **, int); | 1130 | extern int lease_modify(struct file_lock **, int); |
1131 | extern int lock_may_read(struct inode *, loff_t start, unsigned long count); | 1131 | extern int lock_may_read(struct inode *, loff_t start, unsigned long count); |
1132 | extern int lock_may_write(struct inode *, loff_t start, unsigned long count); | 1132 | extern int lock_may_write(struct inode *, loff_t start, unsigned long count); |
1133 | #else /* !CONFIG_FILE_LOCKING */ | 1133 | #else /* !CONFIG_FILE_LOCKING */ |
1134 | static inline int fcntl_getlk(struct file *file, struct flock __user *user) | 1134 | static inline int fcntl_getlk(struct file *file, struct flock __user *user) |
1135 | { | 1135 | { |
1136 | return -EINVAL; | 1136 | return -EINVAL; |
1137 | } | 1137 | } |
1138 | 1138 | ||
1139 | static inline int fcntl_setlk(unsigned int fd, struct file *file, | 1139 | static inline int fcntl_setlk(unsigned int fd, struct file *file, |
1140 | unsigned int cmd, struct flock __user *user) | 1140 | unsigned int cmd, struct flock __user *user) |
1141 | { | 1141 | { |
1142 | return -EACCES; | 1142 | return -EACCES; |
1143 | } | 1143 | } |
1144 | 1144 | ||
1145 | #if BITS_PER_LONG == 32 | 1145 | #if BITS_PER_LONG == 32 |
1146 | static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user) | 1146 | static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user) |
1147 | { | 1147 | { |
1148 | return -EINVAL; | 1148 | return -EINVAL; |
1149 | } | 1149 | } |
1150 | 1150 | ||
1151 | static inline int fcntl_setlk64(unsigned int fd, struct file *file, | 1151 | static inline int fcntl_setlk64(unsigned int fd, struct file *file, |
1152 | unsigned int cmd, struct flock64 __user *user) | 1152 | unsigned int cmd, struct flock64 __user *user) |
1153 | { | 1153 | { |
1154 | return -EACCES; | 1154 | return -EACCES; |
1155 | } | 1155 | } |
1156 | #endif | 1156 | #endif |
1157 | static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) | 1157 | static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) |
1158 | { | 1158 | { |
1159 | return 0; | 1159 | return 0; |
1160 | } | 1160 | } |
1161 | 1161 | ||
1162 | static inline int fcntl_getlease(struct file *filp) | 1162 | static inline int fcntl_getlease(struct file *filp) |
1163 | { | 1163 | { |
1164 | return 0; | 1164 | return 0; |
1165 | } | 1165 | } |
1166 | 1166 | ||
1167 | static inline void locks_init_lock(struct file_lock *fl) | 1167 | static inline void locks_init_lock(struct file_lock *fl) |
1168 | { | 1168 | { |
1169 | return; | 1169 | return; |
1170 | } | 1170 | } |
1171 | 1171 | ||
1172 | static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl) | 1172 | static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl) |
1173 | { | 1173 | { |
1174 | return; | 1174 | return; |
1175 | } | 1175 | } |
1176 | 1176 | ||
1177 | static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) | 1177 | static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) |
1178 | { | 1178 | { |
1179 | return; | 1179 | return; |
1180 | } | 1180 | } |
1181 | 1181 | ||
1182 | static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) | 1182 | static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) |
1183 | { | 1183 | { |
1184 | return; | 1184 | return; |
1185 | } | 1185 | } |
1186 | 1186 | ||
1187 | static inline void locks_remove_flock(struct file *filp) | 1187 | static inline void locks_remove_flock(struct file *filp) |
1188 | { | 1188 | { |
1189 | return; | 1189 | return; |
1190 | } | 1190 | } |
1191 | 1191 | ||
1192 | static inline void posix_test_lock(struct file *filp, struct file_lock *fl) | 1192 | static inline void posix_test_lock(struct file *filp, struct file_lock *fl) |
1193 | { | 1193 | { |
1194 | return; | 1194 | return; |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | static inline int posix_lock_file(struct file *filp, struct file_lock *fl, | 1197 | static inline int posix_lock_file(struct file *filp, struct file_lock *fl, |
1198 | struct file_lock *conflock) | 1198 | struct file_lock *conflock) |
1199 | { | 1199 | { |
1200 | return -ENOLCK; | 1200 | return -ENOLCK; |
1201 | } | 1201 | } |
1202 | 1202 | ||
1203 | static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl) | 1203 | static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl) |
1204 | { | 1204 | { |
1205 | return -ENOLCK; | 1205 | return -ENOLCK; |
1206 | } | 1206 | } |
1207 | 1207 | ||
1208 | static inline int posix_unblock_lock(struct file *filp, | 1208 | static inline int posix_unblock_lock(struct file *filp, |
1209 | struct file_lock *waiter) | 1209 | struct file_lock *waiter) |
1210 | { | 1210 | { |
1211 | return -ENOENT; | 1211 | return -ENOENT; |
1212 | } | 1212 | } |
1213 | 1213 | ||
1214 | static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) | 1214 | static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) |
1215 | { | 1215 | { |
1216 | return 0; | 1216 | return 0; |
1217 | } | 1217 | } |
1218 | 1218 | ||
1219 | static inline int vfs_lock_file(struct file *filp, unsigned int cmd, | 1219 | static inline int vfs_lock_file(struct file *filp, unsigned int cmd, |
1220 | struct file_lock *fl, struct file_lock *conf) | 1220 | struct file_lock *fl, struct file_lock *conf) |
1221 | { | 1221 | { |
1222 | return -ENOLCK; | 1222 | return -ENOLCK; |
1223 | } | 1223 | } |
1224 | 1224 | ||
1225 | static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) | 1225 | static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) |
1226 | { | 1226 | { |
1227 | return 0; | 1227 | return 0; |
1228 | } | 1228 | } |
1229 | 1229 | ||
1230 | static inline int flock_lock_file_wait(struct file *filp, | 1230 | static inline int flock_lock_file_wait(struct file *filp, |
1231 | struct file_lock *request) | 1231 | struct file_lock *request) |
1232 | { | 1232 | { |
1233 | return -ENOLCK; | 1233 | return -ENOLCK; |
1234 | } | 1234 | } |
1235 | 1235 | ||
1236 | static inline int __break_lease(struct inode *inode, unsigned int mode) | 1236 | static inline int __break_lease(struct inode *inode, unsigned int mode) |
1237 | { | 1237 | { |
1238 | return 0; | 1238 | return 0; |
1239 | } | 1239 | } |
1240 | 1240 | ||
1241 | static inline void lease_get_mtime(struct inode *inode, struct timespec *time) | 1241 | static inline void lease_get_mtime(struct inode *inode, struct timespec *time) |
1242 | { | 1242 | { |
1243 | return; | 1243 | return; |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | static inline int generic_setlease(struct file *filp, long arg, | 1246 | static inline int generic_setlease(struct file *filp, long arg, |
1247 | struct file_lock **flp) | 1247 | struct file_lock **flp) |
1248 | { | 1248 | { |
1249 | return -EINVAL; | 1249 | return -EINVAL; |
1250 | } | 1250 | } |
1251 | 1251 | ||
1252 | static inline int vfs_setlease(struct file *filp, long arg, | 1252 | static inline int vfs_setlease(struct file *filp, long arg, |
1253 | struct file_lock **lease) | 1253 | struct file_lock **lease) |
1254 | { | 1254 | { |
1255 | return -EINVAL; | 1255 | return -EINVAL; |
1256 | } | 1256 | } |
1257 | 1257 | ||
1258 | static inline int lease_modify(struct file_lock **before, int arg) | 1258 | static inline int lease_modify(struct file_lock **before, int arg) |
1259 | { | 1259 | { |
1260 | return -EINVAL; | 1260 | return -EINVAL; |
1261 | } | 1261 | } |
1262 | 1262 | ||
1263 | static inline int lock_may_read(struct inode *inode, loff_t start, | 1263 | static inline int lock_may_read(struct inode *inode, loff_t start, |
1264 | unsigned long len) | 1264 | unsigned long len) |
1265 | { | 1265 | { |
1266 | return 1; | 1266 | return 1; |
1267 | } | 1267 | } |
1268 | 1268 | ||
1269 | static inline int lock_may_write(struct inode *inode, loff_t start, | 1269 | static inline int lock_may_write(struct inode *inode, loff_t start, |
1270 | unsigned long len) | 1270 | unsigned long len) |
1271 | { | 1271 | { |
1272 | return 1; | 1272 | return 1; |
1273 | } | 1273 | } |
1274 | 1274 | ||
1275 | #endif /* !CONFIG_FILE_LOCKING */ | 1275 | #endif /* !CONFIG_FILE_LOCKING */ |
1276 | 1276 | ||
1277 | 1277 | ||
1278 | struct fasync_struct { | 1278 | struct fasync_struct { |
1279 | int magic; | 1279 | int magic; |
1280 | int fa_fd; | 1280 | int fa_fd; |
1281 | struct fasync_struct *fa_next; /* singly linked list */ | 1281 | struct fasync_struct *fa_next; /* singly linked list */ |
1282 | struct file *fa_file; | 1282 | struct file *fa_file; |
1283 | }; | 1283 | }; |
1284 | 1284 | ||
1285 | #define FASYNC_MAGIC 0x4601 | 1285 | #define FASYNC_MAGIC 0x4601 |
1286 | 1286 | ||
1287 | /* SMP safe fasync helpers: */ | 1287 | /* SMP safe fasync helpers: */ |
1288 | extern int fasync_helper(int, struct file *, int, struct fasync_struct **); | 1288 | extern int fasync_helper(int, struct file *, int, struct fasync_struct **); |
1289 | /* can be called from interrupts */ | 1289 | /* can be called from interrupts */ |
1290 | extern void kill_fasync(struct fasync_struct **, int, int); | 1290 | extern void kill_fasync(struct fasync_struct **, int, int); |
1291 | /* only for net: no internal synchronization */ | 1291 | /* only for net: no internal synchronization */ |
1292 | extern void __kill_fasync(struct fasync_struct *, int, int); | 1292 | extern void __kill_fasync(struct fasync_struct *, int, int); |
1293 | 1293 | ||
1294 | extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); | 1294 | extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); |
1295 | extern int f_setown(struct file *filp, unsigned long arg, int force); | 1295 | extern int f_setown(struct file *filp, unsigned long arg, int force); |
1296 | extern void f_delown(struct file *filp); | 1296 | extern void f_delown(struct file *filp); |
1297 | extern pid_t f_getown(struct file *filp); | 1297 | extern pid_t f_getown(struct file *filp); |
1298 | extern int send_sigurg(struct fown_struct *fown); | 1298 | extern int send_sigurg(struct fown_struct *fown); |
1299 | 1299 | ||
1300 | /* | 1300 | /* |
1301 | * Umount options | 1301 | * Umount options |
1302 | */ | 1302 | */ |
1303 | 1303 | ||
1304 | #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */ | 1304 | #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */ |
1305 | #define MNT_DETACH 0x00000002 /* Just detach from the tree */ | 1305 | #define MNT_DETACH 0x00000002 /* Just detach from the tree */ |
1306 | #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ | 1306 | #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ |
1307 | 1307 | ||
1308 | extern struct list_head super_blocks; | 1308 | extern struct list_head super_blocks; |
1309 | extern spinlock_t sb_lock; | 1309 | extern spinlock_t sb_lock; |
1310 | 1310 | ||
1311 | #define sb_entry(list) list_entry((list), struct super_block, s_list) | 1311 | #define sb_entry(list) list_entry((list), struct super_block, s_list) |
1312 | #define S_BIAS (1<<30) | 1312 | #define S_BIAS (1<<30) |
1313 | struct super_block { | 1313 | struct super_block { |
1314 | struct list_head s_list; /* Keep this first */ | 1314 | struct list_head s_list; /* Keep this first */ |
1315 | dev_t s_dev; /* search index; _not_ kdev_t */ | 1315 | dev_t s_dev; /* search index; _not_ kdev_t */ |
1316 | unsigned long s_blocksize; | 1316 | unsigned long s_blocksize; |
1317 | unsigned char s_blocksize_bits; | 1317 | unsigned char s_blocksize_bits; |
1318 | unsigned char s_dirt; | 1318 | unsigned char s_dirt; |
1319 | unsigned long long s_maxbytes; /* Max file size */ | 1319 | unsigned long long s_maxbytes; /* Max file size */ |
1320 | struct file_system_type *s_type; | 1320 | struct file_system_type *s_type; |
1321 | const struct super_operations *s_op; | 1321 | const struct super_operations *s_op; |
1322 | struct dquot_operations *dq_op; | 1322 | struct dquot_operations *dq_op; |
1323 | struct quotactl_ops *s_qcop; | 1323 | struct quotactl_ops *s_qcop; |
1324 | const struct export_operations *s_export_op; | 1324 | const struct export_operations *s_export_op; |
1325 | unsigned long s_flags; | 1325 | unsigned long s_flags; |
1326 | unsigned long s_magic; | 1326 | unsigned long s_magic; |
1327 | struct dentry *s_root; | 1327 | struct dentry *s_root; |
1328 | struct rw_semaphore s_umount; | 1328 | struct rw_semaphore s_umount; |
1329 | struct mutex s_lock; | 1329 | struct mutex s_lock; |
1330 | int s_count; | 1330 | int s_count; |
1331 | int s_need_sync; | 1331 | int s_need_sync; |
1332 | atomic_t s_active; | 1332 | atomic_t s_active; |
1333 | #ifdef CONFIG_SECURITY | 1333 | #ifdef CONFIG_SECURITY |
1334 | void *s_security; | 1334 | void *s_security; |
1335 | #endif | 1335 | #endif |
1336 | struct xattr_handler **s_xattr; | 1336 | struct xattr_handler **s_xattr; |
1337 | 1337 | ||
1338 | struct list_head s_inodes; /* all inodes */ | 1338 | struct list_head s_inodes; /* all inodes */ |
1339 | struct list_head s_dirty; /* dirty inodes */ | 1339 | struct list_head s_dirty; /* dirty inodes */ |
1340 | struct list_head s_io; /* parked for writeback */ | 1340 | struct list_head s_io; /* parked for writeback */ |
1341 | struct list_head s_more_io; /* parked for more writeback */ | 1341 | struct list_head s_more_io; /* parked for more writeback */ |
1342 | struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ | 1342 | struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ |
1343 | struct list_head s_files; | 1343 | struct list_head s_files; |
1344 | /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ | 1344 | /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ |
1345 | struct list_head s_dentry_lru; /* unused dentry lru */ | 1345 | struct list_head s_dentry_lru; /* unused dentry lru */ |
1346 | int s_nr_dentry_unused; /* # of dentry on lru */ | 1346 | int s_nr_dentry_unused; /* # of dentry on lru */ |
1347 | 1347 | ||
1348 | struct block_device *s_bdev; | 1348 | struct block_device *s_bdev; |
1349 | struct mtd_info *s_mtd; | 1349 | struct mtd_info *s_mtd; |
1350 | struct list_head s_instances; | 1350 | struct list_head s_instances; |
1351 | struct quota_info s_dquot; /* Diskquota specific options */ | 1351 | struct quota_info s_dquot; /* Diskquota specific options */ |
1352 | 1352 | ||
1353 | int s_frozen; | 1353 | int s_frozen; |
1354 | wait_queue_head_t s_wait_unfrozen; | 1354 | wait_queue_head_t s_wait_unfrozen; |
1355 | 1355 | ||
1356 | char s_id[32]; /* Informational name */ | 1356 | char s_id[32]; /* Informational name */ |
1357 | 1357 | ||
1358 | void *s_fs_info; /* Filesystem private info */ | 1358 | void *s_fs_info; /* Filesystem private info */ |
1359 | fmode_t s_mode; | 1359 | fmode_t s_mode; |
1360 | 1360 | ||
1361 | /* | 1361 | /* |
1362 | * The next field is for VFS *only*. No filesystems have any business | 1362 | * The next field is for VFS *only*. No filesystems have any business |
1363 | * even looking at it. You had been warned. | 1363 | * even looking at it. You had been warned. |
1364 | */ | 1364 | */ |
1365 | struct mutex s_vfs_rename_mutex; /* Kludge */ | 1365 | struct mutex s_vfs_rename_mutex; /* Kludge */ |
1366 | 1366 | ||
1367 | /* Granularity of c/m/atime in ns. | 1367 | /* Granularity of c/m/atime in ns. |
1368 | Cannot be worse than a second */ | 1368 | Cannot be worse than a second */ |
1369 | u32 s_time_gran; | 1369 | u32 s_time_gran; |
1370 | 1370 | ||
1371 | /* | 1371 | /* |
1372 | * Filesystem subtype. If non-empty the filesystem type field | 1372 | * Filesystem subtype. If non-empty the filesystem type field |
1373 | * in /proc/mounts will be "type.subtype" | 1373 | * in /proc/mounts will be "type.subtype" |
1374 | */ | 1374 | */ |
1375 | char *s_subtype; | 1375 | char *s_subtype; |
1376 | 1376 | ||
1377 | /* | 1377 | /* |
1378 | * Saved mount options for lazy filesystems using | 1378 | * Saved mount options for lazy filesystems using |
1379 | * generic_show_options() | 1379 | * generic_show_options() |
1380 | */ | 1380 | */ |
1381 | char *s_options; | 1381 | char *s_options; |
1382 | }; | 1382 | }; |
1383 | 1383 | ||
1384 | extern struct timespec current_fs_time(struct super_block *sb); | 1384 | extern struct timespec current_fs_time(struct super_block *sb); |
1385 | 1385 | ||
1386 | /* | 1386 | /* |
1387 | * Snapshotting support. | 1387 | * Snapshotting support. |
1388 | */ | 1388 | */ |
1389 | enum { | 1389 | enum { |
1390 | SB_UNFROZEN = 0, | 1390 | SB_UNFROZEN = 0, |
1391 | SB_FREEZE_WRITE = 1, | 1391 | SB_FREEZE_WRITE = 1, |
1392 | SB_FREEZE_TRANS = 2, | 1392 | SB_FREEZE_TRANS = 2, |
1393 | }; | 1393 | }; |
1394 | 1394 | ||
1395 | #define vfs_check_frozen(sb, level) \ | 1395 | #define vfs_check_frozen(sb, level) \ |
1396 | wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) | 1396 | wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) |
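/*
 * Illustrative sketch, not part of this commit: a write path blocks while
 * the superblock is frozen at or above the given level and is woken
 * through s_wait_unfrozen when the filesystem is thawed.
 */
static void example_wait_for_thaw(struct super_block *sb)
{
	vfs_check_frozen(sb, SB_FREEZE_WRITE);	/* sleeps while s_frozen >= level */
}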
1397 | 1397 | ||
1398 | #define get_fs_excl() atomic_inc(&current->fs_excl) | 1398 | #define get_fs_excl() atomic_inc(&current->fs_excl) |
1399 | #define put_fs_excl() atomic_dec(&current->fs_excl) | 1399 | #define put_fs_excl() atomic_dec(&current->fs_excl) |
1400 | #define has_fs_excl() atomic_read(&current->fs_excl) | 1400 | #define has_fs_excl() atomic_read(&current->fs_excl) |
1401 | 1401 | ||
1402 | #define is_owner_or_cap(inode) \ | 1402 | #define is_owner_or_cap(inode) \ |
1403 | ((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER)) | 1403 | ((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER)) |
1404 | 1404 | ||
1405 | /* not quite ready to be deprecated, but... */ | 1405 | /* not quite ready to be deprecated, but... */ |
1406 | extern void lock_super(struct super_block *); | 1406 | extern void lock_super(struct super_block *); |
1407 | extern void unlock_super(struct super_block *); | 1407 | extern void unlock_super(struct super_block *); |
1408 | 1408 | ||
1409 | /* | 1409 | /* |
1410 | * VFS helper functions.. | 1410 | * VFS helper functions.. |
1411 | */ | 1411 | */ |
1412 | extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); | 1412 | extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); |
1413 | extern int vfs_mkdir(struct inode *, struct dentry *, int); | 1413 | extern int vfs_mkdir(struct inode *, struct dentry *, int); |
1414 | extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); | 1414 | extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); |
1415 | extern int vfs_symlink(struct inode *, struct dentry *, const char *); | 1415 | extern int vfs_symlink(struct inode *, struct dentry *, const char *); |
1416 | extern int vfs_link(struct dentry *, struct inode *, struct dentry *); | 1416 | extern int vfs_link(struct dentry *, struct inode *, struct dentry *); |
1417 | extern int vfs_rmdir(struct inode *, struct dentry *); | 1417 | extern int vfs_rmdir(struct inode *, struct dentry *); |
1418 | extern int vfs_unlink(struct inode *, struct dentry *); | 1418 | extern int vfs_unlink(struct inode *, struct dentry *); |
1419 | extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); | 1419 | extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); |
1420 | 1420 | ||
1421 | /* | 1421 | /* |
1422 | * VFS dentry helper functions. | 1422 | * VFS dentry helper functions. |
1423 | */ | 1423 | */ |
1424 | extern void dentry_unhash(struct dentry *dentry); | 1424 | extern void dentry_unhash(struct dentry *dentry); |
1425 | 1425 | ||
1426 | /* | 1426 | /* |
1427 | * VFS file helper functions. | 1427 | * VFS file helper functions. |
1428 | */ | 1428 | */ |
1429 | extern int file_permission(struct file *, int); | 1429 | extern int file_permission(struct file *, int); |
1430 | 1430 | ||
1431 | /* | 1431 | /* |
1432 | * VFS FS_IOC_FIEMAP helper definitions. | 1432 | * VFS FS_IOC_FIEMAP helper definitions. |
1433 | */ | 1433 | */ |
1434 | struct fiemap_extent_info { | 1434 | struct fiemap_extent_info { |
1435 | unsigned int fi_flags; /* Flags as passed from user */ | 1435 | unsigned int fi_flags; /* Flags as passed from user */ |
1436 | unsigned int fi_extents_mapped; /* Number of mapped extents */ | 1436 | unsigned int fi_extents_mapped; /* Number of mapped extents */ |
1437 | unsigned int fi_extents_max; /* Size of fiemap_extent array */ | 1437 | unsigned int fi_extents_max; /* Size of fiemap_extent array */ |
1438 | struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent | 1438 | struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent |
1439 | * array */ | 1439 | * array */ |
1440 | }; | 1440 | }; |
1441 | int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, | 1441 | int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, |
1442 | u64 phys, u64 len, u32 flags); | 1442 | u64 phys, u64 len, u32 flags); |
1443 | int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); | 1443 | int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); |
1444 | 1444 | ||
1445 | /* | 1445 | /* |
1446 | * File types | 1446 | * File types |
1447 | * | 1447 | * |
1448 | * NOTE! These match bits 12..15 of stat.st_mode | 1448 | * NOTE! These match bits 12..15 of stat.st_mode |
1449 | * (ie "(i_mode >> 12) & 15"). | 1449 | * (ie "(i_mode >> 12) & 15"). |
1450 | */ | 1450 | */ |
1451 | #define DT_UNKNOWN 0 | 1451 | #define DT_UNKNOWN 0 |
1452 | #define DT_FIFO 1 | 1452 | #define DT_FIFO 1 |
1453 | #define DT_CHR 2 | 1453 | #define DT_CHR 2 |
1454 | #define DT_DIR 4 | 1454 | #define DT_DIR 4 |
1455 | #define DT_BLK 6 | 1455 | #define DT_BLK 6 |
1456 | #define DT_REG 8 | 1456 | #define DT_REG 8 |
1457 | #define DT_LNK 10 | 1457 | #define DT_LNK 10 |
1458 | #define DT_SOCK 12 | 1458 | #define DT_SOCK 12 |
1459 | #define DT_WHT 14 | 1459 | #define DT_WHT 14 |
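/*
 * Illustrative sketch, not part of this commit: converting an i_mode to
 * its DT_* value using the bit layout noted above. For example S_IFREG
 * is 0100000, so (S_IFREG >> 12) == 8 == DT_REG.
 */
static inline unsigned char example_mode_to_dtype(umode_t mode)
{
	return (mode >> 12) & 15;
}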
1460 | 1460 | ||
1461 | #define OSYNC_METADATA (1<<0) | 1461 | #define OSYNC_METADATA (1<<0) |
1462 | #define OSYNC_DATA (1<<1) | 1462 | #define OSYNC_DATA (1<<1) |
1463 | #define OSYNC_INODE (1<<2) | 1463 | #define OSYNC_INODE (1<<2) |
1464 | int generic_osync_inode(struct inode *, struct address_space *, int); | 1464 | int generic_osync_inode(struct inode *, struct address_space *, int); |
1465 | 1465 | ||
1466 | /* | 1466 | /* |
1467 | * This is the "filldir" function type, used by readdir() to let | 1467 | * This is the "filldir" function type, used by readdir() to let |
1468 | * the kernel specify what kind of dirent layout it wants to have. | 1468 | * the kernel specify what kind of dirent layout it wants to have. |
1469 | * This allows the kernel to read directories into kernel space or | 1469 | * This allows the kernel to read directories into kernel space or |
1470 | * to have different dirent layouts depending on the binary type. | 1470 | * to have different dirent layouts depending on the binary type. |
1471 | */ | 1471 | */ |
1472 | typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); | 1472 | typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); |
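/*
 * Illustrative sketch, not part of this commit: a minimal filldir_t
 * callback that merely counts entries. A filesystem's ->readdir() calls
 * it once per directory entry; a nonzero return value stops the walk.
 */
static int example_count_filldir(void *buf, const char *name, int namlen,
				 loff_t offset, u64 ino, unsigned d_type)
{
	(*(unsigned long *)buf)++;	/* caller passed &count as buf */
	return 0;			/* keep iterating */
}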
1473 | struct block_device_operations; | 1473 | struct block_device_operations; |
1474 | 1474 | ||
1475 | /* These macros are for out-of-kernel modules to test that | 1475 | /* These macros are for out-of-kernel modules to test that |
1476 | * the kernel supports the unlocked_ioctl and compat_ioctl | 1476 | * the kernel supports the unlocked_ioctl and compat_ioctl |
1477 | * fields in struct file_operations. */ | 1477 | * fields in struct file_operations. */ |
1478 | #define HAVE_COMPAT_IOCTL 1 | 1478 | #define HAVE_COMPAT_IOCTL 1 |
1479 | #define HAVE_UNLOCKED_IOCTL 1 | 1479 | #define HAVE_UNLOCKED_IOCTL 1 |
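/*
 * Illustrative sketch, not part of this commit: an out-of-kernel module
 * keys off these macros so one source tree builds both on kernels that
 * have the unlocked_ioctl field and on older ones that only offer the
 * BKL-protected ->ioctl().
 */
#ifdef HAVE_UNLOCKED_IOCTL
static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	return -ENOTTY;		/* runs without the big kernel lock */
}
#endif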
1480 | 1480 | ||
1481 | /* | 1481 | /* |
1482 | * NOTE: | 1482 | * NOTE: |
1483 | * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl | 1483 | * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl |
1484 | * can be called without the big kernel lock held in all filesystems. | 1484 | * can be called without the big kernel lock held in all filesystems. |
1485 | */ | 1485 | */ |
1486 | struct file_operations { | 1486 | struct file_operations { |
1487 | struct module *owner; | 1487 | struct module *owner; |
1488 | loff_t (*llseek) (struct file *, loff_t, int); | 1488 | loff_t (*llseek) (struct file *, loff_t, int); |
1489 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); | 1489 | ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); |
1490 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); | 1490 | ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); |
1491 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 1491 | ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
1492 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); | 1492 | ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); |
1493 | int (*readdir) (struct file *, void *, filldir_t); | 1493 | int (*readdir) (struct file *, void *, filldir_t); |
1494 | unsigned int (*poll) (struct file *, struct poll_table_struct *); | 1494 | unsigned int (*poll) (struct file *, struct poll_table_struct *); |
1495 | int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); | 1495 | int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); |
1496 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); | 1496 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); |
1497 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); | 1497 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); |
1498 | int (*mmap) (struct file *, struct vm_area_struct *); | 1498 | int (*mmap) (struct file *, struct vm_area_struct *); |
1499 | int (*open) (struct inode *, struct file *); | 1499 | int (*open) (struct inode *, struct file *); |
1500 | int (*flush) (struct file *, fl_owner_t id); | 1500 | int (*flush) (struct file *, fl_owner_t id); |
1501 | int (*release) (struct inode *, struct file *); | 1501 | int (*release) (struct inode *, struct file *); |
1502 | int (*fsync) (struct file *, struct dentry *, int datasync); | 1502 | int (*fsync) (struct file *, struct dentry *, int datasync); |
1503 | int (*aio_fsync) (struct kiocb *, int datasync); | 1503 | int (*aio_fsync) (struct kiocb *, int datasync); |
1504 | int (*fasync) (int, struct file *, int); | 1504 | int (*fasync) (int, struct file *, int); |
1505 | int (*lock) (struct file *, int, struct file_lock *); | 1505 | int (*lock) (struct file *, int, struct file_lock *); |
1506 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); | 1506 | ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); |
1507 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 1507 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
1508 | int (*check_flags)(int); | 1508 | int (*check_flags)(int); |
1509 | int (*flock) (struct file *, int, struct file_lock *); | 1509 | int (*flock) (struct file *, int, struct file_lock *); |
1510 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); | 1510 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); |
1511 | ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); | 1511 | ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); |
1512 | int (*setlease)(struct file *, long, struct file_lock **); | 1512 | int (*setlease)(struct file *, long, struct file_lock **); |
1513 | }; | 1513 | }; |
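/*
 * Illustrative sketch, not part of this commit: a minimal read-only
 * instance of the structure above. Unset methods stay NULL and the VFS
 * falls back to its defaults or fails the corresponding syscall.
 */
static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,		/* from linux/module.h */
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_readonly_mmap,
};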
1514 | 1514 | ||
1515 | struct inode_operations { | 1515 | struct inode_operations { |
1516 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); | 1516 | int (*create) (struct inode *,struct dentry *,int, struct nameidata *); |
1517 | struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); | 1517 | struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); |
1518 | int (*link) (struct dentry *,struct inode *,struct dentry *); | 1518 | int (*link) (struct dentry *,struct inode *,struct dentry *); |
1519 | int (*unlink) (struct inode *,struct dentry *); | 1519 | int (*unlink) (struct inode *,struct dentry *); |
1520 | int (*symlink) (struct inode *,struct dentry *,const char *); | 1520 | int (*symlink) (struct inode *,struct dentry *,const char *); |
1521 | int (*mkdir) (struct inode *,struct dentry *,int); | 1521 | int (*mkdir) (struct inode *,struct dentry *,int); |
1522 | int (*rmdir) (struct inode *,struct dentry *); | 1522 | int (*rmdir) (struct inode *,struct dentry *); |
1523 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); | 1523 | int (*mknod) (struct inode *,struct dentry *,int,dev_t); |
1524 | int (*rename) (struct inode *, struct dentry *, | 1524 | int (*rename) (struct inode *, struct dentry *, |
1525 | struct inode *, struct dentry *); | 1525 | struct inode *, struct dentry *); |
1526 | int (*readlink) (struct dentry *, char __user *,int); | 1526 | int (*readlink) (struct dentry *, char __user *,int); |
1527 | void * (*follow_link) (struct dentry *, struct nameidata *); | 1527 | void * (*follow_link) (struct dentry *, struct nameidata *); |
1528 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 1528 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
1529 | void (*truncate) (struct inode *); | 1529 | void (*truncate) (struct inode *); |
1530 | int (*permission) (struct inode *, int); | 1530 | int (*permission) (struct inode *, int); |
1531 | int (*setattr) (struct dentry *, struct iattr *); | 1531 | int (*setattr) (struct dentry *, struct iattr *); |
1532 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); | 1532 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); |
1533 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 1533 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
1534 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); | 1534 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); |
1535 | ssize_t (*listxattr) (struct dentry *, char *, size_t); | 1535 | ssize_t (*listxattr) (struct dentry *, char *, size_t); |
1536 | int (*removexattr) (struct dentry *, const char *); | 1536 | int (*removexattr) (struct dentry *, const char *); |
1537 | void (*truncate_range)(struct inode *, loff_t, loff_t); | 1537 | void (*truncate_range)(struct inode *, loff_t, loff_t); |
1538 | long (*fallocate)(struct inode *inode, int mode, loff_t offset, | 1538 | long (*fallocate)(struct inode *inode, int mode, loff_t offset, |
1539 | loff_t len); | 1539 | loff_t len); |
1540 | int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, | 1540 | int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, |
1541 | u64 len); | 1541 | u64 len); |
1542 | }; | 1542 | }; |
1543 | 1543 | ||
1544 | struct seq_file; | 1544 | struct seq_file; |
1545 | 1545 | ||
1546 | ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, | 1546 | ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, |
1547 | unsigned long nr_segs, unsigned long fast_segs, | 1547 | unsigned long nr_segs, unsigned long fast_segs, |
1548 | struct iovec *fast_pointer, | 1548 | struct iovec *fast_pointer, |
1549 | struct iovec **ret_pointer); | 1549 | struct iovec **ret_pointer); |
1550 | 1550 | ||
1551 | extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); | 1551 | extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); |
1552 | extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); | 1552 | extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); |
1553 | extern ssize_t vfs_readv(struct file *, const struct iovec __user *, | 1553 | extern ssize_t vfs_readv(struct file *, const struct iovec __user *, |
1554 | unsigned long, loff_t *); | 1554 | unsigned long, loff_t *); |
1555 | extern ssize_t vfs_writev(struct file *, const struct iovec __user *, | 1555 | extern ssize_t vfs_writev(struct file *, const struct iovec __user *, |
1556 | unsigned long, loff_t *); | 1556 | unsigned long, loff_t *); |
1557 | 1557 | ||
1558 | struct super_operations { | 1558 | struct super_operations { |
1559 | struct inode *(*alloc_inode)(struct super_block *sb); | 1559 | struct inode *(*alloc_inode)(struct super_block *sb); |
1560 | void (*destroy_inode)(struct inode *); | 1560 | void (*destroy_inode)(struct inode *); |
1561 | 1561 | ||
1562 | void (*dirty_inode) (struct inode *); | 1562 | void (*dirty_inode) (struct inode *); |
1563 | int (*write_inode) (struct inode *, int); | 1563 | int (*write_inode) (struct inode *, int); |
1564 | void (*drop_inode) (struct inode *); | 1564 | void (*drop_inode) (struct inode *); |
1565 | void (*delete_inode) (struct inode *); | 1565 | void (*delete_inode) (struct inode *); |
1566 | void (*put_super) (struct super_block *); | 1566 | void (*put_super) (struct super_block *); |
1567 | void (*write_super) (struct super_block *); | 1567 | void (*write_super) (struct super_block *); |
1568 | int (*sync_fs)(struct super_block *sb, int wait); | 1568 | int (*sync_fs)(struct super_block *sb, int wait); |
1569 | int (*freeze_fs) (struct super_block *); | 1569 | int (*freeze_fs) (struct super_block *); |
1570 | int (*unfreeze_fs) (struct super_block *); | 1570 | int (*unfreeze_fs) (struct super_block *); |
1571 | int (*statfs) (struct dentry *, struct kstatfs *); | 1571 | int (*statfs) (struct dentry *, struct kstatfs *); |
1572 | int (*remount_fs) (struct super_block *, int *, char *); | 1572 | int (*remount_fs) (struct super_block *, int *, char *); |
1573 | void (*clear_inode) (struct inode *); | 1573 | void (*clear_inode) (struct inode *); |
1574 | void (*umount_begin) (struct super_block *); | 1574 | void (*umount_begin) (struct super_block *); |
1575 | 1575 | ||
1576 | int (*show_options)(struct seq_file *, struct vfsmount *); | 1576 | int (*show_options)(struct seq_file *, struct vfsmount *); |
1577 | int (*show_stats)(struct seq_file *, struct vfsmount *); | 1577 | int (*show_stats)(struct seq_file *, struct vfsmount *); |
1578 | #ifdef CONFIG_QUOTA | 1578 | #ifdef CONFIG_QUOTA |
1579 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); | 1579 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); |
1580 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); | 1580 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); |
1581 | #endif | 1581 | #endif |
1582 | int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); | 1582 | int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); |
1583 | }; | 1583 | }; |
1584 | 1584 | ||
1585 | /* | 1585 | /* |
1586 | * Inode state bits. Protected by inode_lock. | 1586 | * Inode state bits. Protected by inode_lock. |
1587 | * | 1587 | * |
1588 | * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, | 1588 | * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, |
1589 | * I_DIRTY_DATASYNC and I_DIRTY_PAGES. | 1589 | * I_DIRTY_DATASYNC and I_DIRTY_PAGES. |
1590 | * | 1590 | * |
1591 | * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, | 1591 | * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, |
1592 | * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at | 1592 | * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at |
1593 | * various stages of removing an inode. | 1593 | * various stages of removing an inode. |
1594 | * | 1594 | * |
1595 | * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. | 1595 | * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. |
1596 | * | 1596 | * |
1597 | * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on | 1597 | * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on |
1598 | * fdatasync(). i_atime is the usual cause. | 1598 | * fdatasync(). i_atime is the usual cause. |
1599 | * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of | 1599 | * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of |
1600 | * these changes separately from I_DIRTY_SYNC so that we | 1600 | * these changes separately from I_DIRTY_SYNC so that we |
1601 | * don't have to write inode on fdatasync() when only | 1601 | * don't have to write inode on fdatasync() when only |
1602 | * mtime has changed in it. | 1602 | * mtime has changed in it. |
1603 | * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. | 1603 | * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. |
1604 | * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both | 1604 | * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both |
1605 | * are cleared by unlock_new_inode(), called from iget(). | 1605 | * are cleared by unlock_new_inode(), called from iget(). |
1606 | * I_WILL_FREE Must be set when calling write_inode_now() if i_count | 1606 | * I_WILL_FREE Must be set when calling write_inode_now() if i_count |
1607 | * is zero. I_FREEING must be set when I_WILL_FREE is | 1607 | * is zero. I_FREEING must be set when I_WILL_FREE is |
1608 | * cleared. | 1608 | * cleared. |
1609 | * I_FREEING Set when inode is about to be freed but still has dirty | 1609 | * I_FREEING Set when inode is about to be freed but still has dirty |
1610 | * pages or buffers attached or the inode itself is still | 1610 | * pages or buffers attached or the inode itself is still |
1611 | * dirty. | 1611 | * dirty. |
1612 | * I_CLEAR Set by clear_inode(). In this state the inode is clean | 1612 | * I_CLEAR Set by clear_inode(). In this state the inode is clean |
1613 | * and can be destroyed. | 1613 | * and can be destroyed. |
1614 | * | 1614 | * |
1615 | * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are | 1615 | * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are |
1616 | * prohibited for many purposes. iget() must wait for | 1616 | * prohibited for many purposes. iget() must wait for |
1617 | * the inode to be completely released, then create it | 1617 | * the inode to be completely released, then create it |
1618 | * anew. Other functions will just ignore such inodes, | 1618 | * anew. Other functions will just ignore such inodes, |
1619 | * if appropriate. I_LOCK is used for waiting. | 1619 | * if appropriate. I_LOCK is used for waiting. |
1620 | * | 1620 | * |
1621 | * I_LOCK Serves as both a mutex and completion notification. | 1621 | * I_LOCK Serves as both a mutex and completion notification. |
1622 | * New inodes set I_LOCK. If two processes both create | 1622 | * New inodes set I_LOCK. If two processes both create |
1623 | * the same inode, one of them will release its inode and | 1623 | * the same inode, one of them will release its inode and |
1624 | * wait for I_LOCK to be released before returning. | 1624 | * wait for I_LOCK to be released before returning. |
1625 | * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can | 1625 | * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can |
1626 | * also cause waiting on I_LOCK, without I_LOCK actually | 1626 | * also cause waiting on I_LOCK, without I_LOCK actually |
1627 | * being set. find_inode() uses this to prevent returning | 1627 | * being set. find_inode() uses this to prevent returning |
1628 | * nearly-dead inodes. | 1628 | * nearly-dead inodes. |
1629 | * I_SYNC Similar to I_LOCK, but limited in scope to writeback | 1629 | * I_SYNC Similar to I_LOCK, but limited in scope to writeback |
1630 | * of inode dirty data. Having a separate lock for this | 1630 | * of inode dirty data. Having a separate lock for this |
1631 | * purpose reduces latency and prevents some filesystem- | 1631 | * purpose reduces latency and prevents some filesystem- |
1632 | * specific deadlocks. | 1632 | * specific deadlocks. |
1633 | * | 1633 | * |
1634 | * Q: What is the difference between I_WILL_FREE and I_FREEING? | 1634 | * Q: What is the difference between I_WILL_FREE and I_FREEING? |
1635 | * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on | 1635 | * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on |
1636 | * I_CLEAR? If not, why? | 1636 | * I_CLEAR? If not, why? |
1637 | */ | 1637 | */ |
1638 | #define I_DIRTY_SYNC 1 | 1638 | #define I_DIRTY_SYNC 1 |
1639 | #define I_DIRTY_DATASYNC 2 | 1639 | #define I_DIRTY_DATASYNC 2 |
1640 | #define I_DIRTY_PAGES 4 | 1640 | #define I_DIRTY_PAGES 4 |
1641 | #define I_NEW 8 | 1641 | #define I_NEW 8 |
1642 | #define I_WILL_FREE 16 | 1642 | #define I_WILL_FREE 16 |
1643 | #define I_FREEING 32 | 1643 | #define I_FREEING 32 |
1644 | #define I_CLEAR 64 | 1644 | #define I_CLEAR 64 |
1645 | #define __I_LOCK 7 | 1645 | #define __I_LOCK 7 |
1646 | #define I_LOCK (1 << __I_LOCK) | 1646 | #define I_LOCK (1 << __I_LOCK) |
1647 | #define __I_SYNC 8 | 1647 | #define __I_SYNC 8 |
1648 | #define I_SYNC (1 << __I_SYNC) | 1648 | #define I_SYNC (1 << __I_SYNC) |
1649 | 1649 | ||
1650 | #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) | 1650 | #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) |
1651 | 1651 | ||
1652 | extern void __mark_inode_dirty(struct inode *, int); | 1652 | extern void __mark_inode_dirty(struct inode *, int); |
1653 | static inline void mark_inode_dirty(struct inode *inode) | 1653 | static inline void mark_inode_dirty(struct inode *inode) |
1654 | { | 1654 | { |
1655 | __mark_inode_dirty(inode, I_DIRTY); | 1655 | __mark_inode_dirty(inode, I_DIRTY); |
1656 | } | 1656 | } |
1657 | 1657 | ||
1658 | static inline void mark_inode_dirty_sync(struct inode *inode) | 1658 | static inline void mark_inode_dirty_sync(struct inode *inode) |
1659 | { | 1659 | { |
1660 | __mark_inode_dirty(inode, I_DIRTY_SYNC); | 1660 | __mark_inode_dirty(inode, I_DIRTY_SYNC); |
1661 | } | 1661 | } |
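In practice the distinction between the dirty flags matters mainly for fdatasync(). A minimal sketch of the two helper calls a filesystem would make (inode is assumed to be a valid, referenced inode):

    /* data and mtime changed: must be written even by fdatasync() */
    mark_inode_dirty(inode);	/* I_DIRTY_SYNC|I_DIRTY_DATASYNC|I_DIRTY_PAGES */

    /* atime-only update: fdatasync() may legitimately skip it */
    mark_inode_dirty_sync(inode);	/* I_DIRTY_SYNC only */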
1662 | 1662 | ||
1663 | /** | 1663 | /** |
1664 | * inc_nlink - directly increment an inode's link count | 1664 | * inc_nlink - directly increment an inode's link count |
1665 | * @inode: inode | 1665 | * @inode: inode |
1666 | * | 1666 | * |
1667 | * This is a low-level filesystem helper to replace any | 1667 | * This is a low-level filesystem helper to replace any |
1668 | * direct filesystem manipulation of i_nlink. Currently, | 1668 | * direct filesystem manipulation of i_nlink. Currently, |
1669 | * it is only here for parity with dec_nlink(). | 1669 | * it is only here for parity with dec_nlink(). |
1670 | */ | 1670 | */ |
1671 | static inline void inc_nlink(struct inode *inode) | 1671 | static inline void inc_nlink(struct inode *inode) |
1672 | { | 1672 | { |
1673 | inode->i_nlink++; | 1673 | inode->i_nlink++; |
1674 | } | 1674 | } |
1675 | 1675 | ||
1676 | static inline void inode_inc_link_count(struct inode *inode) | 1676 | static inline void inode_inc_link_count(struct inode *inode) |
1677 | { | 1677 | { |
1678 | inc_nlink(inode); | 1678 | inc_nlink(inode); |
1679 | mark_inode_dirty(inode); | 1679 | mark_inode_dirty(inode); |
1680 | } | 1680 | } |
1681 | 1681 | ||
1682 | /** | 1682 | /** |
1683 | * drop_nlink - directly drop an inode's link count | 1683 | * drop_nlink - directly drop an inode's link count |
1684 | * @inode: inode | 1684 | * @inode: inode |
1685 | * | 1685 | * |
1686 | * This is a low-level filesystem helper to replace any | 1686 | * This is a low-level filesystem helper to replace any |
1687 | * direct filesystem manipulation of i_nlink. In cases | 1687 | * direct filesystem manipulation of i_nlink. In cases |
1688 | * where we are attempting to track writes to the | 1688 | * where we are attempting to track writes to the |
1689 | * filesystem, a decrement to zero means an imminent | 1689 | * filesystem, a decrement to zero means an imminent |
1690 | * write when the file is truncated and actually unlinked | 1690 | * write when the file is truncated and actually unlinked |
1691 | * on the filesystem. | 1691 | * on the filesystem. |
1692 | */ | 1692 | */ |
1693 | static inline void drop_nlink(struct inode *inode) | 1693 | static inline void drop_nlink(struct inode *inode) |
1694 | { | 1694 | { |
1695 | inode->i_nlink--; | 1695 | inode->i_nlink--; |
1696 | } | 1696 | } |
1697 | 1697 | ||
1698 | /** | 1698 | /** |
1699 | * clear_nlink - directly zero an inode's link count | 1699 | * clear_nlink - directly zero an inode's link count |
1700 | * @inode: inode | 1700 | * @inode: inode |
1701 | * | 1701 | * |
1702 | * This is a low-level filesystem helper to replace any | 1702 | * This is a low-level filesystem helper to replace any |
1703 | * direct filesystem manipulation of i_nlink. See | 1703 | * direct filesystem manipulation of i_nlink. See |
1704 | * drop_nlink() for why we care about i_nlink hitting zero. | 1704 | * drop_nlink() for why we care about i_nlink hitting zero. |
1705 | */ | 1705 | */ |
1706 | static inline void clear_nlink(struct inode *inode) | 1706 | static inline void clear_nlink(struct inode *inode) |
1707 | { | 1707 | { |
1708 | inode->i_nlink = 0; | 1708 | inode->i_nlink = 0; |
1709 | } | 1709 | } |
1710 | 1710 | ||
1711 | static inline void inode_dec_link_count(struct inode *inode) | 1711 | static inline void inode_dec_link_count(struct inode *inode) |
1712 | { | 1712 | { |
1713 | drop_nlink(inode); | 1713 | drop_nlink(inode); |
1714 | mark_inode_dirty(inode); | 1714 | mark_inode_dirty(inode); |
1715 | } | 1715 | } |
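As a sketch of why these helpers exist, here is how a simple filesystem's link() and unlink() methods of this era would use them instead of poking i_nlink directly; the examplefs_* names are hypothetical:

    static int examplefs_link(struct dentry *old_dentry, struct inode *dir,
    			      struct dentry *dentry)
    {
    	struct inode *inode = old_dentry->d_inode;

    	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
    	inode_inc_link_count(inode);	/* i_nlink++ and mark the inode dirty */
    	atomic_inc(&inode->i_count);	/* the new dentry holds a reference */
    	d_instantiate(dentry, inode);
    	return 0;
    }

    static int examplefs_unlink(struct inode *dir, struct dentry *dentry)
    {
    	inode_dec_link_count(dentry->d_inode);	/* may drop i_nlink to zero */
    	return 0;
    }

A real unlink() would also remove the directory entry first; dropping the link count to zero is what later triggers the on-disk truncate described in the comment above.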
1716 | 1716 | ||
1717 | /** | 1717 | /** |
1718 | * inode_inc_iversion - increments i_version | 1718 | * inode_inc_iversion - increments i_version |
1719 | * @inode: inode that needs to be updated | 1719 | * @inode: inode that needs to be updated |
1720 | * | 1720 | * |
1721 | * Every time the inode is modified, the i_version field will be incremented. | 1721 | * Every time the inode is modified, the i_version field will be incremented. |
1722 | * The filesystem has to be mounted with the i_version flag. | 1722 | * The filesystem has to be mounted with the i_version flag. |
1723 | */ | 1723 | */ |
1724 | 1724 | ||
1725 | static inline void inode_inc_iversion(struct inode *inode) | 1725 | static inline void inode_inc_iversion(struct inode *inode) |
1726 | { | 1726 | { |
1727 | spin_lock(&inode->i_lock); | 1727 | spin_lock(&inode->i_lock); |
1728 | inode->i_version++; | 1728 | inode->i_version++; |
1729 | spin_unlock(&inode->i_lock); | 1729 | spin_unlock(&inode->i_lock); |
1730 | } | 1730 | } |
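A hedged usage sketch: a directory-modifying operation bumps i_version so NFS-style clients can detect the change, then dirties the inode so the bump reaches disk:

    inode_inc_iversion(dir);	/* only meaningful on an i_version mount */
    mark_inode_dirty_sync(dir);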
1731 | 1731 | ||
1732 | extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); | 1732 | extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); |
1733 | static inline void file_accessed(struct file *file) | 1733 | static inline void file_accessed(struct file *file) |
1734 | { | 1734 | { |
1735 | if (!(file->f_flags & O_NOATIME)) | 1735 | if (!(file->f_flags & O_NOATIME)) |
1736 | touch_atime(file->f_path.mnt, file->f_path.dentry); | 1736 | touch_atime(file->f_path.mnt, file->f_path.dentry); |
1737 | } | 1737 | } |
1738 | 1738 | ||
1739 | int sync_inode(struct inode *inode, struct writeback_control *wbc); | 1739 | int sync_inode(struct inode *inode, struct writeback_control *wbc); |
1740 | 1740 | ||
1741 | struct file_system_type { | 1741 | struct file_system_type { |
1742 | const char *name; | 1742 | const char *name; |
1743 | int fs_flags; | 1743 | int fs_flags; |
1744 | int (*get_sb) (struct file_system_type *, int, | 1744 | int (*get_sb) (struct file_system_type *, int, |
1745 | const char *, void *, struct vfsmount *); | 1745 | const char *, void *, struct vfsmount *); |
1746 | void (*kill_sb) (struct super_block *); | 1746 | void (*kill_sb) (struct super_block *); |
1747 | struct module *owner; | 1747 | struct module *owner; |
1748 | struct file_system_type * next; | 1748 | struct file_system_type * next; |
1749 | struct list_head fs_supers; | 1749 | struct list_head fs_supers; |
1750 | 1750 | ||
1751 | struct lock_class_key s_lock_key; | 1751 | struct lock_class_key s_lock_key; |
1752 | struct lock_class_key s_umount_key; | 1752 | struct lock_class_key s_umount_key; |
1753 | 1753 | ||
1754 | struct lock_class_key i_lock_key; | 1754 | struct lock_class_key i_lock_key; |
1755 | struct lock_class_key i_mutex_key; | 1755 | struct lock_class_key i_mutex_key; |
1756 | struct lock_class_key i_mutex_dir_key; | 1756 | struct lock_class_key i_mutex_dir_key; |
1757 | struct lock_class_key i_alloc_sem_key; | 1757 | struct lock_class_key i_alloc_sem_key; |
1758 | }; | 1758 | }; |
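Tying the pieces together, a block-device-backed filesystem of this era wires get_sb to get_sb_bdev() (declared just below) and registers itself at module init. A minimal sketch; examplefs_fill_super is an assumed helper that reads the superblock off disk:

    static int examplefs_get_sb(struct file_system_type *fs_type, int flags,
    			    const char *dev_name, void *data,
    			    struct vfsmount *mnt)
    {
    	return get_sb_bdev(fs_type, flags, dev_name, data,
    			   examplefs_fill_super, mnt);
    }

    static struct file_system_type examplefs_fs_type = {
    	.owner		= THIS_MODULE,
    	.name		= "examplefs",
    	.get_sb		= examplefs_get_sb,
    	.kill_sb	= kill_block_super,	/* undoes get_sb_bdev() */
    	.fs_flags	= FS_REQUIRES_DEV,
    };

    static int __init examplefs_init(void)
    {
    	return register_filesystem(&examplefs_fs_type);
    }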
1759 | 1759 | ||
1760 | extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, | 1760 | extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, |
1761 | int (*fill_super)(struct super_block *, void *, int), | 1761 | int (*fill_super)(struct super_block *, void *, int), |
1762 | struct vfsmount *mnt); | 1762 | struct vfsmount *mnt); |
1763 | extern int get_sb_bdev(struct file_system_type *fs_type, | 1763 | extern int get_sb_bdev(struct file_system_type *fs_type, |
1764 | int flags, const char *dev_name, void *data, | 1764 | int flags, const char *dev_name, void *data, |
1765 | int (*fill_super)(struct super_block *, void *, int), | 1765 | int (*fill_super)(struct super_block *, void *, int), |
1766 | struct vfsmount *mnt); | 1766 | struct vfsmount *mnt); |
1767 | extern int get_sb_single(struct file_system_type *fs_type, | 1767 | extern int get_sb_single(struct file_system_type *fs_type, |
1768 | int flags, void *data, | 1768 | int flags, void *data, |
1769 | int (*fill_super)(struct super_block *, void *, int), | 1769 | int (*fill_super)(struct super_block *, void *, int), |
1770 | struct vfsmount *mnt); | 1770 | struct vfsmount *mnt); |
1771 | extern int get_sb_nodev(struct file_system_type *fs_type, | 1771 | extern int get_sb_nodev(struct file_system_type *fs_type, |
1772 | int flags, void *data, | 1772 | int flags, void *data, |
1773 | int (*fill_super)(struct super_block *, void *, int), | 1773 | int (*fill_super)(struct super_block *, void *, int), |
1774 | struct vfsmount *mnt); | 1774 | struct vfsmount *mnt); |
1775 | void generic_shutdown_super(struct super_block *sb); | 1775 | void generic_shutdown_super(struct super_block *sb); |
1776 | void kill_block_super(struct super_block *sb); | 1776 | void kill_block_super(struct super_block *sb); |
1777 | void kill_anon_super(struct super_block *sb); | 1777 | void kill_anon_super(struct super_block *sb); |
1778 | void kill_litter_super(struct super_block *sb); | 1778 | void kill_litter_super(struct super_block *sb); |
1779 | void deactivate_super(struct super_block *sb); | 1779 | void deactivate_super(struct super_block *sb); |
1780 | void deactivate_locked_super(struct super_block *sb); | 1780 | void deactivate_locked_super(struct super_block *sb); |
1781 | int set_anon_super(struct super_block *s, void *data); | 1781 | int set_anon_super(struct super_block *s, void *data); |
1782 | struct super_block *sget(struct file_system_type *type, | 1782 | struct super_block *sget(struct file_system_type *type, |
1783 | int (*test)(struct super_block *,void *), | 1783 | int (*test)(struct super_block *,void *), |
1784 | int (*set)(struct super_block *,void *), | 1784 | int (*set)(struct super_block *,void *), |
1785 | void *data); | 1785 | void *data); |
1786 | extern int get_sb_pseudo(struct file_system_type *, char *, | 1786 | extern int get_sb_pseudo(struct file_system_type *, char *, |
1787 | const struct super_operations *ops, unsigned long, | 1787 | const struct super_operations *ops, unsigned long, |
1788 | struct vfsmount *mnt); | 1788 | struct vfsmount *mnt); |
1789 | extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); | 1789 | extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); |
1790 | int __put_super_and_need_restart(struct super_block *sb); | 1790 | int __put_super_and_need_restart(struct super_block *sb); |
1791 | 1791 | ||
1792 | /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ | 1792 | /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ |
1793 | #define fops_get(fops) \ | 1793 | #define fops_get(fops) \ |
1794 | (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) | 1794 | (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) |
1795 | #define fops_put(fops) \ | 1795 | #define fops_put(fops) \ |
1796 | do { if (fops) module_put((fops)->owner); } while(0) | 1796 | do { if (fops) module_put((fops)->owner); } while(0) |
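The fops_get()/fops_put() pair pins the module that owns a file_operations table for as long as a file uses it; character-device open paths use exactly this pattern. A hedged sketch, where cdev_fops stands in for whatever table was looked up:

    filp->f_op = fops_get(cdev_fops);	/* NULL if the owning module is gone */
    if (!filp->f_op)
    	return -ENXIO;
    /* ... file is in use ... */
    fops_put(filp->f_op);		/* on release: drop the module reference */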
1797 | 1797 | ||
1798 | extern int register_filesystem(struct file_system_type *); | 1798 | extern int register_filesystem(struct file_system_type *); |
1799 | extern int unregister_filesystem(struct file_system_type *); | 1799 | extern int unregister_filesystem(struct file_system_type *); |
1800 | extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); | 1800 | extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); |
1801 | #define kern_mount(type) kern_mount_data(type, NULL) | 1801 | #define kern_mount(type) kern_mount_data(type, NULL) |
1802 | extern int may_umount_tree(struct vfsmount *); | 1802 | extern int may_umount_tree(struct vfsmount *); |
1803 | extern int may_umount(struct vfsmount *); | 1803 | extern int may_umount(struct vfsmount *); |
1804 | extern long do_mount(char *, char *, char *, unsigned long, void *); | 1804 | extern long do_mount(char *, char *, char *, unsigned long, void *); |
1805 | extern struct vfsmount *collect_mounts(struct path *); | 1805 | extern struct vfsmount *collect_mounts(struct path *); |
1806 | extern void drop_collected_mounts(struct vfsmount *); | 1806 | extern void drop_collected_mounts(struct vfsmount *); |
1807 | 1807 | ||
1808 | extern int vfs_statfs(struct dentry *, struct kstatfs *); | 1808 | extern int vfs_statfs(struct dentry *, struct kstatfs *); |
1809 | 1809 | ||
1810 | extern int current_umask(void); | 1810 | extern int current_umask(void); |
1811 | 1811 | ||
1812 | /* /sys/fs */ | 1812 | /* /sys/fs */ |
1813 | extern struct kobject *fs_kobj; | 1813 | extern struct kobject *fs_kobj; |
1814 | 1814 | ||
1815 | extern int rw_verify_area(int, struct file *, loff_t *, size_t); | 1815 | extern int rw_verify_area(int, struct file *, loff_t *, size_t); |
1816 | 1816 | ||
1817 | #define FLOCK_VERIFY_READ 1 | 1817 | #define FLOCK_VERIFY_READ 1 |
1818 | #define FLOCK_VERIFY_WRITE 2 | 1818 | #define FLOCK_VERIFY_WRITE 2 |
1819 | 1819 | ||
1820 | #ifdef CONFIG_FILE_LOCKING | 1820 | #ifdef CONFIG_FILE_LOCKING |
1821 | extern int locks_mandatory_locked(struct inode *); | 1821 | extern int locks_mandatory_locked(struct inode *); |
1822 | extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); | 1822 | extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); |
1823 | 1823 | ||
1824 | /* | 1824 | /* |
1825 | * Candidates for mandatory locking have the setgid bit set | 1825 | * Candidates for mandatory locking have the setgid bit set |
1826 | * but no group execute bit - an otherwise meaningless combination. | 1826 | * but no group execute bit - an otherwise meaningless combination. |
1827 | */ | 1827 | */ |
1828 | 1828 | ||
1829 | static inline int __mandatory_lock(struct inode *ino) | 1829 | static inline int __mandatory_lock(struct inode *ino) |
1830 | { | 1830 | { |
1831 | return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; | 1831 | return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; |
1832 | } | 1832 | } |
1833 | 1833 | ||
1834 | /* | 1834 | /* |
1835 | * ... and these candidates should be on MS_MANDLOCK mounted fs, | 1835 | * ... and these candidates should be on MS_MANDLOCK mounted fs, |
1836 | * otherwise these will be advisory locks | 1836 | * otherwise these will be advisory locks |
1837 | */ | 1837 | */ |
1838 | 1838 | ||
1839 | static inline int mandatory_lock(struct inode *ino) | 1839 | static inline int mandatory_lock(struct inode *ino) |
1840 | { | 1840 | { |
1841 | return IS_MANDLOCK(ino) && __mandatory_lock(ino); | 1841 | return IS_MANDLOCK(ino) && __mandatory_lock(ino); |
1842 | } | 1842 | } |
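Concretely, the candidate encoding means a mode like 02644 on a regular file: setgid on, group execute off. A small sketch of the bit test:

    inode->i_mode = S_IFREG | S_ISGID | 0644;	/* S_IXGRP deliberately clear */
    /* (i_mode & (S_ISGID | S_IXGRP)) == S_ISGID, so __mandatory_lock() is   */
    /* true; mandatory_lock() additionally requires an MS_MANDLOCK mount.    */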
1843 | 1843 | ||
1844 | static inline int locks_verify_locked(struct inode *inode) | 1844 | static inline int locks_verify_locked(struct inode *inode) |
1845 | { | 1845 | { |
1846 | if (mandatory_lock(inode)) | 1846 | if (mandatory_lock(inode)) |
1847 | return locks_mandatory_locked(inode); | 1847 | return locks_mandatory_locked(inode); |
1848 | return 0; | 1848 | return 0; |
1849 | } | 1849 | } |
1850 | 1850 | ||
1851 | static inline int locks_verify_truncate(struct inode *inode, | 1851 | static inline int locks_verify_truncate(struct inode *inode, |
1852 | struct file *filp, | 1852 | struct file *filp, |
1853 | loff_t size) | 1853 | loff_t size) |
1854 | { | 1854 | { |
1855 | if (inode->i_flock && mandatory_lock(inode)) | 1855 | if (inode->i_flock && mandatory_lock(inode)) |
1856 | return locks_mandatory_area( | 1856 | return locks_mandatory_area( |
1857 | FLOCK_VERIFY_WRITE, inode, filp, | 1857 | FLOCK_VERIFY_WRITE, inode, filp, |
1858 | size < inode->i_size ? size : inode->i_size, | 1858 | size < inode->i_size ? size : inode->i_size, |
1859 | (size < inode->i_size ? inode->i_size - size | 1859 | (size < inode->i_size ? inode->i_size - size |
1860 | : size - inode->i_size) | 1860 | : size - inode->i_size) |
1861 | ); | 1861 | ); |
1862 | return 0; | 1862 | return 0; |
1863 | } | 1863 | } |
1864 | 1864 | ||
1865 | static inline int break_lease(struct inode *inode, unsigned int mode) | 1865 | static inline int break_lease(struct inode *inode, unsigned int mode) |
1866 | { | 1866 | { |
1867 | if (inode->i_flock) | 1867 | if (inode->i_flock) |
1868 | return __break_lease(inode, mode); | 1868 | return __break_lease(inode, mode); |
1869 | return 0; | 1869 | return 0; |
1870 | } | 1870 | } |
1871 | #else /* !CONFIG_FILE_LOCKING */ | 1871 | #else /* !CONFIG_FILE_LOCKING */ |
1872 | static inline int locks_mandatory_locked(struct inode *inode) | 1872 | static inline int locks_mandatory_locked(struct inode *inode) |
1873 | { | 1873 | { |
1874 | return 0; | 1874 | return 0; |
1875 | } | 1875 | } |
1876 | 1876 | ||
1877 | static inline int locks_mandatory_area(int rw, struct inode *inode, | 1877 | static inline int locks_mandatory_area(int rw, struct inode *inode, |
1878 | struct file *filp, loff_t offset, | 1878 | struct file *filp, loff_t offset, |
1879 | size_t count) | 1879 | size_t count) |
1880 | { | 1880 | { |
1881 | return 0; | 1881 | return 0; |
1882 | } | 1882 | } |
1883 | 1883 | ||
1884 | static inline int __mandatory_lock(struct inode *inode) | 1884 | static inline int __mandatory_lock(struct inode *inode) |
1885 | { | 1885 | { |
1886 | return 0; | 1886 | return 0; |
1887 | } | 1887 | } |
1888 | 1888 | ||
1889 | static inline int mandatory_lock(struct inode *inode) | 1889 | static inline int mandatory_lock(struct inode *inode) |
1890 | { | 1890 | { |
1891 | return 0; | 1891 | return 0; |
1892 | } | 1892 | } |
1893 | 1893 | ||
1894 | static inline int locks_verify_locked(struct inode *inode) | 1894 | static inline int locks_verify_locked(struct inode *inode) |
1895 | { | 1895 | { |
1896 | return 0; | 1896 | return 0; |
1897 | } | 1897 | } |
1898 | 1898 | ||
1899 | static inline int locks_verify_truncate(struct inode *inode, struct file *filp, | 1899 | static inline int locks_verify_truncate(struct inode *inode, struct file *filp, |
1900 | size_t size) | 1900 | size_t size) |
1901 | { | 1901 | { |
1902 | return 0; | 1902 | return 0; |
1903 | } | 1903 | } |
1904 | 1904 | ||
1905 | static inline int break_lease(struct inode *inode, unsigned int mode) | 1905 | static inline int break_lease(struct inode *inode, unsigned int mode) |
1906 | { | 1906 | { |
1907 | return 0; | 1907 | return 0; |
1908 | } | 1908 | } |
1909 | 1909 | ||
1910 | #endif /* CONFIG_FILE_LOCKING */ | 1910 | #endif /* CONFIG_FILE_LOCKING */ |
1911 | 1911 | ||
1912 | /* fs/open.c */ | 1912 | /* fs/open.c */ |
1913 | 1913 | ||
1914 | extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, | 1914 | extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, |
1915 | struct file *filp); | 1915 | struct file *filp); |
1916 | extern int do_fallocate(struct file *file, int mode, loff_t offset, | 1916 | extern int do_fallocate(struct file *file, int mode, loff_t offset, |
1917 | loff_t len); | 1917 | loff_t len); |
1918 | extern long do_sys_open(int dfd, const char __user *filename, int flags, | 1918 | extern long do_sys_open(int dfd, const char __user *filename, int flags, |
1919 | int mode); | 1919 | int mode); |
1920 | extern struct file *filp_open(const char *, int, int); | 1920 | extern struct file *filp_open(const char *, int, int); |
1921 | extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, | 1921 | extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, |
1922 | const struct cred *); | 1922 | const struct cred *); |
1923 | extern int filp_close(struct file *, fl_owner_t id); | 1923 | extern int filp_close(struct file *, fl_owner_t id); |
1924 | extern char * getname(const char __user *); | 1924 | extern char * getname(const char __user *); |
1925 | 1925 | ||
1926 | /* fs/ioctl.c */ | 1926 | /* fs/ioctl.c */ |
1927 | 1927 | ||
1928 | extern int ioctl_preallocate(struct file *filp, void __user *argp); | 1928 | extern int ioctl_preallocate(struct file *filp, void __user *argp); |
1929 | 1929 | ||
1930 | /* fs/dcache.c */ | 1930 | /* fs/dcache.c */ |
1931 | extern void __init vfs_caches_init_early(void); | 1931 | extern void __init vfs_caches_init_early(void); |
1932 | extern void __init vfs_caches_init(unsigned long); | 1932 | extern void __init vfs_caches_init(unsigned long); |
1933 | 1933 | ||
1934 | extern struct kmem_cache *names_cachep; | 1934 | extern struct kmem_cache *names_cachep; |
1935 | 1935 | ||
1936 | #define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp)) | 1936 | #define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp)) |
1937 | #define __getname() __getname_gfp(GFP_KERNEL) | 1937 | #define __getname() __getname_gfp(GFP_KERNEL) |
1938 | #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) | 1938 | #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) |
1939 | #ifndef CONFIG_AUDITSYSCALL | 1939 | #ifndef CONFIG_AUDITSYSCALL |
1940 | #define putname(name) __putname(name) | 1940 | #define putname(name) __putname(name) |
1941 | #else | 1941 | #else |
1942 | extern void putname(const char *name); | 1942 | extern void putname(const char *name); |
1943 | #endif | 1943 | #endif |
1944 | 1944 | ||
1945 | #ifdef CONFIG_BLOCK | 1945 | #ifdef CONFIG_BLOCK |
1946 | extern int register_blkdev(unsigned int, const char *); | 1946 | extern int register_blkdev(unsigned int, const char *); |
1947 | extern void unregister_blkdev(unsigned int, const char *); | 1947 | extern void unregister_blkdev(unsigned int, const char *); |
1948 | extern struct block_device *bdget(dev_t); | 1948 | extern struct block_device *bdget(dev_t); |
1949 | extern struct block_device *bdgrab(struct block_device *bdev); | ||
1949 | extern void bd_set_size(struct block_device *, loff_t size); | 1950 | extern void bd_set_size(struct block_device *, loff_t size); |
1950 | extern void bd_forget(struct inode *inode); | 1951 | extern void bd_forget(struct inode *inode); |
1951 | extern void bdput(struct block_device *); | 1952 | extern void bdput(struct block_device *); |
1952 | extern struct block_device *open_by_devnum(dev_t, fmode_t); | 1953 | extern struct block_device *open_by_devnum(dev_t, fmode_t); |
1953 | extern void invalidate_bdev(struct block_device *); | 1954 | extern void invalidate_bdev(struct block_device *); |
1954 | extern int sync_blockdev(struct block_device *bdev); | 1955 | extern int sync_blockdev(struct block_device *bdev); |
1955 | extern struct super_block *freeze_bdev(struct block_device *); | 1956 | extern struct super_block *freeze_bdev(struct block_device *); |
1956 | extern void emergency_thaw_all(void); | 1957 | extern void emergency_thaw_all(void); |
1957 | extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); | 1958 | extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); |
1958 | extern int fsync_bdev(struct block_device *); | 1959 | extern int fsync_bdev(struct block_device *); |
1959 | #else | 1960 | #else |
1960 | static inline void bd_forget(struct inode *inode) {} | 1961 | static inline void bd_forget(struct inode *inode) {} |
1961 | static inline int sync_blockdev(struct block_device *bdev) { return 0; } | 1962 | static inline int sync_blockdev(struct block_device *bdev) { return 0; } |
1962 | static inline void invalidate_bdev(struct block_device *bdev) {} | 1963 | static inline void invalidate_bdev(struct block_device *bdev) {} |
1963 | 1964 | ||
1964 | static inline struct super_block *freeze_bdev(struct block_device *sb) | 1965 | static inline struct super_block *freeze_bdev(struct block_device *sb) |
1965 | { | 1966 | { |
1966 | return NULL; | 1967 | return NULL; |
1967 | } | 1968 | } |
1968 | 1969 | ||
1969 | static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) | 1970 | static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) |
1970 | { | 1971 | { |
1971 | return 0; | 1972 | return 0; |
1972 | } | 1973 | } |
1973 | #endif | 1974 | #endif |
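The newly declared bdgrab() only bumps the reference count on a block_device the caller already holds, so unlike bdget() it cannot sleep and is usable under a spinlock. A hedged sketch of the intended pattern (some_lock and held_bdev are hypothetical):

    struct block_device *bdev;

    spin_lock(&some_lock);		/* bdget() would be unsafe here */
    bdev = bdgrab(held_bdev);		/* copy a reference we already own */
    spin_unlock(&some_lock);

    /* ... use bdev from any context ... */
    bdput(bdev);			/* drop the copied reference */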
1974 | extern int sync_filesystem(struct super_block *); | 1975 | extern int sync_filesystem(struct super_block *); |
1975 | extern const struct file_operations def_blk_fops; | 1976 | extern const struct file_operations def_blk_fops; |
1976 | extern const struct file_operations def_chr_fops; | 1977 | extern const struct file_operations def_chr_fops; |
1977 | extern const struct file_operations bad_sock_fops; | 1978 | extern const struct file_operations bad_sock_fops; |
1978 | extern const struct file_operations def_fifo_fops; | 1979 | extern const struct file_operations def_fifo_fops; |
1979 | #ifdef CONFIG_BLOCK | 1980 | #ifdef CONFIG_BLOCK |
1980 | extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); | 1981 | extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); |
1981 | extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); | 1982 | extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); |
1982 | extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); | 1983 | extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); |
1983 | extern int blkdev_get(struct block_device *, fmode_t); | 1984 | extern int blkdev_get(struct block_device *, fmode_t); |
1984 | extern int blkdev_put(struct block_device *, fmode_t); | 1985 | extern int blkdev_put(struct block_device *, fmode_t); |
1985 | extern int bd_claim(struct block_device *, void *); | 1986 | extern int bd_claim(struct block_device *, void *); |
1986 | extern void bd_release(struct block_device *); | 1987 | extern void bd_release(struct block_device *); |
1987 | #ifdef CONFIG_SYSFS | 1988 | #ifdef CONFIG_SYSFS |
1988 | extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); | 1989 | extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); |
1989 | extern void bd_release_from_disk(struct block_device *, struct gendisk *); | 1990 | extern void bd_release_from_disk(struct block_device *, struct gendisk *); |
1990 | #else | 1991 | #else |
1991 | #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) | 1992 | #define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) |
1992 | #define bd_release_from_disk(bdev, disk) bd_release(bdev) | 1993 | #define bd_release_from_disk(bdev, disk) bd_release(bdev) |
1993 | #endif | 1994 | #endif |
1994 | #endif | 1995 | #endif |
1995 | 1996 | ||
1996 | /* fs/char_dev.c */ | 1997 | /* fs/char_dev.c */ |
1997 | #define CHRDEV_MAJOR_HASH_SIZE 255 | 1998 | #define CHRDEV_MAJOR_HASH_SIZE 255 |
1998 | extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); | 1999 | extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); |
1999 | extern int register_chrdev_region(dev_t, unsigned, const char *); | 2000 | extern int register_chrdev_region(dev_t, unsigned, const char *); |
2000 | extern int register_chrdev(unsigned int, const char *, | 2001 | extern int register_chrdev(unsigned int, const char *, |
2001 | const struct file_operations *); | 2002 | const struct file_operations *); |
2002 | extern void unregister_chrdev(unsigned int, const char *); | 2003 | extern void unregister_chrdev(unsigned int, const char *); |
2003 | extern void unregister_chrdev_region(dev_t, unsigned); | 2004 | extern void unregister_chrdev_region(dev_t, unsigned); |
2004 | extern void chrdev_show(struct seq_file *,off_t); | 2005 | extern void chrdev_show(struct seq_file *,off_t); |
2005 | 2006 | ||
2006 | /* fs/block_dev.c */ | 2007 | /* fs/block_dev.c */ |
2007 | #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ | 2008 | #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ |
2008 | #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ | 2009 | #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ |
2009 | 2010 | ||
2010 | #ifdef CONFIG_BLOCK | 2011 | #ifdef CONFIG_BLOCK |
2011 | #define BLKDEV_MAJOR_HASH_SIZE 255 | 2012 | #define BLKDEV_MAJOR_HASH_SIZE 255 |
2012 | extern const char *__bdevname(dev_t, char *buffer); | 2013 | extern const char *__bdevname(dev_t, char *buffer); |
2013 | extern const char *bdevname(struct block_device *bdev, char *buffer); | 2014 | extern const char *bdevname(struct block_device *bdev, char *buffer); |
2014 | extern struct block_device *lookup_bdev(const char *); | 2015 | extern struct block_device *lookup_bdev(const char *); |
2015 | extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); | 2016 | extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); |
2016 | extern void close_bdev_exclusive(struct block_device *, fmode_t); | 2017 | extern void close_bdev_exclusive(struct block_device *, fmode_t); |
2017 | extern void blkdev_show(struct seq_file *,off_t); | 2018 | extern void blkdev_show(struct seq_file *,off_t); |
2018 | 2019 | ||
2019 | #else | 2020 | #else |
2020 | #define BLKDEV_MAJOR_HASH_SIZE 0 | 2021 | #define BLKDEV_MAJOR_HASH_SIZE 0 |
2021 | #endif | 2022 | #endif |
2022 | 2023 | ||
2023 | extern void init_special_inode(struct inode *, umode_t, dev_t); | 2024 | extern void init_special_inode(struct inode *, umode_t, dev_t); |
2024 | 2025 | ||
2025 | /* Invalid inode operations -- fs/bad_inode.c */ | 2026 | /* Invalid inode operations -- fs/bad_inode.c */ |
2026 | extern void make_bad_inode(struct inode *); | 2027 | extern void make_bad_inode(struct inode *); |
2027 | extern int is_bad_inode(struct inode *); | 2028 | extern int is_bad_inode(struct inode *); |
2028 | 2029 | ||
2029 | extern const struct file_operations read_pipefifo_fops; | 2030 | extern const struct file_operations read_pipefifo_fops; |
2030 | extern const struct file_operations write_pipefifo_fops; | 2031 | extern const struct file_operations write_pipefifo_fops; |
2031 | extern const struct file_operations rdwr_pipefifo_fops; | 2032 | extern const struct file_operations rdwr_pipefifo_fops; |
2032 | 2033 | ||
2033 | extern int fs_may_remount_ro(struct super_block *); | 2034 | extern int fs_may_remount_ro(struct super_block *); |
2034 | 2035 | ||
2035 | #ifdef CONFIG_BLOCK | 2036 | #ifdef CONFIG_BLOCK |
2036 | /* | 2037 | /* |
2037 | * return READ, READA, or WRITE | 2038 | * return READ, READA, or WRITE |
2038 | */ | 2039 | */ |
2039 | #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) | 2040 | #define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) |
2040 | 2041 | ||
2041 | /* | 2042 | /* |
2042 | * return data direction, READ or WRITE | 2043 | * return data direction, READ or WRITE |
2043 | */ | 2044 | */ |
2044 | #define bio_data_dir(bio) ((bio)->bi_rw & 1) | 2045 | #define bio_data_dir(bio) ((bio)->bi_rw & 1) |
2045 | 2046 | ||
2046 | extern void check_disk_size_change(struct gendisk *disk, | 2047 | extern void check_disk_size_change(struct gendisk *disk, |
2047 | struct block_device *bdev); | 2048 | struct block_device *bdev); |
2048 | extern int revalidate_disk(struct gendisk *); | 2049 | extern int revalidate_disk(struct gendisk *); |
2049 | extern int check_disk_change(struct block_device *); | 2050 | extern int check_disk_change(struct block_device *); |
2050 | extern int __invalidate_device(struct block_device *); | 2051 | extern int __invalidate_device(struct block_device *); |
2051 | extern int invalidate_partition(struct gendisk *, int); | 2052 | extern int invalidate_partition(struct gendisk *, int); |
2052 | #endif | 2053 | #endif |
2053 | extern int invalidate_inodes(struct super_block *); | 2054 | extern int invalidate_inodes(struct super_block *); |
2054 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | 2055 | unsigned long invalidate_mapping_pages(struct address_space *mapping, |
2055 | pgoff_t start, pgoff_t end); | 2056 | pgoff_t start, pgoff_t end); |
2056 | 2057 | ||
2057 | static inline unsigned long __deprecated | 2058 | static inline unsigned long __deprecated |
2058 | invalidate_inode_pages(struct address_space *mapping) | 2059 | invalidate_inode_pages(struct address_space *mapping) |
2059 | { | 2060 | { |
2060 | return invalidate_mapping_pages(mapping, 0, ~0UL); | 2061 | return invalidate_mapping_pages(mapping, 0, ~0UL); |
2061 | } | 2062 | } |
2062 | 2063 | ||
2063 | static inline void invalidate_remote_inode(struct inode *inode) | 2064 | static inline void invalidate_remote_inode(struct inode *inode) |
2064 | { | 2065 | { |
2065 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 2066 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
2066 | S_ISLNK(inode->i_mode)) | 2067 | S_ISLNK(inode->i_mode)) |
2067 | invalidate_mapping_pages(inode->i_mapping, 0, -1); | 2068 | invalidate_mapping_pages(inode->i_mapping, 0, -1); |
2068 | } | 2069 | } |
2069 | extern int invalidate_inode_pages2(struct address_space *mapping); | 2070 | extern int invalidate_inode_pages2(struct address_space *mapping); |
2070 | extern int invalidate_inode_pages2_range(struct address_space *mapping, | 2071 | extern int invalidate_inode_pages2_range(struct address_space *mapping, |
2071 | pgoff_t start, pgoff_t end); | 2072 | pgoff_t start, pgoff_t end); |
2072 | extern void generic_sync_sb_inodes(struct super_block *sb, | 2073 | extern void generic_sync_sb_inodes(struct super_block *sb, |
2073 | struct writeback_control *wbc); | 2074 | struct writeback_control *wbc); |
2074 | extern int write_inode_now(struct inode *, int); | 2075 | extern int write_inode_now(struct inode *, int); |
2075 | extern int filemap_fdatawrite(struct address_space *); | 2076 | extern int filemap_fdatawrite(struct address_space *); |
2076 | extern int filemap_flush(struct address_space *); | 2077 | extern int filemap_flush(struct address_space *); |
2077 | extern int filemap_fdatawait(struct address_space *); | 2078 | extern int filemap_fdatawait(struct address_space *); |
2078 | extern int filemap_write_and_wait(struct address_space *mapping); | 2079 | extern int filemap_write_and_wait(struct address_space *mapping); |
2079 | extern int filemap_write_and_wait_range(struct address_space *mapping, | 2080 | extern int filemap_write_and_wait_range(struct address_space *mapping, |
2080 | loff_t lstart, loff_t lend); | 2081 | loff_t lstart, loff_t lend); |
2081 | extern int wait_on_page_writeback_range(struct address_space *mapping, | 2082 | extern int wait_on_page_writeback_range(struct address_space *mapping, |
2082 | pgoff_t start, pgoff_t end); | 2083 | pgoff_t start, pgoff_t end); |
2083 | extern int __filemap_fdatawrite_range(struct address_space *mapping, | 2084 | extern int __filemap_fdatawrite_range(struct address_space *mapping, |
2084 | loff_t start, loff_t end, int sync_mode); | 2085 | loff_t start, loff_t end, int sync_mode); |
2085 | extern int filemap_fdatawrite_range(struct address_space *mapping, | 2086 | extern int filemap_fdatawrite_range(struct address_space *mapping, |
2086 | loff_t start, loff_t end); | 2087 | loff_t start, loff_t end); |
2087 | 2088 | ||
2088 | extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); | 2089 | extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); |
2089 | extern void sync_supers(void); | 2090 | extern void sync_supers(void); |
2090 | extern void emergency_sync(void); | 2091 | extern void emergency_sync(void); |
2091 | extern void emergency_remount(void); | 2092 | extern void emergency_remount(void); |
2092 | #ifdef CONFIG_BLOCK | 2093 | #ifdef CONFIG_BLOCK |
2093 | extern sector_t bmap(struct inode *, sector_t); | 2094 | extern sector_t bmap(struct inode *, sector_t); |
2094 | #endif | 2095 | #endif |
2095 | extern int notify_change(struct dentry *, struct iattr *); | 2096 | extern int notify_change(struct dentry *, struct iattr *); |
2096 | extern int inode_permission(struct inode *, int); | 2097 | extern int inode_permission(struct inode *, int); |
2097 | extern int generic_permission(struct inode *, int, | 2098 | extern int generic_permission(struct inode *, int, |
2098 | int (*check_acl)(struct inode *, int)); | 2099 | int (*check_acl)(struct inode *, int)); |
2099 | 2100 | ||
2100 | static inline bool execute_ok(struct inode *inode) | 2101 | static inline bool execute_ok(struct inode *inode) |
2101 | { | 2102 | { |
2102 | return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); | 2103 | return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); |
2103 | } | 2104 | } |
2104 | 2105 | ||
2105 | extern int get_write_access(struct inode *); | 2106 | extern int get_write_access(struct inode *); |
2106 | extern int deny_write_access(struct file *); | 2107 | extern int deny_write_access(struct file *); |
2107 | static inline void put_write_access(struct inode * inode) | 2108 | static inline void put_write_access(struct inode * inode) |
2108 | { | 2109 | { |
2109 | atomic_dec(&inode->i_writecount); | 2110 | atomic_dec(&inode->i_writecount); |
2110 | } | 2111 | } |
2111 | static inline void allow_write_access(struct file *file) | 2112 | static inline void allow_write_access(struct file *file) |
2112 | { | 2113 | { |
2113 | if (file) | 2114 | if (file) |
2114 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); | 2115 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); |
2115 | } | 2116 | } |
2116 | extern int do_pipe_flags(int *, int); | 2117 | extern int do_pipe_flags(int *, int); |
2117 | extern struct file *create_read_pipe(struct file *f, int flags); | 2118 | extern struct file *create_read_pipe(struct file *f, int flags); |
2118 | extern struct file *create_write_pipe(int flags); | 2119 | extern struct file *create_write_pipe(int flags); |
2119 | extern void free_write_pipe(struct file *); | 2120 | extern void free_write_pipe(struct file *); |
2120 | 2121 | ||
2121 | extern struct file *do_filp_open(int dfd, const char *pathname, | 2122 | extern struct file *do_filp_open(int dfd, const char *pathname, |
2122 | int open_flag, int mode, int acc_mode); | 2123 | int open_flag, int mode, int acc_mode); |
2123 | extern int may_open(struct path *, int, int); | 2124 | extern int may_open(struct path *, int, int); |
2124 | 2125 | ||
2125 | extern int kernel_read(struct file *, unsigned long, char *, unsigned long); | 2126 | extern int kernel_read(struct file *, unsigned long, char *, unsigned long); |
2126 | extern struct file * open_exec(const char *); | 2127 | extern struct file * open_exec(const char *); |
2127 | 2128 | ||
2128 | /* fs/dcache.c -- generic fs support functions */ | 2129 | /* fs/dcache.c -- generic fs support functions */ |
2129 | extern int is_subdir(struct dentry *, struct dentry *); | 2130 | extern int is_subdir(struct dentry *, struct dentry *); |
2130 | extern ino_t find_inode_number(struct dentry *, struct qstr *); | 2131 | extern ino_t find_inode_number(struct dentry *, struct qstr *); |
2131 | 2132 | ||
2132 | #include <linux/err.h> | 2133 | #include <linux/err.h> |
2133 | 2134 | ||
2134 | /* needed for stackable file system support */ | 2135 | /* needed for stackable file system support */ |
2135 | extern loff_t default_llseek(struct file *file, loff_t offset, int origin); | 2136 | extern loff_t default_llseek(struct file *file, loff_t offset, int origin); |
2136 | 2137 | ||
2137 | extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); | 2138 | extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); |
2138 | 2139 | ||
2139 | extern struct inode * inode_init_always(struct super_block *, struct inode *); | 2140 | extern struct inode * inode_init_always(struct super_block *, struct inode *); |
2140 | extern void inode_init_once(struct inode *); | 2141 | extern void inode_init_once(struct inode *); |
2141 | extern void inode_add_to_lists(struct super_block *, struct inode *); | 2142 | extern void inode_add_to_lists(struct super_block *, struct inode *); |
2142 | extern void iput(struct inode *); | 2143 | extern void iput(struct inode *); |
2143 | extern struct inode * igrab(struct inode *); | 2144 | extern struct inode * igrab(struct inode *); |
2144 | extern ino_t iunique(struct super_block *, ino_t); | 2145 | extern ino_t iunique(struct super_block *, ino_t); |
2145 | extern int inode_needs_sync(struct inode *inode); | 2146 | extern int inode_needs_sync(struct inode *inode); |
2146 | extern void generic_delete_inode(struct inode *inode); | 2147 | extern void generic_delete_inode(struct inode *inode); |
2147 | extern void generic_drop_inode(struct inode *inode); | 2148 | extern void generic_drop_inode(struct inode *inode); |
2148 | 2149 | ||
2149 | extern struct inode *ilookup5_nowait(struct super_block *sb, | 2150 | extern struct inode *ilookup5_nowait(struct super_block *sb, |
2150 | unsigned long hashval, int (*test)(struct inode *, void *), | 2151 | unsigned long hashval, int (*test)(struct inode *, void *), |
2151 | void *data); | 2152 | void *data); |
2152 | extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, | 2153 | extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, |
2153 | int (*test)(struct inode *, void *), void *data); | 2154 | int (*test)(struct inode *, void *), void *data); |
2154 | extern struct inode *ilookup(struct super_block *sb, unsigned long ino); | 2155 | extern struct inode *ilookup(struct super_block *sb, unsigned long ino); |
2155 | 2156 | ||
2156 | extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); | 2157 | extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); |
2157 | extern struct inode * iget_locked(struct super_block *, unsigned long); | 2158 | extern struct inode * iget_locked(struct super_block *, unsigned long); |
2158 | extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); | 2159 | extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); |
2159 | extern int insert_inode_locked(struct inode *); | 2160 | extern int insert_inode_locked(struct inode *); |
2160 | extern void unlock_new_inode(struct inode *); | 2161 | extern void unlock_new_inode(struct inode *); |
2161 | 2162 | ||
2162 | extern void __iget(struct inode * inode); | 2163 | extern void __iget(struct inode * inode); |
2163 | extern void iget_failed(struct inode *); | 2164 | extern void iget_failed(struct inode *); |
2164 | extern void clear_inode(struct inode *); | 2165 | extern void clear_inode(struct inode *); |
2165 | extern void destroy_inode(struct inode *); | 2166 | extern void destroy_inode(struct inode *); |
2166 | extern struct inode *new_inode(struct super_block *); | 2167 | extern struct inode *new_inode(struct super_block *); |
2167 | extern int should_remove_suid(struct dentry *); | 2168 | extern int should_remove_suid(struct dentry *); |
2168 | extern int file_remove_suid(struct file *); | 2169 | extern int file_remove_suid(struct file *); |
2169 | 2170 | ||
2170 | extern void __insert_inode_hash(struct inode *, unsigned long hashval); | 2171 | extern void __insert_inode_hash(struct inode *, unsigned long hashval); |
2171 | extern void remove_inode_hash(struct inode *); | 2172 | extern void remove_inode_hash(struct inode *); |
2172 | static inline void insert_inode_hash(struct inode *inode) { | 2173 | static inline void insert_inode_hash(struct inode *inode) { |
2173 | __insert_inode_hash(inode, inode->i_ino); | 2174 | __insert_inode_hash(inode, inode->i_ino); |
2174 | } | 2175 | } |
2175 | 2176 | ||
2176 | extern struct file * get_empty_filp(void); | 2177 | extern struct file * get_empty_filp(void); |
2177 | extern void file_move(struct file *f, struct list_head *list); | 2178 | extern void file_move(struct file *f, struct list_head *list); |
2178 | extern void file_kill(struct file *f); | 2179 | extern void file_kill(struct file *f); |
2179 | #ifdef CONFIG_BLOCK | 2180 | #ifdef CONFIG_BLOCK |
2180 | struct bio; | 2181 | struct bio; |
2181 | extern void submit_bio(int, struct bio *); | 2182 | extern void submit_bio(int, struct bio *); |
2182 | extern int bdev_read_only(struct block_device *); | 2183 | extern int bdev_read_only(struct block_device *); |
2183 | #endif | 2184 | #endif |
2184 | extern int set_blocksize(struct block_device *, int); | 2185 | extern int set_blocksize(struct block_device *, int); |
2185 | extern int sb_set_blocksize(struct super_block *, int); | 2186 | extern int sb_set_blocksize(struct super_block *, int); |
2186 | extern int sb_min_blocksize(struct super_block *, int); | 2187 | extern int sb_min_blocksize(struct super_block *, int); |
2187 | extern int sb_has_dirty_inodes(struct super_block *); | 2188 | extern int sb_has_dirty_inodes(struct super_block *); |
2188 | 2189 | ||
2189 | extern int generic_file_mmap(struct file *, struct vm_area_struct *); | 2190 | extern int generic_file_mmap(struct file *, struct vm_area_struct *); |
2190 | extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); | 2191 | extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); |
2191 | extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); | 2192 | extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); |
2192 | int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); | 2193 | int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); |
2193 | extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); | 2194 | extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); |
2194 | extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); | 2195 | extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); |
2195 | extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, | 2196 | extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, |
2196 | unsigned long, loff_t); | 2197 | unsigned long, loff_t); |
2197 | extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, | 2198 | extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, |
2198 | unsigned long *, loff_t, loff_t *, size_t, size_t); | 2199 | unsigned long *, loff_t, loff_t *, size_t, size_t); |
2199 | extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, | 2200 | extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, |
2200 | unsigned long, loff_t, loff_t *, size_t, ssize_t); | 2201 | unsigned long, loff_t, loff_t *, size_t, ssize_t); |
2201 | extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); | 2202 | extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); |
2202 | extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); | 2203 | extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); |
2203 | extern int generic_segment_checks(const struct iovec *iov, | 2204 | extern int generic_segment_checks(const struct iovec *iov, |
2204 | unsigned long *nr_segs, size_t *count, int access_flags); | 2205 | unsigned long *nr_segs, size_t *count, int access_flags); |
2205 | 2206 | ||
2206 | /* fs/splice.c */ | 2207 | /* fs/splice.c */ |
2207 | extern ssize_t generic_file_splice_read(struct file *, loff_t *, | 2208 | extern ssize_t generic_file_splice_read(struct file *, loff_t *, |
2208 | struct pipe_inode_info *, size_t, unsigned int); | 2209 | struct pipe_inode_info *, size_t, unsigned int); |
2209 | extern ssize_t default_file_splice_read(struct file *, loff_t *, | 2210 | extern ssize_t default_file_splice_read(struct file *, loff_t *, |
2210 | struct pipe_inode_info *, size_t, unsigned int); | 2211 | struct pipe_inode_info *, size_t, unsigned int); |
2211 | extern ssize_t generic_file_splice_write(struct pipe_inode_info *, | 2212 | extern ssize_t generic_file_splice_write(struct pipe_inode_info *, |
2212 | struct file *, loff_t *, size_t, unsigned int); | 2213 | struct file *, loff_t *, size_t, unsigned int); |
2213 | extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, | 2214 | extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, |
2214 | struct file *out, loff_t *, size_t len, unsigned int flags); | 2215 | struct file *out, loff_t *, size_t len, unsigned int flags); |
2215 | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | 2216 | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, |
2216 | size_t len, unsigned int flags); | 2217 | size_t len, unsigned int flags); |
2217 | 2218 | ||
2218 | extern void | 2219 | extern void |
2219 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); | 2220 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); |
2220 | extern loff_t no_llseek(struct file *file, loff_t offset, int origin); | 2221 | extern loff_t no_llseek(struct file *file, loff_t offset, int origin); |
2221 | extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); | 2222 | extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); |
2222 | extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, | 2223 | extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, |
2223 | int origin); | 2224 | int origin); |
2224 | extern int generic_file_open(struct inode * inode, struct file * filp); | 2225 | extern int generic_file_open(struct inode * inode, struct file * filp); |
2225 | extern int nonseekable_open(struct inode * inode, struct file * filp); | 2226 | extern int nonseekable_open(struct inode * inode, struct file * filp); |
2226 | 2227 | ||
2227 | #ifdef CONFIG_FS_XIP | 2228 | #ifdef CONFIG_FS_XIP |
2228 | extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, | 2229 | extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, |
2229 | loff_t *ppos); | 2230 | loff_t *ppos); |
2230 | extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); | 2231 | extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); |
2231 | extern ssize_t xip_file_write(struct file *filp, const char __user *buf, | 2232 | extern ssize_t xip_file_write(struct file *filp, const char __user *buf, |
2232 | size_t len, loff_t *ppos); | 2233 | size_t len, loff_t *ppos); |
2233 | extern int xip_truncate_page(struct address_space *mapping, loff_t from); | 2234 | extern int xip_truncate_page(struct address_space *mapping, loff_t from); |
2234 | #else | 2235 | #else |
2235 | static inline int xip_truncate_page(struct address_space *mapping, loff_t from) | 2236 | static inline int xip_truncate_page(struct address_space *mapping, loff_t from) |
2236 | { | 2237 | { |
2237 | return 0; | 2238 | return 0; |
2238 | } | 2239 | } |
2239 | #endif | 2240 | #endif |
2240 | 2241 | ||
2241 | #ifdef CONFIG_BLOCK | 2242 | #ifdef CONFIG_BLOCK |
2242 | ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 2243 | ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
2243 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 2244 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
2244 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 2245 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
2245 | int lock_type); | 2246 | int lock_type); |
2246 | 2247 | ||
2247 | enum { | 2248 | enum { |
2248 | DIO_LOCKING = 1, /* need locking between buffered and direct access */ | 2249 | DIO_LOCKING = 1, /* need locking between buffered and direct access */ |
2249 | DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ | 2250 | DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ |
2250 | DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ | 2251 | DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ |
2251 | }; | 2252 | }; |
2252 | 2253 | ||
2253 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, | 2254 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, |
2254 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 2255 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
2255 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 2256 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
2256 | dio_iodone_t end_io) | 2257 | dio_iodone_t end_io) |
2257 | { | 2258 | { |
2258 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2259 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2259 | nr_segs, get_block, end_io, DIO_LOCKING); | 2260 | nr_segs, get_block, end_io, DIO_LOCKING); |
2260 | } | 2261 | } |
2261 | 2262 | ||
2262 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, | 2263 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, |
2263 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 2264 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
2264 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 2265 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
2265 | dio_iodone_t end_io) | 2266 | dio_iodone_t end_io) |
2266 | { | 2267 | { |
2267 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2268 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2268 | nr_segs, get_block, end_io, DIO_NO_LOCKING); | 2269 | nr_segs, get_block, end_io, DIO_NO_LOCKING); |
2269 | } | 2270 | } |
2270 | 2271 | ||
2271 | static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, | 2272 | static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, |
2272 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 2273 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
2273 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 2274 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
2274 | dio_iodone_t end_io) | 2275 | dio_iodone_t end_io) |
2275 | { | 2276 | { |
2276 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2277 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2277 | nr_segs, get_block, end_io, DIO_OWN_LOCKING); | 2278 | nr_segs, get_block, end_io, DIO_OWN_LOCKING); |
2278 | } | 2279 | } |
2279 | #endif | 2280 | #endif |
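The three inline wrappers above only differ in which DIO_* locking mode they pass to __blockdev_direct_IO(). As a minimal sketch of how a filesystem of this era would use the default-locking variant from its ->direct_IO address_space operation (demo_direct_IO and demo_get_block are hypothetical names, not from this diff):

    static ssize_t demo_direct_IO(int rw, struct kiocb *iocb,
                    const struct iovec *iov, loff_t offset,
                    unsigned long nr_segs)
    {
            struct inode *inode = iocb->ki_filp->f_mapping->host;

            /* DIO_LOCKING: the DIO core serializes against buffered I/O */
            return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
                                      iov, offset, nr_segs, demo_get_block,
                                      NULL /* no end_io completion callback */);
    }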
2280 | 2281 | ||
2281 | extern const struct file_operations generic_ro_fops; | 2282 | extern const struct file_operations generic_ro_fops; |
2282 | 2283 | ||
2283 | #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) | 2284 | #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) |
2284 | 2285 | ||
2285 | extern int vfs_readlink(struct dentry *, char __user *, int, const char *); | 2286 | extern int vfs_readlink(struct dentry *, char __user *, int, const char *); |
2286 | extern int vfs_follow_link(struct nameidata *, const char *); | 2287 | extern int vfs_follow_link(struct nameidata *, const char *); |
2287 | extern int page_readlink(struct dentry *, char __user *, int); | 2288 | extern int page_readlink(struct dentry *, char __user *, int); |
2288 | extern void *page_follow_link_light(struct dentry *, struct nameidata *); | 2289 | extern void *page_follow_link_light(struct dentry *, struct nameidata *); |
2289 | extern void page_put_link(struct dentry *, struct nameidata *, void *); | 2290 | extern void page_put_link(struct dentry *, struct nameidata *, void *); |
2290 | extern int __page_symlink(struct inode *inode, const char *symname, int len, | 2291 | extern int __page_symlink(struct inode *inode, const char *symname, int len, |
2291 | int nofs); | 2292 | int nofs); |
2292 | extern int page_symlink(struct inode *inode, const char *symname, int len); | 2293 | extern int page_symlink(struct inode *inode, const char *symname, int len); |
2293 | extern const struct inode_operations page_symlink_inode_operations; | 2294 | extern const struct inode_operations page_symlink_inode_operations; |
2294 | extern int generic_readlink(struct dentry *, char __user *, int); | 2295 | extern int generic_readlink(struct dentry *, char __user *, int); |
2295 | extern void generic_fillattr(struct inode *, struct kstat *); | 2296 | extern void generic_fillattr(struct inode *, struct kstat *); |
2296 | extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 2297 | extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
2297 | void inode_add_bytes(struct inode *inode, loff_t bytes); | 2298 | void inode_add_bytes(struct inode *inode, loff_t bytes); |
2298 | void inode_sub_bytes(struct inode *inode, loff_t bytes); | 2299 | void inode_sub_bytes(struct inode *inode, loff_t bytes); |
2299 | loff_t inode_get_bytes(struct inode *inode); | 2300 | loff_t inode_get_bytes(struct inode *inode); |
2300 | void inode_set_bytes(struct inode *inode, loff_t bytes); | 2301 | void inode_set_bytes(struct inode *inode, loff_t bytes); |
2301 | 2302 | ||
2302 | extern int vfs_readdir(struct file *, filldir_t, void *); | 2303 | extern int vfs_readdir(struct file *, filldir_t, void *); |
2303 | 2304 | ||
2304 | extern int vfs_stat(char __user *, struct kstat *); | 2305 | extern int vfs_stat(char __user *, struct kstat *); |
2305 | extern int vfs_lstat(char __user *, struct kstat *); | 2306 | extern int vfs_lstat(char __user *, struct kstat *); |
2306 | extern int vfs_fstat(unsigned int, struct kstat *); | 2307 | extern int vfs_fstat(unsigned int, struct kstat *); |
2307 | extern int vfs_fstatat(int, char __user *, struct kstat *, int); | 2308 | extern int vfs_fstatat(int, char __user *, struct kstat *, int); |
2308 | 2309 | ||
2309 | extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, | 2310 | extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, |
2310 | unsigned long arg); | 2311 | unsigned long arg); |
2311 | extern int __generic_block_fiemap(struct inode *inode, | 2312 | extern int __generic_block_fiemap(struct inode *inode, |
2312 | struct fiemap_extent_info *fieinfo, u64 start, | 2313 | struct fiemap_extent_info *fieinfo, u64 start, |
2313 | u64 len, get_block_t *get_block); | 2314 | u64 len, get_block_t *get_block); |
2314 | extern int generic_block_fiemap(struct inode *inode, | 2315 | extern int generic_block_fiemap(struct inode *inode, |
2315 | struct fiemap_extent_info *fieinfo, u64 start, | 2316 | struct fiemap_extent_info *fieinfo, u64 start, |
2316 | u64 len, get_block_t *get_block); | 2317 | u64 len, get_block_t *get_block); |
2317 | 2318 | ||
2318 | extern void get_filesystem(struct file_system_type *fs); | 2319 | extern void get_filesystem(struct file_system_type *fs); |
2319 | extern void put_filesystem(struct file_system_type *fs); | 2320 | extern void put_filesystem(struct file_system_type *fs); |
2320 | extern struct file_system_type *get_fs_type(const char *name); | 2321 | extern struct file_system_type *get_fs_type(const char *name); |
2321 | extern struct super_block *get_super(struct block_device *); | 2322 | extern struct super_block *get_super(struct block_device *); |
2322 | extern struct super_block *user_get_super(dev_t); | 2323 | extern struct super_block *user_get_super(dev_t); |
2323 | extern void drop_super(struct super_block *sb); | 2324 | extern void drop_super(struct super_block *sb); |
2324 | 2325 | ||
2325 | extern int dcache_dir_open(struct inode *, struct file *); | 2326 | extern int dcache_dir_open(struct inode *, struct file *); |
2326 | extern int dcache_dir_close(struct inode *, struct file *); | 2327 | extern int dcache_dir_close(struct inode *, struct file *); |
2327 | extern loff_t dcache_dir_lseek(struct file *, loff_t, int); | 2328 | extern loff_t dcache_dir_lseek(struct file *, loff_t, int); |
2328 | extern int dcache_readdir(struct file *, void *, filldir_t); | 2329 | extern int dcache_readdir(struct file *, void *, filldir_t); |
2329 | extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 2330 | extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
2330 | extern int simple_statfs(struct dentry *, struct kstatfs *); | 2331 | extern int simple_statfs(struct dentry *, struct kstatfs *); |
2331 | extern int simple_link(struct dentry *, struct inode *, struct dentry *); | 2332 | extern int simple_link(struct dentry *, struct inode *, struct dentry *); |
2332 | extern int simple_unlink(struct inode *, struct dentry *); | 2333 | extern int simple_unlink(struct inode *, struct dentry *); |
2333 | extern int simple_rmdir(struct inode *, struct dentry *); | 2334 | extern int simple_rmdir(struct inode *, struct dentry *); |
2334 | extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); | 2335 | extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); |
2335 | extern int simple_sync_file(struct file *, struct dentry *, int); | 2336 | extern int simple_sync_file(struct file *, struct dentry *, int); |
2336 | extern int simple_empty(struct dentry *); | 2337 | extern int simple_empty(struct dentry *); |
2337 | extern int simple_readpage(struct file *file, struct page *page); | 2338 | extern int simple_readpage(struct file *file, struct page *page); |
2338 | extern int simple_prepare_write(struct file *file, struct page *page, | 2339 | extern int simple_prepare_write(struct file *file, struct page *page, |
2339 | unsigned offset, unsigned to); | 2340 | unsigned offset, unsigned to); |
2340 | extern int simple_write_begin(struct file *file, struct address_space *mapping, | 2341 | extern int simple_write_begin(struct file *file, struct address_space *mapping, |
2341 | loff_t pos, unsigned len, unsigned flags, | 2342 | loff_t pos, unsigned len, unsigned flags, |
2342 | struct page **pagep, void **fsdata); | 2343 | struct page **pagep, void **fsdata); |
2343 | extern int simple_write_end(struct file *file, struct address_space *mapping, | 2344 | extern int simple_write_end(struct file *file, struct address_space *mapping, |
2344 | loff_t pos, unsigned len, unsigned copied, | 2345 | loff_t pos, unsigned len, unsigned copied, |
2345 | struct page *page, void *fsdata); | 2346 | struct page *page, void *fsdata); |
2346 | 2347 | ||
2347 | extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); | 2348 | extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); |
2348 | extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); | 2349 | extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); |
2349 | extern const struct file_operations simple_dir_operations; | 2350 | extern const struct file_operations simple_dir_operations; |
2350 | extern const struct inode_operations simple_dir_inode_operations; | 2351 | extern const struct inode_operations simple_dir_inode_operations; |
2351 | struct tree_descr { char *name; const struct file_operations *ops; int mode; }; | 2352 | struct tree_descr { char *name; const struct file_operations *ops; int mode; }; |
2352 | struct dentry *d_alloc_name(struct dentry *, const char *); | 2353 | struct dentry *d_alloc_name(struct dentry *, const char *); |
2353 | extern int simple_fill_super(struct super_block *, int, struct tree_descr *); | 2354 | extern int simple_fill_super(struct super_block *, int, struct tree_descr *); |
2354 | extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); | 2355 | extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); |
2355 | extern void simple_release_fs(struct vfsmount **mount, int *count); | 2356 | extern void simple_release_fs(struct vfsmount **mount, int *count); |
2356 | 2357 | ||
2357 | extern ssize_t simple_read_from_buffer(void __user *to, size_t count, | 2358 | extern ssize_t simple_read_from_buffer(void __user *to, size_t count, |
2358 | loff_t *ppos, const void *from, size_t available); | 2359 | loff_t *ppos, const void *from, size_t available); |
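A minimal sketch of the usual calling pattern for simple_read_from_buffer(): back a read handler with a fixed kernel buffer and let the helper handle the offset and copy_to_user bookkeeping (demo_read and msg are hypothetical):

    static ssize_t demo_read(struct file *file, char __user *buf,
                             size_t count, loff_t *ppos)
    {
            static const char msg[] = "hello\n";

            /* copies at most count bytes starting at *ppos, advances *ppos */
            return simple_read_from_buffer(buf, count, ppos,
                                           msg, sizeof(msg) - 1);
    }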
2359 | 2360 | ||
2360 | extern int simple_fsync(struct file *, struct dentry *, int); | 2361 | extern int simple_fsync(struct file *, struct dentry *, int); |
2361 | 2362 | ||
2362 | #ifdef CONFIG_MIGRATION | 2363 | #ifdef CONFIG_MIGRATION |
2363 | extern int buffer_migrate_page(struct address_space *, | 2364 | extern int buffer_migrate_page(struct address_space *, |
2364 | struct page *, struct page *); | 2365 | struct page *, struct page *); |
2365 | #else | 2366 | #else |
2366 | #define buffer_migrate_page NULL | 2367 | #define buffer_migrate_page NULL |
2367 | #endif | 2368 | #endif |
2368 | 2369 | ||
2369 | extern int inode_change_ok(struct inode *, struct iattr *); | 2370 | extern int inode_change_ok(struct inode *, struct iattr *); |
2370 | extern int __must_check inode_setattr(struct inode *, struct iattr *); | 2371 | extern int __must_check inode_setattr(struct inode *, struct iattr *); |
2371 | 2372 | ||
2372 | extern void file_update_time(struct file *file); | 2373 | extern void file_update_time(struct file *file); |
2373 | 2374 | ||
2374 | extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt); | 2375 | extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt); |
2375 | extern void save_mount_options(struct super_block *sb, char *options); | 2376 | extern void save_mount_options(struct super_block *sb, char *options); |
2376 | extern void replace_mount_options(struct super_block *sb, char *options); | 2377 | extern void replace_mount_options(struct super_block *sb, char *options); |
2377 | 2378 | ||
2378 | static inline ino_t parent_ino(struct dentry *dentry) | 2379 | static inline ino_t parent_ino(struct dentry *dentry) |
2379 | { | 2380 | { |
2380 | ino_t res; | 2381 | ino_t res; |
2381 | 2382 | ||
2382 | spin_lock(&dentry->d_lock); | 2383 | spin_lock(&dentry->d_lock); |
2383 | res = dentry->d_parent->d_inode->i_ino; | 2384 | res = dentry->d_parent->d_inode->i_ino; |
2384 | spin_unlock(&dentry->d_lock); | 2385 | spin_unlock(&dentry->d_lock); |
2385 | return res; | 2386 | return res; |
2386 | } | 2387 | } |
2387 | 2388 | ||
2388 | /* Transaction based IO helpers */ | 2389 | /* Transaction based IO helpers */ |
2389 | 2390 | ||
2390 | /* | 2391 | /* |
2391 | * An argresp is stored in an allocated page and holds the | 2392 | * An argresp is stored in an allocated page and holds the |
2392 | * size of the argument or response, along with its content | 2393 | * size of the argument or response, along with its content |
2393 | */ | 2394 | */ |
2394 | struct simple_transaction_argresp { | 2395 | struct simple_transaction_argresp { |
2395 | ssize_t size; | 2396 | ssize_t size; |
2396 | char data[0]; | 2397 | char data[0]; |
2397 | }; | 2398 | }; |
2398 | 2399 | ||
2399 | #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) | 2400 | #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) |
2400 | 2401 | ||
2401 | char *simple_transaction_get(struct file *file, const char __user *buf, | 2402 | char *simple_transaction_get(struct file *file, const char __user *buf, |
2402 | size_t size); | 2403 | size_t size); |
2403 | ssize_t simple_transaction_read(struct file *file, char __user *buf, | 2404 | ssize_t simple_transaction_read(struct file *file, char __user *buf, |
2404 | size_t size, loff_t *pos); | 2405 | size_t size, loff_t *pos); |
2405 | int simple_transaction_release(struct inode *inode, struct file *file); | 2406 | int simple_transaction_release(struct inode *inode, struct file *file); |
2406 | 2407 | ||
2407 | void simple_transaction_set(struct file *file, size_t n); | 2408 | void simple_transaction_set(struct file *file, size_t n); |
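A hedged sketch of the intended transaction pattern (demo_write and the request handling are hypothetical): a write handler grabs the per-open transaction page, overwrites it with a reply, and records the reply length for a later simple_transaction_read():

    static ssize_t demo_write(struct file *file, const char __user *buf,
                              size_t size, loff_t *pos)
    {
            /* copies the user data into the per-open transaction page */
            char *data = simple_transaction_get(file, buf, size);

            if (IS_ERR(data))
                    return PTR_ERR(data);

            /* ... parse the request in data, overwrite it with a reply ... */
            simple_transaction_set(file, size);     /* length of the reply */
            return size;
    }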
2408 | 2409 | ||
2409 | /* | 2410 | /* |
2410 | * simple attribute files | 2411 | * simple attribute files |
2411 | * | 2412 | * |
2412 | * These attributes behave similarly to those in sysfs: | 2413 | * These attributes behave similarly to those in sysfs: |
2413 | * | 2414 | * |
2414 | * Writing to an attribute immediately sets a value; an open file can be | 2415 | * Writing to an attribute immediately sets a value; an open file can be |
2415 | * written to multiple times. | 2416 | * written to multiple times. |
2416 | * | 2417 | * |
2417 | * Reading from an attribute creates a buffer from the value that might get | 2418 | * Reading from an attribute creates a buffer from the value that might get |
2418 | * read with multiple read calls. When the attribute has been read | 2419 | * read with multiple read calls. When the attribute has been read |
2419 | * completely, no further read calls are possible until the file is opened | 2420 | * completely, no further read calls are possible until the file is opened |
2420 | * again. | 2421 | * again. |
2421 | * | 2422 | * |
2422 | * All attributes contain a text representation of a numeric value | 2423 | * All attributes contain a text representation of a numeric value |
2423 | * that is accessed with the get() and set() functions. | 2424 | * that is accessed with the get() and set() functions. |
2424 | */ | 2425 | */ |
2425 | #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ | 2426 | #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ |
2426 | static int __fops ## _open(struct inode *inode, struct file *file) \ | 2427 | static int __fops ## _open(struct inode *inode, struct file *file) \ |
2427 | { \ | 2428 | { \ |
2428 | __simple_attr_check_format(__fmt, 0ull); \ | 2429 | __simple_attr_check_format(__fmt, 0ull); \ |
2429 | return simple_attr_open(inode, file, __get, __set, __fmt); \ | 2430 | return simple_attr_open(inode, file, __get, __set, __fmt); \ |
2430 | } \ | 2431 | } \ |
2431 | static struct file_operations __fops = { \ | 2432 | static struct file_operations __fops = { \ |
2432 | .owner = THIS_MODULE, \ | 2433 | .owner = THIS_MODULE, \ |
2433 | .open = __fops ## _open, \ | 2434 | .open = __fops ## _open, \ |
2434 | .release = simple_attr_release, \ | 2435 | .release = simple_attr_release, \ |
2435 | .read = simple_attr_read, \ | 2436 | .read = simple_attr_read, \ |
2436 | .write = simple_attr_write, \ | 2437 | .write = simple_attr_write, \ |
2437 | }; | 2438 | }; |
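A minimal usage sketch of the macro above (demo_value, demo_get, demo_set, and demo_fops are hypothetical names): expose one u64 in the "%llu\n" text format the format-check expects:

    static u64 demo_value;

    static int demo_get(void *data, u64 *val)
    {
            *val = demo_value;
            return 0;
    }

    static int demo_set(void *data, u64 val)
    {
            demo_value = val;
            return 0;
    }

    /* generates demo_fops_open() and the demo_fops file_operations */
    DEFINE_SIMPLE_ATTRIBUTE(demo_fops, demo_get, demo_set, "%llu\n");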
2438 | 2439 | ||
2439 | static inline void __attribute__((format(printf, 1, 2))) | 2440 | static inline void __attribute__((format(printf, 1, 2))) |
2440 | __simple_attr_check_format(const char *fmt, ...) | 2441 | __simple_attr_check_format(const char *fmt, ...) |
2441 | { | 2442 | { |
2442 | /* don't do anything, just let the compiler check the arguments */ | 2443 | /* don't do anything, just let the compiler check the arguments */ |
2443 | } | 2444 | } |
2444 | 2445 | ||
2445 | int simple_attr_open(struct inode *inode, struct file *file, | 2446 | int simple_attr_open(struct inode *inode, struct file *file, |
2446 | int (*get)(void *, u64 *), int (*set)(void *, u64), | 2447 | int (*get)(void *, u64 *), int (*set)(void *, u64), |
2447 | const char *fmt); | 2448 | const char *fmt); |
2448 | int simple_attr_release(struct inode *inode, struct file *file); | 2449 | int simple_attr_release(struct inode *inode, struct file *file); |
2449 | ssize_t simple_attr_read(struct file *file, char __user *buf, | 2450 | ssize_t simple_attr_read(struct file *file, char __user *buf, |
2450 | size_t len, loff_t *ppos); | 2451 | size_t len, loff_t *ppos); |
2451 | ssize_t simple_attr_write(struct file *file, const char __user *buf, | 2452 | ssize_t simple_attr_write(struct file *file, const char __user *buf, |
2452 | size_t len, loff_t *ppos); | 2453 | size_t len, loff_t *ppos); |
2453 | 2454 | ||
2454 | struct ctl_table; | 2455 | struct ctl_table; |
2455 | int proc_nr_files(struct ctl_table *table, int write, struct file *filp, | 2456 | int proc_nr_files(struct ctl_table *table, int write, struct file *filp, |
2456 | void __user *buffer, size_t *lenp, loff_t *ppos); | 2457 | void __user *buffer, size_t *lenp, loff_t *ppos); |
2457 | 2458 | ||
2458 | int __init get_filesystem_list(char *buf); | 2459 | int __init get_filesystem_list(char *buf); |
2459 | 2460 | ||
2460 | #endif /* __KERNEL__ */ | 2461 | #endif /* __KERNEL__ */ |
2461 | #endif /* _LINUX_FS_H */ | 2462 | #endif /* _LINUX_FS_H */ |
2462 | 2463 |
mm/swapfile.c
1 | /* | 1 | /* |
2 | * linux/mm/swapfile.c | 2 | * linux/mm/swapfile.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
5 | * Swap reorganised 29.12.95, Stephen Tweedie | 5 | * Swap reorganised 29.12.95, Stephen Tweedie |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/hugetlb.h> | 9 | #include <linux/hugetlb.h> |
10 | #include <linux/mman.h> | 10 | #include <linux/mman.h> |
11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
12 | #include <linux/kernel_stat.h> | 12 | #include <linux/kernel_stat.h> |
13 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
14 | #include <linux/vmalloc.h> | 14 | #include <linux/vmalloc.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/namei.h> | 16 | #include <linux/namei.h> |
17 | #include <linux/shm.h> | 17 | #include <linux/shm.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/random.h> | 19 | #include <linux/random.h> |
20 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
21 | #include <linux/proc_fs.h> | 21 | #include <linux/proc_fs.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
26 | #include <linux/security.h> | 26 | #include <linux/security.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/mutex.h> | 28 | #include <linux/mutex.h> |
29 | #include <linux/capability.h> | 29 | #include <linux/capability.h> |
30 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | 32 | ||
33 | #include <asm/pgtable.h> | 33 | #include <asm/pgtable.h> |
34 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
35 | #include <linux/swapops.h> | 35 | #include <linux/swapops.h> |
36 | #include <linux/page_cgroup.h> | 36 | #include <linux/page_cgroup.h> |
37 | 37 | ||
38 | static DEFINE_SPINLOCK(swap_lock); | 38 | static DEFINE_SPINLOCK(swap_lock); |
39 | static unsigned int nr_swapfiles; | 39 | static unsigned int nr_swapfiles; |
40 | long nr_swap_pages; | 40 | long nr_swap_pages; |
41 | long total_swap_pages; | 41 | long total_swap_pages; |
42 | static int swap_overflow; | 42 | static int swap_overflow; |
43 | static int least_priority; | 43 | static int least_priority; |
44 | 44 | ||
45 | static const char Bad_file[] = "Bad swap file entry "; | 45 | static const char Bad_file[] = "Bad swap file entry "; |
46 | static const char Unused_file[] = "Unused swap file entry "; | 46 | static const char Unused_file[] = "Unused swap file entry "; |
47 | static const char Bad_offset[] = "Bad swap offset entry "; | 47 | static const char Bad_offset[] = "Bad swap offset entry "; |
48 | static const char Unused_offset[] = "Unused swap offset entry "; | 48 | static const char Unused_offset[] = "Unused swap offset entry "; |
49 | 49 | ||
50 | static struct swap_list_t swap_list = {-1, -1}; | 50 | static struct swap_list_t swap_list = {-1, -1}; |
51 | 51 | ||
52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; |
53 | 53 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 54 | static DEFINE_MUTEX(swapon_mutex); |
55 | 55 | ||
56 | /* For reference count accounting in swap_map */ | 56 | /* For reference count accounting in swap_map */ |
57 | /* enum for swap_map[] handling. internal use only */ | 57 | /* enum for swap_map[] handling. internal use only */ |
58 | enum { | 58 | enum { |
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | 59 | SWAP_MAP = 0, /* ops for reference from swap users */ |
60 | SWAP_CACHE, /* ops for reference from swap cache */ | 60 | SWAP_CACHE, /* ops for reference from swap cache */ |
61 | }; | 61 | }; |
62 | 62 | ||
63 | static inline int swap_count(unsigned short ent) | 63 | static inline int swap_count(unsigned short ent) |
64 | { | 64 | { |
65 | return ent & SWAP_COUNT_MASK; | 65 | return ent & SWAP_COUNT_MASK; |
66 | } | 66 | } |
67 | 67 | ||
68 | static inline bool swap_has_cache(unsigned short ent) | 68 | static inline bool swap_has_cache(unsigned short ent) |
69 | { | 69 | { |
70 | return !!(ent & SWAP_HAS_CACHE); | 70 | return !!(ent & SWAP_HAS_CACHE); |
71 | } | 71 | } |
72 | 72 | ||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | 73 | static inline unsigned short encode_swapmap(int count, bool has_cache) |
74 | { | 74 | { |
75 | unsigned short ret = count; | 75 | unsigned short ret = count; |
76 | 76 | ||
77 | if (has_cache) | 77 | if (has_cache) |
78 | return SWAP_HAS_CACHE | ret; | 78 | return SWAP_HAS_CACHE | ret; |
79 | return ret; | 79 | return ret; |
80 | } | 80 | } |
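The three helpers above pack a reference count and the SWAP_HAS_CACHE flag into one unsigned short swap_map entry. An illustrative round trip (not from the source):

    /* count of 3 with the page present in the swap cache */
    unsigned short ent = encode_swapmap(3, true);

    WARN_ON(swap_count(ent) != 3);  /* low SWAP_COUNT_MASK bits: map count */
    WARN_ON(!swap_has_cache(ent));  /* high bit: SWAP_HAS_CACHE is set */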
81 | 81 | ||
82 | /* returns 1 if the swap entry is freed */ | 82 | /* returns 1 if the swap entry is freed */ |
83 | static int | 83 | static int |
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | 84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
85 | { | 85 | { |
86 | int type = si - swap_info; | 86 | int type = si - swap_info; |
87 | swp_entry_t entry = swp_entry(type, offset); | 87 | swp_entry_t entry = swp_entry(type, offset); |
88 | struct page *page; | 88 | struct page *page; |
89 | int ret = 0; | 89 | int ret = 0; |
90 | 90 | ||
91 | page = find_get_page(&swapper_space, entry.val); | 91 | page = find_get_page(&swapper_space, entry.val); |
92 | if (!page) | 92 | if (!page) |
93 | return 0; | 93 | return 0; |
94 | /* | 94 | /* |
95 | * This function is called from scan_swap_map(), which is called | 95 | * This function is called from scan_swap_map(), which is called |
96 | * by vmscan.c when reclaiming pages. So we hold a lock on a page here. | 96 | * by vmscan.c when reclaiming pages. So we hold a lock on a page here. |
97 | * We have to use trylock to avoid deadlock. This is a special | 97 | * We have to use trylock to avoid deadlock. This is a special |
98 | * case and you should use try_to_free_swap() with explicit lock_page() | 98 | * case and you should use try_to_free_swap() with explicit lock_page() |
99 | * in usual operations. | 99 | * in usual operations. |
100 | */ | 100 | */ |
101 | if (trylock_page(page)) { | 101 | if (trylock_page(page)) { |
102 | ret = try_to_free_swap(page); | 102 | ret = try_to_free_swap(page); |
103 | unlock_page(page); | 103 | unlock_page(page); |
104 | } | 104 | } |
105 | page_cache_release(page); | 105 | page_cache_release(page); |
106 | return ret; | 106 | return ret; |
107 | } | 107 | } |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * We need this because the bdev->unplug_fn can sleep and we cannot | 110 | * We need this because the bdev->unplug_fn can sleep and we cannot |
111 | * hold swap_lock while calling the unplug_fn. And swap_lock | 111 | * hold swap_lock while calling the unplug_fn. And swap_lock |
112 | * cannot be turned into a mutex. | 112 | * cannot be turned into a mutex. |
113 | */ | 113 | */ |
114 | static DECLARE_RWSEM(swap_unplug_sem); | 114 | static DECLARE_RWSEM(swap_unplug_sem); |
115 | 115 | ||
116 | void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | 116 | void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) |
117 | { | 117 | { |
118 | swp_entry_t entry; | 118 | swp_entry_t entry; |
119 | 119 | ||
120 | down_read(&swap_unplug_sem); | 120 | down_read(&swap_unplug_sem); |
121 | entry.val = page_private(page); | 121 | entry.val = page_private(page); |
122 | if (PageSwapCache(page)) { | 122 | if (PageSwapCache(page)) { |
123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; |
124 | struct backing_dev_info *bdi; | 124 | struct backing_dev_info *bdi; |
125 | 125 | ||
126 | /* | 126 | /* |
127 | * If the page is removed from swapcache from under us (with a | 127 | * If the page is removed from swapcache from under us (with a |
128 | * racy try_to_unuse/swapoff) we need an additional reference | 128 | * racy try_to_unuse/swapoff) we need an additional reference |
129 | * count to avoid reading garbage from page_private(page) above. | 129 | * count to avoid reading garbage from page_private(page) above. |
130 | * If the WARN_ON triggers during a swapoff it may be this race | 130 | * If the WARN_ON triggers during a swapoff it may be this race |
131 | * condition and is harmless. However, if it triggers without | 131 | * condition and is harmless. However, if it triggers without |
132 | * swapoff it signals a problem. | 132 | * swapoff it signals a problem. |
133 | */ | 133 | */ |
134 | WARN_ON(page_count(page) <= 1); | 134 | WARN_ON(page_count(page) <= 1); |
135 | 135 | ||
136 | bdi = bdev->bd_inode->i_mapping->backing_dev_info; | 136 | bdi = bdev->bd_inode->i_mapping->backing_dev_info; |
137 | blk_run_backing_dev(bdi, page); | 137 | blk_run_backing_dev(bdi, page); |
138 | } | 138 | } |
139 | up_read(&swap_unplug_sem); | 139 | up_read(&swap_unplug_sem); |
140 | } | 140 | } |
141 | 141 | ||
142 | /* | 142 | /* |
143 | * swapon tells the device that all the old swap contents can be discarded, | 143 | * swapon tells the device that all the old swap contents can be discarded, |
144 | * to allow the swap device to optimize its wear-levelling. | 144 | * to allow the swap device to optimize its wear-levelling. |
145 | */ | 145 | */ |
146 | static int discard_swap(struct swap_info_struct *si) | 146 | static int discard_swap(struct swap_info_struct *si) |
147 | { | 147 | { |
148 | struct swap_extent *se; | 148 | struct swap_extent *se; |
149 | int err = 0; | 149 | int err = 0; |
150 | 150 | ||
151 | list_for_each_entry(se, &si->extent_list, list) { | 151 | list_for_each_entry(se, &si->extent_list, list) { |
152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | 152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); |
153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
154 | 154 | ||
155 | if (se->start_page == 0) { | 155 | if (se->start_page == 0) { |
156 | /* Do not discard the swap header page! */ | 156 | /* Do not discard the swap header page! */ |
157 | start_block += 1 << (PAGE_SHIFT - 9); | 157 | start_block += 1 << (PAGE_SHIFT - 9); |
158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | 158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); |
159 | if (!nr_blocks) | 159 | if (!nr_blocks) |
160 | continue; | 160 | continue; |
161 | } | 161 | } |
162 | 162 | ||
163 | err = blkdev_issue_discard(si->bdev, start_block, | 163 | err = blkdev_issue_discard(si->bdev, start_block, |
164 | nr_blocks, GFP_KERNEL); | 164 | nr_blocks, GFP_KERNEL); |
165 | if (err) | 165 | if (err) |
166 | break; | 166 | break; |
167 | 167 | ||
168 | cond_resched(); | 168 | cond_resched(); |
169 | } | 169 | } |
170 | return err; /* That will often be -EOPNOTSUPP */ | 170 | return err; /* That will often be -EOPNOTSUPP */ |
171 | } | 171 | } |
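For orientation, the << (PAGE_SHIFT - 9) shifts above convert page numbers into 512-byte sector numbers before calling blkdev_issue_discard(). An illustrative calculation, assuming 4 KiB pages (PAGE_SHIFT == 12):

    /* one 4 KiB page spans 1 << (12 - 9) == 8 sectors of 512 bytes */
    sector_t start_block = (sector_t)5 << (PAGE_SHIFT - 9);   /* page 5 -> sector 40 */
    sector_t nr_blocks = (sector_t)100 << (PAGE_SHIFT - 9);   /* 100 pages -> 800 sectors */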
172 | 172 | ||
173 | /* | 173 | /* |
174 | * swap allocation tells the device that a cluster of swap can now be discarded, | 174 | * swap allocation tells the device that a cluster of swap can now be discarded, |
175 | * to allow the swap device to optimize its wear-levelling. | 175 | * to allow the swap device to optimize its wear-levelling. |
176 | */ | 176 | */ |
177 | static void discard_swap_cluster(struct swap_info_struct *si, | 177 | static void discard_swap_cluster(struct swap_info_struct *si, |
178 | pgoff_t start_page, pgoff_t nr_pages) | 178 | pgoff_t start_page, pgoff_t nr_pages) |
179 | { | 179 | { |
180 | struct swap_extent *se = si->curr_swap_extent; | 180 | struct swap_extent *se = si->curr_swap_extent; |
181 | int found_extent = 0; | 181 | int found_extent = 0; |
182 | 182 | ||
183 | while (nr_pages) { | 183 | while (nr_pages) { |
184 | struct list_head *lh; | 184 | struct list_head *lh; |
185 | 185 | ||
186 | if (se->start_page <= start_page && | 186 | if (se->start_page <= start_page && |
187 | start_page < se->start_page + se->nr_pages) { | 187 | start_page < se->start_page + se->nr_pages) { |
188 | pgoff_t offset = start_page - se->start_page; | 188 | pgoff_t offset = start_page - se->start_page; |
189 | sector_t start_block = se->start_block + offset; | 189 | sector_t start_block = se->start_block + offset; |
190 | sector_t nr_blocks = se->nr_pages - offset; | 190 | sector_t nr_blocks = se->nr_pages - offset; |
191 | 191 | ||
192 | if (nr_blocks > nr_pages) | 192 | if (nr_blocks > nr_pages) |
193 | nr_blocks = nr_pages; | 193 | nr_blocks = nr_pages; |
194 | start_page += nr_blocks; | 194 | start_page += nr_blocks; |
195 | nr_pages -= nr_blocks; | 195 | nr_pages -= nr_blocks; |
196 | 196 | ||
197 | if (!found_extent++) | 197 | if (!found_extent++) |
198 | si->curr_swap_extent = se; | 198 | si->curr_swap_extent = se; |
199 | 199 | ||
200 | start_block <<= PAGE_SHIFT - 9; | 200 | start_block <<= PAGE_SHIFT - 9; |
201 | nr_blocks <<= PAGE_SHIFT - 9; | 201 | nr_blocks <<= PAGE_SHIFT - 9; |
202 | if (blkdev_issue_discard(si->bdev, start_block, | 202 | if (blkdev_issue_discard(si->bdev, start_block, |
203 | nr_blocks, GFP_NOIO)) | 203 | nr_blocks, GFP_NOIO)) |
204 | break; | 204 | break; |
205 | } | 205 | } |
206 | 206 | ||
207 | lh = se->list.next; | 207 | lh = se->list.next; |
208 | if (lh == &si->extent_list) | 208 | if (lh == &si->extent_list) |
209 | lh = lh->next; | 209 | lh = lh->next; |
210 | se = list_entry(lh, struct swap_extent, list); | 210 | se = list_entry(lh, struct swap_extent, list); |
211 | } | 211 | } |
212 | } | 212 | } |
213 | 213 | ||
214 | static int wait_for_discard(void *word) | 214 | static int wait_for_discard(void *word) |
215 | { | 215 | { |
216 | schedule(); | 216 | schedule(); |
217 | return 0; | 217 | return 0; |
218 | } | 218 | } |
219 | 219 | ||
220 | #define SWAPFILE_CLUSTER 256 | 220 | #define SWAPFILE_CLUSTER 256 |
221 | #define LATENCY_LIMIT 256 | 221 | #define LATENCY_LIMIT 256 |
222 | 222 | ||
223 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, | 223 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
224 | int cache) | 224 | int cache) |
225 | { | 225 | { |
226 | unsigned long offset; | 226 | unsigned long offset; |
227 | unsigned long scan_base; | 227 | unsigned long scan_base; |
228 | unsigned long last_in_cluster = 0; | 228 | unsigned long last_in_cluster = 0; |
229 | int latency_ration = LATENCY_LIMIT; | 229 | int latency_ration = LATENCY_LIMIT; |
230 | int found_free_cluster = 0; | 230 | int found_free_cluster = 0; |
231 | 231 | ||
232 | /* | 232 | /* |
233 | * We try to cluster swap pages by allocating them sequentially | 233 | * We try to cluster swap pages by allocating them sequentially |
234 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this | 234 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this |
235 | * way, however, we resort to first-free allocation, starting | 235 | * way, however, we resort to first-free allocation, starting |
236 | * a new cluster. This prevents us from scattering swap pages | 236 | * a new cluster. This prevents us from scattering swap pages |
237 | * all over the entire swap partition, so that we reduce | 237 | * all over the entire swap partition, so that we reduce |
238 | * overall disk seek times between swap pages. -- sct | 238 | * overall disk seek times between swap pages. -- sct |
239 | * But we do now try to find an empty cluster. -Andrea | 239 | * But we do now try to find an empty cluster. -Andrea |
240 | * And we let swap pages go all over an SSD partition. Hugh | 240 | * And we let swap pages go all over an SSD partition. Hugh |
241 | */ | 241 | */ |
242 | 242 | ||
243 | si->flags += SWP_SCANNING; | 243 | si->flags += SWP_SCANNING; |
244 | scan_base = offset = si->cluster_next; | 244 | scan_base = offset = si->cluster_next; |
245 | 245 | ||
246 | if (unlikely(!si->cluster_nr--)) { | 246 | if (unlikely(!si->cluster_nr--)) { |
247 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { | 247 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
248 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 248 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
249 | goto checks; | 249 | goto checks; |
250 | } | 250 | } |
251 | if (si->flags & SWP_DISCARDABLE) { | 251 | if (si->flags & SWP_DISCARDABLE) { |
252 | /* | 252 | /* |
253 | * Start range check on racing allocations, in case | 253 | * Start range check on racing allocations, in case |
254 | * they overlap the cluster we eventually decide on | 254 | * they overlap the cluster we eventually decide on |
255 | * (we scan without swap_lock to allow preemption). | 255 | * (we scan without swap_lock to allow preemption). |
256 | * It's hardly conceivable that cluster_nr could be | 256 | * It's hardly conceivable that cluster_nr could be |
257 | * wrapped during our scan, but don't depend on it. | 257 | * wrapped during our scan, but don't depend on it. |
258 | */ | 258 | */ |
259 | if (si->lowest_alloc) | 259 | if (si->lowest_alloc) |
260 | goto checks; | 260 | goto checks; |
261 | si->lowest_alloc = si->max; | 261 | si->lowest_alloc = si->max; |
262 | si->highest_alloc = 0; | 262 | si->highest_alloc = 0; |
263 | } | 263 | } |
264 | spin_unlock(&swap_lock); | 264 | spin_unlock(&swap_lock); |
265 | 265 | ||
266 | /* | 266 | /* |
267 | * If seek is expensive, start searching for new cluster from | 267 | * If seek is expensive, start searching for new cluster from |
268 | * start of partition, to minimize the span of allocated swap. | 268 | * start of partition, to minimize the span of allocated swap. |
269 | * But if seek is cheap, search from our current position, so | 269 | * But if seek is cheap, search from our current position, so |
270 | * that swap is allocated from all over the partition: if the | 270 | * that swap is allocated from all over the partition: if the |
271 | * Flash Translation Layer only remaps within limited zones, | 271 | * Flash Translation Layer only remaps within limited zones, |
272 | * we don't want to wear out the first zone too quickly. | 272 | * we don't want to wear out the first zone too quickly. |
273 | */ | 273 | */ |
274 | if (!(si->flags & SWP_SOLIDSTATE)) | 274 | if (!(si->flags & SWP_SOLIDSTATE)) |
275 | scan_base = offset = si->lowest_bit; | 275 | scan_base = offset = si->lowest_bit; |
276 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 276 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
277 | 277 | ||
278 | /* Locate the first empty (unaligned) cluster */ | 278 | /* Locate the first empty (unaligned) cluster */ |
279 | for (; last_in_cluster <= si->highest_bit; offset++) { | 279 | for (; last_in_cluster <= si->highest_bit; offset++) { |
280 | if (si->swap_map[offset]) | 280 | if (si->swap_map[offset]) |
281 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 281 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
282 | else if (offset == last_in_cluster) { | 282 | else if (offset == last_in_cluster) { |
283 | spin_lock(&swap_lock); | 283 | spin_lock(&swap_lock); |
284 | offset -= SWAPFILE_CLUSTER - 1; | 284 | offset -= SWAPFILE_CLUSTER - 1; |
285 | si->cluster_next = offset; | 285 | si->cluster_next = offset; |
286 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 286 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
287 | found_free_cluster = 1; | 287 | found_free_cluster = 1; |
288 | goto checks; | 288 | goto checks; |
289 | } | 289 | } |
290 | if (unlikely(--latency_ration < 0)) { | 290 | if (unlikely(--latency_ration < 0)) { |
291 | cond_resched(); | 291 | cond_resched(); |
292 | latency_ration = LATENCY_LIMIT; | 292 | latency_ration = LATENCY_LIMIT; |
293 | } | 293 | } |
294 | } | 294 | } |
295 | 295 | ||
296 | offset = si->lowest_bit; | 296 | offset = si->lowest_bit; |
297 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 297 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
298 | 298 | ||
299 | /* Locate the first empty (unaligned) cluster */ | 299 | /* Locate the first empty (unaligned) cluster */ |
300 | for (; last_in_cluster < scan_base; offset++) { | 300 | for (; last_in_cluster < scan_base; offset++) { |
301 | if (si->swap_map[offset]) | 301 | if (si->swap_map[offset]) |
302 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 302 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
303 | else if (offset == last_in_cluster) { | 303 | else if (offset == last_in_cluster) { |
304 | spin_lock(&swap_lock); | 304 | spin_lock(&swap_lock); |
305 | offset -= SWAPFILE_CLUSTER - 1; | 305 | offset -= SWAPFILE_CLUSTER - 1; |
306 | si->cluster_next = offset; | 306 | si->cluster_next = offset; |
307 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 307 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
308 | found_free_cluster = 1; | 308 | found_free_cluster = 1; |
309 | goto checks; | 309 | goto checks; |
310 | } | 310 | } |
311 | if (unlikely(--latency_ration < 0)) { | 311 | if (unlikely(--latency_ration < 0)) { |
312 | cond_resched(); | 312 | cond_resched(); |
313 | latency_ration = LATENCY_LIMIT; | 313 | latency_ration = LATENCY_LIMIT; |
314 | } | 314 | } |
315 | } | 315 | } |
316 | 316 | ||
317 | offset = scan_base; | 317 | offset = scan_base; |
318 | spin_lock(&swap_lock); | 318 | spin_lock(&swap_lock); |
319 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 319 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
320 | si->lowest_alloc = 0; | 320 | si->lowest_alloc = 0; |
321 | } | 321 | } |
322 | 322 | ||
323 | checks: | 323 | checks: |
324 | if (!(si->flags & SWP_WRITEOK)) | 324 | if (!(si->flags & SWP_WRITEOK)) |
325 | goto no_page; | 325 | goto no_page; |
326 | if (!si->highest_bit) | 326 | if (!si->highest_bit) |
327 | goto no_page; | 327 | goto no_page; |
328 | if (offset > si->highest_bit) | 328 | if (offset > si->highest_bit) |
329 | scan_base = offset = si->lowest_bit; | 329 | scan_base = offset = si->lowest_bit; |
330 | 330 | ||
331 | /* reuse swap entry of cache-only swap if not busy. */ | 331 | /* reuse swap entry of cache-only swap if not busy. */ |
332 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 332 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
333 | int swap_was_freed; | 333 | int swap_was_freed; |
334 | spin_unlock(&swap_lock); | 334 | spin_unlock(&swap_lock); |
335 | swap_was_freed = __try_to_reclaim_swap(si, offset); | 335 | swap_was_freed = __try_to_reclaim_swap(si, offset); |
336 | spin_lock(&swap_lock); | 336 | spin_lock(&swap_lock); |
337 | /* entry was freed successfully, try to use this again */ | 337 | /* entry was freed successfully, try to use this again */ |
338 | if (swap_was_freed) | 338 | if (swap_was_freed) |
339 | goto checks; | 339 | goto checks; |
340 | goto scan; /* check next one */ | 340 | goto scan; /* check next one */ |
341 | } | 341 | } |
342 | 342 | ||
343 | if (si->swap_map[offset]) | 343 | if (si->swap_map[offset]) |
344 | goto scan; | 344 | goto scan; |
345 | 345 | ||
346 | if (offset == si->lowest_bit) | 346 | if (offset == si->lowest_bit) |
347 | si->lowest_bit++; | 347 | si->lowest_bit++; |
348 | if (offset == si->highest_bit) | 348 | if (offset == si->highest_bit) |
349 | si->highest_bit--; | 349 | si->highest_bit--; |
350 | si->inuse_pages++; | 350 | si->inuse_pages++; |
351 | if (si->inuse_pages == si->pages) { | 351 | if (si->inuse_pages == si->pages) { |
352 | si->lowest_bit = si->max; | 352 | si->lowest_bit = si->max; |
353 | si->highest_bit = 0; | 353 | si->highest_bit = 0; |
354 | } | 354 | } |
355 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ | 355 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ |
356 | si->swap_map[offset] = encode_swapmap(0, true); | 356 | si->swap_map[offset] = encode_swapmap(0, true); |
357 | else /* at suspend */ | 357 | else /* at suspend */ |
358 | si->swap_map[offset] = encode_swapmap(1, false); | 358 | si->swap_map[offset] = encode_swapmap(1, false); |
359 | si->cluster_next = offset + 1; | 359 | si->cluster_next = offset + 1; |
360 | si->flags -= SWP_SCANNING; | 360 | si->flags -= SWP_SCANNING; |
361 | 361 | ||
362 | if (si->lowest_alloc) { | 362 | if (si->lowest_alloc) { |
363 | /* | 363 | /* |
364 | * Only set when SWP_DISCARDABLE, and there's a scan | 364 | * Only set when SWP_DISCARDABLE, and there's a scan |
365 | * for a free cluster in progress or just completed. | 365 | * for a free cluster in progress or just completed. |
366 | */ | 366 | */ |
367 | if (found_free_cluster) { | 367 | if (found_free_cluster) { |
368 | /* | 368 | /* |
369 | * To optimize wear-levelling, discard the | 369 | * To optimize wear-levelling, discard the |
370 | * old data of the cluster, taking care not to | 370 | * old data of the cluster, taking care not to |
371 | * discard any of its pages that have already | 371 | * discard any of its pages that have already |
372 | * been allocated by racing tasks (offset has | 372 | * been allocated by racing tasks (offset has |
373 | * already stepped over any at the beginning). | 373 | * already stepped over any at the beginning). |
374 | */ | 374 | */ |
375 | if (offset < si->highest_alloc && | 375 | if (offset < si->highest_alloc && |
376 | si->lowest_alloc <= last_in_cluster) | 376 | si->lowest_alloc <= last_in_cluster) |
377 | last_in_cluster = si->lowest_alloc - 1; | 377 | last_in_cluster = si->lowest_alloc - 1; |
378 | si->flags |= SWP_DISCARDING; | 378 | si->flags |= SWP_DISCARDING; |
379 | spin_unlock(&swap_lock); | 379 | spin_unlock(&swap_lock); |
380 | 380 | ||
381 | if (offset < last_in_cluster) | 381 | if (offset < last_in_cluster) |
382 | discard_swap_cluster(si, offset, | 382 | discard_swap_cluster(si, offset, |
383 | last_in_cluster - offset + 1); | 383 | last_in_cluster - offset + 1); |
384 | 384 | ||
385 | spin_lock(&swap_lock); | 385 | spin_lock(&swap_lock); |
386 | si->lowest_alloc = 0; | 386 | si->lowest_alloc = 0; |
387 | si->flags &= ~SWP_DISCARDING; | 387 | si->flags &= ~SWP_DISCARDING; |
388 | 388 | ||
389 | smp_mb(); /* wake_up_bit advises this */ | 389 | smp_mb(); /* wake_up_bit advises this */ |
390 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | 390 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); |
391 | 391 | ||
392 | } else if (si->flags & SWP_DISCARDING) { | 392 | } else if (si->flags & SWP_DISCARDING) { |
393 | /* | 393 | /* |
394 | * Delay using pages allocated by racing tasks | 394 | * Delay using pages allocated by racing tasks |
395 | * until the whole discard has been issued. We | 395 | * until the whole discard has been issued. We |
396 | * could defer that delay until swap_writepage, | 396 | * could defer that delay until swap_writepage, |
397 | * but it's easier to keep this self-contained. | 397 | * but it's easier to keep this self-contained. |
398 | */ | 398 | */ |
399 | spin_unlock(&swap_lock); | 399 | spin_unlock(&swap_lock); |
400 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | 400 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), |
401 | wait_for_discard, TASK_UNINTERRUPTIBLE); | 401 | wait_for_discard, TASK_UNINTERRUPTIBLE); |
402 | spin_lock(&swap_lock); | 402 | spin_lock(&swap_lock); |
403 | } else { | 403 | } else { |
404 | /* | 404 | /* |
405 | * Note pages allocated by racing tasks while | 405 | * Note pages allocated by racing tasks while |
406 | * scan for a free cluster is in progress, so | 406 | * scan for a free cluster is in progress, so |
407 | * that its final discard can exclude them. | 407 | * that its final discard can exclude them. |
408 | */ | 408 | */ |
409 | if (offset < si->lowest_alloc) | 409 | if (offset < si->lowest_alloc) |
410 | si->lowest_alloc = offset; | 410 | si->lowest_alloc = offset; |
411 | if (offset > si->highest_alloc) | 411 | if (offset > si->highest_alloc) |
412 | si->highest_alloc = offset; | 412 | si->highest_alloc = offset; |
413 | } | 413 | } |
414 | } | 414 | } |
415 | return offset; | 415 | return offset; |
416 | 416 | ||
417 | scan: | 417 | scan: |
418 | spin_unlock(&swap_lock); | 418 | spin_unlock(&swap_lock); |
419 | while (++offset <= si->highest_bit) { | 419 | while (++offset <= si->highest_bit) { |
420 | if (!si->swap_map[offset]) { | 420 | if (!si->swap_map[offset]) { |
421 | spin_lock(&swap_lock); | 421 | spin_lock(&swap_lock); |
422 | goto checks; | 422 | goto checks; |
423 | } | 423 | } |
424 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 424 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
425 | spin_lock(&swap_lock); | 425 | spin_lock(&swap_lock); |
426 | goto checks; | 426 | goto checks; |
427 | } | 427 | } |
428 | if (unlikely(--latency_ration < 0)) { | 428 | if (unlikely(--latency_ration < 0)) { |
429 | cond_resched(); | 429 | cond_resched(); |
430 | latency_ration = LATENCY_LIMIT; | 430 | latency_ration = LATENCY_LIMIT; |
431 | } | 431 | } |
432 | } | 432 | } |
433 | offset = si->lowest_bit; | 433 | offset = si->lowest_bit; |
434 | while (++offset < scan_base) { | 434 | while (++offset < scan_base) { |
435 | if (!si->swap_map[offset]) { | 435 | if (!si->swap_map[offset]) { |
436 | spin_lock(&swap_lock); | 436 | spin_lock(&swap_lock); |
437 | goto checks; | 437 | goto checks; |
438 | } | 438 | } |
439 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 439 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
440 | spin_lock(&swap_lock); | 440 | spin_lock(&swap_lock); |
441 | goto checks; | 441 | goto checks; |
442 | } | 442 | } |
443 | if (unlikely(--latency_ration < 0)) { | 443 | if (unlikely(--latency_ration < 0)) { |
444 | cond_resched(); | 444 | cond_resched(); |
445 | latency_ration = LATENCY_LIMIT; | 445 | latency_ration = LATENCY_LIMIT; |
446 | } | 446 | } |
447 | } | 447 | } |
448 | spin_lock(&swap_lock); | 448 | spin_lock(&swap_lock); |
449 | 449 | ||
450 | no_page: | 450 | no_page: |
451 | si->flags -= SWP_SCANNING; | 451 | si->flags -= SWP_SCANNING; |
452 | return 0; | 452 | return 0; |
453 | } | 453 | } |
454 | 454 | ||
455 | swp_entry_t get_swap_page(void) | 455 | swp_entry_t get_swap_page(void) |
456 | { | 456 | { |
457 | struct swap_info_struct *si; | 457 | struct swap_info_struct *si; |
458 | pgoff_t offset; | 458 | pgoff_t offset; |
459 | int type, next; | 459 | int type, next; |
460 | int wrapped = 0; | 460 | int wrapped = 0; |
461 | 461 | ||
462 | spin_lock(&swap_lock); | 462 | spin_lock(&swap_lock); |
463 | if (nr_swap_pages <= 0) | 463 | if (nr_swap_pages <= 0) |
464 | goto noswap; | 464 | goto noswap; |
465 | nr_swap_pages--; | 465 | nr_swap_pages--; |
466 | 466 | ||
467 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 467 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
468 | si = swap_info + type; | 468 | si = swap_info + type; |
469 | next = si->next; | 469 | next = si->next; |
470 | if (next < 0 || | 470 | if (next < 0 || |
471 | (!wrapped && si->prio != swap_info[next].prio)) { | 471 | (!wrapped && si->prio != swap_info[next].prio)) { |
472 | next = swap_list.head; | 472 | next = swap_list.head; |
473 | wrapped++; | 473 | wrapped++; |
474 | } | 474 | } |
475 | 475 | ||
476 | if (!si->highest_bit) | 476 | if (!si->highest_bit) |
477 | continue; | 477 | continue; |
478 | if (!(si->flags & SWP_WRITEOK)) | 478 | if (!(si->flags & SWP_WRITEOK)) |
479 | continue; | 479 | continue; |
480 | 480 | ||
481 | swap_list.next = next; | 481 | swap_list.next = next; |
482 | /* This is called for allocating swap entry for cache */ | 482 | /* This is called for allocating swap entry for cache */ |
483 | offset = scan_swap_map(si, SWAP_CACHE); | 483 | offset = scan_swap_map(si, SWAP_CACHE); |
484 | if (offset) { | 484 | if (offset) { |
485 | spin_unlock(&swap_lock); | 485 | spin_unlock(&swap_lock); |
486 | return swp_entry(type, offset); | 486 | return swp_entry(type, offset); |
487 | } | 487 | } |
488 | next = swap_list.next; | 488 | next = swap_list.next; |
489 | } | 489 | } |
490 | 490 | ||
491 | nr_swap_pages++; | 491 | nr_swap_pages++; |
492 | noswap: | 492 | noswap: |
493 | spin_unlock(&swap_lock); | 493 | spin_unlock(&swap_lock); |
494 | return (swp_entry_t) {0}; | 494 | return (swp_entry_t) {0}; |
495 | } | 495 | } |
496 | 496 | ||
497 | /* The only caller of this function is now the suspend routine */ | 497 | /* The only caller of this function is now the suspend routine */ |
498 | swp_entry_t get_swap_page_of_type(int type) | 498 | swp_entry_t get_swap_page_of_type(int type) |
499 | { | 499 | { |
500 | struct swap_info_struct *si; | 500 | struct swap_info_struct *si; |
501 | pgoff_t offset; | 501 | pgoff_t offset; |
502 | 502 | ||
503 | spin_lock(&swap_lock); | 503 | spin_lock(&swap_lock); |
504 | si = swap_info + type; | 504 | si = swap_info + type; |
505 | if (si->flags & SWP_WRITEOK) { | 505 | if (si->flags & SWP_WRITEOK) { |
506 | nr_swap_pages--; | 506 | nr_swap_pages--; |
507 | /* This is called for allocating swap entry, not cache */ | 507 | /* This is called for allocating swap entry, not cache */ |
508 | offset = scan_swap_map(si, SWAP_MAP); | 508 | offset = scan_swap_map(si, SWAP_MAP); |
509 | if (offset) { | 509 | if (offset) { |
510 | spin_unlock(&swap_lock); | 510 | spin_unlock(&swap_lock); |
511 | return swp_entry(type, offset); | 511 | return swp_entry(type, offset); |
512 | } | 512 | } |
513 | nr_swap_pages++; | 513 | nr_swap_pages++; |
514 | } | 514 | } |
515 | spin_unlock(&swap_lock); | 515 | spin_unlock(&swap_lock); |
516 | return (swp_entry_t) {0}; | 516 | return (swp_entry_t) {0}; |
517 | } | 517 | } |
518 | 518 | ||
519 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 519 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) |
520 | { | 520 | { |
521 | struct swap_info_struct * p; | 521 | struct swap_info_struct * p; |
522 | unsigned long offset, type; | 522 | unsigned long offset, type; |
523 | 523 | ||
524 | if (!entry.val) | 524 | if (!entry.val) |
525 | goto out; | 525 | goto out; |
526 | type = swp_type(entry); | 526 | type = swp_type(entry); |
527 | if (type >= nr_swapfiles) | 527 | if (type >= nr_swapfiles) |
528 | goto bad_nofile; | 528 | goto bad_nofile; |
529 | p = & swap_info[type]; | 529 | p = & swap_info[type]; |
530 | if (!(p->flags & SWP_USED)) | 530 | if (!(p->flags & SWP_USED)) |
531 | goto bad_device; | 531 | goto bad_device; |
532 | offset = swp_offset(entry); | 532 | offset = swp_offset(entry); |
533 | if (offset >= p->max) | 533 | if (offset >= p->max) |
534 | goto bad_offset; | 534 | goto bad_offset; |
535 | if (!p->swap_map[offset]) | 535 | if (!p->swap_map[offset]) |
536 | goto bad_free; | 536 | goto bad_free; |
537 | spin_lock(&swap_lock); | 537 | spin_lock(&swap_lock); |
538 | return p; | 538 | return p; |
539 | 539 | ||
540 | bad_free: | 540 | bad_free: |
541 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); | 541 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); |
542 | goto out; | 542 | goto out; |
543 | bad_offset: | 543 | bad_offset: |
544 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); | 544 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); |
545 | goto out; | 545 | goto out; |
546 | bad_device: | 546 | bad_device: |
547 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); | 547 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); |
548 | goto out; | 548 | goto out; |
549 | bad_nofile: | 549 | bad_nofile: |
550 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); | 550 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); |
551 | out: | 551 | out: |
552 | return NULL; | 552 | return NULL; |
553 | } | 553 | } |
554 | 554 | ||
555 | static int swap_entry_free(struct swap_info_struct *p, | 555 | static int swap_entry_free(struct swap_info_struct *p, |
556 | swp_entry_t ent, int cache) | 556 | swp_entry_t ent, int cache) |
557 | { | 557 | { |
558 | unsigned long offset = swp_offset(ent); | 558 | unsigned long offset = swp_offset(ent); |
559 | int count = swap_count(p->swap_map[offset]); | 559 | int count = swap_count(p->swap_map[offset]); |
560 | bool has_cache; | 560 | bool has_cache; |
561 | 561 | ||
562 | has_cache = swap_has_cache(p->swap_map[offset]); | 562 | has_cache = swap_has_cache(p->swap_map[offset]); |
563 | 563 | ||
564 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ | 564 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ |
565 | if (count < SWAP_MAP_MAX) { | 565 | if (count < SWAP_MAP_MAX) { |
566 | count--; | 566 | count--; |
567 | p->swap_map[offset] = encode_swapmap(count, has_cache); | 567 | p->swap_map[offset] = encode_swapmap(count, has_cache); |
568 | } | 568 | } |
569 | } else { /* dropping swap cache flag */ | 569 | } else { /* dropping swap cache flag */ |
570 | VM_BUG_ON(!has_cache); | 570 | VM_BUG_ON(!has_cache); |
571 | p->swap_map[offset] = encode_swapmap(count, false); | 571 | p->swap_map[offset] = encode_swapmap(count, false); |
572 | 572 | ||
573 | } | 573 | } |
574 | /* return code. */ | 574 | /* return code. */ |
575 | count = p->swap_map[offset]; | 575 | count = p->swap_map[offset]; |
576 | /* free if no reference */ | 576 | /* free if no reference */ |
577 | if (!count) { | 577 | if (!count) { |
578 | if (offset < p->lowest_bit) | 578 | if (offset < p->lowest_bit) |
579 | p->lowest_bit = offset; | 579 | p->lowest_bit = offset; |
580 | if (offset > p->highest_bit) | 580 | if (offset > p->highest_bit) |
581 | p->highest_bit = offset; | 581 | p->highest_bit = offset; |
582 | if (p->prio > swap_info[swap_list.next].prio) | 582 | if (p->prio > swap_info[swap_list.next].prio) |
583 | swap_list.next = p - swap_info; | 583 | swap_list.next = p - swap_info; |
584 | nr_swap_pages++; | 584 | nr_swap_pages++; |
585 | p->inuse_pages--; | 585 | p->inuse_pages--; |
586 | } | 586 | } |
587 | if (!swap_count(count)) | 587 | if (!swap_count(count)) |
588 | mem_cgroup_uncharge_swap(ent); | 588 | mem_cgroup_uncharge_swap(ent); |
589 | return count; | 589 | return count; |
590 | } | 590 | } |
591 | 591 | ||
592 | /* | 592 | /* |
593 | * Caller has made sure that the swapdevice corresponding to entry | 593 | * Caller has made sure that the swapdevice corresponding to entry |
594 | * is still around or has not been recycled. | 594 | * is still around or has not been recycled. |
595 | */ | 595 | */ |
596 | void swap_free(swp_entry_t entry) | 596 | void swap_free(swp_entry_t entry) |
597 | { | 597 | { |
598 | struct swap_info_struct *p; | 598 | struct swap_info_struct *p; |
599 | 599 | ||
600 | p = swap_info_get(entry); | 600 | p = swap_info_get(entry); |
601 | if (p) { | 601 | if (p) { |
602 | swap_entry_free(p, entry, SWAP_MAP); | 602 | swap_entry_free(p, entry, SWAP_MAP); |
603 | spin_unlock(&swap_lock); | 603 | spin_unlock(&swap_lock); |
604 | } | 604 | } |
605 | } | 605 | } |
606 | 606 | ||
607 | /* | 607 | /* |
608 | * Called after dropping swapcache to decrease refcnt to swap entries. | 608 | * Called after dropping swapcache to decrease refcnt to swap entries. |
609 | */ | 609 | */ |
610 | void swapcache_free(swp_entry_t entry, struct page *page) | 610 | void swapcache_free(swp_entry_t entry, struct page *page) |
611 | { | 611 | { |
612 | struct swap_info_struct *p; | 612 | struct swap_info_struct *p; |
613 | int ret; | 613 | int ret; |
614 | 614 | ||
615 | p = swap_info_get(entry); | 615 | p = swap_info_get(entry); |
616 | if (p) { | 616 | if (p) { |
617 | ret = swap_entry_free(p, entry, SWAP_CACHE); | 617 | ret = swap_entry_free(p, entry, SWAP_CACHE); |
618 | if (page) { | 618 | if (page) { |
619 | bool swapout; | 619 | bool swapout; |
620 | if (ret) | 620 | if (ret) |
621 | swapout = true; /* the end of swap out */ | 621 | swapout = true; /* the end of swap out */ |
622 | else | 622 | else |
623 | swapout = false; /* no more swap users! */ | 623 | swapout = false; /* no more swap users! */ |
624 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | 624 | mem_cgroup_uncharge_swapcache(page, entry, swapout); |
625 | } | 625 | } |
626 | spin_unlock(&swap_lock); | 626 | spin_unlock(&swap_lock); |
627 | } | 627 | } |
628 | return; | 628 | return; |
629 | } | 629 | } |
630 | 630 | ||
631 | /* | 631 | /* |
632 | * How many references to page are currently swapped out? | 632 | * How many references to page are currently swapped out? |
633 | */ | 633 | */ |
634 | static inline int page_swapcount(struct page *page) | 634 | static inline int page_swapcount(struct page *page) |
635 | { | 635 | { |
636 | int count = 0; | 636 | int count = 0; |
637 | struct swap_info_struct *p; | 637 | struct swap_info_struct *p; |
638 | swp_entry_t entry; | 638 | swp_entry_t entry; |
639 | 639 | ||
640 | entry.val = page_private(page); | 640 | entry.val = page_private(page); |
641 | p = swap_info_get(entry); | 641 | p = swap_info_get(entry); |
642 | if (p) { | 642 | if (p) { |
643 | count = swap_count(p->swap_map[swp_offset(entry)]); | 643 | count = swap_count(p->swap_map[swp_offset(entry)]); |
644 | spin_unlock(&swap_lock); | 644 | spin_unlock(&swap_lock); |
645 | } | 645 | } |
646 | return count; | 646 | return count; |
647 | } | 647 | } |
648 | 648 | ||
649 | /* | 649 | /* |
650 | * We can write to an anon page without COW if there are no other references | 650 | * We can write to an anon page without COW if there are no other references |
651 | * to it. And as a side-effect, free up its swap: because the old content | 651 | * to it. And as a side-effect, free up its swap: because the old content |
652 | * on disk will never be read, and seeking back there to write new content | 652 | * on disk will never be read, and seeking back there to write new content |
653 | * later would only waste time away from clustering. | 653 | * later would only waste time away from clustering. |
654 | */ | 654 | */ |
655 | int reuse_swap_page(struct page *page) | 655 | int reuse_swap_page(struct page *page) |
656 | { | 656 | { |
657 | int count; | 657 | int count; |
658 | 658 | ||
659 | VM_BUG_ON(!PageLocked(page)); | 659 | VM_BUG_ON(!PageLocked(page)); |
660 | count = page_mapcount(page); | 660 | count = page_mapcount(page); |
661 | if (count <= 1 && PageSwapCache(page)) { | 661 | if (count <= 1 && PageSwapCache(page)) { |
662 | count += page_swapcount(page); | 662 | count += page_swapcount(page); |
663 | if (count == 1 && !PageWriteback(page)) { | 663 | if (count == 1 && !PageWriteback(page)) { |
664 | delete_from_swap_cache(page); | 664 | delete_from_swap_cache(page); |
665 | SetPageDirty(page); | 665 | SetPageDirty(page); |
666 | } | 666 | } |
667 | } | 667 | } |
668 | return count == 1; | 668 | return count == 1; |
669 | } | 669 | } |
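
To make the count arithmetic above concrete, a hypothetical case:

	/*
	 * Example: a page mapped by exactly one pte (page_mapcount == 1)
	 * whose swap entry has no other users (page_swapcount == 0) gives
	 * count == 1, so its swap slot is freed and the caller may write
	 * in place without COW.
	 */
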
670 | 670 | ||
671 | /* | 671 | /* |
672 | * If swap is getting full, or if there are no more mappings of this page, | 672 | * If swap is getting full, or if there are no more mappings of this page, |
673 | * then try_to_free_swap is called to free its swap space. | 673 | * then try_to_free_swap is called to free its swap space. |
674 | */ | 674 | */ |
675 | int try_to_free_swap(struct page *page) | 675 | int try_to_free_swap(struct page *page) |
676 | { | 676 | { |
677 | VM_BUG_ON(!PageLocked(page)); | 677 | VM_BUG_ON(!PageLocked(page)); |
678 | 678 | ||
679 | if (!PageSwapCache(page)) | 679 | if (!PageSwapCache(page)) |
680 | return 0; | 680 | return 0; |
681 | if (PageWriteback(page)) | 681 | if (PageWriteback(page)) |
682 | return 0; | 682 | return 0; |
683 | if (page_swapcount(page)) | 683 | if (page_swapcount(page)) |
684 | return 0; | 684 | return 0; |
685 | 685 | ||
686 | delete_from_swap_cache(page); | 686 | delete_from_swap_cache(page); |
687 | SetPageDirty(page); | 687 | SetPageDirty(page); |
688 | return 1; | 688 | return 1; |
689 | } | 689 | } |
690 | 690 | ||
691 | /* | 691 | /* |
692 | * Free the swap entry like above, but also try to | 692 | * Free the swap entry like above, but also try to |
693 | * free the page cache entry if it is the last user. | 693 | * free the page cache entry if it is the last user. |
694 | */ | 694 | */ |
695 | int free_swap_and_cache(swp_entry_t entry) | 695 | int free_swap_and_cache(swp_entry_t entry) |
696 | { | 696 | { |
697 | struct swap_info_struct *p; | 697 | struct swap_info_struct *p; |
698 | struct page *page = NULL; | 698 | struct page *page = NULL; |
699 | 699 | ||
700 | if (is_migration_entry(entry)) | 700 | if (is_migration_entry(entry)) |
701 | return 1; | 701 | return 1; |
702 | 702 | ||
703 | p = swap_info_get(entry); | 703 | p = swap_info_get(entry); |
704 | if (p) { | 704 | if (p) { |
705 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { | 705 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { |
706 | page = find_get_page(&swapper_space, entry.val); | 706 | page = find_get_page(&swapper_space, entry.val); |
707 | if (page && !trylock_page(page)) { | 707 | if (page && !trylock_page(page)) { |
708 | page_cache_release(page); | 708 | page_cache_release(page); |
709 | page = NULL; | 709 | page = NULL; |
710 | } | 710 | } |
711 | } | 711 | } |
712 | spin_unlock(&swap_lock); | 712 | spin_unlock(&swap_lock); |
713 | } | 713 | } |
714 | if (page) { | 714 | if (page) { |
715 | /* | 715 | /* |
716 | * Not mapped elsewhere, or swap space full? Free it! | 716 | * Not mapped elsewhere, or swap space full? Free it! |
717 | * Also recheck PageSwapCache now page is locked (above). | 717 | * Also recheck PageSwapCache now page is locked (above). |
718 | */ | 718 | */ |
719 | if (PageSwapCache(page) && !PageWriteback(page) && | 719 | if (PageSwapCache(page) && !PageWriteback(page) && |
720 | (!page_mapped(page) || vm_swap_full())) { | 720 | (!page_mapped(page) || vm_swap_full())) { |
721 | delete_from_swap_cache(page); | 721 | delete_from_swap_cache(page); |
722 | SetPageDirty(page); | 722 | SetPageDirty(page); |
723 | } | 723 | } |
724 | unlock_page(page); | 724 | unlock_page(page); |
725 | page_cache_release(page); | 725 | page_cache_release(page); |
726 | } | 726 | } |
727 | return p != NULL; | 727 | return p != NULL; |
728 | } | 728 | } |
729 | 729 | ||
730 | #ifdef CONFIG_HIBERNATION | 730 | #ifdef CONFIG_HIBERNATION |
731 | /* | 731 | /* |
732 | * Find the swap type that corresponds to given device (if any). | 732 | * Find the swap type that corresponds to given device (if any). |
733 | * | 733 | * |
734 | * @offset - number of the PAGE_SIZE-sized block of the device, starting | 734 | * @offset - number of the PAGE_SIZE-sized block of the device, starting |
735 | * from 0, in which the swap header is expected to be located. | 735 | * from 0, in which the swap header is expected to be located. |
736 | * | 736 | * |
737 | * This is needed for the suspend to disk (aka swsusp). | 737 | * This is needed for the suspend to disk (aka swsusp). |
738 | */ | 738 | */ |
739 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | 739 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
740 | { | 740 | { |
741 | struct block_device *bdev = NULL; | 741 | struct block_device *bdev = NULL; |
742 | int i; | 742 | int i; |
743 | 743 | ||
744 | if (device) | 744 | if (device) |
745 | bdev = bdget(device); | 745 | bdev = bdget(device); |
746 | 746 | ||
747 | spin_lock(&swap_lock); | 747 | spin_lock(&swap_lock); |
748 | for (i = 0; i < nr_swapfiles; i++) { | 748 | for (i = 0; i < nr_swapfiles; i++) { |
749 | struct swap_info_struct *sis = swap_info + i; | 749 | struct swap_info_struct *sis = swap_info + i; |
750 | 750 | ||
751 | if (!(sis->flags & SWP_WRITEOK)) | 751 | if (!(sis->flags & SWP_WRITEOK)) |
752 | continue; | 752 | continue; |
753 | 753 | ||
754 | if (!bdev) { | 754 | if (!bdev) { |
755 | if (bdev_p) | 755 | if (bdev_p) |
756 | *bdev_p = bdget(sis->bdev->bd_dev); | 756 | *bdev_p = bdgrab(sis->bdev); |
757 | 757 | ||
758 | spin_unlock(&swap_lock); | 758 | spin_unlock(&swap_lock); |
759 | return i; | 759 | return i; |
760 | } | 760 | } |
761 | if (bdev == sis->bdev) { | 761 | if (bdev == sis->bdev) { |
762 | struct swap_extent *se; | 762 | struct swap_extent *se; |
763 | 763 | ||
764 | se = list_entry(sis->extent_list.next, | 764 | se = list_entry(sis->extent_list.next, |
765 | struct swap_extent, list); | 765 | struct swap_extent, list); |
766 | if (se->start_block == offset) { | 766 | if (se->start_block == offset) { |
767 | if (bdev_p) | 767 | if (bdev_p) |
768 | *bdev_p = bdget(sis->bdev->bd_dev); | 768 | *bdev_p = bdgrab(sis->bdev); |
769 | 769 | ||
770 | spin_unlock(&swap_lock); | 770 | spin_unlock(&swap_lock); |
771 | bdput(bdev); | 771 | bdput(bdev); |
772 | return i; | 772 | return i; |
773 | } | 773 | } |
774 | } | 774 | } |
775 | } | 775 | } |
776 | spin_unlock(&swap_lock); | 776 | spin_unlock(&swap_lock); |
777 | if (bdev) | 777 | if (bdev) |
778 | bdput(bdev); | 778 | bdput(bdev); |
779 | 779 | ||
780 | return -ENODEV; | 780 | return -ENODEV; |
781 | } | 781 | } |
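
The two hunks above (lines 756 and 768) are the point of this patch: under
swap_lock the code now calls bdgrab(sis->bdev) instead of
bdget(sis->bdev->bd_dev). A sketch of the helper's assumed shape (its
definition belongs to fs/block_dev.c): it only copies a reference that is
already held, so it cannot sleep under the spinlock:

	/* Sketch: grab an extra reference to an already-referenced bdev. */
	struct block_device *bdgrab(struct block_device *bdev)
	{
		atomic_inc(&bdev->bd_inode->i_count);
		return bdev;
	}
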
782 | 782 | ||
783 | /* | 783 | /* |
784 | * Return either the total number of swap pages of given type, or the number | 784 | * Return either the total number of swap pages of given type, or the number |
785 | * of free pages of that type (depending on @free) | 785 | * of free pages of that type (depending on @free) |
786 | * | 786 | * |
787 | * This is needed for software suspend | 787 | * This is needed for software suspend |
788 | */ | 788 | */ |
789 | unsigned int count_swap_pages(int type, int free) | 789 | unsigned int count_swap_pages(int type, int free) |
790 | { | 790 | { |
791 | unsigned int n = 0; | 791 | unsigned int n = 0; |
792 | 792 | ||
793 | if (type < nr_swapfiles) { | 793 | if (type < nr_swapfiles) { |
794 | spin_lock(&swap_lock); | 794 | spin_lock(&swap_lock); |
795 | if (swap_info[type].flags & SWP_WRITEOK) { | 795 | if (swap_info[type].flags & SWP_WRITEOK) { |
796 | n = swap_info[type].pages; | 796 | n = swap_info[type].pages; |
797 | if (free) | 797 | if (free) |
798 | n -= swap_info[type].inuse_pages; | 798 | n -= swap_info[type].inuse_pages; |
799 | } | 799 | } |
800 | spin_unlock(&swap_lock); | 800 | spin_unlock(&swap_lock); |
801 | } | 801 | } |
802 | return n; | 802 | return n; |
803 | } | 803 | } |
804 | #endif | 804 | #endif |
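
A hypothetical use of count_swap_pages(), in the spirit of the hibernation
code that sizes its image against available swap:

	/* Sketch: does swap type 'type' still have room for nr_pages pages? */
	static int enough_swap_sketch(int type, unsigned long nr_pages)
	{
		return count_swap_pages(type, 1 /* free pages only */) >= nr_pages;
	}
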
805 | 805 | ||
806 | /* | 806 | /* |
807 | * No need to decide whether this PTE shares the swap entry with others, | 807 | * No need to decide whether this PTE shares the swap entry with others, |
808 | * just let do_wp_page work it out if a write is requested later - to | 808 | * just let do_wp_page work it out if a write is requested later - to |
809 | * force COW, vm_page_prot omits write permission from any private vma. | 809 | * force COW, vm_page_prot omits write permission from any private vma. |
810 | */ | 810 | */ |
811 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 811 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
812 | unsigned long addr, swp_entry_t entry, struct page *page) | 812 | unsigned long addr, swp_entry_t entry, struct page *page) |
813 | { | 813 | { |
814 | struct mem_cgroup *ptr = NULL; | 814 | struct mem_cgroup *ptr = NULL; |
815 | spinlock_t *ptl; | 815 | spinlock_t *ptl; |
816 | pte_t *pte; | 816 | pte_t *pte; |
817 | int ret = 1; | 817 | int ret = 1; |
818 | 818 | ||
819 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { | 819 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { |
820 | ret = -ENOMEM; | 820 | ret = -ENOMEM; |
821 | goto out_nolock; | 821 | goto out_nolock; |
822 | } | 822 | } |
823 | 823 | ||
824 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 824 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
825 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 825 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
826 | if (ret > 0) | 826 | if (ret > 0) |
827 | mem_cgroup_cancel_charge_swapin(ptr); | 827 | mem_cgroup_cancel_charge_swapin(ptr); |
828 | ret = 0; | 828 | ret = 0; |
829 | goto out; | 829 | goto out; |
830 | } | 830 | } |
831 | 831 | ||
832 | inc_mm_counter(vma->vm_mm, anon_rss); | 832 | inc_mm_counter(vma->vm_mm, anon_rss); |
833 | get_page(page); | 833 | get_page(page); |
834 | set_pte_at(vma->vm_mm, addr, pte, | 834 | set_pte_at(vma->vm_mm, addr, pte, |
835 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 835 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
836 | page_add_anon_rmap(page, vma, addr); | 836 | page_add_anon_rmap(page, vma, addr); |
837 | mem_cgroup_commit_charge_swapin(page, ptr); | 837 | mem_cgroup_commit_charge_swapin(page, ptr); |
838 | swap_free(entry); | 838 | swap_free(entry); |
839 | /* | 839 | /* |
840 | * Move the page to the active list so it is not | 840 | * Move the page to the active list so it is not |
841 | * immediately swapped out again after swapon. | 841 | * immediately swapped out again after swapon. |
842 | */ | 842 | */ |
843 | activate_page(page); | 843 | activate_page(page); |
844 | out: | 844 | out: |
845 | pte_unmap_unlock(pte, ptl); | 845 | pte_unmap_unlock(pte, ptl); |
846 | out_nolock: | 846 | out_nolock: |
847 | return ret; | 847 | return ret; |
848 | } | 848 | } |
849 | 849 | ||
850 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 850 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
851 | unsigned long addr, unsigned long end, | 851 | unsigned long addr, unsigned long end, |
852 | swp_entry_t entry, struct page *page) | 852 | swp_entry_t entry, struct page *page) |
853 | { | 853 | { |
854 | pte_t swp_pte = swp_entry_to_pte(entry); | 854 | pte_t swp_pte = swp_entry_to_pte(entry); |
855 | pte_t *pte; | 855 | pte_t *pte; |
856 | int ret = 0; | 856 | int ret = 0; |
857 | 857 | ||
858 | /* | 858 | /* |
859 | * We don't actually need pte lock while scanning for swp_pte: since | 859 | * We don't actually need pte lock while scanning for swp_pte: since |
860 | * we hold page lock and mmap_sem, swp_pte cannot be inserted into the | 860 | * we hold page lock and mmap_sem, swp_pte cannot be inserted into the |
861 | * page table while we're scanning; though it could get zapped, and on | 861 | * page table while we're scanning; though it could get zapped, and on |
862 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse | 862 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse |
863 | * of unmatched parts which look like swp_pte, so unuse_pte must | 863 | * of unmatched parts which look like swp_pte, so unuse_pte must |
864 | * recheck under pte lock. Scanning without pte lock lets it be | 864 | * recheck under pte lock. Scanning without pte lock lets it be |
865 | * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. | 865 | * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. |
866 | */ | 866 | */ |
867 | pte = pte_offset_map(pmd, addr); | 867 | pte = pte_offset_map(pmd, addr); |
868 | do { | 868 | do { |
869 | /* | 869 | /* |
870 | * swapoff spends a _lot_ of time in this loop! | 870 | * swapoff spends a _lot_ of time in this loop! |
871 | * Test inline before going to call unuse_pte. | 871 | * Test inline before going to call unuse_pte. |
872 | */ | 872 | */ |
873 | if (unlikely(pte_same(*pte, swp_pte))) { | 873 | if (unlikely(pte_same(*pte, swp_pte))) { |
874 | pte_unmap(pte); | 874 | pte_unmap(pte); |
875 | ret = unuse_pte(vma, pmd, addr, entry, page); | 875 | ret = unuse_pte(vma, pmd, addr, entry, page); |
876 | if (ret) | 876 | if (ret) |
877 | goto out; | 877 | goto out; |
878 | pte = pte_offset_map(pmd, addr); | 878 | pte = pte_offset_map(pmd, addr); |
879 | } | 879 | } |
880 | } while (pte++, addr += PAGE_SIZE, addr != end); | 880 | } while (pte++, addr += PAGE_SIZE, addr != end); |
881 | pte_unmap(pte - 1); | 881 | pte_unmap(pte - 1); |
882 | out: | 882 | out: |
883 | return ret; | 883 | return ret; |
884 | } | 884 | } |
885 | 885 | ||
886 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 886 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
887 | unsigned long addr, unsigned long end, | 887 | unsigned long addr, unsigned long end, |
888 | swp_entry_t entry, struct page *page) | 888 | swp_entry_t entry, struct page *page) |
889 | { | 889 | { |
890 | pmd_t *pmd; | 890 | pmd_t *pmd; |
891 | unsigned long next; | 891 | unsigned long next; |
892 | int ret; | 892 | int ret; |
893 | 893 | ||
894 | pmd = pmd_offset(pud, addr); | 894 | pmd = pmd_offset(pud, addr); |
895 | do { | 895 | do { |
896 | next = pmd_addr_end(addr, end); | 896 | next = pmd_addr_end(addr, end); |
897 | if (pmd_none_or_clear_bad(pmd)) | 897 | if (pmd_none_or_clear_bad(pmd)) |
898 | continue; | 898 | continue; |
899 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 899 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
900 | if (ret) | 900 | if (ret) |
901 | return ret; | 901 | return ret; |
902 | } while (pmd++, addr = next, addr != end); | 902 | } while (pmd++, addr = next, addr != end); |
903 | return 0; | 903 | return 0; |
904 | } | 904 | } |
905 | 905 | ||
906 | static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 906 | static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
907 | unsigned long addr, unsigned long end, | 907 | unsigned long addr, unsigned long end, |
908 | swp_entry_t entry, struct page *page) | 908 | swp_entry_t entry, struct page *page) |
909 | { | 909 | { |
910 | pud_t *pud; | 910 | pud_t *pud; |
911 | unsigned long next; | 911 | unsigned long next; |
912 | int ret; | 912 | int ret; |
913 | 913 | ||
914 | pud = pud_offset(pgd, addr); | 914 | pud = pud_offset(pgd, addr); |
915 | do { | 915 | do { |
916 | next = pud_addr_end(addr, end); | 916 | next = pud_addr_end(addr, end); |
917 | if (pud_none_or_clear_bad(pud)) | 917 | if (pud_none_or_clear_bad(pud)) |
918 | continue; | 918 | continue; |
919 | ret = unuse_pmd_range(vma, pud, addr, next, entry, page); | 919 | ret = unuse_pmd_range(vma, pud, addr, next, entry, page); |
920 | if (ret) | 920 | if (ret) |
921 | return ret; | 921 | return ret; |
922 | } while (pud++, addr = next, addr != end); | 922 | } while (pud++, addr = next, addr != end); |
923 | return 0; | 923 | return 0; |
924 | } | 924 | } |
925 | 925 | ||
926 | static int unuse_vma(struct vm_area_struct *vma, | 926 | static int unuse_vma(struct vm_area_struct *vma, |
927 | swp_entry_t entry, struct page *page) | 927 | swp_entry_t entry, struct page *page) |
928 | { | 928 | { |
929 | pgd_t *pgd; | 929 | pgd_t *pgd; |
930 | unsigned long addr, end, next; | 930 | unsigned long addr, end, next; |
931 | int ret; | 931 | int ret; |
932 | 932 | ||
933 | if (page->mapping) { | 933 | if (page->mapping) { |
934 | addr = page_address_in_vma(page, vma); | 934 | addr = page_address_in_vma(page, vma); |
935 | if (addr == -EFAULT) | 935 | if (addr == -EFAULT) |
936 | return 0; | 936 | return 0; |
937 | else | 937 | else |
938 | end = addr + PAGE_SIZE; | 938 | end = addr + PAGE_SIZE; |
939 | } else { | 939 | } else { |
940 | addr = vma->vm_start; | 940 | addr = vma->vm_start; |
941 | end = vma->vm_end; | 941 | end = vma->vm_end; |
942 | } | 942 | } |
943 | 943 | ||
944 | pgd = pgd_offset(vma->vm_mm, addr); | 944 | pgd = pgd_offset(vma->vm_mm, addr); |
945 | do { | 945 | do { |
946 | next = pgd_addr_end(addr, end); | 946 | next = pgd_addr_end(addr, end); |
947 | if (pgd_none_or_clear_bad(pgd)) | 947 | if (pgd_none_or_clear_bad(pgd)) |
948 | continue; | 948 | continue; |
949 | ret = unuse_pud_range(vma, pgd, addr, next, entry, page); | 949 | ret = unuse_pud_range(vma, pgd, addr, next, entry, page); |
950 | if (ret) | 950 | if (ret) |
951 | return ret; | 951 | return ret; |
952 | } while (pgd++, addr = next, addr != end); | 952 | } while (pgd++, addr = next, addr != end); |
953 | return 0; | 953 | return 0; |
954 | } | 954 | } |
955 | 955 | ||
956 | static int unuse_mm(struct mm_struct *mm, | 956 | static int unuse_mm(struct mm_struct *mm, |
957 | swp_entry_t entry, struct page *page) | 957 | swp_entry_t entry, struct page *page) |
958 | { | 958 | { |
959 | struct vm_area_struct *vma; | 959 | struct vm_area_struct *vma; |
960 | int ret = 0; | 960 | int ret = 0; |
961 | 961 | ||
962 | if (!down_read_trylock(&mm->mmap_sem)) { | 962 | if (!down_read_trylock(&mm->mmap_sem)) { |
963 | /* | 963 | /* |
964 | * Activate page so shrink_inactive_list is unlikely to unmap | 964 | * Activate page so shrink_inactive_list is unlikely to unmap |
965 | * its ptes while lock is dropped, so swapoff can make progress. | 965 | * its ptes while lock is dropped, so swapoff can make progress. |
966 | */ | 966 | */ |
967 | activate_page(page); | 967 | activate_page(page); |
968 | unlock_page(page); | 968 | unlock_page(page); |
969 | down_read(&mm->mmap_sem); | 969 | down_read(&mm->mmap_sem); |
970 | lock_page(page); | 970 | lock_page(page); |
971 | } | 971 | } |
972 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 972 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
973 | if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) | 973 | if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) |
974 | break; | 974 | break; |
975 | } | 975 | } |
976 | up_read(&mm->mmap_sem); | 976 | up_read(&mm->mmap_sem); |
977 | return (ret < 0) ? ret : 0; | 977 | return (ret < 0) ? ret : 0; |
978 | } | 978 | } |
979 | 979 | ||
980 | /* | 980 | /* |
981 | * Scan swap_map from current position to next entry still in use. | 981 | * Scan swap_map from current position to next entry still in use. |
982 | * Recycle to start on reaching the end, returning 0 when empty. | 982 | * Recycle to start on reaching the end, returning 0 when empty. |
983 | */ | 983 | */ |
984 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 984 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
985 | unsigned int prev) | 985 | unsigned int prev) |
986 | { | 986 | { |
987 | unsigned int max = si->max; | 987 | unsigned int max = si->max; |
988 | unsigned int i = prev; | 988 | unsigned int i = prev; |
989 | int count; | 989 | int count; |
990 | 990 | ||
991 | /* | 991 | /* |
992 | * No need for swap_lock here: we're just looking | 992 | * No need for swap_lock here: we're just looking |
993 | * for whether an entry is in use, not modifying it; false | 993 | * for whether an entry is in use, not modifying it; false |
994 | * hits are okay, and sys_swapoff() has already prevented new | 994 | * hits are okay, and sys_swapoff() has already prevented new |
995 | * allocations from this area (while holding swap_lock). | 995 | * allocations from this area (while holding swap_lock). |
996 | */ | 996 | */ |
997 | for (;;) { | 997 | for (;;) { |
998 | if (++i >= max) { | 998 | if (++i >= max) { |
999 | if (!prev) { | 999 | if (!prev) { |
1000 | i = 0; | 1000 | i = 0; |
1001 | break; | 1001 | break; |
1002 | } | 1002 | } |
1003 | /* | 1003 | /* |
1004 | * No entries in use at top of swap_map, | 1004 | * No entries in use at top of swap_map, |
1005 | * loop back to start and recheck there. | 1005 | * loop back to start and recheck there. |
1006 | */ | 1006 | */ |
1007 | max = prev + 1; | 1007 | max = prev + 1; |
1008 | prev = 0; | 1008 | prev = 0; |
1009 | i = 1; | 1009 | i = 1; |
1010 | } | 1010 | } |
1011 | count = si->swap_map[i]; | 1011 | count = si->swap_map[i]; |
1012 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1012 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1013 | break; | 1013 | break; |
1014 | } | 1014 | } |
1015 | return i; | 1015 | return i; |
1016 | } | 1016 | } |
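
The wrap-around logic is easier to see stripped of the swap_map encoding; a
standalone sketch with the same traversal order (slot 0 holds the swap header
and is never returned; the real code additionally skips SWAP_MAP_BAD slots):

	static unsigned int next_in_use_sketch(const unsigned short *map,
					       unsigned int max, unsigned int prev)
	{
		unsigned int i;

		for (i = prev + 1; i < max; i++)	/* prev + 1 .. top */
			if (map[i])
				return i;
		for (i = 1; i <= prev; i++)		/* wrap: 1 .. prev */
			if (map[i])
				return i;
		return 0;				/* nothing in use */
	}
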
1017 | 1017 | ||
1018 | /* | 1018 | /* |
1019 | * We completely avoid races by reading each swap page in advance, | 1019 | * We completely avoid races by reading each swap page in advance, |
1020 | * and then search for the process using it. All the necessary | 1020 | * and then search for the process using it. All the necessary |
1021 | * page table adjustments can then be made atomically. | 1021 | * page table adjustments can then be made atomically. |
1022 | */ | 1022 | */ |
1023 | static int try_to_unuse(unsigned int type) | 1023 | static int try_to_unuse(unsigned int type) |
1024 | { | 1024 | { |
1025 | struct swap_info_struct *si = &swap_info[type]; | 1025 | struct swap_info_struct *si = &swap_info[type]; |
1026 | struct mm_struct *start_mm; | 1026 | struct mm_struct *start_mm; |
1027 | unsigned short *swap_map; | 1027 | unsigned short *swap_map; |
1028 | unsigned short swcount; | 1028 | unsigned short swcount; |
1029 | struct page *page; | 1029 | struct page *page; |
1030 | swp_entry_t entry; | 1030 | swp_entry_t entry; |
1031 | unsigned int i = 0; | 1031 | unsigned int i = 0; |
1032 | int retval = 0; | 1032 | int retval = 0; |
1033 | int reset_overflow = 0; | 1033 | int reset_overflow = 0; |
1034 | int shmem; | 1034 | int shmem; |
1035 | 1035 | ||
1036 | /* | 1036 | /* |
1037 | * When searching mms for an entry, a good strategy is to | 1037 | * When searching mms for an entry, a good strategy is to |
1038 | * start at the first mm we freed the previous entry from | 1038 | * start at the first mm we freed the previous entry from |
1039 | * (though actually we don't notice whether we or coincidence | 1039 | * (though actually we don't notice whether we or coincidence |
1040 | * freed the entry). Initialize this start_mm with a hold. | 1040 | * freed the entry). Initialize this start_mm with a hold. |
1041 | * | 1041 | * |
1042 | * A simpler strategy would be to start at the last mm we | 1042 | * A simpler strategy would be to start at the last mm we |
1043 | * freed the previous entry from; but that would take less | 1043 | * freed the previous entry from; but that would take less |
1044 | * advantage of mmlist ordering, which clusters forked mms | 1044 | * advantage of mmlist ordering, which clusters forked mms |
1045 | * together, child after parent. If we race with dup_mmap(), we | 1045 | * together, child after parent. If we race with dup_mmap(), we |
1046 | * prefer to resolve parent before child, lest we miss entries | 1046 | * prefer to resolve parent before child, lest we miss entries |
1047 | * duplicated after we scanned child: using last mm would invert | 1047 | * duplicated after we scanned child: using last mm would invert |
1048 | * that. Though it's only a serious concern when an overflowed | 1048 | * that. Though it's only a serious concern when an overflowed |
1049 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. | 1049 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. |
1050 | */ | 1050 | */ |
1051 | start_mm = &init_mm; | 1051 | start_mm = &init_mm; |
1052 | atomic_inc(&init_mm.mm_users); | 1052 | atomic_inc(&init_mm.mm_users); |
1053 | 1053 | ||
1054 | /* | 1054 | /* |
1055 | * Keep on scanning until all entries have gone. Usually, | 1055 | * Keep on scanning until all entries have gone. Usually, |
1056 | * one pass through swap_map is enough, but not necessarily: | 1056 | * one pass through swap_map is enough, but not necessarily: |
1057 | * there are races when an instance of an entry might be missed. | 1057 | * there are races when an instance of an entry might be missed. |
1058 | */ | 1058 | */ |
1059 | while ((i = find_next_to_unuse(si, i)) != 0) { | 1059 | while ((i = find_next_to_unuse(si, i)) != 0) { |
1060 | if (signal_pending(current)) { | 1060 | if (signal_pending(current)) { |
1061 | retval = -EINTR; | 1061 | retval = -EINTR; |
1062 | break; | 1062 | break; |
1063 | } | 1063 | } |
1064 | 1064 | ||
1065 | /* | 1065 | /* |
1066 | * Get a page for the entry, using the existing swap | 1066 | * Get a page for the entry, using the existing swap |
1067 | * cache page if there is one. Otherwise, get a clean | 1067 | * cache page if there is one. Otherwise, get a clean |
1068 | * page and read the swap into it. | 1068 | * page and read the swap into it. |
1069 | */ | 1069 | */ |
1070 | swap_map = &si->swap_map[i]; | 1070 | swap_map = &si->swap_map[i]; |
1071 | entry = swp_entry(type, i); | 1071 | entry = swp_entry(type, i); |
1072 | page = read_swap_cache_async(entry, | 1072 | page = read_swap_cache_async(entry, |
1073 | GFP_HIGHUSER_MOVABLE, NULL, 0); | 1073 | GFP_HIGHUSER_MOVABLE, NULL, 0); |
1074 | if (!page) { | 1074 | if (!page) { |
1075 | /* | 1075 | /* |
1076 | * Either swap_duplicate() failed because entry | 1076 | * Either swap_duplicate() failed because entry |
1077 | * has been freed independently, and will not be | 1077 | * has been freed independently, and will not be |
1078 | * reused since sys_swapoff() already disabled | 1078 | * reused since sys_swapoff() already disabled |
1079 | * allocation from here, or alloc_page() failed. | 1079 | * allocation from here, or alloc_page() failed. |
1080 | */ | 1080 | */ |
1081 | if (!*swap_map) | 1081 | if (!*swap_map) |
1082 | continue; | 1082 | continue; |
1083 | retval = -ENOMEM; | 1083 | retval = -ENOMEM; |
1084 | break; | 1084 | break; |
1085 | } | 1085 | } |
1086 | 1086 | ||
1087 | /* | 1087 | /* |
1088 | * Don't hold on to start_mm if it looks like exiting. | 1088 | * Don't hold on to start_mm if it looks like exiting. |
1089 | */ | 1089 | */ |
1090 | if (atomic_read(&start_mm->mm_users) == 1) { | 1090 | if (atomic_read(&start_mm->mm_users) == 1) { |
1091 | mmput(start_mm); | 1091 | mmput(start_mm); |
1092 | start_mm = &init_mm; | 1092 | start_mm = &init_mm; |
1093 | atomic_inc(&init_mm.mm_users); | 1093 | atomic_inc(&init_mm.mm_users); |
1094 | } | 1094 | } |
1095 | 1095 | ||
1096 | /* | 1096 | /* |
1097 | * Wait for and lock page. When do_swap_page races with | 1097 | * Wait for and lock page. When do_swap_page races with |
1098 | * try_to_unuse, do_swap_page can handle the fault much | 1098 | * try_to_unuse, do_swap_page can handle the fault much |
1099 | * faster than try_to_unuse can locate the entry. This | 1099 | * faster than try_to_unuse can locate the entry. This |
1100 | * apparently redundant "wait_on_page_locked" lets try_to_unuse | 1100 | * apparently redundant "wait_on_page_locked" lets try_to_unuse |
1101 | * defer to do_swap_page in such a case - in some tests, | 1101 | * defer to do_swap_page in such a case - in some tests, |
1102 | * do_swap_page and try_to_unuse repeatedly compete. | 1102 | * do_swap_page and try_to_unuse repeatedly compete. |
1103 | */ | 1103 | */ |
1104 | wait_on_page_locked(page); | 1104 | wait_on_page_locked(page); |
1105 | wait_on_page_writeback(page); | 1105 | wait_on_page_writeback(page); |
1106 | lock_page(page); | 1106 | lock_page(page); |
1107 | wait_on_page_writeback(page); | 1107 | wait_on_page_writeback(page); |
1108 | 1108 | ||
1109 | /* | 1109 | /* |
1110 | * Remove all references to entry. | 1110 | * Remove all references to entry. |
1111 | * Whenever we reach init_mm, there's no address space | 1111 | * Whenever we reach init_mm, there's no address space |
1112 | * to search, but use it as a reminder to search shmem. | 1112 | * to search, but use it as a reminder to search shmem. |
1113 | */ | 1113 | */ |
1114 | shmem = 0; | 1114 | shmem = 0; |
1115 | swcount = *swap_map; | 1115 | swcount = *swap_map; |
1116 | if (swap_count(swcount)) { | 1116 | if (swap_count(swcount)) { |
1117 | if (start_mm == &init_mm) | 1117 | if (start_mm == &init_mm) |
1118 | shmem = shmem_unuse(entry, page); | 1118 | shmem = shmem_unuse(entry, page); |
1119 | else | 1119 | else |
1120 | retval = unuse_mm(start_mm, entry, page); | 1120 | retval = unuse_mm(start_mm, entry, page); |
1121 | } | 1121 | } |
1122 | if (swap_count(*swap_map)) { | 1122 | if (swap_count(*swap_map)) { |
1123 | int set_start_mm = (*swap_map >= swcount); | 1123 | int set_start_mm = (*swap_map >= swcount); |
1124 | struct list_head *p = &start_mm->mmlist; | 1124 | struct list_head *p = &start_mm->mmlist; |
1125 | struct mm_struct *new_start_mm = start_mm; | 1125 | struct mm_struct *new_start_mm = start_mm; |
1126 | struct mm_struct *prev_mm = start_mm; | 1126 | struct mm_struct *prev_mm = start_mm; |
1127 | struct mm_struct *mm; | 1127 | struct mm_struct *mm; |
1128 | 1128 | ||
1129 | atomic_inc(&new_start_mm->mm_users); | 1129 | atomic_inc(&new_start_mm->mm_users); |
1130 | atomic_inc(&prev_mm->mm_users); | 1130 | atomic_inc(&prev_mm->mm_users); |
1131 | spin_lock(&mmlist_lock); | 1131 | spin_lock(&mmlist_lock); |
1132 | while (swap_count(*swap_map) && !retval && !shmem && | 1132 | while (swap_count(*swap_map) && !retval && !shmem && |
1133 | (p = p->next) != &start_mm->mmlist) { | 1133 | (p = p->next) != &start_mm->mmlist) { |
1134 | mm = list_entry(p, struct mm_struct, mmlist); | 1134 | mm = list_entry(p, struct mm_struct, mmlist); |
1135 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1135 | if (!atomic_inc_not_zero(&mm->mm_users)) |
1136 | continue; | 1136 | continue; |
1137 | spin_unlock(&mmlist_lock); | 1137 | spin_unlock(&mmlist_lock); |
1138 | mmput(prev_mm); | 1138 | mmput(prev_mm); |
1139 | prev_mm = mm; | 1139 | prev_mm = mm; |
1140 | 1140 | ||
1141 | cond_resched(); | 1141 | cond_resched(); |
1142 | 1142 | ||
1143 | swcount = *swap_map; | 1143 | swcount = *swap_map; |
1144 | if (!swap_count(swcount)) /* any usage ? */ | 1144 | if (!swap_count(swcount)) /* any usage ? */ |
1145 | ; | 1145 | ; |
1146 | else if (mm == &init_mm) { | 1146 | else if (mm == &init_mm) { |
1147 | set_start_mm = 1; | 1147 | set_start_mm = 1; |
1148 | shmem = shmem_unuse(entry, page); | 1148 | shmem = shmem_unuse(entry, page); |
1149 | } else | 1149 | } else |
1150 | retval = unuse_mm(mm, entry, page); | 1150 | retval = unuse_mm(mm, entry, page); |
1151 | 1151 | ||
1152 | if (set_start_mm && | 1152 | if (set_start_mm && |
1153 | swap_count(*swap_map) < swcount) { | 1153 | swap_count(*swap_map) < swcount) { |
1154 | mmput(new_start_mm); | 1154 | mmput(new_start_mm); |
1155 | atomic_inc(&mm->mm_users); | 1155 | atomic_inc(&mm->mm_users); |
1156 | new_start_mm = mm; | 1156 | new_start_mm = mm; |
1157 | set_start_mm = 0; | 1157 | set_start_mm = 0; |
1158 | } | 1158 | } |
1159 | spin_lock(&mmlist_lock); | 1159 | spin_lock(&mmlist_lock); |
1160 | } | 1160 | } |
1161 | spin_unlock(&mmlist_lock); | 1161 | spin_unlock(&mmlist_lock); |
1162 | mmput(prev_mm); | 1162 | mmput(prev_mm); |
1163 | mmput(start_mm); | 1163 | mmput(start_mm); |
1164 | start_mm = new_start_mm; | 1164 | start_mm = new_start_mm; |
1165 | } | 1165 | } |
1166 | if (shmem) { | 1166 | if (shmem) { |
1167 | /* page has already been unlocked and released */ | 1167 | /* page has already been unlocked and released */ |
1168 | if (shmem > 0) | 1168 | if (shmem > 0) |
1169 | continue; | 1169 | continue; |
1170 | retval = shmem; | 1170 | retval = shmem; |
1171 | break; | 1171 | break; |
1172 | } | 1172 | } |
1173 | if (retval) { | 1173 | if (retval) { |
1174 | unlock_page(page); | 1174 | unlock_page(page); |
1175 | page_cache_release(page); | 1175 | page_cache_release(page); |
1176 | break; | 1176 | break; |
1177 | } | 1177 | } |
1178 | 1178 | ||
1179 | /* | 1179 | /* |
1180 | * How could the swap count reach 0x7ffe? | 1180 | * How could the swap count reach 0x7ffe? |
1181 | * There's no way to repeat a swap page within an mm | 1181 | * There's no way to repeat a swap page within an mm |
1182 | * (except in shmem, where it's the shared object which takes | 1182 | * (except in shmem, where it's the shared object which takes |
1183 | * the reference count). | 1183 | * the reference count). |
1184 | * We believe SWAP_MAP_MAX cannot occur. (If it could, an unsigned | 1184 | * We believe SWAP_MAP_MAX cannot occur. (If it could, an unsigned |
1185 | * short would be too small....) | 1185 | * short would be too small....) |
1186 | * If that's wrong, then we should worry more about | 1186 | * If that's wrong, then we should worry more about |
1187 | * exit_mmap() and do_munmap() cases described above: | 1187 | * exit_mmap() and do_munmap() cases described above: |
1188 | * we might be resetting SWAP_MAP_MAX too early here. | 1188 | * we might be resetting SWAP_MAP_MAX too early here. |
1189 | * We know "Undead"s can happen, they're okay, so don't | 1189 | * We know "Undead"s can happen, they're okay, so don't |
1190 | * report them; but do report if we reset SWAP_MAP_MAX. | 1190 | * report them; but do report if we reset SWAP_MAP_MAX. |
1191 | */ | 1191 | */ |
1192 | /* We might release the lock_page() in unuse_mm(). */ | 1192 | /* We might release the lock_page() in unuse_mm(). */ |
1193 | if (!PageSwapCache(page) || page_private(page) != entry.val) | 1193 | if (!PageSwapCache(page) || page_private(page) != entry.val) |
1194 | goto retry; | 1194 | goto retry; |
1195 | 1195 | ||
1196 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | 1196 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { |
1197 | spin_lock(&swap_lock); | 1197 | spin_lock(&swap_lock); |
1198 | *swap_map = encode_swapmap(0, true); | 1198 | *swap_map = encode_swapmap(0, true); |
1199 | spin_unlock(&swap_lock); | 1199 | spin_unlock(&swap_lock); |
1200 | reset_overflow = 1; | 1200 | reset_overflow = 1; |
1201 | } | 1201 | } |
1202 | 1202 | ||
1203 | /* | 1203 | /* |
1204 | * If a reference remains (rare), we would like to leave | 1204 | * If a reference remains (rare), we would like to leave |
1205 | * the page in the swap cache; but try_to_unmap could | 1205 | * the page in the swap cache; but try_to_unmap could |
1206 | * then re-duplicate the entry once we drop page lock, | 1206 | * then re-duplicate the entry once we drop page lock, |
1207 | * so we might loop indefinitely; also, that page could | 1207 | * so we might loop indefinitely; also, that page could |
1208 | * not be swapped out to other storage meanwhile. So: | 1208 | * not be swapped out to other storage meanwhile. So: |
1209 | * delete from cache even if there's another reference, | 1209 | * delete from cache even if there's another reference, |
1210 | * after ensuring that the data has been saved to disk - | 1210 | * after ensuring that the data has been saved to disk - |
1211 | * since if the reference remains (rarer), it will be | 1211 | * since if the reference remains (rarer), it will be |
1212 | * read from disk into another page. Splitting into two | 1212 | * read from disk into another page. Splitting into two |
1213 | * pages would be incorrect if swap supported "shared | 1213 | * pages would be incorrect if swap supported "shared |
1214 | * private" pages, but they are handled by tmpfs files. | 1214 | * private" pages, but they are handled by tmpfs files. |
1215 | */ | 1215 | */ |
1216 | if (swap_count(*swap_map) && | 1216 | if (swap_count(*swap_map) && |
1217 | PageDirty(page) && PageSwapCache(page)) { | 1217 | PageDirty(page) && PageSwapCache(page)) { |
1218 | struct writeback_control wbc = { | 1218 | struct writeback_control wbc = { |
1219 | .sync_mode = WB_SYNC_NONE, | 1219 | .sync_mode = WB_SYNC_NONE, |
1220 | }; | 1220 | }; |
1221 | 1221 | ||
1222 | swap_writepage(page, &wbc); | 1222 | swap_writepage(page, &wbc); |
1223 | lock_page(page); | 1223 | lock_page(page); |
1224 | wait_on_page_writeback(page); | 1224 | wait_on_page_writeback(page); |
1225 | } | 1225 | } |
1226 | 1226 | ||
1227 | /* | 1227 | /* |
1228 | * It is conceivable that a racing task removed this page from | 1228 | * It is conceivable that a racing task removed this page from |
1229 | * swap cache just before we acquired the page lock at the top, | 1229 | * swap cache just before we acquired the page lock at the top, |
1230 | * or while we dropped it in unuse_mm(). The page might even | 1230 | * or while we dropped it in unuse_mm(). The page might even |
1231 | * be back in swap cache on another swap area: that we must not | 1231 | * be back in swap cache on another swap area: that we must not |
1232 | * delete, since it may not have been written out to swap yet. | 1232 | * delete, since it may not have been written out to swap yet. |
1233 | */ | 1233 | */ |
1234 | if (PageSwapCache(page) && | 1234 | if (PageSwapCache(page) && |
1235 | likely(page_private(page) == entry.val)) | 1235 | likely(page_private(page) == entry.val)) |
1236 | delete_from_swap_cache(page); | 1236 | delete_from_swap_cache(page); |
1237 | 1237 | ||
1238 | /* | 1238 | /* |
1239 | * So that we could skip searching mms once the swap count went | 1239 | * So that we could skip searching mms once the swap count went |
1240 | * to 1, we did not mark any present ptes as dirty: we must | 1240 | * to 1, we did not mark any present ptes as dirty: we must |
1241 | * mark the page dirty so shrink_page_list will preserve it. | 1241 | * mark the page dirty so shrink_page_list will preserve it. |
1242 | */ | 1242 | */ |
1243 | SetPageDirty(page); | 1243 | SetPageDirty(page); |
1244 | retry: | 1244 | retry: |
1245 | unlock_page(page); | 1245 | unlock_page(page); |
1246 | page_cache_release(page); | 1246 | page_cache_release(page); |
1247 | 1247 | ||
1248 | /* | 1248 | /* |
1249 | * Make sure that we aren't completely killing | 1249 | * Make sure that we aren't completely killing |
1250 | * interactive performance. | 1250 | * interactive performance. |
1251 | */ | 1251 | */ |
1252 | cond_resched(); | 1252 | cond_resched(); |
1253 | } | 1253 | } |
1254 | 1254 | ||
1255 | mmput(start_mm); | 1255 | mmput(start_mm); |
1256 | if (reset_overflow) { | 1256 | if (reset_overflow) { |
1257 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); | 1257 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); |
1258 | swap_overflow = 0; | 1258 | swap_overflow = 0; |
1259 | } | 1259 | } |
1260 | return retval; | 1260 | return retval; |
1261 | } | 1261 | } |
1262 | 1262 | ||
1263 | /* | 1263 | /* |
1264 | * After a successful try_to_unuse, if no swap is now in use, we know | 1264 | * After a successful try_to_unuse, if no swap is now in use, we know |
1265 | * we can empty the mmlist. swap_lock must be held on entry and exit. | 1265 | * we can empty the mmlist. swap_lock must be held on entry and exit. |
1266 | * Note that mmlist_lock nests inside swap_lock, and an mm must be | 1266 | * Note that mmlist_lock nests inside swap_lock, and an mm must be |
1267 | * added to the mmlist just after page_duplicate - before would be racy. | 1267 | * added to the mmlist just after page_duplicate - before would be racy. |
1268 | */ | 1268 | */ |
1269 | static void drain_mmlist(void) | 1269 | static void drain_mmlist(void) |
1270 | { | 1270 | { |
1271 | struct list_head *p, *next; | 1271 | struct list_head *p, *next; |
1272 | unsigned int i; | 1272 | unsigned int i; |
1273 | 1273 | ||
1274 | for (i = 0; i < nr_swapfiles; i++) | 1274 | for (i = 0; i < nr_swapfiles; i++) |
1275 | if (swap_info[i].inuse_pages) | 1275 | if (swap_info[i].inuse_pages) |
1276 | return; | 1276 | return; |
1277 | spin_lock(&mmlist_lock); | 1277 | spin_lock(&mmlist_lock); |
1278 | list_for_each_safe(p, next, &init_mm.mmlist) | 1278 | list_for_each_safe(p, next, &init_mm.mmlist) |
1279 | list_del_init(p); | 1279 | list_del_init(p); |
1280 | spin_unlock(&mmlist_lock); | 1280 | spin_unlock(&mmlist_lock); |
1281 | } | 1281 | } |
1282 | 1282 | ||
1283 | /* | 1283 | /* |
1284 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which | 1284 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which |
1285 | * corresponds to page offset `offset'. | 1285 | * corresponds to page offset `offset'. |
1286 | */ | 1286 | */ |
1287 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | 1287 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) |
1288 | { | 1288 | { |
1289 | struct swap_extent *se = sis->curr_swap_extent; | 1289 | struct swap_extent *se = sis->curr_swap_extent; |
1290 | struct swap_extent *start_se = se; | 1290 | struct swap_extent *start_se = se; |
1291 | 1291 | ||
1292 | for ( ; ; ) { | 1292 | for ( ; ; ) { |
1293 | struct list_head *lh; | 1293 | struct list_head *lh; |
1294 | 1294 | ||
1295 | if (se->start_page <= offset && | 1295 | if (se->start_page <= offset && |
1296 | offset < (se->start_page + se->nr_pages)) { | 1296 | offset < (se->start_page + se->nr_pages)) { |
1297 | return se->start_block + (offset - se->start_page); | 1297 | return se->start_block + (offset - se->start_page); |
1298 | } | 1298 | } |
1299 | lh = se->list.next; | 1299 | lh = se->list.next; |
1300 | if (lh == &sis->extent_list) | 1300 | if (lh == &sis->extent_list) |
1301 | lh = lh->next; | 1301 | lh = lh->next; |
1302 | se = list_entry(lh, struct swap_extent, list); | 1302 | se = list_entry(lh, struct swap_extent, list); |
1303 | sis->curr_swap_extent = se; | 1303 | sis->curr_swap_extent = se; |
1304 | BUG_ON(se == start_se); /* It *must* be present */ | 1304 | BUG_ON(se == start_se); /* It *must* be present */ |
1305 | } | 1305 | } |
1306 | } | 1306 | } |
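
A worked example of the extent arithmetic above, with hypothetical numbers:

	/*
	 * An extent with start_page = 100, nr_pages = 50, start_block = 2000
	 * covers page offsets [100, 150) and maps offset 120 to disk block
	 * 2000 + (120 - 100) = 2020.
	 */
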
1307 | 1307 | ||
1308 | #ifdef CONFIG_HIBERNATION | 1308 | #ifdef CONFIG_HIBERNATION |
1309 | /* | 1309 | /* |
1310 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | 1310 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev |
1311 | * corresponding to given index in swap_info (swap type). | 1311 | * corresponding to given index in swap_info (swap type). |
1312 | */ | 1312 | */ |
1313 | sector_t swapdev_block(int swap_type, pgoff_t offset) | 1313 | sector_t swapdev_block(int swap_type, pgoff_t offset) |
1314 | { | 1314 | { |
1315 | struct swap_info_struct *sis; | 1315 | struct swap_info_struct *sis; |
1316 | 1316 | ||
1317 | if (swap_type >= nr_swapfiles) | 1317 | if (swap_type >= nr_swapfiles) |
1318 | return 0; | 1318 | return 0; |
1319 | 1319 | ||
1320 | sis = swap_info + swap_type; | 1320 | sis = swap_info + swap_type; |
1321 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | 1321 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; |
1322 | } | 1322 | } |
1323 | #endif /* CONFIG_HIBERNATION */ | 1323 | #endif /* CONFIG_HIBERNATION */ |
1324 | 1324 | ||
1325 | /* | 1325 | /* |
1326 | * Free all of a swapdev's extent information | 1326 | * Free all of a swapdev's extent information |
1327 | */ | 1327 | */ |
1328 | static void destroy_swap_extents(struct swap_info_struct *sis) | 1328 | static void destroy_swap_extents(struct swap_info_struct *sis) |
1329 | { | 1329 | { |
1330 | while (!list_empty(&sis->extent_list)) { | 1330 | while (!list_empty(&sis->extent_list)) { |
1331 | struct swap_extent *se; | 1331 | struct swap_extent *se; |
1332 | 1332 | ||
1333 | se = list_entry(sis->extent_list.next, | 1333 | se = list_entry(sis->extent_list.next, |
1334 | struct swap_extent, list); | 1334 | struct swap_extent, list); |
1335 | list_del(&se->list); | 1335 | list_del(&se->list); |
1336 | kfree(se); | 1336 | kfree(se); |
1337 | } | 1337 | } |
1338 | } | 1338 | } |
1339 | 1339 | ||
1340 | /* | 1340 | /* |
1341 | * Add a block range (and the corresponding page range) into this swapdev's | 1341 | * Add a block range (and the corresponding page range) into this swapdev's |
1342 | * extent list. The extent list is kept sorted in page order. | 1342 | * extent list. The extent list is kept sorted in page order. |
1343 | * | 1343 | * |
1344 | * This function rather assumes that it is called in ascending page order. | 1344 | * This function rather assumes that it is called in ascending page order. |
1345 | */ | 1345 | */ |
1346 | static int | 1346 | static int |
1347 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | 1347 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
1348 | unsigned long nr_pages, sector_t start_block) | 1348 | unsigned long nr_pages, sector_t start_block) |
1349 | { | 1349 | { |
1350 | struct swap_extent *se; | 1350 | struct swap_extent *se; |
1351 | struct swap_extent *new_se; | 1351 | struct swap_extent *new_se; |
1352 | struct list_head *lh; | 1352 | struct list_head *lh; |
1353 | 1353 | ||
1354 | lh = sis->extent_list.prev; /* The highest page extent */ | 1354 | lh = sis->extent_list.prev; /* The highest page extent */ |
1355 | if (lh != &sis->extent_list) { | 1355 | if (lh != &sis->extent_list) { |
1356 | se = list_entry(lh, struct swap_extent, list); | 1356 | se = list_entry(lh, struct swap_extent, list); |
1357 | BUG_ON(se->start_page + se->nr_pages != start_page); | 1357 | BUG_ON(se->start_page + se->nr_pages != start_page); |
1358 | if (se->start_block + se->nr_pages == start_block) { | 1358 | if (se->start_block + se->nr_pages == start_block) { |
1359 | /* Merge it */ | 1359 | /* Merge it */ |
1360 | se->nr_pages += nr_pages; | 1360 | se->nr_pages += nr_pages; |
1361 | return 0; | 1361 | return 0; |
1362 | } | 1362 | } |
1363 | } | 1363 | } |
1364 | 1364 | ||
1365 | /* | 1365 | /* |
1366 | * No merge. Insert a new extent, preserving ordering. | 1366 | * No merge. Insert a new extent, preserving ordering. |
1367 | */ | 1367 | */ |
1368 | new_se = kmalloc(sizeof(*se), GFP_KERNEL); | 1368 | new_se = kmalloc(sizeof(*se), GFP_KERNEL); |
1369 | if (new_se == NULL) | 1369 | if (new_se == NULL) |
1370 | return -ENOMEM; | 1370 | return -ENOMEM; |
1371 | new_se->start_page = start_page; | 1371 | new_se->start_page = start_page; |
1372 | new_se->nr_pages = nr_pages; | 1372 | new_se->nr_pages = nr_pages; |
1373 | new_se->start_block = start_block; | 1373 | new_se->start_block = start_block; |
1374 | 1374 | ||
1375 | list_add_tail(&new_se->list, &sis->extent_list); | 1375 | list_add_tail(&new_se->list, &sis->extent_list); |
1376 | return 1; | 1376 | return 1; |
1377 | } | 1377 | } |
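
The merge path above, traced with hypothetical numbers:

	/*
	 * add_swap_extent(sis, 0, 10, 100);    pages 0-9   -> blocks 100-109
	 * add_swap_extent(sis, 10, 5, 110);    pages 10-14 -> blocks 110-114
	 *
	 * The second call sees se->start_block + se->nr_pages == 110, which
	 * matches its start_block, so it grows the first extent to
	 * nr_pages = 15 and the list still holds a single extent.
	 */
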
1378 | 1378 | ||
1379 | /* | 1379 | /* |
1380 | * A `swap extent' is a simple thing which maps a contiguous range of pages | 1380 | * A `swap extent' is a simple thing which maps a contiguous range of pages |
1381 | * onto a contiguous range of disk blocks. An ordered list of swap extents | 1381 | * onto a contiguous range of disk blocks. An ordered list of swap extents |
1382 | * is built at swapon time and is then used at swap_writepage/swap_readpage | 1382 | * is built at swapon time and is then used at swap_writepage/swap_readpage |
1383 | * time for locating where on disk a page belongs. | 1383 | * time for locating where on disk a page belongs. |
1384 | * | 1384 | * |
1385 | * If the swapfile is an S_ISBLK block device, a single extent is installed. | 1385 | * If the swapfile is an S_ISBLK block device, a single extent is installed. |
1386 | * This is done so that the main operating code can treat S_ISBLK and S_ISREG | 1386 | * This is done so that the main operating code can treat S_ISBLK and S_ISREG |
1387 | * swap files identically. | 1387 | * swap files identically. |
1388 | * | 1388 | * |
1389 | * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap | 1389 | * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap |
1390 | * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK | 1390 | * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK |
1391 | * swapfiles are handled *identically* after swapon time. | 1391 | * swapfiles are handled *identically* after swapon time. |
1392 | * | 1392 | * |
1393 | * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks | 1393 | * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks |
1394 | * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If | 1394 | * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If |
1395 | * some stray blocks are found which do not fall within the PAGE_SIZE alignment | 1395 | * some stray blocks are found which do not fall within the PAGE_SIZE alignment |
1396 | * requirements, they are simply tossed out - we will never use those blocks | 1396 | * requirements, they are simply tossed out - we will never use those blocks |
1397 | * for swapping. | 1397 | * for swapping. |
1398 | * | 1398 | * |
1399 | * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This | 1399 | * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This |
1400 | * prevents root from shooting her foot off by ftruncating an in-use swapfile, | 1400 | * prevents root from shooting her foot off by ftruncating an in-use swapfile, |
1401 | * which will scribble on the fs. | 1401 | * which will scribble on the fs. |
1402 | * | 1402 | * |
1403 | * The amount of disk space which a single swap extent represents varies. | 1403 | * The amount of disk space which a single swap extent represents varies. |
1404 | * Typically it is in the 1-4 megabyte range. So we can have hundreds of | 1404 | * Typically it is in the 1-4 megabyte range. So we can have hundreds of |
1405 | * extents in the list. To avoid much list walking, we cache the previous | 1405 | * extents in the list. To avoid much list walking, we cache the previous |
1406 | * search location in `curr_swap_extent', and start new searches from there. | 1406 | * search location in `curr_swap_extent', and start new searches from there. |
1407 | * This is extremely effective. The average number of iterations in | 1407 | * This is extremely effective. The average number of iterations in |
1408 | * map_swap_page() has been measured at about 0.3 per page. - akpm. | 1408 | * map_swap_page() has been measured at about 0.3 per page. - akpm. |
1409 | */ | 1409 | */ |
1410 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | 1410 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) |
1411 | { | 1411 | { |
1412 | struct inode *inode; | 1412 | struct inode *inode; |
1413 | unsigned blocks_per_page; | 1413 | unsigned blocks_per_page; |
1414 | unsigned long page_no; | 1414 | unsigned long page_no; |
1415 | unsigned blkbits; | 1415 | unsigned blkbits; |
1416 | sector_t probe_block; | 1416 | sector_t probe_block; |
1417 | sector_t last_block; | 1417 | sector_t last_block; |
1418 | sector_t lowest_block = -1; | 1418 | sector_t lowest_block = -1; |
1419 | sector_t highest_block = 0; | 1419 | sector_t highest_block = 0; |
1420 | int nr_extents = 0; | 1420 | int nr_extents = 0; |
1421 | int ret; | 1421 | int ret; |
1422 | 1422 | ||
1423 | inode = sis->swap_file->f_mapping->host; | 1423 | inode = sis->swap_file->f_mapping->host; |
1424 | if (S_ISBLK(inode->i_mode)) { | 1424 | if (S_ISBLK(inode->i_mode)) { |
1425 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1425 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1426 | *span = sis->pages; | 1426 | *span = sis->pages; |
1427 | goto done; | 1427 | goto done; |
1428 | } | 1428 | } |
1429 | 1429 | ||
1430 | blkbits = inode->i_blkbits; | 1430 | blkbits = inode->i_blkbits; |
1431 | blocks_per_page = PAGE_SIZE >> blkbits; | 1431 | blocks_per_page = PAGE_SIZE >> blkbits; |
1432 | 1432 | ||
1433 | /* | 1433 | /* |
1434 | * Map all the blocks into the extent list. This code doesn't try | 1434 | * Map all the blocks into the extent list. This code doesn't try |
1435 | * to be very smart. | 1435 | * to be very smart. |
1436 | */ | 1436 | */ |
1437 | probe_block = 0; | 1437 | probe_block = 0; |
1438 | page_no = 0; | 1438 | page_no = 0; |
1439 | last_block = i_size_read(inode) >> blkbits; | 1439 | last_block = i_size_read(inode) >> blkbits; |
1440 | while ((probe_block + blocks_per_page) <= last_block && | 1440 | while ((probe_block + blocks_per_page) <= last_block && |
1441 | page_no < sis->max) { | 1441 | page_no < sis->max) { |
1442 | unsigned block_in_page; | 1442 | unsigned block_in_page; |
1443 | sector_t first_block; | 1443 | sector_t first_block; |
1444 | 1444 | ||
1445 | first_block = bmap(inode, probe_block); | 1445 | first_block = bmap(inode, probe_block); |
1446 | if (first_block == 0) | 1446 | if (first_block == 0) |
1447 | goto bad_bmap; | 1447 | goto bad_bmap; |
1448 | 1448 | ||
1449 | /* | 1449 | /* |
1450 | * It must be PAGE_SIZE aligned on-disk | 1450 | * It must be PAGE_SIZE aligned on-disk |
1451 | */ | 1451 | */ |
1452 | if (first_block & (blocks_per_page - 1)) { | 1452 | if (first_block & (blocks_per_page - 1)) { |
1453 | probe_block++; | 1453 | probe_block++; |
1454 | goto reprobe; | 1454 | goto reprobe; |
1455 | } | 1455 | } |
1456 | 1456 | ||
1457 | for (block_in_page = 1; block_in_page < blocks_per_page; | 1457 | for (block_in_page = 1; block_in_page < blocks_per_page; |
1458 | block_in_page++) { | 1458 | block_in_page++) { |
1459 | sector_t block; | 1459 | sector_t block; |
1460 | 1460 | ||
1461 | block = bmap(inode, probe_block + block_in_page); | 1461 | block = bmap(inode, probe_block + block_in_page); |
1462 | if (block == 0) | 1462 | if (block == 0) |
1463 | goto bad_bmap; | 1463 | goto bad_bmap; |
1464 | if (block != first_block + block_in_page) { | 1464 | if (block != first_block + block_in_page) { |
1465 | /* Discontiguity */ | 1465 | /* Discontiguity */ |
1466 | probe_block++; | 1466 | probe_block++; |
1467 | goto reprobe; | 1467 | goto reprobe; |
1468 | } | 1468 | } |
1469 | } | 1469 | } |
1470 | 1470 | ||
1471 | first_block >>= (PAGE_SHIFT - blkbits); | 1471 | first_block >>= (PAGE_SHIFT - blkbits); |
1472 | if (page_no) { /* exclude the header page */ | 1472 | if (page_no) { /* exclude the header page */ |
1473 | if (first_block < lowest_block) | 1473 | if (first_block < lowest_block) |
1474 | lowest_block = first_block; | 1474 | lowest_block = first_block; |
1475 | if (first_block > highest_block) | 1475 | if (first_block > highest_block) |
1476 | highest_block = first_block; | 1476 | highest_block = first_block; |
1477 | } | 1477 | } |
1478 | 1478 | ||
1479 | /* | 1479 | /* |
1480 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | 1480 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks |
1481 | */ | 1481 | */ |
1482 | ret = add_swap_extent(sis, page_no, 1, first_block); | 1482 | ret = add_swap_extent(sis, page_no, 1, first_block); |
1483 | if (ret < 0) | 1483 | if (ret < 0) |
1484 | goto out; | 1484 | goto out; |
1485 | nr_extents += ret; | 1485 | nr_extents += ret; |
1486 | page_no++; | 1486 | page_no++; |
1487 | probe_block += blocks_per_page; | 1487 | probe_block += blocks_per_page; |
1488 | reprobe: | 1488 | reprobe: |
1489 | continue; | 1489 | continue; |
1490 | } | 1490 | } |
1491 | ret = nr_extents; | 1491 | ret = nr_extents; |
1492 | *span = 1 + highest_block - lowest_block; | 1492 | *span = 1 + highest_block - lowest_block; |
1493 | if (page_no == 0) | 1493 | if (page_no == 0) |
1494 | page_no = 1; /* force Empty message */ | 1494 | page_no = 1; /* force Empty message */ |
1495 | sis->max = page_no; | 1495 | sis->max = page_no; |
1496 | sis->pages = page_no - 1; | 1496 | sis->pages = page_no - 1; |
1497 | sis->highest_bit = page_no - 1; | 1497 | sis->highest_bit = page_no - 1; |
1498 | done: | 1498 | done: |
1499 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, | 1499 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, |
1500 | struct swap_extent, list); | 1500 | struct swap_extent, list); |
1501 | goto out; | 1501 | goto out; |
1502 | bad_bmap: | 1502 | bad_bmap: |
1503 | printk(KERN_ERR "swapon: swapfile has holes\n"); | 1503 | printk(KERN_ERR "swapon: swapfile has holes\n"); |
1504 | ret = -EINVAL; | 1504 | ret = -EINVAL; |
1505 | out: | 1505 | out: |
1506 | return ret; | 1506 | return ret; |
1507 | } | 1507 | } |
1508 | 1508 | ||
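With extents typically spanning 1-4 MB each, hundreds can pile up per device; the curr_swap_extent cache set on the done: path is what keeps map_swap_page() near the quoted 0.3 iterations per lookup. A hedged sketch of that cached search (simplified types and names; the real walk lives in map_swap_page(), which iterates a circular list_head list):

struct extent {
	unsigned long start_page;
	unsigned long nr_pages;
	unsigned long start_block;
	struct extent *next;		/* circular, like a list_head ring */
};

/*
 * Illustrative sketch: map a swap page to its disk block, resuming
 * from the cached extent. Assumes, as the swap code does, that every
 * page the caller asks about is covered by some extent, so the walk
 * always terminates.
 */
static unsigned long page_to_block(struct extent **cached, unsigned long page)
{
	struct extent *se = *cached;

	for (;;) {
		if (page >= se->start_page &&
		    page - se->start_page < se->nr_pages) {
			*cached = se;	/* next lookup starts here */
			return se->start_block + (page - se->start_page);
		}
		se = se->next;
	}
}

The cache pays off because swap I/O is bursty: adjacent pages usually live in the same extent, so consecutive lookups rarely move.
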
1509 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1509 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1510 | { | 1510 | { |
1511 | struct swap_info_struct * p = NULL; | 1511 | struct swap_info_struct * p = NULL; |
1512 | unsigned short *swap_map; | 1512 | unsigned short *swap_map; |
1513 | struct file *swap_file, *victim; | 1513 | struct file *swap_file, *victim; |
1514 | struct address_space *mapping; | 1514 | struct address_space *mapping; |
1515 | struct inode *inode; | 1515 | struct inode *inode; |
1516 | char * pathname; | 1516 | char * pathname; |
1517 | int i, type, prev; | 1517 | int i, type, prev; |
1518 | int err; | 1518 | int err; |
1519 | 1519 | ||
1520 | if (!capable(CAP_SYS_ADMIN)) | 1520 | if (!capable(CAP_SYS_ADMIN)) |
1521 | return -EPERM; | 1521 | return -EPERM; |
1522 | 1522 | ||
1523 | pathname = getname(specialfile); | 1523 | pathname = getname(specialfile); |
1524 | err = PTR_ERR(pathname); | 1524 | err = PTR_ERR(pathname); |
1525 | if (IS_ERR(pathname)) | 1525 | if (IS_ERR(pathname)) |
1526 | goto out; | 1526 | goto out; |
1527 | 1527 | ||
1528 | victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); | 1528 | victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); |
1529 | putname(pathname); | 1529 | putname(pathname); |
1530 | err = PTR_ERR(victim); | 1530 | err = PTR_ERR(victim); |
1531 | if (IS_ERR(victim)) | 1531 | if (IS_ERR(victim)) |
1532 | goto out; | 1532 | goto out; |
1533 | 1533 | ||
1534 | mapping = victim->f_mapping; | 1534 | mapping = victim->f_mapping; |
1535 | prev = -1; | 1535 | prev = -1; |
1536 | spin_lock(&swap_lock); | 1536 | spin_lock(&swap_lock); |
1537 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1537 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { |
1538 | p = swap_info + type; | 1538 | p = swap_info + type; |
1539 | if (p->flags & SWP_WRITEOK) { | 1539 | if (p->flags & SWP_WRITEOK) { |
1540 | if (p->swap_file->f_mapping == mapping) | 1540 | if (p->swap_file->f_mapping == mapping) |
1541 | break; | 1541 | break; |
1542 | } | 1542 | } |
1543 | prev = type; | 1543 | prev = type; |
1544 | } | 1544 | } |
1545 | if (type < 0) { | 1545 | if (type < 0) { |
1546 | err = -EINVAL; | 1546 | err = -EINVAL; |
1547 | spin_unlock(&swap_lock); | 1547 | spin_unlock(&swap_lock); |
1548 | goto out_dput; | 1548 | goto out_dput; |
1549 | } | 1549 | } |
1550 | if (!security_vm_enough_memory(p->pages)) | 1550 | if (!security_vm_enough_memory(p->pages)) |
1551 | vm_unacct_memory(p->pages); | 1551 | vm_unacct_memory(p->pages); |
1552 | else { | 1552 | else { |
1553 | err = -ENOMEM; | 1553 | err = -ENOMEM; |
1554 | spin_unlock(&swap_lock); | 1554 | spin_unlock(&swap_lock); |
1555 | goto out_dput; | 1555 | goto out_dput; |
1556 | } | 1556 | } |
1557 | if (prev < 0) { | 1557 | if (prev < 0) { |
1558 | swap_list.head = p->next; | 1558 | swap_list.head = p->next; |
1559 | } else { | 1559 | } else { |
1560 | swap_info[prev].next = p->next; | 1560 | swap_info[prev].next = p->next; |
1561 | } | 1561 | } |
1562 | if (type == swap_list.next) { | 1562 | if (type == swap_list.next) { |
1563 | /* just pick something that's safe... */ | 1563 | /* just pick something that's safe... */ |
1564 | swap_list.next = swap_list.head; | 1564 | swap_list.next = swap_list.head; |
1565 | } | 1565 | } |
1566 | if (p->prio < 0) { | 1566 | if (p->prio < 0) { |
1567 | for (i = p->next; i >= 0; i = swap_info[i].next) | 1567 | for (i = p->next; i >= 0; i = swap_info[i].next) |
1568 | swap_info[i].prio = p->prio--; | 1568 | swap_info[i].prio = p->prio--; |
1569 | least_priority++; | 1569 | least_priority++; |
1570 | } | 1570 | } |
1571 | nr_swap_pages -= p->pages; | 1571 | nr_swap_pages -= p->pages; |
1572 | total_swap_pages -= p->pages; | 1572 | total_swap_pages -= p->pages; |
1573 | p->flags &= ~SWP_WRITEOK; | 1573 | p->flags &= ~SWP_WRITEOK; |
1574 | spin_unlock(&swap_lock); | 1574 | spin_unlock(&swap_lock); |
1575 | 1575 | ||
1576 | current->flags |= PF_SWAPOFF; | 1576 | current->flags |= PF_SWAPOFF; |
1577 | err = try_to_unuse(type); | 1577 | err = try_to_unuse(type); |
1578 | current->flags &= ~PF_SWAPOFF; | 1578 | current->flags &= ~PF_SWAPOFF; |
1579 | 1579 | ||
1580 | if (err) { | 1580 | if (err) { |
1581 | /* re-insert swap space back into swap_list */ | 1581 | /* re-insert swap space back into swap_list */ |
1582 | spin_lock(&swap_lock); | 1582 | spin_lock(&swap_lock); |
1583 | if (p->prio < 0) | 1583 | if (p->prio < 0) |
1584 | p->prio = --least_priority; | 1584 | p->prio = --least_priority; |
1585 | prev = -1; | 1585 | prev = -1; |
1586 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 1586 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { |
1587 | if (p->prio >= swap_info[i].prio) | 1587 | if (p->prio >= swap_info[i].prio) |
1588 | break; | 1588 | break; |
1589 | prev = i; | 1589 | prev = i; |
1590 | } | 1590 | } |
1591 | p->next = i; | 1591 | p->next = i; |
1592 | if (prev < 0) | 1592 | if (prev < 0) |
1593 | swap_list.head = swap_list.next = p - swap_info; | 1593 | swap_list.head = swap_list.next = p - swap_info; |
1594 | else | 1594 | else |
1595 | swap_info[prev].next = p - swap_info; | 1595 | swap_info[prev].next = p - swap_info; |
1596 | nr_swap_pages += p->pages; | 1596 | nr_swap_pages += p->pages; |
1597 | total_swap_pages += p->pages; | 1597 | total_swap_pages += p->pages; |
1598 | p->flags |= SWP_WRITEOK; | 1598 | p->flags |= SWP_WRITEOK; |
1599 | spin_unlock(&swap_lock); | 1599 | spin_unlock(&swap_lock); |
1600 | goto out_dput; | 1600 | goto out_dput; |
1601 | } | 1601 | } |
1602 | 1602 | ||
1603 | /* wait for any unplug function to finish */ | 1603 | /* wait for any unplug function to finish */ |
1604 | down_write(&swap_unplug_sem); | 1604 | down_write(&swap_unplug_sem); |
1605 | up_write(&swap_unplug_sem); | 1605 | up_write(&swap_unplug_sem); |
1606 | 1606 | ||
1607 | destroy_swap_extents(p); | 1607 | destroy_swap_extents(p); |
1608 | mutex_lock(&swapon_mutex); | 1608 | mutex_lock(&swapon_mutex); |
1609 | spin_lock(&swap_lock); | 1609 | spin_lock(&swap_lock); |
1610 | drain_mmlist(); | 1610 | drain_mmlist(); |
1611 | 1611 | ||
1612 | /* wait for anyone still in scan_swap_map */ | 1612 | /* wait for anyone still in scan_swap_map */ |
1613 | p->highest_bit = 0; /* cuts scans short */ | 1613 | p->highest_bit = 0; /* cuts scans short */ |
1614 | while (p->flags >= SWP_SCANNING) { | 1614 | while (p->flags >= SWP_SCANNING) { |
1615 | spin_unlock(&swap_lock); | 1615 | spin_unlock(&swap_lock); |
1616 | schedule_timeout_uninterruptible(1); | 1616 | schedule_timeout_uninterruptible(1); |
1617 | spin_lock(&swap_lock); | 1617 | spin_lock(&swap_lock); |
1618 | } | 1618 | } |
1619 | 1619 | ||
1620 | swap_file = p->swap_file; | 1620 | swap_file = p->swap_file; |
1621 | p->swap_file = NULL; | 1621 | p->swap_file = NULL; |
1622 | p->max = 0; | 1622 | p->max = 0; |
1623 | swap_map = p->swap_map; | 1623 | swap_map = p->swap_map; |
1624 | p->swap_map = NULL; | 1624 | p->swap_map = NULL; |
1625 | p->flags = 0; | 1625 | p->flags = 0; |
1626 | spin_unlock(&swap_lock); | 1626 | spin_unlock(&swap_lock); |
1627 | mutex_unlock(&swapon_mutex); | 1627 | mutex_unlock(&swapon_mutex); |
1628 | vfree(swap_map); | 1628 | vfree(swap_map); |
1629 | /* Destroy swap account information */ | 1629 | /* Destroy swap account information */ |
1630 | swap_cgroup_swapoff(type); | 1630 | swap_cgroup_swapoff(type); |
1631 | 1631 | ||
1632 | inode = mapping->host; | 1632 | inode = mapping->host; |
1633 | if (S_ISBLK(inode->i_mode)) { | 1633 | if (S_ISBLK(inode->i_mode)) { |
1634 | struct block_device *bdev = I_BDEV(inode); | 1634 | struct block_device *bdev = I_BDEV(inode); |
1635 | set_blocksize(bdev, p->old_block_size); | 1635 | set_blocksize(bdev, p->old_block_size); |
1636 | bd_release(bdev); | 1636 | bd_release(bdev); |
1637 | } else { | 1637 | } else { |
1638 | mutex_lock(&inode->i_mutex); | 1638 | mutex_lock(&inode->i_mutex); |
1639 | inode->i_flags &= ~S_SWAPFILE; | 1639 | inode->i_flags &= ~S_SWAPFILE; |
1640 | mutex_unlock(&inode->i_mutex); | 1640 | mutex_unlock(&inode->i_mutex); |
1641 | } | 1641 | } |
1642 | filp_close(swap_file, NULL); | 1642 | filp_close(swap_file, NULL); |
1643 | err = 0; | 1643 | err = 0; |
1644 | 1644 | ||
1645 | out_dput: | 1645 | out_dput: |
1646 | filp_close(victim, NULL); | 1646 | filp_close(victim, NULL); |
1647 | out: | 1647 | out: |
1648 | return err; | 1648 | return err; |
1649 | } | 1649 | } |
1650 | 1650 | ||
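When try_to_unuse() fails, the error path above threads the area back into swap_list at its priority position. Note that swap_list is index-linked rather than pointer-linked: head and next hold indices into the swap_info[] array, with -1 as the terminator. An illustrative standalone version of that ordered insert (swapon() further down runs the same loop):

#define NIL	(-1)

struct ent {
	int prio;
	int next;			/* index into tab[], NIL ends list */
};

/*
 * Illustrative sketch: insert tab[n] into the list rooted at *head,
 * keeping descending priority order. The >= test means a new entry
 * is placed in front of existing entries of equal priority, matching
 * the loops in swapoff() and swapon().
 */
static void prio_insert(struct ent tab[], int *head, int n)
{
	int i, prev = NIL;

	for (i = *head; i != NIL; i = tab[i].next) {
		if (tab[n].prio >= tab[i].prio)
			break;
		prev = i;
	}
	tab[n].next = i;
	if (prev == NIL)
		*head = n;
	else
		tab[prev].next = n;
}
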
1651 | #ifdef CONFIG_PROC_FS | 1651 | #ifdef CONFIG_PROC_FS |
1652 | /* iterator */ | 1652 | /* iterator */ |
1653 | static void *swap_start(struct seq_file *swap, loff_t *pos) | 1653 | static void *swap_start(struct seq_file *swap, loff_t *pos) |
1654 | { | 1654 | { |
1655 | struct swap_info_struct *ptr = swap_info; | 1655 | struct swap_info_struct *ptr = swap_info; |
1656 | int i; | 1656 | int i; |
1657 | loff_t l = *pos; | 1657 | loff_t l = *pos; |
1658 | 1658 | ||
1659 | mutex_lock(&swapon_mutex); | 1659 | mutex_lock(&swapon_mutex); |
1660 | 1660 | ||
1661 | if (!l) | 1661 | if (!l) |
1662 | return SEQ_START_TOKEN; | 1662 | return SEQ_START_TOKEN; |
1663 | 1663 | ||
1664 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1664 | for (i = 0; i < nr_swapfiles; i++, ptr++) { |
1665 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1665 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1666 | continue; | 1666 | continue; |
1667 | if (!--l) | 1667 | if (!--l) |
1668 | return ptr; | 1668 | return ptr; |
1669 | } | 1669 | } |
1670 | 1670 | ||
1671 | return NULL; | 1671 | return NULL; |
1672 | } | 1672 | } |
1673 | 1673 | ||
1674 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1674 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1675 | { | 1675 | { |
1676 | struct swap_info_struct *ptr; | 1676 | struct swap_info_struct *ptr; |
1677 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1677 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; |
1678 | 1678 | ||
1679 | if (v == SEQ_START_TOKEN) | 1679 | if (v == SEQ_START_TOKEN) |
1680 | ptr = swap_info; | 1680 | ptr = swap_info; |
1681 | else { | 1681 | else { |
1682 | ptr = v; | 1682 | ptr = v; |
1683 | ptr++; | 1683 | ptr++; |
1684 | } | 1684 | } |
1685 | 1685 | ||
1686 | for (; ptr < endptr; ptr++) { | 1686 | for (; ptr < endptr; ptr++) { |
1687 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1687 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1688 | continue; | 1688 | continue; |
1689 | ++*pos; | 1689 | ++*pos; |
1690 | return ptr; | 1690 | return ptr; |
1691 | } | 1691 | } |
1692 | 1692 | ||
1693 | return NULL; | 1693 | return NULL; |
1694 | } | 1694 | } |
1695 | 1695 | ||
1696 | static void swap_stop(struct seq_file *swap, void *v) | 1696 | static void swap_stop(struct seq_file *swap, void *v) |
1697 | { | 1697 | { |
1698 | mutex_unlock(&swapon_mutex); | 1698 | mutex_unlock(&swapon_mutex); |
1699 | } | 1699 | } |
1700 | 1700 | ||
1701 | static int swap_show(struct seq_file *swap, void *v) | 1701 | static int swap_show(struct seq_file *swap, void *v) |
1702 | { | 1702 | { |
1703 | struct swap_info_struct *ptr = v; | 1703 | struct swap_info_struct *ptr = v; |
1704 | struct file *file; | 1704 | struct file *file; |
1705 | int len; | 1705 | int len; |
1706 | 1706 | ||
1707 | if (ptr == SEQ_START_TOKEN) { | 1707 | if (ptr == SEQ_START_TOKEN) { |
1708 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1708 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1709 | return 0; | 1709 | return 0; |
1710 | } | 1710 | } |
1711 | 1711 | ||
1712 | file = ptr->swap_file; | 1712 | file = ptr->swap_file; |
1713 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1713 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1714 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1714 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1715 | len < 40 ? 40 - len : 1, " ", | 1715 | len < 40 ? 40 - len : 1, " ", |
1716 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1716 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1717 | "partition" : "file\t", | 1717 | "partition" : "file\t", |
1718 | ptr->pages << (PAGE_SHIFT - 10), | 1718 | ptr->pages << (PAGE_SHIFT - 10), |
1719 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1719 | ptr->inuse_pages << (PAGE_SHIFT - 10), |
1720 | ptr->prio); | 1720 | ptr->prio); |
1721 | return 0; | 1721 | return 0; |
1722 | } | 1722 | } |
1723 | 1723 | ||
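For reference, the format string above renders /proc/swaps roughly as follows: the pathname is padded out to column 40 by the %*s, the type is "partition" or "file", and the Size/Used columns are in 1 KB units because of the << (PAGE_SHIFT - 10) shifts. The entry shown here is made up:

Filename                                Type            Size    Used    Priority
/dev/sda2                               partition       524280  1036    -1
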
1724 | static const struct seq_operations swaps_op = { | 1724 | static const struct seq_operations swaps_op = { |
1725 | .start = swap_start, | 1725 | .start = swap_start, |
1726 | .next = swap_next, | 1726 | .next = swap_next, |
1727 | .stop = swap_stop, | 1727 | .stop = swap_stop, |
1728 | .show = swap_show | 1728 | .show = swap_show |
1729 | }; | 1729 | }; |
1730 | 1730 | ||
1731 | static int swaps_open(struct inode *inode, struct file *file) | 1731 | static int swaps_open(struct inode *inode, struct file *file) |
1732 | { | 1732 | { |
1733 | return seq_open(file, &swaps_op); | 1733 | return seq_open(file, &swaps_op); |
1734 | } | 1734 | } |
1735 | 1735 | ||
1736 | static const struct file_operations proc_swaps_operations = { | 1736 | static const struct file_operations proc_swaps_operations = { |
1737 | .open = swaps_open, | 1737 | .open = swaps_open, |
1738 | .read = seq_read, | 1738 | .read = seq_read, |
1739 | .llseek = seq_lseek, | 1739 | .llseek = seq_lseek, |
1740 | .release = seq_release, | 1740 | .release = seq_release, |
1741 | }; | 1741 | }; |
1742 | 1742 | ||
1743 | static int __init procswaps_init(void) | 1743 | static int __init procswaps_init(void) |
1744 | { | 1744 | { |
1745 | proc_create("swaps", 0, NULL, &proc_swaps_operations); | 1745 | proc_create("swaps", 0, NULL, &proc_swaps_operations); |
1746 | return 0; | 1746 | return 0; |
1747 | } | 1747 | } |
1748 | __initcall(procswaps_init); | 1748 | __initcall(procswaps_init); |
1749 | #endif /* CONFIG_PROC_FS */ | 1749 | #endif /* CONFIG_PROC_FS */ |
1750 | 1750 | ||
1751 | #ifdef MAX_SWAPFILES_CHECK | 1751 | #ifdef MAX_SWAPFILES_CHECK |
1752 | static int __init max_swapfiles_check(void) | 1752 | static int __init max_swapfiles_check(void) |
1753 | { | 1753 | { |
1754 | MAX_SWAPFILES_CHECK(); | 1754 | MAX_SWAPFILES_CHECK(); |
1755 | return 0; | 1755 | return 0; |
1756 | } | 1756 | } |
1757 | late_initcall(max_swapfiles_check); | 1757 | late_initcall(max_swapfiles_check); |
1758 | #endif | 1758 | #endif |
1759 | 1759 | ||
1760 | /* | 1760 | /* |
1761 | * Written 01/25/92 by Simmule Turner, heavily changed by Linus. | 1761 | * Written 01/25/92 by Simmule Turner, heavily changed by Linus. |
1762 | * | 1762 | * |
1763 | * The swapon system call | 1763 | * The swapon system call |
1764 | */ | 1764 | */ |
1765 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 1765 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
1766 | { | 1766 | { |
1767 | struct swap_info_struct * p; | 1767 | struct swap_info_struct * p; |
1768 | char *name = NULL; | 1768 | char *name = NULL; |
1769 | struct block_device *bdev = NULL; | 1769 | struct block_device *bdev = NULL; |
1770 | struct file *swap_file = NULL; | 1770 | struct file *swap_file = NULL; |
1771 | struct address_space *mapping; | 1771 | struct address_space *mapping; |
1772 | unsigned int type; | 1772 | unsigned int type; |
1773 | int i, prev; | 1773 | int i, prev; |
1774 | int error; | 1774 | int error; |
1775 | union swap_header *swap_header = NULL; | 1775 | union swap_header *swap_header = NULL; |
1776 | unsigned int nr_good_pages = 0; | 1776 | unsigned int nr_good_pages = 0; |
1777 | int nr_extents = 0; | 1777 | int nr_extents = 0; |
1778 | sector_t span; | 1778 | sector_t span; |
1779 | unsigned long maxpages = 1; | 1779 | unsigned long maxpages = 1; |
1780 | unsigned long swapfilepages; | 1780 | unsigned long swapfilepages; |
1781 | unsigned short *swap_map = NULL; | 1781 | unsigned short *swap_map = NULL; |
1782 | struct page *page = NULL; | 1782 | struct page *page = NULL; |
1783 | struct inode *inode = NULL; | 1783 | struct inode *inode = NULL; |
1784 | int did_down = 0; | 1784 | int did_down = 0; |
1785 | 1785 | ||
1786 | if (!capable(CAP_SYS_ADMIN)) | 1786 | if (!capable(CAP_SYS_ADMIN)) |
1787 | return -EPERM; | 1787 | return -EPERM; |
1788 | spin_lock(&swap_lock); | 1788 | spin_lock(&swap_lock); |
1789 | p = swap_info; | 1789 | p = swap_info; |
1790 | for (type = 0 ; type < nr_swapfiles ; type++,p++) | 1790 | for (type = 0 ; type < nr_swapfiles ; type++,p++) |
1791 | if (!(p->flags & SWP_USED)) | 1791 | if (!(p->flags & SWP_USED)) |
1792 | break; | 1792 | break; |
1793 | error = -EPERM; | 1793 | error = -EPERM; |
1794 | if (type >= MAX_SWAPFILES) { | 1794 | if (type >= MAX_SWAPFILES) { |
1795 | spin_unlock(&swap_lock); | 1795 | spin_unlock(&swap_lock); |
1796 | goto out; | 1796 | goto out; |
1797 | } | 1797 | } |
1798 | if (type >= nr_swapfiles) | 1798 | if (type >= nr_swapfiles) |
1799 | nr_swapfiles = type+1; | 1799 | nr_swapfiles = type+1; |
1800 | memset(p, 0, sizeof(*p)); | 1800 | memset(p, 0, sizeof(*p)); |
1801 | INIT_LIST_HEAD(&p->extent_list); | 1801 | INIT_LIST_HEAD(&p->extent_list); |
1802 | p->flags = SWP_USED; | 1802 | p->flags = SWP_USED; |
1803 | p->next = -1; | 1803 | p->next = -1; |
1804 | spin_unlock(&swap_lock); | 1804 | spin_unlock(&swap_lock); |
1805 | name = getname(specialfile); | 1805 | name = getname(specialfile); |
1806 | error = PTR_ERR(name); | 1806 | error = PTR_ERR(name); |
1807 | if (IS_ERR(name)) { | 1807 | if (IS_ERR(name)) { |
1808 | name = NULL; | 1808 | name = NULL; |
1809 | goto bad_swap_2; | 1809 | goto bad_swap_2; |
1810 | } | 1810 | } |
1811 | swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); | 1811 | swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); |
1812 | error = PTR_ERR(swap_file); | 1812 | error = PTR_ERR(swap_file); |
1813 | if (IS_ERR(swap_file)) { | 1813 | if (IS_ERR(swap_file)) { |
1814 | swap_file = NULL; | 1814 | swap_file = NULL; |
1815 | goto bad_swap_2; | 1815 | goto bad_swap_2; |
1816 | } | 1816 | } |
1817 | 1817 | ||
1818 | p->swap_file = swap_file; | 1818 | p->swap_file = swap_file; |
1819 | mapping = swap_file->f_mapping; | 1819 | mapping = swap_file->f_mapping; |
1820 | inode = mapping->host; | 1820 | inode = mapping->host; |
1821 | 1821 | ||
1822 | error = -EBUSY; | 1822 | error = -EBUSY; |
1823 | for (i = 0; i < nr_swapfiles; i++) { | 1823 | for (i = 0; i < nr_swapfiles; i++) { |
1824 | struct swap_info_struct *q = &swap_info[i]; | 1824 | struct swap_info_struct *q = &swap_info[i]; |
1825 | 1825 | ||
1826 | if (i == type || !q->swap_file) | 1826 | if (i == type || !q->swap_file) |
1827 | continue; | 1827 | continue; |
1828 | if (mapping == q->swap_file->f_mapping) | 1828 | if (mapping == q->swap_file->f_mapping) |
1829 | goto bad_swap; | 1829 | goto bad_swap; |
1830 | } | 1830 | } |
1831 | 1831 | ||
1832 | error = -EINVAL; | 1832 | error = -EINVAL; |
1833 | if (S_ISBLK(inode->i_mode)) { | 1833 | if (S_ISBLK(inode->i_mode)) { |
1834 | bdev = I_BDEV(inode); | 1834 | bdev = I_BDEV(inode); |
1835 | error = bd_claim(bdev, sys_swapon); | 1835 | error = bd_claim(bdev, sys_swapon); |
1836 | if (error < 0) { | 1836 | if (error < 0) { |
1837 | bdev = NULL; | 1837 | bdev = NULL; |
1838 | error = -EINVAL; | 1838 | error = -EINVAL; |
1839 | goto bad_swap; | 1839 | goto bad_swap; |
1840 | } | 1840 | } |
1841 | p->old_block_size = block_size(bdev); | 1841 | p->old_block_size = block_size(bdev); |
1842 | error = set_blocksize(bdev, PAGE_SIZE); | 1842 | error = set_blocksize(bdev, PAGE_SIZE); |
1843 | if (error < 0) | 1843 | if (error < 0) |
1844 | goto bad_swap; | 1844 | goto bad_swap; |
1845 | p->bdev = bdev; | 1845 | p->bdev = bdev; |
1846 | } else if (S_ISREG(inode->i_mode)) { | 1846 | } else if (S_ISREG(inode->i_mode)) { |
1847 | p->bdev = inode->i_sb->s_bdev; | 1847 | p->bdev = inode->i_sb->s_bdev; |
1848 | mutex_lock(&inode->i_mutex); | 1848 | mutex_lock(&inode->i_mutex); |
1849 | did_down = 1; | 1849 | did_down = 1; |
1850 | if (IS_SWAPFILE(inode)) { | 1850 | if (IS_SWAPFILE(inode)) { |
1851 | error = -EBUSY; | 1851 | error = -EBUSY; |
1852 | goto bad_swap; | 1852 | goto bad_swap; |
1853 | } | 1853 | } |
1854 | } else { | 1854 | } else { |
1855 | goto bad_swap; | 1855 | goto bad_swap; |
1856 | } | 1856 | } |
1857 | 1857 | ||
1858 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; | 1858 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; |
1859 | 1859 | ||
1860 | /* | 1860 | /* |
1861 | * Read the swap header. | 1861 | * Read the swap header. |
1862 | */ | 1862 | */ |
1863 | if (!mapping->a_ops->readpage) { | 1863 | if (!mapping->a_ops->readpage) { |
1864 | error = -EINVAL; | 1864 | error = -EINVAL; |
1865 | goto bad_swap; | 1865 | goto bad_swap; |
1866 | } | 1866 | } |
1867 | page = read_mapping_page(mapping, 0, swap_file); | 1867 | page = read_mapping_page(mapping, 0, swap_file); |
1868 | if (IS_ERR(page)) { | 1868 | if (IS_ERR(page)) { |
1869 | error = PTR_ERR(page); | 1869 | error = PTR_ERR(page); |
1870 | goto bad_swap; | 1870 | goto bad_swap; |
1871 | } | 1871 | } |
1872 | swap_header = kmap(page); | 1872 | swap_header = kmap(page); |
1873 | 1873 | ||
1874 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { | 1874 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { |
1875 | printk(KERN_ERR "Unable to find swap-space signature\n"); | 1875 | printk(KERN_ERR "Unable to find swap-space signature\n"); |
1876 | error = -EINVAL; | 1876 | error = -EINVAL; |
1877 | goto bad_swap; | 1877 | goto bad_swap; |
1878 | } | 1878 | } |
1879 | 1879 | ||
1880 | /* swap partition endianness hack... */ | 1880 | /* swap partition endianness hack... */ |
1881 | if (swab32(swap_header->info.version) == 1) { | 1881 | if (swab32(swap_header->info.version) == 1) { |
1882 | swab32s(&swap_header->info.version); | 1882 | swab32s(&swap_header->info.version); |
1883 | swab32s(&swap_header->info.last_page); | 1883 | swab32s(&swap_header->info.last_page); |
1884 | swab32s(&swap_header->info.nr_badpages); | 1884 | swab32s(&swap_header->info.nr_badpages); |
1885 | for (i = 0; i < swap_header->info.nr_badpages; i++) | 1885 | for (i = 0; i < swap_header->info.nr_badpages; i++) |
1886 | swab32s(&swap_header->info.badpages[i]); | 1886 | swab32s(&swap_header->info.badpages[i]); |
1887 | } | 1887 | } |
1888 | /* Check the swap header's sub-version */ | 1888 | /* Check the swap header's sub-version */ |
1889 | if (swap_header->info.version != 1) { | 1889 | if (swap_header->info.version != 1) { |
1890 | printk(KERN_WARNING | 1890 | printk(KERN_WARNING |
1891 | "Unable to handle swap header version %d\n", | 1891 | "Unable to handle swap header version %d\n", |
1892 | swap_header->info.version); | 1892 | swap_header->info.version); |
1893 | error = -EINVAL; | 1893 | error = -EINVAL; |
1894 | goto bad_swap; | 1894 | goto bad_swap; |
1895 | } | 1895 | } |
1896 | 1896 | ||
1897 | p->lowest_bit = 1; | 1897 | p->lowest_bit = 1; |
1898 | p->cluster_next = 1; | 1898 | p->cluster_next = 1; |
1899 | 1899 | ||
1900 | /* | 1900 | /* |
1901 | * Find out how many pages are allowed for a single swap | 1901 | * Find out how many pages are allowed for a single swap |
1902 | * device. There are two limiting factors: 1) the number of | 1902 | * device. There are two limiting factors: 1) the number of |
1903 | * bits for the swap offset in the swp_entry_t type and | 1903 | * bits for the swap offset in the swp_entry_t type and |
1904 | * 2) the number of bits in a swap pte as defined by | 1904 | * 2) the number of bits in a swap pte as defined by |
1905 | * the different architectures. In order to find the | 1905 | * the different architectures. In order to find the |
1906 | * largest possible bit mask a swap entry with swap type 0 | 1906 | * largest possible bit mask a swap entry with swap type 0 |
1907 | * and swap offset ~0UL is created, encoded to a swap pte, | 1907 | * and swap offset ~0UL is created, encoded to a swap pte, |
1908 | * decoded to a swp_entry_t again and finally the swap | 1908 | * decoded to a swp_entry_t again and finally the swap |
1909 | * offset is extracted. This will mask all the bits from | 1909 | * offset is extracted. This will mask all the bits from |
1910 | * the initial ~0UL mask that can't be encoded in either | 1910 | * the initial ~0UL mask that can't be encoded in either |
1911 | * the swp_entry_t or the architecture definition of a | 1911 | * the swp_entry_t or the architecture definition of a |
1912 | * swap pte. | 1912 | * swap pte. |
1913 | */ | 1913 | */ |
1914 | maxpages = swp_offset(pte_to_swp_entry( | 1914 | maxpages = swp_offset(pte_to_swp_entry( |
1915 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; | 1915 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; |
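/*
 * Illustrative numbers, not from this code: if the narrower of
 * swp_entry_t and the architecture's swap pte leaves 27 usable
 * offset bits, the round trip above yields maxpages = 2^27 - 1,
 * i.e. up to ~512 GB of swap with 4 KB pages. The real width is
 * architecture-specific.
 */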
1916 | if (maxpages > swap_header->info.last_page) | 1916 | if (maxpages > swap_header->info.last_page) |
1917 | maxpages = swap_header->info.last_page; | 1917 | maxpages = swap_header->info.last_page; |
1918 | p->highest_bit = maxpages - 1; | 1918 | p->highest_bit = maxpages - 1; |
1919 | 1919 | ||
1920 | error = -EINVAL; | 1920 | error = -EINVAL; |
1921 | if (!maxpages) | 1921 | if (!maxpages) |
1922 | goto bad_swap; | 1922 | goto bad_swap; |
1923 | if (swapfilepages && maxpages > swapfilepages) { | 1923 | if (swapfilepages && maxpages > swapfilepages) { |
1924 | printk(KERN_WARNING | 1924 | printk(KERN_WARNING |
1925 | "Swap area shorter than signature indicates\n"); | 1925 | "Swap area shorter than signature indicates\n"); |
1926 | goto bad_swap; | 1926 | goto bad_swap; |
1927 | } | 1927 | } |
1928 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1928 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
1929 | goto bad_swap; | 1929 | goto bad_swap; |
1930 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1930 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
1931 | goto bad_swap; | 1931 | goto bad_swap; |
1932 | 1932 | ||
1933 | /* OK, set up the swap map and apply the bad block list */ | 1933 | /* OK, set up the swap map and apply the bad block list */ |
1934 | swap_map = vmalloc(maxpages * sizeof(short)); | 1934 | swap_map = vmalloc(maxpages * sizeof(short)); |
1935 | if (!swap_map) { | 1935 | if (!swap_map) { |
1936 | error = -ENOMEM; | 1936 | error = -ENOMEM; |
1937 | goto bad_swap; | 1937 | goto bad_swap; |
1938 | } | 1938 | } |
1939 | 1939 | ||
1940 | memset(swap_map, 0, maxpages * sizeof(short)); | 1940 | memset(swap_map, 0, maxpages * sizeof(short)); |
1941 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1941 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1942 | int page_nr = swap_header->info.badpages[i]; | 1942 | int page_nr = swap_header->info.badpages[i]; |
1943 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1943 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { |
1944 | error = -EINVAL; | 1944 | error = -EINVAL; |
1945 | goto bad_swap; | 1945 | goto bad_swap; |
1946 | } | 1946 | } |
1947 | swap_map[page_nr] = SWAP_MAP_BAD; | 1947 | swap_map[page_nr] = SWAP_MAP_BAD; |
1948 | } | 1948 | } |
1949 | 1949 | ||
1950 | error = swap_cgroup_swapon(type, maxpages); | 1950 | error = swap_cgroup_swapon(type, maxpages); |
1951 | if (error) | 1951 | if (error) |
1952 | goto bad_swap; | 1952 | goto bad_swap; |
1953 | 1953 | ||
1954 | nr_good_pages = swap_header->info.last_page - | 1954 | nr_good_pages = swap_header->info.last_page - |
1955 | swap_header->info.nr_badpages - | 1955 | swap_header->info.nr_badpages - |
1956 | 1 /* header page */; | 1956 | 1 /* header page */; |
1957 | 1957 | ||
1958 | if (nr_good_pages) { | 1958 | if (nr_good_pages) { |
1959 | swap_map[0] = SWAP_MAP_BAD; | 1959 | swap_map[0] = SWAP_MAP_BAD; |
1960 | p->max = maxpages; | 1960 | p->max = maxpages; |
1961 | p->pages = nr_good_pages; | 1961 | p->pages = nr_good_pages; |
1962 | nr_extents = setup_swap_extents(p, &span); | 1962 | nr_extents = setup_swap_extents(p, &span); |
1963 | if (nr_extents < 0) { | 1963 | if (nr_extents < 0) { |
1964 | error = nr_extents; | 1964 | error = nr_extents; |
1965 | goto bad_swap; | 1965 | goto bad_swap; |
1966 | } | 1966 | } |
1967 | nr_good_pages = p->pages; | 1967 | nr_good_pages = p->pages; |
1968 | } | 1968 | } |
1969 | if (!nr_good_pages) { | 1969 | if (!nr_good_pages) { |
1970 | printk(KERN_WARNING "Empty swap-file\n"); | 1970 | printk(KERN_WARNING "Empty swap-file\n"); |
1971 | error = -EINVAL; | 1971 | error = -EINVAL; |
1972 | goto bad_swap; | 1972 | goto bad_swap; |
1973 | } | 1973 | } |
1974 | 1974 | ||
1975 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 1975 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
1976 | p->flags |= SWP_SOLIDSTATE; | 1976 | p->flags |= SWP_SOLIDSTATE; |
1977 | p->cluster_next = 1 + (random32() % p->highest_bit); | 1977 | p->cluster_next = 1 + (random32() % p->highest_bit); |
1978 | } | 1978 | } |
1979 | if (discard_swap(p) == 0) | 1979 | if (discard_swap(p) == 0) |
1980 | p->flags |= SWP_DISCARDABLE; | 1980 | p->flags |= SWP_DISCARDABLE; |
1981 | 1981 | ||
1982 | mutex_lock(&swapon_mutex); | 1982 | mutex_lock(&swapon_mutex); |
1983 | spin_lock(&swap_lock); | 1983 | spin_lock(&swap_lock); |
1984 | if (swap_flags & SWAP_FLAG_PREFER) | 1984 | if (swap_flags & SWAP_FLAG_PREFER) |
1985 | p->prio = | 1985 | p->prio = |
1986 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 1986 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
1987 | else | 1987 | else |
1988 | p->prio = --least_priority; | 1988 | p->prio = --least_priority; |
1989 | p->swap_map = swap_map; | 1989 | p->swap_map = swap_map; |
1990 | p->flags |= SWP_WRITEOK; | 1990 | p->flags |= SWP_WRITEOK; |
1991 | nr_swap_pages += nr_good_pages; | 1991 | nr_swap_pages += nr_good_pages; |
1992 | total_swap_pages += nr_good_pages; | 1992 | total_swap_pages += nr_good_pages; |
1993 | 1993 | ||
1994 | printk(KERN_INFO "Adding %uk swap on %s. " | 1994 | printk(KERN_INFO "Adding %uk swap on %s. " |
1995 | "Priority:%d extents:%d across:%lluk %s%s\n", | 1995 | "Priority:%d extents:%d across:%lluk %s%s\n", |
1996 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, | 1996 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, |
1997 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 1997 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
1998 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 1998 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
1999 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 1999 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); |
2000 | 2000 | ||
2001 | /* insert swap space into swap_list: */ | 2001 | /* insert swap space into swap_list: */ |
2002 | prev = -1; | 2002 | prev = -1; |
2003 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 2003 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { |
2004 | if (p->prio >= swap_info[i].prio) { | 2004 | if (p->prio >= swap_info[i].prio) { |
2005 | break; | 2005 | break; |
2006 | } | 2006 | } |
2007 | prev = i; | 2007 | prev = i; |
2008 | } | 2008 | } |
2009 | p->next = i; | 2009 | p->next = i; |
2010 | if (prev < 0) { | 2010 | if (prev < 0) { |
2011 | swap_list.head = swap_list.next = p - swap_info; | 2011 | swap_list.head = swap_list.next = p - swap_info; |
2012 | } else { | 2012 | } else { |
2013 | swap_info[prev].next = p - swap_info; | 2013 | swap_info[prev].next = p - swap_info; |
2014 | } | 2014 | } |
2015 | spin_unlock(&swap_lock); | 2015 | spin_unlock(&swap_lock); |
2016 | mutex_unlock(&swapon_mutex); | 2016 | mutex_unlock(&swapon_mutex); |
2017 | error = 0; | 2017 | error = 0; |
2018 | goto out; | 2018 | goto out; |
2019 | bad_swap: | 2019 | bad_swap: |
2020 | if (bdev) { | 2020 | if (bdev) { |
2021 | set_blocksize(bdev, p->old_block_size); | 2021 | set_blocksize(bdev, p->old_block_size); |
2022 | bd_release(bdev); | 2022 | bd_release(bdev); |
2023 | } | 2023 | } |
2024 | destroy_swap_extents(p); | 2024 | destroy_swap_extents(p); |
2025 | swap_cgroup_swapoff(type); | 2025 | swap_cgroup_swapoff(type); |
2026 | bad_swap_2: | 2026 | bad_swap_2: |
2027 | spin_lock(&swap_lock); | 2027 | spin_lock(&swap_lock); |
2028 | p->swap_file = NULL; | 2028 | p->swap_file = NULL; |
2029 | p->flags = 0; | 2029 | p->flags = 0; |
2030 | spin_unlock(&swap_lock); | 2030 | spin_unlock(&swap_lock); |
2031 | vfree(swap_map); | 2031 | vfree(swap_map); |
2032 | if (swap_file) | 2032 | if (swap_file) |
2033 | filp_close(swap_file, NULL); | 2033 | filp_close(swap_file, NULL); |
2034 | out: | 2034 | out: |
2035 | if (page && !IS_ERR(page)) { | 2035 | if (page && !IS_ERR(page)) { |
2036 | kunmap(page); | 2036 | kunmap(page); |
2037 | page_cache_release(page); | 2037 | page_cache_release(page); |
2038 | } | 2038 | } |
2039 | if (name) | 2039 | if (name) |
2040 | putname(name); | 2040 | putname(name); |
2041 | if (did_down) { | 2041 | if (did_down) { |
2042 | if (!error) | 2042 | if (!error) |
2043 | inode->i_flags |= S_SWAPFILE; | 2043 | inode->i_flags |= S_SWAPFILE; |
2044 | mutex_unlock(&inode->i_mutex); | 2044 | mutex_unlock(&inode->i_mutex); |
2045 | } | 2045 | } |
2046 | return error; | 2046 | return error; |
2047 | } | 2047 | } |
2048 | 2048 | ||
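The SWAP_FLAG_PREFER decoding near the end of swapon() has a mirror image in userspace. A usage sketch with the glibc wrapper from <sys/swap.h> (needs CAP_SYS_ADMIN; the device path is illustrative):

#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	/*
	 * Request priority 5: set PREFER and place the value in the
	 * low bits, which is exactly what the kernel recovers above
	 * with (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT.
	 */
	int flags = SWAP_FLAG_PREFER |
		    ((5 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (swapon("/dev/sdb1", flags) != 0)
		perror("swapon");
	return 0;
}
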
2049 | void si_swapinfo(struct sysinfo *val) | 2049 | void si_swapinfo(struct sysinfo *val) |
2050 | { | 2050 | { |
2051 | unsigned int i; | 2051 | unsigned int i; |
2052 | unsigned long nr_to_be_unused = 0; | 2052 | unsigned long nr_to_be_unused = 0; |
2053 | 2053 | ||
2054 | spin_lock(&swap_lock); | 2054 | spin_lock(&swap_lock); |
2055 | for (i = 0; i < nr_swapfiles; i++) { | 2055 | for (i = 0; i < nr_swapfiles; i++) { |
2056 | if (!(swap_info[i].flags & SWP_USED) || | 2056 | if (!(swap_info[i].flags & SWP_USED) || |
2057 | (swap_info[i].flags & SWP_WRITEOK)) | 2057 | (swap_info[i].flags & SWP_WRITEOK)) |
2058 | continue; | 2058 | continue; |
2059 | nr_to_be_unused += swap_info[i].inuse_pages; | 2059 | nr_to_be_unused += swap_info[i].inuse_pages; |
2060 | } | 2060 | } |
2061 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2061 | val->freeswap = nr_swap_pages + nr_to_be_unused; |
2062 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2062 | val->totalswap = total_swap_pages + nr_to_be_unused; |
2063 | spin_unlock(&swap_lock); | 2063 | spin_unlock(&swap_lock); |
2064 | } | 2064 | } |
2065 | 2065 | ||
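The freeswap/totalswap values filled in here are what userspace reads back through sysinfo(2); note how pages held by an in-flight swapoff are added back, so the totals stay stable while try_to_unuse() runs. A small consumer:

#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
	struct sysinfo si;

	if (sysinfo(&si) == 0)
		printf("swap free: %lu of %lu bytes\n",
		       si.freeswap * si.mem_unit,
		       si.totalswap * si.mem_unit);
	return 0;
}
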
2066 | /* | 2066 | /* |
2067 | * Verify that a swap entry is valid and increment its swap map count. | 2067 | * Verify that a swap entry is valid and increment its swap map count. |
2068 | * | 2068 | * |
2069 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | 2069 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as |
2070 | * "permanent", but will be reclaimed by the next swapoff. | 2070 | * "permanent", but will be reclaimed by the next swapoff. |
2071 | * Returns one of the following codes: | 2071 | * Returns one of the following codes: |
2072 | * - success -> 0 | 2072 | * - success -> 0 |
2073 | * - swp_entry is invalid -> EINVAL | 2073 | * - swp_entry is invalid -> EINVAL |
2074 | * - swp_entry is migration entry -> EINVAL | 2074 | * - swp_entry is migration entry -> EINVAL |
2075 | * - swap-cache reference is requested but there is already one. -> EEXIST | 2075 | * - swap-cache reference is requested but there is already one. -> EEXIST |
2076 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | 2076 | * - swap-cache reference is requested but the entry is not used. -> ENOENT |
2077 | */ | 2077 | */ |
2078 | static int __swap_duplicate(swp_entry_t entry, bool cache) | 2078 | static int __swap_duplicate(swp_entry_t entry, bool cache) |
2079 | { | 2079 | { |
2080 | struct swap_info_struct * p; | 2080 | struct swap_info_struct * p; |
2081 | unsigned long offset, type; | 2081 | unsigned long offset, type; |
2082 | int result = -EINVAL; | 2082 | int result = -EINVAL; |
2083 | int count; | 2083 | int count; |
2084 | bool has_cache; | 2084 | bool has_cache; |
2085 | 2085 | ||
2086 | if (is_migration_entry(entry)) | 2086 | if (is_migration_entry(entry)) |
2087 | return -EINVAL; | 2087 | return -EINVAL; |
2088 | 2088 | ||
2089 | type = swp_type(entry); | 2089 | type = swp_type(entry); |
2090 | if (type >= nr_swapfiles) | 2090 | if (type >= nr_swapfiles) |
2091 | goto bad_file; | 2091 | goto bad_file; |
2092 | p = type + swap_info; | 2092 | p = type + swap_info; |
2093 | offset = swp_offset(entry); | 2093 | offset = swp_offset(entry); |
2094 | 2094 | ||
2095 | spin_lock(&swap_lock); | 2095 | spin_lock(&swap_lock); |
2096 | 2096 | ||
2097 | if (unlikely(offset >= p->max)) | 2097 | if (unlikely(offset >= p->max)) |
2098 | goto unlock_out; | 2098 | goto unlock_out; |
2099 | 2099 | ||
2100 | count = swap_count(p->swap_map[offset]); | 2100 | count = swap_count(p->swap_map[offset]); |
2101 | has_cache = swap_has_cache(p->swap_map[offset]); | 2101 | has_cache = swap_has_cache(p->swap_map[offset]); |
2102 | 2102 | ||
2103 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | 2103 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ |
2104 | 2104 | ||
2105 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | 2105 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ |
2106 | if (!has_cache && count) { | 2106 | if (!has_cache && count) { |
2107 | p->swap_map[offset] = encode_swapmap(count, true); | 2107 | p->swap_map[offset] = encode_swapmap(count, true); |
2108 | result = 0; | 2108 | result = 0; |
2109 | } else if (has_cache) /* someone added cache */ | 2109 | } else if (has_cache) /* someone added cache */ |
2110 | result = -EEXIST; | 2110 | result = -EEXIST; |
2111 | else if (!count) /* no users */ | 2111 | else if (!count) /* no users */ |
2112 | result = -ENOENT; | 2112 | result = -ENOENT; |
2113 | 2113 | ||
2114 | } else if (count || has_cache) { | 2114 | } else if (count || has_cache) { |
2115 | if (count < SWAP_MAP_MAX - 1) { | 2115 | if (count < SWAP_MAP_MAX - 1) { |
2116 | p->swap_map[offset] = encode_swapmap(count + 1, | 2116 | p->swap_map[offset] = encode_swapmap(count + 1, |
2117 | has_cache); | 2117 | has_cache); |
2118 | result = 0; | 2118 | result = 0; |
2119 | } else if (count <= SWAP_MAP_MAX) { | 2119 | } else if (count <= SWAP_MAP_MAX) { |
2120 | if (swap_overflow++ < 5) | 2120 | if (swap_overflow++ < 5) |
2121 | printk(KERN_WARNING | 2121 | printk(KERN_WARNING |
2122 | "swap_dup: swap entry overflow\n"); | 2122 | "swap_dup: swap entry overflow\n"); |
2123 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, | 2123 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, |
2124 | has_cache); | 2124 | has_cache); |
2125 | result = 0; | 2125 | result = 0; |
2126 | } | 2126 | } |
2127 | } else | 2127 | } else |
2128 | result = -ENOENT; /* unused swap entry */ | 2128 | result = -ENOENT; /* unused swap entry */ |
2129 | unlock_out: | 2129 | unlock_out: |
2130 | spin_unlock(&swap_lock); | 2130 | spin_unlock(&swap_lock); |
2131 | out: | 2131 | out: |
2132 | return result; | 2132 | return result; |
2133 | 2133 | ||
2134 | bad_file: | 2134 | bad_file: |
2135 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2135 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
2136 | goto out; | 2136 | goto out; |
2137 | } | 2137 | } |
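
swap_count(), swap_has_cache() and encode_swapmap() pack two things into each unsigned short swap_map slot: a reference count plus a single has-cache flag. Those helpers are defined earlier in this file; the sketch below assumes the flag occupies the top bit, which is only an assumption here, not something this hunk shows:

/*
 * Illustrative sketch of the assumed layout: count in the low bits,
 * one flag bit on top. The real constants and helpers live earlier
 * in this file.
 */
#define HAS_CACHE	0x8000u

static unsigned short encode(unsigned int count, int has_cache)
{
	return (unsigned short)(count | (has_cache ? HAS_CACHE : 0));
}

static unsigned int count_of(unsigned short ent)
{
	return ent & ~HAS_CACHE;
}

static int has_cache_of(unsigned short ent)
{
	return (ent & HAS_CACHE) != 0;
}
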
2138 | /* | 2138 | /* |
2139 | * increase reference count of swap entry by 1. | 2139 | * increase reference count of swap entry by 1. |
2140 | */ | 2140 | */ |
2141 | void swap_duplicate(swp_entry_t entry) | 2141 | void swap_duplicate(swp_entry_t entry) |
2142 | { | 2142 | { |
2143 | __swap_duplicate(entry, SWAP_MAP); | 2143 | __swap_duplicate(entry, SWAP_MAP); |
2144 | } | 2144 | } |
2145 | 2145 | ||
2146 | /* | 2146 | /* |
2147 | * @entry: swap entry for which we allocate swap cache. | 2147 | * @entry: swap entry for which we allocate swap cache. |
2148 | * | 2148 | * |
2149 | * Called when allocating swap cache for an existing swap entry. | 2149 | * Called when allocating swap cache for an existing swap entry. |
2150 | * This can return error codes. Returns 0 on success. | 2150 | * This can return error codes. Returns 0 on success. |
2151 | * -EBUSY means there is a swap cache. | 2151 | * -EBUSY means there is a swap cache. |
2152 | * Note: return code is different from swap_duplicate(). | 2152 | * Note: return code is different from swap_duplicate(). |
2153 | */ | 2153 | */ |
2154 | int swapcache_prepare(swp_entry_t entry) | 2154 | int swapcache_prepare(swp_entry_t entry) |
2155 | { | 2155 | { |
2156 | return __swap_duplicate(entry, SWAP_CACHE); | 2156 | return __swap_duplicate(entry, SWAP_CACHE); |
2157 | } | 2157 | } |
2158 | 2158 | ||
2159 | 2159 | ||
2160 | struct swap_info_struct * | 2160 | struct swap_info_struct * |
2161 | get_swap_info_struct(unsigned type) | 2161 | get_swap_info_struct(unsigned type) |
2162 | { | 2162 | { |
2163 | return &swap_info[type]; | 2163 | return &swap_info[type]; |
2164 | } | 2164 | } |
2165 | 2165 | ||
2166 | /* | 2166 | /* |
2167 | * swap_lock prevents swap_map being freed. Don't grab an extra | 2167 | * swap_lock prevents swap_map being freed. Don't grab an extra |
2168 | * reference on the swaphandle, it doesn't matter if it becomes unused. | 2168 | * reference on the swaphandle, it doesn't matter if it becomes unused. |
2169 | */ | 2169 | */ |
2170 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | 2170 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) |
2171 | { | 2171 | { |
2172 | struct swap_info_struct *si; | 2172 | struct swap_info_struct *si; |
2173 | int our_page_cluster = page_cluster; | 2173 | int our_page_cluster = page_cluster; |
2174 | pgoff_t target, toff; | 2174 | pgoff_t target, toff; |
2175 | pgoff_t base, end; | 2175 | pgoff_t base, end; |
2176 | int nr_pages = 0; | 2176 | int nr_pages = 0; |
2177 | 2177 | ||
2178 | if (!our_page_cluster) /* no readahead */ | 2178 | if (!our_page_cluster) /* no readahead */ |
2179 | return 0; | 2179 | return 0; |
2180 | 2180 | ||
2181 | si = &swap_info[swp_type(entry)]; | 2181 | si = &swap_info[swp_type(entry)]; |
2182 | target = swp_offset(entry); | 2182 | target = swp_offset(entry); |
2183 | base = (target >> our_page_cluster) << our_page_cluster; | 2183 | base = (target >> our_page_cluster) << our_page_cluster; |
2184 | end = base + (1 << our_page_cluster); | 2184 | end = base + (1 << our_page_cluster); |
2185 | if (!base) /* first page is swap header */ | 2185 | if (!base) /* first page is swap header */ |
2186 | base++; | 2186 | base++; |
2187 | 2187 | ||
2188 | spin_lock(&swap_lock); | 2188 | spin_lock(&swap_lock); |
2189 | if (end > si->max) /* don't go beyond end of map */ | 2189 | if (end > si->max) /* don't go beyond end of map */ |
2190 | end = si->max; | 2190 | end = si->max; |
2191 | 2191 | ||
2192 | /* Count contiguous allocated slots above our target */ | 2192 | /* Count contiguous allocated slots above our target */ |
2193 | for (toff = target; ++toff < end; nr_pages++) { | 2193 | for (toff = target; ++toff < end; nr_pages++) { |
2194 | /* Don't read in free or bad pages */ | 2194 | /* Don't read in free or bad pages */ |
2195 | if (!si->swap_map[toff]) | 2195 | if (!si->swap_map[toff]) |
2196 | break; | 2196 | break; |
2197 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | 2197 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2198 | break; | 2198 | break; |
2199 | } | 2199 | } |
2200 | /* Count contiguous allocated slots below our target */ | 2200 | /* Count contiguous allocated slots below our target */ |
2201 | for (toff = target; --toff >= base; nr_pages++) { | 2201 | for (toff = target; --toff >= base; nr_pages++) { |
2202 | /* Don't read in free or bad pages */ | 2202 | /* Don't read in free or bad pages */ |
2203 | if (!si->swap_map[toff]) | 2203 | if (!si->swap_map[toff]) |
2204 | break; | 2204 | break; |
2205 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | 2205 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2206 | break; | 2206 | break; |
2207 | } | 2207 | } |
2208 | spin_unlock(&swap_lock); | 2208 | spin_unlock(&swap_lock); |
2209 | 2209 | ||
2210 | /* | 2210 | /* |
2211 | * Indicate starting offset, and return number of pages to get: | 2211 | * Indicate starting offset, and return number of pages to get: |
2212 | * if only 1, say 0, since there's then no readahead to be done. | 2212 | * if only 1, say 0, since there's then no readahead to be done. |
2213 | */ | 2213 | */ |
2214 | *offset = ++toff; | 2214 | *offset = ++toff; |
2215 | return nr_pages ? ++nr_pages : 0; | 2215 | return nr_pages ? ++nr_pages : 0; |
2216 | } | 2216 | } |
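
/*
 * Worked example of the window arithmetic above (illustrative
 * numbers): with page_cluster == 3 and a fault at offset 37,
 *
 *	base = (37 >> 3) << 3 = 32,	end = 32 + (1 << 3) = 40
 *
 * so readahead may cover slots 32..39, clipped to the map size and
 * trimmed at the first free or SWAP_MAP_BAD slot on either side of
 * the target.
 */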
2217 | 2217 |