Commit f01ef569cddb1a8627b1c6b3a134998ad1cf4b22

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
  mm: properly reflect task dirty limits in dirty_exceeded logic
  writeback: don't busy retry writeback on new/freeing inodes
  writeback: scale IO chunk size up to half device bandwidth
  writeback: trace global_dirty_state
  writeback: introduce max-pause and pass-good dirty limits
  writeback: introduce smoothed global dirty limit
  writeback: consolidate variable names in balance_dirty_pages()
  writeback: show bdi write bandwidth in debugfs
  writeback: bdi write bandwidth estimation
  writeback: account per-bdi accumulated written pages
  writeback: make writeback_control.nr_to_write straight
  writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
  writeback: trace event writeback_queue_io
  writeback: trace event writeback_single_inode
  writeback: remove .nonblocking and .encountered_congestion
  writeback: remove writeback_control.more_io
  writeback: skip balance_dirty_pages() for in-memory fs
  writeback: add bdi_dirty_limit() kernel-doc
  writeback: avoid extra sync work at enqueue time
  writeback: elevate queue_io() into wb_writeback()
  ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c

Showing 15 changed files; the inline diff of fs/block_dev.c follows.
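The most visible change in this file is bdev_inode_switch_bdi(): with the per-bdi writeback locking introduced by this series, the global inode_wb_list_lock is gone and the function now takes the ->wb.list_lock of both the old and the new backing_dev_info via bdi_lock_two(), returning early when the two are the same to avoid self-deadlock. Below is a minimal userspace sketch of that two-lock pattern, assuming nothing beyond POSIX threads; the toy_* names are illustrative, not kernel APIs, and the address-based ordering mirrors what bdi_lock_two() does in spirit but is shown here only as one way to obtain a consistent lock order.

	/*
	 * Userspace sketch (not kernel code) of the locking pattern the new
	 * bdev_inode_switch_bdi() relies on: when an object migrates between
	 * two per-instance lists, take both list locks in a stable order so
	 * that two concurrent migrations in opposite directions cannot
	 * deadlock, and bail out early when source and destination share the
	 * same lock.  All toy_* names are illustrative only.
	 */
	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>

	struct toy_bdi {
		pthread_mutex_t list_lock;
		int nr_dirty;			/* stand-in for the b_dirty list */
	};

	static void toy_lock_two(struct toy_bdi *a, struct toy_bdi *b)
	{
		/* Always lock the lower address first to get a global order. */
		if ((uintptr_t)a < (uintptr_t)b) {
			pthread_mutex_lock(&a->list_lock);
			pthread_mutex_lock(&b->list_lock);
		} else {
			pthread_mutex_lock(&b->list_lock);
			pthread_mutex_lock(&a->list_lock);
		}
	}

	static void toy_switch_bdi(struct toy_bdi *old, struct toy_bdi *dst)
	{
		if (old == dst)			/* self-deadlock avoidance */
			return;
		toy_lock_two(old, dst);
		old->nr_dirty--;		/* "list_move" of one dirty inode */
		dst->nr_dirty++;
		pthread_mutex_unlock(&old->list_lock);
		pthread_mutex_unlock(&dst->list_lock);
	}

	int main(void)
	{
		struct toy_bdi a = { PTHREAD_MUTEX_INITIALIZER, 1 };
		struct toy_bdi b = { PTHREAD_MUTEX_INITIALIZER, 0 };

		toy_switch_bdi(&a, &b);
		printf("a.nr_dirty=%d b.nr_dirty=%d\n", a.nr_dirty, b.nr_dirty);
		return 0;
	}

Compile with "cc -pthread" and run; moving the one dirty object from a to b under both locks is the analogue of the list_move() done under the two list_locks in the hunk below.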

1 /* 1 /*
2 * linux/fs/block_dev.c 2 * linux/fs/block_dev.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
6 */ 6 */
7 7
8 #include <linux/init.h> 8 #include <linux/init.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/fcntl.h> 10 #include <linux/fcntl.h>
11 #include <linux/slab.h> 11 #include <linux/slab.h>
12 #include <linux/kmod.h> 12 #include <linux/kmod.h>
13 #include <linux/major.h> 13 #include <linux/major.h>
14 #include <linux/device_cgroup.h> 14 #include <linux/device_cgroup.h>
15 #include <linux/highmem.h> 15 #include <linux/highmem.h>
16 #include <linux/blkdev.h> 16 #include <linux/blkdev.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/blkpg.h> 18 #include <linux/blkpg.h>
19 #include <linux/buffer_head.h> 19 #include <linux/buffer_head.h>
20 #include <linux/pagevec.h> 20 #include <linux/pagevec.h>
21 #include <linux/writeback.h> 21 #include <linux/writeback.h>
22 #include <linux/mpage.h> 22 #include <linux/mpage.h>
23 #include <linux/mount.h> 23 #include <linux/mount.h>
24 #include <linux/uio.h> 24 #include <linux/uio.h>
25 #include <linux/namei.h> 25 #include <linux/namei.h>
26 #include <linux/log2.h> 26 #include <linux/log2.h>
27 #include <linux/kmemleak.h> 27 #include <linux/kmemleak.h>
28 #include <asm/uaccess.h> 28 #include <asm/uaccess.h>
29 #include "internal.h" 29 #include "internal.h"
30 30
31 struct bdev_inode { 31 struct bdev_inode {
32 struct block_device bdev; 32 struct block_device bdev;
33 struct inode vfs_inode; 33 struct inode vfs_inode;
34 }; 34 };
35 35
36 static const struct address_space_operations def_blk_aops; 36 static const struct address_space_operations def_blk_aops;
37 37
38 static inline struct bdev_inode *BDEV_I(struct inode *inode) 38 static inline struct bdev_inode *BDEV_I(struct inode *inode)
39 { 39 {
40 return container_of(inode, struct bdev_inode, vfs_inode); 40 return container_of(inode, struct bdev_inode, vfs_inode);
41 } 41 }
42 42
43 inline struct block_device *I_BDEV(struct inode *inode) 43 inline struct block_device *I_BDEV(struct inode *inode)
44 { 44 {
45 return &BDEV_I(inode)->bdev; 45 return &BDEV_I(inode)->bdev;
46 } 46 }
47
48 EXPORT_SYMBOL(I_BDEV); 47 EXPORT_SYMBOL(I_BDEV);
49 48
50 /* 49 /*
51 * move the inode from it's current bdi to the a new bdi. if the inode is dirty 50 * Move the inode from its current bdi to a new bdi. If the inode is dirty we
52 * we need to move it onto the dirty list of @dst so that the inode is always 51 * need to move it onto the dirty list of @dst so that the inode is always on
53 * on the right list. 52 * the right list.
54 */ 53 */
55 static void bdev_inode_switch_bdi(struct inode *inode, 54 static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst) 55 struct backing_dev_info *dst)
57 { 56 {
58 spin_lock(&inode_wb_list_lock); 57 struct backing_dev_info *old = inode->i_data.backing_dev_info;
58
59 if (unlikely(dst == old)) /* deadlock avoidance */
60 return;
61 bdi_lock_two(&old->wb, &dst->wb);
59 spin_lock(&inode->i_lock); 62 spin_lock(&inode->i_lock);
60 inode->i_data.backing_dev_info = dst; 63 inode->i_data.backing_dev_info = dst;
61 if (inode->i_state & I_DIRTY) 64 if (inode->i_state & I_DIRTY)
62 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 65 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
63 spin_unlock(&inode->i_lock); 66 spin_unlock(&inode->i_lock);
64 spin_unlock(&inode_wb_list_lock); 67 spin_unlock(&old->wb.list_lock);
68 spin_unlock(&dst->wb.list_lock);
65 } 69 }
66 70
67 static sector_t max_block(struct block_device *bdev) 71 static sector_t max_block(struct block_device *bdev)
68 { 72 {
69 sector_t retval = ~((sector_t)0); 73 sector_t retval = ~((sector_t)0);
70 loff_t sz = i_size_read(bdev->bd_inode); 74 loff_t sz = i_size_read(bdev->bd_inode);
71 75
72 if (sz) { 76 if (sz) {
73 unsigned int size = block_size(bdev); 77 unsigned int size = block_size(bdev);
74 unsigned int sizebits = blksize_bits(size); 78 unsigned int sizebits = blksize_bits(size);
75 retval = (sz >> sizebits); 79 retval = (sz >> sizebits);
76 } 80 }
77 return retval; 81 return retval;
78 } 82 }
79 83
80 /* Kill _all_ buffers and pagecache , dirty or not.. */ 84 /* Kill _all_ buffers and pagecache , dirty or not.. */
81 static void kill_bdev(struct block_device *bdev) 85 static void kill_bdev(struct block_device *bdev)
82 { 86 {
83 if (bdev->bd_inode->i_mapping->nrpages == 0) 87 if (bdev->bd_inode->i_mapping->nrpages == 0)
84 return; 88 return;
85 invalidate_bh_lrus(); 89 invalidate_bh_lrus();
86 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 90 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
87 } 91 }
88 92
89 int set_blocksize(struct block_device *bdev, int size) 93 int set_blocksize(struct block_device *bdev, int size)
90 { 94 {
91 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 95 /* Size must be a power of two, and between 512 and PAGE_SIZE */
92 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) 96 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
93 return -EINVAL; 97 return -EINVAL;
94 98
95 /* Size cannot be smaller than the size supported by the device */ 99 /* Size cannot be smaller than the size supported by the device */
96 if (size < bdev_logical_block_size(bdev)) 100 if (size < bdev_logical_block_size(bdev))
97 return -EINVAL; 101 return -EINVAL;
98 102
99 /* Don't change the size if it is same as current */ 103 /* Don't change the size if it is same as current */
100 if (bdev->bd_block_size != size) { 104 if (bdev->bd_block_size != size) {
101 sync_blockdev(bdev); 105 sync_blockdev(bdev);
102 bdev->bd_block_size = size; 106 bdev->bd_block_size = size;
103 bdev->bd_inode->i_blkbits = blksize_bits(size); 107 bdev->bd_inode->i_blkbits = blksize_bits(size);
104 kill_bdev(bdev); 108 kill_bdev(bdev);
105 } 109 }
106 return 0; 110 return 0;
107 } 111 }
108 112
109 EXPORT_SYMBOL(set_blocksize); 113 EXPORT_SYMBOL(set_blocksize);
110 114
111 int sb_set_blocksize(struct super_block *sb, int size) 115 int sb_set_blocksize(struct super_block *sb, int size)
112 { 116 {
113 if (set_blocksize(sb->s_bdev, size)) 117 if (set_blocksize(sb->s_bdev, size))
114 return 0; 118 return 0;
115 /* If we get here, we know size is power of two 119 /* If we get here, we know size is power of two
116 * and it's value is between 512 and PAGE_SIZE */ 120 * and it's value is between 512 and PAGE_SIZE */
117 sb->s_blocksize = size; 121 sb->s_blocksize = size;
118 sb->s_blocksize_bits = blksize_bits(size); 122 sb->s_blocksize_bits = blksize_bits(size);
119 return sb->s_blocksize; 123 return sb->s_blocksize;
120 } 124 }
121 125
122 EXPORT_SYMBOL(sb_set_blocksize); 126 EXPORT_SYMBOL(sb_set_blocksize);
123 127
124 int sb_min_blocksize(struct super_block *sb, int size) 128 int sb_min_blocksize(struct super_block *sb, int size)
125 { 129 {
126 int minsize = bdev_logical_block_size(sb->s_bdev); 130 int minsize = bdev_logical_block_size(sb->s_bdev);
127 if (size < minsize) 131 if (size < minsize)
128 size = minsize; 132 size = minsize;
129 return sb_set_blocksize(sb, size); 133 return sb_set_blocksize(sb, size);
130 } 134 }
131 135
132 EXPORT_SYMBOL(sb_min_blocksize); 136 EXPORT_SYMBOL(sb_min_blocksize);
133 137
134 static int 138 static int
135 blkdev_get_block(struct inode *inode, sector_t iblock, 139 blkdev_get_block(struct inode *inode, sector_t iblock,
136 struct buffer_head *bh, int create) 140 struct buffer_head *bh, int create)
137 { 141 {
138 if (iblock >= max_block(I_BDEV(inode))) { 142 if (iblock >= max_block(I_BDEV(inode))) {
139 if (create) 143 if (create)
140 return -EIO; 144 return -EIO;
141 145
142 /* 146 /*
143 * for reads, we're just trying to fill a partial page. 147 * for reads, we're just trying to fill a partial page.
144 * return a hole, they will have to call get_block again 148 * return a hole, they will have to call get_block again
145 * before they can fill it, and they will get -EIO at that 149 * before they can fill it, and they will get -EIO at that
146 * time 150 * time
147 */ 151 */
148 return 0; 152 return 0;
149 } 153 }
150 bh->b_bdev = I_BDEV(inode); 154 bh->b_bdev = I_BDEV(inode);
151 bh->b_blocknr = iblock; 155 bh->b_blocknr = iblock;
152 set_buffer_mapped(bh); 156 set_buffer_mapped(bh);
153 return 0; 157 return 0;
154 } 158 }
155 159
156 static int 160 static int
157 blkdev_get_blocks(struct inode *inode, sector_t iblock, 161 blkdev_get_blocks(struct inode *inode, sector_t iblock,
158 struct buffer_head *bh, int create) 162 struct buffer_head *bh, int create)
159 { 163 {
160 sector_t end_block = max_block(I_BDEV(inode)); 164 sector_t end_block = max_block(I_BDEV(inode));
161 unsigned long max_blocks = bh->b_size >> inode->i_blkbits; 165 unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
162 166
163 if ((iblock + max_blocks) > end_block) { 167 if ((iblock + max_blocks) > end_block) {
164 max_blocks = end_block - iblock; 168 max_blocks = end_block - iblock;
165 if ((long)max_blocks <= 0) { 169 if ((long)max_blocks <= 0) {
166 if (create) 170 if (create)
167 return -EIO; /* write fully beyond EOF */ 171 return -EIO; /* write fully beyond EOF */
168 /* 172 /*
169 * It is a read which is fully beyond EOF. We return 173 * It is a read which is fully beyond EOF. We return
170 * a !buffer_mapped buffer 174 * a !buffer_mapped buffer
171 */ 175 */
172 max_blocks = 0; 176 max_blocks = 0;
173 } 177 }
174 } 178 }
175 179
176 bh->b_bdev = I_BDEV(inode); 180 bh->b_bdev = I_BDEV(inode);
177 bh->b_blocknr = iblock; 181 bh->b_blocknr = iblock;
178 bh->b_size = max_blocks << inode->i_blkbits; 182 bh->b_size = max_blocks << inode->i_blkbits;
179 if (max_blocks) 183 if (max_blocks)
180 set_buffer_mapped(bh); 184 set_buffer_mapped(bh);
181 return 0; 185 return 0;
182 } 186 }
183 187
184 static ssize_t 188 static ssize_t
185 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 189 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
186 loff_t offset, unsigned long nr_segs) 190 loff_t offset, unsigned long nr_segs)
187 { 191 {
188 struct file *file = iocb->ki_filp; 192 struct file *file = iocb->ki_filp;
189 struct inode *inode = file->f_mapping->host; 193 struct inode *inode = file->f_mapping->host;
190 194
191 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, 195 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
192 nr_segs, blkdev_get_blocks, NULL, NULL, 0); 196 nr_segs, blkdev_get_blocks, NULL, NULL, 0);
193 } 197 }
194 198
195 int __sync_blockdev(struct block_device *bdev, int wait) 199 int __sync_blockdev(struct block_device *bdev, int wait)
196 { 200 {
197 if (!bdev) 201 if (!bdev)
198 return 0; 202 return 0;
199 if (!wait) 203 if (!wait)
200 return filemap_flush(bdev->bd_inode->i_mapping); 204 return filemap_flush(bdev->bd_inode->i_mapping);
201 return filemap_write_and_wait(bdev->bd_inode->i_mapping); 205 return filemap_write_and_wait(bdev->bd_inode->i_mapping);
202 } 206 }
203 207
204 /* 208 /*
205 * Write out and wait upon all the dirty data associated with a block 209 * Write out and wait upon all the dirty data associated with a block
206 * device via its mapping. Does not take the superblock lock. 210 * device via its mapping. Does not take the superblock lock.
207 */ 211 */
208 int sync_blockdev(struct block_device *bdev) 212 int sync_blockdev(struct block_device *bdev)
209 { 213 {
210 return __sync_blockdev(bdev, 1); 214 return __sync_blockdev(bdev, 1);
211 } 215 }
212 EXPORT_SYMBOL(sync_blockdev); 216 EXPORT_SYMBOL(sync_blockdev);
213 217
214 /* 218 /*
215 * Write out and wait upon all dirty data associated with this 219 * Write out and wait upon all dirty data associated with this
216 * device. Filesystem data as well as the underlying block 220 * device. Filesystem data as well as the underlying block
217 * device. Takes the superblock lock. 221 * device. Takes the superblock lock.
218 */ 222 */
219 int fsync_bdev(struct block_device *bdev) 223 int fsync_bdev(struct block_device *bdev)
220 { 224 {
221 struct super_block *sb = get_super(bdev); 225 struct super_block *sb = get_super(bdev);
222 if (sb) { 226 if (sb) {
223 int res = sync_filesystem(sb); 227 int res = sync_filesystem(sb);
224 drop_super(sb); 228 drop_super(sb);
225 return res; 229 return res;
226 } 230 }
227 return sync_blockdev(bdev); 231 return sync_blockdev(bdev);
228 } 232 }
229 EXPORT_SYMBOL(fsync_bdev); 233 EXPORT_SYMBOL(fsync_bdev);
230 234
231 /** 235 /**
232 * freeze_bdev -- lock a filesystem and force it into a consistent state 236 * freeze_bdev -- lock a filesystem and force it into a consistent state
233 * @bdev: blockdevice to lock 237 * @bdev: blockdevice to lock
234 * 238 *
235 * If a superblock is found on this device, we take the s_umount semaphore 239 * If a superblock is found on this device, we take the s_umount semaphore
236 * on it to make sure nobody unmounts until the snapshot creation is done. 240 * on it to make sure nobody unmounts until the snapshot creation is done.
237 * The reference counter (bd_fsfreeze_count) guarantees that only the last 241 * The reference counter (bd_fsfreeze_count) guarantees that only the last
238 * unfreeze process can unfreeze the frozen filesystem actually when multiple 242 * unfreeze process can unfreeze the frozen filesystem actually when multiple
239 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and 243 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
240 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze 244 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
241 * actually. 245 * actually.
242 */ 246 */
243 struct super_block *freeze_bdev(struct block_device *bdev) 247 struct super_block *freeze_bdev(struct block_device *bdev)
244 { 248 {
245 struct super_block *sb; 249 struct super_block *sb;
246 int error = 0; 250 int error = 0;
247 251
248 mutex_lock(&bdev->bd_fsfreeze_mutex); 252 mutex_lock(&bdev->bd_fsfreeze_mutex);
249 if (++bdev->bd_fsfreeze_count > 1) { 253 if (++bdev->bd_fsfreeze_count > 1) {
250 /* 254 /*
251 * We don't even need to grab a reference - the first call 255 * We don't even need to grab a reference - the first call
252 * to freeze_bdev grab an active reference and only the last 256 * to freeze_bdev grab an active reference and only the last
253 * thaw_bdev drops it. 257 * thaw_bdev drops it.
254 */ 258 */
255 sb = get_super(bdev); 259 sb = get_super(bdev);
256 drop_super(sb); 260 drop_super(sb);
257 mutex_unlock(&bdev->bd_fsfreeze_mutex); 261 mutex_unlock(&bdev->bd_fsfreeze_mutex);
258 return sb; 262 return sb;
259 } 263 }
260 264
261 sb = get_active_super(bdev); 265 sb = get_active_super(bdev);
262 if (!sb) 266 if (!sb)
263 goto out; 267 goto out;
264 error = freeze_super(sb); 268 error = freeze_super(sb);
265 if (error) { 269 if (error) {
266 deactivate_super(sb); 270 deactivate_super(sb);
267 bdev->bd_fsfreeze_count--; 271 bdev->bd_fsfreeze_count--;
268 mutex_unlock(&bdev->bd_fsfreeze_mutex); 272 mutex_unlock(&bdev->bd_fsfreeze_mutex);
269 return ERR_PTR(error); 273 return ERR_PTR(error);
270 } 274 }
271 deactivate_super(sb); 275 deactivate_super(sb);
272 out: 276 out:
273 sync_blockdev(bdev); 277 sync_blockdev(bdev);
274 mutex_unlock(&bdev->bd_fsfreeze_mutex); 278 mutex_unlock(&bdev->bd_fsfreeze_mutex);
275 return sb; /* thaw_bdev releases s->s_umount */ 279 return sb; /* thaw_bdev releases s->s_umount */
276 } 280 }
277 EXPORT_SYMBOL(freeze_bdev); 281 EXPORT_SYMBOL(freeze_bdev);
278 282
279 /** 283 /**
280 * thaw_bdev -- unlock filesystem 284 * thaw_bdev -- unlock filesystem
281 * @bdev: blockdevice to unlock 285 * @bdev: blockdevice to unlock
282 * @sb: associated superblock 286 * @sb: associated superblock
283 * 287 *
284 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 288 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
285 */ 289 */
286 int thaw_bdev(struct block_device *bdev, struct super_block *sb) 290 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
287 { 291 {
288 int error = -EINVAL; 292 int error = -EINVAL;
289 293
290 mutex_lock(&bdev->bd_fsfreeze_mutex); 294 mutex_lock(&bdev->bd_fsfreeze_mutex);
291 if (!bdev->bd_fsfreeze_count) 295 if (!bdev->bd_fsfreeze_count)
292 goto out; 296 goto out;
293 297
294 error = 0; 298 error = 0;
295 if (--bdev->bd_fsfreeze_count > 0) 299 if (--bdev->bd_fsfreeze_count > 0)
296 goto out; 300 goto out;
297 301
298 if (!sb) 302 if (!sb)
299 goto out; 303 goto out;
300 304
301 error = thaw_super(sb); 305 error = thaw_super(sb);
302 if (error) { 306 if (error) {
303 bdev->bd_fsfreeze_count++; 307 bdev->bd_fsfreeze_count++;
304 mutex_unlock(&bdev->bd_fsfreeze_mutex); 308 mutex_unlock(&bdev->bd_fsfreeze_mutex);
305 return error; 309 return error;
306 } 310 }
307 out: 311 out:
308 mutex_unlock(&bdev->bd_fsfreeze_mutex); 312 mutex_unlock(&bdev->bd_fsfreeze_mutex);
309 return 0; 313 return 0;
310 } 314 }
311 EXPORT_SYMBOL(thaw_bdev); 315 EXPORT_SYMBOL(thaw_bdev);
312 316
313 static int blkdev_writepage(struct page *page, struct writeback_control *wbc) 317 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
314 { 318 {
315 return block_write_full_page(page, blkdev_get_block, wbc); 319 return block_write_full_page(page, blkdev_get_block, wbc);
316 } 320 }
317 321
318 static int blkdev_readpage(struct file * file, struct page * page) 322 static int blkdev_readpage(struct file * file, struct page * page)
319 { 323 {
320 return block_read_full_page(page, blkdev_get_block); 324 return block_read_full_page(page, blkdev_get_block);
321 } 325 }
322 326
323 static int blkdev_write_begin(struct file *file, struct address_space *mapping, 327 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
324 loff_t pos, unsigned len, unsigned flags, 328 loff_t pos, unsigned len, unsigned flags,
325 struct page **pagep, void **fsdata) 329 struct page **pagep, void **fsdata)
326 { 330 {
327 return block_write_begin(mapping, pos, len, flags, pagep, 331 return block_write_begin(mapping, pos, len, flags, pagep,
328 blkdev_get_block); 332 blkdev_get_block);
329 } 333 }
330 334
331 static int blkdev_write_end(struct file *file, struct address_space *mapping, 335 static int blkdev_write_end(struct file *file, struct address_space *mapping,
332 loff_t pos, unsigned len, unsigned copied, 336 loff_t pos, unsigned len, unsigned copied,
333 struct page *page, void *fsdata) 337 struct page *page, void *fsdata)
334 { 338 {
335 int ret; 339 int ret;
336 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); 340 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
337 341
338 unlock_page(page); 342 unlock_page(page);
339 page_cache_release(page); 343 page_cache_release(page);
340 344
341 return ret; 345 return ret;
342 } 346 }
343 347
344 /* 348 /*
345 * private llseek: 349 * private llseek:
346 * for a block special file file->f_path.dentry->d_inode->i_size is zero 350 * for a block special file file->f_path.dentry->d_inode->i_size is zero
347 * so we compute the size by hand (just as in block_read/write above) 351 * so we compute the size by hand (just as in block_read/write above)
348 */ 352 */
349 static loff_t block_llseek(struct file *file, loff_t offset, int origin) 353 static loff_t block_llseek(struct file *file, loff_t offset, int origin)
350 { 354 {
351 struct inode *bd_inode = file->f_mapping->host; 355 struct inode *bd_inode = file->f_mapping->host;
352 loff_t size; 356 loff_t size;
353 loff_t retval; 357 loff_t retval;
354 358
355 mutex_lock(&bd_inode->i_mutex); 359 mutex_lock(&bd_inode->i_mutex);
356 size = i_size_read(bd_inode); 360 size = i_size_read(bd_inode);
357 361
358 retval = -EINVAL; 362 retval = -EINVAL;
359 switch (origin) { 363 switch (origin) {
360 case SEEK_END: 364 case SEEK_END:
361 offset += size; 365 offset += size;
362 break; 366 break;
363 case SEEK_CUR: 367 case SEEK_CUR:
364 offset += file->f_pos; 368 offset += file->f_pos;
365 case SEEK_SET: 369 case SEEK_SET:
366 break; 370 break;
367 default: 371 default:
368 goto out; 372 goto out;
369 } 373 }
370 if (offset >= 0 && offset <= size) { 374 if (offset >= 0 && offset <= size) {
371 if (offset != file->f_pos) { 375 if (offset != file->f_pos) {
372 file->f_pos = offset; 376 file->f_pos = offset;
373 } 377 }
374 retval = offset; 378 retval = offset;
375 } 379 }
376 out: 380 out:
377 mutex_unlock(&bd_inode->i_mutex); 381 mutex_unlock(&bd_inode->i_mutex);
378 return retval; 382 return retval;
379 } 383 }
380 384
381 int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) 385 int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
382 { 386 {
383 struct inode *bd_inode = filp->f_mapping->host; 387 struct inode *bd_inode = filp->f_mapping->host;
384 struct block_device *bdev = I_BDEV(bd_inode); 388 struct block_device *bdev = I_BDEV(bd_inode);
385 int error; 389 int error;
386 390
387 /* 391 /*
388 * There is no need to serialise calls to blkdev_issue_flush with 392 * There is no need to serialise calls to blkdev_issue_flush with
389 * i_mutex and doing so causes performance issues with concurrent 393 * i_mutex and doing so causes performance issues with concurrent
390 * O_SYNC writers to a block device. 394 * O_SYNC writers to a block device.
391 */ 395 */
392 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); 396 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
393 if (error == -EOPNOTSUPP) 397 if (error == -EOPNOTSUPP)
394 error = 0; 398 error = 0;
395 399
396 return error; 400 return error;
397 } 401 }
398 EXPORT_SYMBOL(blkdev_fsync); 402 EXPORT_SYMBOL(blkdev_fsync);
399 403
400 /* 404 /*
401 * pseudo-fs 405 * pseudo-fs
402 */ 406 */
403 407
404 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); 408 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
405 static struct kmem_cache * bdev_cachep __read_mostly; 409 static struct kmem_cache * bdev_cachep __read_mostly;
406 410
407 static struct inode *bdev_alloc_inode(struct super_block *sb) 411 static struct inode *bdev_alloc_inode(struct super_block *sb)
408 { 412 {
409 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); 413 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
410 if (!ei) 414 if (!ei)
411 return NULL; 415 return NULL;
412 return &ei->vfs_inode; 416 return &ei->vfs_inode;
413 } 417 }
414 418
415 static void bdev_i_callback(struct rcu_head *head) 419 static void bdev_i_callback(struct rcu_head *head)
416 { 420 {
417 struct inode *inode = container_of(head, struct inode, i_rcu); 421 struct inode *inode = container_of(head, struct inode, i_rcu);
418 struct bdev_inode *bdi = BDEV_I(inode); 422 struct bdev_inode *bdi = BDEV_I(inode);
419 423
420 INIT_LIST_HEAD(&inode->i_dentry); 424 INIT_LIST_HEAD(&inode->i_dentry);
421 kmem_cache_free(bdev_cachep, bdi); 425 kmem_cache_free(bdev_cachep, bdi);
422 } 426 }
423 427
424 static void bdev_destroy_inode(struct inode *inode) 428 static void bdev_destroy_inode(struct inode *inode)
425 { 429 {
426 call_rcu(&inode->i_rcu, bdev_i_callback); 430 call_rcu(&inode->i_rcu, bdev_i_callback);
427 } 431 }
428 432
429 static void init_once(void *foo) 433 static void init_once(void *foo)
430 { 434 {
431 struct bdev_inode *ei = (struct bdev_inode *) foo; 435 struct bdev_inode *ei = (struct bdev_inode *) foo;
432 struct block_device *bdev = &ei->bdev; 436 struct block_device *bdev = &ei->bdev;
433 437
434 memset(bdev, 0, sizeof(*bdev)); 438 memset(bdev, 0, sizeof(*bdev));
435 mutex_init(&bdev->bd_mutex); 439 mutex_init(&bdev->bd_mutex);
436 INIT_LIST_HEAD(&bdev->bd_inodes); 440 INIT_LIST_HEAD(&bdev->bd_inodes);
437 INIT_LIST_HEAD(&bdev->bd_list); 441 INIT_LIST_HEAD(&bdev->bd_list);
438 #ifdef CONFIG_SYSFS 442 #ifdef CONFIG_SYSFS
439 INIT_LIST_HEAD(&bdev->bd_holder_disks); 443 INIT_LIST_HEAD(&bdev->bd_holder_disks);
440 #endif 444 #endif
441 inode_init_once(&ei->vfs_inode); 445 inode_init_once(&ei->vfs_inode);
442 /* Initialize mutex for freeze. */ 446 /* Initialize mutex for freeze. */
443 mutex_init(&bdev->bd_fsfreeze_mutex); 447 mutex_init(&bdev->bd_fsfreeze_mutex);
444 } 448 }
445 449
446 static inline void __bd_forget(struct inode *inode) 450 static inline void __bd_forget(struct inode *inode)
447 { 451 {
448 list_del_init(&inode->i_devices); 452 list_del_init(&inode->i_devices);
449 inode->i_bdev = NULL; 453 inode->i_bdev = NULL;
450 inode->i_mapping = &inode->i_data; 454 inode->i_mapping = &inode->i_data;
451 } 455 }
452 456
453 static void bdev_evict_inode(struct inode *inode) 457 static void bdev_evict_inode(struct inode *inode)
454 { 458 {
455 struct block_device *bdev = &BDEV_I(inode)->bdev; 459 struct block_device *bdev = &BDEV_I(inode)->bdev;
456 struct list_head *p; 460 struct list_head *p;
457 truncate_inode_pages(&inode->i_data, 0); 461 truncate_inode_pages(&inode->i_data, 0);
458 invalidate_inode_buffers(inode); /* is it needed here? */ 462 invalidate_inode_buffers(inode); /* is it needed here? */
459 end_writeback(inode); 463 end_writeback(inode);
460 spin_lock(&bdev_lock); 464 spin_lock(&bdev_lock);
461 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { 465 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
462 __bd_forget(list_entry(p, struct inode, i_devices)); 466 __bd_forget(list_entry(p, struct inode, i_devices));
463 } 467 }
464 list_del_init(&bdev->bd_list); 468 list_del_init(&bdev->bd_list);
465 spin_unlock(&bdev_lock); 469 spin_unlock(&bdev_lock);
466 } 470 }
467 471
468 static const struct super_operations bdev_sops = { 472 static const struct super_operations bdev_sops = {
469 .statfs = simple_statfs, 473 .statfs = simple_statfs,
470 .alloc_inode = bdev_alloc_inode, 474 .alloc_inode = bdev_alloc_inode,
471 .destroy_inode = bdev_destroy_inode, 475 .destroy_inode = bdev_destroy_inode,
472 .drop_inode = generic_delete_inode, 476 .drop_inode = generic_delete_inode,
473 .evict_inode = bdev_evict_inode, 477 .evict_inode = bdev_evict_inode,
474 }; 478 };
475 479
476 static struct dentry *bd_mount(struct file_system_type *fs_type, 480 static struct dentry *bd_mount(struct file_system_type *fs_type,
477 int flags, const char *dev_name, void *data) 481 int flags, const char *dev_name, void *data)
478 { 482 {
479 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); 483 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
480 } 484 }
481 485
482 static struct file_system_type bd_type = { 486 static struct file_system_type bd_type = {
483 .name = "bdev", 487 .name = "bdev",
484 .mount = bd_mount, 488 .mount = bd_mount,
485 .kill_sb = kill_anon_super, 489 .kill_sb = kill_anon_super,
486 }; 490 };
487 491
488 struct super_block *blockdev_superblock __read_mostly; 492 struct super_block *blockdev_superblock __read_mostly;
489 493
490 void __init bdev_cache_init(void) 494 void __init bdev_cache_init(void)
491 { 495 {
492 int err; 496 int err;
493 struct vfsmount *bd_mnt; 497 struct vfsmount *bd_mnt;
494 498
495 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 499 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
496 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 500 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
497 SLAB_MEM_SPREAD|SLAB_PANIC), 501 SLAB_MEM_SPREAD|SLAB_PANIC),
498 init_once); 502 init_once);
499 err = register_filesystem(&bd_type); 503 err = register_filesystem(&bd_type);
500 if (err) 504 if (err)
501 panic("Cannot register bdev pseudo-fs"); 505 panic("Cannot register bdev pseudo-fs");
502 bd_mnt = kern_mount(&bd_type); 506 bd_mnt = kern_mount(&bd_type);
503 if (IS_ERR(bd_mnt)) 507 if (IS_ERR(bd_mnt))
504 panic("Cannot create bdev pseudo-fs"); 508 panic("Cannot create bdev pseudo-fs");
505 /* 509 /*
506 * This vfsmount structure is only used to obtain the 510 * This vfsmount structure is only used to obtain the
507 * blockdev_superblock, so tell kmemleak not to report it. 511 * blockdev_superblock, so tell kmemleak not to report it.
508 */ 512 */
509 kmemleak_not_leak(bd_mnt); 513 kmemleak_not_leak(bd_mnt);
510 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 514 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
511 } 515 }
512 516
513 /* 517 /*
514 * Most likely _very_ bad one - but then it's hardly critical for small 518 * Most likely _very_ bad one - but then it's hardly critical for small
515 * /dev and can be fixed when somebody will need really large one. 519 * /dev and can be fixed when somebody will need really large one.
516 * Keep in mind that it will be fed through icache hash function too. 520 * Keep in mind that it will be fed through icache hash function too.
517 */ 521 */
518 static inline unsigned long hash(dev_t dev) 522 static inline unsigned long hash(dev_t dev)
519 { 523 {
520 return MAJOR(dev)+MINOR(dev); 524 return MAJOR(dev)+MINOR(dev);
521 } 525 }
522 526
523 static int bdev_test(struct inode *inode, void *data) 527 static int bdev_test(struct inode *inode, void *data)
524 { 528 {
525 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; 529 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
526 } 530 }
527 531
528 static int bdev_set(struct inode *inode, void *data) 532 static int bdev_set(struct inode *inode, void *data)
529 { 533 {
530 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; 534 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
531 return 0; 535 return 0;
532 } 536 }
533 537
534 static LIST_HEAD(all_bdevs); 538 static LIST_HEAD(all_bdevs);
535 539
536 struct block_device *bdget(dev_t dev) 540 struct block_device *bdget(dev_t dev)
537 { 541 {
538 struct block_device *bdev; 542 struct block_device *bdev;
539 struct inode *inode; 543 struct inode *inode;
540 544
541 inode = iget5_locked(blockdev_superblock, hash(dev), 545 inode = iget5_locked(blockdev_superblock, hash(dev),
542 bdev_test, bdev_set, &dev); 546 bdev_test, bdev_set, &dev);
543 547
544 if (!inode) 548 if (!inode)
545 return NULL; 549 return NULL;
546 550
547 bdev = &BDEV_I(inode)->bdev; 551 bdev = &BDEV_I(inode)->bdev;
548 552
549 if (inode->i_state & I_NEW) { 553 if (inode->i_state & I_NEW) {
550 bdev->bd_contains = NULL; 554 bdev->bd_contains = NULL;
551 bdev->bd_inode = inode; 555 bdev->bd_inode = inode;
552 bdev->bd_block_size = (1 << inode->i_blkbits); 556 bdev->bd_block_size = (1 << inode->i_blkbits);
553 bdev->bd_part_count = 0; 557 bdev->bd_part_count = 0;
554 bdev->bd_invalidated = 0; 558 bdev->bd_invalidated = 0;
555 inode->i_mode = S_IFBLK; 559 inode->i_mode = S_IFBLK;
556 inode->i_rdev = dev; 560 inode->i_rdev = dev;
557 inode->i_bdev = bdev; 561 inode->i_bdev = bdev;
558 inode->i_data.a_ops = &def_blk_aops; 562 inode->i_data.a_ops = &def_blk_aops;
559 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 563 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
560 inode->i_data.backing_dev_info = &default_backing_dev_info; 564 inode->i_data.backing_dev_info = &default_backing_dev_info;
561 spin_lock(&bdev_lock); 565 spin_lock(&bdev_lock);
562 list_add(&bdev->bd_list, &all_bdevs); 566 list_add(&bdev->bd_list, &all_bdevs);
563 spin_unlock(&bdev_lock); 567 spin_unlock(&bdev_lock);
564 unlock_new_inode(inode); 568 unlock_new_inode(inode);
565 } 569 }
566 return bdev; 570 return bdev;
567 } 571 }
568 572
569 EXPORT_SYMBOL(bdget); 573 EXPORT_SYMBOL(bdget);
570 574
571 /** 575 /**
572 * bdgrab -- Grab a reference to an already referenced block device 576 * bdgrab -- Grab a reference to an already referenced block device
573 * @bdev: Block device to grab a reference to. 577 * @bdev: Block device to grab a reference to.
574 */ 578 */
575 struct block_device *bdgrab(struct block_device *bdev) 579 struct block_device *bdgrab(struct block_device *bdev)
576 { 580 {
577 ihold(bdev->bd_inode); 581 ihold(bdev->bd_inode);
578 return bdev; 582 return bdev;
579 } 583 }
580 584
581 long nr_blockdev_pages(void) 585 long nr_blockdev_pages(void)
582 { 586 {
583 struct block_device *bdev; 587 struct block_device *bdev;
584 long ret = 0; 588 long ret = 0;
585 spin_lock(&bdev_lock); 589 spin_lock(&bdev_lock);
586 list_for_each_entry(bdev, &all_bdevs, bd_list) { 590 list_for_each_entry(bdev, &all_bdevs, bd_list) {
587 ret += bdev->bd_inode->i_mapping->nrpages; 591 ret += bdev->bd_inode->i_mapping->nrpages;
588 } 592 }
589 spin_unlock(&bdev_lock); 593 spin_unlock(&bdev_lock);
590 return ret; 594 return ret;
591 } 595 }
592 596
593 void bdput(struct block_device *bdev) 597 void bdput(struct block_device *bdev)
594 { 598 {
595 iput(bdev->bd_inode); 599 iput(bdev->bd_inode);
596 } 600 }
597 601
598 EXPORT_SYMBOL(bdput); 602 EXPORT_SYMBOL(bdput);
599 603
600 static struct block_device *bd_acquire(struct inode *inode) 604 static struct block_device *bd_acquire(struct inode *inode)
601 { 605 {
602 struct block_device *bdev; 606 struct block_device *bdev;
603 607
604 spin_lock(&bdev_lock); 608 spin_lock(&bdev_lock);
605 bdev = inode->i_bdev; 609 bdev = inode->i_bdev;
606 if (bdev) { 610 if (bdev) {
607 ihold(bdev->bd_inode); 611 ihold(bdev->bd_inode);
608 spin_unlock(&bdev_lock); 612 spin_unlock(&bdev_lock);
609 return bdev; 613 return bdev;
610 } 614 }
611 spin_unlock(&bdev_lock); 615 spin_unlock(&bdev_lock);
612 616
613 bdev = bdget(inode->i_rdev); 617 bdev = bdget(inode->i_rdev);
614 if (bdev) { 618 if (bdev) {
615 spin_lock(&bdev_lock); 619 spin_lock(&bdev_lock);
616 if (!inode->i_bdev) { 620 if (!inode->i_bdev) {
617 /* 621 /*
618 * We take an additional reference to bd_inode, 622 * We take an additional reference to bd_inode,
619 * and it's released in clear_inode() of inode. 623 * and it's released in clear_inode() of inode.
620 * So, we can access it via ->i_mapping always 624 * So, we can access it via ->i_mapping always
621 * without igrab(). 625 * without igrab().
622 */ 626 */
623 ihold(bdev->bd_inode); 627 ihold(bdev->bd_inode);
624 inode->i_bdev = bdev; 628 inode->i_bdev = bdev;
625 inode->i_mapping = bdev->bd_inode->i_mapping; 629 inode->i_mapping = bdev->bd_inode->i_mapping;
626 list_add(&inode->i_devices, &bdev->bd_inodes); 630 list_add(&inode->i_devices, &bdev->bd_inodes);
627 } 631 }
628 spin_unlock(&bdev_lock); 632 spin_unlock(&bdev_lock);
629 } 633 }
630 return bdev; 634 return bdev;
631 } 635 }
632 636
633 /* Call when you free inode */ 637 /* Call when you free inode */
634 638
635 void bd_forget(struct inode *inode) 639 void bd_forget(struct inode *inode)
636 { 640 {
637 struct block_device *bdev = NULL; 641 struct block_device *bdev = NULL;
638 642
639 spin_lock(&bdev_lock); 643 spin_lock(&bdev_lock);
640 if (inode->i_bdev) { 644 if (inode->i_bdev) {
641 if (!sb_is_blkdev_sb(inode->i_sb)) 645 if (!sb_is_blkdev_sb(inode->i_sb))
642 bdev = inode->i_bdev; 646 bdev = inode->i_bdev;
643 __bd_forget(inode); 647 __bd_forget(inode);
644 } 648 }
645 spin_unlock(&bdev_lock); 649 spin_unlock(&bdev_lock);
646 650
647 if (bdev) 651 if (bdev)
648 iput(bdev->bd_inode); 652 iput(bdev->bd_inode);
649 } 653 }
650 654
651 /** 655 /**
652 * bd_may_claim - test whether a block device can be claimed 656 * bd_may_claim - test whether a block device can be claimed
653 * @bdev: block device of interest 657 * @bdev: block device of interest
654 * @whole: whole block device containing @bdev, may equal @bdev 658 * @whole: whole block device containing @bdev, may equal @bdev
655 * @holder: holder trying to claim @bdev 659 * @holder: holder trying to claim @bdev
656 * 660 *
657 * Test whether @bdev can be claimed by @holder. 661 * Test whether @bdev can be claimed by @holder.
658 * 662 *
659 * CONTEXT: 663 * CONTEXT:
660 * spin_lock(&bdev_lock). 664 * spin_lock(&bdev_lock).
661 * 665 *
662 * RETURNS: 666 * RETURNS:
663 * %true if @bdev can be claimed, %false otherwise. 667 * %true if @bdev can be claimed, %false otherwise.
664 */ 668 */
665 static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, 669 static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
666 void *holder) 670 void *holder)
667 { 671 {
668 if (bdev->bd_holder == holder) 672 if (bdev->bd_holder == holder)
669 return true; /* already a holder */ 673 return true; /* already a holder */
670 else if (bdev->bd_holder != NULL) 674 else if (bdev->bd_holder != NULL)
671 return false; /* held by someone else */ 675 return false; /* held by someone else */
672 else if (bdev->bd_contains == bdev) 676 else if (bdev->bd_contains == bdev)
673 return true; /* is a whole device which isn't held */ 677 return true; /* is a whole device which isn't held */
674 678
675 else if (whole->bd_holder == bd_may_claim) 679 else if (whole->bd_holder == bd_may_claim)
676 return true; /* is a partition of a device that is being partitioned */ 680 return true; /* is a partition of a device that is being partitioned */
677 else if (whole->bd_holder != NULL) 681 else if (whole->bd_holder != NULL)
678 return false; /* is a partition of a held device */ 682 return false; /* is a partition of a held device */
679 else 683 else
680 return true; /* is a partition of an un-held device */ 684 return true; /* is a partition of an un-held device */
681 } 685 }
682 686
683 /** 687 /**
684 * bd_prepare_to_claim - prepare to claim a block device 688 * bd_prepare_to_claim - prepare to claim a block device
685 * @bdev: block device of interest 689 * @bdev: block device of interest
686 * @whole: the whole device containing @bdev, may equal @bdev 690 * @whole: the whole device containing @bdev, may equal @bdev
687 * @holder: holder trying to claim @bdev 691 * @holder: holder trying to claim @bdev
688 * 692 *
689 * Prepare to claim @bdev. This function fails if @bdev is already 693 * Prepare to claim @bdev. This function fails if @bdev is already
690 * claimed by another holder and waits if another claiming is in 694 * claimed by another holder and waits if another claiming is in
691 * progress. This function doesn't actually claim. On successful 695 * progress. This function doesn't actually claim. On successful
692 * return, the caller has ownership of bd_claiming and bd_holder[s]. 696 * return, the caller has ownership of bd_claiming and bd_holder[s].
693 * 697 *
694 * CONTEXT: 698 * CONTEXT:
695 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab 699 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
696 * it multiple times. 700 * it multiple times.
697 * 701 *
698 * RETURNS: 702 * RETURNS:
699 * 0 if @bdev can be claimed, -EBUSY otherwise. 703 * 0 if @bdev can be claimed, -EBUSY otherwise.
700 */ 704 */
701 static int bd_prepare_to_claim(struct block_device *bdev, 705 static int bd_prepare_to_claim(struct block_device *bdev,
702 struct block_device *whole, void *holder) 706 struct block_device *whole, void *holder)
703 { 707 {
704 retry: 708 retry:
705 /* if someone else claimed, fail */ 709 /* if someone else claimed, fail */
706 if (!bd_may_claim(bdev, whole, holder)) 710 if (!bd_may_claim(bdev, whole, holder))
707 return -EBUSY; 711 return -EBUSY;
708 712
709 /* if claiming is already in progress, wait for it to finish */ 713 /* if claiming is already in progress, wait for it to finish */
710 if (whole->bd_claiming) { 714 if (whole->bd_claiming) {
711 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); 715 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
712 DEFINE_WAIT(wait); 716 DEFINE_WAIT(wait);
713 717
714 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); 718 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
715 spin_unlock(&bdev_lock); 719 spin_unlock(&bdev_lock);
716 schedule(); 720 schedule();
717 finish_wait(wq, &wait); 721 finish_wait(wq, &wait);
718 spin_lock(&bdev_lock); 722 spin_lock(&bdev_lock);
719 goto retry; 723 goto retry;
720 } 724 }
721 725
722 /* yay, all mine */ 726 /* yay, all mine */
723 return 0; 727 return 0;
724 } 728 }
725 729
726 /** 730 /**
727 * bd_start_claiming - start claiming a block device 731 * bd_start_claiming - start claiming a block device
728 * @bdev: block device of interest 732 * @bdev: block device of interest
729 * @holder: holder trying to claim @bdev 733 * @holder: holder trying to claim @bdev
730 * 734 *
731 * @bdev is about to be opened exclusively. Check @bdev can be opened 735 * @bdev is about to be opened exclusively. Check @bdev can be opened
732 * exclusively and mark that an exclusive open is in progress. Each 736 * exclusively and mark that an exclusive open is in progress. Each
733 * successful call to this function must be matched with a call to 737 * successful call to this function must be matched with a call to
734 * either bd_finish_claiming() or bd_abort_claiming() (which do not 738 * either bd_finish_claiming() or bd_abort_claiming() (which do not
735 * fail). 739 * fail).
736 * 740 *
737 * This function is used to gain exclusive access to the block device 741 * This function is used to gain exclusive access to the block device
738 * without actually causing other exclusive open attempts to fail. It 742 * without actually causing other exclusive open attempts to fail. It
739 * should be used when the open sequence itself requires exclusive 743 * should be used when the open sequence itself requires exclusive
740 * access but may subsequently fail. 744 * access but may subsequently fail.
741 * 745 *
742 * CONTEXT: 746 * CONTEXT:
743 * Might sleep. 747 * Might sleep.
744 * 748 *
745 * RETURNS: 749 * RETURNS:
746 * Pointer to the block device containing @bdev on success, ERR_PTR() 750 * Pointer to the block device containing @bdev on success, ERR_PTR()
747 * value on failure. 751 * value on failure.
748 */ 752 */
749 static struct block_device *bd_start_claiming(struct block_device *bdev, 753 static struct block_device *bd_start_claiming(struct block_device *bdev,
750 void *holder) 754 void *holder)
751 { 755 {
752 struct gendisk *disk; 756 struct gendisk *disk;
753 struct block_device *whole; 757 struct block_device *whole;
754 int partno, err; 758 int partno, err;
755 759
756 might_sleep(); 760 might_sleep();
757 761
758 /* 762 /*
759 * @bdev might not have been initialized properly yet, look up 763 * @bdev might not have been initialized properly yet, look up
760 * and grab the outer block device the hard way. 764 * and grab the outer block device the hard way.
761 */ 765 */
762 disk = get_gendisk(bdev->bd_dev, &partno); 766 disk = get_gendisk(bdev->bd_dev, &partno);
763 if (!disk) 767 if (!disk)
764 return ERR_PTR(-ENXIO); 768 return ERR_PTR(-ENXIO);
765 769
766 /* 770 /*
767 * Normally, @bdev should equal what's returned from bdget_disk() 771 * Normally, @bdev should equal what's returned from bdget_disk()
768 * if partno is 0; however, some drivers (floppy) use multiple 772 * if partno is 0; however, some drivers (floppy) use multiple
769 * bdev's for the same physical device and @bdev may be one of the 773 * bdev's for the same physical device and @bdev may be one of the
770 * aliases. Keep @bdev if partno is 0. This means claimer 774 * aliases. Keep @bdev if partno is 0. This means claimer
771 * tracking is broken for those devices but it has always been that 775 * tracking is broken for those devices but it has always been that
772 * way. 776 * way.
773 */ 777 */
774 if (partno) 778 if (partno)
775 whole = bdget_disk(disk, 0); 779 whole = bdget_disk(disk, 0);
776 else 780 else
777 whole = bdgrab(bdev); 781 whole = bdgrab(bdev);
778 782
779 module_put(disk->fops->owner); 783 module_put(disk->fops->owner);
780 put_disk(disk); 784 put_disk(disk);
781 if (!whole) 785 if (!whole)
782 return ERR_PTR(-ENOMEM); 786 return ERR_PTR(-ENOMEM);
783 787
784 /* prepare to claim, if successful, mark claiming in progress */ 788 /* prepare to claim, if successful, mark claiming in progress */
785 spin_lock(&bdev_lock); 789 spin_lock(&bdev_lock);
786 790
787 err = bd_prepare_to_claim(bdev, whole, holder); 791 err = bd_prepare_to_claim(bdev, whole, holder);
788 if (err == 0) { 792 if (err == 0) {
789 whole->bd_claiming = holder; 793 whole->bd_claiming = holder;
790 spin_unlock(&bdev_lock); 794 spin_unlock(&bdev_lock);
791 return whole; 795 return whole;
792 } else { 796 } else {
793 spin_unlock(&bdev_lock); 797 spin_unlock(&bdev_lock);
794 bdput(whole); 798 bdput(whole);
795 return ERR_PTR(err); 799 return ERR_PTR(err);
796 } 800 }
797 } 801 }
798 802
799 #ifdef CONFIG_SYSFS 803 #ifdef CONFIG_SYSFS
800 struct bd_holder_disk { 804 struct bd_holder_disk {
801 struct list_head list; 805 struct list_head list;
802 struct gendisk *disk; 806 struct gendisk *disk;
803 int refcnt; 807 int refcnt;
804 }; 808 };
805 809
806 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, 810 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
807 struct gendisk *disk) 811 struct gendisk *disk)
808 { 812 {
809 struct bd_holder_disk *holder; 813 struct bd_holder_disk *holder;
810 814
811 list_for_each_entry(holder, &bdev->bd_holder_disks, list) 815 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
812 if (holder->disk == disk) 816 if (holder->disk == disk)
813 return holder; 817 return holder;
814 return NULL; 818 return NULL;
815 } 819 }
816 820
817 static int add_symlink(struct kobject *from, struct kobject *to) 821 static int add_symlink(struct kobject *from, struct kobject *to)
818 { 822 {
819 return sysfs_create_link(from, to, kobject_name(to)); 823 return sysfs_create_link(from, to, kobject_name(to));
820 } 824 }
821 825
822 static void del_symlink(struct kobject *from, struct kobject *to) 826 static void del_symlink(struct kobject *from, struct kobject *to)
823 { 827 {
824 sysfs_remove_link(from, kobject_name(to)); 828 sysfs_remove_link(from, kobject_name(to));
825 } 829 }
826 830
827 /** 831 /**
828 * bd_link_disk_holder - create symlinks between holding disk and slave bdev 832 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
829 * @bdev: the claimed slave bdev 833 * @bdev: the claimed slave bdev
830 * @disk: the holding disk 834 * @disk: the holding disk
831 * 835 *
832 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. 836 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
833 * 837 *
834 * This functions creates the following sysfs symlinks. 838 * This functions creates the following sysfs symlinks.
835 * 839 *
836 * - from "slaves" directory of the holder @disk to the claimed @bdev 840 * - from "slaves" directory of the holder @disk to the claimed @bdev
837 * - from "holders" directory of the @bdev to the holder @disk 841 * - from "holders" directory of the @bdev to the holder @disk
838 * 842 *
839 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is 843 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
840 * passed to bd_link_disk_holder(), then: 844 * passed to bd_link_disk_holder(), then:
841 * 845 *
842 * /sys/block/dm-0/slaves/sda --> /sys/block/sda 846 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
843 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 847 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
844 * 848 *
845 * The caller must have claimed @bdev before calling this function and 849 * The caller must have claimed @bdev before calling this function and
846 * ensure that both @bdev and @disk are valid during the creation and 850 * ensure that both @bdev and @disk are valid during the creation and
847 * lifetime of these symlinks. 851 * lifetime of these symlinks.
848 * 852 *
849 * CONTEXT: 853 * CONTEXT:
850 * Might sleep. 854 * Might sleep.
851 * 855 *
852 * RETURNS: 856 * RETURNS:
853 * 0 on success, -errno on failure. 857 * 0 on success, -errno on failure.
854 */ 858 */
855 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) 859 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
856 { 860 {
857 struct bd_holder_disk *holder; 861 struct bd_holder_disk *holder;
858 int ret = 0; 862 int ret = 0;
859 863
860 mutex_lock(&bdev->bd_mutex); 864 mutex_lock(&bdev->bd_mutex);
861 865
862 WARN_ON_ONCE(!bdev->bd_holder); 866 WARN_ON_ONCE(!bdev->bd_holder);
863 867
864 /* FIXME: remove the following once add_disk() handles errors */ 868 /* FIXME: remove the following once add_disk() handles errors */
865 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) 869 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
866 goto out_unlock; 870 goto out_unlock;
867 871
868 holder = bd_find_holder_disk(bdev, disk); 872 holder = bd_find_holder_disk(bdev, disk);
869 if (holder) { 873 if (holder) {
870 holder->refcnt++; 874 holder->refcnt++;
871 goto out_unlock; 875 goto out_unlock;
872 } 876 }
873 877
874 holder = kzalloc(sizeof(*holder), GFP_KERNEL); 878 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
875 if (!holder) { 879 if (!holder) {
876 ret = -ENOMEM; 880 ret = -ENOMEM;
877 goto out_unlock; 881 goto out_unlock;
878 } 882 }
879 883
880 INIT_LIST_HEAD(&holder->list); 884 INIT_LIST_HEAD(&holder->list);
881 holder->disk = disk; 885 holder->disk = disk;
882 holder->refcnt = 1; 886 holder->refcnt = 1;
883 887
884 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 888 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
885 if (ret) 889 if (ret)
886 goto out_free; 890 goto out_free;
887 891
888 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 892 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
889 if (ret) 893 if (ret)
890 goto out_del; 894 goto out_del;
891 /* 895 /*
892 * bdev could be deleted beneath us which would implicitly destroy 896 * bdev could be deleted beneath us which would implicitly destroy
893 * the holder directory. Hold on to it. 897 * the holder directory. Hold on to it.
894 */ 898 */
895 kobject_get(bdev->bd_part->holder_dir); 899 kobject_get(bdev->bd_part->holder_dir);
896 900
897 list_add(&holder->list, &bdev->bd_holder_disks); 901 list_add(&holder->list, &bdev->bd_holder_disks);
898 goto out_unlock; 902 goto out_unlock;
899 903
900 out_del: 904 out_del:
901 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 905 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
902 out_free: 906 out_free:
903 kfree(holder); 907 kfree(holder);
904 out_unlock: 908 out_unlock:
905 mutex_unlock(&bdev->bd_mutex); 909 mutex_unlock(&bdev->bd_mutex);
906 return ret; 910 return ret;
907 } 911 }
908 EXPORT_SYMBOL_GPL(bd_link_disk_holder); 912 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
909 913
910 /** 914 /**
911 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() 915 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
912 * @bdev: the calimed slave bdev 916 * @bdev: the calimed slave bdev
913 * @disk: the holding disk 917 * @disk: the holding disk
914 * 918 *
915 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. 919 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
916 * 920 *
917 * CONTEXT: 921 * CONTEXT:
918 * Might sleep. 922 * Might sleep.
919 */ 923 */
920 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) 924 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
921 { 925 {
922 struct bd_holder_disk *holder; 926 struct bd_holder_disk *holder;
923 927
924 mutex_lock(&bdev->bd_mutex); 928 mutex_lock(&bdev->bd_mutex);
925 929
926 holder = bd_find_holder_disk(bdev, disk); 930 holder = bd_find_holder_disk(bdev, disk);
927 931
928 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { 932 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
929 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 933 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
930 del_symlink(bdev->bd_part->holder_dir, 934 del_symlink(bdev->bd_part->holder_dir,
931 &disk_to_dev(disk)->kobj); 935 &disk_to_dev(disk)->kobj);
932 kobject_put(bdev->bd_part->holder_dir); 936 kobject_put(bdev->bd_part->holder_dir);
933 list_del_init(&holder->list); 937 list_del_init(&holder->list);
934 kfree(holder); 938 kfree(holder);
935 } 939 }
936 940
937 mutex_unlock(&bdev->bd_mutex); 941 mutex_unlock(&bdev->bd_mutex);
938 } 942 }
939 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); 943 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
940 #endif 944 #endif
941 945
942 /** 946 /**
943 * flush_disk - invalidates all buffer-cache entries on a disk 947 * flush_disk - invalidates all buffer-cache entries on a disk
944 * 948 *
945 * @bdev: struct block device to be flushed 949 * @bdev: struct block device to be flushed
946 * @kill_dirty: flag to guide handling of dirty inodes 950 * @kill_dirty: flag to guide handling of dirty inodes
947 * 951 *
948 * Invalidates all buffer-cache entries on a disk. It should be called 952 * Invalidates all buffer-cache entries on a disk. It should be called
949 * when a disk has been changed -- either by a media change or online 953 * when a disk has been changed -- either by a media change or online
950 * resize. 954 * resize.
951 */ 955 */
952 static void flush_disk(struct block_device *bdev, bool kill_dirty) 956 static void flush_disk(struct block_device *bdev, bool kill_dirty)
953 { 957 {
954 if (__invalidate_device(bdev, kill_dirty)) { 958 if (__invalidate_device(bdev, kill_dirty)) {
955 char name[BDEVNAME_SIZE] = ""; 959 char name[BDEVNAME_SIZE] = "";
956 960
957 if (bdev->bd_disk) 961 if (bdev->bd_disk)
958 disk_name(bdev->bd_disk, 0, name); 962 disk_name(bdev->bd_disk, 0, name);
959 printk(KERN_WARNING "VFS: busy inodes on changed media or " 963 printk(KERN_WARNING "VFS: busy inodes on changed media or "
960 "resized disk %s\n", name); 964 "resized disk %s\n", name);
961 } 965 }
962 966
963 if (!bdev->bd_disk) 967 if (!bdev->bd_disk)
964 return; 968 return;
965 if (disk_partitionable(bdev->bd_disk)) 969 if (disk_partitionable(bdev->bd_disk))
966 bdev->bd_invalidated = 1; 970 bdev->bd_invalidated = 1;
967 } 971 }
968 972
969 /** 973 /**
970 * check_disk_size_change - checks for disk size change and adjusts bdev size. 974 * check_disk_size_change - checks for disk size change and adjusts bdev size.
971 * @disk: struct gendisk to check 975 * @disk: struct gendisk to check
972 * @bdev: struct bdev to adjust. 976 * @bdev: struct bdev to adjust.
973 * 977 *
974 * This routine checks to see if the bdev size does not match the disk size 978 * This routine checks to see if the bdev size does not match the disk size
975 * and adjusts it if it differs. 979 * and adjusts it if it differs.
976 */ 980 */
977 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) 981 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
978 { 982 {
979 loff_t disk_size, bdev_size; 983 loff_t disk_size, bdev_size;
980 984
981 disk_size = (loff_t)get_capacity(disk) << 9; 985 disk_size = (loff_t)get_capacity(disk) << 9;
982 bdev_size = i_size_read(bdev->bd_inode); 986 bdev_size = i_size_read(bdev->bd_inode);
983 if (disk_size != bdev_size) { 987 if (disk_size != bdev_size) {
984 char name[BDEVNAME_SIZE]; 988 char name[BDEVNAME_SIZE];
985 989
986 disk_name(disk, 0, name); 990 disk_name(disk, 0, name);
987 printk(KERN_INFO 991 printk(KERN_INFO
988 "%s: detected capacity change from %lld to %lld\n", 992 "%s: detected capacity change from %lld to %lld\n",
989 name, bdev_size, disk_size); 993 name, bdev_size, disk_size);
990 i_size_write(bdev->bd_inode, disk_size); 994 i_size_write(bdev->bd_inode, disk_size);
991 flush_disk(bdev, false); 995 flush_disk(bdev, false);
992 } 996 }
993 } 997 }
994 EXPORT_SYMBOL(check_disk_size_change); 998 EXPORT_SYMBOL(check_disk_size_change);
995 999
996 /** 1000 /**
997 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back 1001 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
998 * @disk: struct gendisk to be revalidated 1002 * @disk: struct gendisk to be revalidated
999 * 1003 *
1000 * This routine is a wrapper for lower-level driver's revalidate_disk 1004 * This routine is a wrapper for lower-level driver's revalidate_disk
1001 * call-backs. It is used to do common pre and post operations needed 1005 * call-backs. It is used to do common pre and post operations needed
1002 * for all revalidate_disk operations. 1006 * for all revalidate_disk operations.
1003 */ 1007 */
1004 int revalidate_disk(struct gendisk *disk) 1008 int revalidate_disk(struct gendisk *disk)
1005 { 1009 {
1006 struct block_device *bdev; 1010 struct block_device *bdev;
1007 int ret = 0; 1011 int ret = 0;
1008 1012
1009 if (disk->fops->revalidate_disk) 1013 if (disk->fops->revalidate_disk)
1010 ret = disk->fops->revalidate_disk(disk); 1014 ret = disk->fops->revalidate_disk(disk);
1011 1015
1012 bdev = bdget_disk(disk, 0); 1016 bdev = bdget_disk(disk, 0);
1013 if (!bdev) 1017 if (!bdev)
1014 return ret; 1018 return ret;
1015 1019
1016 mutex_lock(&bdev->bd_mutex); 1020 mutex_lock(&bdev->bd_mutex);
1017 check_disk_size_change(disk, bdev); 1021 check_disk_size_change(disk, bdev);
1018 mutex_unlock(&bdev->bd_mutex); 1022 mutex_unlock(&bdev->bd_mutex);
1019 bdput(bdev); 1023 bdput(bdev);
1020 return ret; 1024 return ret;
1021 } 1025 }
1022 EXPORT_SYMBOL(revalidate_disk); 1026 EXPORT_SYMBOL(revalidate_disk);
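Editorial note, not part of the diff: a minimal, hedged sketch of how a lower-level driver might pick up a capacity change through this wrapper. The function name and new_sectors parameter are hypothetical; only set_capacity() and revalidate_disk() are the real interfaces referenced above, and revalidate_disk() in turn runs the driver's ->revalidate_disk() callback (if any) and check_disk_size_change() to resync bdev->bd_inode->i_size.

#include <linux/fs.h>
#include <linux/genhd.h>

/* Hypothetical virtual-disk driver path run after its backing store grew. */
static void example_disk_resized(struct gendisk *disk, sector_t new_sectors)
{
	set_capacity(disk, new_sectors);	/* update the gendisk capacity */
	revalidate_disk(disk);			/* propagate it to the bdev inode */
}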
1023 1027
1024 /* 1028 /*
1025 * This routine checks whether a removable media has been changed, 1029 * This routine checks whether a removable media has been changed,
1026 * and invalidates all buffer-cache-entries in that case. This 1030 * and invalidates all buffer-cache-entries in that case. This
1027 * is a relatively slow routine, so we have to try to minimize using 1031 * is a relatively slow routine, so we have to try to minimize using
1028 * it. Thus it is called only upon a 'mount' or 'open'. This 1032 * it. Thus it is called only upon a 'mount' or 'open'. This
1029 * is the best way of combining speed and utility, I think. 1033 * is the best way of combining speed and utility, I think.
1030 * People changing diskettes in the middle of an operation deserve 1034 * People changing diskettes in the middle of an operation deserve
1031 * to lose :-) 1035 * to lose :-)
1032 */ 1036 */
1033 int check_disk_change(struct block_device *bdev) 1037 int check_disk_change(struct block_device *bdev)
1034 { 1038 {
1035 struct gendisk *disk = bdev->bd_disk; 1039 struct gendisk *disk = bdev->bd_disk;
1036 const struct block_device_operations *bdops = disk->fops; 1040 const struct block_device_operations *bdops = disk->fops;
1037 unsigned int events; 1041 unsigned int events;
1038 1042
1039 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | 1043 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1040 DISK_EVENT_EJECT_REQUEST); 1044 DISK_EVENT_EJECT_REQUEST);
1041 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1045 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1042 return 0; 1046 return 0;
1043 1047
1044 flush_disk(bdev, true); 1048 flush_disk(bdev, true);
1045 if (bdops->revalidate_disk) 1049 if (bdops->revalidate_disk)
1046 bdops->revalidate_disk(bdev->bd_disk); 1050 bdops->revalidate_disk(bdev->bd_disk);
1047 return 1; 1051 return 1;
1048 } 1052 }
1049 1053
1050 EXPORT_SYMBOL(check_disk_change); 1054 EXPORT_SYMBOL(check_disk_change);
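Editorial note: the usual call site for check_disk_change() is a removable-media driver's ->open(), so a media swap invalidates stale buffers before any new I/O. A hedged sketch; the driver method below is hypothetical and only the check_disk_change() call reflects the interface shown above.

#include <linux/fs.h>
#include <linux/genhd.h>

/* Hypothetical block_device_operations ->open() implementation. */
static int example_open(struct block_device *bdev, fmode_t mode)
{
	/* Flushes the buffer cache and revalidates if the media changed. */
	check_disk_change(bdev);
	return 0;
}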
1051 1055
1052 void bd_set_size(struct block_device *bdev, loff_t size) 1056 void bd_set_size(struct block_device *bdev, loff_t size)
1053 { 1057 {
1054 unsigned bsize = bdev_logical_block_size(bdev); 1058 unsigned bsize = bdev_logical_block_size(bdev);
1055 1059
1056 bdev->bd_inode->i_size = size; 1060 bdev->bd_inode->i_size = size;
1057 while (bsize < PAGE_CACHE_SIZE) { 1061 while (bsize < PAGE_CACHE_SIZE) {
1058 if (size & bsize) 1062 if (size & bsize)
1059 break; 1063 break;
1060 bsize <<= 1; 1064 bsize <<= 1;
1061 } 1065 }
1062 bdev->bd_block_size = bsize; 1066 bdev->bd_block_size = bsize;
1063 bdev->bd_inode->i_blkbits = blksize_bits(bsize); 1067 bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1064 } 1068 }
1065 EXPORT_SYMBOL(bd_set_size); 1069 EXPORT_SYMBOL(bd_set_size);
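Editorial note: the loop in bd_set_size() picks the largest power-of-two soft block size, capped at the page size, that evenly divides the device size. A standalone worked example of the same arithmetic (user-space C; PAGE_CACHE_SIZE is assumed to be 4096 here):

#include <stdio.h>

/* Mirrors the rounding loop in bd_set_size(): start from the logical
 * block size and double it while it still divides the device size,
 * capping at the assumed 4096-byte page size. */
static unsigned pick_block_size(unsigned long long size, unsigned bsize)
{
	while (bsize < 4096) {
		if (size & bsize)	/* bsize no longer divides size */
			break;
		bsize <<= 1;
	}
	return bsize;
}

int main(void)
{
	/* 1 GiB device: divisible by 4096, so the soft block size is 4096. */
	printf("%u\n", pick_block_size(1024ULL * 1024 * 1024, 512));
	/* 1000000000-byte device: 512 divides it but 1024 does not -> 512. */
	printf("%u\n", pick_block_size(1000000000ULL, 512));
	return 0;
}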
1066 1070
1067 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); 1071 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1068 1072
1069 /* 1073 /*
1070 * bd_mutex locking: 1074 * bd_mutex locking:
1071 * 1075 *
1072 * mutex_lock(part->bd_mutex) 1076 * mutex_lock(part->bd_mutex)
1073 * mutex_lock_nested(whole->bd_mutex, 1) 1077 * mutex_lock_nested(whole->bd_mutex, 1)
1074 */ 1078 */
1075 1079
1076 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) 1080 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1077 { 1081 {
1078 struct gendisk *disk; 1082 struct gendisk *disk;
1079 int ret; 1083 int ret;
1080 int partno; 1084 int partno;
1081 int perm = 0; 1085 int perm = 0;
1082 1086
1083 if (mode & FMODE_READ) 1087 if (mode & FMODE_READ)
1084 perm |= MAY_READ; 1088 perm |= MAY_READ;
1085 if (mode & FMODE_WRITE) 1089 if (mode & FMODE_WRITE)
1086 perm |= MAY_WRITE; 1090 perm |= MAY_WRITE;
1087 /* 1091 /*
1088 * hooks: /n/, see "layering violations". 1092 * hooks: /n/, see "layering violations".
1089 */ 1093 */
1090 if (!for_part) { 1094 if (!for_part) {
1091 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1095 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1092 if (ret != 0) { 1096 if (ret != 0) {
1093 bdput(bdev); 1097 bdput(bdev);
1094 return ret; 1098 return ret;
1095 } 1099 }
1096 } 1100 }
1097 1101
1098 restart: 1102 restart:
1099 1103
1100 ret = -ENXIO; 1104 ret = -ENXIO;
1101 disk = get_gendisk(bdev->bd_dev, &partno); 1105 disk = get_gendisk(bdev->bd_dev, &partno);
1102 if (!disk) 1106 if (!disk)
1103 goto out; 1107 goto out;
1104 1108
1105 disk_block_events(disk); 1109 disk_block_events(disk);
1106 mutex_lock_nested(&bdev->bd_mutex, for_part); 1110 mutex_lock_nested(&bdev->bd_mutex, for_part);
1107 if (!bdev->bd_openers) { 1111 if (!bdev->bd_openers) {
1108 bdev->bd_disk = disk; 1112 bdev->bd_disk = disk;
1109 bdev->bd_contains = bdev; 1113 bdev->bd_contains = bdev;
1110 if (!partno) { 1114 if (!partno) {
1111 struct backing_dev_info *bdi; 1115 struct backing_dev_info *bdi;
1112 1116
1113 ret = -ENXIO; 1117 ret = -ENXIO;
1114 bdev->bd_part = disk_get_part(disk, partno); 1118 bdev->bd_part = disk_get_part(disk, partno);
1115 if (!bdev->bd_part) 1119 if (!bdev->bd_part)
1116 goto out_clear; 1120 goto out_clear;
1117 1121
1118 ret = 0; 1122 ret = 0;
1119 if (disk->fops->open) { 1123 if (disk->fops->open) {
1120 ret = disk->fops->open(bdev, mode); 1124 ret = disk->fops->open(bdev, mode);
1121 if (ret == -ERESTARTSYS) { 1125 if (ret == -ERESTARTSYS) {
1122 /* Lost a race with 'disk' being 1126 /* Lost a race with 'disk' being
1123 * deleted, try again. 1127 * deleted, try again.
1124 * See md.c 1128 * See md.c
1125 */ 1129 */
1126 disk_put_part(bdev->bd_part); 1130 disk_put_part(bdev->bd_part);
1127 bdev->bd_part = NULL; 1131 bdev->bd_part = NULL;
1128 bdev->bd_disk = NULL; 1132 bdev->bd_disk = NULL;
1129 mutex_unlock(&bdev->bd_mutex); 1133 mutex_unlock(&bdev->bd_mutex);
1130 disk_unblock_events(disk); 1134 disk_unblock_events(disk);
1131 module_put(disk->fops->owner); 1135 module_put(disk->fops->owner);
1132 put_disk(disk); 1136 put_disk(disk);
1133 goto restart; 1137 goto restart;
1134 } 1138 }
1135 } 1139 }
1136 1140
1137 if (!ret && !bdev->bd_openers) { 1141 if (!ret && !bdev->bd_openers) {
1138 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1142 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1139 bdi = blk_get_backing_dev_info(bdev); 1143 bdi = blk_get_backing_dev_info(bdev);
1140 if (bdi == NULL) 1144 if (bdi == NULL)
1141 bdi = &default_backing_dev_info; 1145 bdi = &default_backing_dev_info;
1142 bdev_inode_switch_bdi(bdev->bd_inode, bdi); 1146 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1143 } 1147 }
1144 1148
1145 /* 1149 /*
1146 * If the device is invalidated, rescan partition 1150 * If the device is invalidated, rescan partition
1147 * if open succeeded or failed with -ENOMEDIUM. 1151 * if open succeeded or failed with -ENOMEDIUM.
1148 * The latter is necessary to prevent ghost 1152 * The latter is necessary to prevent ghost
1149 * partitions on a removed medium. 1153 * partitions on a removed medium.
1150 */ 1154 */
1151 if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) 1155 if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
1152 rescan_partitions(disk, bdev); 1156 rescan_partitions(disk, bdev);
1153 if (ret) 1157 if (ret)
1154 goto out_clear; 1158 goto out_clear;
1155 } else { 1159 } else {
1156 struct block_device *whole; 1160 struct block_device *whole;
1157 whole = bdget_disk(disk, 0); 1161 whole = bdget_disk(disk, 0);
1158 ret = -ENOMEM; 1162 ret = -ENOMEM;
1159 if (!whole) 1163 if (!whole)
1160 goto out_clear; 1164 goto out_clear;
1161 BUG_ON(for_part); 1165 BUG_ON(for_part);
1162 ret = __blkdev_get(whole, mode, 1); 1166 ret = __blkdev_get(whole, mode, 1);
1163 if (ret) 1167 if (ret)
1164 goto out_clear; 1168 goto out_clear;
1165 bdev->bd_contains = whole; 1169 bdev->bd_contains = whole;
1166 bdev_inode_switch_bdi(bdev->bd_inode, 1170 bdev_inode_switch_bdi(bdev->bd_inode,
1167 whole->bd_inode->i_data.backing_dev_info); 1171 whole->bd_inode->i_data.backing_dev_info);
1168 bdev->bd_part = disk_get_part(disk, partno); 1172 bdev->bd_part = disk_get_part(disk, partno);
1169 if (!(disk->flags & GENHD_FL_UP) || 1173 if (!(disk->flags & GENHD_FL_UP) ||
1170 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1174 !bdev->bd_part || !bdev->bd_part->nr_sects) {
1171 ret = -ENXIO; 1175 ret = -ENXIO;
1172 goto out_clear; 1176 goto out_clear;
1173 } 1177 }
1174 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1178 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1175 } 1179 }
1176 } else { 1180 } else {
1177 if (bdev->bd_contains == bdev) { 1181 if (bdev->bd_contains == bdev) {
1178 ret = 0; 1182 ret = 0;
1179 if (bdev->bd_disk->fops->open) 1183 if (bdev->bd_disk->fops->open)
1180 ret = bdev->bd_disk->fops->open(bdev, mode); 1184 ret = bdev->bd_disk->fops->open(bdev, mode);
1181 /* the same as first opener case, read comment there */ 1185 /* the same as first opener case, read comment there */
1182 if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) 1186 if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
1183 rescan_partitions(bdev->bd_disk, bdev); 1187 rescan_partitions(bdev->bd_disk, bdev);
1184 if (ret) 1188 if (ret)
1185 goto out_unlock_bdev; 1189 goto out_unlock_bdev;
1186 } 1190 }
1187 /* only one opener holds refs to the module and disk */ 1191 /* only one opener holds refs to the module and disk */
1188 module_put(disk->fops->owner); 1192 module_put(disk->fops->owner);
1189 put_disk(disk); 1193 put_disk(disk);
1190 } 1194 }
1191 bdev->bd_openers++; 1195 bdev->bd_openers++;
1192 if (for_part) 1196 if (for_part)
1193 bdev->bd_part_count++; 1197 bdev->bd_part_count++;
1194 mutex_unlock(&bdev->bd_mutex); 1198 mutex_unlock(&bdev->bd_mutex);
1195 disk_unblock_events(disk); 1199 disk_unblock_events(disk);
1196 return 0; 1200 return 0;
1197 1201
1198 out_clear: 1202 out_clear:
1199 disk_put_part(bdev->bd_part); 1203 disk_put_part(bdev->bd_part);
1200 bdev->bd_disk = NULL; 1204 bdev->bd_disk = NULL;
1201 bdev->bd_part = NULL; 1205 bdev->bd_part = NULL;
1202 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); 1206 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1203 if (bdev != bdev->bd_contains) 1207 if (bdev != bdev->bd_contains)
1204 __blkdev_put(bdev->bd_contains, mode, 1); 1208 __blkdev_put(bdev->bd_contains, mode, 1);
1205 bdev->bd_contains = NULL; 1209 bdev->bd_contains = NULL;
1206 out_unlock_bdev: 1210 out_unlock_bdev:
1207 mutex_unlock(&bdev->bd_mutex); 1211 mutex_unlock(&bdev->bd_mutex);
1208 disk_unblock_events(disk); 1212 disk_unblock_events(disk);
1209 module_put(disk->fops->owner); 1213 module_put(disk->fops->owner);
1210 put_disk(disk); 1214 put_disk(disk);
1211 out: 1215 out:
1212 bdput(bdev); 1216 bdput(bdev);
1213 1217
1214 return ret; 1218 return ret;
1215 } 1219 }
1216 1220
1217 /** 1221 /**
1218 * blkdev_get - open a block device 1222 * blkdev_get - open a block device
1219 * @bdev: block_device to open 1223 * @bdev: block_device to open
1220 * @mode: FMODE_* mask 1224 * @mode: FMODE_* mask
1221 * @holder: exclusive holder identifier 1225 * @holder: exclusive holder identifier
1222 * 1226 *
1223 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is 1227 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1224 * open with exclusive access. Specifying %FMODE_EXCL with %NULL 1228 * open with exclusive access. Specifying %FMODE_EXCL with %NULL
1225 * @holder is invalid. Exclusive opens may nest for the same @holder. 1229 * @holder is invalid. Exclusive opens may nest for the same @holder.
1226 * 1230 *
1227 * On success, the reference count of @bdev is unchanged. On failure, 1231 * On success, the reference count of @bdev is unchanged. On failure,
1228 * @bdev is put. 1232 * @bdev is put.
1229 * 1233 *
1230 * CONTEXT: 1234 * CONTEXT:
1231 * Might sleep. 1235 * Might sleep.
1232 * 1236 *
1233 * RETURNS: 1237 * RETURNS:
1234 * 0 on success, -errno on failure. 1238 * 0 on success, -errno on failure.
1235 */ 1239 */
1236 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) 1240 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1237 { 1241 {
1238 struct block_device *whole = NULL; 1242 struct block_device *whole = NULL;
1239 int res; 1243 int res;
1240 1244
1241 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); 1245 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1242 1246
1243 if ((mode & FMODE_EXCL) && holder) { 1247 if ((mode & FMODE_EXCL) && holder) {
1244 whole = bd_start_claiming(bdev, holder); 1248 whole = bd_start_claiming(bdev, holder);
1245 if (IS_ERR(whole)) { 1249 if (IS_ERR(whole)) {
1246 bdput(bdev); 1250 bdput(bdev);
1247 return PTR_ERR(whole); 1251 return PTR_ERR(whole);
1248 } 1252 }
1249 } 1253 }
1250 1254
1251 res = __blkdev_get(bdev, mode, 0); 1255 res = __blkdev_get(bdev, mode, 0);
1252 1256
1253 if (whole) { 1257 if (whole) {
1254 struct gendisk *disk = whole->bd_disk; 1258 struct gendisk *disk = whole->bd_disk;
1255 1259
1256 /* finish claiming */ 1260 /* finish claiming */
1257 mutex_lock(&bdev->bd_mutex); 1261 mutex_lock(&bdev->bd_mutex);
1258 spin_lock(&bdev_lock); 1262 spin_lock(&bdev_lock);
1259 1263
1260 if (!res) { 1264 if (!res) {
1261 BUG_ON(!bd_may_claim(bdev, whole, holder)); 1265 BUG_ON(!bd_may_claim(bdev, whole, holder));
1262 /* 1266 /*
1263 * Note that for a whole device bd_holders 1267 * Note that for a whole device bd_holders
1264 * will be incremented twice, and bd_holder 1268 * will be incremented twice, and bd_holder
1265 * will be set to bd_may_claim before being 1269 * will be set to bd_may_claim before being
1266 * set to holder 1270 * set to holder
1267 */ 1271 */
1268 whole->bd_holders++; 1272 whole->bd_holders++;
1269 whole->bd_holder = bd_may_claim; 1273 whole->bd_holder = bd_may_claim;
1270 bdev->bd_holders++; 1274 bdev->bd_holders++;
1271 bdev->bd_holder = holder; 1275 bdev->bd_holder = holder;
1272 } 1276 }
1273 1277
1274 /* tell others that we're done */ 1278 /* tell others that we're done */
1275 BUG_ON(whole->bd_claiming != holder); 1279 BUG_ON(whole->bd_claiming != holder);
1276 whole->bd_claiming = NULL; 1280 whole->bd_claiming = NULL;
1277 wake_up_bit(&whole->bd_claiming, 0); 1281 wake_up_bit(&whole->bd_claiming, 0);
1278 1282
1279 spin_unlock(&bdev_lock); 1283 spin_unlock(&bdev_lock);
1280 1284
1281 /* 1285 /*
1282 * Block event polling for write claims if requested. Any 1286 * Block event polling for write claims if requested. Any
1283 * write holder makes the write_holder state stick until 1287 * write holder makes the write_holder state stick until
1284 * all are released. This is good enough and tracking 1288 * all are released. This is good enough and tracking
1285 * individual writeable references is too fragile given the 1289 * individual writeable references is too fragile given the
1286 * way @mode is used in blkdev_get/put(). 1290 * way @mode is used in blkdev_get/put().
1287 */ 1291 */
1288 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && 1292 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1289 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { 1293 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
1290 bdev->bd_write_holder = true; 1294 bdev->bd_write_holder = true;
1291 disk_block_events(disk); 1295 disk_block_events(disk);
1292 } 1296 }
1293 1297
1294 mutex_unlock(&bdev->bd_mutex); 1298 mutex_unlock(&bdev->bd_mutex);
1295 bdput(whole); 1299 bdput(whole);
1296 } 1300 }
1297 1301
1298 return res; 1302 return res;
1299 } 1303 }
1300 EXPORT_SYMBOL(blkdev_get); 1304 EXPORT_SYMBOL(blkdev_get);
1301 1305
1302 /** 1306 /**
1303 * blkdev_get_by_path - open a block device by name 1307 * blkdev_get_by_path - open a block device by name
1304 * @path: path to the block device to open 1308 * @path: path to the block device to open
1305 * @mode: FMODE_* mask 1309 * @mode: FMODE_* mask
1306 * @holder: exclusive holder identifier 1310 * @holder: exclusive holder identifier
1307 * 1311 *
1308 * Open the blockdevice described by the device file at @path. @mode 1312 * Open the blockdevice described by the device file at @path. @mode
1309 * and @holder are identical to blkdev_get(). 1313 * and @holder are identical to blkdev_get().
1310 * 1314 *
1311 * On success, the returned block_device has reference count of one. 1315 * On success, the returned block_device has reference count of one.
1312 * 1316 *
1313 * CONTEXT: 1317 * CONTEXT:
1314 * Might sleep. 1318 * Might sleep.
1315 * 1319 *
1316 * RETURNS: 1320 * RETURNS:
1317 * Pointer to block_device on success, ERR_PTR(-errno) on failure. 1321 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1318 */ 1322 */
1319 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, 1323 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1320 void *holder) 1324 void *holder)
1321 { 1325 {
1322 struct block_device *bdev; 1326 struct block_device *bdev;
1323 int err; 1327 int err;
1324 1328
1325 bdev = lookup_bdev(path); 1329 bdev = lookup_bdev(path);
1326 if (IS_ERR(bdev)) 1330 if (IS_ERR(bdev))
1327 return bdev; 1331 return bdev;
1328 1332
1329 err = blkdev_get(bdev, mode, holder); 1333 err = blkdev_get(bdev, mode, holder);
1330 if (err) 1334 if (err)
1331 return ERR_PTR(err); 1335 return ERR_PTR(err);
1332 1336
1333 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { 1337 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1334 blkdev_put(bdev, mode); 1338 blkdev_put(bdev, mode);
1335 return ERR_PTR(-EACCES); 1339 return ERR_PTR(-EACCES);
1336 } 1340 }
1337 1341
1338 return bdev; 1342 return bdev;
1339 } 1343 }
1340 EXPORT_SYMBOL(blkdev_get_by_path); 1344 EXPORT_SYMBOL(blkdev_get_by_path);
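Editorial note: a hedged usage sketch in the style of a mount path, claiming the device exclusively with an opaque holder cookie and releasing it with blkdev_put() using the same mode (FMODE_EXCL must be carried through so the claim is dropped). The helper name is hypothetical; blkdev_get_by_path() and blkdev_put() are the documented interfaces.

#include <linux/fs.h>
#include <linux/err.h>

/* Hypothetical: claim a block device by path for exclusive read/write use. */
static int example_probe(const char *path, void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);	/* ERR_PTR(-errno) on failure */

	/* ... exclusive access to the device here ... */

	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	return 0;
}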
1341 1345
1342 /** 1346 /**
1343 * blkdev_get_by_dev - open a block device by device number 1347 * blkdev_get_by_dev - open a block device by device number
1344 * @dev: device number of block device to open 1348 * @dev: device number of block device to open
1345 * @mode: FMODE_* mask 1349 * @mode: FMODE_* mask
1346 * @holder: exclusive holder identifier 1350 * @holder: exclusive holder identifier
1347 * 1351 *
1348 * Open the blockdevice described by device number @dev. @mode and 1352 * Open the blockdevice described by device number @dev. @mode and
1349 * @holder are identical to blkdev_get(). 1353 * @holder are identical to blkdev_get().
1350 * 1354 *
1351 * Use it ONLY if you really do not have anything better - i.e. when 1355 * Use it ONLY if you really do not have anything better - i.e. when
1352 * you are behind a truly sucky interface and all you are given is a 1356 * you are behind a truly sucky interface and all you are given is a
1353 * device number. _Never_ to be used for internal purposes. If you 1357 * device number. _Never_ to be used for internal purposes. If you
1354 * ever need it - reconsider your API. 1358 * ever need it - reconsider your API.
1355 * 1359 *
1356 * On success, the returned block_device has reference count of one. 1360 * On success, the returned block_device has reference count of one.
1357 * 1361 *
1358 * CONTEXT: 1362 * CONTEXT:
1359 * Might sleep. 1363 * Might sleep.
1360 * 1364 *
1361 * RETURNS: 1365 * RETURNS:
1362 * Pointer to block_device on success, ERR_PTR(-errno) on failure. 1366 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1363 */ 1367 */
1364 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) 1368 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1365 { 1369 {
1366 struct block_device *bdev; 1370 struct block_device *bdev;
1367 int err; 1371 int err;
1368 1372
1369 bdev = bdget(dev); 1373 bdev = bdget(dev);
1370 if (!bdev) 1374 if (!bdev)
1371 return ERR_PTR(-ENOMEM); 1375 return ERR_PTR(-ENOMEM);
1372 1376
1373 err = blkdev_get(bdev, mode, holder); 1377 err = blkdev_get(bdev, mode, holder);
1374 if (err) 1378 if (err)
1375 return ERR_PTR(err); 1379 return ERR_PTR(err);
1376 1380
1377 return bdev; 1381 return bdev;
1378 } 1382 }
1379 EXPORT_SYMBOL(blkdev_get_by_dev); 1383 EXPORT_SYMBOL(blkdev_get_by_dev);
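Editorial note: for completeness, a hedged sketch of the by-number variant. The major/minor pair below is only a placeholder example (8:0 is conventionally sda); per the kernel-doc above this interface should be a last resort.

#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/err.h>

/* Hypothetical: open whatever device currently sits at dev_t 8:0,
 * read-only and non-exclusive (NULL holder). */
static struct block_device *example_open_by_dev(void)
{
	return blkdev_get_by_dev(MKDEV(8, 0), FMODE_READ, NULL);
}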
1380 1384
1381 static int blkdev_open(struct inode * inode, struct file * filp) 1385 static int blkdev_open(struct inode * inode, struct file * filp)
1382 { 1386 {
1383 struct block_device *bdev; 1387 struct block_device *bdev;
1384 1388
1385 /* 1389 /*
1386 * Preserve backwards compatibility and allow large file access 1390 * Preserve backwards compatibility and allow large file access
1387 * even if userspace doesn't ask for it explicitly. Some mkfs 1391 * even if userspace doesn't ask for it explicitly. Some mkfs
1388 * binary needs it. We might want to drop this workaround 1392 * binary needs it. We might want to drop this workaround
1389 * during an unstable branch. 1393 * during an unstable branch.
1390 */ 1394 */
1391 filp->f_flags |= O_LARGEFILE; 1395 filp->f_flags |= O_LARGEFILE;
1392 1396
1393 if (filp->f_flags & O_NDELAY) 1397 if (filp->f_flags & O_NDELAY)
1394 filp->f_mode |= FMODE_NDELAY; 1398 filp->f_mode |= FMODE_NDELAY;
1395 if (filp->f_flags & O_EXCL) 1399 if (filp->f_flags & O_EXCL)
1396 filp->f_mode |= FMODE_EXCL; 1400 filp->f_mode |= FMODE_EXCL;
1397 if ((filp->f_flags & O_ACCMODE) == 3) 1401 if ((filp->f_flags & O_ACCMODE) == 3)
1398 filp->f_mode |= FMODE_WRITE_IOCTL; 1402 filp->f_mode |= FMODE_WRITE_IOCTL;
1399 1403
1400 bdev = bd_acquire(inode); 1404 bdev = bd_acquire(inode);
1401 if (bdev == NULL) 1405 if (bdev == NULL)
1402 return -ENOMEM; 1406 return -ENOMEM;
1403 1407
1404 filp->f_mapping = bdev->bd_inode->i_mapping; 1408 filp->f_mapping = bdev->bd_inode->i_mapping;
1405 1409
1406 return blkdev_get(bdev, filp->f_mode, filp); 1410 return blkdev_get(bdev, filp->f_mode, filp);
1407 } 1411 }
1408 1412
1409 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1413 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1410 { 1414 {
1411 int ret = 0; 1415 int ret = 0;
1412 struct gendisk *disk = bdev->bd_disk; 1416 struct gendisk *disk = bdev->bd_disk;
1413 struct block_device *victim = NULL; 1417 struct block_device *victim = NULL;
1414 1418
1415 mutex_lock_nested(&bdev->bd_mutex, for_part); 1419 mutex_lock_nested(&bdev->bd_mutex, for_part);
1416 if (for_part) 1420 if (for_part)
1417 bdev->bd_part_count--; 1421 bdev->bd_part_count--;
1418 1422
1419 if (!--bdev->bd_openers) { 1423 if (!--bdev->bd_openers) {
1420 WARN_ON_ONCE(bdev->bd_holders); 1424 WARN_ON_ONCE(bdev->bd_holders);
1421 sync_blockdev(bdev); 1425 sync_blockdev(bdev);
1422 kill_bdev(bdev); 1426 kill_bdev(bdev);
1423 } 1427 }
1424 if (bdev->bd_contains == bdev) { 1428 if (bdev->bd_contains == bdev) {
1425 if (disk->fops->release) 1429 if (disk->fops->release)
1426 ret = disk->fops->release(disk, mode); 1430 ret = disk->fops->release(disk, mode);
1427 } 1431 }
1428 if (!bdev->bd_openers) { 1432 if (!bdev->bd_openers) {
1429 struct module *owner = disk->fops->owner; 1433 struct module *owner = disk->fops->owner;
1430 1434
1431 put_disk(disk); 1435 put_disk(disk);
1432 module_put(owner); 1436 module_put(owner);
1433 disk_put_part(bdev->bd_part); 1437 disk_put_part(bdev->bd_part);
1434 bdev->bd_part = NULL; 1438 bdev->bd_part = NULL;
1435 bdev->bd_disk = NULL; 1439 bdev->bd_disk = NULL;
1436 bdev_inode_switch_bdi(bdev->bd_inode, 1440 bdev_inode_switch_bdi(bdev->bd_inode,
1437 &default_backing_dev_info); 1441 &default_backing_dev_info);
1438 if (bdev != bdev->bd_contains) 1442 if (bdev != bdev->bd_contains)
1439 victim = bdev->bd_contains; 1443 victim = bdev->bd_contains;
1440 bdev->bd_contains = NULL; 1444 bdev->bd_contains = NULL;
1441 } 1445 }
1442 mutex_unlock(&bdev->bd_mutex); 1446 mutex_unlock(&bdev->bd_mutex);
1443 bdput(bdev); 1447 bdput(bdev);
1444 if (victim) 1448 if (victim)
1445 __blkdev_put(victim, mode, 1); 1449 __blkdev_put(victim, mode, 1);
1446 return ret; 1450 return ret;
1447 } 1451 }
1448 1452
1449 int blkdev_put(struct block_device *bdev, fmode_t mode) 1453 int blkdev_put(struct block_device *bdev, fmode_t mode)
1450 { 1454 {
1451 mutex_lock(&bdev->bd_mutex); 1455 mutex_lock(&bdev->bd_mutex);
1452 1456
1453 if (mode & FMODE_EXCL) { 1457 if (mode & FMODE_EXCL) {
1454 bool bdev_free; 1458 bool bdev_free;
1455 1459
1456 /* 1460 /*
1457 * Release a claim on the device. The holder fields 1461 * Release a claim on the device. The holder fields
1458 * are protected with bdev_lock. bd_mutex is to 1462 * are protected with bdev_lock. bd_mutex is to
1459 * synchronize disk_holder unlinking. 1463 * synchronize disk_holder unlinking.
1460 */ 1464 */
1461 spin_lock(&bdev_lock); 1465 spin_lock(&bdev_lock);
1462 1466
1463 WARN_ON_ONCE(--bdev->bd_holders < 0); 1467 WARN_ON_ONCE(--bdev->bd_holders < 0);
1464 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); 1468 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1465 1469
1466 /* bd_contains might point to self, check in a separate step */ 1470 /* bd_contains might point to self, check in a separate step */
1467 if ((bdev_free = !bdev->bd_holders)) 1471 if ((bdev_free = !bdev->bd_holders))
1468 bdev->bd_holder = NULL; 1472 bdev->bd_holder = NULL;
1469 if (!bdev->bd_contains->bd_holders) 1473 if (!bdev->bd_contains->bd_holders)
1470 bdev->bd_contains->bd_holder = NULL; 1474 bdev->bd_contains->bd_holder = NULL;
1471 1475
1472 spin_unlock(&bdev_lock); 1476 spin_unlock(&bdev_lock);
1473 1477
1474 /* 1478 /*
1475 * If this was the last claim, remove holder link and 1479 * If this was the last claim, remove holder link and
1476 * unblock evpoll if it was a write holder. 1480 * unblock evpoll if it was a write holder.
1477 */ 1481 */
1478 if (bdev_free && bdev->bd_write_holder) { 1482 if (bdev_free && bdev->bd_write_holder) {
1479 disk_unblock_events(bdev->bd_disk); 1483 disk_unblock_events(bdev->bd_disk);
1480 bdev->bd_write_holder = false; 1484 bdev->bd_write_holder = false;
1481 } 1485 }
1482 } 1486 }
1483 1487
1484 /* 1488 /*
1485 * Trigger event checking and tell drivers to flush MEDIA_CHANGE 1489 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
1486 * event. This is to ensure detection of media removal commanded 1490 * event. This is to ensure detection of media removal commanded
1487 * from userland - e.g. eject(1). 1491 * from userland - e.g. eject(1).
1488 */ 1492 */
1489 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); 1493 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
1490 1494
1491 mutex_unlock(&bdev->bd_mutex); 1495 mutex_unlock(&bdev->bd_mutex);
1492 1496
1493 return __blkdev_put(bdev, mode, 0); 1497 return __blkdev_put(bdev, mode, 0);
1494 } 1498 }
1495 EXPORT_SYMBOL(blkdev_put); 1499 EXPORT_SYMBOL(blkdev_put);
1496 1500
1497 static int blkdev_close(struct inode * inode, struct file * filp) 1501 static int blkdev_close(struct inode * inode, struct file * filp)
1498 { 1502 {
1499 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1503 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1500 1504
1501 return blkdev_put(bdev, filp->f_mode); 1505 return blkdev_put(bdev, filp->f_mode);
1502 } 1506 }
1503 1507
1504 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) 1508 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1505 { 1509 {
1506 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1510 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1507 fmode_t mode = file->f_mode; 1511 fmode_t mode = file->f_mode;
1508 1512
1509 /* 1513 /*
1510 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have 1514 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1511 * to update it before every ioctl. 1515 * to update it before every ioctl.
1512 */ 1516 */
1513 if (file->f_flags & O_NDELAY) 1517 if (file->f_flags & O_NDELAY)
1514 mode |= FMODE_NDELAY; 1518 mode |= FMODE_NDELAY;
1515 else 1519 else
1516 mode &= ~FMODE_NDELAY; 1520 mode &= ~FMODE_NDELAY;
1517 1521
1518 return blkdev_ioctl(bdev, mode, cmd, arg); 1522 return blkdev_ioctl(bdev, mode, cmd, arg);
1519 } 1523 }
1520 1524
1521 /* 1525 /*
1522 * Write data to the block device. Only intended for the block device itself 1526 * Write data to the block device. Only intended for the block device itself
1523 * and the raw driver which basically is a fake block device. 1527 * and the raw driver which basically is a fake block device.
1524 * 1528 *
1525 * Does not take i_mutex for the write and thus is not for general purpose 1529 * Does not take i_mutex for the write and thus is not for general purpose
1526 * use. 1530 * use.
1527 */ 1531 */
1528 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, 1532 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1529 unsigned long nr_segs, loff_t pos) 1533 unsigned long nr_segs, loff_t pos)
1530 { 1534 {
1531 struct file *file = iocb->ki_filp; 1535 struct file *file = iocb->ki_filp;
1532 ssize_t ret; 1536 ssize_t ret;
1533 1537
1534 BUG_ON(iocb->ki_pos != pos); 1538 BUG_ON(iocb->ki_pos != pos);
1535 1539
1536 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1540 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1537 if (ret > 0 || ret == -EIOCBQUEUED) { 1541 if (ret > 0 || ret == -EIOCBQUEUED) {
1538 ssize_t err; 1542 ssize_t err;
1539 1543
1540 err = generic_write_sync(file, pos, ret); 1544 err = generic_write_sync(file, pos, ret);
1541 if (err < 0 && ret > 0) 1545 if (err < 0 && ret > 0)
1542 ret = err; 1546 ret = err;
1543 } 1547 }
1544 return ret; 1548 return ret;
1545 } 1549 }
1546 EXPORT_SYMBOL_GPL(blkdev_aio_write); 1550 EXPORT_SYMBOL_GPL(blkdev_aio_write);
1547 1551
1548 /* 1552 /*
1549 * Try to release a page associated with block device when the system 1553 * Try to release a page associated with block device when the system
1550 * is under memory pressure. 1554 * is under memory pressure.
1551 */ 1555 */
1552 static int blkdev_releasepage(struct page *page, gfp_t wait) 1556 static int blkdev_releasepage(struct page *page, gfp_t wait)
1553 { 1557 {
1554 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; 1558 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1555 1559
1556 if (super && super->s_op->bdev_try_to_free_page) 1560 if (super && super->s_op->bdev_try_to_free_page)
1557 return super->s_op->bdev_try_to_free_page(super, page, wait); 1561 return super->s_op->bdev_try_to_free_page(super, page, wait);
1558 1562
1559 return try_to_free_buffers(page); 1563 return try_to_free_buffers(page);
1560 } 1564 }
1561 1565
1562 static const struct address_space_operations def_blk_aops = { 1566 static const struct address_space_operations def_blk_aops = {
1563 .readpage = blkdev_readpage, 1567 .readpage = blkdev_readpage,
1564 .writepage = blkdev_writepage, 1568 .writepage = blkdev_writepage,
1565 .write_begin = blkdev_write_begin, 1569 .write_begin = blkdev_write_begin,
1566 .write_end = blkdev_write_end, 1570 .write_end = blkdev_write_end,
1567 .writepages = generic_writepages, 1571 .writepages = generic_writepages,
1568 .releasepage = blkdev_releasepage, 1572 .releasepage = blkdev_releasepage,
1569 .direct_IO = blkdev_direct_IO, 1573 .direct_IO = blkdev_direct_IO,
1570 }; 1574 };
1571 1575
1572 const struct file_operations def_blk_fops = { 1576 const struct file_operations def_blk_fops = {
1573 .open = blkdev_open, 1577 .open = blkdev_open,
1574 .release = blkdev_close, 1578 .release = blkdev_close,
1575 .llseek = block_llseek, 1579 .llseek = block_llseek,
1576 .read = do_sync_read, 1580 .read = do_sync_read,
1577 .write = do_sync_write, 1581 .write = do_sync_write,
1578 .aio_read = generic_file_aio_read, 1582 .aio_read = generic_file_aio_read,
1579 .aio_write = blkdev_aio_write, 1583 .aio_write = blkdev_aio_write,
1580 .mmap = generic_file_mmap, 1584 .mmap = generic_file_mmap,
1581 .fsync = blkdev_fsync, 1585 .fsync = blkdev_fsync,
1582 .unlocked_ioctl = block_ioctl, 1586 .unlocked_ioctl = block_ioctl,
1583 #ifdef CONFIG_COMPAT 1587 #ifdef CONFIG_COMPAT
1584 .compat_ioctl = compat_blkdev_ioctl, 1588 .compat_ioctl = compat_blkdev_ioctl,
1585 #endif 1589 #endif
1586 .splice_read = generic_file_splice_read, 1590 .splice_read = generic_file_splice_read,
1587 .splice_write = generic_file_splice_write, 1591 .splice_write = generic_file_splice_write,
1588 }; 1592 };
1589 1593
1590 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1594 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1591 { 1595 {
1592 int res; 1596 int res;
1593 mm_segment_t old_fs = get_fs(); 1597 mm_segment_t old_fs = get_fs();
1594 set_fs(KERNEL_DS); 1598 set_fs(KERNEL_DS);
1595 res = blkdev_ioctl(bdev, 0, cmd, arg); 1599 res = blkdev_ioctl(bdev, 0, cmd, arg);
1596 set_fs(old_fs); 1600 set_fs(old_fs);
1597 return res; 1601 return res;
1598 } 1602 }
1599 1603
1600 EXPORT_SYMBOL(ioctl_by_bdev); 1604 EXPORT_SYMBOL(ioctl_by_bdev);
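Editorial note: ioctl_by_bdev() temporarily switches to KERNEL_DS so that the copy_{to,from}_user() calls inside blkdev_ioctl() accept a kernel pointer in @arg. A hedged sketch of that pattern; BLKGETSIZE64 is a real block ioctl, but the helper itself is hypothetical and its return-value handling is omitted for brevity.

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/types.h>

/* Hypothetical: query the device size in bytes via the ioctl path
 * instead of reading bdev->bd_inode->i_size directly. */
static u64 example_size_via_ioctl(struct block_device *bdev)
{
	u64 bytes = 0;

	/* arg points into kernel memory; ioctl_by_bdev() makes that legal */
	ioctl_by_bdev(bdev, BLKGETSIZE64, (unsigned long)&bytes);
	return bytes;
}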
1601 1605
1602 /** 1606 /**
1603 * lookup_bdev - lookup a struct block_device by name 1607 * lookup_bdev - lookup a struct block_device by name
1604 * @pathname: special file representing the block device 1608 * @pathname: special file representing the block device
1605 * 1609 *
1606 * Get a reference to the blockdevice at @pathname in the current 1610 * Get a reference to the blockdevice at @pathname in the current
1607 * namespace if possible and return it. Return ERR_PTR(error) 1611 * namespace if possible and return it. Return ERR_PTR(error)
1608 * otherwise. 1612 * otherwise.
1609 */ 1613 */
1610 struct block_device *lookup_bdev(const char *pathname) 1614 struct block_device *lookup_bdev(const char *pathname)
1611 { 1615 {
1612 struct block_device *bdev; 1616 struct block_device *bdev;
1613 struct inode *inode; 1617 struct inode *inode;
1614 struct path path; 1618 struct path path;
1615 int error; 1619 int error;
1616 1620
1617 if (!pathname || !*pathname) 1621 if (!pathname || !*pathname)
1618 return ERR_PTR(-EINVAL); 1622 return ERR_PTR(-EINVAL);
1619 1623
1620 error = kern_path(pathname, LOOKUP_FOLLOW, &path); 1624 error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1621 if (error) 1625 if (error)
1622 return ERR_PTR(error); 1626 return ERR_PTR(error);
1623 1627
1624 inode = path.dentry->d_inode; 1628 inode = path.dentry->d_inode;
1625 error = -ENOTBLK; 1629 error = -ENOTBLK;
1626 if (!S_ISBLK(inode->i_mode)) 1630 if (!S_ISBLK(inode->i_mode))
1627 goto fail; 1631 goto fail;
1628 error = -EACCES; 1632 error = -EACCES;
1629 if (path.mnt->mnt_flags & MNT_NODEV) 1633 if (path.mnt->mnt_flags & MNT_NODEV)
1630 goto fail; 1634 goto fail;
1631 error = -ENOMEM; 1635 error = -ENOMEM;
1632 bdev = bd_acquire(inode); 1636 bdev = bd_acquire(inode);
1633 if (!bdev) 1637 if (!bdev)
1634 goto fail; 1638 goto fail;
1635 out: 1639 out:
1636 path_put(&path); 1640 path_put(&path);
1637 return bdev; 1641 return bdev;
1638 fail: 1642 fail:
1639 bdev = ERR_PTR(error); 1643 bdev = ERR_PTR(error);
1640 goto out; 1644 goto out;
1641 } 1645 }
1642 EXPORT_SYMBOL(lookup_bdev); 1646 EXPORT_SYMBOL(lookup_bdev);
1643 1647
1644 int __invalidate_device(struct block_device *bdev, bool kill_dirty) 1648 int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1645 { 1649 {
1646 struct super_block *sb = get_super(bdev); 1650 struct super_block *sb = get_super(bdev);
1647 int res = 0; 1651 int res = 0;
1648 1652
1649 if (sb) { 1653 if (sb) {
1650 /* 1654 /*
1651 * no need to lock the super, get_super holds the 1655 * no need to lock the super, get_super holds the
1652 * read mutex so the filesystem cannot go away 1656 * read mutex so the filesystem cannot go away
1653 * under us (->put_super runs with the write lock 1657 * under us (->put_super runs with the write lock
1654 * held). 1658 * held).
1655 */ 1659 */
1656 shrink_dcache_sb(sb); 1660 shrink_dcache_sb(sb);
1657 res = invalidate_inodes(sb, kill_dirty); 1661 res = invalidate_inodes(sb, kill_dirty);
1658 drop_super(sb); 1662 drop_super(sb);
1659 } 1663 }
1660 invalidate_bdev(bdev); 1664 invalidate_bdev(bdev);
1661 return res; 1665 return res;
1662 } 1666 }
1663 EXPORT_SYMBOL(__invalidate_device); 1667 EXPORT_SYMBOL(__invalidate_device);
fs/btrfs/extent_io.c
1 #include <linux/bitops.h> 1 #include <linux/bitops.h>
2 #include <linux/slab.h> 2 #include <linux/slab.h>
3 #include <linux/bio.h> 3 #include <linux/bio.h>
4 #include <linux/mm.h> 4 #include <linux/mm.h>
5 #include <linux/pagemap.h> 5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h> 6 #include <linux/page-flags.h>
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/spinlock.h> 8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h> 9 #include <linux/blkdev.h>
10 #include <linux/swap.h> 10 #include <linux/swap.h>
11 #include <linux/writeback.h> 11 #include <linux/writeback.h>
12 #include <linux/pagevec.h> 12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h> 13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h> 14 #include <linux/cleancache.h>
15 #include "extent_io.h" 15 #include "extent_io.h"
16 #include "extent_map.h" 16 #include "extent_map.h"
17 #include "compat.h" 17 #include "compat.h"
18 #include "ctree.h" 18 #include "ctree.h"
19 #include "btrfs_inode.h" 19 #include "btrfs_inode.h"
20 20
21 static struct kmem_cache *extent_state_cache; 21 static struct kmem_cache *extent_state_cache;
22 static struct kmem_cache *extent_buffer_cache; 22 static struct kmem_cache *extent_buffer_cache;
23 23
24 static LIST_HEAD(buffers); 24 static LIST_HEAD(buffers);
25 static LIST_HEAD(states); 25 static LIST_HEAD(states);
26 26
27 #define LEAK_DEBUG 0 27 #define LEAK_DEBUG 0
28 #if LEAK_DEBUG 28 #if LEAK_DEBUG
29 static DEFINE_SPINLOCK(leak_lock); 29 static DEFINE_SPINLOCK(leak_lock);
30 #endif 30 #endif
31 31
32 #define BUFFER_LRU_MAX 64 32 #define BUFFER_LRU_MAX 64
33 33
34 struct tree_entry { 34 struct tree_entry {
35 u64 start; 35 u64 start;
36 u64 end; 36 u64 end;
37 struct rb_node rb_node; 37 struct rb_node rb_node;
38 }; 38 };
39 39
40 struct extent_page_data { 40 struct extent_page_data {
41 struct bio *bio; 41 struct bio *bio;
42 struct extent_io_tree *tree; 42 struct extent_io_tree *tree;
43 get_extent_t *get_extent; 43 get_extent_t *get_extent;
44 44
45 /* tells writepage not to lock the state bits for this range 45 /* tells writepage not to lock the state bits for this range
46 * it still does the unlocking 46 * it still does the unlocking
47 */ 47 */
48 unsigned int extent_locked:1; 48 unsigned int extent_locked:1;
49 49
50 /* tells the submit_bio code to use a WRITE_SYNC */ 50 /* tells the submit_bio code to use a WRITE_SYNC */
51 unsigned int sync_io:1; 51 unsigned int sync_io:1;
52 }; 52 };
53 53
54 int __init extent_io_init(void) 54 int __init extent_io_init(void)
55 { 55 {
56 extent_state_cache = kmem_cache_create("extent_state", 56 extent_state_cache = kmem_cache_create("extent_state",
57 sizeof(struct extent_state), 0, 57 sizeof(struct extent_state), 0,
58 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 58 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
59 if (!extent_state_cache) 59 if (!extent_state_cache)
60 return -ENOMEM; 60 return -ENOMEM;
61 61
62 extent_buffer_cache = kmem_cache_create("extent_buffers", 62 extent_buffer_cache = kmem_cache_create("extent_buffers",
63 sizeof(struct extent_buffer), 0, 63 sizeof(struct extent_buffer), 0,
64 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 64 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
65 if (!extent_buffer_cache) 65 if (!extent_buffer_cache)
66 goto free_state_cache; 66 goto free_state_cache;
67 return 0; 67 return 0;
68 68
69 free_state_cache: 69 free_state_cache:
70 kmem_cache_destroy(extent_state_cache); 70 kmem_cache_destroy(extent_state_cache);
71 return -ENOMEM; 71 return -ENOMEM;
72 } 72 }
73 73
74 void extent_io_exit(void) 74 void extent_io_exit(void)
75 { 75 {
76 struct extent_state *state; 76 struct extent_state *state;
77 struct extent_buffer *eb; 77 struct extent_buffer *eb;
78 78
79 while (!list_empty(&states)) { 79 while (!list_empty(&states)) {
80 state = list_entry(states.next, struct extent_state, leak_list); 80 state = list_entry(states.next, struct extent_state, leak_list);
81 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 81 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
82 "state %lu in tree %p refs %d\n", 82 "state %lu in tree %p refs %d\n",
83 (unsigned long long)state->start, 83 (unsigned long long)state->start,
84 (unsigned long long)state->end, 84 (unsigned long long)state->end,
85 state->state, state->tree, atomic_read(&state->refs)); 85 state->state, state->tree, atomic_read(&state->refs));
86 list_del(&state->leak_list); 86 list_del(&state->leak_list);
87 kmem_cache_free(extent_state_cache, state); 87 kmem_cache_free(extent_state_cache, state);
88 88
89 } 89 }
90 90
91 while (!list_empty(&buffers)) { 91 while (!list_empty(&buffers)) {
92 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 92 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
93 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 93 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
94 "refs %d\n", (unsigned long long)eb->start, 94 "refs %d\n", (unsigned long long)eb->start,
95 eb->len, atomic_read(&eb->refs)); 95 eb->len, atomic_read(&eb->refs));
96 list_del(&eb->leak_list); 96 list_del(&eb->leak_list);
97 kmem_cache_free(extent_buffer_cache, eb); 97 kmem_cache_free(extent_buffer_cache, eb);
98 } 98 }
99 if (extent_state_cache) 99 if (extent_state_cache)
100 kmem_cache_destroy(extent_state_cache); 100 kmem_cache_destroy(extent_state_cache);
101 if (extent_buffer_cache) 101 if (extent_buffer_cache)
102 kmem_cache_destroy(extent_buffer_cache); 102 kmem_cache_destroy(extent_buffer_cache);
103 } 103 }
104 104
105 void extent_io_tree_init(struct extent_io_tree *tree, 105 void extent_io_tree_init(struct extent_io_tree *tree,
106 struct address_space *mapping) 106 struct address_space *mapping)
107 { 107 {
108 tree->state = RB_ROOT; 108 tree->state = RB_ROOT;
109 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 109 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
110 tree->ops = NULL; 110 tree->ops = NULL;
111 tree->dirty_bytes = 0; 111 tree->dirty_bytes = 0;
112 spin_lock_init(&tree->lock); 112 spin_lock_init(&tree->lock);
113 spin_lock_init(&tree->buffer_lock); 113 spin_lock_init(&tree->buffer_lock);
114 tree->mapping = mapping; 114 tree->mapping = mapping;
115 } 115 }
116 116
117 static struct extent_state *alloc_extent_state(gfp_t mask) 117 static struct extent_state *alloc_extent_state(gfp_t mask)
118 { 118 {
119 struct extent_state *state; 119 struct extent_state *state;
120 #if LEAK_DEBUG 120 #if LEAK_DEBUG
121 unsigned long flags; 121 unsigned long flags;
122 #endif 122 #endif
123 123
124 state = kmem_cache_alloc(extent_state_cache, mask); 124 state = kmem_cache_alloc(extent_state_cache, mask);
125 if (!state) 125 if (!state)
126 return state; 126 return state;
127 state->state = 0; 127 state->state = 0;
128 state->private = 0; 128 state->private = 0;
129 state->tree = NULL; 129 state->tree = NULL;
130 #if LEAK_DEBUG 130 #if LEAK_DEBUG
131 spin_lock_irqsave(&leak_lock, flags); 131 spin_lock_irqsave(&leak_lock, flags);
132 list_add(&state->leak_list, &states); 132 list_add(&state->leak_list, &states);
133 spin_unlock_irqrestore(&leak_lock, flags); 133 spin_unlock_irqrestore(&leak_lock, flags);
134 #endif 134 #endif
135 atomic_set(&state->refs, 1); 135 atomic_set(&state->refs, 1);
136 init_waitqueue_head(&state->wq); 136 init_waitqueue_head(&state->wq);
137 return state; 137 return state;
138 } 138 }
139 139
140 void free_extent_state(struct extent_state *state) 140 void free_extent_state(struct extent_state *state)
141 { 141 {
142 if (!state) 142 if (!state)
143 return; 143 return;
144 if (atomic_dec_and_test(&state->refs)) { 144 if (atomic_dec_and_test(&state->refs)) {
145 #if LEAK_DEBUG 145 #if LEAK_DEBUG
146 unsigned long flags; 146 unsigned long flags;
147 #endif 147 #endif
148 WARN_ON(state->tree); 148 WARN_ON(state->tree);
149 #if LEAK_DEBUG 149 #if LEAK_DEBUG
150 spin_lock_irqsave(&leak_lock, flags); 150 spin_lock_irqsave(&leak_lock, flags);
151 list_del(&state->leak_list); 151 list_del(&state->leak_list);
152 spin_unlock_irqrestore(&leak_lock, flags); 152 spin_unlock_irqrestore(&leak_lock, flags);
153 #endif 153 #endif
154 kmem_cache_free(extent_state_cache, state); 154 kmem_cache_free(extent_state_cache, state);
155 } 155 }
156 } 156 }
157 157
158 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 158 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
159 struct rb_node *node) 159 struct rb_node *node)
160 { 160 {
161 struct rb_node **p = &root->rb_node; 161 struct rb_node **p = &root->rb_node;
162 struct rb_node *parent = NULL; 162 struct rb_node *parent = NULL;
163 struct tree_entry *entry; 163 struct tree_entry *entry;
164 164
165 while (*p) { 165 while (*p) {
166 parent = *p; 166 parent = *p;
167 entry = rb_entry(parent, struct tree_entry, rb_node); 167 entry = rb_entry(parent, struct tree_entry, rb_node);
168 168
169 if (offset < entry->start) 169 if (offset < entry->start)
170 p = &(*p)->rb_left; 170 p = &(*p)->rb_left;
171 else if (offset > entry->end) 171 else if (offset > entry->end)
172 p = &(*p)->rb_right; 172 p = &(*p)->rb_right;
173 else 173 else
174 return parent; 174 return parent;
175 } 175 }
176 176
177 entry = rb_entry(node, struct tree_entry, rb_node); 177 entry = rb_entry(node, struct tree_entry, rb_node);
178 rb_link_node(node, parent, p); 178 rb_link_node(node, parent, p);
179 rb_insert_color(node, root); 179 rb_insert_color(node, root);
180 return NULL; 180 return NULL;
181 } 181 }
182 182
183 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 183 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
184 struct rb_node **prev_ret, 184 struct rb_node **prev_ret,
185 struct rb_node **next_ret) 185 struct rb_node **next_ret)
186 { 186 {
187 struct rb_root *root = &tree->state; 187 struct rb_root *root = &tree->state;
188 struct rb_node *n = root->rb_node; 188 struct rb_node *n = root->rb_node;
189 struct rb_node *prev = NULL; 189 struct rb_node *prev = NULL;
190 struct rb_node *orig_prev = NULL; 190 struct rb_node *orig_prev = NULL;
191 struct tree_entry *entry; 191 struct tree_entry *entry;
192 struct tree_entry *prev_entry = NULL; 192 struct tree_entry *prev_entry = NULL;
193 193
194 while (n) { 194 while (n) {
195 entry = rb_entry(n, struct tree_entry, rb_node); 195 entry = rb_entry(n, struct tree_entry, rb_node);
196 prev = n; 196 prev = n;
197 prev_entry = entry; 197 prev_entry = entry;
198 198
199 if (offset < entry->start) 199 if (offset < entry->start)
200 n = n->rb_left; 200 n = n->rb_left;
201 else if (offset > entry->end) 201 else if (offset > entry->end)
202 n = n->rb_right; 202 n = n->rb_right;
203 else 203 else
204 return n; 204 return n;
205 } 205 }
206 206
207 if (prev_ret) { 207 if (prev_ret) {
208 orig_prev = prev; 208 orig_prev = prev;
209 while (prev && offset > prev_entry->end) { 209 while (prev && offset > prev_entry->end) {
210 prev = rb_next(prev); 210 prev = rb_next(prev);
211 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 211 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
212 } 212 }
213 *prev_ret = prev; 213 *prev_ret = prev;
214 prev = orig_prev; 214 prev = orig_prev;
215 } 215 }
216 216
217 if (next_ret) { 217 if (next_ret) {
218 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 218 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
219 while (prev && offset < prev_entry->start) { 219 while (prev && offset < prev_entry->start) {
220 prev = rb_prev(prev); 220 prev = rb_prev(prev);
221 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 221 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222 } 222 }
223 *next_ret = prev; 223 *next_ret = prev;
224 } 224 }
225 return NULL; 225 return NULL;
226 } 226 }
227 227
228 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 228 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
229 u64 offset) 229 u64 offset)
230 { 230 {
231 struct rb_node *prev = NULL; 231 struct rb_node *prev = NULL;
232 struct rb_node *ret; 232 struct rb_node *ret;
233 233
234 ret = __etree_search(tree, offset, &prev, NULL); 234 ret = __etree_search(tree, offset, &prev, NULL);
235 if (!ret) 235 if (!ret)
236 return prev; 236 return prev;
237 return ret; 237 return ret;
238 } 238 }
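Editorial note: tree_search() returns the entry whose [start, end] range contains @offset or, failing that, the first entry that starts after it (the 'prev' pointer left behind by __etree_search()). A hedged, user-space analogue of that lookup rule over a plain sorted, non-overlapping array, just to make the semantics concrete; it is not the rb-tree implementation itself.

#include <stdio.h>

struct range { unsigned long long start, end; };

/* Conceptual analogue of tree_search(): index of the range containing
 * offset, else of the next range after it, else -1. Assumes the array
 * is sorted and non-overlapping, like the extent state tree. */
static int range_search(const struct range *r, int n, unsigned long long off)
{
	int i;

	for (i = 0; i < n; i++) {
		if (off < r[i].start)
			return i;	/* first range past the offset */
		if (off <= r[i].end)
			return i;	/* range containing the offset */
	}
	return -1;
}

int main(void)
{
	const struct range r[] = { { 0, 4095 }, { 8192, 12287 } };

	printf("%d\n", range_search(r, 2, 100));   /* 0: inside [0,4095]    */
	printf("%d\n", range_search(r, 2, 5000));  /* 1: next is [8192,...] */
	printf("%d\n", range_search(r, 2, 20000)); /* -1: past the end      */
	return 0;
}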
239 239
240 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 240 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
241 struct extent_state *other) 241 struct extent_state *other)
242 { 242 {
243 if (tree->ops && tree->ops->merge_extent_hook) 243 if (tree->ops && tree->ops->merge_extent_hook)
244 tree->ops->merge_extent_hook(tree->mapping->host, new, 244 tree->ops->merge_extent_hook(tree->mapping->host, new,
245 other); 245 other);
246 } 246 }
247 247
248 /* 248 /*
249 * utility function to look for merge candidates inside a given range. 249 * utility function to look for merge candidates inside a given range.
250 * Any extents with matching state are merged together into a single 250 * Any extents with matching state are merged together into a single
251 * extent in the tree. Extents with EXTENT_IOBITS in their state field 251 * extent in the tree. Extents with EXTENT_IOBITS in their state field
252 * are not merged because the end_io handlers need to be able to do 252 * are not merged because the end_io handlers need to be able to do
253 * operations on them without sleeping (or doing allocations/splits). 253 * operations on them without sleeping (or doing allocations/splits).
254 * 254 *
255 * This should be called with the tree lock held. 255 * This should be called with the tree lock held.
256 */ 256 */
257 static int merge_state(struct extent_io_tree *tree, 257 static int merge_state(struct extent_io_tree *tree,
258 struct extent_state *state) 258 struct extent_state *state)
259 { 259 {
260 struct extent_state *other; 260 struct extent_state *other;
261 struct rb_node *other_node; 261 struct rb_node *other_node;
262 262
263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
264 return 0; 264 return 0;
265 265
266 other_node = rb_prev(&state->rb_node); 266 other_node = rb_prev(&state->rb_node);
267 if (other_node) { 267 if (other_node) {
268 other = rb_entry(other_node, struct extent_state, rb_node); 268 other = rb_entry(other_node, struct extent_state, rb_node);
269 if (other->end == state->start - 1 && 269 if (other->end == state->start - 1 &&
270 other->state == state->state) { 270 other->state == state->state) {
271 merge_cb(tree, state, other); 271 merge_cb(tree, state, other);
272 state->start = other->start; 272 state->start = other->start;
273 other->tree = NULL; 273 other->tree = NULL;
274 rb_erase(&other->rb_node, &tree->state); 274 rb_erase(&other->rb_node, &tree->state);
275 free_extent_state(other); 275 free_extent_state(other);
276 } 276 }
277 } 277 }
278 other_node = rb_next(&state->rb_node); 278 other_node = rb_next(&state->rb_node);
279 if (other_node) { 279 if (other_node) {
280 other = rb_entry(other_node, struct extent_state, rb_node); 280 other = rb_entry(other_node, struct extent_state, rb_node);
281 if (other->start == state->end + 1 && 281 if (other->start == state->end + 1 &&
282 other->state == state->state) { 282 other->state == state->state) {
283 merge_cb(tree, state, other); 283 merge_cb(tree, state, other);
284 other->start = state->start; 284 other->start = state->start;
285 state->tree = NULL; 285 state->tree = NULL;
286 rb_erase(&state->rb_node, &tree->state); 286 rb_erase(&state->rb_node, &tree->state);
287 free_extent_state(state); 287 free_extent_state(state);
288 state = NULL; 288 state = NULL;
289 } 289 }
290 } 290 }
291 291
292 return 0; 292 return 0;
293 } 293 }
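Editorial note: merge_state() only coalesces neighbours that are byte-adjacent (other->end == state->start - 1, or the mirror case on the right) and carry exactly the same state bits. A small standalone illustration of that adjacency test, outside the rb-tree machinery; the state values are arbitrary placeholders.

#include <stdio.h>

struct ext { unsigned long long start, end; unsigned long state; };

/* Same condition merge_state() applies to the previous neighbour:
 * byte-adjacent and identical state bits. */
static int can_merge(const struct ext *prev, const struct ext *cur)
{
	return prev->end == cur->start - 1 && prev->state == cur->state;
}

int main(void)
{
	struct ext a = { 0, 4095, 0x1 };     /* same bit as b -> mergeable  */
	struct ext b = { 4096, 8191, 0x1 };
	struct ext c = { 8192, 12287, 0x3 }; /* extra bit set -> no merge   */

	printf("%d %d\n", can_merge(&a, &b), can_merge(&b, &c)); /* 1 0 */
	return 0;
}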
294 294
295 static int set_state_cb(struct extent_io_tree *tree, 295 static int set_state_cb(struct extent_io_tree *tree,
296 struct extent_state *state, int *bits) 296 struct extent_state *state, int *bits)
297 { 297 {
298 if (tree->ops && tree->ops->set_bit_hook) { 298 if (tree->ops && tree->ops->set_bit_hook) {
299 return tree->ops->set_bit_hook(tree->mapping->host, 299 return tree->ops->set_bit_hook(tree->mapping->host,
300 state, bits); 300 state, bits);
301 } 301 }
302 302
303 return 0; 303 return 0;
304 } 304 }
305 305
306 static void clear_state_cb(struct extent_io_tree *tree, 306 static void clear_state_cb(struct extent_io_tree *tree,
307 struct extent_state *state, int *bits) 307 struct extent_state *state, int *bits)
308 { 308 {
309 if (tree->ops && tree->ops->clear_bit_hook) 309 if (tree->ops && tree->ops->clear_bit_hook)
310 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 310 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
311 } 311 }
312 312
313 /* 313 /*
314 * insert an extent_state struct into the tree. 'bits' are set on the 314 * insert an extent_state struct into the tree. 'bits' are set on the
315 * struct before it is inserted. 315 * struct before it is inserted.
316 * 316 *
317 * This may return -EEXIST if the extent is already there, in which case the 317 * This may return -EEXIST if the extent is already there, in which case the
318 * state struct is freed. 318 * state struct is freed.
319 * 319 *
320 * The tree lock is not taken internally. This is a utility function and 320 * The tree lock is not taken internally. This is a utility function and
321 * probably isn't what you want to call (see set/clear_extent_bit). 321 * probably isn't what you want to call (see set/clear_extent_bit).
322 */ 322 */
323 static int insert_state(struct extent_io_tree *tree, 323 static int insert_state(struct extent_io_tree *tree,
324 struct extent_state *state, u64 start, u64 end, 324 struct extent_state *state, u64 start, u64 end,
325 int *bits) 325 int *bits)
326 { 326 {
327 struct rb_node *node; 327 struct rb_node *node;
328 int bits_to_set = *bits & ~EXTENT_CTLBITS; 328 int bits_to_set = *bits & ~EXTENT_CTLBITS;
329 int ret; 329 int ret;
330 330
331 if (end < start) { 331 if (end < start) {
332 printk(KERN_ERR "btrfs end < start %llu %llu\n", 332 printk(KERN_ERR "btrfs end < start %llu %llu\n",
333 (unsigned long long)end, 333 (unsigned long long)end,
334 (unsigned long long)start); 334 (unsigned long long)start);
335 WARN_ON(1); 335 WARN_ON(1);
336 } 336 }
337 state->start = start; 337 state->start = start;
338 state->end = end; 338 state->end = end;
339 ret = set_state_cb(tree, state, bits); 339 ret = set_state_cb(tree, state, bits);
340 if (ret) 340 if (ret)
341 return ret; 341 return ret;
342 342
343 if (bits_to_set & EXTENT_DIRTY) 343 if (bits_to_set & EXTENT_DIRTY)
344 tree->dirty_bytes += end - start + 1; 344 tree->dirty_bytes += end - start + 1;
345 state->state |= bits_to_set; 345 state->state |= bits_to_set;
346 node = tree_insert(&tree->state, end, &state->rb_node); 346 node = tree_insert(&tree->state, end, &state->rb_node);
347 if (node) { 347 if (node) {
348 struct extent_state *found; 348 struct extent_state *found;
349 found = rb_entry(node, struct extent_state, rb_node); 349 found = rb_entry(node, struct extent_state, rb_node);
350 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 350 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
351 "%llu %llu\n", (unsigned long long)found->start, 351 "%llu %llu\n", (unsigned long long)found->start,
352 (unsigned long long)found->end, 352 (unsigned long long)found->end,
353 (unsigned long long)start, (unsigned long long)end); 353 (unsigned long long)start, (unsigned long long)end);
354 free_extent_state(state); 354 free_extent_state(state);
355 return -EEXIST; 355 return -EEXIST;
356 } 356 }
357 state->tree = tree; 357 state->tree = tree;
358 merge_state(tree, state); 358 merge_state(tree, state);
359 return 0; 359 return 0;
360 } 360 }
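The bookkeeping insert_state() does before the tree insert is easy to lose among the error paths. Below is a minimal userspace sketch of just that accounting, assuming invented SK_* bit values and a trimmed-down state struct rather than the kernel's definitions: control bits are masked off, and dirty_bytes grows by the inclusive range length when the dirty bit is being set.

/* Standalone sketch (not kernel code): the accounting insert_state() does
 * before inserting into the tree, with invented bit values. */
#include <stdio.h>
#include <stdint.h>

#define SK_EXTENT_DIRTY    (1 << 0)   /* hypothetical stand-in */
#define SK_EXTENT_CTLBITS  (1 << 7)   /* "control" bits never stored in ->state */

struct sk_state { uint64_t start, end; unsigned int state; };

static uint64_t sk_dirty_bytes;

static void sk_insert(struct sk_state *s, uint64_t start, uint64_t end, int bits)
{
        int bits_to_set = bits & ~SK_EXTENT_CTLBITS;

        s->start = start;
        s->end = end;
        if (bits_to_set & SK_EXTENT_DIRTY)
                sk_dirty_bytes += end - start + 1;      /* ranges are inclusive */
        s->state |= bits_to_set;
}

int main(void)
{
        struct sk_state s = { 0, 0, 0 };

        sk_insert(&s, 4096, 8191, SK_EXTENT_DIRTY | SK_EXTENT_CTLBITS);
        printf("state 0x%x dirty_bytes %llu\n", s.state,
               (unsigned long long)sk_dirty_bytes);     /* state 0x1 dirty_bytes 4096 */
        return 0;
}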
361 361
362 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 362 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
363 u64 split) 363 u64 split)
364 { 364 {
365 if (tree->ops && tree->ops->split_extent_hook) 365 if (tree->ops && tree->ops->split_extent_hook)
366 return tree->ops->split_extent_hook(tree->mapping->host, 366 return tree->ops->split_extent_hook(tree->mapping->host,
367 orig, split); 367 orig, split);
368 return 0; 368 return 0;
369 } 369 }
370 370
371 /* 371 /*
372 * split a given extent state struct in two, inserting the preallocated 372 * split a given extent state struct in two, inserting the preallocated
373 * struct 'prealloc' as the newly created second half. 'split' indicates an 373 * struct 'prealloc' as the newly created second half. 'split' indicates an
374 * offset inside 'orig' where it should be split. 374 * offset inside 'orig' where it should be split.
375 * 375 *
376 * Before calling, 376 * Before calling,
377 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 377 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
378 * are two extent state structs in the tree: 378 * are two extent state structs in the tree:
379 * prealloc: [orig->start, split - 1] 379 * prealloc: [orig->start, split - 1]
380 * orig: [ split, orig->end ] 380 * orig: [ split, orig->end ]
381 * 381 *
382 * The tree locks are not taken by this function. They need to be held 382 * The tree locks are not taken by this function. They need to be held
383 * by the caller. 383 * by the caller.
384 */ 384 */
385 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 385 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
386 struct extent_state *prealloc, u64 split) 386 struct extent_state *prealloc, u64 split)
387 { 387 {
388 struct rb_node *node; 388 struct rb_node *node;
389 389
390 split_cb(tree, orig, split); 390 split_cb(tree, orig, split);
391 391
392 prealloc->start = orig->start; 392 prealloc->start = orig->start;
393 prealloc->end = split - 1; 393 prealloc->end = split - 1;
394 prealloc->state = orig->state; 394 prealloc->state = orig->state;
395 orig->start = split; 395 orig->start = split;
396 396
397 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 397 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
398 if (node) { 398 if (node) {
399 free_extent_state(prealloc); 399 free_extent_state(prealloc);
400 return -EEXIST; 400 return -EEXIST;
401 } 401 }
402 prealloc->tree = tree; 402 prealloc->tree = tree;
403 return 0; 403 return 0;
404 } 404 }
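A standalone sketch of the split arithmetic documented above, under the assumption of a simplified sk_state struct (not the kernel's extent_state): the preallocated struct takes the left half [start, split - 1], the original keeps [split, end], both ends inclusive.

/* Standalone sketch (not kernel code) of the split arithmetic. */
#include <stdio.h>
#include <stdint.h>

struct sk_state { uint64_t start, end; unsigned int state; };

static void sk_split(struct sk_state *orig, struct sk_state *prealloc, uint64_t split)
{
        prealloc->start = orig->start;      /* left half goes to the prealloc */
        prealloc->end = split - 1;
        prealloc->state = orig->state;      /* it inherits the bits */
        orig->start = split;                /* original keeps the right half */
}

int main(void)
{
        struct sk_state orig = { 0, 16383, 0x1 };
        struct sk_state left = { 0, 0, 0 };

        sk_split(&orig, &left, 4096);
        printf("left  [%llu, %llu]\n", (unsigned long long)left.start,
               (unsigned long long)left.end);   /* [0, 4095] */
        printf("right [%llu, %llu]\n", (unsigned long long)orig.start,
               (unsigned long long)orig.end);   /* [4096, 16383] */
        return 0;
}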
405 405
406 /* 406 /*
407 * utility function to clear some bits in an extent state struct. 407 * utility function to clear some bits in an extent state struct.
408 * it will optionally wake up anyone waiting on this state (wake == 1), or 408 * it will optionally wake up anyone waiting on this state (wake == 1), or
408 * it will optionally wake up anyone waiting on this state (wake == 1), or 408 * it will optionally wake up anyone waiting on this state (wake == 1), or
409 * forcibly remove the state from the tree (delete == 1). 409 * forcibly remove the state from the tree (delete == 1).
410 * 410 *
411 * If no bits are set on the state struct after clearing things, the 411 * If no bits are set on the state struct after clearing things, the
412 * struct is freed and removed from the tree 412 * struct is freed and removed from the tree
413 */ 413 */
414 static int clear_state_bit(struct extent_io_tree *tree, 414 static int clear_state_bit(struct extent_io_tree *tree,
415 struct extent_state *state, 415 struct extent_state *state,
416 int *bits, int wake) 416 int *bits, int wake)
417 { 417 {
418 int bits_to_clear = *bits & ~EXTENT_CTLBITS; 418 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
419 int ret = state->state & bits_to_clear; 419 int ret = state->state & bits_to_clear;
420 420
421 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 421 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
422 u64 range = state->end - state->start + 1; 422 u64 range = state->end - state->start + 1;
423 WARN_ON(range > tree->dirty_bytes); 423 WARN_ON(range > tree->dirty_bytes);
424 tree->dirty_bytes -= range; 424 tree->dirty_bytes -= range;
425 } 425 }
426 clear_state_cb(tree, state, bits); 426 clear_state_cb(tree, state, bits);
427 state->state &= ~bits_to_clear; 427 state->state &= ~bits_to_clear;
428 if (wake) 428 if (wake)
429 wake_up(&state->wq); 429 wake_up(&state->wq);
430 if (state->state == 0) { 430 if (state->state == 0) {
431 if (state->tree) { 431 if (state->tree) {
432 rb_erase(&state->rb_node, &tree->state); 432 rb_erase(&state->rb_node, &tree->state);
433 state->tree = NULL; 433 state->tree = NULL;
434 free_extent_state(state); 434 free_extent_state(state);
435 } else { 435 } else {
436 WARN_ON(1); 436 WARN_ON(1);
437 } 437 }
438 } else { 438 } else {
439 merge_state(tree, state); 439 merge_state(tree, state);
440 } 440 }
441 return ret; 441 return ret;
442 } 442 }
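The return convention of clear_state_bit() (non-zero if any of the requested bits were actually set) and the dirty-byte accounting can be exercised in isolation. A hedged userspace sketch follows; the SK_* bit values and the sk_dirty_bytes counter are illustrative stand-ins, not kernel symbols.

/* Standalone sketch (not kernel code) of clear_state_bit()'s return value
 * and dirty accounting. */
#include <stdio.h>
#include <stdint.h>

#define SK_EXTENT_DIRTY     (1 << 0)   /* hypothetical bit values */
#define SK_EXTENT_UPTODATE  (1 << 1)

struct sk_state { uint64_t start, end; unsigned int state; };

static uint64_t sk_dirty_bytes = 8192;

static int sk_clear(struct sk_state *s, int bits_to_clear)
{
        int was_set = s->state & bits_to_clear;     /* what the caller gets back */

        if ((bits_to_clear & SK_EXTENT_DIRTY) && (s->state & SK_EXTENT_DIRTY))
                sk_dirty_bytes -= s->end - s->start + 1;
        s->state &= ~bits_to_clear;
        return was_set;
}

int main(void)
{
        struct sk_state s = { 0, 4095, SK_EXTENT_DIRTY };

        printf("ret %d\n", sk_clear(&s, SK_EXTENT_DIRTY));      /* non-zero: bit was set */
        printf("ret %d\n", sk_clear(&s, SK_EXTENT_UPTODATE));   /* 0: nothing was set */
        printf("dirty_bytes %llu\n",
               (unsigned long long)sk_dirty_bytes);             /* 4096 */
        return 0;
}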
443 443
444 static struct extent_state * 444 static struct extent_state *
445 alloc_extent_state_atomic(struct extent_state *prealloc) 445 alloc_extent_state_atomic(struct extent_state *prealloc)
446 { 446 {
447 if (!prealloc) 447 if (!prealloc)
448 prealloc = alloc_extent_state(GFP_ATOMIC); 448 prealloc = alloc_extent_state(GFP_ATOMIC);
449 449
450 return prealloc; 450 return prealloc;
451 } 451 }
452 452
453 /* 453 /*
454 * clear some bits on a range in the tree. This may require splitting 454 * clear some bits on a range in the tree. This may require splitting
455 * or inserting elements in the tree, so the gfp mask is used to 455 * or inserting elements in the tree, so the gfp mask is used to
456 * indicate which allocations are allowed and whether sleeping is permitted. 456 * indicate which allocations are allowed and whether sleeping is permitted.
457 * 457 *
458 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 458 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
459 * the given range from the tree regardless of state (i.e. for truncate). 459 * the given range from the tree regardless of state (i.e. for truncate).
460 * 460 *
461 * the range [start, end] is inclusive. 461 * the range [start, end] is inclusive.
462 * 462 *
463 * This takes the tree lock, and returns < 0 on error, > 0 if any of the 463 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
464 * bits were already set, or zero if none of the bits were already set. 464 * bits were already set, or zero if none of the bits were already set.
465 */ 465 */
466 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 466 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
467 int bits, int wake, int delete, 467 int bits, int wake, int delete,
468 struct extent_state **cached_state, 468 struct extent_state **cached_state,
469 gfp_t mask) 469 gfp_t mask)
470 { 470 {
471 struct extent_state *state; 471 struct extent_state *state;
472 struct extent_state *cached; 472 struct extent_state *cached;
473 struct extent_state *prealloc = NULL; 473 struct extent_state *prealloc = NULL;
474 struct rb_node *next_node; 474 struct rb_node *next_node;
475 struct rb_node *node; 475 struct rb_node *node;
476 u64 last_end; 476 u64 last_end;
477 int err; 477 int err;
478 int set = 0; 478 int set = 0;
479 int clear = 0; 479 int clear = 0;
480 480
481 if (delete) 481 if (delete)
482 bits |= ~EXTENT_CTLBITS; 482 bits |= ~EXTENT_CTLBITS;
483 bits |= EXTENT_FIRST_DELALLOC; 483 bits |= EXTENT_FIRST_DELALLOC;
484 484
485 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 485 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
486 clear = 1; 486 clear = 1;
487 again: 487 again:
488 if (!prealloc && (mask & __GFP_WAIT)) { 488 if (!prealloc && (mask & __GFP_WAIT)) {
489 prealloc = alloc_extent_state(mask); 489 prealloc = alloc_extent_state(mask);
490 if (!prealloc) 490 if (!prealloc)
491 return -ENOMEM; 491 return -ENOMEM;
492 } 492 }
493 493
494 spin_lock(&tree->lock); 494 spin_lock(&tree->lock);
495 if (cached_state) { 495 if (cached_state) {
496 cached = *cached_state; 496 cached = *cached_state;
497 497
498 if (clear) { 498 if (clear) {
499 *cached_state = NULL; 499 *cached_state = NULL;
500 cached_state = NULL; 500 cached_state = NULL;
501 } 501 }
502 502
503 if (cached && cached->tree && cached->start == start) { 503 if (cached && cached->tree && cached->start == start) {
504 if (clear) 504 if (clear)
505 atomic_dec(&cached->refs); 505 atomic_dec(&cached->refs);
506 state = cached; 506 state = cached;
507 goto hit_next; 507 goto hit_next;
508 } 508 }
509 if (clear) 509 if (clear)
510 free_extent_state(cached); 510 free_extent_state(cached);
511 } 511 }
512 /* 512 /*
513 * this search will find the extents that end after 513 * this search will find the extents that end after
514 * our range starts 514 * our range starts
515 */ 515 */
516 node = tree_search(tree, start); 516 node = tree_search(tree, start);
517 if (!node) 517 if (!node)
518 goto out; 518 goto out;
519 state = rb_entry(node, struct extent_state, rb_node); 519 state = rb_entry(node, struct extent_state, rb_node);
520 hit_next: 520 hit_next:
521 if (state->start > end) 521 if (state->start > end)
522 goto out; 522 goto out;
523 WARN_ON(state->end < start); 523 WARN_ON(state->end < start);
524 last_end = state->end; 524 last_end = state->end;
525 525
526 /* 526 /*
527 * | ---- desired range ---- | 527 * | ---- desired range ---- |
528 * | state | or 528 * | state | or
529 * | ------------- state -------------- | 529 * | ------------- state -------------- |
530 * 530 *
531 * We need to split the extent we found, and may flip 531 * We need to split the extent we found, and may flip
532 * bits on second half. 532 * bits on second half.
533 * 533 *
534 * If the extent we found extends past our range, we 534 * If the extent we found extends past our range, we
535 * just split and search again. It'll get split again 535 * just split and search again. It'll get split again
536 * the next time though. 536 * the next time though.
537 * 537 *
538 * If the extent we found is inside our range, we clear 538 * If the extent we found is inside our range, we clear
539 * the desired bit on it. 539 * the desired bit on it.
540 */ 540 */
541 541
542 if (state->start < start) { 542 if (state->start < start) {
543 prealloc = alloc_extent_state_atomic(prealloc); 543 prealloc = alloc_extent_state_atomic(prealloc);
544 BUG_ON(!prealloc); 544 BUG_ON(!prealloc);
545 err = split_state(tree, state, prealloc, start); 545 err = split_state(tree, state, prealloc, start);
546 BUG_ON(err == -EEXIST); 546 BUG_ON(err == -EEXIST);
547 prealloc = NULL; 547 prealloc = NULL;
548 if (err) 548 if (err)
549 goto out; 549 goto out;
550 if (state->end <= end) { 550 if (state->end <= end) {
551 set |= clear_state_bit(tree, state, &bits, wake); 551 set |= clear_state_bit(tree, state, &bits, wake);
552 if (last_end == (u64)-1) 552 if (last_end == (u64)-1)
553 goto out; 553 goto out;
554 start = last_end + 1; 554 start = last_end + 1;
555 } 555 }
556 goto search_again; 556 goto search_again;
557 } 557 }
558 /* 558 /*
559 * | ---- desired range ---- | 559 * | ---- desired range ---- |
560 * | state | 560 * | state |
561 * We need to split the extent, and clear the bit 561 * We need to split the extent, and clear the bit
562 * on the first half 562 * on the first half
563 */ 563 */
564 if (state->start <= end && state->end > end) { 564 if (state->start <= end && state->end > end) {
565 prealloc = alloc_extent_state_atomic(prealloc); 565 prealloc = alloc_extent_state_atomic(prealloc);
566 BUG_ON(!prealloc); 566 BUG_ON(!prealloc);
567 err = split_state(tree, state, prealloc, end + 1); 567 err = split_state(tree, state, prealloc, end + 1);
568 BUG_ON(err == -EEXIST); 568 BUG_ON(err == -EEXIST);
569 if (wake) 569 if (wake)
570 wake_up(&state->wq); 570 wake_up(&state->wq);
571 571
572 set |= clear_state_bit(tree, prealloc, &bits, wake); 572 set |= clear_state_bit(tree, prealloc, &bits, wake);
573 573
574 prealloc = NULL; 574 prealloc = NULL;
575 goto out; 575 goto out;
576 } 576 }
577 577
578 if (state->end < end && prealloc && !need_resched()) 578 if (state->end < end && prealloc && !need_resched())
579 next_node = rb_next(&state->rb_node); 579 next_node = rb_next(&state->rb_node);
580 else 580 else
581 next_node = NULL; 581 next_node = NULL;
582 582
583 set |= clear_state_bit(tree, state, &bits, wake); 583 set |= clear_state_bit(tree, state, &bits, wake);
584 if (last_end == (u64)-1) 584 if (last_end == (u64)-1)
585 goto out; 585 goto out;
586 start = last_end + 1; 586 start = last_end + 1;
587 if (start <= end && next_node) { 587 if (start <= end && next_node) {
588 state = rb_entry(next_node, struct extent_state, 588 state = rb_entry(next_node, struct extent_state,
589 rb_node); 589 rb_node);
590 if (state->start == start) 590 if (state->start == start)
591 goto hit_next; 591 goto hit_next;
592 } 592 }
593 goto search_again; 593 goto search_again;
594 594
595 out: 595 out:
596 spin_unlock(&tree->lock); 596 spin_unlock(&tree->lock);
597 if (prealloc) 597 if (prealloc)
598 free_extent_state(prealloc); 598 free_extent_state(prealloc);
599 599
600 return set; 600 return set;
601 601
602 search_again: 602 search_again:
603 if (start > end) 603 if (start > end)
604 goto out; 604 goto out;
605 spin_unlock(&tree->lock); 605 spin_unlock(&tree->lock);
606 if (mask & __GFP_WAIT) 606 if (mask & __GFP_WAIT)
607 cond_resched(); 607 cond_resched();
608 goto again; 608 goto again;
609 } 609 }
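clear_extent_bit() spends most of its body deciding how the state it found overlaps the desired [start, end], matching the range diagrams in the comments. The sketch below classifies the same three interesting cases; the enum names are invented for illustration, and the real function additionally handles merging, caching and re-searching.

/* Standalone sketch (not kernel code): which of the three range cases a
 * found state falls into, relative to the desired [start, end]. */
#include <stdio.h>
#include <stdint.h>

enum sk_overlap { SK_NO_OVERLAP, SK_SPLIT_AT_START, SK_SPLIT_AFTER_END, SK_FULLY_INSIDE };

static enum sk_overlap sk_classify(uint64_t s_start, uint64_t s_end,
                                   uint64_t start, uint64_t end)
{
        if (s_start > end || s_end < start)
                return SK_NO_OVERLAP;
        if (s_start < start)
                return SK_SPLIT_AT_START;    /* split at 'start', keep working on the rest */
        if (s_end > end)
                return SK_SPLIT_AFTER_END;   /* split at 'end' + 1, touch only the first half */
        return SK_FULLY_INSIDE;              /* clear/set the bits directly */
}

int main(void)
{
        printf("%d\n", sk_classify(0, 8191, 4096, 12287));   /* SK_SPLIT_AT_START */
        printf("%d\n", sk_classify(4096, 16383, 0, 8191));   /* SK_SPLIT_AFTER_END */
        printf("%d\n", sk_classify(4096, 8191, 0, 16383));   /* SK_FULLY_INSIDE */
        return 0;
}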
610 610
611 static int wait_on_state(struct extent_io_tree *tree, 611 static int wait_on_state(struct extent_io_tree *tree,
612 struct extent_state *state) 612 struct extent_state *state)
613 __releases(tree->lock) 613 __releases(tree->lock)
614 __acquires(tree->lock) 614 __acquires(tree->lock)
615 { 615 {
616 DEFINE_WAIT(wait); 616 DEFINE_WAIT(wait);
617 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 617 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
618 spin_unlock(&tree->lock); 618 spin_unlock(&tree->lock);
619 schedule(); 619 schedule();
620 spin_lock(&tree->lock); 620 spin_lock(&tree->lock);
621 finish_wait(&state->wq, &wait); 621 finish_wait(&state->wq, &wait);
622 return 0; 622 return 0;
623 } 623 }
624 624
625 /* 625 /*
626 * waits for one or more bits to clear on a range in the state tree. 626 * waits for one or more bits to clear on a range in the state tree.
627 * The range [start, end] is inclusive. 627 * The range [start, end] is inclusive.
628 * The tree lock is taken by this function 628 * The tree lock is taken by this function
629 */ 629 */
630 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) 630 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
631 { 631 {
632 struct extent_state *state; 632 struct extent_state *state;
633 struct rb_node *node; 633 struct rb_node *node;
634 634
635 spin_lock(&tree->lock); 635 spin_lock(&tree->lock);
636 again: 636 again:
637 while (1) { 637 while (1) {
638 /* 638 /*
639 * this search will find all the extents that end after 639 * this search will find all the extents that end after
640 * our range starts 640 * our range starts
641 */ 641 */
642 node = tree_search(tree, start); 642 node = tree_search(tree, start);
643 if (!node) 643 if (!node)
644 break; 644 break;
645 645
646 state = rb_entry(node, struct extent_state, rb_node); 646 state = rb_entry(node, struct extent_state, rb_node);
647 647
648 if (state->start > end) 648 if (state->start > end)
649 goto out; 649 goto out;
650 650
651 if (state->state & bits) { 651 if (state->state & bits) {
652 start = state->start; 652 start = state->start;
653 atomic_inc(&state->refs); 653 atomic_inc(&state->refs);
654 wait_on_state(tree, state); 654 wait_on_state(tree, state);
655 free_extent_state(state); 655 free_extent_state(state);
656 goto again; 656 goto again;
657 } 657 }
658 start = state->end + 1; 658 start = state->end + 1;
659 659
660 if (start > end) 660 if (start > end)
661 break; 661 break;
662 662
663 if (need_resched()) { 663 if (need_resched()) {
664 spin_unlock(&tree->lock); 664 spin_unlock(&tree->lock);
665 cond_resched(); 665 cond_resched();
666 spin_lock(&tree->lock); 666 spin_lock(&tree->lock);
667 } 667 }
668 } 668 }
669 out: 669 out:
670 spin_unlock(&tree->lock); 670 spin_unlock(&tree->lock);
671 return 0; 671 return 0;
672 } 672 }
673 673
674 static int set_state_bits(struct extent_io_tree *tree, 674 static int set_state_bits(struct extent_io_tree *tree,
675 struct extent_state *state, 675 struct extent_state *state,
676 int *bits) 676 int *bits)
677 { 677 {
678 int ret; 678 int ret;
679 int bits_to_set = *bits & ~EXTENT_CTLBITS; 679 int bits_to_set = *bits & ~EXTENT_CTLBITS;
680 680
681 ret = set_state_cb(tree, state, bits); 681 ret = set_state_cb(tree, state, bits);
682 if (ret) 682 if (ret)
683 return ret; 683 return ret;
684 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 684 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
685 u64 range = state->end - state->start + 1; 685 u64 range = state->end - state->start + 1;
686 tree->dirty_bytes += range; 686 tree->dirty_bytes += range;
687 } 687 }
688 state->state |= bits_to_set; 688 state->state |= bits_to_set;
689 689
690 return 0; 690 return 0;
691 } 691 }
692 692
693 static void cache_state(struct extent_state *state, 693 static void cache_state(struct extent_state *state,
694 struct extent_state **cached_ptr) 694 struct extent_state **cached_ptr)
695 { 695 {
696 if (cached_ptr && !(*cached_ptr)) { 696 if (cached_ptr && !(*cached_ptr)) {
697 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 697 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
698 *cached_ptr = state; 698 *cached_ptr = state;
699 atomic_inc(&state->refs); 699 atomic_inc(&state->refs);
700 } 700 }
701 } 701 }
702 } 702 }
703 703
704 static void uncache_state(struct extent_state **cached_ptr) 704 static void uncache_state(struct extent_state **cached_ptr)
705 { 705 {
706 if (cached_ptr && (*cached_ptr)) { 706 if (cached_ptr && (*cached_ptr)) {
707 struct extent_state *state = *cached_ptr; 707 struct extent_state *state = *cached_ptr;
708 *cached_ptr = NULL; 708 *cached_ptr = NULL;
709 free_extent_state(state); 709 free_extent_state(state);
710 } 710 }
711 } 711 }
712 712
713 /* 713 /*
714 * set some bits on a range in the tree. This may require allocations or 714 * set some bits on a range in the tree. This may require allocations or
715 * sleeping, so the gfp mask is used to indicate what is allowed. 715 * sleeping, so the gfp mask is used to indicate what is allowed.
716 * 716 *
717 * If any of the exclusive bits are set, this will fail with -EEXIST if some 717 * If any of the exclusive bits are set, this will fail with -EEXIST if some
718 * part of the range already has the desired bits set. The start of the 718 * part of the range already has the desired bits set. The start of the
719 * existing range is returned in failed_start in this case. 719 * existing range is returned in failed_start in this case.
720 * 720 *
721 * [start, end] is inclusive. This takes the tree lock. 721 * [start, end] is inclusive. This takes the tree lock.
722 */ 722 */
723 723
724 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 724 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
725 int bits, int exclusive_bits, u64 *failed_start, 725 int bits, int exclusive_bits, u64 *failed_start,
726 struct extent_state **cached_state, gfp_t mask) 726 struct extent_state **cached_state, gfp_t mask)
727 { 727 {
728 struct extent_state *state; 728 struct extent_state *state;
729 struct extent_state *prealloc = NULL; 729 struct extent_state *prealloc = NULL;
730 struct rb_node *node; 730 struct rb_node *node;
731 int err = 0; 731 int err = 0;
732 u64 last_start; 732 u64 last_start;
733 u64 last_end; 733 u64 last_end;
734 734
735 bits |= EXTENT_FIRST_DELALLOC; 735 bits |= EXTENT_FIRST_DELALLOC;
736 again: 736 again:
737 if (!prealloc && (mask & __GFP_WAIT)) { 737 if (!prealloc && (mask & __GFP_WAIT)) {
738 prealloc = alloc_extent_state(mask); 738 prealloc = alloc_extent_state(mask);
739 BUG_ON(!prealloc); 739 BUG_ON(!prealloc);
740 } 740 }
741 741
742 spin_lock(&tree->lock); 742 spin_lock(&tree->lock);
743 if (cached_state && *cached_state) { 743 if (cached_state && *cached_state) {
744 state = *cached_state; 744 state = *cached_state;
745 if (state->start == start && state->tree) { 745 if (state->start == start && state->tree) {
746 node = &state->rb_node; 746 node = &state->rb_node;
747 goto hit_next; 747 goto hit_next;
748 } 748 }
749 } 749 }
750 /* 750 /*
751 * this search will find all the extents that end after 751 * this search will find all the extents that end after
752 * our range starts. 752 * our range starts.
753 */ 753 */
754 node = tree_search(tree, start); 754 node = tree_search(tree, start);
755 if (!node) { 755 if (!node) {
756 prealloc = alloc_extent_state_atomic(prealloc); 756 prealloc = alloc_extent_state_atomic(prealloc);
757 BUG_ON(!prealloc); 757 BUG_ON(!prealloc);
758 err = insert_state(tree, prealloc, start, end, &bits); 758 err = insert_state(tree, prealloc, start, end, &bits);
759 prealloc = NULL; 759 prealloc = NULL;
760 BUG_ON(err == -EEXIST); 760 BUG_ON(err == -EEXIST);
761 goto out; 761 goto out;
762 } 762 }
763 state = rb_entry(node, struct extent_state, rb_node); 763 state = rb_entry(node, struct extent_state, rb_node);
764 hit_next: 764 hit_next:
765 last_start = state->start; 765 last_start = state->start;
766 last_end = state->end; 766 last_end = state->end;
767 767
768 /* 768 /*
769 * | ---- desired range ---- | 769 * | ---- desired range ---- |
770 * | state | 770 * | state |
771 * 771 *
772 * Just lock what we found and keep going 772 * Just lock what we found and keep going
773 */ 773 */
774 if (state->start == start && state->end <= end) { 774 if (state->start == start && state->end <= end) {
775 struct rb_node *next_node; 775 struct rb_node *next_node;
776 if (state->state & exclusive_bits) { 776 if (state->state & exclusive_bits) {
777 *failed_start = state->start; 777 *failed_start = state->start;
778 err = -EEXIST; 778 err = -EEXIST;
779 goto out; 779 goto out;
780 } 780 }
781 781
782 err = set_state_bits(tree, state, &bits); 782 err = set_state_bits(tree, state, &bits);
783 if (err) 783 if (err)
784 goto out; 784 goto out;
785 785
786 next_node = rb_next(node); 786 next_node = rb_next(node);
787 cache_state(state, cached_state); 787 cache_state(state, cached_state);
788 merge_state(tree, state); 788 merge_state(tree, state);
789 if (last_end == (u64)-1) 789 if (last_end == (u64)-1)
790 goto out; 790 goto out;
791 791
792 start = last_end + 1; 792 start = last_end + 1;
793 if (next_node && start < end && prealloc && !need_resched()) { 793 if (next_node && start < end && prealloc && !need_resched()) {
794 state = rb_entry(next_node, struct extent_state, 794 state = rb_entry(next_node, struct extent_state,
795 rb_node); 795 rb_node);
796 if (state->start == start) 796 if (state->start == start)
797 goto hit_next; 797 goto hit_next;
798 } 798 }
799 goto search_again; 799 goto search_again;
800 } 800 }
801 801
802 /* 802 /*
803 * | ---- desired range ---- | 803 * | ---- desired range ---- |
804 * | state | 804 * | state |
805 * or 805 * or
806 * | ------------- state -------------- | 806 * | ------------- state -------------- |
807 * 807 *
808 * We need to split the extent we found, and may flip bits on 808 * We need to split the extent we found, and may flip bits on
809 * second half. 809 * second half.
810 * 810 *
811 * If the extent we found extends past our 811 * If the extent we found extends past our
812 * range, we just split and search again. It'll get split 812 * range, we just split and search again. It'll get split
813 * again the next time though. 813 * again the next time though.
814 * 814 *
815 * If the extent we found is inside our range, we set the 815 * If the extent we found is inside our range, we set the
816 * desired bit on it. 816 * desired bit on it.
817 */ 817 */
818 if (state->start < start) { 818 if (state->start < start) {
819 if (state->state & exclusive_bits) { 819 if (state->state & exclusive_bits) {
820 *failed_start = start; 820 *failed_start = start;
821 err = -EEXIST; 821 err = -EEXIST;
822 goto out; 822 goto out;
823 } 823 }
824 824
825 prealloc = alloc_extent_state_atomic(prealloc); 825 prealloc = alloc_extent_state_atomic(prealloc);
826 BUG_ON(!prealloc); 826 BUG_ON(!prealloc);
827 err = split_state(tree, state, prealloc, start); 827 err = split_state(tree, state, prealloc, start);
828 BUG_ON(err == -EEXIST); 828 BUG_ON(err == -EEXIST);
829 prealloc = NULL; 829 prealloc = NULL;
830 if (err) 830 if (err)
831 goto out; 831 goto out;
832 if (state->end <= end) { 832 if (state->end <= end) {
833 err = set_state_bits(tree, state, &bits); 833 err = set_state_bits(tree, state, &bits);
834 if (err) 834 if (err)
835 goto out; 835 goto out;
836 cache_state(state, cached_state); 836 cache_state(state, cached_state);
837 merge_state(tree, state); 837 merge_state(tree, state);
838 if (last_end == (u64)-1) 838 if (last_end == (u64)-1)
839 goto out; 839 goto out;
840 start = last_end + 1; 840 start = last_end + 1;
841 } 841 }
842 goto search_again; 842 goto search_again;
843 } 843 }
844 /* 844 /*
845 * | ---- desired range ---- | 845 * | ---- desired range ---- |
846 * | state | or | state | 846 * | state | or | state |
847 * 847 *
848 * There's a hole, we need to insert something in it and 848 * There's a hole, we need to insert something in it and
849 * ignore the extent we found. 849 * ignore the extent we found.
850 */ 850 */
851 if (state->start > start) { 851 if (state->start > start) {
852 u64 this_end; 852 u64 this_end;
853 if (end < last_start) 853 if (end < last_start)
854 this_end = end; 854 this_end = end;
855 else 855 else
856 this_end = last_start - 1; 856 this_end = last_start - 1;
857 857
858 prealloc = alloc_extent_state_atomic(prealloc); 858 prealloc = alloc_extent_state_atomic(prealloc);
859 BUG_ON(!prealloc); 859 BUG_ON(!prealloc);
860 860
861 /* 861 /*
862 * Avoid freeing 'prealloc' if it can be merged with 862 * Avoid freeing 'prealloc' if it can be merged with
863 * the later extent. 863 * the later extent.
864 */ 864 */
865 atomic_inc(&prealloc->refs); 865 atomic_inc(&prealloc->refs);
866 err = insert_state(tree, prealloc, start, this_end, 866 err = insert_state(tree, prealloc, start, this_end,
867 &bits); 867 &bits);
868 BUG_ON(err == -EEXIST); 868 BUG_ON(err == -EEXIST);
869 if (err) { 869 if (err) {
870 free_extent_state(prealloc); 870 free_extent_state(prealloc);
871 prealloc = NULL; 871 prealloc = NULL;
872 goto out; 872 goto out;
873 } 873 }
874 cache_state(prealloc, cached_state); 874 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc); 875 free_extent_state(prealloc);
876 prealloc = NULL; 876 prealloc = NULL;
877 start = this_end + 1; 877 start = this_end + 1;
878 goto search_again; 878 goto search_again;
879 } 879 }
880 /* 880 /*
881 * | ---- desired range ---- | 881 * | ---- desired range ---- |
882 * | state | 882 * | state |
883 * We need to split the extent, and set the bit 883 * We need to split the extent, and set the bit
884 * on the first half 884 * on the first half
885 */ 885 */
886 if (state->start <= end && state->end > end) { 886 if (state->start <= end && state->end > end) {
887 if (state->state & exclusive_bits) { 887 if (state->state & exclusive_bits) {
888 *failed_start = start; 888 *failed_start = start;
889 err = -EEXIST; 889 err = -EEXIST;
890 goto out; 890 goto out;
891 } 891 }
892 892
893 prealloc = alloc_extent_state_atomic(prealloc); 893 prealloc = alloc_extent_state_atomic(prealloc);
894 BUG_ON(!prealloc); 894 BUG_ON(!prealloc);
895 err = split_state(tree, state, prealloc, end + 1); 895 err = split_state(tree, state, prealloc, end + 1);
896 BUG_ON(err == -EEXIST); 896 BUG_ON(err == -EEXIST);
897 897
898 err = set_state_bits(tree, prealloc, &bits); 898 err = set_state_bits(tree, prealloc, &bits);
899 if (err) { 899 if (err) {
900 prealloc = NULL; 900 prealloc = NULL;
901 goto out; 901 goto out;
902 } 902 }
903 cache_state(prealloc, cached_state); 903 cache_state(prealloc, cached_state);
904 merge_state(tree, prealloc); 904 merge_state(tree, prealloc);
905 prealloc = NULL; 905 prealloc = NULL;
906 goto out; 906 goto out;
907 } 907 }
908 908
909 goto search_again; 909 goto search_again;
910 910
911 out: 911 out:
912 spin_unlock(&tree->lock); 912 spin_unlock(&tree->lock);
913 if (prealloc) 913 if (prealloc)
914 free_extent_state(prealloc); 914 free_extent_state(prealloc);
915 915
916 return err; 916 return err;
917 917
918 search_again: 918 search_again:
919 if (start > end) 919 if (start > end)
920 goto out; 920 goto out;
921 spin_unlock(&tree->lock); 921 spin_unlock(&tree->lock);
922 if (mask & __GFP_WAIT) 922 if (mask & __GFP_WAIT)
923 cond_resched(); 923 cond_resched();
924 goto again; 924 goto again;
925 } 925 }
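The exclusive-bits contract described in the comment above (-EEXIST plus failed_start when part of the range already carries one of the exclusive bits) can be shown in a few lines of userspace C. SK_EXTENT_LOCKED and the sk_state struct are assumptions of the sketch only.

/* Standalone sketch (not kernel code) of the exclusive-bits check. */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define SK_EXTENT_LOCKED  (1 << 1)   /* hypothetical bit value */

struct sk_state { uint64_t start, end; unsigned int state; };

static int sk_try_set(struct sk_state *s, int bits, int exclusive_bits,
                      uint64_t *failed_start)
{
        if (s->state & exclusive_bits) {
                *failed_start = s->start;    /* tell the caller where the conflict begins */
                return -EEXIST;
        }
        s->state |= bits;
        return 0;
}

int main(void)
{
        struct sk_state s = { 4096, 8191, SK_EXTENT_LOCKED };
        uint64_t failed_start = 0;
        int err = sk_try_set(&s, SK_EXTENT_LOCKED, SK_EXTENT_LOCKED, &failed_start);

        printf("err %d failed_start %llu\n", err,
               (unsigned long long)failed_start);   /* err is -EEXIST, failed_start 4096 */
        return 0;
}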
926 926
927 /* wrappers around set/clear extent bit */ 927 /* wrappers around set/clear extent bit */
928 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 928 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
929 gfp_t mask) 929 gfp_t mask)
930 { 930 {
931 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 931 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
932 NULL, mask); 932 NULL, mask);
933 } 933 }
934 934
935 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 935 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
936 int bits, gfp_t mask) 936 int bits, gfp_t mask)
937 { 937 {
938 return set_extent_bit(tree, start, end, bits, 0, NULL, 938 return set_extent_bit(tree, start, end, bits, 0, NULL,
939 NULL, mask); 939 NULL, mask);
940 } 940 }
941 941
942 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 942 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
943 int bits, gfp_t mask) 943 int bits, gfp_t mask)
944 { 944 {
945 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 945 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
946 } 946 }
947 947
948 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 948 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
949 struct extent_state **cached_state, gfp_t mask) 949 struct extent_state **cached_state, gfp_t mask)
950 { 950 {
951 return set_extent_bit(tree, start, end, 951 return set_extent_bit(tree, start, end,
952 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 952 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
953 0, NULL, cached_state, mask); 953 0, NULL, cached_state, mask);
954 } 954 }
955 955
956 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 956 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
957 gfp_t mask) 957 gfp_t mask)
958 { 958 {
959 return clear_extent_bit(tree, start, end, 959 return clear_extent_bit(tree, start, end,
960 EXTENT_DIRTY | EXTENT_DELALLOC | 960 EXTENT_DIRTY | EXTENT_DELALLOC |
961 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 961 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
962 } 962 }
963 963
964 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 964 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
965 gfp_t mask) 965 gfp_t mask)
966 { 966 {
967 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 967 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
968 NULL, mask); 968 NULL, mask);
969 } 969 }
970 970
971 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 971 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
972 struct extent_state **cached_state, gfp_t mask) 972 struct extent_state **cached_state, gfp_t mask)
973 { 973 {
974 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 974 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
975 NULL, cached_state, mask); 975 NULL, cached_state, mask);
976 } 976 }
977 977
978 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 978 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
979 u64 end, struct extent_state **cached_state, 979 u64 end, struct extent_state **cached_state,
980 gfp_t mask) 980 gfp_t mask)
981 { 981 {
982 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 982 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
983 cached_state, mask); 983 cached_state, mask);
984 } 984 }
985 985
986 /* 986 /*
987 * either insert or lock state struct between start and end. Use mask to tell 987 * either insert or lock state struct between start and end. Use mask to tell
988 * us if waiting is desired. 988 * us if waiting is desired.
989 */ 989 */
990 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 990 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
991 int bits, struct extent_state **cached_state, gfp_t mask) 991 int bits, struct extent_state **cached_state, gfp_t mask)
992 { 992 {
993 int err; 993 int err;
994 u64 failed_start; 994 u64 failed_start;
995 while (1) { 995 while (1) {
996 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 996 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
997 EXTENT_LOCKED, &failed_start, 997 EXTENT_LOCKED, &failed_start,
998 cached_state, mask); 998 cached_state, mask);
999 if (err == -EEXIST && (mask & __GFP_WAIT)) { 999 if (err == -EEXIST && (mask & __GFP_WAIT)) {
1000 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1000 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1001 start = failed_start; 1001 start = failed_start;
1002 } else { 1002 } else {
1003 break; 1003 break;
1004 } 1004 }
1005 WARN_ON(start > end); 1005 WARN_ON(start > end);
1006 } 1006 }
1007 return err; 1007 return err;
1008 } 1008 }
1009 1009
1010 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 1010 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1011 { 1011 {
1012 return lock_extent_bits(tree, start, end, 0, NULL, mask); 1012 return lock_extent_bits(tree, start, end, 0, NULL, mask);
1013 } 1013 }
1014 1014
1015 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1015 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1016 gfp_t mask) 1016 gfp_t mask)
1017 { 1017 {
1018 int err; 1018 int err;
1019 u64 failed_start; 1019 u64 failed_start;
1020 1020
1021 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1021 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1022 &failed_start, NULL, mask); 1022 &failed_start, NULL, mask);
1023 if (err == -EEXIST) { 1023 if (err == -EEXIST) {
1024 if (failed_start > start) 1024 if (failed_start > start)
1025 clear_extent_bit(tree, start, failed_start - 1, 1025 clear_extent_bit(tree, start, failed_start - 1,
1026 EXTENT_LOCKED, 1, 0, NULL, mask); 1026 EXTENT_LOCKED, 1, 0, NULL, mask);
1027 return 0; 1027 return 0;
1028 } 1028 }
1029 return 1; 1029 return 1;
1030 } 1030 }
1031 1031
1032 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1032 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1033 struct extent_state **cached, gfp_t mask) 1033 struct extent_state **cached, gfp_t mask)
1034 { 1034 {
1035 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1035 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1036 mask); 1036 mask);
1037 } 1037 }
1038 1038
1039 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 1039 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1040 { 1040 {
1041 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1041 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1042 mask); 1042 mask);
1043 } 1043 }
1044 1044
1045 /* 1045 /*
1046 * helper function to set both the pages and the extents in the tree to writeback 1046 * helper function to set both the pages and the extents in the tree to writeback
1047 */ 1047 */
1048 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1048 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1049 { 1049 {
1050 unsigned long index = start >> PAGE_CACHE_SHIFT; 1050 unsigned long index = start >> PAGE_CACHE_SHIFT;
1051 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1051 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1052 struct page *page; 1052 struct page *page;
1053 1053
1054 while (index <= end_index) { 1054 while (index <= end_index) {
1055 page = find_get_page(tree->mapping, index); 1055 page = find_get_page(tree->mapping, index);
1056 BUG_ON(!page); 1056 BUG_ON(!page);
1057 set_page_writeback(page); 1057 set_page_writeback(page);
1058 page_cache_release(page); 1058 page_cache_release(page);
1059 index++; 1059 index++;
1060 } 1060 }
1061 return 0; 1061 return 0;
1062 } 1062 }
1063 1063
1064 /* 1064 /*
1065 * find the first offset in the io tree with 'bits' set. zero is 1065 * find the first offset in the io tree with 'bits' set. zero is
1066 * returned if we find something, and *start_ret and *end_ret are 1066 * returned if we find something, and *start_ret and *end_ret are
1067 * set to reflect the state struct that was found. 1067 * set to reflect the state struct that was found.
1068 * 1068 *
1069 * If nothing was found, 1 is returned, < 0 on error 1069 * If nothing was found, 1 is returned, < 0 on error
1070 */ 1070 */
1071 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1071 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1072 u64 *start_ret, u64 *end_ret, int bits) 1072 u64 *start_ret, u64 *end_ret, int bits)
1073 { 1073 {
1074 struct rb_node *node; 1074 struct rb_node *node;
1075 struct extent_state *state; 1075 struct extent_state *state;
1076 int ret = 1; 1076 int ret = 1;
1077 1077
1078 spin_lock(&tree->lock); 1078 spin_lock(&tree->lock);
1079 /* 1079 /*
1080 * this search will find all the extents that end after 1080 * this search will find all the extents that end after
1081 * our range starts. 1081 * our range starts.
1082 */ 1082 */
1083 node = tree_search(tree, start); 1083 node = tree_search(tree, start);
1084 if (!node) 1084 if (!node)
1085 goto out; 1085 goto out;
1086 1086
1087 while (1) { 1087 while (1) {
1088 state = rb_entry(node, struct extent_state, rb_node); 1088 state = rb_entry(node, struct extent_state, rb_node);
1089 if (state->end >= start && (state->state & bits)) { 1089 if (state->end >= start && (state->state & bits)) {
1090 *start_ret = state->start; 1090 *start_ret = state->start;
1091 *end_ret = state->end; 1091 *end_ret = state->end;
1092 ret = 0; 1092 ret = 0;
1093 break; 1093 break;
1094 } 1094 }
1095 node = rb_next(node); 1095 node = rb_next(node);
1096 if (!node) 1096 if (!node)
1097 break; 1097 break;
1098 } 1098 }
1099 out: 1099 out:
1100 spin_unlock(&tree->lock); 1100 spin_unlock(&tree->lock);
1101 return ret; 1101 return ret;
1102 } 1102 }
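A sketch of the same scan over a sorted array instead of an rb-tree, to make the return convention concrete (0 plus the range on success, 1 when nothing at or after 'start' carries the bits). The array stand-in and sk_* names are not kernel code.

/* Standalone sketch (not kernel code) of the find-first-with-bits scan. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct sk_state { uint64_t start, end; unsigned int state; };

static int sk_find_first(const struct sk_state *v, size_t n, uint64_t start,
                         int bits, uint64_t *start_ret, uint64_t *end_ret)
{
        for (size_t i = 0; i < n; i++) {
                if (v[i].end >= start && (v[i].state & bits)) {
                        *start_ret = v[i].start;
                        *end_ret = v[i].end;
                        return 0;               /* found something */
                }
        }
        return 1;                               /* nothing found */
}

int main(void)
{
        const struct sk_state tree[] = {
                { 0,    4095,  0x2 },
                { 4096, 8191,  0x1 },
                { 8192, 12287, 0x1 },
        };
        uint64_t s, e;

        if (!sk_find_first(tree, 3, 1000, 0x1, &s, &e))
                printf("[%llu, %llu]\n", (unsigned long long)s,
                       (unsigned long long)e);  /* [4096, 8191] */
        return 0;
}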
1103 1103
1104 /* find the first state struct with 'bits' set after 'start', and 1104 /* find the first state struct with 'bits' set after 'start', and
1105 * return it. tree->lock must be held. NULL will be returned if 1105 * return it. tree->lock must be held. NULL will be returned if
1106 * nothing was found after 'start' 1106 * nothing was found after 'start'
1107 */ 1107 */
1108 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 1108 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1109 u64 start, int bits) 1109 u64 start, int bits)
1110 { 1110 {
1111 struct rb_node *node; 1111 struct rb_node *node;
1112 struct extent_state *state; 1112 struct extent_state *state;
1113 1113
1114 /* 1114 /*
1115 * this search will find all the extents that end after 1115 * this search will find all the extents that end after
1116 * our range starts. 1116 * our range starts.
1117 */ 1117 */
1118 node = tree_search(tree, start); 1118 node = tree_search(tree, start);
1119 if (!node) 1119 if (!node)
1120 goto out; 1120 goto out;
1121 1121
1122 while (1) { 1122 while (1) {
1123 state = rb_entry(node, struct extent_state, rb_node); 1123 state = rb_entry(node, struct extent_state, rb_node);
1124 if (state->end >= start && (state->state & bits)) 1124 if (state->end >= start && (state->state & bits))
1125 return state; 1125 return state;
1126 1126
1127 node = rb_next(node); 1127 node = rb_next(node);
1128 if (!node) 1128 if (!node)
1129 break; 1129 break;
1130 } 1130 }
1131 out: 1131 out:
1132 return NULL; 1132 return NULL;
1133 } 1133 }
1134 1134
1135 /* 1135 /*
1136 * find a contiguous range of bytes in the file marked as delalloc, not 1136 * find a contiguous range of bytes in the file marked as delalloc, not
1137 * more than 'max_bytes'. start and end are used to return the range. 1137 * more than 'max_bytes'. start and end are used to return the range.
1138 * 1138 *
1139 * 1 is returned if we find something, 0 if nothing was in the tree 1139 * 1 is returned if we find something, 0 if nothing was in the tree
1140 */ 1140 */
1141 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1141 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1142 u64 *start, u64 *end, u64 max_bytes, 1142 u64 *start, u64 *end, u64 max_bytes,
1143 struct extent_state **cached_state) 1143 struct extent_state **cached_state)
1144 { 1144 {
1145 struct rb_node *node; 1145 struct rb_node *node;
1146 struct extent_state *state; 1146 struct extent_state *state;
1147 u64 cur_start = *start; 1147 u64 cur_start = *start;
1148 u64 found = 0; 1148 u64 found = 0;
1149 u64 total_bytes = 0; 1149 u64 total_bytes = 0;
1150 1150
1151 spin_lock(&tree->lock); 1151 spin_lock(&tree->lock);
1152 1152
1153 /* 1153 /*
1154 * this search will find all the extents that end after 1154 * this search will find all the extents that end after
1155 * our range starts. 1155 * our range starts.
1156 */ 1156 */
1157 node = tree_search(tree, cur_start); 1157 node = tree_search(tree, cur_start);
1158 if (!node) { 1158 if (!node) {
1159 if (!found) 1159 if (!found)
1160 *end = (u64)-1; 1160 *end = (u64)-1;
1161 goto out; 1161 goto out;
1162 } 1162 }
1163 1163
1164 while (1) { 1164 while (1) {
1165 state = rb_entry(node, struct extent_state, rb_node); 1165 state = rb_entry(node, struct extent_state, rb_node);
1166 if (found && (state->start != cur_start || 1166 if (found && (state->start != cur_start ||
1167 (state->state & EXTENT_BOUNDARY))) { 1167 (state->state & EXTENT_BOUNDARY))) {
1168 goto out; 1168 goto out;
1169 } 1169 }
1170 if (!(state->state & EXTENT_DELALLOC)) { 1170 if (!(state->state & EXTENT_DELALLOC)) {
1171 if (!found) 1171 if (!found)
1172 *end = state->end; 1172 *end = state->end;
1173 goto out; 1173 goto out;
1174 } 1174 }
1175 if (!found) { 1175 if (!found) {
1176 *start = state->start; 1176 *start = state->start;
1177 *cached_state = state; 1177 *cached_state = state;
1178 atomic_inc(&state->refs); 1178 atomic_inc(&state->refs);
1179 } 1179 }
1180 found++; 1180 found++;
1181 *end = state->end; 1181 *end = state->end;
1182 cur_start = state->end + 1; 1182 cur_start = state->end + 1;
1183 node = rb_next(node); 1183 node = rb_next(node);
1184 if (!node) 1184 if (!node)
1185 break; 1185 break;
1186 total_bytes += state->end - state->start + 1; 1186 total_bytes += state->end - state->start + 1;
1187 if (total_bytes >= max_bytes) 1187 if (total_bytes >= max_bytes)
1188 break; 1188 break;
1189 } 1189 }
1190 out: 1190 out:
1191 spin_unlock(&tree->lock); 1191 spin_unlock(&tree->lock);
1192 return found; 1192 return found;
1193 } 1193 }
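A simplified model of the delalloc walk above: keep extending the run while ranges are contiguous and delalloc, and stop on a gap, a boundary bit, a non-delalloc range, or once max_bytes has been accumulated. The bit values and the array-backed "tree" are assumptions of the sketch, not kernel symbols.

/* Standalone sketch (not kernel code) of accumulating a contiguous
 * delalloc run starting at *start. Returns the number of ranges folded
 * into [*start, *end]. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define SK_DELALLOC  (1 << 0)   /* hypothetical bit values */
#define SK_BOUNDARY  (1 << 1)

struct sk_state { uint64_t start, end; unsigned int state; };

static uint64_t sk_find_delalloc(const struct sk_state *v, size_t n,
                                 uint64_t *start, uint64_t *end, uint64_t max_bytes)
{
        uint64_t cur = *start, found = 0, total = 0;

        for (size_t i = 0; i < n; i++) {
                if (v[i].end < cur)
                        continue;                 /* entirely before our range */
                if (found && (v[i].start != cur || (v[i].state & SK_BOUNDARY)))
                        break;                    /* gap or boundary ends the run */
                if (!(v[i].state & SK_DELALLOC))
                        break;
                if (!found)
                        *start = v[i].start;
                found++;
                *end = v[i].end;
                cur = v[i].end + 1;
                total += v[i].end - v[i].start + 1;
                if (total >= max_bytes)
                        break;
        }
        return found;
}

int main(void)
{
        const struct sk_state tree[] = {
                { 0,     4095,  SK_DELALLOC },
                { 4096,  8191,  SK_DELALLOC },
                { 12288, 16383, SK_DELALLOC },   /* gap before this one */
        };
        uint64_t s = 0, e = 0;

        printf("found %llu range [%llu, %llu]\n",
               (unsigned long long)sk_find_delalloc(tree, 3, &s, &e, ~0ULL),
               (unsigned long long)s, (unsigned long long)e);  /* 2, [0, 8191] */
        return 0;
}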
1194 1194
1195 static noinline int __unlock_for_delalloc(struct inode *inode, 1195 static noinline int __unlock_for_delalloc(struct inode *inode,
1196 struct page *locked_page, 1196 struct page *locked_page,
1197 u64 start, u64 end) 1197 u64 start, u64 end)
1198 { 1198 {
1199 int ret; 1199 int ret;
1200 struct page *pages[16]; 1200 struct page *pages[16];
1201 unsigned long index = start >> PAGE_CACHE_SHIFT; 1201 unsigned long index = start >> PAGE_CACHE_SHIFT;
1202 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1202 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1203 unsigned long nr_pages = end_index - index + 1; 1203 unsigned long nr_pages = end_index - index + 1;
1204 int i; 1204 int i;
1205 1205
1206 if (index == locked_page->index && end_index == index) 1206 if (index == locked_page->index && end_index == index)
1207 return 0; 1207 return 0;
1208 1208
1209 while (nr_pages > 0) { 1209 while (nr_pages > 0) {
1210 ret = find_get_pages_contig(inode->i_mapping, index, 1210 ret = find_get_pages_contig(inode->i_mapping, index,
1211 min_t(unsigned long, nr_pages, 1211 min_t(unsigned long, nr_pages,
1212 ARRAY_SIZE(pages)), pages); 1212 ARRAY_SIZE(pages)), pages);
1213 for (i = 0; i < ret; i++) { 1213 for (i = 0; i < ret; i++) {
1214 if (pages[i] != locked_page) 1214 if (pages[i] != locked_page)
1215 unlock_page(pages[i]); 1215 unlock_page(pages[i]);
1216 page_cache_release(pages[i]); 1216 page_cache_release(pages[i]);
1217 } 1217 }
1218 nr_pages -= ret; 1218 nr_pages -= ret;
1219 index += ret; 1219 index += ret;
1220 cond_resched(); 1220 cond_resched();
1221 } 1221 }
1222 return 0; 1222 return 0;
1223 } 1223 }
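The byte-to-page-index arithmetic used by __unlock_for_delalloc() (and by the other page walkers in this file) is a shift per endpoint plus an inclusive count. A sketch, assuming 4K pages in place of PAGE_CACHE_SHIFT:

/* Standalone sketch (not kernel code) of the page index math. */
#include <stdio.h>
#include <stdint.h>

#define SK_PAGE_SHIFT 12   /* assume 4K pages for the sketch */

int main(void)
{
        uint64_t start = 5000, end = 20000;
        unsigned long index = start >> SK_PAGE_SHIFT;       /* first page touched */
        unsigned long end_index = end >> SK_PAGE_SHIFT;     /* last page touched */
        unsigned long nr_pages = end_index - index + 1;

        /* byte range [5000, 20000] covers pages 1..4 on 4K pages */
        printf("pages %lu..%lu (%lu total)\n", index, end_index, nr_pages);
        return 0;
}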
1224 1224
1225 static noinline int lock_delalloc_pages(struct inode *inode, 1225 static noinline int lock_delalloc_pages(struct inode *inode,
1226 struct page *locked_page, 1226 struct page *locked_page,
1227 u64 delalloc_start, 1227 u64 delalloc_start,
1228 u64 delalloc_end) 1228 u64 delalloc_end)
1229 { 1229 {
1230 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1230 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1231 unsigned long start_index = index; 1231 unsigned long start_index = index;
1232 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1232 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1233 unsigned long pages_locked = 0; 1233 unsigned long pages_locked = 0;
1234 struct page *pages[16]; 1234 struct page *pages[16];
1235 unsigned long nrpages; 1235 unsigned long nrpages;
1236 int ret; 1236 int ret;
1237 int i; 1237 int i;
1238 1238
1239 /* the caller is responsible for locking the start index */ 1239 /* the caller is responsible for locking the start index */
1240 if (index == locked_page->index && index == end_index) 1240 if (index == locked_page->index && index == end_index)
1241 return 0; 1241 return 0;
1242 1242
1243 /* skip the page at the start index */ 1243 /* skip the page at the start index */
1244 nrpages = end_index - index + 1; 1244 nrpages = end_index - index + 1;
1245 while (nrpages > 0) { 1245 while (nrpages > 0) {
1246 ret = find_get_pages_contig(inode->i_mapping, index, 1246 ret = find_get_pages_contig(inode->i_mapping, index,
1247 min_t(unsigned long, 1247 min_t(unsigned long,
1248 nrpages, ARRAY_SIZE(pages)), pages); 1248 nrpages, ARRAY_SIZE(pages)), pages);
1249 if (ret == 0) { 1249 if (ret == 0) {
1250 ret = -EAGAIN; 1250 ret = -EAGAIN;
1251 goto done; 1251 goto done;
1252 } 1252 }
1253 /* now we have an array of pages, lock them all */ 1253 /* now we have an array of pages, lock them all */
1254 for (i = 0; i < ret; i++) { 1254 for (i = 0; i < ret; i++) {
1255 /* 1255 /*
1256 * the caller is taking responsibility for 1256 * the caller is taking responsibility for
1257 * locked_page 1257 * locked_page
1258 */ 1258 */
1259 if (pages[i] != locked_page) { 1259 if (pages[i] != locked_page) {
1260 lock_page(pages[i]); 1260 lock_page(pages[i]);
1261 if (!PageDirty(pages[i]) || 1261 if (!PageDirty(pages[i]) ||
1262 pages[i]->mapping != inode->i_mapping) { 1262 pages[i]->mapping != inode->i_mapping) {
1263 ret = -EAGAIN; 1263 ret = -EAGAIN;
1264 unlock_page(pages[i]); 1264 unlock_page(pages[i]);
1265 page_cache_release(pages[i]); 1265 page_cache_release(pages[i]);
1266 goto done; 1266 goto done;
1267 } 1267 }
1268 } 1268 }
1269 page_cache_release(pages[i]); 1269 page_cache_release(pages[i]);
1270 pages_locked++; 1270 pages_locked++;
1271 } 1271 }
1272 nrpages -= ret; 1272 nrpages -= ret;
1273 index += ret; 1273 index += ret;
1274 cond_resched(); 1274 cond_resched();
1275 } 1275 }
1276 ret = 0; 1276 ret = 0;
1277 done: 1277 done:
1278 if (ret && pages_locked) { 1278 if (ret && pages_locked) {
1279 __unlock_for_delalloc(inode, locked_page, 1279 __unlock_for_delalloc(inode, locked_page,
1280 delalloc_start, 1280 delalloc_start,
1281 ((u64)(start_index + pages_locked - 1)) << 1281 ((u64)(start_index + pages_locked - 1)) <<
1282 PAGE_CACHE_SHIFT); 1282 PAGE_CACHE_SHIFT);
1283 } 1283 }
1284 return ret; 1284 return ret;
1285 } 1285 }
1286 1286
1287 /* 1287 /*
1288 * find a contiguous range of bytes in the file marked as delalloc, not 1288 * find a contiguous range of bytes in the file marked as delalloc, not
1289 * more than 'max_bytes'. start and end are used to return the range. 1289 * more than 'max_bytes'. start and end are used to return the range.
1290 * 1290 *
1291 * 1 is returned if we find something, 0 if nothing was in the tree 1291 * 1 is returned if we find something, 0 if nothing was in the tree
1292 */ 1292 */
1293 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1293 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1294 struct extent_io_tree *tree, 1294 struct extent_io_tree *tree,
1295 struct page *locked_page, 1295 struct page *locked_page,
1296 u64 *start, u64 *end, 1296 u64 *start, u64 *end,
1297 u64 max_bytes) 1297 u64 max_bytes)
1298 { 1298 {
1299 u64 delalloc_start; 1299 u64 delalloc_start;
1300 u64 delalloc_end; 1300 u64 delalloc_end;
1301 u64 found; 1301 u64 found;
1302 struct extent_state *cached_state = NULL; 1302 struct extent_state *cached_state = NULL;
1303 int ret; 1303 int ret;
1304 int loops = 0; 1304 int loops = 0;
1305 1305
1306 again: 1306 again:
1307 /* step one, find a bunch of delalloc bytes starting at start */ 1307 /* step one, find a bunch of delalloc bytes starting at start */
1308 delalloc_start = *start; 1308 delalloc_start = *start;
1309 delalloc_end = 0; 1309 delalloc_end = 0;
1310 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1310 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1311 max_bytes, &cached_state); 1311 max_bytes, &cached_state);
1312 if (!found || delalloc_end <= *start) { 1312 if (!found || delalloc_end <= *start) {
1313 *start = delalloc_start; 1313 *start = delalloc_start;
1314 *end = delalloc_end; 1314 *end = delalloc_end;
1315 free_extent_state(cached_state); 1315 free_extent_state(cached_state);
1316 return found; 1316 return found;
1317 } 1317 }
1318 1318
1319 /* 1319 /*
1320 * start comes from the offset of locked_page. We have to lock 1320 * start comes from the offset of locked_page. We have to lock
1321 * pages in order, so we can't process delalloc bytes before 1321 * pages in order, so we can't process delalloc bytes before
1322 * locked_page 1322 * locked_page
1323 */ 1323 */
1324 if (delalloc_start < *start) 1324 if (delalloc_start < *start)
1325 delalloc_start = *start; 1325 delalloc_start = *start;
1326 1326
1327 /* 1327 /*
1328 * make sure to limit the number of pages we try to lock down 1328 * make sure to limit the number of pages we try to lock down
1329 * if we're looping. 1329 * if we're looping.
1330 */ 1330 */
1331 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1331 if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1332 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1332 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1333 1333
1334 /* step two, lock all the pages after the page that has start */ 1334 /* step two, lock all the pages after the page that has start */
1335 ret = lock_delalloc_pages(inode, locked_page, 1335 ret = lock_delalloc_pages(inode, locked_page,
1336 delalloc_start, delalloc_end); 1336 delalloc_start, delalloc_end);
1337 if (ret == -EAGAIN) { 1337 if (ret == -EAGAIN) {
1338 /* some of the pages are gone, let's avoid looping by 1338 /* some of the pages are gone, let's avoid looping by
1339 * shortening the size of the delalloc range we're searching 1339 * shortening the size of the delalloc range we're searching
1340 */ 1340 */
1341 free_extent_state(cached_state); 1341 free_extent_state(cached_state);
1342 if (!loops) { 1342 if (!loops) {
1343 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1343 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1344 max_bytes = PAGE_CACHE_SIZE - offset; 1344 max_bytes = PAGE_CACHE_SIZE - offset;
1345 loops = 1; 1345 loops = 1;
1346 goto again; 1346 goto again;
1347 } else { 1347 } else {
1348 found = 0; 1348 found = 0;
1349 goto out_failed; 1349 goto out_failed;
1350 } 1350 }
1351 } 1351 }
1352 BUG_ON(ret); 1352 BUG_ON(ret);
1353 1353
1354 /* step three, lock the state bits for the whole range */ 1354 /* step three, lock the state bits for the whole range */
1355 lock_extent_bits(tree, delalloc_start, delalloc_end, 1355 lock_extent_bits(tree, delalloc_start, delalloc_end,
1356 0, &cached_state, GFP_NOFS); 1356 0, &cached_state, GFP_NOFS);
1357 1357
1358 /* then test to make sure it is all still delalloc */ 1358 /* then test to make sure it is all still delalloc */
1359 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1359 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1360 EXTENT_DELALLOC, 1, cached_state); 1360 EXTENT_DELALLOC, 1, cached_state);
1361 if (!ret) { 1361 if (!ret) {
1362 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1362 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1363 &cached_state, GFP_NOFS); 1363 &cached_state, GFP_NOFS);
1364 __unlock_for_delalloc(inode, locked_page, 1364 __unlock_for_delalloc(inode, locked_page,
1365 delalloc_start, delalloc_end); 1365 delalloc_start, delalloc_end);
1366 cond_resched(); 1366 cond_resched();
1367 goto again; 1367 goto again;
1368 } 1368 }
1369 free_extent_state(cached_state); 1369 free_extent_state(cached_state);
1370 *start = delalloc_start; 1370 *start = delalloc_start;
1371 *end = delalloc_end; 1371 *end = delalloc_end;
1372 out_failed: 1372 out_failed:
1373 return found; 1373 return found;
1374 } 1374 }
1375 1375
1376 int extent_clear_unlock_delalloc(struct inode *inode, 1376 int extent_clear_unlock_delalloc(struct inode *inode,
1377 struct extent_io_tree *tree, 1377 struct extent_io_tree *tree,
1378 u64 start, u64 end, struct page *locked_page, 1378 u64 start, u64 end, struct page *locked_page,
1379 unsigned long op) 1379 unsigned long op)
1380 { 1380 {
1381 int ret; 1381 int ret;
1382 struct page *pages[16]; 1382 struct page *pages[16];
1383 unsigned long index = start >> PAGE_CACHE_SHIFT; 1383 unsigned long index = start >> PAGE_CACHE_SHIFT;
1384 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1384 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1385 unsigned long nr_pages = end_index - index + 1; 1385 unsigned long nr_pages = end_index - index + 1;
1386 int i; 1386 int i;
1387 int clear_bits = 0; 1387 int clear_bits = 0;
1388 1388
1389 if (op & EXTENT_CLEAR_UNLOCK) 1389 if (op & EXTENT_CLEAR_UNLOCK)
1390 clear_bits |= EXTENT_LOCKED; 1390 clear_bits |= EXTENT_LOCKED;
1391 if (op & EXTENT_CLEAR_DIRTY) 1391 if (op & EXTENT_CLEAR_DIRTY)
1392 clear_bits |= EXTENT_DIRTY; 1392 clear_bits |= EXTENT_DIRTY;
1393 1393
1394 if (op & EXTENT_CLEAR_DELALLOC) 1394 if (op & EXTENT_CLEAR_DELALLOC)
1395 clear_bits |= EXTENT_DELALLOC; 1395 clear_bits |= EXTENT_DELALLOC;
1396 1396
1397 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1397 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1398 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1398 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1399 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1399 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1400 EXTENT_SET_PRIVATE2))) 1400 EXTENT_SET_PRIVATE2)))
1401 return 0; 1401 return 0;
1402 1402
1403 while (nr_pages > 0) { 1403 while (nr_pages > 0) {
1404 ret = find_get_pages_contig(inode->i_mapping, index, 1404 ret = find_get_pages_contig(inode->i_mapping, index,
1405 min_t(unsigned long, 1405 min_t(unsigned long,
1406 nr_pages, ARRAY_SIZE(pages)), pages); 1406 nr_pages, ARRAY_SIZE(pages)), pages);
1407 for (i = 0; i < ret; i++) { 1407 for (i = 0; i < ret; i++) {
1408 1408
1409 if (op & EXTENT_SET_PRIVATE2) 1409 if (op & EXTENT_SET_PRIVATE2)
1410 SetPagePrivate2(pages[i]); 1410 SetPagePrivate2(pages[i]);
1411 1411
1412 if (pages[i] == locked_page) { 1412 if (pages[i] == locked_page) {
1413 page_cache_release(pages[i]); 1413 page_cache_release(pages[i]);
1414 continue; 1414 continue;
1415 } 1415 }
1416 if (op & EXTENT_CLEAR_DIRTY) 1416 if (op & EXTENT_CLEAR_DIRTY)
1417 clear_page_dirty_for_io(pages[i]); 1417 clear_page_dirty_for_io(pages[i]);
1418 if (op & EXTENT_SET_WRITEBACK) 1418 if (op & EXTENT_SET_WRITEBACK)
1419 set_page_writeback(pages[i]); 1419 set_page_writeback(pages[i]);
1420 if (op & EXTENT_END_WRITEBACK) 1420 if (op & EXTENT_END_WRITEBACK)
1421 end_page_writeback(pages[i]); 1421 end_page_writeback(pages[i]);
1422 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1422 if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1423 unlock_page(pages[i]); 1423 unlock_page(pages[i]);
1424 page_cache_release(pages[i]); 1424 page_cache_release(pages[i]);
1425 } 1425 }
1426 nr_pages -= ret; 1426 nr_pages -= ret;
1427 index += ret; 1427 index += ret;
1428 cond_resched(); 1428 cond_resched();
1429 } 1429 }
1430 return 0; 1430 return 0;
1431 } 1431 }
1432 1432
1433 /* 1433 /*
1434 * count the number of bytes in the tree that have a given bit(s) 1434 * count the number of bytes in the tree that have a given bit(s)
1435 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1435 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1436 * cached. The total number found is returned. 1436 * cached. The total number found is returned.
1437 */ 1437 */
1438 u64 count_range_bits(struct extent_io_tree *tree, 1438 u64 count_range_bits(struct extent_io_tree *tree,
1439 u64 *start, u64 search_end, u64 max_bytes, 1439 u64 *start, u64 search_end, u64 max_bytes,
1440 unsigned long bits, int contig) 1440 unsigned long bits, int contig)
1441 { 1441 {
1442 struct rb_node *node; 1442 struct rb_node *node;
1443 struct extent_state *state; 1443 struct extent_state *state;
1444 u64 cur_start = *start; 1444 u64 cur_start = *start;
1445 u64 total_bytes = 0; 1445 u64 total_bytes = 0;
1446 u64 last = 0; 1446 u64 last = 0;
1447 int found = 0; 1447 int found = 0;
1448 1448
1449 if (search_end <= cur_start) { 1449 if (search_end <= cur_start) {
1450 WARN_ON(1); 1450 WARN_ON(1);
1451 return 0; 1451 return 0;
1452 } 1452 }
1453 1453
1454 spin_lock(&tree->lock); 1454 spin_lock(&tree->lock);
1455 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1455 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1456 total_bytes = tree->dirty_bytes; 1456 total_bytes = tree->dirty_bytes;
1457 goto out; 1457 goto out;
1458 } 1458 }
1459 /* 1459 /*
1460 * this search will find all the extents that end after 1460 * this search will find all the extents that end after
1461 * our range starts. 1461 * our range starts.
1462 */ 1462 */
1463 node = tree_search(tree, cur_start); 1463 node = tree_search(tree, cur_start);
1464 if (!node) 1464 if (!node)
1465 goto out; 1465 goto out;
1466 1466
1467 while (1) { 1467 while (1) {
1468 state = rb_entry(node, struct extent_state, rb_node); 1468 state = rb_entry(node, struct extent_state, rb_node);
1469 if (state->start > search_end) 1469 if (state->start > search_end)
1470 break; 1470 break;
1471 if (contig && found && state->start > last + 1) 1471 if (contig && found && state->start > last + 1)
1472 break; 1472 break;
1473 if (state->end >= cur_start && (state->state & bits) == bits) { 1473 if (state->end >= cur_start && (state->state & bits) == bits) {
1474 total_bytes += min(search_end, state->end) + 1 - 1474 total_bytes += min(search_end, state->end) + 1 -
1475 max(cur_start, state->start); 1475 max(cur_start, state->start);
1476 if (total_bytes >= max_bytes) 1476 if (total_bytes >= max_bytes)
1477 break; 1477 break;
1478 if (!found) { 1478 if (!found) {
1479 *start = max(cur_start, state->start); 1479 *start = max(cur_start, state->start);
1480 found = 1; 1480 found = 1;
1481 } 1481 }
1482 last = state->end; 1482 last = state->end;
1483 } else if (contig && found) { 1483 } else if (contig && found) {
1484 break; 1484 break;
1485 } 1485 }
1486 node = rb_next(node); 1486 node = rb_next(node);
1487 if (!node) 1487 if (!node)
1488 break; 1488 break;
1489 } 1489 }
1490 out: 1490 out:
1491 spin_unlock(&tree->lock); 1491 spin_unlock(&tree->lock);
1492 return total_bytes; 1492 return total_bytes;
1493 } 1493 }
1494 1494
1495 /* 1495 /*
1496 * set the private field for a given byte offset in the tree. If there isn't 1496 * set the private field for a given byte offset in the tree. If there isn't
1497 * an extent_state there already, this does nothing. 1497 * an extent_state there already, this does nothing.
1498 */ 1498 */
1499 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1499 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1500 { 1500 {
1501 struct rb_node *node; 1501 struct rb_node *node;
1502 struct extent_state *state; 1502 struct extent_state *state;
1503 int ret = 0; 1503 int ret = 0;
1504 1504
1505 spin_lock(&tree->lock); 1505 spin_lock(&tree->lock);
1506 /* 1506 /*
1507 * this search will find all the extents that end after 1507 * this search will find all the extents that end after
1508 * our range starts. 1508 * our range starts.
1509 */ 1509 */
1510 node = tree_search(tree, start); 1510 node = tree_search(tree, start);
1511 if (!node) { 1511 if (!node) {
1512 ret = -ENOENT; 1512 ret = -ENOENT;
1513 goto out; 1513 goto out;
1514 } 1514 }
1515 state = rb_entry(node, struct extent_state, rb_node); 1515 state = rb_entry(node, struct extent_state, rb_node);
1516 if (state->start != start) { 1516 if (state->start != start) {
1517 ret = -ENOENT; 1517 ret = -ENOENT;
1518 goto out; 1518 goto out;
1519 } 1519 }
1520 state->private = private; 1520 state->private = private;
1521 out: 1521 out:
1522 spin_unlock(&tree->lock); 1522 spin_unlock(&tree->lock);
1523 return ret; 1523 return ret;
1524 } 1524 }
1525 1525
1526 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1526 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1527 { 1527 {
1528 struct rb_node *node; 1528 struct rb_node *node;
1529 struct extent_state *state; 1529 struct extent_state *state;
1530 int ret = 0; 1530 int ret = 0;
1531 1531
1532 spin_lock(&tree->lock); 1532 spin_lock(&tree->lock);
1533 /* 1533 /*
1534 * this search will find all the extents that end after 1534 * this search will find all the extents that end after
1535 * our range starts. 1535 * our range starts.
1536 */ 1536 */
1537 node = tree_search(tree, start); 1537 node = tree_search(tree, start);
1538 if (!node) { 1538 if (!node) {
1539 ret = -ENOENT; 1539 ret = -ENOENT;
1540 goto out; 1540 goto out;
1541 } 1541 }
1542 state = rb_entry(node, struct extent_state, rb_node); 1542 state = rb_entry(node, struct extent_state, rb_node);
1543 if (state->start != start) { 1543 if (state->start != start) {
1544 ret = -ENOENT; 1544 ret = -ENOENT;
1545 goto out; 1545 goto out;
1546 } 1546 }
1547 *private = state->private; 1547 *private = state->private;
1548 out: 1548 out:
1549 spin_unlock(&tree->lock); 1549 spin_unlock(&tree->lock);
1550 return ret; 1550 return ret;
1551 } 1551 }
1552 1552
1553 /* 1553 /*
1554 * searches a range in the state tree for a given mask. 1554 * searches a range in the state tree for a given mask.
1555 * If 'filled' == 1, this returns 1 only if every extent in the tree 1555 * If 'filled' == 1, this returns 1 only if every extent in the tree
1556 * has the bits set. Otherwise, 1 is returned if any bit in the 1556 * has the bits set. Otherwise, 1 is returned if any bit in the
1557 * range is found set. 1557 * range is found set.
1558 */ 1558 */
1559 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1559 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1560 int bits, int filled, struct extent_state *cached) 1560 int bits, int filled, struct extent_state *cached)
1561 { 1561 {
1562 struct extent_state *state = NULL; 1562 struct extent_state *state = NULL;
1563 struct rb_node *node; 1563 struct rb_node *node;
1564 int bitset = 0; 1564 int bitset = 0;
1565 1565
1566 spin_lock(&tree->lock); 1566 spin_lock(&tree->lock);
1567 if (cached && cached->tree && cached->start == start) 1567 if (cached && cached->tree && cached->start == start)
1568 node = &cached->rb_node; 1568 node = &cached->rb_node;
1569 else 1569 else
1570 node = tree_search(tree, start); 1570 node = tree_search(tree, start);
1571 while (node && start <= end) { 1571 while (node && start <= end) {
1572 state = rb_entry(node, struct extent_state, rb_node); 1572 state = rb_entry(node, struct extent_state, rb_node);
1573 1573
1574 if (filled && state->start > start) { 1574 if (filled && state->start > start) {
1575 bitset = 0; 1575 bitset = 0;
1576 break; 1576 break;
1577 } 1577 }
1578 1578
1579 if (state->start > end) 1579 if (state->start > end)
1580 break; 1580 break;
1581 1581
1582 if (state->state & bits) { 1582 if (state->state & bits) {
1583 bitset = 1; 1583 bitset = 1;
1584 if (!filled) 1584 if (!filled)
1585 break; 1585 break;
1586 } else if (filled) { 1586 } else if (filled) {
1587 bitset = 0; 1587 bitset = 0;
1588 break; 1588 break;
1589 } 1589 }
1590 1590
1591 if (state->end == (u64)-1) 1591 if (state->end == (u64)-1)
1592 break; 1592 break;
1593 1593
1594 start = state->end + 1; 1594 start = state->end + 1;
1595 if (start > end) 1595 if (start > end)
1596 break; 1596 break;
1597 node = rb_next(node); 1597 node = rb_next(node);
1598 if (!node) { 1598 if (!node) {
1599 if (filled) 1599 if (filled)
1600 bitset = 0; 1600 bitset = 0;
1601 break; 1601 break;
1602 } 1602 }
1603 } 1603 }
1604 spin_unlock(&tree->lock); 1604 spin_unlock(&tree->lock);
1605 return bitset; 1605 return bitset;
1606 } 1606 }
1607 1607
1608 /* 1608 /*
1609 * helper function to set a given page up to date if all the 1609 * helper function to set a given page up to date if all the
1610 * extents in the tree for that page are up to date 1610 * extents in the tree for that page are up to date
1611 */ 1611 */
1612 static int check_page_uptodate(struct extent_io_tree *tree, 1612 static int check_page_uptodate(struct extent_io_tree *tree,
1613 struct page *page) 1613 struct page *page)
1614 { 1614 {
1615 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1615 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1616 u64 end = start + PAGE_CACHE_SIZE - 1; 1616 u64 end = start + PAGE_CACHE_SIZE - 1;
1617 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1617 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1618 SetPageUptodate(page); 1618 SetPageUptodate(page);
1619 return 0; 1619 return 0;
1620 } 1620 }
1621 1621
1622 /* 1622 /*
1623 * helper function to unlock a page if all the extents in the tree 1623 * helper function to unlock a page if all the extents in the tree
1624 * for that page are unlocked 1624 * for that page are unlocked
1625 */ 1625 */
1626 static int check_page_locked(struct extent_io_tree *tree, 1626 static int check_page_locked(struct extent_io_tree *tree,
1627 struct page *page) 1627 struct page *page)
1628 { 1628 {
1629 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1629 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1630 u64 end = start + PAGE_CACHE_SIZE - 1; 1630 u64 end = start + PAGE_CACHE_SIZE - 1;
1631 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1631 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1632 unlock_page(page); 1632 unlock_page(page);
1633 return 0; 1633 return 0;
1634 } 1634 }
1635 1635
1636 /* 1636 /*
1637 * helper function to end page writeback if all the extents 1637 * helper function to end page writeback if all the extents
1638 * in the tree for that page are done with writeback 1638 * in the tree for that page are done with writeback
1639 */ 1639 */
1640 static int check_page_writeback(struct extent_io_tree *tree, 1640 static int check_page_writeback(struct extent_io_tree *tree,
1641 struct page *page) 1641 struct page *page)
1642 { 1642 {
1643 end_page_writeback(page); 1643 end_page_writeback(page);
1644 return 0; 1644 return 0;
1645 } 1645 }
1646 1646
1647 /* lots and lots of room for performance fixes in the end_bio funcs */ 1647 /* lots and lots of room for performance fixes in the end_bio funcs */
1648 1648
1649 /* 1649 /*
1650 * after a writepage IO is done, we need to: 1650 * after a writepage IO is done, we need to:
1651 * clear the uptodate bits on error 1651 * clear the uptodate bits on error
1652 * clear the writeback bits in the extent tree for this IO 1652 * clear the writeback bits in the extent tree for this IO
1653 * end_page_writeback if the page has no more pending IO 1653 * end_page_writeback if the page has no more pending IO
1654 * 1654 *
1655 * Scheduling is not allowed, so the extent state tree is expected 1655 * Scheduling is not allowed, so the extent state tree is expected
1656 * to have one and only one object corresponding to this IO. 1656 * to have one and only one object corresponding to this IO.
1657 */ 1657 */
1658 static void end_bio_extent_writepage(struct bio *bio, int err) 1658 static void end_bio_extent_writepage(struct bio *bio, int err)
1659 { 1659 {
1660 int uptodate = err == 0; 1660 int uptodate = err == 0;
1661 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1661 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1662 struct extent_io_tree *tree; 1662 struct extent_io_tree *tree;
1663 u64 start; 1663 u64 start;
1664 u64 end; 1664 u64 end;
1665 int whole_page; 1665 int whole_page;
1666 int ret; 1666 int ret;
1667 1667
1668 do { 1668 do {
1669 struct page *page = bvec->bv_page; 1669 struct page *page = bvec->bv_page;
1670 tree = &BTRFS_I(page->mapping->host)->io_tree; 1670 tree = &BTRFS_I(page->mapping->host)->io_tree;
1671 1671
1672 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1672 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1673 bvec->bv_offset; 1673 bvec->bv_offset;
1674 end = start + bvec->bv_len - 1; 1674 end = start + bvec->bv_len - 1;
1675 1675
1676 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1676 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1677 whole_page = 1; 1677 whole_page = 1;
1678 else 1678 else
1679 whole_page = 0; 1679 whole_page = 0;
1680 1680
1681 if (--bvec >= bio->bi_io_vec) 1681 if (--bvec >= bio->bi_io_vec)
1682 prefetchw(&bvec->bv_page->flags); 1682 prefetchw(&bvec->bv_page->flags);
1683 if (tree->ops && tree->ops->writepage_end_io_hook) { 1683 if (tree->ops && tree->ops->writepage_end_io_hook) {
1684 ret = tree->ops->writepage_end_io_hook(page, start, 1684 ret = tree->ops->writepage_end_io_hook(page, start,
1685 end, NULL, uptodate); 1685 end, NULL, uptodate);
1686 if (ret) 1686 if (ret)
1687 uptodate = 0; 1687 uptodate = 0;
1688 } 1688 }
1689 1689
1690 if (!uptodate && tree->ops && 1690 if (!uptodate && tree->ops &&
1691 tree->ops->writepage_io_failed_hook) { 1691 tree->ops->writepage_io_failed_hook) {
1692 ret = tree->ops->writepage_io_failed_hook(bio, page, 1692 ret = tree->ops->writepage_io_failed_hook(bio, page,
1693 start, end, NULL); 1693 start, end, NULL);
1694 if (ret == 0) { 1694 if (ret == 0) {
1695 uptodate = (err == 0); 1695 uptodate = (err == 0);
1696 continue; 1696 continue;
1697 } 1697 }
1698 } 1698 }
1699 1699
1700 if (!uptodate) { 1700 if (!uptodate) {
1701 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); 1701 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1702 ClearPageUptodate(page); 1702 ClearPageUptodate(page);
1703 SetPageError(page); 1703 SetPageError(page);
1704 } 1704 }
1705 1705
1706 if (whole_page) 1706 if (whole_page)
1707 end_page_writeback(page); 1707 end_page_writeback(page);
1708 else 1708 else
1709 check_page_writeback(tree, page); 1709 check_page_writeback(tree, page);
1710 } while (bvec >= bio->bi_io_vec); 1710 } while (bvec >= bio->bi_io_vec);
1711 1711
1712 bio_put(bio); 1712 bio_put(bio);
1713 } 1713 }
1714 1714
1715 /* 1715 /*
1716 * after a readpage IO is done, we need to: 1716 * after a readpage IO is done, we need to:
1717 * clear the uptodate bits on error 1717 * clear the uptodate bits on error
1718 * set the uptodate bits if things worked 1718 * set the uptodate bits if things worked
1719 * set the page up to date if all extents in the tree are uptodate 1719 * set the page up to date if all extents in the tree are uptodate
1720 * clear the lock bit in the extent tree 1720 * clear the lock bit in the extent tree
1721 * unlock the page if there are no other extents locked for it 1721 * unlock the page if there are no other extents locked for it
1722 * 1722 *
1723 * Scheduling is not allowed, so the extent state tree is expected 1723 * Scheduling is not allowed, so the extent state tree is expected
1724 * to have one and only one object corresponding to this IO. 1724 * to have one and only one object corresponding to this IO.
1725 */ 1725 */
1726 static void end_bio_extent_readpage(struct bio *bio, int err) 1726 static void end_bio_extent_readpage(struct bio *bio, int err)
1727 { 1727 {
1728 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1728 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1729 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 1729 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1730 struct bio_vec *bvec = bio->bi_io_vec; 1730 struct bio_vec *bvec = bio->bi_io_vec;
1731 struct extent_io_tree *tree; 1731 struct extent_io_tree *tree;
1732 u64 start; 1732 u64 start;
1733 u64 end; 1733 u64 end;
1734 int whole_page; 1734 int whole_page;
1735 int ret; 1735 int ret;
1736 1736
1737 if (err) 1737 if (err)
1738 uptodate = 0; 1738 uptodate = 0;
1739 1739
1740 do { 1740 do {
1741 struct page *page = bvec->bv_page; 1741 struct page *page = bvec->bv_page;
1742 struct extent_state *cached = NULL; 1742 struct extent_state *cached = NULL;
1743 struct extent_state *state; 1743 struct extent_state *state;
1744 1744
1745 tree = &BTRFS_I(page->mapping->host)->io_tree; 1745 tree = &BTRFS_I(page->mapping->host)->io_tree;
1746 1746
1747 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1747 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1748 bvec->bv_offset; 1748 bvec->bv_offset;
1749 end = start + bvec->bv_len - 1; 1749 end = start + bvec->bv_len - 1;
1750 1750
1751 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1751 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1752 whole_page = 1; 1752 whole_page = 1;
1753 else 1753 else
1754 whole_page = 0; 1754 whole_page = 0;
1755 1755
1756 if (++bvec <= bvec_end) 1756 if (++bvec <= bvec_end)
1757 prefetchw(&bvec->bv_page->flags); 1757 prefetchw(&bvec->bv_page->flags);
1758 1758
1759 spin_lock(&tree->lock); 1759 spin_lock(&tree->lock);
1760 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 1760 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1761 if (state && state->start == start) { 1761 if (state && state->start == start) {
1762 /* 1762 /*
1763 * take a reference on the state, unlock will drop 1763 * take a reference on the state, unlock will drop
1764 * the ref 1764 * the ref
1765 */ 1765 */
1766 cache_state(state, &cached); 1766 cache_state(state, &cached);
1767 } 1767 }
1768 spin_unlock(&tree->lock); 1768 spin_unlock(&tree->lock);
1769 1769
1770 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1770 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1771 ret = tree->ops->readpage_end_io_hook(page, start, end, 1771 ret = tree->ops->readpage_end_io_hook(page, start, end,
1772 state); 1772 state);
1773 if (ret) 1773 if (ret)
1774 uptodate = 0; 1774 uptodate = 0;
1775 } 1775 }
1776 if (!uptodate && tree->ops && 1776 if (!uptodate && tree->ops &&
1777 tree->ops->readpage_io_failed_hook) { 1777 tree->ops->readpage_io_failed_hook) {
1778 ret = tree->ops->readpage_io_failed_hook(bio, page, 1778 ret = tree->ops->readpage_io_failed_hook(bio, page,
1779 start, end, NULL); 1779 start, end, NULL);
1780 if (ret == 0) { 1780 if (ret == 0) {
1781 uptodate = 1781 uptodate =
1782 test_bit(BIO_UPTODATE, &bio->bi_flags); 1782 test_bit(BIO_UPTODATE, &bio->bi_flags);
1783 if (err) 1783 if (err)
1784 uptodate = 0; 1784 uptodate = 0;
1785 uncache_state(&cached); 1785 uncache_state(&cached);
1786 continue; 1786 continue;
1787 } 1787 }
1788 } 1788 }
1789 1789
1790 if (uptodate) { 1790 if (uptodate) {
1791 set_extent_uptodate(tree, start, end, &cached, 1791 set_extent_uptodate(tree, start, end, &cached,
1792 GFP_ATOMIC); 1792 GFP_ATOMIC);
1793 } 1793 }
1794 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 1794 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1795 1795
1796 if (whole_page) { 1796 if (whole_page) {
1797 if (uptodate) { 1797 if (uptodate) {
1798 SetPageUptodate(page); 1798 SetPageUptodate(page);
1799 } else { 1799 } else {
1800 ClearPageUptodate(page); 1800 ClearPageUptodate(page);
1801 SetPageError(page); 1801 SetPageError(page);
1802 } 1802 }
1803 unlock_page(page); 1803 unlock_page(page);
1804 } else { 1804 } else {
1805 if (uptodate) { 1805 if (uptodate) {
1806 check_page_uptodate(tree, page); 1806 check_page_uptodate(tree, page);
1807 } else { 1807 } else {
1808 ClearPageUptodate(page); 1808 ClearPageUptodate(page);
1809 SetPageError(page); 1809 SetPageError(page);
1810 } 1810 }
1811 check_page_locked(tree, page); 1811 check_page_locked(tree, page);
1812 } 1812 }
1813 } while (bvec <= bvec_end); 1813 } while (bvec <= bvec_end);
1814 1814
1815 bio_put(bio); 1815 bio_put(bio);
1816 } 1816 }
1817 1817
1818 struct bio * 1818 struct bio *
1819 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1819 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1820 gfp_t gfp_flags) 1820 gfp_t gfp_flags)
1821 { 1821 {
1822 struct bio *bio; 1822 struct bio *bio;
1823 1823
1824 bio = bio_alloc(gfp_flags, nr_vecs); 1824 bio = bio_alloc(gfp_flags, nr_vecs);
1825 1825
1826 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 1826 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1827 while (!bio && (nr_vecs /= 2)) 1827 while (!bio && (nr_vecs /= 2))
1828 bio = bio_alloc(gfp_flags, nr_vecs); 1828 bio = bio_alloc(gfp_flags, nr_vecs);
1829 } 1829 }
1830 1830
1831 if (bio) { 1831 if (bio) {
1832 bio->bi_size = 0; 1832 bio->bi_size = 0;
1833 bio->bi_bdev = bdev; 1833 bio->bi_bdev = bdev;
1834 bio->bi_sector = first_sector; 1834 bio->bi_sector = first_sector;
1835 } 1835 }
1836 return bio; 1836 return bio;
1837 } 1837 }
1838 1838
1839 static int submit_one_bio(int rw, struct bio *bio, int mirror_num, 1839 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1840 unsigned long bio_flags) 1840 unsigned long bio_flags)
1841 { 1841 {
1842 int ret = 0; 1842 int ret = 0;
1843 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1843 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1844 struct page *page = bvec->bv_page; 1844 struct page *page = bvec->bv_page;
1845 struct extent_io_tree *tree = bio->bi_private; 1845 struct extent_io_tree *tree = bio->bi_private;
1846 u64 start; 1846 u64 start;
1847 1847
1848 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1848 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1849 1849
1850 bio->bi_private = NULL; 1850 bio->bi_private = NULL;
1851 1851
1852 bio_get(bio); 1852 bio_get(bio);
1853 1853
1854 if (tree->ops && tree->ops->submit_bio_hook) 1854 if (tree->ops && tree->ops->submit_bio_hook)
1855 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1855 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856 mirror_num, bio_flags, start); 1856 mirror_num, bio_flags, start);
1857 else 1857 else
1858 submit_bio(rw, bio); 1858 submit_bio(rw, bio);
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1859 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP; 1860 ret = -EOPNOTSUPP;
1861 bio_put(bio); 1861 bio_put(bio);
1862 return ret; 1862 return ret;
1863 } 1863 }
1864 1864
1865 static int submit_extent_page(int rw, struct extent_io_tree *tree, 1865 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866 struct page *page, sector_t sector, 1866 struct page *page, sector_t sector,
1867 size_t size, unsigned long offset, 1867 size_t size, unsigned long offset,
1868 struct block_device *bdev, 1868 struct block_device *bdev,
1869 struct bio **bio_ret, 1869 struct bio **bio_ret,
1870 unsigned long max_pages, 1870 unsigned long max_pages,
1871 bio_end_io_t end_io_func, 1871 bio_end_io_t end_io_func,
1872 int mirror_num, 1872 int mirror_num,
1873 unsigned long prev_bio_flags, 1873 unsigned long prev_bio_flags,
1874 unsigned long bio_flags) 1874 unsigned long bio_flags)
1875 { 1875 {
1876 int ret = 0; 1876 int ret = 0;
1877 struct bio *bio; 1877 struct bio *bio;
1878 int nr; 1878 int nr;
1879 int contig = 0; 1879 int contig = 0;
1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883 1883
1884 if (bio_ret && *bio_ret) { 1884 if (bio_ret && *bio_ret) {
1885 bio = *bio_ret; 1885 bio = *bio_ret;
1886 if (old_compressed) 1886 if (old_compressed)
1887 contig = bio->bi_sector == sector; 1887 contig = bio->bi_sector == sector;
1888 else 1888 else
1889 contig = bio->bi_sector + (bio->bi_size >> 9) == 1889 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890 sector; 1890 sector;
1891 1891
1892 if (prev_bio_flags != bio_flags || !contig || 1892 if (prev_bio_flags != bio_flags || !contig ||
1893 (tree->ops && tree->ops->merge_bio_hook && 1893 (tree->ops && tree->ops->merge_bio_hook &&
1894 tree->ops->merge_bio_hook(page, offset, page_size, bio, 1894 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895 bio_flags)) || 1895 bio_flags)) ||
1896 bio_add_page(bio, page, page_size, offset) < page_size) { 1896 bio_add_page(bio, page, page_size, offset) < page_size) {
1897 ret = submit_one_bio(rw, bio, mirror_num, 1897 ret = submit_one_bio(rw, bio, mirror_num,
1898 prev_bio_flags); 1898 prev_bio_flags);
1899 bio = NULL; 1899 bio = NULL;
1900 } else { 1900 } else {
1901 return 0; 1901 return 0;
1902 } 1902 }
1903 } 1903 }
1904 if (this_compressed) 1904 if (this_compressed)
1905 nr = BIO_MAX_PAGES; 1905 nr = BIO_MAX_PAGES;
1906 else 1906 else
1907 nr = bio_get_nr_vecs(bdev); 1907 nr = bio_get_nr_vecs(bdev);
1908 1908
1909 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1909 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910 if (!bio) 1910 if (!bio)
1911 return -ENOMEM; 1911 return -ENOMEM;
1912 1912
1913 bio_add_page(bio, page, page_size, offset); 1913 bio_add_page(bio, page, page_size, offset);
1914 bio->bi_end_io = end_io_func; 1914 bio->bi_end_io = end_io_func;
1915 bio->bi_private = tree; 1915 bio->bi_private = tree;
1916 1916
1917 if (bio_ret) 1917 if (bio_ret)
1918 *bio_ret = bio; 1918 *bio_ret = bio;
1919 else 1919 else
1920 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 1920 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1921 1921
1922 return ret; 1922 return ret;
1923 } 1923 }
1924 1924
1925 void set_page_extent_mapped(struct page *page) 1925 void set_page_extent_mapped(struct page *page)
1926 { 1926 {
1927 if (!PagePrivate(page)) { 1927 if (!PagePrivate(page)) {
1928 SetPagePrivate(page); 1928 SetPagePrivate(page);
1929 page_cache_get(page); 1929 page_cache_get(page);
1930 set_page_private(page, EXTENT_PAGE_PRIVATE); 1930 set_page_private(page, EXTENT_PAGE_PRIVATE);
1931 } 1931 }
1932 } 1932 }
1933 1933
1934 static void set_page_extent_head(struct page *page, unsigned long len) 1934 static void set_page_extent_head(struct page *page, unsigned long len)
1935 { 1935 {
1936 WARN_ON(!PagePrivate(page)); 1936 WARN_ON(!PagePrivate(page));
1937 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1937 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1938 } 1938 }
1939 1939
1940 /* 1940 /*
1941 * basic readpage implementation. Locked extent state structs are inserted 1941 * basic readpage implementation. Locked extent state structs are inserted
1942 * into the tree that are removed when the IO is done (by the end_io 1942 * into the tree that are removed when the IO is done (by the end_io
1943 * handlers) 1943 * handlers)
1944 */ 1944 */
1945 static int __extent_read_full_page(struct extent_io_tree *tree, 1945 static int __extent_read_full_page(struct extent_io_tree *tree,
1946 struct page *page, 1946 struct page *page,
1947 get_extent_t *get_extent, 1947 get_extent_t *get_extent,
1948 struct bio **bio, int mirror_num, 1948 struct bio **bio, int mirror_num,
1949 unsigned long *bio_flags) 1949 unsigned long *bio_flags)
1950 { 1950 {
1951 struct inode *inode = page->mapping->host; 1951 struct inode *inode = page->mapping->host;
1952 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1952 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1953 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1953 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1954 u64 end; 1954 u64 end;
1955 u64 cur = start; 1955 u64 cur = start;
1956 u64 extent_offset; 1956 u64 extent_offset;
1957 u64 last_byte = i_size_read(inode); 1957 u64 last_byte = i_size_read(inode);
1958 u64 block_start; 1958 u64 block_start;
1959 u64 cur_end; 1959 u64 cur_end;
1960 sector_t sector; 1960 sector_t sector;
1961 struct extent_map *em; 1961 struct extent_map *em;
1962 struct block_device *bdev; 1962 struct block_device *bdev;
1963 struct btrfs_ordered_extent *ordered; 1963 struct btrfs_ordered_extent *ordered;
1964 int ret; 1964 int ret;
1965 int nr = 0; 1965 int nr = 0;
1966 size_t pg_offset = 0; 1966 size_t pg_offset = 0;
1967 size_t iosize; 1967 size_t iosize;
1968 size_t disk_io_size; 1968 size_t disk_io_size;
1969 size_t blocksize = inode->i_sb->s_blocksize; 1969 size_t blocksize = inode->i_sb->s_blocksize;
1970 unsigned long this_bio_flag = 0; 1970 unsigned long this_bio_flag = 0;
1971 1971
1972 set_page_extent_mapped(page); 1972 set_page_extent_mapped(page);
1973 1973
1974 if (!PageUptodate(page)) { 1974 if (!PageUptodate(page)) {
1975 if (cleancache_get_page(page) == 0) { 1975 if (cleancache_get_page(page) == 0) {
1976 BUG_ON(blocksize != PAGE_SIZE); 1976 BUG_ON(blocksize != PAGE_SIZE);
1977 goto out; 1977 goto out;
1978 } 1978 }
1979 } 1979 }
1980 1980
1981 end = page_end; 1981 end = page_end;
1982 while (1) { 1982 while (1) {
1983 lock_extent(tree, start, end, GFP_NOFS); 1983 lock_extent(tree, start, end, GFP_NOFS);
1984 ordered = btrfs_lookup_ordered_extent(inode, start); 1984 ordered = btrfs_lookup_ordered_extent(inode, start);
1985 if (!ordered) 1985 if (!ordered)
1986 break; 1986 break;
1987 unlock_extent(tree, start, end, GFP_NOFS); 1987 unlock_extent(tree, start, end, GFP_NOFS);
1988 btrfs_start_ordered_extent(inode, ordered, 1); 1988 btrfs_start_ordered_extent(inode, ordered, 1);
1989 btrfs_put_ordered_extent(ordered); 1989 btrfs_put_ordered_extent(ordered);
1990 } 1990 }
1991 1991
1992 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 1992 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1993 char *userpage; 1993 char *userpage;
1994 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 1994 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1995 1995
1996 if (zero_offset) { 1996 if (zero_offset) {
1997 iosize = PAGE_CACHE_SIZE - zero_offset; 1997 iosize = PAGE_CACHE_SIZE - zero_offset;
1998 userpage = kmap_atomic(page, KM_USER0); 1998 userpage = kmap_atomic(page, KM_USER0);
1999 memset(userpage + zero_offset, 0, iosize); 1999 memset(userpage + zero_offset, 0, iosize);
2000 flush_dcache_page(page); 2000 flush_dcache_page(page);
2001 kunmap_atomic(userpage, KM_USER0); 2001 kunmap_atomic(userpage, KM_USER0);
2002 } 2002 }
2003 } 2003 }
2004 while (cur <= end) { 2004 while (cur <= end) {
2005 if (cur >= last_byte) { 2005 if (cur >= last_byte) {
2006 char *userpage; 2006 char *userpage;
2007 struct extent_state *cached = NULL; 2007 struct extent_state *cached = NULL;
2008 2008
2009 iosize = PAGE_CACHE_SIZE - pg_offset; 2009 iosize = PAGE_CACHE_SIZE - pg_offset;
2010 userpage = kmap_atomic(page, KM_USER0); 2010 userpage = kmap_atomic(page, KM_USER0);
2011 memset(userpage + pg_offset, 0, iosize); 2011 memset(userpage + pg_offset, 0, iosize);
2012 flush_dcache_page(page); 2012 flush_dcache_page(page);
2013 kunmap_atomic(userpage, KM_USER0); 2013 kunmap_atomic(userpage, KM_USER0);
2014 set_extent_uptodate(tree, cur, cur + iosize - 1, 2014 set_extent_uptodate(tree, cur, cur + iosize - 1,
2015 &cached, GFP_NOFS); 2015 &cached, GFP_NOFS);
2016 unlock_extent_cached(tree, cur, cur + iosize - 1, 2016 unlock_extent_cached(tree, cur, cur + iosize - 1,
2017 &cached, GFP_NOFS); 2017 &cached, GFP_NOFS);
2018 break; 2018 break;
2019 } 2019 }
2020 em = get_extent(inode, page, pg_offset, cur, 2020 em = get_extent(inode, page, pg_offset, cur,
2021 end - cur + 1, 0); 2021 end - cur + 1, 0);
2022 if (IS_ERR_OR_NULL(em)) { 2022 if (IS_ERR_OR_NULL(em)) {
2023 SetPageError(page); 2023 SetPageError(page);
2024 unlock_extent(tree, cur, end, GFP_NOFS); 2024 unlock_extent(tree, cur, end, GFP_NOFS);
2025 break; 2025 break;
2026 } 2026 }
2027 extent_offset = cur - em->start; 2027 extent_offset = cur - em->start;
2028 BUG_ON(extent_map_end(em) <= cur); 2028 BUG_ON(extent_map_end(em) <= cur);
2029 BUG_ON(end < cur); 2029 BUG_ON(end < cur);
2030 2030
2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2032 this_bio_flag = EXTENT_BIO_COMPRESSED; 2032 this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag, 2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type); 2034 em->compress_type);
2035 } 2035 }
2036 2036
2037 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2037 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2038 cur_end = min(extent_map_end(em) - 1, end); 2038 cur_end = min(extent_map_end(em) - 1, end);
2039 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2039 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2040 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2040 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2041 disk_io_size = em->block_len; 2041 disk_io_size = em->block_len;
2042 sector = em->block_start >> 9; 2042 sector = em->block_start >> 9;
2043 } else { 2043 } else {
2044 sector = (em->block_start + extent_offset) >> 9; 2044 sector = (em->block_start + extent_offset) >> 9;
2045 disk_io_size = iosize; 2045 disk_io_size = iosize;
2046 } 2046 }
2047 bdev = em->bdev; 2047 bdev = em->bdev;
2048 block_start = em->block_start; 2048 block_start = em->block_start;
2049 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2049 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2050 block_start = EXTENT_MAP_HOLE; 2050 block_start = EXTENT_MAP_HOLE;
2051 free_extent_map(em); 2051 free_extent_map(em);
2052 em = NULL; 2052 em = NULL;
2053 2053
2054 /* we've found a hole, just zero and go on */ 2054 /* we've found a hole, just zero and go on */
2055 if (block_start == EXTENT_MAP_HOLE) { 2055 if (block_start == EXTENT_MAP_HOLE) {
2056 char *userpage; 2056 char *userpage;
2057 struct extent_state *cached = NULL; 2057 struct extent_state *cached = NULL;
2058 2058
2059 userpage = kmap_atomic(page, KM_USER0); 2059 userpage = kmap_atomic(page, KM_USER0);
2060 memset(userpage + pg_offset, 0, iosize); 2060 memset(userpage + pg_offset, 0, iosize);
2061 flush_dcache_page(page); 2061 flush_dcache_page(page);
2062 kunmap_atomic(userpage, KM_USER0); 2062 kunmap_atomic(userpage, KM_USER0);
2063 2063
2064 set_extent_uptodate(tree, cur, cur + iosize - 1, 2064 set_extent_uptodate(tree, cur, cur + iosize - 1,
2065 &cached, GFP_NOFS); 2065 &cached, GFP_NOFS);
2066 unlock_extent_cached(tree, cur, cur + iosize - 1, 2066 unlock_extent_cached(tree, cur, cur + iosize - 1,
2067 &cached, GFP_NOFS); 2067 &cached, GFP_NOFS);
2068 cur = cur + iosize; 2068 cur = cur + iosize;
2069 pg_offset += iosize; 2069 pg_offset += iosize;
2070 continue; 2070 continue;
2071 } 2071 }
2072 /* the get_extent function already copied into the page */ 2072 /* the get_extent function already copied into the page */
2073 if (test_range_bit(tree, cur, cur_end, 2073 if (test_range_bit(tree, cur, cur_end,
2074 EXTENT_UPTODATE, 1, NULL)) { 2074 EXTENT_UPTODATE, 1, NULL)) {
2075 check_page_uptodate(tree, page); 2075 check_page_uptodate(tree, page);
2076 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2076 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2077 cur = cur + iosize; 2077 cur = cur + iosize;
2078 pg_offset += iosize; 2078 pg_offset += iosize;
2079 continue; 2079 continue;
2080 } 2080 }
2081 /* we have an inline extent but it didn't get marked up 2081 /* we have an inline extent but it didn't get marked up
2082 * to date. Error out 2082 * to date. Error out
2083 */ 2083 */
2084 if (block_start == EXTENT_MAP_INLINE) { 2084 if (block_start == EXTENT_MAP_INLINE) {
2085 SetPageError(page); 2085 SetPageError(page);
2086 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2086 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2087 cur = cur + iosize; 2087 cur = cur + iosize;
2088 pg_offset += iosize; 2088 pg_offset += iosize;
2089 continue; 2089 continue;
2090 } 2090 }
2091 2091
2092 ret = 0; 2092 ret = 0;
2093 if (tree->ops && tree->ops->readpage_io_hook) { 2093 if (tree->ops && tree->ops->readpage_io_hook) {
2094 ret = tree->ops->readpage_io_hook(page, cur, 2094 ret = tree->ops->readpage_io_hook(page, cur,
2095 cur + iosize - 1); 2095 cur + iosize - 1);
2096 } 2096 }
2097 if (!ret) { 2097 if (!ret) {
2098 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2098 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2099 pnr -= page->index; 2099 pnr -= page->index;
2100 ret = submit_extent_page(READ, tree, page, 2100 ret = submit_extent_page(READ, tree, page,
2101 sector, disk_io_size, pg_offset, 2101 sector, disk_io_size, pg_offset,
2102 bdev, bio, pnr, 2102 bdev, bio, pnr,
2103 end_bio_extent_readpage, mirror_num, 2103 end_bio_extent_readpage, mirror_num,
2104 *bio_flags, 2104 *bio_flags,
2105 this_bio_flag); 2105 this_bio_flag);
2106 nr++; 2106 nr++;
2107 *bio_flags = this_bio_flag; 2107 *bio_flags = this_bio_flag;
2108 } 2108 }
2109 if (ret) 2109 if (ret)
2110 SetPageError(page); 2110 SetPageError(page);
2111 cur = cur + iosize; 2111 cur = cur + iosize;
2112 pg_offset += iosize; 2112 pg_offset += iosize;
2113 } 2113 }
2114 out: 2114 out:
2115 if (!nr) { 2115 if (!nr) {
2116 if (!PageError(page)) 2116 if (!PageError(page))
2117 SetPageUptodate(page); 2117 SetPageUptodate(page);
2118 unlock_page(page); 2118 unlock_page(page);
2119 } 2119 }
2120 return 0; 2120 return 0;
2121 } 2121 }
2122 2122
2123 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2123 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2124 get_extent_t *get_extent) 2124 get_extent_t *get_extent)
2125 { 2125 {
2126 struct bio *bio = NULL; 2126 struct bio *bio = NULL;
2127 unsigned long bio_flags = 0; 2127 unsigned long bio_flags = 0;
2128 int ret; 2128 int ret;
2129 2129
2130 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2130 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2131 &bio_flags); 2131 &bio_flags);
2132 if (bio) 2132 if (bio)
2133 ret = submit_one_bio(READ, bio, 0, bio_flags); 2133 ret = submit_one_bio(READ, bio, 0, bio_flags);
2134 return ret; 2134 return ret;
2135 } 2135 }
2136 2136
2137 static noinline void update_nr_written(struct page *page, 2137 static noinline void update_nr_written(struct page *page,
2138 struct writeback_control *wbc, 2138 struct writeback_control *wbc,
2139 unsigned long nr_written) 2139 unsigned long nr_written)
2140 { 2140 {
2141 wbc->nr_to_write -= nr_written; 2141 wbc->nr_to_write -= nr_written;
2142 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2142 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2143 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2143 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2144 page->mapping->writeback_index = page->index + nr_written; 2144 page->mapping->writeback_index = page->index + nr_written;
2145 } 2145 }
2146 2146
2147 /* 2147 /*
2148 * the writepage semantics are similar to regular writepage. extent 2148 * the writepage semantics are similar to regular writepage. extent
2149 * records are inserted to lock ranges in the tree, and as dirty areas 2149 * records are inserted to lock ranges in the tree, and as dirty areas
2150 * are found, they are marked writeback. Then the lock bits are removed 2150 * are found, they are marked writeback. Then the lock bits are removed
2151 * and the end_io handler clears the writeback ranges 2151 * and the end_io handler clears the writeback ranges
2152 */ 2152 */
2153 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2153 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2154 void *data) 2154 void *data)
2155 { 2155 {
2156 struct inode *inode = page->mapping->host; 2156 struct inode *inode = page->mapping->host;
2157 struct extent_page_data *epd = data; 2157 struct extent_page_data *epd = data;
2158 struct extent_io_tree *tree = epd->tree; 2158 struct extent_io_tree *tree = epd->tree;
2159 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2159 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2160 u64 delalloc_start; 2160 u64 delalloc_start;
2161 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2161 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2162 u64 end; 2162 u64 end;
2163 u64 cur = start; 2163 u64 cur = start;
2164 u64 extent_offset; 2164 u64 extent_offset;
2165 u64 last_byte = i_size_read(inode); 2165 u64 last_byte = i_size_read(inode);
2166 u64 block_start; 2166 u64 block_start;
2167 u64 iosize; 2167 u64 iosize;
2168 sector_t sector; 2168 sector_t sector;
2169 struct extent_state *cached_state = NULL; 2169 struct extent_state *cached_state = NULL;
2170 struct extent_map *em; 2170 struct extent_map *em;
2171 struct block_device *bdev; 2171 struct block_device *bdev;
2172 int ret; 2172 int ret;
2173 int nr = 0; 2173 int nr = 0;
2174 size_t pg_offset = 0; 2174 size_t pg_offset = 0;
2175 size_t blocksize; 2175 size_t blocksize;
2176 loff_t i_size = i_size_read(inode); 2176 loff_t i_size = i_size_read(inode);
2177 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2177 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2178 u64 nr_delalloc; 2178 u64 nr_delalloc;
2179 u64 delalloc_end; 2179 u64 delalloc_end;
2180 int page_started; 2180 int page_started;
2181 int compressed; 2181 int compressed;
2182 int write_flags; 2182 int write_flags;
2183 unsigned long nr_written = 0; 2183 unsigned long nr_written = 0;
2184 2184
2185 if (wbc->sync_mode == WB_SYNC_ALL) 2185 if (wbc->sync_mode == WB_SYNC_ALL)
2186 write_flags = WRITE_SYNC; 2186 write_flags = WRITE_SYNC;
2187 else 2187 else
2188 write_flags = WRITE; 2188 write_flags = WRITE;
2189 2189
2190 trace___extent_writepage(page, inode, wbc); 2190 trace___extent_writepage(page, inode, wbc);
2191 2191
2192 WARN_ON(!PageLocked(page)); 2192 WARN_ON(!PageLocked(page));
2193 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2193 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2194 if (page->index > end_index || 2194 if (page->index > end_index ||
2195 (page->index == end_index && !pg_offset)) { 2195 (page->index == end_index && !pg_offset)) {
2196 page->mapping->a_ops->invalidatepage(page, 0); 2196 page->mapping->a_ops->invalidatepage(page, 0);
2197 unlock_page(page); 2197 unlock_page(page);
2198 return 0; 2198 return 0;
2199 } 2199 }
2200 2200
2201 if (page->index == end_index) { 2201 if (page->index == end_index) {
2202 char *userpage; 2202 char *userpage;
2203 2203
2204 userpage = kmap_atomic(page, KM_USER0); 2204 userpage = kmap_atomic(page, KM_USER0);
2205 memset(userpage + pg_offset, 0, 2205 memset(userpage + pg_offset, 0,
2206 PAGE_CACHE_SIZE - pg_offset); 2206 PAGE_CACHE_SIZE - pg_offset);
2207 kunmap_atomic(userpage, KM_USER0); 2207 kunmap_atomic(userpage, KM_USER0);
2208 flush_dcache_page(page); 2208 flush_dcache_page(page);
2209 } 2209 }
2210 pg_offset = 0; 2210 pg_offset = 0;
2211 2211
2212 set_page_extent_mapped(page); 2212 set_page_extent_mapped(page);
2213 2213
2214 delalloc_start = start; 2214 delalloc_start = start;
2215 delalloc_end = 0; 2215 delalloc_end = 0;
2216 page_started = 0; 2216 page_started = 0;
2217 if (!epd->extent_locked) { 2217 if (!epd->extent_locked) {
2218 u64 delalloc_to_write = 0; 2218 u64 delalloc_to_write = 0;
2219 /* 2219 /*
2220 * make sure the wbc mapping index is at least updated 2220 * make sure the wbc mapping index is at least updated
2221 * to this page. 2221 * to this page.
2222 */ 2222 */
2223 update_nr_written(page, wbc, 0); 2223 update_nr_written(page, wbc, 0);
2224 2224
2225 while (delalloc_end < page_end) { 2225 while (delalloc_end < page_end) {
2226 nr_delalloc = find_lock_delalloc_range(inode, tree, 2226 nr_delalloc = find_lock_delalloc_range(inode, tree,
2227 page, 2227 page,
2228 &delalloc_start, 2228 &delalloc_start,
2229 &delalloc_end, 2229 &delalloc_end,
2230 128 * 1024 * 1024); 2230 128 * 1024 * 1024);
2231 if (nr_delalloc == 0) { 2231 if (nr_delalloc == 0) {
2232 delalloc_start = delalloc_end + 1; 2232 delalloc_start = delalloc_end + 1;
2233 continue; 2233 continue;
2234 } 2234 }
2235 tree->ops->fill_delalloc(inode, page, delalloc_start, 2235 tree->ops->fill_delalloc(inode, page, delalloc_start,
2236 delalloc_end, &page_started, 2236 delalloc_end, &page_started,
2237 &nr_written); 2237 &nr_written);
2238 /* 2238 /*
2239 * delalloc_end is already one less than the total 2239 * delalloc_end is already one less than the total
2240 * length, so we don't subtract one from 2240 * length, so we don't subtract one from
2241 * PAGE_CACHE_SIZE 2241 * PAGE_CACHE_SIZE
2242 */ 2242 */
2243 delalloc_to_write += (delalloc_end - delalloc_start + 2243 delalloc_to_write += (delalloc_end - delalloc_start +
2244 PAGE_CACHE_SIZE) >> 2244 PAGE_CACHE_SIZE) >>
2245 PAGE_CACHE_SHIFT; 2245 PAGE_CACHE_SHIFT;
2246 delalloc_start = delalloc_end + 1; 2246 delalloc_start = delalloc_end + 1;
2247 } 2247 }
2248 if (wbc->nr_to_write < delalloc_to_write) { 2248 if (wbc->nr_to_write < delalloc_to_write) {
2249 int thresh = 8192; 2249 int thresh = 8192;
2250 2250
2251 if (delalloc_to_write < thresh * 2) 2251 if (delalloc_to_write < thresh * 2)
2252 thresh = delalloc_to_write; 2252 thresh = delalloc_to_write;
2253 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2253 wbc->nr_to_write = min_t(u64, delalloc_to_write,
2254 thresh); 2254 thresh);
2255 } 2255 }
2256 2256
2257 /* did the fill delalloc function already unlock and start 2257 /* did the fill delalloc function already unlock and start
2258 * the IO? 2258 * the IO?
2259 */ 2259 */
2260 if (page_started) { 2260 if (page_started) {
2261 ret = 0; 2261 ret = 0;
2262 /* 2262 /*
2263 * we've unlocked the page, so we can't update 2263 * we've unlocked the page, so we can't update
2264 * the mapping's writeback index, just update 2264 * the mapping's writeback index, just update
2265 * nr_to_write. 2265 * nr_to_write.
2266 */ 2266 */
2267 wbc->nr_to_write -= nr_written; 2267 wbc->nr_to_write -= nr_written;
2268 goto done_unlocked; 2268 goto done_unlocked;
2269 } 2269 }
2270 } 2270 }
2271 if (tree->ops && tree->ops->writepage_start_hook) { 2271 if (tree->ops && tree->ops->writepage_start_hook) {
2272 ret = tree->ops->writepage_start_hook(page, start, 2272 ret = tree->ops->writepage_start_hook(page, start,
2273 page_end); 2273 page_end);
2274 if (ret == -EAGAIN) { 2274 if (ret == -EAGAIN) {
2275 redirty_page_for_writepage(wbc, page); 2275 redirty_page_for_writepage(wbc, page);
2276 update_nr_written(page, wbc, nr_written); 2276 update_nr_written(page, wbc, nr_written);
2277 unlock_page(page); 2277 unlock_page(page);
2278 ret = 0; 2278 ret = 0;
2279 goto done_unlocked; 2279 goto done_unlocked;
2280 } 2280 }
2281 } 2281 }
2282 2282
2283 /* 2283 /*
2284 * we don't want to touch the inode after unlocking the page, 2284 * we don't want to touch the inode after unlocking the page,
2285 * so we update the mapping writeback index now 2285 * so we update the mapping writeback index now
2286 */ 2286 */
2287 update_nr_written(page, wbc, nr_written + 1); 2287 update_nr_written(page, wbc, nr_written + 1);
2288 2288
2289 end = page_end; 2289 end = page_end;
2290 if (last_byte <= start) { 2290 if (last_byte <= start) {
2291 if (tree->ops && tree->ops->writepage_end_io_hook) 2291 if (tree->ops && tree->ops->writepage_end_io_hook)
2292 tree->ops->writepage_end_io_hook(page, start, 2292 tree->ops->writepage_end_io_hook(page, start,
2293 page_end, NULL, 1); 2293 page_end, NULL, 1);
2294 goto done; 2294 goto done;
2295 } 2295 }
2296 2296
2297 blocksize = inode->i_sb->s_blocksize; 2297 blocksize = inode->i_sb->s_blocksize;
2298 2298
2299 while (cur <= end) { 2299 while (cur <= end) {
2300 if (cur >= last_byte) { 2300 if (cur >= last_byte) {
2301 if (tree->ops && tree->ops->writepage_end_io_hook) 2301 if (tree->ops && tree->ops->writepage_end_io_hook)
2302 tree->ops->writepage_end_io_hook(page, cur, 2302 tree->ops->writepage_end_io_hook(page, cur,
2303 page_end, NULL, 1); 2303 page_end, NULL, 1);
2304 break; 2304 break;
2305 } 2305 }
2306 em = epd->get_extent(inode, page, pg_offset, cur, 2306 em = epd->get_extent(inode, page, pg_offset, cur,
2307 end - cur + 1, 1); 2307 end - cur + 1, 1);
2308 if (IS_ERR_OR_NULL(em)) { 2308 if (IS_ERR_OR_NULL(em)) {
2309 SetPageError(page); 2309 SetPageError(page);
2310 break; 2310 break;
2311 } 2311 }
2312 2312
2313 extent_offset = cur - em->start; 2313 extent_offset = cur - em->start;
2314 BUG_ON(extent_map_end(em) <= cur); 2314 BUG_ON(extent_map_end(em) <= cur);
2315 BUG_ON(end < cur); 2315 BUG_ON(end < cur);
2316 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2316 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2317 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2317 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2318 sector = (em->block_start + extent_offset) >> 9; 2318 sector = (em->block_start + extent_offset) >> 9;
2319 bdev = em->bdev; 2319 bdev = em->bdev;
2320 block_start = em->block_start; 2320 block_start = em->block_start;
2321 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2321 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2322 free_extent_map(em); 2322 free_extent_map(em);
2323 em = NULL; 2323 em = NULL;
2324 2324
2325 /* 2325 /*
2326 * compressed and inline extents are written through other 2326 * compressed and inline extents are written through other
2327 * paths in the FS 2327 * paths in the FS
2328 */ 2328 */
2329 if (compressed || block_start == EXTENT_MAP_HOLE || 2329 if (compressed || block_start == EXTENT_MAP_HOLE ||
2330 block_start == EXTENT_MAP_INLINE) { 2330 block_start == EXTENT_MAP_INLINE) {
2331 /* 2331 /*
2332 * end_io notification does not happen here for 2332 * end_io notification does not happen here for
2333 * compressed extents 2333 * compressed extents
2334 */ 2334 */
2335 if (!compressed && tree->ops && 2335 if (!compressed && tree->ops &&
2336 tree->ops->writepage_end_io_hook) 2336 tree->ops->writepage_end_io_hook)
2337 tree->ops->writepage_end_io_hook(page, cur, 2337 tree->ops->writepage_end_io_hook(page, cur,
2338 cur + iosize - 1, 2338 cur + iosize - 1,
2339 NULL, 1); 2339 NULL, 1);
2340 else if (compressed) { 2340 else if (compressed) {
2341 /* we don't want to end_page_writeback on 2341 /* we don't want to end_page_writeback on
2342 * a compressed extent. this happens 2342 * a compressed extent. this happens
2343 * elsewhere 2343 * elsewhere
2344 */ 2344 */
2345 nr++; 2345 nr++;
2346 } 2346 }
2347 2347
2348 cur += iosize; 2348 cur += iosize;
2349 pg_offset += iosize; 2349 pg_offset += iosize;
2350 continue; 2350 continue;
2351 } 2351 }
2352 /* leave this out until we have a page_mkwrite call */ 2352 /* leave this out until we have a page_mkwrite call */
2353 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2353 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2354 EXTENT_DIRTY, 0, NULL)) { 2354 EXTENT_DIRTY, 0, NULL)) {
2355 cur = cur + iosize; 2355 cur = cur + iosize;
2356 pg_offset += iosize; 2356 pg_offset += iosize;
2357 continue; 2357 continue;
2358 } 2358 }
2359 2359
2360 if (tree->ops && tree->ops->writepage_io_hook) { 2360 if (tree->ops && tree->ops->writepage_io_hook) {
2361 ret = tree->ops->writepage_io_hook(page, cur, 2361 ret = tree->ops->writepage_io_hook(page, cur,
2362 cur + iosize - 1); 2362 cur + iosize - 1);
2363 } else { 2363 } else {
2364 ret = 0; 2364 ret = 0;
2365 } 2365 }
2366 if (ret) { 2366 if (ret) {
2367 SetPageError(page); 2367 SetPageError(page);
2368 } else { 2368 } else {
2369 unsigned long max_nr = end_index + 1; 2369 unsigned long max_nr = end_index + 1;
2370 2370
2371 set_range_writeback(tree, cur, cur + iosize - 1); 2371 set_range_writeback(tree, cur, cur + iosize - 1);
2372 if (!PageWriteback(page)) { 2372 if (!PageWriteback(page)) {
2373 printk(KERN_ERR "btrfs warning page %lu not " 2373 printk(KERN_ERR "btrfs warning page %lu not "
2374 "writeback, cur %llu end %llu\n", 2374 "writeback, cur %llu end %llu\n",
2375 page->index, (unsigned long long)cur, 2375 page->index, (unsigned long long)cur,
2376 (unsigned long long)end); 2376 (unsigned long long)end);
2377 } 2377 }
2378 2378
2379 ret = submit_extent_page(write_flags, tree, page, 2379 ret = submit_extent_page(write_flags, tree, page,
2380 sector, iosize, pg_offset, 2380 sector, iosize, pg_offset,
2381 bdev, &epd->bio, max_nr, 2381 bdev, &epd->bio, max_nr,
2382 end_bio_extent_writepage, 2382 end_bio_extent_writepage,
2383 0, 0, 0); 2383 0, 0, 0);
2384 if (ret) 2384 if (ret)
2385 SetPageError(page); 2385 SetPageError(page);
2386 } 2386 }
2387 cur = cur + iosize; 2387 cur = cur + iosize;
2388 pg_offset += iosize; 2388 pg_offset += iosize;
2389 nr++; 2389 nr++;
2390 } 2390 }
2391 done: 2391 done:
2392 if (nr == 0) { 2392 if (nr == 0) {
2393 /* make sure the mapping tag for page dirty gets cleared */ 2393 /* make sure the mapping tag for page dirty gets cleared */
2394 set_page_writeback(page); 2394 set_page_writeback(page);
2395 end_page_writeback(page); 2395 end_page_writeback(page);
2396 } 2396 }
2397 unlock_page(page); 2397 unlock_page(page);
2398 2398
2399 done_unlocked: 2399 done_unlocked:
2400 2400
2401 /* drop our reference on any cached states */ 2401 /* drop our reference on any cached states */
2402 free_extent_state(cached_state); 2402 free_extent_state(cached_state);
2403 return 0; 2403 return 0;
2404 } 2404 }
2405 2405
2406 /** 2406 /**
2407 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 2407 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2408 * @mapping: address space structure to write 2408 * @mapping: address space structure to write
2409 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2409 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2410 * @writepage: function called for each page 2410 * @writepage: function called for each page
2411 * @data: data passed to writepage function 2411 * @data: data passed to writepage function
2412 * 2412 *
2413 * If a page is already under I/O, write_cache_pages() skips it, even 2413 * If a page is already under I/O, write_cache_pages() skips it, even
2414 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2414 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2415 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2415 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2416 * and msync() need to guarantee that all the data which was dirty at the time 2416 * and msync() need to guarantee that all the data which was dirty at the time
2417 * the call was made get new I/O started against them. If wbc->sync_mode is 2417 * the call was made get new I/O started against them. If wbc->sync_mode is
2418 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2418 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2419 * existing IO to complete. 2419 * existing IO to complete.
2420 */ 2420 */
2421 static int extent_write_cache_pages(struct extent_io_tree *tree, 2421 static int extent_write_cache_pages(struct extent_io_tree *tree,
2422 struct address_space *mapping, 2422 struct address_space *mapping,
2423 struct writeback_control *wbc, 2423 struct writeback_control *wbc,
2424 writepage_t writepage, void *data, 2424 writepage_t writepage, void *data,
2425 void (*flush_fn)(void *)) 2425 void (*flush_fn)(void *))
2426 { 2426 {
2427 int ret = 0; 2427 int ret = 0;
2428 int done = 0; 2428 int done = 0;
2429 int nr_to_write_done = 0; 2429 int nr_to_write_done = 0;
2430 struct pagevec pvec; 2430 struct pagevec pvec;
2431 int nr_pages; 2431 int nr_pages;
2432 pgoff_t index; 2432 pgoff_t index;
2433 pgoff_t end; /* Inclusive */ 2433 pgoff_t end; /* Inclusive */
2434 int scanned = 0; 2434 int scanned = 0;
2435 2435
2436 pagevec_init(&pvec, 0); 2436 pagevec_init(&pvec, 0);
2437 if (wbc->range_cyclic) { 2437 if (wbc->range_cyclic) {
2438 index = mapping->writeback_index; /* Start from prev offset */ 2438 index = mapping->writeback_index; /* Start from prev offset */
2439 end = -1; 2439 end = -1;
2440 } else { 2440 } else {
2441 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2441 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2442 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2442 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2443 scanned = 1; 2443 scanned = 1;
2444 } 2444 }
2445 retry: 2445 retry:
2446 while (!done && !nr_to_write_done && (index <= end) && 2446 while (!done && !nr_to_write_done && (index <= end) &&
2447 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2447 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2448 PAGECACHE_TAG_DIRTY, min(end - index, 2448 PAGECACHE_TAG_DIRTY, min(end - index,
2449 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2449 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2450 unsigned i; 2450 unsigned i;
2451 2451
2452 scanned = 1; 2452 scanned = 1;
2453 for (i = 0; i < nr_pages; i++) { 2453 for (i = 0; i < nr_pages; i++) {
2454 struct page *page = pvec.pages[i]; 2454 struct page *page = pvec.pages[i];
2455 2455
2456 /* 2456 /*
2457 * At this point we hold neither mapping->tree_lock nor 2457 * At this point we hold neither mapping->tree_lock nor
2458 * lock on the page itself: the page may be truncated or 2458 * lock on the page itself: the page may be truncated or
2459 * invalidated (changing page->mapping to NULL), or even 2459 * invalidated (changing page->mapping to NULL), or even
2460 * swizzled back from swapper_space to tmpfs file 2460 * swizzled back from swapper_space to tmpfs file
2461 * mapping 2461 * mapping
2462 */ 2462 */
2463 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2463 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2464 tree->ops->write_cache_pages_lock_hook(page); 2464 tree->ops->write_cache_pages_lock_hook(page);
2465 else 2465 else
2466 lock_page(page); 2466 lock_page(page);
2467 2467
2468 if (unlikely(page->mapping != mapping)) { 2468 if (unlikely(page->mapping != mapping)) {
2469 unlock_page(page); 2469 unlock_page(page);
2470 continue; 2470 continue;
2471 } 2471 }
2472 2472
2473 if (!wbc->range_cyclic && page->index > end) { 2473 if (!wbc->range_cyclic && page->index > end) {
2474 done = 1; 2474 done = 1;
2475 unlock_page(page); 2475 unlock_page(page);
2476 continue; 2476 continue;
2477 } 2477 }
2478 2478
2479 if (wbc->sync_mode != WB_SYNC_NONE) { 2479 if (wbc->sync_mode != WB_SYNC_NONE) {
2480 if (PageWriteback(page)) 2480 if (PageWriteback(page))
2481 flush_fn(data); 2481 flush_fn(data);
2482 wait_on_page_writeback(page); 2482 wait_on_page_writeback(page);
2483 } 2483 }
2484 2484
2485 if (PageWriteback(page) || 2485 if (PageWriteback(page) ||
2486 !clear_page_dirty_for_io(page)) { 2486 !clear_page_dirty_for_io(page)) {
2487 unlock_page(page); 2487 unlock_page(page);
2488 continue; 2488 continue;
2489 } 2489 }
2490 2490
2491 ret = (*writepage)(page, wbc, data); 2491 ret = (*writepage)(page, wbc, data);
2492 2492
2493 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2493 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2494 unlock_page(page); 2494 unlock_page(page);
2495 ret = 0; 2495 ret = 0;
2496 } 2496 }
2497 if (ret) 2497 if (ret)
2498 done = 1; 2498 done = 1;
2499 2499
2500 /* 2500 /*
2501 * the filesystem may choose to bump up nr_to_write. 2501 * the filesystem may choose to bump up nr_to_write.
2502 * We have to make sure to honor the new nr_to_write 2502 * We have to make sure to honor the new nr_to_write
2503 * at any time 2503 * at any time
2504 */ 2504 */
2505 nr_to_write_done = wbc->nr_to_write <= 0; 2505 nr_to_write_done = wbc->nr_to_write <= 0;
2506 } 2506 }
2507 pagevec_release(&pvec); 2507 pagevec_release(&pvec);
2508 cond_resched(); 2508 cond_resched();
2509 } 2509 }
2510 if (!scanned && !done) { 2510 if (!scanned && !done) {
2511 /* 2511 /*
2512 * We hit the last page and there is more work to be done: wrap 2512 * We hit the last page and there is more work to be done: wrap
2513 * back to the start of the file 2513 * back to the start of the file
2514 */ 2514 */
2515 scanned = 1; 2515 scanned = 1;
2516 index = 0; 2516 index = 0;
2517 goto retry; 2517 goto retry;
2518 } 2518 }
2519 return ret; 2519 return ret;
2520 } 2520 }
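
The batch size handed to pagevec_lookup_tag() above is the only subtle arithmetic in this loop: it requests at most a full pagevec but clamps the request so nothing past the inclusive 'end' index is pulled in, and the retry label restarts the walk from index 0 when a range_cyclic pass ran off the end of the file with work still pending. A minimal userspace sketch of just that batch-size expression, assuming PAGEVEC_SIZE is 14 as in kernels of this era:

#include <stdio.h>

/* Hypothetical stand-ins: pgoff_t and PAGEVEC_SIZE (14 in kernels of
 * this era); used here only to show the batch-size arithmetic. */
typedef unsigned long pgoff_t;
#define PAGEVEC_SIZE 14

static pgoff_t batch_request(pgoff_t index, pgoff_t end)
{
	/* Same expression as the pagevec_lookup_tag() call above: ask for
	 * a full pagevec, but never for pages past the inclusive 'end'. */
	pgoff_t cap = (end - index) < (pgoff_t)(PAGEVEC_SIZE - 1) ?
			(end - index) : (pgoff_t)(PAGEVEC_SIZE - 1);
	return cap + 1;
}

int main(void)
{
	printf("%lu\n", batch_request(100, 1000));	/* 14: far from the end */
	printf("%lu\n", batch_request(998, 1000));	/* 3: only pages 998..1000 remain */
	return 0;
}
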
2521 2521
2522 static void flush_epd_write_bio(struct extent_page_data *epd) 2522 static void flush_epd_write_bio(struct extent_page_data *epd)
2523 { 2523 {
2524 if (epd->bio) { 2524 if (epd->bio) {
2525 if (epd->sync_io) 2525 if (epd->sync_io)
2526 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2526 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2527 else 2527 else
2528 submit_one_bio(WRITE, epd->bio, 0, 0); 2528 submit_one_bio(WRITE, epd->bio, 0, 0);
2529 epd->bio = NULL; 2529 epd->bio = NULL;
2530 } 2530 }
2531 } 2531 }
2532 2532
2533 static noinline void flush_write_bio(void *data) 2533 static noinline void flush_write_bio(void *data)
2534 { 2534 {
2535 struct extent_page_data *epd = data; 2535 struct extent_page_data *epd = data;
2536 flush_epd_write_bio(epd); 2536 flush_epd_write_bio(epd);
2537 } 2537 }
2538 2538
2539 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2539 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2540 get_extent_t *get_extent, 2540 get_extent_t *get_extent,
2541 struct writeback_control *wbc) 2541 struct writeback_control *wbc)
2542 { 2542 {
2543 int ret; 2543 int ret;
2544 struct address_space *mapping = page->mapping; 2544 struct address_space *mapping = page->mapping;
2545 struct extent_page_data epd = { 2545 struct extent_page_data epd = {
2546 .bio = NULL, 2546 .bio = NULL,
2547 .tree = tree, 2547 .tree = tree,
2548 .get_extent = get_extent, 2548 .get_extent = get_extent,
2549 .extent_locked = 0, 2549 .extent_locked = 0,
2550 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2550 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2551 }; 2551 };
2552 struct writeback_control wbc_writepages = { 2552 struct writeback_control wbc_writepages = {
2553 .sync_mode = wbc->sync_mode, 2553 .sync_mode = wbc->sync_mode,
2554 .older_than_this = NULL,
2555 .nr_to_write = 64, 2554 .nr_to_write = 64,
2556 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2555 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2557 .range_end = (loff_t)-1, 2556 .range_end = (loff_t)-1,
2558 }; 2557 };
2559 2558
2560 ret = __extent_writepage(page, wbc, &epd); 2559 ret = __extent_writepage(page, wbc, &epd);
2561 2560
2562 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2561 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2563 __extent_writepage, &epd, flush_write_bio); 2562 __extent_writepage, &epd, flush_write_bio);
2564 flush_epd_write_bio(&epd); 2563 flush_epd_write_bio(&epd);
2565 return ret; 2564 return ret;
2566 } 2565 }
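
After writing the requested page itself, extent_write_full_page() reuses the same extent_page_data to opportunistically push up to 64 more dirty pages that follow it, which is why wbc_writepages.range_start is set one page past the page just written. A small sketch of that offset arithmetic, with a 4 KiB PAGE_CACHE_SIZE assumed purely for illustration:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* 4 KiB pages, assumed for illustration */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long index = 25;		/* page->index of the page being written */
	unsigned long page_off = index << PAGE_CACHE_SHIFT;	/* page_offset(page) */

	/* The follow-on pass starts one page later, so __extent_writepage()
	 * is not asked to redo the page that was just written. */
	unsigned long range_start = page_off + PAGE_CACHE_SIZE;

	printf("page offset %lu, range_start %lu\n", page_off, range_start);
	/* page offset 102400, range_start 106496 */
	return 0;
}
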
2567 2566
2568 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 2567 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2569 u64 start, u64 end, get_extent_t *get_extent, 2568 u64 start, u64 end, get_extent_t *get_extent,
2570 int mode) 2569 int mode)
2571 { 2570 {
2572 int ret = 0; 2571 int ret = 0;
2573 struct address_space *mapping = inode->i_mapping; 2572 struct address_space *mapping = inode->i_mapping;
2574 struct page *page; 2573 struct page *page;
2575 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 2574 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2576 PAGE_CACHE_SHIFT; 2575 PAGE_CACHE_SHIFT;
2577 2576
2578 struct extent_page_data epd = { 2577 struct extent_page_data epd = {
2579 .bio = NULL, 2578 .bio = NULL,
2580 .tree = tree, 2579 .tree = tree,
2581 .get_extent = get_extent, 2580 .get_extent = get_extent,
2582 .extent_locked = 1, 2581 .extent_locked = 1,
2583 .sync_io = mode == WB_SYNC_ALL, 2582 .sync_io = mode == WB_SYNC_ALL,
2584 }; 2583 };
2585 struct writeback_control wbc_writepages = { 2584 struct writeback_control wbc_writepages = {
2586 .sync_mode = mode, 2585 .sync_mode = mode,
2587 .older_than_this = NULL,
2588 .nr_to_write = nr_pages * 2, 2586 .nr_to_write = nr_pages * 2,
2589 .range_start = start, 2587 .range_start = start,
2590 .range_end = end + 1, 2588 .range_end = end + 1,
2591 }; 2589 };
2592 2590
2593 while (start <= end) { 2591 while (start <= end) {
2594 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 2592 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2595 if (clear_page_dirty_for_io(page)) 2593 if (clear_page_dirty_for_io(page))
2596 ret = __extent_writepage(page, &wbc_writepages, &epd); 2594 ret = __extent_writepage(page, &wbc_writepages, &epd);
2597 else { 2595 else {
2598 if (tree->ops && tree->ops->writepage_end_io_hook) 2596 if (tree->ops && tree->ops->writepage_end_io_hook)
2599 tree->ops->writepage_end_io_hook(page, start, 2597 tree->ops->writepage_end_io_hook(page, start,
2600 start + PAGE_CACHE_SIZE - 1, 2598 start + PAGE_CACHE_SIZE - 1,
2601 NULL, 1); 2599 NULL, 1);
2602 unlock_page(page); 2600 unlock_page(page);
2603 } 2601 }
2604 page_cache_release(page); 2602 page_cache_release(page);
2605 start += PAGE_CACHE_SIZE; 2603 start += PAGE_CACHE_SIZE;
2606 } 2604 }
2607 2605
2608 flush_epd_write_bio(&epd); 2606 flush_epd_write_bio(&epd);
2609 return ret; 2607 return ret;
2610 } 2608 }
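
Because 'end' is inclusive here, the nr_pages computation above is a round-up of the byte length (end - start + 1) to whole pages, and nr_to_write is then given twice that much headroom. A standalone sketch of the arithmetic, again assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* 4 KiB pages, assumed for illustration */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

/* 'end' is inclusive, so adding PAGE_CACHE_SIZE before the shift rounds
 * the byte count (end - start + 1) up to whole pages. */
static unsigned long locked_range_pages(unsigned long start, unsigned long end)
{
	return (end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT;
}

int main(void)
{
	printf("%lu\n", locked_range_pages(0, 4095));	/* 1 */
	printf("%lu\n", locked_range_pages(0, 8191));	/* 2 */
	printf("%lu\n", locked_range_pages(0, 8192));	/* 3: one byte spills into a third page */
	return 0;
}
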
2611 2609
2612 int extent_writepages(struct extent_io_tree *tree, 2610 int extent_writepages(struct extent_io_tree *tree,
2613 struct address_space *mapping, 2611 struct address_space *mapping,
2614 get_extent_t *get_extent, 2612 get_extent_t *get_extent,
2615 struct writeback_control *wbc) 2613 struct writeback_control *wbc)
2616 { 2614 {
2617 int ret = 0; 2615 int ret = 0;
2618 struct extent_page_data epd = { 2616 struct extent_page_data epd = {
2619 .bio = NULL, 2617 .bio = NULL,
2620 .tree = tree, 2618 .tree = tree,
2621 .get_extent = get_extent, 2619 .get_extent = get_extent,
2622 .extent_locked = 0, 2620 .extent_locked = 0,
2623 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2621 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2624 }; 2622 };
2625 2623
2626 ret = extent_write_cache_pages(tree, mapping, wbc, 2624 ret = extent_write_cache_pages(tree, mapping, wbc,
2627 __extent_writepage, &epd, 2625 __extent_writepage, &epd,
2628 flush_write_bio); 2626 flush_write_bio);
2629 flush_epd_write_bio(&epd); 2627 flush_epd_write_bio(&epd);
2630 return ret; 2628 return ret;
2631 } 2629 }
2632 2630
2633 int extent_readpages(struct extent_io_tree *tree, 2631 int extent_readpages(struct extent_io_tree *tree,
2634 struct address_space *mapping, 2632 struct address_space *mapping,
2635 struct list_head *pages, unsigned nr_pages, 2633 struct list_head *pages, unsigned nr_pages,
2636 get_extent_t get_extent) 2634 get_extent_t get_extent)
2637 { 2635 {
2638 struct bio *bio = NULL; 2636 struct bio *bio = NULL;
2639 unsigned page_idx; 2637 unsigned page_idx;
2640 unsigned long bio_flags = 0; 2638 unsigned long bio_flags = 0;
2641 2639
2642 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2640 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2643 struct page *page = list_entry(pages->prev, struct page, lru); 2641 struct page *page = list_entry(pages->prev, struct page, lru);
2644 2642
2645 prefetchw(&page->flags); 2643 prefetchw(&page->flags);
2646 list_del(&page->lru); 2644 list_del(&page->lru);
2647 if (!add_to_page_cache_lru(page, mapping, 2645 if (!add_to_page_cache_lru(page, mapping,
2648 page->index, GFP_NOFS)) { 2646 page->index, GFP_NOFS)) {
2649 __extent_read_full_page(tree, page, get_extent, 2647 __extent_read_full_page(tree, page, get_extent,
2650 &bio, 0, &bio_flags); 2648 &bio, 0, &bio_flags);
2651 } 2649 }
2652 page_cache_release(page); 2650 page_cache_release(page);
2653 } 2651 }
2654 BUG_ON(!list_empty(pages)); 2652 BUG_ON(!list_empty(pages));
2655 if (bio) 2653 if (bio)
2656 submit_one_bio(READ, bio, 0, bio_flags); 2654 submit_one_bio(READ, bio, 0, bio_flags);
2657 return 0; 2655 return 0;
2658 } 2656 }
2659 2657
2660 /* 2658 /*
2661 * basic invalidatepage code, this waits on any locked or writeback 2659 * basic invalidatepage code, this waits on any locked or writeback
2662 * ranges corresponding to the page, and then deletes any extent state 2660 * ranges corresponding to the page, and then deletes any extent state
2663 * records from the tree 2661 * records from the tree
2664 */ 2662 */
2665 int extent_invalidatepage(struct extent_io_tree *tree, 2663 int extent_invalidatepage(struct extent_io_tree *tree,
2666 struct page *page, unsigned long offset) 2664 struct page *page, unsigned long offset)
2667 { 2665 {
2668 struct extent_state *cached_state = NULL; 2666 struct extent_state *cached_state = NULL;
2669 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2667 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2670 u64 end = start + PAGE_CACHE_SIZE - 1; 2668 u64 end = start + PAGE_CACHE_SIZE - 1;
2671 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2669 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2672 2670
2673 start += (offset + blocksize - 1) & ~(blocksize - 1); 2671 start += (offset + blocksize - 1) & ~(blocksize - 1);
2674 if (start > end) 2672 if (start > end)
2675 return 0; 2673 return 0;
2676 2674
2677 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 2675 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2678 wait_on_page_writeback(page); 2676 wait_on_page_writeback(page);
2679 clear_extent_bit(tree, start, end, 2677 clear_extent_bit(tree, start, end,
2680 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2678 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2681 EXTENT_DO_ACCOUNTING, 2679 EXTENT_DO_ACCOUNTING,
2682 1, 1, &cached_state, GFP_NOFS); 2680 1, 1, &cached_state, GFP_NOFS);
2683 return 0; 2681 return 0;
2684 } 2682 }
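
The round-up at the top of extent_invalidatepage() finds the first block boundary at or after the truncation offset, so extent state is only dropped for blocks that are invalidated in full; if that boundary lands past the end of the page there is nothing to clear and the function returns early. A self-contained sketch of the power-of-two round-up, with invented offsets and block sizes:

#include <stdio.h>

/* Round 'offset' up to the next multiple of 'blocksize' (a power of two),
 * exactly as done before deciding which blocks can be invalidated. */
static unsigned long round_up_pow2(unsigned long offset, unsigned long blocksize)
{
	return (offset + blocksize - 1) & ~(blocksize - 1);
}

int main(void)
{
	printf("%lu\n", round_up_pow2(0, 4096));	/* 0: the whole page is going away */
	printf("%lu\n", round_up_pow2(1, 4096));	/* 4096: start moves past end, nothing to clear */
	printf("%lu\n", round_up_pow2(1500, 512));	/* 1536: first block invalidated in full */
	return 0;
}
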
2685 2683
2686 /* 2684 /*
2687 * a helper for releasepage, this tests for areas of the page that 2685 * a helper for releasepage, this tests for areas of the page that
2688 * are locked or under IO and drops the related state bits if it is safe 2686 * are locked or under IO and drops the related state bits if it is safe
2689 * to drop the page. 2687 * to drop the page.
2690 */ 2688 */
2691 int try_release_extent_state(struct extent_map_tree *map, 2689 int try_release_extent_state(struct extent_map_tree *map,
2692 struct extent_io_tree *tree, struct page *page, 2690 struct extent_io_tree *tree, struct page *page,
2693 gfp_t mask) 2691 gfp_t mask)
2694 { 2692 {
2695 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2693 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2696 u64 end = start + PAGE_CACHE_SIZE - 1; 2694 u64 end = start + PAGE_CACHE_SIZE - 1;
2697 int ret = 1; 2695 int ret = 1;
2698 2696
2699 if (test_range_bit(tree, start, end, 2697 if (test_range_bit(tree, start, end,
2700 EXTENT_IOBITS, 0, NULL)) 2698 EXTENT_IOBITS, 0, NULL))
2701 ret = 0; 2699 ret = 0;
2702 else { 2700 else {
2703 if ((mask & GFP_NOFS) == GFP_NOFS) 2701 if ((mask & GFP_NOFS) == GFP_NOFS)
2704 mask = GFP_NOFS; 2702 mask = GFP_NOFS;
2705 /* 2703 /*
2706 * at this point we can safely clear everything except the 2704 * at this point we can safely clear everything except the
2707 * locked bit and the nodatasum bit 2705 * locked bit and the nodatasum bit
2708 */ 2706 */
2709 ret = clear_extent_bit(tree, start, end, 2707 ret = clear_extent_bit(tree, start, end,
2710 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2708 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2711 0, 0, NULL, mask); 2709 0, 0, NULL, mask);
2712 2710
2713 /* if clear_extent_bit failed for enomem reasons, 2711 /* if clear_extent_bit failed for enomem reasons,
2714 * we can't allow the release to continue. 2712 * we can't allow the release to continue.
2715 */ 2713 */
2716 if (ret < 0) 2714 if (ret < 0)
2717 ret = 0; 2715 ret = 0;
2718 else 2716 else
2719 ret = 1; 2717 ret = 1;
2720 } 2718 }
2721 return ret; 2719 return ret;
2722 } 2720 }
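
The clear_extent_bit() call above passes the complement of the bits that must survive, which reads backwards at first glance: everything recorded for the range is cleared except EXTENT_LOCKED and EXTENT_NODATASUM. A toy illustration of that mask trick (the bit values below are invented, not the real EXTENT_* definitions):

#include <stdio.h>

/* Invented bit values -- not the real EXTENT_* definitions. */
#define LOCKED    (1u << 0)
#define DIRTY     (1u << 1)
#define UPTODATE  (1u << 2)
#define NODATASUM (1u << 3)

int main(void)
{
	unsigned int state = DIRTY | UPTODATE | NODATASUM;

	/* Passing ~(LOCKED | NODATASUM) as the bits to clear means
	 * "drop everything except these two". */
	unsigned int clear_mask = ~(LOCKED | NODATASUM);
	unsigned int after = state & ~clear_mask;

	printf("before 0x%x after 0x%x\n", state, after);	/* before 0xe after 0x8 */
	return 0;
}
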
2723 2721
2724 /* 2722 /*
2725 * a helper for releasepage. As long as there are no locked extents 2723 * a helper for releasepage. As long as there are no locked extents
2726 * in the range corresponding to the page, both state records and extent 2724 * in the range corresponding to the page, both state records and extent
2727 * map records are removed 2725 * map records are removed
2728 */ 2726 */
2729 int try_release_extent_mapping(struct extent_map_tree *map, 2727 int try_release_extent_mapping(struct extent_map_tree *map,
2730 struct extent_io_tree *tree, struct page *page, 2728 struct extent_io_tree *tree, struct page *page,
2731 gfp_t mask) 2729 gfp_t mask)
2732 { 2730 {
2733 struct extent_map *em; 2731 struct extent_map *em;
2734 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2732 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2735 u64 end = start + PAGE_CACHE_SIZE - 1; 2733 u64 end = start + PAGE_CACHE_SIZE - 1;
2736 2734
2737 if ((mask & __GFP_WAIT) && 2735 if ((mask & __GFP_WAIT) &&
2738 page->mapping->host->i_size > 16 * 1024 * 1024) { 2736 page->mapping->host->i_size > 16 * 1024 * 1024) {
2739 u64 len; 2737 u64 len;
2740 while (start <= end) { 2738 while (start <= end) {
2741 len = end - start + 1; 2739 len = end - start + 1;
2742 write_lock(&map->lock); 2740 write_lock(&map->lock);
2743 em = lookup_extent_mapping(map, start, len); 2741 em = lookup_extent_mapping(map, start, len);
2744 if (IS_ERR_OR_NULL(em)) { 2742 if (IS_ERR_OR_NULL(em)) {
2745 write_unlock(&map->lock); 2743 write_unlock(&map->lock);
2746 break; 2744 break;
2747 } 2745 }
2748 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2746 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2749 em->start != start) { 2747 em->start != start) {
2750 write_unlock(&map->lock); 2748 write_unlock(&map->lock);
2751 free_extent_map(em); 2749 free_extent_map(em);
2752 break; 2750 break;
2753 } 2751 }
2754 if (!test_range_bit(tree, em->start, 2752 if (!test_range_bit(tree, em->start,
2755 extent_map_end(em) - 1, 2753 extent_map_end(em) - 1,
2756 EXTENT_LOCKED | EXTENT_WRITEBACK, 2754 EXTENT_LOCKED | EXTENT_WRITEBACK,
2757 0, NULL)) { 2755 0, NULL)) {
2758 remove_extent_mapping(map, em); 2756 remove_extent_mapping(map, em);
2759 /* once for the rb tree */ 2757 /* once for the rb tree */
2760 free_extent_map(em); 2758 free_extent_map(em);
2761 } 2759 }
2762 start = extent_map_end(em); 2760 start = extent_map_end(em);
2763 write_unlock(&map->lock); 2761 write_unlock(&map->lock);
2764 2762
2765 /* once for us */ 2763 /* once for us */
2766 free_extent_map(em); 2764 free_extent_map(em);
2767 } 2765 }
2768 } 2766 }
2769 return try_release_extent_state(map, tree, page, mask); 2767 return try_release_extent_state(map, tree, page, mask);
2770 } 2768 }
2771 2769
2772 /* 2770 /*
2773 * helper function for fiemap, which doesn't want to see any holes. 2771 * helper function for fiemap, which doesn't want to see any holes.
2774 * This maps until we find something past 'last' 2772 * This maps until we find something past 'last'
2775 */ 2773 */
2776 static struct extent_map *get_extent_skip_holes(struct inode *inode, 2774 static struct extent_map *get_extent_skip_holes(struct inode *inode,
2777 u64 offset, 2775 u64 offset,
2778 u64 last, 2776 u64 last,
2779 get_extent_t *get_extent) 2777 get_extent_t *get_extent)
2780 { 2778 {
2781 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 2779 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2782 struct extent_map *em; 2780 struct extent_map *em;
2783 u64 len; 2781 u64 len;
2784 2782
2785 if (offset >= last) 2783 if (offset >= last)
2786 return NULL; 2784 return NULL;
2787 2785
2788 while(1) { 2786 while(1) {
2789 len = last - offset; 2787 len = last - offset;
2790 if (len == 0) 2788 if (len == 0)
2791 break; 2789 break;
2792 len = (len + sectorsize - 1) & ~(sectorsize - 1); 2790 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2793 em = get_extent(inode, NULL, 0, offset, len, 0); 2791 em = get_extent(inode, NULL, 0, offset, len, 0);
2794 if (IS_ERR_OR_NULL(em)) 2792 if (IS_ERR_OR_NULL(em))
2795 return em; 2793 return em;
2796 2794
2797 /* if this isn't a hole return it */ 2795 /* if this isn't a hole return it */
2798 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 2796 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2799 em->block_start != EXTENT_MAP_HOLE) { 2797 em->block_start != EXTENT_MAP_HOLE) {
2800 return em; 2798 return em;
2801 } 2799 }
2802 2800
2803 /* this is a hole, advance to the next extent */ 2801 /* this is a hole, advance to the next extent */
2804 offset = extent_map_end(em); 2802 offset = extent_map_end(em);
2805 free_extent_map(em); 2803 free_extent_map(em);
2806 if (offset >= last) 2804 if (offset >= last)
2807 break; 2805 break;
2808 } 2806 }
2809 return NULL; 2807 return NULL;
2810 } 2808 }
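
get_extent_skip_holes() keeps asking for mappings and advancing past anything that is a hole until it either finds real data or walks off 'last'. The sketch below models that walk over a hand-built table; the struct and helper are invented stand-ins, not the btrfs extent-map API:

#include <stdio.h>
#include <stddef.h>

/* A toy stand-in for an extent map entry: [start, end) plus a hole flag.
 * This only models the skip-holes walk; it is not the btrfs API. */
struct toy_em {
	unsigned long long start, end;
	int is_hole;
};

/* Return the first non-hole entry covering 'offset', or NULL once the
 * walk reaches 'last' -- mirroring the loop in get_extent_skip_holes(). */
static const struct toy_em *skip_holes(const struct toy_em *map, int n,
				       unsigned long long offset,
				       unsigned long long last)
{
	int i;

	while (offset < last) {
		for (i = 0; i < n; i++)
			if (offset >= map[i].start && offset < map[i].end)
				break;
		if (i == n)
			return NULL;		/* nothing mapped here at all */
		if (!map[i].is_hole)
			return &map[i];		/* found real data */
		offset = map[i].end;		/* hole: advance past it and retry */
	}
	return NULL;
}

int main(void)
{
	static const struct toy_em map[] = {
		{ 0,     4096,  1 },		/* hole */
		{ 4096,  16384, 0 },		/* data */
		{ 16384, 32768, 1 },		/* hole */
	};
	const struct toy_em *em = skip_holes(map, 3, 0, 32768);

	if (em)
		printf("first data extent starts at %llu\n", em->start);	/* 4096 */
	return 0;
}
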
2811 2809
2812 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2810 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2813 __u64 start, __u64 len, get_extent_t *get_extent) 2811 __u64 start, __u64 len, get_extent_t *get_extent)
2814 { 2812 {
2815 int ret = 0; 2813 int ret = 0;
2816 u64 off = start; 2814 u64 off = start;
2817 u64 max = start + len; 2815 u64 max = start + len;
2818 u32 flags = 0; 2816 u32 flags = 0;
2819 u32 found_type; 2817 u32 found_type;
2820 u64 last; 2818 u64 last;
2821 u64 last_for_get_extent = 0; 2819 u64 last_for_get_extent = 0;
2822 u64 disko = 0; 2820 u64 disko = 0;
2823 u64 isize = i_size_read(inode); 2821 u64 isize = i_size_read(inode);
2824 struct btrfs_key found_key; 2822 struct btrfs_key found_key;
2825 struct extent_map *em = NULL; 2823 struct extent_map *em = NULL;
2826 struct extent_state *cached_state = NULL; 2824 struct extent_state *cached_state = NULL;
2827 struct btrfs_path *path; 2825 struct btrfs_path *path;
2828 struct btrfs_file_extent_item *item; 2826 struct btrfs_file_extent_item *item;
2829 int end = 0; 2827 int end = 0;
2830 u64 em_start = 0; 2828 u64 em_start = 0;
2831 u64 em_len = 0; 2829 u64 em_len = 0;
2832 u64 em_end = 0; 2830 u64 em_end = 0;
2833 unsigned long emflags; 2831 unsigned long emflags;
2834 2832
2835 if (len == 0) 2833 if (len == 0)
2836 return -EINVAL; 2834 return -EINVAL;
2837 2835
2838 path = btrfs_alloc_path(); 2836 path = btrfs_alloc_path();
2839 if (!path) 2837 if (!path)
2840 return -ENOMEM; 2838 return -ENOMEM;
2841 path->leave_spinning = 1; 2839 path->leave_spinning = 1;
2842 2840
2843 /* 2841 /*
2844 * lookup the last file extent. We're not using i_size here 2842 * lookup the last file extent. We're not using i_size here
2845 * because there might be preallocation past i_size 2843 * because there might be preallocation past i_size
2846 */ 2844 */
2847 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 2845 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2848 path, btrfs_ino(inode), -1, 0); 2846 path, btrfs_ino(inode), -1, 0);
2849 if (ret < 0) { 2847 if (ret < 0) {
2850 btrfs_free_path(path); 2848 btrfs_free_path(path);
2851 return ret; 2849 return ret;
2852 } 2850 }
2853 WARN_ON(!ret); 2851 WARN_ON(!ret);
2854 path->slots[0]--; 2852 path->slots[0]--;
2855 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2853 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2856 struct btrfs_file_extent_item); 2854 struct btrfs_file_extent_item);
2857 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 2855 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2858 found_type = btrfs_key_type(&found_key); 2856 found_type = btrfs_key_type(&found_key);
2859 2857
2860 /* No extents, but there might be delalloc bits */ 2858 /* No extents, but there might be delalloc bits */
2861 if (found_key.objectid != btrfs_ino(inode) || 2859 if (found_key.objectid != btrfs_ino(inode) ||
2862 found_type != BTRFS_EXTENT_DATA_KEY) { 2860 found_type != BTRFS_EXTENT_DATA_KEY) {
2863 /* have to trust i_size as the end */ 2861 /* have to trust i_size as the end */
2864 last = (u64)-1; 2862 last = (u64)-1;
2865 last_for_get_extent = isize; 2863 last_for_get_extent = isize;
2866 } else { 2864 } else {
2867 /* 2865 /*
2868 * remember the start of the last extent. There are a 2866 * remember the start of the last extent. There are a
2869 * bunch of different factors that go into the length of the 2867 * bunch of different factors that go into the length of the
2870 * extent, so it's much less complex to remember where it started 2868 * extent, so it's much less complex to remember where it started
2871 */ 2869 */
2872 last = found_key.offset; 2870 last = found_key.offset;
2873 last_for_get_extent = last + 1; 2871 last_for_get_extent = last + 1;
2874 } 2872 }
2875 btrfs_free_path(path); 2873 btrfs_free_path(path);
2876 2874
2877 /* 2875 /*
2878 * we might have some extents allocated but more delalloc past those 2876 * we might have some extents allocated but more delalloc past those
2879 * extents. so, we trust isize unless the start of the last extent is 2877 * extents. so, we trust isize unless the start of the last extent is
2880 * beyond isize 2878 * beyond isize
2881 */ 2879 */
2882 if (last < isize) { 2880 if (last < isize) {
2883 last = (u64)-1; 2881 last = (u64)-1;
2884 last_for_get_extent = isize; 2882 last_for_get_extent = isize;
2885 } 2883 }
2886 2884
2887 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2885 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2888 &cached_state, GFP_NOFS); 2886 &cached_state, GFP_NOFS);
2889 2887
2890 em = get_extent_skip_holes(inode, off, last_for_get_extent, 2888 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2891 get_extent); 2889 get_extent);
2892 if (!em) 2890 if (!em)
2893 goto out; 2891 goto out;
2894 if (IS_ERR(em)) { 2892 if (IS_ERR(em)) {
2895 ret = PTR_ERR(em); 2893 ret = PTR_ERR(em);
2896 goto out; 2894 goto out;
2897 } 2895 }
2898 2896
2899 while (!end) { 2897 while (!end) {
2900 u64 offset_in_extent; 2898 u64 offset_in_extent;
2901 2899
2902 /* break if the extent we found is outside the range */ 2900 /* break if the extent we found is outside the range */
2903 if (em->start >= max || extent_map_end(em) < off) 2901 if (em->start >= max || extent_map_end(em) < off)
2904 break; 2902 break;
2905 2903
2906 /* 2904 /*
2907 * get_extent may return an extent that starts before our 2905 * get_extent may return an extent that starts before our
2908 * requested range. We have to make sure the ranges 2906 * requested range. We have to make sure the ranges
2909 * we return to fiemap always move forward and don't 2907 * we return to fiemap always move forward and don't
2910 * overlap, so adjust the offsets here 2908 * overlap, so adjust the offsets here
2911 */ 2909 */
2912 em_start = max(em->start, off); 2910 em_start = max(em->start, off);
2913 2911
2914 /* 2912 /*
2915 * record the offset from the start of the extent 2913 * record the offset from the start of the extent
2916 * for adjusting the disk offset below 2914 * for adjusting the disk offset below
2917 */ 2915 */
2918 offset_in_extent = em_start - em->start; 2916 offset_in_extent = em_start - em->start;
2919 em_end = extent_map_end(em); 2917 em_end = extent_map_end(em);
2920 em_len = em_end - em_start; 2918 em_len = em_end - em_start;
2921 emflags = em->flags; 2919 emflags = em->flags;
2922 disko = 0; 2920 disko = 0;
2923 flags = 0; 2921 flags = 0;
2924 2922
2925 /* 2923 /*
2926 * bump off for our next call to get_extent 2924 * bump off for our next call to get_extent
2927 */ 2925 */
2928 off = extent_map_end(em); 2926 off = extent_map_end(em);
2929 if (off >= max) 2927 if (off >= max)
2930 end = 1; 2928 end = 1;
2931 2929
2932 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2930 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2933 end = 1; 2931 end = 1;
2934 flags |= FIEMAP_EXTENT_LAST; 2932 flags |= FIEMAP_EXTENT_LAST;
2935 } else if (em->block_start == EXTENT_MAP_INLINE) { 2933 } else if (em->block_start == EXTENT_MAP_INLINE) {
2936 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2934 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2937 FIEMAP_EXTENT_NOT_ALIGNED); 2935 FIEMAP_EXTENT_NOT_ALIGNED);
2938 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 2936 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2939 flags |= (FIEMAP_EXTENT_DELALLOC | 2937 flags |= (FIEMAP_EXTENT_DELALLOC |
2940 FIEMAP_EXTENT_UNKNOWN); 2938 FIEMAP_EXTENT_UNKNOWN);
2941 } else { 2939 } else {
2942 disko = em->block_start + offset_in_extent; 2940 disko = em->block_start + offset_in_extent;
2943 } 2941 }
2944 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2942 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2945 flags |= FIEMAP_EXTENT_ENCODED; 2943 flags |= FIEMAP_EXTENT_ENCODED;
2946 2944
2947 free_extent_map(em); 2945 free_extent_map(em);
2948 em = NULL; 2946 em = NULL;
2949 if ((em_start >= last) || em_len == (u64)-1 || 2947 if ((em_start >= last) || em_len == (u64)-1 ||
2950 (last == (u64)-1 && isize <= em_end)) { 2948 (last == (u64)-1 && isize <= em_end)) {
2951 flags |= FIEMAP_EXTENT_LAST; 2949 flags |= FIEMAP_EXTENT_LAST;
2952 end = 1; 2950 end = 1;
2953 } 2951 }
2954 2952
2955 /* now scan forward to see if this is really the last extent. */ 2953 /* now scan forward to see if this is really the last extent. */
2956 em = get_extent_skip_holes(inode, off, last_for_get_extent, 2954 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2957 get_extent); 2955 get_extent);
2958 if (IS_ERR(em)) { 2956 if (IS_ERR(em)) {
2959 ret = PTR_ERR(em); 2957 ret = PTR_ERR(em);
2960 goto out; 2958 goto out;
2961 } 2959 }
2962 if (!em) { 2960 if (!em) {
2963 flags |= FIEMAP_EXTENT_LAST; 2961 flags |= FIEMAP_EXTENT_LAST;
2964 end = 1; 2962 end = 1;
2965 } 2963 }
2966 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 2964 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2967 em_len, flags); 2965 em_len, flags);
2968 if (ret) 2966 if (ret)
2969 goto out_free; 2967 goto out_free;
2970 } 2968 }
2971 out_free: 2969 out_free:
2972 free_extent_map(em); 2970 free_extent_map(em);
2973 out: 2971 out:
2974 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 2972 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
2975 &cached_state, GFP_NOFS); 2973 &cached_state, GFP_NOFS);
2976 return ret; 2974 return ret;
2977 } 2975 }
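
The clamping in the fiemap loop is worth spelling out: when get_extent returns a mapping that starts before the requested range, the logical start is clamped to 'off' and the on-disk offset is advanced by the same amount, so the extents reported to fiemap never move backwards or overlap. A numeric sketch with invented values:

#include <stdio.h>

int main(void)
{
	/* Invented numbers: an extent map that begins before the range
	 * fiemap asked about. */
	unsigned long long em_block_start = 1048576;	/* on-disk start of the extent */
	unsigned long long em_start = 8192;		/* file offset where the extent begins */
	unsigned long long em_end   = 65536;		/* extent_map_end(em) */
	unsigned long long off      = 16384;		/* start of the requested range */

	unsigned long long clamped_start    = em_start > off ? em_start : off;	/* max(em->start, off) */
	unsigned long long offset_in_extent = clamped_start - em_start;
	unsigned long long clamped_len      = em_end - clamped_start;
	unsigned long long disko            = em_block_start + offset_in_extent;

	printf("logical %llu len %llu physical %llu\n",
	       clamped_start, clamped_len, disko);
	/* logical 16384 len 49152 physical 1056768 */
	return 0;
}
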
2978 2976
2979 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2977 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2980 unsigned long i) 2978 unsigned long i)
2981 { 2979 {
2982 struct page *p; 2980 struct page *p;
2983 struct address_space *mapping; 2981 struct address_space *mapping;
2984 2982
2985 if (i == 0) 2983 if (i == 0)
2986 return eb->first_page; 2984 return eb->first_page;
2987 i += eb->start >> PAGE_CACHE_SHIFT; 2985 i += eb->start >> PAGE_CACHE_SHIFT;
2988 mapping = eb->first_page->mapping; 2986 mapping = eb->first_page->mapping;
2989 if (!mapping) 2987 if (!mapping)
2990 return NULL; 2988 return NULL;
2991 2989
2992 /* 2990 /*
2993 * extent_buffer_page is only called after pinning the page 2991 * extent_buffer_page is only called after pinning the page
2994 * by increasing the reference count. So we know the page must 2992 * by increasing the reference count. So we know the page must
2995 * be in the radix tree. 2993 * be in the radix tree.
2996 */ 2994 */
2997 rcu_read_lock(); 2995 rcu_read_lock();
2998 p = radix_tree_lookup(&mapping->page_tree, i); 2996 p = radix_tree_lookup(&mapping->page_tree, i);
2999 rcu_read_unlock(); 2997 rcu_read_unlock();
3000 2998
3001 return p; 2999 return p;
3002 } 3000 }
3003 3001
3004 static inline unsigned long num_extent_pages(u64 start, u64 len) 3002 static inline unsigned long num_extent_pages(u64 start, u64 len)
3005 { 3003 {
3006 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3004 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3007 (start >> PAGE_CACHE_SHIFT); 3005 (start >> PAGE_CACHE_SHIFT);
3008 } 3006 }
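
num_extent_pages() is the usual "index one past the last byte's page minus index of the first page" count, which is why an unaligned start can cost an extra page. A userspace copy of the expression, assuming 4 KiB pages for the sample values:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* 4 KiB pages, assumed for the samples */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

/* Index one past the last byte's page minus the index of the first page. */
static unsigned long num_extent_pages(unsigned long long start, unsigned long long len)
{
	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
	       (start >> PAGE_CACHE_SHIFT);
}

int main(void)
{
	printf("%lu\n", num_extent_pages(0, 4096));	/* 1 */
	printf("%lu\n", num_extent_pages(4096, 16384));	/* 4 */
	printf("%lu\n", num_extent_pages(6144, 16384));	/* 5: unaligned start straddles an extra page */
	return 0;
}
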
3009 3007
3010 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 3008 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3011 u64 start, 3009 u64 start,
3012 unsigned long len, 3010 unsigned long len,
3013 gfp_t mask) 3011 gfp_t mask)
3014 { 3012 {
3015 struct extent_buffer *eb = NULL; 3013 struct extent_buffer *eb = NULL;
3016 #if LEAK_DEBUG 3014 #if LEAK_DEBUG
3017 unsigned long flags; 3015 unsigned long flags;
3018 #endif 3016 #endif
3019 3017
3020 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3018 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3021 if (eb == NULL) 3019 if (eb == NULL)
3022 return NULL; 3020 return NULL;
3023 eb->start = start; 3021 eb->start = start;
3024 eb->len = len; 3022 eb->len = len;
3025 spin_lock_init(&eb->lock); 3023 spin_lock_init(&eb->lock);
3026 init_waitqueue_head(&eb->lock_wq); 3024 init_waitqueue_head(&eb->lock_wq);
3027 3025
3028 #if LEAK_DEBUG 3026 #if LEAK_DEBUG
3029 spin_lock_irqsave(&leak_lock, flags); 3027 spin_lock_irqsave(&leak_lock, flags);
3030 list_add(&eb->leak_list, &buffers); 3028 list_add(&eb->leak_list, &buffers);
3031 spin_unlock_irqrestore(&leak_lock, flags); 3029 spin_unlock_irqrestore(&leak_lock, flags);
3032 #endif 3030 #endif
3033 atomic_set(&eb->refs, 1); 3031 atomic_set(&eb->refs, 1);
3034 3032
3035 return eb; 3033 return eb;
3036 } 3034 }
3037 3035
3038 static void __free_extent_buffer(struct extent_buffer *eb) 3036 static void __free_extent_buffer(struct extent_buffer *eb)
3039 { 3037 {
3040 #if LEAK_DEBUG 3038 #if LEAK_DEBUG
3041 unsigned long flags; 3039 unsigned long flags;
3042 spin_lock_irqsave(&leak_lock, flags); 3040 spin_lock_irqsave(&leak_lock, flags);
3043 list_del(&eb->leak_list); 3041 list_del(&eb->leak_list);
3044 spin_unlock_irqrestore(&leak_lock, flags); 3042 spin_unlock_irqrestore(&leak_lock, flags);
3045 #endif 3043 #endif
3046 kmem_cache_free(extent_buffer_cache, eb); 3044 kmem_cache_free(extent_buffer_cache, eb);
3047 } 3045 }
3048 3046
3049 /* 3047 /*
3050 * Helper for releasing extent buffer page. 3048 * Helper for releasing extent buffer page.
3051 */ 3049 */
3052 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 3050 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3053 unsigned long start_idx) 3051 unsigned long start_idx)
3054 { 3052 {
3055 unsigned long index; 3053 unsigned long index;
3056 struct page *page; 3054 struct page *page;
3057 3055
3058 if (!eb->first_page) 3056 if (!eb->first_page)
3059 return; 3057 return;
3060 3058
3061 index = num_extent_pages(eb->start, eb->len); 3059 index = num_extent_pages(eb->start, eb->len);
3062 if (start_idx >= index) 3060 if (start_idx >= index)
3063 return; 3061 return;
3064 3062
3065 do { 3063 do {
3066 index--; 3064 index--;
3067 page = extent_buffer_page(eb, index); 3065 page = extent_buffer_page(eb, index);
3068 if (page) 3066 if (page)
3069 page_cache_release(page); 3067 page_cache_release(page);
3070 } while (index != start_idx); 3068 } while (index != start_idx);
3071 } 3069 }
3072 3070
3073 /* 3071 /*
3074 * Helper for releasing the extent buffer. 3072 * Helper for releasing the extent buffer.
3075 */ 3073 */
3076 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 3074 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3077 { 3075 {
3078 btrfs_release_extent_buffer_page(eb, 0); 3076 btrfs_release_extent_buffer_page(eb, 0);
3079 __free_extent_buffer(eb); 3077 __free_extent_buffer(eb);
3080 } 3078 }
3081 3079
3082 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3080 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3083 u64 start, unsigned long len, 3081 u64 start, unsigned long len,
3084 struct page *page0) 3082 struct page *page0)
3085 { 3083 {
3086 unsigned long num_pages = num_extent_pages(start, len); 3084 unsigned long num_pages = num_extent_pages(start, len);
3087 unsigned long i; 3085 unsigned long i;
3088 unsigned long index = start >> PAGE_CACHE_SHIFT; 3086 unsigned long index = start >> PAGE_CACHE_SHIFT;
3089 struct extent_buffer *eb; 3087 struct extent_buffer *eb;
3090 struct extent_buffer *exists = NULL; 3088 struct extent_buffer *exists = NULL;
3091 struct page *p; 3089 struct page *p;
3092 struct address_space *mapping = tree->mapping; 3090 struct address_space *mapping = tree->mapping;
3093 int uptodate = 1; 3091 int uptodate = 1;
3094 int ret; 3092 int ret;
3095 3093
3096 rcu_read_lock(); 3094 rcu_read_lock();
3097 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3095 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3098 if (eb && atomic_inc_not_zero(&eb->refs)) { 3096 if (eb && atomic_inc_not_zero(&eb->refs)) {
3099 rcu_read_unlock(); 3097 rcu_read_unlock();
3100 mark_page_accessed(eb->first_page); 3098 mark_page_accessed(eb->first_page);
3101 return eb; 3099 return eb;
3102 } 3100 }
3103 rcu_read_unlock(); 3101 rcu_read_unlock();
3104 3102
3105 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); 3103 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3106 if (!eb) 3104 if (!eb)
3107 return NULL; 3105 return NULL;
3108 3106
3109 if (page0) { 3107 if (page0) {
3110 eb->first_page = page0; 3108 eb->first_page = page0;
3111 i = 1; 3109 i = 1;
3112 index++; 3110 index++;
3113 page_cache_get(page0); 3111 page_cache_get(page0);
3114 mark_page_accessed(page0); 3112 mark_page_accessed(page0);
3115 set_page_extent_mapped(page0); 3113 set_page_extent_mapped(page0);
3116 set_page_extent_head(page0, len); 3114 set_page_extent_head(page0, len);
3117 uptodate = PageUptodate(page0); 3115 uptodate = PageUptodate(page0);
3118 } else { 3116 } else {
3119 i = 0; 3117 i = 0;
3120 } 3118 }
3121 for (; i < num_pages; i++, index++) { 3119 for (; i < num_pages; i++, index++) {
3122 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); 3120 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
3123 if (!p) { 3121 if (!p) {
3124 WARN_ON(1); 3122 WARN_ON(1);
3125 goto free_eb; 3123 goto free_eb;
3126 } 3124 }
3127 set_page_extent_mapped(p); 3125 set_page_extent_mapped(p);
3128 mark_page_accessed(p); 3126 mark_page_accessed(p);
3129 if (i == 0) { 3127 if (i == 0) {
3130 eb->first_page = p; 3128 eb->first_page = p;
3131 set_page_extent_head(p, len); 3129 set_page_extent_head(p, len);
3132 } else { 3130 } else {
3133 set_page_private(p, EXTENT_PAGE_PRIVATE); 3131 set_page_private(p, EXTENT_PAGE_PRIVATE);
3134 } 3132 }
3135 if (!PageUptodate(p)) 3133 if (!PageUptodate(p))
3136 uptodate = 0; 3134 uptodate = 0;
3137 3135
3138 /* 3136 /*
3139 * see below about how we avoid a nasty race with release page 3137 * see below about how we avoid a nasty race with release page
3140 * and why we unlock later 3138 * and why we unlock later
3141 */ 3139 */
3142 if (i != 0) 3140 if (i != 0)
3143 unlock_page(p); 3141 unlock_page(p);
3144 } 3142 }
3145 if (uptodate) 3143 if (uptodate)
3146 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3144 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3147 3145
3148 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 3146 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3149 if (ret) 3147 if (ret)
3150 goto free_eb; 3148 goto free_eb;
3151 3149
3152 spin_lock(&tree->buffer_lock); 3150 spin_lock(&tree->buffer_lock);
3153 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 3151 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3154 if (ret == -EEXIST) { 3152 if (ret == -EEXIST) {
3155 exists = radix_tree_lookup(&tree->buffer, 3153 exists = radix_tree_lookup(&tree->buffer,
3156 start >> PAGE_CACHE_SHIFT); 3154 start >> PAGE_CACHE_SHIFT);
3157 /* add one reference for the caller */ 3155 /* add one reference for the caller */
3158 atomic_inc(&exists->refs); 3156 atomic_inc(&exists->refs);
3159 spin_unlock(&tree->buffer_lock); 3157 spin_unlock(&tree->buffer_lock);
3160 radix_tree_preload_end(); 3158 radix_tree_preload_end();
3161 goto free_eb; 3159 goto free_eb;
3162 } 3160 }
3163 /* add one reference for the tree */ 3161 /* add one reference for the tree */
3164 atomic_inc(&eb->refs); 3162 atomic_inc(&eb->refs);
3165 spin_unlock(&tree->buffer_lock); 3163 spin_unlock(&tree->buffer_lock);
3166 radix_tree_preload_end(); 3164 radix_tree_preload_end();
3167 3165
3168 /* 3166 /*
3169 * there is a race where release page may have 3167 * there is a race where release page may have
3170 * tried to find this extent buffer in the radix 3168 * tried to find this extent buffer in the radix
3171 * but failed. It will tell the VM it is safe to 3169 * but failed. It will tell the VM it is safe to
3172 * reclaim the page, and it will clear the page private bit. 3170 * reclaim the page, and it will clear the page private bit.
3173 * We must make sure to set the page private bit properly 3171 * We must make sure to set the page private bit properly
3174 * after the extent buffer is in the radix tree so 3172 * after the extent buffer is in the radix tree so
3175 * it doesn't get lost 3173 * it doesn't get lost
3176 */ 3174 */
3177 set_page_extent_mapped(eb->first_page); 3175 set_page_extent_mapped(eb->first_page);
3178 set_page_extent_head(eb->first_page, eb->len); 3176 set_page_extent_head(eb->first_page, eb->len);
3179 if (!page0) 3177 if (!page0)
3180 unlock_page(eb->first_page); 3178 unlock_page(eb->first_page);
3181 return eb; 3179 return eb;
3182 3180
3183 free_eb: 3181 free_eb:
3184 if (eb->first_page && !page0) 3182 if (eb->first_page && !page0)
3185 unlock_page(eb->first_page); 3183 unlock_page(eb->first_page);
3186 3184
3187 if (!atomic_dec_and_test(&eb->refs)) 3185 if (!atomic_dec_and_test(&eb->refs))
3188 return exists; 3186 return exists;
3189 btrfs_release_extent_buffer(eb); 3187 btrfs_release_extent_buffer(eb);
3190 return exists; 3188 return exists;
3191 } 3189 }
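
The fast-path lookup at the top of alloc_extent_buffer() (and find_extent_buffer() below) only returns a buffer when atomic_inc_not_zero() succeeds, so an extent buffer whose final reference is concurrently being dropped is treated as missing rather than resurrected; the -EEXIST loser instead takes its reference on the winner while still holding tree->buffer_lock. A userspace model of that "only while non-zero" reference rule using C11 atomics (the type and function names are invented for illustration):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* A minimal userspace model of the "take a reference only while the count
 * is still non-zero" rule; the names here are invented for illustration. */
struct obj {
	atomic_int refs;
};

static bool get_ref_not_zero(struct obj *o)
{
	int old = atomic_load(&o->refs);

	while (old != 0) {
		/* On failure 'old' is reloaded with the current count. */
		if (atomic_compare_exchange_weak(&o->refs, &old, old + 1))
			return true;
	}
	return false;
}

int main(void)
{
	struct obj live  = { ATOMIC_VAR_INIT(1) };	/* still referenced by the tree */
	struct obj dying = { ATOMIC_VAR_INIT(0) };	/* last reference already dropped */

	printf("live:  %s\n", get_ref_not_zero(&live)  ? "got reference" : "refused");
	printf("dying: %s\n", get_ref_not_zero(&dying) ? "got reference" : "refused");
	return 0;
}
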
3192 3190
3193 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3191 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3194 u64 start, unsigned long len) 3192 u64 start, unsigned long len)
3195 { 3193 {
3196 struct extent_buffer *eb; 3194 struct extent_buffer *eb;
3197 3195
3198 rcu_read_lock(); 3196 rcu_read_lock();
3199 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3197 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3200 if (eb && atomic_inc_not_zero(&eb->refs)) { 3198 if (eb && atomic_inc_not_zero(&eb->refs)) {
3201 rcu_read_unlock(); 3199 rcu_read_unlock();
3202 mark_page_accessed(eb->first_page); 3200 mark_page_accessed(eb->first_page);
3203 return eb; 3201 return eb;
3204 } 3202 }
3205 rcu_read_unlock(); 3203 rcu_read_unlock();
3206 3204
3207 return NULL; 3205 return NULL;
3208 } 3206 }
3209 3207
3210 void free_extent_buffer(struct extent_buffer *eb) 3208 void free_extent_buffer(struct extent_buffer *eb)
3211 { 3209 {
3212 if (!eb) 3210 if (!eb)
3213 return; 3211 return;
3214 3212
3215 if (!atomic_dec_and_test(&eb->refs)) 3213 if (!atomic_dec_and_test(&eb->refs))
3216 return; 3214 return;
3217 3215
3218 WARN_ON(1); 3216 WARN_ON(1);
3219 } 3217 }
3220 3218
3221 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3219 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3222 struct extent_buffer *eb) 3220 struct extent_buffer *eb)
3223 { 3221 {
3224 unsigned long i; 3222 unsigned long i;
3225 unsigned long num_pages; 3223 unsigned long num_pages;
3226 struct page *page; 3224 struct page *page;
3227 3225
3228 num_pages = num_extent_pages(eb->start, eb->len); 3226 num_pages = num_extent_pages(eb->start, eb->len);
3229 3227
3230 for (i = 0; i < num_pages; i++) { 3228 for (i = 0; i < num_pages; i++) {
3231 page = extent_buffer_page(eb, i); 3229 page = extent_buffer_page(eb, i);
3232 if (!PageDirty(page)) 3230 if (!PageDirty(page))
3233 continue; 3231 continue;
3234 3232
3235 lock_page(page); 3233 lock_page(page);
3236 WARN_ON(!PagePrivate(page)); 3234 WARN_ON(!PagePrivate(page));
3237 3235
3238 set_page_extent_mapped(page); 3236 set_page_extent_mapped(page);
3239 if (i == 0) 3237 if (i == 0)
3240 set_page_extent_head(page, eb->len); 3238 set_page_extent_head(page, eb->len);
3241 3239
3242 clear_page_dirty_for_io(page); 3240 clear_page_dirty_for_io(page);
3243 spin_lock_irq(&page->mapping->tree_lock); 3241 spin_lock_irq(&page->mapping->tree_lock);
3244 if (!PageDirty(page)) { 3242 if (!PageDirty(page)) {
3245 radix_tree_tag_clear(&page->mapping->page_tree, 3243 radix_tree_tag_clear(&page->mapping->page_tree,
3246 page_index(page), 3244 page_index(page),
3247 PAGECACHE_TAG_DIRTY); 3245 PAGECACHE_TAG_DIRTY);
3248 } 3246 }
3249 spin_unlock_irq(&page->mapping->tree_lock); 3247 spin_unlock_irq(&page->mapping->tree_lock);
3250 unlock_page(page); 3248 unlock_page(page);
3251 } 3249 }
3252 return 0; 3250 return 0;
3253 } 3251 }
3254 3252
3255 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3253 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3256 struct extent_buffer *eb) 3254 struct extent_buffer *eb)
3257 { 3255 {
3258 unsigned long i; 3256 unsigned long i;
3259 unsigned long num_pages; 3257 unsigned long num_pages;
3260 int was_dirty = 0; 3258 int was_dirty = 0;
3261 3259
3262 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3260 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3263 num_pages = num_extent_pages(eb->start, eb->len); 3261 num_pages = num_extent_pages(eb->start, eb->len);
3264 for (i = 0; i < num_pages; i++) 3262 for (i = 0; i < num_pages; i++)
3265 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3263 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3266 return was_dirty; 3264 return was_dirty;
3267 } 3265 }
3268 3266
3269 int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3267 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3270 struct extent_buffer *eb, 3268 struct extent_buffer *eb,
3271 struct extent_state **cached_state) 3269 struct extent_state **cached_state)
3272 { 3270 {
3273 unsigned long i; 3271 unsigned long i;
3274 struct page *page; 3272 struct page *page;
3275 unsigned long num_pages; 3273 unsigned long num_pages;
3276 3274
3277 num_pages = num_extent_pages(eb->start, eb->len); 3275 num_pages = num_extent_pages(eb->start, eb->len);
3278 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3276 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3279 3277
3280 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3278 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3281 cached_state, GFP_NOFS); 3279 cached_state, GFP_NOFS);
3282 for (i = 0; i < num_pages; i++) { 3280 for (i = 0; i < num_pages; i++) {
3283 page = extent_buffer_page(eb, i); 3281 page = extent_buffer_page(eb, i);
3284 if (page) 3282 if (page)
3285 ClearPageUptodate(page); 3283 ClearPageUptodate(page);
3286 } 3284 }
3287 return 0; 3285 return 0;
3288 } 3286 }
3289 3287
3290 int set_extent_buffer_uptodate(struct extent_io_tree *tree, 3288 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3291 struct extent_buffer *eb) 3289 struct extent_buffer *eb)
3292 { 3290 {
3293 unsigned long i; 3291 unsigned long i;
3294 struct page *page; 3292 struct page *page;
3295 unsigned long num_pages; 3293 unsigned long num_pages;
3296 3294
3297 num_pages = num_extent_pages(eb->start, eb->len); 3295 num_pages = num_extent_pages(eb->start, eb->len);
3298 3296
3299 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3297 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3300 NULL, GFP_NOFS); 3298 NULL, GFP_NOFS);
3301 for (i = 0; i < num_pages; i++) { 3299 for (i = 0; i < num_pages; i++) {
3302 page = extent_buffer_page(eb, i); 3300 page = extent_buffer_page(eb, i);
3303 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3301 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3304 ((i == num_pages - 1) && 3302 ((i == num_pages - 1) &&
3305 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3303 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3306 check_page_uptodate(tree, page); 3304 check_page_uptodate(tree, page);
3307 continue; 3305 continue;
3308 } 3306 }
3309 SetPageUptodate(page); 3307 SetPageUptodate(page);
3310 } 3308 }
3311 return 0; 3309 return 0;
3312 } 3310 }
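
The special-casing in set_extent_buffer_uptodate() exists because the first or last page is only partially covered by the buffer when eb->start or eb->start + eb->len is not page aligned, and such a shared page cannot simply be marked uptodate. A sketch of just those two alignment tests, with invented values:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL			/* 4 KiB pages, assumed for illustration */

int main(void)
{
	/* A metadata buffer that is not page aligned on either side. */
	unsigned long long start = 18432;	/* 16 KiB + 2 KiB */
	unsigned long long len   = 4096;

	int first_partial = (start & (PAGE_CACHE_SIZE - 1)) != 0;
	int last_partial  = ((start + len) & (PAGE_CACHE_SIZE - 1)) != 0;

	printf("first page shared: %d, last page shared: %d\n",
	       first_partial, last_partial);	/* 1, 1 */
	return 0;
}
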
3313 3311
3314 int extent_range_uptodate(struct extent_io_tree *tree, 3312 int extent_range_uptodate(struct extent_io_tree *tree,
3315 u64 start, u64 end) 3313 u64 start, u64 end)
3316 { 3314 {
3317 struct page *page; 3315 struct page *page;
3318 int ret; 3316 int ret;
3319 int pg_uptodate = 1; 3317 int pg_uptodate = 1;
3320 int uptodate; 3318 int uptodate;
3321 unsigned long index; 3319 unsigned long index;
3322 3320
3323 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3321 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3324 if (ret) 3322 if (ret)
3325 return 1; 3323 return 1;
3326 while (start <= end) { 3324 while (start <= end) {
3327 index = start >> PAGE_CACHE_SHIFT; 3325 index = start >> PAGE_CACHE_SHIFT;
3328 page = find_get_page(tree->mapping, index); 3326 page = find_get_page(tree->mapping, index);
3329 uptodate = PageUptodate(page); 3327 uptodate = PageUptodate(page);
3330 page_cache_release(page); 3328 page_cache_release(page);
3331 if (!uptodate) { 3329 if (!uptodate) {
3332 pg_uptodate = 0; 3330 pg_uptodate = 0;
3333 break; 3331 break;
3334 } 3332 }
3335 start += PAGE_CACHE_SIZE; 3333 start += PAGE_CACHE_SIZE;
3336 } 3334 }
3337 return pg_uptodate; 3335 return pg_uptodate;
3338 } 3336 }
3339 3337
3340 int extent_buffer_uptodate(struct extent_io_tree *tree, 3338 int extent_buffer_uptodate(struct extent_io_tree *tree,
3341 struct extent_buffer *eb, 3339 struct extent_buffer *eb,
3342 struct extent_state *cached_state) 3340 struct extent_state *cached_state)
3343 { 3341 {
3344 int ret = 0; 3342 int ret = 0;
3345 unsigned long num_pages; 3343 unsigned long num_pages;
3346 unsigned long i; 3344 unsigned long i;
3347 struct page *page; 3345 struct page *page;
3348 int pg_uptodate = 1; 3346 int pg_uptodate = 1;
3349 3347
3350 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3348 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3351 return 1; 3349 return 1;
3352 3350
3353 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3351 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3354 EXTENT_UPTODATE, 1, cached_state); 3352 EXTENT_UPTODATE, 1, cached_state);
3355 if (ret) 3353 if (ret)
3356 return ret; 3354 return ret;
3357 3355
3358 num_pages = num_extent_pages(eb->start, eb->len); 3356 num_pages = num_extent_pages(eb->start, eb->len);
3359 for (i = 0; i < num_pages; i++) { 3357 for (i = 0; i < num_pages; i++) {
3360 page = extent_buffer_page(eb, i); 3358 page = extent_buffer_page(eb, i);
3361 if (!PageUptodate(page)) { 3359 if (!PageUptodate(page)) {
3362 pg_uptodate = 0; 3360 pg_uptodate = 0;
3363 break; 3361 break;
3364 } 3362 }
3365 } 3363 }
3366 return pg_uptodate; 3364 return pg_uptodate;
3367 } 3365 }
3368 3366
3369 int read_extent_buffer_pages(struct extent_io_tree *tree, 3367 int read_extent_buffer_pages(struct extent_io_tree *tree,
3370 struct extent_buffer *eb, 3368 struct extent_buffer *eb,
3371 u64 start, int wait, 3369 u64 start, int wait,
3372 get_extent_t *get_extent, int mirror_num) 3370 get_extent_t *get_extent, int mirror_num)
3373 { 3371 {
3374 unsigned long i; 3372 unsigned long i;
3375 unsigned long start_i; 3373 unsigned long start_i;
3376 struct page *page; 3374 struct page *page;
3377 int err; 3375 int err;
3378 int ret = 0; 3376 int ret = 0;
3379 int locked_pages = 0; 3377 int locked_pages = 0;
3380 int all_uptodate = 1; 3378 int all_uptodate = 1;
3381 int inc_all_pages = 0; 3379 int inc_all_pages = 0;
3382 unsigned long num_pages; 3380 unsigned long num_pages;
3383 struct bio *bio = NULL; 3381 struct bio *bio = NULL;
3384 unsigned long bio_flags = 0; 3382 unsigned long bio_flags = 0;
3385 3383
3386 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3384 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3387 return 0; 3385 return 0;
3388 3386
3389 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3387 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3390 EXTENT_UPTODATE, 1, NULL)) { 3388 EXTENT_UPTODATE, 1, NULL)) {
3391 return 0; 3389 return 0;
3392 } 3390 }
3393 3391
3394 if (start) { 3392 if (start) {
3395 WARN_ON(start < eb->start); 3393 WARN_ON(start < eb->start);
3396 start_i = (start >> PAGE_CACHE_SHIFT) - 3394 start_i = (start >> PAGE_CACHE_SHIFT) -
3397 (eb->start >> PAGE_CACHE_SHIFT); 3395 (eb->start >> PAGE_CACHE_SHIFT);
3398 } else { 3396 } else {
3399 start_i = 0; 3397 start_i = 0;
3400 } 3398 }
3401 3399
3402 num_pages = num_extent_pages(eb->start, eb->len); 3400 num_pages = num_extent_pages(eb->start, eb->len);
3403 for (i = start_i; i < num_pages; i++) { 3401 for (i = start_i; i < num_pages; i++) {
3404 page = extent_buffer_page(eb, i); 3402 page = extent_buffer_page(eb, i);
3405 if (!wait) { 3403 if (!wait) {
3406 if (!trylock_page(page)) 3404 if (!trylock_page(page))
3407 goto unlock_exit; 3405 goto unlock_exit;
3408 } else { 3406 } else {
3409 lock_page(page); 3407 lock_page(page);
3410 } 3408 }
3411 locked_pages++; 3409 locked_pages++;
3412 if (!PageUptodate(page)) 3410 if (!PageUptodate(page))
3413 all_uptodate = 0; 3411 all_uptodate = 0;
3414 } 3412 }
3415 if (all_uptodate) { 3413 if (all_uptodate) {
3416 if (start_i == 0) 3414 if (start_i == 0)
3417 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3415 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3418 goto unlock_exit; 3416 goto unlock_exit;
3419 } 3417 }
3420 3418
3421 for (i = start_i; i < num_pages; i++) { 3419 for (i = start_i; i < num_pages; i++) {
3422 page = extent_buffer_page(eb, i); 3420 page = extent_buffer_page(eb, i);
3423 3421
3424 WARN_ON(!PagePrivate(page)); 3422 WARN_ON(!PagePrivate(page));
3425 3423
3426 set_page_extent_mapped(page); 3424 set_page_extent_mapped(page);
3427 if (i == 0) 3425 if (i == 0)
3428 set_page_extent_head(page, eb->len); 3426 set_page_extent_head(page, eb->len);
3429 3427
3430 if (inc_all_pages) 3428 if (inc_all_pages)
3431 page_cache_get(page); 3429 page_cache_get(page);
3432 if (!PageUptodate(page)) { 3430 if (!PageUptodate(page)) {
3433 if (start_i == 0) 3431 if (start_i == 0)
3434 inc_all_pages = 1; 3432 inc_all_pages = 1;
3435 ClearPageError(page); 3433 ClearPageError(page);
3436 err = __extent_read_full_page(tree, page, 3434 err = __extent_read_full_page(tree, page,
3437 get_extent, &bio, 3435 get_extent, &bio,
3438 mirror_num, &bio_flags); 3436 mirror_num, &bio_flags);
3439 if (err) 3437 if (err)
3440 ret = err; 3438 ret = err;
3441 } else { 3439 } else {
3442 unlock_page(page); 3440 unlock_page(page);
3443 } 3441 }
3444 } 3442 }
3445 3443
3446 if (bio) 3444 if (bio)
3447 submit_one_bio(READ, bio, mirror_num, bio_flags); 3445 submit_one_bio(READ, bio, mirror_num, bio_flags);
3448 3446
3449 if (ret || !wait) 3447 if (ret || !wait)
3450 return ret; 3448 return ret;
3451 3449
3452 for (i = start_i; i < num_pages; i++) { 3450 for (i = start_i; i < num_pages; i++) {
3453 page = extent_buffer_page(eb, i); 3451 page = extent_buffer_page(eb, i);
3454 wait_on_page_locked(page); 3452 wait_on_page_locked(page);
3455 if (!PageUptodate(page)) 3453 if (!PageUptodate(page))
3456 ret = -EIO; 3454 ret = -EIO;
3457 } 3455 }
3458 3456
3459 if (!ret) 3457 if (!ret)
3460 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3458 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3461 return ret; 3459 return ret;
3462 3460
3463 unlock_exit: 3461 unlock_exit:
3464 i = start_i; 3462 i = start_i;
3465 while (locked_pages > 0) { 3463 while (locked_pages > 0) {
3466 page = extent_buffer_page(eb, i); 3464 page = extent_buffer_page(eb, i);
3467 i++; 3465 i++;
3468 unlock_page(page); 3466 unlock_page(page);
3469 locked_pages--; 3467 locked_pages--;
3470 } 3468 }
3471 return ret; 3469 return ret;
3472 } 3470 }
3473 3471
3474 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 3472 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3475 unsigned long start, 3473 unsigned long start,
3476 unsigned long len) 3474 unsigned long len)
3477 { 3475 {
3478 size_t cur; 3476 size_t cur;
3479 size_t offset; 3477 size_t offset;
3480 struct page *page; 3478 struct page *page;
3481 char *kaddr; 3479 char *kaddr;
3482 char *dst = (char *)dstv; 3480 char *dst = (char *)dstv;
3483 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3481 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3484 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3482 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3485 3483
3486 WARN_ON(start > eb->len); 3484 WARN_ON(start > eb->len);
3487 WARN_ON(start + len > eb->start + eb->len); 3485 WARN_ON(start + len > eb->start + eb->len);
3488 3486
3489 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3487 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3490 3488
3491 while (len > 0) { 3489 while (len > 0) {
3492 page = extent_buffer_page(eb, i); 3490 page = extent_buffer_page(eb, i);
3493 3491
3494 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3492 cur = min(len, (PAGE_CACHE_SIZE - offset));
3495 kaddr = kmap_atomic(page, KM_USER1); 3493 kaddr = kmap_atomic(page, KM_USER1);
3496 memcpy(dst, kaddr + offset, cur); 3494 memcpy(dst, kaddr + offset, cur);
3497 kunmap_atomic(kaddr, KM_USER1); 3495 kunmap_atomic(kaddr, KM_USER1);
3498 3496
3499 dst += cur; 3497 dst += cur;
3500 len -= cur; 3498 len -= cur;
3501 offset = 0; 3499 offset = 0;
3502 i++; 3500 i++;
3503 } 3501 }
3504 } 3502 }
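
read_extent_buffer() above and the write/compare/memset helpers that follow all share the same page-chunking arithmetic: the buffer's byte offset within its first page comes from masking eb->start with the page size, the page index and in-page offset come from shifting and masking the sum, and each loop iteration handles at most the rest of the current page. A minimal user-space sketch of that arithmetic, assuming 4 KiB pages and plain arrays in place of page-cache pages (eb_start, start and len are made-up values):

/* Illustrative only: the page/offset arithmetic shared by read_extent_buffer()
 * and the write/memcmp/memset helpers, run over plain arrays. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

int main(void)
{
        static char pages[4][PAGE_SIZE];        /* stand-in for extent_buffer_page(eb, i) */
        char dst[5000];
        uint64_t eb_start = 6144;               /* hypothetical eb->start on disk */
        unsigned long start = 3000;             /* byte offset inside the buffer */
        unsigned long len = sizeof(dst);

        size_t start_offset = eb_start & (PAGE_SIZE - 1);              /* 2048 */
        unsigned long i = (start_offset + start) >> PAGE_SHIFT;        /* first page index */
        size_t offset = (start_offset + start) & (PAGE_SIZE - 1);      /* offset in that page */
        char *out = dst;

        while (len > 0) {
                size_t cur = len < PAGE_SIZE - offset ? len : PAGE_SIZE - offset;

                /* the kernel kmap_atomic()s the page here; we just memcpy */
                memcpy(out, pages[i] + offset, cur);
                printf("page %lu, offset %zu, copied %zu bytes\n", i, offset, cur);

                out += cur;
                len -= cur;
                offset = 0;     /* later pages are consumed from their start */
                i++;
        }
        return 0;
}

With these values the loop copies 3144 bytes from page 1 starting at offset 952, then 1856 bytes from the start of page 2.
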
3505 3503
3506 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3504 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3507 unsigned long min_len, char **token, char **map, 3505 unsigned long min_len, char **token, char **map,
3508 unsigned long *map_start, 3506 unsigned long *map_start,
3509 unsigned long *map_len, int km) 3507 unsigned long *map_len, int km)
3510 { 3508 {
3511 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3509 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3512 char *kaddr; 3510 char *kaddr;
3513 struct page *p; 3511 struct page *p;
3514 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3512 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3515 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3513 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3516 unsigned long end_i = (start_offset + start + min_len - 1) >> 3514 unsigned long end_i = (start_offset + start + min_len - 1) >>
3517 PAGE_CACHE_SHIFT; 3515 PAGE_CACHE_SHIFT;
3518 3516
3519 if (i != end_i) 3517 if (i != end_i)
3520 return -EINVAL; 3518 return -EINVAL;
3521 3519
3522 if (i == 0) { 3520 if (i == 0) {
3523 offset = start_offset; 3521 offset = start_offset;
3524 *map_start = 0; 3522 *map_start = 0;
3525 } else { 3523 } else {
3526 offset = 0; 3524 offset = 0;
3527 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 3525 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3528 } 3526 }
3529 3527
3530 if (start + min_len > eb->len) { 3528 if (start + min_len > eb->len) {
3531 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 3529 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3532 "wanted %lu %lu\n", (unsigned long long)eb->start, 3530 "wanted %lu %lu\n", (unsigned long long)eb->start,
3533 eb->len, start, min_len); 3531 eb->len, start, min_len);
3534 WARN_ON(1); 3532 WARN_ON(1);
3535 return -EINVAL; 3533 return -EINVAL;
3536 } 3534 }
3537 3535
3538 p = extent_buffer_page(eb, i); 3536 p = extent_buffer_page(eb, i);
3539 kaddr = kmap_atomic(p, km); 3537 kaddr = kmap_atomic(p, km);
3540 *token = kaddr; 3538 *token = kaddr;
3541 *map = kaddr + offset; 3539 *map = kaddr + offset;
3542 *map_len = PAGE_CACHE_SIZE - offset; 3540 *map_len = PAGE_CACHE_SIZE - offset;
3543 return 0; 3541 return 0;
3544 } 3542 }
3545 3543
3546 int map_extent_buffer(struct extent_buffer *eb, unsigned long start, 3544 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3547 unsigned long min_len, 3545 unsigned long min_len,
3548 char **token, char **map, 3546 char **token, char **map,
3549 unsigned long *map_start, 3547 unsigned long *map_start,
3550 unsigned long *map_len, int km) 3548 unsigned long *map_len, int km)
3551 { 3549 {
3552 int err; 3550 int err;
3553 int save = 0; 3551 int save = 0;
3554 if (eb->map_token) { 3552 if (eb->map_token) {
3555 unmap_extent_buffer(eb, eb->map_token, km); 3553 unmap_extent_buffer(eb, eb->map_token, km);
3556 eb->map_token = NULL; 3554 eb->map_token = NULL;
3557 save = 1; 3555 save = 1;
3558 } 3556 }
3559 err = map_private_extent_buffer(eb, start, min_len, token, map, 3557 err = map_private_extent_buffer(eb, start, min_len, token, map,
3560 map_start, map_len, km); 3558 map_start, map_len, km);
3561 if (!err && save) { 3559 if (!err && save) {
3562 eb->map_token = *token; 3560 eb->map_token = *token;
3563 eb->kaddr = *map; 3561 eb->kaddr = *map;
3564 eb->map_start = *map_start; 3562 eb->map_start = *map_start;
3565 eb->map_len = *map_len; 3563 eb->map_len = *map_len;
3566 } 3564 }
3567 return err; 3565 return err;
3568 } 3566 }
3569 3567
3570 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) 3568 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3571 { 3569 {
3572 kunmap_atomic(token, km); 3570 kunmap_atomic(token, km);
3573 } 3571 }
3574 3572
3575 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3573 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3576 unsigned long start, 3574 unsigned long start,
3577 unsigned long len) 3575 unsigned long len)
3578 { 3576 {
3579 size_t cur; 3577 size_t cur;
3580 size_t offset; 3578 size_t offset;
3581 struct page *page; 3579 struct page *page;
3582 char *kaddr; 3580 char *kaddr;
3583 char *ptr = (char *)ptrv; 3581 char *ptr = (char *)ptrv;
3584 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3582 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3585 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3583 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3586 int ret = 0; 3584 int ret = 0;
3587 3585
3588 WARN_ON(start > eb->len); 3586 WARN_ON(start > eb->len);
3589 WARN_ON(start + len > eb->start + eb->len); 3587 WARN_ON(start + len > eb->start + eb->len);
3590 3588
3591 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3589 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3592 3590
3593 while (len > 0) { 3591 while (len > 0) {
3594 page = extent_buffer_page(eb, i); 3592 page = extent_buffer_page(eb, i);
3595 3593
3596 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3594 cur = min(len, (PAGE_CACHE_SIZE - offset));
3597 3595
3598 kaddr = kmap_atomic(page, KM_USER0); 3596 kaddr = kmap_atomic(page, KM_USER0);
3599 ret = memcmp(ptr, kaddr + offset, cur); 3597 ret = memcmp(ptr, kaddr + offset, cur);
3600 kunmap_atomic(kaddr, KM_USER0); 3598 kunmap_atomic(kaddr, KM_USER0);
3601 if (ret) 3599 if (ret)
3602 break; 3600 break;
3603 3601
3604 ptr += cur; 3602 ptr += cur;
3605 len -= cur; 3603 len -= cur;
3606 offset = 0; 3604 offset = 0;
3607 i++; 3605 i++;
3608 } 3606 }
3609 return ret; 3607 return ret;
3610 } 3608 }
3611 3609
3612 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 3610 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3613 unsigned long start, unsigned long len) 3611 unsigned long start, unsigned long len)
3614 { 3612 {
3615 size_t cur; 3613 size_t cur;
3616 size_t offset; 3614 size_t offset;
3617 struct page *page; 3615 struct page *page;
3618 char *kaddr; 3616 char *kaddr;
3619 char *src = (char *)srcv; 3617 char *src = (char *)srcv;
3620 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3618 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3621 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3619 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3622 3620
3623 WARN_ON(start > eb->len); 3621 WARN_ON(start > eb->len);
3624 WARN_ON(start + len > eb->start + eb->len); 3622 WARN_ON(start + len > eb->start + eb->len);
3625 3623
3626 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3624 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3627 3625
3628 while (len > 0) { 3626 while (len > 0) {
3629 page = extent_buffer_page(eb, i); 3627 page = extent_buffer_page(eb, i);
3630 WARN_ON(!PageUptodate(page)); 3628 WARN_ON(!PageUptodate(page));
3631 3629
3632 cur = min(len, PAGE_CACHE_SIZE - offset); 3630 cur = min(len, PAGE_CACHE_SIZE - offset);
3633 kaddr = kmap_atomic(page, KM_USER1); 3631 kaddr = kmap_atomic(page, KM_USER1);
3634 memcpy(kaddr + offset, src, cur); 3632 memcpy(kaddr + offset, src, cur);
3635 kunmap_atomic(kaddr, KM_USER1); 3633 kunmap_atomic(kaddr, KM_USER1);
3636 3634
3637 src += cur; 3635 src += cur;
3638 len -= cur; 3636 len -= cur;
3639 offset = 0; 3637 offset = 0;
3640 i++; 3638 i++;
3641 } 3639 }
3642 } 3640 }
3643 3641
3644 void memset_extent_buffer(struct extent_buffer *eb, char c, 3642 void memset_extent_buffer(struct extent_buffer *eb, char c,
3645 unsigned long start, unsigned long len) 3643 unsigned long start, unsigned long len)
3646 { 3644 {
3647 size_t cur; 3645 size_t cur;
3648 size_t offset; 3646 size_t offset;
3649 struct page *page; 3647 struct page *page;
3650 char *kaddr; 3648 char *kaddr;
3651 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3649 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3650 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3653 3651
3654 WARN_ON(start > eb->len); 3652 WARN_ON(start > eb->len);
3655 WARN_ON(start + len > eb->start + eb->len); 3653 WARN_ON(start + len > eb->start + eb->len);
3656 3654
3657 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3655 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3658 3656
3659 while (len > 0) { 3657 while (len > 0) {
3660 page = extent_buffer_page(eb, i); 3658 page = extent_buffer_page(eb, i);
3661 WARN_ON(!PageUptodate(page)); 3659 WARN_ON(!PageUptodate(page));
3662 3660
3663 cur = min(len, PAGE_CACHE_SIZE - offset); 3661 cur = min(len, PAGE_CACHE_SIZE - offset);
3664 kaddr = kmap_atomic(page, KM_USER0); 3662 kaddr = kmap_atomic(page, KM_USER0);
3665 memset(kaddr + offset, c, cur); 3663 memset(kaddr + offset, c, cur);
3666 kunmap_atomic(kaddr, KM_USER0); 3664 kunmap_atomic(kaddr, KM_USER0);
3667 3665
3668 len -= cur; 3666 len -= cur;
3669 offset = 0; 3667 offset = 0;
3670 i++; 3668 i++;
3671 } 3669 }
3672 } 3670 }
3673 3671
3674 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3672 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3675 unsigned long dst_offset, unsigned long src_offset, 3673 unsigned long dst_offset, unsigned long src_offset,
3676 unsigned long len) 3674 unsigned long len)
3677 { 3675 {
3678 u64 dst_len = dst->len; 3676 u64 dst_len = dst->len;
3679 size_t cur; 3677 size_t cur;
3680 size_t offset; 3678 size_t offset;
3681 struct page *page; 3679 struct page *page;
3682 char *kaddr; 3680 char *kaddr;
3683 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3681 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3684 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3682 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3685 3683
3686 WARN_ON(src->len != dst_len); 3684 WARN_ON(src->len != dst_len);
3687 3685
3688 offset = (start_offset + dst_offset) & 3686 offset = (start_offset + dst_offset) &
3689 ((unsigned long)PAGE_CACHE_SIZE - 1); 3687 ((unsigned long)PAGE_CACHE_SIZE - 1);
3690 3688
3691 while (len > 0) { 3689 while (len > 0) {
3692 page = extent_buffer_page(dst, i); 3690 page = extent_buffer_page(dst, i);
3693 WARN_ON(!PageUptodate(page)); 3691 WARN_ON(!PageUptodate(page));
3694 3692
3695 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3693 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3696 3694
3697 kaddr = kmap_atomic(page, KM_USER0); 3695 kaddr = kmap_atomic(page, KM_USER0);
3698 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3696 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3699 kunmap_atomic(kaddr, KM_USER0); 3697 kunmap_atomic(kaddr, KM_USER0);
3700 3698
3701 src_offset += cur; 3699 src_offset += cur;
3702 len -= cur; 3700 len -= cur;
3703 offset = 0; 3701 offset = 0;
3704 i++; 3702 i++;
3705 } 3703 }
3706 } 3704 }
3707 3705
3708 static void move_pages(struct page *dst_page, struct page *src_page, 3706 static void move_pages(struct page *dst_page, struct page *src_page,
3709 unsigned long dst_off, unsigned long src_off, 3707 unsigned long dst_off, unsigned long src_off,
3710 unsigned long len) 3708 unsigned long len)
3711 { 3709 {
3712 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3710 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3713 if (dst_page == src_page) { 3711 if (dst_page == src_page) {
3714 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3712 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3715 } else { 3713 } else {
3716 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3714 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3717 char *p = dst_kaddr + dst_off + len; 3715 char *p = dst_kaddr + dst_off + len;
3718 char *s = src_kaddr + src_off + len; 3716 char *s = src_kaddr + src_off + len;
3719 3717
3720 while (len--) 3718 while (len--)
3721 *--p = *--s; 3719 *--p = *--s;
3722 3720
3723 kunmap_atomic(src_kaddr, KM_USER1); 3721 kunmap_atomic(src_kaddr, KM_USER1);
3724 } 3722 }
3725 kunmap_atomic(dst_kaddr, KM_USER0); 3723 kunmap_atomic(dst_kaddr, KM_USER0);
3726 } 3724 }
3727 3725
3728 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 3726 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3729 { 3727 {
3730 unsigned long distance = (src > dst) ? src - dst : dst - src; 3728 unsigned long distance = (src > dst) ? src - dst : dst - src;
3731 return distance < len; 3729 return distance < len;
3732 } 3730 }
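
move_pages() and areas_overlap() above encode the classic overlapping-copy rule: two ranges overlap when they are closer than len, and in that case copying byte-by-byte from the tail keeps source bytes from being clobbered before they are read (the non-overlapping case takes the plain memcpy() path in copy_pages()). A stand-alone sketch of the same rule over an ordinary buffer; the helper names and data are invented for illustration:

/* Illustrative only: why an overlapping forward copy corrupts data and why
 * copying from the tail (as move_pages() does) is safe when the destination
 * sits above the source. */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;
        return distance < len;
}

static void move_bytes(char *buf, unsigned long dst, unsigned long src, unsigned long len)
{
        if (!areas_overlap(src, dst, len)) {
                memcpy(buf + dst, buf + src, len);      /* fast path, no overlap */
                return;
        }
        /* overlap, destination above source: copy backwards so every source
         * byte is read before the copy reaches and overwrites it */
        char *p = buf + dst + len;
        char *s = buf + src + len;
        while (len--)
                *--p = *--s;
}

int main(void)
{
        char buf[16] = "ABCDEFGH";
        move_bytes(buf, 2, 0, 6);       /* shift "ABCDEF" right by two; 4 bytes overlap */
        printf("%s\n", buf);            /* prints "ABABCDEF" */
        return 0;
}
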
3733 3731
3734 static void copy_pages(struct page *dst_page, struct page *src_page, 3732 static void copy_pages(struct page *dst_page, struct page *src_page,
3735 unsigned long dst_off, unsigned long src_off, 3733 unsigned long dst_off, unsigned long src_off,
3736 unsigned long len) 3734 unsigned long len)
3737 { 3735 {
3738 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3736 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3739 char *src_kaddr; 3737 char *src_kaddr;
3740 3738
3741 if (dst_page != src_page) { 3739 if (dst_page != src_page) {
3742 src_kaddr = kmap_atomic(src_page, KM_USER1); 3740 src_kaddr = kmap_atomic(src_page, KM_USER1);
3743 } else { 3741 } else {
3744 src_kaddr = dst_kaddr; 3742 src_kaddr = dst_kaddr;
3745 BUG_ON(areas_overlap(src_off, dst_off, len)); 3743 BUG_ON(areas_overlap(src_off, dst_off, len));
3746 } 3744 }
3747 3745
3748 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3746 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3749 kunmap_atomic(dst_kaddr, KM_USER0); 3747 kunmap_atomic(dst_kaddr, KM_USER0);
3750 if (dst_page != src_page) 3748 if (dst_page != src_page)
3751 kunmap_atomic(src_kaddr, KM_USER1); 3749 kunmap_atomic(src_kaddr, KM_USER1);
3752 } 3750 }
3753 3751
3754 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3752 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3755 unsigned long src_offset, unsigned long len) 3753 unsigned long src_offset, unsigned long len)
3756 { 3754 {
3757 size_t cur; 3755 size_t cur;
3758 size_t dst_off_in_page; 3756 size_t dst_off_in_page;
3759 size_t src_off_in_page; 3757 size_t src_off_in_page;
3760 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3758 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3761 unsigned long dst_i; 3759 unsigned long dst_i;
3762 unsigned long src_i; 3760 unsigned long src_i;
3763 3761
3764 if (src_offset + len > dst->len) { 3762 if (src_offset + len > dst->len) {
3765 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3763 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3766 "len %lu dst len %lu\n", src_offset, len, dst->len); 3764 "len %lu dst len %lu\n", src_offset, len, dst->len);
3767 BUG_ON(1); 3765 BUG_ON(1);
3768 } 3766 }
3769 if (dst_offset + len > dst->len) { 3767 if (dst_offset + len > dst->len) {
3770 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3768 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3771 "len %lu dst len %lu\n", dst_offset, len, dst->len); 3769 "len %lu dst len %lu\n", dst_offset, len, dst->len);
3772 BUG_ON(1); 3770 BUG_ON(1);
3773 } 3771 }
3774 3772
3775 while (len > 0) { 3773 while (len > 0) {
3776 dst_off_in_page = (start_offset + dst_offset) & 3774 dst_off_in_page = (start_offset + dst_offset) &
3777 ((unsigned long)PAGE_CACHE_SIZE - 1); 3775 ((unsigned long)PAGE_CACHE_SIZE - 1);
3778 src_off_in_page = (start_offset + src_offset) & 3776 src_off_in_page = (start_offset + src_offset) &
3779 ((unsigned long)PAGE_CACHE_SIZE - 1); 3777 ((unsigned long)PAGE_CACHE_SIZE - 1);
3780 3778
3781 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3779 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3782 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3780 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3783 3781
3784 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3782 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3785 src_off_in_page)); 3783 src_off_in_page));
3786 cur = min_t(unsigned long, cur, 3784 cur = min_t(unsigned long, cur,
3787 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3785 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3788 3786
3789 copy_pages(extent_buffer_page(dst, dst_i), 3787 copy_pages(extent_buffer_page(dst, dst_i),
3790 extent_buffer_page(dst, src_i), 3788 extent_buffer_page(dst, src_i),
3791 dst_off_in_page, src_off_in_page, cur); 3789 dst_off_in_page, src_off_in_page, cur);
3792 3790
3793 src_offset += cur; 3791 src_offset += cur;
3794 dst_offset += cur; 3792 dst_offset += cur;
3795 len -= cur; 3793 len -= cur;
3796 } 3794 }
3797 } 3795 }
3798 3796
3799 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3797 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3800 unsigned long src_offset, unsigned long len) 3798 unsigned long src_offset, unsigned long len)
3801 { 3799 {
3802 size_t cur; 3800 size_t cur;
3803 size_t dst_off_in_page; 3801 size_t dst_off_in_page;
3804 size_t src_off_in_page; 3802 size_t src_off_in_page;
3805 unsigned long dst_end = dst_offset + len - 1; 3803 unsigned long dst_end = dst_offset + len - 1;
3806 unsigned long src_end = src_offset + len - 1; 3804 unsigned long src_end = src_offset + len - 1;
3807 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3805 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3808 unsigned long dst_i; 3806 unsigned long dst_i;
3809 unsigned long src_i; 3807 unsigned long src_i;
3810 3808
3811 if (src_offset + len > dst->len) { 3809 if (src_offset + len > dst->len) {
3812 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3810 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3813 "len %lu len %lu\n", src_offset, len, dst->len); 3811 "len %lu len %lu\n", src_offset, len, dst->len);
3814 BUG_ON(1); 3812 BUG_ON(1);
3815 } 3813 }
3816 if (dst_offset + len > dst->len) { 3814 if (dst_offset + len > dst->len) {
3817 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3815 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3818 "len %lu len %lu\n", dst_offset, len, dst->len); 3816 "len %lu len %lu\n", dst_offset, len, dst->len);
3819 BUG_ON(1); 3817 BUG_ON(1);
3820 } 3818 }
3821 if (!areas_overlap(src_offset, dst_offset, len)) { 3819 if (!areas_overlap(src_offset, dst_offset, len)) {
3822 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3820 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3823 return; 3821 return;
3824 } 3822 }
3825 while (len > 0) { 3823 while (len > 0) {
3826 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 3824 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3827 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 3825 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3828 3826
3829 dst_off_in_page = (start_offset + dst_end) & 3827 dst_off_in_page = (start_offset + dst_end) &
3830 ((unsigned long)PAGE_CACHE_SIZE - 1); 3828 ((unsigned long)PAGE_CACHE_SIZE - 1);
3831 src_off_in_page = (start_offset + src_end) & 3829 src_off_in_page = (start_offset + src_end) &
3832 ((unsigned long)PAGE_CACHE_SIZE - 1); 3830 ((unsigned long)PAGE_CACHE_SIZE - 1);
3833 3831
3834 cur = min_t(unsigned long, len, src_off_in_page + 1); 3832 cur = min_t(unsigned long, len, src_off_in_page + 1);
3835 cur = min(cur, dst_off_in_page + 1); 3833 cur = min(cur, dst_off_in_page + 1);
3836 move_pages(extent_buffer_page(dst, dst_i), 3834 move_pages(extent_buffer_page(dst, dst_i),
3837 extent_buffer_page(dst, src_i), 3835 extent_buffer_page(dst, src_i),
3838 dst_off_in_page - cur + 1, 3836 dst_off_in_page - cur + 1,
3839 src_off_in_page - cur + 1, cur); 3837 src_off_in_page - cur + 1, cur);
3840 3838
3841 dst_end -= cur; 3839 dst_end -= cur;
3842 src_end -= cur; 3840 src_end -= cur;
3843 len -= cur; 3841 len -= cur;
3844 } 3842 }
3845 } 3843 }
3846 3844
3847 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3845 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3848 { 3846 {
3849 struct extent_buffer *eb = 3847 struct extent_buffer *eb =
3850 container_of(head, struct extent_buffer, rcu_head); 3848 container_of(head, struct extent_buffer, rcu_head);
3851 3849
3852 btrfs_release_extent_buffer(eb); 3850 btrfs_release_extent_buffer(eb);
3853 } 3851 }
3854 3852
3855 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3853 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3856 { 3854 {
3857 u64 start = page_offset(page); 3855 u64 start = page_offset(page);
3858 struct extent_buffer *eb; 3856 struct extent_buffer *eb;
3859 int ret = 1; 3857 int ret = 1;
3860 3858
3861 spin_lock(&tree->buffer_lock); 3859 spin_lock(&tree->buffer_lock);
3862 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3860 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3863 if (!eb) { 3861 if (!eb) {
3864 spin_unlock(&tree->buffer_lock); 3862 spin_unlock(&tree->buffer_lock);
3865 return ret; 3863 return ret;
3866 } 3864 }
3867 3865
3868 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3866 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3869 ret = 0; 3867 ret = 0;
3870 goto out; 3868 goto out;
3871 } 3869 }
3872 3870
3873 /* 3871 /*
3874 * set @eb->refs to 0 if it is already 1, and then release the @eb. 3872 * set @eb->refs to 0 if it is already 1, and then release the @eb.
3875 * Otherwise, leave it alone. 3873 * Otherwise, leave it alone.
3876 */ 3874 */
3877 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { 3875 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3878 ret = 0; 3876 ret = 0;
3879 goto out; 3877 goto out;
3880 } 3878 }
3881 3879
3882 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3880 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3883 out: 3881 out:
3884 spin_unlock(&tree->buffer_lock); 3882 spin_unlock(&tree->buffer_lock);
3885 3883
3886 /* at this point we can safely release the extent buffer */ 3884 /* at this point we can safely release the extent buffer */
3887 if (atomic_read(&eb->refs) == 0) 3885 if (atomic_read(&eb->refs) == 0)
3888 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 3886 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3889 return ret; 3887 return ret;
3890 } 3888 }
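
try_release_extent_buffer() above tears the buffer down only when it can atomically flip the reference count from exactly 1 to 0; if any other holder exists the cmpxchg fails and the page stays. A hedged user-space sketch of that last-reference pattern using C11 atomics (struct and function names are invented; the kernel defers the actual free to call_rcu(), this sketch frees nothing):

/* Illustrative only: the "release it only if we hold the last reference"
 * pattern from try_release_extent_buffer(), in C11 atomics. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct buffer {
        atomic_int refs;
        /* ... payload ... */
};

static bool try_release(struct buffer *b)
{
        int expected = 1;

        /* Succeeds only if refs was exactly 1: we were the sole holder,
         * and refs is now 0 so nobody else can start using the object. */
        if (!atomic_compare_exchange_strong(&b->refs, &expected, 0))
                return false;

        /* safe to tear the object down here */
        return true;
}

int main(void)
{
        struct buffer b;

        atomic_init(&b.refs, 2);
        printf("%s\n", try_release(&b) ? "released" : "still in use");  /* still in use */

        atomic_fetch_sub(&b.refs, 1);           /* the other holder drops its ref */
        printf("%s\n", try_release(&b) ? "released" : "still in use");  /* released */
        return 0;
}
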
3891 3889
1 /* 1 /*
2 * linux/fs/ext4/inode.c 2 * linux/fs/ext4/inode.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/inode.c 11 * linux/fs/minix/inode.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * Goal-directed block allocation by Stephen Tweedie 15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998 16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz) 20 * (jj@sunsite.ms.mff.cuni.cz)
21 * 21 *
22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/time.h> 27 #include <linux/time.h>
28 #include <linux/jbd2.h> 28 #include <linux/jbd2.h>
29 #include <linux/highuid.h> 29 #include <linux/highuid.h>
30 #include <linux/pagemap.h> 30 #include <linux/pagemap.h>
31 #include <linux/quotaops.h> 31 #include <linux/quotaops.h>
32 #include <linux/string.h> 32 #include <linux/string.h>
33 #include <linux/buffer_head.h> 33 #include <linux/buffer_head.h>
34 #include <linux/writeback.h> 34 #include <linux/writeback.h>
35 #include <linux/pagevec.h> 35 #include <linux/pagevec.h>
36 #include <linux/mpage.h> 36 #include <linux/mpage.h>
37 #include <linux/namei.h> 37 #include <linux/namei.h>
38 #include <linux/uio.h> 38 #include <linux/uio.h>
39 #include <linux/bio.h> 39 #include <linux/bio.h>
40 #include <linux/workqueue.h> 40 #include <linux/workqueue.h>
41 #include <linux/kernel.h> 41 #include <linux/kernel.h>
42 #include <linux/printk.h> 42 #include <linux/printk.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/ratelimit.h> 44 #include <linux/ratelimit.h>
45 45
46 #include "ext4_jbd2.h" 46 #include "ext4_jbd2.h"
47 #include "xattr.h" 47 #include "xattr.h"
48 #include "acl.h" 48 #include "acl.h"
49 #include "ext4_extents.h" 49 #include "ext4_extents.h"
50 50
51 #include <trace/events/ext4.h> 51 #include <trace/events/ext4.h>
52 52
53 #define MPAGE_DA_EXTENT_TAIL 0x01 53 #define MPAGE_DA_EXTENT_TAIL 0x01
54 54
55 static inline int ext4_begin_ordered_truncate(struct inode *inode, 55 static inline int ext4_begin_ordered_truncate(struct inode *inode,
56 loff_t new_size) 56 loff_t new_size)
57 { 57 {
58 trace_ext4_begin_ordered_truncate(inode, new_size); 58 trace_ext4_begin_ordered_truncate(inode, new_size);
59 /* 59 /*
60 * If jinode is zero, then we never opened the file for 60 * If jinode is zero, then we never opened the file for
61 * writing, so there's no need to call 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no 62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush. 63 * outstanding writes we need to flush.
64 */ 64 */
65 if (!EXT4_I(inode)->jinode) 65 if (!EXT4_I(inode)->jinode)
66 return 0; 66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), 67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode, 68 EXT4_I(inode)->jinode,
69 new_size); 69 new_size);
70 } 70 }
71 71
72 static void ext4_invalidatepage(struct page *page, unsigned long offset); 72 static void ext4_invalidatepage(struct page *page, unsigned long offset);
73 static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 73 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
74 struct buffer_head *bh_result, int create); 74 struct buffer_head *bh_result, int create);
75 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); 75 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
76 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 76 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
77 static int __ext4_journalled_writepage(struct page *page, unsigned int len); 77 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
78 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); 78 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
79 79
80 /* 80 /*
81 * Test whether an inode is a fast symlink. 81 * Test whether an inode is a fast symlink.
82 */ 82 */
83 static int ext4_inode_is_fast_symlink(struct inode *inode) 83 static int ext4_inode_is_fast_symlink(struct inode *inode)
84 { 84 {
85 int ea_blocks = EXT4_I(inode)->i_file_acl ? 85 int ea_blocks = EXT4_I(inode)->i_file_acl ?
86 (inode->i_sb->s_blocksize >> 9) : 0; 86 (inode->i_sb->s_blocksize >> 9) : 0;
87 87
88 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 88 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
89 } 89 }
90 90
91 /* 91 /*
92 * Work out how many blocks we need to proceed with the next chunk of a 92 * Work out how many blocks we need to proceed with the next chunk of a
93 * truncate transaction. 93 * truncate transaction.
94 */ 94 */
95 static unsigned long blocks_for_truncate(struct inode *inode) 95 static unsigned long blocks_for_truncate(struct inode *inode)
96 { 96 {
97 ext4_lblk_t needed; 97 ext4_lblk_t needed;
98 98
99 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 99 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
100 100
101 /* Give ourselves just enough room to cope with inodes in which 101 /* Give ourselves just enough room to cope with inodes in which
102 * i_blocks is corrupt: we've seen disk corruptions in the past 102 * i_blocks is corrupt: we've seen disk corruptions in the past
103 * which resulted in random data in an inode which looked enough 103 * which resulted in random data in an inode which looked enough
104 * like a regular file for ext4 to try to delete it. Things 104 * like a regular file for ext4 to try to delete it. Things
105 * will go a bit crazy if that happens, but at least we should 105 * will go a bit crazy if that happens, but at least we should
106 * try not to panic the whole kernel. */ 106 * try not to panic the whole kernel. */
107 if (needed < 2) 107 if (needed < 2)
108 needed = 2; 108 needed = 2;
109 109
110 /* But we need to bound the transaction so we don't overflow the 110 /* But we need to bound the transaction so we don't overflow the
111 * journal. */ 111 * journal. */
112 if (needed > EXT4_MAX_TRANS_DATA) 112 if (needed > EXT4_MAX_TRANS_DATA)
113 needed = EXT4_MAX_TRANS_DATA; 113 needed = EXT4_MAX_TRANS_DATA;
114 114
115 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 115 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
116 } 116 }
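
blocks_for_truncate() above estimates journal credits from i_blocks: since i_blocks counts 512-byte units, shifting by (blocksize_bits - 9) converts it to filesystem blocks, which is then clamped to the range [2, EXT4_MAX_TRANS_DATA] and added to the fixed per-transaction cost. A worked sketch of that arithmetic, with assumed stand-in values for the two ext4 constants:

/* Illustrative only: the credit estimate from blocks_for_truncate(),
 * with made-up values standing in for the ext4 constants. */
#include <stdio.h>

#define BLOCKSIZE_BITS          12      /* 4 KiB filesystem blocks */
#define MAX_TRANS_DATA          64      /* stand-in for EXT4_MAX_TRANS_DATA */
#define DATA_TRANS_BLOCKS       32      /* stand-in for EXT4_DATA_TRANS_BLOCKS() */

int main(void)
{
        unsigned long long i_blocks = 2097152;  /* a 1 GiB file, counted in 512-byte units */
        unsigned long needed = i_blocks >> (BLOCKSIZE_BITS - 9);        /* 262144 fs blocks */

        if (needed < 2)                 /* guard against a corrupt, too-small i_blocks */
                needed = 2;
        if (needed > MAX_TRANS_DATA)    /* bound the transaction so the journal fits */
                needed = MAX_TRANS_DATA;

        printf("reserve %lu journal credits\n", DATA_TRANS_BLOCKS + needed);    /* 96 */
        return 0;
}
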
117 117
118 /* 118 /*
119 * Truncate transactions can be complex and absolutely huge. So we need to 119 * Truncate transactions can be complex and absolutely huge. So we need to
120 * be able to restart the transaction at a convenient checkpoint to make 120 * be able to restart the transaction at a convenient checkpoint to make
121 * sure we don't overflow the journal. 121 * sure we don't overflow the journal.
122 * 122 *
123 * start_transaction gets us a new handle for a truncate transaction, 123 * start_transaction gets us a new handle for a truncate transaction,
124 * and extend_transaction tries to extend the existing one a bit. If 124 * and extend_transaction tries to extend the existing one a bit. If
125 * extend fails, we need to propagate the failure up and restart the 125 * extend fails, we need to propagate the failure up and restart the
126 * transaction in the top-level truncate loop. --sct 126 * transaction in the top-level truncate loop. --sct
127 */ 127 */
128 static handle_t *start_transaction(struct inode *inode) 128 static handle_t *start_transaction(struct inode *inode)
129 { 129 {
130 handle_t *result; 130 handle_t *result;
131 131
132 result = ext4_journal_start(inode, blocks_for_truncate(inode)); 132 result = ext4_journal_start(inode, blocks_for_truncate(inode));
133 if (!IS_ERR(result)) 133 if (!IS_ERR(result))
134 return result; 134 return result;
135 135
136 ext4_std_error(inode->i_sb, PTR_ERR(result)); 136 ext4_std_error(inode->i_sb, PTR_ERR(result));
137 return result; 137 return result;
138 } 138 }
139 139
140 /* 140 /*
141 * Try to extend this transaction for the purposes of truncation. 141 * Try to extend this transaction for the purposes of truncation.
142 * 142 *
143 * Returns 0 if we managed to create more room. If we can't create more 143 * Returns 0 if we managed to create more room. If we can't create more
144 * room and the transaction must be restarted, we return 1. 144 * room and the transaction must be restarted, we return 1.
145 */ 145 */
146 static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 146 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
147 { 147 {
148 if (!ext4_handle_valid(handle)) 148 if (!ext4_handle_valid(handle))
149 return 0; 149 return 0;
150 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) 150 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
151 return 0; 151 return 0;
152 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 152 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
153 return 0; 153 return 0;
154 return 1; 154 return 1;
155 } 155 }
156 156
157 /* 157 /*
158 * Restart the transaction associated with *handle. This does a commit, 158 * Restart the transaction associated with *handle. This does a commit,
159 * so before we call here everything must be consistently dirtied against 159 * so before we call here everything must be consistently dirtied against
160 * this transaction. 160 * this transaction.
161 */ 161 */
162 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, 162 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
163 int nblocks) 163 int nblocks)
164 { 164 {
165 int ret; 165 int ret;
166 166
167 /* 167 /*
168 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this 168 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
169 * moment, get_block can be called only for blocks inside i_size since 169 * moment, get_block can be called only for blocks inside i_size since
170 * page cache has been already dropped and writes are blocked by 170 * page cache has been already dropped and writes are blocked by
171 * i_mutex. So we can safely drop the i_data_sem here. 171 * i_mutex. So we can safely drop the i_data_sem here.
172 */ 172 */
173 BUG_ON(EXT4_JOURNAL(inode) == NULL); 173 BUG_ON(EXT4_JOURNAL(inode) == NULL);
174 jbd_debug(2, "restarting handle %p\n", handle); 174 jbd_debug(2, "restarting handle %p\n", handle);
175 up_write(&EXT4_I(inode)->i_data_sem); 175 up_write(&EXT4_I(inode)->i_data_sem);
176 ret = ext4_journal_restart(handle, nblocks); 176 ret = ext4_journal_restart(handle, nblocks);
177 down_write(&EXT4_I(inode)->i_data_sem); 177 down_write(&EXT4_I(inode)->i_data_sem);
178 ext4_discard_preallocations(inode); 178 ext4_discard_preallocations(inode);
179 179
180 return ret; 180 return ret;
181 } 181 }
182 182
183 /* 183 /*
184 * Called at the last iput() if i_nlink is zero. 184 * Called at the last iput() if i_nlink is zero.
185 */ 185 */
186 void ext4_evict_inode(struct inode *inode) 186 void ext4_evict_inode(struct inode *inode)
187 { 187 {
188 handle_t *handle; 188 handle_t *handle;
189 int err; 189 int err;
190 190
191 trace_ext4_evict_inode(inode); 191 trace_ext4_evict_inode(inode);
192 if (inode->i_nlink) { 192 if (inode->i_nlink) {
193 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
194 goto no_delete; 194 goto no_delete;
195 } 195 }
196 196
197 if (!is_bad_inode(inode)) 197 if (!is_bad_inode(inode))
198 dquot_initialize(inode); 198 dquot_initialize(inode);
199 199
200 if (ext4_should_order_data(inode)) 200 if (ext4_should_order_data(inode))
201 ext4_begin_ordered_truncate(inode, 0); 201 ext4_begin_ordered_truncate(inode, 0);
202 truncate_inode_pages(&inode->i_data, 0); 202 truncate_inode_pages(&inode->i_data, 0);
203 203
204 if (is_bad_inode(inode)) 204 if (is_bad_inode(inode))
205 goto no_delete; 205 goto no_delete;
206 206
207 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 207 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
208 if (IS_ERR(handle)) { 208 if (IS_ERR(handle)) {
209 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 209 ext4_std_error(inode->i_sb, PTR_ERR(handle));
210 /* 210 /*
211 * If we're going to skip the normal cleanup, we still need to 211 * If we're going to skip the normal cleanup, we still need to
212 * make sure that the in-core orphan linked list is properly 212 * make sure that the in-core orphan linked list is properly
213 * cleaned up. 213 * cleaned up.
214 */ 214 */
215 ext4_orphan_del(NULL, inode); 215 ext4_orphan_del(NULL, inode);
216 goto no_delete; 216 goto no_delete;
217 } 217 }
218 218
219 if (IS_SYNC(inode)) 219 if (IS_SYNC(inode))
220 ext4_handle_sync(handle); 220 ext4_handle_sync(handle);
221 inode->i_size = 0; 221 inode->i_size = 0;
222 err = ext4_mark_inode_dirty(handle, inode); 222 err = ext4_mark_inode_dirty(handle, inode);
223 if (err) { 223 if (err) {
224 ext4_warning(inode->i_sb, 224 ext4_warning(inode->i_sb,
225 "couldn't mark inode dirty (err %d)", err); 225 "couldn't mark inode dirty (err %d)", err);
226 goto stop_handle; 226 goto stop_handle;
227 } 227 }
228 if (inode->i_blocks) 228 if (inode->i_blocks)
229 ext4_truncate(inode); 229 ext4_truncate(inode);
230 230
231 /* 231 /*
232 * ext4_ext_truncate() doesn't reserve any slop when it 232 * ext4_ext_truncate() doesn't reserve any slop when it
233 * restarts journal transactions; therefore there may not be 233 * restarts journal transactions; therefore there may not be
234 * enough credits left in the handle to remove the inode from 234 * enough credits left in the handle to remove the inode from
235 * the orphan list and set the dtime field. 235 * the orphan list and set the dtime field.
236 */ 236 */
237 if (!ext4_handle_has_enough_credits(handle, 3)) { 237 if (!ext4_handle_has_enough_credits(handle, 3)) {
238 err = ext4_journal_extend(handle, 3); 238 err = ext4_journal_extend(handle, 3);
239 if (err > 0) 239 if (err > 0)
240 err = ext4_journal_restart(handle, 3); 240 err = ext4_journal_restart(handle, 3);
241 if (err != 0) { 241 if (err != 0) {
242 ext4_warning(inode->i_sb, 242 ext4_warning(inode->i_sb,
243 "couldn't extend journal (err %d)", err); 243 "couldn't extend journal (err %d)", err);
244 stop_handle: 244 stop_handle:
245 ext4_journal_stop(handle); 245 ext4_journal_stop(handle);
246 ext4_orphan_del(NULL, inode); 246 ext4_orphan_del(NULL, inode);
247 goto no_delete; 247 goto no_delete;
248 } 248 }
249 } 249 }
250 250
251 /* 251 /*
252 * Kill off the orphan record which ext4_truncate created. 252 * Kill off the orphan record which ext4_truncate created.
253 * AKPM: I think this can be inside the above `if'. 253 * AKPM: I think this can be inside the above `if'.
254 * Note that ext4_orphan_del() has to be able to cope with the 254 * Note that ext4_orphan_del() has to be able to cope with the
255 * deletion of a non-existent orphan - this is because we don't 255 * deletion of a non-existent orphan - this is because we don't
256 * know if ext4_truncate() actually created an orphan record. 256 * know if ext4_truncate() actually created an orphan record.
257 * (Well, we could do this if we need to, but heck - it works) 257 * (Well, we could do this if we need to, but heck - it works)
258 */ 258 */
259 ext4_orphan_del(handle, inode); 259 ext4_orphan_del(handle, inode);
260 EXT4_I(inode)->i_dtime = get_seconds(); 260 EXT4_I(inode)->i_dtime = get_seconds();
261 261
262 /* 262 /*
263 * One subtle ordering requirement: if anything has gone wrong 263 * One subtle ordering requirement: if anything has gone wrong
264 * (transaction abort, IO errors, whatever), then we can still 264 * (transaction abort, IO errors, whatever), then we can still
265 * do these next steps (the fs will already have been marked as 265 * do these next steps (the fs will already have been marked as
266 * having errors), but we can't free the inode if the mark_dirty 266 * having errors), but we can't free the inode if the mark_dirty
267 * fails. 267 * fails.
268 */ 268 */
269 if (ext4_mark_inode_dirty(handle, inode)) 269 if (ext4_mark_inode_dirty(handle, inode))
270 /* If that failed, just do the required in-core inode clear. */ 270 /* If that failed, just do the required in-core inode clear. */
271 ext4_clear_inode(inode); 271 ext4_clear_inode(inode);
272 else 272 else
273 ext4_free_inode(handle, inode); 273 ext4_free_inode(handle, inode);
274 ext4_journal_stop(handle); 274 ext4_journal_stop(handle);
275 return; 275 return;
276 no_delete: 276 no_delete:
277 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 277 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
278 } 278 }
279 279
280 typedef struct { 280 typedef struct {
281 __le32 *p; 281 __le32 *p;
282 __le32 key; 282 __le32 key;
283 struct buffer_head *bh; 283 struct buffer_head *bh;
284 } Indirect; 284 } Indirect;
285 285
286 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 286 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
287 { 287 {
288 p->key = *(p->p = v); 288 p->key = *(p->p = v);
289 p->bh = bh; 289 p->bh = bh;
290 } 290 }
291 291
292 /** 292 /**
293 * ext4_block_to_path - parse the block number into array of offsets 293 * ext4_block_to_path - parse the block number into array of offsets
294 * @inode: inode in question (we are only interested in its superblock) 294 * @inode: inode in question (we are only interested in its superblock)
295 * @i_block: block number to be parsed 295 * @i_block: block number to be parsed
296 * @offsets: array to store the offsets in 296 * @offsets: array to store the offsets in
297 * @boundary: set this non-zero if the referred-to block is likely to be 297 * @boundary: set this non-zero if the referred-to block is likely to be
298 * followed (on disk) by an indirect block. 298 * followed (on disk) by an indirect block.
299 * 299 *
300 * To store the locations of a file's data ext4 uses a data structure common 300 * To store the locations of a file's data ext4 uses a data structure common
301 * for UNIX filesystems - a tree of pointers anchored in the inode, with 301 * for UNIX filesystems - a tree of pointers anchored in the inode, with
302 * data blocks at leaves and indirect blocks in intermediate nodes. 302 * data blocks at leaves and indirect blocks in intermediate nodes.
303 * This function translates the block number into path in that tree - 303 * This function translates the block number into path in that tree -
304 * return value is the path length and @offsets[n] is the offset of 304 * return value is the path length and @offsets[n] is the offset of
305 * pointer to (n+1)th node in the nth one. If @block is out of range 305 * pointer to (n+1)th node in the nth one. If @block is out of range
306 * (negative or too large) a warning is printed and zero is returned. 306 * (negative or too large) a warning is printed and zero is returned.
307 * 307 *
308 * Note: function doesn't find node addresses, so no IO is needed. All 308 * Note: function doesn't find node addresses, so no IO is needed. All
309 * we need to know is the capacity of indirect blocks (taken from the 309 * we need to know is the capacity of indirect blocks (taken from the
310 * inode->i_sb). 310 * inode->i_sb).
311 */ 311 */
312 312
313 /* 313 /*
314 * Portability note: the last comparison (check that we fit into triple 314 * Portability note: the last comparison (check that we fit into triple
315 * indirect block) is spelled differently, because otherwise on an 315 * indirect block) is spelled differently, because otherwise on an
316 * architecture with 32-bit longs and 8Kb pages we might get into trouble 316 * architecture with 32-bit longs and 8Kb pages we might get into trouble
317 * if our filesystem had 8Kb blocks. We might use long long, but that would 317 * if our filesystem had 8Kb blocks. We might use long long, but that would
318 * kill us on x86. Oh, well, at least the sign propagation does not matter - 318 * kill us on x86. Oh, well, at least the sign propagation does not matter -
319 * i_block would have to be negative in the very beginning, so we would not 319 * i_block would have to be negative in the very beginning, so we would not
320 * get there at all. 320 * get there at all.
321 */ 321 */
322 322
323 static int ext4_block_to_path(struct inode *inode, 323 static int ext4_block_to_path(struct inode *inode,
324 ext4_lblk_t i_block, 324 ext4_lblk_t i_block,
325 ext4_lblk_t offsets[4], int *boundary) 325 ext4_lblk_t offsets[4], int *boundary)
326 { 326 {
327 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 327 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
328 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 328 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
329 const long direct_blocks = EXT4_NDIR_BLOCKS, 329 const long direct_blocks = EXT4_NDIR_BLOCKS,
330 indirect_blocks = ptrs, 330 indirect_blocks = ptrs,
331 double_blocks = (1 << (ptrs_bits * 2)); 331 double_blocks = (1 << (ptrs_bits * 2));
332 int n = 0; 332 int n = 0;
333 int final = 0; 333 int final = 0;
334 334
335 if (i_block < direct_blocks) { 335 if (i_block < direct_blocks) {
336 offsets[n++] = i_block; 336 offsets[n++] = i_block;
337 final = direct_blocks; 337 final = direct_blocks;
338 } else if ((i_block -= direct_blocks) < indirect_blocks) { 338 } else if ((i_block -= direct_blocks) < indirect_blocks) {
339 offsets[n++] = EXT4_IND_BLOCK; 339 offsets[n++] = EXT4_IND_BLOCK;
340 offsets[n++] = i_block; 340 offsets[n++] = i_block;
341 final = ptrs; 341 final = ptrs;
342 } else if ((i_block -= indirect_blocks) < double_blocks) { 342 } else if ((i_block -= indirect_blocks) < double_blocks) {
343 offsets[n++] = EXT4_DIND_BLOCK; 343 offsets[n++] = EXT4_DIND_BLOCK;
344 offsets[n++] = i_block >> ptrs_bits; 344 offsets[n++] = i_block >> ptrs_bits;
345 offsets[n++] = i_block & (ptrs - 1); 345 offsets[n++] = i_block & (ptrs - 1);
346 final = ptrs; 346 final = ptrs;
347 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 347 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
348 offsets[n++] = EXT4_TIND_BLOCK; 348 offsets[n++] = EXT4_TIND_BLOCK;
349 offsets[n++] = i_block >> (ptrs_bits * 2); 349 offsets[n++] = i_block >> (ptrs_bits * 2);
350 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 350 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
351 offsets[n++] = i_block & (ptrs - 1); 351 offsets[n++] = i_block & (ptrs - 1);
352 final = ptrs; 352 final = ptrs;
353 } else { 353 } else {
354 ext4_warning(inode->i_sb, "block %lu > max in inode %lu", 354 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
355 i_block + direct_blocks + 355 i_block + direct_blocks +
356 indirect_blocks + double_blocks, inode->i_ino); 356 indirect_blocks + double_blocks, inode->i_ino);
357 } 357 }
358 if (boundary) 358 if (boundary)
359 *boundary = final - 1 - (i_block & (ptrs - 1)); 359 *boundary = final - 1 - (i_block & (ptrs - 1));
360 return n; 360 return n;
361 } 361 }
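
ext4_block_to_path() above decomposes a logical block number into at most four array indices: 12 direct slots first, then one, two or three levels of indirection, each level consuming ptrs_bits bits of the remaining block number. A self-contained sketch of the same decomposition, assuming 4 KiB blocks (1024 pointers per indirect block), as in the common case:

/* Illustrative only: the direct/indirect/double/triple decomposition
 * performed by ext4_block_to_path(), with assumed geometry. */
#include <stdio.h>

#define NDIR            12UL            /* direct slots in the inode */
#define PTRS_BITS       10              /* 1024 block pointers per 4 KiB block */
#define PTRS            (1UL << PTRS_BITS)

static int block_to_path(unsigned long b, unsigned long offsets[4])
{
        int n = 0;

        if (b < NDIR) {                                 /* direct block */
                offsets[n++] = b;
        } else if ((b -= NDIR) < PTRS) {                /* single indirect */
                offsets[n++] = 12;                      /* the IND slot */
                offsets[n++] = b;
        } else if ((b -= PTRS) < PTRS * PTRS) {         /* double indirect */
                offsets[n++] = 13;                      /* the DIND slot */
                offsets[n++] = b >> PTRS_BITS;
                offsets[n++] = b & (PTRS - 1);
        } else {                                        /* triple indirect */
                b -= PTRS * PTRS;
                offsets[n++] = 14;                      /* the TIND slot */
                offsets[n++] = b >> (2 * PTRS_BITS);
                offsets[n++] = (b >> PTRS_BITS) & (PTRS - 1);
                offsets[n++] = b & (PTRS - 1);
        }
        return n;       /* depth of the chain */
}

int main(void)
{
        unsigned long path[4];
        int depth = block_to_path(5000, path);  /* 5000 = 12 + 1024 + 3964 -> double indirect */

        printf("depth %d: %lu", depth, path[0]);
        for (int i = 1; i < depth; i++)
                printf(" -> %lu", path[i]);
        printf("\n");   /* prints: depth 3: 13 -> 3 -> 892 */
        return 0;
}
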
362 362
363 static int __ext4_check_blockref(const char *function, unsigned int line, 363 static int __ext4_check_blockref(const char *function, unsigned int line,
364 struct inode *inode, 364 struct inode *inode,
365 __le32 *p, unsigned int max) 365 __le32 *p, unsigned int max)
366 { 366 {
367 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 367 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
368 __le32 *bref = p; 368 __le32 *bref = p;
369 unsigned int blk; 369 unsigned int blk;
370 370
371 while (bref < p+max) { 371 while (bref < p+max) {
372 blk = le32_to_cpu(*bref++); 372 blk = le32_to_cpu(*bref++);
373 if (blk && 373 if (blk &&
374 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 374 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
375 blk, 1))) { 375 blk, 1))) {
376 es->s_last_error_block = cpu_to_le64(blk); 376 es->s_last_error_block = cpu_to_le64(blk);
377 ext4_error_inode(inode, function, line, blk, 377 ext4_error_inode(inode, function, line, blk,
378 "invalid block"); 378 "invalid block");
379 return -EIO; 379 return -EIO;
380 } 380 }
381 } 381 }
382 return 0; 382 return 0;
383 } 383 }
384 384
385 385
386 #define ext4_check_indirect_blockref(inode, bh) \ 386 #define ext4_check_indirect_blockref(inode, bh) \
387 __ext4_check_blockref(__func__, __LINE__, inode, \ 387 __ext4_check_blockref(__func__, __LINE__, inode, \
388 (__le32 *)(bh)->b_data, \ 388 (__le32 *)(bh)->b_data, \
389 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 389 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
390 390
391 #define ext4_check_inode_blockref(inode) \ 391 #define ext4_check_inode_blockref(inode) \
392 __ext4_check_blockref(__func__, __LINE__, inode, \ 392 __ext4_check_blockref(__func__, __LINE__, inode, \
393 EXT4_I(inode)->i_data, \ 393 EXT4_I(inode)->i_data, \
394 EXT4_NDIR_BLOCKS) 394 EXT4_NDIR_BLOCKS)
395 395
396 /** 396 /**
397 * ext4_get_branch - read the chain of indirect blocks leading to data 397 * ext4_get_branch - read the chain of indirect blocks leading to data
398 * @inode: inode in question 398 * @inode: inode in question
399 * @depth: depth of the chain (1 - direct pointer, etc.) 399 * @depth: depth of the chain (1 - direct pointer, etc.)
400 * @offsets: offsets of pointers in inode/indirect blocks 400 * @offsets: offsets of pointers in inode/indirect blocks
401 * @chain: place to store the result 401 * @chain: place to store the result
402 * @err: here we store the error value 402 * @err: here we store the error value
403 * 403 *
404 * Function fills the array of triples <key, p, bh> and returns %NULL 404 * Function fills the array of triples <key, p, bh> and returns %NULL
405 * if everything went OK or the pointer to the last filled triple 405 * if everything went OK or the pointer to the last filled triple
406 * (incomplete one) otherwise. Upon the return chain[i].key contains 406 * (incomplete one) otherwise. Upon the return chain[i].key contains
407 * the number of (i+1)-th block in the chain (as it is stored in memory, 407 * the number of (i+1)-th block in the chain (as it is stored in memory,
408 * i.e. little-endian 32-bit), chain[i].p contains the address of that 408 * i.e. little-endian 32-bit), chain[i].p contains the address of that
409 * number (it points into struct inode for i==0 and into the bh->b_data 409 * number (it points into struct inode for i==0 and into the bh->b_data
410 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 410 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
411 * block for i>0 and NULL for i==0. In other words, it holds the block 411 * block for i>0 and NULL for i==0. In other words, it holds the block
412 * numbers of the chain, addresses they were taken from (and where we can 412 * numbers of the chain, addresses they were taken from (and where we can
413 * verify that chain did not change) and buffer_heads hosting these 413 * verify that chain did not change) and buffer_heads hosting these
414 * numbers. 414 * numbers.
415 * 415 *
416 * Function stops when it stumbles upon zero pointer (absent block) 416 * Function stops when it stumbles upon zero pointer (absent block)
417 * (pointer to last triple returned, *@err == 0) 417 * (pointer to last triple returned, *@err == 0)
418 * or when it gets an IO error reading an indirect block 418 * or when it gets an IO error reading an indirect block
419 * (ditto, *@err == -EIO) 419 * (ditto, *@err == -EIO)
420 * or when it reads all @depth-1 indirect blocks successfully and finds 420 * or when it reads all @depth-1 indirect blocks successfully and finds
421 * the whole chain, all the way to the data (returns %NULL, *err == 0). 421 * the whole chain, all the way to the data (returns %NULL, *err == 0).
422 * 422 *
423 * Need to be called with 423 * Need to be called with
424 * down_read(&EXT4_I(inode)->i_data_sem) 424 * down_read(&EXT4_I(inode)->i_data_sem)
425 */ 425 */
426 static Indirect *ext4_get_branch(struct inode *inode, int depth, 426 static Indirect *ext4_get_branch(struct inode *inode, int depth,
427 ext4_lblk_t *offsets, 427 ext4_lblk_t *offsets,
428 Indirect chain[4], int *err) 428 Indirect chain[4], int *err)
429 { 429 {
430 struct super_block *sb = inode->i_sb; 430 struct super_block *sb = inode->i_sb;
431 Indirect *p = chain; 431 Indirect *p = chain;
432 struct buffer_head *bh; 432 struct buffer_head *bh;
433 433
434 *err = 0; 434 *err = 0;
435 /* i_data is not going away, no lock needed */ 435 /* i_data is not going away, no lock needed */
436 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 436 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
437 if (!p->key) 437 if (!p->key)
438 goto no_block; 438 goto no_block;
439 while (--depth) { 439 while (--depth) {
440 bh = sb_getblk(sb, le32_to_cpu(p->key)); 440 bh = sb_getblk(sb, le32_to_cpu(p->key));
441 if (unlikely(!bh)) 441 if (unlikely(!bh))
442 goto failure; 442 goto failure;
443 443
444 if (!bh_uptodate_or_lock(bh)) { 444 if (!bh_uptodate_or_lock(bh)) {
445 if (bh_submit_read(bh) < 0) { 445 if (bh_submit_read(bh) < 0) {
446 put_bh(bh); 446 put_bh(bh);
447 goto failure; 447 goto failure;
448 } 448 }
449 /* validate block references */ 449 /* validate block references */
450 if (ext4_check_indirect_blockref(inode, bh)) { 450 if (ext4_check_indirect_blockref(inode, bh)) {
451 put_bh(bh); 451 put_bh(bh);
452 goto failure; 452 goto failure;
453 } 453 }
454 } 454 }
455 455
456 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 456 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
457 /* Reader: end */ 457 /* Reader: end */
458 if (!p->key) 458 if (!p->key)
459 goto no_block; 459 goto no_block;
460 } 460 }
461 return NULL; 461 return NULL;
462 462
463 failure: 463 failure:
464 *err = -EIO; 464 *err = -EIO;
465 no_block: 465 no_block:
466 return p; 466 return p;
467 } 467 }
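
ext4_get_branch() above walks the tree one level at a time, recording a <key, p, bh> triple per level and stopping early at the first zero pointer (a hole) or read failure. A toy user-space model of that walk, with small arrays standing in for the inode's i_data and for indirect blocks that the kernel would read through sb_getblk():

/* Illustrative only: the <key, p, bh> chain walk of ext4_get_branch(),
 * modelled with in-memory arrays instead of buffer_heads. */
#include <stdio.h>
#include <stdint.h>

#define PTRS 4  /* pointers per "indirect block" in this toy model */

struct indirect {
        uint32_t *p;    /* address the block number was read from */
        uint32_t key;   /* the block number itself; 0 means a hole */
};

/* disk[n] plays the role of the indirect block whose block number is n */
static uint32_t disk[8][PTRS] = {
        [2] = { 0, 5, 0, 0 },   /* block 2: an indirect block pointing at block 5 */
};

static uint32_t inode_data[PTRS] = { 0, 0, 2, 0 };      /* i_data: slot 2 -> block 2 */

/* Returns NULL when the whole chain was found, or the last filled triple
 * when a zero pointer stops the walk early. */
static struct indirect *get_branch(int depth, const int *offsets,
                                   struct indirect chain[4])
{
        struct indirect *q = chain;

        q->p = &inode_data[offsets[0]];         /* level 0 lives in the inode */
        q->key = *q->p;
        if (!q->key)
                return q;
        while (--depth) {
                q[1].p = &disk[q->key][*++offsets];     /* "read" the next indirect block */
                q[1].key = *q[1].p;
                q++;
                if (!q->key)
                        return q;
        }
        return NULL;
}

int main(void)
{
        struct indirect chain[4];
        int offsets[2] = { 2, 1 };              /* as produced by block_to_path() */

        if (!get_branch(2, offsets, chain))
                printf("data block is %u\n", chain[1].key);     /* prints 5 */
        return 0;
}
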
468 468
469 /** 469 /**
470 * ext4_find_near - find a place for allocation with sufficient locality 470 * ext4_find_near - find a place for allocation with sufficient locality
471 * @inode: owner 471 * @inode: owner
472 * @ind: descriptor of indirect block. 472 * @ind: descriptor of indirect block.
473 * 473 *
474 * This function returns the preferred place for block allocation. 474 * This function returns the preferred place for block allocation.
475 * It is used when the heuristic for sequential allocation fails. 475 * It is used when the heuristic for sequential allocation fails.
476 * Rules are: 476 * Rules are:
477 * + if there is a block to the left of our position - allocate near it. 477 * + if there is a block to the left of our position - allocate near it.
478 * + if pointer will live in indirect block - allocate near that block. 478 * + if pointer will live in indirect block - allocate near that block.
479 * + if pointer will live in inode - allocate in the same 479 * + if pointer will live in inode - allocate in the same
480 * cylinder group. 480 * cylinder group.
481 * 481 *
482 * In the latter case we colour the starting block by the caller's PID to 482 * In the latter case we colour the starting block by the caller's PID to
483 * prevent it from clashing with concurrent allocations for a different inode 483 * prevent it from clashing with concurrent allocations for a different inode
484 * in the same block group. The PID is used here so that functionally related 484 * in the same block group. The PID is used here so that functionally related
485 * files will be close-by on-disk. 485 * files will be close-by on-disk.
486 * 486 *
487 * Caller must make sure that @ind is valid and will stay that way. 487 * Caller must make sure that @ind is valid and will stay that way.
488 */ 488 */
489 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 489 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
490 { 490 {
491 struct ext4_inode_info *ei = EXT4_I(inode); 491 struct ext4_inode_info *ei = EXT4_I(inode);
492 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; 492 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
493 __le32 *p; 493 __le32 *p;
494 ext4_fsblk_t bg_start; 494 ext4_fsblk_t bg_start;
495 ext4_fsblk_t last_block; 495 ext4_fsblk_t last_block;
496 ext4_grpblk_t colour; 496 ext4_grpblk_t colour;
497 ext4_group_t block_group; 497 ext4_group_t block_group;
498 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); 498 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
499 499
500 /* Try to find previous block */ 500 /* Try to find previous block */
501 for (p = ind->p - 1; p >= start; p--) { 501 for (p = ind->p - 1; p >= start; p--) {
502 if (*p) 502 if (*p)
503 return le32_to_cpu(*p); 503 return le32_to_cpu(*p);
504 } 504 }
505 505
506 /* No such thing, so let's try location of indirect block */ 506 /* No such thing, so let's try location of indirect block */
507 if (ind->bh) 507 if (ind->bh)
508 return ind->bh->b_blocknr; 508 return ind->bh->b_blocknr;
509 509
510 /* 510 /*
511 * Is it going to be referred to from the inode itself? OK, just put it 511 * Is it going to be referred to from the inode itself? OK, just put it
512 * into the same cylinder group then. 512 * into the same cylinder group then.
513 */ 513 */
514 block_group = ei->i_block_group; 514 block_group = ei->i_block_group;
515 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 515 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
516 block_group &= ~(flex_size-1); 516 block_group &= ~(flex_size-1);
517 if (S_ISREG(inode->i_mode)) 517 if (S_ISREG(inode->i_mode))
518 block_group++; 518 block_group++;
519 } 519 }
520 bg_start = ext4_group_first_block_no(inode->i_sb, block_group); 520 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
521 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 521 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
522 522
523 /* 523 /*
524 * If we are doing delayed allocation, we don't need to take 524 * If we are doing delayed allocation, we don't need to take
525 * colour into account. 525 * colour into account.
526 */ 526 */
527 if (test_opt(inode->i_sb, DELALLOC)) 527 if (test_opt(inode->i_sb, DELALLOC))
528 return bg_start; 528 return bg_start;
529 529
530 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 530 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
531 colour = (current->pid % 16) * 531 colour = (current->pid % 16) *
532 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 532 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
533 else 533 else
534 colour = (current->pid % 16) * ((last_block - bg_start) / 16); 534 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
535 return bg_start + colour; 535 return bg_start + colour;
536 } 536 }
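To make the colouring arithmetic above concrete, here is a minimal stand-alone sketch (userspace C, assuming 4KiB blocks so that EXT4_BLOCKS_PER_GROUP comes to 32768; the constant and helper names below are illustrative, not the kernel's):

#include <stdio.h>

/* Illustrative only: 4KiB blocks => 32768 blocks per group (assumed). */
#define BLOCKS_PER_GROUP 32768UL

/* Mirrors the colour computation in ext4_find_near() above. */
static unsigned long colour_for_pid(unsigned long pid)
{
	return (pid % 16) * (BLOCKS_PER_GROUP / 16);
}

int main(void)
{
	/* e.g. pid 1234: (1234 % 16) * 2048 = 2 * 2048 = 4096 */
	printf("colour = %lu\n", colour_for_pid(1234));
	return 0;
}

With these example values, concurrent allocators whose PIDs differ land 2048 blocks apart within the group, which is the clash-avoidance the comment describes.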
537 537
538 /** 538 /**
539 * ext4_find_goal - find a preferred place for allocation. 539 * ext4_find_goal - find a preferred place for allocation.
540 * @inode: owner 540 * @inode: owner
541 * @block: block we want 541 * @block: block we want
542 * @partial: pointer to the last triple within a chain 542 * @partial: pointer to the last triple within a chain
543 * 543 *
544 * Normally this function finds the preferred place for block allocation 544 * Normally this function finds the preferred place for block allocation
545 * and returns it. 545 * and returns it.
546 * Because this is only used for non-extent files, we limit the block nr 546 * Because this is only used for non-extent files, we limit the block nr
547 * to 32 bits. 547 * to 32 bits.
548 */ 548 */
549 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 549 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
550 Indirect *partial) 550 Indirect *partial)
551 { 551 {
552 ext4_fsblk_t goal; 552 ext4_fsblk_t goal;
553 553
554 /* 554 /*
555 * XXX need to get goal block from mballoc's data structures 555 * XXX need to get goal block from mballoc's data structures
556 */ 556 */
557 557
558 goal = ext4_find_near(inode, partial); 558 goal = ext4_find_near(inode, partial);
559 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 559 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
560 return goal; 560 return goal;
561 } 561 }
562 562
563 /** 563 /**
564 * ext4_blks_to_allocate - Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
565 * of direct blocks that need to be allocated for the given branch. 565 * of direct blocks that need to be allocated for the given branch.
566 * 566 *
567 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
568 * @k: number of blocks needed for indirect blocks 568 * @k: number of blocks needed for indirect blocks
569 * @blks: number of data blocks to be mapped. 569 * @blks: number of data blocks to be mapped.
570 * @blocks_to_boundary: the offset in the indirect block 570 * @blocks_to_boundary: the offset in the indirect block
571 * 571 *
572 * return the total number of blocks to be allocated, including the 572 * return the total number of blocks to be allocated, including the
573 * direct and indirect blocks. 573 * direct and indirect blocks.
574 */ 574 */
575 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 575 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
576 int blocks_to_boundary) 576 int blocks_to_boundary)
577 { 577 {
578 unsigned int count = 0; 578 unsigned int count = 0;
579 579
580 /* 580 /*
581 * Simple case: the [t,d]indirect block(s) have not been allocated yet; 581 * Simple case: the [t,d]indirect block(s) have not been allocated yet;
582 * then it's clear that blocks on that path have not been allocated 582 * then it's clear that blocks on that path have not been allocated
583 */ 583 */
584 if (k > 0) { 584 if (k > 0) {
585 /* right now we don't handle cross boundary allocation */ 585 /* right now we don't handle cross boundary allocation */
586 if (blks < blocks_to_boundary + 1) 586 if (blks < blocks_to_boundary + 1)
587 count += blks; 587 count += blks;
588 else 588 else
589 count += blocks_to_boundary + 1; 589 count += blocks_to_boundary + 1;
590 return count; 590 return count;
591 } 591 }
592 592
593 count++; 593 count++;
594 while (count < blks && count <= blocks_to_boundary && 594 while (count < blks && count <= blocks_to_boundary &&
595 le32_to_cpu(*(branch[0].p + count)) == 0) { 595 le32_to_cpu(*(branch[0].p + count)) == 0) {
596 count++; 596 count++;
597 } 597 }
598 return count; 598 return count;
599 } 599 }
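A stand-alone mirror of the counting loop above (the k == 0 case), showing how many direct blocks get batched into a single allocation; the array contents and sizes are made-up example values, not taken from this diff:

#include <stdio.h>

/* Count how many direct blocks to allocate, starting from the first
 * missing slot and stopping at the first already-mapped entry or at
 * the indirect-block boundary. */
static unsigned count_direct(const unsigned *slots, unsigned blks,
			     unsigned blocks_to_boundary)
{
	unsigned count = 1;	/* the block that triggered allocation */

	while (count < blks && count <= blocks_to_boundary &&
	       slots[count] == 0)
		count++;
	return count;
}

int main(void)
{
	/* slot 0 missing, slots 1-2 free, slot 3 already mapped */
	unsigned slots[8] = { 0, 0, 0, 42, 0, 0, 0, 0 };

	/* request 8 blocks with 7 more before the boundary -> prints 3 */
	printf("%u\n", count_direct(slots, 8, 7));
	return 0;
}

The count stops at the first already-mapped entry or at the boundary, which is why a request for 8 blocks only yields 3 in this example.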
600 600
601 /** 601 /**
602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction 603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks 604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocation at 605 * @iblock: the logical block to start allocation at
606 * @goal: preferred physical block of allocation 606 * @goal: preferred physical block of allocation
607 * @indirect_blks: the number of blocks that need to be allocated for indirect 607 * @indirect_blks: the number of blocks that need to be allocated for indirect
608 * blocks 608 * blocks
609 * @blks: number of desired blocks 609 * @blks: number of desired blocks
610 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
611 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
612 * @err: on return it will store the error code 612 * @err: on return it will store the error code
613 * 613 *
614 * This function will return the number of blocks allocated as 614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters. 615 * requested by the passed-in parameters.
616 */ 616 */
617 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
618 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
619 int indirect_blks, int blks, 619 int indirect_blks, int blks,
620 ext4_fsblk_t new_blocks[4], int *err) 620 ext4_fsblk_t new_blocks[4], int *err)
621 { 621 {
622 struct ext4_allocation_request ar; 622 struct ext4_allocation_request ar;
623 int target, i; 623 int target, i;
624 unsigned long count = 0, blk_allocated = 0; 624 unsigned long count = 0, blk_allocated = 0;
625 int index = 0; 625 int index = 0;
626 ext4_fsblk_t current_block = 0; 626 ext4_fsblk_t current_block = 0;
627 int ret = 0; 627 int ret = 0;
628 628
629 /* 629 /*
630 * Here we try to allocate the requested multiple blocks at once, 630 * Here we try to allocate the requested multiple blocks at once,
631 * on a best-effort basis. 631 * on a best-effort basis.
632 * To build a branch, we should allocate blocks for 632 * To build a branch, we should allocate blocks for
633 * the indirect blocks (if not allocated yet), and at least 633 * the indirect blocks (if not allocated yet), and at least
634 * the first direct block of this branch. That's the 634 * the first direct block of this branch. That's the
635 * minimum number of blocks that need to be allocated (required). 635 * minimum number of blocks that need to be allocated (required).
636 */ 636 */
637 /* first we try to allocate the indirect blocks */ 637 /* first we try to allocate the indirect blocks */
638 target = indirect_blks; 638 target = indirect_blks;
639 while (target > 0) { 639 while (target > 0) {
640 count = target; 640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, goal, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 0, &count, err); 643 0, &count, err);
644 if (*err) 644 if (*err)
645 goto failed_out; 645 goto failed_out;
646 646
647 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { 647 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
648 EXT4_ERROR_INODE(inode, 648 EXT4_ERROR_INODE(inode,
649 "current_block %llu + count %lu > %d!", 649 "current_block %llu + count %lu > %d!",
650 current_block, count, 650 current_block, count,
651 EXT4_MAX_BLOCK_FILE_PHYS); 651 EXT4_MAX_BLOCK_FILE_PHYS);
652 *err = -EIO; 652 *err = -EIO;
653 goto failed_out; 653 goto failed_out;
654 } 654 }
655 655
656 target -= count; 656 target -= count;
657 /* allocate blocks for indirect blocks */ 657 /* allocate blocks for indirect blocks */
658 while (index < indirect_blks && count) { 658 while (index < indirect_blks && count) {
659 new_blocks[index++] = current_block++; 659 new_blocks[index++] = current_block++;
660 count--; 660 count--;
661 } 661 }
662 if (count > 0) { 662 if (count > 0) {
663 /* 663 /*
664 * save the new block number 664 * save the new block number
665 * for the first direct block 665 * for the first direct block
666 */ 666 */
667 new_blocks[index] = current_block; 667 new_blocks[index] = current_block;
668 printk(KERN_INFO "%s returned more blocks than " 668 printk(KERN_INFO "%s returned more blocks than "
669 "requested\n", __func__); 669 "requested\n", __func__);
670 WARN_ON(1); 670 WARN_ON(1);
671 break; 671 break;
672 } 672 }
673 } 673 }
674 674
675 target = blks - count ; 675 target = blks - count ;
676 blk_allocated = count; 676 blk_allocated = count;
677 if (!target) 677 if (!target)
678 goto allocated; 678 goto allocated;
679 /* Now allocate data blocks */ 679 /* Now allocate data blocks */
680 memset(&ar, 0, sizeof(ar)); 680 memset(&ar, 0, sizeof(ar));
681 ar.inode = inode; 681 ar.inode = inode;
682 ar.goal = goal; 682 ar.goal = goal;
683 ar.len = target; 683 ar.len = target;
684 ar.logical = iblock; 684 ar.logical = iblock;
685 if (S_ISREG(inode->i_mode)) 685 if (S_ISREG(inode->i_mode))
686 /* enable in-core preallocation only for regular files */ 686 /* enable in-core preallocation only for regular files */
687 ar.flags = EXT4_MB_HINT_DATA; 687 ar.flags = EXT4_MB_HINT_DATA;
688 688
689 current_block = ext4_mb_new_blocks(handle, &ar, err); 689 current_block = ext4_mb_new_blocks(handle, &ar, err);
690 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { 690 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
691 EXT4_ERROR_INODE(inode, 691 EXT4_ERROR_INODE(inode,
692 "current_block %llu + ar.len %d > %d!", 692 "current_block %llu + ar.len %d > %d!",
693 current_block, ar.len, 693 current_block, ar.len,
694 EXT4_MAX_BLOCK_FILE_PHYS); 694 EXT4_MAX_BLOCK_FILE_PHYS);
695 *err = -EIO; 695 *err = -EIO;
696 goto failed_out; 696 goto failed_out;
697 } 697 }
698 698
699 if (*err && (target == blks)) { 699 if (*err && (target == blks)) {
700 /* 700 /*
701 * if the allocation failed and we didn't allocate 701 * if the allocation failed and we didn't allocate
702 * any blocks before 702 * any blocks before
703 */ 703 */
704 goto failed_out; 704 goto failed_out;
705 } 705 }
706 if (!*err) { 706 if (!*err) {
707 if (target == blks) { 707 if (target == blks) {
708 /* 708 /*
709 * save the new block number 709 * save the new block number
710 * for the first direct block 710 * for the first direct block
711 */ 711 */
712 new_blocks[index] = current_block; 712 new_blocks[index] = current_block;
713 } 713 }
714 blk_allocated += ar.len; 714 blk_allocated += ar.len;
715 } 715 }
716 allocated: 716 allocated:
717 /* total number of blocks allocated for direct blocks */ 717 /* total number of blocks allocated for direct blocks */
718 ret = blk_allocated; 718 ret = blk_allocated;
719 *err = 0; 719 *err = 0;
720 return ret; 720 return ret;
721 failed_out: 721 failed_out:
722 for (i = 0; i < index; i++) 722 for (i = 0; i < index; i++)
723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
724 return ret; 724 return ret;
725 } 725 }
726 726
727 /** 727 /**
728 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction 729 * @handle: handle for this transaction
730 * @inode: owner 730 * @inode: owner
731 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
732 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation 733 * @goal: preferred place for allocation
734 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
735 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
736 * 736 *
737 * This function allocates blocks, zeroes out all but the last one, 737 * This function allocates blocks, zeroes out all but the last one,
738 * links them into a chain and (if we are synchronous) writes them to disk. 738 * links them into a chain and (if we are synchronous) writes them to disk.
739 * In other words, it prepares a branch that can be spliced onto the 739 * In other words, it prepares a branch that can be spliced onto the
740 * inode. It stores the information about that chain in the branch[], in 740 * inode. It stores the information about that chain in the branch[], in
741 * the same format as ext4_get_branch() would do. We are calling it after 741 * the same format as ext4_get_branch() would do. We are calling it after
742 * we had read the existing part of chain and partial points to the last 742 * we had read the existing part of chain and partial points to the last
743 * triple of that (one with zero ->key). Upon the exit we have the same 743 * triple of that (one with zero ->key). Upon the exit we have the same
744 * picture as after the successful ext4_get_block(), except that in one 744 * picture as after the successful ext4_get_block(), except that in one
745 * place chain is disconnected - *branch->p is still zero (we did not 745 * place chain is disconnected - *branch->p is still zero (we did not
746 * set the last link), but branch->key contains the number that should 746 * set the last link), but branch->key contains the number that should
747 * be placed into *branch->p to fill that gap. 747 * be placed into *branch->p to fill that gap.
748 * 748 *
749 * If allocation fails we free all blocks we've allocated (and forget 749 * If allocation fails we free all blocks we've allocated (and forget
750 * their buffer_heads) and return the error value from the failed 750 * their buffer_heads) and return the error value from the failed
751 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 751 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
752 * as described above and return 0. 752 * as described above and return 0.
753 */ 753 */
754 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 754 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 ext4_lblk_t iblock, int indirect_blks, 755 ext4_lblk_t iblock, int indirect_blks,
756 int *blks, ext4_fsblk_t goal, 756 int *blks, ext4_fsblk_t goal,
757 ext4_lblk_t *offsets, Indirect *branch) 757 ext4_lblk_t *offsets, Indirect *branch)
758 { 758 {
759 int blocksize = inode->i_sb->s_blocksize; 759 int blocksize = inode->i_sb->s_blocksize;
760 int i, n = 0; 760 int i, n = 0;
761 int err = 0; 761 int err = 0;
762 struct buffer_head *bh; 762 struct buffer_head *bh;
763 int num; 763 int num;
764 ext4_fsblk_t new_blocks[4]; 764 ext4_fsblk_t new_blocks[4];
765 ext4_fsblk_t current_block; 765 ext4_fsblk_t current_block;
766 766
767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
768 *blks, new_blocks, &err); 768 *blks, new_blocks, &err);
769 if (err) 769 if (err)
770 return err; 770 return err;
771 771
772 branch[0].key = cpu_to_le32(new_blocks[0]); 772 branch[0].key = cpu_to_le32(new_blocks[0]);
773 /* 773 /*
774 * metadata blocks and data blocks are allocated. 774 * metadata blocks and data blocks are allocated.
775 */ 775 */
776 for (n = 1; n <= indirect_blks; n++) { 776 for (n = 1; n <= indirect_blks; n++) {
777 /* 777 /*
778 * Get buffer_head for parent block, zero it out 778 * Get buffer_head for parent block, zero it out
779 * and set the pointer to new one, then send 779 * and set the pointer to new one, then send
780 * parent to disk. 780 * parent to disk.
781 */ 781 */
782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) { 783 if (unlikely(!bh)) {
784 err = -EIO; 784 err = -EIO;
785 goto failed; 785 goto failed;
786 } 786 }
787 787
788 branch[n].bh = bh; 788 branch[n].bh = bh;
789 lock_buffer(bh); 789 lock_buffer(bh);
790 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
791 err = ext4_journal_get_create_access(handle, bh); 791 err = ext4_journal_get_create_access(handle, bh);
792 if (err) { 792 if (err) {
793 /* Don't brelse(bh) here; it's done in 793 /* Don't brelse(bh) here; it's done in
794 * ext4_journal_forget() below */ 794 * ext4_journal_forget() below */
795 unlock_buffer(bh); 795 unlock_buffer(bh);
796 goto failed; 796 goto failed;
797 } 797 }
798 798
799 memset(bh->b_data, 0, blocksize); 799 memset(bh->b_data, 0, blocksize);
800 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 800 branch[n].p = (__le32 *) bh->b_data + offsets[n];
801 branch[n].key = cpu_to_le32(new_blocks[n]); 801 branch[n].key = cpu_to_le32(new_blocks[n]);
802 *branch[n].p = branch[n].key; 802 *branch[n].p = branch[n].key;
803 if (n == indirect_blks) { 803 if (n == indirect_blks) {
804 current_block = new_blocks[n]; 804 current_block = new_blocks[n];
805 /* 805 /*
806 * End of chain, update the last new metablock of 806 * End of chain, update the last new metablock of
807 * the chain to point to the new allocated 807 * the chain to point to the new allocated
808 * data blocks numbers 808 * data blocks numbers
809 */ 809 */
810 for (i = 1; i < num; i++) 810 for (i = 1; i < num; i++)
811 *(branch[n].p + i) = cpu_to_le32(++current_block); 811 *(branch[n].p + i) = cpu_to_le32(++current_block);
812 } 812 }
813 BUFFER_TRACE(bh, "marking uptodate"); 813 BUFFER_TRACE(bh, "marking uptodate");
814 set_buffer_uptodate(bh); 814 set_buffer_uptodate(bh);
815 unlock_buffer(bh); 815 unlock_buffer(bh);
816 816
817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
818 err = ext4_handle_dirty_metadata(handle, inode, bh); 818 err = ext4_handle_dirty_metadata(handle, inode, bh);
819 if (err) 819 if (err)
820 goto failed; 820 goto failed;
821 } 821 }
822 *blks = num; 822 *blks = num;
823 return err; 823 return err;
824 failed: 824 failed:
825 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
828 /* 828 /*
829 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */ 832 */
833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
835 } 835 }
836 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838 838
839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840 840
841 return err; 841 return err;
842 } 842 }
843 843
844 /** 844 /**
845 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction 846 * @handle: handle for this transaction
847 * @inode: owner 847 * @inode: owner
848 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
849 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
850 * ext4_alloc_branch) 850 * ext4_alloc_branch)
851 * @where: location of missing link 851 * @where: location of missing link
852 * @num: number of indirect blocks we are adding 852 * @num: number of indirect blocks we are adding
853 * @blks: number of direct blocks we are adding 853 * @blks: number of direct blocks we are adding
854 * 854 *
855 * This function fills the missing link and does all housekeeping needed in 855 * This function fills the missing link and does all housekeeping needed in
856 * inode (->i_blocks, etc.). In case of success we end up with the full 856 * inode (->i_blocks, etc.). In case of success we end up with the full
857 * chain to new block and return 0. 857 * chain to new block and return 0.
858 */ 858 */
859 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 859 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
860 ext4_lblk_t block, Indirect *where, int num, 860 ext4_lblk_t block, Indirect *where, int num,
861 int blks) 861 int blks)
862 { 862 {
863 int i; 863 int i;
864 int err = 0; 864 int err = 0;
865 ext4_fsblk_t current_block; 865 ext4_fsblk_t current_block;
866 866
867 /* 867 /*
868 * If we're splicing into a [td]indirect block (as opposed to the 868 * If we're splicing into a [td]indirect block (as opposed to the
869 * inode) then we need to get write access to the [td]indirect block 869 * inode) then we need to get write access to the [td]indirect block
870 * before the splice. 870 * before the splice.
871 */ 871 */
872 if (where->bh) { 872 if (where->bh) {
873 BUFFER_TRACE(where->bh, "get_write_access"); 873 BUFFER_TRACE(where->bh, "get_write_access");
874 err = ext4_journal_get_write_access(handle, where->bh); 874 err = ext4_journal_get_write_access(handle, where->bh);
875 if (err) 875 if (err)
876 goto err_out; 876 goto err_out;
877 } 877 }
878 /* That's it */ 878 /* That's it */
879 879
880 *where->p = where->key; 880 *where->p = where->key;
881 881
882 /* 882 /*
883 * Update the host buffer_head or inode to point to the just allocated 883 * Update the host buffer_head or inode to point to the just allocated
884 * direct blocks 884 * direct blocks
885 */ 885 */
886 if (num == 0 && blks > 1) { 886 if (num == 0 && blks > 1) {
887 current_block = le32_to_cpu(where->key) + 1; 887 current_block = le32_to_cpu(where->key) + 1;
888 for (i = 1; i < blks; i++) 888 for (i = 1; i < blks; i++)
889 *(where->p + i) = cpu_to_le32(current_block++); 889 *(where->p + i) = cpu_to_le32(current_block++);
890 } 890 }
891 891
892 /* We are done with atomic stuff, now do the rest of housekeeping */ 892 /* We are done with atomic stuff, now do the rest of housekeeping */
893 /* had we spliced it onto indirect block? */ 893 /* had we spliced it onto indirect block? */
894 if (where->bh) { 894 if (where->bh) {
895 /* 895 /*
896 * If we spliced it onto an indirect block, we haven't 896 * If we spliced it onto an indirect block, we haven't
897 * altered the inode. Note however that if it is being spliced 897 * altered the inode. Note however that if it is being spliced
898 * onto an indirect block at the very end of the file (the 898 * onto an indirect block at the very end of the file (the
899 * file is growing) then we *will* alter the inode to reflect 899 * file is growing) then we *will* alter the inode to reflect
900 * the new i_size. But that is not done here - it is done in 900 * the new i_size. But that is not done here - it is done in
901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
902 */ 902 */
903 jbd_debug(5, "splicing indirect only\n"); 903 jbd_debug(5, "splicing indirect only\n");
904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
905 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 905 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
906 if (err) 906 if (err)
907 goto err_out; 907 goto err_out;
908 } else { 908 } else {
909 /* 909 /*
910 * OK, we spliced it into the inode itself on a direct block. 910 * OK, we spliced it into the inode itself on a direct block.
911 */ 911 */
912 ext4_mark_inode_dirty(handle, inode); 912 ext4_mark_inode_dirty(handle, inode);
913 jbd_debug(5, "splicing direct\n"); 913 jbd_debug(5, "splicing direct\n");
914 } 914 }
915 return err; 915 return err;
916 916
917 err_out: 917 err_out:
918 for (i = 1; i <= num; i++) { 918 for (i = 1; i <= num; i++) {
919 /* 919 /*
920 * branch[i].bh is newly allocated, so there is no 920 * branch[i].bh is newly allocated, so there is no
921 * need to revoke the block, which is why we don't 921 * need to revoke the block, which is why we don't
922 * need to set EXT4_FREE_BLOCKS_METADATA. 922 * need to set EXT4_FREE_BLOCKS_METADATA.
923 */ 923 */
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
926 } 926 }
927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0); 928 blks, 0);
929 929
930 return err; 930 return err;
931 } 931 }
932 932
933 /* 933 /*
934 * The ext4_ind_map_blocks() function handles non-extents inodes 934 * The ext4_ind_map_blocks() function handles non-extents inodes
935 * (i.e., using the traditional indirect/double-indirect i_blocks 935 * (i.e., using the traditional indirect/double-indirect i_blocks
936 * scheme) for ext4_map_blocks(). 936 * scheme) for ext4_map_blocks().
937 * 937 *
938 * Allocation strategy is simple: if we have to allocate something, we will 938 * Allocation strategy is simple: if we have to allocate something, we will
939 * have to go the whole way to leaf. So let's do it before attaching anything 939 * have to go the whole way to leaf. So let's do it before attaching anything
940 * to tree, set linkage between the newborn blocks, write them if sync is 940 * to tree, set linkage between the newborn blocks, write them if sync is
941 * required, recheck the path, free and repeat if check fails, otherwise 941 * required, recheck the path, free and repeat if check fails, otherwise
942 * set the last missing link (that will protect us from any truncate-generated 942 * set the last missing link (that will protect us from any truncate-generated
943 * removals - all blocks on the path are immune now) and possibly force the 943 * removals - all blocks on the path are immune now) and possibly force the
944 * write on the parent block. 944 * write on the parent block.
945 * That has a nice additional property: no special recovery from the failed 945 * That has a nice additional property: no special recovery from the failed
946 * allocations is needed - we simply release blocks and do not touch anything 946 * allocations is needed - we simply release blocks and do not touch anything
947 * reachable from inode. 947 * reachable from inode.
948 * 948 *
949 * `handle' can be NULL if create == 0. 949 * `handle' can be NULL if create == 0.
950 * 950 *
951 * return > 0, # of blocks mapped or allocated. 951 * return > 0, # of blocks mapped or allocated.
952 * return = 0, if plain lookup failed. 952 * return = 0, if plain lookup failed.
953 * return < 0, error case. 953 * return < 0, error case.
954 * 954 *
955 * The ext4_ind_map_blocks() function should be called with 955 * The ext4_ind_map_blocks() function should be called with
956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem 956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or 957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
959 * blocks. 959 * blocks.
960 */ 960 */
961 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 961 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
962 struct ext4_map_blocks *map, 962 struct ext4_map_blocks *map,
963 int flags) 963 int flags)
964 { 964 {
965 int err = -EIO; 965 int err = -EIO;
966 ext4_lblk_t offsets[4]; 966 ext4_lblk_t offsets[4];
967 Indirect chain[4]; 967 Indirect chain[4];
968 Indirect *partial; 968 Indirect *partial;
969 ext4_fsblk_t goal; 969 ext4_fsblk_t goal;
970 int indirect_blks; 970 int indirect_blks;
971 int blocks_to_boundary = 0; 971 int blocks_to_boundary = 0;
972 int depth; 972 int depth;
973 int count = 0; 973 int count = 0;
974 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
975 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
979 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
980 &blocks_to_boundary); 980 &blocks_to_boundary);
981 981
982 if (depth == 0) 982 if (depth == 0)
983 goto out; 983 goto out;
984 984
985 partial = ext4_get_branch(inode, depth, offsets, chain, &err); 985 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
986 986
987 /* Simplest case - block found, no allocation needed */ 987 /* Simplest case - block found, no allocation needed */
988 if (!partial) { 988 if (!partial) {
989 first_block = le32_to_cpu(chain[depth - 1].key); 989 first_block = le32_to_cpu(chain[depth - 1].key);
990 count++; 990 count++;
991 /*map more blocks*/ 991 /*map more blocks*/
992 while (count < map->m_len && count <= blocks_to_boundary) { 992 while (count < map->m_len && count <= blocks_to_boundary) {
993 ext4_fsblk_t blk; 993 ext4_fsblk_t blk;
994 994
995 blk = le32_to_cpu(*(chain[depth-1].p + count)); 995 blk = le32_to_cpu(*(chain[depth-1].p + count));
996 996
997 if (blk == first_block + count) 997 if (blk == first_block + count)
998 count++; 998 count++;
999 else 999 else
1000 break; 1000 break;
1001 } 1001 }
1002 goto got_it; 1002 goto got_it;
1003 } 1003 }
1004 1004
1005 /* Next simple case - plain lookup or failed read of indirect block */ 1005 /* Next simple case - plain lookup or failed read of indirect block */
1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
1007 goto cleanup; 1007 goto cleanup;
1008 1008
1009 /* 1009 /*
1010 * Okay, we need to do block allocation. 1010 * Okay, we need to do block allocation.
1011 */ 1011 */
1012 goal = ext4_find_goal(inode, map->m_lblk, partial); 1012 goal = ext4_find_goal(inode, map->m_lblk, partial);
1013 1013
1014 /* the number of blocks need to allocate for [d,t]indirect blocks */ 1014 /* the number of blocks need to allocate for [d,t]indirect blocks */
1015 indirect_blks = (chain + depth) - partial - 1; 1015 indirect_blks = (chain + depth) - partial - 1;
1016 1016
1017 /* 1017 /*
1018 * Next look up the indirect map to count the total number of 1018 * Next look up the indirect map to count the total number of
1019 * direct blocks to allocate for this branch. 1019 * direct blocks to allocate for this branch.
1020 */ 1020 */
1021 count = ext4_blks_to_allocate(partial, indirect_blks, 1021 count = ext4_blks_to_allocate(partial, indirect_blks,
1022 map->m_len, blocks_to_boundary); 1022 map->m_len, blocks_to_boundary);
1023 /* 1023 /*
1024 * Block out ext4_truncate while we alter the tree 1024 * Block out ext4_truncate while we alter the tree
1025 */ 1025 */
1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, 1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1027 &count, goal, 1027 &count, goal,
1028 offsets + (partial - chain), partial); 1028 offsets + (partial - chain), partial);
1029 1029
1030 /* 1030 /*
1031 * The ext4_splice_branch call will free and forget any buffers 1031 * The ext4_splice_branch call will free and forget any buffers
1032 * on the new chain if there is a failure, but that risks using 1032 * on the new chain if there is a failure, but that risks using
1033 * up transaction credits, especially for bitmaps where the 1033 * up transaction credits, especially for bitmaps where the
1034 * credits cannot be returned. Can we handle this somehow? We 1034 * credits cannot be returned. Can we handle this somehow? We
1035 * may need to return -EAGAIN upwards in the worst case. --sct 1035 * may need to return -EAGAIN upwards in the worst case. --sct
1036 */ 1036 */
1037 if (!err) 1037 if (!err)
1038 err = ext4_splice_branch(handle, inode, map->m_lblk, 1038 err = ext4_splice_branch(handle, inode, map->m_lblk,
1039 partial, indirect_blks, count); 1039 partial, indirect_blks, count);
1040 if (err) 1040 if (err)
1041 goto cleanup; 1041 goto cleanup;
1042 1042
1043 map->m_flags |= EXT4_MAP_NEW; 1043 map->m_flags |= EXT4_MAP_NEW;
1044 1044
1045 ext4_update_inode_fsync_trans(handle, inode, 1); 1045 ext4_update_inode_fsync_trans(handle, inode, 1);
1046 got_it: 1046 got_it:
1047 map->m_flags |= EXT4_MAP_MAPPED; 1047 map->m_flags |= EXT4_MAP_MAPPED;
1048 map->m_pblk = le32_to_cpu(chain[depth-1].key); 1048 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1049 map->m_len = count; 1049 map->m_len = count;
1050 if (count > blocks_to_boundary) 1050 if (count > blocks_to_boundary)
1051 map->m_flags |= EXT4_MAP_BOUNDARY; 1051 map->m_flags |= EXT4_MAP_BOUNDARY;
1052 err = count; 1052 err = count;
1053 /* Clean up and exit */ 1053 /* Clean up and exit */
1054 partial = chain + depth - 1; /* the whole chain */ 1054 partial = chain + depth - 1; /* the whole chain */
1055 cleanup: 1055 cleanup:
1056 while (partial > chain) { 1056 while (partial > chain) {
1057 BUFFER_TRACE(partial->bh, "call brelse"); 1057 BUFFER_TRACE(partial->bh, "call brelse");
1058 brelse(partial->bh); 1058 brelse(partial->bh);
1059 partial--; 1059 partial--;
1060 } 1060 }
1061 out: 1061 out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err); 1063 map->m_pblk, map->m_len, err);
1064 return err; 1064 return err;
1065 } 1065 }
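For orientation, a simplified stand-alone sketch of the logical-block-to-path split that ext4_block_to_path() performs for this indirect scheme (assuming 4KiB blocks, i.e. 1024 block numbers per indirect block; the constants are stand-ins and overflow checks are omitted):

#include <stdio.h>

#define NDIR		12	/* direct slots in the inode (EXT4_NDIR_BLOCKS) */
#define PER_BLOCK	1024	/* 4KiB block / 4-byte entries (assumed) */

/* Returns the depth (1..4) and fills offsets[] for a logical block. */
static int block_to_path(unsigned long block, unsigned long offsets[4])
{
	if (block < NDIR) {
		offsets[0] = block;
		return 1;
	}
	block -= NDIR;
	if (block < PER_BLOCK) {
		offsets[0] = NDIR;		/* indirect slot */
		offsets[1] = block;
		return 2;
	}
	block -= PER_BLOCK;
	if (block < (unsigned long)PER_BLOCK * PER_BLOCK) {
		offsets[0] = NDIR + 1;		/* double-indirect slot */
		offsets[1] = block / PER_BLOCK;
		offsets[2] = block % PER_BLOCK;
		return 3;
	}
	block -= (unsigned long)PER_BLOCK * PER_BLOCK;
	offsets[0] = NDIR + 2;			/* triple-indirect slot */
	offsets[1] = block / ((unsigned long)PER_BLOCK * PER_BLOCK);
	offsets[2] = (block / PER_BLOCK) % PER_BLOCK;
	offsets[3] = block % PER_BLOCK;
	return 4;
}

int main(void)
{
	unsigned long off[4];
	int depth = block_to_path(5000, off);	/* double-indirect case */

	printf("depth=%d offsets=%lu,%lu,%lu\n", depth, off[0], off[1], off[2]);
	return 0;
}

The depth returned here corresponds to the depth walked by ext4_get_branch() above: with these assumptions, logical block 5000 resolves through the double-indirect slot (offset 13), entry 3 of that block, and entry 892 of the next.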
1066 1066
1067 #ifdef CONFIG_QUOTA 1067 #ifdef CONFIG_QUOTA
1068 qsize_t *ext4_get_reserved_space(struct inode *inode) 1068 qsize_t *ext4_get_reserved_space(struct inode *inode)
1069 { 1069 {
1070 return &EXT4_I(inode)->i_reserved_quota; 1070 return &EXT4_I(inode)->i_reserved_quota;
1071 } 1071 }
1072 #endif 1072 #endif
1073 1073
1074 /* 1074 /*
1075 * Calculate the number of metadata blocks that need to be reserved 1075 * Calculate the number of metadata blocks that need to be reserved
1076 * to allocate a new block at @lblock for a non-extent-based file 1076 * to allocate a new block at @lblock for a non-extent-based file
1077 */ 1077 */
1078 static int ext4_indirect_calc_metadata_amount(struct inode *inode, 1078 static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1079 sector_t lblock) 1079 sector_t lblock)
1080 { 1080 {
1081 struct ext4_inode_info *ei = EXT4_I(inode); 1081 struct ext4_inode_info *ei = EXT4_I(inode);
1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); 1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1083 int blk_bits; 1083 int blk_bits;
1084 1084
1085 if (lblock < EXT4_NDIR_BLOCKS) 1085 if (lblock < EXT4_NDIR_BLOCKS)
1086 return 0; 1086 return 0;
1087 1087
1088 lblock -= EXT4_NDIR_BLOCKS; 1088 lblock -= EXT4_NDIR_BLOCKS;
1089 1089
1090 if (ei->i_da_metadata_calc_len && 1090 if (ei->i_da_metadata_calc_len &&
1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { 1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1092 ei->i_da_metadata_calc_len++; 1092 ei->i_da_metadata_calc_len++;
1093 return 0; 1093 return 0;
1094 } 1094 }
1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1096 ei->i_da_metadata_calc_len = 1; 1096 ei->i_da_metadata_calc_len = 1;
1097 blk_bits = order_base_2(lblock); 1097 blk_bits = order_base_2(lblock);
1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1099 } 1099 }
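A worked example of the reservation formula above, as a minimal userspace sketch (assuming 4KiB blocks, so EXT4_ADDR_PER_BLOCK_BITS is 10, with order_base_2() open-coded; the names here are stand-ins, not kernel definitions):

#include <stdio.h>

#define ADDR_PER_BLOCK_BITS	10	/* 4KiB blocks assumed */
#define NDIR_BLOCKS		12

/* ceil(log2(n)), matching the kernel's order_base_2() for n >= 1 */
static int order_base_2(unsigned long n)
{
	int bits = 0;

	while ((1UL << bits) < n)
		bits++;
	return bits;
}

int main(void)
{
	unsigned long lblock = 5000 - NDIR_BLOCKS;	/* as in the function above */
	int meta = order_base_2(lblock) / ADDR_PER_BLOCK_BITS + 1;

	/* order_base_2(4988) == 13, 13/10 + 1 == 2: up to one indirect
	 * plus one double-indirect block may need to be reserved. */
	printf("metadata blocks to reserve: %d\n", meta);
	return 0;
}

Two blocks matches the worst case at that offset under these assumptions: a new indirect block plus the double-indirect block it may hang off.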
1100 1100
1101 /* 1101 /*
1102 * Calculate the number of metadata blocks that need to be reserved 1102 * Calculate the number of metadata blocks that need to be reserved
1103 * to allocate a block located at @lblock 1103 * to allocate a block located at @lblock
1104 */ 1104 */
1105 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 1105 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1106 { 1106 {
1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1108 return ext4_ext_calc_metadata_amount(inode, lblock); 1108 return ext4_ext_calc_metadata_amount(inode, lblock);
1109 1109
1110 return ext4_indirect_calc_metadata_amount(inode, lblock); 1110 return ext4_indirect_calc_metadata_amount(inode, lblock);
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Called with i_data_sem down, which is important since we can call 1114 * Called with i_data_sem down, which is important since we can call
1115 * ext4_discard_preallocations() from here. 1115 * ext4_discard_preallocations() from here.
1116 */ 1116 */
1117 void ext4_da_update_reserve_space(struct inode *inode, 1117 void ext4_da_update_reserve_space(struct inode *inode,
1118 int used, int quota_claim) 1118 int used, int quota_claim)
1119 { 1119 {
1120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1121 struct ext4_inode_info *ei = EXT4_I(inode); 1121 struct ext4_inode_info *ei = EXT4_I(inode);
1122 1122
1123 spin_lock(&ei->i_block_reservation_lock); 1123 spin_lock(&ei->i_block_reservation_lock);
1124 trace_ext4_da_update_reserve_space(inode, used); 1124 trace_ext4_da_update_reserve_space(inode, used);
1125 if (unlikely(used > ei->i_reserved_data_blocks)) { 1125 if (unlikely(used > ei->i_reserved_data_blocks)) {
1126 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 1126 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1127 "with only %d reserved data blocks\n", 1127 "with only %d reserved data blocks\n",
1128 __func__, inode->i_ino, used, 1128 __func__, inode->i_ino, used,
1129 ei->i_reserved_data_blocks); 1129 ei->i_reserved_data_blocks);
1130 WARN_ON(1); 1130 WARN_ON(1);
1131 used = ei->i_reserved_data_blocks; 1131 used = ei->i_reserved_data_blocks;
1132 } 1132 }
1133 1133
1134 /* Update per-inode reservations */ 1134 /* Update per-inode reservations */
1135 ei->i_reserved_data_blocks -= used; 1135 ei->i_reserved_data_blocks -= used;
1136 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1136 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1137 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1137 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1138 used + ei->i_allocated_meta_blocks); 1138 used + ei->i_allocated_meta_blocks);
1139 ei->i_allocated_meta_blocks = 0; 1139 ei->i_allocated_meta_blocks = 0;
1140 1140
1141 if (ei->i_reserved_data_blocks == 0) { 1141 if (ei->i_reserved_data_blocks == 0) {
1142 /* 1142 /*
1143 * We can release all of the reserved metadata blocks 1143 * We can release all of the reserved metadata blocks
1144 * only when we have written all of the delayed 1144 * only when we have written all of the delayed
1145 * allocation blocks. 1145 * allocation blocks.
1146 */ 1146 */
1147 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1147 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1148 ei->i_reserved_meta_blocks); 1148 ei->i_reserved_meta_blocks);
1149 ei->i_reserved_meta_blocks = 0; 1149 ei->i_reserved_meta_blocks = 0;
1150 ei->i_da_metadata_calc_len = 0; 1150 ei->i_da_metadata_calc_len = 0;
1151 } 1151 }
1152 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1152 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1153 1153
1154 /* Update quota subsystem for data blocks */ 1154 /* Update quota subsystem for data blocks */
1155 if (quota_claim) 1155 if (quota_claim)
1156 dquot_claim_block(inode, used); 1156 dquot_claim_block(inode, used);
1157 else { 1157 else {
1158 /* 1158 /*
1159 * We did fallocate with an offset that is already delayed 1159 * We did fallocate with an offset that is already delayed
1160 * allocated. So on delayed allocated writeback we should 1160 * allocated. So on delayed allocated writeback we should
1161 * not re-claim the quota for fallocated blocks. 1161 * not re-claim the quota for fallocated blocks.
1162 */ 1162 */
1163 dquot_release_reservation_block(inode, used); 1163 dquot_release_reservation_block(inode, used);
1164 } 1164 }
1165 1165
1166 /* 1166 /*
1167 * If we have done all the pending block allocations and if 1167 * If we have done all the pending block allocations and if
1168 * there aren't any writers on the inode, we can discard the 1168 * there aren't any writers on the inode, we can discard the
1169 * inode's preallocations. 1169 * inode's preallocations.
1170 */ 1170 */
1171 if ((ei->i_reserved_data_blocks == 0) && 1171 if ((ei->i_reserved_data_blocks == 0) &&
1172 (atomic_read(&inode->i_writecount) == 0)) 1172 (atomic_read(&inode->i_writecount) == 0))
1173 ext4_discard_preallocations(inode); 1173 ext4_discard_preallocations(inode);
1174 } 1174 }
1175 1175
1176 static int __check_block_validity(struct inode *inode, const char *func, 1176 static int __check_block_validity(struct inode *inode, const char *func,
1177 unsigned int line, 1177 unsigned int line,
1178 struct ext4_map_blocks *map) 1178 struct ext4_map_blocks *map)
1179 { 1179 {
1180 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, 1180 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1181 map->m_len)) { 1181 map->m_len)) {
1182 ext4_error_inode(inode, func, line, map->m_pblk, 1182 ext4_error_inode(inode, func, line, map->m_pblk,
1183 "lblock %lu mapped to illegal pblock " 1183 "lblock %lu mapped to illegal pblock "
1184 "(length %d)", (unsigned long) map->m_lblk, 1184 "(length %d)", (unsigned long) map->m_lblk,
1185 map->m_len); 1185 map->m_len);
1186 return -EIO; 1186 return -EIO;
1187 } 1187 }
1188 return 0; 1188 return 0;
1189 } 1189 }
1190 1190
1191 #define check_block_validity(inode, map) \ 1191 #define check_block_validity(inode, map) \
1192 __check_block_validity((inode), __func__, __LINE__, (map)) 1192 __check_block_validity((inode), __func__, __LINE__, (map))
1193 1193
1194 /* 1194 /*
1195 * Return the number of contiguous dirty pages in a given inode 1195 * Return the number of contiguous dirty pages in a given inode
1196 * starting at page frame idx. 1196 * starting at page frame idx.
1197 */ 1197 */
1198 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, 1198 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1199 unsigned int max_pages) 1199 unsigned int max_pages)
1200 { 1200 {
1201 struct address_space *mapping = inode->i_mapping; 1201 struct address_space *mapping = inode->i_mapping;
1202 pgoff_t index; 1202 pgoff_t index;
1203 struct pagevec pvec; 1203 struct pagevec pvec;
1204 pgoff_t num = 0; 1204 pgoff_t num = 0;
1205 int i, nr_pages, done = 0; 1205 int i, nr_pages, done = 0;
1206 1206
1207 if (max_pages == 0) 1207 if (max_pages == 0)
1208 return 0; 1208 return 0;
1209 pagevec_init(&pvec, 0); 1209 pagevec_init(&pvec, 0);
1210 while (!done) { 1210 while (!done) {
1211 index = idx; 1211 index = idx;
1212 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1212 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1213 PAGECACHE_TAG_DIRTY, 1213 PAGECACHE_TAG_DIRTY,
1214 (pgoff_t)PAGEVEC_SIZE); 1214 (pgoff_t)PAGEVEC_SIZE);
1215 if (nr_pages == 0) 1215 if (nr_pages == 0)
1216 break; 1216 break;
1217 for (i = 0; i < nr_pages; i++) { 1217 for (i = 0; i < nr_pages; i++) {
1218 struct page *page = pvec.pages[i]; 1218 struct page *page = pvec.pages[i];
1219 struct buffer_head *bh, *head; 1219 struct buffer_head *bh, *head;
1220 1220
1221 lock_page(page); 1221 lock_page(page);
1222 if (unlikely(page->mapping != mapping) || 1222 if (unlikely(page->mapping != mapping) ||
1223 !PageDirty(page) || 1223 !PageDirty(page) ||
1224 PageWriteback(page) || 1224 PageWriteback(page) ||
1225 page->index != idx) { 1225 page->index != idx) {
1226 done = 1; 1226 done = 1;
1227 unlock_page(page); 1227 unlock_page(page);
1228 break; 1228 break;
1229 } 1229 }
1230 if (page_has_buffers(page)) { 1230 if (page_has_buffers(page)) {
1231 bh = head = page_buffers(page); 1231 bh = head = page_buffers(page);
1232 do { 1232 do {
1233 if (!buffer_delay(bh) && 1233 if (!buffer_delay(bh) &&
1234 !buffer_unwritten(bh)) 1234 !buffer_unwritten(bh))
1235 done = 1; 1235 done = 1;
1236 bh = bh->b_this_page; 1236 bh = bh->b_this_page;
1237 } while (!done && (bh != head)); 1237 } while (!done && (bh != head));
1238 } 1238 }
1239 unlock_page(page); 1239 unlock_page(page);
1240 if (done) 1240 if (done)
1241 break; 1241 break;
1242 idx++; 1242 idx++;
1243 num++; 1243 num++;
1244 if (num >= max_pages) { 1244 if (num >= max_pages) {
1245 done = 1; 1245 done = 1;
1246 break; 1246 break;
1247 } 1247 }
1248 } 1248 }
1249 pagevec_release(&pvec); 1249 pagevec_release(&pvec);
1250 } 1250 }
1251 return num; 1251 return num;
1252 } 1252 }
1253 1253
1254 /* 1254 /*
1255 * The ext4_map_blocks() function tries to look up the requested blocks, 1255 * The ext4_map_blocks() function tries to look up the requested blocks,
1256 * and returns if the blocks are already mapped. 1256 * and returns if the blocks are already mapped.
1257 * 1257 *
1258 * Otherwise it takes the write lock of the i_data_sem, allocates blocks, 1258 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
1259 * stores the allocated blocks in the result buffer head and marks it 1259 * stores the allocated blocks in the result buffer head and marks it
1260 * mapped. 1260 * mapped.
1261 * 1261 *
1262 * If the file is extent-based, it will call ext4_ext_map_blocks(); 1262 * If the file is extent-based, it will call ext4_ext_map_blocks();
1263 * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapped 1263 * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapped
1264 * files. 1264 * files.
1265 * 1265 *
1266 * On success, it returns the number of blocks being mapped or allocated. 1266 * On success, it returns the number of blocks being mapped or allocated.
1267 * If create==0 and the blocks are pre-allocated and uninitialized, 1267 * If create==0 and the blocks are pre-allocated and uninitialized,
1268 * the result buffer head is unmapped. If create==1, it will make sure 1268 * the result buffer head is unmapped. If create==1, it will make sure
1269 * the buffer head is mapped. 1269 * the buffer head is mapped.
1270 * 1270 *
1271 * It returns 0 if plain lookup failed (blocks have not been allocated); in 1271 * It returns 0 if plain lookup failed (blocks have not been allocated); in
1272 * that case, the buffer head is unmapped. 1272 * that case, the buffer head is unmapped.
1273 * 1273 *
1274 * It returns the error in case of allocation failure. 1274 * It returns the error in case of allocation failure.
1275 */ 1275 */
1276 int ext4_map_blocks(handle_t *handle, struct inode *inode, 1276 int ext4_map_blocks(handle_t *handle, struct inode *inode,
1277 struct ext4_map_blocks *map, int flags) 1277 struct ext4_map_blocks *map, int flags)
1278 { 1278 {
1279 int retval; 1279 int retval;
1280 1280
1281 map->m_flags = 0; 1281 map->m_flags = 0;
1282 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," 1282 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1283 "logical block %lu\n", inode->i_ino, flags, map->m_len, 1283 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1284 (unsigned long) map->m_lblk); 1284 (unsigned long) map->m_lblk);
1285 /* 1285 /*
1286 * Try to see if we can get the block without requesting a new 1286 * Try to see if we can get the block without requesting a new
1287 * file system block. 1287 * file system block.
1288 */ 1288 */
1289 down_read((&EXT4_I(inode)->i_data_sem)); 1289 down_read((&EXT4_I(inode)->i_data_sem));
1290 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 1290 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1291 retval = ext4_ext_map_blocks(handle, inode, map, 0); 1291 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1292 } else { 1292 } else {
1293 retval = ext4_ind_map_blocks(handle, inode, map, 0); 1293 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1294 } 1294 }
1295 up_read((&EXT4_I(inode)->i_data_sem)); 1295 up_read((&EXT4_I(inode)->i_data_sem));
1296 1296
1297 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1297 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1298 int ret = check_block_validity(inode, map); 1298 int ret = check_block_validity(inode, map);
1299 if (ret != 0) 1299 if (ret != 0)
1300 return ret; 1300 return ret;
1301 } 1301 }
1302 1302
1303 /* If it is only a block(s) look up */ 1303 /* If it is only a block(s) look up */
1304 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) 1304 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1305 return retval; 1305 return retval;
1306 1306
1307 /* 1307 /*
1308 * Returns if the blocks have already been allocated. 1308 * Returns if the blocks have already been allocated.
1309 * 1309 *
1310 * Note that if blocks have been preallocated 1310 * Note that if blocks have been preallocated
1311 * ext4_ext_get_block() returns with create = 0 1311 * ext4_ext_get_block() returns with create = 0
1312 * and the buffer head unmapped. 1312 * and the buffer head unmapped.
1313 */ 1313 */
1314 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 1314 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1315 return retval; 1315 return retval;
1316 1316
1317 /* 1317 /*
1318 * When we call get_blocks without the create flag, the 1318 * When we call get_blocks without the create flag, the
1319 * BH_Unwritten flag could have gotten set if the blocks 1319 * BH_Unwritten flag could have gotten set if the blocks
1320 * requested were part of an uninitialized extent. We need to 1320 * requested were part of an uninitialized extent. We need to
1321 * clear this flag now that we are committed to convert all or 1321 * clear this flag now that we are committed to convert all or
1322 * part of the uninitialized extent to be an initialized 1322 * part of the uninitialized extent to be an initialized
1323 * extent. This is because we need to avoid the combination 1323 * extent. This is because we need to avoid the combination
1324 * of BH_Unwritten and BH_Mapped flags being simultaneously 1324 * of BH_Unwritten and BH_Mapped flags being simultaneously
1325 * set on the buffer_head. 1325 * set on the buffer_head.
1326 */ 1326 */
1327 map->m_flags &= ~EXT4_MAP_UNWRITTEN; 1327 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1328 1328
1329 /* 1329 /*
1330 * Allocating new blocks and/or writing to an uninitialized extent 1330 * Allocating new blocks and/or writing to an uninitialized extent
1331 * will possibly result in updating i_data, so we take 1331 * will possibly result in updating i_data, so we take
1332 * the write lock of i_data_sem, and call get_blocks() 1332 * the write lock of i_data_sem, and call get_blocks()
1333 * with create == 1 flag. 1333 * with create == 1 flag.
1334 */ 1334 */
1335 down_write((&EXT4_I(inode)->i_data_sem)); 1335 down_write((&EXT4_I(inode)->i_data_sem));
1336 1336
1337 /* 1337 /*
1338 * if the caller is from the delayed allocation writeout path, 1338 * if the caller is from the delayed allocation writeout path,
1339 * we have already reserved fs blocks for allocation, so 1339 * we have already reserved fs blocks for allocation, so
1340 * let the underlying get_block() function know to 1340 * let the underlying get_block() function know to
1341 * avoid double accounting. 1341 * avoid double accounting.
1342 */ 1342 */
1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1345 /* 1345 /*
1346 * We need to check for EXT4 here because migrate 1346 * We need to check for EXT4 here because migrate
1347 * could have changed the inode type in between 1347 * could have changed the inode type in between
1348 */ 1348 */
1349 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 1349 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1350 retval = ext4_ext_map_blocks(handle, inode, map, flags); 1350 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1351 } else { 1351 } else {
1352 retval = ext4_ind_map_blocks(handle, inode, map, flags); 1352 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1353 1353
1354 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { 1354 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1355 /* 1355 /*
1356 * We allocated new blocks which will result in 1356 * We allocated new blocks which will result in
1357 * i_data's format changing. Force the migrate 1357 * i_data's format changing. Force the migrate
1358 * to fail by clearing migrate flags 1358 * to fail by clearing migrate flags
1359 */ 1359 */
1360 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); 1360 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1361 } 1361 }
1362 1362
1363 /* 1363 /*
1364 * Update reserved blocks/metadata blocks after successful 1364 * Update reserved blocks/metadata blocks after successful
1365 * block allocation which had been deferred till now. We don't 1365 * block allocation which had been deferred till now. We don't
1366 * support fallocate for non-extent files, so we can update 1366 * support fallocate for non-extent files, so we can update
1367 * the reserved space here. 1367 * the reserved space here.
1368 */ 1368 */
1369 if ((retval > 0) && 1369 if ((retval > 0) &&
1370 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 1370 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1371 ext4_da_update_reserve_space(inode, retval, 1); 1371 ext4_da_update_reserve_space(inode, retval, 1);
1372 } 1372 }
1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1375 1375
1376 up_write((&EXT4_I(inode)->i_data_sem)); 1376 up_write((&EXT4_I(inode)->i_data_sem));
1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1378 int ret = check_block_validity(inode, map); 1378 int ret = check_block_validity(inode, map);
1379 if (ret != 0) 1379 if (ret != 0)
1380 return ret; 1380 return ret;
1381 } 1381 }
1382 return retval; 1382 return retval;
1383 } 1383 }
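As a minimal sketch of the calling convention described above (any names not in this excerpt are assumptions): a delayed-allocation writeout caller opens a handle, fills an ext4_map_blocks descriptor, and passes EXT4_GET_BLOCKS_CREATE together with EXT4_GET_BLOCKS_DELALLOC_RESERVE, much as mpage_da_map_and_submit() does further down.

static int map_delalloc_range_sketch(handle_t *handle, struct inode *inode,
				     ext4_lblk_t lblk, unsigned int len)
{
	struct ext4_map_blocks map;
	int flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_DELALLOC_RESERVE;
	int blks;

	map.m_lblk = lblk;	/* first logical block to map */
	map.m_len = len;	/* number of blocks requested */

	/* Allocates the delayed blocks (or converts uninitialized
	 * extents) and credits the write-time reservation. */
	blks = ext4_map_blocks(handle, inode, &map, flags);
	if (blks <= 0)
		return blks;	/* 0: nothing mapped, <0: error such as -ENOSPC */

	/* map.m_pblk .. map.m_pblk + blks - 1 now back lblk onwards */
	return blks;
}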
1384 1384
1385 /* Maximum number of blocks we map for direct IO at once. */ 1385 /* Maximum number of blocks we map for direct IO at once. */
1386 #define DIO_MAX_BLOCKS 4096 1386 #define DIO_MAX_BLOCKS 4096
1387 1387
1388 static int _ext4_get_block(struct inode *inode, sector_t iblock, 1388 static int _ext4_get_block(struct inode *inode, sector_t iblock,
1389 struct buffer_head *bh, int flags) 1389 struct buffer_head *bh, int flags)
1390 { 1390 {
1391 handle_t *handle = ext4_journal_current_handle(); 1391 handle_t *handle = ext4_journal_current_handle();
1392 struct ext4_map_blocks map; 1392 struct ext4_map_blocks map;
1393 int ret = 0, started = 0; 1393 int ret = 0, started = 0;
1394 int dio_credits; 1394 int dio_credits;
1395 1395
1396 map.m_lblk = iblock; 1396 map.m_lblk = iblock;
1397 map.m_len = bh->b_size >> inode->i_blkbits; 1397 map.m_len = bh->b_size >> inode->i_blkbits;
1398 1398
1399 if (flags && !handle) { 1399 if (flags && !handle) {
1400 /* Direct IO write... */ 1400 /* Direct IO write... */
1401 if (map.m_len > DIO_MAX_BLOCKS) 1401 if (map.m_len > DIO_MAX_BLOCKS)
1402 map.m_len = DIO_MAX_BLOCKS; 1402 map.m_len = DIO_MAX_BLOCKS;
1403 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); 1403 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1404 handle = ext4_journal_start(inode, dio_credits); 1404 handle = ext4_journal_start(inode, dio_credits);
1405 if (IS_ERR(handle)) { 1405 if (IS_ERR(handle)) {
1406 ret = PTR_ERR(handle); 1406 ret = PTR_ERR(handle);
1407 return ret; 1407 return ret;
1408 } 1408 }
1409 started = 1; 1409 started = 1;
1410 } 1410 }
1411 1411
1412 ret = ext4_map_blocks(handle, inode, &map, flags); 1412 ret = ext4_map_blocks(handle, inode, &map, flags);
1413 if (ret > 0) { 1413 if (ret > 0) {
1414 map_bh(bh, inode->i_sb, map.m_pblk); 1414 map_bh(bh, inode->i_sb, map.m_pblk);
1415 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 1415 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1416 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 1416 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1417 ret = 0; 1417 ret = 0;
1418 } 1418 }
1419 if (started) 1419 if (started)
1420 ext4_journal_stop(handle); 1420 ext4_journal_stop(handle);
1421 return ret; 1421 return ret;
1422 } 1422 }
1423 1423
1424 int ext4_get_block(struct inode *inode, sector_t iblock, 1424 int ext4_get_block(struct inode *inode, sector_t iblock,
1425 struct buffer_head *bh, int create) 1425 struct buffer_head *bh, int create)
1426 { 1426 {
1427 return _ext4_get_block(inode, iblock, bh, 1427 return _ext4_get_block(inode, iblock, bh,
1428 create ? EXT4_GET_BLOCKS_CREATE : 0); 1428 create ? EXT4_GET_BLOCKS_CREATE : 0);
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * `handle' can be NULL if create is zero 1432 * `handle' can be NULL if create is zero
1433 */ 1433 */
1434 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1434 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1435 ext4_lblk_t block, int create, int *errp) 1435 ext4_lblk_t block, int create, int *errp)
1436 { 1436 {
1437 struct ext4_map_blocks map; 1437 struct ext4_map_blocks map;
1438 struct buffer_head *bh; 1438 struct buffer_head *bh;
1439 int fatal = 0, err; 1439 int fatal = 0, err;
1440 1440
1441 J_ASSERT(handle != NULL || create == 0); 1441 J_ASSERT(handle != NULL || create == 0);
1442 1442
1443 map.m_lblk = block; 1443 map.m_lblk = block;
1444 map.m_len = 1; 1444 map.m_len = 1;
1445 err = ext4_map_blocks(handle, inode, &map, 1445 err = ext4_map_blocks(handle, inode, &map,
1446 create ? EXT4_GET_BLOCKS_CREATE : 0); 1446 create ? EXT4_GET_BLOCKS_CREATE : 0);
1447 1447
1448 if (err < 0) 1448 if (err < 0)
1449 *errp = err; 1449 *errp = err;
1450 if (err <= 0) 1450 if (err <= 0)
1451 return NULL; 1451 return NULL;
1452 *errp = 0; 1452 *errp = 0;
1453 1453
1454 bh = sb_getblk(inode->i_sb, map.m_pblk); 1454 bh = sb_getblk(inode->i_sb, map.m_pblk);
1455 if (!bh) { 1455 if (!bh) {
1456 *errp = -EIO; 1456 *errp = -EIO;
1457 return NULL; 1457 return NULL;
1458 } 1458 }
1459 if (map.m_flags & EXT4_MAP_NEW) { 1459 if (map.m_flags & EXT4_MAP_NEW) {
1460 J_ASSERT(create != 0); 1460 J_ASSERT(create != 0);
1461 J_ASSERT(handle != NULL); 1461 J_ASSERT(handle != NULL);
1462 1462
1463 /* 1463 /*
1464 * Now that we do not always journal data, we should 1464 * Now that we do not always journal data, we should
1465 * keep in mind whether this should always journal the 1465 * keep in mind whether this should always journal the
1466 * new buffer as metadata. For now, regular file 1466 * new buffer as metadata. For now, regular file
1467 * writes use ext4_get_block instead, so it's not a 1467 * writes use ext4_get_block instead, so it's not a
1468 * problem. 1468 * problem.
1469 */ 1469 */
1470 lock_buffer(bh); 1470 lock_buffer(bh);
1471 BUFFER_TRACE(bh, "call get_create_access"); 1471 BUFFER_TRACE(bh, "call get_create_access");
1472 fatal = ext4_journal_get_create_access(handle, bh); 1472 fatal = ext4_journal_get_create_access(handle, bh);
1473 if (!fatal && !buffer_uptodate(bh)) { 1473 if (!fatal && !buffer_uptodate(bh)) {
1474 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1474 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1475 set_buffer_uptodate(bh); 1475 set_buffer_uptodate(bh);
1476 } 1476 }
1477 unlock_buffer(bh); 1477 unlock_buffer(bh);
1478 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1478 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1479 err = ext4_handle_dirty_metadata(handle, inode, bh); 1479 err = ext4_handle_dirty_metadata(handle, inode, bh);
1480 if (!fatal) 1480 if (!fatal)
1481 fatal = err; 1481 fatal = err;
1482 } else { 1482 } else {
1483 BUFFER_TRACE(bh, "not a new buffer"); 1483 BUFFER_TRACE(bh, "not a new buffer");
1484 } 1484 }
1485 if (fatal) { 1485 if (fatal) {
1486 *errp = fatal; 1486 *errp = fatal;
1487 brelse(bh); 1487 brelse(bh);
1488 bh = NULL; 1488 bh = NULL;
1489 } 1489 }
1490 return bh; 1490 return bh;
1491 } 1491 }
1492 1492
1493 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1493 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1494 ext4_lblk_t block, int create, int *err) 1494 ext4_lblk_t block, int create, int *err)
1495 { 1495 {
1496 struct buffer_head *bh; 1496 struct buffer_head *bh;
1497 1497
1498 bh = ext4_getblk(handle, inode, block, create, err); 1498 bh = ext4_getblk(handle, inode, block, create, err);
1499 if (!bh) 1499 if (!bh)
1500 return bh; 1500 return bh;
1501 if (buffer_uptodate(bh)) 1501 if (buffer_uptodate(bh))
1502 return bh; 1502 return bh;
1503 ll_rw_block(READ_META, 1, &bh); 1503 ll_rw_block(READ_META, 1, &bh);
1504 wait_on_buffer(bh); 1504 wait_on_buffer(bh);
1505 if (buffer_uptodate(bh)) 1505 if (buffer_uptodate(bh))
1506 return bh; 1506 return bh;
1507 put_bh(bh); 1507 put_bh(bh);
1508 *err = -EIO; 1508 *err = -EIO;
1509 return NULL; 1509 return NULL;
1510 } 1510 }
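Since `handle' may be NULL when create == 0, a read-only metadata lookup needs no transaction at all. A minimal sketch (the wrapper name is hypothetical):

static int peek_block_sketch(struct inode *inode, ext4_lblk_t blocknr)
{
	struct buffer_head *bh;
	int err = 0;

	/* create == 0, so a NULL handle is fine per the comment
	 * above ext4_getblk(). */
	bh = ext4_bread(NULL, inode, blocknr, 0, &err);
	if (!bh)
		return err;	/* 0 if the block is a hole, <0 on I/O error */

	/* ... inspect bh->b_data ... */
	brelse(bh);
	return 0;
}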
1511 1511
1512 static int walk_page_buffers(handle_t *handle, 1512 static int walk_page_buffers(handle_t *handle,
1513 struct buffer_head *head, 1513 struct buffer_head *head,
1514 unsigned from, 1514 unsigned from,
1515 unsigned to, 1515 unsigned to,
1516 int *partial, 1516 int *partial,
1517 int (*fn)(handle_t *handle, 1517 int (*fn)(handle_t *handle,
1518 struct buffer_head *bh)) 1518 struct buffer_head *bh))
1519 { 1519 {
1520 struct buffer_head *bh; 1520 struct buffer_head *bh;
1521 unsigned block_start, block_end; 1521 unsigned block_start, block_end;
1522 unsigned blocksize = head->b_size; 1522 unsigned blocksize = head->b_size;
1523 int err, ret = 0; 1523 int err, ret = 0;
1524 struct buffer_head *next; 1524 struct buffer_head *next;
1525 1525
1526 for (bh = head, block_start = 0; 1526 for (bh = head, block_start = 0;
1527 ret == 0 && (bh != head || !block_start); 1527 ret == 0 && (bh != head || !block_start);
1528 block_start = block_end, bh = next) { 1528 block_start = block_end, bh = next) {
1529 next = bh->b_this_page; 1529 next = bh->b_this_page;
1530 block_end = block_start + blocksize; 1530 block_end = block_start + blocksize;
1531 if (block_end <= from || block_start >= to) { 1531 if (block_end <= from || block_start >= to) {
1532 if (partial && !buffer_uptodate(bh)) 1532 if (partial && !buffer_uptodate(bh))
1533 *partial = 1; 1533 *partial = 1;
1534 continue; 1534 continue;
1535 } 1535 }
1536 err = (*fn)(handle, bh); 1536 err = (*fn)(handle, bh);
1537 if (!ret) 1537 if (!ret)
1538 ret = err; 1538 ret = err;
1539 } 1539 }
1540 return ret; 1540 return ret;
1541 } 1541 }
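walk_page_buffers() is a bounded iterator over a page's circular buffer list: callers supply a per-buffer callback with the signature int (*fn)(handle_t *, struct buffer_head *), as ext4_write_begin() and ext4_journalled_write_end() do below. A purely hypothetical callback, shown only to illustrate the contract:

static int count_unmapped_bh(handle_t *handle, struct buffer_head *bh)
{
	/* Returning non-zero stops the walk and is propagated as the
	 * return value of walk_page_buffers(). */
	return buffer_mapped(bh) ? 0 : -EIO;
}

	/* 'partial' is set if any buffer outside [from, to) is not uptodate. */
	err = walk_page_buffers(handle, page_buffers(page), from, to,
				&partial, count_unmapped_bh);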
1542 1542
1543 /* 1543 /*
1544 * To preserve ordering, it is essential that the hole instantiation and 1544 * To preserve ordering, it is essential that the hole instantiation and
1545 * the data write be encapsulated in a single transaction. We cannot 1545 * the data write be encapsulated in a single transaction. We cannot
1546 * close off a transaction and start a new one between the ext4_get_block() 1546 * close off a transaction and start a new one between the ext4_get_block()
1547 * and the commit_write(). So doing the jbd2_journal_start at the start of 1547 * and the commit_write(). So doing the jbd2_journal_start at the start of
1548 * prepare_write() is the right place. 1548 * prepare_write() is the right place.
1549 * 1549 *
1550 * Also, this function can nest inside ext4_writepage() -> 1550 * Also, this function can nest inside ext4_writepage() ->
1551 * block_write_full_page(). In that case, we *know* that ext4_writepage() 1551 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1552 * has generated enough buffer credits to do the whole page. So we won't 1552 * has generated enough buffer credits to do the whole page. So we won't
1553 * block on the journal in that case, which is good, because the caller may 1553 * block on the journal in that case, which is good, because the caller may
1554 * be PF_MEMALLOC. 1554 * be PF_MEMALLOC.
1555 * 1555 *
1556 * By accident, ext4 can be reentered when a transaction is open via 1556 * By accident, ext4 can be reentered when a transaction is open via
1557 * quota file writes. If we were to commit the transaction while thus 1557 * quota file writes. If we were to commit the transaction while thus
1558 * reentered, there can be a deadlock - we would be holding a quota 1558 * reentered, there can be a deadlock - we would be holding a quota
1559 * lock, and the commit would never complete if another thread had a 1559 * lock, and the commit would never complete if another thread had a
1560 * transaction open and was blocking on the quota lock - a ranking 1560 * transaction open and was blocking on the quota lock - a ranking
1561 * violation. 1561 * violation.
1562 * 1562 *
1563 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1563 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1564 * will _not_ run commit under these circumstances because handle->h_ref 1564 * will _not_ run commit under these circumstances because handle->h_ref
1565 * is elevated. We'll still have enough credits for the tiny quotafile 1565 * is elevated. We'll still have enough credits for the tiny quotafile
1566 * write. 1566 * write.
1567 */ 1567 */
1568 static int do_journal_get_write_access(handle_t *handle, 1568 static int do_journal_get_write_access(handle_t *handle,
1569 struct buffer_head *bh) 1569 struct buffer_head *bh)
1570 { 1570 {
1571 int dirty = buffer_dirty(bh); 1571 int dirty = buffer_dirty(bh);
1572 int ret; 1572 int ret;
1573 1573
1574 if (!buffer_mapped(bh) || buffer_freed(bh)) 1574 if (!buffer_mapped(bh) || buffer_freed(bh))
1575 return 0; 1575 return 0;
1576 /* 1576 /*
1577 * __block_write_begin() could have dirtied some buffers. Clean 1577 * __block_write_begin() could have dirtied some buffers. Clean
1578 * the dirty bit as jbd2_journal_get_write_access() could complain 1578 * the dirty bit as jbd2_journal_get_write_access() could complain
1579 * otherwise about fs integrity issues. Setting of the dirty bit 1579 * otherwise about fs integrity issues. Setting of the dirty bit
1580 * by __block_write_begin() isn't a real problem here as we clear 1580 * by __block_write_begin() isn't a real problem here as we clear
1581 * the bit before releasing a page lock and thus writeback cannot 1581 * the bit before releasing a page lock and thus writeback cannot
1582 * ever write the buffer. 1582 * ever write the buffer.
1583 */ 1583 */
1584 if (dirty) 1584 if (dirty)
1585 clear_buffer_dirty(bh); 1585 clear_buffer_dirty(bh);
1586 ret = ext4_journal_get_write_access(handle, bh); 1586 ret = ext4_journal_get_write_access(handle, bh);
1587 if (!ret && dirty) 1587 if (!ret && dirty)
1588 ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1588 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1589 return ret; 1589 return ret;
1590 } 1590 }
1591 1591
1592 /* 1592 /*
1593 * Truncate blocks that were not used by write. We have to truncate the 1593 * Truncate blocks that were not used by write. We have to truncate the
1594 * pagecache as well so that corresponding buffers get properly unmapped. 1594 * pagecache as well so that corresponding buffers get properly unmapped.
1595 */ 1595 */
1596 static void ext4_truncate_failed_write(struct inode *inode) 1596 static void ext4_truncate_failed_write(struct inode *inode)
1597 { 1597 {
1598 truncate_inode_pages(inode->i_mapping, inode->i_size); 1598 truncate_inode_pages(inode->i_mapping, inode->i_size);
1599 ext4_truncate(inode); 1599 ext4_truncate(inode);
1600 } 1600 }
1601 1601
1602 static int ext4_get_block_write(struct inode *inode, sector_t iblock, 1602 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1603 struct buffer_head *bh_result, int create); 1603 struct buffer_head *bh_result, int create);
1604 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1604 static int ext4_write_begin(struct file *file, struct address_space *mapping,
1605 loff_t pos, unsigned len, unsigned flags, 1605 loff_t pos, unsigned len, unsigned flags,
1606 struct page **pagep, void **fsdata) 1606 struct page **pagep, void **fsdata)
1607 { 1607 {
1608 struct inode *inode = mapping->host; 1608 struct inode *inode = mapping->host;
1609 int ret, needed_blocks; 1609 int ret, needed_blocks;
1610 handle_t *handle; 1610 handle_t *handle;
1611 int retries = 0; 1611 int retries = 0;
1612 struct page *page; 1612 struct page *page;
1613 pgoff_t index; 1613 pgoff_t index;
1614 unsigned from, to; 1614 unsigned from, to;
1615 1615
1616 trace_ext4_write_begin(inode, pos, len, flags); 1616 trace_ext4_write_begin(inode, pos, len, flags);
1617 /* 1617 /*
1618 * Reserve one more block for addition to the orphan list in case 1618 * Reserve one more block for addition to the orphan list in case
1619 * we allocate blocks but write fails for some reason 1619 * we allocate blocks but write fails for some reason
1620 */ 1620 */
1621 needed_blocks = ext4_writepage_trans_blocks(inode) + 1; 1621 needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1622 index = pos >> PAGE_CACHE_SHIFT; 1622 index = pos >> PAGE_CACHE_SHIFT;
1623 from = pos & (PAGE_CACHE_SIZE - 1); 1623 from = pos & (PAGE_CACHE_SIZE - 1);
1624 to = from + len; 1624 to = from + len;
1625 1625
1626 retry: 1626 retry:
1627 handle = ext4_journal_start(inode, needed_blocks); 1627 handle = ext4_journal_start(inode, needed_blocks);
1628 if (IS_ERR(handle)) { 1628 if (IS_ERR(handle)) {
1629 ret = PTR_ERR(handle); 1629 ret = PTR_ERR(handle);
1630 goto out; 1630 goto out;
1631 } 1631 }
1632 1632
1633 /* We cannot recurse into the filesystem as the transaction is already 1633 /* We cannot recurse into the filesystem as the transaction is already
1634 * started */ 1634 * started */
1635 flags |= AOP_FLAG_NOFS; 1635 flags |= AOP_FLAG_NOFS;
1636 1636
1637 page = grab_cache_page_write_begin(mapping, index, flags); 1637 page = grab_cache_page_write_begin(mapping, index, flags);
1638 if (!page) { 1638 if (!page) {
1639 ext4_journal_stop(handle); 1639 ext4_journal_stop(handle);
1640 ret = -ENOMEM; 1640 ret = -ENOMEM;
1641 goto out; 1641 goto out;
1642 } 1642 }
1643 *pagep = page; 1643 *pagep = page;
1644 1644
1645 if (ext4_should_dioread_nolock(inode)) 1645 if (ext4_should_dioread_nolock(inode))
1646 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1646 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1647 else 1647 else
1648 ret = __block_write_begin(page, pos, len, ext4_get_block); 1648 ret = __block_write_begin(page, pos, len, ext4_get_block);
1649 1649
1650 if (!ret && ext4_should_journal_data(inode)) { 1650 if (!ret && ext4_should_journal_data(inode)) {
1651 ret = walk_page_buffers(handle, page_buffers(page), 1651 ret = walk_page_buffers(handle, page_buffers(page),
1652 from, to, NULL, do_journal_get_write_access); 1652 from, to, NULL, do_journal_get_write_access);
1653 } 1653 }
1654 1654
1655 if (ret) { 1655 if (ret) {
1656 unlock_page(page); 1656 unlock_page(page);
1657 page_cache_release(page); 1657 page_cache_release(page);
1658 /* 1658 /*
1659 * __block_write_begin may have instantiated a few blocks 1659 * __block_write_begin may have instantiated a few blocks
1660 * outside i_size. Trim these off again. Don't need 1660 * outside i_size. Trim these off again. Don't need
1661 * i_size_read because we hold i_mutex. 1661 * i_size_read because we hold i_mutex.
1662 * 1662 *
1663 * Add inode to orphan list in case we crash before 1663 * Add inode to orphan list in case we crash before
1664 * truncate finishes 1664 * truncate finishes
1665 */ 1665 */
1666 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1666 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1667 ext4_orphan_add(handle, inode); 1667 ext4_orphan_add(handle, inode);
1668 1668
1669 ext4_journal_stop(handle); 1669 ext4_journal_stop(handle);
1670 if (pos + len > inode->i_size) { 1670 if (pos + len > inode->i_size) {
1671 ext4_truncate_failed_write(inode); 1671 ext4_truncate_failed_write(inode);
1672 /* 1672 /*
1673 * If truncate failed early the inode might 1673 * If truncate failed early the inode might
1674 * still be on the orphan list; we need to 1674 * still be on the orphan list; we need to
1675 * make sure the inode is removed from the 1675 * make sure the inode is removed from the
1676 * orphan list in that case. 1676 * orphan list in that case.
1677 */ 1677 */
1678 if (inode->i_nlink) 1678 if (inode->i_nlink)
1679 ext4_orphan_del(NULL, inode); 1679 ext4_orphan_del(NULL, inode);
1680 } 1680 }
1681 } 1681 }
1682 1682
1683 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1683 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1684 goto retry; 1684 goto retry;
1685 out: 1685 out:
1686 return ret; 1686 return ret;
1687 } 1687 }
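The retry: label above implements a common ext4 idiom: an -ENOSPC from block allocation is retried for as long as ext4_should_retry_alloc() allows, giving the journal a chance to commit and free space. Stripped to its core (do_allocation() is a hypothetical stand-in for whichever step may fail):

static int alloc_with_retry_sketch(struct inode *inode)
{
	int retries = 0;
	int ret;

retry:
	ret = do_allocation(inode);	/* hypothetical allocation step */
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	return ret;
}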
1688 1688
1689 /* For write_end() in data=journal mode */ 1689 /* For write_end() in data=journal mode */
1690 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1690 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1691 { 1691 {
1692 if (!buffer_mapped(bh) || buffer_freed(bh)) 1692 if (!buffer_mapped(bh) || buffer_freed(bh))
1693 return 0; 1693 return 0;
1694 set_buffer_uptodate(bh); 1694 set_buffer_uptodate(bh);
1695 return ext4_handle_dirty_metadata(handle, NULL, bh); 1695 return ext4_handle_dirty_metadata(handle, NULL, bh);
1696 } 1696 }
1697 1697
1698 static int ext4_generic_write_end(struct file *file, 1698 static int ext4_generic_write_end(struct file *file,
1699 struct address_space *mapping, 1699 struct address_space *mapping,
1700 loff_t pos, unsigned len, unsigned copied, 1700 loff_t pos, unsigned len, unsigned copied,
1701 struct page *page, void *fsdata) 1701 struct page *page, void *fsdata)
1702 { 1702 {
1703 int i_size_changed = 0; 1703 int i_size_changed = 0;
1704 struct inode *inode = mapping->host; 1704 struct inode *inode = mapping->host;
1705 handle_t *handle = ext4_journal_current_handle(); 1705 handle_t *handle = ext4_journal_current_handle();
1706 1706
1707 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1707 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1708 1708
1709 /* 1709 /*
1710 * No need to use i_size_read() here, the i_size 1710 * No need to use i_size_read() here, the i_size
1711 * cannot change under us because we hold i_mutex. 1711 * cannot change under us because we hold i_mutex.
1712 * 1712 *
1713 * But it's important to update i_size while still holding page lock: 1713 * But it's important to update i_size while still holding page lock:
1714 * page writeout could otherwise come in and zero beyond i_size. 1714 * page writeout could otherwise come in and zero beyond i_size.
1715 */ 1715 */
1716 if (pos + copied > inode->i_size) { 1716 if (pos + copied > inode->i_size) {
1717 i_size_write(inode, pos + copied); 1717 i_size_write(inode, pos + copied);
1718 i_size_changed = 1; 1718 i_size_changed = 1;
1719 } 1719 }
1720 1720
1721 if (pos + copied > EXT4_I(inode)->i_disksize) { 1721 if (pos + copied > EXT4_I(inode)->i_disksize) {
1722 /* We need to mark inode dirty even if 1722 /* We need to mark inode dirty even if
1723 * new_i_size is less than inode->i_size 1723 * new_i_size is less than inode->i_size
1724 * but greater than i_disksize (hint: delalloc). 1724 * but greater than i_disksize (hint: delalloc).
1725 */ 1725 */
1726 ext4_update_i_disksize(inode, (pos + copied)); 1726 ext4_update_i_disksize(inode, (pos + copied));
1727 i_size_changed = 1; 1727 i_size_changed = 1;
1728 } 1728 }
1729 unlock_page(page); 1729 unlock_page(page);
1730 page_cache_release(page); 1730 page_cache_release(page);
1731 1731
1732 /* 1732 /*
1733 * Don't mark the inode dirty under page lock. First, it unnecessarily 1733 * Don't mark the inode dirty under page lock. First, it unnecessarily
1734 * makes the holding time of page lock longer. Second, it forces lock 1734 * makes the holding time of page lock longer. Second, it forces lock
1735 * ordering of page lock and transaction start for journaling 1735 * ordering of page lock and transaction start for journaling
1736 * filesystems. 1736 * filesystems.
1737 */ 1737 */
1738 if (i_size_changed) 1738 if (i_size_changed)
1739 ext4_mark_inode_dirty(handle, inode); 1739 ext4_mark_inode_dirty(handle, inode);
1740 1740
1741 return copied; 1741 return copied;
1742 } 1742 }
1743 1743
1744 /* 1744 /*
1745 * We need to pick up the new inode size which generic_commit_write gave us 1745 * We need to pick up the new inode size which generic_commit_write gave us
1746 * `file' can be NULL - eg, when called from page_symlink(). 1746 * `file' can be NULL - eg, when called from page_symlink().
1747 * 1747 *
1748 * ext4 never places buffers on inode->i_mapping->private_list; metadata 1748 * ext4 never places buffers on inode->i_mapping->private_list; metadata
1749 * buffers are managed internally. 1749 * buffers are managed internally.
1750 */ 1750 */
1751 static int ext4_ordered_write_end(struct file *file, 1751 static int ext4_ordered_write_end(struct file *file,
1752 struct address_space *mapping, 1752 struct address_space *mapping,
1753 loff_t pos, unsigned len, unsigned copied, 1753 loff_t pos, unsigned len, unsigned copied,
1754 struct page *page, void *fsdata) 1754 struct page *page, void *fsdata)
1755 { 1755 {
1756 handle_t *handle = ext4_journal_current_handle(); 1756 handle_t *handle = ext4_journal_current_handle();
1757 struct inode *inode = mapping->host; 1757 struct inode *inode = mapping->host;
1758 int ret = 0, ret2; 1758 int ret = 0, ret2;
1759 1759
1760 trace_ext4_ordered_write_end(inode, pos, len, copied); 1760 trace_ext4_ordered_write_end(inode, pos, len, copied);
1761 ret = ext4_jbd2_file_inode(handle, inode); 1761 ret = ext4_jbd2_file_inode(handle, inode);
1762 1762
1763 if (ret == 0) { 1763 if (ret == 0) {
1764 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1764 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1765 page, fsdata); 1765 page, fsdata);
1766 copied = ret2; 1766 copied = ret2;
1767 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1767 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1768 /* If we have allocated more blocks than we 1768 /* If we have allocated more blocks than we
1769 * copied, we will have blocks allocated outside 1769 * copied, we will have blocks allocated outside
1770 * inode->i_size, so truncate them. 1770 * inode->i_size, so truncate them.
1771 */ 1771 */
1772 ext4_orphan_add(handle, inode); 1772 ext4_orphan_add(handle, inode);
1773 if (ret2 < 0) 1773 if (ret2 < 0)
1774 ret = ret2; 1774 ret = ret2;
1775 } 1775 }
1776 ret2 = ext4_journal_stop(handle); 1776 ret2 = ext4_journal_stop(handle);
1777 if (!ret) 1777 if (!ret)
1778 ret = ret2; 1778 ret = ret2;
1779 1779
1780 if (pos + len > inode->i_size) { 1780 if (pos + len > inode->i_size) {
1781 ext4_truncate_failed_write(inode); 1781 ext4_truncate_failed_write(inode);
1782 /* 1782 /*
1783 * If truncate failed early the inode might still be 1783 * If truncate failed early the inode might still be
1784 * on the orphan list; we need to make sure the inode 1784 * on the orphan list; we need to make sure the inode
1785 * is removed from the orphan list in that case. 1785 * is removed from the orphan list in that case.
1786 */ 1786 */
1787 if (inode->i_nlink) 1787 if (inode->i_nlink)
1788 ext4_orphan_del(NULL, inode); 1788 ext4_orphan_del(NULL, inode);
1789 } 1789 }
1790 1790
1791 1791
1792 return ret ? ret : copied; 1792 return ret ? ret : copied;
1793 } 1793 }
1794 1794
1795 static int ext4_writeback_write_end(struct file *file, 1795 static int ext4_writeback_write_end(struct file *file,
1796 struct address_space *mapping, 1796 struct address_space *mapping,
1797 loff_t pos, unsigned len, unsigned copied, 1797 loff_t pos, unsigned len, unsigned copied,
1798 struct page *page, void *fsdata) 1798 struct page *page, void *fsdata)
1799 { 1799 {
1800 handle_t *handle = ext4_journal_current_handle(); 1800 handle_t *handle = ext4_journal_current_handle();
1801 struct inode *inode = mapping->host; 1801 struct inode *inode = mapping->host;
1802 int ret = 0, ret2; 1802 int ret = 0, ret2;
1803 1803
1804 trace_ext4_writeback_write_end(inode, pos, len, copied); 1804 trace_ext4_writeback_write_end(inode, pos, len, copied);
1805 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1805 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1806 page, fsdata); 1806 page, fsdata);
1807 copied = ret2; 1807 copied = ret2;
1808 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1808 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1809 /* If we have allocated more blocks than we 1809 /* If we have allocated more blocks than we
1810 * copied, we will have blocks allocated outside 1810 * copied, we will have blocks allocated outside
1811 * inode->i_size, so truncate them. 1811 * inode->i_size, so truncate them.
1812 */ 1812 */
1813 ext4_orphan_add(handle, inode); 1813 ext4_orphan_add(handle, inode);
1814 1814
1815 if (ret2 < 0) 1815 if (ret2 < 0)
1816 ret = ret2; 1816 ret = ret2;
1817 1817
1818 ret2 = ext4_journal_stop(handle); 1818 ret2 = ext4_journal_stop(handle);
1819 if (!ret) 1819 if (!ret)
1820 ret = ret2; 1820 ret = ret2;
1821 1821
1822 if (pos + len > inode->i_size) { 1822 if (pos + len > inode->i_size) {
1823 ext4_truncate_failed_write(inode); 1823 ext4_truncate_failed_write(inode);
1824 /* 1824 /*
1825 * If truncate failed early the inode might still be 1825 * If truncate failed early the inode might still be
1826 * on the orphan list; we need to make sure the inode 1826 * on the orphan list; we need to make sure the inode
1827 * is removed from the orphan list in that case. 1827 * is removed from the orphan list in that case.
1828 */ 1828 */
1829 if (inode->i_nlink) 1829 if (inode->i_nlink)
1830 ext4_orphan_del(NULL, inode); 1830 ext4_orphan_del(NULL, inode);
1831 } 1831 }
1832 1832
1833 return ret ? ret : copied; 1833 return ret ? ret : copied;
1834 } 1834 }
1835 1835
1836 static int ext4_journalled_write_end(struct file *file, 1836 static int ext4_journalled_write_end(struct file *file,
1837 struct address_space *mapping, 1837 struct address_space *mapping,
1838 loff_t pos, unsigned len, unsigned copied, 1838 loff_t pos, unsigned len, unsigned copied,
1839 struct page *page, void *fsdata) 1839 struct page *page, void *fsdata)
1840 { 1840 {
1841 handle_t *handle = ext4_journal_current_handle(); 1841 handle_t *handle = ext4_journal_current_handle();
1842 struct inode *inode = mapping->host; 1842 struct inode *inode = mapping->host;
1843 int ret = 0, ret2; 1843 int ret = 0, ret2;
1844 int partial = 0; 1844 int partial = 0;
1845 unsigned from, to; 1845 unsigned from, to;
1846 loff_t new_i_size; 1846 loff_t new_i_size;
1847 1847
1848 trace_ext4_journalled_write_end(inode, pos, len, copied); 1848 trace_ext4_journalled_write_end(inode, pos, len, copied);
1849 from = pos & (PAGE_CACHE_SIZE - 1); 1849 from = pos & (PAGE_CACHE_SIZE - 1);
1850 to = from + len; 1850 to = from + len;
1851 1851
1852 if (copied < len) { 1852 if (copied < len) {
1853 if (!PageUptodate(page)) 1853 if (!PageUptodate(page))
1854 copied = 0; 1854 copied = 0;
1855 page_zero_new_buffers(page, from+copied, to); 1855 page_zero_new_buffers(page, from+copied, to);
1856 } 1856 }
1857 1857
1858 ret = walk_page_buffers(handle, page_buffers(page), from, 1858 ret = walk_page_buffers(handle, page_buffers(page), from,
1859 to, &partial, write_end_fn); 1859 to, &partial, write_end_fn);
1860 if (!partial) 1860 if (!partial)
1861 SetPageUptodate(page); 1861 SetPageUptodate(page);
1862 new_i_size = pos + copied; 1862 new_i_size = pos + copied;
1863 if (new_i_size > inode->i_size) 1863 if (new_i_size > inode->i_size)
1864 i_size_write(inode, pos+copied); 1864 i_size_write(inode, pos+copied);
1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1866 if (new_i_size > EXT4_I(inode)->i_disksize) { 1866 if (new_i_size > EXT4_I(inode)->i_disksize) {
1867 ext4_update_i_disksize(inode, new_i_size); 1867 ext4_update_i_disksize(inode, new_i_size);
1868 ret2 = ext4_mark_inode_dirty(handle, inode); 1868 ret2 = ext4_mark_inode_dirty(handle, inode);
1869 if (!ret) 1869 if (!ret)
1870 ret = ret2; 1870 ret = ret2;
1871 } 1871 }
1872 1872
1873 unlock_page(page); 1873 unlock_page(page);
1874 page_cache_release(page); 1874 page_cache_release(page);
1875 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1875 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1876 /* If we have allocated more blocks than we 1876 /* If we have allocated more blocks than we
1877 * copied, we will have blocks allocated outside 1877 * copied, we will have blocks allocated outside
1878 * inode->i_size, so truncate them. 1878 * inode->i_size, so truncate them.
1879 */ 1879 */
1880 ext4_orphan_add(handle, inode); 1880 ext4_orphan_add(handle, inode);
1881 1881
1882 ret2 = ext4_journal_stop(handle); 1882 ret2 = ext4_journal_stop(handle);
1883 if (!ret) 1883 if (!ret)
1884 ret = ret2; 1884 ret = ret2;
1885 if (pos + len > inode->i_size) { 1885 if (pos + len > inode->i_size) {
1886 ext4_truncate_failed_write(inode); 1886 ext4_truncate_failed_write(inode);
1887 /* 1887 /*
1888 * If truncate failed early the inode might still be 1888 * If truncate failed early the inode might still be
1889 * on the orphan list; we need to make sure the inode 1889 * on the orphan list; we need to make sure the inode
1890 * is removed from the orphan list in that case. 1890 * is removed from the orphan list in that case.
1891 */ 1891 */
1892 if (inode->i_nlink) 1892 if (inode->i_nlink)
1893 ext4_orphan_del(NULL, inode); 1893 ext4_orphan_del(NULL, inode);
1894 } 1894 }
1895 1895
1896 return ret ? ret : copied; 1896 return ret ? ret : copied;
1897 } 1897 }
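The three ->write_end variants above correspond to the data=ordered, data=writeback and data=journal journalling modes; each is paired with ext4_write_begin() through the inode's address_space_operations. A sketch of that wiring for the ordered case (the table name is an assumption; the real tables are defined elsewhere in this file):

static const struct address_space_operations ext4_ordered_aops_sketch = {
	.write_begin	= ext4_write_begin,
	.write_end	= ext4_ordered_write_end,
	/* ...readpage, writepage, bmap, etc. omitted... */
};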
1898 1898
1899 /* 1899 /*
1900 * Reserve a single block located at lblock 1900 * Reserve a single block located at lblock
1901 */ 1901 */
1902 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1902 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1903 { 1903 {
1904 int retries = 0; 1904 int retries = 0;
1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1906 struct ext4_inode_info *ei = EXT4_I(inode); 1906 struct ext4_inode_info *ei = EXT4_I(inode);
1907 unsigned long md_needed; 1907 unsigned long md_needed;
1908 int ret; 1908 int ret;
1909 1909
1910 /* 1910 /*
1911 * Recalculate the number of metadata blocks to reserve 1911 * Recalculate the number of metadata blocks to reserve
1912 * in order to allocate nrblocks; the 1912 * in order to allocate nrblocks; the
1913 * worst case is one extent per block. 1913 * worst case is one extent per block.
1914 */ 1914 */
1915 repeat: 1915 repeat:
1916 spin_lock(&ei->i_block_reservation_lock); 1916 spin_lock(&ei->i_block_reservation_lock);
1917 md_needed = ext4_calc_metadata_amount(inode, lblock); 1917 md_needed = ext4_calc_metadata_amount(inode, lblock);
1918 trace_ext4_da_reserve_space(inode, md_needed); 1918 trace_ext4_da_reserve_space(inode, md_needed);
1919 spin_unlock(&ei->i_block_reservation_lock); 1919 spin_unlock(&ei->i_block_reservation_lock);
1920 1920
1921 /* 1921 /*
1922 * We will charge metadata quota at writeout time; this saves 1922 * We will charge metadata quota at writeout time; this saves
1923 * us from metadata over-estimation, though we may go over by 1923 * us from metadata over-estimation, though we may go over by
1924 * a small amount in the end. Here we just reserve for data. 1924 * a small amount in the end. Here we just reserve for data.
1925 */ 1925 */
1926 ret = dquot_reserve_block(inode, 1); 1926 ret = dquot_reserve_block(inode, 1);
1927 if (ret) 1927 if (ret)
1928 return ret; 1928 return ret;
1929 /* 1929 /*
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
1937 goto repeat; 1937 goto repeat;
1938 } 1938 }
1939 return -ENOSPC; 1939 return -ENOSPC;
1940 } 1940 }
1941 spin_lock(&ei->i_block_reservation_lock); 1941 spin_lock(&ei->i_block_reservation_lock);
1942 ei->i_reserved_data_blocks++; 1942 ei->i_reserved_data_blocks++;
1943 ei->i_reserved_meta_blocks += md_needed; 1943 ei->i_reserved_meta_blocks += md_needed;
1944 spin_unlock(&ei->i_block_reservation_lock); 1944 spin_unlock(&ei->i_block_reservation_lock);
1945 1945
1946 return 0; /* success */ 1946 return 0; /* success */
1947 } 1947 }
1948 1948
1949 static void ext4_da_release_space(struct inode *inode, int to_free) 1949 static void ext4_da_release_space(struct inode *inode, int to_free)
1950 { 1950 {
1951 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1951 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1952 struct ext4_inode_info *ei = EXT4_I(inode); 1952 struct ext4_inode_info *ei = EXT4_I(inode);
1953 1953
1954 if (!to_free) 1954 if (!to_free)
1955 return; /* Nothing to release, exit */ 1955 return; /* Nothing to release, exit */
1956 1956
1957 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1957 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1958 1958
1959 trace_ext4_da_release_space(inode, to_free); 1959 trace_ext4_da_release_space(inode, to_free);
1960 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1960 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1961 /* 1961 /*
1962 * if there aren't enough reserved blocks, then the 1962 * if there aren't enough reserved blocks, then the
1963 * counter is messed up somewhere. Since this 1963 * counter is messed up somewhere. Since this
1964 * function is called from invalidate page, it's 1964 * function is called from invalidate page, it's
1965 * harmless to return without any action. 1965 * harmless to return without any action.
1966 */ 1966 */
1967 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " 1967 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1968 "ino %lu, to_free %d with only %d reserved " 1968 "ino %lu, to_free %d with only %d reserved "
1969 "data blocks\n", inode->i_ino, to_free, 1969 "data blocks\n", inode->i_ino, to_free,
1970 ei->i_reserved_data_blocks); 1970 ei->i_reserved_data_blocks);
1971 WARN_ON(1); 1971 WARN_ON(1);
1972 to_free = ei->i_reserved_data_blocks; 1972 to_free = ei->i_reserved_data_blocks;
1973 } 1973 }
1974 ei->i_reserved_data_blocks -= to_free; 1974 ei->i_reserved_data_blocks -= to_free;
1975 1975
1976 if (ei->i_reserved_data_blocks == 0) { 1976 if (ei->i_reserved_data_blocks == 0) {
1977 /* 1977 /*
1978 * We can release all of the reserved metadata blocks 1978 * We can release all of the reserved metadata blocks
1979 * only when we have written all of the delayed 1979 * only when we have written all of the delayed
1980 * allocation blocks. 1980 * allocation blocks.
1981 */ 1981 */
1982 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1982 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1983 ei->i_reserved_meta_blocks); 1983 ei->i_reserved_meta_blocks);
1984 ei->i_reserved_meta_blocks = 0; 1984 ei->i_reserved_meta_blocks = 0;
1985 ei->i_da_metadata_calc_len = 0; 1985 ei->i_da_metadata_calc_len = 0;
1986 } 1986 }
1987 1987
1988 /* update fs dirty data blocks counter */ 1988 /* update fs dirty data blocks counter */
1989 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1989 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1990 1990
1991 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1991 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1992 1992
1993 dquot_release_reservation_block(inode, to_free); 1993 dquot_release_reservation_block(inode, to_free);
1994 } 1994 }
1995 1995
1996 static void ext4_da_page_release_reservation(struct page *page, 1996 static void ext4_da_page_release_reservation(struct page *page,
1997 unsigned long offset) 1997 unsigned long offset)
1998 { 1998 {
1999 int to_release = 0; 1999 int to_release = 0;
2000 struct buffer_head *head, *bh; 2000 struct buffer_head *head, *bh;
2001 unsigned int curr_off = 0; 2001 unsigned int curr_off = 0;
2002 2002
2003 head = page_buffers(page); 2003 head = page_buffers(page);
2004 bh = head; 2004 bh = head;
2005 do { 2005 do {
2006 unsigned int next_off = curr_off + bh->b_size; 2006 unsigned int next_off = curr_off + bh->b_size;
2007 2007
2008 if ((offset <= curr_off) && (buffer_delay(bh))) { 2008 if ((offset <= curr_off) && (buffer_delay(bh))) {
2009 to_release++; 2009 to_release++;
2010 clear_buffer_delay(bh); 2010 clear_buffer_delay(bh);
2011 } 2011 }
2012 curr_off = next_off; 2012 curr_off = next_off;
2013 } while ((bh = bh->b_this_page) != head); 2013 } while ((bh = bh->b_this_page) != head);
2014 ext4_da_release_space(page->mapping->host, to_release); 2014 ext4_da_release_space(page->mapping->host, to_release);
2015 } 2015 }
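Taken together, ext4_da_reserve_space() and the release paths form the delayed-allocation bookkeeping: a write that instantiates a delayed block reserves one data block plus an estimate of metadata up front, and the reservation is consumed either when the blocks are finally allocated (ext4_da_update_reserve_space(), called from ext4_map_blocks() above) or dropped when the page is thrown away, as here. An annotated trace for a single delayed block, assuming quota and free-block checks succeed and that ext4_claim_free_blocks() charges s_dirtyblocks_counter for the claimed blocks:

	ext4_da_reserve_space(inode, lblk);
		/* ei->i_reserved_data_blocks += 1
		 * ei->i_reserved_meta_blocks += md_needed
		 * sbi->s_dirtyblocks_counter += md_needed + 1 (via ext4_claim_free_blocks())
		 * quota: one block reserved via dquot_reserve_block() */

	/* ... page later invalidated while the block is still delayed ... */
	ext4_da_page_release_reservation(page, 0);
		/* counts the buffer_delay buffers and calls
		 * ext4_da_release_space(inode, 1):
		 *   ei->i_reserved_data_blocks -= 1
		 *   sbi->s_dirtyblocks_counter -= 1 (plus all reserved metadata
		 *                                    once no delayed data remains)
		 *   quota reservation dropped via dquot_release_reservation_block() */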
2016 2016
2017 /* 2017 /*
2018 * Delayed allocation stuff 2018 * Delayed allocation stuff
2019 */ 2019 */
2020 2020
2021 /* 2021 /*
2022 * mpage_da_submit_io - walks through the extent of pages and tries to write 2022 * mpage_da_submit_io - walks through the extent of pages and tries to write
2023 * them with the writepage() callback 2023 * them with the writepage() callback
2024 * 2024 *
2025 * @mpd->inode: inode 2025 * @mpd->inode: inode
2026 * @mpd->first_page: first page of the extent 2026 * @mpd->first_page: first page of the extent
2027 * @mpd->next_page: page after the last page of the extent 2027 * @mpd->next_page: page after the last page of the extent
2028 * 2028 *
2029 * By the time mpage_da_submit_io() is called we expect all blocks 2029 * By the time mpage_da_submit_io() is called we expect all blocks
2030 * to be allocated; this may be wrong if allocation failed. 2030 * to be allocated; this may be wrong if allocation failed.
2031 * 2031 *
2032 * As pages are already locked by write_cache_pages(), we can't use it 2032 * As pages are already locked by write_cache_pages(), we can't use it
2033 */ 2033 */
2034 static int mpage_da_submit_io(struct mpage_da_data *mpd, 2034 static int mpage_da_submit_io(struct mpage_da_data *mpd,
2035 struct ext4_map_blocks *map) 2035 struct ext4_map_blocks *map)
2036 { 2036 {
2037 struct pagevec pvec; 2037 struct pagevec pvec;
2038 unsigned long index, end; 2038 unsigned long index, end;
2039 int ret = 0, err, nr_pages, i; 2039 int ret = 0, err, nr_pages, i;
2040 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2041 struct address_space *mapping = inode->i_mapping; 2041 struct address_space *mapping = inode->i_mapping;
2042 loff_t size = i_size_read(inode); 2042 loff_t size = i_size_read(inode);
2043 unsigned int len, block_start; 2043 unsigned int len, block_start;
2044 struct buffer_head *bh, *page_bufs = NULL; 2044 struct buffer_head *bh, *page_bufs = NULL;
2045 int journal_data = ext4_should_journal_data(inode); 2045 int journal_data = ext4_should_journal_data(inode);
2046 sector_t pblock = 0, cur_logical = 0; 2046 sector_t pblock = 0, cur_logical = 0;
2047 struct ext4_io_submit io_submit; 2047 struct ext4_io_submit io_submit;
2048 2048
2049 BUG_ON(mpd->next_page <= mpd->first_page); 2049 BUG_ON(mpd->next_page <= mpd->first_page);
2050 memset(&io_submit, 0, sizeof(io_submit)); 2050 memset(&io_submit, 0, sizeof(io_submit));
2051 /* 2051 /*
2052 * We need to start from the first_page to the next_page - 1 2052 * We need to start from the first_page to the next_page - 1
2053 * to make sure we also write the mapped dirty buffer_heads. 2053 * to make sure we also write the mapped dirty buffer_heads.
2054 * If we look at mpd->b_blocknr we would only be looking 2054 * If we look at mpd->b_blocknr we would only be looking
2055 * at the currently mapped buffer_heads. 2055 * at the currently mapped buffer_heads.
2056 */ 2056 */
2057 index = mpd->first_page; 2057 index = mpd->first_page;
2058 end = mpd->next_page - 1; 2058 end = mpd->next_page - 1;
2059 2059
2060 pagevec_init(&pvec, 0); 2060 pagevec_init(&pvec, 0);
2061 while (index <= end) { 2061 while (index <= end) {
2062 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2062 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2063 if (nr_pages == 0) 2063 if (nr_pages == 0)
2064 break; 2064 break;
2065 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2066 int commit_write = 0, skip_page = 0; 2066 int commit_write = 0, skip_page = 0;
2067 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2068 2068
2069 index = page->index; 2069 index = page->index;
2070 if (index > end) 2070 if (index > end)
2071 break; 2071 break;
2072 2072
2073 if (index == size >> PAGE_CACHE_SHIFT) 2073 if (index == size >> PAGE_CACHE_SHIFT)
2074 len = size & ~PAGE_CACHE_MASK; 2074 len = size & ~PAGE_CACHE_MASK;
2075 else 2075 else
2076 len = PAGE_CACHE_SIZE; 2076 len = PAGE_CACHE_SIZE;
2077 if (map) { 2077 if (map) {
2078 cur_logical = index << (PAGE_CACHE_SHIFT - 2078 cur_logical = index << (PAGE_CACHE_SHIFT -
2079 inode->i_blkbits); 2079 inode->i_blkbits);
2080 pblock = map->m_pblk + (cur_logical - 2080 pblock = map->m_pblk + (cur_logical -
2081 map->m_lblk); 2081 map->m_lblk);
2082 } 2082 }
2083 index++; 2083 index++;
2084 2084
2085 BUG_ON(!PageLocked(page)); 2085 BUG_ON(!PageLocked(page));
2086 BUG_ON(PageWriteback(page)); 2086 BUG_ON(PageWriteback(page));
2087 2087
2088 /* 2088 /*
2089 * If the page does not have buffers (for 2089 * If the page does not have buffers (for
2090 * whatever reason), try to create them using 2090 * whatever reason), try to create them using
2091 * __block_write_begin. If this fails, 2091 * __block_write_begin. If this fails,
2092 * skip the page and move on. 2092 * skip the page and move on.
2093 */ 2093 */
2094 if (!page_has_buffers(page)) { 2094 if (!page_has_buffers(page)) {
2095 if (__block_write_begin(page, 0, len, 2095 if (__block_write_begin(page, 0, len,
2096 noalloc_get_block_write)) { 2096 noalloc_get_block_write)) {
2097 skip_page: 2097 skip_page:
2098 unlock_page(page); 2098 unlock_page(page);
2099 continue; 2099 continue;
2100 } 2100 }
2101 commit_write = 1; 2101 commit_write = 1;
2102 } 2102 }
2103 2103
2104 bh = page_bufs = page_buffers(page); 2104 bh = page_bufs = page_buffers(page);
2105 block_start = 0; 2105 block_start = 0;
2106 do { 2106 do {
2107 if (!bh) 2107 if (!bh)
2108 goto skip_page; 2108 goto skip_page;
2109 if (map && (cur_logical >= map->m_lblk) && 2109 if (map && (cur_logical >= map->m_lblk) &&
2110 (cur_logical <= (map->m_lblk + 2110 (cur_logical <= (map->m_lblk +
2111 (map->m_len - 1)))) { 2111 (map->m_len - 1)))) {
2112 if (buffer_delay(bh)) { 2112 if (buffer_delay(bh)) {
2113 clear_buffer_delay(bh); 2113 clear_buffer_delay(bh);
2114 bh->b_blocknr = pblock; 2114 bh->b_blocknr = pblock;
2115 } 2115 }
2116 if (buffer_unwritten(bh) || 2116 if (buffer_unwritten(bh) ||
2117 buffer_mapped(bh)) 2117 buffer_mapped(bh))
2118 BUG_ON(bh->b_blocknr != pblock); 2118 BUG_ON(bh->b_blocknr != pblock);
2119 if (map->m_flags & EXT4_MAP_UNINIT) 2119 if (map->m_flags & EXT4_MAP_UNINIT)
2120 set_buffer_uninit(bh); 2120 set_buffer_uninit(bh);
2121 clear_buffer_unwritten(bh); 2121 clear_buffer_unwritten(bh);
2122 } 2122 }
2123 2123
2124 /* skip page if block allocation undone */ 2124 /* skip page if block allocation undone */
2125 if (buffer_delay(bh) || buffer_unwritten(bh)) 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2126 skip_page = 1; 2126 skip_page = 1;
2127 bh = bh->b_this_page; 2127 bh = bh->b_this_page;
2128 block_start += bh->b_size; 2128 block_start += bh->b_size;
2129 cur_logical++; 2129 cur_logical++;
2130 pblock++; 2130 pblock++;
2131 } while (bh != page_bufs); 2131 } while (bh != page_bufs);
2132 2132
2133 if (skip_page) 2133 if (skip_page)
2134 goto skip_page; 2134 goto skip_page;
2135 2135
2136 if (commit_write) 2136 if (commit_write)
2137 /* mark the buffer_heads as dirty & uptodate */ 2137 /* mark the buffer_heads as dirty & uptodate */
2138 block_commit_write(page, 0, len); 2138 block_commit_write(page, 0, len);
2139 2139
2140 clear_page_dirty_for_io(page); 2140 clear_page_dirty_for_io(page);
2141 /* 2141 /*
2142 * Delalloc doesn't support data journalling, 2142 * Delalloc doesn't support data journalling,
2143 * but eventually maybe we'll lift this 2143 * but eventually maybe we'll lift this
2144 * restriction. 2144 * restriction.
2145 */ 2145 */
2146 if (unlikely(journal_data && PageChecked(page))) 2146 if (unlikely(journal_data && PageChecked(page)))
2147 err = __ext4_journalled_writepage(page, len); 2147 err = __ext4_journalled_writepage(page, len);
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) 2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page, 2149 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc); 2150 len, mpd->wbc);
2151 else 2151 else
2152 err = block_write_full_page(page, 2152 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc); 2153 noalloc_get_block_write, mpd->wbc);
2154 2154
2155 if (!err) 2155 if (!err)
2156 mpd->pages_written++; 2156 mpd->pages_written++;
2157 /* 2157 /*
2158 * In error case, we have to continue because 2158 * In error case, we have to continue because
2159 * remaining pages are still locked 2159 * remaining pages are still locked
2160 */ 2160 */
2161 if (ret == 0) 2161 if (ret == 0)
2162 ret = err; 2162 ret = err;
2163 } 2163 }
2164 pagevec_release(&pvec); 2164 pagevec_release(&pvec);
2165 } 2165 }
2166 ext4_io_submit(&io_submit); 2166 ext4_io_submit(&io_submit);
2167 return ret; 2167 return ret;
2168 } 2168 }
2169 2169
2170 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) 2170 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2171 { 2171 {
2172 int nr_pages, i; 2172 int nr_pages, i;
2173 pgoff_t index, end; 2173 pgoff_t index, end;
2174 struct pagevec pvec; 2174 struct pagevec pvec;
2175 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2176 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2177 2177
2178 index = mpd->first_page; 2178 index = mpd->first_page;
2179 end = mpd->next_page - 1; 2179 end = mpd->next_page - 1;
2180 while (index <= end) { 2180 while (index <= end) {
2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2182 if (nr_pages == 0) 2182 if (nr_pages == 0)
2183 break; 2183 break;
2184 for (i = 0; i < nr_pages; i++) { 2184 for (i = 0; i < nr_pages; i++) {
2185 struct page *page = pvec.pages[i]; 2185 struct page *page = pvec.pages[i];
2186 if (page->index > end) 2186 if (page->index > end)
2187 break; 2187 break;
2188 BUG_ON(!PageLocked(page)); 2188 BUG_ON(!PageLocked(page));
2189 BUG_ON(PageWriteback(page)); 2189 BUG_ON(PageWriteback(page));
2190 block_invalidatepage(page, 0); 2190 block_invalidatepage(page, 0);
2191 ClearPageUptodate(page); 2191 ClearPageUptodate(page);
2192 unlock_page(page); 2192 unlock_page(page);
2193 } 2193 }
2194 index = pvec.pages[nr_pages - 1]->index + 1; 2194 index = pvec.pages[nr_pages - 1]->index + 1;
2195 pagevec_release(&pvec); 2195 pagevec_release(&pvec);
2196 } 2196 }
2197 return; 2197 return;
2198 } 2198 }
2199 2199
2200 static void ext4_print_free_blocks(struct inode *inode) 2200 static void ext4_print_free_blocks(struct inode *inode)
2201 { 2201 {
2202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2203 printk(KERN_CRIT "Total free blocks count %lld\n", 2203 printk(KERN_CRIT "Total free blocks count %lld\n",
2204 ext4_count_free_blocks(inode->i_sb)); 2204 ext4_count_free_blocks(inode->i_sb));
2205 printk(KERN_CRIT "Free/Dirty block details\n"); 2205 printk(KERN_CRIT "Free/Dirty block details\n");
2206 printk(KERN_CRIT "free_blocks=%lld\n", 2206 printk(KERN_CRIT "free_blocks=%lld\n",
2207 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); 2207 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2208 printk(KERN_CRIT "dirty_blocks=%lld\n", 2208 printk(KERN_CRIT "dirty_blocks=%lld\n",
2209 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2209 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2210 printk(KERN_CRIT "Block reservation details\n"); 2210 printk(KERN_CRIT "Block reservation details\n");
2211 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 2211 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2212 EXT4_I(inode)->i_reserved_data_blocks); 2212 EXT4_I(inode)->i_reserved_data_blocks);
2213 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", 2213 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2214 EXT4_I(inode)->i_reserved_meta_blocks); 2214 EXT4_I(inode)->i_reserved_meta_blocks);
2215 return; 2215 return;
2216 } 2216 }
2217 2217
2218 /* 2218 /*
2219 * mpage_da_map_and_submit - go through the given space, map the blocks 2219 * mpage_da_map_and_submit - go through the given space, map the blocks
2220 * if necessary, and then submit them for I/O 2220 * if necessary, and then submit them for I/O
2221 * 2221 *
2222 * @mpd - bh describing space 2222 * @mpd - bh describing space
2223 * 2223 *
2224 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2225 * 2225 *
2226 */ 2226 */
2227 static void mpage_da_map_and_submit(struct mpage_da_data *mpd) 2227 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2228 { 2228 {
2229 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2230 struct ext4_map_blocks map, *mapp = NULL; 2230 struct ext4_map_blocks map, *mapp = NULL;
2231 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2234 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2235 2235
2236 /* 2236 /*
2237 * If the blocks are mapped already, or we couldn't accumulate 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage. 2238 * any blocks, then proceed immediately to the submission stage.
2239 */ 2239 */
2240 if ((mpd->b_size == 0) || 2240 if ((mpd->b_size == 0) ||
2241 ((mpd->b_state & (1 << BH_Mapped)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2242 !(mpd->b_state & (1 << BH_Delay)) && 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2243 !(mpd->b_state & (1 << BH_Unwritten)))) 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2244 goto submit_io; 2244 goto submit_io;
2245 2245
2246 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2247 BUG_ON(!handle); 2247 BUG_ON(!handle);
2248 2248
2249 /* 2249 /*
2250 * Call ext4_map_blocks() to allocate any delayed allocation 2250 * Call ext4_map_blocks() to allocate any delayed allocation
2251 * blocks, or to convert an uninitialized extent to be 2251 * blocks, or to convert an uninitialized extent to be
2252 * initialized (in the case where we have written into 2252 * initialized (in the case where we have written into
2253 * one or more preallocated blocks). 2253 * one or more preallocated blocks).
2254 * 2254 *
2255 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to 2255 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2256 * indicate that we are on the delayed allocation path. This 2256 * indicate that we are on the delayed allocation path. This
2257 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2258 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2259 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2261 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2262 * 2262 *
2263 * If the blocks in question were delalloc blocks, set 2263 * If the blocks in question were delalloc blocks, set
2264 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2264 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2265 * variables are updated after the blocks have been allocated. 2265 * variables are updated after the blocks have been allocated.
2266 */ 2266 */
2267 map.m_lblk = next; 2267 map.m_lblk = next;
2268 map.m_len = max_blocks; 2268 map.m_len = max_blocks;
2269 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2269 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2270 if (ext4_should_dioread_nolock(mpd->inode)) 2270 if (ext4_should_dioread_nolock(mpd->inode))
2271 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2271 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2272 if (mpd->b_state & (1 << BH_Delay)) 2272 if (mpd->b_state & (1 << BH_Delay))
2273 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2273 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2274 2274
2275 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 2275 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2276 if (blks < 0) { 2276 if (blks < 0) {
2277 struct super_block *sb = mpd->inode->i_sb; 2277 struct super_block *sb = mpd->inode->i_sb;
2278 2278
2279 err = blks; 2279 err = blks;
2280 /* 2280 /*
2281 * If get_block returns EAGAIN or ENOSPC and there 2281 * If get_block returns EAGAIN or ENOSPC and there
2282 * appear to be free blocks, we will just let 2282 * appear to be free blocks, we will just let
2283 * mpage_da_submit_io() unlock all of the pages. 2283 * mpage_da_submit_io() unlock all of the pages.
2284 */ 2284 */
2285 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2286 goto submit_io; 2286 goto submit_io;
2287 2287
2288 if (err == -ENOSPC && 2288 if (err == -ENOSPC &&
2289 ext4_count_free_blocks(sb)) { 2289 ext4_count_free_blocks(sb)) {
2290 mpd->retval = err; 2290 mpd->retval = err;
2291 goto submit_io; 2291 goto submit_io;
2292 } 2292 }
2293 2293
2294 /* 2294 /*
2295 * get block failure will cause us to loop in 2295 * get block failure will cause us to loop in
2296 * writepages, because a_ops->writepage won't be able 2296 * writepages, because a_ops->writepage won't be able
2297 * to make progress. The page will be redirtied by 2297 * to make progress. The page will be redirtied by
2298 * writepage and writepages will again try to write 2298 * writepage and writepages will again try to write
2299 * the same. 2299 * the same.
2300 */ 2300 */
2301 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { 2301 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2302 ext4_msg(sb, KERN_CRIT, 2302 ext4_msg(sb, KERN_CRIT,
2303 "delayed block allocation failed for inode %lu " 2303 "delayed block allocation failed for inode %lu "
2304 "at logical offset %llu with max blocks %zd " 2304 "at logical offset %llu with max blocks %zd "
2305 "with error %d", mpd->inode->i_ino, 2305 "with error %d", mpd->inode->i_ino,
2306 (unsigned long long) next, 2306 (unsigned long long) next,
2307 mpd->b_size >> mpd->inode->i_blkbits, err); 2307 mpd->b_size >> mpd->inode->i_blkbits, err);
2308 ext4_msg(sb, KERN_CRIT, 2308 ext4_msg(sb, KERN_CRIT,
2309 "This should not happen!! Data will be lost\n"); 2309 "This should not happen!! Data will be lost\n");
2310 if (err == -ENOSPC) 2310 if (err == -ENOSPC)
2311 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2312 } 2312 }
2313 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2314 ext4_da_block_invalidatepages(mpd); 2314 ext4_da_block_invalidatepages(mpd);
2315 2315
2316 /* Mark this page range as having been completed */ 2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1; 2317 mpd->io_done = 1;
2318 return; 2318 return;
2319 } 2319 }
2320 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
2321 2321
2322 mapp = &map; 2322 mapp = &map;
2323 if (map.m_flags & EXT4_MAP_NEW) { 2323 if (map.m_flags & EXT4_MAP_NEW) {
2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2325 int i; 2325 int i;
2326 2326
2327 for (i = 0; i < map.m_len; i++) 2327 for (i = 0; i < map.m_len; i++)
2328 unmap_underlying_metadata(bdev, map.m_pblk + i); 2328 unmap_underlying_metadata(bdev, map.m_pblk + i);
2329 } 2329 }
2330 2330
2331 if (ext4_should_order_data(mpd->inode)) { 2331 if (ext4_should_order_data(mpd->inode)) {
2332 err = ext4_jbd2_file_inode(handle, mpd->inode); 2332 err = ext4_jbd2_file_inode(handle, mpd->inode);
2333 if (err) 2333 if (err)
2334 /* This only happens if the journal is aborted */ 2334 /* This only happens if the journal is aborted */
2335 return; 2335 return;
2336 } 2336 }
2337 2337
2338 /* 2338 /*
2339 * Update on-disk size along with block allocation. 2339 * Update on-disk size along with block allocation.
2340 */ 2340 */
2341 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; 2341 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2342 if (disksize > i_size_read(mpd->inode)) 2342 if (disksize > i_size_read(mpd->inode))
2343 disksize = i_size_read(mpd->inode); 2343 disksize = i_size_read(mpd->inode);
2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2345 ext4_update_i_disksize(mpd->inode, disksize); 2345 ext4_update_i_disksize(mpd->inode, disksize);
2346 err = ext4_mark_inode_dirty(handle, mpd->inode); 2346 err = ext4_mark_inode_dirty(handle, mpd->inode);
2347 if (err) 2347 if (err)
2348 ext4_error(mpd->inode->i_sb, 2348 ext4_error(mpd->inode->i_sb,
2349 "Failed to mark inode %lu dirty", 2349 "Failed to mark inode %lu dirty",
2350 mpd->inode->i_ino); 2350 mpd->inode->i_ino);
2351 } 2351 }
2352 2352
2353 submit_io: 2353 submit_io:
2354 mpage_da_submit_io(mpd, mapp); 2354 mpage_da_submit_io(mpd, mapp);
2355 mpd->io_done = 1; 2355 mpd->io_done = 1;
2356 } 2356 }
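The decision at the top of mpage_da_map_and_submit() is a bit test on the accumulated extent's buffer state: if the extent is empty, or it is already mapped and carries neither a delayed nor an unwritten block, no allocation is needed and the pages go straight to submission. The stand-alone sketch below is a simplified user-space model of just that predicate; the BH_* values are illustrative stand-ins, not the kernel's buffer-head bit numbers.

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins for the buffer-head state bits tested above. */
#define BH_MAPPED    (1u << 0)
#define BH_DELAY     (1u << 1)
#define BH_UNWRITTEN (1u << 2)

/*
 * Return nonzero when an accumulated extent needs no block allocation:
 * it is either empty, or already mapped with no delayed/unwritten blocks.
 */
static int can_skip_mapping(size_t b_size, unsigned int b_state)
{
	return (b_size == 0) ||
	       ((b_state & BH_MAPPED) &&
	        !(b_state & BH_DELAY) &&
	        !(b_state & BH_UNWRITTEN));
}

int main(void)
{
	/* Delayed-allocation extent: the block allocator must run first. */
	printf("delalloc extent: skip=%d\n",
	       can_skip_mapping(4096, BH_MAPPED | BH_DELAY));
	/* Fully mapped extent: proceed directly to submission. */
	printf("mapped extent:   skip=%d\n",
	       can_skip_mapping(4096, BH_MAPPED));
	return 0;
}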
2357 2357
2358 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2358 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
2359 (1 << BH_Delay) | (1 << BH_Unwritten)) 2359 (1 << BH_Delay) | (1 << BH_Unwritten))
2360 2360
2361 /* 2361 /*
2362 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 2362 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
2363 * 2363 *
2364 * @mpd->lbh - extent of blocks 2364 * @mpd->lbh - extent of blocks
2365 * @logical - logical number of the block in the file 2365 * @logical - logical number of the block in the file
2366 * @bh - bh of the block (used to access block's state) 2366 * @bh - bh of the block (used to access block's state)
2367 * 2367 *
2368 * This function is used to collect contiguous blocks in the same state 2368 * This function is used to collect contiguous blocks in the same state
2369 */ 2369 */
2370 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2370 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2371 sector_t logical, size_t b_size, 2371 sector_t logical, size_t b_size,
2372 unsigned long b_state) 2372 unsigned long b_state)
2373 { 2373 {
2374 sector_t next; 2374 sector_t next;
2375 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2375 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2376 2376
2377 /* 2377 /*
2378 * XXX Don't go larger than mballoc is willing to allocate 2378 * XXX Don't go larger than mballoc is willing to allocate
2379 * This is a stopgap solution. We eventually need to fold 2379 * This is a stopgap solution. We eventually need to fold
2380 * mpage_da_submit_io() into this function and then call 2380 * mpage_da_submit_io() into this function and then call
2381 * ext4_map_blocks() multiple times in a loop 2381 * ext4_map_blocks() multiple times in a loop
2382 */ 2382 */
2383 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) 2383 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2384 goto flush_it; 2384 goto flush_it;
2385 2385
2386 /* check if the reserved journal credits might overflow */ 2386 /* check if the reserved journal credits might overflow */
2387 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { 2387 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2388 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2388 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2389 /* 2389 /*
2390 * With the non-extent format we are limited by the journal 2390 * With the non-extent format we are limited by the journal
2391 * credits available. The total credits needed to insert 2391 * credits available. The total credits needed to insert
2392 * nrblocks contiguous blocks depend on 2392 * nrblocks contiguous blocks depend on
2393 * nrblocks. So limit nrblocks. 2393 * nrblocks. So limit nrblocks.
2394 */ 2394 */
2395 goto flush_it; 2395 goto flush_it;
2396 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2396 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
2397 EXT4_MAX_TRANS_DATA) { 2397 EXT4_MAX_TRANS_DATA) {
2398 /* 2398 /*
2399 * Adding the new buffer_head would make it cross the 2399 * Adding the new buffer_head would make it cross the
2400 * allowed limit for which we have journal credit 2400 * allowed limit for which we have journal credit
2401 * reserved. So limit the new bh->b_size 2401 * reserved. So limit the new bh->b_size
2402 */ 2402 */
2403 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2403 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
2404 mpd->inode->i_blkbits; 2404 mpd->inode->i_blkbits;
2405 /* we will do mpage_da_submit_io in the next loop */ 2405 /* we will do mpage_da_submit_io in the next loop */
2406 } 2406 }
2407 } 2407 }
2408 /* 2408 /*
2409 * First block in the extent 2409 * First block in the extent
2410 */ 2410 */
2411 if (mpd->b_size == 0) { 2411 if (mpd->b_size == 0) {
2412 mpd->b_blocknr = logical; 2412 mpd->b_blocknr = logical;
2413 mpd->b_size = b_size; 2413 mpd->b_size = b_size;
2414 mpd->b_state = b_state & BH_FLAGS; 2414 mpd->b_state = b_state & BH_FLAGS;
2415 return; 2415 return;
2416 } 2416 }
2417 2417
2418 next = mpd->b_blocknr + nrblocks; 2418 next = mpd->b_blocknr + nrblocks;
2419 /* 2419 /*
2420 * Can we merge the block to our big extent? 2420 * Can we merge the block to our big extent?
2421 */ 2421 */
2422 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { 2422 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2423 mpd->b_size += b_size; 2423 mpd->b_size += b_size;
2424 return; 2424 return;
2425 } 2425 }
2426 2426
2427 flush_it: 2427 flush_it:
2428 /* 2428 /*
2429 * We couldn't merge the block to our extent, so we 2429 * We couldn't merge the block to our extent, so we
2430 * need to flush current extent and start new one 2430 * need to flush current extent and start new one
2431 */ 2431 */
2432 mpage_da_map_and_submit(mpd); 2432 mpage_da_map_and_submit(mpd);
2433 return; 2433 return;
2434 } 2434 }
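The merge rule in mpage_add_bh_to_extent() boils down to "logically adjacent and in the same buffer state". The minimal user-space model below (a hypothetical struct and function, not the kernel's mpage_da_data) shows how the extent accumulates and when the caller has to flush:

#include <stdio.h>

/* Hypothetical, simplified stand-in for the accumulated extent. */
struct extent {
	unsigned long long blocknr;   /* first logical block       */
	unsigned long      nrblocks;  /* blocks accumulated so far */
	unsigned int       state;     /* shared buffer-state flags */
};

/* Try to merge one block; return 0 on success, -1 if the caller must flush. */
static int add_block(struct extent *e, unsigned long long logical,
		     unsigned int state)
{
	if (e->nrblocks == 0) {              /* first block of the extent */
		e->blocknr = logical;
		e->nrblocks = 1;
		e->state = state;
		return 0;
	}
	if (logical == e->blocknr + e->nrblocks && state == e->state) {
		e->nrblocks++;               /* contiguous and same state */
		return 0;
	}
	return -1;                           /* flush, then start a new extent */
}

int main(void)
{
	struct extent e = { 0 };

	add_block(&e, 100, 0x1);
	add_block(&e, 101, 0x1);             /* adjacent, same state: merges */
	printf("after 101: %lu blocks\n", e.nrblocks);
	printf("block 103 merges? %s\n",
	       add_block(&e, 103, 0x1) ? "no (flush)" : "yes");
	return 0;
}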
2435 2435
2436 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 2436 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2437 { 2437 {
2438 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 2438 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
2439 } 2439 }
2440 2440
2441 /* 2441 /*
2442 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2443 * ext4_da_write_begin(). It will either return a mapped block or 2443 * ext4_da_write_begin(). It will either return a mapped block or
2444 * reserve space for a single block. 2444 * reserve space for a single block.
2445 * 2445 *
2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2447 * We also have b_blocknr = -1 and b_bdev initialized properly 2447 * We also have b_blocknr = -1 and b_bdev initialized properly
2448 * 2448 *
2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2450 * We also have b_blocknr = physical block mapping the unwritten extent and b_bdev 2450 * We also have b_blocknr = physical block mapping the unwritten extent and b_bdev
2451 * initialized properly. 2451 * initialized properly.
2452 */ 2452 */
2453 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2453 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2454 struct buffer_head *bh, int create) 2454 struct buffer_head *bh, int create)
2455 { 2455 {
2456 struct ext4_map_blocks map; 2456 struct ext4_map_blocks map;
2457 int ret = 0; 2457 int ret = 0;
2458 sector_t invalid_block = ~((sector_t) 0xffff); 2458 sector_t invalid_block = ~((sector_t) 0xffff);
2459 2459
2460 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 2460 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2461 invalid_block = ~0; 2461 invalid_block = ~0;
2462 2462
2463 BUG_ON(create == 0); 2463 BUG_ON(create == 0);
2464 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 2464 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2465 2465
2466 map.m_lblk = iblock; 2466 map.m_lblk = iblock;
2467 map.m_len = 1; 2467 map.m_len = 1;
2468 2468
2469 /* 2469 /*
2470 * First, we need to know whether the block is already allocated; 2470 * First, we need to know whether the block is already allocated;
2471 * preallocated blocks are unmapped but should be treated 2471 * preallocated blocks are unmapped but should be treated
2472 * the same as allocated blocks. 2472 * the same as allocated blocks.
2473 */ 2473 */
2474 ret = ext4_map_blocks(NULL, inode, &map, 0); 2474 ret = ext4_map_blocks(NULL, inode, &map, 0);
2475 if (ret < 0) 2475 if (ret < 0)
2476 return ret; 2476 return ret;
2477 if (ret == 0) { 2477 if (ret == 0) {
2478 if (buffer_delay(bh)) 2478 if (buffer_delay(bh))
2479 return 0; /* Not sure this could or should happen */ 2479 return 0; /* Not sure this could or should happen */
2480 /* 2480 /*
2481 * XXX: __block_write_begin() unmaps passed block, is it OK? 2481 * XXX: __block_write_begin() unmaps passed block, is it OK?
2482 */ 2482 */
2483 ret = ext4_da_reserve_space(inode, iblock); 2483 ret = ext4_da_reserve_space(inode, iblock);
2484 if (ret) 2484 if (ret)
2485 /* not enough space to reserve */ 2485 /* not enough space to reserve */
2486 return ret; 2486 return ret;
2487 2487
2488 map_bh(bh, inode->i_sb, invalid_block); 2488 map_bh(bh, inode->i_sb, invalid_block);
2489 set_buffer_new(bh); 2489 set_buffer_new(bh);
2490 set_buffer_delay(bh); 2490 set_buffer_delay(bh);
2491 return 0; 2491 return 0;
2492 } 2492 }
2493 2493
2494 map_bh(bh, inode->i_sb, map.m_pblk); 2494 map_bh(bh, inode->i_sb, map.m_pblk);
2495 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 2495 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2496 2496
2497 if (buffer_unwritten(bh)) { 2497 if (buffer_unwritten(bh)) {
2498 /* A delayed write to unwritten bh should be marked 2498 /* A delayed write to unwritten bh should be marked
2499 * new and mapped. Mapped ensures that we don't do 2499 * new and mapped. Mapped ensures that we don't do
2500 * get_block multiple times when we write to the same 2500 * get_block multiple times when we write to the same
2501 * offset and new ensures that we do proper zero out 2501 * offset and new ensures that we do proper zero out
2502 * for partial write. 2502 * for partial write.
2503 */ 2503 */
2504 set_buffer_new(bh); 2504 set_buffer_new(bh);
2505 set_buffer_mapped(bh); 2505 set_buffer_mapped(bh);
2506 } 2506 }
2507 return 0; 2507 return 0;
2508 } 2508 }
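The invalid_block sentinel used above for delayed buffers is simply a block number that cannot belong to the filesystem: ~0xffff is tried first (presumably so the fake block number is easy to spot in debugging output), and the code falls back to ~0 only if the filesystem is so large that ~0xffff could be a real block. A quick user-space sketch of that selection, with an illustrative 64-bit sector type:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;   /* illustrative; models a 64-bit sector_t */

/* Pick a block number guaranteed to lie outside the filesystem. */
static sector_t pick_invalid_block(sector_t blocks_count)
{
	sector_t invalid = ~((sector_t)0xffff);

	if (invalid < blocks_count)  /* fs so big the sentinel could be real */
		invalid = ~(sector_t)0;
	return invalid;
}

int main(void)
{
	printf("small fs: %#llx\n",
	       (unsigned long long)pick_invalid_block(1ULL << 20));
	printf("huge fs:  %#llx\n",
	       (unsigned long long)pick_invalid_block(~((sector_t)0xffff) + 1));
	return 0;
}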
2509 2509
2510 /* 2510 /*
2511 * This function is used as a standard get_block_t callback function 2511 * This function is used as a standard get_block_t callback function
2512 * when there is no desire to allocate any blocks. It is used as a 2512 * when there is no desire to allocate any blocks. It is used as a
2513 * callback function for block_write_begin() and block_write_full_page(). 2513 * callback function for block_write_begin() and block_write_full_page().
2514 * These functions should only try to map a single block at a time. 2514 * These functions should only try to map a single block at a time.
2515 * 2515 *
2516 * Since this function doesn't do block allocations even if the caller 2516 * Since this function doesn't do block allocations even if the caller
2517 * requests it by passing in create=1, it is critically important that 2517 * requests it by passing in create=1, it is critically important that
2518 * any caller checks to make sure that any buffer heads returned 2518 * any caller checks to make sure that any buffer heads returned
2519 * by this function are either all already mapped or marked for 2519 * by this function are either all already mapped or marked for
2520 * delayed allocation before calling block_write_full_page(). Otherwise, 2520 * delayed allocation before calling block_write_full_page(). Otherwise,
2521 * b_blocknr could be left uninitialized, and the page write functions will 2521 * b_blocknr could be left uninitialized, and the page write functions will
2522 * be taken by surprise. 2522 * be taken by surprise.
2523 */ 2523 */
2524 static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2524 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2525 struct buffer_head *bh_result, int create) 2525 struct buffer_head *bh_result, int create)
2526 { 2526 {
2527 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2527 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2528 return _ext4_get_block(inode, iblock, bh_result, 0); 2528 return _ext4_get_block(inode, iblock, bh_result, 0);
2529 } 2529 }
2530 2530
2531 static int bget_one(handle_t *handle, struct buffer_head *bh) 2531 static int bget_one(handle_t *handle, struct buffer_head *bh)
2532 { 2532 {
2533 get_bh(bh); 2533 get_bh(bh);
2534 return 0; 2534 return 0;
2535 } 2535 }
2536 2536
2537 static int bput_one(handle_t *handle, struct buffer_head *bh) 2537 static int bput_one(handle_t *handle, struct buffer_head *bh)
2538 { 2538 {
2539 put_bh(bh); 2539 put_bh(bh);
2540 return 0; 2540 return 0;
2541 } 2541 }
2542 2542
2543 static int __ext4_journalled_writepage(struct page *page, 2543 static int __ext4_journalled_writepage(struct page *page,
2544 unsigned int len) 2544 unsigned int len)
2545 { 2545 {
2546 struct address_space *mapping = page->mapping; 2546 struct address_space *mapping = page->mapping;
2547 struct inode *inode = mapping->host; 2547 struct inode *inode = mapping->host;
2548 struct buffer_head *page_bufs; 2548 struct buffer_head *page_bufs;
2549 handle_t *handle = NULL; 2549 handle_t *handle = NULL;
2550 int ret = 0; 2550 int ret = 0;
2551 int err; 2551 int err;
2552 2552
2553 ClearPageChecked(page); 2553 ClearPageChecked(page);
2554 page_bufs = page_buffers(page); 2554 page_bufs = page_buffers(page);
2555 BUG_ON(!page_bufs); 2555 BUG_ON(!page_bufs);
2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
2557 /* As soon as we unlock the page, it can go away, but we have 2557 /* As soon as we unlock the page, it can go away, but we have
2558 * references to buffers so we are safe */ 2558 * references to buffers so we are safe */
2559 unlock_page(page); 2559 unlock_page(page);
2560 2560
2561 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2561 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2562 if (IS_ERR(handle)) { 2562 if (IS_ERR(handle)) {
2563 ret = PTR_ERR(handle); 2563 ret = PTR_ERR(handle);
2564 goto out; 2564 goto out;
2565 } 2565 }
2566 2566
2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2568 do_journal_get_write_access); 2568 do_journal_get_write_access);
2569 2569
2570 err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2570 err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2571 write_end_fn); 2571 write_end_fn);
2572 if (ret == 0) 2572 if (ret == 0)
2573 ret = err; 2573 ret = err;
2574 err = ext4_journal_stop(handle); 2574 err = ext4_journal_stop(handle);
2575 if (!ret) 2575 if (!ret)
2576 ret = err; 2576 ret = err;
2577 2577
2578 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2578 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2579 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 2579 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2580 out: 2580 out:
2581 return ret; 2581 return ret;
2582 } 2582 }
2583 2583
2584 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); 2584 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2585 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 2585 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2586 2586
2587 /* 2587 /*
2588 * Note that we don't need to start a transaction unless we're journaling data 2588 * Note that we don't need to start a transaction unless we're journaling data
2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2590 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2591 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2592 * we are writing back data modified via mmap(), no one guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2593 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2594 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2595 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
2596 * 2596 *
2597 * This function can get called via... 2597 * This function can get called via...
2598 * - ext4_da_writepages after taking page lock (have journal handle) 2598 * - ext4_da_writepages after taking page lock (have journal handle)
2599 * - journal_submit_inode_data_buffers (no journal handle) 2599 * - journal_submit_inode_data_buffers (no journal handle)
2600 * - shrink_page_list via pdflush (no journal handle) 2600 * - shrink_page_list via pdflush (no journal handle)
2601 * - grab_page_cache when doing write_begin (have journal handle) 2601 * - grab_page_cache when doing write_begin (have journal handle)
2602 * 2602 *
2603 * We don't do any block allocation in this function. If we have a page with 2603 * We don't do any block allocation in this function. If we have a page with
2604 * multiple blocks we need to write those buffer_heads that are mapped. This 2604 * multiple blocks we need to write those buffer_heads that are mapped. This
2605 * is important for mmap-based writes. So if, with blocksize 1K, we do 2605 * is important for mmap-based writes. So if, with blocksize 1K, we do
2606 * truncate(f, 1024); 2606 * truncate(f, 1024);
2607 * a = mmap(f, 0, 4096); 2607 * a = mmap(f, 0, 4096);
2608 * a[0] = 'a'; 2608 * a[0] = 'a';
2609 * truncate(f, 4096); 2609 * truncate(f, 4096);
2610 * then in the page the first buffer_head is mapped via the page_mkwrite callback 2610 * then in the page the first buffer_head is mapped via the page_mkwrite callback
2611 * but the other buffer_heads would be unmapped but dirty (dirtied via 2611 * but the other buffer_heads would be unmapped but dirty (dirtied via
2612 * do_wp_page). So writepage should write the first block. If we modify 2612 * do_wp_page). So writepage should write the first block. If we modify
2613 * the mmap area beyond 1024 we will again get a page_fault and the 2613 * the mmap area beyond 1024 we will again get a page_fault and the
2614 * page_mkwrite callback will do the block allocation and mark the 2614 * page_mkwrite callback will do the block allocation and mark the
2615 * buffer_heads mapped. 2615 * buffer_heads mapped.
2616 * 2616 *
2617 * We redirty the page if we have any buffer_heads that are either delayed or 2617 * We redirty the page if we have any buffer_heads that are either delayed or
2618 * unwritten in the page. 2618 * unwritten in the page.
2619 * 2619 *
2620 * We can get recursively called as shown below. 2620 * We can get recursively called as shown below.
2621 * 2621 *
2622 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2622 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2623 * ext4_writepage() 2623 * ext4_writepage()
2624 * 2624 *
2625 * But since we don't do any block allocation we should not deadlock. 2625 * But since we don't do any block allocation we should not deadlock.
2626 * The page also has the dirty flag cleared so we don't get a recursive page_lock. 2626 * The page also has the dirty flag cleared so we don't get a recursive page_lock.
2627 */ 2627 */
2628 static int ext4_writepage(struct page *page, 2628 static int ext4_writepage(struct page *page,
2629 struct writeback_control *wbc) 2629 struct writeback_control *wbc)
2630 { 2630 {
2631 int ret = 0, commit_write = 0; 2631 int ret = 0, commit_write = 0;
2632 loff_t size; 2632 loff_t size;
2633 unsigned int len; 2633 unsigned int len;
2634 struct buffer_head *page_bufs = NULL; 2634 struct buffer_head *page_bufs = NULL;
2635 struct inode *inode = page->mapping->host; 2635 struct inode *inode = page->mapping->host;
2636 2636
2637 trace_ext4_writepage(page); 2637 trace_ext4_writepage(page);
2638 size = i_size_read(inode); 2638 size = i_size_read(inode);
2639 if (page->index == size >> PAGE_CACHE_SHIFT) 2639 if (page->index == size >> PAGE_CACHE_SHIFT)
2640 len = size & ~PAGE_CACHE_MASK; 2640 len = size & ~PAGE_CACHE_MASK;
2641 else 2641 else
2642 len = PAGE_CACHE_SIZE; 2642 len = PAGE_CACHE_SIZE;
2643 2643
2644 /* 2644 /*
2645 * If the page does not have buffers (for whatever reason), 2645 * If the page does not have buffers (for whatever reason),
2646 * try to create them using __block_write_begin. If this 2646 * try to create them using __block_write_begin. If this
2647 * fails, redirty the page and move on. 2647 * fails, redirty the page and move on.
2648 */ 2648 */
2649 if (!page_has_buffers(page)) { 2649 if (!page_has_buffers(page)) {
2650 if (__block_write_begin(page, 0, len, 2650 if (__block_write_begin(page, 0, len,
2651 noalloc_get_block_write)) { 2651 noalloc_get_block_write)) {
2652 redirty_page: 2652 redirty_page:
2653 redirty_page_for_writepage(wbc, page); 2653 redirty_page_for_writepage(wbc, page);
2654 unlock_page(page); 2654 unlock_page(page);
2655 return 0; 2655 return 0;
2656 } 2656 }
2657 commit_write = 1; 2657 commit_write = 1;
2658 } 2658 }
2659 page_bufs = page_buffers(page); 2659 page_bufs = page_buffers(page);
2660 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2660 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2661 ext4_bh_delay_or_unwritten)) { 2661 ext4_bh_delay_or_unwritten)) {
2662 /* 2662 /*
2663 * We don't want to do block allocation, so redirty 2663 * We don't want to do block allocation, so redirty
2664 * the page and return. We may reach here when we do 2664 * the page and return. We may reach here when we do
2665 * a journal commit via journal_submit_inode_data_buffers. 2665 * a journal commit via journal_submit_inode_data_buffers.
2666 * We can also reach here via shrink_page_list 2666 * We can also reach here via shrink_page_list
2667 */ 2667 */
2668 goto redirty_page; 2668 goto redirty_page;
2669 } 2669 }
2670 if (commit_write) 2670 if (commit_write)
2671 /* now mark the buffer_heads as dirty and uptodate */ 2671 /* now mark the buffer_heads as dirty and uptodate */
2672 block_commit_write(page, 0, len); 2672 block_commit_write(page, 0, len);
2673 2673
2674 if (PageChecked(page) && ext4_should_journal_data(inode)) 2674 if (PageChecked(page) && ext4_should_journal_data(inode))
2675 /* 2675 /*
2676 * It's mmapped pagecache. Add buffers and journal it. There 2676 * It's mmapped pagecache. Add buffers and journal it. There
2677 * doesn't seem much point in redirtying the page here. 2677 * doesn't seem much point in redirtying the page here.
2678 */ 2678 */
2679 return __ext4_journalled_writepage(page, len); 2679 return __ext4_journalled_writepage(page, len);
2680 2680
2681 if (buffer_uninit(page_bufs)) { 2681 if (buffer_uninit(page_bufs)) {
2682 ext4_set_bh_endio(page_bufs, inode); 2682 ext4_set_bh_endio(page_bufs, inode);
2683 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2683 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2684 wbc, ext4_end_io_buffer_write); 2684 wbc, ext4_end_io_buffer_write);
2685 } else 2685 } else
2686 ret = block_write_full_page(page, noalloc_get_block_write, 2686 ret = block_write_full_page(page, noalloc_get_block_write,
2687 wbc); 2687 wbc);
2688 2688
2689 return ret; 2689 return ret;
2690 } 2690 }
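The mmap scenario sketched in the comment before ext4_writepage() can be reproduced from user space. The program below is only an illustration of that sequence; the file path is hypothetical, and to match the comment it would have to run on an ext4 filesystem created with a 1K block size. It makes no claim about the resulting internal buffer layout beyond what the comment above describes.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/ext4-writepage-demo", O_RDWR | O_CREAT, 0600);
	char *a;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(fd, 1024) != 0)            /* truncate(f, 1024) */
		return 1;
	a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (a == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	a[0] = 'a';                              /* page_mkwrite maps block 0 */
	if (ftruncate(fd, 4096) != 0)            /* truncate(f, 4096) */
		return 1;

	/* Writeback of this page must write only the one mapped block;
	 * touching, say, a[2048] later would fault again and let
	 * page_mkwrite allocate and map the remaining blocks. */
	munmap(a, 4096);
	close(fd);
	return 0;
}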
2691 2691
2692 /* 2692 /*
2693 * This is called via ext4_da_writepages() to 2693 * This is called via ext4_da_writepages() to
2694 * calculate the total number of credits to reserve to fit 2694 * calculate the total number of credits to reserve to fit
2695 * a single extent allocation into a single transaction, 2695 * a single extent allocation into a single transaction,
2696 * ext4_da_writepages() will loop calling this before 2696 * ext4_da_writepages() will loop calling this before
2697 * the block allocation. 2697 * the block allocation.
2698 */ 2698 */
2699 2699
2700 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2700 static int ext4_da_writepages_trans_blocks(struct inode *inode)
2701 { 2701 {
2702 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2702 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2703 2703
2704 /* 2704 /*
2705 * With the non-extent format the journal credits needed to 2705 * With the non-extent format the journal credits needed to
2706 * insert nrblocks contiguous blocks depend on the 2706 * insert nrblocks contiguous blocks depend on the
2707 * number of contiguous blocks. So we will limit the 2707 * number of contiguous blocks. So we will limit the
2708 * number of contiguous blocks to a sane value 2708 * number of contiguous blocks to a sane value
2709 */ 2709 */
2710 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 2710 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2711 (max_blocks > EXT4_MAX_TRANS_DATA)) 2711 (max_blocks > EXT4_MAX_TRANS_DATA))
2712 max_blocks = EXT4_MAX_TRANS_DATA; 2712 max_blocks = EXT4_MAX_TRANS_DATA;
2713 2713
2714 return ext4_chunk_trans_blocks(inode, max_blocks); 2714 return ext4_chunk_trans_blocks(inode, max_blocks);
2715 } 2715 }
2716 2716
2717 /* 2717 /*
2718 * write_cache_pages_da - walk the list of dirty pages of the given 2718 * write_cache_pages_da - walk the list of dirty pages of the given
2719 * address space and accumulate pages that need writing, and call 2719 * address space and accumulate pages that need writing, and call
2720 * mpage_da_map_and_submit to map a single contiguous memory region 2720 * mpage_da_map_and_submit to map a single contiguous memory region
2721 * and then write them. 2721 * and then write them.
2722 */ 2722 */
2723 static int write_cache_pages_da(struct address_space *mapping, 2723 static int write_cache_pages_da(struct address_space *mapping,
2724 struct writeback_control *wbc, 2724 struct writeback_control *wbc,
2725 struct mpage_da_data *mpd, 2725 struct mpage_da_data *mpd,
2726 pgoff_t *done_index) 2726 pgoff_t *done_index)
2727 { 2727 {
2728 struct buffer_head *bh, *head; 2728 struct buffer_head *bh, *head;
2729 struct inode *inode = mapping->host; 2729 struct inode *inode = mapping->host;
2730 struct pagevec pvec; 2730 struct pagevec pvec;
2731 unsigned int nr_pages; 2731 unsigned int nr_pages;
2732 sector_t logical; 2732 sector_t logical;
2733 pgoff_t index, end; 2733 pgoff_t index, end;
2734 long nr_to_write = wbc->nr_to_write; 2734 long nr_to_write = wbc->nr_to_write;
2735 int i, tag, ret = 0; 2735 int i, tag, ret = 0;
2736 2736
2737 memset(mpd, 0, sizeof(struct mpage_da_data)); 2737 memset(mpd, 0, sizeof(struct mpage_da_data));
2738 mpd->wbc = wbc; 2738 mpd->wbc = wbc;
2739 mpd->inode = inode; 2739 mpd->inode = inode;
2740 pagevec_init(&pvec, 0); 2740 pagevec_init(&pvec, 0);
2741 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2742 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2743 2743
2744 if (wbc->sync_mode == WB_SYNC_ALL) 2744 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2745 tag = PAGECACHE_TAG_TOWRITE; 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 else 2746 else
2747 tag = PAGECACHE_TAG_DIRTY; 2747 tag = PAGECACHE_TAG_DIRTY;
2748 2748
2749 *done_index = index; 2749 *done_index = index;
2750 while (index <= end) { 2750 while (index <= end) {
2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2753 if (nr_pages == 0) 2753 if (nr_pages == 0)
2754 return 0; 2754 return 0;
2755 2755
2756 for (i = 0; i < nr_pages; i++) { 2756 for (i = 0; i < nr_pages; i++) {
2757 struct page *page = pvec.pages[i]; 2757 struct page *page = pvec.pages[i];
2758 2758
2759 /* 2759 /*
2760 * At this point, the page may be truncated or 2760 * At this point, the page may be truncated or
2761 * invalidated (changing page->mapping to NULL), or 2761 * invalidated (changing page->mapping to NULL), or
2762 * even swizzled back from swapper_space to tmpfs file 2762 * even swizzled back from swapper_space to tmpfs file
2763 * mapping. However, page->index will not change 2763 * mapping. However, page->index will not change
2764 * because we have a reference on the page. 2764 * because we have a reference on the page.
2765 */ 2765 */
2766 if (page->index > end) 2766 if (page->index > end)
2767 goto out; 2767 goto out;
2768 2768
2769 *done_index = page->index + 1; 2769 *done_index = page->index + 1;
2770 2770
2771 /* 2771 /*
2772 * If we can't merge this page, and we have 2772 * If we can't merge this page, and we have
2773 * accumulated a contiguous region, write it 2773 * accumulated a contiguous region, write it
2774 */ 2774 */
2775 if ((mpd->next_page != page->index) && 2775 if ((mpd->next_page != page->index) &&
2776 (mpd->next_page != mpd->first_page)) { 2776 (mpd->next_page != mpd->first_page)) {
2777 mpage_da_map_and_submit(mpd); 2777 mpage_da_map_and_submit(mpd);
2778 goto ret_extent_tail; 2778 goto ret_extent_tail;
2779 } 2779 }
2780 2780
2781 lock_page(page); 2781 lock_page(page);
2782 2782
2783 /* 2783 /*
2784 * If the page is no longer dirty, or its 2784 * If the page is no longer dirty, or its
2785 * mapping no longer corresponds to inode we 2785 * mapping no longer corresponds to inode we
2786 * are writing (which means it has been 2786 * are writing (which means it has been
2787 * truncated or invalidated), or the page is 2787 * truncated or invalidated), or the page is
2788 * already under writeback and we are not 2788 * already under writeback and we are not
2789 * doing a data integrity writeback, skip the page 2789 * doing a data integrity writeback, skip the page
2790 */ 2790 */
2791 if (!PageDirty(page) || 2791 if (!PageDirty(page) ||
2792 (PageWriteback(page) && 2792 (PageWriteback(page) &&
2793 (wbc->sync_mode == WB_SYNC_NONE)) || 2793 (wbc->sync_mode == WB_SYNC_NONE)) ||
2794 unlikely(page->mapping != mapping)) { 2794 unlikely(page->mapping != mapping)) {
2795 unlock_page(page); 2795 unlock_page(page);
2796 continue; 2796 continue;
2797 } 2797 }
2798 2798
2799 wait_on_page_writeback(page); 2799 wait_on_page_writeback(page);
2800 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2801 2801
2802 if (mpd->next_page != page->index) 2802 if (mpd->next_page != page->index)
2803 mpd->first_page = page->index; 2803 mpd->first_page = page->index;
2804 mpd->next_page = page->index + 1; 2804 mpd->next_page = page->index + 1;
2805 logical = (sector_t) page->index << 2805 logical = (sector_t) page->index <<
2806 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2806 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2807 2807
2808 if (!page_has_buffers(page)) { 2808 if (!page_has_buffers(page)) {
2809 mpage_add_bh_to_extent(mpd, logical, 2809 mpage_add_bh_to_extent(mpd, logical,
2810 PAGE_CACHE_SIZE, 2810 PAGE_CACHE_SIZE,
2811 (1 << BH_Dirty) | (1 << BH_Uptodate)); 2811 (1 << BH_Dirty) | (1 << BH_Uptodate));
2812 if (mpd->io_done) 2812 if (mpd->io_done)
2813 goto ret_extent_tail; 2813 goto ret_extent_tail;
2814 } else { 2814 } else {
2815 /* 2815 /*
2816 * Page with regular buffer heads, 2816 * Page with regular buffer heads,
2817 * just add all dirty ones 2817 * just add all dirty ones
2818 */ 2818 */
2819 head = page_buffers(page); 2819 head = page_buffers(page);
2820 bh = head; 2820 bh = head;
2821 do { 2821 do {
2822 BUG_ON(buffer_locked(bh)); 2822 BUG_ON(buffer_locked(bh));
2823 /* 2823 /*
2824 * We need to try to allocate 2824 * We need to try to allocate
2825 * unmapped blocks in the same page. 2825 * unmapped blocks in the same page.
2826 * Otherwise we won't make progress 2826 * Otherwise we won't make progress
2827 * with the page in ext4_writepage 2827 * with the page in ext4_writepage
2828 */ 2828 */
2829 if (ext4_bh_delay_or_unwritten(NULL, bh)) { 2829 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2830 mpage_add_bh_to_extent(mpd, logical, 2830 mpage_add_bh_to_extent(mpd, logical,
2831 bh->b_size, 2831 bh->b_size,
2832 bh->b_state); 2832 bh->b_state);
2833 if (mpd->io_done) 2833 if (mpd->io_done)
2834 goto ret_extent_tail; 2834 goto ret_extent_tail;
2835 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2835 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2836 /* 2836 /*
2837 * mapped dirty buffer. We need 2837 * mapped dirty buffer. We need
2838 * to update the b_state 2838 * to update the b_state
2839 * because we look at b_state 2839 * because we look at b_state
2840 * in mpage_da_map_blocks. We 2840 * in mpage_da_map_blocks. We
2841 * don't update b_size because 2841 * don't update b_size because
2842 * if we find an unmapped 2842 * if we find an unmapped
2843 * buffer_head later we need to 2843 * buffer_head later we need to
2844 * use the b_state flag of that 2844 * use the b_state flag of that
2845 * buffer_head. 2845 * buffer_head.
2846 */ 2846 */
2847 if (mpd->b_size == 0) 2847 if (mpd->b_size == 0)
2848 mpd->b_state = bh->b_state & BH_FLAGS; 2848 mpd->b_state = bh->b_state & BH_FLAGS;
2849 } 2849 }
2850 logical++; 2850 logical++;
2851 } while ((bh = bh->b_this_page) != head); 2851 } while ((bh = bh->b_this_page) != head);
2852 } 2852 }
2853 2853
2854 if (nr_to_write > 0) { 2854 if (nr_to_write > 0) {
2855 nr_to_write--; 2855 nr_to_write--;
2856 if (nr_to_write == 0 && 2856 if (nr_to_write == 0 &&
2857 wbc->sync_mode == WB_SYNC_NONE) 2857 wbc->sync_mode == WB_SYNC_NONE)
2858 /* 2858 /*
2859 * We stop writing back only if we are 2859 * We stop writing back only if we are
2860 * not doing integrity sync. In case of 2860 * not doing integrity sync. In case of
2861 * integrity sync we have to keep going 2861 * integrity sync we have to keep going
2862 * because someone may be concurrently 2862 * because someone may be concurrently
2863 * dirtying pages, and we might have 2863 * dirtying pages, and we might have
2864 * synced a lot of newly appeared dirty 2864 * synced a lot of newly appeared dirty
2865 * pages, but have not synced all of the 2865 * pages, but have not synced all of the
2866 * old dirty pages. 2866 * old dirty pages.
2867 */ 2867 */
2868 goto out; 2868 goto out;
2869 } 2869 }
2870 } 2870 }
2871 pagevec_release(&pvec); 2871 pagevec_release(&pvec);
2872 cond_resched(); 2872 cond_resched();
2873 } 2873 }
2874 return 0; 2874 return 0;
2875 ret_extent_tail: 2875 ret_extent_tail:
2876 ret = MPAGE_DA_EXTENT_TAIL; 2876 ret = MPAGE_DA_EXTENT_TAIL;
2877 out: 2877 out:
2878 pagevec_release(&pvec); 2878 pagevec_release(&pvec);
2879 cond_resched(); 2879 cond_resched();
2880 return ret; 2880 return ret;
2881 } 2881 }
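The pagevec lookup size in write_cache_pages_da() is written as min(end - index, PAGEVEC_SIZE - 1) + 1 rather than min(end - index + 1, PAGEVEC_SIZE), apparently mirroring the generic write_cache_pages(), where the -1/+1 form keeps the arithmetic from wrapping when end is the largest possible page index. The user-space check below demonstrates the difference; the PAGEVEC_SIZE value is only an illustrative stand-in.

#include <stdio.h>

#define PAGEVEC_SIZE 14UL   /* illustrative stand-in for the kernel's value */

static unsigned long chunk_safe(unsigned long index, unsigned long end)
{
	unsigned long diff = end - index;
	return (diff < PAGEVEC_SIZE - 1 ? diff : PAGEVEC_SIZE - 1) + 1;
}

int main(void)
{
	unsigned long end = (unsigned long)-1;   /* "whole file" sweep */

	/* end - index + 1 wraps to 0 here; the -1/+1 form does not. */
	printf("naive: %lu\n", end - 0 + 1);
	printf("safe : %lu\n", chunk_safe(0, end));
	printf("tail : %lu\n", chunk_safe(100, 105));  /* 6 pages remain */
	return 0;
}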
2882 2882
2883 2883
2884 static int ext4_da_writepages(struct address_space *mapping, 2884 static int ext4_da_writepages(struct address_space *mapping,
2885 struct writeback_control *wbc) 2885 struct writeback_control *wbc)
2886 { 2886 {
2887 pgoff_t index; 2887 pgoff_t index;
2888 int range_whole = 0; 2888 int range_whole = 0;
2889 handle_t *handle = NULL; 2889 handle_t *handle = NULL;
2890 struct mpage_da_data mpd; 2890 struct mpage_da_data mpd;
2891 struct inode *inode = mapping->host; 2891 struct inode *inode = mapping->host;
2892 int pages_written = 0; 2892 int pages_written = 0;
2893 unsigned int max_pages; 2893 unsigned int max_pages;
2894 int range_cyclic, cycled = 1, io_done = 0; 2894 int range_cyclic, cycled = 1, io_done = 0;
2895 int needed_blocks, ret = 0; 2895 int needed_blocks, ret = 0;
2896 long desired_nr_to_write, nr_to_writebump = 0; 2896 long desired_nr_to_write, nr_to_writebump = 0;
2897 loff_t range_start = wbc->range_start; 2897 loff_t range_start = wbc->range_start;
2898 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2898 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2899 pgoff_t done_index = 0; 2899 pgoff_t done_index = 0;
2900 pgoff_t end; 2900 pgoff_t end;
2901 2901
2902 trace_ext4_da_writepages(inode, wbc); 2902 trace_ext4_da_writepages(inode, wbc);
2903 2903
2904 /* 2904 /*
2905 * No pages to write? This is mainly a kludge to avoid starting 2905 * No pages to write? This is mainly a kludge to avoid starting
2906 * a transaction for special inodes like journal inode on last iput() 2906 * a transaction for special inodes like journal inode on last iput()
2907 * because that could violate lock ordering on umount 2907 * because that could violate lock ordering on umount
2908 */ 2908 */
2909 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2909 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2910 return 0; 2910 return 0;
2911 2911
2912 /* 2912 /*
2913 * If the filesystem has aborted, it is read-only, so return 2913 * If the filesystem has aborted, it is read-only, so return
2914 * right away instead of dumping stack traces later on that 2914 * right away instead of dumping stack traces later on that
2915 * will obscure the real source of the problem. We test 2915 * will obscure the real source of the problem. We test
2916 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2916 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2917 * the latter could be true if the filesystem is mounted 2917 * the latter could be true if the filesystem is mounted
2918 * read-only, and in that case, ext4_da_writepages should 2918 * read-only, and in that case, ext4_da_writepages should
2919 * *never* be called, so if that ever happens, we would want 2919 * *never* be called, so if that ever happens, we would want
2920 * the stack trace. 2920 * the stack trace.
2921 */ 2921 */
2922 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2922 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2923 return -EROFS; 2923 return -EROFS;
2924 2924
2925 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2925 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2926 range_whole = 1; 2926 range_whole = 1;
2927 2927
2928 range_cyclic = wbc->range_cyclic; 2928 range_cyclic = wbc->range_cyclic;
2929 if (wbc->range_cyclic) { 2929 if (wbc->range_cyclic) {
2930 index = mapping->writeback_index; 2930 index = mapping->writeback_index;
2931 if (index) 2931 if (index)
2932 cycled = 0; 2932 cycled = 0;
2933 wbc->range_start = index << PAGE_CACHE_SHIFT; 2933 wbc->range_start = index << PAGE_CACHE_SHIFT;
2934 wbc->range_end = LLONG_MAX; 2934 wbc->range_end = LLONG_MAX;
2935 wbc->range_cyclic = 0; 2935 wbc->range_cyclic = 0;
2936 end = -1; 2936 end = -1;
2937 } else { 2937 } else {
2938 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2938 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2939 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2939 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2940 } 2940 }
2941 2941
2942 /* 2942 /*
2943 * This works around two forms of stupidity. The first is in 2943 * This works around two forms of stupidity. The first is in
2944 * the writeback code, which caps the maximum number of pages 2944 * the writeback code, which caps the maximum number of pages
2945 * written to be 1024 pages. This is wrong on multiple 2945 * written to be 1024 pages. This is wrong on multiple
2946 * levels; different architectures have a different page size, 2946 * levels; different architectures have a different page size,
2947 * which changes the maximum amount of data which gets 2947 * which changes the maximum amount of data which gets
2948 * written. Secondly, 4 megabytes is way too small. XFS 2948 * written. Secondly, 4 megabytes is way too small. XFS
2949 * forces this value to be 16 megabytes by multiplying 2949 * forces this value to be 16 megabytes by multiplying
2950 * nr_to_write parameter by four, and then relies on its 2950 * nr_to_write parameter by four, and then relies on its
2951 * allocator to allocate larger extents to make them 2951 * allocator to allocate larger extents to make them
2952 * contiguous. Unfortunately this brings us to the second 2952 * contiguous. Unfortunately this brings us to the second
2953 * stupidity, which is that ext4's mballoc code only allocates 2953 * stupidity, which is that ext4's mballoc code only allocates
2954 * at most 2048 blocks. So we force contiguous writes up to 2954 * at most 2048 blocks. So we force contiguous writes up to
2955 * the number of dirty blocks in the inode, or 2955 * the number of dirty blocks in the inode, or
2956 * sbi->s_max_writeback_mb_bump, whichever is smaller. 2956 * sbi->s_max_writeback_mb_bump, whichever is smaller.
2957 */ 2957 */
2958 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2958 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2959 if (!range_cyclic && range_whole) { 2959 if (!range_cyclic && range_whole) {
2960 if (wbc->nr_to_write == LONG_MAX) 2960 if (wbc->nr_to_write == LONG_MAX)
2961 desired_nr_to_write = wbc->nr_to_write; 2961 desired_nr_to_write = wbc->nr_to_write;
2962 else 2962 else
2963 desired_nr_to_write = wbc->nr_to_write * 8; 2963 desired_nr_to_write = wbc->nr_to_write * 8;
2964 } else 2964 } else
2965 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 2965 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2966 max_pages); 2966 max_pages);
2967 if (desired_nr_to_write > max_pages) 2967 if (desired_nr_to_write > max_pages)
2968 desired_nr_to_write = max_pages; 2968 desired_nr_to_write = max_pages;
2969 2969
2970 if (wbc->nr_to_write < desired_nr_to_write) { 2970 if (wbc->nr_to_write < desired_nr_to_write) {
2971 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; 2971 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2972 wbc->nr_to_write = desired_nr_to_write; 2972 wbc->nr_to_write = desired_nr_to_write;
2973 } 2973 }
2974 2974
2975 retry: 2975 retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL) 2976 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2977 tag_pages_for_writeback(mapping, index, end); 2977 tag_pages_for_writeback(mapping, index, end);
2978 2978
2979 while (!ret && wbc->nr_to_write > 0) { 2979 while (!ret && wbc->nr_to_write > 0) {
2980 2980
2981 /* 2981 /*
2982 * we insert one extent at a time, so we need the 2982 * we insert one extent at a time, so we need the
2983 * credits required for a single extent allocation. 2983 * credits required for a single extent allocation.
2984 * Journalled mode is currently not supported 2984 * Journalled mode is currently not supported
2985 * by delalloc 2985 * by delalloc
2986 */ 2986 */
2987 BUG_ON(ext4_should_journal_data(inode)); 2987 BUG_ON(ext4_should_journal_data(inode));
2988 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2988 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2989 2989
2990 /* start a new transaction*/ 2990 /* start a new transaction*/
2991 handle = ext4_journal_start(inode, needed_blocks); 2991 handle = ext4_journal_start(inode, needed_blocks);
2992 if (IS_ERR(handle)) { 2992 if (IS_ERR(handle)) {
2993 ret = PTR_ERR(handle); 2993 ret = PTR_ERR(handle);
2994 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2994 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2995 "%ld pages, ino %lu; err %d", __func__, 2995 "%ld pages, ino %lu; err %d", __func__,
2996 wbc->nr_to_write, inode->i_ino, ret); 2996 wbc->nr_to_write, inode->i_ino, ret);
2997 goto out_writepages; 2997 goto out_writepages;
2998 } 2998 }
2999 2999
3000 /* 3000 /*
3001 * Now call write_cache_pages_da() to find the next 3001 * Now call write_cache_pages_da() to find the next
3002 * contiguous region of logical blocks that need 3002 * contiguous region of logical blocks that need
3003 * blocks to be allocated by ext4 and submit them. 3003 * blocks to be allocated by ext4 and submit them.
3004 */ 3004 */
3005 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 3005 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3006 /* 3006 /*
3007 * If we have a contiguous extent of pages and we 3007 * If we have a contiguous extent of pages and we
3008 * haven't done the I/O yet, map the blocks and submit 3008 * haven't done the I/O yet, map the blocks and submit
3009 * them for I/O. 3009 * them for I/O.
3010 */ 3010 */
3011 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3011 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3012 mpage_da_map_and_submit(&mpd); 3012 mpage_da_map_and_submit(&mpd);
3013 ret = MPAGE_DA_EXTENT_TAIL; 3013 ret = MPAGE_DA_EXTENT_TAIL;
3014 } 3014 }
3015 trace_ext4_da_write_pages(inode, &mpd); 3015 trace_ext4_da_write_pages(inode, &mpd);
3016 wbc->nr_to_write -= mpd.pages_written; 3016 wbc->nr_to_write -= mpd.pages_written;
3017 3017
3018 ext4_journal_stop(handle); 3018 ext4_journal_stop(handle);
3019 3019
3020 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 3020 if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
3021 /* commit the transaction which would 3021 /* commit the transaction which would
3022 * free blocks released in the transaction 3022 * free blocks released in the transaction
3023 * and try again 3023 * and try again
3024 */ 3024 */
3025 jbd2_journal_force_commit_nested(sbi->s_journal); 3025 jbd2_journal_force_commit_nested(sbi->s_journal);
3026 ret = 0; 3026 ret = 0;
3027 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3027 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3028 /* 3028 /*
3029 * got one extent now try with 3029 * got one extent now try with
3030 * rest of the pages 3030 * rest of the pages
3031 */ 3031 */
3032 pages_written += mpd.pages_written; 3032 pages_written += mpd.pages_written;
3033 ret = 0; 3033 ret = 0;
3034 io_done = 1; 3034 io_done = 1;
3035 } else if (wbc->nr_to_write) 3035 } else if (wbc->nr_to_write)
3036 /* 3036 /*
3037 * There is no more writeout needed 3037 * There is no more writeout needed
3038 * or we requested a nonblocking writeout 3038 * or we requested a nonblocking writeout
3039 * and we found the device congested 3039 * and we found the device congested
3040 */ 3040 */
3041 break; 3041 break;
3042 } 3042 }
3043 if (!io_done && !cycled) { 3043 if (!io_done && !cycled) {
3044 cycled = 1; 3044 cycled = 1;
3045 index = 0; 3045 index = 0;
3046 wbc->range_start = index << PAGE_CACHE_SHIFT; 3046 wbc->range_start = index << PAGE_CACHE_SHIFT;
3047 wbc->range_end = mapping->writeback_index - 1; 3047 wbc->range_end = mapping->writeback_index - 1;
3048 goto retry; 3048 goto retry;
3049 } 3049 }
3050 3050
3051 /* Update index */ 3051 /* Update index */
3052 wbc->range_cyclic = range_cyclic; 3052 wbc->range_cyclic = range_cyclic;
3053 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3053 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3054 /* 3054 /*
3055 * set the writeback_index so that range_cyclic 3055 * set the writeback_index so that range_cyclic
3056 * mode will write it back later 3056 * mode will write it back later
3057 */ 3057 */
3058 mapping->writeback_index = done_index; 3058 mapping->writeback_index = done_index;
3059 3059
3060 out_writepages: 3060 out_writepages:
3061 wbc->nr_to_write -= nr_to_writebump; 3061 wbc->nr_to_write -= nr_to_writebump;
3062 wbc->range_start = range_start; 3062 wbc->range_start = range_start;
3063 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3063 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
3064 return ret; 3064 return ret;
3065 } 3065 }
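The nr_to_write adjustment in ext4_da_writepages() works entirely in pages: s_max_writeback_mb_bump (megabytes) is converted to a page cap, the caller's nr_to_write is scaled up (times eight for a whole-file, non-cyclic sweep, otherwise sized from the inode's dirty pages), clamped to the cap, and the bump is subtracted again at out_writepages so the caller's accounting is unchanged. A worked user-space example with illustrative numbers (4K pages and an assumed 128 MB bump; the pages-written accounting is omitted):

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;     /* 4K pages (illustrative)   */
	const unsigned int mb_bump    = 128;    /* assumed default bump, MB  */
	long max_pages = (long)mb_bump << (20 - page_shift);
	long nr_to_write = 1024;                /* typical writeback chunk   */
	long desired = nr_to_write * 8;         /* whole file, not LONG_MAX  */
	long bump = 0;

	if (desired > max_pages)
		desired = max_pages;
	if (nr_to_write < desired) {
		bump = desired - nr_to_write;
		nr_to_write = desired;
	}
	printf("max_pages=%ld desired=%ld bump=%ld\n",
	       max_pages, desired, bump);
	/* At out_writepages the bump is subtracted again. */
	printf("returned nr_to_write=%ld\n", nr_to_write - bump);
	return 0;
}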
3066 3066
3067 #define FALL_BACK_TO_NONDELALLOC 1 3067 #define FALL_BACK_TO_NONDELALLOC 1
3068 static int ext4_nonda_switch(struct super_block *sb) 3068 static int ext4_nonda_switch(struct super_block *sb)
3069 { 3069 {
3070 s64 free_blocks, dirty_blocks; 3070 s64 free_blocks, dirty_blocks;
3071 struct ext4_sb_info *sbi = EXT4_SB(sb); 3071 struct ext4_sb_info *sbi = EXT4_SB(sb);
3072 3072
3073 /* 3073 /*
3074 * Switch to non-delalloc mode if we are running low 3074 * Switch to non-delalloc mode if we are running low
3075 * on free blocks. The free block accounting via percpu 3075 * on free blocks. The free block accounting via percpu
3076 * counters can get slightly wrong with percpu_counter_batch getting 3076 * counters can get slightly wrong with percpu_counter_batch getting
3077 * accumulated on each CPU without updating global counters. 3077 * accumulated on each CPU without updating global counters.
3078 * Delalloc needs accurate free block accounting. So switch 3078 * Delalloc needs accurate free block accounting. So switch
3079 * to non-delalloc when we are near the error range. 3079 * to non-delalloc when we are near the error range.
3080 */ 3080 */
3081 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 3081 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
3082 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 3082 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
3083 if (2 * free_blocks < 3 * dirty_blocks || 3083 if (2 * free_blocks < 3 * dirty_blocks ||
3084 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 3084 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
3085 /* 3085 /*
3086 * free block count is less than 150% of dirty blocks 3086 * free block count is less than 150% of dirty blocks
3087 * or free blocks are less than the watermark 3087 * or free blocks are less than the watermark
3088 */ 3088 */
3089 return 1; 3089 return 1;
3090 } 3090 }
3091 /* 3091 /*
3092 * Even if we don't switch but are nearing capacity, 3092 * Even if we don't switch but are nearing capacity,
3093 * start pushing delalloc when 1/2 of free blocks are dirty. 3093 * start pushing delalloc when 1/2 of free blocks are dirty.
3094 */ 3094 */
3095 if (free_blocks < 2 * dirty_blocks) 3095 if (free_blocks < 2 * dirty_blocks)
3096 writeback_inodes_sb_if_idle(sb); 3096 writeback_inodes_sb_if_idle(sb);
3097 3097
3098 return 0; 3098 return 0;
3099 } 3099 }
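The two thresholds in ext4_nonda_switch() read more naturally as percentages: 2*free < 3*dirty means free space has dropped below 150% of the delalloc-reserved (dirty) blocks, and free < 2*dirty is the earlier "start pushing writeback" point. A stand-alone check with made-up numbers; the watermark value here is illustrative, not EXT4_FREEBLOCKS_WATERMARK:

#include <stdio.h>

/* Illustrative stand-in for EXT4_FREEBLOCKS_WATERMARK. */
#define FREEBLOCKS_WATERMARK 1024LL

/* 1 = fall back to non-delalloc, 0 = stay in delalloc mode. */
static int nonda_switch(long long free_blocks, long long dirty_blocks)
{
	if (2 * free_blocks < 3 * dirty_blocks ||        /* free < 150% dirty */
	    free_blocks < dirty_blocks + FREEBLOCKS_WATERMARK)
		return 1;
	if (free_blocks < 2 * dirty_blocks)              /* nearing capacity  */
		printf("  (would kick background writeback)\n");
	return 0;
}

int main(void)
{
	printf("free=100000 dirty=10000 -> switch=%d\n", nonda_switch(100000, 10000));
	printf("free=14000  dirty=10000 -> switch=%d\n", nonda_switch(14000, 10000));
	printf("free=17000  dirty=10000 -> switch=%d\n", nonda_switch(17000, 10000));
	return 0;
}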
3100 3100
3101 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 3101 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3102 loff_t pos, unsigned len, unsigned flags, 3102 loff_t pos, unsigned len, unsigned flags,
3103 struct page **pagep, void **fsdata) 3103 struct page **pagep, void **fsdata)
3104 { 3104 {
3105 int ret, retries = 0; 3105 int ret, retries = 0;
3106 struct page *page; 3106 struct page *page;
3107 pgoff_t index; 3107 pgoff_t index;
3108 struct inode *inode = mapping->host; 3108 struct inode *inode = mapping->host;
3109 handle_t *handle; 3109 handle_t *handle;
3110 3110
3111 index = pos >> PAGE_CACHE_SHIFT; 3111 index = pos >> PAGE_CACHE_SHIFT;
3112 3112
3113 if (ext4_nonda_switch(inode->i_sb)) { 3113 if (ext4_nonda_switch(inode->i_sb)) {
3114 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 3114 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
3115 return ext4_write_begin(file, mapping, pos, 3115 return ext4_write_begin(file, mapping, pos,
3116 len, flags, pagep, fsdata); 3116 len, flags, pagep, fsdata);
3117 } 3117 }
3118 *fsdata = (void *)0; 3118 *fsdata = (void *)0;
3119 trace_ext4_da_write_begin(inode, pos, len, flags); 3119 trace_ext4_da_write_begin(inode, pos, len, flags);
3120 retry: 3120 retry:
3121 /* 3121 /*
3122 * With delayed allocation, we don't log the i_disksize update 3122 * With delayed allocation, we don't log the i_disksize update
3123 * if there is delayed block allocation. But we still need 3123 * if there is delayed block allocation. But we still need
3124 * to journal the i_disksize update if we write to the end 3124 * to journal the i_disksize update if we write to the end
3125 * of the file over an already mapped buffer. 3125 * of the file over an already mapped buffer.
3126 */ 3126 */
3127 handle = ext4_journal_start(inode, 1); 3127 handle = ext4_journal_start(inode, 1);
3128 if (IS_ERR(handle)) { 3128 if (IS_ERR(handle)) {
3129 ret = PTR_ERR(handle); 3129 ret = PTR_ERR(handle);
3130 goto out; 3130 goto out;
3131 } 3131 }
3132 /* We cannot recurse into the filesystem as the transaction is already 3132 /* We cannot recurse into the filesystem as the transaction is already
3133 * started */ 3133 * started */
3134 flags |= AOP_FLAG_NOFS; 3134 flags |= AOP_FLAG_NOFS;
3135 3135
3136 page = grab_cache_page_write_begin(mapping, index, flags); 3136 page = grab_cache_page_write_begin(mapping, index, flags);
3137 if (!page) { 3137 if (!page) {
3138 ext4_journal_stop(handle); 3138 ext4_journal_stop(handle);
3139 ret = -ENOMEM; 3139 ret = -ENOMEM;
3140 goto out; 3140 goto out;
3141 } 3141 }
3142 *pagep = page; 3142 *pagep = page;
3143 3143
3144 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 3144 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
3145 if (ret < 0) { 3145 if (ret < 0) {
3146 unlock_page(page); 3146 unlock_page(page);
3147 ext4_journal_stop(handle); 3147 ext4_journal_stop(handle);
3148 page_cache_release(page); 3148 page_cache_release(page);
3149 /* 3149 /*
3150 * block_write_begin may have instantiated a few blocks 3150 * block_write_begin may have instantiated a few blocks
3151 * outside i_size. Trim these off again. Don't need 3151 * outside i_size. Trim these off again. Don't need
3152 * i_size_read because we hold i_mutex. 3152 * i_size_read because we hold i_mutex.
3153 */ 3153 */
3154 if (pos + len > inode->i_size) 3154 if (pos + len > inode->i_size)
3155 ext4_truncate_failed_write(inode); 3155 ext4_truncate_failed_write(inode);
3156 } 3156 }
3157 3157
3158 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3158 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3159 goto retry; 3159 goto retry;
3160 out: 3160 out:
3161 return ret; 3161 return ret;
3162 } 3162 }
3163 3163
3164 /* 3164 /*
3165 * Check if we should update i_disksize 3165 * Check if we should update i_disksize
3166 * when a write to the end of the file does not require block allocation 3166 * when a write to the end of the file does not require block allocation
3167 */ 3167 */
3168 static int ext4_da_should_update_i_disksize(struct page *page, 3168 static int ext4_da_should_update_i_disksize(struct page *page,
3169 unsigned long offset) 3169 unsigned long offset)
3170 { 3170 {
3171 struct buffer_head *bh; 3171 struct buffer_head *bh;
3172 struct inode *inode = page->mapping->host; 3172 struct inode *inode = page->mapping->host;
3173 unsigned int idx; 3173 unsigned int idx;
3174 int i; 3174 int i;
3175 3175
3176 bh = page_buffers(page); 3176 bh = page_buffers(page);
3177 idx = offset >> inode->i_blkbits; 3177 idx = offset >> inode->i_blkbits;
3178 3178
3179 for (i = 0; i < idx; i++) 3179 for (i = 0; i < idx; i++)
3180 bh = bh->b_this_page; 3180 bh = bh->b_this_page;
3181 3181
3182 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) 3182 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
3183 return 0; 3183 return 0;
3184 return 1; 3184 return 1;
3185 } 3185 }
3186 3186
3187 static int ext4_da_write_end(struct file *file, 3187 static int ext4_da_write_end(struct file *file,
3188 struct address_space *mapping, 3188 struct address_space *mapping,
3189 loff_t pos, unsigned len, unsigned copied, 3189 loff_t pos, unsigned len, unsigned copied,
3190 struct page *page, void *fsdata) 3190 struct page *page, void *fsdata)
3191 { 3191 {
3192 struct inode *inode = mapping->host; 3192 struct inode *inode = mapping->host;
3193 int ret = 0, ret2; 3193 int ret = 0, ret2;
3194 handle_t *handle = ext4_journal_current_handle(); 3194 handle_t *handle = ext4_journal_current_handle();
3195 loff_t new_i_size; 3195 loff_t new_i_size;
3196 unsigned long start, end; 3196 unsigned long start, end;
3197 int write_mode = (int)(unsigned long)fsdata; 3197 int write_mode = (int)(unsigned long)fsdata;
3198 3198
3199 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 3199 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
3200 if (ext4_should_order_data(inode)) { 3200 if (ext4_should_order_data(inode)) {
3201 return ext4_ordered_write_end(file, mapping, pos, 3201 return ext4_ordered_write_end(file, mapping, pos,
3202 len, copied, page, fsdata); 3202 len, copied, page, fsdata);
3203 } else if (ext4_should_writeback_data(inode)) { 3203 } else if (ext4_should_writeback_data(inode)) {
3204 return ext4_writeback_write_end(file, mapping, pos, 3204 return ext4_writeback_write_end(file, mapping, pos,
3205 len, copied, page, fsdata); 3205 len, copied, page, fsdata);
3206 } else { 3206 } else {
3207 BUG(); 3207 BUG();
3208 } 3208 }
3209 } 3209 }
3210 3210
3211 trace_ext4_da_write_end(inode, pos, len, copied); 3211 trace_ext4_da_write_end(inode, pos, len, copied);
3212 start = pos & (PAGE_CACHE_SIZE - 1); 3212 start = pos & (PAGE_CACHE_SIZE - 1);
3213 end = start + copied - 1; 3213 end = start + copied - 1;
3214 3214
3215 /* 3215 /*
3216 * generic_write_end() will run mark_inode_dirty() if i_size 3216 * generic_write_end() will run mark_inode_dirty() if i_size
3217 * changes. So let's piggyback the i_disksize mark_inode_dirty 3217 * changes. So let's piggyback the i_disksize mark_inode_dirty
3218 * into that. 3218 * into that.
3219 */ 3219 */
3220 3220
3221 new_i_size = pos + copied; 3221 new_i_size = pos + copied;
3222 if (new_i_size > EXT4_I(inode)->i_disksize) { 3222 if (new_i_size > EXT4_I(inode)->i_disksize) {
3223 if (ext4_da_should_update_i_disksize(page, end)) { 3223 if (ext4_da_should_update_i_disksize(page, end)) {
3224 down_write(&EXT4_I(inode)->i_data_sem); 3224 down_write(&EXT4_I(inode)->i_data_sem);
3225 if (new_i_size > EXT4_I(inode)->i_disksize) { 3225 if (new_i_size > EXT4_I(inode)->i_disksize) {
3226 /* 3226 /*
3227 * Updating i_disksize when extending file 3227 * Updating i_disksize when extending file
3228 * without needing block allocation 3228 * without needing block allocation
3229 */ 3229 */
3230 if (ext4_should_order_data(inode)) 3230 if (ext4_should_order_data(inode))
3231 ret = ext4_jbd2_file_inode(handle, 3231 ret = ext4_jbd2_file_inode(handle,
3232 inode); 3232 inode);
3233 3233
3234 EXT4_I(inode)->i_disksize = new_i_size; 3234 EXT4_I(inode)->i_disksize = new_i_size;
3235 } 3235 }
3236 up_write(&EXT4_I(inode)->i_data_sem); 3236 up_write(&EXT4_I(inode)->i_data_sem);
3237 /* We need to mark inode dirty even if 3237 /* We need to mark inode dirty even if
3238 * new_i_size is less than inode->i_size 3238 * new_i_size is less than inode->i_size
3239 * but greater than i_disksize. (hint: delalloc) 3239 * but greater than i_disksize. (hint: delalloc)
3240 */ 3240 */
3241 ext4_mark_inode_dirty(handle, inode); 3241 ext4_mark_inode_dirty(handle, inode);
3242 } 3242 }
3243 } 3243 }
3244 ret2 = generic_write_end(file, mapping, pos, len, copied, 3244 ret2 = generic_write_end(file, mapping, pos, len, copied,
3245 page, fsdata); 3245 page, fsdata);
3246 copied = ret2; 3246 copied = ret2;
3247 if (ret2 < 0) 3247 if (ret2 < 0)
3248 ret = ret2; 3248 ret = ret2;
3249 ret2 = ext4_journal_stop(handle); 3249 ret2 = ext4_journal_stop(handle);
3250 if (!ret) 3250 if (!ret)
3251 ret = ret2; 3251 ret = ret2;
3252 3252
3253 return ret ? ret : copied; 3253 return ret ? ret : copied;
3254 } 3254 }
3255 3255
3256 static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 3256 static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
3257 { 3257 {
3258 /* 3258 /*
3259 * Drop reserved blocks 3259 * Drop reserved blocks
3260 */ 3260 */
3261 BUG_ON(!PageLocked(page)); 3261 BUG_ON(!PageLocked(page));
3262 if (!page_has_buffers(page)) 3262 if (!page_has_buffers(page))
3263 goto out; 3263 goto out;
3264 3264
3265 ext4_da_page_release_reservation(page, offset); 3265 ext4_da_page_release_reservation(page, offset);
3266 3266
3267 out: 3267 out:
3268 ext4_invalidatepage(page, offset); 3268 ext4_invalidatepage(page, offset);
3269 3269
3270 return; 3270 return;
3271 } 3271 }
3272 3272
3273 /* 3273 /*
3274 * Force all delayed allocation blocks to be allocated for a given inode. 3274 * Force all delayed allocation blocks to be allocated for a given inode.
3275 */ 3275 */
3276 int ext4_alloc_da_blocks(struct inode *inode) 3276 int ext4_alloc_da_blocks(struct inode *inode)
3277 { 3277 {
3278 trace_ext4_alloc_da_blocks(inode); 3278 trace_ext4_alloc_da_blocks(inode);
3279 3279
3280 if (!EXT4_I(inode)->i_reserved_data_blocks && 3280 if (!EXT4_I(inode)->i_reserved_data_blocks &&
3281 !EXT4_I(inode)->i_reserved_meta_blocks) 3281 !EXT4_I(inode)->i_reserved_meta_blocks)
3282 return 0; 3282 return 0;
3283 3283
3284 /* 3284 /*
3285 * We do something simple for now. The filemap_flush() will 3285 * We do something simple for now. The filemap_flush() will
3286 * also start triggering a write of the data blocks, which is 3286 * also start triggering a write of the data blocks, which is
3287 * not strictly speaking necessary (and for users of 3287 * not strictly speaking necessary (and for users of
3288 * laptop_mode, not even desirable). However, to do otherwise 3288 * laptop_mode, not even desirable). However, to do otherwise
3289 * would require replicating code paths in: 3289 * would require replicating code paths in:
3290 * 3290 *
3291 * ext4_da_writepages() -> 3291 * ext4_da_writepages() ->
3292 * write_cache_pages() ---> (via passed in callback function) 3292 * write_cache_pages() ---> (via passed in callback function)
3293 * __mpage_da_writepage() --> 3293 * __mpage_da_writepage() -->
3294 * mpage_add_bh_to_extent() 3294 * mpage_add_bh_to_extent()
3295 * mpage_da_map_blocks() 3295 * mpage_da_map_blocks()
3296 * 3296 *
3297 * The problem is that write_cache_pages(), located in 3297 * The problem is that write_cache_pages(), located in
3298 * mm/page-writeback.c, marks pages clean in preparation for 3298 * mm/page-writeback.c, marks pages clean in preparation for
3299 * doing I/O, which is not desirable if we're not planning on 3299 * doing I/O, which is not desirable if we're not planning on
3300 * doing I/O at all. 3300 * doing I/O at all.
3301 * 3301 *
3302 * We could call write_cache_pages(), and then redirty all of 3302 * We could call write_cache_pages(), and then redirty all of
3303 * the pages by calling redirty_page_for_writepage() but that 3303 * the pages by calling redirty_page_for_writepage() but that
3304 * would be ugly in the extreme. So instead we would need to 3304 * would be ugly in the extreme. So instead we would need to
3305 * replicate parts of the code in the above functions, 3305 * replicate parts of the code in the above functions,
3306 * simplifying them because we wouldn't actually intend to 3306 * simplifying them because we wouldn't actually intend to
3307 * write out the pages, but rather only collect contiguous 3307 * write out the pages, but rather only collect contiguous
3308 * logical block extents, call the multi-block allocator, and 3308 * logical block extents, call the multi-block allocator, and
3309 * then update the buffer heads with the block allocations. 3309 * then update the buffer heads with the block allocations.
3310 * 3310 *
3311 * For now, though, we'll cheat by calling filemap_flush(), 3311 * For now, though, we'll cheat by calling filemap_flush(),
3312 * which will map the blocks, and start the I/O, but not 3312 * which will map the blocks, and start the I/O, but not
3313 * actually wait for the I/O to complete. 3313 * actually wait for the I/O to complete.
3314 */ 3314 */
3315 return filemap_flush(inode->i_mapping); 3315 return filemap_flush(inode->i_mapping);
3316 } 3316 }
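One common way into ext4_alloc_da_blocks() is ext4's auto_da_alloc heuristic, which forces the delayed-allocation flush when an application performs the classic replace-via-rename update without an fsync. A minimal userspace sketch of that pattern follows; it is not part of this diff, the file names are hypothetical, and error handling is abbreviated:

/* Replace-via-rename: ext4's auto_da_alloc forces allocation of the
 * delayed blocks of "config.tmp" when it is renamed over an existing
 * "config", so a crash right after the rename does not leave an
 * empty file behind.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char buf[] = "new contents\n";
	int fd = open("config.tmp", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || write(fd, buf, sizeof(buf) - 1) < 0)
		return 1;
	close(fd);
	return rename("config.tmp", "config") ? 1 : 0;
}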
3317 3317
3318 /* 3318 /*
3319 * bmap() is special. It gets used by applications such as lilo and by 3319 * bmap() is special. It gets used by applications such as lilo and by
3320 * the swapper to find the on-disk block of a specific piece of data. 3320 * the swapper to find the on-disk block of a specific piece of data.
3321 * 3321 *
3322 * Naturally, this is dangerous if the block concerned is still in the 3322 * Naturally, this is dangerous if the block concerned is still in the
3323 * journal. If somebody makes a swapfile on an ext4 data-journaling 3323 * journal. If somebody makes a swapfile on an ext4 data-journaling
3324 * filesystem and enables swap, then they may get a nasty shock when the 3324 * filesystem and enables swap, then they may get a nasty shock when the
3325 * data getting swapped to that swapfile suddenly gets overwritten by 3325 * data getting swapped to that swapfile suddenly gets overwritten by
3326 * the original zeros written out previously to the journal and 3326 * the original zeros written out previously to the journal and
3327 * awaiting writeback in the kernel's buffer cache. 3327 * awaiting writeback in the kernel's buffer cache.
3328 * 3328 *
3329 * So, if we see any bmap calls here on a modified, data-journaled file, 3329 * So, if we see any bmap calls here on a modified, data-journaled file,
3330 * take extra steps to flush any blocks which might be in the cache. 3330 * take extra steps to flush any blocks which might be in the cache.
3331 */ 3331 */
3332 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3332 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3333 { 3333 {
3334 struct inode *inode = mapping->host; 3334 struct inode *inode = mapping->host;
3335 journal_t *journal; 3335 journal_t *journal;
3336 int err; 3336 int err;
3337 3337
3338 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3338 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3339 test_opt(inode->i_sb, DELALLOC)) { 3339 test_opt(inode->i_sb, DELALLOC)) {
3340 /* 3340 /*
3341 * With delalloc we want to sync the file 3341 * With delalloc we want to sync the file
3342 * so that we can make sure we allocate 3342 * so that we can make sure we allocate
3343 * blocks for the file 3343 * blocks for the file
3344 */ 3344 */
3345 filemap_write_and_wait(mapping); 3345 filemap_write_and_wait(mapping);
3346 } 3346 }
3347 3347
3348 if (EXT4_JOURNAL(inode) && 3348 if (EXT4_JOURNAL(inode) &&
3349 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { 3349 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3350 /* 3350 /*
3351 * This is a REALLY heavyweight approach, but the use of 3351 * This is a REALLY heavyweight approach, but the use of
3352 * bmap on dirty files is expected to be extremely rare: 3352 * bmap on dirty files is expected to be extremely rare:
3353 * only if we run lilo or swapon on a freshly made file 3353 * only if we run lilo or swapon on a freshly made file
3354 * do we expect this to happen. 3354 * do we expect this to happen.
3355 * 3355 *
3356 * (bmap requires CAP_SYS_RAWIO so this does not 3356 * (bmap requires CAP_SYS_RAWIO so this does not
3357 * represent an unprivileged user DOS attack --- we'd be 3357 * represent an unprivileged user DOS attack --- we'd be
3358 * in trouble if mortal users could trigger this path at 3358 * in trouble if mortal users could trigger this path at
3359 * will.) 3359 * will.)
3360 * 3360 *
3361 * NB. EXT4_STATE_JDATA is not set on files other than 3361 * NB. EXT4_STATE_JDATA is not set on files other than
3362 * regular files. If somebody wants to bmap a directory 3362 * regular files. If somebody wants to bmap a directory
3363 * or symlink and gets confused because the buffer 3363 * or symlink and gets confused because the buffer
3364 * hasn't yet been flushed to disk, they deserve 3364 * hasn't yet been flushed to disk, they deserve
3365 * everything they get. 3365 * everything they get.
3366 */ 3366 */
3367 3367
3368 ext4_clear_inode_state(inode, EXT4_STATE_JDATA); 3368 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3369 journal = EXT4_JOURNAL(inode); 3369 journal = EXT4_JOURNAL(inode);
3370 jbd2_journal_lock_updates(journal); 3370 jbd2_journal_lock_updates(journal);
3371 err = jbd2_journal_flush(journal); 3371 err = jbd2_journal_flush(journal);
3372 jbd2_journal_unlock_updates(journal); 3372 jbd2_journal_unlock_updates(journal);
3373 3373
3374 if (err) 3374 if (err)
3375 return 0; 3375 return 0;
3376 } 3376 }
3377 3377
3378 return generic_block_bmap(mapping, block, ext4_get_block); 3378 return generic_block_bmap(mapping, block, ext4_get_block);
3379 } 3379 }
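The bmap interface handled above is what the FIBMAP ioctl exposes to userspace, and is how lilo-style boot loaders and swapon learn on-disk block numbers. A minimal sketch, assuming a file path in argv[1] and CAP_SYS_RAWIO; this is an illustration, not part of the diff:

/* Query the physical block backing logical block 0 of a file. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIBMAP */

int main(int argc, char **argv)
{
	int fd, blk = 0;	/* logical block in, physical block out */

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (ioctl(fd, FIBMAP, &blk) < 0) {
		perror("FIBMAP");
		return 1;
	}
	printf("logical block 0 -> physical block %d\n", blk);
	return 0;
}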
3380 3380
3381 static int ext4_readpage(struct file *file, struct page *page) 3381 static int ext4_readpage(struct file *file, struct page *page)
3382 { 3382 {
3383 trace_ext4_readpage(page); 3383 trace_ext4_readpage(page);
3384 return mpage_readpage(page, ext4_get_block); 3384 return mpage_readpage(page, ext4_get_block);
3385 } 3385 }
3386 3386
3387 static int 3387 static int
3388 ext4_readpages(struct file *file, struct address_space *mapping, 3388 ext4_readpages(struct file *file, struct address_space *mapping,
3389 struct list_head *pages, unsigned nr_pages) 3389 struct list_head *pages, unsigned nr_pages)
3390 { 3390 {
3391 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3391 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3392 } 3392 }
3393 3393
3394 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3394 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3395 { 3395 {
3396 struct buffer_head *head, *bh; 3396 struct buffer_head *head, *bh;
3397 unsigned int curr_off = 0; 3397 unsigned int curr_off = 0;
3398 3398
3399 if (!page_has_buffers(page)) 3399 if (!page_has_buffers(page))
3400 return; 3400 return;
3401 head = bh = page_buffers(page); 3401 head = bh = page_buffers(page);
3402 do { 3402 do {
3403 if (offset <= curr_off && test_clear_buffer_uninit(bh) 3403 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3404 && bh->b_private) { 3404 && bh->b_private) {
3405 ext4_free_io_end(bh->b_private); 3405 ext4_free_io_end(bh->b_private);
3406 bh->b_private = NULL; 3406 bh->b_private = NULL;
3407 bh->b_end_io = NULL; 3407 bh->b_end_io = NULL;
3408 } 3408 }
3409 curr_off = curr_off + bh->b_size; 3409 curr_off = curr_off + bh->b_size;
3410 bh = bh->b_this_page; 3410 bh = bh->b_this_page;
3411 } while (bh != head); 3411 } while (bh != head);
3412 } 3412 }
3413 3413
3414 static void ext4_invalidatepage(struct page *page, unsigned long offset) 3414 static void ext4_invalidatepage(struct page *page, unsigned long offset)
3415 { 3415 {
3416 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3416 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3417 3417
3418 trace_ext4_invalidatepage(page, offset); 3418 trace_ext4_invalidatepage(page, offset);
3419 3419
3420 /* 3420 /*
3421 * free any io_end structure allocated for buffers to be discarded 3421 * free any io_end structure allocated for buffers to be discarded
3422 */ 3422 */
3423 if (ext4_should_dioread_nolock(page->mapping->host)) 3423 if (ext4_should_dioread_nolock(page->mapping->host))
3424 ext4_invalidatepage_free_endio(page, offset); 3424 ext4_invalidatepage_free_endio(page, offset);
3425 /* 3425 /*
3426 * If it's a full truncate we just forget about the pending dirtying 3426 * If it's a full truncate we just forget about the pending dirtying
3427 */ 3427 */
3428 if (offset == 0) 3428 if (offset == 0)
3429 ClearPageChecked(page); 3429 ClearPageChecked(page);
3430 3430
3431 if (journal) 3431 if (journal)
3432 jbd2_journal_invalidatepage(journal, page, offset); 3432 jbd2_journal_invalidatepage(journal, page, offset);
3433 else 3433 else
3434 block_invalidatepage(page, offset); 3434 block_invalidatepage(page, offset);
3435 } 3435 }
3436 3436
3437 static int ext4_releasepage(struct page *page, gfp_t wait) 3437 static int ext4_releasepage(struct page *page, gfp_t wait)
3438 { 3438 {
3439 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3439 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3440 3440
3441 trace_ext4_releasepage(page); 3441 trace_ext4_releasepage(page);
3442 3442
3443 WARN_ON(PageChecked(page)); 3443 WARN_ON(PageChecked(page));
3444 if (!page_has_buffers(page)) 3444 if (!page_has_buffers(page))
3445 return 0; 3445 return 0;
3446 if (journal) 3446 if (journal)
3447 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3447 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3448 else 3448 else
3449 return try_to_free_buffers(page); 3449 return try_to_free_buffers(page);
3450 } 3450 }
3451 3451
3452 /* 3452 /*
3453 * O_DIRECT for ext3 (or indirect map) based files 3453 * O_DIRECT for ext3 (or indirect map) based files
3454 * 3454 *
3455 * If the O_DIRECT write will extend the file then add this inode to the 3455 * If the O_DIRECT write will extend the file then add this inode to the
3456 * orphan list. So recovery will truncate it back to the original size 3456 * orphan list. So recovery will truncate it back to the original size
3457 * if the machine crashes during the write. 3457 * if the machine crashes during the write.
3458 * 3458 *
3459 * If the O_DIRECT write is instantiating holes inside i_size and the machine 3459 * If the O_DIRECT write is instantiating holes inside i_size and the machine
3460 * crashes then stale disk data _may_ be exposed inside the file. But current 3460 * crashes then stale disk data _may_ be exposed inside the file. But current
3461 * VFS code falls back to the buffered path in that case, so we are safe. 3461 * VFS code falls back to the buffered path in that case, so we are safe.
3462 */ 3462 */
3463 static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, 3463 static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3464 const struct iovec *iov, loff_t offset, 3464 const struct iovec *iov, loff_t offset,
3465 unsigned long nr_segs) 3465 unsigned long nr_segs)
3466 { 3466 {
3467 struct file *file = iocb->ki_filp; 3467 struct file *file = iocb->ki_filp;
3468 struct inode *inode = file->f_mapping->host; 3468 struct inode *inode = file->f_mapping->host;
3469 struct ext4_inode_info *ei = EXT4_I(inode); 3469 struct ext4_inode_info *ei = EXT4_I(inode);
3470 handle_t *handle; 3470 handle_t *handle;
3471 ssize_t ret; 3471 ssize_t ret;
3472 int orphan = 0; 3472 int orphan = 0;
3473 size_t count = iov_length(iov, nr_segs); 3473 size_t count = iov_length(iov, nr_segs);
3474 int retries = 0; 3474 int retries = 0;
3475 3475
3476 if (rw == WRITE) { 3476 if (rw == WRITE) {
3477 loff_t final_size = offset + count; 3477 loff_t final_size = offset + count;
3478 3478
3479 if (final_size > inode->i_size) { 3479 if (final_size > inode->i_size) {
3480 /* Credits for sb + inode write */ 3480 /* Credits for sb + inode write */
3481 handle = ext4_journal_start(inode, 2); 3481 handle = ext4_journal_start(inode, 2);
3482 if (IS_ERR(handle)) { 3482 if (IS_ERR(handle)) {
3483 ret = PTR_ERR(handle); 3483 ret = PTR_ERR(handle);
3484 goto out; 3484 goto out;
3485 } 3485 }
3486 ret = ext4_orphan_add(handle, inode); 3486 ret = ext4_orphan_add(handle, inode);
3487 if (ret) { 3487 if (ret) {
3488 ext4_journal_stop(handle); 3488 ext4_journal_stop(handle);
3489 goto out; 3489 goto out;
3490 } 3490 }
3491 orphan = 1; 3491 orphan = 1;
3492 ei->i_disksize = inode->i_size; 3492 ei->i_disksize = inode->i_size;
3493 ext4_journal_stop(handle); 3493 ext4_journal_stop(handle);
3494 } 3494 }
3495 } 3495 }
3496 3496
3497 retry: 3497 retry:
3498 if (rw == READ && ext4_should_dioread_nolock(inode)) 3498 if (rw == READ && ext4_should_dioread_nolock(inode))
3499 ret = __blockdev_direct_IO(rw, iocb, inode, 3499 ret = __blockdev_direct_IO(rw, iocb, inode,
3500 inode->i_sb->s_bdev, iov, 3500 inode->i_sb->s_bdev, iov,
3501 offset, nr_segs, 3501 offset, nr_segs,
3502 ext4_get_block, NULL, NULL, 0); 3502 ext4_get_block, NULL, NULL, 0);
3503 else { 3503 else {
3504 ret = blockdev_direct_IO(rw, iocb, inode, iov, 3504 ret = blockdev_direct_IO(rw, iocb, inode, iov,
3505 offset, nr_segs, ext4_get_block); 3505 offset, nr_segs, ext4_get_block);
3506 3506
3507 if (unlikely((rw & WRITE) && ret < 0)) { 3507 if (unlikely((rw & WRITE) && ret < 0)) {
3508 loff_t isize = i_size_read(inode); 3508 loff_t isize = i_size_read(inode);
3509 loff_t end = offset + iov_length(iov, nr_segs); 3509 loff_t end = offset + iov_length(iov, nr_segs);
3510 3510
3511 if (end > isize) 3511 if (end > isize)
3512 ext4_truncate_failed_write(inode); 3512 ext4_truncate_failed_write(inode);
3513 } 3513 }
3514 } 3514 }
3515 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3515 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3516 goto retry; 3516 goto retry;
3517 3517
3518 if (orphan) { 3518 if (orphan) {
3519 int err; 3519 int err;
3520 3520
3521 /* Credits for sb + inode write */ 3521 /* Credits for sb + inode write */
3522 handle = ext4_journal_start(inode, 2); 3522 handle = ext4_journal_start(inode, 2);
3523 if (IS_ERR(handle)) { 3523 if (IS_ERR(handle)) {
3524 /* This is really bad luck. We've written the data 3524 /* This is really bad luck. We've written the data
3525 * but cannot extend i_size. Bail out and pretend 3525 * but cannot extend i_size. Bail out and pretend
3526 * the write failed... */ 3526 * the write failed... */
3527 ret = PTR_ERR(handle); 3527 ret = PTR_ERR(handle);
3528 if (inode->i_nlink) 3528 if (inode->i_nlink)
3529 ext4_orphan_del(NULL, inode); 3529 ext4_orphan_del(NULL, inode);
3530 3530
3531 goto out; 3531 goto out;
3532 } 3532 }
3533 if (inode->i_nlink) 3533 if (inode->i_nlink)
3534 ext4_orphan_del(handle, inode); 3534 ext4_orphan_del(handle, inode);
3535 if (ret > 0) { 3535 if (ret > 0) {
3536 loff_t end = offset + ret; 3536 loff_t end = offset + ret;
3537 if (end > inode->i_size) { 3537 if (end > inode->i_size) {
3538 ei->i_disksize = end; 3538 ei->i_disksize = end;
3539 i_size_write(inode, end); 3539 i_size_write(inode, end);
3540 /* 3540 /*
3541 * We're going to return a positive `ret' 3541 * We're going to return a positive `ret'
3542 * here due to non-zero-length I/O, so there's 3542 * here due to non-zero-length I/O, so there's
3543 * no way of reporting error returns from 3543 * no way of reporting error returns from
3544 * ext4_mark_inode_dirty() to userspace. So 3544 * ext4_mark_inode_dirty() to userspace. So
3545 * ignore it. 3545 * ignore it.
3546 */ 3546 */
3547 ext4_mark_inode_dirty(handle, inode); 3547 ext4_mark_inode_dirty(handle, inode);
3548 } 3548 }
3549 } 3549 }
3550 err = ext4_journal_stop(handle); 3550 err = ext4_journal_stop(handle);
3551 if (ret == 0) 3551 if (ret == 0)
3552 ret = err; 3552 ret = err;
3553 } 3553 }
3554 out: 3554 out:
3555 return ret; 3555 return ret;
3556 } 3556 }
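For reference, an extending O_DIRECT write of the kind the orphan-list handling above protects can be produced from userspace as follows. This is only a sketch: the file name and the 4 KiB alignment are assumptions, and error handling is abbreviated.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("dio.dat", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);
	/*
	 * The write extends the (empty) file, so the inode sits on the
	 * orphan list until i_size is safely updated; a crash in between
	 * is recovered by truncating back to the original size.
	 */
	if (pwrite(fd, buf, 4096, 0) != 4096)
		return 1;
	free(buf);
	return close(fd);
}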
3557 3557
3558 /* 3558 /*
3559 * ext4_get_block used when preparing for a DIO write or buffer write. 3559 * ext4_get_block used when preparing for a DIO write or buffer write.
3560 * We allocate an uninitialized extent if blocks haven't been allocated. 3560 * We allocate an uninitialized extent if blocks haven't been allocated.
3561 * The extent will be converted to initialized after the IO is complete. 3561 * The extent will be converted to initialized after the IO is complete.
3562 */ 3562 */
3563 static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3563 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3564 struct buffer_head *bh_result, int create) 3564 struct buffer_head *bh_result, int create)
3565 { 3565 {
3566 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3566 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3567 inode->i_ino, create); 3567 inode->i_ino, create);
3568 return _ext4_get_block(inode, iblock, bh_result, 3568 return _ext4_get_block(inode, iblock, bh_result,
3569 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3569 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3570 } 3570 }
3571 3571
3572 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3572 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3573 ssize_t size, void *private, int ret, 3573 ssize_t size, void *private, int ret,
3574 bool is_async) 3574 bool is_async)
3575 { 3575 {
3576 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 3576 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
3577 ext4_io_end_t *io_end = iocb->private; 3577 ext4_io_end_t *io_end = iocb->private;
3578 struct workqueue_struct *wq; 3578 struct workqueue_struct *wq;
3579 unsigned long flags; 3579 unsigned long flags;
3580 struct ext4_inode_info *ei; 3580 struct ext4_inode_info *ei;
3581 3581
3582 /* if not async direct IO or dio with 0 bytes write, just return */ 3582 /* if not async direct IO or dio with 0 bytes write, just return */
3583 if (!io_end || !size) 3583 if (!io_end || !size)
3584 goto out; 3584 goto out;
3585 3585
3586 ext_debug("ext4_end_io_dio(): io_end 0x%p" 3586 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3587 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 3587 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3588 iocb->private, io_end->inode->i_ino, iocb, offset, 3588 iocb->private, io_end->inode->i_ino, iocb, offset,
3589 size); 3589 size);
3590 3590
3591 /* if not aio dio with unwritten extents, just free io and return */ 3591 /* if not aio dio with unwritten extents, just free io and return */
3592 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 3592 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3593 ext4_free_io_end(io_end); 3593 ext4_free_io_end(io_end);
3594 iocb->private = NULL; 3594 iocb->private = NULL;
3595 out: 3595 out:
3596 if (is_async) 3596 if (is_async)
3597 aio_complete(iocb, ret, 0); 3597 aio_complete(iocb, ret, 0);
3598 inode_dio_done(inode); 3598 inode_dio_done(inode);
3599 return; 3599 return;
3600 } 3600 }
3601 3601
3602 io_end->offset = offset; 3602 io_end->offset = offset;
3603 io_end->size = size; 3603 io_end->size = size;
3604 if (is_async) { 3604 if (is_async) {
3605 io_end->iocb = iocb; 3605 io_end->iocb = iocb;
3606 io_end->result = ret; 3606 io_end->result = ret;
3607 } 3607 }
3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3609 3609
3610 /* Add the io_end to per-inode completed aio dio list*/ 3610 /* Add the io_end to per-inode completed aio dio list*/
3611 ei = EXT4_I(io_end->inode); 3611 ei = EXT4_I(io_end->inode);
3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3613 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3613 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3615 3615
3616 /* queue the work to convert unwritten extents to written */ 3616 /* queue the work to convert unwritten extents to written */
3617 queue_work(wq, &io_end->work); 3617 queue_work(wq, &io_end->work);
3618 iocb->private = NULL; 3618 iocb->private = NULL;
3619 3619
3620 /* XXX: probably should move into the real I/O completion handler */ 3620 /* XXX: probably should move into the real I/O completion handler */
3621 inode_dio_done(inode); 3621 inode_dio_done(inode);
3622 } 3622 }
3623 3623
3624 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 3624 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3625 { 3625 {
3626 ext4_io_end_t *io_end = bh->b_private; 3626 ext4_io_end_t *io_end = bh->b_private;
3627 struct workqueue_struct *wq; 3627 struct workqueue_struct *wq;
3628 struct inode *inode; 3628 struct inode *inode;
3629 unsigned long flags; 3629 unsigned long flags;
3630 3630
3631 if (!test_clear_buffer_uninit(bh) || !io_end) 3631 if (!test_clear_buffer_uninit(bh) || !io_end)
3632 goto out; 3632 goto out;
3633 3633
3634 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { 3634 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3635 printk("sb umounted, discard end_io request for inode %lu\n", 3635 printk("sb umounted, discard end_io request for inode %lu\n",
3636 io_end->inode->i_ino); 3636 io_end->inode->i_ino);
3637 ext4_free_io_end(io_end); 3637 ext4_free_io_end(io_end);
3638 goto out; 3638 goto out;
3639 } 3639 }
3640 3640
3641 io_end->flag = EXT4_IO_END_UNWRITTEN; 3641 io_end->flag = EXT4_IO_END_UNWRITTEN;
3642 inode = io_end->inode; 3642 inode = io_end->inode;
3643 3643
3644 /* Add the io_end to per-inode completed io list*/ 3644 /* Add the io_end to per-inode completed io list*/
3645 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 3645 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3646 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 3646 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3647 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 3647 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3648 3648
3649 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 3649 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3650 /* queue the work to convert unwritten extents to written */ 3650 /* queue the work to convert unwritten extents to written */
3651 queue_work(wq, &io_end->work); 3651 queue_work(wq, &io_end->work);
3652 out: 3652 out:
3653 bh->b_private = NULL; 3653 bh->b_private = NULL;
3654 bh->b_end_io = NULL; 3654 bh->b_end_io = NULL;
3655 clear_buffer_uninit(bh); 3655 clear_buffer_uninit(bh);
3656 end_buffer_async_write(bh, uptodate); 3656 end_buffer_async_write(bh, uptodate);
3657 } 3657 }
3658 3658
3659 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) 3659 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3660 { 3660 {
3661 ext4_io_end_t *io_end; 3661 ext4_io_end_t *io_end;
3662 struct page *page = bh->b_page; 3662 struct page *page = bh->b_page;
3663 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; 3663 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3664 size_t size = bh->b_size; 3664 size_t size = bh->b_size;
3665 3665
3666 retry: 3666 retry:
3667 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3667 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3668 if (!io_end) { 3668 if (!io_end) {
3669 pr_warn_ratelimited("%s: allocation fail\n", __func__); 3669 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3670 schedule(); 3670 schedule();
3671 goto retry; 3671 goto retry;
3672 } 3672 }
3673 io_end->offset = offset; 3673 io_end->offset = offset;
3674 io_end->size = size; 3674 io_end->size = size;
3675 /* 3675 /*
3676 * We need to hold a reference to the page to make sure it 3676 * We need to hold a reference to the page to make sure it
3677 * doesn't get evicted before ext4_end_io_work() has a chance 3677 * doesn't get evicted before ext4_end_io_work() has a chance
3678 * to convert the extent from unwritten to written. 3678 * to convert the extent from unwritten to written.
3679 */ 3679 */
3680 io_end->page = page; 3680 io_end->page = page;
3681 get_page(io_end->page); 3681 get_page(io_end->page);
3682 3682
3683 bh->b_private = io_end; 3683 bh->b_private = io_end;
3684 bh->b_end_io = ext4_end_io_buffer_write; 3684 bh->b_end_io = ext4_end_io_buffer_write;
3685 return 0; 3685 return 0;
3686 } 3686 }
3687 3687
3688 /* 3688 /*
3689 * For ext4 extent files, ext4 will do direct-io write to holes, 3689 * For ext4 extent files, ext4 will do direct-io write to holes,
3690 * preallocated extents, and writes that extend the file, with no need to 3690 * preallocated extents, and writes that extend the file, with no need to
3691 * fall back to buffered IO. 3691 * fall back to buffered IO.
3692 * 3692 *
3693 * For holes, we fallocate those blocks and mark them as uninitialized. 3693 * For holes, we fallocate those blocks and mark them as uninitialized.
3694 * If those blocks were preallocated, we make sure they are split, but 3694 * If those blocks were preallocated, we make sure they are split, but
3695 * still keep the range to write as uninitialized. 3695 * still keep the range to write as uninitialized.
3696 * 3696 *
3697 * The unwritten extents will be converted to written when DIO is completed. 3697 * The unwritten extents will be converted to written when DIO is completed.
3698 * For async direct IO, since the IO may still be pending on return, we 3698 * For async direct IO, since the IO may still be pending on return, we
3699 * set up an end_io callback function, which will do the conversion 3699 * set up an end_io callback function, which will do the conversion
3700 * when the async direct IO completes. 3700 * when the async direct IO completes.
3701 * 3701 *
3702 * If the O_DIRECT write will extend the file then add this inode to the 3702 * If the O_DIRECT write will extend the file then add this inode to the
3703 * orphan list. So recovery will truncate it back to the original size 3703 * orphan list. So recovery will truncate it back to the original size
3704 * if the machine crashes during the write. 3704 * if the machine crashes during the write.
3705 * 3705 *
3706 */ 3706 */
3707 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, 3707 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3708 const struct iovec *iov, loff_t offset, 3708 const struct iovec *iov, loff_t offset,
3709 unsigned long nr_segs) 3709 unsigned long nr_segs)
3710 { 3710 {
3711 struct file *file = iocb->ki_filp; 3711 struct file *file = iocb->ki_filp;
3712 struct inode *inode = file->f_mapping->host; 3712 struct inode *inode = file->f_mapping->host;
3713 ssize_t ret; 3713 ssize_t ret;
3714 size_t count = iov_length(iov, nr_segs); 3714 size_t count = iov_length(iov, nr_segs);
3715 3715
3716 loff_t final_size = offset + count; 3716 loff_t final_size = offset + count;
3717 if (rw == WRITE && final_size <= inode->i_size) { 3717 if (rw == WRITE && final_size <= inode->i_size) {
3718 /* 3718 /*
3719 * We could direct write to holes and fallocate. 3719 * We could direct write to holes and fallocate.
3720 * 3720 *
3721 * Allocated blocks to fill the hole are marked as uninitialized 3721 * Allocated blocks to fill the hole are marked as uninitialized
3722 * to prevent a parallel buffered read from exposing the stale data 3722 * to prevent a parallel buffered read from exposing the stale data
3723 * before DIO completes the data IO. 3723 * before DIO completes the data IO.
3724 * 3724 *
3725 * As for previously fallocated extents, ext4 get_block 3725 * As for previously fallocated extents, ext4 get_block
3726 * will simply mark the buffer mapped but still 3726 * will simply mark the buffer mapped but still
3727 * keep the extents uninitialized. 3727 * keep the extents uninitialized.
3728 * 3728 *
3729 * For the non-AIO case, we will convert those unwritten extents 3729 * For the non-AIO case, we will convert those unwritten extents
3730 * to written after returning from blockdev_direct_IO. 3730 * to written after returning from blockdev_direct_IO.
3731 * 3731 *
3732 * For async DIO, the conversion needs to be deferred until 3732 * For async DIO, the conversion needs to be deferred until
3733 * the IO is completed. The ext4 end_io callback function 3733 * the IO is completed. The ext4 end_io callback function
3734 * will be called to take care of the conversion work. 3734 * will be called to take care of the conversion work.
3735 * Here for async case, we allocate an io_end structure to 3735 * Here for async case, we allocate an io_end structure to
3736 * hook to the iocb. 3736 * hook to the iocb.
3737 */ 3737 */
3738 iocb->private = NULL; 3738 iocb->private = NULL;
3739 EXT4_I(inode)->cur_aio_dio = NULL; 3739 EXT4_I(inode)->cur_aio_dio = NULL;
3740 if (!is_sync_kiocb(iocb)) { 3740 if (!is_sync_kiocb(iocb)) {
3741 iocb->private = ext4_init_io_end(inode, GFP_NOFS); 3741 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3742 if (!iocb->private) 3742 if (!iocb->private)
3743 return -ENOMEM; 3743 return -ENOMEM;
3744 /* 3744 /*
3745 * we save the io structure for the current async 3745 * we save the io structure for the current async
3746 * direct IO, so that later ext4_map_blocks() 3746 * direct IO, so that later ext4_map_blocks()
3747 * can flag in the io structure whether there 3747 * can flag in the io structure whether there
3748 * are unwritten extents that need to be converted 3748 * are unwritten extents that need to be converted
3749 * when the IO is completed. 3749 * when the IO is completed.
3750 */ 3750 */
3751 EXT4_I(inode)->cur_aio_dio = iocb->private; 3751 EXT4_I(inode)->cur_aio_dio = iocb->private;
3752 } 3752 }
3753 3753
3754 ret = __blockdev_direct_IO(rw, iocb, inode, 3754 ret = __blockdev_direct_IO(rw, iocb, inode,
3755 inode->i_sb->s_bdev, iov, 3755 inode->i_sb->s_bdev, iov,
3756 offset, nr_segs, 3756 offset, nr_segs,
3757 ext4_get_block_write, 3757 ext4_get_block_write,
3758 ext4_end_io_dio, 3758 ext4_end_io_dio,
3759 NULL, 3759 NULL,
3760 DIO_LOCKING | DIO_SKIP_HOLES); 3760 DIO_LOCKING | DIO_SKIP_HOLES);
3761 if (iocb->private) 3761 if (iocb->private)
3762 EXT4_I(inode)->cur_aio_dio = NULL; 3762 EXT4_I(inode)->cur_aio_dio = NULL;
3763 /* 3763 /*
3764 * The io_end structure takes a reference to the inode; 3764 * The io_end structure takes a reference to the inode;
3765 * that structure needs to be destroyed and the 3765 * that structure needs to be destroyed and the
3766 * reference to the inode needs to be dropped when IO is 3766 * reference to the inode needs to be dropped when IO is
3767 * complete, even for a 0 byte write or a failed one. 3767 * complete, even for a 0 byte write or a failed one.
3768 * 3768 *
3769 * In the successful AIO DIO case, the io_end structure will be 3769 * In the successful AIO DIO case, the io_end structure will be
3770 * destroyed and the reference to the inode will be dropped 3770 * destroyed and the reference to the inode will be dropped
3771 * after the end_io call back function is called. 3771 * after the end_io call back function is called.
3772 * 3772 *
3773 * In the case of a 0 byte write or an error, since 3773 * In the case of a 0 byte write or an error, since
3774 * VFS direct IO won't invoke the end_io callback function, 3774 * VFS direct IO won't invoke the end_io callback function,
3775 * we need to free the end_io structure here. 3775 * we need to free the end_io structure here.
3776 */ 3776 */
3777 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3777 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3778 ext4_free_io_end(iocb->private); 3778 ext4_free_io_end(iocb->private);
3779 iocb->private = NULL; 3779 iocb->private = NULL;
3780 } else if (ret > 0 && ext4_test_inode_state(inode, 3780 } else if (ret > 0 && ext4_test_inode_state(inode,
3781 EXT4_STATE_DIO_UNWRITTEN)) { 3781 EXT4_STATE_DIO_UNWRITTEN)) {
3782 int err; 3782 int err;
3783 /* 3783 /*
3784 * for non AIO case, since the IO is already 3784 * for non AIO case, since the IO is already
3785 * completed, we could do the conversion right here 3785 * completed, we could do the conversion right here
3786 */ 3786 */
3787 err = ext4_convert_unwritten_extents(inode, 3787 err = ext4_convert_unwritten_extents(inode,
3788 offset, ret); 3788 offset, ret);
3789 if (err < 0) 3789 if (err < 0)
3790 ret = err; 3790 ret = err;
3791 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3791 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3792 } 3792 }
3793 return ret; 3793 return ret;
3794 } 3794 }
3795 3795
3796 /* for the write to the end of file case, we fall back to the old way */ 3796 /* for the write to the end of file case, we fall back to the old way */
3797 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3797 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3798 } 3798 }
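The async branch above (attaching an io_end to the iocb so the end_io callback can defer the unwritten-to-written conversion) is what kernel AIO drives, for example through libaio from userspace. A minimal sketch, assuming libaio is available (link with -laio); the file name, sizes, and the ftruncate trick to keep the write inside i_size are illustrative assumptions, not part of this diff:

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd = open("aio.dat", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096) || io_setup(1, &ctx))
		return 1;
	memset(buf, 0, 4096);

	/* Set i_size first so the DIO write lands inside the file and
	 * takes the extent/async path above instead of the fallback. */
	if (ftruncate(fd, 4096))
		return 1;

	/* Queue one async O_DIRECT write; ext4 hooks an io_end to the
	 * iocb and converts the unwritten extent once the IO completes. */
	io_prep_pwrite(&cb, fd, buf, 4096, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return 1;

	io_destroy(ctx);
	free(buf);
	return close(fd);
}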
3799 3799
3800 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3800 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3801 const struct iovec *iov, loff_t offset, 3801 const struct iovec *iov, loff_t offset,
3802 unsigned long nr_segs) 3802 unsigned long nr_segs)
3803 { 3803 {
3804 struct file *file = iocb->ki_filp; 3804 struct file *file = iocb->ki_filp;
3805 struct inode *inode = file->f_mapping->host; 3805 struct inode *inode = file->f_mapping->host;
3806 ssize_t ret; 3806 ssize_t ret;
3807 3807
3808 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3808 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3809 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3809 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3810 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3810 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3811 else 3811 else
3812 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3812 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3813 trace_ext4_direct_IO_exit(inode, offset, 3813 trace_ext4_direct_IO_exit(inode, offset,
3814 iov_length(iov, nr_segs), rw, ret); 3814 iov_length(iov, nr_segs), rw, ret);
3815 return ret; 3815 return ret;
3816 } 3816 }
3817 3817
3818 /* 3818 /*
3819 * Pages can be marked dirty completely asynchronously from ext4's journalling 3819 * Pages can be marked dirty completely asynchronously from ext4's journalling
3820 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3820 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
3821 * much here because ->set_page_dirty is called under VFS locks. The page is 3821 * much here because ->set_page_dirty is called under VFS locks. The page is
3822 * not necessarily locked. 3822 * not necessarily locked.
3823 * 3823 *
3824 * We cannot just dirty the page and leave attached buffers clean, because the 3824 * We cannot just dirty the page and leave attached buffers clean, because the
3825 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3825 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
3826 * or jbddirty because all the journalling code will explode. 3826 * or jbddirty because all the journalling code will explode.
3827 * 3827 *
3828 * So what we do is to mark the page "pending dirty" and next time writepage 3828 * So what we do is to mark the page "pending dirty" and next time writepage
3829 * is called, propagate that into the buffers appropriately. 3829 * is called, propagate that into the buffers appropriately.
3830 */ 3830 */
3831 static int ext4_journalled_set_page_dirty(struct page *page) 3831 static int ext4_journalled_set_page_dirty(struct page *page)
3832 { 3832 {
3833 SetPageChecked(page); 3833 SetPageChecked(page);
3834 return __set_page_dirty_nobuffers(page); 3834 return __set_page_dirty_nobuffers(page);
3835 } 3835 }
3836 3836
3837 static const struct address_space_operations ext4_ordered_aops = { 3837 static const struct address_space_operations ext4_ordered_aops = {
3838 .readpage = ext4_readpage, 3838 .readpage = ext4_readpage,
3839 .readpages = ext4_readpages, 3839 .readpages = ext4_readpages,
3840 .writepage = ext4_writepage, 3840 .writepage = ext4_writepage,
3841 .write_begin = ext4_write_begin, 3841 .write_begin = ext4_write_begin,
3842 .write_end = ext4_ordered_write_end, 3842 .write_end = ext4_ordered_write_end,
3843 .bmap = ext4_bmap, 3843 .bmap = ext4_bmap,
3844 .invalidatepage = ext4_invalidatepage, 3844 .invalidatepage = ext4_invalidatepage,
3845 .releasepage = ext4_releasepage, 3845 .releasepage = ext4_releasepage,
3846 .direct_IO = ext4_direct_IO, 3846 .direct_IO = ext4_direct_IO,
3847 .migratepage = buffer_migrate_page, 3847 .migratepage = buffer_migrate_page,
3848 .is_partially_uptodate = block_is_partially_uptodate, 3848 .is_partially_uptodate = block_is_partially_uptodate,
3849 .error_remove_page = generic_error_remove_page, 3849 .error_remove_page = generic_error_remove_page,
3850 }; 3850 };
3851 3851
3852 static const struct address_space_operations ext4_writeback_aops = { 3852 static const struct address_space_operations ext4_writeback_aops = {
3853 .readpage = ext4_readpage, 3853 .readpage = ext4_readpage,
3854 .readpages = ext4_readpages, 3854 .readpages = ext4_readpages,
3855 .writepage = ext4_writepage, 3855 .writepage = ext4_writepage,
3856 .write_begin = ext4_write_begin, 3856 .write_begin = ext4_write_begin,
3857 .write_end = ext4_writeback_write_end, 3857 .write_end = ext4_writeback_write_end,
3858 .bmap = ext4_bmap, 3858 .bmap = ext4_bmap,
3859 .invalidatepage = ext4_invalidatepage, 3859 .invalidatepage = ext4_invalidatepage,
3860 .releasepage = ext4_releasepage, 3860 .releasepage = ext4_releasepage,
3861 .direct_IO = ext4_direct_IO, 3861 .direct_IO = ext4_direct_IO,
3862 .migratepage = buffer_migrate_page, 3862 .migratepage = buffer_migrate_page,
3863 .is_partially_uptodate = block_is_partially_uptodate, 3863 .is_partially_uptodate = block_is_partially_uptodate,
3864 .error_remove_page = generic_error_remove_page, 3864 .error_remove_page = generic_error_remove_page,
3865 }; 3865 };
3866 3866
3867 static const struct address_space_operations ext4_journalled_aops = { 3867 static const struct address_space_operations ext4_journalled_aops = {
3868 .readpage = ext4_readpage, 3868 .readpage = ext4_readpage,
3869 .readpages = ext4_readpages, 3869 .readpages = ext4_readpages,
3870 .writepage = ext4_writepage, 3870 .writepage = ext4_writepage,
3871 .write_begin = ext4_write_begin, 3871 .write_begin = ext4_write_begin,
3872 .write_end = ext4_journalled_write_end, 3872 .write_end = ext4_journalled_write_end,
3873 .set_page_dirty = ext4_journalled_set_page_dirty, 3873 .set_page_dirty = ext4_journalled_set_page_dirty,
3874 .bmap = ext4_bmap, 3874 .bmap = ext4_bmap,
3875 .invalidatepage = ext4_invalidatepage, 3875 .invalidatepage = ext4_invalidatepage,
3876 .releasepage = ext4_releasepage, 3876 .releasepage = ext4_releasepage,
3877 .is_partially_uptodate = block_is_partially_uptodate, 3877 .is_partially_uptodate = block_is_partially_uptodate,
3878 .error_remove_page = generic_error_remove_page, 3878 .error_remove_page = generic_error_remove_page,
3879 }; 3879 };
3880 3880
3881 static const struct address_space_operations ext4_da_aops = { 3881 static const struct address_space_operations ext4_da_aops = {
3882 .readpage = ext4_readpage, 3882 .readpage = ext4_readpage,
3883 .readpages = ext4_readpages, 3883 .readpages = ext4_readpages,
3884 .writepage = ext4_writepage, 3884 .writepage = ext4_writepage,
3885 .writepages = ext4_da_writepages, 3885 .writepages = ext4_da_writepages,
3886 .write_begin = ext4_da_write_begin, 3886 .write_begin = ext4_da_write_begin,
3887 .write_end = ext4_da_write_end, 3887 .write_end = ext4_da_write_end,
3888 .bmap = ext4_bmap, 3888 .bmap = ext4_bmap,
3889 .invalidatepage = ext4_da_invalidatepage, 3889 .invalidatepage = ext4_da_invalidatepage,
3890 .releasepage = ext4_releasepage, 3890 .releasepage = ext4_releasepage,
3891 .direct_IO = ext4_direct_IO, 3891 .direct_IO = ext4_direct_IO,
3892 .migratepage = buffer_migrate_page, 3892 .migratepage = buffer_migrate_page,
3893 .is_partially_uptodate = block_is_partially_uptodate, 3893 .is_partially_uptodate = block_is_partially_uptodate,
3894 .error_remove_page = generic_error_remove_page, 3894 .error_remove_page = generic_error_remove_page,
3895 }; 3895 };
3896 3896
3897 void ext4_set_aops(struct inode *inode) 3897 void ext4_set_aops(struct inode *inode)
3898 { 3898 {
3899 if (ext4_should_order_data(inode) && 3899 if (ext4_should_order_data(inode) &&
3900 test_opt(inode->i_sb, DELALLOC)) 3900 test_opt(inode->i_sb, DELALLOC))
3901 inode->i_mapping->a_ops = &ext4_da_aops; 3901 inode->i_mapping->a_ops = &ext4_da_aops;
3902 else if (ext4_should_order_data(inode)) 3902 else if (ext4_should_order_data(inode))
3903 inode->i_mapping->a_ops = &ext4_ordered_aops; 3903 inode->i_mapping->a_ops = &ext4_ordered_aops;
3904 else if (ext4_should_writeback_data(inode) && 3904 else if (ext4_should_writeback_data(inode) &&
3905 test_opt(inode->i_sb, DELALLOC)) 3905 test_opt(inode->i_sb, DELALLOC))
3906 inode->i_mapping->a_ops = &ext4_da_aops; 3906 inode->i_mapping->a_ops = &ext4_da_aops;
3907 else if (ext4_should_writeback_data(inode)) 3907 else if (ext4_should_writeback_data(inode))
3908 inode->i_mapping->a_ops = &ext4_writeback_aops; 3908 inode->i_mapping->a_ops = &ext4_writeback_aops;
3909 else 3909 else
3910 inode->i_mapping->a_ops = &ext4_journalled_aops; 3910 inode->i_mapping->a_ops = &ext4_journalled_aops;
3911 } 3911 }
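Which of the address_space_operations tables above gets installed follows from the data journaling mode and the delalloc mount option. Purely as an illustration (device and mount point are hypothetical, and this is not part of the diff), the mode can be selected via the data argument of mount(2); data=journal implies nodelalloc, so ext4_set_aops() then picks ext4_journalled_aops for regular files:

#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* Mount with full data journaling; delalloc is disabled in this
	 * mode, so regular files use the journalled aops table. */
	if (mount("/dev/sdb1", "/mnt/test", "ext4", 0, "data=journal")) {
		perror("mount");
		return 1;
	}
	return 0;
}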
3912 3912
3913 /* 3913 /*
3914 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3914 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3915 * up to the end of the block which corresponds to `from'. 3915 * up to the end of the block which corresponds to `from'.
3916 * This is required during truncate. We need to physically zero the tail end 3916 * This is required during truncate. We need to physically zero the tail end
3917 * of that block so it doesn't yield old data if the file is later grown. 3917 * of that block so it doesn't yield old data if the file is later grown.
3918 */ 3918 */
3919 int ext4_block_truncate_page(handle_t *handle, 3919 int ext4_block_truncate_page(handle_t *handle,
3920 struct address_space *mapping, loff_t from) 3920 struct address_space *mapping, loff_t from)
3921 { 3921 {
3922 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3922 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3923 unsigned length; 3923 unsigned length;
3924 unsigned blocksize; 3924 unsigned blocksize;
3925 struct inode *inode = mapping->host; 3925 struct inode *inode = mapping->host;
3926 3926
3927 blocksize = inode->i_sb->s_blocksize; 3927 blocksize = inode->i_sb->s_blocksize;
3928 length = blocksize - (offset & (blocksize - 1)); 3928 length = blocksize - (offset & (blocksize - 1));
3929 3929
3930 return ext4_block_zero_page_range(handle, mapping, from, length); 3930 return ext4_block_zero_page_range(handle, mapping, from, length);
3931 } 3931 }
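As a worked example of the arithmetic above (the values are hypothetical): with a 4096-byte block size and page size, truncating to from = 10000 zeroes the tail of the partial block, i.e. 2288 bytes starting at page offset 1808:

#include <stdio.h>

int main(void)
{
	unsigned long long from = 10000;	/* hypothetical truncate point */
	unsigned blocksize = 4096, pagesize = 4096;

	unsigned offset = from & (pagesize - 1);		  /* 1808 */
	unsigned length = blocksize - (offset & (blocksize - 1)); /* 2288 */

	printf("zero %u bytes at page offset %u\n", length, offset);
	return 0;
}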
3932 3932
3933 /* 3933 /*
3934 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3934 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3935 * starting from file offset 'from'. The range to be zero'd must 3935 * starting from file offset 'from'. The range to be zero'd must
3936 * be contained within one block. If the specified range exceeds 3936 * be contained within one block. If the specified range exceeds
3937 * the end of the block, it will be shortened to the end of the block 3937 * the end of the block, it will be shortened to the end of the block
3938 * that corresponds to 'from'. 3938 * that corresponds to 'from'.
3939 */ 3939 */
3940 int ext4_block_zero_page_range(handle_t *handle, 3940 int ext4_block_zero_page_range(handle_t *handle,
3941 struct address_space *mapping, loff_t from, loff_t length) 3941 struct address_space *mapping, loff_t from, loff_t length)
3942 { 3942 {
3943 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3943 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3944 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3944 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3945 unsigned blocksize, max, pos; 3945 unsigned blocksize, max, pos;
3946 ext4_lblk_t iblock; 3946 ext4_lblk_t iblock;
3947 struct inode *inode = mapping->host; 3947 struct inode *inode = mapping->host;
3948 struct buffer_head *bh; 3948 struct buffer_head *bh;
3949 struct page *page; 3949 struct page *page;
3950 int err = 0; 3950 int err = 0;
3951 3951
3952 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3952 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3953 mapping_gfp_mask(mapping) & ~__GFP_FS); 3953 mapping_gfp_mask(mapping) & ~__GFP_FS);
3954 if (!page) 3954 if (!page)
3955 return -EINVAL; 3955 return -EINVAL;
3956 3956
3957 blocksize = inode->i_sb->s_blocksize; 3957 blocksize = inode->i_sb->s_blocksize;
3958 max = blocksize - (offset & (blocksize - 1)); 3958 max = blocksize - (offset & (blocksize - 1));
3959 3959
3960 /* 3960 /*
3961 * correct length if it does not fall between 3961 * correct length if it does not fall between
3962 * 'from' and the end of the block 3962 * 'from' and the end of the block
3963 */ 3963 */
3964 if (length > max || length < 0) 3964 if (length > max || length < 0)
3965 length = max; 3965 length = max;
3966 3966
3967 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3967 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3968 3968
3969 if (!page_has_buffers(page)) 3969 if (!page_has_buffers(page))
3970 create_empty_buffers(page, blocksize, 0); 3970 create_empty_buffers(page, blocksize, 0);
3971 3971
3972 /* Find the buffer that contains "offset" */ 3972 /* Find the buffer that contains "offset" */
3973 bh = page_buffers(page); 3973 bh = page_buffers(page);
3974 pos = blocksize; 3974 pos = blocksize;
3975 while (offset >= pos) { 3975 while (offset >= pos) {
3976 bh = bh->b_this_page; 3976 bh = bh->b_this_page;
3977 iblock++; 3977 iblock++;
3978 pos += blocksize; 3978 pos += blocksize;
3979 } 3979 }
3980 3980
3981 err = 0; 3981 err = 0;
3982 if (buffer_freed(bh)) { 3982 if (buffer_freed(bh)) {
3983 BUFFER_TRACE(bh, "freed: skip"); 3983 BUFFER_TRACE(bh, "freed: skip");
3984 goto unlock; 3984 goto unlock;
3985 } 3985 }
3986 3986
3987 if (!buffer_mapped(bh)) { 3987 if (!buffer_mapped(bh)) {
3988 BUFFER_TRACE(bh, "unmapped"); 3988 BUFFER_TRACE(bh, "unmapped");
3989 ext4_get_block(inode, iblock, bh, 0); 3989 ext4_get_block(inode, iblock, bh, 0);
3990 /* unmapped? It's a hole - nothing to do */ 3990 /* unmapped? It's a hole - nothing to do */
3991 if (!buffer_mapped(bh)) { 3991 if (!buffer_mapped(bh)) {
3992 BUFFER_TRACE(bh, "still unmapped"); 3992 BUFFER_TRACE(bh, "still unmapped");
3993 goto unlock; 3993 goto unlock;
3994 } 3994 }
3995 } 3995 }
3996 3996
3997 /* Ok, it's mapped. Make sure it's up-to-date */ 3997 /* Ok, it's mapped. Make sure it's up-to-date */
3998 if (PageUptodate(page)) 3998 if (PageUptodate(page))
3999 set_buffer_uptodate(bh); 3999 set_buffer_uptodate(bh);
4000 4000
4001 if (!buffer_uptodate(bh)) { 4001 if (!buffer_uptodate(bh)) {
4002 err = -EIO; 4002 err = -EIO;
4003 ll_rw_block(READ, 1, &bh); 4003 ll_rw_block(READ, 1, &bh);
4004 wait_on_buffer(bh); 4004 wait_on_buffer(bh);
4005 /* Uhhuh. Read error. Complain and punt. */ 4005 /* Uhhuh. Read error. Complain and punt. */
4006 if (!buffer_uptodate(bh)) 4006 if (!buffer_uptodate(bh))
4007 goto unlock; 4007 goto unlock;
4008 } 4008 }
4009 4009
4010 if (ext4_should_journal_data(inode)) { 4010 if (ext4_should_journal_data(inode)) {
4011 BUFFER_TRACE(bh, "get write access"); 4011 BUFFER_TRACE(bh, "get write access");
4012 err = ext4_journal_get_write_access(handle, bh); 4012 err = ext4_journal_get_write_access(handle, bh);
4013 if (err) 4013 if (err)
4014 goto unlock; 4014 goto unlock;
4015 } 4015 }
4016 4016
4017 zero_user(page, offset, length); 4017 zero_user(page, offset, length);
4018 4018
4019 BUFFER_TRACE(bh, "zeroed end of block"); 4019 BUFFER_TRACE(bh, "zeroed end of block");
4020 4020
4021 err = 0; 4021 err = 0;
4022 if (ext4_should_journal_data(inode)) { 4022 if (ext4_should_journal_data(inode)) {
4023 err = ext4_handle_dirty_metadata(handle, inode, bh); 4023 err = ext4_handle_dirty_metadata(handle, inode, bh);
4024 } else { 4024 } else {
4025 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) 4025 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4026 err = ext4_jbd2_file_inode(handle, inode); 4026 err = ext4_jbd2_file_inode(handle, inode);
4027 mark_buffer_dirty(bh); 4027 mark_buffer_dirty(bh);
4028 } 4028 }
4029 4029
4030 unlock: 4030 unlock:
4031 unlock_page(page); 4031 unlock_page(page);
4032 page_cache_release(page); 4032 page_cache_release(page);
4033 return err; 4033 return err;
4034 } 4034 }
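As a quick illustration of the offset/length clamping above, here is a minimal stand-alone sketch using hypothetical numbers (4096-byte blocks, a new i_size of 10000); it mirrors only the arithmetic, not the buffer-head or journalling work.

#include <stdio.h>

/* Illustrative only: clamp the byte range zeroed in the last partial block. */
int main(void)
{
        unsigned blocksize = 4096;                             /* hypothetical fs block size */
        unsigned long long from = 10000;                       /* hypothetical new i_size */
        unsigned offset = from & (blocksize - 1);              /* 1808 */
        unsigned max = blocksize - (offset & (blocksize - 1)); /* 2288 */
        unsigned length = max;                                 /* clamp, as the function does */

        printf("zero bytes [%u, %u) of the final block\n", offset, offset + length);
        return 0;
}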
4035 4035
4036 /* 4036 /*
4037 * Probably it should be a library function... search for first non-zero word 4037 * Probably it should be a library function... search for first non-zero word
4038 * or memcmp with zero_page, whatever is better for particular architecture. 4038 * or memcmp with zero_page, whatever is better for particular architecture.
4039 * Linus? 4039 * Linus?
4040 */ 4040 */
4041 static inline int all_zeroes(__le32 *p, __le32 *q) 4041 static inline int all_zeroes(__le32 *p, __le32 *q)
4042 { 4042 {
4043 while (p < q) 4043 while (p < q)
4044 if (*p++) 4044 if (*p++)
4045 return 0; 4045 return 0;
4046 return 1; 4046 return 1;
4047 } 4047 }
4048 4048
4049 /** 4049 /**
4050 * ext4_find_shared - find the indirect blocks for partial truncation. 4050 * ext4_find_shared - find the indirect blocks for partial truncation.
4051 * @inode: inode in question 4051 * @inode: inode in question
4052 * @depth: depth of the affected branch 4052 * @depth: depth of the affected branch
4053 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 4053 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
4054 * @chain: place to store the pointers to partial indirect blocks 4054 * @chain: place to store the pointers to partial indirect blocks
4055 * @top: place to the (detached) top of branch 4055 * @top: place to the (detached) top of branch
4056 * 4056 *
4057 * This is a helper function used by ext4_truncate(). 4057 * This is a helper function used by ext4_truncate().
4058 * 4058 *
4059 * When we do truncate() we may have to clean the ends of several 4059 * When we do truncate() we may have to clean the ends of several
4060 * indirect blocks but leave the blocks themselves alive. Block is 4060 * indirect blocks but leave the blocks themselves alive. Block is
4061 * partially truncated if some data below the new i_size is referred 4061 * partially truncated if some data below the new i_size is referred
4062 * from it (and it is on the path to the first completely truncated 4062 * from it (and it is on the path to the first completely truncated
4063 * data block, indeed). We have to free the top of that path along 4063 * data block, indeed). We have to free the top of that path along
4064 * with everything to the right of the path. Since no allocation 4064 * with everything to the right of the path. Since no allocation
4065 * past the truncation point is possible until ext4_truncate() 4065 * past the truncation point is possible until ext4_truncate()
4066 * finishes, we may safely do the latter, but top of branch may 4066 * finishes, we may safely do the latter, but top of branch may
4067 * require special attention - pageout below the truncation point 4067 * require special attention - pageout below the truncation point
4068 * might try to populate it. 4068 * might try to populate it.
4069 * 4069 *
4070 * We atomically detach the top of branch from the tree, store the 4070 * We atomically detach the top of branch from the tree, store the
4071 * block number of its root in *@top, pointers to buffer_heads of 4071 * block number of its root in *@top, pointers to buffer_heads of
4072 * partially truncated blocks - in @chain[].bh and pointers to 4072 * partially truncated blocks - in @chain[].bh and pointers to
4073 * their last elements that should not be removed - in 4073 * their last elements that should not be removed - in
4074 * @chain[].p. Return value is the pointer to last filled element 4074 * @chain[].p. Return value is the pointer to last filled element
4075 * of @chain. 4075 * of @chain.
4076 * 4076 *
4077 * The work left to caller to do the actual freeing of subtrees: 4077 * The work left to caller to do the actual freeing of subtrees:
4078 * a) free the subtree starting from *@top 4078 * a) free the subtree starting from *@top
4079 * b) free the subtrees whose roots are stored in 4079 * b) free the subtrees whose roots are stored in
4080 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 4080 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
4081 * c) free the subtrees growing from the inode past the @chain[0]. 4081 * c) free the subtrees growing from the inode past the @chain[0].
4082 * (no partially truncated stuff there). */ 4082 * (no partially truncated stuff there). */
4083 4083
4084 static Indirect *ext4_find_shared(struct inode *inode, int depth, 4084 static Indirect *ext4_find_shared(struct inode *inode, int depth,
4085 ext4_lblk_t offsets[4], Indirect chain[4], 4085 ext4_lblk_t offsets[4], Indirect chain[4],
4086 __le32 *top) 4086 __le32 *top)
4087 { 4087 {
4088 Indirect *partial, *p; 4088 Indirect *partial, *p;
4089 int k, err; 4089 int k, err;
4090 4090
4091 *top = 0; 4091 *top = 0;
4092 /* Make k index the deepest non-null offset + 1 */ 4092 /* Make k index the deepest non-null offset + 1 */
4093 for (k = depth; k > 1 && !offsets[k-1]; k--) 4093 for (k = depth; k > 1 && !offsets[k-1]; k--)
4094 ; 4094 ;
4095 partial = ext4_get_branch(inode, k, offsets, chain, &err); 4095 partial = ext4_get_branch(inode, k, offsets, chain, &err);
4096 /* Writer: pointers */ 4096 /* Writer: pointers */
4097 if (!partial) 4097 if (!partial)
4098 partial = chain + k-1; 4098 partial = chain + k-1;
4099 /* 4099 /*
4100 * If the branch has acquired a continuation since we looked at it - 4100 * If the branch has acquired a continuation since we looked at it -
4101 * fine, it should all survive and the (new) top doesn't belong to us. 4101 * fine, it should all survive and the (new) top doesn't belong to us.
4102 */ 4102 */
4103 if (!partial->key && *partial->p) 4103 if (!partial->key && *partial->p)
4104 /* Writer: end */ 4104 /* Writer: end */
4105 goto no_top; 4105 goto no_top;
4106 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 4106 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4107 ; 4107 ;
4108 /* 4108 /*
4109 * OK, we've found the last block that must survive. The rest of our 4109 * OK, we've found the last block that must survive. The rest of our
4110 * branch should be detached before unlocking. However, if that rest 4110 * branch should be detached before unlocking. However, if that rest
4111 * of branch is all ours and does not grow immediately from the inode 4111 * of branch is all ours and does not grow immediately from the inode
4112 * it's easier to cheat and just decrement partial->p. 4112 * it's easier to cheat and just decrement partial->p.
4113 */ 4113 */
4114 if (p == chain + k - 1 && p > chain) { 4114 if (p == chain + k - 1 && p > chain) {
4115 p->p--; 4115 p->p--;
4116 } else { 4116 } else {
4117 *top = *p->p; 4117 *top = *p->p;
4118 /* Nope, don't do this in ext4. Must leave the tree intact */ 4118 /* Nope, don't do this in ext4. Must leave the tree intact */
4119 #if 0 4119 #if 0
4120 *p->p = 0; 4120 *p->p = 0;
4121 #endif 4121 #endif
4122 } 4122 }
4123 /* Writer: end */ 4123 /* Writer: end */
4124 4124
4125 while (partial > p) { 4125 while (partial > p) {
4126 brelse(partial->bh); 4126 brelse(partial->bh);
4127 partial--; 4127 partial--;
4128 } 4128 }
4129 no_top: 4129 no_top:
4130 return partial; 4130 return partial;
4131 } 4131 }
4132 4132
4133 /* 4133 /*
4134 * Zero a number of block pointers in either an inode or an indirect block. 4134 * Zero a number of block pointers in either an inode or an indirect block.
4135 * If we restart the transaction we must again get write access to the 4135 * If we restart the transaction we must again get write access to the
4136 * indirect block for further modification. 4136 * indirect block for further modification.
4137 * 4137 *
4138 * We release `count' blocks on disk, but (last - first) may be greater 4138 * We release `count' blocks on disk, but (last - first) may be greater
4139 * than `count' because there can be holes in there. 4139 * than `count' because there can be holes in there.
4140 * 4140 *
4141 * Return 0 on success, 1 on invalid block range 4141 * Return 0 on success, 1 on invalid block range
4142 * and < 0 on fatal error. 4142 * and < 0 on fatal error.
4143 */ 4143 */
4144 static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4144 static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4145 struct buffer_head *bh, 4145 struct buffer_head *bh,
4146 ext4_fsblk_t block_to_free, 4146 ext4_fsblk_t block_to_free,
4147 unsigned long count, __le32 *first, 4147 unsigned long count, __le32 *first,
4148 __le32 *last) 4148 __le32 *last)
4149 { 4149 {
4150 __le32 *p; 4150 __le32 *p;
4151 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4151 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4152 int err; 4152 int err;
4153 4153
4154 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4154 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4155 flags |= EXT4_FREE_BLOCKS_METADATA; 4155 flags |= EXT4_FREE_BLOCKS_METADATA;
4156 4156
4157 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4157 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4158 count)) { 4158 count)) {
4159 EXT4_ERROR_INODE(inode, "attempt to clear invalid " 4159 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4160 "blocks %llu len %lu", 4160 "blocks %llu len %lu",
4161 (unsigned long long) block_to_free, count); 4161 (unsigned long long) block_to_free, count);
4162 return 1; 4162 return 1;
4163 } 4163 }
4164 4164
4165 if (try_to_extend_transaction(handle, inode)) { 4165 if (try_to_extend_transaction(handle, inode)) {
4166 if (bh) { 4166 if (bh) {
4167 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4167 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4168 err = ext4_handle_dirty_metadata(handle, inode, bh); 4168 err = ext4_handle_dirty_metadata(handle, inode, bh);
4169 if (unlikely(err)) 4169 if (unlikely(err))
4170 goto out_err; 4170 goto out_err;
4171 } 4171 }
4172 err = ext4_mark_inode_dirty(handle, inode); 4172 err = ext4_mark_inode_dirty(handle, inode);
4173 if (unlikely(err)) 4173 if (unlikely(err))
4174 goto out_err; 4174 goto out_err;
4175 err = ext4_truncate_restart_trans(handle, inode, 4175 err = ext4_truncate_restart_trans(handle, inode,
4176 blocks_for_truncate(inode)); 4176 blocks_for_truncate(inode));
4177 if (unlikely(err)) 4177 if (unlikely(err))
4178 goto out_err; 4178 goto out_err;
4179 if (bh) { 4179 if (bh) {
4180 BUFFER_TRACE(bh, "retaking write access"); 4180 BUFFER_TRACE(bh, "retaking write access");
4181 err = ext4_journal_get_write_access(handle, bh); 4181 err = ext4_journal_get_write_access(handle, bh);
4182 if (unlikely(err)) 4182 if (unlikely(err))
4183 goto out_err; 4183 goto out_err;
4184 } 4184 }
4185 } 4185 }
4186 4186
4187 for (p = first; p < last; p++) 4187 for (p = first; p < last; p++)
4188 *p = 0; 4188 *p = 0;
4189 4189
4190 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); 4190 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4191 return 0; 4191 return 0;
4192 out_err: 4192 out_err:
4193 ext4_std_error(inode->i_sb, err); 4193 ext4_std_error(inode->i_sb, err);
4194 return err; 4194 return err;
4195 } 4195 }
4196 4196
4197 /** 4197 /**
4198 * ext4_free_data - free a list of data blocks 4198 * ext4_free_data - free a list of data blocks
4199 * @handle: handle for this transaction 4199 * @handle: handle for this transaction
4200 * @inode: inode we are dealing with 4200 * @inode: inode we are dealing with
4201 * @this_bh: indirect buffer_head which contains *@first and *@last 4201 * @this_bh: indirect buffer_head which contains *@first and *@last
4202 * @first: array of block numbers 4202 * @first: array of block numbers
4203 * @last: points immediately past the end of array 4203 * @last: points immediately past the end of array
4204 * 4204 *
4205 * We are freeing all blocks referred from that array (numbers are stored as 4205 * We are freeing all blocks referred from that array (numbers are stored as
4206 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4206 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4207 * 4207 *
4208 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4208 * We accumulate contiguous runs of blocks to free. Conveniently, if these
4209 * blocks are contiguous then releasing them at one time will only affect one 4209 * blocks are contiguous then releasing them at one time will only affect one
4210 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 4210 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4211 * actually use a lot of journal space. 4211 * actually use a lot of journal space.
4212 * 4212 *
4213 * @this_bh will be %NULL if @first and @last point into the inode's direct 4213 * @this_bh will be %NULL if @first and @last point into the inode's direct
4214 * block pointers. 4214 * block pointers.
4215 */ 4215 */
4216 static void ext4_free_data(handle_t *handle, struct inode *inode, 4216 static void ext4_free_data(handle_t *handle, struct inode *inode,
4217 struct buffer_head *this_bh, 4217 struct buffer_head *this_bh,
4218 __le32 *first, __le32 *last) 4218 __le32 *first, __le32 *last)
4219 { 4219 {
4220 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 4220 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
4221 unsigned long count = 0; /* Number of blocks in the run */ 4221 unsigned long count = 0; /* Number of blocks in the run */
4222 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 4222 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
4223 corresponding to 4223 corresponding to
4224 block_to_free */ 4224 block_to_free */
4225 ext4_fsblk_t nr; /* Current block # */ 4225 ext4_fsblk_t nr; /* Current block # */
4226 __le32 *p; /* Pointer into inode/ind 4226 __le32 *p; /* Pointer into inode/ind
4227 for current block */ 4227 for current block */
4228 int err = 0; 4228 int err = 0;
4229 4229
4230 if (this_bh) { /* For indirect block */ 4230 if (this_bh) { /* For indirect block */
4231 BUFFER_TRACE(this_bh, "get_write_access"); 4231 BUFFER_TRACE(this_bh, "get_write_access");
4232 err = ext4_journal_get_write_access(handle, this_bh); 4232 err = ext4_journal_get_write_access(handle, this_bh);
4233 /* Important: if we can't update the indirect pointers 4233 /* Important: if we can't update the indirect pointers
4234 * to the blocks, we can't free them. */ 4234 * to the blocks, we can't free them. */
4235 if (err) 4235 if (err)
4236 return; 4236 return;
4237 } 4237 }
4238 4238
4239 for (p = first; p < last; p++) { 4239 for (p = first; p < last; p++) {
4240 nr = le32_to_cpu(*p); 4240 nr = le32_to_cpu(*p);
4241 if (nr) { 4241 if (nr) {
4242 /* accumulate blocks to free if they're contiguous */ 4242 /* accumulate blocks to free if they're contiguous */
4243 if (count == 0) { 4243 if (count == 0) {
4244 block_to_free = nr; 4244 block_to_free = nr;
4245 block_to_free_p = p; 4245 block_to_free_p = p;
4246 count = 1; 4246 count = 1;
4247 } else if (nr == block_to_free + count) { 4247 } else if (nr == block_to_free + count) {
4248 count++; 4248 count++;
4249 } else { 4249 } else {
4250 err = ext4_clear_blocks(handle, inode, this_bh, 4250 err = ext4_clear_blocks(handle, inode, this_bh,
4251 block_to_free, count, 4251 block_to_free, count,
4252 block_to_free_p, p); 4252 block_to_free_p, p);
4253 if (err) 4253 if (err)
4254 break; 4254 break;
4255 block_to_free = nr; 4255 block_to_free = nr;
4256 block_to_free_p = p; 4256 block_to_free_p = p;
4257 count = 1; 4257 count = 1;
4258 } 4258 }
4259 } 4259 }
4260 } 4260 }
4261 4261
4262 if (!err && count > 0) 4262 if (!err && count > 0)
4263 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4263 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4264 count, block_to_free_p, p); 4264 count, block_to_free_p, p);
4265 if (err < 0) 4265 if (err < 0)
4266 /* fatal error */ 4266 /* fatal error */
4267 return; 4267 return;
4268 4268
4269 if (this_bh) { 4269 if (this_bh) {
4270 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4270 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4271 4271
4272 /* 4272 /*
4273 * The buffer head should have an attached journal head at this 4273 * The buffer head should have an attached journal head at this
4274 * point. However, if the data is corrupted and an indirect 4274 * point. However, if the data is corrupted and an indirect
4275 * block pointed to itself, it would have been detached when 4275 * block pointed to itself, it would have been detached when
4276 * the block was cleared. Check for this instead of OOPSing. 4276 * the block was cleared. Check for this instead of OOPSing.
4277 */ 4277 */
4278 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4278 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4279 ext4_handle_dirty_metadata(handle, inode, this_bh); 4279 ext4_handle_dirty_metadata(handle, inode, this_bh);
4280 else 4280 else
4281 EXT4_ERROR_INODE(inode, 4281 EXT4_ERROR_INODE(inode,
4282 "circular indirect block detected at " 4282 "circular indirect block detected at "
4283 "block %llu", 4283 "block %llu",
4284 (unsigned long long) this_bh->b_blocknr); 4284 (unsigned long long) this_bh->b_blocknr);
4285 } 4285 }
4286 } 4286 }
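To make the run accumulation in ext4_free_data() concrete, here is a tiny stand-alone sketch over a hypothetical block-pointer array; note that zero entries (holes) are merely skipped and do not by themselves end a run.

#include <stdio.h>

/* Illustrative only: group a hypothetical block-pointer array into contiguous
 * runs, mirroring the accumulation loop in ext4_free_data(). */
int main(void)
{
        unsigned long blocks[] = { 100, 101, 102, 0, 200, 300, 301 };
        unsigned long start = 0, count = 0;
        unsigned i;

        for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
                unsigned long nr = blocks[i];

                if (!nr)
                        continue;               /* a hole */
                if (count == 0) {
                        start = nr;
                        count = 1;
                } else if (nr == start + count) {
                        count++;
                } else {
                        printf("free run %lu..%lu\n", start, start + count - 1);
                        start = nr;
                        count = 1;
                }
        }
        if (count)
                printf("free run %lu..%lu\n", start, start + count - 1);
        return 0;
}

Run against this array it reports the runs 100..102, 200..200 and 300..301, which is exactly how the real loop batches calls to ext4_clear_blocks().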
4287 4287
4288 /** 4288 /**
4289 * ext4_free_branches - free an array of branches 4289 * ext4_free_branches - free an array of branches
4290 * @handle: JBD handle for this transaction 4290 * @handle: JBD handle for this transaction
4291 * @inode: inode we are dealing with 4291 * @inode: inode we are dealing with
4292 * @parent_bh: the buffer_head which contains *@first and *@last 4292 * @parent_bh: the buffer_head which contains *@first and *@last
4293 * @first: array of block numbers 4293 * @first: array of block numbers
4294 * @last: pointer immediately past the end of array 4294 * @last: pointer immediately past the end of array
4295 * @depth: depth of the branches to free 4295 * @depth: depth of the branches to free
4296 * 4296 *
4297 * We are freeing all blocks referred from these branches (numbers are 4297 * We are freeing all blocks referred from these branches (numbers are
4298 * stored as little-endian 32-bit) and updating @inode->i_blocks 4298 * stored as little-endian 32-bit) and updating @inode->i_blocks
4299 * appropriately. 4299 * appropriately.
4300 */ 4300 */
4301 static void ext4_free_branches(handle_t *handle, struct inode *inode, 4301 static void ext4_free_branches(handle_t *handle, struct inode *inode,
4302 struct buffer_head *parent_bh, 4302 struct buffer_head *parent_bh,
4303 __le32 *first, __le32 *last, int depth) 4303 __le32 *first, __le32 *last, int depth)
4304 { 4304 {
4305 ext4_fsblk_t nr; 4305 ext4_fsblk_t nr;
4306 __le32 *p; 4306 __le32 *p;
4307 4307
4308 if (ext4_handle_is_aborted(handle)) 4308 if (ext4_handle_is_aborted(handle))
4309 return; 4309 return;
4310 4310
4311 if (depth--) { 4311 if (depth--) {
4312 struct buffer_head *bh; 4312 struct buffer_head *bh;
4313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4314 p = last; 4314 p = last;
4315 while (--p >= first) { 4315 while (--p >= first) {
4316 nr = le32_to_cpu(*p); 4316 nr = le32_to_cpu(*p);
4317 if (!nr) 4317 if (!nr)
4318 continue; /* A hole */ 4318 continue; /* A hole */
4319 4319
4320 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4320 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4321 nr, 1)) { 4321 nr, 1)) {
4322 EXT4_ERROR_INODE(inode, 4322 EXT4_ERROR_INODE(inode,
4323 "invalid indirect mapped " 4323 "invalid indirect mapped "
4324 "block %lu (level %d)", 4324 "block %lu (level %d)",
4325 (unsigned long) nr, depth); 4325 (unsigned long) nr, depth);
4326 break; 4326 break;
4327 } 4327 }
4328 4328
4329 /* Go read the buffer for the next level down */ 4329 /* Go read the buffer for the next level down */
4330 bh = sb_bread(inode->i_sb, nr); 4330 bh = sb_bread(inode->i_sb, nr);
4331 4331
4332 /* 4332 /*
4333 * A read failure? Report error and clear slot 4333 * A read failure? Report error and clear slot
4334 * (should be rare). 4334 * (should be rare).
4335 */ 4335 */
4336 if (!bh) { 4336 if (!bh) {
4337 EXT4_ERROR_INODE_BLOCK(inode, nr, 4337 EXT4_ERROR_INODE_BLOCK(inode, nr,
4338 "Read failure"); 4338 "Read failure");
4339 continue; 4339 continue;
4340 } 4340 }
4341 4341
4342 /* This zaps the entire block. Bottom up. */ 4342 /* This zaps the entire block. Bottom up. */
4343 BUFFER_TRACE(bh, "free child branches"); 4343 BUFFER_TRACE(bh, "free child branches");
4344 ext4_free_branches(handle, inode, bh, 4344 ext4_free_branches(handle, inode, bh,
4345 (__le32 *) bh->b_data, 4345 (__le32 *) bh->b_data,
4346 (__le32 *) bh->b_data + addr_per_block, 4346 (__le32 *) bh->b_data + addr_per_block,
4347 depth); 4347 depth);
4348 brelse(bh); 4348 brelse(bh);
4349 4349
4350 /* 4350 /*
4351 * Everything below this pointer has been 4351 * Everything below this pointer has been
4352 * released. Now let this top-of-subtree go. 4352 * released. Now let this top-of-subtree go.
4353 * 4353 *
4354 * We want the freeing of this indirect block to be 4354 * We want the freeing of this indirect block to be
4355 * atomic in the journal with the updating of the 4355 * atomic in the journal with the updating of the
4356 * bitmap block which owns it. So make some room in 4356 * bitmap block which owns it. So make some room in
4357 * the journal. 4357 * the journal.
4358 * 4358 *
4359 * We zero the parent pointer *after* freeing its 4359 * We zero the parent pointer *after* freeing its
4360 * pointee in the bitmaps, so if extend_transaction() 4360 * pointee in the bitmaps, so if extend_transaction()
4361 * for some reason fails to put the bitmap changes and 4361 * for some reason fails to put the bitmap changes and
4362 * the release into the same transaction, recovery 4362 * the release into the same transaction, recovery
4363 * will merely complain about releasing a free block, 4363 * will merely complain about releasing a free block,
4364 * rather than leaking blocks. 4364 * rather than leaking blocks.
4365 */ 4365 */
4366 if (ext4_handle_is_aborted(handle)) 4366 if (ext4_handle_is_aborted(handle))
4367 return; 4367 return;
4368 if (try_to_extend_transaction(handle, inode)) { 4368 if (try_to_extend_transaction(handle, inode)) {
4369 ext4_mark_inode_dirty(handle, inode); 4369 ext4_mark_inode_dirty(handle, inode);
4370 ext4_truncate_restart_trans(handle, inode, 4370 ext4_truncate_restart_trans(handle, inode,
4371 blocks_for_truncate(inode)); 4371 blocks_for_truncate(inode));
4372 } 4372 }
4373 4373
4374 /* 4374 /*
4375 * The forget flag here is critical because if 4375 * The forget flag here is critical because if
4376 * we are journaling (and not doing data 4376 * we are journaling (and not doing data
4377 * journaling), we have to make sure a revoke 4377 * journaling), we have to make sure a revoke
4378 * record is written to prevent the journal 4378 * record is written to prevent the journal
4379 * replay from overwriting the (former) 4379 * replay from overwriting the (former)
4380 * indirect block if it gets reallocated as a 4380 * indirect block if it gets reallocated as a
4381 * data block. This must happen in the same 4381 * data block. This must happen in the same
4382 * transaction where the data blocks are 4382 * transaction where the data blocks are
4383 * actually freed. 4383 * actually freed.
4384 */ 4384 */
4385 ext4_free_blocks(handle, inode, NULL, nr, 1, 4385 ext4_free_blocks(handle, inode, NULL, nr, 1,
4386 EXT4_FREE_BLOCKS_METADATA| 4386 EXT4_FREE_BLOCKS_METADATA|
4387 EXT4_FREE_BLOCKS_FORGET); 4387 EXT4_FREE_BLOCKS_FORGET);
4388 4388
4389 if (parent_bh) { 4389 if (parent_bh) {
4390 /* 4390 /*
4391 * The block which we have just freed is 4391 * The block which we have just freed is
4392 * pointed to by an indirect block: journal it 4392 * pointed to by an indirect block: journal it
4393 */ 4393 */
4394 BUFFER_TRACE(parent_bh, "get_write_access"); 4394 BUFFER_TRACE(parent_bh, "get_write_access");
4395 if (!ext4_journal_get_write_access(handle, 4395 if (!ext4_journal_get_write_access(handle,
4396 parent_bh)){ 4396 parent_bh)){
4397 *p = 0; 4397 *p = 0;
4398 BUFFER_TRACE(parent_bh, 4398 BUFFER_TRACE(parent_bh,
4399 "call ext4_handle_dirty_metadata"); 4399 "call ext4_handle_dirty_metadata");
4400 ext4_handle_dirty_metadata(handle, 4400 ext4_handle_dirty_metadata(handle,
4401 inode, 4401 inode,
4402 parent_bh); 4402 parent_bh);
4403 } 4403 }
4404 } 4404 }
4405 } 4405 }
4406 } else { 4406 } else {
4407 /* We have reached the bottom of the tree. */ 4407 /* We have reached the bottom of the tree. */
4408 BUFFER_TRACE(parent_bh, "free data blocks"); 4408 BUFFER_TRACE(parent_bh, "free data blocks");
4409 ext4_free_data(handle, inode, parent_bh, first, last); 4409 ext4_free_data(handle, inode, parent_bh, first, last);
4410 } 4410 }
4411 } 4411 }
4412 4412
4413 int ext4_can_truncate(struct inode *inode) 4413 int ext4_can_truncate(struct inode *inode)
4414 { 4414 {
4415 if (S_ISREG(inode->i_mode)) 4415 if (S_ISREG(inode->i_mode))
4416 return 1; 4416 return 1;
4417 if (S_ISDIR(inode->i_mode)) 4417 if (S_ISDIR(inode->i_mode))
4418 return 1; 4418 return 1;
4419 if (S_ISLNK(inode->i_mode)) 4419 if (S_ISLNK(inode->i_mode))
4420 return !ext4_inode_is_fast_symlink(inode); 4420 return !ext4_inode_is_fast_symlink(inode);
4421 return 0; 4421 return 0;
4422 } 4422 }
4423 4423
4424 /* 4424 /*
4425 * ext4_punch_hole: punches a hole in a file by releasing the blocks 4425 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4426 * associated with the given offset and length 4426 * associated with the given offset and length
4427 * 4427 *
4428 * @inode: File inode 4428 * @inode: File inode
4429 * @offset: The offset where the hole will begin 4429 * @offset: The offset where the hole will begin
4430 * @len: The length of the hole 4430 * @len: The length of the hole
4431 * 4431 *
4432 * Returns: 0 on success or negative on failure 4432 * Returns: 0 on success or negative on failure
4433 */ 4433 */
4434 4434
4435 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 4435 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4436 { 4436 {
4437 struct inode *inode = file->f_path.dentry->d_inode; 4437 struct inode *inode = file->f_path.dentry->d_inode;
4438 if (!S_ISREG(inode->i_mode)) 4438 if (!S_ISREG(inode->i_mode))
4439 return -ENOTSUPP; 4439 return -ENOTSUPP;
4440 4440
4441 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4441 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4442 /* TODO: Add support for non-extent hole punching */ 4442 /* TODO: Add support for non-extent hole punching */
4443 return -ENOTSUPP; 4443 return -ENOTSUPP;
4444 } 4444 }
4445 4445
4446 return ext4_ext_punch_hole(file, offset, length); 4446 return ext4_ext_punch_hole(file, offset, length);
4447 } 4447 }
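For reference, a hole-punch request normally reaches this path from user space via fallocate(2); a minimal sketch follows (the file path and offsets are hypothetical).

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

/* Illustrative only: punch a 1 MiB hole at offset 4 MiB in an existing file. */
int punch_example(const char *path)
{
        int fd = open(path, O_RDWR);
        int ret;

        if (fd < 0)
                return -1;
        /* FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE. */
        ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                        4 << 20, 1 << 20);
        close(fd);
        return ret;
}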
4448 4448
4449 /* 4449 /*
4450 * ext4_truncate() 4450 * ext4_truncate()
4451 * 4451 *
4452 * We block out ext4_get_block() block instantiations across the entire 4452 * We block out ext4_get_block() block instantiations across the entire
4453 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 4453 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4454 * simultaneously on behalf of the same inode. 4454 * simultaneously on behalf of the same inode.
4455 * 4455 *
4456 * As we work through the truncate and commit bits of it to the journal there 4456 * As we work through the truncate and commit bits of it to the journal there
4457 * is one core, guiding principle: the file's tree must always be consistent on 4457 * is one core, guiding principle: the file's tree must always be consistent on
4458 * disk. We must be able to restart the truncate after a crash. 4458 * disk. We must be able to restart the truncate after a crash.
4459 * 4459 *
4460 * The file's tree may be transiently inconsistent in memory (although it 4460 * The file's tree may be transiently inconsistent in memory (although it
4461 * probably isn't), but whenever we close off and commit a journal transaction, 4461 * probably isn't), but whenever we close off and commit a journal transaction,
4462 * the contents of (the filesystem + the journal) must be consistent and 4462 * the contents of (the filesystem + the journal) must be consistent and
4463 * restartable. It's pretty simple, really: bottom up, right to left (although 4463 * restartable. It's pretty simple, really: bottom up, right to left (although
4464 * left-to-right works OK too). 4464 * left-to-right works OK too).
4465 * 4465 *
4466 * Note that at recovery time, journal replay occurs *before* the restart of 4466 * Note that at recovery time, journal replay occurs *before* the restart of
4467 * truncate against the orphan inode list. 4467 * truncate against the orphan inode list.
4468 * 4468 *
4469 * The committed inode has the new, desired i_size (which is the same as 4469 * The committed inode has the new, desired i_size (which is the same as
4470 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 4470 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
4471 * that this inode's truncate did not complete and it will again call 4471 * that this inode's truncate did not complete and it will again call
4472 * ext4_truncate() to have another go. So there will be instantiated blocks 4472 * ext4_truncate() to have another go. So there will be instantiated blocks
4473 * to the right of the truncation point in a crashed ext4 filesystem. But 4473 * to the right of the truncation point in a crashed ext4 filesystem. But
4474 * that's fine - as long as they are linked from the inode, the post-crash 4474 * that's fine - as long as they are linked from the inode, the post-crash
4475 * ext4_truncate() run will find them and release them. 4475 * ext4_truncate() run will find them and release them.
4476 */ 4476 */
4477 void ext4_truncate(struct inode *inode) 4477 void ext4_truncate(struct inode *inode)
4478 { 4478 {
4479 handle_t *handle; 4479 handle_t *handle;
4480 struct ext4_inode_info *ei = EXT4_I(inode); 4480 struct ext4_inode_info *ei = EXT4_I(inode);
4481 __le32 *i_data = ei->i_data; 4481 __le32 *i_data = ei->i_data;
4482 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4482 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4483 struct address_space *mapping = inode->i_mapping; 4483 struct address_space *mapping = inode->i_mapping;
4484 ext4_lblk_t offsets[4]; 4484 ext4_lblk_t offsets[4];
4485 Indirect chain[4]; 4485 Indirect chain[4];
4486 Indirect *partial; 4486 Indirect *partial;
4487 __le32 nr = 0; 4487 __le32 nr = 0;
4488 int n = 0; 4488 int n = 0;
4489 ext4_lblk_t last_block, max_block; 4489 ext4_lblk_t last_block, max_block;
4490 unsigned blocksize = inode->i_sb->s_blocksize; 4490 unsigned blocksize = inode->i_sb->s_blocksize;
4491 4491
4492 trace_ext4_truncate_enter(inode); 4492 trace_ext4_truncate_enter(inode);
4493 4493
4494 if (!ext4_can_truncate(inode)) 4494 if (!ext4_can_truncate(inode))
4495 return; 4495 return;
4496 4496
4497 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4497 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4498 4498
4499 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4499 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4500 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4500 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4501 4501
4502 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4502 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4503 ext4_ext_truncate(inode); 4503 ext4_ext_truncate(inode);
4504 trace_ext4_truncate_exit(inode); 4504 trace_ext4_truncate_exit(inode);
4505 return; 4505 return;
4506 } 4506 }
4507 4507
4508 handle = start_transaction(inode); 4508 handle = start_transaction(inode);
4509 if (IS_ERR(handle)) 4509 if (IS_ERR(handle))
4510 return; /* AKPM: return what? */ 4510 return; /* AKPM: return what? */
4511 4511
4512 last_block = (inode->i_size + blocksize-1) 4512 last_block = (inode->i_size + blocksize-1)
4513 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4513 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4514 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 4514 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4515 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4515 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4516 4516
4517 if (inode->i_size & (blocksize - 1)) 4517 if (inode->i_size & (blocksize - 1))
4518 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4518 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4519 goto out_stop; 4519 goto out_stop;
4520 4520
4521 if (last_block != max_block) { 4521 if (last_block != max_block) {
4522 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4522 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4523 if (n == 0) 4523 if (n == 0)
4524 goto out_stop; /* error */ 4524 goto out_stop; /* error */
4525 } 4525 }
4526 4526
4527 /* 4527 /*
4528 * OK. This truncate is going to happen. We add the inode to the 4528 * OK. This truncate is going to happen. We add the inode to the
4529 * orphan list, so that if this truncate spans multiple transactions, 4529 * orphan list, so that if this truncate spans multiple transactions,
4530 * and we crash, we will resume the truncate when the filesystem 4530 * and we crash, we will resume the truncate when the filesystem
4531 * recovers. It also marks the inode dirty, to catch the new size. 4531 * recovers. It also marks the inode dirty, to catch the new size.
4532 * 4532 *
4533 * Implication: the file must always be in a sane, consistent 4533 * Implication: the file must always be in a sane, consistent
4534 * truncatable state while each transaction commits. 4534 * truncatable state while each transaction commits.
4535 */ 4535 */
4536 if (ext4_orphan_add(handle, inode)) 4536 if (ext4_orphan_add(handle, inode))
4537 goto out_stop; 4537 goto out_stop;
4538 4538
4539 /* 4539 /*
4540 * From here we block out all ext4_get_block() callers who want to 4540 * From here we block out all ext4_get_block() callers who want to
4541 * modify the block allocation tree. 4541 * modify the block allocation tree.
4542 */ 4542 */
4543 down_write(&ei->i_data_sem); 4543 down_write(&ei->i_data_sem);
4544 4544
4545 ext4_discard_preallocations(inode); 4545 ext4_discard_preallocations(inode);
4546 4546
4547 /* 4547 /*
4548 * The orphan list entry will now protect us from any crash which 4548 * The orphan list entry will now protect us from any crash which
4549 * occurs before the truncate completes, so it is now safe to propagate 4549 * occurs before the truncate completes, so it is now safe to propagate
4550 * the new, shorter inode size (held for now in i_size) into the 4550 * the new, shorter inode size (held for now in i_size) into the
4551 * on-disk inode. We do this via i_disksize, which is the value which 4551 * on-disk inode. We do this via i_disksize, which is the value which
4552 * ext4 *really* writes onto the disk inode. 4552 * ext4 *really* writes onto the disk inode.
4553 */ 4553 */
4554 ei->i_disksize = inode->i_size; 4554 ei->i_disksize = inode->i_size;
4555 4555
4556 if (last_block == max_block) { 4556 if (last_block == max_block) {
4557 /* 4557 /*
4558 * It is unnecessary to free any data blocks if last_block is 4558 * It is unnecessary to free any data blocks if last_block is
4559 * equal to the indirect block limit. 4559 * equal to the indirect block limit.
4560 */ 4560 */
4561 goto out_unlock; 4561 goto out_unlock;
4562 } else if (n == 1) { /* direct blocks */ 4562 } else if (n == 1) { /* direct blocks */
4563 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4563 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4564 i_data + EXT4_NDIR_BLOCKS); 4564 i_data + EXT4_NDIR_BLOCKS);
4565 goto do_indirects; 4565 goto do_indirects;
4566 } 4566 }
4567 4567
4568 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 4568 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4569 /* Kill the top of shared branch (not detached) */ 4569 /* Kill the top of shared branch (not detached) */
4570 if (nr) { 4570 if (nr) {
4571 if (partial == chain) { 4571 if (partial == chain) {
4572 /* Shared branch grows from the inode */ 4572 /* Shared branch grows from the inode */
4573 ext4_free_branches(handle, inode, NULL, 4573 ext4_free_branches(handle, inode, NULL,
4574 &nr, &nr+1, (chain+n-1) - partial); 4574 &nr, &nr+1, (chain+n-1) - partial);
4575 *partial->p = 0; 4575 *partial->p = 0;
4576 /* 4576 /*
4577 * We mark the inode dirty prior to restart, 4577 * We mark the inode dirty prior to restart,
4578 * and prior to stop. No need for it here. 4578 * and prior to stop. No need for it here.
4579 */ 4579 */
4580 } else { 4580 } else {
4581 /* Shared branch grows from an indirect block */ 4581 /* Shared branch grows from an indirect block */
4582 BUFFER_TRACE(partial->bh, "get_write_access"); 4582 BUFFER_TRACE(partial->bh, "get_write_access");
4583 ext4_free_branches(handle, inode, partial->bh, 4583 ext4_free_branches(handle, inode, partial->bh,
4584 partial->p, 4584 partial->p,
4585 partial->p+1, (chain+n-1) - partial); 4585 partial->p+1, (chain+n-1) - partial);
4586 } 4586 }
4587 } 4587 }
4588 /* Clear the ends of indirect blocks on the shared branch */ 4588 /* Clear the ends of indirect blocks on the shared branch */
4589 while (partial > chain) { 4589 while (partial > chain) {
4590 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 4590 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4591 (__le32*)partial->bh->b_data+addr_per_block, 4591 (__le32*)partial->bh->b_data+addr_per_block,
4592 (chain+n-1) - partial); 4592 (chain+n-1) - partial);
4593 BUFFER_TRACE(partial->bh, "call brelse"); 4593 BUFFER_TRACE(partial->bh, "call brelse");
4594 brelse(partial->bh); 4594 brelse(partial->bh);
4595 partial--; 4595 partial--;
4596 } 4596 }
4597 do_indirects: 4597 do_indirects:
4598 /* Kill the remaining (whole) subtrees */ 4598 /* Kill the remaining (whole) subtrees */
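/*
 * Note: the case labels below fall through deliberately. Truncation that
 * starts in the direct-block range must also free the single, double and
 * triple indirect trees, while truncation that starts at a deeper level
 * frees only the trees at that depth and below.
 */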
4599 switch (offsets[0]) { 4599 switch (offsets[0]) {
4600 default: 4600 default:
4601 nr = i_data[EXT4_IND_BLOCK]; 4601 nr = i_data[EXT4_IND_BLOCK];
4602 if (nr) { 4602 if (nr) {
4603 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 4603 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4604 i_data[EXT4_IND_BLOCK] = 0; 4604 i_data[EXT4_IND_BLOCK] = 0;
4605 } 4605 }
4606 case EXT4_IND_BLOCK: 4606 case EXT4_IND_BLOCK:
4607 nr = i_data[EXT4_DIND_BLOCK]; 4607 nr = i_data[EXT4_DIND_BLOCK];
4608 if (nr) { 4608 if (nr) {
4609 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 4609 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4610 i_data[EXT4_DIND_BLOCK] = 0; 4610 i_data[EXT4_DIND_BLOCK] = 0;
4611 } 4611 }
4612 case EXT4_DIND_BLOCK: 4612 case EXT4_DIND_BLOCK:
4613 nr = i_data[EXT4_TIND_BLOCK]; 4613 nr = i_data[EXT4_TIND_BLOCK];
4614 if (nr) { 4614 if (nr) {
4615 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 4615 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4616 i_data[EXT4_TIND_BLOCK] = 0; 4616 i_data[EXT4_TIND_BLOCK] = 0;
4617 } 4617 }
4618 case EXT4_TIND_BLOCK: 4618 case EXT4_TIND_BLOCK:
4619 ; 4619 ;
4620 } 4620 }
4621 4621
4622 out_unlock: 4622 out_unlock:
4623 up_write(&ei->i_data_sem); 4623 up_write(&ei->i_data_sem);
4624 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4624 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4625 ext4_mark_inode_dirty(handle, inode); 4625 ext4_mark_inode_dirty(handle, inode);
4626 4626
4627 /* 4627 /*
4628 * In a multi-transaction truncate, we only make the final transaction 4628 * In a multi-transaction truncate, we only make the final transaction
4629 * synchronous 4629 * synchronous
4630 */ 4630 */
4631 if (IS_SYNC(inode)) 4631 if (IS_SYNC(inode))
4632 ext4_handle_sync(handle); 4632 ext4_handle_sync(handle);
4633 out_stop: 4633 out_stop:
4634 /* 4634 /*
4635 * If this was a simple ftruncate(), and the file will remain alive 4635 * If this was a simple ftruncate(), and the file will remain alive
4636 * then we need to clear up the orphan record which we created above. 4636 * then we need to clear up the orphan record which we created above.
4637 * However, if this was a real unlink then we were called by 4637 * However, if this was a real unlink then we were called by
4638 * ext4_delete_inode(), and we allow that function to clean up the 4638 * ext4_delete_inode(), and we allow that function to clean up the
4639 * orphan info for us. 4639 * orphan info for us.
4640 */ 4640 */
4641 if (inode->i_nlink) 4641 if (inode->i_nlink)
4642 ext4_orphan_del(handle, inode); 4642 ext4_orphan_del(handle, inode);
4643 4643
4644 ext4_journal_stop(handle); 4644 ext4_journal_stop(handle);
4645 trace_ext4_truncate_exit(inode); 4645 trace_ext4_truncate_exit(inode);
4646 } 4646 }
4647 4647
4648 /* 4648 /*
4649 * ext4_get_inode_loc returns with an extra refcount against the inode's 4649 * ext4_get_inode_loc returns with an extra refcount against the inode's
4650 * underlying buffer_head on success. If 'in_mem' is true, we have all 4650 * underlying buffer_head on success. If 'in_mem' is true, we have all
4651 * data in memory that is needed to recreate the on-disk version of this 4651 * data in memory that is needed to recreate the on-disk version of this
4652 * inode. 4652 * inode.
4653 */ 4653 */
4654 static int __ext4_get_inode_loc(struct inode *inode, 4654 static int __ext4_get_inode_loc(struct inode *inode,
4655 struct ext4_iloc *iloc, int in_mem) 4655 struct ext4_iloc *iloc, int in_mem)
4656 { 4656 {
4657 struct ext4_group_desc *gdp; 4657 struct ext4_group_desc *gdp;
4658 struct buffer_head *bh; 4658 struct buffer_head *bh;
4659 struct super_block *sb = inode->i_sb; 4659 struct super_block *sb = inode->i_sb;
4660 ext4_fsblk_t block; 4660 ext4_fsblk_t block;
4661 int inodes_per_block, inode_offset; 4661 int inodes_per_block, inode_offset;
4662 4662
4663 iloc->bh = NULL; 4663 iloc->bh = NULL;
4664 if (!ext4_valid_inum(sb, inode->i_ino)) 4664 if (!ext4_valid_inum(sb, inode->i_ino))
4665 return -EIO; 4665 return -EIO;
4666 4666
4667 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 4667 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
4668 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4668 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4669 if (!gdp) 4669 if (!gdp)
4670 return -EIO; 4670 return -EIO;
4671 4671
4672 /* 4672 /*
4673 * Figure out the offset within the block group inode table 4673 * Figure out the offset within the block group inode table
4674 */ 4674 */
4675 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4675 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4676 inode_offset = ((inode->i_ino - 1) % 4676 inode_offset = ((inode->i_ino - 1) %
4677 EXT4_INODES_PER_GROUP(sb)); 4677 EXT4_INODES_PER_GROUP(sb));
4678 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4678 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
4679 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4679 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4680 4680
4681 bh = sb_getblk(sb, block); 4681 bh = sb_getblk(sb, block);
4682 if (!bh) { 4682 if (!bh) {
4683 EXT4_ERROR_INODE_BLOCK(inode, block, 4683 EXT4_ERROR_INODE_BLOCK(inode, block,
4684 "unable to read itable block"); 4684 "unable to read itable block");
4685 return -EIO; 4685 return -EIO;
4686 } 4686 }
4687 if (!buffer_uptodate(bh)) { 4687 if (!buffer_uptodate(bh)) {
4688 lock_buffer(bh); 4688 lock_buffer(bh);
4689 4689
4690 /* 4690 /*
4691 * If the buffer has the write error flag, we have failed 4691 * If the buffer has the write error flag, we have failed
4692 * to write out another inode in the same block. In this 4692 * to write out another inode in the same block. In this
4693 * case, we don't have to read the block because we may 4693 * case, we don't have to read the block because we may
4694 * read the old inode data successfully. 4694 * read the old inode data successfully.
4695 */ 4695 */
4696 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4696 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4697 set_buffer_uptodate(bh); 4697 set_buffer_uptodate(bh);
4698 4698
4699 if (buffer_uptodate(bh)) { 4699 if (buffer_uptodate(bh)) {
4700 /* someone brought it uptodate while we waited */ 4700 /* someone brought it uptodate while we waited */
4701 unlock_buffer(bh); 4701 unlock_buffer(bh);
4702 goto has_buffer; 4702 goto has_buffer;
4703 } 4703 }
4704 4704
4705 /* 4705 /*
4706 * If we have all information of the inode in memory and this 4706 * If we have all information of the inode in memory and this
4707 * is the only valid inode in the block, we need not read the 4707 * is the only valid inode in the block, we need not read the
4708 * block. 4708 * block.
4709 */ 4709 */
4710 if (in_mem) { 4710 if (in_mem) {
4711 struct buffer_head *bitmap_bh; 4711 struct buffer_head *bitmap_bh;
4712 int i, start; 4712 int i, start;
4713 4713
4714 start = inode_offset & ~(inodes_per_block - 1); 4714 start = inode_offset & ~(inodes_per_block - 1);
4715 4715
4716 /* Is the inode bitmap in cache? */ 4716 /* Is the inode bitmap in cache? */
4717 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4717 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4718 if (!bitmap_bh) 4718 if (!bitmap_bh)
4719 goto make_io; 4719 goto make_io;
4720 4720
4721 /* 4721 /*
4722 * If the inode bitmap isn't in cache then the 4722 * If the inode bitmap isn't in cache then the
4723 * optimisation may end up performing two reads instead 4723 * optimisation may end up performing two reads instead
4724 * of one, so skip it. 4724 * of one, so skip it.
4725 */ 4725 */
4726 if (!buffer_uptodate(bitmap_bh)) { 4726 if (!buffer_uptodate(bitmap_bh)) {
4727 brelse(bitmap_bh); 4727 brelse(bitmap_bh);
4728 goto make_io; 4728 goto make_io;
4729 } 4729 }
4730 for (i = start; i < start + inodes_per_block; i++) { 4730 for (i = start; i < start + inodes_per_block; i++) {
4731 if (i == inode_offset) 4731 if (i == inode_offset)
4732 continue; 4732 continue;
4733 if (ext4_test_bit(i, bitmap_bh->b_data)) 4733 if (ext4_test_bit(i, bitmap_bh->b_data))
4734 break; 4734 break;
4735 } 4735 }
4736 brelse(bitmap_bh); 4736 brelse(bitmap_bh);
4737 if (i == start + inodes_per_block) { 4737 if (i == start + inodes_per_block) {
4738 /* all other inodes are free, so skip I/O */ 4738 /* all other inodes are free, so skip I/O */
4739 memset(bh->b_data, 0, bh->b_size); 4739 memset(bh->b_data, 0, bh->b_size);
4740 set_buffer_uptodate(bh); 4740 set_buffer_uptodate(bh);
4741 unlock_buffer(bh); 4741 unlock_buffer(bh);
4742 goto has_buffer; 4742 goto has_buffer;
4743 } 4743 }
4744 } 4744 }
4745 4745
4746 make_io: 4746 make_io:
4747 /* 4747 /*
4748 * If we need to do any I/O, try to pre-readahead extra 4748 * If we need to do any I/O, try to pre-readahead extra
4749 * blocks from the inode table. 4749 * blocks from the inode table.
4750 */ 4750 */
4751 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4751 if (EXT4_SB(sb)->s_inode_readahead_blks) {
4752 ext4_fsblk_t b, end, table; 4752 ext4_fsblk_t b, end, table;
4753 unsigned num; 4753 unsigned num;
4754 4754
4755 table = ext4_inode_table(sb, gdp); 4755 table = ext4_inode_table(sb, gdp);
4756 /* s_inode_readahead_blks is always a power of 2 */ 4756 /* s_inode_readahead_blks is always a power of 2 */
4757 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4757 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4758 if (table > b) 4758 if (table > b)
4759 b = table; 4759 b = table;
4760 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4760 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
4761 num = EXT4_INODES_PER_GROUP(sb); 4761 num = EXT4_INODES_PER_GROUP(sb);
4762 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4762 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4763 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4763 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
4764 num -= ext4_itable_unused_count(sb, gdp); 4764 num -= ext4_itable_unused_count(sb, gdp);
4765 table += num / inodes_per_block; 4765 table += num / inodes_per_block;
4766 if (end > table) 4766 if (end > table)
4767 end = table; 4767 end = table;
4768 while (b <= end) 4768 while (b <= end)
4769 sb_breadahead(sb, b++); 4769 sb_breadahead(sb, b++);
4770 } 4770 }
4771 4771
4772 /* 4772 /*
4773 * There are other valid inodes in the buffer, this inode 4773 * There are other valid inodes in the buffer, this inode
4774 * has in-inode xattrs, or we don't have this inode in memory. 4774 * has in-inode xattrs, or we don't have this inode in memory.
4775 * Read the block from disk. 4775 * Read the block from disk.
4776 */ 4776 */
4777 trace_ext4_load_inode(inode); 4777 trace_ext4_load_inode(inode);
4778 get_bh(bh); 4778 get_bh(bh);
4779 bh->b_end_io = end_buffer_read_sync; 4779 bh->b_end_io = end_buffer_read_sync;
4780 submit_bh(READ_META, bh); 4780 submit_bh(READ_META, bh);
4781 wait_on_buffer(bh); 4781 wait_on_buffer(bh);
4782 if (!buffer_uptodate(bh)) { 4782 if (!buffer_uptodate(bh)) {
4783 EXT4_ERROR_INODE_BLOCK(inode, block, 4783 EXT4_ERROR_INODE_BLOCK(inode, block,
4784 "unable to read itable block"); 4784 "unable to read itable block");
4785 brelse(bh); 4785 brelse(bh);
4786 return -EIO; 4786 return -EIO;
4787 } 4787 }
4788 } 4788 }
4789 has_buffer: 4789 has_buffer:
4790 iloc->bh = bh; 4790 iloc->bh = bh;
4791 return 0; 4791 return 0;
4792 } 4792 }
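As a sanity check of the inode-location arithmetic in __ext4_get_inode_loc(), here is a stand-alone sketch with hypothetical filesystem geometry (32768 inodes per group, 256-byte inodes, 4096-byte blocks; none of these numbers come from a real superblock).

#include <stdio.h>

/* Illustrative only: locate inode #40000 with the hypothetical geometry above. */
int main(void)
{
        unsigned long ino = 40000;
        unsigned inodes_per_group = 32768;
        unsigned inode_size = 256;
        unsigned blocksize = 4096;
        unsigned inodes_per_block = blocksize / inode_size;               /* 16 */

        unsigned long group        = (ino - 1) / inodes_per_group;        /* 1 */
        unsigned long inode_offset = (ino - 1) % inodes_per_group;        /* 7231 */
        unsigned long table_block  = inode_offset / inodes_per_block;     /* block 451 (0-based) of group 1's inode table */
        unsigned long byte_offset  = (inode_offset % inodes_per_block) * inode_size; /* 3840 */

        printf("group %lu, table block %lu, byte offset %lu\n",
               group, table_block, byte_offset);
        return 0;
}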
4793 4793
4794 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4794 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4795 { 4795 {
4796 /* We have all inode data except xattrs in memory here. */ 4796 /* We have all inode data except xattrs in memory here. */
4797 return __ext4_get_inode_loc(inode, iloc, 4797 return __ext4_get_inode_loc(inode, iloc,
4798 !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); 4798 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4799 } 4799 }
4800 4800
4801 void ext4_set_inode_flags(struct inode *inode) 4801 void ext4_set_inode_flags(struct inode *inode)
4802 { 4802 {
4803 unsigned int flags = EXT4_I(inode)->i_flags; 4803 unsigned int flags = EXT4_I(inode)->i_flags;
4804 4804
4805 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4805 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
4806 if (flags & EXT4_SYNC_FL) 4806 if (flags & EXT4_SYNC_FL)
4807 inode->i_flags |= S_SYNC; 4807 inode->i_flags |= S_SYNC;
4808 if (flags & EXT4_APPEND_FL) 4808 if (flags & EXT4_APPEND_FL)
4809 inode->i_flags |= S_APPEND; 4809 inode->i_flags |= S_APPEND;
4810 if (flags & EXT4_IMMUTABLE_FL) 4810 if (flags & EXT4_IMMUTABLE_FL)
4811 inode->i_flags |= S_IMMUTABLE; 4811 inode->i_flags |= S_IMMUTABLE;
4812 if (flags & EXT4_NOATIME_FL) 4812 if (flags & EXT4_NOATIME_FL)
4813 inode->i_flags |= S_NOATIME; 4813 inode->i_flags |= S_NOATIME;
4814 if (flags & EXT4_DIRSYNC_FL) 4814 if (flags & EXT4_DIRSYNC_FL)
4815 inode->i_flags |= S_DIRSYNC; 4815 inode->i_flags |= S_DIRSYNC;
4816 } 4816 }
4817 4817
4818 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4818 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4819 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4819 void ext4_get_inode_flags(struct ext4_inode_info *ei)
4820 { 4820 {
4821 unsigned int vfs_fl; 4821 unsigned int vfs_fl;
4822 unsigned long old_fl, new_fl; 4822 unsigned long old_fl, new_fl;
4823 4823
4824 do { 4824 do {
4825 vfs_fl = ei->vfs_inode.i_flags; 4825 vfs_fl = ei->vfs_inode.i_flags;
4826 old_fl = ei->i_flags; 4826 old_fl = ei->i_flags;
4827 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4827 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4828 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| 4828 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4829 EXT4_DIRSYNC_FL); 4829 EXT4_DIRSYNC_FL);
4830 if (vfs_fl & S_SYNC) 4830 if (vfs_fl & S_SYNC)
4831 new_fl |= EXT4_SYNC_FL; 4831 new_fl |= EXT4_SYNC_FL;
4832 if (vfs_fl & S_APPEND) 4832 if (vfs_fl & S_APPEND)
4833 new_fl |= EXT4_APPEND_FL; 4833 new_fl |= EXT4_APPEND_FL;
4834 if (vfs_fl & S_IMMUTABLE) 4834 if (vfs_fl & S_IMMUTABLE)
4835 new_fl |= EXT4_IMMUTABLE_FL; 4835 new_fl |= EXT4_IMMUTABLE_FL;
4836 if (vfs_fl & S_NOATIME) 4836 if (vfs_fl & S_NOATIME)
4837 new_fl |= EXT4_NOATIME_FL; 4837 new_fl |= EXT4_NOATIME_FL;
4838 if (vfs_fl & S_DIRSYNC) 4838 if (vfs_fl & S_DIRSYNC)
4839 new_fl |= EXT4_DIRSYNC_FL; 4839 new_fl |= EXT4_DIRSYNC_FL;
4840 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); 4840 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4841 } 4841 }
4842 4842
4843 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4843 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4844 struct ext4_inode_info *ei) 4844 struct ext4_inode_info *ei)
4845 { 4845 {
4846 blkcnt_t i_blocks ; 4846 blkcnt_t i_blocks ;
4847 struct inode *inode = &(ei->vfs_inode); 4847 struct inode *inode = &(ei->vfs_inode);
4848 struct super_block *sb = inode->i_sb; 4848 struct super_block *sb = inode->i_sb;
4849 4849
4850 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4850 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4851 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4851 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4852 /* we are using combined 48 bit field */ 4852 /* we are using combined 48 bit field */
4853 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4853 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4854 le32_to_cpu(raw_inode->i_blocks_lo); 4854 le32_to_cpu(raw_inode->i_blocks_lo);
4855 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { 4855 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4856 /* i_blocks is counted in filesystem blocks */ 4856 /* i_blocks is counted in filesystem blocks */
4857 return i_blocks << (inode->i_blkbits - 9); 4857 return i_blocks << (inode->i_blkbits - 9);
4858 } else { 4858 } else {
4859 return i_blocks; 4859 return i_blocks;
4860 } 4860 }
4861 } else { 4861 } else {
4862 return le32_to_cpu(raw_inode->i_blocks_lo); 4862 return le32_to_cpu(raw_inode->i_blocks_lo);
4863 } 4863 }
4864 } 4864 }
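The 48-bit i_blocks handling above can be illustrated with hypothetical raw values: without EXT4_INODE_HUGE_FILE the combined value is already in 512-byte units, while with it the value counts filesystem blocks and must be scaled by i_blkbits - 9. A minimal sketch:

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: i_blocks_hi == 0x0001, i_blocks_lo == 0x10, 4 KiB blocks. */
int main(void)
{
        uint64_t hi = 0x0001, lo = 0x10;
        unsigned blkbits = 12;                              /* log2(4096) */

        uint64_t combined = (hi << 32) | lo;                /* 0x100000010 */
        uint64_t plain = combined;                          /* HUGE_FILE clear: already 512-byte units */
        uint64_t huge  = combined << (blkbits - 9);         /* HUGE_FILE set: fs blocks -> 512-byte units */

        printf("plain: %llu, huge-file: %llu (512-byte units)\n",
               (unsigned long long)plain, (unsigned long long)huge);
        return 0;
}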
4865 4865
4866 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4866 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4867 { 4867 {
4868 struct ext4_iloc iloc; 4868 struct ext4_iloc iloc;
4869 struct ext4_inode *raw_inode; 4869 struct ext4_inode *raw_inode;
4870 struct ext4_inode_info *ei; 4870 struct ext4_inode_info *ei;
4871 struct inode *inode; 4871 struct inode *inode;
4872 journal_t *journal = EXT4_SB(sb)->s_journal; 4872 journal_t *journal = EXT4_SB(sb)->s_journal;
4873 long ret; 4873 long ret;
4874 int block; 4874 int block;
4875 4875
4876 inode = iget_locked(sb, ino); 4876 inode = iget_locked(sb, ino);
4877 if (!inode) 4877 if (!inode)
4878 return ERR_PTR(-ENOMEM); 4878 return ERR_PTR(-ENOMEM);
4879 if (!(inode->i_state & I_NEW)) 4879 if (!(inode->i_state & I_NEW))
4880 return inode; 4880 return inode;
4881 4881
4882 ei = EXT4_I(inode); 4882 ei = EXT4_I(inode);
4883 iloc.bh = NULL; 4883 iloc.bh = NULL;
4884 4884
4885 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4885 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4886 if (ret < 0) 4886 if (ret < 0)
4887 goto bad_inode; 4887 goto bad_inode;
4888 raw_inode = ext4_raw_inode(&iloc); 4888 raw_inode = ext4_raw_inode(&iloc);
4889 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4889 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4890 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4890 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4891 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4891 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4892 if (!(test_opt(inode->i_sb, NO_UID32))) { 4892 if (!(test_opt(inode->i_sb, NO_UID32))) {
4893 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4893 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4894 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4894 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4895 } 4895 }
4896 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4896 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4897 4897
4898 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 4898 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
4899 ei->i_dir_start_lookup = 0; 4899 ei->i_dir_start_lookup = 0;
4900 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4900 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4901 /* We now have enough fields to check if the inode was active or not. 4901 /* We now have enough fields to check if the inode was active or not.
4902 * This is needed because nfsd might try to access dead inodes 4902 * This is needed because nfsd might try to access dead inodes
4903 * the test is the same one that e2fsck uses 4903 * the test is the same one that e2fsck uses
4904 * NeilBrown 1999oct15 4904 * NeilBrown 1999oct15
4905 */ 4905 */
4906 if (inode->i_nlink == 0) { 4906 if (inode->i_nlink == 0) {
4907 if (inode->i_mode == 0 || 4907 if (inode->i_mode == 0 ||
4908 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4908 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4909 /* this inode is deleted */ 4909 /* this inode is deleted */
4910 ret = -ESTALE; 4910 ret = -ESTALE;
4911 goto bad_inode; 4911 goto bad_inode;
4912 } 4912 }
4913 /* The only unlinked inodes we let through here have 4913 /* The only unlinked inodes we let through here have
4914 * valid i_mode and are being read by the orphan 4914 * valid i_mode and are being read by the orphan
4915 * recovery code: that's fine, we're about to complete 4915 * recovery code: that's fine, we're about to complete
4916 * the process of deleting those. */ 4916 * the process of deleting those. */
4917 } 4917 }
4918 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4918 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4919 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4919 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4920 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4920 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4921 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) 4921 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4922 ei->i_file_acl |= 4922 ei->i_file_acl |=
4923 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4923 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4924 inode->i_size = ext4_isize(raw_inode); 4924 inode->i_size = ext4_isize(raw_inode);
4925 ei->i_disksize = inode->i_size; 4925 ei->i_disksize = inode->i_size;
4926 #ifdef CONFIG_QUOTA 4926 #ifdef CONFIG_QUOTA
4927 ei->i_reserved_quota = 0; 4927 ei->i_reserved_quota = 0;
4928 #endif 4928 #endif
4929 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4929 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4930 ei->i_block_group = iloc.block_group; 4930 ei->i_block_group = iloc.block_group;
4931 ei->i_last_alloc_group = ~0; 4931 ei->i_last_alloc_group = ~0;
4932 /* 4932 /*
4933 * NOTE! The in-memory inode i_data array is in little-endian order 4933 * NOTE! The in-memory inode i_data array is in little-endian order
4934 * even on big-endian machines: we do NOT byteswap the block numbers! 4934 * even on big-endian machines: we do NOT byteswap the block numbers!
4935 */ 4935 */
4936 for (block = 0; block < EXT4_N_BLOCKS; block++) 4936 for (block = 0; block < EXT4_N_BLOCKS; block++)
4937 ei->i_data[block] = raw_inode->i_block[block]; 4937 ei->i_data[block] = raw_inode->i_block[block];
4938 INIT_LIST_HEAD(&ei->i_orphan); 4938 INIT_LIST_HEAD(&ei->i_orphan);
4939 4939
4940 /* 4940 /*
4941 * Set transaction id's of transactions that have to be committed 4941 * Set transaction id's of transactions that have to be committed
4942 * to finish f[data]sync. We set them to currently running transaction 4942 * to finish f[data]sync. We set them to currently running transaction
4943 * as we cannot be sure that the inode or some of its metadata isn't 4943 * as we cannot be sure that the inode or some of its metadata isn't
4944 * part of the transaction - the inode could have been reclaimed and 4944 * part of the transaction - the inode could have been reclaimed and
4945 * now it is reread from disk. 4945 * now it is reread from disk.
4946 */ 4946 */
4947 if (journal) { 4947 if (journal) {
4948 transaction_t *transaction; 4948 transaction_t *transaction;
4949 tid_t tid; 4949 tid_t tid;
4950 4950
4951 read_lock(&journal->j_state_lock); 4951 read_lock(&journal->j_state_lock);
4952 if (journal->j_running_transaction) 4952 if (journal->j_running_transaction)
4953 transaction = journal->j_running_transaction; 4953 transaction = journal->j_running_transaction;
4954 else 4954 else
4955 transaction = journal->j_committing_transaction; 4955 transaction = journal->j_committing_transaction;
4956 if (transaction) 4956 if (transaction)
4957 tid = transaction->t_tid; 4957 tid = transaction->t_tid;
4958 else 4958 else
4959 tid = journal->j_commit_sequence; 4959 tid = journal->j_commit_sequence;
4960 read_unlock(&journal->j_state_lock); 4960 read_unlock(&journal->j_state_lock);
4961 ei->i_sync_tid = tid; 4961 ei->i_sync_tid = tid;
4962 ei->i_datasync_tid = tid; 4962 ei->i_datasync_tid = tid;
4963 } 4963 }
4964 4964
4965 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4965 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4966 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4966 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4967 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4967 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4968 EXT4_INODE_SIZE(inode->i_sb)) { 4968 EXT4_INODE_SIZE(inode->i_sb)) {
4969 ret = -EIO; 4969 ret = -EIO;
4970 goto bad_inode; 4970 goto bad_inode;
4971 } 4971 }
4972 if (ei->i_extra_isize == 0) { 4972 if (ei->i_extra_isize == 0) {
4973 /* The extra space is currently unused. Use it. */ 4973 /* The extra space is currently unused. Use it. */
4974 ei->i_extra_isize = sizeof(struct ext4_inode) - 4974 ei->i_extra_isize = sizeof(struct ext4_inode) -
4975 EXT4_GOOD_OLD_INODE_SIZE; 4975 EXT4_GOOD_OLD_INODE_SIZE;
4976 } else { 4976 } else {
4977 __le32 *magic = (void *)raw_inode + 4977 __le32 *magic = (void *)raw_inode +
4978 EXT4_GOOD_OLD_INODE_SIZE + 4978 EXT4_GOOD_OLD_INODE_SIZE +
4979 ei->i_extra_isize; 4979 ei->i_extra_isize;
4980 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4980 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4981 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4981 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4982 } 4982 }
4983 } else 4983 } else
4984 ei->i_extra_isize = 0; 4984 ei->i_extra_isize = 0;
4985 4985
4986 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4986 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4987 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4987 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4988 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4988 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4989 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4989 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4990 4990
4991 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4991 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4992 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4992 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4993 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4993 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4994 inode->i_version |= 4994 inode->i_version |=
4995 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4995 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4996 } 4996 }
4997 4997
4998 ret = 0; 4998 ret = 0;
4999 if (ei->i_file_acl && 4999 if (ei->i_file_acl &&
5000 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5000 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5001 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", 5001 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5002 ei->i_file_acl); 5002 ei->i_file_acl);
5003 ret = -EIO; 5003 ret = -EIO;
5004 goto bad_inode; 5004 goto bad_inode;
5005 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5005 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5006 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5006 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5007 (S_ISLNK(inode->i_mode) && 5007 (S_ISLNK(inode->i_mode) &&
5008 !ext4_inode_is_fast_symlink(inode))) 5008 !ext4_inode_is_fast_symlink(inode)))
5009 /* Validate extent which is part of inode */ 5009 /* Validate extent which is part of inode */
5010 ret = ext4_ext_check_inode(inode); 5010 ret = ext4_ext_check_inode(inode);
5011 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5011 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5012 (S_ISLNK(inode->i_mode) && 5012 (S_ISLNK(inode->i_mode) &&
5013 !ext4_inode_is_fast_symlink(inode))) { 5013 !ext4_inode_is_fast_symlink(inode))) {
5014 /* Validate block references which are part of inode */ 5014 /* Validate block references which are part of inode */
5015 ret = ext4_check_inode_blockref(inode); 5015 ret = ext4_check_inode_blockref(inode);
5016 } 5016 }
5017 if (ret) 5017 if (ret)
5018 goto bad_inode; 5018 goto bad_inode;
5019 5019
5020 if (S_ISREG(inode->i_mode)) { 5020 if (S_ISREG(inode->i_mode)) {
5021 inode->i_op = &ext4_file_inode_operations; 5021 inode->i_op = &ext4_file_inode_operations;
5022 inode->i_fop = &ext4_file_operations; 5022 inode->i_fop = &ext4_file_operations;
5023 ext4_set_aops(inode); 5023 ext4_set_aops(inode);
5024 } else if (S_ISDIR(inode->i_mode)) { 5024 } else if (S_ISDIR(inode->i_mode)) {
5025 inode->i_op = &ext4_dir_inode_operations; 5025 inode->i_op = &ext4_dir_inode_operations;
5026 inode->i_fop = &ext4_dir_operations; 5026 inode->i_fop = &ext4_dir_operations;
5027 } else if (S_ISLNK(inode->i_mode)) { 5027 } else if (S_ISLNK(inode->i_mode)) {
5028 if (ext4_inode_is_fast_symlink(inode)) { 5028 if (ext4_inode_is_fast_symlink(inode)) {
5029 inode->i_op = &ext4_fast_symlink_inode_operations; 5029 inode->i_op = &ext4_fast_symlink_inode_operations;
5030 nd_terminate_link(ei->i_data, inode->i_size, 5030 nd_terminate_link(ei->i_data, inode->i_size,
5031 sizeof(ei->i_data) - 1); 5031 sizeof(ei->i_data) - 1);
5032 } else { 5032 } else {
5033 inode->i_op = &ext4_symlink_inode_operations; 5033 inode->i_op = &ext4_symlink_inode_operations;
5034 ext4_set_aops(inode); 5034 ext4_set_aops(inode);
5035 } 5035 }
5036 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 5036 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
5037 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 5037 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
5038 inode->i_op = &ext4_special_inode_operations; 5038 inode->i_op = &ext4_special_inode_operations;
5039 if (raw_inode->i_block[0]) 5039 if (raw_inode->i_block[0])
5040 init_special_inode(inode, inode->i_mode, 5040 init_special_inode(inode, inode->i_mode,
5041 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 5041 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
5042 else 5042 else
5043 init_special_inode(inode, inode->i_mode, 5043 init_special_inode(inode, inode->i_mode,
5044 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5044 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5045 } else { 5045 } else {
5046 ret = -EIO; 5046 ret = -EIO;
5047 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); 5047 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5048 goto bad_inode; 5048 goto bad_inode;
5049 } 5049 }
5050 brelse(iloc.bh); 5050 brelse(iloc.bh);
5051 ext4_set_inode_flags(inode); 5051 ext4_set_inode_flags(inode);
5052 unlock_new_inode(inode); 5052 unlock_new_inode(inode);
5053 return inode; 5053 return inode;
5054 5054
5055 bad_inode: 5055 bad_inode:
5056 brelse(iloc.bh); 5056 brelse(iloc.bh);
5057 iget_failed(inode); 5057 iget_failed(inode);
5058 return ERR_PTR(ret); 5058 return ERR_PTR(ret);
5059 } 5059 }
5060 5060
5061 static int ext4_inode_blocks_set(handle_t *handle, 5061 static int ext4_inode_blocks_set(handle_t *handle,
5062 struct ext4_inode *raw_inode, 5062 struct ext4_inode *raw_inode,
5063 struct ext4_inode_info *ei) 5063 struct ext4_inode_info *ei)
5064 { 5064 {
5065 struct inode *inode = &(ei->vfs_inode); 5065 struct inode *inode = &(ei->vfs_inode);
5066 u64 i_blocks = inode->i_blocks; 5066 u64 i_blocks = inode->i_blocks;
5067 struct super_block *sb = inode->i_sb; 5067 struct super_block *sb = inode->i_sb;
5068 5068
5069 if (i_blocks <= ~0U) { 5069 if (i_blocks <= ~0U) {
5070 /* 5070 /*
5071 * i_blocks can be represented in a 32 bit variable 5071 * i_blocks can be represented in a 32 bit variable
5072 * as a multiple of 512 bytes 5072 * as a multiple of 512 bytes
5073 */ 5073 */
5074 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5074 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5075 raw_inode->i_blocks_high = 0; 5075 raw_inode->i_blocks_high = 0;
5076 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5076 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5077 return 0; 5077 return 0;
5078 } 5078 }
5079 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5079 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
5080 return -EFBIG; 5080 return -EFBIG;
5081 5081
5082 if (i_blocks <= 0xffffffffffffULL) { 5082 if (i_blocks <= 0xffffffffffffULL) {
5083 /* 5083 /*
5084 * i_blocks can be represented in a 48 bit variable 5084 * i_blocks can be represented in a 48 bit variable
5085 * as a multiple of 512 bytes 5085 * as a multiple of 512 bytes
5086 */ 5086 */
5087 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5087 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5088 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5088 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5089 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5089 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5090 } else { 5090 } else {
5091 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5091 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5092 /* i_blocks is stored in file system block size units */ 5092 /* i_blocks is stored in file system block size units */
5093 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5093 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5094 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5094 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5095 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5095 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5096 } 5096 }
5097 return 0; 5097 return 0;
5098 } 5098 }
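Conversely (again only a sketch, not kernel code): when writing the inode, the count is stored in whichever representation fits — 32 bits of 512-byte sectors, 48 bits of 512-byte sectors, or 48 bits of filesystem blocks with the huge-file inode flag set. The fields and function below are hypothetical stand-ins, the first two cases are collapsed into one store, and the huge_file feature check from the real code is omitted:

#include <stdint.h>
#include <stdio.h>

struct demo_raw_blocks {
	uint32_t i_blocks_lo;	/* stand-ins for the on-disk i_blocks fields */
	uint16_t i_blocks_high;
};

/* Store the count; returns 1 when the huge-file flag would have to be set
 * because the value is kept in filesystem-block units rather than sectors. */
static int demo_blocks_set(struct demo_raw_blocks *raw, uint64_t sectors,
			   unsigned int blkbits)
{
	int huge = 0;

	if (sectors > 0xffffffffffffULL) {	/* does not fit in 48 bits */
		sectors >>= (blkbits - 9);	/* keep it in fs-block units */
		huge = 1;
	}
	raw->i_blocks_lo = (uint32_t)sectors;
	raw->i_blocks_high = (uint16_t)(sectors >> 32);
	return huge;
}

int main(void)
{
	struct demo_raw_blocks raw;
	int huge = demo_blocks_set(&raw, 1ULL << 50, 12);	/* 4 KiB blocks */

	printf("lo=%lu high=%u huge=%d\n", (unsigned long)raw.i_blocks_lo,
	       (unsigned)raw.i_blocks_high, huge);
	return 0;
}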
5099 5099
5100 /* 5100 /*
5101 * Post the struct inode info into an on-disk inode location in the 5101 * Post the struct inode info into an on-disk inode location in the
5102 * buffer-cache. This gobbles the caller's reference to the 5102 * buffer-cache. This gobbles the caller's reference to the
5103 * buffer_head in the inode location struct. 5103 * buffer_head in the inode location struct.
5104 * 5104 *
5105 * The caller must have write access to iloc->bh. 5105 * The caller must have write access to iloc->bh.
5106 */ 5106 */
5107 static int ext4_do_update_inode(handle_t *handle, 5107 static int ext4_do_update_inode(handle_t *handle,
5108 struct inode *inode, 5108 struct inode *inode,
5109 struct ext4_iloc *iloc) 5109 struct ext4_iloc *iloc)
5110 { 5110 {
5111 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5111 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
5112 struct ext4_inode_info *ei = EXT4_I(inode); 5112 struct ext4_inode_info *ei = EXT4_I(inode);
5113 struct buffer_head *bh = iloc->bh; 5113 struct buffer_head *bh = iloc->bh;
5114 int err = 0, rc, block; 5114 int err = 0, rc, block;
5115 5115
5116 /* For fields not tracked in the in-memory inode, 5116 /* For fields not tracked in the in-memory inode,
5117 * initialise them to zero for new inodes. */ 5117 * initialise them to zero for new inodes. */
5118 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) 5118 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5119 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5119 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5120 5120
5121 ext4_get_inode_flags(ei); 5121 ext4_get_inode_flags(ei);
5122 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 5122 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
5123 if (!(test_opt(inode->i_sb, NO_UID32))) { 5123 if (!(test_opt(inode->i_sb, NO_UID32))) {
5124 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 5124 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
5125 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 5125 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
5126 /* 5126 /*
5127 * Fix up interoperability with old kernels. Otherwise, old inodes get 5127 * Fix up interoperability with old kernels. Otherwise, old inodes get
5128 * re-used with the upper 16 bits of the uid/gid intact 5128 * re-used with the upper 16 bits of the uid/gid intact
5129 */ 5129 */
5130 if (!ei->i_dtime) { 5130 if (!ei->i_dtime) {
5131 raw_inode->i_uid_high = 5131 raw_inode->i_uid_high =
5132 cpu_to_le16(high_16_bits(inode->i_uid)); 5132 cpu_to_le16(high_16_bits(inode->i_uid));
5133 raw_inode->i_gid_high = 5133 raw_inode->i_gid_high =
5134 cpu_to_le16(high_16_bits(inode->i_gid)); 5134 cpu_to_le16(high_16_bits(inode->i_gid));
5135 } else { 5135 } else {
5136 raw_inode->i_uid_high = 0; 5136 raw_inode->i_uid_high = 0;
5137 raw_inode->i_gid_high = 0; 5137 raw_inode->i_gid_high = 0;
5138 } 5138 }
5139 } else { 5139 } else {
5140 raw_inode->i_uid_low = 5140 raw_inode->i_uid_low =
5141 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 5141 cpu_to_le16(fs_high2lowuid(inode->i_uid));
5142 raw_inode->i_gid_low = 5142 raw_inode->i_gid_low =
5143 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 5143 cpu_to_le16(fs_high2lowgid(inode->i_gid));
5144 raw_inode->i_uid_high = 0; 5144 raw_inode->i_uid_high = 0;
5145 raw_inode->i_gid_high = 0; 5145 raw_inode->i_gid_high = 0;
5146 } 5146 }
5147 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 5147 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
5148 5148
5149 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 5149 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
5150 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 5150 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
5151 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 5151 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
5152 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 5152 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
5153 5153
5154 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5154 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5155 goto out_brelse; 5155 goto out_brelse;
5156 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5156 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5157 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 5157 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5158 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5158 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5159 cpu_to_le32(EXT4_OS_HURD)) 5159 cpu_to_le32(EXT4_OS_HURD))
5160 raw_inode->i_file_acl_high = 5160 raw_inode->i_file_acl_high =
5161 cpu_to_le16(ei->i_file_acl >> 32); 5161 cpu_to_le16(ei->i_file_acl >> 32);
5162 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 5162 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
5163 ext4_isize_set(raw_inode, ei->i_disksize); 5163 ext4_isize_set(raw_inode, ei->i_disksize);
5164 if (ei->i_disksize > 0x7fffffffULL) { 5164 if (ei->i_disksize > 0x7fffffffULL) {
5165 struct super_block *sb = inode->i_sb; 5165 struct super_block *sb = inode->i_sb;
5166 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 5166 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
5167 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 5167 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
5168 EXT4_SB(sb)->s_es->s_rev_level == 5168 EXT4_SB(sb)->s_es->s_rev_level ==
5169 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 5169 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
5170 /* If this is the first large file 5170 /* If this is the first large file
5171 * created, add a flag to the superblock. 5171 * created, add a flag to the superblock.
5172 */ 5172 */
5173 err = ext4_journal_get_write_access(handle, 5173 err = ext4_journal_get_write_access(handle,
5174 EXT4_SB(sb)->s_sbh); 5174 EXT4_SB(sb)->s_sbh);
5175 if (err) 5175 if (err)
5176 goto out_brelse; 5176 goto out_brelse;
5177 ext4_update_dynamic_rev(sb); 5177 ext4_update_dynamic_rev(sb);
5178 EXT4_SET_RO_COMPAT_FEATURE(sb, 5178 EXT4_SET_RO_COMPAT_FEATURE(sb,
5179 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5179 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5180 sb->s_dirt = 1; 5180 sb->s_dirt = 1;
5181 ext4_handle_sync(handle); 5181 ext4_handle_sync(handle);
5182 err = ext4_handle_dirty_metadata(handle, NULL, 5182 err = ext4_handle_dirty_metadata(handle, NULL,
5183 EXT4_SB(sb)->s_sbh); 5183 EXT4_SB(sb)->s_sbh);
5184 } 5184 }
5185 } 5185 }
5186 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 5186 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
5187 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 5187 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
5188 if (old_valid_dev(inode->i_rdev)) { 5188 if (old_valid_dev(inode->i_rdev)) {
5189 raw_inode->i_block[0] = 5189 raw_inode->i_block[0] =
5190 cpu_to_le32(old_encode_dev(inode->i_rdev)); 5190 cpu_to_le32(old_encode_dev(inode->i_rdev));
5191 raw_inode->i_block[1] = 0; 5191 raw_inode->i_block[1] = 0;
5192 } else { 5192 } else {
5193 raw_inode->i_block[0] = 0; 5193 raw_inode->i_block[0] = 0;
5194 raw_inode->i_block[1] = 5194 raw_inode->i_block[1] =
5195 cpu_to_le32(new_encode_dev(inode->i_rdev)); 5195 cpu_to_le32(new_encode_dev(inode->i_rdev));
5196 raw_inode->i_block[2] = 0; 5196 raw_inode->i_block[2] = 0;
5197 } 5197 }
5198 } else 5198 } else
5199 for (block = 0; block < EXT4_N_BLOCKS; block++) 5199 for (block = 0; block < EXT4_N_BLOCKS; block++)
5200 raw_inode->i_block[block] = ei->i_data[block]; 5200 raw_inode->i_block[block] = ei->i_data[block];
5201 5201
5202 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 5202 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
5203 if (ei->i_extra_isize) { 5203 if (ei->i_extra_isize) {
5204 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 5204 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
5205 raw_inode->i_version_hi = 5205 raw_inode->i_version_hi =
5206 cpu_to_le32(inode->i_version >> 32); 5206 cpu_to_le32(inode->i_version >> 32);
5207 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5207 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
5208 } 5208 }
5209 5209
5210 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5210 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5211 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 5211 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5212 if (!err) 5212 if (!err)
5213 err = rc; 5213 err = rc;
5214 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 5214 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5215 5215
5216 ext4_update_inode_fsync_trans(handle, inode, 0); 5216 ext4_update_inode_fsync_trans(handle, inode, 0);
5217 out_brelse: 5217 out_brelse:
5218 brelse(bh); 5218 brelse(bh);
5219 ext4_std_error(inode->i_sb, err); 5219 ext4_std_error(inode->i_sb, err);
5220 return err; 5220 return err;
5221 } 5221 }
5222 5222
5223 /* 5223 /*
5224 * ext4_write_inode() 5224 * ext4_write_inode()
5225 * 5225 *
5226 * We are called from a few places: 5226 * We are called from a few places:
5227 * 5227 *
5228 * - Within generic_file_write() for O_SYNC files. 5228 * - Within generic_file_write() for O_SYNC files.
5229 * Here, there will be no transaction running. We wait for any running 5229 * Here, there will be no transaction running. We wait for any running
5230 * transaction to commit. 5230 * transaction to commit.
5231 * 5231 *
5232 * - Within sys_sync(), kupdate and such. 5232 * - Within sys_sync(), kupdate and such.
5233 * We wait on commit, if told to. 5233 * We wait on commit, if told to.
5234 * 5234 *
5235 * - Within prune_icache() (PF_MEMALLOC == true) 5235 * - Within prune_icache() (PF_MEMALLOC == true)
5236 * Here we simply return. We can't afford to block kswapd on the 5236 * Here we simply return. We can't afford to block kswapd on the
5237 * journal commit. 5237 * journal commit.
5238 * 5238 *
5239 * In all cases it is actually safe for us to return without doing anything, 5239 * In all cases it is actually safe for us to return without doing anything,
5240 * because the inode has been copied into a raw inode buffer in 5240 * because the inode has been copied into a raw inode buffer in
5241 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 5241 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
5242 * knfsd. 5242 * knfsd.
5243 * 5243 *
5244 * Note that we are absolutely dependent upon all inode dirtiers doing the 5244 * Note that we are absolutely dependent upon all inode dirtiers doing the
5245 * right thing: they *must* call mark_inode_dirty() after dirtying info in 5245 * right thing: they *must* call mark_inode_dirty() after dirtying info in
5246 * which we are interested. 5246 * which we are interested.
5247 * 5247 *
5248 * It would be a bug for them to not do this. The code: 5248 * It would be a bug for them to not do this. The code:
5249 * 5249 *
5250 * mark_inode_dirty(inode) 5250 * mark_inode_dirty(inode)
5251 * stuff(); 5251 * stuff();
5252 * inode->i_size = expr; 5252 * inode->i_size = expr;
5253 * 5253 *
5254 * is in error because a kswapd-driven write_inode() could occur while 5254 * is in error because a kswapd-driven write_inode() could occur while
5255 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5255 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5256 * will no longer be on the superblock's dirty inode list. 5256 * will no longer be on the superblock's dirty inode list.
5257 */ 5257 */
5258 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) 5258 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5259 { 5259 {
5260 int err; 5260 int err;
5261 5261
5262 if (current->flags & PF_MEMALLOC) 5262 if (current->flags & PF_MEMALLOC)
5263 return 0; 5263 return 0;
5264 5264
5265 if (EXT4_SB(inode->i_sb)->s_journal) { 5265 if (EXT4_SB(inode->i_sb)->s_journal) {
5266 if (ext4_journal_current_handle()) { 5266 if (ext4_journal_current_handle()) {
5267 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 5267 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
5268 dump_stack(); 5268 dump_stack();
5269 return -EIO; 5269 return -EIO;
5270 } 5270 }
5271 5271
5272 if (wbc->sync_mode != WB_SYNC_ALL) 5272 if (wbc->sync_mode != WB_SYNC_ALL)
5273 return 0; 5273 return 0;
5274 5274
5275 err = ext4_force_commit(inode->i_sb); 5275 err = ext4_force_commit(inode->i_sb);
5276 } else { 5276 } else {
5277 struct ext4_iloc iloc; 5277 struct ext4_iloc iloc;
5278 5278
5279 err = __ext4_get_inode_loc(inode, &iloc, 0); 5279 err = __ext4_get_inode_loc(inode, &iloc, 0);
5280 if (err) 5280 if (err)
5281 return err; 5281 return err;
5282 if (wbc->sync_mode == WB_SYNC_ALL) 5282 if (wbc->sync_mode == WB_SYNC_ALL)
5283 sync_dirty_buffer(iloc.bh); 5283 sync_dirty_buffer(iloc.bh);
5284 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5284 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5285 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, 5285 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5286 "IO error syncing inode"); 5286 "IO error syncing inode");
5287 err = -EIO; 5287 err = -EIO;
5288 } 5288 }
5289 brelse(iloc.bh); 5289 brelse(iloc.bh);
5290 } 5290 }
5291 return err; 5291 return err;
5292 } 5292 }
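To restate the ordering rule from the comment above ext4_write_inode() as pseudocode (stuff() and expr are the placeholders from that comment, not real functions): every field of interest must be updated before the inode is marked dirty, otherwise a concurrent kswapd-driven write_inode() can write out a stale copy and the inode drops off the dirty list.

	/* buggy order, quoted from the comment above: */
	mark_inode_dirty(inode);
	stuff();
	inode->i_size = expr;

	/* correct order: dirty the inode only after updating it */
	stuff();
	inode->i_size = expr;
	mark_inode_dirty(inode);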
5293 5293
5294 /* 5294 /*
5295 * ext4_setattr() 5295 * ext4_setattr()
5296 * 5296 *
5297 * Called from notify_change. 5297 * Called from notify_change.
5298 * 5298 *
5299 * We want to trap VFS attempts to truncate the file as soon as 5299 * We want to trap VFS attempts to truncate the file as soon as
5300 * possible. In particular, we want to make sure that when the VFS 5300 * possible. In particular, we want to make sure that when the VFS
5301 * shrinks i_size, we put the inode on the orphan list and modify 5301 * shrinks i_size, we put the inode on the orphan list and modify
5302 * i_disksize immediately, so that during the subsequent flushing of 5302 * i_disksize immediately, so that during the subsequent flushing of
5303 * dirty pages and freeing of disk blocks, we can guarantee that any 5303 * dirty pages and freeing of disk blocks, we can guarantee that any
5304 * commit will leave the blocks being flushed in an unused state on 5304 * commit will leave the blocks being flushed in an unused state on
5305 * disk. (On recovery, the inode will get truncated and the blocks will 5305 * disk. (On recovery, the inode will get truncated and the blocks will
5306 * be freed, so we have a strong guarantee that no future commit will 5306 * be freed, so we have a strong guarantee that no future commit will
5307 * leave these blocks visible to the user.) 5307 * leave these blocks visible to the user.)
5308 * 5308 *
5309 * Another thing we have to assure is that if we are in ordered mode 5309 * Another thing we have to assure is that if we are in ordered mode
5310 * and the inode is still attached to the committing transaction, we must 5310 * and the inode is still attached to the committing transaction, we must
5311 * start writeout of all the dirty pages which are being truncated. 5311 * start writeout of all the dirty pages which are being truncated.
5312 * This way we are sure that all the data written in the previous 5312 * This way we are sure that all the data written in the previous
5313 * transaction are already on disk (truncate waits for pages under 5313 * transaction are already on disk (truncate waits for pages under
5314 * writeback). 5314 * writeback).
5315 * 5315 *
5316 * Called with inode->i_mutex down. 5316 * Called with inode->i_mutex down.
5317 */ 5317 */
5318 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 5318 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5319 { 5319 {
5320 struct inode *inode = dentry->d_inode; 5320 struct inode *inode = dentry->d_inode;
5321 int error, rc = 0; 5321 int error, rc = 0;
5322 int orphan = 0; 5322 int orphan = 0;
5323 const unsigned int ia_valid = attr->ia_valid; 5323 const unsigned int ia_valid = attr->ia_valid;
5324 5324
5325 error = inode_change_ok(inode, attr); 5325 error = inode_change_ok(inode, attr);
5326 if (error) 5326 if (error)
5327 return error; 5327 return error;
5328 5328
5329 if (is_quota_modification(inode, attr)) 5329 if (is_quota_modification(inode, attr))
5330 dquot_initialize(inode); 5330 dquot_initialize(inode);
5331 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5331 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5332 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5332 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5333 handle_t *handle; 5333 handle_t *handle;
5334 5334
5335 /* (user+group)*(old+new) structure, inode write (sb, 5335 /* (user+group)*(old+new) structure, inode write (sb,
5336 * inode block, ? - but truncate inode update has it) */ 5336 * inode block, ? - but truncate inode update has it) */
5337 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 5337 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5338 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); 5338 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5339 if (IS_ERR(handle)) { 5339 if (IS_ERR(handle)) {
5340 error = PTR_ERR(handle); 5340 error = PTR_ERR(handle);
5341 goto err_out; 5341 goto err_out;
5342 } 5342 }
5343 error = dquot_transfer(inode, attr); 5343 error = dquot_transfer(inode, attr);
5344 if (error) { 5344 if (error) {
5345 ext4_journal_stop(handle); 5345 ext4_journal_stop(handle);
5346 return error; 5346 return error;
5347 } 5347 }
5348 /* Update corresponding info in inode so that everything is in 5348 /* Update corresponding info in inode so that everything is in
5349 * one transaction */ 5349 * one transaction */
5350 if (attr->ia_valid & ATTR_UID) 5350 if (attr->ia_valid & ATTR_UID)
5351 inode->i_uid = attr->ia_uid; 5351 inode->i_uid = attr->ia_uid;
5352 if (attr->ia_valid & ATTR_GID) 5352 if (attr->ia_valid & ATTR_GID)
5353 inode->i_gid = attr->ia_gid; 5353 inode->i_gid = attr->ia_gid;
5354 error = ext4_mark_inode_dirty(handle, inode); 5354 error = ext4_mark_inode_dirty(handle, inode);
5355 ext4_journal_stop(handle); 5355 ext4_journal_stop(handle);
5356 } 5356 }
5357 5357
5358 if (attr->ia_valid & ATTR_SIZE) { 5358 if (attr->ia_valid & ATTR_SIZE) {
5359 inode_dio_wait(inode); 5359 inode_dio_wait(inode);
5360 5360
5361 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 5361 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5362 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5362 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5363 5363
5364 if (attr->ia_size > sbi->s_bitmap_maxbytes) 5364 if (attr->ia_size > sbi->s_bitmap_maxbytes)
5365 return -EFBIG; 5365 return -EFBIG;
5366 } 5366 }
5367 } 5367 }
5368 5368
5369 if (S_ISREG(inode->i_mode) && 5369 if (S_ISREG(inode->i_mode) &&
5370 attr->ia_valid & ATTR_SIZE && 5370 attr->ia_valid & ATTR_SIZE &&
5371 (attr->ia_size < inode->i_size)) { 5371 (attr->ia_size < inode->i_size)) {
5372 handle_t *handle; 5372 handle_t *handle;
5373 5373
5374 handle = ext4_journal_start(inode, 3); 5374 handle = ext4_journal_start(inode, 3);
5375 if (IS_ERR(handle)) { 5375 if (IS_ERR(handle)) {
5376 error = PTR_ERR(handle); 5376 error = PTR_ERR(handle);
5377 goto err_out; 5377 goto err_out;
5378 } 5378 }
5379 if (ext4_handle_valid(handle)) { 5379 if (ext4_handle_valid(handle)) {
5380 error = ext4_orphan_add(handle, inode); 5380 error = ext4_orphan_add(handle, inode);
5381 orphan = 1; 5381 orphan = 1;
5382 } 5382 }
5383 EXT4_I(inode)->i_disksize = attr->ia_size; 5383 EXT4_I(inode)->i_disksize = attr->ia_size;
5384 rc = ext4_mark_inode_dirty(handle, inode); 5384 rc = ext4_mark_inode_dirty(handle, inode);
5385 if (!error) 5385 if (!error)
5386 error = rc; 5386 error = rc;
5387 ext4_journal_stop(handle); 5387 ext4_journal_stop(handle);
5388 5388
5389 if (ext4_should_order_data(inode)) { 5389 if (ext4_should_order_data(inode)) {
5390 error = ext4_begin_ordered_truncate(inode, 5390 error = ext4_begin_ordered_truncate(inode,
5391 attr->ia_size); 5391 attr->ia_size);
5392 if (error) { 5392 if (error) {
5393 /* Do as much error cleanup as possible */ 5393 /* Do as much error cleanup as possible */
5394 handle = ext4_journal_start(inode, 3); 5394 handle = ext4_journal_start(inode, 3);
5395 if (IS_ERR(handle)) { 5395 if (IS_ERR(handle)) {
5396 ext4_orphan_del(NULL, inode); 5396 ext4_orphan_del(NULL, inode);
5397 goto err_out; 5397 goto err_out;
5398 } 5398 }
5399 ext4_orphan_del(handle, inode); 5399 ext4_orphan_del(handle, inode);
5400 orphan = 0; 5400 orphan = 0;
5401 ext4_journal_stop(handle); 5401 ext4_journal_stop(handle);
5402 goto err_out; 5402 goto err_out;
5403 } 5403 }
5404 } 5404 }
5405 } 5405 }
5406 5406
5407 if (attr->ia_valid & ATTR_SIZE) { 5407 if (attr->ia_valid & ATTR_SIZE) {
5408 if (attr->ia_size != i_size_read(inode)) { 5408 if (attr->ia_size != i_size_read(inode)) {
5409 truncate_setsize(inode, attr->ia_size); 5409 truncate_setsize(inode, attr->ia_size);
5410 ext4_truncate(inode); 5410 ext4_truncate(inode);
5411 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 5411 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5412 ext4_truncate(inode); 5412 ext4_truncate(inode);
5413 } 5413 }
5414 5414
5415 if (!rc) { 5415 if (!rc) {
5416 setattr_copy(inode, attr); 5416 setattr_copy(inode, attr);
5417 mark_inode_dirty(inode); 5417 mark_inode_dirty(inode);
5418 } 5418 }
5419 5419
5420 /* 5420 /*
5421 * If the call to ext4_truncate failed to get a transaction handle at 5421 * If the call to ext4_truncate failed to get a transaction handle at
5422 * all, we need to clean up the in-core orphan list manually. 5422 * all, we need to clean up the in-core orphan list manually.
5423 */ 5423 */
5424 if (orphan && inode->i_nlink) 5424 if (orphan && inode->i_nlink)
5425 ext4_orphan_del(NULL, inode); 5425 ext4_orphan_del(NULL, inode);
5426 5426
5427 if (!rc && (ia_valid & ATTR_MODE)) 5427 if (!rc && (ia_valid & ATTR_MODE))
5428 rc = ext4_acl_chmod(inode); 5428 rc = ext4_acl_chmod(inode);
5429 5429
5430 err_out: 5430 err_out:
5431 ext4_std_error(inode->i_sb, error); 5431 ext4_std_error(inode->i_sb, error);
5432 if (!error) 5432 if (!error)
5433 error = rc; 5433 error = rc;
5434 return error; 5434 return error;
5435 } 5435 }
5436 5436
5437 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 5437 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5438 struct kstat *stat) 5438 struct kstat *stat)
5439 { 5439 {
5440 struct inode *inode; 5440 struct inode *inode;
5441 unsigned long delalloc_blocks; 5441 unsigned long delalloc_blocks;
5442 5442
5443 inode = dentry->d_inode; 5443 inode = dentry->d_inode;
5444 generic_fillattr(inode, stat); 5444 generic_fillattr(inode, stat);
5445 5445
5446 /* 5446 /*
5447 * We can't update i_blocks if the block allocation is delayed; 5447 * We can't update i_blocks if the block allocation is delayed;
5448 * otherwise, in the case of a system crash before the real block 5448 * otherwise, in the case of a system crash before the real block
5449 * allocation is done, we will have i_blocks inconsistent with 5449 * allocation is done, we will have i_blocks inconsistent with
5450 * on-disk file blocks. 5450 * on-disk file blocks.
5451 * We always keep i_blocks updated together with real 5451 * We always keep i_blocks updated together with real
5452 * allocation. But so as not to confuse the user, stat 5452 * allocation. But so as not to confuse the user, stat
5453 * will return the blocks that include the delayed allocation 5453 * will return the blocks that include the delayed allocation
5454 * blocks for this file. 5454 * blocks for this file.
5455 */ 5455 */
5456 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5456 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5457 5457
5458 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5458 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5459 return 0; 5459 return 0;
5460 } 5460 }
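Not part of the diff: the stat->blocks adjustment above converts the reserved delayed-allocation count from filesystem blocks to the 512-byte units stat reports, via (delalloc_blocks << s_blocksize_bits) >> 9. A tiny standalone check of that arithmetic, assuming a 4 KiB block size and a made-up reservation count:

#include <stdio.h>

int main(void)
{
	unsigned long delalloc_blocks = 3;	/* hypothetical reserved fs blocks */
	unsigned int blocksize_bits = 12;	/* 4 KiB filesystem blocks */

	/* fs blocks -> bytes -> 512-byte sectors */
	unsigned long long sectors =
		((unsigned long long)delalloc_blocks << blocksize_bits) >> 9;

	printf("%lu fs blocks = %llu 512-byte sectors\n",
	       delalloc_blocks, sectors);	/* 3 blocks -> 24 sectors */
	return 0;
}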
5461 5461
5462 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 5462 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5463 int chunk) 5463 int chunk)
5464 { 5464 {
5465 int indirects; 5465 int indirects;
5466 5466
5467 /* if nrblocks are contiguous */ 5467 /* if nrblocks are contiguous */
5468 if (chunk) { 5468 if (chunk) {
5469 /* 5469 /*
5470 * With N contiguous data blocks, we need at most 5470 * With N contiguous data blocks, we need at most
5471 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, 5471 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5472 * 2 dindirect blocks, and 1 tindirect block 5472 * 2 dindirect blocks, and 1 tindirect block
5473 */ 5473 */
5474 return DIV_ROUND_UP(nrblocks, 5474 return DIV_ROUND_UP(nrblocks,
5475 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; 5475 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5476 } 5476 }
5477 /* 5477 /*
5478 * if nrblocks are not contiguous, worst case each block touches 5478 * if nrblocks are not contiguous, worst case each block touches
5479 * an indirect block, and each indirect block touches a double indirect 5479 * an indirect block, and each indirect block touches a double indirect
5480 * block, plus a triple indirect block 5480 * block, plus a triple indirect block
5481 */ 5481 */
5482 indirects = nrblocks * 2 + 1; 5482 indirects = nrblocks * 2 + 1;
5483 return indirects; 5483 return indirects;
5484 } 5484 }
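As a worked example (not from the source): for a contiguous chunk on a filesystem with 4 KiB blocks, the address count per block is 1024, so the contiguous case above charges DIV_ROUND_UP(nrblocks, 1024) + 4 metadata blocks, e.g. 5 for a 256-block chunk. A sketch of that calculation, where addr_per_block stands in for EXT4_ADDR_PER_BLOCK(inode->i_sb):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Mirror of the contiguous case above: one indirect block per group of
 * addr_per_block data blocks, plus 1 more indirect, 2 dindirect and 1
 * tindirect block in the worst case. */
static int demo_indirect_trans_blocks(int nrblocks, int addr_per_block)
{
	return DIV_ROUND_UP(nrblocks, addr_per_block) + 4;
}

int main(void)
{
	printf("%d\n", demo_indirect_trans_blocks(256, 1024));	/* prints 5 */
	return 0;
}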
5485 5485
5486 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5486 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5487 { 5487 {
5488 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5488 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5489 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5489 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5490 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5490 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5491 } 5491 }
5492 5492
5493 /* 5493 /*
5494 * Account for index blocks, block group bitmaps and block group 5494 * Account for index blocks, block group bitmaps and block group
5495 * descriptor blocks if we modify data blocks and index blocks; 5495 * descriptor blocks if we modify data blocks and index blocks;
5496 * worst case, the index blocks are spread over different block groups 5496 * worst case, the index blocks are spread over different block groups
5497 * 5497 *
5498 * If data blocks are discontiguous, they may spread over 5498 * If data blocks are discontiguous, they may spread over
5499 * different block groups too. If they are contiguous, with flexbg 5499 * different block groups too. If they are contiguous, with flexbg
5500 * they could still cross a block group boundary. 5500 * they could still cross a block group boundary.
5501 * 5501 *
5502 * Also account for superblock, inode, quota and xattr blocks 5502 * Also account for superblock, inode, quota and xattr blocks
5503 */ 5503 */
5504 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5504 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5505 { 5505 {
5506 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5506 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5507 int gdpblocks; 5507 int gdpblocks;
5508 int idxblocks; 5508 int idxblocks;
5509 int ret = 0; 5509 int ret = 0;
5510 5510
5511 /* 5511 /*
5512 * How many index blocks do we need to touch to modify nrblocks? 5512 * How many index blocks do we need to touch to modify nrblocks?
5513 * The "Chunk" flag indicates whether the nrblocks are 5513 * The "Chunk" flag indicates whether the nrblocks are
5514 * physically contiguous on disk 5514 * physically contiguous on disk
5515 * 5515 *
5516 * Direct IO and fallocate call get_block to allocate 5516 * Direct IO and fallocate call get_block to allocate
5517 * a single extent at a time, so they can set the "Chunk" flag 5517 * a single extent at a time, so they can set the "Chunk" flag
5518 */ 5518 */
5519 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 5519 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
5520 5520
5521 ret = idxblocks; 5521 ret = idxblocks;
5522 5522
5523 /* 5523 /*
5524 * Now let's see how many group bitmaps and group descriptors need 5524 * Now let's see how many group bitmaps and group descriptors need
5525 * to be accounted for 5525 * to be accounted for
5526 */ 5526 */
5527 groups = idxblocks; 5527 groups = idxblocks;
5528 if (chunk) 5528 if (chunk)
5529 groups += 1; 5529 groups += 1;
5530 else 5530 else
5531 groups += nrblocks; 5531 groups += nrblocks;
5532 5532
5533 gdpblocks = groups; 5533 gdpblocks = groups;
5534 if (groups > ngroups) 5534 if (groups > ngroups)
5535 groups = ngroups; 5535 groups = ngroups;
5536 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 5536 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5537 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 5537 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5538 5538
5539 /* bitmaps and block group descriptor blocks */ 5539 /* bitmaps and block group descriptor blocks */
5540 ret += groups + gdpblocks; 5540 ret += groups + gdpblocks;
5541 5541
5542 /* Blocks for super block, inode, quota and xattr blocks */ 5542 /* Blocks for super block, inode, quota and xattr blocks */
5543 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 5543 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5544 5544
5545 return ret; 5545 return ret;
5546 } 5546 }
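Illustrative only: the accounting above clamps the number of bitmap blocks to the total group count and the number of descriptor blocks to the group-descriptor block count. A standalone sketch with invented numbers, where ngroups, gdb_count and meta_blocks stand in for ext4_get_groups_count(), s_gdb_count and EXT4_META_TRANS_BLOCKS() respectively:

#include <stdio.h>

/* Rough mirror of the group/descriptor accounting above. */
static int demo_meta_trans_blocks(int idxblocks, int nrblocks, int chunk,
				  int ngroups, int gdb_count, int meta_blocks)
{
	int groups = idxblocks + (chunk ? 1 : nrblocks);
	int gdpblocks = groups;
	int ret = idxblocks;

	if (groups > ngroups)
		groups = ngroups;
	if (groups > gdb_count)
		gdpblocks = gdb_count;

	ret += groups + gdpblocks;	/* bitmaps + group descriptor blocks */
	ret += meta_blocks;		/* sb, inode, quota and xattr blocks */
	return ret;
}

int main(void)
{
	/* e.g. a contiguous chunk needing 5 index blocks on a small fs */
	printf("%d credits\n",
	       demo_meta_trans_blocks(5, 256, 1, 128, 2, 8));	/* prints 21 */
	return 0;
}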
5547 5547
5548 /* 5548 /*
5549 * Calculate the total number of credits to reserve to fit 5549 * Calculate the total number of credits to reserve to fit
5550 * the modification of a single page into a single transaction, 5550 * the modification of a single page into a single transaction,
5551 * which may include multiple chunks of block allocations. 5551 * which may include multiple chunks of block allocations.
5552 * 5552 *
5553 * This could be called via ext4_write_begin() 5553 * This could be called via ext4_write_begin()
5554 * 5554 *
5555 * We need to consider the worst case, when 5555 * We need to consider the worst case, when
5556 * there is one new block per extent. 5556 * there is one new block per extent.
5557 */ 5557 */
5558 int ext4_writepage_trans_blocks(struct inode *inode) 5558 int ext4_writepage_trans_blocks(struct inode *inode)
5559 { 5559 {
5560 int bpp = ext4_journal_blocks_per_page(inode); 5560 int bpp = ext4_journal_blocks_per_page(inode);
5561 int ret; 5561 int ret;
5562 5562
5563 ret = ext4_meta_trans_blocks(inode, bpp, 0); 5563 ret = ext4_meta_trans_blocks(inode, bpp, 0);
5564 5564
5565 /* Account for data blocks for journalled mode */ 5565 /* Account for data blocks for journalled mode */
5566 if (ext4_should_journal_data(inode)) 5566 if (ext4_should_journal_data(inode))
5567 ret += bpp; 5567 ret += bpp;
5568 return ret; 5568 return ret;
5569 } 5569 }
5570 5570
5571 /* 5571 /*
5572 * Calculate the journal credits for a chunk of data modification. 5572 * Calculate the journal credits for a chunk of data modification.
5573 * 5573 *
5574 * This is called from DIO, fallocate or whoever calls 5574 * This is called from DIO, fallocate or whoever calls
5575 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. 5575 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5576 * 5576 *
5577 * journal buffers for data blocks are not included here, as DIO 5577 * journal buffers for data blocks are not included here, as DIO
5578 * and fallocate do not need to journal data buffers. 5578 * and fallocate do not need to journal data buffers.
5579 */ 5579 */
5580 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 5580 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5581 { 5581 {
5582 return ext4_meta_trans_blocks(inode, nrblocks, 1); 5582 return ext4_meta_trans_blocks(inode, nrblocks, 1);
5583 } 5583 }
5584 5584
5585 /* 5585 /*
5586 * The caller must have previously called ext4_reserve_inode_write(). 5586 * The caller must have previously called ext4_reserve_inode_write().
5587 * Given this, we know that the caller already has write access to iloc->bh. 5587 * Given this, we know that the caller already has write access to iloc->bh.
5588 */ 5588 */
5589 int ext4_mark_iloc_dirty(handle_t *handle, 5589 int ext4_mark_iloc_dirty(handle_t *handle,
5590 struct inode *inode, struct ext4_iloc *iloc) 5590 struct inode *inode, struct ext4_iloc *iloc)
5591 { 5591 {
5592 int err = 0; 5592 int err = 0;
5593 5593
5594 if (test_opt(inode->i_sb, I_VERSION)) 5594 if (test_opt(inode->i_sb, I_VERSION))
5595 inode_inc_iversion(inode); 5595 inode_inc_iversion(inode);
5596 5596
5597 /* the do_update_inode consumes one bh->b_count */ 5597 /* the do_update_inode consumes one bh->b_count */
5598 get_bh(iloc->bh); 5598 get_bh(iloc->bh);
5599 5599
5600 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5600 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5601 err = ext4_do_update_inode(handle, inode, iloc); 5601 err = ext4_do_update_inode(handle, inode, iloc);
5602 put_bh(iloc->bh); 5602 put_bh(iloc->bh);
5603 return err; 5603 return err;
5604 } 5604 }
5605 5605
5606 /* 5606 /*
5607 * On success, we end up with an outstanding reference count against 5607 * On success, we end up with an outstanding reference count against
5608 * iloc->bh. This _must_ be cleaned up later. 5608 * iloc->bh. This _must_ be cleaned up later.
5609 */ 5609 */
5610 5610
5611 int 5611 int
5612 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5612 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5613 struct ext4_iloc *iloc) 5613 struct ext4_iloc *iloc)
5614 { 5614 {
5615 int err; 5615 int err;
5616 5616
5617 err = ext4_get_inode_loc(inode, iloc); 5617 err = ext4_get_inode_loc(inode, iloc);
5618 if (!err) { 5618 if (!err) {
5619 BUFFER_TRACE(iloc->bh, "get_write_access"); 5619 BUFFER_TRACE(iloc->bh, "get_write_access");
5620 err = ext4_journal_get_write_access(handle, iloc->bh); 5620 err = ext4_journal_get_write_access(handle, iloc->bh);
5621 if (err) { 5621 if (err) {
5622 brelse(iloc->bh); 5622 brelse(iloc->bh);
5623 iloc->bh = NULL; 5623 iloc->bh = NULL;
5624 } 5624 }
5625 } 5625 }
5626 ext4_std_error(inode->i_sb, err); 5626 ext4_std_error(inode->i_sb, err);
5627 return err; 5627 return err;
5628 } 5628 }
5629 5629
5630 /* 5630 /*
5631 * Expand an inode by new_extra_isize bytes. 5631 * Expand an inode by new_extra_isize bytes.
5632 * Returns 0 on success or negative error number on failure. 5632 * Returns 0 on success or negative error number on failure.
5633 */ 5633 */
5634 static int ext4_expand_extra_isize(struct inode *inode, 5634 static int ext4_expand_extra_isize(struct inode *inode,
5635 unsigned int new_extra_isize, 5635 unsigned int new_extra_isize,
5636 struct ext4_iloc iloc, 5636 struct ext4_iloc iloc,
5637 handle_t *handle) 5637 handle_t *handle)
5638 { 5638 {
5639 struct ext4_inode *raw_inode; 5639 struct ext4_inode *raw_inode;
5640 struct ext4_xattr_ibody_header *header; 5640 struct ext4_xattr_ibody_header *header;
5641 5641
5642 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5642 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5643 return 0; 5643 return 0;
5644 5644
5645 raw_inode = ext4_raw_inode(&iloc); 5645 raw_inode = ext4_raw_inode(&iloc);
5646 5646
5647 header = IHDR(inode, raw_inode); 5647 header = IHDR(inode, raw_inode);
5648 5648
5649 /* No extended attributes present */ 5649 /* No extended attributes present */
5650 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5650 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5651 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5651 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5652 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5652 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5653 new_extra_isize); 5653 new_extra_isize);
5654 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5654 EXT4_I(inode)->i_extra_isize = new_extra_isize;
5655 return 0; 5655 return 0;
5656 } 5656 }
5657 5657
5658 /* try to expand with EAs present */ 5658 /* try to expand with EAs present */
5659 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 5659 return ext4_expand_extra_isize_ea(inode, new_extra_isize,
5660 raw_inode, handle); 5660 raw_inode, handle);
5661 } 5661 }
5662 5662
5663 /* 5663 /*
5664 * What we do here is to mark the in-core inode as clean with respect to inode 5664 * What we do here is to mark the in-core inode as clean with respect to inode
5665 * dirtiness (it may still be data-dirty). 5665 * dirtiness (it may still be data-dirty).
5666 * This means that the in-core inode may be reaped by prune_icache 5666 * This means that the in-core inode may be reaped by prune_icache
5667 * without having to perform any I/O. This is a very good thing, 5667 * without having to perform any I/O. This is a very good thing,
5668 * because *any* task may call prune_icache - even ones which 5668 * because *any* task may call prune_icache - even ones which
5669 * have a transaction open against a different journal. 5669 * have a transaction open against a different journal.
5670 * 5670 *
5671 * Is this cheating? Not really. Sure, we haven't written the 5671 * Is this cheating? Not really. Sure, we haven't written the
5672 * inode out, but prune_icache isn't a user-visible syncing function. 5672 * inode out, but prune_icache isn't a user-visible syncing function.
5673 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 5673 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5674 * we start and wait on commits. 5674 * we start and wait on commits.
5675 * 5675 *
5676 * Is this efficient/effective? Well, we're being nice to the system 5676 * Is this efficient/effective? Well, we're being nice to the system
5677 * by cleaning up our inodes proactively so they can be reaped 5677 * by cleaning up our inodes proactively so they can be reaped
5678 * without I/O. But we are potentially leaving up to five seconds' 5678 * without I/O. But we are potentially leaving up to five seconds'
5679 * worth of inodes floating about which prune_icache wants us to 5679 * worth of inodes floating about which prune_icache wants us to
5680 * write out. One way to fix that would be to get prune_icache() 5680 * write out. One way to fix that would be to get prune_icache()
5681 * to do a write_super() to free up some memory. It has the desired 5681 * to do a write_super() to free up some memory. It has the desired
5682 * effect. 5682 * effect.
5683 */ 5683 */
5684 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 5684 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5685 { 5685 {
5686 struct ext4_iloc iloc; 5686 struct ext4_iloc iloc;
5687 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5687 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5688 static unsigned int mnt_count; 5688 static unsigned int mnt_count;
5689 int err, ret; 5689 int err, ret;
5690 5690
5691 might_sleep(); 5691 might_sleep();
5692 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5692 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5693 err = ext4_reserve_inode_write(handle, inode, &iloc); 5693 err = ext4_reserve_inode_write(handle, inode, &iloc);
5694 if (ext4_handle_valid(handle) && 5694 if (ext4_handle_valid(handle) &&
5695 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5695 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5696 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5696 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5697 /* 5697 /*
5698 * We need extra buffer credits since we may write into EA block 5698 * We need extra buffer credits since we may write into EA block
5699 * with this same handle. If journal_extend fails, then it will 5699 * with this same handle. If journal_extend fails, then it will
5700 * only result in a minor loss of functionality for that inode. 5700 * only result in a minor loss of functionality for that inode.
5701 * If this is felt to be critical, then e2fsck should be run to 5701 * If this is felt to be critical, then e2fsck should be run to
5702 * force a large enough s_min_extra_isize. 5702 * force a large enough s_min_extra_isize.
5703 */ 5703 */
5704 if ((jbd2_journal_extend(handle, 5704 if ((jbd2_journal_extend(handle,
5705 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 5705 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
5706 ret = ext4_expand_extra_isize(inode, 5706 ret = ext4_expand_extra_isize(inode,
5707 sbi->s_want_extra_isize, 5707 sbi->s_want_extra_isize,
5708 iloc, handle); 5708 iloc, handle);
5709 if (ret) { 5709 if (ret) {
5710 ext4_set_inode_state(inode, 5710 ext4_set_inode_state(inode,
5711 EXT4_STATE_NO_EXPAND); 5711 EXT4_STATE_NO_EXPAND);
5712 if (mnt_count != 5712 if (mnt_count !=
5713 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5713 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5714 ext4_warning(inode->i_sb, 5714 ext4_warning(inode->i_sb,
5715 "Unable to expand inode %lu. Delete" 5715 "Unable to expand inode %lu. Delete"
5716 " some EAs or run e2fsck.", 5716 " some EAs or run e2fsck.",
5717 inode->i_ino); 5717 inode->i_ino);
5718 mnt_count = 5718 mnt_count =
5719 le16_to_cpu(sbi->s_es->s_mnt_count); 5719 le16_to_cpu(sbi->s_es->s_mnt_count);
5720 } 5720 }
5721 } 5721 }
5722 } 5722 }
5723 } 5723 }
5724 if (!err) 5724 if (!err)
5725 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 5725 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5726 return err; 5726 return err;
5727 } 5727 }
5728 5728
5729 /* 5729 /*
5730 * ext4_dirty_inode() is called from __mark_inode_dirty() 5730 * ext4_dirty_inode() is called from __mark_inode_dirty()
5731 * 5731 *
5732 * We're really interested in the case where a file is being extended. 5732 * We're really interested in the case where a file is being extended.
5733 * i_size has been changed by generic_commit_write() and we thus need 5733 * i_size has been changed by generic_commit_write() and we thus need
5734 * to include the updated inode in the current transaction. 5734 * to include the updated inode in the current transaction.
5735 * 5735 *
5736 * Also, dquot_alloc_block() will always dirty the inode when blocks 5736 * Also, dquot_alloc_block() will always dirty the inode when blocks
5737 * are allocated to the file. 5737 * are allocated to the file.
5738 * 5738 *
5739 * If the inode is marked synchronous, we don't honour that here - doing 5739 * If the inode is marked synchronous, we don't honour that here - doing
5740 * so would cause a commit on atime updates, which we don't bother doing. 5740 * so would cause a commit on atime updates, which we don't bother doing.
5741 * We handle synchronous inodes at the highest possible level. 5741 * We handle synchronous inodes at the highest possible level.
5742 */ 5742 */
5743 void ext4_dirty_inode(struct inode *inode, int flags) 5743 void ext4_dirty_inode(struct inode *inode, int flags)
5744 { 5744 {
5745 handle_t *handle; 5745 handle_t *handle;
5746 5746
5747 handle = ext4_journal_start(inode, 2); 5747 handle = ext4_journal_start(inode, 2);
5748 if (IS_ERR(handle)) 5748 if (IS_ERR(handle))
5749 goto out; 5749 goto out;
5750 5750
5751 ext4_mark_inode_dirty(handle, inode); 5751 ext4_mark_inode_dirty(handle, inode);
5752 5752
5753 ext4_journal_stop(handle); 5753 ext4_journal_stop(handle);
5754 out: 5754 out:
5755 return; 5755 return;
5756 } 5756 }
5757 5757
5758 #if 0 5758 #if 0
5759 /* 5759 /*
5760 * Bind an inode's backing buffer_head into this transaction, to prevent 5760 * Bind an inode's backing buffer_head into this transaction, to prevent
5761 * it from being flushed to disk early. Unlike 5761 * it from being flushed to disk early. Unlike
5762 * ext4_reserve_inode_write, this leaves behind no bh reference and 5762 * ext4_reserve_inode_write, this leaves behind no bh reference and
5763 * returns no iloc structure, so the caller needs to repeat the iloc 5763 * returns no iloc structure, so the caller needs to repeat the iloc
5764 * lookup to mark the inode dirty later. 5764 * lookup to mark the inode dirty later.
5765 */ 5765 */
5766 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5766 static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5767 { 5767 {
5768 struct ext4_iloc iloc; 5768 struct ext4_iloc iloc;
5769 5769
5770 int err = 0; 5770 int err = 0;
5771 if (handle) { 5771 if (handle) {
5772 err = ext4_get_inode_loc(inode, &iloc); 5772 err = ext4_get_inode_loc(inode, &iloc);
5773 if (!err) { 5773 if (!err) {
5774 BUFFER_TRACE(iloc.bh, "get_write_access"); 5774 BUFFER_TRACE(iloc.bh, "get_write_access");
5775 err = jbd2_journal_get_write_access(handle, iloc.bh); 5775 err = jbd2_journal_get_write_access(handle, iloc.bh);
5776 if (!err) 5776 if (!err)
5777 err = ext4_handle_dirty_metadata(handle, 5777 err = ext4_handle_dirty_metadata(handle,
5778 NULL, 5778 NULL,
5779 iloc.bh); 5779 iloc.bh);
5780 brelse(iloc.bh); 5780 brelse(iloc.bh);
5781 } 5781 }
5782 } 5782 }
5783 ext4_std_error(inode->i_sb, err); 5783 ext4_std_error(inode->i_sb, err);
5784 return err; 5784 return err;
5785 } 5785 }
5786 #endif 5786 #endif
5787 5787
5788 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5788 int ext4_change_inode_journal_flag(struct inode *inode, int val)
5789 { 5789 {
5790 journal_t *journal; 5790 journal_t *journal;
5791 handle_t *handle; 5791 handle_t *handle;
5792 int err; 5792 int err;
5793 5793
5794 /* 5794 /*
5795 * We have to be very careful here: changing a data block's 5795 * We have to be very careful here: changing a data block's
5796 * journaling status dynamically is dangerous. If we write a 5796 * journaling status dynamically is dangerous. If we write a
5797 * data block to the journal, change the status and then delete 5797 * data block to the journal, change the status and then delete
5798 * that block, we risk forgetting to revoke the old log record 5798 * that block, we risk forgetting to revoke the old log record
5799 * from the journal and so a subsequent replay can corrupt data. 5799 * from the journal and so a subsequent replay can corrupt data.
5800 * So, first we make sure that the journal is empty and that 5800 * So, first we make sure that the journal is empty and that
5801 * nobody is changing anything. 5801 * nobody is changing anything.
5802 */ 5802 */
5803 5803
5804 journal = EXT4_JOURNAL(inode); 5804 journal = EXT4_JOURNAL(inode);
5805 if (!journal) 5805 if (!journal)
5806 return 0; 5806 return 0;
5807 if (is_journal_aborted(journal)) 5807 if (is_journal_aborted(journal))
5808 return -EROFS; 5808 return -EROFS;
5809 5809
5810 jbd2_journal_lock_updates(journal); 5810 jbd2_journal_lock_updates(journal);
5811 jbd2_journal_flush(journal); 5811 jbd2_journal_flush(journal);
5812 5812
5813 /* 5813 /*
5814 * OK, there are no updates running now, and all cached data is 5814 * OK, there are no updates running now, and all cached data is
5815 * synced to disk. We are now in a completely consistent state 5815 * synced to disk. We are now in a completely consistent state
5816 * which doesn't have anything in the journal, and we know that 5816 * which doesn't have anything in the journal, and we know that
5817 * no filesystem updates are running, so it is safe to modify 5817 * no filesystem updates are running, so it is safe to modify
5818 * the inode's in-core data-journaling state flag now. 5818 * the inode's in-core data-journaling state flag now.
5819 */ 5819 */
5820 5820
5821 if (val) 5821 if (val)
5822 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5822 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5823 else 5823 else
5824 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5824 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5825 ext4_set_aops(inode); 5825 ext4_set_aops(inode);
5826 5826
5827 jbd2_journal_unlock_updates(journal); 5827 jbd2_journal_unlock_updates(journal);
5828 5828
5829 /* Finally we can mark the inode as dirty. */ 5829 /* Finally we can mark the inode as dirty. */
5830 5830
5831 handle = ext4_journal_start(inode, 1); 5831 handle = ext4_journal_start(inode, 1);
5832 if (IS_ERR(handle)) 5832 if (IS_ERR(handle))
5833 return PTR_ERR(handle); 5833 return PTR_ERR(handle);
5834 5834
5835 err = ext4_mark_inode_dirty(handle, inode); 5835 err = ext4_mark_inode_dirty(handle, inode);
5836 ext4_handle_sync(handle); 5836 ext4_handle_sync(handle);
5837 ext4_journal_stop(handle); 5837 ext4_journal_stop(handle);
5838 ext4_std_error(inode->i_sb, err); 5838 ext4_std_error(inode->i_sb, err);
5839 5839
5840 return err; 5840 return err;
5841 } 5841 }
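
/*
 * A minimal sketch (not part of this diff; the helper name and flag
 * handling are illustrative, see fs/ext4/ioctl.c for the real code) of
 * the kind of caller the function above exists for: the EXT4_IOC_SETFLAGS
 * ioctl path, which flips per-inode data journaling when
 * EXT4_JOURNAL_DATA_FL changes.
 */
static int ext4_toggle_data_journal(struct inode *inode,
				    unsigned int oldflags,
				    unsigned int newflags)
{
	/* nothing to do unless the data-journaling flag actually changed */
	if (!((oldflags ^ newflags) & EXT4_JOURNAL_DATA_FL))
		return 0;

	/* flushes the journal, flips the in-core flag, dirties the inode */
	return ext4_change_inode_journal_flag(inode,
					      newflags & EXT4_JOURNAL_DATA_FL);
}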
5842 5842
5843 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5843 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5844 { 5844 {
5845 return !buffer_mapped(bh); 5845 return !buffer_mapped(bh);
5846 } 5846 }
5847 5847
5848 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 5848 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5849 { 5849 {
5850 struct page *page = vmf->page; 5850 struct page *page = vmf->page;
5851 loff_t size; 5851 loff_t size;
5852 unsigned long len; 5852 unsigned long len;
5853 int ret; 5853 int ret;
5854 struct file *file = vma->vm_file; 5854 struct file *file = vma->vm_file;
5855 struct inode *inode = file->f_path.dentry->d_inode; 5855 struct inode *inode = file->f_path.dentry->d_inode;
5856 struct address_space *mapping = inode->i_mapping; 5856 struct address_space *mapping = inode->i_mapping;
5857 handle_t *handle; 5857 handle_t *handle;
5858 get_block_t *get_block; 5858 get_block_t *get_block;
5859 int retries = 0; 5859 int retries = 0;
5860 5860
5861 /* 5861 /*
5862 * This check is racy but catches the common case. We rely on 5862 * This check is racy but catches the common case. We rely on
5863 * __block_page_mkwrite() to do a reliable check. 5863 * __block_page_mkwrite() to do a reliable check.
5864 */ 5864 */
5865 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 5865 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
5866 /* Delalloc case is easy... */ 5866 /* Delalloc case is easy... */
5867 if (test_opt(inode->i_sb, DELALLOC) && 5867 if (test_opt(inode->i_sb, DELALLOC) &&
5868 !ext4_should_journal_data(inode) && 5868 !ext4_should_journal_data(inode) &&
5869 !ext4_nonda_switch(inode->i_sb)) { 5869 !ext4_nonda_switch(inode->i_sb)) {
5870 do { 5870 do {
5871 ret = __block_page_mkwrite(vma, vmf, 5871 ret = __block_page_mkwrite(vma, vmf,
5872 ext4_da_get_block_prep); 5872 ext4_da_get_block_prep);
5873 } while (ret == -ENOSPC && 5873 } while (ret == -ENOSPC &&
5874 ext4_should_retry_alloc(inode->i_sb, &retries)); 5874 ext4_should_retry_alloc(inode->i_sb, &retries));
5875 goto out_ret; 5875 goto out_ret;
5876 } 5876 }
5877 5877
5878 lock_page(page); 5878 lock_page(page);
5879 size = i_size_read(inode); 5879 size = i_size_read(inode);
5880 /* Page got truncated from under us? */ 5880 /* Page got truncated from under us? */
5881 if (page->mapping != mapping || page_offset(page) > size) { 5881 if (page->mapping != mapping || page_offset(page) > size) {
5882 unlock_page(page); 5882 unlock_page(page);
5883 ret = VM_FAULT_NOPAGE; 5883 ret = VM_FAULT_NOPAGE;
5884 goto out; 5884 goto out;
5885 } 5885 }
5886 5886
5887 if (page->index == size >> PAGE_CACHE_SHIFT) 5887 if (page->index == size >> PAGE_CACHE_SHIFT)
5888 len = size & ~PAGE_CACHE_MASK; 5888 len = size & ~PAGE_CACHE_MASK;
5889 else 5889 else
5890 len = PAGE_CACHE_SIZE; 5890 len = PAGE_CACHE_SIZE;
5891 /* 5891 /*
5892 * Return if we have all the buffers mapped. This avoids the need to do 5892 * Return if we have all the buffers mapped. This avoids the need to do
5893 * journal_start/journal_stop which can block and take a long time 5893 * journal_start/journal_stop which can block and take a long time
5894 */ 5894 */
5895 if (page_has_buffers(page)) { 5895 if (page_has_buffers(page)) {
5896 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5896 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5897 ext4_bh_unmapped)) { 5897 ext4_bh_unmapped)) {
5898 /* Wait so that we don't change page under IO */ 5898 /* Wait so that we don't change page under IO */
5899 wait_on_page_writeback(page); 5899 wait_on_page_writeback(page);
5900 ret = VM_FAULT_LOCKED; 5900 ret = VM_FAULT_LOCKED;
5901 goto out; 5901 goto out;
5902 } 5902 }
5903 } 5903 }
5904 unlock_page(page); 5904 unlock_page(page);
5905 /* OK, we need to fill the hole... */ 5905 /* OK, we need to fill the hole... */
5906 if (ext4_should_dioread_nolock(inode)) 5906 if (ext4_should_dioread_nolock(inode))
5907 get_block = ext4_get_block_write; 5907 get_block = ext4_get_block_write;
5908 else 5908 else
5909 get_block = ext4_get_block; 5909 get_block = ext4_get_block;
5910 retry_alloc: 5910 retry_alloc:
5911 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 5911 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
5912 if (IS_ERR(handle)) { 5912 if (IS_ERR(handle)) {
5913 ret = VM_FAULT_SIGBUS; 5913 ret = VM_FAULT_SIGBUS;
5914 goto out; 5914 goto out;
5915 } 5915 }
5916 ret = __block_page_mkwrite(vma, vmf, get_block); 5916 ret = __block_page_mkwrite(vma, vmf, get_block);
5917 if (!ret && ext4_should_journal_data(inode)) { 5917 if (!ret && ext4_should_journal_data(inode)) {
5918 if (walk_page_buffers(handle, page_buffers(page), 0, 5918 if (walk_page_buffers(handle, page_buffers(page), 0,
5919 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 5919 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
5920 unlock_page(page); 5920 unlock_page(page);
5921 ret = VM_FAULT_SIGBUS; 5921 ret = VM_FAULT_SIGBUS;
5922 goto out; 5922 goto out;
5923 } 5923 }
5924 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 5924 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
5925 } 5925 }
5926 ext4_journal_stop(handle); 5926 ext4_journal_stop(handle);
5927 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 5927 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
5928 goto retry_alloc; 5928 goto retry_alloc;
5929 out_ret: 5929 out_ret:
5930 ret = block_page_mkwrite_return(ret); 5930 ret = block_page_mkwrite_return(ret);
5931 out: 5931 out:
5932 return ret; 5932 return ret;
5933 } 5933 }
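
/*
 * For context, a sketch of how a ->page_mkwrite handler like the one
 * above is typically wired up: mmap() installs a vm_operations_struct
 * whose .page_mkwrite points at it, so the fault path calls back into
 * the filesystem before a read-only page is made writable.  Paraphrased
 * from memory of fs/ext4/file.c in this era, not part of this diff.
 */
static const struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= ext4_page_mkwrite,
};

static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* only mappings backed by a ->readpage method can be mmapped */
	if (!file->f_mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &ext4_file_vm_ops;
	return 0;
}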
5934 5934
1 /* 1 /*
2 * fs/fs-writeback.c 2 * fs/fs-writeback.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * Contains all the functions related to writing back and waiting 6 * Contains all the functions related to writing back and waiting
7 * upon dirty inodes against superblocks, and writing back dirty 7 * upon dirty inodes against superblocks, and writing back dirty
8 * pages against inodes. ie: data writeback. Writeout of the 8 * pages against inodes. ie: data writeback. Writeout of the
9 * inode itself is not handled here. 9 * inode itself is not handled here.
10 * 10 *
11 * 10Apr2002 Andrew Morton 11 * 10Apr2002 Andrew Morton
12 * Split out of fs/inode.c 12 * Split out of fs/inode.c
13 * Additions for address_space-based writeback 13 * Additions for address_space-based writeback
14 */ 14 */
15 15
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/spinlock.h> 18 #include <linux/spinlock.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include <linux/sched.h> 20 #include <linux/sched.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/mm.h> 22 #include <linux/mm.h>
23 #include <linux/kthread.h> 23 #include <linux/kthread.h>
24 #include <linux/freezer.h> 24 #include <linux/freezer.h>
25 #include <linux/writeback.h> 25 #include <linux/writeback.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/backing-dev.h> 27 #include <linux/backing-dev.h>
28 #include <linux/buffer_head.h> 28 #include <linux/buffer_head.h>
29 #include <linux/tracepoint.h> 29 #include <linux/tracepoint.h>
30 #include "internal.h" 30 #include "internal.h"
31 31
32 /* 32 /*
33 * Passed into wb_writeback(), essentially a subset of writeback_control 33 * Passed into wb_writeback(), essentially a subset of writeback_control
34 */ 34 */
35 struct wb_writeback_work { 35 struct wb_writeback_work {
36 long nr_pages; 36 long nr_pages;
37 struct super_block *sb; 37 struct super_block *sb;
38 unsigned long *older_than_this;
38 enum writeback_sync_modes sync_mode; 39 enum writeback_sync_modes sync_mode;
40 unsigned int tagged_writepages:1;
39 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
40 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
41 unsigned int for_background:1; 43 unsigned int for_background:1;
42 44
43 struct list_head list; /* pending work list */ 45 struct list_head list; /* pending work list */
44 struct completion *done; /* set if the caller waits */ 46 struct completion *done; /* set if the caller waits */
45 }; 47 };
46 48
47 /* 49 /*
48 * Include the creation of the trace points after defining the 50 * Include the creation of the trace points after defining the
49 * wb_writeback_work structure so that the definition remains local to this 51 * wb_writeback_work structure so that the definition remains local to this
50 * file. 52 * file.
51 */ 53 */
52 #define CREATE_TRACE_POINTS 54 #define CREATE_TRACE_POINTS
53 #include <trace/events/writeback.h> 55 #include <trace/events/writeback.h>
54 56
55 /* 57 /*
56 * We don't actually have pdflush, but this one is exported through /proc... 58 * We don't actually have pdflush, but this one is exported through /proc...
57 */ 59 */
58 int nr_pdflush_threads; 60 int nr_pdflush_threads;
59 61
60 /** 62 /**
61 * writeback_in_progress - determine whether there is writeback in progress 63 * writeback_in_progress - determine whether there is writeback in progress
62 * @bdi: the device's backing_dev_info structure. 64 * @bdi: the device's backing_dev_info structure.
63 * 65 *
64 * Determine whether there is writeback waiting to be handled against a 66 * Determine whether there is writeback waiting to be handled against a
65 * backing device. 67 * backing device.
66 */ 68 */
67 int writeback_in_progress(struct backing_dev_info *bdi) 69 int writeback_in_progress(struct backing_dev_info *bdi)
68 { 70 {
69 return test_bit(BDI_writeback_running, &bdi->state); 71 return test_bit(BDI_writeback_running, &bdi->state);
70 } 72 }
71 73
72 static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 74 static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
73 { 75 {
74 struct super_block *sb = inode->i_sb; 76 struct super_block *sb = inode->i_sb;
75 77
76 if (strcmp(sb->s_type->name, "bdev") == 0) 78 if (strcmp(sb->s_type->name, "bdev") == 0)
77 return inode->i_mapping->backing_dev_info; 79 return inode->i_mapping->backing_dev_info;
78 80
79 return sb->s_bdi; 81 return sb->s_bdi;
80 } 82 }
81 83
82 static inline struct inode *wb_inode(struct list_head *head) 84 static inline struct inode *wb_inode(struct list_head *head)
83 { 85 {
84 return list_entry(head, struct inode, i_wb_list); 86 return list_entry(head, struct inode, i_wb_list);
85 } 87 }
86 88
87 /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ 89 /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 static void bdi_wakeup_flusher(struct backing_dev_info *bdi) 90 static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89 { 91 {
90 if (bdi->wb.task) { 92 if (bdi->wb.task) {
91 wake_up_process(bdi->wb.task); 93 wake_up_process(bdi->wb.task);
92 } else { 94 } else {
93 /* 95 /*
94 * The bdi thread isn't there, wake up the forker thread which 96 * The bdi thread isn't there, wake up the forker thread which
95 * will create and run it. 97 * will create and run it.
96 */ 98 */
97 wake_up_process(default_backing_dev_info.wb.task); 99 wake_up_process(default_backing_dev_info.wb.task);
98 } 100 }
99 } 101 }
100 102
101 static void bdi_queue_work(struct backing_dev_info *bdi, 103 static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work) 104 struct wb_writeback_work *work)
103 { 105 {
104 trace_writeback_queue(bdi, work); 106 trace_writeback_queue(bdi, work);
105 107
106 spin_lock_bh(&bdi->wb_lock); 108 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list); 109 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task) 110 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work); 111 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi); 112 bdi_wakeup_flusher(bdi);
111 spin_unlock_bh(&bdi->wb_lock); 113 spin_unlock_bh(&bdi->wb_lock);
112 } 114 }
113 115
114 static void 116 static void
115 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 117 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
116 bool range_cyclic) 118 bool range_cyclic)
117 { 119 {
118 struct wb_writeback_work *work; 120 struct wb_writeback_work *work;
119 121
120 /* 122 /*
121 * This is WB_SYNC_NONE writeback, so if allocation fails just 123 * This is WB_SYNC_NONE writeback, so if allocation fails just
122 * wakeup the thread for old dirty data writeback 124 * wakeup the thread for old dirty data writeback
123 */ 125 */
124 work = kzalloc(sizeof(*work), GFP_ATOMIC); 126 work = kzalloc(sizeof(*work), GFP_ATOMIC);
125 if (!work) { 127 if (!work) {
126 if (bdi->wb.task) { 128 if (bdi->wb.task) {
127 trace_writeback_nowork(bdi); 129 trace_writeback_nowork(bdi);
128 wake_up_process(bdi->wb.task); 130 wake_up_process(bdi->wb.task);
129 } 131 }
130 return; 132 return;
131 } 133 }
132 134
133 work->sync_mode = WB_SYNC_NONE; 135 work->sync_mode = WB_SYNC_NONE;
134 work->nr_pages = nr_pages; 136 work->nr_pages = nr_pages;
135 work->range_cyclic = range_cyclic; 137 work->range_cyclic = range_cyclic;
136 138
137 bdi_queue_work(bdi, work); 139 bdi_queue_work(bdi, work);
138 } 140 }
139 141
140 /** 142 /**
141 * bdi_start_writeback - start writeback 143 * bdi_start_writeback - start writeback
142 * @bdi: the backing device to write from 144 * @bdi: the backing device to write from
143 * @nr_pages: the number of pages to write 145 * @nr_pages: the number of pages to write
144 * 146 *
145 * Description: 147 * Description:
146 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 148 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
147 * started when this function returns, we make no guarantees on 149 * started when this function returns, we make no guarantees on
148 * completion. Caller need not hold sb s_umount semaphore. 150 * completion. Caller need not hold sb s_umount semaphore.
149 * 151 *
150 */ 152 */
151 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 153 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
152 { 154 {
153 __bdi_start_writeback(bdi, nr_pages, true); 155 __bdi_start_writeback(bdi, nr_pages, true);
154 } 156 }
155 157
156 /** 158 /**
157 * bdi_start_background_writeback - start background writeback 159 * bdi_start_background_writeback - start background writeback
158 * @bdi: the backing device to write from 160 * @bdi: the backing device to write from
159 * 161 *
160 * Description: 162 * Description:
161 * This makes sure WB_SYNC_NONE background writeback happens. When 163 * This makes sure WB_SYNC_NONE background writeback happens. When
162 * this function returns, it is only guaranteed that for given BDI 164 * this function returns, it is only guaranteed that for given BDI
163 * some IO is happening if we are over background dirty threshold. 165 * some IO is happening if we are over background dirty threshold.
164 * Caller need not hold sb s_umount semaphore. 166 * Caller need not hold sb s_umount semaphore.
165 */ 167 */
166 void bdi_start_background_writeback(struct backing_dev_info *bdi) 168 void bdi_start_background_writeback(struct backing_dev_info *bdi)
167 { 169 {
168 /* 170 /*
169 * We just wake up the flusher thread. It will perform background 171 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do. 172 * writeback as soon as there is no other work to do.
171 */ 173 */
172 trace_writeback_wake_background(bdi); 174 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock); 175 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi); 176 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock); 177 spin_unlock_bh(&bdi->wb_lock);
176 } 178 }
177 179
178 /* 180 /*
179 * Remove the inode from the writeback list it is on. 181 * Remove the inode from the writeback list it is on.
180 */ 182 */
181 void inode_wb_list_del(struct inode *inode) 183 void inode_wb_list_del(struct inode *inode)
182 { 184 {
183 spin_lock(&inode_wb_list_lock); 185 struct backing_dev_info *bdi = inode_to_bdi(inode);
186
187 spin_lock(&bdi->wb.list_lock);
184 list_del_init(&inode->i_wb_list); 188 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock); 189 spin_unlock(&bdi->wb.list_lock);
186 } 190 }
187 191
188
189 /* 192 /*
190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 193 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
191 * furthest end of its superblock's dirty-inode list. 194 * furthest end of its superblock's dirty-inode list.
192 * 195 *
193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 196 * Before stamping the inode's ->dirtied_when, we check to see whether it is
194 * already the most-recently-dirtied inode on the b_dirty list. If that is 197 * already the most-recently-dirtied inode on the b_dirty list. If that is
195 * the case then the inode must have been redirtied while it was being written 198 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 199 * out and we don't reset its dirtied_when.
197 */ 200 */
198 static void redirty_tail(struct inode *inode) 201 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
199 { 202 {
200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 203 assert_spin_locked(&wb->list_lock);
201
202 assert_spin_locked(&inode_wb_list_lock);
203 if (!list_empty(&wb->b_dirty)) { 204 if (!list_empty(&wb->b_dirty)) {
204 struct inode *tail; 205 struct inode *tail;
205 206
206 tail = wb_inode(wb->b_dirty.next); 207 tail = wb_inode(wb->b_dirty.next);
207 if (time_before(inode->dirtied_when, tail->dirtied_when)) 208 if (time_before(inode->dirtied_when, tail->dirtied_when))
208 inode->dirtied_when = jiffies; 209 inode->dirtied_when = jiffies;
209 } 210 }
210 list_move(&inode->i_wb_list, &wb->b_dirty); 211 list_move(&inode->i_wb_list, &wb->b_dirty);
211 } 212 }
212 213
213 /* 214 /*
214 * requeue inode for re-scanning after bdi->b_io list is exhausted. 215 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 216 */
216 static void requeue_io(struct inode *inode) 217 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
217 { 218 {
218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 219 assert_spin_locked(&wb->list_lock);
219
220 assert_spin_locked(&inode_wb_list_lock);
221 list_move(&inode->i_wb_list, &wb->b_more_io); 220 list_move(&inode->i_wb_list, &wb->b_more_io);
222 } 221 }
223 222
224 static void inode_sync_complete(struct inode *inode) 223 static void inode_sync_complete(struct inode *inode)
225 { 224 {
226 /* 225 /*
227 * Prevent speculative execution through 226 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock); 227 * spin_unlock(&wb->list_lock);
229 */ 228 */
230 229
231 smp_mb(); 230 smp_mb();
232 wake_up_bit(&inode->i_state, __I_SYNC); 231 wake_up_bit(&inode->i_state, __I_SYNC);
233 } 232 }
234 233
235 static bool inode_dirtied_after(struct inode *inode, unsigned long t) 234 static bool inode_dirtied_after(struct inode *inode, unsigned long t)
236 { 235 {
237 bool ret = time_after(inode->dirtied_when, t); 236 bool ret = time_after(inode->dirtied_when, t);
238 #ifndef CONFIG_64BIT 237 #ifndef CONFIG_64BIT
239 /* 238 /*
240 * For inodes being constantly redirtied, dirtied_when can get stuck. 239 * For inodes being constantly redirtied, dirtied_when can get stuck.
241 * It _appears_ to be in the future, but is actually in the distant past. 240 * It _appears_ to be in the future, but is actually in the distant past.
242 * This test is necessary to prevent such wrapped-around relative times 241 * This test is necessary to prevent such wrapped-around relative times
243 * from permanently stopping the whole bdi writeback. 242 * from permanently stopping the whole bdi writeback.
244 */ 243 */
245 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 244 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
246 #endif 245 #endif
247 return ret; 246 return ret;
248 } 247 }
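
/*
 * A standalone illustration (made-up numbers, not kernel code) of the
 * 32-bit wrap-around case the #ifndef CONFIG_64BIT branch above guards
 * against: an inode dirtied more than 2^31 jiffies before the cutoff
 * compares as "after" it, and only the extra time_before_eq() test lets
 * it still be treated as expired.
 */
#include <stdint.h>
#include <stdio.h>

/* same arithmetic as the kernel's time_after()/time_before_eq(), on 32 bits */
#define time_after(a, b)	((int32_t)((b) - (a)) < 0)
#define time_before_eq(a, b)	((int32_t)((b) - (a)) >= 0)

int main(void)
{
	uint32_t jiffies      = 0x90002000u;	/* "now" */
	uint32_t cutoff       = 0x90000000u;	/* *older_than_this */
	uint32_t dirtied_when = 0x00001000u;	/* stamped ages ago, pre-wrap */

	int looks_future = time_after(dirtied_when, cutoff);		/* 1 */
	int plausible    = time_before_eq(dirtied_when, jiffies);	/* 0 */

	/* inode_dirtied_after() would return (looks_future && plausible),
	 * i.e. 0 here, so the stale inode is still queued for writeback
	 * instead of blocking the whole bdi forever */
	printf("%d && %d -> %d\n", looks_future, plausible,
	       looks_future && plausible);
	return 0;
}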
249 248
250 /* 249 /*
251 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 250 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
252 */ 251 */
253 static void move_expired_inodes(struct list_head *delaying_queue, 252 static int move_expired_inodes(struct list_head *delaying_queue,
254 struct list_head *dispatch_queue, 253 struct list_head *dispatch_queue,
255 unsigned long *older_than_this) 254 unsigned long *older_than_this)
256 { 255 {
257 LIST_HEAD(tmp); 256 LIST_HEAD(tmp);
258 struct list_head *pos, *node; 257 struct list_head *pos, *node;
259 struct super_block *sb = NULL; 258 struct super_block *sb = NULL;
260 struct inode *inode; 259 struct inode *inode;
261 int do_sb_sort = 0; 260 int do_sb_sort = 0;
261 int moved = 0;
262 262
263 while (!list_empty(delaying_queue)) { 263 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 264 inode = wb_inode(delaying_queue->prev);
265 if (older_than_this && 265 if (older_than_this &&
266 inode_dirtied_after(inode, *older_than_this)) 266 inode_dirtied_after(inode, *older_than_this))
267 break; 267 break;
268 if (sb && sb != inode->i_sb) 268 if (sb && sb != inode->i_sb)
269 do_sb_sort = 1; 269 do_sb_sort = 1;
270 sb = inode->i_sb; 270 sb = inode->i_sb;
271 list_move(&inode->i_wb_list, &tmp); 271 list_move(&inode->i_wb_list, &tmp);
272 moved++;
272 } 273 }
273 274
274 /* just one sb in list, splice to dispatch_queue and we're done */ 275 /* just one sb in list, splice to dispatch_queue and we're done */
275 if (!do_sb_sort) { 276 if (!do_sb_sort) {
276 list_splice(&tmp, dispatch_queue); 277 list_splice(&tmp, dispatch_queue);
277 return; 278 goto out;
278 } 279 }
279 280
280 /* Move inodes from one superblock together */ 281 /* Move inodes from one superblock together */
281 while (!list_empty(&tmp)) { 282 while (!list_empty(&tmp)) {
282 sb = wb_inode(tmp.prev)->i_sb; 283 sb = wb_inode(tmp.prev)->i_sb;
283 list_for_each_prev_safe(pos, node, &tmp) { 284 list_for_each_prev_safe(pos, node, &tmp) {
284 inode = wb_inode(pos); 285 inode = wb_inode(pos);
285 if (inode->i_sb == sb) 286 if (inode->i_sb == sb)
286 list_move(&inode->i_wb_list, dispatch_queue); 287 list_move(&inode->i_wb_list, dispatch_queue);
287 } 288 }
288 } 289 }
290 out:
291 return moved;
289 } 292 }
290 293
291 /* 294 /*
292 * Queue all expired dirty inodes for io, eldest first. 295 * Queue all expired dirty inodes for io, eldest first.
293 * Before 296 * Before
294 * newly dirtied b_dirty b_io b_more_io 297 * newly dirtied b_dirty b_io b_more_io
295 * =============> gf edc BA 298 * =============> gf edc BA
296 * After 299 * After
297 * newly dirtied b_dirty b_io b_more_io 300 * newly dirtied b_dirty b_io b_more_io
298 * =============> g fBAedc 301 * =============> g fBAedc
299 * | 302 * |
300 * +--> dequeue for IO 303 * +--> dequeue for IO
301 */ 304 */
302 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 305 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
303 { 306 {
304 assert_spin_locked(&inode_wb_list_lock); 307 int moved;
308 assert_spin_locked(&wb->list_lock);
305 list_splice_init(&wb->b_more_io, &wb->b_io); 309 list_splice_init(&wb->b_more_io, &wb->b_io);
306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
311 trace_writeback_queue_io(wb, older_than_this, moved);
307 } 312 }
308 313
309 static int write_inode(struct inode *inode, struct writeback_control *wbc) 314 static int write_inode(struct inode *inode, struct writeback_control *wbc)
310 { 315 {
311 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 316 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
312 return inode->i_sb->s_op->write_inode(inode, wbc); 317 return inode->i_sb->s_op->write_inode(inode, wbc);
313 return 0; 318 return 0;
314 } 319 }
315 320
316 /* 321 /*
317 * Wait for writeback on an inode to complete. 322 * Wait for writeback on an inode to complete.
318 */ 323 */
319 static void inode_wait_for_writeback(struct inode *inode) 324 static void inode_wait_for_writeback(struct inode *inode,
325 struct bdi_writeback *wb)
320 { 326 {
321 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 327 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
322 wait_queue_head_t *wqh; 328 wait_queue_head_t *wqh;
323 329
324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 330 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
325 while (inode->i_state & I_SYNC) { 331 while (inode->i_state & I_SYNC) {
326 spin_unlock(&inode->i_lock); 332 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock); 333 spin_unlock(&wb->list_lock);
328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 334 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
329 spin_lock(&inode_wb_list_lock); 335 spin_lock(&wb->list_lock);
330 spin_lock(&inode->i_lock); 336 spin_lock(&inode->i_lock);
331 } 337 }
332 } 338 }
333 339
334 /* 340 /*
335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and 341 * Write out an inode's dirty pages. Called under wb->list_lock and
336 * inode->i_lock. Either the caller has an active reference on the inode or 342 * inode->i_lock. Either the caller has an active reference on the inode or
337 * the inode has I_WILL_FREE set. 343 * the inode has I_WILL_FREE set.
338 * 344 *
339 * If `wait' is set, wait on the writeout. 345 * If `wait' is set, wait on the writeout.
340 * 346 *
341 * The whole writeout design is quite complex and fragile. We want to avoid 347 * The whole writeout design is quite complex and fragile. We want to avoid
342 * starvation of particular inodes when others are being redirtied, prevent 348 * starvation of particular inodes when others are being redirtied, prevent
343 * livelocks, etc. 349 * livelocks, etc.
344 */ 350 */
345 static int 351 static int
346 writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 352 writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
353 struct writeback_control *wbc)
347 { 354 {
348 struct address_space *mapping = inode->i_mapping; 355 struct address_space *mapping = inode->i_mapping;
356 long nr_to_write = wbc->nr_to_write;
349 unsigned dirty; 357 unsigned dirty;
350 int ret; 358 int ret;
351 359
352 assert_spin_locked(&inode_wb_list_lock); 360 assert_spin_locked(&wb->list_lock);
353 assert_spin_locked(&inode->i_lock); 361 assert_spin_locked(&inode->i_lock);
354 362
355 if (!atomic_read(&inode->i_count)) 363 if (!atomic_read(&inode->i_count))
356 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 364 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
357 else 365 else
358 WARN_ON(inode->i_state & I_WILL_FREE); 366 WARN_ON(inode->i_state & I_WILL_FREE);
359 367
360 if (inode->i_state & I_SYNC) { 368 if (inode->i_state & I_SYNC) {
361 /* 369 /*
362 * If this inode is locked for writeback and we are not doing 370 * If this inode is locked for writeback and we are not doing
363 * writeback-for-data-integrity, move it to b_more_io so that 371 * writeback-for-data-integrity, move it to b_more_io so that
364 * writeback can proceed with the other inodes on s_io. 372 * writeback can proceed with the other inodes on s_io.
365 * 373 *
366 * We'll have another go at writing back this inode when we 374 * We'll have another go at writing back this inode when we
367 * have completed a full scan of b_io. 375 * have completed a full scan of b_io.
368 */ 376 */
369 if (wbc->sync_mode != WB_SYNC_ALL) { 377 if (wbc->sync_mode != WB_SYNC_ALL) {
370 requeue_io(inode); 378 requeue_io(inode, wb);
379 trace_writeback_single_inode_requeue(inode, wbc,
380 nr_to_write);
371 return 0; 381 return 0;
372 } 382 }
373 383
374 /* 384 /*
375 * It's a data-integrity sync. We must wait. 385 * It's a data-integrity sync. We must wait.
376 */ 386 */
377 inode_wait_for_writeback(inode); 387 inode_wait_for_writeback(inode, wb);
378 } 388 }
379 389
380 BUG_ON(inode->i_state & I_SYNC); 390 BUG_ON(inode->i_state & I_SYNC);
381 391
382 /* Set I_SYNC, reset I_DIRTY_PAGES */ 392 /* Set I_SYNC, reset I_DIRTY_PAGES */
383 inode->i_state |= I_SYNC; 393 inode->i_state |= I_SYNC;
384 inode->i_state &= ~I_DIRTY_PAGES; 394 inode->i_state &= ~I_DIRTY_PAGES;
385 spin_unlock(&inode->i_lock); 395 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock); 396 spin_unlock(&wb->list_lock);
387 397
388 ret = do_writepages(mapping, wbc); 398 ret = do_writepages(mapping, wbc);
389 399
390 /* 400 /*
391 * Make sure to wait on the data before writing out the metadata. 401 * Make sure to wait on the data before writing out the metadata.
392 * This is important for filesystems that modify metadata on data 402 * This is important for filesystems that modify metadata on data
393 * I/O completion. 403 * I/O completion.
394 */ 404 */
395 if (wbc->sync_mode == WB_SYNC_ALL) { 405 if (wbc->sync_mode == WB_SYNC_ALL) {
396 int err = filemap_fdatawait(mapping); 406 int err = filemap_fdatawait(mapping);
397 if (ret == 0) 407 if (ret == 0)
398 ret = err; 408 ret = err;
399 } 409 }
400 410
401 /* 411 /*
402 * Some filesystems may redirty the inode during the writeback 412 * Some filesystems may redirty the inode during the writeback
403 * due to delalloc, clear dirty metadata flags right before 413 * due to delalloc, clear dirty metadata flags right before
404 * write_inode() 414 * write_inode()
405 */ 415 */
406 spin_lock(&inode->i_lock); 416 spin_lock(&inode->i_lock);
407 dirty = inode->i_state & I_DIRTY; 417 dirty = inode->i_state & I_DIRTY;
408 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 418 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
409 spin_unlock(&inode->i_lock); 419 spin_unlock(&inode->i_lock);
410 /* Don't write the inode if only I_DIRTY_PAGES was set */ 420 /* Don't write the inode if only I_DIRTY_PAGES was set */
411 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 421 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
412 int err = write_inode(inode, wbc); 422 int err = write_inode(inode, wbc);
413 if (ret == 0) 423 if (ret == 0)
414 ret = err; 424 ret = err;
415 } 425 }
416 426
417 spin_lock(&inode_wb_list_lock); 427 spin_lock(&wb->list_lock);
418 spin_lock(&inode->i_lock); 428 spin_lock(&inode->i_lock);
419 inode->i_state &= ~I_SYNC; 429 inode->i_state &= ~I_SYNC;
420 if (!(inode->i_state & I_FREEING)) { 430 if (!(inode->i_state & I_FREEING)) {
431 /*
432 * Sync livelock prevention. Each inode is tagged and synced in
433 * one shot. If still dirty, it will be redirty_tail()'ed below.
434 * Update the dirty time to prevent it from being enqueued and synced again.
435 */
436 if ((inode->i_state & I_DIRTY) &&
437 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
438 inode->dirtied_when = jiffies;
439
421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 440 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
422 /* 441 /*
423 * We didn't write back all the pages. nfs_writepages() 442 * We didn't write back all the pages. nfs_writepages()
424 * sometimes bales out without doing anything. 443 * sometimes bales out without doing anything.
425 */ 444 */
426 inode->i_state |= I_DIRTY_PAGES; 445 inode->i_state |= I_DIRTY_PAGES;
427 if (wbc->nr_to_write <= 0) { 446 if (wbc->nr_to_write <= 0) {
428 /* 447 /*
429 * slice used up: queue for next turn 448 * slice used up: queue for next turn
430 */ 449 */
431 requeue_io(inode); 450 requeue_io(inode, wb);
432 } else { 451 } else {
433 /* 452 /*
434 * Writeback blocked by something other than 453 * Writeback blocked by something other than
435 * congestion. Delay the inode for some time to 454 * congestion. Delay the inode for some time to
436 * avoid spinning on the CPU (100% iowait) 455 * avoid spinning on the CPU (100% iowait)
437 * retrying writeback of the dirty page/inode 456 * retrying writeback of the dirty page/inode
438 * that cannot be performed immediately. 457 * that cannot be performed immediately.
439 */ 458 */
440 redirty_tail(inode); 459 redirty_tail(inode, wb);
441 } 460 }
442 } else if (inode->i_state & I_DIRTY) { 461 } else if (inode->i_state & I_DIRTY) {
443 /* 462 /*
444 * Filesystems can dirty the inode during writeback 463 * Filesystems can dirty the inode during writeback
445 * operations, such as delayed allocation during 464 * operations, such as delayed allocation during
446 * submission or metadata updates after data IO 465 * submission or metadata updates after data IO
447 * completion. 466 * completion.
448 */ 467 */
449 redirty_tail(inode); 468 redirty_tail(inode, wb);
450 } else { 469 } else {
451 /* 470 /*
452 * The inode is clean. At this point we either have 471 * The inode is clean. At this point we either have
453 * a reference to the inode or it's on its way out. 472 * a reference to the inode or it's on its way out.
454 * No need to add it back to the LRU. 473 * No need to add it back to the LRU.
455 */ 474 */
456 list_del_init(&inode->i_wb_list); 475 list_del_init(&inode->i_wb_list);
457 } 476 }
458 } 477 }
459 inode_sync_complete(inode); 478 inode_sync_complete(inode);
479 trace_writeback_single_inode(inode, wbc, nr_to_write);
460 return ret; 480 return ret;
461 } 481 }
462 482
483 static long writeback_chunk_size(struct backing_dev_info *bdi,
484 struct wb_writeback_work *work)
485 {
486 long pages;
487
488 /*
489 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
490 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
491 * here avoids calling into writeback_inodes_wb() more than once.
492 *
493 * The intended call sequence for WB_SYNC_ALL writeback is:
494 *
495 * wb_writeback()
496 * writeback_sb_inodes() <== called only once
497 * write_cache_pages() <== called once for each inode
498 * (quickly) tag currently dirty pages
499 * (maybe slowly) sync all tagged pages
500 */
501 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
502 pages = LONG_MAX;
503 else {
504 pages = min(bdi->avg_write_bandwidth / 2,
505 global_dirty_limit / DIRTY_SCOPE);
506 pages = min(pages, work->nr_pages);
507 pages = round_down(pages + MIN_WRITEBACK_PAGES,
508 MIN_WRITEBACK_PAGES);
509 }
510
511 return pages;
512 }
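
/*
 * A worked example of the WB_SYNC_NONE branch above, with illustrative
 * numbers (512 for MIN_WRITEBACK_PAGES and 8 for DIRTY_SCOPE are
 * assumptions made here for the arithmetic; the real values live in
 * include/linux/writeback.h):
 *
 *	bdi->avg_write_bandwidth = 25600 pages/s  (~100 MB/s at 4 KiB pages)
 *	global_dirty_limit       = 200000 pages
 *	work->nr_pages           = 30000 pages
 *
 *	pages = min(25600 / 2, 200000 / 8)   = min(12800, 25000) = 12800
 *	pages = min(12800, 30000)            = 12800
 *	pages = round_down(12800 + 512, 512) = 13312
 *
 * i.e. roughly half a second worth of IO is aimed at one inode before the
 * flusher re-checks its termination conditions, and the chunk never drops
 * below MIN_WRITEBACK_PAGES.
 */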
513
463 /* 514 /*
464 * Write a portion of b_io inodes which belong to @sb. 515 * Write a portion of b_io inodes which belong to @sb.
465 * 516 *
466 * If @only_this_sb is true, then find and write all such 517 * If @only_this_sb is true, then find and write all such
467 * inodes. Otherwise write only ones which go sequentially 518 * inodes. Otherwise write only ones which go sequentially
468 * in reverse order. 519 * in reverse order.
469 * 520 *
470 * Return 1, if the caller writeback routine should be 521 * Return the number of pages and/or inodes written.
471 * interrupted. Otherwise return 0.
472 */ 522 */
473 static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, 523 static long writeback_sb_inodes(struct super_block *sb,
474 struct writeback_control *wbc, bool only_this_sb) 524 struct bdi_writeback *wb,
525 struct wb_writeback_work *work)
475 { 526 {
527 struct writeback_control wbc = {
528 .sync_mode = work->sync_mode,
529 .tagged_writepages = work->tagged_writepages,
530 .for_kupdate = work->for_kupdate,
531 .for_background = work->for_background,
532 .range_cyclic = work->range_cyclic,
533 .range_start = 0,
534 .range_end = LLONG_MAX,
535 };
536 unsigned long start_time = jiffies;
537 long write_chunk;
538 long wrote = 0; /* count both pages and inodes */
539
476 while (!list_empty(&wb->b_io)) { 540 while (!list_empty(&wb->b_io)) {
477 long pages_skipped;
478 struct inode *inode = wb_inode(wb->b_io.prev); 541 struct inode *inode = wb_inode(wb->b_io.prev);
479 542
480 if (inode->i_sb != sb) { 543 if (inode->i_sb != sb) {
481 if (only_this_sb) { 544 if (work->sb) {
482 /* 545 /*
483 * We only want to write back data for this 546 * We only want to write back data for this
484 * superblock, move all inodes not belonging 547 * superblock, move all inodes not belonging
485 * to it back onto the dirty list. 548 * to it back onto the dirty list.
486 */ 549 */
487 redirty_tail(inode); 550 redirty_tail(inode, wb);
488 continue; 551 continue;
489 } 552 }
490 553
491 /* 554 /*
492 * The inode belongs to a different superblock. 555 * The inode belongs to a different superblock.
493 * Bounce back to the caller to unpin this and 556 * Bounce back to the caller to unpin this and
494 * pin the next superblock. 557 * pin the next superblock.
495 */ 558 */
496 return 0; 559 break;
497 } 560 }
498 561
499 /* 562 /*
500 * Don't bother with new inodes or inodes being freed, first 563 * Don't bother with new inodes or inodes being freed, first
501 * kind does not need periodic writeout yet, and for the latter 564 * kind does not need periodic writeout yet, and for the latter
502 * kind writeout is handled by the freer. 565 * kind writeout is handled by the freer.
503 */ 566 */
504 spin_lock(&inode->i_lock); 567 spin_lock(&inode->i_lock);
505 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 568 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
506 spin_unlock(&inode->i_lock); 569 spin_unlock(&inode->i_lock);
507 requeue_io(inode); 570 redirty_tail(inode, wb);
508 continue; 571 continue;
509 } 572 }
510
511 /*
512 * Was this inode dirtied after sync_sb_inodes was called?
513 * This keeps sync from extra jobs and livelock.
514 */
515 if (inode_dirtied_after(inode, wbc->wb_start)) {
516 spin_unlock(&inode->i_lock);
517 return 1;
518 }
519
520 __iget(inode); 573 __iget(inode);
574 write_chunk = writeback_chunk_size(wb->bdi, work);
575 wbc.nr_to_write = write_chunk;
576 wbc.pages_skipped = 0;
521 577
522 pages_skipped = wbc->pages_skipped; 578 writeback_single_inode(inode, wb, &wbc);
523 writeback_single_inode(inode, wbc); 579
524 if (wbc->pages_skipped != pages_skipped) { 580 work->nr_pages -= write_chunk - wbc.nr_to_write;
581 wrote += write_chunk - wbc.nr_to_write;
582 if (!(inode->i_state & I_DIRTY))
583 wrote++;
584 if (wbc.pages_skipped) {
525 /* 585 /*
526 * writeback is not making progress due to locked 586 * writeback is not making progress due to locked
527 * buffers. Skip this inode for now. 587 * buffers. Skip this inode for now.
528 */ 588 */
529 redirty_tail(inode); 589 redirty_tail(inode, wb);
530 } 590 }
531 spin_unlock(&inode->i_lock); 591 spin_unlock(&inode->i_lock);
532 spin_unlock(&inode_wb_list_lock); 592 spin_unlock(&wb->list_lock);
533 iput(inode); 593 iput(inode);
534 cond_resched(); 594 cond_resched();
535 spin_lock(&inode_wb_list_lock); 595 spin_lock(&wb->list_lock);
536 if (wbc->nr_to_write <= 0) { 596 /*
537 wbc->more_io = 1; 597 * bail out to wb_writeback() often enough to check
538 return 1; 598 * background threshold and other termination conditions.
599 */
600 if (wrote) {
601 if (time_is_before_jiffies(start_time + HZ / 10UL))
602 break;
603 if (work->nr_pages <= 0)
604 break;
539 } 605 }
540 if (!list_empty(&wb->b_more_io))
541 wbc->more_io = 1;
542 } 606 }
543 /* b_io is empty */ 607 return wrote;
544 return 1;
545 } 608 }
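
/*
 * On the bail-out above: HZ jiffies make up one second, so
 * "start_time + HZ / 10UL" is 100ms after the loop was entered.  Once
 * some progress has been made, control therefore returns to
 * wb_writeback() at least every 100ms (or as soon as work->nr_pages is
 * exhausted) so the background threshold and the other termination
 * conditions are re-checked reasonably often.
 */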
546 609
547 void writeback_inodes_wb(struct bdi_writeback *wb, 610 static long __writeback_inodes_wb(struct bdi_writeback *wb,
548 struct writeback_control *wbc) 611 struct wb_writeback_work *work)
549 { 612 {
550 int ret = 0; 613 unsigned long start_time = jiffies;
614 long wrote = 0;
551 615
552 if (!wbc->wb_start)
553 wbc->wb_start = jiffies; /* livelock avoidance */
554 spin_lock(&inode_wb_list_lock);
555 if (!wbc->for_kupdate || list_empty(&wb->b_io))
556 queue_io(wb, wbc->older_than_this);
557
558 while (!list_empty(&wb->b_io)) { 616 while (!list_empty(&wb->b_io)) {
559 struct inode *inode = wb_inode(wb->b_io.prev); 617 struct inode *inode = wb_inode(wb->b_io.prev);
560 struct super_block *sb = inode->i_sb; 618 struct super_block *sb = inode->i_sb;
561 619
562 if (!grab_super_passive(sb)) { 620 if (!grab_super_passive(sb)) {
563 requeue_io(inode); 621 requeue_io(inode, wb);
564 continue; 622 continue;
565 } 623 }
566 ret = writeback_sb_inodes(sb, wb, wbc, false); 624 wrote += writeback_sb_inodes(sb, wb, work);
567 drop_super(sb); 625 drop_super(sb);
568 626
569 if (ret) 627 /* refer to the same tests at the end of writeback_sb_inodes */
570 break; 628 if (wrote) {
629 if (time_is_before_jiffies(start_time + HZ / 10UL))
630 break;
631 if (work->nr_pages <= 0)
632 break;
633 }
571 } 634 }
572 spin_unlock(&inode_wb_list_lock);
573 /* Leave any unwritten inodes on b_io */ 635 /* Leave any unwritten inodes on b_io */
636 return wrote;
574 } 637 }
575 638
576 static void __writeback_inodes_sb(struct super_block *sb, 639 long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
577 struct bdi_writeback *wb, struct writeback_control *wbc)
578 { 640 {
579 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 641 struct wb_writeback_work work = {
642 .nr_pages = nr_pages,
643 .sync_mode = WB_SYNC_NONE,
644 .range_cyclic = 1,
645 };
580 646
581 spin_lock(&inode_wb_list_lock); 647 spin_lock(&wb->list_lock);
582 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 648 if (list_empty(&wb->b_io))
583 queue_io(wb, wbc->older_than_this); 649 queue_io(wb, NULL);
584 writeback_sb_inodes(sb, wb, wbc, true); 650 __writeback_inodes_wb(wb, &work);
585 spin_unlock(&inode_wb_list_lock); 651 spin_unlock(&wb->list_lock);
652
653 return nr_pages - work.nr_pages;
586 } 654 }
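
/*
 * A minimal sketch (helper name and shape are illustrative, paraphrased
 * from memory of mm/page-writeback.c around this series, not part of
 * this diff) of the throttling-path caller of writeback_inodes_wb(): a
 * task that has exceeded its bdi's dirty threshold cleans a chunk
 * directly and uses the returned page count to decide whether it has
 * done its duty.
 */
static bool dirty_throttle_writeout(struct backing_dev_info *bdi,
				    long write_chunk)
{
	long pages_written = writeback_inodes_wb(&bdi->wb, write_chunk);

	/* enough progress: the caller can stop throttling this task,
	 * otherwise balance_dirty_pages() would sleep and re-check limits */
	return pages_written >= write_chunk;
}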
587 655
588 /*
589 * The maximum number of pages to writeout in a single bdi flush/kupdate
590 * operation. We do this so we don't hold I_SYNC against an inode for
591 * enormous amounts of time, which would block a userspace task which has
592 * been forced to throttle against that inode. Also, the code reevaluates
593 * the dirty limits each time it has written this many pages.
594 */
595 #define MAX_WRITEBACK_PAGES 1024
596
597 static inline bool over_bground_thresh(void) 656 static inline bool over_bground_thresh(void)
598 { 657 {
599 unsigned long background_thresh, dirty_thresh; 658 unsigned long background_thresh, dirty_thresh;
600 659
601 global_dirty_limits(&background_thresh, &dirty_thresh); 660 global_dirty_limits(&background_thresh, &dirty_thresh);
602 661
603 return (global_page_state(NR_FILE_DIRTY) + 662 return (global_page_state(NR_FILE_DIRTY) +
604 global_page_state(NR_UNSTABLE_NFS) > background_thresh); 663 global_page_state(NR_UNSTABLE_NFS) > background_thresh);
605 } 664 }
606 665
607 /* 666 /*
667 * Called under wb->list_lock. If there are multiple wb per bdi,
668 * only the flusher working on the first wb should do it.
669 */
670 static void wb_update_bandwidth(struct bdi_writeback *wb,
671 unsigned long start_time)
672 {
673 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
674 }
675
676 /*
608 * Explicit flushing or periodic writeback of "old" data. 677 * Explicit flushing or periodic writeback of "old" data.
609 * 678 *
610 * Define "old": the first time one of an inode's pages is dirtied, we mark the 679 * Define "old": the first time one of an inode's pages is dirtied, we mark the
611 * dirtying-time in the inode's address_space. So this periodic writeback code 680 * dirtying-time in the inode's address_space. So this periodic writeback code
612 * just walks the superblock inode list, writing back any inodes which are 681 * just walks the superblock inode list, writing back any inodes which are
613 * older than a specific point in time. 682 * older than a specific point in time.
614 * 683 *
615 * Try to run once per dirty_writeback_interval. But if a writeback event 684 * Try to run once per dirty_writeback_interval. But if a writeback event
616 * takes longer than a dirty_writeback_interval interval, then leave a 685 * takes longer than a dirty_writeback_interval interval, then leave a
617 * one-second gap. 686 * one-second gap.
618 * 687 *
619 * older_than_this takes precedence over nr_to_write. So we'll only write back 688 * older_than_this takes precedence over nr_to_write. So we'll only write back
620 * all dirty pages if they are all attached to "old" mappings. 689 * all dirty pages if they are all attached to "old" mappings.
621 */ 690 */
622 static long wb_writeback(struct bdi_writeback *wb, 691 static long wb_writeback(struct bdi_writeback *wb,
623 struct wb_writeback_work *work) 692 struct wb_writeback_work *work)
624 { 693 {
625 struct writeback_control wbc = { 694 unsigned long wb_start = jiffies;
626 .sync_mode = work->sync_mode, 695 long nr_pages = work->nr_pages;
627 .older_than_this = NULL,
628 .for_kupdate = work->for_kupdate,
629 .for_background = work->for_background,
630 .range_cyclic = work->range_cyclic,
631 };
632 unsigned long oldest_jif; 696 unsigned long oldest_jif;
633 long wrote = 0;
634 long write_chunk;
635 struct inode *inode; 697 struct inode *inode;
698 long progress;
636 699
637 if (wbc.for_kupdate) { 700 oldest_jif = jiffies;
638 wbc.older_than_this = &oldest_jif; 701 work->older_than_this = &oldest_jif;
639 oldest_jif = jiffies -
640 msecs_to_jiffies(dirty_expire_interval * 10);
641 }
642 if (!wbc.range_cyclic) {
643 wbc.range_start = 0;
644 wbc.range_end = LLONG_MAX;
645 }
646 702
647 /* 703 spin_lock(&wb->list_lock);
648 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
649 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
650 * here avoids calling into writeback_inodes_wb() more than once.
651 *
652 * The intended call sequence for WB_SYNC_ALL writeback is:
653 *
654 * wb_writeback()
655 * __writeback_inodes_sb() <== called only once
656 * write_cache_pages() <== called once for each inode
657 * (quickly) tag currently dirty pages
658 * (maybe slowly) sync all tagged pages
659 */
660 if (wbc.sync_mode == WB_SYNC_NONE)
661 write_chunk = MAX_WRITEBACK_PAGES;
662 else
663 write_chunk = LONG_MAX;
664
665 wbc.wb_start = jiffies; /* livelock avoidance */
666 for (;;) { 704 for (;;) {
667 /* 705 /*
668 * Stop writeback when nr_pages has been consumed 706 * Stop writeback when nr_pages has been consumed
669 */ 707 */
670 if (work->nr_pages <= 0) 708 if (work->nr_pages <= 0)
671 break; 709 break;
672 710
673 /* 711 /*
674 * Background writeout and kupdate-style writeback may 712 * Background writeout and kupdate-style writeback may
675 * run forever. Stop them if there is other work to do 713 * run forever. Stop them if there is other work to do
676 * so that e.g. sync can proceed. They'll be restarted 714 * so that e.g. sync can proceed. They'll be restarted
677 * after the other work items are all done. 715 * after the other work items are all done.
678 */ 716 */
679 if ((work->for_background || work->for_kupdate) && 717 if ((work->for_background || work->for_kupdate) &&
680 !list_empty(&wb->bdi->work_list)) 718 !list_empty(&wb->bdi->work_list))
681 break; 719 break;
682 720
683 /* 721 /*
684 * For background writeout, stop when we are below the 722 * For background writeout, stop when we are below the
685 * background dirty threshold 723 * background dirty threshold
686 */ 724 */
687 if (work->for_background && !over_bground_thresh()) 725 if (work->for_background && !over_bground_thresh())
688 break; 726 break;
689 727
690 wbc.more_io = 0; 728 if (work->for_kupdate) {
691 wbc.nr_to_write = write_chunk; 729 oldest_jif = jiffies -
692 wbc.pages_skipped = 0; 730 msecs_to_jiffies(dirty_expire_interval * 10);
731 work->older_than_this = &oldest_jif;
732 }
693 733
694 trace_wbc_writeback_start(&wbc, wb->bdi); 734 trace_writeback_start(wb->bdi, work);
735 if (list_empty(&wb->b_io))
736 queue_io(wb, work->older_than_this);
695 if (work->sb) 737 if (work->sb)
696 __writeback_inodes_sb(work->sb, wb, &wbc); 738 progress = writeback_sb_inodes(work->sb, wb, work);
697 else 739 else
698 writeback_inodes_wb(wb, &wbc); 740 progress = __writeback_inodes_wb(wb, work);
699 trace_wbc_writeback_written(&wbc, wb->bdi); 741 trace_writeback_written(wb->bdi, work);
700 742
701 work->nr_pages -= write_chunk - wbc.nr_to_write; 743 wb_update_bandwidth(wb, wb_start);
702 wrote += write_chunk - wbc.nr_to_write;
703 744
704 /* 745 /*
705 * If we consumed everything, see if we have more 746 * Did we write something? Try for more
747 *
748 * Dirty inodes are moved to b_io for writeback in batches.
749 * The completion of the current batch does not necessarily
750 * mean the overall work is done. So we keep looping as long
751 * as we made some progress on cleaning pages or inodes.
706 */ 752 */
707 if (wbc.nr_to_write <= 0) 753 if (progress)
708 continue; 754 continue;
709 /* 755 /*
710 * Didn't write everything and we don't have more IO, bail 756 * No more inodes for IO, bail
711 */ 757 */
712 if (!wbc.more_io) 758 if (list_empty(&wb->b_more_io))
713 break; 759 break;
714 /* 760 /*
715 * Did we write something? Try for more
716 */
717 if (wbc.nr_to_write < write_chunk)
718 continue;
719 /*
720 * Nothing written. Wait for some inode to 761 * Nothing written. Wait for some inode to
721 * become available for writeback. Otherwise 762 * become available for writeback. Otherwise
722 * we'll just busyloop. 763 * we'll just busyloop.
723 */ 764 */
724 spin_lock(&inode_wb_list_lock);
725 if (!list_empty(&wb->b_more_io)) { 765 if (!list_empty(&wb->b_more_io)) {
766 trace_writeback_wait(wb->bdi, work);
726 inode = wb_inode(wb->b_more_io.prev); 767 inode = wb_inode(wb->b_more_io.prev);
727 trace_wbc_writeback_wait(&wbc, wb->bdi);
728 spin_lock(&inode->i_lock); 768 spin_lock(&inode->i_lock);
729 inode_wait_for_writeback(inode); 769 inode_wait_for_writeback(inode, wb);
730 spin_unlock(&inode->i_lock); 770 spin_unlock(&inode->i_lock);
731 } 771 }
732 spin_unlock(&inode_wb_list_lock);
733 } 772 }
773 spin_unlock(&wb->list_lock);
734 774
735 return wrote; 775 return nr_pages - work->nr_pages;
736 } 776 }
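
/*
 * A worked example of the for_kupdate cutoff above, assuming the default
 * vm.dirty_expire_centisecs value of 3000 (see mm/page-writeback.c):
 * dirty_expire_interval is in centiseconds, so
 *
 *	oldest_jif = jiffies - msecs_to_jiffies(3000 * 10)
 *		   = jiffies - 30 seconds worth of jiffies
 *
 * and queue_io() only pulls in inodes that have been dirty for at least
 * 30 seconds, while background and explicit writeback leave oldest_jif
 * at the current jiffies and take everything.
 */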
737 777
738 /* 778 /*
739 * Return the next wb_writeback_work struct that hasn't been processed yet. 779 * Return the next wb_writeback_work struct that hasn't been processed yet.
740 */ 780 */
741 static struct wb_writeback_work * 781 static struct wb_writeback_work *
742 get_next_work_item(struct backing_dev_info *bdi) 782 get_next_work_item(struct backing_dev_info *bdi)
743 { 783 {
744 struct wb_writeback_work *work = NULL; 784 struct wb_writeback_work *work = NULL;
745 785
746 spin_lock_bh(&bdi->wb_lock); 786 spin_lock_bh(&bdi->wb_lock);
747 if (!list_empty(&bdi->work_list)) { 787 if (!list_empty(&bdi->work_list)) {
748 work = list_entry(bdi->work_list.next, 788 work = list_entry(bdi->work_list.next,
749 struct wb_writeback_work, list); 789 struct wb_writeback_work, list);
750 list_del_init(&work->list); 790 list_del_init(&work->list);
751 } 791 }
752 spin_unlock_bh(&bdi->wb_lock); 792 spin_unlock_bh(&bdi->wb_lock);
753 return work; 793 return work;
754 } 794 }
755 795
756 /* 796 /*
757 * Add in the number of potentially dirty inodes, because each inode 797 * Add in the number of potentially dirty inodes, because each inode
758 * write can dirty pagecache in the underlying blockdev. 798 * write can dirty pagecache in the underlying blockdev.
759 */ 799 */
760 static unsigned long get_nr_dirty_pages(void) 800 static unsigned long get_nr_dirty_pages(void)
761 { 801 {
762 return global_page_state(NR_FILE_DIRTY) + 802 return global_page_state(NR_FILE_DIRTY) +
763 global_page_state(NR_UNSTABLE_NFS) + 803 global_page_state(NR_UNSTABLE_NFS) +
764 get_nr_dirty_inodes(); 804 get_nr_dirty_inodes();
765 } 805 }
766 806
767 static long wb_check_background_flush(struct bdi_writeback *wb) 807 static long wb_check_background_flush(struct bdi_writeback *wb)
768 { 808 {
769 if (over_bground_thresh()) { 809 if (over_bground_thresh()) {
770 810
771 struct wb_writeback_work work = { 811 struct wb_writeback_work work = {
772 .nr_pages = LONG_MAX, 812 .nr_pages = LONG_MAX,
773 .sync_mode = WB_SYNC_NONE, 813 .sync_mode = WB_SYNC_NONE,
774 .for_background = 1, 814 .for_background = 1,
775 .range_cyclic = 1, 815 .range_cyclic = 1,
776 }; 816 };
777 817
778 return wb_writeback(wb, &work); 818 return wb_writeback(wb, &work);
779 } 819 }
780 820
781 return 0; 821 return 0;
782 } 822 }
783 823
784 static long wb_check_old_data_flush(struct bdi_writeback *wb) 824 static long wb_check_old_data_flush(struct bdi_writeback *wb)
785 { 825 {
786 unsigned long expired; 826 unsigned long expired;
787 long nr_pages; 827 long nr_pages;
788 828
789 /* 829 /*
790 * When set to zero, disable periodic writeback 830 * When set to zero, disable periodic writeback
791 */ 831 */
792 if (!dirty_writeback_interval) 832 if (!dirty_writeback_interval)
793 return 0; 833 return 0;
794 834
795 expired = wb->last_old_flush + 835 expired = wb->last_old_flush +
796 msecs_to_jiffies(dirty_writeback_interval * 10); 836 msecs_to_jiffies(dirty_writeback_interval * 10);
797 if (time_before(jiffies, expired)) 837 if (time_before(jiffies, expired))
798 return 0; 838 return 0;
799 839
800 wb->last_old_flush = jiffies; 840 wb->last_old_flush = jiffies;
801 nr_pages = get_nr_dirty_pages(); 841 nr_pages = get_nr_dirty_pages();
802 842
803 if (nr_pages) { 843 if (nr_pages) {
804 struct wb_writeback_work work = { 844 struct wb_writeback_work work = {
805 .nr_pages = nr_pages, 845 .nr_pages = nr_pages,
806 .sync_mode = WB_SYNC_NONE, 846 .sync_mode = WB_SYNC_NONE,
807 .for_kupdate = 1, 847 .for_kupdate = 1,
808 .range_cyclic = 1, 848 .range_cyclic = 1,
809 }; 849 };
810 850
811 return wb_writeback(wb, &work); 851 return wb_writeback(wb, &work);
812 } 852 }
813 853
814 return 0; 854 return 0;
815 } 855 }
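
The expiry test above is a generic "has the interval elapsed" check: dirty_writeback_interval is in centiseconds, hence the `* 10` to reach milliseconds before converting to jiffies. A userspace sketch of the same check, assuming a monotonic millisecond clock and hypothetical names (jiffies wraparound handling is not modelled):

/*
 * Interval-expiry check, modelled in userspace. The interval is given in
 * centiseconds (like the sysctl), so multiply by 10 for milliseconds.
 */
#include <stdbool.h>
#include <time.h>

static unsigned long now_ms(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long)ts.tv_sec * 1000UL + ts.tv_nsec / 1000000UL;
}

/* true (and the flush time is recorded) when the interval has elapsed */
static bool old_data_flush_due(unsigned long *last_flush_ms,
                               unsigned int interval_centisecs)
{
        unsigned long expired;

        if (!interval_centisecs)        /* zero disables periodic writeback */
                return false;

        expired = *last_flush_ms + interval_centisecs * 10UL;
        if (now_ms() < expired)
                return false;

        *last_flush_ms = now_ms();
        return true;
}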
816 856
817 /* 857 /*
818 * Retrieve work items and do the writeback they describe 858 * Retrieve work items and do the writeback they describe
819 */ 859 */
820 long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 860 long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
821 { 861 {
822 struct backing_dev_info *bdi = wb->bdi; 862 struct backing_dev_info *bdi = wb->bdi;
823 struct wb_writeback_work *work; 863 struct wb_writeback_work *work;
824 long wrote = 0; 864 long wrote = 0;
825 865
826 set_bit(BDI_writeback_running, &wb->bdi->state); 866 set_bit(BDI_writeback_running, &wb->bdi->state);
827 while ((work = get_next_work_item(bdi)) != NULL) { 867 while ((work = get_next_work_item(bdi)) != NULL) {
828 /* 868 /*
829 * Override sync mode, in case we must wait for completion 869 * Override sync mode, in case we must wait for completion
830 * because this thread is exiting now. 870 * because this thread is exiting now.
831 */ 871 */
832 if (force_wait) 872 if (force_wait)
833 work->sync_mode = WB_SYNC_ALL; 873 work->sync_mode = WB_SYNC_ALL;
834 874
835 trace_writeback_exec(bdi, work); 875 trace_writeback_exec(bdi, work);
836 876
837 wrote += wb_writeback(wb, work); 877 wrote += wb_writeback(wb, work);
838 878
839 /* 879 /*
840 * Notify the caller of completion if this is a synchronous 880 * Notify the caller of completion if this is a synchronous
841 * work item, otherwise just free it. 881 * work item, otherwise just free it.
842 */ 882 */
843 if (work->done) 883 if (work->done)
844 complete(work->done); 884 complete(work->done);
845 else 885 else
846 kfree(work); 886 kfree(work);
847 } 887 }
848 888
849 /* 889 /*
850 * Check for periodic writeback, kupdated() style 890 * Check for periodic writeback, kupdated() style
851 */ 891 */
852 wrote += wb_check_old_data_flush(wb); 892 wrote += wb_check_old_data_flush(wb);
853 wrote += wb_check_background_flush(wb); 893 wrote += wb_check_background_flush(wb);
854 clear_bit(BDI_writeback_running, &wb->bdi->state); 894 clear_bit(BDI_writeback_running, &wb->bdi->state);
855 895
856 return wrote; 896 return wrote;
857 } 897 }
858 898
859 /* 899 /*
860 * Handle writeback of dirty data for the device backed by this bdi. Also 900 * Handle writeback of dirty data for the device backed by this bdi. Also
861 * wakes up periodically and does kupdated style flushing. 901 * wakes up periodically and does kupdated style flushing.
862 */ 902 */
863 int bdi_writeback_thread(void *data) 903 int bdi_writeback_thread(void *data)
864 { 904 {
865 struct bdi_writeback *wb = data; 905 struct bdi_writeback *wb = data;
866 struct backing_dev_info *bdi = wb->bdi; 906 struct backing_dev_info *bdi = wb->bdi;
867 long pages_written; 907 long pages_written;
868 908
869 current->flags |= PF_SWAPWRITE; 909 current->flags |= PF_SWAPWRITE;
870 set_freezable(); 910 set_freezable();
871 wb->last_active = jiffies; 911 wb->last_active = jiffies;
872 912
873 /* 913 /*
874 * Our parent may run at a different priority; just set us to normal 914 * Our parent may run at a different priority; just set us to normal
875 */ 915 */
876 set_user_nice(current, 0); 916 set_user_nice(current, 0);
877 917
878 trace_writeback_thread_start(bdi); 918 trace_writeback_thread_start(bdi);
879 919
880 while (!kthread_should_stop()) { 920 while (!kthread_should_stop()) {
881 /* 921 /*
882 * Remove own delayed wake-up timer, since we are already awake 922 * Remove own delayed wake-up timer, since we are already awake
883 * and we'll take care of the periodic write-back. 923 * and we'll take care of the periodic write-back.
884 */ 924 */
885 del_timer(&wb->wakeup_timer); 925 del_timer(&wb->wakeup_timer);
886 926
887 pages_written = wb_do_writeback(wb, 0); 927 pages_written = wb_do_writeback(wb, 0);
888 928
889 trace_writeback_pages_written(pages_written); 929 trace_writeback_pages_written(pages_written);
890 930
891 if (pages_written) 931 if (pages_written)
892 wb->last_active = jiffies; 932 wb->last_active = jiffies;
893 933
894 set_current_state(TASK_INTERRUPTIBLE); 934 set_current_state(TASK_INTERRUPTIBLE);
895 if (!list_empty(&bdi->work_list) || kthread_should_stop()) { 935 if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
896 __set_current_state(TASK_RUNNING); 936 __set_current_state(TASK_RUNNING);
897 continue; 937 continue;
898 } 938 }
899 939
900 if (wb_has_dirty_io(wb) && dirty_writeback_interval) 940 if (wb_has_dirty_io(wb) && dirty_writeback_interval)
901 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 941 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
902 else { 942 else {
903 /* 943 /*
904 * We have nothing to do, so we can go to sleep without any 944 * We have nothing to do, so we can go to sleep without any
905 * timeout and save power. When work is queued or 945 * timeout and save power. When work is queued or
906 * something is made dirty, we will be woken up. 946 * something is made dirty, we will be woken up.
907 */ 947 */
908 schedule(); 948 schedule();
909 } 949 }
910 950
911 try_to_freeze(); 951 try_to_freeze();
912 } 952 }
913 953
914 /* Flush any work that raced with us exiting */ 954 /* Flush any work that raced with us exiting */
915 if (!list_empty(&bdi->work_list)) 955 if (!list_empty(&bdi->work_list))
916 wb_do_writeback(wb, 1); 956 wb_do_writeback(wb, 1);
917 957
918 trace_writeback_thread_stop(bdi); 958 trace_writeback_thread_stop(bdi);
919 return 0; 959 return 0;
920 } 960 }
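
The sleep policy at the bottom of the thread loop (a timed sleep while there is dirty IO and periodic writeback is enabled, an open-ended sleep otherwise, woken early when work is queued) can be modelled with a condition variable. A userspace sketch under those assumptions; the structure and field names are hypothetical, not kernel API:

/*
 * Sleep policy of a flusher-like thread, modelled with a condition variable.
 * Hypothetical structure; the producer signals 'wake' after queueing work.
 */
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct flusher {
        pthread_mutex_t lock;
        pthread_cond_t  wake;           /* signalled when work is queued */
        bool            has_dirty_io;
        unsigned int    interval_centisecs;
};

static void flusher_sleep(struct flusher *f)
{
        pthread_mutex_lock(&f->lock);
        if (f->has_dirty_io && f->interval_centisecs) {
                struct timespec until;

                /* timed sleep: wake up in time for the next periodic flush */
                clock_gettime(CLOCK_REALTIME, &until);
                until.tv_sec  += f->interval_centisecs / 100;
                until.tv_nsec += (long)(f->interval_centisecs % 100) * 10000000L;
                if (until.tv_nsec >= 1000000000L) {
                        until.tv_sec++;
                        until.tv_nsec -= 1000000000L;
                }
                pthread_cond_timedwait(&f->wake, &f->lock, &until);
        } else {
                /* nothing to do: sleep without a timeout and save power */
                pthread_cond_wait(&f->wake, &f->lock);
        }
        pthread_mutex_unlock(&f->lock);
}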
921 961
922 962
923 /* 963 /*
924 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 964 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
925 * the whole world. 965 * the whole world.
926 */ 966 */
927 void wakeup_flusher_threads(long nr_pages) 967 void wakeup_flusher_threads(long nr_pages)
928 { 968 {
929 struct backing_dev_info *bdi; 969 struct backing_dev_info *bdi;
930 970
931 if (!nr_pages) { 971 if (!nr_pages) {
932 nr_pages = global_page_state(NR_FILE_DIRTY) + 972 nr_pages = global_page_state(NR_FILE_DIRTY) +
933 global_page_state(NR_UNSTABLE_NFS); 973 global_page_state(NR_UNSTABLE_NFS);
934 } 974 }
935 975
936 rcu_read_lock(); 976 rcu_read_lock();
937 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 977 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
938 if (!bdi_has_dirty_io(bdi)) 978 if (!bdi_has_dirty_io(bdi))
939 continue; 979 continue;
940 __bdi_start_writeback(bdi, nr_pages, false); 980 __bdi_start_writeback(bdi, nr_pages, false);
941 } 981 }
942 rcu_read_unlock(); 982 rcu_read_unlock();
943 } 983 }
944 984
945 static noinline void block_dump___mark_inode_dirty(struct inode *inode) 985 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
946 { 986 {
947 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 987 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
948 struct dentry *dentry; 988 struct dentry *dentry;
949 const char *name = "?"; 989 const char *name = "?";
950 990
951 dentry = d_find_alias(inode); 991 dentry = d_find_alias(inode);
952 if (dentry) { 992 if (dentry) {
953 spin_lock(&dentry->d_lock); 993 spin_lock(&dentry->d_lock);
954 name = (const char *) dentry->d_name.name; 994 name = (const char *) dentry->d_name.name;
955 } 995 }
956 printk(KERN_DEBUG 996 printk(KERN_DEBUG
957 "%s(%d): dirtied inode %lu (%s) on %s\n", 997 "%s(%d): dirtied inode %lu (%s) on %s\n",
958 current->comm, task_pid_nr(current), inode->i_ino, 998 current->comm, task_pid_nr(current), inode->i_ino,
959 name, inode->i_sb->s_id); 999 name, inode->i_sb->s_id);
960 if (dentry) { 1000 if (dentry) {
961 spin_unlock(&dentry->d_lock); 1001 spin_unlock(&dentry->d_lock);
962 dput(dentry); 1002 dput(dentry);
963 } 1003 }
964 } 1004 }
965 } 1005 }
966 1006
967 /** 1007 /**
968 * __mark_inode_dirty - internal function 1008 * __mark_inode_dirty - internal function
969 * @inode: inode to mark 1009 * @inode: inode to mark
970 * @flags: what kind of dirty (e.g. I_DIRTY_SYNC) 1010 * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
971 * Mark an inode as dirty. Callers should use mark_inode_dirty or 1011 * Mark an inode as dirty. Callers should use mark_inode_dirty or
972 * mark_inode_dirty_sync. 1012 * mark_inode_dirty_sync.
973 * 1013 *
974 * Put the inode on the super block's dirty list. 1014 * Put the inode on the super block's dirty list.
975 * 1015 *
976 * CAREFUL! We mark it dirty unconditionally, but move it onto the 1016 * CAREFUL! We mark it dirty unconditionally, but move it onto the
977 * dirty list only if it is hashed or if it refers to a blockdev. 1017 * dirty list only if it is hashed or if it refers to a blockdev.
978 * If it was not hashed, it will never be added to the dirty list 1018 * If it was not hashed, it will never be added to the dirty list
979 * even if it is later hashed, as it will have been marked dirty already. 1019 * even if it is later hashed, as it will have been marked dirty already.
980 * 1020 *
981 * In short, make sure you hash any inodes _before_ you start marking 1021 * In short, make sure you hash any inodes _before_ you start marking
982 * them dirty. 1022 * them dirty.
983 * 1023 *
984 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of 1024 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
985 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of 1025 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
986 * the kernel-internal blockdev inode represents the dirtying time of the 1026 * the kernel-internal blockdev inode represents the dirtying time of the
987 * blockdev's pages. This is why for I_DIRTY_PAGES we always use 1027 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
988 * page->mapping->host, so the page-dirtying time is recorded in the internal 1028 * page->mapping->host, so the page-dirtying time is recorded in the internal
989 * blockdev inode. 1029 * blockdev inode.
990 */ 1030 */
991 void __mark_inode_dirty(struct inode *inode, int flags) 1031 void __mark_inode_dirty(struct inode *inode, int flags)
992 { 1032 {
993 struct super_block *sb = inode->i_sb; 1033 struct super_block *sb = inode->i_sb;
994 struct backing_dev_info *bdi = NULL; 1034 struct backing_dev_info *bdi = NULL;
995 1035
996 /* 1036 /*
997 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1037 * Don't do this for I_DIRTY_PAGES - that doesn't actually
998 * dirty the inode itself 1038 * dirty the inode itself
999 */ 1039 */
1000 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1040 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1001 if (sb->s_op->dirty_inode) 1041 if (sb->s_op->dirty_inode)
1002 sb->s_op->dirty_inode(inode, flags); 1042 sb->s_op->dirty_inode(inode, flags);
1003 } 1043 }
1004 1044
1005 /* 1045 /*
1006 * make sure that changes are seen by all cpus before we test i_state 1046 * make sure that changes are seen by all cpus before we test i_state
1007 * -- mikulas 1047 * -- mikulas
1008 */ 1048 */
1009 smp_mb(); 1049 smp_mb();
1010 1050
1011 /* avoid the locking if we can */ 1051 /* avoid the locking if we can */
1012 if ((inode->i_state & flags) == flags) 1052 if ((inode->i_state & flags) == flags)
1013 return; 1053 return;
1014 1054
1015 if (unlikely(block_dump)) 1055 if (unlikely(block_dump))
1016 block_dump___mark_inode_dirty(inode); 1056 block_dump___mark_inode_dirty(inode);
1017 1057
1018 spin_lock(&inode->i_lock); 1058 spin_lock(&inode->i_lock);
1019 if ((inode->i_state & flags) != flags) { 1059 if ((inode->i_state & flags) != flags) {
1020 const int was_dirty = inode->i_state & I_DIRTY; 1060 const int was_dirty = inode->i_state & I_DIRTY;
1021 1061
1022 inode->i_state |= flags; 1062 inode->i_state |= flags;
1023 1063
1024 /* 1064 /*
1025 * If the inode is being synced, just update its dirty state. 1065 * If the inode is being synced, just update its dirty state.
1026 * The unlocker will place the inode on the appropriate 1066 * The unlocker will place the inode on the appropriate
1027 * superblock list, based upon its state. 1067 * superblock list, based upon its state.
1028 */ 1068 */
1029 if (inode->i_state & I_SYNC) 1069 if (inode->i_state & I_SYNC)
1030 goto out_unlock_inode; 1070 goto out_unlock_inode;
1031 1071
1032 /* 1072 /*
1033 * Only add valid (hashed) inodes to the superblock's 1073 * Only add valid (hashed) inodes to the superblock's
1034 * dirty list. Add blockdev inodes as well. 1074 * dirty list. Add blockdev inodes as well.
1035 */ 1075 */
1036 if (!S_ISBLK(inode->i_mode)) { 1076 if (!S_ISBLK(inode->i_mode)) {
1037 if (inode_unhashed(inode)) 1077 if (inode_unhashed(inode))
1038 goto out_unlock_inode; 1078 goto out_unlock_inode;
1039 } 1079 }
1040 if (inode->i_state & I_FREEING) 1080 if (inode->i_state & I_FREEING)
1041 goto out_unlock_inode; 1081 goto out_unlock_inode;
1042 1082
1043 /* 1083 /*
1044 * If the inode was already on b_dirty/b_io/b_more_io, don't 1084 * If the inode was already on b_dirty/b_io/b_more_io, don't
1045 * reposition it (that would break b_dirty time-ordering). 1085 * reposition it (that would break b_dirty time-ordering).
1046 */ 1086 */
1047 if (!was_dirty) { 1087 if (!was_dirty) {
1048 bool wakeup_bdi = false; 1088 bool wakeup_bdi = false;
1049 bdi = inode_to_bdi(inode); 1089 bdi = inode_to_bdi(inode);
1050 1090
1051 if (bdi_cap_writeback_dirty(bdi)) { 1091 if (bdi_cap_writeback_dirty(bdi)) {
1052 WARN(!test_bit(BDI_registered, &bdi->state), 1092 WARN(!test_bit(BDI_registered, &bdi->state),
1053 "bdi-%s not registered\n", bdi->name); 1093 "bdi-%s not registered\n", bdi->name);
1054 1094
1055 /* 1095 /*
1056 * If this is the first dirty inode for this 1096 * If this is the first dirty inode for this
1057 * bdi, we have to wake-up the corresponding 1097 * bdi, we have to wake-up the corresponding
1058 * bdi thread to make sure background 1098 * bdi thread to make sure background
1059 * write-back happens later. 1099 * write-back happens later.
1060 */ 1100 */
1061 if (!wb_has_dirty_io(&bdi->wb)) 1101 if (!wb_has_dirty_io(&bdi->wb))
1062 wakeup_bdi = true; 1102 wakeup_bdi = true;
1063 } 1103 }
1064 1104
1065 spin_unlock(&inode->i_lock); 1105 spin_unlock(&inode->i_lock);
1066 spin_lock(&inode_wb_list_lock); 1106 spin_lock(&bdi->wb.list_lock);
1067 inode->dirtied_when = jiffies; 1107 inode->dirtied_when = jiffies;
1068 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1108 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1069 spin_unlock(&inode_wb_list_lock); 1109 spin_unlock(&bdi->wb.list_lock);
1070 1110
1071 if (wakeup_bdi) 1111 if (wakeup_bdi)
1072 bdi_wakeup_thread_delayed(bdi); 1112 bdi_wakeup_thread_delayed(bdi);
1073 return; 1113 return;
1074 } 1114 }
1075 } 1115 }
1076 out_unlock_inode: 1116 out_unlock_inode:
1077 spin_unlock(&inode->i_lock); 1117 spin_unlock(&inode->i_lock);
1078 1118
1079 } 1119 }
1080 EXPORT_SYMBOL(__mark_inode_dirty); 1120 EXPORT_SYMBOL(__mark_inode_dirty);
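
The "avoid the locking if we can" fast path above is a check of the state bits without the lock, repeated under inode->i_lock before the expensive list manipulation. A userspace model of that pattern, with hypothetical names; C11 atomics stand in for the explicit memory barrier used by the real code:

/*
 * "Avoid the locking if we can": unlocked check of the state bits, then a
 * re-check under the lock. Hypothetical names; not kernel code.
 */
#include <pthread.h>
#include <stdatomic.h>

struct object {
        pthread_mutex_t      lock;      /* plays the role of inode->i_lock */
        _Atomic unsigned int state;     /* dirty-state bits */
};

static void mark_dirty(struct object *o, unsigned int flags)
{
        /* fast path: every requested bit is already set, nothing to do */
        if ((o->state & flags) == flags)
                return;

        pthread_mutex_lock(&o->lock);
        if ((o->state & flags) != flags) {
                int was_dirty = o->state != 0;

                o->state |= flags;
                if (!was_dirty) {
                        /* first transition to dirty: move onto the dirty
                         * list, wake the flusher, etc. (omitted) */
                }
        }
        pthread_mutex_unlock(&o->lock);
}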
1081 1121
1082 /* 1122 /*
1083 * Write out a superblock's list of dirty inodes. A wait will be performed 1123 * Write out a superblock's list of dirty inodes. A wait will be performed
1084 * upon no inodes, all inodes or the final one, depending upon sync_mode. 1124 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1085 * 1125 *
1086 * If older_than_this is non-NULL, then only write out inodes which 1126 * If older_than_this is non-NULL, then only write out inodes which
1087 * had their first dirtying at a time earlier than *older_than_this. 1127 * had their first dirtying at a time earlier than *older_than_this.
1088 * 1128 *
1089 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 1129 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1090 * This function assumes that the blockdev superblock's inodes are backed by 1130 * This function assumes that the blockdev superblock's inodes are backed by
1091 * a variety of queues, so all inodes are searched. For other superblocks, 1131 * a variety of queues, so all inodes are searched. For other superblocks,
1092 * assume that all inodes are backed by the same queue. 1132 * assume that all inodes are backed by the same queue.
1093 * 1133 *
1094 * The inodes to be written are parked on bdi->b_io. They are moved back onto 1134 * The inodes to be written are parked on bdi->b_io. They are moved back onto
1095 * bdi->b_dirty as they are selected for writing. This way, none can be missed 1135 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1096 * on the writer throttling path, and we get decent balancing between many 1136 * on the writer throttling path, and we get decent balancing between many
1097 * throttled threads: we don't want them all piling up on inode_sync_wait. 1137 * throttled threads: we don't want them all piling up on inode_sync_wait.
1098 */ 1138 */
1099 static void wait_sb_inodes(struct super_block *sb) 1139 static void wait_sb_inodes(struct super_block *sb)
1100 { 1140 {
1101 struct inode *inode, *old_inode = NULL; 1141 struct inode *inode, *old_inode = NULL;
1102 1142
1103 /* 1143 /*
1104 * We need to be protected against the filesystem going from 1144 * We need to be protected against the filesystem going from
1105 * r/o to r/w or vice versa. 1145 * r/o to r/w or vice versa.
1106 */ 1146 */
1107 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1147 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1108 1148
1109 spin_lock(&inode_sb_list_lock); 1149 spin_lock(&inode_sb_list_lock);
1110 1150
1111 /* 1151 /*
1112 * Data integrity sync. Must wait for all pages under writeback, 1152 * Data integrity sync. Must wait for all pages under writeback,
1113 * because there may have been pages dirtied before our sync 1153 * because there may have been pages dirtied before our sync
1114 * call, but whose writeout was started before we could write them out. 1154 * call, but whose writeout was started before we could write them out.
1115 * In that case, the inode may not be on the dirty list, but 1155 * In that case, the inode may not be on the dirty list, but
1116 * we still have to wait for that writeout. 1156 * we still have to wait for that writeout.
1117 */ 1157 */
1118 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1158 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1119 struct address_space *mapping = inode->i_mapping; 1159 struct address_space *mapping = inode->i_mapping;
1120 1160
1121 spin_lock(&inode->i_lock); 1161 spin_lock(&inode->i_lock);
1122 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 1162 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1123 (mapping->nrpages == 0)) { 1163 (mapping->nrpages == 0)) {
1124 spin_unlock(&inode->i_lock); 1164 spin_unlock(&inode->i_lock);
1125 continue; 1165 continue;
1126 } 1166 }
1127 __iget(inode); 1167 __iget(inode);
1128 spin_unlock(&inode->i_lock); 1168 spin_unlock(&inode->i_lock);
1129 spin_unlock(&inode_sb_list_lock); 1169 spin_unlock(&inode_sb_list_lock);
1130 1170
1131 /* 1171 /*
1132 * We hold a reference to 'inode' so it couldn't have been 1172 * We hold a reference to 'inode' so it couldn't have been
1133 * removed from s_inodes list while we dropped the 1173 * removed from s_inodes list while we dropped the
1134 * inode_sb_list_lock. We cannot iput the inode now as we can 1174 * inode_sb_list_lock. We cannot iput the inode now as we can
1135 * be holding the last reference and we cannot iput it under 1175 * be holding the last reference and we cannot iput it under
1136 * inode_sb_list_lock. So we keep the reference and iput it 1176 * inode_sb_list_lock. So we keep the reference and iput it
1137 * later. 1177 * later.
1138 */ 1178 */
1139 iput(old_inode); 1179 iput(old_inode);
1140 old_inode = inode; 1180 old_inode = inode;
1141 1181
1142 filemap_fdatawait(mapping); 1182 filemap_fdatawait(mapping);
1143 1183
1144 cond_resched(); 1184 cond_resched();
1145 1185
1146 spin_lock(&inode_sb_list_lock); 1186 spin_lock(&inode_sb_list_lock);
1147 } 1187 }
1148 spin_unlock(&inode_sb_list_lock); 1188 spin_unlock(&inode_sb_list_lock);
1149 iput(old_inode); 1189 iput(old_inode);
1150 } 1190 }
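
wait_sb_inodes() iterates a lock-protected list while repeatedly dropping the lock: it pins the current inode with __iget(), drops inode_sb_list_lock before the blocking filemap_fdatawait(), and defers the iput() of the previous inode until the lock is no longer held. A userspace sketch of that reference-counted iteration pattern, with hypothetical names:

/*
 * Reference-counted list walk that drops the lock around blocking work.
 * Hypothetical names; not kernel code.
 */
#include <pthread.h>

struct node {
        struct node *next;
        int          refcount;          /* protected by list_lock */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void node_get(struct node *n)    /* caller holds list_lock */
{
        n->refcount++;
}

static void node_put(struct node *n)    /* caller must NOT hold list_lock */
{
        pthread_mutex_lock(&list_lock);
        if (--n->refcount == 0) {
                /* unlink and free -- omitted for brevity */
        }
        pthread_mutex_unlock(&list_lock);
}

static void wait_for_io(struct node *n) /* stands in for filemap_fdatawait() */
{
        (void)n;                        /* may block for a long time */
}

static void wait_all(void)
{
        struct node *n, *old = NULL;

        pthread_mutex_lock(&list_lock);
        for (n = head; n; n = n->next) {
                node_get(n);                    /* keep 'n' on the list */
                pthread_mutex_unlock(&list_lock);

                if (old)
                        node_put(old);          /* safe: lock not held here */
                old = n;

                wait_for_io(n);                 /* blocking work, lock dropped */

                pthread_mutex_lock(&list_lock); /* 'n' still valid: we hold a ref */
        }
        pthread_mutex_unlock(&list_lock);
        if (old)
                node_put(old);
}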
1151 1191
1152 /** 1192 /**
1153 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block 1193 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
1154 * @sb: the superblock 1194 * @sb: the superblock
1155 * @nr: the number of pages to write 1195 * @nr: the number of pages to write
1156 * 1196 *
1157 * Start writeback on some inodes on this super_block. No guarantees are made 1197 * Start writeback on some inodes on this super_block. No guarantees are made
1158 * on how many (if any) will be written, and this function does not wait 1198 * on how many (if any) will be written, and this function does not wait
1159 * for IO completion of submitted IO. 1199 * for IO completion of submitted IO.
1160 */ 1200 */
1161 void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1201 void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1162 { 1202 {
1163 DECLARE_COMPLETION_ONSTACK(done); 1203 DECLARE_COMPLETION_ONSTACK(done);
1164 struct wb_writeback_work work = { 1204 struct wb_writeback_work work = {
1165 .sb = sb, 1205 .sb = sb,
1166 .sync_mode = WB_SYNC_NONE, 1206 .sync_mode = WB_SYNC_NONE,
1167 .done = &done, 1207 .tagged_writepages = 1,
1168 .nr_pages = nr, 1208 .done = &done,
1209 .nr_pages = nr,
1169 }; 1210 };
1170 1211
1171 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1212 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1172 bdi_queue_work(sb->s_bdi, &work); 1213 bdi_queue_work(sb->s_bdi, &work);
1173 wait_for_completion(&done); 1214 wait_for_completion(&done);
1174 } 1215 }
1175 EXPORT_SYMBOL(writeback_inodes_sb_nr); 1216 EXPORT_SYMBOL(writeback_inodes_sb_nr);
1176 1217
1177 /** 1218 /**
1178 * writeback_inodes_sb - writeback dirty inodes from given super_block 1219 * writeback_inodes_sb - writeback dirty inodes from given super_block
1179 * @sb: the superblock 1220 * @sb: the superblock
1180 * 1221 *
1181 * Start writeback on some inodes on this super_block. No guarantees are made 1222 * Start writeback on some inodes on this super_block. No guarantees are made
1182 * on how many (if any) will be written, and this function does not wait 1223 * on how many (if any) will be written, and this function does not wait
1183 * for IO completion of submitted IO. 1224 * for IO completion of submitted IO.
1184 */ 1225 */
1185 void writeback_inodes_sb(struct super_block *sb) 1226 void writeback_inodes_sb(struct super_block *sb)
1186 { 1227 {
1187 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1228 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
1188 } 1229 }
1189 EXPORT_SYMBOL(writeback_inodes_sb); 1230 EXPORT_SYMBOL(writeback_inodes_sb);
1190 1231
1191 /** 1232 /**
1192 * writeback_inodes_sb_if_idle - start writeback if none underway 1233 * writeback_inodes_sb_if_idle - start writeback if none underway
1193 * @sb: the superblock 1234 * @sb: the superblock
1194 * 1235 *
1195 * Invoke writeback_inodes_sb if no writeback is currently underway. 1236 * Invoke writeback_inodes_sb if no writeback is currently underway.
1196 * Returns 1 if writeback was started, 0 if not. 1237 * Returns 1 if writeback was started, 0 if not.
1197 */ 1238 */
1198 int writeback_inodes_sb_if_idle(struct super_block *sb) 1239 int writeback_inodes_sb_if_idle(struct super_block *sb)
1199 { 1240 {
1200 if (!writeback_in_progress(sb->s_bdi)) { 1241 if (!writeback_in_progress(sb->s_bdi)) {
1201 down_read(&sb->s_umount); 1242 down_read(&sb->s_umount);
1202 writeback_inodes_sb(sb); 1243 writeback_inodes_sb(sb);
1203 up_read(&sb->s_umount); 1244 up_read(&sb->s_umount);
1204 return 1; 1245 return 1;
1205 } else 1246 } else
1206 return 0; 1247 return 0;
1207 } 1248 }
1208 EXPORT_SYMBOL(writeback_inodes_sb_if_idle); 1249 EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
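
A hypothetical caller of writeback_inodes_sb_if_idle(), sketched for illustration only (it is not part of this patch): a filesystem that fails an allocation for lack of reclaimable space can opportunistically kick writeback on its own superblock and retry, without piling up behind a flush that is already running. Only writeback_inodes_sb_if_idle() is real here; example_try_alloc() is invented for the example.

/*
 * Hypothetical filesystem code -- NOT part of this patch. Only
 * writeback_inodes_sb_if_idle() is a real interface; example_try_alloc()
 * is made up.
 */
static int example_alloc_with_retry(struct super_block *sb)
{
        int ret = example_try_alloc(sb);

        if (ret == -ENOSPC && writeback_inodes_sb_if_idle(sb))
                ret = example_try_alloc(sb);    /* retry once after kicking writeback */

        return ret;
}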
1209 1250
1210 /** 1251 /**
1211 * writeback_inodes_sb_nr_if_idle - start writeback if none underway 1252 * writeback_inodes_sb_nr_if_idle - start writeback if none underway
1212 * @sb: the superblock 1253 * @sb: the superblock
1213 * @nr: the number of pages to write 1254 * @nr: the number of pages to write
1214 * 1255 *
1215 * Invoke writeback_inodes_sb if no writeback is currently underway. 1256 * Invoke writeback_inodes_sb if no writeback is currently underway.
1216 * Returns 1 if writeback was started, 0 if not. 1257 * Returns 1 if writeback was started, 0 if not.
1217 */ 1258 */
1218 int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1259 int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1219 unsigned long nr) 1260 unsigned long nr)
1220 { 1261 {
1221 if (!writeback_in_progress(sb->s_bdi)) { 1262 if (!writeback_in_progress(sb->s_bdi)) {
1222 down_read(&sb->s_umount); 1263 down_read(&sb->s_umount);
1223 writeback_inodes_sb_nr(sb, nr); 1264 writeback_inodes_sb_nr(sb, nr);
1224 up_read(&sb->s_umount); 1265 up_read(&sb->s_umount);
1225 return 1; 1266 return 1;
1226 } else 1267 } else
1227 return 0; 1268 return 0;
1228 } 1269 }
1229 EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); 1270 EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1230 1271
1231 /** 1272 /**
1232 * sync_inodes_sb - sync sb inode pages 1273 * sync_inodes_sb - sync sb inode pages
1233 * @sb: the superblock 1274 * @sb: the superblock
1234 * 1275 *
1235 * This function writes and waits on any dirty inode belonging to this 1276 * This function writes and waits on any dirty inode belonging to this
1236 * super_block. 1277 * super_block.
1237 */ 1278 */
1238 void sync_inodes_sb(struct super_block *sb) 1279 void sync_inodes_sb(struct super_block *sb)
1239 { 1280 {
1240 DECLARE_COMPLETION_ONSTACK(done); 1281 DECLARE_COMPLETION_ONSTACK(done);
1241 struct wb_writeback_work work = { 1282 struct wb_writeback_work work = {
1242 .sb = sb, 1283 .sb = sb,
1243 .sync_mode = WB_SYNC_ALL, 1284 .sync_mode = WB_SYNC_ALL,
1244 .nr_pages = LONG_MAX, 1285 .nr_pages = LONG_MAX,
1245 .range_cyclic = 0, 1286 .range_cyclic = 0,
1246 .done = &done, 1287 .done = &done,
1247 }; 1288 };
1248 1289
1249 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1290 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1250 1291
1251 bdi_queue_work(sb->s_bdi, &work); 1292 bdi_queue_work(sb->s_bdi, &work);
1252 wait_for_completion(&done); 1293 wait_for_completion(&done);
1253 1294
1254 wait_sb_inodes(sb); 1295 wait_sb_inodes(sb);
1255 } 1296 }
1256 EXPORT_SYMBOL(sync_inodes_sb); 1297 EXPORT_SYMBOL(sync_inodes_sb);
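
sync_inodes_sb() and writeback_inodes_sb_nr() hand the flusher a work item that points at a completion on the submitter's stack and then block in wait_for_completion(); wb_do_writeback() signals that completion when the item is done, and frees the item instead when no completion is attached. A userspace sketch of that handoff, modelled loosely on the kernel's completion API; everything below is illustrative, not kernel code:

/*
 * On-stack work item + completion handoff, modelled loosely on the kernel's
 * completion API. Illustrative only; not kernel code.
 */
#include <pthread.h>
#include <stdbool.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        bool            done;
};

static void init_completion(struct completion *c)
{
        pthread_mutex_init(&c->lock, NULL);
        pthread_cond_init(&c->cond, NULL);
        c->done = false;
}

static void complete(struct completion *c)              /* called by the worker */
{
        pthread_mutex_lock(&c->lock);
        c->done = true;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)   /* called by the submitter */
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

The submitter initialises a completion on its own stack, attaches it to the queued work item and blocks in wait_for_completion(); the worker's "if (work->done) complete(work->done); else kfree(work);" step above is the other half of the handshake.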
1257 1298
1258 /** 1299 /**
1259 * write_inode_now - write an inode to disk 1300 * write_inode_now - write an inode to disk
1260 * @inode: inode to write to disk 1301 * @inode: inode to write to disk
1 /* 1 /*
2 * (C) 1997 Linus Torvalds 2 * (C) 1997 Linus Torvalds
3 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) 3 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
4 */ 4 */
5 #include <linux/fs.h> 5 #include <linux/fs.h>
6 #include <linux/mm.h> 6 #include <linux/mm.h>
7 #include <linux/dcache.h> 7 #include <linux/dcache.h>
8 #include <linux/init.h> 8 #include <linux/init.h>
9 #include <linux/slab.h> 9 #include <linux/slab.h>
10 #include <linux/writeback.h> 10 #include <linux/writeback.h>
11 #include <linux/module.h> 11 #include <linux/module.h>
12 #include <linux/backing-dev.h> 12 #include <linux/backing-dev.h>
13 #include <linux/wait.h> 13 #include <linux/wait.h>
14 #include <linux/rwsem.h> 14 #include <linux/rwsem.h>
15 #include <linux/hash.h> 15 #include <linux/hash.h>
16 #include <linux/swap.h> 16 #include <linux/swap.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/pagemap.h> 18 #include <linux/pagemap.h>
19 #include <linux/cdev.h> 19 #include <linux/cdev.h>
20 #include <linux/bootmem.h> 20 #include <linux/bootmem.h>
21 #include <linux/fsnotify.h> 21 #include <linux/fsnotify.h>
22 #include <linux/mount.h> 22 #include <linux/mount.h>
23 #include <linux/async.h> 23 #include <linux/async.h>
24 #include <linux/posix_acl.h> 24 #include <linux/posix_acl.h>
25 #include <linux/prefetch.h> 25 #include <linux/prefetch.h>
26 #include <linux/ima.h> 26 #include <linux/ima.h>
27 #include <linux/cred.h> 27 #include <linux/cred.h>
28 #include <linux/buffer_head.h> /* for inode_has_buffers */ 28 #include <linux/buffer_head.h> /* for inode_has_buffers */
29 #include "internal.h" 29 #include "internal.h"
30 30
31 /* 31 /*
32 * Inode locking rules: 32 * Inode locking rules:
33 * 33 *
34 * inode->i_lock protects: 34 * inode->i_lock protects:
35 * inode->i_state, inode->i_hash, __iget() 35 * inode->i_state, inode->i_hash, __iget()
36 * inode->i_sb->s_inode_lru_lock protects: 36 * inode->i_sb->s_inode_lru_lock protects:
37 * inode->i_sb->s_inode_lru, inode->i_lru 37 * inode->i_sb->s_inode_lru, inode->i_lru
38 * inode_sb_list_lock protects: 38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list 39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects: 40 * bdi->wb.list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects: 42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash 43 * inode_hashtable, inode->i_hash
44 * 44 *
45 * Lock ordering: 45 * Lock ordering:
46 * 46 *
47 * inode_sb_list_lock 47 * inode_sb_list_lock
48 * inode->i_lock 48 * inode->i_lock
49 * inode->i_sb->s_inode_lru_lock 49 * inode->i_sb->s_inode_lru_lock
50 * 50 *
51 * inode_wb_list_lock 51 * bdi->wb.list_lock
52 * inode->i_lock 52 * inode->i_lock
53 * 53 *
54 * inode_hash_lock 54 * inode_hash_lock
55 * inode_sb_list_lock 55 * inode_sb_list_lock
56 * inode->i_lock 56 * inode->i_lock
57 * 57 *
58 * iunique_lock 58 * iunique_lock
59 * inode_hash_lock 59 * inode_hash_lock
60 */ 60 */
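
The ordering rules above matter because two threads that take the same pair of locks in opposite orders can deadlock; the convention is that the list lock is always taken before the per-object lock. A small userspace illustration of honouring such an ordering, with hypothetical names:

/*
 * Honouring a fixed lock order: the list lock is always taken before the
 * per-object lock. Hypothetical names; not kernel code.
 */
#include <pthread.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;   /* outer lock */

struct object {
        pthread_mutex_t lock;                                    /* inner lock */
        int             on_list;
};

static void add_to_list(struct object *o)
{
        pthread_mutex_lock(&list_lock);         /* outer lock first ... */
        pthread_mutex_lock(&o->lock);           /* ... then the per-object lock */
        o->on_list = 1;                         /* e.g. link it and mark it listed */
        pthread_mutex_unlock(&o->lock);
        pthread_mutex_unlock(&list_lock);
}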
61 61
62 static unsigned int i_hash_mask __read_mostly; 62 static unsigned int i_hash_mask __read_mostly;
63 static unsigned int i_hash_shift __read_mostly; 63 static unsigned int i_hash_shift __read_mostly;
64 static struct hlist_head *inode_hashtable __read_mostly; 64 static struct hlist_head *inode_hashtable __read_mostly;
65 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 65 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
66 66
67 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); 67 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
68 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
69 68
70 /* 69 /*
71 * Empty aops. Can be used for the cases where the user does not 70 * Empty aops. Can be used for the cases where the user does not
72 * define any of the address_space operations. 71 * define any of the address_space operations.
73 */ 72 */
74 const struct address_space_operations empty_aops = { 73 const struct address_space_operations empty_aops = {
75 }; 74 };
76 EXPORT_SYMBOL(empty_aops); 75 EXPORT_SYMBOL(empty_aops);
77 76
78 /* 77 /*
79 * Statistics gathering.. 78 * Statistics gathering..
80 */ 79 */
81 struct inodes_stat_t inodes_stat; 80 struct inodes_stat_t inodes_stat;
82 81
83 static DEFINE_PER_CPU(unsigned int, nr_inodes); 82 static DEFINE_PER_CPU(unsigned int, nr_inodes);
84 static DEFINE_PER_CPU(unsigned int, nr_unused); 83 static DEFINE_PER_CPU(unsigned int, nr_unused);
85 84
86 static struct kmem_cache *inode_cachep __read_mostly; 85 static struct kmem_cache *inode_cachep __read_mostly;
87 86
88 static int get_nr_inodes(void) 87 static int get_nr_inodes(void)
89 { 88 {
90 int i; 89 int i;
91 int sum = 0; 90 int sum = 0;
92 for_each_possible_cpu(i) 91 for_each_possible_cpu(i)
93 sum += per_cpu(nr_inodes, i); 92 sum += per_cpu(nr_inodes, i);
94 return sum < 0 ? 0 : sum; 93 return sum < 0 ? 0 : sum;
95 } 94 }
96 95
97 static inline int get_nr_inodes_unused(void) 96 static inline int get_nr_inodes_unused(void)
98 { 97 {
99 int i; 98 int i;
100 int sum = 0; 99 int sum = 0;
101 for_each_possible_cpu(i) 100 for_each_possible_cpu(i)
102 sum += per_cpu(nr_unused, i); 101 sum += per_cpu(nr_unused, i);
103 return sum < 0 ? 0 : sum; 102 return sum < 0 ? 0 : sum;
104 } 103 }
105 104
106 int get_nr_dirty_inodes(void) 105 int get_nr_dirty_inodes(void)
107 { 106 {
108 /* not actually dirty inodes, but a wild approximation */ 107 /* not actually dirty inodes, but a wild approximation */
109 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); 108 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
110 return nr_dirty > 0 ? nr_dirty : 0; 109 return nr_dirty > 0 ? nr_dirty : 0;
111 } 110 }
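
get_nr_inodes()/get_nr_dirty_inodes() sum unsynchronised per-CPU counters, so an object created on one CPU and destroyed on another can make the sum transiently negative; that is why the results are clamped to zero and treated as approximations. A self-contained userspace model of the scheme, with hypothetical names:

/*
 * Per-CPU counters summed by a reader and clamped at zero. Hypothetical
 * names; not kernel code.
 */
#include <stdio.h>

#define NR_CPUS 4

static int nr_objects[NR_CPUS];                 /* one counter per CPU */

static void count_inc(int cpu) { nr_objects[cpu]++; }
static void count_dec(int cpu) { nr_objects[cpu]--; }

static int count_sum(void)
{
        int cpu, sum = 0;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                sum += nr_objects[cpu];
        /* increments and decrements can land on different CPUs, so the
         * instantaneous sum may dip below zero -- clamp it */
        return sum < 0 ? 0 : sum;
}

int main(void)
{
        count_dec(3);                           /* object freed on CPU 3 ... */
        printf("approximate count: %d\n", count_sum());
        count_inc(0);                           /* ... that was created earlier on CPU 0 */
        printf("approximate count: %d\n", count_sum());
        return 0;
}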
112 111
113 /* 112 /*
114 * Handle nr_inode sysctl 113 * Handle nr_inode sysctl
115 */ 114 */
116 #ifdef CONFIG_SYSCTL 115 #ifdef CONFIG_SYSCTL
117 int proc_nr_inodes(ctl_table *table, int write, 116 int proc_nr_inodes(ctl_table *table, int write,
118 void __user *buffer, size_t *lenp, loff_t *ppos) 117 void __user *buffer, size_t *lenp, loff_t *ppos)
119 { 118 {
120 inodes_stat.nr_inodes = get_nr_inodes(); 119 inodes_stat.nr_inodes = get_nr_inodes();
121 inodes_stat.nr_unused = get_nr_inodes_unused(); 120 inodes_stat.nr_unused = get_nr_inodes_unused();
122 return proc_dointvec(table, write, buffer, lenp, ppos); 121 return proc_dointvec(table, write, buffer, lenp, ppos);
123 } 122 }
124 #endif 123 #endif
125 124
126 /** 125 /**
127 * inode_init_always - perform inode structure initialisation 126 * inode_init_always - perform inode structure initialisation
128 * @sb: superblock inode belongs to 127 * @sb: superblock inode belongs to
129 * @inode: inode to initialise 128 * @inode: inode to initialise
130 * 129 *
131 * These are initializations that need to be done on every inode 130 * These are initializations that need to be done on every inode
132 * allocation as the fields are not initialised by slab allocation. 131 * allocation as the fields are not initialised by slab allocation.
133 */ 132 */
134 int inode_init_always(struct super_block *sb, struct inode *inode) 133 int inode_init_always(struct super_block *sb, struct inode *inode)
135 { 134 {
136 static const struct inode_operations empty_iops; 135 static const struct inode_operations empty_iops;
137 static const struct file_operations empty_fops; 136 static const struct file_operations empty_fops;
138 struct address_space *const mapping = &inode->i_data; 137 struct address_space *const mapping = &inode->i_data;
139 138
140 inode->i_sb = sb; 139 inode->i_sb = sb;
141 inode->i_blkbits = sb->s_blocksize_bits; 140 inode->i_blkbits = sb->s_blocksize_bits;
142 inode->i_flags = 0; 141 inode->i_flags = 0;
143 atomic_set(&inode->i_count, 1); 142 atomic_set(&inode->i_count, 1);
144 inode->i_op = &empty_iops; 143 inode->i_op = &empty_iops;
145 inode->i_fop = &empty_fops; 144 inode->i_fop = &empty_fops;
146 inode->i_nlink = 1; 145 inode->i_nlink = 1;
147 inode->i_uid = 0; 146 inode->i_uid = 0;
148 inode->i_gid = 0; 147 inode->i_gid = 0;
149 atomic_set(&inode->i_writecount, 0); 148 atomic_set(&inode->i_writecount, 0);
150 inode->i_size = 0; 149 inode->i_size = 0;
151 inode->i_blocks = 0; 150 inode->i_blocks = 0;
152 inode->i_bytes = 0; 151 inode->i_bytes = 0;
153 inode->i_generation = 0; 152 inode->i_generation = 0;
154 #ifdef CONFIG_QUOTA 153 #ifdef CONFIG_QUOTA
155 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); 154 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
156 #endif 155 #endif
157 inode->i_pipe = NULL; 156 inode->i_pipe = NULL;
158 inode->i_bdev = NULL; 157 inode->i_bdev = NULL;
159 inode->i_cdev = NULL; 158 inode->i_cdev = NULL;
160 inode->i_rdev = 0; 159 inode->i_rdev = 0;
161 inode->dirtied_when = 0; 160 inode->dirtied_when = 0;
162 161
163 if (security_inode_alloc(inode)) 162 if (security_inode_alloc(inode))
164 goto out; 163 goto out;
165 spin_lock_init(&inode->i_lock); 164 spin_lock_init(&inode->i_lock);
166 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 165 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
167 166
168 mutex_init(&inode->i_mutex); 167 mutex_init(&inode->i_mutex);
169 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 168 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
170 169
171 atomic_set(&inode->i_dio_count, 0); 170 atomic_set(&inode->i_dio_count, 0);
172 171
173 mapping->a_ops = &empty_aops; 172 mapping->a_ops = &empty_aops;
174 mapping->host = inode; 173 mapping->host = inode;
175 mapping->flags = 0; 174 mapping->flags = 0;
176 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 175 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
177 mapping->assoc_mapping = NULL; 176 mapping->assoc_mapping = NULL;
178 mapping->backing_dev_info = &default_backing_dev_info; 177 mapping->backing_dev_info = &default_backing_dev_info;
179 mapping->writeback_index = 0; 178 mapping->writeback_index = 0;
180 179
181 /* 180 /*
182 * If the block_device provides a backing_dev_info for client 181 * If the block_device provides a backing_dev_info for client
183 * inodes then use that. Otherwise the inode shares the bdev's 182 * inodes then use that. Otherwise the inode shares the bdev's
184 * backing_dev_info. 183 * backing_dev_info.
185 */ 184 */
186 if (sb->s_bdev) { 185 if (sb->s_bdev) {
187 struct backing_dev_info *bdi; 186 struct backing_dev_info *bdi;
188 187
189 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 188 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
190 mapping->backing_dev_info = bdi; 189 mapping->backing_dev_info = bdi;
191 } 190 }
192 inode->i_private = NULL; 191 inode->i_private = NULL;
193 inode->i_mapping = mapping; 192 inode->i_mapping = mapping;
194 #ifdef CONFIG_FS_POSIX_ACL 193 #ifdef CONFIG_FS_POSIX_ACL
195 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; 194 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
196 #endif 195 #endif
197 196
198 #ifdef CONFIG_FSNOTIFY 197 #ifdef CONFIG_FSNOTIFY
199 inode->i_fsnotify_mask = 0; 198 inode->i_fsnotify_mask = 0;
200 #endif 199 #endif
201 200
202 this_cpu_inc(nr_inodes); 201 this_cpu_inc(nr_inodes);
203 202
204 return 0; 203 return 0;
205 out: 204 out:
206 return -ENOMEM; 205 return -ENOMEM;
207 } 206 }
208 EXPORT_SYMBOL(inode_init_always); 207 EXPORT_SYMBOL(inode_init_always);
209 208
210 static struct inode *alloc_inode(struct super_block *sb) 209 static struct inode *alloc_inode(struct super_block *sb)
211 { 210 {
212 struct inode *inode; 211 struct inode *inode;
213 212
214 if (sb->s_op->alloc_inode) 213 if (sb->s_op->alloc_inode)
215 inode = sb->s_op->alloc_inode(sb); 214 inode = sb->s_op->alloc_inode(sb);
216 else 215 else
217 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); 216 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
218 217
219 if (!inode) 218 if (!inode)
220 return NULL; 219 return NULL;
221 220
222 if (unlikely(inode_init_always(sb, inode))) { 221 if (unlikely(inode_init_always(sb, inode))) {
223 if (inode->i_sb->s_op->destroy_inode) 222 if (inode->i_sb->s_op->destroy_inode)
224 inode->i_sb->s_op->destroy_inode(inode); 223 inode->i_sb->s_op->destroy_inode(inode);
225 else 224 else
226 kmem_cache_free(inode_cachep, inode); 225 kmem_cache_free(inode_cachep, inode);
227 return NULL; 226 return NULL;
228 } 227 }
229 228
230 return inode; 229 return inode;
231 } 230 }
232 231
233 void free_inode_nonrcu(struct inode *inode) 232 void free_inode_nonrcu(struct inode *inode)
234 { 233 {
235 kmem_cache_free(inode_cachep, inode); 234 kmem_cache_free(inode_cachep, inode);
236 } 235 }
237 EXPORT_SYMBOL(free_inode_nonrcu); 236 EXPORT_SYMBOL(free_inode_nonrcu);
238 237
239 void __destroy_inode(struct inode *inode) 238 void __destroy_inode(struct inode *inode)
240 { 239 {
241 BUG_ON(inode_has_buffers(inode)); 240 BUG_ON(inode_has_buffers(inode));
242 security_inode_free(inode); 241 security_inode_free(inode);
243 fsnotify_inode_delete(inode); 242 fsnotify_inode_delete(inode);
244 #ifdef CONFIG_FS_POSIX_ACL 243 #ifdef CONFIG_FS_POSIX_ACL
245 if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) 244 if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
246 posix_acl_release(inode->i_acl); 245 posix_acl_release(inode->i_acl);
247 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 246 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
248 posix_acl_release(inode->i_default_acl); 247 posix_acl_release(inode->i_default_acl);
249 #endif 248 #endif
250 this_cpu_dec(nr_inodes); 249 this_cpu_dec(nr_inodes);
251 } 250 }
252 EXPORT_SYMBOL(__destroy_inode); 251 EXPORT_SYMBOL(__destroy_inode);
253 252
254 static void i_callback(struct rcu_head *head) 253 static void i_callback(struct rcu_head *head)
255 { 254 {
256 struct inode *inode = container_of(head, struct inode, i_rcu); 255 struct inode *inode = container_of(head, struct inode, i_rcu);
257 INIT_LIST_HEAD(&inode->i_dentry); 256 INIT_LIST_HEAD(&inode->i_dentry);
258 kmem_cache_free(inode_cachep, inode); 257 kmem_cache_free(inode_cachep, inode);
259 } 258 }
260 259
261 static void destroy_inode(struct inode *inode) 260 static void destroy_inode(struct inode *inode)
262 { 261 {
263 BUG_ON(!list_empty(&inode->i_lru)); 262 BUG_ON(!list_empty(&inode->i_lru));
264 __destroy_inode(inode); 263 __destroy_inode(inode);
265 if (inode->i_sb->s_op->destroy_inode) 264 if (inode->i_sb->s_op->destroy_inode)
266 inode->i_sb->s_op->destroy_inode(inode); 265 inode->i_sb->s_op->destroy_inode(inode);
267 else 266 else
268 call_rcu(&inode->i_rcu, i_callback); 267 call_rcu(&inode->i_rcu, i_callback);
269 } 268 }
270 269
271 void address_space_init_once(struct address_space *mapping) 270 void address_space_init_once(struct address_space *mapping)
272 { 271 {
273 memset(mapping, 0, sizeof(*mapping)); 272 memset(mapping, 0, sizeof(*mapping));
274 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); 273 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
275 spin_lock_init(&mapping->tree_lock); 274 spin_lock_init(&mapping->tree_lock);
276 mutex_init(&mapping->i_mmap_mutex); 275 mutex_init(&mapping->i_mmap_mutex);
277 INIT_LIST_HEAD(&mapping->private_list); 276 INIT_LIST_HEAD(&mapping->private_list);
278 spin_lock_init(&mapping->private_lock); 277 spin_lock_init(&mapping->private_lock);
279 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); 278 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
280 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); 279 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
281 } 280 }
282 EXPORT_SYMBOL(address_space_init_once); 281 EXPORT_SYMBOL(address_space_init_once);
283 282
284 /* 283 /*
285 * These are initializations that only need to be done 284 * These are initializations that only need to be done
286 * once, because the fields are idempotent across use 285 * once, because the fields are idempotent across use
287 * of the inode, so let the slab be aware of that. 286 * of the inode, so let the slab be aware of that.
288 */ 287 */
289 void inode_init_once(struct inode *inode) 288 void inode_init_once(struct inode *inode)
290 { 289 {
291 memset(inode, 0, sizeof(*inode)); 290 memset(inode, 0, sizeof(*inode));
292 INIT_HLIST_NODE(&inode->i_hash); 291 INIT_HLIST_NODE(&inode->i_hash);
293 INIT_LIST_HEAD(&inode->i_dentry); 292 INIT_LIST_HEAD(&inode->i_dentry);
294 INIT_LIST_HEAD(&inode->i_devices); 293 INIT_LIST_HEAD(&inode->i_devices);
295 INIT_LIST_HEAD(&inode->i_wb_list); 294 INIT_LIST_HEAD(&inode->i_wb_list);
296 INIT_LIST_HEAD(&inode->i_lru); 295 INIT_LIST_HEAD(&inode->i_lru);
297 address_space_init_once(&inode->i_data); 296 address_space_init_once(&inode->i_data);
298 i_size_ordered_init(inode); 297 i_size_ordered_init(inode);
299 #ifdef CONFIG_FSNOTIFY 298 #ifdef CONFIG_FSNOTIFY
300 INIT_HLIST_HEAD(&inode->i_fsnotify_marks); 299 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
301 #endif 300 #endif
302 } 301 }
303 EXPORT_SYMBOL(inode_init_once); 302 EXPORT_SYMBOL(inode_init_once);
304 303
305 static void init_once(void *foo) 304 static void init_once(void *foo)
306 { 305 {
307 struct inode *inode = (struct inode *) foo; 306 struct inode *inode = (struct inode *) foo;
308 307
309 inode_init_once(inode); 308 inode_init_once(inode);
310 } 309 }
311 310
312 /* 311 /*
313 * inode->i_lock must be held 312 * inode->i_lock must be held
314 */ 313 */
315 void __iget(struct inode *inode) 314 void __iget(struct inode *inode)
316 { 315 {
317 atomic_inc(&inode->i_count); 316 atomic_inc(&inode->i_count);
318 } 317 }
319 318
320 /* 319 /*
321 * get additional reference to inode; caller must already hold one. 320 * get additional reference to inode; caller must already hold one.
322 */ 321 */
323 void ihold(struct inode *inode) 322 void ihold(struct inode *inode)
324 { 323 {
325 WARN_ON(atomic_inc_return(&inode->i_count) < 2); 324 WARN_ON(atomic_inc_return(&inode->i_count) < 2);
326 } 325 }
327 EXPORT_SYMBOL(ihold); 326 EXPORT_SYMBOL(ihold);
328 327
329 static void inode_lru_list_add(struct inode *inode) 328 static void inode_lru_list_add(struct inode *inode)
330 { 329 {
331 spin_lock(&inode->i_sb->s_inode_lru_lock); 330 spin_lock(&inode->i_sb->s_inode_lru_lock);
332 if (list_empty(&inode->i_lru)) { 331 if (list_empty(&inode->i_lru)) {
333 list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); 332 list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
334 inode->i_sb->s_nr_inodes_unused++; 333 inode->i_sb->s_nr_inodes_unused++;
335 this_cpu_inc(nr_unused); 334 this_cpu_inc(nr_unused);
336 } 335 }
337 spin_unlock(&inode->i_sb->s_inode_lru_lock); 336 spin_unlock(&inode->i_sb->s_inode_lru_lock);
338 } 337 }
339 338
340 static void inode_lru_list_del(struct inode *inode) 339 static void inode_lru_list_del(struct inode *inode)
341 { 340 {
342 spin_lock(&inode->i_sb->s_inode_lru_lock); 341 spin_lock(&inode->i_sb->s_inode_lru_lock);
343 if (!list_empty(&inode->i_lru)) { 342 if (!list_empty(&inode->i_lru)) {
344 list_del_init(&inode->i_lru); 343 list_del_init(&inode->i_lru);
345 inode->i_sb->s_nr_inodes_unused--; 344 inode->i_sb->s_nr_inodes_unused--;
346 this_cpu_dec(nr_unused); 345 this_cpu_dec(nr_unused);
347 } 346 }
348 spin_unlock(&inode->i_sb->s_inode_lru_lock); 347 spin_unlock(&inode->i_sb->s_inode_lru_lock);
349 } 348 }
350 349
351 /** 350 /**
352 * inode_sb_list_add - add inode to the superblock list of inodes 351 * inode_sb_list_add - add inode to the superblock list of inodes
353 * @inode: inode to add 352 * @inode: inode to add
354 */ 353 */
355 void inode_sb_list_add(struct inode *inode) 354 void inode_sb_list_add(struct inode *inode)
356 { 355 {
357 spin_lock(&inode_sb_list_lock); 356 spin_lock(&inode_sb_list_lock);
358 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); 357 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
359 spin_unlock(&inode_sb_list_lock); 358 spin_unlock(&inode_sb_list_lock);
360 } 359 }
361 EXPORT_SYMBOL_GPL(inode_sb_list_add); 360 EXPORT_SYMBOL_GPL(inode_sb_list_add);
362 361
363 static inline void inode_sb_list_del(struct inode *inode) 362 static inline void inode_sb_list_del(struct inode *inode)
364 { 363 {
365 spin_lock(&inode_sb_list_lock); 364 spin_lock(&inode_sb_list_lock);
366 list_del_init(&inode->i_sb_list); 365 list_del_init(&inode->i_sb_list);
367 spin_unlock(&inode_sb_list_lock); 366 spin_unlock(&inode_sb_list_lock);
368 } 367 }
369 368
370 static unsigned long hash(struct super_block *sb, unsigned long hashval) 369 static unsigned long hash(struct super_block *sb, unsigned long hashval)
371 { 370 {
372 unsigned long tmp; 371 unsigned long tmp;
373 372
374 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / 373 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
375 L1_CACHE_BYTES; 374 L1_CACHE_BYTES;
376 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); 375 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
377 return tmp & i_hash_mask; 376 return tmp & i_hash_mask;
378 } 377 }
379 378
380 /** 379 /**
381 * __insert_inode_hash - hash an inode 380 * __insert_inode_hash - hash an inode
382 * @inode: unhashed inode 381 * @inode: unhashed inode
383 * @hashval: unsigned long value used to locate this object in the 382 * @hashval: unsigned long value used to locate this object in the
384 * inode_hashtable. 383 * inode_hashtable.
385 * 384 *
386 * Add an inode to the inode hash for this superblock. 385 * Add an inode to the inode hash for this superblock.
387 */ 386 */
388 void __insert_inode_hash(struct inode *inode, unsigned long hashval) 387 void __insert_inode_hash(struct inode *inode, unsigned long hashval)
389 { 388 {
390 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 389 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
391 390
392 spin_lock(&inode_hash_lock); 391 spin_lock(&inode_hash_lock);
393 spin_lock(&inode->i_lock); 392 spin_lock(&inode->i_lock);
394 hlist_add_head(&inode->i_hash, b); 393 hlist_add_head(&inode->i_hash, b);
395 spin_unlock(&inode->i_lock); 394 spin_unlock(&inode->i_lock);
396 spin_unlock(&inode_hash_lock); 395 spin_unlock(&inode_hash_lock);
397 } 396 }
398 EXPORT_SYMBOL(__insert_inode_hash); 397 EXPORT_SYMBOL(__insert_inode_hash);
399 398
400 /** 399 /**
401 * remove_inode_hash - remove an inode from the hash 400 * remove_inode_hash - remove an inode from the hash
402 * @inode: inode to unhash 401 * @inode: inode to unhash
403 * 402 *
404 * Remove an inode from the inode hash for this superblock. 403 * Remove an inode from the inode hash for this superblock.
405 */ 404 */
406 void remove_inode_hash(struct inode *inode) 405 void remove_inode_hash(struct inode *inode)
407 { 406 {
408 spin_lock(&inode_hash_lock); 407 spin_lock(&inode_hash_lock);
409 spin_lock(&inode->i_lock); 408 spin_lock(&inode->i_lock);
410 hlist_del_init(&inode->i_hash); 409 hlist_del_init(&inode->i_hash);
411 spin_unlock(&inode->i_lock); 410 spin_unlock(&inode->i_lock);
412 spin_unlock(&inode_hash_lock); 411 spin_unlock(&inode_hash_lock);
413 } 412 }
414 EXPORT_SYMBOL(remove_inode_hash); 413 EXPORT_SYMBOL(remove_inode_hash);
415 414
416 void end_writeback(struct inode *inode) 415 void end_writeback(struct inode *inode)
417 { 416 {
418 might_sleep(); 417 might_sleep();
419 /* 418 /*
420 * We have to cycle tree_lock here because reclaim can still be in the 419 * We have to cycle tree_lock here because reclaim can still be in the
421 * process of removing the last page (in __delete_from_page_cache()) 420 * process of removing the last page (in __delete_from_page_cache())
422 * and we must not free mapping under it. 421 * and we must not free mapping under it.
423 */ 422 */
424 spin_lock_irq(&inode->i_data.tree_lock); 423 spin_lock_irq(&inode->i_data.tree_lock);
425 BUG_ON(inode->i_data.nrpages); 424 BUG_ON(inode->i_data.nrpages);
426 spin_unlock_irq(&inode->i_data.tree_lock); 425 spin_unlock_irq(&inode->i_data.tree_lock);
427 BUG_ON(!list_empty(&inode->i_data.private_list)); 426 BUG_ON(!list_empty(&inode->i_data.private_list));
428 BUG_ON(!(inode->i_state & I_FREEING)); 427 BUG_ON(!(inode->i_state & I_FREEING));
429 BUG_ON(inode->i_state & I_CLEAR); 428 BUG_ON(inode->i_state & I_CLEAR);
430 inode_sync_wait(inode); 429 inode_sync_wait(inode);
431 /* don't need i_lock here, no concurrent mods to i_state */ 430 /* don't need i_lock here, no concurrent mods to i_state */
432 inode->i_state = I_FREEING | I_CLEAR; 431 inode->i_state = I_FREEING | I_CLEAR;
433 } 432 }
434 EXPORT_SYMBOL(end_writeback); 433 EXPORT_SYMBOL(end_writeback);
435 434
436 /* 435 /*
437 * Free the inode passed in, removing it from the lists it is still connected 436 * Free the inode passed in, removing it from the lists it is still connected
438 * to. We remove any pages still attached to the inode and wait for any IO that 437 * to. We remove any pages still attached to the inode and wait for any IO that
439 * is still in progress before finally destroying the inode. 438 * is still in progress before finally destroying the inode.
440 * 439 *
441 * An inode must already be marked I_FREEING so that we avoid the inode being 440 * An inode must already be marked I_FREEING so that we avoid the inode being
442 * moved back onto lists if we race with other code that manipulates the lists 441 * moved back onto lists if we race with other code that manipulates the lists
443 * (e.g. writeback_single_inode). The caller is responsible for setting this. 442 * (e.g. writeback_single_inode). The caller is responsible for setting this.
444 * 443 *
445 * An inode must already be removed from the LRU list before being evicted from 444 * An inode must already be removed from the LRU list before being evicted from
446 * the cache. This should occur atomically with setting the I_FREEING state 445 * the cache. This should occur atomically with setting the I_FREEING state
447 * flag, so no inodes here should ever be on the LRU when being evicted. 446 * flag, so no inodes here should ever be on the LRU when being evicted.
448 */ 447 */
449 static void evict(struct inode *inode) 448 static void evict(struct inode *inode)
450 { 449 {
451 const struct super_operations *op = inode->i_sb->s_op; 450 const struct super_operations *op = inode->i_sb->s_op;
452 451
453 BUG_ON(!(inode->i_state & I_FREEING)); 452 BUG_ON(!(inode->i_state & I_FREEING));
454 BUG_ON(!list_empty(&inode->i_lru)); 453 BUG_ON(!list_empty(&inode->i_lru));
455 454
456 inode_wb_list_del(inode); 455 inode_wb_list_del(inode);
457 inode_sb_list_del(inode); 456 inode_sb_list_del(inode);
458 457
459 if (op->evict_inode) { 458 if (op->evict_inode) {
460 op->evict_inode(inode); 459 op->evict_inode(inode);
461 } else { 460 } else {
462 if (inode->i_data.nrpages) 461 if (inode->i_data.nrpages)
463 truncate_inode_pages(&inode->i_data, 0); 462 truncate_inode_pages(&inode->i_data, 0);
464 end_writeback(inode); 463 end_writeback(inode);
465 } 464 }
466 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 465 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
467 bd_forget(inode); 466 bd_forget(inode);
468 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 467 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
469 cd_forget(inode); 468 cd_forget(inode);
470 469
471 remove_inode_hash(inode); 470 remove_inode_hash(inode);
472 471
473 spin_lock(&inode->i_lock); 472 spin_lock(&inode->i_lock);
474 wake_up_bit(&inode->i_state, __I_NEW); 473 wake_up_bit(&inode->i_state, __I_NEW);
475 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 474 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
476 spin_unlock(&inode->i_lock); 475 spin_unlock(&inode->i_lock);
477 476
478 destroy_inode(inode); 477 destroy_inode(inode);
479 } 478 }
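
/*
 * Sketch: a minimal ->evict_inode() for a simple filesystem, mirroring the
 * default path taken above when the operation is not provided: drop the page
 * cache, then end writeback. The foofs_* name and the use of i_private for
 * per-inode state are hypothetical.
 */
static void foofs_evict_inode(struct inode *inode)
{
	truncate_inode_pages(&inode->i_data, 0);	/* drop cached pages */
	end_writeback(inode);		/* wait for IO, mark I_FREEING | I_CLEAR */
	kfree(inode->i_private);	/* hypothetical per-inode private data */
	inode->i_private = NULL;
}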
480 479
481 /* 480 /*
482 * dispose_list - dispose of the contents of a local list 481 * dispose_list - dispose of the contents of a local list
483 * @head: the head of the list to free 482 * @head: the head of the list to free
484 * 483 *
485 * Dispose-list gets a local list with local inodes in it, so it doesn't 484 * Dispose-list gets a local list with local inodes in it, so it doesn't
486 * need to worry about list corruption and SMP locks. 485 * need to worry about list corruption and SMP locks.
487 */ 486 */
488 static void dispose_list(struct list_head *head) 487 static void dispose_list(struct list_head *head)
489 { 488 {
490 while (!list_empty(head)) { 489 while (!list_empty(head)) {
491 struct inode *inode; 490 struct inode *inode;
492 491
493 inode = list_first_entry(head, struct inode, i_lru); 492 inode = list_first_entry(head, struct inode, i_lru);
494 list_del_init(&inode->i_lru); 493 list_del_init(&inode->i_lru);
495 494
496 evict(inode); 495 evict(inode);
497 } 496 }
498 } 497 }
499 498
500 /** 499 /**
501 * evict_inodes - evict all evictable inodes for a superblock 500 * evict_inodes - evict all evictable inodes for a superblock
502 * @sb: superblock to operate on 501 * @sb: superblock to operate on
503 * 502 *
504 * Make sure that no inodes with zero refcount are retained. This is 503 * Make sure that no inodes with zero refcount are retained. This is
505 * called by superblock shutdown after the MS_ACTIVE flag has been removed, 504 * called by superblock shutdown after the MS_ACTIVE flag has been removed,
506 * so any inode reaching zero refcount during or after that call will 505 * so any inode reaching zero refcount during or after that call will
507 * be immediately evicted. 506 * be immediately evicted.
508 */ 507 */
509 void evict_inodes(struct super_block *sb) 508 void evict_inodes(struct super_block *sb)
510 { 509 {
511 struct inode *inode, *next; 510 struct inode *inode, *next;
512 LIST_HEAD(dispose); 511 LIST_HEAD(dispose);
513 512
514 spin_lock(&inode_sb_list_lock); 513 spin_lock(&inode_sb_list_lock);
515 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
516 if (atomic_read(&inode->i_count)) 515 if (atomic_read(&inode->i_count))
517 continue; 516 continue;
518 517
519 spin_lock(&inode->i_lock); 518 spin_lock(&inode->i_lock);
520 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 519 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
521 spin_unlock(&inode->i_lock); 520 spin_unlock(&inode->i_lock);
522 continue; 521 continue;
523 } 522 }
524 523
525 inode->i_state |= I_FREEING; 524 inode->i_state |= I_FREEING;
526 inode_lru_list_del(inode); 525 inode_lru_list_del(inode);
527 spin_unlock(&inode->i_lock); 526 spin_unlock(&inode->i_lock);
528 list_add(&inode->i_lru, &dispose); 527 list_add(&inode->i_lru, &dispose);
529 } 528 }
530 spin_unlock(&inode_sb_list_lock); 529 spin_unlock(&inode_sb_list_lock);
531 530
532 dispose_list(&dispose); 531 dispose_list(&dispose);
533 } 532 }
534 533
535 /** 534 /**
536 * invalidate_inodes - attempt to free all inodes on a superblock 535 * invalidate_inodes - attempt to free all inodes on a superblock
537 * @sb: superblock to operate on 536 * @sb: superblock to operate on
538 * @kill_dirty: flag to guide handling of dirty inodes 537 * @kill_dirty: flag to guide handling of dirty inodes
539 * 538 *
540 * Attempts to free all inodes for a given superblock. If there were any 539 * Attempts to free all inodes for a given superblock. If there were any
541 * busy inodes, a non-zero value is returned, else zero. 540 * busy inodes, a non-zero value is returned, else zero.
542 * If @kill_dirty is set, discard dirty inodes too; otherwise treat 541 * If @kill_dirty is set, discard dirty inodes too; otherwise treat
543 * them as busy. 542 * them as busy.
544 */ 543 */
545 int invalidate_inodes(struct super_block *sb, bool kill_dirty) 544 int invalidate_inodes(struct super_block *sb, bool kill_dirty)
546 { 545 {
547 int busy = 0; 546 int busy = 0;
548 struct inode *inode, *next; 547 struct inode *inode, *next;
549 LIST_HEAD(dispose); 548 LIST_HEAD(dispose);
550 549
551 spin_lock(&inode_sb_list_lock); 550 spin_lock(&inode_sb_list_lock);
552 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 551 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
553 spin_lock(&inode->i_lock); 552 spin_lock(&inode->i_lock);
554 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 553 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
555 spin_unlock(&inode->i_lock); 554 spin_unlock(&inode->i_lock);
556 continue; 555 continue;
557 } 556 }
558 if (inode->i_state & I_DIRTY && !kill_dirty) { 557 if (inode->i_state & I_DIRTY && !kill_dirty) {
559 spin_unlock(&inode->i_lock); 558 spin_unlock(&inode->i_lock);
560 busy = 1; 559 busy = 1;
561 continue; 560 continue;
562 } 561 }
563 if (atomic_read(&inode->i_count)) { 562 if (atomic_read(&inode->i_count)) {
564 spin_unlock(&inode->i_lock); 563 spin_unlock(&inode->i_lock);
565 busy = 1; 564 busy = 1;
566 continue; 565 continue;
567 } 566 }
568 567
569 inode->i_state |= I_FREEING; 568 inode->i_state |= I_FREEING;
570 inode_lru_list_del(inode); 569 inode_lru_list_del(inode);
571 spin_unlock(&inode->i_lock); 570 spin_unlock(&inode->i_lock);
572 list_add(&inode->i_lru, &dispose); 571 list_add(&inode->i_lru, &dispose);
573 } 572 }
574 spin_unlock(&inode_sb_list_lock); 573 spin_unlock(&inode_sb_list_lock);
575 574
576 dispose_list(&dispose); 575 dispose_list(&dispose);
577 576
578 return busy; 577 return busy;
579 } 578 }
580 579
581 static int can_unuse(struct inode *inode) 580 static int can_unuse(struct inode *inode)
582 { 581 {
583 if (inode->i_state & ~I_REFERENCED) 582 if (inode->i_state & ~I_REFERENCED)
584 return 0; 583 return 0;
585 if (inode_has_buffers(inode)) 584 if (inode_has_buffers(inode))
586 return 0; 585 return 0;
587 if (atomic_read(&inode->i_count)) 586 if (atomic_read(&inode->i_count))
588 return 0; 587 return 0;
589 if (inode->i_data.nrpages) 588 if (inode->i_data.nrpages)
590 return 0; 589 return 0;
591 return 1; 590 return 1;
592 } 591 }
593 592
594 /* 593 /*
595 * Walk the superblock inode LRU for freeable inodes and attempt to free them. 594 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
596 * This is called from the superblock shrinker function with a number of inodes 595 * This is called from the superblock shrinker function with a number of inodes
597 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 596 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
598 * then are freed outside inode_lock by dispose_list(). 597 * then are freed outside inode_lock by dispose_list().
599 * 598 *
600 * Any inodes which are pinned purely because of attached pagecache have their 599 * Any inodes which are pinned purely because of attached pagecache have their
601 * pagecache removed. If the inode has metadata buffers attached to 600 * pagecache removed. If the inode has metadata buffers attached to
602 * mapping->private_list then try to remove them. 601 * mapping->private_list then try to remove them.
603 * 602 *
604 * If the inode has the I_REFERENCED flag set, then it means that it has been 603 * If the inode has the I_REFERENCED flag set, then it means that it has been
605 * used recently - the flag is set in iput_final(). When we encounter such an 604 * used recently - the flag is set in iput_final(). When we encounter such an
606 * inode, clear the flag and move it to the back of the LRU so it gets another 605 * inode, clear the flag and move it to the back of the LRU so it gets another
607 * pass through the LRU before it gets reclaimed. This is necessary because 606 * pass through the LRU before it gets reclaimed. This is necessary because
608 * we are doing lazy LRU updates to minimise lock contention, so the 607 * we are doing lazy LRU updates to minimise lock contention, so the
609 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 608 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
610 * with this flag set because they are the inodes that are out of order. 609 * with this flag set because they are the inodes that are out of order.
611 */ 610 */
612 void prune_icache_sb(struct super_block *sb, int nr_to_scan) 611 void prune_icache_sb(struct super_block *sb, int nr_to_scan)
613 { 612 {
614 LIST_HEAD(freeable); 613 LIST_HEAD(freeable);
615 int nr_scanned; 614 int nr_scanned;
616 unsigned long reap = 0; 615 unsigned long reap = 0;
617 616
618 spin_lock(&sb->s_inode_lru_lock); 617 spin_lock(&sb->s_inode_lru_lock);
619 for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { 618 for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
620 struct inode *inode; 619 struct inode *inode;
621 620
622 if (list_empty(&sb->s_inode_lru)) 621 if (list_empty(&sb->s_inode_lru))
623 break; 622 break;
624 623
625 inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); 624 inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
626 625
627 /* 626 /*
628 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, 627 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
629 * so use a trylock. If we fail to get the lock, just move the 628 * so use a trylock. If we fail to get the lock, just move the
630 * inode to the back of the list so we don't spin on it. 629 * inode to the back of the list so we don't spin on it.
631 */ 630 */
632 if (!spin_trylock(&inode->i_lock)) { 631 if (!spin_trylock(&inode->i_lock)) {
633 list_move(&inode->i_lru, &sb->s_inode_lru); 632 list_move(&inode->i_lru, &sb->s_inode_lru);
634 continue; 633 continue;
635 } 634 }
636 635
637 /* 636 /*
638 * Referenced or dirty inodes are still in use. Give them 637 * Referenced or dirty inodes are still in use. Give them
639 * another pass through the LRU as we cannot reclaim them now. 638 * another pass through the LRU as we cannot reclaim them now.
640 */ 639 */
641 if (atomic_read(&inode->i_count) || 640 if (atomic_read(&inode->i_count) ||
642 (inode->i_state & ~I_REFERENCED)) { 641 (inode->i_state & ~I_REFERENCED)) {
643 list_del_init(&inode->i_lru); 642 list_del_init(&inode->i_lru);
644 spin_unlock(&inode->i_lock); 643 spin_unlock(&inode->i_lock);
645 sb->s_nr_inodes_unused--; 644 sb->s_nr_inodes_unused--;
646 this_cpu_dec(nr_unused); 645 this_cpu_dec(nr_unused);
647 continue; 646 continue;
648 } 647 }
649 648
650 /* recently referenced inodes get one more pass */ 649 /* recently referenced inodes get one more pass */
651 if (inode->i_state & I_REFERENCED) { 650 if (inode->i_state & I_REFERENCED) {
652 inode->i_state &= ~I_REFERENCED; 651 inode->i_state &= ~I_REFERENCED;
653 list_move(&inode->i_lru, &sb->s_inode_lru); 652 list_move(&inode->i_lru, &sb->s_inode_lru);
654 spin_unlock(&inode->i_lock); 653 spin_unlock(&inode->i_lock);
655 continue; 654 continue;
656 } 655 }
657 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 656 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
658 __iget(inode); 657 __iget(inode);
659 spin_unlock(&inode->i_lock); 658 spin_unlock(&inode->i_lock);
660 spin_unlock(&sb->s_inode_lru_lock); 659 spin_unlock(&sb->s_inode_lru_lock);
661 if (remove_inode_buffers(inode)) 660 if (remove_inode_buffers(inode))
662 reap += invalidate_mapping_pages(&inode->i_data, 661 reap += invalidate_mapping_pages(&inode->i_data,
663 0, -1); 662 0, -1);
664 iput(inode); 663 iput(inode);
665 spin_lock(&sb->s_inode_lru_lock); 664 spin_lock(&sb->s_inode_lru_lock);
666 665
667 if (inode != list_entry(sb->s_inode_lru.next, 666 if (inode != list_entry(sb->s_inode_lru.next,
668 struct inode, i_lru)) 667 struct inode, i_lru))
669 continue; /* wrong inode or list_empty */ 668 continue; /* wrong inode or list_empty */
670 /* avoid lock inversions with trylock */ 669 /* avoid lock inversions with trylock */
671 if (!spin_trylock(&inode->i_lock)) 670 if (!spin_trylock(&inode->i_lock))
672 continue; 671 continue;
673 if (!can_unuse(inode)) { 672 if (!can_unuse(inode)) {
674 spin_unlock(&inode->i_lock); 673 spin_unlock(&inode->i_lock);
675 continue; 674 continue;
676 } 675 }
677 } 676 }
678 WARN_ON(inode->i_state & I_NEW); 677 WARN_ON(inode->i_state & I_NEW);
679 inode->i_state |= I_FREEING; 678 inode->i_state |= I_FREEING;
680 spin_unlock(&inode->i_lock); 679 spin_unlock(&inode->i_lock);
681 680
682 list_move(&inode->i_lru, &freeable); 681 list_move(&inode->i_lru, &freeable);
683 sb->s_nr_inodes_unused--; 682 sb->s_nr_inodes_unused--;
684 this_cpu_dec(nr_unused); 683 this_cpu_dec(nr_unused);
685 } 684 }
686 if (current_is_kswapd()) 685 if (current_is_kswapd())
687 __count_vm_events(KSWAPD_INODESTEAL, reap); 686 __count_vm_events(KSWAPD_INODESTEAL, reap);
688 else 687 else
689 __count_vm_events(PGINODESTEAL, reap); 688 __count_vm_events(PGINODESTEAL, reap);
690 spin_unlock(&sb->s_inode_lru_lock); 689 spin_unlock(&sb->s_inode_lru_lock);
691 690
692 dispose_list(&freeable); 691 dispose_list(&freeable);
693 } 692 }
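
/*
 * Sketch (standalone C, illustration only): the second-chance policy
 * described above, reduced to a single array walk. A recently referenced
 * entry loses its flag and survives this pass; an unreferenced, unpinned
 * entry is reclaimed. The names here are illustrative, not kernel API.
 */
#include <stdbool.h>
#include <stddef.h>

struct lru_entry {
	bool referenced;	/* touched since the last scan */
	bool pinned;		/* still has users or cached pages */
	bool present;		/* not yet reclaimed */
};

static size_t prune(struct lru_entry *lru, size_t nr, size_t nr_to_scan)
{
	size_t i, reclaimed = 0;

	for (i = 0; i < nr && i < nr_to_scan; i++) {
		struct lru_entry *e = &lru[i];

		if (!e->present || e->pinned)
			continue;
		if (e->referenced) {		/* give it a second chance */
			e->referenced = false;
			continue;
		}
		e->present = false;		/* reclaim */
		reclaimed++;
	}
	return reclaimed;
}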
694 693
695 static void __wait_on_freeing_inode(struct inode *inode); 694 static void __wait_on_freeing_inode(struct inode *inode);
696 /* 695 /*
697 * Called with the inode lock held. 696 * Called with the inode lock held.
698 */ 697 */
699 static struct inode *find_inode(struct super_block *sb, 698 static struct inode *find_inode(struct super_block *sb,
700 struct hlist_head *head, 699 struct hlist_head *head,
701 int (*test)(struct inode *, void *), 700 int (*test)(struct inode *, void *),
702 void *data) 701 void *data)
703 { 702 {
704 struct hlist_node *node; 703 struct hlist_node *node;
705 struct inode *inode = NULL; 704 struct inode *inode = NULL;
706 705
707 repeat: 706 repeat:
708 hlist_for_each_entry(inode, node, head, i_hash) { 707 hlist_for_each_entry(inode, node, head, i_hash) {
709 spin_lock(&inode->i_lock); 708 spin_lock(&inode->i_lock);
710 if (inode->i_sb != sb) { 709 if (inode->i_sb != sb) {
711 spin_unlock(&inode->i_lock); 710 spin_unlock(&inode->i_lock);
712 continue; 711 continue;
713 } 712 }
714 if (!test(inode, data)) { 713 if (!test(inode, data)) {
715 spin_unlock(&inode->i_lock); 714 spin_unlock(&inode->i_lock);
716 continue; 715 continue;
717 } 716 }
718 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 717 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
719 __wait_on_freeing_inode(inode); 718 __wait_on_freeing_inode(inode);
720 goto repeat; 719 goto repeat;
721 } 720 }
722 __iget(inode); 721 __iget(inode);
723 spin_unlock(&inode->i_lock); 722 spin_unlock(&inode->i_lock);
724 return inode; 723 return inode;
725 } 724 }
726 return NULL; 725 return NULL;
727 } 726 }
728 727
729 /* 728 /*
730 * find_inode_fast is the fast path version of find_inode, see the comment at 729 * find_inode_fast is the fast path version of find_inode, see the comment at
731 * iget_locked for details. 730 * iget_locked for details.
732 */ 731 */
733 static struct inode *find_inode_fast(struct super_block *sb, 732 static struct inode *find_inode_fast(struct super_block *sb,
734 struct hlist_head *head, unsigned long ino) 733 struct hlist_head *head, unsigned long ino)
735 { 734 {
736 struct hlist_node *node; 735 struct hlist_node *node;
737 struct inode *inode = NULL; 736 struct inode *inode = NULL;
738 737
739 repeat: 738 repeat:
740 hlist_for_each_entry(inode, node, head, i_hash) { 739 hlist_for_each_entry(inode, node, head, i_hash) {
741 spin_lock(&inode->i_lock); 740 spin_lock(&inode->i_lock);
742 if (inode->i_ino != ino) { 741 if (inode->i_ino != ino) {
743 spin_unlock(&inode->i_lock); 742 spin_unlock(&inode->i_lock);
744 continue; 743 continue;
745 } 744 }
746 if (inode->i_sb != sb) { 745 if (inode->i_sb != sb) {
747 spin_unlock(&inode->i_lock); 746 spin_unlock(&inode->i_lock);
748 continue; 747 continue;
749 } 748 }
750 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 749 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
751 __wait_on_freeing_inode(inode); 750 __wait_on_freeing_inode(inode);
752 goto repeat; 751 goto repeat;
753 } 752 }
754 __iget(inode); 753 __iget(inode);
755 spin_unlock(&inode->i_lock); 754 spin_unlock(&inode->i_lock);
756 return inode; 755 return inode;
757 } 756 }
758 return NULL; 757 return NULL;
759 } 758 }
760 759
761 /* 760 /*
762 * Each cpu owns a range of LAST_INO_BATCH numbers. 761 * Each cpu owns a range of LAST_INO_BATCH numbers.
763 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, 762 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
764 * to renew the exhausted range. 763 * to renew the exhausted range.
765 * 764 *
766 * This does not significantly increase overflow rate because every CPU can 765 * This does not significantly increase overflow rate because every CPU can
767 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is 766 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
768 * NR_CPUS*(LAST_INO_BATCH-1) wastage. With NR_CPUS=4096 and LAST_INO_BATCH=1024, this is ~0.1% of the 767 * NR_CPUS*(LAST_INO_BATCH-1) wastage. With NR_CPUS=4096 and LAST_INO_BATCH=1024, this is ~0.1% of the
769 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase 768 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
770 * overflow rate by 2x, which does not seem too significant. 769 * overflow rate by 2x, which does not seem too significant.
771 * 770 *
772 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW 771 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
773 * error if st_ino won't fit in target struct field. Use 32bit counter 772 * error if st_ino won't fit in target struct field. Use 32bit counter
774 * here to attempt to avoid that. 773 * here to attempt to avoid that.
775 */ 774 */
776 #define LAST_INO_BATCH 1024 775 #define LAST_INO_BATCH 1024
777 static DEFINE_PER_CPU(unsigned int, last_ino); 776 static DEFINE_PER_CPU(unsigned int, last_ino);
778 777
779 unsigned int get_next_ino(void) 778 unsigned int get_next_ino(void)
780 { 779 {
781 unsigned int *p = &get_cpu_var(last_ino); 780 unsigned int *p = &get_cpu_var(last_ino);
782 unsigned int res = *p; 781 unsigned int res = *p;
783 782
784 #ifdef CONFIG_SMP 783 #ifdef CONFIG_SMP
785 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { 784 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
786 static atomic_t shared_last_ino; 785 static atomic_t shared_last_ino;
787 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); 786 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
788 787
789 res = next - LAST_INO_BATCH; 788 res = next - LAST_INO_BATCH;
790 } 789 }
791 #endif 790 #endif
792 791
793 *p = ++res; 792 *p = ++res;
794 put_cpu_var(last_ino); 793 put_cpu_var(last_ino);
795 return res; 794 return res;
796 } 795 }
797 EXPORT_SYMBOL(get_next_ino); 796 EXPORT_SYMBOL(get_next_ino);
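
/*
 * Sketch: how an in-memory filesystem with no stable on-disk inode numbers
 * might consume the batched counter above when creating an inode.
 * foofs_get_inode() is hypothetical; new_inode() and get_next_ino() are the
 * real interfaces, and CURRENT_TIME is the usual in-core timestamp macro.
 */
static struct inode *foofs_get_inode(struct super_block *sb, int mode)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		inode->i_ino = get_next_ino();
		inode->i_mode = mode;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	}
	return inode;
}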
798 797
799 /** 798 /**
800 * new_inode - obtain an inode 799 * new_inode - obtain an inode
801 * @sb: superblock 800 * @sb: superblock
802 * 801 *
803 * Allocates a new inode for given superblock. The default gfp_mask 802 * Allocates a new inode for given superblock. The default gfp_mask
804 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. 803 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
805 * If HIGHMEM pages are unsuitable or it is known that pages allocated 804 * If HIGHMEM pages are unsuitable or it is known that pages allocated
806 * for the page cache are not reclaimable or migratable, 805 * for the page cache are not reclaimable or migratable,
807 * mapping_set_gfp_mask() must be called with suitable flags on the 806 * mapping_set_gfp_mask() must be called with suitable flags on the
808 * newly created inode's mapping. 807 * newly created inode's mapping.
809 * 808 *
810 */ 809 */
811 struct inode *new_inode(struct super_block *sb) 810 struct inode *new_inode(struct super_block *sb)
812 { 811 {
813 struct inode *inode; 812 struct inode *inode;
814 813
815 spin_lock_prefetch(&inode_sb_list_lock); 814 spin_lock_prefetch(&inode_sb_list_lock);
816 815
817 inode = alloc_inode(sb); 816 inode = alloc_inode(sb);
818 if (inode) { 817 if (inode) {
819 spin_lock(&inode->i_lock); 818 spin_lock(&inode->i_lock);
820 inode->i_state = 0; 819 inode->i_state = 0;
821 spin_unlock(&inode->i_lock); 820 spin_unlock(&inode->i_lock);
822 inode_sb_list_add(inode); 821 inode_sb_list_add(inode);
823 } 822 }
824 return inode; 823 return inode;
825 } 824 }
826 EXPORT_SYMBOL(new_inode); 825 EXPORT_SYMBOL(new_inode);
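
/*
 * Sketch: as the kernel-doc above suggests, a filesystem whose page cache
 * pages must not come from highmem can narrow the mapping's gfp mask right
 * after allocation. foofs_new_inode() is hypothetical; GFP_USER simply lacks
 * __GFP_HIGHMEM and __GFP_MOVABLE.
 */
static struct inode *foofs_new_inode(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (inode)
		mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
	return inode;
}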
827 826
828 /** 827 /**
829 * unlock_new_inode - clear the I_NEW state and wake up any waiters 828 * unlock_new_inode - clear the I_NEW state and wake up any waiters
830 * @inode: new inode to unlock 829 * @inode: new inode to unlock
831 * 830 *
832 * Called when the inode is fully initialised to clear the new state of the 831 * Called when the inode is fully initialised to clear the new state of the
833 * inode and wake up anyone waiting for the inode to finish initialisation. 832 * inode and wake up anyone waiting for the inode to finish initialisation.
834 */ 833 */
835 void unlock_new_inode(struct inode *inode) 834 void unlock_new_inode(struct inode *inode)
836 { 835 {
837 #ifdef CONFIG_DEBUG_LOCK_ALLOC 836 #ifdef CONFIG_DEBUG_LOCK_ALLOC
838 if (S_ISDIR(inode->i_mode)) { 837 if (S_ISDIR(inode->i_mode)) {
839 struct file_system_type *type = inode->i_sb->s_type; 838 struct file_system_type *type = inode->i_sb->s_type;
840 839
841 /* Set new key only if filesystem hasn't already changed it */ 840 /* Set new key only if filesystem hasn't already changed it */
842 if (!lockdep_match_class(&inode->i_mutex, 841 if (!lockdep_match_class(&inode->i_mutex,
843 &type->i_mutex_key)) { 842 &type->i_mutex_key)) {
844 /* 843 /*
845 * ensure nobody is actually holding i_mutex 844 * ensure nobody is actually holding i_mutex
846 */ 845 */
847 mutex_destroy(&inode->i_mutex); 846 mutex_destroy(&inode->i_mutex);
848 mutex_init(&inode->i_mutex); 847 mutex_init(&inode->i_mutex);
849 lockdep_set_class(&inode->i_mutex, 848 lockdep_set_class(&inode->i_mutex,
850 &type->i_mutex_dir_key); 849 &type->i_mutex_dir_key);
851 } 850 }
852 } 851 }
853 #endif 852 #endif
854 spin_lock(&inode->i_lock); 853 spin_lock(&inode->i_lock);
855 WARN_ON(!(inode->i_state & I_NEW)); 854 WARN_ON(!(inode->i_state & I_NEW));
856 inode->i_state &= ~I_NEW; 855 inode->i_state &= ~I_NEW;
857 wake_up_bit(&inode->i_state, __I_NEW); 856 wake_up_bit(&inode->i_state, __I_NEW);
858 spin_unlock(&inode->i_lock); 857 spin_unlock(&inode->i_lock);
859 } 858 }
860 EXPORT_SYMBOL(unlock_new_inode); 859 EXPORT_SYMBOL(unlock_new_inode);
861 860
862 /** 861 /**
863 * iget5_locked - obtain an inode from a mounted file system 862 * iget5_locked - obtain an inode from a mounted file system
864 * @sb: super block of file system 863 * @sb: super block of file system
865 * @hashval: hash value (usually inode number) to get 864 * @hashval: hash value (usually inode number) to get
866 * @test: callback used for comparisons between inodes 865 * @test: callback used for comparisons between inodes
867 * @set: callback used to initialize a new struct inode 866 * @set: callback used to initialize a new struct inode
868 * @data: opaque data pointer to pass to @test and @set 867 * @data: opaque data pointer to pass to @test and @set
869 * 868 *
870 * Search for the inode specified by @hashval and @data in the inode cache, 869 * Search for the inode specified by @hashval and @data in the inode cache,
871 * and if present it is returned with an increased reference count. This is 870 * and if present it is returned with an increased reference count. This is
872 * a generalized version of iget_locked() for file systems where the inode 871 * a generalized version of iget_locked() for file systems where the inode
873 * number is not sufficient for unique identification of an inode. 872 * number is not sufficient for unique identification of an inode.
874 * 873 *
875 * If the inode is not in cache, allocate a new inode and return it locked, 874 * If the inode is not in cache, allocate a new inode and return it locked,
876 * hashed, and with the I_NEW flag set. The file system gets to fill it in 875 * hashed, and with the I_NEW flag set. The file system gets to fill it in
877 * before unlocking it via unlock_new_inode(). 876 * before unlocking it via unlock_new_inode().
878 * 877 *
879 * Note both @test and @set are called with the inode_hash_lock held, so can't 878 * Note both @test and @set are called with the inode_hash_lock held, so can't
880 * sleep. 879 * sleep.
881 */ 880 */
882 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, 881 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
883 int (*test)(struct inode *, void *), 882 int (*test)(struct inode *, void *),
884 int (*set)(struct inode *, void *), void *data) 883 int (*set)(struct inode *, void *), void *data)
885 { 884 {
886 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 885 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
887 struct inode *inode; 886 struct inode *inode;
888 887
889 spin_lock(&inode_hash_lock); 888 spin_lock(&inode_hash_lock);
890 inode = find_inode(sb, head, test, data); 889 inode = find_inode(sb, head, test, data);
891 spin_unlock(&inode_hash_lock); 890 spin_unlock(&inode_hash_lock);
892 891
893 if (inode) { 892 if (inode) {
894 wait_on_inode(inode); 893 wait_on_inode(inode);
895 return inode; 894 return inode;
896 } 895 }
897 896
898 inode = alloc_inode(sb); 897 inode = alloc_inode(sb);
899 if (inode) { 898 if (inode) {
900 struct inode *old; 899 struct inode *old;
901 900
902 spin_lock(&inode_hash_lock); 901 spin_lock(&inode_hash_lock);
903 /* We released the lock, so.. */ 902 /* We released the lock, so.. */
904 old = find_inode(sb, head, test, data); 903 old = find_inode(sb, head, test, data);
905 if (!old) { 904 if (!old) {
906 if (set(inode, data)) 905 if (set(inode, data))
907 goto set_failed; 906 goto set_failed;
908 907
909 spin_lock(&inode->i_lock); 908 spin_lock(&inode->i_lock);
910 inode->i_state = I_NEW; 909 inode->i_state = I_NEW;
911 hlist_add_head(&inode->i_hash, head); 910 hlist_add_head(&inode->i_hash, head);
912 spin_unlock(&inode->i_lock); 911 spin_unlock(&inode->i_lock);
913 inode_sb_list_add(inode); 912 inode_sb_list_add(inode);
914 spin_unlock(&inode_hash_lock); 913 spin_unlock(&inode_hash_lock);
915 914
916 /* Return the locked inode with I_NEW set; the 915 /* Return the locked inode with I_NEW set; the
917 * caller is responsible for filling in the contents 916 * caller is responsible for filling in the contents
918 */ 917 */
919 return inode; 918 return inode;
920 } 919 }
921 920
922 /* 921 /*
923 * Uhhuh, somebody else created the same inode under 922 * Uhhuh, somebody else created the same inode under
924 * us. Use the old inode instead of the one we just 923 * us. Use the old inode instead of the one we just
925 * allocated. 924 * allocated.
926 */ 925 */
927 spin_unlock(&inode_hash_lock); 926 spin_unlock(&inode_hash_lock);
928 destroy_inode(inode); 927 destroy_inode(inode);
929 inode = old; 928 inode = old;
930 wait_on_inode(inode); 929 wait_on_inode(inode);
931 } 930 }
932 return inode; 931 return inode;
933 932
934 set_failed: 933 set_failed:
935 spin_unlock(&inode_hash_lock); 934 spin_unlock(&inode_hash_lock);
936 destroy_inode(inode); 935 destroy_inode(inode);
937 return NULL; 936 return NULL;
938 } 937 }
939 EXPORT_SYMBOL(iget5_locked); 938 EXPORT_SYMBOL(iget5_locked);
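
/*
 * Sketch: a lookup keyed by something wider than i_ino, in the style the
 * kernel-doc above describes. All foofs_* names, FOOFS_I() and the on-disk
 * read step are hypothetical; iget5_locked() and unlock_new_inode() are the
 * interfaces defined here. The test/set callbacks run under inode_hash_lock
 * and therefore must not sleep.
 */
struct foofs_inode_info {
	u64 objid;
	struct inode vfs_inode;
};

#define FOOFS_I(inode) container_of(inode, struct foofs_inode_info, vfs_inode)

static int foofs_test(struct inode *inode, void *data)
{
	return FOOFS_I(inode)->objid == *(u64 *)data;
}

static int foofs_set(struct inode *inode, void *data)
{
	FOOFS_I(inode)->objid = *(u64 *)data;
	return 0;
}

static struct inode *foofs_iget(struct super_block *sb, u64 objid)
{
	struct inode *inode;

	inode = iget5_locked(sb, (unsigned long)objid, foofs_test, foofs_set,
			     &objid);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		/* ... read the object from disk and fill the inode here ... */
		unlock_new_inode(inode);
	}
	return inode;
}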
940 939
941 /** 940 /**
942 * iget_locked - obtain an inode from a mounted file system 941 * iget_locked - obtain an inode from a mounted file system
943 * @sb: super block of file system 942 * @sb: super block of file system
944 * @ino: inode number to get 943 * @ino: inode number to get
945 * 944 *
946 * Search for the inode specified by @ino in the inode cache and if present 945 * Search for the inode specified by @ino in the inode cache and if present
947 * return it with an increased reference count. This is for file systems 946 * return it with an increased reference count. This is for file systems
948 * where the inode number is sufficient for unique identification of an inode. 947 * where the inode number is sufficient for unique identification of an inode.
949 * 948 *
950 * If the inode is not in cache, allocate a new inode and return it locked, 949 * If the inode is not in cache, allocate a new inode and return it locked,
951 * hashed, and with the I_NEW flag set. The file system gets to fill it in 950 * hashed, and with the I_NEW flag set. The file system gets to fill it in
952 * before unlocking it via unlock_new_inode(). 951 * before unlocking it via unlock_new_inode().
953 */ 952 */
954 struct inode *iget_locked(struct super_block *sb, unsigned long ino) 953 struct inode *iget_locked(struct super_block *sb, unsigned long ino)
955 { 954 {
956 struct hlist_head *head = inode_hashtable + hash(sb, ino); 955 struct hlist_head *head = inode_hashtable + hash(sb, ino);
957 struct inode *inode; 956 struct inode *inode;
958 957
959 spin_lock(&inode_hash_lock); 958 spin_lock(&inode_hash_lock);
960 inode = find_inode_fast(sb, head, ino); 959 inode = find_inode_fast(sb, head, ino);
961 spin_unlock(&inode_hash_lock); 960 spin_unlock(&inode_hash_lock);
962 if (inode) { 961 if (inode) {
963 wait_on_inode(inode); 962 wait_on_inode(inode);
964 return inode; 963 return inode;
965 } 964 }
966 965
967 inode = alloc_inode(sb); 966 inode = alloc_inode(sb);
968 if (inode) { 967 if (inode) {
969 struct inode *old; 968 struct inode *old;
970 969
971 spin_lock(&inode_hash_lock); 970 spin_lock(&inode_hash_lock);
972 /* We released the lock, so.. */ 971 /* We released the lock, so.. */
973 old = find_inode_fast(sb, head, ino); 972 old = find_inode_fast(sb, head, ino);
974 if (!old) { 973 if (!old) {
975 inode->i_ino = ino; 974 inode->i_ino = ino;
976 spin_lock(&inode->i_lock); 975 spin_lock(&inode->i_lock);
977 inode->i_state = I_NEW; 976 inode->i_state = I_NEW;
978 hlist_add_head(&inode->i_hash, head); 977 hlist_add_head(&inode->i_hash, head);
979 spin_unlock(&inode->i_lock); 978 spin_unlock(&inode->i_lock);
980 inode_sb_list_add(inode); 979 inode_sb_list_add(inode);
981 spin_unlock(&inode_hash_lock); 980 spin_unlock(&inode_hash_lock);
982 981
983 /* Return the locked inode with I_NEW set; the 982 /* Return the locked inode with I_NEW set; the
984 * caller is responsible for filling in the contents 983 * caller is responsible for filling in the contents
985 */ 984 */
986 return inode; 985 return inode;
987 } 986 }
988 987
989 /* 988 /*
990 * Uhhuh, somebody else created the same inode under 989 * Uhhuh, somebody else created the same inode under
991 * us. Use the old inode instead of the one we just 990 * us. Use the old inode instead of the one we just
992 * allocated. 991 * allocated.
993 */ 992 */
994 spin_unlock(&inode_hash_lock); 993 spin_unlock(&inode_hash_lock);
995 destroy_inode(inode); 994 destroy_inode(inode);
996 inode = old; 995 inode = old;
997 wait_on_inode(inode); 996 wait_on_inode(inode);
998 } 997 }
999 return inode; 998 return inode;
1000 } 999 }
1001 EXPORT_SYMBOL(iget_locked); 1000 EXPORT_SYMBOL(iget_locked);
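
/*
 * Sketch: the usual lookup-or-create pattern for a filesystem where the
 * inode number alone is the key. barfs_read_inode() is a hypothetical helper
 * that fills in a freshly allocated inode from disk.
 */
static struct inode *barfs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;			/* cache hit, already set up */

	barfs_read_inode(inode);		/* hypothetical: fill from disk */
	unlock_new_inode(inode);		/* clear I_NEW and wake waiters */
	return inode;
}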
1002 1001
1003 /* 1002 /*
1004 * search the inode cache for a matching inode number. 1003 * search the inode cache for a matching inode number.
1005 * If we find one, then the inode number we are trying to 1004 * If we find one, then the inode number we are trying to
1006 * allocate is not unique and so we should not use it. 1005 * allocate is not unique and so we should not use it.
1007 * 1006 *
1008 * Returns 1 if the inode number is unique, 0 if it is not. 1007 * Returns 1 if the inode number is unique, 0 if it is not.
1009 */ 1008 */
1010 static int test_inode_iunique(struct super_block *sb, unsigned long ino) 1009 static int test_inode_iunique(struct super_block *sb, unsigned long ino)
1011 { 1010 {
1012 struct hlist_head *b = inode_hashtable + hash(sb, ino); 1011 struct hlist_head *b = inode_hashtable + hash(sb, ino);
1013 struct hlist_node *node; 1012 struct hlist_node *node;
1014 struct inode *inode; 1013 struct inode *inode;
1015 1014
1016 spin_lock(&inode_hash_lock); 1015 spin_lock(&inode_hash_lock);
1017 hlist_for_each_entry(inode, node, b, i_hash) { 1016 hlist_for_each_entry(inode, node, b, i_hash) {
1018 if (inode->i_ino == ino && inode->i_sb == sb) { 1017 if (inode->i_ino == ino && inode->i_sb == sb) {
1019 spin_unlock(&inode_hash_lock); 1018 spin_unlock(&inode_hash_lock);
1020 return 0; 1019 return 0;
1021 } 1020 }
1022 } 1021 }
1023 spin_unlock(&inode_hash_lock); 1022 spin_unlock(&inode_hash_lock);
1024 1023
1025 return 1; 1024 return 1;
1026 } 1025 }
1027 1026
1028 /** 1027 /**
1029 * iunique - get a unique inode number 1028 * iunique - get a unique inode number
1030 * @sb: superblock 1029 * @sb: superblock
1031 * @max_reserved: highest reserved inode number 1030 * @max_reserved: highest reserved inode number
1032 * 1031 *
1033 * Obtain an inode number that is unique on the system for a given 1032 * Obtain an inode number that is unique on the system for a given
1034 * superblock. This is used by file systems that have no natural 1033 * superblock. This is used by file systems that have no natural
1035 * permanent inode numbering system. An inode number is returned that 1034 * permanent inode numbering system. An inode number is returned that
1036 * is higher than the reserved limit but unique. 1035 * is higher than the reserved limit but unique.
1037 * 1036 *
1038 * BUGS: 1037 * BUGS:
1039 * With a large number of inodes live on the file system this function 1038 * With a large number of inodes live on the file system this function
1040 * currently becomes quite slow. 1039 * currently becomes quite slow.
1041 */ 1040 */
1042 ino_t iunique(struct super_block *sb, ino_t max_reserved) 1041 ino_t iunique(struct super_block *sb, ino_t max_reserved)
1043 { 1042 {
1044 /* 1043 /*
1045 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW 1044 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
1046 * error if st_ino won't fit in target struct field. Use 32bit counter 1045 * error if st_ino won't fit in target struct field. Use 32bit counter
1047 * here to attempt to avoid that. 1046 * here to attempt to avoid that.
1048 */ 1047 */
1049 static DEFINE_SPINLOCK(iunique_lock); 1048 static DEFINE_SPINLOCK(iunique_lock);
1050 static unsigned int counter; 1049 static unsigned int counter;
1051 ino_t res; 1050 ino_t res;
1052 1051
1053 spin_lock(&iunique_lock); 1052 spin_lock(&iunique_lock);
1054 do { 1053 do {
1055 if (counter <= max_reserved) 1054 if (counter <= max_reserved)
1056 counter = max_reserved + 1; 1055 counter = max_reserved + 1;
1057 res = counter++; 1056 res = counter++;
1058 } while (!test_inode_iunique(sb, res)); 1057 } while (!test_inode_iunique(sb, res));
1059 spin_unlock(&iunique_lock); 1058 spin_unlock(&iunique_lock);
1060 1059
1061 return res; 1060 return res;
1062 } 1061 }
1063 EXPORT_SYMBOL(iunique); 1062 EXPORT_SYMBOL(iunique);
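
/*
 * Sketch: a filesystem with no permanent numbering scheme reserving its first
 * few inode numbers and letting iunique() hand out the rest.
 * FOOFS_LAST_RESERVED_INO and foofs_assign_ino() are hypothetical.
 */
#define FOOFS_LAST_RESERVED_INO 15

static void foofs_assign_ino(struct inode *inode)
{
	inode->i_ino = iunique(inode->i_sb, FOOFS_LAST_RESERVED_INO);
}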
1064 1063
1065 struct inode *igrab(struct inode *inode) 1064 struct inode *igrab(struct inode *inode)
1066 { 1065 {
1067 spin_lock(&inode->i_lock); 1066 spin_lock(&inode->i_lock);
1068 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { 1067 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1069 __iget(inode); 1068 __iget(inode);
1070 spin_unlock(&inode->i_lock); 1069 spin_unlock(&inode->i_lock);
1071 } else { 1070 } else {
1072 spin_unlock(&inode->i_lock); 1071 spin_unlock(&inode->i_lock);
1073 /* 1072 /*
1074 * Handle the case where s_op->clear_inode has not been 1073 * Handle the case where s_op->clear_inode has not been
1075 * called yet, and somebody is calling igrab 1074 * called yet, and somebody is calling igrab
1076 * while the inode is getting freed. 1075 * while the inode is getting freed.
1077 */ 1076 */
1078 inode = NULL; 1077 inode = NULL;
1079 } 1078 }
1080 return inode; 1079 return inode;
1081 } 1080 }
1082 EXPORT_SYMBOL(igrab); 1081 EXPORT_SYMBOL(igrab);
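
/*
 * Sketch: taking a reference via a pointer that does not itself pin the
 * inode (say, an internal tracking list). If the inode is already being
 * freed, igrab() returns NULL and the caller must back off.
 * foofs_poke_inode() is hypothetical.
 */
static int foofs_poke_inode(struct inode *inode)
{
	inode = igrab(inode);
	if (!inode)
		return -ESTALE;		/* inode is on its way out */
	/* ... the inode is now safe to use ... */
	iput(inode);
	return 0;
}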
1083 1082
1084 /** 1083 /**
1085 * ilookup5_nowait - search for an inode in the inode cache 1084 * ilookup5_nowait - search for an inode in the inode cache
1086 * @sb: super block of file system to search 1085 * @sb: super block of file system to search
1087 * @hashval: hash value (usually inode number) to search for 1086 * @hashval: hash value (usually inode number) to search for
1088 * @test: callback used for comparisons between inodes 1087 * @test: callback used for comparisons between inodes
1089 * @data: opaque data pointer to pass to @test 1088 * @data: opaque data pointer to pass to @test
1090 * 1089 *
1091 * Search for the inode specified by @hashval and @data in the inode cache. 1090 * Search for the inode specified by @hashval and @data in the inode cache.
1092 * If the inode is in the cache, the inode is returned with an incremented 1091 * If the inode is in the cache, the inode is returned with an incremented
1093 * reference count. 1092 * reference count.
1094 * 1093 *
1095 * Note: I_NEW is not waited upon so you have to be very careful what you do 1094 * Note: I_NEW is not waited upon so you have to be very careful what you do
1096 * with the returned inode. You probably should be using ilookup5() instead. 1095 * with the returned inode. You probably should be using ilookup5() instead.
1097 * 1096 *
1098 * Note2: @test is called with the inode_hash_lock held, so can't sleep. 1097 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
1099 */ 1098 */
1100 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1099 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1101 int (*test)(struct inode *, void *), void *data) 1100 int (*test)(struct inode *, void *), void *data)
1102 { 1101 {
1103 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1102 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1104 struct inode *inode; 1103 struct inode *inode;
1105 1104
1106 spin_lock(&inode_hash_lock); 1105 spin_lock(&inode_hash_lock);
1107 inode = find_inode(sb, head, test, data); 1106 inode = find_inode(sb, head, test, data);
1108 spin_unlock(&inode_hash_lock); 1107 spin_unlock(&inode_hash_lock);
1109 1108
1110 return inode; 1109 return inode;
1111 } 1110 }
1112 EXPORT_SYMBOL(ilookup5_nowait); 1111 EXPORT_SYMBOL(ilookup5_nowait);
1113 1112
1114 /** 1113 /**
1115 * ilookup5 - search for an inode in the inode cache 1114 * ilookup5 - search for an inode in the inode cache
1116 * @sb: super block of file system to search 1115 * @sb: super block of file system to search
1117 * @hashval: hash value (usually inode number) to search for 1116 * @hashval: hash value (usually inode number) to search for
1118 * @test: callback used for comparisons between inodes 1117 * @test: callback used for comparisons between inodes
1119 * @data: opaque data pointer to pass to @test 1118 * @data: opaque data pointer to pass to @test
1120 * 1119 *
1121 * Search for the inode specified by @hashval and @data in the inode cache, 1120 * Search for the inode specified by @hashval and @data in the inode cache,
1122 * and if the inode is in the cache, return the inode with an incremented 1121 * and if the inode is in the cache, return the inode with an incremented
1123 * reference count. Waits on I_NEW before returning the inode. 1122 * reference count. Waits on I_NEW before returning the inode.
1125 * 1124 *
1126 * This is a generalized version of ilookup() for file systems where the 1125 * This is a generalized version of ilookup() for file systems where the
1127 * inode number is not sufficient for unique identification of an inode. 1126 * inode number is not sufficient for unique identification of an inode.
1128 * 1127 *
1129 * Note: @test is called with the inode_hash_lock held, so can't sleep. 1128 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1130 */ 1129 */
1131 struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1130 struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1132 int (*test)(struct inode *, void *), void *data) 1131 int (*test)(struct inode *, void *), void *data)
1133 { 1132 {
1134 struct inode *inode = ilookup5_nowait(sb, hashval, test, data); 1133 struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1135 1134
1136 if (inode) 1135 if (inode)
1137 wait_on_inode(inode); 1136 wait_on_inode(inode);
1138 return inode; 1137 return inode;
1139 } 1138 }
1140 EXPORT_SYMBOL(ilookup5); 1139 EXPORT_SYMBOL(ilookup5);
1141 1140
1142 /** 1141 /**
1143 * ilookup - search for an inode in the inode cache 1142 * ilookup - search for an inode in the inode cache
1144 * @sb: super block of file system to search 1143 * @sb: super block of file system to search
1145 * @ino: inode number to search for 1144 * @ino: inode number to search for
1146 * 1145 *
1147 * Search for the inode @ino in the inode cache, and if the inode is in the 1146 * Search for the inode @ino in the inode cache, and if the inode is in the
1148 * cache, the inode is returned with an incremented reference count. 1147 * cache, the inode is returned with an incremented reference count.
1149 */ 1148 */
1150 struct inode *ilookup(struct super_block *sb, unsigned long ino) 1149 struct inode *ilookup(struct super_block *sb, unsigned long ino)
1151 { 1150 {
1152 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1151 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1153 struct inode *inode; 1152 struct inode *inode;
1154 1153
1155 spin_lock(&inode_hash_lock); 1154 spin_lock(&inode_hash_lock);
1156 inode = find_inode_fast(sb, head, ino); 1155 inode = find_inode_fast(sb, head, ino);
1157 spin_unlock(&inode_hash_lock); 1156 spin_unlock(&inode_hash_lock);
1158 1157
1159 if (inode) 1158 if (inode)
1160 wait_on_inode(inode); 1159 wait_on_inode(inode);
1161 return inode; 1160 return inode;
1162 } 1161 }
1163 EXPORT_SYMBOL(ilookup); 1162 EXPORT_SYMBOL(ilookup);
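
/*
 * Sketch: probing the cache without instantiating anything, covering the
 * ilookup()/ilookup5() family above. For ilookup5() the test callback could
 * be one like the hypothetical foofs_test() sketched for iget5_locked().
 * foofs_is_cached() is hypothetical.
 */
static int foofs_is_cached(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = ilookup(sb, ino);	/* NULL if not in cache */

	if (!inode)
		return 0;
	/* ... inspect the cached inode here ... */
	iput(inode);
	return 1;
}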
1164 1163
1165 int insert_inode_locked(struct inode *inode) 1164 int insert_inode_locked(struct inode *inode)
1166 { 1165 {
1167 struct super_block *sb = inode->i_sb; 1166 struct super_block *sb = inode->i_sb;
1168 ino_t ino = inode->i_ino; 1167 ino_t ino = inode->i_ino;
1169 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1168 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1170 1169
1171 while (1) { 1170 while (1) {
1172 struct hlist_node *node; 1171 struct hlist_node *node;
1173 struct inode *old = NULL; 1172 struct inode *old = NULL;
1174 spin_lock(&inode_hash_lock); 1173 spin_lock(&inode_hash_lock);
1175 hlist_for_each_entry(old, node, head, i_hash) { 1174 hlist_for_each_entry(old, node, head, i_hash) {
1176 if (old->i_ino != ino) 1175 if (old->i_ino != ino)
1177 continue; 1176 continue;
1178 if (old->i_sb != sb) 1177 if (old->i_sb != sb)
1179 continue; 1178 continue;
1180 spin_lock(&old->i_lock); 1179 spin_lock(&old->i_lock);
1181 if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1180 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1182 spin_unlock(&old->i_lock); 1181 spin_unlock(&old->i_lock);
1183 continue; 1182 continue;
1184 } 1183 }
1185 break; 1184 break;
1186 } 1185 }
1187 if (likely(!node)) { 1186 if (likely(!node)) {
1188 spin_lock(&inode->i_lock); 1187 spin_lock(&inode->i_lock);
1189 inode->i_state |= I_NEW; 1188 inode->i_state |= I_NEW;
1190 hlist_add_head(&inode->i_hash, head); 1189 hlist_add_head(&inode->i_hash, head);
1191 spin_unlock(&inode->i_lock); 1190 spin_unlock(&inode->i_lock);
1192 spin_unlock(&inode_hash_lock); 1191 spin_unlock(&inode_hash_lock);
1193 return 0; 1192 return 0;
1194 } 1193 }
1195 __iget(old); 1194 __iget(old);
1196 spin_unlock(&old->i_lock); 1195 spin_unlock(&old->i_lock);
1197 spin_unlock(&inode_hash_lock); 1196 spin_unlock(&inode_hash_lock);
1198 wait_on_inode(old); 1197 wait_on_inode(old);
1199 if (unlikely(!inode_unhashed(old))) { 1198 if (unlikely(!inode_unhashed(old))) {
1200 iput(old); 1199 iput(old);
1201 return -EBUSY; 1200 return -EBUSY;
1202 } 1201 }
1203 iput(old); 1202 iput(old);
1204 } 1203 }
1205 } 1204 }
1206 EXPORT_SYMBOL(insert_inode_locked); 1205 EXPORT_SYMBOL(insert_inode_locked);
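
/*
 * Sketch: the allocation side of the same hashing protocol, as used on an
 * inode-create path: allocate, pick a number, then try to hash the new
 * inode. A negative return means an inode with that number is already (or
 * still) hashed. foofs_new_inode_nr() is hypothetical.
 */
static struct inode *foofs_new_inode_nr(struct super_block *sb,
					unsigned long ino)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	inode->i_ino = ino;
	if (insert_inode_locked(inode) < 0) {	/* number already in use */
		iput(inode);
		return ERR_PTR(-EBUSY);
	}
	/* ... initialise the rest of the inode, write it out ... */
	unlock_new_inode(inode);
	return inode;
}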
1207 1206
1208 int insert_inode_locked4(struct inode *inode, unsigned long hashval, 1207 int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1209 int (*test)(struct inode *, void *), void *data) 1208 int (*test)(struct inode *, void *), void *data)
1210 { 1209 {
1211 struct super_block *sb = inode->i_sb; 1210 struct super_block *sb = inode->i_sb;
1212 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1211 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1213 1212
1214 while (1) { 1213 while (1) {
1215 struct hlist_node *node; 1214 struct hlist_node *node;
1216 struct inode *old = NULL; 1215 struct inode *old = NULL;
1217 1216
1218 spin_lock(&inode_hash_lock); 1217 spin_lock(&inode_hash_lock);
1219 hlist_for_each_entry(old, node, head, i_hash) { 1218 hlist_for_each_entry(old, node, head, i_hash) {
1220 if (old->i_sb != sb) 1219 if (old->i_sb != sb)
1221 continue; 1220 continue;
1222 if (!test(old, data)) 1221 if (!test(old, data))
1223 continue; 1222 continue;
1224 spin_lock(&old->i_lock); 1223 spin_lock(&old->i_lock);
1225 if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1224 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1226 spin_unlock(&old->i_lock); 1225 spin_unlock(&old->i_lock);
1227 continue; 1226 continue;
1228 } 1227 }
1229 break; 1228 break;
1230 } 1229 }
1231 if (likely(!node)) { 1230 if (likely(!node)) {
1232 spin_lock(&inode->i_lock); 1231 spin_lock(&inode->i_lock);
1233 inode->i_state |= I_NEW; 1232 inode->i_state |= I_NEW;
1234 hlist_add_head(&inode->i_hash, head); 1233 hlist_add_head(&inode->i_hash, head);
1235 spin_unlock(&inode->i_lock); 1234 spin_unlock(&inode->i_lock);
1236 spin_unlock(&inode_hash_lock); 1235 spin_unlock(&inode_hash_lock);
1237 return 0; 1236 return 0;
1238 } 1237 }
1239 __iget(old); 1238 __iget(old);
1240 spin_unlock(&old->i_lock); 1239 spin_unlock(&old->i_lock);
1241 spin_unlock(&inode_hash_lock); 1240 spin_unlock(&inode_hash_lock);
1242 wait_on_inode(old); 1241 wait_on_inode(old);
1243 if (unlikely(!inode_unhashed(old))) { 1242 if (unlikely(!inode_unhashed(old))) {
1244 iput(old); 1243 iput(old);
1245 return -EBUSY; 1244 return -EBUSY;
1246 } 1245 }
1247 iput(old); 1246 iput(old);
1248 } 1247 }
1249 } 1248 }
1250 EXPORT_SYMBOL(insert_inode_locked4); 1249 EXPORT_SYMBOL(insert_inode_locked4);
1251 1250
1252 1251
1253 int generic_delete_inode(struct inode *inode) 1252 int generic_delete_inode(struct inode *inode)
1254 { 1253 {
1255 return 1; 1254 return 1;
1256 } 1255 }
1257 EXPORT_SYMBOL(generic_delete_inode); 1256 EXPORT_SYMBOL(generic_delete_inode);
1258 1257
1259 /* 1258 /*
1260 * Normal UNIX filesystem behaviour: delete the 1259 * Normal UNIX filesystem behaviour: delete the
1261 * inode when the usage count drops to zero, and 1260 * inode when the usage count drops to zero, and
1262 * i_nlink is zero. 1261 * i_nlink is zero.
1263 */ 1262 */
1264 int generic_drop_inode(struct inode *inode) 1263 int generic_drop_inode(struct inode *inode)
1265 { 1264 {
1266 return !inode->i_nlink || inode_unhashed(inode); 1265 return !inode->i_nlink || inode_unhashed(inode);
1267 } 1266 }
1268 EXPORT_SYMBOL_GPL(generic_drop_inode); 1267 EXPORT_SYMBOL_GPL(generic_drop_inode);
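
/*
 * Sketch: a filesystem that does not want unreferenced inodes to linger on
 * the LRU can point ->drop_inode at generic_delete_inode(), so iput_final()
 * below always takes the eviction path. Only the .drop_inode line matters
 * here; the other foofs_* entries are hypothetical placeholders.
 */
static const struct super_operations foofs_super_ops = {
	.statfs		= simple_statfs,
	.drop_inode	= generic_delete_inode,	/* never cache unused inodes */
	.evict_inode	= foofs_evict_inode,	/* hypothetical, sketched above */
};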
1269 1268
1270 /* 1269 /*
1271 * Called when we're dropping the last reference 1270 * Called when we're dropping the last reference
1272 * to an inode. 1271 * to an inode.
1273 * 1272 *
1274 * Call the FS "drop_inode()" function, defaulting to 1273 * Call the FS "drop_inode()" function, defaulting to
1275 * the legacy UNIX filesystem behaviour. If it tells 1274 * the legacy UNIX filesystem behaviour. If it tells
1276 * us to evict the inode, do so. Otherwise, retain the inode 1275 * us to evict the inode, do so. Otherwise, retain the inode
1277 * in the cache if the fs is alive; sync and evict if the fs is 1276 * in the cache if the fs is alive; sync and evict if the fs is
1278 * shutting down. 1277 * shutting down.
1279 */ 1278 */
1280 static void iput_final(struct inode *inode) 1279 static void iput_final(struct inode *inode)
1281 { 1280 {
1282 struct super_block *sb = inode->i_sb; 1281 struct super_block *sb = inode->i_sb;
1283 const struct super_operations *op = inode->i_sb->s_op; 1282 const struct super_operations *op = inode->i_sb->s_op;
1284 int drop; 1283 int drop;
1285 1284
1286 WARN_ON(inode->i_state & I_NEW); 1285 WARN_ON(inode->i_state & I_NEW);
1287 1286
1288 if (op->drop_inode) 1287 if (op->drop_inode)
1289 drop = op->drop_inode(inode); 1288 drop = op->drop_inode(inode);
1290 else 1289 else
1291 drop = generic_drop_inode(inode); 1290 drop = generic_drop_inode(inode);
1292 1291
1293 if (!drop && (sb->s_flags & MS_ACTIVE)) { 1292 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1294 inode->i_state |= I_REFERENCED; 1293 inode->i_state |= I_REFERENCED;
1295 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1294 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1296 inode_lru_list_add(inode); 1295 inode_lru_list_add(inode);
1297 spin_unlock(&inode->i_lock); 1296 spin_unlock(&inode->i_lock);
1298 return; 1297 return;
1299 } 1298 }
1300 1299
1301 if (!drop) { 1300 if (!drop) {
1302 inode->i_state |= I_WILL_FREE; 1301 inode->i_state |= I_WILL_FREE;
1303 spin_unlock(&inode->i_lock); 1302 spin_unlock(&inode->i_lock);
1304 write_inode_now(inode, 1); 1303 write_inode_now(inode, 1);
1305 spin_lock(&inode->i_lock); 1304 spin_lock(&inode->i_lock);
1306 WARN_ON(inode->i_state & I_NEW); 1305 WARN_ON(inode->i_state & I_NEW);
1307 inode->i_state &= ~I_WILL_FREE; 1306 inode->i_state &= ~I_WILL_FREE;
1308 } 1307 }
1309 1308
1310 inode->i_state |= I_FREEING; 1309 inode->i_state |= I_FREEING;
1311 inode_lru_list_del(inode); 1310 inode_lru_list_del(inode);
1312 spin_unlock(&inode->i_lock); 1311 spin_unlock(&inode->i_lock);
1313 1312
1314 evict(inode); 1313 evict(inode);
1315 } 1314 }
1316 1315
1317 /** 1316 /**
1318 * iput - put an inode 1317 * iput - put an inode
1319 * @inode: inode to put 1318 * @inode: inode to put
1320 * 1319 *
1321 * Puts an inode, dropping its usage count. If the inode use count hits 1320 * Puts an inode, dropping its usage count. If the inode use count hits
1322 * zero, the inode is then freed and may also be destroyed. 1321 * zero, the inode is then freed and may also be destroyed.
1323 * 1322 *
1324 * Consequently, iput() can sleep. 1323 * Consequently, iput() can sleep.
1325 */ 1324 */
1326 void iput(struct inode *inode) 1325 void iput(struct inode *inode)
1327 { 1326 {
1328 if (inode) { 1327 if (inode) {
1329 BUG_ON(inode->i_state & I_CLEAR); 1328 BUG_ON(inode->i_state & I_CLEAR);
1330 1329
1331 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) 1330 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1332 iput_final(inode); 1331 iput_final(inode);
1333 } 1332 }
1334 } 1333 }
1335 EXPORT_SYMBOL(iput); 1334 EXPORT_SYMBOL(iput);
1336 1335
1337 /** 1336 /**
1338 * bmap - find a block number in a file 1337 * bmap - find a block number in a file
1339 * @inode: inode of file 1338 * @inode: inode of file
1340 * @block: block to find 1339 * @block: block to find
1341 * 1340 *
1342 * Returns the block number on the device holding the inode that 1341 * Returns the block number on the device holding the inode that
1343 * is the disk block number for the block of the file requested. 1342 * is the disk block number for the block of the file requested.
1344 * That is, if asked for block 4 of inode 1, the function will return the 1343 * That is, if asked for block 4 of inode 1, the function will return the
1345 * disk block relative to the disk start that holds that block of the 1344 * disk block relative to the disk start that holds that block of the
1346 * file. 1345 * file.
1347 */ 1346 */
1348 sector_t bmap(struct inode *inode, sector_t block) 1347 sector_t bmap(struct inode *inode, sector_t block)
1349 { 1348 {
1350 sector_t res = 0; 1349 sector_t res = 0;
1351 if (inode->i_mapping->a_ops->bmap) 1350 if (inode->i_mapping->a_ops->bmap)
1352 res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); 1351 res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
1353 return res; 1352 return res;
1354 } 1353 }
1355 EXPORT_SYMBOL(bmap); 1354 EXPORT_SYMBOL(bmap);
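
/*
 * Sketch: mapping a file-relative block to a device-relative block, as the
 * kernel-doc above describes. A return of 0 means the block could not be
 * mapped (a hole, or no ->bmap method). foofs_show_block() is hypothetical.
 */
static void foofs_show_block(struct inode *inode, sector_t file_block)
{
	sector_t phys = bmap(inode, file_block);

	if (phys)
		printk(KERN_DEBUG "block %llu of inode %lu is at device block %llu\n",
		       (unsigned long long)file_block, inode->i_ino,
		       (unsigned long long)phys);
	else
		printk(KERN_DEBUG "block %llu is a hole or cannot be mapped\n",
		       (unsigned long long)file_block);
}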
1356 1355
1357 /* 1356 /*
1358 * With relative atime, only update atime if the previous atime is 1357 * With relative atime, only update atime if the previous atime is
1359 * earlier than either the ctime or mtime or if at least a day has 1358 * earlier than either the ctime or mtime or if at least a day has
1360 * passed since the last atime update. 1359 * passed since the last atime update.
1361 */ 1360 */
1362 static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, 1361 static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1363 struct timespec now) 1362 struct timespec now)
1364 { 1363 {
1365 1364
1366 if (!(mnt->mnt_flags & MNT_RELATIME)) 1365 if (!(mnt->mnt_flags & MNT_RELATIME))
1367 return 1; 1366 return 1;
1368 /* 1367 /*
1369 * Is mtime younger than atime? If yes, update atime: 1368 * Is mtime younger than atime? If yes, update atime:
1370 */ 1369 */
1371 if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) 1370 if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
1372 return 1; 1371 return 1;
1373 /* 1372 /*
1374 * Is ctime younger than atime? If yes, update atime: 1373 * Is ctime younger than atime? If yes, update atime:
1375 */ 1374 */
1376 if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) 1375 if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
1377 return 1; 1376 return 1;
1378 1377
1379 /* 1378 /*
1380 * Is the previous atime value older than a day? If yes, 1379 * Is the previous atime value older than a day? If yes,
1381 * update atime: 1380 * update atime:
1382 */ 1381 */
1383 if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) 1382 if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
1384 return 1; 1383 return 1;
1385 /* 1384 /*
1386 * Good, we can skip the atime update: 1385 * Good, we can skip the atime update:
1387 */ 1386 */
1388 return 0; 1387 return 0;
1389 } 1388 }
1390 1389
1391 /** 1390 /**
1392 * touch_atime - update the access time 1391 * touch_atime - update the access time
1393 * @mnt: mount the inode is accessed on 1392 * @mnt: mount the inode is accessed on
1394 * @dentry: dentry accessed 1393 * @dentry: dentry accessed
1395 * 1394 *
1396 * Update the accessed time on an inode and mark it for writeback. 1395 * Update the accessed time on an inode and mark it for writeback.
1397 * This function automatically handles read only file systems and media, 1396 * This function automatically handles read only file systems and media,
1398 * as well as the "noatime" flag and inode specific "noatime" markers. 1397 * as well as the "noatime" flag and inode specific "noatime" markers.
1399 */ 1398 */
1400 void touch_atime(struct vfsmount *mnt, struct dentry *dentry) 1399 void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1401 { 1400 {
1402 struct inode *inode = dentry->d_inode; 1401 struct inode *inode = dentry->d_inode;
1403 struct timespec now; 1402 struct timespec now;
1404 1403
1405 if (inode->i_flags & S_NOATIME) 1404 if (inode->i_flags & S_NOATIME)
1406 return; 1405 return;
1407 if (IS_NOATIME(inode)) 1406 if (IS_NOATIME(inode))
1408 return; 1407 return;
1409 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) 1408 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1410 return; 1409 return;
1411 1410
1412 if (mnt->mnt_flags & MNT_NOATIME) 1411 if (mnt->mnt_flags & MNT_NOATIME)
1413 return; 1412 return;
1414 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1413 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1415 return; 1414 return;
1416 1415
1417 now = current_fs_time(inode->i_sb); 1416 now = current_fs_time(inode->i_sb);
1418 1417
1419 if (!relatime_need_update(mnt, inode, now)) 1418 if (!relatime_need_update(mnt, inode, now))
1420 return; 1419 return;
1421 1420
1422 if (timespec_equal(&inode->i_atime, &now)) 1421 if (timespec_equal(&inode->i_atime, &now))
1423 return; 1422 return;
1424 1423
1425 if (mnt_want_write(mnt)) 1424 if (mnt_want_write(mnt))
1426 return; 1425 return;
1427 1426
1428 inode->i_atime = now; 1427 inode->i_atime = now;
1429 mark_inode_dirty_sync(inode); 1428 mark_inode_dirty_sync(inode);
1430 mnt_drop_write(mnt); 1429 mnt_drop_write(mnt);
1431 } 1430 }
1432 EXPORT_SYMBOL(touch_atime); 1431 EXPORT_SYMBOL(touch_atime);
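/*
 * Sketch of a typical caller (illustrative only, assuming a read path
 * that holds a valid struct file *filp):
 *
 *	touch_atime(filp->f_path.mnt, filp->f_path.dentry);
 */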
1433 1432
1434 /** 1433 /**
1435 * file_update_time - update mtime and ctime time 1434 * file_update_time - update mtime and ctime time
1436 * @file: file accessed 1435 * @file: file accessed
1437 * 1436 *
1438 * Update the mtime and ctime members of an inode and mark the inode 1437 * Update the mtime and ctime members of an inode and mark the inode
1439 * for writeback. Note that this function is meant exclusively for 1438 * for writeback. Note that this function is meant exclusively for
1440 * usage in the file write path of filesystems, and filesystems may 1439 * usage in the file write path of filesystems, and filesystems may
1441 * choose to explicitly ignore updates via this function with the 1440 * choose to explicitly ignore updates via this function with the
1442 * S_NOCMTIME inode flag, e.g. for network filesystems where these 1441 * S_NOCMTIME inode flag, e.g. for network filesystems where these
1443 * timestamps are handled by the server. 1442 * timestamps are handled by the server.
1444 */ 1443 */
1445 1444
1446 void file_update_time(struct file *file) 1445 void file_update_time(struct file *file)
1447 { 1446 {
1448 struct inode *inode = file->f_path.dentry->d_inode; 1447 struct inode *inode = file->f_path.dentry->d_inode;
1449 struct timespec now; 1448 struct timespec now;
1450 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; 1449 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
1451 1450
1452 /* First try to exhaust all avenues to not sync */ 1451 /* First try to exhaust all avenues to not sync */
1453 if (IS_NOCMTIME(inode)) 1452 if (IS_NOCMTIME(inode))
1454 return; 1453 return;
1455 1454
1456 now = current_fs_time(inode->i_sb); 1455 now = current_fs_time(inode->i_sb);
1457 if (!timespec_equal(&inode->i_mtime, &now)) 1456 if (!timespec_equal(&inode->i_mtime, &now))
1458 sync_it = S_MTIME; 1457 sync_it = S_MTIME;
1459 1458
1460 if (!timespec_equal(&inode->i_ctime, &now)) 1459 if (!timespec_equal(&inode->i_ctime, &now))
1461 sync_it |= S_CTIME; 1460 sync_it |= S_CTIME;
1462 1461
1463 if (IS_I_VERSION(inode)) 1462 if (IS_I_VERSION(inode))
1464 sync_it |= S_VERSION; 1463 sync_it |= S_VERSION;
1465 1464
1466 if (!sync_it) 1465 if (!sync_it)
1467 return; 1466 return;
1468 1467
1469 /* Finally allowed to write? Takes lock. */ 1468 /* Finally allowed to write? Takes lock. */
1470 if (mnt_want_write_file(file)) 1469 if (mnt_want_write_file(file))
1471 return; 1470 return;
1472 1471
1473 /* Only change inode inside the lock region */ 1472 /* Only change inode inside the lock region */
1474 if (sync_it & S_VERSION) 1473 if (sync_it & S_VERSION)
1475 inode_inc_iversion(inode); 1474 inode_inc_iversion(inode);
1476 if (sync_it & S_CTIME) 1475 if (sync_it & S_CTIME)
1477 inode->i_ctime = now; 1476 inode->i_ctime = now;
1478 if (sync_it & S_MTIME) 1477 if (sync_it & S_MTIME)
1479 inode->i_mtime = now; 1478 inode->i_mtime = now;
1480 mark_inode_dirty_sync(inode); 1479 mark_inode_dirty_sync(inode);
1481 mnt_drop_write(file->f_path.mnt); 1480 mnt_drop_write(file->f_path.mnt);
1482 } 1481 }
1483 EXPORT_SYMBOL(file_update_time); 1482 EXPORT_SYMBOL(file_update_time);
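/*
 * Sketch of a typical caller (illustrative only, assuming a write path
 * that is about to copy data into the page cache for struct file *filp):
 *
 *	file_update_time(filp);
 */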
1484 1483
1485 int inode_needs_sync(struct inode *inode) 1484 int inode_needs_sync(struct inode *inode)
1486 { 1485 {
1487 if (IS_SYNC(inode)) 1486 if (IS_SYNC(inode))
1488 return 1; 1487 return 1;
1489 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 1488 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
1490 return 1; 1489 return 1;
1491 return 0; 1490 return 0;
1492 } 1491 }
1493 EXPORT_SYMBOL(inode_needs_sync); 1492 EXPORT_SYMBOL(inode_needs_sync);
1494 1493
1495 int inode_wait(void *word) 1494 int inode_wait(void *word)
1496 { 1495 {
1497 schedule(); 1496 schedule();
1498 return 0; 1497 return 0;
1499 } 1498 }
1500 EXPORT_SYMBOL(inode_wait); 1499 EXPORT_SYMBOL(inode_wait);
1501 1500
1502 /* 1501 /*
1503 * If we try to find an inode in the inode hash while it is being 1502 * If we try to find an inode in the inode hash while it is being
1504 * deleted, we have to wait until the filesystem completes its 1503 * deleted, we have to wait until the filesystem completes its
1505 * deletion before reporting that it isn't found. This function waits 1504 * deletion before reporting that it isn't found. This function waits
1506 * until the deletion _might_ have completed. Callers are responsible 1505 * until the deletion _might_ have completed. Callers are responsible
1507 * for rechecking the inode state. 1506 * for rechecking the inode state.
1508 * 1507 *
1509 * It doesn't matter if I_NEW is not set initially, a call to 1508 * It doesn't matter if I_NEW is not set initially, a call to
1510 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list 1509 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1511 * will DTRT. 1510 * will DTRT.
1512 */ 1511 */
1513 static void __wait_on_freeing_inode(struct inode *inode) 1512 static void __wait_on_freeing_inode(struct inode *inode)
1514 { 1513 {
1515 wait_queue_head_t *wq; 1514 wait_queue_head_t *wq;
1516 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1515 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1517 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1516 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1518 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1517 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1519 spin_unlock(&inode->i_lock); 1518 spin_unlock(&inode->i_lock);
1520 spin_unlock(&inode_hash_lock); 1519 spin_unlock(&inode_hash_lock);
1521 schedule(); 1520 schedule();
1522 finish_wait(wq, &wait.wait); 1521 finish_wait(wq, &wait.wait);
1523 spin_lock(&inode_hash_lock); 1522 spin_lock(&inode_hash_lock);
1524 } 1523 }
1525 1524
1526 static __initdata unsigned long ihash_entries; 1525 static __initdata unsigned long ihash_entries;
1527 static int __init set_ihash_entries(char *str) 1526 static int __init set_ihash_entries(char *str)
1528 { 1527 {
1529 if (!str) 1528 if (!str)
1530 return 0; 1529 return 0;
1531 ihash_entries = simple_strtoul(str, &str, 0); 1530 ihash_entries = simple_strtoul(str, &str, 0);
1532 return 1; 1531 return 1;
1533 } 1532 }
1534 __setup("ihash_entries=", set_ihash_entries); 1533 __setup("ihash_entries=", set_ihash_entries);
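/*
 * Example (illustrative): the inode hash size can be forced from the
 * kernel command line, e.g.
 *
 *	ihash_entries=131072
 *
 * otherwise alloc_large_system_hash() below sizes the table from the
 * amount of memory present.
 */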
1535 1534
1536 /* 1535 /*
1537 * Initialize the waitqueues and inode hash table. 1536 * Initialize the waitqueues and inode hash table.
1538 */ 1537 */
1539 void __init inode_init_early(void) 1538 void __init inode_init_early(void)
1540 { 1539 {
1541 int loop; 1540 int loop;
1542 1541
1543 /* If hashes are distributed across NUMA nodes, defer 1542 /* If hashes are distributed across NUMA nodes, defer
1544 * hash allocation until vmalloc space is available. 1543 * hash allocation until vmalloc space is available.
1545 */ 1544 */
1546 if (hashdist) 1545 if (hashdist)
1547 return; 1546 return;
1548 1547
1549 inode_hashtable = 1548 inode_hashtable =
1550 alloc_large_system_hash("Inode-cache", 1549 alloc_large_system_hash("Inode-cache",
1551 sizeof(struct hlist_head), 1550 sizeof(struct hlist_head),
1552 ihash_entries, 1551 ihash_entries,
1553 14, 1552 14,
1554 HASH_EARLY, 1553 HASH_EARLY,
1555 &i_hash_shift, 1554 &i_hash_shift,
1556 &i_hash_mask, 1555 &i_hash_mask,
1557 0); 1556 0);
1558 1557
1559 for (loop = 0; loop < (1 << i_hash_shift); loop++) 1558 for (loop = 0; loop < (1 << i_hash_shift); loop++)
1560 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1559 INIT_HLIST_HEAD(&inode_hashtable[loop]);
1561 } 1560 }
1562 1561
1563 void __init inode_init(void) 1562 void __init inode_init(void)
1564 { 1563 {
1565 int loop; 1564 int loop;
1566 1565
1567 /* inode slab cache */ 1566 /* inode slab cache */
1568 inode_cachep = kmem_cache_create("inode_cache", 1567 inode_cachep = kmem_cache_create("inode_cache",
1569 sizeof(struct inode), 1568 sizeof(struct inode),
1570 0, 1569 0,
1571 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 1570 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
1572 SLAB_MEM_SPREAD), 1571 SLAB_MEM_SPREAD),
1573 init_once); 1572 init_once);
1574 1573
1575 /* Hash may have been set up in inode_init_early */ 1574 /* Hash may have been set up in inode_init_early */
1576 if (!hashdist) 1575 if (!hashdist)
1577 return; 1576 return;
1578 1577
1579 inode_hashtable = 1578 inode_hashtable =
1580 alloc_large_system_hash("Inode-cache", 1579 alloc_large_system_hash("Inode-cache",
1581 sizeof(struct hlist_head), 1580 sizeof(struct hlist_head),
1582 ihash_entries, 1581 ihash_entries,
1583 14, 1582 14,
1584 0, 1583 0,
1585 &i_hash_shift, 1584 &i_hash_shift,
1586 &i_hash_mask, 1585 &i_hash_mask,
1587 0); 1586 0);
1588 1587
1589 for (loop = 0; loop < (1 << i_hash_shift); loop++) 1588 for (loop = 0; loop < (1 << i_hash_shift); loop++)
1590 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1589 INIT_HLIST_HEAD(&inode_hashtable[loop]);
1591 } 1590 }
1592 1591
1593 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) 1592 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1594 { 1593 {
1595 inode->i_mode = mode; 1594 inode->i_mode = mode;
1596 if (S_ISCHR(mode)) { 1595 if (S_ISCHR(mode)) {
1597 inode->i_fop = &def_chr_fops; 1596 inode->i_fop = &def_chr_fops;
1598 inode->i_rdev = rdev; 1597 inode->i_rdev = rdev;
1599 } else if (S_ISBLK(mode)) { 1598 } else if (S_ISBLK(mode)) {
1600 inode->i_fop = &def_blk_fops; 1599 inode->i_fop = &def_blk_fops;
1601 inode->i_rdev = rdev; 1600 inode->i_rdev = rdev;
1602 } else if (S_ISFIFO(mode)) 1601 } else if (S_ISFIFO(mode))
1603 inode->i_fop = &def_fifo_fops; 1602 inode->i_fop = &def_fifo_fops;
1604 else if (S_ISSOCK(mode)) 1603 else if (S_ISSOCK(mode))
1605 inode->i_fop = &bad_sock_fops; 1604 inode->i_fop = &bad_sock_fops;
1606 else 1605 else
1607 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" 1606 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
1608 " inode %s:%lu\n", mode, inode->i_sb->s_id, 1607 " inode %s:%lu\n", mode, inode->i_sb->s_id,
1609 inode->i_ino); 1608 inode->i_ino);
1610 } 1609 }
1611 EXPORT_SYMBOL(init_special_inode); 1610 EXPORT_SYMBOL(init_special_inode);
1612 1611
1613 /** 1612 /**
1614 * inode_init_owner - Init uid, gid, mode for new inode according to POSIX standards 1613 * inode_init_owner - Init uid, gid, mode for new inode according to POSIX standards
1615 * @inode: New inode 1614 * @inode: New inode
1616 * @dir: Directory inode 1615 * @dir: Directory inode
1617 * @mode: mode of the new inode 1616 * @mode: mode of the new inode
1618 */ 1617 */
1619 void inode_init_owner(struct inode *inode, const struct inode *dir, 1618 void inode_init_owner(struct inode *inode, const struct inode *dir,
1620 mode_t mode) 1619 mode_t mode)
1621 { 1620 {
1622 inode->i_uid = current_fsuid(); 1621 inode->i_uid = current_fsuid();
1623 if (dir && dir->i_mode & S_ISGID) { 1622 if (dir && dir->i_mode & S_ISGID) {
1624 inode->i_gid = dir->i_gid; 1623 inode->i_gid = dir->i_gid;
1625 if (S_ISDIR(mode)) 1624 if (S_ISDIR(mode))
1626 mode |= S_ISGID; 1625 mode |= S_ISGID;
1627 } else 1626 } else
1628 inode->i_gid = current_fsgid(); 1627 inode->i_gid = current_fsgid();
1629 inode->i_mode = mode; 1628 inode->i_mode = mode;
1630 } 1629 }
1631 EXPORT_SYMBOL(inode_init_owner); 1630 EXPORT_SYMBOL(inode_init_owner);
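/*
 * Example (illustrative sketch): a filesystem creating a subdirectory
 * under a setgid parent would typically call
 *
 *	inode_init_owner(inode, dir, S_IFDIR | 0755);
 *
 * and, per the logic above, the new inode inherits dir->i_gid and gets
 * S_ISGID set because it is a directory.
 */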
1632 1631
1633 /** 1632 /**
1634 * inode_owner_or_capable - check current task permissions to inode 1633 * inode_owner_or_capable - check current task permissions to inode
1635 * @inode: inode being checked 1634 * @inode: inode being checked
1636 * 1635 *
1637 * Return true if current either has CAP_FOWNER in the user namespace 1636 * Return true if current either has CAP_FOWNER in the user namespace
1638 * of the inode, or owns the file. 1637 * of the inode, or owns the file.
1639 */ 1638 */
1640 bool inode_owner_or_capable(const struct inode *inode) 1639 bool inode_owner_or_capable(const struct inode *inode)
1641 { 1640 {
1642 struct user_namespace *ns = inode_userns(inode); 1641 struct user_namespace *ns = inode_userns(inode);
1643 1642
1644 if (current_user_ns() == ns && current_fsuid() == inode->i_uid) 1643 if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
1645 return true; 1644 return true;
1646 if (ns_capable(ns, CAP_FOWNER)) 1645 if (ns_capable(ns, CAP_FOWNER))
1647 return true; 1646 return true;
1648 return false; 1647 return false;
1649 } 1648 }
1650 EXPORT_SYMBOL(inode_owner_or_capable); 1649 EXPORT_SYMBOL(inode_owner_or_capable);
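/*
 * Sketch of a typical permission check (illustrative only, e.g. in a
 * handler that changes inode attributes or flags):
 *
 *	if (!inode_owner_or_capable(inode))
 *		return -EPERM;
 */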
1651 1650
1 /* 1 /*
2 * linux/fs/nfs/write.c 2 * linux/fs/nfs/write.c
3 * 3 *
4 * Write file data over NFS. 4 * Write file data over NFS.
5 * 5 *
6 * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de> 6 * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
7 */ 7 */
8 8
9 #include <linux/types.h> 9 #include <linux/types.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/mm.h> 11 #include <linux/mm.h>
12 #include <linux/pagemap.h> 12 #include <linux/pagemap.h>
13 #include <linux/file.h> 13 #include <linux/file.h>
14 #include <linux/writeback.h> 14 #include <linux/writeback.h>
15 #include <linux/swap.h> 15 #include <linux/swap.h>
16 #include <linux/migrate.h> 16 #include <linux/migrate.h>
17 17
18 #include <linux/sunrpc/clnt.h> 18 #include <linux/sunrpc/clnt.h>
19 #include <linux/nfs_fs.h> 19 #include <linux/nfs_fs.h>
20 #include <linux/nfs_mount.h> 20 #include <linux/nfs_mount.h>
21 #include <linux/nfs_page.h> 21 #include <linux/nfs_page.h>
22 #include <linux/backing-dev.h> 22 #include <linux/backing-dev.h>
23 23
24 #include <asm/uaccess.h> 24 #include <asm/uaccess.h>
25 25
26 #include "delegation.h" 26 #include "delegation.h"
27 #include "internal.h" 27 #include "internal.h"
28 #include "iostat.h" 28 #include "iostat.h"
29 #include "nfs4_fs.h" 29 #include "nfs4_fs.h"
30 #include "fscache.h" 30 #include "fscache.h"
31 #include "pnfs.h" 31 #include "pnfs.h"
32 32
33 #define NFSDBG_FACILITY NFSDBG_PAGECACHE 33 #define NFSDBG_FACILITY NFSDBG_PAGECACHE
34 34
35 #define MIN_POOL_WRITE (32) 35 #define MIN_POOL_WRITE (32)
36 #define MIN_POOL_COMMIT (4) 36 #define MIN_POOL_COMMIT (4)
37 37
38 /* 38 /*
39 * Local function declarations 39 * Local function declarations
40 */ 40 */
41 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, 41 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
42 struct inode *inode, int ioflags); 42 struct inode *inode, int ioflags);
43 static void nfs_redirty_request(struct nfs_page *req); 43 static void nfs_redirty_request(struct nfs_page *req);
44 static const struct rpc_call_ops nfs_write_partial_ops; 44 static const struct rpc_call_ops nfs_write_partial_ops;
45 static const struct rpc_call_ops nfs_write_full_ops; 45 static const struct rpc_call_ops nfs_write_full_ops;
46 static const struct rpc_call_ops nfs_commit_ops; 46 static const struct rpc_call_ops nfs_commit_ops;
47 47
48 static struct kmem_cache *nfs_wdata_cachep; 48 static struct kmem_cache *nfs_wdata_cachep;
49 static mempool_t *nfs_wdata_mempool; 49 static mempool_t *nfs_wdata_mempool;
50 static mempool_t *nfs_commit_mempool; 50 static mempool_t *nfs_commit_mempool;
51 51
52 struct nfs_write_data *nfs_commitdata_alloc(void) 52 struct nfs_write_data *nfs_commitdata_alloc(void)
53 { 53 {
54 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); 54 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
55 55
56 if (p) { 56 if (p) {
57 memset(p, 0, sizeof(*p)); 57 memset(p, 0, sizeof(*p));
58 INIT_LIST_HEAD(&p->pages); 58 INIT_LIST_HEAD(&p->pages);
59 } 59 }
60 return p; 60 return p;
61 } 61 }
62 EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); 62 EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
63 63
64 void nfs_commit_free(struct nfs_write_data *p) 64 void nfs_commit_free(struct nfs_write_data *p)
65 { 65 {
66 if (p && (p->pagevec != &p->page_array[0])) 66 if (p && (p->pagevec != &p->page_array[0]))
67 kfree(p->pagevec); 67 kfree(p->pagevec);
68 mempool_free(p, nfs_commit_mempool); 68 mempool_free(p, nfs_commit_mempool);
69 } 69 }
70 EXPORT_SYMBOL_GPL(nfs_commit_free); 70 EXPORT_SYMBOL_GPL(nfs_commit_free);
71 71
72 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) 72 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
73 { 73 {
74 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); 74 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
75 75
76 if (p) { 76 if (p) {
77 memset(p, 0, sizeof(*p)); 77 memset(p, 0, sizeof(*p));
78 INIT_LIST_HEAD(&p->pages); 78 INIT_LIST_HEAD(&p->pages);
79 p->npages = pagecount; 79 p->npages = pagecount;
80 if (pagecount <= ARRAY_SIZE(p->page_array)) 80 if (pagecount <= ARRAY_SIZE(p->page_array))
81 p->pagevec = p->page_array; 81 p->pagevec = p->page_array;
82 else { 82 else {
83 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); 83 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
84 if (!p->pagevec) { 84 if (!p->pagevec) {
85 mempool_free(p, nfs_wdata_mempool); 85 mempool_free(p, nfs_wdata_mempool);
86 p = NULL; 86 p = NULL;
87 } 87 }
88 } 88 }
89 } 89 }
90 return p; 90 return p;
91 } 91 }
92 92
93 void nfs_writedata_free(struct nfs_write_data *p) 93 void nfs_writedata_free(struct nfs_write_data *p)
94 { 94 {
95 if (p && (p->pagevec != &p->page_array[0])) 95 if (p && (p->pagevec != &p->page_array[0]))
96 kfree(p->pagevec); 96 kfree(p->pagevec);
97 mempool_free(p, nfs_wdata_mempool); 97 mempool_free(p, nfs_wdata_mempool);
98 } 98 }
99 99
100 static void nfs_writedata_release(struct nfs_write_data *wdata) 100 static void nfs_writedata_release(struct nfs_write_data *wdata)
101 { 101 {
102 put_lseg(wdata->lseg); 102 put_lseg(wdata->lseg);
103 put_nfs_open_context(wdata->args.context); 103 put_nfs_open_context(wdata->args.context);
104 nfs_writedata_free(wdata); 104 nfs_writedata_free(wdata);
105 } 105 }
106 106
107 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) 107 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
108 { 108 {
109 ctx->error = error; 109 ctx->error = error;
110 smp_wmb(); 110 smp_wmb();
111 set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 111 set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
112 } 112 }
113 113
114 static struct nfs_page *nfs_page_find_request_locked(struct page *page) 114 static struct nfs_page *nfs_page_find_request_locked(struct page *page)
115 { 115 {
116 struct nfs_page *req = NULL; 116 struct nfs_page *req = NULL;
117 117
118 if (PagePrivate(page)) { 118 if (PagePrivate(page)) {
119 req = (struct nfs_page *)page_private(page); 119 req = (struct nfs_page *)page_private(page);
120 if (req != NULL) 120 if (req != NULL)
121 kref_get(&req->wb_kref); 121 kref_get(&req->wb_kref);
122 } 122 }
123 return req; 123 return req;
124 } 124 }
125 125
126 static struct nfs_page *nfs_page_find_request(struct page *page) 126 static struct nfs_page *nfs_page_find_request(struct page *page)
127 { 127 {
128 struct inode *inode = page->mapping->host; 128 struct inode *inode = page->mapping->host;
129 struct nfs_page *req = NULL; 129 struct nfs_page *req = NULL;
130 130
131 spin_lock(&inode->i_lock); 131 spin_lock(&inode->i_lock);
132 req = nfs_page_find_request_locked(page); 132 req = nfs_page_find_request_locked(page);
133 spin_unlock(&inode->i_lock); 133 spin_unlock(&inode->i_lock);
134 return req; 134 return req;
135 } 135 }
136 136
137 /* Adjust the file length if we're writing beyond the end */ 137 /* Adjust the file length if we're writing beyond the end */
138 static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) 138 static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
139 { 139 {
140 struct inode *inode = page->mapping->host; 140 struct inode *inode = page->mapping->host;
141 loff_t end, i_size; 141 loff_t end, i_size;
142 pgoff_t end_index; 142 pgoff_t end_index;
143 143
144 spin_lock(&inode->i_lock); 144 spin_lock(&inode->i_lock);
145 i_size = i_size_read(inode); 145 i_size = i_size_read(inode);
146 end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; 146 end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
147 if (i_size > 0 && page->index < end_index) 147 if (i_size > 0 && page->index < end_index)
148 goto out; 148 goto out;
149 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); 149 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
150 if (i_size >= end) 150 if (i_size >= end)
151 goto out; 151 goto out;
152 i_size_write(inode, end); 152 i_size_write(inode, end);
153 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); 153 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
154 out: 154 out:
155 spin_unlock(&inode->i_lock); 155 spin_unlock(&inode->i_lock);
156 } 156 }
157 157
158 /* A writeback failed: mark the page as bad, and invalidate the page cache */ 158 /* A writeback failed: mark the page as bad, and invalidate the page cache */
159 static void nfs_set_pageerror(struct page *page) 159 static void nfs_set_pageerror(struct page *page)
160 { 160 {
161 SetPageError(page); 161 SetPageError(page);
162 nfs_zap_mapping(page->mapping->host, page->mapping); 162 nfs_zap_mapping(page->mapping->host, page->mapping);
163 } 163 }
164 164
165 /* We can set the PG_uptodate flag if we see that a write request 165 /* We can set the PG_uptodate flag if we see that a write request
166 * covers the full page. 166 * covers the full page.
167 */ 167 */
168 static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) 168 static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count)
169 { 169 {
170 if (PageUptodate(page)) 170 if (PageUptodate(page))
171 return; 171 return;
172 if (base != 0) 172 if (base != 0)
173 return; 173 return;
174 if (count != nfs_page_length(page)) 174 if (count != nfs_page_length(page))
175 return; 175 return;
176 SetPageUptodate(page); 176 SetPageUptodate(page);
177 } 177 }
178 178
179 static int wb_priority(struct writeback_control *wbc) 179 static int wb_priority(struct writeback_control *wbc)
180 { 180 {
181 if (wbc->for_reclaim) 181 if (wbc->for_reclaim)
182 return FLUSH_HIGHPRI | FLUSH_STABLE; 182 return FLUSH_HIGHPRI | FLUSH_STABLE;
183 if (wbc->for_kupdate || wbc->for_background) 183 if (wbc->for_kupdate || wbc->for_background)
184 return FLUSH_LOWPRI | FLUSH_COND_STABLE; 184 return FLUSH_LOWPRI | FLUSH_COND_STABLE;
185 return FLUSH_COND_STABLE; 185 return FLUSH_COND_STABLE;
186 } 186 }
187 187
188 /* 188 /*
189 * NFS congestion control 189 * NFS congestion control
190 */ 190 */
191 191
192 int nfs_congestion_kb; 192 int nfs_congestion_kb;
193 193
194 #define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) 194 #define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10))
195 #define NFS_CONGESTION_OFF_THRESH \ 195 #define NFS_CONGESTION_OFF_THRESH \
196 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) 196 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
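/*
 * Worked example (assuming 4K pages, i.e. PAGE_SHIFT == 12, and
 * nfs_congestion_kb == 16384, i.e. 16MB):
 *
 *	NFS_CONGESTION_ON_THRESH  = 16384 >> (12 - 10)  = 4096 pages
 *	NFS_CONGESTION_OFF_THRESH = 4096 - (4096 >> 2)  = 3072 pages
 *
 * so the bdi is flagged congested once roughly 16MB of writeback is in
 * flight to the server, and the flag is cleared again below ~12MB.
 */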
197 197
198 static int nfs_set_page_writeback(struct page *page) 198 static int nfs_set_page_writeback(struct page *page)
199 { 199 {
200 int ret = test_set_page_writeback(page); 200 int ret = test_set_page_writeback(page);
201 201
202 if (!ret) { 202 if (!ret) {
203 struct inode *inode = page->mapping->host; 203 struct inode *inode = page->mapping->host;
204 struct nfs_server *nfss = NFS_SERVER(inode); 204 struct nfs_server *nfss = NFS_SERVER(inode);
205 205
206 page_cache_get(page); 206 page_cache_get(page);
207 if (atomic_long_inc_return(&nfss->writeback) > 207 if (atomic_long_inc_return(&nfss->writeback) >
208 NFS_CONGESTION_ON_THRESH) { 208 NFS_CONGESTION_ON_THRESH) {
209 set_bdi_congested(&nfss->backing_dev_info, 209 set_bdi_congested(&nfss->backing_dev_info,
210 BLK_RW_ASYNC); 210 BLK_RW_ASYNC);
211 } 211 }
212 } 212 }
213 return ret; 213 return ret;
214 } 214 }
215 215
216 static void nfs_end_page_writeback(struct page *page) 216 static void nfs_end_page_writeback(struct page *page)
217 { 217 {
218 struct inode *inode = page->mapping->host; 218 struct inode *inode = page->mapping->host;
219 struct nfs_server *nfss = NFS_SERVER(inode); 219 struct nfs_server *nfss = NFS_SERVER(inode);
220 220
221 end_page_writeback(page); 221 end_page_writeback(page);
222 page_cache_release(page); 222 page_cache_release(page);
223 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 223 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
224 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 224 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
225 } 225 }
226 226
227 static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) 227 static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
228 { 228 {
229 struct inode *inode = page->mapping->host; 229 struct inode *inode = page->mapping->host;
230 struct nfs_page *req; 230 struct nfs_page *req;
231 int ret; 231 int ret;
232 232
233 spin_lock(&inode->i_lock); 233 spin_lock(&inode->i_lock);
234 for (;;) { 234 for (;;) {
235 req = nfs_page_find_request_locked(page); 235 req = nfs_page_find_request_locked(page);
236 if (req == NULL) 236 if (req == NULL)
237 break; 237 break;
238 if (nfs_set_page_tag_locked(req)) 238 if (nfs_set_page_tag_locked(req))
239 break; 239 break;
240 /* Note: If we hold the page lock, as is the case in nfs_writepage, 240 /* Note: If we hold the page lock, as is the case in nfs_writepage,
241 * then the call to nfs_set_page_tag_locked() will always 241 * then the call to nfs_set_page_tag_locked() will always
242 * succeed provided that someone hasn't already marked the 242 * succeed provided that someone hasn't already marked the
243 * request as dirty (in which case we don't care). 243 * request as dirty (in which case we don't care).
244 */ 244 */
245 spin_unlock(&inode->i_lock); 245 spin_unlock(&inode->i_lock);
246 if (!nonblock) 246 if (!nonblock)
247 ret = nfs_wait_on_request(req); 247 ret = nfs_wait_on_request(req);
248 else 248 else
249 ret = -EAGAIN; 249 ret = -EAGAIN;
250 nfs_release_request(req); 250 nfs_release_request(req);
251 if (ret != 0) 251 if (ret != 0)
252 return ERR_PTR(ret); 252 return ERR_PTR(ret);
253 spin_lock(&inode->i_lock); 253 spin_lock(&inode->i_lock);
254 } 254 }
255 spin_unlock(&inode->i_lock); 255 spin_unlock(&inode->i_lock);
256 return req; 256 return req;
257 } 257 }
258 258
259 /* 259 /*
260 * Find an associated nfs write request, and prepare to flush it out 260 * Find an associated nfs write request, and prepare to flush it out
261 * May return an error if the user signalled nfs_wait_on_request(). 261 * May return an error if the user signalled nfs_wait_on_request().
262 */ 262 */
263 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, 263 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
264 struct page *page, bool nonblock) 264 struct page *page, bool nonblock)
265 { 265 {
266 struct nfs_page *req; 266 struct nfs_page *req;
267 int ret = 0; 267 int ret = 0;
268 268
269 req = nfs_find_and_lock_request(page, nonblock); 269 req = nfs_find_and_lock_request(page, nonblock);
270 if (!req) 270 if (!req)
271 goto out; 271 goto out;
272 ret = PTR_ERR(req); 272 ret = PTR_ERR(req);
273 if (IS_ERR(req)) 273 if (IS_ERR(req))
274 goto out; 274 goto out;
275 275
276 ret = nfs_set_page_writeback(page); 276 ret = nfs_set_page_writeback(page);
277 BUG_ON(ret != 0); 277 BUG_ON(ret != 0);
278 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); 278 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
279 279
280 if (!nfs_pageio_add_request(pgio, req)) { 280 if (!nfs_pageio_add_request(pgio, req)) {
281 nfs_redirty_request(req); 281 nfs_redirty_request(req);
282 ret = pgio->pg_error; 282 ret = pgio->pg_error;
283 } 283 }
284 out: 284 out:
285 return ret; 285 return ret;
286 } 286 }
287 287
288 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 288 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
289 { 289 {
290 struct inode *inode = page->mapping->host; 290 struct inode *inode = page->mapping->host;
291 int ret; 291 int ret;
292 292
293 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 293 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
294 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 294 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
295 295
296 nfs_pageio_cond_complete(pgio, page->index); 296 nfs_pageio_cond_complete(pgio, page->index);
297 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); 297 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
298 if (ret == -EAGAIN) { 298 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page); 299 redirty_page_for_writepage(wbc, page);
300 ret = 0; 300 ret = 0;
301 } 301 }
302 return ret; 302 return ret;
303 } 303 }
304 304
305 /* 305 /*
306 * Write an mmapped page to the server. 306 * Write an mmapped page to the server.
307 */ 307 */
308 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) 308 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
309 { 309 {
310 struct nfs_pageio_descriptor pgio; 310 struct nfs_pageio_descriptor pgio;
311 int err; 311 int err;
312 312
313 nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); 313 nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
314 err = nfs_do_writepage(page, wbc, &pgio); 314 err = nfs_do_writepage(page, wbc, &pgio);
315 nfs_pageio_complete(&pgio); 315 nfs_pageio_complete(&pgio);
316 if (err < 0) 316 if (err < 0)
317 return err; 317 return err;
318 if (pgio.pg_error < 0) 318 if (pgio.pg_error < 0)
319 return pgio.pg_error; 319 return pgio.pg_error;
320 return 0; 320 return 0;
321 } 321 }
322 322
323 int nfs_writepage(struct page *page, struct writeback_control *wbc) 323 int nfs_writepage(struct page *page, struct writeback_control *wbc)
324 { 324 {
325 int ret; 325 int ret;
326 326
327 ret = nfs_writepage_locked(page, wbc); 327 ret = nfs_writepage_locked(page, wbc);
328 unlock_page(page); 328 unlock_page(page);
329 return ret; 329 return ret;
330 } 330 }
331 331
332 static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) 332 static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
333 { 333 {
334 int ret; 334 int ret;
335 335
336 ret = nfs_do_writepage(page, wbc, data); 336 ret = nfs_do_writepage(page, wbc, data);
337 unlock_page(page); 337 unlock_page(page);
338 return ret; 338 return ret;
339 } 339 }
340 340
341 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 341 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
342 { 342 {
343 struct inode *inode = mapping->host; 343 struct inode *inode = mapping->host;
344 unsigned long *bitlock = &NFS_I(inode)->flags; 344 unsigned long *bitlock = &NFS_I(inode)->flags;
345 struct nfs_pageio_descriptor pgio; 345 struct nfs_pageio_descriptor pgio;
346 int err; 346 int err;
347 347
348 /* Stop dirtying of new pages while we sync */ 348 /* Stop dirtying of new pages while we sync */
349 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, 349 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
350 nfs_wait_bit_killable, TASK_KILLABLE); 350 nfs_wait_bit_killable, TASK_KILLABLE);
351 if (err) 351 if (err)
352 goto out_err; 352 goto out_err;
353 353
354 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 354 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
355 355
356 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 356 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
357 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 357 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
358 nfs_pageio_complete(&pgio); 358 nfs_pageio_complete(&pgio);
359 359
360 clear_bit_unlock(NFS_INO_FLUSHING, bitlock); 360 clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
361 smp_mb__after_clear_bit(); 361 smp_mb__after_clear_bit();
362 wake_up_bit(bitlock, NFS_INO_FLUSHING); 362 wake_up_bit(bitlock, NFS_INO_FLUSHING);
363 363
364 if (err < 0) 364 if (err < 0)
365 goto out_err; 365 goto out_err;
366 err = pgio.pg_error; 366 err = pgio.pg_error;
367 if (err < 0) 367 if (err < 0)
368 goto out_err; 368 goto out_err;
369 return 0; 369 return 0;
370 out_err: 370 out_err:
371 return err; 371 return err;
372 } 372 }
373 373
374 /* 374 /*
375 * Insert a write request into an inode 375 * Insert a write request into an inode
376 */ 376 */
377 static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) 377 static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
378 { 378 {
379 struct nfs_inode *nfsi = NFS_I(inode); 379 struct nfs_inode *nfsi = NFS_I(inode);
380 int error; 380 int error;
381 381
382 error = radix_tree_preload(GFP_NOFS); 382 error = radix_tree_preload(GFP_NOFS);
383 if (error != 0) 383 if (error != 0)
384 goto out; 384 goto out;
385 385
386 /* Lock the request! */ 386 /* Lock the request! */
387 nfs_lock_request_dontget(req); 387 nfs_lock_request_dontget(req);
388 388
389 spin_lock(&inode->i_lock); 389 spin_lock(&inode->i_lock);
390 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); 390 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
391 BUG_ON(error); 391 BUG_ON(error);
392 if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) 392 if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
393 nfsi->change_attr++; 393 nfsi->change_attr++;
394 set_bit(PG_MAPPED, &req->wb_flags); 394 set_bit(PG_MAPPED, &req->wb_flags);
395 SetPagePrivate(req->wb_page); 395 SetPagePrivate(req->wb_page);
396 set_page_private(req->wb_page, (unsigned long)req); 396 set_page_private(req->wb_page, (unsigned long)req);
397 nfsi->npages++; 397 nfsi->npages++;
398 kref_get(&req->wb_kref); 398 kref_get(&req->wb_kref);
399 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, 399 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
400 NFS_PAGE_TAG_LOCKED); 400 NFS_PAGE_TAG_LOCKED);
401 spin_unlock(&inode->i_lock); 401 spin_unlock(&inode->i_lock);
402 radix_tree_preload_end(); 402 radix_tree_preload_end();
403 out: 403 out:
404 return error; 404 return error;
405 } 405 }
406 406
407 /* 407 /*
408 * Remove a write request from an inode 408 * Remove a write request from an inode
409 */ 409 */
410 static void nfs_inode_remove_request(struct nfs_page *req) 410 static void nfs_inode_remove_request(struct nfs_page *req)
411 { 411 {
412 struct inode *inode = req->wb_context->dentry->d_inode; 412 struct inode *inode = req->wb_context->dentry->d_inode;
413 struct nfs_inode *nfsi = NFS_I(inode); 413 struct nfs_inode *nfsi = NFS_I(inode);
414 414
415 BUG_ON (!NFS_WBACK_BUSY(req)); 415 BUG_ON (!NFS_WBACK_BUSY(req));
416 416
417 spin_lock(&inode->i_lock); 417 spin_lock(&inode->i_lock);
418 set_page_private(req->wb_page, 0); 418 set_page_private(req->wb_page, 0);
419 ClearPagePrivate(req->wb_page); 419 ClearPagePrivate(req->wb_page);
420 clear_bit(PG_MAPPED, &req->wb_flags); 420 clear_bit(PG_MAPPED, &req->wb_flags);
421 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 421 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
422 nfsi->npages--; 422 nfsi->npages--;
423 spin_unlock(&inode->i_lock); 423 spin_unlock(&inode->i_lock);
424 nfs_release_request(req); 424 nfs_release_request(req);
425 } 425 }
426 426
427 static void 427 static void
428 nfs_mark_request_dirty(struct nfs_page *req) 428 nfs_mark_request_dirty(struct nfs_page *req)
429 { 429 {
430 __set_page_dirty_nobuffers(req->wb_page); 430 __set_page_dirty_nobuffers(req->wb_page);
431 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); 431 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
432 } 432 }
433 433
434 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 434 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
435 /* 435 /*
436 * Add a request to the inode's commit list. 436 * Add a request to the inode's commit list.
437 */ 437 */
438 static void 438 static void
439 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) 439 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
440 { 440 {
441 struct inode *inode = req->wb_context->dentry->d_inode; 441 struct inode *inode = req->wb_context->dentry->d_inode;
442 struct nfs_inode *nfsi = NFS_I(inode); 442 struct nfs_inode *nfsi = NFS_I(inode);
443 443
444 spin_lock(&inode->i_lock); 444 spin_lock(&inode->i_lock);
445 set_bit(PG_CLEAN, &(req)->wb_flags); 445 set_bit(PG_CLEAN, &(req)->wb_flags);
446 radix_tree_tag_set(&nfsi->nfs_page_tree, 446 radix_tree_tag_set(&nfsi->nfs_page_tree,
447 req->wb_index, 447 req->wb_index,
448 NFS_PAGE_TAG_COMMIT); 448 NFS_PAGE_TAG_COMMIT);
449 nfsi->ncommit++; 449 nfsi->ncommit++;
450 spin_unlock(&inode->i_lock); 450 spin_unlock(&inode->i_lock);
451 pnfs_mark_request_commit(req, lseg); 451 pnfs_mark_request_commit(req, lseg);
452 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 452 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
453 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 453 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
454 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 454 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
455 } 455 }
456 456
457 static int 457 static int
458 nfs_clear_request_commit(struct nfs_page *req) 458 nfs_clear_request_commit(struct nfs_page *req)
459 { 459 {
460 struct page *page = req->wb_page; 460 struct page *page = req->wb_page;
461 461
462 if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { 462 if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
463 dec_zone_page_state(page, NR_UNSTABLE_NFS); 463 dec_zone_page_state(page, NR_UNSTABLE_NFS);
464 dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); 464 dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
465 return 1; 465 return 1;
466 } 466 }
467 return 0; 467 return 0;
468 } 468 }
469 469
470 static inline 470 static inline
471 int nfs_write_need_commit(struct nfs_write_data *data) 471 int nfs_write_need_commit(struct nfs_write_data *data)
472 { 472 {
473 if (data->verf.committed == NFS_DATA_SYNC) 473 if (data->verf.committed == NFS_DATA_SYNC)
474 return data->lseg == NULL; 474 return data->lseg == NULL;
475 else 475 else
476 return data->verf.committed != NFS_FILE_SYNC; 476 return data->verf.committed != NFS_FILE_SYNC;
477 } 477 }
478 478
479 static inline 479 static inline
480 int nfs_reschedule_unstable_write(struct nfs_page *req, 480 int nfs_reschedule_unstable_write(struct nfs_page *req,
481 struct nfs_write_data *data) 481 struct nfs_write_data *data)
482 { 482 {
483 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { 483 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
484 nfs_mark_request_commit(req, data->lseg); 484 nfs_mark_request_commit(req, data->lseg);
485 return 1; 485 return 1;
486 } 486 }
487 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { 487 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
488 nfs_mark_request_dirty(req); 488 nfs_mark_request_dirty(req);
489 return 1; 489 return 1;
490 } 490 }
491 return 0; 491 return 0;
492 } 492 }
493 #else 493 #else
494 static inline void 494 static inline void
495 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) 495 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
496 { 496 {
497 } 497 }
498 498
499 static inline int 499 static inline int
500 nfs_clear_request_commit(struct nfs_page *req) 500 nfs_clear_request_commit(struct nfs_page *req)
501 { 501 {
502 return 0; 502 return 0;
503 } 503 }
504 504
505 static inline 505 static inline
506 int nfs_write_need_commit(struct nfs_write_data *data) 506 int nfs_write_need_commit(struct nfs_write_data *data)
507 { 507 {
508 return 0; 508 return 0;
509 } 509 }
510 510
511 static inline 511 static inline
512 int nfs_reschedule_unstable_write(struct nfs_page *req, 512 int nfs_reschedule_unstable_write(struct nfs_page *req,
513 struct nfs_write_data *data) 513 struct nfs_write_data *data)
514 { 514 {
515 return 0; 515 return 0;
516 } 516 }
517 #endif 517 #endif
518 518
519 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 519 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
520 static int 520 static int
521 nfs_need_commit(struct nfs_inode *nfsi) 521 nfs_need_commit(struct nfs_inode *nfsi)
522 { 522 {
523 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); 523 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
524 } 524 }
525 525
526 /* 526 /*
527 * nfs_scan_commit - Scan an inode for commit requests 527 * nfs_scan_commit - Scan an inode for commit requests
528 * @inode: NFS inode to scan 528 * @inode: NFS inode to scan
529 * @dst: destination list 529 * @dst: destination list
530 * @idx_start: lower bound of page->index to scan. 530 * @idx_start: lower bound of page->index to scan.
531 * @npages: idx_start + npages sets the upper bound to scan. 531 * @npages: idx_start + npages sets the upper bound to scan.
532 * 532 *
533 * Moves requests from the inode's 'commit' request list. 533 * Moves requests from the inode's 'commit' request list.
534 * The requests are *not* checked to ensure that they form a contiguous set. 534 * The requests are *not* checked to ensure that they form a contiguous set.
535 */ 535 */
536 static int 536 static int
537 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 537 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
538 { 538 {
539 struct nfs_inode *nfsi = NFS_I(inode); 539 struct nfs_inode *nfsi = NFS_I(inode);
540 int ret; 540 int ret;
541 541
542 if (!nfs_need_commit(nfsi)) 542 if (!nfs_need_commit(nfsi))
543 return 0; 543 return 0;
544 544
545 spin_lock(&inode->i_lock); 545 spin_lock(&inode->i_lock);
546 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 546 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
547 if (ret > 0) 547 if (ret > 0)
548 nfsi->ncommit -= ret; 548 nfsi->ncommit -= ret;
549 spin_unlock(&inode->i_lock); 549 spin_unlock(&inode->i_lock);
550 550
551 if (nfs_need_commit(NFS_I(inode))) 551 if (nfs_need_commit(NFS_I(inode)))
552 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 552 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
553 553
554 return ret; 554 return ret;
555 } 555 }
556 #else 556 #else
557 static inline int nfs_need_commit(struct nfs_inode *nfsi) 557 static inline int nfs_need_commit(struct nfs_inode *nfsi)
558 { 558 {
559 return 0; 559 return 0;
560 } 560 }
561 561
562 static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 562 static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
563 { 563 {
564 return 0; 564 return 0;
565 } 565 }
566 #endif 566 #endif
567 567
568 /* 568 /*
569 * Search for an existing write request, and attempt to update 569 * Search for an existing write request, and attempt to update
570 * it to reflect a new dirty region on a given page. 570 * it to reflect a new dirty region on a given page.
571 * 571 *
572 * If the attempt fails, then the existing request is flushed out 572 * If the attempt fails, then the existing request is flushed out
573 * to disk. 573 * to disk.
574 */ 574 */
575 static struct nfs_page *nfs_try_to_update_request(struct inode *inode, 575 static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
576 struct page *page, 576 struct page *page,
577 unsigned int offset, 577 unsigned int offset,
578 unsigned int bytes) 578 unsigned int bytes)
579 { 579 {
580 struct nfs_page *req; 580 struct nfs_page *req;
581 unsigned int rqend; 581 unsigned int rqend;
582 unsigned int end; 582 unsigned int end;
583 int error; 583 int error;
584 584
585 if (!PagePrivate(page)) 585 if (!PagePrivate(page))
586 return NULL; 586 return NULL;
587 587
588 end = offset + bytes; 588 end = offset + bytes;
589 spin_lock(&inode->i_lock); 589 spin_lock(&inode->i_lock);
590 590
591 for (;;) { 591 for (;;) {
592 req = nfs_page_find_request_locked(page); 592 req = nfs_page_find_request_locked(page);
593 if (req == NULL) 593 if (req == NULL)
594 goto out_unlock; 594 goto out_unlock;
595 595
596 rqend = req->wb_offset + req->wb_bytes; 596 rqend = req->wb_offset + req->wb_bytes;
597 /* 597 /*
598 * Tell the caller to flush out the request if 598 * Tell the caller to flush out the request if
599 * the offsets are non-contiguous. 599 * the offsets are non-contiguous.
600 * Note: nfs_flush_incompatible() will already 600 * Note: nfs_flush_incompatible() will already
601 * have flushed out requests having wrong owners. 601 * have flushed out requests having wrong owners.
602 */ 602 */
603 if (offset > rqend 603 if (offset > rqend
604 || end < req->wb_offset) 604 || end < req->wb_offset)
605 goto out_flushme; 605 goto out_flushme;
606 606
607 if (nfs_set_page_tag_locked(req)) 607 if (nfs_set_page_tag_locked(req))
608 break; 608 break;
609 609
610 /* The request is locked, so wait and then retry */ 610 /* The request is locked, so wait and then retry */
611 spin_unlock(&inode->i_lock); 611 spin_unlock(&inode->i_lock);
612 error = nfs_wait_on_request(req); 612 error = nfs_wait_on_request(req);
613 nfs_release_request(req); 613 nfs_release_request(req);
614 if (error != 0) 614 if (error != 0)
615 goto out_err; 615 goto out_err;
616 spin_lock(&inode->i_lock); 616 spin_lock(&inode->i_lock);
617 } 617 }
618 618
619 if (nfs_clear_request_commit(req) && 619 if (nfs_clear_request_commit(req) &&
620 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 620 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
621 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { 621 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) {
622 NFS_I(inode)->ncommit--; 622 NFS_I(inode)->ncommit--;
623 pnfs_clear_request_commit(req); 623 pnfs_clear_request_commit(req);
624 } 624 }
625 625
626 /* Okay, the request matches. Update the region */ 626 /* Okay, the request matches. Update the region */
627 if (offset < req->wb_offset) { 627 if (offset < req->wb_offset) {
628 req->wb_offset = offset; 628 req->wb_offset = offset;
629 req->wb_pgbase = offset; 629 req->wb_pgbase = offset;
630 } 630 }
631 if (end > rqend) 631 if (end > rqend)
632 req->wb_bytes = end - req->wb_offset; 632 req->wb_bytes = end - req->wb_offset;
633 else 633 else
634 req->wb_bytes = rqend - req->wb_offset; 634 req->wb_bytes = rqend - req->wb_offset;
635 out_unlock: 635 out_unlock:
636 spin_unlock(&inode->i_lock); 636 spin_unlock(&inode->i_lock);
637 return req; 637 return req;
638 out_flushme: 638 out_flushme:
639 spin_unlock(&inode->i_lock); 639 spin_unlock(&inode->i_lock);
640 nfs_release_request(req); 640 nfs_release_request(req);
641 error = nfs_wb_page(inode, page); 641 error = nfs_wb_page(inode, page);
642 out_err: 642 out_err:
643 return ERR_PTR(error); 643 return ERR_PTR(error);
644 } 644 }
645 645
646 /* 646 /*
647 * Try to update an existing write request, or create one if there is none. 647 * Try to update an existing write request, or create one if there is none.
648 * 648 *
649 * Note: Should always be called with the Page Lock held to prevent races 649 * Note: Should always be called with the Page Lock held to prevent races
650 * if we have to add a new request. Also assumes that the caller has 650 * if we have to add a new request. Also assumes that the caller has
651 * already called nfs_flush_incompatible() if necessary. 651 * already called nfs_flush_incompatible() if necessary.
652 */ 652 */
653 static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, 653 static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
654 struct page *page, unsigned int offset, unsigned int bytes) 654 struct page *page, unsigned int offset, unsigned int bytes)
655 { 655 {
656 struct inode *inode = page->mapping->host; 656 struct inode *inode = page->mapping->host;
657 struct nfs_page *req; 657 struct nfs_page *req;
658 int error; 658 int error;
659 659
660 req = nfs_try_to_update_request(inode, page, offset, bytes); 660 req = nfs_try_to_update_request(inode, page, offset, bytes);
661 if (req != NULL) 661 if (req != NULL)
662 goto out; 662 goto out;
663 req = nfs_create_request(ctx, inode, page, offset, bytes); 663 req = nfs_create_request(ctx, inode, page, offset, bytes);
664 if (IS_ERR(req)) 664 if (IS_ERR(req))
665 goto out; 665 goto out;
666 error = nfs_inode_add_request(inode, req); 666 error = nfs_inode_add_request(inode, req);
667 if (error != 0) { 667 if (error != 0) {
668 nfs_release_request(req); 668 nfs_release_request(req);
669 req = ERR_PTR(error); 669 req = ERR_PTR(error);
670 } 670 }
671 out: 671 out:
672 return req; 672 return req;
673 } 673 }
674 674
675 static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, 675 static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
676 unsigned int offset, unsigned int count) 676 unsigned int offset, unsigned int count)
677 { 677 {
678 struct nfs_page *req; 678 struct nfs_page *req;
679 679
680 req = nfs_setup_write_request(ctx, page, offset, count); 680 req = nfs_setup_write_request(ctx, page, offset, count);
681 if (IS_ERR(req)) 681 if (IS_ERR(req))
682 return PTR_ERR(req); 682 return PTR_ERR(req);
683 /* Update file length */ 683 /* Update file length */
684 nfs_grow_file(page, offset, count); 684 nfs_grow_file(page, offset, count);
685 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); 685 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
686 nfs_mark_request_dirty(req); 686 nfs_mark_request_dirty(req);
687 nfs_clear_page_tag_locked(req); 687 nfs_clear_page_tag_locked(req);
688 return 0; 688 return 0;
689 } 689 }
690 690
691 int nfs_flush_incompatible(struct file *file, struct page *page) 691 int nfs_flush_incompatible(struct file *file, struct page *page)
692 { 692 {
693 struct nfs_open_context *ctx = nfs_file_open_context(file); 693 struct nfs_open_context *ctx = nfs_file_open_context(file);
694 struct nfs_page *req; 694 struct nfs_page *req;
695 int do_flush, status; 695 int do_flush, status;
696 /* 696 /*
697 * Look for a request corresponding to this page. If there 697 * Look for a request corresponding to this page. If there
698 * is one, and it belongs to another file, we flush it out 698 * is one, and it belongs to another file, we flush it out
699 * before we try to copy anything into the page. Do this 699 * before we try to copy anything into the page. Do this
700 * due to the lack of an ACCESS-type call in NFSv2. 700 * due to the lack of an ACCESS-type call in NFSv2.
701 * Also do the same if we find a request from an existing 701 * Also do the same if we find a request from an existing
702 * dropped page. 702 * dropped page.
703 */ 703 */
704 do { 704 do {
705 req = nfs_page_find_request(page); 705 req = nfs_page_find_request(page);
706 if (req == NULL) 706 if (req == NULL)
707 return 0; 707 return 0;
708 do_flush = req->wb_page != page || req->wb_context != ctx || 708 do_flush = req->wb_page != page || req->wb_context != ctx ||
709 req->wb_lock_context->lockowner != current->files || 709 req->wb_lock_context->lockowner != current->files ||
710 req->wb_lock_context->pid != current->tgid; 710 req->wb_lock_context->pid != current->tgid;
711 nfs_release_request(req); 711 nfs_release_request(req);
712 if (!do_flush) 712 if (!do_flush)
713 return 0; 713 return 0;
714 status = nfs_wb_page(page->mapping->host, page); 714 status = nfs_wb_page(page->mapping->host, page);
715 } while (status == 0); 715 } while (status == 0);
716 return status; 716 return status;
717 } 717 }
718 718
719 /* 719 /*
720 * If the page cache is marked as unsafe or invalid, then we can't rely on 720 * If the page cache is marked as unsafe or invalid, then we can't rely on
721 * the PageUptodate() flag. In this case, we will need to turn off 721 * the PageUptodate() flag. In this case, we will need to turn off
722 * write optimisations that depend on the page contents being correct. 722 * write optimisations that depend on the page contents being correct.
723 */ 723 */
724 static int nfs_write_pageuptodate(struct page *page, struct inode *inode) 724 static int nfs_write_pageuptodate(struct page *page, struct inode *inode)
725 { 725 {
726 return PageUptodate(page) && 726 return PageUptodate(page) &&
727 !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); 727 !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA));
728 } 728 }
729 729
730 /* 730 /*
731 * Update and possibly write a cached page of an NFS file. 731 * Update and possibly write a cached page of an NFS file.
732 * 732 *
733 * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad 733 * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
734 * things with a page scheduled for an RPC call (e.g. invalidate it). 734 * things with a page scheduled for an RPC call (e.g. invalidate it).
735 */ 735 */
736 int nfs_updatepage(struct file *file, struct page *page, 736 int nfs_updatepage(struct file *file, struct page *page,
737 unsigned int offset, unsigned int count) 737 unsigned int offset, unsigned int count)
738 { 738 {
739 struct nfs_open_context *ctx = nfs_file_open_context(file); 739 struct nfs_open_context *ctx = nfs_file_open_context(file);
740 struct inode *inode = page->mapping->host; 740 struct inode *inode = page->mapping->host;
741 int status = 0; 741 int status = 0;
742 742
743 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); 743 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
744 744
745 dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", 745 dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
746 file->f_path.dentry->d_parent->d_name.name, 746 file->f_path.dentry->d_parent->d_name.name,
747 file->f_path.dentry->d_name.name, count, 747 file->f_path.dentry->d_name.name, count,
748 (long long)(page_offset(page) + offset)); 748 (long long)(page_offset(page) + offset));
749 749
750 /* If we're not using byte range locks, and we know the page 750 /* If we're not using byte range locks, and we know the page
751 * is up to date, it may be more efficient to extend the write 751 * is up to date, it may be more efficient to extend the write
752 * to cover the entire page in order to avoid fragmentation 752 * to cover the entire page in order to avoid fragmentation
753 * inefficiencies. 753 * inefficiencies.
754 */ 754 */
755 if (nfs_write_pageuptodate(page, inode) && 755 if (nfs_write_pageuptodate(page, inode) &&
756 inode->i_flock == NULL && 756 inode->i_flock == NULL &&
757 !(file->f_flags & O_DSYNC)) { 757 !(file->f_flags & O_DSYNC)) {
758 count = max(count + offset, nfs_page_length(page)); 758 count = max(count + offset, nfs_page_length(page));
759 offset = 0; 759 offset = 0;
760 } 760 }
761 761
762 status = nfs_writepage_setup(ctx, page, offset, count); 762 status = nfs_writepage_setup(ctx, page, offset, count);
763 if (status < 0) 763 if (status < 0)
764 nfs_set_pageerror(page); 764 nfs_set_pageerror(page);
765 765
766 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 766 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
767 status, (long long)i_size_read(inode)); 767 status, (long long)i_size_read(inode));
768 return status; 768 return status;
769 } 769 }
770 770
771 static void nfs_writepage_release(struct nfs_page *req, 771 static void nfs_writepage_release(struct nfs_page *req,
772 struct nfs_write_data *data) 772 struct nfs_write_data *data)
773 { 773 {
774 struct page *page = req->wb_page; 774 struct page *page = req->wb_page;
775 775
776 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) 776 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
777 nfs_inode_remove_request(req); 777 nfs_inode_remove_request(req);
778 nfs_clear_page_tag_locked(req); 778 nfs_clear_page_tag_locked(req);
779 nfs_end_page_writeback(page); 779 nfs_end_page_writeback(page);
780 } 780 }
781 781
782 static int flush_task_priority(int how) 782 static int flush_task_priority(int how)
783 { 783 {
784 switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { 784 switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
785 case FLUSH_HIGHPRI: 785 case FLUSH_HIGHPRI:
786 return RPC_PRIORITY_HIGH; 786 return RPC_PRIORITY_HIGH;
787 case FLUSH_LOWPRI: 787 case FLUSH_LOWPRI:
788 return RPC_PRIORITY_LOW; 788 return RPC_PRIORITY_LOW;
789 } 789 }
790 return RPC_PRIORITY_NORMAL; 790 return RPC_PRIORITY_NORMAL;
791 } 791 }
792 792
793 int nfs_initiate_write(struct nfs_write_data *data, 793 int nfs_initiate_write(struct nfs_write_data *data,
794 struct rpc_clnt *clnt, 794 struct rpc_clnt *clnt,
795 const struct rpc_call_ops *call_ops, 795 const struct rpc_call_ops *call_ops,
796 int how) 796 int how)
797 { 797 {
798 struct inode *inode = data->inode; 798 struct inode *inode = data->inode;
799 int priority = flush_task_priority(how); 799 int priority = flush_task_priority(how);
800 struct rpc_task *task; 800 struct rpc_task *task;
801 struct rpc_message msg = { 801 struct rpc_message msg = {
802 .rpc_argp = &data->args, 802 .rpc_argp = &data->args,
803 .rpc_resp = &data->res, 803 .rpc_resp = &data->res,
804 .rpc_cred = data->cred, 804 .rpc_cred = data->cred,
805 }; 805 };
806 struct rpc_task_setup task_setup_data = { 806 struct rpc_task_setup task_setup_data = {
807 .rpc_client = clnt, 807 .rpc_client = clnt,
808 .task = &data->task, 808 .task = &data->task,
809 .rpc_message = &msg, 809 .rpc_message = &msg,
810 .callback_ops = call_ops, 810 .callback_ops = call_ops,
811 .callback_data = data, 811 .callback_data = data,
812 .workqueue = nfsiod_workqueue, 812 .workqueue = nfsiod_workqueue,
813 .flags = RPC_TASK_ASYNC, 813 .flags = RPC_TASK_ASYNC,
814 .priority = priority, 814 .priority = priority,
815 }; 815 };
816 int ret = 0; 816 int ret = 0;
817 817
818 /* Set up the initial task struct. */ 818 /* Set up the initial task struct. */
819 NFS_PROTO(inode)->write_setup(data, &msg); 819 NFS_PROTO(inode)->write_setup(data, &msg);
820 820
821 dprintk("NFS: %5u initiated write call " 821 dprintk("NFS: %5u initiated write call "
822 "(req %s/%lld, %u bytes @ offset %llu)\n", 822 "(req %s/%lld, %u bytes @ offset %llu)\n",
823 data->task.tk_pid, 823 data->task.tk_pid,
824 inode->i_sb->s_id, 824 inode->i_sb->s_id,
825 (long long)NFS_FILEID(inode), 825 (long long)NFS_FILEID(inode),
826 data->args.count, 826 data->args.count,
827 (unsigned long long)data->args.offset); 827 (unsigned long long)data->args.offset);
828 828
829 task = rpc_run_task(&task_setup_data); 829 task = rpc_run_task(&task_setup_data);
830 if (IS_ERR(task)) { 830 if (IS_ERR(task)) {
831 ret = PTR_ERR(task); 831 ret = PTR_ERR(task);
832 goto out; 832 goto out;
833 } 833 }
834 if (how & FLUSH_SYNC) { 834 if (how & FLUSH_SYNC) {
835 ret = rpc_wait_for_completion_task(task); 835 ret = rpc_wait_for_completion_task(task);
836 if (ret == 0) 836 if (ret == 0)
837 ret = task->tk_status; 837 ret = task->tk_status;
838 } 838 }
839 rpc_put_task(task); 839 rpc_put_task(task);
840 out: 840 out:
841 return ret; 841 return ret;
842 } 842 }
843 EXPORT_SYMBOL_GPL(nfs_initiate_write); 843 EXPORT_SYMBOL_GPL(nfs_initiate_write);
844 844
845 /* 845 /*
846 * Set up the argument/result storage required for the RPC call. 846 * Set up the argument/result storage required for the RPC call.
847 */ 847 */
848 static int nfs_write_rpcsetup(struct nfs_page *req, 848 static int nfs_write_rpcsetup(struct nfs_page *req,
849 struct nfs_write_data *data, 849 struct nfs_write_data *data,
850 const struct rpc_call_ops *call_ops, 850 const struct rpc_call_ops *call_ops,
851 unsigned int count, unsigned int offset, 851 unsigned int count, unsigned int offset,
852 struct pnfs_layout_segment *lseg, 852 struct pnfs_layout_segment *lseg,
853 int how) 853 int how)
854 { 854 {
855 struct inode *inode = req->wb_context->dentry->d_inode; 855 struct inode *inode = req->wb_context->dentry->d_inode;
856 856
857 /* Set up the RPC argument and reply structs 857 /* Set up the RPC argument and reply structs
858 * NB: take care not to mess about with data->commit et al. */ 858 * NB: take care not to mess about with data->commit et al. */
859 859
860 data->req = req; 860 data->req = req;
861 data->inode = inode = req->wb_context->dentry->d_inode; 861 data->inode = inode = req->wb_context->dentry->d_inode;
862 data->cred = req->wb_context->cred; 862 data->cred = req->wb_context->cred;
863 data->lseg = get_lseg(lseg); 863 data->lseg = get_lseg(lseg);
864 864
865 data->args.fh = NFS_FH(inode); 865 data->args.fh = NFS_FH(inode);
866 data->args.offset = req_offset(req) + offset; 866 data->args.offset = req_offset(req) + offset;
867 /* pnfs_set_layoutcommit needs this */ 867 /* pnfs_set_layoutcommit needs this */
868 data->mds_offset = data->args.offset; 868 data->mds_offset = data->args.offset;
869 data->args.pgbase = req->wb_pgbase + offset; 869 data->args.pgbase = req->wb_pgbase + offset;
870 data->args.pages = data->pagevec; 870 data->args.pages = data->pagevec;
871 data->args.count = count; 871 data->args.count = count;
872 data->args.context = get_nfs_open_context(req->wb_context); 872 data->args.context = get_nfs_open_context(req->wb_context);
873 data->args.lock_context = req->wb_lock_context; 873 data->args.lock_context = req->wb_lock_context;
874 data->args.stable = NFS_UNSTABLE; 874 data->args.stable = NFS_UNSTABLE;
875 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { 875 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
876 data->args.stable = NFS_DATA_SYNC; 876 data->args.stable = NFS_DATA_SYNC;
877 if (!nfs_need_commit(NFS_I(inode))) 877 if (!nfs_need_commit(NFS_I(inode)))
878 data->args.stable = NFS_FILE_SYNC; 878 data->args.stable = NFS_FILE_SYNC;
879 } 879 }
880 880
881 data->res.fattr = &data->fattr; 881 data->res.fattr = &data->fattr;
882 data->res.count = count; 882 data->res.count = count;
883 data->res.verf = &data->verf; 883 data->res.verf = &data->verf;
884 nfs_fattr_init(&data->fattr); 884 nfs_fattr_init(&data->fattr);
885 885
886 if (data->lseg && 886 if (data->lseg &&
887 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) 887 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
888 return 0; 888 return 0;
889 889
890 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); 890 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
891 } 891 }
892 892
893 /* If a nfs_flush_* function fails, it should remove reqs from @head and 893 /* If a nfs_flush_* function fails, it should remove reqs from @head and
894 * call this on each, which will prepare them to be retried on next 894 * call this on each, which will prepare them to be retried on next
895 * writeback using standard nfs. 895 * writeback using standard nfs.
896 */ 896 */
897 static void nfs_redirty_request(struct nfs_page *req) 897 static void nfs_redirty_request(struct nfs_page *req)
898 { 898 {
899 struct page *page = req->wb_page; 899 struct page *page = req->wb_page;
900 900
901 nfs_mark_request_dirty(req); 901 nfs_mark_request_dirty(req);
902 nfs_clear_page_tag_locked(req); 902 nfs_clear_page_tag_locked(req);
903 nfs_end_page_writeback(page); 903 nfs_end_page_writeback(page);
904 } 904 }
905 905
906 /* 906 /*
907 * Generate multiple small requests to write out a single 907 * Generate multiple small requests to write out a single
908 * contiguous dirty area on one page. 908 * contiguous dirty area on one page.
909 */ 909 */
910 static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) 910 static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
911 { 911 {
912 struct nfs_page *req = nfs_list_entry(desc->pg_list.next); 912 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
913 struct page *page = req->wb_page; 913 struct page *page = req->wb_page;
914 struct nfs_write_data *data; 914 struct nfs_write_data *data;
915 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; 915 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
916 unsigned int offset; 916 unsigned int offset;
917 int requests = 0; 917 int requests = 0;
918 int ret = 0; 918 int ret = 0;
919 struct pnfs_layout_segment *lseg; 919 struct pnfs_layout_segment *lseg;
920 LIST_HEAD(list); 920 LIST_HEAD(list);
921 921
922 nfs_list_remove_request(req); 922 nfs_list_remove_request(req);
923 923
924 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 924 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
925 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || 925 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit ||
926 desc->pg_count > wsize)) 926 desc->pg_count > wsize))
927 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 927 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
928 928
929 929
930 nbytes = desc->pg_count; 930 nbytes = desc->pg_count;
931 do { 931 do {
932 size_t len = min(nbytes, wsize); 932 size_t len = min(nbytes, wsize);
933 933
934 data = nfs_writedata_alloc(1); 934 data = nfs_writedata_alloc(1);
935 if (!data) 935 if (!data)
936 goto out_bad; 936 goto out_bad;
937 list_add(&data->pages, &list); 937 list_add(&data->pages, &list);
938 requests++; 938 requests++;
939 nbytes -= len; 939 nbytes -= len;
940 } while (nbytes != 0); 940 } while (nbytes != 0);
941 atomic_set(&req->wb_complete, requests); 941 atomic_set(&req->wb_complete, requests);
942 942
943 BUG_ON(desc->pg_lseg); 943 BUG_ON(desc->pg_lseg);
944 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, 944 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
945 req_offset(req), desc->pg_count, 945 req_offset(req), desc->pg_count,
946 IOMODE_RW, GFP_NOFS); 946 IOMODE_RW, GFP_NOFS);
947 ClearPageError(page); 947 ClearPageError(page);
948 offset = 0; 948 offset = 0;
949 nbytes = desc->pg_count; 949 nbytes = desc->pg_count;
950 do { 950 do {
951 int ret2; 951 int ret2;
952 952
953 data = list_entry(list.next, struct nfs_write_data, pages); 953 data = list_entry(list.next, struct nfs_write_data, pages);
954 list_del_init(&data->pages); 954 list_del_init(&data->pages);
955 955
956 data->pagevec[0] = page; 956 data->pagevec[0] = page;
957 957
958 if (nbytes < wsize) 958 if (nbytes < wsize)
959 wsize = nbytes; 959 wsize = nbytes;
960 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 960 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
961 wsize, offset, lseg, desc->pg_ioflags); 961 wsize, offset, lseg, desc->pg_ioflags);
962 if (ret == 0) 962 if (ret == 0)
963 ret = ret2; 963 ret = ret2;
964 offset += wsize; 964 offset += wsize;
965 nbytes -= wsize; 965 nbytes -= wsize;
966 } while (nbytes != 0); 966 } while (nbytes != 0);
967 967
968 put_lseg(lseg); 968 put_lseg(lseg);
969 desc->pg_lseg = NULL; 969 desc->pg_lseg = NULL;
970 return ret; 970 return ret;
971 971
972 out_bad: 972 out_bad:
973 while (!list_empty(&list)) { 973 while (!list_empty(&list)) {
974 data = list_entry(list.next, struct nfs_write_data, pages); 974 data = list_entry(list.next, struct nfs_write_data, pages);
975 list_del(&data->pages); 975 list_del(&data->pages);
976 nfs_writedata_free(data); 976 nfs_writedata_free(data);
977 } 977 }
978 nfs_redirty_request(req); 978 nfs_redirty_request(req);
979 return -ENOMEM; 979 return -ENOMEM;
980 } 980 }
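/*
 * A minimal, standalone sketch of the chunking arithmetic used by
 * nfs_flush_multi() above: one page's dirty span is split into wsize-sized
 * RPC requests.  wsize = 4096 and the 10000-byte span are illustrative
 * assumptions; only the loop arithmetic mirrors the kernel code, which
 * additionally allocates an nfs_write_data per chunk before issuing them.
 */
#include <stdio.h>

int main(void)
{
	size_t wsize = 4096;		/* assumed server write size */
	size_t nbytes = 10000;		/* assumed dirty span on the page */
	size_t offset = 0;
	int requests = 0;

	do {
		size_t len = nbytes < wsize ? nbytes : wsize;

		printf("request %d: %zu bytes at page offset %zu\n",
		       ++requests, len, offset);
		offset += len;
		nbytes -= len;
	} while (nbytes != 0);

	printf("total requests: %d\n", requests);
	return 0;
}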
981 981
982 /* 982 /*
983 * Create an RPC task for the given write request and kick it. 983 * Create an RPC task for the given write request and kick it.
984 * The page must have been locked by the caller. 984 * The page must have been locked by the caller.
985 * 985 *
986 * It may happen that the page we're passed is not marked dirty. 986 * It may happen that the page we're passed is not marked dirty.
987 * This is the case if nfs_updatepage detects a conflicting request 987 * This is the case if nfs_updatepage detects a conflicting request
988 * that has been written but not committed. 988 * that has been written but not committed.
989 */ 989 */
990 static int nfs_flush_one(struct nfs_pageio_descriptor *desc) 990 static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
991 { 991 {
992 struct nfs_page *req; 992 struct nfs_page *req;
993 struct page **pages; 993 struct page **pages;
994 struct nfs_write_data *data; 994 struct nfs_write_data *data;
995 struct list_head *head = &desc->pg_list; 995 struct list_head *head = &desc->pg_list;
996 struct pnfs_layout_segment *lseg = desc->pg_lseg; 996 struct pnfs_layout_segment *lseg = desc->pg_lseg;
997 int ret; 997 int ret;
998 998
999 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, 999 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
1000 desc->pg_count)); 1000 desc->pg_count));
1001 if (!data) { 1001 if (!data) {
1002 while (!list_empty(head)) { 1002 while (!list_empty(head)) {
1003 req = nfs_list_entry(head->next); 1003 req = nfs_list_entry(head->next);
1004 nfs_list_remove_request(req); 1004 nfs_list_remove_request(req);
1005 nfs_redirty_request(req); 1005 nfs_redirty_request(req);
1006 } 1006 }
1007 ret = -ENOMEM; 1007 ret = -ENOMEM;
1008 goto out; 1008 goto out;
1009 } 1009 }
1010 pages = data->pagevec; 1010 pages = data->pagevec;
1011 while (!list_empty(head)) { 1011 while (!list_empty(head)) {
1012 req = nfs_list_entry(head->next); 1012 req = nfs_list_entry(head->next);
1013 nfs_list_remove_request(req); 1013 nfs_list_remove_request(req);
1014 nfs_list_add_request(req, &data->pages); 1014 nfs_list_add_request(req, &data->pages);
1015 ClearPageError(req->wb_page); 1015 ClearPageError(req->wb_page);
1016 *pages++ = req->wb_page; 1016 *pages++ = req->wb_page;
1017 } 1017 }
1018 req = nfs_list_entry(data->pages.next); 1018 req = nfs_list_entry(data->pages.next);
1019 if ((!lseg) && list_is_singular(&data->pages)) 1019 if ((!lseg) && list_is_singular(&data->pages))
1020 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, 1020 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
1021 req_offset(req), desc->pg_count, 1021 req_offset(req), desc->pg_count,
1022 IOMODE_RW, GFP_NOFS); 1022 IOMODE_RW, GFP_NOFS);
1023 1023
1024 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 1024 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1025 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) 1025 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
1026 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 1026 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
1027 1027
1028 /* Set up the argument struct */ 1028 /* Set up the argument struct */
1029 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); 1029 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
1030 out: 1030 out:
1031 put_lseg(lseg); /* Cleans any gotten in ->pg_test */ 1031 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
1032 desc->pg_lseg = NULL; 1032 desc->pg_lseg = NULL;
1033 return ret; 1033 return ret;
1034 } 1034 }
1035 1035
1036 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1036 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1037 struct inode *inode, int ioflags) 1037 struct inode *inode, int ioflags)
1038 { 1038 {
1039 size_t wsize = NFS_SERVER(inode)->wsize; 1039 size_t wsize = NFS_SERVER(inode)->wsize;
1040 1040
1041 if (wsize < PAGE_CACHE_SIZE) 1041 if (wsize < PAGE_CACHE_SIZE)
1042 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1042 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
1043 else 1043 else
1044 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); 1044 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
1045 } 1045 }
1046 1046
1047 /* 1047 /*
1048 * Handle a write reply that flushed part of a page. 1048 * Handle a write reply that flushed part of a page.
1049 */ 1049 */
1050 static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) 1050 static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
1051 { 1051 {
1052 struct nfs_write_data *data = calldata; 1052 struct nfs_write_data *data = calldata;
1053 1053
1054 dprintk("NFS: %5u write(%s/%lld %d@%lld)", 1054 dprintk("NFS: %5u write(%s/%lld %d@%lld)",
1055 task->tk_pid, 1055 task->tk_pid,
1056 data->req->wb_context->dentry->d_inode->i_sb->s_id, 1056 data->req->wb_context->dentry->d_inode->i_sb->s_id,
1057 (long long) 1057 (long long)
1058 NFS_FILEID(data->req->wb_context->dentry->d_inode), 1058 NFS_FILEID(data->req->wb_context->dentry->d_inode),
1059 data->req->wb_bytes, (long long)req_offset(data->req)); 1059 data->req->wb_bytes, (long long)req_offset(data->req));
1060 1060
1061 nfs_writeback_done(task, data); 1061 nfs_writeback_done(task, data);
1062 } 1062 }
1063 1063
1064 static void nfs_writeback_release_partial(void *calldata) 1064 static void nfs_writeback_release_partial(void *calldata)
1065 { 1065 {
1066 struct nfs_write_data *data = calldata; 1066 struct nfs_write_data *data = calldata;
1067 struct nfs_page *req = data->req; 1067 struct nfs_page *req = data->req;
1068 struct page *page = req->wb_page; 1068 struct page *page = req->wb_page;
1069 int status = data->task.tk_status; 1069 int status = data->task.tk_status;
1070 1070
1071 if (status < 0) { 1071 if (status < 0) {
1072 nfs_set_pageerror(page); 1072 nfs_set_pageerror(page);
1073 nfs_context_set_write_error(req->wb_context, status); 1073 nfs_context_set_write_error(req->wb_context, status);
1074 dprintk(", error = %d\n", status); 1074 dprintk(", error = %d\n", status);
1075 goto out; 1075 goto out;
1076 } 1076 }
1077 1077
1078 if (nfs_write_need_commit(data)) { 1078 if (nfs_write_need_commit(data)) {
1079 struct inode *inode = page->mapping->host; 1079 struct inode *inode = page->mapping->host;
1080 1080
1081 spin_lock(&inode->i_lock); 1081 spin_lock(&inode->i_lock);
1082 if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { 1082 if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
1083 /* Do nothing; we need to resend the writes */ 1083 /* Do nothing; we need to resend the writes */
1084 } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { 1084 } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
1085 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1085 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1086 dprintk(" defer commit\n"); 1086 dprintk(" defer commit\n");
1087 } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { 1087 } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
1088 set_bit(PG_NEED_RESCHED, &req->wb_flags); 1088 set_bit(PG_NEED_RESCHED, &req->wb_flags);
1089 clear_bit(PG_NEED_COMMIT, &req->wb_flags); 1089 clear_bit(PG_NEED_COMMIT, &req->wb_flags);
1090 dprintk(" server reboot detected\n"); 1090 dprintk(" server reboot detected\n");
1091 } 1091 }
1092 spin_unlock(&inode->i_lock); 1092 spin_unlock(&inode->i_lock);
1093 } else 1093 } else
1094 dprintk(" OK\n"); 1094 dprintk(" OK\n");
1095 1095
1096 out: 1096 out:
1097 if (atomic_dec_and_test(&req->wb_complete)) 1097 if (atomic_dec_and_test(&req->wb_complete))
1098 nfs_writepage_release(req, data); 1098 nfs_writepage_release(req, data);
1099 nfs_writedata_release(calldata); 1099 nfs_writedata_release(calldata);
1100 } 1100 }
1101 1101
1102 #if defined(CONFIG_NFS_V4_1) 1102 #if defined(CONFIG_NFS_V4_1)
1103 void nfs_write_prepare(struct rpc_task *task, void *calldata) 1103 void nfs_write_prepare(struct rpc_task *task, void *calldata)
1104 { 1104 {
1105 struct nfs_write_data *data = calldata; 1105 struct nfs_write_data *data = calldata;
1106 1106
1107 if (nfs4_setup_sequence(NFS_SERVER(data->inode), 1107 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
1108 &data->args.seq_args, 1108 &data->args.seq_args,
1109 &data->res.seq_res, 1, task)) 1109 &data->res.seq_res, 1, task))
1110 return; 1110 return;
1111 rpc_call_start(task); 1111 rpc_call_start(task);
1112 } 1112 }
1113 #endif /* CONFIG_NFS_V4_1 */ 1113 #endif /* CONFIG_NFS_V4_1 */
1114 1114
1115 static const struct rpc_call_ops nfs_write_partial_ops = { 1115 static const struct rpc_call_ops nfs_write_partial_ops = {
1116 #if defined(CONFIG_NFS_V4_1) 1116 #if defined(CONFIG_NFS_V4_1)
1117 .rpc_call_prepare = nfs_write_prepare, 1117 .rpc_call_prepare = nfs_write_prepare,
1118 #endif /* CONFIG_NFS_V4_1 */ 1118 #endif /* CONFIG_NFS_V4_1 */
1119 .rpc_call_done = nfs_writeback_done_partial, 1119 .rpc_call_done = nfs_writeback_done_partial,
1120 .rpc_release = nfs_writeback_release_partial, 1120 .rpc_release = nfs_writeback_release_partial,
1121 }; 1121 };
1122 1122
1123 /* 1123 /*
1124 * Handle a write reply that flushes a whole page. 1124 * Handle a write reply that flushes a whole page.
1125 * 1125 *
1126 * FIXME: There is an inherent race with invalidate_inode_pages and 1126 * FIXME: There is an inherent race with invalidate_inode_pages and
1127 * writebacks since the page->count is kept > 1 for as long 1127 * writebacks since the page->count is kept > 1 for as long
1128 * as the page has a write request pending. 1128 * as the page has a write request pending.
1129 */ 1129 */
1130 static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) 1130 static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1131 { 1131 {
1132 struct nfs_write_data *data = calldata; 1132 struct nfs_write_data *data = calldata;
1133 1133
1134 nfs_writeback_done(task, data); 1134 nfs_writeback_done(task, data);
1135 } 1135 }
1136 1136
1137 static void nfs_writeback_release_full(void *calldata) 1137 static void nfs_writeback_release_full(void *calldata)
1138 { 1138 {
1139 struct nfs_write_data *data = calldata; 1139 struct nfs_write_data *data = calldata;
1140 int status = data->task.tk_status; 1140 int status = data->task.tk_status;
1141 1141
1142 /* Update attributes as result of writeback. */ 1142 /* Update attributes as result of writeback. */
1143 while (!list_empty(&data->pages)) { 1143 while (!list_empty(&data->pages)) {
1144 struct nfs_page *req = nfs_list_entry(data->pages.next); 1144 struct nfs_page *req = nfs_list_entry(data->pages.next);
1145 struct page *page = req->wb_page; 1145 struct page *page = req->wb_page;
1146 1146
1147 nfs_list_remove_request(req); 1147 nfs_list_remove_request(req);
1148 1148
1149 dprintk("NFS: %5u write (%s/%lld %d@%lld)", 1149 dprintk("NFS: %5u write (%s/%lld %d@%lld)",
1150 data->task.tk_pid, 1150 data->task.tk_pid,
1151 req->wb_context->dentry->d_inode->i_sb->s_id, 1151 req->wb_context->dentry->d_inode->i_sb->s_id,
1152 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 1152 (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
1153 req->wb_bytes, 1153 req->wb_bytes,
1154 (long long)req_offset(req)); 1154 (long long)req_offset(req));
1155 1155
1156 if (status < 0) { 1156 if (status < 0) {
1157 nfs_set_pageerror(page); 1157 nfs_set_pageerror(page);
1158 nfs_context_set_write_error(req->wb_context, status); 1158 nfs_context_set_write_error(req->wb_context, status);
1159 dprintk(", error = %d\n", status); 1159 dprintk(", error = %d\n", status);
1160 goto remove_request; 1160 goto remove_request;
1161 } 1161 }
1162 1162
1163 if (nfs_write_need_commit(data)) { 1163 if (nfs_write_need_commit(data)) {
1164 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1164 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1165 nfs_mark_request_commit(req, data->lseg); 1165 nfs_mark_request_commit(req, data->lseg);
1166 dprintk(" marked for commit\n"); 1166 dprintk(" marked for commit\n");
1167 goto next; 1167 goto next;
1168 } 1168 }
1169 dprintk(" OK\n"); 1169 dprintk(" OK\n");
1170 remove_request: 1170 remove_request:
1171 nfs_inode_remove_request(req); 1171 nfs_inode_remove_request(req);
1172 next: 1172 next:
1173 nfs_clear_page_tag_locked(req); 1173 nfs_clear_page_tag_locked(req);
1174 nfs_end_page_writeback(page); 1174 nfs_end_page_writeback(page);
1175 } 1175 }
1176 nfs_writedata_release(calldata); 1176 nfs_writedata_release(calldata);
1177 } 1177 }
1178 1178
1179 static const struct rpc_call_ops nfs_write_full_ops = { 1179 static const struct rpc_call_ops nfs_write_full_ops = {
1180 #if defined(CONFIG_NFS_V4_1) 1180 #if defined(CONFIG_NFS_V4_1)
1181 .rpc_call_prepare = nfs_write_prepare, 1181 .rpc_call_prepare = nfs_write_prepare,
1182 #endif /* CONFIG_NFS_V4_1 */ 1182 #endif /* CONFIG_NFS_V4_1 */
1183 .rpc_call_done = nfs_writeback_done_full, 1183 .rpc_call_done = nfs_writeback_done_full,
1184 .rpc_release = nfs_writeback_release_full, 1184 .rpc_release = nfs_writeback_release_full,
1185 }; 1185 };
1186 1186
1187 1187
1188 /* 1188 /*
1189 * This function is called when the WRITE call is complete. 1189 * This function is called when the WRITE call is complete.
1190 */ 1190 */
1191 void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1191 void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1192 { 1192 {
1193 struct nfs_writeargs *argp = &data->args; 1193 struct nfs_writeargs *argp = &data->args;
1194 struct nfs_writeres *resp = &data->res; 1194 struct nfs_writeres *resp = &data->res;
1195 struct nfs_server *server = NFS_SERVER(data->inode); 1195 struct nfs_server *server = NFS_SERVER(data->inode);
1196 int status; 1196 int status;
1197 1197
1198 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1198 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
1199 task->tk_pid, task->tk_status); 1199 task->tk_pid, task->tk_status);
1200 1200
1201 /* 1201 /*
1202 * ->write_done will attempt to use post-op attributes to detect 1202 * ->write_done will attempt to use post-op attributes to detect
1203 * conflicting writes by other clients. A strict interpretation 1203 * conflicting writes by other clients. A strict interpretation
1204 * of close-to-open would allow us to continue caching even if 1204 * of close-to-open would allow us to continue caching even if
1205 * another writer had changed the file, but some applications 1205 * another writer had changed the file, but some applications
1206 * depend on tighter cache coherency when writing. 1206 * depend on tighter cache coherency when writing.
1207 */ 1207 */
1208 status = NFS_PROTO(data->inode)->write_done(task, data); 1208 status = NFS_PROTO(data->inode)->write_done(task, data);
1209 if (status != 0) 1209 if (status != 0)
1210 return; 1210 return;
1211 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1211 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
1212 1212
1213 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1213 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1214 if (resp->verf->committed < argp->stable && task->tk_status >= 0) { 1214 if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
1215 /* We tried a write call, but the server did not 1215 /* We tried a write call, but the server did not
1216 * commit data to stable storage even though we 1216 * commit data to stable storage even though we
1217 * requested it. 1217 * requested it.
1218 * Note: There is a known bug in Tru64 < 5.0 in which 1218 * Note: There is a known bug in Tru64 < 5.0 in which
1219 * the server reports NFS_DATA_SYNC, but performs 1219 * the server reports NFS_DATA_SYNC, but performs
1220 * NFS_FILE_SYNC. We therefore implement this checking 1220 * NFS_FILE_SYNC. We therefore implement this checking
1221 * as a dprintk() in order to avoid filling syslog. 1221 * as a dprintk() in order to avoid filling syslog.
1222 */ 1222 */
1223 static unsigned long complain; 1223 static unsigned long complain;
1224 1224
1225 /* Note this will print the MDS for a DS write */ 1225 /* Note this will print the MDS for a DS write */
1226 if (time_before(complain, jiffies)) { 1226 if (time_before(complain, jiffies)) {
1227 dprintk("NFS: faulty NFS server %s:" 1227 dprintk("NFS: faulty NFS server %s:"
1228 " (committed = %d) != (stable = %d)\n", 1228 " (committed = %d) != (stable = %d)\n",
1229 server->nfs_client->cl_hostname, 1229 server->nfs_client->cl_hostname,
1230 resp->verf->committed, argp->stable); 1230 resp->verf->committed, argp->stable);
1231 complain = jiffies + 300 * HZ; 1231 complain = jiffies + 300 * HZ;
1232 } 1232 }
1233 } 1233 }
1234 #endif 1234 #endif
1235 /* Is this a short write? */ 1235 /* Is this a short write? */
1236 if (task->tk_status >= 0 && resp->count < argp->count) { 1236 if (task->tk_status >= 0 && resp->count < argp->count) {
1237 static unsigned long complain; 1237 static unsigned long complain;
1238 1238
1239 nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); 1239 nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE);
1240 1240
1241 /* Has the server at least made some progress? */ 1241 /* Has the server at least made some progress? */
1242 if (resp->count != 0) { 1242 if (resp->count != 0) {
1243 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1243 /* Was this an NFSv2 write or an NFSv3 stable write? */
1244 if (resp->verf->committed != NFS_UNSTABLE) { 1244 if (resp->verf->committed != NFS_UNSTABLE) {
1245 /* Resend from where the server left off */ 1245 /* Resend from where the server left off */
1246 data->mds_offset += resp->count; 1246 data->mds_offset += resp->count;
1247 argp->offset += resp->count; 1247 argp->offset += resp->count;
1248 argp->pgbase += resp->count; 1248 argp->pgbase += resp->count;
1249 argp->count -= resp->count; 1249 argp->count -= resp->count;
1250 } else { 1250 } else {
1251 /* Resend as a stable write in order to avoid 1251 /* Resend as a stable write in order to avoid
1252 * headaches in the case of a server crash. 1252 * headaches in the case of a server crash.
1253 */ 1253 */
1254 argp->stable = NFS_FILE_SYNC; 1254 argp->stable = NFS_FILE_SYNC;
1255 } 1255 }
1256 nfs_restart_rpc(task, server->nfs_client); 1256 nfs_restart_rpc(task, server->nfs_client);
1257 return; 1257 return;
1258 } 1258 }
1259 if (time_before(complain, jiffies)) { 1259 if (time_before(complain, jiffies)) {
1260 printk(KERN_WARNING 1260 printk(KERN_WARNING
1261 "NFS: Server wrote zero bytes, expected %u.\n", 1261 "NFS: Server wrote zero bytes, expected %u.\n",
1262 argp->count); 1262 argp->count);
1263 complain = jiffies + 300 * HZ; 1263 complain = jiffies + 300 * HZ;
1264 } 1264 }
1265 /* Can't do anything about it except throw an error. */ 1265 /* Can't do anything about it except throw an error. */
1266 task->tk_status = -EIO; 1266 task->tk_status = -EIO;
1267 } 1267 }
1268 return; 1268 return;
1269 } 1269 }
1270 1270
1271 1271
1272 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1272 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1273 static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1273 static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1274 { 1274 {
1275 int ret; 1275 int ret;
1276 1276
1277 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) 1277 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
1278 return 1; 1278 return 1;
1279 if (!may_wait) 1279 if (!may_wait)
1280 return 0; 1280 return 0;
1281 ret = out_of_line_wait_on_bit_lock(&nfsi->flags, 1281 ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
1282 NFS_INO_COMMIT, 1282 NFS_INO_COMMIT,
1283 nfs_wait_bit_killable, 1283 nfs_wait_bit_killable,
1284 TASK_KILLABLE); 1284 TASK_KILLABLE);
1285 return (ret < 0) ? ret : 1; 1285 return (ret < 0) ? ret : 1;
1286 } 1286 }
1287 1287
1288 void nfs_commit_clear_lock(struct nfs_inode *nfsi) 1288 void nfs_commit_clear_lock(struct nfs_inode *nfsi)
1289 { 1289 {
1290 clear_bit(NFS_INO_COMMIT, &nfsi->flags); 1290 clear_bit(NFS_INO_COMMIT, &nfsi->flags);
1291 smp_mb__after_clear_bit(); 1291 smp_mb__after_clear_bit();
1292 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); 1292 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
1293 } 1293 }
1294 EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); 1294 EXPORT_SYMBOL_GPL(nfs_commit_clear_lock);
1295 1295
1296 void nfs_commitdata_release(void *data) 1296 void nfs_commitdata_release(void *data)
1297 { 1297 {
1298 struct nfs_write_data *wdata = data; 1298 struct nfs_write_data *wdata = data;
1299 1299
1300 put_lseg(wdata->lseg); 1300 put_lseg(wdata->lseg);
1301 put_nfs_open_context(wdata->args.context); 1301 put_nfs_open_context(wdata->args.context);
1302 nfs_commit_free(wdata); 1302 nfs_commit_free(wdata);
1303 } 1303 }
1304 EXPORT_SYMBOL_GPL(nfs_commitdata_release); 1304 EXPORT_SYMBOL_GPL(nfs_commitdata_release);
1305 1305
1306 int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, 1306 int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt,
1307 const struct rpc_call_ops *call_ops, 1307 const struct rpc_call_ops *call_ops,
1308 int how) 1308 int how)
1309 { 1309 {
1310 struct rpc_task *task; 1310 struct rpc_task *task;
1311 int priority = flush_task_priority(how); 1311 int priority = flush_task_priority(how);
1312 struct rpc_message msg = { 1312 struct rpc_message msg = {
1313 .rpc_argp = &data->args, 1313 .rpc_argp = &data->args,
1314 .rpc_resp = &data->res, 1314 .rpc_resp = &data->res,
1315 .rpc_cred = data->cred, 1315 .rpc_cred = data->cred,
1316 }; 1316 };
1317 struct rpc_task_setup task_setup_data = { 1317 struct rpc_task_setup task_setup_data = {
1318 .task = &data->task, 1318 .task = &data->task,
1319 .rpc_client = clnt, 1319 .rpc_client = clnt,
1320 .rpc_message = &msg, 1320 .rpc_message = &msg,
1321 .callback_ops = call_ops, 1321 .callback_ops = call_ops,
1322 .callback_data = data, 1322 .callback_data = data,
1323 .workqueue = nfsiod_workqueue, 1323 .workqueue = nfsiod_workqueue,
1324 .flags = RPC_TASK_ASYNC, 1324 .flags = RPC_TASK_ASYNC,
1325 .priority = priority, 1325 .priority = priority,
1326 }; 1326 };
1327 /* Set up the initial task struct. */ 1327 /* Set up the initial task struct. */
1328 NFS_PROTO(data->inode)->commit_setup(data, &msg); 1328 NFS_PROTO(data->inode)->commit_setup(data, &msg);
1329 1329
1330 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1330 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1331 1331
1332 task = rpc_run_task(&task_setup_data); 1332 task = rpc_run_task(&task_setup_data);
1333 if (IS_ERR(task)) 1333 if (IS_ERR(task))
1334 return PTR_ERR(task); 1334 return PTR_ERR(task);
1335 if (how & FLUSH_SYNC) 1335 if (how & FLUSH_SYNC)
1336 rpc_wait_for_completion_task(task); 1336 rpc_wait_for_completion_task(task);
1337 rpc_put_task(task); 1337 rpc_put_task(task);
1338 return 0; 1338 return 0;
1339 } 1339 }
1340 EXPORT_SYMBOL_GPL(nfs_initiate_commit); 1340 EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1341 1341
1342 /* 1342 /*
1343 * Set up the argument/result storage required for the RPC call. 1343 * Set up the argument/result storage required for the RPC call.
1344 */ 1344 */
1345 void nfs_init_commit(struct nfs_write_data *data, 1345 void nfs_init_commit(struct nfs_write_data *data,
1346 struct list_head *head, 1346 struct list_head *head,
1347 struct pnfs_layout_segment *lseg) 1347 struct pnfs_layout_segment *lseg)
1348 { 1348 {
1349 struct nfs_page *first = nfs_list_entry(head->next); 1349 struct nfs_page *first = nfs_list_entry(head->next);
1350 struct inode *inode = first->wb_context->dentry->d_inode; 1350 struct inode *inode = first->wb_context->dentry->d_inode;
1351 1351
1352 /* Set up the RPC argument and reply structs 1352 /* Set up the RPC argument and reply structs
1353 * NB: take care not to mess about with data->commit et al. */ 1353 * NB: take care not to mess about with data->commit et al. */
1354 1354
1355 list_splice_init(head, &data->pages); 1355 list_splice_init(head, &data->pages);
1356 1356
1357 data->inode = inode; 1357 data->inode = inode;
1358 data->cred = first->wb_context->cred; 1358 data->cred = first->wb_context->cred;
1359 data->lseg = lseg; /* reference transferred */ 1359 data->lseg = lseg; /* reference transferred */
1360 data->mds_ops = &nfs_commit_ops; 1360 data->mds_ops = &nfs_commit_ops;
1361 1361
1362 data->args.fh = NFS_FH(data->inode); 1362 data->args.fh = NFS_FH(data->inode);
1363 /* Note: we always request a commit of the entire inode */ 1363 /* Note: we always request a commit of the entire inode */
1364 data->args.offset = 0; 1364 data->args.offset = 0;
1365 data->args.count = 0; 1365 data->args.count = 0;
1366 data->args.context = get_nfs_open_context(first->wb_context); 1366 data->args.context = get_nfs_open_context(first->wb_context);
1367 data->res.count = 0; 1367 data->res.count = 0;
1368 data->res.fattr = &data->fattr; 1368 data->res.fattr = &data->fattr;
1369 data->res.verf = &data->verf; 1369 data->res.verf = &data->verf;
1370 nfs_fattr_init(&data->fattr); 1370 nfs_fattr_init(&data->fattr);
1371 } 1371 }
1372 EXPORT_SYMBOL_GPL(nfs_init_commit); 1372 EXPORT_SYMBOL_GPL(nfs_init_commit);
1373 1373
1374 void nfs_retry_commit(struct list_head *page_list, 1374 void nfs_retry_commit(struct list_head *page_list,
1375 struct pnfs_layout_segment *lseg) 1375 struct pnfs_layout_segment *lseg)
1376 { 1376 {
1377 struct nfs_page *req; 1377 struct nfs_page *req;
1378 1378
1379 while (!list_empty(page_list)) { 1379 while (!list_empty(page_list)) {
1380 req = nfs_list_entry(page_list->next); 1380 req = nfs_list_entry(page_list->next);
1381 nfs_list_remove_request(req); 1381 nfs_list_remove_request(req);
1382 nfs_mark_request_commit(req, lseg); 1382 nfs_mark_request_commit(req, lseg);
1383 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1383 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1384 dec_bdi_stat(req->wb_page->mapping->backing_dev_info, 1384 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
1385 BDI_RECLAIMABLE); 1385 BDI_RECLAIMABLE);
1386 nfs_clear_page_tag_locked(req); 1386 nfs_clear_page_tag_locked(req);
1387 } 1387 }
1388 } 1388 }
1389 EXPORT_SYMBOL_GPL(nfs_retry_commit); 1389 EXPORT_SYMBOL_GPL(nfs_retry_commit);
1390 1390
1391 /* 1391 /*
1392 * Commit dirty pages 1392 * Commit dirty pages
1393 */ 1393 */
1394 static int 1394 static int
1395 nfs_commit_list(struct inode *inode, struct list_head *head, int how) 1395 nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1396 { 1396 {
1397 struct nfs_write_data *data; 1397 struct nfs_write_data *data;
1398 1398
1399 data = nfs_commitdata_alloc(); 1399 data = nfs_commitdata_alloc();
1400 1400
1401 if (!data) 1401 if (!data)
1402 goto out_bad; 1402 goto out_bad;
1403 1403
1404 /* Set up the argument struct */ 1404 /* Set up the argument struct */
1405 nfs_init_commit(data, head, NULL); 1405 nfs_init_commit(data, head, NULL);
1406 return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); 1406 return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how);
1407 out_bad: 1407 out_bad:
1408 nfs_retry_commit(head, NULL); 1408 nfs_retry_commit(head, NULL);
1409 nfs_commit_clear_lock(NFS_I(inode)); 1409 nfs_commit_clear_lock(NFS_I(inode));
1410 return -ENOMEM; 1410 return -ENOMEM;
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * COMMIT call returned 1414 * COMMIT call returned
1415 */ 1415 */
1416 static void nfs_commit_done(struct rpc_task *task, void *calldata) 1416 static void nfs_commit_done(struct rpc_task *task, void *calldata)
1417 { 1417 {
1418 struct nfs_write_data *data = calldata; 1418 struct nfs_write_data *data = calldata;
1419 1419
1420 dprintk("NFS: %5u nfs_commit_done (status %d)\n", 1420 dprintk("NFS: %5u nfs_commit_done (status %d)\n",
1421 task->tk_pid, task->tk_status); 1421 task->tk_pid, task->tk_status);
1422 1422
1423 /* Call the NFS version-specific code */ 1423 /* Call the NFS version-specific code */
1424 NFS_PROTO(data->inode)->commit_done(task, data); 1424 NFS_PROTO(data->inode)->commit_done(task, data);
1425 } 1425 }
1426 1426
1427 void nfs_commit_release_pages(struct nfs_write_data *data) 1427 void nfs_commit_release_pages(struct nfs_write_data *data)
1428 { 1428 {
1429 struct nfs_page *req; 1429 struct nfs_page *req;
1430 int status = data->task.tk_status; 1430 int status = data->task.tk_status;
1431 1431
1432 while (!list_empty(&data->pages)) { 1432 while (!list_empty(&data->pages)) {
1433 req = nfs_list_entry(data->pages.next); 1433 req = nfs_list_entry(data->pages.next);
1434 nfs_list_remove_request(req); 1434 nfs_list_remove_request(req);
1435 nfs_clear_request_commit(req); 1435 nfs_clear_request_commit(req);
1436 1436
1437 dprintk("NFS: commit (%s/%lld %d@%lld)", 1437 dprintk("NFS: commit (%s/%lld %d@%lld)",
1438 req->wb_context->dentry->d_sb->s_id, 1438 req->wb_context->dentry->d_sb->s_id,
1439 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 1439 (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
1440 req->wb_bytes, 1440 req->wb_bytes,
1441 (long long)req_offset(req)); 1441 (long long)req_offset(req));
1442 if (status < 0) { 1442 if (status < 0) {
1443 nfs_context_set_write_error(req->wb_context, status); 1443 nfs_context_set_write_error(req->wb_context, status);
1444 nfs_inode_remove_request(req); 1444 nfs_inode_remove_request(req);
1445 dprintk(", error = %d\n", status); 1445 dprintk(", error = %d\n", status);
1446 goto next; 1446 goto next;
1447 } 1447 }
1448 1448
1449 /* Okay, COMMIT succeeded, apparently. Check the verifier 1449 /* Okay, COMMIT succeeded, apparently. Check the verifier
1450 * returned by the server against all stored verfs. */ 1450 * returned by the server against all stored verfs. */
1451 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { 1451 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
1452 /* We have a match */ 1452 /* We have a match */
1453 nfs_inode_remove_request(req); 1453 nfs_inode_remove_request(req);
1454 dprintk(" OK\n"); 1454 dprintk(" OK\n");
1455 goto next; 1455 goto next;
1456 } 1456 }
1457 /* We have a mismatch. Write the page again */ 1457 /* We have a mismatch. Write the page again */
1458 dprintk(" mismatch\n"); 1458 dprintk(" mismatch\n");
1459 nfs_mark_request_dirty(req); 1459 nfs_mark_request_dirty(req);
1460 next: 1460 next:
1461 nfs_clear_page_tag_locked(req); 1461 nfs_clear_page_tag_locked(req);
1462 } 1462 }
1463 } 1463 }
1464 EXPORT_SYMBOL_GPL(nfs_commit_release_pages); 1464 EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
1465 1465
1466 static void nfs_commit_release(void *calldata) 1466 static void nfs_commit_release(void *calldata)
1467 { 1467 {
1468 struct nfs_write_data *data = calldata; 1468 struct nfs_write_data *data = calldata;
1469 1469
1470 nfs_commit_release_pages(data); 1470 nfs_commit_release_pages(data);
1471 nfs_commit_clear_lock(NFS_I(data->inode)); 1471 nfs_commit_clear_lock(NFS_I(data->inode));
1472 nfs_commitdata_release(calldata); 1472 nfs_commitdata_release(calldata);
1473 } 1473 }
1474 1474
1475 static const struct rpc_call_ops nfs_commit_ops = { 1475 static const struct rpc_call_ops nfs_commit_ops = {
1476 #if defined(CONFIG_NFS_V4_1) 1476 #if defined(CONFIG_NFS_V4_1)
1477 .rpc_call_prepare = nfs_write_prepare, 1477 .rpc_call_prepare = nfs_write_prepare,
1478 #endif /* CONFIG_NFS_V4_1 */ 1478 #endif /* CONFIG_NFS_V4_1 */
1479 .rpc_call_done = nfs_commit_done, 1479 .rpc_call_done = nfs_commit_done,
1480 .rpc_release = nfs_commit_release, 1480 .rpc_release = nfs_commit_release,
1481 }; 1481 };
1482 1482
1483 int nfs_commit_inode(struct inode *inode, int how) 1483 int nfs_commit_inode(struct inode *inode, int how)
1484 { 1484 {
1485 LIST_HEAD(head); 1485 LIST_HEAD(head);
1486 int may_wait = how & FLUSH_SYNC; 1486 int may_wait = how & FLUSH_SYNC;
1487 int res; 1487 int res;
1488 1488
1489 res = nfs_commit_set_lock(NFS_I(inode), may_wait); 1489 res = nfs_commit_set_lock(NFS_I(inode), may_wait);
1490 if (res <= 0) 1490 if (res <= 0)
1491 goto out_mark_dirty; 1491 goto out_mark_dirty;
1492 res = nfs_scan_commit(inode, &head, 0, 0); 1492 res = nfs_scan_commit(inode, &head, 0, 0);
1493 if (res) { 1493 if (res) {
1494 int error; 1494 int error;
1495 1495
1496 error = pnfs_commit_list(inode, &head, how); 1496 error = pnfs_commit_list(inode, &head, how);
1497 if (error == PNFS_NOT_ATTEMPTED) 1497 if (error == PNFS_NOT_ATTEMPTED)
1498 error = nfs_commit_list(inode, &head, how); 1498 error = nfs_commit_list(inode, &head, how);
1499 if (error < 0) 1499 if (error < 0)
1500 return error; 1500 return error;
1501 if (!may_wait) 1501 if (!may_wait)
1502 goto out_mark_dirty; 1502 goto out_mark_dirty;
1503 error = wait_on_bit(&NFS_I(inode)->flags, 1503 error = wait_on_bit(&NFS_I(inode)->flags,
1504 NFS_INO_COMMIT, 1504 NFS_INO_COMMIT,
1505 nfs_wait_bit_killable, 1505 nfs_wait_bit_killable,
1506 TASK_KILLABLE); 1506 TASK_KILLABLE);
1507 if (error < 0) 1507 if (error < 0)
1508 return error; 1508 return error;
1509 } else 1509 } else
1510 nfs_commit_clear_lock(NFS_I(inode)); 1510 nfs_commit_clear_lock(NFS_I(inode));
1511 return res; 1511 return res;
1512 /* Note: If we exit without ensuring that the commit is complete, 1512 /* Note: If we exit without ensuring that the commit is complete,
1513 * we must mark the inode as dirty. Otherwise, future calls to 1513 * we must mark the inode as dirty. Otherwise, future calls to
1514 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure 1514 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1515 * that the data is on the disk. 1515 * that the data is on the disk.
1516 */ 1516 */
1517 out_mark_dirty: 1517 out_mark_dirty:
1518 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1518 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1519 return res; 1519 return res;
1520 } 1520 }
1521 1521
1522 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) 1522 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1523 { 1523 {
1524 struct nfs_inode *nfsi = NFS_I(inode); 1524 struct nfs_inode *nfsi = NFS_I(inode);
1525 int flags = FLUSH_SYNC; 1525 int flags = FLUSH_SYNC;
1526 int ret = 0; 1526 int ret = 0;
1527 1527
1528 if (wbc->sync_mode == WB_SYNC_NONE) { 1528 if (wbc->sync_mode == WB_SYNC_NONE) {
1529 /* Don't commit yet if this is a non-blocking flush and there 1529 /* Don't commit yet if this is a non-blocking flush and there
1530 * are a lot of outstanding writes for this mapping. 1530 * are a lot of outstanding writes for this mapping.
1531 */ 1531 */
1532 if (nfsi->ncommit <= (nfsi->npages >> 1)) 1532 if (nfsi->ncommit <= (nfsi->npages >> 1))
1533 goto out_mark_dirty; 1533 goto out_mark_dirty;
1534 1534
1535 /* don't wait for the COMMIT response */ 1535 /* don't wait for the COMMIT response */
1536 flags = 0; 1536 flags = 0;
1537 } 1537 }
1538 1538
1539 ret = nfs_commit_inode(inode, flags); 1539 ret = nfs_commit_inode(inode, flags);
1540 if (ret >= 0) { 1540 if (ret >= 0) {
1541 if (wbc->sync_mode == WB_SYNC_NONE) { 1541 if (wbc->sync_mode == WB_SYNC_NONE) {
1542 if (ret < wbc->nr_to_write) 1542 if (ret < wbc->nr_to_write)
1543 wbc->nr_to_write -= ret; 1543 wbc->nr_to_write -= ret;
1544 else 1544 else
1545 wbc->nr_to_write = 0; 1545 wbc->nr_to_write = 0;
1546 } 1546 }
1547 return 0; 1547 return 0;
1548 } 1548 }
1549 out_mark_dirty: 1549 out_mark_dirty:
1550 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1550 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1551 return ret; 1551 return ret;
1552 } 1552 }
1553 #else 1553 #else
1554 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) 1554 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1555 { 1555 {
1556 return 0; 1556 return 0;
1557 } 1557 }
1558 #endif 1558 #endif
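/*
 * An illustrative sketch of the WB_SYNC_NONE heuristic in
 * nfs_commit_unstable_pages() above: a non-blocking flush only issues a
 * COMMIT once more than half of the inode's pages are waiting on one, and
 * then does not wait for the reply.  The page counts here are assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

static bool should_commit_now(unsigned long ncommit, unsigned long npages)
{
	/* mirrors: if (nfsi->ncommit <= (nfsi->npages >> 1)) skip the commit */
	return ncommit > (npages >> 1);
}

int main(void)
{
	printf("%d\n", should_commit_now(100, 1000));	/* 0: keep accumulating */
	printf("%d\n", should_commit_now(600, 1000));	/* 1: issue async COMMIT */
	return 0;
}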
1559 1559
1560 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1560 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1561 { 1561 {
1562 int ret; 1562 int ret;
1563 1563
1564 ret = nfs_commit_unstable_pages(inode, wbc); 1564 ret = nfs_commit_unstable_pages(inode, wbc);
1565 if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { 1565 if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
1566 int status; 1566 int status;
1567 bool sync = true; 1567 bool sync = true;
1568 1568
1569 if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || 1569 if (wbc->sync_mode == WB_SYNC_NONE)
1570 wbc->for_background)
1571 sync = false; 1570 sync = false;
1572 1571
1573 status = pnfs_layoutcommit_inode(inode, sync); 1572 status = pnfs_layoutcommit_inode(inode, sync);
1574 if (status < 0) 1573 if (status < 0)
1575 return status; 1574 return status;
1576 } 1575 }
1577 return ret; 1576 return ret;
1578 } 1577 }
1579 1578
1580 /* 1579 /*
1581 * flush the inode to disk. 1580 * flush the inode to disk.
1582 */ 1581 */
1583 int nfs_wb_all(struct inode *inode) 1582 int nfs_wb_all(struct inode *inode)
1584 { 1583 {
1585 struct writeback_control wbc = { 1584 struct writeback_control wbc = {
1586 .sync_mode = WB_SYNC_ALL, 1585 .sync_mode = WB_SYNC_ALL,
1587 .nr_to_write = LONG_MAX, 1586 .nr_to_write = LONG_MAX,
1588 .range_start = 0, 1587 .range_start = 0,
1589 .range_end = LLONG_MAX, 1588 .range_end = LLONG_MAX,
1590 }; 1589 };
1591 1590
1592 return sync_inode(inode, &wbc); 1591 return sync_inode(inode, &wbc);
1593 } 1592 }
1594 1593
1595 int nfs_wb_page_cancel(struct inode *inode, struct page *page) 1594 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1596 { 1595 {
1597 struct nfs_page *req; 1596 struct nfs_page *req;
1598 int ret = 0; 1597 int ret = 0;
1599 1598
1600 BUG_ON(!PageLocked(page)); 1599 BUG_ON(!PageLocked(page));
1601 for (;;) { 1600 for (;;) {
1602 wait_on_page_writeback(page); 1601 wait_on_page_writeback(page);
1603 req = nfs_page_find_request(page); 1602 req = nfs_page_find_request(page);
1604 if (req == NULL) 1603 if (req == NULL)
1605 break; 1604 break;
1606 if (nfs_lock_request_dontget(req)) { 1605 if (nfs_lock_request_dontget(req)) {
1607 nfs_inode_remove_request(req); 1606 nfs_inode_remove_request(req);
1608 /* 1607 /*
1609 * In case nfs_inode_remove_request has marked the 1608 * In case nfs_inode_remove_request has marked the
1610 * page as being dirty 1609 * page as being dirty
1611 */ 1610 */
1612 cancel_dirty_page(page, PAGE_CACHE_SIZE); 1611 cancel_dirty_page(page, PAGE_CACHE_SIZE);
1613 nfs_unlock_request(req); 1612 nfs_unlock_request(req);
1614 break; 1613 break;
1615 } 1614 }
1616 ret = nfs_wait_on_request(req); 1615 ret = nfs_wait_on_request(req);
1617 nfs_release_request(req); 1616 nfs_release_request(req);
1618 if (ret < 0) 1617 if (ret < 0)
1619 break; 1618 break;
1620 } 1619 }
1621 return ret; 1620 return ret;
1622 } 1621 }
1623 1622
1624 /* 1623 /*
1625 * Write back all requests on one page - we do this before reading it. 1624 * Write back all requests on one page - we do this before reading it.
1626 */ 1625 */
1627 int nfs_wb_page(struct inode *inode, struct page *page) 1626 int nfs_wb_page(struct inode *inode, struct page *page)
1628 { 1627 {
1629 loff_t range_start = page_offset(page); 1628 loff_t range_start = page_offset(page);
1630 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); 1629 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1631 struct writeback_control wbc = { 1630 struct writeback_control wbc = {
1632 .sync_mode = WB_SYNC_ALL, 1631 .sync_mode = WB_SYNC_ALL,
1633 .nr_to_write = 0, 1632 .nr_to_write = 0,
1634 .range_start = range_start, 1633 .range_start = range_start,
1635 .range_end = range_end, 1634 .range_end = range_end,
1636 }; 1635 };
1637 int ret; 1636 int ret;
1638 1637
1639 for (;;) { 1638 for (;;) {
1640 wait_on_page_writeback(page); 1639 wait_on_page_writeback(page);
1641 if (clear_page_dirty_for_io(page)) { 1640 if (clear_page_dirty_for_io(page)) {
1642 ret = nfs_writepage_locked(page, &wbc); 1641 ret = nfs_writepage_locked(page, &wbc);
1643 if (ret < 0) 1642 if (ret < 0)
1644 goto out_error; 1643 goto out_error;
1645 continue; 1644 continue;
1646 } 1645 }
1647 if (!PagePrivate(page)) 1646 if (!PagePrivate(page))
1648 break; 1647 break;
1649 ret = nfs_commit_inode(inode, FLUSH_SYNC); 1648 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1650 if (ret < 0) 1649 if (ret < 0)
1651 goto out_error; 1650 goto out_error;
1652 } 1651 }
1653 return 0; 1652 return 0;
1654 out_error: 1653 out_error:
1655 return ret; 1654 return ret;
1656 } 1655 }
1657 1656
1658 #ifdef CONFIG_MIGRATION 1657 #ifdef CONFIG_MIGRATION
1659 int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1658 int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1660 struct page *page) 1659 struct page *page)
1661 { 1660 {
1662 struct nfs_page *req; 1661 struct nfs_page *req;
1663 int ret; 1662 int ret;
1664 1663
1665 nfs_fscache_release_page(page, GFP_KERNEL); 1664 nfs_fscache_release_page(page, GFP_KERNEL);
1666 1665
1667 req = nfs_find_and_lock_request(page, false); 1666 req = nfs_find_and_lock_request(page, false);
1668 ret = PTR_ERR(req); 1667 ret = PTR_ERR(req);
1669 if (IS_ERR(req)) 1668 if (IS_ERR(req))
1670 goto out; 1669 goto out;
1671 1670
1672 ret = migrate_page(mapping, newpage, page); 1671 ret = migrate_page(mapping, newpage, page);
1673 if (!req) 1672 if (!req)
1674 goto out; 1673 goto out;
1675 if (ret) 1674 if (ret)
1676 goto out_unlock; 1675 goto out_unlock;
1677 page_cache_get(newpage); 1676 page_cache_get(newpage);
1678 spin_lock(&mapping->host->i_lock); 1677 spin_lock(&mapping->host->i_lock);
1679 req->wb_page = newpage; 1678 req->wb_page = newpage;
1680 SetPagePrivate(newpage); 1679 SetPagePrivate(newpage);
1681 set_page_private(newpage, (unsigned long)req); 1680 set_page_private(newpage, (unsigned long)req);
1682 ClearPagePrivate(page); 1681 ClearPagePrivate(page);
1683 set_page_private(page, 0); 1682 set_page_private(page, 0);
1684 spin_unlock(&mapping->host->i_lock); 1683 spin_unlock(&mapping->host->i_lock);
1685 page_cache_release(page); 1684 page_cache_release(page);
1686 out_unlock: 1685 out_unlock:
1687 nfs_clear_page_tag_locked(req); 1686 nfs_clear_page_tag_locked(req);
1688 out: 1687 out:
1689 return ret; 1688 return ret;
1690 } 1689 }
1691 #endif 1690 #endif
1692 1691
1693 int __init nfs_init_writepagecache(void) 1692 int __init nfs_init_writepagecache(void)
1694 { 1693 {
1695 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1694 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
1696 sizeof(struct nfs_write_data), 1695 sizeof(struct nfs_write_data),
1697 0, SLAB_HWCACHE_ALIGN, 1696 0, SLAB_HWCACHE_ALIGN,
1698 NULL); 1697 NULL);
1699 if (nfs_wdata_cachep == NULL) 1698 if (nfs_wdata_cachep == NULL)
1700 return -ENOMEM; 1699 return -ENOMEM;
1701 1700
1702 nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, 1701 nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
1703 nfs_wdata_cachep); 1702 nfs_wdata_cachep);
1704 if (nfs_wdata_mempool == NULL) 1703 if (nfs_wdata_mempool == NULL)
1705 return -ENOMEM; 1704 return -ENOMEM;
1706 1705
1707 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, 1706 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
1708 nfs_wdata_cachep); 1707 nfs_wdata_cachep);
1709 if (nfs_commit_mempool == NULL) 1708 if (nfs_commit_mempool == NULL)
1710 return -ENOMEM; 1709 return -ENOMEM;
1711 1710
1712 /* 1711 /*
1713 * NFS congestion size, scale with available memory. 1712 * NFS congestion size, scale with available memory.
1714 * 1713 *
1715 * 64MB: 8192k 1714 * 64MB: 8192k
1716 * 128MB: 11585k 1715 * 128MB: 11585k
1717 * 256MB: 16384k 1716 * 256MB: 16384k
1718 * 512MB: 23170k 1717 * 512MB: 23170k
1719 * 1GB: 32768k 1718 * 1GB: 32768k
1720 * 2GB: 46340k 1719 * 2GB: 46340k
1721 * 4GB: 65536k 1720 * 4GB: 65536k
1722 * 8GB: 92681k 1721 * 8GB: 92681k
1723 * 16GB: 131072k 1722 * 16GB: 131072k
1724 * 1723 *
1725 * This allows larger machines to have larger/more transfers. 1724 * This allows larger machines to have larger/more transfers.
1726 * Limit the default to 256M 1725 * Limit the default to 256M
1727 */ 1726 */
1728 nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); 1727 nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
1729 if (nfs_congestion_kb > 256*1024) 1728 if (nfs_congestion_kb > 256*1024)
1730 nfs_congestion_kb = 256*1024; 1729 nfs_congestion_kb = 256*1024;
1731 1730
1732 return 0; 1731 return 0;
1733 } 1732 }
1734 1733
1735 void nfs_destroy_writepagecache(void) 1734 void nfs_destroy_writepagecache(void)
1736 { 1735 {
1737 mempool_destroy(nfs_commit_mempool); 1736 mempool_destroy(nfs_commit_mempool);
1738 mempool_destroy(nfs_wdata_mempool); 1737 mempool_destroy(nfs_wdata_mempool);
1739 kmem_cache_destroy(nfs_wdata_cachep); 1738 kmem_cache_destroy(nfs_wdata_cachep);
1740 } 1739 }
1741 1740
1742 1741
include/linux/backing-dev.h
1 /* 1 /*
2 * include/linux/backing-dev.h 2 * include/linux/backing-dev.h
3 * 3 *
4 * low-level device information and state which is propagated up through 4 * low-level device information and state which is propagated up through
5 * to high-level code. 5 * to high-level code.
6 */ 6 */
7 7
8 #ifndef _LINUX_BACKING_DEV_H 8 #ifndef _LINUX_BACKING_DEV_H
9 #define _LINUX_BACKING_DEV_H 9 #define _LINUX_BACKING_DEV_H
10 10
11 #include <linux/percpu_counter.h> 11 #include <linux/percpu_counter.h>
12 #include <linux/log2.h> 12 #include <linux/log2.h>
13 #include <linux/proportions.h> 13 #include <linux/proportions.h>
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/fs.h> 15 #include <linux/fs.h>
16 #include <linux/sched.h> 16 #include <linux/sched.h>
17 #include <linux/timer.h> 17 #include <linux/timer.h>
18 #include <linux/writeback.h> 18 #include <linux/writeback.h>
19 #include <asm/atomic.h> 19 #include <asm/atomic.h>
20 20
21 struct page; 21 struct page;
22 struct device; 22 struct device;
23 struct dentry; 23 struct dentry;
24 24
25 /* 25 /*
26 * Bits in backing_dev_info.state 26 * Bits in backing_dev_info.state
27 */ 27 */
28 enum bdi_state { 28 enum bdi_state {
29 BDI_pending, /* On its way to being activated */ 29 BDI_pending, /* On its way to being activated */
30 BDI_wb_alloc, /* Default embedded wb allocated */ 30 BDI_wb_alloc, /* Default embedded wb allocated */
31 BDI_async_congested, /* The async (write) queue is getting full */ 31 BDI_async_congested, /* The async (write) queue is getting full */
32 BDI_sync_congested, /* The sync queue is getting full */ 32 BDI_sync_congested, /* The sync queue is getting full */
33 BDI_registered, /* bdi_register() was done */ 33 BDI_registered, /* bdi_register() was done */
34 BDI_writeback_running, /* Writeback is in progress */ 34 BDI_writeback_running, /* Writeback is in progress */
35 BDI_unused, /* Available bits start here */ 35 BDI_unused, /* Available bits start here */
36 }; 36 };
37 37
38 typedef int (congested_fn)(void *, int); 38 typedef int (congested_fn)(void *, int);
39 39
40 enum bdi_stat_item { 40 enum bdi_stat_item {
41 BDI_RECLAIMABLE, 41 BDI_RECLAIMABLE,
42 BDI_WRITEBACK, 42 BDI_WRITEBACK,
43 BDI_WRITTEN,
43 NR_BDI_STAT_ITEMS 44 NR_BDI_STAT_ITEMS
44 }; 45 };
45 46
46 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) 47 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
47 48
48 struct bdi_writeback { 49 struct bdi_writeback {
49 struct backing_dev_info *bdi; /* our parent bdi */ 50 struct backing_dev_info *bdi; /* our parent bdi */
50 unsigned int nr; 51 unsigned int nr;
51 52
52 unsigned long last_old_flush; /* last old data flush */ 53 unsigned long last_old_flush; /* last old data flush */
53 unsigned long last_active; /* last time bdi thread was active */ 54 unsigned long last_active; /* last time bdi thread was active */
54 55
55 struct task_struct *task; /* writeback thread */ 56 struct task_struct *task; /* writeback thread */
56 struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ 57 struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
57 struct list_head b_dirty; /* dirty inodes */ 58 struct list_head b_dirty; /* dirty inodes */
58 struct list_head b_io; /* parked for writeback */ 59 struct list_head b_io; /* parked for writeback */
59 struct list_head b_more_io; /* parked for more writeback */ 60 struct list_head b_more_io; /* parked for more writeback */
61 spinlock_t list_lock; /* protects the b_* lists */
60 }; 62 };
61 63
62 struct backing_dev_info { 64 struct backing_dev_info {
63 struct list_head bdi_list; 65 struct list_head bdi_list;
64 unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ 66 unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
65 unsigned long state; /* Always use atomic bitops on this */ 67 unsigned long state; /* Always use atomic bitops on this */
66 unsigned int capabilities; /* Device capabilities */ 68 unsigned int capabilities; /* Device capabilities */
67 congested_fn *congested_fn; /* Function pointer if device is md/dm */ 69 congested_fn *congested_fn; /* Function pointer if device is md/dm */
68 void *congested_data; /* Pointer to aux data for congested func */ 70 void *congested_data; /* Pointer to aux data for congested func */
69 71
70 char *name; 72 char *name;
71 73
72 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; 74 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
73 75
76 unsigned long bw_time_stamp; /* last time write bw is updated */
77 unsigned long written_stamp; /* pages written at bw_time_stamp */
78 unsigned long write_bandwidth; /* the estimated write bandwidth */
79 unsigned long avg_write_bandwidth; /* further smoothed write bw */
80
74 struct prop_local_percpu completions; 81 struct prop_local_percpu completions;
75 int dirty_exceeded; 82 int dirty_exceeded;
76 83
77 unsigned int min_ratio; 84 unsigned int min_ratio;
78 unsigned int max_ratio, max_prop_frac; 85 unsigned int max_ratio, max_prop_frac;
79 86
80 struct bdi_writeback wb; /* default writeback info for this bdi */ 87 struct bdi_writeback wb; /* default writeback info for this bdi */
81 spinlock_t wb_lock; /* protects work_list */ 88 spinlock_t wb_lock; /* protects work_list */
82 89
83 struct list_head work_list; 90 struct list_head work_list;
84 91
85 struct device *dev; 92 struct device *dev;
86 93
87 struct timer_list laptop_mode_wb_timer; 94 struct timer_list laptop_mode_wb_timer;
88 95
89 #ifdef CONFIG_DEBUG_FS 96 #ifdef CONFIG_DEBUG_FS
90 struct dentry *debug_dir; 97 struct dentry *debug_dir;
91 struct dentry *debug_stats; 98 struct dentry *debug_stats;
92 #endif 99 #endif
93 }; 100 };
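
The four bandwidth fields added above feed a periodically refreshed, smoothed estimate of how fast the device completes writeback. The following is only a rough userspace sketch of that idea (sample pages written per interval, then smooth it twice); the constants and the update rule are illustrative assumptions, not the kernel's actual __bdi_update_bandwidth() math.

/* Rough, hypothetical sketch of the bandwidth-estimation idea. */
#include <stdio.h>

#define HZ 100				/* assumed tick rate for the example */

struct bw_est {
	unsigned long written_stamp;	/* pages written at the last update */
	unsigned long time_stamp;	/* "jiffies" at the last update */
	unsigned long write_bandwidth;	/* smoothed pages per second */
	unsigned long avg_write_bandwidth; /* further smoothed copy */
};

static void bw_update(struct bw_est *b, unsigned long written, unsigned long now)
{
	unsigned long elapsed = now - b->time_stamp;
	unsigned long sample;

	if (elapsed < HZ / 10)		/* too soon: the sample would be noisy */
		return;

	sample = (written - b->written_stamp) * HZ / elapsed;
	b->write_bandwidth = (7 * b->write_bandwidth + sample) / 8;
	b->avg_write_bandwidth = (15 * b->avg_write_bandwidth + b->write_bandwidth) / 16;
	b->written_stamp = written;
	b->time_stamp = now;
}

int main(void)
{
	struct bw_est b = { .write_bandwidth = 25600, .avg_write_bandwidth = 25600 };
	unsigned long written = 0, now = 0;
	int i;

	for (i = 0; i < 5; i++) {
		now += HZ;		/* one second passes ... */
		written += 20000;	/* ... during which 20000 pages complete */
		bw_update(&b, written, now);
		printf("write_bw=%lu avg_write_bw=%lu\n",
		       b.write_bandwidth, b.avg_write_bandwidth);
	}
	return 0;
}
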
94 101
95 int bdi_init(struct backing_dev_info *bdi); 102 int bdi_init(struct backing_dev_info *bdi);
96 void bdi_destroy(struct backing_dev_info *bdi); 103 void bdi_destroy(struct backing_dev_info *bdi);
97 104
98 int bdi_register(struct backing_dev_info *bdi, struct device *parent, 105 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
99 const char *fmt, ...); 106 const char *fmt, ...);
100 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); 107 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
101 void bdi_unregister(struct backing_dev_info *bdi); 108 void bdi_unregister(struct backing_dev_info *bdi);
102 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); 109 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
103 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); 110 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
104 void bdi_start_background_writeback(struct backing_dev_info *bdi); 111 void bdi_start_background_writeback(struct backing_dev_info *bdi);
105 int bdi_writeback_thread(void *data); 112 int bdi_writeback_thread(void *data);
106 int bdi_has_dirty_io(struct backing_dev_info *bdi); 113 int bdi_has_dirty_io(struct backing_dev_info *bdi);
107 void bdi_arm_supers_timer(void); 114 void bdi_arm_supers_timer(void);
108 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); 115 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
116 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
109 117
110 extern spinlock_t bdi_lock; 118 extern spinlock_t bdi_lock;
111 extern struct list_head bdi_list; 119 extern struct list_head bdi_list;
112 extern struct list_head bdi_pending_list; 120 extern struct list_head bdi_pending_list;
113 121
114 static inline int wb_has_dirty_io(struct bdi_writeback *wb) 122 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
115 { 123 {
116 return !list_empty(&wb->b_dirty) || 124 return !list_empty(&wb->b_dirty) ||
117 !list_empty(&wb->b_io) || 125 !list_empty(&wb->b_io) ||
118 !list_empty(&wb->b_more_io); 126 !list_empty(&wb->b_more_io);
119 } 127 }
120 128
121 static inline void __add_bdi_stat(struct backing_dev_info *bdi, 129 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
122 enum bdi_stat_item item, s64 amount) 130 enum bdi_stat_item item, s64 amount)
123 { 131 {
124 __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); 132 __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
125 } 133 }
126 134
127 static inline void __inc_bdi_stat(struct backing_dev_info *bdi, 135 static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
128 enum bdi_stat_item item) 136 enum bdi_stat_item item)
129 { 137 {
130 __add_bdi_stat(bdi, item, 1); 138 __add_bdi_stat(bdi, item, 1);
131 } 139 }
132 140
133 static inline void inc_bdi_stat(struct backing_dev_info *bdi, 141 static inline void inc_bdi_stat(struct backing_dev_info *bdi,
134 enum bdi_stat_item item) 142 enum bdi_stat_item item)
135 { 143 {
136 unsigned long flags; 144 unsigned long flags;
137 145
138 local_irq_save(flags); 146 local_irq_save(flags);
139 __inc_bdi_stat(bdi, item); 147 __inc_bdi_stat(bdi, item);
140 local_irq_restore(flags); 148 local_irq_restore(flags);
141 } 149 }
142 150
143 static inline void __dec_bdi_stat(struct backing_dev_info *bdi, 151 static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
144 enum bdi_stat_item item) 152 enum bdi_stat_item item)
145 { 153 {
146 __add_bdi_stat(bdi, item, -1); 154 __add_bdi_stat(bdi, item, -1);
147 } 155 }
148 156
149 static inline void dec_bdi_stat(struct backing_dev_info *bdi, 157 static inline void dec_bdi_stat(struct backing_dev_info *bdi,
150 enum bdi_stat_item item) 158 enum bdi_stat_item item)
151 { 159 {
152 unsigned long flags; 160 unsigned long flags;
153 161
154 local_irq_save(flags); 162 local_irq_save(flags);
155 __dec_bdi_stat(bdi, item); 163 __dec_bdi_stat(bdi, item);
156 local_irq_restore(flags); 164 local_irq_restore(flags);
157 } 165 }
158 166
159 static inline s64 bdi_stat(struct backing_dev_info *bdi, 167 static inline s64 bdi_stat(struct backing_dev_info *bdi,
160 enum bdi_stat_item item) 168 enum bdi_stat_item item)
161 { 169 {
162 return percpu_counter_read_positive(&bdi->bdi_stat[item]); 170 return percpu_counter_read_positive(&bdi->bdi_stat[item]);
163 } 171 }
164 172
165 static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, 173 static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
166 enum bdi_stat_item item) 174 enum bdi_stat_item item)
167 { 175 {
168 return percpu_counter_sum_positive(&bdi->bdi_stat[item]); 176 return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
169 } 177 }
170 178
171 static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, 179 static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
172 enum bdi_stat_item item) 180 enum bdi_stat_item item)
173 { 181 {
174 s64 sum; 182 s64 sum;
175 unsigned long flags; 183 unsigned long flags;
176 184
177 local_irq_save(flags); 185 local_irq_save(flags);
178 sum = __bdi_stat_sum(bdi, item); 186 sum = __bdi_stat_sum(bdi, item);
179 local_irq_restore(flags); 187 local_irq_restore(flags);
180 188
181 return sum; 189 return sum;
182 } 190 }
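
A hypothetical use of these helpers, to show why the new BDI_WRITTEN counter exists: when a page leaves writeback, the in-flight count drops while the cumulative written count grows, and it is that cumulative counter the bandwidth estimation samples later. The function below is illustrative only, not the kernel's actual call site.

static inline void example_end_page_writeback(struct backing_dev_info *bdi)
{
	dec_bdi_stat(bdi, BDI_WRITEBACK);	/* one fewer page in flight */
	inc_bdi_stat(bdi, BDI_WRITTEN);		/* one more page completed */
}
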
183 191
184 extern void bdi_writeout_inc(struct backing_dev_info *bdi); 192 extern void bdi_writeout_inc(struct backing_dev_info *bdi);
185 193
186 /* 194 /*
187 * maximal error of a stat counter. 195 * maximal error of a stat counter.
188 */ 196 */
189 static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) 197 static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
190 { 198 {
191 #ifdef CONFIG_SMP 199 #ifdef CONFIG_SMP
192 return nr_cpu_ids * BDI_STAT_BATCH; 200 return nr_cpu_ids * BDI_STAT_BATCH;
193 #else 201 #else
194 return 1; 202 return 1;
195 #endif 203 #endif
196 } 204 }
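
Worked example of this bound: on a machine where nr_cpu_ids is 8, BDI_STAT_BATCH is 8 * (1 + ilog2(8)) = 32, so a cheap bdi_stat() read may lag the exact per-cpu sum by up to 8 * 32 = 256 pages; callers that need the exact value pay the extra cost of bdi_stat_sum() instead.
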
197 205
198 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); 206 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
199 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); 207 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
200 208
201 /* 209 /*
202 * Flags in backing_dev_info::capability 210 * Flags in backing_dev_info::capability
203 * 211 *
204 * The first three flags control whether dirty pages will contribute to the 212 * The first three flags control whether dirty pages will contribute to the
205 * VM's accounting and whether writepages() should be called for dirty pages 213 * VM's accounting and whether writepages() should be called for dirty pages
206 * (something that would not, for example, be appropriate for ramfs) 214 * (something that would not, for example, be appropriate for ramfs)
207 * 215 *
208 * WARNING: these flags are closely related and should not normally be 216 * WARNING: these flags are closely related and should not normally be
209 * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these 217 * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these
210 * three flags into a single convenience macro. 218 * three flags into a single convenience macro.
211 * 219 *
212 * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting 220 * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting
213 * BDI_CAP_NO_WRITEBACK: Don't write pages back 221 * BDI_CAP_NO_WRITEBACK: Don't write pages back
214 * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages 222 * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages
215 * 223 *
216 * These flags let !MMU mmap() govern direct device mapping vs immediate 224 * These flags let !MMU mmap() govern direct device mapping vs immediate
217 * copying more easily for MAP_PRIVATE, especially for ROM filesystems. 225 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
218 * 226 *
219 * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE) 227 * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE)
220 * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED) 228 * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED)
221 * BDI_CAP_READ_MAP: Can be mapped for reading 229 * BDI_CAP_READ_MAP: Can be mapped for reading
222 * BDI_CAP_WRITE_MAP: Can be mapped for writing 230 * BDI_CAP_WRITE_MAP: Can be mapped for writing
223 * BDI_CAP_EXEC_MAP: Can be mapped for execution 231 * BDI_CAP_EXEC_MAP: Can be mapped for execution
224 * 232 *
225 * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. 233 * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed.
226 */ 234 */
227 #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 235 #define BDI_CAP_NO_ACCT_DIRTY 0x00000001
228 #define BDI_CAP_NO_WRITEBACK 0x00000002 236 #define BDI_CAP_NO_WRITEBACK 0x00000002
229 #define BDI_CAP_MAP_COPY 0x00000004 237 #define BDI_CAP_MAP_COPY 0x00000004
230 #define BDI_CAP_MAP_DIRECT 0x00000008 238 #define BDI_CAP_MAP_DIRECT 0x00000008
231 #define BDI_CAP_READ_MAP 0x00000010 239 #define BDI_CAP_READ_MAP 0x00000010
232 #define BDI_CAP_WRITE_MAP 0x00000020 240 #define BDI_CAP_WRITE_MAP 0x00000020
233 #define BDI_CAP_EXEC_MAP 0x00000040 241 #define BDI_CAP_EXEC_MAP 0x00000040
234 #define BDI_CAP_NO_ACCT_WB 0x00000080 242 #define BDI_CAP_NO_ACCT_WB 0x00000080
235 #define BDI_CAP_SWAP_BACKED 0x00000100 243 #define BDI_CAP_SWAP_BACKED 0x00000100
236 244
237 #define BDI_CAP_VMFLAGS \ 245 #define BDI_CAP_VMFLAGS \
238 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) 246 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
239 247
240 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ 248 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
241 (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) 249 (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
242 250
243 #if defined(VM_MAYREAD) && \ 251 #if defined(VM_MAYREAD) && \
244 (BDI_CAP_READ_MAP != VM_MAYREAD || \ 252 (BDI_CAP_READ_MAP != VM_MAYREAD || \
245 BDI_CAP_WRITE_MAP != VM_MAYWRITE || \ 253 BDI_CAP_WRITE_MAP != VM_MAYWRITE || \
246 BDI_CAP_EXEC_MAP != VM_MAYEXEC) 254 BDI_CAP_EXEC_MAP != VM_MAYEXEC)
247 #error please change backing_dev_info::capabilities flags 255 #error please change backing_dev_info::capabilities flags
248 #endif 256 #endif
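
As a hypothetical illustration of how these bits combine (kernel context assumed, field names as in the struct above): a purely in-memory backing device would normally take the convenience macro rather than the individual flags.

static struct backing_dev_info example_ram_bdi = {
	.name		= "example_ram",
	.ra_pages	= 0,	/* readahead buys nothing for RAM */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

With such a value, bdi_cap_writeback_dirty() and bdi_cap_account_dirty() below both return false, which is what lets the dirty accounting and throttling paths skip devices of this kind.
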
249 257
250 extern struct backing_dev_info default_backing_dev_info; 258 extern struct backing_dev_info default_backing_dev_info;
251 extern struct backing_dev_info noop_backing_dev_info; 259 extern struct backing_dev_info noop_backing_dev_info;
252 260
253 int writeback_in_progress(struct backing_dev_info *bdi); 261 int writeback_in_progress(struct backing_dev_info *bdi);
254 262
255 static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) 263 static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
256 { 264 {
257 if (bdi->congested_fn) 265 if (bdi->congested_fn)
258 return bdi->congested_fn(bdi->congested_data, bdi_bits); 266 return bdi->congested_fn(bdi->congested_data, bdi_bits);
259 return (bdi->state & bdi_bits); 267 return (bdi->state & bdi_bits);
260 } 268 }
261 269
262 static inline int bdi_read_congested(struct backing_dev_info *bdi) 270 static inline int bdi_read_congested(struct backing_dev_info *bdi)
263 { 271 {
264 return bdi_congested(bdi, 1 << BDI_sync_congested); 272 return bdi_congested(bdi, 1 << BDI_sync_congested);
265 } 273 }
266 274
267 static inline int bdi_write_congested(struct backing_dev_info *bdi) 275 static inline int bdi_write_congested(struct backing_dev_info *bdi)
268 { 276 {
269 return bdi_congested(bdi, 1 << BDI_async_congested); 277 return bdi_congested(bdi, 1 << BDI_async_congested);
270 } 278 }
271 279
272 static inline int bdi_rw_congested(struct backing_dev_info *bdi) 280 static inline int bdi_rw_congested(struct backing_dev_info *bdi)
273 { 281 {
274 return bdi_congested(bdi, (1 << BDI_sync_congested) | 282 return bdi_congested(bdi, (1 << BDI_sync_congested) |
275 (1 << BDI_async_congested)); 283 (1 << BDI_async_congested));
276 } 284 }
277 285
278 enum { 286 enum {
279 BLK_RW_ASYNC = 0, 287 BLK_RW_ASYNC = 0,
280 BLK_RW_SYNC = 1, 288 BLK_RW_SYNC = 1,
281 }; 289 };
282 290
283 void clear_bdi_congested(struct backing_dev_info *bdi, int sync); 291 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
284 void set_bdi_congested(struct backing_dev_info *bdi, int sync); 292 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
285 long congestion_wait(int sync, long timeout); 293 long congestion_wait(int sync, long timeout);
286 long wait_iff_congested(struct zone *zone, int sync, long timeout); 294 long wait_iff_congested(struct zone *zone, int sync, long timeout);
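
A hypothetical example of how a caller might use the congestion helpers declared above to back off while a device's async (write) queue is full; the nap length is an illustrative choice.

static void example_throttle(struct backing_dev_info *bdi)
{
	while (bdi_write_congested(bdi))
		congestion_wait(BLK_RW_ASYNC, HZ / 10);	/* up to ~100ms per nap */
}
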
287 295
288 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) 296 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
289 { 297 {
290 return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK); 298 return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK);
291 } 299 }
292 300
293 static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi) 301 static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi)
294 { 302 {
295 return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY); 303 return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY);
296 } 304 }
297 305
298 static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) 306 static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
299 { 307 {
300 /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */ 308 /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */
301 return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB | 309 return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB |
302 BDI_CAP_NO_WRITEBACK)); 310 BDI_CAP_NO_WRITEBACK));
303 } 311 }
304 312
305 static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) 313 static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
306 { 314 {
307 return bdi->capabilities & BDI_CAP_SWAP_BACKED; 315 return bdi->capabilities & BDI_CAP_SWAP_BACKED;
308 } 316 }
309 317
310 static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) 318 static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
311 { 319 {
312 return bdi == &default_backing_dev_info; 320 return bdi == &default_backing_dev_info;
313 } 321 }
314 322
315 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) 323 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
316 { 324 {
317 return bdi_cap_writeback_dirty(mapping->backing_dev_info); 325 return bdi_cap_writeback_dirty(mapping->backing_dev_info);
318 } 326 }
319 327
320 static inline bool mapping_cap_account_dirty(struct address_space *mapping) 328 static inline bool mapping_cap_account_dirty(struct address_space *mapping)
321 { 329 {
322 return bdi_cap_account_dirty(mapping->backing_dev_info); 330 return bdi_cap_account_dirty(mapping->backing_dev_info);
323 } 331 }
324 332
325 static inline bool mapping_cap_swap_backed(struct address_space *mapping) 333 static inline bool mapping_cap_swap_backed(struct address_space *mapping)
326 { 334 {
327 return bdi_cap_swap_backed(mapping->backing_dev_info); 335 return bdi_cap_swap_backed(mapping->backing_dev_info);
328 } 336 }
329 337
330 static inline int bdi_sched_wait(void *word) 338 static inline int bdi_sched_wait(void *word)
331 { 339 {
332 schedule(); 340 schedule();
333 return 0; 341 return 0;
334 } 342 }
335 343
336 #endif /* _LINUX_BACKING_DEV_H */ 344 #endif /* _LINUX_BACKING_DEV_H */
337 345
include/linux/writeback.h
1 /* 1 /*
2 * include/linux/writeback.h 2 * include/linux/writeback.h
3 */ 3 */
4 #ifndef WRITEBACK_H 4 #ifndef WRITEBACK_H
5 #define WRITEBACK_H 5 #define WRITEBACK_H
6 6
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 #include <linux/fs.h> 8 #include <linux/fs.h>
9 9
10 struct backing_dev_info; 10 /*
11 * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
12 *
13 * (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
14 *
15 * The 1/16 region above the global dirty limit will be put to maximum pauses:
16 *
17 * (limit, limit + limit/DIRTY_MAXPAUSE_AREA)
18 *
19 * In the 1/16 region above the max-pause region, bdi's that have exceeded their
20 * dirty limits will be put to loops:
21 *
22 * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA)
23 *
24 * Further beyond, all dirtier tasks will enter a loop, waiting (possibly for a
25 * long time) for the dirty pages to drop, unless they have written enough pages.
26 *
27 * The global dirty threshold is normally equal to the global dirty limit,
28 * except when the system suddenly allocates a lot of anonymous memory and
29 * knocks down the global dirty threshold quickly, in which case the global
30 * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
31 */
32 #define DIRTY_SCOPE 8
33 #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
34 #define DIRTY_MAXPAUSE_AREA 16
35 #define DIRTY_PASSGOOD_AREA 8
11 36
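
Hypothetical numbers to make the regions above concrete, taking thresh == limit == 1600 (in whatever page units the caller uses): smooth throttling covers (1200, 1600), the max-pause area (1600, 1700), and the pass-good area (1700, 1800). The sketch below simply evaluates the same expressions.

#include <stdio.h>

#define DIRTY_SCOPE		8
#define DIRTY_FULL_SCOPE	(DIRTY_SCOPE / 2)
#define DIRTY_MAXPAUSE_AREA	16
#define DIRTY_PASSGOOD_AREA	8

int main(void)
{
	unsigned long thresh = 1600, limit = 1600;	/* illustrative values */

	printf("smooth throttling: (%lu, %lu)\n",
	       thresh - thresh / DIRTY_FULL_SCOPE, thresh);	/* (1200, 1600) */
	printf("max pause:         (%lu, %lu)\n",
	       limit, limit + limit / DIRTY_MAXPAUSE_AREA);	/* (1600, 1700) */
	printf("pass good:         (%lu, %lu)\n",
	       limit + limit / DIRTY_MAXPAUSE_AREA,
	       limit + limit / DIRTY_PASSGOOD_AREA);		/* (1700, 1800) */
	return 0;
}
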
12 extern spinlock_t inode_wb_list_lock; 37 /*
38 * 4MB minimal write chunk size
39 */
40 #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
13 41
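
Worked out: the 4096 here is kilobytes, and the shift converts kilobytes to pages. With 4 KiB pages PAGE_CACHE_SHIFT is 12, so MIN_WRITEBACK_PAGES is 4096 >> 2 = 1024 pages, which is the 4 MB minimum chunk the comment promises.
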
42 struct backing_dev_info;
43
14 /* 44 /*
15 * fs/fs-writeback.c 45 * fs/fs-writeback.c
16 */ 46 */
17 enum writeback_sync_modes { 47 enum writeback_sync_modes {
18 WB_SYNC_NONE, /* Don't wait on anything */ 48 WB_SYNC_NONE, /* Don't wait on anything */
19 WB_SYNC_ALL, /* Wait on every mapping */ 49 WB_SYNC_ALL, /* Wait on every mapping */
20 }; 50 };
21 51
22 /* 52 /*
23 * A control structure which tells the writeback code what to do. These are 53 * A control structure which tells the writeback code what to do. These are
24 * always on the stack, and hence need no locking. They are always initialised 54 * always on the stack, and hence need no locking. They are always initialised
25 * in a manner such that unspecified fields are set to zero. 55 * in a manner such that unspecified fields are set to zero.
26 */ 56 */
27 struct writeback_control { 57 struct writeback_control {
28 enum writeback_sync_modes sync_mode; 58 enum writeback_sync_modes sync_mode;
29 unsigned long *older_than_this; /* If !NULL, only write back inodes
30 older than this */
31 unsigned long wb_start; /* Time writeback_inodes_wb was
32 called. This is needed to avoid
33 extra jobs and livelock */
34 long nr_to_write; /* Write this many pages, and decrement 59 long nr_to_write; /* Write this many pages, and decrement
35 this for each page written */ 60 this for each page written */
36 long pages_skipped; /* Pages which were not written */ 61 long pages_skipped; /* Pages which were not written */
37 62
38 /* 63 /*
39 * For a_ops->writepages(): is start or end are non-zero then this is 64 * For a_ops->writepages(): is start or end are non-zero then this is
40 * a hint that the filesystem need only write out the pages inside that 65 * a hint that the filesystem need only write out the pages inside that
41 * byterange. The byte at `end' is included in the writeout request. 66 * byterange. The byte at `end' is included in the writeout request.
42 */ 67 */
43 loff_t range_start; 68 loff_t range_start;
44 loff_t range_end; 69 loff_t range_end;
45 70
46 unsigned nonblocking:1; /* Don't get stuck on request queues */
47 unsigned encountered_congestion:1; /* An output: a queue is full */
48 unsigned for_kupdate:1; /* A kupdate writeback */ 71 unsigned for_kupdate:1; /* A kupdate writeback */
49 unsigned for_background:1; /* A background writeback */ 72 unsigned for_background:1; /* A background writeback */
73 unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
50 unsigned for_reclaim:1; /* Invoked from the page allocator */ 74 unsigned for_reclaim:1; /* Invoked from the page allocator */
51 unsigned range_cyclic:1; /* range_start is cyclic */ 75 unsigned range_cyclic:1; /* range_start is cyclic */
52 unsigned more_io:1; /* more io to be dispatched */
53 }; 76 };
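
A hypothetical caller-side sketch (kernel context, this header included) of how such a control structure is typically filled in on the stack for a tagged, non-waiting writeout of a whole mapping; the page budget is an arbitrary illustrative value.

static void example_writeout(struct address_space *mapping)
{
	struct writeback_control wbc = {
		.sync_mode	   = WB_SYNC_NONE,	/* don't wait on each page */
		.nr_to_write	   = 1024,		/* illustrative budget */
		.range_start	   = 0,
		.range_end	   = LLONG_MAX,		/* whole file */
		.tagged_writepages = 1,			/* tag first, then write: avoids livelock */
	};

	do_writepages(mapping, &wbc);
}

do_writepages() is the entry point declared further down in this header.
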
54 77
55 /* 78 /*
56 * fs/fs-writeback.c 79 * fs/fs-writeback.c
57 */ 80 */
58 struct bdi_writeback; 81 struct bdi_writeback;
59 int inode_wait(void *); 82 int inode_wait(void *);
60 void writeback_inodes_sb(struct super_block *); 83 void writeback_inodes_sb(struct super_block *);
61 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); 84 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
62 int writeback_inodes_sb_if_idle(struct super_block *); 85 int writeback_inodes_sb_if_idle(struct super_block *);
63 int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); 86 int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
64 void sync_inodes_sb(struct super_block *); 87 void sync_inodes_sb(struct super_block *);
65 void writeback_inodes_wb(struct bdi_writeback *wb, 88 long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
66 struct writeback_control *wbc);
67 long wb_do_writeback(struct bdi_writeback *wb, int force_wait); 89 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
68 void wakeup_flusher_threads(long nr_pages); 90 void wakeup_flusher_threads(long nr_pages);
69 91
70 /* writeback.h requires fs.h; it, too, is not included from here. */ 92 /* writeback.h requires fs.h; it, too, is not included from here. */
71 static inline void wait_on_inode(struct inode *inode) 93 static inline void wait_on_inode(struct inode *inode)
72 { 94 {
73 might_sleep(); 95 might_sleep();
74 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); 96 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE);
75 } 97 }
76 static inline void inode_sync_wait(struct inode *inode) 98 static inline void inode_sync_wait(struct inode *inode)
77 { 99 {
78 might_sleep(); 100 might_sleep();
79 wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, 101 wait_on_bit(&inode->i_state, __I_SYNC, inode_wait,
80 TASK_UNINTERRUPTIBLE); 102 TASK_UNINTERRUPTIBLE);
81 } 103 }
82 104
83 105
84 /* 106 /*
85 * mm/page-writeback.c 107 * mm/page-writeback.c
86 */ 108 */
87 #ifdef CONFIG_BLOCK 109 #ifdef CONFIG_BLOCK
88 void laptop_io_completion(struct backing_dev_info *info); 110 void laptop_io_completion(struct backing_dev_info *info);
89 void laptop_sync_completion(void); 111 void laptop_sync_completion(void);
90 void laptop_mode_sync(struct work_struct *work); 112 void laptop_mode_sync(struct work_struct *work);
91 void laptop_mode_timer_fn(unsigned long data); 113 void laptop_mode_timer_fn(unsigned long data);
92 #else 114 #else
93 static inline void laptop_sync_completion(void) { } 115 static inline void laptop_sync_completion(void) { }
94 #endif 116 #endif
95 void throttle_vm_writeout(gfp_t gfp_mask); 117 void throttle_vm_writeout(gfp_t gfp_mask);
96 118
119 extern unsigned long global_dirty_limit;
120
97 /* These are exported to sysctl. */ 121 /* These are exported to sysctl. */
98 extern int dirty_background_ratio; 122 extern int dirty_background_ratio;
99 extern unsigned long dirty_background_bytes; 123 extern unsigned long dirty_background_bytes;
100 extern int vm_dirty_ratio; 124 extern int vm_dirty_ratio;
101 extern unsigned long vm_dirty_bytes; 125 extern unsigned long vm_dirty_bytes;
102 extern unsigned int dirty_writeback_interval; 126 extern unsigned int dirty_writeback_interval;
103 extern unsigned int dirty_expire_interval; 127 extern unsigned int dirty_expire_interval;
104 extern int vm_highmem_is_dirtyable; 128 extern int vm_highmem_is_dirtyable;
105 extern int block_dump; 129 extern int block_dump;
106 extern int laptop_mode; 130 extern int laptop_mode;
107 131
108 extern unsigned long determine_dirtyable_memory(void); 132 extern unsigned long determine_dirtyable_memory(void);
109 133
110 extern int dirty_background_ratio_handler(struct ctl_table *table, int write, 134 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
111 void __user *buffer, size_t *lenp, 135 void __user *buffer, size_t *lenp,
112 loff_t *ppos); 136 loff_t *ppos);
113 extern int dirty_background_bytes_handler(struct ctl_table *table, int write, 137 extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
114 void __user *buffer, size_t *lenp, 138 void __user *buffer, size_t *lenp,
115 loff_t *ppos); 139 loff_t *ppos);
116 extern int dirty_ratio_handler(struct ctl_table *table, int write, 140 extern int dirty_ratio_handler(struct ctl_table *table, int write,
117 void __user *buffer, size_t *lenp, 141 void __user *buffer, size_t *lenp,
118 loff_t *ppos); 142 loff_t *ppos);
119 extern int dirty_bytes_handler(struct ctl_table *table, int write, 143 extern int dirty_bytes_handler(struct ctl_table *table, int write,
120 void __user *buffer, size_t *lenp, 144 void __user *buffer, size_t *lenp,
121 loff_t *ppos); 145 loff_t *ppos);
122 146
123 struct ctl_table; 147 struct ctl_table;
124 int dirty_writeback_centisecs_handler(struct ctl_table *, int, 148 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
125 void __user *, size_t *, loff_t *); 149 void __user *, size_t *, loff_t *);
126 150
127 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); 151 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
128 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, 152 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
129 unsigned long dirty); 153 unsigned long dirty);
154
155 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
156 unsigned long thresh,
157 unsigned long dirty,
158 unsigned long bdi_thresh,
159 unsigned long bdi_dirty,
160 unsigned long start_time);
130 161
131 void page_writeback_init(void); 162 void page_writeback_init(void);
132 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 163 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
133 unsigned long nr_pages_dirtied); 164 unsigned long nr_pages_dirtied);
134 165
135 static inline void 166 static inline void
136 balance_dirty_pages_ratelimited(struct address_space *mapping) 167 balance_dirty_pages_ratelimited(struct address_space *mapping)
137 { 168 {
138 balance_dirty_pages_ratelimited_nr(mapping, 1); 169 balance_dirty_pages_ratelimited_nr(mapping, 1);
139 } 170 }
140 171
141 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, 172 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
142 void *data); 173 void *data);
143 174
144 int generic_writepages(struct address_space *mapping, 175 int generic_writepages(struct address_space *mapping,
145 struct writeback_control *wbc); 176 struct writeback_control *wbc);
146 void tag_pages_for_writeback(struct address_space *mapping, 177 void tag_pages_for_writeback(struct address_space *mapping,
147 pgoff_t start, pgoff_t end); 178 pgoff_t start, pgoff_t end);
148 int write_cache_pages(struct address_space *mapping, 179 int write_cache_pages(struct address_space *mapping,
149 struct writeback_control *wbc, writepage_t writepage, 180 struct writeback_control *wbc, writepage_t writepage,
150 void *data); 181 void *data);
151 int do_writepages(struct address_space *mapping, struct writeback_control *wbc); 182 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
152 void set_page_dirty_balance(struct page *page, int page_mkwrite); 183 void set_page_dirty_balance(struct page *page, int page_mkwrite);
153 void writeback_set_ratelimit(void); 184 void writeback_set_ratelimit(void);
154 void tag_pages_for_writeback(struct address_space *mapping, 185 void tag_pages_for_writeback(struct address_space *mapping,
include/trace/events/btrfs.h
1 #undef TRACE_SYSTEM 1 #undef TRACE_SYSTEM
2 #define TRACE_SYSTEM btrfs 2 #define TRACE_SYSTEM btrfs
3 3
4 #if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ) 4 #if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ)
5 #define _TRACE_BTRFS_H 5 #define _TRACE_BTRFS_H
6 6
7 #include <linux/writeback.h> 7 #include <linux/writeback.h>
8 #include <linux/tracepoint.h> 8 #include <linux/tracepoint.h>
9 9
10 struct btrfs_root; 10 struct btrfs_root;
11 struct btrfs_fs_info; 11 struct btrfs_fs_info;
12 struct btrfs_inode; 12 struct btrfs_inode;
13 struct extent_map; 13 struct extent_map;
14 struct btrfs_ordered_extent; 14 struct btrfs_ordered_extent;
15 struct btrfs_delayed_ref_node; 15 struct btrfs_delayed_ref_node;
16 struct btrfs_delayed_tree_ref; 16 struct btrfs_delayed_tree_ref;
17 struct btrfs_delayed_data_ref; 17 struct btrfs_delayed_data_ref;
18 struct btrfs_delayed_ref_head; 18 struct btrfs_delayed_ref_head;
19 struct map_lookup; 19 struct map_lookup;
20 struct extent_buffer; 20 struct extent_buffer;
21 21
22 #define show_ref_type(type) \ 22 #define show_ref_type(type) \
23 __print_symbolic(type, \ 23 __print_symbolic(type, \
24 { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ 24 { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \
25 { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \ 25 { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \
26 { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \ 26 { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \
27 { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \ 27 { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \
28 { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" }) 28 { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" })
29 29
30 #define __show_root_type(obj) \ 30 #define __show_root_type(obj) \
31 __print_symbolic_u64(obj, \ 31 __print_symbolic_u64(obj, \
32 { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, \ 32 { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, \
33 { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, \ 33 { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, \
34 { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, \ 34 { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, \
35 { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, \ 35 { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, \
36 { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \ 36 { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \
37 { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \ 37 { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \
38 { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \ 38 { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \
39 { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \ 39 { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \
40 { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \ 40 { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \
41 { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) 41 { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })
42 42
43 #define show_root_type(obj) \ 43 #define show_root_type(obj) \
44 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ 44 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \
45 (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" 45 (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-"
46 46
47 TRACE_EVENT(btrfs_transaction_commit, 47 TRACE_EVENT(btrfs_transaction_commit,
48 48
49 TP_PROTO(struct btrfs_root *root), 49 TP_PROTO(struct btrfs_root *root),
50 50
51 TP_ARGS(root), 51 TP_ARGS(root),
52 52
53 TP_STRUCT__entry( 53 TP_STRUCT__entry(
54 __field( u64, generation ) 54 __field( u64, generation )
55 __field( u64, root_objectid ) 55 __field( u64, root_objectid )
56 ), 56 ),
57 57
58 TP_fast_assign( 58 TP_fast_assign(
59 __entry->generation = root->fs_info->generation; 59 __entry->generation = root->fs_info->generation;
60 __entry->root_objectid = root->root_key.objectid; 60 __entry->root_objectid = root->root_key.objectid;
61 ), 61 ),
62 62
63 TP_printk("root = %llu(%s), gen = %llu", 63 TP_printk("root = %llu(%s), gen = %llu",
64 show_root_type(__entry->root_objectid), 64 show_root_type(__entry->root_objectid),
65 (unsigned long long)__entry->generation) 65 (unsigned long long)__entry->generation)
66 ); 66 );
67 67
68 DECLARE_EVENT_CLASS(btrfs__inode, 68 DECLARE_EVENT_CLASS(btrfs__inode,
69 69
70 TP_PROTO(struct inode *inode), 70 TP_PROTO(struct inode *inode),
71 71
72 TP_ARGS(inode), 72 TP_ARGS(inode),
73 73
74 TP_STRUCT__entry( 74 TP_STRUCT__entry(
75 __field( ino_t, ino ) 75 __field( ino_t, ino )
76 __field( blkcnt_t, blocks ) 76 __field( blkcnt_t, blocks )
77 __field( u64, disk_i_size ) 77 __field( u64, disk_i_size )
78 __field( u64, generation ) 78 __field( u64, generation )
79 __field( u64, last_trans ) 79 __field( u64, last_trans )
80 __field( u64, logged_trans ) 80 __field( u64, logged_trans )
81 __field( u64, root_objectid ) 81 __field( u64, root_objectid )
82 ), 82 ),
83 83
84 TP_fast_assign( 84 TP_fast_assign(
85 __entry->ino = inode->i_ino; 85 __entry->ino = inode->i_ino;
86 __entry->blocks = inode->i_blocks; 86 __entry->blocks = inode->i_blocks;
87 __entry->disk_i_size = BTRFS_I(inode)->disk_i_size; 87 __entry->disk_i_size = BTRFS_I(inode)->disk_i_size;
88 __entry->generation = BTRFS_I(inode)->generation; 88 __entry->generation = BTRFS_I(inode)->generation;
89 __entry->last_trans = BTRFS_I(inode)->last_trans; 89 __entry->last_trans = BTRFS_I(inode)->last_trans;
90 __entry->logged_trans = BTRFS_I(inode)->logged_trans; 90 __entry->logged_trans = BTRFS_I(inode)->logged_trans;
91 __entry->root_objectid = 91 __entry->root_objectid =
92 BTRFS_I(inode)->root->root_key.objectid; 92 BTRFS_I(inode)->root->root_key.objectid;
93 ), 93 ),
94 94
95 TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, " 95 TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, "
96 "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu", 96 "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu",
97 show_root_type(__entry->root_objectid), 97 show_root_type(__entry->root_objectid),
98 (unsigned long long)__entry->generation, 98 (unsigned long long)__entry->generation,
99 (unsigned long)__entry->ino, 99 (unsigned long)__entry->ino,
100 (unsigned long long)__entry->blocks, 100 (unsigned long long)__entry->blocks,
101 (unsigned long long)__entry->disk_i_size, 101 (unsigned long long)__entry->disk_i_size,
102 (unsigned long long)__entry->last_trans, 102 (unsigned long long)__entry->last_trans,
103 (unsigned long long)__entry->logged_trans) 103 (unsigned long long)__entry->logged_trans)
104 ); 104 );
105 105
106 DEFINE_EVENT(btrfs__inode, btrfs_inode_new, 106 DEFINE_EVENT(btrfs__inode, btrfs_inode_new,
107 107
108 TP_PROTO(struct inode *inode), 108 TP_PROTO(struct inode *inode),
109 109
110 TP_ARGS(inode) 110 TP_ARGS(inode)
111 ); 111 );
112 112
113 DEFINE_EVENT(btrfs__inode, btrfs_inode_request, 113 DEFINE_EVENT(btrfs__inode, btrfs_inode_request,
114 114
115 TP_PROTO(struct inode *inode), 115 TP_PROTO(struct inode *inode),
116 116
117 TP_ARGS(inode) 117 TP_ARGS(inode)
118 ); 118 );
119 119
120 DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, 120 DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
121 121
122 TP_PROTO(struct inode *inode), 122 TP_PROTO(struct inode *inode),
123 123
124 TP_ARGS(inode) 124 TP_ARGS(inode)
125 ); 125 );
126 126
127 #define __show_map_type(type) \ 127 #define __show_map_type(type) \
128 __print_symbolic_u64(type, \ 128 __print_symbolic_u64(type, \
129 { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ 129 { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \
130 { EXTENT_MAP_HOLE, "HOLE" }, \ 130 { EXTENT_MAP_HOLE, "HOLE" }, \
131 { EXTENT_MAP_INLINE, "INLINE" }, \ 131 { EXTENT_MAP_INLINE, "INLINE" }, \
132 { EXTENT_MAP_DELALLOC, "DELALLOC" }) 132 { EXTENT_MAP_DELALLOC, "DELALLOC" })
133 133
134 #define show_map_type(type) \ 134 #define show_map_type(type) \
135 type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) 135 type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type)
136 136
137 #define show_map_flags(flag) \ 137 #define show_map_flags(flag) \
138 __print_flags(flag, "|", \ 138 __print_flags(flag, "|", \
139 { EXTENT_FLAG_PINNED, "PINNED" }, \ 139 { EXTENT_FLAG_PINNED, "PINNED" }, \
140 { EXTENT_FLAG_COMPRESSED, "COMPRESSED" }, \ 140 { EXTENT_FLAG_COMPRESSED, "COMPRESSED" }, \
141 { EXTENT_FLAG_VACANCY, "VACANCY" }, \ 141 { EXTENT_FLAG_VACANCY, "VACANCY" }, \
142 { EXTENT_FLAG_PREALLOC, "PREALLOC" }) 142 { EXTENT_FLAG_PREALLOC, "PREALLOC" })
143 143
144 TRACE_EVENT(btrfs_get_extent, 144 TRACE_EVENT(btrfs_get_extent,
145 145
146 TP_PROTO(struct btrfs_root *root, struct extent_map *map), 146 TP_PROTO(struct btrfs_root *root, struct extent_map *map),
147 147
148 TP_ARGS(root, map), 148 TP_ARGS(root, map),
149 149
150 TP_STRUCT__entry( 150 TP_STRUCT__entry(
151 __field( u64, root_objectid ) 151 __field( u64, root_objectid )
152 __field( u64, start ) 152 __field( u64, start )
153 __field( u64, len ) 153 __field( u64, len )
154 __field( u64, orig_start ) 154 __field( u64, orig_start )
155 __field( u64, block_start ) 155 __field( u64, block_start )
156 __field( u64, block_len ) 156 __field( u64, block_len )
157 __field( unsigned long, flags ) 157 __field( unsigned long, flags )
158 __field( int, refs ) 158 __field( int, refs )
159 __field( unsigned int, compress_type ) 159 __field( unsigned int, compress_type )
160 ), 160 ),
161 161
162 TP_fast_assign( 162 TP_fast_assign(
163 __entry->root_objectid = root->root_key.objectid; 163 __entry->root_objectid = root->root_key.objectid;
164 __entry->start = map->start; 164 __entry->start = map->start;
165 __entry->len = map->len; 165 __entry->len = map->len;
166 __entry->orig_start = map->orig_start; 166 __entry->orig_start = map->orig_start;
167 __entry->block_start = map->block_start; 167 __entry->block_start = map->block_start;
168 __entry->block_len = map->block_len; 168 __entry->block_len = map->block_len;
169 __entry->flags = map->flags; 169 __entry->flags = map->flags;
170 __entry->refs = atomic_read(&map->refs); 170 __entry->refs = atomic_read(&map->refs);
171 __entry->compress_type = map->compress_type; 171 __entry->compress_type = map->compress_type;
172 ), 172 ),
173 173
174 TP_printk("root = %llu(%s), start = %llu, len = %llu, " 174 TP_printk("root = %llu(%s), start = %llu, len = %llu, "
175 "orig_start = %llu, block_start = %llu(%s), " 175 "orig_start = %llu, block_start = %llu(%s), "
176 "block_len = %llu, flags = %s, refs = %u, " 176 "block_len = %llu, flags = %s, refs = %u, "
177 "compress_type = %u", 177 "compress_type = %u",
178 show_root_type(__entry->root_objectid), 178 show_root_type(__entry->root_objectid),
179 (unsigned long long)__entry->start, 179 (unsigned long long)__entry->start,
180 (unsigned long long)__entry->len, 180 (unsigned long long)__entry->len,
181 (unsigned long long)__entry->orig_start, 181 (unsigned long long)__entry->orig_start,
182 show_map_type(__entry->block_start), 182 show_map_type(__entry->block_start),
183 (unsigned long long)__entry->block_len, 183 (unsigned long long)__entry->block_len,
184 show_map_flags(__entry->flags), 184 show_map_flags(__entry->flags),
185 __entry->refs, __entry->compress_type) 185 __entry->refs, __entry->compress_type)
186 ); 186 );
187 187
188 #define show_ordered_flags(flags) \ 188 #define show_ordered_flags(flags) \
189 __print_symbolic(flags, \ 189 __print_symbolic(flags, \
190 { BTRFS_ORDERED_IO_DONE, "IO_DONE" }, \ 190 { BTRFS_ORDERED_IO_DONE, "IO_DONE" }, \
191 { BTRFS_ORDERED_COMPLETE, "COMPLETE" }, \ 191 { BTRFS_ORDERED_COMPLETE, "COMPLETE" }, \
192 { BTRFS_ORDERED_NOCOW, "NOCOW" }, \ 192 { BTRFS_ORDERED_NOCOW, "NOCOW" }, \
193 { BTRFS_ORDERED_COMPRESSED, "COMPRESSED" }, \ 193 { BTRFS_ORDERED_COMPRESSED, "COMPRESSED" }, \
194 { BTRFS_ORDERED_PREALLOC, "PREALLOC" }, \ 194 { BTRFS_ORDERED_PREALLOC, "PREALLOC" }, \
195 { BTRFS_ORDERED_DIRECT, "DIRECT" }) 195 { BTRFS_ORDERED_DIRECT, "DIRECT" })
196 196
197 DECLARE_EVENT_CLASS(btrfs__ordered_extent, 197 DECLARE_EVENT_CLASS(btrfs__ordered_extent,
198 198
199 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 199 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
200 200
201 TP_ARGS(inode, ordered), 201 TP_ARGS(inode, ordered),
202 202
203 TP_STRUCT__entry( 203 TP_STRUCT__entry(
204 __field( ino_t, ino ) 204 __field( ino_t, ino )
205 __field( u64, file_offset ) 205 __field( u64, file_offset )
206 __field( u64, start ) 206 __field( u64, start )
207 __field( u64, len ) 207 __field( u64, len )
208 __field( u64, disk_len ) 208 __field( u64, disk_len )
209 __field( u64, bytes_left ) 209 __field( u64, bytes_left )
210 __field( unsigned long, flags ) 210 __field( unsigned long, flags )
211 __field( int, compress_type ) 211 __field( int, compress_type )
212 __field( int, refs ) 212 __field( int, refs )
213 __field( u64, root_objectid ) 213 __field( u64, root_objectid )
214 ), 214 ),
215 215
216 TP_fast_assign( 216 TP_fast_assign(
217 __entry->ino = inode->i_ino; 217 __entry->ino = inode->i_ino;
218 __entry->file_offset = ordered->file_offset; 218 __entry->file_offset = ordered->file_offset;
219 __entry->start = ordered->start; 219 __entry->start = ordered->start;
220 __entry->len = ordered->len; 220 __entry->len = ordered->len;
221 __entry->disk_len = ordered->disk_len; 221 __entry->disk_len = ordered->disk_len;
222 __entry->bytes_left = ordered->bytes_left; 222 __entry->bytes_left = ordered->bytes_left;
223 __entry->flags = ordered->flags; 223 __entry->flags = ordered->flags;
224 __entry->compress_type = ordered->compress_type; 224 __entry->compress_type = ordered->compress_type;
225 __entry->refs = atomic_read(&ordered->refs); 225 __entry->refs = atomic_read(&ordered->refs);
226 __entry->root_objectid = 226 __entry->root_objectid =
227 BTRFS_I(inode)->root->root_key.objectid; 227 BTRFS_I(inode)->root->root_key.objectid;
228 ), 228 ),
229 229
230 TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, " 230 TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, "
231 "start = %llu, len = %llu, disk_len = %llu, " 231 "start = %llu, len = %llu, disk_len = %llu, "
232 "bytes_left = %llu, flags = %s, compress_type = %d, " 232 "bytes_left = %llu, flags = %s, compress_type = %d, "
233 "refs = %d", 233 "refs = %d",
234 show_root_type(__entry->root_objectid), 234 show_root_type(__entry->root_objectid),
235 (unsigned long long)__entry->ino, 235 (unsigned long long)__entry->ino,
236 (unsigned long long)__entry->file_offset, 236 (unsigned long long)__entry->file_offset,
237 (unsigned long long)__entry->start, 237 (unsigned long long)__entry->start,
238 (unsigned long long)__entry->len, 238 (unsigned long long)__entry->len,
239 (unsigned long long)__entry->disk_len, 239 (unsigned long long)__entry->disk_len,
240 (unsigned long long)__entry->bytes_left, 240 (unsigned long long)__entry->bytes_left,
241 show_ordered_flags(__entry->flags), 241 show_ordered_flags(__entry->flags),
242 __entry->compress_type, __entry->refs) 242 __entry->compress_type, __entry->refs)
243 ); 243 );
244 244
245 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, 245 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add,
246 246
247 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 247 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
248 248
249 TP_ARGS(inode, ordered) 249 TP_ARGS(inode, ordered)
250 ); 250 );
251 251
252 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, 252 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove,
253 253
254 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 254 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
255 255
256 TP_ARGS(inode, ordered) 256 TP_ARGS(inode, ordered)
257 ); 257 );
258 258
259 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, 259 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start,
260 260
261 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 261 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
262 262
263 TP_ARGS(inode, ordered) 263 TP_ARGS(inode, ordered)
264 ); 264 );
265 265
266 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, 266 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put,
267 267
268 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 268 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
269 269
270 TP_ARGS(inode, ordered) 270 TP_ARGS(inode, ordered)
271 ); 271 );
272 272
273 DECLARE_EVENT_CLASS(btrfs__writepage, 273 DECLARE_EVENT_CLASS(btrfs__writepage,
274 274
275 TP_PROTO(struct page *page, struct inode *inode, 275 TP_PROTO(struct page *page, struct inode *inode,
276 struct writeback_control *wbc), 276 struct writeback_control *wbc),
277 277
278 TP_ARGS(page, inode, wbc), 278 TP_ARGS(page, inode, wbc),
279 279
280 TP_STRUCT__entry( 280 TP_STRUCT__entry(
281 __field( ino_t, ino ) 281 __field( ino_t, ino )
282 __field( pgoff_t, index ) 282 __field( pgoff_t, index )
283 __field( long, nr_to_write ) 283 __field( long, nr_to_write )
284 __field( long, pages_skipped ) 284 __field( long, pages_skipped )
285 __field( loff_t, range_start ) 285 __field( loff_t, range_start )
286 __field( loff_t, range_end ) 286 __field( loff_t, range_end )
287 __field( char, nonblocking )
288 __field( char, for_kupdate ) 287 __field( char, for_kupdate )
289 __field( char, for_reclaim ) 288 __field( char, for_reclaim )
290 __field( char, range_cyclic ) 289 __field( char, range_cyclic )
291 __field( pgoff_t, writeback_index ) 290 __field( pgoff_t, writeback_index )
292 __field( u64, root_objectid ) 291 __field( u64, root_objectid )
293 ), 292 ),
294 293
295 TP_fast_assign( 294 TP_fast_assign(
296 __entry->ino = inode->i_ino; 295 __entry->ino = inode->i_ino;
297 __entry->index = page->index; 296 __entry->index = page->index;
298 __entry->nr_to_write = wbc->nr_to_write; 297 __entry->nr_to_write = wbc->nr_to_write;
299 __entry->pages_skipped = wbc->pages_skipped; 298 __entry->pages_skipped = wbc->pages_skipped;
300 __entry->range_start = wbc->range_start; 299 __entry->range_start = wbc->range_start;
301 __entry->range_end = wbc->range_end; 300 __entry->range_end = wbc->range_end;
302 __entry->nonblocking = wbc->nonblocking;
303 __entry->for_kupdate = wbc->for_kupdate; 301 __entry->for_kupdate = wbc->for_kupdate;
304 __entry->for_reclaim = wbc->for_reclaim; 302 __entry->for_reclaim = wbc->for_reclaim;
305 __entry->range_cyclic = wbc->range_cyclic; 303 __entry->range_cyclic = wbc->range_cyclic;
306 __entry->writeback_index = inode->i_mapping->writeback_index; 304 __entry->writeback_index = inode->i_mapping->writeback_index;
307 __entry->root_objectid = 305 __entry->root_objectid =
308 BTRFS_I(inode)->root->root_key.objectid; 306 BTRFS_I(inode)->root->root_key.objectid;
309 ), 307 ),
310 308
311 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " 309 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
312 "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " 310 "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
313 "range_end = %llu, nonblocking = %d, for_kupdate = %d, " 311 "range_end = %llu, for_kupdate = %d, "
314 "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", 312 "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
315 show_root_type(__entry->root_objectid), 313 show_root_type(__entry->root_objectid),
316 (unsigned long)__entry->ino, __entry->index, 314 (unsigned long)__entry->ino, __entry->index,
317 __entry->nr_to_write, __entry->pages_skipped, 315 __entry->nr_to_write, __entry->pages_skipped,
318 __entry->range_start, __entry->range_end, 316 __entry->range_start, __entry->range_end,
319 __entry->nonblocking, __entry->for_kupdate, 317 __entry->for_kupdate,
320 __entry->for_reclaim, __entry->range_cyclic, 318 __entry->for_reclaim, __entry->range_cyclic,
321 (unsigned long)__entry->writeback_index) 319 (unsigned long)__entry->writeback_index)
322 ); 320 );
323 321
324 DEFINE_EVENT(btrfs__writepage, __extent_writepage, 322 DEFINE_EVENT(btrfs__writepage, __extent_writepage,
325 323
326 TP_PROTO(struct page *page, struct inode *inode, 324 TP_PROTO(struct page *page, struct inode *inode,
327 struct writeback_control *wbc), 325 struct writeback_control *wbc),
328 326
329 TP_ARGS(page, inode, wbc) 327 TP_ARGS(page, inode, wbc)
330 ); 328 );
331 329
332 TRACE_EVENT(btrfs_writepage_end_io_hook, 330 TRACE_EVENT(btrfs_writepage_end_io_hook,
333 331
334 TP_PROTO(struct page *page, u64 start, u64 end, int uptodate), 332 TP_PROTO(struct page *page, u64 start, u64 end, int uptodate),
335 333
336 TP_ARGS(page, start, end, uptodate), 334 TP_ARGS(page, start, end, uptodate),
337 335
338 TP_STRUCT__entry( 336 TP_STRUCT__entry(
339 __field( ino_t, ino ) 337 __field( ino_t, ino )
340 __field( pgoff_t, index ) 338 __field( pgoff_t, index )
341 __field( u64, start ) 339 __field( u64, start )
342 __field( u64, end ) 340 __field( u64, end )
343 __field( int, uptodate ) 341 __field( int, uptodate )
344 __field( u64, root_objectid ) 342 __field( u64, root_objectid )
345 ), 343 ),
346 344
347 TP_fast_assign( 345 TP_fast_assign(
348 __entry->ino = page->mapping->host->i_ino; 346 __entry->ino = page->mapping->host->i_ino;
349 __entry->index = page->index; 347 __entry->index = page->index;
350 __entry->start = start; 348 __entry->start = start;
351 __entry->end = end; 349 __entry->end = end;
352 __entry->uptodate = uptodate; 350 __entry->uptodate = uptodate;
353 __entry->root_objectid = 351 __entry->root_objectid =
354 BTRFS_I(page->mapping->host)->root->root_key.objectid; 352 BTRFS_I(page->mapping->host)->root->root_key.objectid;
355 ), 353 ),
356 354
357 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, " 355 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, "
358 "end = %llu, uptodate = %d", 356 "end = %llu, uptodate = %d",
359 show_root_type(__entry->root_objectid), 357 show_root_type(__entry->root_objectid),
360 (unsigned long)__entry->ino, (unsigned long)__entry->index, 358 (unsigned long)__entry->ino, (unsigned long)__entry->index,
361 (unsigned long long)__entry->start, 359 (unsigned long long)__entry->start,
362 (unsigned long long)__entry->end, __entry->uptodate) 360 (unsigned long long)__entry->end, __entry->uptodate)
363 ); 361 );
364 362
365 TRACE_EVENT(btrfs_sync_file, 363 TRACE_EVENT(btrfs_sync_file,
366 364
367 TP_PROTO(struct file *file, int datasync), 365 TP_PROTO(struct file *file, int datasync),
368 366
369 TP_ARGS(file, datasync), 367 TP_ARGS(file, datasync),
370 368
371 TP_STRUCT__entry( 369 TP_STRUCT__entry(
372 __field( ino_t, ino ) 370 __field( ino_t, ino )
373 __field( ino_t, parent ) 371 __field( ino_t, parent )
374 __field( int, datasync ) 372 __field( int, datasync )
375 __field( u64, root_objectid ) 373 __field( u64, root_objectid )
376 ), 374 ),
377 375
378 TP_fast_assign( 376 TP_fast_assign(
379 struct dentry *dentry = file->f_path.dentry; 377 struct dentry *dentry = file->f_path.dentry;
380 struct inode *inode = dentry->d_inode; 378 struct inode *inode = dentry->d_inode;
381 379
382 __entry->ino = inode->i_ino; 380 __entry->ino = inode->i_ino;
383 __entry->parent = dentry->d_parent->d_inode->i_ino; 381 __entry->parent = dentry->d_parent->d_inode->i_ino;
384 __entry->datasync = datasync; 382 __entry->datasync = datasync;
385 __entry->root_objectid = 383 __entry->root_objectid =
386 BTRFS_I(inode)->root->root_key.objectid; 384 BTRFS_I(inode)->root->root_key.objectid;
387 ), 385 ),
388 386
389 TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d", 387 TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d",
390 show_root_type(__entry->root_objectid), 388 show_root_type(__entry->root_objectid),
391 (unsigned long)__entry->ino, (unsigned long)__entry->parent, 389 (unsigned long)__entry->ino, (unsigned long)__entry->parent,
392 __entry->datasync) 390 __entry->datasync)
393 ); 391 );
394 392
395 TRACE_EVENT(btrfs_sync_fs, 393 TRACE_EVENT(btrfs_sync_fs,
396 394
397 TP_PROTO(int wait), 395 TP_PROTO(int wait),
398 396
399 TP_ARGS(wait), 397 TP_ARGS(wait),
400 398
401 TP_STRUCT__entry( 399 TP_STRUCT__entry(
402 __field( int, wait ) 400 __field( int, wait )
403 ), 401 ),
404 402
405 TP_fast_assign( 403 TP_fast_assign(
406 __entry->wait = wait; 404 __entry->wait = wait;
407 ), 405 ),
408 406
409 TP_printk("wait = %d", __entry->wait) 407 TP_printk("wait = %d", __entry->wait)
410 ); 408 );
411 409
412 #define show_ref_action(action) \ 410 #define show_ref_action(action) \
413 __print_symbolic(action, \ 411 __print_symbolic(action, \
414 { BTRFS_ADD_DELAYED_REF, "ADD_DELAYED_REF" }, \ 412 { BTRFS_ADD_DELAYED_REF, "ADD_DELAYED_REF" }, \
415 { BTRFS_DROP_DELAYED_REF, "DROP_DELAYED_REF" }, \ 413 { BTRFS_DROP_DELAYED_REF, "DROP_DELAYED_REF" }, \
416 { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" }, \ 414 { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" }, \
417 { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" }) 415 { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" })
418 416
419 417
420 TRACE_EVENT(btrfs_delayed_tree_ref, 418 TRACE_EVENT(btrfs_delayed_tree_ref,
421 419
422 TP_PROTO(struct btrfs_delayed_ref_node *ref, 420 TP_PROTO(struct btrfs_delayed_ref_node *ref,
423 struct btrfs_delayed_tree_ref *full_ref, 421 struct btrfs_delayed_tree_ref *full_ref,
424 int action), 422 int action),
425 423
426 TP_ARGS(ref, full_ref, action), 424 TP_ARGS(ref, full_ref, action),
427 425
428 TP_STRUCT__entry( 426 TP_STRUCT__entry(
429 __field( u64, bytenr ) 427 __field( u64, bytenr )
430 __field( u64, num_bytes ) 428 __field( u64, num_bytes )
431 __field( int, action ) 429 __field( int, action )
432 __field( u64, parent ) 430 __field( u64, parent )
433 __field( u64, ref_root ) 431 __field( u64, ref_root )
434 __field( int, level ) 432 __field( int, level )
435 __field( int, type ) 433 __field( int, type )
436 ), 434 ),
437 435
438 TP_fast_assign( 436 TP_fast_assign(
439 __entry->bytenr = ref->bytenr; 437 __entry->bytenr = ref->bytenr;
440 __entry->num_bytes = ref->num_bytes; 438 __entry->num_bytes = ref->num_bytes;
441 __entry->action = action; 439 __entry->action = action;
442 __entry->parent = full_ref->parent; 440 __entry->parent = full_ref->parent;
443 __entry->ref_root = full_ref->root; 441 __entry->ref_root = full_ref->root;
444 __entry->level = full_ref->level; 442 __entry->level = full_ref->level;
445 __entry->type = ref->type; 443 __entry->type = ref->type;
446 ), 444 ),
447 445
448 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " 446 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
449 "parent = %llu(%s), ref_root = %llu(%s), level = %d, " 447 "parent = %llu(%s), ref_root = %llu(%s), level = %d, "
450 "type = %s", 448 "type = %s",
451 (unsigned long long)__entry->bytenr, 449 (unsigned long long)__entry->bytenr,
452 (unsigned long long)__entry->num_bytes, 450 (unsigned long long)__entry->num_bytes,
453 show_ref_action(__entry->action), 451 show_ref_action(__entry->action),
454 show_root_type(__entry->parent), 452 show_root_type(__entry->parent),
455 show_root_type(__entry->ref_root), 453 show_root_type(__entry->ref_root),
456 __entry->level, show_ref_type(__entry->type)) 454 __entry->level, show_ref_type(__entry->type))
457 ); 455 );
458 456
459 TRACE_EVENT(btrfs_delayed_data_ref, 457 TRACE_EVENT(btrfs_delayed_data_ref,
460 458
461 TP_PROTO(struct btrfs_delayed_ref_node *ref, 459 TP_PROTO(struct btrfs_delayed_ref_node *ref,
462 struct btrfs_delayed_data_ref *full_ref, 460 struct btrfs_delayed_data_ref *full_ref,
463 int action), 461 int action),
464 462
465 TP_ARGS(ref, full_ref, action), 463 TP_ARGS(ref, full_ref, action),
466 464
467 TP_STRUCT__entry( 465 TP_STRUCT__entry(
468 __field( u64, bytenr ) 466 __field( u64, bytenr )
469 __field( u64, num_bytes ) 467 __field( u64, num_bytes )
470 __field( int, action ) 468 __field( int, action )
471 __field( u64, parent ) 469 __field( u64, parent )
472 __field( u64, ref_root ) 470 __field( u64, ref_root )
473 __field( u64, owner ) 471 __field( u64, owner )
474 __field( u64, offset ) 472 __field( u64, offset )
475 __field( int, type ) 473 __field( int, type )
476 ), 474 ),
477 475
478 TP_fast_assign( 476 TP_fast_assign(
479 __entry->bytenr = ref->bytenr; 477 __entry->bytenr = ref->bytenr;
480 __entry->num_bytes = ref->num_bytes; 478 __entry->num_bytes = ref->num_bytes;
481 __entry->action = action; 479 __entry->action = action;
482 __entry->parent = full_ref->parent; 480 __entry->parent = full_ref->parent;
483 __entry->ref_root = full_ref->root; 481 __entry->ref_root = full_ref->root;
484 __entry->owner = full_ref->objectid; 482 __entry->owner = full_ref->objectid;
485 __entry->offset = full_ref->offset; 483 __entry->offset = full_ref->offset;
486 __entry->type = ref->type; 484 __entry->type = ref->type;
487 ), 485 ),
488 486
489 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " 487 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
490 "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, " 488 "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, "
491 "offset = %llu, type = %s", 489 "offset = %llu, type = %s",
492 (unsigned long long)__entry->bytenr, 490 (unsigned long long)__entry->bytenr,
493 (unsigned long long)__entry->num_bytes, 491 (unsigned long long)__entry->num_bytes,
494 show_ref_action(__entry->action), 492 show_ref_action(__entry->action),
495 show_root_type(__entry->parent), 493 show_root_type(__entry->parent),
496 show_root_type(__entry->ref_root), 494 show_root_type(__entry->ref_root),
497 (unsigned long long)__entry->owner, 495 (unsigned long long)__entry->owner,
498 (unsigned long long)__entry->offset, 496 (unsigned long long)__entry->offset,
499 show_ref_type(__entry->type)) 497 show_ref_type(__entry->type))
500 ); 498 );
501 499
502 TRACE_EVENT(btrfs_delayed_ref_head, 500 TRACE_EVENT(btrfs_delayed_ref_head,
503 501
504 TP_PROTO(struct btrfs_delayed_ref_node *ref, 502 TP_PROTO(struct btrfs_delayed_ref_node *ref,
505 struct btrfs_delayed_ref_head *head_ref, 503 struct btrfs_delayed_ref_head *head_ref,
506 int action), 504 int action),
507 505
508 TP_ARGS(ref, head_ref, action), 506 TP_ARGS(ref, head_ref, action),
509 507
510 TP_STRUCT__entry( 508 TP_STRUCT__entry(
511 __field( u64, bytenr ) 509 __field( u64, bytenr )
512 __field( u64, num_bytes ) 510 __field( u64, num_bytes )
513 __field( int, action ) 511 __field( int, action )
514 __field( int, is_data ) 512 __field( int, is_data )
515 ), 513 ),
516 514
517 TP_fast_assign( 515 TP_fast_assign(
518 __entry->bytenr = ref->bytenr; 516 __entry->bytenr = ref->bytenr;
519 __entry->num_bytes = ref->num_bytes; 517 __entry->num_bytes = ref->num_bytes;
520 __entry->action = action; 518 __entry->action = action;
521 __entry->is_data = head_ref->is_data; 519 __entry->is_data = head_ref->is_data;
522 ), 520 ),
523 521
524 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d", 522 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d",
525 (unsigned long long)__entry->bytenr, 523 (unsigned long long)__entry->bytenr,
526 (unsigned long long)__entry->num_bytes, 524 (unsigned long long)__entry->num_bytes,
527 show_ref_action(__entry->action), 525 show_ref_action(__entry->action),
528 __entry->is_data) 526 __entry->is_data)
529 ); 527 );
530 528
531 #define show_chunk_type(type) \ 529 #define show_chunk_type(type) \
532 __print_flags(type, "|", \ 530 __print_flags(type, "|", \
533 { BTRFS_BLOCK_GROUP_DATA, "DATA" }, \ 531 { BTRFS_BLOCK_GROUP_DATA, "DATA" }, \
534 { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ 532 { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \
535 { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ 533 { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \
536 { BTRFS_BLOCK_GROUP_RAID0, "RAID0" }, \ 534 { BTRFS_BLOCK_GROUP_RAID0, "RAID0" }, \
537 { BTRFS_BLOCK_GROUP_RAID1, "RAID1" }, \ 535 { BTRFS_BLOCK_GROUP_RAID1, "RAID1" }, \
538 { BTRFS_BLOCK_GROUP_DUP, "DUP" }, \ 536 { BTRFS_BLOCK_GROUP_DUP, "DUP" }, \
539 { BTRFS_BLOCK_GROUP_RAID10, "RAID10"}) 537 { BTRFS_BLOCK_GROUP_RAID10, "RAID10"})
540 538
541 DECLARE_EVENT_CLASS(btrfs__chunk, 539 DECLARE_EVENT_CLASS(btrfs__chunk,
542 540
543 TP_PROTO(struct btrfs_root *root, struct map_lookup *map, 541 TP_PROTO(struct btrfs_root *root, struct map_lookup *map,
544 u64 offset, u64 size), 542 u64 offset, u64 size),
545 543
546 TP_ARGS(root, map, offset, size), 544 TP_ARGS(root, map, offset, size),
547 545
548 TP_STRUCT__entry( 546 TP_STRUCT__entry(
549 __field( int, num_stripes ) 547 __field( int, num_stripes )
550 __field( u64, type ) 548 __field( u64, type )
551 __field( int, sub_stripes ) 549 __field( int, sub_stripes )
552 __field( u64, offset ) 550 __field( u64, offset )
553 __field( u64, size ) 551 __field( u64, size )
554 __field( u64, root_objectid ) 552 __field( u64, root_objectid )
555 ), 553 ),
556 554
557 TP_fast_assign( 555 TP_fast_assign(
558 __entry->num_stripes = map->num_stripes; 556 __entry->num_stripes = map->num_stripes;
559 __entry->type = map->type; 557 __entry->type = map->type;
560 __entry->sub_stripes = map->sub_stripes; 558 __entry->sub_stripes = map->sub_stripes;
561 __entry->offset = offset; 559 __entry->offset = offset;
562 __entry->size = size; 560 __entry->size = size;
563 __entry->root_objectid = root->root_key.objectid; 561 __entry->root_objectid = root->root_key.objectid;
564 ), 562 ),
565 563
566 TP_printk("root = %llu(%s), offset = %llu, size = %llu, " 564 TP_printk("root = %llu(%s), offset = %llu, size = %llu, "
567 "num_stripes = %d, sub_stripes = %d, type = %s", 565 "num_stripes = %d, sub_stripes = %d, type = %s",
568 show_root_type(__entry->root_objectid), 566 show_root_type(__entry->root_objectid),
569 (unsigned long long)__entry->offset, 567 (unsigned long long)__entry->offset,
570 (unsigned long long)__entry->size, 568 (unsigned long long)__entry->size,
571 __entry->num_stripes, __entry->sub_stripes, 569 __entry->num_stripes, __entry->sub_stripes,
572 show_chunk_type(__entry->type)) 570 show_chunk_type(__entry->type))
573 ); 571 );
574 572
575 DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, 573 DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc,
576 574
577 TP_PROTO(struct btrfs_root *root, struct map_lookup *map, 575 TP_PROTO(struct btrfs_root *root, struct map_lookup *map,
578 u64 offset, u64 size), 576 u64 offset, u64 size),
579 577
580 TP_ARGS(root, map, offset, size) 578 TP_ARGS(root, map, offset, size)
581 ); 579 );
582 580
583 DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free, 581 DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free,
584 582
585 TP_PROTO(struct btrfs_root *root, struct map_lookup *map, 583 TP_PROTO(struct btrfs_root *root, struct map_lookup *map,
586 u64 offset, u64 size), 584 u64 offset, u64 size),
587 585
588 TP_ARGS(root, map, offset, size) 586 TP_ARGS(root, map, offset, size)
589 ); 587 );
590 588
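A note on the pattern above, for readers new to the tracing macros: DECLARE_EVENT_CLASS() defines the record layout, assignment and formatting once, and each DEFINE_EVENT() stamps out a concrete event (here trace_btrfs_chunk_alloc() and trace_btrfs_chunk_free()) that shares that template. Purely as a hypothetical illustration, a further chunk event would only need the few lines below; "btrfs_chunk_relocate" is not a real event in this tree.

DEFINE_EVENT(btrfs__chunk, btrfs_chunk_relocate,

	TP_PROTO(struct btrfs_root *root, struct map_lookup *map,
		 u64 offset, u64 size),

	TP_ARGS(root, map, offset, size)
);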
591 TRACE_EVENT(btrfs_cow_block, 589 TRACE_EVENT(btrfs_cow_block,
592 590
593 TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf, 591 TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf,
594 struct extent_buffer *cow), 592 struct extent_buffer *cow),
595 593
596 TP_ARGS(root, buf, cow), 594 TP_ARGS(root, buf, cow),
597 595
598 TP_STRUCT__entry( 596 TP_STRUCT__entry(
599 __field( u64, root_objectid ) 597 __field( u64, root_objectid )
600 __field( u64, buf_start ) 598 __field( u64, buf_start )
601 __field( int, refs ) 599 __field( int, refs )
602 __field( u64, cow_start ) 600 __field( u64, cow_start )
603 __field( int, buf_level ) 601 __field( int, buf_level )
604 __field( int, cow_level ) 602 __field( int, cow_level )
605 ), 603 ),
606 604
607 TP_fast_assign( 605 TP_fast_assign(
608 __entry->root_objectid = root->root_key.objectid; 606 __entry->root_objectid = root->root_key.objectid;
609 __entry->buf_start = buf->start; 607 __entry->buf_start = buf->start;
610 __entry->refs = atomic_read(&buf->refs); 608 __entry->refs = atomic_read(&buf->refs);
611 __entry->cow_start = cow->start; 609 __entry->cow_start = cow->start;
612 __entry->buf_level = btrfs_header_level(buf); 610 __entry->buf_level = btrfs_header_level(buf);
613 __entry->cow_level = btrfs_header_level(cow); 611 __entry->cow_level = btrfs_header_level(cow);
614 ), 612 ),
615 613
616 TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu " 614 TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu "
617 "(orig_level = %d), cow_buf = %llu (cow_level = %d)", 615 "(orig_level = %d), cow_buf = %llu (cow_level = %d)",
618 show_root_type(__entry->root_objectid), 616 show_root_type(__entry->root_objectid),
619 __entry->refs, 617 __entry->refs,
620 (unsigned long long)__entry->buf_start, 618 (unsigned long long)__entry->buf_start,
621 __entry->buf_level, 619 __entry->buf_level,
622 (unsigned long long)__entry->cow_start, 620 (unsigned long long)__entry->cow_start,
623 __entry->cow_level) 621 __entry->cow_level)
624 ); 622 );
625 623
626 DECLARE_EVENT_CLASS(btrfs__reserved_extent, 624 DECLARE_EVENT_CLASS(btrfs__reserved_extent,
627 625
628 TP_PROTO(struct btrfs_root *root, u64 start, u64 len), 626 TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
629 627
630 TP_ARGS(root, start, len), 628 TP_ARGS(root, start, len),
631 629
632 TP_STRUCT__entry( 630 TP_STRUCT__entry(
633 __field( u64, root_objectid ) 631 __field( u64, root_objectid )
634 __field( u64, start ) 632 __field( u64, start )
635 __field( u64, len ) 633 __field( u64, len )
636 ), 634 ),
637 635
638 TP_fast_assign( 636 TP_fast_assign(
639 __entry->root_objectid = root->root_key.objectid; 637 __entry->root_objectid = root->root_key.objectid;
640 __entry->start = start; 638 __entry->start = start;
641 __entry->len = len; 639 __entry->len = len;
642 ), 640 ),
643 641
644 TP_printk("root = %llu(%s), start = %llu, len = %llu", 642 TP_printk("root = %llu(%s), start = %llu, len = %llu",
645 show_root_type(__entry->root_objectid), 643 show_root_type(__entry->root_objectid),
646 (unsigned long long)__entry->start, 644 (unsigned long long)__entry->start,
647 (unsigned long long)__entry->len) 645 (unsigned long long)__entry->len)
648 ); 646 );
649 647
650 DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc, 648 DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc,
651 649
652 TP_PROTO(struct btrfs_root *root, u64 start, u64 len), 650 TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
653 651
654 TP_ARGS(root, start, len) 652 TP_ARGS(root, start, len)
655 ); 653 );
656 654
657 DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, 655 DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free,
658 656
659 TP_PROTO(struct btrfs_root *root, u64 start, u64 len), 657 TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
660 658
661 TP_ARGS(root, start, len) 659 TP_ARGS(root, start, len)
662 ); 660 );
663 661
664 #endif /* _TRACE_BTRFS_H */ 662 #endif /* _TRACE_BTRFS_H */
665 663
666 /* This part must be outside protection */ 664 /* This part must be outside protection */
667 #include <trace/define_trace.h> 665 #include <trace/define_trace.h>
668 666
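These btrfs events are consumed through the usual TRACE_EVENT machinery: each definition above expands into a trace_<name>() helper that is a near no-op until the event is enabled, and exactly one compilation unit in the filesystem defines CREATE_TRACE_POINTS before including this header to instantiate the tracepoints. A minimal sketch of such a call site, assuming btrfs wires trace_btrfs_sync_fs() into its sync_fs handler (the surrounding code is illustrative, not the real implementation):

#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>

static int btrfs_sync_fs(struct super_block *sb, int wait)
{
	/* Fires the btrfs_sync_fs event defined above; compiles to a
	 * static-branch no-op while the event is disabled. */
	trace_btrfs_sync_fs(wait);

	/* ... actual transaction commit / delalloc flushing elided ... */
	return 0;
}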
include/trace/events/ext4.h
1 #undef TRACE_SYSTEM 1 #undef TRACE_SYSTEM
2 #define TRACE_SYSTEM ext4 2 #define TRACE_SYSTEM ext4
3 3
4 #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) 4 #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
5 #define _TRACE_EXT4_H 5 #define _TRACE_EXT4_H
6 6
7 #include <linux/writeback.h> 7 #include <linux/writeback.h>
8 #include <linux/tracepoint.h> 8 #include <linux/tracepoint.h>
9 9
10 struct ext4_allocation_context; 10 struct ext4_allocation_context;
11 struct ext4_allocation_request; 11 struct ext4_allocation_request;
12 struct ext4_prealloc_space; 12 struct ext4_prealloc_space;
13 struct ext4_inode_info; 13 struct ext4_inode_info;
14 struct mpage_da_data; 14 struct mpage_da_data;
15 15
16 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) 16 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
17 17
18 TRACE_EVENT(ext4_free_inode, 18 TRACE_EVENT(ext4_free_inode,
19 TP_PROTO(struct inode *inode), 19 TP_PROTO(struct inode *inode),
20 20
21 TP_ARGS(inode), 21 TP_ARGS(inode),
22 22
23 TP_STRUCT__entry( 23 TP_STRUCT__entry(
24 __field( dev_t, dev ) 24 __field( dev_t, dev )
25 __field( ino_t, ino ) 25 __field( ino_t, ino )
26 __field( umode_t, mode ) 26 __field( umode_t, mode )
27 __field( uid_t, uid ) 27 __field( uid_t, uid )
28 __field( gid_t, gid ) 28 __field( gid_t, gid )
29 __field( __u64, blocks ) 29 __field( __u64, blocks )
30 ), 30 ),
31 31
32 TP_fast_assign( 32 TP_fast_assign(
33 __entry->dev = inode->i_sb->s_dev; 33 __entry->dev = inode->i_sb->s_dev;
34 __entry->ino = inode->i_ino; 34 __entry->ino = inode->i_ino;
35 __entry->mode = inode->i_mode; 35 __entry->mode = inode->i_mode;
36 __entry->uid = inode->i_uid; 36 __entry->uid = inode->i_uid;
37 __entry->gid = inode->i_gid; 37 __entry->gid = inode->i_gid;
38 __entry->blocks = inode->i_blocks; 38 __entry->blocks = inode->i_blocks;
39 ), 39 ),
40 40
41 TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", 41 TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
42 MAJOR(__entry->dev), MINOR(__entry->dev), 42 MAJOR(__entry->dev), MINOR(__entry->dev),
43 (unsigned long) __entry->ino, __entry->mode, 43 (unsigned long) __entry->ino, __entry->mode,
44 __entry->uid, __entry->gid, __entry->blocks) 44 __entry->uid, __entry->gid, __entry->blocks)
45 ); 45 );
46 46
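Each of these ext4 events likewise becomes a trace_<name>() call in the filesystem proper; all of the field capture happens in TP_fast_assign() above, so the call site only hands over objects it already holds. A hedged sketch of where trace_ext4_free_inode() would sit, assuming the inode-freeing path in fs/ext4/ialloc.c (the body shown is a placeholder, not the real function):

void ext4_free_inode(handle_t *handle, struct inode *inode)
{
	/* Record dev/ino/mode/uid/gid/blocks before the inode is torn down. */
	trace_ext4_free_inode(inode);

	/* ... bitmap updates, orphan-list handling, etc. elided ... */
}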
47 TRACE_EVENT(ext4_request_inode, 47 TRACE_EVENT(ext4_request_inode,
48 TP_PROTO(struct inode *dir, int mode), 48 TP_PROTO(struct inode *dir, int mode),
49 49
50 TP_ARGS(dir, mode), 50 TP_ARGS(dir, mode),
51 51
52 TP_STRUCT__entry( 52 TP_STRUCT__entry(
53 __field( dev_t, dev ) 53 __field( dev_t, dev )
54 __field( ino_t, dir ) 54 __field( ino_t, dir )
55 __field( umode_t, mode ) 55 __field( umode_t, mode )
56 ), 56 ),
57 57
58 TP_fast_assign( 58 TP_fast_assign(
59 __entry->dev = dir->i_sb->s_dev; 59 __entry->dev = dir->i_sb->s_dev;
60 __entry->dir = dir->i_ino; 60 __entry->dir = dir->i_ino;
61 __entry->mode = mode; 61 __entry->mode = mode;
62 ), 62 ),
63 63
64 TP_printk("dev %d,%d dir %lu mode 0%o", 64 TP_printk("dev %d,%d dir %lu mode 0%o",
65 MAJOR(__entry->dev), MINOR(__entry->dev), 65 MAJOR(__entry->dev), MINOR(__entry->dev),
66 (unsigned long) __entry->dir, __entry->mode) 66 (unsigned long) __entry->dir, __entry->mode)
67 ); 67 );
68 68
69 TRACE_EVENT(ext4_allocate_inode, 69 TRACE_EVENT(ext4_allocate_inode,
70 TP_PROTO(struct inode *inode, struct inode *dir, int mode), 70 TP_PROTO(struct inode *inode, struct inode *dir, int mode),
71 71
72 TP_ARGS(inode, dir, mode), 72 TP_ARGS(inode, dir, mode),
73 73
74 TP_STRUCT__entry( 74 TP_STRUCT__entry(
75 __field( dev_t, dev ) 75 __field( dev_t, dev )
76 __field( ino_t, ino ) 76 __field( ino_t, ino )
77 __field( ino_t, dir ) 77 __field( ino_t, dir )
78 __field( umode_t, mode ) 78 __field( umode_t, mode )
79 ), 79 ),
80 80
81 TP_fast_assign( 81 TP_fast_assign(
82 __entry->dev = inode->i_sb->s_dev; 82 __entry->dev = inode->i_sb->s_dev;
83 __entry->ino = inode->i_ino; 83 __entry->ino = inode->i_ino;
84 __entry->dir = dir->i_ino; 84 __entry->dir = dir->i_ino;
85 __entry->mode = mode; 85 __entry->mode = mode;
86 ), 86 ),
87 87
88 TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", 88 TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
89 MAJOR(__entry->dev), MINOR(__entry->dev), 89 MAJOR(__entry->dev), MINOR(__entry->dev),
90 (unsigned long) __entry->ino, 90 (unsigned long) __entry->ino,
91 (unsigned long) __entry->dir, __entry->mode) 91 (unsigned long) __entry->dir, __entry->mode)
92 ); 92 );
93 93
94 TRACE_EVENT(ext4_evict_inode, 94 TRACE_EVENT(ext4_evict_inode,
95 TP_PROTO(struct inode *inode), 95 TP_PROTO(struct inode *inode),
96 96
97 TP_ARGS(inode), 97 TP_ARGS(inode),
98 98
99 TP_STRUCT__entry( 99 TP_STRUCT__entry(
100 __field( dev_t, dev ) 100 __field( dev_t, dev )
101 __field( ino_t, ino ) 101 __field( ino_t, ino )
102 __field( int, nlink ) 102 __field( int, nlink )
103 ), 103 ),
104 104
105 TP_fast_assign( 105 TP_fast_assign(
106 __entry->dev = inode->i_sb->s_dev; 106 __entry->dev = inode->i_sb->s_dev;
107 __entry->ino = inode->i_ino; 107 __entry->ino = inode->i_ino;
108 __entry->nlink = inode->i_nlink; 108 __entry->nlink = inode->i_nlink;
109 ), 109 ),
110 110
111 TP_printk("dev %d,%d ino %lu nlink %d", 111 TP_printk("dev %d,%d ino %lu nlink %d",
112 MAJOR(__entry->dev), MINOR(__entry->dev), 112 MAJOR(__entry->dev), MINOR(__entry->dev),
113 (unsigned long) __entry->ino, __entry->nlink) 113 (unsigned long) __entry->ino, __entry->nlink)
114 ); 114 );
115 115
116 TRACE_EVENT(ext4_drop_inode, 116 TRACE_EVENT(ext4_drop_inode,
117 TP_PROTO(struct inode *inode, int drop), 117 TP_PROTO(struct inode *inode, int drop),
118 118
119 TP_ARGS(inode, drop), 119 TP_ARGS(inode, drop),
120 120
121 TP_STRUCT__entry( 121 TP_STRUCT__entry(
122 __field( dev_t, dev ) 122 __field( dev_t, dev )
123 __field( ino_t, ino ) 123 __field( ino_t, ino )
124 __field( int, drop ) 124 __field( int, drop )
125 ), 125 ),
126 126
127 TP_fast_assign( 127 TP_fast_assign(
128 __entry->dev = inode->i_sb->s_dev; 128 __entry->dev = inode->i_sb->s_dev;
129 __entry->ino = inode->i_ino; 129 __entry->ino = inode->i_ino;
130 __entry->drop = drop; 130 __entry->drop = drop;
131 ), 131 ),
132 132
133 TP_printk("dev %d,%d ino %lu drop %d", 133 TP_printk("dev %d,%d ino %lu drop %d",
134 MAJOR(__entry->dev), MINOR(__entry->dev), 134 MAJOR(__entry->dev), MINOR(__entry->dev),
135 (unsigned long) __entry->ino, __entry->drop) 135 (unsigned long) __entry->ino, __entry->drop)
136 ); 136 );
137 137
138 TRACE_EVENT(ext4_mark_inode_dirty, 138 TRACE_EVENT(ext4_mark_inode_dirty,
139 TP_PROTO(struct inode *inode, unsigned long IP), 139 TP_PROTO(struct inode *inode, unsigned long IP),
140 140
141 TP_ARGS(inode, IP), 141 TP_ARGS(inode, IP),
142 142
143 TP_STRUCT__entry( 143 TP_STRUCT__entry(
144 __field( dev_t, dev ) 144 __field( dev_t, dev )
145 __field( ino_t, ino ) 145 __field( ino_t, ino )
146 __field(unsigned long, ip ) 146 __field(unsigned long, ip )
147 ), 147 ),
148 148
149 TP_fast_assign( 149 TP_fast_assign(
150 __entry->dev = inode->i_sb->s_dev; 150 __entry->dev = inode->i_sb->s_dev;
151 __entry->ino = inode->i_ino; 151 __entry->ino = inode->i_ino;
152 __entry->ip = IP; 152 __entry->ip = IP;
153 ), 153 ),
154 154
155 TP_printk("dev %d,%d ino %lu caller %pF", 155 TP_printk("dev %d,%d ino %lu caller %pF",
156 MAJOR(__entry->dev), MINOR(__entry->dev), 156 MAJOR(__entry->dev), MINOR(__entry->dev),
157 (unsigned long) __entry->ino, (void *)__entry->ip) 157 (unsigned long) __entry->ino, (void *)__entry->ip)
158 ); 158 );
159 159
160 TRACE_EVENT(ext4_begin_ordered_truncate, 160 TRACE_EVENT(ext4_begin_ordered_truncate,
161 TP_PROTO(struct inode *inode, loff_t new_size), 161 TP_PROTO(struct inode *inode, loff_t new_size),
162 162
163 TP_ARGS(inode, new_size), 163 TP_ARGS(inode, new_size),
164 164
165 TP_STRUCT__entry( 165 TP_STRUCT__entry(
166 __field( dev_t, dev ) 166 __field( dev_t, dev )
167 __field( ino_t, ino ) 167 __field( ino_t, ino )
168 __field( loff_t, new_size ) 168 __field( loff_t, new_size )
169 ), 169 ),
170 170
171 TP_fast_assign( 171 TP_fast_assign(
172 __entry->dev = inode->i_sb->s_dev; 172 __entry->dev = inode->i_sb->s_dev;
173 __entry->ino = inode->i_ino; 173 __entry->ino = inode->i_ino;
174 __entry->new_size = new_size; 174 __entry->new_size = new_size;
175 ), 175 ),
176 176
177 TP_printk("dev %d,%d ino %lu new_size %lld", 177 TP_printk("dev %d,%d ino %lu new_size %lld",
178 MAJOR(__entry->dev), MINOR(__entry->dev), 178 MAJOR(__entry->dev), MINOR(__entry->dev),
179 (unsigned long) __entry->ino, 179 (unsigned long) __entry->ino,
180 __entry->new_size) 180 __entry->new_size)
181 ); 181 );
182 182
183 DECLARE_EVENT_CLASS(ext4__write_begin, 183 DECLARE_EVENT_CLASS(ext4__write_begin,
184 184
185 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 185 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
186 unsigned int flags), 186 unsigned int flags),
187 187
188 TP_ARGS(inode, pos, len, flags), 188 TP_ARGS(inode, pos, len, flags),
189 189
190 TP_STRUCT__entry( 190 TP_STRUCT__entry(
191 __field( dev_t, dev ) 191 __field( dev_t, dev )
192 __field( ino_t, ino ) 192 __field( ino_t, ino )
193 __field( loff_t, pos ) 193 __field( loff_t, pos )
194 __field( unsigned int, len ) 194 __field( unsigned int, len )
195 __field( unsigned int, flags ) 195 __field( unsigned int, flags )
196 ), 196 ),
197 197
198 TP_fast_assign( 198 TP_fast_assign(
199 __entry->dev = inode->i_sb->s_dev; 199 __entry->dev = inode->i_sb->s_dev;
200 __entry->ino = inode->i_ino; 200 __entry->ino = inode->i_ino;
201 __entry->pos = pos; 201 __entry->pos = pos;
202 __entry->len = len; 202 __entry->len = len;
203 __entry->flags = flags; 203 __entry->flags = flags;
204 ), 204 ),
205 205
206 TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u", 206 TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u",
207 MAJOR(__entry->dev), MINOR(__entry->dev), 207 MAJOR(__entry->dev), MINOR(__entry->dev),
208 (unsigned long) __entry->ino, 208 (unsigned long) __entry->ino,
209 __entry->pos, __entry->len, __entry->flags) 209 __entry->pos, __entry->len, __entry->flags)
210 ); 210 );
211 211
212 DEFINE_EVENT(ext4__write_begin, ext4_write_begin, 212 DEFINE_EVENT(ext4__write_begin, ext4_write_begin,
213 213
214 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 214 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
215 unsigned int flags), 215 unsigned int flags),
216 216
217 TP_ARGS(inode, pos, len, flags) 217 TP_ARGS(inode, pos, len, flags)
218 ); 218 );
219 219
220 DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, 220 DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
221 221
222 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 222 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
223 unsigned int flags), 223 unsigned int flags),
224 224
225 TP_ARGS(inode, pos, len, flags) 225 TP_ARGS(inode, pos, len, flags)
226 ); 226 );
227 227
228 DECLARE_EVENT_CLASS(ext4__write_end, 228 DECLARE_EVENT_CLASS(ext4__write_end,
229 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 229 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
230 unsigned int copied), 230 unsigned int copied),
231 231
232 TP_ARGS(inode, pos, len, copied), 232 TP_ARGS(inode, pos, len, copied),
233 233
234 TP_STRUCT__entry( 234 TP_STRUCT__entry(
235 __field( dev_t, dev ) 235 __field( dev_t, dev )
236 __field( ino_t, ino ) 236 __field( ino_t, ino )
237 __field( loff_t, pos ) 237 __field( loff_t, pos )
238 __field( unsigned int, len ) 238 __field( unsigned int, len )
239 __field( unsigned int, copied ) 239 __field( unsigned int, copied )
240 ), 240 ),
241 241
242 TP_fast_assign( 242 TP_fast_assign(
243 __entry->dev = inode->i_sb->s_dev; 243 __entry->dev = inode->i_sb->s_dev;
244 __entry->ino = inode->i_ino; 244 __entry->ino = inode->i_ino;
245 __entry->pos = pos; 245 __entry->pos = pos;
246 __entry->len = len; 246 __entry->len = len;
247 __entry->copied = copied; 247 __entry->copied = copied;
248 ), 248 ),
249 249
250 TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", 250 TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
251 MAJOR(__entry->dev), MINOR(__entry->dev), 251 MAJOR(__entry->dev), MINOR(__entry->dev),
252 (unsigned long) __entry->ino, 252 (unsigned long) __entry->ino,
253 __entry->pos, __entry->len, __entry->copied) 253 __entry->pos, __entry->len, __entry->copied)
254 ); 254 );
255 255
256 DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, 256 DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
257 257
258 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 258 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
259 unsigned int copied), 259 unsigned int copied),
260 260
261 TP_ARGS(inode, pos, len, copied) 261 TP_ARGS(inode, pos, len, copied)
262 ); 262 );
263 263
264 DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end, 264 DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
265 265
266 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 266 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
267 unsigned int copied), 267 unsigned int copied),
268 268
269 TP_ARGS(inode, pos, len, copied) 269 TP_ARGS(inode, pos, len, copied)
270 ); 270 );
271 271
272 DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, 272 DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,
273 273
274 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 274 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
275 unsigned int copied), 275 unsigned int copied),
276 276
277 TP_ARGS(inode, pos, len, copied) 277 TP_ARGS(inode, pos, len, copied)
278 ); 278 );
279 279
280 DEFINE_EVENT(ext4__write_end, ext4_da_write_end, 280 DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
281 281
282 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 282 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
283 unsigned int copied), 283 unsigned int copied),
284 284
285 TP_ARGS(inode, pos, len, copied) 285 TP_ARGS(inode, pos, len, copied)
286 ); 286 );
287 287
288 TRACE_EVENT(ext4_da_writepages, 288 TRACE_EVENT(ext4_da_writepages,
289 TP_PROTO(struct inode *inode, struct writeback_control *wbc), 289 TP_PROTO(struct inode *inode, struct writeback_control *wbc),
290 290
291 TP_ARGS(inode, wbc), 291 TP_ARGS(inode, wbc),
292 292
293 TP_STRUCT__entry( 293 TP_STRUCT__entry(
294 __field( dev_t, dev ) 294 __field( dev_t, dev )
295 __field( ino_t, ino ) 295 __field( ino_t, ino )
296 __field( long, nr_to_write ) 296 __field( long, nr_to_write )
297 __field( long, pages_skipped ) 297 __field( long, pages_skipped )
298 __field( loff_t, range_start ) 298 __field( loff_t, range_start )
299 __field( loff_t, range_end ) 299 __field( loff_t, range_end )
300 __field( int, sync_mode ) 300 __field( int, sync_mode )
301 __field( char, for_kupdate ) 301 __field( char, for_kupdate )
302 __field( char, range_cyclic ) 302 __field( char, range_cyclic )
303 __field( pgoff_t, writeback_index ) 303 __field( pgoff_t, writeback_index )
304 ), 304 ),
305 305
306 TP_fast_assign( 306 TP_fast_assign(
307 __entry->dev = inode->i_sb->s_dev; 307 __entry->dev = inode->i_sb->s_dev;
308 __entry->ino = inode->i_ino; 308 __entry->ino = inode->i_ino;
309 __entry->nr_to_write = wbc->nr_to_write; 309 __entry->nr_to_write = wbc->nr_to_write;
310 __entry->pages_skipped = wbc->pages_skipped; 310 __entry->pages_skipped = wbc->pages_skipped;
311 __entry->range_start = wbc->range_start; 311 __entry->range_start = wbc->range_start;
312 __entry->range_end = wbc->range_end; 312 __entry->range_end = wbc->range_end;
313 __entry->sync_mode = wbc->sync_mode; 313 __entry->sync_mode = wbc->sync_mode;
314 __entry->for_kupdate = wbc->for_kupdate; 314 __entry->for_kupdate = wbc->for_kupdate;
315 __entry->range_cyclic = wbc->range_cyclic; 315 __entry->range_cyclic = wbc->range_cyclic;
316 __entry->writeback_index = inode->i_mapping->writeback_index; 316 __entry->writeback_index = inode->i_mapping->writeback_index;
317 ), 317 ),
318 318
319 TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " 319 TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
320 "range_start %lld range_end %lld sync_mode %d" 320 "range_start %lld range_end %lld sync_mode %d"
321 "for_kupdate %d range_cyclic %d writeback_index %lu", 321 "for_kupdate %d range_cyclic %d writeback_index %lu",
322 MAJOR(__entry->dev), MINOR(__entry->dev), 322 MAJOR(__entry->dev), MINOR(__entry->dev),
323 (unsigned long) __entry->ino, __entry->nr_to_write, 323 (unsigned long) __entry->ino, __entry->nr_to_write,
324 __entry->pages_skipped, __entry->range_start, 324 __entry->pages_skipped, __entry->range_start,
325 __entry->range_end, __entry->sync_mode, 325 __entry->range_end, __entry->sync_mode,
326 __entry->for_kupdate, __entry->range_cyclic, 326 __entry->for_kupdate, __entry->range_cyclic,
327 (unsigned long) __entry->writeback_index) 327 (unsigned long) __entry->writeback_index)
328 ); 328 );
329 329
330 TRACE_EVENT(ext4_da_write_pages, 330 TRACE_EVENT(ext4_da_write_pages,
331 TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), 331 TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
332 332
333 TP_ARGS(inode, mpd), 333 TP_ARGS(inode, mpd),
334 334
335 TP_STRUCT__entry( 335 TP_STRUCT__entry(
336 __field( dev_t, dev ) 336 __field( dev_t, dev )
337 __field( ino_t, ino ) 337 __field( ino_t, ino )
338 __field( __u64, b_blocknr ) 338 __field( __u64, b_blocknr )
339 __field( __u32, b_size ) 339 __field( __u32, b_size )
340 __field( __u32, b_state ) 340 __field( __u32, b_state )
341 __field( unsigned long, first_page ) 341 __field( unsigned long, first_page )
342 __field( int, io_done ) 342 __field( int, io_done )
343 __field( int, pages_written ) 343 __field( int, pages_written )
344 __field( int, sync_mode ) 344 __field( int, sync_mode )
345 ), 345 ),
346 346
347 TP_fast_assign( 347 TP_fast_assign(
348 __entry->dev = inode->i_sb->s_dev; 348 __entry->dev = inode->i_sb->s_dev;
349 __entry->ino = inode->i_ino; 349 __entry->ino = inode->i_ino;
350 __entry->b_blocknr = mpd->b_blocknr; 350 __entry->b_blocknr = mpd->b_blocknr;
351 __entry->b_size = mpd->b_size; 351 __entry->b_size = mpd->b_size;
352 __entry->b_state = mpd->b_state; 352 __entry->b_state = mpd->b_state;
353 __entry->first_page = mpd->first_page; 353 __entry->first_page = mpd->first_page;
354 __entry->io_done = mpd->io_done; 354 __entry->io_done = mpd->io_done;
355 __entry->pages_written = mpd->pages_written; 355 __entry->pages_written = mpd->pages_written;
356 __entry->sync_mode = mpd->wbc->sync_mode; 356 __entry->sync_mode = mpd->wbc->sync_mode;
357 ), 357 ),
358 358
359 TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x " 359 TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
360 "first_page %lu io_done %d pages_written %d sync_mode %d", 360 "first_page %lu io_done %d pages_written %d sync_mode %d",
361 MAJOR(__entry->dev), MINOR(__entry->dev), 361 MAJOR(__entry->dev), MINOR(__entry->dev),
362 (unsigned long) __entry->ino, 362 (unsigned long) __entry->ino,
363 __entry->b_blocknr, __entry->b_size, 363 __entry->b_blocknr, __entry->b_size,
364 __entry->b_state, __entry->first_page, 364 __entry->b_state, __entry->first_page,
365 __entry->io_done, __entry->pages_written, 365 __entry->io_done, __entry->pages_written,
366 __entry->sync_mode 366 __entry->sync_mode
367 ) 367 )
368 ); 368 );
369 369
370 TRACE_EVENT(ext4_da_writepages_result, 370 TRACE_EVENT(ext4_da_writepages_result,
371 TP_PROTO(struct inode *inode, struct writeback_control *wbc, 371 TP_PROTO(struct inode *inode, struct writeback_control *wbc,
372 int ret, int pages_written), 372 int ret, int pages_written),
373 373
374 TP_ARGS(inode, wbc, ret, pages_written), 374 TP_ARGS(inode, wbc, ret, pages_written),
375 375
376 TP_STRUCT__entry( 376 TP_STRUCT__entry(
377 __field( dev_t, dev ) 377 __field( dev_t, dev )
378 __field( ino_t, ino ) 378 __field( ino_t, ino )
379 __field( int, ret ) 379 __field( int, ret )
380 __field( int, pages_written ) 380 __field( int, pages_written )
381 __field( long, pages_skipped ) 381 __field( long, pages_skipped )
382 __field( int, sync_mode ) 382 __field( int, sync_mode )
383 __field( char, more_io )
384 __field( pgoff_t, writeback_index ) 383 __field( pgoff_t, writeback_index )
385 ), 384 ),
386 385
387 TP_fast_assign( 386 TP_fast_assign(
388 __entry->dev = inode->i_sb->s_dev; 387 __entry->dev = inode->i_sb->s_dev;
389 __entry->ino = inode->i_ino; 388 __entry->ino = inode->i_ino;
390 __entry->ret = ret; 389 __entry->ret = ret;
391 __entry->pages_written = pages_written; 390 __entry->pages_written = pages_written;
392 __entry->pages_skipped = wbc->pages_skipped; 391 __entry->pages_skipped = wbc->pages_skipped;
393 __entry->sync_mode = wbc->sync_mode; 392 __entry->sync_mode = wbc->sync_mode;
394 __entry->more_io = wbc->more_io;
395 __entry->writeback_index = inode->i_mapping->writeback_index; 393 __entry->writeback_index = inode->i_mapping->writeback_index;
396 ), 394 ),
397 395
398 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " 396 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
399 " more_io %d sync_mode %d writeback_index %lu", 397 "sync_mode %d writeback_index %lu",
400 MAJOR(__entry->dev), MINOR(__entry->dev), 398 MAJOR(__entry->dev), MINOR(__entry->dev),
401 (unsigned long) __entry->ino, __entry->ret, 399 (unsigned long) __entry->ino, __entry->ret,
402 __entry->pages_written, __entry->pages_skipped, 400 __entry->pages_written, __entry->pages_skipped,
403 __entry->more_io, __entry->sync_mode, 401 __entry->sync_mode,
404 (unsigned long) __entry->writeback_index) 402 (unsigned long) __entry->writeback_index)
405 ); 403 );
406 404
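The ext4_da_writepages / ext4_da_writepages_result pair brackets a delayed-allocation writeback pass, which is why this hunk drops the more_io field: writeback_control.more_io is removed elsewhere in this series, so the result event can no longer report it. A rough sketch of the bracketing pattern, assuming the entry point in fs/ext4/inode.c looks something like this (locals and control flow are illustrative only):

static int ext4_da_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	int pages_written = 0, ret = 0;

	trace_ext4_da_writepages(inode, wbc);

	/* ... mpage_da_data setup and writeout loop elided ... */

	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
	return ret;
}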
407 DECLARE_EVENT_CLASS(ext4__page_op, 405 DECLARE_EVENT_CLASS(ext4__page_op,
408 TP_PROTO(struct page *page), 406 TP_PROTO(struct page *page),
409 407
410 TP_ARGS(page), 408 TP_ARGS(page),
411 409
412 TP_STRUCT__entry( 410 TP_STRUCT__entry(
413 __field( pgoff_t, index ) 411 __field( pgoff_t, index )
414 __field( ino_t, ino ) 412 __field( ino_t, ino )
415 __field( dev_t, dev ) 413 __field( dev_t, dev )
416 414
417 ), 415 ),
418 416
419 TP_fast_assign( 417 TP_fast_assign(
420 __entry->index = page->index; 418 __entry->index = page->index;
421 __entry->ino = page->mapping->host->i_ino; 419 __entry->ino = page->mapping->host->i_ino;
422 __entry->dev = page->mapping->host->i_sb->s_dev; 420 __entry->dev = page->mapping->host->i_sb->s_dev;
423 ), 421 ),
424 422
425 TP_printk("dev %d,%d ino %lu page_index %lu", 423 TP_printk("dev %d,%d ino %lu page_index %lu",
426 MAJOR(__entry->dev), MINOR(__entry->dev), 424 MAJOR(__entry->dev), MINOR(__entry->dev),
427 (unsigned long) __entry->ino, 425 (unsigned long) __entry->ino,
428 (unsigned long) __entry->index) 426 (unsigned long) __entry->index)
429 ); 427 );
430 428
431 DEFINE_EVENT(ext4__page_op, ext4_writepage, 429 DEFINE_EVENT(ext4__page_op, ext4_writepage,
432 430
433 TP_PROTO(struct page *page), 431 TP_PROTO(struct page *page),
434 432
435 TP_ARGS(page) 433 TP_ARGS(page)
436 ); 434 );
437 435
438 DEFINE_EVENT(ext4__page_op, ext4_readpage, 436 DEFINE_EVENT(ext4__page_op, ext4_readpage,
439 437
440 TP_PROTO(struct page *page), 438 TP_PROTO(struct page *page),
441 439
442 TP_ARGS(page) 440 TP_ARGS(page)
443 ); 441 );
444 442
445 DEFINE_EVENT(ext4__page_op, ext4_releasepage, 443 DEFINE_EVENT(ext4__page_op, ext4_releasepage,
446 444
447 TP_PROTO(struct page *page), 445 TP_PROTO(struct page *page),
448 446
449 TP_ARGS(page) 447 TP_ARGS(page)
450 ); 448 );
451 449
452 TRACE_EVENT(ext4_invalidatepage, 450 TRACE_EVENT(ext4_invalidatepage,
453 TP_PROTO(struct page *page, unsigned long offset), 451 TP_PROTO(struct page *page, unsigned long offset),
454 452
455 TP_ARGS(page, offset), 453 TP_ARGS(page, offset),
456 454
457 TP_STRUCT__entry( 455 TP_STRUCT__entry(
458 __field( pgoff_t, index ) 456 __field( pgoff_t, index )
459 __field( unsigned long, offset ) 457 __field( unsigned long, offset )
460 __field( ino_t, ino ) 458 __field( ino_t, ino )
461 __field( dev_t, dev ) 459 __field( dev_t, dev )
462 460
463 ), 461 ),
464 462
465 TP_fast_assign( 463 TP_fast_assign(
466 __entry->index = page->index; 464 __entry->index = page->index;
467 __entry->offset = offset; 465 __entry->offset = offset;
468 __entry->ino = page->mapping->host->i_ino; 466 __entry->ino = page->mapping->host->i_ino;
469 __entry->dev = page->mapping->host->i_sb->s_dev; 467 __entry->dev = page->mapping->host->i_sb->s_dev;
470 ), 468 ),
471 469
472 TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", 470 TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
473 MAJOR(__entry->dev), MINOR(__entry->dev), 471 MAJOR(__entry->dev), MINOR(__entry->dev),
474 (unsigned long) __entry->ino, 472 (unsigned long) __entry->ino,
475 (unsigned long) __entry->index, __entry->offset) 473 (unsigned long) __entry->index, __entry->offset)
476 ); 474 );
477 475
478 TRACE_EVENT(ext4_discard_blocks, 476 TRACE_EVENT(ext4_discard_blocks,
479 TP_PROTO(struct super_block *sb, unsigned long long blk, 477 TP_PROTO(struct super_block *sb, unsigned long long blk,
480 unsigned long long count), 478 unsigned long long count),
481 479
482 TP_ARGS(sb, blk, count), 480 TP_ARGS(sb, blk, count),
483 481
484 TP_STRUCT__entry( 482 TP_STRUCT__entry(
485 __field( dev_t, dev ) 483 __field( dev_t, dev )
486 __field( __u64, blk ) 484 __field( __u64, blk )
487 __field( __u64, count ) 485 __field( __u64, count )
488 486
489 ), 487 ),
490 488
491 TP_fast_assign( 489 TP_fast_assign(
492 __entry->dev = sb->s_dev; 490 __entry->dev = sb->s_dev;
493 __entry->blk = blk; 491 __entry->blk = blk;
494 __entry->count = count; 492 __entry->count = count;
495 ), 493 ),
496 494
497 TP_printk("dev %d,%d blk %llu count %llu", 495 TP_printk("dev %d,%d blk %llu count %llu",
498 MAJOR(__entry->dev), MINOR(__entry->dev), 496 MAJOR(__entry->dev), MINOR(__entry->dev),
499 __entry->blk, __entry->count) 497 __entry->blk, __entry->count)
500 ); 498 );
501 499
502 DECLARE_EVENT_CLASS(ext4__mb_new_pa, 500 DECLARE_EVENT_CLASS(ext4__mb_new_pa,
503 TP_PROTO(struct ext4_allocation_context *ac, 501 TP_PROTO(struct ext4_allocation_context *ac,
504 struct ext4_prealloc_space *pa), 502 struct ext4_prealloc_space *pa),
505 503
506 TP_ARGS(ac, pa), 504 TP_ARGS(ac, pa),
507 505
508 TP_STRUCT__entry( 506 TP_STRUCT__entry(
509 __field( dev_t, dev ) 507 __field( dev_t, dev )
510 __field( ino_t, ino ) 508 __field( ino_t, ino )
511 __field( __u64, pa_pstart ) 509 __field( __u64, pa_pstart )
512 __field( __u32, pa_len ) 510 __field( __u32, pa_len )
513 __field( __u64, pa_lstart ) 511 __field( __u64, pa_lstart )
514 512
515 ), 513 ),
516 514
517 TP_fast_assign( 515 TP_fast_assign(
518 __entry->dev = ac->ac_sb->s_dev; 516 __entry->dev = ac->ac_sb->s_dev;
519 __entry->ino = ac->ac_inode->i_ino; 517 __entry->ino = ac->ac_inode->i_ino;
520 __entry->pa_pstart = pa->pa_pstart; 518 __entry->pa_pstart = pa->pa_pstart;
521 __entry->pa_len = pa->pa_len; 519 __entry->pa_len = pa->pa_len;
522 __entry->pa_lstart = pa->pa_lstart; 520 __entry->pa_lstart = pa->pa_lstart;
523 ), 521 ),
524 522
525 TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", 523 TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
526 MAJOR(__entry->dev), MINOR(__entry->dev), 524 MAJOR(__entry->dev), MINOR(__entry->dev),
527 (unsigned long) __entry->ino, 525 (unsigned long) __entry->ino,
528 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) 526 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
529 ); 527 );
530 528
531 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, 529 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,
532 530
533 TP_PROTO(struct ext4_allocation_context *ac, 531 TP_PROTO(struct ext4_allocation_context *ac,
534 struct ext4_prealloc_space *pa), 532 struct ext4_prealloc_space *pa),
535 533
536 TP_ARGS(ac, pa) 534 TP_ARGS(ac, pa)
537 ); 535 );
538 536
539 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, 537 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,
540 538
541 TP_PROTO(struct ext4_allocation_context *ac, 539 TP_PROTO(struct ext4_allocation_context *ac,
542 struct ext4_prealloc_space *pa), 540 struct ext4_prealloc_space *pa),
543 541
544 TP_ARGS(ac, pa) 542 TP_ARGS(ac, pa)
545 ); 543 );
546 544
547 TRACE_EVENT(ext4_mb_release_inode_pa, 545 TRACE_EVENT(ext4_mb_release_inode_pa,
548 TP_PROTO(struct ext4_prealloc_space *pa, 546 TP_PROTO(struct ext4_prealloc_space *pa,
549 unsigned long long block, unsigned int count), 547 unsigned long long block, unsigned int count),
550 548
551 TP_ARGS(pa, block, count), 549 TP_ARGS(pa, block, count),
552 550
553 TP_STRUCT__entry( 551 TP_STRUCT__entry(
554 __field( dev_t, dev ) 552 __field( dev_t, dev )
555 __field( ino_t, ino ) 553 __field( ino_t, ino )
556 __field( __u64, block ) 554 __field( __u64, block )
557 __field( __u32, count ) 555 __field( __u32, count )
558 556
559 ), 557 ),
560 558
561 TP_fast_assign( 559 TP_fast_assign(
562 __entry->dev = pa->pa_inode->i_sb->s_dev; 560 __entry->dev = pa->pa_inode->i_sb->s_dev;
563 __entry->ino = pa->pa_inode->i_ino; 561 __entry->ino = pa->pa_inode->i_ino;
564 __entry->block = block; 562 __entry->block = block;
565 __entry->count = count; 563 __entry->count = count;
566 ), 564 ),
567 565
568 TP_printk("dev %d,%d ino %lu block %llu count %u", 566 TP_printk("dev %d,%d ino %lu block %llu count %u",
569 MAJOR(__entry->dev), MINOR(__entry->dev), 567 MAJOR(__entry->dev), MINOR(__entry->dev),
570 (unsigned long) __entry->ino, 568 (unsigned long) __entry->ino,
571 __entry->block, __entry->count) 569 __entry->block, __entry->count)
572 ); 570 );
573 571
574 TRACE_EVENT(ext4_mb_release_group_pa, 572 TRACE_EVENT(ext4_mb_release_group_pa,
575 TP_PROTO(struct ext4_prealloc_space *pa), 573 TP_PROTO(struct ext4_prealloc_space *pa),
576 574
577 TP_ARGS(pa), 575 TP_ARGS(pa),
578 576
579 TP_STRUCT__entry( 577 TP_STRUCT__entry(
580 __field( dev_t, dev ) 578 __field( dev_t, dev )
581 __field( __u64, pa_pstart ) 579 __field( __u64, pa_pstart )
582 __field( __u32, pa_len ) 580 __field( __u32, pa_len )
583 581
584 ), 582 ),
585 583
586 TP_fast_assign( 584 TP_fast_assign(
587 __entry->dev = pa->pa_inode->i_sb->s_dev; 585 __entry->dev = pa->pa_inode->i_sb->s_dev;
588 __entry->pa_pstart = pa->pa_pstart; 586 __entry->pa_pstart = pa->pa_pstart;
589 __entry->pa_len = pa->pa_len; 587 __entry->pa_len = pa->pa_len;
590 ), 588 ),
591 589
592 TP_printk("dev %d,%d pstart %llu len %u", 590 TP_printk("dev %d,%d pstart %llu len %u",
593 MAJOR(__entry->dev), MINOR(__entry->dev), 591 MAJOR(__entry->dev), MINOR(__entry->dev),
594 __entry->pa_pstart, __entry->pa_len) 592 __entry->pa_pstart, __entry->pa_len)
595 ); 593 );
596 594
597 TRACE_EVENT(ext4_discard_preallocations, 595 TRACE_EVENT(ext4_discard_preallocations,
598 TP_PROTO(struct inode *inode), 596 TP_PROTO(struct inode *inode),
599 597
600 TP_ARGS(inode), 598 TP_ARGS(inode),
601 599
602 TP_STRUCT__entry( 600 TP_STRUCT__entry(
603 __field( dev_t, dev ) 601 __field( dev_t, dev )
604 __field( ino_t, ino ) 602 __field( ino_t, ino )
605 603
606 ), 604 ),
607 605
608 TP_fast_assign( 606 TP_fast_assign(
609 __entry->dev = inode->i_sb->s_dev; 607 __entry->dev = inode->i_sb->s_dev;
610 __entry->ino = inode->i_ino; 608 __entry->ino = inode->i_ino;
611 ), 609 ),
612 610
613 TP_printk("dev %d,%d ino %lu", 611 TP_printk("dev %d,%d ino %lu",
614 MAJOR(__entry->dev), MINOR(__entry->dev), 612 MAJOR(__entry->dev), MINOR(__entry->dev),
615 (unsigned long) __entry->ino) 613 (unsigned long) __entry->ino)
616 ); 614 );
617 615
618 TRACE_EVENT(ext4_mb_discard_preallocations, 616 TRACE_EVENT(ext4_mb_discard_preallocations,
619 TP_PROTO(struct super_block *sb, int needed), 617 TP_PROTO(struct super_block *sb, int needed),
620 618
621 TP_ARGS(sb, needed), 619 TP_ARGS(sb, needed),
622 620
623 TP_STRUCT__entry( 621 TP_STRUCT__entry(
624 __field( dev_t, dev ) 622 __field( dev_t, dev )
625 __field( int, needed ) 623 __field( int, needed )
626 624
627 ), 625 ),
628 626
629 TP_fast_assign( 627 TP_fast_assign(
630 __entry->dev = sb->s_dev; 628 __entry->dev = sb->s_dev;
631 __entry->needed = needed; 629 __entry->needed = needed;
632 ), 630 ),
633 631
634 TP_printk("dev %d,%d needed %d", 632 TP_printk("dev %d,%d needed %d",
635 MAJOR(__entry->dev), MINOR(__entry->dev), 633 MAJOR(__entry->dev), MINOR(__entry->dev),
636 __entry->needed) 634 __entry->needed)
637 ); 635 );
638 636
639 TRACE_EVENT(ext4_request_blocks, 637 TRACE_EVENT(ext4_request_blocks,
640 TP_PROTO(struct ext4_allocation_request *ar), 638 TP_PROTO(struct ext4_allocation_request *ar),
641 639
642 TP_ARGS(ar), 640 TP_ARGS(ar),
643 641
644 TP_STRUCT__entry( 642 TP_STRUCT__entry(
645 __field( dev_t, dev ) 643 __field( dev_t, dev )
646 __field( ino_t, ino ) 644 __field( ino_t, ino )
647 __field( unsigned int, flags ) 645 __field( unsigned int, flags )
648 __field( unsigned int, len ) 646 __field( unsigned int, len )
649 __field( __u32, logical ) 647 __field( __u32, logical )
650 __field( __u32, lleft ) 648 __field( __u32, lleft )
651 __field( __u32, lright ) 649 __field( __u32, lright )
652 __field( __u64, goal ) 650 __field( __u64, goal )
653 __field( __u64, pleft ) 651 __field( __u64, pleft )
654 __field( __u64, pright ) 652 __field( __u64, pright )
655 ), 653 ),
656 654
657 TP_fast_assign( 655 TP_fast_assign(
658 __entry->dev = ar->inode->i_sb->s_dev; 656 __entry->dev = ar->inode->i_sb->s_dev;
659 __entry->ino = ar->inode->i_ino; 657 __entry->ino = ar->inode->i_ino;
660 __entry->flags = ar->flags; 658 __entry->flags = ar->flags;
661 __entry->len = ar->len; 659 __entry->len = ar->len;
662 __entry->logical = ar->logical; 660 __entry->logical = ar->logical;
663 __entry->goal = ar->goal; 661 __entry->goal = ar->goal;
664 __entry->lleft = ar->lleft; 662 __entry->lleft = ar->lleft;
665 __entry->lright = ar->lright; 663 __entry->lright = ar->lright;
666 __entry->pleft = ar->pleft; 664 __entry->pleft = ar->pleft;
667 __entry->pright = ar->pright; 665 __entry->pright = ar->pright;
668 ), 666 ),
669 667
670 TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " 668 TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
671 "lleft %u lright %u pleft %llu pright %llu ", 669 "lleft %u lright %u pleft %llu pright %llu ",
672 MAJOR(__entry->dev), MINOR(__entry->dev), 670 MAJOR(__entry->dev), MINOR(__entry->dev),
673 (unsigned long) __entry->ino, __entry->flags, 671 (unsigned long) __entry->ino, __entry->flags,
674 __entry->len, __entry->logical, __entry->goal, 672 __entry->len, __entry->logical, __entry->goal,
675 __entry->lleft, __entry->lright, __entry->pleft, 673 __entry->lleft, __entry->lright, __entry->pleft,
676 __entry->pright) 674 __entry->pright)
677 ); 675 );
678 676
679 TRACE_EVENT(ext4_allocate_blocks, 677 TRACE_EVENT(ext4_allocate_blocks,
680 TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), 678 TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block),
681 679
682 TP_ARGS(ar, block), 680 TP_ARGS(ar, block),
683 681
684 TP_STRUCT__entry( 682 TP_STRUCT__entry(
685 __field( dev_t, dev ) 683 __field( dev_t, dev )
686 __field( ino_t, ino ) 684 __field( ino_t, ino )
687 __field( __u64, block ) 685 __field( __u64, block )
688 __field( unsigned int, flags ) 686 __field( unsigned int, flags )
689 __field( unsigned int, len ) 687 __field( unsigned int, len )
690 __field( __u32, logical ) 688 __field( __u32, logical )
691 __field( __u32, lleft ) 689 __field( __u32, lleft )
692 __field( __u32, lright ) 690 __field( __u32, lright )
693 __field( __u64, goal ) 691 __field( __u64, goal )
694 __field( __u64, pleft ) 692 __field( __u64, pleft )
695 __field( __u64, pright ) 693 __field( __u64, pright )
696 ), 694 ),
697 695
698 TP_fast_assign( 696 TP_fast_assign(
699 __entry->dev = ar->inode->i_sb->s_dev; 697 __entry->dev = ar->inode->i_sb->s_dev;
700 __entry->ino = ar->inode->i_ino; 698 __entry->ino = ar->inode->i_ino;
701 __entry->block = block; 699 __entry->block = block;
702 __entry->flags = ar->flags; 700 __entry->flags = ar->flags;
703 __entry->len = ar->len; 701 __entry->len = ar->len;
704 __entry->logical = ar->logical; 702 __entry->logical = ar->logical;
705 __entry->goal = ar->goal; 703 __entry->goal = ar->goal;
706 __entry->lleft = ar->lleft; 704 __entry->lleft = ar->lleft;
707 __entry->lright = ar->lright; 705 __entry->lright = ar->lright;
708 __entry->pleft = ar->pleft; 706 __entry->pleft = ar->pleft;
709 __entry->pright = ar->pright; 707 __entry->pright = ar->pright;
710 ), 708 ),
711 709
712 TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u " 710 TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
713 "goal %llu lleft %u lright %u pleft %llu pright %llu", 711 "goal %llu lleft %u lright %u pleft %llu pright %llu",
714 MAJOR(__entry->dev), MINOR(__entry->dev), 712 MAJOR(__entry->dev), MINOR(__entry->dev),
715 (unsigned long) __entry->ino, __entry->flags, 713 (unsigned long) __entry->ino, __entry->flags,
716 __entry->len, __entry->block, __entry->logical, 714 __entry->len, __entry->block, __entry->logical,
717 __entry->goal, __entry->lleft, __entry->lright, 715 __entry->goal, __entry->lleft, __entry->lright,
718 __entry->pleft, __entry->pright) 716 __entry->pleft, __entry->pright)
719 ); 717 );
720 718
721 TRACE_EVENT(ext4_free_blocks, 719 TRACE_EVENT(ext4_free_blocks,
722 TP_PROTO(struct inode *inode, __u64 block, unsigned long count, 720 TP_PROTO(struct inode *inode, __u64 block, unsigned long count,
723 int flags), 721 int flags),
724 722
725 TP_ARGS(inode, block, count, flags), 723 TP_ARGS(inode, block, count, flags),
726 724
727 TP_STRUCT__entry( 725 TP_STRUCT__entry(
728 __field( dev_t, dev ) 726 __field( dev_t, dev )
729 __field( ino_t, ino ) 727 __field( ino_t, ino )
730 __field( umode_t, mode ) 728 __field( umode_t, mode )
731 __field( __u64, block ) 729 __field( __u64, block )
732 __field( unsigned long, count ) 730 __field( unsigned long, count )
733 __field( int, flags ) 731 __field( int, flags )
734 ), 732 ),
735 733
736 TP_fast_assign( 734 TP_fast_assign(
737 __entry->dev = inode->i_sb->s_dev; 735 __entry->dev = inode->i_sb->s_dev;
738 __entry->ino = inode->i_ino; 736 __entry->ino = inode->i_ino;
739 __entry->mode = inode->i_mode; 737 __entry->mode = inode->i_mode;
740 __entry->block = block; 738 __entry->block = block;
741 __entry->count = count; 739 __entry->count = count;
742 __entry->flags = flags; 740 __entry->flags = flags;
743 ), 741 ),
744 742
745 TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", 743 TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d",
746 MAJOR(__entry->dev), MINOR(__entry->dev), 744 MAJOR(__entry->dev), MINOR(__entry->dev),
747 (unsigned long) __entry->ino, 745 (unsigned long) __entry->ino,
748 __entry->mode, __entry->block, __entry->count, 746 __entry->mode, __entry->block, __entry->count,
749 __entry->flags) 747 __entry->flags)
750 ); 748 );
751 749
752 TRACE_EVENT(ext4_sync_file_enter, 750 TRACE_EVENT(ext4_sync_file_enter,
753 TP_PROTO(struct file *file, int datasync), 751 TP_PROTO(struct file *file, int datasync),
754 752
755 TP_ARGS(file, datasync), 753 TP_ARGS(file, datasync),
756 754
757 TP_STRUCT__entry( 755 TP_STRUCT__entry(
758 __field( dev_t, dev ) 756 __field( dev_t, dev )
759 __field( ino_t, ino ) 757 __field( ino_t, ino )
760 __field( ino_t, parent ) 758 __field( ino_t, parent )
761 __field( int, datasync ) 759 __field( int, datasync )
762 ), 760 ),
763 761
764 TP_fast_assign( 762 TP_fast_assign(
765 struct dentry *dentry = file->f_path.dentry; 763 struct dentry *dentry = file->f_path.dentry;
766 764
767 __entry->dev = dentry->d_inode->i_sb->s_dev; 765 __entry->dev = dentry->d_inode->i_sb->s_dev;
768 __entry->ino = dentry->d_inode->i_ino; 766 __entry->ino = dentry->d_inode->i_ino;
769 __entry->datasync = datasync; 767 __entry->datasync = datasync;
770 __entry->parent = dentry->d_parent->d_inode->i_ino; 768 __entry->parent = dentry->d_parent->d_inode->i_ino;
771 ), 769 ),
772 770
773 TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", 771 TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
774 MAJOR(__entry->dev), MINOR(__entry->dev), 772 MAJOR(__entry->dev), MINOR(__entry->dev),
775 (unsigned long) __entry->ino, 773 (unsigned long) __entry->ino,
776 (unsigned long) __entry->parent, __entry->datasync) 774 (unsigned long) __entry->parent, __entry->datasync)
777 ); 775 );
778 776
779 TRACE_EVENT(ext4_sync_file_exit, 777 TRACE_EVENT(ext4_sync_file_exit,
780 TP_PROTO(struct inode *inode, int ret), 778 TP_PROTO(struct inode *inode, int ret),
781 779
782 TP_ARGS(inode, ret), 780 TP_ARGS(inode, ret),
783 781
784 TP_STRUCT__entry( 782 TP_STRUCT__entry(
785 __field( int, ret ) 783 __field( int, ret )
786 __field( ino_t, ino ) 784 __field( ino_t, ino )
787 __field( dev_t, dev ) 785 __field( dev_t, dev )
788 ), 786 ),
789 787
790 TP_fast_assign( 788 TP_fast_assign(
791 __entry->ret = ret; 789 __entry->ret = ret;
792 __entry->ino = inode->i_ino; 790 __entry->ino = inode->i_ino;
793 __entry->dev = inode->i_sb->s_dev; 791 __entry->dev = inode->i_sb->s_dev;
794 ), 792 ),
795 793
796 TP_printk("dev %d,%d ino %lu ret %d", 794 TP_printk("dev %d,%d ino %lu ret %d",
797 MAJOR(__entry->dev), MINOR(__entry->dev), 795 MAJOR(__entry->dev), MINOR(__entry->dev),
798 (unsigned long) __entry->ino, 796 (unsigned long) __entry->ino,
799 __entry->ret) 797 __entry->ret)
800 ); 798 );
801 799
802 TRACE_EVENT(ext4_sync_fs, 800 TRACE_EVENT(ext4_sync_fs,
803 TP_PROTO(struct super_block *sb, int wait), 801 TP_PROTO(struct super_block *sb, int wait),
804 802
805 TP_ARGS(sb, wait), 803 TP_ARGS(sb, wait),
806 804
807 TP_STRUCT__entry( 805 TP_STRUCT__entry(
808 __field( dev_t, dev ) 806 __field( dev_t, dev )
809 __field( int, wait ) 807 __field( int, wait )
810 808
811 ), 809 ),
812 810
813 TP_fast_assign( 811 TP_fast_assign(
814 __entry->dev = sb->s_dev; 812 __entry->dev = sb->s_dev;
815 __entry->wait = wait; 813 __entry->wait = wait;
816 ), 814 ),
817 815
818 TP_printk("dev %d,%d wait %d", 816 TP_printk("dev %d,%d wait %d",
819 MAJOR(__entry->dev), MINOR(__entry->dev), 817 MAJOR(__entry->dev), MINOR(__entry->dev),
820 __entry->wait) 818 __entry->wait)
821 ); 819 );
822 820
823 TRACE_EVENT(ext4_alloc_da_blocks, 821 TRACE_EVENT(ext4_alloc_da_blocks,
824 TP_PROTO(struct inode *inode), 822 TP_PROTO(struct inode *inode),
825 823
826 TP_ARGS(inode), 824 TP_ARGS(inode),
827 825
828 TP_STRUCT__entry( 826 TP_STRUCT__entry(
829 __field( dev_t, dev ) 827 __field( dev_t, dev )
830 __field( ino_t, ino ) 828 __field( ino_t, ino )
831 __field( unsigned int, data_blocks ) 829 __field( unsigned int, data_blocks )
832 __field( unsigned int, meta_blocks ) 830 __field( unsigned int, meta_blocks )
833 ), 831 ),
834 832
835 TP_fast_assign( 833 TP_fast_assign(
836 __entry->dev = inode->i_sb->s_dev; 834 __entry->dev = inode->i_sb->s_dev;
837 __entry->ino = inode->i_ino; 835 __entry->ino = inode->i_ino;
838 __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 836 __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
839 __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 837 __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
840 ), 838 ),
841 839
842 TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u", 840 TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u",
843 MAJOR(__entry->dev), MINOR(__entry->dev), 841 MAJOR(__entry->dev), MINOR(__entry->dev),
844 (unsigned long) __entry->ino, 842 (unsigned long) __entry->ino,
845 __entry->data_blocks, __entry->meta_blocks) 843 __entry->data_blocks, __entry->meta_blocks)
846 ); 844 );
847 845
848 TRACE_EVENT(ext4_mballoc_alloc, 846 TRACE_EVENT(ext4_mballoc_alloc,
849 TP_PROTO(struct ext4_allocation_context *ac), 847 TP_PROTO(struct ext4_allocation_context *ac),
850 848
851 TP_ARGS(ac), 849 TP_ARGS(ac),
852 850
853 TP_STRUCT__entry( 851 TP_STRUCT__entry(
854 __field( dev_t, dev ) 852 __field( dev_t, dev )
855 __field( ino_t, ino ) 853 __field( ino_t, ino )
856 __field( __u16, found ) 854 __field( __u16, found )
857 __field( __u16, groups ) 855 __field( __u16, groups )
858 __field( __u16, buddy ) 856 __field( __u16, buddy )
859 __field( __u16, flags ) 857 __field( __u16, flags )
860 __field( __u16, tail ) 858 __field( __u16, tail )
861 __field( __u8, cr ) 859 __field( __u8, cr )
862 __field( __u32, orig_logical ) 860 __field( __u32, orig_logical )
863 __field( int, orig_start ) 861 __field( int, orig_start )
864 __field( __u32, orig_group ) 862 __field( __u32, orig_group )
865 __field( int, orig_len ) 863 __field( int, orig_len )
866 __field( __u32, goal_logical ) 864 __field( __u32, goal_logical )
867 __field( int, goal_start ) 865 __field( int, goal_start )
868 __field( __u32, goal_group ) 866 __field( __u32, goal_group )
869 __field( int, goal_len ) 867 __field( int, goal_len )
870 __field( __u32, result_logical ) 868 __field( __u32, result_logical )
871 __field( int, result_start ) 869 __field( int, result_start )
872 __field( __u32, result_group ) 870 __field( __u32, result_group )
873 __field( int, result_len ) 871 __field( int, result_len )
874 ), 872 ),
875 873
876 TP_fast_assign( 874 TP_fast_assign(
877 __entry->dev = ac->ac_inode->i_sb->s_dev; 875 __entry->dev = ac->ac_inode->i_sb->s_dev;
878 __entry->ino = ac->ac_inode->i_ino; 876 __entry->ino = ac->ac_inode->i_ino;
879 __entry->found = ac->ac_found; 877 __entry->found = ac->ac_found;
880 __entry->flags = ac->ac_flags; 878 __entry->flags = ac->ac_flags;
881 __entry->groups = ac->ac_groups_scanned; 879 __entry->groups = ac->ac_groups_scanned;
882 __entry->buddy = ac->ac_buddy; 880 __entry->buddy = ac->ac_buddy;
883 __entry->tail = ac->ac_tail; 881 __entry->tail = ac->ac_tail;
884 __entry->cr = ac->ac_criteria; 882 __entry->cr = ac->ac_criteria;
885 __entry->orig_logical = ac->ac_o_ex.fe_logical; 883 __entry->orig_logical = ac->ac_o_ex.fe_logical;
886 __entry->orig_start = ac->ac_o_ex.fe_start; 884 __entry->orig_start = ac->ac_o_ex.fe_start;
887 __entry->orig_group = ac->ac_o_ex.fe_group; 885 __entry->orig_group = ac->ac_o_ex.fe_group;
888 __entry->orig_len = ac->ac_o_ex.fe_len; 886 __entry->orig_len = ac->ac_o_ex.fe_len;
889 __entry->goal_logical = ac->ac_g_ex.fe_logical; 887 __entry->goal_logical = ac->ac_g_ex.fe_logical;
890 __entry->goal_start = ac->ac_g_ex.fe_start; 888 __entry->goal_start = ac->ac_g_ex.fe_start;
891 __entry->goal_group = ac->ac_g_ex.fe_group; 889 __entry->goal_group = ac->ac_g_ex.fe_group;
892 __entry->goal_len = ac->ac_g_ex.fe_len; 890 __entry->goal_len = ac->ac_g_ex.fe_len;
893 __entry->result_logical = ac->ac_f_ex.fe_logical; 891 __entry->result_logical = ac->ac_f_ex.fe_logical;
894 __entry->result_start = ac->ac_f_ex.fe_start; 892 __entry->result_start = ac->ac_f_ex.fe_start;
895 __entry->result_group = ac->ac_f_ex.fe_group; 893 __entry->result_group = ac->ac_f_ex.fe_group;
896 __entry->result_len = ac->ac_f_ex.fe_len; 894 __entry->result_len = ac->ac_f_ex.fe_len;
897 ), 895 ),
898 896
899 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " 897 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
900 "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " 898 "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
901 "tail %u broken %u", 899 "tail %u broken %u",
902 MAJOR(__entry->dev), MINOR(__entry->dev), 900 MAJOR(__entry->dev), MINOR(__entry->dev),
903 (unsigned long) __entry->ino, 901 (unsigned long) __entry->ino,
904 __entry->orig_group, __entry->orig_start, 902 __entry->orig_group, __entry->orig_start,
905 __entry->orig_len, __entry->orig_logical, 903 __entry->orig_len, __entry->orig_logical,
906 __entry->goal_group, __entry->goal_start, 904 __entry->goal_group, __entry->goal_start,
907 __entry->goal_len, __entry->goal_logical, 905 __entry->goal_len, __entry->goal_logical,
908 __entry->result_group, __entry->result_start, 906 __entry->result_group, __entry->result_start,
909 __entry->result_len, __entry->result_logical, 907 __entry->result_len, __entry->result_logical,
910 __entry->found, __entry->groups, __entry->cr, 908 __entry->found, __entry->groups, __entry->cr,
911 __entry->flags, __entry->tail, 909 __entry->flags, __entry->tail,
912 __entry->buddy ? 1 << __entry->buddy : 0) 910 __entry->buddy ? 1 << __entry->buddy : 0)
913 ); 911 );
914 912
915 TRACE_EVENT(ext4_mballoc_prealloc, 913 TRACE_EVENT(ext4_mballoc_prealloc,
916 TP_PROTO(struct ext4_allocation_context *ac), 914 TP_PROTO(struct ext4_allocation_context *ac),
917 915
918 TP_ARGS(ac), 916 TP_ARGS(ac),
919 917
920 TP_STRUCT__entry( 918 TP_STRUCT__entry(
921 __field( dev_t, dev ) 919 __field( dev_t, dev )
922 __field( ino_t, ino ) 920 __field( ino_t, ino )
923 __field( __u32, orig_logical ) 921 __field( __u32, orig_logical )
924 __field( int, orig_start ) 922 __field( int, orig_start )
925 __field( __u32, orig_group ) 923 __field( __u32, orig_group )
926 __field( int, orig_len ) 924 __field( int, orig_len )
927 __field( __u32, result_logical ) 925 __field( __u32, result_logical )
928 __field( int, result_start ) 926 __field( int, result_start )
929 __field( __u32, result_group ) 927 __field( __u32, result_group )
930 __field( int, result_len ) 928 __field( int, result_len )
931 ), 929 ),
932 930
933 TP_fast_assign( 931 TP_fast_assign(
934 __entry->dev = ac->ac_inode->i_sb->s_dev; 932 __entry->dev = ac->ac_inode->i_sb->s_dev;
935 __entry->ino = ac->ac_inode->i_ino; 933 __entry->ino = ac->ac_inode->i_ino;
936 __entry->orig_logical = ac->ac_o_ex.fe_logical; 934 __entry->orig_logical = ac->ac_o_ex.fe_logical;
937 __entry->orig_start = ac->ac_o_ex.fe_start; 935 __entry->orig_start = ac->ac_o_ex.fe_start;
938 __entry->orig_group = ac->ac_o_ex.fe_group; 936 __entry->orig_group = ac->ac_o_ex.fe_group;
939 __entry->orig_len = ac->ac_o_ex.fe_len; 937 __entry->orig_len = ac->ac_o_ex.fe_len;
940 __entry->result_logical = ac->ac_b_ex.fe_logical; 938 __entry->result_logical = ac->ac_b_ex.fe_logical;
941 __entry->result_start = ac->ac_b_ex.fe_start; 939 __entry->result_start = ac->ac_b_ex.fe_start;
942 __entry->result_group = ac->ac_b_ex.fe_group; 940 __entry->result_group = ac->ac_b_ex.fe_group;
943 __entry->result_len = ac->ac_b_ex.fe_len; 941 __entry->result_len = ac->ac_b_ex.fe_len;
944 ), 942 ),
945 943
946 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", 944 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
947 MAJOR(__entry->dev), MINOR(__entry->dev), 945 MAJOR(__entry->dev), MINOR(__entry->dev),
948 (unsigned long) __entry->ino, 946 (unsigned long) __entry->ino,
949 __entry->orig_group, __entry->orig_start, 947 __entry->orig_group, __entry->orig_start,
950 __entry->orig_len, __entry->orig_logical, 948 __entry->orig_len, __entry->orig_logical,
951 __entry->result_group, __entry->result_start, 949 __entry->result_group, __entry->result_start,
952 __entry->result_len, __entry->result_logical) 950 __entry->result_len, __entry->result_logical)
953 ); 951 );
954 952
955 DECLARE_EVENT_CLASS(ext4__mballoc, 953 DECLARE_EVENT_CLASS(ext4__mballoc,
956 TP_PROTO(struct super_block *sb, 954 TP_PROTO(struct super_block *sb,
957 struct inode *inode, 955 struct inode *inode,
958 ext4_group_t group, 956 ext4_group_t group,
959 ext4_grpblk_t start, 957 ext4_grpblk_t start,
960 ext4_grpblk_t len), 958 ext4_grpblk_t len),
961 959
962 TP_ARGS(sb, inode, group, start, len), 960 TP_ARGS(sb, inode, group, start, len),
963 961
964 TP_STRUCT__entry( 962 TP_STRUCT__entry(
965 __field( dev_t, dev ) 963 __field( dev_t, dev )
966 __field( ino_t, ino ) 964 __field( ino_t, ino )
967 __field( int, result_start ) 965 __field( int, result_start )
968 __field( __u32, result_group ) 966 __field( __u32, result_group )
969 __field( int, result_len ) 967 __field( int, result_len )
970 ), 968 ),
971 969
972 TP_fast_assign( 970 TP_fast_assign(
973 __entry->dev = sb->s_dev; 971 __entry->dev = sb->s_dev;
974 __entry->ino = inode ? inode->i_ino : 0; 972 __entry->ino = inode ? inode->i_ino : 0;
975 __entry->result_start = start; 973 __entry->result_start = start;
976 __entry->result_group = group; 974 __entry->result_group = group;
977 __entry->result_len = len; 975 __entry->result_len = len;
978 ), 976 ),
979 977
980 TP_printk("dev %d,%d inode %lu extent %u/%d/%d ", 978 TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
981 MAJOR(__entry->dev), MINOR(__entry->dev), 979 MAJOR(__entry->dev), MINOR(__entry->dev),
982 (unsigned long) __entry->ino, 980 (unsigned long) __entry->ino,
983 __entry->result_group, __entry->result_start, 981 __entry->result_group, __entry->result_start,
984 __entry->result_len) 982 __entry->result_len)
985 ); 983 );
986 984
987 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, 985 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
988 986
989 TP_PROTO(struct super_block *sb, 987 TP_PROTO(struct super_block *sb,
990 struct inode *inode, 988 struct inode *inode,
991 ext4_group_t group, 989 ext4_group_t group,
992 ext4_grpblk_t start, 990 ext4_grpblk_t start,
993 ext4_grpblk_t len), 991 ext4_grpblk_t len),
994 992
995 TP_ARGS(sb, inode, group, start, len) 993 TP_ARGS(sb, inode, group, start, len)
996 ); 994 );
997 995
998 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, 996 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
999 997
1000 TP_PROTO(struct super_block *sb, 998 TP_PROTO(struct super_block *sb,
1001 struct inode *inode, 999 struct inode *inode,
1002 ext4_group_t group, 1000 ext4_group_t group,
1003 ext4_grpblk_t start, 1001 ext4_grpblk_t start,
1004 ext4_grpblk_t len), 1002 ext4_grpblk_t len),
1005 1003
1006 TP_ARGS(sb, inode, group, start, len) 1004 TP_ARGS(sb, inode, group, start, len)
1007 ); 1005 );
1008 1006
1009 TRACE_EVENT(ext4_forget, 1007 TRACE_EVENT(ext4_forget,
1010 TP_PROTO(struct inode *inode, int is_metadata, __u64 block), 1008 TP_PROTO(struct inode *inode, int is_metadata, __u64 block),
1011 1009
1012 TP_ARGS(inode, is_metadata, block), 1010 TP_ARGS(inode, is_metadata, block),
1013 1011
1014 TP_STRUCT__entry( 1012 TP_STRUCT__entry(
1015 __field( dev_t, dev ) 1013 __field( dev_t, dev )
1016 __field( ino_t, ino ) 1014 __field( ino_t, ino )
1017 __field( umode_t, mode ) 1015 __field( umode_t, mode )
1018 __field( int, is_metadata ) 1016 __field( int, is_metadata )
1019 __field( __u64, block ) 1017 __field( __u64, block )
1020 ), 1018 ),
1021 1019
1022 TP_fast_assign( 1020 TP_fast_assign(
1023 __entry->dev = inode->i_sb->s_dev; 1021 __entry->dev = inode->i_sb->s_dev;
1024 __entry->ino = inode->i_ino; 1022 __entry->ino = inode->i_ino;
1025 __entry->mode = inode->i_mode; 1023 __entry->mode = inode->i_mode;
1026 __entry->is_metadata = is_metadata; 1024 __entry->is_metadata = is_metadata;
1027 __entry->block = block; 1025 __entry->block = block;
1028 ), 1026 ),
1029 1027
1030 TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", 1028 TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
1031 MAJOR(__entry->dev), MINOR(__entry->dev), 1029 MAJOR(__entry->dev), MINOR(__entry->dev),
1032 (unsigned long) __entry->ino, 1030 (unsigned long) __entry->ino,
1033 __entry->mode, __entry->is_metadata, __entry->block) 1031 __entry->mode, __entry->is_metadata, __entry->block)
1034 ); 1032 );
1035 1033
1036 TRACE_EVENT(ext4_da_update_reserve_space, 1034 TRACE_EVENT(ext4_da_update_reserve_space,
1037 TP_PROTO(struct inode *inode, int used_blocks), 1035 TP_PROTO(struct inode *inode, int used_blocks),
1038 1036
1039 TP_ARGS(inode, used_blocks), 1037 TP_ARGS(inode, used_blocks),
1040 1038
1041 TP_STRUCT__entry( 1039 TP_STRUCT__entry(
1042 __field( dev_t, dev ) 1040 __field( dev_t, dev )
1043 __field( ino_t, ino ) 1041 __field( ino_t, ino )
1044 __field( umode_t, mode ) 1042 __field( umode_t, mode )
1045 __field( __u64, i_blocks ) 1043 __field( __u64, i_blocks )
1046 __field( int, used_blocks ) 1044 __field( int, used_blocks )
1047 __field( int, reserved_data_blocks ) 1045 __field( int, reserved_data_blocks )
1048 __field( int, reserved_meta_blocks ) 1046 __field( int, reserved_meta_blocks )
1049 __field( int, allocated_meta_blocks ) 1047 __field( int, allocated_meta_blocks )
1050 ), 1048 ),
1051 1049
1052 TP_fast_assign( 1050 TP_fast_assign(
1053 __entry->dev = inode->i_sb->s_dev; 1051 __entry->dev = inode->i_sb->s_dev;
1054 __entry->ino = inode->i_ino; 1052 __entry->ino = inode->i_ino;
1055 __entry->mode = inode->i_mode; 1053 __entry->mode = inode->i_mode;
1056 __entry->i_blocks = inode->i_blocks; 1054 __entry->i_blocks = inode->i_blocks;
1057 __entry->used_blocks = used_blocks; 1055 __entry->used_blocks = used_blocks;
1058 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1056 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
1059 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1057 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
1060 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; 1058 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
1061 ), 1059 ),
1062 1060
1063 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " 1061 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
1064 "reserved_data_blocks %d reserved_meta_blocks %d " 1062 "reserved_data_blocks %d reserved_meta_blocks %d "
1065 "allocated_meta_blocks %d", 1063 "allocated_meta_blocks %d",
1066 MAJOR(__entry->dev), MINOR(__entry->dev), 1064 MAJOR(__entry->dev), MINOR(__entry->dev),
1067 (unsigned long) __entry->ino, 1065 (unsigned long) __entry->ino,
1068 __entry->mode, __entry->i_blocks, 1066 __entry->mode, __entry->i_blocks,
1069 __entry->used_blocks, __entry->reserved_data_blocks, 1067 __entry->used_blocks, __entry->reserved_data_blocks,
1070 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) 1068 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
1071 ); 1069 );
1072 1070
1073 TRACE_EVENT(ext4_da_reserve_space, 1071 TRACE_EVENT(ext4_da_reserve_space,
1074 TP_PROTO(struct inode *inode, int md_needed), 1072 TP_PROTO(struct inode *inode, int md_needed),
1075 1073
1076 TP_ARGS(inode, md_needed), 1074 TP_ARGS(inode, md_needed),
1077 1075
1078 TP_STRUCT__entry( 1076 TP_STRUCT__entry(
1079 __field( dev_t, dev ) 1077 __field( dev_t, dev )
1080 __field( ino_t, ino ) 1078 __field( ino_t, ino )
1081 __field( umode_t, mode ) 1079 __field( umode_t, mode )
1082 __field( __u64, i_blocks ) 1080 __field( __u64, i_blocks )
1083 __field( int, md_needed ) 1081 __field( int, md_needed )
1084 __field( int, reserved_data_blocks ) 1082 __field( int, reserved_data_blocks )
1085 __field( int, reserved_meta_blocks ) 1083 __field( int, reserved_meta_blocks )
1086 ), 1084 ),
1087 1085
1088 TP_fast_assign( 1086 TP_fast_assign(
1089 __entry->dev = inode->i_sb->s_dev; 1087 __entry->dev = inode->i_sb->s_dev;
1090 __entry->ino = inode->i_ino; 1088 __entry->ino = inode->i_ino;
1091 __entry->mode = inode->i_mode; 1089 __entry->mode = inode->i_mode;
1092 __entry->i_blocks = inode->i_blocks; 1090 __entry->i_blocks = inode->i_blocks;
1093 __entry->md_needed = md_needed; 1091 __entry->md_needed = md_needed;
1094 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1092 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
1095 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1093 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
1096 ), 1094 ),
1097 1095
1098 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d " 1096 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d "
1099 "reserved_data_blocks %d reserved_meta_blocks %d", 1097 "reserved_data_blocks %d reserved_meta_blocks %d",
1100 MAJOR(__entry->dev), MINOR(__entry->dev), 1098 MAJOR(__entry->dev), MINOR(__entry->dev),
1101 (unsigned long) __entry->ino, 1099 (unsigned long) __entry->ino,
1102 __entry->mode, __entry->i_blocks, 1100 __entry->mode, __entry->i_blocks,
1103 __entry->md_needed, __entry->reserved_data_blocks, 1101 __entry->md_needed, __entry->reserved_data_blocks,
1104 __entry->reserved_meta_blocks) 1102 __entry->reserved_meta_blocks)
1105 ); 1103 );
1106 1104
1107 TRACE_EVENT(ext4_da_release_space, 1105 TRACE_EVENT(ext4_da_release_space,
1108 TP_PROTO(struct inode *inode, int freed_blocks), 1106 TP_PROTO(struct inode *inode, int freed_blocks),
1109 1107
1110 TP_ARGS(inode, freed_blocks), 1108 TP_ARGS(inode, freed_blocks),
1111 1109
1112 TP_STRUCT__entry( 1110 TP_STRUCT__entry(
1113 __field( dev_t, dev ) 1111 __field( dev_t, dev )
1114 __field( ino_t, ino ) 1112 __field( ino_t, ino )
1115 __field( umode_t, mode ) 1113 __field( umode_t, mode )
1116 __field( __u64, i_blocks ) 1114 __field( __u64, i_blocks )
1117 __field( int, freed_blocks ) 1115 __field( int, freed_blocks )
1118 __field( int, reserved_data_blocks ) 1116 __field( int, reserved_data_blocks )
1119 __field( int, reserved_meta_blocks ) 1117 __field( int, reserved_meta_blocks )
1120 __field( int, allocated_meta_blocks ) 1118 __field( int, allocated_meta_blocks )
1121 ), 1119 ),
1122 1120
1123 TP_fast_assign( 1121 TP_fast_assign(
1124 __entry->dev = inode->i_sb->s_dev; 1122 __entry->dev = inode->i_sb->s_dev;
1125 __entry->ino = inode->i_ino; 1123 __entry->ino = inode->i_ino;
1126 __entry->mode = inode->i_mode; 1124 __entry->mode = inode->i_mode;
1127 __entry->i_blocks = inode->i_blocks; 1125 __entry->i_blocks = inode->i_blocks;
1128 __entry->freed_blocks = freed_blocks; 1126 __entry->freed_blocks = freed_blocks;
1129 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1127 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
1130 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1128 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
1131 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; 1129 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
1132 ), 1130 ),
1133 1131
1134 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " 1132 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d "
1135 "reserved_data_blocks %d reserved_meta_blocks %d " 1133 "reserved_data_blocks %d reserved_meta_blocks %d "
1136 "allocated_meta_blocks %d", 1134 "allocated_meta_blocks %d",
1137 MAJOR(__entry->dev), MINOR(__entry->dev), 1135 MAJOR(__entry->dev), MINOR(__entry->dev),
1138 (unsigned long) __entry->ino, 1136 (unsigned long) __entry->ino,
1139 __entry->mode, __entry->i_blocks, 1137 __entry->mode, __entry->i_blocks,
1140 __entry->freed_blocks, __entry->reserved_data_blocks, 1138 __entry->freed_blocks, __entry->reserved_data_blocks,
1141 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) 1139 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
1142 ); 1140 );
1143 1141
1144 DECLARE_EVENT_CLASS(ext4__bitmap_load, 1142 DECLARE_EVENT_CLASS(ext4__bitmap_load,
1145 TP_PROTO(struct super_block *sb, unsigned long group), 1143 TP_PROTO(struct super_block *sb, unsigned long group),
1146 1144
1147 TP_ARGS(sb, group), 1145 TP_ARGS(sb, group),
1148 1146
1149 TP_STRUCT__entry( 1147 TP_STRUCT__entry(
1150 __field( dev_t, dev ) 1148 __field( dev_t, dev )
1151 __field( __u32, group ) 1149 __field( __u32, group )
1152 1150
1153 ), 1151 ),
1154 1152
1155 TP_fast_assign( 1153 TP_fast_assign(
1156 __entry->dev = sb->s_dev; 1154 __entry->dev = sb->s_dev;
1157 __entry->group = group; 1155 __entry->group = group;
1158 ), 1156 ),
1159 1157
1160 TP_printk("dev %d,%d group %u", 1158 TP_printk("dev %d,%d group %u",
1161 MAJOR(__entry->dev), MINOR(__entry->dev), 1159 MAJOR(__entry->dev), MINOR(__entry->dev),
1162 __entry->group) 1160 __entry->group)
1163 ); 1161 );
1164 1162
1165 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, 1163 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,
1166 1164
1167 TP_PROTO(struct super_block *sb, unsigned long group), 1165 TP_PROTO(struct super_block *sb, unsigned long group),
1168 1166
1169 TP_ARGS(sb, group) 1167 TP_ARGS(sb, group)
1170 ); 1168 );
1171 1169
1172 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, 1170 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load,
1173 1171
1174 TP_PROTO(struct super_block *sb, unsigned long group), 1172 TP_PROTO(struct super_block *sb, unsigned long group),
1175 1173
1176 TP_ARGS(sb, group) 1174 TP_ARGS(sb, group)
1177 ); 1175 );
1178 1176
1179 DEFINE_EVENT(ext4__bitmap_load, ext4_read_block_bitmap_load, 1177 DEFINE_EVENT(ext4__bitmap_load, ext4_read_block_bitmap_load,
1180 1178
1181 TP_PROTO(struct super_block *sb, unsigned long group), 1179 TP_PROTO(struct super_block *sb, unsigned long group),
1182 1180
1183 TP_ARGS(sb, group) 1181 TP_ARGS(sb, group)
1184 ); 1182 );
1185 1183
1186 DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, 1184 DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,
1187 1185
1188 TP_PROTO(struct super_block *sb, unsigned long group), 1186 TP_PROTO(struct super_block *sb, unsigned long group),
1189 1187
1190 TP_ARGS(sb, group) 1188 TP_ARGS(sb, group)
1191 ); 1189 );
1192 1190
1193 TRACE_EVENT(ext4_direct_IO_enter, 1191 TRACE_EVENT(ext4_direct_IO_enter,
1194 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), 1192 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw),
1195 1193
1196 TP_ARGS(inode, offset, len, rw), 1194 TP_ARGS(inode, offset, len, rw),
1197 1195
1198 TP_STRUCT__entry( 1196 TP_STRUCT__entry(
1199 __field( ino_t, ino ) 1197 __field( ino_t, ino )
1200 __field( dev_t, dev ) 1198 __field( dev_t, dev )
1201 __field( loff_t, pos ) 1199 __field( loff_t, pos )
1202 __field( unsigned long, len ) 1200 __field( unsigned long, len )
1203 __field( int, rw ) 1201 __field( int, rw )
1204 ), 1202 ),
1205 1203
1206 TP_fast_assign( 1204 TP_fast_assign(
1207 __entry->ino = inode->i_ino; 1205 __entry->ino = inode->i_ino;
1208 __entry->dev = inode->i_sb->s_dev; 1206 __entry->dev = inode->i_sb->s_dev;
1209 __entry->pos = offset; 1207 __entry->pos = offset;
1210 __entry->len = len; 1208 __entry->len = len;
1211 __entry->rw = rw; 1209 __entry->rw = rw;
1212 ), 1210 ),
1213 1211
1214 TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", 1212 TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d",
1215 MAJOR(__entry->dev), MINOR(__entry->dev), 1213 MAJOR(__entry->dev), MINOR(__entry->dev),
1216 (unsigned long) __entry->ino, 1214 (unsigned long) __entry->ino,
1217 __entry->pos, __entry->len, __entry->rw) 1215 __entry->pos, __entry->len, __entry->rw)
1218 ); 1216 );
1219 1217
1220 TRACE_EVENT(ext4_direct_IO_exit, 1218 TRACE_EVENT(ext4_direct_IO_exit,
1221 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, 1219 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
1222 int rw, int ret), 1220 int rw, int ret),
1223 1221
1224 TP_ARGS(inode, offset, len, rw, ret), 1222 TP_ARGS(inode, offset, len, rw, ret),
1225 1223
1226 TP_STRUCT__entry( 1224 TP_STRUCT__entry(
1227 __field( ino_t, ino ) 1225 __field( ino_t, ino )
1228 __field( dev_t, dev ) 1226 __field( dev_t, dev )
1229 __field( loff_t, pos ) 1227 __field( loff_t, pos )
1230 __field( unsigned long, len ) 1228 __field( unsigned long, len )
1231 __field( int, rw ) 1229 __field( int, rw )
1232 __field( int, ret ) 1230 __field( int, ret )
1233 ), 1231 ),
1234 1232
1235 TP_fast_assign( 1233 TP_fast_assign(
1236 __entry->ino = inode->i_ino; 1234 __entry->ino = inode->i_ino;
1237 __entry->dev = inode->i_sb->s_dev; 1235 __entry->dev = inode->i_sb->s_dev;
1238 __entry->pos = offset; 1236 __entry->pos = offset;
1239 __entry->len = len; 1237 __entry->len = len;
1240 __entry->rw = rw; 1238 __entry->rw = rw;
1241 __entry->ret = ret; 1239 __entry->ret = ret;
1242 ), 1240 ),
1243 1241
1244 TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", 1242 TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d",
1245 MAJOR(__entry->dev), MINOR(__entry->dev), 1243 MAJOR(__entry->dev), MINOR(__entry->dev),
1246 (unsigned long) __entry->ino, 1244 (unsigned long) __entry->ino,
1247 __entry->pos, __entry->len, 1245 __entry->pos, __entry->len,
1248 __entry->rw, __entry->ret) 1246 __entry->rw, __entry->ret)
1249 ); 1247 );
1250 1248
1251 TRACE_EVENT(ext4_fallocate_enter, 1249 TRACE_EVENT(ext4_fallocate_enter,
1252 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1250 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
1253 1251
1254 TP_ARGS(inode, offset, len, mode), 1252 TP_ARGS(inode, offset, len, mode),
1255 1253
1256 TP_STRUCT__entry( 1254 TP_STRUCT__entry(
1257 __field( ino_t, ino ) 1255 __field( ino_t, ino )
1258 __field( dev_t, dev ) 1256 __field( dev_t, dev )
1259 __field( loff_t, pos ) 1257 __field( loff_t, pos )
1260 __field( loff_t, len ) 1258 __field( loff_t, len )
1261 __field( int, mode ) 1259 __field( int, mode )
1262 ), 1260 ),
1263 1261
1264 TP_fast_assign( 1262 TP_fast_assign(
1265 __entry->ino = inode->i_ino; 1263 __entry->ino = inode->i_ino;
1266 __entry->dev = inode->i_sb->s_dev; 1264 __entry->dev = inode->i_sb->s_dev;
1267 __entry->pos = offset; 1265 __entry->pos = offset;
1268 __entry->len = len; 1266 __entry->len = len;
1269 __entry->mode = mode; 1267 __entry->mode = mode;
1270 ), 1268 ),
1271 1269
1272 TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", 1270 TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d",
1273 MAJOR(__entry->dev), MINOR(__entry->dev), 1271 MAJOR(__entry->dev), MINOR(__entry->dev),
1274 (unsigned long) __entry->ino, __entry->pos, 1272 (unsigned long) __entry->ino, __entry->pos,
1275 __entry->len, __entry->mode) 1273 __entry->len, __entry->mode)
1276 ); 1274 );
1277 1275
1278 TRACE_EVENT(ext4_fallocate_exit, 1276 TRACE_EVENT(ext4_fallocate_exit,
1279 TP_PROTO(struct inode *inode, loff_t offset, 1277 TP_PROTO(struct inode *inode, loff_t offset,
1280 unsigned int max_blocks, int ret), 1278 unsigned int max_blocks, int ret),
1281 1279
1282 TP_ARGS(inode, offset, max_blocks, ret), 1280 TP_ARGS(inode, offset, max_blocks, ret),
1283 1281
1284 TP_STRUCT__entry( 1282 TP_STRUCT__entry(
1285 __field( ino_t, ino ) 1283 __field( ino_t, ino )
1286 __field( dev_t, dev ) 1284 __field( dev_t, dev )
1287 __field( loff_t, pos ) 1285 __field( loff_t, pos )
1288 __field( unsigned int, blocks ) 1286 __field( unsigned int, blocks )
1289 __field( int, ret ) 1287 __field( int, ret )
1290 ), 1288 ),
1291 1289
1292 TP_fast_assign( 1290 TP_fast_assign(
1293 __entry->ino = inode->i_ino; 1291 __entry->ino = inode->i_ino;
1294 __entry->dev = inode->i_sb->s_dev; 1292 __entry->dev = inode->i_sb->s_dev;
1295 __entry->pos = offset; 1293 __entry->pos = offset;
1296 __entry->blocks = max_blocks; 1294 __entry->blocks = max_blocks;
1297 __entry->ret = ret; 1295 __entry->ret = ret;
1298 ), 1296 ),
1299 1297
1300 TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", 1298 TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
1301 MAJOR(__entry->dev), MINOR(__entry->dev), 1299 MAJOR(__entry->dev), MINOR(__entry->dev),
1302 (unsigned long) __entry->ino, 1300 (unsigned long) __entry->ino,
1303 __entry->pos, __entry->blocks, 1301 __entry->pos, __entry->blocks,
1304 __entry->ret) 1302 __entry->ret)
1305 ); 1303 );
1306 1304
1307 TRACE_EVENT(ext4_unlink_enter, 1305 TRACE_EVENT(ext4_unlink_enter,
1308 TP_PROTO(struct inode *parent, struct dentry *dentry), 1306 TP_PROTO(struct inode *parent, struct dentry *dentry),
1309 1307
1310 TP_ARGS(parent, dentry), 1308 TP_ARGS(parent, dentry),
1311 1309
1312 TP_STRUCT__entry( 1310 TP_STRUCT__entry(
1313 __field( ino_t, parent ) 1311 __field( ino_t, parent )
1314 __field( ino_t, ino ) 1312 __field( ino_t, ino )
1315 __field( loff_t, size ) 1313 __field( loff_t, size )
1316 __field( dev_t, dev ) 1314 __field( dev_t, dev )
1317 ), 1315 ),
1318 1316
1319 TP_fast_assign( 1317 TP_fast_assign(
1320 __entry->parent = parent->i_ino; 1318 __entry->parent = parent->i_ino;
1321 __entry->ino = dentry->d_inode->i_ino; 1319 __entry->ino = dentry->d_inode->i_ino;
1322 __entry->size = dentry->d_inode->i_size; 1320 __entry->size = dentry->d_inode->i_size;
1323 __entry->dev = dentry->d_inode->i_sb->s_dev; 1321 __entry->dev = dentry->d_inode->i_sb->s_dev;
1324 ), 1322 ),
1325 1323
1326 TP_printk("dev %d,%d ino %lu size %lld parent %lu", 1324 TP_printk("dev %d,%d ino %lu size %lld parent %lu",
1327 MAJOR(__entry->dev), MINOR(__entry->dev), 1325 MAJOR(__entry->dev), MINOR(__entry->dev),
1328 (unsigned long) __entry->ino, __entry->size, 1326 (unsigned long) __entry->ino, __entry->size,
1329 (unsigned long) __entry->parent) 1327 (unsigned long) __entry->parent)
1330 ); 1328 );
1331 1329
1332 TRACE_EVENT(ext4_unlink_exit, 1330 TRACE_EVENT(ext4_unlink_exit,
1333 TP_PROTO(struct dentry *dentry, int ret), 1331 TP_PROTO(struct dentry *dentry, int ret),
1334 1332
1335 TP_ARGS(dentry, ret), 1333 TP_ARGS(dentry, ret),
1336 1334
1337 TP_STRUCT__entry( 1335 TP_STRUCT__entry(
1338 __field( ino_t, ino ) 1336 __field( ino_t, ino )
1339 __field( dev_t, dev ) 1337 __field( dev_t, dev )
1340 __field( int, ret ) 1338 __field( int, ret )
1341 ), 1339 ),
1342 1340
1343 TP_fast_assign( 1341 TP_fast_assign(
1344 __entry->ino = dentry->d_inode->i_ino; 1342 __entry->ino = dentry->d_inode->i_ino;
1345 __entry->dev = dentry->d_inode->i_sb->s_dev; 1343 __entry->dev = dentry->d_inode->i_sb->s_dev;
1346 __entry->ret = ret; 1344 __entry->ret = ret;
1347 ), 1345 ),
1348 1346
1349 TP_printk("dev %d,%d ino %lu ret %d", 1347 TP_printk("dev %d,%d ino %lu ret %d",
1350 MAJOR(__entry->dev), MINOR(__entry->dev), 1348 MAJOR(__entry->dev), MINOR(__entry->dev),
1351 (unsigned long) __entry->ino, 1349 (unsigned long) __entry->ino,
1352 __entry->ret) 1350 __entry->ret)
1353 ); 1351 );
1354 1352
1355 DECLARE_EVENT_CLASS(ext4__truncate, 1353 DECLARE_EVENT_CLASS(ext4__truncate,
1356 TP_PROTO(struct inode *inode), 1354 TP_PROTO(struct inode *inode),
1357 1355
1358 TP_ARGS(inode), 1356 TP_ARGS(inode),
1359 1357
1360 TP_STRUCT__entry( 1358 TP_STRUCT__entry(
1361 __field( ino_t, ino ) 1359 __field( ino_t, ino )
1362 __field( dev_t, dev ) 1360 __field( dev_t, dev )
1363 __field( __u64, blocks ) 1361 __field( __u64, blocks )
1364 ), 1362 ),
1365 1363
1366 TP_fast_assign( 1364 TP_fast_assign(
1367 __entry->ino = inode->i_ino; 1365 __entry->ino = inode->i_ino;
1368 __entry->dev = inode->i_sb->s_dev; 1366 __entry->dev = inode->i_sb->s_dev;
1369 __entry->blocks = inode->i_blocks; 1367 __entry->blocks = inode->i_blocks;
1370 ), 1368 ),
1371 1369
1372 TP_printk("dev %d,%d ino %lu blocks %llu", 1370 TP_printk("dev %d,%d ino %lu blocks %llu",
1373 MAJOR(__entry->dev), MINOR(__entry->dev), 1371 MAJOR(__entry->dev), MINOR(__entry->dev),
1374 (unsigned long) __entry->ino, __entry->blocks) 1372 (unsigned long) __entry->ino, __entry->blocks)
1375 ); 1373 );
1376 1374
1377 DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, 1375 DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,
1378 1376
1379 TP_PROTO(struct inode *inode), 1377 TP_PROTO(struct inode *inode),
1380 1378
1381 TP_ARGS(inode) 1379 TP_ARGS(inode)
1382 ); 1380 );
1383 1381
1384 DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, 1382 DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
1385 1383
1386 TP_PROTO(struct inode *inode), 1384 TP_PROTO(struct inode *inode),
1387 1385
1388 TP_ARGS(inode) 1386 TP_ARGS(inode)
1389 ); 1387 );
1390 1388
1391 DECLARE_EVENT_CLASS(ext4__map_blocks_enter, 1389 DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
1392 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1390 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1393 unsigned int len, unsigned int flags), 1391 unsigned int len, unsigned int flags),
1394 1392
1395 TP_ARGS(inode, lblk, len, flags), 1393 TP_ARGS(inode, lblk, len, flags),
1396 1394
1397 TP_STRUCT__entry( 1395 TP_STRUCT__entry(
1398 __field( ino_t, ino ) 1396 __field( ino_t, ino )
1399 __field( dev_t, dev ) 1397 __field( dev_t, dev )
1400 __field( ext4_lblk_t, lblk ) 1398 __field( ext4_lblk_t, lblk )
1401 __field( unsigned int, len ) 1399 __field( unsigned int, len )
1402 __field( unsigned int, flags ) 1400 __field( unsigned int, flags )
1403 ), 1401 ),
1404 1402
1405 TP_fast_assign( 1403 TP_fast_assign(
1406 __entry->ino = inode->i_ino; 1404 __entry->ino = inode->i_ino;
1407 __entry->dev = inode->i_sb->s_dev; 1405 __entry->dev = inode->i_sb->s_dev;
1408 __entry->lblk = lblk; 1406 __entry->lblk = lblk;
1409 __entry->len = len; 1407 __entry->len = len;
1410 __entry->flags = flags; 1408 __entry->flags = flags;
1411 ), 1409 ),
1412 1410
1413 TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u", 1411 TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u",
1414 MAJOR(__entry->dev), MINOR(__entry->dev), 1412 MAJOR(__entry->dev), MINOR(__entry->dev),
1415 (unsigned long) __entry->ino, 1413 (unsigned long) __entry->ino,
1416 __entry->lblk, __entry->len, __entry->flags) 1414 __entry->lblk, __entry->len, __entry->flags)
1417 ); 1415 );
1418 1416
1419 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, 1417 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
1420 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1418 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1421 unsigned len, unsigned flags), 1419 unsigned len, unsigned flags),
1422 1420
1423 TP_ARGS(inode, lblk, len, flags) 1421 TP_ARGS(inode, lblk, len, flags)
1424 ); 1422 );
1425 1423
1426 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, 1424 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
1427 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1425 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1428 unsigned len, unsigned flags), 1426 unsigned len, unsigned flags),
1429 1427
1430 TP_ARGS(inode, lblk, len, flags) 1428 TP_ARGS(inode, lblk, len, flags)
1431 ); 1429 );
1432 1430
1433 DECLARE_EVENT_CLASS(ext4__map_blocks_exit, 1431 DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
1434 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1432 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1435 ext4_fsblk_t pblk, unsigned int len, int ret), 1433 ext4_fsblk_t pblk, unsigned int len, int ret),
1436 1434
1437 TP_ARGS(inode, lblk, pblk, len, ret), 1435 TP_ARGS(inode, lblk, pblk, len, ret),
1438 1436
1439 TP_STRUCT__entry( 1437 TP_STRUCT__entry(
1440 __field( ino_t, ino ) 1438 __field( ino_t, ino )
1441 __field( dev_t, dev ) 1439 __field( dev_t, dev )
1442 __field( ext4_lblk_t, lblk ) 1440 __field( ext4_lblk_t, lblk )
1443 __field( ext4_fsblk_t, pblk ) 1441 __field( ext4_fsblk_t, pblk )
1444 __field( unsigned int, len ) 1442 __field( unsigned int, len )
1445 __field( int, ret ) 1443 __field( int, ret )
1446 ), 1444 ),
1447 1445
1448 TP_fast_assign( 1446 TP_fast_assign(
1449 __entry->ino = inode->i_ino; 1447 __entry->ino = inode->i_ino;
1450 __entry->dev = inode->i_sb->s_dev; 1448 __entry->dev = inode->i_sb->s_dev;
1451 __entry->lblk = lblk; 1449 __entry->lblk = lblk;
1452 __entry->pblk = pblk; 1450 __entry->pblk = pblk;
1453 __entry->len = len; 1451 __entry->len = len;
1454 __entry->ret = ret; 1452 __entry->ret = ret;
1455 ), 1453 ),
1456 1454
1457 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", 1455 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d",
1458 MAJOR(__entry->dev), MINOR(__entry->dev), 1456 MAJOR(__entry->dev), MINOR(__entry->dev),
1459 (unsigned long) __entry->ino, 1457 (unsigned long) __entry->ino,
1460 __entry->lblk, __entry->pblk, 1458 __entry->lblk, __entry->pblk,
1461 __entry->len, __entry->ret) 1459 __entry->len, __entry->ret)
1462 ); 1460 );
1463 1461
1464 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, 1462 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
1465 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1463 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1466 ext4_fsblk_t pblk, unsigned len, int ret), 1464 ext4_fsblk_t pblk, unsigned len, int ret),
1467 1465
1468 TP_ARGS(inode, lblk, pblk, len, ret) 1466 TP_ARGS(inode, lblk, pblk, len, ret)
1469 ); 1467 );
1470 1468
1471 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, 1469 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
1472 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1470 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1473 ext4_fsblk_t pblk, unsigned len, int ret), 1471 ext4_fsblk_t pblk, unsigned len, int ret),
1474 1472
1475 TP_ARGS(inode, lblk, pblk, len, ret) 1473 TP_ARGS(inode, lblk, pblk, len, ret)
1476 ); 1474 );
1477 1475
1478 TRACE_EVENT(ext4_ext_load_extent, 1476 TRACE_EVENT(ext4_ext_load_extent,
1479 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), 1477 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk),
1480 1478
1481 TP_ARGS(inode, lblk, pblk), 1479 TP_ARGS(inode, lblk, pblk),
1482 1480
1483 TP_STRUCT__entry( 1481 TP_STRUCT__entry(
1484 __field( ino_t, ino ) 1482 __field( ino_t, ino )
1485 __field( dev_t, dev ) 1483 __field( dev_t, dev )
1486 __field( ext4_lblk_t, lblk ) 1484 __field( ext4_lblk_t, lblk )
1487 __field( ext4_fsblk_t, pblk ) 1485 __field( ext4_fsblk_t, pblk )
1488 ), 1486 ),
1489 1487
1490 TP_fast_assign( 1488 TP_fast_assign(
1491 __entry->ino = inode->i_ino; 1489 __entry->ino = inode->i_ino;
1492 __entry->dev = inode->i_sb->s_dev; 1490 __entry->dev = inode->i_sb->s_dev;
1493 __entry->lblk = lblk; 1491 __entry->lblk = lblk;
1494 __entry->pblk = pblk; 1492 __entry->pblk = pblk;
1495 ), 1493 ),
1496 1494
1497 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", 1495 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
1498 MAJOR(__entry->dev), MINOR(__entry->dev), 1496 MAJOR(__entry->dev), MINOR(__entry->dev),
1499 (unsigned long) __entry->ino, 1497 (unsigned long) __entry->ino,
1500 __entry->lblk, __entry->pblk) 1498 __entry->lblk, __entry->pblk)
1501 ); 1499 );
1502 1500
1503 TRACE_EVENT(ext4_load_inode, 1501 TRACE_EVENT(ext4_load_inode,
1504 TP_PROTO(struct inode *inode), 1502 TP_PROTO(struct inode *inode),
1505 1503
1506 TP_ARGS(inode), 1504 TP_ARGS(inode),
1507 1505
1508 TP_STRUCT__entry( 1506 TP_STRUCT__entry(
1509 __field( ino_t, ino ) 1507 __field( ino_t, ino )
1510 __field( dev_t, dev ) 1508 __field( dev_t, dev )
1511 ), 1509 ),
1512 1510
1513 TP_fast_assign( 1511 TP_fast_assign(
1514 __entry->ino = inode->i_ino; 1512 __entry->ino = inode->i_ino;
1515 __entry->dev = inode->i_sb->s_dev; 1513 __entry->dev = inode->i_sb->s_dev;
1516 ), 1514 ),
1517 1515
1518 TP_printk("dev %d,%d ino %ld", 1516 TP_printk("dev %d,%d ino %ld",
1519 MAJOR(__entry->dev), MINOR(__entry->dev), 1517 MAJOR(__entry->dev), MINOR(__entry->dev),
1520 (unsigned long) __entry->ino) 1518 (unsigned long) __entry->ino)
1521 ); 1519 );
1522 1520
1523 #endif /* _TRACE_EXT4_H */ 1521 #endif /* _TRACE_EXT4_H */
1524 1522
1525 /* This part must be outside protection */ 1523 /* This part must be outside protection */
1526 #include <trace/define_trace.h> 1524 #include <trace/define_trace.h>
1527 1525
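For context only: each TRACE_EVENT(name, ...) in the ext4 header above generates a trace_name() static call that the filesystem invokes at the matching point, after one compilation unit of ext4 instantiates the events with CREATE_TRACE_POINTS. The fragment below is a minimal illustrative sketch under that assumption; example_report_fsync_exit() is a hypothetical caller, not a hunk from this series.

	#define CREATE_TRACE_POINTS	/* defined once, in a single ext4 .c file, to instantiate the events */
	#include <trace/events/ext4.h>

	/* hypothetical call site: fire the exit event with the inode and fsync's return value */
	static int example_report_fsync_exit(struct inode *inode, int ret)
	{
		trace_ext4_sync_file_exit(inode, ret);	/* no-op unless the event is enabled */
		return ret;
	}
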
include/trace/events/writeback.h
1 #undef TRACE_SYSTEM 1 #undef TRACE_SYSTEM
2 #define TRACE_SYSTEM writeback 2 #define TRACE_SYSTEM writeback
3 3
4 #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) 4 #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
5 #define _TRACE_WRITEBACK_H 5 #define _TRACE_WRITEBACK_H
6 6
7 #include <linux/backing-dev.h> 7 #include <linux/backing-dev.h>
8 #include <linux/device.h> 8 #include <linux/device.h>
9 #include <linux/writeback.h> 9 #include <linux/writeback.h>
10 10
11 #define show_inode_state(state) \
12 __print_flags(state, "|", \
13 {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \
14 {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \
15 {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \
16 {I_NEW, "I_NEW"}, \
17 {I_WILL_FREE, "I_WILL_FREE"}, \
18 {I_FREEING, "I_FREEING"}, \
19 {I_CLEAR, "I_CLEAR"}, \
20 {I_SYNC, "I_SYNC"}, \
21 {I_REFERENCED, "I_REFERENCED"} \
22 )
23
11 struct wb_writeback_work; 24 struct wb_writeback_work;
12 25
13 DECLARE_EVENT_CLASS(writeback_work_class, 26 DECLARE_EVENT_CLASS(writeback_work_class,
14 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), 27 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
15 TP_ARGS(bdi, work), 28 TP_ARGS(bdi, work),
16 TP_STRUCT__entry( 29 TP_STRUCT__entry(
17 __array(char, name, 32) 30 __array(char, name, 32)
18 __field(long, nr_pages) 31 __field(long, nr_pages)
19 __field(dev_t, sb_dev) 32 __field(dev_t, sb_dev)
20 __field(int, sync_mode) 33 __field(int, sync_mode)
21 __field(int, for_kupdate) 34 __field(int, for_kupdate)
22 __field(int, range_cyclic) 35 __field(int, range_cyclic)
23 __field(int, for_background) 36 __field(int, for_background)
24 ), 37 ),
25 TP_fast_assign( 38 TP_fast_assign(
26 strncpy(__entry->name, dev_name(bdi->dev), 32); 39 strncpy(__entry->name, dev_name(bdi->dev), 32);
27 __entry->nr_pages = work->nr_pages; 40 __entry->nr_pages = work->nr_pages;
28 __entry->sb_dev = work->sb ? work->sb->s_dev : 0; 41 __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
29 __entry->sync_mode = work->sync_mode; 42 __entry->sync_mode = work->sync_mode;
30 __entry->for_kupdate = work->for_kupdate; 43 __entry->for_kupdate = work->for_kupdate;
31 __entry->range_cyclic = work->range_cyclic; 44 __entry->range_cyclic = work->range_cyclic;
32 __entry->for_background = work->for_background; 45 __entry->for_background = work->for_background;
33 ), 46 ),
34 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " 47 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
35 "kupdate=%d range_cyclic=%d background=%d", 48 "kupdate=%d range_cyclic=%d background=%d",
36 __entry->name, 49 __entry->name,
37 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), 50 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
38 __entry->nr_pages, 51 __entry->nr_pages,
39 __entry->sync_mode, 52 __entry->sync_mode,
40 __entry->for_kupdate, 53 __entry->for_kupdate,
41 __entry->range_cyclic, 54 __entry->range_cyclic,
42 __entry->for_background 55 __entry->for_background
43 ) 56 )
44 ); 57 );
45 #define DEFINE_WRITEBACK_WORK_EVENT(name) \ 58 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
46 DEFINE_EVENT(writeback_work_class, name, \ 59 DEFINE_EVENT(writeback_work_class, name, \
47 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ 60 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
48 TP_ARGS(bdi, work)) 61 TP_ARGS(bdi, work))
49 DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); 62 DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
50 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); 63 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
51 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); 64 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
65 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
66 DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
67 DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);
52 68
53 TRACE_EVENT(writeback_pages_written, 69 TRACE_EVENT(writeback_pages_written,
54 TP_PROTO(long pages_written), 70 TP_PROTO(long pages_written),
55 TP_ARGS(pages_written), 71 TP_ARGS(pages_written),
56 TP_STRUCT__entry( 72 TP_STRUCT__entry(
57 __field(long, pages) 73 __field(long, pages)
58 ), 74 ),
59 TP_fast_assign( 75 TP_fast_assign(
60 __entry->pages = pages_written; 76 __entry->pages = pages_written;
61 ), 77 ),
62 TP_printk("%ld", __entry->pages) 78 TP_printk("%ld", __entry->pages)
63 ); 79 );
64 80
65 DECLARE_EVENT_CLASS(writeback_class, 81 DECLARE_EVENT_CLASS(writeback_class,
66 TP_PROTO(struct backing_dev_info *bdi), 82 TP_PROTO(struct backing_dev_info *bdi),
67 TP_ARGS(bdi), 83 TP_ARGS(bdi),
68 TP_STRUCT__entry( 84 TP_STRUCT__entry(
69 __array(char, name, 32) 85 __array(char, name, 32)
70 ), 86 ),
71 TP_fast_assign( 87 TP_fast_assign(
72 strncpy(__entry->name, dev_name(bdi->dev), 32); 88 strncpy(__entry->name, dev_name(bdi->dev), 32);
73 ), 89 ),
74 TP_printk("bdi %s", 90 TP_printk("bdi %s",
75 __entry->name 91 __entry->name
76 ) 92 )
77 ); 93 );
78 #define DEFINE_WRITEBACK_EVENT(name) \ 94 #define DEFINE_WRITEBACK_EVENT(name) \
79 DEFINE_EVENT(writeback_class, name, \ 95 DEFINE_EVENT(writeback_class, name, \
80 TP_PROTO(struct backing_dev_info *bdi), \ 96 TP_PROTO(struct backing_dev_info *bdi), \
81 TP_ARGS(bdi)) 97 TP_ARGS(bdi))
82 98
83 DEFINE_WRITEBACK_EVENT(writeback_nowork); 99 DEFINE_WRITEBACK_EVENT(writeback_nowork);
84 DEFINE_WRITEBACK_EVENT(writeback_wake_background); 100 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
85 DEFINE_WRITEBACK_EVENT(writeback_wake_thread); 101 DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
86 DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); 102 DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
87 DEFINE_WRITEBACK_EVENT(writeback_bdi_register); 103 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
88 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); 104 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
89 DEFINE_WRITEBACK_EVENT(writeback_thread_start); 105 DEFINE_WRITEBACK_EVENT(writeback_thread_start);
90 DEFINE_WRITEBACK_EVENT(writeback_thread_stop); 106 DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
107 DEFINE_WRITEBACK_EVENT(balance_dirty_start);
108 DEFINE_WRITEBACK_EVENT(balance_dirty_wait);
91 109
110 TRACE_EVENT(balance_dirty_written,
111
112 TP_PROTO(struct backing_dev_info *bdi, int written),
113
114 TP_ARGS(bdi, written),
115
116 TP_STRUCT__entry(
117 __array(char, name, 32)
118 __field(int, written)
119 ),
120
121 TP_fast_assign(
122 strncpy(__entry->name, dev_name(bdi->dev), 32);
123 __entry->written = written;
124 ),
125
126 TP_printk("bdi %s written %d",
127 __entry->name,
128 __entry->written
129 )
130 );
131
92 DECLARE_EVENT_CLASS(wbc_class, 132 DECLARE_EVENT_CLASS(wbc_class,
93 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), 133 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
94 TP_ARGS(wbc, bdi), 134 TP_ARGS(wbc, bdi),
95 TP_STRUCT__entry( 135 TP_STRUCT__entry(
96 __array(char, name, 32) 136 __array(char, name, 32)
97 __field(long, nr_to_write) 137 __field(long, nr_to_write)
98 __field(long, pages_skipped) 138 __field(long, pages_skipped)
99 __field(int, sync_mode) 139 __field(int, sync_mode)
100 __field(int, for_kupdate) 140 __field(int, for_kupdate)
101 __field(int, for_background) 141 __field(int, for_background)
102 __field(int, for_reclaim) 142 __field(int, for_reclaim)
103 __field(int, range_cyclic) 143 __field(int, range_cyclic)
104 __field(int, more_io)
105 __field(unsigned long, older_than_this)
106 __field(long, range_start) 144 __field(long, range_start)
107 __field(long, range_end) 145 __field(long, range_end)
108 ), 146 ),
109 147
110 TP_fast_assign( 148 TP_fast_assign(
111 strncpy(__entry->name, dev_name(bdi->dev), 32); 149 strncpy(__entry->name, dev_name(bdi->dev), 32);
112 __entry->nr_to_write = wbc->nr_to_write; 150 __entry->nr_to_write = wbc->nr_to_write;
113 __entry->pages_skipped = wbc->pages_skipped; 151 __entry->pages_skipped = wbc->pages_skipped;
114 __entry->sync_mode = wbc->sync_mode; 152 __entry->sync_mode = wbc->sync_mode;
115 __entry->for_kupdate = wbc->for_kupdate; 153 __entry->for_kupdate = wbc->for_kupdate;
116 __entry->for_background = wbc->for_background; 154 __entry->for_background = wbc->for_background;
117 __entry->for_reclaim = wbc->for_reclaim; 155 __entry->for_reclaim = wbc->for_reclaim;
118 __entry->range_cyclic = wbc->range_cyclic; 156 __entry->range_cyclic = wbc->range_cyclic;
119 __entry->more_io = wbc->more_io;
120 __entry->older_than_this = wbc->older_than_this ?
121 *wbc->older_than_this : 0;
122 __entry->range_start = (long)wbc->range_start; 157 __entry->range_start = (long)wbc->range_start;
123 __entry->range_end = (long)wbc->range_end; 158 __entry->range_end = (long)wbc->range_end;
124 ), 159 ),
125 160
126 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " 161 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
127 "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " 162 "bgrd=%d reclm=%d cyclic=%d "
128 "start=0x%lx end=0x%lx", 163 "start=0x%lx end=0x%lx",
129 __entry->name, 164 __entry->name,
130 __entry->nr_to_write, 165 __entry->nr_to_write,
131 __entry->pages_skipped, 166 __entry->pages_skipped,
132 __entry->sync_mode, 167 __entry->sync_mode,
133 __entry->for_kupdate, 168 __entry->for_kupdate,
134 __entry->for_background, 169 __entry->for_background,
135 __entry->for_reclaim, 170 __entry->for_reclaim,
136 __entry->range_cyclic, 171 __entry->range_cyclic,
137 __entry->more_io,
138 __entry->older_than_this,
139 __entry->range_start, 172 __entry->range_start,
140 __entry->range_end) 173 __entry->range_end)
141 ) 174 )
142 175
143 #define DEFINE_WBC_EVENT(name) \ 176 #define DEFINE_WBC_EVENT(name) \
144 DEFINE_EVENT(wbc_class, name, \ 177 DEFINE_EVENT(wbc_class, name, \
145 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ 178 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
146 TP_ARGS(wbc, bdi)) 179 TP_ARGS(wbc, bdi))
147 DEFINE_WBC_EVENT(wbc_writeback_start);
148 DEFINE_WBC_EVENT(wbc_writeback_written);
149 DEFINE_WBC_EVENT(wbc_writeback_wait);
150 DEFINE_WBC_EVENT(wbc_balance_dirty_start);
151 DEFINE_WBC_EVENT(wbc_balance_dirty_written);
152 DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
153 DEFINE_WBC_EVENT(wbc_writepage); 180 DEFINE_WBC_EVENT(wbc_writepage);
154 181
182 TRACE_EVENT(writeback_queue_io,
183 TP_PROTO(struct bdi_writeback *wb,
184 unsigned long *older_than_this,
185 int moved),
186 TP_ARGS(wb, older_than_this, moved),
187 TP_STRUCT__entry(
188 __array(char, name, 32)
189 __field(unsigned long, older)
190 __field(long, age)
191 __field(int, moved)
192 ),
193 TP_fast_assign(
194 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
195 __entry->older = older_than_this ? *older_than_this : 0;
196 __entry->age = older_than_this ?
197 (jiffies - *older_than_this) * 1000 / HZ : -1;
198 __entry->moved = moved;
199 ),
200 TP_printk("bdi %s: older=%lu age=%ld enqueue=%d",
201 __entry->name,
202 __entry->older, /* older_than_this in jiffies */
203 __entry->age, /* older_than_this in relative milliseconds */
204 __entry->moved)
205 );
206
207 TRACE_EVENT(global_dirty_state,
208
209 TP_PROTO(unsigned long background_thresh,
210 unsigned long dirty_thresh
211 ),
212
213 TP_ARGS(background_thresh,
214 dirty_thresh
215 ),
216
217 TP_STRUCT__entry(
218 __field(unsigned long, nr_dirty)
219 __field(unsigned long, nr_writeback)
220 __field(unsigned long, nr_unstable)
221 __field(unsigned long, background_thresh)
222 __field(unsigned long, dirty_thresh)
223 __field(unsigned long, dirty_limit)
224 __field(unsigned long, nr_dirtied)
225 __field(unsigned long, nr_written)
226 ),
227
228 TP_fast_assign(
229 __entry->nr_dirty = global_page_state(NR_FILE_DIRTY);
230 __entry->nr_writeback = global_page_state(NR_WRITEBACK);
231 __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
232 __entry->nr_dirtied = global_page_state(NR_DIRTIED);
233 __entry->nr_written = global_page_state(NR_WRITTEN);
234 __entry->background_thresh = background_thresh;
235 __entry->dirty_thresh = dirty_thresh;
236 __entry->dirty_limit = global_dirty_limit;
237 ),
238
239 TP_printk("dirty=%lu writeback=%lu unstable=%lu "
240 "bg_thresh=%lu thresh=%lu limit=%lu "
241 "dirtied=%lu written=%lu",
242 __entry->nr_dirty,
243 __entry->nr_writeback,
244 __entry->nr_unstable,
245 __entry->background_thresh,
246 __entry->dirty_thresh,
247 __entry->dirty_limit,
248 __entry->nr_dirtied,
249 __entry->nr_written
250 )
251 );
252
155 DECLARE_EVENT_CLASS(writeback_congest_waited_template, 253 DECLARE_EVENT_CLASS(writeback_congest_waited_template,
156 254
157 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), 255 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
158 256
159 TP_ARGS(usec_timeout, usec_delayed), 257 TP_ARGS(usec_timeout, usec_delayed),
160 258
161 TP_STRUCT__entry( 259 TP_STRUCT__entry(
162 __field( unsigned int, usec_timeout ) 260 __field( unsigned int, usec_timeout )
163 __field( unsigned int, usec_delayed ) 261 __field( unsigned int, usec_delayed )
164 ), 262 ),
165 263
166 TP_fast_assign( 264 TP_fast_assign(
167 __entry->usec_timeout = usec_timeout; 265 __entry->usec_timeout = usec_timeout;
168 __entry->usec_delayed = usec_delayed; 266 __entry->usec_delayed = usec_delayed;
169 ), 267 ),
170 268
171 TP_printk("usec_timeout=%u usec_delayed=%u", 269 TP_printk("usec_timeout=%u usec_delayed=%u",
172 __entry->usec_timeout, 270 __entry->usec_timeout,
173 __entry->usec_delayed) 271 __entry->usec_delayed)
174 ); 272 );
175 273
176 DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, 274 DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
177 275
178 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), 276 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
179 277
180 TP_ARGS(usec_timeout, usec_delayed) 278 TP_ARGS(usec_timeout, usec_delayed)
181 ); 279 );
182 280
183 DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, 281 DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
184 282
185 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), 283 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
186 284
187 TP_ARGS(usec_timeout, usec_delayed) 285 TP_ARGS(usec_timeout, usec_delayed)
286 );
287
288 DECLARE_EVENT_CLASS(writeback_single_inode_template,
289
290 TP_PROTO(struct inode *inode,
291 struct writeback_control *wbc,
292 unsigned long nr_to_write
293 ),
294
295 TP_ARGS(inode, wbc, nr_to_write),
296
297 TP_STRUCT__entry(
298 __array(char, name, 32)
299 __field(unsigned long, ino)
300 __field(unsigned long, state)
301 __field(unsigned long, age)
302 __field(unsigned long, writeback_index)
303 __field(long, nr_to_write)
304 __field(unsigned long, wrote)
305 ),
306
307 TP_fast_assign(
308 strncpy(__entry->name,
309 dev_name(inode->i_mapping->backing_dev_info->dev), 32);
310 __entry->ino = inode->i_ino;
311 __entry->state = inode->i_state;
312 __entry->age = (jiffies - inode->dirtied_when) *
313 1000 / HZ;
314 __entry->writeback_index = inode->i_mapping->writeback_index;
315 __entry->nr_to_write = nr_to_write;
316 __entry->wrote = nr_to_write - wbc->nr_to_write;
317 ),
318
319 TP_printk("bdi %s: ino=%lu state=%s age=%lu "
320 "index=%lu to_write=%ld wrote=%lu",
321 __entry->name,
322 __entry->ino,
323 show_inode_state(__entry->state),
324 __entry->age,
325 __entry->writeback_index,
326 __entry->nr_to_write,
327 __entry->wrote
328 )
329 );
330
331 DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue,
332 TP_PROTO(struct inode *inode,
333 struct writeback_control *wbc,
334 unsigned long nr_to_write),
335 TP_ARGS(inode, wbc, nr_to_write)
336 );
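For context only: the new writeback_queue_io event above takes the bdi_writeback, the optional expiry cutoff, and the number of inodes moved onto b_io, matching its TP_PROTO. The sketch below is a hypothetical wrapper showing the intended call shape; the real hook sits in the fs-writeback queue_io() path and may differ in detail.

	#include <trace/events/writeback.h>

	/* hypothetical helper: report how many inodes one queue_io() pass moved to b_io */
	static void example_trace_queue_io(struct bdi_writeback *wb,
					   unsigned long *older_than_this,
					   int moved)
	{
		trace_writeback_queue_io(wb, older_than_this, moved);
	}
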
mm/backing-dev.c
 1 1
2 #include <linux/wait.h> 2 #include <linux/wait.h>
3 #include <linux/backing-dev.h> 3 #include <linux/backing-dev.h>
4 #include <linux/kthread.h> 4 #include <linux/kthread.h>
5 #include <linux/freezer.h> 5 #include <linux/freezer.h>
6 #include <linux/fs.h> 6 #include <linux/fs.h>
7 #include <linux/pagemap.h> 7 #include <linux/pagemap.h>
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/sched.h> 9 #include <linux/sched.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/writeback.h> 11 #include <linux/writeback.h>
12 #include <linux/device.h> 12 #include <linux/device.h>
13 #include <trace/events/writeback.h> 13 #include <trace/events/writeback.h>
14 14
15 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
16 16
17 struct backing_dev_info default_backing_dev_info = { 17 struct backing_dev_info default_backing_dev_info = {
18 .name = "default", 18 .name = "default",
19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
20 .state = 0, 20 .state = 0,
21 .capabilities = BDI_CAP_MAP_COPY, 21 .capabilities = BDI_CAP_MAP_COPY,
22 }; 22 };
23 EXPORT_SYMBOL_GPL(default_backing_dev_info); 23 EXPORT_SYMBOL_GPL(default_backing_dev_info);
24 24
25 struct backing_dev_info noop_backing_dev_info = { 25 struct backing_dev_info noop_backing_dev_info = {
26 .name = "noop", 26 .name = "noop",
27 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 27 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
28 }; 28 };
29 EXPORT_SYMBOL_GPL(noop_backing_dev_info); 29 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
30 30
31 static struct class *bdi_class; 31 static struct class *bdi_class;
32 32
33 /* 33 /*
34 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as 34 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
35 * reader side protection for bdi_pending_list. bdi_list has RCU reader side 35 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
36 * locking. 36 * locking.
37 */ 37 */
38 DEFINE_SPINLOCK(bdi_lock); 38 DEFINE_SPINLOCK(bdi_lock);
39 LIST_HEAD(bdi_list); 39 LIST_HEAD(bdi_list);
40 LIST_HEAD(bdi_pending_list); 40 LIST_HEAD(bdi_pending_list);
41 41
42 static struct task_struct *sync_supers_tsk; 42 static struct task_struct *sync_supers_tsk;
43 static struct timer_list sync_supers_timer; 43 static struct timer_list sync_supers_timer;
44 44
45 static int bdi_sync_supers(void *); 45 static int bdi_sync_supers(void *);
46 static void sync_supers_timer_fn(unsigned long); 46 static void sync_supers_timer_fn(unsigned long);
47 47
48 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49 {
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57 }
58
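bdi_lock_two() gives every caller the same lock order by comparing the two bdi_writeback pointers, so two tasks that each need both per-wb list locks can never deadlock against one another. A minimal sketch of the intended pattern (move_dirty_inode() is hypothetical; only the lock/unlock discipline is the point):

/* sketch: take both list locks in a stable order, then move an inode */
static void move_dirty_inode(struct bdi_writeback *src,
			     struct bdi_writeback *dst,
			     struct inode *inode)
{
	bdi_lock_two(src, dst);			/* lower address is locked first */
	list_move(&inode->i_wb_list, &dst->b_dirty);
	spin_unlock(&src->list_lock);		/* unlock order does not matter */
	spin_unlock(&dst->list_lock);
}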
48 #ifdef CONFIG_DEBUG_FS 59 #ifdef CONFIG_DEBUG_FS
49 #include <linux/debugfs.h> 60 #include <linux/debugfs.h>
50 #include <linux/seq_file.h> 61 #include <linux/seq_file.h>
51 62
52 static struct dentry *bdi_debug_root; 63 static struct dentry *bdi_debug_root;
53 64
54 static void bdi_debug_init(void) 65 static void bdi_debug_init(void)
55 { 66 {
56 bdi_debug_root = debugfs_create_dir("bdi", NULL); 67 bdi_debug_root = debugfs_create_dir("bdi", NULL);
57 } 68 }
58 69
59 static int bdi_debug_stats_show(struct seq_file *m, void *v) 70 static int bdi_debug_stats_show(struct seq_file *m, void *v)
60 { 71 {
61 struct backing_dev_info *bdi = m->private; 72 struct backing_dev_info *bdi = m->private;
62 struct bdi_writeback *wb = &bdi->wb; 73 struct bdi_writeback *wb = &bdi->wb;
63 unsigned long background_thresh; 74 unsigned long background_thresh;
64 unsigned long dirty_thresh; 75 unsigned long dirty_thresh;
65 unsigned long bdi_thresh; 76 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io; 77 unsigned long nr_dirty, nr_io, nr_more_io;
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82 #define K(x) ((x) << (PAGE_SHIFT - 10)) 93 #define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiWritten: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWriteBandwidth: %10lu kBps\n"
91 "b_more_io: %8lu\n" 102 "b_dirty: %10lu\n"
92 "bdi_list: %8u\n" 103 "b_io: %10lu\n"
93 "state: %8lx\n", 104 "b_more_io: %10lu\n"
105 "bdi_list: %10u\n"
106 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 108 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 109 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 110 K(dirty_thresh),
111 K(background_thresh),
112 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
113 (unsigned long) K(bdi->write_bandwidth),
114 nr_dirty,
115 nr_io,
116 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 117 !list_empty(&bdi->bdi_list), bdi->state);
99 #undef K 118 #undef K
100 119
101 return 0; 120 return 0;
102 } 121 }
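With the widened %10lu fields and the new BdiWritten/BdiWriteBandwidth lines, reading a device's stats file (e.g. /sys/kernel/debug/bdi/8:0/stats, with debugfs mounted in the usual place) yields output shaped like the following; all values here are purely illustrative:

BdiWriteback:              128 kB
BdiReclaimable:          12288 kB
BdiDirtyThresh:          65536 kB
DirtyThresh:            262144 kB
BackgroundThresh:       131072 kB
BdiWritten:            4194304 kB
BdiWriteBandwidth:      102400 kBps
b_dirty:                    17
b_io:                        0
b_more_io:                   2
bdi_list:                    1
state:                       8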
103 122
104 static int bdi_debug_stats_open(struct inode *inode, struct file *file) 123 static int bdi_debug_stats_open(struct inode *inode, struct file *file)
105 { 124 {
106 return single_open(file, bdi_debug_stats_show, inode->i_private); 125 return single_open(file, bdi_debug_stats_show, inode->i_private);
107 } 126 }
108 127
109 static const struct file_operations bdi_debug_stats_fops = { 128 static const struct file_operations bdi_debug_stats_fops = {
110 .open = bdi_debug_stats_open, 129 .open = bdi_debug_stats_open,
111 .read = seq_read, 130 .read = seq_read,
112 .llseek = seq_lseek, 131 .llseek = seq_lseek,
113 .release = single_release, 132 .release = single_release,
114 }; 133 };
115 134
116 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) 135 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
117 { 136 {
118 bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); 137 bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
119 bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir, 138 bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
120 bdi, &bdi_debug_stats_fops); 139 bdi, &bdi_debug_stats_fops);
121 } 140 }
122 141
123 static void bdi_debug_unregister(struct backing_dev_info *bdi) 142 static void bdi_debug_unregister(struct backing_dev_info *bdi)
124 { 143 {
125 debugfs_remove(bdi->debug_stats); 144 debugfs_remove(bdi->debug_stats);
126 debugfs_remove(bdi->debug_dir); 145 debugfs_remove(bdi->debug_dir);
127 } 146 }
128 #else 147 #else
129 static inline void bdi_debug_init(void) 148 static inline void bdi_debug_init(void)
130 { 149 {
131 } 150 }
132 static inline void bdi_debug_register(struct backing_dev_info *bdi, 151 static inline void bdi_debug_register(struct backing_dev_info *bdi,
133 const char *name) 152 const char *name)
134 { 153 {
135 } 154 }
136 static inline void bdi_debug_unregister(struct backing_dev_info *bdi) 155 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
137 { 156 {
138 } 157 }
139 #endif 158 #endif
140 159
141 static ssize_t read_ahead_kb_store(struct device *dev, 160 static ssize_t read_ahead_kb_store(struct device *dev,
142 struct device_attribute *attr, 161 struct device_attribute *attr,
143 const char *buf, size_t count) 162 const char *buf, size_t count)
144 { 163 {
145 struct backing_dev_info *bdi = dev_get_drvdata(dev); 164 struct backing_dev_info *bdi = dev_get_drvdata(dev);
146 char *end; 165 char *end;
147 unsigned long read_ahead_kb; 166 unsigned long read_ahead_kb;
148 ssize_t ret = -EINVAL; 167 ssize_t ret = -EINVAL;
149 168
150 read_ahead_kb = simple_strtoul(buf, &end, 10); 169 read_ahead_kb = simple_strtoul(buf, &end, 10);
151 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { 170 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
152 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); 171 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
153 ret = count; 172 ret = count;
154 } 173 }
155 return ret; 174 return ret;
156 } 175 }
157 176
158 #define K(pages) ((pages) << (PAGE_SHIFT - 10)) 177 #define K(pages) ((pages) << (PAGE_SHIFT - 10))
159 178
160 #define BDI_SHOW(name, expr) \ 179 #define BDI_SHOW(name, expr) \
161 static ssize_t name##_show(struct device *dev, \ 180 static ssize_t name##_show(struct device *dev, \
162 struct device_attribute *attr, char *page) \ 181 struct device_attribute *attr, char *page) \
163 { \ 182 { \
164 struct backing_dev_info *bdi = dev_get_drvdata(dev); \ 183 struct backing_dev_info *bdi = dev_get_drvdata(dev); \
165 \ 184 \
166 return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ 185 return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
167 } 186 }
168 187
169 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) 188 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
170 189
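For reference, the BDI_SHOW() invocation above expands to roughly the following (whitespace aside); this is the function the bdi_dev_attrs table further down installs as the read_ahead_kb sysfs read handler:

/* approximate expansion of BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) */
static ssize_t read_ahead_kb_show(struct device *dev,
				  struct device_attribute *attr, char *page)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return snprintf(page, PAGE_SIZE-1, "%lld\n",
			(long long)K(bdi->ra_pages));
}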
171 static ssize_t min_ratio_store(struct device *dev, 190 static ssize_t min_ratio_store(struct device *dev,
172 struct device_attribute *attr, const char *buf, size_t count) 191 struct device_attribute *attr, const char *buf, size_t count)
173 { 192 {
174 struct backing_dev_info *bdi = dev_get_drvdata(dev); 193 struct backing_dev_info *bdi = dev_get_drvdata(dev);
175 char *end; 194 char *end;
176 unsigned int ratio; 195 unsigned int ratio;
177 ssize_t ret = -EINVAL; 196 ssize_t ret = -EINVAL;
178 197
179 ratio = simple_strtoul(buf, &end, 10); 198 ratio = simple_strtoul(buf, &end, 10);
180 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { 199 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
181 ret = bdi_set_min_ratio(bdi, ratio); 200 ret = bdi_set_min_ratio(bdi, ratio);
182 if (!ret) 201 if (!ret)
183 ret = count; 202 ret = count;
184 } 203 }
185 return ret; 204 return ret;
186 } 205 }
187 BDI_SHOW(min_ratio, bdi->min_ratio) 206 BDI_SHOW(min_ratio, bdi->min_ratio)
188 207
189 static ssize_t max_ratio_store(struct device *dev, 208 static ssize_t max_ratio_store(struct device *dev,
190 struct device_attribute *attr, const char *buf, size_t count) 209 struct device_attribute *attr, const char *buf, size_t count)
191 { 210 {
192 struct backing_dev_info *bdi = dev_get_drvdata(dev); 211 struct backing_dev_info *bdi = dev_get_drvdata(dev);
193 char *end; 212 char *end;
194 unsigned int ratio; 213 unsigned int ratio;
195 ssize_t ret = -EINVAL; 214 ssize_t ret = -EINVAL;
196 215
197 ratio = simple_strtoul(buf, &end, 10); 216 ratio = simple_strtoul(buf, &end, 10);
198 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { 217 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
199 ret = bdi_set_max_ratio(bdi, ratio); 218 ret = bdi_set_max_ratio(bdi, ratio);
200 if (!ret) 219 if (!ret)
201 ret = count; 220 ret = count;
202 } 221 }
203 return ret; 222 return ret;
204 } 223 }
205 BDI_SHOW(max_ratio, bdi->max_ratio) 224 BDI_SHOW(max_ratio, bdi->max_ratio)
206 225
207 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) 226 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
208 227
209 static struct device_attribute bdi_dev_attrs[] = { 228 static struct device_attribute bdi_dev_attrs[] = {
210 __ATTR_RW(read_ahead_kb), 229 __ATTR_RW(read_ahead_kb),
211 __ATTR_RW(min_ratio), 230 __ATTR_RW(min_ratio),
212 __ATTR_RW(max_ratio), 231 __ATTR_RW(max_ratio),
213 __ATTR_NULL, 232 __ATTR_NULL,
214 }; 233 };
215 234
216 static __init int bdi_class_init(void) 235 static __init int bdi_class_init(void)
217 { 236 {
218 bdi_class = class_create(THIS_MODULE, "bdi"); 237 bdi_class = class_create(THIS_MODULE, "bdi");
219 if (IS_ERR(bdi_class)) 238 if (IS_ERR(bdi_class))
220 return PTR_ERR(bdi_class); 239 return PTR_ERR(bdi_class);
221 240
222 bdi_class->dev_attrs = bdi_dev_attrs; 241 bdi_class->dev_attrs = bdi_dev_attrs;
223 bdi_debug_init(); 242 bdi_debug_init();
224 return 0; 243 return 0;
225 } 244 }
226 postcore_initcall(bdi_class_init); 245 postcore_initcall(bdi_class_init);
227 246
228 static int __init default_bdi_init(void) 247 static int __init default_bdi_init(void)
229 { 248 {
230 int err; 249 int err;
231 250
232 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); 251 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
233 BUG_ON(IS_ERR(sync_supers_tsk)); 252 BUG_ON(IS_ERR(sync_supers_tsk));
234 253
235 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); 254 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
236 bdi_arm_supers_timer(); 255 bdi_arm_supers_timer();
237 256
238 err = bdi_init(&default_backing_dev_info); 257 err = bdi_init(&default_backing_dev_info);
239 if (!err) 258 if (!err)
240 bdi_register(&default_backing_dev_info, NULL, "default"); 259 bdi_register(&default_backing_dev_info, NULL, "default");
241 err = bdi_init(&noop_backing_dev_info); 260 err = bdi_init(&noop_backing_dev_info);
242 261
243 return err; 262 return err;
244 } 263 }
245 subsys_initcall(default_bdi_init); 264 subsys_initcall(default_bdi_init);
246 265
247 int bdi_has_dirty_io(struct backing_dev_info *bdi) 266 int bdi_has_dirty_io(struct backing_dev_info *bdi)
248 { 267 {
249 return wb_has_dirty_io(&bdi->wb); 268 return wb_has_dirty_io(&bdi->wb);
250 } 269 }
251 270
252 static void bdi_flush_io(struct backing_dev_info *bdi)
253 {
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262 }
263
264 /* 271 /*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 272 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 273 * or we risk deadlocking on ->s_umount. The longer term solution would be
267 * to implement sync_supers_bdi() or similar and simply do it from the 274 * to implement sync_supers_bdi() or similar and simply do it from the
268 * bdi writeback thread individually. 275 * bdi writeback thread individually.
269 */ 276 */
270 static int bdi_sync_supers(void *unused) 277 static int bdi_sync_supers(void *unused)
271 { 278 {
272 set_user_nice(current, 0); 279 set_user_nice(current, 0);
273 280
274 while (!kthread_should_stop()) { 281 while (!kthread_should_stop()) {
275 set_current_state(TASK_INTERRUPTIBLE); 282 set_current_state(TASK_INTERRUPTIBLE);
276 schedule(); 283 schedule();
277 284
278 /* 285 /*
279 * Do this periodically, like kupdated() did before. 286 * Do this periodically, like kupdated() did before.
280 */ 287 */
281 sync_supers(); 288 sync_supers();
282 } 289 }
283 290
284 return 0; 291 return 0;
285 } 292 }
286 293
287 void bdi_arm_supers_timer(void) 294 void bdi_arm_supers_timer(void)
288 { 295 {
289 unsigned long next; 296 unsigned long next;
290 297
291 if (!dirty_writeback_interval) 298 if (!dirty_writeback_interval)
292 return; 299 return;
293 300
294 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; 301 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
295 mod_timer(&sync_supers_timer, round_jiffies_up(next)); 302 mod_timer(&sync_supers_timer, round_jiffies_up(next));
296 } 303 }
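dirty_writeback_interval is kept in centiseconds, so the '* 10' converts it to milliseconds before msecs_to_jiffies(). With the typical default of 500 centiseconds that is 500 * 10 = 5000 ms, i.e. the supers timer re-arms roughly every five seconds, and round_jiffies_up() batches the expiry onto a whole-second boundary to avoid extra wakeups.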
297 304
298 static void sync_supers_timer_fn(unsigned long unused) 305 static void sync_supers_timer_fn(unsigned long unused)
299 { 306 {
300 wake_up_process(sync_supers_tsk); 307 wake_up_process(sync_supers_tsk);
301 bdi_arm_supers_timer(); 308 bdi_arm_supers_timer();
302 } 309 }
303 310
304 static void wakeup_timer_fn(unsigned long data) 311 static void wakeup_timer_fn(unsigned long data)
305 { 312 {
306 struct backing_dev_info *bdi = (struct backing_dev_info *)data; 313 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
307 314
308 spin_lock_bh(&bdi->wb_lock); 315 spin_lock_bh(&bdi->wb_lock);
309 if (bdi->wb.task) { 316 if (bdi->wb.task) {
310 trace_writeback_wake_thread(bdi); 317 trace_writeback_wake_thread(bdi);
311 wake_up_process(bdi->wb.task); 318 wake_up_process(bdi->wb.task);
312 } else { 319 } else {
313 /* 320 /*
 314 * When bdi tasks are inactive for a long time, they are killed. 321 * When bdi tasks are inactive for a long time, they are killed.
315 * In this case we have to wake-up the forker thread which 322 * In this case we have to wake-up the forker thread which
316 * should create and run the bdi thread. 323 * should create and run the bdi thread.
317 */ 324 */
318 trace_writeback_wake_forker_thread(bdi); 325 trace_writeback_wake_forker_thread(bdi);
319 wake_up_process(default_backing_dev_info.wb.task); 326 wake_up_process(default_backing_dev_info.wb.task);
320 } 327 }
321 spin_unlock_bh(&bdi->wb_lock); 328 spin_unlock_bh(&bdi->wb_lock);
322 } 329 }
323 330
324 /* 331 /*
325 * This function is used when the first inode for this bdi is marked dirty. It 332 * This function is used when the first inode for this bdi is marked dirty. It
326 * wakes-up the corresponding bdi thread which should then take care of the 333 * wakes-up the corresponding bdi thread which should then take care of the
327 * periodic background write-out of dirty inodes. Since the write-out would 334 * periodic background write-out of dirty inodes. Since the write-out would
 328 * start only 'dirty_writeback_interval' centisecs from now anyway, we just 335 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
329 * set up a timer which wakes the bdi thread up later. 336 * set up a timer which wakes the bdi thread up later.
330 * 337 *
331 * Note, we wouldn't bother setting up the timer, but this function is on the 338 * Note, we wouldn't bother setting up the timer, but this function is on the
 332 * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches 339 * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
333 * by delaying the wake-up. 340 * by delaying the wake-up.
334 */ 341 */
335 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) 342 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
336 { 343 {
337 unsigned long timeout; 344 unsigned long timeout;
338 345
339 timeout = msecs_to_jiffies(dirty_writeback_interval * 10); 346 timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
340 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); 347 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
341 } 348 }
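The comment above describes the fast path; for context, the dirtying code in fs/fs-writeback.c defers the flusher wakeup along these lines (a condensed sketch, not the literal call site):

	/* tail of __mark_inode_dirty(): first dirty inode on this bdi */
	if (wakeup_bdi)
		bdi_wakeup_thread_delayed(bdi);	/* arm wb.wakeup_timer, don't wake now */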
342 349
343 /* 350 /*
344 * Calculate the longest interval (jiffies) bdi threads are allowed to be 351 * Calculate the longest interval (jiffies) bdi threads are allowed to be
345 * inactive. 352 * inactive.
346 */ 353 */
347 static unsigned long bdi_longest_inactive(void) 354 static unsigned long bdi_longest_inactive(void)
348 { 355 {
349 unsigned long interval; 356 unsigned long interval;
350 357
351 interval = msecs_to_jiffies(dirty_writeback_interval * 10); 358 interval = msecs_to_jiffies(dirty_writeback_interval * 10);
352 return max(5UL * 60 * HZ, interval); 359 return max(5UL * 60 * HZ, interval);
353 } 360 }
354 361
355 static int bdi_forker_thread(void *ptr) 362 static int bdi_forker_thread(void *ptr)
356 { 363 {
357 struct bdi_writeback *me = ptr; 364 struct bdi_writeback *me = ptr;
358 365
359 current->flags |= PF_SWAPWRITE; 366 current->flags |= PF_SWAPWRITE;
360 set_freezable(); 367 set_freezable();
361 368
362 /* 369 /*
363 * Our parent may run at a different priority, just set us to normal 370 * Our parent may run at a different priority, just set us to normal
364 */ 371 */
365 set_user_nice(current, 0); 372 set_user_nice(current, 0);
366 373
367 for (;;) { 374 for (;;) {
368 struct task_struct *task = NULL; 375 struct task_struct *task = NULL;
369 struct backing_dev_info *bdi; 376 struct backing_dev_info *bdi;
370 enum { 377 enum {
371 NO_ACTION, /* Nothing to do */ 378 NO_ACTION, /* Nothing to do */
372 FORK_THREAD, /* Fork bdi thread */ 379 FORK_THREAD, /* Fork bdi thread */
373 KILL_THREAD, /* Kill inactive bdi thread */ 380 KILL_THREAD, /* Kill inactive bdi thread */
374 } action = NO_ACTION; 381 } action = NO_ACTION;
375 382
376 /* 383 /*
377 * Temporary measure, we want to make sure we don't see 384 * Temporary measure, we want to make sure we don't see
378 * dirty data on the default backing_dev_info 385 * dirty data on the default backing_dev_info
379 */ 386 */
380 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { 387 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
381 del_timer(&me->wakeup_timer); 388 del_timer(&me->wakeup_timer);
382 wb_do_writeback(me, 0); 389 wb_do_writeback(me, 0);
383 } 390 }
384 391
385 spin_lock_bh(&bdi_lock); 392 spin_lock_bh(&bdi_lock);
386 set_current_state(TASK_INTERRUPTIBLE); 393 set_current_state(TASK_INTERRUPTIBLE);
387 394
388 list_for_each_entry(bdi, &bdi_list, bdi_list) { 395 list_for_each_entry(bdi, &bdi_list, bdi_list) {
389 bool have_dirty_io; 396 bool have_dirty_io;
390 397
391 if (!bdi_cap_writeback_dirty(bdi) || 398 if (!bdi_cap_writeback_dirty(bdi) ||
392 bdi_cap_flush_forker(bdi)) 399 bdi_cap_flush_forker(bdi))
393 continue; 400 continue;
394 401
395 WARN(!test_bit(BDI_registered, &bdi->state), 402 WARN(!test_bit(BDI_registered, &bdi->state),
396 "bdi %p/%s is not registered!\n", bdi, bdi->name); 403 "bdi %p/%s is not registered!\n", bdi, bdi->name);
397 404
398 have_dirty_io = !list_empty(&bdi->work_list) || 405 have_dirty_io = !list_empty(&bdi->work_list) ||
399 wb_has_dirty_io(&bdi->wb); 406 wb_has_dirty_io(&bdi->wb);
400 407
401 /* 408 /*
402 * If the bdi has work to do, but the thread does not 409 * If the bdi has work to do, but the thread does not
403 * exist - create it. 410 * exist - create it.
404 */ 411 */
405 if (!bdi->wb.task && have_dirty_io) { 412 if (!bdi->wb.task && have_dirty_io) {
406 /* 413 /*
407 * Set the pending bit - if someone will try to 414 * Set the pending bit - if someone will try to
408 * unregister this bdi - it'll wait on this bit. 415 * unregister this bdi - it'll wait on this bit.
409 */ 416 */
410 set_bit(BDI_pending, &bdi->state); 417 set_bit(BDI_pending, &bdi->state);
411 action = FORK_THREAD; 418 action = FORK_THREAD;
412 break; 419 break;
413 } 420 }
414 421
415 spin_lock(&bdi->wb_lock); 422 spin_lock(&bdi->wb_lock);
416 423
417 /* 424 /*
418 * If there is no work to do and the bdi thread was 425 * If there is no work to do and the bdi thread was
419 * inactive long enough - kill it. The wb_lock is taken 426 * inactive long enough - kill it. The wb_lock is taken
420 * to make sure no-one adds more work to this bdi and 427 * to make sure no-one adds more work to this bdi and
421 * wakes the bdi thread up. 428 * wakes the bdi thread up.
422 */ 429 */
423 if (bdi->wb.task && !have_dirty_io && 430 if (bdi->wb.task && !have_dirty_io &&
424 time_after(jiffies, bdi->wb.last_active + 431 time_after(jiffies, bdi->wb.last_active +
425 bdi_longest_inactive())) { 432 bdi_longest_inactive())) {
426 task = bdi->wb.task; 433 task = bdi->wb.task;
427 bdi->wb.task = NULL; 434 bdi->wb.task = NULL;
428 spin_unlock(&bdi->wb_lock); 435 spin_unlock(&bdi->wb_lock);
429 set_bit(BDI_pending, &bdi->state); 436 set_bit(BDI_pending, &bdi->state);
430 action = KILL_THREAD; 437 action = KILL_THREAD;
431 break; 438 break;
432 } 439 }
433 spin_unlock(&bdi->wb_lock); 440 spin_unlock(&bdi->wb_lock);
434 } 441 }
435 spin_unlock_bh(&bdi_lock); 442 spin_unlock_bh(&bdi_lock);
436 443
437 /* Keep working if default bdi still has things to do */ 444 /* Keep working if default bdi still has things to do */
438 if (!list_empty(&me->bdi->work_list)) 445 if (!list_empty(&me->bdi->work_list))
439 __set_current_state(TASK_RUNNING); 446 __set_current_state(TASK_RUNNING);
440 447
441 switch (action) { 448 switch (action) {
442 case FORK_THREAD: 449 case FORK_THREAD:
443 __set_current_state(TASK_RUNNING); 450 __set_current_state(TASK_RUNNING);
444 task = kthread_create(bdi_writeback_thread, &bdi->wb, 451 task = kthread_create(bdi_writeback_thread, &bdi->wb,
445 "flush-%s", dev_name(bdi->dev)); 452 "flush-%s", dev_name(bdi->dev));
446 if (IS_ERR(task)) { 453 if (IS_ERR(task)) {
447 /* 454 /*
448 * If thread creation fails, force writeout of 455 * If thread creation fails, force writeout of
449 * the bdi from the thread. 456 * the bdi from the thread. Hopefully 1024 is
457 * large enough for efficient IO.
450 */ 458 */
451 bdi_flush_io(bdi); 459 writeback_inodes_wb(&bdi->wb, 1024);
452 } else { 460 } else {
453 /* 461 /*
454 * The spinlock makes sure we do not lose 462 * The spinlock makes sure we do not lose
455 * wake-ups when racing with 'bdi_queue_work()'. 463 * wake-ups when racing with 'bdi_queue_work()'.
456 * And as soon as the bdi thread is visible, we 464 * And as soon as the bdi thread is visible, we
457 * can start it. 465 * can start it.
458 */ 466 */
459 spin_lock_bh(&bdi->wb_lock); 467 spin_lock_bh(&bdi->wb_lock);
460 bdi->wb.task = task; 468 bdi->wb.task = task;
461 spin_unlock_bh(&bdi->wb_lock); 469 spin_unlock_bh(&bdi->wb_lock);
462 wake_up_process(task); 470 wake_up_process(task);
463 } 471 }
464 break; 472 break;
465 473
466 case KILL_THREAD: 474 case KILL_THREAD:
467 __set_current_state(TASK_RUNNING); 475 __set_current_state(TASK_RUNNING);
468 kthread_stop(task); 476 kthread_stop(task);
469 break; 477 break;
470 478
471 case NO_ACTION: 479 case NO_ACTION:
472 if (!wb_has_dirty_io(me) || !dirty_writeback_interval) 480 if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
473 /* 481 /*
474 * There are no dirty data. The only thing we 482 * There are no dirty data. The only thing we
475 * should now care about is checking for 483 * should now care about is checking for
476 * inactive bdi threads and killing them. Thus, 484 * inactive bdi threads and killing them. Thus,
477 * let's sleep for longer time, save energy and 485 * let's sleep for longer time, save energy and
478 * be friendly for battery-driven devices. 486 * be friendly for battery-driven devices.
479 */ 487 */
480 schedule_timeout(bdi_longest_inactive()); 488 schedule_timeout(bdi_longest_inactive());
481 else 489 else
482 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 490 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
483 try_to_freeze(); 491 try_to_freeze();
484 /* Back to the main loop */ 492 /* Back to the main loop */
485 continue; 493 continue;
486 } 494 }
487 495
488 /* 496 /*
489 * Clear pending bit and wakeup anybody waiting to tear us down. 497 * Clear pending bit and wakeup anybody waiting to tear us down.
490 */ 498 */
491 clear_bit(BDI_pending, &bdi->state); 499 clear_bit(BDI_pending, &bdi->state);
492 smp_mb__after_clear_bit(); 500 smp_mb__after_clear_bit();
493 wake_up_bit(&bdi->state, BDI_pending); 501 wake_up_bit(&bdi->state, BDI_pending);
494 } 502 }
495 503
496 return 0; 504 return 0;
497 } 505 }
498 506
499 /* 507 /*
500 * Remove bdi from bdi_list, and ensure that it is no longer visible 508 * Remove bdi from bdi_list, and ensure that it is no longer visible
501 */ 509 */
502 static void bdi_remove_from_list(struct backing_dev_info *bdi) 510 static void bdi_remove_from_list(struct backing_dev_info *bdi)
503 { 511 {
504 spin_lock_bh(&bdi_lock); 512 spin_lock_bh(&bdi_lock);
505 list_del_rcu(&bdi->bdi_list); 513 list_del_rcu(&bdi->bdi_list);
506 spin_unlock_bh(&bdi_lock); 514 spin_unlock_bh(&bdi_lock);
507 515
508 synchronize_rcu_expedited(); 516 synchronize_rcu_expedited();
509 } 517 }
510 518
511 int bdi_register(struct backing_dev_info *bdi, struct device *parent, 519 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
512 const char *fmt, ...) 520 const char *fmt, ...)
513 { 521 {
514 va_list args; 522 va_list args;
515 struct device *dev; 523 struct device *dev;
516 524
517 if (bdi->dev) /* The driver needs to use separate queues per device */ 525 if (bdi->dev) /* The driver needs to use separate queues per device */
518 return 0; 526 return 0;
519 527
520 va_start(args, fmt); 528 va_start(args, fmt);
521 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); 529 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
522 va_end(args); 530 va_end(args);
523 if (IS_ERR(dev)) 531 if (IS_ERR(dev))
524 return PTR_ERR(dev); 532 return PTR_ERR(dev);
525 533
526 bdi->dev = dev; 534 bdi->dev = dev;
527 535
528 /* 536 /*
529 * Just start the forker thread for our default backing_dev_info, 537 * Just start the forker thread for our default backing_dev_info,
530 * and add other bdi's to the list. They will get a thread created 538 * and add other bdi's to the list. They will get a thread created
531 * on-demand when they need it. 539 * on-demand when they need it.
532 */ 540 */
533 if (bdi_cap_flush_forker(bdi)) { 541 if (bdi_cap_flush_forker(bdi)) {
534 struct bdi_writeback *wb = &bdi->wb; 542 struct bdi_writeback *wb = &bdi->wb;
535 543
536 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", 544 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
537 dev_name(dev)); 545 dev_name(dev));
538 if (IS_ERR(wb->task)) 546 if (IS_ERR(wb->task))
539 return PTR_ERR(wb->task); 547 return PTR_ERR(wb->task);
540 } 548 }
541 549
542 bdi_debug_register(bdi, dev_name(dev)); 550 bdi_debug_register(bdi, dev_name(dev));
543 set_bit(BDI_registered, &bdi->state); 551 set_bit(BDI_registered, &bdi->state);
544 552
545 spin_lock_bh(&bdi_lock); 553 spin_lock_bh(&bdi_lock);
546 list_add_tail_rcu(&bdi->bdi_list, &bdi_list); 554 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
547 spin_unlock_bh(&bdi_lock); 555 spin_unlock_bh(&bdi_lock);
548 556
549 trace_writeback_bdi_register(bdi); 557 trace_writeback_bdi_register(bdi);
550 return 0; 558 return 0;
551 } 559 }
552 EXPORT_SYMBOL(bdi_register); 560 EXPORT_SYMBOL(bdi_register);
553 561
554 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) 562 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
555 { 563 {
556 return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); 564 return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
557 } 565 }
558 EXPORT_SYMBOL(bdi_register_dev); 566 EXPORT_SYMBOL(bdi_register_dev);
559 567
560 /* 568 /*
561 * Remove bdi from the global list and shutdown any threads we have running 569 * Remove bdi from the global list and shutdown any threads we have running
562 */ 570 */
563 static void bdi_wb_shutdown(struct backing_dev_info *bdi) 571 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
564 { 572 {
565 if (!bdi_cap_writeback_dirty(bdi)) 573 if (!bdi_cap_writeback_dirty(bdi))
566 return; 574 return;
567 575
568 /* 576 /*
569 * Make sure nobody finds us on the bdi_list anymore 577 * Make sure nobody finds us on the bdi_list anymore
570 */ 578 */
571 bdi_remove_from_list(bdi); 579 bdi_remove_from_list(bdi);
572 580
573 /* 581 /*
574 * If setup is pending, wait for that to complete first 582 * If setup is pending, wait for that to complete first
575 */ 583 */
576 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 584 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
577 TASK_UNINTERRUPTIBLE); 585 TASK_UNINTERRUPTIBLE);
578 586
579 /* 587 /*
580 * Finally, kill the kernel thread. We don't need to be RCU 588 * Finally, kill the kernel thread. We don't need to be RCU
581 * safe anymore, since the bdi is gone from visibility. Force 589 * safe anymore, since the bdi is gone from visibility. Force
582 * unfreeze of the thread before calling kthread_stop(), otherwise 590 * unfreeze of the thread before calling kthread_stop(), otherwise
 583 * it would never exit if it is currently stuck in the refrigerator. 591 * it would never exit if it is currently stuck in the refrigerator.
584 */ 592 */
585 if (bdi->wb.task) { 593 if (bdi->wb.task) {
586 thaw_process(bdi->wb.task); 594 thaw_process(bdi->wb.task);
587 kthread_stop(bdi->wb.task); 595 kthread_stop(bdi->wb.task);
588 } 596 }
589 } 597 }
590 598
591 /* 599 /*
592 * This bdi is going away now, make sure that no super_blocks point to it 600 * This bdi is going away now, make sure that no super_blocks point to it
593 */ 601 */
594 static void bdi_prune_sb(struct backing_dev_info *bdi) 602 static void bdi_prune_sb(struct backing_dev_info *bdi)
595 { 603 {
596 struct super_block *sb; 604 struct super_block *sb;
597 605
598 spin_lock(&sb_lock); 606 spin_lock(&sb_lock);
599 list_for_each_entry(sb, &super_blocks, s_list) { 607 list_for_each_entry(sb, &super_blocks, s_list) {
600 if (sb->s_bdi == bdi) 608 if (sb->s_bdi == bdi)
601 sb->s_bdi = &default_backing_dev_info; 609 sb->s_bdi = &default_backing_dev_info;
602 } 610 }
603 spin_unlock(&sb_lock); 611 spin_unlock(&sb_lock);
604 } 612 }
605 613
606 void bdi_unregister(struct backing_dev_info *bdi) 614 void bdi_unregister(struct backing_dev_info *bdi)
607 { 615 {
608 if (bdi->dev) { 616 if (bdi->dev) {
609 bdi_set_min_ratio(bdi, 0); 617 bdi_set_min_ratio(bdi, 0);
610 trace_writeback_bdi_unregister(bdi); 618 trace_writeback_bdi_unregister(bdi);
611 bdi_prune_sb(bdi); 619 bdi_prune_sb(bdi);
612 del_timer_sync(&bdi->wb.wakeup_timer); 620 del_timer_sync(&bdi->wb.wakeup_timer);
613 621
614 if (!bdi_cap_flush_forker(bdi)) 622 if (!bdi_cap_flush_forker(bdi))
615 bdi_wb_shutdown(bdi); 623 bdi_wb_shutdown(bdi);
616 bdi_debug_unregister(bdi); 624 bdi_debug_unregister(bdi);
617 device_unregister(bdi->dev); 625 device_unregister(bdi->dev);
618 bdi->dev = NULL; 626 bdi->dev = NULL;
619 } 627 }
620 } 628 }
621 EXPORT_SYMBOL(bdi_unregister); 629 EXPORT_SYMBOL(bdi_unregister);
622 630
623 static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) 631 static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
624 { 632 {
625 memset(wb, 0, sizeof(*wb)); 633 memset(wb, 0, sizeof(*wb));
626 634
627 wb->bdi = bdi; 635 wb->bdi = bdi;
628 wb->last_old_flush = jiffies; 636 wb->last_old_flush = jiffies;
629 INIT_LIST_HEAD(&wb->b_dirty); 637 INIT_LIST_HEAD(&wb->b_dirty);
630 INIT_LIST_HEAD(&wb->b_io); 638 INIT_LIST_HEAD(&wb->b_io);
631 INIT_LIST_HEAD(&wb->b_more_io); 639 INIT_LIST_HEAD(&wb->b_more_io);
640 spin_lock_init(&wb->list_lock);
632 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 641 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
633 } 642 }
634 643
644 /*
645 * Initial write bandwidth: 100 MB/s
646 */
647 #define INIT_BW (100 << (20 - PAGE_SHIFT))
648
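As a quick check of the comment: with 4 KiB pages PAGE_SHIFT is 12, so INIT_BW = 100 << (20 - 12) = 100 * 256 = 25600 pages/s, and 25600 pages * 4 kB = 102400 kB/s, i.e. the promised 100 MB/s. The bandwidth fields initialised below are therefore kept in pages per second, which is why the debugfs code prints K(bdi->write_bandwidth) as kBps.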
635 int bdi_init(struct backing_dev_info *bdi) 649 int bdi_init(struct backing_dev_info *bdi)
636 { 650 {
637 int i, err; 651 int i, err;
638 652
639 bdi->dev = NULL; 653 bdi->dev = NULL;
640 654
641 bdi->min_ratio = 0; 655 bdi->min_ratio = 0;
642 bdi->max_ratio = 100; 656 bdi->max_ratio = 100;
643 bdi->max_prop_frac = PROP_FRAC_BASE; 657 bdi->max_prop_frac = PROP_FRAC_BASE;
644 spin_lock_init(&bdi->wb_lock); 658 spin_lock_init(&bdi->wb_lock);
645 INIT_LIST_HEAD(&bdi->bdi_list); 659 INIT_LIST_HEAD(&bdi->bdi_list);
646 INIT_LIST_HEAD(&bdi->work_list); 660 INIT_LIST_HEAD(&bdi->work_list);
647 661
648 bdi_wb_init(&bdi->wb, bdi); 662 bdi_wb_init(&bdi->wb, bdi);
649 663
650 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 664 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
651 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 665 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
652 if (err) 666 if (err)
653 goto err; 667 goto err;
654 } 668 }
655 669
656 bdi->dirty_exceeded = 0; 670 bdi->dirty_exceeded = 0;
671
672 bdi->bw_time_stamp = jiffies;
673 bdi->written_stamp = 0;
674
675 bdi->write_bandwidth = INIT_BW;
676 bdi->avg_write_bandwidth = INIT_BW;
677
657 err = prop_local_init_percpu(&bdi->completions); 678 err = prop_local_init_percpu(&bdi->completions);
658 679
659 if (err) { 680 if (err) {
660 err: 681 err:
661 while (i--) 682 while (i--)
662 percpu_counter_destroy(&bdi->bdi_stat[i]); 683 percpu_counter_destroy(&bdi->bdi_stat[i]);
663 } 684 }
664 685
665 return err; 686 return err;
666 } 687 }
667 EXPORT_SYMBOL(bdi_init); 688 EXPORT_SYMBOL(bdi_init);
668 689
669 void bdi_destroy(struct backing_dev_info *bdi) 690 void bdi_destroy(struct backing_dev_info *bdi)
670 { 691 {
671 int i; 692 int i;
672 693
673 /* 694 /*
674 * Splice our entries to the default_backing_dev_info, if this 695 * Splice our entries to the default_backing_dev_info, if this
675 * bdi disappears 696 * bdi disappears
676 */ 697 */
677 if (bdi_has_dirty_io(bdi)) { 698 if (bdi_has_dirty_io(bdi)) {
678 struct bdi_writeback *dst = &default_backing_dev_info.wb; 699 struct bdi_writeback *dst = &default_backing_dev_info.wb;
679 700
680 spin_lock(&inode_wb_list_lock); 701 bdi_lock_two(&bdi->wb, dst);
681 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 702 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
682 list_splice(&bdi->wb.b_io, &dst->b_io); 703 list_splice(&bdi->wb.b_io, &dst->b_io);
683 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 704 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
684 spin_unlock(&inode_wb_list_lock); 705 spin_unlock(&bdi->wb.list_lock);
706 spin_unlock(&dst->list_lock);
685 } 707 }
686 708
687 bdi_unregister(bdi); 709 bdi_unregister(bdi);
688 710
689 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 711 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
690 percpu_counter_destroy(&bdi->bdi_stat[i]); 712 percpu_counter_destroy(&bdi->bdi_stat[i]);
691 713
692 prop_local_destroy_percpu(&bdi->completions); 714 prop_local_destroy_percpu(&bdi->completions);
693 } 715 }
694 EXPORT_SYMBOL(bdi_destroy); 716 EXPORT_SYMBOL(bdi_destroy);
695 717
696 /* 718 /*
697 * For use from filesystems to quickly init and register a bdi associated 719 * For use from filesystems to quickly init and register a bdi associated
698 * with dirty writeback 720 * with dirty writeback
699 */ 721 */
700 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, 722 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
701 unsigned int cap) 723 unsigned int cap)
702 { 724 {
703 char tmp[32]; 725 char tmp[32];
704 int err; 726 int err;
705 727
706 bdi->name = name; 728 bdi->name = name;
707 bdi->capabilities = cap; 729 bdi->capabilities = cap;
708 err = bdi_init(bdi); 730 err = bdi_init(bdi);
709 if (err) 731 if (err)
710 return err; 732 return err;
711 733
712 sprintf(tmp, "%.28s%s", name, "-%d"); 734 sprintf(tmp, "%.28s%s", name, "-%d");
713 err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); 735 err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
714 if (err) { 736 if (err) {
715 bdi_destroy(bdi); 737 bdi_destroy(bdi);
716 return err; 738 return err;
717 } 739 }
718 740
719 return 0; 741 return 0;
720 } 742 }
721 EXPORT_SYMBOL(bdi_setup_and_register); 743 EXPORT_SYMBOL(bdi_setup_and_register);
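A rough sketch of how a filesystem might use this helper at mount time; examplefs_fill_super() and the sb_info layout are made-up names, but the call sequence follows the comment above:

/* sketch: per-mount bdi setup (examplefs_* names are hypothetical) */
struct examplefs_sb_info {
	struct backing_dev_info bdi;
	/* other per-sb state */
};

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct examplefs_sb_info *sbi = sb->s_fs_info;
	int err;

	err = bdi_setup_and_register(&sbi->bdi, "examplefs", BDI_CAP_MAP_COPY);
	if (err)
		return err;
	sb->s_bdi = &sbi->bdi;		/* dirty inodes now queue on this bdi */
	return 0;
}

The matching bdi_destroy() would then run from the filesystem's unmount path once the superblock is torn down.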
722 744
723 static wait_queue_head_t congestion_wqh[2] = { 745 static wait_queue_head_t congestion_wqh[2] = {
724 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 746 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
725 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 747 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
726 }; 748 };
727 static atomic_t nr_bdi_congested[2]; 749 static atomic_t nr_bdi_congested[2];
728 750
729 void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 751 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
730 { 752 {
731 enum bdi_state bit; 753 enum bdi_state bit;
732 wait_queue_head_t *wqh = &congestion_wqh[sync]; 754 wait_queue_head_t *wqh = &congestion_wqh[sync];
733 755
734 bit = sync ? BDI_sync_congested : BDI_async_congested; 756 bit = sync ? BDI_sync_congested : BDI_async_congested;
735 if (test_and_clear_bit(bit, &bdi->state)) 757 if (test_and_clear_bit(bit, &bdi->state))
736 atomic_dec(&nr_bdi_congested[sync]); 758 atomic_dec(&nr_bdi_congested[sync]);
737 smp_mb__after_clear_bit(); 759 smp_mb__after_clear_bit();
738 if (waitqueue_active(wqh)) 760 if (waitqueue_active(wqh))
739 wake_up(wqh); 761 wake_up(wqh);
740 } 762 }
741 EXPORT_SYMBOL(clear_bdi_congested); 763 EXPORT_SYMBOL(clear_bdi_congested);
742 764
743 void set_bdi_congested(struct backing_dev_info *bdi, int sync) 765 void set_bdi_congested(struct backing_dev_info *bdi, int sync)
744 { 766 {
745 enum bdi_state bit; 767 enum bdi_state bit;
746 768
747 bit = sync ? BDI_sync_congested : BDI_async_congested; 769 bit = sync ? BDI_sync_congested : BDI_async_congested;
748 if (!test_and_set_bit(bit, &bdi->state)) 770 if (!test_and_set_bit(bit, &bdi->state))
749 atomic_inc(&nr_bdi_congested[sync]); 771 atomic_inc(&nr_bdi_congested[sync]);
750 } 772 }
751 EXPORT_SYMBOL(set_bdi_congested); 773 EXPORT_SYMBOL(set_bdi_congested);
752 774
753 /** 775 /**
754 * congestion_wait - wait for a backing_dev to become uncongested 776 * congestion_wait - wait for a backing_dev to become uncongested
755 * @sync: SYNC or ASYNC IO 777 * @sync: SYNC or ASYNC IO
756 * @timeout: timeout in jiffies 778 * @timeout: timeout in jiffies
757 * 779 *
758 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit 780 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
759 * write congestion. If no backing_devs are congested then just wait for the 781 * write congestion. If no backing_devs are congested then just wait for the
760 * next write to be completed. 782 * next write to be completed.
761 */ 783 */
762 long congestion_wait(int sync, long timeout) 784 long congestion_wait(int sync, long timeout)
763 { 785 {
764 long ret; 786 long ret;
765 unsigned long start = jiffies; 787 unsigned long start = jiffies;
766 DEFINE_WAIT(wait); 788 DEFINE_WAIT(wait);
767 wait_queue_head_t *wqh = &congestion_wqh[sync]; 789 wait_queue_head_t *wqh = &congestion_wqh[sync];
768 790
769 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 791 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
770 ret = io_schedule_timeout(timeout); 792 ret = io_schedule_timeout(timeout);
771 finish_wait(wqh, &wait); 793 finish_wait(wqh, &wait);
772 794
773 trace_writeback_congestion_wait(jiffies_to_usecs(timeout), 795 trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
774 jiffies_to_usecs(jiffies - start)); 796 jiffies_to_usecs(jiffies - start));
775 797
776 return ret; 798 return ret;
777 } 799 }
778 EXPORT_SYMBOL(congestion_wait); 800 EXPORT_SYMBOL(congestion_wait);
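Typical users are reclaim-style paths that want to back off while some backing device is congested; a minimal sketch (the condition is hypothetical, the call itself is the common idiom):

	/* sketch: back off for up to HZ/10 jiffies (~100 ms at HZ=1000) */
	if (too_many_dirty_pages)
		congestion_wait(BLK_RW_ASYNC, HZ / 10);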
779 801
780 /** 802 /**
781 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes 803 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
782 * @zone: A zone to check if it is heavily congested 804 * @zone: A zone to check if it is heavily congested
783 * @sync: SYNC or ASYNC IO 805 * @sync: SYNC or ASYNC IO
784 * @timeout: timeout in jiffies 806 * @timeout: timeout in jiffies
785 * 807 *
786 * In the event of a congested backing_dev (any backing_dev) and the given 808 * In the event of a congested backing_dev (any backing_dev) and the given
787 * @zone has experienced recent congestion, this waits for up to @timeout 809 * @zone has experienced recent congestion, this waits for up to @timeout
788 * jiffies for either a BDI to exit congestion of the given @sync queue 810 * jiffies for either a BDI to exit congestion of the given @sync queue
789 * or a write to complete. 811 * or a write to complete.
790 * 812 *
791 * In the absence of zone congestion, cond_resched() is called to yield 813 * In the absence of zone congestion, cond_resched() is called to yield
792 * the processor if necessary but otherwise does not sleep. 814 * the processor if necessary but otherwise does not sleep.
793 * 815 *
794 * The return value is 0 if the sleep is for the full timeout. Otherwise, 816 * The return value is 0 if the sleep is for the full timeout. Otherwise,
795 * it is the number of jiffies that were still remaining when the function 817 * it is the number of jiffies that were still remaining when the function
796 * returned. return_value == timeout implies the function did not sleep. 818 * returned. return_value == timeout implies the function did not sleep.
797 */ 819 */
798 long wait_iff_congested(struct zone *zone, int sync, long timeout) 820 long wait_iff_congested(struct zone *zone, int sync, long timeout)
799 { 821 {
800 long ret; 822 long ret;
801 unsigned long start = jiffies; 823 unsigned long start = jiffies;
802 DEFINE_WAIT(wait); 824 DEFINE_WAIT(wait);
803 wait_queue_head_t *wqh = &congestion_wqh[sync]; 825 wait_queue_head_t *wqh = &congestion_wqh[sync];
804 826
805 /* 827 /*
806 * If there is no congestion, or heavy congestion is not being 828 * If there is no congestion, or heavy congestion is not being
807 * encountered in the current zone, yield if necessary instead 829 * encountered in the current zone, yield if necessary instead
808 * of sleeping on the congestion queue 830 * of sleeping on the congestion queue
809 */ 831 */
810 if (atomic_read(&nr_bdi_congested[sync]) == 0 || 832 if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
811 !zone_is_reclaim_congested(zone)) { 833 !zone_is_reclaim_congested(zone)) {
812 cond_resched(); 834 cond_resched();
813 835
814 /* In case we scheduled, work out time remaining */ 836 /* In case we scheduled, work out time remaining */
815 ret = timeout - (jiffies - start); 837 ret = timeout - (jiffies - start);
816 if (ret < 0) 838 if (ret < 0)
817 ret = 0; 839 ret = 0;
818 840
819 goto out; 841 goto out;
820 } 842 }
821 843
822 /* Sleep until uncongested or a write happens */ 844 /* Sleep until uncongested or a write happens */
1 /* 1 /*
2 * linux/mm/filemap.c 2 * linux/mm/filemap.c
3 * 3 *
4 * Copyright (C) 1994-1999 Linus Torvalds 4 * Copyright (C) 1994-1999 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * This file handles the generic file mmap semantics used by 8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this: 9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example) 10 * the NFS filesystem used to do this differently, for example)
11 */ 11 */
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/compiler.h> 13 #include <linux/compiler.h>
14 #include <linux/fs.h> 14 #include <linux/fs.h>
15 #include <linux/uaccess.h> 15 #include <linux/uaccess.h>
16 #include <linux/aio.h> 16 #include <linux/aio.h>
17 #include <linux/capability.h> 17 #include <linux/capability.h>
18 #include <linux/kernel_stat.h> 18 #include <linux/kernel_stat.h>
19 #include <linux/gfp.h> 19 #include <linux/gfp.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/swap.h> 21 #include <linux/swap.h>
22 #include <linux/mman.h> 22 #include <linux/mman.h>
23 #include <linux/pagemap.h> 23 #include <linux/pagemap.h>
24 #include <linux/file.h> 24 #include <linux/file.h>
25 #include <linux/uio.h> 25 #include <linux/uio.h>
26 #include <linux/hash.h> 26 #include <linux/hash.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/backing-dev.h> 28 #include <linux/backing-dev.h>
29 #include <linux/pagevec.h> 29 #include <linux/pagevec.h>
30 #include <linux/blkdev.h> 30 #include <linux/blkdev.h>
31 #include <linux/security.h> 31 #include <linux/security.h>
32 #include <linux/syscalls.h> 32 #include <linux/syscalls.h>
33 #include <linux/cpuset.h> 33 #include <linux/cpuset.h>
34 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35 #include <linux/memcontrol.h> 35 #include <linux/memcontrol.h>
36 #include <linux/mm_inline.h> /* for page_is_file_cache() */ 36 #include <linux/mm_inline.h> /* for page_is_file_cache() */
37 #include <linux/cleancache.h> 37 #include <linux/cleancache.h>
38 #include "internal.h" 38 #include "internal.h"
39 39
40 /* 40 /*
41 * FIXME: remove all knowledge of the buffer layer from the core VM 41 * FIXME: remove all knowledge of the buffer layer from the core VM
42 */ 42 */
43 #include <linux/buffer_head.h> /* for try_to_free_buffers */ 43 #include <linux/buffer_head.h> /* for try_to_free_buffers */
44 44
45 #include <asm/mman.h> 45 #include <asm/mman.h>
46 46
47 /* 47 /*
48 * Shared mappings implemented 30.11.1994. It's not fully working yet, 48 * Shared mappings implemented 30.11.1994. It's not fully working yet,
49 * though. 49 * though.
50 * 50 *
51 * Shared mappings now work. 15.8.1995 Bruno. 51 * Shared mappings now work. 15.8.1995 Bruno.
52 * 52 *
53 * finished 'unifying' the page and buffer cache and SMP-threaded the 53 * finished 'unifying' the page and buffer cache and SMP-threaded the
54 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> 54 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
55 * 55 *
56 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> 56 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
57 */ 57 */
58 58
59 /* 59 /*
60 * Lock ordering: 60 * Lock ordering:
61 * 61 *
62 * ->i_mmap_mutex (truncate_pagecache) 62 * ->i_mmap_mutex (truncate_pagecache)
63 * ->private_lock (__free_pte->__set_page_dirty_buffers) 63 * ->private_lock (__free_pte->__set_page_dirty_buffers)
64 * ->swap_lock (exclusive_swap_page, others) 64 * ->swap_lock (exclusive_swap_page, others)
65 * ->mapping->tree_lock 65 * ->mapping->tree_lock
66 * 66 *
67 * ->i_mutex 67 * ->i_mutex
68 * ->i_mmap_mutex (truncate->unmap_mapping_range) 68 * ->i_mmap_mutex (truncate->unmap_mapping_range)
69 * 69 *
70 * ->mmap_sem 70 * ->mmap_sem
71 * ->i_mmap_mutex 71 * ->i_mmap_mutex
72 * ->page_table_lock or pte_lock (various, mainly in memory.c) 72 * ->page_table_lock or pte_lock (various, mainly in memory.c)
73 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 73 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
74 * 74 *
75 * ->mmap_sem 75 * ->mmap_sem
76 * ->lock_page (access_process_vm) 76 * ->lock_page (access_process_vm)
77 * 77 *
78 * ->i_mutex (generic_file_buffered_write) 78 * ->i_mutex (generic_file_buffered_write)
79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 80 *
81 * inode_wb_list_lock 81 * bdi->wb.list_lock
82 * sb_lock (fs/fs-writeback.c) 82 * sb_lock (fs/fs-writeback.c)
83 * ->mapping->tree_lock (__sync_single_inode) 83 * ->mapping->tree_lock (__sync_single_inode)
84 * 84 *
85 * ->i_mmap_mutex 85 * ->i_mmap_mutex
86 * ->anon_vma.lock (vma_adjust) 86 * ->anon_vma.lock (vma_adjust)
87 * 87 *
88 * ->anon_vma.lock 88 * ->anon_vma.lock
89 * ->page_table_lock or pte_lock (anon_vma_prepare and various) 89 * ->page_table_lock or pte_lock (anon_vma_prepare and various)
90 * 90 *
91 * ->page_table_lock or pte_lock 91 * ->page_table_lock or pte_lock
92 * ->swap_lock (try_to_unmap_one) 92 * ->swap_lock (try_to_unmap_one)
93 * ->private_lock (try_to_unmap_one) 93 * ->private_lock (try_to_unmap_one)
94 * ->tree_lock (try_to_unmap_one) 94 * ->tree_lock (try_to_unmap_one)
95 * ->zone.lru_lock (follow_page->mark_page_accessed) 95 * ->zone.lru_lock (follow_page->mark_page_accessed)
96 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 96 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
97 * ->private_lock (page_remove_rmap->set_page_dirty) 97 * ->private_lock (page_remove_rmap->set_page_dirty)
98 * ->tree_lock (page_remove_rmap->set_page_dirty) 98 * ->tree_lock (page_remove_rmap->set_page_dirty)
99 * inode_wb_list_lock (page_remove_rmap->set_page_dirty) 99 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
100 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 100 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
101 * inode_wb_list_lock (zap_pte_range->set_page_dirty) 101 * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
102 * ->inode->i_lock (zap_pte_range->set_page_dirty) 102 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 104 *
105 * (code doesn't rely on that order, so you could switch it around) 105 * (code doesn't rely on that order, so you could switch it around)
106 * ->tasklist_lock (memory_failure, collect_procs_ao) 106 * ->tasklist_lock (memory_failure, collect_procs_ao)
107 * ->i_mmap_mutex 107 * ->i_mmap_mutex
108 */ 108 */
109 109
110 /* 110 /*
111 * Delete a page from the page cache and free it. Caller has to make 111 * Delete a page from the page cache and free it. Caller has to make
112 * sure the page is locked and that nobody else uses it - or that usage 112 * sure the page is locked and that nobody else uses it - or that usage
113 * is safe. The caller must hold the mapping's tree_lock. 113 * is safe. The caller must hold the mapping's tree_lock.
114 */ 114 */
115 void __delete_from_page_cache(struct page *page) 115 void __delete_from_page_cache(struct page *page)
116 { 116 {
117 struct address_space *mapping = page->mapping; 117 struct address_space *mapping = page->mapping;
118 118
119 /* 119 /*
120 * if we're uptodate, flush out into the cleancache, otherwise 120 * if we're uptodate, flush out into the cleancache, otherwise
121 * invalidate any existing cleancache entries. We can't leave 121 * invalidate any existing cleancache entries. We can't leave
122 * stale data around in the cleancache once our page is gone 122 * stale data around in the cleancache once our page is gone
123 */ 123 */
124 if (PageUptodate(page) && PageMappedToDisk(page)) 124 if (PageUptodate(page) && PageMappedToDisk(page))
125 cleancache_put_page(page); 125 cleancache_put_page(page);
126 else 126 else
127 cleancache_flush_page(mapping, page); 127 cleancache_flush_page(mapping, page);
128 128
129 radix_tree_delete(&mapping->page_tree, page->index); 129 radix_tree_delete(&mapping->page_tree, page->index);
130 page->mapping = NULL; 130 page->mapping = NULL;
131 /* Leave page->index set: truncation lookup relies upon it */ 131 /* Leave page->index set: truncation lookup relies upon it */
132 mapping->nrpages--; 132 mapping->nrpages--;
133 __dec_zone_page_state(page, NR_FILE_PAGES); 133 __dec_zone_page_state(page, NR_FILE_PAGES);
134 if (PageSwapBacked(page)) 134 if (PageSwapBacked(page))
135 __dec_zone_page_state(page, NR_SHMEM); 135 __dec_zone_page_state(page, NR_SHMEM);
136 BUG_ON(page_mapped(page)); 136 BUG_ON(page_mapped(page));
137 137
138 /* 138 /*
139 * Some filesystems seem to re-dirty the page even after 139 * Some filesystems seem to re-dirty the page even after
140 * the VM has canceled the dirty bit (eg ext3 journaling). 140 * the VM has canceled the dirty bit (eg ext3 journaling).
141 * 141 *
142 * Fix it up by doing a final dirty accounting check after 142 * Fix it up by doing a final dirty accounting check after
143 * having removed the page entirely. 143 * having removed the page entirely.
144 */ 144 */
145 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { 145 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
146 dec_zone_page_state(page, NR_FILE_DIRTY); 146 dec_zone_page_state(page, NR_FILE_DIRTY);
147 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 147 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
148 } 148 }
149 } 149 }
150 150
151 /** 151 /**
152 * delete_from_page_cache - delete page from page cache 152 * delete_from_page_cache - delete page from page cache
153 * @page: the page which the kernel is trying to remove from page cache 153 * @page: the page which the kernel is trying to remove from page cache
154 * 154 *
155 * This must be called only on pages that have been verified to be in the page 155 * This must be called only on pages that have been verified to be in the page
156 * cache and locked. It will never put the page into the free list, the caller 156 * cache and locked. It will never put the page into the free list, the caller
157 * has a reference on the page. 157 * has a reference on the page.
158 */ 158 */
159 void delete_from_page_cache(struct page *page) 159 void delete_from_page_cache(struct page *page)
160 { 160 {
161 struct address_space *mapping = page->mapping; 161 struct address_space *mapping = page->mapping;
162 void (*freepage)(struct page *); 162 void (*freepage)(struct page *);
163 163
164 BUG_ON(!PageLocked(page)); 164 BUG_ON(!PageLocked(page));
165 165
166 freepage = mapping->a_ops->freepage; 166 freepage = mapping->a_ops->freepage;
167 spin_lock_irq(&mapping->tree_lock); 167 spin_lock_irq(&mapping->tree_lock);
168 __delete_from_page_cache(page); 168 __delete_from_page_cache(page);
169 spin_unlock_irq(&mapping->tree_lock); 169 spin_unlock_irq(&mapping->tree_lock);
170 mem_cgroup_uncharge_cache_page(page); 170 mem_cgroup_uncharge_cache_page(page);
171 171
172 if (freepage) 172 if (freepage)
173 freepage(page); 173 freepage(page);
174 page_cache_release(page); 174 page_cache_release(page);
175 } 175 }
176 EXPORT_SYMBOL(delete_from_page_cache); 176 EXPORT_SYMBOL(delete_from_page_cache);
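As the kernel-doc notes, the caller must hold the page lock and its own reference; a minimal sketch of the expected calling pattern (the lookup and the mapping re-check are simplified):

	/* sketch: drop one cached page the caller looked up and locked */
	page = find_get_page(mapping, index);	/* takes our own reference */
	if (page) {
		lock_page(page);
		if (page->mapping == mapping)	/* not truncated meanwhile */
			delete_from_page_cache(page);
		unlock_page(page);
		page_cache_release(page);	/* drop the find_get_page() ref */
	}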
177 177
178 static int sleep_on_page(void *word) 178 static int sleep_on_page(void *word)
179 { 179 {
180 io_schedule(); 180 io_schedule();
181 return 0; 181 return 0;
182 } 182 }
183 183
184 static int sleep_on_page_killable(void *word) 184 static int sleep_on_page_killable(void *word)
185 { 185 {
186 sleep_on_page(word); 186 sleep_on_page(word);
187 return fatal_signal_pending(current) ? -EINTR : 0; 187 return fatal_signal_pending(current) ? -EINTR : 0;
188 } 188 }
189 189
190 /** 190 /**
191 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 191 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
192 * @mapping: address space structure to write 192 * @mapping: address space structure to write
193 * @start: offset in bytes where the range starts 193 * @start: offset in bytes where the range starts
194 * @end: offset in bytes where the range ends (inclusive) 194 * @end: offset in bytes where the range ends (inclusive)
195 * @sync_mode: enable synchronous operation 195 * @sync_mode: enable synchronous operation
196 * 196 *
197 * Start writeback against all of a mapping's dirty pages that lie 197 * Start writeback against all of a mapping's dirty pages that lie
198 * within the byte offsets <start, end> inclusive. 198 * within the byte offsets <start, end> inclusive.
199 * 199 *
200 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 200 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
201 * opposed to a regular memory cleansing writeback. The difference between 201 * opposed to a regular memory cleansing writeback. The difference between
202 * these two operations is that if a dirty page/buffer is encountered, it must 202 * these two operations is that if a dirty page/buffer is encountered, it must
203 * be waited upon, and not just skipped over. 203 * be waited upon, and not just skipped over.
204 */ 204 */
205 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 205 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
206 loff_t end, int sync_mode) 206 loff_t end, int sync_mode)
207 { 207 {
208 int ret; 208 int ret;
209 struct writeback_control wbc = { 209 struct writeback_control wbc = {
210 .sync_mode = sync_mode, 210 .sync_mode = sync_mode,
211 .nr_to_write = LONG_MAX, 211 .nr_to_write = LONG_MAX,
212 .range_start = start, 212 .range_start = start,
213 .range_end = end, 213 .range_end = end,
214 }; 214 };
215 215
216 if (!mapping_cap_writeback_dirty(mapping)) 216 if (!mapping_cap_writeback_dirty(mapping))
217 return 0; 217 return 0;
218 218
219 ret = do_writepages(mapping, &wbc); 219 ret = do_writepages(mapping, &wbc);
220 return ret; 220 return ret;
221 } 221 }
222 222
223 static inline int __filemap_fdatawrite(struct address_space *mapping, 223 static inline int __filemap_fdatawrite(struct address_space *mapping,
224 int sync_mode) 224 int sync_mode)
225 { 225 {
226 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); 226 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
227 } 227 }
228 228
229 int filemap_fdatawrite(struct address_space *mapping) 229 int filemap_fdatawrite(struct address_space *mapping)
230 { 230 {
231 return __filemap_fdatawrite(mapping, WB_SYNC_ALL); 231 return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
232 } 232 }
233 EXPORT_SYMBOL(filemap_fdatawrite); 233 EXPORT_SYMBOL(filemap_fdatawrite);
234 234
235 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 235 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
236 loff_t end) 236 loff_t end)
237 { 237 {
238 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 238 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
239 } 239 }
240 EXPORT_SYMBOL(filemap_fdatawrite_range); 240 EXPORT_SYMBOL(filemap_fdatawrite_range);
241 241
242 /** 242 /**
243 * filemap_flush - mostly a non-blocking flush 243 * filemap_flush - mostly a non-blocking flush
244 * @mapping: target address_space 244 * @mapping: target address_space
245 * 245 *
246 * This is a mostly non-blocking flush. Not suitable for data-integrity 246 * This is a mostly non-blocking flush. Not suitable for data-integrity
247 * purposes - I/O may not be started against all dirty pages. 247 * purposes - I/O may not be started against all dirty pages.
248 */ 248 */
249 int filemap_flush(struct address_space *mapping) 249 int filemap_flush(struct address_space *mapping)
250 { 250 {
251 return __filemap_fdatawrite(mapping, WB_SYNC_NONE); 251 return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
252 } 252 }
253 EXPORT_SYMBOL(filemap_flush); 253 EXPORT_SYMBOL(filemap_flush);
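/*
 * Editor's usage sketch (hypothetical helper, not part of this file): the
 * two write-out flavours side by side.  filemap_fdatawrite() uses
 * WB_SYNC_ALL (data integrity: pages already under writeback are waited on,
 * not skipped), filemap_flush() uses WB_SYNC_NONE (best-effort cleaning).
 * Return values are ignored here for brevity.
 */
static void example_start_writeback(struct address_space *mapping,
				    bool for_integrity)
{
	if (for_integrity)
		filemap_fdatawrite(mapping);	/* still needs a wait later */
	else
		filemap_flush(mapping);
}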
254 254
255 /** 255 /**
256 * filemap_fdatawait_range - wait for writeback to complete 256 * filemap_fdatawait_range - wait for writeback to complete
257 * @mapping: address space structure to wait for 257 * @mapping: address space structure to wait for
258 * @start_byte: offset in bytes where the range starts 258 * @start_byte: offset in bytes where the range starts
259 * @end_byte: offset in bytes where the range ends (inclusive) 259 * @end_byte: offset in bytes where the range ends (inclusive)
260 * 260 *
261 * Walk the list of under-writeback pages of the given address space 261 * Walk the list of under-writeback pages of the given address space
262 * in the given range and wait for all of them. 262 * in the given range and wait for all of them.
263 */ 263 */
264 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 264 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
265 loff_t end_byte) 265 loff_t end_byte)
266 { 266 {
267 pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; 267 pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
268 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; 268 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
269 struct pagevec pvec; 269 struct pagevec pvec;
270 int nr_pages; 270 int nr_pages;
271 int ret = 0; 271 int ret = 0;
272 272
273 if (end_byte < start_byte) 273 if (end_byte < start_byte)
274 return 0; 274 return 0;
275 275
276 pagevec_init(&pvec, 0); 276 pagevec_init(&pvec, 0);
277 while ((index <= end) && 277 while ((index <= end) &&
278 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 278 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
279 PAGECACHE_TAG_WRITEBACK, 279 PAGECACHE_TAG_WRITEBACK,
280 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 280 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
281 unsigned i; 281 unsigned i;
282 282
283 for (i = 0; i < nr_pages; i++) { 283 for (i = 0; i < nr_pages; i++) {
284 struct page *page = pvec.pages[i]; 284 struct page *page = pvec.pages[i];
285 285
286 /* until radix tree lookup accepts end_index */ 286 /* until radix tree lookup accepts end_index */
287 if (page->index > end) 287 if (page->index > end)
288 continue; 288 continue;
289 289
290 wait_on_page_writeback(page); 290 wait_on_page_writeback(page);
291 if (TestClearPageError(page)) 291 if (TestClearPageError(page))
292 ret = -EIO; 292 ret = -EIO;
293 } 293 }
294 pagevec_release(&pvec); 294 pagevec_release(&pvec);
295 cond_resched(); 295 cond_resched();
296 } 296 }
297 297
298 /* Check for outstanding write errors */ 298 /* Check for outstanding write errors */
299 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 299 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
300 ret = -ENOSPC; 300 ret = -ENOSPC;
301 if (test_and_clear_bit(AS_EIO, &mapping->flags)) 301 if (test_and_clear_bit(AS_EIO, &mapping->flags))
302 ret = -EIO; 302 ret = -EIO;
303 303
304 return ret; 304 return ret;
305 } 305 }
306 EXPORT_SYMBOL(filemap_fdatawait_range); 306 EXPORT_SYMBOL(filemap_fdatawait_range);
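/*
 * Editor's sketch (hypothetical helper, not part of this file): flush then
 * wait on one byte range - roughly what filemap_write_and_wait_range()
 * below bundles into a single call.
 */
static int example_sync_byte_range(struct address_space *mapping,
				   loff_t pos, loff_t count)
{
	int err;

	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (err)
		return err;
	return filemap_fdatawait_range(mapping, pos, pos + count - 1);
}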
307 307
308 /** 308 /**
309 * filemap_fdatawait - wait for all under-writeback pages to complete 309 * filemap_fdatawait - wait for all under-writeback pages to complete
310 * @mapping: address space structure to wait for 310 * @mapping: address space structure to wait for
311 * 311 *
312 * Walk the list of under-writeback pages of the given address space 312 * Walk the list of under-writeback pages of the given address space
313 * and wait for all of them. 313 * and wait for all of them.
314 */ 314 */
315 int filemap_fdatawait(struct address_space *mapping) 315 int filemap_fdatawait(struct address_space *mapping)
316 { 316 {
317 loff_t i_size = i_size_read(mapping->host); 317 loff_t i_size = i_size_read(mapping->host);
318 318
319 if (i_size == 0) 319 if (i_size == 0)
320 return 0; 320 return 0;
321 321
322 return filemap_fdatawait_range(mapping, 0, i_size - 1); 322 return filemap_fdatawait_range(mapping, 0, i_size - 1);
323 } 323 }
324 EXPORT_SYMBOL(filemap_fdatawait); 324 EXPORT_SYMBOL(filemap_fdatawait);
325 325
326 int filemap_write_and_wait(struct address_space *mapping) 326 int filemap_write_and_wait(struct address_space *mapping)
327 { 327 {
328 int err = 0; 328 int err = 0;
329 329
330 if (mapping->nrpages) { 330 if (mapping->nrpages) {
331 err = filemap_fdatawrite(mapping); 331 err = filemap_fdatawrite(mapping);
332 /* 332 /*
333 * Even if the above returned error, the pages may be 333 * Even if the above returned error, the pages may be
334 * written partially (e.g. -ENOSPC), so we wait for it. 334 * written partially (e.g. -ENOSPC), so we wait for it.
335 * But -EIO is a special case; it may indicate that the worst 335 * But -EIO is a special case; it may indicate that the worst
336 * thing (e.g. a bug) happened, so we avoid waiting for it. 336 * thing (e.g. a bug) happened, so we avoid waiting for it.
337 */ 337 */
338 if (err != -EIO) { 338 if (err != -EIO) {
339 int err2 = filemap_fdatawait(mapping); 339 int err2 = filemap_fdatawait(mapping);
340 if (!err) 340 if (!err)
341 err = err2; 341 err = err2;
342 } 342 }
343 } 343 }
344 return err; 344 return err;
345 } 345 }
346 EXPORT_SYMBOL(filemap_write_and_wait); 346 EXPORT_SYMBOL(filemap_write_and_wait);
347 347
348 /** 348 /**
349 * filemap_write_and_wait_range - write out & wait on a file range 349 * filemap_write_and_wait_range - write out & wait on a file range
350 * @mapping: the address_space for the pages 350 * @mapping: the address_space for the pages
351 * @lstart: offset in bytes where the range starts 351 * @lstart: offset in bytes where the range starts
352 * @lend: offset in bytes where the range ends (inclusive) 352 * @lend: offset in bytes where the range ends (inclusive)
353 * 353 *
354 * Write out and wait upon file offsets lstart->lend, inclusive. 354 * Write out and wait upon file offsets lstart->lend, inclusive.
355 * 355 *
356 * Note that `lend' is inclusive (describes the last byte to be written) so 356 * Note that `lend' is inclusive (describes the last byte to be written) so
357 * that this function can be used to write to the very end-of-file (end = -1). 357 * that this function can be used to write to the very end-of-file (end = -1).
358 */ 358 */
359 int filemap_write_and_wait_range(struct address_space *mapping, 359 int filemap_write_and_wait_range(struct address_space *mapping,
360 loff_t lstart, loff_t lend) 360 loff_t lstart, loff_t lend)
361 { 361 {
362 int err = 0; 362 int err = 0;
363 363
364 if (mapping->nrpages) { 364 if (mapping->nrpages) {
365 err = __filemap_fdatawrite_range(mapping, lstart, lend, 365 err = __filemap_fdatawrite_range(mapping, lstart, lend,
366 WB_SYNC_ALL); 366 WB_SYNC_ALL);
367 /* See comment of filemap_write_and_wait() */ 367 /* See comment of filemap_write_and_wait() */
368 if (err != -EIO) { 368 if (err != -EIO) {
369 int err2 = filemap_fdatawait_range(mapping, 369 int err2 = filemap_fdatawait_range(mapping,
370 lstart, lend); 370 lstart, lend);
371 if (!err) 371 if (!err)
372 err = err2; 372 err = err2;
373 } 373 }
374 } 374 }
375 return err; 375 return err;
376 } 376 }
377 EXPORT_SYMBOL(filemap_write_and_wait_range); 377 EXPORT_SYMBOL(filemap_write_and_wait_range);
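/*
 * Editor's sketch of a typical caller (hypothetical ->fsync-style helper,
 * not part of this file): push out and wait on exactly the range the caller
 * asked about; a real filesystem would flush metadata/journal afterwards.
 */
static int example_fsync_range(struct file *file, loff_t start, loff_t end)
{
	return filemap_write_and_wait_range(file->f_mapping, start, end);
}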
378 378
379 /** 379 /**
380 * replace_page_cache_page - replace a pagecache page with a new one 380 * replace_page_cache_page - replace a pagecache page with a new one
381 * @old: page to be replaced 381 * @old: page to be replaced
382 * @new: page to replace with 382 * @new: page to replace with
383 * @gfp_mask: allocation mode 383 * @gfp_mask: allocation mode
384 * 384 *
385 * This function replaces a page in the pagecache with a new one. On 385 * This function replaces a page in the pagecache with a new one. On
386 * success it acquires the pagecache reference for the new page and 386 * success it acquires the pagecache reference for the new page and
387 * drops it for the old page. Both the old and new pages must be 387 * drops it for the old page. Both the old and new pages must be
388 * locked. This function does not add the new page to the LRU, the 388 * locked. This function does not add the new page to the LRU, the
389 * caller must do that. 389 * caller must do that.
390 * 390 *
391 * The remove + add is atomic. The only way this function can fail is 391 * The remove + add is atomic. The only way this function can fail is
392 * memory allocation failure. 392 * memory allocation failure.
393 */ 393 */
394 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 394 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
395 { 395 {
396 int error; 396 int error;
397 struct mem_cgroup *memcg = NULL; 397 struct mem_cgroup *memcg = NULL;
398 398
399 VM_BUG_ON(!PageLocked(old)); 399 VM_BUG_ON(!PageLocked(old));
400 VM_BUG_ON(!PageLocked(new)); 400 VM_BUG_ON(!PageLocked(new));
401 VM_BUG_ON(new->mapping); 401 VM_BUG_ON(new->mapping);
402 402
403 /* 403 /*
404 * This is not page migration, but prepare_migration and 404 * This is not page migration, but prepare_migration and
405 * end_migration do enough work for charge replacement. 405 * end_migration do enough work for charge replacement.
406 * 406 *
407 * In the longer term we probably want a specialized function 407 * In the longer term we probably want a specialized function
408 * for moving the charge from old to new in a more efficient 408 * for moving the charge from old to new in a more efficient
409 * manner. 409 * manner.
410 */ 410 */
411 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); 411 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
412 if (error) 412 if (error)
413 return error; 413 return error;
414 414
415 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 415 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
416 if (!error) { 416 if (!error) {
417 struct address_space *mapping = old->mapping; 417 struct address_space *mapping = old->mapping;
418 void (*freepage)(struct page *); 418 void (*freepage)(struct page *);
419 419
420 pgoff_t offset = old->index; 420 pgoff_t offset = old->index;
421 freepage = mapping->a_ops->freepage; 421 freepage = mapping->a_ops->freepage;
422 422
423 page_cache_get(new); 423 page_cache_get(new);
424 new->mapping = mapping; 424 new->mapping = mapping;
425 new->index = offset; 425 new->index = offset;
426 426
427 spin_lock_irq(&mapping->tree_lock); 427 spin_lock_irq(&mapping->tree_lock);
428 __delete_from_page_cache(old); 428 __delete_from_page_cache(old);
429 error = radix_tree_insert(&mapping->page_tree, offset, new); 429 error = radix_tree_insert(&mapping->page_tree, offset, new);
430 BUG_ON(error); 430 BUG_ON(error);
431 mapping->nrpages++; 431 mapping->nrpages++;
432 __inc_zone_page_state(new, NR_FILE_PAGES); 432 __inc_zone_page_state(new, NR_FILE_PAGES);
433 if (PageSwapBacked(new)) 433 if (PageSwapBacked(new))
434 __inc_zone_page_state(new, NR_SHMEM); 434 __inc_zone_page_state(new, NR_SHMEM);
435 spin_unlock_irq(&mapping->tree_lock); 435 spin_unlock_irq(&mapping->tree_lock);
436 radix_tree_preload_end(); 436 radix_tree_preload_end();
437 if (freepage) 437 if (freepage)
438 freepage(old); 438 freepage(old);
439 page_cache_release(old); 439 page_cache_release(old);
440 mem_cgroup_end_migration(memcg, old, new, true); 440 mem_cgroup_end_migration(memcg, old, new, true);
441 } else { 441 } else {
442 mem_cgroup_end_migration(memcg, old, new, false); 442 mem_cgroup_end_migration(memcg, old, new, false);
443 } 443 }
444 444
445 return error; 445 return error;
446 } 446 }
447 EXPORT_SYMBOL_GPL(replace_page_cache_page); 447 EXPORT_SYMBOL_GPL(replace_page_cache_page);
448 448
449 /** 449 /**
450 * add_to_page_cache_locked - add a locked page to the pagecache 450 * add_to_page_cache_locked - add a locked page to the pagecache
451 * @page: page to add 451 * @page: page to add
452 * @mapping: the page's address_space 452 * @mapping: the page's address_space
453 * @offset: page index 453 * @offset: page index
454 * @gfp_mask: page allocation mode 454 * @gfp_mask: page allocation mode
455 * 455 *
456 * This function is used to add a page to the pagecache. It must be locked. 456 * This function is used to add a page to the pagecache. It must be locked.
457 * This function does not add the page to the LRU. The caller must do that. 457 * This function does not add the page to the LRU. The caller must do that.
458 */ 458 */
459 int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 459 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
460 pgoff_t offset, gfp_t gfp_mask) 460 pgoff_t offset, gfp_t gfp_mask)
461 { 461 {
462 int error; 462 int error;
463 463
464 VM_BUG_ON(!PageLocked(page)); 464 VM_BUG_ON(!PageLocked(page));
465 465
466 error = mem_cgroup_cache_charge(page, current->mm, 466 error = mem_cgroup_cache_charge(page, current->mm,
467 gfp_mask & GFP_RECLAIM_MASK); 467 gfp_mask & GFP_RECLAIM_MASK);
468 if (error) 468 if (error)
469 goto out; 469 goto out;
470 470
471 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 471 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
472 if (error == 0) { 472 if (error == 0) {
473 page_cache_get(page); 473 page_cache_get(page);
474 page->mapping = mapping; 474 page->mapping = mapping;
475 page->index = offset; 475 page->index = offset;
476 476
477 spin_lock_irq(&mapping->tree_lock); 477 spin_lock_irq(&mapping->tree_lock);
478 error = radix_tree_insert(&mapping->page_tree, offset, page); 478 error = radix_tree_insert(&mapping->page_tree, offset, page);
479 if (likely(!error)) { 479 if (likely(!error)) {
480 mapping->nrpages++; 480 mapping->nrpages++;
481 __inc_zone_page_state(page, NR_FILE_PAGES); 481 __inc_zone_page_state(page, NR_FILE_PAGES);
482 if (PageSwapBacked(page)) 482 if (PageSwapBacked(page))
483 __inc_zone_page_state(page, NR_SHMEM); 483 __inc_zone_page_state(page, NR_SHMEM);
484 spin_unlock_irq(&mapping->tree_lock); 484 spin_unlock_irq(&mapping->tree_lock);
485 } else { 485 } else {
486 page->mapping = NULL; 486 page->mapping = NULL;
487 /* Leave page->index set: truncation relies upon it */ 487 /* Leave page->index set: truncation relies upon it */
488 spin_unlock_irq(&mapping->tree_lock); 488 spin_unlock_irq(&mapping->tree_lock);
489 mem_cgroup_uncharge_cache_page(page); 489 mem_cgroup_uncharge_cache_page(page);
490 page_cache_release(page); 490 page_cache_release(page);
491 } 491 }
492 radix_tree_preload_end(); 492 radix_tree_preload_end();
493 } else 493 } else
494 mem_cgroup_uncharge_cache_page(page); 494 mem_cgroup_uncharge_cache_page(page);
495 out: 495 out:
496 return error; 496 return error;
497 } 497 }
498 EXPORT_SYMBOL(add_to_page_cache_locked); 498 EXPORT_SYMBOL(add_to_page_cache_locked);
499 499
500 int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 500 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
501 pgoff_t offset, gfp_t gfp_mask) 501 pgoff_t offset, gfp_t gfp_mask)
502 { 502 {
503 int ret; 503 int ret;
504 504
505 /* 505 /*
506 * Splice_read and readahead add shmem/tmpfs pages into the page cache 506 * Splice_read and readahead add shmem/tmpfs pages into the page cache
507 * before shmem_readpage has a chance to mark them as SwapBacked: they 507 * before shmem_readpage has a chance to mark them as SwapBacked: they
508 * need to go on the anon lru below, and mem_cgroup_cache_charge 508 * need to go on the anon lru below, and mem_cgroup_cache_charge
509 * (called in add_to_page_cache) needs to know where they're going too. 509 * (called in add_to_page_cache) needs to know where they're going too.
510 */ 510 */
511 if (mapping_cap_swap_backed(mapping)) 511 if (mapping_cap_swap_backed(mapping))
512 SetPageSwapBacked(page); 512 SetPageSwapBacked(page);
513 513
514 ret = add_to_page_cache(page, mapping, offset, gfp_mask); 514 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
515 if (ret == 0) { 515 if (ret == 0) {
516 if (page_is_file_cache(page)) 516 if (page_is_file_cache(page))
517 lru_cache_add_file(page); 517 lru_cache_add_file(page);
518 else 518 else
519 lru_cache_add_anon(page); 519 lru_cache_add_anon(page);
520 } 520 }
521 return ret; 521 return ret;
522 } 522 }
523 EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 523 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
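/*
 * Editor's sketch (hypothetical helper, not part of this file): add a fresh
 * page at @index the way readahead does.  On success the page is returned
 * locked (add_to_page_cache() locked it) and already on the LRU; the caller
 * fills it, marks it uptodate and unlocks it.  GFP_KERNEL is assumed here;
 * real callers usually derive the mask from mapping_gfp_mask().
 */
static struct page *example_add_new_page(struct address_space *mapping,
					 pgoff_t index)
{
	struct page *page = page_cache_alloc_cold(mapping);

	if (!page)
		return NULL;

	if (add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
		/* -EEXIST: somebody else added it first; otherwise OOM etc. */
		page_cache_release(page);
		return NULL;
	}
	return page;
}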
524 524
525 #ifdef CONFIG_NUMA 525 #ifdef CONFIG_NUMA
526 struct page *__page_cache_alloc(gfp_t gfp) 526 struct page *__page_cache_alloc(gfp_t gfp)
527 { 527 {
528 int n; 528 int n;
529 struct page *page; 529 struct page *page;
530 530
531 if (cpuset_do_page_mem_spread()) { 531 if (cpuset_do_page_mem_spread()) {
532 get_mems_allowed(); 532 get_mems_allowed();
533 n = cpuset_mem_spread_node(); 533 n = cpuset_mem_spread_node();
534 page = alloc_pages_exact_node(n, gfp, 0); 534 page = alloc_pages_exact_node(n, gfp, 0);
535 put_mems_allowed(); 535 put_mems_allowed();
536 return page; 536 return page;
537 } 537 }
538 return alloc_pages(gfp, 0); 538 return alloc_pages(gfp, 0);
539 } 539 }
540 EXPORT_SYMBOL(__page_cache_alloc); 540 EXPORT_SYMBOL(__page_cache_alloc);
541 #endif 541 #endif
542 542
543 /* 543 /*
544 * In order to wait for pages to become available there must be 544 * In order to wait for pages to become available there must be
545 * waitqueues associated with pages. By using a hash table of 545 * waitqueues associated with pages. By using a hash table of
546 * waitqueues where the bucket discipline is to maintain all 546 * waitqueues where the bucket discipline is to maintain all
547 * waiters on the same queue and wake all when any of the pages 547 * waiters on the same queue and wake all when any of the pages
548 * become available, and for the woken contexts to check to be 548 * become available, and for the woken contexts to check to be
549 * sure the appropriate page became available, this saves space 549 * sure the appropriate page became available, this saves space
550 * at a cost of "thundering herd" phenomena during rare hash 550 * at a cost of "thundering herd" phenomena during rare hash
551 * collisions. 551 * collisions.
552 */ 552 */
553 static wait_queue_head_t *page_waitqueue(struct page *page) 553 static wait_queue_head_t *page_waitqueue(struct page *page)
554 { 554 {
555 const struct zone *zone = page_zone(page); 555 const struct zone *zone = page_zone(page);
556 556
557 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; 557 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
558 } 558 }
559 559
560 static inline void wake_up_page(struct page *page, int bit) 560 static inline void wake_up_page(struct page *page, int bit)
561 { 561 {
562 __wake_up_bit(page_waitqueue(page), &page->flags, bit); 562 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
563 } 563 }
564 564
565 void wait_on_page_bit(struct page *page, int bit_nr) 565 void wait_on_page_bit(struct page *page, int bit_nr)
566 { 566 {
567 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 567 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
568 568
569 if (test_bit(bit_nr, &page->flags)) 569 if (test_bit(bit_nr, &page->flags))
570 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, 570 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
571 TASK_UNINTERRUPTIBLE); 571 TASK_UNINTERRUPTIBLE);
572 } 572 }
573 EXPORT_SYMBOL(wait_on_page_bit); 573 EXPORT_SYMBOL(wait_on_page_bit);
574 574
575 int wait_on_page_bit_killable(struct page *page, int bit_nr) 575 int wait_on_page_bit_killable(struct page *page, int bit_nr)
576 { 576 {
577 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 577 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
578 578
579 if (!test_bit(bit_nr, &page->flags)) 579 if (!test_bit(bit_nr, &page->flags))
580 return 0; 580 return 0;
581 581
582 return __wait_on_bit(page_waitqueue(page), &wait, 582 return __wait_on_bit(page_waitqueue(page), &wait,
583 sleep_on_page_killable, TASK_KILLABLE); 583 sleep_on_page_killable, TASK_KILLABLE);
584 } 584 }
585 585
586 /** 586 /**
587 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 587 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
588 * @page: Page defining the wait queue of interest 588 * @page: Page defining the wait queue of interest
589 * @waiter: Waiter to add to the queue 589 * @waiter: Waiter to add to the queue
590 * 590 *
591 * Add an arbitrary @waiter to the wait queue for the nominated @page. 591 * Add an arbitrary @waiter to the wait queue for the nominated @page.
592 */ 592 */
593 void add_page_wait_queue(struct page *page, wait_queue_t *waiter) 593 void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
594 { 594 {
595 wait_queue_head_t *q = page_waitqueue(page); 595 wait_queue_head_t *q = page_waitqueue(page);
596 unsigned long flags; 596 unsigned long flags;
597 597
598 spin_lock_irqsave(&q->lock, flags); 598 spin_lock_irqsave(&q->lock, flags);
599 __add_wait_queue(q, waiter); 599 __add_wait_queue(q, waiter);
600 spin_unlock_irqrestore(&q->lock, flags); 600 spin_unlock_irqrestore(&q->lock, flags);
601 } 601 }
602 EXPORT_SYMBOL_GPL(add_page_wait_queue); 602 EXPORT_SYMBOL_GPL(add_page_wait_queue);
603 603
604 /** 604 /**
605 * unlock_page - unlock a locked page 605 * unlock_page - unlock a locked page
606 * @page: the page 606 * @page: the page
607 * 607 *
608 * Unlocks the page and wakes up sleepers in wait_on_page_locked(). 608 * Unlocks the page and wakes up sleepers in wait_on_page_locked().
609 * Also wakes sleepers in wait_on_page_writeback() because the wakeup 609 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
610 * mechanism between PageLocked pages and PageWriteback pages is shared. 610 * mechanism between PageLocked pages and PageWriteback pages is shared.
611 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 611 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
612 * 612 *
613 * The mb is necessary to enforce ordering between the clear_bit and the read 613 * The mb is necessary to enforce ordering between the clear_bit and the read
614 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). 614 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
615 */ 615 */
616 void unlock_page(struct page *page) 616 void unlock_page(struct page *page)
617 { 617 {
618 VM_BUG_ON(!PageLocked(page)); 618 VM_BUG_ON(!PageLocked(page));
619 clear_bit_unlock(PG_locked, &page->flags); 619 clear_bit_unlock(PG_locked, &page->flags);
620 smp_mb__after_clear_bit(); 620 smp_mb__after_clear_bit();
621 wake_up_page(page, PG_locked); 621 wake_up_page(page, PG_locked);
622 } 622 }
623 EXPORT_SYMBOL(unlock_page); 623 EXPORT_SYMBOL(unlock_page);
624 624
625 /** 625 /**
626 * end_page_writeback - end writeback against a page 626 * end_page_writeback - end writeback against a page
627 * @page: the page 627 * @page: the page
628 */ 628 */
629 void end_page_writeback(struct page *page) 629 void end_page_writeback(struct page *page)
630 { 630 {
631 if (TestClearPageReclaim(page)) 631 if (TestClearPageReclaim(page))
632 rotate_reclaimable_page(page); 632 rotate_reclaimable_page(page);
633 633
634 if (!test_clear_page_writeback(page)) 634 if (!test_clear_page_writeback(page))
635 BUG(); 635 BUG();
636 636
637 smp_mb__after_clear_bit(); 637 smp_mb__after_clear_bit();
638 wake_up_page(page, PG_writeback); 638 wake_up_page(page, PG_writeback);
639 } 639 }
640 EXPORT_SYMBOL(end_page_writeback); 640 EXPORT_SYMBOL(end_page_writeback);
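/*
 * Editor's sketch of the write-completion side (hypothetical, loosely
 * modelled on buffer/bio end_io handlers; not part of this file): record
 * any error where filemap_fdatawait_range() above will find it, then wake
 * waiters in wait_on_page_writeback() via end_page_writeback().
 */
static void example_end_page_write(struct page *page, int uptodate)
{
	if (!uptodate) {
		SetPageError(page);
		if (page->mapping)
			mapping_set_error(page->mapping, -EIO);
	}
	end_page_writeback(page);
}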
641 641
642 /** 642 /**
643 * __lock_page - get a lock on the page, assuming we need to sleep to get it 643 * __lock_page - get a lock on the page, assuming we need to sleep to get it
644 * @page: the page to lock 644 * @page: the page to lock
645 */ 645 */
646 void __lock_page(struct page *page) 646 void __lock_page(struct page *page)
647 { 647 {
648 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 648 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
649 649
650 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, 650 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
651 TASK_UNINTERRUPTIBLE); 651 TASK_UNINTERRUPTIBLE);
652 } 652 }
653 EXPORT_SYMBOL(__lock_page); 653 EXPORT_SYMBOL(__lock_page);
654 654
655 int __lock_page_killable(struct page *page) 655 int __lock_page_killable(struct page *page)
656 { 656 {
657 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 657 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
658 658
659 return __wait_on_bit_lock(page_waitqueue(page), &wait, 659 return __wait_on_bit_lock(page_waitqueue(page), &wait,
660 sleep_on_page_killable, TASK_KILLABLE); 660 sleep_on_page_killable, TASK_KILLABLE);
661 } 661 }
662 EXPORT_SYMBOL_GPL(__lock_page_killable); 662 EXPORT_SYMBOL_GPL(__lock_page_killable);
663 663
664 int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 664 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
665 unsigned int flags) 665 unsigned int flags)
666 { 666 {
667 if (flags & FAULT_FLAG_ALLOW_RETRY) { 667 if (flags & FAULT_FLAG_ALLOW_RETRY) {
668 /* 668 /*
669 * CAUTION! In this case, mmap_sem is not released 669 * CAUTION! In this case, mmap_sem is not released
670 * even though we return 0. 670 * even though we return 0.
671 */ 671 */
672 if (flags & FAULT_FLAG_RETRY_NOWAIT) 672 if (flags & FAULT_FLAG_RETRY_NOWAIT)
673 return 0; 673 return 0;
674 674
675 up_read(&mm->mmap_sem); 675 up_read(&mm->mmap_sem);
676 if (flags & FAULT_FLAG_KILLABLE) 676 if (flags & FAULT_FLAG_KILLABLE)
677 wait_on_page_locked_killable(page); 677 wait_on_page_locked_killable(page);
678 else 678 else
679 wait_on_page_locked(page); 679 wait_on_page_locked(page);
680 return 0; 680 return 0;
681 } else { 681 } else {
682 if (flags & FAULT_FLAG_KILLABLE) { 682 if (flags & FAULT_FLAG_KILLABLE) {
683 int ret; 683 int ret;
684 684
685 ret = __lock_page_killable(page); 685 ret = __lock_page_killable(page);
686 if (ret) { 686 if (ret) {
687 up_read(&mm->mmap_sem); 687 up_read(&mm->mmap_sem);
688 return 0; 688 return 0;
689 } 689 }
690 } else 690 } else
691 __lock_page(page); 691 __lock_page(page);
692 return 1; 692 return 1;
693 } 693 }
694 } 694 }
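/*
 * Editor's sketch of the caller's contract (the helper below is
 * hypothetical, cf. the page fault path): a zero return means the page lock
 * was NOT taken and - unless FAULT_FLAG_RETRY_NOWAIT was set - mmap_sem has
 * been dropped, so the fault has to be retried from the top.
 */
static int example_fault_lock(struct page *page, struct mm_struct *mm,
			      unsigned int flags)
{
	if (!__lock_page_or_retry(page, mm, flags)) {
		page_cache_release(page);	/* drop the lookup reference */
		return VM_FAULT_RETRY;
	}
	/* page is locked here; use it, then unlock_page() */
	return 0;
}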
695 695
696 /** 696 /**
697 * find_get_page - find and get a page reference 697 * find_get_page - find and get a page reference
698 * @mapping: the address_space to search 698 * @mapping: the address_space to search
699 * @offset: the page index 699 * @offset: the page index
700 * 700 *
701 * Is there a pagecache struct page at the given (mapping, offset) tuple? 701 * Is there a pagecache struct page at the given (mapping, offset) tuple?
702 * If yes, increment its refcount and return it; if no, return NULL. 702 * If yes, increment its refcount and return it; if no, return NULL.
703 */ 703 */
704 struct page *find_get_page(struct address_space *mapping, pgoff_t offset) 704 struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
705 { 705 {
706 void **pagep; 706 void **pagep;
707 struct page *page; 707 struct page *page;
708 708
709 rcu_read_lock(); 709 rcu_read_lock();
710 repeat: 710 repeat:
711 page = NULL; 711 page = NULL;
712 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 712 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
713 if (pagep) { 713 if (pagep) {
714 page = radix_tree_deref_slot(pagep); 714 page = radix_tree_deref_slot(pagep);
715 if (unlikely(!page)) 715 if (unlikely(!page))
716 goto out; 716 goto out;
717 if (radix_tree_deref_retry(page)) 717 if (radix_tree_deref_retry(page))
718 goto repeat; 718 goto repeat;
719 719
720 if (!page_cache_get_speculative(page)) 720 if (!page_cache_get_speculative(page))
721 goto repeat; 721 goto repeat;
722 722
723 /* 723 /*
724 * Has the page moved? 724 * Has the page moved?
725 * This is part of the lockless pagecache protocol. See 725 * This is part of the lockless pagecache protocol. See
726 * include/linux/pagemap.h for details. 726 * include/linux/pagemap.h for details.
727 */ 727 */
728 if (unlikely(page != *pagep)) { 728 if (unlikely(page != *pagep)) {
729 page_cache_release(page); 729 page_cache_release(page);
730 goto repeat; 730 goto repeat;
731 } 731 }
732 } 732 }
733 out: 733 out:
734 rcu_read_unlock(); 734 rcu_read_unlock();
735 735
736 return page; 736 return page;
737 } 737 }
738 EXPORT_SYMBOL(find_get_page); 738 EXPORT_SYMBOL(find_get_page);
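/*
 * Editor's sketch (hypothetical helper, not part of this file): probe the
 * cache at @offset.  The reference find_get_page() takes must be dropped
 * with page_cache_release() once we are done looking at the page.
 */
static bool example_page_is_cached_uptodate(struct address_space *mapping,
					    pgoff_t offset)
{
	struct page *page = find_get_page(mapping, offset);
	bool ret = false;

	if (page) {
		ret = PageUptodate(page);
		page_cache_release(page);
	}
	return ret;
}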
739 739
740 /** 740 /**
741 * find_lock_page - locate, pin and lock a pagecache page 741 * find_lock_page - locate, pin and lock a pagecache page
742 * @mapping: the address_space to search 742 * @mapping: the address_space to search
743 * @offset: the page index 743 * @offset: the page index
744 * 744 *
745 * Locates the desired pagecache page, locks it, increments its reference 745 * Locates the desired pagecache page, locks it, increments its reference
746 * count and returns its address. 746 * count and returns its address.
747 * 747 *
748 * Returns NULL if the page was not present. find_lock_page() may sleep. 748 * Returns NULL if the page was not present. find_lock_page() may sleep.
749 */ 749 */
750 struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) 750 struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
751 { 751 {
752 struct page *page; 752 struct page *page;
753 753
754 repeat: 754 repeat:
755 page = find_get_page(mapping, offset); 755 page = find_get_page(mapping, offset);
756 if (page) { 756 if (page) {
757 lock_page(page); 757 lock_page(page);
758 /* Has the page been truncated? */ 758 /* Has the page been truncated? */
759 if (unlikely(page->mapping != mapping)) { 759 if (unlikely(page->mapping != mapping)) {
760 unlock_page(page); 760 unlock_page(page);
761 page_cache_release(page); 761 page_cache_release(page);
762 goto repeat; 762 goto repeat;
763 } 763 }
764 VM_BUG_ON(page->index != offset); 764 VM_BUG_ON(page->index != offset);
765 } 765 }
766 return page; 766 return page;
767 } 767 }
768 EXPORT_SYMBOL(find_lock_page); 768 EXPORT_SYMBOL(find_lock_page);
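/*
 * Editor's sketch (hypothetical helper, not part of this file): inspect a
 * cached page with the lock held.  find_lock_page() has already re-checked
 * page->mapping against truncation, so the caller only needs to unlock and
 * release when done.
 */
static bool example_page_has_private(struct address_space *mapping,
				     pgoff_t offset)
{
	struct page *page = find_lock_page(mapping, offset);
	bool ret = false;

	if (page) {
		ret = PagePrivate(page);
		unlock_page(page);
		page_cache_release(page);
	}
	return ret;
}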
769 769
770 /** 770 /**
771 * find_or_create_page - locate or add a pagecache page 771 * find_or_create_page - locate or add a pagecache page
772 * @mapping: the page's address_space 772 * @mapping: the page's address_space
773 * @index: the page's index into the mapping 773 * @index: the page's index into the mapping
774 * @gfp_mask: page allocation mode 774 * @gfp_mask: page allocation mode
775 * 775 *
776 * Locates a page in the pagecache. If the page is not present, a new page 776 * Locates a page in the pagecache. If the page is not present, a new page
777 * is allocated using @gfp_mask and is added to the pagecache and to the VM's 777 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
778 * LRU list. The returned page is locked and has its reference count 778 * LRU list. The returned page is locked and has its reference count
779 * incremented. 779 * incremented.
780 * 780 *
781 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic 781 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
782 * allocation! 782 * allocation!
783 * 783 *
784 * find_or_create_page() returns the desired page's address, or NULL on 784 * find_or_create_page() returns the desired page's address, or NULL on
785 * memory exhaustion. 785 * memory exhaustion.
786 */ 786 */
787 struct page *find_or_create_page(struct address_space *mapping, 787 struct page *find_or_create_page(struct address_space *mapping,
788 pgoff_t index, gfp_t gfp_mask) 788 pgoff_t index, gfp_t gfp_mask)
789 { 789 {
790 struct page *page; 790 struct page *page;
791 int err; 791 int err;
792 repeat: 792 repeat:
793 page = find_lock_page(mapping, index); 793 page = find_lock_page(mapping, index);
794 if (!page) { 794 if (!page) {
795 page = __page_cache_alloc(gfp_mask); 795 page = __page_cache_alloc(gfp_mask);
796 if (!page) 796 if (!page)
797 return NULL; 797 return NULL;
798 /* 798 /*
799 * We want a regular kernel memory (not highmem or DMA etc) 799 * We want a regular kernel memory (not highmem or DMA etc)
800 * allocation for the radix tree nodes, but we need to honour 800 * allocation for the radix tree nodes, but we need to honour
801 * the context-specific requirements the caller has asked for. 801 * the context-specific requirements the caller has asked for.
802 * GFP_RECLAIM_MASK collects those requirements. 802 * GFP_RECLAIM_MASK collects those requirements.
803 */ 803 */
804 err = add_to_page_cache_lru(page, mapping, index, 804 err = add_to_page_cache_lru(page, mapping, index,
805 (gfp_mask & GFP_RECLAIM_MASK)); 805 (gfp_mask & GFP_RECLAIM_MASK));
806 if (unlikely(err)) { 806 if (unlikely(err)) {
807 page_cache_release(page); 807 page_cache_release(page);
808 page = NULL; 808 page = NULL;
809 if (err == -EEXIST) 809 if (err == -EEXIST)
810 goto repeat; 810 goto repeat;
811 } 811 }
812 } 812 }
813 return page; 813 return page;
814 } 814 }
815 EXPORT_SYMBOL(find_or_create_page); 815 EXPORT_SYMBOL(find_or_create_page);
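/*
 * Editor's sketch (hypothetical helper, not part of this file):
 * grab_cache_page() in pagemap.h is essentially this call with
 * mapping_gfp_mask().  The page comes back locked with an elevated refcount
 * whether it was found or newly created.
 */
static int example_modify_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;
	/* ... bring the page uptodate and modify it here ... */
	set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);
	return 0;
}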
816 816
817 /** 817 /**
818 * find_get_pages - gang pagecache lookup 818 * find_get_pages - gang pagecache lookup
819 * @mapping: The address_space to search 819 * @mapping: The address_space to search
820 * @start: The starting page index 820 * @start: The starting page index
821 * @nr_pages: The maximum number of pages 821 * @nr_pages: The maximum number of pages
822 * @pages: Where the resulting pages are placed 822 * @pages: Where the resulting pages are placed
823 * 823 *
824 * find_get_pages() will search for and return a group of up to 824 * find_get_pages() will search for and return a group of up to
825 * @nr_pages pages in the mapping. The pages are placed at @pages. 825 * @nr_pages pages in the mapping. The pages are placed at @pages.
826 * find_get_pages() takes a reference against the returned pages. 826 * find_get_pages() takes a reference against the returned pages.
827 * 827 *
828 * The search returns a group of mapping-contiguous pages with ascending 828 * The search returns a group of mapping-contiguous pages with ascending
829 * indexes. There may be holes in the indices due to not-present pages. 829 * indexes. There may be holes in the indices due to not-present pages.
830 * 830 *
831 * find_get_pages() returns the number of pages which were found. 831 * find_get_pages() returns the number of pages which were found.
832 */ 832 */
833 unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 833 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
834 unsigned int nr_pages, struct page **pages) 834 unsigned int nr_pages, struct page **pages)
835 { 835 {
836 unsigned int i; 836 unsigned int i;
837 unsigned int ret; 837 unsigned int ret;
838 unsigned int nr_found; 838 unsigned int nr_found;
839 839
840 rcu_read_lock(); 840 rcu_read_lock();
841 restart: 841 restart:
842 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 842 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
843 (void ***)pages, start, nr_pages); 843 (void ***)pages, start, nr_pages);
844 ret = 0; 844 ret = 0;
845 for (i = 0; i < nr_found; i++) { 845 for (i = 0; i < nr_found; i++) {
846 struct page *page; 846 struct page *page;
847 repeat: 847 repeat:
848 page = radix_tree_deref_slot((void **)pages[i]); 848 page = radix_tree_deref_slot((void **)pages[i]);
849 if (unlikely(!page)) 849 if (unlikely(!page))
850 continue; 850 continue;
851 851
852 /* 852 /*
853 * This can only trigger when the entry at index 0 moves out 853 * This can only trigger when the entry at index 0 moves out
854 * of or back to the root: none yet gotten, safe to restart. 854 * of or back to the root: none yet gotten, safe to restart.
855 */ 855 */
856 if (radix_tree_deref_retry(page)) { 856 if (radix_tree_deref_retry(page)) {
857 WARN_ON(start | i); 857 WARN_ON(start | i);
858 goto restart; 858 goto restart;
859 } 859 }
860 860
861 if (!page_cache_get_speculative(page)) 861 if (!page_cache_get_speculative(page))
862 goto repeat; 862 goto repeat;
863 863
864 /* Has the page moved? */ 864 /* Has the page moved? */
865 if (unlikely(page != *((void **)pages[i]))) { 865 if (unlikely(page != *((void **)pages[i]))) {
866 page_cache_release(page); 866 page_cache_release(page);
867 goto repeat; 867 goto repeat;
868 } 868 }
869 869
870 pages[ret] = page; 870 pages[ret] = page;
871 ret++; 871 ret++;
872 } 872 }
873 873
874 /* 874 /*
875 * If all entries were removed before we could secure them, 875 * If all entries were removed before we could secure them,
876 * try again, because callers stop trying once 0 is returned. 876 * try again, because callers stop trying once 0 is returned.
877 */ 877 */
878 if (unlikely(!ret && nr_found)) 878 if (unlikely(!ret && nr_found))
879 goto restart; 879 goto restart;
880 rcu_read_unlock(); 880 rcu_read_unlock();
881 return ret; 881 return ret;
882 } 882 }
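/*
 * Editor's sketch (hypothetical, not part of this file): the usual
 * gang-lookup walk that pagevec_lookup() wraps.  Every returned page
 * carries a reference the caller must drop, and the indexes may have
 * holes, so advance past the last page actually returned.
 */
static void example_walk_mapping(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int nr, i;

	while ((nr = find_get_pages(mapping, index, 16, pages)) != 0) {
		index = pages[nr - 1]->index + 1;
		for (i = 0; i < nr; i++) {
			/* ... look at pages[i] ... */
			page_cache_release(pages[i]);
		}
	}
}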
883 883
884 /** 884 /**
885 * find_get_pages_contig - gang contiguous pagecache lookup 885 * find_get_pages_contig - gang contiguous pagecache lookup
886 * @mapping: The address_space to search 886 * @mapping: The address_space to search
887 * @index: The starting page index 887 * @index: The starting page index
888 * @nr_pages: The maximum number of pages 888 * @nr_pages: The maximum number of pages
889 * @pages: Where the resulting pages are placed 889 * @pages: Where the resulting pages are placed
890 * 890 *
891 * find_get_pages_contig() works exactly like find_get_pages(), except 891 * find_get_pages_contig() works exactly like find_get_pages(), except
892 * that the returned number of pages are guaranteed to be contiguous. 892 * that the returned number of pages are guaranteed to be contiguous.
893 * 893 *
894 * find_get_pages_contig() returns the number of pages which were found. 894 * find_get_pages_contig() returns the number of pages which were found.
895 */ 895 */
896 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 896 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
897 unsigned int nr_pages, struct page **pages) 897 unsigned int nr_pages, struct page **pages)
898 { 898 {
899 unsigned int i; 899 unsigned int i;
900 unsigned int ret; 900 unsigned int ret;
901 unsigned int nr_found; 901 unsigned int nr_found;
902 902
903 rcu_read_lock(); 903 rcu_read_lock();
904 restart: 904 restart:
905 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 905 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
906 (void ***)pages, index, nr_pages); 906 (void ***)pages, index, nr_pages);
907 ret = 0; 907 ret = 0;
908 for (i = 0; i < nr_found; i++) { 908 for (i = 0; i < nr_found; i++) {
909 struct page *page; 909 struct page *page;
910 repeat: 910 repeat:
911 page = radix_tree_deref_slot((void **)pages[i]); 911 page = radix_tree_deref_slot((void **)pages[i]);
912 if (unlikely(!page)) 912 if (unlikely(!page))
913 continue; 913 continue;
914 914
915 /* 915 /*
916 * This can only trigger when the entry at index 0 moves out 916 * This can only trigger when the entry at index 0 moves out
917 * of or back to the root: none yet gotten, safe to restart. 917 * of or back to the root: none yet gotten, safe to restart.
918 */ 918 */
919 if (radix_tree_deref_retry(page)) 919 if (radix_tree_deref_retry(page))
920 goto restart; 920 goto restart;
921 921
922 if (!page_cache_get_speculative(page)) 922 if (!page_cache_get_speculative(page))
923 goto repeat; 923 goto repeat;
924 924
925 /* Has the page moved? */ 925 /* Has the page moved? */
926 if (unlikely(page != *((void **)pages[i]))) { 926 if (unlikely(page != *((void **)pages[i]))) {
927 page_cache_release(page); 927 page_cache_release(page);
928 goto repeat; 928 goto repeat;
929 } 929 }
930 930
931 /* 931 /*
932 * must check mapping and index after taking the ref. 932 * must check mapping and index after taking the ref.
933 * otherwise we can get both false positives and false 933 * otherwise we can get both false positives and false
934 * negatives, which is just confusing to the caller. 934 * negatives, which is just confusing to the caller.
935 */ 935 */
936 if (page->mapping == NULL || page->index != index) { 936 if (page->mapping == NULL || page->index != index) {
937 page_cache_release(page); 937 page_cache_release(page);
938 break; 938 break;
939 } 939 }
940 940
941 pages[ret] = page; 941 pages[ret] = page;
942 ret++; 942 ret++;
943 index++; 943 index++;
944 } 944 }
945 rcu_read_unlock(); 945 rcu_read_unlock();
946 return ret; 946 return ret;
947 } 947 }
948 EXPORT_SYMBOL(find_get_pages_contig); 948 EXPORT_SYMBOL(find_get_pages_contig);
949 949
950 /** 950 /**
951 * find_get_pages_tag - find and return pages that match @tag 951 * find_get_pages_tag - find and return pages that match @tag
952 * @mapping: the address_space to search 952 * @mapping: the address_space to search
953 * @index: the starting page index 953 * @index: the starting page index
954 * @tag: the tag index 954 * @tag: the tag index
955 * @nr_pages: the maximum number of pages 955 * @nr_pages: the maximum number of pages
956 * @pages: where the resulting pages are placed 956 * @pages: where the resulting pages are placed
957 * 957 *
958 * Like find_get_pages, except we only return pages which are tagged with 958 * Like find_get_pages, except we only return pages which are tagged with
959 * @tag. We update @index to index the next page for the traversal. 959 * @tag. We update @index to index the next page for the traversal.
960 */ 960 */
961 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 961 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
962 int tag, unsigned int nr_pages, struct page **pages) 962 int tag, unsigned int nr_pages, struct page **pages)
963 { 963 {
964 unsigned int i; 964 unsigned int i;
965 unsigned int ret; 965 unsigned int ret;
966 unsigned int nr_found; 966 unsigned int nr_found;
967 967
968 rcu_read_lock(); 968 rcu_read_lock();
969 restart: 969 restart:
970 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, 970 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
971 (void ***)pages, *index, nr_pages, tag); 971 (void ***)pages, *index, nr_pages, tag);
972 ret = 0; 972 ret = 0;
973 for (i = 0; i < nr_found; i++) { 973 for (i = 0; i < nr_found; i++) {
974 struct page *page; 974 struct page *page;
975 repeat: 975 repeat:
976 page = radix_tree_deref_slot((void **)pages[i]); 976 page = radix_tree_deref_slot((void **)pages[i]);
977 if (unlikely(!page)) 977 if (unlikely(!page))
978 continue; 978 continue;
979 979
980 /* 980 /*
981 * This can only trigger when the entry at index 0 moves out 981 * This can only trigger when the entry at index 0 moves out
982 * of or back to the root: none yet gotten, safe to restart. 982 * of or back to the root: none yet gotten, safe to restart.
983 */ 983 */
984 if (radix_tree_deref_retry(page)) 984 if (radix_tree_deref_retry(page))
985 goto restart; 985 goto restart;
986 986
987 if (!page_cache_get_speculative(page)) 987 if (!page_cache_get_speculative(page))
988 goto repeat; 988 goto repeat;
989 989
990 /* Has the page moved? */ 990 /* Has the page moved? */
991 if (unlikely(page != *((void **)pages[i]))) { 991 if (unlikely(page != *((void **)pages[i]))) {
992 page_cache_release(page); 992 page_cache_release(page);
993 goto repeat; 993 goto repeat;
994 } 994 }
995 995
996 pages[ret] = page; 996 pages[ret] = page;
997 ret++; 997 ret++;
998 } 998 }
999 999
1000 /* 1000 /*
1001 * If all entries were removed before we could secure them, 1001 * If all entries were removed before we could secure them,
1002 * try again, because callers stop trying once 0 is returned. 1002 * try again, because callers stop trying once 0 is returned.
1003 */ 1003 */
1004 if (unlikely(!ret && nr_found)) 1004 if (unlikely(!ret && nr_found))
1005 goto restart; 1005 goto restart;
1006 rcu_read_unlock(); 1006 rcu_read_unlock();
1007 1007
1008 if (ret) 1008 if (ret)
1009 *index = pages[ret - 1]->index + 1; 1009 *index = pages[ret - 1]->index + 1;
1010 1010
1011 return ret; 1011 return ret;
1012 } 1012 }
1013 EXPORT_SYMBOL(find_get_pages_tag); 1013 EXPORT_SYMBOL(find_get_pages_tag);
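/*
 * Editor's sketch (hypothetical helper, not part of this file): walk a
 * mapping by radix-tree tag, here counting dirty pages.  Note that
 * find_get_pages_tag() advances *index itself; pagevec_lookup_tag() (used
 * by filemap_fdatawait_range() above) is the usual wrapper around it.
 */
static unsigned long example_count_dirty_pages(struct address_space *mapping)
{
	struct page *pages[PAGEVEC_SIZE];
	pgoff_t index = 0;
	unsigned long count = 0;
	unsigned int nr, i;

	while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
					PAGEVEC_SIZE, pages)) != 0) {
		for (i = 0; i < nr; i++) {
			if (PageDirty(pages[i]))
				count++;
			page_cache_release(pages[i]);
		}
	}
	return count;
}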
1014 1014
1015 /** 1015 /**
1016 * grab_cache_page_nowait - returns locked page at given index in given cache 1016 * grab_cache_page_nowait - returns locked page at given index in given cache
1017 * @mapping: target address_space 1017 * @mapping: target address_space
1018 * @index: the page index 1018 * @index: the page index
1019 * 1019 *
1020 * Same as grab_cache_page(), but do not wait if the page is unavailable. 1020 * Same as grab_cache_page(), but do not wait if the page is unavailable.
1021 * This is intended for speculative data generators, where the data can 1021 * This is intended for speculative data generators, where the data can
1022 * be regenerated if the page couldn't be grabbed. This routine should 1022 * be regenerated if the page couldn't be grabbed. This routine should
1023 * be safe to call while holding the lock for another page. 1023 * be safe to call while holding the lock for another page.
1024 * 1024 *
1025 * Clear __GFP_FS when allocating the page to avoid recursion into the fs 1025 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
1026 * and deadlock against the caller's locked page. 1026 * and deadlock against the caller's locked page.
1027 */ 1027 */
1028 struct page * 1028 struct page *
1029 grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) 1029 grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
1030 { 1030 {
1031 struct page *page = find_get_page(mapping, index); 1031 struct page *page = find_get_page(mapping, index);
1032 1032
1033 if (page) { 1033 if (page) {
1034 if (trylock_page(page)) 1034 if (trylock_page(page))
1035 return page; 1035 return page;
1036 page_cache_release(page); 1036 page_cache_release(page);
1037 return NULL; 1037 return NULL;
1038 } 1038 }
1039 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); 1039 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
1040 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { 1040 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
1041 page_cache_release(page); 1041 page_cache_release(page);
1042 page = NULL; 1042 page = NULL;
1043 } 1043 }
1044 return page; 1044 return page;
1045 } 1045 }
1046 EXPORT_SYMBOL(grab_cache_page_nowait); 1046 EXPORT_SYMBOL(grab_cache_page_nowait);
1047 1047
1048 /* 1048 /*
1049 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1049 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
1050 * a _large_ part of the i/o request. Imagine the worst scenario: 1050 * a _large_ part of the i/o request. Imagine the worst scenario:
1051 * 1051 *
1052 * ---R__________________________________________B__________ 1052 * ---R__________________________________________B__________
1053 * ^ reading here ^ bad block (assume 4k) 1053 * ^ reading here ^ bad block (assume 4k)
1054 * 1054 *
1055 * read(R) => miss => readahead(R...B) => media error => frustrating retries 1055 * read(R) => miss => readahead(R...B) => media error => frustrating retries
1056 * => failing the whole request => read(R) => read(R+1) => 1056 * => failing the whole request => read(R) => read(R+1) =>
1057 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => 1057 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
1058 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => 1058 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
1059 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 1059 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
1060 * 1060 *
1061 * It is going insane. Fix it by quickly scaling down the readahead size. 1061 * It is going insane. Fix it by quickly scaling down the readahead size.
1062 */ 1062 */
1063 static void shrink_readahead_size_eio(struct file *filp, 1063 static void shrink_readahead_size_eio(struct file *filp,
1064 struct file_ra_state *ra) 1064 struct file_ra_state *ra)
1065 { 1065 {
1066 ra->ra_pages /= 4; 1066 ra->ra_pages /= 4;
1067 } 1067 }
1068 1068
1069 /** 1069 /**
1070 * do_generic_file_read - generic file read routine 1070 * do_generic_file_read - generic file read routine
1071 * @filp: the file to read 1071 * @filp: the file to read
1072 * @ppos: current file position 1072 * @ppos: current file position
1073 * @desc: read_descriptor 1073 * @desc: read_descriptor
1074 * @actor: read method 1074 * @actor: read method
1075 * 1075 *
1076 * This is a generic file read routine, and uses the 1076 * This is a generic file read routine, and uses the
1077 * mapping->a_ops->readpage() function for the actual low-level stuff. 1077 * mapping->a_ops->readpage() function for the actual low-level stuff.
1078 * 1078 *
1079 * This is really ugly. But the goto's actually try to clarify some 1079 * This is really ugly. But the goto's actually try to clarify some
1080 * of the logic when it comes to error handling etc. 1080 * of the logic when it comes to error handling etc.
1081 */ 1081 */
1082 static void do_generic_file_read(struct file *filp, loff_t *ppos, 1082 static void do_generic_file_read(struct file *filp, loff_t *ppos,
1083 read_descriptor_t *desc, read_actor_t actor) 1083 read_descriptor_t *desc, read_actor_t actor)
1084 { 1084 {
1085 struct address_space *mapping = filp->f_mapping; 1085 struct address_space *mapping = filp->f_mapping;
1086 struct inode *inode = mapping->host; 1086 struct inode *inode = mapping->host;
1087 struct file_ra_state *ra = &filp->f_ra; 1087 struct file_ra_state *ra = &filp->f_ra;
1088 pgoff_t index; 1088 pgoff_t index;
1089 pgoff_t last_index; 1089 pgoff_t last_index;
1090 pgoff_t prev_index; 1090 pgoff_t prev_index;
1091 unsigned long offset; /* offset into pagecache page */ 1091 unsigned long offset; /* offset into pagecache page */
1092 unsigned int prev_offset; 1092 unsigned int prev_offset;
1093 int error; 1093 int error;
1094 1094
1095 index = *ppos >> PAGE_CACHE_SHIFT; 1095 index = *ppos >> PAGE_CACHE_SHIFT;
1096 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; 1096 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
1097 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); 1097 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
1098 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 1098 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
1099 offset = *ppos & ~PAGE_CACHE_MASK; 1099 offset = *ppos & ~PAGE_CACHE_MASK;
1100 1100
1101 for (;;) { 1101 for (;;) {
1102 struct page *page; 1102 struct page *page;
1103 pgoff_t end_index; 1103 pgoff_t end_index;
1104 loff_t isize; 1104 loff_t isize;
1105 unsigned long nr, ret; 1105 unsigned long nr, ret;
1106 1106
1107 cond_resched(); 1107 cond_resched();
1108 find_page: 1108 find_page:
1109 page = find_get_page(mapping, index); 1109 page = find_get_page(mapping, index);
1110 if (!page) { 1110 if (!page) {
1111 page_cache_sync_readahead(mapping, 1111 page_cache_sync_readahead(mapping,
1112 ra, filp, 1112 ra, filp,
1113 index, last_index - index); 1113 index, last_index - index);
1114 page = find_get_page(mapping, index); 1114 page = find_get_page(mapping, index);
1115 if (unlikely(page == NULL)) 1115 if (unlikely(page == NULL))
1116 goto no_cached_page; 1116 goto no_cached_page;
1117 } 1117 }
1118 if (PageReadahead(page)) { 1118 if (PageReadahead(page)) {
1119 page_cache_async_readahead(mapping, 1119 page_cache_async_readahead(mapping,
1120 ra, filp, page, 1120 ra, filp, page,
1121 index, last_index - index); 1121 index, last_index - index);
1122 } 1122 }
1123 if (!PageUptodate(page)) { 1123 if (!PageUptodate(page)) {
1124 if (inode->i_blkbits == PAGE_CACHE_SHIFT || 1124 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1125 !mapping->a_ops->is_partially_uptodate) 1125 !mapping->a_ops->is_partially_uptodate)
1126 goto page_not_up_to_date; 1126 goto page_not_up_to_date;
1127 if (!trylock_page(page)) 1127 if (!trylock_page(page))
1128 goto page_not_up_to_date; 1128 goto page_not_up_to_date;
1129 /* Did it get truncated before we got the lock? */ 1129 /* Did it get truncated before we got the lock? */
1130 if (!page->mapping) 1130 if (!page->mapping)
1131 goto page_not_up_to_date_locked; 1131 goto page_not_up_to_date_locked;
1132 if (!mapping->a_ops->is_partially_uptodate(page, 1132 if (!mapping->a_ops->is_partially_uptodate(page,
1133 desc, offset)) 1133 desc, offset))
1134 goto page_not_up_to_date_locked; 1134 goto page_not_up_to_date_locked;
1135 unlock_page(page); 1135 unlock_page(page);
1136 } 1136 }
1137 page_ok: 1137 page_ok:
1138 /* 1138 /*
1139 * i_size must be checked after we know the page is Uptodate. 1139 * i_size must be checked after we know the page is Uptodate.
1140 * 1140 *
1141 * Checking i_size after the PageUptodate check allows us to calculate 1141 * Checking i_size after the PageUptodate check allows us to calculate
1142 * the correct value for "nr", which means the zero-filled 1142 * the correct value for "nr", which means the zero-filled
1143 * part of the page is not copied back to userspace (unless 1143 * part of the page is not copied back to userspace (unless
1144 * another truncate extends the file - this is desired though). 1144 * another truncate extends the file - this is desired though).
1145 */ 1145 */
1146 1146
1147 isize = i_size_read(inode); 1147 isize = i_size_read(inode);
1148 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 1148 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1149 if (unlikely(!isize || index > end_index)) { 1149 if (unlikely(!isize || index > end_index)) {
1150 page_cache_release(page); 1150 page_cache_release(page);
1151 goto out; 1151 goto out;
1152 } 1152 }
1153 1153
1154 /* nr is the maximum number of bytes to copy from this page */ 1154 /* nr is the maximum number of bytes to copy from this page */
1155 nr = PAGE_CACHE_SIZE; 1155 nr = PAGE_CACHE_SIZE;
1156 if (index == end_index) { 1156 if (index == end_index) {
1157 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 1157 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1158 if (nr <= offset) { 1158 if (nr <= offset) {
1159 page_cache_release(page); 1159 page_cache_release(page);
1160 goto out; 1160 goto out;
1161 } 1161 }
1162 } 1162 }
1163 nr = nr - offset; 1163 nr = nr - offset;
1164 1164
1165 /* If users can be writing to this page using arbitrary 1165 /* If users can be writing to this page using arbitrary
1166 * virtual addresses, take care about potential aliasing 1166 * virtual addresses, take care about potential aliasing
1167 * before reading the page on the kernel side. 1167 * before reading the page on the kernel side.
1168 */ 1168 */
1169 if (mapping_writably_mapped(mapping)) 1169 if (mapping_writably_mapped(mapping))
1170 flush_dcache_page(page); 1170 flush_dcache_page(page);
1171 1171
1172 /* 1172 /*
1173 * When a sequential read accesses a page several times, 1173 * When a sequential read accesses a page several times,
1174 * only mark it as accessed the first time. 1174 * only mark it as accessed the first time.
1175 */ 1175 */
1176 if (prev_index != index || offset != prev_offset) 1176 if (prev_index != index || offset != prev_offset)
1177 mark_page_accessed(page); 1177 mark_page_accessed(page);
1178 prev_index = index; 1178 prev_index = index;
1179 1179
1180 /* 1180 /*
1181 * Ok, we have the page, and it's up-to-date, so 1181 * Ok, we have the page, and it's up-to-date, so
1182 * now we can copy it to user space... 1182 * now we can copy it to user space...
1183 * 1183 *
1184 * The actor routine returns how many bytes were actually used.. 1184 * The actor routine returns how many bytes were actually used..
1185 * NOTE! This may not be the same as how much of a user buffer 1185 * NOTE! This may not be the same as how much of a user buffer
1186 * we filled up (we may be padding etc), so we can only update 1186 * we filled up (we may be padding etc), so we can only update
1187 * "pos" here (the actor routine has to update the user buffer 1187 * "pos" here (the actor routine has to update the user buffer
1188 * pointers and the remaining count). 1188 * pointers and the remaining count).
1189 */ 1189 */
1190 ret = actor(desc, page, offset, nr); 1190 ret = actor(desc, page, offset, nr);
1191 offset += ret; 1191 offset += ret;
1192 index += offset >> PAGE_CACHE_SHIFT; 1192 index += offset >> PAGE_CACHE_SHIFT;
1193 offset &= ~PAGE_CACHE_MASK; 1193 offset &= ~PAGE_CACHE_MASK;
1194 prev_offset = offset; 1194 prev_offset = offset;
1195 1195
1196 page_cache_release(page); 1196 page_cache_release(page);
1197 if (ret == nr && desc->count) 1197 if (ret == nr && desc->count)
1198 continue; 1198 continue;
1199 goto out; 1199 goto out;
1200 1200
1201 page_not_up_to_date: 1201 page_not_up_to_date:
1202 /* Get exclusive access to the page ... */ 1202 /* Get exclusive access to the page ... */
1203 error = lock_page_killable(page); 1203 error = lock_page_killable(page);
1204 if (unlikely(error)) 1204 if (unlikely(error))
1205 goto readpage_error; 1205 goto readpage_error;
1206 1206
1207 page_not_up_to_date_locked: 1207 page_not_up_to_date_locked:
1208 /* Did it get truncated before we got the lock? */ 1208 /* Did it get truncated before we got the lock? */
1209 if (!page->mapping) { 1209 if (!page->mapping) {
1210 unlock_page(page); 1210 unlock_page(page);
1211 page_cache_release(page); 1211 page_cache_release(page);
1212 continue; 1212 continue;
1213 } 1213 }
1214 1214
1215 /* Did somebody else fill it already? */ 1215 /* Did somebody else fill it already? */
1216 if (PageUptodate(page)) { 1216 if (PageUptodate(page)) {
1217 unlock_page(page); 1217 unlock_page(page);
1218 goto page_ok; 1218 goto page_ok;
1219 } 1219 }
1220 1220
1221 readpage: 1221 readpage:
1222 /* 1222 /*
1223 * A previous I/O error may have been due to temporary 1223 * A previous I/O error may have been due to temporary
1224 * failures, e.g. multipath errors. 1224 * failures, e.g. multipath errors.
1225 * PG_error will be set again if readpage fails. 1225 * PG_error will be set again if readpage fails.
1226 */ 1226 */
1227 ClearPageError(page); 1227 ClearPageError(page);
1228 /* Start the actual read. The read will unlock the page. */ 1228 /* Start the actual read. The read will unlock the page. */
1229 error = mapping->a_ops->readpage(filp, page); 1229 error = mapping->a_ops->readpage(filp, page);
1230 1230
1231 if (unlikely(error)) { 1231 if (unlikely(error)) {
1232 if (error == AOP_TRUNCATED_PAGE) { 1232 if (error == AOP_TRUNCATED_PAGE) {
1233 page_cache_release(page); 1233 page_cache_release(page);
1234 goto find_page; 1234 goto find_page;
1235 } 1235 }
1236 goto readpage_error; 1236 goto readpage_error;
1237 } 1237 }
1238 1238
1239 if (!PageUptodate(page)) { 1239 if (!PageUptodate(page)) {
1240 error = lock_page_killable(page); 1240 error = lock_page_killable(page);
1241 if (unlikely(error)) 1241 if (unlikely(error))
1242 goto readpage_error; 1242 goto readpage_error;
1243 if (!PageUptodate(page)) { 1243 if (!PageUptodate(page)) {
1244 if (page->mapping == NULL) { 1244 if (page->mapping == NULL) {
1245 /* 1245 /*
1246 * invalidate_mapping_pages got it 1246 * invalidate_mapping_pages got it
1247 */ 1247 */
1248 unlock_page(page); 1248 unlock_page(page);
1249 page_cache_release(page); 1249 page_cache_release(page);
1250 goto find_page; 1250 goto find_page;
1251 } 1251 }
1252 unlock_page(page); 1252 unlock_page(page);
1253 shrink_readahead_size_eio(filp, ra); 1253 shrink_readahead_size_eio(filp, ra);
1254 error = -EIO; 1254 error = -EIO;
1255 goto readpage_error; 1255 goto readpage_error;
1256 } 1256 }
1257 unlock_page(page); 1257 unlock_page(page);
1258 } 1258 }
1259 1259
1260 goto page_ok; 1260 goto page_ok;
1261 1261
1262 readpage_error: 1262 readpage_error:
1263 /* UHHUH! A synchronous read error occurred. Report it */ 1263 /* UHHUH! A synchronous read error occurred. Report it */
1264 desc->error = error; 1264 desc->error = error;
1265 page_cache_release(page); 1265 page_cache_release(page);
1266 goto out; 1266 goto out;
1267 1267
1268 no_cached_page: 1268 no_cached_page:
1269 /* 1269 /*
1270 * Ok, it wasn't cached, so we need to create a new 1270 * Ok, it wasn't cached, so we need to create a new
1271 * page.. 1271 * page..
1272 */ 1272 */
1273 page = page_cache_alloc_cold(mapping); 1273 page = page_cache_alloc_cold(mapping);
1274 if (!page) { 1274 if (!page) {
1275 desc->error = -ENOMEM; 1275 desc->error = -ENOMEM;
1276 goto out; 1276 goto out;
1277 } 1277 }
1278 error = add_to_page_cache_lru(page, mapping, 1278 error = add_to_page_cache_lru(page, mapping,
1279 index, GFP_KERNEL); 1279 index, GFP_KERNEL);
1280 if (error) { 1280 if (error) {
1281 page_cache_release(page); 1281 page_cache_release(page);
1282 if (error == -EEXIST) 1282 if (error == -EEXIST)
1283 goto find_page; 1283 goto find_page;
1284 desc->error = error; 1284 desc->error = error;
1285 goto out; 1285 goto out;
1286 } 1286 }
1287 goto readpage; 1287 goto readpage;
1288 } 1288 }
1289 1289
1290 out: 1290 out:
1291 ra->prev_pos = prev_index; 1291 ra->prev_pos = prev_index;
1292 ra->prev_pos <<= PAGE_CACHE_SHIFT; 1292 ra->prev_pos <<= PAGE_CACHE_SHIFT;
1293 ra->prev_pos |= prev_offset; 1293 ra->prev_pos |= prev_offset;
1294 1294
1295 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; 1295 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1296 file_accessed(filp); 1296 file_accessed(filp);
1297 } 1297 }
1298 1298
1299 int file_read_actor(read_descriptor_t *desc, struct page *page, 1299 int file_read_actor(read_descriptor_t *desc, struct page *page,
1300 unsigned long offset, unsigned long size) 1300 unsigned long offset, unsigned long size)
1301 { 1301 {
1302 char *kaddr; 1302 char *kaddr;
1303 unsigned long left, count = desc->count; 1303 unsigned long left, count = desc->count;
1304 1304
1305 if (size > count) 1305 if (size > count)
1306 size = count; 1306 size = count;
1307 1307
1308 /* 1308 /*
1309 * Faults on the destination of a read are common, so do it before 1309 * Faults on the destination of a read are common, so do it before
1310 * taking the kmap. 1310 * taking the kmap.
1311 */ 1311 */
1312 if (!fault_in_pages_writeable(desc->arg.buf, size)) { 1312 if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1313 kaddr = kmap_atomic(page, KM_USER0); 1313 kaddr = kmap_atomic(page, KM_USER0);
1314 left = __copy_to_user_inatomic(desc->arg.buf, 1314 left = __copy_to_user_inatomic(desc->arg.buf,
1315 kaddr + offset, size); 1315 kaddr + offset, size);
1316 kunmap_atomic(kaddr, KM_USER0); 1316 kunmap_atomic(kaddr, KM_USER0);
1317 if (left == 0) 1317 if (left == 0)
1318 goto success; 1318 goto success;
1319 } 1319 }
1320 1320
1321 /* Do it the slow way */ 1321 /* Do it the slow way */
1322 kaddr = kmap(page); 1322 kaddr = kmap(page);
1323 left = __copy_to_user(desc->arg.buf, kaddr + offset, size); 1323 left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1324 kunmap(page); 1324 kunmap(page);
1325 1325
1326 if (left) { 1326 if (left) {
1327 size -= left; 1327 size -= left;
1328 desc->error = -EFAULT; 1328 desc->error = -EFAULT;
1329 } 1329 }
1330 success: 1330 success:
1331 desc->count = count - size; 1331 desc->count = count - size;
1332 desc->written += size; 1332 desc->written += size;
1333 desc->arg.buf += size; 1333 desc->arg.buf += size;
1334 return size; 1334 return size;
1335 } 1335 }
1336 1336
1337 /* 1337 /*
1338 * Performs necessary checks before doing a read or write 1338 * Performs necessary checks before doing a read or write
1339 * @iov: io vector request 1339 * @iov: io vector request
1340 * @nr_segs: number of segments in the iovec 1340 * @nr_segs: number of segments in the iovec
1341 * @count: returns the total number of bytes that can be transferred 1341 * @count: returns the total number of bytes that can be transferred
1342 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE 1342 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1343 * 1343 *
1344 * Adjusts the number of segments and the number of bytes to transfer (nr_segs 1344 * Adjusts the number of segments and the number of bytes to transfer (nr_segs
1345 * must be properly initialized first). Returns an appropriate error code that 1345 * must be properly initialized first). Returns an appropriate error code that
1346 * the caller should return, or zero if the access is allowed. 1346 * the caller should return, or zero if the access is allowed.
1347 */ 1347 */
1348 int generic_segment_checks(const struct iovec *iov, 1348 int generic_segment_checks(const struct iovec *iov,
1349 unsigned long *nr_segs, size_t *count, int access_flags) 1349 unsigned long *nr_segs, size_t *count, int access_flags)
1350 { 1350 {
1351 unsigned long seg; 1351 unsigned long seg;
1352 size_t cnt = 0; 1352 size_t cnt = 0;
1353 for (seg = 0; seg < *nr_segs; seg++) { 1353 for (seg = 0; seg < *nr_segs; seg++) {
1354 const struct iovec *iv = &iov[seg]; 1354 const struct iovec *iv = &iov[seg];
1355 1355
1356 /* 1356 /*
1357 * If any segment has a negative length, or the cumulative 1357 * If any segment has a negative length, or the cumulative
1358 * length ever wraps negative then return -EINVAL. 1358 * length ever wraps negative then return -EINVAL.
1359 */ 1359 */
1360 cnt += iv->iov_len; 1360 cnt += iv->iov_len;
1361 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) 1361 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1362 return -EINVAL; 1362 return -EINVAL;
1363 if (access_ok(access_flags, iv->iov_base, iv->iov_len)) 1363 if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1364 continue; 1364 continue;
1365 if (seg == 0) 1365 if (seg == 0)
1366 return -EFAULT; 1366 return -EFAULT;
1367 *nr_segs = seg; 1367 *nr_segs = seg;
1368 cnt -= iv->iov_len; /* This segment is no good */ 1368 cnt -= iv->iov_len; /* This segment is no good */
1369 break; 1369 break;
1370 } 1370 }
1371 *count = cnt; 1371 *count = cnt;
1372 return 0; 1372 return 0;
1373 } 1373 }
1374 EXPORT_SYMBOL(generic_segment_checks); 1374 EXPORT_SYMBOL(generic_segment_checks);
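/*
 * [Editorial example, not part of this file] A minimal sketch of how a
 * filesystem's ->aio_read() typically uses generic_segment_checks() to
 * validate the user iovec before doing any work.  The "examplefs_" name is
 * hypothetical; the helper simply falls through to generic_file_aio_read().
 */
static ssize_t examplefs_file_aio_read(struct kiocb *iocb,
				       const struct iovec *iov,
				       unsigned long nr_segs, loff_t pos)
{
	size_t count;
	ssize_t ret;

	/* The read destination will be written to, hence VERIFY_WRITE. */
	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (ret)
		return ret;

	/* Filesystem-specific locking or ordering would go here. */
	return generic_file_aio_read(iocb, iov, nr_segs, pos);
}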
1375 1375
1376 /** 1376 /**
1377 * generic_file_aio_read - generic filesystem read routine 1377 * generic_file_aio_read - generic filesystem read routine
1378 * @iocb: kernel I/O control block 1378 * @iocb: kernel I/O control block
1379 * @iov: io vector request 1379 * @iov: io vector request
1380 * @nr_segs: number of segments in the iovec 1380 * @nr_segs: number of segments in the iovec
1381 * @pos: current file position 1381 * @pos: current file position
1382 * 1382 *
1383 * This is the "read()" routine for all filesystems 1383 * This is the "read()" routine for all filesystems
1384 * that can use the page cache directly. 1384 * that can use the page cache directly.
1385 */ 1385 */
1386 ssize_t 1386 ssize_t
1387 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, 1387 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1388 unsigned long nr_segs, loff_t pos) 1388 unsigned long nr_segs, loff_t pos)
1389 { 1389 {
1390 struct file *filp = iocb->ki_filp; 1390 struct file *filp = iocb->ki_filp;
1391 ssize_t retval; 1391 ssize_t retval;
1392 unsigned long seg = 0; 1392 unsigned long seg = 0;
1393 size_t count; 1393 size_t count;
1394 loff_t *ppos = &iocb->ki_pos; 1394 loff_t *ppos = &iocb->ki_pos;
1395 struct blk_plug plug; 1395 struct blk_plug plug;
1396 1396
1397 count = 0; 1397 count = 0;
1398 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1398 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1399 if (retval) 1399 if (retval)
1400 return retval; 1400 return retval;
1401 1401
1402 blk_start_plug(&plug); 1402 blk_start_plug(&plug);
1403 1403
1404 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1404 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1405 if (filp->f_flags & O_DIRECT) { 1405 if (filp->f_flags & O_DIRECT) {
1406 loff_t size; 1406 loff_t size;
1407 struct address_space *mapping; 1407 struct address_space *mapping;
1408 struct inode *inode; 1408 struct inode *inode;
1409 1409
1410 mapping = filp->f_mapping; 1410 mapping = filp->f_mapping;
1411 inode = mapping->host; 1411 inode = mapping->host;
1412 if (!count) 1412 if (!count)
1413 goto out; /* skip atime */ 1413 goto out; /* skip atime */
1414 size = i_size_read(inode); 1414 size = i_size_read(inode);
1415 if (pos < size) { 1415 if (pos < size) {
1416 retval = filemap_write_and_wait_range(mapping, pos, 1416 retval = filemap_write_and_wait_range(mapping, pos,
1417 pos + iov_length(iov, nr_segs) - 1); 1417 pos + iov_length(iov, nr_segs) - 1);
1418 if (!retval) { 1418 if (!retval) {
1419 retval = mapping->a_ops->direct_IO(READ, iocb, 1419 retval = mapping->a_ops->direct_IO(READ, iocb,
1420 iov, pos, nr_segs); 1420 iov, pos, nr_segs);
1421 } 1421 }
1422 if (retval > 0) { 1422 if (retval > 0) {
1423 *ppos = pos + retval; 1423 *ppos = pos + retval;
1424 count -= retval; 1424 count -= retval;
1425 } 1425 }
1426 1426
1427 /* 1427 /*
1428 * Btrfs can have a short DIO read if we encounter 1428 * Btrfs can have a short DIO read if we encounter
1429 * compressed extents, so if there was an error, or if 1429 * compressed extents, so if there was an error, or if
1430 * we've already read everything we wanted to, or if 1430 * we've already read everything we wanted to, or if
1431 * there was a short read because we hit EOF, go ahead 1431 * there was a short read because we hit EOF, go ahead
1432 * and return. Otherwise fallthrough to buffered io for 1432 * and return. Otherwise fallthrough to buffered io for
1433 * the rest of the read. 1433 * the rest of the read.
1434 */ 1434 */
1435 if (retval < 0 || !count || *ppos >= size) { 1435 if (retval < 0 || !count || *ppos >= size) {
1436 file_accessed(filp); 1436 file_accessed(filp);
1437 goto out; 1437 goto out;
1438 } 1438 }
1439 } 1439 }
1440 } 1440 }
1441 1441
1442 count = retval; 1442 count = retval;
1443 for (seg = 0; seg < nr_segs; seg++) { 1443 for (seg = 0; seg < nr_segs; seg++) {
1444 read_descriptor_t desc; 1444 read_descriptor_t desc;
1445 loff_t offset = 0; 1445 loff_t offset = 0;
1446 1446
1447 /* 1447 /*
1448 * If we did a short DIO read we need to skip the section of the 1448 * If we did a short DIO read we need to skip the section of the
1449 * iov that we've already read data into. 1449 * iov that we've already read data into.
1450 */ 1450 */
1451 if (count) { 1451 if (count) {
1452 if (count > iov[seg].iov_len) { 1452 if (count > iov[seg].iov_len) {
1453 count -= iov[seg].iov_len; 1453 count -= iov[seg].iov_len;
1454 continue; 1454 continue;
1455 } 1455 }
1456 offset = count; 1456 offset = count;
1457 count = 0; 1457 count = 0;
1458 } 1458 }
1459 1459
1460 desc.written = 0; 1460 desc.written = 0;
1461 desc.arg.buf = iov[seg].iov_base + offset; 1461 desc.arg.buf = iov[seg].iov_base + offset;
1462 desc.count = iov[seg].iov_len - offset; 1462 desc.count = iov[seg].iov_len - offset;
1463 if (desc.count == 0) 1463 if (desc.count == 0)
1464 continue; 1464 continue;
1465 desc.error = 0; 1465 desc.error = 0;
1466 do_generic_file_read(filp, ppos, &desc, file_read_actor); 1466 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1467 retval += desc.written; 1467 retval += desc.written;
1468 if (desc.error) { 1468 if (desc.error) {
1469 retval = retval ?: desc.error; 1469 retval = retval ?: desc.error;
1470 break; 1470 break;
1471 } 1471 }
1472 if (desc.count > 0) 1472 if (desc.count > 0)
1473 break; 1473 break;
1474 } 1474 }
1475 out: 1475 out:
1476 blk_finish_plug(&plug); 1476 blk_finish_plug(&plug);
1477 return retval; 1477 return retval;
1478 } 1478 }
1479 EXPORT_SYMBOL(generic_file_aio_read); 1479 EXPORT_SYMBOL(generic_file_aio_read);
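/*
 * [Editorial example, not part of this file] A sketch of how
 * generic_file_aio_read() is normally wired up: a filesystem that can use
 * the page cache directly just points its file_operations at the generic
 * helpers.  "examplefs_" is hypothetical; the referenced helpers are
 * standard VFS exports of this era.
 */
static const struct file_operations examplefs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,		/* synchronous wrapper */
	.aio_read	= generic_file_aio_read,
	.write		= do_sync_write,
	.aio_write	= generic_file_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,
};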
1480 1480
1481 static ssize_t 1481 static ssize_t
1482 do_readahead(struct address_space *mapping, struct file *filp, 1482 do_readahead(struct address_space *mapping, struct file *filp,
1483 pgoff_t index, unsigned long nr) 1483 pgoff_t index, unsigned long nr)
1484 { 1484 {
1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1486 return -EINVAL; 1486 return -EINVAL;
1487 1487
1488 force_page_cache_readahead(mapping, filp, index, nr); 1488 force_page_cache_readahead(mapping, filp, index, nr);
1489 return 0; 1489 return 0;
1490 } 1490 }
1491 1491
1492 SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) 1492 SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1493 { 1493 {
1494 ssize_t ret; 1494 ssize_t ret;
1495 struct file *file; 1495 struct file *file;
1496 1496
1497 ret = -EBADF; 1497 ret = -EBADF;
1498 file = fget(fd); 1498 file = fget(fd);
1499 if (file) { 1499 if (file) {
1500 if (file->f_mode & FMODE_READ) { 1500 if (file->f_mode & FMODE_READ) {
1501 struct address_space *mapping = file->f_mapping; 1501 struct address_space *mapping = file->f_mapping;
1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT; 1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; 1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1504 unsigned long len = end - start + 1; 1504 unsigned long len = end - start + 1;
1505 ret = do_readahead(mapping, file, start, len); 1505 ret = do_readahead(mapping, file, start, len);
1506 } 1506 }
1507 fput(file); 1507 fput(file);
1508 } 1508 }
1509 return ret; 1509 return ret;
1510 } 1510 }
1511 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 1511 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1512 asmlinkage long SyS_readahead(long fd, loff_t offset, long count) 1512 asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1513 { 1513 {
1514 return SYSC_readahead((int) fd, offset, (size_t) count); 1514 return SYSC_readahead((int) fd, offset, (size_t) count);
1515 } 1515 }
1516 SYSCALL_ALIAS(sys_readahead, SyS_readahead); 1516 SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1517 #endif 1517 #endif
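/*
 * [Editorial example, not part of this file] The syscall defined above is
 * exposed to userspace by glibc as readahead(2) when _GNU_SOURCE is defined.
 * A hedged userspace sketch (the file name and size are illustrative):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int prefetch_file(const char *path)
 *	{
 *		int fd = open(path, O_RDONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		// Ask the kernel to pull the first 16MB into the page cache.
 *		readahead(fd, 0, 16 << 20);
 *		close(fd);
 *		return 0;
 *	}
 */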
1518 1518
1519 #ifdef CONFIG_MMU 1519 #ifdef CONFIG_MMU
1520 /** 1520 /**
1521 * page_cache_read - adds requested page to the page cache if not already there 1521 * page_cache_read - adds requested page to the page cache if not already there
1522 * @file: file to read 1522 * @file: file to read
1523 * @offset: page index 1523 * @offset: page index
1524 * 1524 *
1525 * This adds the requested page to the page cache if it isn't already there, 1525 * This adds the requested page to the page cache if it isn't already there,
1526 * and schedules an I/O to read in its contents from disk. 1526 * and schedules an I/O to read in its contents from disk.
1527 */ 1527 */
1528 static int page_cache_read(struct file *file, pgoff_t offset) 1528 static int page_cache_read(struct file *file, pgoff_t offset)
1529 { 1529 {
1530 struct address_space *mapping = file->f_mapping; 1530 struct address_space *mapping = file->f_mapping;
1531 struct page *page; 1531 struct page *page;
1532 int ret; 1532 int ret;
1533 1533
1534 do { 1534 do {
1535 page = page_cache_alloc_cold(mapping); 1535 page = page_cache_alloc_cold(mapping);
1536 if (!page) 1536 if (!page)
1537 return -ENOMEM; 1537 return -ENOMEM;
1538 1538
1539 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); 1539 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1540 if (ret == 0) 1540 if (ret == 0)
1541 ret = mapping->a_ops->readpage(file, page); 1541 ret = mapping->a_ops->readpage(file, page);
1542 else if (ret == -EEXIST) 1542 else if (ret == -EEXIST)
1543 ret = 0; /* losing race to add is OK */ 1543 ret = 0; /* losing race to add is OK */
1544 1544
1545 page_cache_release(page); 1545 page_cache_release(page);
1546 1546
1547 } while (ret == AOP_TRUNCATED_PAGE); 1547 } while (ret == AOP_TRUNCATED_PAGE);
1548 1548
1549 return ret; 1549 return ret;
1550 } 1550 }
1551 1551
1552 #define MMAP_LOTSAMISS (100) 1552 #define MMAP_LOTSAMISS (100)
1553 1553
1554 /* 1554 /*
1555 * Synchronous readahead happens when we don't even find 1555 * Synchronous readahead happens when we don't even find
1556 * a page in the page cache at all. 1556 * a page in the page cache at all.
1557 */ 1557 */
1558 static void do_sync_mmap_readahead(struct vm_area_struct *vma, 1558 static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1559 struct file_ra_state *ra, 1559 struct file_ra_state *ra,
1560 struct file *file, 1560 struct file *file,
1561 pgoff_t offset) 1561 pgoff_t offset)
1562 { 1562 {
1563 unsigned long ra_pages; 1563 unsigned long ra_pages;
1564 struct address_space *mapping = file->f_mapping; 1564 struct address_space *mapping = file->f_mapping;
1565 1565
1566 /* If we don't want any read-ahead, don't bother */ 1566 /* If we don't want any read-ahead, don't bother */
1567 if (VM_RandomReadHint(vma)) 1567 if (VM_RandomReadHint(vma))
1568 return; 1568 return;
1569 if (!ra->ra_pages) 1569 if (!ra->ra_pages)
1570 return; 1570 return;
1571 1571
1572 if (VM_SequentialReadHint(vma)) { 1572 if (VM_SequentialReadHint(vma)) {
1573 page_cache_sync_readahead(mapping, ra, file, offset, 1573 page_cache_sync_readahead(mapping, ra, file, offset,
1574 ra->ra_pages); 1574 ra->ra_pages);
1575 return; 1575 return;
1576 } 1576 }
1577 1577
1578 /* Avoid banging the cache line if not needed */ 1578 /* Avoid banging the cache line if not needed */
1579 if (ra->mmap_miss < MMAP_LOTSAMISS * 10) 1579 if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
1580 ra->mmap_miss++; 1580 ra->mmap_miss++;
1581 1581
1582 /* 1582 /*
1583 * Do we miss much more than hit in this file? If so, 1583 * Do we miss much more than hit in this file? If so,
1584 * stop bothering with read-ahead. It will only hurt. 1584 * stop bothering with read-ahead. It will only hurt.
1585 */ 1585 */
1586 if (ra->mmap_miss > MMAP_LOTSAMISS) 1586 if (ra->mmap_miss > MMAP_LOTSAMISS)
1587 return; 1587 return;
1588 1588
1589 /* 1589 /*
1590 * mmap read-around 1590 * mmap read-around
1591 */ 1591 */
1592 ra_pages = max_sane_readahead(ra->ra_pages); 1592 ra_pages = max_sane_readahead(ra->ra_pages);
1593 ra->start = max_t(long, 0, offset - ra_pages / 2); 1593 ra->start = max_t(long, 0, offset - ra_pages / 2);
1594 ra->size = ra_pages; 1594 ra->size = ra_pages;
1595 ra->async_size = ra_pages / 4; 1595 ra->async_size = ra_pages / 4;
1596 ra_submit(ra, mapping, file); 1596 ra_submit(ra, mapping, file);
1597 } 1597 }
1598 1598
1599 /* 1599 /*
1600 * Asynchronous readahead happens when we find the page with PG_readahead set, 1600 * Asynchronous readahead happens when we find the page with PG_readahead set,
1601 * so we may want to extend the readahead further. 1601 * so we may want to extend the readahead further.
1602 */ 1602 */
1603 static void do_async_mmap_readahead(struct vm_area_struct *vma, 1603 static void do_async_mmap_readahead(struct vm_area_struct *vma,
1604 struct file_ra_state *ra, 1604 struct file_ra_state *ra,
1605 struct file *file, 1605 struct file *file,
1606 struct page *page, 1606 struct page *page,
1607 pgoff_t offset) 1607 pgoff_t offset)
1608 { 1608 {
1609 struct address_space *mapping = file->f_mapping; 1609 struct address_space *mapping = file->f_mapping;
1610 1610
1611 /* If we don't want any read-ahead, don't bother */ 1611 /* If we don't want any read-ahead, don't bother */
1612 if (VM_RandomReadHint(vma)) 1612 if (VM_RandomReadHint(vma))
1613 return; 1613 return;
1614 if (ra->mmap_miss > 0) 1614 if (ra->mmap_miss > 0)
1615 ra->mmap_miss--; 1615 ra->mmap_miss--;
1616 if (PageReadahead(page)) 1616 if (PageReadahead(page))
1617 page_cache_async_readahead(mapping, ra, file, 1617 page_cache_async_readahead(mapping, ra, file,
1618 page, offset, ra->ra_pages); 1618 page, offset, ra->ra_pages);
1619 } 1619 }
1620 1620
1621 /** 1621 /**
1622 * filemap_fault - read in file data for page fault handling 1622 * filemap_fault - read in file data for page fault handling
1623 * @vma: vma in which the fault was taken 1623 * @vma: vma in which the fault was taken
1624 * @vmf: struct vm_fault containing details of the fault 1624 * @vmf: struct vm_fault containing details of the fault
1625 * 1625 *
1626 * filemap_fault() is invoked via the vma operations vector for a 1626 * filemap_fault() is invoked via the vma operations vector for a
1627 * mapped memory region to read in file data during a page fault. 1627 * mapped memory region to read in file data during a page fault.
1628 * 1628 *
1629 * The goto's are kind of ugly, but this streamlines the normal case of having 1629 * The goto's are kind of ugly, but this streamlines the normal case of having
1630 * it in the page cache, and handles the special cases reasonably without 1630 * it in the page cache, and handles the special cases reasonably without
1631 * having a lot of duplicated code. 1631 * having a lot of duplicated code.
1632 */ 1632 */
1633 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1633 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1634 { 1634 {
1635 int error; 1635 int error;
1636 struct file *file = vma->vm_file; 1636 struct file *file = vma->vm_file;
1637 struct address_space *mapping = file->f_mapping; 1637 struct address_space *mapping = file->f_mapping;
1638 struct file_ra_state *ra = &file->f_ra; 1638 struct file_ra_state *ra = &file->f_ra;
1639 struct inode *inode = mapping->host; 1639 struct inode *inode = mapping->host;
1640 pgoff_t offset = vmf->pgoff; 1640 pgoff_t offset = vmf->pgoff;
1641 struct page *page; 1641 struct page *page;
1642 pgoff_t size; 1642 pgoff_t size;
1643 int ret = 0; 1643 int ret = 0;
1644 1644
1645 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1645 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1646 if (offset >= size) 1646 if (offset >= size)
1647 return VM_FAULT_SIGBUS; 1647 return VM_FAULT_SIGBUS;
1648 1648
1649 /* 1649 /*
1650 * Do we have something in the page cache already? 1650 * Do we have something in the page cache already?
1651 */ 1651 */
1652 page = find_get_page(mapping, offset); 1652 page = find_get_page(mapping, offset);
1653 if (likely(page)) { 1653 if (likely(page)) {
1654 /* 1654 /*
1655 * We found the page, so try async readahead before 1655 * We found the page, so try async readahead before
1656 * waiting for the lock. 1656 * waiting for the lock.
1657 */ 1657 */
1658 do_async_mmap_readahead(vma, ra, file, page, offset); 1658 do_async_mmap_readahead(vma, ra, file, page, offset);
1659 } else { 1659 } else {
1660 /* No page in the page cache at all */ 1660 /* No page in the page cache at all */
1661 do_sync_mmap_readahead(vma, ra, file, offset); 1661 do_sync_mmap_readahead(vma, ra, file, offset);
1662 count_vm_event(PGMAJFAULT); 1662 count_vm_event(PGMAJFAULT);
1663 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1663 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1664 ret = VM_FAULT_MAJOR; 1664 ret = VM_FAULT_MAJOR;
1665 retry_find: 1665 retry_find:
1666 page = find_get_page(mapping, offset); 1666 page = find_get_page(mapping, offset);
1667 if (!page) 1667 if (!page)
1668 goto no_cached_page; 1668 goto no_cached_page;
1669 } 1669 }
1670 1670
1671 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { 1671 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
1672 page_cache_release(page); 1672 page_cache_release(page);
1673 return ret | VM_FAULT_RETRY; 1673 return ret | VM_FAULT_RETRY;
1674 } 1674 }
1675 1675
1676 /* Did it get truncated? */ 1676 /* Did it get truncated? */
1677 if (unlikely(page->mapping != mapping)) { 1677 if (unlikely(page->mapping != mapping)) {
1678 unlock_page(page); 1678 unlock_page(page);
1679 put_page(page); 1679 put_page(page);
1680 goto retry_find; 1680 goto retry_find;
1681 } 1681 }
1682 VM_BUG_ON(page->index != offset); 1682 VM_BUG_ON(page->index != offset);
1683 1683
1684 /* 1684 /*
1685 * We have a locked page in the page cache, now we need to check 1685 * We have a locked page in the page cache, now we need to check
1686 * that it's up-to-date. If not, it is going to be due to an error. 1686 * that it's up-to-date. If not, it is going to be due to an error.
1687 */ 1687 */
1688 if (unlikely(!PageUptodate(page))) 1688 if (unlikely(!PageUptodate(page)))
1689 goto page_not_uptodate; 1689 goto page_not_uptodate;
1690 1690
1691 /* 1691 /*
1692 * Found the page and have a reference on it. 1692 * Found the page and have a reference on it.
1693 * We must recheck i_size under page lock. 1693 * We must recheck i_size under page lock.
1694 */ 1694 */
1695 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1695 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1696 if (unlikely(offset >= size)) { 1696 if (unlikely(offset >= size)) {
1697 unlock_page(page); 1697 unlock_page(page);
1698 page_cache_release(page); 1698 page_cache_release(page);
1699 return VM_FAULT_SIGBUS; 1699 return VM_FAULT_SIGBUS;
1700 } 1700 }
1701 1701
1702 vmf->page = page; 1702 vmf->page = page;
1703 return ret | VM_FAULT_LOCKED; 1703 return ret | VM_FAULT_LOCKED;
1704 1704
1705 no_cached_page: 1705 no_cached_page:
1706 /* 1706 /*
1707 * We're only likely to ever get here if MADV_RANDOM is in 1707 * We're only likely to ever get here if MADV_RANDOM is in
1708 * effect. 1708 * effect.
1709 */ 1709 */
1710 error = page_cache_read(file, offset); 1710 error = page_cache_read(file, offset);
1711 1711
1712 /* 1712 /*
1713 * The page we want has now been added to the page cache. 1713 * The page we want has now been added to the page cache.
1714 * In the unlikely event that someone removed it in the 1714 * In the unlikely event that someone removed it in the
1715 * meantime, we'll just come back here and read it again. 1715 * meantime, we'll just come back here and read it again.
1716 */ 1716 */
1717 if (error >= 0) 1717 if (error >= 0)
1718 goto retry_find; 1718 goto retry_find;
1719 1719
1720 /* 1720 /*
1721 * An error return from page_cache_read can result if the 1721 * An error return from page_cache_read can result if the
1722 * system is low on memory, or a problem occurs while trying 1722 * system is low on memory, or a problem occurs while trying
1723 * to schedule I/O. 1723 * to schedule I/O.
1724 */ 1724 */
1725 if (error == -ENOMEM) 1725 if (error == -ENOMEM)
1726 return VM_FAULT_OOM; 1726 return VM_FAULT_OOM;
1727 return VM_FAULT_SIGBUS; 1727 return VM_FAULT_SIGBUS;
1728 1728
1729 page_not_uptodate: 1729 page_not_uptodate:
1730 /* 1730 /*
1731 * Umm, take care of errors if the page isn't up-to-date. 1731 * Umm, take care of errors if the page isn't up-to-date.
1732 * Try to re-read it _once_. We do this synchronously, 1732 * Try to re-read it _once_. We do this synchronously,
1733 * because there really aren't any performance issues here 1733 * because there really aren't any performance issues here
1734 * and we need to check for errors. 1734 * and we need to check for errors.
1735 */ 1735 */
1736 ClearPageError(page); 1736 ClearPageError(page);
1737 error = mapping->a_ops->readpage(file, page); 1737 error = mapping->a_ops->readpage(file, page);
1738 if (!error) { 1738 if (!error) {
1739 wait_on_page_locked(page); 1739 wait_on_page_locked(page);
1740 if (!PageUptodate(page)) 1740 if (!PageUptodate(page))
1741 error = -EIO; 1741 error = -EIO;
1742 } 1742 }
1743 page_cache_release(page); 1743 page_cache_release(page);
1744 1744
1745 if (!error || error == AOP_TRUNCATED_PAGE) 1745 if (!error || error == AOP_TRUNCATED_PAGE)
1746 goto retry_find; 1746 goto retry_find;
1747 1747
1748 /* Things didn't work out. Return zero to tell the mm layer so. */ 1748 /* Things didn't work out. Return zero to tell the mm layer so. */
1749 shrink_readahead_size_eio(file, ra); 1749 shrink_readahead_size_eio(file, ra);
1750 return VM_FAULT_SIGBUS; 1750 return VM_FAULT_SIGBUS;
1751 } 1751 }
1752 EXPORT_SYMBOL(filemap_fault); 1752 EXPORT_SYMBOL(filemap_fault);
1753 1753
1754 const struct vm_operations_struct generic_file_vm_ops = { 1754 const struct vm_operations_struct generic_file_vm_ops = {
1755 .fault = filemap_fault, 1755 .fault = filemap_fault,
1756 }; 1756 };
1757 1757
1758 /* This is used for a general mmap of a disk file */ 1758 /* This is used for a general mmap of a disk file */
1759 1759
1760 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 1760 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1761 { 1761 {
1762 struct address_space *mapping = file->f_mapping; 1762 struct address_space *mapping = file->f_mapping;
1763 1763
1764 if (!mapping->a_ops->readpage) 1764 if (!mapping->a_ops->readpage)
1765 return -ENOEXEC; 1765 return -ENOEXEC;
1766 file_accessed(file); 1766 file_accessed(file);
1767 vma->vm_ops = &generic_file_vm_ops; 1767 vma->vm_ops = &generic_file_vm_ops;
1768 vma->vm_flags |= VM_CAN_NONLINEAR; 1768 vma->vm_flags |= VM_CAN_NONLINEAR;
1769 return 0; 1769 return 0;
1770 } 1770 }
1771 1771
1772 /* 1772 /*
1773 * This is for filesystems which do not implement ->writepage. 1773 * This is for filesystems which do not implement ->writepage.
1774 */ 1774 */
1775 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 1775 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1776 { 1776 {
1777 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 1777 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1778 return -EINVAL; 1778 return -EINVAL;
1779 return generic_file_mmap(file, vma); 1779 return generic_file_mmap(file, vma);
1780 } 1780 }
1781 #else 1781 #else
1782 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 1782 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1783 { 1783 {
1784 return -ENOSYS; 1784 return -ENOSYS;
1785 } 1785 }
1786 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) 1786 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1787 { 1787 {
1788 return -ENOSYS; 1788 return -ENOSYS;
1789 } 1789 }
1790 #endif /* CONFIG_MMU */ 1790 #endif /* CONFIG_MMU */
1791 1791
1792 EXPORT_SYMBOL(generic_file_mmap); 1792 EXPORT_SYMBOL(generic_file_mmap);
1793 EXPORT_SYMBOL(generic_file_readonly_mmap); 1793 EXPORT_SYMBOL(generic_file_readonly_mmap);
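/*
 * [Editorial example, not part of this file] A sketch of the common pattern
 * for filesystems that want filemap_fault() for read faults but need their
 * own write-fault handling: install a private vm_operations_struct from
 * ->mmap.  "examplefs_" names are hypothetical; examplefs_page_mkwrite() is
 * assumed to be defined elsewhere by the filesystem.
 */
static int examplefs_page_mkwrite(struct vm_area_struct *vma,
				  struct vm_fault *vmf);	/* hypothetical */

static const struct vm_operations_struct examplefs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= examplefs_page_mkwrite,
};

static int examplefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!file->f_mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &examplefs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}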
1794 1794
1795 static struct page *__read_cache_page(struct address_space *mapping, 1795 static struct page *__read_cache_page(struct address_space *mapping,
1796 pgoff_t index, 1796 pgoff_t index,
1797 int (*filler)(void *, struct page *), 1797 int (*filler)(void *, struct page *),
1798 void *data, 1798 void *data,
1799 gfp_t gfp) 1799 gfp_t gfp)
1800 { 1800 {
1801 struct page *page; 1801 struct page *page;
1802 int err; 1802 int err;
1803 repeat: 1803 repeat:
1804 page = find_get_page(mapping, index); 1804 page = find_get_page(mapping, index);
1805 if (!page) { 1805 if (!page) {
1806 page = __page_cache_alloc(gfp | __GFP_COLD); 1806 page = __page_cache_alloc(gfp | __GFP_COLD);
1807 if (!page) 1807 if (!page)
1808 return ERR_PTR(-ENOMEM); 1808 return ERR_PTR(-ENOMEM);
1809 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1809 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1810 if (unlikely(err)) { 1810 if (unlikely(err)) {
1811 page_cache_release(page); 1811 page_cache_release(page);
1812 if (err == -EEXIST) 1812 if (err == -EEXIST)
1813 goto repeat; 1813 goto repeat;
1814 /* Presumably ENOMEM for radix tree node */ 1814 /* Presumably ENOMEM for radix tree node */
1815 return ERR_PTR(err); 1815 return ERR_PTR(err);
1816 } 1816 }
1817 err = filler(data, page); 1817 err = filler(data, page);
1818 if (err < 0) { 1818 if (err < 0) {
1819 page_cache_release(page); 1819 page_cache_release(page);
1820 page = ERR_PTR(err); 1820 page = ERR_PTR(err);
1821 } 1821 }
1822 } 1822 }
1823 return page; 1823 return page;
1824 } 1824 }
1825 1825
1826 static struct page *do_read_cache_page(struct address_space *mapping, 1826 static struct page *do_read_cache_page(struct address_space *mapping,
1827 pgoff_t index, 1827 pgoff_t index,
1828 int (*filler)(void *, struct page *), 1828 int (*filler)(void *, struct page *),
1829 void *data, 1829 void *data,
1830 gfp_t gfp) 1830 gfp_t gfp)
1831 1831
1832 { 1832 {
1833 struct page *page; 1833 struct page *page;
1834 int err; 1834 int err;
1835 1835
1836 retry: 1836 retry:
1837 page = __read_cache_page(mapping, index, filler, data, gfp); 1837 page = __read_cache_page(mapping, index, filler, data, gfp);
1838 if (IS_ERR(page)) 1838 if (IS_ERR(page))
1839 return page; 1839 return page;
1840 if (PageUptodate(page)) 1840 if (PageUptodate(page))
1841 goto out; 1841 goto out;
1842 1842
1843 lock_page(page); 1843 lock_page(page);
1844 if (!page->mapping) { 1844 if (!page->mapping) {
1845 unlock_page(page); 1845 unlock_page(page);
1846 page_cache_release(page); 1846 page_cache_release(page);
1847 goto retry; 1847 goto retry;
1848 } 1848 }
1849 if (PageUptodate(page)) { 1849 if (PageUptodate(page)) {
1850 unlock_page(page); 1850 unlock_page(page);
1851 goto out; 1851 goto out;
1852 } 1852 }
1853 err = filler(data, page); 1853 err = filler(data, page);
1854 if (err < 0) { 1854 if (err < 0) {
1855 page_cache_release(page); 1855 page_cache_release(page);
1856 return ERR_PTR(err); 1856 return ERR_PTR(err);
1857 } 1857 }
1858 out: 1858 out:
1859 mark_page_accessed(page); 1859 mark_page_accessed(page);
1860 return page; 1860 return page;
1861 } 1861 }
1862 1862
1863 /** 1863 /**
1864 * read_cache_page_async - read into page cache, fill it if needed 1864 * read_cache_page_async - read into page cache, fill it if needed
1865 * @mapping: the page's address_space 1865 * @mapping: the page's address_space
1866 * @index: the page index 1866 * @index: the page index
1867 * @filler: function to perform the read 1867 * @filler: function to perform the read
1868 * @data: first arg to filler(data, page) function, often left as NULL 1868 * @data: first arg to filler(data, page) function, often left as NULL
1869 * 1869 *
1870 * Same as read_cache_page, but don't wait for page to become unlocked 1870 * Same as read_cache_page, but don't wait for page to become unlocked
1871 * after submitting it to the filler. 1871 * after submitting it to the filler.
1872 * 1872 *
1873 * Read into the page cache. If a page already exists, and PageUptodate() is 1873 * Read into the page cache. If a page already exists, and PageUptodate() is
1874 * not set, try to fill the page but don't wait for it to become unlocked. 1874 * not set, try to fill the page but don't wait for it to become unlocked.
1875 * 1875 *
1876 * If the page does not get brought uptodate, return -EIO. 1876 * If the page does not get brought uptodate, return -EIO.
1877 */ 1877 */
1878 struct page *read_cache_page_async(struct address_space *mapping, 1878 struct page *read_cache_page_async(struct address_space *mapping,
1879 pgoff_t index, 1879 pgoff_t index,
1880 int (*filler)(void *, struct page *), 1880 int (*filler)(void *, struct page *),
1881 void *data) 1881 void *data)
1882 { 1882 {
1883 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 1883 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1884 } 1884 }
1885 EXPORT_SYMBOL(read_cache_page_async); 1885 EXPORT_SYMBOL(read_cache_page_async);
1886 1886
1887 static struct page *wait_on_page_read(struct page *page) 1887 static struct page *wait_on_page_read(struct page *page)
1888 { 1888 {
1889 if (!IS_ERR(page)) { 1889 if (!IS_ERR(page)) {
1890 wait_on_page_locked(page); 1890 wait_on_page_locked(page);
1891 if (!PageUptodate(page)) { 1891 if (!PageUptodate(page)) {
1892 page_cache_release(page); 1892 page_cache_release(page);
1893 page = ERR_PTR(-EIO); 1893 page = ERR_PTR(-EIO);
1894 } 1894 }
1895 } 1895 }
1896 return page; 1896 return page;
1897 } 1897 }
1898 1898
1899 /** 1899 /**
1900 * read_cache_page_gfp - read into page cache, using specified page allocation flags. 1900 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
1901 * @mapping: the page's address_space 1901 * @mapping: the page's address_space
1902 * @index: the page index 1902 * @index: the page index
1903 * @gfp: the page allocator flags to use if allocating 1903 * @gfp: the page allocator flags to use if allocating
1904 * 1904 *
1905 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 1905 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1906 * any new page allocations done using the specified allocation flags. Note 1906 * any new page allocations done using the specified allocation flags. Note
1907 * that the radix tree operations will still use GFP_KERNEL, so you can't 1907 * that the radix tree operations will still use GFP_KERNEL, so you can't
1908 * expect to do this atomically or anything like that - but you can pass in 1908 * expect to do this atomically or anything like that - but you can pass in
1909 * other page requirements. 1909 * other page requirements.
1910 * 1910 *
1911 * If the page does not get brought uptodate, return -EIO. 1911 * If the page does not get brought uptodate, return -EIO.
1912 */ 1912 */
1913 struct page *read_cache_page_gfp(struct address_space *mapping, 1913 struct page *read_cache_page_gfp(struct address_space *mapping,
1914 pgoff_t index, 1914 pgoff_t index,
1915 gfp_t gfp) 1915 gfp_t gfp)
1916 { 1916 {
1917 filler_t *filler = (filler_t *)mapping->a_ops->readpage; 1917 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1918 1918
1919 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); 1919 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
1920 } 1920 }
1921 EXPORT_SYMBOL(read_cache_page_gfp); 1921 EXPORT_SYMBOL(read_cache_page_gfp);
1922 1922
1923 /** 1923 /**
1924 * read_cache_page - read into page cache, fill it if needed 1924 * read_cache_page - read into page cache, fill it if needed
1925 * @mapping: the page's address_space 1925 * @mapping: the page's address_space
1926 * @index: the page index 1926 * @index: the page index
1927 * @filler: function to perform the read 1927 * @filler: function to perform the read
1928 * @data: first arg to filler(data, page) function, often left as NULL 1928 * @data: first arg to filler(data, page) function, often left as NULL
1929 * 1929 *
1930 * Read into the page cache. If a page already exists, and PageUptodate() is 1930 * Read into the page cache. If a page already exists, and PageUptodate() is
1931 * not set, try to fill the page then wait for it to become unlocked. 1931 * not set, try to fill the page then wait for it to become unlocked.
1932 * 1932 *
1933 * If the page does not get brought uptodate, return -EIO. 1933 * If the page does not get brought uptodate, return -EIO.
1934 */ 1934 */
1935 struct page *read_cache_page(struct address_space *mapping, 1935 struct page *read_cache_page(struct address_space *mapping,
1936 pgoff_t index, 1936 pgoff_t index,
1937 int (*filler)(void *, struct page *), 1937 int (*filler)(void *, struct page *),
1938 void *data) 1938 void *data)
1939 { 1939 {
1940 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); 1940 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1941 } 1941 }
1942 EXPORT_SYMBOL(read_cache_page); 1942 EXPORT_SYMBOL(read_cache_page);
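/*
 * [Editorial example, not part of this file] A sketch of typical
 * read_cache_page() usage via the read_mapping_page() wrapper (which passes
 * the mapping's ->readpage as the filler).  "examplefs_" is hypothetical;
 * the caller is expected to kunmap() and page_cache_release() the page.
 */
static struct page *examplefs_get_meta_page(struct address_space *mapping,
					    pgoff_t index)
{
	struct page *page = read_mapping_page(mapping, index, NULL);

	if (!IS_ERR(page))
		kmap(page);	/* page is uptodate and unlocked here */
	return page;		/* ERR_PTR(-EIO) or ERR_PTR(-ENOMEM) on failure */
}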
1943 1943
1944 /* 1944 /*
1945 * The logic we want is 1945 * The logic we want is
1946 * 1946 *
1947 * if suid or (sgid and xgrp) 1947 * if suid or (sgid and xgrp)
1948 * remove privs 1948 * remove privs
1949 */ 1949 */
1950 int should_remove_suid(struct dentry *dentry) 1950 int should_remove_suid(struct dentry *dentry)
1951 { 1951 {
1952 mode_t mode = dentry->d_inode->i_mode; 1952 mode_t mode = dentry->d_inode->i_mode;
1953 int kill = 0; 1953 int kill = 0;
1954 1954
1955 /* suid always must be killed */ 1955 /* suid always must be killed */
1956 if (unlikely(mode & S_ISUID)) 1956 if (unlikely(mode & S_ISUID))
1957 kill = ATTR_KILL_SUID; 1957 kill = ATTR_KILL_SUID;
1958 1958
1959 /* 1959 /*
1960 * sgid without any exec bits is just a mandatory locking mark; leave 1960 * sgid without any exec bits is just a mandatory locking mark; leave
1961 * it alone. If some exec bits are set, it's a real sgid; kill it. 1961 * it alone. If some exec bits are set, it's a real sgid; kill it.
1962 */ 1962 */
1963 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 1963 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1964 kill |= ATTR_KILL_SGID; 1964 kill |= ATTR_KILL_SGID;
1965 1965
1966 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) 1966 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1967 return kill; 1967 return kill;
1968 1968
1969 return 0; 1969 return 0;
1970 } 1970 }
1971 EXPORT_SYMBOL(should_remove_suid); 1971 EXPORT_SYMBOL(should_remove_suid);
1972 1972
1973 static int __remove_suid(struct dentry *dentry, int kill) 1973 static int __remove_suid(struct dentry *dentry, int kill)
1974 { 1974 {
1975 struct iattr newattrs; 1975 struct iattr newattrs;
1976 1976
1977 newattrs.ia_valid = ATTR_FORCE | kill; 1977 newattrs.ia_valid = ATTR_FORCE | kill;
1978 return notify_change(dentry, &newattrs); 1978 return notify_change(dentry, &newattrs);
1979 } 1979 }
1980 1980
1981 int file_remove_suid(struct file *file) 1981 int file_remove_suid(struct file *file)
1982 { 1982 {
1983 struct dentry *dentry = file->f_path.dentry; 1983 struct dentry *dentry = file->f_path.dentry;
1984 struct inode *inode = dentry->d_inode; 1984 struct inode *inode = dentry->d_inode;
1985 int killsuid; 1985 int killsuid;
1986 int killpriv; 1986 int killpriv;
1987 int error = 0; 1987 int error = 0;
1988 1988
1989 /* Fast path for nothing security related */ 1989 /* Fast path for nothing security related */
1990 if (IS_NOSEC(inode)) 1990 if (IS_NOSEC(inode))
1991 return 0; 1991 return 0;
1992 1992
1993 killsuid = should_remove_suid(dentry); 1993 killsuid = should_remove_suid(dentry);
1994 killpriv = security_inode_need_killpriv(dentry); 1994 killpriv = security_inode_need_killpriv(dentry);
1995 1995
1996 if (killpriv < 0) 1996 if (killpriv < 0)
1997 return killpriv; 1997 return killpriv;
1998 if (killpriv) 1998 if (killpriv)
1999 error = security_inode_killpriv(dentry); 1999 error = security_inode_killpriv(dentry);
2000 if (!error && killsuid) 2000 if (!error && killsuid)
2001 error = __remove_suid(dentry, killsuid); 2001 error = __remove_suid(dentry, killsuid);
2002 if (!error && (inode->i_sb->s_flags & MS_NOSEC)) 2002 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2003 inode->i_flags |= S_NOSEC; 2003 inode->i_flags |= S_NOSEC;
2004 2004
2005 return error; 2005 return error;
2006 } 2006 }
2007 EXPORT_SYMBOL(file_remove_suid); 2007 EXPORT_SYMBOL(file_remove_suid);
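/*
 * [Editorial example, not part of this file] A sketch of where
 * file_remove_suid() sits in a write path: it is called with i_mutex held,
 * before any data is copied, so a write by an unprivileged user drops the
 * setuid/setgid bits.  "examplefs_" is hypothetical and the actual data copy
 * is elided.
 */
static ssize_t examplefs_write_prologue(struct file *file)
{
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	mutex_lock(&inode->i_mutex);
	ret = file_remove_suid(file);
	if (!ret)
		file_update_time(file);
	/* ... the actual copy into the page cache would follow here ... */
	mutex_unlock(&inode->i_mutex);
	return ret;
}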
2008 2008
2009 static size_t __iovec_copy_from_user_inatomic(char *vaddr, 2009 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
2010 const struct iovec *iov, size_t base, size_t bytes) 2010 const struct iovec *iov, size_t base, size_t bytes)
2011 { 2011 {
2012 size_t copied = 0, left = 0; 2012 size_t copied = 0, left = 0;
2013 2013
2014 while (bytes) { 2014 while (bytes) {
2015 char __user *buf = iov->iov_base + base; 2015 char __user *buf = iov->iov_base + base;
2016 int copy = min(bytes, iov->iov_len - base); 2016 int copy = min(bytes, iov->iov_len - base);
2017 2017
2018 base = 0; 2018 base = 0;
2019 left = __copy_from_user_inatomic(vaddr, buf, copy); 2019 left = __copy_from_user_inatomic(vaddr, buf, copy);
2020 copied += copy; 2020 copied += copy;
2021 bytes -= copy; 2021 bytes -= copy;
2022 vaddr += copy; 2022 vaddr += copy;
2023 iov++; 2023 iov++;
2024 2024
2025 if (unlikely(left)) 2025 if (unlikely(left))
2026 break; 2026 break;
2027 } 2027 }
2028 return copied - left; 2028 return copied - left;
2029 } 2029 }
2030 2030
2031 /* 2031 /*
2032 * Copy as much as we can into the page and return the number of bytes which 2032 * Copy as much as we can into the page and return the number of bytes which
2033 * were successfully copied. If a fault is encountered then return the number of 2033 * were successfully copied. If a fault is encountered then return the number of
2034 * bytes which were copied before the fault. 2034 * bytes which were copied before the fault.
2035 */ 2035 */
2036 size_t iov_iter_copy_from_user_atomic(struct page *page, 2036 size_t iov_iter_copy_from_user_atomic(struct page *page,
2037 struct iov_iter *i, unsigned long offset, size_t bytes) 2037 struct iov_iter *i, unsigned long offset, size_t bytes)
2038 { 2038 {
2039 char *kaddr; 2039 char *kaddr;
2040 size_t copied; 2040 size_t copied;
2041 2041
2042 BUG_ON(!in_atomic()); 2042 BUG_ON(!in_atomic());
2043 kaddr = kmap_atomic(page, KM_USER0); 2043 kaddr = kmap_atomic(page, KM_USER0);
2044 if (likely(i->nr_segs == 1)) { 2044 if (likely(i->nr_segs == 1)) {
2045 int left; 2045 int left;
2046 char __user *buf = i->iov->iov_base + i->iov_offset; 2046 char __user *buf = i->iov->iov_base + i->iov_offset;
2047 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); 2047 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
2048 copied = bytes - left; 2048 copied = bytes - left;
2049 } else { 2049 } else {
2050 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 2050 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
2051 i->iov, i->iov_offset, bytes); 2051 i->iov, i->iov_offset, bytes);
2052 } 2052 }
2053 kunmap_atomic(kaddr, KM_USER0); 2053 kunmap_atomic(kaddr, KM_USER0);
2054 2054
2055 return copied; 2055 return copied;
2056 } 2056 }
2057 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); 2057 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
2058 2058
2059 /* 2059 /*
2060 * This has the same sideeffects and return value as 2060 * This has the same sideeffects and return value as
2061 * iov_iter_copy_from_user_atomic(). 2061 * iov_iter_copy_from_user_atomic().
2062 * The difference is that it attempts to resolve faults. 2062 * The difference is that it attempts to resolve faults.
2063 * Page must not be locked. 2063 * Page must not be locked.
2064 */ 2064 */
2065 size_t iov_iter_copy_from_user(struct page *page, 2065 size_t iov_iter_copy_from_user(struct page *page,
2066 struct iov_iter *i, unsigned long offset, size_t bytes) 2066 struct iov_iter *i, unsigned long offset, size_t bytes)
2067 { 2067 {
2068 char *kaddr; 2068 char *kaddr;
2069 size_t copied; 2069 size_t copied;
2070 2070
2071 kaddr = kmap(page); 2071 kaddr = kmap(page);
2072 if (likely(i->nr_segs == 1)) { 2072 if (likely(i->nr_segs == 1)) {
2073 int left; 2073 int left;
2074 char __user *buf = i->iov->iov_base + i->iov_offset; 2074 char __user *buf = i->iov->iov_base + i->iov_offset;
2075 left = __copy_from_user(kaddr + offset, buf, bytes); 2075 left = __copy_from_user(kaddr + offset, buf, bytes);
2076 copied = bytes - left; 2076 copied = bytes - left;
2077 } else { 2077 } else {
2078 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 2078 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
2079 i->iov, i->iov_offset, bytes); 2079 i->iov, i->iov_offset, bytes);
2080 } 2080 }
2081 kunmap(page); 2081 kunmap(page);
2082 return copied; 2082 return copied;
2083 } 2083 }
2084 EXPORT_SYMBOL(iov_iter_copy_from_user); 2084 EXPORT_SYMBOL(iov_iter_copy_from_user);
2085 2085
2086 void iov_iter_advance(struct iov_iter *i, size_t bytes) 2086 void iov_iter_advance(struct iov_iter *i, size_t bytes)
2087 { 2087 {
2088 BUG_ON(i->count < bytes); 2088 BUG_ON(i->count < bytes);
2089 2089
2090 if (likely(i->nr_segs == 1)) { 2090 if (likely(i->nr_segs == 1)) {
2091 i->iov_offset += bytes; 2091 i->iov_offset += bytes;
2092 i->count -= bytes; 2092 i->count -= bytes;
2093 } else { 2093 } else {
2094 const struct iovec *iov = i->iov; 2094 const struct iovec *iov = i->iov;
2095 size_t base = i->iov_offset; 2095 size_t base = i->iov_offset;
2096 2096
2097 /* 2097 /*
2098 * The !iov->iov_len check ensures we skip over unlikely 2098 * The !iov->iov_len check ensures we skip over unlikely
2099 * zero-length segments (without overrunning the iovec). 2099 * zero-length segments (without overrunning the iovec).
2100 */ 2100 */
2101 while (bytes || unlikely(i->count && !iov->iov_len)) { 2101 while (bytes || unlikely(i->count && !iov->iov_len)) {
2102 int copy; 2102 int copy;
2103 2103
2104 copy = min(bytes, iov->iov_len - base); 2104 copy = min(bytes, iov->iov_len - base);
2105 BUG_ON(!i->count || i->count < copy); 2105 BUG_ON(!i->count || i->count < copy);
2106 i->count -= copy; 2106 i->count -= copy;
2107 bytes -= copy; 2107 bytes -= copy;
2108 base += copy; 2108 base += copy;
2109 if (iov->iov_len == base) { 2109 if (iov->iov_len == base) {
2110 iov++; 2110 iov++;
2111 base = 0; 2111 base = 0;
2112 } 2112 }
2113 } 2113 }
2114 i->iov = iov; 2114 i->iov = iov;
2115 i->iov_offset = base; 2115 i->iov_offset = base;
2116 } 2116 }
2117 } 2117 }
2118 EXPORT_SYMBOL(iov_iter_advance); 2118 EXPORT_SYMBOL(iov_iter_advance);
2119 2119
2120 /* 2120 /*
2121 * Fault in the first iovec of the given iov_iter, to a maximum length 2121 * Fault in the first iovec of the given iov_iter, to a maximum length
2122 * of bytes. Returns 0 on success, or non-zero if the memory could not be 2122 * of bytes. Returns 0 on success, or non-zero if the memory could not be
2123 * accessed (i.e. because it is an invalid address). 2123 * accessed (i.e. because it is an invalid address).
2124 * 2124 *
2125 * writev-intensive code may want this to prefault several iovecs -- that 2125 * writev-intensive code may want this to prefault several iovecs -- that
2126 * would be possible (callers must not rely on the fact that _only_ the 2126 * would be possible (callers must not rely on the fact that _only_ the
2127 * first iovec will be faulted with the current implementation). 2127 * first iovec will be faulted with the current implementation).
2128 */ 2128 */
2129 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) 2129 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
2130 { 2130 {
2131 char __user *buf = i->iov->iov_base + i->iov_offset; 2131 char __user *buf = i->iov->iov_base + i->iov_offset;
2132 bytes = min(bytes, i->iov->iov_len - i->iov_offset); 2132 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
2133 return fault_in_pages_readable(buf, bytes); 2133 return fault_in_pages_readable(buf, bytes);
2134 } 2134 }
2135 EXPORT_SYMBOL(iov_iter_fault_in_readable); 2135 EXPORT_SYMBOL(iov_iter_fault_in_readable);
2136 2136
2137 /* 2137 /*
2138 * Return the count of just the current iov_iter segment. 2138 * Return the count of just the current iov_iter segment.
2139 */ 2139 */
2140 size_t iov_iter_single_seg_count(struct iov_iter *i) 2140 size_t iov_iter_single_seg_count(struct iov_iter *i)
2141 { 2141 {
2142 const struct iovec *iov = i->iov; 2142 const struct iovec *iov = i->iov;
2143 if (i->nr_segs == 1) 2143 if (i->nr_segs == 1)
2144 return i->count; 2144 return i->count;
2145 else 2145 else
2146 return min(i->count, iov->iov_len - i->iov_offset); 2146 return min(i->count, iov->iov_len - i->iov_offset);
2147 } 2147 }
2148 EXPORT_SYMBOL(iov_iter_single_seg_count); 2148 EXPORT_SYMBOL(iov_iter_single_seg_count);
2149 2149
2150 /* 2150 /*
2151 * Performs necessary checks before doing a write 2151 * Performs necessary checks before doing a write
2152 * 2152 *
2153 * Can adjust writing position or amount of bytes to write. 2153 * Can adjust writing position or amount of bytes to write.
2154 * Returns an appropriate error code that the caller should return, or 2154 * Returns an appropriate error code that the caller should return, or
2155 * zero if the write should be allowed. 2155 * zero if the write should be allowed.
2156 */ 2156 */
2157 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) 2157 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
2158 { 2158 {
2159 struct inode *inode = file->f_mapping->host; 2159 struct inode *inode = file->f_mapping->host;
2160 unsigned long limit = rlimit(RLIMIT_FSIZE); 2160 unsigned long limit = rlimit(RLIMIT_FSIZE);
2161 2161
2162 if (unlikely(*pos < 0)) 2162 if (unlikely(*pos < 0))
2163 return -EINVAL; 2163 return -EINVAL;
2164 2164
2165 if (!isblk) { 2165 if (!isblk) {
2166 /* FIXME: this is for backwards compatibility with 2.4 */ 2166 /* FIXME: this is for backwards compatibility with 2.4 */
2167 if (file->f_flags & O_APPEND) 2167 if (file->f_flags & O_APPEND)
2168 *pos = i_size_read(inode); 2168 *pos = i_size_read(inode);
2169 2169
2170 if (limit != RLIM_INFINITY) { 2170 if (limit != RLIM_INFINITY) {
2171 if (*pos >= limit) { 2171 if (*pos >= limit) {
2172 send_sig(SIGXFSZ, current, 0); 2172 send_sig(SIGXFSZ, current, 0);
2173 return -EFBIG; 2173 return -EFBIG;
2174 } 2174 }
2175 if (*count > limit - (typeof(limit))*pos) { 2175 if (*count > limit - (typeof(limit))*pos) {
2176 *count = limit - (typeof(limit))*pos; 2176 *count = limit - (typeof(limit))*pos;
2177 } 2177 }
2178 } 2178 }
2179 } 2179 }
2180 2180
2181 /* 2181 /*
2182 * LFS rule 2182 * LFS rule
2183 */ 2183 */
2184 if (unlikely(*pos + *count > MAX_NON_LFS && 2184 if (unlikely(*pos + *count > MAX_NON_LFS &&
2185 !(file->f_flags & O_LARGEFILE))) { 2185 !(file->f_flags & O_LARGEFILE))) {
2186 if (*pos >= MAX_NON_LFS) { 2186 if (*pos >= MAX_NON_LFS) {
2187 return -EFBIG; 2187 return -EFBIG;
2188 } 2188 }
2189 if (*count > MAX_NON_LFS - (unsigned long)*pos) { 2189 if (*count > MAX_NON_LFS - (unsigned long)*pos) {
2190 *count = MAX_NON_LFS - (unsigned long)*pos; 2190 *count = MAX_NON_LFS - (unsigned long)*pos;
2191 } 2191 }
2192 } 2192 }
2193 2193
2194 /* 2194 /*
2195 * Are we about to exceed the fs block limit ? 2195 * Are we about to exceed the fs block limit ?
2196 * 2196 *
2197 * If we have written data it becomes a short write. If we have 2197 * If we have written data it becomes a short write. If we have
2198 * exceeded the limit without writing data, we send a signal and return -EFBIG. 2198 * exceeded the limit without writing data, we send a signal and return -EFBIG.
2199 * Linus' frestrict idea will clean these up nicely.. 2199 * Linus' frestrict idea will clean these up nicely..
2200 */ 2200 */
2201 if (likely(!isblk)) { 2201 if (likely(!isblk)) {
2202 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { 2202 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
2203 if (*count || *pos > inode->i_sb->s_maxbytes) { 2203 if (*count || *pos > inode->i_sb->s_maxbytes) {
2204 return -EFBIG; 2204 return -EFBIG;
2205 } 2205 }
2206 /* zero-length writes at ->s_maxbytes are OK */ 2206 /* zero-length writes at ->s_maxbytes are OK */
2207 } 2207 }
2208 2208
2209 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) 2209 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2210 *count = inode->i_sb->s_maxbytes - *pos; 2210 *count = inode->i_sb->s_maxbytes - *pos;
2211 } else { 2211 } else {
2212 #ifdef CONFIG_BLOCK 2212 #ifdef CONFIG_BLOCK
2213 loff_t isize; 2213 loff_t isize;
2214 if (bdev_read_only(I_BDEV(inode))) 2214 if (bdev_read_only(I_BDEV(inode)))
2215 return -EPERM; 2215 return -EPERM;
2216 isize = i_size_read(inode); 2216 isize = i_size_read(inode);
2217 if (*pos >= isize) { 2217 if (*pos >= isize) {
2218 if (*count || *pos > isize) 2218 if (*count || *pos > isize)
2219 return -ENOSPC; 2219 return -ENOSPC;
2220 } 2220 }
2221 2221
2222 if (*pos + *count > isize) 2222 if (*pos + *count > isize)
2223 *count = isize - *pos; 2223 *count = isize - *pos;
2224 #else 2224 #else
2225 return -EPERM; 2225 return -EPERM;
2226 #endif 2226 #endif
2227 } 2227 }
2228 return 0; 2228 return 0;
2229 } 2229 }
2230 EXPORT_SYMBOL(generic_write_checks); 2230 EXPORT_SYMBOL(generic_write_checks);
2231 2231
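To make the clamping above concrete: with RLIMIT_FSIZE set to 1 MiB, a 64 KiB write to a regular file at pos = 1 MiB - 4 KiB is shortened to 4 KiB (count becomes limit - pos and later turns into a short write), while a write whose starting position is at or beyond 1 MiB raises SIGXFSZ and fails with -EFBIG. The same shape of check is then repeated against MAX_NON_LFS for files opened without O_LARGEFILE, and against the filesystem's s_maxbytes.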
2232 int pagecache_write_begin(struct file *file, struct address_space *mapping, 2232 int pagecache_write_begin(struct file *file, struct address_space *mapping,
2233 loff_t pos, unsigned len, unsigned flags, 2233 loff_t pos, unsigned len, unsigned flags,
2234 struct page **pagep, void **fsdata) 2234 struct page **pagep, void **fsdata)
2235 { 2235 {
2236 const struct address_space_operations *aops = mapping->a_ops; 2236 const struct address_space_operations *aops = mapping->a_ops;
2237 2237
2238 return aops->write_begin(file, mapping, pos, len, flags, 2238 return aops->write_begin(file, mapping, pos, len, flags,
2239 pagep, fsdata); 2239 pagep, fsdata);
2240 } 2240 }
2241 EXPORT_SYMBOL(pagecache_write_begin); 2241 EXPORT_SYMBOL(pagecache_write_begin);
2242 2242
2243 int pagecache_write_end(struct file *file, struct address_space *mapping, 2243 int pagecache_write_end(struct file *file, struct address_space *mapping,
2244 loff_t pos, unsigned len, unsigned copied, 2244 loff_t pos, unsigned len, unsigned copied,
2245 struct page *page, void *fsdata) 2245 struct page *page, void *fsdata)
2246 { 2246 {
2247 const struct address_space_operations *aops = mapping->a_ops; 2247 const struct address_space_operations *aops = mapping->a_ops;
2248 2248
2249 mark_page_accessed(page); 2249 mark_page_accessed(page);
2250 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2250 return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2251 } 2251 }
2252 EXPORT_SYMBOL(pagecache_write_end); 2252 EXPORT_SYMBOL(pagecache_write_end);
2253 2253
2254 ssize_t 2254 ssize_t
2255 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 2255 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2256 unsigned long *nr_segs, loff_t pos, loff_t *ppos, 2256 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2257 size_t count, size_t ocount) 2257 size_t count, size_t ocount)
2258 { 2258 {
2259 struct file *file = iocb->ki_filp; 2259 struct file *file = iocb->ki_filp;
2260 struct address_space *mapping = file->f_mapping; 2260 struct address_space *mapping = file->f_mapping;
2261 struct inode *inode = mapping->host; 2261 struct inode *inode = mapping->host;
2262 ssize_t written; 2262 ssize_t written;
2263 size_t write_len; 2263 size_t write_len;
2264 pgoff_t end; 2264 pgoff_t end;
2265 2265
2266 if (count != ocount) 2266 if (count != ocount)
2267 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2267 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2268 2268
2269 write_len = iov_length(iov, *nr_segs); 2269 write_len = iov_length(iov, *nr_segs);
2270 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; 2270 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2271 2271
2272 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); 2272 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
2273 if (written) 2273 if (written)
2274 goto out; 2274 goto out;
2275 2275
2276 /* 2276 /*
2277 * After a write we want buffered reads to be sure to go to disk to get 2277 * After a write we want buffered reads to be sure to go to disk to get
2278 * the new data. We invalidate clean cached pages from the region we're 2278 * the new data. We invalidate clean cached pages from the region we're
2279 * about to write. We do this *before* the write so that we can return 2279 * about to write. We do this *before* the write so that we can return
2280 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2280 * without clobbering -EIOCBQUEUED from ->direct_IO().
2281 */ 2281 */
2282 if (mapping->nrpages) { 2282 if (mapping->nrpages) {
2283 written = invalidate_inode_pages2_range(mapping, 2283 written = invalidate_inode_pages2_range(mapping,
2284 pos >> PAGE_CACHE_SHIFT, end); 2284 pos >> PAGE_CACHE_SHIFT, end);
2285 /* 2285 /*
2286 * If a page can not be invalidated, return 0 to fall back 2286 * If a page can not be invalidated, return 0 to fall back
2287 * to buffered write. 2287 * to buffered write.
2288 */ 2288 */
2289 if (written) { 2289 if (written) {
2290 if (written == -EBUSY) 2290 if (written == -EBUSY)
2291 return 0; 2291 return 0;
2292 goto out; 2292 goto out;
2293 } 2293 }
2294 } 2294 }
2295 2295
2296 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2296 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2297 2297
2298 /* 2298 /*
2299 * Finally, try again to invalidate clean pages which might have been 2299 * Finally, try again to invalidate clean pages which might have been
2300 * cached by non-direct readahead, or faulted in by get_user_pages() 2300 * cached by non-direct readahead, or faulted in by get_user_pages()
2301 * if the source of the write was an mmap'ed region of the file 2301 * if the source of the write was an mmap'ed region of the file
2302 * we're writing. Either one is a pretty crazy thing to do, 2302 * we're writing. Either one is a pretty crazy thing to do,
2303 * so we don't support it 100%. If this invalidation 2303 * so we don't support it 100%. If this invalidation
2304 * fails, tough, the write still worked... 2304 * fails, tough, the write still worked...
2305 */ 2305 */
2306 if (mapping->nrpages) { 2306 if (mapping->nrpages) {
2307 invalidate_inode_pages2_range(mapping, 2307 invalidate_inode_pages2_range(mapping,
2308 pos >> PAGE_CACHE_SHIFT, end); 2308 pos >> PAGE_CACHE_SHIFT, end);
2309 } 2309 }
2310 2310
2311 if (written > 0) { 2311 if (written > 0) {
2312 pos += written; 2312 pos += written;
2313 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2313 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2314 i_size_write(inode, pos); 2314 i_size_write(inode, pos);
2315 mark_inode_dirty(inode); 2315 mark_inode_dirty(inode);
2316 } 2316 }
2317 *ppos = pos; 2317 *ppos = pos;
2318 } 2318 }
2319 out: 2319 out:
2320 return written; 2320 return written;
2321 } 2321 }
2322 EXPORT_SYMBOL(generic_file_direct_write); 2322 EXPORT_SYMBOL(generic_file_direct_write);
2323 2323
2324 /* 2324 /*
2325 * Find or create a page at the given pagecache position. Return the locked 2325 * Find or create a page at the given pagecache position. Return the locked
2326 * page. This function is specifically for buffered writes. 2326 * page. This function is specifically for buffered writes.
2327 */ 2327 */
2328 struct page *grab_cache_page_write_begin(struct address_space *mapping, 2328 struct page *grab_cache_page_write_begin(struct address_space *mapping,
2329 pgoff_t index, unsigned flags) 2329 pgoff_t index, unsigned flags)
2330 { 2330 {
2331 int status; 2331 int status;
2332 struct page *page; 2332 struct page *page;
2333 gfp_t gfp_notmask = 0; 2333 gfp_t gfp_notmask = 0;
2334 if (flags & AOP_FLAG_NOFS) 2334 if (flags & AOP_FLAG_NOFS)
2335 gfp_notmask = __GFP_FS; 2335 gfp_notmask = __GFP_FS;
2336 repeat: 2336 repeat:
2337 page = find_lock_page(mapping, index); 2337 page = find_lock_page(mapping, index);
2338 if (page) 2338 if (page)
2339 goto found; 2339 goto found;
2340 2340
2341 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2341 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
2342 if (!page) 2342 if (!page)
2343 return NULL; 2343 return NULL;
2344 status = add_to_page_cache_lru(page, mapping, index, 2344 status = add_to_page_cache_lru(page, mapping, index,
2345 GFP_KERNEL & ~gfp_notmask); 2345 GFP_KERNEL & ~gfp_notmask);
2346 if (unlikely(status)) { 2346 if (unlikely(status)) {
2347 page_cache_release(page); 2347 page_cache_release(page);
2348 if (status == -EEXIST) 2348 if (status == -EEXIST)
2349 goto repeat; 2349 goto repeat;
2350 return NULL; 2350 return NULL;
2351 } 2351 }
2352 found: 2352 found:
2353 wait_on_page_writeback(page); 2353 wait_on_page_writeback(page);
2354 return page; 2354 return page;
2355 } 2355 }
2356 EXPORT_SYMBOL(grab_cache_page_write_begin); 2356 EXPORT_SYMBOL(grab_cache_page_write_begin);
2357 2357
2358 static ssize_t generic_perform_write(struct file *file, 2358 static ssize_t generic_perform_write(struct file *file,
2359 struct iov_iter *i, loff_t pos) 2359 struct iov_iter *i, loff_t pos)
2360 { 2360 {
2361 struct address_space *mapping = file->f_mapping; 2361 struct address_space *mapping = file->f_mapping;
2362 const struct address_space_operations *a_ops = mapping->a_ops; 2362 const struct address_space_operations *a_ops = mapping->a_ops;
2363 long status = 0; 2363 long status = 0;
2364 ssize_t written = 0; 2364 ssize_t written = 0;
2365 unsigned int flags = 0; 2365 unsigned int flags = 0;
2366 2366
2367 /* 2367 /*
2368 * Copies from kernel address space cannot fail (NFSD is a big user). 2368 * Copies from kernel address space cannot fail (NFSD is a big user).
2369 */ 2369 */
2370 if (segment_eq(get_fs(), KERNEL_DS)) 2370 if (segment_eq(get_fs(), KERNEL_DS))
2371 flags |= AOP_FLAG_UNINTERRUPTIBLE; 2371 flags |= AOP_FLAG_UNINTERRUPTIBLE;
2372 2372
2373 do { 2373 do {
2374 struct page *page; 2374 struct page *page;
2375 unsigned long offset; /* Offset into pagecache page */ 2375 unsigned long offset; /* Offset into pagecache page */
2376 unsigned long bytes; /* Bytes to write to page */ 2376 unsigned long bytes; /* Bytes to write to page */
2377 size_t copied; /* Bytes copied from user */ 2377 size_t copied; /* Bytes copied from user */
2378 void *fsdata; 2378 void *fsdata;
2379 2379
2380 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2380 offset = (pos & (PAGE_CACHE_SIZE - 1));
2381 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2381 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2382 iov_iter_count(i)); 2382 iov_iter_count(i));
2383 2383
2384 again: 2384 again:
2385 2385
2386 /* 2386 /*
2387 * Bring in the user page that we will copy from _first_. 2387 * Bring in the user page that we will copy from _first_.
2388 * Otherwise there's a nasty deadlock on copying from the 2388 * Otherwise there's a nasty deadlock on copying from the
2389 * same page as we're writing to, without it being marked 2389 * same page as we're writing to, without it being marked
2390 * up-to-date. 2390 * up-to-date.
2391 * 2391 *
2392 * Not only is this an optimisation, but it is also required 2392 * Not only is this an optimisation, but it is also required
2393 * to check that the address is actually valid, when atomic 2393 * to check that the address is actually valid, when atomic
2394 * usercopies are used, below. 2394 * usercopies are used, below.
2395 */ 2395 */
2396 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2396 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2397 status = -EFAULT; 2397 status = -EFAULT;
2398 break; 2398 break;
2399 } 2399 }
2400 2400
2401 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2401 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2402 &page, &fsdata); 2402 &page, &fsdata);
2403 if (unlikely(status)) 2403 if (unlikely(status))
2404 break; 2404 break;
2405 2405
2406 if (mapping_writably_mapped(mapping)) 2406 if (mapping_writably_mapped(mapping))
2407 flush_dcache_page(page); 2407 flush_dcache_page(page);
2408 2408
2409 pagefault_disable(); 2409 pagefault_disable();
2410 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2410 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2411 pagefault_enable(); 2411 pagefault_enable();
2412 flush_dcache_page(page); 2412 flush_dcache_page(page);
2413 2413
2414 mark_page_accessed(page); 2414 mark_page_accessed(page);
2415 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2415 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2416 page, fsdata); 2416 page, fsdata);
2417 if (unlikely(status < 0)) 2417 if (unlikely(status < 0))
2418 break; 2418 break;
2419 copied = status; 2419 copied = status;
2420 2420
2421 cond_resched(); 2421 cond_resched();
2422 2422
2423 iov_iter_advance(i, copied); 2423 iov_iter_advance(i, copied);
2424 if (unlikely(copied == 0)) { 2424 if (unlikely(copied == 0)) {
2425 /* 2425 /*
2426 * If we were unable to copy any data at all, we must 2426 * If we were unable to copy any data at all, we must
2427 * fall back to a single segment length write. 2427 * fall back to a single segment length write.
2428 * 2428 *
2429 * If we didn't fallback here, we could livelock 2429 * If we didn't fallback here, we could livelock
2430 * because not all segments in the iov can be copied at 2430 * because not all segments in the iov can be copied at
2431 * once without a pagefault. 2431 * once without a pagefault.
2432 */ 2432 */
2433 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2433 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2434 iov_iter_single_seg_count(i)); 2434 iov_iter_single_seg_count(i));
2435 goto again; 2435 goto again;
2436 } 2436 }
2437 pos += copied; 2437 pos += copied;
2438 written += copied; 2438 written += copied;
2439 2439
2440 balance_dirty_pages_ratelimited(mapping); 2440 balance_dirty_pages_ratelimited(mapping);
2441 2441
2442 } while (iov_iter_count(i)); 2442 } while (iov_iter_count(i));
2443 2443
2444 return written ? written : status; 2444 return written ? written : status;
2445 } 2445 }
2446 2446
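Note how the two explanatory comments inside the loop work together: iov_iter_fault_in_readable() at the top of a pass only faults in the first segment, so the atomic usercopy can still come up short when the chunk spans further segments. A short copy simply advances the iterator and goes around again; only a copy of zero bytes shrinks the chunk to a single segment's worth before jumping back to again:, where that one segment gets prefaulted, and that fallback is what rules out the livelock the second comment warns about.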
2447 ssize_t 2447 ssize_t
2448 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, 2448 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2449 unsigned long nr_segs, loff_t pos, loff_t *ppos, 2449 unsigned long nr_segs, loff_t pos, loff_t *ppos,
2450 size_t count, ssize_t written) 2450 size_t count, ssize_t written)
2451 { 2451 {
2452 struct file *file = iocb->ki_filp; 2452 struct file *file = iocb->ki_filp;
2453 ssize_t status; 2453 ssize_t status;
2454 struct iov_iter i; 2454 struct iov_iter i;
2455 2455
2456 iov_iter_init(&i, iov, nr_segs, count, written); 2456 iov_iter_init(&i, iov, nr_segs, count, written);
2457 status = generic_perform_write(file, &i, pos); 2457 status = generic_perform_write(file, &i, pos);
2458 2458
2459 if (likely(status >= 0)) { 2459 if (likely(status >= 0)) {
2460 written += status; 2460 written += status;
2461 *ppos = pos + status; 2461 *ppos = pos + status;
2462 } 2462 }
2463 2463
2464 return written ? written : status; 2464 return written ? written : status;
2465 } 2465 }
2466 EXPORT_SYMBOL(generic_file_buffered_write); 2466 EXPORT_SYMBOL(generic_file_buffered_write);
2467 2467
2468 /** 2468 /**
2469 * __generic_file_aio_write - write data to a file 2469 * __generic_file_aio_write - write data to a file
2470 * @iocb: IO state structure (file, offset, etc.) 2470 * @iocb: IO state structure (file, offset, etc.)
2471 * @iov: vector with data to write 2471 * @iov: vector with data to write
2472 * @nr_segs: number of segments in the vector 2472 * @nr_segs: number of segments in the vector
2473 * @ppos: position where to write 2473 * @ppos: position where to write
2474 * 2474 *
2475 * This function does all the work needed for actually writing data to a 2475 * This function does all the work needed for actually writing data to a
2476 * file. It does all basic checks, removes SUID from the file, updates 2476 * file. It does all basic checks, removes SUID from the file, updates
2477 * modification times and calls proper subroutines depending on whether we 2477 * modification times and calls proper subroutines depending on whether we
2478 * do direct IO or a standard buffered write. 2478 * do direct IO or a standard buffered write.
2479 * 2479 *
2480 * It expects i_mutex to be grabbed unless we work on a block device or similar 2480 * It expects i_mutex to be grabbed unless we work on a block device or similar
2481 * object which does not need locking at all. 2481 * object which does not need locking at all.
2482 * 2482 *
2483 * This function does *not* take care of syncing data in case of O_SYNC write. 2483 * This function does *not* take care of syncing data in case of O_SYNC write.
2484 * A caller has to handle it. This is mainly due to the fact that we want to 2484 * A caller has to handle it. This is mainly due to the fact that we want to
2485 * avoid syncing under i_mutex. 2485 * avoid syncing under i_mutex.
2486 */ 2486 */
2487 ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2487 ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2488 unsigned long nr_segs, loff_t *ppos) 2488 unsigned long nr_segs, loff_t *ppos)
2489 { 2489 {
2490 struct file *file = iocb->ki_filp; 2490 struct file *file = iocb->ki_filp;
2491 struct address_space * mapping = file->f_mapping; 2491 struct address_space * mapping = file->f_mapping;
2492 size_t ocount; /* original count */ 2492 size_t ocount; /* original count */
2493 size_t count; /* after file limit checks */ 2493 size_t count; /* after file limit checks */
2494 struct inode *inode = mapping->host; 2494 struct inode *inode = mapping->host;
2495 loff_t pos; 2495 loff_t pos;
2496 ssize_t written; 2496 ssize_t written;
2497 ssize_t err; 2497 ssize_t err;
2498 2498
2499 ocount = 0; 2499 ocount = 0;
2500 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 2500 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2501 if (err) 2501 if (err)
2502 return err; 2502 return err;
2503 2503
2504 count = ocount; 2504 count = ocount;
2505 pos = *ppos; 2505 pos = *ppos;
2506 2506
2507 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2507 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2508 2508
2509 /* We can write back this queue in page reclaim */ 2509 /* We can write back this queue in page reclaim */
2510 current->backing_dev_info = mapping->backing_dev_info; 2510 current->backing_dev_info = mapping->backing_dev_info;
2511 written = 0; 2511 written = 0;
2512 2512
2513 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 2513 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2514 if (err) 2514 if (err)
2515 goto out; 2515 goto out;
2516 2516
2517 if (count == 0) 2517 if (count == 0)
2518 goto out; 2518 goto out;
2519 2519
2520 err = file_remove_suid(file); 2520 err = file_remove_suid(file);
2521 if (err) 2521 if (err)
2522 goto out; 2522 goto out;
2523 2523
2524 file_update_time(file); 2524 file_update_time(file);
2525 2525
2526 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2526 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2527 if (unlikely(file->f_flags & O_DIRECT)) { 2527 if (unlikely(file->f_flags & O_DIRECT)) {
2528 loff_t endbyte; 2528 loff_t endbyte;
2529 ssize_t written_buffered; 2529 ssize_t written_buffered;
2530 2530
2531 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 2531 written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2532 ppos, count, ocount); 2532 ppos, count, ocount);
2533 if (written < 0 || written == count) 2533 if (written < 0 || written == count)
2534 goto out; 2534 goto out;
2535 /* 2535 /*
2536 * direct-io write to a hole: fall through to buffered I/O 2536 * direct-io write to a hole: fall through to buffered I/O
2537 * for completing the rest of the request. 2537 * for completing the rest of the request.
2538 */ 2538 */
2539 pos += written; 2539 pos += written;
2540 count -= written; 2540 count -= written;
2541 written_buffered = generic_file_buffered_write(iocb, iov, 2541 written_buffered = generic_file_buffered_write(iocb, iov,
2542 nr_segs, pos, ppos, count, 2542 nr_segs, pos, ppos, count,
2543 written); 2543 written);
2544 /* 2544 /*
2545 * If generic_file_buffered_write() returned a synchronous error 2545 * If generic_file_buffered_write() returned a synchronous error
2546 * then we want to return the number of bytes which were 2546 * then we want to return the number of bytes which were
2547 * direct-written, or the error code if that was zero. Note 2547 * direct-written, or the error code if that was zero. Note
2548 * that this differs from normal direct-io semantics, which 2548 * that this differs from normal direct-io semantics, which
2549 * will return -EFOO even if some bytes were written. 2549 * will return -EFOO even if some bytes were written.
2550 */ 2550 */
2551 if (written_buffered < 0) { 2551 if (written_buffered < 0) {
2552 err = written_buffered; 2552 err = written_buffered;
2553 goto out; 2553 goto out;
2554 } 2554 }
2555 2555
2556 /* 2556 /*
2557 * We need to ensure that the page cache pages are written to 2557 * We need to ensure that the page cache pages are written to
2558 * disk and invalidated to preserve the expected O_DIRECT 2558 * disk and invalidated to preserve the expected O_DIRECT
2559 * semantics. 2559 * semantics.
2560 */ 2560 */
2561 endbyte = pos + written_buffered - written - 1; 2561 endbyte = pos + written_buffered - written - 1;
2562 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 2562 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
2563 if (err == 0) { 2563 if (err == 0) {
2564 written = written_buffered; 2564 written = written_buffered;
2565 invalidate_mapping_pages(mapping, 2565 invalidate_mapping_pages(mapping,
2566 pos >> PAGE_CACHE_SHIFT, 2566 pos >> PAGE_CACHE_SHIFT,
2567 endbyte >> PAGE_CACHE_SHIFT); 2567 endbyte >> PAGE_CACHE_SHIFT);
2568 } else { 2568 } else {
2569 /* 2569 /*
2570 * We don't know how much we wrote, so just return 2570 * We don't know how much we wrote, so just return
2571 * the number of bytes which were direct-written 2571 * the number of bytes which were direct-written
2572 */ 2572 */
2573 } 2573 }
2574 } else { 2574 } else {
2575 written = generic_file_buffered_write(iocb, iov, nr_segs, 2575 written = generic_file_buffered_write(iocb, iov, nr_segs,
2576 pos, ppos, count, written); 2576 pos, ppos, count, written);
2577 } 2577 }
2578 out: 2578 out:
2579 current->backing_dev_info = NULL; 2579 current->backing_dev_info = NULL;
2580 return written ? written : err; 2580 return written ? written : err;
2581 } 2581 }
2582 EXPORT_SYMBOL(__generic_file_aio_write); 2582 EXPORT_SYMBOL(__generic_file_aio_write);
2583 2583
2584 /** 2584 /**
2585 * generic_file_aio_write - write data to a file 2585 * generic_file_aio_write - write data to a file
2586 * @iocb: IO state structure 2586 * @iocb: IO state structure
2587 * @iov: vector with data to write 2587 * @iov: vector with data to write
2588 * @nr_segs: number of segments in the vector 2588 * @nr_segs: number of segments in the vector
2589 * @pos: position in file where to write 2589 * @pos: position in file where to write
2590 * 2590 *
2591 * This is a wrapper around __generic_file_aio_write() to be used by most 2591 * This is a wrapper around __generic_file_aio_write() to be used by most
2592 * filesystems. It takes care of syncing the file in case of an O_SYNC write 2592 * filesystems. It takes care of syncing the file in case of an O_SYNC write
2593 * and acquires i_mutex as needed. 2593 * and acquires i_mutex as needed.
2594 */ 2594 */
2595 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2595 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2596 unsigned long nr_segs, loff_t pos) 2596 unsigned long nr_segs, loff_t pos)
2597 { 2597 {
2598 struct file *file = iocb->ki_filp; 2598 struct file *file = iocb->ki_filp;
2599 struct inode *inode = file->f_mapping->host; 2599 struct inode *inode = file->f_mapping->host;
2600 struct blk_plug plug; 2600 struct blk_plug plug;
2601 ssize_t ret; 2601 ssize_t ret;
2602 2602
2603 BUG_ON(iocb->ki_pos != pos); 2603 BUG_ON(iocb->ki_pos != pos);
2604 2604
2605 mutex_lock(&inode->i_mutex); 2605 mutex_lock(&inode->i_mutex);
2606 blk_start_plug(&plug); 2606 blk_start_plug(&plug);
2607 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2607 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2608 mutex_unlock(&inode->i_mutex); 2608 mutex_unlock(&inode->i_mutex);
2609 2609
2610 if (ret > 0 || ret == -EIOCBQUEUED) { 2610 if (ret > 0 || ret == -EIOCBQUEUED) {
2611 ssize_t err; 2611 ssize_t err;
2612 2612
2613 err = generic_write_sync(file, pos, ret); 2613 err = generic_write_sync(file, pos, ret);
2614 if (err < 0 && ret > 0) 2614 if (err < 0 && ret > 0)
2615 ret = err; 2615 ret = err;
2616 } 2616 }
2617 blk_finish_plug(&plug); 2617 blk_finish_plug(&plug);
2618 return ret; 2618 return ret;
2619 } 2619 }
2620 EXPORT_SYMBOL(generic_file_aio_write); 2620 EXPORT_SYMBOL(generic_file_aio_write);
2621 2621
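As the kernel-doc above says, this is the entry point most filesystems plug in directly. A minimal sketch of the usual wiring for a simple filesystem of this era (the examplefs name is invented; the generic helpers referenced are the ones declared in <linux/fs.h>):

#include <linux/fs.h>

/*
 * Illustrative only: a simple filesystem lets the generic helpers do the
 * work -- generic_file_aio_write() then performs the checks, the
 * O_DIRECT/buffered split and the O_SYNC handling shown above.
 */
static const struct file_operations examplefs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
        .splice_read    = generic_file_splice_read,
};

Filesystems that need their own locking or journalling around the write call __generic_file_aio_write() instead and take care of i_mutex and syncing themselves, as its kernel-doc above spells out.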
2622 /** 2622 /**
2623 * try_to_release_page() - release old fs-specific metadata on a page 2623 * try_to_release_page() - release old fs-specific metadata on a page
2624 * 2624 *
2625 * @page: the page which the kernel is trying to free 2625 * @page: the page which the kernel is trying to free
2626 * @gfp_mask: memory allocation flags (and I/O mode) 2626 * @gfp_mask: memory allocation flags (and I/O mode)
2627 * 2627 *
2628 * The address_space is asked to try to release any data against the page 2628 * The address_space is asked to try to release any data against the page
2629 * (presumably at page->private). If the release was successful, return `1'. 2629 * (presumably at page->private). If the release was successful, return `1'.
2630 * Otherwise return zero. 2630 * Otherwise return zero.
2631 * 2631 *
2632 * This may also be called if PG_fscache is set on a page, indicating that the 2632 * This may also be called if PG_fscache is set on a page, indicating that the
2633 * page is known to the local caching routines. 2633 * page is known to the local caching routines.
2634 * 2634 *
2635 * The @gfp_mask argument specifies whether I/O may be performed to release 2635 * The @gfp_mask argument specifies whether I/O may be performed to release
2636 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 2636 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2637 * 2637 *
2638 */ 2638 */
2639 int try_to_release_page(struct page *page, gfp_t gfp_mask) 2639 int try_to_release_page(struct page *page, gfp_t gfp_mask)
2640 { 2640 {
2641 struct address_space * const mapping = page->mapping; 2641 struct address_space * const mapping = page->mapping;
2642 2642
2643 BUG_ON(!PageLocked(page)); 2643 BUG_ON(!PageLocked(page));
2644 if (PageWriteback(page)) 2644 if (PageWriteback(page))
2645 return 0; 2645 return 0;
2646 2646
2647 if (mapping && mapping->a_ops->releasepage) 2647 if (mapping && mapping->a_ops->releasepage)
2648 return mapping->a_ops->releasepage(page, gfp_mask); 2648 return mapping->a_ops->releasepage(page, gfp_mask);
2649 return try_to_free_buffers(page); 2649 return try_to_free_buffers(page);
2650 } 2650 }
2651 2651
2652 EXPORT_SYMBOL(try_to_release_page); 2652 EXPORT_SYMBOL(try_to_release_page);
2653 2653
1 /* 1 /*
2 * mm/page-writeback.c 2 * mm/page-writeback.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 5 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
6 * 6 *
7 * Contains functions related to writing back dirty pages at the 7 * Contains functions related to writing back dirty pages at the
8 * address_space level. 8 * address_space level.
9 * 9 *
10 * 10Apr2002 Andrew Morton 10 * 10Apr2002 Andrew Morton
11 * Initial version 11 * Initial version
12 */ 12 */
13 13
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/spinlock.h> 16 #include <linux/spinlock.h>
17 #include <linux/fs.h> 17 #include <linux/fs.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/slab.h> 20 #include <linux/slab.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/writeback.h> 22 #include <linux/writeback.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/backing-dev.h> 24 #include <linux/backing-dev.h>
25 #include <linux/task_io_accounting_ops.h> 25 #include <linux/task_io_accounting_ops.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/mpage.h> 27 #include <linux/mpage.h>
28 #include <linux/rmap.h> 28 #include <linux/rmap.h>
29 #include <linux/percpu.h> 29 #include <linux/percpu.h>
30 #include <linux/notifier.h> 30 #include <linux/notifier.h>
31 #include <linux/smp.h> 31 #include <linux/smp.h>
32 #include <linux/sysctl.h> 32 #include <linux/sysctl.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/syscalls.h> 34 #include <linux/syscalls.h>
35 #include <linux/buffer_head.h> 35 #include <linux/buffer_head.h>
36 #include <linux/pagevec.h> 36 #include <linux/pagevec.h>
37 #include <trace/events/writeback.h> 37 #include <trace/events/writeback.h>
38 38
39 /* 39 /*
40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 */
42 #define MAX_PAUSE max(HZ/5, 1)
43
44 /*
45 * Estimate write bandwidth at 200ms intervals.
46 */
47 #define BANDWIDTH_INTERVAL max(HZ/5, 1)
48
49 /*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 * will look to see if it needs to force writeback or throttling. 51 * will look to see if it needs to force writeback or throttling.
42 */ 52 */
43 static long ratelimit_pages = 32; 53 static long ratelimit_pages = 32;
44 54
45 /* 55 /*
46 * When balance_dirty_pages decides that the caller needs to perform some 56 * When balance_dirty_pages decides that the caller needs to perform some
47 * non-background writeback, this is how many pages it will attempt to write. 57 * non-background writeback, this is how many pages it will attempt to write.
48 * It should be somewhat larger than dirtied pages to ensure that reasonably 58 * It should be somewhat larger than dirtied pages to ensure that reasonably
49 * large amounts of I/O are submitted. 59 * large amounts of I/O are submitted.
50 */ 60 */
51 static inline long sync_writeback_pages(unsigned long dirtied) 61 static inline long sync_writeback_pages(unsigned long dirtied)
52 { 62 {
53 if (dirtied < ratelimit_pages) 63 if (dirtied < ratelimit_pages)
54 dirtied = ratelimit_pages; 64 dirtied = ratelimit_pages;
55 65
56 return dirtied + dirtied / 2; 66 return dirtied + dirtied / 2;
57 } 67 }
58 68
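For example, with ratelimit_pages at its initial value of 32, a caller that dirtied only 8 pages is still asked to write 32 + 32/2 = 48 pages, while one that dirtied 1000 pages is asked for 1000 + 500 = 1500.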
59 /* The following parameters are exported via /proc/sys/vm */ 69 /* The following parameters are exported via /proc/sys/vm */
60 70
61 /* 71 /*
62 * Start background writeback (via writeback threads) at this percentage 72 * Start background writeback (via writeback threads) at this percentage
63 */ 73 */
64 int dirty_background_ratio = 10; 74 int dirty_background_ratio = 10;
65 75
66 /* 76 /*
67 * dirty_background_bytes starts at 0 (disabled) so that it is a function of 77 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
68 * dirty_background_ratio * the amount of dirtyable memory 78 * dirty_background_ratio * the amount of dirtyable memory
69 */ 79 */
70 unsigned long dirty_background_bytes; 80 unsigned long dirty_background_bytes;
71 81
72 /* 82 /*
73 * free highmem will not be subtracted from the total free memory 83 * free highmem will not be subtracted from the total free memory
74 * for calculating free ratios if vm_highmem_is_dirtyable is true 84 * for calculating free ratios if vm_highmem_is_dirtyable is true
75 */ 85 */
76 int vm_highmem_is_dirtyable; 86 int vm_highmem_is_dirtyable;
77 87
78 /* 88 /*
79 * The generator of dirty data starts writeback at this percentage 89 * The generator of dirty data starts writeback at this percentage
80 */ 90 */
81 int vm_dirty_ratio = 20; 91 int vm_dirty_ratio = 20;
82 92
83 /* 93 /*
84 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of 94 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
85 * vm_dirty_ratio * the amount of dirtyable memory 95 * vm_dirty_ratio * the amount of dirtyable memory
86 */ 96 */
87 unsigned long vm_dirty_bytes; 97 unsigned long vm_dirty_bytes;
88 98
89 /* 99 /*
90 * The interval between `kupdate'-style writebacks 100 * The interval between `kupdate'-style writebacks
91 */ 101 */
92 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ 102 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
93 103
94 /* 104 /*
95 * The longest time for which data is allowed to remain dirty 105 * The longest time for which data is allowed to remain dirty
96 */ 106 */
97 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ 107 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
98 108
99 /* 109 /*
100 * Flag that makes the machine dump writes/reads and block dirtyings. 110 * Flag that makes the machine dump writes/reads and block dirtyings.
101 */ 111 */
102 int block_dump; 112 int block_dump;
103 113
104 /* 114 /*
105 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: 115 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
106 * a full sync is triggered after this time elapses without any disk activity. 116 * a full sync is triggered after this time elapses without any disk activity.
107 */ 117 */
108 int laptop_mode; 118 int laptop_mode;
109 119
110 EXPORT_SYMBOL(laptop_mode); 120 EXPORT_SYMBOL(laptop_mode);
111 121
112 /* End of sysctl-exported parameters */ 122 /* End of sysctl-exported parameters */
113 123
124 unsigned long global_dirty_limit;
114 125
115 /* 126 /*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 127 * Scale the writeback cache size proportional to the relative writeout speeds.
117 * 128 *
118 * We do this by keeping a floating proportion between BDIs, based on page 129 * We do this by keeping a floating proportion between BDIs, based on page
119 * writeback completions [end_page_writeback()]. Those devices that write out 130 * writeback completions [end_page_writeback()]. Those devices that write out
120 * pages fastest will get the larger share, while the slower will get a smaller 131 * pages fastest will get the larger share, while the slower will get a smaller
121 * share. 132 * share.
122 * 133 *
123 * We use page writeout completions because we are interested in getting rid of 134 * We use page writeout completions because we are interested in getting rid of
124 * dirty pages. Having them written out is the primary goal. 135 * dirty pages. Having them written out is the primary goal.
125 * 136 *
126 * We introduce a concept of time, a period over which we measure these events, 137 * We introduce a concept of time, a period over which we measure these events,
127 * because demand can/will vary over time. The length of this period itself is 138 * because demand can/will vary over time. The length of this period itself is
128 * measured in page writeback completions. 139 * measured in page writeback completions.
129 * 140 *
130 */ 141 */
131 static struct prop_descriptor vm_completions; 142 static struct prop_descriptor vm_completions;
132 static struct prop_descriptor vm_dirties; 143 static struct prop_descriptor vm_dirties;
133 144
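Concretely, if over the most recent measurement period one bdi completed, say, three times as many page writebacks as the only other bdi, the proportion code credits it with roughly three quarters of the total share, and the split keeps drifting as the relative writeout speeds change.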
134 /* 145 /*
135 * couple the period to the dirty_ratio: 146 * couple the period to the dirty_ratio:
136 * 147 *
137 * period/2 ~ roundup_pow_of_two(dirty limit) 148 * period/2 ~ roundup_pow_of_two(dirty limit)
138 */ 149 */
139 static int calc_period_shift(void) 150 static int calc_period_shift(void)
140 { 151 {
141 unsigned long dirty_total; 152 unsigned long dirty_total;
142 153
143 if (vm_dirty_bytes) 154 if (vm_dirty_bytes)
144 dirty_total = vm_dirty_bytes / PAGE_SIZE; 155 dirty_total = vm_dirty_bytes / PAGE_SIZE;
145 else 156 else
146 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 157 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
147 100; 158 100;
148 return 2 + ilog2(dirty_total - 1); 159 return 2 + ilog2(dirty_total - 1);
149 } 160 }
150 161
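As a worked example: with vm_dirty_ratio = 20 and 4 GiB of dirtyable memory (2^20 4 KiB pages), dirty_total is 209715 pages, ilog2(209714) is 17, so the shift comes out as 19; the period is then 2^19 completions and period/2 = 2^18 = 262144, which is exactly roundup_pow_of_two(209715), as the comment promises.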
151 /* 162 /*
152 * update the period when the dirty threshold changes. 163 * update the period when the dirty threshold changes.
153 */ 164 */
154 static void update_completion_period(void) 165 static void update_completion_period(void)
155 { 166 {
156 int shift = calc_period_shift(); 167 int shift = calc_period_shift();
157 prop_change_shift(&vm_completions, shift); 168 prop_change_shift(&vm_completions, shift);
158 prop_change_shift(&vm_dirties, shift); 169 prop_change_shift(&vm_dirties, shift);
159 } 170 }
160 171
161 int dirty_background_ratio_handler(struct ctl_table *table, int write, 172 int dirty_background_ratio_handler(struct ctl_table *table, int write,
162 void __user *buffer, size_t *lenp, 173 void __user *buffer, size_t *lenp,
163 loff_t *ppos) 174 loff_t *ppos)
164 { 175 {
165 int ret; 176 int ret;
166 177
167 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 178 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
168 if (ret == 0 && write) 179 if (ret == 0 && write)
169 dirty_background_bytes = 0; 180 dirty_background_bytes = 0;
170 return ret; 181 return ret;
171 } 182 }
172 183
173 int dirty_background_bytes_handler(struct ctl_table *table, int write, 184 int dirty_background_bytes_handler(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, 185 void __user *buffer, size_t *lenp,
175 loff_t *ppos) 186 loff_t *ppos)
176 { 187 {
177 int ret; 188 int ret;
178 189
179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 190 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 if (ret == 0 && write) 191 if (ret == 0 && write)
181 dirty_background_ratio = 0; 192 dirty_background_ratio = 0;
182 return ret; 193 return ret;
183 } 194 }
184 195
185 int dirty_ratio_handler(struct ctl_table *table, int write, 196 int dirty_ratio_handler(struct ctl_table *table, int write,
186 void __user *buffer, size_t *lenp, 197 void __user *buffer, size_t *lenp,
187 loff_t *ppos) 198 loff_t *ppos)
188 { 199 {
189 int old_ratio = vm_dirty_ratio; 200 int old_ratio = vm_dirty_ratio;
190 int ret; 201 int ret;
191 202
192 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 203 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
193 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 204 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
194 update_completion_period(); 205 update_completion_period();
195 vm_dirty_bytes = 0; 206 vm_dirty_bytes = 0;
196 } 207 }
197 return ret; 208 return ret;
198 } 209 }
199 210
200 211
201 int dirty_bytes_handler(struct ctl_table *table, int write, 212 int dirty_bytes_handler(struct ctl_table *table, int write,
202 void __user *buffer, size_t *lenp, 213 void __user *buffer, size_t *lenp,
203 loff_t *ppos) 214 loff_t *ppos)
204 { 215 {
205 unsigned long old_bytes = vm_dirty_bytes; 216 unsigned long old_bytes = vm_dirty_bytes;
206 int ret; 217 int ret;
207 218
208 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 219 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
209 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 220 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
210 update_completion_period(); 221 update_completion_period();
211 vm_dirty_ratio = 0; 222 vm_dirty_ratio = 0;
212 } 223 }
213 return ret; 224 return ret;
214 } 225 }
215 226
216 /* 227 /*
217 * Increment the BDI's writeout completion count and the global writeout 228 * Increment the BDI's writeout completion count and the global writeout
218 * completion count. Called from test_clear_page_writeback(). 229 * completion count. Called from test_clear_page_writeback().
219 */ 230 */
220 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 231 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221 { 232 {
233 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 235 bdi->max_prop_frac);
224 } 236 }
225 237
226 void bdi_writeout_inc(struct backing_dev_info *bdi) 238 void bdi_writeout_inc(struct backing_dev_info *bdi)
227 { 239 {
228 unsigned long flags; 240 unsigned long flags;
229 241
230 local_irq_save(flags); 242 local_irq_save(flags);
231 __bdi_writeout_inc(bdi); 243 __bdi_writeout_inc(bdi);
232 local_irq_restore(flags); 244 local_irq_restore(flags);
233 } 245 }
234 EXPORT_SYMBOL_GPL(bdi_writeout_inc); 246 EXPORT_SYMBOL_GPL(bdi_writeout_inc);
235 247
236 void task_dirty_inc(struct task_struct *tsk) 248 void task_dirty_inc(struct task_struct *tsk)
237 { 249 {
238 prop_inc_single(&vm_dirties, &tsk->dirties); 250 prop_inc_single(&vm_dirties, &tsk->dirties);
239 } 251 }
240 252
241 /* 253 /*
242 * Obtain an accurate fraction of the BDI's portion. 254 * Obtain an accurate fraction of the BDI's portion.
243 */ 255 */
244 static void bdi_writeout_fraction(struct backing_dev_info *bdi, 256 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 257 long *numerator, long *denominator)
246 { 258 {
247 if (bdi_cap_writeback_dirty(bdi)) { 259 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 260 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254 } 261 }
255 262
256 static inline void task_dirties_fraction(struct task_struct *tsk, 263 static inline void task_dirties_fraction(struct task_struct *tsk,
257 long *numerator, long *denominator) 264 long *numerator, long *denominator)
258 { 265 {
259 prop_fraction_single(&vm_dirties, &tsk->dirties, 266 prop_fraction_single(&vm_dirties, &tsk->dirties,
260 numerator, denominator); 267 numerator, denominator);
261 } 268 }
262 269
263 /* 270 /*
264 * task_dirty_limit - scale down dirty throttling threshold for one task 271 * task_dirty_limit - scale down dirty throttling threshold for one task
265 * 272 *
266 * task specific dirty limit: 273 * task specific dirty limit:
267 * 274 *
268 * dirty -= (dirty/8) * p_{t} 275 * dirty -= (dirty/8) * p_{t}
269 * 276 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start 277 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit. 278 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when 279 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will 280 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled. 282 * dirty threshold may never get throttled.
276 */ 283 */
284 #define TASK_LIMIT_FRACTION 8
277 static unsigned long task_dirty_limit(struct task_struct *tsk, 285 static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty) 286 unsigned long bdi_dirty)
279 { 287 {
280 long numerator, denominator; 288 long numerator, denominator;
281 unsigned long dirty = bdi_dirty; 289 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3; 290 u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291
284 task_dirties_fraction(tsk, &numerator, &denominator); 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator; 293 inv *= numerator;
286 do_div(inv, denominator); 294 do_div(inv, denominator);
287 295
288 dirty -= inv; 296 dirty -= inv;
289 297
290 return max(dirty, bdi_dirty/2); 298 return max(dirty, bdi_dirty/2);
291 } 299 }
292 300
301 /* Minimum limit for any task */
302 static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303 {
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305 }
306
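To see the task formula in plain numbers, here is the same arithmetic as a standalone userspace sketch (task_limit() is a made-up name; numerator/denominator stand in for the fraction that task_dirties_fraction() reports):

#include <stdio.h>

#define TASK_LIMIT_FRACTION 8

/* Same arithmetic as task_dirty_limit() above, in plain userspace C. */
static unsigned long task_limit(unsigned long bdi_dirty,
                                long numerator, long denominator)
{
        unsigned long dirty = bdi_dirty;
        unsigned long long inv = dirty / TASK_LIMIT_FRACTION;

        inv = inv * numerator / denominator;    /* (dirty/8) * p_task */
        dirty -= inv;

        return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

int main(void)
{
        /* bdi limit of 800 pages: a task doing half of the recent dirtying
         * is throttled at 750 pages, one doing all of it at 700. */
        printf("%lu %lu\n", task_limit(800, 1, 2), task_limit(800, 1, 1));
        return 0;
}

Running it prints 750 700; a task responsible for all recent dirtying is pushed down to exactly task_min_dirty_limit(800).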
293 /* 307 /*
294 * 308 *
295 */ 309 */
296 static unsigned int bdi_min_ratio; 310 static unsigned int bdi_min_ratio;
297 311
298 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 312 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
299 { 313 {
300 int ret = 0; 314 int ret = 0;
301 315
302 spin_lock_bh(&bdi_lock); 316 spin_lock_bh(&bdi_lock);
303 if (min_ratio > bdi->max_ratio) { 317 if (min_ratio > bdi->max_ratio) {
304 ret = -EINVAL; 318 ret = -EINVAL;
305 } else { 319 } else {
306 min_ratio -= bdi->min_ratio; 320 min_ratio -= bdi->min_ratio;
307 if (bdi_min_ratio + min_ratio < 100) { 321 if (bdi_min_ratio + min_ratio < 100) {
308 bdi_min_ratio += min_ratio; 322 bdi_min_ratio += min_ratio;
309 bdi->min_ratio += min_ratio; 323 bdi->min_ratio += min_ratio;
310 } else { 324 } else {
311 ret = -EINVAL; 325 ret = -EINVAL;
312 } 326 }
313 } 327 }
314 spin_unlock_bh(&bdi_lock); 328 spin_unlock_bh(&bdi_lock);
315 329
316 return ret; 330 return ret;
317 } 331 }
318 332
319 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 333 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
320 { 334 {
321 int ret = 0; 335 int ret = 0;
322 336
323 if (max_ratio > 100) 337 if (max_ratio > 100)
324 return -EINVAL; 338 return -EINVAL;
325 339
326 spin_lock_bh(&bdi_lock); 340 spin_lock_bh(&bdi_lock);
327 if (bdi->min_ratio > max_ratio) { 341 if (bdi->min_ratio > max_ratio) {
328 ret = -EINVAL; 342 ret = -EINVAL;
329 } else { 343 } else {
330 bdi->max_ratio = max_ratio; 344 bdi->max_ratio = max_ratio;
331 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 345 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
332 } 346 }
333 spin_unlock_bh(&bdi_lock); 347 spin_unlock_bh(&bdi_lock);
334 348
335 return ret; 349 return ret;
336 } 350 }
337 EXPORT_SYMBOL(bdi_set_max_ratio); 351 EXPORT_SYMBOL(bdi_set_max_ratio);
338 352
339 /* 353 /*
340 * Work out the current dirty-memory clamping and background writeout 354 * Work out the current dirty-memory clamping and background writeout
341 * thresholds. 355 * thresholds.
342 * 356 *
343 * The main aim here is to lower them aggressively if there is a lot of mapped 357 * The main aim here is to lower them aggressively if there is a lot of mapped
344 * memory around, to avoid stressing page reclaim with lots of unreclaimable 358 * memory around, to avoid stressing page reclaim with lots of unreclaimable
345 * pages. It is better to clamp down on writers than to start swapping and 359 * pages. It is better to clamp down on writers than to start swapping and
346 * performing lots of scanning. 360 * performing lots of scanning.
347 * 361 *
348 * We only allow 1/2 of the currently-unmapped memory to be dirtied. 362 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
349 * 363 *
350 * We don't permit the clamping level to fall below 5% - that is getting rather 364 * We don't permit the clamping level to fall below 5% - that is getting rather
351 * excessive. 365 * excessive.
352 * 366 *
353 * We make sure that the background writeout level is below the adjusted 367 * We make sure that the background writeout level is below the adjusted
354 * clamping level. 368 * clamping level.
355 */ 369 */
356 370
357 static unsigned long highmem_dirtyable_memory(unsigned long total) 371 static unsigned long highmem_dirtyable_memory(unsigned long total)
358 { 372 {
359 #ifdef CONFIG_HIGHMEM 373 #ifdef CONFIG_HIGHMEM
360 int node; 374 int node;
361 unsigned long x = 0; 375 unsigned long x = 0;
362 376
363 for_each_node_state(node, N_HIGH_MEMORY) { 377 for_each_node_state(node, N_HIGH_MEMORY) {
364 struct zone *z = 378 struct zone *z =
365 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 379 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
366 380
367 x += zone_page_state(z, NR_FREE_PAGES) + 381 x += zone_page_state(z, NR_FREE_PAGES) +
368 zone_reclaimable_pages(z); 382 zone_reclaimable_pages(z);
369 } 383 }
370 /* 384 /*
371 * Make sure that the number of highmem pages is never larger 385 * Make sure that the number of highmem pages is never larger
372 * than the total amount of dirtyable memory. This can only 386 * than the total amount of dirtyable memory. This can only
373 * occur in very strange VM situations but we want to make sure 387 * occur in very strange VM situations but we want to make sure
374 * that this does not occur. 388 * that this does not occur.
375 */ 389 */
376 return min(x, total); 390 return min(x, total);
377 #else 391 #else
378 return 0; 392 return 0;
379 #endif 393 #endif
380 } 394 }
381 395
382 /** 396 /**
383 * determine_dirtyable_memory - amount of memory that may be used 397 * determine_dirtyable_memory - amount of memory that may be used
384 * 398 *
385 * Returns the number of pages that can currently be freed and used 399 * Returns the number of pages that can currently be freed and used
386 * by the kernel for direct mappings. 400 * by the kernel for direct mappings.
387 */ 401 */
388 unsigned long determine_dirtyable_memory(void) 402 unsigned long determine_dirtyable_memory(void)
389 { 403 {
390 unsigned long x; 404 unsigned long x;
391 405
392 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); 406 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
393 407
394 if (!vm_highmem_is_dirtyable) 408 if (!vm_highmem_is_dirtyable)
395 x -= highmem_dirtyable_memory(x); 409 x -= highmem_dirtyable_memory(x);
396 410
397 return x + 1; /* Ensure that we never return 0 */ 411 return x + 1; /* Ensure that we never return 0 */
398 } 412 }
399 413
414 static unsigned long hard_dirty_limit(unsigned long thresh)
415 {
416 return max(thresh, global_dirty_limit);
417 }
418
400 /* 419 /*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 421 *
403 * Calculate the dirty thresholds based on sysctl parameters 422 * Calculate the dirty thresholds based on sysctl parameters
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes 423 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes 424 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and 425 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * real-time tasks. 426 * real-time tasks.
408 */ 427 */
409 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 428 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410 { 429 {
411 unsigned long background; 430 unsigned long background;
412 unsigned long dirty; 431 unsigned long dirty;
413 unsigned long uninitialized_var(available_memory); 432 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 433 struct task_struct *tsk;
415 434
416 if (!vm_dirty_bytes || !dirty_background_bytes) 435 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory(); 436 available_memory = determine_dirtyable_memory();
418 437
419 if (vm_dirty_bytes) 438 if (vm_dirty_bytes)
420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 439 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
421 else 440 else
422 dirty = (vm_dirty_ratio * available_memory) / 100; 441 dirty = (vm_dirty_ratio * available_memory) / 100;
423 442
424 if (dirty_background_bytes) 443 if (dirty_background_bytes)
425 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); 444 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
426 else 445 else
427 background = (dirty_background_ratio * available_memory) / 100; 446 background = (dirty_background_ratio * available_memory) / 100;
428 447
429 if (background >= dirty) 448 if (background >= dirty)
430 background = dirty / 2; 449 background = dirty / 2;
431 tsk = current; 450 tsk = current;
432 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 451 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
433 background += background / 4; 452 background += background / 4;
434 dirty += dirty / 4; 453 dirty += dirty / 4;
435 } 454 }
436 *pbackground = background; 455 *pbackground = background;
437 *pdirty = dirty; 456 *pdirty = dirty;
457 trace_global_dirty_state(background, dirty);
438 } 458 }
439 459
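
For reference, a minimal userspace sketch of the threshold arithmetic in global_dirty_limits() above, assuming hypothetical sysctl values and a made-up amount of dirtyable memory; the less_throttle flag stands in for PF_LESS_THROTTLE / rt_task():

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Hypothetical sysctl settings; zero *_bytes means "use the ratio". */
static unsigned long vm_dirty_bytes;
static int vm_dirty_ratio = 20;
static unsigned long dirty_background_bytes;
static int dirty_background_ratio = 10;

static void sketch_global_dirty_limits(unsigned long available_memory,
                                       int less_throttle,
                                       unsigned long *pbackground,
                                       unsigned long *pdirty)
{
        unsigned long dirty, background;

        if (vm_dirty_bytes)
                dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
        else
                dirty = vm_dirty_ratio * available_memory / 100;

        if (dirty_background_bytes)
                background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
        else
                background = dirty_background_ratio * available_memory / 100;

        if (background >= dirty)
                background = dirty / 2;
        if (less_throttle) {            /* PF_LESS_THROTTLE or rt task */
                background += background / 4;
                dirty += dirty / 4;
        }
        *pbackground = background;
        *pdirty = dirty;
}

int main(void)
{
        unsigned long bg, dirty;

        /* e.g. 1 GiB of dirtyable memory = 262144 pages */
        sketch_global_dirty_limits(262144, 0, &bg, &dirty);
        printf("background = %lu pages, dirty = %lu pages\n", bg, dirty);
        return 0;
}
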
440 /* 460 /**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
462 * @bdi: the backing_dev_info to query
463 * @dirty: global dirty limit in pages
442 * 464 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 465 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
466 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
467 * The "limit" in the name is not treated as a strict hard limit by
468 * balance_dirty_pages().
469 *
470 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 471 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 473 *
447 * The bdi's share of dirty limit will be adapting to its throughput and 474 * The bdi's share of dirty limit will be adapting to its throughput and
448 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. 475 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
449 */ 476 */
450 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) 477 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
451 { 478 {
452 u64 bdi_dirty; 479 u64 bdi_dirty;
453 long numerator, denominator; 480 long numerator, denominator;
454 481
455 /* 482 /*
456 * Calculate this BDI's share of the dirty ratio. 483 * Calculate this BDI's share of the dirty ratio.
457 */ 484 */
458 bdi_writeout_fraction(bdi, &numerator, &denominator); 485 bdi_writeout_fraction(bdi, &numerator, &denominator);
459 486
460 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; 487 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
461 bdi_dirty *= numerator; 488 bdi_dirty *= numerator;
462 do_div(bdi_dirty, denominator); 489 do_div(bdi_dirty, denominator);
463 490
464 bdi_dirty += (dirty * bdi->min_ratio) / 100; 491 bdi_dirty += (dirty * bdi->min_ratio) / 100;
465 if (bdi_dirty > (dirty * bdi->max_ratio) / 100) 492 if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
466 bdi_dirty = dirty * bdi->max_ratio / 100; 493 bdi_dirty = dirty * bdi->max_ratio / 100;
467 494
468 return bdi_dirty; 495 return bdi_dirty;
469 } 496 }
470 497
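
A userspace sketch of the share computation in bdi_dirty_limit(), with a made-up writeout fraction; bdi_min_ratio_total stands in for the global sum of registered min_ratio values, and min_ratio/max_ratio for the per-bdi bounds (all numbers hypothetical):

#include <stdio.h>

static unsigned long sketch_bdi_dirty_limit(unsigned long dirty,
                                            long numerator, long denominator,
                                            unsigned int bdi_min_ratio_total,
                                            unsigned int min_ratio,
                                            unsigned int max_ratio)
{
        unsigned long long bdi_dirty;

        /* share of the portion not reserved via min_ratio ... */
        bdi_dirty = dirty * (100 - bdi_min_ratio_total) / 100;
        bdi_dirty = bdi_dirty * numerator / denominator;
        /* ... plus this bdi's own reserved minimum, bounded by max_ratio */
        bdi_dirty += dirty * min_ratio / 100;
        if (bdi_dirty > dirty * max_ratio / 100)
                bdi_dirty = dirty * max_ratio / 100;
        return (unsigned long)bdi_dirty;
}

int main(void)
{
        /* global limit 100000 pages, this bdi did 3 of 10 recent writeouts */
        printf("bdi limit = %lu pages\n",
               sketch_bdi_dirty_limit(100000, 3, 10, 0, 0, 100));
        return 0;
}
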
498 static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed,
500 unsigned long written)
501 {
502 const unsigned long period = roundup_pow_of_two(3 * HZ);
503 unsigned long avg = bdi->avg_write_bandwidth;
504 unsigned long old = bdi->write_bandwidth;
505 u64 bw;
506
507 /*
508 * bw = written * HZ / elapsed
509 *
510 * bw * elapsed + write_bandwidth * (period - elapsed)
511 * write_bandwidth = ---------------------------------------------------
512 * period
513 */
514 bw = written - bdi->written_stamp;
515 bw *= HZ;
516 if (unlikely(elapsed > period)) {
517 do_div(bw, elapsed);
518 avg = bw;
519 goto out;
520 }
521 bw += (u64)bdi->write_bandwidth * (period - elapsed);
522 bw >>= ilog2(period);
523
524 /*
525 * one more level of smoothing, for filtering out sudden spikes
526 */
527 if (avg > old && old >= (unsigned long)bw)
528 avg -= (avg - old) >> 3;
529
530 if (avg < old && old <= (unsigned long)bw)
531 avg += (old - avg) >> 3;
532
533 out:
534 bdi->write_bandwidth = bw;
535 bdi->avg_write_bandwidth = avg;
536 }
537
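
A worked example of the moving-average formula quoted in bdi_update_write_bandwidth() above, as a self-contained userspace sketch; the page counts, HZ and the previous estimate are illustrative only, and the right shift by ilog2(period) is written as a division by the power-of-two period:

#include <stdio.h>

#define HZ 1000UL

int main(void)
{
        const unsigned long period = 4096;      /* ~ roundup_pow_of_two(3 * HZ) */
        unsigned long write_bandwidth = 25000;  /* previous estimate, pages/s   */
        unsigned long elapsed = 200;            /* jiffies since last update    */
        unsigned long written = 6000;           /* pages completed meanwhile    */
        unsigned long long bw;

        /* bw * elapsed == written * HZ, per the formula in the comment */
        bw = (unsigned long long)written * HZ;
        bw += (unsigned long long)write_bandwidth * (period - elapsed);
        bw /= period;

        printf("instantaneous %lu pages/s, smoothed %llu pages/s\n",
               written * HZ / elapsed, bw);
        return 0;
}
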
471 /* 538 /*
539 * The global dirtyable memory and dirty threshold could be suddenly knocked
540 * down by a large amount (eg. on the startup of KVM in a swapless system).
541 * This may throw the system into deep dirty exceeded state and throttle
542 * heavy/light dirtiers alike. To retain good responsiveness, make
543 * global_dirty_limit track the knocked-down dirty threshold slowly
544 * instead of dropping to it in one step.
545 */
546 static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
547 {
548 unsigned long limit = global_dirty_limit;
549
550 /*
551 * Follow up in one step.
552 */
553 if (limit < thresh) {
554 limit = thresh;
555 goto update;
556 }
557
558 /*
559 * Follow down slowly. Use the higher one as the target, because thresh
560 * may drop below dirty. This is exactly the reason to introduce
561 * global_dirty_limit which is guaranteed to lie above the dirty pages.
562 */
563 thresh = max(thresh, dirty);
564 if (limit > thresh) {
565 limit -= (limit - thresh) >> 5;
566 goto update;
567 }
568 return;
569 update:
570 global_dirty_limit = limit;
571 }
572
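
The follow-up/follow-down behaviour of update_dirty_limit() can be seen in a small userspace sketch (illustrative numbers only): when the threshold is knocked down from 100000 to 20000 pages, the tracked limit decays towards it by roughly 1/32 per update instead of jumping:

#include <stdio.h>

static unsigned long limit = 100000;    /* stands in for global_dirty_limit */

static void sketch_update_dirty_limit(unsigned long thresh, unsigned long dirty)
{
        if (limit < thresh) {
                limit = thresh;                 /* follow up in one step */
                return;
        }
        thresh = thresh > dirty ? thresh : dirty;
        if (limit > thresh)
                limit -= (limit - thresh) >> 5; /* follow down slowly */
}

int main(void)
{
        int i;

        for (i = 0; i < 5; i++) {
                sketch_update_dirty_limit(20000, 15000);
                printf("after update %d: limit = %lu\n", i + 1, limit);
        }
        return 0;
}
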
573 static void global_update_bandwidth(unsigned long thresh,
574 unsigned long dirty,
575 unsigned long now)
576 {
577 static DEFINE_SPINLOCK(dirty_lock);
578 static unsigned long update_time;
579
580 /*
581 * check locklessly first to avoid taking the lock most of the time
582 */
583 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
584 return;
585
586 spin_lock(&dirty_lock);
587 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
588 update_dirty_limit(thresh, dirty);
589 update_time = now;
590 }
591 spin_unlock(&dirty_lock);
592 }
593
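
global_update_bandwidth() uses a common check-locklessly-then-recheck-under-lock pattern. A minimal userspace analogue, with a pthread mutex instead of a spinlock and wall-clock seconds instead of jiffies (compile with -pthread; purely illustrative):

#include <stdio.h>
#include <pthread.h>
#include <time.h>

static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;
static time_t update_time;
#define BANDWIDTH_INTERVAL 1    /* one second, instead of 200ms of jiffies */

static void sketch_global_update(void)
{
        time_t now = time(NULL);

        /* cheap lockless test first: most callers bail out here */
        if (now < update_time + BANDWIDTH_INTERVAL)
                return;

        pthread_mutex_lock(&dirty_lock);
        /* recheck: another thread may have updated while we waited */
        if (now >= update_time + BANDWIDTH_INTERVAL) {
                /* update_dirty_limit(...) would go here */
                update_time = now;
                puts("updated dirty limit");
        }
        pthread_mutex_unlock(&dirty_lock);
}

int main(void)
{
        sketch_global_update();
        sketch_global_update();         /* second call is rate-limited */
        return 0;
}
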
594 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh,
596 unsigned long dirty,
597 unsigned long bdi_thresh,
598 unsigned long bdi_dirty,
599 unsigned long start_time)
600 {
601 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp;
603 unsigned long written;
604
605 /*
606 * rate-limit, only update once every 200ms.
607 */
608 if (elapsed < BANDWIDTH_INTERVAL)
609 return;
610
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612
613 /*
614 * Skip quiet periods when disk bandwidth is under-utilized.
615 * (at least 1s idle time between two flusher runs)
616 */
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot;
619
620 if (thresh)
621 global_update_bandwidth(thresh, dirty, now);
622
623 bdi_update_write_bandwidth(bdi, elapsed, written);
624
625 snapshot:
626 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now;
628 }
629
630 static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh,
632 unsigned long dirty,
633 unsigned long bdi_thresh,
634 unsigned long bdi_dirty,
635 unsigned long start_time)
636 {
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return;
639 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
641 start_time);
642 spin_unlock(&bdi->wb.list_lock);
643 }
644
645 /*
472 * balance_dirty_pages() must be called by processes which are generating dirty 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 647 * data. It looks at the number of dirty pages in the machine and will force
474 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 648 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
475 * If we're over `background_thresh' then the writeback threads are woken to 649 * If we're over `background_thresh' then the writeback threads are woken to
476 * perform some writeout. 650 * perform some writeout.
477 */ 651 */
478 static void balance_dirty_pages(struct address_space *mapping, 652 static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 653 unsigned long write_chunk)
480 { 654 {
481 long nr_reclaimable, bdi_nr_reclaimable; 655 unsigned long nr_reclaimable, bdi_nr_reclaimable;
482 long nr_writeback, bdi_nr_writeback; 656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty;
483 unsigned long background_thresh; 658 unsigned long background_thresh;
484 unsigned long dirty_thresh; 659 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 660 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh;
662 unsigned long min_task_bdi_thresh;
486 unsigned long pages_written = 0; 663 unsigned long pages_written = 0;
487 unsigned long pause = 1; 664 unsigned long pause = 1;
488 bool dirty_exceeded = false; 665 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies;
490 669
491 for (;;) { 670 for (;;) {
492 struct writeback_control wbc = {
493 .sync_mode = WB_SYNC_NONE,
494 .older_than_this = NULL,
495 .nr_to_write = write_chunk,
496 .range_cyclic = 1,
497 };
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 672 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674
503 global_dirty_limits(&background_thresh, &dirty_thresh); 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676
505 /* 677 /*
506 * Throttle it only when the background writeback cannot 678 * Throttle it only when the background writeback cannot
507 * catch-up. This avoids (excessively) small writeouts 679 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 680 * when the bdi limits are ramping up.
509 */ 681 */
510 if (nr_reclaimable + nr_writeback <= 682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
511 (background_thresh + dirty_thresh) / 2)
512 break; 683 break;
513 684
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh); 686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688
517 /* 689 /*
518 * In order to avoid the stacked BDI deadlock we need 690 * In order to avoid the stacked BDI deadlock we need
519 * to ensure we accurately count the 'dirty' pages when 691 * to ensure we accurately count the 'dirty' pages when
520 * the threshold is low. 692 * the threshold is low.
521 * 693 *
522 * Otherwise it would be possible to get thresh+n pages 694 * Otherwise it would be possible to get thresh+n pages
523 * reported dirty, even though there are thresh-m pages 695 * reported dirty, even though there are thresh-m pages
524 * actually dirty; with m+n sitting in the percpu 696 * actually dirty; with m+n sitting in the percpu
525 * deltas. 697 * deltas.
526 */ 698 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 701 bdi_dirty = bdi_nr_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 703 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 705 bdi_dirty = bdi_nr_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK);
533 } 707 }
534 708
535 /* 709 /*
536 * The bdi thresh is a somewhat "soft" limit derived from the 710 * The bdi thresh is a somewhat "soft" limit derived from the
537 * global "hard" limit. The former helps prevent a heavy-IO 711 * global "hard" limit. The former helps prevent a heavy-IO
538 * bdi or process from holding back light ones; the latter is 712 * bdi or process from holding back light ones; the latter is
539 * the last-resort safeguard. 713 * the last-resort safeguard.
540 */ 714 */
541 dirty_exceeded = 715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) 716 (nr_dirty > dirty_thresh);
543 || (nr_reclaimable + nr_writeback > dirty_thresh); 717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
718 (nr_dirty <= dirty_thresh);
544 719
545 if (!dirty_exceeded) 720 if (!dirty_exceeded)
546 break; 721 break;
547 722
548 if (!bdi->dirty_exceeded) 723 if (!bdi->dirty_exceeded)
549 bdi->dirty_exceeded = 1; 724 bdi->dirty_exceeded = 1;
550 725
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
727 bdi_thresh, bdi_dirty, start_time);
728
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 * Unstable writes are a feature of certain networked 730 * Unstable writes are a feature of certain networked
553 * filesystems (i.e. NFS) in which data may have been 731 * filesystems (i.e. NFS) in which data may have been
554 * written to the server's write cache, but has not yet 732 * written to the server's write cache, but has not yet
555 * been flushed to permanent storage. 733 * been flushed to permanent storage.
556 * Only move pages to writeback if this bdi is over its 734 * Only move pages to writeback if this bdi is over its
557 * threshold otherwise wait until the disk writes catch 735 * threshold otherwise wait until the disk writes catch
558 * up. 736 * up.
559 */ 737 */
560 trace_wbc_balance_dirty_start(&wbc, bdi); 738 trace_balance_dirty_start(bdi);
561 if (bdi_nr_reclaimable > bdi_thresh) { 739 if (bdi_nr_reclaimable > task_bdi_thresh) {
562 writeback_inodes_wb(&bdi->wb, &wbc); 740 pages_written += writeback_inodes_wb(&bdi->wb,
563 pages_written += write_chunk - wbc.nr_to_write; 741 write_chunk);
564 trace_wbc_balance_dirty_written(&wbc, bdi); 742 trace_balance_dirty_written(bdi, pages_written);
565 if (pages_written >= write_chunk) 743 if (pages_written >= write_chunk)
566 break; /* We've done our duty */ 744 break; /* We've done our duty */
567 } 745 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 747 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
571 749
750 dirty_thresh = hard_dirty_limit(dirty_thresh);
572 /* 751 /*
752 * max-pause area. If dirty exceeded but still within this
753 * area, no need to sleep for more than 200ms: (a) 8 pages per
754 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive.
756 */
757 if (nr_dirty < dirty_thresh +
758 dirty_thresh / DIRTY_MAXPAUSE_AREA &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
761 /*
762 * pass-good area. When some bdi gets blocked (eg. NFS server
763 * not responding), or write bandwidth dropped dramatically due
764 * to concurrent reads, or dirty threshold suddenly dropped and
765 * the dirty pages cannot be brought down anytime soon (eg. on
766 * slow USB stick), at least let go of the good bdi's.
767 */
768 if (nr_dirty < dirty_thresh +
769 dirty_thresh / DIRTY_PASSGOOD_AREA &&
770 bdi_dirty < bdi_thresh)
771 break;
772
773 /*
573 * Increase the delay for each loop, up to our previous 774 * Increase the delay for each loop, up to our previous
574 * default of taking a 100ms nap. 775 * default of taking a 100ms nap.
575 */ 776 */
576 pause <<= 1; 777 pause <<= 1;
577 if (pause > HZ / 10) 778 if (pause > HZ / 10)
578 pause = HZ / 10; 779 pause = HZ / 10;
579 } 780 }
580 781
581 if (!dirty_exceeded && bdi->dirty_exceeded) 782 /* Clear dirty_exceeded flag only when no task can exceed the limit */
783 if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 784 bdi->dirty_exceeded = 0;
583 785
584 if (writeback_in_progress(bdi)) 786 if (writeback_in_progress(bdi))
585 return; 787 return;
586 788
587 /* 789 /*
588 * In laptop mode, we wait until hitting the higher threshold before 790 * In laptop mode, we wait until hitting the higher threshold before
589 * starting background writeout, and then write out all the way down 791 * starting background writeout, and then write out all the way down
590 * to the lower threshold. So slow writers cause minimal disk activity. 792 * to the lower threshold. So slow writers cause minimal disk activity.
591 * 793 *
592 * In normal mode, we start background writeout at the lower 794 * In normal mode, we start background writeout at the lower
593 * background_thresh, to keep the amount of dirty memory low. 795 * background_thresh, to keep the amount of dirty memory low.
594 */ 796 */
595 if ((laptop_mode && pages_written) || 797 if ((laptop_mode && pages_written) ||
596 (!laptop_mode && (nr_reclaimable > background_thresh))) 798 (!laptop_mode && (nr_reclaimable > background_thresh)))
597 bdi_start_background_writeback(bdi); 799 bdi_start_background_writeback(bdi);
598 } 800 }
599 801
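
The throttling loop above sleeps with an exponentially growing pause, starting at one jiffy and capped at HZ/10; the max-pause and pass-good break conditions are omitted here. A minimal userspace sketch of just the backoff:

#include <stdio.h>

#define HZ 1000UL

int main(void)
{
        unsigned long pause = 1;
        int loop;

        for (loop = 0; loop < 10; loop++) {
                printf("loop %d: sleep %lu jiffies\n", loop, pause);
                /* io_schedule_timeout(pause) in the kernel */
                pause <<= 1;
                if (pause > HZ / 10)
                        pause = HZ / 10;
        }
        return 0;
}
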
600 void set_page_dirty_balance(struct page *page, int page_mkwrite) 802 void set_page_dirty_balance(struct page *page, int page_mkwrite)
601 { 803 {
602 if (set_page_dirty(page) || page_mkwrite) { 804 if (set_page_dirty(page) || page_mkwrite) {
603 struct address_space *mapping = page_mapping(page); 805 struct address_space *mapping = page_mapping(page);
604 806
605 if (mapping) 807 if (mapping)
606 balance_dirty_pages_ratelimited(mapping); 808 balance_dirty_pages_ratelimited(mapping);
607 } 809 }
608 } 810 }
609 811
610 static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; 812 static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
611 813
612 /** 814 /**
613 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 815 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
614 * @mapping: address_space which was dirtied 816 * @mapping: address_space which was dirtied
615 * @nr_pages_dirtied: number of pages which the caller has just dirtied 817 * @nr_pages_dirtied: number of pages which the caller has just dirtied
616 * 818 *
617 * Processes which are dirtying memory should call in here once for each page 819 * Processes which are dirtying memory should call in here once for each page
618 * which was newly dirtied. The function will periodically check the system's 820 * which was newly dirtied. The function will periodically check the system's
619 * dirty state and will initiate writeback if needed. 821 * dirty state and will initiate writeback if needed.
620 * 822 *
621 * On really big machines, get_writeback_state is expensive, so try to avoid 823 * On really big machines, get_writeback_state is expensive, so try to avoid
622 * calling it too often (ratelimiting). But once we're over the dirty memory 824 * calling it too often (ratelimiting). But once we're over the dirty memory
623 * limit we decrease the ratelimiting by a lot, to prevent individual processes 825 * limit we decrease the ratelimiting by a lot, to prevent individual processes
624 * from overshooting the limit by (ratelimit_pages) each. 826 * from overshooting the limit by (ratelimit_pages) each.
625 */ 827 */
626 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 828 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 829 unsigned long nr_pages_dirtied)
628 { 830 {
831 struct backing_dev_info *bdi = mapping->backing_dev_info;
629 unsigned long ratelimit; 832 unsigned long ratelimit;
630 unsigned long *p; 833 unsigned long *p;
631 834
835 if (!bdi_cap_account_dirty(bdi))
836 return;
837
632 ratelimit = ratelimit_pages; 838 ratelimit = ratelimit_pages;
633 if (mapping->backing_dev_info->dirty_exceeded) 839 if (mapping->backing_dev_info->dirty_exceeded)
634 ratelimit = 8; 840 ratelimit = 8;
635 841
636 /* 842 /*
637 * Check the rate limiting. Also, we do not want to throttle real-time 843 * Check the rate limiting. Also, we do not want to throttle real-time
638 * tasks in balance_dirty_pages(). Period. 844 * tasks in balance_dirty_pages(). Period.
639 */ 845 */
640 preempt_disable(); 846 preempt_disable();
641 p = &__get_cpu_var(bdp_ratelimits); 847 p = &__get_cpu_var(bdp_ratelimits);
642 *p += nr_pages_dirtied; 848 *p += nr_pages_dirtied;
643 if (unlikely(*p >= ratelimit)) { 849 if (unlikely(*p >= ratelimit)) {
644 ratelimit = sync_writeback_pages(*p); 850 ratelimit = sync_writeback_pages(*p);
645 *p = 0; 851 *p = 0;
646 preempt_enable(); 852 preempt_enable();
647 balance_dirty_pages(mapping, ratelimit); 853 balance_dirty_pages(mapping, ratelimit);
648 return; 854 return;
649 } 855 }
650 preempt_enable(); 856 preempt_enable();
651 } 857 }
652 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 858 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
653 859
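
The ratelimiting idea above can be sketched in userspace with a single counter standing in for the per-CPU bdp_ratelimits (names and numbers are hypothetical; the real kernel derives the write chunk via sync_writeback_pages() rather than using the raw count):

#include <stdio.h>

static unsigned long bdp_counter;
static unsigned long ratelimit_pages = 1024;

static void sketch_balance(unsigned long write_chunk)
{
        printf("balance_dirty_pages(chunk=%lu)\n", write_chunk);
}

static void sketch_ratelimited(unsigned long nr_pages_dirtied, int dirty_exceeded)
{
        /* over the limit: drop the ratelimit so offenders are caught early */
        unsigned long ratelimit = dirty_exceeded ? 8 : ratelimit_pages;

        bdp_counter += nr_pages_dirtied;
        if (bdp_counter >= ratelimit) {
                unsigned long chunk = bdp_counter;

                bdp_counter = 0;
                sketch_balance(chunk);
        }
}

int main(void)
{
        int i;

        for (i = 0; i < 5000; i++)
                sketch_ratelimited(1, 0);       /* dirty one page at a time */
        return 0;
}
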
654 void throttle_vm_writeout(gfp_t gfp_mask) 860 void throttle_vm_writeout(gfp_t gfp_mask)
655 { 861 {
656 unsigned long background_thresh; 862 unsigned long background_thresh;
657 unsigned long dirty_thresh; 863 unsigned long dirty_thresh;
658 864
659 for ( ; ; ) { 865 for ( ; ; ) {
660 global_dirty_limits(&background_thresh, &dirty_thresh); 866 global_dirty_limits(&background_thresh, &dirty_thresh);
661 867
662 /* 868 /*
663 * Boost the allowable dirty threshold a bit for page 869 * Boost the allowable dirty threshold a bit for page
664 * allocators so they don't get DoS'ed by heavy writers 870 * allocators so they don't get DoS'ed by heavy writers
665 */ 871 */
666 dirty_thresh += dirty_thresh / 10; /* wheeee... */ 872 dirty_thresh += dirty_thresh / 10; /* wheeee... */
667 873
668 if (global_page_state(NR_UNSTABLE_NFS) + 874 if (global_page_state(NR_UNSTABLE_NFS) +
669 global_page_state(NR_WRITEBACK) <= dirty_thresh) 875 global_page_state(NR_WRITEBACK) <= dirty_thresh)
670 break; 876 break;
671 congestion_wait(BLK_RW_ASYNC, HZ/10); 877 congestion_wait(BLK_RW_ASYNC, HZ/10);
672 878
673 /* 879 /*
674 * The caller might hold locks which can prevent IO completion 880 * The caller might hold locks which can prevent IO completion
675 * or progress in the filesystem. So we cannot just sit here 881 * or progress in the filesystem. So we cannot just sit here
676 * waiting for IO to complete. 882 * waiting for IO to complete.
677 */ 883 */
678 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) 884 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
679 break; 885 break;
680 } 886 }
681 } 887 }
682 888
683 /* 889 /*
684 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 890 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
685 */ 891 */
686 int dirty_writeback_centisecs_handler(ctl_table *table, int write, 892 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
687 void __user *buffer, size_t *length, loff_t *ppos) 893 void __user *buffer, size_t *length, loff_t *ppos)
688 { 894 {
689 proc_dointvec(table, write, buffer, length, ppos); 895 proc_dointvec(table, write, buffer, length, ppos);
690 bdi_arm_supers_timer(); 896 bdi_arm_supers_timer();
691 return 0; 897 return 0;
692 } 898 }
693 899
694 #ifdef CONFIG_BLOCK 900 #ifdef CONFIG_BLOCK
695 void laptop_mode_timer_fn(unsigned long data) 901 void laptop_mode_timer_fn(unsigned long data)
696 { 902 {
697 struct request_queue *q = (struct request_queue *)data; 903 struct request_queue *q = (struct request_queue *)data;
698 int nr_pages = global_page_state(NR_FILE_DIRTY) + 904 int nr_pages = global_page_state(NR_FILE_DIRTY) +
699 global_page_state(NR_UNSTABLE_NFS); 905 global_page_state(NR_UNSTABLE_NFS);
700 906
701 /* 907 /*
702 * We want to write everything out, not just down to the dirty 908 * We want to write everything out, not just down to the dirty
703 * threshold 909 * threshold
704 */ 910 */
705 if (bdi_has_dirty_io(&q->backing_dev_info)) 911 if (bdi_has_dirty_io(&q->backing_dev_info))
706 bdi_start_writeback(&q->backing_dev_info, nr_pages); 912 bdi_start_writeback(&q->backing_dev_info, nr_pages);
707 } 913 }
708 914
709 /* 915 /*
710 * We've spun up the disk and we're in laptop mode: schedule writeback 916 * We've spun up the disk and we're in laptop mode: schedule writeback
711 * of all dirty data a few seconds from now. If the flush is already scheduled 917 * of all dirty data a few seconds from now. If the flush is already scheduled
712 * then push it back - the user is still using the disk. 918 * then push it back - the user is still using the disk.
713 */ 919 */
714 void laptop_io_completion(struct backing_dev_info *info) 920 void laptop_io_completion(struct backing_dev_info *info)
715 { 921 {
716 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); 922 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
717 } 923 }
718 924
719 /* 925 /*
720 * We're in laptop mode and we've just synced. The sync's writes will have 926 * We're in laptop mode and we've just synced. The sync's writes will have
721 * caused another writeback to be scheduled by laptop_io_completion. 927 * caused another writeback to be scheduled by laptop_io_completion.
722 * Nothing needs to be written back anymore, so we unschedule the writeback. 928 * Nothing needs to be written back anymore, so we unschedule the writeback.
723 */ 929 */
724 void laptop_sync_completion(void) 930 void laptop_sync_completion(void)
725 { 931 {
726 struct backing_dev_info *bdi; 932 struct backing_dev_info *bdi;
727 933
728 rcu_read_lock(); 934 rcu_read_lock();
729 935
730 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) 936 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
731 del_timer(&bdi->laptop_mode_wb_timer); 937 del_timer(&bdi->laptop_mode_wb_timer);
732 938
733 rcu_read_unlock(); 939 rcu_read_unlock();
734 } 940 }
735 #endif 941 #endif
736 942
737 /* 943 /*
738 * If ratelimit_pages is too high then we can get into dirty-data overload 944 * If ratelimit_pages is too high then we can get into dirty-data overload
739 * if a large number of processes all perform writes at the same time. 945 * if a large number of processes all perform writes at the same time.
740 * If it is too low then SMP machines will call the (expensive) 946 * If it is too low then SMP machines will call the (expensive)
741 * get_writeback_state too often. 947 * get_writeback_state too often.
742 * 948 *
743 * Here we set ratelimit_pages to a level which ensures that when all CPUs are 949 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
744 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory 950 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
745 * thresholds before writeback cuts in. 951 * thresholds before writeback cuts in.
746 * 952 *
747 * But the limit should not be set too high. Because it also controls the 953 * But the limit should not be set too high. Because it also controls the
748 * amount of memory which the balance_dirty_pages() caller has to write back. 954 * amount of memory which the balance_dirty_pages() caller has to write back.
749 * If this is too large then the caller will block on the IO queue all the 955 * If this is too large then the caller will block on the IO queue all the
750 * time. So limit it to four megabytes - the balance_dirty_pages() caller 956 * time. So limit it to four megabytes - the balance_dirty_pages() caller
751 * will write six megabyte chunks, max. 957 * will write six megabyte chunks, max.
752 */ 958 */
753 959
754 void writeback_set_ratelimit(void) 960 void writeback_set_ratelimit(void)
755 { 961 {
756 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); 962 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
757 if (ratelimit_pages < 16) 963 if (ratelimit_pages < 16)
758 ratelimit_pages = 16; 964 ratelimit_pages = 16;
759 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) 965 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
760 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; 966 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
761 } 967 }
762 968
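
For a concrete feel of the sizing above, a userspace sketch of writeback_set_ratelimit() for a hypothetical 4 GiB / 8-CPU machine, with the same [16 pages, 4 MB] clamp:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL

static unsigned long sketch_ratelimit(unsigned long vm_total_pages,
                                      unsigned int num_online_cpus)
{
        unsigned long ratelimit_pages;

        ratelimit_pages = vm_total_pages / (num_online_cpus * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
        if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
                ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
        return ratelimit_pages;
}

int main(void)
{
        /* 4 GiB of RAM (1048576 pages), 8 CPUs -> clamped to 1024 pages */
        printf("ratelimit_pages = %lu\n", sketch_ratelimit(1048576, 8));
        return 0;
}
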
763 static int __cpuinit 969 static int __cpuinit
764 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) 970 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
765 { 971 {
766 writeback_set_ratelimit(); 972 writeback_set_ratelimit();
767 return NOTIFY_DONE; 973 return NOTIFY_DONE;
768 } 974 }
769 975
770 static struct notifier_block __cpuinitdata ratelimit_nb = { 976 static struct notifier_block __cpuinitdata ratelimit_nb = {
771 .notifier_call = ratelimit_handler, 977 .notifier_call = ratelimit_handler,
772 .next = NULL, 978 .next = NULL,
773 }; 979 };
774 980
775 /* 981 /*
776 * Called early on to tune the page writeback dirty limits. 982 * Called early on to tune the page writeback dirty limits.
777 * 983 *
778 * We used to scale dirty pages according to how total memory 984 * We used to scale dirty pages according to how total memory
779 * related to pages that could be allocated for buffers (by 985 * related to pages that could be allocated for buffers (by
780 * comparing nr_free_buffer_pages() to vm_total_pages). 986 * comparing nr_free_buffer_pages() to vm_total_pages).
781 * 987 *
782 * However, that was when we used "dirty_ratio" to scale with 988 * However, that was when we used "dirty_ratio" to scale with
783 * all memory, and we don't do that any more. "dirty_ratio" 989 * all memory, and we don't do that any more. "dirty_ratio"
784 * is now applied to total non-HIGHPAGE memory (by subtracting 990 * is now applied to total non-HIGHPAGE memory (by subtracting
785 * totalhigh_pages from vm_total_pages), and as such we can't 991 * totalhigh_pages from vm_total_pages), and as such we can't
786 * get into the old insane situation any more where we had 992 * get into the old insane situation any more where we had
787 * large amounts of dirty pages compared to a small amount of 993 * large amounts of dirty pages compared to a small amount of
788 * non-HIGHMEM memory. 994 * non-HIGHMEM memory.
789 * 995 *
790 * But we might still want to scale the dirty_ratio by how 996 * But we might still want to scale the dirty_ratio by how
791 * much memory the box has.. 997 * much memory the box has..
792 */ 998 */
793 void __init page_writeback_init(void) 999 void __init page_writeback_init(void)
794 { 1000 {
795 int shift; 1001 int shift;
796 1002
797 writeback_set_ratelimit(); 1003 writeback_set_ratelimit();
798 register_cpu_notifier(&ratelimit_nb); 1004 register_cpu_notifier(&ratelimit_nb);
799 1005
800 shift = calc_period_shift(); 1006 shift = calc_period_shift();
801 prop_descriptor_init(&vm_completions, shift); 1007 prop_descriptor_init(&vm_completions, shift);
802 prop_descriptor_init(&vm_dirties, shift); 1008 prop_descriptor_init(&vm_dirties, shift);
803 } 1009 }
804 1010
805 /** 1011 /**
806 * tag_pages_for_writeback - tag pages to be written by write_cache_pages 1012 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
807 * @mapping: address space structure to write 1013 * @mapping: address space structure to write
808 * @start: starting page index 1014 * @start: starting page index
809 * @end: ending page index (inclusive) 1015 * @end: ending page index (inclusive)
810 * 1016 *
811 * This function scans the page range from @start to @end (inclusive) and tags 1017 * This function scans the page range from @start to @end (inclusive) and tags
812 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is 1018 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
813 * that write_cache_pages (or whoever calls this function) will then use 1019 * that write_cache_pages (or whoever calls this function) will then use
814 * TOWRITE tag to identify pages eligible for writeback. This mechanism is 1020 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
815 * used to avoid livelocking of writeback by a process steadily creating new 1021 * used to avoid livelocking of writeback by a process steadily creating new
816 * dirty pages in the file (thus it is important for this function to be quick 1022 * dirty pages in the file (thus it is important for this function to be quick
817 * so that it can tag pages faster than a dirtying process can create them). 1023 * so that it can tag pages faster than a dirtying process can create them).
818 */ 1024 */
819 /* 1025 /*
820 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. 1026 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
821 */ 1027 */
822 void tag_pages_for_writeback(struct address_space *mapping, 1028 void tag_pages_for_writeback(struct address_space *mapping,
823 pgoff_t start, pgoff_t end) 1029 pgoff_t start, pgoff_t end)
824 { 1030 {
825 #define WRITEBACK_TAG_BATCH 4096 1031 #define WRITEBACK_TAG_BATCH 4096
826 unsigned long tagged; 1032 unsigned long tagged;
827 1033
828 do { 1034 do {
829 spin_lock_irq(&mapping->tree_lock); 1035 spin_lock_irq(&mapping->tree_lock);
830 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, 1036 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
831 &start, end, WRITEBACK_TAG_BATCH, 1037 &start, end, WRITEBACK_TAG_BATCH,
832 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); 1038 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
833 spin_unlock_irq(&mapping->tree_lock); 1039 spin_unlock_irq(&mapping->tree_lock);
834 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); 1040 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
835 cond_resched(); 1041 cond_resched();
836 /* We check 'start' to handle wrapping when end == ~0UL */ 1042 /* We check 'start' to handle wrapping when end == ~0UL */
837 } while (tagged >= WRITEBACK_TAG_BATCH && start); 1043 } while (tagged >= WRITEBACK_TAG_BATCH && start);
838 } 1044 }
839 EXPORT_SYMBOL(tag_pages_for_writeback); 1045 EXPORT_SYMBOL(tag_pages_for_writeback);
840 1046
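
The batching pattern in tag_pages_for_writeback(), shown as a userspace sketch that simply counts items instead of tagging radix-tree entries; the lock/unlock and cond_resched() points are marked as comments, and the range is made up:

#include <stdio.h>

#define BATCH 4096UL

int main(void)
{
        unsigned long start = 0, end = 10000;
        unsigned long tagged;

        do {
                unsigned long n = end - start + 1;

                /* spin_lock_irq(&mapping->tree_lock) in the kernel */
                tagged = n < BATCH ? n : BATCH;
                start += tagged;
                /* spin_unlock_irq(...); cond_resched(); */
                printf("tagged %lu items, next start %lu\n", tagged, start);
        } while (tagged >= BATCH && start && start <= end);

        return 0;
}
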
841 /** 1047 /**
842 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 1048 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
843 * @mapping: address space structure to write 1049 * @mapping: address space structure to write
844 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 1050 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
845 * @writepage: function called for each page 1051 * @writepage: function called for each page
846 * @data: data passed to writepage function 1052 * @data: data passed to writepage function
847 * 1053 *
848 * If a page is already under I/O, write_cache_pages() skips it, even 1054 * If a page is already under I/O, write_cache_pages() skips it, even
849 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 1055 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
850 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 1056 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
851 * and msync() need to guarantee that all the data which was dirty at the time 1057 * and msync() need to guarantee that all the data which was dirty at the time
852 * the call was made get new I/O started against them. If wbc->sync_mode is 1058 * the call was made get new I/O started against them. If wbc->sync_mode is
853 * WB_SYNC_ALL then we were called for data integrity and we must wait for 1059 * WB_SYNC_ALL then we were called for data integrity and we must wait for
854 * existing IO to complete. 1060 * existing IO to complete.
855 * 1061 *
856 * To avoid livelocks (when other process dirties new pages), we first tag 1062 * To avoid livelocks (when other process dirties new pages), we first tag
857 * pages which should be written back with TOWRITE tag and only then start 1063 * pages which should be written back with TOWRITE tag and only then start
858 * writing them. For data-integrity sync we have to be careful so that we do 1064 * writing them. For data-integrity sync we have to be careful so that we do
859 * not miss some pages (e.g., because some other process has cleared TOWRITE 1065 * not miss some pages (e.g., because some other process has cleared TOWRITE
860 * tag we set). The rule we follow is that TOWRITE tag can be cleared only 1066 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
861 * by the process clearing the DIRTY tag (and submitting the page for IO). 1067 * by the process clearing the DIRTY tag (and submitting the page for IO).
862 */ 1068 */
863 int write_cache_pages(struct address_space *mapping, 1069 int write_cache_pages(struct address_space *mapping,
864 struct writeback_control *wbc, writepage_t writepage, 1070 struct writeback_control *wbc, writepage_t writepage,
865 void *data) 1071 void *data)
866 { 1072 {
867 int ret = 0; 1073 int ret = 0;
868 int done = 0; 1074 int done = 0;
869 struct pagevec pvec; 1075 struct pagevec pvec;
870 int nr_pages; 1076 int nr_pages;
871 pgoff_t uninitialized_var(writeback_index); 1077 pgoff_t uninitialized_var(writeback_index);
872 pgoff_t index; 1078 pgoff_t index;
873 pgoff_t end; /* Inclusive */ 1079 pgoff_t end; /* Inclusive */
874 pgoff_t done_index; 1080 pgoff_t done_index;
875 int cycled; 1081 int cycled;
876 int range_whole = 0; 1082 int range_whole = 0;
877 int tag; 1083 int tag;
878 1084
879 pagevec_init(&pvec, 0); 1085 pagevec_init(&pvec, 0);
880 if (wbc->range_cyclic) { 1086 if (wbc->range_cyclic) {
881 writeback_index = mapping->writeback_index; /* prev offset */ 1087 writeback_index = mapping->writeback_index; /* prev offset */
882 index = writeback_index; 1088 index = writeback_index;
883 if (index == 0) 1089 if (index == 0)
884 cycled = 1; 1090 cycled = 1;
885 else 1091 else
886 cycled = 0; 1092 cycled = 0;
887 end = -1; 1093 end = -1;
888 } else { 1094 } else {
889 index = wbc->range_start >> PAGE_CACHE_SHIFT; 1095 index = wbc->range_start >> PAGE_CACHE_SHIFT;
890 end = wbc->range_end >> PAGE_CACHE_SHIFT; 1096 end = wbc->range_end >> PAGE_CACHE_SHIFT;
891 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 1097 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
892 range_whole = 1; 1098 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1099 cycled = 1; /* ignore range_cyclic tests */
894 } 1100 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1101 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1102 tag = PAGECACHE_TAG_TOWRITE;
897 else 1103 else
898 tag = PAGECACHE_TAG_DIRTY; 1104 tag = PAGECACHE_TAG_DIRTY;
899 retry: 1105 retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1106 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1107 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1108 done_index = index;
903 while (!done && (index <= end)) { 1109 while (!done && (index <= end)) {
904 int i; 1110 int i;
905 1111
906 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 1112 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
907 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 1113 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
908 if (nr_pages == 0) 1114 if (nr_pages == 0)
909 break; 1115 break;
910 1116
911 for (i = 0; i < nr_pages; i++) { 1117 for (i = 0; i < nr_pages; i++) {
912 struct page *page = pvec.pages[i]; 1118 struct page *page = pvec.pages[i];
913 1119
914 /* 1120 /*
915 * At this point, the page may be truncated or 1121 * At this point, the page may be truncated or
916 * invalidated (changing page->mapping to NULL), or 1122 * invalidated (changing page->mapping to NULL), or
917 * even swizzled back from swapper_space to tmpfs file 1123 * even swizzled back from swapper_space to tmpfs file
918 * mapping. However, page->index will not change 1124 * mapping. However, page->index will not change
919 * because we have a reference on the page. 1125 * because we have a reference on the page.
920 */ 1126 */
921 if (page->index > end) { 1127 if (page->index > end) {
922 /* 1128 /*
923 * can't be range_cyclic (1st pass) because 1129 * can't be range_cyclic (1st pass) because
924 * end == -1 in that case. 1130 * end == -1 in that case.
925 */ 1131 */
926 done = 1; 1132 done = 1;
927 break; 1133 break;
928 } 1134 }
929 1135
930 done_index = page->index; 1136 done_index = page->index;
931 1137
932 lock_page(page); 1138 lock_page(page);
933 1139
934 /* 1140 /*
935 * Page truncated or invalidated. We can freely skip it 1141 * Page truncated or invalidated. We can freely skip it
936 * then, even for data integrity operations: the page 1142 * then, even for data integrity operations: the page
937 * has disappeared concurrently, so there could be no 1143 * has disappeared concurrently, so there could be no
938 * real expectation of this data integrity operation 1144 * real expectation of this data integrity operation
939 * even if there is now a new, dirty page at the same 1145 * even if there is now a new, dirty page at the same
940 * pagecache address. 1146 * pagecache address.
941 */ 1147 */
942 if (unlikely(page->mapping != mapping)) { 1148 if (unlikely(page->mapping != mapping)) {
943 continue_unlock: 1149 continue_unlock:
944 unlock_page(page); 1150 unlock_page(page);
945 continue; 1151 continue;
946 } 1152 }
947 1153
948 if (!PageDirty(page)) { 1154 if (!PageDirty(page)) {
949 /* someone wrote it for us */ 1155 /* someone wrote it for us */
950 goto continue_unlock; 1156 goto continue_unlock;
951 } 1157 }
952 1158
953 if (PageWriteback(page)) { 1159 if (PageWriteback(page)) {
954 if (wbc->sync_mode != WB_SYNC_NONE) 1160 if (wbc->sync_mode != WB_SYNC_NONE)
955 wait_on_page_writeback(page); 1161 wait_on_page_writeback(page);
956 else 1162 else
957 goto continue_unlock; 1163 goto continue_unlock;
958 } 1164 }
959 1165
960 BUG_ON(PageWriteback(page)); 1166 BUG_ON(PageWriteback(page));
961 if (!clear_page_dirty_for_io(page)) 1167 if (!clear_page_dirty_for_io(page))
962 goto continue_unlock; 1168 goto continue_unlock;
963 1169
964 trace_wbc_writepage(wbc, mapping->backing_dev_info); 1170 trace_wbc_writepage(wbc, mapping->backing_dev_info);
965 ret = (*writepage)(page, wbc, data); 1171 ret = (*writepage)(page, wbc, data);
966 if (unlikely(ret)) { 1172 if (unlikely(ret)) {
967 if (ret == AOP_WRITEPAGE_ACTIVATE) { 1173 if (ret == AOP_WRITEPAGE_ACTIVATE) {
968 unlock_page(page); 1174 unlock_page(page);
969 ret = 0; 1175 ret = 0;
970 } else { 1176 } else {
971 /* 1177 /*
972 * done_index is set past this page, 1178 * done_index is set past this page,
973 * so media errors will not choke 1179 * so media errors will not choke
974 * background writeout for the entire 1180 * background writeout for the entire
975 * file. This has consequences for 1181 * file. This has consequences for
976 * range_cyclic semantics (ie. it may 1182 * range_cyclic semantics (ie. it may
977 * not be suitable for data integrity 1183 * not be suitable for data integrity
978 * writeout). 1184 * writeout).
979 */ 1185 */
980 done_index = page->index + 1; 1186 done_index = page->index + 1;
981 done = 1; 1187 done = 1;
982 break; 1188 break;
983 } 1189 }
984 } 1190 }
985 1191
986 /* 1192 /*
987 * We stop writing back only if we are not doing 1193 * We stop writing back only if we are not doing
988 * integrity sync. In case of integrity sync we have to 1194 * integrity sync. In case of integrity sync we have to
989 * keep going until we have written all the pages 1195 * keep going until we have written all the pages
990 * we tagged for writeback prior to entering this loop. 1196 * we tagged for writeback prior to entering this loop.
991 */ 1197 */
992 if (--wbc->nr_to_write <= 0 && 1198 if (--wbc->nr_to_write <= 0 &&
993 wbc->sync_mode == WB_SYNC_NONE) { 1199 wbc->sync_mode == WB_SYNC_NONE) {
994 done = 1; 1200 done = 1;
995 break; 1201 break;
996 } 1202 }
997 } 1203 }
998 pagevec_release(&pvec); 1204 pagevec_release(&pvec);
999 cond_resched(); 1205 cond_resched();
1000 } 1206 }
1001 if (!cycled && !done) { 1207 if (!cycled && !done) {
1002 /* 1208 /*
1003 * range_cyclic: 1209 * range_cyclic:
1004 * We hit the last page and there is more work to be done: wrap 1210 * We hit the last page and there is more work to be done: wrap
1005 * back to the start of the file 1211 * back to the start of the file
1006 */ 1212 */
1007 cycled = 1; 1213 cycled = 1;
1008 index = 0; 1214 index = 0;
1009 end = writeback_index - 1; 1215 end = writeback_index - 1;
1010 goto retry; 1216 goto retry;
1011 } 1217 }
1012 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 1218 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
1013 mapping->writeback_index = done_index; 1219 mapping->writeback_index = done_index;
1014 1220
1015 return ret; 1221 return ret;
1016 } 1222 }
1017 EXPORT_SYMBOL(write_cache_pages); 1223 EXPORT_SYMBOL(write_cache_pages);
1018 1224
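
A userspace sketch of the range_cyclic handling in write_cache_pages(): writeback resumes at the remembered writeback_index, runs to the end of the file, then wraps around once to cover the skipped head of the file (page indices and the file size are made up):

#include <stdio.h>

static void writeback_range(unsigned long index, unsigned long end)
{
        printf("write pages %lu..%lu\n", index, end);
}

int main(void)
{
        unsigned long writeback_index = 700;    /* previous stopping point */
        unsigned long nr_pages = 1000;          /* hypothetical file size   */
        unsigned long index = writeback_index;
        unsigned long end = nr_pages - 1;
        int cycled = (index == 0);

        writeback_range(index, end);
        if (!cycled) {
                /* wrap back to the start of the file */
                writeback_range(0, writeback_index - 1);
        }
        return 0;
}
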
1019 /* 1225 /*
1020 * Function used by generic_writepages to call the real writepage 1226 * Function used by generic_writepages to call the real writepage
1021 * function and set the mapping flags on error 1227 * function and set the mapping flags on error
1022 */ 1228 */
1023 static int __writepage(struct page *page, struct writeback_control *wbc, 1229 static int __writepage(struct page *page, struct writeback_control *wbc,
1024 void *data) 1230 void *data)
1025 { 1231 {
1026 struct address_space *mapping = data; 1232 struct address_space *mapping = data;
1027 int ret = mapping->a_ops->writepage(page, wbc); 1233 int ret = mapping->a_ops->writepage(page, wbc);
1028 mapping_set_error(mapping, ret); 1234 mapping_set_error(mapping, ret);
1029 return ret; 1235 return ret;
1030 } 1236 }
1031 1237
1032 /** 1238 /**
1033 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. 1239 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
1034 * @mapping: address space structure to write 1240 * @mapping: address space structure to write
1035 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 1241 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1036 * 1242 *
1037 * This is a library function, which implements the writepages() 1243 * This is a library function, which implements the writepages()
1038 * address_space_operation. 1244 * address_space_operation.
1039 */ 1245 */
1040 int generic_writepages(struct address_space *mapping, 1246 int generic_writepages(struct address_space *mapping,
1041 struct writeback_control *wbc) 1247 struct writeback_control *wbc)
1042 { 1248 {
1043 struct blk_plug plug; 1249 struct blk_plug plug;
1044 int ret; 1250 int ret;
1045 1251
1046 /* deal with chardevs and other special file */ 1252 /* deal with chardevs and other special file */
1047 if (!mapping->a_ops->writepage) 1253 if (!mapping->a_ops->writepage)
1048 return 0; 1254 return 0;
1049 1255
1050 blk_start_plug(&plug); 1256 blk_start_plug(&plug);
1051 ret = write_cache_pages(mapping, wbc, __writepage, mapping); 1257 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
1052 blk_finish_plug(&plug); 1258 blk_finish_plug(&plug);
1053 return ret; 1259 return ret;
1054 } 1260 }
1055 1261
1056 EXPORT_SYMBOL(generic_writepages); 1262 EXPORT_SYMBOL(generic_writepages);
1057 1263
1058 int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 1264 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
1059 { 1265 {
1060 int ret; 1266 int ret;
1061 1267
1062 if (wbc->nr_to_write <= 0) 1268 if (wbc->nr_to_write <= 0)
1063 return 0; 1269 return 0;
1064 if (mapping->a_ops->writepages) 1270 if (mapping->a_ops->writepages)
1065 ret = mapping->a_ops->writepages(mapping, wbc); 1271 ret = mapping->a_ops->writepages(mapping, wbc);
1066 else 1272 else
1067 ret = generic_writepages(mapping, wbc); 1273 ret = generic_writepages(mapping, wbc);
1068 return ret; 1274 return ret;
1069 } 1275 }
1070 1276
1071 /** 1277 /**
1072 * write_one_page - write out a single page and optionally wait on I/O 1278 * write_one_page - write out a single page and optionally wait on I/O
1073 * @page: the page to write 1279 * @page: the page to write
1074 * @wait: if true, wait on writeout 1280 * @wait: if true, wait on writeout
1075 * 1281 *
1076 * The page must be locked by the caller and will be unlocked upon return. 1282 * The page must be locked by the caller and will be unlocked upon return.
1077 * 1283 *
1078 * write_one_page() returns a negative error code if I/O failed. 1284 * write_one_page() returns a negative error code if I/O failed.
1079 */ 1285 */
1080 int write_one_page(struct page *page, int wait) 1286 int write_one_page(struct page *page, int wait)
1081 { 1287 {
1082 struct address_space *mapping = page->mapping; 1288 struct address_space *mapping = page->mapping;
1083 int ret = 0; 1289 int ret = 0;
1084 struct writeback_control wbc = { 1290 struct writeback_control wbc = {
1085 .sync_mode = WB_SYNC_ALL, 1291 .sync_mode = WB_SYNC_ALL,
1086 .nr_to_write = 1, 1292 .nr_to_write = 1,
1087 }; 1293 };
1088 1294
1089 BUG_ON(!PageLocked(page)); 1295 BUG_ON(!PageLocked(page));
1090 1296
1091 if (wait) 1297 if (wait)
1092 wait_on_page_writeback(page); 1298 wait_on_page_writeback(page);
1093 1299
1094 if (clear_page_dirty_for_io(page)) { 1300 if (clear_page_dirty_for_io(page)) {
1095 page_cache_get(page); 1301 page_cache_get(page);
1096 ret = mapping->a_ops->writepage(page, &wbc); 1302 ret = mapping->a_ops->writepage(page, &wbc);
1097 if (ret == 0 && wait) { 1303 if (ret == 0 && wait) {
1098 wait_on_page_writeback(page); 1304 wait_on_page_writeback(page);
1099 if (PageError(page)) 1305 if (PageError(page))
1100 ret = -EIO; 1306 ret = -EIO;
1101 } 1307 }
1102 page_cache_release(page); 1308 page_cache_release(page);
1103 } else { 1309 } else {
1104 unlock_page(page); 1310 unlock_page(page);
1105 } 1311 }
1106 return ret; 1312 return ret;
1107 } 1313 }
1108 EXPORT_SYMBOL(write_one_page); 1314 EXPORT_SYMBOL(write_one_page);
1109 1315
1110 /* 1316 /*
1111 * For address_spaces which do not use buffers nor write back. 1317 * For address_spaces which do not use buffers nor write back.
1112 */ 1318 */
1113 int __set_page_dirty_no_writeback(struct page *page) 1319 int __set_page_dirty_no_writeback(struct page *page)
1114 { 1320 {
1115 if (!PageDirty(page)) 1321 if (!PageDirty(page))
1116 return !TestSetPageDirty(page); 1322 return !TestSetPageDirty(page);
1117 return 0; 1323 return 0;
1118 } 1324 }
1119 1325
1120 /* 1326 /*
1121 * Helper function for set_page_dirty family. 1327 * Helper function for set_page_dirty family.
1122 * NOTE: This relies on being atomic wrt interrupts. 1328 * NOTE: This relies on being atomic wrt interrupts.
1123 */ 1329 */
1124 void account_page_dirtied(struct page *page, struct address_space *mapping) 1330 void account_page_dirtied(struct page *page, struct address_space *mapping)
1125 { 1331 {
1126 if (mapping_cap_account_dirty(mapping)) { 1332 if (mapping_cap_account_dirty(mapping)) {
1127 __inc_zone_page_state(page, NR_FILE_DIRTY); 1333 __inc_zone_page_state(page, NR_FILE_DIRTY);
1128 __inc_zone_page_state(page, NR_DIRTIED); 1334 __inc_zone_page_state(page, NR_DIRTIED);
1129 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1335 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1130 task_dirty_inc(current); 1336 task_dirty_inc(current);
1131 task_io_account_write(PAGE_CACHE_SIZE); 1337 task_io_account_write(PAGE_CACHE_SIZE);
1132 } 1338 }
1133 } 1339 }
1134 EXPORT_SYMBOL(account_page_dirtied); 1340 EXPORT_SYMBOL(account_page_dirtied);
1135 1341
1136 /* 1342 /*
1137 * Helper function for set_page_writeback family. 1343 * Helper function for set_page_writeback family.
1138 * NOTE: Unlike account_page_dirtied this does not rely on being atomic 1344 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1139 * wrt interrupts. 1345 * wrt interrupts.
1140 */ 1346 */
1141 void account_page_writeback(struct page *page) 1347 void account_page_writeback(struct page *page)
1142 { 1348 {
1143 inc_zone_page_state(page, NR_WRITEBACK); 1349 inc_zone_page_state(page, NR_WRITEBACK);
1144 } 1350 }
1145 EXPORT_SYMBOL(account_page_writeback); 1351 EXPORT_SYMBOL(account_page_writeback);
1146 1352
1147 /* 1353 /*
1148 * For address_spaces which do not use buffers. Just tag the page as dirty in 1354 * For address_spaces which do not use buffers. Just tag the page as dirty in
1149 * its radix tree. 1355 * its radix tree.
1150 * 1356 *
1151 * This is also used when a single buffer is being dirtied: we want to set the 1357 * This is also used when a single buffer is being dirtied: we want to set the
1152 * page dirty in that case, but not all the buffers. This is a "bottom-up" 1358 * page dirty in that case, but not all the buffers. This is a "bottom-up"
1153 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. 1359 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
1154 * 1360 *
1155 * Most callers have locked the page, which pins the address_space in memory. 1361 * Most callers have locked the page, which pins the address_space in memory.
1156 * But zap_pte_range() does not lock the page, however in that case the 1362 * But zap_pte_range() does not lock the page, however in that case the
1157 * mapping is pinned by the vma's ->vm_file reference. 1363 * mapping is pinned by the vma's ->vm_file reference.
1158 * 1364 *
1159 * We take care to handle the case where the page was truncated from the 1365 * We take care to handle the case where the page was truncated from the
1160 * mapping by re-checking page_mapping() inside tree_lock. 1366 * mapping by re-checking page_mapping() inside tree_lock.
1161 */ 1367 */
1162 int __set_page_dirty_nobuffers(struct page *page) 1368 int __set_page_dirty_nobuffers(struct page *page)
1163 { 1369 {
1164 if (!TestSetPageDirty(page)) { 1370 if (!TestSetPageDirty(page)) {
1165 struct address_space *mapping = page_mapping(page); 1371 struct address_space *mapping = page_mapping(page);
1166 struct address_space *mapping2; 1372 struct address_space *mapping2;
1167 1373
1168 if (!mapping) 1374 if (!mapping)
1169 return 1; 1375 return 1;
1170 1376
1171 spin_lock_irq(&mapping->tree_lock); 1377 spin_lock_irq(&mapping->tree_lock);
1172 mapping2 = page_mapping(page); 1378 mapping2 = page_mapping(page);
1173 if (mapping2) { /* Race with truncate? */ 1379 if (mapping2) { /* Race with truncate? */
1174 BUG_ON(mapping2 != mapping); 1380 BUG_ON(mapping2 != mapping);
1175 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 1381 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
1176 account_page_dirtied(page, mapping); 1382 account_page_dirtied(page, mapping);
1177 radix_tree_tag_set(&mapping->page_tree, 1383 radix_tree_tag_set(&mapping->page_tree,
1178 page_index(page), PAGECACHE_TAG_DIRTY); 1384 page_index(page), PAGECACHE_TAG_DIRTY);
1179 } 1385 }
1180 spin_unlock_irq(&mapping->tree_lock); 1386 spin_unlock_irq(&mapping->tree_lock);
1181 if (mapping->host) { 1387 if (mapping->host) {
1182 /* !PageAnon && !swapper_space */ 1388 /* !PageAnon && !swapper_space */
1183 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1389 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1184 } 1390 }
1185 return 1; 1391 return 1;
1186 } 1392 }
1187 return 0; 1393 return 0;
1188 } 1394 }
1189 EXPORT_SYMBOL(__set_page_dirty_nobuffers); 1395 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
1190 1396
1191 /* 1397 /*
1192 * When a writepage implementation decides that it doesn't want to write this 1398 * When a writepage implementation decides that it doesn't want to write this
1193 * page for some reason, it should redirty the locked page via 1399 * page for some reason, it should redirty the locked page via
1194 * redirty_page_for_writepage() and it should then unlock the page and return 0 1400 * redirty_page_for_writepage() and it should then unlock the page and return 0
1195 */ 1401 */
1196 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) 1402 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1197 { 1403 {
1198 wbc->pages_skipped++; 1404 wbc->pages_skipped++;
1199 return __set_page_dirty_nobuffers(page); 1405 return __set_page_dirty_nobuffers(page);
1200 } 1406 }
1201 EXPORT_SYMBOL(redirty_page_for_writepage); 1407 EXPORT_SYMBOL(redirty_page_for_writepage);
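
The comment above spells out the calling convention; here is a minimal ->writepage sketch that follows it, assuming a hypothetical examplefs_writepage and using sync_mode only as a stand-in for a real "cannot write this page right now" test:

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    /* Hypothetical ->writepage that declines to write and redirties instead. */
    static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
    {
            if (wbc->sync_mode == WB_SYNC_NONE) {
                    /* keep the page dirty and record that it was skipped */
                    redirty_page_for_writepage(wbc, page);
                    unlock_page(page);
                    return 0;
            }
            /* ... real writeout omitted ... */
            unlock_page(page);
            return 0;
    }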
1202 1408
1203 /* 1409 /*
1204 * Dirty a page. 1410 * Dirty a page.
1205 * 1411 *
1206 * For pages with a mapping this should be done under the page lock 1412 * For pages with a mapping this should be done under the page lock
1207 * for the benefit of asynchronous memory-error handling, which prefers a 1413 * for the benefit of asynchronous memory-error handling, which prefers a
1208 * consistent dirty state. This rule can be broken in some special cases, 1414 * consistent dirty state. This rule can be broken in some special cases,
1209 * but it is better not to. 1415 * but it is better not to.
1210 * 1416 *
1211 * If the mapping doesn't provide a set_page_dirty a_op, then 1417 * If the mapping doesn't provide a set_page_dirty a_op, then
1212 * just fall through and assume that it wants buffer_heads. 1418 * just fall through and assume that it wants buffer_heads.
1213 */ 1419 */
1214 int set_page_dirty(struct page *page) 1420 int set_page_dirty(struct page *page)
1215 { 1421 {
1216 struct address_space *mapping = page_mapping(page); 1422 struct address_space *mapping = page_mapping(page);
1217 1423
1218 if (likely(mapping)) { 1424 if (likely(mapping)) {
1219 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 1425 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
1220 /* 1426 /*
1221 * readahead or lru_deactivate_page can leave PG_readahead/PG_reclaim 1427 * readahead or lru_deactivate_page can leave PG_readahead/PG_reclaim
1222 * set due to a race with end_page_writeback. 1428 * set due to a race with end_page_writeback.
1223 * For readahead: if the page is written, the flag is reset, so there 1429 * For readahead: if the page is written, the flag is reset, so there
1224 * is no problem. 1430 * is no problem.
1225 * For lru_deactivate_page: if the page is redirtied, the flag is 1431 * For lru_deactivate_page: if the page is redirtied, the flag is
1226 * reset, so there is no problem; but if the page is then used by 1432 * reset, so there is no problem; but if the page is then used by
1227 * readahead, the stale flag confuses readahead and makes it restart 1433 * readahead, the stale flag confuses readahead and makes it restart
1228 * its size ramp-up. That is only a minor problem. 1434 * its size ramp-up. That is only a minor problem.
1229 */ 1435 */
1230 ClearPageReclaim(page); 1436 ClearPageReclaim(page);
1231 #ifdef CONFIG_BLOCK 1437 #ifdef CONFIG_BLOCK
1232 if (!spd) 1438 if (!spd)
1233 spd = __set_page_dirty_buffers; 1439 spd = __set_page_dirty_buffers;
1234 #endif 1440 #endif
1235 return (*spd)(page); 1441 return (*spd)(page);
1236 } 1442 }
1237 if (!PageDirty(page)) { 1443 if (!PageDirty(page)) {
1238 if (!TestSetPageDirty(page)) 1444 if (!TestSetPageDirty(page))
1239 return 1; 1445 return 1;
1240 } 1446 }
1241 return 0; 1447 return 0;
1242 } 1448 }
1243 EXPORT_SYMBOL(set_page_dirty); 1449 EXPORT_SYMBOL(set_page_dirty);
1244 1450
1245 /* 1451 /*
1246 * set_page_dirty() is racy if the caller has no reference against 1452 * set_page_dirty() is racy if the caller has no reference against
1247 * page->mapping->host, and if the page is unlocked. This is because another 1453 * page->mapping->host, and if the page is unlocked. This is because another
1248 * CPU could truncate the page off the mapping and then free the mapping. 1454 * CPU could truncate the page off the mapping and then free the mapping.
1249 * 1455 *
1250 * Usually, the page _is_ locked, or the caller is a user-space process which 1456 * Usually, the page _is_ locked, or the caller is a user-space process which
1251 * holds a reference on the inode by having an open file. 1457 * holds a reference on the inode by having an open file.
1252 * 1458 *
1253 * In other cases, the page should be locked before running set_page_dirty(). 1459 * In other cases, the page should be locked before running set_page_dirty().
1254 */ 1460 */
1255 int set_page_dirty_lock(struct page *page) 1461 int set_page_dirty_lock(struct page *page)
1256 { 1462 {
1257 int ret; 1463 int ret;
1258 1464
1259 lock_page(page); 1465 lock_page(page);
1260 ret = set_page_dirty(page); 1466 ret = set_page_dirty(page);
1261 unlock_page(page); 1467 unlock_page(page);
1262 return ret; 1468 return ret;
1263 } 1469 }
1264 EXPORT_SYMBOL(set_page_dirty_lock); 1470 EXPORT_SYMBOL(set_page_dirty_lock);
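
set_page_dirty_lock() is the variant for callers that dirtied pages they do not hold locked, e.g. pages pinned with get_user_pages() and then written by DMA or by the kernel. A sketch of that release pattern (the helper name is invented, error handling omitted):

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /*
     * Hypothetical cleanup helper: pages were pinned with get_user_pages()
     * and possibly written to; they are not locked here, so
     * set_page_dirty_lock() takes and drops the page lock itself.
     */
    static void example_release_user_pages(struct page **pages, int nr, int dirtied)
    {
            int i;

            for (i = 0; i < nr; i++) {
                    if (dirtied)
                            set_page_dirty_lock(pages[i]);
                    page_cache_release(pages[i]);
            }
    }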
1265 1471
1266 /* 1472 /*
1267 * Clear a page's dirty flag, while caring for dirty memory accounting. 1473 * Clear a page's dirty flag, while caring for dirty memory accounting.
1268 * Returns true if the page was previously dirty. 1474 * Returns true if the page was previously dirty.
1269 * 1475 *
1270 * This is for preparing to put the page under writeout. We leave the page 1476 * This is for preparing to put the page under writeout. We leave the page
1271 * tagged as dirty in the radix tree so that a concurrent write-for-sync 1477 * tagged as dirty in the radix tree so that a concurrent write-for-sync
1272 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage 1478 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
1273 * implementation will run either set_page_writeback() or set_page_dirty(), 1479 * implementation will run either set_page_writeback() or set_page_dirty(),
1274 * at which stage we bring the page's dirty flag and radix-tree dirty tag 1480 * at which stage we bring the page's dirty flag and radix-tree dirty tag
1275 * back into sync. 1481 * back into sync.
1276 * 1482 *
1277 * This incoherency between the page's dirty flag and radix-tree tag is 1483 * This incoherency between the page's dirty flag and radix-tree tag is
1278 * unfortunate, but it only exists while the page is locked. 1484 * unfortunate, but it only exists while the page is locked.
1279 */ 1485 */
1280 int clear_page_dirty_for_io(struct page *page) 1486 int clear_page_dirty_for_io(struct page *page)
1281 { 1487 {
1282 struct address_space *mapping = page_mapping(page); 1488 struct address_space *mapping = page_mapping(page);
1283 1489
1284 BUG_ON(!PageLocked(page)); 1490 BUG_ON(!PageLocked(page));
1285 1491
1286 if (mapping && mapping_cap_account_dirty(mapping)) { 1492 if (mapping && mapping_cap_account_dirty(mapping)) {
1287 /* 1493 /*
1288 * Yes, Virginia, this is indeed insane. 1494 * Yes, Virginia, this is indeed insane.
1289 * 1495 *
1290 * We use this sequence to make sure that 1496 * We use this sequence to make sure that
1291 * (a) we account for dirty stats properly 1497 * (a) we account for dirty stats properly
1292 * (b) we tell the low-level filesystem to 1498 * (b) we tell the low-level filesystem to
1293 * mark the whole page dirty if it was 1499 * mark the whole page dirty if it was
1294 * dirty in a pagetable. Only to then 1500 * dirty in a pagetable. Only to then
1295 * (c) clean the page again and return 1 to 1501 * (c) clean the page again and return 1 to
1296 * cause the writeback. 1502 * cause the writeback.
1297 * 1503 *
1298 * This way we avoid all nasty races with the 1504 * This way we avoid all nasty races with the
1299 * dirty bit in multiple places and clearing 1505 * dirty bit in multiple places and clearing
1300 * them concurrently from different threads. 1506 * them concurrently from different threads.
1301 * 1507 *
1302 * Note! Normally the "set_page_dirty(page)" 1508 * Note! Normally the "set_page_dirty(page)"
1303 * has no effect on the actual dirty bit - since 1509 * has no effect on the actual dirty bit - since
1304 * that will already usually be set. But we 1510 * that will already usually be set. But we
1305 * need the side effects, and it can help us 1511 * need the side effects, and it can help us
1306 * avoid races. 1512 * avoid races.
1307 * 1513 *
1308 * We basically use the page "master dirty bit" 1514 * We basically use the page "master dirty bit"
1309 * as a serialization point for all the different 1515 * as a serialization point for all the different
1310 * threads doing their things. 1516 * threads doing their things.
1311 */ 1517 */
1312 if (page_mkclean(page)) 1518 if (page_mkclean(page))
1313 set_page_dirty(page); 1519 set_page_dirty(page);
1314 /* 1520 /*
1315 * We carefully synchronise fault handlers against 1521 * We carefully synchronise fault handlers against
1316 * installing a dirty pte and marking the page dirty 1522 * installing a dirty pte and marking the page dirty
1317 * at this point. We do this by having them hold the 1523 * at this point. We do this by having them hold the
1318 * page lock at some point after installing their 1524 * page lock at some point after installing their
1319 * pte, but before marking the page dirty. 1525 * pte, but before marking the page dirty.
1320 * Pages are always locked coming in here, so we get 1526 * Pages are always locked coming in here, so we get
1321 * the desired exclusion. See mm/memory.c:do_wp_page() 1527 * the desired exclusion. See mm/memory.c:do_wp_page()
1322 * for more comments. 1528 * for more comments.
1323 */ 1529 */
1324 if (TestClearPageDirty(page)) { 1530 if (TestClearPageDirty(page)) {
1325 dec_zone_page_state(page, NR_FILE_DIRTY); 1531 dec_zone_page_state(page, NR_FILE_DIRTY);
1326 dec_bdi_stat(mapping->backing_dev_info, 1532 dec_bdi_stat(mapping->backing_dev_info,
1327 BDI_RECLAIMABLE); 1533 BDI_RECLAIMABLE);
1328 return 1; 1534 return 1;
1329 } 1535 }
1330 return 0; 1536 return 0;
1331 } 1537 }
1332 return TestClearPageDirty(page); 1538 return TestClearPageDirty(page);
1333 } 1539 }
1334 EXPORT_SYMBOL(clear_page_dirty_for_io); 1540 EXPORT_SYMBOL(clear_page_dirty_for_io);
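
For context, the prepare-for-writeout sequence described above looks roughly like this from a writeback caller's point of view. This is an editor's simplification of the write_cache_pages()/write_one_page() pattern, not a drop-in implementation:

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    /* Sketch: push one locked page to ->writepage if it is still dirty. */
    static int example_write_one(struct address_space *mapping,
                                 struct page *page,
                                 struct writeback_control *wbc)
    {
            int ret = 0;

            BUG_ON(!PageLocked(page));

            if (clear_page_dirty_for_io(page))
                    /* a typical ->writepage() sets PG_writeback and unlocks the page */
                    ret = mapping->a_ops->writepage(page, wbc);
            else
                    unlock_page(page);

            return ret;
    }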
1335 1541
1336 int test_clear_page_writeback(struct page *page) 1542 int test_clear_page_writeback(struct page *page)
1337 { 1543 {
1338 struct address_space *mapping = page_mapping(page); 1544 struct address_space *mapping = page_mapping(page);
1339 int ret; 1545 int ret;
1340 1546
1341 if (mapping) { 1547 if (mapping) {
1342 struct backing_dev_info *bdi = mapping->backing_dev_info; 1548 struct backing_dev_info *bdi = mapping->backing_dev_info;
1343 unsigned long flags; 1549 unsigned long flags;
1344 1550
1345 spin_lock_irqsave(&mapping->tree_lock, flags); 1551 spin_lock_irqsave(&mapping->tree_lock, flags);
1346 ret = TestClearPageWriteback(page); 1552 ret = TestClearPageWriteback(page);
1347 if (ret) { 1553 if (ret) {
1348 radix_tree_tag_clear(&mapping->page_tree, 1554 radix_tree_tag_clear(&mapping->page_tree,
1349 page_index(page), 1555 page_index(page),
1350 PAGECACHE_TAG_WRITEBACK); 1556 PAGECACHE_TAG_WRITEBACK);
1351 if (bdi_cap_account_writeback(bdi)) { 1557 if (bdi_cap_account_writeback(bdi)) {
1352 __dec_bdi_stat(bdi, BDI_WRITEBACK); 1558 __dec_bdi_stat(bdi, BDI_WRITEBACK);
1353 __bdi_writeout_inc(bdi); 1559 __bdi_writeout_inc(bdi);
1354 } 1560 }
1355 } 1561 }
1356 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1562 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1357 } else { 1563 } else {
1358 ret = TestClearPageWriteback(page); 1564 ret = TestClearPageWriteback(page);
1359 } 1565 }
1360 if (ret) { 1566 if (ret) {
1361 dec_zone_page_state(page, NR_WRITEBACK); 1567 dec_zone_page_state(page, NR_WRITEBACK);
1362 inc_zone_page_state(page, NR_WRITTEN); 1568 inc_zone_page_state(page, NR_WRITTEN);
1363 } 1569 }
1364 return ret; 1570 return ret;
1365 } 1571 }
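
test_clear_page_writeback() is normally reached through end_page_writeback() on I/O completion; the write side normally goes through set_page_writeback(), which ends up in test_set_page_writeback() below. A compressed sketch of that window around one page (example_submit_io() is a hypothetical stand-in for real bio submission):

    #include <linux/mm.h>
    #include <linux/page-flags.h>
    #include <linux/pagemap.h>

    /* Hypothetical stand-in for asynchronous I/O submission. */
    static void example_submit_io(struct page *page) { }

    static void example_start_writeout(struct page *page)
    {
            set_page_writeback(page);       /* tags the radix tree, see below */
            unlock_page(page);
            example_submit_io(page);
    }

    static void example_io_done(struct page *page)
    {
            /* clears the tag, adjusts NR_WRITEBACK/NR_WRITTEN, wakes waiters */
            end_page_writeback(page);
    }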
1366 1572
1367 int test_set_page_writeback(struct page *page) 1573 int test_set_page_writeback(struct page *page)
1368 { 1574 {
1369 struct address_space *mapping = page_mapping(page); 1575 struct address_space *mapping = page_mapping(page);
1370 int ret; 1576 int ret;
1371 1577
1372 if (mapping) { 1578 if (mapping) {
1373 struct backing_dev_info *bdi = mapping->backing_dev_info; 1579 struct backing_dev_info *bdi = mapping->backing_dev_info;
1374 unsigned long flags; 1580 unsigned long flags;
1375 1581
1376 spin_lock_irqsave(&mapping->tree_lock, flags); 1582 spin_lock_irqsave(&mapping->tree_lock, flags);
1377 ret = TestSetPageWriteback(page); 1583 ret = TestSetPageWriteback(page);
1378 if (!ret) { 1584 if (!ret) {
1379 radix_tree_tag_set(&mapping->page_tree, 1585 radix_tree_tag_set(&mapping->page_tree,
1380 page_index(page), 1586 page_index(page),
1381 PAGECACHE_TAG_WRITEBACK); 1587 PAGECACHE_TAG_WRITEBACK);
1382 if (bdi_cap_account_writeback(bdi)) 1588 if (bdi_cap_account_writeback(bdi))
1383 __inc_bdi_stat(bdi, BDI_WRITEBACK); 1589 __inc_bdi_stat(bdi, BDI_WRITEBACK);
1384 } 1590 }
1385 if (!PageDirty(page)) 1591 if (!PageDirty(page))
1386 radix_tree_tag_clear(&mapping->page_tree, 1592 radix_tree_tag_clear(&mapping->page_tree,
1387 page_index(page), 1593 page_index(page),
1388 PAGECACHE_TAG_DIRTY); 1594 PAGECACHE_TAG_DIRTY);
1389 radix_tree_tag_clear(&mapping->page_tree, 1595 radix_tree_tag_clear(&mapping->page_tree,
1390 page_index(page), 1596 page_index(page),
1391 PAGECACHE_TAG_TOWRITE); 1597 PAGECACHE_TAG_TOWRITE);
1392 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1598 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1393 } else { 1599 } else {
1394 ret = TestSetPageWriteback(page); 1600 ret = TestSetPageWriteback(page);
1395 } 1601 }
1396 if (!ret) 1602 if (!ret)
1397 account_page_writeback(page); 1603 account_page_writeback(page);
1398 return ret; 1604 return ret;
1 /* 1 /*
2 * mm/rmap.c - physical to virtual reverse mappings 2 * mm/rmap.c - physical to virtual reverse mappings
3 * 3 *
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br> 4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL). 5 * Released under the General Public License (GPL).
6 * 6 *
7 * Simple, low overhead reverse mapping scheme. 7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible. 8 * Please try to keep this thing as modular as possible.
9 * 9 *
10 * Provides methods for unmapping each kind of mapped page: 10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and 11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode. 12 * the file methods track pages belonging to an inode.
13 * 13 *
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins 2003, 2004 17 * Contributions by Hugh Dickins 2003, 2004
18 */ 18 */
19 19
20 /* 20 /*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
27 * anon_vma->mutex 27 * anon_vma->mutex
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * swap_lock (in swap_duplicate, swap_info_get)
31 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
32 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
34 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_wb_list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
42 * pte map lock 42 * pte map lock
43 */ 43 */
44 44
45 #include <linux/mm.h> 45 #include <linux/mm.h>
46 #include <linux/pagemap.h> 46 #include <linux/pagemap.h>
47 #include <linux/swap.h> 47 #include <linux/swap.h>
48 #include <linux/swapops.h> 48 #include <linux/swapops.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 #include <linux/init.h> 50 #include <linux/init.h>
51 #include <linux/ksm.h> 51 #include <linux/ksm.h>
52 #include <linux/rmap.h> 52 #include <linux/rmap.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/module.h> 54 #include <linux/module.h>
55 #include <linux/memcontrol.h> 55 #include <linux/memcontrol.h>
56 #include <linux/mmu_notifier.h> 56 #include <linux/mmu_notifier.h>
57 #include <linux/migrate.h> 57 #include <linux/migrate.h>
58 #include <linux/hugetlb.h> 58 #include <linux/hugetlb.h>
59 59
60 #include <asm/tlbflush.h> 60 #include <asm/tlbflush.h>
61 61
62 #include "internal.h" 62 #include "internal.h"
63 63
64 static struct kmem_cache *anon_vma_cachep; 64 static struct kmem_cache *anon_vma_cachep;
65 static struct kmem_cache *anon_vma_chain_cachep; 65 static struct kmem_cache *anon_vma_chain_cachep;
66 66
67 static inline struct anon_vma *anon_vma_alloc(void) 67 static inline struct anon_vma *anon_vma_alloc(void)
68 { 68 {
69 struct anon_vma *anon_vma; 69 struct anon_vma *anon_vma;
70 70
71 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 71 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
72 if (anon_vma) { 72 if (anon_vma) {
73 atomic_set(&anon_vma->refcount, 1); 73 atomic_set(&anon_vma->refcount, 1);
74 /* 74 /*
75 * Initialise the anon_vma root to point to itself. If called 75 * Initialise the anon_vma root to point to itself. If called
76 * from fork, the root will be reset to the parent's anon_vma. 76 * from fork, the root will be reset to the parent's anon_vma.
77 */ 77 */
78 anon_vma->root = anon_vma; 78 anon_vma->root = anon_vma;
79 } 79 }
80 80
81 return anon_vma; 81 return anon_vma;
82 } 82 }
83 83
84 static inline void anon_vma_free(struct anon_vma *anon_vma) 84 static inline void anon_vma_free(struct anon_vma *anon_vma)
85 { 85 {
86 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 86 VM_BUG_ON(atomic_read(&anon_vma->refcount));
87 87
88 /* 88 /*
89 * Synchronize against page_lock_anon_vma() such that 89 * Synchronize against page_lock_anon_vma() such that
90 * we can safely hold the lock without the anon_vma getting 90 * we can safely hold the lock without the anon_vma getting
91 * freed. 91 * freed.
92 * 92 *
93 * Relies on the full mb implied by the atomic_dec_and_test() from 93 * Relies on the full mb implied by the atomic_dec_and_test() from
94 * put_anon_vma() against the acquire barrier implied by 94 * put_anon_vma() against the acquire barrier implied by
95 * mutex_trylock() from page_lock_anon_vma(). This orders: 95 * mutex_trylock() from page_lock_anon_vma(). This orders:
96 * 96 *
97 * page_lock_anon_vma() VS put_anon_vma() 97 * page_lock_anon_vma() VS put_anon_vma()
98 * mutex_trylock() atomic_dec_and_test() 98 * mutex_trylock() atomic_dec_and_test()
99 * LOCK MB 99 * LOCK MB
100 * atomic_read() mutex_is_locked() 100 * atomic_read() mutex_is_locked()
101 * 101 *
102 * LOCK should suffice since the actual taking of the lock must 102 * LOCK should suffice since the actual taking of the lock must
103 * happen _before_ what follows. 103 * happen _before_ what follows.
104 */ 104 */
105 if (mutex_is_locked(&anon_vma->root->mutex)) { 105 if (mutex_is_locked(&anon_vma->root->mutex)) {
106 anon_vma_lock(anon_vma); 106 anon_vma_lock(anon_vma);
107 anon_vma_unlock(anon_vma); 107 anon_vma_unlock(anon_vma);
108 } 108 }
109 109
110 kmem_cache_free(anon_vma_cachep, anon_vma); 110 kmem_cache_free(anon_vma_cachep, anon_vma);
111 } 111 }
112 112
113 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) 113 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
114 { 114 {
115 return kmem_cache_alloc(anon_vma_chain_cachep, gfp); 115 return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
116 } 116 }
117 117
118 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 118 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
119 { 119 {
120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
121 } 121 }
122 122
123 /** 123 /**
124 * anon_vma_prepare - attach an anon_vma to a memory region 124 * anon_vma_prepare - attach an anon_vma to a memory region
125 * @vma: the memory region in question 125 * @vma: the memory region in question
126 * 126 *
127 * This makes sure the memory mapping described by 'vma' has 127 * This makes sure the memory mapping described by 'vma' has
128 * an 'anon_vma' attached to it, so that we can associate the 128 * an 'anon_vma' attached to it, so that we can associate the
129 * anonymous pages mapped into it with that anon_vma. 129 * anonymous pages mapped into it with that anon_vma.
130 * 130 *
131 * The common case will be that we already have one, but if 131 * The common case will be that we already have one, but if
132 * not we either need to find an adjacent mapping that we 132 * not we either need to find an adjacent mapping that we
133 * can re-use the anon_vma from (very common when the only 133 * can re-use the anon_vma from (very common when the only
134 * reason for splitting a vma has been mprotect()), or we 134 * reason for splitting a vma has been mprotect()), or we
135 * allocate a new one. 135 * allocate a new one.
136 * 136 *
137 * Anon-vma allocations are very subtle, because we may have 137 * Anon-vma allocations are very subtle, because we may have
138 * optimistically looked up an anon_vma in page_lock_anon_vma() 138 * optimistically looked up an anon_vma in page_lock_anon_vma()
139 * and that may actually touch the spinlock even in the newly 139 * and that may actually touch the spinlock even in the newly
140 * allocated vma (it depends on RCU to make sure that the 140 * allocated vma (it depends on RCU to make sure that the
141 * anon_vma isn't actually destroyed). 141 * anon_vma isn't actually destroyed).
142 * 142 *
143 * As a result, we need to do proper anon_vma locking even 143 * As a result, we need to do proper anon_vma locking even
144 * for the new allocation. At the same time, we do not want 144 * for the new allocation. At the same time, we do not want
145 * to do any locking for the common case of already having 145 * to do any locking for the common case of already having
146 * an anon_vma. 146 * an anon_vma.
147 * 147 *
148 * This must be called with the mmap_sem held for reading. 148 * This must be called with the mmap_sem held for reading.
149 */ 149 */
150 int anon_vma_prepare(struct vm_area_struct *vma) 150 int anon_vma_prepare(struct vm_area_struct *vma)
151 { 151 {
152 struct anon_vma *anon_vma = vma->anon_vma; 152 struct anon_vma *anon_vma = vma->anon_vma;
153 struct anon_vma_chain *avc; 153 struct anon_vma_chain *avc;
154 154
155 might_sleep(); 155 might_sleep();
156 if (unlikely(!anon_vma)) { 156 if (unlikely(!anon_vma)) {
157 struct mm_struct *mm = vma->vm_mm; 157 struct mm_struct *mm = vma->vm_mm;
158 struct anon_vma *allocated; 158 struct anon_vma *allocated;
159 159
160 avc = anon_vma_chain_alloc(GFP_KERNEL); 160 avc = anon_vma_chain_alloc(GFP_KERNEL);
161 if (!avc) 161 if (!avc)
162 goto out_enomem; 162 goto out_enomem;
163 163
164 anon_vma = find_mergeable_anon_vma(vma); 164 anon_vma = find_mergeable_anon_vma(vma);
165 allocated = NULL; 165 allocated = NULL;
166 if (!anon_vma) { 166 if (!anon_vma) {
167 anon_vma = anon_vma_alloc(); 167 anon_vma = anon_vma_alloc();
168 if (unlikely(!anon_vma)) 168 if (unlikely(!anon_vma))
169 goto out_enomem_free_avc; 169 goto out_enomem_free_avc;
170 allocated = anon_vma; 170 allocated = anon_vma;
171 } 171 }
172 172
173 anon_vma_lock(anon_vma); 173 anon_vma_lock(anon_vma);
174 /* page_table_lock to protect against threads */ 174 /* page_table_lock to protect against threads */
175 spin_lock(&mm->page_table_lock); 175 spin_lock(&mm->page_table_lock);
176 if (likely(!vma->anon_vma)) { 176 if (likely(!vma->anon_vma)) {
177 vma->anon_vma = anon_vma; 177 vma->anon_vma = anon_vma;
178 avc->anon_vma = anon_vma; 178 avc->anon_vma = anon_vma;
179 avc->vma = vma; 179 avc->vma = vma;
180 list_add(&avc->same_vma, &vma->anon_vma_chain); 180 list_add(&avc->same_vma, &vma->anon_vma_chain);
181 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 181 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
182 allocated = NULL; 182 allocated = NULL;
183 avc = NULL; 183 avc = NULL;
184 } 184 }
185 spin_unlock(&mm->page_table_lock); 185 spin_unlock(&mm->page_table_lock);
186 anon_vma_unlock(anon_vma); 186 anon_vma_unlock(anon_vma);
187 187
188 if (unlikely(allocated)) 188 if (unlikely(allocated))
189 put_anon_vma(allocated); 189 put_anon_vma(allocated);
190 if (unlikely(avc)) 190 if (unlikely(avc))
191 anon_vma_chain_free(avc); 191 anon_vma_chain_free(avc);
192 } 192 }
193 return 0; 193 return 0;
194 194
195 out_enomem_free_avc: 195 out_enomem_free_avc:
196 anon_vma_chain_free(avc); 196 anon_vma_chain_free(avc);
197 out_enomem: 197 out_enomem:
198 return -ENOMEM; 198 return -ENOMEM;
199 } 199 }
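
anon_vma_prepare() is meant to run in the fault path before the first anonymous page is installed in a vma, with mmap_sem held for reading as the comment requires. A minimal sketch of such a call site (the helper is hypothetical and the rest of the fault handling is elided):

    #include <linux/mm.h>
    #include <linux/rmap.h>

    /* Hypothetical excerpt of an anonymous-fault path. */
    static int example_anon_fault(struct vm_area_struct *vma, unsigned long address)
    {
            if (unlikely(anon_vma_prepare(vma)))
                    return VM_FAULT_OOM;

            /* ... allocate the page, set the pte, page_add_new_anon_rmap() ... */
            return 0;
    }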
200 200
201 /* 201 /*
202 * This is a useful helper function for locking the anon_vma root as 202 * This is a useful helper function for locking the anon_vma root as
203 * we traverse the vma->anon_vma_chain, looping over anon_vma's that 203 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
204 * have the same vma. 204 * have the same vma.
205 * 205 *
206 * Such anon_vma's should have the same root, so you'd expect to see 206 * Such anon_vma's should have the same root, so you'd expect to see
207 * just a single mutex_lock for the whole traversal. 207 * just a single mutex_lock for the whole traversal.
208 */ 208 */
209 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) 209 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
210 { 210 {
211 struct anon_vma *new_root = anon_vma->root; 211 struct anon_vma *new_root = anon_vma->root;
212 if (new_root != root) { 212 if (new_root != root) {
213 if (WARN_ON_ONCE(root)) 213 if (WARN_ON_ONCE(root))
214 mutex_unlock(&root->mutex); 214 mutex_unlock(&root->mutex);
215 root = new_root; 215 root = new_root;
216 mutex_lock(&root->mutex); 216 mutex_lock(&root->mutex);
217 } 217 }
218 return root; 218 return root;
219 } 219 }
220 220
221 static inline void unlock_anon_vma_root(struct anon_vma *root) 221 static inline void unlock_anon_vma_root(struct anon_vma *root)
222 { 222 {
223 if (root) 223 if (root)
224 mutex_unlock(&root->mutex); 224 mutex_unlock(&root->mutex);
225 } 225 }
226 226
227 static void anon_vma_chain_link(struct vm_area_struct *vma, 227 static void anon_vma_chain_link(struct vm_area_struct *vma,
228 struct anon_vma_chain *avc, 228 struct anon_vma_chain *avc,
229 struct anon_vma *anon_vma) 229 struct anon_vma *anon_vma)
230 { 230 {
231 avc->vma = vma; 231 avc->vma = vma;
232 avc->anon_vma = anon_vma; 232 avc->anon_vma = anon_vma;
233 list_add(&avc->same_vma, &vma->anon_vma_chain); 233 list_add(&avc->same_vma, &vma->anon_vma_chain);
234 234
235 /* 235 /*
236 * It's critical to add new vmas to the tail of the anon_vma, 236 * It's critical to add new vmas to the tail of the anon_vma,
237 * see comment in huge_memory.c:__split_huge_page(). 237 * see comment in huge_memory.c:__split_huge_page().
238 */ 238 */
239 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 239 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
240 } 240 }
241 241
242 /* 242 /*
243 * Attach the anon_vmas from src to dst. 243 * Attach the anon_vmas from src to dst.
244 * Returns 0 on success, -ENOMEM on failure. 244 * Returns 0 on success, -ENOMEM on failure.
245 */ 245 */
246 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 246 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
247 { 247 {
248 struct anon_vma_chain *avc, *pavc; 248 struct anon_vma_chain *avc, *pavc;
249 struct anon_vma *root = NULL; 249 struct anon_vma *root = NULL;
250 250
251 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 251 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
252 struct anon_vma *anon_vma; 252 struct anon_vma *anon_vma;
253 253
254 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); 254 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
255 if (unlikely(!avc)) { 255 if (unlikely(!avc)) {
256 unlock_anon_vma_root(root); 256 unlock_anon_vma_root(root);
257 root = NULL; 257 root = NULL;
258 avc = anon_vma_chain_alloc(GFP_KERNEL); 258 avc = anon_vma_chain_alloc(GFP_KERNEL);
259 if (!avc) 259 if (!avc)
260 goto enomem_failure; 260 goto enomem_failure;
261 } 261 }
262 anon_vma = pavc->anon_vma; 262 anon_vma = pavc->anon_vma;
263 root = lock_anon_vma_root(root, anon_vma); 263 root = lock_anon_vma_root(root, anon_vma);
264 anon_vma_chain_link(dst, avc, anon_vma); 264 anon_vma_chain_link(dst, avc, anon_vma);
265 } 265 }
266 unlock_anon_vma_root(root); 266 unlock_anon_vma_root(root);
267 return 0; 267 return 0;
268 268
269 enomem_failure: 269 enomem_failure:
270 unlink_anon_vmas(dst); 270 unlink_anon_vmas(dst);
271 return -ENOMEM; 271 return -ENOMEM;
272 } 272 }
273 273
274 /* 274 /*
275 * Attach vma to its own anon_vma, as well as to the anon_vmas that 275 * Attach vma to its own anon_vma, as well as to the anon_vmas that
276 * the corresponding VMA in the parent process is attached to. 276 * the corresponding VMA in the parent process is attached to.
277 * Returns 0 on success, non-zero on failure. 277 * Returns 0 on success, non-zero on failure.
278 */ 278 */
279 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) 279 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
280 { 280 {
281 struct anon_vma_chain *avc; 281 struct anon_vma_chain *avc;
282 struct anon_vma *anon_vma; 282 struct anon_vma *anon_vma;
283 283
284 /* Don't bother if the parent process has no anon_vma here. */ 284 /* Don't bother if the parent process has no anon_vma here. */
285 if (!pvma->anon_vma) 285 if (!pvma->anon_vma)
286 return 0; 286 return 0;
287 287
288 /* 288 /*
289 * First, attach the new VMA to the parent VMA's anon_vmas, 289 * First, attach the new VMA to the parent VMA's anon_vmas,
290 * so rmap can find non-COWed pages in child processes. 290 * so rmap can find non-COWed pages in child processes.
291 */ 291 */
292 if (anon_vma_clone(vma, pvma)) 292 if (anon_vma_clone(vma, pvma))
293 return -ENOMEM; 293 return -ENOMEM;
294 294
295 /* Then add our own anon_vma. */ 295 /* Then add our own anon_vma. */
296 anon_vma = anon_vma_alloc(); 296 anon_vma = anon_vma_alloc();
297 if (!anon_vma) 297 if (!anon_vma)
298 goto out_error; 298 goto out_error;
299 avc = anon_vma_chain_alloc(GFP_KERNEL); 299 avc = anon_vma_chain_alloc(GFP_KERNEL);
300 if (!avc) 300 if (!avc)
301 goto out_error_free_anon_vma; 301 goto out_error_free_anon_vma;
302 302
303 /* 303 /*
304 * The root anon_vma's spinlock is the lock actually used when we 304 * The root anon_vma's spinlock is the lock actually used when we
305 * lock any of the anon_vmas in this anon_vma tree. 305 * lock any of the anon_vmas in this anon_vma tree.
306 */ 306 */
307 anon_vma->root = pvma->anon_vma->root; 307 anon_vma->root = pvma->anon_vma->root;
308 /* 308 /*
309 * With refcounts, an anon_vma can stay around longer than the 309 * With refcounts, an anon_vma can stay around longer than the
310 * process it belongs to. The root anon_vma needs to be pinned until 310 * process it belongs to. The root anon_vma needs to be pinned until
311 * this anon_vma is freed, because the lock lives in the root. 311 * this anon_vma is freed, because the lock lives in the root.
312 */ 312 */
313 get_anon_vma(anon_vma->root); 313 get_anon_vma(anon_vma->root);
314 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 314 /* Mark this anon_vma as the one where our new (COWed) pages go. */
315 vma->anon_vma = anon_vma; 315 vma->anon_vma = anon_vma;
316 anon_vma_lock(anon_vma); 316 anon_vma_lock(anon_vma);
317 anon_vma_chain_link(vma, avc, anon_vma); 317 anon_vma_chain_link(vma, avc, anon_vma);
318 anon_vma_unlock(anon_vma); 318 anon_vma_unlock(anon_vma);
319 319
320 return 0; 320 return 0;
321 321
322 out_error_free_anon_vma: 322 out_error_free_anon_vma:
323 put_anon_vma(anon_vma); 323 put_anon_vma(anon_vma);
324 out_error: 324 out_error:
325 unlink_anon_vmas(vma); 325 unlink_anon_vmas(vma);
326 return -ENOMEM; 326 return -ENOMEM;
327 } 327 }
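
anon_vma_fork() is called once per vma while a child address space is being populated. A heavily simplified, hypothetical version of that per-vma step (the real loop lives in dup_mmap() and does much more):

    #include <linux/mm.h>
    #include <linux/rmap.h>

    /*
     * Hypothetical, simplified per-vma step of address-space duplication:
     * attach the child vma to the parent's anon_vmas and give it one of
     * its own for future COWed pages.
     */
    static int example_dup_one_vma(struct vm_area_struct *child,
                                   struct vm_area_struct *parent)
    {
            if (anon_vma_fork(child, parent))
                    return -ENOMEM;
            /* ... copy page tables and link the vma into the child mm ... */
            return 0;
    }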
328 328
329 void unlink_anon_vmas(struct vm_area_struct *vma) 329 void unlink_anon_vmas(struct vm_area_struct *vma)
330 { 330 {
331 struct anon_vma_chain *avc, *next; 331 struct anon_vma_chain *avc, *next;
332 struct anon_vma *root = NULL; 332 struct anon_vma *root = NULL;
333 333
334 /* 334 /*
335 * Unlink each anon_vma chained to the VMA. This list is ordered 335 * Unlink each anon_vma chained to the VMA. This list is ordered
336 * from newest to oldest, ensuring the root anon_vma gets freed last. 336 * from newest to oldest, ensuring the root anon_vma gets freed last.
337 */ 337 */
338 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 338 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
339 struct anon_vma *anon_vma = avc->anon_vma; 339 struct anon_vma *anon_vma = avc->anon_vma;
340 340
341 root = lock_anon_vma_root(root, anon_vma); 341 root = lock_anon_vma_root(root, anon_vma);
342 list_del(&avc->same_anon_vma); 342 list_del(&avc->same_anon_vma);
343 343
344 /* 344 /*
345 * Leave empty anon_vmas on the list - we'll need 345 * Leave empty anon_vmas on the list - we'll need
346 * to free them outside the lock. 346 * to free them outside the lock.
347 */ 347 */
348 if (list_empty(&anon_vma->head)) 348 if (list_empty(&anon_vma->head))
349 continue; 349 continue;
350 350
351 list_del(&avc->same_vma); 351 list_del(&avc->same_vma);
352 anon_vma_chain_free(avc); 352 anon_vma_chain_free(avc);
353 } 353 }
354 unlock_anon_vma_root(root); 354 unlock_anon_vma_root(root);
355 355
356 /* 356 /*
357 * Iterate the list once more; it now contains only empty and unlinked 357 * Iterate the list once more; it now contains only empty and unlinked
358 * anon_vmas. Destroy them. We could not do this earlier because 358 * anon_vmas. Destroy them. We could not do this earlier because
359 * __put_anon_vma() needs to acquire the anon_vma->root->mutex. 359 * __put_anon_vma() needs to acquire the anon_vma->root->mutex.
360 */ 360 */
361 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 361 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
362 struct anon_vma *anon_vma = avc->anon_vma; 362 struct anon_vma *anon_vma = avc->anon_vma;
363 363
364 put_anon_vma(anon_vma); 364 put_anon_vma(anon_vma);
365 365
366 list_del(&avc->same_vma); 366 list_del(&avc->same_vma);
367 anon_vma_chain_free(avc); 367 anon_vma_chain_free(avc);
368 } 368 }
369 } 369 }
370 370
371 static void anon_vma_ctor(void *data) 371 static void anon_vma_ctor(void *data)
372 { 372 {
373 struct anon_vma *anon_vma = data; 373 struct anon_vma *anon_vma = data;
374 374
375 mutex_init(&anon_vma->mutex); 375 mutex_init(&anon_vma->mutex);
376 atomic_set(&anon_vma->refcount, 0); 376 atomic_set(&anon_vma->refcount, 0);
377 INIT_LIST_HEAD(&anon_vma->head); 377 INIT_LIST_HEAD(&anon_vma->head);
378 } 378 }
379 379
380 void __init anon_vma_init(void) 380 void __init anon_vma_init(void)
381 { 381 {
382 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 382 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
383 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 383 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
384 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); 384 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
385 } 385 }
386 386
387 /* 387 /*
388 * Getting a lock on a stable anon_vma from a page off the LRU is tricky! 388 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
389 * 389 *
390 * Since there is no serialization whatsoever against page_remove_rmap() 390 * Since there is no serialization whatsoever against page_remove_rmap()
391 * the best this function can do is return a locked anon_vma that might 391 * the best this function can do is return a locked anon_vma that might
392 * have been relevant to this page. 392 * have been relevant to this page.
393 * 393 *
394 * The page might have been remapped to a different anon_vma or the anon_vma 394 * The page might have been remapped to a different anon_vma or the anon_vma
395 * returned may already be freed (and even reused). 395 * returned may already be freed (and even reused).
396 * 396 *
397 * In case it was remapped to a different anon_vma, the new anon_vma will be a 397 * In case it was remapped to a different anon_vma, the new anon_vma will be a
398 * child of the old anon_vma, and the anon_vma lifetime rules will therefore 398 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
399 * ensure that any anon_vma obtained from the page will still be valid for as 399 * ensure that any anon_vma obtained from the page will still be valid for as
400 * long as we observe page_mapped() [ hence all those page_mapped() tests ]. 400 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
401 * 401 *
402 * All users of this function must be very careful when walking the anon_vma 402 * All users of this function must be very careful when walking the anon_vma
403 * chain and verify that the page in question is indeed mapped in it 403 * chain and verify that the page in question is indeed mapped in it
404 * [ something equivalent to page_mapped_in_vma() ]. 404 * [ something equivalent to page_mapped_in_vma() ].
405 * 405 *
406 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() 406 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
407 * that the anon_vma pointer from page->mapping is valid if there is a 407 * that the anon_vma pointer from page->mapping is valid if there is a
408 * mapcount, we can dereference the anon_vma after observing those. 408 * mapcount, we can dereference the anon_vma after observing those.
409 */ 409 */
410 struct anon_vma *page_get_anon_vma(struct page *page) 410 struct anon_vma *page_get_anon_vma(struct page *page)
411 { 411 {
412 struct anon_vma *anon_vma = NULL; 412 struct anon_vma *anon_vma = NULL;
413 unsigned long anon_mapping; 413 unsigned long anon_mapping;
414 414
415 rcu_read_lock(); 415 rcu_read_lock();
416 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 416 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
417 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 417 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
418 goto out; 418 goto out;
419 if (!page_mapped(page)) 419 if (!page_mapped(page))
420 goto out; 420 goto out;
421 421
422 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 422 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
423 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 423 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
424 anon_vma = NULL; 424 anon_vma = NULL;
425 goto out; 425 goto out;
426 } 426 }
427 427
428 /* 428 /*
429 * If this page is still mapped, then its anon_vma cannot have been 429 * If this page is still mapped, then its anon_vma cannot have been
430 * freed. But if it has been unmapped, we have no security against the 430 * freed. But if it has been unmapped, we have no security against the
431 * anon_vma structure being freed and reused (for another anon_vma: 431 * anon_vma structure being freed and reused (for another anon_vma:
432 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() 432 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
433 * above cannot corrupt). 433 * above cannot corrupt).
434 */ 434 */
435 if (!page_mapped(page)) { 435 if (!page_mapped(page)) {
436 put_anon_vma(anon_vma); 436 put_anon_vma(anon_vma);
437 anon_vma = NULL; 437 anon_vma = NULL;
438 } 438 }
439 out: 439 out:
440 rcu_read_unlock(); 440 rcu_read_unlock();
441 441
442 return anon_vma; 442 return anon_vma;
443 } 443 }
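
A caller that only needs the anon_vma to stay allocated, not locked, uses the reference returned here and drops it with put_anon_vma(). A minimal sketch (the actual work is a placeholder):

    #include <linux/mm.h>
    #include <linux/rmap.h>

    /* Hypothetical caller that pins the anon_vma across some unlocked work. */
    static void example_inspect_anon_page(struct page *page)
    {
            struct anon_vma *anon_vma = page_get_anon_vma(page);

            if (!anon_vma)
                    return;         /* not anonymous, or no longer mapped */

            /* ... use anon_vma; the refcount keeps it from being freed ... */

            put_anon_vma(anon_vma);
    }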
444 444
445 /* 445 /*
446 * Similar to page_get_anon_vma() except it locks the anon_vma. 446 * Similar to page_get_anon_vma() except it locks the anon_vma.
447 * 447 *
448 * It's a little more complex as it tries to keep the fast path to a single 448 * It's a little more complex as it tries to keep the fast path to a single
449 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 449 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
450 * reference like with page_get_anon_vma() and then block on the mutex. 450 * reference like with page_get_anon_vma() and then block on the mutex.
451 */ 451 */
452 struct anon_vma *page_lock_anon_vma(struct page *page) 452 struct anon_vma *page_lock_anon_vma(struct page *page)
453 { 453 {
454 struct anon_vma *anon_vma = NULL; 454 struct anon_vma *anon_vma = NULL;
455 struct anon_vma *root_anon_vma; 455 struct anon_vma *root_anon_vma;
456 unsigned long anon_mapping; 456 unsigned long anon_mapping;
457 457
458 rcu_read_lock(); 458 rcu_read_lock();
459 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 459 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
461 goto out; 461 goto out;
462 if (!page_mapped(page)) 462 if (!page_mapped(page))
463 goto out; 463 goto out;
464 464
465 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 465 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
466 root_anon_vma = ACCESS_ONCE(anon_vma->root); 466 root_anon_vma = ACCESS_ONCE(anon_vma->root);
467 if (mutex_trylock(&root_anon_vma->mutex)) { 467 if (mutex_trylock(&root_anon_vma->mutex)) {
468 /* 468 /*
469 * If the page is still mapped, then this anon_vma is still 469 * If the page is still mapped, then this anon_vma is still
470 * its anon_vma, and holding the mutex ensures that it will 470 * its anon_vma, and holding the mutex ensures that it will
471 * not go away, see anon_vma_free(). 471 * not go away, see anon_vma_free().
472 */ 472 */
473 if (!page_mapped(page)) { 473 if (!page_mapped(page)) {
474 mutex_unlock(&root_anon_vma->mutex); 474 mutex_unlock(&root_anon_vma->mutex);
475 anon_vma = NULL; 475 anon_vma = NULL;
476 } 476 }
477 goto out; 477 goto out;
478 } 478 }
479 479
480 /* trylock failed, we have to sleep */ 480 /* trylock failed, we have to sleep */
481 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 481 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
482 anon_vma = NULL; 482 anon_vma = NULL;
483 goto out; 483 goto out;
484 } 484 }
485 485
486 if (!page_mapped(page)) { 486 if (!page_mapped(page)) {
487 put_anon_vma(anon_vma); 487 put_anon_vma(anon_vma);
488 anon_vma = NULL; 488 anon_vma = NULL;
489 goto out; 489 goto out;
490 } 490 }
491 491
492 /* we pinned the anon_vma, it's safe to sleep */ 492 /* we pinned the anon_vma, it's safe to sleep */
493 rcu_read_unlock(); 493 rcu_read_unlock();
494 anon_vma_lock(anon_vma); 494 anon_vma_lock(anon_vma);
495 495
496 if (atomic_dec_and_test(&anon_vma->refcount)) { 496 if (atomic_dec_and_test(&anon_vma->refcount)) {
497 /* 497 /*
498 * Oops, we held the last refcount, release the lock 498 * Oops, we held the last refcount, release the lock
499 * and bail -- can't simply use put_anon_vma() because 499 * and bail -- can't simply use put_anon_vma() because
500 * we'll deadlock on the anon_vma_lock() recursion. 500 * we'll deadlock on the anon_vma_lock() recursion.
501 */ 501 */
502 anon_vma_unlock(anon_vma); 502 anon_vma_unlock(anon_vma);
503 __put_anon_vma(anon_vma); 503 __put_anon_vma(anon_vma);
504 anon_vma = NULL; 504 anon_vma = NULL;
505 } 505 }
506 506
507 return anon_vma; 507 return anon_vma;
508 508
509 out: 509 out:
510 rcu_read_unlock(); 510 rcu_read_unlock();
511 return anon_vma; 511 return anon_vma;
512 } 512 }
513 513
514 void page_unlock_anon_vma(struct anon_vma *anon_vma) 514 void page_unlock_anon_vma(struct anon_vma *anon_vma)
515 { 515 {
516 anon_vma_unlock(anon_vma); 516 anon_vma_unlock(anon_vma);
517 } 517 }
518 518
519 /* 519 /*
520 * At what user virtual address is page expected in @vma? 520 * At what user virtual address is page expected in @vma?
521 * Returns virtual address or -EFAULT if page's index/offset is not 521 * Returns virtual address or -EFAULT if page's index/offset is not
522 * within the range mapped by the @vma. 522 * within the range mapped by the @vma.
523 */ 523 */
524 inline unsigned long 524 inline unsigned long
525 vma_address(struct page *page, struct vm_area_struct *vma) 525 vma_address(struct page *page, struct vm_area_struct *vma)
526 { 526 {
527 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 527 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
528 unsigned long address; 528 unsigned long address;
529 529
530 if (unlikely(is_vm_hugetlb_page(vma))) 530 if (unlikely(is_vm_hugetlb_page(vma)))
531 pgoff = page->index << huge_page_order(page_hstate(page)); 531 pgoff = page->index << huge_page_order(page_hstate(page));
532 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 532 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
533 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 533 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
534 /* page should be within @vma mapping range */ 534 /* page should be within @vma mapping range */
535 return -EFAULT; 535 return -EFAULT;
536 } 536 }
537 return address; 537 return address;
538 } 538 }
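
A worked example of the computation above, with numbers chosen purely for illustration and PAGE_SHIFT assumed to be 12: for a vma with vm_start == 0x40000000 and vm_pgoff == 0x10, a page whose index is 0x12 lies two pages into the mapping, so vma_address() returns 0x40000000 + ((0x12 - 0x10) << 12) == 0x40002000; an index whose address would land outside [vm_start, vm_end) yields -EFAULT instead.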
539 539
540 /* 540 /*
541 * At what user virtual address is page expected in vma? 541 * At what user virtual address is page expected in vma?
542 * Caller should check the page is actually part of the vma. 542 * Caller should check the page is actually part of the vma.
543 */ 543 */
544 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 544 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
545 { 545 {
546 if (PageAnon(page)) { 546 if (PageAnon(page)) {
547 struct anon_vma *page__anon_vma = page_anon_vma(page); 547 struct anon_vma *page__anon_vma = page_anon_vma(page);
548 /* 548 /*
549 * Note: swapoff's unuse_vma() is more efficient with this 549 * Note: swapoff's unuse_vma() is more efficient with this
550 * check, and needs it to match anon_vma when KSM is active. 550 * check, and needs it to match anon_vma when KSM is active.
551 */ 551 */
552 if (!vma->anon_vma || !page__anon_vma || 552 if (!vma->anon_vma || !page__anon_vma ||
553 vma->anon_vma->root != page__anon_vma->root) 553 vma->anon_vma->root != page__anon_vma->root)
554 return -EFAULT; 554 return -EFAULT;
555 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 555 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
556 if (!vma->vm_file || 556 if (!vma->vm_file ||
557 vma->vm_file->f_mapping != page->mapping) 557 vma->vm_file->f_mapping != page->mapping)
558 return -EFAULT; 558 return -EFAULT;
559 } else 559 } else
560 return -EFAULT; 560 return -EFAULT;
561 return vma_address(page, vma); 561 return vma_address(page, vma);
562 } 562 }
563 563
564 /* 564 /*
565 * Check that @page is mapped at @address into @mm. 565 * Check that @page is mapped at @address into @mm.
566 * 566 *
567 * If @sync is false, page_check_address may perform a racy check to avoid 567 * If @sync is false, page_check_address may perform a racy check to avoid
568 * the page table lock when the pte is not present (helpful when reclaiming 568 * the page table lock when the pte is not present (helpful when reclaiming
569 * highly shared pages). 569 * highly shared pages).
570 * 570 *
571 * On success returns with pte mapped and locked. 571 * On success returns with pte mapped and locked.
572 */ 572 */
573 pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 573 pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
574 unsigned long address, spinlock_t **ptlp, int sync) 574 unsigned long address, spinlock_t **ptlp, int sync)
575 { 575 {
576 pgd_t *pgd; 576 pgd_t *pgd;
577 pud_t *pud; 577 pud_t *pud;
578 pmd_t *pmd; 578 pmd_t *pmd;
579 pte_t *pte; 579 pte_t *pte;
580 spinlock_t *ptl; 580 spinlock_t *ptl;
581 581
582 if (unlikely(PageHuge(page))) { 582 if (unlikely(PageHuge(page))) {
583 pte = huge_pte_offset(mm, address); 583 pte = huge_pte_offset(mm, address);
584 ptl = &mm->page_table_lock; 584 ptl = &mm->page_table_lock;
585 goto check; 585 goto check;
586 } 586 }
587 587
588 pgd = pgd_offset(mm, address); 588 pgd = pgd_offset(mm, address);
589 if (!pgd_present(*pgd)) 589 if (!pgd_present(*pgd))
590 return NULL; 590 return NULL;
591 591
592 pud = pud_offset(pgd, address); 592 pud = pud_offset(pgd, address);
593 if (!pud_present(*pud)) 593 if (!pud_present(*pud))
594 return NULL; 594 return NULL;
595 595
596 pmd = pmd_offset(pud, address); 596 pmd = pmd_offset(pud, address);
597 if (!pmd_present(*pmd)) 597 if (!pmd_present(*pmd))
598 return NULL; 598 return NULL;
599 if (pmd_trans_huge(*pmd)) 599 if (pmd_trans_huge(*pmd))
600 return NULL; 600 return NULL;
601 601
602 pte = pte_offset_map(pmd, address); 602 pte = pte_offset_map(pmd, address);
603 /* Make a quick check before getting the lock */ 603 /* Make a quick check before getting the lock */
604 if (!sync && !pte_present(*pte)) { 604 if (!sync && !pte_present(*pte)) {
605 pte_unmap(pte); 605 pte_unmap(pte);
606 return NULL; 606 return NULL;
607 } 607 }
608 608
609 ptl = pte_lockptr(mm, pmd); 609 ptl = pte_lockptr(mm, pmd);
610 check: 610 check:
611 spin_lock(ptl); 611 spin_lock(ptl);
612 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 612 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
613 *ptlp = ptl; 613 *ptlp = ptl;
614 return pte; 614 return pte;
615 } 615 }
616 pte_unmap_unlock(pte, ptl); 616 pte_unmap_unlock(pte, ptl);
617 return NULL; 617 return NULL;
618 } 618 }
619 619
620 /** 620 /**
621 * page_mapped_in_vma - check whether a page is really mapped in a VMA 621 * page_mapped_in_vma - check whether a page is really mapped in a VMA
622 * @page: the page to test 622 * @page: the page to test
623 * @vma: the VMA to test 623 * @vma: the VMA to test
624 * 624 *
625 * Returns 1 if the page is mapped into the page tables of the VMA, 0 625 * Returns 1 if the page is mapped into the page tables of the VMA, 0
626 * if the page is not mapped into the page tables of this VMA. Only 626 * if the page is not mapped into the page tables of this VMA. Only
627 * valid for normal file or anonymous VMAs. 627 * valid for normal file or anonymous VMAs.
628 */ 628 */
629 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 629 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
630 { 630 {
631 unsigned long address; 631 unsigned long address;
632 pte_t *pte; 632 pte_t *pte;
633 spinlock_t *ptl; 633 spinlock_t *ptl;
634 634
635 address = vma_address(page, vma); 635 address = vma_address(page, vma);
636 if (address == -EFAULT) /* out of vma range */ 636 if (address == -EFAULT) /* out of vma range */
637 return 0; 637 return 0;
638 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); 638 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
639 if (!pte) /* the page is not in this mm */ 639 if (!pte) /* the page is not in this mm */
640 return 0; 640 return 0;
641 pte_unmap_unlock(pte, ptl); 641 pte_unmap_unlock(pte, ptl);
642 642
643 return 1; 643 return 1;
644 } 644 }
645 645
646 /* 646 /*
647 * Subfunctions of page_referenced: page_referenced_one called 647 * Subfunctions of page_referenced: page_referenced_one called
648 * repeatedly from either page_referenced_anon or page_referenced_file. 648 * repeatedly from either page_referenced_anon or page_referenced_file.
649 */ 649 */
650 int page_referenced_one(struct page *page, struct vm_area_struct *vma, 650 int page_referenced_one(struct page *page, struct vm_area_struct *vma,
651 unsigned long address, unsigned int *mapcount, 651 unsigned long address, unsigned int *mapcount,
652 unsigned long *vm_flags) 652 unsigned long *vm_flags)
653 { 653 {
654 struct mm_struct *mm = vma->vm_mm; 654 struct mm_struct *mm = vma->vm_mm;
655 int referenced = 0; 655 int referenced = 0;
656 656
657 if (unlikely(PageTransHuge(page))) { 657 if (unlikely(PageTransHuge(page))) {
658 pmd_t *pmd; 658 pmd_t *pmd;
659 659
660 spin_lock(&mm->page_table_lock); 660 spin_lock(&mm->page_table_lock);
661 /* 661 /*
662 * rmap might return false positives; we must filter 662 * rmap might return false positives; we must filter
663 * these out using page_check_address_pmd(). 663 * these out using page_check_address_pmd().
664 */ 664 */
665 pmd = page_check_address_pmd(page, mm, address, 665 pmd = page_check_address_pmd(page, mm, address,
666 PAGE_CHECK_ADDRESS_PMD_FLAG); 666 PAGE_CHECK_ADDRESS_PMD_FLAG);
667 if (!pmd) { 667 if (!pmd) {
668 spin_unlock(&mm->page_table_lock); 668 spin_unlock(&mm->page_table_lock);
669 goto out; 669 goto out;
670 } 670 }
671 671
672 if (vma->vm_flags & VM_LOCKED) { 672 if (vma->vm_flags & VM_LOCKED) {
673 spin_unlock(&mm->page_table_lock); 673 spin_unlock(&mm->page_table_lock);
674 *mapcount = 0; /* break early from loop */ 674 *mapcount = 0; /* break early from loop */
675 *vm_flags |= VM_LOCKED; 675 *vm_flags |= VM_LOCKED;
676 goto out; 676 goto out;
677 } 677 }
678 678
679 /* go ahead even if the pmd is pmd_trans_splitting() */ 679 /* go ahead even if the pmd is pmd_trans_splitting() */
680 if (pmdp_clear_flush_young_notify(vma, address, pmd)) 680 if (pmdp_clear_flush_young_notify(vma, address, pmd))
681 referenced++; 681 referenced++;
682 spin_unlock(&mm->page_table_lock); 682 spin_unlock(&mm->page_table_lock);
683 } else { 683 } else {
684 pte_t *pte; 684 pte_t *pte;
685 spinlock_t *ptl; 685 spinlock_t *ptl;
686 686
687 /* 687 /*
688 * rmap might return false positives; we must filter 688 * rmap might return false positives; we must filter
689 * these out using page_check_address(). 689 * these out using page_check_address().
690 */ 690 */
691 pte = page_check_address(page, mm, address, &ptl, 0); 691 pte = page_check_address(page, mm, address, &ptl, 0);
692 if (!pte) 692 if (!pte)
693 goto out; 693 goto out;
694 694
695 if (vma->vm_flags & VM_LOCKED) { 695 if (vma->vm_flags & VM_LOCKED) {
696 pte_unmap_unlock(pte, ptl); 696 pte_unmap_unlock(pte, ptl);
697 *mapcount = 0; /* break early from loop */ 697 *mapcount = 0; /* break early from loop */
698 *vm_flags |= VM_LOCKED; 698 *vm_flags |= VM_LOCKED;
699 goto out; 699 goto out;
700 } 700 }
701 701
702 if (ptep_clear_flush_young_notify(vma, address, pte)) { 702 if (ptep_clear_flush_young_notify(vma, address, pte)) {
703 /* 703 /*
704 * Don't treat a reference through a sequentially read 704 * Don't treat a reference through a sequentially read
705 * mapping as such. If the page has been used in 705 * mapping as such. If the page has been used in
706 * another mapping, we will catch it; if this other 706 * another mapping, we will catch it; if this other
707 * mapping is already gone, the unmap path will have 707 * mapping is already gone, the unmap path will have
708 * set PG_referenced or activated the page. 708 * set PG_referenced or activated the page.
709 */ 709 */
710 if (likely(!VM_SequentialReadHint(vma))) 710 if (likely(!VM_SequentialReadHint(vma)))
711 referenced++; 711 referenced++;
712 } 712 }
713 pte_unmap_unlock(pte, ptl); 713 pte_unmap_unlock(pte, ptl);
714 } 714 }
715 715
716 /* Pretend the page is referenced if the task has the 716 /* Pretend the page is referenced if the task has the
717 swap token and is in the middle of a page fault. */ 717 swap token and is in the middle of a page fault. */
718 if (mm != current->mm && has_swap_token(mm) && 718 if (mm != current->mm && has_swap_token(mm) &&
719 rwsem_is_locked(&mm->mmap_sem)) 719 rwsem_is_locked(&mm->mmap_sem))
720 referenced++; 720 referenced++;
721 721
722 (*mapcount)--; 722 (*mapcount)--;
723 723
724 if (referenced) 724 if (referenced)
725 *vm_flags |= vma->vm_flags; 725 *vm_flags |= vma->vm_flags;
726 out: 726 out:
727 return referenced; 727 return referenced;
728 } 728 }
729 729
730 static int page_referenced_anon(struct page *page, 730 static int page_referenced_anon(struct page *page,
731 struct mem_cgroup *mem_cont, 731 struct mem_cgroup *mem_cont,
732 unsigned long *vm_flags) 732 unsigned long *vm_flags)
733 { 733 {
734 unsigned int mapcount; 734 unsigned int mapcount;
735 struct anon_vma *anon_vma; 735 struct anon_vma *anon_vma;
736 struct anon_vma_chain *avc; 736 struct anon_vma_chain *avc;
737 int referenced = 0; 737 int referenced = 0;
738 738
739 anon_vma = page_lock_anon_vma(page); 739 anon_vma = page_lock_anon_vma(page);
740 if (!anon_vma) 740 if (!anon_vma)
741 return referenced; 741 return referenced;
742 742
743 mapcount = page_mapcount(page); 743 mapcount = page_mapcount(page);
744 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 744 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
745 struct vm_area_struct *vma = avc->vma; 745 struct vm_area_struct *vma = avc->vma;
746 unsigned long address = vma_address(page, vma); 746 unsigned long address = vma_address(page, vma);
747 if (address == -EFAULT) 747 if (address == -EFAULT)
748 continue; 748 continue;
749 /* 749 /*
750 * If we are reclaiming on behalf of a cgroup, skip 750 * If we are reclaiming on behalf of a cgroup, skip
751 * counting on behalf of references from different 751 * counting on behalf of references from different
752 * cgroups 752 * cgroups
753 */ 753 */
754 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 754 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
755 continue; 755 continue;
756 referenced += page_referenced_one(page, vma, address, 756 referenced += page_referenced_one(page, vma, address,
757 &mapcount, vm_flags); 757 &mapcount, vm_flags);
758 if (!mapcount) 758 if (!mapcount)
759 break; 759 break;
760 } 760 }
761 761
762 page_unlock_anon_vma(anon_vma); 762 page_unlock_anon_vma(anon_vma);
763 return referenced; 763 return referenced;
764 } 764 }
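Both page_referenced_anon() above and page_referenced_file() below take a page_mapcount() snapshot and let page_referenced_one() spend one unit of it per vma visited, so the walk can stop as soon as every known mapping has been accounted for. A minimal userspace sketch of that early-exit budget, with check_one_mapping() and the fake data as hypothetical stand-ins rather than kernel code:

    #include <stdio.h>

    /* Hypothetical stand-in for page_referenced_one(): reports whether one
     * mapping referenced the page and spends one unit of the mapcount budget. */
    static int check_one_mapping(int idx, unsigned int *mapcount)
    {
        int referenced = (idx % 2 == 0); /* pretend even-numbered vmas referenced it */
        (*mapcount)--;                   /* one known mapping accounted for */
        return referenced;
    }

    int main(void)
    {
        unsigned int mapcount = 3;       /* page_mapcount() snapshot: 3 known ptes */
        int referenced = 0;

        /* Walk up to 10 candidate vmas, but stop once the budget is spent. */
        for (int idx = 0; idx < 10; idx++) {
            referenced += check_one_mapping(idx, &mapcount);
            if (!mapcount)
                break;                   /* every known mapping visited */
        }
        printf("referenced=%d after visiting %u vmas\n", referenced, 3 - mapcount);
        return 0;
    }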
765 765
766 /** 766 /**
767 * page_referenced_file - referenced check for object-based rmap 767 * page_referenced_file - referenced check for object-based rmap
768 * @page: the page we're checking references on. 768 * @page: the page we're checking references on.
769 * @mem_cont: target memory controller 769 * @mem_cont: target memory controller
770 * @vm_flags: collect the vm_flags of vmas which actually referenced the page 770 * @vm_flags: collect the vm_flags of vmas which actually referenced the page
771 * 771 *
772 * For an object-based mapped page, find all the places it is mapped and 772 * For an object-based mapped page, find all the places it is mapped and
773 * check/clear the referenced flag. This is done by following the page->mapping 773 * check/clear the referenced flag. This is done by following the page->mapping
774 * pointer, then walking the chain of vmas it holds. It returns the number 774 * pointer, then walking the chain of vmas it holds. It returns the number
775 * of references it found. 775 * of references it found.
776 * 776 *
777 * This function is only called from page_referenced for object-based pages. 777 * This function is only called from page_referenced for object-based pages.
778 */ 778 */
779 static int page_referenced_file(struct page *page, 779 static int page_referenced_file(struct page *page,
780 struct mem_cgroup *mem_cont, 780 struct mem_cgroup *mem_cont,
781 unsigned long *vm_flags) 781 unsigned long *vm_flags)
782 { 782 {
783 unsigned int mapcount; 783 unsigned int mapcount;
784 struct address_space *mapping = page->mapping; 784 struct address_space *mapping = page->mapping;
785 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 785 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
786 struct vm_area_struct *vma; 786 struct vm_area_struct *vma;
787 struct prio_tree_iter iter; 787 struct prio_tree_iter iter;
788 int referenced = 0; 788 int referenced = 0;
789 789
790 /* 790 /*
791 * The caller's checks on page->mapping and !PageAnon have made 791 * The caller's checks on page->mapping and !PageAnon have made
792 * sure that this is a file page: the check for page->mapping 792 * sure that this is a file page: the check for page->mapping
793 * excludes the case just before it gets set on an anon page. 793 * excludes the case just before it gets set on an anon page.
794 */ 794 */
795 BUG_ON(PageAnon(page)); 795 BUG_ON(PageAnon(page));
796 796
797 /* 797 /*
798 * The page lock not only makes sure that page->mapping cannot 798 * The page lock not only makes sure that page->mapping cannot
799 * suddenly be NULLified by truncation, it makes sure that the 799 * suddenly be NULLified by truncation, it makes sure that the
800 * structure at mapping cannot be freed and reused yet, 800 * structure at mapping cannot be freed and reused yet,
801 * so we can safely take mapping->i_mmap_mutex. 801 * so we can safely take mapping->i_mmap_mutex.
802 */ 802 */
803 BUG_ON(!PageLocked(page)); 803 BUG_ON(!PageLocked(page));
804 804
805 mutex_lock(&mapping->i_mmap_mutex); 805 mutex_lock(&mapping->i_mmap_mutex);
806 806
807 /* 807 /*
808 * i_mmap_mutex does not stabilize mapcount at all, but mapcount 808 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
809 * is more likely to be accurate if we note it after spinning. 809 * is more likely to be accurate if we note it after spinning.
810 */ 810 */
811 mapcount = page_mapcount(page); 811 mapcount = page_mapcount(page);
812 812
813 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 813 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
814 unsigned long address = vma_address(page, vma); 814 unsigned long address = vma_address(page, vma);
815 if (address == -EFAULT) 815 if (address == -EFAULT)
816 continue; 816 continue;
817 /* 817 /*
818 * If we are reclaiming on behalf of a cgroup, skip 818 * If we are reclaiming on behalf of a cgroup, skip
819 * counting on behalf of references from different 819 * counting on behalf of references from different
820 * cgroups 820 * cgroups
821 */ 821 */
822 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 822 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
823 continue; 823 continue;
824 referenced += page_referenced_one(page, vma, address, 824 referenced += page_referenced_one(page, vma, address,
825 &mapcount, vm_flags); 825 &mapcount, vm_flags);
826 if (!mapcount) 826 if (!mapcount)
827 break; 827 break;
828 } 828 }
829 829
830 mutex_unlock(&mapping->i_mmap_mutex); 830 mutex_unlock(&mapping->i_mmap_mutex);
831 return referenced; 831 return referenced;
832 } 832 }
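The pgoff computed above is the prio-tree lookup key; vma_address() then inverts linear_page_index(), turning that file offset back into a user virtual address and returning -EFAULT when the page falls outside the vma's window. A small userspace model of that arithmetic, assuming 4 KiB pages, with toy_* names standing in for the kernel's types:

    #include <stdio.h>

    #define PAGE_SHIFT 12UL                 /* assume 4 KiB pages for the model */

    struct toy_vma {
        unsigned long vm_start, vm_end;     /* user VA range of the mapping */
        unsigned long vm_pgoff;             /* file offset of vm_start, in pages */
    };

    /* Invert linear_page_index(): map a file page offset back to a user VA,
     * or report -EFAULT when the page lies outside this vma's window. */
    static unsigned long toy_vma_address(unsigned long pgoff, const struct toy_vma *vma)
    {
        unsigned long address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        if (address < vma->vm_start || address >= vma->vm_end)
            return (unsigned long)-14;      /* -EFAULT in the model */
        return address;
    }

    int main(void)
    {
        /* Maps file pages [16, 48) starting at 0x700000000000. */
        struct toy_vma vma = { 0x700000000000UL, 0x700000020000UL, 16 };

        printf("pgoff 20 -> %#lx\n", toy_vma_address(20, &vma)); /* inside the window */
        printf("pgoff 64 -> %#lx\n", toy_vma_address(64, &vma)); /* outside: -EFAULT */
        return 0;
    }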
833 833
834 /** 834 /**
835 * page_referenced - test if the page was referenced 835 * page_referenced - test if the page was referenced
836 * @page: the page to test 836 * @page: the page to test
837 * @is_locked: caller holds lock on the page 837 * @is_locked: caller holds lock on the page
838 * @mem_cont: target memory controller 838 * @mem_cont: target memory controller
839 * @vm_flags: collect the vm_flags of vmas which actually referenced the page 839 * @vm_flags: collect the vm_flags of vmas which actually referenced the page
840 * 840 *
841 * Quick test_and_clear_referenced for all mappings to a page, 841 * Quick test_and_clear_referenced for all mappings to a page,
842 * returns the number of ptes which referenced the page. 842 * returns the number of ptes which referenced the page.
843 */ 843 */
844 int page_referenced(struct page *page, 844 int page_referenced(struct page *page,
845 int is_locked, 845 int is_locked,
846 struct mem_cgroup *mem_cont, 846 struct mem_cgroup *mem_cont,
847 unsigned long *vm_flags) 847 unsigned long *vm_flags)
848 { 848 {
849 int referenced = 0; 849 int referenced = 0;
850 int we_locked = 0; 850 int we_locked = 0;
851 851
852 *vm_flags = 0; 852 *vm_flags = 0;
853 if (page_mapped(page) && page_rmapping(page)) { 853 if (page_mapped(page) && page_rmapping(page)) {
854 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 854 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
855 we_locked = trylock_page(page); 855 we_locked = trylock_page(page);
856 if (!we_locked) { 856 if (!we_locked) {
857 referenced++; 857 referenced++;
858 goto out; 858 goto out;
859 } 859 }
860 } 860 }
861 if (unlikely(PageKsm(page))) 861 if (unlikely(PageKsm(page)))
862 referenced += page_referenced_ksm(page, mem_cont, 862 referenced += page_referenced_ksm(page, mem_cont,
863 vm_flags); 863 vm_flags);
864 else if (PageAnon(page)) 864 else if (PageAnon(page))
865 referenced += page_referenced_anon(page, mem_cont, 865 referenced += page_referenced_anon(page, mem_cont,
866 vm_flags); 866 vm_flags);
867 else if (page->mapping) 867 else if (page->mapping)
868 referenced += page_referenced_file(page, mem_cont, 868 referenced += page_referenced_file(page, mem_cont,
869 vm_flags); 869 vm_flags);
870 if (we_locked) 870 if (we_locked)
871 unlock_page(page); 871 unlock_page(page);
872 872
873 if (page_test_and_clear_young(page_to_pfn(page))) 873 if (page_test_and_clear_young(page_to_pfn(page)))
874 referenced++; 874 referenced++;
875 } 875 }
876 out: 876 out:
877 return referenced; 877 return referenced;
878 } 878 }
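page_referenced() therefore reports how many mappings referenced the page and ORs the vm_flags of the referencing vmas into *vm_flags. The fragment below is a purely illustrative, hypothetical consumer of that pair; toy_page_referenced() is a stub and the policy shown is not the kernel's reclaim logic:

    #include <stdio.h>

    #define VM_EXEC 0x00000004UL    /* same value as the kernel's VM_EXEC flag */

    /* Stub for page_referenced(): pretend two ptes referenced the page and
     * one of the referencing mappings was executable. */
    static int toy_page_referenced(unsigned long *vm_flags)
    {
        *vm_flags = VM_EXEC;
        return 2;
    }

    int main(void)
    {
        unsigned long vm_flags = 0;
        int referenced = toy_page_referenced(&vm_flags);

        /* One plausible policy: keep recently used pages, and be especially
         * keen to keep pages backing executable mappings. */
        if (referenced == 0)
            printf("reclaim candidate\n");
        else if (vm_flags & VM_EXEC)
            printf("keep: referenced via an executable mapping\n");
        else
            printf("keep: referenced %d times\n", referenced);
        return 0;
    }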
879 879
880 static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, 880 static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
881 unsigned long address) 881 unsigned long address)
882 { 882 {
883 struct mm_struct *mm = vma->vm_mm; 883 struct mm_struct *mm = vma->vm_mm;
884 pte_t *pte; 884 pte_t *pte;
885 spinlock_t *ptl; 885 spinlock_t *ptl;
886 int ret = 0; 886 int ret = 0;
887 887
888 pte = page_check_address(page, mm, address, &ptl, 1); 888 pte = page_check_address(page, mm, address, &ptl, 1);
889 if (!pte) 889 if (!pte)
890 goto out; 890 goto out;
891 891
892 if (pte_dirty(*pte) || pte_write(*pte)) { 892 if (pte_dirty(*pte) || pte_write(*pte)) {
893 pte_t entry; 893 pte_t entry;
894 894
895 flush_cache_page(vma, address, pte_pfn(*pte)); 895 flush_cache_page(vma, address, pte_pfn(*pte));
896 entry = ptep_clear_flush_notify(vma, address, pte); 896 entry = ptep_clear_flush_notify(vma, address, pte);
897 entry = pte_wrprotect(entry); 897 entry = pte_wrprotect(entry);
898 entry = pte_mkclean(entry); 898 entry = pte_mkclean(entry);
899 set_pte_at(mm, address, pte, entry); 899 set_pte_at(mm, address, pte, entry);
900 ret = 1; 900 ret = 1;
901 } 901 }
902 902
903 pte_unmap_unlock(pte, ptl); 903 pte_unmap_unlock(pte, ptl);
904 out: 904 out:
905 return ret; 905 return ret;
906 } 906 }
907 907
908 static int page_mkclean_file(struct address_space *mapping, struct page *page) 908 static int page_mkclean_file(struct address_space *mapping, struct page *page)
909 { 909 {
910 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 910 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
911 struct vm_area_struct *vma; 911 struct vm_area_struct *vma;
912 struct prio_tree_iter iter; 912 struct prio_tree_iter iter;
913 int ret = 0; 913 int ret = 0;
914 914
915 BUG_ON(PageAnon(page)); 915 BUG_ON(PageAnon(page));
916 916
917 mutex_lock(&mapping->i_mmap_mutex); 917 mutex_lock(&mapping->i_mmap_mutex);
918 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 918 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
919 if (vma->vm_flags & VM_SHARED) { 919 if (vma->vm_flags & VM_SHARED) {
920 unsigned long address = vma_address(page, vma); 920 unsigned long address = vma_address(page, vma);
921 if (address == -EFAULT) 921 if (address == -EFAULT)
922 continue; 922 continue;
923 ret += page_mkclean_one(page, vma, address); 923 ret += page_mkclean_one(page, vma, address);
924 } 924 }
925 } 925 }
926 mutex_unlock(&mapping->i_mmap_mutex); 926 mutex_unlock(&mapping->i_mmap_mutex);
927 return ret; 927 return ret;
928 } 928 }
929 929
930 int page_mkclean(struct page *page) 930 int page_mkclean(struct page *page)
931 { 931 {
932 int ret = 0; 932 int ret = 0;
933 933
934 BUG_ON(!PageLocked(page)); 934 BUG_ON(!PageLocked(page));
935 935
936 if (page_mapped(page)) { 936 if (page_mapped(page)) {
937 struct address_space *mapping = page_mapping(page); 937 struct address_space *mapping = page_mapping(page);
938 if (mapping) { 938 if (mapping) {
939 ret = page_mkclean_file(mapping, page); 939 ret = page_mkclean_file(mapping, page);
940 if (page_test_and_clear_dirty(page_to_pfn(page), 1)) 940 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
941 ret = 1; 941 ret = 1;
942 } 942 }
943 } 943 }
944 944
945 return ret; 945 return ret;
946 } 946 }
947 EXPORT_SYMBOL_GPL(page_mkclean); 947 EXPORT_SYMBOL_GPL(page_mkclean);
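page_mkclean_one() pairs pte_wrprotect() with pte_mkclean(): removing write permission guarantees that the next store faults and re-dirties the page, so clearing the dirty bit cannot lose a modification. Below is a userspace model of that pair of transforms on a fake pte word; the bit layout is invented for the model, real layouts are per-architecture:

    #include <stdio.h>
    #include <stdint.h>

    /* Invented bit positions, for the model only. */
    #define TOY_PTE_WRITE 0x2ULL
    #define TOY_PTE_DIRTY 0x40ULL

    static uint64_t toy_pte_wrprotect(uint64_t pte) { return pte & ~TOY_PTE_WRITE; }
    static uint64_t toy_pte_mkclean(uint64_t pte)   { return pte & ~TOY_PTE_DIRTY; }

    int main(void)
    {
        uint64_t pte = 0x1000 | TOY_PTE_WRITE | TOY_PTE_DIRTY;  /* writable, dirty */

        if (pte & (TOY_PTE_DIRTY | TOY_PTE_WRITE)) {
            /* Same transformation as page_mkclean_one(): drop write permission
             * and the dirty bit, so the next store re-faults and re-dirties. */
            pte = toy_pte_mkclean(toy_pte_wrprotect(pte));
        }
        printf("pte after mkclean: %#llx\n", (unsigned long long)pte);
        return 0;
    }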
948 948
949 /** 949 /**
950 * page_move_anon_rmap - move a page to our anon_vma 950 * page_move_anon_rmap - move a page to our anon_vma
951 * @page: the page to move to our anon_vma 951 * @page: the page to move to our anon_vma
952 * @vma: the vma the page belongs to 952 * @vma: the vma the page belongs to
953 * @address: the user virtual address mapped 953 * @address: the user virtual address mapped
954 * 954 *
955 * When a page belongs exclusively to one process after a COW event, 955 * When a page belongs exclusively to one process after a COW event,
956 * that page can be moved into the anon_vma that belongs to just that 956 * that page can be moved into the anon_vma that belongs to just that
957 * process, so the rmap code will not search the parent or sibling 957 * process, so the rmap code will not search the parent or sibling
958 * processes. 958 * processes.
959 */ 959 */
960 void page_move_anon_rmap(struct page *page, 960 void page_move_anon_rmap(struct page *page,
961 struct vm_area_struct *vma, unsigned long address) 961 struct vm_area_struct *vma, unsigned long address)
962 { 962 {
963 struct anon_vma *anon_vma = vma->anon_vma; 963 struct anon_vma *anon_vma = vma->anon_vma;
964 964
965 VM_BUG_ON(!PageLocked(page)); 965 VM_BUG_ON(!PageLocked(page));
966 VM_BUG_ON(!anon_vma); 966 VM_BUG_ON(!anon_vma);
967 VM_BUG_ON(page->index != linear_page_index(vma, address)); 967 VM_BUG_ON(page->index != linear_page_index(vma, address));
968 968
969 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 969 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
970 page->mapping = (struct address_space *) anon_vma; 970 page->mapping = (struct address_space *) anon_vma;
971 } 971 }
972 972
973 /** 973 /**
974 * __page_set_anon_rmap - set up new anonymous rmap 974 * __page_set_anon_rmap - set up new anonymous rmap
975 * @page: Page to add to rmap 975 * @page: Page to add to rmap
976 * @vma: VM area to add page to. 976 * @vma: VM area to add page to.
977 * @address: User virtual address of the mapping 977 * @address: User virtual address of the mapping
978 * @exclusive: the page is exclusively owned by the current process 978 * @exclusive: the page is exclusively owned by the current process
979 */ 979 */
980 static void __page_set_anon_rmap(struct page *page, 980 static void __page_set_anon_rmap(struct page *page,
981 struct vm_area_struct *vma, unsigned long address, int exclusive) 981 struct vm_area_struct *vma, unsigned long address, int exclusive)
982 { 982 {
983 struct anon_vma *anon_vma = vma->anon_vma; 983 struct anon_vma *anon_vma = vma->anon_vma;
984 984
985 BUG_ON(!anon_vma); 985 BUG_ON(!anon_vma);
986 986
987 if (PageAnon(page)) 987 if (PageAnon(page))
988 return; 988 return;
989 989
990 /* 990 /*
991 * If the page isn't exclusively mapped into this vma, 991 * If the page isn't exclusively mapped into this vma,
992 * we must use the _oldest_ possible anon_vma for the 992 * we must use the _oldest_ possible anon_vma for the
993 * page mapping! 993 * page mapping!
994 */ 994 */
995 if (!exclusive) 995 if (!exclusive)
996 anon_vma = anon_vma->root; 996 anon_vma = anon_vma->root;
997 997
998 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 998 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
999 page->mapping = (struct address_space *) anon_vma; 999 page->mapping = (struct address_space *) anon_vma;
1000 page->index = linear_page_index(vma, address); 1000 page->index = linear_page_index(vma, address);
1001 } 1001 }
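__page_set_anon_rmap() reuses page->mapping to hold an anon_vma pointer, marking it by setting the PAGE_MAPPING_ANON bit and relying on pointer alignment to keep that low bit free. A standalone sketch of the same low-bit tagging idiom, with toy_* names standing in for the kernel's:

    #include <stdio.h>
    #include <stdint.h>

    #define TOY_MAPPING_ANON 0x1UL   /* low pointer bit marks "this is an anon_vma" */

    struct toy_anon_vma { int dummy; };

    int main(void)
    {
        static struct toy_anon_vma av;   /* aligned, so bit 0 of its address is free */
        void *mapping;

        /* Store the anon_vma with the tag bit set, as __page_set_anon_rmap() does. */
        mapping = (void *)((uintptr_t)&av | TOY_MAPPING_ANON);

        /* A reader tests the tag first, then strips it to recover the pointer. */
        if ((uintptr_t)mapping & TOY_MAPPING_ANON) {
            struct toy_anon_vma *anon_vma =
                (struct toy_anon_vma *)((uintptr_t)mapping & ~TOY_MAPPING_ANON);
            printf("anon page, anon_vma recovered: %s\n",
                   anon_vma == &av ? "yes" : "no");
        }
        return 0;
    }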
1002 1002
1003 /** 1003 /**
1004 * __page_check_anon_rmap - sanity check anonymous rmap addition 1004 * __page_check_anon_rmap - sanity check anonymous rmap addition
1005 * @page: the page to add the mapping to 1005 * @page: the page to add the mapping to
1006 * @vma: the vm area in which the mapping is added 1006 * @vma: the vm area in which the mapping is added
1007 * @address: the user virtual address mapped 1007 * @address: the user virtual address mapped
1008 */ 1008 */
1009 static void __page_check_anon_rmap(struct page *page, 1009 static void __page_check_anon_rmap(struct page *page,
1010 struct vm_area_struct *vma, unsigned long address) 1010 struct vm_area_struct *vma, unsigned long address)
1011 { 1011 {
1012 #ifdef CONFIG_DEBUG_VM 1012 #ifdef CONFIG_DEBUG_VM
1013 /* 1013 /*
1014 * The page's anon-rmap details (mapping and index) are guaranteed to 1014 * The page's anon-rmap details (mapping and index) are guaranteed to
1015 * be set up correctly at this point. 1015 * be set up correctly at this point.
1016 * 1016 *
1017 * We have exclusion against page_add_anon_rmap because the caller 1017 * We have exclusion against page_add_anon_rmap because the caller
1018 * always holds the page locked, except if called from page_dup_rmap, 1018 * always holds the page locked, except if called from page_dup_rmap,
1019 * in which case the page is already known to be set up. 1019 * in which case the page is already known to be set up.
1020 * 1020 *
1021 * We have exclusion against page_add_new_anon_rmap because those pages 1021 * We have exclusion against page_add_new_anon_rmap because those pages
1022 * are initially only visible via the pagetables, and the pte is locked 1022 * are initially only visible via the pagetables, and the pte is locked
1023 * over the call to page_add_new_anon_rmap. 1023 * over the call to page_add_new_anon_rmap.
1024 */ 1024 */
1025 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); 1025 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
1026 BUG_ON(page->index != linear_page_index(vma, address)); 1026 BUG_ON(page->index != linear_page_index(vma, address));
1027 #endif 1027 #endif
1028 } 1028 }
1029 1029
1030 /** 1030 /**
1031 * page_add_anon_rmap - add pte mapping to an anonymous page 1031 * page_add_anon_rmap - add pte mapping to an anonymous page
1032 * @page: the page to add the mapping to 1032 * @page: the page to add the mapping to
1033 * @vma: the vm area in which the mapping is added 1033 * @vma: the vm area in which the mapping is added
1034 * @address: the user virtual address mapped 1034 * @address: the user virtual address mapped
1035 * 1035 *
1036 * The caller needs to hold the pte lock, and the page must be locked in 1036 * The caller needs to hold the pte lock, and the page must be locked in
1037 * the anon_vma case: to serialize mapping,index checking after setting, 1037 * the anon_vma case: to serialize mapping,index checking after setting,
1038 * and to ensure that PageAnon is not being upgraded racily to PageKsm 1038 * and to ensure that PageAnon is not being upgraded racily to PageKsm
1039 * (but PageKsm is never downgraded to PageAnon). 1039 * (but PageKsm is never downgraded to PageAnon).
1040 */ 1040 */
1041 void page_add_anon_rmap(struct page *page, 1041 void page_add_anon_rmap(struct page *page,
1042 struct vm_area_struct *vma, unsigned long address) 1042 struct vm_area_struct *vma, unsigned long address)
1043 { 1043 {
1044 do_page_add_anon_rmap(page, vma, address, 0); 1044 do_page_add_anon_rmap(page, vma, address, 0);
1045 } 1045 }
1046 1046
1047 /* 1047 /*
1048 * Special version of the above for do_swap_page, which often runs 1048 * Special version of the above for do_swap_page, which often runs
1049 * into pages that are exclusively owned by the current process. 1049 * into pages that are exclusively owned by the current process.
1050 * Everybody else should continue to use page_add_anon_rmap above. 1050 * Everybody else should continue to use page_add_anon_rmap above.
1051 */ 1051 */
1052 void do_page_add_anon_rmap(struct page *page, 1052 void do_page_add_anon_rmap(struct page *page,
1053 struct vm_area_struct *vma, unsigned long address, int exclusive) 1053 struct vm_area_struct *vma, unsigned long address, int exclusive)
1054 { 1054 {
1055 int first = atomic_inc_and_test(&page->_mapcount); 1055 int first = atomic_inc_and_test(&page->_mapcount);
1056 if (first) { 1056 if (first) {
1057 if (!PageTransHuge(page)) 1057 if (!PageTransHuge(page))
1058 __inc_zone_page_state(page, NR_ANON_PAGES); 1058 __inc_zone_page_state(page, NR_ANON_PAGES);
1059 else 1059 else
1060 __inc_zone_page_state(page, 1060 __inc_zone_page_state(page,
1061 NR_ANON_TRANSPARENT_HUGEPAGES); 1061 NR_ANON_TRANSPARENT_HUGEPAGES);
1062 } 1062 }
1063 if (unlikely(PageKsm(page))) 1063 if (unlikely(PageKsm(page)))
1064 return; 1064 return;
1065 1065
1066 VM_BUG_ON(!PageLocked(page)); 1066 VM_BUG_ON(!PageLocked(page));
1067 /* address might be in next vma when migration races vma_adjust */ 1067 /* address might be in next vma when migration races vma_adjust */
1068 if (first) 1068 if (first)
1069 __page_set_anon_rmap(page, vma, address, exclusive); 1069 __page_set_anon_rmap(page, vma, address, exclusive);
1070 else 1070 else
1071 __page_check_anon_rmap(page, vma, address); 1071 __page_check_anon_rmap(page, vma, address);
1072 } 1072 }
1073 1073
1074 /** 1074 /**
1075 * page_add_new_anon_rmap - add pte mapping to a new anonymous page 1075 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
1076 * @page: the page to add the mapping to 1076 * @page: the page to add the mapping to
1077 * @vma: the vm area in which the mapping is added 1077 * @vma: the vm area in which the mapping is added
1078 * @address: the user virtual address mapped 1078 * @address: the user virtual address mapped
1079 * 1079 *
1080 * Same as page_add_anon_rmap but must only be called on *new* pages. 1080 * Same as page_add_anon_rmap but must only be called on *new* pages.
1081 * This means the inc-and-test can be bypassed. 1081 * This means the inc-and-test can be bypassed.
1082 * Page does not have to be locked. 1082 * Page does not have to be locked.
1083 */ 1083 */
1084 void page_add_new_anon_rmap(struct page *page, 1084 void page_add_new_anon_rmap(struct page *page,
1085 struct vm_area_struct *vma, unsigned long address) 1085 struct vm_area_struct *vma, unsigned long address)
1086 { 1086 {
1087 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1087 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1088 SetPageSwapBacked(page); 1088 SetPageSwapBacked(page);
1089 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1089 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
1090 if (!PageTransHuge(page)) 1090 if (!PageTransHuge(page))
1091 __inc_zone_page_state(page, NR_ANON_PAGES); 1091 __inc_zone_page_state(page, NR_ANON_PAGES);
1092 else 1092 else
1093 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1093 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1094 __page_set_anon_rmap(page, vma, address, 1); 1094 __page_set_anon_rmap(page, vma, address, 1);
1095 if (page_evictable(page, vma)) 1095 if (page_evictable(page, vma))
1096 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1096 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
1097 else 1097 else
1098 add_page_to_unevictable_list(page); 1098 add_page_to_unevictable_list(page);
1099 } 1099 }
1100 1100
1101 /** 1101 /**
1102 * page_add_file_rmap - add pte mapping to a file page 1102 * page_add_file_rmap - add pte mapping to a file page
1103 * @page: the page to add the mapping to 1103 * @page: the page to add the mapping to
1104 * 1104 *
1105 * The caller needs to hold the pte lock. 1105 * The caller needs to hold the pte lock.
1106 */ 1106 */
1107 void page_add_file_rmap(struct page *page) 1107 void page_add_file_rmap(struct page *page)
1108 { 1108 {
1109 if (atomic_inc_and_test(&page->_mapcount)) { 1109 if (atomic_inc_and_test(&page->_mapcount)) {
1110 __inc_zone_page_state(page, NR_FILE_MAPPED); 1110 __inc_zone_page_state(page, NR_FILE_MAPPED);
1111 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1111 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
1112 } 1112 }
1113 } 1113 }
1114 1114
1115 /** 1115 /**
1116 * page_remove_rmap - take down pte mapping from a page 1116 * page_remove_rmap - take down pte mapping from a page
1117 * @page: page to remove mapping from 1117 * @page: page to remove mapping from
1118 * 1118 *
1119 * The caller needs to hold the pte lock. 1119 * The caller needs to hold the pte lock.
1120 */ 1120 */
1121 void page_remove_rmap(struct page *page) 1121 void page_remove_rmap(struct page *page)
1122 { 1122 {
1123 /* page still mapped by someone else? */ 1123 /* page still mapped by someone else? */
1124 if (!atomic_add_negative(-1, &page->_mapcount)) 1124 if (!atomic_add_negative(-1, &page->_mapcount))
1125 return; 1125 return;
1126 1126
1127 /* 1127 /*
1128 * Now that the last pte has gone, s390 must transfer dirty 1128 * Now that the last pte has gone, s390 must transfer dirty
1129 * flag from storage key to struct page. We can usually skip 1129 * flag from storage key to struct page. We can usually skip
1130 * this if the page is anon, so about to be freed; but perhaps 1130 * this if the page is anon, so about to be freed; but perhaps
1131 * not if it's in swapcache - there might be another pte slot 1131 * not if it's in swapcache - there might be another pte slot
1132 * containing the swap entry, but page not yet written to swap. 1132 * containing the swap entry, but page not yet written to swap.
1133 */ 1133 */
1134 if ((!PageAnon(page) || PageSwapCache(page)) && 1134 if ((!PageAnon(page) || PageSwapCache(page)) &&
1135 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1135 page_test_and_clear_dirty(page_to_pfn(page), 1))
1136 set_page_dirty(page); 1136 set_page_dirty(page);
1137 /* 1137 /*
1138 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1138 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
1139 * and not charged by memcg for now. 1139 * and not charged by memcg for now.
1140 */ 1140 */
1141 if (unlikely(PageHuge(page))) 1141 if (unlikely(PageHuge(page)))
1142 return; 1142 return;
1143 if (PageAnon(page)) { 1143 if (PageAnon(page)) {
1144 mem_cgroup_uncharge_page(page); 1144 mem_cgroup_uncharge_page(page);
1145 if (!PageTransHuge(page)) 1145 if (!PageTransHuge(page))
1146 __dec_zone_page_state(page, NR_ANON_PAGES); 1146 __dec_zone_page_state(page, NR_ANON_PAGES);
1147 else 1147 else
1148 __dec_zone_page_state(page, 1148 __dec_zone_page_state(page,
1149 NR_ANON_TRANSPARENT_HUGEPAGES); 1149 NR_ANON_TRANSPARENT_HUGEPAGES);
1150 } else { 1150 } else {
1151 __dec_zone_page_state(page, NR_FILE_MAPPED); 1151 __dec_zone_page_state(page, NR_FILE_MAPPED);
1152 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); 1152 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
1153 } 1153 }
1154 /* 1154 /*
1155 * It would be tidy to reset the PageAnon mapping here, 1155 * It would be tidy to reset the PageAnon mapping here,
1156 * but that might overwrite a racing page_add_anon_rmap 1156 * but that might overwrite a racing page_add_anon_rmap
1157 * which increments mapcount after us but sets mapping 1157 * which increments mapcount after us but sets mapping
1158 * before us: so leave the reset to free_hot_cold_page, 1158 * before us: so leave the reset to free_hot_cold_page,
1159 * and remember that it's only reliable while mapped. 1159 * and remember that it's only reliable while mapped.
1160 * Leaving it set also helps swapoff to reinstate ptes 1160 * Leaving it set also helps swapoff to reinstate ptes
1161 * faster for those pages still in swapcache. 1161 * faster for those pages still in swapcache.
1162 */ 1162 */
1163 } 1163 }
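page->_mapcount starts at -1, so atomic_inc_and_test() fires exactly on the first mapping (-1 -> 0) and atomic_add_negative(-1, ...) fires exactly when the last mapping disappears (0 -> -1); that is why page_add_*_rmap() and page_remove_rmap() touch the zone counters only on those two transitions. A C11-atomics model of the same counter protocol, with toy_* helpers that belong to the model rather than the kernel:

    #include <stdio.h>
    #include <stdatomic.h>

    /* True on the -1 -> 0 transition, i.e. this caller added the first mapping. */
    static int toy_inc_and_test(atomic_int *v)
    {
        return atomic_fetch_add(v, 1) + 1 == 0;
    }

    /* True on the 0 -> -1 transition, i.e. this caller removed the last mapping. */
    static int toy_add_negative(int i, atomic_int *v)
    {
        return atomic_fetch_add(v, i) + i < 0;
    }

    int main(void)
    {
        atomic_int mapcount;
        atomic_init(&mapcount, -1);   /* like page->_mapcount: -1 means "unmapped" */

        printf("first map?  %d\n", toy_inc_and_test(&mapcount));     /* 1 */
        printf("first map?  %d\n", toy_inc_and_test(&mapcount));     /* 0: second pte */
        printf("last unmap? %d\n", toy_add_negative(-1, &mapcount)); /* 0: one pte left */
        printf("last unmap? %d\n", toy_add_negative(-1, &mapcount)); /* 1: back to -1 */
        return 0;
    }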
1164 1164
1165 /* 1165 /*
1166 * Subfunctions of try_to_unmap: try_to_unmap_one called 1166 * Subfunctions of try_to_unmap: try_to_unmap_one called
1167 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 1167 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
1168 */ 1168 */
1169 int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1169 int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1170 unsigned long address, enum ttu_flags flags) 1170 unsigned long address, enum ttu_flags flags)
1171 { 1171 {
1172 struct mm_struct *mm = vma->vm_mm; 1172 struct mm_struct *mm = vma->vm_mm;
1173 pte_t *pte; 1173 pte_t *pte;
1174 pte_t pteval; 1174 pte_t pteval;
1175 spinlock_t *ptl; 1175 spinlock_t *ptl;
1176 int ret = SWAP_AGAIN; 1176 int ret = SWAP_AGAIN;
1177 1177
1178 pte = page_check_address(page, mm, address, &ptl, 0); 1178 pte = page_check_address(page, mm, address, &ptl, 0);
1179 if (!pte) 1179 if (!pte)
1180 goto out; 1180 goto out;
1181 1181
1182 /* 1182 /*
1183 * If the page is mlock()d, we cannot swap it out. 1183 * If the page is mlock()d, we cannot swap it out.
1184 * If it's recently referenced (perhaps page_referenced 1184 * If it's recently referenced (perhaps page_referenced
1185 * skipped over this mm) then we should reactivate it. 1185 * skipped over this mm) then we should reactivate it.
1186 */ 1186 */
1187 if (!(flags & TTU_IGNORE_MLOCK)) { 1187 if (!(flags & TTU_IGNORE_MLOCK)) {
1188 if (vma->vm_flags & VM_LOCKED) 1188 if (vma->vm_flags & VM_LOCKED)
1189 goto out_mlock; 1189 goto out_mlock;
1190 1190
1191 if (TTU_ACTION(flags) == TTU_MUNLOCK) 1191 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1192 goto out_unmap; 1192 goto out_unmap;
1193 } 1193 }
1194 if (!(flags & TTU_IGNORE_ACCESS)) { 1194 if (!(flags & TTU_IGNORE_ACCESS)) {
1195 if (ptep_clear_flush_young_notify(vma, address, pte)) { 1195 if (ptep_clear_flush_young_notify(vma, address, pte)) {
1196 ret = SWAP_FAIL; 1196 ret = SWAP_FAIL;
1197 goto out_unmap; 1197 goto out_unmap;
1198 } 1198 }
1199 } 1199 }
1200 1200
1201 /* Nuke the page table entry. */ 1201 /* Nuke the page table entry. */
1202 flush_cache_page(vma, address, page_to_pfn(page)); 1202 flush_cache_page(vma, address, page_to_pfn(page));
1203 pteval = ptep_clear_flush_notify(vma, address, pte); 1203 pteval = ptep_clear_flush_notify(vma, address, pte);
1204 1204
1205 /* Move the dirty bit to the physical page now the pte is gone. */ 1205 /* Move the dirty bit to the physical page now the pte is gone. */
1206 if (pte_dirty(pteval)) 1206 if (pte_dirty(pteval))
1207 set_page_dirty(page); 1207 set_page_dirty(page);
1208 1208
1209 /* Update high watermark before we lower rss */ 1209 /* Update high watermark before we lower rss */
1210 update_hiwater_rss(mm); 1210 update_hiwater_rss(mm);
1211 1211
1212 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1212 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1213 if (PageAnon(page)) 1213 if (PageAnon(page))
1214 dec_mm_counter(mm, MM_ANONPAGES); 1214 dec_mm_counter(mm, MM_ANONPAGES);
1215 else 1215 else
1216 dec_mm_counter(mm, MM_FILEPAGES); 1216 dec_mm_counter(mm, MM_FILEPAGES);
1217 set_pte_at(mm, address, pte, 1217 set_pte_at(mm, address, pte,
1218 swp_entry_to_pte(make_hwpoison_entry(page))); 1218 swp_entry_to_pte(make_hwpoison_entry(page)));
1219 } else if (PageAnon(page)) { 1219 } else if (PageAnon(page)) {
1220 swp_entry_t entry = { .val = page_private(page) }; 1220 swp_entry_t entry = { .val = page_private(page) };
1221 1221
1222 if (PageSwapCache(page)) { 1222 if (PageSwapCache(page)) {
1223 /* 1223 /*
1224 * Store the swap location in the pte. 1224 * Store the swap location in the pte.
1225 * See handle_pte_fault() ... 1225 * See handle_pte_fault() ...
1226 */ 1226 */
1227 if (swap_duplicate(entry) < 0) { 1227 if (swap_duplicate(entry) < 0) {
1228 set_pte_at(mm, address, pte, pteval); 1228 set_pte_at(mm, address, pte, pteval);
1229 ret = SWAP_FAIL; 1229 ret = SWAP_FAIL;
1230 goto out_unmap; 1230 goto out_unmap;
1231 } 1231 }
1232 if (list_empty(&mm->mmlist)) { 1232 if (list_empty(&mm->mmlist)) {
1233 spin_lock(&mmlist_lock); 1233 spin_lock(&mmlist_lock);
1234 if (list_empty(&mm->mmlist)) 1234 if (list_empty(&mm->mmlist))
1235 list_add(&mm->mmlist, &init_mm.mmlist); 1235 list_add(&mm->mmlist, &init_mm.mmlist);
1236 spin_unlock(&mmlist_lock); 1236 spin_unlock(&mmlist_lock);
1237 } 1237 }
1238 dec_mm_counter(mm, MM_ANONPAGES); 1238 dec_mm_counter(mm, MM_ANONPAGES);
1239 inc_mm_counter(mm, MM_SWAPENTS); 1239 inc_mm_counter(mm, MM_SWAPENTS);
1240 } else if (PAGE_MIGRATION) { 1240 } else if (PAGE_MIGRATION) {
1241 /* 1241 /*
1242 * Store the pfn of the page in a special migration 1242 * Store the pfn of the page in a special migration
1243 * pte. do_swap_page() will wait until the migration 1243 * pte. do_swap_page() will wait until the migration
1244 * pte is removed and then restart fault handling. 1244 * pte is removed and then restart fault handling.
1245 */ 1245 */
1246 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); 1246 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
1247 entry = make_migration_entry(page, pte_write(pteval)); 1247 entry = make_migration_entry(page, pte_write(pteval));
1248 } 1248 }
1249 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1249 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1250 BUG_ON(pte_file(*pte)); 1250 BUG_ON(pte_file(*pte));
1251 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { 1251 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
1252 /* Establish migration entry for a file page */ 1252 /* Establish migration entry for a file page */
1253 swp_entry_t entry; 1253 swp_entry_t entry;
1254 entry = make_migration_entry(page, pte_write(pteval)); 1254 entry = make_migration_entry(page, pte_write(pteval));
1255 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1255 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1256 } else 1256 } else
1257 dec_mm_counter(mm, MM_FILEPAGES); 1257 dec_mm_counter(mm, MM_FILEPAGES);
1258 1258
1259 page_remove_rmap(page); 1259 page_remove_rmap(page);
1260 page_cache_release(page); 1260 page_cache_release(page);
1261 1261
1262 out_unmap: 1262 out_unmap:
1263 pte_unmap_unlock(pte, ptl); 1263 pte_unmap_unlock(pte, ptl);
1264 out: 1264 out:
1265 return ret; 1265 return ret;
1266 1266
1267 out_mlock: 1267 out_mlock:
1268 pte_unmap_unlock(pte, ptl); 1268 pte_unmap_unlock(pte, ptl);
1269 1269
1270 1270
1271 /* 1271 /*
1272 * We need mmap_sem locking. Otherwise the VM_LOCKED check gives an 1272 * We need mmap_sem locking. Otherwise the VM_LOCKED check gives an
1273 * unstable, racy result. Plus, we can't wait here because 1273 * unstable, racy result. Plus, we can't wait here because
1274 * we now hold anon_vma->mutex or mapping->i_mmap_mutex. 1274 * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
1275 * If the trylock fails, the page remains on the evictable lru and later 1275 * If the trylock fails, the page remains on the evictable lru and later
1276 * vmscan can retry moving it to the unevictable lru if the 1276 * vmscan can retry moving it to the unevictable lru if the
1277 * page is actually mlocked. 1277 * page is actually mlocked.
1278 */ 1278 */
1279 if (down_read_trylock(&vma->vm_mm->mmap_sem)) { 1279 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
1280 if (vma->vm_flags & VM_LOCKED) { 1280 if (vma->vm_flags & VM_LOCKED) {
1281 mlock_vma_page(page); 1281 mlock_vma_page(page);
1282 ret = SWAP_MLOCK; 1282 ret = SWAP_MLOCK;
1283 } 1283 }
1284 up_read(&vma->vm_mm->mmap_sem); 1284 up_read(&vma->vm_mm->mmap_sem);
1285 } 1285 }
1286 return ret; 1286 return ret;
1287 } 1287 }
1288 1288
1289 /* 1289 /*
1290 * objrmap doesn't work for nonlinear VMAs because the assumption that 1290 * objrmap doesn't work for nonlinear VMAs because the assumption that
1291 * offset-into-file correlates with offset-into-virtual-addresses does not hold. 1291 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
1292 * Consequently, given a particular page and its ->index, we cannot locate the 1292 * Consequently, given a particular page and its ->index, we cannot locate the
1293 * ptes which are mapping that page without an exhaustive linear search. 1293 * ptes which are mapping that page without an exhaustive linear search.
1294 * 1294 *
1295 * So what this code does is a mini "virtual scan" of each nonlinear VMA which 1295 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
1296 * maps the file to which the target page belongs. The ->vm_private_data field 1296 * maps the file to which the target page belongs. The ->vm_private_data field
1297 * holds the current cursor into that scan. Successive searches will circulate 1297 * holds the current cursor into that scan. Successive searches will circulate
1298 * around the vma's virtual address space. 1298 * around the vma's virtual address space.
1299 * 1299 *
1300 * So as more replacement pressure is applied to the pages in a nonlinear VMA, 1300 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
1301 * more scanning pressure is placed against them as well. Eventually pages 1301 * more scanning pressure is placed against them as well. Eventually pages
1302 * will become fully unmapped and are eligible for eviction. 1302 * will become fully unmapped and are eligible for eviction.
1303 * 1303 *
1304 * For very sparsely populated VMAs this is a little inefficient - chances are 1304 * For very sparsely populated VMAs this is a little inefficient - chances are
1305 * there won't be many ptes located within the scan cluster. In this case 1305 * there won't be many ptes located within the scan cluster. In this case
1306 * maybe we could scan further - to the end of the pte page, perhaps. 1306 * maybe we could scan further - to the end of the pte page, perhaps.
1307 * 1307 *
1308 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can 1308 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
1309 * acquire it without blocking. If vma locked, mlock the pages in the cluster, 1309 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
1310 * rather than unmapping them. If we encounter the "check_page" that vmscan is 1310 * rather than unmapping them. If we encounter the "check_page" that vmscan is
1311 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. 1311 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
1312 */ 1312 */
1313 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 1313 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
1314 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 1314 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
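try_to_unmap_cluster() below rounds vm_start + cursor down to a CLUSTER_SIZE boundary and then clamps the window to the vma, so each call scans at most one aligned cluster of ptes. A quick userspace check of that arithmetic, assuming 4 KiB pages and 2 MiB PMDs (giving a 32-page cluster):

    #include <stdio.h>

    /* Model constants: 4 KiB pages, 2 MiB PMDs. */
    #define TOY_PAGE_SIZE    4096UL
    #define TOY_PMD_SIZE     (2UL << 20)
    #define TOY_CLUSTER_SIZE ((32 * TOY_PAGE_SIZE) < TOY_PMD_SIZE ? \
                              (32 * TOY_PAGE_SIZE) : TOY_PMD_SIZE)
    #define TOY_CLUSTER_MASK (~(TOY_CLUSTER_SIZE - 1))

    int main(void)
    {
        unsigned long vm_start = 0x700000001000UL;  /* deliberately unaligned start */
        unsigned long vm_end   = 0x700000041000UL;
        unsigned long cursor   = 0x23456UL;         /* scan position within the vma */

        /* Same arithmetic as try_to_unmap_cluster(): align down, then clamp. */
        unsigned long address = (vm_start + cursor) & TOY_CLUSTER_MASK;
        unsigned long end = address + TOY_CLUSTER_SIZE;

        if (address < vm_start)
            address = vm_start;
        if (end > vm_end)
            end = vm_end;

        printf("cluster window: [%#lx, %#lx), %lu pages\n",
               address, end, (end - address) / TOY_PAGE_SIZE);
        return 0;
    }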
1315 1315
1316 static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, 1316 static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1317 struct vm_area_struct *vma, struct page *check_page) 1317 struct vm_area_struct *vma, struct page *check_page)
1318 { 1318 {
1319 struct mm_struct *mm = vma->vm_mm; 1319 struct mm_struct *mm = vma->vm_mm;
1320 pgd_t *pgd; 1320 pgd_t *pgd;
1321 pud_t *pud; 1321 pud_t *pud;
1322 pmd_t *pmd; 1322 pmd_t *pmd;
1323 pte_t *pte; 1323 pte_t *pte;
1324 pte_t pteval; 1324 pte_t pteval;
1325 spinlock_t *ptl; 1325 spinlock_t *ptl;
1326 struct page *page; 1326 struct page *page;
1327 unsigned long address; 1327 unsigned long address;
1328 unsigned long end; 1328 unsigned long end;
1329 int ret = SWAP_AGAIN; 1329 int ret = SWAP_AGAIN;
1330 int locked_vma = 0; 1330 int locked_vma = 0;
1331 1331
1332 address = (vma->vm_start + cursor) & CLUSTER_MASK; 1332 address = (vma->vm_start + cursor) & CLUSTER_MASK;
1333 end = address + CLUSTER_SIZE; 1333 end = address + CLUSTER_SIZE;
1334 if (address < vma->vm_start) 1334 if (address < vma->vm_start)
1335 address = vma->vm_start; 1335 address = vma->vm_start;
1336 if (end > vma->vm_end) 1336 if (end > vma->vm_end)
1337 end = vma->vm_end; 1337 end = vma->vm_end;
1338 1338
1339 pgd = pgd_offset(mm, address); 1339 pgd = pgd_offset(mm, address);
1340 if (!pgd_present(*pgd)) 1340 if (!pgd_present(*pgd))
1341 return ret; 1341 return ret;
1342 1342
1343 pud = pud_offset(pgd, address); 1343 pud = pud_offset(pgd, address);
1344 if (!pud_present(*pud)) 1344 if (!pud_present(*pud))
1345 return ret; 1345 return ret;
1346 1346
1347 pmd = pmd_offset(pud, address); 1347 pmd = pmd_offset(pud, address);
1348 if (!pmd_present(*pmd)) 1348 if (!pmd_present(*pmd))
1349 return ret; 1349 return ret;
1350 1350
1351 /* 1351 /*
1352 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, 1352 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1353 * keep the sem while scanning the cluster for mlocking pages. 1353 * keep the sem while scanning the cluster for mlocking pages.
1354 */ 1354 */
1355 if (down_read_trylock(&vma->vm_mm->mmap_sem)) { 1355 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
1356 locked_vma = (vma->vm_flags & VM_LOCKED); 1356 locked_vma = (vma->vm_flags & VM_LOCKED);
1357 if (!locked_vma) 1357 if (!locked_vma)
1358 up_read(&vma->vm_mm->mmap_sem); /* don't need it */ 1358 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
1359 } 1359 }
1360 1360
1361 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 1361 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1362 1362
1363 /* Update high watermark before we lower rss */ 1363 /* Update high watermark before we lower rss */
1364 update_hiwater_rss(mm); 1364 update_hiwater_rss(mm);
1365 1365
1366 for (; address < end; pte++, address += PAGE_SIZE) { 1366 for (; address < end; pte++, address += PAGE_SIZE) {
1367 if (!pte_present(*pte)) 1367 if (!pte_present(*pte))
1368 continue; 1368 continue;
1369 page = vm_normal_page(vma, address, *pte); 1369 page = vm_normal_page(vma, address, *pte);
1370 BUG_ON(!page || PageAnon(page)); 1370 BUG_ON(!page || PageAnon(page));
1371 1371
1372 if (locked_vma) { 1372 if (locked_vma) {
1373 mlock_vma_page(page); /* no-op if already mlocked */ 1373 mlock_vma_page(page); /* no-op if already mlocked */
1374 if (page == check_page) 1374 if (page == check_page)
1375 ret = SWAP_MLOCK; 1375 ret = SWAP_MLOCK;
1376 continue; /* don't unmap */ 1376 continue; /* don't unmap */
1377 } 1377 }
1378 1378
1379 if (ptep_clear_flush_young_notify(vma, address, pte)) 1379 if (ptep_clear_flush_young_notify(vma, address, pte))
1380 continue; 1380 continue;
1381 1381
1382 /* Nuke the page table entry. */ 1382 /* Nuke the page table entry. */
1383 flush_cache_page(vma, address, pte_pfn(*pte)); 1383 flush_cache_page(vma, address, pte_pfn(*pte));
1384 pteval = ptep_clear_flush_notify(vma, address, pte); 1384 pteval = ptep_clear_flush_notify(vma, address, pte);
1385 1385
1386 /* If nonlinear, store the file page offset in the pte. */ 1386 /* If nonlinear, store the file page offset in the pte. */
1387 if (page->index != linear_page_index(vma, address)) 1387 if (page->index != linear_page_index(vma, address))
1388 set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); 1388 set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
1389 1389
1390 /* Move the dirty bit to the physical page now the pte is gone. */ 1390 /* Move the dirty bit to the physical page now the pte is gone. */
1391 if (pte_dirty(pteval)) 1391 if (pte_dirty(pteval))
1392 set_page_dirty(page); 1392 set_page_dirty(page);
1393 1393
1394 page_remove_rmap(page); 1394 page_remove_rmap(page);
1395 page_cache_release(page); 1395 page_cache_release(page);
1396 dec_mm_counter(mm, MM_FILEPAGES); 1396 dec_mm_counter(mm, MM_FILEPAGES);
1397 (*mapcount)--; 1397 (*mapcount)--;
1398 } 1398 }
1399 pte_unmap_unlock(pte - 1, ptl); 1399 pte_unmap_unlock(pte - 1, ptl);
1400 if (locked_vma) 1400 if (locked_vma)
1401 up_read(&vma->vm_mm->mmap_sem); 1401 up_read(&vma->vm_mm->mmap_sem);
1402 return ret; 1402 return ret;
1403 } 1403 }
1404 1404
1405 bool is_vma_temporary_stack(struct vm_area_struct *vma) 1405 bool is_vma_temporary_stack(struct vm_area_struct *vma)
1406 { 1406 {
1407 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1407 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1408 1408
1409 if (!maybe_stack) 1409 if (!maybe_stack)
1410 return false; 1410 return false;
1411 1411
1412 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == 1412 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1413 VM_STACK_INCOMPLETE_SETUP) 1413 VM_STACK_INCOMPLETE_SETUP)
1414 return true; 1414 return true;
1415 1415
1416 return false; 1416 return false;
1417 } 1417 }
1418 1418
1419 /** 1419 /**
1420 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1420 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1421 * rmap method 1421 * rmap method
1422 * @page: the page to unmap/unlock 1422 * @page: the page to unmap/unlock
1423 * @flags: action and flags 1423 * @flags: action and flags
1424 * 1424 *
1425 * Find all the mappings of a page using the mapping pointer and the vma chains 1425 * Find all the mappings of a page using the mapping pointer and the vma chains
1426 * contained in the anon_vma struct it points to. 1426 * contained in the anon_vma struct it points to.
1427 * 1427 *
1428 * This function is only called from try_to_unmap/try_to_munlock for 1428 * This function is only called from try_to_unmap/try_to_munlock for
1429 * anonymous pages. 1429 * anonymous pages.
1430 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1430 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1431 * where the page was found will be held for write. So, we won't recheck 1431 * where the page was found will be held for write. So, we won't recheck
1432 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1432 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1433 * VM_LOCKED. 1433 * VM_LOCKED.
1434 */ 1434 */
1435 static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1435 static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1436 { 1436 {
1437 struct anon_vma *anon_vma; 1437 struct anon_vma *anon_vma;
1438 struct anon_vma_chain *avc; 1438 struct anon_vma_chain *avc;
1439 int ret = SWAP_AGAIN; 1439 int ret = SWAP_AGAIN;
1440 1440
1441 anon_vma = page_lock_anon_vma(page); 1441 anon_vma = page_lock_anon_vma(page);
1442 if (!anon_vma) 1442 if (!anon_vma)
1443 return ret; 1443 return ret;
1444 1444
1445 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1445 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1446 struct vm_area_struct *vma = avc->vma; 1446 struct vm_area_struct *vma = avc->vma;
1447 unsigned long address; 1447 unsigned long address;
1448 1448
1449 /* 1449 /*
1450 * During exec, a temporary VMA is set up and later moved. 1450 * During exec, a temporary VMA is set up and later moved.
1451 * The VMA is moved under the anon_vma lock but not the 1451 * The VMA is moved under the anon_vma lock but not the
1452 * page tables leading to a race where migration cannot 1452 * page tables leading to a race where migration cannot
1453 * find the migration ptes. Rather than increasing the 1453 * find the migration ptes. Rather than increasing the
1454 * locking requirements of exec(), migration skips 1454 * locking requirements of exec(), migration skips
1455 * temporary VMAs until after exec() completes. 1455 * temporary VMAs until after exec() completes.
1456 */ 1456 */
1457 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && 1457 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
1458 is_vma_temporary_stack(vma)) 1458 is_vma_temporary_stack(vma))
1459 continue; 1459 continue;
1460 1460
1461 address = vma_address(page, vma); 1461 address = vma_address(page, vma);
1462 if (address == -EFAULT) 1462 if (address == -EFAULT)
1463 continue; 1463 continue;
1464 ret = try_to_unmap_one(page, vma, address, flags); 1464 ret = try_to_unmap_one(page, vma, address, flags);
1465 if (ret != SWAP_AGAIN || !page_mapped(page)) 1465 if (ret != SWAP_AGAIN || !page_mapped(page))
1466 break; 1466 break;
1467 } 1467 }
1468 1468
1469 page_unlock_anon_vma(anon_vma); 1469 page_unlock_anon_vma(anon_vma);
1470 return ret; 1470 return ret;
1471 } 1471 }
1472 1472
1473 /** 1473 /**
1474 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1474 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1475 * @page: the page to unmap/unlock 1475 * @page: the page to unmap/unlock
1476 * @flags: action and flags 1476 * @flags: action and flags
1477 * 1477 *
1478 * Find all the mappings of a page using the mapping pointer and the vma chains 1478 * Find all the mappings of a page using the mapping pointer and the vma chains
1479 * contained in the address_space struct it points to. 1479 * contained in the address_space struct it points to.
1480 * 1480 *
1481 * This function is only called from try_to_unmap/try_to_munlock for 1481 * This function is only called from try_to_unmap/try_to_munlock for
1482 * object-based pages. 1482 * object-based pages.
1483 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1483 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1484 * where the page was found will be held for write. So, we won't recheck 1484 * where the page was found will be held for write. So, we won't recheck
1485 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1485 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1486 * VM_LOCKED. 1486 * VM_LOCKED.
1487 */ 1487 */
1488 static int try_to_unmap_file(struct page *page, enum ttu_flags flags) 1488 static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1489 { 1489 {
1490 struct address_space *mapping = page->mapping; 1490 struct address_space *mapping = page->mapping;
1491 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1491 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1492 struct vm_area_struct *vma; 1492 struct vm_area_struct *vma;
1493 struct prio_tree_iter iter; 1493 struct prio_tree_iter iter;
1494 int ret = SWAP_AGAIN; 1494 int ret = SWAP_AGAIN;
1495 unsigned long cursor; 1495 unsigned long cursor;
1496 unsigned long max_nl_cursor = 0; 1496 unsigned long max_nl_cursor = 0;
1497 unsigned long max_nl_size = 0; 1497 unsigned long max_nl_size = 0;
1498 unsigned int mapcount; 1498 unsigned int mapcount;
1499 1499
1500 mutex_lock(&mapping->i_mmap_mutex); 1500 mutex_lock(&mapping->i_mmap_mutex);
1501 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1501 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1502 unsigned long address = vma_address(page, vma); 1502 unsigned long address = vma_address(page, vma);
1503 if (address == -EFAULT) 1503 if (address == -EFAULT)
1504 continue; 1504 continue;
1505 ret = try_to_unmap_one(page, vma, address, flags); 1505 ret = try_to_unmap_one(page, vma, address, flags);
1506 if (ret != SWAP_AGAIN || !page_mapped(page)) 1506 if (ret != SWAP_AGAIN || !page_mapped(page))
1507 goto out; 1507 goto out;
1508 } 1508 }
1509 1509
1510 if (list_empty(&mapping->i_mmap_nonlinear)) 1510 if (list_empty(&mapping->i_mmap_nonlinear))
1511 goto out; 1511 goto out;
1512 1512
1513 /* 1513 /*
1514 * We don't bother to try to find the munlocked page in nonlinears. 1514 * We don't bother to try to find the munlocked page in nonlinears.
1515 * It's costly. Instead, later, page reclaim logic may call 1515 * It's costly. Instead, later, page reclaim logic may call
1516 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. 1516 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1517 */ 1517 */
1518 if (TTU_ACTION(flags) == TTU_MUNLOCK) 1518 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1519 goto out; 1519 goto out;
1520 1520
1521 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1521 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1522 shared.vm_set.list) { 1522 shared.vm_set.list) {
1523 cursor = (unsigned long) vma->vm_private_data; 1523 cursor = (unsigned long) vma->vm_private_data;
1524 if (cursor > max_nl_cursor) 1524 if (cursor > max_nl_cursor)
1525 max_nl_cursor = cursor; 1525 max_nl_cursor = cursor;
1526 cursor = vma->vm_end - vma->vm_start; 1526 cursor = vma->vm_end - vma->vm_start;
1527 if (cursor > max_nl_size) 1527 if (cursor > max_nl_size)
1528 max_nl_size = cursor; 1528 max_nl_size = cursor;
1529 } 1529 }
1530 1530
1531 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ 1531 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
1532 ret = SWAP_FAIL; 1532 ret = SWAP_FAIL;
1533 goto out; 1533 goto out;
1534 } 1534 }
1535 1535
1536 /* 1536 /*
1537 * We don't try to search for this page in the nonlinear vmas, 1537 * We don't try to search for this page in the nonlinear vmas,
1538 * and page_referenced wouldn't have found it anyway. Instead 1538 * and page_referenced wouldn't have found it anyway. Instead
1539 * just walk the nonlinear vmas trying to age and unmap some. 1539 * just walk the nonlinear vmas trying to age and unmap some.
1540 * The mapcount of the page we came in with is irrelevant, 1540 * The mapcount of the page we came in with is irrelevant,
1541 * but even so use it as a guide to how hard we should try? 1541 * but even so use it as a guide to how hard we should try?
1542 */ 1542 */
1543 mapcount = page_mapcount(page); 1543 mapcount = page_mapcount(page);
1544 if (!mapcount) 1544 if (!mapcount)
1545 goto out; 1545 goto out;
1546 cond_resched(); 1546 cond_resched();
1547 1547
1548 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1548 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1549 if (max_nl_cursor == 0) 1549 if (max_nl_cursor == 0)
1550 max_nl_cursor = CLUSTER_SIZE; 1550 max_nl_cursor = CLUSTER_SIZE;
1551 1551
1552 do { 1552 do {
1553 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1553 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1554 shared.vm_set.list) { 1554 shared.vm_set.list) {
1555 cursor = (unsigned long) vma->vm_private_data; 1555 cursor = (unsigned long) vma->vm_private_data;
1556 while ( cursor < max_nl_cursor && 1556 while ( cursor < max_nl_cursor &&
1557 cursor < vma->vm_end - vma->vm_start) { 1557 cursor < vma->vm_end - vma->vm_start) {
1558 if (try_to_unmap_cluster(cursor, &mapcount, 1558 if (try_to_unmap_cluster(cursor, &mapcount,
1559 vma, page) == SWAP_MLOCK) 1559 vma, page) == SWAP_MLOCK)
1560 ret = SWAP_MLOCK; 1560 ret = SWAP_MLOCK;
1561 cursor += CLUSTER_SIZE; 1561 cursor += CLUSTER_SIZE;
1562 vma->vm_private_data = (void *) cursor; 1562 vma->vm_private_data = (void *) cursor;
1563 if ((int)mapcount <= 0) 1563 if ((int)mapcount <= 0)
1564 goto out; 1564 goto out;
1565 } 1565 }
1566 vma->vm_private_data = (void *) max_nl_cursor; 1566 vma->vm_private_data = (void *) max_nl_cursor;
1567 } 1567 }
1568 cond_resched(); 1568 cond_resched();
1569 max_nl_cursor += CLUSTER_SIZE; 1569 max_nl_cursor += CLUSTER_SIZE;
1570 } while (max_nl_cursor <= max_nl_size); 1570 } while (max_nl_cursor <= max_nl_size);
1571 1571
1572 /* 1572 /*
1573 * Don't loop forever (perhaps all the remaining pages are 1573 * Don't loop forever (perhaps all the remaining pages are
1574 * in locked vmas). Reset cursor on all unreserved nonlinear 1574 * in locked vmas). Reset cursor on all unreserved nonlinear
1575 * vmas, now forgetting on which ones it had fallen behind. 1575 * vmas, now forgetting on which ones it had fallen behind.
1576 */ 1576 */
1577 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1577 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1578 vma->vm_private_data = NULL; 1578 vma->vm_private_data = NULL;
1579 out: 1579 out:
1580 mutex_unlock(&mapping->i_mmap_mutex); 1580 mutex_unlock(&mapping->i_mmap_mutex);
1581 return ret; 1581 return ret;
1582 } 1582 }
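The nonlinear fallback above spreads pressure round-robin: each nonlinear vma keeps its scan cursor in vm_private_data, every outer pass raises max_nl_cursor by one cluster, and the loop ends once that cap exceeds the largest (cluster-rounded) vma. The model below replays that schedule with a printf standing in for try_to_unmap_cluster(); the struct and sizes are invented for illustration:

    #include <stdio.h>

    #define CLUSTER 0x20000UL   /* model value: 32 pages of 4 KiB */

    struct toy_nl_vma {
        unsigned long size;     /* vm_end - vm_start */
        unsigned long cursor;   /* plays the role of vm_private_data */
    };

    int main(void)
    {
        struct toy_nl_vma vmas[] = { { 3 * CLUSTER, 0 }, { 5 * CLUSTER, 0 } };
        unsigned long max_nl_size = 5 * CLUSTER;   /* largest vma, cluster rounded */
        unsigned long max_nl_cursor = CLUSTER;     /* as when all cursors start at 0 */

        do {
            for (unsigned int i = 0; i < 2; i++) {
                struct toy_nl_vma *vma = &vmas[i];
                unsigned long cursor = vma->cursor;

                while (cursor < max_nl_cursor && cursor < vma->size) {
                    /* Here the kernel would call try_to_unmap_cluster(). */
                    printf("pass cap %#lx: vma %u, cluster at %#lx\n",
                           max_nl_cursor, i, cursor);
                    cursor += CLUSTER;
                    vma->cursor = cursor;
                }
                vma->cursor = max_nl_cursor;
            }
            max_nl_cursor += CLUSTER;
        } while (max_nl_cursor <= max_nl_size);

        return 0;
    }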

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 * SWAP_MLOCK	- page is mlocked.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
	int ret;

	BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));

	if (unlikely(PageKsm(page)))
		ret = try_to_unmap_ksm(page, flags);
	else if (PageAnon(page))
		ret = try_to_unmap_anon(page, flags);
	else
		ret = try_to_unmap_file(page, flags);
	if (ret != SWAP_MLOCK && !page_mapped(page))
		ret = SWAP_SUCCESS;
	return ret;
}
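/*
 * Illustrative sketch, not part of the original file: a reclaim-style
 * caller typically holds the page lock and dispatches on the return
 * values documented above.  The helper name and the error codes used
 * as action values below are hypothetical, chosen only for the example.
 */
#if 0	/* example only, never compiled */
static int example_reclaim_unmap(struct page *page)
{
	BUG_ON(!PageLocked(page));	/* try_to_unmap() insists on this too */

	switch (try_to_unmap(page, TTU_UNMAP)) {
	case SWAP_FAIL:		/* unswappable: keep the page active */
		return -EBUSY;
	case SWAP_AGAIN:	/* missed a mapping: retry on a later pass */
		return -EAGAIN;
	case SWAP_MLOCK:	/* mlocked: belongs on the unevictable list */
		return -EPERM;
	default:		/* SWAP_SUCCESS: safe to try freeing the page */
		return 0;
	}
}
#endif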

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * Return values are:
 *
 * SWAP_AGAIN	- no vma is holding page mlocked, or,
 * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
 * SWAP_FAIL	- page cannot be located at present
 * SWAP_MLOCK	- page is now mlocked.
 */
int try_to_munlock(struct page *page)
{
	VM_BUG_ON(!PageLocked(page) || PageLRU(page));

	if (unlikely(PageKsm(page)))
		return try_to_unmap_ksm(page, TTU_MUNLOCK);
	else if (PageAnon(page))
		return try_to_unmap_anon(page, TTU_MUNLOCK);
	else
		return try_to_unmap_file(page, TTU_MUNLOCK);
}
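/*
 * Illustrative sketch, not part of the original file: the munlock path
 * asks try_to_munlock() whether any *other* vma still holds the page
 * mlocked before letting the page back onto a normal LRU list.  The
 * helper below is a hypothetical outline, not the mm/mlock.c call site.
 */
#if 0	/* example only, never compiled */
static void example_munlock_isolated_page(struct page *page)
{
	/*
	 * SWAP_MLOCK means some other vma re-mlocked the page, so it
	 * must stay unevictable; any other return value means no vma
	 * objected and the page may become evictable again.
	 */
	if (try_to_munlock(page) != SWAP_MLOCK)
		count_vm_event(UNEVICTABLE_PGMUNLOCKED);

	putback_lru_page(page);	/* rechecks mlock state via page_evictable() */
}
#endif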

void __put_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma *root = anon_vma->root;

	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);

	anon_vma_free(anon_vma);
}
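/*
 * Illustrative sketch, not part of the original file: __put_anon_vma()
 * is the slow path of dropping an anon_vma reference; the fast-path
 * wrapper looks roughly like this (see include/linux/rmap.h for the
 * real helper -- the name below is hypothetical).
 */
#if 0	/* example only, never compiled */
static inline void example_put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}
#endif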

#ifdef CONFIG_MIGRATION
/*
 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
 * Called by migrate.c to remove migration ptes, but might be used more later.
 */
static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;
	int ret = SWAP_AGAIN;

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem. Users without mmap_sem are required to
	 * take a reference count to prevent the anon_vma disappearing
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return ret;
	anon_vma_lock(anon_vma);
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	anon_vma_unlock(anon_vma);
	return ret;
}

static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;

	if (!mapping)
		return ret;
	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	/*
	 * No nonlinear handling: being always shared, nonlinear vmas
	 * never contain migration ptes.  Decide what to do about this
	 * limitation to linear when we need rmap_walk() on nonlinear.
	 */
	mutex_unlock(&mapping->i_mmap_mutex);
	return ret;
}

int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	VM_BUG_ON(!PageLocked(page));

	if (unlikely(PageKsm(page)))
		return rmap_walk_ksm(page, rmap_one, arg);
	else if (PageAnon(page))
		return rmap_walk_anon(page, rmap_one, arg);
	else
		return rmap_walk_file(page, rmap_one, arg);
}
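/*
 * Illustrative sketch, not part of the original file: rmap_walk() hands
 * every (vma, address) mapping of the page to the rmap_one callback and
 * stops as soon as the callback returns anything other than SWAP_AGAIN.
 * The two functions below are hypothetical, showing only the calling
 * convention; the page must be locked, as the VM_BUG_ON above insists.
 */
#if 0	/* example only, never compiled */
static int example_count_one_mapping(struct page *page,
		struct vm_area_struct *vma, unsigned long address, void *arg)
{
	(*(int *)arg)++;	/* record one mapping */
	return SWAP_AGAIN;	/* keep walking */
}

static int example_count_mappings(struct page *page)
{
	int nr = 0;

	rmap_walk(page, example_count_one_mapping, &nr);
	return nr;
}
#endif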
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following three functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
static void __hugepage_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;

	BUG_ON(!PageLocked(page));
	BUG_ON(!anon_vma);
	/* address might be in next vma when migration races vma_adjust */
	first = atomic_inc_and_test(&page->_mapcount);
	if (first)
		__hugepage_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(&page->_mapcount, 0);
	__hugepage_set_anon_rmap(page, vma, address, 1);
}
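/*
 * Illustrative sketch, not part of the original file: a hugetlb fault
 * path would pick between the two helpers above depending on whether
 * the huge page was just allocated for this vma or is an existing
 * anonymous page gaining another mapping.  The helper name and the
 * new_page flag are hypothetical, not the exact mm/hugetlb.c call sites.
 */
#if 0	/* example only, never compiled */
static void example_hugetlb_map_anon(struct page *page, bool new_page,
		struct vm_area_struct *vma, unsigned long address)
{
	if (new_page)		/* freshly allocated, exclusively owned */
		hugepage_add_new_anon_rmap(page, vma, address);
	else			/* already-anonymous page, extra mapping */
		hugepage_add_anon_rmap(page, vma, address);
}
#endif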
#endif /* CONFIG_HUGETLB_PAGE */
