Commit 562c72aa57c36b178eacc3500a0215651eca9429

Authored by Christoph Hellwig
Committed by Al Viro
Parent: 11b80f459a

fs: move inode_dio_wait calls into ->setattr

Let filesystems handle waiting for direct I/O requests themselves instead
of doing it beforehand.  This means that filesystem-specific locks which
prevent new dio references from appearing can be held while waiting.  This
is important to allow generalizing i_dio_count to non-DIO_LOCKING
filesystems.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
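
The per-filesystem side of the change is not visible in the excerpt below; as a
rough sketch of the pattern each converted ->setattr ends up following (the
example_setattr/example_setsize names are hypothetical, modeled on the
ext2-style code path), it looks like this:

static int example_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        /* Validate first, as inode_change_ok's kerneldoc requires. */
        error = inode_change_ok(inode, iattr);
        if (error)
                return error;

        if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
                /*
                 * Wait for in-flight direct I/O here, after any
                 * filesystem-specific locks that keep new dio references
                 * from appearing are held, instead of relying on
                 * notify_change() to have waited already.
                 */
                inode_dio_wait(inode);
                error = example_setsize(inode, iattr->ia_size); /* hypothetical */
                if (error)
                        return error;
        }
        setattr_copy(inode, iattr);
        mark_inode_dirty(inode);
        return 0;
}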

Showing 12 changed files with 24 additions and 3 deletions

fs/attr.c:

/*
 *  linux/fs/attr.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  changes by Thomas Schoebel-Theuer
 */

#include <linux/module.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/security.h>

/**
 * inode_change_ok - check if attribute changes to an inode are allowed
 * @inode:	inode to check
 * @attr:	attributes to change
 *
 * Check if we are allowed to change the attributes contained in @attr
 * in the given inode.  This includes the normal unix access permission
 * checks, as well as checks for rlimits and others.
 *
 * Should be called as the first thing in ->setattr implementations,
 * possibly after taking additional locks.
 */
int inode_change_ok(const struct inode *inode, struct iattr *attr)
{
	unsigned int ia_valid = attr->ia_valid;

	/*
	 * First check size constraints.  These can't be overridden using
	 * ATTR_FORCE.
	 */
	if (ia_valid & ATTR_SIZE) {
		int error = inode_newsize_ok(inode, attr->ia_size);
		if (error)
			return error;
	}

	/* If force is set do it anyway. */
	if (ia_valid & ATTR_FORCE)
		return 0;

	/* Make sure a caller can chown. */
	if ((ia_valid & ATTR_UID) &&
	    (current_fsuid() != inode->i_uid ||
	     attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
		return -EPERM;

	/* Make sure caller can chgrp. */
	if ((ia_valid & ATTR_GID) &&
	    (current_fsuid() != inode->i_uid ||
	    (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
	    !capable(CAP_CHOWN))
		return -EPERM;

	/* Make sure a caller can chmod. */
	if (ia_valid & ATTR_MODE) {
		if (!inode_owner_or_capable(inode))
			return -EPERM;
		/* Also check the setgid bit! */
		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
				inode->i_gid) && !capable(CAP_FSETID))
			attr->ia_mode &= ~S_ISGID;
	}

	/* Check for setting the inode time. */
	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
		if (!inode_owner_or_capable(inode))
			return -EPERM;
	}

	return 0;
}
EXPORT_SYMBOL(inode_change_ok);

/**
 * inode_newsize_ok - may this inode be truncated to a given size
 * @inode:	the inode to be truncated
 * @offset:	the new size to assign to the inode
 * @Returns:	0 on success, -ve errno on failure
 *
 * inode_newsize_ok must be called with i_mutex held.
 *
 * inode_newsize_ok will check filesystem limits and ulimits to check that the
 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
 * when necessary. Caller must not proceed with inode size change if failure is
 * returned. @inode must be a file (not directory), with appropriate
 * permissions to allow truncate (inode_newsize_ok does NOT check these
 * conditions).
 */
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
	if (inode->i_size < offset) {
		unsigned long limit;

		limit = rlimit(RLIMIT_FSIZE);
		if (limit != RLIM_INFINITY && offset > limit)
			goto out_sig;
		if (offset > inode->i_sb->s_maxbytes)
			goto out_big;
	} else {
		/*
		 * truncation of in-use swapfiles is disallowed - it would
		 * cause subsequent swapout to scribble on the now-freed
		 * blocks.
		 */
		if (IS_SWAPFILE(inode))
			return -ETXTBSY;
	}

	return 0;
out_sig:
	send_sig(SIGXFSZ, current, 0);
out_big:
	return -EFBIG;
}
EXPORT_SYMBOL(inode_newsize_ok);

/**
 * setattr_copy - copy simple metadata updates into the generic inode
 * @inode:	the inode to be updated
 * @attr:	the new attributes
 *
 * setattr_copy must be called with i_mutex held.
 *
 * setattr_copy updates the inode's metadata with that specified
 * in attr. Noticeably missing is inode size update, which is more complex
 * as it requires pagecache updates.
 *
 * The inode is not marked as dirty after this operation. The rationale is
 * that for "simple" filesystems, the struct inode is the inode storage.
 * The caller is free to mark the inode dirty afterwards if needed.
 */
void setattr_copy(struct inode *inode, const struct iattr *attr)
{
	unsigned int ia_valid = attr->ia_valid;

	if (ia_valid & ATTR_UID)
		inode->i_uid = attr->ia_uid;
	if (ia_valid & ATTR_GID)
		inode->i_gid = attr->ia_gid;
	if (ia_valid & ATTR_ATIME)
		inode->i_atime = timespec_trunc(attr->ia_atime,
						inode->i_sb->s_time_gran);
	if (ia_valid & ATTR_MTIME)
		inode->i_mtime = timespec_trunc(attr->ia_mtime,
						inode->i_sb->s_time_gran);
	if (ia_valid & ATTR_CTIME)
		inode->i_ctime = timespec_trunc(attr->ia_ctime,
						inode->i_sb->s_time_gran);
	if (ia_valid & ATTR_MODE) {
		umode_t mode = attr->ia_mode;

		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
			mode &= ~S_ISGID;
		inode->i_mode = mode;
	}
}
EXPORT_SYMBOL(setattr_copy);

int notify_change(struct dentry * dentry, struct iattr * attr)
{
	struct inode *inode = dentry->d_inode;
	mode_t mode = inode->i_mode;
	int error;
	struct timespec now;
	unsigned int ia_valid = attr->ia_valid;

	if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
			return -EPERM;
	}

	if ((ia_valid & ATTR_MODE)) {
		mode_t amode = attr->ia_mode;
		/* Flag setting protected by i_mutex */
		if (is_sxid(amode))
			inode->i_flags &= ~S_NOSEC;
	}

	now = current_fs_time(inode->i_sb);

	attr->ia_ctime = now;
	if (!(ia_valid & ATTR_ATIME_SET))
		attr->ia_atime = now;
	if (!(ia_valid & ATTR_MTIME_SET))
		attr->ia_mtime = now;
	if (ia_valid & ATTR_KILL_PRIV) {
		attr->ia_valid &= ~ATTR_KILL_PRIV;
		ia_valid &= ~ATTR_KILL_PRIV;
		error = security_inode_need_killpriv(dentry);
		if (error > 0)
			error = security_inode_killpriv(dentry);
		if (error)
			return error;
	}

	/*
	 * We now pass ATTR_KILL_S*ID to the lower level setattr function so
	 * that the function has the ability to reinterpret a mode change
	 * that's due to these bits. This adds an implicit restriction that
	 * no function will ever call notify_change with both ATTR_MODE and
	 * ATTR_KILL_S*ID set.
	 */
	if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) &&
	    (ia_valid & ATTR_MODE))
		BUG();

	if (ia_valid & ATTR_KILL_SUID) {
		if (mode & S_ISUID) {
			ia_valid = attr->ia_valid |= ATTR_MODE;
			attr->ia_mode = (inode->i_mode & ~S_ISUID);
		}
	}
	if (ia_valid & ATTR_KILL_SGID) {
		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
			if (!(ia_valid & ATTR_MODE)) {
				ia_valid = attr->ia_valid |= ATTR_MODE;
				attr->ia_mode = inode->i_mode;
			}
			attr->ia_mode &= ~S_ISGID;
		}
	}
	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
		return 0;

	error = security_inode_setattr(dentry, attr);
	if (error)
		return error;

-	if (ia_valid & ATTR_SIZE)
-		inode_dio_wait(inode);
-
	if (inode->i_op->setattr)
		error = inode->i_op->setattr(dentry, attr);
	else
		error = simple_setattr(dentry, attr);

	if (!error)
		fsnotify_change(dentry, ia_valid);

	return error;
}

EXPORT_SYMBOL(notify_change);

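When a filesystem provides no ->setattr, notify_change() falls back to
simple_setattr().  For reference, that fallback (in fs/libfs.c at the time) is
roughly the three helpers above glued together, a sketch rather than a verbatim
quote:

int simple_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        /* Permission and limit checks first. */
        error = inode_change_ok(inode, iattr);
        if (error)
                return error;

        /* Size changes need pagecache truncation, which setattr_copy skips. */
        if (iattr->ia_valid & ATTR_SIZE)
                truncate_setsize(inode, iattr->ia_size);
        setattr_copy(inode, iattr);
        mark_inode_dirty(inode);
        return 0;
}
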
fs/ext2/inode.c:

/*
 *  linux/fs/ext2/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@dcs.ed.ac.uk), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *	David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
 */

#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
#include "ext2.h"
#include "acl.h"
#include "xip.h"

MODULE_AUTHOR("Remy Card and others");
MODULE_DESCRIPTION("Second Extended Filesystem");
MODULE_LICENSE("GPL");

static int __ext2_write_inode(struct inode *inode, int do_sync);

/*
 * Test whether an inode is a fast symlink.
 */
static inline int ext2_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT2_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) &&
		inode->i_blocks - ea_blocks == 0);
}

static void ext2_truncate_blocks(struct inode *inode, loff_t offset);

static void ext2_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size) {
		truncate_pagecache(inode, to, inode->i_size);
		ext2_truncate_blocks(inode, inode->i_size);
	}
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext2_evict_inode(struct inode * inode)
{
	struct ext2_block_alloc_info *rsv;
	int want_delete = 0;

	if (!inode->i_nlink && !is_bad_inode(inode)) {
		want_delete = 1;
		dquot_initialize(inode);
	} else {
		dquot_drop(inode);
	}

	truncate_inode_pages(&inode->i_data, 0);

	if (want_delete) {
		/* set dtime */
		EXT2_I(inode)->i_dtime = get_seconds();
		mark_inode_dirty(inode);
		__ext2_write_inode(inode, inode_needs_sync(inode));
		/* truncate to 0 */
		inode->i_size = 0;
		if (inode->i_blocks)
			ext2_truncate_blocks(inode, 0);
	}

	invalidate_inode_buffers(inode);
	end_writeback(inode);

	ext2_discard_reservation(inode);
	rsv = EXT2_I(inode)->i_block_alloc_info;
	EXT2_I(inode)->i_block_alloc_info = NULL;
	if (unlikely(rsv))
		kfree(rsv);

	if (want_delete)
		ext2_free_inode(inode);
}

typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static inline int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}

/**
 *	ext2_block_to_path - parse the block number into array of offsets
 *	@inode: inode in question (we are only interested in its superblock)
 *	@i_block: block number to be parsed
 *	@offsets: array to store the offsets in
 *	@boundary: set this non-zero if the referred-to block is likely to be
 *	       followed (on disk) by an indirect block.
 *	To store the locations of file's data ext2 uses a data structure common
 *	for UNIX filesystems - tree of pointers anchored in the inode, with
 *	data blocks at leaves and indirect blocks in intermediate nodes.
 *	This function translates the block number into path in that tree -
 *	return value is the path length and @offsets[n] is the offset of
 *	pointer to (n+1)th node in the nth one. If @block is out of range
 *	(negative or too large) warning is printed and zero returned.
 *
 *	Note: function doesn't find node addresses, so no IO is needed. All
 *	we need to know is the capacity of indirect blocks (taken from the
 *	inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext2_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT2_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext2_msg(inode->i_sb, KERN_WARNING,
			"warning: %s: block < 0", __func__);
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT2_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT2_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT2_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext2_msg(inode->i_sb, KERN_WARNING,
			"warning: %s: block is too big", __func__);
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));

	return n;
}
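
/*
 * [Editor's illustration, not part of the file: assuming a 1 KiB block
 *  size, so ptrs = EXT2_ADDR_PER_BLOCK = 256, ptrs_bits = 8 and
 *  direct_blocks = EXT2_NDIR_BLOCKS = 12, ext2_block_to_path() yields:
 *
 *	i_block = 5     -> offsets = { 5 },			depth 1 (direct)
 *	i_block = 12    -> offsets = { EXT2_IND_BLOCK, 0 },	depth 2
 *	i_block = 268   -> offsets = { EXT2_DIND_BLOCK, 0, 0 },	depth 3
 *	i_block = 65804 -> offsets = { EXT2_TIND_BLOCK, 0, 0, 0 }, depth 4]
 */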

/**
 *	ext2_get_branch - read the chain of indirect blocks leading to data
 *	@inode: inode in question
 *	@depth: depth of the chain (1 - direct pointer, etc.)
 *	@offsets: offsets of pointers in inode/indirect blocks
 *	@chain: place to store the result
 *	@err: here we store the error value
 *
 *	Function fills the array of triples <key, p, bh> and returns %NULL
 *	if everything went OK or the pointer to the last filled triple
 *	(incomplete one) otherwise. Upon the return chain[i].key contains
 *	the number of (i+1)-th block in the chain (as it is stored in memory,
 *	i.e. little-endian 32-bit), chain[i].p contains the address of that
 *	number (it points into struct inode for i==0 and into the bh->b_data
 *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *	block for i>0 and NULL for i==0. In other words, it holds the block
 *	numbers of the chain, addresses they were taken from (and where we can
 *	verify that chain did not change) and buffer_heads hosting these
 *	numbers.
 *
 *	Function stops when it stumbles upon zero pointer (absent block)
 *		(pointer to last triple returned, *@err == 0)
 *	or when it gets an IO error reading an indirect block
 *		(ditto, *@err == -EIO)
 *	or when it notices that chain had been changed while it was reading
 *		(ditto, *@err == -EAGAIN)
 *	or when it reads all @depth-1 indirect blocks successfully and finds
 *	the whole chain, all the way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext2_get_branch(struct inode *inode,
				 int depth,
				 int *offsets,
				 Indirect chain[4],
				 int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT2_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		read_lock(&EXT2_I(inode)->i_meta_lock);
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
		read_unlock(&EXT2_I(inode)->i_meta_lock);
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	read_unlock(&EXT2_I(inode)->i_meta_lock);
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}

/**
 *	ext2_find_near - find a place for allocation with sufficient locality
 *	@inode: owner
 *	@ind: descriptor of indirect block.
 *
 *	This function returns the preferred place for block allocation.
 *	It is used when heuristic for sequential allocation fails.
 *	Rules are:
 *	  + if there is a block to the left of our position - allocate near it.
 *	  + if pointer will live in indirect block - allocate near that block.
 *	  + if pointer will live in inode - allocate in the same cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group. The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 *	Caller must make sure that @ind is valid and will stay that way.
 */
static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind)
{
	struct ext2_inode_info *ei = EXT2_I(inode);
	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
	__le32 *p;
	ext2_fsblk_t bg_start;
	ext2_fsblk_t colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--)
		if (*p)
			return le32_to_cpu(*p);

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred from inode itself? OK, just put it into
	 * the same cylinder group then.
	 */
	bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group);
	colour = (current->pid % 16) *
			(EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	return bg_start + colour;
}
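
/*
 * [Editor's illustration: with 8192 blocks per group, a caller with
 *  PID 1234 gets colour = (1234 % 16) * (8192 / 16) = 2 * 512 = 1024,
 *  i.e. an allocation goal 1024 blocks into the inode's block group.]
 */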

/**
 *	ext2_find_goal - find a preferred place for allocation.
 *	@inode: owner
 *	@block: block we want
 *	@partial: pointer to the last triple within a chain
 *
 *	Returns preferred place for a block (the goal).
 */
static inline ext2_fsblk_t ext2_find_goal(struct inode *inode, long block,
					  Indirect *partial)
{
	struct ext2_block_alloc_info *block_i;

	block_i = EXT2_I(inode)->i_block_alloc_info;

	/*
	 * try the heuristic for sequential allocation,
	 * failing that at least try to get decent locality.
	 */
	if (block_i && (block == block_i->last_alloc_logical_block + 1)
		&& (block_i->last_alloc_physical_block != 0)) {
		return block_i->last_alloc_physical_block + 1;
	}

	return ext2_find_near(inode, partial);
}

/**
 *	ext2_blks_to_allocate: Look up the block map and count the number
 *	of direct blocks that need to be allocated for the given branch.
 *
 *	@branch: chain of indirect blocks
 *	@k: number of blocks needed for indirect blocks
 *	@blks: number of data blocks to be mapped.
 *	@blocks_to_boundary: the offset in the indirect block
 *
 *	return the total number of blocks to be allocated, including the
 *	direct and indirect blocks.
 */
static int
ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks,
		int blocks_to_boundary)
{
	unsigned long count = 0;

	/*
	 * Simple case: the [t,d]indirect block(s) have not been allocated
	 * yet, so clearly the blocks on that path have not been allocated
	 * either.
	 */
	if (k > 0) {
		/* right now we don't handle cross-boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	count++;
	while (count < blks && count <= blocks_to_boundary
		&& le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}
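
/*
 * [Editor's illustration: with k = 1 indirect block still missing,
 *  blks = 5 and blocks_to_boundary = 2, the allocation is clipped at
 *  the boundary and the function returns blocks_to_boundary + 1 = 3
 *  data blocks; the missing indirect block itself is accounted for
 *  separately by the caller.]
 */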
383 383
384 /** 384 /**
385 * ext2_alloc_blocks: multiple allocate blocks needed for a branch 385 * ext2_alloc_blocks: multiple allocate blocks needed for a branch
386 * @indirect_blks: the number of blocks need to allocate for indirect 386 * @indirect_blks: the number of blocks need to allocate for indirect
387 * blocks 387 * blocks
388 * 388 *
389 * @new_blocks: on return it will store the new block numbers for 389 * @new_blocks: on return it will store the new block numbers for
390 * the indirect blocks(if needed) and the first direct block, 390 * the indirect blocks(if needed) and the first direct block,
391 * @blks: on return it will store the total number of allocated 391 * @blks: on return it will store the total number of allocated
392 * direct blocks 392 * direct blocks
393 */ 393 */
394 static int ext2_alloc_blocks(struct inode *inode, 394 static int ext2_alloc_blocks(struct inode *inode,
395 ext2_fsblk_t goal, int indirect_blks, int blks, 395 ext2_fsblk_t goal, int indirect_blks, int blks,
396 ext2_fsblk_t new_blocks[4], int *err) 396 ext2_fsblk_t new_blocks[4], int *err)
397 { 397 {
398 int target, i; 398 int target, i;
399 unsigned long count = 0; 399 unsigned long count = 0;
400 int index = 0; 400 int index = 0;
401 ext2_fsblk_t current_block = 0; 401 ext2_fsblk_t current_block = 0;
402 int ret = 0; 402 int ret = 0;
403 403
404 /* 404 /*
405 * Here we try to allocate the requested multiple blocks at once, 405 * Here we try to allocate the requested multiple blocks at once,
406 * on a best-effort basis. 406 * on a best-effort basis.
407 * To build a branch, we should allocate blocks for 407 * To build a branch, we should allocate blocks for
408 * the indirect blocks(if not allocated yet), and at least 408 * the indirect blocks(if not allocated yet), and at least
409 * the first direct block of this branch. That's the 409 * the first direct block of this branch. That's the
410 * minimum number of blocks need to allocate(required) 410 * minimum number of blocks need to allocate(required)
411 */ 411 */
412 target = blks + indirect_blks; 412 target = blks + indirect_blks;
413 413
414 while (1) { 414 while (1) {
415 count = target; 415 count = target;
416 /* allocating blocks for indirect blocks and direct blocks */ 416 /* allocating blocks for indirect blocks and direct blocks */
417 current_block = ext2_new_blocks(inode,goal,&count,err); 417 current_block = ext2_new_blocks(inode,goal,&count,err);
418 if (*err) 418 if (*err)
419 goto failed_out; 419 goto failed_out;
420 420
421 target -= count; 421 target -= count;
422 /* allocate blocks for indirect blocks */ 422 /* allocate blocks for indirect blocks */
423 while (index < indirect_blks && count) { 423 while (index < indirect_blks && count) {
424 new_blocks[index++] = current_block++; 424 new_blocks[index++] = current_block++;
425 count--; 425 count--;
426 } 426 }
427 427
428 if (count > 0) 428 if (count > 0)
429 break; 429 break;
430 } 430 }
431 431
432 /* save the new block number for the first direct block */ 432 /* save the new block number for the first direct block */
433 new_blocks[index] = current_block; 433 new_blocks[index] = current_block;
434 434
435 /* total number of blocks allocated for direct blocks */ 435 /* total number of blocks allocated for direct blocks */
436 ret = count; 436 ret = count;
437 *err = 0; 437 *err = 0;
438 return ret; 438 return ret;
439 failed_out: 439 failed_out:
440 for (i = 0; i <index; i++) 440 for (i = 0; i <index; i++)
441 ext2_free_blocks(inode, new_blocks[i], 1); 441 ext2_free_blocks(inode, new_blocks[i], 1);
442 if (index) 442 if (index)
443 mark_inode_dirty(inode); 443 mark_inode_dirty(inode);
444 return ret; 444 return ret;
445 } 445 }
446 446
447 /** 447 /**
448 * ext2_alloc_branch - allocate and set up a chain of blocks. 448 * ext2_alloc_branch - allocate and set up a chain of blocks.
449 * @inode: owner 449 * @inode: owner
450 * @num: depth of the chain (number of blocks to allocate) 450 * @num: depth of the chain (number of blocks to allocate)
451 * @offsets: offsets (in the blocks) to store the pointers to next. 451 * @offsets: offsets (in the blocks) to store the pointers to next.
452 * @branch: place to store the chain in. 452 * @branch: place to store the chain in.
453 * 453 *
454 * This function allocates @num blocks, zeroes out all but the last one, 454 * This function allocates @num blocks, zeroes out all but the last one,
455 * links them into chain and (if we are synchronous) writes them to disk. 455 * links them into chain and (if we are synchronous) writes them to disk.
456 * In other words, it prepares a branch that can be spliced onto the 456 * In other words, it prepares a branch that can be spliced onto the
457 * inode. It stores the information about that chain in the branch[], in 457 * inode. It stores the information about that chain in the branch[], in
458 * the same format as ext2_get_branch() would do. We are calling it after 458 * the same format as ext2_get_branch() would do. We are calling it after
459 * we had read the existing part of chain and partial points to the last 459 * we had read the existing part of chain and partial points to the last
460 * triple of that (one with zero ->key). Upon the exit we have the same 460 * triple of that (one with zero ->key). Upon the exit we have the same
461 * picture as after the successful ext2_get_block(), except that in one 461 * picture as after the successful ext2_get_block(), except that in one
462 * place chain is disconnected - *branch->p is still zero (we did not 462 * place chain is disconnected - *branch->p is still zero (we did not
463 * set the last link), but branch->key contains the number that should 463 * set the last link), but branch->key contains the number that should
464 * be placed into *branch->p to fill that gap. 464 * be placed into *branch->p to fill that gap.
465 * 465 *
466 * If allocation fails we free all blocks we've allocated (and forget 466 * If allocation fails we free all blocks we've allocated (and forget
467 * their buffer_heads) and return the error value the from failed 467 * their buffer_heads) and return the error value the from failed
468 * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain 468 * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain
469 * as described above and return 0. 469 * as described above and return 0.
470 */ 470 */
471 471
472 static int ext2_alloc_branch(struct inode *inode, 472 static int ext2_alloc_branch(struct inode *inode,
473 int indirect_blks, int *blks, ext2_fsblk_t goal, 473 int indirect_blks, int *blks, ext2_fsblk_t goal,
474 int *offsets, Indirect *branch) 474 int *offsets, Indirect *branch)
475 { 475 {
476 int blocksize = inode->i_sb->s_blocksize; 476 int blocksize = inode->i_sb->s_blocksize;
477 int i, n = 0; 477 int i, n = 0;
478 int err = 0; 478 int err = 0;
479 struct buffer_head *bh; 479 struct buffer_head *bh;
480 int num; 480 int num;
481 ext2_fsblk_t new_blocks[4]; 481 ext2_fsblk_t new_blocks[4];
482 ext2_fsblk_t current_block; 482 ext2_fsblk_t current_block;
483 483
484 num = ext2_alloc_blocks(inode, goal, indirect_blks, 484 num = ext2_alloc_blocks(inode, goal, indirect_blks,
485 *blks, new_blocks, &err); 485 *blks, new_blocks, &err);
486 if (err) 486 if (err)
487 return err; 487 return err;
488 488
489 branch[0].key = cpu_to_le32(new_blocks[0]); 489 branch[0].key = cpu_to_le32(new_blocks[0]);
490 /* 490 /*
491 * metadata blocks and data blocks are allocated. 491 * metadata blocks and data blocks are allocated.
492 */ 492 */
493 for (n = 1; n <= indirect_blks; n++) { 493 for (n = 1; n <= indirect_blks; n++) {
494 /* 494 /*
495 * Get buffer_head for parent block, zero it out 495 * Get buffer_head for parent block, zero it out
496 * and set the pointer to new one, then send 496 * and set the pointer to new one, then send
497 * parent to disk. 497 * parent to disk.
498 */ 498 */
499 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 499 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
500 branch[n].bh = bh; 500 branch[n].bh = bh;
501 lock_buffer(bh); 501 lock_buffer(bh);
502 memset(bh->b_data, 0, blocksize); 502 memset(bh->b_data, 0, blocksize);
503 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 503 branch[n].p = (__le32 *) bh->b_data + offsets[n];
504 branch[n].key = cpu_to_le32(new_blocks[n]); 504 branch[n].key = cpu_to_le32(new_blocks[n]);
505 *branch[n].p = branch[n].key; 505 *branch[n].p = branch[n].key;
506 if ( n == indirect_blks) { 506 if ( n == indirect_blks) {
507 current_block = new_blocks[n]; 507 current_block = new_blocks[n];
508 /* 508 /*
509 * End of chain, update the last new metablock of 509 * End of chain, update the last new metablock of
510 * the chain to point to the new allocated 510 * the chain to point to the new allocated
511 * data blocks numbers 511 * data blocks numbers
512 */ 512 */
513 for (i=1; i < num; i++) 513 for (i=1; i < num; i++)
514 *(branch[n].p + i) = cpu_to_le32(++current_block); 514 *(branch[n].p + i) = cpu_to_le32(++current_block);
515 } 515 }
516 set_buffer_uptodate(bh); 516 set_buffer_uptodate(bh);
517 unlock_buffer(bh); 517 unlock_buffer(bh);
518 mark_buffer_dirty_inode(bh, inode); 518 mark_buffer_dirty_inode(bh, inode);
519 /* We used to sync bh here if IS_SYNC(inode). 519 /* We used to sync bh here if IS_SYNC(inode).
520 * But we now rely upon generic_write_sync() 520 * But we now rely upon generic_write_sync()
521 * and b_inode_buffers. But not for directories. 521 * and b_inode_buffers. But not for directories.
522 */ 522 */
523 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 523 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
524 sync_dirty_buffer(bh); 524 sync_dirty_buffer(bh);
525 } 525 }
526 *blks = num; 526 *blks = num;
527 return err; 527 return err;
528 } 528 }
529 529
530 /** 530 /**
531 * ext2_splice_branch - splice the allocated branch onto inode. 531 * ext2_splice_branch - splice the allocated branch onto inode.
532 * @inode: owner 532 * @inode: owner
533 * @block: (logical) number of block we are adding 533 * @block: (logical) number of block we are adding
534 * @where: location of missing link 534 * @where: location of missing link
535 * @num: number of indirect blocks we are adding 535 * @num: number of indirect blocks we are adding
536 * @blks: number of direct blocks we are adding 536 * @blks: number of direct blocks we are adding
537 * 537 *
538 * This function fills the missing link and does all housekeeping needed in 538 * This function fills the missing link and does all housekeeping needed in
539 * inode (->i_blocks, etc.). In case of success we end up with the full 539 * inode (->i_blocks, etc.). In case of success we end up with the full
540 * chain to new block and return 0. 540 * chain to new block and return 0.
541 */ 541 */
542 static void ext2_splice_branch(struct inode *inode, 542 static void ext2_splice_branch(struct inode *inode,
543 long block, Indirect *where, int num, int blks) 543 long block, Indirect *where, int num, int blks)
544 { 544 {
545 int i; 545 int i;
546 struct ext2_block_alloc_info *block_i; 546 struct ext2_block_alloc_info *block_i;
547 ext2_fsblk_t current_block; 547 ext2_fsblk_t current_block;
548 548
549 block_i = EXT2_I(inode)->i_block_alloc_info; 549 block_i = EXT2_I(inode)->i_block_alloc_info;
550 550
551 /* XXX LOCKING probably should have i_meta_lock ?*/ 551 /* XXX LOCKING probably should have i_meta_lock ?*/
552 /* That's it */ 552 /* That's it */
553 553
554 *where->p = where->key; 554 *where->p = where->key;
555 555
556 /* 556 /*
557 * Update the host buffer_head or inode to point to more just allocated 557 * Update the host buffer_head or inode to point to more just allocated
558 * direct blocks blocks 558 * direct blocks blocks
559 */ 559 */
560 if (num == 0 && blks > 1) { 560 if (num == 0 && blks > 1) {
561 current_block = le32_to_cpu(where->key) + 1; 561 current_block = le32_to_cpu(where->key) + 1;
562 for (i = 1; i < blks; i++) 562 for (i = 1; i < blks; i++)
563 *(where->p + i ) = cpu_to_le32(current_block++); 563 *(where->p + i ) = cpu_to_le32(current_block++);
564 } 564 }
565 565
566 /* 566 /*
567 * update the most recently allocated logical & physical block 567 * update the most recently allocated logical & physical block
568 * in i_block_alloc_info, to assist find the proper goal block for next 568 * in i_block_alloc_info, to assist find the proper goal block for next
569 * allocation 569 * allocation
570 */ 570 */
571 if (block_i) { 571 if (block_i) {
572 block_i->last_alloc_logical_block = block + blks - 1; 572 block_i->last_alloc_logical_block = block + blks - 1;
573 block_i->last_alloc_physical_block = 573 block_i->last_alloc_physical_block =
574 le32_to_cpu(where[num].key) + blks - 1; 574 le32_to_cpu(where[num].key) + blks - 1;
575 } 575 }
576 576
577 /* We are done with atomic stuff, now do the rest of housekeeping */ 577 /* We are done with atomic stuff, now do the rest of housekeeping */
578 578
579 /* had we spliced it onto indirect block? */ 579 /* had we spliced it onto indirect block? */
580 if (where->bh) 580 if (where->bh)
581 mark_buffer_dirty_inode(where->bh, inode); 581 mark_buffer_dirty_inode(where->bh, inode);
582 582
583 inode->i_ctime = CURRENT_TIME_SEC; 583 inode->i_ctime = CURRENT_TIME_SEC;
584 mark_inode_dirty(inode); 584 mark_inode_dirty(inode);
585 } 585 }
586 586
587 /* 587 /*
588 * Allocation strategy is simple: if we have to allocate something, we will 588 * Allocation strategy is simple: if we have to allocate something, we will
589 * have to go the whole way to leaf. So let's do it before attaching anything 589 * have to go the whole way to leaf. So let's do it before attaching anything
590 * to tree, set linkage between the newborn blocks, write them if sync is 590 * to tree, set linkage between the newborn blocks, write them if sync is
591 * required, recheck the path, free and repeat if check fails, otherwise 591 * required, recheck the path, free and repeat if check fails, otherwise
592 * set the last missing link (that will protect us from any truncate-generated 592 * set the last missing link (that will protect us from any truncate-generated
593 * removals - all blocks on the path are immune now) and possibly force the 593 * removals - all blocks on the path are immune now) and possibly force the
594 * write on the parent block. 594 * write on the parent block.
595 * That has a nice additional property: no special recovery from the failed 595 * That has a nice additional property: no special recovery from the failed
596 * allocations is needed - we simply release blocks and do not touch anything 596 * allocations is needed - we simply release blocks and do not touch anything
597 * reachable from inode. 597 * reachable from inode.
598 * 598 *
599 * `handle' can be NULL if create == 0. 599 * `handle' can be NULL if create == 0.
600 * 600 *
601 * return > 0, # of blocks mapped or allocated. 601 * return > 0, # of blocks mapped or allocated.
602 * return = 0, if plain lookup failed. 602 * return = 0, if plain lookup failed.
603 * return < 0, error case. 603 * return < 0, error case.
604 */ 604 */
605 static int ext2_get_blocks(struct inode *inode, 605 static int ext2_get_blocks(struct inode *inode,
606 sector_t iblock, unsigned long maxblocks, 606 sector_t iblock, unsigned long maxblocks,
607 struct buffer_head *bh_result, 607 struct buffer_head *bh_result,
608 int create) 608 int create)
609 { 609 {
610 int err = -EIO; 610 int err = -EIO;
611 int offsets[4]; 611 int offsets[4];
612 Indirect chain[4]; 612 Indirect chain[4];
613 Indirect *partial; 613 Indirect *partial;
614 ext2_fsblk_t goal; 614 ext2_fsblk_t goal;
615 int indirect_blks; 615 int indirect_blks;
616 int blocks_to_boundary = 0; 616 int blocks_to_boundary = 0;
617 int depth; 617 int depth;
618 struct ext2_inode_info *ei = EXT2_I(inode); 618 struct ext2_inode_info *ei = EXT2_I(inode);
619 int count = 0; 619 int count = 0;
620 ext2_fsblk_t first_block = 0; 620 ext2_fsblk_t first_block = 0;
621 621
622 depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 622 depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
623 623
624 if (depth == 0) 624 if (depth == 0)
625 return (err); 625 return (err);
626 626
627 partial = ext2_get_branch(inode, depth, offsets, chain, &err); 627 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
628 /* Simplest case - block found, no allocation needed */ 628 /* Simplest case - block found, no allocation needed */
629 if (!partial) { 629 if (!partial) {
630 first_block = le32_to_cpu(chain[depth - 1].key); 630 first_block = le32_to_cpu(chain[depth - 1].key);
631 clear_buffer_new(bh_result); /* What's this do? */ 631 clear_buffer_new(bh_result); /* What's this do? */
632 count++; 632 count++;
633 /*map more blocks*/ 633 /*map more blocks*/
634 while (count < maxblocks && count <= blocks_to_boundary) { 634 while (count < maxblocks && count <= blocks_to_boundary) {
635 ext2_fsblk_t blk; 635 ext2_fsblk_t blk;
636 636
637 if (!verify_chain(chain, chain + depth - 1)) { 637 if (!verify_chain(chain, chain + depth - 1)) {
638 /* 638 /*
639 * Indirect block might be removed by 639 * Indirect block might be removed by
640 * truncate while we were reading it. 640 * truncate while we were reading it.
641 * Handling of that case: forget what we've 641 * Handling of that case: forget what we've
642 * got now, go to reread. 642 * got now, go to reread.
643 */ 643 */
644 err = -EAGAIN; 644 err = -EAGAIN;
645 count = 0; 645 count = 0;
646 break; 646 break;
647 } 647 }
648 blk = le32_to_cpu(*(chain[depth-1].p + count)); 648 blk = le32_to_cpu(*(chain[depth-1].p + count));
649 if (blk == first_block + count) 649 if (blk == first_block + count)
650 count++; 650 count++;
651 else 651 else
652 break; 652 break;
653 } 653 }
654 if (err != -EAGAIN) 654 if (err != -EAGAIN)
655 goto got_it; 655 goto got_it;
656 } 656 }
657 657
658 /* Next simple case - plain lookup or failed read of indirect block */ 658 /* Next simple case - plain lookup or failed read of indirect block */
659 if (!create || err == -EIO) 659 if (!create || err == -EIO)
660 goto cleanup; 660 goto cleanup;
661 661
662 mutex_lock(&ei->truncate_mutex); 662 mutex_lock(&ei->truncate_mutex);
663 /* 663 /*
664 * If the indirect block is missing while we are reading 664 * If the indirect block is missing while we are reading
665 * the chain (ext2_get_branch() returns -EAGAIN), or 665 * the chain (ext2_get_branch() returns -EAGAIN), or
666 * if the chain has been changed after we grab the truncate mutex, 666 * if the chain has been changed after we grab the truncate mutex,
667 * (either because another process truncated this branch, or 667 * (either because another process truncated this branch, or
668 * another get_block allocated this branch) re-grab the chain to see if 668 * another get_block allocated this branch) re-grab the chain to see if
669 * the requested block has been allocated or not. 669 * the requested block has been allocated or not.
670 * 670 *
671 * Since we already block the truncate/other get_block 671 * Since we already block the truncate/other get_block
672 * at this point, we will have the current copy of the chain when we 672 * at this point, we will have the current copy of the chain when we
673 * splice the branch into the tree. 673 * splice the branch into the tree.
674 */ 674 */
675 if (err == -EAGAIN || !verify_chain(chain, partial)) { 675 if (err == -EAGAIN || !verify_chain(chain, partial)) {
676 while (partial > chain) { 676 while (partial > chain) {
677 brelse(partial->bh); 677 brelse(partial->bh);
678 partial--; 678 partial--;
679 } 679 }
680 partial = ext2_get_branch(inode, depth, offsets, chain, &err); 680 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
681 if (!partial) { 681 if (!partial) {
682 count++; 682 count++;
683 mutex_unlock(&ei->truncate_mutex); 683 mutex_unlock(&ei->truncate_mutex);
684 if (err) 684 if (err)
685 goto cleanup; 685 goto cleanup;
686 clear_buffer_new(bh_result); 686 clear_buffer_new(bh_result);
687 goto got_it; 687 goto got_it;
688 } 688 }
689 } 689 }
690 690
691 /* 691 /*
692 * Okay, we need to do block allocation. Lazily initialize the block 692 * Okay, we need to do block allocation. Lazily initialize the block
693 * allocation info here if necessary 693 * allocation info here if necessary
694 */ 694 */
695 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) 695 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
696 ext2_init_block_alloc_info(inode); 696 ext2_init_block_alloc_info(inode);
697 697
698 goal = ext2_find_goal(inode, iblock, partial); 698 goal = ext2_find_goal(inode, iblock, partial);
699 699
700 /* the number of blocks we need to allocate for [d,t]indirect blocks */ 700 /* the number of blocks we need to allocate for [d,t]indirect blocks */
701 indirect_blks = (chain + depth) - partial - 1; 701 indirect_blks = (chain + depth) - partial - 1;
702 /* 702 /*
703 * Next look up the indirect map to count the total number of 703 * Next look up the indirect map to count the total number of
704 * direct blocks to allocate for this branch. 704 * direct blocks to allocate for this branch.
705 */ 705 */
706 count = ext2_blks_to_allocate(partial, indirect_blks, 706 count = ext2_blks_to_allocate(partial, indirect_blks,
707 maxblocks, blocks_to_boundary); 707 maxblocks, blocks_to_boundary);
708 /* 708 /*
709 * XXX ???? Block out ext2_truncate while we alter the tree 709 * XXX ???? Block out ext2_truncate while we alter the tree
710 */ 710 */
711 err = ext2_alloc_branch(inode, indirect_blks, &count, goal, 711 err = ext2_alloc_branch(inode, indirect_blks, &count, goal,
712 offsets + (partial - chain), partial); 712 offsets + (partial - chain), partial);
713 713
714 if (err) { 714 if (err) {
715 mutex_unlock(&ei->truncate_mutex); 715 mutex_unlock(&ei->truncate_mutex);
716 goto cleanup; 716 goto cleanup;
717 } 717 }
718 718
719 if (ext2_use_xip(inode->i_sb)) { 719 if (ext2_use_xip(inode->i_sb)) {
720 /* 720 /*
721 * we need to clear the block 721 * we need to clear the block
722 */ 722 */
723 err = ext2_clear_xip_target (inode, 723 err = ext2_clear_xip_target (inode,
724 le32_to_cpu(chain[depth-1].key)); 724 le32_to_cpu(chain[depth-1].key));
725 if (err) { 725 if (err) {
726 mutex_unlock(&ei->truncate_mutex); 726 mutex_unlock(&ei->truncate_mutex);
727 goto cleanup; 727 goto cleanup;
728 } 728 }
729 } 729 }
730 730
731 ext2_splice_branch(inode, iblock, partial, indirect_blks, count); 731 ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
732 mutex_unlock(&ei->truncate_mutex); 732 mutex_unlock(&ei->truncate_mutex);
733 set_buffer_new(bh_result); 733 set_buffer_new(bh_result);
734 got_it: 734 got_it:
735 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 735 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
736 if (count > blocks_to_boundary) 736 if (count > blocks_to_boundary)
737 set_buffer_boundary(bh_result); 737 set_buffer_boundary(bh_result);
738 err = count; 738 err = count;
739 /* Clean up and exit */ 739 /* Clean up and exit */
740 partial = chain + depth - 1; /* the whole chain */ 740 partial = chain + depth - 1; /* the whole chain */
741 cleanup: 741 cleanup:
742 while (partial > chain) { 742 while (partial > chain) {
743 brelse(partial->bh); 743 brelse(partial->bh);
744 partial--; 744 partial--;
745 } 745 }
746 return err; 746 return err;
747 } 747 }
748 748
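The fast path of ext2_get_blocks() walks the indirect chain without holding truncate_mutex and relies on a verify-then-retry pattern: read the chain unlocked, let verify_chain() confirm that no pointer changed while reading, and on a mismatch drop what was read and re-walk (under the mutex in the allocation path). A minimal user-space sketch of the same idea, with a generation counter standing in for verify_chain()'s pointer recheck (all names here are illustrative, not kernel APIs):

    #include <stdio.h>

    struct node { int value; };

    static int generation;            /* bumped by any writer */
    static struct node tree = { 42 };

    /* Optimistic read: snapshot the value, then check that no
     * writer ran in between; on a race, simply read again. */
    static int read_value(void)
    {
        int gen, v;

        do {
            gen = generation;         /* remember the version */
            v = tree.value;           /* unlocked read */
        } while (gen != generation);  /* writer raced us: reread */
        return v;
    }

    int main(void)
    {
        printf("%d\n", read_value());
        return 0;
    }

In the kernel code the recheck is finer-grained: verify_chain() compares each cached key against the slot it was read from, so a truncate racing with the walk forces the -EAGAIN retry seen above.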
749 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 749 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
750 { 750 {
751 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 751 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
752 int ret = ext2_get_blocks(inode, iblock, max_blocks, 752 int ret = ext2_get_blocks(inode, iblock, max_blocks,
753 bh_result, create); 753 bh_result, create);
754 if (ret > 0) { 754 if (ret > 0) {
755 bh_result->b_size = (ret << inode->i_blkbits); 755 bh_result->b_size = (ret << inode->i_blkbits);
756 ret = 0; 756 ret = 0;
757 } 757 }
758 return ret; 758 return ret;
759 759
760 } 760 }
761 761
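The wrapper converts between the two units the get_block contract mixes: bh_result->b_size carries a byte count, while ext2_get_blocks() deals in filesystem blocks, so the shift by i_blkbits goes in both directions. A quick arithmetic check with illustrative values (1 KiB blocks, i.e. i_blkbits == 10):

    #include <stdio.h>

    int main(void)
    {
        unsigned blkbits = 10;                    /* 1 KiB blocks */
        unsigned long b_size = 16384;             /* bytes requested */
        unsigned max_blocks = b_size >> blkbits;  /* 16 blocks */
        int ret = 4;                              /* blocks actually mapped */

        printf("max_blocks=%u mapped_bytes=%lu\n",
               max_blocks, (unsigned long)ret << blkbits); /* 16 4096 */
        return 0;
    }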
762 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 762 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
763 u64 start, u64 len) 763 u64 start, u64 len)
764 { 764 {
765 return generic_block_fiemap(inode, fieinfo, start, len, 765 return generic_block_fiemap(inode, fieinfo, start, len,
766 ext2_get_block); 766 ext2_get_block);
767 } 767 }
768 768
769 static int ext2_writepage(struct page *page, struct writeback_control *wbc) 769 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
770 { 770 {
771 return block_write_full_page(page, ext2_get_block, wbc); 771 return block_write_full_page(page, ext2_get_block, wbc);
772 } 772 }
773 773
774 static int ext2_readpage(struct file *file, struct page *page) 774 static int ext2_readpage(struct file *file, struct page *page)
775 { 775 {
776 return mpage_readpage(page, ext2_get_block); 776 return mpage_readpage(page, ext2_get_block);
777 } 777 }
778 778
779 static int 779 static int
780 ext2_readpages(struct file *file, struct address_space *mapping, 780 ext2_readpages(struct file *file, struct address_space *mapping,
781 struct list_head *pages, unsigned nr_pages) 781 struct list_head *pages, unsigned nr_pages)
782 { 782 {
783 return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); 783 return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
784 } 784 }
785 785
786 static int 786 static int
787 ext2_write_begin(struct file *file, struct address_space *mapping, 787 ext2_write_begin(struct file *file, struct address_space *mapping,
788 loff_t pos, unsigned len, unsigned flags, 788 loff_t pos, unsigned len, unsigned flags,
789 struct page **pagep, void **fsdata) 789 struct page **pagep, void **fsdata)
790 { 790 {
791 int ret; 791 int ret;
792 792
793 ret = block_write_begin(mapping, pos, len, flags, pagep, 793 ret = block_write_begin(mapping, pos, len, flags, pagep,
794 ext2_get_block); 794 ext2_get_block);
795 if (ret < 0) 795 if (ret < 0)
796 ext2_write_failed(mapping, pos + len); 796 ext2_write_failed(mapping, pos + len);
797 return ret; 797 return ret;
798 } 798 }
799 799
800 static int ext2_write_end(struct file *file, struct address_space *mapping, 800 static int ext2_write_end(struct file *file, struct address_space *mapping,
801 loff_t pos, unsigned len, unsigned copied, 801 loff_t pos, unsigned len, unsigned copied,
802 struct page *page, void *fsdata) 802 struct page *page, void *fsdata)
803 { 803 {
804 int ret; 804 int ret;
805 805
806 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 806 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
807 if (ret < len) 807 if (ret < len)
808 ext2_write_failed(mapping, pos + len); 808 ext2_write_failed(mapping, pos + len);
809 return ret; 809 return ret;
810 } 810 }
811 811
812 static int 812 static int
813 ext2_nobh_write_begin(struct file *file, struct address_space *mapping, 813 ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
814 loff_t pos, unsigned len, unsigned flags, 814 loff_t pos, unsigned len, unsigned flags,
815 struct page **pagep, void **fsdata) 815 struct page **pagep, void **fsdata)
816 { 816 {
817 int ret; 817 int ret;
818 818
819 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, 819 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
820 ext2_get_block); 820 ext2_get_block);
821 if (ret < 0) 821 if (ret < 0)
822 ext2_write_failed(mapping, pos + len); 822 ext2_write_failed(mapping, pos + len);
823 return ret; 823 return ret;
824 } 824 }
825 825
826 static int ext2_nobh_writepage(struct page *page, 826 static int ext2_nobh_writepage(struct page *page,
827 struct writeback_control *wbc) 827 struct writeback_control *wbc)
828 { 828 {
829 return nobh_writepage(page, ext2_get_block, wbc); 829 return nobh_writepage(page, ext2_get_block, wbc);
830 } 830 }
831 831
832 static sector_t ext2_bmap(struct address_space *mapping, sector_t block) 832 static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
833 { 833 {
834 return generic_block_bmap(mapping,block,ext2_get_block); 834 return generic_block_bmap(mapping,block,ext2_get_block);
835 } 835 }
836 836
837 static ssize_t 837 static ssize_t
838 ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 838 ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
839 loff_t offset, unsigned long nr_segs) 839 loff_t offset, unsigned long nr_segs)
840 { 840 {
841 struct file *file = iocb->ki_filp; 841 struct file *file = iocb->ki_filp;
842 struct address_space *mapping = file->f_mapping; 842 struct address_space *mapping = file->f_mapping;
843 struct inode *inode = mapping->host; 843 struct inode *inode = mapping->host;
844 ssize_t ret; 844 ssize_t ret;
845 845
846 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, 846 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
847 iov, offset, nr_segs, ext2_get_block, NULL); 847 iov, offset, nr_segs, ext2_get_block, NULL);
848 if (ret < 0 && (rw & WRITE)) 848 if (ret < 0 && (rw & WRITE))
849 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); 849 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
850 return ret; 850 return ret;
851 } 851 }
852 852
853 static int 853 static int
854 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) 854 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
855 { 855 {
856 return mpage_writepages(mapping, wbc, ext2_get_block); 856 return mpage_writepages(mapping, wbc, ext2_get_block);
857 } 857 }
858 858
859 const struct address_space_operations ext2_aops = { 859 const struct address_space_operations ext2_aops = {
860 .readpage = ext2_readpage, 860 .readpage = ext2_readpage,
861 .readpages = ext2_readpages, 861 .readpages = ext2_readpages,
862 .writepage = ext2_writepage, 862 .writepage = ext2_writepage,
863 .write_begin = ext2_write_begin, 863 .write_begin = ext2_write_begin,
864 .write_end = ext2_write_end, 864 .write_end = ext2_write_end,
865 .bmap = ext2_bmap, 865 .bmap = ext2_bmap,
866 .direct_IO = ext2_direct_IO, 866 .direct_IO = ext2_direct_IO,
867 .writepages = ext2_writepages, 867 .writepages = ext2_writepages,
868 .migratepage = buffer_migrate_page, 868 .migratepage = buffer_migrate_page,
869 .is_partially_uptodate = block_is_partially_uptodate, 869 .is_partially_uptodate = block_is_partially_uptodate,
870 .error_remove_page = generic_error_remove_page, 870 .error_remove_page = generic_error_remove_page,
871 }; 871 };
872 872
873 const struct address_space_operations ext2_aops_xip = { 873 const struct address_space_operations ext2_aops_xip = {
874 .bmap = ext2_bmap, 874 .bmap = ext2_bmap,
875 .get_xip_mem = ext2_get_xip_mem, 875 .get_xip_mem = ext2_get_xip_mem,
876 }; 876 };
877 877
878 const struct address_space_operations ext2_nobh_aops = { 878 const struct address_space_operations ext2_nobh_aops = {
879 .readpage = ext2_readpage, 879 .readpage = ext2_readpage,
880 .readpages = ext2_readpages, 880 .readpages = ext2_readpages,
881 .writepage = ext2_nobh_writepage, 881 .writepage = ext2_nobh_writepage,
882 .write_begin = ext2_nobh_write_begin, 882 .write_begin = ext2_nobh_write_begin,
883 .write_end = nobh_write_end, 883 .write_end = nobh_write_end,
884 .bmap = ext2_bmap, 884 .bmap = ext2_bmap,
885 .direct_IO = ext2_direct_IO, 885 .direct_IO = ext2_direct_IO,
886 .writepages = ext2_writepages, 886 .writepages = ext2_writepages,
887 .migratepage = buffer_migrate_page, 887 .migratepage = buffer_migrate_page,
888 .error_remove_page = generic_error_remove_page, 888 .error_remove_page = generic_error_remove_page,
889 }; 889 };
890 890
891 /* 891 /*
892 * Probably it should be a library function... search for first non-zero word 892 * Probably it should be a library function... search for first non-zero word
893 * or memcmp with zero_page, whatever is better for particular architecture. 893 * or memcmp with zero_page, whatever is better for particular architecture.
894 * Linus? 894 * Linus?
895 */ 895 */
896 static inline int all_zeroes(__le32 *p, __le32 *q) 896 static inline int all_zeroes(__le32 *p, __le32 *q)
897 { 897 {
898 while (p < q) 898 while (p < q)
899 if (*p++) 899 if (*p++)
900 return 0; 900 return 0;
901 return 1; 901 return 1;
902 } 902 }
903 903
904 /** 904 /**
905 * ext2_find_shared - find the indirect blocks for partial truncation. 905 * ext2_find_shared - find the indirect blocks for partial truncation.
906 * @inode: inode in question 906 * @inode: inode in question
907 * @depth: depth of the affected branch 907 * @depth: depth of the affected branch
908 * @offsets: offsets of pointers in that branch (see ext2_block_to_path) 908 * @offsets: offsets of pointers in that branch (see ext2_block_to_path)
909 * @chain: place to store the pointers to partial indirect blocks 909 * @chain: place to store the pointers to partial indirect blocks
910 * @top: place to the (detached) top of branch 910 * @top: place to the (detached) top of branch
911 * 911 *
912 * This is a helper function used by ext2_truncate(). 912 * This is a helper function used by ext2_truncate().
913 * 913 *
914 * When we do truncate() we may have to clean the ends of several indirect 914 * When we do truncate() we may have to clean the ends of several indirect
915 * blocks but leave the blocks themselves alive. Block is partially 915 * blocks but leave the blocks themselves alive. Block is partially
916 * truncated if some data below the new i_size is referred from it (and 916 * truncated if some data below the new i_size is referred from it (and
917 * it is on the path to the first completely truncated data block, indeed). 917 * it is on the path to the first completely truncated data block, indeed).
918 * We have to free the top of that path along with everything to the right 918 * We have to free the top of that path along with everything to the right
919 * of the path. Since no allocation past the truncation point is possible 919 * of the path. Since no allocation past the truncation point is possible
920 * until ext2_truncate() finishes, we may safely do the latter, but top 920 * until ext2_truncate() finishes, we may safely do the latter, but top
921 * of branch may require special attention - pageout below the truncation 921 * of branch may require special attention - pageout below the truncation
922 * point might try to populate it. 922 * point might try to populate it.
923 * 923 *
924 * We atomically detach the top of branch from the tree, store the block 924 * We atomically detach the top of branch from the tree, store the block
925 * number of its root in *@top, pointers to buffer_heads of partially 925 * number of its root in *@top, pointers to buffer_heads of partially
926 * truncated blocks - in @chain[].bh and pointers to their last elements 926 * truncated blocks - in @chain[].bh and pointers to their last elements
927 * that should not be removed - in @chain[].p. Return value is the pointer 927 * that should not be removed - in @chain[].p. Return value is the pointer
928 * to last filled element of @chain. 928 * to last filled element of @chain.
929 * 929 *
930 * The work left to the caller is the actual freeing of subtrees: 930 * The work left to the caller is the actual freeing of subtrees:
931 * a) free the subtree starting from *@top 931 * a) free the subtree starting from *@top
932 * b) free the subtrees whose roots are stored in 932 * b) free the subtrees whose roots are stored in
933 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 933 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
934 * c) free the subtrees growing from the inode past the @chain[0].p 934 * c) free the subtrees growing from the inode past the @chain[0].p
935 * (no partially truncated stuff there). 935 * (no partially truncated stuff there).
936 */ 936 */
937 937
938 static Indirect *ext2_find_shared(struct inode *inode, 938 static Indirect *ext2_find_shared(struct inode *inode,
939 int depth, 939 int depth,
940 int offsets[4], 940 int offsets[4],
941 Indirect chain[4], 941 Indirect chain[4],
942 __le32 *top) 942 __le32 *top)
943 { 943 {
944 Indirect *partial, *p; 944 Indirect *partial, *p;
945 int k, err; 945 int k, err;
946 946
947 *top = 0; 947 *top = 0;
948 for (k = depth; k > 1 && !offsets[k-1]; k--) 948 for (k = depth; k > 1 && !offsets[k-1]; k--)
949 ; 949 ;
950 partial = ext2_get_branch(inode, k, offsets, chain, &err); 950 partial = ext2_get_branch(inode, k, offsets, chain, &err);
951 if (!partial) 951 if (!partial)
952 partial = chain + k-1; 952 partial = chain + k-1;
953 /* 953 /*
954 * If the branch acquired continuation since we've looked at it - 954 * If the branch acquired continuation since we've looked at it -
955 * fine, it should all survive and (new) top doesn't belong to us. 955 * fine, it should all survive and (new) top doesn't belong to us.
956 */ 956 */
957 write_lock(&EXT2_I(inode)->i_meta_lock); 957 write_lock(&EXT2_I(inode)->i_meta_lock);
958 if (!partial->key && *partial->p) { 958 if (!partial->key && *partial->p) {
959 write_unlock(&EXT2_I(inode)->i_meta_lock); 959 write_unlock(&EXT2_I(inode)->i_meta_lock);
960 goto no_top; 960 goto no_top;
961 } 961 }
962 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 962 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
963 ; 963 ;
964 /* 964 /*
965 * OK, we've found the last block that must survive. The rest of our 965 * OK, we've found the last block that must survive. The rest of our
966 * branch should be detached before unlocking. However, if that rest 966 * branch should be detached before unlocking. However, if that rest
967 * of branch is all ours and does not grow immediately from the inode 967 * of branch is all ours and does not grow immediately from the inode
968 * it's easier to cheat and just decrement partial->p. 968 * it's easier to cheat and just decrement partial->p.
969 */ 969 */
970 if (p == chain + k - 1 && p > chain) { 970 if (p == chain + k - 1 && p > chain) {
971 p->p--; 971 p->p--;
972 } else { 972 } else {
973 *top = *p->p; 973 *top = *p->p;
974 *p->p = 0; 974 *p->p = 0;
975 } 975 }
976 write_unlock(&EXT2_I(inode)->i_meta_lock); 976 write_unlock(&EXT2_I(inode)->i_meta_lock);
977 977
978 while(partial > p) 978 while(partial > p)
979 { 979 {
980 brelse(partial->bh); 980 brelse(partial->bh);
981 partial--; 981 partial--;
982 } 982 }
983 no_top: 983 no_top:
984 return partial; 984 return partial;
985 } 985 }
986 986
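The first loop in ext2_find_shared() shortens the path before anything else: trailing zero offsets mean the truncate point sits at the very start of those lower levels, so the whole subtree below is going away and only the top k levels can possibly be shared with surviving data. A small demo of that trim, with illustrative values for offsets[] as ext2_block_to_path() would produce them:

    #include <stdio.h>

    int main(void)
    {
        /* e.g. a depth-3 path whose last level starts at entry 0 */
        int offsets[4] = { 12, 5, 0, 0 };
        int depth = 3, k;

        /* same trim as ext2_find_shared(): drop trailing zeroes */
        for (k = depth; k > 1 && !offsets[k - 1]; k--)
            ;
        printf("shared branch depth k=%d\n", k);  /* prints 2 */
        return 0;
    }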
987 /** 987 /**
988 * ext2_free_data - free a list of data blocks 988 * ext2_free_data - free a list of data blocks
989 * @inode: inode we are dealing with 989 * @inode: inode we are dealing with
990 * @p: array of block numbers 990 * @p: array of block numbers
991 * @q: points immediately past the end of array 991 * @q: points immediately past the end of array
992 * 992 *
993 * We are freeing all blocks referred from that array (numbers are 993 * We are freeing all blocks referred from that array (numbers are
994 * stored as little-endian 32-bit) and updating @inode->i_blocks 994 * stored as little-endian 32-bit) and updating @inode->i_blocks
995 * appropriately. 995 * appropriately.
996 */ 996 */
997 static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) 997 static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
998 { 998 {
999 unsigned long block_to_free = 0, count = 0; 999 unsigned long block_to_free = 0, count = 0;
1000 unsigned long nr; 1000 unsigned long nr;
1001 1001
1002 for ( ; p < q ; p++) { 1002 for ( ; p < q ; p++) {
1003 nr = le32_to_cpu(*p); 1003 nr = le32_to_cpu(*p);
1004 if (nr) { 1004 if (nr) {
1005 *p = 0; 1005 *p = 0;
1006 /* accumulate blocks to free if they're contiguous */ 1006 /* accumulate blocks to free if they're contiguous */
1007 if (count == 0) 1007 if (count == 0)
1008 goto free_this; 1008 goto free_this;
1009 else if (block_to_free == nr - count) 1009 else if (block_to_free == nr - count)
1010 count++; 1010 count++;
1011 else { 1011 else {
1012 ext2_free_blocks (inode, block_to_free, count); 1012 ext2_free_blocks (inode, block_to_free, count);
1013 mark_inode_dirty(inode); 1013 mark_inode_dirty(inode);
1014 free_this: 1014 free_this:
1015 block_to_free = nr; 1015 block_to_free = nr;
1016 count = 1; 1016 count = 1;
1017 } 1017 }
1018 } 1018 }
1019 } 1019 }
1020 if (count > 0) { 1020 if (count > 0) {
1021 ext2_free_blocks (inode, block_to_free, count); 1021 ext2_free_blocks (inode, block_to_free, count);
1022 mark_inode_dirty(inode); 1022 mark_inode_dirty(inode);
1023 } 1023 }
1024 } 1024 }
1025 1025
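ext2_free_data() batches contiguous block numbers into a single ext2_free_blocks() call rather than freeing one block at a time. The same run-accumulation pattern, extracted into a stand-alone sketch where free_run() stands in for ext2_free_blocks():

    #include <stdio.h>

    static void free_run(unsigned long start, unsigned long count)
    {
        printf("free %lu blocks starting at %lu\n", count, start);
    }

    int main(void)
    {
        unsigned long blocks[] = { 100, 101, 102, 200, 201, 300 };
        unsigned long start = 0, count = 0;
        size_t i;

        for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
            if (count && blocks[i] == start + count) {
                count++;                  /* extends the current run */
            } else {
                if (count)
                    free_run(start, count);
                start = blocks[i];        /* begin a new run */
                count = 1;
            }
        }
        if (count)
            free_run(start, count);       /* flush the final run */
        return 0;
    }

This prints one call for 100..102, one for 200..201 and one for 300, mirroring how the kernel code amortizes the bitmap updates.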
1026 /** 1026 /**
1027 * ext2_free_branches - free an array of branches 1027 * ext2_free_branches - free an array of branches
1028 * @inode: inode we are dealing with 1028 * @inode: inode we are dealing with
1029 * @p: array of block numbers 1029 * @p: array of block numbers
1030 * @q: pointer immediately past the end of array 1030 * @q: pointer immediately past the end of array
1031 * @depth: depth of the branches to free 1031 * @depth: depth of the branches to free
1032 * 1032 *
1033 * We are freeing all blocks referred from these branches (numbers are 1033 * We are freeing all blocks referred from these branches (numbers are
1034 * stored as little-endian 32-bit) and updating @inode->i_blocks 1034 * stored as little-endian 32-bit) and updating @inode->i_blocks
1035 * appropriately. 1035 * appropriately.
1036 */ 1036 */
1037 static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth) 1037 static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth)
1038 { 1038 {
1039 struct buffer_head * bh; 1039 struct buffer_head * bh;
1040 unsigned long nr; 1040 unsigned long nr;
1041 1041
1042 if (depth--) { 1042 if (depth--) {
1043 int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); 1043 int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);
1044 for ( ; p < q ; p++) { 1044 for ( ; p < q ; p++) {
1045 nr = le32_to_cpu(*p); 1045 nr = le32_to_cpu(*p);
1046 if (!nr) 1046 if (!nr)
1047 continue; 1047 continue;
1048 *p = 0; 1048 *p = 0;
1049 bh = sb_bread(inode->i_sb, nr); 1049 bh = sb_bread(inode->i_sb, nr);
1050 /* 1050 /*
1051 * A read failure? Report error and clear slot 1051 * A read failure? Report error and clear slot
1052 * (should be rare). 1052 * (should be rare).
1053 */ 1053 */
1054 if (!bh) { 1054 if (!bh) {
1055 ext2_error(inode->i_sb, "ext2_free_branches", 1055 ext2_error(inode->i_sb, "ext2_free_branches",
1056 "Read failure, inode=%ld, block=%ld", 1056 "Read failure, inode=%ld, block=%ld",
1057 inode->i_ino, nr); 1057 inode->i_ino, nr);
1058 continue; 1058 continue;
1059 } 1059 }
1060 ext2_free_branches(inode, 1060 ext2_free_branches(inode,
1061 (__le32*)bh->b_data, 1061 (__le32*)bh->b_data,
1062 (__le32*)bh->b_data + addr_per_block, 1062 (__le32*)bh->b_data + addr_per_block,
1063 depth); 1063 depth);
1064 bforget(bh); 1064 bforget(bh);
1065 ext2_free_blocks(inode, nr, 1); 1065 ext2_free_blocks(inode, nr, 1);
1066 mark_inode_dirty(inode); 1066 mark_inode_dirty(inode);
1067 } 1067 }
1068 } else 1068 } else
1069 ext2_free_data(inode, p, q); 1069 ext2_free_data(inode, p, q);
1070 } 1070 }
1071 1071
1072 static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) 1072 static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
1073 { 1073 {
1074 __le32 *i_data = EXT2_I(inode)->i_data; 1074 __le32 *i_data = EXT2_I(inode)->i_data;
1075 struct ext2_inode_info *ei = EXT2_I(inode); 1075 struct ext2_inode_info *ei = EXT2_I(inode);
1076 int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); 1076 int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);
1077 int offsets[4]; 1077 int offsets[4];
1078 Indirect chain[4]; 1078 Indirect chain[4];
1079 Indirect *partial; 1079 Indirect *partial;
1080 __le32 nr = 0; 1080 __le32 nr = 0;
1081 int n; 1081 int n;
1082 long iblock; 1082 long iblock;
1083 unsigned blocksize; 1083 unsigned blocksize;
1084 blocksize = inode->i_sb->s_blocksize; 1084 blocksize = inode->i_sb->s_blocksize;
1085 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); 1085 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1086 1086
1087 n = ext2_block_to_path(inode, iblock, offsets, NULL); 1087 n = ext2_block_to_path(inode, iblock, offsets, NULL);
1088 if (n == 0) 1088 if (n == 0)
1089 return; 1089 return;
1090 1090
1091 /* 1091 /*
1092 * From here we block out all ext2_get_block() callers who want to 1092 * From here we block out all ext2_get_block() callers who want to
1093 * modify the block allocation tree. 1093 * modify the block allocation tree.
1094 */ 1094 */
1095 mutex_lock(&ei->truncate_mutex); 1095 mutex_lock(&ei->truncate_mutex);
1096 1096
1097 if (n == 1) { 1097 if (n == 1) {
1098 ext2_free_data(inode, i_data+offsets[0], 1098 ext2_free_data(inode, i_data+offsets[0],
1099 i_data + EXT2_NDIR_BLOCKS); 1099 i_data + EXT2_NDIR_BLOCKS);
1100 goto do_indirects; 1100 goto do_indirects;
1101 } 1101 }
1102 1102
1103 partial = ext2_find_shared(inode, n, offsets, chain, &nr); 1103 partial = ext2_find_shared(inode, n, offsets, chain, &nr);
1104 /* Kill the top of shared branch (already detached) */ 1104 /* Kill the top of shared branch (already detached) */
1105 if (nr) { 1105 if (nr) {
1106 if (partial == chain) 1106 if (partial == chain)
1107 mark_inode_dirty(inode); 1107 mark_inode_dirty(inode);
1108 else 1108 else
1109 mark_buffer_dirty_inode(partial->bh, inode); 1109 mark_buffer_dirty_inode(partial->bh, inode);
1110 ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); 1110 ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
1111 } 1111 }
1112 /* Clear the ends of indirect blocks on the shared branch */ 1112 /* Clear the ends of indirect blocks on the shared branch */
1113 while (partial > chain) { 1113 while (partial > chain) {
1114 ext2_free_branches(inode, 1114 ext2_free_branches(inode,
1115 partial->p + 1, 1115 partial->p + 1,
1116 (__le32*)partial->bh->b_data+addr_per_block, 1116 (__le32*)partial->bh->b_data+addr_per_block,
1117 (chain+n-1) - partial); 1117 (chain+n-1) - partial);
1118 mark_buffer_dirty_inode(partial->bh, inode); 1118 mark_buffer_dirty_inode(partial->bh, inode);
1119 brelse (partial->bh); 1119 brelse (partial->bh);
1120 partial--; 1120 partial--;
1121 } 1121 }
1122 do_indirects: 1122 do_indirects:
1123 /* Kill the remaining (whole) subtrees */ 1123 /* Kill the remaining (whole) subtrees */
1124 switch (offsets[0]) { 1124 switch (offsets[0]) {
1125 default: 1125 default:
1126 nr = i_data[EXT2_IND_BLOCK]; 1126 nr = i_data[EXT2_IND_BLOCK];
1127 if (nr) { 1127 if (nr) {
1128 i_data[EXT2_IND_BLOCK] = 0; 1128 i_data[EXT2_IND_BLOCK] = 0;
1129 mark_inode_dirty(inode); 1129 mark_inode_dirty(inode);
1130 ext2_free_branches(inode, &nr, &nr+1, 1); 1130 ext2_free_branches(inode, &nr, &nr+1, 1);
1131 } 1131 }
1132 case EXT2_IND_BLOCK: 1132 case EXT2_IND_BLOCK:
1133 nr = i_data[EXT2_DIND_BLOCK]; 1133 nr = i_data[EXT2_DIND_BLOCK];
1134 if (nr) { 1134 if (nr) {
1135 i_data[EXT2_DIND_BLOCK] = 0; 1135 i_data[EXT2_DIND_BLOCK] = 0;
1136 mark_inode_dirty(inode); 1136 mark_inode_dirty(inode);
1137 ext2_free_branches(inode, &nr, &nr+1, 2); 1137 ext2_free_branches(inode, &nr, &nr+1, 2);
1138 } 1138 }
1139 case EXT2_DIND_BLOCK: 1139 case EXT2_DIND_BLOCK:
1140 nr = i_data[EXT2_TIND_BLOCK]; 1140 nr = i_data[EXT2_TIND_BLOCK];
1141 if (nr) { 1141 if (nr) {
1142 i_data[EXT2_TIND_BLOCK] = 0; 1142 i_data[EXT2_TIND_BLOCK] = 0;
1143 mark_inode_dirty(inode); 1143 mark_inode_dirty(inode);
1144 ext2_free_branches(inode, &nr, &nr+1, 3); 1144 ext2_free_branches(inode, &nr, &nr+1, 3);
1145 } 1145 }
1146 case EXT2_TIND_BLOCK: 1146 case EXT2_TIND_BLOCK:
1147 ; 1147 ;
1148 } 1148 }
1149 1149
1150 ext2_discard_reservation(inode); 1150 ext2_discard_reservation(inode);
1151 1151
1152 mutex_unlock(&ei->truncate_mutex); 1152 mutex_unlock(&ei->truncate_mutex);
1153 } 1153 }
1154 1154
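Note that the do_indirects switch deliberately omits break statements: if truncation begins among the direct blocks (the default label), the single-, double- and triple-indirect trees must all be freed, so control falls through each case; a truncation point inside a later tree skips the earlier ones. The same cumulative-cleanup shape in miniature (stage names are illustrative):

    #include <stdio.h>

    enum stage { DIRECT, IND, DIND, TIND };

    static void cleanup_from(enum stage s)
    {
        switch (s) {                 /* intentional fall-through */
        case DIRECT:
            printf("free single-indirect tree\n");
        case IND:
            printf("free double-indirect tree\n");
        case DIND:
            printf("free triple-indirect tree\n");
        case TIND:
            ;                        /* nothing beyond triple */
        }
    }

    int main(void)
    {
        cleanup_from(DIND);          /* frees only the triple tree */
        return 0;
    }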
1155 static void ext2_truncate_blocks(struct inode *inode, loff_t offset) 1155 static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1156 { 1156 {
1157 /* 1157 /*
1158 * XXX: it seems like a bug here that we don't allow 1158 * XXX: it seems like a bug here that we don't allow
1159 * IS_APPEND inode to have blocks-past-i_size trimmed off. 1159 * IS_APPEND inode to have blocks-past-i_size trimmed off.
1160 * review and fix this. 1160 * review and fix this.
1161 * 1161 *
1162 * Also would be nice to be able to handle IO errors and such, 1162 * Also would be nice to be able to handle IO errors and such,
1163 * but that's probably too much to ask. 1163 * but that's probably too much to ask.
1164 */ 1164 */
1165 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1165 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1166 S_ISLNK(inode->i_mode))) 1166 S_ISLNK(inode->i_mode)))
1167 return; 1167 return;
1168 if (ext2_inode_is_fast_symlink(inode)) 1168 if (ext2_inode_is_fast_symlink(inode))
1169 return; 1169 return;
1170 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1170 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1171 return; 1171 return;
1172 __ext2_truncate_blocks(inode, offset); 1172 __ext2_truncate_blocks(inode, offset);
1173 } 1173 }
1174 1174
1175 static int ext2_setsize(struct inode *inode, loff_t newsize) 1175 static int ext2_setsize(struct inode *inode, loff_t newsize)
1176 { 1176 {
1177 int error; 1177 int error;
1178 1178
1179 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1179 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1180 S_ISLNK(inode->i_mode))) 1180 S_ISLNK(inode->i_mode)))
1181 return -EINVAL; 1181 return -EINVAL;
1182 if (ext2_inode_is_fast_symlink(inode)) 1182 if (ext2_inode_is_fast_symlink(inode))
1183 return -EINVAL; 1183 return -EINVAL;
1184 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1184 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1185 return -EPERM; 1185 return -EPERM;
1186 1186
1187 inode_dio_wait(inode);
1188
1187 if (mapping_is_xip(inode->i_mapping)) 1189 if (mapping_is_xip(inode->i_mapping))
1188 error = xip_truncate_page(inode->i_mapping, newsize); 1190 error = xip_truncate_page(inode->i_mapping, newsize);
1189 else if (test_opt(inode->i_sb, NOBH)) 1191 else if (test_opt(inode->i_sb, NOBH))
1190 error = nobh_truncate_page(inode->i_mapping, 1192 error = nobh_truncate_page(inode->i_mapping,
1191 newsize, ext2_get_block); 1193 newsize, ext2_get_block);
1192 else 1194 else
1193 error = block_truncate_page(inode->i_mapping, 1195 error = block_truncate_page(inode->i_mapping,
1194 newsize, ext2_get_block); 1196 newsize, ext2_get_block);
1195 if (error) 1197 if (error)
1196 return error; 1198 return error;
1197 1199
1198 truncate_setsize(inode, newsize); 1200 truncate_setsize(inode, newsize);
1199 __ext2_truncate_blocks(inode, newsize); 1201 __ext2_truncate_blocks(inode, newsize);
1200 1202
1201 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1202 if (inode_needs_sync(inode)) { 1204 if (inode_needs_sync(inode)) {
1203 sync_mapping_buffers(inode->i_mapping); 1205 sync_mapping_buffers(inode->i_mapping);
1204 sync_inode_metadata(inode, 1); 1206 sync_inode_metadata(inode, 1);
1205 } else { 1207 } else {
1206 mark_inode_dirty(inode); 1208 mark_inode_dirty(inode);
1207 } 1209 }
1208 1210
1209 return 0; 1211 return 0;
1210 } 1212 }
1211 1213
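This is the change the commit makes for ext2: ext2_setsize() now calls inode_dio_wait() itself, after the permission checks but before any page is zeroed or the size changed, so direct I/O already in flight drains while the filesystem still controls when new references may appear. Semantically the wait is just "block until the in-flight counter reaches zero"; a rough user-space analogy, where dio_count stands in for the inode's i_dio_count and is not a kernel API:

    #include <stdatomic.h>
    #include <sched.h>
    #include <stdio.h>

    static atomic_int dio_count;     /* outstanding direct-I/O requests */

    /* analogue of inode_dio_wait(): return only once every
     * in-flight request has dropped its reference */
    static void dio_wait(void)
    {
        while (atomic_load(&dio_count) > 0)
            sched_yield();
    }

    int main(void)
    {
        dio_wait();                  /* nothing in flight: returns at once */
        printf("safe to truncate\n");
        return 0;
    }

Doing the wait here, rather than in the generic VFS code, lets ext2 pair it with truncate_mutex so no new allocation can slip in between the drain and the freeing of blocks.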
1212 static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, 1214 static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
1213 struct buffer_head **p) 1215 struct buffer_head **p)
1214 { 1216 {
1215 struct buffer_head * bh; 1217 struct buffer_head * bh;
1216 unsigned long block_group; 1218 unsigned long block_group;
1217 unsigned long block; 1219 unsigned long block;
1218 unsigned long offset; 1220 unsigned long offset;
1219 struct ext2_group_desc * gdp; 1221 struct ext2_group_desc * gdp;
1220 1222
1221 *p = NULL; 1223 *p = NULL;
1222 if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) || 1224 if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) ||
1223 ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) 1225 ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
1224 goto Einval; 1226 goto Einval;
1225 1227
1226 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); 1228 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
1227 gdp = ext2_get_group_desc(sb, block_group, NULL); 1229 gdp = ext2_get_group_desc(sb, block_group, NULL);
1228 if (!gdp) 1230 if (!gdp)
1229 goto Egdp; 1231 goto Egdp;
1230 /* 1232 /*
1231 * Figure out the offset within the block group inode table 1233 * Figure out the offset within the block group inode table
1232 */ 1234 */
1233 offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb); 1235 offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb);
1234 block = le32_to_cpu(gdp->bg_inode_table) + 1236 block = le32_to_cpu(gdp->bg_inode_table) +
1235 (offset >> EXT2_BLOCK_SIZE_BITS(sb)); 1237 (offset >> EXT2_BLOCK_SIZE_BITS(sb));
1236 if (!(bh = sb_bread(sb, block))) 1238 if (!(bh = sb_bread(sb, block)))
1237 goto Eio; 1239 goto Eio;
1238 1240
1239 *p = bh; 1241 *p = bh;
1240 offset &= (EXT2_BLOCK_SIZE(sb) - 1); 1242 offset &= (EXT2_BLOCK_SIZE(sb) - 1);
1241 return (struct ext2_inode *) (bh->b_data + offset); 1243 return (struct ext2_inode *) (bh->b_data + offset);
1242 1244
1243 Einval: 1245 Einval:
1244 ext2_error(sb, "ext2_get_inode", "bad inode number: %lu", 1246 ext2_error(sb, "ext2_get_inode", "bad inode number: %lu",
1245 (unsigned long) ino); 1247 (unsigned long) ino);
1246 return ERR_PTR(-EINVAL); 1248 return ERR_PTR(-EINVAL);
1247 Eio: 1249 Eio:
1248 ext2_error(sb, "ext2_get_inode", 1250 ext2_error(sb, "ext2_get_inode",
1249 "unable to read inode block - inode=%lu, block=%lu", 1251 "unable to read inode block - inode=%lu, block=%lu",
1250 (unsigned long) ino, block); 1252 (unsigned long) ino, block);
1251 Egdp: 1253 Egdp:
1252 return ERR_PTR(-EIO); 1254 return ERR_PTR(-EIO);
1253 } 1255 }
1254 1256
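The locator arithmetic above is worth tracing once: the group is (ino - 1) / inodes_per_group, the byte offset of the inode within that group's table is ((ino - 1) % inodes_per_group) * inode_size, and the on-disk block is the table's first block plus offset >> block_size_bits. Worked through with plausible mke2fs-style values (illustrative numbers, not read from a real superblock):

    #include <stdio.h>

    int main(void)
    {
        unsigned long ino = 5000;
        unsigned long inodes_per_group = 1976;
        unsigned long inode_size = 128;     /* bytes */
        unsigned long table_block = 231;    /* bg_inode_table of that group */
        unsigned block_bits = 10;           /* 1 KiB blocks */

        unsigned long group  = (ino - 1) / inodes_per_group;          /* 2 */
        unsigned long offset = ((ino - 1) % inodes_per_group) * inode_size;
        unsigned long block  = table_block + (offset >> block_bits);

        printf("group=%lu block=%lu offset-in-block=%lu\n",
               group, block, offset & ((1UL << block_bits) - 1));
        /* prints: group=2 block=361 offset-in-block=896 */
        return 0;
    }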
1255 void ext2_set_inode_flags(struct inode *inode) 1257 void ext2_set_inode_flags(struct inode *inode)
1256 { 1258 {
1257 unsigned int flags = EXT2_I(inode)->i_flags; 1259 unsigned int flags = EXT2_I(inode)->i_flags;
1258 1260
1259 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 1261 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
1260 if (flags & EXT2_SYNC_FL) 1262 if (flags & EXT2_SYNC_FL)
1261 inode->i_flags |= S_SYNC; 1263 inode->i_flags |= S_SYNC;
1262 if (flags & EXT2_APPEND_FL) 1264 if (flags & EXT2_APPEND_FL)
1263 inode->i_flags |= S_APPEND; 1265 inode->i_flags |= S_APPEND;
1264 if (flags & EXT2_IMMUTABLE_FL) 1266 if (flags & EXT2_IMMUTABLE_FL)
1265 inode->i_flags |= S_IMMUTABLE; 1267 inode->i_flags |= S_IMMUTABLE;
1266 if (flags & EXT2_NOATIME_FL) 1268 if (flags & EXT2_NOATIME_FL)
1267 inode->i_flags |= S_NOATIME; 1269 inode->i_flags |= S_NOATIME;
1268 if (flags & EXT2_DIRSYNC_FL) 1270 if (flags & EXT2_DIRSYNC_FL)
1269 inode->i_flags |= S_DIRSYNC; 1271 inode->i_flags |= S_DIRSYNC;
1270 } 1272 }
1271 1273
1272 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ 1274 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
1273 void ext2_get_inode_flags(struct ext2_inode_info *ei) 1275 void ext2_get_inode_flags(struct ext2_inode_info *ei)
1274 { 1276 {
1275 unsigned int flags = ei->vfs_inode.i_flags; 1277 unsigned int flags = ei->vfs_inode.i_flags;
1276 1278
1277 ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| 1279 ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
1278 EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); 1280 EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
1279 if (flags & S_SYNC) 1281 if (flags & S_SYNC)
1280 ei->i_flags |= EXT2_SYNC_FL; 1282 ei->i_flags |= EXT2_SYNC_FL;
1281 if (flags & S_APPEND) 1283 if (flags & S_APPEND)
1282 ei->i_flags |= EXT2_APPEND_FL; 1284 ei->i_flags |= EXT2_APPEND_FL;
1283 if (flags & S_IMMUTABLE) 1285 if (flags & S_IMMUTABLE)
1284 ei->i_flags |= EXT2_IMMUTABLE_FL; 1286 ei->i_flags |= EXT2_IMMUTABLE_FL;
1285 if (flags & S_NOATIME) 1287 if (flags & S_NOATIME)
1286 ei->i_flags |= EXT2_NOATIME_FL; 1288 ei->i_flags |= EXT2_NOATIME_FL;
1287 if (flags & S_DIRSYNC) 1289 if (flags & S_DIRSYNC)
1288 ei->i_flags |= EXT2_DIRSYNC_FL; 1290 ei->i_flags |= EXT2_DIRSYNC_FL;
1289 } 1291 }
1290 1292
1291 struct inode *ext2_iget (struct super_block *sb, unsigned long ino) 1293 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1292 { 1294 {
1293 struct ext2_inode_info *ei; 1295 struct ext2_inode_info *ei;
1294 struct buffer_head * bh; 1296 struct buffer_head * bh;
1295 struct ext2_inode *raw_inode; 1297 struct ext2_inode *raw_inode;
1296 struct inode *inode; 1298 struct inode *inode;
1297 long ret = -EIO; 1299 long ret = -EIO;
1298 int n; 1300 int n;
1299 1301
1300 inode = iget_locked(sb, ino); 1302 inode = iget_locked(sb, ino);
1301 if (!inode) 1303 if (!inode)
1302 return ERR_PTR(-ENOMEM); 1304 return ERR_PTR(-ENOMEM);
1303 if (!(inode->i_state & I_NEW)) 1305 if (!(inode->i_state & I_NEW))
1304 return inode; 1306 return inode;
1305 1307
1306 ei = EXT2_I(inode); 1308 ei = EXT2_I(inode);
1307 ei->i_block_alloc_info = NULL; 1309 ei->i_block_alloc_info = NULL;
1308 1310
1309 raw_inode = ext2_get_inode(inode->i_sb, ino, &bh); 1311 raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
1310 if (IS_ERR(raw_inode)) { 1312 if (IS_ERR(raw_inode)) {
1311 ret = PTR_ERR(raw_inode); 1313 ret = PTR_ERR(raw_inode);
1312 goto bad_inode; 1314 goto bad_inode;
1313 } 1315 }
1314 1316
1315 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 1317 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
1316 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 1318 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
1317 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 1319 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
1318 if (!(test_opt (inode->i_sb, NO_UID32))) { 1320 if (!(test_opt (inode->i_sb, NO_UID32))) {
1319 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 1321 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
1320 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 1322 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
1321 } 1323 }
1322 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 1324 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
1323 inode->i_size = le32_to_cpu(raw_inode->i_size); 1325 inode->i_size = le32_to_cpu(raw_inode->i_size);
1324 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 1326 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
1325 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 1327 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
1326 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 1328 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
1327 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; 1329 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
1328 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 1330 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
1329 /* We now have enough fields to check if the inode was active or not. 1331 /* We now have enough fields to check if the inode was active or not.
1330 * This is needed because nfsd might try to access dead inodes; 1332 * This is needed because nfsd might try to access dead inodes;
1331 * the test is the same one that e2fsck uses. 1333 * the test is the same one that e2fsck uses.
1332 * NeilBrown 1999oct15 1334 * NeilBrown 1999oct15
1333 */ 1335 */
1334 if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) { 1336 if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) {
1335 /* this inode is deleted */ 1337 /* this inode is deleted */
1336 brelse (bh); 1338 brelse (bh);
1337 ret = -ESTALE; 1339 ret = -ESTALE;
1338 goto bad_inode; 1340 goto bad_inode;
1339 } 1341 }
1340 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); 1342 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
1341 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 1343 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
1342 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); 1344 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
1343 ei->i_frag_no = raw_inode->i_frag; 1345 ei->i_frag_no = raw_inode->i_frag;
1344 ei->i_frag_size = raw_inode->i_fsize; 1346 ei->i_frag_size = raw_inode->i_fsize;
1345 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 1347 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
1346 ei->i_dir_acl = 0; 1348 ei->i_dir_acl = 0;
1347 if (S_ISREG(inode->i_mode)) 1349 if (S_ISREG(inode->i_mode))
1348 inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; 1350 inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
1349 else 1351 else
1350 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); 1352 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
1351 ei->i_dtime = 0; 1353 ei->i_dtime = 0;
1352 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 1354 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
1353 ei->i_state = 0; 1355 ei->i_state = 0;
1354 ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); 1356 ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
1355 ei->i_dir_start_lookup = 0; 1357 ei->i_dir_start_lookup = 0;
1356 1358
1357 /* 1359 /*
1358 * NOTE! The in-memory inode i_data array is in little-endian order 1360 * NOTE! The in-memory inode i_data array is in little-endian order
1359 * even on big-endian machines: we do NOT byteswap the block numbers! 1361 * even on big-endian machines: we do NOT byteswap the block numbers!
1360 */ 1362 */
1361 for (n = 0; n < EXT2_N_BLOCKS; n++) 1363 for (n = 0; n < EXT2_N_BLOCKS; n++)
1362 ei->i_data[n] = raw_inode->i_block[n]; 1364 ei->i_data[n] = raw_inode->i_block[n];
1363 1365
1364 if (S_ISREG(inode->i_mode)) { 1366 if (S_ISREG(inode->i_mode)) {
1365 inode->i_op = &ext2_file_inode_operations; 1367 inode->i_op = &ext2_file_inode_operations;
1366 if (ext2_use_xip(inode->i_sb)) { 1368 if (ext2_use_xip(inode->i_sb)) {
1367 inode->i_mapping->a_ops = &ext2_aops_xip; 1369 inode->i_mapping->a_ops = &ext2_aops_xip;
1368 inode->i_fop = &ext2_xip_file_operations; 1370 inode->i_fop = &ext2_xip_file_operations;
1369 } else if (test_opt(inode->i_sb, NOBH)) { 1371 } else if (test_opt(inode->i_sb, NOBH)) {
1370 inode->i_mapping->a_ops = &ext2_nobh_aops; 1372 inode->i_mapping->a_ops = &ext2_nobh_aops;
1371 inode->i_fop = &ext2_file_operations; 1373 inode->i_fop = &ext2_file_operations;
1372 } else { 1374 } else {
1373 inode->i_mapping->a_ops = &ext2_aops; 1375 inode->i_mapping->a_ops = &ext2_aops;
1374 inode->i_fop = &ext2_file_operations; 1376 inode->i_fop = &ext2_file_operations;
1375 } 1377 }
1376 } else if (S_ISDIR(inode->i_mode)) { 1378 } else if (S_ISDIR(inode->i_mode)) {
1377 inode->i_op = &ext2_dir_inode_operations; 1379 inode->i_op = &ext2_dir_inode_operations;
1378 inode->i_fop = &ext2_dir_operations; 1380 inode->i_fop = &ext2_dir_operations;
1379 if (test_opt(inode->i_sb, NOBH)) 1381 if (test_opt(inode->i_sb, NOBH))
1380 inode->i_mapping->a_ops = &ext2_nobh_aops; 1382 inode->i_mapping->a_ops = &ext2_nobh_aops;
1381 else 1383 else
1382 inode->i_mapping->a_ops = &ext2_aops; 1384 inode->i_mapping->a_ops = &ext2_aops;
1383 } else if (S_ISLNK(inode->i_mode)) { 1385 } else if (S_ISLNK(inode->i_mode)) {
1384 if (ext2_inode_is_fast_symlink(inode)) { 1386 if (ext2_inode_is_fast_symlink(inode)) {
1385 inode->i_op = &ext2_fast_symlink_inode_operations; 1387 inode->i_op = &ext2_fast_symlink_inode_operations;
1386 nd_terminate_link(ei->i_data, inode->i_size, 1388 nd_terminate_link(ei->i_data, inode->i_size,
1387 sizeof(ei->i_data) - 1); 1389 sizeof(ei->i_data) - 1);
1388 } else { 1390 } else {
1389 inode->i_op = &ext2_symlink_inode_operations; 1391 inode->i_op = &ext2_symlink_inode_operations;
1390 if (test_opt(inode->i_sb, NOBH)) 1392 if (test_opt(inode->i_sb, NOBH))
1391 inode->i_mapping->a_ops = &ext2_nobh_aops; 1393 inode->i_mapping->a_ops = &ext2_nobh_aops;
1392 else 1394 else
1393 inode->i_mapping->a_ops = &ext2_aops; 1395 inode->i_mapping->a_ops = &ext2_aops;
1394 } 1396 }
1395 } else { 1397 } else {
1396 inode->i_op = &ext2_special_inode_operations; 1398 inode->i_op = &ext2_special_inode_operations;
1397 if (raw_inode->i_block[0]) 1399 if (raw_inode->i_block[0])
1398 init_special_inode(inode, inode->i_mode, 1400 init_special_inode(inode, inode->i_mode,
1399 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 1401 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
1400 else 1402 else
1401 init_special_inode(inode, inode->i_mode, 1403 init_special_inode(inode, inode->i_mode,
1402 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 1404 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
1403 } 1405 }
1404 brelse (bh); 1406 brelse (bh);
1405 ext2_set_inode_flags(inode); 1407 ext2_set_inode_flags(inode);
1406 unlock_new_inode(inode); 1408 unlock_new_inode(inode);
1407 return inode; 1409 return inode;
1408 1410
1409 bad_inode: 1411 bad_inode:
1410 iget_failed(inode); 1412 iget_failed(inode);
1411 return ERR_PTR(ret); 1413 return ERR_PTR(ret);
1412 } 1414 }
1413 1415
1414 static int __ext2_write_inode(struct inode *inode, int do_sync) 1416 static int __ext2_write_inode(struct inode *inode, int do_sync)
1415 { 1417 {
1416 struct ext2_inode_info *ei = EXT2_I(inode); 1418 struct ext2_inode_info *ei = EXT2_I(inode);
1417 struct super_block *sb = inode->i_sb; 1419 struct super_block *sb = inode->i_sb;
1418 ino_t ino = inode->i_ino; 1420 ino_t ino = inode->i_ino;
1419 uid_t uid = inode->i_uid; 1421 uid_t uid = inode->i_uid;
1420 gid_t gid = inode->i_gid; 1422 gid_t gid = inode->i_gid;
1421 struct buffer_head * bh; 1423 struct buffer_head * bh;
1422 struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); 1424 struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh);
1423 int n; 1425 int n;
1424 int err = 0; 1426 int err = 0;
1425 1427
1426 if (IS_ERR(raw_inode)) 1428 if (IS_ERR(raw_inode))
1427 return -EIO; 1429 return -EIO;
1428 1430
1429 /* For fields not tracked in the in-memory inode, 1431 /* For fields not tracked in the in-memory inode,
1430 * initialise them to zero for new inodes. */ 1432 * initialise them to zero for new inodes. */
1431 if (ei->i_state & EXT2_STATE_NEW) 1433 if (ei->i_state & EXT2_STATE_NEW)
1432 memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); 1434 memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
1433 1435
1434 ext2_get_inode_flags(ei); 1436 ext2_get_inode_flags(ei);
1435 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 1437 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
1436 if (!(test_opt(sb, NO_UID32))) { 1438 if (!(test_opt(sb, NO_UID32))) {
1437 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); 1439 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
1438 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); 1440 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
1439 /* 1441 /*
1440 * Fix up interoperability with old kernels. Otherwise, old inodes get 1442 * Fix up interoperability with old kernels. Otherwise, old inodes get
1441 * re-used with the upper 16 bits of the uid/gid intact 1443 * re-used with the upper 16 bits of the uid/gid intact
1442 */ 1444 */
1443 if (!ei->i_dtime) { 1445 if (!ei->i_dtime) {
1444 raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid)); 1446 raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid));
1445 raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid)); 1447 raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid));
1446 } else { 1448 } else {
1447 raw_inode->i_uid_high = 0; 1449 raw_inode->i_uid_high = 0;
1448 raw_inode->i_gid_high = 0; 1450 raw_inode->i_gid_high = 0;
1449 } 1451 }
1450 } else { 1452 } else {
1451 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid)); 1453 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid));
1452 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid)); 1454 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid));
1453 raw_inode->i_uid_high = 0; 1455 raw_inode->i_uid_high = 0;
1454 raw_inode->i_gid_high = 0; 1456 raw_inode->i_gid_high = 0;
1455 } 1457 }
1456 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 1458 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
1457 raw_inode->i_size = cpu_to_le32(inode->i_size); 1459 raw_inode->i_size = cpu_to_le32(inode->i_size);
1458 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 1460 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
1459 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 1461 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1460 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 1462 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
1461 1463
1462 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 1464 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
1463 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 1465 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
1464 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 1466 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
1465 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); 1467 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
1466 raw_inode->i_frag = ei->i_frag_no; 1468 raw_inode->i_frag = ei->i_frag_no;
1467 raw_inode->i_fsize = ei->i_frag_size; 1469 raw_inode->i_fsize = ei->i_frag_size;
1468 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); 1470 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
1469 if (!S_ISREG(inode->i_mode)) 1471 if (!S_ISREG(inode->i_mode))
1470 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 1472 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
1471 else { 1473 else {
1472 raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32); 1474 raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32);
1473 if (inode->i_size > 0x7fffffffULL) { 1475 if (inode->i_size > 0x7fffffffULL) {
1474 if (!EXT2_HAS_RO_COMPAT_FEATURE(sb, 1476 if (!EXT2_HAS_RO_COMPAT_FEATURE(sb,
1475 EXT2_FEATURE_RO_COMPAT_LARGE_FILE) || 1477 EXT2_FEATURE_RO_COMPAT_LARGE_FILE) ||
1476 EXT2_SB(sb)->s_es->s_rev_level == 1478 EXT2_SB(sb)->s_es->s_rev_level ==
1477 cpu_to_le32(EXT2_GOOD_OLD_REV)) { 1479 cpu_to_le32(EXT2_GOOD_OLD_REV)) {
1478 /* If this is the first large file 1480 /* If this is the first large file
1479 * created, add a flag to the superblock. 1481 * created, add a flag to the superblock.
1480 */ 1482 */
1481 spin_lock(&EXT2_SB(sb)->s_lock); 1483 spin_lock(&EXT2_SB(sb)->s_lock);
1482 ext2_update_dynamic_rev(sb); 1484 ext2_update_dynamic_rev(sb);
1483 EXT2_SET_RO_COMPAT_FEATURE(sb, 1485 EXT2_SET_RO_COMPAT_FEATURE(sb,
1484 EXT2_FEATURE_RO_COMPAT_LARGE_FILE); 1486 EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
1485 spin_unlock(&EXT2_SB(sb)->s_lock); 1487 spin_unlock(&EXT2_SB(sb)->s_lock);
1486 ext2_write_super(sb); 1488 ext2_write_super(sb);
1487 } 1489 }
1488 } 1490 }
1489 } 1491 }
1490 1492
1491 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 1493 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
1492 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 1494 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1493 if (old_valid_dev(inode->i_rdev)) { 1495 if (old_valid_dev(inode->i_rdev)) {
1494 raw_inode->i_block[0] = 1496 raw_inode->i_block[0] =
1495 cpu_to_le32(old_encode_dev(inode->i_rdev)); 1497 cpu_to_le32(old_encode_dev(inode->i_rdev));
1496 raw_inode->i_block[1] = 0; 1498 raw_inode->i_block[1] = 0;
1497 } else { 1499 } else {
1498 raw_inode->i_block[0] = 0; 1500 raw_inode->i_block[0] = 0;
1499 raw_inode->i_block[1] = 1501 raw_inode->i_block[1] =
1500 cpu_to_le32(new_encode_dev(inode->i_rdev)); 1502 cpu_to_le32(new_encode_dev(inode->i_rdev));
1501 raw_inode->i_block[2] = 0; 1503 raw_inode->i_block[2] = 0;
1502 } 1504 }
1503 } else for (n = 0; n < EXT2_N_BLOCKS; n++) 1505 } else for (n = 0; n < EXT2_N_BLOCKS; n++)
1504 raw_inode->i_block[n] = ei->i_data[n]; 1506 raw_inode->i_block[n] = ei->i_data[n];
1505 mark_buffer_dirty(bh); 1507 mark_buffer_dirty(bh);
1506 if (do_sync) { 1508 if (do_sync) {
1507 sync_dirty_buffer(bh); 1509 sync_dirty_buffer(bh);
1508 if (buffer_req(bh) && !buffer_uptodate(bh)) { 1510 if (buffer_req(bh) && !buffer_uptodate(bh)) {
1509 printk ("IO error syncing ext2 inode [%s:%08lx]\n", 1511 printk ("IO error syncing ext2 inode [%s:%08lx]\n",
1510 sb->s_id, (unsigned long) ino); 1512 sb->s_id, (unsigned long) ino);
1511 err = -EIO; 1513 err = -EIO;
1512 } 1514 }
1513 } 1515 }
1514 ei->i_state &= ~EXT2_STATE_NEW; 1516 ei->i_state &= ~EXT2_STATE_NEW;
1515 brelse (bh); 1517 brelse (bh);
1516 return err; 1518 return err;
1517 } 1519 }
1518 1520
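The high/low split above keeps compatibility with revision-0 filesystems: old kernels only know the 16-bit i_uid_low/i_gid_low fields, so a 32-bit id is stored as two halves, and the high halves are zeroed once an inode is deleted so stale bits cannot leak into a reused slot. The split itself is plain masking and shifting; low_16_bits()/high_16_bits() are reimplemented here purely for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define low_16_bits(x)  ((x) & 0xFFFF)
    #define high_16_bits(x) (((x) & 0xFFFF0000) >> 16)

    int main(void)
    {
        uint32_t uid = 70000;            /* does not fit in 16 bits */

        printf("low=%u high=%u rebuilt=%u\n",
               (unsigned)low_16_bits(uid),
               (unsigned)high_16_bits(uid),
               (unsigned)((high_16_bits(uid) << 16) | low_16_bits(uid)));
        /* prints: low=4464 high=1 rebuilt=70000 */
        return 0;
    }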
1519 int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) 1521 int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1520 { 1522 {
1521 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1522 } 1524 }
1523 1525
1524 int ext2_setattr(struct dentry *dentry, struct iattr *iattr) 1526 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1525 { 1527 {
1526 struct inode *inode = dentry->d_inode; 1528 struct inode *inode = dentry->d_inode;
1527 int error; 1529 int error;
1528 1530
1529 error = inode_change_ok(inode, iattr); 1531 error = inode_change_ok(inode, iattr);
1530 if (error) 1532 if (error)
1531 return error; 1533 return error;
1532 1534
1533 if (is_quota_modification(inode, iattr)) 1535 if (is_quota_modification(inode, iattr))
1534 dquot_initialize(inode); 1536 dquot_initialize(inode);
1535 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 1537 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
1536 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 1538 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
1537 error = dquot_transfer(inode, iattr); 1539 error = dquot_transfer(inode, iattr);
1538 if (error) 1540 if (error)
1539 return error; 1541 return error;
1540 } 1542 }
1541 if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { 1543 if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
1542 error = ext2_setsize(inode, iattr->ia_size); 1544 error = ext2_setsize(inode, iattr->ia_size);
1543 if (error) 1545 if (error)
1544 return error; 1546 return error;
1545 } 1547 }
1546 setattr_copy(inode, iattr); 1548 setattr_copy(inode, iattr);
1547 if (iattr->ia_valid & ATTR_MODE) 1549 if (iattr->ia_valid & ATTR_MODE)
1548 error = ext2_acl_chmod(inode); 1550 error = ext2_acl_chmod(inode);
1549 mark_inode_dirty(inode); 1551 mark_inode_dirty(inode);
1550 1552
1551 return error; 1553 return error;
1552 } 1554 }
1553 1555
1 /* 1 /*
2 * linux/fs/ext3/inode.c 2 * linux/fs/ext3/inode.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/inode.c 11 * linux/fs/minix/inode.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * Goal-directed block allocation by Stephen Tweedie 15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998 16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz) 20 * (jj@sunsite.ms.mff.cuni.cz)
21 * 21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/time.h> 27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h> 28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h> 29 #include <linux/jbd.h>
30 #include <linux/highuid.h> 30 #include <linux/highuid.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h> 32 #include <linux/quotaops.h>
33 #include <linux/string.h> 33 #include <linux/string.h>
34 #include <linux/buffer_head.h> 34 #include <linux/buffer_head.h>
35 #include <linux/writeback.h> 35 #include <linux/writeback.h>
36 #include <linux/mpage.h> 36 #include <linux/mpage.h>
37 #include <linux/uio.h> 37 #include <linux/uio.h>
38 #include <linux/bio.h> 38 #include <linux/bio.h>
39 #include <linux/fiemap.h> 39 #include <linux/fiemap.h>
40 #include <linux/namei.h> 40 #include <linux/namei.h>
41 #include "xattr.h" 41 #include "xattr.h"
42 #include "acl.h" 42 #include "acl.h"
43 43
44 static int ext3_writepage_trans_blocks(struct inode *inode); 44 static int ext3_writepage_trans_blocks(struct inode *inode);
45 45
46 /* 46 /*
47 * Test whether an inode is a fast symlink. 47 * Test whether an inode is a fast symlink.
48 */ 48 */
49 static int ext3_inode_is_fast_symlink(struct inode *inode) 49 static int ext3_inode_is_fast_symlink(struct inode *inode)
50 { 50 {
51 int ea_blocks = EXT3_I(inode)->i_file_acl ? 51 int ea_blocks = EXT3_I(inode)->i_file_acl ?
52 (inode->i_sb->s_blocksize >> 9) : 0; 52 (inode->i_sb->s_blocksize >> 9) : 0;
53 53
54 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 54 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
55 } 55 }
56 56
57 /* 57 /*
58 * The ext3 forget function must perform a revoke if we are freeing data 58 * The ext3 forget function must perform a revoke if we are freeing data
59 * which has been journaled. Metadata (eg. indirect blocks) must be 59 * which has been journaled. Metadata (eg. indirect blocks) must be
60 * revoked in all cases. 60 * revoked in all cases.
61 * 61 *
62 * "bh" may be NULL: a metadata block may have been freed from memory 62 * "bh" may be NULL: a metadata block may have been freed from memory
63 * but there may still be a record of it in the journal, and that record 63 * but there may still be a record of it in the journal, and that record
64 * still needs to be revoked. 64 * still needs to be revoked.
65 */ 65 */
66 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, 66 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
67 struct buffer_head *bh, ext3_fsblk_t blocknr) 67 struct buffer_head *bh, ext3_fsblk_t blocknr)
68 { 68 {
69 int err; 69 int err;
70 70
71 might_sleep(); 71 might_sleep();
72 72
73 BUFFER_TRACE(bh, "enter"); 73 BUFFER_TRACE(bh, "enter");
74 74
75 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 75 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
76 "data mode %lx\n", 76 "data mode %lx\n",
77 bh, is_metadata, inode->i_mode, 77 bh, is_metadata, inode->i_mode,
78 test_opt(inode->i_sb, DATA_FLAGS)); 78 test_opt(inode->i_sb, DATA_FLAGS));
79 79
80 /* Never use the revoke function if we are doing full data 80 /* Never use the revoke function if we are doing full data
81 * journaling: there is no need to, and a V1 superblock won't 81 * journaling: there is no need to, and a V1 superblock won't
82 * support it. Otherwise, only skip the revoke on un-journaled 82 * support it. Otherwise, only skip the revoke on un-journaled
83 * data blocks. */ 83 * data blocks. */
84 84
85 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || 85 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
86 (!is_metadata && !ext3_should_journal_data(inode))) { 86 (!is_metadata && !ext3_should_journal_data(inode))) {
87 if (bh) { 87 if (bh) {
88 BUFFER_TRACE(bh, "call journal_forget"); 88 BUFFER_TRACE(bh, "call journal_forget");
89 return ext3_journal_forget(handle, bh); 89 return ext3_journal_forget(handle, bh);
90 } 90 }
91 return 0; 91 return 0;
92 } 92 }
93 93
94 /* 94 /*
95 * data!=journal && (is_metadata || should_journal_data(inode)) 95 * data!=journal && (is_metadata || should_journal_data(inode))
96 */ 96 */
97 BUFFER_TRACE(bh, "call ext3_journal_revoke"); 97 BUFFER_TRACE(bh, "call ext3_journal_revoke");
98 err = ext3_journal_revoke(handle, blocknr, bh); 98 err = ext3_journal_revoke(handle, blocknr, bh);
99 if (err) 99 if (err)
100 ext3_abort(inode->i_sb, __func__, 100 ext3_abort(inode->i_sb, __func__,
101 "error %d when attempting revoke", err); 101 "error %d when attempting revoke", err);
102 BUFFER_TRACE(bh, "exit"); 102 BUFFER_TRACE(bh, "exit");
103 return err; 103 return err;
104 } 104 }
105 105
106 /* 106 /*
107 * Work out how many blocks we need to proceed with the next chunk of a 107 * Work out how many blocks we need to proceed with the next chunk of a
108 * truncate transaction. 108 * truncate transaction.
109 */ 109 */
110 static unsigned long blocks_for_truncate(struct inode *inode) 110 static unsigned long blocks_for_truncate(struct inode *inode)
111 { 111 {
112 unsigned long needed; 112 unsigned long needed;
113 113
114 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 114 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
115 115
116 /* Give ourselves just enough room to cope with inodes in which 116 /* Give ourselves just enough room to cope with inodes in which
117 * i_blocks is corrupt: we've seen disk corruptions in the past 117 * i_blocks is corrupt: we've seen disk corruptions in the past
118 * which resulted in random data in an inode which looked enough 118 * which resulted in random data in an inode which looked enough
119 * like a regular file for ext3 to try to delete it. Things 119 * like a regular file for ext3 to try to delete it. Things
120 * will go a bit crazy if that happens, but at least we should 120 * will go a bit crazy if that happens, but at least we should
121 * try not to panic the whole kernel. */ 121 * try not to panic the whole kernel. */
122 if (needed < 2) 122 if (needed < 2)
123 needed = 2; 123 needed = 2;
124 124
125 /* But we need to bound the transaction so we don't overflow the 125 /* But we need to bound the transaction so we don't overflow the
126 * journal. */ 126 * journal. */
127 if (needed > EXT3_MAX_TRANS_DATA) 127 if (needed > EXT3_MAX_TRANS_DATA)
128 needed = EXT3_MAX_TRANS_DATA; 128 needed = EXT3_MAX_TRANS_DATA;
129 129
130 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 130 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
131 } 131 }
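/*
 * Worked example (illustrative, assuming a 4KB block size): with
 * s_blocksize_bits = 12 and i_blocks counted in 512-byte sectors, a 1MB
 * file has i_blocks = 2048, so needed = 2048 >> (12 - 9) = 256
 * filesystem blocks, which is then clamped to EXT3_MAX_TRANS_DATA.
 */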

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext3_journal_start(inode, blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext3_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
		return 0;
	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
	int ret;

	jbd_debug(2, "restarting handle %p\n", handle);
	/*
	 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle.
	 * At this moment, get_block can be called only for blocks inside
	 * i_size since page cache has been already dropped and writes are
	 * blocked by i_mutex.  So we can safely drop the truncate_mutex.
	 */
	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
	ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
	mutex_lock(&EXT3_I(inode)->truncate_mutex);
	return ret;
}

/*
 * Called at inode eviction from icache
 */
void ext3_evict_inode (struct inode *inode)
{
	struct ext3_block_alloc_info *rsv;
	handle_t *handle;
	int want_delete = 0;

	if (!inode->i_nlink && !is_bad_inode(inode)) {
		dquot_initialize(inode);
		want_delete = 1;
	}

	truncate_inode_pages(&inode->i_data, 0);

	ext3_discard_reservation(inode);
	rsv = EXT3_I(inode)->i_block_alloc_info;
	EXT3_I(inode)->i_block_alloc_info = NULL;
	if (unlikely(rsv))
		kfree(rsv);

	if (!want_delete)
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record which ext3_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext3_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext3_orphan_del(handle, inode);
	EXT3_I(inode)->i_dtime = get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext3_mark_inode_dirty(handle, inode)) {
		/* If that failed, just dquot_drop() and be done with that */
		dquot_drop(inode);
		end_writeback(inode);
	} else {
		ext3_xattr_delete_inode(handle, inode);
		dquot_free_inode(inode);
		dquot_drop(inode);
		end_writeback(inode);
		ext3_free_inode(handle, inode);
	}
	ext3_journal_stop(handle);
	return;
no_delete:
	end_writeback(inode);
	dquot_drop(inode);
}

typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}

/**
 * ext3_block_to_path - parse the block number into array of offsets
 * @inode: inode in question (we are only interested in its superblock)
 * @i_block: block number to be parsed
 * @offsets: array to store the offsets in
 * @boundary: set this non-zero if the referred-to block is likely to be
 *	followed (on disk) by an indirect block.
 *
 * To store the locations of a file's data, ext3 uses a data structure
 * common to UNIX filesystems - a tree of pointers anchored in the inode,
 * with data blocks at the leaves and indirect blocks in intermediate
 * nodes.  This function translates the block number into a path in that
 * tree - the return value is the path length and @offsets[n] is the
 * offset of the pointer to the (n+1)th node in the nth one.  If @block
 * is out of range (negative or too large), a warning is printed and
 * zero is returned.
 *
 * Note: function doesn't find node addresses, so no IO is needed.  All
 * we need to know is the capacity of indirect blocks (taken from the
 * inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext3_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT3_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT3_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT3_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT3_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}
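/*
 * Worked example (illustrative, assuming a 1KB block size, so
 * ptrs = EXT3_ADDR_PER_BLOCK = 256 and EXT3_NDIR_BLOCKS = 12):
 *
 *	block 11  -> offsets = {11},                    depth 1 (direct)
 *	block 12  -> offsets = {EXT3_IND_BLOCK, 0},     depth 2 (indirect)
 *	block 268 -> offsets = {EXT3_DIND_BLOCK, 0, 0}, depth 3
 *
 * since 268 = 12 direct slots + 256 indirect slots, i.e. the first
 * block reached through the double-indirect tree.
 */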

/**
 * ext3_get_branch - read the chain of indirect blocks leading to data
 * @inode: inode in question
 * @depth: depth of the chain (1 - direct pointer, etc.)
 * @offsets: offsets of pointers in inode/indirect blocks
 * @chain: place to store the result
 * @err: here we store the error value
 *
 * Function fills the array of triples <key, p, bh> and returns %NULL
 * if everything went OK or the pointer to the last filled triple
 * (incomplete one) otherwise.  Upon the return chain[i].key contains
 * the number of (i+1)-th block in the chain (as it is stored in memory,
 * i.e. little-endian 32-bit), chain[i].p contains the address of that
 * number (it points into struct inode for i==0 and into the bh->b_data
 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 * block for i>0 and NULL for i==0.  In other words, it holds the block
 * numbers of the chain, the addresses they were taken from (and where we
 * can verify that the chain did not change) and the buffer_heads hosting
 * these numbers.
 *
 * Function stops when it stumbles upon a zero pointer (absent block)
 *	(pointer to last triple returned, *@err == 0)
 * or when it gets an IO error reading an indirect block
 *	(ditto, *@err == -EIO)
 * or when it notices that the chain had been changed while it was reading
 *	(ditto, *@err == -EAGAIN)
 * or when it reads all @depth-1 indirect blocks successfully and finds
 * the whole chain, all the way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}
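/*
 * Illustrative example: for a depth-2 lookup with offsets = {12, 5}, a
 * fully mapped chain ends up as
 *
 *	chain[0] = { .p = &EXT3_I(inode)->i_data[12], .key = *p, .bh = NULL }
 *	chain[1] = { .p = (__le32 *)bh->b_data + 5,   .key = *p, .bh = bh  }
 *
 * and ext3_get_branch() returns NULL; a hole at either level instead
 * returns a pointer to the first triple whose key is zero.
 */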

/**
 * ext3_find_near - find a place for allocation with sufficient locality
 * @inode: owner
 * @ind: descriptor of indirect block.
 *
 * This function returns the preferred place for block allocation.
 * It is used when the heuristic for sequential allocation fails.
 * Rules are:
 *   + if there is a block to the left of our position - allocate near it.
 *   + if pointer will live in indirect block - allocate near that block.
 *   + if pointer will live in inode - allocate in the same
 *     cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 * Caller must make sure that @ind is valid and will stay that way.
 */
static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
	__le32 *p;
	ext3_fsblk_t bg_start;
	ext3_grpblk_t colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--) {
		if (*p)
			return le32_to_cpu(*p);
	}

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred to from the inode itself? OK, just put it
	 * into the same cylinder group then.
	 */
	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
	colour = (current->pid % 16) *
			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	return bg_start + colour;
}
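/*
 * Worked example (illustrative): with 8192 blocks per group the colour
 * stride is 8192 / 16 = 512 blocks, so a task with pid 1234
 * (1234 % 16 == 2) gets bg_start + 1024 as its allocation hint.
 */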

/**
 * ext3_find_goal - find a preferred place for allocation.
 * @inode: owner
 * @block: block we want
 * @partial: pointer to the last triple within a chain
 *
 * Normally this function finds the preferred place for block allocation
 * and returns it.
 */

static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
				   Indirect *partial)
{
	struct ext3_block_alloc_info *block_i;

	block_i = EXT3_I(inode)->i_block_alloc_info;

	/*
	 * try the heuristic for sequential allocation,
	 * failing that at least try to get decent locality.
	 */
	if (block_i && (block == block_i->last_alloc_logical_block + 1)
		&& (block_i->last_alloc_physical_block != 0)) {
		return block_i->last_alloc_physical_block + 1;
	}

	return ext3_find_near(inode, partial);
}

/**
 * ext3_blks_to_allocate - look up the block map and count the number
 * of direct blocks that need to be allocated for the given branch.
 *
 * @branch: chain of indirect blocks
 * @k: number of blocks needed for indirect blocks
 * @blks: number of data blocks to be mapped.
 * @blocks_to_boundary: the offset in the indirect block
 *
 * return the total number of blocks to be allocated, including the
 * direct and indirect blocks.
 */
static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
		int blocks_to_boundary)
{
	unsigned long count = 0;

	/*
	 * Simple case: the [t,d]indirect block(s) have not been allocated
	 * yet, so clearly no blocks on that path have been allocated either.
	 */
	if (k > 0) {
		/* right now we don't handle cross boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	count++;
	while (count < blks && count <= blocks_to_boundary &&
		le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}
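/*
 * Illustrative example: with blks = 8 and blocks_to_boundary = 5, a
 * branch that still needs indirect blocks (k > 0) yields
 * blocks_to_boundary + 1 = 6, i.e. allocation stops at the indirect
 * block boundary rather than crossing it.
 */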

/**
 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
 * @handle: handle for this transaction
 * @inode: owner
 * @goal: preferred place for allocation
 * @indirect_blks: the number of blocks that need to be allocated for
 *	indirect blocks
 * @blks: number of blocks that need to be allocated for direct blocks
 * @new_blocks: on return it will store the new block numbers for
 *	the indirect blocks (if needed) and the first direct block,
 * @err: here we store the error value
 *
 * return the number of direct blocks allocated
 */
static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
			ext3_fsblk_t goal, int indirect_blks, int blks,
			ext3_fsblk_t new_blocks[4], int *err)
{
	int target, i;
	unsigned long count = 0;
	int index = 0;
	ext3_fsblk_t current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks (if not allocated yet), and at least
	 * the first direct block of this branch.  That's the
	 * minimum number of blocks we need to allocate (required).
	 */
	target = blks + indirect_blks;

	while (1) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext3_new_blocks(handle, inode, goal, &count, err);
		if (*err)
			goto failed_out;

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}

		if (count > 0)
			break;
	}

	/* save the new block number for the first direct block */
	new_blocks[index] = current_block;

	/* total number of blocks allocated for direct blocks */
	ret = count;
	*err = 0;
	return ret;
failed_out:
	for (i = 0; i < index; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);
	return ret;
}

/**
 * ext3_alloc_branch - allocate and set up a chain of blocks.
 * @handle: handle for this transaction
 * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
 * @goal: preferred place for allocation
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates blocks, zeroes out all but the last one,
 * links them into chain and (if we are synchronous) writes them to disk.
 * In other words, it prepares a branch that can be spliced onto the
 * inode.  It stores the information about that chain in the branch[], in
 * the same format as ext3_get_branch() would do.  We are calling it after
 * we had read the existing part of chain and partial points to the last
 * triple of that (one with zero ->key).  Upon the exit we have the same
 * picture as after the successful ext3_get_block(), except that in one
 * place chain is disconnected - *branch->p is still zero (we did not
 * set the last link), but branch->key contains the number that should
 * be placed into *branch->p to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value from the failed
 * ext3_alloc_block() (normally -ENOSPC).  Otherwise we set the chain
 * as described above and return 0.
 */
static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
			int indirect_blks, int *blks, ext3_fsblk_t goal,
			int *offsets, Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int i, n = 0;
	int err = 0;
	struct buffer_head *bh;
	int num;
	ext3_fsblk_t new_blocks[4];
	ext3_fsblk_t current_block;

	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
				*blks, new_blocks, &err);
	if (err)
		return err;

	branch[0].key = cpu_to_le32(new_blocks[0]);
	/*
	 * metadata blocks and data blocks are allocated.
	 */
	for (n = 1; n <= indirect_blks; n++) {
		/*
		 * Get buffer_head for parent block, zero it out
		 * and set the pointer to new one, then send
		 * parent to disk.
		 */
		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
		branch[n].bh = bh;
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext3_journal_get_create_access(handle, bh);
		if (err) {
			unlock_buffer(bh);
			brelse(bh);
			goto failed;
		}

		memset(bh->b_data, 0, blocksize);
		branch[n].p = (__le32 *) bh->b_data + offsets[n];
		branch[n].key = cpu_to_le32(new_blocks[n]);
		*branch[n].p = branch[n].key;
		if ( n == indirect_blks) {
			current_block = new_blocks[n];
			/*
			 * End of chain, update the last new metablock of
			 * the chain to point to the new allocated
			 * data blocks numbers
			 */
			for (i=1; i < num; i++)
				*(branch[n].p + i) = cpu_to_le32(++current_block);
		}
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, bh);
		if (err)
			goto failed;
	}
	*blks = num;
	return err;
failed:
	/* Allocation failed, free what we already allocated */
	for (i = 1; i <= n; i++) {
		BUFFER_TRACE(branch[i].bh, "call journal_forget");
		ext3_journal_forget(handle, branch[i].bh);
	}
	for (i = 0; i < indirect_blks; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);

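	/*
	 * Note (added for clarity): after the loop above, i == indirect_blks,
	 * so new_blocks[i] is the first of the 'num' contiguous direct blocks
	 * returned by ext3_alloc_blocks(); free that whole run in one call.
	 */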
	ext3_free_blocks(handle, inode, new_blocks[i], num);

	return err;
}

/**
 * ext3_splice_branch - splice the allocated branch onto inode.
 * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @where: location of missing link
 * @num: number of indirect blocks we are adding
 * @blks: number of direct blocks we are adding
 *
 * This function fills the missing link and does all housekeeping needed in
 * inode (->i_blocks, etc.).  In case of success we end up with the full
 * chain to new block and return 0.
 */
static int ext3_splice_branch(handle_t *handle, struct inode *inode,
			long block, Indirect *where, int num, int blks)
{
	int i;
	int err = 0;
	struct ext3_block_alloc_info *block_i;
	ext3_fsblk_t current_block;
	struct ext3_inode_info *ei = EXT3_I(inode);

	block_i = ei->i_block_alloc_info;
	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;

	/*
	 * Update the host buffer_head or inode to point to the just-allocated
	 * direct blocks.
	 */
	if (num == 0 && blks > 1) {
		current_block = le32_to_cpu(where->key) + 1;
		for (i = 1; i < blks; i++)
			*(where->p + i) = cpu_to_le32(current_block++);
	}

	/*
	 * Update the most recently allocated logical & physical block
	 * in i_block_alloc_info, to assist in finding the proper goal block
	 * for the next allocation.
	 */
	if (block_i) {
		block_i->last_alloc_logical_block = block + blks - 1;
		block_i->last_alloc_physical_block =
				le32_to_cpu(where[num].key) + blks - 1;
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */

	inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);
	/* ext3_mark_inode_dirty already updated i_sync_tid */
	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);

	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i <= num; i++) {
		BUFFER_TRACE(where[i].bh, "call journal_forget");
		ext3_journal_forget(handle, where[i].bh);
		ext3_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
	}
	ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);

	return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 */
827 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, 827 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
828 sector_t iblock, unsigned long maxblocks, 828 sector_t iblock, unsigned long maxblocks,
829 struct buffer_head *bh_result, 829 struct buffer_head *bh_result,
830 int create) 830 int create)
831 { 831 {
832 int err = -EIO; 832 int err = -EIO;
833 int offsets[4]; 833 int offsets[4];
834 Indirect chain[4]; 834 Indirect chain[4];
835 Indirect *partial; 835 Indirect *partial;
836 ext3_fsblk_t goal; 836 ext3_fsblk_t goal;
837 int indirect_blks; 837 int indirect_blks;
838 int blocks_to_boundary = 0; 838 int blocks_to_boundary = 0;
839 int depth; 839 int depth;
840 struct ext3_inode_info *ei = EXT3_I(inode); 840 struct ext3_inode_info *ei = EXT3_I(inode);
841 int count = 0; 841 int count = 0;
842 ext3_fsblk_t first_block = 0; 842 ext3_fsblk_t first_block = 0;
843 843
844 844
845 J_ASSERT(handle != NULL || create == 0); 845 J_ASSERT(handle != NULL || create == 0);
846 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 846 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
847 847
848 if (depth == 0) 848 if (depth == 0)
849 goto out; 849 goto out;
850 850
851 partial = ext3_get_branch(inode, depth, offsets, chain, &err); 851 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
852 852
853 /* Simplest case - block found, no allocation needed */ 853 /* Simplest case - block found, no allocation needed */
854 if (!partial) { 854 if (!partial) {
855 first_block = le32_to_cpu(chain[depth - 1].key); 855 first_block = le32_to_cpu(chain[depth - 1].key);
856 clear_buffer_new(bh_result); 856 clear_buffer_new(bh_result);
857 count++; 857 count++;
858 /* map more blocks */ 858 /* map more blocks */
859 while (count < maxblocks && count <= blocks_to_boundary) { 859 while (count < maxblocks && count <= blocks_to_boundary) {
860 ext3_fsblk_t blk; 860 ext3_fsblk_t blk;
861 861
862 if (!verify_chain(chain, chain + depth - 1)) { 862 if (!verify_chain(chain, chain + depth - 1)) {
863 /* 863 /*
864 * Indirect block might be removed by 864 * Indirect block might be removed by
865 * truncate while we were reading it. 865 * truncate while we were reading it.
866 * Handling of that case: forget what we've 866 * Handling of that case: forget what we've
867 * got now. Flag the err as EAGAIN, so it 867 * got now. Flag the err as EAGAIN, so it
868 * will reread. 868 * will reread.
869 */ 869 */
870 err = -EAGAIN; 870 err = -EAGAIN;
871 count = 0; 871 count = 0;
872 break; 872 break;
873 } 873 }
874 blk = le32_to_cpu(*(chain[depth-1].p + count)); 874 blk = le32_to_cpu(*(chain[depth-1].p + count));
875 875
876 if (blk == first_block + count) 876 if (blk == first_block + count)
877 count++; 877 count++;
878 else 878 else
879 break; 879 break;
880 } 880 }
881 if (err != -EAGAIN) 881 if (err != -EAGAIN)
882 goto got_it; 882 goto got_it;
883 } 883 }
884 884
885 /* Next simple case - plain lookup or failed read of indirect block */ 885 /* Next simple case - plain lookup or failed read of indirect block */
886 if (!create || err == -EIO) 886 if (!create || err == -EIO)
887 goto cleanup; 887 goto cleanup;
888 888
889 mutex_lock(&ei->truncate_mutex); 889 mutex_lock(&ei->truncate_mutex);
890 890
891 /* 891 /*
892 * If the indirect block is missing while we are reading 892 * If the indirect block is missing while we are reading
893 * the chain (ext3_get_branch() returns the -EAGAIN error), or 893 * the chain (ext3_get_branch() returns the -EAGAIN error), or
894 * if the chain has been changed after we grabbed the mutex, 894 * if the chain has been changed after we grabbed the mutex,
895 * (either because another process truncated this branch, or 895 * (either because another process truncated this branch, or
896 * another get_block allocated this branch), re-grab the chain to see if 896 * another get_block allocated this branch), re-grab the chain to see if
897 * the requested block has been allocated or not. 897 * the requested block has been allocated or not.
898 * 898 *
899 * Since we already block the truncate/other get_block 899 * Since we already block the truncate/other get_block
900 * at this point, we will have the current copy of the chain when we 900 * at this point, we will have the current copy of the chain when we
901 * splice the branch into the tree. 901 * splice the branch into the tree.
902 */ 902 */
903 if (err == -EAGAIN || !verify_chain(chain, partial)) { 903 if (err == -EAGAIN || !verify_chain(chain, partial)) {
904 while (partial > chain) { 904 while (partial > chain) {
905 brelse(partial->bh); 905 brelse(partial->bh);
906 partial--; 906 partial--;
907 } 907 }
908 partial = ext3_get_branch(inode, depth, offsets, chain, &err); 908 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
909 if (!partial) { 909 if (!partial) {
910 count++; 910 count++;
911 mutex_unlock(&ei->truncate_mutex); 911 mutex_unlock(&ei->truncate_mutex);
912 if (err) 912 if (err)
913 goto cleanup; 913 goto cleanup;
914 clear_buffer_new(bh_result); 914 clear_buffer_new(bh_result);
915 goto got_it; 915 goto got_it;
916 } 916 }
917 } 917 }
918 918
919 /* 919 /*
920 * Okay, we need to do block allocation. Lazily initialize the block 920 * Okay, we need to do block allocation. Lazily initialize the block
921 * allocation info here if necessary 921 * allocation info here if necessary
922 */ 922 */
923 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) 923 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
924 ext3_init_block_alloc_info(inode); 924 ext3_init_block_alloc_info(inode);
925 925
926 goal = ext3_find_goal(inode, iblock, partial); 926 goal = ext3_find_goal(inode, iblock, partial);
927 927
928 /* the number of blocks we need to allocate for [d,t]indirect blocks */ 928 /* the number of blocks we need to allocate for [d,t]indirect blocks */
929 indirect_blks = (chain + depth) - partial - 1; 929 indirect_blks = (chain + depth) - partial - 1;
930 930
931 /* 931 /*
932 * Next look up the indirect map to count the total number of 932 * Next look up the indirect map to count the total number of
933 * direct blocks to allocate for this branch. 933 * direct blocks to allocate for this branch.
934 */ 934 */
935 count = ext3_blks_to_allocate(partial, indirect_blks, 935 count = ext3_blks_to_allocate(partial, indirect_blks,
936 maxblocks, blocks_to_boundary); 936 maxblocks, blocks_to_boundary);
937 /* 937 /*
938 * Block out ext3_truncate while we alter the tree 938 * Block out ext3_truncate while we alter the tree
939 */ 939 */
940 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, 940 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
941 offsets + (partial - chain), partial); 941 offsets + (partial - chain), partial);
942 942
943 /* 943 /*
944 * The ext3_splice_branch call will free and forget any buffers 944 * The ext3_splice_branch call will free and forget any buffers
945 * on the new chain if there is a failure, but that risks using 945 * on the new chain if there is a failure, but that risks using
946 * up transaction credits, especially for bitmaps where the 946 * up transaction credits, especially for bitmaps where the
947 * credits cannot be returned. Can we handle this somehow? We 947 * credits cannot be returned. Can we handle this somehow? We
948 * may need to return -EAGAIN upwards in the worst case. --sct 948 * may need to return -EAGAIN upwards in the worst case. --sct
949 */ 949 */
950 if (!err) 950 if (!err)
951 err = ext3_splice_branch(handle, inode, iblock, 951 err = ext3_splice_branch(handle, inode, iblock,
952 partial, indirect_blks, count); 952 partial, indirect_blks, count);
953 mutex_unlock(&ei->truncate_mutex); 953 mutex_unlock(&ei->truncate_mutex);
954 if (err) 954 if (err)
955 goto cleanup; 955 goto cleanup;
956 956
957 set_buffer_new(bh_result); 957 set_buffer_new(bh_result);
958 got_it: 958 got_it:
959 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 959 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
960 if (count > blocks_to_boundary) 960 if (count > blocks_to_boundary)
961 set_buffer_boundary(bh_result); 961 set_buffer_boundary(bh_result);
962 err = count; 962 err = count;
963 /* Clean up and exit */ 963 /* Clean up and exit */
964 partial = chain + depth - 1; /* the whole chain */ 964 partial = chain + depth - 1; /* the whole chain */
965 cleanup: 965 cleanup:
966 while (partial > chain) { 966 while (partial > chain) {
967 BUFFER_TRACE(partial->bh, "call brelse"); 967 BUFFER_TRACE(partial->bh, "call brelse");
968 brelse(partial->bh); 968 brelse(partial->bh);
969 partial--; 969 partial--;
970 } 970 }
971 BUFFER_TRACE(bh_result, "returned"); 971 BUFFER_TRACE(bh_result, "returned");
972 out: 972 out:
973 return err; 973 return err;
974 } 974 }
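The return convention documented above (> 0 blocks mapped, 0 for a hole, < 0 on error) is what every caller in this file has to decode. As a minimal, hypothetical sketch of such a caller -- not part of this patch, assuming ordinary kernel context, and using -ENOENT purely as an illustrative hole result:

static int example_lookup_block(struct inode *inode, sector_t lblk)
{
	struct buffer_head dummy;
	int ret;

	dummy.b_state = 0;
	dummy.b_blocknr = -1000;
	dummy.b_size = inode->i_sb->s_blocksize;

	/* handle == NULL with create == 0: plain lookup, no allocation */
	ret = ext3_get_blocks_handle(NULL, inode, lblk, 1, &dummy, 0);
	if (ret < 0)		/* error case */
		return ret;
	if (ret == 0)		/* plain lookup failed: a hole */
		return -ENOENT;
	return 0;		/* one block mapped, at dummy.b_blocknr */
}

Compare ext3_getblk() further down, which wraps exactly this calling pattern.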
975 975
976 /* Maximum number of blocks we map for direct IO at once. */ 976 /* Maximum number of blocks we map for direct IO at once. */
977 #define DIO_MAX_BLOCKS 4096 977 #define DIO_MAX_BLOCKS 4096
978 /* 978 /*
979 * Number of credits we need for writing DIO_MAX_BLOCKS: 979 * Number of credits we need for writing DIO_MAX_BLOCKS:
980 * We need sb + group descriptor + bitmap + inode -> 4 980 * We need sb + group descriptor + bitmap + inode -> 4
981 * For B blocks with A block pointers per block we need: 981 * For B blocks with A block pointers per block we need:
982 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). 982 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
983 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. 983 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
984 */ 984 */
985 #define DIO_CREDITS 25 985 #define DIO_CREDITS 25
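Working the comment's arithmetic through for the 1KB case makes the 25 concrete. A hypothetical standalone sketch (plain userspace C, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long A = 256;			/* block pointers per 1KB block */
	unsigned long B = 4096;			/* DIO_MAX_BLOCKS */
	unsigned long fixed = 4;		/* sb + group desc + bitmap + inode */
	unsigned long tind = 1;			/* one triple indirect block */
	unsigned long dind = B / A / A + 2;	/* 0 + 2 = 2 doubly indirect */
	unsigned long ind  = B / A + 2;		/* 16 + 2 = 18 indirect */

	printf("%lu\n", fixed + tind + dind + ind);	/* prints 25 */
	return 0;
}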
986 986
987 static int ext3_get_block(struct inode *inode, sector_t iblock, 987 static int ext3_get_block(struct inode *inode, sector_t iblock,
988 struct buffer_head *bh_result, int create) 988 struct buffer_head *bh_result, int create)
989 { 989 {
990 handle_t *handle = ext3_journal_current_handle(); 990 handle_t *handle = ext3_journal_current_handle();
991 int ret = 0, started = 0; 991 int ret = 0, started = 0;
992 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 992 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
993 993
994 if (create && !handle) { /* Direct IO write... */ 994 if (create && !handle) { /* Direct IO write... */
995 if (max_blocks > DIO_MAX_BLOCKS) 995 if (max_blocks > DIO_MAX_BLOCKS)
996 max_blocks = DIO_MAX_BLOCKS; 996 max_blocks = DIO_MAX_BLOCKS;
997 handle = ext3_journal_start(inode, DIO_CREDITS + 997 handle = ext3_journal_start(inode, DIO_CREDITS +
998 EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); 998 EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
999 if (IS_ERR(handle)) { 999 if (IS_ERR(handle)) {
1000 ret = PTR_ERR(handle); 1000 ret = PTR_ERR(handle);
1001 goto out; 1001 goto out;
1002 } 1002 }
1003 started = 1; 1003 started = 1;
1004 } 1004 }
1005 1005
1006 ret = ext3_get_blocks_handle(handle, inode, iblock, 1006 ret = ext3_get_blocks_handle(handle, inode, iblock,
1007 max_blocks, bh_result, create); 1007 max_blocks, bh_result, create);
1008 if (ret > 0) { 1008 if (ret > 0) {
1009 bh_result->b_size = (ret << inode->i_blkbits); 1009 bh_result->b_size = (ret << inode->i_blkbits);
1010 ret = 0; 1010 ret = 0;
1011 } 1011 }
1012 if (started) 1012 if (started)
1013 ext3_journal_stop(handle); 1013 ext3_journal_stop(handle);
1014 out: 1014 out:
1015 return ret; 1015 return ret;
1016 } 1016 }
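Note the in/out use of bh_result->b_size above: the request size goes in, and the size of the mapped extent comes back out. A hypothetical helper, not in the tree, illustrating the contract the direct-IO machinery relies on:

static int example_probe_extent(struct inode *inode, sector_t lblk,
				unsigned nblocks)
{
	struct buffer_head bh;
	int ret;

	memset(&bh, 0, sizeof(bh));
	bh.b_size = nblocks << inode->i_blkbits;	/* request size in */
	ret = ext3_get_block(inode, lblk, &bh, 0);	/* lookup only */
	if (ret < 0)
		return ret;
	if (!buffer_mapped(&bh))
		return 0;				/* a hole */
	return bh.b_size >> inode->i_blkbits;		/* mapped size out */
}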
1017 1017
1018 int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1018 int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1019 u64 start, u64 len) 1019 u64 start, u64 len)
1020 { 1020 {
1021 return generic_block_fiemap(inode, fieinfo, start, len, 1021 return generic_block_fiemap(inode, fieinfo, start, len,
1022 ext3_get_block); 1022 ext3_get_block);
1023 } 1023 }
1024 1024
1025 /* 1025 /*
1026 * `handle' can be NULL if create is zero 1026 * `handle' can be NULL if create is zero
1027 */ 1027 */
1028 struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, 1028 struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1029 long block, int create, int *errp) 1029 long block, int create, int *errp)
1030 { 1030 {
1031 struct buffer_head dummy; 1031 struct buffer_head dummy;
1032 int fatal = 0, err; 1032 int fatal = 0, err;
1033 1033
1034 J_ASSERT(handle != NULL || create == 0); 1034 J_ASSERT(handle != NULL || create == 0);
1035 1035
1036 dummy.b_state = 0; 1036 dummy.b_state = 0;
1037 dummy.b_blocknr = -1000; 1037 dummy.b_blocknr = -1000;
1038 buffer_trace_init(&dummy.b_history); 1038 buffer_trace_init(&dummy.b_history);
1039 err = ext3_get_blocks_handle(handle, inode, block, 1, 1039 err = ext3_get_blocks_handle(handle, inode, block, 1,
1040 &dummy, create); 1040 &dummy, create);
1041 /* 1041 /*
1042 * ext3_get_blocks_handle() returns number of blocks 1042 * ext3_get_blocks_handle() returns number of blocks
1043 * mapped. 0 in case of a HOLE. 1043 * mapped. 0 in case of a HOLE.
1044 */ 1044 */
1045 if (err > 0) { 1045 if (err > 0) {
1046 if (err > 1) 1046 if (err > 1)
1047 WARN_ON(1); 1047 WARN_ON(1);
1048 err = 0; 1048 err = 0;
1049 } 1049 }
1050 *errp = err; 1050 *errp = err;
1051 if (!err && buffer_mapped(&dummy)) { 1051 if (!err && buffer_mapped(&dummy)) {
1052 struct buffer_head *bh; 1052 struct buffer_head *bh;
1053 bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 1053 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1054 if (!bh) { 1054 if (!bh) {
1055 *errp = -EIO; 1055 *errp = -EIO;
1056 goto err; 1056 goto err;
1057 } 1057 }
1058 if (buffer_new(&dummy)) { 1058 if (buffer_new(&dummy)) {
1059 J_ASSERT(create != 0); 1059 J_ASSERT(create != 0);
1060 J_ASSERT(handle != NULL); 1060 J_ASSERT(handle != NULL);
1061 1061
1062 /* 1062 /*
1063 * Now that we do not always journal data, we should 1063 * Now that we do not always journal data, we should
1064 * keep in mind whether this should always journal the 1064 * keep in mind whether this should always journal the
1065 * new buffer as metadata. For now, regular file 1065 * new buffer as metadata. For now, regular file
1066 * writes use ext3_get_block instead, so it's not a 1066 * writes use ext3_get_block instead, so it's not a
1067 * problem. 1067 * problem.
1068 */ 1068 */
1069 lock_buffer(bh); 1069 lock_buffer(bh);
1070 BUFFER_TRACE(bh, "call get_create_access"); 1070 BUFFER_TRACE(bh, "call get_create_access");
1071 fatal = ext3_journal_get_create_access(handle, bh); 1071 fatal = ext3_journal_get_create_access(handle, bh);
1072 if (!fatal && !buffer_uptodate(bh)) { 1072 if (!fatal && !buffer_uptodate(bh)) {
1073 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1073 memset(bh->b_data,0,inode->i_sb->s_blocksize);
1074 set_buffer_uptodate(bh); 1074 set_buffer_uptodate(bh);
1075 } 1075 }
1076 unlock_buffer(bh); 1076 unlock_buffer(bh);
1077 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1077 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1078 err = ext3_journal_dirty_metadata(handle, bh); 1078 err = ext3_journal_dirty_metadata(handle, bh);
1079 if (!fatal) 1079 if (!fatal)
1080 fatal = err; 1080 fatal = err;
1081 } else { 1081 } else {
1082 BUFFER_TRACE(bh, "not a new buffer"); 1082 BUFFER_TRACE(bh, "not a new buffer");
1083 } 1083 }
1084 if (fatal) { 1084 if (fatal) {
1085 *errp = fatal; 1085 *errp = fatal;
1086 brelse(bh); 1086 brelse(bh);
1087 bh = NULL; 1087 bh = NULL;
1088 } 1088 }
1089 return bh; 1089 return bh;
1090 } 1090 }
1091 err: 1091 err:
1092 return NULL; 1092 return NULL;
1093 } 1093 }
1094 1094
1095 struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, 1095 struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1096 int block, int create, int *err) 1096 int block, int create, int *err)
1097 { 1097 {
1098 struct buffer_head * bh; 1098 struct buffer_head * bh;
1099 1099
1100 bh = ext3_getblk(handle, inode, block, create, err); 1100 bh = ext3_getblk(handle, inode, block, create, err);
1101 if (!bh) 1101 if (!bh)
1102 return bh; 1102 return bh;
1103 if (buffer_uptodate(bh)) 1103 if (buffer_uptodate(bh))
1104 return bh; 1104 return bh;
1105 ll_rw_block(READ_META, 1, &bh); 1105 ll_rw_block(READ_META, 1, &bh);
1106 wait_on_buffer(bh); 1106 wait_on_buffer(bh);
1107 if (buffer_uptodate(bh)) 1107 if (buffer_uptodate(bh))
1108 return bh; 1108 return bh;
1109 put_bh(bh); 1109 put_bh(bh);
1110 *err = -EIO; 1110 *err = -EIO;
1111 return NULL; 1111 return NULL;
1112 } 1112 }
1113 1113
1114 static int walk_page_buffers( handle_t *handle, 1114 static int walk_page_buffers( handle_t *handle,
1115 struct buffer_head *head, 1115 struct buffer_head *head,
1116 unsigned from, 1116 unsigned from,
1117 unsigned to, 1117 unsigned to,
1118 int *partial, 1118 int *partial,
1119 int (*fn)( handle_t *handle, 1119 int (*fn)( handle_t *handle,
1120 struct buffer_head *bh)) 1120 struct buffer_head *bh))
1121 { 1121 {
1122 struct buffer_head *bh; 1122 struct buffer_head *bh;
1123 unsigned block_start, block_end; 1123 unsigned block_start, block_end;
1124 unsigned blocksize = head->b_size; 1124 unsigned blocksize = head->b_size;
1125 int err, ret = 0; 1125 int err, ret = 0;
1126 struct buffer_head *next; 1126 struct buffer_head *next;
1127 1127
1128 for ( bh = head, block_start = 0; 1128 for ( bh = head, block_start = 0;
1129 ret == 0 && (bh != head || !block_start); 1129 ret == 0 && (bh != head || !block_start);
1130 block_start = block_end, bh = next) 1130 block_start = block_end, bh = next)
1131 { 1131 {
1132 next = bh->b_this_page; 1132 next = bh->b_this_page;
1133 block_end = block_start + blocksize; 1133 block_end = block_start + blocksize;
1134 if (block_end <= from || block_start >= to) { 1134 if (block_end <= from || block_start >= to) {
1135 if (partial && !buffer_uptodate(bh)) 1135 if (partial && !buffer_uptodate(bh))
1136 *partial = 1; 1136 *partial = 1;
1137 continue; 1137 continue;
1138 } 1138 }
1139 err = (*fn)(handle, bh); 1139 err = (*fn)(handle, bh);
1140 if (!ret) 1140 if (!ret)
1141 ret = err; 1141 ret = err;
1142 } 1142 }
1143 return ret; 1143 return ret;
1144 } 1144 }
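Because the loop above stops at the first nonzero return from *fn, a boolean callback turns walk_page_buffers() into an "any buffer matches?" test; that is exactly how buffer_unmapped() is used further down. A hypothetical illustration along the same lines:

static int buffer_is_dirty_fn(handle_t *handle, struct buffer_head *bh)
{
	return buffer_dirty(bh);
}

static int example_page_has_dirty_buffers(struct page *page)
{
	/* returns 1 at the first dirty buffer, 0 if none are dirty */
	return walk_page_buffers(NULL, page_buffers(page), 0,
				 PAGE_CACHE_SIZE, NULL, buffer_is_dirty_fn);
}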
1145 1145
1146 /* 1146 /*
1147 * To preserve ordering, it is essential that the hole instantiation and 1147 * To preserve ordering, it is essential that the hole instantiation and
1148 * the data write be encapsulated in a single transaction. We cannot 1148 * the data write be encapsulated in a single transaction. We cannot
1149 * close off a transaction and start a new one between the ext3_get_block() 1149 * close off a transaction and start a new one between the ext3_get_block()
1150 * and the commit_write(). So doing the journal_start at the start of 1150 * and the commit_write(). So doing the journal_start at the start of
1151 * prepare_write() is the right place. 1151 * prepare_write() is the right place.
1152 * 1152 *
1153 * Also, this function can nest inside ext3_writepage() -> 1153 * Also, this function can nest inside ext3_writepage() ->
1154 * block_write_full_page(). In that case, we *know* that ext3_writepage() 1154 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1155 * has generated enough buffer credits to do the whole page. So we won't 1155 * has generated enough buffer credits to do the whole page. So we won't
1156 * block on the journal in that case, which is good, because the caller may 1156 * block on the journal in that case, which is good, because the caller may
1157 * be PF_MEMALLOC. 1157 * be PF_MEMALLOC.
1158 * 1158 *
1159 * By accident, ext3 can be reentered when a transaction is open via 1159 * By accident, ext3 can be reentered when a transaction is open via
1160 * quota file writes. If we were to commit the transaction while thus 1160 * quota file writes. If we were to commit the transaction while thus
1161 * reentered, there can be a deadlock - we would be holding a quota 1161 * reentered, there can be a deadlock - we would be holding a quota
1162 * lock, and the commit would never complete if another thread had a 1162 * lock, and the commit would never complete if another thread had a
1163 * transaction open and was blocking on the quota lock - a ranking 1163 * transaction open and was blocking on the quota lock - a ranking
1164 * violation. 1164 * violation.
1165 * 1165 *
1166 * So what we do is to rely on the fact that journal_stop/journal_start 1166 * So what we do is to rely on the fact that journal_stop/journal_start
1167 * will _not_ run commit under these circumstances because handle->h_ref 1167 * will _not_ run commit under these circumstances because handle->h_ref
1168 * is elevated. We'll still have enough credits for the tiny quotafile 1168 * is elevated. We'll still have enough credits for the tiny quotafile
1169 * write. 1169 * write.
1170 */ 1170 */
1171 static int do_journal_get_write_access(handle_t *handle, 1171 static int do_journal_get_write_access(handle_t *handle,
1172 struct buffer_head *bh) 1172 struct buffer_head *bh)
1173 { 1173 {
1174 int dirty = buffer_dirty(bh); 1174 int dirty = buffer_dirty(bh);
1175 int ret; 1175 int ret;
1176 1176
1177 if (!buffer_mapped(bh) || buffer_freed(bh)) 1177 if (!buffer_mapped(bh) || buffer_freed(bh))
1178 return 0; 1178 return 0;
1179 /* 1179 /*
1180 * __block_prepare_write() could have dirtied some buffers. Clean 1180 * __block_prepare_write() could have dirtied some buffers. Clean
1181 * the dirty bit as jbd2_journal_get_write_access() could complain 1181 * the dirty bit as jbd2_journal_get_write_access() could complain
1182 * otherwise about fs integrity issues. Setting of the dirty bit 1182 * otherwise about fs integrity issues. Setting of the dirty bit
1183 * by __block_prepare_write() isn't a real problem here as we clear 1183 * by __block_prepare_write() isn't a real problem here as we clear
1184 * the bit before releasing a page lock and thus writeback cannot 1184 * the bit before releasing a page lock and thus writeback cannot
1185 * ever write the buffer. 1185 * ever write the buffer.
1186 */ 1186 */
1187 if (dirty) 1187 if (dirty)
1188 clear_buffer_dirty(bh); 1188 clear_buffer_dirty(bh);
1189 ret = ext3_journal_get_write_access(handle, bh); 1189 ret = ext3_journal_get_write_access(handle, bh);
1190 if (!ret && dirty) 1190 if (!ret && dirty)
1191 ret = ext3_journal_dirty_metadata(handle, bh); 1191 ret = ext3_journal_dirty_metadata(handle, bh);
1192 return ret; 1192 return ret;
1193 } 1193 }
1194 1194
1195 /* 1195 /*
1196 * Truncate blocks that were not used by write. We have to truncate the 1196 * Truncate blocks that were not used by write. We have to truncate the
1197 * pagecache as well so that corresponding buffers get properly unmapped. 1197 * pagecache as well so that corresponding buffers get properly unmapped.
1198 */ 1198 */
1199 static void ext3_truncate_failed_write(struct inode *inode) 1199 static void ext3_truncate_failed_write(struct inode *inode)
1200 { 1200 {
1201 truncate_inode_pages(inode->i_mapping, inode->i_size); 1201 truncate_inode_pages(inode->i_mapping, inode->i_size);
1202 ext3_truncate(inode); 1202 ext3_truncate(inode);
1203 } 1203 }
1204 1204
1205 static int ext3_write_begin(struct file *file, struct address_space *mapping, 1205 static int ext3_write_begin(struct file *file, struct address_space *mapping,
1206 loff_t pos, unsigned len, unsigned flags, 1206 loff_t pos, unsigned len, unsigned flags,
1207 struct page **pagep, void **fsdata) 1207 struct page **pagep, void **fsdata)
1208 { 1208 {
1209 struct inode *inode = mapping->host; 1209 struct inode *inode = mapping->host;
1210 int ret; 1210 int ret;
1211 handle_t *handle; 1211 handle_t *handle;
1212 int retries = 0; 1212 int retries = 0;
1213 struct page *page; 1213 struct page *page;
1214 pgoff_t index; 1214 pgoff_t index;
1215 unsigned from, to; 1215 unsigned from, to;
1216 /* Reserve one block more for addition to orphan list in case 1216 /* Reserve one block more for addition to orphan list in case
1217 * we allocate blocks but write fails for some reason */ 1217 * we allocate blocks but write fails for some reason */
1218 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; 1218 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
1219 1219
1220 index = pos >> PAGE_CACHE_SHIFT; 1220 index = pos >> PAGE_CACHE_SHIFT;
1221 from = pos & (PAGE_CACHE_SIZE - 1); 1221 from = pos & (PAGE_CACHE_SIZE - 1);
1222 to = from + len; 1222 to = from + len;
1223 1223
1224 retry: 1224 retry:
1225 page = grab_cache_page_write_begin(mapping, index, flags); 1225 page = grab_cache_page_write_begin(mapping, index, flags);
1226 if (!page) 1226 if (!page)
1227 return -ENOMEM; 1227 return -ENOMEM;
1228 *pagep = page; 1228 *pagep = page;
1229 1229
1230 handle = ext3_journal_start(inode, needed_blocks); 1230 handle = ext3_journal_start(inode, needed_blocks);
1231 if (IS_ERR(handle)) { 1231 if (IS_ERR(handle)) {
1232 unlock_page(page); 1232 unlock_page(page);
1233 page_cache_release(page); 1233 page_cache_release(page);
1234 ret = PTR_ERR(handle); 1234 ret = PTR_ERR(handle);
1235 goto out; 1235 goto out;
1236 } 1236 }
1237 ret = __block_write_begin(page, pos, len, ext3_get_block); 1237 ret = __block_write_begin(page, pos, len, ext3_get_block);
1238 if (ret) 1238 if (ret)
1239 goto write_begin_failed; 1239 goto write_begin_failed;
1240 1240
1241 if (ext3_should_journal_data(inode)) { 1241 if (ext3_should_journal_data(inode)) {
1242 ret = walk_page_buffers(handle, page_buffers(page), 1242 ret = walk_page_buffers(handle, page_buffers(page),
1243 from, to, NULL, do_journal_get_write_access); 1243 from, to, NULL, do_journal_get_write_access);
1244 } 1244 }
1245 write_begin_failed: 1245 write_begin_failed:
1246 if (ret) { 1246 if (ret) {
1247 /* 1247 /*
1248 * block_write_begin may have instantiated a few blocks 1248 * block_write_begin may have instantiated a few blocks
1249 * outside i_size. Trim these off again. Don't need 1249 * outside i_size. Trim these off again. Don't need
1250 * i_size_read because we hold i_mutex. 1250 * i_size_read because we hold i_mutex.
1251 * 1251 *
1252 * Add inode to orphan list in case we crash before truncate 1252 * Add inode to orphan list in case we crash before truncate
1253 * finishes. Do this only if ext3_can_truncate() agrees so 1253 * finishes. Do this only if ext3_can_truncate() agrees so
1254 * that orphan processing code is happy. 1254 * that orphan processing code is happy.
1255 */ 1255 */
1256 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1256 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1257 ext3_orphan_add(handle, inode); 1257 ext3_orphan_add(handle, inode);
1258 ext3_journal_stop(handle); 1258 ext3_journal_stop(handle);
1259 unlock_page(page); 1259 unlock_page(page);
1260 page_cache_release(page); 1260 page_cache_release(page);
1261 if (pos + len > inode->i_size) 1261 if (pos + len > inode->i_size)
1262 ext3_truncate_failed_write(inode); 1262 ext3_truncate_failed_write(inode);
1263 } 1263 }
1264 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1264 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1265 goto retry; 1265 goto retry;
1266 out: 1266 out:
1267 return ret; 1267 return ret;
1268 } 1268 }
1269 1269
1270 1270
1271 int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) 1271 int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1272 { 1272 {
1273 int err = journal_dirty_data(handle, bh); 1273 int err = journal_dirty_data(handle, bh);
1274 if (err) 1274 if (err)
1275 ext3_journal_abort_handle(__func__, __func__, 1275 ext3_journal_abort_handle(__func__, __func__,
1276 bh, handle, err); 1276 bh, handle, err);
1277 return err; 1277 return err;
1278 } 1278 }
1279 1279
1280 /* For ordered writepage and write_end functions */ 1280 /* For ordered writepage and write_end functions */
1281 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) 1281 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1282 { 1282 {
1283 /* 1283 /*
1284 * Write could have mapped the buffer but it didn't copy the data in 1284 * Write could have mapped the buffer but it didn't copy the data in
1285 * yet. So avoid filing such a buffer into a transaction. 1285 * yet. So avoid filing such a buffer into a transaction.
1286 */ 1286 */
1287 if (buffer_mapped(bh) && buffer_uptodate(bh)) 1287 if (buffer_mapped(bh) && buffer_uptodate(bh))
1288 return ext3_journal_dirty_data(handle, bh); 1288 return ext3_journal_dirty_data(handle, bh);
1289 return 0; 1289 return 0;
1290 } 1290 }
1291 1291
1292 /* For write_end() in data=journal mode */ 1292 /* For write_end() in data=journal mode */
1293 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1293 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1294 { 1294 {
1295 if (!buffer_mapped(bh) || buffer_freed(bh)) 1295 if (!buffer_mapped(bh) || buffer_freed(bh))
1296 return 0; 1296 return 0;
1297 set_buffer_uptodate(bh); 1297 set_buffer_uptodate(bh);
1298 return ext3_journal_dirty_metadata(handle, bh); 1298 return ext3_journal_dirty_metadata(handle, bh);
1299 } 1299 }
1300 1300
1301 /* 1301 /*
1302 * This is nasty and subtle: ext3_write_begin() could have allocated blocks 1302 * This is nasty and subtle: ext3_write_begin() could have allocated blocks
1303 * for the whole page but later we failed to copy the data in. Update inode 1303 * for the whole page but later we failed to copy the data in. Update inode
1304 * size according to what we managed to copy. The rest is going to be 1304 * size according to what we managed to copy. The rest is going to be
1305 * truncated in write_end function. 1305 * truncated in write_end function.
1306 */ 1306 */
1307 static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) 1307 static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
1308 { 1308 {
1309 /* What matters to us is i_disksize. We don't write i_size anywhere */ 1309 /* What matters to us is i_disksize. We don't write i_size anywhere */
1310 if (pos + copied > inode->i_size) 1310 if (pos + copied > inode->i_size)
1311 i_size_write(inode, pos + copied); 1311 i_size_write(inode, pos + copied);
1312 if (pos + copied > EXT3_I(inode)->i_disksize) { 1312 if (pos + copied > EXT3_I(inode)->i_disksize) {
1313 EXT3_I(inode)->i_disksize = pos + copied; 1313 EXT3_I(inode)->i_disksize = pos + copied;
1314 mark_inode_dirty(inode); 1314 mark_inode_dirty(inode);
1315 } 1315 }
1316 } 1316 }
1317 1317
1318 /* 1318 /*
1319 * We need to pick up the new inode size which generic_commit_write gave us 1319 * We need to pick up the new inode size which generic_commit_write gave us
1320 * `file' can be NULL - eg, when called from page_symlink(). 1320 * `file' can be NULL - eg, when called from page_symlink().
1321 * 1321 *
1322 * ext3 never places buffers on inode->i_mapping->private_list. Metadata 1322 * ext3 never places buffers on inode->i_mapping->private_list. Metadata
1323 * buffers are managed internally. 1323 * buffers are managed internally.
1324 */ 1324 */
1325 static int ext3_ordered_write_end(struct file *file, 1325 static int ext3_ordered_write_end(struct file *file,
1326 struct address_space *mapping, 1326 struct address_space *mapping,
1327 loff_t pos, unsigned len, unsigned copied, 1327 loff_t pos, unsigned len, unsigned copied,
1328 struct page *page, void *fsdata) 1328 struct page *page, void *fsdata)
1329 { 1329 {
1330 handle_t *handle = ext3_journal_current_handle(); 1330 handle_t *handle = ext3_journal_current_handle();
1331 struct inode *inode = file->f_mapping->host; 1331 struct inode *inode = file->f_mapping->host;
1332 unsigned from, to; 1332 unsigned from, to;
1333 int ret = 0, ret2; 1333 int ret = 0, ret2;
1334 1334
1335 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1335 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1336 1336
1337 from = pos & (PAGE_CACHE_SIZE - 1); 1337 from = pos & (PAGE_CACHE_SIZE - 1);
1338 to = from + copied; 1338 to = from + copied;
1339 ret = walk_page_buffers(handle, page_buffers(page), 1339 ret = walk_page_buffers(handle, page_buffers(page),
1340 from, to, NULL, journal_dirty_data_fn); 1340 from, to, NULL, journal_dirty_data_fn);
1341 1341
1342 if (ret == 0) 1342 if (ret == 0)
1343 update_file_sizes(inode, pos, copied); 1343 update_file_sizes(inode, pos, copied);
1344 /* 1344 /*
1345 * There may be allocated blocks outside of i_size because 1345 * There may be allocated blocks outside of i_size because
1346 * we failed to copy some data. Prepare for truncate. 1346 * we failed to copy some data. Prepare for truncate.
1347 */ 1347 */
1348 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1348 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1349 ext3_orphan_add(handle, inode); 1349 ext3_orphan_add(handle, inode);
1350 ret2 = ext3_journal_stop(handle); 1350 ret2 = ext3_journal_stop(handle);
1351 if (!ret) 1351 if (!ret)
1352 ret = ret2; 1352 ret = ret2;
1353 unlock_page(page); 1353 unlock_page(page);
1354 page_cache_release(page); 1354 page_cache_release(page);
1355 1355
1356 if (pos + len > inode->i_size) 1356 if (pos + len > inode->i_size)
1357 ext3_truncate_failed_write(inode); 1357 ext3_truncate_failed_write(inode);
1358 return ret ? ret : copied; 1358 return ret ? ret : copied;
1359 } 1359 }
1360 1360
1361 static int ext3_writeback_write_end(struct file *file, 1361 static int ext3_writeback_write_end(struct file *file,
1362 struct address_space *mapping, 1362 struct address_space *mapping,
1363 loff_t pos, unsigned len, unsigned copied, 1363 loff_t pos, unsigned len, unsigned copied,
1364 struct page *page, void *fsdata) 1364 struct page *page, void *fsdata)
1365 { 1365 {
1366 handle_t *handle = ext3_journal_current_handle(); 1366 handle_t *handle = ext3_journal_current_handle();
1367 struct inode *inode = file->f_mapping->host; 1367 struct inode *inode = file->f_mapping->host;
1368 int ret; 1368 int ret;
1369 1369
1370 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1370 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1371 update_file_sizes(inode, pos, copied); 1371 update_file_sizes(inode, pos, copied);
1372 /* 1372 /*
1373 * There may be allocated blocks outside of i_size because 1373 * There may be allocated blocks outside of i_size because
1374 * we failed to copy some data. Prepare for truncate. 1374 * we failed to copy some data. Prepare for truncate.
1375 */ 1375 */
1376 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1376 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1377 ext3_orphan_add(handle, inode); 1377 ext3_orphan_add(handle, inode);
1378 ret = ext3_journal_stop(handle); 1378 ret = ext3_journal_stop(handle);
1379 unlock_page(page); 1379 unlock_page(page);
1380 page_cache_release(page); 1380 page_cache_release(page);
1381 1381
1382 if (pos + len > inode->i_size) 1382 if (pos + len > inode->i_size)
1383 ext3_truncate_failed_write(inode); 1383 ext3_truncate_failed_write(inode);
1384 return ret ? ret : copied; 1384 return ret ? ret : copied;
1385 } 1385 }
1386 1386
1387 static int ext3_journalled_write_end(struct file *file, 1387 static int ext3_journalled_write_end(struct file *file,
1388 struct address_space *mapping, 1388 struct address_space *mapping,
1389 loff_t pos, unsigned len, unsigned copied, 1389 loff_t pos, unsigned len, unsigned copied,
1390 struct page *page, void *fsdata) 1390 struct page *page, void *fsdata)
1391 { 1391 {
1392 handle_t *handle = ext3_journal_current_handle(); 1392 handle_t *handle = ext3_journal_current_handle();
1393 struct inode *inode = mapping->host; 1393 struct inode *inode = mapping->host;
1394 int ret = 0, ret2; 1394 int ret = 0, ret2;
1395 int partial = 0; 1395 int partial = 0;
1396 unsigned from, to; 1396 unsigned from, to;
1397 1397
1398 from = pos & (PAGE_CACHE_SIZE - 1); 1398 from = pos & (PAGE_CACHE_SIZE - 1);
1399 to = from + len; 1399 to = from + len;
1400 1400
1401 if (copied < len) { 1401 if (copied < len) {
1402 if (!PageUptodate(page)) 1402 if (!PageUptodate(page))
1403 copied = 0; 1403 copied = 0;
1404 page_zero_new_buffers(page, from + copied, to); 1404 page_zero_new_buffers(page, from + copied, to);
1405 to = from + copied; 1405 to = from + copied;
1406 } 1406 }
1407 1407
1408 ret = walk_page_buffers(handle, page_buffers(page), from, 1408 ret = walk_page_buffers(handle, page_buffers(page), from,
1409 to, &partial, write_end_fn); 1409 to, &partial, write_end_fn);
1410 if (!partial) 1410 if (!partial)
1411 SetPageUptodate(page); 1411 SetPageUptodate(page);
1412 1412
1413 if (pos + copied > inode->i_size) 1413 if (pos + copied > inode->i_size)
1414 i_size_write(inode, pos + copied); 1414 i_size_write(inode, pos + copied);
1415 /* 1415 /*
1416 * There may be allocated blocks outside of i_size because 1416 * There may be allocated blocks outside of i_size because
1417 * we failed to copy some data. Prepare for truncate. 1417 * we failed to copy some data. Prepare for truncate.
1418 */ 1418 */
1419 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1419 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1420 ext3_orphan_add(handle, inode); 1420 ext3_orphan_add(handle, inode);
1421 ext3_set_inode_state(inode, EXT3_STATE_JDATA); 1421 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1422 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1422 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1423 EXT3_I(inode)->i_disksize = inode->i_size; 1423 EXT3_I(inode)->i_disksize = inode->i_size;
1424 ret2 = ext3_mark_inode_dirty(handle, inode); 1424 ret2 = ext3_mark_inode_dirty(handle, inode);
1425 if (!ret) 1425 if (!ret)
1426 ret = ret2; 1426 ret = ret2;
1427 } 1427 }
1428 1428
1429 ret2 = ext3_journal_stop(handle); 1429 ret2 = ext3_journal_stop(handle);
1430 if (!ret) 1430 if (!ret)
1431 ret = ret2; 1431 ret = ret2;
1432 unlock_page(page); 1432 unlock_page(page);
1433 page_cache_release(page); 1433 page_cache_release(page);
1434 1434
1435 if (pos + len > inode->i_size) 1435 if (pos + len > inode->i_size)
1436 ext3_truncate_failed_write(inode); 1436 ext3_truncate_failed_write(inode);
1437 return ret ? ret : copied; 1437 return ret ? ret : copied;
1438 } 1438 }
1439 1439
1440 /* 1440 /*
1441 * bmap() is special. It gets used by applications such as lilo and by 1441 * bmap() is special. It gets used by applications such as lilo and by
1442 * the swapper to find the on-disk block of a specific piece of data. 1442 * the swapper to find the on-disk block of a specific piece of data.
1443 * 1443 *
1444 * Naturally, this is dangerous if the block concerned is still in the 1444 * Naturally, this is dangerous if the block concerned is still in the
1445 * journal. If somebody makes a swapfile on an ext3 data-journaling 1445 * journal. If somebody makes a swapfile on an ext3 data-journaling
1446 * filesystem and enables swap, then they may get a nasty shock when the 1446 * filesystem and enables swap, then they may get a nasty shock when the
1447 * data getting swapped to that swapfile suddenly gets overwritten by 1447 * data getting swapped to that swapfile suddenly gets overwritten by
1448 * the original zeros written out previously to the journal and 1448 * the original zeros written out previously to the journal and
1449 * awaiting writeback in the kernel's buffer cache. 1449 * awaiting writeback in the kernel's buffer cache.
1450 * 1450 *
1451 * So, if we see any bmap calls here on a modified, data-journaled file, 1451 * So, if we see any bmap calls here on a modified, data-journaled file,
1452 * take extra steps to flush any blocks which might be in the cache. 1452 * take extra steps to flush any blocks which might be in the cache.
1453 */ 1453 */
1454 static sector_t ext3_bmap(struct address_space *mapping, sector_t block) 1454 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1455 { 1455 {
1456 struct inode *inode = mapping->host; 1456 struct inode *inode = mapping->host;
1457 journal_t *journal; 1457 journal_t *journal;
1458 int err; 1458 int err;
1459 1459
1460 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { 1460 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
1461 /* 1461 /*
1462 * This is a REALLY heavyweight approach, but the use of 1462 * This is a REALLY heavyweight approach, but the use of
1463 * bmap on dirty files is expected to be extremely rare: 1463 * bmap on dirty files is expected to be extremely rare:
1464 * only if we run lilo or swapon on a freshly made file 1464 * only if we run lilo or swapon on a freshly made file
1465 * do we expect this to happen. 1465 * do we expect this to happen.
1466 * 1466 *
1467 * (bmap requires CAP_SYS_RAWIO so this does not 1467 * (bmap requires CAP_SYS_RAWIO so this does not
1468 * represent an unprivileged user DOS attack --- we'd be 1468 * represent an unprivileged user DOS attack --- we'd be
1469 * in trouble if mortal users could trigger this path at 1469 * in trouble if mortal users could trigger this path at
1470 * will.) 1470 * will.)
1471 * 1471 *
1472 * NB. EXT3_STATE_JDATA is not set on files other than 1472 * NB. EXT3_STATE_JDATA is not set on files other than
1473 * regular files. If somebody wants to bmap a directory 1473 * regular files. If somebody wants to bmap a directory
1474 * or symlink and gets confused because the buffer 1474 * or symlink and gets confused because the buffer
1475 * hasn't yet been flushed to disk, they deserve 1475 * hasn't yet been flushed to disk, they deserve
1476 * everything they get. 1476 * everything they get.
1477 */ 1477 */
1478 1478
1479 ext3_clear_inode_state(inode, EXT3_STATE_JDATA); 1479 ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
1480 journal = EXT3_JOURNAL(inode); 1480 journal = EXT3_JOURNAL(inode);
1481 journal_lock_updates(journal); 1481 journal_lock_updates(journal);
1482 err = journal_flush(journal); 1482 err = journal_flush(journal);
1483 journal_unlock_updates(journal); 1483 journal_unlock_updates(journal);
1484 1484
1485 if (err) 1485 if (err)
1486 return 0; 1486 return 0;
1487 } 1487 }
1488 1488
1489 return generic_block_bmap(mapping,block,ext3_get_block); 1489 return generic_block_bmap(mapping,block,ext3_get_block);
1490 } 1490 }
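For reference, lilo-style users reach this path through the FIBMAP ioctl. A hypothetical userspace sketch (it needs CAP_SYS_RAWIO, as the comment above notes):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int fd, block = 0;	/* logical block 0 of the file */

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FIBMAP, &block) < 0) {	/* ends up in ->bmap() */
		perror("FIBMAP");
		return 1;
	}
	printf("logical block 0 is on-disk block %d\n", block);
	return 0;
}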
1491 1491
1492 static int bget_one(handle_t *handle, struct buffer_head *bh) 1492 static int bget_one(handle_t *handle, struct buffer_head *bh)
1493 { 1493 {
1494 get_bh(bh); 1494 get_bh(bh);
1495 return 0; 1495 return 0;
1496 } 1496 }
1497 1497
1498 static int bput_one(handle_t *handle, struct buffer_head *bh) 1498 static int bput_one(handle_t *handle, struct buffer_head *bh)
1499 { 1499 {
1500 put_bh(bh); 1500 put_bh(bh);
1501 return 0; 1501 return 0;
1502 } 1502 }
1503 1503
1504 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) 1504 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1505 { 1505 {
1506 return !buffer_mapped(bh); 1506 return !buffer_mapped(bh);
1507 } 1507 }
1508 1508
1509 /* 1509 /*
1510 * Note that we always start a transaction even if we're not journalling 1510 * Note that we always start a transaction even if we're not journalling
1511 * data. This is to preserve ordering: any hole instantiation within 1511 * data. This is to preserve ordering: any hole instantiation within
1512 * __block_write_full_page -> ext3_get_block() should be journalled 1512 * __block_write_full_page -> ext3_get_block() should be journalled
1513 * along with the data so we don't crash and then get metadata which 1513 * along with the data so we don't crash and then get metadata which
1514 * refers to old data. 1514 * refers to old data.
1515 * 1515 *
1516 * In all journalling modes block_write_full_page() will start the I/O. 1516 * In all journalling modes block_write_full_page() will start the I/O.
1517 * 1517 *
1518 * Problem: 1518 * Problem:
1519 * 1519 *
1520 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 1520 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1521 * ext3_writepage() 1521 * ext3_writepage()
1522 * 1522 *
1523 * Similar for: 1523 * Similar for:
1524 * 1524 *
1525 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... 1525 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1526 * 1526 *
1527 * Same applies to ext3_get_block(). We will deadlock on various things like 1527 * Same applies to ext3_get_block(). We will deadlock on various things like
1528 * lock_journal and i_truncate_mutex. 1528 * lock_journal and i_truncate_mutex.
1529 * 1529 *
1530 * Setting PF_MEMALLOC here doesn't work - too many internal memory 1530 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1531 * allocations fail. 1531 * allocations fail.
1532 * 1532 *
1533 * 16May01: If we're reentered then journal_current_handle() will be 1533 * 16May01: If we're reentered then journal_current_handle() will be
1534 * non-zero. We simply *return*. 1534 * non-zero. We simply *return*.
1535 * 1535 *
1536 * 1 July 2001: @@@ FIXME: 1536 * 1 July 2001: @@@ FIXME:
1537 * In journalled data mode, a data buffer may be metadata against the 1537 * In journalled data mode, a data buffer may be metadata against the
1538 * current transaction. But the same file is part of a shared mapping 1538 * current transaction. But the same file is part of a shared mapping
1539 * and someone does a writepage() on it. 1539 * and someone does a writepage() on it.
1540 * 1540 *
1541 * We will move the buffer onto the async_data list, but *after* it has 1541 * We will move the buffer onto the async_data list, but *after* it has
1542 * been dirtied. So there's a small window where we have dirty data on 1542 * been dirtied. So there's a small window where we have dirty data on
1543 * BJ_Metadata. 1543 * BJ_Metadata.
1544 * 1544 *
1545 * Note that this only applies to the last partial page in the file. The 1545 * Note that this only applies to the last partial page in the file. The
1546 * bit which block_write_full_page() uses prepare/commit for. (That's 1546 * bit which block_write_full_page() uses prepare/commit for. (That's
1547 * broken code anyway: it's wrong for msync()). 1547 * broken code anyway: it's wrong for msync()).
1548 * 1548 *
1549 * It's a rare case: affects the final partial page, for journalled data 1549 * It's a rare case: affects the final partial page, for journalled data
1550 * where the file is subject to both write() and writepage() in the same 1550 * where the file is subject to both write() and writepage() in the same
1551 * transaction. To fix it we'll need a custom block_write_full_page(). 1551 * transaction. To fix it we'll need a custom block_write_full_page().
1552 * We'll probably need that anyway for journalling writepage() output. 1552 * We'll probably need that anyway for journalling writepage() output.
1553 * 1553 *
1554 * We don't honour synchronous mounts for writepage(). That would be 1554 * We don't honour synchronous mounts for writepage(). That would be
1555 * disastrous. Any write() or metadata operation will sync the fs for 1555 * disastrous. Any write() or metadata operation will sync the fs for
1556 * us. 1556 * us.
1557 * 1557 *
1558 * AKPM2: if all the page's buffers are mapped to disk and !data=journal, 1558 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1559 * we don't need to open a transaction here. 1559 * we don't need to open a transaction here.
1560 */ 1560 */
1561 static int ext3_ordered_writepage(struct page *page, 1561 static int ext3_ordered_writepage(struct page *page,
1562 struct writeback_control *wbc) 1562 struct writeback_control *wbc)
1563 { 1563 {
1564 struct inode *inode = page->mapping->host; 1564 struct inode *inode = page->mapping->host;
1565 struct buffer_head *page_bufs; 1565 struct buffer_head *page_bufs;
1566 handle_t *handle = NULL; 1566 handle_t *handle = NULL;
1567 int ret = 0; 1567 int ret = 0;
1568 int err; 1568 int err;
1569 1569
1570 J_ASSERT(PageLocked(page)); 1570 J_ASSERT(PageLocked(page));
1571 WARN_ON_ONCE(IS_RDONLY(inode)); 1571 WARN_ON_ONCE(IS_RDONLY(inode));
1572 1572
1573 /* 1573 /*
1574 * We give up here if we're reentered, because it might be for a 1574 * We give up here if we're reentered, because it might be for a
1575 * different filesystem. 1575 * different filesystem.
1576 */ 1576 */
1577 if (ext3_journal_current_handle()) 1577 if (ext3_journal_current_handle())
1578 goto out_fail; 1578 goto out_fail;
1579 1579
1580 if (!page_has_buffers(page)) { 1580 if (!page_has_buffers(page)) {
1581 create_empty_buffers(page, inode->i_sb->s_blocksize, 1581 create_empty_buffers(page, inode->i_sb->s_blocksize,
1582 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1582 (1 << BH_Dirty)|(1 << BH_Uptodate));
1583 page_bufs = page_buffers(page); 1583 page_bufs = page_buffers(page);
1584 } else { 1584 } else {
1585 page_bufs = page_buffers(page); 1585 page_bufs = page_buffers(page);
1586 if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, 1586 if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
1587 NULL, buffer_unmapped)) { 1587 NULL, buffer_unmapped)) {
1588 /* Provide NULL get_block() to catch bugs if buffers 1588 /* Provide NULL get_block() to catch bugs if buffers
1589 * weren't really mapped */ 1589 * weren't really mapped */
1590 return block_write_full_page(page, NULL, wbc); 1590 return block_write_full_page(page, NULL, wbc);
1591 } 1591 }
1592 } 1592 }
1593 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1593 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1594 1594
1595 if (IS_ERR(handle)) { 1595 if (IS_ERR(handle)) {
1596 ret = PTR_ERR(handle); 1596 ret = PTR_ERR(handle);
1597 goto out_fail; 1597 goto out_fail;
1598 } 1598 }
1599 1599
1600 walk_page_buffers(handle, page_bufs, 0, 1600 walk_page_buffers(handle, page_bufs, 0,
1601 PAGE_CACHE_SIZE, NULL, bget_one); 1601 PAGE_CACHE_SIZE, NULL, bget_one);
1602 1602
1603 ret = block_write_full_page(page, ext3_get_block, wbc); 1603 ret = block_write_full_page(page, ext3_get_block, wbc);
1604 1604
1605 /* 1605 /*
1606 * The page can become unlocked at any point now, and 1606 * The page can become unlocked at any point now, and
1607 * truncate can then come in and change things. So we 1607 * truncate can then come in and change things. So we
1608 * can't touch *page from now on. But *page_bufs is 1608 * can't touch *page from now on. But *page_bufs is
1609 * safe due to elevated refcount. 1609 * safe due to elevated refcount.
1610 */ 1610 */
1611 1611
1612 /* 1612 /*
1613 * And attach them to the current transaction. But only if 1613 * And attach them to the current transaction. But only if
1614 * block_write_full_page() succeeded. Otherwise they are unmapped, 1614 * block_write_full_page() succeeded. Otherwise they are unmapped,
1615 * and generally junk. 1615 * and generally junk.
1616 */ 1616 */
1617 if (ret == 0) { 1617 if (ret == 0) {
1618 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, 1618 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1619 NULL, journal_dirty_data_fn); 1619 NULL, journal_dirty_data_fn);
1620 if (!ret) 1620 if (!ret)
1621 ret = err; 1621 ret = err;
1622 } 1622 }
1623 walk_page_buffers(handle, page_bufs, 0, 1623 walk_page_buffers(handle, page_bufs, 0,
1624 PAGE_CACHE_SIZE, NULL, bput_one); 1624 PAGE_CACHE_SIZE, NULL, bput_one);
1625 err = ext3_journal_stop(handle); 1625 err = ext3_journal_stop(handle);
1626 if (!ret) 1626 if (!ret)
1627 ret = err; 1627 ret = err;
1628 return ret; 1628 return ret;
1629 1629
1630 out_fail: 1630 out_fail:
1631 redirty_page_for_writepage(wbc, page); 1631 redirty_page_for_writepage(wbc, page);
1632 unlock_page(page); 1632 unlock_page(page);
1633 return ret; 1633 return ret;
1634 } 1634 }
1635 1635
1636 static int ext3_writeback_writepage(struct page *page, 1636 static int ext3_writeback_writepage(struct page *page,
1637 struct writeback_control *wbc) 1637 struct writeback_control *wbc)
1638 { 1638 {
1639 struct inode *inode = page->mapping->host; 1639 struct inode *inode = page->mapping->host;
1640 handle_t *handle = NULL; 1640 handle_t *handle = NULL;
1641 int ret = 0; 1641 int ret = 0;
1642 int err; 1642 int err;
1643 1643
1644 J_ASSERT(PageLocked(page)); 1644 J_ASSERT(PageLocked(page));
1645 WARN_ON_ONCE(IS_RDONLY(inode)); 1645 WARN_ON_ONCE(IS_RDONLY(inode));
1646 1646
1647 if (ext3_journal_current_handle()) 1647 if (ext3_journal_current_handle())
1648 goto out_fail; 1648 goto out_fail;
1649 1649
1650 if (page_has_buffers(page)) { 1650 if (page_has_buffers(page)) {
1651 if (!walk_page_buffers(NULL, page_buffers(page), 0, 1651 if (!walk_page_buffers(NULL, page_buffers(page), 0,
1652 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { 1652 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
1653 /* Provide NULL get_block() to catch bugs if buffers 1653 /* Provide NULL get_block() to catch bugs if buffers
1654 * weren't really mapped */ 1654 * weren't really mapped */
1655 return block_write_full_page(page, NULL, wbc); 1655 return block_write_full_page(page, NULL, wbc);
1656 } 1656 }
1657 } 1657 }
1658 1658
1659 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1659 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1660 if (IS_ERR(handle)) { 1660 if (IS_ERR(handle)) {
1661 ret = PTR_ERR(handle); 1661 ret = PTR_ERR(handle);
1662 goto out_fail; 1662 goto out_fail;
1663 } 1663 }
1664 1664
1665 ret = block_write_full_page(page, ext3_get_block, wbc); 1665 ret = block_write_full_page(page, ext3_get_block, wbc);
1666 1666
1667 err = ext3_journal_stop(handle); 1667 err = ext3_journal_stop(handle);
1668 if (!ret) 1668 if (!ret)
1669 ret = err; 1669 ret = err;
1670 return ret; 1670 return ret;
1671 1671
1672 out_fail: 1672 out_fail:
1673 redirty_page_for_writepage(wbc, page); 1673 redirty_page_for_writepage(wbc, page);
1674 unlock_page(page); 1674 unlock_page(page);
1675 return ret; 1675 return ret;
1676 } 1676 }
1677 1677
1678 static int ext3_journalled_writepage(struct page *page, 1678 static int ext3_journalled_writepage(struct page *page,
1679 struct writeback_control *wbc) 1679 struct writeback_control *wbc)
1680 { 1680 {
1681 struct inode *inode = page->mapping->host; 1681 struct inode *inode = page->mapping->host;
1682 handle_t *handle = NULL; 1682 handle_t *handle = NULL;
1683 int ret = 0; 1683 int ret = 0;
1684 int err; 1684 int err;
1685 1685
1686 J_ASSERT(PageLocked(page)); 1686 J_ASSERT(PageLocked(page));
1687 WARN_ON_ONCE(IS_RDONLY(inode)); 1687 WARN_ON_ONCE(IS_RDONLY(inode));
1688 1688
1689 if (ext3_journal_current_handle()) 1689 if (ext3_journal_current_handle())
1690 goto no_write; 1690 goto no_write;
1691 1691
1692 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1692 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1693 if (IS_ERR(handle)) { 1693 if (IS_ERR(handle)) {
1694 ret = PTR_ERR(handle); 1694 ret = PTR_ERR(handle);
1695 goto no_write; 1695 goto no_write;
1696 } 1696 }
1697 1697
1698 if (!page_has_buffers(page) || PageChecked(page)) { 1698 if (!page_has_buffers(page) || PageChecked(page)) {
1699 /* 1699 /*
1700 * It's mmapped pagecache. Add buffers and journal it. There 1700 * It's mmapped pagecache. Add buffers and journal it. There
1701 * doesn't seem much point in redirtying the page here. 1701 * doesn't seem much point in redirtying the page here.
1702 */ 1702 */
1703 ClearPageChecked(page); 1703 ClearPageChecked(page);
1704 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, 1704 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1705 ext3_get_block); 1705 ext3_get_block);
1706 if (ret != 0) { 1706 if (ret != 0) {
1707 ext3_journal_stop(handle); 1707 ext3_journal_stop(handle);
1708 goto out_unlock; 1708 goto out_unlock;
1709 } 1709 }
1710 ret = walk_page_buffers(handle, page_buffers(page), 0, 1710 ret = walk_page_buffers(handle, page_buffers(page), 0,
1711 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 1711 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1712 1712
1713 err = walk_page_buffers(handle, page_buffers(page), 0, 1713 err = walk_page_buffers(handle, page_buffers(page), 0,
1714 PAGE_CACHE_SIZE, NULL, write_end_fn); 1714 PAGE_CACHE_SIZE, NULL, write_end_fn);
1715 if (ret == 0) 1715 if (ret == 0)
1716 ret = err; 1716 ret = err;
1717 ext3_set_inode_state(inode, EXT3_STATE_JDATA); 1717 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1718 unlock_page(page); 1718 unlock_page(page);
1719 } else { 1719 } else {
1720 /* 1720 /*
1721 * It may be a page full of checkpoint-mode buffers. We don't 1721 * It may be a page full of checkpoint-mode buffers. We don't
1722 * really know unless we go poke around in the buffer_heads. 1722 * really know unless we go poke around in the buffer_heads.
1723 * But block_write_full_page will do the right thing. 1723 * But block_write_full_page will do the right thing.
1724 */ 1724 */
1725 ret = block_write_full_page(page, ext3_get_block, wbc); 1725 ret = block_write_full_page(page, ext3_get_block, wbc);
1726 } 1726 }
1727 err = ext3_journal_stop(handle); 1727 err = ext3_journal_stop(handle);
1728 if (!ret) 1728 if (!ret)
1729 ret = err; 1729 ret = err;
1730 out: 1730 out:
1731 return ret; 1731 return ret;
1732 1732
1733 no_write: 1733 no_write:
1734 redirty_page_for_writepage(wbc, page); 1734 redirty_page_for_writepage(wbc, page);
1735 out_unlock: 1735 out_unlock:
1736 unlock_page(page); 1736 unlock_page(page);
1737 goto out; 1737 goto out;
1738 } 1738 }
1739 1739
1740 static int ext3_readpage(struct file *file, struct page *page) 1740 static int ext3_readpage(struct file *file, struct page *page)
1741 { 1741 {
1742 return mpage_readpage(page, ext3_get_block); 1742 return mpage_readpage(page, ext3_get_block);
1743 } 1743 }
1744 1744
1745 static int 1745 static int
1746 ext3_readpages(struct file *file, struct address_space *mapping, 1746 ext3_readpages(struct file *file, struct address_space *mapping,
1747 struct list_head *pages, unsigned nr_pages) 1747 struct list_head *pages, unsigned nr_pages)
1748 { 1748 {
1749 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); 1749 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1750 } 1750 }
1751 1751
1752 static void ext3_invalidatepage(struct page *page, unsigned long offset) 1752 static void ext3_invalidatepage(struct page *page, unsigned long offset)
1753 { 1753 {
1754 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1754 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1755 1755
1756 /* 1756 /*
1757 * If it's a full truncate we just forget about the pending dirtying 1757 * If it's a full truncate we just forget about the pending dirtying
1758 */ 1758 */
1759 if (offset == 0) 1759 if (offset == 0)
1760 ClearPageChecked(page); 1760 ClearPageChecked(page);
1761 1761
1762 journal_invalidatepage(journal, page, offset); 1762 journal_invalidatepage(journal, page, offset);
1763 } 1763 }
1764 1764
1765 static int ext3_releasepage(struct page *page, gfp_t wait) 1765 static int ext3_releasepage(struct page *page, gfp_t wait)
1766 { 1766 {
1767 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1767 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1768 1768
1769 WARN_ON(PageChecked(page)); 1769 WARN_ON(PageChecked(page));
1770 if (!page_has_buffers(page)) 1770 if (!page_has_buffers(page))
1771 return 0; 1771 return 0;
1772 return journal_try_to_free_buffers(journal, page, wait); 1772 return journal_try_to_free_buffers(journal, page, wait);
1773 } 1773 }
1774 1774
1775 /* 1775 /*
1776 * If the O_DIRECT write will extend the file then add this inode to the 1776 * If the O_DIRECT write will extend the file then add this inode to the
1777 * orphan list. So recovery will truncate it back to the original size 1777 * orphan list. So recovery will truncate it back to the original size
1778 * if the machine crashes during the write. 1778 * if the machine crashes during the write.
1779 * 1779 *
1780 * If the O_DIRECT write is instantiating holes inside i_size and the machine 1780 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1781 * crashes then stale disk data _may_ be exposed inside the file. But current 1781 * crashes then stale disk data _may_ be exposed inside the file. But current
1782 * VFS code falls back into buffered path in that case so we are safe. 1782 * VFS code falls back into buffered path in that case so we are safe.
1783 */ 1783 */
1784 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, 1784 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1785 const struct iovec *iov, loff_t offset, 1785 const struct iovec *iov, loff_t offset,
1786 unsigned long nr_segs) 1786 unsigned long nr_segs)
1787 { 1787 {
1788 struct file *file = iocb->ki_filp; 1788 struct file *file = iocb->ki_filp;
1789 struct inode *inode = file->f_mapping->host; 1789 struct inode *inode = file->f_mapping->host;
1790 struct ext3_inode_info *ei = EXT3_I(inode); 1790 struct ext3_inode_info *ei = EXT3_I(inode);
1791 handle_t *handle; 1791 handle_t *handle;
1792 ssize_t ret; 1792 ssize_t ret;
1793 int orphan = 0; 1793 int orphan = 0;
1794 size_t count = iov_length(iov, nr_segs); 1794 size_t count = iov_length(iov, nr_segs);
1795 int retries = 0; 1795 int retries = 0;
1796 1796
1797 if (rw == WRITE) { 1797 if (rw == WRITE) {
1798 loff_t final_size = offset + count; 1798 loff_t final_size = offset + count;
1799 1799
1800 if (final_size > inode->i_size) { 1800 if (final_size > inode->i_size) {
1801 /* Credits for sb + inode write */ 1801 /* Credits for sb + inode write */
1802 handle = ext3_journal_start(inode, 2); 1802 handle = ext3_journal_start(inode, 2);
1803 if (IS_ERR(handle)) { 1803 if (IS_ERR(handle)) {
1804 ret = PTR_ERR(handle); 1804 ret = PTR_ERR(handle);
1805 goto out; 1805 goto out;
1806 } 1806 }
1807 ret = ext3_orphan_add(handle, inode); 1807 ret = ext3_orphan_add(handle, inode);
1808 if (ret) { 1808 if (ret) {
1809 ext3_journal_stop(handle); 1809 ext3_journal_stop(handle);
1810 goto out; 1810 goto out;
1811 } 1811 }
1812 orphan = 1; 1812 orphan = 1;
1813 ei->i_disksize = inode->i_size; 1813 ei->i_disksize = inode->i_size;
1814 ext3_journal_stop(handle); 1814 ext3_journal_stop(handle);
1815 } 1815 }
1816 } 1816 }
1817 1817
1818 retry: 1818 retry:
1819 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1819 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1820 offset, nr_segs, 1820 offset, nr_segs,
1821 ext3_get_block, NULL); 1821 ext3_get_block, NULL);
1822 /* 1822 /*
1823 * In case of error extending write may have instantiated a few 1823 * In case of error extending write may have instantiated a few
1824 * blocks outside i_size. Trim these off again. 1824 * blocks outside i_size. Trim these off again.
1825 */ 1825 */
1826 if (unlikely((rw & WRITE) && ret < 0)) { 1826 if (unlikely((rw & WRITE) && ret < 0)) {
1827 loff_t isize = i_size_read(inode); 1827 loff_t isize = i_size_read(inode);
1828 loff_t end = offset + iov_length(iov, nr_segs); 1828 loff_t end = offset + iov_length(iov, nr_segs);
1829 1829
1830 if (end > isize) 1830 if (end > isize)
1831 vmtruncate(inode, isize); 1831 vmtruncate(inode, isize);
1832 } 1832 }
1833 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1833 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1834 goto retry; 1834 goto retry;
1835 1835
1836 if (orphan) { 1836 if (orphan) {
1837 int err; 1837 int err;
1838 1838
1839 /* Credits for sb + inode write */ 1839 /* Credits for sb + inode write */
1840 handle = ext3_journal_start(inode, 2); 1840 handle = ext3_journal_start(inode, 2);
1841 if (IS_ERR(handle)) { 1841 if (IS_ERR(handle)) {
1842 /* This is really bad luck. We've written the data 1842 /* This is really bad luck. We've written the data
1843 * but cannot extend i_size. Truncate allocated blocks 1843 * but cannot extend i_size. Truncate allocated blocks
1844 * and pretend the write failed... */ 1844 * and pretend the write failed... */
1845 ext3_truncate(inode); 1845 ext3_truncate(inode);
1846 ret = PTR_ERR(handle); 1846 ret = PTR_ERR(handle);
1847 goto out; 1847 goto out;
1848 } 1848 }
1849 if (inode->i_nlink) 1849 if (inode->i_nlink)
1850 ext3_orphan_del(handle, inode); 1850 ext3_orphan_del(handle, inode);
1851 if (ret > 0) { 1851 if (ret > 0) {
1852 loff_t end = offset + ret; 1852 loff_t end = offset + ret;
1853 if (end > inode->i_size) { 1853 if (end > inode->i_size) {
1854 ei->i_disksize = end; 1854 ei->i_disksize = end;
1855 i_size_write(inode, end); 1855 i_size_write(inode, end);
1856 /* 1856 /*
1857 * We're going to return a positive `ret' 1857 * We're going to return a positive `ret'
1858 * here due to non-zero-length I/O, so there's 1858 * here due to non-zero-length I/O, so there's
1859 * no way of reporting error returns from 1859 * no way of reporting error returns from
1860 * ext3_mark_inode_dirty() to userspace. So 1860 * ext3_mark_inode_dirty() to userspace. So
1861 * ignore it. 1861 * ignore it.
1862 */ 1862 */
1863 ext3_mark_inode_dirty(handle, inode); 1863 ext3_mark_inode_dirty(handle, inode);
1864 } 1864 }
1865 } 1865 }
1866 err = ext3_journal_stop(handle); 1866 err = ext3_journal_stop(handle);
1867 if (ret == 0) 1867 if (ret == 0)
1868 ret = err; 1868 ret = err;
1869 } 1869 }
1870 out: 1870 out:
1871 return ret; 1871 return ret;
1872 } 1872 }
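
A minimal userspace sketch of the byte accounting above, mirroring what iov_length() computes over an iovec array (buffer sizes and the 4096 offset are illustrative, not from this commit):

	#include <stdio.h>
	#include <sys/uio.h>

	/* Sum the segment lengths of a vectored write, as iov_length() does. */
	static size_t total_len(const struct iovec *iov, unsigned long nr_segs)
	{
		size_t count = 0;
		unsigned long seg;

		for (seg = 0; seg < nr_segs; seg++)
			count += iov[seg].iov_len;
		return count;
	}

	int main(void)
	{
		char a[100], b[412];
		struct iovec iov[2] = {
			{ .iov_base = a, .iov_len = sizeof(a) },
			{ .iov_base = b, .iov_len = sizeof(b) },
		};

		/*
		 * An O_DIRECT write of these segments at offset 4096 gives
		 * final_size = 4096 + 512; if that exceeds i_size, the inode
		 * is orphan-protected before the I/O is issued.
		 */
		printf("count = %zu\n", total_len(iov, 2));
		return 0;
	}

This prints count = 512; the orphan-list entry only matters for the extending case, which is why it is added and removed around the I/O rather than unconditionally.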
1873 1873
1874 /* 1874 /*
1875 * Pages can be marked dirty completely asynchronously from ext3's journalling 1875 * Pages can be marked dirty completely asynchronously from ext3's journalling
1876 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 1876 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1877 * much here because ->set_page_dirty is called under VFS locks. The page is 1877 * much here because ->set_page_dirty is called under VFS locks. The page is
1878 * not necessarily locked. 1878 * not necessarily locked.
1879 * 1879 *
1880 * We cannot just dirty the page and leave attached buffers clean, because the 1880 * We cannot just dirty the page and leave attached buffers clean, because the
1881 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 1881 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1882 * or jbddirty because all the journalling code will explode. 1882 * or jbddirty because all the journalling code will explode.
1883 * 1883 *
1884 * So what we do is to mark the page "pending dirty" and next time writepage 1884 * So what we do is to mark the page "pending dirty" and next time writepage
1885 * is called, propagate that into the buffers appropriately. 1885 * is called, propagate that into the buffers appropriately.
1886 */ 1886 */
1887 static int ext3_journalled_set_page_dirty(struct page *page) 1887 static int ext3_journalled_set_page_dirty(struct page *page)
1888 { 1888 {
1889 SetPageChecked(page); 1889 SetPageChecked(page);
1890 return __set_page_dirty_nobuffers(page); 1890 return __set_page_dirty_nobuffers(page);
1891 } 1891 }
1892 1892
1893 static const struct address_space_operations ext3_ordered_aops = { 1893 static const struct address_space_operations ext3_ordered_aops = {
1894 .readpage = ext3_readpage, 1894 .readpage = ext3_readpage,
1895 .readpages = ext3_readpages, 1895 .readpages = ext3_readpages,
1896 .writepage = ext3_ordered_writepage, 1896 .writepage = ext3_ordered_writepage,
1897 .write_begin = ext3_write_begin, 1897 .write_begin = ext3_write_begin,
1898 .write_end = ext3_ordered_write_end, 1898 .write_end = ext3_ordered_write_end,
1899 .bmap = ext3_bmap, 1899 .bmap = ext3_bmap,
1900 .invalidatepage = ext3_invalidatepage, 1900 .invalidatepage = ext3_invalidatepage,
1901 .releasepage = ext3_releasepage, 1901 .releasepage = ext3_releasepage,
1902 .direct_IO = ext3_direct_IO, 1902 .direct_IO = ext3_direct_IO,
1903 .migratepage = buffer_migrate_page, 1903 .migratepage = buffer_migrate_page,
1904 .is_partially_uptodate = block_is_partially_uptodate, 1904 .is_partially_uptodate = block_is_partially_uptodate,
1905 .error_remove_page = generic_error_remove_page, 1905 .error_remove_page = generic_error_remove_page,
1906 }; 1906 };
1907 1907
1908 static const struct address_space_operations ext3_writeback_aops = { 1908 static const struct address_space_operations ext3_writeback_aops = {
1909 .readpage = ext3_readpage, 1909 .readpage = ext3_readpage,
1910 .readpages = ext3_readpages, 1910 .readpages = ext3_readpages,
1911 .writepage = ext3_writeback_writepage, 1911 .writepage = ext3_writeback_writepage,
1912 .write_begin = ext3_write_begin, 1912 .write_begin = ext3_write_begin,
1913 .write_end = ext3_writeback_write_end, 1913 .write_end = ext3_writeback_write_end,
1914 .bmap = ext3_bmap, 1914 .bmap = ext3_bmap,
1915 .invalidatepage = ext3_invalidatepage, 1915 .invalidatepage = ext3_invalidatepage,
1916 .releasepage = ext3_releasepage, 1916 .releasepage = ext3_releasepage,
1917 .direct_IO = ext3_direct_IO, 1917 .direct_IO = ext3_direct_IO,
1918 .migratepage = buffer_migrate_page, 1918 .migratepage = buffer_migrate_page,
1919 .is_partially_uptodate = block_is_partially_uptodate, 1919 .is_partially_uptodate = block_is_partially_uptodate,
1920 .error_remove_page = generic_error_remove_page, 1920 .error_remove_page = generic_error_remove_page,
1921 }; 1921 };
1922 1922
1923 static const struct address_space_operations ext3_journalled_aops = { 1923 static const struct address_space_operations ext3_journalled_aops = {
1924 .readpage = ext3_readpage, 1924 .readpage = ext3_readpage,
1925 .readpages = ext3_readpages, 1925 .readpages = ext3_readpages,
1926 .writepage = ext3_journalled_writepage, 1926 .writepage = ext3_journalled_writepage,
1927 .write_begin = ext3_write_begin, 1927 .write_begin = ext3_write_begin,
1928 .write_end = ext3_journalled_write_end, 1928 .write_end = ext3_journalled_write_end,
1929 .set_page_dirty = ext3_journalled_set_page_dirty, 1929 .set_page_dirty = ext3_journalled_set_page_dirty,
1930 .bmap = ext3_bmap, 1930 .bmap = ext3_bmap,
1931 .invalidatepage = ext3_invalidatepage, 1931 .invalidatepage = ext3_invalidatepage,
1932 .releasepage = ext3_releasepage, 1932 .releasepage = ext3_releasepage,
1933 .is_partially_uptodate = block_is_partially_uptodate, 1933 .is_partially_uptodate = block_is_partially_uptodate,
1934 .error_remove_page = generic_error_remove_page, 1934 .error_remove_page = generic_error_remove_page,
1935 }; 1935 };
1936 1936
1937 void ext3_set_aops(struct inode *inode) 1937 void ext3_set_aops(struct inode *inode)
1938 { 1938 {
1939 if (ext3_should_order_data(inode)) 1939 if (ext3_should_order_data(inode))
1940 inode->i_mapping->a_ops = &ext3_ordered_aops; 1940 inode->i_mapping->a_ops = &ext3_ordered_aops;
1941 else if (ext3_should_writeback_data(inode)) 1941 else if (ext3_should_writeback_data(inode))
1942 inode->i_mapping->a_ops = &ext3_writeback_aops; 1942 inode->i_mapping->a_ops = &ext3_writeback_aops;
1943 else 1943 else
1944 inode->i_mapping->a_ops = &ext3_journalled_aops; 1944 inode->i_mapping->a_ops = &ext3_journalled_aops;
1945 } 1945 }
1946 1946
1947 /* 1947 /*
1948 * ext3_block_truncate_page() zeroes out a mapping from file offset `from' 1948 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1949 * up to the end of the block which corresponds to `from'. 1949 * up to the end of the block which corresponds to `from'.
1950 * This is required during truncate. We need to physically zero the tail end 1950 * This is required during truncate. We need to physically zero the tail end
1951 * of that block so it doesn't yield old data if the file is later grown. 1951 * of that block so it doesn't yield old data if the file is later grown.
1952 */ 1952 */
1953 static int ext3_block_truncate_page(handle_t *handle, struct page *page, 1953 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1954 struct address_space *mapping, loff_t from) 1954 struct address_space *mapping, loff_t from)
1955 { 1955 {
1956 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; 1956 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1957 unsigned offset = from & (PAGE_CACHE_SIZE-1); 1957 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1958 unsigned blocksize, iblock, length, pos; 1958 unsigned blocksize, iblock, length, pos;
1959 struct inode *inode = mapping->host; 1959 struct inode *inode = mapping->host;
1960 struct buffer_head *bh; 1960 struct buffer_head *bh;
1961 int err = 0; 1961 int err = 0;
1962 1962
1963 blocksize = inode->i_sb->s_blocksize; 1963 blocksize = inode->i_sb->s_blocksize;
1964 length = blocksize - (offset & (blocksize - 1)); 1964 length = blocksize - (offset & (blocksize - 1));
1965 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1965 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1966 1966
1967 if (!page_has_buffers(page)) 1967 if (!page_has_buffers(page))
1968 create_empty_buffers(page, blocksize, 0); 1968 create_empty_buffers(page, blocksize, 0);
1969 1969
1970 /* Find the buffer that contains "offset" */ 1970 /* Find the buffer that contains "offset" */
1971 bh = page_buffers(page); 1971 bh = page_buffers(page);
1972 pos = blocksize; 1972 pos = blocksize;
1973 while (offset >= pos) { 1973 while (offset >= pos) {
1974 bh = bh->b_this_page; 1974 bh = bh->b_this_page;
1975 iblock++; 1975 iblock++;
1976 pos += blocksize; 1976 pos += blocksize;
1977 } 1977 }
1978 1978
1979 err = 0; 1979 err = 0;
1980 if (buffer_freed(bh)) { 1980 if (buffer_freed(bh)) {
1981 BUFFER_TRACE(bh, "freed: skip"); 1981 BUFFER_TRACE(bh, "freed: skip");
1982 goto unlock; 1982 goto unlock;
1983 } 1983 }
1984 1984
1985 if (!buffer_mapped(bh)) { 1985 if (!buffer_mapped(bh)) {
1986 BUFFER_TRACE(bh, "unmapped"); 1986 BUFFER_TRACE(bh, "unmapped");
1987 ext3_get_block(inode, iblock, bh, 0); 1987 ext3_get_block(inode, iblock, bh, 0);
1988 /* unmapped? It's a hole - nothing to do */ 1988 /* unmapped? It's a hole - nothing to do */
1989 if (!buffer_mapped(bh)) { 1989 if (!buffer_mapped(bh)) {
1990 BUFFER_TRACE(bh, "still unmapped"); 1990 BUFFER_TRACE(bh, "still unmapped");
1991 goto unlock; 1991 goto unlock;
1992 } 1992 }
1993 } 1993 }
1994 1994
1995 /* Ok, it's mapped. Make sure it's up-to-date */ 1995 /* Ok, it's mapped. Make sure it's up-to-date */
1996 if (PageUptodate(page)) 1996 if (PageUptodate(page))
1997 set_buffer_uptodate(bh); 1997 set_buffer_uptodate(bh);
1998 1998
1999 if (!buffer_uptodate(bh)) { 1999 if (!buffer_uptodate(bh)) {
2000 err = -EIO; 2000 err = -EIO;
2001 ll_rw_block(READ, 1, &bh); 2001 ll_rw_block(READ, 1, &bh);
2002 wait_on_buffer(bh); 2002 wait_on_buffer(bh);
2003 /* Uhhuh. Read error. Complain and punt. */ 2003 /* Uhhuh. Read error. Complain and punt. */
2004 if (!buffer_uptodate(bh)) 2004 if (!buffer_uptodate(bh))
2005 goto unlock; 2005 goto unlock;
2006 } 2006 }
2007 2007
2008 if (ext3_should_journal_data(inode)) { 2008 if (ext3_should_journal_data(inode)) {
2009 BUFFER_TRACE(bh, "get write access"); 2009 BUFFER_TRACE(bh, "get write access");
2010 err = ext3_journal_get_write_access(handle, bh); 2010 err = ext3_journal_get_write_access(handle, bh);
2011 if (err) 2011 if (err)
2012 goto unlock; 2012 goto unlock;
2013 } 2013 }
2014 2014
2015 zero_user(page, offset, length); 2015 zero_user(page, offset, length);
2016 BUFFER_TRACE(bh, "zeroed end of block"); 2016 BUFFER_TRACE(bh, "zeroed end of block");
2017 2017
2018 err = 0; 2018 err = 0;
2019 if (ext3_should_journal_data(inode)) { 2019 if (ext3_should_journal_data(inode)) {
2020 err = ext3_journal_dirty_metadata(handle, bh); 2020 err = ext3_journal_dirty_metadata(handle, bh);
2021 } else { 2021 } else {
2022 if (ext3_should_order_data(inode)) 2022 if (ext3_should_order_data(inode))
2023 err = ext3_journal_dirty_data(handle, bh); 2023 err = ext3_journal_dirty_data(handle, bh);
2024 mark_buffer_dirty(bh); 2024 mark_buffer_dirty(bh);
2025 } 2025 }
2026 2026
2027 unlock: 2027 unlock:
2028 unlock_page(page); 2028 unlock_page(page);
2029 page_cache_release(page); 2029 page_cache_release(page);
2030 return err; 2030 return err;
2031 } 2031 }
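
The tail-zeroing arithmetic is easiest to see with concrete numbers. A standalone sketch, assuming a 4096-byte page and 1024-byte blocks (both values illustrative):

	#include <stdio.h>

	#define PAGE_SIZE  4096UL
	#define BLOCKSIZE  1024UL

	int main(void)
	{
		unsigned long from = 5000;	/* new i_size, mid-block */
		unsigned long index = from / PAGE_SIZE;		/* page index */
		unsigned long offset = from & (PAGE_SIZE - 1);	/* offset in page */
		unsigned long length = BLOCKSIZE - (offset & (BLOCKSIZE - 1));

		/*
		 * Bytes [offset, offset + length) of page `index' are zeroed,
		 * so the partial final block carries no stale data past i_size.
		 */
		printf("page %lu: zero %lu bytes at offset %lu\n",
		       index, length, offset);
		return 0;
	}

For from = 5000 this prints "page 1: zero 120 bytes at offset 904", i.e. file bytes 5000..5119, the tail of the final 1K block.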
2032 2032
2033 /* 2033 /*
2034 * Probably it should be a library function... search for first non-zero word 2034 * Probably it should be a library function... search for first non-zero word
2035 * or memcmp with zero_page, whatever is better for particular architecture. 2035 * or memcmp with zero_page, whatever is better for particular architecture.
2036 * Linus? 2036 * Linus?
2037 */ 2037 */
2038 static inline int all_zeroes(__le32 *p, __le32 *q) 2038 static inline int all_zeroes(__le32 *p, __le32 *q)
2039 { 2039 {
2040 while (p < q) 2040 while (p < q)
2041 if (*p++) 2041 if (*p++)
2042 return 0; 2042 return 0;
2043 return 1; 2043 return 1;
2044 } 2044 }
2045 2045
2046 /** 2046 /**
2047 * ext3_find_shared - find the indirect blocks for partial truncation. 2047 * ext3_find_shared - find the indirect blocks for partial truncation.
2048 * @inode: inode in question 2048 * @inode: inode in question
2049 * @depth: depth of the affected branch 2049 * @depth: depth of the affected branch
2050 * @offsets: offsets of pointers in that branch (see ext3_block_to_path) 2050 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
2051 * @chain: place to store the pointers to partial indirect blocks 2051 * @chain: place to store the pointers to partial indirect blocks
2052 * @top: place to the (detached) top of branch 2052 * @top: place to the (detached) top of branch
2053 * 2053 *
2054 * This is a helper function used by ext3_truncate(). 2054 * This is a helper function used by ext3_truncate().
2055 * 2055 *
2056 * When we do truncate() we may have to clean the ends of several 2056 * When we do truncate() we may have to clean the ends of several
2057 * indirect blocks but leave the blocks themselves alive. Block is 2057 * indirect blocks but leave the blocks themselves alive. Block is
2058 * partially truncated if some data below the new i_size is referred 2058 * partially truncated if some data below the new i_size is referred
2059 * from it (and it is on the path to the first completely truncated 2059 * from it (and it is on the path to the first completely truncated
2060 * data block, indeed). We have to free the top of that path along 2060 * data block, indeed). We have to free the top of that path along
2061 * with everything to the right of the path. Since no allocation 2061 * with everything to the right of the path. Since no allocation
2062 * past the truncation point is possible until ext3_truncate() 2062 * past the truncation point is possible until ext3_truncate()
2063 * finishes, we may safely do the latter, but top of branch may 2063 * finishes, we may safely do the latter, but top of branch may
2064 * require special attention - pageout below the truncation point 2064 * require special attention - pageout below the truncation point
2065 * might try to populate it. 2065 * might try to populate it.
2066 * 2066 *
2067 * We atomically detach the top of branch from the tree, store the 2067 * We atomically detach the top of branch from the tree, store the
2068 * block number of its root in *@top, pointers to buffer_heads of 2068 * block number of its root in *@top, pointers to buffer_heads of
2069 * partially truncated blocks - in @chain[].bh and pointers to 2069 * partially truncated blocks - in @chain[].bh and pointers to
2070 * their last elements that should not be removed - in 2070 * their last elements that should not be removed - in
2071 * @chain[].p. Return value is the pointer to last filled element 2071 * @chain[].p. Return value is the pointer to last filled element
2072 * of @chain. 2072 * of @chain.
2073 * 2073 *
2074 * The work left to the caller is the actual freeing of subtrees: 2074 * The work left to the caller is the actual freeing of subtrees:
2075 * a) free the subtree starting from *@top 2075 * a) free the subtree starting from *@top
2076 * b) free the subtrees whose roots are stored in 2076 * b) free the subtrees whose roots are stored in
2077 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 2077 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
2078 * c) free the subtrees growing from the inode past the @chain[0]. 2078 * c) free the subtrees growing from the inode past the @chain[0].
2079 * (no partially truncated stuff there). */ 2079 * (no partially truncated stuff there). */
2080 2080
2081 static Indirect *ext3_find_shared(struct inode *inode, int depth, 2081 static Indirect *ext3_find_shared(struct inode *inode, int depth,
2082 int offsets[4], Indirect chain[4], __le32 *top) 2082 int offsets[4], Indirect chain[4], __le32 *top)
2083 { 2083 {
2084 Indirect *partial, *p; 2084 Indirect *partial, *p;
2085 int k, err; 2085 int k, err;
2086 2086
2087 *top = 0; 2087 *top = 0;
2088 /* Make k index the deepest non-null offset + 1 */ 2088 /* Make k index the deepest non-null offset + 1 */
2089 for (k = depth; k > 1 && !offsets[k-1]; k--) 2089 for (k = depth; k > 1 && !offsets[k-1]; k--)
2090 ; 2090 ;
2091 partial = ext3_get_branch(inode, k, offsets, chain, &err); 2091 partial = ext3_get_branch(inode, k, offsets, chain, &err);
2092 /* Writer: pointers */ 2092 /* Writer: pointers */
2093 if (!partial) 2093 if (!partial)
2094 partial = chain + k-1; 2094 partial = chain + k-1;
2095 /* 2095 /*
2096 * If the branch acquired continuation since we've looked at it - 2096 * If the branch acquired continuation since we've looked at it -
2097 * fine, it should all survive and (new) top doesn't belong to us. 2097 * fine, it should all survive and (new) top doesn't belong to us.
2098 */ 2098 */
2099 if (!partial->key && *partial->p) 2099 if (!partial->key && *partial->p)
2100 /* Writer: end */ 2100 /* Writer: end */
2101 goto no_top; 2101 goto no_top;
2102 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 2102 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
2103 ; 2103 ;
2104 /* 2104 /*
2105 * OK, we've found the last block that must survive. The rest of our 2105 * OK, we've found the last block that must survive. The rest of our
2106 * branch should be detached before unlocking. However, if that rest 2106 * branch should be detached before unlocking. However, if that rest
2107 * of branch is all ours and does not grow immediately from the inode 2107 * of branch is all ours and does not grow immediately from the inode
2108 * it's easier to cheat and just decrement partial->p. 2108 * it's easier to cheat and just decrement partial->p.
2109 */ 2109 */
2110 if (p == chain + k - 1 && p > chain) { 2110 if (p == chain + k - 1 && p > chain) {
2111 p->p--; 2111 p->p--;
2112 } else { 2112 } else {
2113 *top = *p->p; 2113 *top = *p->p;
2114 /* Nope, don't do this in ext3. Must leave the tree intact */ 2114 /* Nope, don't do this in ext3. Must leave the tree intact */
2115 #if 0 2115 #if 0
2116 *p->p = 0; 2116 *p->p = 0;
2117 #endif 2117 #endif
2118 } 2118 }
2119 /* Writer: end */ 2119 /* Writer: end */
2120 2120
2121 while(partial > p) { 2121 while(partial > p) {
2122 brelse(partial->bh); 2122 brelse(partial->bh);
2123 partial--; 2123 partial--;
2124 } 2124 }
2125 no_top: 2125 no_top:
2126 return partial; 2126 return partial;
2127 } 2127 }
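
The offsets[] array that ext3_find_shared() walks comes from ext3_block_to_path(). A userspace sketch of that decomposition, assuming 12 direct slots and 256 pointers per indirect block (a 1K block size); the slot constants 12/13/14 stand in for EXT3_IND_BLOCK, EXT3_DIND_BLOCK and EXT3_TIND_BLOCK:

	#include <stdio.h>

	#define NDIR 12UL	/* direct blocks in the inode */
	#define PTRS 256UL	/* block pointers per indirect block (1K bs) */

	static int block_to_path(unsigned long i_block, unsigned long offsets[4])
	{
		int n = 0;

		if (i_block < NDIR) {
			offsets[n++] = i_block;
		} else if ((i_block -= NDIR) < PTRS) {
			offsets[n++] = 12;		/* single indirect */
			offsets[n++] = i_block;
		} else if ((i_block -= PTRS) < PTRS * PTRS) {
			offsets[n++] = 13;		/* double indirect */
			offsets[n++] = i_block / PTRS;
			offsets[n++] = i_block % PTRS;
		} else {
			i_block -= PTRS * PTRS;
			offsets[n++] = 14;		/* triple indirect */
			offsets[n++] = i_block / (PTRS * PTRS);
			offsets[n++] = (i_block / PTRS) % PTRS;
			offsets[n++] = i_block % PTRS;
		}
		return n;	/* depth of the branch */
	}

	int main(void)
	{
		unsigned long offsets[4];
		int i, n = block_to_path(70000, offsets);

		for (i = 0; i < n; i++)
			printf("level %d -> slot %lu\n", i, offsets[i]);
		return 0;
	}

For block 70000 this yields the triple-indirect path 14/0/16/100; @chain[] holds the buffer_heads read while following exactly such a path.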
2128 2128
2129 /* 2129 /*
2130 * Zero a number of block pointers in either an inode or an indirect block. 2130 * Zero a number of block pointers in either an inode or an indirect block.
2131 * If we restart the transaction we must again get write access to the 2131 * If we restart the transaction we must again get write access to the
2132 * indirect block for further modification. 2132 * indirect block for further modification.
2133 * 2133 *
2134 * We release `count' blocks on disk, but (last - first) may be greater 2134 * We release `count' blocks on disk, but (last - first) may be greater
2135 * than `count' because there can be holes in there. 2135 * than `count' because there can be holes in there.
2136 */ 2136 */
2137 static void ext3_clear_blocks(handle_t *handle, struct inode *inode, 2137 static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2138 struct buffer_head *bh, ext3_fsblk_t block_to_free, 2138 struct buffer_head *bh, ext3_fsblk_t block_to_free,
2139 unsigned long count, __le32 *first, __le32 *last) 2139 unsigned long count, __le32 *first, __le32 *last)
2140 { 2140 {
2141 __le32 *p; 2141 __le32 *p;
2142 if (try_to_extend_transaction(handle, inode)) { 2142 if (try_to_extend_transaction(handle, inode)) {
2143 if (bh) { 2143 if (bh) {
2144 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2144 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2145 if (ext3_journal_dirty_metadata(handle, bh)) 2145 if (ext3_journal_dirty_metadata(handle, bh))
2146 return; 2146 return;
2147 } 2147 }
2148 ext3_mark_inode_dirty(handle, inode); 2148 ext3_mark_inode_dirty(handle, inode);
2149 truncate_restart_transaction(handle, inode); 2149 truncate_restart_transaction(handle, inode);
2150 if (bh) { 2150 if (bh) {
2151 BUFFER_TRACE(bh, "retaking write access"); 2151 BUFFER_TRACE(bh, "retaking write access");
2152 if (ext3_journal_get_write_access(handle, bh)) 2152 if (ext3_journal_get_write_access(handle, bh))
2153 return; 2153 return;
2154 } 2154 }
2155 } 2155 }
2156 2156
2157 /* 2157 /*
2158 * Any buffers which are on the journal will be in memory. We find 2158 * Any buffers which are on the journal will be in memory. We find
2159 * them on the hash table so journal_revoke() will run journal_forget() 2159 * them on the hash table so journal_revoke() will run journal_forget()
2160 * on them. We've already detached each block from the file, so 2160 * on them. We've already detached each block from the file, so
2161 * bforget() in journal_forget() should be safe. 2161 * bforget() in journal_forget() should be safe.
2162 * 2162 *
2163 * AKPM: turn on bforget in journal_forget()!!! 2163 * AKPM: turn on bforget in journal_forget()!!!
2164 */ 2164 */
2165 for (p = first; p < last; p++) { 2165 for (p = first; p < last; p++) {
2166 u32 nr = le32_to_cpu(*p); 2166 u32 nr = le32_to_cpu(*p);
2167 if (nr) { 2167 if (nr) {
2168 struct buffer_head *bh; 2168 struct buffer_head *bh;
2169 2169
2170 *p = 0; 2170 *p = 0;
2171 bh = sb_find_get_block(inode->i_sb, nr); 2171 bh = sb_find_get_block(inode->i_sb, nr);
2172 ext3_forget(handle, 0, inode, bh, nr); 2172 ext3_forget(handle, 0, inode, bh, nr);
2173 } 2173 }
2174 } 2174 }
2175 2175
2176 ext3_free_blocks(handle, inode, block_to_free, count); 2176 ext3_free_blocks(handle, inode, block_to_free, count);
2177 } 2177 }
2178 2178
2179 /** 2179 /**
2180 * ext3_free_data - free a list of data blocks 2180 * ext3_free_data - free a list of data blocks
2181 * @handle: handle for this transaction 2181 * @handle: handle for this transaction
2182 * @inode: inode we are dealing with 2182 * @inode: inode we are dealing with
2183 * @this_bh: indirect buffer_head which contains *@first and *@last 2183 * @this_bh: indirect buffer_head which contains *@first and *@last
2184 * @first: array of block numbers 2184 * @first: array of block numbers
2185 * @last: points immediately past the end of array 2185 * @last: points immediately past the end of array
2186 * 2186 *
2187 * We are freeing all blocks referred from that array (numbers are stored as 2187 * We are freeing all blocks referred from that array (numbers are stored as
2188 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 2188 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2189 * 2189 *
2190 * We accumulate contiguous runs of blocks to free. Conveniently, if these 2190 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2191 * blocks are contiguous then releasing them at one time will only affect one 2191 * blocks are contiguous then releasing them at one time will only affect one
2192 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 2192 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2193 * actually use a lot of journal space. 2193 * actually use a lot of journal space.
2194 * 2194 *
2195 * @this_bh will be %NULL if @first and @last point into the inode's direct 2195 * @this_bh will be %NULL if @first and @last point into the inode's direct
2196 * block pointers. 2196 * block pointers.
2197 */ 2197 */
2198 static void ext3_free_data(handle_t *handle, struct inode *inode, 2198 static void ext3_free_data(handle_t *handle, struct inode *inode,
2199 struct buffer_head *this_bh, 2199 struct buffer_head *this_bh,
2200 __le32 *first, __le32 *last) 2200 __le32 *first, __le32 *last)
2201 { 2201 {
2202 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ 2202 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
2203 unsigned long count = 0; /* Number of blocks in the run */ 2203 unsigned long count = 0; /* Number of blocks in the run */
2204 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 2204 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2205 corresponding to 2205 corresponding to
2206 block_to_free */ 2206 block_to_free */
2207 ext3_fsblk_t nr; /* Current block # */ 2207 ext3_fsblk_t nr; /* Current block # */
2208 __le32 *p; /* Pointer into inode/ind 2208 __le32 *p; /* Pointer into inode/ind
2209 for current block */ 2209 for current block */
2210 int err; 2210 int err;
2211 2211
2212 if (this_bh) { /* For indirect block */ 2212 if (this_bh) { /* For indirect block */
2213 BUFFER_TRACE(this_bh, "get_write_access"); 2213 BUFFER_TRACE(this_bh, "get_write_access");
2214 err = ext3_journal_get_write_access(handle, this_bh); 2214 err = ext3_journal_get_write_access(handle, this_bh);
2215 /* Important: if we can't update the indirect pointers 2215 /* Important: if we can't update the indirect pointers
2216 * to the blocks, we can't free them. */ 2216 * to the blocks, we can't free them. */
2217 if (err) 2217 if (err)
2218 return; 2218 return;
2219 } 2219 }
2220 2220
2221 for (p = first; p < last; p++) { 2221 for (p = first; p < last; p++) {
2222 nr = le32_to_cpu(*p); 2222 nr = le32_to_cpu(*p);
2223 if (nr) { 2223 if (nr) {
2224 /* accumulate blocks to free if they're contiguous */ 2224 /* accumulate blocks to free if they're contiguous */
2225 if (count == 0) { 2225 if (count == 0) {
2226 block_to_free = nr; 2226 block_to_free = nr;
2227 block_to_free_p = p; 2227 block_to_free_p = p;
2228 count = 1; 2228 count = 1;
2229 } else if (nr == block_to_free + count) { 2229 } else if (nr == block_to_free + count) {
2230 count++; 2230 count++;
2231 } else { 2231 } else {
2232 ext3_clear_blocks(handle, inode, this_bh, 2232 ext3_clear_blocks(handle, inode, this_bh,
2233 block_to_free, 2233 block_to_free,
2234 count, block_to_free_p, p); 2234 count, block_to_free_p, p);
2235 block_to_free = nr; 2235 block_to_free = nr;
2236 block_to_free_p = p; 2236 block_to_free_p = p;
2237 count = 1; 2237 count = 1;
2238 } 2238 }
2239 } 2239 }
2240 } 2240 }
2241 2241
2242 if (count > 0) 2242 if (count > 0)
2243 ext3_clear_blocks(handle, inode, this_bh, block_to_free, 2243 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
2244 count, block_to_free_p, p); 2244 count, block_to_free_p, p);
2245 2245
2246 if (this_bh) { 2246 if (this_bh) {
2247 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); 2247 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
2248 2248
2249 /* 2249 /*
2250 * The buffer head should have an attached journal head at this 2250 * The buffer head should have an attached journal head at this
2251 * point. However, if the data is corrupted and an indirect 2251 * point. However, if the data is corrupted and an indirect
2252 * block pointed to itself, it would have been detached when 2252 * block pointed to itself, it would have been detached when
2253 * the block was cleared. Check for this instead of OOPSing. 2253 * the block was cleared. Check for this instead of OOPSing.
2254 */ 2254 */
2255 if (bh2jh(this_bh)) 2255 if (bh2jh(this_bh))
2256 ext3_journal_dirty_metadata(handle, this_bh); 2256 ext3_journal_dirty_metadata(handle, this_bh);
2257 else 2257 else
2258 ext3_error(inode->i_sb, "ext3_free_data", 2258 ext3_error(inode->i_sb, "ext3_free_data",
2259 "circular indirect block detected, " 2259 "circular indirect block detected, "
2260 "inode=%lu, block=%llu", 2260 "inode=%lu, block=%llu",
2261 inode->i_ino, 2261 inode->i_ino,
2262 (unsigned long long)this_bh->b_blocknr); 2262 (unsigned long long)this_bh->b_blocknr);
2263 } 2263 }
2264 } 2264 }
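
The run accumulation described above, as a self-contained userspace sketch (the block numbers are made up, and free_run() stands in for ext3_clear_blocks()):

	#include <stdio.h>

	static void free_run(unsigned long start, unsigned long count)
	{
		printf("free %lu block(s) starting at %lu\n", count, start);
	}

	int main(void)
	{
		/* Sample pointer array; zeros are holes and are skipped. */
		unsigned long blocks[] = { 100, 101, 102, 0, 200, 201, 57 };
		unsigned long start = 0, count = 0;
		unsigned int i;

		for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
			unsigned long nr = blocks[i];

			if (!nr)
				continue;
			if (count == 0) {
				start = nr;
				count = 1;
			} else if (nr == start + count) {
				count++;	/* extends the current run */
			} else {
				free_run(start, count);
				start = nr;
				count = 1;
			}
		}
		if (count > 0)
			free_run(start, count);
		return 0;
	}

This emits three runs (100..102, 200..201, 57). Batching adjacent blocks means one bitmap update typically covers a whole run, which is why the journal credits stay small.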
2265 2265
2266 /** 2266 /**
2267 * ext3_free_branches - free an array of branches 2267 * ext3_free_branches - free an array of branches
2268 * @handle: JBD handle for this transaction 2268 * @handle: JBD handle for this transaction
2269 * @inode: inode we are dealing with 2269 * @inode: inode we are dealing with
2270 * @parent_bh: the buffer_head which contains *@first and *@last 2270 * @parent_bh: the buffer_head which contains *@first and *@last
2271 * @first: array of block numbers 2271 * @first: array of block numbers
2272 * @last: pointer immediately past the end of array 2272 * @last: pointer immediately past the end of array
2273 * @depth: depth of the branches to free 2273 * @depth: depth of the branches to free
2274 * 2274 *
2275 * We are freeing all blocks referred from these branches (numbers are 2275 * We are freeing all blocks referred from these branches (numbers are
2276 * stored as little-endian 32-bit) and updating @inode->i_blocks 2276 * stored as little-endian 32-bit) and updating @inode->i_blocks
2277 * appropriately. 2277 * appropriately.
2278 */ 2278 */
2279 static void ext3_free_branches(handle_t *handle, struct inode *inode, 2279 static void ext3_free_branches(handle_t *handle, struct inode *inode,
2280 struct buffer_head *parent_bh, 2280 struct buffer_head *parent_bh,
2281 __le32 *first, __le32 *last, int depth) 2281 __le32 *first, __le32 *last, int depth)
2282 { 2282 {
2283 ext3_fsblk_t nr; 2283 ext3_fsblk_t nr;
2284 __le32 *p; 2284 __le32 *p;
2285 2285
2286 if (is_handle_aborted(handle)) 2286 if (is_handle_aborted(handle))
2287 return; 2287 return;
2288 2288
2289 if (depth--) { 2289 if (depth--) {
2290 struct buffer_head *bh; 2290 struct buffer_head *bh;
2291 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); 2291 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2292 p = last; 2292 p = last;
2293 while (--p >= first) { 2293 while (--p >= first) {
2294 nr = le32_to_cpu(*p); 2294 nr = le32_to_cpu(*p);
2295 if (!nr) 2295 if (!nr)
2296 continue; /* A hole */ 2296 continue; /* A hole */
2297 2297
2298 /* Go read the buffer for the next level down */ 2298 /* Go read the buffer for the next level down */
2299 bh = sb_bread(inode->i_sb, nr); 2299 bh = sb_bread(inode->i_sb, nr);
2300 2300
2301 /* 2301 /*
2302 * A read failure? Report error and clear slot 2302 * A read failure? Report error and clear slot
2303 * (should be rare). 2303 * (should be rare).
2304 */ 2304 */
2305 if (!bh) { 2305 if (!bh) {
2306 ext3_error(inode->i_sb, "ext3_free_branches", 2306 ext3_error(inode->i_sb, "ext3_free_branches",
2307 "Read failure, inode=%lu, block="E3FSBLK, 2307 "Read failure, inode=%lu, block="E3FSBLK,
2308 inode->i_ino, nr); 2308 inode->i_ino, nr);
2309 continue; 2309 continue;
2310 } 2310 }
2311 2311
2312 /* This zaps the entire block. Bottom up. */ 2312 /* This zaps the entire block. Bottom up. */
2313 BUFFER_TRACE(bh, "free child branches"); 2313 BUFFER_TRACE(bh, "free child branches");
2314 ext3_free_branches(handle, inode, bh, 2314 ext3_free_branches(handle, inode, bh,
2315 (__le32*)bh->b_data, 2315 (__le32*)bh->b_data,
2316 (__le32*)bh->b_data + addr_per_block, 2316 (__le32*)bh->b_data + addr_per_block,
2317 depth); 2317 depth);
2318 2318
2319 /* 2319 /*
2320 * Everything below this pointer has been 2320 * Everything below this pointer has been
2321 * released. Now let this top-of-subtree go. 2321 * released. Now let this top-of-subtree go.
2322 * 2322 *
2323 * We want the freeing of this indirect block to be 2323 * We want the freeing of this indirect block to be
2324 * atomic in the journal with the updating of the 2324 * atomic in the journal with the updating of the
2325 * bitmap block which owns it. So make some room in 2325 * bitmap block which owns it. So make some room in
2326 * the journal. 2326 * the journal.
2327 * 2327 *
2328 * We zero the parent pointer *after* freeing its 2328 * We zero the parent pointer *after* freeing its
2329 * pointee in the bitmaps, so if extend_transaction() 2329 * pointee in the bitmaps, so if extend_transaction()
2330 * for some reason fails to put the bitmap changes and 2330 * for some reason fails to put the bitmap changes and
2331 * the release into the same transaction, recovery 2331 * the release into the same transaction, recovery
2332 * will merely complain about releasing a free block, 2332 * will merely complain about releasing a free block,
2333 * rather than leaking blocks. 2333 * rather than leaking blocks.
2334 */ 2334 */
2335 if (is_handle_aborted(handle)) 2335 if (is_handle_aborted(handle))
2336 return; 2336 return;
2337 if (try_to_extend_transaction(handle, inode)) { 2337 if (try_to_extend_transaction(handle, inode)) {
2338 ext3_mark_inode_dirty(handle, inode); 2338 ext3_mark_inode_dirty(handle, inode);
2339 truncate_restart_transaction(handle, inode); 2339 truncate_restart_transaction(handle, inode);
2340 } 2340 }
2341 2341
2342 /* 2342 /*
2343 * We've probably journalled the indirect block several 2343 * We've probably journalled the indirect block several
2344 * times during the truncate. But it's no longer 2344 * times during the truncate. But it's no longer
2345 * needed and we now drop it from the transaction via 2345 * needed and we now drop it from the transaction via
2346 * journal_revoke(). 2346 * journal_revoke().
2347 * 2347 *
2348 * That's easy if it's exclusively part of this 2348 * That's easy if it's exclusively part of this
2349 * transaction. But if it's part of the committing 2349 * transaction. But if it's part of the committing
2350 * transaction then journal_forget() will simply 2350 * transaction then journal_forget() will simply
2351 * brelse() it. That means that if the underlying 2351 * brelse() it. That means that if the underlying
2352 * block is reallocated in ext3_get_block(), 2352 * block is reallocated in ext3_get_block(),
2353 * unmap_underlying_metadata() will find this block 2353 * unmap_underlying_metadata() will find this block
2354 * and will try to get rid of it. damn, damn. Thus 2354 * and will try to get rid of it. damn, damn. Thus
2355 * we don't allow a block to be reallocated until 2355 * we don't allow a block to be reallocated until
2356 * a transaction freeing it has fully committed. 2356 * a transaction freeing it has fully committed.
2357 * 2357 *
2358 * We also have to make sure journal replay after a 2358 * We also have to make sure journal replay after a
2359 * crash does not overwrite non-journaled data blocks 2359 * crash does not overwrite non-journaled data blocks
2360 * with old metadata when the block got reallocated for 2360 * with old metadata when the block got reallocated for
2361 * data. Thus we have to store a revoke record for a 2361 * data. Thus we have to store a revoke record for a
2362 * block in the same transaction in which we free the 2362 * block in the same transaction in which we free the
2363 * block. 2363 * block.
2364 */ 2364 */
2365 ext3_forget(handle, 1, inode, bh, bh->b_blocknr); 2365 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2366 2366
2367 ext3_free_blocks(handle, inode, nr, 1); 2367 ext3_free_blocks(handle, inode, nr, 1);
2368 2368
2369 if (parent_bh) { 2369 if (parent_bh) {
2370 /* 2370 /*
2371 * The block which we have just freed is 2371 * The block which we have just freed is
2372 * pointed to by an indirect block: journal it 2372 * pointed to by an indirect block: journal it
2373 */ 2373 */
2374 BUFFER_TRACE(parent_bh, "get_write_access"); 2374 BUFFER_TRACE(parent_bh, "get_write_access");
2375 if (!ext3_journal_get_write_access(handle, 2375 if (!ext3_journal_get_write_access(handle,
2376 parent_bh)){ 2376 parent_bh)){
2377 *p = 0; 2377 *p = 0;
2378 BUFFER_TRACE(parent_bh, 2378 BUFFER_TRACE(parent_bh,
2379 "call ext3_journal_dirty_metadata"); 2379 "call ext3_journal_dirty_metadata");
2380 ext3_journal_dirty_metadata(handle, 2380 ext3_journal_dirty_metadata(handle,
2381 parent_bh); 2381 parent_bh);
2382 } 2382 }
2383 } 2383 }
2384 } 2384 }
2385 } else { 2385 } else {
2386 /* We have reached the bottom of the tree. */ 2386 /* We have reached the bottom of the tree. */
2387 BUFFER_TRACE(parent_bh, "free data blocks"); 2387 BUFFER_TRACE(parent_bh, "free data blocks");
2388 ext3_free_data(handle, inode, parent_bh, first, last); 2388 ext3_free_data(handle, inode, parent_bh, first, last);
2389 } 2389 }
2390 } 2390 }
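
A toy userspace model of this bottom-up, right-to-left order: every child of an indirect block is released before the block itself, so a crash mid-truncate never leaves the tree referencing freed blocks:

	#include <stdio.h>

	struct node {
		unsigned long nr;	/* block number */
		struct node *child[2];	/* toy fan-out; NULL is a hole */
	};

	static void free_branches(struct node *n, int depth)
	{
		if (!n)
			return;				/* a hole: nothing to do */
		if (depth) {				/* still an indirect level */
			free_branches(n->child[1], depth - 1);	/* right to left */
			free_branches(n->child[0], depth - 1);
		}
		printf("free block %lu\n", n->nr);	/* parent freed last */
	}

	int main(void)
	{
		struct node d1 = { 10, { NULL, NULL } };
		struct node d2 = { 11, { NULL, NULL } };
		struct node ind = { 5, { &d1, &d2 } };	/* indirect -> two data */

		free_branches(&ind, 1);
		return 0;
	}

Output is "free block 11", "free block 10", then "free block 5": data blocks go first, the indirect block that pointed at them goes last.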
2391 2391
2392 int ext3_can_truncate(struct inode *inode) 2392 int ext3_can_truncate(struct inode *inode)
2393 { 2393 {
2394 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2394 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2395 return 0; 2395 return 0;
2396 if (S_ISREG(inode->i_mode)) 2396 if (S_ISREG(inode->i_mode))
2397 return 1; 2397 return 1;
2398 if (S_ISDIR(inode->i_mode)) 2398 if (S_ISDIR(inode->i_mode))
2399 return 1; 2399 return 1;
2400 if (S_ISLNK(inode->i_mode)) 2400 if (S_ISLNK(inode->i_mode))
2401 return !ext3_inode_is_fast_symlink(inode); 2401 return !ext3_inode_is_fast_symlink(inode);
2402 return 0; 2402 return 0;
2403 } 2403 }
2404 2404
2405 /* 2405 /*
2406 * ext3_truncate() 2406 * ext3_truncate()
2407 * 2407 *
2408 * We block out ext3_get_block() block instantiations across the entire 2408 * We block out ext3_get_block() block instantiations across the entire
2409 * transaction, and VFS/VM ensures that ext3_truncate() cannot run 2409 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2410 * simultaneously on behalf of the same inode. 2410 * simultaneously on behalf of the same inode.
2411 * 2411 *
2412 * As we work through the truncate and commit bits of it to the journal there 2412 * As we work through the truncate and commit bits of it to the journal there
2413 * is one core, guiding principle: the file's tree must always be consistent on 2413 * is one core, guiding principle: the file's tree must always be consistent on
2414 * disk. We must be able to restart the truncate after a crash. 2414 * disk. We must be able to restart the truncate after a crash.
2415 * 2415 *
2416 * The file's tree may be transiently inconsistent in memory (although it 2416 * The file's tree may be transiently inconsistent in memory (although it
2417 * probably isn't), but whenever we close off and commit a journal transaction, 2417 * probably isn't), but whenever we close off and commit a journal transaction,
2418 * the contents of (the filesystem + the journal) must be consistent and 2418 * the contents of (the filesystem + the journal) must be consistent and
2419 * restartable. It's pretty simple, really: bottom up, right to left (although 2419 * restartable. It's pretty simple, really: bottom up, right to left (although
2420 * left-to-right works OK too). 2420 * left-to-right works OK too).
2421 * 2421 *
2422 * Note that at recovery time, journal replay occurs *before* the restart of 2422 * Note that at recovery time, journal replay occurs *before* the restart of
2423 * truncate against the orphan inode list. 2423 * truncate against the orphan inode list.
2424 * 2424 *
2425 * The committed inode has the new, desired i_size (which is the same as 2425 * The committed inode has the new, desired i_size (which is the same as
2426 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see 2426 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2427 * that this inode's truncate did not complete and it will again call 2427 * that this inode's truncate did not complete and it will again call
2428 * ext3_truncate() to have another go. So there will be instantiated blocks 2428 * ext3_truncate() to have another go. So there will be instantiated blocks
2429 * to the right of the truncation point in a crashed ext3 filesystem. But 2429 * to the right of the truncation point in a crashed ext3 filesystem. But
2430 * that's fine - as long as they are linked from the inode, the post-crash 2430 * that's fine - as long as they are linked from the inode, the post-crash
2431 * ext3_truncate() run will find them and release them. 2431 * ext3_truncate() run will find them and release them.
2432 */ 2432 */
2433 void ext3_truncate(struct inode *inode) 2433 void ext3_truncate(struct inode *inode)
2434 { 2434 {
2435 handle_t *handle; 2435 handle_t *handle;
2436 struct ext3_inode_info *ei = EXT3_I(inode); 2436 struct ext3_inode_info *ei = EXT3_I(inode);
2437 __le32 *i_data = ei->i_data; 2437 __le32 *i_data = ei->i_data;
2438 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); 2438 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2439 struct address_space *mapping = inode->i_mapping; 2439 struct address_space *mapping = inode->i_mapping;
2440 int offsets[4]; 2440 int offsets[4];
2441 Indirect chain[4]; 2441 Indirect chain[4];
2442 Indirect *partial; 2442 Indirect *partial;
2443 __le32 nr = 0; 2443 __le32 nr = 0;
2444 int n; 2444 int n;
2445 long last_block; 2445 long last_block;
2446 unsigned blocksize = inode->i_sb->s_blocksize; 2446 unsigned blocksize = inode->i_sb->s_blocksize;
2447 struct page *page; 2447 struct page *page;
2448 2448
2449 if (!ext3_can_truncate(inode)) 2449 if (!ext3_can_truncate(inode))
2450 goto out_notrans; 2450 goto out_notrans;
2451 2451
2452 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2452 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2453 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); 2453 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2454 2454
2455 /* 2455 /*
2456 * We have to lock the EOF page here, because lock_page() nests 2456 * We have to lock the EOF page here, because lock_page() nests
2457 * outside journal_start(). 2457 * outside journal_start().
2458 */ 2458 */
2459 if ((inode->i_size & (blocksize - 1)) == 0) { 2459 if ((inode->i_size & (blocksize - 1)) == 0) {
2460 /* Block boundary? Nothing to do */ 2460 /* Block boundary? Nothing to do */
2461 page = NULL; 2461 page = NULL;
2462 } else { 2462 } else {
2463 page = grab_cache_page(mapping, 2463 page = grab_cache_page(mapping,
2464 inode->i_size >> PAGE_CACHE_SHIFT); 2464 inode->i_size >> PAGE_CACHE_SHIFT);
2465 if (!page) 2465 if (!page)
2466 goto out_notrans; 2466 goto out_notrans;
2467 } 2467 }
2468 2468
2469 handle = start_transaction(inode); 2469 handle = start_transaction(inode);
2470 if (IS_ERR(handle)) { 2470 if (IS_ERR(handle)) {
2471 if (page) { 2471 if (page) {
2472 clear_highpage(page); 2472 clear_highpage(page);
2473 flush_dcache_page(page); 2473 flush_dcache_page(page);
2474 unlock_page(page); 2474 unlock_page(page);
2475 page_cache_release(page); 2475 page_cache_release(page);
2476 } 2476 }
2477 goto out_notrans; 2477 goto out_notrans;
2478 } 2478 }
2479 2479
2480 last_block = (inode->i_size + blocksize-1) 2480 last_block = (inode->i_size + blocksize-1)
2481 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); 2481 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2482 2482
2483 if (page) 2483 if (page)
2484 ext3_block_truncate_page(handle, page, mapping, inode->i_size); 2484 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2485 2485
2486 n = ext3_block_to_path(inode, last_block, offsets, NULL); 2486 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2487 if (n == 0) 2487 if (n == 0)
2488 goto out_stop; /* error */ 2488 goto out_stop; /* error */
2489 2489
2490 /* 2490 /*
2491 * OK. This truncate is going to happen. We add the inode to the 2491 * OK. This truncate is going to happen. We add the inode to the
2492 * orphan list, so that if this truncate spans multiple transactions, 2492 * orphan list, so that if this truncate spans multiple transactions,
2493 * and we crash, we will resume the truncate when the filesystem 2493 * and we crash, we will resume the truncate when the filesystem
2494 * recovers. It also marks the inode dirty, to catch the new size. 2494 * recovers. It also marks the inode dirty, to catch the new size.
2495 * 2495 *
2496 * Implication: the file must always be in a sane, consistent 2496 * Implication: the file must always be in a sane, consistent
2497 * truncatable state while each transaction commits. 2497 * truncatable state while each transaction commits.
2498 */ 2498 */
2499 if (ext3_orphan_add(handle, inode)) 2499 if (ext3_orphan_add(handle, inode))
2500 goto out_stop; 2500 goto out_stop;
2501 2501
2502 /* 2502 /*
2503 * The orphan list entry will now protect us from any crash which 2503 * The orphan list entry will now protect us from any crash which
2504 * occurs before the truncate completes, so it is now safe to propagate 2504 * occurs before the truncate completes, so it is now safe to propagate
2505 * the new, shorter inode size (held for now in i_size) into the 2505 * the new, shorter inode size (held for now in i_size) into the
2506 * on-disk inode. We do this via i_disksize, which is the value which 2506 * on-disk inode. We do this via i_disksize, which is the value which
2507 * ext3 *really* writes onto the disk inode. 2507 * ext3 *really* writes onto the disk inode.
2508 */ 2508 */
2509 ei->i_disksize = inode->i_size; 2509 ei->i_disksize = inode->i_size;
2510 2510
2511 /* 2511 /*
2512 * From here we block out all ext3_get_block() callers who want to 2512 * From here we block out all ext3_get_block() callers who want to
2513 * modify the block allocation tree. 2513 * modify the block allocation tree.
2514 */ 2514 */
2515 mutex_lock(&ei->truncate_mutex); 2515 mutex_lock(&ei->truncate_mutex);
2516 2516
2517 if (n == 1) { /* direct blocks */ 2517 if (n == 1) { /* direct blocks */
2518 ext3_free_data(handle, inode, NULL, i_data+offsets[0], 2518 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2519 i_data + EXT3_NDIR_BLOCKS); 2519 i_data + EXT3_NDIR_BLOCKS);
2520 goto do_indirects; 2520 goto do_indirects;
2521 } 2521 }
2522 2522
2523 partial = ext3_find_shared(inode, n, offsets, chain, &nr); 2523 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2524 /* Kill the top of shared branch (not detached) */ 2524 /* Kill the top of shared branch (not detached) */
2525 if (nr) { 2525 if (nr) {
2526 if (partial == chain) { 2526 if (partial == chain) {
2527 /* Shared branch grows from the inode */ 2527 /* Shared branch grows from the inode */
2528 ext3_free_branches(handle, inode, NULL, 2528 ext3_free_branches(handle, inode, NULL,
2529 &nr, &nr+1, (chain+n-1) - partial); 2529 &nr, &nr+1, (chain+n-1) - partial);
2530 *partial->p = 0; 2530 *partial->p = 0;
2531 /* 2531 /*
2532 * We mark the inode dirty prior to restart, 2532 * We mark the inode dirty prior to restart,
2533 * and prior to stop. No need for it here. 2533 * and prior to stop. No need for it here.
2534 */ 2534 */
2535 } else { 2535 } else {
2536 /* Shared branch grows from an indirect block */ 2536 /* Shared branch grows from an indirect block */
2537 ext3_free_branches(handle, inode, partial->bh, 2537 ext3_free_branches(handle, inode, partial->bh,
2538 partial->p, 2538 partial->p,
2539 partial->p+1, (chain+n-1) - partial); 2539 partial->p+1, (chain+n-1) - partial);
2540 } 2540 }
2541 } 2541 }
2542 /* Clear the ends of indirect blocks on the shared branch */ 2542 /* Clear the ends of indirect blocks on the shared branch */
2543 while (partial > chain) { 2543 while (partial > chain) {
2544 ext3_free_branches(handle, inode, partial->bh, partial->p + 1, 2544 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2545 (__le32*)partial->bh->b_data+addr_per_block, 2545 (__le32*)partial->bh->b_data+addr_per_block,
2546 (chain+n-1) - partial); 2546 (chain+n-1) - partial);
2547 BUFFER_TRACE(partial->bh, "call brelse"); 2547 BUFFER_TRACE(partial->bh, "call brelse");
2548 brelse (partial->bh); 2548 brelse (partial->bh);
2549 partial--; 2549 partial--;
2550 } 2550 }
2551 do_indirects: 2551 do_indirects:
2552 /* Kill the remaining (whole) subtrees */ 2552 /* Kill the remaining (whole) subtrees */
2553 switch (offsets[0]) { 2553 switch (offsets[0]) {
2554 default: 2554 default:
2555 nr = i_data[EXT3_IND_BLOCK]; 2555 nr = i_data[EXT3_IND_BLOCK];
2556 if (nr) { 2556 if (nr) {
2557 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 2557 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
2558 i_data[EXT3_IND_BLOCK] = 0; 2558 i_data[EXT3_IND_BLOCK] = 0;
2559 } 2559 }
2560 case EXT3_IND_BLOCK: 2560 case EXT3_IND_BLOCK:
2561 nr = i_data[EXT3_DIND_BLOCK]; 2561 nr = i_data[EXT3_DIND_BLOCK];
2562 if (nr) { 2562 if (nr) {
2563 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 2563 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
2564 i_data[EXT3_DIND_BLOCK] = 0; 2564 i_data[EXT3_DIND_BLOCK] = 0;
2565 } 2565 }
2566 case EXT3_DIND_BLOCK: 2566 case EXT3_DIND_BLOCK:
2567 nr = i_data[EXT3_TIND_BLOCK]; 2567 nr = i_data[EXT3_TIND_BLOCK];
2568 if (nr) { 2568 if (nr) {
2569 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 2569 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
2570 i_data[EXT3_TIND_BLOCK] = 0; 2570 i_data[EXT3_TIND_BLOCK] = 0;
2571 } 2571 }
2572 case EXT3_TIND_BLOCK: 2572 case EXT3_TIND_BLOCK:
2573 ; 2573 ;
2574 } 2574 }
2575 2575
2576 ext3_discard_reservation(inode); 2576 ext3_discard_reservation(inode);
2577 2577
2578 mutex_unlock(&ei->truncate_mutex); 2578 mutex_unlock(&ei->truncate_mutex);
2579 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 2579 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2580 ext3_mark_inode_dirty(handle, inode); 2580 ext3_mark_inode_dirty(handle, inode);
2581 2581
2582 /* 2582 /*
2583 * In a multi-transaction truncate, we only make the final transaction 2583 * In a multi-transaction truncate, we only make the final transaction
2584 * synchronous 2584 * synchronous
2585 */ 2585 */
2586 if (IS_SYNC(inode)) 2586 if (IS_SYNC(inode))
2587 handle->h_sync = 1; 2587 handle->h_sync = 1;
2588 out_stop: 2588 out_stop:
2589 /* 2589 /*
2590 * If this was a simple ftruncate(), and the file will remain alive 2590 * If this was a simple ftruncate(), and the file will remain alive
2591 * then we need to clear up the orphan record which we created above. 2591 * then we need to clear up the orphan record which we created above.
2592 * However, if this was a real unlink then we were called by 2592 * However, if this was a real unlink then we were called by
2593 * ext3_evict_inode(), and we allow that function to clean up the 2593 * ext3_evict_inode(), and we allow that function to clean up the
2594 * orphan info for us. 2594 * orphan info for us.
2595 */ 2595 */
2596 if (inode->i_nlink) 2596 if (inode->i_nlink)
2597 ext3_orphan_del(handle, inode); 2597 ext3_orphan_del(handle, inode);
2598 2598
2599 ext3_journal_stop(handle); 2599 ext3_journal_stop(handle);
2600 return; 2600 return;
2601 out_notrans: 2601 out_notrans:
2602 /* 2602 /*
2603 * Delete the inode from orphan list so that it doesn't stay there 2603 * Delete the inode from orphan list so that it doesn't stay there
2604 * forever and trigger assertion on umount. 2604 * forever and trigger assertion on umount.
2605 */ 2605 */
2606 if (inode->i_nlink) 2606 if (inode->i_nlink)
2607 ext3_orphan_del(NULL, inode); 2607 ext3_orphan_del(NULL, inode);
2608 } 2608 }
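The do_indirects switch above relies on deliberate case fallthrough: entering at the case that matches the truncate depth frees that indirect tree and then every deeper one. A minimal user-space sketch of the same control pattern (the function name and messages are illustrative, not kernel API):

#include <stdio.h>

/* Mirror of the do_indirects switch: entering at a given depth frees
 * that subtree and falls through to every deeper one. */
static void free_from_depth(int depth)
{
	switch (depth) {
	case 0:
		printf("free single-indirect tree\n");
		/* fall through */
	case 1:
		printf("free double-indirect tree\n");
		/* fall through */
	case 2:
		printf("free triple-indirect tree\n");
	}
}

int main(void)
{
	free_from_depth(1);	/* frees the double- and triple-indirect trees */
	return 0;
}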
2609 2609
2610 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, 2610 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
2611 unsigned long ino, struct ext3_iloc *iloc) 2611 unsigned long ino, struct ext3_iloc *iloc)
2612 { 2612 {
2613 unsigned long block_group; 2613 unsigned long block_group;
2614 unsigned long offset; 2614 unsigned long offset;
2615 ext3_fsblk_t block; 2615 ext3_fsblk_t block;
2616 struct ext3_group_desc *gdp; 2616 struct ext3_group_desc *gdp;
2617 2617
2618 if (!ext3_valid_inum(sb, ino)) { 2618 if (!ext3_valid_inum(sb, ino)) {
2619 /* 2619 /*
2620 * This error is already checked for in namei.c unless we are 2620 * This error is already checked for in namei.c unless we are
2621 * looking at an NFS filehandle, in which case no error 2621 * looking at an NFS filehandle, in which case no error
2622 * report is needed 2622 * report is needed
2623 */ 2623 */
2624 return 0; 2624 return 0;
2625 } 2625 }
2626 2626
2627 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); 2627 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2628 gdp = ext3_get_group_desc(sb, block_group, NULL); 2628 gdp = ext3_get_group_desc(sb, block_group, NULL);
2629 if (!gdp) 2629 if (!gdp)
2630 return 0; 2630 return 0;
2631 /* 2631 /*
2632 * Figure out the offset within the block group inode table 2632 * Figure out the offset within the block group inode table
2633 */ 2633 */
2634 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * 2634 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2635 EXT3_INODE_SIZE(sb); 2635 EXT3_INODE_SIZE(sb);
2636 block = le32_to_cpu(gdp->bg_inode_table) + 2636 block = le32_to_cpu(gdp->bg_inode_table) +
2637 (offset >> EXT3_BLOCK_SIZE_BITS(sb)); 2637 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2638 2638
2639 iloc->block_group = block_group; 2639 iloc->block_group = block_group;
2640 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); 2640 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2641 return block; 2641 return block;
2642 } 2642 }
2643 2643
2644 /* 2644 /*
2645 * ext3_get_inode_loc returns with an extra refcount against the inode's 2645 * ext3_get_inode_loc returns with an extra refcount against the inode's
2646 * underlying buffer_head on success. If 'in_mem' is true, we have all 2646 * underlying buffer_head on success. If 'in_mem' is true, we have all
2647 * data in memory that is needed to recreate the on-disk version of this 2647 * data in memory that is needed to recreate the on-disk version of this
2648 * inode. 2648 * inode.
2649 */ 2649 */
2650 static int __ext3_get_inode_loc(struct inode *inode, 2650 static int __ext3_get_inode_loc(struct inode *inode,
2651 struct ext3_iloc *iloc, int in_mem) 2651 struct ext3_iloc *iloc, int in_mem)
2652 { 2652 {
2653 ext3_fsblk_t block; 2653 ext3_fsblk_t block;
2654 struct buffer_head *bh; 2654 struct buffer_head *bh;
2655 2655
2656 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); 2656 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2657 if (!block) 2657 if (!block)
2658 return -EIO; 2658 return -EIO;
2659 2659
2660 bh = sb_getblk(inode->i_sb, block); 2660 bh = sb_getblk(inode->i_sb, block);
2661 if (!bh) { 2661 if (!bh) {
2662 ext3_error (inode->i_sb, "ext3_get_inode_loc", 2662 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2663 "unable to read inode block - " 2663 "unable to read inode block - "
2664 "inode=%lu, block="E3FSBLK, 2664 "inode=%lu, block="E3FSBLK,
2665 inode->i_ino, block); 2665 inode->i_ino, block);
2666 return -EIO; 2666 return -EIO;
2667 } 2667 }
2668 if (!buffer_uptodate(bh)) { 2668 if (!buffer_uptodate(bh)) {
2669 lock_buffer(bh); 2669 lock_buffer(bh);
2670 2670
2671 /* 2671 /*
2672 * If the buffer has the write error flag, we have failed 2672 * If the buffer has the write error flag, we have failed
2673 * to write out another inode in the same block. In this 2673 * to write out another inode in the same block. In this
2674 * case, we don't have to read the block because we may 2674 * case, we don't have to read the block because we may
2675 * read the old inode data successfully. 2675 * read the old inode data successfully.
2676 */ 2676 */
2677 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 2677 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
2678 set_buffer_uptodate(bh); 2678 set_buffer_uptodate(bh);
2679 2679
2680 if (buffer_uptodate(bh)) { 2680 if (buffer_uptodate(bh)) {
2681 /* someone brought it uptodate while we waited */ 2681 /* someone brought it uptodate while we waited */
2682 unlock_buffer(bh); 2682 unlock_buffer(bh);
2683 goto has_buffer; 2683 goto has_buffer;
2684 } 2684 }
2685 2685
2686 /* 2686 /*
2687 * If we have all information of the inode in memory and this 2687 * If we have all information of the inode in memory and this
2688 * is the only valid inode in the block, we need not read the 2688 * is the only valid inode in the block, we need not read the
2689 * block. 2689 * block.
2690 */ 2690 */
2691 if (in_mem) { 2691 if (in_mem) {
2692 struct buffer_head *bitmap_bh; 2692 struct buffer_head *bitmap_bh;
2693 struct ext3_group_desc *desc; 2693 struct ext3_group_desc *desc;
2694 int inodes_per_buffer; 2694 int inodes_per_buffer;
2695 int inode_offset, i; 2695 int inode_offset, i;
2696 int block_group; 2696 int block_group;
2697 int start; 2697 int start;
2698 2698
2699 block_group = (inode->i_ino - 1) / 2699 block_group = (inode->i_ino - 1) /
2700 EXT3_INODES_PER_GROUP(inode->i_sb); 2700 EXT3_INODES_PER_GROUP(inode->i_sb);
2701 inodes_per_buffer = bh->b_size / 2701 inodes_per_buffer = bh->b_size /
2702 EXT3_INODE_SIZE(inode->i_sb); 2702 EXT3_INODE_SIZE(inode->i_sb);
2703 inode_offset = ((inode->i_ino - 1) % 2703 inode_offset = ((inode->i_ino - 1) %
2704 EXT3_INODES_PER_GROUP(inode->i_sb)); 2704 EXT3_INODES_PER_GROUP(inode->i_sb));
2705 start = inode_offset & ~(inodes_per_buffer - 1); 2705 start = inode_offset & ~(inodes_per_buffer - 1);
2706 2706
2707 /* Is the inode bitmap in cache? */ 2707 /* Is the inode bitmap in cache? */
2708 desc = ext3_get_group_desc(inode->i_sb, 2708 desc = ext3_get_group_desc(inode->i_sb,
2709 block_group, NULL); 2709 block_group, NULL);
2710 if (!desc) 2710 if (!desc)
2711 goto make_io; 2711 goto make_io;
2712 2712
2713 bitmap_bh = sb_getblk(inode->i_sb, 2713 bitmap_bh = sb_getblk(inode->i_sb,
2714 le32_to_cpu(desc->bg_inode_bitmap)); 2714 le32_to_cpu(desc->bg_inode_bitmap));
2715 if (!bitmap_bh) 2715 if (!bitmap_bh)
2716 goto make_io; 2716 goto make_io;
2717 2717
2718 /* 2718 /*
2719 * If the inode bitmap isn't in cache then the 2719 * If the inode bitmap isn't in cache then the
2720 * optimisation may end up performing two reads instead 2720 * optimisation may end up performing two reads instead
2721 * of one, so skip it. 2721 * of one, so skip it.
2722 */ 2722 */
2723 if (!buffer_uptodate(bitmap_bh)) { 2723 if (!buffer_uptodate(bitmap_bh)) {
2724 brelse(bitmap_bh); 2724 brelse(bitmap_bh);
2725 goto make_io; 2725 goto make_io;
2726 } 2726 }
2727 for (i = start; i < start + inodes_per_buffer; i++) { 2727 for (i = start; i < start + inodes_per_buffer; i++) {
2728 if (i == inode_offset) 2728 if (i == inode_offset)
2729 continue; 2729 continue;
2730 if (ext3_test_bit(i, bitmap_bh->b_data)) 2730 if (ext3_test_bit(i, bitmap_bh->b_data))
2731 break; 2731 break;
2732 } 2732 }
2733 brelse(bitmap_bh); 2733 brelse(bitmap_bh);
2734 if (i == start + inodes_per_buffer) { 2734 if (i == start + inodes_per_buffer) {
2735 /* all other inodes are free, so skip I/O */ 2735 /* all other inodes are free, so skip I/O */
2736 memset(bh->b_data, 0, bh->b_size); 2736 memset(bh->b_data, 0, bh->b_size);
2737 set_buffer_uptodate(bh); 2737 set_buffer_uptodate(bh);
2738 unlock_buffer(bh); 2738 unlock_buffer(bh);
2739 goto has_buffer; 2739 goto has_buffer;
2740 } 2740 }
2741 } 2741 }
2742 2742
2743 make_io: 2743 make_io:
2744 /* 2744 /*
2745 * There are other valid inodes in the buffer, this inode 2745 * There are other valid inodes in the buffer, this inode
2746 * has in-inode xattrs, or we don't have this inode in memory. 2746 * has in-inode xattrs, or we don't have this inode in memory.
2747 * Read the block from disk. 2747 * Read the block from disk.
2748 */ 2748 */
2749 get_bh(bh); 2749 get_bh(bh);
2750 bh->b_end_io = end_buffer_read_sync; 2750 bh->b_end_io = end_buffer_read_sync;
2751 submit_bh(READ_META, bh); 2751 submit_bh(READ_META, bh);
2752 wait_on_buffer(bh); 2752 wait_on_buffer(bh);
2753 if (!buffer_uptodate(bh)) { 2753 if (!buffer_uptodate(bh)) {
2754 ext3_error(inode->i_sb, "ext3_get_inode_loc", 2754 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2755 "unable to read inode block - " 2755 "unable to read inode block - "
2756 "inode=%lu, block="E3FSBLK, 2756 "inode=%lu, block="E3FSBLK,
2757 inode->i_ino, block); 2757 inode->i_ino, block);
2758 brelse(bh); 2758 brelse(bh);
2759 return -EIO; 2759 return -EIO;
2760 } 2760 }
2761 } 2761 }
2762 has_buffer: 2762 has_buffer:
2763 iloc->bh = bh; 2763 iloc->bh = bh;
2764 return 0; 2764 return 0;
2765 } 2765 }
2766 2766
2767 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) 2767 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2768 { 2768 {
2769 /* We have all inode data except xattrs in memory here. */ 2769 /* We have all inode data except xattrs in memory here. */
2770 return __ext3_get_inode_loc(inode, iloc, 2770 return __ext3_get_inode_loc(inode, iloc,
2771 !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); 2771 !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
2772 } 2772 }
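Because ext3_get_inode_loc() succeeds with an extra reference held on iloc.bh, every caller is obliged to drop it. A minimal sketch of the expected calling convention (caller context and any error handling beyond the lookup are elided):

	struct ext3_iloc iloc;
	int err;

	err = ext3_get_inode_loc(inode, &iloc);
	if (err)
		return err;
	/* ... inspect or modify the raw inode via ext3_raw_inode(&iloc) ... */
	brelse(iloc.bh);	/* drop the reference the lookup took for us */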
2773 2773
2774 void ext3_set_inode_flags(struct inode *inode) 2774 void ext3_set_inode_flags(struct inode *inode)
2775 { 2775 {
2776 unsigned int flags = EXT3_I(inode)->i_flags; 2776 unsigned int flags = EXT3_I(inode)->i_flags;
2777 2777
2778 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 2778 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2779 if (flags & EXT3_SYNC_FL) 2779 if (flags & EXT3_SYNC_FL)
2780 inode->i_flags |= S_SYNC; 2780 inode->i_flags |= S_SYNC;
2781 if (flags & EXT3_APPEND_FL) 2781 if (flags & EXT3_APPEND_FL)
2782 inode->i_flags |= S_APPEND; 2782 inode->i_flags |= S_APPEND;
2783 if (flags & EXT3_IMMUTABLE_FL) 2783 if (flags & EXT3_IMMUTABLE_FL)
2784 inode->i_flags |= S_IMMUTABLE; 2784 inode->i_flags |= S_IMMUTABLE;
2785 if (flags & EXT3_NOATIME_FL) 2785 if (flags & EXT3_NOATIME_FL)
2786 inode->i_flags |= S_NOATIME; 2786 inode->i_flags |= S_NOATIME;
2787 if (flags & EXT3_DIRSYNC_FL) 2787 if (flags & EXT3_DIRSYNC_FL)
2788 inode->i_flags |= S_DIRSYNC; 2788 inode->i_flags |= S_DIRSYNC;
2789 } 2789 }
2790 2790
2791 /* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ 2791 /* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
2792 void ext3_get_inode_flags(struct ext3_inode_info *ei) 2792 void ext3_get_inode_flags(struct ext3_inode_info *ei)
2793 { 2793 {
2794 unsigned int flags = ei->vfs_inode.i_flags; 2794 unsigned int flags = ei->vfs_inode.i_flags;
2795 2795
2796 ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| 2796 ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
2797 EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); 2797 EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
2798 if (flags & S_SYNC) 2798 if (flags & S_SYNC)
2799 ei->i_flags |= EXT3_SYNC_FL; 2799 ei->i_flags |= EXT3_SYNC_FL;
2800 if (flags & S_APPEND) 2800 if (flags & S_APPEND)
2801 ei->i_flags |= EXT3_APPEND_FL; 2801 ei->i_flags |= EXT3_APPEND_FL;
2802 if (flags & S_IMMUTABLE) 2802 if (flags & S_IMMUTABLE)
2803 ei->i_flags |= EXT3_IMMUTABLE_FL; 2803 ei->i_flags |= EXT3_IMMUTABLE_FL;
2804 if (flags & S_NOATIME) 2804 if (flags & S_NOATIME)
2805 ei->i_flags |= EXT3_NOATIME_FL; 2805 ei->i_flags |= EXT3_NOATIME_FL;
2806 if (flags & S_DIRSYNC) 2806 if (flags & S_DIRSYNC)
2807 ei->i_flags |= EXT3_DIRSYNC_FL; 2807 ei->i_flags |= EXT3_DIRSYNC_FL;
2808 } 2808 }
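The two translators above are hand-unrolled mirrors of one flag mapping. As a design note, the same idea can be written table-driven; a self-contained user-space sketch with illustrative flag values (not the real EXT3_*_FL/S_* constants):

#include <stdio.h>

struct flag_map { unsigned int fs_flag, vfs_flag; };

/* Stand-ins for pairs such as EXT3_SYNC_FL <-> S_SYNC above. */
static const struct flag_map map[] = {
	{ 0x01, 0x10 },
	{ 0x02, 0x20 },
	{ 0x04, 0x40 },
};

static unsigned int fs_to_vfs(unsigned int fs_flags)
{
	unsigned int vfs_flags = 0;
	size_t i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (fs_flags & map[i].fs_flag)
			vfs_flags |= map[i].vfs_flag;
	return vfs_flags;
}

int main(void)
{
	printf("%#x\n", fs_to_vfs(0x05));	/* prints 0x50 */
	return 0;
}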
2809 2809
2810 struct inode *ext3_iget(struct super_block *sb, unsigned long ino) 2810 struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2811 { 2811 {
2812 struct ext3_iloc iloc; 2812 struct ext3_iloc iloc;
2813 struct ext3_inode *raw_inode; 2813 struct ext3_inode *raw_inode;
2814 struct ext3_inode_info *ei; 2814 struct ext3_inode_info *ei;
2815 struct buffer_head *bh; 2815 struct buffer_head *bh;
2816 struct inode *inode; 2816 struct inode *inode;
2817 journal_t *journal = EXT3_SB(sb)->s_journal; 2817 journal_t *journal = EXT3_SB(sb)->s_journal;
2818 transaction_t *transaction; 2818 transaction_t *transaction;
2819 long ret; 2819 long ret;
2820 int block; 2820 int block;
2821 2821
2822 inode = iget_locked(sb, ino); 2822 inode = iget_locked(sb, ino);
2823 if (!inode) 2823 if (!inode)
2824 return ERR_PTR(-ENOMEM); 2824 return ERR_PTR(-ENOMEM);
2825 if (!(inode->i_state & I_NEW)) 2825 if (!(inode->i_state & I_NEW))
2826 return inode; 2826 return inode;
2827 2827
2828 ei = EXT3_I(inode); 2828 ei = EXT3_I(inode);
2829 ei->i_block_alloc_info = NULL; 2829 ei->i_block_alloc_info = NULL;
2830 2830
2831 ret = __ext3_get_inode_loc(inode, &iloc, 0); 2831 ret = __ext3_get_inode_loc(inode, &iloc, 0);
2832 if (ret < 0) 2832 if (ret < 0)
2833 goto bad_inode; 2833 goto bad_inode;
2834 bh = iloc.bh; 2834 bh = iloc.bh;
2835 raw_inode = ext3_raw_inode(&iloc); 2835 raw_inode = ext3_raw_inode(&iloc);
2836 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 2836 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2837 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 2837 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2838 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 2838 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2839 if(!(test_opt (inode->i_sb, NO_UID32))) { 2839 if(!(test_opt (inode->i_sb, NO_UID32))) {
2840 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 2840 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2841 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2841 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2842 } 2842 }
2843 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2843 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2844 inode->i_size = le32_to_cpu(raw_inode->i_size); 2844 inode->i_size = le32_to_cpu(raw_inode->i_size);
2845 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 2845 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
2846 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 2846 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
2847 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2847 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2848 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2848 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2849 2849
2850 ei->i_state_flags = 0; 2850 ei->i_state_flags = 0;
2851 ei->i_dir_start_lookup = 0; 2851 ei->i_dir_start_lookup = 0;
2852 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2852 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2853 /* We now have enough fields to check if the inode was active or not. 2853 /* We now have enough fields to check if the inode was active or not.
2854 * This is needed because nfsd might try to access dead inodes 2854 * This is needed because nfsd might try to access dead inodes
2855 * the test is the same one that e2fsck uses 2855 * the test is the same one that e2fsck uses
2856 * NeilBrown 1999oct15 2856 * NeilBrown 1999oct15
2857 */ 2857 */
2858 if (inode->i_nlink == 0) { 2858 if (inode->i_nlink == 0) {
2859 if (inode->i_mode == 0 || 2859 if (inode->i_mode == 0 ||
2860 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { 2860 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2861 /* this inode is deleted */ 2861 /* this inode is deleted */
2862 brelse (bh); 2862 brelse (bh);
2863 ret = -ESTALE; 2863 ret = -ESTALE;
2864 goto bad_inode; 2864 goto bad_inode;
2865 } 2865 }
2866 /* The only unlinked inodes we let through here have 2866 /* The only unlinked inodes we let through here have
2867 * valid i_mode and are being read by the orphan 2867 * valid i_mode and are being read by the orphan
2868 * recovery code: that's fine, we're about to complete 2868 * recovery code: that's fine, we're about to complete
2869 * the process of deleting those. */ 2869 * the process of deleting those. */
2870 } 2870 }
2871 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); 2871 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2872 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 2872 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2873 #ifdef EXT3_FRAGMENTS 2873 #ifdef EXT3_FRAGMENTS
2874 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); 2874 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2875 ei->i_frag_no = raw_inode->i_frag; 2875 ei->i_frag_no = raw_inode->i_frag;
2876 ei->i_frag_size = raw_inode->i_fsize; 2876 ei->i_frag_size = raw_inode->i_fsize;
2877 #endif 2877 #endif
2878 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 2878 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2879 if (!S_ISREG(inode->i_mode)) { 2879 if (!S_ISREG(inode->i_mode)) {
2880 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); 2880 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2881 } else { 2881 } else {
2882 inode->i_size |= 2882 inode->i_size |=
2883 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; 2883 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2884 } 2884 }
2885 ei->i_disksize = inode->i_size; 2885 ei->i_disksize = inode->i_size;
2886 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 2886 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2887 ei->i_block_group = iloc.block_group; 2887 ei->i_block_group = iloc.block_group;
2888 /* 2888 /*
2889 * NOTE! The in-memory inode i_data array is in little-endian order 2889 * NOTE! The in-memory inode i_data array is in little-endian order
2890 * even on big-endian machines: we do NOT byteswap the block numbers! 2890 * even on big-endian machines: we do NOT byteswap the block numbers!
2891 */ 2891 */
2892 for (block = 0; block < EXT3_N_BLOCKS; block++) 2892 for (block = 0; block < EXT3_N_BLOCKS; block++)
2893 ei->i_data[block] = raw_inode->i_block[block]; 2893 ei->i_data[block] = raw_inode->i_block[block];
2894 INIT_LIST_HEAD(&ei->i_orphan); 2894 INIT_LIST_HEAD(&ei->i_orphan);
2895 2895
2896 /* 2896 /*
2897 * Set transaction id's of transactions that have to be committed 2897 * Set transaction id's of transactions that have to be committed
2898 * to finish f[data]sync. We set them to the currently running transaction 2898 * to finish f[data]sync. We set them to the currently running transaction
2899 * as we cannot be sure that the inode or some of its metadata isn't 2899 * as we cannot be sure that the inode or some of its metadata isn't
2900 * part of the transaction - the inode could have been reclaimed and 2900 * part of the transaction - the inode could have been reclaimed and
2901 * now it is reread from disk. 2901 * now it is reread from disk.
2902 */ 2902 */
2903 if (journal) { 2903 if (journal) {
2904 tid_t tid; 2904 tid_t tid;
2905 2905
2906 spin_lock(&journal->j_state_lock); 2906 spin_lock(&journal->j_state_lock);
2907 if (journal->j_running_transaction) 2907 if (journal->j_running_transaction)
2908 transaction = journal->j_running_transaction; 2908 transaction = journal->j_running_transaction;
2909 else 2909 else
2910 transaction = journal->j_committing_transaction; 2910 transaction = journal->j_committing_transaction;
2911 if (transaction) 2911 if (transaction)
2912 tid = transaction->t_tid; 2912 tid = transaction->t_tid;
2913 else 2913 else
2914 tid = journal->j_commit_sequence; 2914 tid = journal->j_commit_sequence;
2915 spin_unlock(&journal->j_state_lock); 2915 spin_unlock(&journal->j_state_lock);
2916 atomic_set(&ei->i_sync_tid, tid); 2916 atomic_set(&ei->i_sync_tid, tid);
2917 atomic_set(&ei->i_datasync_tid, tid); 2917 atomic_set(&ei->i_datasync_tid, tid);
2918 } 2918 }
2919 2919
2920 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && 2920 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2921 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { 2921 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2922 /* 2922 /*
2923 * When mke2fs creates big inodes it does not zero out 2923 * When mke2fs creates big inodes it does not zero out
2924 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, 2924 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2925 * so ignore those first few inodes. 2925 * so ignore those first few inodes.
2926 */ 2926 */
2927 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 2927 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2928 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 2928 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2929 EXT3_INODE_SIZE(inode->i_sb)) { 2929 EXT3_INODE_SIZE(inode->i_sb)) {
2930 brelse (bh); 2930 brelse (bh);
2931 ret = -EIO; 2931 ret = -EIO;
2932 goto bad_inode; 2932 goto bad_inode;
2933 } 2933 }
2934 if (ei->i_extra_isize == 0) { 2934 if (ei->i_extra_isize == 0) {
2935 /* The extra space is currently unused. Use it. */ 2935 /* The extra space is currently unused. Use it. */
2936 ei->i_extra_isize = sizeof(struct ext3_inode) - 2936 ei->i_extra_isize = sizeof(struct ext3_inode) -
2937 EXT3_GOOD_OLD_INODE_SIZE; 2937 EXT3_GOOD_OLD_INODE_SIZE;
2938 } else { 2938 } else {
2939 __le32 *magic = (void *)raw_inode + 2939 __le32 *magic = (void *)raw_inode +
2940 EXT3_GOOD_OLD_INODE_SIZE + 2940 EXT3_GOOD_OLD_INODE_SIZE +
2941 ei->i_extra_isize; 2941 ei->i_extra_isize;
2942 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) 2942 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2943 ext3_set_inode_state(inode, EXT3_STATE_XATTR); 2943 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
2944 } 2944 }
2945 } else 2945 } else
2946 ei->i_extra_isize = 0; 2946 ei->i_extra_isize = 0;
2947 2947
2948 if (S_ISREG(inode->i_mode)) { 2948 if (S_ISREG(inode->i_mode)) {
2949 inode->i_op = &ext3_file_inode_operations; 2949 inode->i_op = &ext3_file_inode_operations;
2950 inode->i_fop = &ext3_file_operations; 2950 inode->i_fop = &ext3_file_operations;
2951 ext3_set_aops(inode); 2951 ext3_set_aops(inode);
2952 } else if (S_ISDIR(inode->i_mode)) { 2952 } else if (S_ISDIR(inode->i_mode)) {
2953 inode->i_op = &ext3_dir_inode_operations; 2953 inode->i_op = &ext3_dir_inode_operations;
2954 inode->i_fop = &ext3_dir_operations; 2954 inode->i_fop = &ext3_dir_operations;
2955 } else if (S_ISLNK(inode->i_mode)) { 2955 } else if (S_ISLNK(inode->i_mode)) {
2956 if (ext3_inode_is_fast_symlink(inode)) { 2956 if (ext3_inode_is_fast_symlink(inode)) {
2957 inode->i_op = &ext3_fast_symlink_inode_operations; 2957 inode->i_op = &ext3_fast_symlink_inode_operations;
2958 nd_terminate_link(ei->i_data, inode->i_size, 2958 nd_terminate_link(ei->i_data, inode->i_size,
2959 sizeof(ei->i_data) - 1); 2959 sizeof(ei->i_data) - 1);
2960 } else { 2960 } else {
2961 inode->i_op = &ext3_symlink_inode_operations; 2961 inode->i_op = &ext3_symlink_inode_operations;
2962 ext3_set_aops(inode); 2962 ext3_set_aops(inode);
2963 } 2963 }
2964 } else { 2964 } else {
2965 inode->i_op = &ext3_special_inode_operations; 2965 inode->i_op = &ext3_special_inode_operations;
2966 if (raw_inode->i_block[0]) 2966 if (raw_inode->i_block[0])
2967 init_special_inode(inode, inode->i_mode, 2967 init_special_inode(inode, inode->i_mode,
2968 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 2968 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2969 else 2969 else
2970 init_special_inode(inode, inode->i_mode, 2970 init_special_inode(inode, inode->i_mode,
2971 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 2971 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2972 } 2972 }
2973 brelse (iloc.bh); 2973 brelse (iloc.bh);
2974 ext3_set_inode_flags(inode); 2974 ext3_set_inode_flags(inode);
2975 unlock_new_inode(inode); 2975 unlock_new_inode(inode);
2976 return inode; 2976 return inode;
2977 2977
2978 bad_inode: 2978 bad_inode:
2979 iget_failed(inode); 2979 iget_failed(inode);
2980 return ERR_PTR(ret); 2980 return ERR_PTR(ret);
2981 } 2981 }
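ext3_iget() above follows the standard VFS contract for loading an inode: iget_locked() either returns a cached inode or a locked I_NEW one that the filesystem must fill and then publish. Condensed, the shape every such loader takes is (a sketch mirroring the function body, not a separate API):

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit: already initialised */

	/* ... read the raw on-disk inode and fill the in-core fields ... */
	if (ret < 0) {
		iget_failed(inode);	/* unlock and discard the half-built inode */
		return ERR_PTR(ret);
	}
	unlock_new_inode(inode);	/* clear I_NEW and wake any waiters */
	return inode;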
2982 2982
2983 /* 2983 /*
2984 * Post the struct inode info into an on-disk inode location in the 2984 * Post the struct inode info into an on-disk inode location in the
2985 * buffer-cache. This gobbles the caller's reference to the 2985 * buffer-cache. This gobbles the caller's reference to the
2986 * buffer_head in the inode location struct. 2986 * buffer_head in the inode location struct.
2987 * 2987 *
2988 * The caller must have write access to iloc->bh. 2988 * The caller must have write access to iloc->bh.
2989 */ 2989 */
2990 static int ext3_do_update_inode(handle_t *handle, 2990 static int ext3_do_update_inode(handle_t *handle,
2991 struct inode *inode, 2991 struct inode *inode,
2992 struct ext3_iloc *iloc) 2992 struct ext3_iloc *iloc)
2993 { 2993 {
2994 struct ext3_inode *raw_inode = ext3_raw_inode(iloc); 2994 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2995 struct ext3_inode_info *ei = EXT3_I(inode); 2995 struct ext3_inode_info *ei = EXT3_I(inode);
2996 struct buffer_head *bh = iloc->bh; 2996 struct buffer_head *bh = iloc->bh;
2997 int err = 0, rc, block; 2997 int err = 0, rc, block;
2998 2998
2999 again: 2999 again:
3000 /* we can't allow multiple procs in here at once; it's a bit racy */ 3000 /* we can't allow multiple procs in here at once; it's a bit racy */
3001 lock_buffer(bh); 3001 lock_buffer(bh);
3002 3002
3003 /* For fields not tracked in the in-memory inode, 3003 /* For fields not tracked in the in-memory inode,
3004 * initialise them to zero for new inodes. */ 3004 * initialise them to zero for new inodes. */
3005 if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) 3005 if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
3006 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 3006 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
3007 3007
3008 ext3_get_inode_flags(ei); 3008 ext3_get_inode_flags(ei);
3009 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 3009 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
3010 if(!(test_opt(inode->i_sb, NO_UID32))) { 3010 if(!(test_opt(inode->i_sb, NO_UID32))) {
3011 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 3011 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
3012 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 3012 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
3013 /* 3013 /*
3014 * Fix up interoperability with old kernels. Otherwise, old inodes get 3014 * Fix up interoperability with old kernels. Otherwise, old inodes get
3015 * re-used with the upper 16 bits of the uid/gid intact 3015 * re-used with the upper 16 bits of the uid/gid intact
3016 */ 3016 */
3017 if(!ei->i_dtime) { 3017 if(!ei->i_dtime) {
3018 raw_inode->i_uid_high = 3018 raw_inode->i_uid_high =
3019 cpu_to_le16(high_16_bits(inode->i_uid)); 3019 cpu_to_le16(high_16_bits(inode->i_uid));
3020 raw_inode->i_gid_high = 3020 raw_inode->i_gid_high =
3021 cpu_to_le16(high_16_bits(inode->i_gid)); 3021 cpu_to_le16(high_16_bits(inode->i_gid));
3022 } else { 3022 } else {
3023 raw_inode->i_uid_high = 0; 3023 raw_inode->i_uid_high = 0;
3024 raw_inode->i_gid_high = 0; 3024 raw_inode->i_gid_high = 0;
3025 } 3025 }
3026 } else { 3026 } else {
3027 raw_inode->i_uid_low = 3027 raw_inode->i_uid_low =
3028 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 3028 cpu_to_le16(fs_high2lowuid(inode->i_uid));
3029 raw_inode->i_gid_low = 3029 raw_inode->i_gid_low =
3030 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 3030 cpu_to_le16(fs_high2lowgid(inode->i_gid));
3031 raw_inode->i_uid_high = 0; 3031 raw_inode->i_uid_high = 0;
3032 raw_inode->i_gid_high = 0; 3032 raw_inode->i_gid_high = 0;
3033 } 3033 }
3034 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 3034 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3035 raw_inode->i_size = cpu_to_le32(ei->i_disksize); 3035 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
3036 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 3036 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
3037 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 3037 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
3038 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 3038 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
3039 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 3039 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
3040 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 3040 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
3041 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 3041 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
3042 #ifdef EXT3_FRAGMENTS 3042 #ifdef EXT3_FRAGMENTS
3043 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); 3043 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
3044 raw_inode->i_frag = ei->i_frag_no; 3044 raw_inode->i_frag = ei->i_frag_no;
3045 raw_inode->i_fsize = ei->i_frag_size; 3045 raw_inode->i_fsize = ei->i_frag_size;
3046 #endif 3046 #endif
3047 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); 3047 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
3048 if (!S_ISREG(inode->i_mode)) { 3048 if (!S_ISREG(inode->i_mode)) {
3049 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 3049 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
3050 } else { 3050 } else {
3051 raw_inode->i_size_high = 3051 raw_inode->i_size_high =
3052 cpu_to_le32(ei->i_disksize >> 32); 3052 cpu_to_le32(ei->i_disksize >> 32);
3053 if (ei->i_disksize > 0x7fffffffULL) { 3053 if (ei->i_disksize > 0x7fffffffULL) {
3054 struct super_block *sb = inode->i_sb; 3054 struct super_block *sb = inode->i_sb;
3055 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, 3055 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
3056 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || 3056 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
3057 EXT3_SB(sb)->s_es->s_rev_level == 3057 EXT3_SB(sb)->s_es->s_rev_level ==
3058 cpu_to_le32(EXT3_GOOD_OLD_REV)) { 3058 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
3059 /* If this is the first large file 3059 /* If this is the first large file
3060 * created, add a flag to the superblock. 3060 * created, add a flag to the superblock.
3061 */ 3061 */
3062 unlock_buffer(bh); 3062 unlock_buffer(bh);
3063 err = ext3_journal_get_write_access(handle, 3063 err = ext3_journal_get_write_access(handle,
3064 EXT3_SB(sb)->s_sbh); 3064 EXT3_SB(sb)->s_sbh);
3065 if (err) 3065 if (err)
3066 goto out_brelse; 3066 goto out_brelse;
3067 3067
3068 ext3_update_dynamic_rev(sb); 3068 ext3_update_dynamic_rev(sb);
3069 EXT3_SET_RO_COMPAT_FEATURE(sb, 3069 EXT3_SET_RO_COMPAT_FEATURE(sb,
3070 EXT3_FEATURE_RO_COMPAT_LARGE_FILE); 3070 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
3071 handle->h_sync = 1; 3071 handle->h_sync = 1;
3072 err = ext3_journal_dirty_metadata(handle, 3072 err = ext3_journal_dirty_metadata(handle,
3073 EXT3_SB(sb)->s_sbh); 3073 EXT3_SB(sb)->s_sbh);
3074 /* get our lock and start over */ 3074 /* get our lock and start over */
3075 goto again; 3075 goto again;
3076 } 3076 }
3077 } 3077 }
3078 } 3078 }
3079 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 3079 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
3080 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 3080 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
3081 if (old_valid_dev(inode->i_rdev)) { 3081 if (old_valid_dev(inode->i_rdev)) {
3082 raw_inode->i_block[0] = 3082 raw_inode->i_block[0] =
3083 cpu_to_le32(old_encode_dev(inode->i_rdev)); 3083 cpu_to_le32(old_encode_dev(inode->i_rdev));
3084 raw_inode->i_block[1] = 0; 3084 raw_inode->i_block[1] = 0;
3085 } else { 3085 } else {
3086 raw_inode->i_block[0] = 0; 3086 raw_inode->i_block[0] = 0;
3087 raw_inode->i_block[1] = 3087 raw_inode->i_block[1] =
3088 cpu_to_le32(new_encode_dev(inode->i_rdev)); 3088 cpu_to_le32(new_encode_dev(inode->i_rdev));
3089 raw_inode->i_block[2] = 0; 3089 raw_inode->i_block[2] = 0;
3090 } 3090 }
3091 } else for (block = 0; block < EXT3_N_BLOCKS; block++) 3091 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
3092 raw_inode->i_block[block] = ei->i_data[block]; 3092 raw_inode->i_block[block] = ei->i_data[block];
3093 3093
3094 if (ei->i_extra_isize) 3094 if (ei->i_extra_isize)
3095 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 3095 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
3096 3096
3097 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 3097 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
3098 unlock_buffer(bh); 3098 unlock_buffer(bh);
3099 rc = ext3_journal_dirty_metadata(handle, bh); 3099 rc = ext3_journal_dirty_metadata(handle, bh);
3100 if (!err) 3100 if (!err)
3101 err = rc; 3101 err = rc;
3102 ext3_clear_inode_state(inode, EXT3_STATE_NEW); 3102 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3103 3103
3104 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3104 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3105 out_brelse: 3105 out_brelse:
3106 brelse (bh); 3106 brelse (bh);
3107 ext3_std_error(inode->i_sb, err); 3107 ext3_std_error(inode->i_sb, err);
3108 return err; 3108 return err;
3109 } 3109 }
3110 3110
3111 /* 3111 /*
3112 * ext3_write_inode() 3112 * ext3_write_inode()
3113 * 3113 *
3114 * We are called from a few places: 3114 * We are called from a few places:
3115 * 3115 *
3116 * - Within generic_file_write() for O_SYNC files. 3116 * - Within generic_file_write() for O_SYNC files.
3117 * Here, there will be no transaction running. We wait for any running 3117 * Here, there will be no transaction running. We wait for any running
3118 * transaction to commit. 3118 * transaction to commit.
3119 * 3119 *
3120 * - Within sys_sync(), kupdate and such. 3120 * - Within sys_sync(), kupdate and such.
3121 * We wait on commit, if told to. 3121 * We wait on commit, if told to.
3122 * 3122 *
3123 * - Within prune_icache() (PF_MEMALLOC == true) 3123 * - Within prune_icache() (PF_MEMALLOC == true)
3124 * Here we simply return. We can't afford to block kswapd on the 3124 * Here we simply return. We can't afford to block kswapd on the
3125 * journal commit. 3125 * journal commit.
3126 * 3126 *
3127 * In all cases it is actually safe for us to return without doing anything, 3127 * In all cases it is actually safe for us to return without doing anything,
3128 * because the inode has been copied into a raw inode buffer in 3128 * because the inode has been copied into a raw inode buffer in
3129 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 3129 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
3130 * knfsd. 3130 * knfsd.
3131 * 3131 *
3132 * Note that we are absolutely dependent upon all inode dirtiers doing the 3132 * Note that we are absolutely dependent upon all inode dirtiers doing the
3133 * right thing: they *must* call mark_inode_dirty() after dirtying info in 3133 * right thing: they *must* call mark_inode_dirty() after dirtying info in
3134 * which we are interested. 3134 * which we are interested.
3135 * 3135 *
3136 * It would be a bug for them to not do this. The code: 3136 * It would be a bug for them to not do this. The code:
3137 * 3137 *
3138 * mark_inode_dirty(inode) 3138 * mark_inode_dirty(inode)
3139 * stuff(); 3139 * stuff();
3140 * inode->i_size = expr; 3140 * inode->i_size = expr;
3141 * 3141 *
3142 * is in error because a kswapd-driven write_inode() could occur while 3142 * is in error because a kswapd-driven write_inode() could occur while
3143 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3143 * `stuff()' is running, and the new i_size will be lost. Plus the inode
3144 * will no longer be on the superblock's dirty inode list. 3144 * will no longer be on the superblock's dirty inode list.
3145 */ 3145 */
3146 int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) 3146 int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3147 { 3147 {
3148 if (current->flags & PF_MEMALLOC) 3148 if (current->flags & PF_MEMALLOC)
3149 return 0; 3149 return 0;
3150 3150
3151 if (ext3_journal_current_handle()) { 3151 if (ext3_journal_current_handle()) {
3152 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 3152 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
3153 dump_stack(); 3153 dump_stack();
3154 return -EIO; 3154 return -EIO;
3155 } 3155 }
3156 3156
3157 if (wbc->sync_mode != WB_SYNC_ALL) 3157 if (wbc->sync_mode != WB_SYNC_ALL)
3158 return 0; 3158 return 0;
3159 3159
3160 return ext3_force_commit(inode->i_sb); 3160 return ext3_force_commit(inode->i_sb);
3161 } 3161 }
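For contrast with the buggy ordering shown in the comment above, a sketch of the ordering dirtiers must use, so that a concurrent kswapd-driven write_inode() can never snapshot a stale i_size:

	stuff();			/* finish all in-core updates first ... */
	inode->i_size = expr;
	mark_inode_dirty(inode);	/* ... and mark the inode dirty last */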
3162 3162
3163 /* 3163 /*
3164 * ext3_setattr() 3164 * ext3_setattr()
3165 * 3165 *
3166 * Called from notify_change. 3166 * Called from notify_change.
3167 * 3167 *
3168 * We want to trap VFS attempts to truncate the file as soon as 3168 * We want to trap VFS attempts to truncate the file as soon as
3169 * possible. In particular, we want to make sure that when the VFS 3169 * possible. In particular, we want to make sure that when the VFS
3170 * shrinks i_size, we put the inode on the orphan list and modify 3170 * shrinks i_size, we put the inode on the orphan list and modify
3171 * i_disksize immediately, so that during the subsequent flushing of 3171 * i_disksize immediately, so that during the subsequent flushing of
3172 * dirty pages and freeing of disk blocks, we can guarantee that any 3172 * dirty pages and freeing of disk blocks, we can guarantee that any
3173 * commit will leave the blocks being flushed in an unused state on 3173 * commit will leave the blocks being flushed in an unused state on
3174 * disk. (On recovery, the inode will get truncated and the blocks will 3174 * disk. (On recovery, the inode will get truncated and the blocks will
3175 * be freed, so we have a strong guarantee that no future commit will 3175 * be freed, so we have a strong guarantee that no future commit will
3176 * leave these blocks visible to the user.) 3176 * leave these blocks visible to the user.)
3177 * 3177 *
3178 * Called with inode->sem down. 3178 * Called with inode->sem down.
3179 */ 3179 */
3180 int ext3_setattr(struct dentry *dentry, struct iattr *attr) 3180 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3181 { 3181 {
3182 struct inode *inode = dentry->d_inode; 3182 struct inode *inode = dentry->d_inode;
3183 int error, rc = 0; 3183 int error, rc = 0;
3184 const unsigned int ia_valid = attr->ia_valid; 3184 const unsigned int ia_valid = attr->ia_valid;
3185 3185
3186 error = inode_change_ok(inode, attr); 3186 error = inode_change_ok(inode, attr);
3187 if (error) 3187 if (error)
3188 return error; 3188 return error;
3189 3189
3190 if (is_quota_modification(inode, attr)) 3190 if (is_quota_modification(inode, attr))
3191 dquot_initialize(inode); 3191 dquot_initialize(inode);
3192 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3192 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3193 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3193 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3194 handle_t *handle; 3194 handle_t *handle;
3195 3195
3196 /* (user+group)*(old+new) structure, inode write (sb, 3196 /* (user+group)*(old+new) structure, inode write (sb,
3197 * inode block, ? - but truncate inode update has it) */ 3197 * inode block, ? - but truncate inode update has it) */
3198 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 3198 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
3199 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); 3199 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
3200 if (IS_ERR(handle)) { 3200 if (IS_ERR(handle)) {
3201 error = PTR_ERR(handle); 3201 error = PTR_ERR(handle);
3202 goto err_out; 3202 goto err_out;
3203 } 3203 }
3204 error = dquot_transfer(inode, attr); 3204 error = dquot_transfer(inode, attr);
3205 if (error) { 3205 if (error) {
3206 ext3_journal_stop(handle); 3206 ext3_journal_stop(handle);
3207 return error; 3207 return error;
3208 } 3208 }
3209 /* Update corresponding info in inode so that everything is in 3209 /* Update corresponding info in inode so that everything is in
3210 * one transaction */ 3210 * one transaction */
3211 if (attr->ia_valid & ATTR_UID) 3211 if (attr->ia_valid & ATTR_UID)
3212 inode->i_uid = attr->ia_uid; 3212 inode->i_uid = attr->ia_uid;
3213 if (attr->ia_valid & ATTR_GID) 3213 if (attr->ia_valid & ATTR_GID)
3214 inode->i_gid = attr->ia_gid; 3214 inode->i_gid = attr->ia_gid;
3215 error = ext3_mark_inode_dirty(handle, inode); 3215 error = ext3_mark_inode_dirty(handle, inode);
3216 ext3_journal_stop(handle); 3216 ext3_journal_stop(handle);
3217 } 3217 }
3218 3218
3219 if (attr->ia_valid & ATTR_SIZE)
3220 inode_dio_wait(inode);
3221
3219 if (S_ISREG(inode->i_mode) && 3222 if (S_ISREG(inode->i_mode) &&
3220 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 3223 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
3221 handle_t *handle; 3224 handle_t *handle;
3222 3225
3223 handle = ext3_journal_start(inode, 3); 3226 handle = ext3_journal_start(inode, 3);
3224 if (IS_ERR(handle)) { 3227 if (IS_ERR(handle)) {
3225 error = PTR_ERR(handle); 3228 error = PTR_ERR(handle);
3226 goto err_out; 3229 goto err_out;
3227 } 3230 }
3228 3231
3229 error = ext3_orphan_add(handle, inode); 3232 error = ext3_orphan_add(handle, inode);
3230 EXT3_I(inode)->i_disksize = attr->ia_size; 3233 EXT3_I(inode)->i_disksize = attr->ia_size;
3231 rc = ext3_mark_inode_dirty(handle, inode); 3234 rc = ext3_mark_inode_dirty(handle, inode);
3232 if (!error) 3235 if (!error)
3233 error = rc; 3236 error = rc;
3234 ext3_journal_stop(handle); 3237 ext3_journal_stop(handle);
3235 } 3238 }
3236 3239
3237 if ((attr->ia_valid & ATTR_SIZE) && 3240 if ((attr->ia_valid & ATTR_SIZE) &&
3238 attr->ia_size != i_size_read(inode)) { 3241 attr->ia_size != i_size_read(inode)) {
3239 rc = vmtruncate(inode, attr->ia_size); 3242 rc = vmtruncate(inode, attr->ia_size);
3240 if (rc) 3243 if (rc)
3241 goto err_out; 3244 goto err_out;
3242 } 3245 }
3243 3246
3244 setattr_copy(inode, attr); 3247 setattr_copy(inode, attr);
3245 mark_inode_dirty(inode); 3248 mark_inode_dirty(inode);
3246 3249
3247 if (ia_valid & ATTR_MODE) 3250 if (ia_valid & ATTR_MODE)
3248 rc = ext3_acl_chmod(inode); 3251 rc = ext3_acl_chmod(inode);
3249 3252
3250 err_out: 3253 err_out:
3251 ext3_std_error(inode->i_sb, error); 3254 ext3_std_error(inode->i_sb, error);
3252 if (!error) 3255 if (!error)
3253 error = rc; 3256 error = rc;
3254 return error; 3257 return error;
3255 } 3258 }
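The two lines added above (new right-hand column only) are the point of this commit: the direct I/O drain now happens inside ->setattr rather than in the caller, so a filesystem can take its own locks first to keep new dio references from appearing. A hedged sketch of the resulting pattern for any filesystem's setattr (example_setattr and the lock placement comment are illustrative, not kernel API):

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		/* Filesystem-specific locks that prevent new direct I/O
		 * from starting may be taken here, before the drain. */
		inode_dio_wait(inode);	/* wait out in-flight direct I/O */
	}

	/* ... perform the size change, then setattr_copy() and
	 * mark_inode_dirty(), as ext3_setattr() does above ... */
	return 0;
}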
3256 3259
3257 3260
3258 /* 3261 /*
3259 * How many blocks doth make a writepage()? 3262 * How many blocks doth make a writepage()?
3260 * 3263 *
3261 * With N blocks per page, it may be: 3264 * With N blocks per page, it may be:
3262 * N data blocks 3265 * N data blocks
3263 * 2 indirect block 3266 * 2 indirect block
3264 * 2 dindirect 3267 * 2 dindirect
3265 * 1 tindirect 3268 * 1 tindirect
3266 * N+5 bitmap blocks (from the above) 3269 * N+5 bitmap blocks (from the above)
3267 * N+5 group descriptor summary blocks 3270 * N+5 group descriptor summary blocks
3268 * 1 inode block 3271 * 1 inode block
3269 * 1 superblock. 3272 * 1 superblock.
3270 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files 3273 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
3271 * 3274 *
3272 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS 3275 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
3273 * 3276 *
3274 * With ordered or writeback data it's the same, less the N data blocks. 3277 * With ordered or writeback data it's the same, less the N data blocks.
3275 * 3278 *
3276 * If the inode's direct blocks can hold an integral number of pages then a 3279 * If the inode's direct blocks can hold an integral number of pages then a
3277 * page cannot straddle two indirect blocks, and we can only touch one indirect 3280 * page cannot straddle two indirect blocks, and we can only touch one indirect
3278 * and dindirect block, and the "5" above becomes "3". 3281 * and dindirect block, and the "5" above becomes "3".
3279 * 3282 *
3280 * This still overestimates under most circumstances. If we were to pass the 3283 * This still overestimates under most circumstances. If we were to pass the
3281 * start and end offsets in here as well we could do block_to_path() on each 3284 * start and end offsets in here as well we could do block_to_path() on each
3282 * block and work out the exact number of indirects which are touched. Pah. 3285 * block and work out the exact number of indirects which are touched. Pah.
3283 */ 3286 */
3284 3287
3285 static int ext3_writepage_trans_blocks(struct inode *inode) 3288 static int ext3_writepage_trans_blocks(struct inode *inode)
3286 { 3289 {
3287 int bpp = ext3_journal_blocks_per_page(inode); 3290 int bpp = ext3_journal_blocks_per_page(inode);
3288 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; 3291 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3289 int ret; 3292 int ret;
3290 3293
3291 if (ext3_should_journal_data(inode)) 3294 if (ext3_should_journal_data(inode))
3292 ret = 3 * (bpp + indirects) + 2; 3295 ret = 3 * (bpp + indirects) + 2;
3293 else 3296 else
3294 ret = 2 * (bpp + indirects) + indirects + 2; 3297 ret = 2 * (bpp + indirects) + indirects + 2;
3295 3298
3296 #ifdef CONFIG_QUOTA 3299 #ifdef CONFIG_QUOTA
3297 /* We know that structure was already allocated during dquot_initialize so 3300 /* We know that structure was already allocated during dquot_initialize so
3298 * we will be updating only the data blocks + inodes */ 3301 * we will be updating only the data blocks + inodes */
3299 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 3302 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
3300 #endif 3303 #endif
3301 3304
3302 return ret; 3305 return ret;
3303 } 3306 }
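To make the estimate above concrete: with 4096-byte pages and 1024-byte blocks, bpp = 4; EXT3_NDIR_BLOCKS (12) is divisible by 4, so indirects = 3. Data journaling then reserves 3 * (4 + 3) + 2 = 23 blocks per page, while ordered or writeback mode reserves 2 * (4 + 3) + 3 + 2 = 19, plus the EXT3_MAXQUOTAS_TRANS_BLOCKS allowance when CONFIG_QUOTA is enabled.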
3304 3307
3305 /* 3308 /*
3306 * The caller must have previously called ext3_reserve_inode_write(). 3309 * The caller must have previously called ext3_reserve_inode_write().
3307 * Given this, we know that the caller already has write access to iloc->bh. 3310 * Given this, we know that the caller already has write access to iloc->bh.
3308 */ 3311 */
3309 int ext3_mark_iloc_dirty(handle_t *handle, 3312 int ext3_mark_iloc_dirty(handle_t *handle,
3310 struct inode *inode, struct ext3_iloc *iloc) 3313 struct inode *inode, struct ext3_iloc *iloc)
3311 { 3314 {
3312 int err = 0; 3315 int err = 0;
3313 3316
3314 /* the do_update_inode consumes one bh->b_count */ 3317 /* the do_update_inode consumes one bh->b_count */
3315 get_bh(iloc->bh); 3318 get_bh(iloc->bh);
3316 3319
3317 /* ext3_do_update_inode() does journal_dirty_metadata */ 3320 /* ext3_do_update_inode() does journal_dirty_metadata */
3318 err = ext3_do_update_inode(handle, inode, iloc); 3321 err = ext3_do_update_inode(handle, inode, iloc);
3319 put_bh(iloc->bh); 3322 put_bh(iloc->bh);
3320 return err; 3323 return err;
3321 } 3324 }
3322 3325
3323 /* 3326 /*
3324 * On success, we end up with an outstanding reference count against 3327 * On success, we end up with an outstanding reference count against
3325 * iloc->bh. This _must_ be cleaned up later. 3328 * iloc->bh. This _must_ be cleaned up later.
3326 */ 3329 */
3327 3330
3328 int 3331 int
3329 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 3332 ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3330 struct ext3_iloc *iloc) 3333 struct ext3_iloc *iloc)
3331 { 3334 {
3332 int err = 0; 3335 int err = 0;
3333 if (handle) { 3336 if (handle) {
3334 err = ext3_get_inode_loc(inode, iloc); 3337 err = ext3_get_inode_loc(inode, iloc);
3335 if (!err) { 3338 if (!err) {
3336 BUFFER_TRACE(iloc->bh, "get_write_access"); 3339 BUFFER_TRACE(iloc->bh, "get_write_access");
3337 err = ext3_journal_get_write_access(handle, iloc->bh); 3340 err = ext3_journal_get_write_access(handle, iloc->bh);
3338 if (err) { 3341 if (err) {
3339 brelse(iloc->bh); 3342 brelse(iloc->bh);
3340 iloc->bh = NULL; 3343 iloc->bh = NULL;
3341 } 3344 }
3342 } 3345 }
3343 } 3346 }
3344 ext3_std_error(inode->i_sb, err); 3347 ext3_std_error(inode->i_sb, err);
3345 return err; 3348 return err;
3346 } 3349 }
3347 3350
3348 /* 3351 /*
3349 * What we do here is to mark the in-core inode as clean with respect to inode 3352 * What we do here is to mark the in-core inode as clean with respect to inode
3350 * dirtiness (it may still be data-dirty). 3353 * dirtiness (it may still be data-dirty).
3351 * This means that the in-core inode may be reaped by prune_icache 3354 * This means that the in-core inode may be reaped by prune_icache
3352 * without having to perform any I/O. This is a very good thing, 3355 * without having to perform any I/O. This is a very good thing,
3353 * because *any* task may call prune_icache - even ones which 3356 * because *any* task may call prune_icache - even ones which
3354 * have a transaction open against a different journal. 3357 * have a transaction open against a different journal.
3355 * 3358 *
3356 * Is this cheating? Not really. Sure, we haven't written the 3359 * Is this cheating? Not really. Sure, we haven't written the
3357 * inode out, but prune_icache isn't a user-visible syncing function. 3360 * inode out, but prune_icache isn't a user-visible syncing function.
3358 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 3361 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3359 * we start and wait on commits. 3362 * we start and wait on commits.
3360 * 3363 *
3361 * Is this efficient/effective? Well, we're being nice to the system 3364 * Is this efficient/effective? Well, we're being nice to the system
3362 * by cleaning up our inodes proactively so they can be reaped 3365 * by cleaning up our inodes proactively so they can be reaped
3363 * without I/O. But we are potentially leaving up to five seconds' 3366 * without I/O. But we are potentially leaving up to five seconds'
3364 * worth of inodes floating about which prune_icache wants us to 3367 * worth of inodes floating about which prune_icache wants us to
3365 * write out. One way to fix that would be to get prune_icache() 3368 * write out. One way to fix that would be to get prune_icache()
3366 * to do a write_super() to free up some memory. It has the desired 3369 * to do a write_super() to free up some memory. It has the desired
3367 * effect. 3370 * effect.
3368 */ 3371 */
3369 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) 3372 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3370 { 3373 {
3371 struct ext3_iloc iloc; 3374 struct ext3_iloc iloc;
3372 int err; 3375 int err;
3373 3376
3374 might_sleep(); 3377 might_sleep();
3375 err = ext3_reserve_inode_write(handle, inode, &iloc); 3378 err = ext3_reserve_inode_write(handle, inode, &iloc);
3376 if (!err) 3379 if (!err)
3377 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 3380 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3378 return err; 3381 return err;
3379 } 3382 }
3380 3383
3381 /* 3384 /*
3382 * ext3_dirty_inode() is called from __mark_inode_dirty() 3385 * ext3_dirty_inode() is called from __mark_inode_dirty()
3383 * 3386 *
3384 * We're really interested in the case where a file is being extended. 3387 * We're really interested in the case where a file is being extended.
3385 * i_size has been changed by generic_commit_write() and we thus need 3388 * i_size has been changed by generic_commit_write() and we thus need
3386 * to include the updated inode in the current transaction. 3389 * to include the updated inode in the current transaction.
3387 * 3390 *
3388 * Also, dquot_alloc_space() will always dirty the inode when blocks 3391 * Also, dquot_alloc_space() will always dirty the inode when blocks
3389 * are allocated to the file. 3392 * are allocated to the file.
3390 * 3393 *
3391 * If the inode is marked synchronous, we don't honour that here - doing 3394 * If the inode is marked synchronous, we don't honour that here - doing
 * so would cause a commit on atime updates, which we don't bother doing.
 * We handle synchronous inodes at the highest possible level.
 */
void ext3_dirty_inode(struct inode *inode, int flags)
{
        handle_t *current_handle = ext3_journal_current_handle();
        handle_t *handle;

        handle = ext3_journal_start(inode, 2);
        if (IS_ERR(handle))
                goto out;
        if (current_handle &&
                current_handle->h_transaction != handle->h_transaction) {
                /* This task has a transaction open against a different fs */
                printk(KERN_EMERG "%s: transactions do not match!\n",
                       __func__);
        } else {
                jbd_debug(5, "marking dirty.  outer handle=%p\n",
                                current_handle);
                ext3_mark_inode_dirty(handle, inode);
        }
        ext3_journal_stop(handle);
out:
        return;
}

#if 0
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 */
static int ext3_pin_inode(handle_t *handle, struct inode *inode)
{
        struct ext3_iloc iloc;

        int err = 0;
        if (handle) {
                err = ext3_get_inode_loc(inode, &iloc);
                if (!err) {
                        BUFFER_TRACE(iloc.bh, "get_write_access");
                        err = journal_get_write_access(handle, iloc.bh);
                        if (!err)
                                err = ext3_journal_dirty_metadata(handle,
                                                                  iloc.bh);
                        brelse(iloc.bh);
                }
        }
        ext3_std_error(inode->i_sb, err);
        return err;
}
#endif

int ext3_change_inode_journal_flag(struct inode *inode, int val)
{
        journal_t *journal;
        handle_t *handle;
        int err;

        /*
         * We have to be very careful here: changing a data block's
         * journaling status dynamically is dangerous.  If we write a
         * data block to the journal, change the status and then delete
         * that block, we risk forgetting to revoke the old log record
         * from the journal and so a subsequent replay can corrupt data.
         * So, first we make sure that the journal is empty and that
         * nobody is changing anything.
         */

        journal = EXT3_JOURNAL(inode);
        if (is_journal_aborted(journal))
                return -EROFS;

        journal_lock_updates(journal);
        journal_flush(journal);

        /*
         * OK, there are no updates running now, and all cached data is
         * synced to disk.  We are now in a completely consistent state
         * which doesn't have anything in the journal, and we know that
         * no filesystem updates are running, so it is safe to modify
         * the inode's in-core data-journaling state flag now.
         */

        if (val)
                EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
        else
                EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
        ext3_set_aops(inode);

        journal_unlock_updates(journal);

        /* Finally we can mark the inode as dirty. */

        handle = ext3_journal_start(inode, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        err = ext3_mark_inode_dirty(handle, inode);
        handle->h_sync = 1;
        ext3_journal_stop(handle);
        ext3_std_error(inode->i_sb, err);

        return err;
}

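The in-core flag flip above is what the FS_IOC_SETFLAGS ioctl path (the same
one chattr +j issues) ultimately reaches. A minimal user-space sketch of that
caller side - illustrative only, not part of this commit, and assuming a
Linux <linux/fs.h> that defines FS_JOURNAL_DATA_FL:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
        int fd, flags;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* read the current inode flags */
        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
                perror("FS_IOC_GETFLAGS");
                close(fd);
                return 1;
        }
        /* set the data-journaling bit, the equivalent of chattr +j */
        flags |= FS_JOURNAL_DATA_FL;
        if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0) {
                perror("FS_IOC_SETFLAGS");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}
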
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *      (sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *      David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *      (jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

#define MPAGE_DA_EXTENT_TAIL 0x01

static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
{
        trace_ext4_begin_ordered_truncate(inode, new_size);
        /*
         * If jinode is zero, then we never opened the file for
         * writing, so there's no need to call
         * jbd2_journal_begin_ordered_truncate() since there are no
         * outstanding writes we need to flush.
         */
        if (!EXT4_I(inode)->jinode)
                return 0;
        return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
                                                   EXT4_I(inode)->jinode,
                                                   new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);
static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create);
static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);

/*
 * Test whether an inode is a fast symlink.
 */
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks = EXT4_I(inode)->i_file_acl ?
                (inode->i_sb->s_blocksize >> 9) : 0;

        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
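
/*
 * Illustrative note (assuming a 4KB block size): a short symlink target
 * is stored in i_data itself, so i_blocks is 0 unless the inode also
 * carries an xattr block; that block accounts for blocksize >> 9 == 8
 * 512-byte sectors, which is exactly what ea_blocks subtracts above.
 */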

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static unsigned long blocks_for_truncate(struct inode *inode)
{
        ext4_lblk_t needed;

        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

        /* Give ourselves just enough room to cope with inodes in which
         * i_blocks is corrupt: we've seen disk corruptions in the past
         * which resulted in random data in an inode which looked enough
         * like a regular file for ext4 to try to delete it.  Things
         * will go a bit crazy if that happens, but at least we should
         * try not to panic the whole kernel. */
        if (needed < 2)
                needed = 2;

        /* But we need to bound the transaction so we don't overflow the
         * journal. */
        if (needed > EXT4_MAX_TRANS_DATA)
                needed = EXT4_MAX_TRANS_DATA;

        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
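
/*
 * Worked example (illustrative, assuming a 4KB block size, i.e.
 * s_blocksize_bits == 12): i_blocks counts 512-byte sectors, so an
 * inode with i_blocks == 80 yields 80 >> (12 - 9) == 10 blocks; that
 * is clamped into [2, EXT4_MAX_TRANS_DATA] and the fixed
 * EXT4_DATA_TRANS_BLOCKS(sb) overhead is added on top.
 */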

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
        handle_t *result;

        result = ext4_journal_start(inode, blocks_for_truncate(inode));
        if (!IS_ERR(result))
                return result;

        ext4_std_error(inode->i_sb, PTR_ERR(result));
        return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
        if (!ext4_handle_valid(handle))
                return 0;
        if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
                return 0;
        if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
                return 0;
        return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
                                int nblocks)
{
        int ret;

        /*
         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
         * moment, get_block can be called only for blocks inside i_size since
         * page cache has been already dropped and writes are blocked by
         * i_mutex.  So we can safely drop the i_data_sem here.
         */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
        jbd_debug(2, "restarting handle %p\n", handle);
        up_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_journal_restart(handle, nblocks);
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_discard_preallocations(inode);

        return ret;
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_evict_inode(struct inode *inode)
{
        handle_t *handle;
        int err;

        trace_ext4_evict_inode(inode);
        if (inode->i_nlink) {
                truncate_inode_pages(&inode->i_data, 0);
                goto no_delete;
        }

        if (!is_bad_inode(inode))
                dquot_initialize(inode);

        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);

        if (is_bad_inode(inode))
                goto no_delete;

        handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
        if (inode->i_blocks)
                ext4_truncate(inode);

        /*
         * ext4_ext_truncate() doesn't reserve any slop when it
         * restarts journal transactions; therefore there may not be
         * enough credits left in the handle to remove the inode from
         * the orphan list and set the dtime field.
         */
        if (!ext4_handle_has_enough_credits(handle, 3)) {
                err = ext4_journal_extend(handle, 3);
                if (err > 0)
                        err = ext4_journal_restart(handle, 3);
                if (err != 0) {
                        ext4_warning(inode->i_sb,
                                     "couldn't extend journal (err %d)", err);
                stop_handle:
                        ext4_journal_stop(handle);
                        ext4_orphan_del(NULL, inode);
                        goto no_delete;
                }
        }

        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        EXT4_I(inode)->i_dtime = get_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        return;
no_delete:
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
}

typedef struct {
        __le32  *p;
        __le32  key;
        struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
        p->key = *(p->p = v);
        p->bh = bh;
}

/**
 * ext4_block_to_path - parse the block number into array of offsets
 * @inode: inode in question (we are only interested in its superblock)
 * @i_block: block number to be parsed
 * @offsets: array to store the offsets in
 * @boundary: set this non-zero if the referred-to block is likely to be
 *        followed (on disk) by an indirect block.
 *
 * To store the locations of a file's data ext4 uses a data structure
 * common for UNIX filesystems - a tree of pointers anchored in the inode,
 * with data blocks at the leaves and indirect blocks in intermediate
 * nodes.  This function translates the block number into a path in that
 * tree - the return value is the path length and @offsets[n] is the
 * offset of the pointer to the (n+1)th node in the nth one.  If @block is
 * out of range (negative or too large) a warning is printed and zero is
 * returned.
 *
 * Note: function doesn't find node addresses, so no IO is needed.  All
 * we need to know is the capacity of indirect blocks (taken from the
 * inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks.  We might use long long, but that would
 * kill us on x86.  Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext4_block_to_path(struct inode *inode,
                              ext4_lblk_t i_block,
                              ext4_lblk_t offsets[4], int *boundary)
{
        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
        const long direct_blocks = EXT4_NDIR_BLOCKS,
                indirect_blocks = ptrs,
                double_blocks = (1 << (ptrs_bits * 2));
        int n = 0;
        int final = 0;

        if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
                offsets[n++] = EXT4_IND_BLOCK;
                offsets[n++] = i_block;
                final = ptrs;
        } else if ((i_block -= indirect_blocks) < double_blocks) {
                offsets[n++] = EXT4_DIND_BLOCK;
                offsets[n++] = i_block >> ptrs_bits;
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
                offsets[n++] = EXT4_TIND_BLOCK;
                offsets[n++] = i_block >> (ptrs_bits * 2);
                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
                ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
                             i_block + direct_blocks +
                             indirect_blocks + double_blocks, inode->i_ino);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
        return n;
}
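
/*
 * Worked example (illustrative, assuming 4KB blocks, so ptrs == 1024):
 * blocks 0-11 are direct, so i_block == 5 gives offsets == {5}, n == 1.
 * i_block == 2000 lands in the double-indirect range: after the two
 * subtractions i_block == 2000 - 12 - 1024 == 964, so offsets ==
 * {EXT4_DIND_BLOCK, 964 >> 10, 964 & 1023} == {EXT4_DIND_BLOCK, 0, 964}
 * and n == 3.
 */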

static int __ext4_check_blockref(const char *function, unsigned int line,
                                 struct inode *inode,
                                 __le32 *p, unsigned int max)
{
        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        __le32 *bref = p;
        unsigned int blk;

        while (bref < p+max) {
                blk = le32_to_cpu(*bref++);
                if (blk &&
                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                    blk, 1))) {
                        es->s_last_error_block = cpu_to_le64(blk);
                        ext4_error_inode(inode, function, line, blk,
                                         "invalid block");
                        return -EIO;
                }
        }
        return 0;
}


#define ext4_check_indirect_blockref(inode, bh)                         \
        __ext4_check_blockref(__func__, __LINE__, inode,                \
                              (__le32 *)(bh)->b_data,                   \
                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))

#define ext4_check_inode_blockref(inode)                                \
        __ext4_check_blockref(__func__, __LINE__, inode,                \
                              EXT4_I(inode)->i_data,                    \
                              EXT4_NDIR_BLOCKS)

/**
 * ext4_get_branch - read the chain of indirect blocks leading to data
 * @inode: inode in question
 * @depth: depth of the chain (1 - direct pointer, etc.)
 * @offsets: offsets of pointers in inode/indirect blocks
 * @chain: place to store the result
 * @err: here we store the error value
 *
 * Function fills the array of triples <key, p, bh> and returns %NULL
 * if everything went OK or the pointer to the last filled triple
 * (incomplete one) otherwise.  Upon the return chain[i].key contains
 * the number of (i+1)-th block in the chain (as it is stored in memory,
 * i.e. little-endian 32-bit), chain[i].p contains the address of that
 * number (it points into struct inode for i==0 and into the bh->b_data
 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 * block for i>0 and NULL for i==0.  In other words, it holds the block
 * numbers of the chain, the addresses they were taken from (and where
 * we can verify that the chain did not change) and the buffer_heads
 * hosting these numbers.
 *
 * Function stops when it stumbles upon a zero pointer (absent block)
 *      (pointer to last triple returned, *@err == 0)
 * or when it gets an IO error reading an indirect block
 *      (ditto, *@err == -EIO)
 * or when it reads all @depth-1 indirect blocks successfully and finds
 * the whole chain, all the way to the data (returns %NULL, *err == 0).
 *
 * Needs to be called with
 *      down_read(&EXT4_I(inode)->i_data_sem)
 */
static Indirect *ext4_get_branch(struct inode *inode, int depth,
                                 ext4_lblk_t *offsets,
                                 Indirect chain[4], int *err)
{
        struct super_block *sb = inode->i_sb;
        Indirect *p = chain;
        struct buffer_head *bh;

        *err = 0;
        /* i_data is not going away, no lock needed */
        add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
        if (!p->key)
                goto no_block;
        while (--depth) {
                bh = sb_getblk(sb, le32_to_cpu(p->key));
                if (unlikely(!bh))
                        goto failure;

                if (!bh_uptodate_or_lock(bh)) {
                        if (bh_submit_read(bh) < 0) {
                                put_bh(bh);
                                goto failure;
                        }
                        /* validate block references */
                        if (ext4_check_indirect_blockref(inode, bh)) {
                                put_bh(bh);
                                goto failure;
                        }
                }

                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
                        goto no_block;
        }
        return NULL;

failure:
        *err = -EIO;
no_block:
        return p;
}
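
/*
 * Illustrative result for a successful depth-3 (double-indirect)
 * lookup: chain[0].p points into EXT4_I(inode)->i_data (bh == NULL),
 * chain[1].p points into the double-indirect block's b_data and
 * chain[2].p into the indirect block's b_data; the function returns
 * NULL with *err == 0.
 */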

/**
 * ext4_find_near - find a place for allocation with sufficient locality
 * @inode: owner
 * @ind: descriptor of indirect block.
 *
 * This function returns the preferred place for block allocation.
 * It is used when the heuristic for sequential allocation fails.
 * Rules are:
 *   + if there is a block to the left of our position - allocate near it.
 *   + if pointer will live in indirect block - allocate near that block.
 *   + if pointer will live in inode - allocate in the same
 *     cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 * Caller must make sure that @ind is valid and will stay that way.
 */
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
        __le32 *p;
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
        ext4_group_t block_group;
        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));

        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--) {
                if (*p)
                        return le32_to_cpu(*p);
        }

        /* No such thing, so let's try location of indirect block */
        if (ind->bh)
                return ind->bh->b_blocknr;

        /*
         * It is going to be referred to from the inode itself? OK, just put it
         * into the same cylinder group then.
         */
        block_group = ei->i_block_group;
        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
                block_group &= ~(flex_size-1);
                if (S_ISREG(inode->i_mode))
                        block_group++;
        }
        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

        /*
         * If we are doing delayed allocation, we don't need to take
         * colour into account.
         */
        if (test_opt(inode->i_sb, DELALLOC))
                return bg_start;

        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        else
                colour = (current->pid % 16) * ((last_block - bg_start) / 16);
        return bg_start + colour;
}
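
/*
 * Illustrative colour arithmetic (assuming 32768 blocks per group and
 * no delayed allocation): a task with current->pid == 4321 gets
 * colour == (4321 % 16) * (32768 / 16) == 1 * 2048, so its allocations
 * start 2048 blocks into the group, spaced away from other tasks'
 * starting points.
 */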

/**
 * ext4_find_goal - find a preferred place for allocation.
 * @inode: owner
 * @block: block we want
 * @partial: pointer to the last triple within a chain
 *
 * Normally this function finds the preferred place for block allocation
 * and returns it.
 * Because this is only used for non-extent files, we limit the block nr
 * to 32 bits.
 */
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                                   Indirect *partial)
{
        ext4_fsblk_t goal;

        /*
         * XXX need to get goal block from mballoc's data structures
         */

        goal = ext4_find_near(inode, partial);
        goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
        return goal;
}

/**
 * ext4_blks_to_allocate - look up the block map and count the number
 * of direct blocks that need to be allocated for the given branch.
 *
 * @branch: chain of indirect blocks
 * @k: number of blocks needed for indirect blocks
 * @blks: number of data blocks to be mapped.
 * @blocks_to_boundary: the offset in the indirect block
 *
 * return the total number of blocks to be allocated, including the
 * direct and indirect blocks.
 */
static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
                                 int blocks_to_boundary)
{
        unsigned int count = 0;

        /*
         * Simple case: the [t,d]indirect block(s) have not been allocated
         * yet, so it is clear that the blocks on that path have not been
         * allocated either.
         */
        if (k > 0) {
                /* right now we don't handle cross boundary allocation */
                if (blks < blocks_to_boundary + 1)
                        count += blks;
                else
                        count += blocks_to_boundary + 1;
                return count;
        }

        count++;
        while (count < blks && count <= blocks_to_boundary &&
                le32_to_cpu(*(branch[0].p + count)) == 0) {
                count++;
        }
        return count;
}
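
/*
 * Illustrative: with k == 0, blks == 4 and blocks_to_boundary == 10,
 * the loop scans branch[0].p for contiguous zero entries, so if the
 * next four pointers are unallocated the function returns 4.  With
 * k > 0 it simply returns min(blks, blocks_to_boundary + 1).
 */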

/**
 * ext4_alloc_blocks: allocate the multiple blocks needed for a branch
 * @handle: handle for this transaction
 * @inode: inode which needs the allocated blocks
 * @iblock: the logical block to start allocating at
 * @goal: preferred physical block of allocation
 * @indirect_blks: the number of blocks that need to be allocated for
 *                 indirect blocks
 * @blks: number of desired blocks
 * @new_blocks: on return it will store the new block numbers for
 *              the indirect blocks (if needed) and the first direct block
 * @err: on return it will store the error code
 *
 * This function will return the number of blocks allocated as
 * requested by the passed-in parameters.
 */
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                             ext4_lblk_t iblock, ext4_fsblk_t goal,
                             int indirect_blks, int blks,
                             ext4_fsblk_t new_blocks[4], int *err)
{
        struct ext4_allocation_request ar;
        int target, i;
        unsigned long count = 0, blk_allocated = 0;
        int index = 0;
        ext4_fsblk_t current_block = 0;
        int ret = 0;

        /*
         * Here we try to allocate the requested multiple blocks at once,
         * on a best-effort basis.
         * To build a branch, we should allocate blocks for
         * the indirect blocks (if not allocated yet), and at least
         * the first direct block of this branch.  That's the
         * minimum number of blocks we need to allocate (required).
         */
        /* first we try to allocate the indirect blocks */
        target = indirect_blks;
        while (target > 0) {
                count = target;
                /* allocating blocks for indirect blocks and direct blocks */
                current_block = ext4_new_meta_blocks(handle, inode, goal,
                                                     0, &count, err);
                if (*err)
                        goto failed_out;

                if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
                        EXT4_ERROR_INODE(inode,
                                         "current_block %llu + count %lu > %d!",
                                         current_block, count,
                                         EXT4_MAX_BLOCK_FILE_PHYS);
                        *err = -EIO;
                        goto failed_out;
                }

                target -= count;
                /* allocate blocks for indirect blocks */
                while (index < indirect_blks && count) {
                        new_blocks[index++] = current_block++;
                        count--;
                }
                if (count > 0) {
                        /*
                         * save the new block number
                         * for the first direct block
                         */
                        new_blocks[index] = current_block;
                        printk(KERN_INFO "%s returned more blocks than "
                                                "requested\n", __func__);
                        WARN_ON(1);
                        break;
                }
        }

        target = blks - count;
        blk_allocated = count;
        if (!target)
                goto allocated;
        /* Now allocate data blocks */
        memset(&ar, 0, sizeof(ar));
        ar.inode = inode;
        ar.goal = goal;
        ar.len = target;
        ar.logical = iblock;
        if (S_ISREG(inode->i_mode))
                /* enable in-core preallocation only for regular files */
                ar.flags = EXT4_MB_HINT_DATA;

        current_block = ext4_mb_new_blocks(handle, &ar, err);
        if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
                EXT4_ERROR_INODE(inode,
                                 "current_block %llu + ar.len %d > %d!",
                                 current_block, ar.len,
                                 EXT4_MAX_BLOCK_FILE_PHYS);
                *err = -EIO;
                goto failed_out;
        }

        if (*err && (target == blks)) {
                /*
                 * if the allocation failed and we didn't allocate
                 * any blocks before
                 */
                goto failed_out;
        }
        if (!*err) {
                if (target == blks) {
                        /*
                         * save the new block number
                         * for the first direct block
                         */
                        new_blocks[index] = current_block;
                }
                blk_allocated += ar.len;
        }
allocated:
        /* total number of blocks allocated for direct blocks */
        ret = blk_allocated;
        *err = 0;
        return ret;
failed_out:
        for (i = 0; i < index; i++)
                ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
        return ret;
}
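
/*
 * Illustrative run: with indirect_blks == 2 and blks == 5, the first
 * loop obtains the two metadata blocks via ext4_new_meta_blocks();
 * the five data blocks are then requested in a single
 * ext4_mb_new_blocks() call (best effort - ar.len may come back
 * smaller), and the return value is the number of direct blocks
 * actually allocated.
 */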

/**
 * ext4_alloc_branch - allocate and set up a chain of blocks.
 * @handle: handle for this transaction
 * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
 * @goal: preferred place for allocation
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates blocks, zeroes out all but the last one,
 * links them into a chain and (if we are synchronous) writes them to
 * disk.  In other words, it prepares a branch that can be spliced onto
 * the inode.  It stores the information about that chain in the
 * branch[], in the same format as ext4_get_branch() would do.  We are
 * calling it after we had read the existing part of the chain and
 * partial points to the last triple of that (one with zero ->key).
 * Upon exit we have the same picture as after the successful
 * ext4_get_block(), except that in one place the chain is disconnected -
 * *branch->p is still zero (we did not set the last link), but
 * branch->key contains the number that should be placed into *branch->p
 * to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value from the failed
 * ext4_alloc_block() (normally -ENOSPC).  Otherwise we set the chain
 * as described above and return 0.
 */
754 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 754 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 ext4_lblk_t iblock, int indirect_blks, 755 ext4_lblk_t iblock, int indirect_blks,
756 int *blks, ext4_fsblk_t goal, 756 int *blks, ext4_fsblk_t goal,
757 ext4_lblk_t *offsets, Indirect *branch) 757 ext4_lblk_t *offsets, Indirect *branch)
758 { 758 {
759 int blocksize = inode->i_sb->s_blocksize; 759 int blocksize = inode->i_sb->s_blocksize;
760 int i, n = 0; 760 int i, n = 0;
761 int err = 0; 761 int err = 0;
762 struct buffer_head *bh; 762 struct buffer_head *bh;
763 int num; 763 int num;
764 ext4_fsblk_t new_blocks[4]; 764 ext4_fsblk_t new_blocks[4];
765 ext4_fsblk_t current_block; 765 ext4_fsblk_t current_block;
766 766
767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
768 *blks, new_blocks, &err); 768 *blks, new_blocks, &err);
769 if (err) 769 if (err)
770 return err; 770 return err;
771 771
772 branch[0].key = cpu_to_le32(new_blocks[0]); 772 branch[0].key = cpu_to_le32(new_blocks[0]);
773 /* 773 /*
774 * metadata blocks and data blocks are allocated. 774 * metadata blocks and data blocks are allocated.
775 */ 775 */
776 for (n = 1; n <= indirect_blks; n++) { 776 for (n = 1; n <= indirect_blks; n++) {
777 /* 777 /*
778 * Get buffer_head for parent block, zero it out 778 * Get buffer_head for parent block, zero it out
779 * and set the pointer to new one, then send 779 * and set the pointer to new one, then send
780 * parent to disk. 780 * parent to disk.
781 */ 781 */
782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) { 783 if (unlikely(!bh)) {
784 err = -EIO; 784 err = -EIO;
785 goto failed; 785 goto failed;
786 } 786 }
787 787
788 branch[n].bh = bh; 788 branch[n].bh = bh;
789 lock_buffer(bh); 789 lock_buffer(bh);
790 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
791 err = ext4_journal_get_create_access(handle, bh); 791 err = ext4_journal_get_create_access(handle, bh);
792 if (err) { 792 if (err) {
793 /* Don't brelse(bh) here; it's done in 793 /* Don't brelse(bh) here; it's done in
794 * ext4_journal_forget() below */ 794 * ext4_journal_forget() below */
795 unlock_buffer(bh); 795 unlock_buffer(bh);
796 goto failed; 796 goto failed;
797 } 797 }
798 798
799 memset(bh->b_data, 0, blocksize); 799 memset(bh->b_data, 0, blocksize);
800 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 800 branch[n].p = (__le32 *) bh->b_data + offsets[n];
801 branch[n].key = cpu_to_le32(new_blocks[n]); 801 branch[n].key = cpu_to_le32(new_blocks[n]);
802 *branch[n].p = branch[n].key; 802 *branch[n].p = branch[n].key;
803 if (n == indirect_blks) { 803 if (n == indirect_blks) {
804 current_block = new_blocks[n]; 804 current_block = new_blocks[n];
805 /* 805 /*
806 * End of chain, update the last new metablock of 806 * End of chain, update the last new metablock of
807 * the chain to point to the new allocated 807 * the chain to point to the new allocated
808 * data blocks numbers 808 * data blocks numbers
809 */ 809 */
810 for (i = 1; i < num; i++) 810 for (i = 1; i < num; i++)
811 *(branch[n].p + i) = cpu_to_le32(++current_block); 811 *(branch[n].p + i) = cpu_to_le32(++current_block);
812 } 812 }
813 BUFFER_TRACE(bh, "marking uptodate"); 813 BUFFER_TRACE(bh, "marking uptodate");
814 set_buffer_uptodate(bh); 814 set_buffer_uptodate(bh);
815 unlock_buffer(bh); 815 unlock_buffer(bh);
816 816
817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
818 err = ext4_handle_dirty_metadata(handle, inode, bh); 818 err = ext4_handle_dirty_metadata(handle, inode, bh);
819 if (err) 819 if (err)
820 goto failed; 820 goto failed;
821 } 821 }
822 *blks = num; 822 *blks = num;
823 return err; 823 return err;
824 failed: 824 failed:
825 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
828 /* 828 /*
829 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */ 832 */
833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
835 } 835 }
836 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838 838
839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840 840
841 return err; 841 return err;
842 } 842 }
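
The loop near the end of ext4_alloc_branch() that fills the last metadata block relies on ext4_alloc_blocks() handing back the direct blocks as a single contiguous run starting at new_blocks[indirect_blks]. Below is a minimal userspace sketch of that fill; the block numbers and run length are made up, and glibc's htole32()/le32toh() stand in for the kernel's cpu_to_le32()/le32_to_cpu():

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t slots[8] = { 0 };      /* tail of the last indirect block */
            uint32_t first_data = 5000;     /* hypothetical first data block */
            int num = 4;                    /* contiguous data blocks allocated */
            uint32_t current_block = first_data;
            int i;

            /* Block pointers are stored little-endian on disk. */
            slots[0] = htole32(current_block);
            for (i = 1; i < num; i++)
                    slots[i] = htole32(++current_block);

            for (i = 0; i < num; i++)
                    printf("slot %d -> physical block %u\n", i, le32toh(slots[i]));
            return 0;
    }
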
843 843
844 /** 844 /**
845 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction 846 * @handle: handle for this transaction
847 * @inode: owner 847 * @inode: owner
848 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
849 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
850 * ext4_alloc_branch) 850 * ext4_alloc_branch)
851 * @where: location of missing link 851 * @where: location of missing link
852 * @num: number of indirect blocks we are adding 852 * @num: number of indirect blocks we are adding
853 * @blks: number of direct blocks we are adding 853 * @blks: number of direct blocks we are adding
854 * 854 *
855 * This function fills in the missing link and does all housekeeping needed 855 * This function fills in the missing link and does all housekeeping needed
856 * in the inode (->i_blocks, etc.). On success we end up with the full 856 * in the inode (->i_blocks, etc.). On success we end up with the full
857 * chain to the new block and return 0. 857 * chain to the new block and return 0.
858 */ 858 */
859 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 859 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
860 ext4_lblk_t block, Indirect *where, int num, 860 ext4_lblk_t block, Indirect *where, int num,
861 int blks) 861 int blks)
862 { 862 {
863 int i; 863 int i;
864 int err = 0; 864 int err = 0;
865 ext4_fsblk_t current_block; 865 ext4_fsblk_t current_block;
866 866
867 /* 867 /*
868 * If we're splicing into a [td]indirect block (as opposed to the 868 * If we're splicing into a [td]indirect block (as opposed to the
869 * inode) then we need to get write access to the [td]indirect block 869 * inode) then we need to get write access to the [td]indirect block
870 * before the splice. 870 * before the splice.
871 */ 871 */
872 if (where->bh) { 872 if (where->bh) {
873 BUFFER_TRACE(where->bh, "get_write_access"); 873 BUFFER_TRACE(where->bh, "get_write_access");
874 err = ext4_journal_get_write_access(handle, where->bh); 874 err = ext4_journal_get_write_access(handle, where->bh);
875 if (err) 875 if (err)
876 goto err_out; 876 goto err_out;
877 } 877 }
878 /* That's it */ 878 /* That's it */
879 879
880 *where->p = where->key; 880 *where->p = where->key;
881 881
882 /* 882 /*
883 * Update the host buffer_head or inode to point to the just-allocated 883 * Update the host buffer_head or inode to point to the just-allocated
884 * direct blocks 884 * direct blocks
885 */ 885 */
886 if (num == 0 && blks > 1) { 886 if (num == 0 && blks > 1) {
887 current_block = le32_to_cpu(where->key) + 1; 887 current_block = le32_to_cpu(where->key) + 1;
888 for (i = 1; i < blks; i++) 888 for (i = 1; i < blks; i++)
889 *(where->p + i) = cpu_to_le32(current_block++); 889 *(where->p + i) = cpu_to_le32(current_block++);
890 } 890 }
891 891
892 /* We are done with atomic stuff, now do the rest of housekeeping */ 892 /* We are done with atomic stuff, now do the rest of housekeeping */
893 /* Did we splice it onto an indirect block? */ 893 /* Did we splice it onto an indirect block? */
894 if (where->bh) { 894 if (where->bh) {
895 /* 895 /*
896 * If we spliced it onto an indirect block, we haven't 896 * If we spliced it onto an indirect block, we haven't
897 * altered the inode. Note however that if it is being spliced 897 * altered the inode. Note however that if it is being spliced
898 * onto an indirect block at the very end of the file (the 898 * onto an indirect block at the very end of the file (the
899 * file is growing) then we *will* alter the inode to reflect 899 * file is growing) then we *will* alter the inode to reflect
900 * the new i_size. But that is not done here - it is done in 900 * the new i_size. But that is not done here - it is done in
901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
902 */ 902 */
903 jbd_debug(5, "splicing indirect only\n"); 903 jbd_debug(5, "splicing indirect only\n");
904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
905 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 905 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
906 if (err) 906 if (err)
907 goto err_out; 907 goto err_out;
908 } else { 908 } else {
909 /* 909 /*
910 * OK, we spliced it into the inode itself on a direct block. 910 * OK, we spliced it into the inode itself on a direct block.
911 */ 911 */
912 ext4_mark_inode_dirty(handle, inode); 912 ext4_mark_inode_dirty(handle, inode);
913 jbd_debug(5, "splicing direct\n"); 913 jbd_debug(5, "splicing direct\n");
914 } 914 }
915 return err; 915 return err;
916 916
917 err_out: 917 err_out:
918 for (i = 1; i <= num; i++) { 918 for (i = 1; i <= num; i++) {
919 /* 919 /*
920 * branch[i].bh is newly allocated, so there is no 920 * branch[i].bh is newly allocated, so there is no
921 * need to revoke the block, which is why we don't 921 * need to revoke the block, which is why we don't
922 * need to set EXT4_FREE_BLOCKS_METADATA. 922 * need to set EXT4_FREE_BLOCKS_METADATA.
923 */ 923 */
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
926 } 926 }
927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0); 928 blks, 0);
929 929
930 return err; 930 return err;
931 } 931 }
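
What makes ext4_splice_branch() safe against concurrent readers is that the whole branch is built off-tree first; the splice itself is the single store *where->p = where->key that publishes it. A toy model of that publish step, with an invented parent array and block number:

    #include <stdint.h>
    #include <stdio.h>

    struct indirect {
            uint32_t *p;    /* where the missing link goes */
            uint32_t key;   /* block number to publish there */
    };

    int main(void)
    {
            uint32_t parent[4] = { 0 };     /* models i_data or an indirect block */
            struct indirect where = { &parent[2], 1234 };

            /*
             * Everything reachable from 'key' is assumed fully initialized
             * already; readers either see 0 (a hole) or the complete branch.
             */
            *where.p = where.key;

            printf("parent slot 2 now points at block %u\n", parent[2]);
            return 0;
    }
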
932 932
933 /* 933 /*
934 * The ext4_ind_map_blocks() function handles non-extents inodes 934 * The ext4_ind_map_blocks() function handles non-extents inodes
935 * (i.e., using the traditional indirect/double-indirect i_blocks 935 * (i.e., using the traditional indirect/double-indirect i_blocks
936 * scheme) for ext4_map_blocks(). 936 * scheme) for ext4_map_blocks().
937 * 937 *
938 * Allocation strategy is simple: if we have to allocate something, we will 938 * Allocation strategy is simple: if we have to allocate something, we will
939 * have to go the whole way to the leaf. So let's do it before attaching 939 * have to go the whole way to the leaf. So let's do it before attaching
940 * anything to the tree, set linkage between the newborn blocks, write them 940 * anything to the tree, set linkage between the newborn blocks, write them
941 * if sync is required, recheck the path, free and repeat if the check fails, otherwise 941 * if sync is required, recheck the path, free and repeat if the check fails, otherwise
942 * set the last missing link (that will protect us from any truncate-generated 942 * set the last missing link (that will protect us from any truncate-generated
943 * removals - all blocks on the path are immune now) and possibly force the 943 * removals - all blocks on the path are immune now) and possibly force the
944 * write on the parent block. 944 * write on the parent block.
945 * That has a nice additional property: no special recovery from the failed 945 * That has a nice additional property: no special recovery from the failed
946 * allocations is needed - we simply release blocks and do not touch anything 946 * allocations is needed - we simply release blocks and do not touch anything
947 * reachable from inode. 947 * reachable from inode.
948 * 948 *
949 * `handle' can be NULL if create == 0. 949 * `handle' can be NULL if create == 0.
950 * 950 *
951 * return > 0, # of blocks mapped or allocated. 951 * return > 0, # of blocks mapped or allocated.
952 * return = 0, if plain lookup failed. 952 * return = 0, if plain lookup failed.
953 * return < 0, error case. 953 * return < 0, error case.
954 * 954 *
955 * The ext4_ind_map_blocks() function should be called with 955 * The ext4_ind_map_blocks() function should be called with
956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem 956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or 957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
959 * blocks. 959 * blocks.
960 */ 960 */
961 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 961 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
962 struct ext4_map_blocks *map, 962 struct ext4_map_blocks *map,
963 int flags) 963 int flags)
964 { 964 {
965 int err = -EIO; 965 int err = -EIO;
966 ext4_lblk_t offsets[4]; 966 ext4_lblk_t offsets[4];
967 Indirect chain[4]; 967 Indirect chain[4];
968 Indirect *partial; 968 Indirect *partial;
969 ext4_fsblk_t goal; 969 ext4_fsblk_t goal;
970 int indirect_blks; 970 int indirect_blks;
971 int blocks_to_boundary = 0; 971 int blocks_to_boundary = 0;
972 int depth; 972 int depth;
973 int count = 0; 973 int count = 0;
974 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
975 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
979 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
980 &blocks_to_boundary); 980 &blocks_to_boundary);
981 981
982 if (depth == 0) 982 if (depth == 0)
983 goto out; 983 goto out;
984 984
985 partial = ext4_get_branch(inode, depth, offsets, chain, &err); 985 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
986 986
987 /* Simplest case - block found, no allocation needed */ 987 /* Simplest case - block found, no allocation needed */
988 if (!partial) { 988 if (!partial) {
989 first_block = le32_to_cpu(chain[depth - 1].key); 989 first_block = le32_to_cpu(chain[depth - 1].key);
990 count++; 990 count++;
991 /* map more blocks */ 991 /* map more blocks */
992 while (count < map->m_len && count <= blocks_to_boundary) { 992 while (count < map->m_len && count <= blocks_to_boundary) {
993 ext4_fsblk_t blk; 993 ext4_fsblk_t blk;
994 994
995 blk = le32_to_cpu(*(chain[depth-1].p + count)); 995 blk = le32_to_cpu(*(chain[depth-1].p + count));
996 996
997 if (blk == first_block + count) 997 if (blk == first_block + count)
998 count++; 998 count++;
999 else 999 else
1000 break; 1000 break;
1001 } 1001 }
1002 goto got_it; 1002 goto got_it;
1003 } 1003 }
1004 1004
1005 /* Next simple case - plain lookup or failed read of indirect block */ 1005 /* Next simple case - plain lookup or failed read of indirect block */
1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
1007 goto cleanup; 1007 goto cleanup;
1008 1008
1009 /* 1009 /*
1010 * Okay, we need to do block allocation. 1010 * Okay, we need to do block allocation.
1011 */ 1011 */
1012 goal = ext4_find_goal(inode, map->m_lblk, partial); 1012 goal = ext4_find_goal(inode, map->m_lblk, partial);
1013 1013
1014 /* the number of blocks we need to allocate for [d,t]indirect blocks */ 1014 /* the number of blocks we need to allocate for [d,t]indirect blocks */
1015 indirect_blks = (chain + depth) - partial - 1; 1015 indirect_blks = (chain + depth) - partial - 1;
1016 1016
1017 /* 1017 /*
1018 * Next, look up the indirect map to count the total number of 1018 * Next, look up the indirect map to count the total number of
1019 * direct blocks to allocate for this branch. 1019 * direct blocks to allocate for this branch.
1020 */ 1020 */
1021 count = ext4_blks_to_allocate(partial, indirect_blks, 1021 count = ext4_blks_to_allocate(partial, indirect_blks,
1022 map->m_len, blocks_to_boundary); 1022 map->m_len, blocks_to_boundary);
1023 /* 1023 /*
1024 * Block out ext4_truncate while we alter the tree 1024 * Block out ext4_truncate while we alter the tree
1025 */ 1025 */
1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, 1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1027 &count, goal, 1027 &count, goal,
1028 offsets + (partial - chain), partial); 1028 offsets + (partial - chain), partial);
1029 1029
1030 /* 1030 /*
1031 * The ext4_splice_branch call will free and forget any buffers 1031 * The ext4_splice_branch call will free and forget any buffers
1032 * on the new chain if there is a failure, but that risks using 1032 * on the new chain if there is a failure, but that risks using
1033 * up transaction credits, especially for bitmaps where the 1033 * up transaction credits, especially for bitmaps where the
1034 * credits cannot be returned. Can we handle this somehow? We 1034 * credits cannot be returned. Can we handle this somehow? We
1035 * may need to return -EAGAIN upwards in the worst case. --sct 1035 * may need to return -EAGAIN upwards in the worst case. --sct
1036 */ 1036 */
1037 if (!err) 1037 if (!err)
1038 err = ext4_splice_branch(handle, inode, map->m_lblk, 1038 err = ext4_splice_branch(handle, inode, map->m_lblk,
1039 partial, indirect_blks, count); 1039 partial, indirect_blks, count);
1040 if (err) 1040 if (err)
1041 goto cleanup; 1041 goto cleanup;
1042 1042
1043 map->m_flags |= EXT4_MAP_NEW; 1043 map->m_flags |= EXT4_MAP_NEW;
1044 1044
1045 ext4_update_inode_fsync_trans(handle, inode, 1); 1045 ext4_update_inode_fsync_trans(handle, inode, 1);
1046 got_it: 1046 got_it:
1047 map->m_flags |= EXT4_MAP_MAPPED; 1047 map->m_flags |= EXT4_MAP_MAPPED;
1048 map->m_pblk = le32_to_cpu(chain[depth-1].key); 1048 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1049 map->m_len = count; 1049 map->m_len = count;
1050 if (count > blocks_to_boundary) 1050 if (count > blocks_to_boundary)
1051 map->m_flags |= EXT4_MAP_BOUNDARY; 1051 map->m_flags |= EXT4_MAP_BOUNDARY;
1052 err = count; 1052 err = count;
1053 /* Clean up and exit */ 1053 /* Clean up and exit */
1054 partial = chain + depth - 1; /* the whole chain */ 1054 partial = chain + depth - 1; /* the whole chain */
1055 cleanup: 1055 cleanup:
1056 while (partial > chain) { 1056 while (partial > chain) {
1057 BUFFER_TRACE(partial->bh, "call brelse"); 1057 BUFFER_TRACE(partial->bh, "call brelse");
1058 brelse(partial->bh); 1058 brelse(partial->bh);
1059 partial--; 1059 partial--;
1060 } 1060 }
1061 out: 1061 out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err); 1063 map->m_pblk, map->m_len, err);
1064 return err; 1064 return err;
1065 } 1065 }
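
ext4_ind_map_blocks() leans on ext4_block_to_path() (not shown in this hunk) to turn a logical block into a depth and per-level offsets. Here is a rough userspace model of that decomposition, assuming 4 KiB blocks (12 direct slots, 1024 addresses per indirect block); the slot indices mirror EXT4_IND_BLOCK/EXT4_DIND_BLOCK/EXT4_TIND_BLOCK:

    #include <stdio.h>

    #define NDIR 12                 /* EXT4_NDIR_BLOCKS */
    #define PTRS 1024               /* addresses per 4 KiB block */

    /* Rough model of ext4_block_to_path(): returns depth, fills offsets[]. */
    static int block_to_path(unsigned long iblock, int offsets[4])
    {
            if (iblock < NDIR) {
                    offsets[0] = iblock;
                    return 1;
            }
            iblock -= NDIR;
            if (iblock < PTRS) {
                    offsets[0] = NDIR;              /* EXT4_IND_BLOCK slot */
                    offsets[1] = iblock;
                    return 2;
            }
            iblock -= PTRS;
            if (iblock < (unsigned long)PTRS * PTRS) {
                    offsets[0] = NDIR + 1;          /* EXT4_DIND_BLOCK slot */
                    offsets[1] = iblock / PTRS;
                    offsets[2] = iblock % PTRS;
                    return 3;
            }
            iblock -= (unsigned long)PTRS * PTRS;
            offsets[0] = NDIR + 2;                  /* EXT4_TIND_BLOCK slot */
            offsets[1] = iblock / ((unsigned long)PTRS * PTRS);
            offsets[2] = (iblock / PTRS) % PTRS;
            offsets[3] = iblock % PTRS;
            return 4;
    }

    int main(void)
    {
            unsigned long blocks[] = { 5, 700, 2000000 };
            int offsets[4], i, j;

            for (i = 0; i < 3; i++) {
                    int depth = block_to_path(blocks[i], offsets);

                    printf("lblock %lu: depth %d, offsets", blocks[i], depth);
                    for (j = 0; j < depth; j++)
                            printf(" %d", offsets[j]);
                    printf("\n");
            }
            return 0;
    }
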
1066 1066
1067 #ifdef CONFIG_QUOTA 1067 #ifdef CONFIG_QUOTA
1068 qsize_t *ext4_get_reserved_space(struct inode *inode) 1068 qsize_t *ext4_get_reserved_space(struct inode *inode)
1069 { 1069 {
1070 return &EXT4_I(inode)->i_reserved_quota; 1070 return &EXT4_I(inode)->i_reserved_quota;
1071 } 1071 }
1072 #endif 1072 #endif
1073 1073
1074 /* 1074 /*
1075 * Calculate the number of metadata blocks we need to reserve 1075 * Calculate the number of metadata blocks we need to reserve
1076 * to allocate a new block at @lblock for a non-extent-based file 1076 * to allocate a new block at @lblock for a non-extent-based file
1077 */ 1077 */
1078 static int ext4_indirect_calc_metadata_amount(struct inode *inode, 1078 static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1079 sector_t lblock) 1079 sector_t lblock)
1080 { 1080 {
1081 struct ext4_inode_info *ei = EXT4_I(inode); 1081 struct ext4_inode_info *ei = EXT4_I(inode);
1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); 1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1083 int blk_bits; 1083 int blk_bits;
1084 1084
1085 if (lblock < EXT4_NDIR_BLOCKS) 1085 if (lblock < EXT4_NDIR_BLOCKS)
1086 return 0; 1086 return 0;
1087 1087
1088 lblock -= EXT4_NDIR_BLOCKS; 1088 lblock -= EXT4_NDIR_BLOCKS;
1089 1089
1090 if (ei->i_da_metadata_calc_len && 1090 if (ei->i_da_metadata_calc_len &&
1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { 1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1092 ei->i_da_metadata_calc_len++; 1092 ei->i_da_metadata_calc_len++;
1093 return 0; 1093 return 0;
1094 } 1094 }
1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1096 ei->i_da_metadata_calc_len = 1; 1096 ei->i_da_metadata_calc_len = 1;
1097 blk_bits = order_base_2(lblock); 1097 blk_bits = order_base_2(lblock);
1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1099 } 1099 }
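
The return value above is a worst-case estimate: order_base_2(lblock) / EXT4_ADDR_PER_BLOCK_BITS + 1 counts how many levels of the indirect tree a block that far out could need (the i_da_metadata_calc_* caching per dindirect region is omitted below). A sketch of the arithmetic, assuming 4 KiB blocks (10 address bits, 12 direct blocks):

    #include <stdio.h>

    /* Smallest b with 2^b >= n, like the kernel's order_base_2(). */
    static int order_base_2(unsigned long n)
    {
            int b = 0;

            while ((1UL << b) < n)
                    b++;
            return b;
    }

    int main(void)
    {
            const int addr_bits = 10;       /* log2(1024 addresses per block) */
            const int ndir = 12;            /* EXT4_NDIR_BLOCKS */
            unsigned long lblocks[] = { 5, 700, 2000000 };
            int i;

            for (i = 0; i < 3; i++) {
                    unsigned long lblock = lblocks[i];
                    int meta;

                    if (lblock < ndir) {
                            meta = 0;       /* direct block: no metadata */
                    } else {
                            lblock -= ndir;
                            meta = order_base_2(lblock) / addr_bits + 1;
                    }
                    printf("lblock %lu -> reserve up to %d metadata block(s)\n",
                           lblocks[i], meta);
            }
            return 0;
    }

For lblock 700 this prints 2 even though a single indirect block would strictly suffice; the estimate is deliberately pessimistic near level boundaries, and unused reservations are given back later (see ext4_da_update_reserve_space() below).
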
1100 1100
1101 /* 1101 /*
1102 * Calculate the number of metadata blocks we need to reserve 1102 * Calculate the number of metadata blocks we need to reserve
1103 * to allocate a block located at @lblock 1103 * to allocate a block located at @lblock
1104 */ 1104 */
1105 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 1105 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1106 { 1106 {
1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1108 return ext4_ext_calc_metadata_amount(inode, lblock); 1108 return ext4_ext_calc_metadata_amount(inode, lblock);
1109 1109
1110 return ext4_indirect_calc_metadata_amount(inode, lblock); 1110 return ext4_indirect_calc_metadata_amount(inode, lblock);
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Called with i_data_sem down, which is important since we can call 1114 * Called with i_data_sem down, which is important since we can call
1115 * ext4_discard_preallocations() from here. 1115 * ext4_discard_preallocations() from here.
1116 */ 1116 */
1117 void ext4_da_update_reserve_space(struct inode *inode, 1117 void ext4_da_update_reserve_space(struct inode *inode,
1118 int used, int quota_claim) 1118 int used, int quota_claim)
1119 { 1119 {
1120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1121 struct ext4_inode_info *ei = EXT4_I(inode); 1121 struct ext4_inode_info *ei = EXT4_I(inode);
1122 1122
1123 spin_lock(&ei->i_block_reservation_lock); 1123 spin_lock(&ei->i_block_reservation_lock);
1124 trace_ext4_da_update_reserve_space(inode, used); 1124 trace_ext4_da_update_reserve_space(inode, used);
1125 if (unlikely(used > ei->i_reserved_data_blocks)) { 1125 if (unlikely(used > ei->i_reserved_data_blocks)) {
1126 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 1126 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1127 "with only %d reserved data blocks\n", 1127 "with only %d reserved data blocks\n",
1128 __func__, inode->i_ino, used, 1128 __func__, inode->i_ino, used,
1129 ei->i_reserved_data_blocks); 1129 ei->i_reserved_data_blocks);
1130 WARN_ON(1); 1130 WARN_ON(1);
1131 used = ei->i_reserved_data_blocks; 1131 used = ei->i_reserved_data_blocks;
1132 } 1132 }
1133 1133
1134 /* Update per-inode reservations */ 1134 /* Update per-inode reservations */
1135 ei->i_reserved_data_blocks -= used; 1135 ei->i_reserved_data_blocks -= used;
1136 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1136 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1137 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1137 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1138 used + ei->i_allocated_meta_blocks); 1138 used + ei->i_allocated_meta_blocks);
1139 ei->i_allocated_meta_blocks = 0; 1139 ei->i_allocated_meta_blocks = 0;
1140 1140
1141 if (ei->i_reserved_data_blocks == 0) { 1141 if (ei->i_reserved_data_blocks == 0) {
1142 /* 1142 /*
1143 * We can release all of the reserved metadata blocks 1143 * We can release all of the reserved metadata blocks
1144 * only when we have written all of the delayed 1144 * only when we have written all of the delayed
1145 * allocation blocks. 1145 * allocation blocks.
1146 */ 1146 */
1147 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1147 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1148 ei->i_reserved_meta_blocks); 1148 ei->i_reserved_meta_blocks);
1149 ei->i_reserved_meta_blocks = 0; 1149 ei->i_reserved_meta_blocks = 0;
1150 ei->i_da_metadata_calc_len = 0; 1150 ei->i_da_metadata_calc_len = 0;
1151 } 1151 }
1152 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1152 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1153 1153
1154 /* Update quota subsystem for data blocks */ 1154 /* Update quota subsystem for data blocks */
1155 if (quota_claim) 1155 if (quota_claim)
1156 dquot_claim_block(inode, used); 1156 dquot_claim_block(inode, used);
1157 else { 1157 else {
1158 /* 1158 /*
1159 * We did fallocate with an offset that is already delayed 1159 * We did fallocate with an offset that is already delayed
1160 * allocated. So on delayed allocated writeback we should 1160 * allocated. So on delayed allocated writeback we should
1161 * not re-claim the quota for fallocated blocks. 1161 * not re-claim the quota for fallocated blocks.
1162 */ 1162 */
1163 dquot_release_reservation_block(inode, used); 1163 dquot_release_reservation_block(inode, used);
1164 } 1164 }
1165 1165
1166 /* 1166 /*
1167 * If we have done all the pending block allocations and if 1167 * If we have done all the pending block allocations and if
1168 * there aren't any writers on the inode, we can discard the 1168 * there aren't any writers on the inode, we can discard the
1169 * inode's preallocations. 1169 * inode's preallocations.
1170 */ 1170 */
1171 if ((ei->i_reserved_data_blocks == 0) && 1171 if ((ei->i_reserved_data_blocks == 0) &&
1172 (atomic_read(&inode->i_writecount) == 0)) 1172 (atomic_read(&inode->i_writecount) == 0))
1173 ext4_discard_preallocations(inode); 1173 ext4_discard_preallocations(inode);
1174 } 1174 }
1175 1175
1176 static int __check_block_validity(struct inode *inode, const char *func, 1176 static int __check_block_validity(struct inode *inode, const char *func,
1177 unsigned int line, 1177 unsigned int line,
1178 struct ext4_map_blocks *map) 1178 struct ext4_map_blocks *map)
1179 { 1179 {
1180 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, 1180 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1181 map->m_len)) { 1181 map->m_len)) {
1182 ext4_error_inode(inode, func, line, map->m_pblk, 1182 ext4_error_inode(inode, func, line, map->m_pblk,
1183 "lblock %lu mapped to illegal pblock " 1183 "lblock %lu mapped to illegal pblock "
1184 "(length %d)", (unsigned long) map->m_lblk, 1184 "(length %d)", (unsigned long) map->m_lblk,
1185 map->m_len); 1185 map->m_len);
1186 return -EIO; 1186 return -EIO;
1187 } 1187 }
1188 return 0; 1188 return 0;
1189 } 1189 }
1190 1190
1191 #define check_block_validity(inode, map) \ 1191 #define check_block_validity(inode, map) \
1192 __check_block_validity((inode), __func__, __LINE__, (map)) 1192 __check_block_validity((inode), __func__, __LINE__, (map))
1193 1193
1194 /* 1194 /*
1195 * Return the number of contiguous dirty pages in a given inode 1195 * Return the number of contiguous dirty pages in a given inode
1196 * starting at page frame idx. 1196 * starting at page frame idx.
1197 */ 1197 */
1198 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, 1198 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1199 unsigned int max_pages) 1199 unsigned int max_pages)
1200 { 1200 {
1201 struct address_space *mapping = inode->i_mapping; 1201 struct address_space *mapping = inode->i_mapping;
1202 pgoff_t index; 1202 pgoff_t index;
1203 struct pagevec pvec; 1203 struct pagevec pvec;
1204 pgoff_t num = 0; 1204 pgoff_t num = 0;
1205 int i, nr_pages, done = 0; 1205 int i, nr_pages, done = 0;
1206 1206
1207 if (max_pages == 0) 1207 if (max_pages == 0)
1208 return 0; 1208 return 0;
1209 pagevec_init(&pvec, 0); 1209 pagevec_init(&pvec, 0);
1210 while (!done) { 1210 while (!done) {
1211 index = idx; 1211 index = idx;
1212 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1212 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1213 PAGECACHE_TAG_DIRTY, 1213 PAGECACHE_TAG_DIRTY,
1214 (pgoff_t)PAGEVEC_SIZE); 1214 (pgoff_t)PAGEVEC_SIZE);
1215 if (nr_pages == 0) 1215 if (nr_pages == 0)
1216 break; 1216 break;
1217 for (i = 0; i < nr_pages; i++) { 1217 for (i = 0; i < nr_pages; i++) {
1218 struct page *page = pvec.pages[i]; 1218 struct page *page = pvec.pages[i];
1219 struct buffer_head *bh, *head; 1219 struct buffer_head *bh, *head;
1220 1220
1221 lock_page(page); 1221 lock_page(page);
1222 if (unlikely(page->mapping != mapping) || 1222 if (unlikely(page->mapping != mapping) ||
1223 !PageDirty(page) || 1223 !PageDirty(page) ||
1224 PageWriteback(page) || 1224 PageWriteback(page) ||
1225 page->index != idx) { 1225 page->index != idx) {
1226 done = 1; 1226 done = 1;
1227 unlock_page(page); 1227 unlock_page(page);
1228 break; 1228 break;
1229 } 1229 }
1230 if (page_has_buffers(page)) { 1230 if (page_has_buffers(page)) {
1231 bh = head = page_buffers(page); 1231 bh = head = page_buffers(page);
1232 do { 1232 do {
1233 if (!buffer_delay(bh) && 1233 if (!buffer_delay(bh) &&
1234 !buffer_unwritten(bh)) 1234 !buffer_unwritten(bh))
1235 done = 1; 1235 done = 1;
1236 bh = bh->b_this_page; 1236 bh = bh->b_this_page;
1237 } while (!done && (bh != head)); 1237 } while (!done && (bh != head));
1238 } 1238 }
1239 unlock_page(page); 1239 unlock_page(page);
1240 if (done) 1240 if (done)
1241 break; 1241 break;
1242 idx++; 1242 idx++;
1243 num++; 1243 num++;
1244 if (num >= max_pages) { 1244 if (num >= max_pages) {
1245 done = 1; 1245 done = 1;
1246 break; 1246 break;
1247 } 1247 }
1248 } 1248 }
1249 pagevec_release(&pvec); 1249 pagevec_release(&pvec);
1250 } 1250 }
1251 return num; 1251 return num;
1252 } 1252 }
1253 1253
1254 /* 1254 /*
1255 * The ext4_map_blocks() function tries to look up the requested blocks, 1255 * The ext4_map_blocks() function tries to look up the requested blocks,
1256 * and returns them if they are already mapped. 1256 * and returns them if they are already mapped.
1257 * 1257 *
1258 * Otherwise it takes the write lock of i_data_sem, allocates blocks, 1258 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
1259 * stores the allocated blocks in the result buffer head and marks it 1259 * stores the allocated blocks in the result buffer head and marks it
1260 * mapped. 1260 * mapped.
1261 * 1261 *
1262 * If the file is extent based, it calls ext4_ext_map_blocks(); 1262 * If the file is extent based, it calls ext4_ext_map_blocks();
1263 * otherwise it calls ext4_ind_map_blocks() to handle indirect-mapped 1263 * otherwise it calls ext4_ind_map_blocks() to handle indirect-mapped
1264 * files. 1264 * files.
1265 * 1265 *
1266 * On success, it returns the number of blocks mapped or allocated. 1266 * On success, it returns the number of blocks mapped or allocated.
1267 * If create == 0 and the blocks are pre-allocated and uninitialized, 1267 * If create == 0 and the blocks are pre-allocated and uninitialized,
1268 * the result buffer head is unmapped. If create == 1, it makes sure 1268 * the result buffer head is unmapped. If create == 1, it makes sure
1269 * the buffer head is mapped. 1269 * the buffer head is mapped.
1270 * 1270 *
1271 * It returns 0 if a plain lookup failed (blocks have not been allocated); 1271 * It returns 0 if a plain lookup failed (blocks have not been allocated);
1272 * in that case the buffer head is unmapped. 1272 * in that case the buffer head is unmapped.
1273 * 1273 *
1274 * It returns the error in case of allocation failure. 1274 * It returns the error in case of allocation failure.
1275 */ 1275 */
1276 int ext4_map_blocks(handle_t *handle, struct inode *inode, 1276 int ext4_map_blocks(handle_t *handle, struct inode *inode,
1277 struct ext4_map_blocks *map, int flags) 1277 struct ext4_map_blocks *map, int flags)
1278 { 1278 {
1279 int retval; 1279 int retval;
1280 1280
1281 map->m_flags = 0; 1281 map->m_flags = 0;
1282 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," 1282 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1283 "logical block %lu\n", inode->i_ino, flags, map->m_len, 1283 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1284 (unsigned long) map->m_lblk); 1284 (unsigned long) map->m_lblk);
1285 /* 1285 /*
1286 * Try to see if we can get the block without requesting a new 1286 * Try to see if we can get the block without requesting a new
1287 * file system block. 1287 * file system block.
1288 */ 1288 */
1289 down_read((&EXT4_I(inode)->i_data_sem)); 1289 down_read((&EXT4_I(inode)->i_data_sem));
1290 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 1290 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1291 retval = ext4_ext_map_blocks(handle, inode, map, 0); 1291 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1292 } else { 1292 } else {
1293 retval = ext4_ind_map_blocks(handle, inode, map, 0); 1293 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1294 } 1294 }
1295 up_read((&EXT4_I(inode)->i_data_sem)); 1295 up_read((&EXT4_I(inode)->i_data_sem));
1296 1296
1297 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1297 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1298 int ret = check_block_validity(inode, map); 1298 int ret = check_block_validity(inode, map);
1299 if (ret != 0) 1299 if (ret != 0)
1300 return ret; 1300 return ret;
1301 } 1301 }
1302 1302
1303 /* If it is only a block(s) lookup */ 1303 /* If it is only a block(s) lookup */
1304 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) 1304 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1305 return retval; 1305 return retval;
1306 1306
1307 /* 1307 /*
1308 * Return if the blocks have already been allocated. 1308 * Return if the blocks have already been allocated.
1309 * 1309 *
1310 * Note that if blocks have been preallocated, 1310 * Note that if blocks have been preallocated,
1311 * ext4_ext_get_block() returns as it does for create == 0, 1311 * ext4_ext_get_block() returns as it does for create == 0,
1312 * with the buffer head unmapped. 1312 * with the buffer head unmapped.
1313 */ 1313 */
1314 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 1314 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1315 return retval; 1315 return retval;
1316 1316
1317 /* 1317 /*
1318 * When we call get_blocks without the create flag, the 1318 * When we call get_blocks without the create flag, the
1319 * BH_Unwritten flag could have gotten set if the blocks 1319 * BH_Unwritten flag could have gotten set if the blocks
1320 * requested were part of an uninitialized extent. We need to 1320 * requested were part of an uninitialized extent. We need to
1321 * clear this flag now that we are committed to convert all or 1321 * clear this flag now that we are committed to convert all or
1322 * part of the uninitialized extent to be an initialized 1322 * part of the uninitialized extent to be an initialized
1323 * extent. This is because we need to avoid the combination 1323 * extent. This is because we need to avoid the combination
1324 * of BH_Unwritten and BH_Mapped flags being simultaneously 1324 * of BH_Unwritten and BH_Mapped flags being simultaneously
1325 * set on the buffer_head. 1325 * set on the buffer_head.
1326 */ 1326 */
1327 map->m_flags &= ~EXT4_MAP_UNWRITTEN; 1327 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1328 1328
1329 /* 1329 /*
1330 * Allocating new blocks and/or writing to an uninitialized extent 1330 * Allocating new blocks and/or writing to an uninitialized extent
1331 * may result in updating i_data, so we take 1331 * may result in updating i_data, so we take
1332 * the write lock of i_data_sem and call get_blocks() 1332 * the write lock of i_data_sem and call get_blocks()
1333 * with the create == 1 flag. 1333 * with the create == 1 flag.
1334 */ 1334 */
1335 down_write((&EXT4_I(inode)->i_data_sem)); 1335 down_write((&EXT4_I(inode)->i_data_sem));
1336 1336
1337 /* 1337 /*
1338 * If the caller is from the delayed allocation writeout path, 1338 * If the caller is from the delayed allocation writeout path,
1339 * we have already reserved fs blocks for allocation; 1339 * we have already reserved fs blocks for allocation;
1340 * let the underlying get_block() function know to 1340 * let the underlying get_block() function know to
1341 * avoid double accounting. 1341 * avoid double accounting.
1342 */ 1342 */
1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1345 /* 1345 /*
1346 * We need to check for EXT4 here because migrate 1346 * We need to check for EXT4 here because migrate
1347 * could have changed the inode type in between 1347 * could have changed the inode type in between
1348 */ 1348 */
1349 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 1349 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1350 retval = ext4_ext_map_blocks(handle, inode, map, flags); 1350 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1351 } else { 1351 } else {
1352 retval = ext4_ind_map_blocks(handle, inode, map, flags); 1352 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1353 1353
1354 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { 1354 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1355 /* 1355 /*
1356 * We allocated new blocks which will result in 1356 * We allocated new blocks which will result in
1357 * i_data's format changing. Force the migrate 1357 * i_data's format changing. Force the migrate
1358 * to fail by clearing migrate flags 1358 * to fail by clearing migrate flags
1359 */ 1359 */
1360 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); 1360 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1361 } 1361 }
1362 1362
1363 /* 1363 /*
1364 * Update reserved blocks/metadata blocks after successful 1364 * Update reserved blocks/metadata blocks after successful
1365 * block allocation which had been deferred till now. We don't 1365 * block allocation which had been deferred till now. We don't
1366 * support fallocate for non extent files. So we can update 1366 * support fallocate for non extent files. So we can update
1367 * reserve space here. 1367 * reserve space here.
1368 */ 1368 */
1369 if ((retval > 0) && 1369 if ((retval > 0) &&
1370 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 1370 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1371 ext4_da_update_reserve_space(inode, retval, 1); 1371 ext4_da_update_reserve_space(inode, retval, 1);
1372 } 1372 }
1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1375 1375
1376 up_write((&EXT4_I(inode)->i_data_sem)); 1376 up_write((&EXT4_I(inode)->i_data_sem));
1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1378 int ret = check_block_validity(inode, map); 1378 int ret = check_block_validity(inode, map);
1379 if (ret != 0) 1379 if (ret != 0)
1380 return ret; 1380 return ret;
1381 } 1381 }
1382 return retval; 1382 return retval;
1383 } 1383 }
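
The contract a caller of ext4_map_blocks() sees is: ret > 0 with EXT4_MAP_MAPPED set means ret blocks are mapped starting at m_pblk; ret == 0 is a lookup miss (a hole when create == 0); ret < 0 is an error. A toy userspace model of handling that contract follows; the stub, flag values and block numbers are invented, not ext4's:

    #include <stdio.h>

    #define MAP_MAPPED 0x1
    #define MAP_NEW    0x2

    struct map_blocks {
            unsigned long m_lblk;   /* logical block requested */
            unsigned long m_pblk;   /* first physical block found */
            unsigned int m_len;     /* blocks requested / mapped */
            unsigned int m_flags;
    };

    /* Stub: pretend logical blocks 0-99 are mapped 1:1 at physical 8192. */
    static int map_blocks_stub(struct map_blocks *map, int create)
    {
            if (map->m_lblk < 100) {
                    map->m_pblk = 8192 + map->m_lblk;
                    map->m_flags |= MAP_MAPPED;
                    return map->m_len;
            }
            if (!create)
                    return 0;               /* plain lookup miss: a hole */
            map->m_pblk = 16384;            /* "allocated" somewhere new */
            map->m_flags |= MAP_MAPPED | MAP_NEW;
            return map->m_len;
    }

    int main(void)
    {
            struct map_blocks map = { .m_lblk = 42, .m_len = 8 };
            int ret = map_blocks_stub(&map, 0);

            if (ret > 0 && (map.m_flags & MAP_MAPPED))
                    printf("mapped %d block(s) at pblk %lu\n", ret, map.m_pblk);
            else if (ret == 0)
                    printf("hole: nothing allocated\n");
            else
                    printf("error %d\n", ret);
            return 0;
    }
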
1384 1384
1385 /* Maximum number of blocks we map for direct IO at once. */ 1385 /* Maximum number of blocks we map for direct IO at once. */
1386 #define DIO_MAX_BLOCKS 4096 1386 #define DIO_MAX_BLOCKS 4096
1387 1387
1388 static int _ext4_get_block(struct inode *inode, sector_t iblock, 1388 static int _ext4_get_block(struct inode *inode, sector_t iblock,
1389 struct buffer_head *bh, int flags) 1389 struct buffer_head *bh, int flags)
1390 { 1390 {
1391 handle_t *handle = ext4_journal_current_handle(); 1391 handle_t *handle = ext4_journal_current_handle();
1392 struct ext4_map_blocks map; 1392 struct ext4_map_blocks map;
1393 int ret = 0, started = 0; 1393 int ret = 0, started = 0;
1394 int dio_credits; 1394 int dio_credits;
1395 1395
1396 map.m_lblk = iblock; 1396 map.m_lblk = iblock;
1397 map.m_len = bh->b_size >> inode->i_blkbits; 1397 map.m_len = bh->b_size >> inode->i_blkbits;
1398 1398
1399 if (flags && !handle) { 1399 if (flags && !handle) {
1400 /* Direct IO write... */ 1400 /* Direct IO write... */
1401 if (map.m_len > DIO_MAX_BLOCKS) 1401 if (map.m_len > DIO_MAX_BLOCKS)
1402 map.m_len = DIO_MAX_BLOCKS; 1402 map.m_len = DIO_MAX_BLOCKS;
1403 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); 1403 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1404 handle = ext4_journal_start(inode, dio_credits); 1404 handle = ext4_journal_start(inode, dio_credits);
1405 if (IS_ERR(handle)) { 1405 if (IS_ERR(handle)) {
1406 ret = PTR_ERR(handle); 1406 ret = PTR_ERR(handle);
1407 return ret; 1407 return ret;
1408 } 1408 }
1409 started = 1; 1409 started = 1;
1410 } 1410 }
1411 1411
1412 ret = ext4_map_blocks(handle, inode, &map, flags); 1412 ret = ext4_map_blocks(handle, inode, &map, flags);
1413 if (ret > 0) { 1413 if (ret > 0) {
1414 map_bh(bh, inode->i_sb, map.m_pblk); 1414 map_bh(bh, inode->i_sb, map.m_pblk);
1415 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 1415 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1416 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 1416 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1417 ret = 0; 1417 ret = 0;
1418 } 1418 }
1419 if (started) 1419 if (started)
1420 ext4_journal_stop(handle); 1420 ext4_journal_stop(handle);
1421 return ret; 1421 return ret;
1422 } 1422 }
1423 1423
1424 int ext4_get_block(struct inode *inode, sector_t iblock, 1424 int ext4_get_block(struct inode *inode, sector_t iblock,
1425 struct buffer_head *bh, int create) 1425 struct buffer_head *bh, int create)
1426 { 1426 {
1427 return _ext4_get_block(inode, iblock, bh, 1427 return _ext4_get_block(inode, iblock, bh,
1428 create ? EXT4_GET_BLOCKS_CREATE : 0); 1428 create ? EXT4_GET_BLOCKS_CREATE : 0);
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * `handle' can be NULL if create is zero 1432 * `handle' can be NULL if create is zero
1433 */ 1433 */
1434 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1434 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1435 ext4_lblk_t block, int create, int *errp) 1435 ext4_lblk_t block, int create, int *errp)
1436 { 1436 {
1437 struct ext4_map_blocks map; 1437 struct ext4_map_blocks map;
1438 struct buffer_head *bh; 1438 struct buffer_head *bh;
1439 int fatal = 0, err; 1439 int fatal = 0, err;
1440 1440
1441 J_ASSERT(handle != NULL || create == 0); 1441 J_ASSERT(handle != NULL || create == 0);
1442 1442
1443 map.m_lblk = block; 1443 map.m_lblk = block;
1444 map.m_len = 1; 1444 map.m_len = 1;
1445 err = ext4_map_blocks(handle, inode, &map, 1445 err = ext4_map_blocks(handle, inode, &map,
1446 create ? EXT4_GET_BLOCKS_CREATE : 0); 1446 create ? EXT4_GET_BLOCKS_CREATE : 0);
1447 1447
1448 if (err < 0) 1448 if (err < 0)
1449 *errp = err; 1449 *errp = err;
1450 if (err <= 0) 1450 if (err <= 0)
1451 return NULL; 1451 return NULL;
1452 *errp = 0; 1452 *errp = 0;
1453 1453
1454 bh = sb_getblk(inode->i_sb, map.m_pblk); 1454 bh = sb_getblk(inode->i_sb, map.m_pblk);
1455 if (!bh) { 1455 if (!bh) {
1456 *errp = -EIO; 1456 *errp = -EIO;
1457 return NULL; 1457 return NULL;
1458 } 1458 }
1459 if (map.m_flags & EXT4_MAP_NEW) { 1459 if (map.m_flags & EXT4_MAP_NEW) {
1460 J_ASSERT(create != 0); 1460 J_ASSERT(create != 0);
1461 J_ASSERT(handle != NULL); 1461 J_ASSERT(handle != NULL);
1462 1462
1463 /* 1463 /*
1464 * Now that we do not always journal data, we should 1464 * Now that we do not always journal data, we should
1465 * keep in mind whether this should always journal the 1465 * keep in mind whether this should always journal the
1466 * new buffer as metadata. For now, regular file 1466 * new buffer as metadata. For now, regular file
1467 * writes use ext4_get_block instead, so it's not a 1467 * writes use ext4_get_block instead, so it's not a
1468 * problem. 1468 * problem.
1469 */ 1469 */
1470 lock_buffer(bh); 1470 lock_buffer(bh);
1471 BUFFER_TRACE(bh, "call get_create_access"); 1471 BUFFER_TRACE(bh, "call get_create_access");
1472 fatal = ext4_journal_get_create_access(handle, bh); 1472 fatal = ext4_journal_get_create_access(handle, bh);
1473 if (!fatal && !buffer_uptodate(bh)) { 1473 if (!fatal && !buffer_uptodate(bh)) {
1474 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1474 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1475 set_buffer_uptodate(bh); 1475 set_buffer_uptodate(bh);
1476 } 1476 }
1477 unlock_buffer(bh); 1477 unlock_buffer(bh);
1478 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1478 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1479 err = ext4_handle_dirty_metadata(handle, inode, bh); 1479 err = ext4_handle_dirty_metadata(handle, inode, bh);
1480 if (!fatal) 1480 if (!fatal)
1481 fatal = err; 1481 fatal = err;
1482 } else { 1482 } else {
1483 BUFFER_TRACE(bh, "not a new buffer"); 1483 BUFFER_TRACE(bh, "not a new buffer");
1484 } 1484 }
1485 if (fatal) { 1485 if (fatal) {
1486 *errp = fatal; 1486 *errp = fatal;
1487 brelse(bh); 1487 brelse(bh);
1488 bh = NULL; 1488 bh = NULL;
1489 } 1489 }
1490 return bh; 1490 return bh;
1491 } 1491 }
1492 1492
1493 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1493 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1494 ext4_lblk_t block, int create, int *err) 1494 ext4_lblk_t block, int create, int *err)
1495 { 1495 {
1496 struct buffer_head *bh; 1496 struct buffer_head *bh;
1497 1497
1498 bh = ext4_getblk(handle, inode, block, create, err); 1498 bh = ext4_getblk(handle, inode, block, create, err);
1499 if (!bh) 1499 if (!bh)
1500 return bh; 1500 return bh;
1501 if (buffer_uptodate(bh)) 1501 if (buffer_uptodate(bh))
1502 return bh; 1502 return bh;
1503 ll_rw_block(READ_META, 1, &bh); 1503 ll_rw_block(READ_META, 1, &bh);
1504 wait_on_buffer(bh); 1504 wait_on_buffer(bh);
1505 if (buffer_uptodate(bh)) 1505 if (buffer_uptodate(bh))
1506 return bh; 1506 return bh;
1507 put_bh(bh); 1507 put_bh(bh);
1508 *err = -EIO; 1508 *err = -EIO;
1509 return NULL; 1509 return NULL;
1510 } 1510 }
1511 1511
1512 static int walk_page_buffers(handle_t *handle, 1512 static int walk_page_buffers(handle_t *handle,
1513 struct buffer_head *head, 1513 struct buffer_head *head,
1514 unsigned from, 1514 unsigned from,
1515 unsigned to, 1515 unsigned to,
1516 int *partial, 1516 int *partial,
1517 int (*fn)(handle_t *handle, 1517 int (*fn)(handle_t *handle,
1518 struct buffer_head *bh)) 1518 struct buffer_head *bh))
1519 { 1519 {
1520 struct buffer_head *bh; 1520 struct buffer_head *bh;
1521 unsigned block_start, block_end; 1521 unsigned block_start, block_end;
1522 unsigned blocksize = head->b_size; 1522 unsigned blocksize = head->b_size;
1523 int err, ret = 0; 1523 int err, ret = 0;
1524 struct buffer_head *next; 1524 struct buffer_head *next;
1525 1525
1526 for (bh = head, block_start = 0; 1526 for (bh = head, block_start = 0;
1527 ret == 0 && (bh != head || !block_start); 1527 ret == 0 && (bh != head || !block_start);
1528 block_start = block_end, bh = next) { 1528 block_start = block_end, bh = next) {
1529 next = bh->b_this_page; 1529 next = bh->b_this_page;
1530 block_end = block_start + blocksize; 1530 block_end = block_start + blocksize;
1531 if (block_end <= from || block_start >= to) { 1531 if (block_end <= from || block_start >= to) {
1532 if (partial && !buffer_uptodate(bh)) 1532 if (partial && !buffer_uptodate(bh))
1533 *partial = 1; 1533 *partial = 1;
1534 continue; 1534 continue;
1535 } 1535 }
1536 err = (*fn)(handle, bh); 1536 err = (*fn)(handle, bh);
1537 if (!ret) 1537 if (!ret)
1538 ret = err; 1538 ret = err;
1539 } 1539 }
1540 return ret; 1540 return ret;
1541 } 1541 }
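
walk_page_buffers() simply visits each block-sized buffer of the page that overlaps the byte range [from, to) and applies fn to it; buffers wholly outside the range are only checked for uptodate-ness. A self-contained model of that range test, assuming 1 KiB buffers in a 4 KiB page and an invented write range:

    #include <stdio.h>

    int main(void)
    {
            const unsigned int blocksize = 1024, pagesize = 4096;
            unsigned int from = 1500, to = 3000;    /* hypothetical write range */
            unsigned int start, end;

            for (start = 0; start < pagesize; start += blocksize) {
                    end = start + blocksize;
                    if (end <= from || start >= to) {
                            printf("buffer %4u-%4u: outside write, skipped\n",
                                   start, end);
                            continue;
                    }
                    printf("buffer %4u-%4u: fn() applied\n", start, end);
            }
            return 0;
    }
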
1542 1542
1543 /* 1543 /*
1544 * To preserve ordering, it is essential that the hole instantiation and 1544 * To preserve ordering, it is essential that the hole instantiation and
1545 * the data write be encapsulated in a single transaction. We cannot 1545 * the data write be encapsulated in a single transaction. We cannot
1546 * close off a transaction and start a new one between the ext4_get_block() 1546 * close off a transaction and start a new one between the ext4_get_block()
1547 * and the commit_write(). So doing the jbd2_journal_start at the start of 1547 * and the commit_write(). So doing the jbd2_journal_start at the start of
1548 * prepare_write() is the right place. 1548 * prepare_write() is the right place.
1549 * 1549 *
1550 * Also, this function can nest inside ext4_writepage() -> 1550 * Also, this function can nest inside ext4_writepage() ->
1551 * block_write_full_page(). In that case, we *know* that ext4_writepage() 1551 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1552 * has generated enough buffer credits to do the whole page. So we won't 1552 * has generated enough buffer credits to do the whole page. So we won't
1553 * block on the journal in that case, which is good, because the caller may 1553 * block on the journal in that case, which is good, because the caller may
1554 * be PF_MEMALLOC. 1554 * be PF_MEMALLOC.
1555 * 1555 *
1556 * By accident, ext4 can be reentered when a transaction is open via 1556 * By accident, ext4 can be reentered when a transaction is open via
1557 * quota file writes. If we were to commit the transaction while thus 1557 * quota file writes. If we were to commit the transaction while thus
1558 * reentered, there can be a deadlock - we would be holding a quota 1558 * reentered, there can be a deadlock - we would be holding a quota
1559 * lock, and the commit would never complete if another thread had a 1559 * lock, and the commit would never complete if another thread had a
1560 * transaction open and was blocking on the quota lock - a ranking 1560 * transaction open and was blocking on the quota lock - a ranking
1561 * violation. 1561 * violation.
1562 * 1562 *
1563 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1563 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1564 * will _not_ run commit under these circumstances because handle->h_ref 1564 * will _not_ run commit under these circumstances because handle->h_ref
1565 * is elevated. We'll still have enough credits for the tiny quotafile 1565 * is elevated. We'll still have enough credits for the tiny quotafile
1566 * write. 1566 * write.
1567 */ 1567 */
1568 static int do_journal_get_write_access(handle_t *handle, 1568 static int do_journal_get_write_access(handle_t *handle,
1569 struct buffer_head *bh) 1569 struct buffer_head *bh)
1570 { 1570 {
1571 int dirty = buffer_dirty(bh); 1571 int dirty = buffer_dirty(bh);
1572 int ret; 1572 int ret;
1573 1573
1574 if (!buffer_mapped(bh) || buffer_freed(bh)) 1574 if (!buffer_mapped(bh) || buffer_freed(bh))
1575 return 0; 1575 return 0;
1576 /* 1576 /*
1577 * __block_write_begin() could have dirtied some buffers. Clean 1577 * __block_write_begin() could have dirtied some buffers. Clean
1578 * the dirty bit as jbd2_journal_get_write_access() could complain 1578 * the dirty bit as jbd2_journal_get_write_access() could complain
1579 * otherwise about fs integrity issues. Setting of the dirty bit 1579 * otherwise about fs integrity issues. Setting of the dirty bit
1580 * by __block_write_begin() isn't a real problem here as we clear 1580 * by __block_write_begin() isn't a real problem here as we clear
1581 * the bit before releasing a page lock and thus writeback cannot 1581 * the bit before releasing a page lock and thus writeback cannot
1582 * ever write the buffer. 1582 * ever write the buffer.
1583 */ 1583 */
1584 if (dirty) 1584 if (dirty)
1585 clear_buffer_dirty(bh); 1585 clear_buffer_dirty(bh);
1586 ret = ext4_journal_get_write_access(handle, bh); 1586 ret = ext4_journal_get_write_access(handle, bh);
1587 if (!ret && dirty) 1587 if (!ret && dirty)
1588 ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1588 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1589 return ret; 1589 return ret;
1590 } 1590 }
1591 1591
1592 /* 1592 /*
1593 * Truncate blocks that were not used by write. We have to truncate the 1593 * Truncate blocks that were not used by write. We have to truncate the
1594 * pagecache as well so that corresponding buffers get properly unmapped. 1594 * pagecache as well so that corresponding buffers get properly unmapped.
1595 */ 1595 */
1596 static void ext4_truncate_failed_write(struct inode *inode) 1596 static void ext4_truncate_failed_write(struct inode *inode)
1597 { 1597 {
1598 truncate_inode_pages(inode->i_mapping, inode->i_size); 1598 truncate_inode_pages(inode->i_mapping, inode->i_size);
1599 ext4_truncate(inode); 1599 ext4_truncate(inode);
1600 } 1600 }
1601 1601
1602 static int ext4_get_block_write(struct inode *inode, sector_t iblock, 1602 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1603 struct buffer_head *bh_result, int create); 1603 struct buffer_head *bh_result, int create);
1604 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1604 static int ext4_write_begin(struct file *file, struct address_space *mapping,
1605 loff_t pos, unsigned len, unsigned flags, 1605 loff_t pos, unsigned len, unsigned flags,
1606 struct page **pagep, void **fsdata) 1606 struct page **pagep, void **fsdata)
1607 { 1607 {
1608 struct inode *inode = mapping->host; 1608 struct inode *inode = mapping->host;
1609 int ret, needed_blocks; 1609 int ret, needed_blocks;
1610 handle_t *handle; 1610 handle_t *handle;
1611 int retries = 0; 1611 int retries = 0;
1612 struct page *page; 1612 struct page *page;
1613 pgoff_t index; 1613 pgoff_t index;
1614 unsigned from, to; 1614 unsigned from, to;
1615 1615
1616 trace_ext4_write_begin(inode, pos, len, flags); 1616 trace_ext4_write_begin(inode, pos, len, flags);
1617 /* 1617 /*
1618 * Reserve one block more for addition to orphan list in case 1618 * Reserve one block more for addition to orphan list in case
1619 * we allocate blocks but write fails for some reason 1619 * we allocate blocks but write fails for some reason
1620 */ 1620 */
1621 needed_blocks = ext4_writepage_trans_blocks(inode) + 1; 1621 needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1622 index = pos >> PAGE_CACHE_SHIFT; 1622 index = pos >> PAGE_CACHE_SHIFT;
1623 from = pos & (PAGE_CACHE_SIZE - 1); 1623 from = pos & (PAGE_CACHE_SIZE - 1);
1624 to = from + len; 1624 to = from + len;
1625 1625
1626 retry: 1626 retry:
1627 handle = ext4_journal_start(inode, needed_blocks); 1627 handle = ext4_journal_start(inode, needed_blocks);
1628 if (IS_ERR(handle)) { 1628 if (IS_ERR(handle)) {
1629 ret = PTR_ERR(handle); 1629 ret = PTR_ERR(handle);
1630 goto out; 1630 goto out;
1631 } 1631 }
1632 1632
1633 /* We cannot recurse into the filesystem as the transaction is already 1633 /* We cannot recurse into the filesystem as the transaction is already
1634 * started */ 1634 * started */
1635 flags |= AOP_FLAG_NOFS; 1635 flags |= AOP_FLAG_NOFS;
1636 1636
1637 page = grab_cache_page_write_begin(mapping, index, flags); 1637 page = grab_cache_page_write_begin(mapping, index, flags);
1638 if (!page) { 1638 if (!page) {
1639 ext4_journal_stop(handle); 1639 ext4_journal_stop(handle);
1640 ret = -ENOMEM; 1640 ret = -ENOMEM;
1641 goto out; 1641 goto out;
1642 } 1642 }
1643 *pagep = page; 1643 *pagep = page;
1644 1644
1645 if (ext4_should_dioread_nolock(inode)) 1645 if (ext4_should_dioread_nolock(inode))
1646 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1646 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1647 else 1647 else
1648 ret = __block_write_begin(page, pos, len, ext4_get_block); 1648 ret = __block_write_begin(page, pos, len, ext4_get_block);
1649 1649
1650 if (!ret && ext4_should_journal_data(inode)) { 1650 if (!ret && ext4_should_journal_data(inode)) {
1651 ret = walk_page_buffers(handle, page_buffers(page), 1651 ret = walk_page_buffers(handle, page_buffers(page),
1652 from, to, NULL, do_journal_get_write_access); 1652 from, to, NULL, do_journal_get_write_access);
1653 } 1653 }
1654 1654
1655 if (ret) { 1655 if (ret) {
1656 unlock_page(page); 1656 unlock_page(page);
1657 page_cache_release(page); 1657 page_cache_release(page);
1658 /* 1658 /*
1659 * __block_write_begin may have instantiated a few blocks 1659 * __block_write_begin may have instantiated a few blocks
1660 * outside i_size. Trim these off again. Don't need 1660 * outside i_size. Trim these off again. Don't need
1661 * i_size_read because we hold i_mutex. 1661 * i_size_read because we hold i_mutex.
1662 * 1662 *
1663 * Add inode to orphan list in case we crash before 1663 * Add inode to orphan list in case we crash before
1664 * truncate finishes 1664 * truncate finishes
1665 */ 1665 */
1666 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1666 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1667 ext4_orphan_add(handle, inode); 1667 ext4_orphan_add(handle, inode);
1668 1668
1669 ext4_journal_stop(handle); 1669 ext4_journal_stop(handle);
1670 if (pos + len > inode->i_size) { 1670 if (pos + len > inode->i_size) {
1671 ext4_truncate_failed_write(inode); 1671 ext4_truncate_failed_write(inode);
1672 /* 1672 /*
1673 * If truncate failed early the inode might 1673 * If truncate failed early the inode might
1674 * still be on the orphan list; we need to 1674 * still be on the orphan list; we need to
1675 * make sure the inode is removed from the 1675 * make sure the inode is removed from the
1676 * orphan list in that case. 1676 * orphan list in that case.
1677 */ 1677 */
1678 if (inode->i_nlink) 1678 if (inode->i_nlink)
1679 ext4_orphan_del(NULL, inode); 1679 ext4_orphan_del(NULL, inode);
1680 } 1680 }
1681 } 1681 }
1682 1682
1683 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1683 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1684 goto retry; 1684 goto retry;
1685 out: 1685 out:
1686 return ret; 1686 return ret;
1687 } 1687 }
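
The retry: label above implements a common ext4 pattern: if block allocation fails with ENOSPC, ask ext4_should_retry_alloc() whether committing the journal might free reserved space, and if so start over with a fresh handle. A stripped-down model of just that control flow; both stubs are invented, and the real helper also forces a journal commit:

    #include <errno.h>
    #include <stdio.h>

    /* Stub allocation attempt: fail with ENOSPC twice, then succeed. */
    static int try_write(void)
    {
            static int calls;

            return ++calls < 3 ? -ENOSPC : 0;
    }

    /* Stub for ext4_should_retry_alloc(): allow up to 3 retries. */
    static int should_retry(int *retries)
    {
            return (*retries)++ < 3;
    }

    int main(void)
    {
            int retries = 0, ret;

            do {
                    ret = try_write();      /* start handle, write, stop handle */
            } while (ret == -ENOSPC && should_retry(&retries));

            printf("final ret = %d after %d retries\n", ret, retries);
            return 0;
    }
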
1688 1688
1689 /* For write_end() in data=journal mode */ 1689 /* For write_end() in data=journal mode */
1690 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1690 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1691 { 1691 {
1692 if (!buffer_mapped(bh) || buffer_freed(bh)) 1692 if (!buffer_mapped(bh) || buffer_freed(bh))
1693 return 0; 1693 return 0;
1694 set_buffer_uptodate(bh); 1694 set_buffer_uptodate(bh);
1695 return ext4_handle_dirty_metadata(handle, NULL, bh); 1695 return ext4_handle_dirty_metadata(handle, NULL, bh);
1696 } 1696 }
1697 1697
1698 static int ext4_generic_write_end(struct file *file, 1698 static int ext4_generic_write_end(struct file *file,
1699 struct address_space *mapping, 1699 struct address_space *mapping,
1700 loff_t pos, unsigned len, unsigned copied, 1700 loff_t pos, unsigned len, unsigned copied,
1701 struct page *page, void *fsdata) 1701 struct page *page, void *fsdata)
1702 { 1702 {
1703 int i_size_changed = 0; 1703 int i_size_changed = 0;
1704 struct inode *inode = mapping->host; 1704 struct inode *inode = mapping->host;
1705 handle_t *handle = ext4_journal_current_handle(); 1705 handle_t *handle = ext4_journal_current_handle();
1706 1706
1707 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1707 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1708 1708
1709 /* 1709 /*
1710 * No need to use i_size_read() here, the i_size 1710 * No need to use i_size_read() here, the i_size
1711 * cannot change under us because we hold i_mutex. 1711 * cannot change under us because we hold i_mutex.
1712 * 1712 *
1713 * But it's important to update i_size while still holding page lock: 1713 * But it's important to update i_size while still holding page lock:
1714 * page writeout could otherwise come in and zero beyond i_size. 1714 * page writeout could otherwise come in and zero beyond i_size.
1715 */ 1715 */
1716 if (pos + copied > inode->i_size) { 1716 if (pos + copied > inode->i_size) {
1717 i_size_write(inode, pos + copied); 1717 i_size_write(inode, pos + copied);
1718 i_size_changed = 1; 1718 i_size_changed = 1;
1719 } 1719 }
1720 1720
1721 if (pos + copied > EXT4_I(inode)->i_disksize) { 1721 if (pos + copied > EXT4_I(inode)->i_disksize) {
1722 /* We need to mark inode dirty even if 1722 /* We need to mark inode dirty even if
1723 * new_i_size is less than inode->i_size 1723 * new_i_size is less than inode->i_size
1724 * but greater than i_disksize (hint: delalloc) 1724 * but greater than i_disksize (hint: delalloc)
1725 */ 1725 */
1726 ext4_update_i_disksize(inode, (pos + copied)); 1726 ext4_update_i_disksize(inode, (pos + copied));
1727 i_size_changed = 1; 1727 i_size_changed = 1;
1728 } 1728 }
1729 unlock_page(page); 1729 unlock_page(page);
1730 page_cache_release(page); 1730 page_cache_release(page);
1731 1731
1732 /* 1732 /*
1733 * Don't mark the inode dirty under page lock. First, it unnecessarily 1733 * Don't mark the inode dirty under page lock. First, it unnecessarily
1734 * makes the holding time of page lock longer. Second, it forces lock 1734 * makes the holding time of page lock longer. Second, it forces lock
1735 * ordering of page lock and transaction start for journaling 1735 * ordering of page lock and transaction start for journaling
1736 * filesystems. 1736 * filesystems.
1737 */ 1737 */
1738 if (i_size_changed) 1738 if (i_size_changed)
1739 ext4_mark_inode_dirty(handle, inode); 1739 ext4_mark_inode_dirty(handle, inode);
1740 1740
1741 return copied; 1741 return copied;
1742 } 1742 }
1743 1743
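ext4 tracks two sizes per inode: the in-core i_size and i_disksize, the size the on-disk inode was last journalled with. ext4_generic_write_end() bumps whichever lags behind pos + copied and defers ext4_mark_inode_dirty() until the page lock is dropped. A toy model of just that size bookkeeping (all names invented for illustration):

    /* Toy model of the two-size update in ext4_generic_write_end(). */
    struct toy_inode {
        long long i_size;       /* in-core size, updated under page lock */
        long long i_disksize;   /* size the on-disk inode claims */
    };

    /* Returns 1 if the caller must mark the inode dirty afterwards. */
    static int update_sizes(struct toy_inode *inode, long long pos,
                            long long copied)
    {
        long long end = pos + copied;
        int dirty = 0;

        if (end > inode->i_size) {
            inode->i_size = end;        /* i_size_write() under page lock */
            dirty = 1;
        }
        if (end > inode->i_disksize) {
            inode->i_disksize = end;    /* delalloc can leave this behind */
            dirty = 1;
        }
        return dirty;                   /* dirtying happens after unlock_page() */
    }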
1744 /* 1744 /*
1745 * We need to pick up the new inode size which generic_commit_write gave us. 1745 * We need to pick up the new inode size which generic_commit_write gave us.
1746 * `file' can be NULL - eg, when called from page_symlink(). 1746 * `file' can be NULL - eg, when called from page_symlink().
1747 * 1747 *
1748 * ext4 never places buffers on inode->i_mapping->private_list. Metadata 1748 * ext4 never places buffers on inode->i_mapping->private_list. Metadata
1749 * buffers are managed internally. 1749 * buffers are managed internally.
1750 */ 1750 */
1751 static int ext4_ordered_write_end(struct file *file, 1751 static int ext4_ordered_write_end(struct file *file,
1752 struct address_space *mapping, 1752 struct address_space *mapping,
1753 loff_t pos, unsigned len, unsigned copied, 1753 loff_t pos, unsigned len, unsigned copied,
1754 struct page *page, void *fsdata) 1754 struct page *page, void *fsdata)
1755 { 1755 {
1756 handle_t *handle = ext4_journal_current_handle(); 1756 handle_t *handle = ext4_journal_current_handle();
1757 struct inode *inode = mapping->host; 1757 struct inode *inode = mapping->host;
1758 int ret = 0, ret2; 1758 int ret = 0, ret2;
1759 1759
1760 trace_ext4_ordered_write_end(inode, pos, len, copied); 1760 trace_ext4_ordered_write_end(inode, pos, len, copied);
1761 ret = ext4_jbd2_file_inode(handle, inode); 1761 ret = ext4_jbd2_file_inode(handle, inode);
1762 1762
1763 if (ret == 0) { 1763 if (ret == 0) {
1764 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1764 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1765 page, fsdata); 1765 page, fsdata);
1766 copied = ret2; 1766 copied = ret2;
1767 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1767 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1768 /* if we have allocated more blocks and copied 1768 /* if we have allocated more blocks and copied
1769 * less, we will have blocks allocated outside 1769 * less, we will have blocks allocated outside
1770 * inode->i_size, so truncate them 1770 * inode->i_size, so truncate them
1771 */ 1771 */
1772 ext4_orphan_add(handle, inode); 1772 ext4_orphan_add(handle, inode);
1773 if (ret2 < 0) 1773 if (ret2 < 0)
1774 ret = ret2; 1774 ret = ret2;
1775 } 1775 }
1776 ret2 = ext4_journal_stop(handle); 1776 ret2 = ext4_journal_stop(handle);
1777 if (!ret) 1777 if (!ret)
1778 ret = ret2; 1778 ret = ret2;
1779 1779
1780 if (pos + len > inode->i_size) { 1780 if (pos + len > inode->i_size) {
1781 ext4_truncate_failed_write(inode); 1781 ext4_truncate_failed_write(inode);
1782 /* 1782 /*
1783 * If truncate failed early the inode might still be 1783 * If truncate failed early the inode might still be
1784 * on the orphan list; we need to make sure the inode 1784 * on the orphan list; we need to make sure the inode
1785 * is removed from the orphan list in that case. 1785 * is removed from the orphan list in that case.
1786 */ 1786 */
1787 if (inode->i_nlink) 1787 if (inode->i_nlink)
1788 ext4_orphan_del(NULL, inode); 1788 ext4_orphan_del(NULL, inode);
1789 } 1789 }
1790 1790
1791 1791
1792 return ret ? ret : copied; 1792 return ret ? ret : copied;
1793 } 1793 }
1794 1794
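All the write_end variants above share the same error idiom: every cleanup step (journal stop, orphan processing) must run regardless of earlier failures, but only the first error is reported. A compact sketch of the pattern, with hypothetical step functions standing in for the real calls:

    /* Hypothetical steps standing in for journal stop, orphan handling, etc. */
    static int step_one(void)   { return 0; }
    static int step_two(void)   { return -5; }  /* pretend -EIO */
    static int step_three(void) { return 0; }

    /* Run every step unconditionally, but report only the first failure. */
    static int first_error_demo(void)
    {
        int ret = 0, ret2;

        ret = step_one();
        ret2 = step_two();          /* must run even if step_one failed */
        if (!ret)
            ret = ret2;
        ret2 = step_three();
        if (!ret)
            ret = ret2;
        return ret;                 /* here: -5, from step_two */
    }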
1795 static int ext4_writeback_write_end(struct file *file, 1795 static int ext4_writeback_write_end(struct file *file,
1796 struct address_space *mapping, 1796 struct address_space *mapping,
1797 loff_t pos, unsigned len, unsigned copied, 1797 loff_t pos, unsigned len, unsigned copied,
1798 struct page *page, void *fsdata) 1798 struct page *page, void *fsdata)
1799 { 1799 {
1800 handle_t *handle = ext4_journal_current_handle(); 1800 handle_t *handle = ext4_journal_current_handle();
1801 struct inode *inode = mapping->host; 1801 struct inode *inode = mapping->host;
1802 int ret = 0, ret2; 1802 int ret = 0, ret2;
1803 1803
1804 trace_ext4_writeback_write_end(inode, pos, len, copied); 1804 trace_ext4_writeback_write_end(inode, pos, len, copied);
1805 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1805 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1806 page, fsdata); 1806 page, fsdata);
1807 copied = ret2; 1807 copied = ret2;
1808 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1808 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1809 /* if we have allocated more blocks and copied 1809 /* if we have allocated more blocks and copied
1810 * less, we will have blocks allocated outside 1810 * less, we will have blocks allocated outside
1811 * inode->i_size, so truncate them 1811 * inode->i_size, so truncate them
1812 */ 1812 */
1813 ext4_orphan_add(handle, inode); 1813 ext4_orphan_add(handle, inode);
1814 1814
1815 if (ret2 < 0) 1815 if (ret2 < 0)
1816 ret = ret2; 1816 ret = ret2;
1817 1817
1818 ret2 = ext4_journal_stop(handle); 1818 ret2 = ext4_journal_stop(handle);
1819 if (!ret) 1819 if (!ret)
1820 ret = ret2; 1820 ret = ret2;
1821 1821
1822 if (pos + len > inode->i_size) { 1822 if (pos + len > inode->i_size) {
1823 ext4_truncate_failed_write(inode); 1823 ext4_truncate_failed_write(inode);
1824 /* 1824 /*
1825 * If truncate failed early the inode might still be 1825 * If truncate failed early the inode might still be
1826 * on the orphan list; we need to make sure the inode 1826 * on the orphan list; we need to make sure the inode
1827 * is removed from the orphan list in that case. 1827 * is removed from the orphan list in that case.
1828 */ 1828 */
1829 if (inode->i_nlink) 1829 if (inode->i_nlink)
1830 ext4_orphan_del(NULL, inode); 1830 ext4_orphan_del(NULL, inode);
1831 } 1831 }
1832 1832
1833 return ret ? ret : copied; 1833 return ret ? ret : copied;
1834 } 1834 }
1835 1835
1836 static int ext4_journalled_write_end(struct file *file, 1836 static int ext4_journalled_write_end(struct file *file,
1837 struct address_space *mapping, 1837 struct address_space *mapping,
1838 loff_t pos, unsigned len, unsigned copied, 1838 loff_t pos, unsigned len, unsigned copied,
1839 struct page *page, void *fsdata) 1839 struct page *page, void *fsdata)
1840 { 1840 {
1841 handle_t *handle = ext4_journal_current_handle(); 1841 handle_t *handle = ext4_journal_current_handle();
1842 struct inode *inode = mapping->host; 1842 struct inode *inode = mapping->host;
1843 int ret = 0, ret2; 1843 int ret = 0, ret2;
1844 int partial = 0; 1844 int partial = 0;
1845 unsigned from, to; 1845 unsigned from, to;
1846 loff_t new_i_size; 1846 loff_t new_i_size;
1847 1847
1848 trace_ext4_journalled_write_end(inode, pos, len, copied); 1848 trace_ext4_journalled_write_end(inode, pos, len, copied);
1849 from = pos & (PAGE_CACHE_SIZE - 1); 1849 from = pos & (PAGE_CACHE_SIZE - 1);
1850 to = from + len; 1850 to = from + len;
1851 1851
1852 if (copied < len) { 1852 if (copied < len) {
1853 if (!PageUptodate(page)) 1853 if (!PageUptodate(page))
1854 copied = 0; 1854 copied = 0;
1855 page_zero_new_buffers(page, from+copied, to); 1855 page_zero_new_buffers(page, from+copied, to);
1856 } 1856 }
1857 1857
1858 ret = walk_page_buffers(handle, page_buffers(page), from, 1858 ret = walk_page_buffers(handle, page_buffers(page), from,
1859 to, &partial, write_end_fn); 1859 to, &partial, write_end_fn);
1860 if (!partial) 1860 if (!partial)
1861 SetPageUptodate(page); 1861 SetPageUptodate(page);
1862 new_i_size = pos + copied; 1862 new_i_size = pos + copied;
1863 if (new_i_size > inode->i_size) 1863 if (new_i_size > inode->i_size)
1864 i_size_write(inode, pos+copied); 1864 i_size_write(inode, pos+copied);
1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1866 if (new_i_size > EXT4_I(inode)->i_disksize) { 1866 if (new_i_size > EXT4_I(inode)->i_disksize) {
1867 ext4_update_i_disksize(inode, new_i_size); 1867 ext4_update_i_disksize(inode, new_i_size);
1868 ret2 = ext4_mark_inode_dirty(handle, inode); 1868 ret2 = ext4_mark_inode_dirty(handle, inode);
1869 if (!ret) 1869 if (!ret)
1870 ret = ret2; 1870 ret = ret2;
1871 } 1871 }
1872 1872
1873 unlock_page(page); 1873 unlock_page(page);
1874 page_cache_release(page); 1874 page_cache_release(page);
1875 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1875 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1876 /* if we have allocated more blocks and copied 1876 /* if we have allocated more blocks and copied
1877 * less, we will have blocks allocated outside 1877 * less, we will have blocks allocated outside
1878 * inode->i_size, so truncate them 1878 * inode->i_size, so truncate them
1879 */ 1879 */
1880 ext4_orphan_add(handle, inode); 1880 ext4_orphan_add(handle, inode);
1881 1881
1882 ret2 = ext4_journal_stop(handle); 1882 ret2 = ext4_journal_stop(handle);
1883 if (!ret) 1883 if (!ret)
1884 ret = ret2; 1884 ret = ret2;
1885 if (pos + len > inode->i_size) { 1885 if (pos + len > inode->i_size) {
1886 ext4_truncate_failed_write(inode); 1886 ext4_truncate_failed_write(inode);
1887 /* 1887 /*
1888 * If truncate failed early the inode might still be 1888 * If truncate failed early the inode might still be
1889 * on the orphan list; we need to make sure the inode 1889 * on the orphan list; we need to make sure the inode
1890 * is removed from the orphan list in that case. 1890 * is removed from the orphan list in that case.
1891 */ 1891 */
1892 if (inode->i_nlink) 1892 if (inode->i_nlink)
1893 ext4_orphan_del(NULL, inode); 1893 ext4_orphan_del(NULL, inode);
1894 } 1894 }
1895 1895
1896 return ret ? ret : copied; 1896 return ret ? ret : copied;
1897 } 1897 }
1898 1898
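One subtlety in the journalled path above: if the copy from userspace was short and the page was not already uptodate, the partially copied data cannot be exposed, so copied is clamped to zero before page_zero_new_buffers() zeroes the rest. A toy model of that clamp (page_uptodate stands in for PageUptodate(page)):

    /* Toy model of the short-copy clamp in ext4_journalled_write_end(). */
    static unsigned int clamp_copied(unsigned int len, unsigned int copied,
                                     int page_uptodate)
    {
        if (copied < len && !page_uptodate)
            copied = 0;     /* can't expose a half-copied, half-stale page */
        return copied;      /* caller then zeroes [from + copied, to) */
    }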
1899 /* 1899 /*
1900 * Reserve a single block located at lblock 1900 * Reserve a single block located at lblock
1901 */ 1901 */
1902 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1902 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1903 { 1903 {
1904 int retries = 0; 1904 int retries = 0;
1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1906 struct ext4_inode_info *ei = EXT4_I(inode); 1906 struct ext4_inode_info *ei = EXT4_I(inode);
1907 unsigned long md_needed; 1907 unsigned long md_needed;
1908 int ret; 1908 int ret;
1909 1909
1910 /* 1910 /*
1911 * recalculate the number of metadata blocks to reserve 1911 * recalculate the number of metadata blocks to reserve
1912 * in order to allocate nrblocks 1912 * in order to allocate nrblocks
1913 * worst case is one extent per block 1913 * worst case is one extent per block
1914 */ 1914 */
1915 repeat: 1915 repeat:
1916 spin_lock(&ei->i_block_reservation_lock); 1916 spin_lock(&ei->i_block_reservation_lock);
1917 md_needed = ext4_calc_metadata_amount(inode, lblock); 1917 md_needed = ext4_calc_metadata_amount(inode, lblock);
1918 trace_ext4_da_reserve_space(inode, md_needed); 1918 trace_ext4_da_reserve_space(inode, md_needed);
1919 spin_unlock(&ei->i_block_reservation_lock); 1919 spin_unlock(&ei->i_block_reservation_lock);
1920 1920
1921 /* 1921 /*
1922 * We will charge metadata quota at writeout time; this saves 1922 * We will charge metadata quota at writeout time; this saves
1923 * us from metadata over-estimation, though we may go over by 1923 * us from metadata over-estimation, though we may go over by
1924 * a small amount in the end. Here we just reserve for data. 1924 * a small amount in the end. Here we just reserve for data.
1925 */ 1925 */
1926 ret = dquot_reserve_block(inode, 1); 1926 ret = dquot_reserve_block(inode, 1);
1927 if (ret) 1927 if (ret)
1928 return ret; 1928 return ret;
1929 /* 1929 /*
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
1937 goto repeat; 1937 goto repeat;
1938 } 1938 }
1939 return -ENOSPC; 1939 return -ENOSPC;
1940 } 1940 }
1941 spin_lock(&ei->i_block_reservation_lock); 1941 spin_lock(&ei->i_block_reservation_lock);
1942 ei->i_reserved_data_blocks++; 1942 ei->i_reserved_data_blocks++;
1943 ei->i_reserved_meta_blocks += md_needed; 1943 ei->i_reserved_meta_blocks += md_needed;
1944 spin_unlock(&ei->i_block_reservation_lock); 1944 spin_unlock(&ei->i_block_reservation_lock);
1945 1945
1946 return 0; /* success */ 1946 return 0; /* success */
1947 } 1947 }
1948 1948
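The reservation above is two-phase: the quota reservation is taken first, and if claiming free blocks from the superblock counters fails, the quota is rolled back before retrying (ext4_should_retry_alloc() waits for a journal commit that may free blocks). A userspace sketch of the shape of that dance, with invented helpers and a bounded retry standing in for the journal wait:

    /* Invented helpers; the real code uses dquot_reserve_block() and
     * ext4_claim_free_blocks(). */
    static int quota_reserve(int n)     { (void)n; return 0; }
    static void quota_release(int n)    { (void)n; }
    static int claim_free_blocks(int n) { (void)n; return 0; }

    static int reserve_one_block(void)
    {
        int retries = 0;

    repeat:
        if (quota_reserve(1))
            return -1;                  /* quota exhausted */
        if (claim_free_blocks(1)) {
            quota_release(1);           /* undo in reverse order */
            if (retries++ < 3)          /* ext4 waits on a journal commit here */
                goto repeat;
            return -1;                  /* -ENOSPC in the real code */
        }
        return 0;
    }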
1949 static void ext4_da_release_space(struct inode *inode, int to_free) 1949 static void ext4_da_release_space(struct inode *inode, int to_free)
1950 { 1950 {
1951 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1951 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1952 struct ext4_inode_info *ei = EXT4_I(inode); 1952 struct ext4_inode_info *ei = EXT4_I(inode);
1953 1953
1954 if (!to_free) 1954 if (!to_free)
1955 return; /* Nothing to release, exit */ 1955 return; /* Nothing to release, exit */
1956 1956
1957 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1957 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1958 1958
1959 trace_ext4_da_release_space(inode, to_free); 1959 trace_ext4_da_release_space(inode, to_free);
1960 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1960 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1961 /* 1961 /*
1962 * if there aren't enough reserved blocks, then the 1962 * if there aren't enough reserved blocks, then the
1963 * counter is messed up somewhere. Since this 1963 * counter is messed up somewhere. Since this
1964 * function is called from invalidatepage, it's 1964 * function is called from invalidatepage, it's
1965 * harmless to return without any action. 1965 * harmless to return without any action.
1966 */ 1966 */
1967 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " 1967 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1968 "ino %lu, to_free %d with only %d reserved " 1968 "ino %lu, to_free %d with only %d reserved "
1969 "data blocks\n", inode->i_ino, to_free, 1969 "data blocks\n", inode->i_ino, to_free,
1970 ei->i_reserved_data_blocks); 1970 ei->i_reserved_data_blocks);
1971 WARN_ON(1); 1971 WARN_ON(1);
1972 to_free = ei->i_reserved_data_blocks; 1972 to_free = ei->i_reserved_data_blocks;
1973 } 1973 }
1974 ei->i_reserved_data_blocks -= to_free; 1974 ei->i_reserved_data_blocks -= to_free;
1975 1975
1976 if (ei->i_reserved_data_blocks == 0) { 1976 if (ei->i_reserved_data_blocks == 0) {
1977 /* 1977 /*
1978 * We can release all of the reserved metadata blocks 1978 * We can release all of the reserved metadata blocks
1979 * only when we have written all of the delayed 1979 * only when we have written all of the delayed
1980 * allocation blocks. 1980 * allocation blocks.
1981 */ 1981 */
1982 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1982 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1983 ei->i_reserved_meta_blocks); 1983 ei->i_reserved_meta_blocks);
1984 ei->i_reserved_meta_blocks = 0; 1984 ei->i_reserved_meta_blocks = 0;
1985 ei->i_da_metadata_calc_len = 0; 1985 ei->i_da_metadata_calc_len = 0;
1986 } 1986 }
1987 1987
1988 /* update fs dirty data blocks counter */ 1988 /* update fs dirty data blocks counter */
1989 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1989 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1990 1990
1991 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1991 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1992 1992
1993 dquot_release_reservation_block(inode, to_free); 1993 dquot_release_reservation_block(inode, to_free);
1994 } 1994 }
1995 1995
1996 static void ext4_da_page_release_reservation(struct page *page, 1996 static void ext4_da_page_release_reservation(struct page *page,
1997 unsigned long offset) 1997 unsigned long offset)
1998 { 1998 {
1999 int to_release = 0; 1999 int to_release = 0;
2000 struct buffer_head *head, *bh; 2000 struct buffer_head *head, *bh;
2001 unsigned int curr_off = 0; 2001 unsigned int curr_off = 0;
2002 2002
2003 head = page_buffers(page); 2003 head = page_buffers(page);
2004 bh = head; 2004 bh = head;
2005 do { 2005 do {
2006 unsigned int next_off = curr_off + bh->b_size; 2006 unsigned int next_off = curr_off + bh->b_size;
2007 2007
2008 if ((offset <= curr_off) && (buffer_delay(bh))) { 2008 if ((offset <= curr_off) && (buffer_delay(bh))) {
2009 to_release++; 2009 to_release++;
2010 clear_buffer_delay(bh); 2010 clear_buffer_delay(bh);
2011 } 2011 }
2012 curr_off = next_off; 2012 curr_off = next_off;
2013 } while ((bh = bh->b_this_page) != head); 2013 } while ((bh = bh->b_this_page) != head);
2014 ext4_da_release_space(page->mapping->host, to_release); 2014 ext4_da_release_space(page->mapping->host, to_release);
2015 } 2015 }
2016 2016
2017 /* 2017 /*
2018 * Delayed allocation stuff 2018 * Delayed allocation stuff
2019 */ 2019 */
2020 2020
2021 /* 2021 /*
2022 * mpage_da_submit_io - walks through the extent of pages and tries to write 2022 * mpage_da_submit_io - walks through the extent of pages and tries to write
2023 * them with the writepage() callback 2023 * them with the writepage() callback
2024 * 2024 *
2025 * @mpd->inode: inode 2025 * @mpd->inode: inode
2026 * @mpd->first_page: first page of the extent 2026 * @mpd->first_page: first page of the extent
2027 * @mpd->next_page: page after the last page of the extent 2027 * @mpd->next_page: page after the last page of the extent
2028 * 2028 *
2029 * By the time mpage_da_submit_io() is called we expect all blocks 2029 * By the time mpage_da_submit_io() is called we expect all blocks
2030 * to be allocated. This may be wrong if allocation failed. 2030 * to be allocated. This may be wrong if allocation failed.
2031 * 2031 *
2032 * As pages are already locked by write_cache_pages(), we can't use it 2032 * As pages are already locked by write_cache_pages(), we can't use it
2033 */ 2033 */
2034 static int mpage_da_submit_io(struct mpage_da_data *mpd, 2034 static int mpage_da_submit_io(struct mpage_da_data *mpd,
2035 struct ext4_map_blocks *map) 2035 struct ext4_map_blocks *map)
2036 { 2036 {
2037 struct pagevec pvec; 2037 struct pagevec pvec;
2038 unsigned long index, end; 2038 unsigned long index, end;
2039 int ret = 0, err, nr_pages, i; 2039 int ret = 0, err, nr_pages, i;
2040 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2041 struct address_space *mapping = inode->i_mapping; 2041 struct address_space *mapping = inode->i_mapping;
2042 loff_t size = i_size_read(inode); 2042 loff_t size = i_size_read(inode);
2043 unsigned int len, block_start; 2043 unsigned int len, block_start;
2044 struct buffer_head *bh, *page_bufs = NULL; 2044 struct buffer_head *bh, *page_bufs = NULL;
2045 int journal_data = ext4_should_journal_data(inode); 2045 int journal_data = ext4_should_journal_data(inode);
2046 sector_t pblock = 0, cur_logical = 0; 2046 sector_t pblock = 0, cur_logical = 0;
2047 struct ext4_io_submit io_submit; 2047 struct ext4_io_submit io_submit;
2048 2048
2049 BUG_ON(mpd->next_page <= mpd->first_page); 2049 BUG_ON(mpd->next_page <= mpd->first_page);
2050 memset(&io_submit, 0, sizeof(io_submit)); 2050 memset(&io_submit, 0, sizeof(io_submit));
2051 /* 2051 /*
2052 * We need to start from the first_page to the next_page - 1 2052 * We need to start from the first_page to the next_page - 1
2053 * to make sure we also write the mapped dirty buffer_heads. 2053 * to make sure we also write the mapped dirty buffer_heads.
2054 * If we look at mpd->b_blocknr we would only be looking 2054 * If we look at mpd->b_blocknr we would only be looking
2055 * at the currently mapped buffer_heads. 2055 * at the currently mapped buffer_heads.
2056 */ 2056 */
2057 index = mpd->first_page; 2057 index = mpd->first_page;
2058 end = mpd->next_page - 1; 2058 end = mpd->next_page - 1;
2059 2059
2060 pagevec_init(&pvec, 0); 2060 pagevec_init(&pvec, 0);
2061 while (index <= end) { 2061 while (index <= end) {
2062 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2062 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2063 if (nr_pages == 0) 2063 if (nr_pages == 0)
2064 break; 2064 break;
2065 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2066 int commit_write = 0, skip_page = 0; 2066 int commit_write = 0, skip_page = 0;
2067 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2068 2068
2069 index = page->index; 2069 index = page->index;
2070 if (index > end) 2070 if (index > end)
2071 break; 2071 break;
2072 2072
2073 if (index == size >> PAGE_CACHE_SHIFT) 2073 if (index == size >> PAGE_CACHE_SHIFT)
2074 len = size & ~PAGE_CACHE_MASK; 2074 len = size & ~PAGE_CACHE_MASK;
2075 else 2075 else
2076 len = PAGE_CACHE_SIZE; 2076 len = PAGE_CACHE_SIZE;
2077 if (map) { 2077 if (map) {
2078 cur_logical = index << (PAGE_CACHE_SHIFT - 2078 cur_logical = index << (PAGE_CACHE_SHIFT -
2079 inode->i_blkbits); 2079 inode->i_blkbits);
2080 pblock = map->m_pblk + (cur_logical - 2080 pblock = map->m_pblk + (cur_logical -
2081 map->m_lblk); 2081 map->m_lblk);
2082 } 2082 }
2083 index++; 2083 index++;
2084 2084
2085 BUG_ON(!PageLocked(page)); 2085 BUG_ON(!PageLocked(page));
2086 BUG_ON(PageWriteback(page)); 2086 BUG_ON(PageWriteback(page));
2087 2087
2088 /* 2088 /*
2089 * If the page does not have buffers (for 2089 * If the page does not have buffers (for
2090 * whatever reason), try to create them using 2090 * whatever reason), try to create them using
2091 * __block_write_begin. If this fails, 2091 * __block_write_begin. If this fails,
2092 * skip the page and move on. 2092 * skip the page and move on.
2093 */ 2093 */
2094 if (!page_has_buffers(page)) { 2094 if (!page_has_buffers(page)) {
2095 if (__block_write_begin(page, 0, len, 2095 if (__block_write_begin(page, 0, len,
2096 noalloc_get_block_write)) { 2096 noalloc_get_block_write)) {
2097 skip_page: 2097 skip_page:
2098 unlock_page(page); 2098 unlock_page(page);
2099 continue; 2099 continue;
2100 } 2100 }
2101 commit_write = 1; 2101 commit_write = 1;
2102 } 2102 }
2103 2103
2104 bh = page_bufs = page_buffers(page); 2104 bh = page_bufs = page_buffers(page);
2105 block_start = 0; 2105 block_start = 0;
2106 do { 2106 do {
2107 if (!bh) 2107 if (!bh)
2108 goto skip_page; 2108 goto skip_page;
2109 if (map && (cur_logical >= map->m_lblk) && 2109 if (map && (cur_logical >= map->m_lblk) &&
2110 (cur_logical <= (map->m_lblk + 2110 (cur_logical <= (map->m_lblk +
2111 (map->m_len - 1)))) { 2111 (map->m_len - 1)))) {
2112 if (buffer_delay(bh)) { 2112 if (buffer_delay(bh)) {
2113 clear_buffer_delay(bh); 2113 clear_buffer_delay(bh);
2114 bh->b_blocknr = pblock; 2114 bh->b_blocknr = pblock;
2115 } 2115 }
2116 if (buffer_unwritten(bh) || 2116 if (buffer_unwritten(bh) ||
2117 buffer_mapped(bh)) 2117 buffer_mapped(bh))
2118 BUG_ON(bh->b_blocknr != pblock); 2118 BUG_ON(bh->b_blocknr != pblock);
2119 if (map->m_flags & EXT4_MAP_UNINIT) 2119 if (map->m_flags & EXT4_MAP_UNINIT)
2120 set_buffer_uninit(bh); 2120 set_buffer_uninit(bh);
2121 clear_buffer_unwritten(bh); 2121 clear_buffer_unwritten(bh);
2122 } 2122 }
2123 2123
2124 /* skip page if block allocation undone */ 2124 /* skip page if block allocation undone */
2125 if (buffer_delay(bh) || buffer_unwritten(bh)) 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2126 skip_page = 1; 2126 skip_page = 1;
2127 bh = bh->b_this_page; 2127 bh = bh->b_this_page;
2128 block_start += bh->b_size; 2128 block_start += bh->b_size;
2129 cur_logical++; 2129 cur_logical++;
2130 pblock++; 2130 pblock++;
2131 } while (bh != page_bufs); 2131 } while (bh != page_bufs);
2132 2132
2133 if (skip_page) 2133 if (skip_page)
2134 goto skip_page; 2134 goto skip_page;
2135 2135
2136 if (commit_write) 2136 if (commit_write)
2137 /* mark the buffer_heads as dirty & uptodate */ 2137 /* mark the buffer_heads as dirty & uptodate */
2138 block_commit_write(page, 0, len); 2138 block_commit_write(page, 0, len);
2139 2139
2140 clear_page_dirty_for_io(page); 2140 clear_page_dirty_for_io(page);
2141 /* 2141 /*
2142 * Delalloc doesn't support data journalling, 2142 * Delalloc doesn't support data journalling,
2143 * but eventually maybe we'll lift this 2143 * but eventually maybe we'll lift this
2144 * restriction. 2144 * restriction.
2145 */ 2145 */
2146 if (unlikely(journal_data && PageChecked(page))) 2146 if (unlikely(journal_data && PageChecked(page)))
2147 err = __ext4_journalled_writepage(page, len); 2147 err = __ext4_journalled_writepage(page, len);
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) 2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page, 2149 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc); 2150 len, mpd->wbc);
2151 else 2151 else
2152 err = block_write_full_page(page, 2152 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc); 2153 noalloc_get_block_write, mpd->wbc);
2154 2154
2155 if (!err) 2155 if (!err)
2156 mpd->pages_written++; 2156 mpd->pages_written++;
2157 /* 2157 /*
2158 * In error case, we have to continue because 2158 * In error case, we have to continue because
2159 * remaining pages are still locked 2159 * remaining pages are still locked
2160 */ 2160 */
2161 if (ret == 0) 2161 if (ret == 0)
2162 ret = err; 2162 ret = err;
2163 } 2163 }
2164 pagevec_release(&pvec); 2164 pagevec_release(&pvec);
2165 } 2165 }
2166 ext4_io_submit(&io_submit); 2166 ext4_io_submit(&io_submit);
2167 return ret; 2167 return ret;
2168 } 2168 }
2169 2169
2170 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) 2170 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2171 { 2171 {
2172 int nr_pages, i; 2172 int nr_pages, i;
2173 pgoff_t index, end; 2173 pgoff_t index, end;
2174 struct pagevec pvec; 2174 struct pagevec pvec;
2175 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2176 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2177 2177
2178 index = mpd->first_page; 2178 index = mpd->first_page;
2179 end = mpd->next_page - 1; 2179 end = mpd->next_page - 1;
2180 while (index <= end) { 2180 while (index <= end) {
2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2182 if (nr_pages == 0) 2182 if (nr_pages == 0)
2183 break; 2183 break;
2184 for (i = 0; i < nr_pages; i++) { 2184 for (i = 0; i < nr_pages; i++) {
2185 struct page *page = pvec.pages[i]; 2185 struct page *page = pvec.pages[i];
2186 if (page->index > end) 2186 if (page->index > end)
2187 break; 2187 break;
2188 BUG_ON(!PageLocked(page)); 2188 BUG_ON(!PageLocked(page));
2189 BUG_ON(PageWriteback(page)); 2189 BUG_ON(PageWriteback(page));
2190 block_invalidatepage(page, 0); 2190 block_invalidatepage(page, 0);
2191 ClearPageUptodate(page); 2191 ClearPageUptodate(page);
2192 unlock_page(page); 2192 unlock_page(page);
2193 } 2193 }
2194 index = pvec.pages[nr_pages - 1]->index + 1; 2194 index = pvec.pages[nr_pages - 1]->index + 1;
2195 pagevec_release(&pvec); 2195 pagevec_release(&pvec);
2196 } 2196 }
2197 return; 2197 return;
2198 } 2198 }
2199 2199
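Both mpage_da_submit_io() and ext4_da_block_invalidatepages() walk a page range in fixed-size batches, resuming the lookup after the last page returned. A toy version of the batched walk; lookup_batch() is a hypothetical stand-in for pagevec_lookup(), and the batch size mirrors what PAGEVEC_SIZE was in kernels of this vintage:

    #define BATCH 14    /* assumed PAGEVEC_SIZE for this era */

    static unsigned long lookup_batch(unsigned long index, unsigned long n)
    {
        (void)index; (void)n;
        return 0;       /* stub: pretend the range is empty */
    }

    static void walk_range(unsigned long index, unsigned long end)
    {
        while (index <= end) {
            unsigned long got = lookup_batch(index, BATCH);
            if (got == 0)
                break;          /* nothing left in the mapping */
            /* ... process the batch, stopping at any page beyond 'end' ... */
            index += got;       /* real code resumes after the last page found */
        }
    }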
2200 static void ext4_print_free_blocks(struct inode *inode) 2200 static void ext4_print_free_blocks(struct inode *inode)
2201 { 2201 {
2202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2203 printk(KERN_CRIT "Total free blocks count %lld\n", 2203 printk(KERN_CRIT "Total free blocks count %lld\n",
2204 ext4_count_free_blocks(inode->i_sb)); 2204 ext4_count_free_blocks(inode->i_sb));
2205 printk(KERN_CRIT "Free/Dirty block details\n"); 2205 printk(KERN_CRIT "Free/Dirty block details\n");
2206 printk(KERN_CRIT "free_blocks=%lld\n", 2206 printk(KERN_CRIT "free_blocks=%lld\n",
2207 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); 2207 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2208 printk(KERN_CRIT "dirty_blocks=%lld\n", 2208 printk(KERN_CRIT "dirty_blocks=%lld\n",
2209 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2209 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2210 printk(KERN_CRIT "Block reservation details\n"); 2210 printk(KERN_CRIT "Block reservation details\n");
2211 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 2211 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2212 EXT4_I(inode)->i_reserved_data_blocks); 2212 EXT4_I(inode)->i_reserved_data_blocks);
2213 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", 2213 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2214 EXT4_I(inode)->i_reserved_meta_blocks); 2214 EXT4_I(inode)->i_reserved_meta_blocks);
2215 return; 2215 return;
2216 } 2216 }
2217 2217
2218 /* 2218 /*
2219 * mpage_da_map_and_submit - go through the given space, map it 2219 * mpage_da_map_and_submit - go through the given space, map it
2220 * if necessary, and then submit it for I/O 2220 * if necessary, and then submit it for I/O
2221 * 2221 *
2222 * @mpd - bh describing space 2222 * @mpd - bh describing space
2223 * 2223 *
2224 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2225 * 2225 *
2226 */ 2226 */
2227 static void mpage_da_map_and_submit(struct mpage_da_data *mpd) 2227 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2228 { 2228 {
2229 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2230 struct ext4_map_blocks map, *mapp = NULL; 2230 struct ext4_map_blocks map, *mapp = NULL;
2231 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2234 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2235 2235
2236 /* 2236 /*
2237 * If the blocks are mapped already, or we couldn't accumulate 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage. 2238 * any blocks, then proceed immediately to the submission stage.
2239 */ 2239 */
2240 if ((mpd->b_size == 0) || 2240 if ((mpd->b_size == 0) ||
2241 ((mpd->b_state & (1 << BH_Mapped)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2242 !(mpd->b_state & (1 << BH_Delay)) && 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2243 !(mpd->b_state & (1 << BH_Unwritten)))) 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2244 goto submit_io; 2244 goto submit_io;
2245 2245
2246 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2247 BUG_ON(!handle); 2247 BUG_ON(!handle);
2248 2248
2249 /* 2249 /*
2250 * Call ext4_map_blocks() to allocate any delayed allocation 2250 * Call ext4_map_blocks() to allocate any delayed allocation
2251 * blocks, or to convert an uninitialized extent to be 2251 * blocks, or to convert an uninitialized extent to be
2252 * initialized (in the case where we have written into 2252 * initialized (in the case where we have written into
2253 * one or more preallocated blocks). 2253 * one or more preallocated blocks).
2254 * 2254 *
2255 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to 2255 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2256 * indicate that we are on the delayed allocation path. This 2256 * indicate that we are on the delayed allocation path. This
2257 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2258 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2259 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2261 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2262 * 2262 *
2263 * If the blocks in question were delalloc blocks, set 2263 * If the blocks in question were delalloc blocks, set
2264 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2264 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2265 * variables are updated after the blocks have been allocated. 2265 * variables are updated after the blocks have been allocated.
2266 */ 2266 */
2267 map.m_lblk = next; 2267 map.m_lblk = next;
2268 map.m_len = max_blocks; 2268 map.m_len = max_blocks;
2269 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2269 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2270 if (ext4_should_dioread_nolock(mpd->inode)) 2270 if (ext4_should_dioread_nolock(mpd->inode))
2271 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2271 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2272 if (mpd->b_state & (1 << BH_Delay)) 2272 if (mpd->b_state & (1 << BH_Delay))
2273 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2273 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2274 2274
2275 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 2275 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2276 if (blks < 0) { 2276 if (blks < 0) {
2277 struct super_block *sb = mpd->inode->i_sb; 2277 struct super_block *sb = mpd->inode->i_sb;
2278 2278
2279 err = blks; 2279 err = blks;
2280 /* 2280 /*
2281 * If get block returns EAGAIN or ENOSPC and there 2281 * If get block returns EAGAIN or ENOSPC and there
2282 * appear to be free blocks, we will just let 2282 * appear to be free blocks, we will just let
2283 * mpage_da_submit_io() unlock all of the pages. 2283 * mpage_da_submit_io() unlock all of the pages.
2284 */ 2284 */
2285 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2286 goto submit_io; 2286 goto submit_io;
2287 2287
2288 if (err == -ENOSPC && 2288 if (err == -ENOSPC &&
2289 ext4_count_free_blocks(sb)) { 2289 ext4_count_free_blocks(sb)) {
2290 mpd->retval = err; 2290 mpd->retval = err;
2291 goto submit_io; 2291 goto submit_io;
2292 } 2292 }
2293 2293
2294 /* 2294 /*
2295 * A get block failure will cause us to loop in 2295 * A get block failure will cause us to loop in
2296 * writepages, because a_ops->writepage won't be able 2296 * writepages, because a_ops->writepage won't be able
2297 * to make progress. The page will be redirtied by 2297 * to make progress. The page will be redirtied by
2298 * writepage and writepages will again try to write 2298 * writepage and writepages will again try to write
2299 * the same page. 2299 * the same page.
2300 */ 2300 */
2301 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { 2301 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2302 ext4_msg(sb, KERN_CRIT, 2302 ext4_msg(sb, KERN_CRIT,
2303 "delayed block allocation failed for inode %lu " 2303 "delayed block allocation failed for inode %lu "
2304 "at logical offset %llu with max blocks %zd " 2304 "at logical offset %llu with max blocks %zd "
2305 "with error %d", mpd->inode->i_ino, 2305 "with error %d", mpd->inode->i_ino,
2306 (unsigned long long) next, 2306 (unsigned long long) next,
2307 mpd->b_size >> mpd->inode->i_blkbits, err); 2307 mpd->b_size >> mpd->inode->i_blkbits, err);
2308 ext4_msg(sb, KERN_CRIT, 2308 ext4_msg(sb, KERN_CRIT,
2309 "This should not happen!! Data will be lost\n"); 2309 "This should not happen!! Data will be lost\n");
2310 if (err == -ENOSPC) 2310 if (err == -ENOSPC)
2311 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2312 } 2312 }
2313 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2314 ext4_da_block_invalidatepages(mpd); 2314 ext4_da_block_invalidatepages(mpd);
2315 2315
2316 /* Mark this page range as having been completed */ 2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1; 2317 mpd->io_done = 1;
2318 return; 2318 return;
2319 } 2319 }
2320 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
2321 2321
2322 mapp = &map; 2322 mapp = &map;
2323 if (map.m_flags & EXT4_MAP_NEW) { 2323 if (map.m_flags & EXT4_MAP_NEW) {
2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2325 int i; 2325 int i;
2326 2326
2327 for (i = 0; i < map.m_len; i++) 2327 for (i = 0; i < map.m_len; i++)
2328 unmap_underlying_metadata(bdev, map.m_pblk + i); 2328 unmap_underlying_metadata(bdev, map.m_pblk + i);
2329 } 2329 }
2330 2330
2331 if (ext4_should_order_data(mpd->inode)) { 2331 if (ext4_should_order_data(mpd->inode)) {
2332 err = ext4_jbd2_file_inode(handle, mpd->inode); 2332 err = ext4_jbd2_file_inode(handle, mpd->inode);
2333 if (err) 2333 if (err)
2334 /* This only happens if the journal is aborted */ 2334 /* This only happens if the journal is aborted */
2335 return; 2335 return;
2336 } 2336 }
2337 2337
2338 /* 2338 /*
2339 * Update on-disk size along with block allocation. 2339 * Update on-disk size along with block allocation.
2340 */ 2340 */
2341 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; 2341 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2342 if (disksize > i_size_read(mpd->inode)) 2342 if (disksize > i_size_read(mpd->inode))
2343 disksize = i_size_read(mpd->inode); 2343 disksize = i_size_read(mpd->inode);
2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2345 ext4_update_i_disksize(mpd->inode, disksize); 2345 ext4_update_i_disksize(mpd->inode, disksize);
2346 err = ext4_mark_inode_dirty(handle, mpd->inode); 2346 err = ext4_mark_inode_dirty(handle, mpd->inode);
2347 if (err) 2347 if (err)
2348 ext4_error(mpd->inode->i_sb, 2348 ext4_error(mpd->inode->i_sb,
2349 "Failed to mark inode %lu dirty", 2349 "Failed to mark inode %lu dirty",
2350 mpd->inode->i_ino); 2350 mpd->inode->i_ino);
2351 } 2351 }
2352 2352
2353 submit_io: 2353 submit_io:
2354 mpage_da_submit_io(mpd, mapp); 2354 mpage_da_submit_io(mpd, mapp);
2355 mpd->io_done = 1; 2355 mpd->io_done = 1;
2356 } 2356 }
2357 2357
2358 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2358 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
2359 (1 << BH_Delay) | (1 << BH_Unwritten)) 2359 (1 << BH_Delay) | (1 << BH_Unwritten))
2360 2360
2361 /* 2361 /*
2362 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 2362 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
2363 * 2363 *
2364 * @mpd->lbh - extent of blocks 2364 * @mpd->lbh - extent of blocks
2365 * @logical - logical number of the block in the file 2365 * @logical - logical number of the block in the file
2366 * @bh - bh of the block (used to access block's state) 2366 * @bh - bh of the block (used to access block's state)
2367 * 2367 *
2368 * The function is used to collect contiguous blocks in the same state 2368 * The function is used to collect contiguous blocks in the same state
2369 */ 2369 */
2370 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2370 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2371 sector_t logical, size_t b_size, 2371 sector_t logical, size_t b_size,
2372 unsigned long b_state) 2372 unsigned long b_state)
2373 { 2373 {
2374 sector_t next; 2374 sector_t next;
2375 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2375 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2376 2376
2377 /* 2377 /*
2378 * XXX Don't go larger than mballoc is willing to allocate. 2378 * XXX Don't go larger than mballoc is willing to allocate.
2379 * This is a stopgap solution. We eventually need to fold 2379 * This is a stopgap solution. We eventually need to fold
2380 * mpage_da_submit_io() into this function and then call 2380 * mpage_da_submit_io() into this function and then call
2381 * ext4_map_blocks() multiple times in a loop 2381 * ext4_map_blocks() multiple times in a loop
2382 */ 2382 */
2383 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) 2383 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2384 goto flush_it; 2384 goto flush_it;
2385 2385
2386 /* check if the reserved journal credits might overflow */ 2386 /* check if the reserved journal credits might overflow */
2387 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { 2387 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2388 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2388 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2389 /* 2389 /*
2390 * With non-extent format we are limited by the journal 2390 * With non-extent format we are limited by the journal
2391 * credit available. Total credit needed to insert 2391 * credit available. Total credit needed to insert
2392 * nrblocks contiguous blocks depends on 2392 * nrblocks contiguous blocks depends on
2393 * nrblocks. So limit nrblocks. 2393 * nrblocks. So limit nrblocks.
2394 */ 2394 */
2395 goto flush_it; 2395 goto flush_it;
2396 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2396 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
2397 EXT4_MAX_TRANS_DATA) { 2397 EXT4_MAX_TRANS_DATA) {
2398 /* 2398 /*
2399 * Adding the new buffer_head would make it cross the 2399 * Adding the new buffer_head would make it cross the
2400 * allowed limit for which we have journal credit 2400 * allowed limit for which we have journal credit
2401 * reserved. So limit the new bh->b_size 2401 * reserved. So limit the new bh->b_size
2402 */ 2402 */
2403 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2403 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
2404 mpd->inode->i_blkbits; 2404 mpd->inode->i_blkbits;
2405 /* we will do mpage_da_submit_io in the next loop */ 2405 /* we will do mpage_da_submit_io in the next loop */
2406 } 2406 }
2407 } 2407 }
2408 /* 2408 /*
2409 * First block in the extent 2409 * First block in the extent
2410 */ 2410 */
2411 if (mpd->b_size == 0) { 2411 if (mpd->b_size == 0) {
2412 mpd->b_blocknr = logical; 2412 mpd->b_blocknr = logical;
2413 mpd->b_size = b_size; 2413 mpd->b_size = b_size;
2414 mpd->b_state = b_state & BH_FLAGS; 2414 mpd->b_state = b_state & BH_FLAGS;
2415 return; 2415 return;
2416 } 2416 }
2417 2417
2418 next = mpd->b_blocknr + nrblocks; 2418 next = mpd->b_blocknr + nrblocks;
2419 /* 2419 /*
2420 * Can we merge the block to our big extent? 2420 * Can we merge the block to our big extent?
2421 */ 2421 */
2422 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { 2422 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2423 mpd->b_size += b_size; 2423 mpd->b_size += b_size;
2424 return; 2424 return;
2425 } 2425 }
2426 2426
2427 flush_it: 2427 flush_it:
2428 /* 2428 /*
2429 * We couldn't merge the block to our extent, so we 2429 * We couldn't merge the block to our extent, so we
2430 * need to flush current extent and start new one 2430 * need to flush current extent and start new one
2431 */ 2431 */
2432 mpage_da_map_and_submit(mpd); 2432 mpage_da_map_and_submit(mpd);
2433 return; 2433 return;
2434 } 2434 }
2435 2435
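The merge test in mpage_add_bh_to_extent() is: the new block must be the next logical block, and its BH_FLAGS state bits must match the extent's; otherwise the accumulated extent is flushed. A compact model of that test (toy struct, illustrative only):

    /* Toy merge test mirroring mpage_add_bh_to_extent()'s core logic. */
    struct toy_extent {
        unsigned long start;    /* first logical block of the extent */
        unsigned long len;      /* blocks accumulated so far */
        unsigned long state;    /* BH_FLAGS-style state bits */
    };

    static int can_merge(const struct toy_extent *ext,
                         unsigned long logical, unsigned long state)
    {
        if (ext->len == 0)
            return 1;           /* empty extent: the block starts a new one */
        return logical == ext->start + ext->len     /* logically contiguous */
            && state == ext->state;                 /* same delay/unwritten bits */
    }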
2436 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 2436 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2437 { 2437 {
2438 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 2438 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
2439 } 2439 }
2440 2440
2441 /* 2441 /*
2442 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2443 * ext4_da_write_begin(). It will either return mapped block or 2443 * ext4_da_write_begin(). It will either return mapped block or
2444 * reserve space for a single block. 2444 * reserve space for a single block.
2445 * 2445 *
2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2447 * We also have b_blocknr = -1 and b_bdev initialized properly 2447 * We also have b_blocknr = -1 and b_bdev initialized properly
2448 * 2448 *
2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2450 * We also have b_blocknr set to the physical block of the unwritten extent and b_bdev 2450 * We also have b_blocknr set to the physical block of the unwritten extent and b_bdev
2451 * initialized properly. 2451 * initialized properly.
2452 */ 2452 */
2453 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2453 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2454 struct buffer_head *bh, int create) 2454 struct buffer_head *bh, int create)
2455 { 2455 {
2456 struct ext4_map_blocks map; 2456 struct ext4_map_blocks map;
2457 int ret = 0; 2457 int ret = 0;
2458 sector_t invalid_block = ~((sector_t) 0xffff); 2458 sector_t invalid_block = ~((sector_t) 0xffff);
2459 2459
2460 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 2460 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2461 invalid_block = ~0; 2461 invalid_block = ~0;
2462 2462
2463 BUG_ON(create == 0); 2463 BUG_ON(create == 0);
2464 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 2464 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2465 2465
2466 map.m_lblk = iblock; 2466 map.m_lblk = iblock;
2467 map.m_len = 1; 2467 map.m_len = 1;
2468 2468
2469 /* 2469 /*
2470 * first, we need to know whether the block is allocated already; 2470 * first, we need to know whether the block is allocated already;
2471 * preallocated blocks are unmapped but should be treated 2471 * preallocated blocks are unmapped but should be treated
2472 * the same as allocated blocks. 2472 * the same as allocated blocks.
2473 */ 2473 */
2474 ret = ext4_map_blocks(NULL, inode, &map, 0); 2474 ret = ext4_map_blocks(NULL, inode, &map, 0);
2475 if (ret < 0) 2475 if (ret < 0)
2476 return ret; 2476 return ret;
2477 if (ret == 0) { 2477 if (ret == 0) {
2478 if (buffer_delay(bh)) 2478 if (buffer_delay(bh))
2479 return 0; /* Not sure this could or should happen */ 2479 return 0; /* Not sure this could or should happen */
2480 /* 2480 /*
2481 * XXX: __block_write_begin() unmaps passed block, is it OK? 2481 * XXX: __block_write_begin() unmaps passed block, is it OK?
2482 */ 2482 */
2483 ret = ext4_da_reserve_space(inode, iblock); 2483 ret = ext4_da_reserve_space(inode, iblock);
2484 if (ret) 2484 if (ret)
2485 /* not enough space to reserve */ 2485 /* not enough space to reserve */
2486 return ret; 2486 return ret;
2487 2487
2488 map_bh(bh, inode->i_sb, invalid_block); 2488 map_bh(bh, inode->i_sb, invalid_block);
2489 set_buffer_new(bh); 2489 set_buffer_new(bh);
2490 set_buffer_delay(bh); 2490 set_buffer_delay(bh);
2491 return 0; 2491 return 0;
2492 } 2492 }
2493 2493
2494 map_bh(bh, inode->i_sb, map.m_pblk); 2494 map_bh(bh, inode->i_sb, map.m_pblk);
2495 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 2495 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2496 2496
2497 if (buffer_unwritten(bh)) { 2497 if (buffer_unwritten(bh)) {
2498 /* A delayed write to unwritten bh should be marked 2498 /* A delayed write to unwritten bh should be marked
2499 * new and mapped. Mapped ensures that we don't do 2499 * new and mapped. Mapped ensures that we don't do
2500 * get_block multiple times when we write to the same 2500 * get_block multiple times when we write to the same
2501 * offset and new ensures that we do proper zero out 2501 * offset and new ensures that we do proper zero out
2502 * for partial write. 2502 * for partial write.
2503 */ 2503 */
2504 set_buffer_new(bh); 2504 set_buffer_new(bh);
2505 set_buffer_mapped(bh); 2505 set_buffer_mapped(bh);
2506 } 2506 }
2507 return 0; 2507 return 0;
2508 } 2508 }
2509 2509
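Per the get_block_t contract, ext4_da_get_block_prep() leaves the buffer_head in one of two states: really mapped (physical block known), or new+delay with a poison block number so later writeback knows allocation is still pending. A toy statement of the two outcomes (types and names invented for illustration):

    enum bh_kind { BH_REAL, BH_DELAYED };

    struct toy_map {
        enum bh_kind kind;
        unsigned long long blocknr;     /* meaningful only for BH_REAL */
    };

    static struct toy_map prep_block(int already_allocated,
                                     unsigned long long pblk)
    {
        struct toy_map m;

        if (already_allocated) {
            m.kind = BH_REAL;           /* map_bh(bh, sb, map.m_pblk) */
            m.blocknr = pblk;
        } else {
            m.kind = BH_DELAYED;        /* reserve space, set new + delay, */
            m.blocknr = ~0ULL;          /* poison like invalid_block above */
        }
        return m;
    }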
2510 /* 2510 /*
2511 * This function is used as a standard get_block_t callback function 2511 * This function is used as a standard get_block_t callback function
2512 * when there is no desire to allocate any blocks. It is used as a 2512 * when there is no desire to allocate any blocks. It is used as a
2513 * callback function for block_write_begin() and block_write_full_page(). 2513 * callback function for block_write_begin() and block_write_full_page().
2514 * These functions should only try to map a single block at a time. 2514 * These functions should only try to map a single block at a time.
2515 * 2515 *
2516 * Since this function doesn't do block allocations even if the caller 2516 * Since this function doesn't do block allocations even if the caller
2517 * requests it by passing in create=1, it is critically important that 2517 * requests it by passing in create=1, it is critically important that
2518 * any caller checks to make sure that any buffer heads returned 2518 * any caller checks to make sure that any buffer heads returned
2519 * by this function are either all already mapped or marked for 2519 * by this function are either all already mapped or marked for
2520 * delayed allocation before calling block_write_full_page(). Otherwise, 2520 * delayed allocation before calling block_write_full_page(). Otherwise,
2521 * b_blocknr could be left uninitialized, and the page write functions will 2521 * b_blocknr could be left uninitialized, and the page write functions will
2522 * be taken by surprise. 2522 * be taken by surprise.
2523 */ 2523 */
2524 static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2524 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2525 struct buffer_head *bh_result, int create) 2525 struct buffer_head *bh_result, int create)
2526 { 2526 {
2527 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2527 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2528 return _ext4_get_block(inode, iblock, bh_result, 0); 2528 return _ext4_get_block(inode, iblock, bh_result, 0);
2529 } 2529 }
2530 2530
2531 static int bget_one(handle_t *handle, struct buffer_head *bh) 2531 static int bget_one(handle_t *handle, struct buffer_head *bh)
2532 { 2532 {
2533 get_bh(bh); 2533 get_bh(bh);
2534 return 0; 2534 return 0;
2535 } 2535 }
2536 2536
2537 static int bput_one(handle_t *handle, struct buffer_head *bh) 2537 static int bput_one(handle_t *handle, struct buffer_head *bh)
2538 { 2538 {
2539 put_bh(bh); 2539 put_bh(bh);
2540 return 0; 2540 return 0;
2541 } 2541 }
2542 2542
2543 static int __ext4_journalled_writepage(struct page *page, 2543 static int __ext4_journalled_writepage(struct page *page,
2544 unsigned int len) 2544 unsigned int len)
2545 { 2545 {
2546 struct address_space *mapping = page->mapping; 2546 struct address_space *mapping = page->mapping;
2547 struct inode *inode = mapping->host; 2547 struct inode *inode = mapping->host;
2548 struct buffer_head *page_bufs; 2548 struct buffer_head *page_bufs;
2549 handle_t *handle = NULL; 2549 handle_t *handle = NULL;
2550 int ret = 0; 2550 int ret = 0;
2551 int err; 2551 int err;
2552 2552
2553 ClearPageChecked(page); 2553 ClearPageChecked(page);
2554 page_bufs = page_buffers(page); 2554 page_bufs = page_buffers(page);
2555 BUG_ON(!page_bufs); 2555 BUG_ON(!page_bufs);
2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
2557 /* As soon as we unlock the page, it can go away, but we have 2557 /* As soon as we unlock the page, it can go away, but we have
2558 * references to buffers so we are safe */ 2558 * references to buffers so we are safe */
2559 unlock_page(page); 2559 unlock_page(page);
2560 2560
2561 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2561 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2562 if (IS_ERR(handle)) { 2562 if (IS_ERR(handle)) {
2563 ret = PTR_ERR(handle); 2563 ret = PTR_ERR(handle);
2564 goto out; 2564 goto out;
2565 } 2565 }
2566 2566
2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2568 do_journal_get_write_access); 2568 do_journal_get_write_access);
2569 2569
2570 err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2570 err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2571 write_end_fn); 2571 write_end_fn);
2572 if (ret == 0) 2572 if (ret == 0)
2573 ret = err; 2573 ret = err;
2574 err = ext4_journal_stop(handle); 2574 err = ext4_journal_stop(handle);
2575 if (!ret) 2575 if (!ret)
2576 ret = err; 2576 ret = err;
2577 2577
2578 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2578 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2579 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 2579 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2580 out: 2580 out:
2581 return ret; 2581 return ret;
2582 } 2582 }
2583 2583
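__ext4_journalled_writepage() exists to solve the lock-ordering problem described in the comment below: transaction start ranks above the page lock, so the function pins the page's buffers, unlocks the page, and only then opens a handle. A sketch of just that ordering, with invented helpers:

    /* Invented helpers; only the ordering matters here. */
    static void pin_page_buffers(void)    { }  /* bget_one on each bh */
    static void unlock_the_page(void)     { }
    static int  start_transaction(void)   { return 0; }
    static void journal_the_buffers(void) { }
    static void stop_transaction(void)    { }
    static void unpin_page_buffers(void)  { }  /* bput_one on each bh */

    static int journalled_writepage_order(void)
    {
        pin_page_buffers();       /* page may go away once unlocked, but the
                                   * buffer references keep its data alive */
        unlock_the_page();        /* drop the page lock first... */
        if (start_transaction())  /* ...then take the higher-ranked handle */
            goto out;
        journal_the_buffers();
        stop_transaction();
    out:
        unpin_page_buffers();
        return 0;
    }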
2584 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); 2584 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2585 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 2585 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2586 2586
2587 /* 2587 /*
2588 * Note that we don't need to start a transaction unless we're journaling data 2588 * Note that we don't need to start a transaction unless we're journaling data
2589 * because we should have holes filled from ext4_page_mkwrite(). We don't even 2589 * because we should have holes filled from ext4_page_mkwrite(). We don't even
2590 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2591 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2592 * we are writing back data modified via mmap(), no one guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2593 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2594 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2595 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
2596 * 2596 *
2597 * This function can get called via... 2597 * This function can get called via...
2598 * - ext4_da_writepages after taking page lock (have journal handle) 2598 * - ext4_da_writepages after taking page lock (have journal handle)
2599 * - journal_submit_inode_data_buffers (no journal handle) 2599 * - journal_submit_inode_data_buffers (no journal handle)
2600 * - shrink_page_list via pdflush (no journal handle) 2600 * - shrink_page_list via pdflush (no journal handle)
2601 * - grab_page_cache when doing write_begin (have journal handle) 2601 * - grab_page_cache when doing write_begin (have journal handle)
2602 * 2602 *
2603 * We don't do any block allocation in this function. If we have page with 2603 * We don't do any block allocation in this function. If we have page with
2604 * multiple blocks we need to write those buffer_heads that are mapped. This 2604 * multiple blocks we need to write those buffer_heads that are mapped. This
2605 * is important for mmaped based write. So if we do with blocksize 1K 2605 * is important for mmaped based write. So if we do with blocksize 1K
2606 * truncate(f, 1024); 2606 * truncate(f, 1024);
2607 * a = mmap(f, 0, 4096); 2607 * a = mmap(f, 0, 4096);
2608 * a[0] = 'a'; 2608 * a[0] = 'a';
2609 * truncate(f, 4096); 2609 * truncate(f, 4096);
2610 * we have in the page first buffer_head mapped via page_mkwrite call back 2610 * we have in the page first buffer_head mapped via page_mkwrite call back
2611 * but other bufer_heads would be unmapped but dirty(dirty done via the 2611 * but other bufer_heads would be unmapped but dirty(dirty done via the
2612 * do_wp_page). So writepage should write the first block. If we modify 2612 * do_wp_page). So writepage should write the first block. If we modify
2613 * the mmap area beyond 1024 we will again get a page_fault and the 2613 * the mmap area beyond 1024 we will again get a page_fault and the
2614 * page_mkwrite callback will do the block allocation and mark the 2614 * page_mkwrite callback will do the block allocation and mark the
2615 * buffer_heads mapped. 2615 * buffer_heads mapped.
2616 * 2616 *
2617 * We redirty the page if we have any buffer_heads that is either delay or 2617 * We redirty the page if we have any buffer_heads that is either delay or
2618 * unwritten in the page. 2618 * unwritten in the page.
2619 * 2619 *
2620 * We can get recursively called as show below. 2620 * We can get recursively called as show below.
2621 * 2621 *
2622 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2622 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2623 * ext4_writepage() 2623 * ext4_writepage()
2624 * 2624 *
2625 * But since we don't do any block allocation we should not deadlock. 2625 * But since we don't do any block allocation we should not deadlock.
2626 * Page also have the dirty flag cleared so we don't get recurive page_lock. 2626 * Page also have the dirty flag cleared so we don't get recurive page_lock.
2627 */ 2627 */
static int ext4_writepage(struct page *page,
			  struct writeback_control *wbc)
{
	int ret = 0, commit_write = 0;
	loff_t size;
	unsigned int len;
	struct buffer_head *page_bufs = NULL;
	struct inode *inode = page->mapping->host;

	trace_ext4_writepage(page);
	size = i_size_read(inode);
	if (page->index == size >> PAGE_CACHE_SHIFT)
		len = size & ~PAGE_CACHE_MASK;
	else
		len = PAGE_CACHE_SIZE;

	/*
	 * If the page does not have buffers (for whatever reason),
	 * try to create them using __block_write_begin.  If this
	 * fails, redirty the page and move on.
	 */
	if (!page_has_buffers(page)) {
		if (__block_write_begin(page, 0, len,
					noalloc_get_block_write)) {
		redirty_page:
			redirty_page_for_writepage(wbc, page);
			unlock_page(page);
			return 0;
		}
		commit_write = 1;
	}
	page_bufs = page_buffers(page);
	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
			      ext4_bh_delay_or_unwritten)) {
		/*
		 * We don't want to do block allocation, so redirty
		 * the page and return.  We may reach here when we do
		 * a journal commit via journal_submit_inode_data_buffers.
		 * We can also reach here via shrink_page_list
		 */
		goto redirty_page;
	}
	if (commit_write)
		/* now mark the buffer_heads as dirty and uptodate */
		block_commit_write(page, 0, len);

	if (PageChecked(page) && ext4_should_journal_data(inode))
		/*
		 * It's mmapped pagecache.  Add buffers and journal it.  There
		 * doesn't seem much point in redirtying the page here.
		 */
		return __ext4_journalled_writepage(page, len);

	if (buffer_uninit(page_bufs)) {
		ext4_set_bh_endio(page_bufs, inode);
		ret = block_write_full_page_endio(page, noalloc_get_block_write,
					    wbc, ext4_end_io_buffer_write);
	} else
		ret = block_write_full_page(page, noalloc_get_block_write,
					    wbc);

	return ret;
}

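/*
 * Example (userspace, illustrative): the truncate/mmap/truncate sequence
 * from the comment before ext4_writepage(), written out as a runnable
 * program. It assumes a filesystem with a 1K block size; "f" is a
 * placeholder file name.
 */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("f", O_RDWR | O_CREAT, 0644);
	char *a;

	if (fd < 0)
		return 1;
	ftruncate(fd, 1024);		/* only the first 1K block exists */
	a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (a == MAP_FAILED)
		return 1;
	a[0] = 'a';			/* page_mkwrite maps buffer 0 only */
	ftruncate(fd, 4096);		/* rest of the page: dirty but unmapped */
	munmap(a, 4096);
	close(fd);
	return 0;
}
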
/*
 * This is called via ext4_da_writepages() to
 * calculate the total number of credits to reserve to fit
 * a single extent allocation into a single transaction;
 * ext4_da_writepages() will loop calling this before
 * the block allocation.
 */

static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;

	/*
	 * With non-extent format the journal credit needed to
	 * insert nrblocks contiguous blocks is dependent on the
	 * number of contiguous blocks. So we will limit the
	 * number of contiguous blocks to a sane value.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
	    (max_blocks > EXT4_MAX_TRANS_DATA))
		max_blocks = EXT4_MAX_TRANS_DATA;

	return ext4_chunk_trans_blocks(inode, max_blocks);
}

/*
 * write_cache_pages_da - walk the list of dirty pages of the given
 * address space and accumulate pages that need writing, and call
 * mpage_da_map_and_submit to map a single contiguous memory region
 * and then write them.
 */
static int write_cache_pages_da(struct address_space *mapping,
				struct writeback_control *wbc,
				struct mpage_da_data *mpd,
				pgoff_t *done_index)
{
	struct buffer_head *bh, *head;
	struct inode *inode = mapping->host;
	struct pagevec pvec;
	unsigned int nr_pages;
	sector_t logical;
	pgoff_t index, end;
	long nr_to_write = wbc->nr_to_write;
	int i, tag, ret = 0;

	memset(mpd, 0, sizeof(struct mpage_da_data));
	mpd->wbc = wbc;
	mpd->inode = inode;
	pagevec_init(&pvec, 0);
	index = wbc->range_start >> PAGE_CACHE_SHIFT;
	end = wbc->range_end >> PAGE_CACHE_SHIFT;

	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;

	*done_index = index;
	while (index <= end) {
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			return 0;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end)
				goto out;

			*done_index = page->index + 1;

			/*
			 * If we can't merge this page, and we have
			 * accumulated a contiguous region, write it
			 */
			if ((mpd->next_page != page->index) &&
			    (mpd->next_page != mpd->first_page)) {
				mpage_da_map_and_submit(mpd);
				goto ret_extent_tail;
			}

			lock_page(page);

			/*
			 * If the page is no longer dirty, or its
			 * mapping no longer corresponds to the inode we
			 * are writing (which means it has been
			 * truncated or invalidated), or the page is
			 * already under writeback and we are not
			 * doing a data integrity writeback, skip the page
			 */
			if (!PageDirty(page) ||
			    (PageWriteback(page) &&
			     (wbc->sync_mode == WB_SYNC_NONE)) ||
			    unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			wait_on_page_writeback(page);
			BUG_ON(PageWriteback(page));

			if (mpd->next_page != page->index)
				mpd->first_page = page->index;
			mpd->next_page = page->index + 1;
			logical = (sector_t) page->index <<
				(PAGE_CACHE_SHIFT - inode->i_blkbits);

			if (!page_has_buffers(page)) {
				mpage_add_bh_to_extent(mpd, logical,
						       PAGE_CACHE_SIZE,
						       (1 << BH_Dirty) | (1 << BH_Uptodate));
				if (mpd->io_done)
					goto ret_extent_tail;
			} else {
				/*
				 * Page with regular buffer heads,
				 * just add all dirty ones
				 */
				head = page_buffers(page);
				bh = head;
				do {
					BUG_ON(buffer_locked(bh));
					/*
					 * We need to try to allocate
					 * unmapped blocks in the same page.
					 * Otherwise we won't make progress
					 * with the page in ext4_writepage
					 */
					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
						mpage_add_bh_to_extent(mpd, logical,
								       bh->b_size,
								       bh->b_state);
						if (mpd->io_done)
							goto ret_extent_tail;
					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
						/*
						 * mapped dirty buffer. We need
						 * to update the b_state
						 * because we look at b_state
						 * in mpage_da_map_blocks.  We
						 * don't update b_size because
						 * if we find an unmapped
						 * buffer_head later we need to
						 * use the b_state flag of that
						 * buffer_head.
						 */
						if (mpd->b_size == 0)
							mpd->b_state = bh->b_state & BH_FLAGS;
					}
					logical++;
				} while ((bh = bh->b_this_page) != head);
			}

			if (nr_to_write > 0) {
				nr_to_write--;
				if (nr_to_write == 0 &&
				    wbc->sync_mode == WB_SYNC_NONE)
					/*
					 * We stop writing back only if we are
					 * not doing integrity sync. In case of
					 * integrity sync we have to keep going
					 * because someone may be concurrently
					 * dirtying pages, and we might have
					 * synced a lot of newly appeared dirty
					 * pages, but have not synced all of the
					 * old dirty pages.
					 */
					goto out;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	return 0;
ret_extent_tail:
	ret = MPAGE_DA_EXTENT_TAIL;
out:
	pagevec_release(&pvec);
	cond_resched();
	return ret;
}

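/*
 * Example (illustrative sketch of the iteration skeleton above): walk a
 * mapping's pages by radix-tree tag in PAGEVEC_SIZE batches, releasing
 * each batch and rescheduling between rounds. All of the mpd/extent
 * bookkeeping is omitted; process_page() is a hypothetical callback.
 */
static void walk_tagged_pages_sketch(struct address_space *mapping,
				     pgoff_t index, pgoff_t end, int tag,
				     void (*process_page)(struct page *))
{
	struct pagevec pvec;
	unsigned int nr_pages, i;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++)
			process_page(pvec.pages[i]);
		pagevec_release(&pvec);
		cond_resched();
	}
}
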

static int ext4_da_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	pgoff_t index;
	int range_whole = 0;
	handle_t *handle = NULL;
	struct mpage_da_data mpd;
	struct inode *inode = mapping->host;
	int pages_written = 0;
	unsigned int max_pages;
	int range_cyclic, cycled = 1, io_done = 0;
	int needed_blocks, ret = 0;
	long desired_nr_to_write, nr_to_writebump = 0;
	loff_t range_start = wbc->range_start;
	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
	pgoff_t done_index = 0;
	pgoff_t end;

	trace_ext4_da_writepages(inode, wbc);

	/*
	 * No pages to write? This is mainly a kludge to avoid starting
	 * a transaction for special inodes like the journal inode on last
	 * iput() because that could violate lock ordering on umount
	 */
	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	/*
	 * If the filesystem has aborted, it is read-only, so return
	 * right away instead of dumping stack traces later on that
	 * will obscure the real source of the problem.  We test
	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
	 * the latter could be true if the filesystem is mounted
	 * read-only, and in that case, ext4_da_writepages should
	 * *never* be called, so if that ever happens, we would want
	 * the stack trace.
	 */
	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
		return -EROFS;

	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
		range_whole = 1;

	range_cyclic = wbc->range_cyclic;
	if (wbc->range_cyclic) {
		index = mapping->writeback_index;
		if (index)
			cycled = 0;
		wbc->range_start = index << PAGE_CACHE_SHIFT;
		wbc->range_end = LLONG_MAX;
		wbc->range_cyclic = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
	}

	/*
	 * This works around two forms of stupidity.  The first is in
	 * the writeback code, which caps the maximum number of pages
	 * written to be 1024 pages.  This is wrong on multiple
	 * levels; different architectures have a different page size,
	 * which changes the maximum amount of data which gets
	 * written.  Secondly, 4 megabytes is way too small.  XFS
	 * forces this value to be 16 megabytes by multiplying the
	 * nr_to_write parameter by four, and then relies on its
	 * allocator to allocate larger extents to make them
	 * contiguous.  Unfortunately this brings us to the second
	 * stupidity, which is that ext4's mballoc code only allocates
	 * at most 2048 blocks.  So we force contiguous writes up to
	 * the number of dirty blocks in the inode, or
	 * sbi->s_max_writeback_mb_bump, whichever is smaller.
	 */
	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
	if (!range_cyclic && range_whole) {
		if (wbc->nr_to_write == LONG_MAX)
			desired_nr_to_write = wbc->nr_to_write;
		else
			desired_nr_to_write = wbc->nr_to_write * 8;
	} else
		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
							   max_pages);
	if (desired_nr_to_write > max_pages)
		desired_nr_to_write = max_pages;

	if (wbc->nr_to_write < desired_nr_to_write) {
		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
		wbc->nr_to_write = desired_nr_to_write;
	}

retry:
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);

	while (!ret && wbc->nr_to_write > 0) {

		/*
		 * We insert one extent at a time, so we need the
		 * credits for a single extent allocation.
		 * Journalled mode is currently not supported
		 * by delalloc.
		 */
		BUG_ON(ext4_should_journal_data(inode));
		needed_blocks = ext4_da_writepages_trans_blocks(inode);

		/* start a new transaction */
		handle = ext4_journal_start(inode, needed_blocks);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
			       "%ld pages, ino %lu; err %d", __func__,
				wbc->nr_to_write, inode->i_ino, ret);
			goto out_writepages;
		}

		/*
		 * Now call write_cache_pages_da() to find the next
		 * contiguous region of logical blocks that need
		 * blocks to be allocated by ext4 and submit them.
		 */
		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
		/*
		 * If we have a contiguous extent of pages and we
		 * haven't done the I/O yet, map the blocks and submit
		 * them for I/O.
		 */
		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
			mpage_da_map_and_submit(&mpd);
			ret = MPAGE_DA_EXTENT_TAIL;
		}
		trace_ext4_da_write_pages(inode, &mpd);
		wbc->nr_to_write -= mpd.pages_written;

		ext4_journal_stop(handle);

		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
			/* commit the transaction which would
			 * free blocks released in the transaction
			 * and try again */
			jbd2_journal_force_commit_nested(sbi->s_journal);
			ret = 0;
		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
			/*
			 * got one extent now try with
			 * rest of the pages
			 */
			pages_written += mpd.pages_written;
			ret = 0;
			io_done = 1;
		} else if (wbc->nr_to_write)
			/*
			 * There is no more writeout needed, or we
			 * requested a non-blocking writeout and
			 * found the device congested
			 */
			break;
	}
	if (!io_done && !cycled) {
		cycled = 1;
		index = 0;
		wbc->range_start = index << PAGE_CACHE_SHIFT;
		wbc->range_end = mapping->writeback_index - 1;
		goto retry;
	}

	/* Update index */
	wbc->range_cyclic = range_cyclic;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		/*
		 * set the writeback_index so that range_cyclic
		 * mode will write it back later
		 */
		mapping->writeback_index = done_index;

out_writepages:
	wbc->nr_to_write -= nr_to_writebump;
	wbc->range_start = range_start;
	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
	return ret;
}

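/*
 * Worked example (illustrative) of the cyclic retry above: suppose
 * mapping->writeback_index is 100 when a range_cyclic writeback starts.
 * cycled is set to 0 and the first pass covers pages 100..EOF. If that
 * pass finishes without doing any I/O (io_done == 0), the retry resets
 * index to 0 and scans pages 0..99, so every dirty page gets one chance
 * per invocation.
 */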
#define FALL_BACK_TO_NONDELALLOC 1
static int ext4_nonda_switch(struct super_block *sb)
{
	s64 free_blocks, dirty_blocks;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	/*
	 * Switch to non-delalloc mode if we are running low on free
	 * blocks. The free block accounting via percpu counters can
	 * get slightly wrong with percpu_counter_batch getting
	 * accumulated on each CPU without updating global counters,
	 * and delalloc needs accurate free block accounting. So switch
	 * to non-delalloc when we are near the error range.
	 */
	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
	if (2 * free_blocks < 3 * dirty_blocks ||
		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
		/*
		 * The free block count is less than 150% of the dirty
		 * blocks, or the free blocks are below the watermark.
		 */
		return 1;
	}
	/*
	 * Even if we don't switch but are nearing capacity,
	 * start pushing delalloc when 1/2 of free blocks are dirty.
	 */
	if (free_blocks < 2 * dirty_blocks)
		writeback_inodes_sb_if_idle(sb);

	return 0;
}

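/*
 * Worked example (illustrative): with free_blocks = 1200 and
 * dirty_blocks = 1000, the first test fires because 2 * 1200 = 2400 is
 * below 3 * 1000 = 3000 (free is under 150% of dirty), so writes fall
 * back to the non-delalloc path. With free_blocks = 1900 neither the
 * 150% test nor the watermark test fires (assuming a small
 * EXT4_FREEBLOCKS_WATERMARK), but 1900 < 2 * 1000, so background
 * writeback is still nudged via writeback_inodes_sb_if_idle().
 */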
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	int ret, retries = 0;
	struct page *page;
	pgoff_t index;
	struct inode *inode = mapping->host;
	handle_t *handle;

	index = pos >> PAGE_CACHE_SHIFT;

	if (ext4_nonda_switch(inode->i_sb)) {
		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
		return ext4_write_begin(file, mapping, pos,
					len, flags, pagep, fsdata);
	}
	*fsdata = (void *)0;
	trace_ext4_da_write_begin(inode, pos, len, flags);
retry:
	/*
	 * With delayed allocation, we don't log the i_disksize update
	 * if there is delayed block allocation. But we still need to
	 * journal the i_disksize update if a write to the end of the
	 * file lands on an already mapped buffer.
	 */
	handle = ext4_journal_start(inode, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}
	/* We cannot recurse into the filesystem as the transaction is already
	 * started */
	flags |= AOP_FLAG_NOFS;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page) {
		ext4_journal_stop(handle);
		ret = -ENOMEM;
		goto out;
	}
	*pagep = page;

	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
	if (ret < 0) {
		unlock_page(page);
		ext4_journal_stop(handle);
		page_cache_release(page);
		/*
		 * block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 */
		if (pos + len > inode->i_size)
			ext4_truncate_failed_write(inode);
	}

	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}

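/*
 * Example (illustrative sketch): the -ENOSPC retry pattern used above.
 * ext4_should_retry_alloc() forces a journal commit to free recently
 * released blocks and returns nonzero for a bounded number of attempts,
 * tracked through 'retries'. try_alloc is a hypothetical callback.
 */
static int alloc_with_retry_sketch(struct inode *inode,
				   int (*try_alloc)(struct inode *))
{
	int ret, retries = 0;

	do {
		ret = try_alloc(inode);
	} while (ret == -ENOSPC &&
		 ext4_should_retry_alloc(inode->i_sb, &retries));
	return ret;
}
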
/*
 * Check if we should update i_disksize
 * when the write is to the end of file but does not require block allocation.
 */
static int ext4_da_should_update_i_disksize(struct page *page,
					    unsigned long offset)
{
	struct buffer_head *bh;
	struct inode *inode = page->mapping->host;
	unsigned int idx;
	int i;

	bh = page_buffers(page);
	idx = offset >> inode->i_blkbits;

	for (i = 0; i < idx; i++)
		bh = bh->b_this_page;

	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
		return 0;
	return 1;
}

static int ext4_da_write_end(struct file *file,
			     struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	handle_t *handle = ext4_journal_current_handle();
	loff_t new_i_size;
	unsigned long start, end;
	int write_mode = (int)(unsigned long)fsdata;

	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
		if (ext4_should_order_data(inode)) {
			return ext4_ordered_write_end(file, mapping, pos,
					len, copied, page, fsdata);
		} else if (ext4_should_writeback_data(inode)) {
			return ext4_writeback_write_end(file, mapping, pos,
					len, copied, page, fsdata);
		} else {
			BUG();
		}
	}

	trace_ext4_da_write_end(inode, pos, len, copied);
	start = pos & (PAGE_CACHE_SIZE - 1);
	end = start + copied - 1;

	/*
	 * generic_write_end() will run mark_inode_dirty() if i_size
	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
	 * into that.
	 */

	new_i_size = pos + copied;
	if (new_i_size > EXT4_I(inode)->i_disksize) {
		if (ext4_da_should_update_i_disksize(page, end)) {
			down_write(&EXT4_I(inode)->i_data_sem);
			if (new_i_size > EXT4_I(inode)->i_disksize) {
				/*
				 * Updating i_disksize when extending file
				 * without needing block allocation
				 */
				if (ext4_should_order_data(inode))
					ret = ext4_jbd2_file_inode(handle,
								   inode);

				EXT4_I(inode)->i_disksize = new_i_size;
			}
			up_write(&EXT4_I(inode)->i_data_sem);
			/* We need to mark the inode dirty even if
			 * new_i_size is less than inode->i_size
			 * but greater than i_disksize. (hint: delalloc)
			 */
			ext4_mark_inode_dirty(handle, inode);
		}
	}
	ret2 = generic_write_end(file, mapping, pos, len, copied,
				 page, fsdata);
	copied = ret2;
	if (ret2 < 0)
		ret = ret2;
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	return ret ? ret : copied;
}

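/*
 * Worked example (illustrative): take i_size = i_disksize = 4000 with
 * the enclosing 4K block already allocated. A 90-byte append makes
 * new_i_size = 4090 > i_disksize, and the buffer under 'end' is mapped
 * (neither delayed nor unwritten), so ext4_da_should_update_i_disksize()
 * returns 1 and i_disksize is raised to 4090 under i_data_sem with no
 * block allocation; marking the inode dirty then gets the update into
 * the journal.
 */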
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
	/*
	 * Drop reserved blocks
	 */
	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		goto out;

	ext4_da_page_release_reservation(page, offset);

out:
	ext4_invalidatepage(page, offset);

	return;
}

/*
 * Force all delayed allocation blocks to be allocated for a given inode.
 */
int ext4_alloc_da_blocks(struct inode *inode)
{
	trace_ext4_alloc_da_blocks(inode);

	if (!EXT4_I(inode)->i_reserved_data_blocks &&
	    !EXT4_I(inode)->i_reserved_meta_blocks)
		return 0;

	/*
	 * We do something simple for now.  The filemap_flush() will
	 * also start triggering a write of the data blocks, which is
	 * not strictly speaking necessary (and for users of
	 * laptop_mode, not even desirable).  However, to do otherwise
	 * would require replicating code paths in:
	 *
	 * ext4_da_writepages() ->
	 *    write_cache_pages() ---> (via passed in callback function)
	 *       __mpage_da_writepage() -->
	 *          mpage_add_bh_to_extent()
	 *          mpage_da_map_blocks()
	 *
	 * The problem is that write_cache_pages(), located in
	 * mm/page-writeback.c, marks pages clean in preparation for
	 * doing I/O, which is not desirable if we're not planning on
	 * doing I/O at all.
	 *
	 * We could call write_cache_pages(), and then redirty all of
	 * the pages by calling redirty_page_for_writepage() but that
	 * would be ugly in the extreme.  So instead we would need to
	 * replicate parts of the code in the above functions,
	 * simplifying them because we wouldn't actually intend to
	 * write out the pages, but rather only collect contiguous
	 * logical block extents, call the multi-block allocator, and
	 * then update the buffer heads with the block allocations.
	 *
	 * For now, though, we'll cheat by calling filemap_flush(),
	 * which will map the blocks, and start the I/O, but not
	 * actually wait for the I/O to complete.
	 */
	return filemap_flush(inode->i_mapping);
}

/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext4 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
			test_opt(inode->i_sb, DELALLOC)) {
		/*
		 * With delalloc we want to sync the file
		 * so that we can make sure we allocate
		 * blocks for the file
		 */
		filemap_write_and_wait(mapping);
	}

	if (EXT4_JOURNAL(inode) &&
	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
		/*
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen.
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DOS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.)
		 *
		 * NB. EXT4_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
		journal = EXT4_JOURNAL(inode);
		jbd2_journal_lock_updates(journal);
		err = jbd2_journal_flush(journal);
		jbd2_journal_unlock_updates(journal);

		if (err)
			return 0;
	}

	return generic_block_bmap(mapping, block, ext4_get_block);
}

static int ext4_readpage(struct file *file, struct page *page)
{
	trace_ext4_readpage(page);
	return mpage_readpage(page, ext4_get_block);
}

static int
ext4_readpages(struct file *file, struct address_space *mapping,
	       struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}

static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh;
	unsigned int curr_off = 0;

	if (!page_has_buffers(page))
		return;
	head = bh = page_buffers(page);
	do {
		if (offset <= curr_off && test_clear_buffer_uninit(bh)
					&& bh->b_private) {
			ext4_free_io_end(bh->b_private);
			bh->b_private = NULL;
			bh->b_end_io = NULL;
		}
		curr_off = curr_off + bh->b_size;
		bh = bh->b_this_page;
	} while (bh != head);
}

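/*
 * Example (illustrative sketch): buffer heads hang off a page as a
 * circular singly linked list via b_this_page, which is why the walk
 * above must compare against the saved head rather than NULL. The
 * helper below is hypothetical.
 */
static unsigned int count_dirty_buffers_sketch(struct page *page)
{
	struct buffer_head *head, *bh;
	unsigned int nr = 0;

	head = bh = page_buffers(page);
	do {
		if (buffer_dirty(bh))
			nr++;
		bh = bh->b_this_page;
	} while (bh != head);
	return nr;
}
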
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

	trace_ext4_invalidatepage(page, offset);

	/*
	 * free any io_end structure allocated for buffers to be discarded
	 */
	if (ext4_should_dioread_nolock(page->mapping->host))
		ext4_invalidatepage_free_endio(page, offset);
	/*
	 * If it's a full truncate we just forget about the pending dirtying
	 */
	if (offset == 0)
		ClearPageChecked(page);

	if (journal)
		jbd2_journal_invalidatepage(journal, page, offset);
	else
		block_invalidatepage(page, offset);
}

static int ext4_releasepage(struct page *page, gfp_t wait)
{
	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

	trace_ext4_releasepage(page);

	WARN_ON(PageChecked(page));
	if (!page_has_buffers(page))
		return 0;
	if (journal)
		return jbd2_journal_try_to_free_buffers(journal, page, wait);
	else
		return try_to_free_buffers(page);
}

/*
 * O_DIRECT for ext3 (or indirect map) based files
 *
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list.  So recovery will truncate it back to the original size
 * if the machine crashes during the write.
 *
 * If the O_DIRECT write is instantiating holes inside i_size and the machine
 * crashes then stale disk data _may_ be exposed inside the file. But current
 * VFS code falls back to the buffered path in that case so we are safe.
 */
static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
				  const struct iovec *iov, loff_t offset,
				  unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext4_inode_info *ei = EXT4_I(inode);
	handle_t *handle;
	ssize_t ret;
	int orphan = 0;
	size_t count = iov_length(iov, nr_segs);
	int retries = 0;

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		if (final_size > inode->i_size) {
			/* Credits for sb + inode write */
			handle = ext4_journal_start(inode, 2);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				goto out;
			}
			ret = ext4_orphan_add(handle, inode);
			if (ret) {
				ext4_journal_stop(handle);
				goto out;
			}
			orphan = 1;
			ei->i_disksize = inode->i_size;
			ext4_journal_stop(handle);
		}
	}

retry:
	if (rw == READ && ext4_should_dioread_nolock(inode))
		ret = __blockdev_direct_IO(rw, iocb, inode,
					   inode->i_sb->s_bdev, iov,
					   offset, nr_segs,
					   ext4_get_block, NULL, NULL, 0);
	else {
		ret = blockdev_direct_IO(rw, iocb, inode,
					 inode->i_sb->s_bdev, iov,
					 offset, nr_segs,
					 ext4_get_block, NULL);

		if (unlikely((rw & WRITE) && ret < 0)) {
			loff_t isize = i_size_read(inode);
			loff_t end = offset + iov_length(iov, nr_segs);

			if (end > isize)
				ext4_truncate_failed_write(inode);
		}
	}
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;

	if (orphan) {
		int err;

		/* Credits for sb + inode write */
		handle = ext4_journal_start(inode, 2);
		if (IS_ERR(handle)) {
			/* This is really bad luck. We've written the data
			 * but cannot extend i_size. Bail out and pretend
			 * the write failed... */
			ret = PTR_ERR(handle);
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);

			goto out;
		}
		if (inode->i_nlink)
			ext4_orphan_del(handle, inode);
		if (ret > 0) {
			loff_t end = offset + ret;
			if (end > inode->i_size) {
3540 ei->i_disksize = end; 3540 ei->i_disksize = end;
3541 i_size_write(inode, end); 3541 i_size_write(inode, end);
3542 /* 3542 /*
3543 * We're going to return a positive `ret' 3543 * We're going to return a positive `ret'
3544 * here due to non-zero-length I/O, so there's 3544 * here due to non-zero-length I/O, so there's
3545 * no way of reporting error returns from 3545 * no way of reporting error returns from
3546 * ext4_mark_inode_dirty() to userspace. So 3546 * ext4_mark_inode_dirty() to userspace. So
3547 * ignore it. 3547 * ignore it.
3548 */ 3548 */
3549 ext4_mark_inode_dirty(handle, inode); 3549 ext4_mark_inode_dirty(handle, inode);
3550 } 3550 }
3551 } 3551 }
3552 err = ext4_journal_stop(handle); 3552 err = ext4_journal_stop(handle);
3553 if (ret == 0) 3553 if (ret == 0)
3554 ret = err; 3554 ret = err;
3555 } 3555 }
3556 out: 3556 out:
3557 return ret; 3557 return ret;
3558 } 3558 }
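
The orphan-list protocol in ext4_ind_direct_IO() is easier to see stripped of the journalling machinery. Below is a minimal userspace sketch of the same control flow, under invented names (fake_inode, do_write, dio_extending_write are illustrative stand-ins, not kernel APIs): record crash-recovery intent before an extending write, retry on ENOSPC, and publish the new size only once the write has succeeded.

    #include <stdio.h>
    #include <errno.h>

    struct fake_inode { long i_size; long i_disksize; int on_orphan_list; };

    static int attempts;

    static long do_write(long count)            /* fails once with ENOSPC */
    {
            if (attempts++ == 0) {
                    errno = ENOSPC;
                    return -1;
            }
            return count;                       /* bytes written */
    }

    static long dio_extending_write(struct fake_inode *inode,
                                    long offset, long count)
    {
            long ret;
            int retries = 0;

            if (offset + count > inode->i_size) {
                    /* log recovery intent before touching the file */
                    inode->on_orphan_list = 1;
                    printf("orphan add: recovery truncates back to %ld\n",
                           inode->i_size);
            }
    retry:
            ret = do_write(count);
            if (ret < 0 && errno == ENOSPC && retries++ < 3)
                    goto retry;         /* ext4_should_retry_alloc() stand-in */

            if (inode->on_orphan_list) {
                    inode->on_orphan_list = 0;          /* orphan del */
                    if (ret > 0 && offset + ret > inode->i_size)
                            inode->i_size = inode->i_disksize = offset + ret;
            }
            return ret;
    }

    int main(void)
    {
            struct fake_inode inode = { 100, 100, 0 };
            long ret = dio_extending_write(&inode, 100, 50);

            printf("ret=%ld i_size=%ld\n", ret, inode.i_size);  /* ret=50 i_size=150 */
            return 0;
    }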
3559 3559
3560 /* 3560 /*
3561 * ext4_get_block used when preparing for a DIO write or buffer write. 3561 * ext4_get_block used when preparing for a DIO write or buffer write.
3562 * We allocate an uninitialized extent if blocks haven't been allocated. 3562 * We allocate an uninitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after the IO is complete. 3563 * The extent will be converted to initialized after the IO is complete.
3564 */ 3564 */
3565 static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3565 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3566 struct buffer_head *bh_result, int create) 3566 struct buffer_head *bh_result, int create)
3567 { 3567 {
3568 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3568 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3569 inode->i_ino, create); 3569 inode->i_ino, create);
3570 return _ext4_get_block(inode, iblock, bh_result, 3570 return _ext4_get_block(inode, iblock, bh_result,
3571 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3571 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3572 } 3572 }
3573 3573
3574 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3574 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3575 ssize_t size, void *private, int ret, 3575 ssize_t size, void *private, int ret,
3576 bool is_async) 3576 bool is_async)
3577 { 3577 {
3578 ext4_io_end_t *io_end = iocb->private; 3578 ext4_io_end_t *io_end = iocb->private;
3579 struct workqueue_struct *wq; 3579 struct workqueue_struct *wq;
3580 unsigned long flags; 3580 unsigned long flags;
3581 struct ext4_inode_info *ei; 3581 struct ext4_inode_info *ei;
3582 3582
3583 /* if not async direct IO, or a dio write of 0 bytes, just return */ 3583 /* if not async direct IO, or a dio write of 0 bytes, just return */
3584 if (!io_end || !size) 3584 if (!io_end || !size)
3585 goto out; 3585 goto out;
3586 3586
3587 ext_debug("ext4_end_io_dio(): io_end 0x%p" 3587 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3588 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 3588 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3589 iocb->private, io_end->inode->i_ino, iocb, offset, 3589 iocb->private, io_end->inode->i_ino, iocb, offset,
3590 size); 3590 size);
3591 3591
3592 /* if not aio dio with unwritten extents, just free io and return */ 3592 /* if not aio dio with unwritten extents, just free io and return */
3593 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 3593 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3594 ext4_free_io_end(io_end); 3594 ext4_free_io_end(io_end);
3595 iocb->private = NULL; 3595 iocb->private = NULL;
3596 out: 3596 out:
3597 if (is_async) 3597 if (is_async)
3598 aio_complete(iocb, ret, 0); 3598 aio_complete(iocb, ret, 0);
3599 return; 3599 return;
3600 } 3600 }
3601 3601
3602 io_end->offset = offset; 3602 io_end->offset = offset;
3603 io_end->size = size; 3603 io_end->size = size;
3604 if (is_async) { 3604 if (is_async) {
3605 io_end->iocb = iocb; 3605 io_end->iocb = iocb;
3606 io_end->result = ret; 3606 io_end->result = ret;
3607 } 3607 }
3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3609 3609
3610 /* Add the io_end to the per-inode completed aio dio list */ 3610 /* Add the io_end to the per-inode completed aio dio list */
3611 ei = EXT4_I(io_end->inode); 3611 ei = EXT4_I(io_end->inode);
3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3613 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3613 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3615 3615
3616 /* queue the work to convert unwritten extents to written */ 3616 /* queue the work to convert unwritten extents to written */
3617 queue_work(wq, &io_end->work); 3617 queue_work(wq, &io_end->work);
3618 iocb->private = NULL; 3618 iocb->private = NULL;
3619 } 3619 }
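
The callback above is deliberately cheap: it only links the io_end onto a locked per-inode list and hands the expensive unwritten-to-written conversion to a workqueue. A hedged pthreads sketch of that deferral pattern (the io_end, end_io, and worker below are illustrative, not the kernel's types; build with cc -pthread):

    #include <pthread.h>
    #include <stdio.h>

    struct io_end { long offset, size; struct io_end *next; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct io_end *completed_list;

    static void end_io(struct io_end *io)       /* "interrupt" side: cheap */
    {
            pthread_mutex_lock(&list_lock);
            io->next = completed_list;
            completed_list = io;
            pthread_mutex_unlock(&list_lock);
    }

    static void *worker(void *arg)              /* workqueue side: slow */
    {
            pthread_mutex_lock(&list_lock);
            for (struct io_end *io = completed_list; io; io = io->next)
                    printf("convert extents at %ld..%ld to written\n",
                           io->offset, io->offset + io->size);
            pthread_mutex_unlock(&list_lock);
            return NULL;
    }

    int main(void)
    {
            struct io_end a = { 0, 4096 }, b = { 8192, 4096 };
            pthread_t t;

            end_io(&a);
            end_io(&b);
            pthread_create(&t, NULL, worker, NULL);
            pthread_join(t, NULL);
            return 0;
    }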
3620 3620
3621 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 3621 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3622 { 3622 {
3623 ext4_io_end_t *io_end = bh->b_private; 3623 ext4_io_end_t *io_end = bh->b_private;
3624 struct workqueue_struct *wq; 3624 struct workqueue_struct *wq;
3625 struct inode *inode; 3625 struct inode *inode;
3626 unsigned long flags; 3626 unsigned long flags;
3627 3627
3628 if (!test_clear_buffer_uninit(bh) || !io_end) 3628 if (!test_clear_buffer_uninit(bh) || !io_end)
3629 goto out; 3629 goto out;
3630 3630
3631 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { 3631 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3632 printk("sb umounted, discard end_io request for inode %lu\n", 3632 printk("sb umounted, discard end_io request for inode %lu\n",
3633 io_end->inode->i_ino); 3633 io_end->inode->i_ino);
3634 ext4_free_io_end(io_end); 3634 ext4_free_io_end(io_end);
3635 goto out; 3635 goto out;
3636 } 3636 }
3637 3637
3638 io_end->flag = EXT4_IO_END_UNWRITTEN; 3638 io_end->flag = EXT4_IO_END_UNWRITTEN;
3639 inode = io_end->inode; 3639 inode = io_end->inode;
3640 3640
3641 /* Add the io_end to the per-inode completed io list */ 3641 /* Add the io_end to the per-inode completed io list */
3642 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 3642 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3643 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 3643 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3644 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 3644 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3645 3645
3646 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 3646 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3647 /* queue the work to convert unwritten extents to written */ 3647 /* queue the work to convert unwritten extents to written */
3648 queue_work(wq, &io_end->work); 3648 queue_work(wq, &io_end->work);
3649 out: 3649 out:
3650 bh->b_private = NULL; 3650 bh->b_private = NULL;
3651 bh->b_end_io = NULL; 3651 bh->b_end_io = NULL;
3652 clear_buffer_uninit(bh); 3652 clear_buffer_uninit(bh);
3653 end_buffer_async_write(bh, uptodate); 3653 end_buffer_async_write(bh, uptodate);
3654 } 3654 }
3655 3655
3656 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) 3656 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3657 { 3657 {
3658 ext4_io_end_t *io_end; 3658 ext4_io_end_t *io_end;
3659 struct page *page = bh->b_page; 3659 struct page *page = bh->b_page;
3660 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; 3660 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3661 size_t size = bh->b_size; 3661 size_t size = bh->b_size;
3662 3662
3663 retry: 3663 retry:
3664 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3664 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3665 if (!io_end) { 3665 if (!io_end) {
3666 pr_warn_ratelimited("%s: allocation fail\n", __func__); 3666 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3667 schedule(); 3667 schedule();
3668 goto retry; 3668 goto retry;
3669 } 3669 }
3670 io_end->offset = offset; 3670 io_end->offset = offset;
3671 io_end->size = size; 3671 io_end->size = size;
3672 /* 3672 /*
3673 * We need to hold a reference to the page to make sure it 3673 * We need to hold a reference to the page to make sure it
3674 * doesn't get evicted before ext4_end_io_work() has a chance 3674 * doesn't get evicted before ext4_end_io_work() has a chance
3675 * to convert the extent from unwritten to written. 3675 * to convert the extent from unwritten to written.
3676 */ 3676 */
3677 io_end->page = page; 3677 io_end->page = page;
3678 get_page(io_end->page); 3678 get_page(io_end->page);
3679 3679
3680 bh->b_private = io_end; 3680 bh->b_private = io_end;
3681 bh->b_end_io = ext4_end_io_buffer_write; 3681 bh->b_end_io = ext4_end_io_buffer_write;
3682 return 0; 3682 return 0;
3683 } 3683 }
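
The b_private/b_end_io pairing used here is a general "attach context plus a completion callback to an object" idiom. A self-contained sketch under invented types (struct buffer, io_ctx, and my_end_io are stand-ins, not kernel structures; the kernel version retries the allocation instead of failing):

    #include <stdio.h>
    #include <stdlib.h>

    struct buffer;
    typedef void (*end_io_fn)(struct buffer *, int uptodate);

    struct buffer { end_io_fn end_io; void *private; };
    struct io_ctx { long offset, size; };

    static void my_end_io(struct buffer *bh, int uptodate)
    {
            struct io_ctx *ctx = bh->private;

            printf("IO done (uptodate=%d) for range %ld..%ld\n",
                   uptodate, ctx->offset, ctx->offset + ctx->size);
            free(ctx);
            bh->private = NULL;         /* detach, as the kernel code does */
            bh->end_io = NULL;
    }

    static int set_bh_endio(struct buffer *bh, long offset, long size)
    {
            struct io_ctx *ctx = malloc(sizeof(*ctx));

            if (!ctx)
                    return -1;          /* kernel version loops and retries */
            ctx->offset = offset;
            ctx->size = size;
            bh->private = ctx;
            bh->end_io = my_end_io;
            return 0;
    }

    int main(void)
    {
            struct buffer bh = { 0 };

            if (set_bh_endio(&bh, 4096, 512) == 0)
                    bh.end_io(&bh, 1);  /* simulate IO completion */
            return 0;
    }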
3684 3684
3685 /* 3685 /*
3686 * For ext4 extent-mapped files, ext4 can do direct-IO writes to holes, 3686 * For ext4 extent-mapped files, ext4 can do direct-IO writes to holes,
3687 * preallocated extents, and writes that extend the file, with no need to 3687 * preallocated extents, and writes that extend the file, with no need to
3688 * fall back to buffered IO. 3688 * fall back to buffered IO.
3689 * 3689 *
3690 * For holes, we allocate those blocks and mark them as uninitialized. 3690 * For holes, we allocate those blocks and mark them as uninitialized.
3691 * If those blocks were preallocated, we make sure they are split, but 3691 * If those blocks were preallocated, we make sure they are split, but
3692 * still keep the range to write as uninitialized. 3692 * still keep the range to write as uninitialized.
3693 * 3693 *
3694 * The unwritten extents will be converted to written when the DIO is 3694 * The unwritten extents will be converted to written when the DIO is
3695 * completed. For async direct IO, since the IO may still be pending on 3695 * completed. For async direct IO, since the IO may still be pending on
3696 * return, we set up an end_io callback function, which will do the 3696 * return, we set up an end_io callback function, which will do the
3697 * conversion when the async direct IO is completed. 3697 * conversion when the async direct IO is completed.
3698 * 3698 *
3699 * If the O_DIRECT write will extend the file then add this inode to the 3699 * If the O_DIRECT write will extend the file then add this inode to the
3700 * orphan list. So recovery will truncate it back to the original size 3700 * orphan list. So recovery will truncate it back to the original size
3701 * if the machine crashes during the write. 3701 * if the machine crashes during the write.
3702 * 3702 *
3703 */ 3703 */
3704 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, 3704 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3705 const struct iovec *iov, loff_t offset, 3705 const struct iovec *iov, loff_t offset,
3706 unsigned long nr_segs) 3706 unsigned long nr_segs)
3707 { 3707 {
3708 struct file *file = iocb->ki_filp; 3708 struct file *file = iocb->ki_filp;
3709 struct inode *inode = file->f_mapping->host; 3709 struct inode *inode = file->f_mapping->host;
3710 ssize_t ret; 3710 ssize_t ret;
3711 size_t count = iov_length(iov, nr_segs); 3711 size_t count = iov_length(iov, nr_segs);
3712 3712
3713 loff_t final_size = offset + count; 3713 loff_t final_size = offset + count;
3714 if (rw == WRITE && final_size <= inode->i_size) { 3714 if (rw == WRITE && final_size <= inode->i_size) {
3715 /* 3715 /*
3716 * We could direct write to holes and fallocate. 3716 * We could direct write to holes and fallocate.
3717 * 3717 *
3718 * Allocated blocks to fill the hole are marked as uninitialized 3718 * Allocated blocks to fill the hole are marked as uninitialized
3719 * to prevent a parallel buffered read from exposing stale data 3719 * to prevent a parallel buffered read from exposing stale data
3720 * before the DIO completes the data IO. 3720 * before the DIO completes the data IO.
3721 * 3721 *
3722 * For previously fallocated extents, ext4's get_block 3722 * For previously fallocated extents, ext4's get_block
3723 * will simply mark the buffer mapped but still 3723 * will simply mark the buffer mapped but still
3724 * keep the extents uninitialized. 3724 * keep the extents uninitialized.
3725 * 3725 *
3726 * For the non-AIO case, we will convert those unwritten extents 3726 * For the non-AIO case, we will convert those unwritten extents
3727 * to written after returning from blockdev_direct_IO. 3727 * to written after returning from blockdev_direct_IO.
3728 * 3728 *
3729 * For async DIO, the conversion needs to be deferred until 3729 * For async DIO, the conversion needs to be deferred until
3730 * the IO is completed. The ext4 end_io callback function 3730 * the IO is completed. The ext4 end_io callback function
3731 * will be called to take care of the conversion work. 3731 * will be called to take care of the conversion work.
3732 * Here, for the async case, we allocate an io_end structure to 3732 * Here, for the async case, we allocate an io_end structure to
3733 * hook to the iocb. 3733 * hook to the iocb.
3734 */ 3734 */
3735 iocb->private = NULL; 3735 iocb->private = NULL;
3736 EXT4_I(inode)->cur_aio_dio = NULL; 3736 EXT4_I(inode)->cur_aio_dio = NULL;
3737 if (!is_sync_kiocb(iocb)) { 3737 if (!is_sync_kiocb(iocb)) {
3738 iocb->private = ext4_init_io_end(inode, GFP_NOFS); 3738 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3739 if (!iocb->private) 3739 if (!iocb->private)
3740 return -ENOMEM; 3740 return -ENOMEM;
3741 /* 3741 /*
3742 * We save the io structure for the current async 3742 * We save the io structure for the current async
3743 * direct IO, so that ext4_map_blocks() can later 3743 * direct IO, so that ext4_map_blocks() can later
3744 * flag the io structure if there are 3744 * flag the io structure if there are
3745 * unwritten extents that need to be converted 3745 * unwritten extents that need to be converted
3746 * when the IO is completed. 3746 * when the IO is completed.
3747 */ 3747 */
3748 EXT4_I(inode)->cur_aio_dio = iocb->private; 3748 EXT4_I(inode)->cur_aio_dio = iocb->private;
3749 } 3749 }
3750 3750
3751 ret = blockdev_direct_IO(rw, iocb, inode, 3751 ret = blockdev_direct_IO(rw, iocb, inode,
3752 inode->i_sb->s_bdev, iov, 3752 inode->i_sb->s_bdev, iov,
3753 offset, nr_segs, 3753 offset, nr_segs,
3754 ext4_get_block_write, 3754 ext4_get_block_write,
3755 ext4_end_io_dio); 3755 ext4_end_io_dio);
3756 if (iocb->private) 3756 if (iocb->private)
3757 EXT4_I(inode)->cur_aio_dio = NULL; 3757 EXT4_I(inode)->cur_aio_dio = NULL;
3758 /* 3758 /*
3759 * The io_end structure takes a reference to the inode; 3759 * The io_end structure takes a reference to the inode;
3760 * that structure needs to be destroyed and the 3760 * that structure needs to be destroyed and the
3761 * reference to the inode needs to be dropped when the IO is 3761 * reference to the inode needs to be dropped when the IO is
3762 * complete, even for a 0-byte write or a failure. 3762 * complete, even for a 0-byte write or a failure.
3763 * 3763 *
3764 * In the successful AIO DIO case, the io_end structure will be 3764 * In the successful AIO DIO case, the io_end structure will be
3765 * destroyed and the reference to the inode will be dropped 3765 * destroyed and the reference to the inode will be dropped
3766 * after the end_io callback function is called. 3766 * after the end_io callback function is called.
3767 * 3767 *
3768 * In the 0-byte write or error case, since 3768 * In the 0-byte write or error case, since
3769 * VFS direct IO won't invoke the end_io callback function, 3769 * VFS direct IO won't invoke the end_io callback function,
3770 * we need to free the end_io structure here. 3770 * we need to free the end_io structure here.
3771 */ 3771 */
3772 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3772 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3773 ext4_free_io_end(iocb->private); 3773 ext4_free_io_end(iocb->private);
3774 iocb->private = NULL; 3774 iocb->private = NULL;
3775 } else if (ret > 0 && ext4_test_inode_state(inode, 3775 } else if (ret > 0 && ext4_test_inode_state(inode,
3776 EXT4_STATE_DIO_UNWRITTEN)) { 3776 EXT4_STATE_DIO_UNWRITTEN)) {
3777 int err; 3777 int err;
3778 /* 3778 /*
3779 * For the non-AIO case, since the IO is already 3779 * For the non-AIO case, since the IO is already
3780 * completed, we can do the conversion right here. 3780 * completed, we can do the conversion right here.
3781 */ 3781 */
3782 err = ext4_convert_unwritten_extents(inode, 3782 err = ext4_convert_unwritten_extents(inode,
3783 offset, ret); 3783 offset, ret);
3784 if (err < 0) 3784 if (err < 0)
3785 ret = err; 3785 ret = err;
3786 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3786 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3787 } 3787 }
3788 return ret; 3788 return ret;
3789 } 3789 }
3790 3790
3791 /* for writes extending past the end of file, we fall back to the old way */ 3791 /* for writes extending past the end of file, we fall back to the old way */
3792 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3792 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3793 } 3793 }
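
The tail of ext4_ext_direct_IO() distinguishes three outcomes of the blockdev_direct_IO() call: error or zero-byte sync completion (free the io_end ourselves), successful sync completion with unwritten extents (convert immediately), and queued async IO (leave cleanup to the callback). A minimal sketch of just that decision; the classify helper is an invented illustration, and EIOCBQUEUED's value is assumed from include/linux/errno.h:

    #include <stdio.h>

    #define EIOCBQUEUED 529     /* assumed kernel value of -EIOCBQUEUED */

    enum dio_outcome { FREE_IO_END, CONVERT_NOW, LEAVE_FOR_CALLBACK };

    static enum dio_outcome classify(long ret, int have_io_end, int unwritten)
    {
            if (ret != -EIOCBQUEUED && ret <= 0 && have_io_end)
                    return FREE_IO_END;     /* 0-byte write or error: no callback */
            if (ret > 0 && unwritten)
                    return CONVERT_NOW;     /* sync IO finished: convert here */
            return LEAVE_FOR_CALLBACK;      /* async: end_io cleans up */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   classify(-5, 1, 0),            /* error     -> FREE_IO_END */
                   classify(4096, 1, 1),          /* sync done -> CONVERT_NOW */
                   classify(-EIOCBQUEUED, 1, 0)); /* queued    -> LEAVE_FOR_CALLBACK */
            return 0;
    }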
3794 3794
3795 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3795 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3796 const struct iovec *iov, loff_t offset, 3796 const struct iovec *iov, loff_t offset,
3797 unsigned long nr_segs) 3797 unsigned long nr_segs)
3798 { 3798 {
3799 struct file *file = iocb->ki_filp; 3799 struct file *file = iocb->ki_filp;
3800 struct inode *inode = file->f_mapping->host; 3800 struct inode *inode = file->f_mapping->host;
3801 ssize_t ret; 3801 ssize_t ret;
3802 3802
3803 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3803 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3804 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3804 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3805 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3805 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3806 else 3806 else
3807 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3808 trace_ext4_direct_IO_exit(inode, offset, 3808 trace_ext4_direct_IO_exit(inode, offset,
3809 iov_length(iov, nr_segs), rw, ret); 3809 iov_length(iov, nr_segs), rw, ret);
3810 return ret; 3810 return ret;
3811 } 3811 }
3812 3812
3813 /* 3813 /*
3814 * Pages can be marked dirty completely asynchronously from ext4's journalling 3814 * Pages can be marked dirty completely asynchronously from ext4's journalling
3815 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3815 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
3816 * much here because ->set_page_dirty is called under VFS locks. The page is 3816 * much here because ->set_page_dirty is called under VFS locks. The page is
3817 * not necessarily locked. 3817 * not necessarily locked.
3818 * 3818 *
3819 * We cannot just dirty the page and leave attached buffers clean, because the 3819 * We cannot just dirty the page and leave attached buffers clean, because the
3820 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3820 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
3821 * or jbddirty because all the journalling code will explode. 3821 * or jbddirty because all the journalling code will explode.
3822 * 3822 *
3823 * So what we do is to mark the page "pending dirty" and next time writepage 3823 * So what we do is to mark the page "pending dirty" and next time writepage
3824 * is called, propagate that into the buffers appropriately. 3824 * is called, propagate that into the buffers appropriately.
3825 */ 3825 */
3826 static int ext4_journalled_set_page_dirty(struct page *page) 3826 static int ext4_journalled_set_page_dirty(struct page *page)
3827 { 3827 {
3828 SetPageChecked(page); 3828 SetPageChecked(page);
3829 return __set_page_dirty_nobuffers(page); 3829 return __set_page_dirty_nobuffers(page);
3830 } 3830 }
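
A hedged sketch of the two-phase "pending dirty" pattern the comment above describes, with plain ints standing in for the kernel's page flags: the constrained path only sets a flag, and the later writepage call reads it back and does the real propagation.

    #include <stdio.h>

    struct page { int checked; int dirty; };

    static void set_page_dirty(struct page *pg)  /* may run under VFS locks */
    {
            pg->checked = 1;                     /* mark "pending dirty" */
            pg->dirty = 1;                       /* dirty the page, not buffers */
    }

    static void writepage(struct page *pg)       /* runs later, may take locks */
    {
            if (pg->checked) {
                    pg->checked = 0;
                    puts("propagating dirty state into buffers");
            }
            pg->dirty = 0;
    }

    int main(void)
    {
            struct page pg = { 0, 0 };

            set_page_dirty(&pg);
            writepage(&pg);
            return 0;
    }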
3831 3831
3832 static const struct address_space_operations ext4_ordered_aops = { 3832 static const struct address_space_operations ext4_ordered_aops = {
3833 .readpage = ext4_readpage, 3833 .readpage = ext4_readpage,
3834 .readpages = ext4_readpages, 3834 .readpages = ext4_readpages,
3835 .writepage = ext4_writepage, 3835 .writepage = ext4_writepage,
3836 .write_begin = ext4_write_begin, 3836 .write_begin = ext4_write_begin,
3837 .write_end = ext4_ordered_write_end, 3837 .write_end = ext4_ordered_write_end,
3838 .bmap = ext4_bmap, 3838 .bmap = ext4_bmap,
3839 .invalidatepage = ext4_invalidatepage, 3839 .invalidatepage = ext4_invalidatepage,
3840 .releasepage = ext4_releasepage, 3840 .releasepage = ext4_releasepage,
3841 .direct_IO = ext4_direct_IO, 3841 .direct_IO = ext4_direct_IO,
3842 .migratepage = buffer_migrate_page, 3842 .migratepage = buffer_migrate_page,
3843 .is_partially_uptodate = block_is_partially_uptodate, 3843 .is_partially_uptodate = block_is_partially_uptodate,
3844 .error_remove_page = generic_error_remove_page, 3844 .error_remove_page = generic_error_remove_page,
3845 }; 3845 };
3846 3846
3847 static const struct address_space_operations ext4_writeback_aops = { 3847 static const struct address_space_operations ext4_writeback_aops = {
3848 .readpage = ext4_readpage, 3848 .readpage = ext4_readpage,
3849 .readpages = ext4_readpages, 3849 .readpages = ext4_readpages,
3850 .writepage = ext4_writepage, 3850 .writepage = ext4_writepage,
3851 .write_begin = ext4_write_begin, 3851 .write_begin = ext4_write_begin,
3852 .write_end = ext4_writeback_write_end, 3852 .write_end = ext4_writeback_write_end,
3853 .bmap = ext4_bmap, 3853 .bmap = ext4_bmap,
3854 .invalidatepage = ext4_invalidatepage, 3854 .invalidatepage = ext4_invalidatepage,
3855 .releasepage = ext4_releasepage, 3855 .releasepage = ext4_releasepage,
3856 .direct_IO = ext4_direct_IO, 3856 .direct_IO = ext4_direct_IO,
3857 .migratepage = buffer_migrate_page, 3857 .migratepage = buffer_migrate_page,
3858 .is_partially_uptodate = block_is_partially_uptodate, 3858 .is_partially_uptodate = block_is_partially_uptodate,
3859 .error_remove_page = generic_error_remove_page, 3859 .error_remove_page = generic_error_remove_page,
3860 }; 3860 };
3861 3861
3862 static const struct address_space_operations ext4_journalled_aops = { 3862 static const struct address_space_operations ext4_journalled_aops = {
3863 .readpage = ext4_readpage, 3863 .readpage = ext4_readpage,
3864 .readpages = ext4_readpages, 3864 .readpages = ext4_readpages,
3865 .writepage = ext4_writepage, 3865 .writepage = ext4_writepage,
3866 .write_begin = ext4_write_begin, 3866 .write_begin = ext4_write_begin,
3867 .write_end = ext4_journalled_write_end, 3867 .write_end = ext4_journalled_write_end,
3868 .set_page_dirty = ext4_journalled_set_page_dirty, 3868 .set_page_dirty = ext4_journalled_set_page_dirty,
3869 .bmap = ext4_bmap, 3869 .bmap = ext4_bmap,
3870 .invalidatepage = ext4_invalidatepage, 3870 .invalidatepage = ext4_invalidatepage,
3871 .releasepage = ext4_releasepage, 3871 .releasepage = ext4_releasepage,
3872 .is_partially_uptodate = block_is_partially_uptodate, 3872 .is_partially_uptodate = block_is_partially_uptodate,
3873 .error_remove_page = generic_error_remove_page, 3873 .error_remove_page = generic_error_remove_page,
3874 }; 3874 };
3875 3875
3876 static const struct address_space_operations ext4_da_aops = { 3876 static const struct address_space_operations ext4_da_aops = {
3877 .readpage = ext4_readpage, 3877 .readpage = ext4_readpage,
3878 .readpages = ext4_readpages, 3878 .readpages = ext4_readpages,
3879 .writepage = ext4_writepage, 3879 .writepage = ext4_writepage,
3880 .writepages = ext4_da_writepages, 3880 .writepages = ext4_da_writepages,
3881 .write_begin = ext4_da_write_begin, 3881 .write_begin = ext4_da_write_begin,
3882 .write_end = ext4_da_write_end, 3882 .write_end = ext4_da_write_end,
3883 .bmap = ext4_bmap, 3883 .bmap = ext4_bmap,
3884 .invalidatepage = ext4_da_invalidatepage, 3884 .invalidatepage = ext4_da_invalidatepage,
3885 .releasepage = ext4_releasepage, 3885 .releasepage = ext4_releasepage,
3886 .direct_IO = ext4_direct_IO, 3886 .direct_IO = ext4_direct_IO,
3887 .migratepage = buffer_migrate_page, 3887 .migratepage = buffer_migrate_page,
3888 .is_partially_uptodate = block_is_partially_uptodate, 3888 .is_partially_uptodate = block_is_partially_uptodate,
3889 .error_remove_page = generic_error_remove_page, 3889 .error_remove_page = generic_error_remove_page,
3890 }; 3890 };
3891 3891
3892 void ext4_set_aops(struct inode *inode) 3892 void ext4_set_aops(struct inode *inode)
3893 { 3893 {
3894 if (ext4_should_order_data(inode) && 3894 if (ext4_should_order_data(inode) &&
3895 test_opt(inode->i_sb, DELALLOC)) 3895 test_opt(inode->i_sb, DELALLOC))
3896 inode->i_mapping->a_ops = &ext4_da_aops; 3896 inode->i_mapping->a_ops = &ext4_da_aops;
3897 else if (ext4_should_order_data(inode)) 3897 else if (ext4_should_order_data(inode))
3898 inode->i_mapping->a_ops = &ext4_ordered_aops; 3898 inode->i_mapping->a_ops = &ext4_ordered_aops;
3899 else if (ext4_should_writeback_data(inode) && 3899 else if (ext4_should_writeback_data(inode) &&
3900 test_opt(inode->i_sb, DELALLOC)) 3900 test_opt(inode->i_sb, DELALLOC))
3901 inode->i_mapping->a_ops = &ext4_da_aops; 3901 inode->i_mapping->a_ops = &ext4_da_aops;
3902 else if (ext4_should_writeback_data(inode)) 3902 else if (ext4_should_writeback_data(inode))
3903 inode->i_mapping->a_ops = &ext4_writeback_aops; 3903 inode->i_mapping->a_ops = &ext4_writeback_aops;
3904 else 3904 else
3905 inode->i_mapping->a_ops = &ext4_journalled_aops; 3905 inode->i_mapping->a_ops = &ext4_journalled_aops;
3906 } 3906 }
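
Selecting one of several function-pointer tables per mode, as ext4_set_aops() does above, is a common C idiom. A toy, runnable reduction (the modes and aops fields here are invented for illustration and omit the delalloc variants):

    #include <stdio.h>

    enum data_mode { ORDERED, WRITEBACK, JOURNALLED };

    struct aops {
            const char *name;
            void (*writepage)(void);
    };

    static void wp_ordered(void)    { puts("ordered writepage"); }
    static void wp_writeback(void)  { puts("writeback writepage"); }
    static void wp_journalled(void) { puts("journalled writepage"); }

    static const struct aops ordered_aops    = { "ordered",    wp_ordered };
    static const struct aops writeback_aops  = { "writeback",  wp_writeback };
    static const struct aops journalled_aops = { "journalled", wp_journalled };

    static const struct aops *select_aops(enum data_mode mode)
    {
            switch (mode) {
            case ORDERED:   return &ordered_aops;
            case WRITEBACK: return &writeback_aops;
            default:        return &journalled_aops;
            }
    }

    int main(void)
    {
            const struct aops *a = select_aops(ORDERED);

            printf("using %s aops\n", a->name);
            a->writepage();
            return 0;
    }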
3907 3907
3908 /* 3908 /*
3909 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3909 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3910 * up to the end of the block which corresponds to `from'. 3910 * up to the end of the block which corresponds to `from'.
3911 * This is required during truncate. We need to physically zero the tail end 3911 * This is required during truncate. We need to physically zero the tail end
3912 * of that block so it doesn't yield old data if the file is later grown. 3912 * of that block so it doesn't yield old data if the file is later grown.
3913 */ 3913 */
3914 int ext4_block_truncate_page(handle_t *handle, 3914 int ext4_block_truncate_page(handle_t *handle,
3915 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
3916 { 3916 {
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length; 3918 unsigned length;
3919 unsigned blocksize; 3919 unsigned blocksize;
3920 struct inode *inode = mapping->host; 3920 struct inode *inode = mapping->host;
3921 3921
3922 blocksize = inode->i_sb->s_blocksize; 3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1)); 3923 length = blocksize - (offset & (blocksize - 1));
3924 3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length); 3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926 } 3926 }
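
The two mask operations above are easier to trust with concrete numbers. A standalone sketch, assuming 4K pages and 1K blocks purely as example values:

    #include <stdio.h>

    int main(void)
    {
            const unsigned page_size = 4096, blocksize = 1024;
            long from = 6500;   /* example new i_size: not block-aligned */

            unsigned offset = from & (page_size - 1);   /* 2404: byte within page */
            unsigned length = blocksize - (offset & (blocksize - 1));   /* 668 */

            /* 2404 + 668 = 3072: exactly the next block boundary in the page */
            printf("zero %u bytes at page offset %u\n", length, offset);
            return 0;
    }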
3927 3927
3928 /* 3928 /*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must 3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds 3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block, it will be shortened to the end of the block 3932 * the end of the block, it will be shortened to the end of the block
3933 * that corresponds to 'from'. 3933 * that corresponds to 'from'.
3934 */ 3934 */
3935 int ext4_block_zero_page_range(handle_t *handle, 3935 int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length) 3936 struct address_space *mapping, loff_t from, loff_t length)
3937 { 3937 {
3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3939 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3940 unsigned blocksize, max, pos; 3940 unsigned blocksize, max, pos;
3941 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
3942 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
3943 struct buffer_head *bh; 3943 struct buffer_head *bh;
3944 struct page *page; 3944 struct page *page;
3945 int err = 0; 3945 int err = 0;
3946 3946
3947 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3947 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3948 mapping_gfp_mask(mapping) & ~__GFP_FS); 3948 mapping_gfp_mask(mapping) & ~__GFP_FS);
3949 if (!page) 3949 if (!page)
3950 return -EINVAL; 3950 return -EINVAL;
3951 3951
3952 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
3953 max = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954 3954
3955 /* 3955 /*
3956 * correct length if it does not fall between 3956 * correct length if it does not fall between
3957 * 'from' and the end of the block 3957 * 'from' and the end of the block
3958 */ 3958 */
3959 if (length > max || length < 0) 3959 if (length > max || length < 0)
3960 length = max; 3960 length = max;
3961 3961
3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3963 3963
3964 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
3965 create_empty_buffers(page, blocksize, 0); 3965 create_empty_buffers(page, blocksize, 0);
3966 3966
3967 /* Find the buffer that contains "offset" */ 3967 /* Find the buffer that contains "offset" */
3968 bh = page_buffers(page); 3968 bh = page_buffers(page);
3969 pos = blocksize; 3969 pos = blocksize;
3970 while (offset >= pos) { 3970 while (offset >= pos) {
3971 bh = bh->b_this_page; 3971 bh = bh->b_this_page;
3972 iblock++; 3972 iblock++;
3973 pos += blocksize; 3973 pos += blocksize;
3974 } 3974 }
3975 3975
3976 err = 0; 3976 err = 0;
3977 if (buffer_freed(bh)) { 3977 if (buffer_freed(bh)) {
3978 BUFFER_TRACE(bh, "freed: skip"); 3978 BUFFER_TRACE(bh, "freed: skip");
3979 goto unlock; 3979 goto unlock;
3980 } 3980 }
3981 3981
3982 if (!buffer_mapped(bh)) { 3982 if (!buffer_mapped(bh)) {
3983 BUFFER_TRACE(bh, "unmapped"); 3983 BUFFER_TRACE(bh, "unmapped");
3984 ext4_get_block(inode, iblock, bh, 0); 3984 ext4_get_block(inode, iblock, bh, 0);
3985 /* unmapped? It's a hole - nothing to do */ 3985 /* unmapped? It's a hole - nothing to do */
3986 if (!buffer_mapped(bh)) { 3986 if (!buffer_mapped(bh)) {
3987 BUFFER_TRACE(bh, "still unmapped"); 3987 BUFFER_TRACE(bh, "still unmapped");
3988 goto unlock; 3988 goto unlock;
3989 } 3989 }
3990 } 3990 }
3991 3991
3992 /* Ok, it's mapped. Make sure it's up-to-date */ 3992 /* Ok, it's mapped. Make sure it's up-to-date */
3993 if (PageUptodate(page)) 3993 if (PageUptodate(page))
3994 set_buffer_uptodate(bh); 3994 set_buffer_uptodate(bh);
3995 3995
3996 if (!buffer_uptodate(bh)) { 3996 if (!buffer_uptodate(bh)) {
3997 err = -EIO; 3997 err = -EIO;
3998 ll_rw_block(READ, 1, &bh); 3998 ll_rw_block(READ, 1, &bh);
3999 wait_on_buffer(bh); 3999 wait_on_buffer(bh);
4000 /* Uhhuh. Read error. Complain and punt. */ 4000 /* Uhhuh. Read error. Complain and punt. */
4001 if (!buffer_uptodate(bh)) 4001 if (!buffer_uptodate(bh))
4002 goto unlock; 4002 goto unlock;
4003 } 4003 }
4004 4004
4005 if (ext4_should_journal_data(inode)) { 4005 if (ext4_should_journal_data(inode)) {
4006 BUFFER_TRACE(bh, "get write access"); 4006 BUFFER_TRACE(bh, "get write access");
4007 err = ext4_journal_get_write_access(handle, bh); 4007 err = ext4_journal_get_write_access(handle, bh);
4008 if (err) 4008 if (err)
4009 goto unlock; 4009 goto unlock;
4010 } 4010 }
4011 4011
4012 zero_user(page, offset, length); 4012 zero_user(page, offset, length);
4013 4013
4014 BUFFER_TRACE(bh, "zeroed end of block"); 4014 BUFFER_TRACE(bh, "zeroed end of block");
4015 4015
4016 err = 0; 4016 err = 0;
4017 if (ext4_should_journal_data(inode)) { 4017 if (ext4_should_journal_data(inode)) {
4018 err = ext4_handle_dirty_metadata(handle, inode, bh); 4018 err = ext4_handle_dirty_metadata(handle, inode, bh);
4019 } else { 4019 } else {
4020 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) 4020 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4021 err = ext4_jbd2_file_inode(handle, inode); 4021 err = ext4_jbd2_file_inode(handle, inode);
4022 mark_buffer_dirty(bh); 4022 mark_buffer_dirty(bh);
4023 } 4023 }
4024 4024
4025 unlock: 4025 unlock:
4026 unlock_page(page); 4026 unlock_page(page);
4027 page_cache_release(page); 4027 page_cache_release(page);
4028 return err; 4028 return err;
4029 } 4029 }
4030 4030
4031 /* 4031 /*
4032 * Probably it should be a library function... search for first non-zero word 4032 * Probably it should be a library function... search for first non-zero word
4033 * or memcmp with zero_page, whatever is better for particular architecture. 4033 * or memcmp with zero_page, whatever is better for particular architecture.
4034 * Linus? 4034 * Linus?
4035 */ 4035 */
4036 static inline int all_zeroes(__le32 *p, __le32 *q) 4036 static inline int all_zeroes(__le32 *p, __le32 *q)
4037 { 4037 {
4038 while (p < q) 4038 while (p < q)
4039 if (*p++) 4039 if (*p++)
4040 return 0; 4040 return 0;
4041 return 1; 4041 return 1;
4042 } 4042 }
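
all_zeroes() is small enough to exercise in userspace directly. A quick sketch with uint32_t standing in for __le32 (byte order is irrelevant when testing against zero):

    #include <stdio.h>
    #include <stdint.h>

    static int all_zeroes(uint32_t *p, uint32_t *q)
    {
            while (p < q)
                    if (*p++)
                            return 0;
            return 1;
    }

    int main(void)
    {
            uint32_t a[4] = { 0, 0, 0, 0 };
            uint32_t b[4] = { 0, 7, 0, 0 };

            printf("%d %d\n", all_zeroes(a, a + 4), all_zeroes(b, b + 4));  /* 1 0 */
            return 0;
    }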
4043 4043
4044 /** 4044 /**
4045 * ext4_find_shared - find the indirect blocks for partial truncation. 4045 * ext4_find_shared - find the indirect blocks for partial truncation.
4046 * @inode: inode in question 4046 * @inode: inode in question
4047 * @depth: depth of the affected branch 4047 * @depth: depth of the affected branch
4048 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 4048 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
4049 * @chain: place to store the pointers to partial indirect blocks 4049 * @chain: place to store the pointers to partial indirect blocks
4050 * @top: place to store the (detached) top of the branch 4050 * @top: place to store the (detached) top of the branch
4051 * 4051 *
4052 * This is a helper function used by ext4_truncate(). 4052 * This is a helper function used by ext4_truncate().
4053 * 4053 *
4054 * When we do truncate() we may have to clean the ends of several 4054 * When we do truncate() we may have to clean the ends of several
4055 * indirect blocks but leave the blocks themselves alive. Block is 4055 * indirect blocks but leave the blocks themselves alive. Block is
4056 * partially truncated if some data below the new i_size is referred 4056 * partially truncated if some data below the new i_size is referred
4057 * from it (and it is on the path to the first completely truncated 4057 * from it (and it is on the path to the first completely truncated
4058 * data block, indeed). We have to free the top of that path along 4058 * data block, indeed). We have to free the top of that path along
4059 * with everything to the right of the path. Since no allocation 4059 * with everything to the right of the path. Since no allocation
4060 * past the truncation point is possible until ext4_truncate() 4060 * past the truncation point is possible until ext4_truncate()
4061 * finishes, we may safely do the latter, but top of branch may 4061 * finishes, we may safely do the latter, but top of branch may
4062 * require special attention - pageout below the truncation point 4062 * require special attention - pageout below the truncation point
4063 * might try to populate it. 4063 * might try to populate it.
4064 * 4064 *
4065 * We atomically detach the top of branch from the tree, store the 4065 * We atomically detach the top of branch from the tree, store the
4066 * block number of its root in *@top, pointers to buffer_heads of 4066 * block number of its root in *@top, pointers to buffer_heads of
4067 * partially truncated blocks - in @chain[].bh and pointers to 4067 * partially truncated blocks - in @chain[].bh and pointers to
4068 * their last elements that should not be removed - in 4068 * their last elements that should not be removed - in
4069 * @chain[].p. Return value is the pointer to last filled element 4069 * @chain[].p. Return value is the pointer to last filled element
4070 * of @chain. 4070 * of @chain.
4071 * 4071 *
4072 * The work left to the caller is to do the actual freeing of subtrees: 4072 * The work left to the caller is to do the actual freeing of subtrees:
4073 * a) free the subtree starting from *@top 4073 * a) free the subtree starting from *@top
4074 * b) free the subtrees whose roots are stored in 4074 * b) free the subtrees whose roots are stored in
4075 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 4075 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
4076 * c) free the subtrees growing from the inode past the @chain[0]. 4076 * c) free the subtrees growing from the inode past the @chain[0].
4077 * (no partially truncated stuff there). */ 4077 * (no partially truncated stuff there). */
4078 4078
4079 static Indirect *ext4_find_shared(struct inode *inode, int depth, 4079 static Indirect *ext4_find_shared(struct inode *inode, int depth,
4080 ext4_lblk_t offsets[4], Indirect chain[4], 4080 ext4_lblk_t offsets[4], Indirect chain[4],
4081 __le32 *top) 4081 __le32 *top)
4082 { 4082 {
4083 Indirect *partial, *p; 4083 Indirect *partial, *p;
4084 int k, err; 4084 int k, err;
4085 4085
4086 *top = 0; 4086 *top = 0;
4087 /* Make k index the deepest non-null offset + 1 */ 4087 /* Make k index the deepest non-null offset + 1 */
4088 for (k = depth; k > 1 && !offsets[k-1]; k--) 4088 for (k = depth; k > 1 && !offsets[k-1]; k--)
4089 ; 4089 ;
4090 partial = ext4_get_branch(inode, k, offsets, chain, &err); 4090 partial = ext4_get_branch(inode, k, offsets, chain, &err);
4091 /* Writer: pointers */ 4091 /* Writer: pointers */
4092 if (!partial) 4092 if (!partial)
4093 partial = chain + k-1; 4093 partial = chain + k-1;
4094 /* 4094 /*
4095 * If the branch acquired continuation since we've looked at it - 4095 * If the branch acquired continuation since we've looked at it -
4096 * fine, it should all survive and (new) top doesn't belong to us. 4096 * fine, it should all survive and (new) top doesn't belong to us.
4097 */ 4097 */
4098 if (!partial->key && *partial->p) 4098 if (!partial->key && *partial->p)
4099 /* Writer: end */ 4099 /* Writer: end */
4100 goto no_top; 4100 goto no_top;
4101 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 4101 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4102 ; 4102 ;
4103 /* 4103 /*
4104 * OK, we've found the last block that must survive. The rest of our 4104 * OK, we've found the last block that must survive. The rest of our
4105 * branch should be detached before unlocking. However, if that rest 4105 * branch should be detached before unlocking. However, if that rest
4106 * of branch is all ours and does not grow immediately from the inode 4106 * of branch is all ours and does not grow immediately from the inode
4107 * it's easier to cheat and just decrement partial->p. 4107 * it's easier to cheat and just decrement partial->p.
4108 */ 4108 */
4109 if (p == chain + k - 1 && p > chain) { 4109 if (p == chain + k - 1 && p > chain) {
4110 p->p--; 4110 p->p--;
4111 } else { 4111 } else {
4112 *top = *p->p; 4112 *top = *p->p;
4113 /* Nope, don't do this in ext4. Must leave the tree intact */ 4113 /* Nope, don't do this in ext4. Must leave the tree intact */
4114 #if 0 4114 #if 0
4115 *p->p = 0; 4115 *p->p = 0;
4116 #endif 4116 #endif
4117 } 4117 }
4118 /* Writer: end */ 4118 /* Writer: end */
4119 4119
4120 while (partial > p) { 4120 while (partial > p) {
4121 brelse(partial->bh); 4121 brelse(partial->bh);
4122 partial--; 4122 partial--;
4123 } 4123 }
4124 no_top: 4124 no_top:
4125 return partial; 4125 return partial;
4126 } 4126 }
4127 4127
4128 /* 4128 /*
4129 * Zero a number of block pointers in either an inode or an indirect block. 4129 * Zero a number of block pointers in either an inode or an indirect block.
4130 * If we restart the transaction we must again get write access to the 4130 * If we restart the transaction we must again get write access to the
4131 * indirect block for further modification. 4131 * indirect block for further modification.
4132 * 4132 *
4133 * We release `count' blocks on disk, but (last - first) may be greater 4133 * We release `count' blocks on disk, but (last - first) may be greater
4134 * than `count' because there can be holes in there. 4134 * than `count' because there can be holes in there.
4135 * 4135 *
4136 * Return 0 on success, 1 on invalid block range 4136 * Return 0 on success, 1 on invalid block range
4137 * and < 0 on fatal error. 4137 * and < 0 on fatal error.
4138 */ 4138 */
4139 static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4139 static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4140 struct buffer_head *bh, 4140 struct buffer_head *bh,
4141 ext4_fsblk_t block_to_free, 4141 ext4_fsblk_t block_to_free,
4142 unsigned long count, __le32 *first, 4142 unsigned long count, __le32 *first,
4143 __le32 *last) 4143 __le32 *last)
4144 { 4144 {
4145 __le32 *p; 4145 __le32 *p;
4146 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4146 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4147 int err; 4147 int err;
4148 4148
4149 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4149 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4150 flags |= EXT4_FREE_BLOCKS_METADATA; 4150 flags |= EXT4_FREE_BLOCKS_METADATA;
4151 4151
4152 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4152 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4153 count)) { 4153 count)) {
4154 EXT4_ERROR_INODE(inode, "attempt to clear invalid " 4154 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4155 "blocks %llu len %lu", 4155 "blocks %llu len %lu",
4156 (unsigned long long) block_to_free, count); 4156 (unsigned long long) block_to_free, count);
4157 return 1; 4157 return 1;
4158 } 4158 }
4159 4159
4160 if (try_to_extend_transaction(handle, inode)) { 4160 if (try_to_extend_transaction(handle, inode)) {
4161 if (bh) { 4161 if (bh) {
4162 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4162 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4163 err = ext4_handle_dirty_metadata(handle, inode, bh); 4163 err = ext4_handle_dirty_metadata(handle, inode, bh);
4164 if (unlikely(err)) 4164 if (unlikely(err))
4165 goto out_err; 4165 goto out_err;
4166 } 4166 }
4167 err = ext4_mark_inode_dirty(handle, inode); 4167 err = ext4_mark_inode_dirty(handle, inode);
4168 if (unlikely(err)) 4168 if (unlikely(err))
4169 goto out_err; 4169 goto out_err;
4170 err = ext4_truncate_restart_trans(handle, inode, 4170 err = ext4_truncate_restart_trans(handle, inode,
4171 blocks_for_truncate(inode)); 4171 blocks_for_truncate(inode));
4172 if (unlikely(err)) 4172 if (unlikely(err))
4173 goto out_err; 4173 goto out_err;
4174 if (bh) { 4174 if (bh) {
4175 BUFFER_TRACE(bh, "retaking write access"); 4175 BUFFER_TRACE(bh, "retaking write access");
4176 err = ext4_journal_get_write_access(handle, bh); 4176 err = ext4_journal_get_write_access(handle, bh);
4177 if (unlikely(err)) 4177 if (unlikely(err))
4178 goto out_err; 4178 goto out_err;
4179 } 4179 }
4180 } 4180 }
4181 4181
4182 for (p = first; p < last; p++) 4182 for (p = first; p < last; p++)
4183 *p = 0; 4183 *p = 0;
4184 4184
4185 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); 4185 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4186 return 0; 4186 return 0;
4187 out_err: 4187 out_err:
4188 ext4_std_error(inode->i_sb, err); 4188 ext4_std_error(inode->i_sb, err);
4189 return err; 4189 return err;
4190 } 4190 }
4191 4191
4192 /** 4192 /**
4193 * ext4_free_data - free a list of data blocks 4193 * ext4_free_data - free a list of data blocks
4194 * @handle: handle for this transaction 4194 * @handle: handle for this transaction
4195 * @inode: inode we are dealing with 4195 * @inode: inode we are dealing with
4196 * @this_bh: indirect buffer_head which contains *@first and *@last 4196 * @this_bh: indirect buffer_head which contains *@first and *@last
4197 * @first: array of block numbers 4197 * @first: array of block numbers
4198 * @last: points immediately past the end of array 4198 * @last: points immediately past the end of array
4199 * 4199 *
4200 * We are freeing all blocks referred from that array (numbers are stored as 4200 * We are freeing all blocks referred from that array (numbers are stored as
4201 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4201 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4202 * 4202 *
4203 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4203 * We accumulate contiguous runs of blocks to free. Conveniently, if these
4204 * blocks are contiguous then releasing them at one time will only affect one 4204 * blocks are contiguous then releasing them at one time will only affect one
4205 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 4205 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4206 * actually use a lot of journal space. 4206 * actually use a lot of journal space.
4207 * 4207 *
4208 * @this_bh will be %NULL if @first and @last point into the inode's direct 4208 * @this_bh will be %NULL if @first and @last point into the inode's direct
4209 * block pointers. 4209 * block pointers.
4210 */ 4210 */
4211 static void ext4_free_data(handle_t *handle, struct inode *inode, 4211 static void ext4_free_data(handle_t *handle, struct inode *inode,
4212 struct buffer_head *this_bh, 4212 struct buffer_head *this_bh,
4213 __le32 *first, __le32 *last) 4213 __le32 *first, __le32 *last)
4214 { 4214 {
4215 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 4215 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
4216 unsigned long count = 0; /* Number of blocks in the run */ 4216 unsigned long count = 0; /* Number of blocks in the run */
4217 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 4217 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
4218 corresponding to 4218 corresponding to
4219 block_to_free */ 4219 block_to_free */
4220 ext4_fsblk_t nr; /* Current block # */ 4220 ext4_fsblk_t nr; /* Current block # */
4221 __le32 *p; /* Pointer into inode/ind 4221 __le32 *p; /* Pointer into inode/ind
4222 for current block */ 4222 for current block */
4223 int err = 0; 4223 int err = 0;
4224 4224
4225 if (this_bh) { /* For indirect block */ 4225 if (this_bh) { /* For indirect block */
4226 BUFFER_TRACE(this_bh, "get_write_access"); 4226 BUFFER_TRACE(this_bh, "get_write_access");
4227 err = ext4_journal_get_write_access(handle, this_bh); 4227 err = ext4_journal_get_write_access(handle, this_bh);
4228 /* Important: if we can't update the indirect pointers 4228 /* Important: if we can't update the indirect pointers
4229 * to the blocks, we can't free them. */ 4229 * to the blocks, we can't free them. */
4230 if (err) 4230 if (err)
4231 return; 4231 return;
4232 } 4232 }
4233 4233
4234 for (p = first; p < last; p++) { 4234 for (p = first; p < last; p++) {
4235 nr = le32_to_cpu(*p); 4235 nr = le32_to_cpu(*p);
4236 if (nr) { 4236 if (nr) {
4237 /* accumulate blocks to free if they're contiguous */ 4237 /* accumulate blocks to free if they're contiguous */
4238 if (count == 0) { 4238 if (count == 0) {
4239 block_to_free = nr; 4239 block_to_free = nr;
4240 block_to_free_p = p; 4240 block_to_free_p = p;
4241 count = 1; 4241 count = 1;
4242 } else if (nr == block_to_free + count) { 4242 } else if (nr == block_to_free + count) {
4243 count++; 4243 count++;
4244 } else { 4244 } else {
4245 err = ext4_clear_blocks(handle, inode, this_bh, 4245 err = ext4_clear_blocks(handle, inode, this_bh,
4246 block_to_free, count, 4246 block_to_free, count,
4247 block_to_free_p, p); 4247 block_to_free_p, p);
4248 if (err) 4248 if (err)
4249 break; 4249 break;
4250 block_to_free = nr; 4250 block_to_free = nr;
4251 block_to_free_p = p; 4251 block_to_free_p = p;
4252 count = 1; 4252 count = 1;
4253 } 4253 }
4254 } 4254 }
4255 } 4255 }
4256 4256
4257 if (!err && count > 0) 4257 if (!err && count > 0)
4258 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4258 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4259 count, block_to_free_p, p); 4259 count, block_to_free_p, p);
4260 if (err < 0) 4260 if (err < 0)
4261 /* fatal error */ 4261 /* fatal error */
4262 return; 4262 return;
4263 4263
4264 if (this_bh) { 4264 if (this_bh) {
4265 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4265 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4266 4266
4267 /* 4267 /*
4268 * The buffer head should have an attached journal head at this 4268 * The buffer head should have an attached journal head at this
4269 * point. However, if the data is corrupted and an indirect 4269 * point. However, if the data is corrupted and an indirect
4270 * block pointed to itself, it would have been detached when 4270 * block pointed to itself, it would have been detached when
4271 * the block was cleared. Check for this instead of OOPSing. 4271 * the block was cleared. Check for this instead of OOPSing.
4272 */ 4272 */
4273 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4273 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4274 ext4_handle_dirty_metadata(handle, inode, this_bh); 4274 ext4_handle_dirty_metadata(handle, inode, this_bh);
4275 else 4275 else
4276 EXT4_ERROR_INODE(inode, 4276 EXT4_ERROR_INODE(inode,
4277 "circular indirect block detected at " 4277 "circular indirect block detected at "
4278 "block %llu", 4278 "block %llu",
4279 (unsigned long long) this_bh->b_blocknr); 4279 (unsigned long long) this_bh->b_blocknr);
4280 } 4280 }
4281 } 4281 }
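
The run-accumulation loop is the heart of ext4_free_data(): holes are skipped without terminating a run, and a run is flushed only when the next block number breaks contiguity. A standalone sketch of that logic, with free_run as a printf stand-in for ext4_free_blocks():

    #include <stdio.h>
    #include <stdint.h>

    static void free_run(uint64_t start, unsigned long count)
    {
            printf("free %lu block(s) starting at %llu\n",
                   count, (unsigned long long)start);
    }

    static void free_data(const uint32_t *first, const uint32_t *last)
    {
            uint64_t block_to_free = 0;
            unsigned long count = 0;

            for (const uint32_t *p = first; p < last; p++) {
                    uint64_t nr = *p;

                    if (!nr)
                            continue;   /* a hole: the run can continue past it */
                    if (count == 0) {
                            block_to_free = nr;             /* start a new run */
                            count = 1;
                    } else if (nr == block_to_free + count) {
                            count++;                        /* extends the run */
                    } else {
                            free_run(block_to_free, count); /* flush old run */
                            block_to_free = nr;
                            count = 1;
                    }
            }
            if (count > 0)
                    free_run(block_to_free, count);         /* final run */
    }

    int main(void)
    {
            /* 103 still extends the 100..102 run: the hole only skips a slot */
            uint32_t blocks[] = { 100, 101, 102, 0, 103, 200, 201, 0, 0, 500 };

            free_data(blocks, blocks + sizeof(blocks) / sizeof(blocks[0]));
            return 0;   /* frees 4 @ 100, 2 @ 200, 1 @ 500 */
    }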
4282 4282
4283 /** 4283 /**
4284 * ext4_free_branches - free an array of branches 4284 * ext4_free_branches - free an array of branches
4285 * @handle: JBD handle for this transaction 4285 * @handle: JBD handle for this transaction
4286 * @inode: inode we are dealing with 4286 * @inode: inode we are dealing with
4287 * @parent_bh: the buffer_head which contains *@first and *@last 4287 * @parent_bh: the buffer_head which contains *@first and *@last
4288 * @first: array of block numbers 4288 * @first: array of block numbers
4289 * @last: pointer immediately past the end of array 4289 * @last: pointer immediately past the end of array
4290 * @depth: depth of the branches to free 4290 * @depth: depth of the branches to free
4291 * 4291 *
4292 * We are freeing all blocks referred from these branches (numbers are 4292 * We are freeing all blocks referred from these branches (numbers are
4293 * stored as little-endian 32-bit) and updating @inode->i_blocks 4293 * stored as little-endian 32-bit) and updating @inode->i_blocks
4294 * appropriately. 4294 * appropriately.
4295 */ 4295 */
4296 static void ext4_free_branches(handle_t *handle, struct inode *inode, 4296 static void ext4_free_branches(handle_t *handle, struct inode *inode,
4297 struct buffer_head *parent_bh, 4297 struct buffer_head *parent_bh,
4298 __le32 *first, __le32 *last, int depth) 4298 __le32 *first, __le32 *last, int depth)
4299 { 4299 {
4300 ext4_fsblk_t nr; 4300 ext4_fsblk_t nr;
4301 __le32 *p; 4301 __le32 *p;
4302 4302
4303 if (ext4_handle_is_aborted(handle)) 4303 if (ext4_handle_is_aborted(handle))
4304 return; 4304 return;
4305 4305
4306 if (depth--) { 4306 if (depth--) {
4307 struct buffer_head *bh; 4307 struct buffer_head *bh;
4308 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4308 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4309 p = last; 4309 p = last;
4310 while (--p >= first) { 4310 while (--p >= first) {
4311 nr = le32_to_cpu(*p); 4311 nr = le32_to_cpu(*p);
4312 if (!nr) 4312 if (!nr)
4313 continue; /* A hole */ 4313 continue; /* A hole */
4314 4314
4315 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4315 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4316 nr, 1)) { 4316 nr, 1)) {
4317 EXT4_ERROR_INODE(inode, 4317 EXT4_ERROR_INODE(inode,
4318 "invalid indirect mapped " 4318 "invalid indirect mapped "
4319 "block %lu (level %d)", 4319 "block %lu (level %d)",
4320 (unsigned long) nr, depth); 4320 (unsigned long) nr, depth);
4321 break; 4321 break;
4322 } 4322 }
4323 4323
4324 /* Go read the buffer for the next level down */ 4324 /* Go read the buffer for the next level down */
4325 bh = sb_bread(inode->i_sb, nr); 4325 bh = sb_bread(inode->i_sb, nr);
4326 4326
4327 /* 4327 /*
4328 * A read failure? Report error and clear slot 4328 * A read failure? Report error and clear slot
4329 * (should be rare). 4329 * (should be rare).
4330 */ 4330 */
4331 if (!bh) { 4331 if (!bh) {
4332 EXT4_ERROR_INODE_BLOCK(inode, nr, 4332 EXT4_ERROR_INODE_BLOCK(inode, nr,
4333 "Read failure"); 4333 "Read failure");
4334 continue; 4334 continue;
4335 } 4335 }
4336 4336
4337 /* This zaps the entire block. Bottom up. */ 4337 /* This zaps the entire block. Bottom up. */
4338 BUFFER_TRACE(bh, "free child branches"); 4338 BUFFER_TRACE(bh, "free child branches");
4339 ext4_free_branches(handle, inode, bh, 4339 ext4_free_branches(handle, inode, bh,
4340 (__le32 *) bh->b_data, 4340 (__le32 *) bh->b_data,
4341 (__le32 *) bh->b_data + addr_per_block, 4341 (__le32 *) bh->b_data + addr_per_block,
4342 depth); 4342 depth);
4343 brelse(bh); 4343 brelse(bh);
4344 4344
4345 /* 4345 /*
4346 * Everything below this pointer has been 4346 * Everything below this pointer has been
4347 * released. Now let this top-of-subtree go. 4347 * released. Now let this top-of-subtree go.
4348 * 4348 *
4349 * We want the freeing of this indirect block to be 4349 * We want the freeing of this indirect block to be
4350 * atomic in the journal with the updating of the 4350 * atomic in the journal with the updating of the
4351 * bitmap block which owns it. So make some room in 4351 * bitmap block which owns it. So make some room in
4352 * the journal. 4352 * the journal.
4353 * 4353 *
4354 * We zero the parent pointer *after* freeing its 4354 * We zero the parent pointer *after* freeing its
4355 * pointee in the bitmaps, so if extend_transaction() 4355 * pointee in the bitmaps, so if extend_transaction()
4356 * for some reason fails to put the bitmap changes and 4356 * for some reason fails to put the bitmap changes and
4357 * the release into the same transaction, recovery 4357 * the release into the same transaction, recovery
4358 * will merely complain about releasing a free block, 4358 * will merely complain about releasing a free block,
4359 * rather than leaking blocks. 4359 * rather than leaking blocks.
4360 */ 4360 */
4361 if (ext4_handle_is_aborted(handle)) 4361 if (ext4_handle_is_aborted(handle))
4362 return; 4362 return;
4363 if (try_to_extend_transaction(handle, inode)) { 4363 if (try_to_extend_transaction(handle, inode)) {
4364 ext4_mark_inode_dirty(handle, inode); 4364 ext4_mark_inode_dirty(handle, inode);
4365 ext4_truncate_restart_trans(handle, inode, 4365 ext4_truncate_restart_trans(handle, inode,
4366 blocks_for_truncate(inode)); 4366 blocks_for_truncate(inode));
4367 } 4367 }
4368 4368
4369 /* 4369 /*
4370 * The forget flag here is critical because if 4370 * The forget flag here is critical because if
4371 * we are journaling (and not doing data 4371 * we are journaling (and not doing data
4372 * journaling), we have to make sure a revoke 4372 * journaling), we have to make sure a revoke
4373 * record is written to prevent the journal 4373 * record is written to prevent the journal
4374 * replay from overwriting the (former) 4374 * replay from overwriting the (former)
4375 * indirect block if it gets reallocated as a 4375 * indirect block if it gets reallocated as a
4376 * data block. This must happen in the same 4376 * data block. This must happen in the same
4377 * transaction where the data blocks are 4377 * transaction where the data blocks are
4378 * actually freed. 4378 * actually freed.
4379 */ 4379 */
4380 ext4_free_blocks(handle, inode, NULL, nr, 1, 4380 ext4_free_blocks(handle, inode, NULL, nr, 1,
4381 EXT4_FREE_BLOCKS_METADATA| 4381 EXT4_FREE_BLOCKS_METADATA|
4382 EXT4_FREE_BLOCKS_FORGET); 4382 EXT4_FREE_BLOCKS_FORGET);
4383 4383
4384 if (parent_bh) { 4384 if (parent_bh) {
4385 /* 4385 /*
4386 * The block which we have just freed is 4386 * The block which we have just freed is
4387 * pointed to by an indirect block: journal it 4387 * pointed to by an indirect block: journal it
4388 */ 4388 */
4389 BUFFER_TRACE(parent_bh, "get_write_access"); 4389 BUFFER_TRACE(parent_bh, "get_write_access");
4390 if (!ext4_journal_get_write_access(handle, 4390 if (!ext4_journal_get_write_access(handle,
4391 parent_bh)){ 4391 parent_bh)){
4392 *p = 0; 4392 *p = 0;
4393 BUFFER_TRACE(parent_bh, 4393 BUFFER_TRACE(parent_bh,
4394 "call ext4_handle_dirty_metadata"); 4394 "call ext4_handle_dirty_metadata");
4395 ext4_handle_dirty_metadata(handle, 4395 ext4_handle_dirty_metadata(handle,
4396 inode, 4396 inode,
4397 parent_bh); 4397 parent_bh);
4398 } 4398 }
4399 } 4399 }
4400 } 4400 }
4401 } else { 4401 } else {
4402 /* We have reached the bottom of the tree. */ 4402 /* We have reached the bottom of the tree. */
4403 BUFFER_TRACE(parent_bh, "free data blocks"); 4403 BUFFER_TRACE(parent_bh, "free data blocks");
4404 ext4_free_data(handle, inode, parent_bh, first, last); 4404 ext4_free_data(handle, inode, parent_bh, first, last);
4405 } 4405 }
4406 } 4406 }
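
For a feel of the fan-out this recursion walks, here is a stand-alone sketch (not part of this commit; the 4 KiB block size is an assumption) computing the address count per indirect block and how many data blocks a branch of each depth can reference:

    #include <stdio.h>

    int main(void)
    {
            unsigned blocksize = 4096;                 /* assumed fs block size */
            unsigned addr_per_block = blocksize / 4;   /* __le32 entries: 1024 */
            unsigned long long span = 1;
            int depth;

            for (depth = 1; depth <= 3; depth++) {
                    span *= addr_per_block;
                    /* depth 1: 1024, depth 2: ~1M, depth 3: ~1G data blocks */
                    printf("depth %d branch spans %llu data blocks\n", depth, span);
            }
            return 0;
    }
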
4407 4407
4408 int ext4_can_truncate(struct inode *inode) 4408 int ext4_can_truncate(struct inode *inode)
4409 { 4409 {
4410 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4411 return 1; 4411 return 1;
4412 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
4413 return 1; 4413 return 1;
4414 if (S_ISLNK(inode->i_mode)) 4414 if (S_ISLNK(inode->i_mode))
4415 return !ext4_inode_is_fast_symlink(inode); 4415 return !ext4_inode_is_fast_symlink(inode);
4416 return 0; 4416 return 0;
4417 } 4417 }
4418 4418
4419 /* 4419 /*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks 4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length 4421 * associated with the given offset and length
4422 * 4422 *
4423 * @inode: File inode 4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin 4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole 4425 * @len: The length of the hole
4426 * 4426 *
4427 * Returns: 0 on success or negative on failure 4427 * Returns: 0 on success or negative on failure
4428 */ 4428 */
4429 4429
4430 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 4430 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431 { 4431 {
4432 struct inode *inode = file->f_path.dentry->d_inode; 4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode)) 4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP; 4434 return -ENOTSUPP;
4435 4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non-extent hole punching */ 4437 /* TODO: Add support for non-extent hole punching */
4438 return -ENOTSUPP; 4438 return -ENOTSUPP;
4439 } 4439 }
4440 4440
4441 return ext4_ext_punch_hole(file, offset, length); 4441 return ext4_ext_punch_hole(file, offset, length);
4442 } 4442 }
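
ext4_punch_hole() is reached from the fallocate(2) path; a minimal userspace trigger looks like the sketch below (the filename is an assumption, and punching requires FALLOC_FL_KEEP_SIZE alongside FALLOC_FL_PUNCH_HOLE):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("testfile", O_RDWR);          /* assumed existing file */
            if (fd < 0)
                    return 1;
            /* Punch a 1 MiB hole at offset 4 MiB, keeping the file size. */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          4 << 20, 1 << 20) < 0)
                    perror("fallocate");        /* fails on non-extent files here */
            close(fd);
            return 0;
    }
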
4443 4443
4444 /* 4444 /*
4445 * ext4_truncate() 4445 * ext4_truncate()
4446 * 4446 *
4447 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
4448 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 4448 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4449 * simultaneously on behalf of the same inode. 4449 * simultaneously on behalf of the same inode.
4450 * 4450 *
4451 * As we work through the truncate and commit bits of it to the journal there 4451 * As we work through the truncate and commit bits of it to the journal there
4452 * is one core, guiding principle: the file's tree must always be consistent on 4452 * is one core, guiding principle: the file's tree must always be consistent on
4453 * disk. We must be able to restart the truncate after a crash. 4453 * disk. We must be able to restart the truncate after a crash.
4454 * 4454 *
4455 * The file's tree may be transiently inconsistent in memory (although it 4455 * The file's tree may be transiently inconsistent in memory (although it
4456 * probably isn't), but whenever we close off and commit a journal transaction, 4456 * probably isn't), but whenever we close off and commit a journal transaction,
4457 * the contents of (the filesystem + the journal) must be consistent and 4457 * the contents of (the filesystem + the journal) must be consistent and
4458 * restartable. It's pretty simple, really: bottom up, right to left (although 4458 * restartable. It's pretty simple, really: bottom up, right to left (although
4459 * left-to-right works OK too). 4459 * left-to-right works OK too).
4460 * 4460 *
4461 * Note that at recovery time, journal replay occurs *before* the restart of 4461 * Note that at recovery time, journal replay occurs *before* the restart of
4462 * truncate against the orphan inode list. 4462 * truncate against the orphan inode list.
4463 * 4463 *
4464 * The committed inode has the new, desired i_size (which is the same as 4464 * The committed inode has the new, desired i_size (which is the same as
4465 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 4465 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
4466 * that this inode's truncate did not complete and it will again call 4466 * that this inode's truncate did not complete and it will again call
4467 * ext4_truncate() to have another go. So there will be instantiated blocks 4467 * ext4_truncate() to have another go. So there will be instantiated blocks
4468 * to the right of the truncation point in a crashed ext4 filesystem. But 4468 * to the right of the truncation point in a crashed ext4 filesystem. But
4469 * that's fine - as long as they are linked from the inode, the post-crash 4469 * that's fine - as long as they are linked from the inode, the post-crash
4470 * ext4_truncate() run will find them and release them. 4470 * ext4_truncate() run will find them and release them.
4471 */ 4471 */
4472 void ext4_truncate(struct inode *inode) 4472 void ext4_truncate(struct inode *inode)
4473 { 4473 {
4474 handle_t *handle; 4474 handle_t *handle;
4475 struct ext4_inode_info *ei = EXT4_I(inode); 4475 struct ext4_inode_info *ei = EXT4_I(inode);
4476 __le32 *i_data = ei->i_data; 4476 __le32 *i_data = ei->i_data;
4477 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4477 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4478 struct address_space *mapping = inode->i_mapping; 4478 struct address_space *mapping = inode->i_mapping;
4479 ext4_lblk_t offsets[4]; 4479 ext4_lblk_t offsets[4];
4480 Indirect chain[4]; 4480 Indirect chain[4];
4481 Indirect *partial; 4481 Indirect *partial;
4482 __le32 nr = 0; 4482 __le32 nr = 0;
4483 int n = 0; 4483 int n = 0;
4484 ext4_lblk_t last_block, max_block; 4484 ext4_lblk_t last_block, max_block;
4485 unsigned blocksize = inode->i_sb->s_blocksize; 4485 unsigned blocksize = inode->i_sb->s_blocksize;
4486 4486
4487 trace_ext4_truncate_enter(inode); 4487 trace_ext4_truncate_enter(inode);
4488 4488
4489 if (!ext4_can_truncate(inode)) 4489 if (!ext4_can_truncate(inode))
4490 return; 4490 return;
4491 4491
4492 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4492 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4493 4493
4494 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4494 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4495 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4495 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4496 4496
4497 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4497 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4498 ext4_ext_truncate(inode); 4498 ext4_ext_truncate(inode);
4499 trace_ext4_truncate_exit(inode); 4499 trace_ext4_truncate_exit(inode);
4500 return; 4500 return;
4501 } 4501 }
4502 4502
4503 handle = start_transaction(inode); 4503 handle = start_transaction(inode);
4504 if (IS_ERR(handle)) 4504 if (IS_ERR(handle))
4505 return; /* AKPM: return what? */ 4505 return; /* AKPM: return what? */
4506 4506
4507 last_block = (inode->i_size + blocksize-1) 4507 last_block = (inode->i_size + blocksize-1)
4508 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4508 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4509 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 4509 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4510 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4510 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4511 4511
4512 if (inode->i_size & (blocksize - 1)) 4512 if (inode->i_size & (blocksize - 1))
4513 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4513 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4514 goto out_stop; 4514 goto out_stop;
4515 4515
4516 if (last_block != max_block) { 4516 if (last_block != max_block) {
4517 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4517 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4518 if (n == 0) 4518 if (n == 0)
4519 goto out_stop; /* error */ 4519 goto out_stop; /* error */
4520 } 4520 }
4521 4521
4522 /* 4522 /*
4523 * OK. This truncate is going to happen. We add the inode to the 4523 * OK. This truncate is going to happen. We add the inode to the
4524 * orphan list, so that if this truncate spans multiple transactions, 4524 * orphan list, so that if this truncate spans multiple transactions,
4525 * and we crash, we will resume the truncate when the filesystem 4525 * and we crash, we will resume the truncate when the filesystem
4526 * recovers. It also marks the inode dirty, to catch the new size. 4526 * recovers. It also marks the inode dirty, to catch the new size.
4527 * 4527 *
4528 * Implication: the file must always be in a sane, consistent 4528 * Implication: the file must always be in a sane, consistent
4529 * truncatable state while each transaction commits. 4529 * truncatable state while each transaction commits.
4530 */ 4530 */
4531 if (ext4_orphan_add(handle, inode)) 4531 if (ext4_orphan_add(handle, inode))
4532 goto out_stop; 4532 goto out_stop;
4533 4533
4534 /* 4534 /*
4535 * From here we block out all ext4_get_block() callers who want to 4535 * From here we block out all ext4_get_block() callers who want to
4536 * modify the block allocation tree. 4536 * modify the block allocation tree.
4537 */ 4537 */
4538 down_write(&ei->i_data_sem); 4538 down_write(&ei->i_data_sem);
4539 4539
4540 ext4_discard_preallocations(inode); 4540 ext4_discard_preallocations(inode);
4541 4541
4542 /* 4542 /*
4543 * The orphan list entry will now protect us from any crash which 4543 * The orphan list entry will now protect us from any crash which
4544 * occurs before the truncate completes, so it is now safe to propagate 4544 * occurs before the truncate completes, so it is now safe to propagate
4545 * the new, shorter inode size (held for now in i_size) into the 4545 * the new, shorter inode size (held for now in i_size) into the
4546 * on-disk inode. We do this via i_disksize, which is the value which 4546 * on-disk inode. We do this via i_disksize, which is the value which
4547 * ext4 *really* writes onto the disk inode. 4547 * ext4 *really* writes onto the disk inode.
4548 */ 4548 */
4549 ei->i_disksize = inode->i_size; 4549 ei->i_disksize = inode->i_size;
4550 4550
4551 if (last_block == max_block) { 4551 if (last_block == max_block) {
4552 /* 4552 /*
4553 * It is unnecessary to free any data blocks if last_block is 4553 * It is unnecessary to free any data blocks if last_block is
4554 * equal to the indirect block limit. 4554 * equal to the indirect block limit.
4555 */ 4555 */
4556 goto out_unlock; 4556 goto out_unlock;
4557 } else if (n == 1) { /* direct blocks */ 4557 } else if (n == 1) { /* direct blocks */
4558 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4558 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4559 i_data + EXT4_NDIR_BLOCKS); 4559 i_data + EXT4_NDIR_BLOCKS);
4560 goto do_indirects; 4560 goto do_indirects;
4561 } 4561 }
4562 4562
4563 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 4563 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4564 /* Kill the top of shared branch (not detached) */ 4564 /* Kill the top of shared branch (not detached) */
4565 if (nr) { 4565 if (nr) {
4566 if (partial == chain) { 4566 if (partial == chain) {
4567 /* Shared branch grows from the inode */ 4567 /* Shared branch grows from the inode */
4568 ext4_free_branches(handle, inode, NULL, 4568 ext4_free_branches(handle, inode, NULL,
4569 &nr, &nr+1, (chain+n-1) - partial); 4569 &nr, &nr+1, (chain+n-1) - partial);
4570 *partial->p = 0; 4570 *partial->p = 0;
4571 /* 4571 /*
4572 * We mark the inode dirty prior to restart, 4572 * We mark the inode dirty prior to restart,
4573 * and prior to stop. No need for it here. 4573 * and prior to stop. No need for it here.
4574 */ 4574 */
4575 } else { 4575 } else {
4576 /* Shared branch grows from an indirect block */ 4576 /* Shared branch grows from an indirect block */
4577 BUFFER_TRACE(partial->bh, "get_write_access"); 4577 BUFFER_TRACE(partial->bh, "get_write_access");
4578 ext4_free_branches(handle, inode, partial->bh, 4578 ext4_free_branches(handle, inode, partial->bh,
4579 partial->p, 4579 partial->p,
4580 partial->p+1, (chain+n-1) - partial); 4580 partial->p+1, (chain+n-1) - partial);
4581 } 4581 }
4582 } 4582 }
4583 /* Clear the ends of indirect blocks on the shared branch */ 4583 /* Clear the ends of indirect blocks on the shared branch */
4584 while (partial > chain) { 4584 while (partial > chain) {
4585 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 4585 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4586 (__le32*)partial->bh->b_data+addr_per_block, 4586 (__le32*)partial->bh->b_data+addr_per_block,
4587 (chain+n-1) - partial); 4587 (chain+n-1) - partial);
4588 BUFFER_TRACE(partial->bh, "call brelse"); 4588 BUFFER_TRACE(partial->bh, "call brelse");
4589 brelse(partial->bh); 4589 brelse(partial->bh);
4590 partial--; 4590 partial--;
4591 } 4591 }
4592 do_indirects: 4592 do_indirects:
4593 /* Kill the remaining (whole) subtrees */ 4593 /* Kill the remaining (whole) subtrees */
4594 switch (offsets[0]) { 4594 switch (offsets[0]) {
4595 default: 4595 default:
4596 nr = i_data[EXT4_IND_BLOCK]; 4596 nr = i_data[EXT4_IND_BLOCK];
4597 if (nr) { 4597 if (nr) {
4598 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 4598 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4599 i_data[EXT4_IND_BLOCK] = 0; 4599 i_data[EXT4_IND_BLOCK] = 0;
4600 } 4600 }
4601 case EXT4_IND_BLOCK: 4601 case EXT4_IND_BLOCK:
4602 nr = i_data[EXT4_DIND_BLOCK]; 4602 nr = i_data[EXT4_DIND_BLOCK];
4603 if (nr) { 4603 if (nr) {
4604 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 4604 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4605 i_data[EXT4_DIND_BLOCK] = 0; 4605 i_data[EXT4_DIND_BLOCK] = 0;
4606 } 4606 }
4607 case EXT4_DIND_BLOCK: 4607 case EXT4_DIND_BLOCK:
4608 nr = i_data[EXT4_TIND_BLOCK]; 4608 nr = i_data[EXT4_TIND_BLOCK];
4609 if (nr) { 4609 if (nr) {
4610 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 4610 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4611 i_data[EXT4_TIND_BLOCK] = 0; 4611 i_data[EXT4_TIND_BLOCK] = 0;
4612 } 4612 }
4613 case EXT4_TIND_BLOCK: 4613 case EXT4_TIND_BLOCK:
4614 ; 4614 ;
4615 } 4615 }
4616 4616
4617 out_unlock: 4617 out_unlock:
4618 up_write(&ei->i_data_sem); 4618 up_write(&ei->i_data_sem);
4619 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4619 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4620 ext4_mark_inode_dirty(handle, inode); 4620 ext4_mark_inode_dirty(handle, inode);
4621 4621
4622 /* 4622 /*
4623 * In a multi-transaction truncate, we only make the final transaction 4623 * In a multi-transaction truncate, we only make the final transaction
4624 * synchronous 4624 * synchronous
4625 */ 4625 */
4626 if (IS_SYNC(inode)) 4626 if (IS_SYNC(inode))
4627 ext4_handle_sync(handle); 4627 ext4_handle_sync(handle);
4628 out_stop: 4628 out_stop:
4629 /* 4629 /*
4630 * If this was a simple ftruncate(), and the file will remain alive 4630 * If this was a simple ftruncate(), and the file will remain alive
4631 * then we need to clear up the orphan record which we created above. 4631 * then we need to clear up the orphan record which we created above.
4632 * However, if this was a real unlink then we were called by 4632 * However, if this was a real unlink then we were called by
4633 * ext4_delete_inode(), and we allow that function to clean up the 4633 * ext4_delete_inode(), and we allow that function to clean up the
4634 * orphan info for us. 4634 * orphan info for us.
4635 */ 4635 */
4636 if (inode->i_nlink) 4636 if (inode->i_nlink)
4637 ext4_orphan_del(handle, inode); 4637 ext4_orphan_del(handle, inode);
4638 4638
4639 ext4_journal_stop(handle); 4639 ext4_journal_stop(handle);
4640 trace_ext4_truncate_exit(inode); 4640 trace_ext4_truncate_exit(inode);
4641 } 4641 }
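
The offsets[4]/n pair computed by ext4_block_to_path() decides which branches get freed above; a simplified re-derivation of the depth boundaries (assuming 4 KiB blocks, so 1024 addresses per indirect block; this is an illustration, not the kernel helper itself):

    #include <stdio.h>

    #define NDIR 12          /* EXT4_NDIR_BLOCKS */
    #define APB  1024        /* addresses per 4 KiB indirect block */

    /* Return tree depth for a logical block, mirroring ext4_block_to_path(). */
    static int depth_for_block(unsigned long b)
    {
            if (b < NDIR)
                    return 1;                               /* direct */
            if ((b -= NDIR) < APB)
                    return 2;                               /* indirect */
            if ((b -= APB) < (unsigned long)APB * APB)
                    return 3;                               /* double indirect */
            return 4;                                       /* triple indirect */
    }

    int main(void)
    {
            printf("%d %d %d %d\n",
                   depth_for_block(11),                     /* 1 */
                   depth_for_block(12),                     /* 2 */
                   depth_for_block(12 + 1024),              /* 3 */
                   depth_for_block(12 + 1024 + 1024UL * 1024)); /* 4 */
            return 0;
    }
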
4642 4642
4643 /* 4643 /*
4644 * ext4_get_inode_loc returns with an extra refcount against the inode's 4644 * ext4_get_inode_loc returns with an extra refcount against the inode's
4645 * underlying buffer_head on success. If 'in_mem' is true, we have all 4645 * underlying buffer_head on success. If 'in_mem' is true, we have all
4646 * data in memory that is needed to recreate the on-disk version of this 4646 * data in memory that is needed to recreate the on-disk version of this
4647 * inode. 4647 * inode.
4648 */ 4648 */
4649 static int __ext4_get_inode_loc(struct inode *inode, 4649 static int __ext4_get_inode_loc(struct inode *inode,
4650 struct ext4_iloc *iloc, int in_mem) 4650 struct ext4_iloc *iloc, int in_mem)
4651 { 4651 {
4652 struct ext4_group_desc *gdp; 4652 struct ext4_group_desc *gdp;
4653 struct buffer_head *bh; 4653 struct buffer_head *bh;
4654 struct super_block *sb = inode->i_sb; 4654 struct super_block *sb = inode->i_sb;
4655 ext4_fsblk_t block; 4655 ext4_fsblk_t block;
4656 int inodes_per_block, inode_offset; 4656 int inodes_per_block, inode_offset;
4657 4657
4658 iloc->bh = NULL; 4658 iloc->bh = NULL;
4659 if (!ext4_valid_inum(sb, inode->i_ino)) 4659 if (!ext4_valid_inum(sb, inode->i_ino))
4660 return -EIO; 4660 return -EIO;
4661 4661
4662 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 4662 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
4663 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4663 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4664 if (!gdp) 4664 if (!gdp)
4665 return -EIO; 4665 return -EIO;
4666 4666
4667 /* 4667 /*
4668 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4669 */ 4669 */
4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4671 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4672 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
4674 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4674 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4675 4675
4676 bh = sb_getblk(sb, block); 4676 bh = sb_getblk(sb, block);
4677 if (!bh) { 4677 if (!bh) {
4678 EXT4_ERROR_INODE_BLOCK(inode, block, 4678 EXT4_ERROR_INODE_BLOCK(inode, block,
4679 "unable to read itable block"); 4679 "unable to read itable block");
4680 return -EIO; 4680 return -EIO;
4681 } 4681 }
4682 if (!buffer_uptodate(bh)) { 4682 if (!buffer_uptodate(bh)) {
4683 lock_buffer(bh); 4683 lock_buffer(bh);
4684 4684
4685 /* 4685 /*
4686 * If the buffer has the write error flag, we have failed 4686 * If the buffer has the write error flag, we have failed
4687 * to write out another inode in the same block. In this 4687 * to write out another inode in the same block. In this
4688 * case, we don't have to read the block because we may 4688 * case, we don't have to read the block because we may
4689 * read the old inode data successfully. 4689 * read the old inode data successfully.
4690 */ 4690 */
4691 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4691 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4692 set_buffer_uptodate(bh); 4692 set_buffer_uptodate(bh);
4693 4693
4694 if (buffer_uptodate(bh)) { 4694 if (buffer_uptodate(bh)) {
4695 /* someone brought it uptodate while we waited */ 4695 /* someone brought it uptodate while we waited */
4696 unlock_buffer(bh); 4696 unlock_buffer(bh);
4697 goto has_buffer; 4697 goto has_buffer;
4698 } 4698 }
4699 4699
4700 /* 4700 /*
4701 * If we have all information of the inode in memory and this 4701 * If we have all information of the inode in memory and this
4702 * is the only valid inode in the block, we need not read the 4702 * is the only valid inode in the block, we need not read the
4703 * block. 4703 * block.
4704 */ 4704 */
4705 if (in_mem) { 4705 if (in_mem) {
4706 struct buffer_head *bitmap_bh; 4706 struct buffer_head *bitmap_bh;
4707 int i, start; 4707 int i, start;
4708 4708
4709 start = inode_offset & ~(inodes_per_block - 1); 4709 start = inode_offset & ~(inodes_per_block - 1);
4710 4710
4711 /* Is the inode bitmap in cache? */ 4711 /* Is the inode bitmap in cache? */
4712 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4712 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4713 if (!bitmap_bh) 4713 if (!bitmap_bh)
4714 goto make_io; 4714 goto make_io;
4715 4715
4716 /* 4716 /*
4717 * If the inode bitmap isn't in cache then the 4717 * If the inode bitmap isn't in cache then the
4718 * optimisation may end up performing two reads instead 4718 * optimisation may end up performing two reads instead
4719 * of one, so skip it. 4719 * of one, so skip it.
4720 */ 4720 */
4721 if (!buffer_uptodate(bitmap_bh)) { 4721 if (!buffer_uptodate(bitmap_bh)) {
4722 brelse(bitmap_bh); 4722 brelse(bitmap_bh);
4723 goto make_io; 4723 goto make_io;
4724 } 4724 }
4725 for (i = start; i < start + inodes_per_block; i++) { 4725 for (i = start; i < start + inodes_per_block; i++) {
4726 if (i == inode_offset) 4726 if (i == inode_offset)
4727 continue; 4727 continue;
4728 if (ext4_test_bit(i, bitmap_bh->b_data)) 4728 if (ext4_test_bit(i, bitmap_bh->b_data))
4729 break; 4729 break;
4730 } 4730 }
4731 brelse(bitmap_bh); 4731 brelse(bitmap_bh);
4732 if (i == start + inodes_per_block) { 4732 if (i == start + inodes_per_block) {
4733 /* all other inodes are free, so skip I/O */ 4733 /* all other inodes are free, so skip I/O */
4734 memset(bh->b_data, 0, bh->b_size); 4734 memset(bh->b_data, 0, bh->b_size);
4735 set_buffer_uptodate(bh); 4735 set_buffer_uptodate(bh);
4736 unlock_buffer(bh); 4736 unlock_buffer(bh);
4737 goto has_buffer; 4737 goto has_buffer;
4738 } 4738 }
4739 } 4739 }
4740 4740
4741 make_io: 4741 make_io:
4742 /* 4742 /*
4743 * If we need to do any I/O, try to pre-readahead extra 4743 * If we need to do any I/O, try to pre-readahead extra
4744 * blocks from the inode table. 4744 * blocks from the inode table.
4745 */ 4745 */
4746 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4746 if (EXT4_SB(sb)->s_inode_readahead_blks) {
4747 ext4_fsblk_t b, end, table; 4747 ext4_fsblk_t b, end, table;
4748 unsigned num; 4748 unsigned num;
4749 4749
4750 table = ext4_inode_table(sb, gdp); 4750 table = ext4_inode_table(sb, gdp);
4751 /* s_inode_readahead_blks is always a power of 2 */ 4751 /* s_inode_readahead_blks is always a power of 2 */
4752 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4752 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4753 if (table > b) 4753 if (table > b)
4754 b = table; 4754 b = table;
4755 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4755 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
4756 num = EXT4_INODES_PER_GROUP(sb); 4756 num = EXT4_INODES_PER_GROUP(sb);
4757 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4757 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4758 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4758 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
4759 num -= ext4_itable_unused_count(sb, gdp); 4759 num -= ext4_itable_unused_count(sb, gdp);
4760 table += num / inodes_per_block; 4760 table += num / inodes_per_block;
4761 if (end > table) 4761 if (end > table)
4762 end = table; 4762 end = table;
4763 while (b <= end) 4763 while (b <= end)
4764 sb_breadahead(sb, b++); 4764 sb_breadahead(sb, b++);
4765 } 4765 }
4766 4766
4767 /* 4767 /*
4768 * There are other valid inodes in the buffer, this inode 4768 * There are other valid inodes in the buffer, this inode
4769 * has in-inode xattrs, or we don't have this inode in memory. 4769 * has in-inode xattrs, or we don't have this inode in memory.
4770 * Read the block from disk. 4770 * Read the block from disk.
4771 */ 4771 */
4772 trace_ext4_load_inode(inode); 4772 trace_ext4_load_inode(inode);
4773 get_bh(bh); 4773 get_bh(bh);
4774 bh->b_end_io = end_buffer_read_sync; 4774 bh->b_end_io = end_buffer_read_sync;
4775 submit_bh(READ_META, bh); 4775 submit_bh(READ_META, bh);
4776 wait_on_buffer(bh); 4776 wait_on_buffer(bh);
4777 if (!buffer_uptodate(bh)) { 4777 if (!buffer_uptodate(bh)) {
4778 EXT4_ERROR_INODE_BLOCK(inode, block, 4778 EXT4_ERROR_INODE_BLOCK(inode, block,
4779 "unable to read itable block"); 4779 "unable to read itable block");
4780 brelse(bh); 4780 brelse(bh);
4781 return -EIO; 4781 return -EIO;
4782 } 4782 }
4783 } 4783 }
4784 has_buffer: 4784 has_buffer:
4785 iloc->bh = bh; 4785 iloc->bh = bh;
4786 return 0; 4786 return 0;
4787 } 4787 }
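
The inode-location arithmetic above reduces to a few divisions and modulos; a stand-alone sketch (the group geometry and inode-table start are assumed values, not read from a real superblock):

    #include <stdio.h>

    int main(void)
    {
            unsigned long ino = 12345;              /* assumed inode number */
            unsigned inodes_per_group = 8192;       /* assumed geometry */
            unsigned inodes_per_block = 16;         /* 4096 / 256-byte inodes */
            unsigned inode_size = 256;
            unsigned long long itable_start = 1058; /* assumed table block */

            unsigned long group = (ino - 1) / inodes_per_group;
            unsigned offset     = (ino - 1) % inodes_per_group;
            unsigned long long block = itable_start + offset / inodes_per_block;
            unsigned byte_off   = (offset % inodes_per_block) * inode_size;

            printf("group %lu, block %llu, offset %u\n", group, block, byte_off);
            return 0;
    }
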
4788 4788
4789 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4789 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4790 { 4790 {
4791 /* We have all inode data except xattrs in memory here. */ 4791 /* We have all inode data except xattrs in memory here. */
4792 return __ext4_get_inode_loc(inode, iloc, 4792 return __ext4_get_inode_loc(inode, iloc,
4793 !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); 4793 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4794 } 4794 }
4795 4795
4796 void ext4_set_inode_flags(struct inode *inode) 4796 void ext4_set_inode_flags(struct inode *inode)
4797 { 4797 {
4798 unsigned int flags = EXT4_I(inode)->i_flags; 4798 unsigned int flags = EXT4_I(inode)->i_flags;
4799 4799
4800 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4800 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
4801 if (flags & EXT4_SYNC_FL) 4801 if (flags & EXT4_SYNC_FL)
4802 inode->i_flags |= S_SYNC; 4802 inode->i_flags |= S_SYNC;
4803 if (flags & EXT4_APPEND_FL) 4803 if (flags & EXT4_APPEND_FL)
4804 inode->i_flags |= S_APPEND; 4804 inode->i_flags |= S_APPEND;
4805 if (flags & EXT4_IMMUTABLE_FL) 4805 if (flags & EXT4_IMMUTABLE_FL)
4806 inode->i_flags |= S_IMMUTABLE; 4806 inode->i_flags |= S_IMMUTABLE;
4807 if (flags & EXT4_NOATIME_FL) 4807 if (flags & EXT4_NOATIME_FL)
4808 inode->i_flags |= S_NOATIME; 4808 inode->i_flags |= S_NOATIME;
4809 if (flags & EXT4_DIRSYNC_FL) 4809 if (flags & EXT4_DIRSYNC_FL)
4810 inode->i_flags |= S_DIRSYNC; 4810 inode->i_flags |= S_DIRSYNC;
4811 } 4811 }
4812 4812
4813 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4813 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4814 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4814 void ext4_get_inode_flags(struct ext4_inode_info *ei)
4815 { 4815 {
4816 unsigned int vfs_fl; 4816 unsigned int vfs_fl;
4817 unsigned long old_fl, new_fl; 4817 unsigned long old_fl, new_fl;
4818 4818
4819 do { 4819 do {
4820 vfs_fl = ei->vfs_inode.i_flags; 4820 vfs_fl = ei->vfs_inode.i_flags;
4821 old_fl = ei->i_flags; 4821 old_fl = ei->i_flags;
4822 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4822 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4823 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| 4823 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4824 EXT4_DIRSYNC_FL); 4824 EXT4_DIRSYNC_FL);
4825 if (vfs_fl & S_SYNC) 4825 if (vfs_fl & S_SYNC)
4826 new_fl |= EXT4_SYNC_FL; 4826 new_fl |= EXT4_SYNC_FL;
4827 if (vfs_fl & S_APPEND) 4827 if (vfs_fl & S_APPEND)
4828 new_fl |= EXT4_APPEND_FL; 4828 new_fl |= EXT4_APPEND_FL;
4829 if (vfs_fl & S_IMMUTABLE) 4829 if (vfs_fl & S_IMMUTABLE)
4830 new_fl |= EXT4_IMMUTABLE_FL; 4830 new_fl |= EXT4_IMMUTABLE_FL;
4831 if (vfs_fl & S_NOATIME) 4831 if (vfs_fl & S_NOATIME)
4832 new_fl |= EXT4_NOATIME_FL; 4832 new_fl |= EXT4_NOATIME_FL;
4833 if (vfs_fl & S_DIRSYNC) 4833 if (vfs_fl & S_DIRSYNC)
4834 new_fl |= EXT4_DIRSYNC_FL; 4834 new_fl |= EXT4_DIRSYNC_FL;
4835 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); 4835 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4836 } 4836 }
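
The do/while-cmpxchg loop above is the standard lock-free read-modify-write pattern; a userspace rendering with C11 atomics behaves the same way (the field name is illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long i_flags;

    /* Set bits in i_flags without taking a lock; retry if we raced. */
    static void set_flag_bits(unsigned long bits)
    {
            unsigned long old_fl, new_fl;

            old_fl = atomic_load(&i_flags);
            do {
                    new_fl = old_fl | bits;
                    /* On failure, old_fl is reloaded with the current value. */
            } while (!atomic_compare_exchange_weak(&i_flags, &old_fl, new_fl));
    }

    int main(void)
    {
            set_flag_bits(0x8);
            printf("%lx\n", (unsigned long)atomic_load(&i_flags));
            return 0;
    }
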
4837 4837
4838 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4838 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4839 struct ext4_inode_info *ei) 4839 struct ext4_inode_info *ei)
4840 { 4840 {
4841 blkcnt_t i_blocks ; 4841 blkcnt_t i_blocks ;
4842 struct inode *inode = &(ei->vfs_inode); 4842 struct inode *inode = &(ei->vfs_inode);
4843 struct super_block *sb = inode->i_sb; 4843 struct super_block *sb = inode->i_sb;
4844 4844
4845 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4845 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4846 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4846 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4847 /* we are using combined 48 bit field */ 4847 /* we are using combined 48 bit field */
4848 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4848 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4849 le32_to_cpu(raw_inode->i_blocks_lo); 4849 le32_to_cpu(raw_inode->i_blocks_lo);
4850 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { 4850 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4851 /* i_blocks is in units of the file system block size */ 4851 /* i_blocks is in units of the file system block size */
4852 return i_blocks << (inode->i_blkbits - 9); 4852 return i_blocks << (inode->i_blkbits - 9);
4853 } else { 4853 } else {
4854 return i_blocks; 4854 return i_blocks;
4855 } 4855 }
4856 } else { 4856 } else {
4857 return le32_to_cpu(raw_inode->i_blocks_lo); 4857 return le32_to_cpu(raw_inode->i_blocks_lo);
4858 } 4858 }
4859 } 4859 }
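
To make the 48-bit combination concrete, here is a small decode sketch (the raw field values are made up): with the huge_file feature and the HUGE_FILE inode flag set, the count is in fs-block units and gets scaled back to 512-byte sectors.

    #include <stdio.h>

    int main(void)
    {
            unsigned short i_blocks_high = 0x0001;   /* assumed raw fields */
            unsigned int   i_blocks_lo   = 0x00000000;
            int huge_file_flag = 1;                  /* EXT4_INODE_HUGE_FILE set */
            int blkbits = 12;                        /* 4 KiB blocks */

            unsigned long long blocks =
                    ((unsigned long long)i_blocks_high << 32) | i_blocks_lo;

            if (huge_file_flag)
                    blocks <<= (blkbits - 9);        /* fs blocks -> 512B sectors */

            /* 2^32 fs blocks of 4 KiB = 2^35 sectors = 16 TiB of data */
            printf("%llu sectors (%llu GiB)\n", blocks, blocks >> 21);
            return 0;
    }
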
4860 4860
4861 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4861 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4862 { 4862 {
4863 struct ext4_iloc iloc; 4863 struct ext4_iloc iloc;
4864 struct ext4_inode *raw_inode; 4864 struct ext4_inode *raw_inode;
4865 struct ext4_inode_info *ei; 4865 struct ext4_inode_info *ei;
4866 struct inode *inode; 4866 struct inode *inode;
4867 journal_t *journal = EXT4_SB(sb)->s_journal; 4867 journal_t *journal = EXT4_SB(sb)->s_journal;
4868 long ret; 4868 long ret;
4869 int block; 4869 int block;
4870 4870
4871 inode = iget_locked(sb, ino); 4871 inode = iget_locked(sb, ino);
4872 if (!inode) 4872 if (!inode)
4873 return ERR_PTR(-ENOMEM); 4873 return ERR_PTR(-ENOMEM);
4874 if (!(inode->i_state & I_NEW)) 4874 if (!(inode->i_state & I_NEW))
4875 return inode; 4875 return inode;
4876 4876
4877 ei = EXT4_I(inode); 4877 ei = EXT4_I(inode);
4878 iloc.bh = NULL; 4878 iloc.bh = NULL;
4879 4879
4880 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4880 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4881 if (ret < 0) 4881 if (ret < 0)
4882 goto bad_inode; 4882 goto bad_inode;
4883 raw_inode = ext4_raw_inode(&iloc); 4883 raw_inode = ext4_raw_inode(&iloc);
4884 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4884 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4885 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4885 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4886 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4886 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4887 if (!(test_opt(inode->i_sb, NO_UID32))) { 4887 if (!(test_opt(inode->i_sb, NO_UID32))) {
4888 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4888 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4889 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4889 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4890 } 4890 }
4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4892 4892
4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
4894 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4896 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
4897 * This is needed because nfsd might try to access dead inodes 4897 * This is needed because nfsd might try to access dead inodes
4898 * the test is the same one that e2fsck uses 4898 * the test is the same one that e2fsck uses
4899 * NeilBrown 1999oct15 4899 * NeilBrown 1999oct15
4900 */ 4900 */
4901 if (inode->i_nlink == 0) { 4901 if (inode->i_nlink == 0) {
4902 if (inode->i_mode == 0 || 4902 if (inode->i_mode == 0 ||
4903 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4903 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4904 /* this inode is deleted */ 4904 /* this inode is deleted */
4905 ret = -ESTALE; 4905 ret = -ESTALE;
4906 goto bad_inode; 4906 goto bad_inode;
4907 } 4907 }
4908 /* The only unlinked inodes we let through here have 4908 /* The only unlinked inodes we let through here have
4909 * valid i_mode and are being read by the orphan 4909 * valid i_mode and are being read by the orphan
4910 * recovery code: that's fine, we're about to complete 4910 * recovery code: that's fine, we're about to complete
4911 * the process of deleting those. */ 4911 * the process of deleting those. */
4912 } 4912 }
4913 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4913 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4914 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4914 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4915 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4915 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4916 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) 4916 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4917 ei->i_file_acl |= 4917 ei->i_file_acl |=
4918 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4918 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4919 inode->i_size = ext4_isize(raw_inode); 4919 inode->i_size = ext4_isize(raw_inode);
4920 ei->i_disksize = inode->i_size; 4920 ei->i_disksize = inode->i_size;
4921 #ifdef CONFIG_QUOTA 4921 #ifdef CONFIG_QUOTA
4922 ei->i_reserved_quota = 0; 4922 ei->i_reserved_quota = 0;
4923 #endif 4923 #endif
4924 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4924 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4925 ei->i_block_group = iloc.block_group; 4925 ei->i_block_group = iloc.block_group;
4926 ei->i_last_alloc_group = ~0; 4926 ei->i_last_alloc_group = ~0;
4927 /* 4927 /*
4928 * NOTE! The in-memory inode i_data array is in little-endian order 4928 * NOTE! The in-memory inode i_data array is in little-endian order
4929 * even on big-endian machines: we do NOT byteswap the block numbers! 4929 * even on big-endian machines: we do NOT byteswap the block numbers!
4930 */ 4930 */
4931 for (block = 0; block < EXT4_N_BLOCKS; block++) 4931 for (block = 0; block < EXT4_N_BLOCKS; block++)
4932 ei->i_data[block] = raw_inode->i_block[block]; 4932 ei->i_data[block] = raw_inode->i_block[block];
4933 INIT_LIST_HEAD(&ei->i_orphan); 4933 INIT_LIST_HEAD(&ei->i_orphan);
4934 4934
4935 /* 4935 /*
4936 * Set transaction id's of transactions that have to be committed 4936 * Set transaction id's of transactions that have to be committed
4937 * to finish f[data]sync. We set them to currently running transaction 4937 * to finish f[data]sync. We set them to currently running transaction
4938 * as we cannot be sure that the inode or some of its metadata isn't 4938 * as we cannot be sure that the inode or some of its metadata isn't
4939 * part of the transaction - the inode could have been reclaimed and 4939 * part of the transaction - the inode could have been reclaimed and
4940 * now it is reread from disk. 4940 * now it is reread from disk.
4941 */ 4941 */
4942 if (journal) { 4942 if (journal) {
4943 transaction_t *transaction; 4943 transaction_t *transaction;
4944 tid_t tid; 4944 tid_t tid;
4945 4945
4946 read_lock(&journal->j_state_lock); 4946 read_lock(&journal->j_state_lock);
4947 if (journal->j_running_transaction) 4947 if (journal->j_running_transaction)
4948 transaction = journal->j_running_transaction; 4948 transaction = journal->j_running_transaction;
4949 else 4949 else
4950 transaction = journal->j_committing_transaction; 4950 transaction = journal->j_committing_transaction;
4951 if (transaction) 4951 if (transaction)
4952 tid = transaction->t_tid; 4952 tid = transaction->t_tid;
4953 else 4953 else
4954 tid = journal->j_commit_sequence; 4954 tid = journal->j_commit_sequence;
4955 read_unlock(&journal->j_state_lock); 4955 read_unlock(&journal->j_state_lock);
4956 ei->i_sync_tid = tid; 4956 ei->i_sync_tid = tid;
4957 ei->i_datasync_tid = tid; 4957 ei->i_datasync_tid = tid;
4958 } 4958 }
4959 4959
4960 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4960 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4961 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4961 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4962 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4962 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4963 EXT4_INODE_SIZE(inode->i_sb)) { 4963 EXT4_INODE_SIZE(inode->i_sb)) {
4964 ret = -EIO; 4964 ret = -EIO;
4965 goto bad_inode; 4965 goto bad_inode;
4966 } 4966 }
4967 if (ei->i_extra_isize == 0) { 4967 if (ei->i_extra_isize == 0) {
4968 /* The extra space is currently unused. Use it. */ 4968 /* The extra space is currently unused. Use it. */
4969 ei->i_extra_isize = sizeof(struct ext4_inode) - 4969 ei->i_extra_isize = sizeof(struct ext4_inode) -
4970 EXT4_GOOD_OLD_INODE_SIZE; 4970 EXT4_GOOD_OLD_INODE_SIZE;
4971 } else { 4971 } else {
4972 __le32 *magic = (void *)raw_inode + 4972 __le32 *magic = (void *)raw_inode +
4973 EXT4_GOOD_OLD_INODE_SIZE + 4973 EXT4_GOOD_OLD_INODE_SIZE +
4974 ei->i_extra_isize; 4974 ei->i_extra_isize;
4975 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4975 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4976 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4976 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4977 } 4977 }
4978 } else 4978 } else
4979 ei->i_extra_isize = 0; 4979 ei->i_extra_isize = 0;
4980 4980
4981 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4981 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4982 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4982 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4983 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4983 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4984 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4984 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4985 4985
4986 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4986 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4987 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4987 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4988 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4988 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4989 inode->i_version |= 4989 inode->i_version |=
4990 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4990 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4991 } 4991 }
4992 4992
4993 ret = 0; 4993 ret = 0;
4994 if (ei->i_file_acl && 4994 if (ei->i_file_acl &&
4995 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 4995 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4996 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", 4996 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
4997 ei->i_file_acl); 4997 ei->i_file_acl);
4998 ret = -EIO; 4998 ret = -EIO;
4999 goto bad_inode; 4999 goto bad_inode;
5000 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5000 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5001 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5001 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5002 (S_ISLNK(inode->i_mode) && 5002 (S_ISLNK(inode->i_mode) &&
5003 !ext4_inode_is_fast_symlink(inode))) 5003 !ext4_inode_is_fast_symlink(inode)))
5004 /* Validate extent which is part of inode */ 5004 /* Validate extent which is part of inode */
5005 ret = ext4_ext_check_inode(inode); 5005 ret = ext4_ext_check_inode(inode);
5006 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5006 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5007 (S_ISLNK(inode->i_mode) && 5007 (S_ISLNK(inode->i_mode) &&
5008 !ext4_inode_is_fast_symlink(inode))) { 5008 !ext4_inode_is_fast_symlink(inode))) {
5009 /* Validate block references which are part of inode */ 5009 /* Validate block references which are part of inode */
5010 ret = ext4_check_inode_blockref(inode); 5010 ret = ext4_check_inode_blockref(inode);
5011 } 5011 }
5012 if (ret) 5012 if (ret)
5013 goto bad_inode; 5013 goto bad_inode;
5014 5014
5015 if (S_ISREG(inode->i_mode)) { 5015 if (S_ISREG(inode->i_mode)) {
5016 inode->i_op = &ext4_file_inode_operations; 5016 inode->i_op = &ext4_file_inode_operations;
5017 inode->i_fop = &ext4_file_operations; 5017 inode->i_fop = &ext4_file_operations;
5018 ext4_set_aops(inode); 5018 ext4_set_aops(inode);
5019 } else if (S_ISDIR(inode->i_mode)) { 5019 } else if (S_ISDIR(inode->i_mode)) {
5020 inode->i_op = &ext4_dir_inode_operations; 5020 inode->i_op = &ext4_dir_inode_operations;
5021 inode->i_fop = &ext4_dir_operations; 5021 inode->i_fop = &ext4_dir_operations;
5022 } else if (S_ISLNK(inode->i_mode)) { 5022 } else if (S_ISLNK(inode->i_mode)) {
5023 if (ext4_inode_is_fast_symlink(inode)) { 5023 if (ext4_inode_is_fast_symlink(inode)) {
5024 inode->i_op = &ext4_fast_symlink_inode_operations; 5024 inode->i_op = &ext4_fast_symlink_inode_operations;
5025 nd_terminate_link(ei->i_data, inode->i_size, 5025 nd_terminate_link(ei->i_data, inode->i_size,
5026 sizeof(ei->i_data) - 1); 5026 sizeof(ei->i_data) - 1);
5027 } else { 5027 } else {
5028 inode->i_op = &ext4_symlink_inode_operations; 5028 inode->i_op = &ext4_symlink_inode_operations;
5029 ext4_set_aops(inode); 5029 ext4_set_aops(inode);
5030 } 5030 }
5031 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 5031 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
5032 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 5032 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
5033 inode->i_op = &ext4_special_inode_operations; 5033 inode->i_op = &ext4_special_inode_operations;
5034 if (raw_inode->i_block[0]) 5034 if (raw_inode->i_block[0])
5035 init_special_inode(inode, inode->i_mode, 5035 init_special_inode(inode, inode->i_mode,
5036 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 5036 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
5037 else 5037 else
5038 init_special_inode(inode, inode->i_mode, 5038 init_special_inode(inode, inode->i_mode,
5039 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5039 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5040 } else { 5040 } else {
5041 ret = -EIO; 5041 ret = -EIO;
5042 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); 5042 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5043 goto bad_inode; 5043 goto bad_inode;
5044 } 5044 }
5045 brelse(iloc.bh); 5045 brelse(iloc.bh);
5046 ext4_set_inode_flags(inode); 5046 ext4_set_inode_flags(inode);
5047 unlock_new_inode(inode); 5047 unlock_new_inode(inode);
5048 return inode; 5048 return inode;
5049 5049
5050 bad_inode: 5050 bad_inode:
5051 brelse(iloc.bh); 5051 brelse(iloc.bh);
5052 iget_failed(inode); 5052 iget_failed(inode);
5053 return ERR_PTR(ret); 5053 return ERR_PTR(ret);
5054 } 5054 }
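
The iget_locked()/unlock_new_inode()/iget_failed() dance above is the generic VFS pattern for a filesystem's iget routine; stripped to its skeleton it reads as below (myfs_fill_inode is a hypothetical helper standing in for the on-disk read):

    #include <linux/err.h>
    #include <linux/fs.h>

    /* Hypothetical helper: read on-disk data into a locked, new inode. */
    extern int myfs_fill_inode(struct inode *inode);

    struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
    {
            struct inode *inode = iget_locked(sb, ino);
            int err;

            if (!inode)
                    return ERR_PTR(-ENOMEM);
            if (!(inode->i_state & I_NEW))
                    return inode;            /* cache hit: already initialised */

            err = myfs_fill_inode(inode);
            if (err) {
                    iget_failed(inode);      /* unlocks and drops the bad inode */
                    return ERR_PTR(err);
            }
            unlock_new_inode(inode);         /* publish the now-valid inode */
            return inode;
    }
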
5055 5055
5056 static int ext4_inode_blocks_set(handle_t *handle, 5056 static int ext4_inode_blocks_set(handle_t *handle,
5057 struct ext4_inode *raw_inode, 5057 struct ext4_inode *raw_inode,
5058 struct ext4_inode_info *ei) 5058 struct ext4_inode_info *ei)
5059 { 5059 {
5060 struct inode *inode = &(ei->vfs_inode); 5060 struct inode *inode = &(ei->vfs_inode);
5061 u64 i_blocks = inode->i_blocks; 5061 u64 i_blocks = inode->i_blocks;
5062 struct super_block *sb = inode->i_sb; 5062 struct super_block *sb = inode->i_sb;
5063 5063
5064 if (i_blocks <= ~0U) { 5064 if (i_blocks <= ~0U) {
5065 /* 5065 /*
5066 * i_blocks can be represented in a 32 bit variable 5066 * i_blocks can be represented in a 32 bit variable
5067 * as a multiple of 512 bytes 5067 * as a multiple of 512 bytes
5068 */ 5068 */
5069 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5069 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5070 raw_inode->i_blocks_high = 0; 5070 raw_inode->i_blocks_high = 0;
5071 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5071 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5072 return 0; 5072 return 0;
5073 } 5073 }
5074 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5074 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
5075 return -EFBIG; 5075 return -EFBIG;
5076 5076
5077 if (i_blocks <= 0xffffffffffffULL) { 5077 if (i_blocks <= 0xffffffffffffULL) {
5078 /* 5078 /*
5079 * i_blocks can be represented in a 48 bit variable 5079 * i_blocks can be represented in a 48 bit variable
5080 * as a multiple of 512 bytes 5080 * as a multiple of 512 bytes
5081 */ 5081 */
5082 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5082 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5083 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5083 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5084 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5084 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5085 } else { 5085 } else {
5086 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5086 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5087 /* i_blocks is stored in units of the file system block size */ 5087 /* i_blocks is stored in units of the file system block size */
5088 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5088 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5089 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5089 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5090 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5090 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5091 } 5091 }
5092 return 0; 5092 return 0;
5093 } 5093 }
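
The three encodings above cap out at very different file sizes; assuming 4 KiB blocks, the limits work out as follows (pure arithmetic, printed by this sketch):

    #include <stdio.h>

    int main(void)
    {
            /* 32-bit count of 512B sectors: 2^32 * 2^9 = 2 TiB */
            unsigned long long cap32 = (1ULL << 32) << 9;
            /* 48-bit count of 512B sectors: 2^48 * 2^9 = 128 PiB */
            unsigned long long cap48 = (1ULL << 48) << 9;

            printf("32-bit sectors: %llu TiB\n", cap32 >> 40);
            printf("48-bit sectors: %llu PiB\n", cap48 >> 50);
            /* 48-bit count of 4 KiB blocks: 2^48 * 2^12 = 2^60 = 1 EiB */
            printf("48-bit fs blocks: 1 EiB\n");
            return 0;
    }
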
5094 5094
5095 /* 5095 /*
5096 * Post the struct inode info into an on-disk inode location in the 5096 * Post the struct inode info into an on-disk inode location in the
5097 * buffer-cache. This gobbles the caller's reference to the 5097 * buffer-cache. This gobbles the caller's reference to the
5098 * buffer_head in the inode location struct. 5098 * buffer_head in the inode location struct.
5099 * 5099 *
5100 * The caller must have write access to iloc->bh. 5100 * The caller must have write access to iloc->bh.
5101 */ 5101 */
5102 static int ext4_do_update_inode(handle_t *handle, 5102 static int ext4_do_update_inode(handle_t *handle,
5103 struct inode *inode, 5103 struct inode *inode,
5104 struct ext4_iloc *iloc) 5104 struct ext4_iloc *iloc)
5105 { 5105 {
5106 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5106 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
5107 struct ext4_inode_info *ei = EXT4_I(inode); 5107 struct ext4_inode_info *ei = EXT4_I(inode);
5108 struct buffer_head *bh = iloc->bh; 5108 struct buffer_head *bh = iloc->bh;
5109 int err = 0, rc, block; 5109 int err = 0, rc, block;
5110 5110
5111 /* For fields not tracked in the in-memory inode, 5111 /* For fields not tracked in the in-memory inode,
5112 * initialise them to zero for new inodes. */ 5112 * initialise them to zero for new inodes. */
5113 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) 5113 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5114 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5114 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5115 5115
5116 ext4_get_inode_flags(ei); 5116 ext4_get_inode_flags(ei);
5117 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 5117 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
5118 if (!(test_opt(inode->i_sb, NO_UID32))) { 5118 if (!(test_opt(inode->i_sb, NO_UID32))) {
5119 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 5119 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
5120 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 5120 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
5121 /* 5121 /*
5122 * Fix up interoperability with old kernels. Otherwise, old inodes get 5122 * Fix up interoperability with old kernels. Otherwise, old inodes get
5123 * re-used with the upper 16 bits of the uid/gid intact 5123 * re-used with the upper 16 bits of the uid/gid intact
5124 */ 5124 */
5125 if (!ei->i_dtime) { 5125 if (!ei->i_dtime) {
5126 raw_inode->i_uid_high = 5126 raw_inode->i_uid_high =
5127 cpu_to_le16(high_16_bits(inode->i_uid)); 5127 cpu_to_le16(high_16_bits(inode->i_uid));
5128 raw_inode->i_gid_high = 5128 raw_inode->i_gid_high =
5129 cpu_to_le16(high_16_bits(inode->i_gid)); 5129 cpu_to_le16(high_16_bits(inode->i_gid));
5130 } else { 5130 } else {
5131 raw_inode->i_uid_high = 0; 5131 raw_inode->i_uid_high = 0;
5132 raw_inode->i_gid_high = 0; 5132 raw_inode->i_gid_high = 0;
5133 } 5133 }
5134 } else { 5134 } else {
5135 raw_inode->i_uid_low = 5135 raw_inode->i_uid_low =
5136 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 5136 cpu_to_le16(fs_high2lowuid(inode->i_uid));
5137 raw_inode->i_gid_low = 5137 raw_inode->i_gid_low =
5138 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 5138 cpu_to_le16(fs_high2lowgid(inode->i_gid));
5139 raw_inode->i_uid_high = 0; 5139 raw_inode->i_uid_high = 0;
5140 raw_inode->i_gid_high = 0; 5140 raw_inode->i_gid_high = 0;
5141 } 5141 }
5142 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 5142 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
5143 5143
5144 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 5144 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
5145 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 5145 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
5146 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 5146 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
5147 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 5147 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
5148 5148
5149 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5150 goto out_brelse; 5150 goto out_brelse;
5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5154 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5155 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
5156 cpu_to_le16(ei->i_file_acl >> 32); 5156 cpu_to_le16(ei->i_file_acl >> 32);
5157 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 5157 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
5158 ext4_isize_set(raw_inode, ei->i_disksize); 5158 ext4_isize_set(raw_inode, ei->i_disksize);
5159 if (ei->i_disksize > 0x7fffffffULL) { 5159 if (ei->i_disksize > 0x7fffffffULL) {
5160 struct super_block *sb = inode->i_sb; 5160 struct super_block *sb = inode->i_sb;
5161 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 5161 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
5162 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 5162 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
5163 EXT4_SB(sb)->s_es->s_rev_level == 5163 EXT4_SB(sb)->s_es->s_rev_level ==
5164 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 5164 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
5165 /* If this is the first large file 5165 /* If this is the first large file
5166 * created, add a flag to the superblock. 5166 * created, add a flag to the superblock.
5167 */ 5167 */
5168 err = ext4_journal_get_write_access(handle, 5168 err = ext4_journal_get_write_access(handle,
5169 EXT4_SB(sb)->s_sbh); 5169 EXT4_SB(sb)->s_sbh);
5170 if (err) 5170 if (err)
5171 goto out_brelse; 5171 goto out_brelse;
5172 ext4_update_dynamic_rev(sb); 5172 ext4_update_dynamic_rev(sb);
5173 EXT4_SET_RO_COMPAT_FEATURE(sb, 5173 EXT4_SET_RO_COMPAT_FEATURE(sb,
5174 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5174 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5175 sb->s_dirt = 1; 5175 sb->s_dirt = 1;
5176 ext4_handle_sync(handle); 5176 ext4_handle_sync(handle);
5177 err = ext4_handle_dirty_metadata(handle, NULL, 5177 err = ext4_handle_dirty_metadata(handle, NULL,
5178 EXT4_SB(sb)->s_sbh); 5178 EXT4_SB(sb)->s_sbh);
5179 } 5179 }
5180 } 5180 }
5181 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 5181 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
5182 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 5182 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
5183 if (old_valid_dev(inode->i_rdev)) { 5183 if (old_valid_dev(inode->i_rdev)) {
5184 raw_inode->i_block[0] = 5184 raw_inode->i_block[0] =
5185 cpu_to_le32(old_encode_dev(inode->i_rdev)); 5185 cpu_to_le32(old_encode_dev(inode->i_rdev));
5186 raw_inode->i_block[1] = 0; 5186 raw_inode->i_block[1] = 0;
5187 } else { 5187 } else {
5188 raw_inode->i_block[0] = 0; 5188 raw_inode->i_block[0] = 0;
5189 raw_inode->i_block[1] = 5189 raw_inode->i_block[1] =
5190 cpu_to_le32(new_encode_dev(inode->i_rdev)); 5190 cpu_to_le32(new_encode_dev(inode->i_rdev));
5191 raw_inode->i_block[2] = 0; 5191 raw_inode->i_block[2] = 0;
5192 } 5192 }
5193 } else 5193 } else
5194 for (block = 0; block < EXT4_N_BLOCKS; block++) 5194 for (block = 0; block < EXT4_N_BLOCKS; block++)
5195 raw_inode->i_block[block] = ei->i_data[block]; 5195 raw_inode->i_block[block] = ei->i_data[block];
5196 5196
5197 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 5197 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
5198 if (ei->i_extra_isize) { 5198 if (ei->i_extra_isize) {
5199 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 5199 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
5200 raw_inode->i_version_hi = 5200 raw_inode->i_version_hi =
5201 cpu_to_le32(inode->i_version >> 32); 5201 cpu_to_le32(inode->i_version >> 32);
5202 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5202 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
5203 } 5203 }
5204 5204
5205 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5205 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5206 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 5206 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5207 if (!err) 5207 if (!err)
5208 err = rc; 5208 err = rc;
5209 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 5209 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5210 5210
5211 ext4_update_inode_fsync_trans(handle, inode, 0); 5211 ext4_update_inode_fsync_trans(handle, inode, 0);
5212 out_brelse: 5212 out_brelse:
5213 brelse(bh); 5213 brelse(bh);
5214 ext4_std_error(inode->i_sb, err); 5214 ext4_std_error(inode->i_sb, err);
5215 return err; 5215 return err;
5216 } 5216 }
5217 5217
5218 /* 5218 /*
5219 * ext4_write_inode() 5219 * ext4_write_inode()
5220 * 5220 *
5221 * We are called from a few places: 5221 * We are called from a few places:
5222 * 5222 *
5223 * - Within generic_file_write() for O_SYNC files. 5223 * - Within generic_file_write() for O_SYNC files.
5224 * Here, there will be no transaction running. We wait for any running 5224 * Here, there will be no transaction running. We wait for any running
5225 * transaction to commit. 5225 * transaction to commit.
5226 * 5226 *
5227 * - Within sys_sync(), kupdate and such. 5227 * - Within sys_sync(), kupdate and such.
5228 * We wait on commit, if told to. 5228 * We wait on commit, if told to.
5229 * 5229 *
5230 * - Within prune_icache() (PF_MEMALLOC == true) 5230 * - Within prune_icache() (PF_MEMALLOC == true)
5231 * Here we simply return. We can't afford to block kswapd on the 5231 * Here we simply return. We can't afford to block kswapd on the
5232 * journal commit. 5232 * journal commit.
5233 * 5233 *
5234 * In all cases it is actually safe for us to return without doing anything, 5234 * In all cases it is actually safe for us to return without doing anything,
5235 * because the inode has been copied into a raw inode buffer in 5235 * because the inode has been copied into a raw inode buffer in
5236 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 5236 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
5237 * knfsd. 5237 * knfsd.
5238 * 5238 *
5239 * Note that we are absolutely dependent upon all inode dirtiers doing the 5239 * Note that we are absolutely dependent upon all inode dirtiers doing the
5240 * right thing: they *must* call mark_inode_dirty() after dirtying info in 5240 * right thing: they *must* call mark_inode_dirty() after dirtying info in
5241 * which we are interested. 5241 * which we are interested.
5242 * 5242 *
5243 * It would be a bug for them to not do this. The code: 5243 * It would be a bug for them to not do this. The code:
5244 * 5244 *
5245 * mark_inode_dirty(inode) 5245 * mark_inode_dirty(inode)
5246 * stuff(); 5246 * stuff();
5247 * inode->i_size = expr; 5247 * inode->i_size = expr;
5248 * 5248 *
5249 * is in error because a kswapd-driven write_inode() could occur while 5249 * is in error because a kswapd-driven write_inode() could occur while
5250 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5250 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5251 * will no longer be on the superblock's dirty inode list. 5251 * will no longer be on the superblock's dirty inode list.
5252 */ 5252 */
5253 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) 5253 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5254 { 5254 {
5255 int err; 5255 int err;
5256 5256
5257 if (current->flags & PF_MEMALLOC) 5257 if (current->flags & PF_MEMALLOC)
5258 return 0; 5258 return 0;
5259 5259
5260 if (EXT4_SB(inode->i_sb)->s_journal) { 5260 if (EXT4_SB(inode->i_sb)->s_journal) {
5261 if (ext4_journal_current_handle()) { 5261 if (ext4_journal_current_handle()) {
5262 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 5262 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
5263 dump_stack(); 5263 dump_stack();
5264 return -EIO; 5264 return -EIO;
5265 } 5265 }
5266 5266
5267 if (wbc->sync_mode != WB_SYNC_ALL) 5267 if (wbc->sync_mode != WB_SYNC_ALL)
5268 return 0; 5268 return 0;
5269 5269
5270 err = ext4_force_commit(inode->i_sb); 5270 err = ext4_force_commit(inode->i_sb);
5271 } else { 5271 } else {
5272 struct ext4_iloc iloc; 5272 struct ext4_iloc iloc;
5273 5273
5274 err = __ext4_get_inode_loc(inode, &iloc, 0); 5274 err = __ext4_get_inode_loc(inode, &iloc, 0);
5275 if (err) 5275 if (err)
5276 return err; 5276 return err;
5277 if (wbc->sync_mode == WB_SYNC_ALL) 5277 if (wbc->sync_mode == WB_SYNC_ALL)
5278 sync_dirty_buffer(iloc.bh); 5278 sync_dirty_buffer(iloc.bh);
5279 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5279 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5280 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, 5280 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5281 "IO error syncing inode"); 5281 "IO error syncing inode");
5282 err = -EIO; 5282 err = -EIO;
5283 } 5283 }
5284 brelse(iloc.bh); 5284 brelse(iloc.bh);
5285 } 5285 }
5286 return err; 5286 return err;
5287 } 5287 }
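
The comment above ext4_write_inode() shows the buggy ordering; for contrast, a minimal sketch of the rule stated there, with expr and stuff() as the same placeholders (an illustration, not ext4 code):

	inode->i_size = expr;		/* dirty the in-core fields first */
	stuff();
	mark_inode_dirty(inode);	/* then mark dirty, so a concurrent
					 * write_inode() never snapshots the
					 * inode before i_size is updated */
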
5288 5288
5289 /* 5289 /*
5290 * ext4_setattr() 5290 * ext4_setattr()
5291 * 5291 *
5292 * Called from notify_change. 5292 * Called from notify_change.
5293 * 5293 *
5294 * We want to trap VFS attempts to truncate the file as soon as 5294 * We want to trap VFS attempts to truncate the file as soon as
5295 * possible. In particular, we want to make sure that when the VFS 5295 * possible. In particular, we want to make sure that when the VFS
5296 * shrinks i_size, we put the inode on the orphan list and modify 5296 * shrinks i_size, we put the inode on the orphan list and modify
5297 * i_disksize immediately, so that during the subsequent flushing of 5297 * i_disksize immediately, so that during the subsequent flushing of
5298 * dirty pages and freeing of disk blocks, we can guarantee that any 5298 * dirty pages and freeing of disk blocks, we can guarantee that any
5299 * commit will leave the blocks being flushed in an unused state on 5299 * commit will leave the blocks being flushed in an unused state on
5300 * disk. (On recovery, the inode will get truncated and the blocks will 5300 * disk. (On recovery, the inode will get truncated and the blocks will
5301 * be freed, so we have a strong guarantee that no future commit will 5301 * be freed, so we have a strong guarantee that no future commit will
5302 * leave these blocks visible to the user.) 5302 * leave these blocks visible to the user.)
5303 * 5303 *
5304 * Another thing we have to ensure is that if we are in ordered mode 5304 * Another thing we have to ensure is that if we are in ordered mode
5305 * and the inode is still attached to the committing transaction, we must 5305 * and the inode is still attached to the committing transaction, we must
5306 * start writeout of all the dirty pages which are being truncated. 5306 * start writeout of all the dirty pages which are being truncated.
5307 * This way we are sure that all the data written in the previous 5307 * This way we are sure that all the data written in the previous
5308 * transaction are already on disk (truncate waits for pages under 5308 * transaction are already on disk (truncate waits for pages under
5309 * writeback). 5309 * writeback).
5310 * 5310 *
5311 * Called with inode->i_mutex down. 5311 * Called with inode->i_mutex down.
5312 */ 5312 */
5313 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 5313 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5314 { 5314 {
5315 struct inode *inode = dentry->d_inode; 5315 struct inode *inode = dentry->d_inode;
5316 int error, rc = 0; 5316 int error, rc = 0;
5317 int orphan = 0; 5317 int orphan = 0;
5318 const unsigned int ia_valid = attr->ia_valid; 5318 const unsigned int ia_valid = attr->ia_valid;
5319 5319
5320 error = inode_change_ok(inode, attr); 5320 error = inode_change_ok(inode, attr);
5321 if (error) 5321 if (error)
5322 return error; 5322 return error;
5323 5323
5324 if (is_quota_modification(inode, attr)) 5324 if (is_quota_modification(inode, attr))
5325 dquot_initialize(inode); 5325 dquot_initialize(inode);
5326 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5326 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5327 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5327 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5328 handle_t *handle; 5328 handle_t *handle;
5329 5329
5330 /* (user+group)*(old+new) structure, inode write (sb, 5330 /* (user+group)*(old+new) structure, inode write (sb,
5331 * inode block, ? - but truncate inode update has it) */ 5331 * inode block, ? - but truncate inode update has it) */
5332 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 5332 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5333 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); 5333 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5334 if (IS_ERR(handle)) { 5334 if (IS_ERR(handle)) {
5335 error = PTR_ERR(handle); 5335 error = PTR_ERR(handle);
5336 goto err_out; 5336 goto err_out;
5337 } 5337 }
5338 error = dquot_transfer(inode, attr); 5338 error = dquot_transfer(inode, attr);
5339 if (error) { 5339 if (error) {
5340 ext4_journal_stop(handle); 5340 ext4_journal_stop(handle);
5341 return error; 5341 return error;
5342 } 5342 }
5343 /* Update corresponding info in inode so that everything is in 5343 /* Update corresponding info in inode so that everything is in
5344 * one transaction */ 5344 * one transaction */
5345 if (attr->ia_valid & ATTR_UID) 5345 if (attr->ia_valid & ATTR_UID)
5346 inode->i_uid = attr->ia_uid; 5346 inode->i_uid = attr->ia_uid;
5347 if (attr->ia_valid & ATTR_GID) 5347 if (attr->ia_valid & ATTR_GID)
5348 inode->i_gid = attr->ia_gid; 5348 inode->i_gid = attr->ia_gid;
5349 error = ext4_mark_inode_dirty(handle, inode); 5349 error = ext4_mark_inode_dirty(handle, inode);
5350 ext4_journal_stop(handle); 5350 ext4_journal_stop(handle);
5351 } 5351 }
5352 5352
5353 if (attr->ia_valid & ATTR_SIZE) { 5353 if (attr->ia_valid & ATTR_SIZE) {
5354 inode_dio_wait(inode);
5355
5354 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 5356 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5355 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5357 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5356 5358
5357 if (attr->ia_size > sbi->s_bitmap_maxbytes) 5359 if (attr->ia_size > sbi->s_bitmap_maxbytes)
5358 return -EFBIG; 5360 return -EFBIG;
5359 } 5361 }
5360 } 5362 }
5361 5363
5362 if (S_ISREG(inode->i_mode) && 5364 if (S_ISREG(inode->i_mode) &&
5363 attr->ia_valid & ATTR_SIZE && 5365 attr->ia_valid & ATTR_SIZE &&
5364 (attr->ia_size < inode->i_size)) { 5366 (attr->ia_size < inode->i_size)) {
5365 handle_t *handle; 5367 handle_t *handle;
5366 5368
5367 handle = ext4_journal_start(inode, 3); 5369 handle = ext4_journal_start(inode, 3);
5368 if (IS_ERR(handle)) { 5370 if (IS_ERR(handle)) {
5369 error = PTR_ERR(handle); 5371 error = PTR_ERR(handle);
5370 goto err_out; 5372 goto err_out;
5371 } 5373 }
5372 if (ext4_handle_valid(handle)) { 5374 if (ext4_handle_valid(handle)) {
5373 error = ext4_orphan_add(handle, inode); 5375 error = ext4_orphan_add(handle, inode);
5374 orphan = 1; 5376 orphan = 1;
5375 } 5377 }
5376 EXT4_I(inode)->i_disksize = attr->ia_size; 5378 EXT4_I(inode)->i_disksize = attr->ia_size;
5377 rc = ext4_mark_inode_dirty(handle, inode); 5379 rc = ext4_mark_inode_dirty(handle, inode);
5378 if (!error) 5380 if (!error)
5379 error = rc; 5381 error = rc;
5380 ext4_journal_stop(handle); 5382 ext4_journal_stop(handle);
5381 5383
5382 if (ext4_should_order_data(inode)) { 5384 if (ext4_should_order_data(inode)) {
5383 error = ext4_begin_ordered_truncate(inode, 5385 error = ext4_begin_ordered_truncate(inode,
5384 attr->ia_size); 5386 attr->ia_size);
5385 if (error) { 5387 if (error) {
5386 /* Do as much error cleanup as possible */ 5388 /* Do as much error cleanup as possible */
5387 handle = ext4_journal_start(inode, 3); 5389 handle = ext4_journal_start(inode, 3);
5388 if (IS_ERR(handle)) { 5390 if (IS_ERR(handle)) {
5389 ext4_orphan_del(NULL, inode); 5391 ext4_orphan_del(NULL, inode);
5390 goto err_out; 5392 goto err_out;
5391 } 5393 }
5392 ext4_orphan_del(handle, inode); 5394 ext4_orphan_del(handle, inode);
5393 orphan = 0; 5395 orphan = 0;
5394 ext4_journal_stop(handle); 5396 ext4_journal_stop(handle);
5395 goto err_out; 5397 goto err_out;
5396 } 5398 }
5397 } 5399 }
5398 } 5400 }
5399 5401
5400 if (attr->ia_valid & ATTR_SIZE) { 5402 if (attr->ia_valid & ATTR_SIZE) {
5401 if (attr->ia_size != i_size_read(inode)) { 5403 if (attr->ia_size != i_size_read(inode)) {
5402 truncate_setsize(inode, attr->ia_size); 5404 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode); 5405 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 5406 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode); 5407 ext4_truncate(inode);
5406 } 5408 }
5407 5409
5408 if (!rc) { 5410 if (!rc) {
5409 setattr_copy(inode, attr); 5411 setattr_copy(inode, attr);
5410 mark_inode_dirty(inode); 5412 mark_inode_dirty(inode);
5411 } 5413 }
5412 5414
5413 /* 5415 /*
5414 * If the call to ext4_truncate failed to get a transaction handle at 5416 * If the call to ext4_truncate failed to get a transaction handle at
5415 * all, we need to clean up the in-core orphan list manually. 5417 * all, we need to clean up the in-core orphan list manually.
5416 */ 5418 */
5417 if (orphan && inode->i_nlink) 5419 if (orphan && inode->i_nlink)
5418 ext4_orphan_del(NULL, inode); 5420 ext4_orphan_del(NULL, inode);
5419 5421
5420 if (!rc && (ia_valid & ATTR_MODE)) 5422 if (!rc && (ia_valid & ATTR_MODE))
5421 rc = ext4_acl_chmod(inode); 5423 rc = ext4_acl_chmod(inode);
5422 5424
5423 err_out: 5425 err_out:
5424 ext4_std_error(inode->i_sb, error); 5426 ext4_std_error(inode->i_sb, error);
5425 if (!error) 5427 if (!error)
5426 error = rc; 5428 error = rc;
5427 return error; 5429 return error;
5428 } 5430 }
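
The inode_dio_wait() call added above is the point of this commit: the wait for in-flight direct I/O moves out of the VFS and into ->setattr, where it can sit under filesystem-specific locking. A minimal sketch of the resulting pattern for a hypothetical filesystem (the examplefs_* name is invented; the helpers are the kernel ones used above):

static int examplefs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		/* i_mutex is already held by the VFS, so no new direct
		 * I/O can be issued; drain whatever is still in flight
		 * before changing the size */
		inode_dio_wait(inode);
		truncate_setsize(inode, attr->ia_size);
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
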
5429 5431
5430 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 5432 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5431 struct kstat *stat) 5433 struct kstat *stat)
5432 { 5434 {
5433 struct inode *inode; 5435 struct inode *inode;
5434 unsigned long delalloc_blocks; 5436 unsigned long delalloc_blocks;
5435 5437
5436 inode = dentry->d_inode; 5438 inode = dentry->d_inode;
5437 generic_fillattr(inode, stat); 5439 generic_fillattr(inode, stat);
5438 5440
5439 /* 5441 /*
5440 * We can't update i_blocks if the block allocation is delayed; 5442 * We can't update i_blocks if the block allocation is delayed;
5441 * otherwise, in the case of a system crash before the real block 5443 * otherwise, in the case of a system crash before the real block
5442 * allocation is done, we would have i_blocks inconsistent with 5444 * allocation is done, we would have i_blocks inconsistent with
5443 * on-disk file blocks. 5445 * on-disk file blocks.
5444 * We always keep i_blocks updated together with real 5446 * We always keep i_blocks updated together with real
5445 * allocation. But so as not to confuse userspace, stat 5447 * allocation. But so as not to confuse userspace, stat
5446 * will return the blocks that include the delayed allocation 5448 * will return the blocks that include the delayed allocation
5447 * blocks for this file. 5449 * blocks for this file.
5448 */ 5450 */
5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5451 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5450 5452
5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5453 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5452 return 0; 5454 return 0;
5453 } 5455 }
5454 5456
5455 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 5457 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5456 int chunk) 5458 int chunk)
5457 { 5459 {
5458 int indirects; 5460 int indirects;
5459 5461
5460 /* if nrblocks are contiguous */ 5462 /* if nrblocks are contiguous */
5461 if (chunk) { 5463 if (chunk) {
5462 /* 5464 /*
5463 * With N contiguous data blocks, we need at most 5465 * With N contiguous data blocks, we need at most
5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, 5466 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5465 * 2 dindirect blocks, and 1 tindirect block 5467 * 2 dindirect blocks, and 1 tindirect block
5466 */ 5468 */
5467 return DIV_ROUND_UP(nrblocks, 5469 return DIV_ROUND_UP(nrblocks,
5468 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; 5470 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5469 } 5471 }
5470 /* 5472 /*
5471 * if nrblocks are not contiguous, then in the worst case each block touches 5473 * if nrblocks are not contiguous, then in the worst case each block touches
5472 * an indirect block, and each indirect block touches a double indirect 5474 * an indirect block, and each indirect block touches a double indirect
5473 * block, plus a triple indirect block 5475 * block, plus a triple indirect block
5474 */ 5476 */
5475 indirects = nrblocks * 2 + 1; 5477 indirects = nrblocks * 2 + 1;
5476 return indirects; 5478 return indirects;
5477 } 5479 }
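
As a worked example, assuming a 4 KiB block size (so EXT4_ADDR_PER_BLOCK(inode->i_sb) is 1024): mapping 100 contiguous blocks is charged DIV_ROUND_UP(100, 1024) + 4 = 5 index blocks, while 100 discontiguous blocks are charged 100 * 2 + 1 = 201, reflecting the pessimistic assumption that every data block touches its own indirect and double indirect block.
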
5478 5480
5479 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5481 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5480 { 5482 {
5481 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5483 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5482 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5484 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5483 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5485 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5484 } 5486 }
5485 5487
5486 /* 5488 /*
5487 * Account for index blocks, block group bitmaps and block group 5489 * Account for index blocks, block group bitmaps and block group
5488 * descriptor blocks if data blocks and index blocks are modified; 5490 * descriptor blocks if data blocks and index blocks are modified;
5489 * in the worst case, the index blocks spread over different block groups 5491 * in the worst case, the index blocks spread over different block groups
5490 * 5492 *
5491 * If data blocks are discontiguous, they may spread over 5493 * If data blocks are discontiguous, they may spread over
5492 * different block groups too. If they are contiguous, with flexbg 5494 * different block groups too. If they are contiguous, with flexbg
5493 * they could still cross a block group boundary. 5495 * they could still cross a block group boundary.
5494 * 5496 *
5495 * Also account for superblock, inode, quota and xattr blocks 5497 * Also account for superblock, inode, quota and xattr blocks
5496 */ 5498 */
5497 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5499 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5498 { 5500 {
5499 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5501 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5500 int gdpblocks; 5502 int gdpblocks;
5501 int idxblocks; 5503 int idxblocks;
5502 int ret = 0; 5504 int ret = 0;
5503 5505
5504 /* 5506 /*
5505 * How many index blocks do we need to touch to modify nrblocks? 5507 * How many index blocks do we need to touch to modify nrblocks?
5506 * The "chunk" flag indicates whether the nrblocks are 5508 * The "chunk" flag indicates whether the nrblocks are
5507 * physically contiguous on disk. 5509 * physically contiguous on disk.
5508 * 5510 *
5509 * Direct IO and fallocate call get_block to allocate 5511 * Direct IO and fallocate call get_block to allocate
5510 * a single extent at a time, so they can set the "chunk" flag 5512 * a single extent at a time, so they can set the "chunk" flag
5511 */ 5513 */
5512 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 5514 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
5513 5515
5514 ret = idxblocks; 5516 ret = idxblocks;
5515 5517
5516 /* 5518 /*
5517 * Now let's see how many group bitmaps and group descriptors need 5519 * Now let's see how many group bitmaps and group descriptors need
5518 * to be accounted for 5520 * to be accounted for
5519 */ 5521 */
5520 groups = idxblocks; 5522 groups = idxblocks;
5521 if (chunk) 5523 if (chunk)
5522 groups += 1; 5524 groups += 1;
5523 else 5525 else
5524 groups += nrblocks; 5526 groups += nrblocks;
5525 5527
5526 gdpblocks = groups; 5528 gdpblocks = groups;
5527 if (groups > ngroups) 5529 if (groups > ngroups)
5528 groups = ngroups; 5530 groups = ngroups;
5529 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 5531 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5530 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 5532 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5531 5533
5532 /* bitmaps and block group descriptor blocks */ 5534 /* bitmaps and block group descriptor blocks */
5533 ret += groups + gdpblocks; 5535 ret += groups + gdpblocks;
5534 5536
5535 /* Blocks for super block, inode, quota and xattr blocks */ 5537 /* Blocks for super block, inode, quota and xattr blocks */
5536 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 5538 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5537 5539
5538 return ret; 5540 return ret;
5539 } 5541 }
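
Continuing the example above for an indirect-mapped inode (chunk set, idxblocks = 5), and assuming the filesystem has more than six block groups and group descriptor blocks: groups = 5 + 1 = 6, gdpblocks = 6, so the total is 5 + 6 + 6 + EXT4_META_TRANS_BLOCKS(inode->i_sb) credits.
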
5540 5542
5541 /* 5543 /*
5542 * Calculate the total number of credits to reserve to fit 5544 * Calculate the total number of credits to reserve to fit
5544 * the modification of a single page into a single transaction, 5546 * the modification of a single page into a single transaction,
5544 * which may include multiple chunks of block allocations. 5546 * which may include multiple chunks of block allocations.
5545 * 5547 *
5546 * This could be called via ext4_write_begin() 5548 * This could be called via ext4_write_begin()
5547 * 5549 *
5548 * We need to consider the worst case, when 5550 * We need to consider the worst case, when
5549 * there is one new block per extent. 5551 * there is one new block per extent.
5550 */ 5552 */
5551 int ext4_writepage_trans_blocks(struct inode *inode) 5553 int ext4_writepage_trans_blocks(struct inode *inode)
5552 { 5554 {
5553 int bpp = ext4_journal_blocks_per_page(inode); 5555 int bpp = ext4_journal_blocks_per_page(inode);
5554 int ret; 5556 int ret;
5555 5557
5556 ret = ext4_meta_trans_blocks(inode, bpp, 0); 5558 ret = ext4_meta_trans_blocks(inode, bpp, 0);
5557 5559
5558 /* Account for data blocks for journalled mode */ 5560 /* Account for data blocks for journalled mode */
5559 if (ext4_should_journal_data(inode)) 5561 if (ext4_should_journal_data(inode))
5560 ret += bpp; 5562 ret += bpp;
5561 return ret; 5563 return ret;
5562 } 5564 }
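
As a rough illustration: with 4 KiB pages on a 4 KiB-block journaled filesystem, ext4_journal_blocks_per_page() returns 1, so the reservation is ext4_meta_trans_blocks(inode, 1, 0) credits, plus one extra credit in data=journal mode for the data block itself.
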
5563 5565
5564 /* 5566 /*
5565 * Calculate the journal credits for a chunk of data modification. 5567 * Calculate the journal credits for a chunk of data modification.
5566 * 5568 *
5567 * This is called from DIO, fallocate or whoever else calls 5569 * This is called from DIO, fallocate or whoever else calls
5568 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. 5570 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5569 * 5571 *
5570 * Journal buffers for data blocks are not included here, as DIO 5572 * Journal buffers for data blocks are not included here, as DIO
5571 * and fallocate do not need to journal data buffers. 5573 * and fallocate do not need to journal data buffers.
5572 */ 5574 */
5573 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 5575 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5574 { 5576 {
5575 return ext4_meta_trans_blocks(inode, nrblocks, 1); 5577 return ext4_meta_trans_blocks(inode, nrblocks, 1);
5576 } 5578 }
5577 5579
5578 /* 5580 /*
5579 * The caller must have previously called ext4_reserve_inode_write(). 5581 * The caller must have previously called ext4_reserve_inode_write().
5580 * Given this, we know that the caller already has write access to iloc->bh. 5582 * Given this, we know that the caller already has write access to iloc->bh.
5581 */ 5583 */
5582 int ext4_mark_iloc_dirty(handle_t *handle, 5584 int ext4_mark_iloc_dirty(handle_t *handle,
5583 struct inode *inode, struct ext4_iloc *iloc) 5585 struct inode *inode, struct ext4_iloc *iloc)
5584 { 5586 {
5585 int err = 0; 5587 int err = 0;
5586 5588
5587 if (test_opt(inode->i_sb, I_VERSION)) 5589 if (test_opt(inode->i_sb, I_VERSION))
5588 inode_inc_iversion(inode); 5590 inode_inc_iversion(inode);
5589 5591
5590 /* the do_update_inode consumes one bh->b_count */ 5592 /* the do_update_inode consumes one bh->b_count */
5591 get_bh(iloc->bh); 5593 get_bh(iloc->bh);
5592 5594
5593 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5595 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5594 err = ext4_do_update_inode(handle, inode, iloc); 5596 err = ext4_do_update_inode(handle, inode, iloc);
5595 put_bh(iloc->bh); 5597 put_bh(iloc->bh);
5596 return err; 5598 return err;
5597 } 5599 }
5598 5600
5599 /* 5601 /*
5600 * On success, we end up with an outstanding reference count against 5602 * On success, we end up with an outstanding reference count against
5601 * iloc->bh. This _must_ be cleaned up later. 5603 * iloc->bh. This _must_ be cleaned up later.
5602 */ 5604 */
5603 5605
5604 int 5606 int
5605 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5607 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5606 struct ext4_iloc *iloc) 5608 struct ext4_iloc *iloc)
5607 { 5609 {
5608 int err; 5610 int err;
5609 5611
5610 err = ext4_get_inode_loc(inode, iloc); 5612 err = ext4_get_inode_loc(inode, iloc);
5611 if (!err) { 5613 if (!err) {
5612 BUFFER_TRACE(iloc->bh, "get_write_access"); 5614 BUFFER_TRACE(iloc->bh, "get_write_access");
5613 err = ext4_journal_get_write_access(handle, iloc->bh); 5615 err = ext4_journal_get_write_access(handle, iloc->bh);
5614 if (err) { 5616 if (err) {
5615 brelse(iloc->bh); 5617 brelse(iloc->bh);
5616 iloc->bh = NULL; 5618 iloc->bh = NULL;
5617 } 5619 }
5618 } 5620 }
5619 ext4_std_error(inode->i_sb, err); 5621 ext4_std_error(inode->i_sb, err);
5620 return err; 5622 return err;
5621 } 5623 }
5622 5624
5623 /* 5625 /*
5624 * Expand an inode by new_extra_isize bytes. 5626 * Expand an inode by new_extra_isize bytes.
5625 * Returns 0 on success or negative error number on failure. 5627 * Returns 0 on success or negative error number on failure.
5626 */ 5628 */
5627 static int ext4_expand_extra_isize(struct inode *inode, 5629 static int ext4_expand_extra_isize(struct inode *inode,
5628 unsigned int new_extra_isize, 5630 unsigned int new_extra_isize,
5629 struct ext4_iloc iloc, 5631 struct ext4_iloc iloc,
5630 handle_t *handle) 5632 handle_t *handle)
5631 { 5633 {
5632 struct ext4_inode *raw_inode; 5634 struct ext4_inode *raw_inode;
5633 struct ext4_xattr_ibody_header *header; 5635 struct ext4_xattr_ibody_header *header;
5634 5636
5635 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5637 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5636 return 0; 5638 return 0;
5637 5639
5638 raw_inode = ext4_raw_inode(&iloc); 5640 raw_inode = ext4_raw_inode(&iloc);
5639 5641
5640 header = IHDR(inode, raw_inode); 5642 header = IHDR(inode, raw_inode);
5641 5643
5642 /* No extended attributes present */ 5644 /* No extended attributes present */
5643 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5645 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5644 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5646 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5645 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5647 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5646 new_extra_isize); 5648 new_extra_isize);
5647 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5649 EXT4_I(inode)->i_extra_isize = new_extra_isize;
5648 return 0; 5650 return 0;
5649 } 5651 }
5650 5652
5651 /* try to expand with EAs present */ 5653 /* try to expand with EAs present */
5652 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 5654 return ext4_expand_extra_isize_ea(inode, new_extra_isize,
5653 raw_inode, handle); 5655 raw_inode, handle);
5654 } 5656 }
5655 5657
5656 /* 5658 /*
5657 * What we do here is to mark the in-core inode as clean with respect to inode 5659 * What we do here is to mark the in-core inode as clean with respect to inode
5658 * dirtiness (it may still be data-dirty). 5660 * dirtiness (it may still be data-dirty).
5659 * This means that the in-core inode may be reaped by prune_icache 5661 * This means that the in-core inode may be reaped by prune_icache
5660 * without having to perform any I/O. This is a very good thing, 5662 * without having to perform any I/O. This is a very good thing,
5661 * because *any* task may call prune_icache - even ones which 5663 * because *any* task may call prune_icache - even ones which
5662 * have a transaction open against a different journal. 5664 * have a transaction open against a different journal.
5663 * 5665 *
5664 * Is this cheating? Not really. Sure, we haven't written the 5666 * Is this cheating? Not really. Sure, we haven't written the
5665 * inode out, but prune_icache isn't a user-visible syncing function. 5667 * inode out, but prune_icache isn't a user-visible syncing function.
5666 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 5668 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5667 * we start and wait on commits. 5669 * we start and wait on commits.
5668 * 5670 *
5669 * Is this efficient/effective? Well, we're being nice to the system 5671 * Is this efficient/effective? Well, we're being nice to the system
5670 * by cleaning up our inodes proactively so they can be reaped 5672 * by cleaning up our inodes proactively so they can be reaped
5671 * without I/O. But we are potentially leaving up to five seconds' 5673 * without I/O. But we are potentially leaving up to five seconds'
5672 * worth of inodes floating about which prune_icache wants us to 5674 * worth of inodes floating about which prune_icache wants us to
5673 * write out. One way to fix that would be to get prune_icache() 5675 * write out. One way to fix that would be to get prune_icache()
5674 * to do a write_super() to free up some memory. It has the desired 5676 * to do a write_super() to free up some memory. It has the desired
5675 * effect. 5677 * effect.
5676 */ 5678 */
5677 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 5679 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5678 { 5680 {
5679 struct ext4_iloc iloc; 5681 struct ext4_iloc iloc;
5680 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5682 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5681 static unsigned int mnt_count; 5683 static unsigned int mnt_count;
5682 int err, ret; 5684 int err, ret;
5683 5685
5684 might_sleep(); 5686 might_sleep();
5685 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5687 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5686 err = ext4_reserve_inode_write(handle, inode, &iloc); 5688 err = ext4_reserve_inode_write(handle, inode, &iloc);
5687 if (ext4_handle_valid(handle) && 5689 if (ext4_handle_valid(handle) &&
5688 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5690 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5689 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5691 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5690 /* 5692 /*
5691 * We need extra buffer credits since we may write into EA block 5693 * We need extra buffer credits since we may write into EA block
5692 * with this same handle. If journal_extend fails, then it will 5694 * with this same handle. If journal_extend fails, then it will
5693 * only result in a minor loss of functionality for that inode. 5695 * only result in a minor loss of functionality for that inode.
5694 * If this is felt to be critical, then e2fsck should be run to 5696 * If this is felt to be critical, then e2fsck should be run to
5695 * force a large enough s_min_extra_isize. 5697 * force a large enough s_min_extra_isize.
5696 */ 5698 */
5697 if ((jbd2_journal_extend(handle, 5699 if ((jbd2_journal_extend(handle,
5698 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 5700 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
5699 ret = ext4_expand_extra_isize(inode, 5701 ret = ext4_expand_extra_isize(inode,
5700 sbi->s_want_extra_isize, 5702 sbi->s_want_extra_isize,
5701 iloc, handle); 5703 iloc, handle);
5702 if (ret) { 5704 if (ret) {
5703 ext4_set_inode_state(inode, 5705 ext4_set_inode_state(inode,
5704 EXT4_STATE_NO_EXPAND); 5706 EXT4_STATE_NO_EXPAND);
5705 if (mnt_count != 5707 if (mnt_count !=
5706 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5708 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5707 ext4_warning(inode->i_sb, 5709 ext4_warning(inode->i_sb,
5708 "Unable to expand inode %lu. Delete" 5710 "Unable to expand inode %lu. Delete"
5709 " some EAs or run e2fsck.", 5711 " some EAs or run e2fsck.",
5710 inode->i_ino); 5712 inode->i_ino);
5711 mnt_count = 5713 mnt_count =
5712 le16_to_cpu(sbi->s_es->s_mnt_count); 5714 le16_to_cpu(sbi->s_es->s_mnt_count);
5713 } 5715 }
5714 } 5716 }
5715 } 5717 }
5716 } 5718 }
5717 if (!err) 5719 if (!err)
5718 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 5720 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5719 return err; 5721 return err;
5720 } 5722 }
5721 5723
5722 /* 5724 /*
5723 * ext4_dirty_inode() is called from __mark_inode_dirty() 5725 * ext4_dirty_inode() is called from __mark_inode_dirty()
5724 * 5726 *
5725 * We're really interested in the case where a file is being extended. 5727 * We're really interested in the case where a file is being extended.
5726 * i_size has been changed by generic_commit_write() and we thus need 5728 * i_size has been changed by generic_commit_write() and we thus need
5727 * to include the updated inode in the current transaction. 5729 * to include the updated inode in the current transaction.
5728 * 5730 *
5729 * Also, dquot_alloc_block() will always dirty the inode when blocks 5731 * Also, dquot_alloc_block() will always dirty the inode when blocks
5730 * are allocated to the file. 5732 * are allocated to the file.
5731 * 5733 *
5732 * If the inode is marked synchronous, we don't honour that here - doing 5734 * If the inode is marked synchronous, we don't honour that here - doing
5733 * so would cause a commit on atime updates, which we don't bother doing. 5735 * so would cause a commit on atime updates, which we don't bother doing.
5734 * We handle synchronous inodes at the highest possible level. 5736 * We handle synchronous inodes at the highest possible level.
5735 */ 5737 */
5736 void ext4_dirty_inode(struct inode *inode, int flags) 5738 void ext4_dirty_inode(struct inode *inode, int flags)
5737 { 5739 {
5738 handle_t *handle; 5740 handle_t *handle;
5739 5741
5740 handle = ext4_journal_start(inode, 2); 5742 handle = ext4_journal_start(inode, 2);
5741 if (IS_ERR(handle)) 5743 if (IS_ERR(handle))
5742 goto out; 5744 goto out;
5743 5745
5744 ext4_mark_inode_dirty(handle, inode); 5746 ext4_mark_inode_dirty(handle, inode);
5745 5747
5746 ext4_journal_stop(handle); 5748 ext4_journal_stop(handle);
5747 out: 5749 out:
5748 return; 5750 return;
5749 } 5751 }
5750 5752
5751 #if 0 5753 #if 0
5752 /* 5754 /*
5753 * Bind an inode's backing buffer_head into this transaction, to prevent 5755 * Bind an inode's backing buffer_head into this transaction, to prevent
5754 * it from being flushed to disk early. Unlike 5756 * it from being flushed to disk early. Unlike
5755 * ext4_reserve_inode_write, this leaves behind no bh reference and 5757 * ext4_reserve_inode_write, this leaves behind no bh reference and
5756 * returns no iloc structure, so the caller needs to repeat the iloc 5758 * returns no iloc structure, so the caller needs to repeat the iloc
5757 * lookup to mark the inode dirty later. 5759 * lookup to mark the inode dirty later.
5758 */ 5760 */
5759 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5761 static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5760 { 5762 {
5761 struct ext4_iloc iloc; 5763 struct ext4_iloc iloc;
5762 5764
5763 int err = 0; 5765 int err = 0;
5764 if (handle) { 5766 if (handle) {
5765 err = ext4_get_inode_loc(inode, &iloc); 5767 err = ext4_get_inode_loc(inode, &iloc);
5766 if (!err) { 5768 if (!err) {
5767 BUFFER_TRACE(iloc.bh, "get_write_access"); 5769 BUFFER_TRACE(iloc.bh, "get_write_access");
5768 err = jbd2_journal_get_write_access(handle, iloc.bh); 5770 err = jbd2_journal_get_write_access(handle, iloc.bh);
5769 if (!err) 5771 if (!err)
5770 err = ext4_handle_dirty_metadata(handle, 5772 err = ext4_handle_dirty_metadata(handle,
5771 NULL, 5773 NULL,
5772 iloc.bh); 5774 iloc.bh);
5773 brelse(iloc.bh); 5775 brelse(iloc.bh);
5774 } 5776 }
5775 } 5777 }
5776 ext4_std_error(inode->i_sb, err); 5778 ext4_std_error(inode->i_sb, err);
5777 return err; 5779 return err;
5778 } 5780 }
5779 #endif 5781 #endif
5780 5782
5781 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5783 int ext4_change_inode_journal_flag(struct inode *inode, int val)
5782 { 5784 {
5783 journal_t *journal; 5785 journal_t *journal;
5784 handle_t *handle; 5786 handle_t *handle;
5785 int err; 5787 int err;
5786 5788
5787 /* 5789 /*
5788 * We have to be very careful here: changing a data block's 5790 * We have to be very careful here: changing a data block's
5789 * journaling status dynamically is dangerous. If we write a 5791 * journaling status dynamically is dangerous. If we write a
5790 * data block to the journal, change the status and then delete 5792 * data block to the journal, change the status and then delete
5791 * that block, we risk forgetting to revoke the old log record 5793 * that block, we risk forgetting to revoke the old log record
5792 * from the journal and so a subsequent replay can corrupt data. 5794 * from the journal and so a subsequent replay can corrupt data.
5793 * So, first we make sure that the journal is empty and that 5795 * So, first we make sure that the journal is empty and that
5794 * nobody is changing anything. 5796 * nobody is changing anything.
5795 */ 5797 */
5796 5798
5797 journal = EXT4_JOURNAL(inode); 5799 journal = EXT4_JOURNAL(inode);
5798 if (!journal) 5800 if (!journal)
5799 return 0; 5801 return 0;
5800 if (is_journal_aborted(journal)) 5802 if (is_journal_aborted(journal))
5801 return -EROFS; 5803 return -EROFS;
5802 5804
5803 jbd2_journal_lock_updates(journal); 5805 jbd2_journal_lock_updates(journal);
5804 jbd2_journal_flush(journal); 5806 jbd2_journal_flush(journal);
5805 5807
5806 /* 5808 /*
5807 * OK, there are no updates running now, and all cached data is 5809 * OK, there are no updates running now, and all cached data is
5808 * synced to disk. We are now in a completely consistent state 5810 * synced to disk. We are now in a completely consistent state
5809 * which doesn't have anything in the journal, and we know that 5811 * which doesn't have anything in the journal, and we know that
5810 * no filesystem updates are running, so it is safe to modify 5812 * no filesystem updates are running, so it is safe to modify
5811 * the inode's in-core data-journaling state flag now. 5813 * the inode's in-core data-journaling state flag now.
5812 */ 5814 */
5813 5815
5814 if (val) 5816 if (val)
5815 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5817 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5816 else 5818 else
5817 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5819 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5818 ext4_set_aops(inode); 5820 ext4_set_aops(inode);
5819 5821
5820 jbd2_journal_unlock_updates(journal); 5822 jbd2_journal_unlock_updates(journal);
5821 5823
5822 /* Finally we can mark the inode as dirty. */ 5824 /* Finally we can mark the inode as dirty. */
5823 5825
5824 handle = ext4_journal_start(inode, 1); 5826 handle = ext4_journal_start(inode, 1);
5825 if (IS_ERR(handle)) 5827 if (IS_ERR(handle))
5826 return PTR_ERR(handle); 5828 return PTR_ERR(handle);
5827 5829
5828 err = ext4_mark_inode_dirty(handle, inode); 5830 err = ext4_mark_inode_dirty(handle, inode);
5829 ext4_handle_sync(handle); 5831 ext4_handle_sync(handle);
5830 ext4_journal_stop(handle); 5832 ext4_journal_stop(handle);
5831 ext4_std_error(inode->i_sb, err); 5833 ext4_std_error(inode->i_sb, err);
5832 5834
5833 return err; 5835 return err;
5834 } 5836 }
5835 5837
5836 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5838 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5837 { 5839 {
5838 return !buffer_mapped(bh); 5840 return !buffer_mapped(bh);
5839 } 5841 }
5840 5842
5841 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 5843 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5842 { 5844 {
5843 struct page *page = vmf->page; 5845 struct page *page = vmf->page;
5844 loff_t size; 5846 loff_t size;
5845 unsigned long len; 5847 unsigned long len;
5846 int ret; 5848 int ret;
5847 struct file *file = vma->vm_file; 5849 struct file *file = vma->vm_file;
5848 struct inode *inode = file->f_path.dentry->d_inode; 5850 struct inode *inode = file->f_path.dentry->d_inode;
5849 struct address_space *mapping = inode->i_mapping; 5851 struct address_space *mapping = inode->i_mapping;
5850 handle_t *handle; 5852 handle_t *handle;
5851 get_block_t *get_block; 5853 get_block_t *get_block;
5852 int retries = 0; 5854 int retries = 0;
5853 5855
5854 /* 5856 /*
5855 * This check is racy but catches the common case. We rely on 5857 * This check is racy but catches the common case. We rely on
5856 * __block_page_mkwrite() to do a reliable check. 5858 * __block_page_mkwrite() to do a reliable check.
5857 */ 5859 */
5858 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 5860 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
5859 /* Delalloc case is easy... */ 5861 /* Delalloc case is easy... */
5860 if (test_opt(inode->i_sb, DELALLOC) && 5862 if (test_opt(inode->i_sb, DELALLOC) &&
5861 !ext4_should_journal_data(inode) && 5863 !ext4_should_journal_data(inode) &&
5862 !ext4_nonda_switch(inode->i_sb)) { 5864 !ext4_nonda_switch(inode->i_sb)) {
5863 do { 5865 do {
5864 ret = __block_page_mkwrite(vma, vmf, 5866 ret = __block_page_mkwrite(vma, vmf,
5865 ext4_da_get_block_prep); 5867 ext4_da_get_block_prep);
5866 } while (ret == -ENOSPC && 5868 } while (ret == -ENOSPC &&
5867 ext4_should_retry_alloc(inode->i_sb, &retries)); 5869 ext4_should_retry_alloc(inode->i_sb, &retries));
5868 goto out_ret; 5870 goto out_ret;
5869 } 5871 }
5870 5872
5871 lock_page(page); 5873 lock_page(page);
5872 size = i_size_read(inode); 5874 size = i_size_read(inode);
5873 /* Page got truncated from under us? */ 5875 /* Page got truncated from under us? */
5874 if (page->mapping != mapping || page_offset(page) > size) { 5876 if (page->mapping != mapping || page_offset(page) > size) {
5875 unlock_page(page); 5877 unlock_page(page);
5876 ret = VM_FAULT_NOPAGE; 5878 ret = VM_FAULT_NOPAGE;
5877 goto out; 5879 goto out;
5878 } 5880 }
5879 5881
5880 if (page->index == size >> PAGE_CACHE_SHIFT) 5882 if (page->index == size >> PAGE_CACHE_SHIFT)
5881 len = size & ~PAGE_CACHE_MASK; 5883 len = size & ~PAGE_CACHE_MASK;
5882 else 5884 else
5883 len = PAGE_CACHE_SIZE; 5885 len = PAGE_CACHE_SIZE;
5884 /* 5886 /*
5885 * Return if we have all the buffers mapped. This avoids the need to do 5887 * Return if we have all the buffers mapped. This avoids the need to do
5886 * journal_start/journal_stop which can block and take a long time 5888 * journal_start/journal_stop which can block and take a long time
5887 */ 5889 */
5888 if (page_has_buffers(page)) { 5890 if (page_has_buffers(page)) {
5889 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5891 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5890 ext4_bh_unmapped)) { 5892 ext4_bh_unmapped)) {
5891 /* Wait so that we don't change page under IO */ 5893 /* Wait so that we don't change page under IO */
5892 wait_on_page_writeback(page); 5894 wait_on_page_writeback(page);
5893 ret = VM_FAULT_LOCKED; 5895 ret = VM_FAULT_LOCKED;
5894 goto out; 5896 goto out;
5895 } 5897 }
5896 } 5898 }
5897 unlock_page(page); 5899 unlock_page(page);
5898 /* OK, we need to fill the hole... */ 5900 /* OK, we need to fill the hole... */
5899 if (ext4_should_dioread_nolock(inode)) 5901 if (ext4_should_dioread_nolock(inode))
5900 get_block = ext4_get_block_write; 5902 get_block = ext4_get_block_write;
5901 else 5903 else
5902 get_block = ext4_get_block; 5904 get_block = ext4_get_block;
5903 retry_alloc: 5905 retry_alloc:
5904 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 5906 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
5905 if (IS_ERR(handle)) { 5907 if (IS_ERR(handle)) {
5906 ret = VM_FAULT_SIGBUS; 5908 ret = VM_FAULT_SIGBUS;
5907 goto out; 5909 goto out;
5908 } 5910 }
5909 ret = __block_page_mkwrite(vma, vmf, get_block); 5911 ret = __block_page_mkwrite(vma, vmf, get_block);
5910 if (!ret && ext4_should_journal_data(inode)) { 5912 if (!ret && ext4_should_journal_data(inode)) {
5911 if (walk_page_buffers(handle, page_buffers(page), 0, 5913 if (walk_page_buffers(handle, page_buffers(page), 0,
5912 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 5914 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
5913 unlock_page(page); 5915 unlock_page(page);
5914 ret = VM_FAULT_SIGBUS; 5916 ret = VM_FAULT_SIGBUS;
5915 goto out; 5917 goto out;
5916 } 5918 }
5917 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 5919 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
5918 } 5920 }
5919 ext4_journal_stop(handle); 5921 ext4_journal_stop(handle);
5920 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 5922 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
5921 goto retry_alloc; 5923 goto retry_alloc;
5922 out_ret: 5924 out_ret:
5923 ret = block_page_mkwrite_return(ret); 5925 ret = block_page_mkwrite_return(ret);
5924 out: 5926 out:
5925 return ret; 5927 return ret;
5926 } 5928 }
5927 5929
1 /* 1 /*
2 * linux/fs/fat/file.c 2 * linux/fs/fat/file.c
3 * 3 *
4 * Written 1992,1993 by Werner Almesberger 4 * Written 1992,1993 by Werner Almesberger
5 * 5 *
6 * regular file handling primitives for fat-based filesystems 6 * regular file handling primitives for fat-based filesystems
7 */ 7 */
8 8
9 #include <linux/capability.h> 9 #include <linux/capability.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/compat.h> 11 #include <linux/compat.h>
12 #include <linux/mount.h> 12 #include <linux/mount.h>
13 #include <linux/time.h> 13 #include <linux/time.h>
14 #include <linux/buffer_head.h> 14 #include <linux/buffer_head.h>
15 #include <linux/writeback.h> 15 #include <linux/writeback.h>
16 #include <linux/backing-dev.h> 16 #include <linux/backing-dev.h>
17 #include <linux/blkdev.h> 17 #include <linux/blkdev.h>
18 #include <linux/fsnotify.h> 18 #include <linux/fsnotify.h>
19 #include <linux/security.h> 19 #include <linux/security.h>
20 #include "fat.h" 20 #include "fat.h"
21 21
22 static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) 22 static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
23 { 23 {
24 u32 attr; 24 u32 attr;
25 25
26 mutex_lock(&inode->i_mutex); 26 mutex_lock(&inode->i_mutex);
27 attr = fat_make_attrs(inode); 27 attr = fat_make_attrs(inode);
28 mutex_unlock(&inode->i_mutex); 28 mutex_unlock(&inode->i_mutex);
29 29
30 return put_user(attr, user_attr); 30 return put_user(attr, user_attr);
31 } 31 }
32 32
33 static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) 33 static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
34 { 34 {
35 struct inode *inode = file->f_path.dentry->d_inode; 35 struct inode *inode = file->f_path.dentry->d_inode;
36 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 36 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
37 int is_dir = S_ISDIR(inode->i_mode); 37 int is_dir = S_ISDIR(inode->i_mode);
38 u32 attr, oldattr; 38 u32 attr, oldattr;
39 struct iattr ia; 39 struct iattr ia;
40 int err; 40 int err;
41 41
42 err = get_user(attr, user_attr); 42 err = get_user(attr, user_attr);
43 if (err) 43 if (err)
44 goto out; 44 goto out;
45 45
46 mutex_lock(&inode->i_mutex); 46 mutex_lock(&inode->i_mutex);
47 err = mnt_want_write(file->f_path.mnt); 47 err = mnt_want_write(file->f_path.mnt);
48 if (err) 48 if (err)
49 goto out_unlock_inode; 49 goto out_unlock_inode;
50 50
51 /* 51 /*
52 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also 52 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
53 * prevents the user from turning us into a VFAT 53 * prevents the user from turning us into a VFAT
54 * longname entry. Also, we obviously can't set 54 * longname entry. Also, we obviously can't set
55 * any of the NTFS attributes in the high 24 bits. 55 * any of the NTFS attributes in the high 24 bits.
56 */ 56 */
57 attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR); 57 attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
58 /* Merge in ATTR_VOLUME and ATTR_DIR */ 58 /* Merge in ATTR_VOLUME and ATTR_DIR */
59 attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) | 59 attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
60 (is_dir ? ATTR_DIR : 0); 60 (is_dir ? ATTR_DIR : 0);
61 oldattr = fat_make_attrs(inode); 61 oldattr = fat_make_attrs(inode);
62 62
63 /* Equivalent to a chmod() */ 63 /* Equivalent to a chmod() */
64 ia.ia_valid = ATTR_MODE | ATTR_CTIME; 64 ia.ia_valid = ATTR_MODE | ATTR_CTIME;
65 ia.ia_ctime = current_fs_time(inode->i_sb); 65 ia.ia_ctime = current_fs_time(inode->i_sb);
66 if (is_dir) 66 if (is_dir)
67 ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); 67 ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
68 else { 68 else {
69 ia.ia_mode = fat_make_mode(sbi, attr, 69 ia.ia_mode = fat_make_mode(sbi, attr,
70 S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO)); 70 S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
71 } 71 }
72 72
73 /* The root directory has no attributes */ 73 /* The root directory has no attributes */
74 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { 74 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
75 err = -EINVAL; 75 err = -EINVAL;
76 goto out_drop_write; 76 goto out_drop_write;
77 } 77 }
78 78
79 if (sbi->options.sys_immutable && 79 if (sbi->options.sys_immutable &&
80 ((attr | oldattr) & ATTR_SYS) && 80 ((attr | oldattr) & ATTR_SYS) &&
81 !capable(CAP_LINUX_IMMUTABLE)) { 81 !capable(CAP_LINUX_IMMUTABLE)) {
82 err = -EPERM; 82 err = -EPERM;
83 goto out_drop_write; 83 goto out_drop_write;
84 } 84 }
85 85
86 /* 86 /*
87 * The security check is questionable... We single 87 * The security check is questionable... We single
88 * out the RO attribute for checking by the security 88 * out the RO attribute for checking by the security
89 * module, just because it maps to a file mode. 89 * module, just because it maps to a file mode.
90 */ 90 */
91 err = security_inode_setattr(file->f_path.dentry, &ia); 91 err = security_inode_setattr(file->f_path.dentry, &ia);
92 if (err) 92 if (err)
93 goto out_drop_write; 93 goto out_drop_write;
94 94
95 /* This MUST be done before doing anything irreversible... */ 95 /* This MUST be done before doing anything irreversible... */
96 err = fat_setattr(file->f_path.dentry, &ia); 96 err = fat_setattr(file->f_path.dentry, &ia);
97 if (err) 97 if (err)
98 goto out_drop_write; 98 goto out_drop_write;
99 99
100 fsnotify_change(file->f_path.dentry, ia.ia_valid); 100 fsnotify_change(file->f_path.dentry, ia.ia_valid);
101 if (sbi->options.sys_immutable) { 101 if (sbi->options.sys_immutable) {
102 if (attr & ATTR_SYS) 102 if (attr & ATTR_SYS)
103 inode->i_flags |= S_IMMUTABLE; 103 inode->i_flags |= S_IMMUTABLE;
104 else 104 else
105 inode->i_flags &= ~S_IMMUTABLE; 105 inode->i_flags &= ~S_IMMUTABLE;
106 } 106 }
107 107
108 fat_save_attrs(inode, attr); 108 fat_save_attrs(inode, attr);
109 mark_inode_dirty(inode); 109 mark_inode_dirty(inode);
110 out_drop_write: 110 out_drop_write:
111 mnt_drop_write(file->f_path.mnt); 111 mnt_drop_write(file->f_path.mnt);
112 out_unlock_inode: 112 out_unlock_inode:
113 mutex_unlock(&inode->i_mutex); 113 mutex_unlock(&inode->i_mutex);
114 out: 114 out:
115 return err; 115 return err;
116 } 116 }
117 117
118 long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 118 long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
119 { 119 {
120 struct inode *inode = filp->f_path.dentry->d_inode; 120 struct inode *inode = filp->f_path.dentry->d_inode;
121 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
122 122
123 switch (cmd) { 123 switch (cmd) {
124 case FAT_IOCTL_GET_ATTRIBUTES: 124 case FAT_IOCTL_GET_ATTRIBUTES:
125 return fat_ioctl_get_attributes(inode, user_attr); 125 return fat_ioctl_get_attributes(inode, user_attr);
126 case FAT_IOCTL_SET_ATTRIBUTES: 126 case FAT_IOCTL_SET_ATTRIBUTES:
127 return fat_ioctl_set_attributes(filp, user_attr); 127 return fat_ioctl_set_attributes(filp, user_attr);
128 default: 128 default:
129 return -ENOTTY; /* Inappropriate ioctl for device */ 129 return -ENOTTY; /* Inappropriate ioctl for device */
130 } 130 }
131 } 131 }
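
The two ioctls dispatched above are reachable directly from userspace; a minimal sketch (the mount path is hypothetical, while the ioctl numbers and ATTR_RO come from <linux/msdos_fs.h>):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/msdos_fs.h>

int main(void)
{
	__u32 attr;
	int fd = open("/mnt/fat/file", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return 1;
	if (ioctl(fd, FAT_IOCTL_GET_ATTRIBUTES, &attr) == 0) {
		attr |= ATTR_RO;	/* set the DOS read-only bit */
		ioctl(fd, FAT_IOCTL_SET_ATTRIBUTES, &attr);
	}
	close(fd);
	return 0;
}
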
132 132
133 #ifdef CONFIG_COMPAT 133 #ifdef CONFIG_COMPAT
134 static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, 134 static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg) 135 unsigned long arg)
136 136
137 { 137 {
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); 138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139 } 139 }
140 #endif 140 #endif
141 141
142 static int fat_file_release(struct inode *inode, struct file *filp) 142 static int fat_file_release(struct inode *inode, struct file *filp)
143 { 143 {
144 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
145 MSDOS_SB(inode->i_sb)->options.flush) { 145 MSDOS_SB(inode->i_sb)->options.flush) {
146 fat_flush_inodes(inode->i_sb, inode, NULL); 146 fat_flush_inodes(inode->i_sb, inode, NULL);
147 congestion_wait(BLK_RW_ASYNC, HZ/10); 147 congestion_wait(BLK_RW_ASYNC, HZ/10);
148 } 148 }
149 return 0; 149 return 0;
150 } 150 }
151 151
152 int fat_file_fsync(struct file *filp, int datasync) 152 int fat_file_fsync(struct file *filp, int datasync)
153 { 153 {
154 struct inode *inode = filp->f_mapping->host; 154 struct inode *inode = filp->f_mapping->host;
155 int res, err; 155 int res, err;
156 156
157 res = generic_file_fsync(filp, datasync); 157 res = generic_file_fsync(filp, datasync);
158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
159 159
160 return res ? res : err; 160 return res ? res : err;
161 } 161 }
162 162
163 163
164 const struct file_operations fat_file_operations = { 164 const struct file_operations fat_file_operations = {
165 .llseek = generic_file_llseek, 165 .llseek = generic_file_llseek,
166 .read = do_sync_read, 166 .read = do_sync_read,
167 .write = do_sync_write, 167 .write = do_sync_write,
168 .aio_read = generic_file_aio_read, 168 .aio_read = generic_file_aio_read,
169 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
170 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
171 .release = fat_file_release, 171 .release = fat_file_release,
172 .unlocked_ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173 #ifdef CONFIG_COMPAT 173 #ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl, 174 .compat_ioctl = fat_generic_compat_ioctl,
175 #endif 175 #endif
176 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
177 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
178 }; 178 };
179 179
180 static int fat_cont_expand(struct inode *inode, loff_t size) 180 static int fat_cont_expand(struct inode *inode, loff_t size)
181 { 181 {
182 struct address_space *mapping = inode->i_mapping; 182 struct address_space *mapping = inode->i_mapping;
183 loff_t start = inode->i_size, count = size - inode->i_size; 183 loff_t start = inode->i_size, count = size - inode->i_size;
184 int err; 184 int err;
185 185
186 err = generic_cont_expand_simple(inode, size); 186 err = generic_cont_expand_simple(inode, size);
187 if (err) 187 if (err)
188 goto out; 188 goto out;
189 189
190 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 190 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
191 mark_inode_dirty(inode); 191 mark_inode_dirty(inode);
192 if (IS_SYNC(inode)) { 192 if (IS_SYNC(inode)) {
193 int err2; 193 int err2;
194 194
195 /* 195 /*
196 * Open-code the syncing since we don't have a file open to use 196 * Open-code the syncing since we don't have a file open to use
197 * the standard fsync path. 197 * the standard fsync path.
198 */ 198 */
199 err = filemap_fdatawrite_range(mapping, start, 199 err = filemap_fdatawrite_range(mapping, start,
200 start + count - 1); 200 start + count - 1);
201 err2 = sync_mapping_buffers(mapping); 201 err2 = sync_mapping_buffers(mapping);
202 if (!err) 202 if (!err)
203 err = err2; 203 err = err2;
204 err2 = write_inode_now(inode, 1); 204 err2 = write_inode_now(inode, 1);
205 if (!err) 205 if (!err)
206 err = err2; 206 err = err2;
207 if (!err) { 207 if (!err) {
208 err = filemap_fdatawait_range(mapping, start, 208 err = filemap_fdatawait_range(mapping, start,
209 start + count - 1); 209 start + count - 1);
210 } 210 }
211 } 211 }
212 out: 212 out:
213 return err; 213 return err;
214 } 214 }
215 215
216 /* Free all clusters after the skip'th cluster. */ 216 /* Free all clusters after the skip'th cluster. */
217 static int fat_free(struct inode *inode, int skip) 217 static int fat_free(struct inode *inode, int skip)
218 { 218 {
219 struct super_block *sb = inode->i_sb; 219 struct super_block *sb = inode->i_sb;
220 int err, wait, free_start, i_start, i_logstart; 220 int err, wait, free_start, i_start, i_logstart;
221 221
222 if (MSDOS_I(inode)->i_start == 0) 222 if (MSDOS_I(inode)->i_start == 0)
223 return 0; 223 return 0;
224 224
225 fat_cache_inval_inode(inode); 225 fat_cache_inval_inode(inode);
226 226
227 wait = IS_DIRSYNC(inode); 227 wait = IS_DIRSYNC(inode);
228 i_start = free_start = MSDOS_I(inode)->i_start; 228 i_start = free_start = MSDOS_I(inode)->i_start;
229 i_logstart = MSDOS_I(inode)->i_logstart; 229 i_logstart = MSDOS_I(inode)->i_logstart;
230 230
231 /* First, we write the new file size. */ 231 /* First, we write the new file size. */
232 if (!skip) { 232 if (!skip) {
233 MSDOS_I(inode)->i_start = 0; 233 MSDOS_I(inode)->i_start = 0;
234 MSDOS_I(inode)->i_logstart = 0; 234 MSDOS_I(inode)->i_logstart = 0;
235 } 235 }
236 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 236 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
237 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 237 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
238 if (wait) { 238 if (wait) {
239 err = fat_sync_inode(inode); 239 err = fat_sync_inode(inode);
240 if (err) { 240 if (err) {
241 MSDOS_I(inode)->i_start = i_start; 241 MSDOS_I(inode)->i_start = i_start;
242 MSDOS_I(inode)->i_logstart = i_logstart; 242 MSDOS_I(inode)->i_logstart = i_logstart;
243 return err; 243 return err;
244 } 244 }
245 } else 245 } else
246 mark_inode_dirty(inode); 246 mark_inode_dirty(inode);
247 247
248 /* Write a new EOF, and get the remaining cluster chain for freeing. */ 248 /* Write a new EOF, and get the remaining cluster chain for freeing. */
249 if (skip) { 249 if (skip) {
250 struct fat_entry fatent; 250 struct fat_entry fatent;
251 int ret, fclus, dclus; 251 int ret, fclus, dclus;
252 252
253 ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus); 253 ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus);
254 if (ret < 0) 254 if (ret < 0)
255 return ret; 255 return ret;
256 else if (ret == FAT_ENT_EOF) 256 else if (ret == FAT_ENT_EOF)
257 return 0; 257 return 0;
258 258
259 fatent_init(&fatent); 259 fatent_init(&fatent);
260 ret = fat_ent_read(inode, &fatent, dclus); 260 ret = fat_ent_read(inode, &fatent, dclus);
261 if (ret == FAT_ENT_EOF) { 261 if (ret == FAT_ENT_EOF) {
262 fatent_brelse(&fatent); 262 fatent_brelse(&fatent);
263 return 0; 263 return 0;
264 } else if (ret == FAT_ENT_FREE) { 264 } else if (ret == FAT_ENT_FREE) {
265 fat_fs_error(sb, 265 fat_fs_error(sb,
266 "%s: invalid cluster chain (i_pos %lld)", 266 "%s: invalid cluster chain (i_pos %lld)",
267 __func__, MSDOS_I(inode)->i_pos); 267 __func__, MSDOS_I(inode)->i_pos);
268 ret = -EIO; 268 ret = -EIO;
269 } else if (ret > 0) { 269 } else if (ret > 0) {
270 err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait); 270 err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait);
271 if (err) 271 if (err)
272 ret = err; 272 ret = err;
273 } 273 }
274 fatent_brelse(&fatent); 274 fatent_brelse(&fatent);
275 if (ret < 0) 275 if (ret < 0)
276 return ret; 276 return ret;
277 277
278 free_start = ret; 278 free_start = ret;
279 } 279 }
280 inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); 280 inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9);
281 281
282 /* Free the remaining cluster chain. */ 282 /* Free the remaining cluster chain. */
283 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
284 } 284 }
285 285
286 void fat_truncate_blocks(struct inode *inode, loff_t offset) 286 void fat_truncate_blocks(struct inode *inode, loff_t offset)
287 { 287 {
288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
289 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
290 int nr_clusters; 290 int nr_clusters;
291 291
292 /* 292 /*
293 * This protects against truncating a file bigger than it was and 293 * This protects against truncating a file bigger than it was and
294 * then trying to write into the hole. 294 * then trying to write into the hole.
295 */ 295 */
296 if (MSDOS_I(inode)->mmu_private > offset) 296 if (MSDOS_I(inode)->mmu_private > offset)
297 MSDOS_I(inode)->mmu_private = offset; 297 MSDOS_I(inode)->mmu_private = offset;
298 298
299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
300 300
301 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
302 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
303 } 303 }
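
The nr_clusters computation above rounds the new size up to a whole number of clusters before anything past it is freed. A standalone sketch of the same arithmetic, assuming 4 KiB clusters (cluster_bits = 12); the values are illustrative only:

#include <stdio.h>

int main(void)
{
	unsigned int cluster_bits = 12;			/* assumed: 4 KiB clusters */
	unsigned long long cluster_size = 1ULL << cluster_bits;
	unsigned long long offset = 5000;		/* hypothetical new size */
	int nr_clusters = (offset + (cluster_size - 1)) >> cluster_bits;

	/* 5000 bytes still occupy two 4096-byte clusters. */
	printf("keep %d cluster(s)\n", nr_clusters);
	return 0;
}
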
304 304
305 int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 305 int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
306 { 306 {
307 struct inode *inode = dentry->d_inode; 307 struct inode *inode = dentry->d_inode;
308 generic_fillattr(inode, stat); 308 generic_fillattr(inode, stat);
309 stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; 309 stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
310 return 0; 310 return 0;
311 } 311 }
312 EXPORT_SYMBOL_GPL(fat_getattr); 312 EXPORT_SYMBOL_GPL(fat_getattr);
313 313
314 static int fat_sanitize_mode(const struct msdos_sb_info *sbi, 314 static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
315 struct inode *inode, umode_t *mode_ptr) 315 struct inode *inode, umode_t *mode_ptr)
316 { 316 {
317 mode_t mask, perm; 317 mode_t mask, perm;
318 318
319 /* 319 /*
320 * Note: the basic (attr->ia_mode & ~FAT_VALID_MODE) check is 320 * Note: the basic (attr->ia_mode & ~FAT_VALID_MODE) check is
321 * already done by the caller. 321 * already done by the caller.
322 */ 322 */
323 323
324 if (S_ISREG(inode->i_mode)) 324 if (S_ISREG(inode->i_mode))
325 mask = sbi->options.fs_fmask; 325 mask = sbi->options.fs_fmask;
326 else 326 else
327 mask = sbi->options.fs_dmask; 327 mask = sbi->options.fs_dmask;
328 328
329 perm = *mode_ptr & ~(S_IFMT | mask); 329 perm = *mode_ptr & ~(S_IFMT | mask);
330 330
331 /* 331 /*
332 * Of the r and x bits, all (subject to umask) must be present. Of the 332 * Of the r and x bits, all (subject to umask) must be present. Of the
333 * w bits, either all (subject to umask) or none must be present. 333 * w bits, either all (subject to umask) or none must be present.
334 * 334 *
335 * If fat_mode_can_hold_ro(inode) is false, can't change w bits. 335 * If fat_mode_can_hold_ro(inode) is false, can't change w bits.
336 */ 336 */
337 if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) 337 if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
338 return -EPERM; 338 return -EPERM;
339 if (fat_mode_can_hold_ro(inode)) { 339 if (fat_mode_can_hold_ro(inode)) {
340 if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) 340 if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
341 return -EPERM; 341 return -EPERM;
342 } else { 342 } else {
343 if ((perm & S_IWUGO) != (S_IWUGO & ~mask)) 343 if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
344 return -EPERM; 344 return -EPERM;
345 } 345 }
346 346
347 *mode_ptr &= S_IFMT | perm; 347 *mode_ptr &= S_IFMT | perm;
348 348
349 return 0; 349 return 0;
350 } 350 }
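
To make the all-or-none rule above concrete, here is a userspace re-statement of the same checks (S_IRUGO and friends are redefined because they are kernel-only macros). With no umask, FAT can only store "writable by everyone" or "read-only for everyone", so a mode with partial w bits is rejected:

#include <stdio.h>
#include <sys/stat.h>

#define S_IRUGO 0444
#define S_IWUGO 0222
#define S_IXUGO 0111

/* The same checks as fat_sanitize_mode(), minus the inode plumbing. */
static int sanitize(mode_t cur, mode_t req, mode_t mask, int can_hold_ro)
{
	mode_t perm = req & ~(S_IFMT | mask);

	/* r and x bits must match what the inode already has. */
	if ((perm & (S_IRUGO | S_IXUGO)) != (cur & (S_IRUGO | S_IXUGO)))
		return -1;
	if (can_hold_ro) {
		/* w bits: either none, or all of them (subject to umask). */
		if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
			return -1;
	} else {
		if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
			return -1;
	}
	return 0;
}

int main(void)
{
	mode_t mask = 0, cur = 0777;

	/* chmod 0777 -> ok; chmod 0555 -> ok (no w bits at all);
	 * chmod 0755 -> rejected (partial w bits); prints "0 0 -1". */
	printf("%d %d %d\n", sanitize(cur, 0777, mask, 1),
	       sanitize(cur, 0555, mask, 1), sanitize(cur, 0755, mask, 1));
	return 0;
}
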
351 351
352 static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) 352 static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
353 { 353 {
354 mode_t allow_utime = sbi->options.allow_utime; 354 mode_t allow_utime = sbi->options.allow_utime;
355 355
356 if (current_fsuid() != inode->i_uid) { 356 if (current_fsuid() != inode->i_uid) {
357 if (in_group_p(inode->i_gid)) 357 if (in_group_p(inode->i_gid))
358 allow_utime >>= 3; 358 allow_utime >>= 3;
359 if (allow_utime & MAY_WRITE) 359 if (allow_utime & MAY_WRITE)
360 return 1; 360 return 1;
361 } 361 }
362 362
363 /* use a default check */ 363 /* use a default check */
364 return 0; 364 return 0;
365 } 365 }
366 366
367 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 367 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
368 /* valid file mode bits */ 368 /* valid file mode bits */
369 #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 369 #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
370 370
371 int fat_setattr(struct dentry *dentry, struct iattr *attr) 371 int fat_setattr(struct dentry *dentry, struct iattr *attr)
372 { 372 {
373 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 373 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
374 struct inode *inode = dentry->d_inode; 374 struct inode *inode = dentry->d_inode;
375 unsigned int ia_valid; 375 unsigned int ia_valid;
376 int error; 376 int error;
377 377
378 /* Check for setting the inode time. */ 378 /* Check for setting the inode time. */
379 ia_valid = attr->ia_valid; 379 ia_valid = attr->ia_valid;
380 if (ia_valid & TIMES_SET_FLAGS) { 380 if (ia_valid & TIMES_SET_FLAGS) {
381 if (fat_allow_set_time(sbi, inode)) 381 if (fat_allow_set_time(sbi, inode))
382 attr->ia_valid &= ~TIMES_SET_FLAGS; 382 attr->ia_valid &= ~TIMES_SET_FLAGS;
383 } 383 }
384 384
385 error = inode_change_ok(inode, attr); 385 error = inode_change_ok(inode, attr);
386 attr->ia_valid = ia_valid; 386 attr->ia_valid = ia_valid;
387 if (error) { 387 if (error) {
388 if (sbi->options.quiet) 388 if (sbi->options.quiet)
389 error = 0; 389 error = 0;
390 goto out; 390 goto out;
391 } 391 }
392 392
393 /* 393 /*
394 * Expand the file. inode_setattr() updates ->i_size before 394 * Expand the file. inode_setattr() updates ->i_size before
395 * calling ->truncate(), but FAT needs to fill the hole before 395 * calling ->truncate(), but FAT needs to fill the hole before
396 * that. XXX: this is no longer true with the new truncate 396 * that. XXX: this is no longer true with the new truncate
397 * sequence. 397 * sequence.
398 */ 398 */
399 if (attr->ia_valid & ATTR_SIZE) { 399 if (attr->ia_valid & ATTR_SIZE) {
400 inode_dio_wait(inode);
401
400 if (attr->ia_size > inode->i_size) { 402 if (attr->ia_size > inode->i_size) {
401 error = fat_cont_expand(inode, attr->ia_size); 403 error = fat_cont_expand(inode, attr->ia_size);
402 if (error || attr->ia_valid == ATTR_SIZE) 404 if (error || attr->ia_valid == ATTR_SIZE)
403 goto out; 405 goto out;
404 attr->ia_valid &= ~ATTR_SIZE; 406 attr->ia_valid &= ~ATTR_SIZE;
405 } 407 }
406 } 408 }
407 409
408 if (((attr->ia_valid & ATTR_UID) && 410 if (((attr->ia_valid & ATTR_UID) &&
409 (attr->ia_uid != sbi->options.fs_uid)) || 411 (attr->ia_uid != sbi->options.fs_uid)) ||
410 ((attr->ia_valid & ATTR_GID) && 412 ((attr->ia_valid & ATTR_GID) &&
411 (attr->ia_gid != sbi->options.fs_gid)) || 413 (attr->ia_gid != sbi->options.fs_gid)) ||
412 ((attr->ia_valid & ATTR_MODE) && 414 ((attr->ia_valid & ATTR_MODE) &&
413 (attr->ia_mode & ~FAT_VALID_MODE))) 415 (attr->ia_mode & ~FAT_VALID_MODE)))
414 error = -EPERM; 416 error = -EPERM;
415 417
416 if (error) { 418 if (error) {
417 if (sbi->options.quiet) 419 if (sbi->options.quiet)
418 error = 0; 420 error = 0;
419 goto out; 421 goto out;
420 } 422 }
421 423
422 /* 424 /*
423 * We don't return -EPERM here. Yes, strange, but this behavior 425 * We don't return -EPERM here. Yes, strange, but this behavior
424 * is too old to change. 426 * is too old to change.
425 */ 427 */
426 if (attr->ia_valid & ATTR_MODE) { 428 if (attr->ia_valid & ATTR_MODE) {
427 if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) 429 if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
428 attr->ia_valid &= ~ATTR_MODE; 430 attr->ia_valid &= ~ATTR_MODE;
429 } 431 }
430 432
431 if (attr->ia_valid & ATTR_SIZE) { 433 if (attr->ia_valid & ATTR_SIZE) {
432 down_write(&MSDOS_I(inode)->truncate_lock); 434 down_write(&MSDOS_I(inode)->truncate_lock);
433 truncate_setsize(inode, attr->ia_size); 435 truncate_setsize(inode, attr->ia_size);
434 fat_truncate_blocks(inode, attr->ia_size); 436 fat_truncate_blocks(inode, attr->ia_size);
435 up_write(&MSDOS_I(inode)->truncate_lock); 437 up_write(&MSDOS_I(inode)->truncate_lock);
436 } 438 }
437 439
438 setattr_copy(inode, attr); 440 setattr_copy(inode, attr);
439 mark_inode_dirty(inode); 441 mark_inode_dirty(inode);
440 out: 442 out:
441 return error; 443 return error;
442 } 444 }
443 EXPORT_SYMBOL_GPL(fat_setattr); 445 EXPORT_SYMBOL_GPL(fat_setattr);
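
The inode_dio_wait() call added above is the pattern this commit establishes: each ->setattr drains in-flight direct I/O itself, after the lock that prevents new dio references from appearing is already held (for FAT, i_mutex taken by the VFS caller). A minimal sketch of that shape, with a hypothetical example_setattr; this is not code from any filesystem in this commit:

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		/* i_mutex is already held, so no new dio can start;
		 * wait out the requests already in flight. */
		inode_dio_wait(inode);
		truncate_setsize(inode, attr->ia_size);
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
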
444 446
445 const struct inode_operations fat_file_inode_operations = { 447 const struct inode_operations fat_file_inode_operations = {
446 .setattr = fat_setattr, 448 .setattr = fat_setattr,
447 .getattr = fat_getattr, 449 .getattr = fat_getattr,
448 }; 450 };
449 451
1 /* 1 /*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10 #include <linux/spinlock.h> 10 #include <linux/spinlock.h>
11 #include <linux/completion.h> 11 #include <linux/completion.h>
12 #include <linux/buffer_head.h> 12 #include <linux/buffer_head.h>
13 #include <linux/gfs2_ondisk.h> 13 #include <linux/gfs2_ondisk.h>
14 #include <linux/crc32.h> 14 #include <linux/crc32.h>
15 15
16 #include "gfs2.h" 16 #include "gfs2.h"
17 #include "incore.h" 17 #include "incore.h"
18 #include "bmap.h" 18 #include "bmap.h"
19 #include "glock.h" 19 #include "glock.h"
20 #include "inode.h" 20 #include "inode.h"
21 #include "meta_io.h" 21 #include "meta_io.h"
22 #include "quota.h" 22 #include "quota.h"
23 #include "rgrp.h" 23 #include "rgrp.h"
24 #include "super.h" 24 #include "super.h"
25 #include "trans.h" 25 #include "trans.h"
26 #include "dir.h" 26 #include "dir.h"
27 #include "util.h" 27 #include "util.h"
28 #include "trace_gfs2.h" 28 #include "trace_gfs2.h"
29 29
30 /* This doesn't need to be that large, as the maximum number of 64-bit 30 /* This doesn't need to be that large, as the maximum number of 64-bit
31 * pointers in a 4k block is 512, so a __u16 is fine. It saves stack space to 31 * pointers in a 4k block is 512, so a __u16 is fine. It saves stack space to
32 * keep it small. 32 * keep it small.
33 */ 33 */
34 struct metapath { 34 struct metapath {
35 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; 35 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
36 __u16 mp_list[GFS2_MAX_META_HEIGHT]; 36 __u16 mp_list[GFS2_MAX_META_HEIGHT];
37 }; 37 };
38 38
39 typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, 39 typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
40 struct buffer_head *bh, __be64 *top, 40 struct buffer_head *bh, __be64 *top,
41 __be64 *bottom, unsigned int height, 41 __be64 *bottom, unsigned int height,
42 void *data); 42 void *data);
43 43
44 struct strip_mine { 44 struct strip_mine {
45 int sm_first; 45 int sm_first;
46 unsigned int sm_height; 46 unsigned int sm_height;
47 }; 47 };
48 48
49 /** 49 /**
50 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page 50 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
51 * @ip: the inode 51 * @ip: the inode
52 * @dibh: the dinode buffer 52 * @dibh: the dinode buffer
53 * @block: the block number that was allocated 53 * @block: the block number that was allocated
54 * @page: The (optional) page. This is looked up if @page is NULL 54 * @page: The (optional) page. This is looked up if @page is NULL
55 * 55 *
56 * Returns: errno 56 * Returns: errno
57 */ 57 */
58 58
59 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, 59 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
60 u64 block, struct page *page) 60 u64 block, struct page *page)
61 { 61 {
62 struct inode *inode = &ip->i_inode; 62 struct inode *inode = &ip->i_inode;
63 struct buffer_head *bh; 63 struct buffer_head *bh;
64 int release = 0; 64 int release = 0;
65 65
66 if (!page || page->index) { 66 if (!page || page->index) {
67 page = grab_cache_page(inode->i_mapping, 0); 67 page = grab_cache_page(inode->i_mapping, 0);
68 if (!page) 68 if (!page)
69 return -ENOMEM; 69 return -ENOMEM;
70 release = 1; 70 release = 1;
71 } 71 }
72 72
73 if (!PageUptodate(page)) { 73 if (!PageUptodate(page)) {
74 void *kaddr = kmap(page); 74 void *kaddr = kmap(page);
75 u64 dsize = i_size_read(inode); 75 u64 dsize = i_size_read(inode);
76 76
77 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) 77 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
78 dsize = dibh->b_size - sizeof(struct gfs2_dinode); 78 dsize = dibh->b_size - sizeof(struct gfs2_dinode);
79 79
80 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); 80 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
81 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); 81 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
82 kunmap(page); 82 kunmap(page);
83 83
84 SetPageUptodate(page); 84 SetPageUptodate(page);
85 } 85 }
86 86
87 if (!page_has_buffers(page)) 87 if (!page_has_buffers(page))
88 create_empty_buffers(page, 1 << inode->i_blkbits, 88 create_empty_buffers(page, 1 << inode->i_blkbits,
89 (1 << BH_Uptodate)); 89 (1 << BH_Uptodate));
90 90
91 bh = page_buffers(page); 91 bh = page_buffers(page);
92 92
93 if (!buffer_mapped(bh)) 93 if (!buffer_mapped(bh))
94 map_bh(bh, inode->i_sb, block); 94 map_bh(bh, inode->i_sb, block);
95 95
96 set_buffer_uptodate(bh); 96 set_buffer_uptodate(bh);
97 if (!gfs2_is_jdata(ip)) 97 if (!gfs2_is_jdata(ip))
98 mark_buffer_dirty(bh); 98 mark_buffer_dirty(bh);
99 if (!gfs2_is_writeback(ip)) 99 if (!gfs2_is_writeback(ip))
100 gfs2_trans_add_bh(ip->i_gl, bh, 0); 100 gfs2_trans_add_bh(ip->i_gl, bh, 0);
101 101
102 if (release) { 102 if (release) {
103 unlock_page(page); 103 unlock_page(page);
104 page_cache_release(page); 104 page_cache_release(page);
105 } 105 }
106 106
107 return 0; 107 return 0;
108 } 108 }
109 109
110 /** 110 /**
111 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 111 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
112 * @ip: The GFS2 inode to unstuff 112 * @ip: The GFS2 inode to unstuff
113 * @page: The (optional) page. This is looked up if the @page is NULL 113 * @page: The (optional) page. This is looked up if the @page is NULL
114 * 114 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 115 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 116 * that the height can be grown in the traditional way.
117 * 117 *
118 * Returns: errno 118 * Returns: errno
119 */ 119 */
120 120
121 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) 121 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
122 { 122 {
123 struct buffer_head *bh, *dibh; 123 struct buffer_head *bh, *dibh;
124 struct gfs2_dinode *di; 124 struct gfs2_dinode *di;
125 u64 block = 0; 125 u64 block = 0;
126 int isdir = gfs2_is_dir(ip); 126 int isdir = gfs2_is_dir(ip);
127 int error; 127 int error;
128 128
129 down_write(&ip->i_rw_mutex); 129 down_write(&ip->i_rw_mutex);
130 130
131 error = gfs2_meta_inode_buffer(ip, &dibh); 131 error = gfs2_meta_inode_buffer(ip, &dibh);
132 if (error) 132 if (error)
133 goto out; 133 goto out;
134 134
135 if (i_size_read(&ip->i_inode)) { 135 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 137 and write it out to disk */
138 138
139 unsigned int n = 1; 139 unsigned int n = 1;
140 error = gfs2_alloc_block(ip, &block, &n); 140 error = gfs2_alloc_block(ip, &block, &n);
141 if (error) 141 if (error)
142 goto out_brelse; 142 goto out_brelse;
143 if (isdir) { 143 if (isdir) {
144 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); 144 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
145 error = gfs2_dir_get_new_buffer(ip, block, &bh); 145 error = gfs2_dir_get_new_buffer(ip, block, &bh);
146 if (error) 146 if (error)
147 goto out_brelse; 147 goto out_brelse;
148 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), 148 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
149 dibh, sizeof(struct gfs2_dinode)); 149 dibh, sizeof(struct gfs2_dinode));
150 brelse(bh); 150 brelse(bh);
151 } else { 151 } else {
152 error = gfs2_unstuffer_page(ip, dibh, block, page); 152 error = gfs2_unstuffer_page(ip, dibh, block, page);
153 if (error) 153 if (error)
154 goto out_brelse; 154 goto out_brelse;
155 } 155 }
156 } 156 }
157 157
158 /* Set up the pointer to the new block */ 158 /* Set up the pointer to the new block */
159 159
160 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 160 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
161 di = (struct gfs2_dinode *)dibh->b_data; 161 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 163
164 if (i_size_read(&ip->i_inode)) { 164 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 165 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 166 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
168 } 168 }
169 169
170 ip->i_height = 1; 170 ip->i_height = 1;
171 di->di_height = cpu_to_be16(1); 171 di->di_height = cpu_to_be16(1);
172 172
173 out_brelse: 173 out_brelse:
174 brelse(dibh); 174 brelse(dibh);
175 out: 175 out:
176 up_write(&ip->i_rw_mutex); 176 up_write(&ip->i_rw_mutex);
177 return error; 177 return error;
178 } 178 }
179 179
180 180
181 /** 181 /**
182 * find_metapath - Find path through the metadata tree 182 * find_metapath - Find path through the metadata tree
183 * @sdp: The superblock 183 * @sdp: The superblock
184 * @mp: The metapath to return the result in 184 * @mp: The metapath to return the result in
185 * @block: The disk block to look up 185 * @block: The disk block to look up
186 * @height: The pre-calculated height of the metadata tree 186 * @height: The pre-calculated height of the metadata tree
187 * 187 *
188 * This routine returns a struct metapath structure that defines a path 188 * This routine returns a struct metapath structure that defines a path
189 * through the metadata of inode "ip" to get to block "block". 189 * through the metadata of inode "ip" to get to block "block".
190 * 190 *
191 * Example: 191 * Example:
192 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a 192 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
193 * filesystem with a blocksize of 4096. 193 * filesystem with a blocksize of 4096.
194 * 194 *
195 * find_metapath() would return a struct metapath structure set to: 195 * find_metapath() would return a struct metapath structure set to:
196 * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, 196 * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
197 * and mp_list[2] = 165. 197 * and mp_list[2] = 165.
198 * 198 *
199 * That means that in order to get to the block containing the byte at 199 * That means that in order to get to the block containing the byte at
200 * offset 101342453, we would load the indirect block pointed to by pointer 200 * offset 101342453, we would load the indirect block pointed to by pointer
201 * 0 in the dinode. We would then load the indirect block pointed to by 201 * 0 in the dinode. We would then load the indirect block pointed to by
202 * pointer 48 in that indirect block. We would then load the data block 202 * pointer 48 in that indirect block. We would then load the data block
203 * pointed to by pointer 165 in that indirect block. 203 * pointed to by pointer 165 in that indirect block.
204 * 204 *
205 * ---------------------------------------- 205 * ----------------------------------------
206 * | Dinode | | 206 * | Dinode | |
207 * | | 4| 207 * | | 4|
208 * | |0 1 2 3 4 5 9| 208 * | |0 1 2 3 4 5 9|
209 * | | 6| 209 * | | 6|
210 * ---------------------------------------- 210 * ----------------------------------------
211 * | 211 * |
212 * | 212 * |
213 * V 213 * V
214 * ---------------------------------------- 214 * ----------------------------------------
215 * | Indirect Block | 215 * | Indirect Block |
216 * | 5| 216 * | 5|
217 * | 4 4 4 4 4 5 5 1| 217 * | 4 4 4 4 4 5 5 1|
218 * |0 5 6 7 8 9 0 1 2| 218 * |0 5 6 7 8 9 0 1 2|
219 * ---------------------------------------- 219 * ----------------------------------------
220 * | 220 * |
221 * | 221 * |
222 * V 222 * V
223 * ---------------------------------------- 223 * ----------------------------------------
224 * | Indirect Block | 224 * | Indirect Block |
225 * | 1 1 1 1 1 5| 225 * | 1 1 1 1 1 5|
226 * | 6 6 6 6 6 1| 226 * | 6 6 6 6 6 1|
227 * |0 3 4 5 6 7 2| 227 * |0 3 4 5 6 7 2|
228 * ---------------------------------------- 228 * ----------------------------------------
229 * | 229 * |
230 * | 230 * |
231 * V 231 * V
232 * ---------------------------------------- 232 * ----------------------------------------
233 * | Data block containing offset | 233 * | Data block containing offset |
234 * | 101342453 | 234 * | 101342453 |
235 * | | 235 * | |
236 * | | 236 * | |
237 * ---------------------------------------- 237 * ----------------------------------------
238 * 238 *
239 */ 239 */
240 240
241 static void find_metapath(const struct gfs2_sbd *sdp, u64 block, 241 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
242 struct metapath *mp, unsigned int height) 242 struct metapath *mp, unsigned int height)
243 { 243 {
244 unsigned int i; 244 unsigned int i;
245 245
246 for (i = height; i--;) 246 for (i = height; i--;)
247 mp->mp_list[i] = do_div(block, sdp->sd_inptrs); 247 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
248 248
249 } 249 }
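
find_metapath() is just repeated division: do_div() stores the remainder into mp_list[i] and leaves the quotient in block, peeling off the least significant index first. A userspace sketch with assumed values (the real sd_inptrs depends on the block size):

#include <stdio.h>

int main(void)
{
	unsigned long long block = 24741;	/* hypothetical logical block */
	unsigned int inptrs = 509;		/* assumed pointers per indirect block */
	unsigned int height = 3;
	unsigned int mp_list[3];
	unsigned int i;

	for (i = height; i--;) {		/* same shape as find_metapath() */
		mp_list[i] = block % inptrs;	/* do_div(): remainder out... */
		block /= inptrs;		/* ...quotient back into block */
	}

	for (i = 0; i < height; i++)
		printf("mp_list[%u] = %u\n", i, mp_list[i]);
	return 0;
}
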
250 250
251 static inline unsigned int metapath_branch_start(const struct metapath *mp) 251 static inline unsigned int metapath_branch_start(const struct metapath *mp)
252 { 252 {
253 if (mp->mp_list[0] == 0) 253 if (mp->mp_list[0] == 0)
254 return 2; 254 return 2;
255 return 1; 255 return 1;
256 } 256 }
257 257
258 /** 258 /**
259 * metapointer - Return pointer to start of metadata in a buffer 259 * metapointer - Return pointer to start of metadata in a buffer
260 * @height: The metadata height (0 = dinode) 260 * @height: The metadata height (0 = dinode)
261 * @mp: The metapath 261 * @mp: The metapath
262 * 262 *
263 * Return a pointer to the block number of the next height of the metadata 263 * Return a pointer to the block number of the next height of the metadata
264 * tree given a buffer containing the pointer to the current height of the 264 * tree given a buffer containing the pointer to the current height of the
265 * metadata tree. 265 * metadata tree.
266 */ 266 */
267 267
268 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) 268 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
269 { 269 {
270 struct buffer_head *bh = mp->mp_bh[height]; 270 struct buffer_head *bh = mp->mp_bh[height];
271 unsigned int head_size = (height > 0) ? 271 unsigned int head_size = (height > 0) ?
272 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); 272 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
273 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; 273 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
274 } 274 }
275 275
276 /** 276 /**
277 * lookup_metapath - Walk the metadata tree to a specific point 277 * lookup_metapath - Walk the metadata tree to a specific point
278 * @ip: The inode 278 * @ip: The inode
279 * @mp: The metapath 279 * @mp: The metapath
280 * 280 *
281 * Assumes that the inode's buffer has already been looked up and 281 * Assumes that the inode's buffer has already been looked up and
282 * hooked onto mp->mp_bh[0] and that the metapath has been initialised 282 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
283 * by find_metapath(). 283 * by find_metapath().
284 * 284 *
285 * If this function encounters part of the tree which has not been 285 * If this function encounters part of the tree which has not been
286 * allocated, it returns the current height of the tree at the point 286 * allocated, it returns the current height of the tree at the point
287 * at which it found the unallocated block. Blocks which are found are 287 * at which it found the unallocated block. Blocks which are found are
288 * added to the mp->mp_bh[] list. 288 * added to the mp->mp_bh[] list.
289 * 289 *
290 * Returns: error or height of metadata tree 290 * Returns: error or height of metadata tree
291 */ 291 */
292 292
293 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) 293 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
294 { 294 {
295 unsigned int end_of_metadata = ip->i_height - 1; 295 unsigned int end_of_metadata = ip->i_height - 1;
296 unsigned int x; 296 unsigned int x;
297 __be64 *ptr; 297 __be64 *ptr;
298 u64 dblock; 298 u64 dblock;
299 int ret; 299 int ret;
300 300
301 for (x = 0; x < end_of_metadata; x++) { 301 for (x = 0; x < end_of_metadata; x++) {
302 ptr = metapointer(x, mp); 302 ptr = metapointer(x, mp);
303 dblock = be64_to_cpu(*ptr); 303 dblock = be64_to_cpu(*ptr);
304 if (!dblock) 304 if (!dblock)
305 return x + 1; 305 return x + 1;
306 306
307 ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]); 307 ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]);
308 if (ret) 308 if (ret)
309 return ret; 309 return ret;
310 } 310 }
311 311
312 return ip->i_height; 312 return ip->i_height;
313 } 313 }
314 314
315 static inline void release_metapath(struct metapath *mp) 315 static inline void release_metapath(struct metapath *mp)
316 { 316 {
317 int i; 317 int i;
318 318
319 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { 319 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
320 if (mp->mp_bh[i] == NULL) 320 if (mp->mp_bh[i] == NULL)
321 break; 321 break;
322 brelse(mp->mp_bh[i]); 322 brelse(mp->mp_bh[i]);
323 } 323 }
324 } 324 }
325 325
326 /** 326 /**
327 * gfs2_extent_length - Returns length of an extent of blocks 327 * gfs2_extent_length - Returns length of an extent of blocks
328 * @start: Start of the buffer 328 * @start: Start of the buffer
329 * @len: Length of the buffer in bytes 329 * @len: Length of the buffer in bytes
330 * @ptr: Current position in the buffer 330 * @ptr: Current position in the buffer
331 * @limit: Max extent length to return (0 = unlimited) 331 * @limit: Max extent length to return (0 = unlimited)
332 * @eob: Set to 1 if we hit "end of block" 332 * @eob: Set to 1 if we hit "end of block"
333 * 333 *
334 * If the first block is zero (unallocated) it will return the number of 334 * If the first block is zero (unallocated) it will return the number of
335 * unallocated blocks in the extent, otherwise it will return the number 335 * unallocated blocks in the extent, otherwise it will return the number
336 * of contiguous blocks in the extent. 336 * of contiguous blocks in the extent.
337 * 337 *
338 * Returns: The length of the extent (minimum of one block) 338 * Returns: The length of the extent (minimum of one block)
339 */ 339 */
340 340
341 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) 341 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
342 { 342 {
343 const __be64 *end = (start + len); 343 const __be64 *end = (start + len);
344 const __be64 *first = ptr; 344 const __be64 *first = ptr;
345 u64 d = be64_to_cpu(*ptr); 345 u64 d = be64_to_cpu(*ptr);
346 346
347 *eob = 0; 347 *eob = 0;
348 do { 348 do {
349 ptr++; 349 ptr++;
350 if (ptr >= end) 350 if (ptr >= end)
351 break; 351 break;
352 if (limit && --limit == 0) 352 if (limit && --limit == 0)
353 break; 353 break;
354 if (d) 354 if (d)
355 d++; 355 d++;
356 } while(be64_to_cpu(*ptr) == d); 356 } while(be64_to_cpu(*ptr) == d);
357 if (ptr >= end) 357 if (ptr >= end)
358 *eob = 1; 358 *eob = 1;
359 return (ptr - first); 359 return (ptr - first);
360 } 360 }
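
Put differently, the run continues while each successive pointer decodes to exactly one more than the previous one, or stays zero for an unallocated run. A userspace sketch of the same counting rule on illustrative pointer values:

#include <stdio.h>

int main(void)
{
	unsigned long long ptrs[] = { 100, 101, 102, 105, 106 };
	unsigned int n = sizeof(ptrs) / sizeof(ptrs[0]);
	unsigned int len = 1;
	unsigned long long d = ptrs[0];

	while (len < n) {
		if (d)
			d++;		/* expect the next contiguous block */
		if (ptrs[len] != d)
			break;		/* run ends at the first gap */
		len++;
	}
	printf("extent length: %u blocks\n", len);	/* prints 3 */
	return 0;
}
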
361 361
362 static inline void bmap_lock(struct gfs2_inode *ip, int create) 362 static inline void bmap_lock(struct gfs2_inode *ip, int create)
363 { 363 {
364 if (create) 364 if (create)
365 down_write(&ip->i_rw_mutex); 365 down_write(&ip->i_rw_mutex);
366 else 366 else
367 down_read(&ip->i_rw_mutex); 367 down_read(&ip->i_rw_mutex);
368 } 368 }
369 369
370 static inline void bmap_unlock(struct gfs2_inode *ip, int create) 370 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
371 { 371 {
372 if (create) 372 if (create)
373 up_write(&ip->i_rw_mutex); 373 up_write(&ip->i_rw_mutex);
374 else 374 else
375 up_read(&ip->i_rw_mutex); 375 up_read(&ip->i_rw_mutex);
376 } 376 }
377 377
378 static inline __be64 *gfs2_indirect_init(struct metapath *mp, 378 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
379 struct gfs2_glock *gl, unsigned int i, 379 struct gfs2_glock *gl, unsigned int i,
380 unsigned offset, u64 bn) 380 unsigned offset, u64 bn)
381 { 381 {
382 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + 382 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
383 ((i > 1) ? sizeof(struct gfs2_meta_header) : 383 ((i > 1) ? sizeof(struct gfs2_meta_header) :
384 sizeof(struct gfs2_dinode))); 384 sizeof(struct gfs2_dinode)));
385 BUG_ON(i < 1); 385 BUG_ON(i < 1);
386 BUG_ON(mp->mp_bh[i] != NULL); 386 BUG_ON(mp->mp_bh[i] != NULL);
387 mp->mp_bh[i] = gfs2_meta_new(gl, bn); 387 mp->mp_bh[i] = gfs2_meta_new(gl, bn);
388 gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); 388 gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
389 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); 389 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
390 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); 390 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
391 ptr += offset; 391 ptr += offset;
392 *ptr = cpu_to_be64(bn); 392 *ptr = cpu_to_be64(bn);
393 return ptr; 393 return ptr;
394 } 394 }
395 395
396 enum alloc_state { 396 enum alloc_state {
397 ALLOC_DATA = 0, 397 ALLOC_DATA = 0,
398 ALLOC_GROW_DEPTH = 1, 398 ALLOC_GROW_DEPTH = 1,
399 ALLOC_GROW_HEIGHT = 2, 399 ALLOC_GROW_HEIGHT = 2,
400 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ 400 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */
401 }; 401 };
402 402
403 /** 403 /**
404 * gfs2_bmap_alloc - Build a metadata tree of the requested height 404 * gfs2_bmap_alloc - Build a metadata tree of the requested height
405 * @inode: The GFS2 inode 405 * @inode: The GFS2 inode
406 * @lblock: The logical starting block of the extent 406 * @lblock: The logical starting block of the extent
407 * @bh_map: This is used to return the mapping details 407 * @bh_map: This is used to return the mapping details
408 * @mp: The metapath 408 * @mp: The metapath
409 * @sheight: The starting height (i.e. what's already mapped) 409 * @sheight: The starting height (i.e. what's already mapped)
410 * @height: The height to build to 410 * @height: The height to build to
411 * @maxlen: The max number of data blocks to alloc 411 * @maxlen: The max number of data blocks to alloc
412 * 412 *
413 * In this routine we may have to alloc: 413 * In this routine we may have to alloc:
414 * i) Indirect blocks to grow the metadata tree height 414 * i) Indirect blocks to grow the metadata tree height
415 * ii) Indirect blocks to fill in lower part of the metadata tree 415 * ii) Indirect blocks to fill in lower part of the metadata tree
416 * iii) Data blocks 416 * iii) Data blocks
417 * 417 *
418 * The function is in two parts. The first part works out the total 418 * The function is in two parts. The first part works out the total
419 * number of blocks which we need. The second part does the actual 419 * number of blocks which we need. The second part does the actual
420 * allocation asking for an extent at a time (if enough contiguous free 420 * allocation asking for an extent at a time (if enough contiguous free
421 * blocks are available, there will only be one request per bmap call) 421 * blocks are available, there will only be one request per bmap call)
422 * and uses the state machine to initialise the blocks in order. 422 * and uses the state machine to initialise the blocks in order.
423 * 423 *
424 * Returns: errno on error 424 * Returns: errno on error
425 */ 425 */
426 426
427 static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, 427 static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
428 struct buffer_head *bh_map, struct metapath *mp, 428 struct buffer_head *bh_map, struct metapath *mp,
429 const unsigned int sheight, 429 const unsigned int sheight,
430 const unsigned int height, 430 const unsigned int height,
431 const unsigned int maxlen) 431 const unsigned int maxlen)
432 { 432 {
433 struct gfs2_inode *ip = GFS2_I(inode); 433 struct gfs2_inode *ip = GFS2_I(inode);
434 struct gfs2_sbd *sdp = GFS2_SB(inode); 434 struct gfs2_sbd *sdp = GFS2_SB(inode);
435 struct buffer_head *dibh = mp->mp_bh[0]; 435 struct buffer_head *dibh = mp->mp_bh[0];
436 u64 bn, dblock = 0; 436 u64 bn, dblock = 0;
437 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; 437 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
438 unsigned dblks = 0; 438 unsigned dblks = 0;
439 unsigned ptrs_per_blk; 439 unsigned ptrs_per_blk;
440 const unsigned end_of_metadata = height - 1; 440 const unsigned end_of_metadata = height - 1;
441 int eob = 0; 441 int eob = 0;
442 enum alloc_state state; 442 enum alloc_state state;
443 __be64 *ptr; 443 __be64 *ptr;
444 __be64 zero_bn = 0; 444 __be64 zero_bn = 0;
445 445
446 BUG_ON(sheight < 1); 446 BUG_ON(sheight < 1);
447 BUG_ON(dibh == NULL); 447 BUG_ON(dibh == NULL);
448 448
449 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 449 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
450 450
451 if (height == sheight) { 451 if (height == sheight) {
452 struct buffer_head *bh; 452 struct buffer_head *bh;
453 /* Bottom indirect block exists, find unalloced extent size */ 453 /* Bottom indirect block exists, find unalloced extent size */
454 ptr = metapointer(end_of_metadata, mp); 454 ptr = metapointer(end_of_metadata, mp);
455 bh = mp->mp_bh[end_of_metadata]; 455 bh = mp->mp_bh[end_of_metadata];
456 dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, 456 dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
457 &eob); 457 &eob);
458 BUG_ON(dblks < 1); 458 BUG_ON(dblks < 1);
459 state = ALLOC_DATA; 459 state = ALLOC_DATA;
460 } else { 460 } else {
461 /* Need to allocate indirect blocks */ 461 /* Need to allocate indirect blocks */
462 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; 462 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
463 dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); 463 dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
464 if (height == ip->i_height) { 464 if (height == ip->i_height) {
465 /* Writing into existing tree, extend tree down */ 465 /* Writing into existing tree, extend tree down */
466 iblks = height - sheight; 466 iblks = height - sheight;
467 state = ALLOC_GROW_DEPTH; 467 state = ALLOC_GROW_DEPTH;
468 } else { 468 } else {
469 /* Building up tree height */ 469 /* Building up tree height */
470 state = ALLOC_GROW_HEIGHT; 470 state = ALLOC_GROW_HEIGHT;
471 iblks = height - ip->i_height; 471 iblks = height - ip->i_height;
472 branch_start = metapath_branch_start(mp); 472 branch_start = metapath_branch_start(mp);
473 iblks += (height - branch_start); 473 iblks += (height - branch_start);
474 } 474 }
475 } 475 }
476 476
477 /* start of the second part of the function (state machine) */ 477 /* start of the second part of the function (state machine) */
478 478
479 blks = dblks + iblks; 479 blks = dblks + iblks;
480 i = sheight; 480 i = sheight;
481 do { 481 do {
482 int error; 482 int error;
483 n = blks - alloced; 483 n = blks - alloced;
484 error = gfs2_alloc_block(ip, &bn, &n); 484 error = gfs2_alloc_block(ip, &bn, &n);
485 if (error) 485 if (error)
486 return error; 486 return error;
487 alloced += n; 487 alloced += n;
488 if (state != ALLOC_DATA || gfs2_is_jdata(ip)) 488 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
489 gfs2_trans_add_unrevoke(sdp, bn, n); 489 gfs2_trans_add_unrevoke(sdp, bn, n);
490 switch (state) { 490 switch (state) {
491 /* Growing height of tree */ 491 /* Growing height of tree */
492 case ALLOC_GROW_HEIGHT: 492 case ALLOC_GROW_HEIGHT:
493 if (i == 1) { 493 if (i == 1) {
494 ptr = (__be64 *)(dibh->b_data + 494 ptr = (__be64 *)(dibh->b_data +
495 sizeof(struct gfs2_dinode)); 495 sizeof(struct gfs2_dinode));
496 zero_bn = *ptr; 496 zero_bn = *ptr;
497 } 497 }
498 for (; i - 1 < height - ip->i_height && n > 0; i++, n--) 498 for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
499 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); 499 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
500 if (i - 1 == height - ip->i_height) { 500 if (i - 1 == height - ip->i_height) {
501 i--; 501 i--;
502 gfs2_buffer_copy_tail(mp->mp_bh[i], 502 gfs2_buffer_copy_tail(mp->mp_bh[i],
503 sizeof(struct gfs2_meta_header), 503 sizeof(struct gfs2_meta_header),
504 dibh, sizeof(struct gfs2_dinode)); 504 dibh, sizeof(struct gfs2_dinode));
505 gfs2_buffer_clear_tail(dibh, 505 gfs2_buffer_clear_tail(dibh,
506 sizeof(struct gfs2_dinode) + 506 sizeof(struct gfs2_dinode) +
507 sizeof(__be64)); 507 sizeof(__be64));
508 ptr = (__be64 *)(mp->mp_bh[i]->b_data + 508 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
509 sizeof(struct gfs2_meta_header)); 509 sizeof(struct gfs2_meta_header));
510 *ptr = zero_bn; 510 *ptr = zero_bn;
511 state = ALLOC_GROW_DEPTH; 511 state = ALLOC_GROW_DEPTH;
512 for(i = branch_start; i < height; i++) { 512 for(i = branch_start; i < height; i++) {
513 if (mp->mp_bh[i] == NULL) 513 if (mp->mp_bh[i] == NULL)
514 break; 514 break;
515 brelse(mp->mp_bh[i]); 515 brelse(mp->mp_bh[i]);
516 mp->mp_bh[i] = NULL; 516 mp->mp_bh[i] = NULL;
517 } 517 }
518 i = branch_start; 518 i = branch_start;
519 } 519 }
520 if (n == 0) 520 if (n == 0)
521 break; 521 break;
522 /* Branching from existing tree */ 522 /* Branching from existing tree */
523 case ALLOC_GROW_DEPTH: 523 case ALLOC_GROW_DEPTH:
524 if (i > 1 && i < height) 524 if (i > 1 && i < height)
525 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); 525 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
526 for (; i < height && n > 0; i++, n--) 526 for (; i < height && n > 0; i++, n--)
527 gfs2_indirect_init(mp, ip->i_gl, i, 527 gfs2_indirect_init(mp, ip->i_gl, i,
528 mp->mp_list[i-1], bn++); 528 mp->mp_list[i-1], bn++);
529 if (i == height) 529 if (i == height)
530 state = ALLOC_DATA; 530 state = ALLOC_DATA;
531 if (n == 0) 531 if (n == 0)
532 break; 532 break;
533 /* Tree complete, adding data blocks */ 533 /* Tree complete, adding data blocks */
534 case ALLOC_DATA: 534 case ALLOC_DATA:
535 BUG_ON(n > dblks); 535 BUG_ON(n > dblks);
536 BUG_ON(mp->mp_bh[end_of_metadata] == NULL); 536 BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
537 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); 537 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
538 dblks = n; 538 dblks = n;
539 ptr = metapointer(end_of_metadata, mp); 539 ptr = metapointer(end_of_metadata, mp);
540 dblock = bn; 540 dblock = bn;
541 while (n-- > 0) 541 while (n-- > 0)
542 *ptr++ = cpu_to_be64(bn++); 542 *ptr++ = cpu_to_be64(bn++);
543 break; 543 break;
544 } 544 }
545 } while ((state != ALLOC_DATA) || !dblock); 545 } while ((state != ALLOC_DATA) || !dblock);
546 546
547 ip->i_height = height; 547 ip->i_height = height;
548 gfs2_add_inode_blocks(&ip->i_inode, alloced); 548 gfs2_add_inode_blocks(&ip->i_inode, alloced);
549 gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); 549 gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
550 map_bh(bh_map, inode->i_sb, dblock); 550 map_bh(bh_map, inode->i_sb, dblock);
551 bh_map->b_size = dblks << inode->i_blkbits; 551 bh_map->b_size = dblks << inode->i_blkbits;
552 set_buffer_new(bh_map); 552 set_buffer_new(bh_map);
553 return 0; 553 return 0;
554 } 554 }
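
The switch in the loop above depends on deliberate fall-through: a single allocated extent may enter at ALLOC_GROW_HEIGHT and continue straight through ALLOC_GROW_DEPTH into ALLOC_DATA. A toy userspace reduction of that control flow (the block counts are made up):

#include <stdio.h>

enum alloc_state { ALLOC_DATA = 0, ALLOC_GROW_DEPTH = 1, ALLOC_GROW_HEIGHT = 2 };

int main(void)
{
	enum alloc_state state = ALLOC_GROW_HEIGHT;
	int n = 5;	/* blocks granted in this (single) extent */

	switch (state) {
	case ALLOC_GROW_HEIGHT:
		printf("consume 2 blocks growing height\n");
		n -= 2;
		state = ALLOC_GROW_DEPTH;
		if (n == 0)
			break;
		/* fall through */
	case ALLOC_GROW_DEPTH:
		printf("consume 1 block growing depth\n");
		n -= 1;
		state = ALLOC_DATA;
		if (n == 0)
			break;
		/* fall through */
	case ALLOC_DATA:
		printf("map %d data block(s)\n", n);
		break;
	}
	return 0;
}
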
555 555
556 /** 556 /**
557 * gfs2_block_map - Map a block from an inode to a disk block 557 * gfs2_block_map - Map a block from an inode to a disk block
558 * @inode: The inode 558 * @inode: The inode
559 * @lblock: The logical block number 559 * @lblock: The logical block number
560 * @bh_map: The bh to be mapped 560 * @bh_map: The bh to be mapped
561 * @create: True if it's ok to alloc blocks to satisfy the request 561 * @create: True if it's ok to alloc blocks to satisfy the request
562 * 562 *
563 * Sets buffer_mapped() if successful, sets buffer_boundary() if a 563 * Sets buffer_mapped() if successful, sets buffer_boundary() if a
564 * read of metadata will be required before the next block can be 564 * read of metadata will be required before the next block can be
565 * mapped. Sets buffer_new() if new blocks were allocated. 565 * mapped. Sets buffer_new() if new blocks were allocated.
566 * 566 *
567 * Returns: errno 567 * Returns: errno
568 */ 568 */
569 569
570 int gfs2_block_map(struct inode *inode, sector_t lblock, 570 int gfs2_block_map(struct inode *inode, sector_t lblock,
571 struct buffer_head *bh_map, int create) 571 struct buffer_head *bh_map, int create)
572 { 572 {
573 struct gfs2_inode *ip = GFS2_I(inode); 573 struct gfs2_inode *ip = GFS2_I(inode);
574 struct gfs2_sbd *sdp = GFS2_SB(inode); 574 struct gfs2_sbd *sdp = GFS2_SB(inode);
575 unsigned int bsize = sdp->sd_sb.sb_bsize; 575 unsigned int bsize = sdp->sd_sb.sb_bsize;
576 const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; 576 const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
577 const u64 *arr = sdp->sd_heightsize; 577 const u64 *arr = sdp->sd_heightsize;
578 __be64 *ptr; 578 __be64 *ptr;
579 u64 size; 579 u64 size;
580 struct metapath mp; 580 struct metapath mp;
581 int ret; 581 int ret;
582 int eob; 582 int eob;
583 unsigned int len; 583 unsigned int len;
584 struct buffer_head *bh; 584 struct buffer_head *bh;
585 u8 height; 585 u8 height;
586 586
587 BUG_ON(maxlen == 0); 587 BUG_ON(maxlen == 0);
588 588
589 memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); 589 memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
590 bmap_lock(ip, create); 590 bmap_lock(ip, create);
591 clear_buffer_mapped(bh_map); 591 clear_buffer_mapped(bh_map);
592 clear_buffer_new(bh_map); 592 clear_buffer_new(bh_map);
593 clear_buffer_boundary(bh_map); 593 clear_buffer_boundary(bh_map);
594 trace_gfs2_bmap(ip, bh_map, lblock, create, 1); 594 trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
595 if (gfs2_is_dir(ip)) { 595 if (gfs2_is_dir(ip)) {
596 bsize = sdp->sd_jbsize; 596 bsize = sdp->sd_jbsize;
597 arr = sdp->sd_jheightsize; 597 arr = sdp->sd_jheightsize;
598 } 598 }
599 599
600 ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); 600 ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
601 if (ret) 601 if (ret)
602 goto out; 602 goto out;
603 603
604 height = ip->i_height; 604 height = ip->i_height;
605 size = (lblock + 1) * bsize; 605 size = (lblock + 1) * bsize;
606 while (size > arr[height]) 606 while (size > arr[height])
607 height++; 607 height++;
608 find_metapath(sdp, lblock, &mp, height); 608 find_metapath(sdp, lblock, &mp, height);
609 ret = 1; 609 ret = 1;
610 if (height > ip->i_height || gfs2_is_stuffed(ip)) 610 if (height > ip->i_height || gfs2_is_stuffed(ip))
611 goto do_alloc; 611 goto do_alloc;
612 ret = lookup_metapath(ip, &mp); 612 ret = lookup_metapath(ip, &mp);
613 if (ret < 0) 613 if (ret < 0)
614 goto out; 614 goto out;
615 if (ret != ip->i_height) 615 if (ret != ip->i_height)
616 goto do_alloc; 616 goto do_alloc;
617 ptr = metapointer(ip->i_height - 1, &mp); 617 ptr = metapointer(ip->i_height - 1, &mp);
618 if (*ptr == 0) 618 if (*ptr == 0)
619 goto do_alloc; 619 goto do_alloc;
620 map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); 620 map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
621 bh = mp.mp_bh[ip->i_height - 1]; 621 bh = mp.mp_bh[ip->i_height - 1];
622 len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); 622 len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
623 bh_map->b_size = (len << inode->i_blkbits); 623 bh_map->b_size = (len << inode->i_blkbits);
624 if (eob) 624 if (eob)
625 set_buffer_boundary(bh_map); 625 set_buffer_boundary(bh_map);
626 ret = 0; 626 ret = 0;
627 out: 627 out:
628 release_metapath(&mp); 628 release_metapath(&mp);
629 trace_gfs2_bmap(ip, bh_map, lblock, create, ret); 629 trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
630 bmap_unlock(ip, create); 630 bmap_unlock(ip, create);
631 return ret; 631 return ret;
632 632
633 do_alloc: 633 do_alloc:
634 /* All allocations are done here; first check the create flag */ 634 /* All allocations are done here; first check the create flag */
635 if (!create) { 635 if (!create) {
636 BUG_ON(gfs2_is_stuffed(ip)); 636 BUG_ON(gfs2_is_stuffed(ip));
637 ret = 0; 637 ret = 0;
638 goto out; 638 goto out;
639 } 639 }
640 640
641 /* At this point ret is the tree depth of already allocated blocks */ 641 /* At this point ret is the tree depth of already allocated blocks */
642 ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen); 642 ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
643 goto out; 643 goto out;
644 } 644 }
645 645
646 /* 646 /*
647 * Deprecated: do not use in new code 647 * Deprecated: do not use in new code
648 */ 648 */
649 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) 649 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
650 { 650 {
651 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; 651 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
652 int ret; 652 int ret;
653 int create = *new; 653 int create = *new;
654 654
655 BUG_ON(!extlen); 655 BUG_ON(!extlen);
656 BUG_ON(!dblock); 656 BUG_ON(!dblock);
657 BUG_ON(!new); 657 BUG_ON(!new);
658 658
659 bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); 659 bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
660 ret = gfs2_block_map(inode, lblock, &bh, create); 660 ret = gfs2_block_map(inode, lblock, &bh, create);
661 *extlen = bh.b_size >> inode->i_blkbits; 661 *extlen = bh.b_size >> inode->i_blkbits;
662 *dblock = bh.b_blocknr; 662 *dblock = bh.b_blocknr;
663 if (buffer_new(&bh)) 663 if (buffer_new(&bh))
664 *new = 1; 664 *new = 1;
665 else 665 else
666 *new = 0; 666 *new = 0;
667 return ret; 667 return ret;
668 } 668 }
669 669
670 /** 670 /**
671 * recursive_scan - recursively scan through the end of a file 671 * recursive_scan - recursively scan through the end of a file
672 * @ip: the inode 672 * @ip: the inode
673 * @dibh: the dinode buffer 673 * @dibh: the dinode buffer
674 * @mp: the path through the metadata to the point to start 674 * @mp: the path through the metadata to the point to start
675 * @height: the height the recursion is at 675 * @height: the height the recursion is at
676 * @block: the indirect block to look at 676 * @block: the indirect block to look at
677 * @first: 1 if this is the first block 677 * @first: 1 if this is the first block
678 * @bc: the call to make for each piece of metadata 678 * @bc: the call to make for each piece of metadata
679 * @data: data opaque to this function to pass to @bc 679 * @data: data opaque to this function to pass to @bc
680 * 680 *
681 * When this is first called @height and @block should be zero and 681 * When this is first called @height and @block should be zero and
682 * @first should be 1. 682 * @first should be 1.
683 * 683 *
684 * Returns: errno 684 * Returns: errno
685 */ 685 */
686 686
687 static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, 687 static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
688 struct metapath *mp, unsigned int height, 688 struct metapath *mp, unsigned int height,
689 u64 block, int first, block_call_t bc, 689 u64 block, int first, block_call_t bc,
690 void *data) 690 void *data)
691 { 691 {
692 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 692 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
693 struct buffer_head *bh = NULL; 693 struct buffer_head *bh = NULL;
694 __be64 *top, *bottom; 694 __be64 *top, *bottom;
695 u64 bn; 695 u64 bn;
696 int error; 696 int error;
697 int mh_size = sizeof(struct gfs2_meta_header); 697 int mh_size = sizeof(struct gfs2_meta_header);
698 698
699 if (!height) { 699 if (!height) {
700 error = gfs2_meta_inode_buffer(ip, &bh); 700 error = gfs2_meta_inode_buffer(ip, &bh);
701 if (error) 701 if (error)
702 return error; 702 return error;
703 dibh = bh; 703 dibh = bh;
704 704
705 top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; 705 top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
706 bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; 706 bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
707 } else { 707 } else {
708 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); 708 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
709 if (error) 709 if (error)
710 return error; 710 return error;
711 711
712 top = (__be64 *)(bh->b_data + mh_size) + 712 top = (__be64 *)(bh->b_data + mh_size) +
713 (first ? mp->mp_list[height] : 0); 713 (first ? mp->mp_list[height] : 0);
714 714
715 bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; 715 bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
716 } 716 }
717 717
718 error = bc(ip, dibh, bh, top, bottom, height, data); 718 error = bc(ip, dibh, bh, top, bottom, height, data);
719 if (error) 719 if (error)
720 goto out; 720 goto out;
721 721
722 if (height < ip->i_height - 1) 722 if (height < ip->i_height - 1)
723 for (; top < bottom; top++, first = 0) { 723 for (; top < bottom; top++, first = 0) {
724 if (!*top) 724 if (!*top)
725 continue; 725 continue;
726 726
727 bn = be64_to_cpu(*top); 727 bn = be64_to_cpu(*top);
728 728
729 error = recursive_scan(ip, dibh, mp, height + 1, bn, 729 error = recursive_scan(ip, dibh, mp, height + 1, bn,
730 first, bc, data); 730 first, bc, data);
731 if (error) 731 if (error)
732 break; 732 break;
733 } 733 }
734 734
735 out: 735 out:
736 brelse(bh); 736 brelse(bh);
737 return error; 737 return error;
738 } 738 }
739 739
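The shape of recursive_scan() is easier to see without the buffer-head and metapath plumbing. Below is a minimal standalone C model of the same walk (everything here is invented for illustration: the four-pointer blocks, the tiny in-memory "disk", and the names): visit each block of pointers, hand it to the callback, skip holes, and recurse only while above the bottom layer, just as the height check in the function does.

#include <stdint.h>
#include <stdio.h>

#define PTRS 4

/* tiny in-memory "disk": block 0 is the root of a height-2 tree */
static uint64_t blk[16][PTRS] = {
        [0] = { 1, 0, 2, 0 },           /* root: two children, two holes */
        [1] = { 10, 11, 0, 0 },         /* bottom-layer pointer blocks   */
        [2] = { 0, 12, 0, 0 },
};

typedef int (*block_cb)(uint64_t *top, uint64_t *bottom,
                        unsigned int height, void *data);

static int scan(uint64_t bn, unsigned int height, unsigned int tree_height,
                block_cb cb, void *data)
{
        uint64_t *top = blk[bn], *bottom = blk[bn] + PTRS;
        int error = cb(top, bottom, height, data);

        if (error || height >= tree_height - 1)
                return error;           /* bottom layer: do not recurse */

        for (; top < bottom; top++) {
                if (!*top)
                        continue;       /* a hole: nothing beneath it */
                error = scan(*top, height + 1, tree_height, cb, data);
                if (error)
                        break;
        }
        return error;
}

static int print_cb(uint64_t *top, uint64_t *bottom,
                    unsigned int height, void *data)
{
        (void)data;
        for (; top < bottom; top++)
                if (*top)
                        printf("height %u: block %llu\n", height,
                               (unsigned long long)*top);
        return 0;
}

int main(void)
{
        return scan(0, 0, 2, print_cb, NULL);
}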
740 /** 740 /**
741 * do_strip - Look for a particular layer of the file and strip it off 741 * do_strip - Look for a particular layer of the file and strip it off
742 * @ip: the inode 742 * @ip: the inode
743 * @dibh: the dinode buffer 743 * @dibh: the dinode buffer
744 * @bh: A buffer of pointers 744 * @bh: A buffer of pointers
745 * @top: The first pointer in the buffer 745 * @top: The first pointer in the buffer
746 * @bottom: One more than the last pointer 746 * @bottom: One more than the last pointer
747 * @height: the height this buffer is at 747 * @height: the height this buffer is at
748 * @data: a pointer to a struct strip_mine 748 * @data: a pointer to a struct strip_mine
749 * 749 *
750 * Returns: errno 750 * Returns: errno
751 */ 751 */
752 752
753 static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, 753 static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
754 struct buffer_head *bh, __be64 *top, __be64 *bottom, 754 struct buffer_head *bh, __be64 *top, __be64 *bottom,
755 unsigned int height, void *data) 755 unsigned int height, void *data)
756 { 756 {
757 struct strip_mine *sm = data; 757 struct strip_mine *sm = data;
758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
759 struct gfs2_rgrp_list rlist; 759 struct gfs2_rgrp_list rlist;
760 u64 bn, bstart; 760 u64 bn, bstart;
761 u32 blen, btotal; 761 u32 blen, btotal;
762 __be64 *p; 762 __be64 *p;
763 unsigned int rg_blocks = 0; 763 unsigned int rg_blocks = 0;
764 int metadata; 764 int metadata;
765 unsigned int revokes = 0; 765 unsigned int revokes = 0;
766 int x; 766 int x;
767 int error = 0; 767 int error = 0;
768 768
769 if (!*top) 769 if (!*top)
770 sm->sm_first = 0; 770 sm->sm_first = 0;
771 771
772 if (height != sm->sm_height) 772 if (height != sm->sm_height)
773 return 0; 773 return 0;
774 774
775 if (sm->sm_first) { 775 if (sm->sm_first) {
776 top++; 776 top++;
777 sm->sm_first = 0; 777 sm->sm_first = 0;
778 } 778 }
779 779
780 metadata = (height != ip->i_height - 1); 780 metadata = (height != ip->i_height - 1);
781 if (metadata) 781 if (metadata)
782 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 782 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
783 else if (ip->i_depth) 783 else if (ip->i_depth)
784 revokes = sdp->sd_inptrs; 784 revokes = sdp->sd_inptrs;
785 785
786 if (ip != GFS2_I(sdp->sd_rindex)) 786 if (ip != GFS2_I(sdp->sd_rindex))
787 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 787 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
788 else if (!sdp->sd_rgrps) 788 else if (!sdp->sd_rgrps)
789 error = gfs2_ri_update(ip); 789 error = gfs2_ri_update(ip);
790 790
791 if (error) 791 if (error)
792 return error; 792 return error;
793 793
794 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 794 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
795 bstart = 0; 795 bstart = 0;
796 blen = 0; 796 blen = 0;
797 797
798 for (p = top; p < bottom; p++) { 798 for (p = top; p < bottom; p++) {
799 if (!*p) 799 if (!*p)
800 continue; 800 continue;
801 801
802 bn = be64_to_cpu(*p); 802 bn = be64_to_cpu(*p);
803 803
804 if (bstart + blen == bn) 804 if (bstart + blen == bn)
805 blen++; 805 blen++;
806 else { 806 else {
807 if (bstart) 807 if (bstart)
808 gfs2_rlist_add(sdp, &rlist, bstart); 808 gfs2_rlist_add(sdp, &rlist, bstart);
809 809
810 bstart = bn; 810 bstart = bn;
811 blen = 1; 811 blen = 1;
812 } 812 }
813 } 813 }
814 814
815 if (bstart) 815 if (bstart)
816 gfs2_rlist_add(sdp, &rlist, bstart); 816 gfs2_rlist_add(sdp, &rlist, bstart);
817 else 817 else
818 goto out; /* Nothing to do */ 818 goto out; /* Nothing to do */
819 819
820 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); 820 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
821 821
822 for (x = 0; x < rlist.rl_rgrps; x++) { 822 for (x = 0; x < rlist.rl_rgrps; x++) {
823 struct gfs2_rgrpd *rgd; 823 struct gfs2_rgrpd *rgd;
824 rgd = rlist.rl_ghs[x].gh_gl->gl_object; 824 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
825 rg_blocks += rgd->rd_length; 825 rg_blocks += rgd->rd_length;
826 } 826 }
827 827
828 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); 828 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
829 if (error) 829 if (error)
830 goto out_rlist; 830 goto out_rlist;
831 831
832 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + 832 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
833 RES_INDIRECT + RES_STATFS + RES_QUOTA, 833 RES_INDIRECT + RES_STATFS + RES_QUOTA,
834 revokes); 834 revokes);
835 if (error) 835 if (error)
836 goto out_rg_gunlock; 836 goto out_rg_gunlock;
837 837
838 down_write(&ip->i_rw_mutex); 838 down_write(&ip->i_rw_mutex);
839 839
840 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 840 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
841 gfs2_trans_add_bh(ip->i_gl, bh, 1); 841 gfs2_trans_add_bh(ip->i_gl, bh, 1);
842 842
843 bstart = 0; 843 bstart = 0;
844 blen = 0; 844 blen = 0;
845 btotal = 0; 845 btotal = 0;
846 846
847 for (p = top; p < bottom; p++) { 847 for (p = top; p < bottom; p++) {
848 if (!*p) 848 if (!*p)
849 continue; 849 continue;
850 850
851 bn = be64_to_cpu(*p); 851 bn = be64_to_cpu(*p);
852 852
853 if (bstart + blen == bn) 853 if (bstart + blen == bn)
854 blen++; 854 blen++;
855 else { 855 else {
856 if (bstart) { 856 if (bstart) {
857 if (metadata) 857 if (metadata)
858 __gfs2_free_meta(ip, bstart, blen); 858 __gfs2_free_meta(ip, bstart, blen);
859 else 859 else
860 __gfs2_free_data(ip, bstart, blen); 860 __gfs2_free_data(ip, bstart, blen);
861 861
862 btotal += blen; 862 btotal += blen;
863 } 863 }
864 864
865 bstart = bn; 865 bstart = bn;
866 blen = 1; 866 blen = 1;
867 } 867 }
868 868
869 *p = 0; 869 *p = 0;
870 gfs2_add_inode_blocks(&ip->i_inode, -1); 870 gfs2_add_inode_blocks(&ip->i_inode, -1);
871 } 871 }
872 if (bstart) { 872 if (bstart) {
873 if (metadata) 873 if (metadata)
874 __gfs2_free_meta(ip, bstart, blen); 874 __gfs2_free_meta(ip, bstart, blen);
875 else 875 else
876 __gfs2_free_data(ip, bstart, blen); 876 __gfs2_free_data(ip, bstart, blen);
877 877
878 btotal += blen; 878 btotal += blen;
879 } 879 }
880 880
881 gfs2_statfs_change(sdp, 0, +btotal, 0); 881 gfs2_statfs_change(sdp, 0, +btotal, 0);
882 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, 882 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
883 ip->i_inode.i_gid); 883 ip->i_inode.i_gid);
884 884
885 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 885 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
886 886
887 gfs2_dinode_out(ip, dibh->b_data); 887 gfs2_dinode_out(ip, dibh->b_data);
888 888
889 up_write(&ip->i_rw_mutex); 889 up_write(&ip->i_rw_mutex);
890 890
891 gfs2_trans_end(sdp); 891 gfs2_trans_end(sdp);
892 892
893 out_rg_gunlock: 893 out_rg_gunlock:
894 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); 894 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
895 out_rlist: 895 out_rlist:
896 gfs2_rlist_free(&rlist); 896 gfs2_rlist_free(&rlist);
897 out: 897 out:
898 if (ip != GFS2_I(sdp->sd_rindex)) 898 if (ip != GFS2_I(sdp->sd_rindex))
899 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); 899 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
900 return error; 900 return error;
901 } 901 }
902 902
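Both passes over the pointer block in do_strip() rely on the same run-length trick: block number 0 never occurs on disk, so bstart == 0 can double as "no run open", and a pointer equal to bstart + blen extends the current run by one. A standalone sketch of just that coalescing follows (the sample values and the printf stand-in for gfs2_rlist_add()/__gfs2_free_meta()/__gfs2_free_data() are mine):

#include <stdint.h>
#include <stdio.h>

static void flush_run(uint64_t bstart, uint32_t blen)
{
        /* stands in for gfs2_rlist_add() or __gfs2_free_meta/data() */
        printf("extent: start %llu, len %u\n",
               (unsigned long long)bstart, blen);
}

static void coalesce(const uint64_t *p, const uint64_t *bottom)
{
        uint64_t bstart = 0;
        uint32_t blen = 0;

        for (; p < bottom; p++) {
                if (!*p)
                        continue;               /* skip holes */
                if (bstart + blen == *p) {
                        blen++;                 /* extends the open run */
                } else {
                        if (bstart)
                                flush_run(bstart, blen);
                        bstart = *p;            /* open a new run */
                        blen = 1;
                }
        }
        if (bstart)
                flush_run(bstart, blen);        /* close the last run */
}

int main(void)
{
        uint64_t ptrs[] = { 100, 101, 102, 0, 200, 201, 0, 300 };
        coalesce(ptrs, ptrs + sizeof(ptrs) / sizeof(ptrs[0]));
        return 0;   /* prints extents (100,3), (200,2), (300,1) */
}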
903 /** 903 /**
904 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 904 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
905 * 905 *
906 * This is partly borrowed from ext3. 906 * This is partly borrowed from ext3.
907 */ 907 */
908 static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) 908 static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
909 { 909 {
910 struct inode *inode = mapping->host; 910 struct inode *inode = mapping->host;
911 struct gfs2_inode *ip = GFS2_I(inode); 911 struct gfs2_inode *ip = GFS2_I(inode);
912 unsigned long index = from >> PAGE_CACHE_SHIFT; 912 unsigned long index = from >> PAGE_CACHE_SHIFT;
913 unsigned offset = from & (PAGE_CACHE_SIZE-1); 913 unsigned offset = from & (PAGE_CACHE_SIZE-1);
914 unsigned blocksize, iblock, length, pos; 914 unsigned blocksize, iblock, length, pos;
915 struct buffer_head *bh; 915 struct buffer_head *bh;
916 struct page *page; 916 struct page *page;
917 int err; 917 int err;
918 918
919 page = grab_cache_page(mapping, index); 919 page = grab_cache_page(mapping, index);
920 if (!page) 920 if (!page)
921 return 0; 921 return 0;
922 922
923 blocksize = inode->i_sb->s_blocksize; 923 blocksize = inode->i_sb->s_blocksize;
924 length = blocksize - (offset & (blocksize - 1)); 924 length = blocksize - (offset & (blocksize - 1));
925 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 925 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
926 926
927 if (!page_has_buffers(page)) 927 if (!page_has_buffers(page))
928 create_empty_buffers(page, blocksize, 0); 928 create_empty_buffers(page, blocksize, 0);
929 929
930 /* Find the buffer that contains "offset" */ 930 /* Find the buffer that contains "offset" */
931 bh = page_buffers(page); 931 bh = page_buffers(page);
932 pos = blocksize; 932 pos = blocksize;
933 while (offset >= pos) { 933 while (offset >= pos) {
934 bh = bh->b_this_page; 934 bh = bh->b_this_page;
935 iblock++; 935 iblock++;
936 pos += blocksize; 936 pos += blocksize;
937 } 937 }
938 938
939 err = 0; 939 err = 0;
940 940
941 if (!buffer_mapped(bh)) { 941 if (!buffer_mapped(bh)) {
942 gfs2_block_map(inode, iblock, bh, 0); 942 gfs2_block_map(inode, iblock, bh, 0);
943 /* unmapped? It's a hole - nothing to do */ 943 /* unmapped? It's a hole - nothing to do */
944 if (!buffer_mapped(bh)) 944 if (!buffer_mapped(bh))
945 goto unlock; 945 goto unlock;
946 } 946 }
947 947
948 /* Ok, it's mapped. Make sure it's up-to-date */ 948 /* Ok, it's mapped. Make sure it's up-to-date */
949 if (PageUptodate(page)) 949 if (PageUptodate(page))
950 set_buffer_uptodate(bh); 950 set_buffer_uptodate(bh);
951 951
952 if (!buffer_uptodate(bh)) { 952 if (!buffer_uptodate(bh)) {
953 err = -EIO; 953 err = -EIO;
954 ll_rw_block(READ, 1, &bh); 954 ll_rw_block(READ, 1, &bh);
955 wait_on_buffer(bh); 955 wait_on_buffer(bh);
956 /* Uhhuh. Read error. Complain and punt. */ 956 /* Uhhuh. Read error. Complain and punt. */
957 if (!buffer_uptodate(bh)) 957 if (!buffer_uptodate(bh))
958 goto unlock; 958 goto unlock;
959 err = 0; 959 err = 0;
960 } 960 }
961 961
962 if (!gfs2_is_writeback(ip)) 962 if (!gfs2_is_writeback(ip))
963 gfs2_trans_add_bh(ip->i_gl, bh, 0); 963 gfs2_trans_add_bh(ip->i_gl, bh, 0);
964 964
965 zero_user(page, offset, length); 965 zero_user(page, offset, length);
966 mark_buffer_dirty(bh); 966 mark_buffer_dirty(bh);
967 unlock: 967 unlock:
968 unlock_page(page); 968 unlock_page(page);
969 page_cache_release(page); 969 page_cache_release(page);
970 return err; 970 return err;
971 } 971 }
972 972
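The arithmetic at the top of gfs2_block_truncate_page() is the standard page-cache decomposition: "from" selects a page and a byte offset within it, and "length" runs from that offset to the end of the containing filesystem block. A tiny standalone check of the numbers (the page and block sizes here are arbitrary examples, not read from any superblock):

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096, blocksize = 1024;
        unsigned long long from = 10000;        /* new EOF, mid-block */

        unsigned long index  = from / page_size;        /* page 2 */
        unsigned long offset = from & (page_size - 1);
        unsigned long length = blocksize - (offset & (blocksize - 1));

        printf("page %lu: zero %lu bytes at offset %lu\n",
               index, length, offset);          /* zero 240 at 1808 */
        return 0;
}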
973 static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) 973 static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
974 { 974 {
975 struct gfs2_inode *ip = GFS2_I(inode); 975 struct gfs2_inode *ip = GFS2_I(inode);
976 struct gfs2_sbd *sdp = GFS2_SB(inode); 976 struct gfs2_sbd *sdp = GFS2_SB(inode);
977 struct address_space *mapping = inode->i_mapping; 977 struct address_space *mapping = inode->i_mapping;
978 struct buffer_head *dibh; 978 struct buffer_head *dibh;
979 int journaled = gfs2_is_jdata(ip); 979 int journaled = gfs2_is_jdata(ip);
980 int error; 980 int error;
981 981
982 error = gfs2_trans_begin(sdp, 982 error = gfs2_trans_begin(sdp,
983 RES_DINODE + (journaled ? RES_JDATA : 0), 0); 983 RES_DINODE + (journaled ? RES_JDATA : 0), 0);
984 if (error) 984 if (error)
985 return error; 985 return error;
986 986
987 error = gfs2_meta_inode_buffer(ip, &dibh); 987 error = gfs2_meta_inode_buffer(ip, &dibh);
988 if (error) 988 if (error)
989 goto out; 989 goto out;
990 990
991 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 991 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
992 992
993 if (gfs2_is_stuffed(ip)) { 993 if (gfs2_is_stuffed(ip)) {
994 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); 994 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
995 } else { 995 } else {
996 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { 996 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
997 error = gfs2_block_truncate_page(mapping, newsize); 997 error = gfs2_block_truncate_page(mapping, newsize);
998 if (error) 998 if (error)
999 goto out_brelse; 999 goto out_brelse;
1000 } 1000 }
1001 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; 1001 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1002 } 1002 }
1003 1003
1004 i_size_write(inode, newsize); 1004 i_size_write(inode, newsize);
1005 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1005 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1006 gfs2_dinode_out(ip, dibh->b_data); 1006 gfs2_dinode_out(ip, dibh->b_data);
1007 1007
1008 truncate_pagecache(inode, oldsize, newsize); 1008 truncate_pagecache(inode, oldsize, newsize);
1009 out_brelse: 1009 out_brelse:
1010 brelse(dibh); 1010 brelse(dibh);
1011 out: 1011 out:
1012 gfs2_trans_end(sdp); 1012 gfs2_trans_end(sdp);
1013 return error; 1013 return error;
1014 } 1014 }
1015 1015
1016 static int trunc_dealloc(struct gfs2_inode *ip, u64 size) 1016 static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
1017 { 1017 {
1018 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1018 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1019 unsigned int height = ip->i_height; 1019 unsigned int height = ip->i_height;
1020 u64 lblock; 1020 u64 lblock;
1021 struct metapath mp; 1021 struct metapath mp;
1022 int error; 1022 int error;
1023 1023
1024 if (!size) 1024 if (!size)
1025 lblock = 0; 1025 lblock = 0;
1026 else 1026 else
1027 lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; 1027 lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
1028 1028
1029 find_metapath(sdp, lblock, &mp, ip->i_height); 1029 find_metapath(sdp, lblock, &mp, ip->i_height);
1030 if (!gfs2_alloc_get(ip)) 1030 if (!gfs2_alloc_get(ip))
1031 return -ENOMEM; 1031 return -ENOMEM;
1032 1032
1033 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 1033 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1034 if (error) 1034 if (error)
1035 goto out; 1035 goto out;
1036 1036
1037 while (height--) { 1037 while (height--) {
1038 struct strip_mine sm; 1038 struct strip_mine sm;
1039 sm.sm_first = !!size; 1039 sm.sm_first = !!size;
1040 sm.sm_height = height; 1040 sm.sm_height = height;
1041 1041
1042 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); 1042 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
1043 if (error) 1043 if (error)
1044 break; 1044 break;
1045 } 1045 }
1046 1046
1047 gfs2_quota_unhold(ip); 1047 gfs2_quota_unhold(ip);
1048 1048
1049 out: 1049 out:
1050 gfs2_alloc_put(ip); 1050 gfs2_alloc_put(ip);
1051 return error; 1051 return error;
1052 } 1052 }
1053 1053
1054 static int trunc_end(struct gfs2_inode *ip) 1054 static int trunc_end(struct gfs2_inode *ip)
1055 { 1055 {
1056 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1056 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1057 struct buffer_head *dibh; 1057 struct buffer_head *dibh;
1058 int error; 1058 int error;
1059 1059
1060 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1060 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1061 if (error) 1061 if (error)
1062 return error; 1062 return error;
1063 1063
1064 down_write(&ip->i_rw_mutex); 1064 down_write(&ip->i_rw_mutex);
1065 1065
1066 error = gfs2_meta_inode_buffer(ip, &dibh); 1066 error = gfs2_meta_inode_buffer(ip, &dibh);
1067 if (error) 1067 if (error)
1068 goto out; 1068 goto out;
1069 1069
1070 if (!i_size_read(&ip->i_inode)) { 1070 if (!i_size_read(&ip->i_inode)) {
1071 ip->i_height = 0; 1071 ip->i_height = 0;
1072 ip->i_goal = ip->i_no_addr; 1072 ip->i_goal = ip->i_no_addr;
1073 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1073 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1074 } 1074 }
1075 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1075 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1076 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; 1076 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1077 1077
1078 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1078 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1079 gfs2_dinode_out(ip, dibh->b_data); 1079 gfs2_dinode_out(ip, dibh->b_data);
1080 brelse(dibh); 1080 brelse(dibh);
1081 1081
1082 out: 1082 out:
1083 up_write(&ip->i_rw_mutex); 1083 up_write(&ip->i_rw_mutex);
1084 gfs2_trans_end(sdp); 1084 gfs2_trans_end(sdp);
1085 return error; 1085 return error;
1086 } 1086 }
1087 1087
1088 /** 1088 /**
1089 * do_shrink - make a file smaller 1089 * do_shrink - make a file smaller
1090 * @inode: the inode 1090 * @inode: the inode
1091 * @oldsize: the current inode size 1091 * @oldsize: the current inode size
1092 * @newsize: the size to make the file 1092 * @newsize: the size to make the file
1093 * 1093 *
1094 * Called with an exclusive lock on @inode. @newsize must 1094 * Called with an exclusive lock on @inode. @newsize must
1095 * be equal to or smaller than the current inode size. 1095 * be equal to or smaller than the current inode size.
1096 * 1096 *
1097 * Returns: errno 1097 * Returns: errno
1098 */ 1098 */
1099 1099
1100 static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) 1100 static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1101 { 1101 {
1102 struct gfs2_inode *ip = GFS2_I(inode); 1102 struct gfs2_inode *ip = GFS2_I(inode);
1103 int error; 1103 int error;
1104 1104
1105 error = trunc_start(inode, oldsize, newsize); 1105 error = trunc_start(inode, oldsize, newsize);
1106 if (error < 0) 1106 if (error < 0)
1107 return error; 1107 return error;
1108 if (gfs2_is_stuffed(ip)) 1108 if (gfs2_is_stuffed(ip))
1109 return 0; 1109 return 0;
1110 1110
1111 error = trunc_dealloc(ip, newsize); 1111 error = trunc_dealloc(ip, newsize);
1112 if (error == 0) 1112 if (error == 0)
1113 error = trunc_end(ip); 1113 error = trunc_end(ip);
1114 1114
1115 return error; 1115 return error;
1116 } 1116 }
1117 1117
1118 void gfs2_trim_blocks(struct inode *inode) 1118 void gfs2_trim_blocks(struct inode *inode)
1119 { 1119 {
1120 u64 size = inode->i_size; 1120 u64 size = inode->i_size;
1121 int ret; 1121 int ret;
1122 1122
1123 ret = do_shrink(inode, size, size); 1123 ret = do_shrink(inode, size, size);
1124 WARN_ON(ret != 0); 1124 WARN_ON(ret != 0);
1125 } 1125 }
1126 1126
1127 /** 1127 /**
1128 * do_grow - Touch and update inode size 1128 * do_grow - Touch and update inode size
1129 * @inode: The inode 1129 * @inode: The inode
1130 * @size: The new size 1130 * @size: The new size
1131 * 1131 *
1132 * This function updates the timestamps on the inode and 1132 * This function updates the timestamps on the inode and
1133 * may also increase the size of the inode. This function 1133 * may also increase the size of the inode. This function
1134 * must not be called with @size any smaller than the current 1134 * must not be called with @size any smaller than the current
1135 * inode size. 1135 * inode size.
1136 * 1136 *
1137 * Although it is not strictly required to unstuff files here, 1137 * Although it is not strictly required to unstuff files here,
1138 * earlier versions of GFS2 have a bug in the stuffed file reading 1138 * earlier versions of GFS2 have a bug in the stuffed file reading
1139 * code which will result in a buffer overrun if the size is larger 1139 * code which will result in a buffer overrun if the size is larger
1140 * than the max stuffed file size. In order to prevent this from 1140 * than the max stuffed file size. In order to prevent this from
1141 * occurring, such files are unstuffed, but in other cases we can 1141 * occurring, such files are unstuffed, but in other cases we can
1142 * just update the inode size directly. 1142 * just update the inode size directly.
1143 * 1143 *
1144 * Returns: 0 on success, or -ve on error 1144 * Returns: 0 on success, or -ve on error
1145 */ 1145 */
1146 1146
1147 static int do_grow(struct inode *inode, u64 size) 1147 static int do_grow(struct inode *inode, u64 size)
1148 { 1148 {
1149 struct gfs2_inode *ip = GFS2_I(inode); 1149 struct gfs2_inode *ip = GFS2_I(inode);
1150 struct gfs2_sbd *sdp = GFS2_SB(inode); 1150 struct gfs2_sbd *sdp = GFS2_SB(inode);
1151 struct buffer_head *dibh; 1151 struct buffer_head *dibh;
1152 struct gfs2_alloc *al = NULL; 1152 struct gfs2_alloc *al = NULL;
1153 int error; 1153 int error;
1154 1154
1155 if (gfs2_is_stuffed(ip) && 1155 if (gfs2_is_stuffed(ip) &&
1156 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { 1156 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1157 al = gfs2_alloc_get(ip); 1157 al = gfs2_alloc_get(ip);
1158 if (al == NULL) 1158 if (al == NULL)
1159 return -ENOMEM; 1159 return -ENOMEM;
1160 1160
1161 error = gfs2_quota_lock_check(ip); 1161 error = gfs2_quota_lock_check(ip);
1162 if (error) 1162 if (error)
1163 goto do_grow_alloc_put; 1163 goto do_grow_alloc_put;
1164 1164
1165 al->al_requested = 1; 1165 al->al_requested = 1;
1166 error = gfs2_inplace_reserve(ip); 1166 error = gfs2_inplace_reserve(ip);
1167 if (error) 1167 if (error)
1168 goto do_grow_qunlock; 1168 goto do_grow_qunlock;
1169 } 1169 }
1170 1170
1171 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); 1171 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1172 if (error) 1172 if (error)
1173 goto do_grow_release; 1173 goto do_grow_release;
1174 1174
1175 if (al) { 1175 if (al) {
1176 error = gfs2_unstuff_dinode(ip, NULL); 1176 error = gfs2_unstuff_dinode(ip, NULL);
1177 if (error) 1177 if (error)
1178 goto do_end_trans; 1178 goto do_end_trans;
1179 } 1179 }
1180 1180
1181 error = gfs2_meta_inode_buffer(ip, &dibh); 1181 error = gfs2_meta_inode_buffer(ip, &dibh);
1182 if (error) 1182 if (error)
1183 goto do_end_trans; 1183 goto do_end_trans;
1184 1184
1185 i_size_write(inode, size); 1185 i_size_write(inode, size);
1186 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1186 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1187 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1187 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1188 gfs2_dinode_out(ip, dibh->b_data); 1188 gfs2_dinode_out(ip, dibh->b_data);
1189 brelse(dibh); 1189 brelse(dibh);
1190 1190
1191 do_end_trans: 1191 do_end_trans:
1192 gfs2_trans_end(sdp); 1192 gfs2_trans_end(sdp);
1193 do_grow_release: 1193 do_grow_release:
1194 if (al) { 1194 if (al) {
1195 gfs2_inplace_release(ip); 1195 gfs2_inplace_release(ip);
1196 do_grow_qunlock: 1196 do_grow_qunlock:
1197 gfs2_quota_unlock(ip); 1197 gfs2_quota_unlock(ip);
1198 do_grow_alloc_put: 1198 do_grow_alloc_put:
1199 gfs2_alloc_put(ip); 1199 gfs2_alloc_put(ip);
1200 } 1200 }
1201 return error; 1201 return error;
1202 } 1202 }
1203 1203
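The unstuff threshold the comment above refers to is simply the filesystem block size minus the on-disk dinode header, since a stuffed file's data shares the dinode's own block. A toy illustration (the 232-byte header and 4 KiB block size below are stand-ins, not values read from disk):

#include <stdio.h>

int main(void)
{
        unsigned int bsize = 4096;              /* example block size */
        unsigned int dinode_hdr = 232;          /* illustrative only  */
        unsigned long long max_stuffed = bsize - dinode_hdr;

        for (unsigned long long size = 3800; size <= 3900; size += 100)
                printf("grow to %llu: %s\n", size,
                       size > max_stuffed ? "unstuff first" : "stays stuffed");
        return 0;
}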
1204 /** 1204 /**
1205 * gfs2_setattr_size - make a file a given size 1205 * gfs2_setattr_size - make a file a given size
1206 * @inode: the inode 1206 * @inode: the inode
1207 * @newsize: the size to make the file 1207 * @newsize: the size to make the file
1208 * 1208 *
1209 * The file size can grow, shrink, or stay the same size. This 1209 * The file size can grow, shrink, or stay the same size. This
1210 * is called holding i_mutex and an exclusive glock on the inode 1210 * is called holding i_mutex and an exclusive glock on the inode
1211 * in question. 1211 * in question.
1212 * 1212 *
1213 * Returns: errno 1213 * Returns: errno
1214 */ 1214 */
1215 1215
1216 int gfs2_setattr_size(struct inode *inode, u64 newsize) 1216 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1217 { 1217 {
1218 int ret; 1218 int ret;
1219 u64 oldsize; 1219 u64 oldsize;
1220 1220
1221 BUG_ON(!S_ISREG(inode->i_mode)); 1221 BUG_ON(!S_ISREG(inode->i_mode));
1222 1222
1223 ret = inode_newsize_ok(inode, newsize); 1223 ret = inode_newsize_ok(inode, newsize);
1224 if (ret) 1224 if (ret)
1225 return ret; 1225 return ret;
1226 1226
1227 inode_dio_wait(inode);
1228
1227 oldsize = inode->i_size; 1229 oldsize = inode->i_size;
1228 if (newsize >= oldsize) 1230 if (newsize >= oldsize)
1229 return do_grow(inode, newsize); 1231 return do_grow(inode, newsize);
1230 1232
1231 return do_shrink(inode, oldsize, newsize); 1233 return do_shrink(inode, oldsize, newsize);
1232 } 1234 }
1233 1235
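This hunk is the GFS2 half of the change: inode_dio_wait() now runs inside gfs2_setattr_size(), after inode_newsize_ok() and under the i_mutex and exclusive glock the caller already holds, so no new direct I/O can be issued while the old requests drain. A rough userspace model of the idea (pthreads; the single mutex standing in for both the filesystem lock and i_dio_count's waitqueue is my simplification, not how the kernel structures it):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* the "glock" */
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int dio_count;                   /* models inode->i_dio_count */

static void dio_begin(void)             /* called as each DIO starts */
{
        pthread_mutex_lock(&lock);      /* blocked while setattr runs */
        dio_count++;
        pthread_mutex_unlock(&lock);
}

static void dio_end(void)               /* called as each DIO completes */
{
        pthread_mutex_lock(&lock);
        if (--dio_count == 0)
                pthread_cond_broadcast(&drained);
        pthread_mutex_unlock(&lock);
}

static void setattr_size(void)          /* models gfs2_setattr_size() */
{
        pthread_mutex_lock(&lock);      /* new DIO can no longer start */
        while (dio_count > 0)           /* the inode_dio_wait() step   */
                pthread_cond_wait(&drained, &lock);
        /* ... now safe to grow or shrink the file ... */
        pthread_mutex_unlock(&lock);
}

static void *dio_thread(void *arg)
{
        (void)arg;
        dio_begin();
        /* ... direct I/O in flight ... */
        dio_end();
        return NULL;
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, dio_thread, NULL);
        setattr_size();                 /* waits out any in-flight DIO */
        pthread_join(t, NULL);
        return 0;
}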
1234 int gfs2_truncatei_resume(struct gfs2_inode *ip) 1236 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1235 { 1237 {
1236 int error; 1238 int error;
1237 error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); 1239 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1238 if (!error) 1240 if (!error)
1239 error = trunc_end(ip); 1241 error = trunc_end(ip);
1240 return error; 1242 return error;
1241 } 1243 }
1242 1244
1243 int gfs2_file_dealloc(struct gfs2_inode *ip) 1245 int gfs2_file_dealloc(struct gfs2_inode *ip)
1244 { 1246 {
1245 return trunc_dealloc(ip, 0); 1247 return trunc_dealloc(ip, 0);
1246 } 1248 }
1247 1249
1248 /** 1250 /**
1249 * gfs2_write_alloc_required - figure out if a write will require an allocation 1251 * gfs2_write_alloc_required - figure out if a write will require an allocation
1250 * @ip: the file being written to 1252 * @ip: the file being written to
1251 * @offset: the offset to write to 1253 * @offset: the offset to write to
1252 * @len: the number of bytes being written 1254 * @len: the number of bytes being written
1253 * 1255 *
1254 * Returns: 1 if an alloc is required, 0 otherwise 1256 * Returns: 1 if an alloc is required, 0 otherwise
1255 */ 1257 */
1256 1258
1257 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 1259 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1258 unsigned int len) 1260 unsigned int len)
1259 { 1261 {
1260 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1262 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1261 struct buffer_head bh; 1263 struct buffer_head bh;
1262 unsigned int shift; 1264 unsigned int shift;
1263 u64 lblock, lblock_stop, size; 1265 u64 lblock, lblock_stop, size;
1264 u64 end_of_file; 1266 u64 end_of_file;
1265 1267
1266 if (!len) 1268 if (!len)
1267 return 0; 1269 return 0;
1268 1270
1269 if (gfs2_is_stuffed(ip)) { 1271 if (gfs2_is_stuffed(ip)) {
1270 if (offset + len > 1272 if (offset + len >
1271 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) 1273 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1272 return 1; 1274 return 1;
1273 return 0; 1275 return 0;
1274 } 1276 }
1275 1277
1276 shift = sdp->sd_sb.sb_bsize_shift; 1278 shift = sdp->sd_sb.sb_bsize_shift;
1277 BUG_ON(gfs2_is_dir(ip)); 1279 BUG_ON(gfs2_is_dir(ip));
1278 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; 1280 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1279 lblock = offset >> shift; 1281 lblock = offset >> shift;
1280 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1282 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1281 if (lblock_stop > end_of_file) 1283 if (lblock_stop > end_of_file)
1282 return 1; 1284 return 1;
1283 1285
1284 size = (lblock_stop - lblock) << shift; 1286 size = (lblock_stop - lblock) << shift;
1285 do { 1287 do {
1286 bh.b_state = 0; 1288 bh.b_state = 0;
1287 bh.b_size = size; 1289 bh.b_size = size;
1288 gfs2_block_map(&ip->i_inode, lblock, &bh, 0); 1290 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1289 if (!buffer_mapped(&bh)) 1291 if (!buffer_mapped(&bh))
1290 return 1; 1292 return 1;
1291 size -= bh.b_size; 1293 size -= bh.b_size;
1292 lblock += (bh.b_size >> ip->i_inode.i_blkbits); 1294 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1293 } while(size > 0); 1295 } while(size > 0);
1294 1296
1295 return 0; 1297 return 0;
1296 } 1298 }
1297 1299
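The shift arithmetic above converts a byte range into the half-open range of blocks it touches: the start rounds down, the end rounds up. Worked standalone with arbitrary values:

#include <stdio.h>

int main(void)
{
        unsigned int shift = 12;                /* 4 KiB blocks */
        unsigned long long bsize = 1ULL << shift;
        unsigned long long offset = 5000, len = 10000;

        unsigned long long lblock = offset >> shift;
        unsigned long long lblock_stop = (offset + len + bsize - 1) >> shift;

        /* bytes [5000, 15000) touch blocks 1, 2 and 3 */
        printf("write covers blocks [%llu, %llu)\n", lblock, lblock_stop);
        return 0;
}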
1298 1300
1 /* 1 /*
2 * linux/fs/hfs/inode.c 2 * linux/fs/hfs/inode.c
3 * 3 *
4 * Copyright (C) 1995-1997 Paul H. Hargrove 4 * Copyright (C) 1995-1997 Paul H. Hargrove
5 * (C) 2003 Ardis Technologies <roman@ardistech.com> 5 * (C) 2003 Ardis Technologies <roman@ardistech.com>
6 * This file may be distributed under the terms of the GNU General Public License. 6 * This file may be distributed under the terms of the GNU General Public License.
7 * 7 *
8 * This file contains inode-related functions which do not depend on 8 * This file contains inode-related functions which do not depend on
9 * which scheme is being used to represent forks. 9 * which scheme is being used to represent forks.
10 * 10 *
11 * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds 11 * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
12 */ 12 */
13 13
14 #include <linux/pagemap.h> 14 #include <linux/pagemap.h>
15 #include <linux/mpage.h> 15 #include <linux/mpage.h>
16 #include <linux/sched.h> 16 #include <linux/sched.h>
17 17
18 #include "hfs_fs.h" 18 #include "hfs_fs.h"
19 #include "btree.h" 19 #include "btree.h"
20 20
21 static const struct file_operations hfs_file_operations; 21 static const struct file_operations hfs_file_operations;
22 static const struct inode_operations hfs_file_inode_operations; 22 static const struct inode_operations hfs_file_inode_operations;
23 23
24 /*================ Variable-like macros ================*/ 24 /*================ Variable-like macros ================*/
25 25
26 #define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO) 26 #define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO)
27 27
28 static int hfs_writepage(struct page *page, struct writeback_control *wbc) 28 static int hfs_writepage(struct page *page, struct writeback_control *wbc)
29 { 29 {
30 return block_write_full_page(page, hfs_get_block, wbc); 30 return block_write_full_page(page, hfs_get_block, wbc);
31 } 31 }
32 32
33 static int hfs_readpage(struct file *file, struct page *page) 33 static int hfs_readpage(struct file *file, struct page *page)
34 { 34 {
35 return block_read_full_page(page, hfs_get_block); 35 return block_read_full_page(page, hfs_get_block);
36 } 36 }
37 37
38 static int hfs_write_begin(struct file *file, struct address_space *mapping, 38 static int hfs_write_begin(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned flags, 39 loff_t pos, unsigned len, unsigned flags,
40 struct page **pagep, void **fsdata) 40 struct page **pagep, void **fsdata)
41 { 41 {
42 int ret; 42 int ret;
43 43
44 *pagep = NULL; 44 *pagep = NULL;
45 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 45 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
46 hfs_get_block, 46 hfs_get_block,
47 &HFS_I(mapping->host)->phys_size); 47 &HFS_I(mapping->host)->phys_size);
48 if (unlikely(ret)) { 48 if (unlikely(ret)) {
49 loff_t isize = mapping->host->i_size; 49 loff_t isize = mapping->host->i_size;
50 if (pos + len > isize) 50 if (pos + len > isize)
51 vmtruncate(mapping->host, isize); 51 vmtruncate(mapping->host, isize);
52 } 52 }
53 53
54 return ret; 54 return ret;
55 } 55 }
56 56
57 static sector_t hfs_bmap(struct address_space *mapping, sector_t block) 57 static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
58 { 58 {
59 return generic_block_bmap(mapping, block, hfs_get_block); 59 return generic_block_bmap(mapping, block, hfs_get_block);
60 } 60 }
61 61
62 static int hfs_releasepage(struct page *page, gfp_t mask) 62 static int hfs_releasepage(struct page *page, gfp_t mask)
63 { 63 {
64 struct inode *inode = page->mapping->host; 64 struct inode *inode = page->mapping->host;
65 struct super_block *sb = inode->i_sb; 65 struct super_block *sb = inode->i_sb;
66 struct hfs_btree *tree; 66 struct hfs_btree *tree;
67 struct hfs_bnode *node; 67 struct hfs_bnode *node;
68 u32 nidx; 68 u32 nidx;
69 int i, res = 1; 69 int i, res = 1;
70 70
71 switch (inode->i_ino) { 71 switch (inode->i_ino) {
72 case HFS_EXT_CNID: 72 case HFS_EXT_CNID:
73 tree = HFS_SB(sb)->ext_tree; 73 tree = HFS_SB(sb)->ext_tree;
74 break; 74 break;
75 case HFS_CAT_CNID: 75 case HFS_CAT_CNID:
76 tree = HFS_SB(sb)->cat_tree; 76 tree = HFS_SB(sb)->cat_tree;
77 break; 77 break;
78 default: 78 default:
79 BUG(); 79 BUG();
80 return 0; 80 return 0;
81 } 81 }
82 82
83 if (!tree) 83 if (!tree)
84 return 0; 84 return 0;
85 85
86 if (tree->node_size >= PAGE_CACHE_SIZE) { 86 if (tree->node_size >= PAGE_CACHE_SIZE) {
87 nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); 87 nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
88 spin_lock(&tree->hash_lock); 88 spin_lock(&tree->hash_lock);
89 node = hfs_bnode_findhash(tree, nidx); 89 node = hfs_bnode_findhash(tree, nidx);
90 if (!node) 90 if (!node)
91 ; 91 ;
92 else if (atomic_read(&node->refcnt)) 92 else if (atomic_read(&node->refcnt))
93 res = 0; 93 res = 0;
94 if (res && node) { 94 if (res && node) {
95 hfs_bnode_unhash(node); 95 hfs_bnode_unhash(node);
96 hfs_bnode_free(node); 96 hfs_bnode_free(node);
97 } 97 }
98 spin_unlock(&tree->hash_lock); 98 spin_unlock(&tree->hash_lock);
99 } else { 99 } else {
100 nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); 100 nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
101 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); 101 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
102 spin_lock(&tree->hash_lock); 102 spin_lock(&tree->hash_lock);
103 do { 103 do {
104 node = hfs_bnode_findhash(tree, nidx++); 104 node = hfs_bnode_findhash(tree, nidx++);
105 if (!node) 105 if (!node)
106 continue; 106 continue;
107 if (atomic_read(&node->refcnt)) { 107 if (atomic_read(&node->refcnt)) {
108 res = 0; 108 res = 0;
109 break; 109 break;
110 } 110 }
111 hfs_bnode_unhash(node); 111 hfs_bnode_unhash(node);
112 hfs_bnode_free(node); 112 hfs_bnode_free(node);
113 } while (--i && nidx < tree->node_count); 113 } while (--i && nidx < tree->node_count);
114 spin_unlock(&tree->hash_lock); 114 spin_unlock(&tree->hash_lock);
115 } 115 }
116 return res ? try_to_free_buffers(page) : 0; 116 return res ? try_to_free_buffers(page) : 0;
117 } 117 }
118 118
119 static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, 119 static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
120 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 120 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
121 { 121 {
122 struct file *file = iocb->ki_filp; 122 struct file *file = iocb->ki_filp;
123 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 123 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
124 ssize_t ret; 124 ssize_t ret;
125 125
126 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 126 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
127 offset, nr_segs, hfs_get_block, NULL); 127 offset, nr_segs, hfs_get_block, NULL);
128 128
129 /* 129 /*
130 * In case of error, an extending write may have instantiated a few 130 * In case of error, an extending write may have instantiated a few
131 * blocks outside i_size. Trim these off again. 131 * blocks outside i_size. Trim these off again.
132 */ 132 */
133 if (unlikely((rw & WRITE) && ret < 0)) { 133 if (unlikely((rw & WRITE) && ret < 0)) {
134 loff_t isize = i_size_read(inode); 134 loff_t isize = i_size_read(inode);
135 loff_t end = offset + iov_length(iov, nr_segs); 135 loff_t end = offset + iov_length(iov, nr_segs);
136 136
137 if (end > isize) 137 if (end > isize)
138 vmtruncate(inode, isize); 138 vmtruncate(inode, isize);
139 } 139 }
140 140
141 return ret; 141 return ret;
142 } 142 }
143 143
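The error path above only needs the write's end offset, which for a vectored request is the starting offset plus the total iovec length. A standalone sketch of that computation (buffers and sizes invented; iov_total() is my stand-in for the kernel's iov_length()):

#include <stdio.h>
#include <sys/uio.h>

static unsigned long long iov_total(const struct iovec *iov, int nr_segs)
{
        unsigned long long len = 0;
        for (int i = 0; i < nr_segs; i++)
                len += iov[i].iov_len;
        return len;
}

int main(void)
{
        char a[100], b[200];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        unsigned long long offset = 1000, isize = 1100;
        unsigned long long end = offset + iov_total(iov, 2);

        if (end > isize)        /* blocks past EOF may have been allocated */
                printf("failed write reached %llu, trim back to %llu\n",
                       end, isize);
        return 0;
}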
144 static int hfs_writepages(struct address_space *mapping, 144 static int hfs_writepages(struct address_space *mapping,
145 struct writeback_control *wbc) 145 struct writeback_control *wbc)
146 { 146 {
147 return mpage_writepages(mapping, wbc, hfs_get_block); 147 return mpage_writepages(mapping, wbc, hfs_get_block);
148 } 148 }
149 149
150 const struct address_space_operations hfs_btree_aops = { 150 const struct address_space_operations hfs_btree_aops = {
151 .readpage = hfs_readpage, 151 .readpage = hfs_readpage,
152 .writepage = hfs_writepage, 152 .writepage = hfs_writepage,
153 .write_begin = hfs_write_begin, 153 .write_begin = hfs_write_begin,
154 .write_end = generic_write_end, 154 .write_end = generic_write_end,
155 .bmap = hfs_bmap, 155 .bmap = hfs_bmap,
156 .releasepage = hfs_releasepage, 156 .releasepage = hfs_releasepage,
157 }; 157 };
158 158
159 const struct address_space_operations hfs_aops = { 159 const struct address_space_operations hfs_aops = {
160 .readpage = hfs_readpage, 160 .readpage = hfs_readpage,
161 .writepage = hfs_writepage, 161 .writepage = hfs_writepage,
162 .write_begin = hfs_write_begin, 162 .write_begin = hfs_write_begin,
163 .write_end = generic_write_end, 163 .write_end = generic_write_end,
164 .bmap = hfs_bmap, 164 .bmap = hfs_bmap,
165 .direct_IO = hfs_direct_IO, 165 .direct_IO = hfs_direct_IO,
166 .writepages = hfs_writepages, 166 .writepages = hfs_writepages,
167 }; 167 };
168 168
169 /* 169 /*
170 * hfs_new_inode 170 * hfs_new_inode
171 */ 171 */
172 struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) 172 struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
173 { 173 {
174 struct super_block *sb = dir->i_sb; 174 struct super_block *sb = dir->i_sb;
175 struct inode *inode = new_inode(sb); 175 struct inode *inode = new_inode(sb);
176 if (!inode) 176 if (!inode)
177 return NULL; 177 return NULL;
178 178
179 mutex_init(&HFS_I(inode)->extents_lock); 179 mutex_init(&HFS_I(inode)->extents_lock);
180 INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); 180 INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
181 hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); 181 hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
182 inode->i_ino = HFS_SB(sb)->next_id++; 182 inode->i_ino = HFS_SB(sb)->next_id++;
183 inode->i_mode = mode; 183 inode->i_mode = mode;
184 inode->i_uid = current_fsuid(); 184 inode->i_uid = current_fsuid();
185 inode->i_gid = current_fsgid(); 185 inode->i_gid = current_fsgid();
186 inode->i_nlink = 1; 186 inode->i_nlink = 1;
187 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 187 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
188 HFS_I(inode)->flags = 0; 188 HFS_I(inode)->flags = 0;
189 HFS_I(inode)->rsrc_inode = NULL; 189 HFS_I(inode)->rsrc_inode = NULL;
190 HFS_I(inode)->fs_blocks = 0; 190 HFS_I(inode)->fs_blocks = 0;
191 if (S_ISDIR(mode)) { 191 if (S_ISDIR(mode)) {
192 inode->i_size = 2; 192 inode->i_size = 2;
193 HFS_SB(sb)->folder_count++; 193 HFS_SB(sb)->folder_count++;
194 if (dir->i_ino == HFS_ROOT_CNID) 194 if (dir->i_ino == HFS_ROOT_CNID)
195 HFS_SB(sb)->root_dirs++; 195 HFS_SB(sb)->root_dirs++;
196 inode->i_op = &hfs_dir_inode_operations; 196 inode->i_op = &hfs_dir_inode_operations;
197 inode->i_fop = &hfs_dir_operations; 197 inode->i_fop = &hfs_dir_operations;
198 inode->i_mode |= S_IRWXUGO; 198 inode->i_mode |= S_IRWXUGO;
199 inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; 199 inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask;
200 } else if (S_ISREG(mode)) { 200 } else if (S_ISREG(mode)) {
201 HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; 201 HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
202 HFS_SB(sb)->file_count++; 202 HFS_SB(sb)->file_count++;
203 if (dir->i_ino == HFS_ROOT_CNID) 203 if (dir->i_ino == HFS_ROOT_CNID)
204 HFS_SB(sb)->root_files++; 204 HFS_SB(sb)->root_files++;
205 inode->i_op = &hfs_file_inode_operations; 205 inode->i_op = &hfs_file_inode_operations;
206 inode->i_fop = &hfs_file_operations; 206 inode->i_fop = &hfs_file_operations;
207 inode->i_mapping->a_ops = &hfs_aops; 207 inode->i_mapping->a_ops = &hfs_aops;
208 inode->i_mode |= S_IRUGO|S_IXUGO; 208 inode->i_mode |= S_IRUGO|S_IXUGO;
209 if (mode & S_IWUSR) 209 if (mode & S_IWUSR)
210 inode->i_mode |= S_IWUGO; 210 inode->i_mode |= S_IWUGO;
211 inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask; 211 inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask;
212 HFS_I(inode)->phys_size = 0; 212 HFS_I(inode)->phys_size = 0;
213 HFS_I(inode)->alloc_blocks = 0; 213 HFS_I(inode)->alloc_blocks = 0;
214 HFS_I(inode)->first_blocks = 0; 214 HFS_I(inode)->first_blocks = 0;
215 HFS_I(inode)->cached_start = 0; 215 HFS_I(inode)->cached_start = 0;
216 HFS_I(inode)->cached_blocks = 0; 216 HFS_I(inode)->cached_blocks = 0;
217 memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec)); 217 memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec));
218 memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); 218 memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec));
219 } 219 }
220 insert_inode_hash(inode); 220 insert_inode_hash(inode);
221 mark_inode_dirty(inode); 221 mark_inode_dirty(inode);
222 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); 222 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
223 sb->s_dirt = 1; 223 sb->s_dirt = 1;
224 224
225 return inode; 225 return inode;
226 } 226 }
227 227
228 void hfs_delete_inode(struct inode *inode) 228 void hfs_delete_inode(struct inode *inode)
229 { 229 {
230 struct super_block *sb = inode->i_sb; 230 struct super_block *sb = inode->i_sb;
231 231
232 dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino); 232 dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino);
233 if (S_ISDIR(inode->i_mode)) { 233 if (S_ISDIR(inode->i_mode)) {
234 HFS_SB(sb)->folder_count--; 234 HFS_SB(sb)->folder_count--;
235 if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) 235 if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
236 HFS_SB(sb)->root_dirs--; 236 HFS_SB(sb)->root_dirs--;
237 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); 237 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
238 sb->s_dirt = 1; 238 sb->s_dirt = 1;
239 return; 239 return;
240 } 240 }
241 HFS_SB(sb)->file_count--; 241 HFS_SB(sb)->file_count--;
242 if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) 242 if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
243 HFS_SB(sb)->root_files--; 243 HFS_SB(sb)->root_files--;
244 if (S_ISREG(inode->i_mode)) { 244 if (S_ISREG(inode->i_mode)) {
245 if (!inode->i_nlink) { 245 if (!inode->i_nlink) {
246 inode->i_size = 0; 246 inode->i_size = 0;
247 hfs_file_truncate(inode); 247 hfs_file_truncate(inode);
248 } 248 }
249 } 249 }
250 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); 250 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
251 sb->s_dirt = 1; 251 sb->s_dirt = 1;
252 } 252 }
253 253
254 void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, 254 void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
255 __be32 __log_size, __be32 phys_size, u32 clump_size) 255 __be32 __log_size, __be32 phys_size, u32 clump_size)
256 { 256 {
257 struct super_block *sb = inode->i_sb; 257 struct super_block *sb = inode->i_sb;
258 u32 log_size = be32_to_cpu(__log_size); 258 u32 log_size = be32_to_cpu(__log_size);
259 u16 count; 259 u16 count;
260 int i; 260 int i;
261 261
262 memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec)); 262 memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec));
263 for (count = 0, i = 0; i < 3; i++) 263 for (count = 0, i = 0; i < 3; i++)
264 count += be16_to_cpu(ext[i].count); 264 count += be16_to_cpu(ext[i].count);
265 HFS_I(inode)->first_blocks = count; 265 HFS_I(inode)->first_blocks = count;
266 266
267 inode->i_size = HFS_I(inode)->phys_size = log_size; 267 inode->i_size = HFS_I(inode)->phys_size = log_size;
268 HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 268 HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
269 inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); 269 inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits);
270 HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) / 270 HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) /
271 HFS_SB(sb)->alloc_blksz; 271 HFS_SB(sb)->alloc_blksz;
272 HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz; 272 HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz;
273 if (!HFS_I(inode)->clump_blocks) 273 if (!HFS_I(inode)->clump_blocks)
274 HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; 274 HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
275 } 275 }
276 276
277 struct hfs_iget_data { 277 struct hfs_iget_data {
278 struct hfs_cat_key *key; 278 struct hfs_cat_key *key;
279 hfs_cat_rec *rec; 279 hfs_cat_rec *rec;
280 }; 280 };
281 281
282 static int hfs_test_inode(struct inode *inode, void *data) 282 static int hfs_test_inode(struct inode *inode, void *data)
283 { 283 {
284 struct hfs_iget_data *idata = data; 284 struct hfs_iget_data *idata = data;
285 hfs_cat_rec *rec; 285 hfs_cat_rec *rec;
286 286
287 rec = idata->rec; 287 rec = idata->rec;
288 switch (rec->type) { 288 switch (rec->type) {
289 case HFS_CDR_DIR: 289 case HFS_CDR_DIR:
290 return inode->i_ino == be32_to_cpu(rec->dir.DirID); 290 return inode->i_ino == be32_to_cpu(rec->dir.DirID);
291 case HFS_CDR_FIL: 291 case HFS_CDR_FIL:
292 return inode->i_ino == be32_to_cpu(rec->file.FlNum); 292 return inode->i_ino == be32_to_cpu(rec->file.FlNum);
293 default: 293 default:
294 BUG(); 294 BUG();
295 return 1; 295 return 1;
296 } 296 }
297 } 297 }
298 298
299 /* 299 /*
300 * hfs_read_inode 300 * hfs_read_inode
301 */ 301 */
302 static int hfs_read_inode(struct inode *inode, void *data) 302 static int hfs_read_inode(struct inode *inode, void *data)
303 { 303 {
304 struct hfs_iget_data *idata = data; 304 struct hfs_iget_data *idata = data;
305 struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); 305 struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
306 hfs_cat_rec *rec; 306 hfs_cat_rec *rec;
307 307
308 HFS_I(inode)->flags = 0; 308 HFS_I(inode)->flags = 0;
309 HFS_I(inode)->rsrc_inode = NULL; 309 HFS_I(inode)->rsrc_inode = NULL;
310 mutex_init(&HFS_I(inode)->extents_lock); 310 mutex_init(&HFS_I(inode)->extents_lock);
311 INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); 311 INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
312 312
313 /* Initialize the inode */ 313 /* Initialize the inode */
314 inode->i_uid = hsb->s_uid; 314 inode->i_uid = hsb->s_uid;
315 inode->i_gid = hsb->s_gid; 315 inode->i_gid = hsb->s_gid;
316 inode->i_nlink = 1; 316 inode->i_nlink = 1;
317 317
318 if (idata->key) 318 if (idata->key)
319 HFS_I(inode)->cat_key = *idata->key; 319 HFS_I(inode)->cat_key = *idata->key;
320 else 320 else
321 HFS_I(inode)->flags |= HFS_FLG_RSRC; 321 HFS_I(inode)->flags |= HFS_FLG_RSRC;
322 HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; 322 HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60;
323 323
324 rec = idata->rec; 324 rec = idata->rec;
325 switch (rec->type) { 325 switch (rec->type) {
326 case HFS_CDR_FIL: 326 case HFS_CDR_FIL:
327 if (!HFS_IS_RSRC(inode)) { 327 if (!HFS_IS_RSRC(inode)) {
328 hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen, 328 hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen,
329 rec->file.PyLen, be16_to_cpu(rec->file.ClpSize)); 329 rec->file.PyLen, be16_to_cpu(rec->file.ClpSize));
330 } else { 330 } else {
331 hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen, 331 hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen,
332 rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize)); 332 rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize));
333 } 333 }
334 334
335 inode->i_ino = be32_to_cpu(rec->file.FlNum); 335 inode->i_ino = be32_to_cpu(rec->file.FlNum);
336 inode->i_mode = S_IRUGO | S_IXUGO; 336 inode->i_mode = S_IRUGO | S_IXUGO;
337 if (!(rec->file.Flags & HFS_FIL_LOCK)) 337 if (!(rec->file.Flags & HFS_FIL_LOCK))
338 inode->i_mode |= S_IWUGO; 338 inode->i_mode |= S_IWUGO;
339 inode->i_mode &= ~hsb->s_file_umask; 339 inode->i_mode &= ~hsb->s_file_umask;
340 inode->i_mode |= S_IFREG; 340 inode->i_mode |= S_IFREG;
341 inode->i_ctime = inode->i_atime = inode->i_mtime = 341 inode->i_ctime = inode->i_atime = inode->i_mtime =
342 hfs_m_to_utime(rec->file.MdDat); 342 hfs_m_to_utime(rec->file.MdDat);
343 inode->i_op = &hfs_file_inode_operations; 343 inode->i_op = &hfs_file_inode_operations;
344 inode->i_fop = &hfs_file_operations; 344 inode->i_fop = &hfs_file_operations;
345 inode->i_mapping->a_ops = &hfs_aops; 345 inode->i_mapping->a_ops = &hfs_aops;
346 break; 346 break;
347 case HFS_CDR_DIR: 347 case HFS_CDR_DIR:
348 inode->i_ino = be32_to_cpu(rec->dir.DirID); 348 inode->i_ino = be32_to_cpu(rec->dir.DirID);
349 inode->i_size = be16_to_cpu(rec->dir.Val) + 2; 349 inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
350 HFS_I(inode)->fs_blocks = 0; 350 HFS_I(inode)->fs_blocks = 0;
351 inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); 351 inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
352 inode->i_ctime = inode->i_atime = inode->i_mtime = 352 inode->i_ctime = inode->i_atime = inode->i_mtime =
353 hfs_m_to_utime(rec->dir.MdDat); 353 hfs_m_to_utime(rec->dir.MdDat);
354 inode->i_op = &hfs_dir_inode_operations; 354 inode->i_op = &hfs_dir_inode_operations;
355 inode->i_fop = &hfs_dir_operations; 355 inode->i_fop = &hfs_dir_operations;
356 break; 356 break;
357 default: 357 default:
358 make_bad_inode(inode); 358 make_bad_inode(inode);
359 } 359 }
360 return 0; 360 return 0;
361 } 361 }
362 362
363 /* 363 /*
364 * __hfs_iget() 364 * __hfs_iget()
365 * 365 *
366 * Given the MDB for an HFS filesystem, a 'key' and an 'entry' in 366 * Given the MDB for an HFS filesystem, a 'key' and an 'entry' in
367 * the catalog B-tree and the 'type' of the desired file, return the 367 * the catalog B-tree and the 'type' of the desired file, return the
368 * inode for that file/directory or NULL. Note that 'type' indicates 368 * inode for that file/directory or NULL. Note that 'type' indicates
369 * whether we want the actual file or directory, or the corresponding 369 * whether we want the actual file or directory, or the corresponding
370 * metadata (AppleDouble header file or CAP metadata file). 370 * metadata (AppleDouble header file or CAP metadata file).
371 */ 371 */
372 struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec) 372 struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec)
373 { 373 {
374 struct hfs_iget_data data = { key, rec }; 374 struct hfs_iget_data data = { key, rec };
375 struct inode *inode; 375 struct inode *inode;
376 u32 cnid; 376 u32 cnid;
377 377
378 switch (rec->type) { 378 switch (rec->type) {
379 case HFS_CDR_DIR: 379 case HFS_CDR_DIR:
380 cnid = be32_to_cpu(rec->dir.DirID); 380 cnid = be32_to_cpu(rec->dir.DirID);
381 break; 381 break;
382 case HFS_CDR_FIL: 382 case HFS_CDR_FIL:
383 cnid = be32_to_cpu(rec->file.FlNum); 383 cnid = be32_to_cpu(rec->file.FlNum);
384 break; 384 break;
385 default: 385 default:
386 return NULL; 386 return NULL;
387 } 387 }
388 inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); 388 inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data);
389 if (inode && (inode->i_state & I_NEW)) 389 if (inode && (inode->i_state & I_NEW))
390 unlock_new_inode(inode); 390 unlock_new_inode(inode);
391 return inode; 391 return inode;
392 } 392 }
393 393
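The lookup-or-create idiom hfs_iget() delegates to iget5_locked() (search by key; on a miss, allocate and initialize; repeat lookups return the same in-core inode) can be modeled in a few lines of plain C. Everything below is illustrative: no hashing, locking, or reference counting, and the cache is a fixed array.

#include <stdio.h>

struct inode { unsigned long ino; int used; };

static struct inode cache[8];   /* stand-in for the inode hash table */

static struct inode *iget5(int (*test)(struct inode *, void *),
                           int (*set)(struct inode *, void *), void *data)
{
        struct inode *spare = NULL;

        for (int i = 0; i < 8; i++) {
                if (cache[i].used && test(&cache[i], data))
                        return &cache[i];       /* hit: existing inode */
                if (!cache[i].used && !spare)
                        spare = &cache[i];
        }
        if (!spare)
                return NULL;                    /* cache full */
        spare->used = 1;
        set(spare, data);                       /* like hfs_read_inode() */
        return spare;
}

static int test_ino(struct inode *inode, void *data)
{
        return inode->ino == *(unsigned long *)data;
}

static int set_ino(struct inode *inode, void *data)
{
        inode->ino = *(unsigned long *)data;
        return 0;
}

int main(void)
{
        unsigned long cnid = 42;
        struct inode *a = iget5(test_ino, set_ino, &cnid);
        struct inode *b = iget5(test_ino, set_ino, &cnid);
        printf("second lookup returned the same inode: %s\n",
               a == b ? "yes" : "no");
        return 0;
}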
394 void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, 394 void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
395 __be32 *log_size, __be32 *phys_size) 395 __be32 *log_size, __be32 *phys_size)
396 { 396 {
397 memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec)); 397 memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec));
398 398
399 if (log_size) 399 if (log_size)
400 *log_size = cpu_to_be32(inode->i_size); 400 *log_size = cpu_to_be32(inode->i_size);
401 if (phys_size) 401 if (phys_size)
402 *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks * 402 *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks *
403 HFS_SB(inode->i_sb)->alloc_blksz); 403 HFS_SB(inode->i_sb)->alloc_blksz);
404 } 404 }
405 405
406 int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) 406 int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
407 { 407 {
408 struct inode *main_inode = inode; 408 struct inode *main_inode = inode;
409 struct hfs_find_data fd; 409 struct hfs_find_data fd;
410 hfs_cat_rec rec; 410 hfs_cat_rec rec;
411 411
412 dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino); 412 dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino);
413 hfs_ext_write_extent(inode); 413 hfs_ext_write_extent(inode);
414 414
415 if (inode->i_ino < HFS_FIRSTUSER_CNID) { 415 if (inode->i_ino < HFS_FIRSTUSER_CNID) {
416 switch (inode->i_ino) { 416 switch (inode->i_ino) {
417 case HFS_ROOT_CNID: 417 case HFS_ROOT_CNID:
418 break; 418 break;
419 case HFS_EXT_CNID: 419 case HFS_EXT_CNID:
420 hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree); 420 hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree);
421 return 0; 421 return 0;
422 case HFS_CAT_CNID: 422 case HFS_CAT_CNID:
423 hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree); 423 hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree);
424 return 0; 424 return 0;
425 default: 425 default:
426 BUG(); 426 BUG();
427 return -EIO; 427 return -EIO;
428 } 428 }
429 } 429 }
430 430
431 if (HFS_IS_RSRC(inode)) 431 if (HFS_IS_RSRC(inode))
432 main_inode = HFS_I(inode)->rsrc_inode; 432 main_inode = HFS_I(inode)->rsrc_inode;
433 433
434 if (!main_inode->i_nlink) 434 if (!main_inode->i_nlink)
435 return 0; 435 return 0;
436 436
437 if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd)) 437 if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd))
438 /* panic? */ 438 /* panic? */
439 return -EIO; 439 return -EIO;
440 440
441 fd.search_key->cat = HFS_I(main_inode)->cat_key; 441 fd.search_key->cat = HFS_I(main_inode)->cat_key;
442 if (hfs_brec_find(&fd)) 442 if (hfs_brec_find(&fd))
443 /* panic? */ 443 /* panic? */
444 goto out; 444 goto out;
445 445
446 if (S_ISDIR(main_inode->i_mode)) { 446 if (S_ISDIR(main_inode->i_mode)) {
447 if (fd.entrylength < sizeof(struct hfs_cat_dir)) 447 if (fd.entrylength < sizeof(struct hfs_cat_dir))
448 /* panic? */; 448 /* panic? */;
449 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, 449 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
450 sizeof(struct hfs_cat_dir)); 450 sizeof(struct hfs_cat_dir));
451 if (rec.type != HFS_CDR_DIR || 451 if (rec.type != HFS_CDR_DIR ||
452 be32_to_cpu(rec.dir.DirID) != inode->i_ino) { 452 be32_to_cpu(rec.dir.DirID) != inode->i_ino) {
453 } 453 }
454 454
455 rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime); 455 rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime);
456 rec.dir.Val = cpu_to_be16(inode->i_size - 2); 456 rec.dir.Val = cpu_to_be16(inode->i_size - 2);
457 457
458 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, 458 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
459 sizeof(struct hfs_cat_dir)); 459 sizeof(struct hfs_cat_dir));
460 } else if (HFS_IS_RSRC(inode)) { 460 } else if (HFS_IS_RSRC(inode)) {
461 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, 461 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
462 sizeof(struct hfs_cat_file)); 462 sizeof(struct hfs_cat_file));
463 hfs_inode_write_fork(inode, rec.file.RExtRec, 463 hfs_inode_write_fork(inode, rec.file.RExtRec,
464 &rec.file.RLgLen, &rec.file.RPyLen); 464 &rec.file.RLgLen, &rec.file.RPyLen);
465 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, 465 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
466 sizeof(struct hfs_cat_file)); 466 sizeof(struct hfs_cat_file));
467 } else { 467 } else {
468 if (fd.entrylength < sizeof(struct hfs_cat_file)) 468 if (fd.entrylength < sizeof(struct hfs_cat_file))
469 /* panic? */; 469 /* panic? */;
470 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, 470 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
471 sizeof(struct hfs_cat_file)); 471 sizeof(struct hfs_cat_file));
472 if (rec.type != HFS_CDR_FIL || 472 if (rec.type != HFS_CDR_FIL ||
473 be32_to_cpu(rec.file.FlNum) != inode->i_ino) { 473 be32_to_cpu(rec.file.FlNum) != inode->i_ino) {
474 } 474 }
475 475
476 if (inode->i_mode & S_IWUSR) 476 if (inode->i_mode & S_IWUSR)
477 rec.file.Flags &= ~HFS_FIL_LOCK; 477 rec.file.Flags &= ~HFS_FIL_LOCK;
478 else 478 else
479 rec.file.Flags |= HFS_FIL_LOCK; 479 rec.file.Flags |= HFS_FIL_LOCK;
480 hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); 480 hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen);
481 rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime); 481 rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime);
482 482
483 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, 483 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
484 sizeof(struct hfs_cat_file)); 484 sizeof(struct hfs_cat_file));
485 } 485 }
486 out: 486 out:
487 hfs_find_exit(&fd); 487 hfs_find_exit(&fd);
488 return 0; 488 return 0;
489 } 489 }
490 490
491 static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, 491 static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
492 struct nameidata *nd) 492 struct nameidata *nd)
493 { 493 {
494 struct inode *inode = NULL; 494 struct inode *inode = NULL;
495 hfs_cat_rec rec; 495 hfs_cat_rec rec;
496 struct hfs_find_data fd; 496 struct hfs_find_data fd;
497 int res; 497 int res;
498 498
499 if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 499 if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
500 goto out; 500 goto out;
501 501
502 inode = HFS_I(dir)->rsrc_inode; 502 inode = HFS_I(dir)->rsrc_inode;
503 if (inode) 503 if (inode)
504 goto out; 504 goto out;
505 505
506 inode = new_inode(dir->i_sb); 506 inode = new_inode(dir->i_sb);
507 if (!inode) 507 if (!inode)
508 return ERR_PTR(-ENOMEM); 508 return ERR_PTR(-ENOMEM);
509 509
510 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); 510 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
511 fd.search_key->cat = HFS_I(dir)->cat_key; 511 fd.search_key->cat = HFS_I(dir)->cat_key;
512 res = hfs_brec_read(&fd, &rec, sizeof(rec)); 512 res = hfs_brec_read(&fd, &rec, sizeof(rec));
513 if (!res) { 513 if (!res) {
514 struct hfs_iget_data idata = { NULL, &rec }; 514 struct hfs_iget_data idata = { NULL, &rec };
515 hfs_read_inode(inode, &idata); 515 hfs_read_inode(inode, &idata);
516 } 516 }
517 hfs_find_exit(&fd); 517 hfs_find_exit(&fd);
518 if (res) { 518 if (res) {
519 iput(inode); 519 iput(inode);
520 return ERR_PTR(res); 520 return ERR_PTR(res);
521 } 521 }
522 HFS_I(inode)->rsrc_inode = dir; 522 HFS_I(inode)->rsrc_inode = dir;
523 HFS_I(dir)->rsrc_inode = inode; 523 HFS_I(dir)->rsrc_inode = inode;
524 igrab(dir); 524 igrab(dir);
525 hlist_add_fake(&inode->i_hash); 525 hlist_add_fake(&inode->i_hash);
526 mark_inode_dirty(inode); 526 mark_inode_dirty(inode);
527 out: 527 out:
528 d_add(dentry, inode); 528 d_add(dentry, inode);
529 return NULL; 529 return NULL;
530 } 530 }
531 531
532 void hfs_evict_inode(struct inode *inode) 532 void hfs_evict_inode(struct inode *inode)
533 { 533 {
534 truncate_inode_pages(&inode->i_data, 0); 534 truncate_inode_pages(&inode->i_data, 0);
535 end_writeback(inode); 535 end_writeback(inode);
536 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { 536 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
537 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 537 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
538 iput(HFS_I(inode)->rsrc_inode); 538 iput(HFS_I(inode)->rsrc_inode);
539 } 539 }
540 } 540 }
541 541
542 static int hfs_file_open(struct inode *inode, struct file *file) 542 static int hfs_file_open(struct inode *inode, struct file *file)
543 { 543 {
544 if (HFS_IS_RSRC(inode)) 544 if (HFS_IS_RSRC(inode))
545 inode = HFS_I(inode)->rsrc_inode; 545 inode = HFS_I(inode)->rsrc_inode;
546 atomic_inc(&HFS_I(inode)->opencnt); 546 atomic_inc(&HFS_I(inode)->opencnt);
547 return 0; 547 return 0;
548 } 548 }
549 549
550 static int hfs_file_release(struct inode *inode, struct file *file) 550 static int hfs_file_release(struct inode *inode, struct file *file)
551 { 551 {
552 //struct super_block *sb = inode->i_sb; 552 //struct super_block *sb = inode->i_sb;
553 553
554 if (HFS_IS_RSRC(inode)) 554 if (HFS_IS_RSRC(inode))
555 inode = HFS_I(inode)->rsrc_inode; 555 inode = HFS_I(inode)->rsrc_inode;
556 if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { 556 if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
557 mutex_lock(&inode->i_mutex); 557 mutex_lock(&inode->i_mutex);
558 hfs_file_truncate(inode); 558 hfs_file_truncate(inode);
559 //if (inode->i_flags & S_DEAD) { 559 //if (inode->i_flags & S_DEAD) {
560 // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 560 // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
561 // hfs_delete_inode(inode); 561 // hfs_delete_inode(inode);
562 //} 562 //}
563 mutex_unlock(&inode->i_mutex); 563 mutex_unlock(&inode->i_mutex);
564 } 564 }
565 return 0; 565 return 0;
566 } 566 }
567 567
568 /* 568 /*
569 * hfs_inode_setattr() 569 * hfs_inode_setattr()
570 * 570 *
571 * Based very closely on fs/msdos/inode.c by Werner Almesberger 571 * Based very closely on fs/msdos/inode.c by Werner Almesberger
572 * 572 *
573 * This is the setattr() field in the inode_operations structure 573 * This is the setattr() field in the inode_operations structure
574 * for HFS file systems. The purpose is to take the changes made to 574 * for HFS file systems. The purpose is to take the changes made to
575 * an inode and apply them in a filesystem-dependent manner. In this 575 * an inode and apply them in a filesystem-dependent manner. In this
576 * case the process has a few tasks to do: 576 * case the process has a few tasks to do:
577 * 1) prevent changes to the i_uid and i_gid fields. 577 * 1) prevent changes to the i_uid and i_gid fields.
578 * 2) map file permissions to the closest allowable permissions 578 * 2) map file permissions to the closest allowable permissions
579 * 3) Since multiple Linux files can share the same on-disk inode under 579 * 3) Since multiple Linux files can share the same on-disk inode under
580 * HFS (for instance the data and resource forks of a file) a change 580 * HFS (for instance the data and resource forks of a file) a change
581 * to permissions must be applied to all other in-core inodes which 581 * to permissions must be applied to all other in-core inodes which
582 * correspond to the same HFS file. 582 * correspond to the same HFS file.
583 */ 583 */
584 584
585 int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) 585 int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
586 { 586 {
587 struct inode *inode = dentry->d_inode; 587 struct inode *inode = dentry->d_inode;
588 struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); 588 struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
589 int error; 589 int error;
590 590
591 error = inode_change_ok(inode, attr); /* basic permission checks */ 591 error = inode_change_ok(inode, attr); /* basic permission checks */
592 if (error) 592 if (error)
593 return error; 593 return error;
594 594
595 /* no uid/gid changes and limit which mode bits can be set */ 595 /* no uid/gid changes and limit which mode bits can be set */
596 if (((attr->ia_valid & ATTR_UID) && 596 if (((attr->ia_valid & ATTR_UID) &&
597 (attr->ia_uid != hsb->s_uid)) || 597 (attr->ia_uid != hsb->s_uid)) ||
598 ((attr->ia_valid & ATTR_GID) && 598 ((attr->ia_valid & ATTR_GID) &&
599 (attr->ia_gid != hsb->s_gid)) || 599 (attr->ia_gid != hsb->s_gid)) ||
600 ((attr->ia_valid & ATTR_MODE) && 600 ((attr->ia_valid & ATTR_MODE) &&
601 ((S_ISDIR(inode->i_mode) && 601 ((S_ISDIR(inode->i_mode) &&
602 (attr->ia_mode != inode->i_mode)) || 602 (attr->ia_mode != inode->i_mode)) ||
603 (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) { 603 (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) {
604 return hsb->s_quiet ? 0 : -EPERM; 604 return hsb->s_quiet ? 0 : -EPERM;
605 } 605 }
606 606
607 if (attr->ia_valid & ATTR_MODE) { 607 if (attr->ia_valid & ATTR_MODE) {
608 /* Only the 'w' bits can ever change and only all together. */ 608 /* Only the 'w' bits can ever change and only all together. */
609 if (attr->ia_mode & S_IWUSR) 609 if (attr->ia_mode & S_IWUSR)
610 attr->ia_mode = inode->i_mode | S_IWUGO; 610 attr->ia_mode = inode->i_mode | S_IWUGO;
611 else 611 else
612 attr->ia_mode = inode->i_mode & ~S_IWUGO; 612 attr->ia_mode = inode->i_mode & ~S_IWUGO;
613 attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask; 613 attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
614 } 614 }
615 615
616 if ((attr->ia_valid & ATTR_SIZE) && 616 if ((attr->ia_valid & ATTR_SIZE) &&
617 attr->ia_size != i_size_read(inode)) { 617 attr->ia_size != i_size_read(inode)) {
618 inode_dio_wait(inode);
619
618 error = vmtruncate(inode, attr->ia_size); 620 error = vmtruncate(inode, attr->ia_size);
619 if (error) 621 if (error)
620 return error; 622 return error;
621 } 623 }
622 624
623 setattr_copy(inode, attr); 625 setattr_copy(inode, attr);
624 mark_inode_dirty(inode); 626 mark_inode_dirty(inode);
625 return 0; 627 return 0;
626 } 628 }
627 629
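The hunk above is the whole HFS side of this commit: when ->setattr changes i_size, inode_dio_wait() now runs before vmtruncate(). The counterpart, per the commit message, is that the generic VFS code no longer waits beforehand. The sketch below shows the call site that moves out of notify_change() in fs/attr.c; those removed lines are not shown in this section, so treat them as a reconstruction:

	/* fs/attr.c, notify_change() -- reconstructed, not part of this hunk */
	if (ia_valid & ATTR_SIZE)
		inode_dio_wait(inode);	/* now done inside each ->setattr,
					 * where fs-specific locks can be held */
	error = inode->i_op->setattr(dentry, attr);
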
628 static int hfs_file_fsync(struct file *filp, int datasync) 630 static int hfs_file_fsync(struct file *filp, int datasync)
629 { 631 {
630 struct inode *inode = filp->f_mapping->host; 632 struct inode *inode = filp->f_mapping->host;
631 struct super_block * sb; 633 struct super_block * sb;
632 int ret, err; 634 int ret, err;
633 635
634 /* sync the inode to buffers */ 636 /* sync the inode to buffers */
635 ret = write_inode_now(inode, 0); 637 ret = write_inode_now(inode, 0);
636 638
637 /* sync the superblock to buffers */ 639 /* sync the superblock to buffers */
638 sb = inode->i_sb; 640 sb = inode->i_sb;
639 if (sb->s_dirt) { 641 if (sb->s_dirt) {
640 lock_super(sb); 642 lock_super(sb);
641 sb->s_dirt = 0; 643 sb->s_dirt = 0;
642 if (!(sb->s_flags & MS_RDONLY)) 644 if (!(sb->s_flags & MS_RDONLY))
643 hfs_mdb_commit(sb); 645 hfs_mdb_commit(sb);
644 unlock_super(sb); 646 unlock_super(sb);
645 } 647 }
646 /* .. finally sync the buffers to disk */ 648 /* .. finally sync the buffers to disk */
647 err = sync_blockdev(sb->s_bdev); 649 err = sync_blockdev(sb->s_bdev);
648 if (!ret) 650 if (!ret)
649 ret = err; 651 ret = err;
650 return ret; 652 return ret;
651 } 653 }
652 654
653 static const struct file_operations hfs_file_operations = { 655 static const struct file_operations hfs_file_operations = {
654 .llseek = generic_file_llseek, 656 .llseek = generic_file_llseek,
655 .read = do_sync_read, 657 .read = do_sync_read,
656 .aio_read = generic_file_aio_read, 658 .aio_read = generic_file_aio_read,
657 .write = do_sync_write, 659 .write = do_sync_write,
658 .aio_write = generic_file_aio_write, 660 .aio_write = generic_file_aio_write,
659 .mmap = generic_file_mmap, 661 .mmap = generic_file_mmap,
660 .splice_read = generic_file_splice_read, 662 .splice_read = generic_file_splice_read,
661 .fsync = hfs_file_fsync, 663 .fsync = hfs_file_fsync,
662 .open = hfs_file_open, 664 .open = hfs_file_open,
663 .release = hfs_file_release, 665 .release = hfs_file_release,
664 }; 666 };
665 667
666 static const struct inode_operations hfs_file_inode_operations = { 668 static const struct inode_operations hfs_file_inode_operations = {
667 .lookup = hfs_file_lookup, 669 .lookup = hfs_file_lookup,
668 .truncate = hfs_file_truncate, 670 .truncate = hfs_file_truncate,
669 .setattr = hfs_inode_setattr, 671 .setattr = hfs_inode_setattr,
670 .setxattr = hfs_setxattr, 672 .setxattr = hfs_setxattr,
671 .getxattr = hfs_getxattr, 673 .getxattr = hfs_getxattr,
672 .listxattr = hfs_listxattr, 674 .listxattr = hfs_listxattr,
673 }; 675 };
674 676
1 /* 1 /*
2 * linux/fs/hfsplus/inode.c 2 * linux/fs/hfsplus/inode.c
3 * 3 *
4 * Copyright (C) 2001 4 * Copyright (C) 2001
5 * Brad Boyer (flar@allandria.com) 5 * Brad Boyer (flar@allandria.com)
6 * (C) 2003 Ardis Technologies <roman@ardistech.com> 6 * (C) 2003 Ardis Technologies <roman@ardistech.com>
7 * 7 *
8 * Inode handling routines 8 * Inode handling routines
9 */ 9 */
10 10
11 #include <linux/blkdev.h> 11 #include <linux/blkdev.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/fs.h> 13 #include <linux/fs.h>
14 #include <linux/pagemap.h> 14 #include <linux/pagemap.h>
15 #include <linux/mpage.h> 15 #include <linux/mpage.h>
16 #include <linux/sched.h> 16 #include <linux/sched.h>
17 17
18 #include "hfsplus_fs.h" 18 #include "hfsplus_fs.h"
19 #include "hfsplus_raw.h" 19 #include "hfsplus_raw.h"
20 20
21 static int hfsplus_readpage(struct file *file, struct page *page) 21 static int hfsplus_readpage(struct file *file, struct page *page)
22 { 22 {
23 return block_read_full_page(page, hfsplus_get_block); 23 return block_read_full_page(page, hfsplus_get_block);
24 } 24 }
25 25
26 static int hfsplus_writepage(struct page *page, struct writeback_control *wbc) 26 static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
27 { 27 {
28 return block_write_full_page(page, hfsplus_get_block, wbc); 28 return block_write_full_page(page, hfsplus_get_block, wbc);
29 } 29 }
30 30
31 static int hfsplus_write_begin(struct file *file, struct address_space *mapping, 31 static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
32 loff_t pos, unsigned len, unsigned flags, 32 loff_t pos, unsigned len, unsigned flags,
33 struct page **pagep, void **fsdata) 33 struct page **pagep, void **fsdata)
34 { 34 {
35 int ret; 35 int ret;
36 36
37 *pagep = NULL; 37 *pagep = NULL;
38 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 38 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
39 hfsplus_get_block, 39 hfsplus_get_block,
40 &HFSPLUS_I(mapping->host)->phys_size); 40 &HFSPLUS_I(mapping->host)->phys_size);
41 if (unlikely(ret)) { 41 if (unlikely(ret)) {
42 loff_t isize = mapping->host->i_size; 42 loff_t isize = mapping->host->i_size;
43 if (pos + len > isize) 43 if (pos + len > isize)
44 vmtruncate(mapping->host, isize); 44 vmtruncate(mapping->host, isize);
45 } 45 }
46 46
47 return ret; 47 return ret;
48 } 48 }
49 49
50 static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) 50 static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
51 { 51 {
52 return generic_block_bmap(mapping, block, hfsplus_get_block); 52 return generic_block_bmap(mapping, block, hfsplus_get_block);
53 } 53 }
54 54
55 static int hfsplus_releasepage(struct page *page, gfp_t mask) 55 static int hfsplus_releasepage(struct page *page, gfp_t mask)
56 { 56 {
57 struct inode *inode = page->mapping->host; 57 struct inode *inode = page->mapping->host;
58 struct super_block *sb = inode->i_sb; 58 struct super_block *sb = inode->i_sb;
59 struct hfs_btree *tree; 59 struct hfs_btree *tree;
60 struct hfs_bnode *node; 60 struct hfs_bnode *node;
61 u32 nidx; 61 u32 nidx;
62 int i, res = 1; 62 int i, res = 1;
63 63
64 switch (inode->i_ino) { 64 switch (inode->i_ino) {
65 case HFSPLUS_EXT_CNID: 65 case HFSPLUS_EXT_CNID:
66 tree = HFSPLUS_SB(sb)->ext_tree; 66 tree = HFSPLUS_SB(sb)->ext_tree;
67 break; 67 break;
68 case HFSPLUS_CAT_CNID: 68 case HFSPLUS_CAT_CNID:
69 tree = HFSPLUS_SB(sb)->cat_tree; 69 tree = HFSPLUS_SB(sb)->cat_tree;
70 break; 70 break;
71 case HFSPLUS_ATTR_CNID: 71 case HFSPLUS_ATTR_CNID:
72 tree = HFSPLUS_SB(sb)->attr_tree; 72 tree = HFSPLUS_SB(sb)->attr_tree;
73 break; 73 break;
74 default: 74 default:
75 BUG(); 75 BUG();
76 return 0; 76 return 0;
77 } 77 }
78 if (!tree) 78 if (!tree)
79 return 0; 79 return 0;
80 if (tree->node_size >= PAGE_CACHE_SIZE) { 80 if (tree->node_size >= PAGE_CACHE_SIZE) {
81 nidx = page->index >> 81 nidx = page->index >>
82 (tree->node_size_shift - PAGE_CACHE_SHIFT); 82 (tree->node_size_shift - PAGE_CACHE_SHIFT);
83 spin_lock(&tree->hash_lock); 83 spin_lock(&tree->hash_lock);
84 node = hfs_bnode_findhash(tree, nidx); 84 node = hfs_bnode_findhash(tree, nidx);
85 if (!node) 85 if (!node)
86 ; 86 ;
87 else if (atomic_read(&node->refcnt)) 87 else if (atomic_read(&node->refcnt))
88 res = 0; 88 res = 0;
89 if (res && node) { 89 if (res && node) {
90 hfs_bnode_unhash(node); 90 hfs_bnode_unhash(node);
91 hfs_bnode_free(node); 91 hfs_bnode_free(node);
92 } 92 }
93 spin_unlock(&tree->hash_lock); 93 spin_unlock(&tree->hash_lock);
94 } else { 94 } else {
95 nidx = page->index << 95 nidx = page->index <<
96 (PAGE_CACHE_SHIFT - tree->node_size_shift); 96 (PAGE_CACHE_SHIFT - tree->node_size_shift);
97 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); 97 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
98 spin_lock(&tree->hash_lock); 98 spin_lock(&tree->hash_lock);
99 do { 99 do {
100 node = hfs_bnode_findhash(tree, nidx++); 100 node = hfs_bnode_findhash(tree, nidx++);
101 if (!node) 101 if (!node)
102 continue; 102 continue;
103 if (atomic_read(&node->refcnt)) { 103 if (atomic_read(&node->refcnt)) {
104 res = 0; 104 res = 0;
105 break; 105 break;
106 } 106 }
107 hfs_bnode_unhash(node); 107 hfs_bnode_unhash(node);
108 hfs_bnode_free(node); 108 hfs_bnode_free(node);
109 } while (--i && nidx < tree->node_count); 109 } while (--i && nidx < tree->node_count);
110 spin_unlock(&tree->hash_lock); 110 spin_unlock(&tree->hash_lock);
111 } 111 }
112 return res ? try_to_free_buffers(page) : 0; 112 return res ? try_to_free_buffers(page) : 0;
113 } 113 }
114 114
115 static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, 115 static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
116 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 116 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
117 { 117 {
118 struct file *file = iocb->ki_filp; 118 struct file *file = iocb->ki_filp;
119 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 119 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
120 ssize_t ret; 120 ssize_t ret;
121 121
122 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 122 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
123 offset, nr_segs, hfsplus_get_block, NULL); 123 offset, nr_segs, hfsplus_get_block, NULL);
124 124
125 /* 125 /*
126 * In case of error, an extending write may have instantiated a few 126 * In case of error, an extending write may have instantiated a few
127 * blocks outside i_size. Trim these off again. 127 * blocks outside i_size. Trim these off again.
128 */ 128 */
129 if (unlikely((rw & WRITE) && ret < 0)) { 129 if (unlikely((rw & WRITE) && ret < 0)) {
130 loff_t isize = i_size_read(inode); 130 loff_t isize = i_size_read(inode);
131 loff_t end = offset + iov_length(iov, nr_segs); 131 loff_t end = offset + iov_length(iov, nr_segs);
132 132
133 if (end > isize) 133 if (end > isize)
134 vmtruncate(inode, isize); 134 vmtruncate(inode, isize);
135 } 135 }
136 136
137 return ret; 137 return ret;
138 } 138 }
139 139
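hfsplus_direct_IO() is the reason hfsplus_setattr() below must wait: each direct I/O request holds a reference on the inode's in-flight dio count for its entire lifetime. A sketch of that counting, introduced earlier in this patch series (reconstructed here for reference, not part of this hunk):

/* fs/direct-io.c takes the reference when a request is set up: */
	atomic_inc(&inode->i_dio_count);

/* fs/inode.c drops it on completion and wakes anyone sleeping
 * in inode_dio_wait(): */
void inode_dio_done(struct inode *inode)
{
	if (atomic_dec_and_test(&inode->i_dio_count))
		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}
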
140 static int hfsplus_writepages(struct address_space *mapping, 140 static int hfsplus_writepages(struct address_space *mapping,
141 struct writeback_control *wbc) 141 struct writeback_control *wbc)
142 { 142 {
143 return mpage_writepages(mapping, wbc, hfsplus_get_block); 143 return mpage_writepages(mapping, wbc, hfsplus_get_block);
144 } 144 }
145 145
146 const struct address_space_operations hfsplus_btree_aops = { 146 const struct address_space_operations hfsplus_btree_aops = {
147 .readpage = hfsplus_readpage, 147 .readpage = hfsplus_readpage,
148 .writepage = hfsplus_writepage, 148 .writepage = hfsplus_writepage,
149 .write_begin = hfsplus_write_begin, 149 .write_begin = hfsplus_write_begin,
150 .write_end = generic_write_end, 150 .write_end = generic_write_end,
151 .bmap = hfsplus_bmap, 151 .bmap = hfsplus_bmap,
152 .releasepage = hfsplus_releasepage, 152 .releasepage = hfsplus_releasepage,
153 }; 153 };
154 154
155 const struct address_space_operations hfsplus_aops = { 155 const struct address_space_operations hfsplus_aops = {
156 .readpage = hfsplus_readpage, 156 .readpage = hfsplus_readpage,
157 .writepage = hfsplus_writepage, 157 .writepage = hfsplus_writepage,
158 .write_begin = hfsplus_write_begin, 158 .write_begin = hfsplus_write_begin,
159 .write_end = generic_write_end, 159 .write_end = generic_write_end,
160 .bmap = hfsplus_bmap, 160 .bmap = hfsplus_bmap,
161 .direct_IO = hfsplus_direct_IO, 161 .direct_IO = hfsplus_direct_IO,
162 .writepages = hfsplus_writepages, 162 .writepages = hfsplus_writepages,
163 }; 163 };
164 164
165 const struct dentry_operations hfsplus_dentry_operations = { 165 const struct dentry_operations hfsplus_dentry_operations = {
166 .d_hash = hfsplus_hash_dentry, 166 .d_hash = hfsplus_hash_dentry,
167 .d_compare = hfsplus_compare_dentry, 167 .d_compare = hfsplus_compare_dentry,
168 }; 168 };
169 169
170 static struct dentry *hfsplus_file_lookup(struct inode *dir, 170 static struct dentry *hfsplus_file_lookup(struct inode *dir,
171 struct dentry *dentry, struct nameidata *nd) 171 struct dentry *dentry, struct nameidata *nd)
172 { 172 {
173 struct hfs_find_data fd; 173 struct hfs_find_data fd;
174 struct super_block *sb = dir->i_sb; 174 struct super_block *sb = dir->i_sb;
175 struct inode *inode = NULL; 175 struct inode *inode = NULL;
176 struct hfsplus_inode_info *hip; 176 struct hfsplus_inode_info *hip;
177 int err; 177 int err;
178 178
179 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 179 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
180 goto out; 180 goto out;
181 181
182 inode = HFSPLUS_I(dir)->rsrc_inode; 182 inode = HFSPLUS_I(dir)->rsrc_inode;
183 if (inode) 183 if (inode)
184 goto out; 184 goto out;
185 185
186 inode = new_inode(sb); 186 inode = new_inode(sb);
187 if (!inode) 187 if (!inode)
188 return ERR_PTR(-ENOMEM); 188 return ERR_PTR(-ENOMEM);
189 189
190 hip = HFSPLUS_I(inode); 190 hip = HFSPLUS_I(inode);
191 inode->i_ino = dir->i_ino; 191 inode->i_ino = dir->i_ino;
192 INIT_LIST_HEAD(&hip->open_dir_list); 192 INIT_LIST_HEAD(&hip->open_dir_list);
193 mutex_init(&hip->extents_lock); 193 mutex_init(&hip->extents_lock);
194 hip->extent_state = 0; 194 hip->extent_state = 0;
195 hip->flags = 0; 195 hip->flags = 0;
196 set_bit(HFSPLUS_I_RSRC, &hip->flags); 196 set_bit(HFSPLUS_I_RSRC, &hip->flags);
197 197
198 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 198 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
199 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 199 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
200 if (!err) 200 if (!err)
201 err = hfsplus_cat_read_inode(inode, &fd); 201 err = hfsplus_cat_read_inode(inode, &fd);
202 hfs_find_exit(&fd); 202 hfs_find_exit(&fd);
203 if (err) { 203 if (err) {
204 iput(inode); 204 iput(inode);
205 return ERR_PTR(err); 205 return ERR_PTR(err);
206 } 206 }
207 hip->rsrc_inode = dir; 207 hip->rsrc_inode = dir;
208 HFSPLUS_I(dir)->rsrc_inode = inode; 208 HFSPLUS_I(dir)->rsrc_inode = inode;
209 igrab(dir); 209 igrab(dir);
210 210
211 /* 211 /*
212 * __mark_inode_dirty expects inodes to be hashed. Since we don't 212 * __mark_inode_dirty expects inodes to be hashed. Since we don't
213 * want resource fork inodes in the regular inode space, we make them 213 * want resource fork inodes in the regular inode space, we make them
214 * appear hashed, but do not put them on any lists. hlist_del() 214 * appear hashed, but do not put them on any lists. hlist_del()
215 * will work fine and require no locking. 215 * will work fine and require no locking.
216 */ 216 */
217 hlist_add_fake(&inode->i_hash); 217 hlist_add_fake(&inode->i_hash);
218 218
219 mark_inode_dirty(inode); 219 mark_inode_dirty(inode);
220 out: 220 out:
221 d_add(dentry, inode); 221 d_add(dentry, inode);
222 return NULL; 222 return NULL;
223 } 223 }
224 224
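For reference, hlist_add_fake() as used above is a one-liner in <linux/list.h>: it points the node's pprev at its own next field, so the inode looks hashed to __mark_inode_dirty() and a later hlist_del() degenerates into a harmless self-unlink that needs no locking:

static inline void hlist_add_fake(struct hlist_node *n)
{
	n->pprev = &n->next;
}
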
225 static void hfsplus_get_perms(struct inode *inode, 225 static void hfsplus_get_perms(struct inode *inode,
226 struct hfsplus_perm *perms, int dir) 226 struct hfsplus_perm *perms, int dir)
227 { 227 {
228 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); 228 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
229 u16 mode; 229 u16 mode;
230 230
231 mode = be16_to_cpu(perms->mode); 231 mode = be16_to_cpu(perms->mode);
232 232
233 inode->i_uid = be32_to_cpu(perms->owner); 233 inode->i_uid = be32_to_cpu(perms->owner);
234 if (!inode->i_uid && !mode) 234 if (!inode->i_uid && !mode)
235 inode->i_uid = sbi->uid; 235 inode->i_uid = sbi->uid;
236 236
237 inode->i_gid = be32_to_cpu(perms->group); 237 inode->i_gid = be32_to_cpu(perms->group);
238 if (!inode->i_gid && !mode) 238 if (!inode->i_gid && !mode)
239 inode->i_gid = sbi->gid; 239 inode->i_gid = sbi->gid;
240 240
241 if (dir) { 241 if (dir) {
242 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); 242 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
243 mode |= S_IFDIR; 243 mode |= S_IFDIR;
244 } else if (!mode) 244 } else if (!mode)
245 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); 245 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
246 inode->i_mode = mode; 246 inode->i_mode = mode;
247 247
248 HFSPLUS_I(inode)->userflags = perms->userflags; 248 HFSPLUS_I(inode)->userflags = perms->userflags;
249 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 249 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
250 inode->i_flags |= S_IMMUTABLE; 250 inode->i_flags |= S_IMMUTABLE;
251 else 251 else
252 inode->i_flags &= ~S_IMMUTABLE; 252 inode->i_flags &= ~S_IMMUTABLE;
253 if (perms->rootflags & HFSPLUS_FLG_APPEND) 253 if (perms->rootflags & HFSPLUS_FLG_APPEND)
254 inode->i_flags |= S_APPEND; 254 inode->i_flags |= S_APPEND;
255 else 255 else
256 inode->i_flags &= ~S_APPEND; 256 inode->i_flags &= ~S_APPEND;
257 } 257 }
258 258
259 static int hfsplus_file_open(struct inode *inode, struct file *file) 259 static int hfsplus_file_open(struct inode *inode, struct file *file)
260 { 260 {
261 if (HFSPLUS_IS_RSRC(inode)) 261 if (HFSPLUS_IS_RSRC(inode))
262 inode = HFSPLUS_I(inode)->rsrc_inode; 262 inode = HFSPLUS_I(inode)->rsrc_inode;
263 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 263 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
264 return -EOVERFLOW; 264 return -EOVERFLOW;
265 atomic_inc(&HFSPLUS_I(inode)->opencnt); 265 atomic_inc(&HFSPLUS_I(inode)->opencnt);
266 return 0; 266 return 0;
267 } 267 }
268 268
269 static int hfsplus_file_release(struct inode *inode, struct file *file) 269 static int hfsplus_file_release(struct inode *inode, struct file *file)
270 { 270 {
271 struct super_block *sb = inode->i_sb; 271 struct super_block *sb = inode->i_sb;
272 272
273 if (HFSPLUS_IS_RSRC(inode)) 273 if (HFSPLUS_IS_RSRC(inode))
274 inode = HFSPLUS_I(inode)->rsrc_inode; 274 inode = HFSPLUS_I(inode)->rsrc_inode;
275 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { 275 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
276 mutex_lock(&inode->i_mutex); 276 mutex_lock(&inode->i_mutex);
277 hfsplus_file_truncate(inode); 277 hfsplus_file_truncate(inode);
278 if (inode->i_flags & S_DEAD) { 278 if (inode->i_flags & S_DEAD) {
279 hfsplus_delete_cat(inode->i_ino, 279 hfsplus_delete_cat(inode->i_ino,
280 HFSPLUS_SB(sb)->hidden_dir, NULL); 280 HFSPLUS_SB(sb)->hidden_dir, NULL);
281 hfsplus_delete_inode(inode); 281 hfsplus_delete_inode(inode);
282 } 282 }
283 mutex_unlock(&inode->i_mutex); 283 mutex_unlock(&inode->i_mutex);
284 } 284 }
285 return 0; 285 return 0;
286 } 286 }
287 287
288 static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) 288 static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
289 { 289 {
290 struct inode *inode = dentry->d_inode; 290 struct inode *inode = dentry->d_inode;
291 int error; 291 int error;
292 292
293 error = inode_change_ok(inode, attr); 293 error = inode_change_ok(inode, attr);
294 if (error) 294 if (error)
295 return error; 295 return error;
296 296
297 if ((attr->ia_valid & ATTR_SIZE) && 297 if ((attr->ia_valid & ATTR_SIZE) &&
298 attr->ia_size != i_size_read(inode)) { 298 attr->ia_size != i_size_read(inode)) {
299 inode_dio_wait(inode);
300
299 error = vmtruncate(inode, attr->ia_size); 301 error = vmtruncate(inode, attr->ia_size);
300 if (error) 302 if (error)
301 return error; 303 return error;
302 } 304 }
303 305
304 setattr_copy(inode, attr); 306 setattr_copy(inode, attr);
305 mark_inode_dirty(inode); 307 mark_inode_dirty(inode);
306 return 0; 308 return 0;
307 } 309 }
308 310
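hfsplus_setattr() gains the same inode_dio_wait() call as the HFS variant above. The helper itself lives in fs/inode.c and sleeps until i_dio_count drops to zero; the version below is reconstructed from the same patch series for reference and is not part of this diff:

static void __inode_dio_wait(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	do {
		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&inode->i_dio_count))
			schedule();
	} while (atomic_read(&inode->i_dio_count));
	finish_wait(wq, &q.wait);
}

void inode_dio_wait(struct inode *inode)
{
	if (atomic_read(&inode->i_dio_count))
		__inode_dio_wait(inode);
}
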
309 int hfsplus_file_fsync(struct file *file, int datasync) 311 int hfsplus_file_fsync(struct file *file, int datasync)
310 { 312 {
311 struct inode *inode = file->f_mapping->host; 313 struct inode *inode = file->f_mapping->host;
312 struct hfsplus_inode_info *hip = HFSPLUS_I(inode); 314 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
313 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); 315 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
314 int error = 0, error2; 316 int error = 0, error2;
315 317
316 /* 318 /*
317 * Sync inode metadata into the catalog and extent trees. 319 * Sync inode metadata into the catalog and extent trees.
318 */ 320 */
319 sync_inode_metadata(inode, 1); 321 sync_inode_metadata(inode, 1);
320 322
321 /* 323 /*
322 * And explicitly write out the btrees. 324 * And explicitly write out the btrees.
323 */ 325 */
324 if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags)) 326 if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
325 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); 327 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
326 328
327 if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) { 329 if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
328 error2 = 330 error2 =
329 filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); 331 filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
330 if (!error) 332 if (!error)
331 error = error2; 333 error = error2;
332 } 334 }
333 335
334 if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { 336 if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
335 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); 337 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
336 if (!error) 338 if (!error)
337 error = error2; 339 error = error2;
338 } 340 }
339 341
340 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) 342 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
341 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 343 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
342 344
343 return error; 345 return error;
344 } 346 }
345 347
346 static const struct inode_operations hfsplus_file_inode_operations = { 348 static const struct inode_operations hfsplus_file_inode_operations = {
347 .lookup = hfsplus_file_lookup, 349 .lookup = hfsplus_file_lookup,
348 .truncate = hfsplus_file_truncate, 350 .truncate = hfsplus_file_truncate,
349 .setattr = hfsplus_setattr, 351 .setattr = hfsplus_setattr,
350 .setxattr = hfsplus_setxattr, 352 .setxattr = hfsplus_setxattr,
351 .getxattr = hfsplus_getxattr, 353 .getxattr = hfsplus_getxattr,
352 .listxattr = hfsplus_listxattr, 354 .listxattr = hfsplus_listxattr,
353 }; 355 };
354 356
355 static const struct file_operations hfsplus_file_operations = { 357 static const struct file_operations hfsplus_file_operations = {
356 .llseek = generic_file_llseek, 358 .llseek = generic_file_llseek,
357 .read = do_sync_read, 359 .read = do_sync_read,
358 .aio_read = generic_file_aio_read, 360 .aio_read = generic_file_aio_read,
359 .write = do_sync_write, 361 .write = do_sync_write,
360 .aio_write = generic_file_aio_write, 362 .aio_write = generic_file_aio_write,
361 .mmap = generic_file_mmap, 363 .mmap = generic_file_mmap,
362 .splice_read = generic_file_splice_read, 364 .splice_read = generic_file_splice_read,
363 .fsync = hfsplus_file_fsync, 365 .fsync = hfsplus_file_fsync,
364 .open = hfsplus_file_open, 366 .open = hfsplus_file_open,
365 .release = hfsplus_file_release, 367 .release = hfsplus_file_release,
366 .unlocked_ioctl = hfsplus_ioctl, 368 .unlocked_ioctl = hfsplus_ioctl,
367 }; 369 };
368 370
369 struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 371 struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
370 { 372 {
371 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); 373 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
372 struct inode *inode = new_inode(sb); 374 struct inode *inode = new_inode(sb);
373 struct hfsplus_inode_info *hip; 375 struct hfsplus_inode_info *hip;
374 376
375 if (!inode) 377 if (!inode)
376 return NULL; 378 return NULL;
377 379
378 inode->i_ino = sbi->next_cnid++; 380 inode->i_ino = sbi->next_cnid++;
379 inode->i_mode = mode; 381 inode->i_mode = mode;
380 inode->i_uid = current_fsuid(); 382 inode->i_uid = current_fsuid();
381 inode->i_gid = current_fsgid(); 383 inode->i_gid = current_fsgid();
382 inode->i_nlink = 1; 384 inode->i_nlink = 1;
383 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 385 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
384 386
385 hip = HFSPLUS_I(inode); 387 hip = HFSPLUS_I(inode);
386 INIT_LIST_HEAD(&hip->open_dir_list); 388 INIT_LIST_HEAD(&hip->open_dir_list);
387 mutex_init(&hip->extents_lock); 389 mutex_init(&hip->extents_lock);
388 atomic_set(&hip->opencnt, 0); 390 atomic_set(&hip->opencnt, 0);
389 hip->extent_state = 0; 391 hip->extent_state = 0;
390 hip->flags = 0; 392 hip->flags = 0;
391 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); 393 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
392 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); 394 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
393 hip->alloc_blocks = 0; 395 hip->alloc_blocks = 0;
394 hip->first_blocks = 0; 396 hip->first_blocks = 0;
395 hip->cached_start = 0; 397 hip->cached_start = 0;
396 hip->cached_blocks = 0; 398 hip->cached_blocks = 0;
397 hip->phys_size = 0; 399 hip->phys_size = 0;
398 hip->fs_blocks = 0; 400 hip->fs_blocks = 0;
399 hip->rsrc_inode = NULL; 401 hip->rsrc_inode = NULL;
400 if (S_ISDIR(inode->i_mode)) { 402 if (S_ISDIR(inode->i_mode)) {
401 inode->i_size = 2; 403 inode->i_size = 2;
402 sbi->folder_count++; 404 sbi->folder_count++;
403 inode->i_op = &hfsplus_dir_inode_operations; 405 inode->i_op = &hfsplus_dir_inode_operations;
404 inode->i_fop = &hfsplus_dir_operations; 406 inode->i_fop = &hfsplus_dir_operations;
405 } else if (S_ISREG(inode->i_mode)) { 407 } else if (S_ISREG(inode->i_mode)) {
406 sbi->file_count++; 408 sbi->file_count++;
407 inode->i_op = &hfsplus_file_inode_operations; 409 inode->i_op = &hfsplus_file_inode_operations;
408 inode->i_fop = &hfsplus_file_operations; 410 inode->i_fop = &hfsplus_file_operations;
409 inode->i_mapping->a_ops = &hfsplus_aops; 411 inode->i_mapping->a_ops = &hfsplus_aops;
410 hip->clump_blocks = sbi->data_clump_blocks; 412 hip->clump_blocks = sbi->data_clump_blocks;
411 } else if (S_ISLNK(inode->i_mode)) { 413 } else if (S_ISLNK(inode->i_mode)) {
412 sbi->file_count++; 414 sbi->file_count++;
413 inode->i_op = &page_symlink_inode_operations; 415 inode->i_op = &page_symlink_inode_operations;
414 inode->i_mapping->a_ops = &hfsplus_aops; 416 inode->i_mapping->a_ops = &hfsplus_aops;
415 hip->clump_blocks = 1; 417 hip->clump_blocks = 1;
416 } else 418 } else
417 sbi->file_count++; 419 sbi->file_count++;
418 insert_inode_hash(inode); 420 insert_inode_hash(inode);
419 mark_inode_dirty(inode); 421 mark_inode_dirty(inode);
420 sb->s_dirt = 1; 422 sb->s_dirt = 1;
421 423
422 return inode; 424 return inode;
423 } 425 }
424 426
425 void hfsplus_delete_inode(struct inode *inode) 427 void hfsplus_delete_inode(struct inode *inode)
426 { 428 {
427 struct super_block *sb = inode->i_sb; 429 struct super_block *sb = inode->i_sb;
428 430
429 if (S_ISDIR(inode->i_mode)) { 431 if (S_ISDIR(inode->i_mode)) {
430 HFSPLUS_SB(sb)->folder_count--; 432 HFSPLUS_SB(sb)->folder_count--;
431 sb->s_dirt = 1; 433 sb->s_dirt = 1;
432 return; 434 return;
433 } 435 }
434 HFSPLUS_SB(sb)->file_count--; 436 HFSPLUS_SB(sb)->file_count--;
435 if (S_ISREG(inode->i_mode)) { 437 if (S_ISREG(inode->i_mode)) {
436 if (!inode->i_nlink) { 438 if (!inode->i_nlink) {
437 inode->i_size = 0; 439 inode->i_size = 0;
438 hfsplus_file_truncate(inode); 440 hfsplus_file_truncate(inode);
439 } 441 }
440 } else if (S_ISLNK(inode->i_mode)) { 442 } else if (S_ISLNK(inode->i_mode)) {
441 inode->i_size = 0; 443 inode->i_size = 0;
442 hfsplus_file_truncate(inode); 444 hfsplus_file_truncate(inode);
443 } 445 }
444 sb->s_dirt = 1; 446 sb->s_dirt = 1;
445 } 447 }
446 448
447 void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 449 void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
448 { 450 {
449 struct super_block *sb = inode->i_sb; 451 struct super_block *sb = inode->i_sb;
450 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); 452 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
451 struct hfsplus_inode_info *hip = HFSPLUS_I(inode); 453 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
452 u32 count; 454 u32 count;
453 int i; 455 int i;
454 456
455 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); 457 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
456 for (count = 0, i = 0; i < 8; i++) 458 for (count = 0, i = 0; i < 8; i++)
457 count += be32_to_cpu(fork->extents[i].block_count); 459 count += be32_to_cpu(fork->extents[i].block_count);
458 hip->first_blocks = count; 460 hip->first_blocks = count;
459 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); 461 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
460 hip->cached_start = 0; 462 hip->cached_start = 0;
461 hip->cached_blocks = 0; 463 hip->cached_blocks = 0;
462 464
463 hip->alloc_blocks = be32_to_cpu(fork->total_blocks); 465 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
464 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); 466 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
465 hip->fs_blocks = 467 hip->fs_blocks =
466 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 468 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
467 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); 469 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
468 hip->clump_blocks = 470 hip->clump_blocks =
469 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; 471 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
470 if (!hip->clump_blocks) { 472 if (!hip->clump_blocks) {
471 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? 473 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
472 sbi->rsrc_clump_blocks : 474 sbi->rsrc_clump_blocks :
473 sbi->data_clump_blocks; 475 sbi->data_clump_blocks;
474 } 476 }
475 } 477 }
476 478
477 void hfsplus_inode_write_fork(struct inode *inode, 479 void hfsplus_inode_write_fork(struct inode *inode,
478 struct hfsplus_fork_raw *fork) 480 struct hfsplus_fork_raw *fork)
479 { 481 {
480 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, 482 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
481 sizeof(hfsplus_extent_rec)); 483 sizeof(hfsplus_extent_rec));
482 fork->total_size = cpu_to_be64(inode->i_size); 484 fork->total_size = cpu_to_be64(inode->i_size);
483 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); 485 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
484 } 486 }
485 487
486 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 488 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
487 { 489 {
488 hfsplus_cat_entry entry; 490 hfsplus_cat_entry entry;
489 int res = 0; 491 int res = 0;
490 u16 type; 492 u16 type;
491 493
492 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 494 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
493 495
494 HFSPLUS_I(inode)->linkid = 0; 496 HFSPLUS_I(inode)->linkid = 0;
495 if (type == HFSPLUS_FOLDER) { 497 if (type == HFSPLUS_FOLDER) {
496 struct hfsplus_cat_folder *folder = &entry.folder; 498 struct hfsplus_cat_folder *folder = &entry.folder;
497 499
498 if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) 500 if (fd->entrylength < sizeof(struct hfsplus_cat_folder))
499 /* panic? */; 501 /* panic? */;
500 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 502 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
501 sizeof(struct hfsplus_cat_folder)); 503 sizeof(struct hfsplus_cat_folder));
502 hfsplus_get_perms(inode, &folder->permissions, 1); 504 hfsplus_get_perms(inode, &folder->permissions, 1);
503 inode->i_nlink = 1; 505 inode->i_nlink = 1;
504 inode->i_size = 2 + be32_to_cpu(folder->valence); 506 inode->i_size = 2 + be32_to_cpu(folder->valence);
505 inode->i_atime = hfsp_mt2ut(folder->access_date); 507 inode->i_atime = hfsp_mt2ut(folder->access_date);
506 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 508 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
507 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 509 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
508 HFSPLUS_I(inode)->create_date = folder->create_date; 510 HFSPLUS_I(inode)->create_date = folder->create_date;
509 HFSPLUS_I(inode)->fs_blocks = 0; 511 HFSPLUS_I(inode)->fs_blocks = 0;
510 inode->i_op = &hfsplus_dir_inode_operations; 512 inode->i_op = &hfsplus_dir_inode_operations;
511 inode->i_fop = &hfsplus_dir_operations; 513 inode->i_fop = &hfsplus_dir_operations;
512 } else if (type == HFSPLUS_FILE) { 514 } else if (type == HFSPLUS_FILE) {
513 struct hfsplus_cat_file *file = &entry.file; 515 struct hfsplus_cat_file *file = &entry.file;
514 516
515 if (fd->entrylength < sizeof(struct hfsplus_cat_file)) 517 if (fd->entrylength < sizeof(struct hfsplus_cat_file))
516 /* panic? */; 518 /* panic? */;
517 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 519 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
518 sizeof(struct hfsplus_cat_file)); 520 sizeof(struct hfsplus_cat_file));
519 521
520 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 522 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
521 &file->rsrc_fork : &file->data_fork); 523 &file->rsrc_fork : &file->data_fork);
522 hfsplus_get_perms(inode, &file->permissions, 0); 524 hfsplus_get_perms(inode, &file->permissions, 0);
523 inode->i_nlink = 1; 525 inode->i_nlink = 1;
524 if (S_ISREG(inode->i_mode)) { 526 if (S_ISREG(inode->i_mode)) {
525 if (file->permissions.dev) 527 if (file->permissions.dev)
526 inode->i_nlink = 528 inode->i_nlink =
527 be32_to_cpu(file->permissions.dev); 529 be32_to_cpu(file->permissions.dev);
528 inode->i_op = &hfsplus_file_inode_operations; 530 inode->i_op = &hfsplus_file_inode_operations;
529 inode->i_fop = &hfsplus_file_operations; 531 inode->i_fop = &hfsplus_file_operations;
530 inode->i_mapping->a_ops = &hfsplus_aops; 532 inode->i_mapping->a_ops = &hfsplus_aops;
531 } else if (S_ISLNK(inode->i_mode)) { 533 } else if (S_ISLNK(inode->i_mode)) {
532 inode->i_op = &page_symlink_inode_operations; 534 inode->i_op = &page_symlink_inode_operations;
533 inode->i_mapping->a_ops = &hfsplus_aops; 535 inode->i_mapping->a_ops = &hfsplus_aops;
534 } else { 536 } else {
535 init_special_inode(inode, inode->i_mode, 537 init_special_inode(inode, inode->i_mode,
536 be32_to_cpu(file->permissions.dev)); 538 be32_to_cpu(file->permissions.dev));
537 } 539 }
538 inode->i_atime = hfsp_mt2ut(file->access_date); 540 inode->i_atime = hfsp_mt2ut(file->access_date);
539 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 541 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
540 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 542 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
541 HFSPLUS_I(inode)->create_date = file->create_date; 543 HFSPLUS_I(inode)->create_date = file->create_date;
542 } else { 544 } else {
543 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 545 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
544 res = -EIO; 546 res = -EIO;
545 } 547 }
546 return res; 548 return res;
547 } 549 }
548 550
549 int hfsplus_cat_write_inode(struct inode *inode) 551 int hfsplus_cat_write_inode(struct inode *inode)
550 { 552 {
551 struct inode *main_inode = inode; 553 struct inode *main_inode = inode;
552 struct hfs_find_data fd; 554 struct hfs_find_data fd;
553 hfsplus_cat_entry entry; 555 hfsplus_cat_entry entry;
554 556
555 if (HFSPLUS_IS_RSRC(inode)) 557 if (HFSPLUS_IS_RSRC(inode))
556 main_inode = HFSPLUS_I(inode)->rsrc_inode; 558 main_inode = HFSPLUS_I(inode)->rsrc_inode;
557 559
558 if (!main_inode->i_nlink) 560 if (!main_inode->i_nlink)
559 return 0; 561 return 0;
560 562
561 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) 563 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
562 /* panic? */ 564 /* panic? */
563 return -EIO; 565 return -EIO;
564 566
565 if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd)) 567 if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd))
566 /* panic? */ 568 /* panic? */
567 goto out; 569 goto out;
568 570
569 if (S_ISDIR(main_inode->i_mode)) { 571 if (S_ISDIR(main_inode->i_mode)) {
570 struct hfsplus_cat_folder *folder = &entry.folder; 572 struct hfsplus_cat_folder *folder = &entry.folder;
571 573
572 if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) 574 if (fd.entrylength < sizeof(struct hfsplus_cat_folder))
573 /* panic? */; 575 /* panic? */;
574 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
575 sizeof(struct hfsplus_cat_folder)); 577 sizeof(struct hfsplus_cat_folder));
576 /* simple node checks? */ 578 /* simple node checks? */
577 hfsplus_cat_set_perms(inode, &folder->permissions); 579 hfsplus_cat_set_perms(inode, &folder->permissions);
578 folder->access_date = hfsp_ut2mt(inode->i_atime); 580 folder->access_date = hfsp_ut2mt(inode->i_atime);
579 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 581 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
580 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 582 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
581 folder->valence = cpu_to_be32(inode->i_size - 2); 583 folder->valence = cpu_to_be32(inode->i_size - 2);
582 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 584 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
583 sizeof(struct hfsplus_cat_folder)); 585 sizeof(struct hfsplus_cat_folder));
584 } else if (HFSPLUS_IS_RSRC(inode)) { 586 } else if (HFSPLUS_IS_RSRC(inode)) {
585 struct hfsplus_cat_file *file = &entry.file; 587 struct hfsplus_cat_file *file = &entry.file;
586 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 588 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
587 sizeof(struct hfsplus_cat_file)); 589 sizeof(struct hfsplus_cat_file));
588 hfsplus_inode_write_fork(inode, &file->rsrc_fork); 590 hfsplus_inode_write_fork(inode, &file->rsrc_fork);
589 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 591 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
590 sizeof(struct hfsplus_cat_file)); 592 sizeof(struct hfsplus_cat_file));
591 } else { 593 } else {
592 struct hfsplus_cat_file *file = &entry.file; 594 struct hfsplus_cat_file *file = &entry.file;
593 595
594 if (fd.entrylength < sizeof(struct hfsplus_cat_file)) 596 if (fd.entrylength < sizeof(struct hfsplus_cat_file))
595 /* panic? */; 597 /* panic? */;
596 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 598 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
597 sizeof(struct hfsplus_cat_file)); 599 sizeof(struct hfsplus_cat_file));
598 hfsplus_inode_write_fork(inode, &file->data_fork); 600 hfsplus_inode_write_fork(inode, &file->data_fork);
599 hfsplus_cat_set_perms(inode, &file->permissions); 601 hfsplus_cat_set_perms(inode, &file->permissions);
600 if (HFSPLUS_FLG_IMMUTABLE & 602 if (HFSPLUS_FLG_IMMUTABLE &
601 (file->permissions.rootflags | 603 (file->permissions.rootflags |
602 file->permissions.userflags)) 604 file->permissions.userflags))
603 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 605 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
604 else 606 else
605 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); 607 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
606 file->access_date = hfsp_ut2mt(inode->i_atime); 608 file->access_date = hfsp_ut2mt(inode->i_atime);
607 file->content_mod_date = hfsp_ut2mt(inode->i_mtime); 609 file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
608 file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 610 file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
609 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 611 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
610 sizeof(struct hfsplus_cat_file)); 612 sizeof(struct hfsplus_cat_file));
611 } 613 }
612 614
613 set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); 615 set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
614 out: 616 out:
615 hfs_find_exit(&fd); 617 hfs_find_exit(&fd);
616 return 0; 618 return 0;
617 } 619 }
618 620
1 /* 1 /*
2 * Copyright (C) International Business Machines Corp., 2000-2002 2 * Copyright (C) International Business Machines Corp., 2000-2002
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002 3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or 7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version. 8 * (at your option) any later version.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details. 13 * the GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/quotaops.h> 22 #include <linux/quotaops.h>
23 #include "jfs_incore.h" 23 #include "jfs_incore.h"
24 #include "jfs_inode.h" 24 #include "jfs_inode.h"
25 #include "jfs_dmap.h" 25 #include "jfs_dmap.h"
26 #include "jfs_txnmgr.h" 26 #include "jfs_txnmgr.h"
27 #include "jfs_xattr.h" 27 #include "jfs_xattr.h"
28 #include "jfs_acl.h" 28 #include "jfs_acl.h"
29 #include "jfs_debug.h" 29 #include "jfs_debug.h"
30 30
31 int jfs_fsync(struct file *file, int datasync) 31 int jfs_fsync(struct file *file, int datasync)
32 { 32 {
33 struct inode *inode = file->f_mapping->host; 33 struct inode *inode = file->f_mapping->host;
34 int rc = 0; 34 int rc = 0;
35 35
36 if (!(inode->i_state & I_DIRTY) || 36 if (!(inode->i_state & I_DIRTY) ||
37 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { 37 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
38 /* Make sure committed changes hit the disk */ 38 /* Make sure committed changes hit the disk */
39 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); 39 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
40 return rc; 40 return rc;
41 } 41 }
42 42
43 rc |= jfs_commit_inode(inode, 1); 43 rc |= jfs_commit_inode(inode, 1);
44 44
45 return rc ? -EIO : 0; 45 return rc ? -EIO : 0;
46 } 46 }
47 47
48 static int jfs_open(struct inode *inode, struct file *file) 48 static int jfs_open(struct inode *inode, struct file *file)
49 { 49 {
50 int rc; 50 int rc;
51 51
52 if ((rc = dquot_file_open(inode, file))) 52 if ((rc = dquot_file_open(inode, file)))
53 return rc; 53 return rc;
54 54
55 /* 55 /*
56 * We attempt to allow only one "active" file open per aggregate 56 * We attempt to allow only one "active" file open per aggregate
57 * group. Otherwise, appending to files in parallel can cause 57 * group. Otherwise, appending to files in parallel can cause
58 * fragmentation within the files. 58 * fragmentation within the files.
59 * 59 *
60 * If the file is empty, it was probably just created and going 60 * If the file is empty, it was probably just created and going
61 * to be written to. If it has a size, we'll hold off until the 61 * to be written to. If it has a size, we'll hold off until the
62 * file is actually grown. 62 * file is actually grown.
63 */ 63 */
64 if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && 64 if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE &&
65 (inode->i_size == 0)) { 65 (inode->i_size == 0)) {
66 struct jfs_inode_info *ji = JFS_IP(inode); 66 struct jfs_inode_info *ji = JFS_IP(inode);
67 spin_lock_irq(&ji->ag_lock); 67 spin_lock_irq(&ji->ag_lock);
68 if (ji->active_ag == -1) { 68 if (ji->active_ag == -1) {
69 struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); 69 struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb);
70 ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); 70 ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb);
71 atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); 71 atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]);
72 } 72 }
73 spin_unlock_irq(&ji->ag_lock); 73 spin_unlock_irq(&ji->ag_lock);
74 } 74 }
75 75
76 return 0; 76 return 0;
77 } 77 }
78 static int jfs_release(struct inode *inode, struct file *file) 78 static int jfs_release(struct inode *inode, struct file *file)
79 { 79 {
80 struct jfs_inode_info *ji = JFS_IP(inode); 80 struct jfs_inode_info *ji = JFS_IP(inode);
81 81
82 spin_lock_irq(&ji->ag_lock); 82 spin_lock_irq(&ji->ag_lock);
83 if (ji->active_ag != -1) { 83 if (ji->active_ag != -1) {
84 struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; 84 struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
85 atomic_dec(&bmap->db_active[ji->active_ag]); 85 atomic_dec(&bmap->db_active[ji->active_ag]);
86 ji->active_ag = -1; 86 ji->active_ag = -1;
87 } 87 }
88 spin_unlock_irq(&ji->ag_lock); 88 spin_unlock_irq(&ji->ag_lock);
89 89
90 return 0; 90 return 0;
91 } 91 }
92 92
93 int jfs_setattr(struct dentry *dentry, struct iattr *iattr) 93 int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
94 { 94 {
95 struct inode *inode = dentry->d_inode; 95 struct inode *inode = dentry->d_inode;
96 int rc; 96 int rc;
97 97
98 rc = inode_change_ok(inode, iattr); 98 rc = inode_change_ok(inode, iattr);
99 if (rc) 99 if (rc)
100 return rc; 100 return rc;
101 101
102 if (is_quota_modification(inode, iattr)) 102 if (is_quota_modification(inode, iattr))
103 dquot_initialize(inode); 103 dquot_initialize(inode);
104 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 104 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
105 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 105 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
106 rc = dquot_transfer(inode, iattr); 106 rc = dquot_transfer(inode, iattr);
107 if (rc) 107 if (rc)
108 return rc; 108 return rc;
109 } 109 }
110 110
111 if ((iattr->ia_valid & ATTR_SIZE) && 111 if ((iattr->ia_valid & ATTR_SIZE) &&
112 iattr->ia_size != i_size_read(inode)) { 112 iattr->ia_size != i_size_read(inode)) {
113 inode_dio_wait(inode);
114
113 rc = vmtruncate(inode, iattr->ia_size); 115 rc = vmtruncate(inode, iattr->ia_size);
114 if (rc) 116 if (rc)
115 return rc; 117 return rc;
116 } 118 }
117 119
118 setattr_copy(inode, iattr); 120 setattr_copy(inode, iattr);
119 mark_inode_dirty(inode); 121 mark_inode_dirty(inode);
120 122
121 if (iattr->ia_valid & ATTR_MODE) 123 if (iattr->ia_valid & ATTR_MODE)
122 rc = jfs_acl_chmod(inode); 124 rc = jfs_acl_chmod(inode);
123 return rc; 125 return rc;
124 } 126 }
125 127
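jfs_setattr() is the third ->setattr in this section to gain the wait. The payoff of moving it out of the generic path is that a filesystem can drain direct I/O while already holding a lock that its own dio submission path also takes, closing the window in which a new dio reference could appear before truncation. A hypothetical illustration (EXAMPLE_I and io_lock are invented names, not taken from any of these filesystems):

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error = 0;

	mutex_lock(&EXAMPLE_I(inode)->io_lock);	/* blocks new dio submitters */
	inode_dio_wait(inode);			/* drain outstanding requests */
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode))
		error = vmtruncate(inode, attr->ia_size);
	mutex_unlock(&EXAMPLE_I(inode)->io_lock);
	return error;
}
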
126 const struct inode_operations jfs_file_inode_operations = { 128 const struct inode_operations jfs_file_inode_operations = {
127 .truncate = jfs_truncate, 129 .truncate = jfs_truncate,
128 .setxattr = jfs_setxattr, 130 .setxattr = jfs_setxattr,
129 .getxattr = jfs_getxattr, 131 .getxattr = jfs_getxattr,
130 .listxattr = jfs_listxattr, 132 .listxattr = jfs_listxattr,
131 .removexattr = jfs_removexattr, 133 .removexattr = jfs_removexattr,
132 .setattr = jfs_setattr, 134 .setattr = jfs_setattr,
133 #ifdef CONFIG_JFS_POSIX_ACL 135 #ifdef CONFIG_JFS_POSIX_ACL
134 .check_acl = jfs_check_acl, 136 .check_acl = jfs_check_acl,
135 #endif 137 #endif
136 }; 138 };
137 139
138 const struct file_operations jfs_file_operations = { 140 const struct file_operations jfs_file_operations = {
139 .open = jfs_open, 141 .open = jfs_open,
140 .llseek = generic_file_llseek, 142 .llseek = generic_file_llseek,
141 .write = do_sync_write, 143 .write = do_sync_write,
142 .read = do_sync_read, 144 .read = do_sync_read,
143 .aio_read = generic_file_aio_read, 145 .aio_read = generic_file_aio_read,
144 .aio_write = generic_file_aio_write, 146 .aio_write = generic_file_aio_write,
145 .mmap = generic_file_mmap, 147 .mmap = generic_file_mmap,
146 .splice_read = generic_file_splice_read, 148 .splice_read = generic_file_splice_read,
147 .splice_write = generic_file_splice_write, 149 .splice_write = generic_file_splice_write,
148 .fsync = jfs_fsync, 150 .fsync = jfs_fsync,
149 .release = jfs_release, 151 .release = jfs_release,
150 .unlocked_ioctl = jfs_ioctl, 152 .unlocked_ioctl = jfs_ioctl,
151 #ifdef CONFIG_COMPAT 153 #ifdef CONFIG_COMPAT
152 .compat_ioctl = jfs_compat_ioctl, 154 .compat_ioctl = jfs_compat_ioctl,
153 #endif 155 #endif
154 }; 156 };
155 157
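The jfs hunk above is representative of what this commit does in every affected filesystem: the inode_dio_wait() call moves out of the VFS and into ->setattr, immediately before the size change. A minimal sketch of the resulting shape, modelled on jfs_setattr() above (the function name and the bare use of vmtruncate() are illustrative; each filesystem keeps its own quota, ACL and locking details around this core):

	static int example_setattr(struct dentry *dentry, struct iattr *iattr)
	{
		struct inode *inode = dentry->d_inode;
		int rc;

		rc = inode_change_ok(inode, iattr);
		if (rc)
			return rc;

		if ((iattr->ia_valid & ATTR_SIZE) &&
		    iattr->ia_size != i_size_read(inode)) {
			/*
			 * Wait for in-flight direct I/O while any locks
			 * taken by this ->setattr are still held; this is
			 * what moving the call out of the VFS allows.
			 */
			inode_dio_wait(inode);

			rc = vmtruncate(inode, iattr->ia_size);
			if (rc)
				return rc;
		}

		setattr_copy(inode, iattr);
		mark_inode_dirty(inode);
		return 0;
	}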
1 /* 1 /*
2 * inode.c - NILFS inode operations. 2 * inode.c - NILFS inode operations.
3 * 3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or 8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version. 9 * (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net> 20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 * 21 *
22 */ 22 */
23 23
24 #include <linux/buffer_head.h> 24 #include <linux/buffer_head.h>
25 #include <linux/gfp.h> 25 #include <linux/gfp.h>
26 #include <linux/mpage.h> 26 #include <linux/mpage.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/uio.h> 28 #include <linux/uio.h>
29 #include "nilfs.h" 29 #include "nilfs.h"
30 #include "btnode.h" 30 #include "btnode.h"
31 #include "segment.h" 31 #include "segment.h"
32 #include "page.h" 32 #include "page.h"
33 #include "mdt.h" 33 #include "mdt.h"
34 #include "cpfile.h" 34 #include "cpfile.h"
35 #include "ifile.h" 35 #include "ifile.h"
36 36
37 struct nilfs_iget_args { 37 struct nilfs_iget_args {
38 u64 ino; 38 u64 ino;
39 __u64 cno; 39 __u64 cno;
40 struct nilfs_root *root; 40 struct nilfs_root *root;
41 int for_gc; 41 int for_gc;
42 }; 42 };
43 43
44 void nilfs_inode_add_blocks(struct inode *inode, int n) 44 void nilfs_inode_add_blocks(struct inode *inode, int n)
45 { 45 {
46 struct nilfs_root *root = NILFS_I(inode)->i_root; 46 struct nilfs_root *root = NILFS_I(inode)->i_root;
47 47
48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n); 48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
49 if (root) 49 if (root)
50 atomic_add(n, &root->blocks_count); 50 atomic_add(n, &root->blocks_count);
51 } 51 }
52 52
53 void nilfs_inode_sub_blocks(struct inode *inode, int n) 53 void nilfs_inode_sub_blocks(struct inode *inode, int n)
54 { 54 {
55 struct nilfs_root *root = NILFS_I(inode)->i_root; 55 struct nilfs_root *root = NILFS_I(inode)->i_root;
56 56
57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); 57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
58 if (root) 58 if (root)
59 atomic_sub(n, &root->blocks_count); 59 atomic_sub(n, &root->blocks_count);
60 } 60 }
61 61
62 /** 62 /**
63 * nilfs_get_block() - get a file block on the filesystem (callback function) 63 * nilfs_get_block() - get a file block on the filesystem (callback function)
64 * @inode - inode struct of the target file 64 * @inode - inode struct of the target file
65 * @blkoff - file block number 65 * @blkoff - file block number
66 * @bh_result - buffer head to be mapped on 66 * @bh_result - buffer head to be mapped on
67 * @create - indicate whether allocating the block or not when it has not 67 * @create - indicate whether allocating the block or not when it has not
68 * been allocated yet. 68 * been allocated yet.
69 * 69 *
70 * This function does not issue an actual read request for the specified 70 * This function does not issue an actual read request for the specified
71 * data block; that is done by the VFS. 71 * data block; that is done by the VFS.
72 */ 72 */
73 int nilfs_get_block(struct inode *inode, sector_t blkoff, 73 int nilfs_get_block(struct inode *inode, sector_t blkoff,
74 struct buffer_head *bh_result, int create) 74 struct buffer_head *bh_result, int create)
75 { 75 {
76 struct nilfs_inode_info *ii = NILFS_I(inode); 76 struct nilfs_inode_info *ii = NILFS_I(inode);
77 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 77 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
78 __u64 blknum = 0; 78 __u64 blknum = 0;
79 int err = 0, ret; 79 int err = 0, ret;
80 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 80 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
81 81
82 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 82 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
83 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); 83 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
84 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 84 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
85 if (ret >= 0) { /* found */ 85 if (ret >= 0) { /* found */
86 map_bh(bh_result, inode->i_sb, blknum); 86 map_bh(bh_result, inode->i_sb, blknum);
87 if (ret > 0) 87 if (ret > 0)
88 bh_result->b_size = (ret << inode->i_blkbits); 88 bh_result->b_size = (ret << inode->i_blkbits);
89 goto out; 89 goto out;
90 } 90 }
91 /* data block was not found */ 91 /* data block was not found */
92 if (ret == -ENOENT && create) { 92 if (ret == -ENOENT && create) {
93 struct nilfs_transaction_info ti; 93 struct nilfs_transaction_info ti;
94 94
95 bh_result->b_blocknr = 0; 95 bh_result->b_blocknr = 0;
96 err = nilfs_transaction_begin(inode->i_sb, &ti, 1); 96 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
97 if (unlikely(err)) 97 if (unlikely(err))
98 goto out; 98 goto out;
99 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, 99 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
100 (unsigned long)bh_result); 100 (unsigned long)bh_result);
101 if (unlikely(err != 0)) { 101 if (unlikely(err != 0)) {
102 if (err == -EEXIST) { 102 if (err == -EEXIST) {
103 /* 103 /*
104 * The get_block() function can be called 104 * The get_block() function can be called
105 * concurrently for the same inode. In that 105 * concurrently for the same inode. In that
106 * case, however, the page that owns this 106 * case, however, the page that owns this
107 * block must already be locked. 107 * block must already be locked.
108 */ 108 */
109 printk(KERN_WARNING 109 printk(KERN_WARNING
110 "nilfs_get_block: a race condition " 110 "nilfs_get_block: a race condition "
111 "while inserting a data block. " 111 "while inserting a data block. "
112 "(inode number=%lu, file block " 112 "(inode number=%lu, file block "
113 "offset=%llu)\n", 113 "offset=%llu)\n",
114 inode->i_ino, 114 inode->i_ino,
115 (unsigned long long)blkoff); 115 (unsigned long long)blkoff);
116 err = 0; 116 err = 0;
117 } 117 }
118 nilfs_transaction_abort(inode->i_sb); 118 nilfs_transaction_abort(inode->i_sb);
119 goto out; 119 goto out;
120 } 120 }
121 nilfs_mark_inode_dirty(inode); 121 nilfs_mark_inode_dirty(inode);
122 nilfs_transaction_commit(inode->i_sb); /* never fails */ 122 nilfs_transaction_commit(inode->i_sb); /* never fails */
123 /* Error handling should be detailed */ 123 /* Error handling should be detailed */
124 set_buffer_new(bh_result); 124 set_buffer_new(bh_result);
125 set_buffer_delay(bh_result); 125 set_buffer_delay(bh_result);
126 map_bh(bh_result, inode->i_sb, 0); /* disk block number must 126 map_bh(bh_result, inode->i_sb, 0); /* disk block number must
127 be set to the proper value later */ 127 be set to the proper value later */
128 } else if (ret == -ENOENT) { 128 } else if (ret == -ENOENT) {
129 /* a missing block is not an error (e.g. a hole); return 129 /* a missing block is not an error (e.g. a hole); return
130 without setting the mapped state flag. */ 130 without setting the mapped state flag. */
131 ; 131 ;
132 } else { 132 } else {
133 err = ret; 133 err = ret;
134 } 134 }
135 135
136 out: 136 out:
137 return err; 137 return err;
138 } 138 }
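/*
 * How a nilfs_get_block() result is consumed (a hedged sketch; the
 * caller below is illustrative, not code from this file): the caller
 * sets bh_result->b_size to the maximum mapping length it wants, and
 * on return reads the mapping back out of the buffer_head.
 *
 *	struct buffer_head bh = { .b_size = PAGE_SIZE };
 *
 *	if (!nilfs_get_block(inode, blkoff, &bh, 0) && buffer_mapped(&bh)) {
 *		bh.b_blocknr holds the starting disk block and bh.b_size
 *		the byte length of the contiguous run that was found;
 *		an unmapped result with no error is a hole.
 *	}
 */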
139 139
140 /** 140 /**
141 * nilfs_readpage() - implement readpage() method of nilfs_aops {} 141 * nilfs_readpage() - implement readpage() method of nilfs_aops {}
142 * address_space_operations. 142 * address_space_operations.
143 * @file - file struct of the file to be read 143 * @file - file struct of the file to be read
144 * @page - the page to be read 144 * @page - the page to be read
145 */ 145 */
146 static int nilfs_readpage(struct file *file, struct page *page) 146 static int nilfs_readpage(struct file *file, struct page *page)
147 { 147 {
148 return mpage_readpage(page, nilfs_get_block); 148 return mpage_readpage(page, nilfs_get_block);
149 } 149 }
150 150
151 /** 151 /**
152 * nilfs_readpages() - implement readpages() method of nilfs_aops {} 152 * nilfs_readpages() - implement readpages() method of nilfs_aops {}
153 * address_space_operations. 153 * address_space_operations.
154 * @file - file struct of the file to be read 154 * @file - file struct of the file to be read
155 * @mapping - address_space struct used for reading multiple pages 155 * @mapping - address_space struct used for reading multiple pages
156 * @pages - the pages to be read 156 * @pages - the pages to be read
157 * @nr_pages - number of pages to be read 157 * @nr_pages - number of pages to be read
158 */ 158 */
159 static int nilfs_readpages(struct file *file, struct address_space *mapping, 159 static int nilfs_readpages(struct file *file, struct address_space *mapping,
160 struct list_head *pages, unsigned nr_pages) 160 struct list_head *pages, unsigned nr_pages)
161 { 161 {
162 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); 162 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
163 } 163 }
164 164
165 static int nilfs_writepages(struct address_space *mapping, 165 static int nilfs_writepages(struct address_space *mapping,
166 struct writeback_control *wbc) 166 struct writeback_control *wbc)
167 { 167 {
168 struct inode *inode = mapping->host; 168 struct inode *inode = mapping->host;
169 int err = 0; 169 int err = 0;
170 170
171 if (wbc->sync_mode == WB_SYNC_ALL) 171 if (wbc->sync_mode == WB_SYNC_ALL)
172 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 172 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
173 wbc->range_start, 173 wbc->range_start,
174 wbc->range_end); 174 wbc->range_end);
175 return err; 175 return err;
176 } 176 }
177 177
178 static int nilfs_writepage(struct page *page, struct writeback_control *wbc) 178 static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
179 { 179 {
180 struct inode *inode = page->mapping->host; 180 struct inode *inode = page->mapping->host;
181 int err; 181 int err;
182 182
183 redirty_page_for_writepage(wbc, page); 183 redirty_page_for_writepage(wbc, page);
184 unlock_page(page); 184 unlock_page(page);
185 185
186 if (wbc->sync_mode == WB_SYNC_ALL) { 186 if (wbc->sync_mode == WB_SYNC_ALL) {
187 err = nilfs_construct_segment(inode->i_sb); 187 err = nilfs_construct_segment(inode->i_sb);
188 if (unlikely(err)) 188 if (unlikely(err))
189 return err; 189 return err;
190 } else if (wbc->for_reclaim) 190 } else if (wbc->for_reclaim)
191 nilfs_flush_segment(inode->i_sb, inode->i_ino); 191 nilfs_flush_segment(inode->i_sb, inode->i_ino);
192 192
193 return 0; 193 return 0;
194 } 194 }
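/*
 * Note that this ->writepage never writes the page directly: it is
 * redirtied and handed to the segment constructor, which writes data
 * strictly in log order. WB_SYNC_ALL writeback triggers a full
 * segment construction, while writeback for reclaim merely kicks the
 * segment flusher for this inode.
 */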
195 195
196 static int nilfs_set_page_dirty(struct page *page) 196 static int nilfs_set_page_dirty(struct page *page)
197 { 197 {
198 int ret = __set_page_dirty_buffers(page); 198 int ret = __set_page_dirty_buffers(page);
199 199
200 if (ret) { 200 if (ret) {
201 struct inode *inode = page->mapping->host; 201 struct inode *inode = page->mapping->host;
202 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 202 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
203 203
204 nilfs_set_file_dirty(inode, nr_dirty); 204 nilfs_set_file_dirty(inode, nr_dirty);
205 } 205 }
206 return ret; 206 return ret;
207 } 207 }
208 208
209 static int nilfs_write_begin(struct file *file, struct address_space *mapping, 209 static int nilfs_write_begin(struct file *file, struct address_space *mapping,
210 loff_t pos, unsigned len, unsigned flags, 210 loff_t pos, unsigned len, unsigned flags,
211 struct page **pagep, void **fsdata) 211 struct page **pagep, void **fsdata)
212 212
213 { 213 {
214 struct inode *inode = mapping->host; 214 struct inode *inode = mapping->host;
215 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); 215 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
216 216
217 if (unlikely(err)) 217 if (unlikely(err))
218 return err; 218 return err;
219 219
220 err = block_write_begin(mapping, pos, len, flags, pagep, 220 err = block_write_begin(mapping, pos, len, flags, pagep,
221 nilfs_get_block); 221 nilfs_get_block);
222 if (unlikely(err)) { 222 if (unlikely(err)) {
223 loff_t isize = mapping->host->i_size; 223 loff_t isize = mapping->host->i_size;
224 if (pos + len > isize) 224 if (pos + len > isize)
225 vmtruncate(mapping->host, isize); 225 vmtruncate(mapping->host, isize);
226 226
227 nilfs_transaction_abort(inode->i_sb); 227 nilfs_transaction_abort(inode->i_sb);
228 } 228 }
229 return err; 229 return err;
230 } 230 }
231 231
232 static int nilfs_write_end(struct file *file, struct address_space *mapping, 232 static int nilfs_write_end(struct file *file, struct address_space *mapping,
233 loff_t pos, unsigned len, unsigned copied, 233 loff_t pos, unsigned len, unsigned copied,
234 struct page *page, void *fsdata) 234 struct page *page, void *fsdata)
235 { 235 {
236 struct inode *inode = mapping->host; 236 struct inode *inode = mapping->host;
237 unsigned start = pos & (PAGE_CACHE_SIZE - 1); 237 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
238 unsigned nr_dirty; 238 unsigned nr_dirty;
239 int err; 239 int err;
240 240
241 nr_dirty = nilfs_page_count_clean_buffers(page, start, 241 nr_dirty = nilfs_page_count_clean_buffers(page, start,
242 start + copied); 242 start + copied);
243 copied = generic_write_end(file, mapping, pos, len, copied, page, 243 copied = generic_write_end(file, mapping, pos, len, copied, page,
244 fsdata); 244 fsdata);
245 nilfs_set_file_dirty(inode, nr_dirty); 245 nilfs_set_file_dirty(inode, nr_dirty);
246 err = nilfs_transaction_commit(inode->i_sb); 246 err = nilfs_transaction_commit(inode->i_sb);
247 return err ? : copied; 247 return err ? : copied;
248 } 248 }
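/*
 * The two hooks above pair up: each successful
 * nilfs_transaction_begin() in nilfs_write_begin() is balanced either
 * by the nilfs_transaction_commit() here in nilfs_write_end() or by
 * the nilfs_transaction_abort() in write_begin's own error path, so a
 * buffered write never leaves a transaction open.
 */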
249 249
250 static ssize_t 250 static ssize_t
251 nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 251 nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
252 loff_t offset, unsigned long nr_segs) 252 loff_t offset, unsigned long nr_segs)
253 { 253 {
254 struct file *file = iocb->ki_filp; 254 struct file *file = iocb->ki_filp;
255 struct inode *inode = file->f_mapping->host; 255 struct inode *inode = file->f_mapping->host;
256 ssize_t size; 256 ssize_t size;
257 257
258 if (rw == WRITE) 258 if (rw == WRITE)
259 return 0; 259 return 0;
260 260
261 /* Needs synchronization with the cleaner */ 261 /* Needs synchronization with the cleaner */
262 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 262 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
263 offset, nr_segs, nilfs_get_block, NULL); 263 offset, nr_segs, nilfs_get_block, NULL);
264 264
265 /* 265 /*
266 * In case of error, an extending write may have instantiated a 266 * In case of error, an extending write may have instantiated a
267 * few blocks outside i_size. Trim these off again. 267 * few blocks outside i_size. Trim these off again.
268 */ 268 */
269 if (unlikely((rw & WRITE) && size < 0)) { 269 if (unlikely((rw & WRITE) && size < 0)) {
270 loff_t isize = i_size_read(inode); 270 loff_t isize = i_size_read(inode);
271 loff_t end = offset + iov_length(iov, nr_segs); 271 loff_t end = offset + iov_length(iov, nr_segs);
272 272
273 if (end > isize) 273 if (end > isize)
274 vmtruncate(inode, isize); 274 vmtruncate(inode, isize);
275 } 275 }
276 276
277 return size; 277 return size;
278 } 278 }
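/*
 * Since the WRITE case returns 0 early, the error-trimming code above
 * is effectively unreachable here and mirrors the generic pattern. A
 * ->direct_IO that returns 0 for a write makes the generic write path
 * fall back to buffered I/O, which is how nilfs services O_DIRECT
 * writes.
 */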
279 279
280 const struct address_space_operations nilfs_aops = { 280 const struct address_space_operations nilfs_aops = {
281 .writepage = nilfs_writepage, 281 .writepage = nilfs_writepage,
282 .readpage = nilfs_readpage, 282 .readpage = nilfs_readpage,
283 .writepages = nilfs_writepages, 283 .writepages = nilfs_writepages,
284 .set_page_dirty = nilfs_set_page_dirty, 284 .set_page_dirty = nilfs_set_page_dirty,
285 .readpages = nilfs_readpages, 285 .readpages = nilfs_readpages,
286 .write_begin = nilfs_write_begin, 286 .write_begin = nilfs_write_begin,
287 .write_end = nilfs_write_end, 287 .write_end = nilfs_write_end,
288 /* .releasepage = nilfs_releasepage, */ 288 /* .releasepage = nilfs_releasepage, */
289 .invalidatepage = block_invalidatepage, 289 .invalidatepage = block_invalidatepage,
290 .direct_IO = nilfs_direct_IO, 290 .direct_IO = nilfs_direct_IO,
291 .is_partially_uptodate = block_is_partially_uptodate, 291 .is_partially_uptodate = block_is_partially_uptodate,
292 }; 292 };
293 293
294 struct inode *nilfs_new_inode(struct inode *dir, int mode) 294 struct inode *nilfs_new_inode(struct inode *dir, int mode)
295 { 295 {
296 struct super_block *sb = dir->i_sb; 296 struct super_block *sb = dir->i_sb;
297 struct the_nilfs *nilfs = sb->s_fs_info; 297 struct the_nilfs *nilfs = sb->s_fs_info;
298 struct inode *inode; 298 struct inode *inode;
299 struct nilfs_inode_info *ii; 299 struct nilfs_inode_info *ii;
300 struct nilfs_root *root; 300 struct nilfs_root *root;
301 int err = -ENOMEM; 301 int err = -ENOMEM;
302 ino_t ino; 302 ino_t ino;
303 303
304 inode = new_inode(sb); 304 inode = new_inode(sb);
305 if (unlikely(!inode)) 305 if (unlikely(!inode))
306 goto failed; 306 goto failed;
307 307
308 mapping_set_gfp_mask(inode->i_mapping, 308 mapping_set_gfp_mask(inode->i_mapping,
309 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 309 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
310 310
311 root = NILFS_I(dir)->i_root; 311 root = NILFS_I(dir)->i_root;
312 ii = NILFS_I(inode); 312 ii = NILFS_I(inode);
313 ii->i_state = 1 << NILFS_I_NEW; 313 ii->i_state = 1 << NILFS_I_NEW;
314 ii->i_root = root; 314 ii->i_root = root;
315 315
316 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); 316 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
317 if (unlikely(err)) 317 if (unlikely(err))
318 goto failed_ifile_create_inode; 318 goto failed_ifile_create_inode;
319 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 319 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
320 320
321 atomic_inc(&root->inodes_count); 321 atomic_inc(&root->inodes_count);
322 inode_init_owner(inode, dir, mode); 322 inode_init_owner(inode, dir, mode);
323 inode->i_ino = ino; 323 inode->i_ino = ino;
324 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 324 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
325 325
326 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 326 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
327 err = nilfs_bmap_read(ii->i_bmap, NULL); 327 err = nilfs_bmap_read(ii->i_bmap, NULL);
328 if (err < 0) 328 if (err < 0)
329 goto failed_bmap; 329 goto failed_bmap;
330 330
331 set_bit(NILFS_I_BMAP, &ii->i_state); 331 set_bit(NILFS_I_BMAP, &ii->i_state);
332 /* No lock is needed; iget() ensures it. */ 332 /* No lock is needed; iget() ensures it. */
333 } 333 }
334 334
335 ii->i_flags = nilfs_mask_flags( 335 ii->i_flags = nilfs_mask_flags(
336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); 336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
337 337
338 /* ii->i_file_acl = 0; */ 338 /* ii->i_file_acl = 0; */
339 /* ii->i_dir_acl = 0; */ 339 /* ii->i_dir_acl = 0; */
340 ii->i_dir_start_lookup = 0; 340 ii->i_dir_start_lookup = 0;
341 nilfs_set_inode_flags(inode); 341 nilfs_set_inode_flags(inode);
342 spin_lock(&nilfs->ns_next_gen_lock); 342 spin_lock(&nilfs->ns_next_gen_lock);
343 inode->i_generation = nilfs->ns_next_generation++; 343 inode->i_generation = nilfs->ns_next_generation++;
344 spin_unlock(&nilfs->ns_next_gen_lock); 344 spin_unlock(&nilfs->ns_next_gen_lock);
345 insert_inode_hash(inode); 345 insert_inode_hash(inode);
346 346
347 err = nilfs_init_acl(inode, dir); 347 err = nilfs_init_acl(inode, dir);
348 if (unlikely(err)) 348 if (unlikely(err))
349 goto failed_acl; /* never occurs. When nilfs_init_acl() 349 goto failed_acl; /* never occurs. When nilfs_init_acl()
350 gains real support, proper cancellation 350 gains real support, proper cancellation
351 of the jobs above should be considered */ 351 of the jobs above should be considered */
352 352
353 return inode; 353 return inode;
354 354
355 failed_acl: 355 failed_acl:
356 failed_bmap: 356 failed_bmap:
357 inode->i_nlink = 0; 357 inode->i_nlink = 0;
358 iput(inode); /* raw_inode will be deleted through 358 iput(inode); /* raw_inode will be deleted through
359 generic_delete_inode() */ 359 generic_delete_inode() */
360 goto failed; 360 goto failed;
361 361
362 failed_ifile_create_inode: 362 failed_ifile_create_inode:
363 make_bad_inode(inode); 363 make_bad_inode(inode);
364 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be 364 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
365 called */ 365 called */
366 failed: 366 failed:
367 return ERR_PTR(err); 367 return ERR_PTR(err);
368 } 368 }
369 369
370 void nilfs_set_inode_flags(struct inode *inode) 370 void nilfs_set_inode_flags(struct inode *inode)
371 { 371 {
372 unsigned int flags = NILFS_I(inode)->i_flags; 372 unsigned int flags = NILFS_I(inode)->i_flags;
373 373
374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
375 S_DIRSYNC); 375 S_DIRSYNC);
376 if (flags & FS_SYNC_FL) 376 if (flags & FS_SYNC_FL)
377 inode->i_flags |= S_SYNC; 377 inode->i_flags |= S_SYNC;
378 if (flags & FS_APPEND_FL) 378 if (flags & FS_APPEND_FL)
379 inode->i_flags |= S_APPEND; 379 inode->i_flags |= S_APPEND;
380 if (flags & FS_IMMUTABLE_FL) 380 if (flags & FS_IMMUTABLE_FL)
381 inode->i_flags |= S_IMMUTABLE; 381 inode->i_flags |= S_IMMUTABLE;
382 if (flags & FS_NOATIME_FL) 382 if (flags & FS_NOATIME_FL)
383 inode->i_flags |= S_NOATIME; 383 inode->i_flags |= S_NOATIME;
384 if (flags & FS_DIRSYNC_FL) 384 if (flags & FS_DIRSYNC_FL)
385 inode->i_flags |= S_DIRSYNC; 385 inode->i_flags |= S_DIRSYNC;
386 mapping_set_gfp_mask(inode->i_mapping, 386 mapping_set_gfp_mask(inode->i_mapping,
387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
388 } 388 }
389 389
390 int nilfs_read_inode_common(struct inode *inode, 390 int nilfs_read_inode_common(struct inode *inode,
391 struct nilfs_inode *raw_inode) 391 struct nilfs_inode *raw_inode)
392 { 392 {
393 struct nilfs_inode_info *ii = NILFS_I(inode); 393 struct nilfs_inode_info *ii = NILFS_I(inode);
394 int err; 394 int err;
395 395
396 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 396 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
397 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); 397 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
398 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); 398 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
399 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 399 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
400 inode->i_size = le64_to_cpu(raw_inode->i_size); 400 inode->i_size = le64_to_cpu(raw_inode->i_size);
401 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 401 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
402 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 402 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
403 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 403 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
404 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 404 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
405 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 405 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
406 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 406 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
407 if (inode->i_nlink == 0 && inode->i_mode == 0) 407 if (inode->i_nlink == 0 && inode->i_mode == 0)
408 return -EINVAL; /* this inode is deleted */ 408 return -EINVAL; /* this inode is deleted */
409 409
410 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); 410 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
411 ii->i_flags = le32_to_cpu(raw_inode->i_flags); 411 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
412 #if 0 412 #if 0
413 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 413 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
414 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 414 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
415 0 : le32_to_cpu(raw_inode->i_dir_acl); 415 0 : le32_to_cpu(raw_inode->i_dir_acl);
416 #endif 416 #endif
417 ii->i_dir_start_lookup = 0; 417 ii->i_dir_start_lookup = 0;
418 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 418 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
419 419
420 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 420 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
421 S_ISLNK(inode->i_mode)) { 421 S_ISLNK(inode->i_mode)) {
422 err = nilfs_bmap_read(ii->i_bmap, raw_inode); 422 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
423 if (err < 0) 423 if (err < 0)
424 return err; 424 return err;
425 set_bit(NILFS_I_BMAP, &ii->i_state); 425 set_bit(NILFS_I_BMAP, &ii->i_state);
426 /* No lock is needed; iget() ensures it. */ 426 /* No lock is needed; iget() ensures it. */
427 } 427 }
428 return 0; 428 return 0;
429 } 429 }
430 430
431 static int __nilfs_read_inode(struct super_block *sb, 431 static int __nilfs_read_inode(struct super_block *sb,
432 struct nilfs_root *root, unsigned long ino, 432 struct nilfs_root *root, unsigned long ino,
433 struct inode *inode) 433 struct inode *inode)
434 { 434 {
435 struct the_nilfs *nilfs = sb->s_fs_info; 435 struct the_nilfs *nilfs = sb->s_fs_info;
436 struct buffer_head *bh; 436 struct buffer_head *bh;
437 struct nilfs_inode *raw_inode; 437 struct nilfs_inode *raw_inode;
438 int err; 438 int err;
439 439
440 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 440 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
441 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); 441 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
442 if (unlikely(err)) 442 if (unlikely(err))
443 goto bad_inode; 443 goto bad_inode;
444 444
445 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); 445 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
446 446
447 err = nilfs_read_inode_common(inode, raw_inode); 447 err = nilfs_read_inode_common(inode, raw_inode);
448 if (err) 448 if (err)
449 goto failed_unmap; 449 goto failed_unmap;
450 450
451 if (S_ISREG(inode->i_mode)) { 451 if (S_ISREG(inode->i_mode)) {
452 inode->i_op = &nilfs_file_inode_operations; 452 inode->i_op = &nilfs_file_inode_operations;
453 inode->i_fop = &nilfs_file_operations; 453 inode->i_fop = &nilfs_file_operations;
454 inode->i_mapping->a_ops = &nilfs_aops; 454 inode->i_mapping->a_ops = &nilfs_aops;
455 } else if (S_ISDIR(inode->i_mode)) { 455 } else if (S_ISDIR(inode->i_mode)) {
456 inode->i_op = &nilfs_dir_inode_operations; 456 inode->i_op = &nilfs_dir_inode_operations;
457 inode->i_fop = &nilfs_dir_operations; 457 inode->i_fop = &nilfs_dir_operations;
458 inode->i_mapping->a_ops = &nilfs_aops; 458 inode->i_mapping->a_ops = &nilfs_aops;
459 } else if (S_ISLNK(inode->i_mode)) { 459 } else if (S_ISLNK(inode->i_mode)) {
460 inode->i_op = &nilfs_symlink_inode_operations; 460 inode->i_op = &nilfs_symlink_inode_operations;
461 inode->i_mapping->a_ops = &nilfs_aops; 461 inode->i_mapping->a_ops = &nilfs_aops;
462 } else { 462 } else {
463 inode->i_op = &nilfs_special_inode_operations; 463 inode->i_op = &nilfs_special_inode_operations;
464 init_special_inode( 464 init_special_inode(
465 inode, inode->i_mode, 465 inode, inode->i_mode,
466 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 466 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
467 } 467 }
468 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 468 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
469 brelse(bh); 469 brelse(bh);
470 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 470 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
471 nilfs_set_inode_flags(inode); 471 nilfs_set_inode_flags(inode);
472 return 0; 472 return 0;
473 473
474 failed_unmap: 474 failed_unmap:
475 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 475 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
476 brelse(bh); 476 brelse(bh);
477 477
478 bad_inode: 478 bad_inode:
479 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 479 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
480 return err; 480 return err;
481 } 481 }
482 482
483 static int nilfs_iget_test(struct inode *inode, void *opaque) 483 static int nilfs_iget_test(struct inode *inode, void *opaque)
484 { 484 {
485 struct nilfs_iget_args *args = opaque; 485 struct nilfs_iget_args *args = opaque;
486 struct nilfs_inode_info *ii; 486 struct nilfs_inode_info *ii;
487 487
488 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) 488 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
489 return 0; 489 return 0;
490 490
491 ii = NILFS_I(inode); 491 ii = NILFS_I(inode);
492 if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) 492 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
493 return !args->for_gc; 493 return !args->for_gc;
494 494
495 return args->for_gc && args->cno == ii->i_cno; 495 return args->for_gc && args->cno == ii->i_cno;
496 } 496 }
497 497
498 static int nilfs_iget_set(struct inode *inode, void *opaque) 498 static int nilfs_iget_set(struct inode *inode, void *opaque)
499 { 499 {
500 struct nilfs_iget_args *args = opaque; 500 struct nilfs_iget_args *args = opaque;
501 501
502 inode->i_ino = args->ino; 502 inode->i_ino = args->ino;
503 if (args->for_gc) { 503 if (args->for_gc) {
504 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; 504 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
505 NILFS_I(inode)->i_cno = args->cno; 505 NILFS_I(inode)->i_cno = args->cno;
506 NILFS_I(inode)->i_root = NULL; 506 NILFS_I(inode)->i_root = NULL;
507 } else { 507 } else {
508 if (args->root && args->ino == NILFS_ROOT_INO) 508 if (args->root && args->ino == NILFS_ROOT_INO)
509 nilfs_get_root(args->root); 509 nilfs_get_root(args->root);
510 NILFS_I(inode)->i_root = args->root; 510 NILFS_I(inode)->i_root = args->root;
511 } 511 }
512 return 0; 512 return 0;
513 } 513 }
514 514
515 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, 515 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
516 unsigned long ino) 516 unsigned long ino)
517 { 517 {
518 struct nilfs_iget_args args = { 518 struct nilfs_iget_args args = {
519 .ino = ino, .root = root, .cno = 0, .for_gc = 0 519 .ino = ino, .root = root, .cno = 0, .for_gc = 0
520 }; 520 };
521 521
522 return ilookup5(sb, ino, nilfs_iget_test, &args); 522 return ilookup5(sb, ino, nilfs_iget_test, &args);
523 } 523 }
524 524
525 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, 525 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
526 unsigned long ino) 526 unsigned long ino)
527 { 527 {
528 struct nilfs_iget_args args = { 528 struct nilfs_iget_args args = {
529 .ino = ino, .root = root, .cno = 0, .for_gc = 0 529 .ino = ino, .root = root, .cno = 0, .for_gc = 0
530 }; 530 };
531 531
532 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); 532 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
533 } 533 }
534 534
535 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, 535 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
536 unsigned long ino) 536 unsigned long ino)
537 { 537 {
538 struct inode *inode; 538 struct inode *inode;
539 int err; 539 int err;
540 540
541 inode = nilfs_iget_locked(sb, root, ino); 541 inode = nilfs_iget_locked(sb, root, ino);
542 if (unlikely(!inode)) 542 if (unlikely(!inode))
543 return ERR_PTR(-ENOMEM); 543 return ERR_PTR(-ENOMEM);
544 if (!(inode->i_state & I_NEW)) 544 if (!(inode->i_state & I_NEW))
545 return inode; 545 return inode;
546 546
547 err = __nilfs_read_inode(sb, root, ino, inode); 547 err = __nilfs_read_inode(sb, root, ino, inode);
548 if (unlikely(err)) { 548 if (unlikely(err)) {
549 iget_failed(inode); 549 iget_failed(inode);
550 return ERR_PTR(err); 550 return ERR_PTR(err);
551 } 551 }
552 unlock_new_inode(inode); 552 unlock_new_inode(inode);
553 return inode; 553 return inode;
554 } 554 }
555 555
556 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, 556 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
557 __u64 cno) 557 __u64 cno)
558 { 558 {
559 struct nilfs_iget_args args = { 559 struct nilfs_iget_args args = {
560 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 560 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
561 }; 561 };
562 struct inode *inode; 562 struct inode *inode;
563 int err; 563 int err;
564 564
565 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); 565 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
566 if (unlikely(!inode)) 566 if (unlikely(!inode))
567 return ERR_PTR(-ENOMEM); 567 return ERR_PTR(-ENOMEM);
568 if (!(inode->i_state & I_NEW)) 568 if (!(inode->i_state & I_NEW))
569 return inode; 569 return inode;
570 570
571 err = nilfs_init_gcinode(inode); 571 err = nilfs_init_gcinode(inode);
572 if (unlikely(err)) { 572 if (unlikely(err)) {
573 iget_failed(inode); 573 iget_failed(inode);
574 return ERR_PTR(err); 574 return ERR_PTR(err);
575 } 575 }
576 unlock_new_inode(inode); 576 unlock_new_inode(inode);
577 return inode; 577 return inode;
578 } 578 }
579 579
580 void nilfs_write_inode_common(struct inode *inode, 580 void nilfs_write_inode_common(struct inode *inode,
581 struct nilfs_inode *raw_inode, int has_bmap) 581 struct nilfs_inode *raw_inode, int has_bmap)
582 { 582 {
583 struct nilfs_inode_info *ii = NILFS_I(inode); 583 struct nilfs_inode_info *ii = NILFS_I(inode);
584 584
585 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 585 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
586 raw_inode->i_uid = cpu_to_le32(inode->i_uid); 586 raw_inode->i_uid = cpu_to_le32(inode->i_uid);
587 raw_inode->i_gid = cpu_to_le32(inode->i_gid); 587 raw_inode->i_gid = cpu_to_le32(inode->i_gid);
588 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 588 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
589 raw_inode->i_size = cpu_to_le64(inode->i_size); 589 raw_inode->i_size = cpu_to_le64(inode->i_size);
590 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 590 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
591 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 591 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
592 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 592 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
593 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 593 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
594 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); 594 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
595 595
596 raw_inode->i_flags = cpu_to_le32(ii->i_flags); 596 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
597 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 597 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
598 598
599 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { 599 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
600 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 600 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
601 601
602 /* zero-fill unused portion in the case of super root block */ 602 /* zero-fill unused portion in the case of super root block */
603 raw_inode->i_xattr = 0; 603 raw_inode->i_xattr = 0;
604 raw_inode->i_pad = 0; 604 raw_inode->i_pad = 0;
605 memset((void *)raw_inode + sizeof(*raw_inode), 0, 605 memset((void *)raw_inode + sizeof(*raw_inode), 0,
606 nilfs->ns_inode_size - sizeof(*raw_inode)); 606 nilfs->ns_inode_size - sizeof(*raw_inode));
607 } 607 }
608 608
609 if (has_bmap) 609 if (has_bmap)
610 nilfs_bmap_write(ii->i_bmap, raw_inode); 610 nilfs_bmap_write(ii->i_bmap, raw_inode);
611 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 611 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
612 raw_inode->i_device_code = 612 raw_inode->i_device_code =
613 cpu_to_le64(huge_encode_dev(inode->i_rdev)); 613 cpu_to_le64(huge_encode_dev(inode->i_rdev));
614 /* When extending the inode format, nilfs->ns_inode_size should 614 /* When extending the inode format, nilfs->ns_inode_size should
615 be checked before substituting appended fields */ 615 be checked before substituting appended fields */
616 } 616 }
617 617
618 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) 618 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
619 { 619 {
620 ino_t ino = inode->i_ino; 620 ino_t ino = inode->i_ino;
621 struct nilfs_inode_info *ii = NILFS_I(inode); 621 struct nilfs_inode_info *ii = NILFS_I(inode);
622 struct inode *ifile = ii->i_root->ifile; 622 struct inode *ifile = ii->i_root->ifile;
623 struct nilfs_inode *raw_inode; 623 struct nilfs_inode *raw_inode;
624 624
625 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); 625 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
626 626
627 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 627 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
628 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); 628 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
629 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 629 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
630 630
631 nilfs_write_inode_common(inode, raw_inode, 0); 631 nilfs_write_inode_common(inode, raw_inode, 0);
632 /* XXX: calling with has_bmap = 0 is a workaround to avoid 632 /* XXX: calling with has_bmap = 0 is a workaround to avoid
633 a bmap deadlock; it delays the i_bmap update until just 633 a bmap deadlock; it delays the i_bmap update until just
634 before writing */ 634 before writing */
635 nilfs_ifile_unmap_inode(ifile, ino, ibh); 635 nilfs_ifile_unmap_inode(ifile, ino, ibh);
636 } 636 }
637 637
638 #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 638 #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
639 639
640 static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, 640 static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
641 unsigned long from) 641 unsigned long from)
642 { 642 {
643 unsigned long b; 643 unsigned long b;
644 int ret; 644 int ret;
645 645
646 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 646 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
647 return; 647 return;
648 repeat: 648 repeat:
649 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 649 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
650 if (ret == -ENOENT) 650 if (ret == -ENOENT)
651 return; 651 return;
652 else if (ret < 0) 652 else if (ret < 0)
653 goto failed; 653 goto failed;
654 654
655 if (b < from) 655 if (b < from)
656 return; 656 return;
657 657
658 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); 658 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
659 ret = nilfs_bmap_truncate(ii->i_bmap, b); 659 ret = nilfs_bmap_truncate(ii->i_bmap, b);
660 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); 660 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
661 if (!ret || (ret == -ENOMEM && 661 if (!ret || (ret == -ENOMEM &&
662 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 662 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
663 goto repeat; 663 goto repeat;
664 664
665 failed: 665 failed:
666 nilfs_warning(ii->vfs_inode.i_sb, __func__, 666 nilfs_warning(ii->vfs_inode.i_sb, __func__,
667 "failed to truncate bmap (ino=%lu, err=%d)", 667 "failed to truncate bmap (ino=%lu, err=%d)",
668 ii->vfs_inode.i_ino, ret); 668 ii->vfs_inode.i_ino, ret);
669 } 669 }
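/*
 * The loop above tears the bmap down from the top in bounded chunks:
 * each pass frees at most NILFS_MAX_TRUNCATE_BLOCKS keys (64MB of
 * data with 4KB blocks) and then calls nilfs_relax_pressure_in_lock(),
 * so a very large truncate becomes a series of short bmap operations
 * with the segment constructor given a chance to run in between,
 * instead of one long-running operation.
 */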
670 670
671 void nilfs_truncate(struct inode *inode) 671 void nilfs_truncate(struct inode *inode)
672 { 672 {
673 unsigned long blkoff; 673 unsigned long blkoff;
674 unsigned int blocksize; 674 unsigned int blocksize;
675 struct nilfs_transaction_info ti; 675 struct nilfs_transaction_info ti;
676 struct super_block *sb = inode->i_sb; 676 struct super_block *sb = inode->i_sb;
677 struct nilfs_inode_info *ii = NILFS_I(inode); 677 struct nilfs_inode_info *ii = NILFS_I(inode);
678 678
679 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 679 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
680 return; 680 return;
681 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 681 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
682 return; 682 return;
683 683
684 blocksize = sb->s_blocksize; 684 blocksize = sb->s_blocksize;
685 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; 685 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
686 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 686 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
687 687
688 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); 688 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
689 689
690 nilfs_truncate_bmap(ii, blkoff); 690 nilfs_truncate_bmap(ii, blkoff);
691 691
692 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 692 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
693 if (IS_SYNC(inode)) 693 if (IS_SYNC(inode))
694 nilfs_set_transaction_flag(NILFS_TI_SYNC); 694 nilfs_set_transaction_flag(NILFS_TI_SYNC);
695 695
696 nilfs_mark_inode_dirty(inode); 696 nilfs_mark_inode_dirty(inode);
697 nilfs_set_file_dirty(inode, 0); 697 nilfs_set_file_dirty(inode, 0);
698 nilfs_transaction_commit(sb); 698 nilfs_transaction_commit(sb);
699 /* May construct a logical segment and may fail in sync mode. 699 /* May construct a logical segment and may fail in sync mode.
700 But truncate has no return value. */ 700 But truncate has no return value. */
701 } 701 }
702 702
703 static void nilfs_clear_inode(struct inode *inode) 703 static void nilfs_clear_inode(struct inode *inode)
704 { 704 {
705 struct nilfs_inode_info *ii = NILFS_I(inode); 705 struct nilfs_inode_info *ii = NILFS_I(inode);
706 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 706 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
707 707
708 /* 708 /*
709 * Free resources allocated in nilfs_read_inode(), here. 709 * Free resources allocated in nilfs_read_inode(), here.
710 */ 710 */
711 BUG_ON(!list_empty(&ii->i_dirty)); 711 BUG_ON(!list_empty(&ii->i_dirty));
712 brelse(ii->i_bh); 712 brelse(ii->i_bh);
713 ii->i_bh = NULL; 713 ii->i_bh = NULL;
714 714
715 if (mdi && mdi->mi_palloc_cache) 715 if (mdi && mdi->mi_palloc_cache)
716 nilfs_palloc_destroy_cache(inode); 716 nilfs_palloc_destroy_cache(inode);
717 717
718 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 718 if (test_bit(NILFS_I_BMAP, &ii->i_state))
719 nilfs_bmap_clear(ii->i_bmap); 719 nilfs_bmap_clear(ii->i_bmap);
720 720
721 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 721 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
722 722
723 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) 723 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
724 nilfs_put_root(ii->i_root); 724 nilfs_put_root(ii->i_root);
725 } 725 }
726 726
727 void nilfs_evict_inode(struct inode *inode) 727 void nilfs_evict_inode(struct inode *inode)
728 { 728 {
729 struct nilfs_transaction_info ti; 729 struct nilfs_transaction_info ti;
730 struct super_block *sb = inode->i_sb; 730 struct super_block *sb = inode->i_sb;
731 struct nilfs_inode_info *ii = NILFS_I(inode); 731 struct nilfs_inode_info *ii = NILFS_I(inode);
732 int ret; 732 int ret;
733 733
734 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 734 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
735 if (inode->i_data.nrpages) 735 if (inode->i_data.nrpages)
736 truncate_inode_pages(&inode->i_data, 0); 736 truncate_inode_pages(&inode->i_data, 0);
737 end_writeback(inode); 737 end_writeback(inode);
738 nilfs_clear_inode(inode); 738 nilfs_clear_inode(inode);
739 return; 739 return;
740 } 740 }
741 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 741 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
742 742
743 if (inode->i_data.nrpages) 743 if (inode->i_data.nrpages)
744 truncate_inode_pages(&inode->i_data, 0); 744 truncate_inode_pages(&inode->i_data, 0);
745 745
746 /* TODO: some of the following operations may fail. */ 746 /* TODO: some of the following operations may fail. */
747 nilfs_truncate_bmap(ii, 0); 747 nilfs_truncate_bmap(ii, 0);
748 nilfs_mark_inode_dirty(inode); 748 nilfs_mark_inode_dirty(inode);
749 end_writeback(inode); 749 end_writeback(inode);
750 750
751 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 751 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
752 if (!ret) 752 if (!ret)
753 atomic_dec(&ii->i_root->inodes_count); 753 atomic_dec(&ii->i_root->inodes_count);
754 754
755 nilfs_clear_inode(inode); 755 nilfs_clear_inode(inode);
756 756
757 if (IS_SYNC(inode)) 757 if (IS_SYNC(inode))
758 nilfs_set_transaction_flag(NILFS_TI_SYNC); 758 nilfs_set_transaction_flag(NILFS_TI_SYNC);
759 nilfs_transaction_commit(sb); 759 nilfs_transaction_commit(sb);
760 /* May construct a logical segment and may fail in sync mode. 760 /* May construct a logical segment and may fail in sync mode.
761 But delete_inode has no return value. */ 761 But delete_inode has no return value. */
762 } 762 }
763 763
764 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) 764 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
765 { 765 {
766 struct nilfs_transaction_info ti; 766 struct nilfs_transaction_info ti;
767 struct inode *inode = dentry->d_inode; 767 struct inode *inode = dentry->d_inode;
768 struct super_block *sb = inode->i_sb; 768 struct super_block *sb = inode->i_sb;
769 int err; 769 int err;
770 770
771 err = inode_change_ok(inode, iattr); 771 err = inode_change_ok(inode, iattr);
772 if (err) 772 if (err)
773 return err; 773 return err;
774 774
775 err = nilfs_transaction_begin(sb, &ti, 0); 775 err = nilfs_transaction_begin(sb, &ti, 0);
776 if (unlikely(err)) 776 if (unlikely(err))
777 return err; 777 return err;
778 778
779 if ((iattr->ia_valid & ATTR_SIZE) && 779 if ((iattr->ia_valid & ATTR_SIZE) &&
780 iattr->ia_size != i_size_read(inode)) { 780 iattr->ia_size != i_size_read(inode)) {
781 inode_dio_wait(inode);
782
781 err = vmtruncate(inode, iattr->ia_size); 783 err = vmtruncate(inode, iattr->ia_size);
782 if (unlikely(err)) 784 if (unlikely(err))
783 goto out_err; 785 goto out_err;
784 } 786 }
785 787
786 setattr_copy(inode, iattr); 788 setattr_copy(inode, iattr);
787 mark_inode_dirty(inode); 789 mark_inode_dirty(inode);
788 790
789 if (iattr->ia_valid & ATTR_MODE) { 791 if (iattr->ia_valid & ATTR_MODE) {
790 err = nilfs_acl_chmod(inode); 792 err = nilfs_acl_chmod(inode);
791 if (unlikely(err)) 793 if (unlikely(err))
792 goto out_err; 794 goto out_err;
793 } 795 }
794 796
795 return nilfs_transaction_commit(sb); 797 return nilfs_transaction_commit(sb);
796 798
797 out_err: 799 out_err:
798 nilfs_transaction_abort(sb); 800 nilfs_transaction_abort(sb);
799 return err; 801 return err;
800 } 802 }
801 803
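/*
 * The inode_dio_wait() added above sleeps until the inode's count of
 * in-flight direct I/O requests drains to zero. A rough, hedged
 * sketch of the accounting it pairs with in this kernel series (the
 * names follow the generic direct I/O code of the time and are shown
 * for illustration only):
 *
 *	atomic_inc(&inode->i_dio_count);     taken before a DIO request
 *	... submit and complete the request ...
 *	inode_dio_done(inode);               drop the count, wake waiters
 *
 * Waiting inside ->setattr, with the filesystem's own serialization
 * for the size change already in place, is precisely what this commit
 * enables.
 */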
802 int nilfs_permission(struct inode *inode, int mask) 804 int nilfs_permission(struct inode *inode, int mask)
803 { 805 {
804 struct nilfs_root *root = NILFS_I(inode)->i_root; 806 struct nilfs_root *root = NILFS_I(inode)->i_root;
805 if ((mask & MAY_WRITE) && root && 807 if ((mask & MAY_WRITE) && root &&
806 root->cno != NILFS_CPTREE_CURRENT_CNO) 808 root->cno != NILFS_CPTREE_CURRENT_CNO)
807 return -EROFS; /* snapshot is not writable */ 809 return -EROFS; /* snapshot is not writable */
808 810
809 return generic_permission(inode, mask); 811 return generic_permission(inode, mask);
810 } 812 }
811 813
812 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) 814 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
813 { 815 {
814 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 816 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
815 struct nilfs_inode_info *ii = NILFS_I(inode); 817 struct nilfs_inode_info *ii = NILFS_I(inode);
816 int err; 818 int err;
817 819
818 spin_lock(&nilfs->ns_inode_lock); 820 spin_lock(&nilfs->ns_inode_lock);
819 if (ii->i_bh == NULL) { 821 if (ii->i_bh == NULL) {
820 spin_unlock(&nilfs->ns_inode_lock); 822 spin_unlock(&nilfs->ns_inode_lock);
821 err = nilfs_ifile_get_inode_block(ii->i_root->ifile, 823 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
822 inode->i_ino, pbh); 824 inode->i_ino, pbh);
823 if (unlikely(err)) 825 if (unlikely(err))
824 return err; 826 return err;
825 spin_lock(&nilfs->ns_inode_lock); 827 spin_lock(&nilfs->ns_inode_lock);
826 if (ii->i_bh == NULL) 828 if (ii->i_bh == NULL)
827 ii->i_bh = *pbh; 829 ii->i_bh = *pbh;
828 else { 830 else {
829 brelse(*pbh); 831 brelse(*pbh);
830 *pbh = ii->i_bh; 832 *pbh = ii->i_bh;
831 } 833 }
832 } else 834 } else
833 *pbh = ii->i_bh; 835 *pbh = ii->i_bh;
834 836
835 get_bh(*pbh); 837 get_bh(*pbh);
836 spin_unlock(&nilfs->ns_inode_lock); 838 spin_unlock(&nilfs->ns_inode_lock);
837 return 0; 839 return 0;
838 } 840 }
839 841
840 int nilfs_inode_dirty(struct inode *inode) 842 int nilfs_inode_dirty(struct inode *inode)
841 { 843 {
842 struct nilfs_inode_info *ii = NILFS_I(inode); 844 struct nilfs_inode_info *ii = NILFS_I(inode);
843 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 845 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
844 int ret = 0; 846 int ret = 0;
845 847
846 if (!list_empty(&ii->i_dirty)) { 848 if (!list_empty(&ii->i_dirty)) {
847 spin_lock(&nilfs->ns_inode_lock); 849 spin_lock(&nilfs->ns_inode_lock);
848 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 850 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
849 test_bit(NILFS_I_BUSY, &ii->i_state); 851 test_bit(NILFS_I_BUSY, &ii->i_state);
850 spin_unlock(&nilfs->ns_inode_lock); 852 spin_unlock(&nilfs->ns_inode_lock);
851 } 853 }
852 return ret; 854 return ret;
853 } 855 }
854 856
855 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) 857 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
856 { 858 {
857 struct nilfs_inode_info *ii = NILFS_I(inode); 859 struct nilfs_inode_info *ii = NILFS_I(inode);
858 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 860 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
859 861
860 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); 862 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
861 863
862 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 864 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
863 return 0; 865 return 0;
864 866
865 spin_lock(&nilfs->ns_inode_lock); 867 spin_lock(&nilfs->ns_inode_lock);
866 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 868 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
867 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 869 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
868 /* Because this routine may race with nilfs_dispose_list(), 870 /* Because this routine may race with nilfs_dispose_list(),
869 we have to check NILFS_I_QUEUED here, too. */ 871 we have to check NILFS_I_QUEUED here, too. */
870 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 872 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
871 /* This will happen when somebody is freeing 873 /* This will happen when somebody is freeing
872 this inode. */ 874 this inode. */
873 nilfs_warning(inode->i_sb, __func__, 875 nilfs_warning(inode->i_sb, __func__,
874 "cannot get inode (ino=%lu)\n", 876 "cannot get inode (ino=%lu)\n",
875 inode->i_ino); 877 inode->i_ino);
876 spin_unlock(&nilfs->ns_inode_lock); 878 spin_unlock(&nilfs->ns_inode_lock);
877 return -EINVAL; /* NILFS_I_DIRTY may remain set 879 return -EINVAL; /* NILFS_I_DIRTY may remain set
878 on the inode being freed */ 880 on the inode being freed */
879 } 881 }
880 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); 882 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
881 set_bit(NILFS_I_QUEUED, &ii->i_state); 883 set_bit(NILFS_I_QUEUED, &ii->i_state);
882 } 884 }
883 spin_unlock(&nilfs->ns_inode_lock); 885 spin_unlock(&nilfs->ns_inode_lock);
884 return 0; 886 return 0;
885 } 887 }
886 888
887 int nilfs_mark_inode_dirty(struct inode *inode) 889 int nilfs_mark_inode_dirty(struct inode *inode)
888 { 890 {
889 struct buffer_head *ibh; 891 struct buffer_head *ibh;
890 int err; 892 int err;
891 893
892 err = nilfs_load_inode_block(inode, &ibh); 894 err = nilfs_load_inode_block(inode, &ibh);
893 if (unlikely(err)) { 895 if (unlikely(err)) {
894 nilfs_warning(inode->i_sb, __func__, 896 nilfs_warning(inode->i_sb, __func__,
895 "failed to reget inode block.\n"); 897 "failed to reget inode block.\n");
896 return err; 898 return err;
897 } 899 }
898 nilfs_update_inode(inode, ibh); 900 nilfs_update_inode(inode, ibh);
899 mark_buffer_dirty(ibh); 901 mark_buffer_dirty(ibh);
900 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); 902 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
901 brelse(ibh); 903 brelse(ibh);
902 return 0; 904 return 0;
903 } 905 }
904 906
905 /** 907 /**
906 * nilfs_dirty_inode - reflect changes on given inode to an inode block. 908 * nilfs_dirty_inode - reflect changes on given inode to an inode block.
907 * @inode: inode of the file to be registered. 909 * @inode: inode of the file to be registered.
908 * 910 *
909 * nilfs_dirty_inode() loads an inode block containing the specified 911 * nilfs_dirty_inode() loads an inode block containing the specified
910 * @inode and copies data from a nilfs_inode to a corresponding inode 912 * @inode and copies data from a nilfs_inode to a corresponding inode
911 * entry in the inode block. This operation is excluded from the segment 913 * entry in the inode block. This operation is excluded from the segment
912 * construction. This function can be called both as a single operation 914 * construction. This function can be called both as a single operation
913 * and as a part of indivisible file operations. 915 * and as a part of indivisible file operations.
914 */ 916 */
915 void nilfs_dirty_inode(struct inode *inode, int flags) 917 void nilfs_dirty_inode(struct inode *inode, int flags)
916 { 918 {
917 struct nilfs_transaction_info ti; 919 struct nilfs_transaction_info ti;
918 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 920 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
919 921
920 if (is_bad_inode(inode)) { 922 if (is_bad_inode(inode)) {
921 nilfs_warning(inode->i_sb, __func__, 923 nilfs_warning(inode->i_sb, __func__,
922 "tried to mark bad_inode dirty. ignored.\n"); 924 "tried to mark bad_inode dirty. ignored.\n");
923 dump_stack(); 925 dump_stack();
924 return; 926 return;
925 } 927 }
926 if (mdi) { 928 if (mdi) {
927 nilfs_mdt_mark_dirty(inode); 929 nilfs_mdt_mark_dirty(inode);
928 return; 930 return;
929 } 931 }
930 nilfs_transaction_begin(inode->i_sb, &ti, 0); 932 nilfs_transaction_begin(inode->i_sb, &ti, 0);
931 nilfs_mark_inode_dirty(inode); 933 nilfs_mark_inode_dirty(inode);
932 nilfs_transaction_commit(inode->i_sb); /* never fails */ 934 nilfs_transaction_commit(inode->i_sb); /* never fails */
933 } 935 }
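/*
 * nilfs_dirty_inode() is nilfs's super_operations ->dirty_inode hook,
 * so the VFS calls it whenever one of our inodes is marked dirty. A
 * hedged sketch of the assumed wiring (it lives in super.c, outside
 * this diff):
 *
 *	static const struct super_operations nilfs_sops = {
 *		...
 *		.dirty_inode	= nilfs_dirty_inode,
 *		...
 *	};
 */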
934 936
935 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 937 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
936 __u64 start, __u64 len) 938 __u64 start, __u64 len)
937 { 939 {
938 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 940 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
939 __u64 logical = 0, phys = 0, size = 0; 941 __u64 logical = 0, phys = 0, size = 0;
940 __u32 flags = 0; 942 __u32 flags = 0;
941 loff_t isize; 943 loff_t isize;
942 sector_t blkoff, end_blkoff; 944 sector_t blkoff, end_blkoff;
943 sector_t delalloc_blkoff; 945 sector_t delalloc_blkoff;
944 unsigned long delalloc_blklen; 946 unsigned long delalloc_blklen;
945 unsigned int blkbits = inode->i_blkbits; 947 unsigned int blkbits = inode->i_blkbits;
946 int ret, n; 948 int ret, n;
947 949
948 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); 950 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
949 if (ret) 951 if (ret)
950 return ret; 952 return ret;
951 953
952 mutex_lock(&inode->i_mutex); 954 mutex_lock(&inode->i_mutex);
953 955
954 isize = i_size_read(inode); 956 isize = i_size_read(inode);
955 957
956 blkoff = start >> blkbits; 958 blkoff = start >> blkbits;
957 end_blkoff = (start + len - 1) >> blkbits; 959 end_blkoff = (start + len - 1) >> blkbits;
958 960
959 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, 961 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
960 &delalloc_blkoff); 962 &delalloc_blkoff);
961 963
962 do { 964 do {
963 __u64 blkphy; 965 __u64 blkphy;
964 unsigned int maxblocks; 966 unsigned int maxblocks;
965 967
966 if (delalloc_blklen && blkoff == delalloc_blkoff) { 968 if (delalloc_blklen && blkoff == delalloc_blkoff) {
967 if (size) { 969 if (size) {
968 /* End of the current extent */ 970 /* End of the current extent */
969 ret = fiemap_fill_next_extent( 971 ret = fiemap_fill_next_extent(
970 fieinfo, logical, phys, size, flags); 972 fieinfo, logical, phys, size, flags);
971 if (ret) 973 if (ret)
972 break; 974 break;
973 } 975 }
974 if (blkoff > end_blkoff) 976 if (blkoff > end_blkoff)
975 break; 977 break;
976 978
977 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; 979 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
978 logical = blkoff << blkbits; 980 logical = blkoff << blkbits;
979 phys = 0; 981 phys = 0;
980 size = delalloc_blklen << blkbits; 982 size = delalloc_blklen << blkbits;
981 983
982 blkoff = delalloc_blkoff + delalloc_blklen; 984 blkoff = delalloc_blkoff + delalloc_blklen;
983 delalloc_blklen = nilfs_find_uncommitted_extent( 985 delalloc_blklen = nilfs_find_uncommitted_extent(
984 inode, blkoff, &delalloc_blkoff); 986 inode, blkoff, &delalloc_blkoff);
985 continue; 987 continue;
986 } 988 }
987 989
988 /* 990 /*
989 * Limit the number of blocks that we look up so as 991 * Limit the number of blocks that we look up so as
990 * not to get into the next delayed allocation extent. 992 * not to get into the next delayed allocation extent.
991 */ 993 */
992 maxblocks = INT_MAX; 994 maxblocks = INT_MAX;
993 if (delalloc_blklen) 995 if (delalloc_blklen)
994 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, 996 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
995 maxblocks); 997 maxblocks);
996 blkphy = 0; 998 blkphy = 0;
997 999
998 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 1000 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
999 n = nilfs_bmap_lookup_contig( 1001 n = nilfs_bmap_lookup_contig(
1000 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); 1002 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
1001 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 1003 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
1002 1004
1003 if (n < 0) { 1005 if (n < 0) {
1004 int past_eof; 1006 int past_eof;
1005 1007
1006 if (unlikely(n != -ENOENT)) 1008 if (unlikely(n != -ENOENT))
1007 break; /* error */ 1009 break; /* error */
1008 1010
1009 /* HOLE */ 1011 /* HOLE */
1010 blkoff++; 1012 blkoff++;
1011 past_eof = ((blkoff << blkbits) >= isize); 1013 past_eof = ((blkoff << blkbits) >= isize);
1012 1014
1013 if (size) { 1015 if (size) {
1014 /* End of the current extent */ 1016 /* End of the current extent */
1015 1017
1016 if (past_eof) 1018 if (past_eof)
1017 flags |= FIEMAP_EXTENT_LAST; 1019 flags |= FIEMAP_EXTENT_LAST;
1018 1020
1019 ret = fiemap_fill_next_extent( 1021 ret = fiemap_fill_next_extent(
1020 fieinfo, logical, phys, size, flags); 1022 fieinfo, logical, phys, size, flags);
1021 if (ret) 1023 if (ret)
1022 break; 1024 break;
1023 size = 0; 1025 size = 0;
1024 } 1026 }
1025 if (blkoff > end_blkoff || past_eof) 1027 if (blkoff > end_blkoff || past_eof)
1026 break; 1028 break;
1027 } else { 1029 } else {
1028 if (size) { 1030 if (size) {
1029 if (phys && blkphy << blkbits == phys + size) { 1031 if (phys && blkphy << blkbits == phys + size) {
1030 /* The current extent goes on */ 1032 /* The current extent goes on */
1031 size += n << blkbits; 1033 size += n << blkbits;
1032 } else { 1034 } else {
1033 /* Terminate the current extent */ 1035 /* Terminate the current extent */
1034 ret = fiemap_fill_next_extent( 1036 ret = fiemap_fill_next_extent(
1035 fieinfo, logical, phys, size, 1037 fieinfo, logical, phys, size,
1036 flags); 1038 flags);
1037 if (ret || blkoff > end_blkoff) 1039 if (ret || blkoff > end_blkoff)
1038 break; 1040 break;
1039 1041
1040 /* Start another extent */ 1042 /* Start another extent */
1041 flags = FIEMAP_EXTENT_MERGED; 1043 flags = FIEMAP_EXTENT_MERGED;
1042 logical = blkoff << blkbits; 1044 logical = blkoff << blkbits;
1043 phys = blkphy << blkbits; 1045 phys = blkphy << blkbits;
1044 size = n << blkbits; 1046 size = n << blkbits;
1045 } 1047 }
1046 } else { 1048 } else {
1047 /* Start a new extent */ 1049 /* Start a new extent */
1048 flags = FIEMAP_EXTENT_MERGED; 1050 flags = FIEMAP_EXTENT_MERGED;
1049 logical = blkoff << blkbits; 1051 logical = blkoff << blkbits;
1050 phys = blkphy << blkbits; 1052 phys = blkphy << blkbits;
1051 size = n << blkbits; 1053 size = n << blkbits;
1052 } 1054 }
1053 blkoff += n; 1055 blkoff += n;
1054 } 1056 }
1055 cond_resched(); 1057 cond_resched();
1056 } while (true); 1058 } while (true);
1057 1059
1058 /* If ret is 1 then we just hit the end of the extent array */ 1060 /* If ret is 1 then we just hit the end of the extent array */
1059 if (ret == 1) 1061 if (ret == 1)
1060 ret = 0; 1062 ret = 0;
1061 1063
1062 mutex_unlock(&inode->i_mutex); 1064 mutex_unlock(&inode->i_mutex);
1063 return ret; 1065 return ret;
1064 } 1066 }
1065 1067
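For context on what the nilfs_fiemap() implementation above feeds, here is a minimal user-space caller of the FIEMAP ioctl. This is an illustrative sketch, not part of this commit; it assumes a Linux system providing <linux/fiemap.h> and FS_IOC_FIEMAP, and the extent-buffer size of 32 is an arbitrary choice.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i, count = 32;	/* arbitrary extent buffer size */
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* the flag checked above */
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu length %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	close(fd);
	free(fm);
	return 0;
}

The FIEMAP_EXTENT_MERGED, FIEMAP_EXTENT_DELALLOC and FIEMAP_EXTENT_LAST flags set by the kernel code above show up in fe_flags here.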
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 #include <linux/falloc.h> 37 #include <linux/falloc.h>
38 #include <linux/quotaops.h> 38 #include <linux/quotaops.h>
39 #include <linux/blkdev.h> 39 #include <linux/blkdev.h>
40 40
41 #include <cluster/masklog.h> 41 #include <cluster/masklog.h>
42 42
43 #include "ocfs2.h" 43 #include "ocfs2.h"
44 44
45 #include "alloc.h" 45 #include "alloc.h"
46 #include "aops.h" 46 #include "aops.h"
47 #include "dir.h" 47 #include "dir.h"
48 #include "dlmglue.h" 48 #include "dlmglue.h"
49 #include "extent_map.h" 49 #include "extent_map.h"
50 #include "file.h" 50 #include "file.h"
51 #include "sysfile.h" 51 #include "sysfile.h"
52 #include "inode.h" 52 #include "inode.h"
53 #include "ioctl.h" 53 #include "ioctl.h"
54 #include "journal.h" 54 #include "journal.h"
55 #include "locks.h" 55 #include "locks.h"
56 #include "mmap.h" 56 #include "mmap.h"
57 #include "suballoc.h" 57 #include "suballoc.h"
58 #include "super.h" 58 #include "super.h"
59 #include "xattr.h" 59 #include "xattr.h"
60 #include "acl.h" 60 #include "acl.h"
61 #include "quota.h" 61 #include "quota.h"
62 #include "refcounttree.h" 62 #include "refcounttree.h"
63 #include "ocfs2_trace.h" 63 #include "ocfs2_trace.h"
64 64
65 #include "buffer_head_io.h" 65 #include "buffer_head_io.h"
66 66
67 static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68 { 68 {
69 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
70 70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); 71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp) 72 if (!fp)
73 return -ENOMEM; 73 return -ENOMEM;
74 74
75 fp->fp_file = file; 75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex); 76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp); 77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp; 78 file->private_data = fp;
79 79
80 return 0; 80 return 0;
81 } 81 }
82 82
83 static void ocfs2_free_file_private(struct inode *inode, struct file *file) 83 static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84 { 84 {
85 struct ocfs2_file_private *fp = file->private_data; 85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87 87
88 if (fp) { 88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock); 89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock); 90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp); 91 kfree(fp);
92 file->private_data = NULL; 92 file->private_data = NULL;
93 } 93 }
94 } 94 }
95 95
96 static int ocfs2_file_open(struct inode *inode, struct file *file) 96 static int ocfs2_file_open(struct inode *inode, struct file *file)
97 { 97 {
98 int status; 98 int status;
99 int mode = file->f_flags; 99 int mode = file->f_flags;
100 struct ocfs2_inode_info *oi = OCFS2_I(inode); 100 struct ocfs2_inode_info *oi = OCFS2_I(inode);
101 101
102 trace_ocfs2_file_open(inode, file, file->f_path.dentry, 102 trace_ocfs2_file_open(inode, file, file->f_path.dentry,
103 (unsigned long long)OCFS2_I(inode)->ip_blkno, 103 (unsigned long long)OCFS2_I(inode)->ip_blkno,
104 file->f_path.dentry->d_name.len, 104 file->f_path.dentry->d_name.len,
105 file->f_path.dentry->d_name.name, mode); 105 file->f_path.dentry->d_name.name, mode);
106 106
107 if (file->f_mode & FMODE_WRITE) 107 if (file->f_mode & FMODE_WRITE)
108 dquot_initialize(inode); 108 dquot_initialize(inode);
109 109
110 spin_lock(&oi->ip_lock); 110 spin_lock(&oi->ip_lock);
111 111
112 /* Check that the inode hasn't been wiped from disk by another 112 /* Check that the inode hasn't been wiped from disk by another
113 * node. If it hasn't then we're safe as long as we hold the 113 * node. If it hasn't then we're safe as long as we hold the
114 * spin lock until our increment of open count. */ 114 * spin lock until our increment of open count. */
115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
116 spin_unlock(&oi->ip_lock); 116 spin_unlock(&oi->ip_lock);
117 117
118 status = -ENOENT; 118 status = -ENOENT;
119 goto leave; 119 goto leave;
120 } 120 }
121 121
122 if (mode & O_DIRECT) 122 if (mode & O_DIRECT)
123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
124 124
125 oi->ip_open_count++; 125 oi->ip_open_count++;
126 spin_unlock(&oi->ip_lock); 126 spin_unlock(&oi->ip_lock);
127 127
128 status = ocfs2_init_file_private(inode, file); 128 status = ocfs2_init_file_private(inode, file);
129 if (status) { 129 if (status) {
130 /* 130 /*
131 * We want to set open count back if we're failing the 131 * We want to set open count back if we're failing the
132 * open. 132 * open.
133 */ 133 */
134 spin_lock(&oi->ip_lock); 134 spin_lock(&oi->ip_lock);
135 oi->ip_open_count--; 135 oi->ip_open_count--;
136 spin_unlock(&oi->ip_lock); 136 spin_unlock(&oi->ip_lock);
137 } 137 }
138 138
139 leave: 139 leave:
140 return status; 140 return status;
141 } 141 }
142 142
143 static int ocfs2_file_release(struct inode *inode, struct file *file) 143 static int ocfs2_file_release(struct inode *inode, struct file *file)
144 { 144 {
145 struct ocfs2_inode_info *oi = OCFS2_I(inode); 145 struct ocfs2_inode_info *oi = OCFS2_I(inode);
146 146
147 spin_lock(&oi->ip_lock); 147 spin_lock(&oi->ip_lock);
148 if (!--oi->ip_open_count) 148 if (!--oi->ip_open_count)
149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
150 150
151 trace_ocfs2_file_release(inode, file, file->f_path.dentry, 151 trace_ocfs2_file_release(inode, file, file->f_path.dentry,
152 oi->ip_blkno, 152 oi->ip_blkno,
153 file->f_path.dentry->d_name.len, 153 file->f_path.dentry->d_name.len,
154 file->f_path.dentry->d_name.name, 154 file->f_path.dentry->d_name.name,
155 oi->ip_open_count); 155 oi->ip_open_count);
156 spin_unlock(&oi->ip_lock); 156 spin_unlock(&oi->ip_lock);
157 157
158 ocfs2_free_file_private(inode, file); 158 ocfs2_free_file_private(inode, file);
159 159
160 return 0; 160 return 0;
161 } 161 }
162 162
163 static int ocfs2_dir_open(struct inode *inode, struct file *file) 163 static int ocfs2_dir_open(struct inode *inode, struct file *file)
164 { 164 {
165 return ocfs2_init_file_private(inode, file); 165 return ocfs2_init_file_private(inode, file);
166 } 166 }
167 167
168 static int ocfs2_dir_release(struct inode *inode, struct file *file) 168 static int ocfs2_dir_release(struct inode *inode, struct file *file)
169 { 169 {
170 ocfs2_free_file_private(inode, file); 170 ocfs2_free_file_private(inode, file);
171 return 0; 171 return 0;
172 } 172 }
173 173
174 static int ocfs2_sync_file(struct file *file, int datasync) 174 static int ocfs2_sync_file(struct file *file, int datasync)
175 { 175 {
176 int err = 0; 176 int err = 0;
177 journal_t *journal; 177 journal_t *journal;
178 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
180 180
181 trace_ocfs2_sync_file(inode, file, file->f_path.dentry, 181 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
182 OCFS2_I(inode)->ip_blkno, 182 OCFS2_I(inode)->ip_blkno,
183 file->f_path.dentry->d_name.len, 183 file->f_path.dentry->d_name.len,
184 file->f_path.dentry->d_name.name, 184 file->f_path.dentry->d_name.name,
185 (unsigned long long)datasync); 185 (unsigned long long)datasync);
186 186
187 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 187 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
188 /* 188 /*
189 * We still have to flush drive's caches to get data to the 189 * We still have to flush drive's caches to get data to the
190 * platter 190 * platter
191 */ 191 */
192 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 192 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
193 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 193 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
194 goto bail; 194 goto bail;
195 } 195 }
196 196
197 journal = osb->journal->j_journal; 197 journal = osb->journal->j_journal;
198 err = jbd2_journal_force_commit(journal); 198 err = jbd2_journal_force_commit(journal);
199 199
200 bail: 200 bail:
201 if (err) 201 if (err)
202 mlog_errno(err); 202 mlog_errno(err);
203 203
204 return (err < 0) ? -EIO : 0; 204 return (err < 0) ? -EIO : 0;
205 } 205 }
206 206
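The datasync shortcut above is the kernel side of the fsync(2)/fdatasync(2) split: fdatasync() must make the data and any metadata required to retrieve it durable, but may skip changes such as pure timestamp updates, which is why a bare disk-cache flush can suffice on that path. A small hedged user-space illustration, not part of this commit:

#include <unistd.h>

/* Make an appended record durable. fdatasync() flushes the data and
 * whatever metadata is needed to read it back (e.g. the new size);
 * unlike fsync() it may skip pure timestamp updates, matching the
 * cheaper no-journal-commit path in ocfs2_sync_file() above. */
static int append_durable(int fd, const void *buf, size_t len)
{
	if (write(fd, buf, len) != (ssize_t)len)
		return -1;
	return fdatasync(fd);
}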
207 int ocfs2_should_update_atime(struct inode *inode, 207 int ocfs2_should_update_atime(struct inode *inode,
208 struct vfsmount *vfsmnt) 208 struct vfsmount *vfsmnt)
209 { 209 {
210 struct timespec now; 210 struct timespec now;
211 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 211 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
212 212
213 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 213 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
214 return 0; 214 return 0;
215 215
216 if ((inode->i_flags & S_NOATIME) || 216 if ((inode->i_flags & S_NOATIME) ||
217 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 217 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
218 return 0; 218 return 0;
219 219
220 /* 220 /*
221 * We can be called with no vfsmnt structure - NFSD will 221 * We can be called with no vfsmnt structure - NFSD will
222 * sometimes do this. 222 * sometimes do this.
223 * 223 *
224 * Note that our action here is different than touch_atime() - 224 * Note that our action here is different than touch_atime() -
225 * if we can't tell whether this is a noatime mount, then we 225 * if we can't tell whether this is a noatime mount, then we
226 * don't know whether to trust the value of s_atime_quantum. 226 * don't know whether to trust the value of s_atime_quantum.
227 */ 227 */
228 if (vfsmnt == NULL) 228 if (vfsmnt == NULL)
229 return 0; 229 return 0;
230 230
231 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 231 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
232 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 232 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
233 return 0; 233 return 0;
234 234
235 if (vfsmnt->mnt_flags & MNT_RELATIME) { 235 if (vfsmnt->mnt_flags & MNT_RELATIME) {
236 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 236 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
237 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 237 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
238 return 1; 238 return 1;
239 239
240 return 0; 240 return 0;
241 } 241 }
242 242
243 now = CURRENT_TIME; 243 now = CURRENT_TIME;
244 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 244 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
245 return 0; 245 return 0;
246 else 246 else
247 return 1; 247 return 1;
248 } 248 }
249 249
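The MNT_RELATIME branch above updates atime only when the stored atime is not strictly newer than both mtime and ctime; that keeps the "accessed since last modification" signal alive while avoiding a write on every read. A standalone restatement of that predicate, offered purely as an illustration:

#include <time.h>

/* Returns nonzero when a relatime mount should refresh atime,
 * mirroring the timespec_compare() checks in the code above. */
static int relatime_needs_update(const struct timespec *atime,
				 const struct timespec *mtime,
				 const struct timespec *ctim)
{
	/* atime <= mtime: the file was modified since last access */
	if (atime->tv_sec < mtime->tv_sec ||
	    (atime->tv_sec == mtime->tv_sec &&
	     atime->tv_nsec <= mtime->tv_nsec))
		return 1;
	/* atime <= ctime: the inode was changed since last access */
	if (atime->tv_sec < ctim->tv_sec ||
	    (atime->tv_sec == ctim->tv_sec &&
	     atime->tv_nsec <= ctim->tv_nsec))
		return 1;
	return 0;
}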
250 int ocfs2_update_inode_atime(struct inode *inode, 250 int ocfs2_update_inode_atime(struct inode *inode,
251 struct buffer_head *bh) 251 struct buffer_head *bh)
252 { 252 {
253 int ret; 253 int ret;
254 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 254 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
255 handle_t *handle; 255 handle_t *handle;
256 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 256 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
257 257
258 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 258 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
259 if (IS_ERR(handle)) { 259 if (IS_ERR(handle)) {
260 ret = PTR_ERR(handle); 260 ret = PTR_ERR(handle);
261 mlog_errno(ret); 261 mlog_errno(ret);
262 goto out; 262 goto out;
263 } 263 }
264 264
265 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 265 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
266 OCFS2_JOURNAL_ACCESS_WRITE); 266 OCFS2_JOURNAL_ACCESS_WRITE);
267 if (ret) { 267 if (ret) {
268 mlog_errno(ret); 268 mlog_errno(ret);
269 goto out_commit; 269 goto out_commit;
270 } 270 }
271 271
272 /* 272 /*
273 * Don't use ocfs2_mark_inode_dirty() here as we don't always 273 * Don't use ocfs2_mark_inode_dirty() here as we don't always
274 * have i_mutex to guard against concurrent changes to other 274 * have i_mutex to guard against concurrent changes to other
275 * inode fields. 275 * inode fields.
276 */ 276 */
277 inode->i_atime = CURRENT_TIME; 277 inode->i_atime = CURRENT_TIME;
278 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 278 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
279 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 279 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
280 ocfs2_journal_dirty(handle, bh); 280 ocfs2_journal_dirty(handle, bh);
281 281
282 out_commit: 282 out_commit:
283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
284 out: 284 out:
285 return ret; 285 return ret;
286 } 286 }
287 287
288 static int ocfs2_set_inode_size(handle_t *handle, 288 static int ocfs2_set_inode_size(handle_t *handle,
289 struct inode *inode, 289 struct inode *inode,
290 struct buffer_head *fe_bh, 290 struct buffer_head *fe_bh,
291 u64 new_i_size) 291 u64 new_i_size)
292 { 292 {
293 int status; 293 int status;
294 294
295 i_size_write(inode, new_i_size); 295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_inode_sector_count(inode); 296 inode->i_blocks = ocfs2_inode_sector_count(inode);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298 298
299 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 299 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
300 if (status < 0) { 300 if (status < 0) {
301 mlog_errno(status); 301 mlog_errno(status);
302 goto bail; 302 goto bail;
303 } 303 }
304 304
305 bail: 305 bail:
306 return status; 306 return status;
307 } 307 }
308 308
309 int ocfs2_simple_size_update(struct inode *inode, 309 int ocfs2_simple_size_update(struct inode *inode,
310 struct buffer_head *di_bh, 310 struct buffer_head *di_bh,
311 u64 new_i_size) 311 u64 new_i_size)
312 { 312 {
313 int ret; 313 int ret;
314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
315 handle_t *handle = NULL; 315 handle_t *handle = NULL;
316 316
317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
318 if (IS_ERR(handle)) { 318 if (IS_ERR(handle)) {
319 ret = PTR_ERR(handle); 319 ret = PTR_ERR(handle);
320 mlog_errno(ret); 320 mlog_errno(ret);
321 goto out; 321 goto out;
322 } 322 }
323 323
324 ret = ocfs2_set_inode_size(handle, inode, di_bh, 324 ret = ocfs2_set_inode_size(handle, inode, di_bh,
325 new_i_size); 325 new_i_size);
326 if (ret < 0) 326 if (ret < 0)
327 mlog_errno(ret); 327 mlog_errno(ret);
328 328
329 ocfs2_commit_trans(osb, handle); 329 ocfs2_commit_trans(osb, handle);
330 out: 330 out:
331 return ret; 331 return ret;
332 } 332 }
333 333
334 static int ocfs2_cow_file_pos(struct inode *inode, 334 static int ocfs2_cow_file_pos(struct inode *inode,
335 struct buffer_head *fe_bh, 335 struct buffer_head *fe_bh,
336 u64 offset) 336 u64 offset)
337 { 337 {
338 int status; 338 int status;
339 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 339 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
340 unsigned int num_clusters = 0; 340 unsigned int num_clusters = 0;
341 unsigned int ext_flags = 0; 341 unsigned int ext_flags = 0;
342 342
343 /* 343 /*
344 * If the new offset is aligned to the range of the cluster, there is 344 * If the new offset is aligned to the range of the cluster, there is
345 * no space for ocfs2_zero_range_for_truncate to fill, so no need to 345 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
346 * CoW either. 346 * CoW either.
347 */ 347 */
348 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) 348 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
349 return 0; 349 return 0;
350 350
351 status = ocfs2_get_clusters(inode, cpos, &phys, 351 status = ocfs2_get_clusters(inode, cpos, &phys,
352 &num_clusters, &ext_flags); 352 &num_clusters, &ext_flags);
353 if (status) { 353 if (status) {
354 mlog_errno(status); 354 mlog_errno(status);
355 goto out; 355 goto out;
356 } 356 }
357 357
358 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 358 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
359 goto out; 359 goto out;
360 360
361 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); 361 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
362 362
363 out: 363 out:
364 return status; 364 return status;
365 } 365 }
366 366
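The alignment check in ocfs2_cow_file_pos() above is plain power-of-two arithmetic: an offset is cluster-aligned exactly when its low log2(clustersize) bits are zero, in which case there is no cluster tail left for ocfs2_zero_range_for_truncate() to fill. A tiny worked example (64 KB is merely an assumed cluster size):

#include <assert.h>

int main(void)
{
	unsigned long clustersize = 64 * 1024;	/* assumed: 64 KB clusters */

	/* aligned offset: masking with (clustersize - 1) yields 0 */
	assert((131072UL & (clustersize - 1)) == 0);
	/* mid-cluster offset: the mask yields the distance into the
	 * cluster, i.e. the tail that truncate-zeroing must fill */
	assert(((131072UL + 5000) & (clustersize - 1)) == 5000);
	return 0;
}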
367 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 367 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
368 struct inode *inode, 368 struct inode *inode,
369 struct buffer_head *fe_bh, 369 struct buffer_head *fe_bh,
370 u64 new_i_size) 370 u64 new_i_size)
371 { 371 {
372 int status; 372 int status;
373 handle_t *handle; 373 handle_t *handle;
374 struct ocfs2_dinode *di; 374 struct ocfs2_dinode *di;
375 u64 cluster_bytes; 375 u64 cluster_bytes;
376 376
377 /* 377 /*
378 * We need to CoW the cluster that contains the offset if it is reflinked 378 * We need to CoW the cluster that contains the offset if it is reflinked
379 * since we will call ocfs2_zero_range_for_truncate later which will 379 * since we will call ocfs2_zero_range_for_truncate later which will
380 * write "0" from offset to the end of the cluster. 380 * write "0" from offset to the end of the cluster.
381 */ 381 */
382 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); 382 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
383 if (status) { 383 if (status) {
384 mlog_errno(status); 384 mlog_errno(status);
385 return status; 385 return status;
386 } 386 }
387 387
388 /* TODO: This needs to actually orphan the inode in this 388 /* TODO: This needs to actually orphan the inode in this
389 * transaction. */ 389 * transaction. */
390 390
391 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 391 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
392 if (IS_ERR(handle)) { 392 if (IS_ERR(handle)) {
393 status = PTR_ERR(handle); 393 status = PTR_ERR(handle);
394 mlog_errno(status); 394 mlog_errno(status);
395 goto out; 395 goto out;
396 } 396 }
397 397
398 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 398 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
399 OCFS2_JOURNAL_ACCESS_WRITE); 399 OCFS2_JOURNAL_ACCESS_WRITE);
400 if (status < 0) { 400 if (status < 0) {
401 mlog_errno(status); 401 mlog_errno(status);
402 goto out_commit; 402 goto out_commit;
403 } 403 }
404 404
405 /* 405 /*
406 * Do this before setting i_size. 406 * Do this before setting i_size.
407 */ 407 */
408 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 408 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
409 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 409 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
410 cluster_bytes); 410 cluster_bytes);
411 if (status) { 411 if (status) {
412 mlog_errno(status); 412 mlog_errno(status);
413 goto out_commit; 413 goto out_commit;
414 } 414 }
415 415
416 i_size_write(inode, new_i_size); 416 i_size_write(inode, new_i_size);
417 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 417 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
418 418
419 di = (struct ocfs2_dinode *) fe_bh->b_data; 419 di = (struct ocfs2_dinode *) fe_bh->b_data;
420 di->i_size = cpu_to_le64(new_i_size); 420 di->i_size = cpu_to_le64(new_i_size);
421 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 421 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
422 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 422 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
423 423
424 ocfs2_journal_dirty(handle, fe_bh); 424 ocfs2_journal_dirty(handle, fe_bh);
425 425
426 out_commit: 426 out_commit:
427 ocfs2_commit_trans(osb, handle); 427 ocfs2_commit_trans(osb, handle);
428 out: 428 out:
429 return status; 429 return status;
430 } 430 }
431 431
432 static int ocfs2_truncate_file(struct inode *inode, 432 static int ocfs2_truncate_file(struct inode *inode,
433 struct buffer_head *di_bh, 433 struct buffer_head *di_bh,
434 u64 new_i_size) 434 u64 new_i_size)
435 { 435 {
436 int status = 0; 436 int status = 0;
437 struct ocfs2_dinode *fe = NULL; 437 struct ocfs2_dinode *fe = NULL;
438 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 438 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
439 439
440 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 440 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
441 * already validated it */ 441 * already validated it */
442 fe = (struct ocfs2_dinode *) di_bh->b_data; 442 fe = (struct ocfs2_dinode *) di_bh->b_data;
443 443
444 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno, 444 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
445 (unsigned long long)le64_to_cpu(fe->i_size), 445 (unsigned long long)le64_to_cpu(fe->i_size),
446 (unsigned long long)new_i_size); 446 (unsigned long long)new_i_size);
447 447
448 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 448 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
449 "Inode %llu, inode i_size = %lld != di " 449 "Inode %llu, inode i_size = %lld != di "
450 "i_size = %llu, i_flags = 0x%x\n", 450 "i_size = %llu, i_flags = 0x%x\n",
451 (unsigned long long)OCFS2_I(inode)->ip_blkno, 451 (unsigned long long)OCFS2_I(inode)->ip_blkno,
452 i_size_read(inode), 452 i_size_read(inode),
453 (unsigned long long)le64_to_cpu(fe->i_size), 453 (unsigned long long)le64_to_cpu(fe->i_size),
454 le32_to_cpu(fe->i_flags)); 454 le32_to_cpu(fe->i_flags));
455 455
456 if (new_i_size > le64_to_cpu(fe->i_size)) { 456 if (new_i_size > le64_to_cpu(fe->i_size)) {
457 trace_ocfs2_truncate_file_error( 457 trace_ocfs2_truncate_file_error(
458 (unsigned long long)le64_to_cpu(fe->i_size), 458 (unsigned long long)le64_to_cpu(fe->i_size),
459 (unsigned long long)new_i_size); 459 (unsigned long long)new_i_size);
460 status = -EINVAL; 460 status = -EINVAL;
461 mlog_errno(status); 461 mlog_errno(status);
462 goto bail; 462 goto bail;
463 } 463 }
464 464
465 /* let's handle the simple truncate cases before doing any more 465 /* let's handle the simple truncate cases before doing any more
466 * cluster locking. */ 466 * cluster locking. */
467 if (new_i_size == le64_to_cpu(fe->i_size)) 467 if (new_i_size == le64_to_cpu(fe->i_size))
468 goto bail; 468 goto bail;
469 469
470 down_write(&OCFS2_I(inode)->ip_alloc_sem); 470 down_write(&OCFS2_I(inode)->ip_alloc_sem);
471 471
472 ocfs2_resv_discard(&osb->osb_la_resmap, 472 ocfs2_resv_discard(&osb->osb_la_resmap,
473 &OCFS2_I(inode)->ip_la_data_resv); 473 &OCFS2_I(inode)->ip_la_data_resv);
474 474
475 /* 475 /*
476 * The inode lock forced other nodes to sync and drop their 476 * The inode lock forced other nodes to sync and drop their
477 * pages, which (correctly) happens even if we have a truncate 477 * pages, which (correctly) happens even if we have a truncate
478 * without allocation change - ocfs2 cluster sizes can be much 478 * without allocation change - ocfs2 cluster sizes can be much
479 * greater than page size, so we have to truncate them 479 * greater than page size, so we have to truncate them
480 * anyway. 480 * anyway.
481 */ 481 */
482 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 482 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
483 truncate_inode_pages(inode->i_mapping, new_i_size); 483 truncate_inode_pages(inode->i_mapping, new_i_size);
484 484
485 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 485 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
486 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 486 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
487 i_size_read(inode), 1); 487 i_size_read(inode), 1);
488 if (status) 488 if (status)
489 mlog_errno(status); 489 mlog_errno(status);
490 490
491 goto bail_unlock_sem; 491 goto bail_unlock_sem;
492 } 492 }
493 493
494 /* alright, we're going to need to do a full blown alloc size 494 /* alright, we're going to need to do a full blown alloc size
495 * change. Orphan the inode so that recovery can complete the 495 * change. Orphan the inode so that recovery can complete the
496 * truncate if necessary. This does the task of marking 496 * truncate if necessary. This does the task of marking
497 * i_size. */ 497 * i_size. */
498 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 498 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
499 if (status < 0) { 499 if (status < 0) {
500 mlog_errno(status); 500 mlog_errno(status);
501 goto bail_unlock_sem; 501 goto bail_unlock_sem;
502 } 502 }
503 503
504 status = ocfs2_commit_truncate(osb, inode, di_bh); 504 status = ocfs2_commit_truncate(osb, inode, di_bh);
505 if (status < 0) { 505 if (status < 0) {
506 mlog_errno(status); 506 mlog_errno(status);
507 goto bail_unlock_sem; 507 goto bail_unlock_sem;
508 } 508 }
509 509
510 /* TODO: orphan dir cleanup here. */ 510 /* TODO: orphan dir cleanup here. */
511 bail_unlock_sem: 511 bail_unlock_sem:
512 up_write(&OCFS2_I(inode)->ip_alloc_sem); 512 up_write(&OCFS2_I(inode)->ip_alloc_sem);
513 513
514 bail: 514 bail:
515 if (!status && OCFS2_I(inode)->ip_clusters == 0) 515 if (!status && OCFS2_I(inode)->ip_clusters == 0)
516 status = ocfs2_try_remove_refcount_tree(inode, di_bh); 516 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
517 517
518 return status; 518 return status;
519 } 519 }
520 520
521 /* 521 /*
522 * extend file allocation only here. 522 * extend file allocation only here.
523 * we'll update all the disk stuff, and oip->alloc_size 523 * we'll update all the disk stuff, and oip->alloc_size
524 * 524 *
525 * expect stuff to be locked, a transaction started and enough data / 525 * expect stuff to be locked, a transaction started and enough data /
526 * metadata reservations in the contexts. 526 * metadata reservations in the contexts.
527 * 527 *
528 * Will return -EAGAIN, and a reason if a restart is needed. 528 * Will return -EAGAIN, and a reason if a restart is needed.
529 * If passed in, *reason will always be set, even in error. 529 * If passed in, *reason will always be set, even in error.
530 */ 530 */
531 int ocfs2_add_inode_data(struct ocfs2_super *osb, 531 int ocfs2_add_inode_data(struct ocfs2_super *osb,
532 struct inode *inode, 532 struct inode *inode,
533 u32 *logical_offset, 533 u32 *logical_offset,
534 u32 clusters_to_add, 534 u32 clusters_to_add,
535 int mark_unwritten, 535 int mark_unwritten,
536 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
537 handle_t *handle, 537 handle_t *handle,
538 struct ocfs2_alloc_context *data_ac, 538 struct ocfs2_alloc_context *data_ac,
539 struct ocfs2_alloc_context *meta_ac, 539 struct ocfs2_alloc_context *meta_ac,
540 enum ocfs2_alloc_restarted *reason_ret) 540 enum ocfs2_alloc_restarted *reason_ret)
541 { 541 {
542 int ret; 542 int ret;
543 struct ocfs2_extent_tree et; 543 struct ocfs2_extent_tree et;
544 544
545 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); 545 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
546 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, 546 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
547 clusters_to_add, mark_unwritten, 547 clusters_to_add, mark_unwritten,
548 data_ac, meta_ac, reason_ret); 548 data_ac, meta_ac, reason_ret);
549 549
550 return ret; 550 return ret;
551 } 551 }
552 552
553 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, 553 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
554 u32 clusters_to_add, int mark_unwritten) 554 u32 clusters_to_add, int mark_unwritten)
555 { 555 {
556 int status = 0; 556 int status = 0;
557 int restart_func = 0; 557 int restart_func = 0;
558 int credits; 558 int credits;
559 u32 prev_clusters; 559 u32 prev_clusters;
560 struct buffer_head *bh = NULL; 560 struct buffer_head *bh = NULL;
561 struct ocfs2_dinode *fe = NULL; 561 struct ocfs2_dinode *fe = NULL;
562 handle_t *handle = NULL; 562 handle_t *handle = NULL;
563 struct ocfs2_alloc_context *data_ac = NULL; 563 struct ocfs2_alloc_context *data_ac = NULL;
564 struct ocfs2_alloc_context *meta_ac = NULL; 564 struct ocfs2_alloc_context *meta_ac = NULL;
565 enum ocfs2_alloc_restarted why; 565 enum ocfs2_alloc_restarted why;
566 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 566 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
567 struct ocfs2_extent_tree et; 567 struct ocfs2_extent_tree et;
568 int did_quota = 0; 568 int did_quota = 0;
569 569
570 /* 570 /*
571 * This function only exists for file systems which don't 571 * This function only exists for file systems which don't
572 * support holes. 572 * support holes.
573 */ 573 */
574 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 574 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
575 575
576 status = ocfs2_read_inode_block(inode, &bh); 576 status = ocfs2_read_inode_block(inode, &bh);
577 if (status < 0) { 577 if (status < 0) {
578 mlog_errno(status); 578 mlog_errno(status);
579 goto leave; 579 goto leave;
580 } 580 }
581 fe = (struct ocfs2_dinode *) bh->b_data; 581 fe = (struct ocfs2_dinode *) bh->b_data;
582 582
583 restart_all: 583 restart_all:
584 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 584 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
585 585
586 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); 586 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
587 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 587 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
588 &data_ac, &meta_ac); 588 &data_ac, &meta_ac);
589 if (status) { 589 if (status) {
590 mlog_errno(status); 590 mlog_errno(status);
591 goto leave; 591 goto leave;
592 } 592 }
593 593
594 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, 594 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
595 clusters_to_add); 595 clusters_to_add);
596 handle = ocfs2_start_trans(osb, credits); 596 handle = ocfs2_start_trans(osb, credits);
597 if (IS_ERR(handle)) { 597 if (IS_ERR(handle)) {
598 status = PTR_ERR(handle); 598 status = PTR_ERR(handle);
599 handle = NULL; 599 handle = NULL;
600 mlog_errno(status); 600 mlog_errno(status);
601 goto leave; 601 goto leave;
602 } 602 }
603 603
604 restarted_transaction: 604 restarted_transaction:
605 trace_ocfs2_extend_allocation( 605 trace_ocfs2_extend_allocation(
606 (unsigned long long)OCFS2_I(inode)->ip_blkno, 606 (unsigned long long)OCFS2_I(inode)->ip_blkno,
607 (unsigned long long)i_size_read(inode), 607 (unsigned long long)i_size_read(inode),
608 le32_to_cpu(fe->i_clusters), clusters_to_add, 608 le32_to_cpu(fe->i_clusters), clusters_to_add,
609 why, restart_func); 609 why, restart_func);
610 610
611 status = dquot_alloc_space_nodirty(inode, 611 status = dquot_alloc_space_nodirty(inode,
612 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 612 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
613 if (status) 613 if (status)
614 goto leave; 614 goto leave;
615 did_quota = 1; 615 did_quota = 1;
616 616
617 /* reserve a write to the file entry early on - that way if we 617 /* reserve a write to the file entry early on - that way if we
618 * run out of credits in the allocation path, we can still 618 * run out of credits in the allocation path, we can still
619 * update i_size. */ 619 * update i_size. */
620 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 620 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
621 OCFS2_JOURNAL_ACCESS_WRITE); 621 OCFS2_JOURNAL_ACCESS_WRITE);
622 if (status < 0) { 622 if (status < 0) {
623 mlog_errno(status); 623 mlog_errno(status);
624 goto leave; 624 goto leave;
625 } 625 }
626 626
627 prev_clusters = OCFS2_I(inode)->ip_clusters; 627 prev_clusters = OCFS2_I(inode)->ip_clusters;
628 628
629 status = ocfs2_add_inode_data(osb, 629 status = ocfs2_add_inode_data(osb,
630 inode, 630 inode,
631 &logical_start, 631 &logical_start,
632 clusters_to_add, 632 clusters_to_add,
633 mark_unwritten, 633 mark_unwritten,
634 bh, 634 bh,
635 handle, 635 handle,
636 data_ac, 636 data_ac,
637 meta_ac, 637 meta_ac,
638 &why); 638 &why);
639 if ((status < 0) && (status != -EAGAIN)) { 639 if ((status < 0) && (status != -EAGAIN)) {
640 if (status != -ENOSPC) 640 if (status != -ENOSPC)
641 mlog_errno(status); 641 mlog_errno(status);
642 goto leave; 642 goto leave;
643 } 643 }
644 644
645 ocfs2_journal_dirty(handle, bh); 645 ocfs2_journal_dirty(handle, bh);
646 646
647 spin_lock(&OCFS2_I(inode)->ip_lock); 647 spin_lock(&OCFS2_I(inode)->ip_lock);
648 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 648 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
649 spin_unlock(&OCFS2_I(inode)->ip_lock); 649 spin_unlock(&OCFS2_I(inode)->ip_lock);
650 /* Release unused quota reservation */ 650 /* Release unused quota reservation */
651 dquot_free_space(inode, 651 dquot_free_space(inode,
652 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 652 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
653 did_quota = 0; 653 did_quota = 0;
654 654
655 if (why != RESTART_NONE && clusters_to_add) { 655 if (why != RESTART_NONE && clusters_to_add) {
656 if (why == RESTART_META) { 656 if (why == RESTART_META) {
657 restart_func = 1; 657 restart_func = 1;
658 status = 0; 658 status = 0;
659 } else { 659 } else {
660 BUG_ON(why != RESTART_TRANS); 660 BUG_ON(why != RESTART_TRANS);
661 661
662 /* TODO: This can be more intelligent. */ 662 /* TODO: This can be more intelligent. */
663 credits = ocfs2_calc_extend_credits(osb->sb, 663 credits = ocfs2_calc_extend_credits(osb->sb,
664 &fe->id2.i_list, 664 &fe->id2.i_list,
665 clusters_to_add); 665 clusters_to_add);
666 status = ocfs2_extend_trans(handle, credits); 666 status = ocfs2_extend_trans(handle, credits);
667 if (status < 0) { 667 if (status < 0) {
668 /* handle still has to be committed at 668 /* handle still has to be committed at
669 * this point. */ 669 * this point. */
670 status = -ENOMEM; 670 status = -ENOMEM;
671 mlog_errno(status); 671 mlog_errno(status);
672 goto leave; 672 goto leave;
673 } 673 }
674 goto restarted_transaction; 674 goto restarted_transaction;
675 } 675 }
676 } 676 }
677 677
678 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno, 678 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
679 le32_to_cpu(fe->i_clusters), 679 le32_to_cpu(fe->i_clusters),
680 (unsigned long long)le64_to_cpu(fe->i_size), 680 (unsigned long long)le64_to_cpu(fe->i_size),
681 OCFS2_I(inode)->ip_clusters, 681 OCFS2_I(inode)->ip_clusters,
682 (unsigned long long)i_size_read(inode)); 682 (unsigned long long)i_size_read(inode));
683 683
684 leave: 684 leave:
685 if (status < 0 && did_quota) 685 if (status < 0 && did_quota)
686 dquot_free_space(inode, 686 dquot_free_space(inode,
687 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 687 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
688 if (handle) { 688 if (handle) {
689 ocfs2_commit_trans(osb, handle); 689 ocfs2_commit_trans(osb, handle);
690 handle = NULL; 690 handle = NULL;
691 } 691 }
692 if (data_ac) { 692 if (data_ac) {
693 ocfs2_free_alloc_context(data_ac); 693 ocfs2_free_alloc_context(data_ac);
694 data_ac = NULL; 694 data_ac = NULL;
695 } 695 }
696 if (meta_ac) { 696 if (meta_ac) {
697 ocfs2_free_alloc_context(meta_ac); 697 ocfs2_free_alloc_context(meta_ac);
698 meta_ac = NULL; 698 meta_ac = NULL;
699 } 699 }
700 if ((!status) && restart_func) { 700 if ((!status) && restart_func) {
701 restart_func = 0; 701 restart_func = 0;
702 goto restart_all; 702 goto restart_all;
703 } 703 }
704 brelse(bh); 704 brelse(bh);
705 bh = NULL; 705 bh = NULL;
706 706
707 return status; 707 return status;
708 } 708 }
709 709
710 /* 710 /*
711 * While a write will already be ordering the data, a truncate will not. 711 * While a write will already be ordering the data, a truncate will not.
712 * Thus, we need to explicitly order the zeroed pages. 712 * Thus, we need to explicitly order the zeroed pages.
713 */ 713 */
714 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) 714 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
715 { 715 {
716 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 716 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
717 handle_t *handle = NULL; 717 handle_t *handle = NULL;
718 int ret = 0; 718 int ret = 0;
719 719
720 if (!ocfs2_should_order_data(inode)) 720 if (!ocfs2_should_order_data(inode))
721 goto out; 721 goto out;
722 722
723 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 723 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
724 if (IS_ERR(handle)) { 724 if (IS_ERR(handle)) {
725 ret = -ENOMEM; 725 ret = -ENOMEM;
726 mlog_errno(ret); 726 mlog_errno(ret);
727 goto out; 727 goto out;
728 } 728 }
729 729
730 ret = ocfs2_jbd2_file_inode(handle, inode); 730 ret = ocfs2_jbd2_file_inode(handle, inode);
731 if (ret < 0) 731 if (ret < 0)
732 mlog_errno(ret); 732 mlog_errno(ret);
733 733
734 out: 734 out:
735 if (ret) { 735 if (ret) {
736 if (!IS_ERR(handle)) 736 if (!IS_ERR(handle))
737 ocfs2_commit_trans(osb, handle); 737 ocfs2_commit_trans(osb, handle);
738 handle = ERR_PTR(ret); 738 handle = ERR_PTR(ret);
739 } 739 }
740 return handle; 740 return handle;
741 } 741 }
742 742
743 /* Some parts of this taken from generic_cont_expand, which turned out 743 /* Some parts of this taken from generic_cont_expand, which turned out
744 * to be too fragile to do exactly what we need without us having to 744 * to be too fragile to do exactly what we need without us having to
745 * worry about recursive locking in ->write_begin() and ->write_end(). */ 745 * worry about recursive locking in ->write_begin() and ->write_end(). */
746 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, 746 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
747 u64 abs_to) 747 u64 abs_to)
748 { 748 {
749 struct address_space *mapping = inode->i_mapping; 749 struct address_space *mapping = inode->i_mapping;
750 struct page *page; 750 struct page *page;
751 unsigned long index = abs_from >> PAGE_CACHE_SHIFT; 751 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
752 handle_t *handle = NULL; 752 handle_t *handle = NULL;
753 int ret = 0; 753 int ret = 0;
754 unsigned zero_from, zero_to, block_start, block_end; 754 unsigned zero_from, zero_to, block_start, block_end;
755 755
756 BUG_ON(abs_from >= abs_to); 756 BUG_ON(abs_from >= abs_to);
757 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 757 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
758 BUG_ON(abs_from & (inode->i_blkbits - 1)); 758 BUG_ON(abs_from & (inode->i_blkbits - 1));
759 759
760 page = find_or_create_page(mapping, index, GFP_NOFS); 760 page = find_or_create_page(mapping, index, GFP_NOFS);
761 if (!page) { 761 if (!page) {
762 ret = -ENOMEM; 762 ret = -ENOMEM;
763 mlog_errno(ret); 763 mlog_errno(ret);
764 goto out; 764 goto out;
765 } 765 }
766 766
767 /* Get the offsets within the page that we want to zero */ 767 /* Get the offsets within the page that we want to zero */
768 zero_from = abs_from & (PAGE_CACHE_SIZE - 1); 768 zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
769 zero_to = abs_to & (PAGE_CACHE_SIZE - 1); 769 zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
770 if (!zero_to) 770 if (!zero_to)
771 zero_to = PAGE_CACHE_SIZE; 771 zero_to = PAGE_CACHE_SIZE;
772 772
773 trace_ocfs2_write_zero_page( 773 trace_ocfs2_write_zero_page(
774 (unsigned long long)OCFS2_I(inode)->ip_blkno, 774 (unsigned long long)OCFS2_I(inode)->ip_blkno,
775 (unsigned long long)abs_from, 775 (unsigned long long)abs_from,
776 (unsigned long long)abs_to, 776 (unsigned long long)abs_to,
777 index, zero_from, zero_to); 777 index, zero_from, zero_to);
778 778
779 /* We know that zero_from is block aligned */ 779 /* We know that zero_from is block aligned */
780 for (block_start = zero_from; block_start < zero_to; 780 for (block_start = zero_from; block_start < zero_to;
781 block_start = block_end) { 781 block_start = block_end) {
782 block_end = block_start + (1 << inode->i_blkbits); 782 block_end = block_start + (1 << inode->i_blkbits);
783 783
784 /* 784 /*
785 * block_start is block-aligned. Bump it by one to force 785 * block_start is block-aligned. Bump it by one to force
786 * __block_write_begin and block_commit_write to zero the 786 * __block_write_begin and block_commit_write to zero the
787 * whole block. 787 * whole block.
788 */ 788 */
789 ret = __block_write_begin(page, block_start + 1, 0, 789 ret = __block_write_begin(page, block_start + 1, 0,
790 ocfs2_get_block); 790 ocfs2_get_block);
791 if (ret < 0) { 791 if (ret < 0) {
792 mlog_errno(ret); 792 mlog_errno(ret);
793 goto out_unlock; 793 goto out_unlock;
794 } 794 }
795 795
796 if (!handle) { 796 if (!handle) {
797 handle = ocfs2_zero_start_ordered_transaction(inode); 797 handle = ocfs2_zero_start_ordered_transaction(inode);
798 if (IS_ERR(handle)) { 798 if (IS_ERR(handle)) {
799 ret = PTR_ERR(handle); 799 ret = PTR_ERR(handle);
800 handle = NULL; 800 handle = NULL;
801 break; 801 break;
802 } 802 }
803 } 803 }
804 804
805 /* must not update i_size! */ 805 /* must not update i_size! */
806 ret = block_commit_write(page, block_start + 1, 806 ret = block_commit_write(page, block_start + 1,
807 block_start + 1); 807 block_start + 1);
808 if (ret < 0) 808 if (ret < 0)
809 mlog_errno(ret); 809 mlog_errno(ret);
810 else 810 else
811 ret = 0; 811 ret = 0;
812 } 812 }
813 813
814 if (handle) 814 if (handle)
815 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 815 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
816 816
817 out_unlock: 817 out_unlock:
818 unlock_page(page); 818 unlock_page(page);
819 page_cache_release(page); 819 page_cache_release(page);
820 out: 820 out:
821 return ret; 821 return ret;
822 } 822 }
823 823
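ocfs2_write_zero_page() above works through the page cache one block at a time; passing block_start + 1 is just a one-byte-wide dirty range that makes __block_write_begin()/block_commit_write() zero and write out the enclosing block. At the file level the net effect is the same as writing zeros over the byte range, which the following hedged user-space sketch (not part of this commit) does directly:

#include <string.h>
#include <unistd.h>
#include <sys/types.h>

/* Write zeros over [from, to) with pwrite(); a user-space analogue
 * of the page-at-a-time zeroing done by the kernel helper above. */
static int zero_byte_range(int fd, off_t from, off_t to)
{
	char zeros[4096];
	ssize_t n;

	memset(zeros, 0, sizeof(zeros));
	while (from < to) {
		size_t chunk = sizeof(zeros);

		if ((off_t)chunk > to - from)
			chunk = to - from;
		n = pwrite(fd, zeros, chunk, from);
		if (n < 0)
			return -1;
		from += n;
	}
	return 0;
}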
824 /* 824 /*
825 * Find the next range to zero. We do this in terms of bytes because 825 * Find the next range to zero. We do this in terms of bytes because
826 * that's what ocfs2_zero_extend() wants, and it is dealing with the 826 * that's what ocfs2_zero_extend() wants, and it is dealing with the
827 * pagecache. We may return multiple extents. 827 * pagecache. We may return multiple extents.
828 * 828 *
829 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what 829 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
830 * needs to be zeroed. range_start and range_end return the next zeroing 830 * needs to be zeroed. range_start and range_end return the next zeroing
831 * range. A subsequent call should pass the previous range_end as its 831 * range. A subsequent call should pass the previous range_end as its
832 * zero_start. If range_end is 0, there's nothing to do. 832 * zero_start. If range_end is 0, there's nothing to do.
833 * 833 *
834 * Unwritten extents are skipped over. Refcounted extents are CoW'd. 834 * Unwritten extents are skipped over. Refcounted extents are CoW'd.
835 */ 835 */
836 static int ocfs2_zero_extend_get_range(struct inode *inode, 836 static int ocfs2_zero_extend_get_range(struct inode *inode,
837 struct buffer_head *di_bh, 837 struct buffer_head *di_bh,
838 u64 zero_start, u64 zero_end, 838 u64 zero_start, u64 zero_end,
839 u64 *range_start, u64 *range_end) 839 u64 *range_start, u64 *range_end)
840 { 840 {
841 int rc = 0, needs_cow = 0; 841 int rc = 0, needs_cow = 0;
842 u32 p_cpos, zero_clusters = 0; 842 u32 p_cpos, zero_clusters = 0;
843 u32 zero_cpos = 843 u32 zero_cpos =
844 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 844 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
845 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); 845 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
846 unsigned int num_clusters = 0; 846 unsigned int num_clusters = 0;
847 unsigned int ext_flags = 0; 847 unsigned int ext_flags = 0;
848 848
849 while (zero_cpos < last_cpos) { 849 while (zero_cpos < last_cpos) {
850 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, 850 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
851 &num_clusters, &ext_flags); 851 &num_clusters, &ext_flags);
852 if (rc) { 852 if (rc) {
853 mlog_errno(rc); 853 mlog_errno(rc);
854 goto out; 854 goto out;
855 } 855 }
856 856
857 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { 857 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
858 zero_clusters = num_clusters; 858 zero_clusters = num_clusters;
859 if (ext_flags & OCFS2_EXT_REFCOUNTED) 859 if (ext_flags & OCFS2_EXT_REFCOUNTED)
860 needs_cow = 1; 860 needs_cow = 1;
861 break; 861 break;
862 } 862 }
863 863
864 zero_cpos += num_clusters; 864 zero_cpos += num_clusters;
865 } 865 }
866 if (!zero_clusters) { 866 if (!zero_clusters) {
867 *range_end = 0; 867 *range_end = 0;
868 goto out; 868 goto out;
869 } 869 }
870 870
871 while ((zero_cpos + zero_clusters) < last_cpos) { 871 while ((zero_cpos + zero_clusters) < last_cpos) {
872 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, 872 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
873 &p_cpos, &num_clusters, 873 &p_cpos, &num_clusters,
874 &ext_flags); 874 &ext_flags);
875 if (rc) { 875 if (rc) {
876 mlog_errno(rc); 876 mlog_errno(rc);
877 goto out; 877 goto out;
878 } 878 }
879 879
880 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) 880 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
881 break; 881 break;
882 if (ext_flags & OCFS2_EXT_REFCOUNTED) 882 if (ext_flags & OCFS2_EXT_REFCOUNTED)
883 needs_cow = 1; 883 needs_cow = 1;
884 zero_clusters += num_clusters; 884 zero_clusters += num_clusters;
885 } 885 }
886 if ((zero_cpos + zero_clusters) > last_cpos) 886 if ((zero_cpos + zero_clusters) > last_cpos)
887 zero_clusters = last_cpos - zero_cpos; 887 zero_clusters = last_cpos - zero_cpos;
888 888
889 if (needs_cow) { 889 if (needs_cow) {
890 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, 890 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
891 zero_clusters, UINT_MAX); 891 zero_clusters, UINT_MAX);
892 if (rc) { 892 if (rc) {
893 mlog_errno(rc); 893 mlog_errno(rc);
894 goto out; 894 goto out;
895 } 895 }
896 } 896 }
897 897
898 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); 898 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
899 *range_end = ocfs2_clusters_to_bytes(inode->i_sb, 899 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
900 zero_cpos + zero_clusters); 900 zero_cpos + zero_clusters);
901 901
902 out: 902 out:
903 return rc; 903 return rc;
904 } 904 }
905 905
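ocfs2_zero_extend_get_range() above walks the extent map to find the next written region that actually needs zeroing, skipping holes and unwritten extents. User space can approximate the same walk with lseek(2)'s SEEK_DATA/SEEK_HOLE; a hedged sketch, illustrative only and not part of this commit:

#define _GNU_SOURCE	/* for SEEK_DATA/SEEK_HOLE */
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>

/* Print the written (data) ranges of a file up to 'end', the rough
 * analogue of repeatedly asking "where is the next range to zero?". */
static int print_data_ranges(int fd, off_t end)
{
	off_t data = 0, hole;

	while (data < end) {
		data = lseek(fd, data, SEEK_DATA);
		if (data < 0)
			break;	/* no more data (or not supported) */
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			return -1;
		printf("data: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		data = hole;
	}
	return 0;
}

A caller would pass the file size (e.g. from fstat()) as end; each printed range corresponds to what the kernel routine above reports through *range_start/*range_end.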
906 /* 906 /*
907 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller 907 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
908 * has made sure that the entire range needs zeroing. 908 * has made sure that the entire range needs zeroing.
909 */ 909 */
910 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, 910 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
911 u64 range_end) 911 u64 range_end)
912 { 912 {
913 int rc = 0; 913 int rc = 0;
914 u64 next_pos; 914 u64 next_pos;
915 u64 zero_pos = range_start; 915 u64 zero_pos = range_start;
916 916
917 trace_ocfs2_zero_extend_range( 917 trace_ocfs2_zero_extend_range(
918 (unsigned long long)OCFS2_I(inode)->ip_blkno, 918 (unsigned long long)OCFS2_I(inode)->ip_blkno,
919 (unsigned long long)range_start, 919 (unsigned long long)range_start,
920 (unsigned long long)range_end); 920 (unsigned long long)range_end);
921 BUG_ON(range_start >= range_end); 921 BUG_ON(range_start >= range_end);
922 922
923 while (zero_pos < range_end) { 923 while (zero_pos < range_end) {
924 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; 924 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
925 if (next_pos > range_end) 925 if (next_pos > range_end)
926 next_pos = range_end; 926 next_pos = range_end;
927 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); 927 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
928 if (rc < 0) { 928 if (rc < 0) {
929 mlog_errno(rc); 929 mlog_errno(rc);
930 break; 930 break;
931 } 931 }
932 zero_pos = next_pos; 932 zero_pos = next_pos;
933 933
934 /* 934 /*
935 * Very large extends have the potential to lock up 935 * Very large extends have the potential to lock up
936 * the CPU for extended periods of time. 936 * the CPU for extended periods of time.
937 */ 937 */
938 cond_resched(); 938 cond_resched();
939 } 939 }
940 940
941 return rc; 941 return rc;
942 } 942 }
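
ocfs2_zero_extend_range() walks the range one page at a time, clamping the final chunk to range_end and rescheduling between pages so a huge extend cannot monopolize the CPU. The chunking arithmetic is easy to verify in isolation; here is a minimal userspace sketch, assuming a hypothetical 4096-byte page and a stub zero_page() standing in for ocfs2_write_zero_page():

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL
	#define PAGE_MASK (~(PAGE_SIZE - 1))

	/* Stub in place of ocfs2_write_zero_page(); just reports the chunk. */
	static int zero_page(uint64_t start, uint64_t end)
	{
		printf("zeroing [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)end);
		return 0;
	}

	/* Walk [range_start, range_end) in page-aligned steps, as above. */
	static int zero_range(uint64_t range_start, uint64_t range_end)
	{
		uint64_t zero_pos = range_start, next_pos;
		int rc = 0;

		while (zero_pos < range_end) {
			/* Round up to the start of the next page... */
			next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
			/* ...but never past the end of the range. */
			if (next_pos > range_end)
				next_pos = range_end;
			rc = zero_page(zero_pos, next_pos);
			if (rc < 0)
				break;
			zero_pos = next_pos;
		}
		return rc;
	}

	int main(void)
	{
		return zero_range(1000, 9000);	/* unaligned on both ends */
	}

Running this prints [1000, 4096), [4096, 8192), [8192, 9000): only the first and last chunks are partial pages.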
943 943
944 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, 944 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
945 loff_t zero_to_size) 945 loff_t zero_to_size)
946 { 946 {
947 int ret = 0; 947 int ret = 0;
948 u64 zero_start, range_start = 0, range_end = 0; 948 u64 zero_start, range_start = 0, range_end = 0;
949 struct super_block *sb = inode->i_sb; 949 struct super_block *sb = inode->i_sb;
950 950
951 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 951 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
952 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno, 952 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
953 (unsigned long long)zero_start, 953 (unsigned long long)zero_start,
954 (unsigned long long)i_size_read(inode)); 954 (unsigned long long)i_size_read(inode));
955 while (zero_start < zero_to_size) { 955 while (zero_start < zero_to_size) {
956 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, 956 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
957 zero_to_size, 957 zero_to_size,
958 &range_start, 958 &range_start,
959 &range_end); 959 &range_end);
960 if (ret) { 960 if (ret) {
961 mlog_errno(ret); 961 mlog_errno(ret);
962 break; 962 break;
963 } 963 }
964 if (!range_end) 964 if (!range_end)
965 break; 965 break;
966 /* Trim the ends */ 966 /* Trim the ends */
967 if (range_start < zero_start) 967 if (range_start < zero_start)
968 range_start = zero_start; 968 range_start = zero_start;
969 if (range_end > zero_to_size) 969 if (range_end > zero_to_size)
970 range_end = zero_to_size; 970 range_end = zero_to_size;
971 971
972 ret = ocfs2_zero_extend_range(inode, range_start, 972 ret = ocfs2_zero_extend_range(inode, range_start,
973 range_end); 973 range_end);
974 if (ret) { 974 if (ret) {
975 mlog_errno(ret); 975 mlog_errno(ret);
976 break; 976 break;
977 } 977 }
978 zero_start = range_end; 978 zero_start = range_end;
979 } 979 }
980 980
981 return ret; 981 return ret;
982 } 982 }
983 983
984 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, 984 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
985 u64 new_i_size, u64 zero_to) 985 u64 new_i_size, u64 zero_to)
986 { 986 {
987 int ret; 987 int ret;
988 u32 clusters_to_add; 988 u32 clusters_to_add;
989 struct ocfs2_inode_info *oi = OCFS2_I(inode); 989 struct ocfs2_inode_info *oi = OCFS2_I(inode);
990 990
991 /* 991 /*
992 * Only quota files call this without a bh, and they can't be 992 * Only quota files call this without a bh, and they can't be
993 * refcounted. 993 * refcounted.
994 */ 994 */
995 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 995 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
996 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); 996 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
997 997
998 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 998 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
999 if (clusters_to_add < oi->ip_clusters) 999 if (clusters_to_add < oi->ip_clusters)
1000 clusters_to_add = 0; 1000 clusters_to_add = 0;
1001 else 1001 else
1002 clusters_to_add -= oi->ip_clusters; 1002 clusters_to_add -= oi->ip_clusters;
1003 1003
1004 if (clusters_to_add) { 1004 if (clusters_to_add) {
1005 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, 1005 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
1006 clusters_to_add, 0); 1006 clusters_to_add, 0);
1007 if (ret) { 1007 if (ret) {
1008 mlog_errno(ret); 1008 mlog_errno(ret);
1009 goto out; 1009 goto out;
1010 } 1010 }
1011 } 1011 }
1012 1012
1013 /* 1013 /*
1014 * Call this even if we don't add any clusters to the tree. We 1014 * Call this even if we don't add any clusters to the tree. We
1015 * still need to zero the area between the old i_size and the 1015 * still need to zero the area between the old i_size and the
1016 * new i_size. 1016 * new i_size.
1017 */ 1017 */
1018 ret = ocfs2_zero_extend(inode, di_bh, zero_to); 1018 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1019 if (ret < 0) 1019 if (ret < 0)
1020 mlog_errno(ret); 1020 mlog_errno(ret);
1021 1021
1022 out: 1022 out:
1023 return ret; 1023 return ret;
1024 } 1024 }
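
The clusters_to_add computation above rounds the target size up to whole clusters and subtracts what the inode already holds, clamping at zero so a target smaller than the current allocation never underflows. A self-contained sketch of that arithmetic, assuming a hypothetical 32 KiB cluster size and the usual round-up definition of ocfs2_clusters_for_bytes():

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical 32 KiB cluster, i.e. s_clustersize_bits == 15. */
	#define CLUSTERSIZE_BITS 15

	/* Round a byte count up to whole clusters. */
	static uint32_t clusters_for_bytes(uint64_t bytes)
	{
		return (uint32_t)((bytes + (1ULL << CLUSTERSIZE_BITS) - 1)
				  >> CLUSTERSIZE_BITS);
	}

	/* How many clusters must be added to grow to new_i_size bytes? */
	static uint32_t clusters_to_add(uint64_t new_i_size,
					uint32_t ip_clusters)
	{
		uint32_t needed = clusters_for_bytes(new_i_size);

		return needed < ip_clusters ? 0 : needed - ip_clusters;
	}

	int main(void)
	{
		/* 100000 bytes needs 4 clusters; 2 already held -> add 2. */
		printf("%u\n", clusters_to_add(100000, 2));
		return 0;
	}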
1025 1025
1026 static int ocfs2_extend_file(struct inode *inode, 1026 static int ocfs2_extend_file(struct inode *inode,
1027 struct buffer_head *di_bh, 1027 struct buffer_head *di_bh,
1028 u64 new_i_size) 1028 u64 new_i_size)
1029 { 1029 {
1030 int ret = 0; 1030 int ret = 0;
1031 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1031 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1032 1032
1033 BUG_ON(!di_bh); 1033 BUG_ON(!di_bh);
1034 1034
1035 /* setattr sometimes calls us like this. */ 1035 /* setattr sometimes calls us like this. */
1036 if (new_i_size == 0) 1036 if (new_i_size == 0)
1037 goto out; 1037 goto out;
1038 1038
1039 if (i_size_read(inode) == new_i_size) 1039 if (i_size_read(inode) == new_i_size)
1040 goto out; 1040 goto out;
1041 BUG_ON(new_i_size < i_size_read(inode)); 1041 BUG_ON(new_i_size < i_size_read(inode));
1042 1042
1043 /* 1043 /*
1044 * The alloc sem blocks people in read/write from reading our 1044 * The alloc sem blocks people in read/write from reading our
1045 * allocation until we're done changing it. We depend on 1045 * allocation until we're done changing it. We depend on
1046 * i_mutex to block other extend/truncate calls while we're 1046 * i_mutex to block other extend/truncate calls while we're
1047 * here. We even have to hold it for sparse files because there 1047 * here. We even have to hold it for sparse files because there
1048 * might be some tail zeroing. 1048 * might be some tail zeroing.
1049 */ 1049 */
1050 down_write(&oi->ip_alloc_sem); 1050 down_write(&oi->ip_alloc_sem);
1051 1051
1052 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1052 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1053 /* 1053 /*
1054 * We can optimize small extends by keeping the inode's 1054 * We can optimize small extends by keeping the inode's
1055 * inline data. 1055 * inline data.
1056 */ 1056 */
1057 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { 1057 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1058 up_write(&oi->ip_alloc_sem); 1058 up_write(&oi->ip_alloc_sem);
1059 goto out_update_size; 1059 goto out_update_size;
1060 } 1060 }
1061 1061
1062 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1062 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1063 if (ret) { 1063 if (ret) {
1064 up_write(&oi->ip_alloc_sem); 1064 up_write(&oi->ip_alloc_sem);
1065 mlog_errno(ret); 1065 mlog_errno(ret);
1066 goto out; 1066 goto out;
1067 } 1067 }
1068 } 1068 }
1069 1069
1070 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1070 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1071 ret = ocfs2_zero_extend(inode, di_bh, new_i_size); 1071 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1072 else 1072 else
1073 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, 1073 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1074 new_i_size); 1074 new_i_size);
1075 1075
1076 up_write(&oi->ip_alloc_sem); 1076 up_write(&oi->ip_alloc_sem);
1077 1077
1078 if (ret < 0) { 1078 if (ret < 0) {
1079 mlog_errno(ret); 1079 mlog_errno(ret);
1080 goto out; 1080 goto out;
1081 } 1081 }
1082 1082
1083 out_update_size: 1083 out_update_size:
1084 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 1084 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1085 if (ret < 0) 1085 if (ret < 0)
1086 mlog_errno(ret); 1086 mlog_errno(ret);
1087 1087
1088 out: 1088 out:
1089 return ret; 1089 return ret;
1090 } 1090 }
1091 1091
1092 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 1092 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1093 { 1093 {
1094 int status = 0, size_change; 1094 int status = 0, size_change;
1095 struct inode *inode = dentry->d_inode; 1095 struct inode *inode = dentry->d_inode;
1096 struct super_block *sb = inode->i_sb; 1096 struct super_block *sb = inode->i_sb;
1097 struct ocfs2_super *osb = OCFS2_SB(sb); 1097 struct ocfs2_super *osb = OCFS2_SB(sb);
1098 struct buffer_head *bh = NULL; 1098 struct buffer_head *bh = NULL;
1099 handle_t *handle = NULL; 1099 handle_t *handle = NULL;
1100 struct dquot *transfer_to[MAXQUOTAS] = { }; 1100 struct dquot *transfer_to[MAXQUOTAS] = { };
1101 int qtype; 1101 int qtype;
1102 1102
1103 trace_ocfs2_setattr(inode, dentry, 1103 trace_ocfs2_setattr(inode, dentry,
1104 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1104 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1105 dentry->d_name.len, dentry->d_name.name, 1105 dentry->d_name.len, dentry->d_name.name,
1106 attr->ia_valid, attr->ia_mode, 1106 attr->ia_valid, attr->ia_mode,
1107 attr->ia_uid, attr->ia_gid); 1107 attr->ia_uid, attr->ia_gid);
1108 1108
1109 /* ensuring we don't even attempt to truncate a symlink */ 1109 /* ensuring we don't even attempt to truncate a symlink */
1110 if (S_ISLNK(inode->i_mode)) 1110 if (S_ISLNK(inode->i_mode))
1111 attr->ia_valid &= ~ATTR_SIZE; 1111 attr->ia_valid &= ~ATTR_SIZE;
1112 1112
1113 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 1113 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1114 | ATTR_GID | ATTR_UID | ATTR_MODE) 1114 | ATTR_GID | ATTR_UID | ATTR_MODE)
1115 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) 1115 if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1116 return 0; 1116 return 0;
1117 1117
1118 status = inode_change_ok(inode, attr); 1118 status = inode_change_ok(inode, attr);
1119 if (status) 1119 if (status)
1120 return status; 1120 return status;
1121 1121
1122 if (is_quota_modification(inode, attr)) 1122 if (is_quota_modification(inode, attr))
1123 dquot_initialize(inode); 1123 dquot_initialize(inode);
1124 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 1124 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1125 if (size_change) { 1125 if (size_change) {
1126 status = ocfs2_rw_lock(inode, 1); 1126 status = ocfs2_rw_lock(inode, 1);
1127 if (status < 0) { 1127 if (status < 0) {
1128 mlog_errno(status); 1128 mlog_errno(status);
1129 goto bail; 1129 goto bail;
1130 } 1130 }
1131 } 1131 }
1132 1132
1133 status = ocfs2_inode_lock(inode, &bh, 1); 1133 status = ocfs2_inode_lock(inode, &bh, 1);
1134 if (status < 0) { 1134 if (status < 0) {
1135 if (status != -ENOENT) 1135 if (status != -ENOENT)
1136 mlog_errno(status); 1136 mlog_errno(status);
1137 goto bail_unlock_rw; 1137 goto bail_unlock_rw;
1138 } 1138 }
1139 1139
1140 if (size_change && attr->ia_size != i_size_read(inode)) { 1140 if (size_change && attr->ia_size != i_size_read(inode)) {
1141 status = inode_newsize_ok(inode, attr->ia_size); 1141 status = inode_newsize_ok(inode, attr->ia_size);
1142 if (status) 1142 if (status)
1143 goto bail_unlock; 1143 goto bail_unlock;
1144 1144
1145 inode_dio_wait(inode);
1146
1145 if (i_size_read(inode) > attr->ia_size) { 1147 if (i_size_read(inode) > attr->ia_size) {
1146 if (ocfs2_should_order_data(inode)) { 1148 if (ocfs2_should_order_data(inode)) {
1147 status = ocfs2_begin_ordered_truncate(inode, 1149 status = ocfs2_begin_ordered_truncate(inode,
1148 attr->ia_size); 1150 attr->ia_size);
1149 if (status) 1151 if (status)
1150 goto bail_unlock; 1152 goto bail_unlock;
1151 } 1153 }
1152 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 1154 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1153 } else 1155 } else
1154 status = ocfs2_extend_file(inode, bh, attr->ia_size); 1156 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1155 if (status < 0) { 1157 if (status < 0) {
1156 if (status != -ENOSPC) 1158 if (status != -ENOSPC)
1157 mlog_errno(status); 1159 mlog_errno(status);
1158 status = -ENOSPC; 1160 status = -ENOSPC;
1159 goto bail_unlock; 1161 goto bail_unlock;
1160 } 1162 }
1161 } 1163 }
1162 1164
1163 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 1165 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
1164 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 1166 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
1165 /* 1167 /*
1166 * Gather pointers to quota structures so that allocation / 1168 * Gather pointers to quota structures so that allocation /
1167 * freeing of quota structures happens here and not inside 1169 * freeing of quota structures happens here and not inside
1168 * dquot_transfer() where we have problems with lock ordering 1170 * dquot_transfer() where we have problems with lock ordering
1169 */ 1171 */
1170 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1172 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1171 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1173 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1172 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1174 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1173 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1175 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1174 USRQUOTA); 1176 USRQUOTA);
1175 if (!transfer_to[USRQUOTA]) { 1177 if (!transfer_to[USRQUOTA]) {
1176 status = -ESRCH; 1178 status = -ESRCH;
1177 goto bail_unlock; 1179 goto bail_unlock;
1178 } 1180 }
1179 } 1181 }
1180 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 1182 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
1181 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1183 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1182 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1184 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1183 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1185 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1184 GRPQUOTA); 1186 GRPQUOTA);
1185 if (!transfer_to[GRPQUOTA]) { 1187 if (!transfer_to[GRPQUOTA]) {
1186 status = -ESRCH; 1188 status = -ESRCH;
1187 goto bail_unlock; 1189 goto bail_unlock;
1188 } 1190 }
1189 } 1191 }
1190 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 1192 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1191 2 * ocfs2_quota_trans_credits(sb)); 1193 2 * ocfs2_quota_trans_credits(sb));
1192 if (IS_ERR(handle)) { 1194 if (IS_ERR(handle)) {
1193 status = PTR_ERR(handle); 1195 status = PTR_ERR(handle);
1194 mlog_errno(status); 1196 mlog_errno(status);
1195 goto bail_unlock; 1197 goto bail_unlock;
1196 } 1198 }
1197 status = __dquot_transfer(inode, transfer_to); 1199 status = __dquot_transfer(inode, transfer_to);
1198 if (status < 0) 1200 if (status < 0)
1199 goto bail_commit; 1201 goto bail_commit;
1200 } else { 1202 } else {
1201 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1203 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1202 if (IS_ERR(handle)) { 1204 if (IS_ERR(handle)) {
1203 status = PTR_ERR(handle); 1205 status = PTR_ERR(handle);
1204 mlog_errno(status); 1206 mlog_errno(status);
1205 goto bail_unlock; 1207 goto bail_unlock;
1206 } 1208 }
1207 } 1209 }
1208 1210
1209 /* 1211 /*
1210 * This will intentionally not wind up calling truncate_setsize(), 1212 * This will intentionally not wind up calling truncate_setsize(),
1211 * since all the work for a size change has been done above. 1213 * since all the work for a size change has been done above.
1212 * Otherwise, we could get into problems with truncate as 1214 * Otherwise, we could get into problems with truncate as
1213 * ip_alloc_sem is used there to protect against i_size 1215 * ip_alloc_sem is used there to protect against i_size
1214 * changes. 1216 * changes.
1215 * 1217 *
1216 * XXX: this means the conditional below can probably be removed. 1218 * XXX: this means the conditional below can probably be removed.
1217 */ 1219 */
1218 if ((attr->ia_valid & ATTR_SIZE) && 1220 if ((attr->ia_valid & ATTR_SIZE) &&
1219 attr->ia_size != i_size_read(inode)) { 1221 attr->ia_size != i_size_read(inode)) {
1220 status = vmtruncate(inode, attr->ia_size); 1222 status = vmtruncate(inode, attr->ia_size);
1221 if (status) { 1223 if (status) {
1222 mlog_errno(status); 1224 mlog_errno(status);
1223 goto bail_commit; 1225 goto bail_commit;
1224 } 1226 }
1225 } 1227 }
1226 1228
1227 setattr_copy(inode, attr); 1229 setattr_copy(inode, attr);
1228 mark_inode_dirty(inode); 1230 mark_inode_dirty(inode);
1229 1231
1230 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1232 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1231 if (status < 0) 1233 if (status < 0)
1232 mlog_errno(status); 1234 mlog_errno(status);
1233 1235
1234 bail_commit: 1236 bail_commit:
1235 ocfs2_commit_trans(osb, handle); 1237 ocfs2_commit_trans(osb, handle);
1236 bail_unlock: 1238 bail_unlock:
1237 ocfs2_inode_unlock(inode, 1); 1239 ocfs2_inode_unlock(inode, 1);
1238 bail_unlock_rw: 1240 bail_unlock_rw:
1239 if (size_change) 1241 if (size_change)
1240 ocfs2_rw_unlock(inode, 1); 1242 ocfs2_rw_unlock(inode, 1);
1241 bail: 1243 bail:
1242 brelse(bh); 1244 brelse(bh);
1243 1245
1244 /* Release quota pointers in case we acquired them */ 1246 /* Release quota pointers in case we acquired them */
1245 for (qtype = 0; qtype < MAXQUOTAS; qtype++) 1247 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1246 dqput(transfer_to[qtype]); 1248 dqput(transfer_to[qtype]);
1247 1249
1248 if (!status && attr->ia_valid & ATTR_MODE) { 1250 if (!status && attr->ia_valid & ATTR_MODE) {
1249 status = ocfs2_acl_chmod(inode); 1251 status = ocfs2_acl_chmod(inode);
1250 if (status < 0) 1252 if (status < 0)
1251 mlog_errno(status); 1253 mlog_errno(status);
1252 } 1254 }
1253 1255
1254 return status; 1256 return status;
1255 } 1257 }
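
For a size change, ocfs2_setattr() layers its protection in a fixed order: the rw lock first, then the cluster inode lock, then inode_dio_wait() once no new direct I/O can start, and only then the truncate or extend. A toy userspace model of that ordering, with print stubs standing in for the real (failable, cluster-aware) calls:

	#include <stdio.h>

	/* Stubs modelling the locking steps in ocfs2_setattr(). */
	static void rw_lock(void)      { puts("ocfs2_rw_lock(inode, 1)"); }
	static void inode_lock(void)   { puts("ocfs2_inode_lock(inode, &bh, 1)"); }
	static void dio_wait(void)     { puts("inode_dio_wait(inode)"); }
	static void change_size(void)  { puts("truncate or extend file"); }
	static void inode_unlock(void) { puts("ocfs2_inode_unlock(inode, 1)"); }
	static void rw_unlock(void)    { puts("ocfs2_rw_unlock(inode, 1)"); }

	int main(void)
	{
		rw_lock();	/* blocks cluster-wide writers, incl. new O_DIRECT */
		inode_lock();	/* takes the cluster lock, refreshes the dinode */
		dio_wait();	/* drains direct I/O already in flight */
		change_size();	/* safe: no dio can race with the size change */
		inode_unlock();
		rw_unlock();
		return 0;
	}

Because the wait happens after both locks are held, no new direct I/O reference can appear between the drain and the size change.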
1256 1258
1257 int ocfs2_getattr(struct vfsmount *mnt, 1259 int ocfs2_getattr(struct vfsmount *mnt,
1258 struct dentry *dentry, 1260 struct dentry *dentry,
1259 struct kstat *stat) 1261 struct kstat *stat)
1260 { 1262 {
1261 struct inode *inode = dentry->d_inode; 1263 struct inode *inode = dentry->d_inode;
1262 struct super_block *sb = dentry->d_inode->i_sb; 1264 struct super_block *sb = dentry->d_inode->i_sb;
1263 struct ocfs2_super *osb = sb->s_fs_info; 1265 struct ocfs2_super *osb = sb->s_fs_info;
1264 int err; 1266 int err;
1265 1267
1266 err = ocfs2_inode_revalidate(dentry); 1268 err = ocfs2_inode_revalidate(dentry);
1267 if (err) { 1269 if (err) {
1268 if (err != -ENOENT) 1270 if (err != -ENOENT)
1269 mlog_errno(err); 1271 mlog_errno(err);
1270 goto bail; 1272 goto bail;
1271 } 1273 }
1272 1274
1273 generic_fillattr(inode, stat); 1275 generic_fillattr(inode, stat);
1274 1276
1275 /* We set the blksize from the cluster size for performance */ 1277 /* We set the blksize from the cluster size for performance */
1276 stat->blksize = osb->s_clustersize; 1278 stat->blksize = osb->s_clustersize;
1277 1279
1278 bail: 1280 bail:
1279 return err; 1281 return err;
1280 } 1282 }
1281 1283
1282 int ocfs2_permission(struct inode *inode, int mask) 1284 int ocfs2_permission(struct inode *inode, int mask)
1283 { 1285 {
1284 int ret; 1286 int ret;
1285 1287
1286 if (mask & MAY_NOT_BLOCK) 1288 if (mask & MAY_NOT_BLOCK)
1287 return -ECHILD; 1289 return -ECHILD;
1288 1290
1289 ret = ocfs2_inode_lock(inode, NULL, 0); 1291 ret = ocfs2_inode_lock(inode, NULL, 0);
1290 if (ret) { 1292 if (ret) {
1291 if (ret != -ENOENT) 1293 if (ret != -ENOENT)
1292 mlog_errno(ret); 1294 mlog_errno(ret);
1293 goto out; 1295 goto out;
1294 } 1296 }
1295 1297
1296 ret = generic_permission(inode, mask); 1298 ret = generic_permission(inode, mask);
1297 1299
1298 ocfs2_inode_unlock(inode, 0); 1300 ocfs2_inode_unlock(inode, 0);
1299 out: 1301 out:
1300 return ret; 1302 return ret;
1301 } 1303 }
1302 1304
1303 static int __ocfs2_write_remove_suid(struct inode *inode, 1305 static int __ocfs2_write_remove_suid(struct inode *inode,
1304 struct buffer_head *bh) 1306 struct buffer_head *bh)
1305 { 1307 {
1306 int ret; 1308 int ret;
1307 handle_t *handle; 1309 handle_t *handle;
1308 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1310 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1309 struct ocfs2_dinode *di; 1311 struct ocfs2_dinode *di;
1310 1312
1311 trace_ocfs2_write_remove_suid( 1313 trace_ocfs2_write_remove_suid(
1312 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1314 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1313 inode->i_mode); 1315 inode->i_mode);
1314 1316
1315 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1316 if (IS_ERR(handle)) { 1318 if (IS_ERR(handle)) {
1317 ret = PTR_ERR(handle); 1319 ret = PTR_ERR(handle);
1318 mlog_errno(ret); 1320 mlog_errno(ret);
1319 goto out; 1321 goto out;
1320 } 1322 }
1321 1323
1322 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1324 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1323 OCFS2_JOURNAL_ACCESS_WRITE); 1325 OCFS2_JOURNAL_ACCESS_WRITE);
1324 if (ret < 0) { 1326 if (ret < 0) {
1325 mlog_errno(ret); 1327 mlog_errno(ret);
1326 goto out_trans; 1328 goto out_trans;
1327 } 1329 }
1328 1330
1329 inode->i_mode &= ~S_ISUID; 1331 inode->i_mode &= ~S_ISUID;
1330 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1332 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1331 inode->i_mode &= ~S_ISGID; 1333 inode->i_mode &= ~S_ISGID;
1332 1334
1333 di = (struct ocfs2_dinode *) bh->b_data; 1335 di = (struct ocfs2_dinode *) bh->b_data;
1334 di->i_mode = cpu_to_le16(inode->i_mode); 1336 di->i_mode = cpu_to_le16(inode->i_mode);
1335 1337
1336 ocfs2_journal_dirty(handle, bh); 1338 ocfs2_journal_dirty(handle, bh);
1337 1339
1338 out_trans: 1340 out_trans:
1339 ocfs2_commit_trans(osb, handle); 1341 ocfs2_commit_trans(osb, handle);
1340 out: 1342 out:
1341 return ret; 1343 return ret;
1342 } 1344 }
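
__ocfs2_write_remove_suid() always drops the setuid bit but drops setgid only when group-execute is also set; setgid without group-exec conventionally marks mandatory locking rather than a privilege, so it is left alone. A standalone sketch of just the mode-bit rule:

	#include <assert.h>
	#include <sys/stat.h>

	/* Mirror the mode-bit logic above: always clear setuid; clear
	 * setgid only when it actually grants privileges. */
	static mode_t remove_suid(mode_t mode)
	{
		mode &= ~S_ISUID;
		if ((mode & S_ISGID) && (mode & S_IXGRP))
			mode &= ~S_ISGID;
		return mode;
	}

	int main(void)
	{
		assert(remove_suid(S_ISUID | S_ISGID | 0755) == 0755);
		/* setgid without group-exec is left alone */
		assert(remove_suid(S_ISGID | 0644) == (S_ISGID | 0644));
		return 0;
	}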
1343 1345
1344 /* 1346 /*
1345 * Will look for holes and unwritten extents in the range starting at 1347 * Will look for holes and unwritten extents in the range starting at
1346 * pos for count bytes (inclusive). 1348 * pos for count bytes (inclusive).
1347 */ 1349 */
1348 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1350 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1349 size_t count) 1351 size_t count)
1350 { 1352 {
1351 int ret = 0; 1353 int ret = 0;
1352 unsigned int extent_flags; 1354 unsigned int extent_flags;
1353 u32 cpos, clusters, extent_len, phys_cpos; 1355 u32 cpos, clusters, extent_len, phys_cpos;
1354 struct super_block *sb = inode->i_sb; 1356 struct super_block *sb = inode->i_sb;
1355 1357
1356 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1358 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1357 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1359 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1358 1360
1359 while (clusters) { 1361 while (clusters) {
1360 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1362 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1361 &extent_flags); 1363 &extent_flags);
1362 if (ret < 0) { 1364 if (ret < 0) {
1363 mlog_errno(ret); 1365 mlog_errno(ret);
1364 goto out; 1366 goto out;
1365 } 1367 }
1366 1368
1367 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1369 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1368 ret = 1; 1370 ret = 1;
1369 break; 1371 break;
1370 } 1372 }
1371 1373
1372 if (extent_len > clusters) 1374 if (extent_len > clusters)
1373 extent_len = clusters; 1375 extent_len = clusters;
1374 1376
1375 clusters -= extent_len; 1377 clusters -= extent_len;
1376 cpos += extent_len; 1378 cpos += extent_len;
1377 } 1379 }
1378 out: 1380 out:
1379 return ret; 1381 return ret;
1380 } 1382 }
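
ocfs2_check_range_for_holes() first converts the byte range into a cluster range: round pos down to find the first cluster touched, and round pos + count up to get one past the last. A minimal sketch of that conversion, again assuming hypothetical 32 KiB clusters:

	#include <stdint.h>
	#include <stdio.h>

	#define CBITS 15		/* hypothetical 32 KiB clusters */
	#define CSIZE (1ULL << CBITS)

	int main(void)
	{
		uint64_t pos = 40000, count = 100000;

		/* First cluster touched: round pos down... */
		uint32_t cpos = (uint32_t)(pos >> CBITS);
		/* ...then count clusters to the end, rounding up. */
		uint32_t clusters =
			(uint32_t)((pos + count + CSIZE - 1) >> CBITS) - cpos;

		printf("scan clusters [%u, %u)\n", cpos, cpos + clusters);
		return 0;
	}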
1381 1383
1382 static int ocfs2_write_remove_suid(struct inode *inode) 1384 static int ocfs2_write_remove_suid(struct inode *inode)
1383 { 1385 {
1384 int ret; 1386 int ret;
1385 struct buffer_head *bh = NULL; 1387 struct buffer_head *bh = NULL;
1386 1388
1387 ret = ocfs2_read_inode_block(inode, &bh); 1389 ret = ocfs2_read_inode_block(inode, &bh);
1388 if (ret < 0) { 1390 if (ret < 0) {
1389 mlog_errno(ret); 1391 mlog_errno(ret);
1390 goto out; 1392 goto out;
1391 } 1393 }
1392 1394
1393 ret = __ocfs2_write_remove_suid(inode, bh); 1395 ret = __ocfs2_write_remove_suid(inode, bh);
1394 out: 1396 out:
1395 brelse(bh); 1397 brelse(bh);
1396 return ret; 1398 return ret;
1397 } 1399 }
1398 1400
1399 /* 1401 /*
1400 * Allocate enough extents to cover the region starting at byte offset 1402 * Allocate enough extents to cover the region starting at byte offset
1401 * start for len bytes. Existing extents are skipped, any extents 1403 * start for len bytes. Existing extents are skipped, any extents
1402 * added are marked as "unwritten". 1404 * added are marked as "unwritten".
1403 */ 1405 */
1404 static int ocfs2_allocate_unwritten_extents(struct inode *inode, 1406 static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1405 u64 start, u64 len) 1407 u64 start, u64 len)
1406 { 1408 {
1407 int ret; 1409 int ret;
1408 u32 cpos, phys_cpos, clusters, alloc_size; 1410 u32 cpos, phys_cpos, clusters, alloc_size;
1409 u64 end = start + len; 1411 u64 end = start + len;
1410 struct buffer_head *di_bh = NULL; 1412 struct buffer_head *di_bh = NULL;
1411 1413
1412 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1414 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1413 ret = ocfs2_read_inode_block(inode, &di_bh); 1415 ret = ocfs2_read_inode_block(inode, &di_bh);
1414 if (ret) { 1416 if (ret) {
1415 mlog_errno(ret); 1417 mlog_errno(ret);
1416 goto out; 1418 goto out;
1417 } 1419 }
1418 1420
1419 /* 1421 /*
1420 * Nothing to do if the requested reservation range 1422 * Nothing to do if the requested reservation range
1421 * fits within the inode. 1423 * fits within the inode.
1422 */ 1424 */
1423 if (ocfs2_size_fits_inline_data(di_bh, end)) 1425 if (ocfs2_size_fits_inline_data(di_bh, end))
1424 goto out; 1426 goto out;
1425 1427
1426 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1428 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1427 if (ret) { 1429 if (ret) {
1428 mlog_errno(ret); 1430 mlog_errno(ret);
1429 goto out; 1431 goto out;
1430 } 1432 }
1431 } 1433 }
1432 1434
1433 /* 1435 /*
1434 * We consider both start and len to be inclusive. 1436 * We consider both start and len to be inclusive.
1435 */ 1437 */
1436 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1438 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1437 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); 1439 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1438 clusters -= cpos; 1440 clusters -= cpos;
1439 1441
1440 while (clusters) { 1442 while (clusters) {
1441 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1443 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1442 &alloc_size, NULL); 1444 &alloc_size, NULL);
1443 if (ret) { 1445 if (ret) {
1444 mlog_errno(ret); 1446 mlog_errno(ret);
1445 goto out; 1447 goto out;
1446 } 1448 }
1447 1449
1448 /* 1450 /*
1449 * Hole or existing extent len can be arbitrary, so 1451 * Hole or existing extent len can be arbitrary, so
1450 * cap it to our own allocation request. 1452 * cap it to our own allocation request.
1451 */ 1453 */
1452 if (alloc_size > clusters) 1454 if (alloc_size > clusters)
1453 alloc_size = clusters; 1455 alloc_size = clusters;
1454 1456
1455 if (phys_cpos) { 1457 if (phys_cpos) {
1456 /* 1458 /*
1457 * We already have an allocation at this 1459 * We already have an allocation at this
1458 * region so we can safely skip it. 1460 * region so we can safely skip it.
1459 */ 1461 */
1460 goto next; 1462 goto next;
1461 } 1463 }
1462 1464
1463 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); 1465 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1464 if (ret) { 1466 if (ret) {
1465 if (ret != -ENOSPC) 1467 if (ret != -ENOSPC)
1466 mlog_errno(ret); 1468 mlog_errno(ret);
1467 goto out; 1469 goto out;
1468 } 1470 }
1469 1471
1470 next: 1472 next:
1471 cpos += alloc_size; 1473 cpos += alloc_size;
1472 clusters -= alloc_size; 1474 clusters -= alloc_size;
1473 } 1475 }
1474 1476
1475 ret = 0; 1477 ret = 0;
1476 out: 1478 out:
1477 1479
1478 brelse(di_bh); 1480 brelse(di_bh);
1479 return ret; 1481 return ret;
1480 } 1482 }
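
The allocation loop above walks the cluster range run by run, skipping runs that already have a physical mapping and extending allocation (as unwritten) only over holes. A toy userspace model with a fake extent map in place of ocfs2_get_clusters():

	#include <stdint.h>
	#include <stdio.h>

	/* Toy extent map: phys[i] != 0 means cluster i is allocated. */
	static uint32_t phys[16] = { 1, 1, 0, 0, 0, 1, 0, 0,
				     1, 1, 1, 0, 0, 0, 0, 0 };

	/* Length of the contiguous run at cpos and whether it is a hole
	 * (stands in for ocfs2_get_clusters()). */
	static uint32_t run_len(uint32_t cpos, uint32_t max, int *is_hole)
	{
		uint32_t n = 1;

		*is_hole = (phys[cpos] == 0);
		while (n < max && (phys[cpos + n] == 0) == *is_hole)
			n++;
		return n;
	}

	int main(void)
	{
		uint32_t cpos = 0, clusters = 16, alloc_size;
		int is_hole;

		while (clusters) {
			alloc_size = run_len(cpos, clusters, &is_hole);
			if (is_hole)	/* only holes get unwritten extents */
				printf("allocate unwritten [%u, %u)\n",
				       cpos, cpos + alloc_size);
			cpos += alloc_size;
			clusters -= alloc_size;
		}
		return 0;
	}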
1481 1483
1482 /* 1484 /*
1483 * Truncate a byte range, avoiding pages within partial clusters. This 1485 * Truncate a byte range, avoiding pages within partial clusters. This
1484 * preserves those pages for the zeroing code to write to. 1486 * preserves those pages for the zeroing code to write to.
1485 */ 1487 */
1486 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1488 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1487 u64 byte_len) 1489 u64 byte_len)
1488 { 1490 {
1489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1491 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1490 loff_t start, end; 1492 loff_t start, end;
1491 struct address_space *mapping = inode->i_mapping; 1493 struct address_space *mapping = inode->i_mapping;
1492 1494
1493 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1495 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1494 end = byte_start + byte_len; 1496 end = byte_start + byte_len;
1495 end = end & ~(osb->s_clustersize - 1); 1497 end = end & ~(osb->s_clustersize - 1);
1496 1498
1497 if (start < end) { 1499 if (start < end) {
1498 unmap_mapping_range(mapping, start, end - start, 0); 1500 unmap_mapping_range(mapping, start, end - start, 0);
1499 truncate_inode_pages_range(mapping, start, end - 1); 1501 truncate_inode_pages_range(mapping, start, end - 1);
1500 } 1502 }
1501 } 1503 }
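
Note the asymmetric rounding: the start of the range is rounded up to a cluster boundary and the end rounded down, so pages in the partial edge clusters survive for the zeroing code; if the whole range lies within partial clusters, nothing is dropped at all. A sketch of that alignment arithmetic, assuming a hypothetical 32 KiB cluster:

	#include <stdint.h>
	#include <stdio.h>

	#define CSIZE 32768ULL	/* hypothetical 32 KiB cluster */

	int main(void)
	{
		uint64_t byte_start = 40000, byte_len = 100000;

		/* Round the start UP and the end DOWN to cluster
		 * boundaries, preserving pages in partial edge clusters. */
		uint64_t start = (byte_start + CSIZE - 1) & ~(CSIZE - 1);
		uint64_t end = (byte_start + byte_len) & ~(CSIZE - 1);

		if (start < end)
			printf("drop pages in [%llu, %llu)\n",
			       (unsigned long long)start,
			       (unsigned long long)end);
		else
			puts("range within partial clusters; nothing to drop");
		return 0;
	}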
1502 1504
1503 static int ocfs2_zero_partial_clusters(struct inode *inode, 1505 static int ocfs2_zero_partial_clusters(struct inode *inode,
1504 u64 start, u64 len) 1506 u64 start, u64 len)
1505 { 1507 {
1506 int ret = 0; 1508 int ret = 0;
1507 u64 tmpend, end = start + len; 1509 u64 tmpend, end = start + len;
1508 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1510 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1509 unsigned int csize = osb->s_clustersize; 1511 unsigned int csize = osb->s_clustersize;
1510 handle_t *handle; 1512 handle_t *handle;
1511 1513
1512 /* 1514 /*
1513 * The "start" and "end" values are NOT necessarily part of 1515 * The "start" and "end" values are NOT necessarily part of
1514 * the range whose allocation is being deleted. Rather, this 1516 * the range whose allocation is being deleted. Rather, this
1515 * is what the user passed in with the request. We must zero 1517 * is what the user passed in with the request. We must zero
1516 * partial clusters here. There's no need to worry about 1518 * partial clusters here. There's no need to worry about
1517 * physical allocation - the zeroing code knows to skip holes. 1519 * physical allocation - the zeroing code knows to skip holes.
1518 */ 1520 */
1519 trace_ocfs2_zero_partial_clusters( 1521 trace_ocfs2_zero_partial_clusters(
1520 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1522 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1521 (unsigned long long)start, (unsigned long long)end); 1523 (unsigned long long)start, (unsigned long long)end);
1522 1524
1523 /* 1525 /*
1524 * If both edges are on a cluster boundary then there's no 1526 * If both edges are on a cluster boundary then there's no
1525 * zeroing required as the region is part of the allocation to 1527 * zeroing required as the region is part of the allocation to
1526 * be truncated. 1528 * be truncated.
1527 */ 1529 */
1528 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1530 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1529 goto out; 1531 goto out;
1530 1532
1531 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1533 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1532 if (IS_ERR(handle)) { 1534 if (IS_ERR(handle)) {
1533 ret = PTR_ERR(handle); 1535 ret = PTR_ERR(handle);
1534 mlog_errno(ret); 1536 mlog_errno(ret);
1535 goto out; 1537 goto out;
1536 } 1538 }
1537 1539
1538 /* 1540 /*
1539 * We want to get the byte offset of the end of the 1st cluster. 1541 * We want to get the byte offset of the end of the 1st cluster.
1540 */ 1542 */
1541 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1543 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1542 if (tmpend > end) 1544 if (tmpend > end)
1543 tmpend = end; 1545 tmpend = end;
1544 1546
1545 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, 1547 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
1546 (unsigned long long)tmpend); 1548 (unsigned long long)tmpend);
1547 1549
1548 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1550 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1549 if (ret) 1551 if (ret)
1550 mlog_errno(ret); 1552 mlog_errno(ret);
1551 1553
1552 if (tmpend < end) { 1554 if (tmpend < end) {
1553 /* 1555 /*
1554 * This may make start and end equal, but the zeroing 1556 * This may make start and end equal, but the zeroing
1555 * code will skip any work in that case so there's no 1557 * code will skip any work in that case so there's no
1556 * need to catch it up here. 1558 * need to catch it up here.
1557 */ 1559 */
1558 start = end & ~(osb->s_clustersize - 1); 1560 start = end & ~(osb->s_clustersize - 1);
1559 1561
1560 trace_ocfs2_zero_partial_clusters_range2( 1562 trace_ocfs2_zero_partial_clusters_range2(
1561 (unsigned long long)start, (unsigned long long)end); 1563 (unsigned long long)start, (unsigned long long)end);
1562 1564
1563 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1565 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1564 if (ret) 1566 if (ret)
1565 mlog_errno(ret); 1567 mlog_errno(ret);
1566 } 1568 }
1567 1569
1568 ocfs2_commit_trans(osb, handle); 1570 ocfs2_commit_trans(osb, handle);
1569 out: 1571 out:
1570 return ret; 1572 return ret;
1571 } 1573 }
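
So at most two subranges get zeroed: from start to the end of its cluster (capped at end), and, if anything remains, from the last cluster boundary up to end. A standalone rerun of that edge computation under the same hypothetical 32 KiB cluster size:

	#include <stdint.h>
	#include <stdio.h>

	#define CSIZE 32768ULL	/* hypothetical 32 KiB cluster */

	int main(void)
	{
		uint64_t start = 40000, len = 100000, end = start + len;
		uint64_t tmpend;

		/* Fully aligned ranges need no edge zeroing at all. */
		if ((start & (CSIZE - 1)) == 0 && (end & (CSIZE - 1)) == 0) {
			puts("aligned: nothing to zero");
			return 0;
		}

		/* First pass: start to the end of its cluster (or end). */
		tmpend = (start & ~(CSIZE - 1)) + CSIZE;
		if (tmpend > end)
			tmpend = end;
		printf("zero head [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)tmpend);

		/* Second pass: last cluster boundary up to end. */
		if (tmpend < end)
			printf("zero tail [%llu, %llu)\n",
			       (unsigned long long)(end & ~(CSIZE - 1)),
			       (unsigned long long)end);
		return 0;
	}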
1572 1574
1573 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) 1575 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1574 { 1576 {
1575 int i; 1577 int i;
1576 struct ocfs2_extent_rec *rec = NULL; 1578 struct ocfs2_extent_rec *rec = NULL;
1577 1579
1578 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { 1580 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1579 1581
1580 rec = &el->l_recs[i]; 1582 rec = &el->l_recs[i];
1581 1583
1582 if (le32_to_cpu(rec->e_cpos) < pos) 1584 if (le32_to_cpu(rec->e_cpos) < pos)
1583 break; 1585 break;
1584 } 1586 }
1585 1587
1586 return i; 1588 return i;
1587 } 1589 }
1588 1590
1589 /* 1591 /*
1590 * Helper to calculate the punching pos and length in one run. We handle 1592 * Helper to calculate the punching pos and length in one run. We handle
1591 * the following three cases in order: 1593 * the following three cases in order:
1592 * 1594 *
1593 * - remove the entire record 1595 * - remove the entire record
1594 * - remove a partial record 1596 * - remove a partial record
1595 * - no record needs to be removed (hole-punching completed) 1597 * - no record needs to be removed (hole-punching completed)
1596 */ 1598 */
1597 static void ocfs2_calc_trunc_pos(struct inode *inode, 1599 static void ocfs2_calc_trunc_pos(struct inode *inode,
1598 struct ocfs2_extent_list *el, 1600 struct ocfs2_extent_list *el,
1599 struct ocfs2_extent_rec *rec, 1601 struct ocfs2_extent_rec *rec,
1600 u32 trunc_start, u32 *trunc_cpos, 1602 u32 trunc_start, u32 *trunc_cpos,
1601 u32 *trunc_len, u32 *trunc_end, 1603 u32 *trunc_len, u32 *trunc_end,
1602 u64 *blkno, int *done) 1604 u64 *blkno, int *done)
1603 { 1605 {
1604 int ret = 0; 1606 int ret = 0;
1605 u32 coff, range; 1607 u32 coff, range;
1606 1608
1607 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 1609 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1608 1610
1609 if (le32_to_cpu(rec->e_cpos) >= trunc_start) { 1611 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1610 /* 1612 /*
1611 * remove an entire extent record. 1613 * remove an entire extent record.
1612 */ 1614 */
1613 *trunc_cpos = le32_to_cpu(rec->e_cpos); 1615 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1614 /* 1616 /*
1615 * Skip holes if any. 1617 * Skip holes if any.
1616 */ 1618 */
1617 if (range < *trunc_end) 1619 if (range < *trunc_end)
1618 *trunc_end = range; 1620 *trunc_end = range;
1619 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); 1621 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1620 *blkno = le64_to_cpu(rec->e_blkno); 1622 *blkno = le64_to_cpu(rec->e_blkno);
1621 *trunc_end = le32_to_cpu(rec->e_cpos); 1623 *trunc_end = le32_to_cpu(rec->e_cpos);
1622 } else if (range > trunc_start) { 1624 } else if (range > trunc_start) {
1623 /* 1625 /*
1624 * remove a partial extent record, which means we're 1626 * remove a partial extent record, which means we're
1625 * removing the last extent record. 1627 * removing the last extent record.
1626 */ 1628 */
1627 *trunc_cpos = trunc_start; 1629 *trunc_cpos = trunc_start;
1628 /* 1630 /*
1629 * skip hole if any. 1631 * skip hole if any.
1630 */ 1632 */
1631 if (range < *trunc_end) 1633 if (range < *trunc_end)
1632 *trunc_end = range; 1634 *trunc_end = range;
1633 *trunc_len = *trunc_end - trunc_start; 1635 *trunc_len = *trunc_end - trunc_start;
1634 coff = trunc_start - le32_to_cpu(rec->e_cpos); 1636 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1635 *blkno = le64_to_cpu(rec->e_blkno) + 1637 *blkno = le64_to_cpu(rec->e_blkno) +
1636 ocfs2_clusters_to_blocks(inode->i_sb, coff); 1638 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1637 *trunc_end = trunc_start; 1639 *trunc_end = trunc_start;
1638 } else { 1640 } else {
1639 /* 1641 /*
1640 * There are two possibilities here: 1642 * There are two possibilities here:
1641 * 1643 *
1642 * - last record has been removed 1644 * - last record has been removed
1643 * - trunc_start was within a hole 1645 * - trunc_start was within a hole
1644 * 1646 *
1645 * Either case means hole punching is complete. 1647 * Either case means hole punching is complete.
1646 */ 1648 */
1647 ret = 1; 1649 ret = 1;
1648 } 1650 }
1649 1651
1650 *done = ret; 1652 *done = ret;
1651 } 1653 }
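
The same three-way case split can be exercised on plain integers. Below is a userspace sketch of ocfs2_calc_trunc_pos()'s logic for one record [e_cpos, e_cpos + e_len), with the extent-list and block-number plumbing stripped away:

	#include <stdint.h>
	#include <stdio.h>

	/* Pure-integer rerun of the three cases above for a record
	 * [e_cpos, e_cpos + e_len) against a punch ending at *trunc_end. */
	static void calc_trunc_pos(uint32_t e_cpos, uint32_t e_len,
				   uint32_t trunc_start, uint32_t *trunc_end,
				   uint32_t *cpos, uint32_t *len, int *done)
	{
		uint32_t range = e_cpos + e_len;

		*done = 0;
		if (e_cpos >= trunc_start) {		/* whole record goes */
			*cpos = e_cpos;
			if (range < *trunc_end)		/* skip trailing hole */
				*trunc_end = range;
			*len = *trunc_end - e_cpos;
			*trunc_end = e_cpos;
		} else if (range > trunc_start) {	/* record tail goes */
			*cpos = trunc_start;
			if (range < *trunc_end)
				*trunc_end = range;
			*len = *trunc_end - trunc_start;
			*trunc_end = trunc_start;
		} else {				/* punching complete */
			*done = 1;
		}
	}

	int main(void)
	{
		uint32_t trunc_end = 10, cpos, len;
		int done;

		calc_trunc_pos(4, 4, 2, &trunc_end, &cpos, &len, &done);
		/* prints cpos=4 len=4 trunc_end=4 done=0 */
		printf("cpos=%u len=%u trunc_end=%u done=%d\n",
		       cpos, len, trunc_end, done);
		return 0;
	}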
1652 1654
1653 static int ocfs2_remove_inode_range(struct inode *inode, 1655 static int ocfs2_remove_inode_range(struct inode *inode,
1654 struct buffer_head *di_bh, u64 byte_start, 1656 struct buffer_head *di_bh, u64 byte_start,
1655 u64 byte_len) 1657 u64 byte_len)
1656 { 1658 {
1657 int ret = 0, flags = 0, done = 0, i; 1659 int ret = 0, flags = 0, done = 0, i;
1658 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; 1660 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1659 u32 cluster_in_el; 1661 u32 cluster_in_el;
1660 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1662 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1661 struct ocfs2_cached_dealloc_ctxt dealloc; 1663 struct ocfs2_cached_dealloc_ctxt dealloc;
1662 struct address_space *mapping = inode->i_mapping; 1664 struct address_space *mapping = inode->i_mapping;
1663 struct ocfs2_extent_tree et; 1665 struct ocfs2_extent_tree et;
1664 struct ocfs2_path *path = NULL; 1666 struct ocfs2_path *path = NULL;
1665 struct ocfs2_extent_list *el = NULL; 1667 struct ocfs2_extent_list *el = NULL;
1666 struct ocfs2_extent_rec *rec = NULL; 1668 struct ocfs2_extent_rec *rec = NULL;
1667 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1669 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1668 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); 1670 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1669 1671
1670 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1672 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1671 ocfs2_init_dealloc_ctxt(&dealloc); 1673 ocfs2_init_dealloc_ctxt(&dealloc);
1672 1674
1673 trace_ocfs2_remove_inode_range( 1675 trace_ocfs2_remove_inode_range(
1674 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1676 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1675 (unsigned long long)byte_start, 1677 (unsigned long long)byte_start,
1676 (unsigned long long)byte_len); 1678 (unsigned long long)byte_len);
1677 1679
1678 if (byte_len == 0) 1680 if (byte_len == 0)
1679 return 0; 1681 return 0;
1680 1682
1681 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1683 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1682 ret = ocfs2_truncate_inline(inode, di_bh, byte_start, 1684 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1683 byte_start + byte_len, 0); 1685 byte_start + byte_len, 0);
1684 if (ret) { 1686 if (ret) {
1685 mlog_errno(ret); 1687 mlog_errno(ret);
1686 goto out; 1688 goto out;
1687 } 1689 }
1688 /* 1690 /*
1689 * There's no need to get fancy with the page cache 1691 * There's no need to get fancy with the page cache
1690 * truncate of an inline-data inode. We're talking 1692 * truncate of an inline-data inode. We're talking
1691 * about less than a page here, which will be cached 1693 * about less than a page here, which will be cached
1692 * in the dinode buffer anyway. 1694 * in the dinode buffer anyway.
1693 */ 1695 */
1694 unmap_mapping_range(mapping, 0, 0, 0); 1696 unmap_mapping_range(mapping, 0, 0, 0);
1695 truncate_inode_pages(mapping, 0); 1697 truncate_inode_pages(mapping, 0);
1696 goto out; 1698 goto out;
1697 } 1699 }
1698 1700
1699 /* 1701 /*
1700 * For reflinks, we may need to CoW two clusters which might be 1702 * For reflinks, we may need to CoW two clusters which might be
1701 * partially zeroed later, if the hole's start and end offsets fall 1703 * partially zeroed later, if the hole's start and end offsets fall
1702 * within one cluster (i.e. are not aligned to the cluster size). 1704 * within one cluster (i.e. are not aligned to the cluster size).
1703 */ 1705 */
1704 1706
1705 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { 1707 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1706 1708
1707 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); 1709 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1708 if (ret) { 1710 if (ret) {
1709 mlog_errno(ret); 1711 mlog_errno(ret);
1710 goto out; 1712 goto out;
1711 } 1713 }
1712 1714
1713 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); 1715 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1714 if (ret) { 1716 if (ret) {
1715 mlog_errno(ret); 1717 mlog_errno(ret);
1716 goto out; 1718 goto out;
1717 } 1719 }
1718 } 1720 }
1719 1721
1720 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1722 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1721 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; 1723 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1722 cluster_in_el = trunc_end; 1724 cluster_in_el = trunc_end;
1723 1725
1724 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1726 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1725 if (ret) { 1727 if (ret) {
1726 mlog_errno(ret); 1728 mlog_errno(ret);
1727 goto out; 1729 goto out;
1728 } 1730 }
1729 1731
1730 path = ocfs2_new_path_from_et(&et); 1732 path = ocfs2_new_path_from_et(&et);
1731 if (!path) { 1733 if (!path) {
1732 ret = -ENOMEM; 1734 ret = -ENOMEM;
1733 mlog_errno(ret); 1735 mlog_errno(ret);
1734 goto out; 1736 goto out;
1735 } 1737 }
1736 1738
1737 while (trunc_end > trunc_start) { 1739 while (trunc_end > trunc_start) {
1738 1740
1739 ret = ocfs2_find_path(INODE_CACHE(inode), path, 1741 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1740 cluster_in_el); 1742 cluster_in_el);
1741 if (ret) { 1743 if (ret) {
1742 mlog_errno(ret); 1744 mlog_errno(ret);
1743 goto out; 1745 goto out;
1744 } 1746 }
1745 1747
1746 el = path_leaf_el(path); 1748 el = path_leaf_el(path);
1747 1749
1748 i = ocfs2_find_rec(el, trunc_end); 1750 i = ocfs2_find_rec(el, trunc_end);
1749 /* 1751 /*
1750 * Need to go to previous extent block. 1752 * Need to go to previous extent block.
1751 */ 1753 */
1752 if (i < 0) { 1754 if (i < 0) {
1753 if (path->p_tree_depth == 0) 1755 if (path->p_tree_depth == 0)
1754 break; 1756 break;
1755 1757
1756 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 1758 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1757 path, 1759 path,
1758 &cluster_in_el); 1760 &cluster_in_el);
1759 if (ret) { 1761 if (ret) {
1760 mlog_errno(ret); 1762 mlog_errno(ret);
1761 goto out; 1763 goto out;
1762 } 1764 }
1763 1765
1764 /* 1766 /*
1765 * We've reached the leftmost extent block; 1767 * We've reached the leftmost extent block;
1766 * it's safe to leave. 1768 * it's safe to leave.
1767 */ 1769 */
1768 if (cluster_in_el == 0) 1770 if (cluster_in_el == 0)
1769 break; 1771 break;
1770 1772
1771 /* 1773 /*
1772 * The 'pos' used to search for the previous extent block is 1774 * The 'pos' used to search for the previous extent block is
1773 * always one cluster less than the actual trunc_end. 1775 * always one cluster less than the actual trunc_end.
1774 */ 1776 */
1775 trunc_end = cluster_in_el + 1; 1777 trunc_end = cluster_in_el + 1;
1776 1778
1777 ocfs2_reinit_path(path, 1); 1779 ocfs2_reinit_path(path, 1);
1778 1780
1779 continue; 1781 continue;
1780 1782
1781 } else 1783 } else
1782 rec = &el->l_recs[i]; 1784 rec = &el->l_recs[i];
1783 1785
1784 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, 1786 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1785 &trunc_len, &trunc_end, &blkno, &done); 1787 &trunc_len, &trunc_end, &blkno, &done);
1786 if (done) 1788 if (done)
1787 break; 1789 break;
1788 1790
1789 flags = rec->e_flags; 1791 flags = rec->e_flags;
1790 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 1792 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1791 1793
1792 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, 1794 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1793 phys_cpos, trunc_len, flags, 1795 phys_cpos, trunc_len, flags,
1794 &dealloc, refcount_loc); 1796 &dealloc, refcount_loc);
1795 if (ret < 0) { 1797 if (ret < 0) {
1796 mlog_errno(ret); 1798 mlog_errno(ret);
1797 goto out; 1799 goto out;
1798 } 1800 }
1799 1801
1800 cluster_in_el = trunc_end; 1802 cluster_in_el = trunc_end;
1801 1803
1802 ocfs2_reinit_path(path, 1); 1804 ocfs2_reinit_path(path, 1);
1803 } 1805 }
1804 1806
1805 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1807 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1806 1808
1807 out: 1809 out:
1808 ocfs2_schedule_truncate_log_flush(osb, 1); 1810 ocfs2_schedule_truncate_log_flush(osb, 1);
1809 ocfs2_run_deallocs(osb, &dealloc); 1811 ocfs2_run_deallocs(osb, &dealloc);
1810 1812
1811 return ret; 1813 return ret;
1812 } 1814 }
1813 1815
1814 /* 1816 /*
1815 * Parts of this function taken from xfs_change_file_space() 1817 * Parts of this function taken from xfs_change_file_space()
1816 */ 1818 */
1817 static int __ocfs2_change_file_space(struct file *file, struct inode *inode, 1819 static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1818 loff_t f_pos, unsigned int cmd, 1820 loff_t f_pos, unsigned int cmd,
1819 struct ocfs2_space_resv *sr, 1821 struct ocfs2_space_resv *sr,
1820 int change_size) 1822 int change_size)
1821 { 1823 {
1822 int ret; 1824 int ret;
1823 s64 llen; 1825 s64 llen;
1824 loff_t size; 1826 loff_t size;
1825 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1827 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1826 struct buffer_head *di_bh = NULL; 1828 struct buffer_head *di_bh = NULL;
1827 handle_t *handle; 1829 handle_t *handle;
1828 unsigned long long max_off = inode->i_sb->s_maxbytes; 1830 unsigned long long max_off = inode->i_sb->s_maxbytes;
1829 1831
1830 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 1832 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1831 return -EROFS; 1833 return -EROFS;
1832 1834
1833 mutex_lock(&inode->i_mutex); 1835 mutex_lock(&inode->i_mutex);
1834 1836
1835 /* 1837 /*
1836 * This prevents concurrent writes on other nodes 1838 * This prevents concurrent writes on other nodes
1837 */ 1839 */
1838 ret = ocfs2_rw_lock(inode, 1); 1840 ret = ocfs2_rw_lock(inode, 1);
1839 if (ret) { 1841 if (ret) {
1840 mlog_errno(ret); 1842 mlog_errno(ret);
1841 goto out; 1843 goto out;
1842 } 1844 }
1843 1845
1844 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1846 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1845 if (ret) { 1847 if (ret) {
1846 mlog_errno(ret); 1848 mlog_errno(ret);
1847 goto out_rw_unlock; 1849 goto out_rw_unlock;
1848 } 1850 }
1849 1851
1850 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1852 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1851 ret = -EPERM; 1853 ret = -EPERM;
1852 goto out_inode_unlock; 1854 goto out_inode_unlock;
1853 } 1855 }
1854 1856
1855 switch (sr->l_whence) { 1857 switch (sr->l_whence) {
1856 case 0: /*SEEK_SET*/ 1858 case 0: /*SEEK_SET*/
1857 break; 1859 break;
1858 case 1: /*SEEK_CUR*/ 1860 case 1: /*SEEK_CUR*/
1859 sr->l_start += f_pos; 1861 sr->l_start += f_pos;
1860 break; 1862 break;
1861 case 2: /*SEEK_END*/ 1863 case 2: /*SEEK_END*/
1862 sr->l_start += i_size_read(inode); 1864 sr->l_start += i_size_read(inode);
1863 break; 1865 break;
1864 default: 1866 default:
1865 ret = -EINVAL; 1867 ret = -EINVAL;
1866 goto out_inode_unlock; 1868 goto out_inode_unlock;
1867 } 1869 }
1868 sr->l_whence = 0; 1870 sr->l_whence = 0;
1869 1871
1870 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; 1872 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1871 1873
1872 if (sr->l_start < 0 1874 if (sr->l_start < 0
1873 || sr->l_start > max_off 1875 || sr->l_start > max_off
1874 || (sr->l_start + llen) < 0 1876 || (sr->l_start + llen) < 0
1875 || (sr->l_start + llen) > max_off) { 1877 || (sr->l_start + llen) > max_off) {
1876 ret = -EINVAL; 1878 ret = -EINVAL;
1877 goto out_inode_unlock; 1879 goto out_inode_unlock;
1878 } 1880 }
1879 size = sr->l_start + sr->l_len; 1881 size = sr->l_start + sr->l_len;
1880 1882
1881 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1883 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1882 if (sr->l_len <= 0) { 1884 if (sr->l_len <= 0) {
1883 ret = -EINVAL; 1885 ret = -EINVAL;
1884 goto out_inode_unlock; 1886 goto out_inode_unlock;
1885 } 1887 }
1886 } 1888 }
1887 1889
1888 if (file && should_remove_suid(file->f_path.dentry)) { 1890 if (file && should_remove_suid(file->f_path.dentry)) {
1889 ret = __ocfs2_write_remove_suid(inode, di_bh); 1891 ret = __ocfs2_write_remove_suid(inode, di_bh);
1890 if (ret) { 1892 if (ret) {
1891 mlog_errno(ret); 1893 mlog_errno(ret);
1892 goto out_inode_unlock; 1894 goto out_inode_unlock;
1893 } 1895 }
1894 } 1896 }
1895 1897
1896 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1898 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1897 switch (cmd) { 1899 switch (cmd) {
1898 case OCFS2_IOC_RESVSP: 1900 case OCFS2_IOC_RESVSP:
1899 case OCFS2_IOC_RESVSP64: 1901 case OCFS2_IOC_RESVSP64:
1900 /* 1902 /*
1901 * This takes unsigned offsets, but the signed ones we 1903 * This takes unsigned offsets, but the signed ones we
1902 * pass have been checked against overflow above. 1904 * pass have been checked against overflow above.
1903 */ 1905 */
1904 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, 1906 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1905 sr->l_len); 1907 sr->l_len);
1906 break; 1908 break;
1907 case OCFS2_IOC_UNRESVSP: 1909 case OCFS2_IOC_UNRESVSP:
1908 case OCFS2_IOC_UNRESVSP64: 1910 case OCFS2_IOC_UNRESVSP64:
1909 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, 1911 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1910 sr->l_len); 1912 sr->l_len);
1911 break; 1913 break;
1912 default: 1914 default:
1913 ret = -EINVAL; 1915 ret = -EINVAL;
1914 } 1916 }
1915 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1917 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1916 if (ret) { 1918 if (ret) {
1917 mlog_errno(ret); 1919 mlog_errno(ret);
1918 goto out_inode_unlock; 1920 goto out_inode_unlock;
1919 } 1921 }
1920 1922
1921 /* 1923 /*
1922 * We update c/mtime for these changes 1924 * We update c/mtime for these changes
1923 */ 1925 */
1924 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1926 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1925 if (IS_ERR(handle)) { 1927 if (IS_ERR(handle)) {
1926 ret = PTR_ERR(handle); 1928 ret = PTR_ERR(handle);
1927 mlog_errno(ret); 1929 mlog_errno(ret);
1928 goto out_inode_unlock; 1930 goto out_inode_unlock;
1929 } 1931 }
1930 1932
1931 if (change_size && i_size_read(inode) < size) 1933 if (change_size && i_size_read(inode) < size)
1932 i_size_write(inode, size); 1934 i_size_write(inode, size);
1933 1935
1934 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1936 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1935 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 1937 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1936 if (ret < 0) 1938 if (ret < 0)
1937 mlog_errno(ret); 1939 mlog_errno(ret);
1938 1940
1939 ocfs2_commit_trans(osb, handle); 1941 ocfs2_commit_trans(osb, handle);
1940 1942
1941 out_inode_unlock: 1943 out_inode_unlock:
1942 brelse(di_bh); 1944 brelse(di_bh);
1943 ocfs2_inode_unlock(inode, 1); 1945 ocfs2_inode_unlock(inode, 1);
1944 out_rw_unlock: 1946 out_rw_unlock:
1945 ocfs2_rw_unlock(inode, 1); 1947 ocfs2_rw_unlock(inode, 1);
1946 1948
1947 out: 1949 out:
1948 mutex_unlock(&inode->i_mutex); 1950 mutex_unlock(&inode->i_mutex);
1949 return ret; 1951 return ret;
1950 } 1952 }
1951 1953
1952 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1954 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1953 struct ocfs2_space_resv *sr) 1955 struct ocfs2_space_resv *sr)
1954 { 1956 {
1955 struct inode *inode = file->f_path.dentry->d_inode; 1957 struct inode *inode = file->f_path.dentry->d_inode;
1956 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1958 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1957 1959
1958 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1960 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1959 !ocfs2_writes_unwritten_extents(osb)) 1961 !ocfs2_writes_unwritten_extents(osb))
1960 return -ENOTTY; 1962 return -ENOTTY;
1961 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1963 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1962 !ocfs2_sparse_alloc(osb)) 1964 !ocfs2_sparse_alloc(osb))
1963 return -ENOTTY; 1965 return -ENOTTY;
1964 1966
1965 if (!S_ISREG(inode->i_mode)) 1967 if (!S_ISREG(inode->i_mode))
1966 return -EINVAL; 1968 return -EINVAL;
1967 1969
1968 if (!(file->f_mode & FMODE_WRITE)) 1970 if (!(file->f_mode & FMODE_WRITE))
1969 return -EBADF; 1971 return -EBADF;
1970 1972
1971 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1973 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1972 } 1974 }
1973 1975
1974 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, 1976 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1975 loff_t len) 1977 loff_t len)
1976 { 1978 {
1977 struct inode *inode = file->f_path.dentry->d_inode; 1979 struct inode *inode = file->f_path.dentry->d_inode;
1978 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1980 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1979 struct ocfs2_space_resv sr; 1981 struct ocfs2_space_resv sr;
1980 int change_size = 1; 1982 int change_size = 1;
1981 int cmd = OCFS2_IOC_RESVSP64; 1983 int cmd = OCFS2_IOC_RESVSP64;
1982 1984
1983 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 1985 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1984 return -EOPNOTSUPP; 1986 return -EOPNOTSUPP;
1985 if (!ocfs2_writes_unwritten_extents(osb)) 1987 if (!ocfs2_writes_unwritten_extents(osb))
1986 return -EOPNOTSUPP; 1988 return -EOPNOTSUPP;
1987 1989
1988 if (mode & FALLOC_FL_KEEP_SIZE) 1990 if (mode & FALLOC_FL_KEEP_SIZE)
1989 change_size = 0; 1991 change_size = 0;
1990 1992
1991 if (mode & FALLOC_FL_PUNCH_HOLE) 1993 if (mode & FALLOC_FL_PUNCH_HOLE)
1992 cmd = OCFS2_IOC_UNRESVSP64; 1994 cmd = OCFS2_IOC_UNRESVSP64;
1993 1995
1994 sr.l_whence = 0; 1996 sr.l_whence = 0;
1995 sr.l_start = (s64)offset; 1997 sr.l_start = (s64)offset;
1996 sr.l_len = (s64)len; 1998 sr.l_len = (s64)len;
1997 1999
1998 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, 2000 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
1999 change_size); 2001 change_size);
2000 } 2002 }
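
ocfs2_fallocate() is a thin mapping from the VFS fallocate flags onto the pre-existing ocfs2 space reservation commands: FALLOC_FL_KEEP_SIZE suppresses the i_size update (change_size = 0) and FALLOC_FL_PUNCH_HOLE selects OCFS2_IOC_UNRESVSP64 in place of OCFS2_IOC_RESVSP64. A minimal userspace sketch of the two modes as a caller would drive them; the mount point path is hypothetical:

/* Hedged userspace sketch: exercises the two fallocate(2) modes that
 * ocfs2_fallocate() translates into RESVSP64/UNRESVSP64. Linux-only;
 * /mnt/ocfs2/demo is a hypothetical path. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/demo", O_RDWR | O_CREAT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* Preallocate 1 MiB without changing i_size
	 * (change_size = 0 path, cmd = OCFS2_IOC_RESVSP64). */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("preallocate");

	/* Punch a 64 KiB hole at offset 128 KiB
	 * (cmd = OCFS2_IOC_UNRESVSP64). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 << 10, 64 << 10) < 0)
		perror("punch hole");

	close(fd);
	return 0;
}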
2001 2003
2002 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2004 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2003 size_t count) 2005 size_t count)
2004 { 2006 {
2005 int ret = 0; 2007 int ret = 0;
2006 unsigned int extent_flags; 2008 unsigned int extent_flags;
2007 u32 cpos, clusters, extent_len, phys_cpos; 2009 u32 cpos, clusters, extent_len, phys_cpos;
2008 struct super_block *sb = inode->i_sb; 2010 struct super_block *sb = inode->i_sb;
2009 2011
2010 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || 2012 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2011 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || 2013 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
2012 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2014 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2013 return 0; 2015 return 0;
2014 2016
2015 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 2017 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2016 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 2018 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2017 2019
2018 while (clusters) { 2020 while (clusters) {
2019 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 2021 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2020 &extent_flags); 2022 &extent_flags);
2021 if (ret < 0) { 2023 if (ret < 0) {
2022 mlog_errno(ret); 2024 mlog_errno(ret);
2023 goto out; 2025 goto out;
2024 } 2026 }
2025 2027
2026 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { 2028 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2027 ret = 1; 2029 ret = 1;
2028 break; 2030 break;
2029 } 2031 }
2030 2032
2031 if (extent_len > clusters) 2033 if (extent_len > clusters)
2032 extent_len = clusters; 2034 extent_len = clusters;
2033 2035
2034 clusters -= extent_len; 2036 clusters -= extent_len;
2035 cpos += extent_len; 2037 cpos += extent_len;
2036 } 2038 }
2037 out: 2039 out:
2038 return ret; 2040 return ret;
2039 } 2041 }
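
The extent walk above converts the byte range [pos, pos + count) into a cluster range first: cpos is the cluster containing pos, and clusters counts every cluster through the rounded-up end. A self-contained sketch of that arithmetic, assuming a 64 KiB cluster size; clusters_for_bytes() mirrors the round-up that ocfs2_clusters_for_bytes() performs:

/* Hedged sketch of the byte-range -> cluster-range conversion used by
 * ocfs2_check_range_for_refcount(). Assumes 64 KiB clusters. */
#include <stdint.h>
#include <stdio.h>

#define CLUSTERSIZE_BITS 16	/* 64 KiB clusters, an assumption */

static uint32_t clusters_for_bytes(uint64_t bytes)
{
	/* round up to a whole number of clusters */
	return (uint32_t)((bytes + (1ULL << CLUSTERSIZE_BITS) - 1)
			  >> CLUSTERSIZE_BITS);
}

int main(void)
{
	uint64_t pos = 100000, count = 200000;
	uint32_t cpos = (uint32_t)(pos >> CLUSTERSIZE_BITS);
	uint32_t clusters = clusters_for_bytes(pos + count) - cpos;

	/* pos 100000 falls in cluster 1; pos + count = 300000 needs
	 * ceil(300000/65536) = 5 clusters, so the walk covers
	 * clusters 1..4 inclusive (clusters = 4). */
	printf("cpos=%u clusters=%u\n", cpos, clusters);
	return 0;
}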
2040 2042
2041 static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2043 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2042 struct file *file, 2044 struct file *file,
2043 loff_t pos, size_t count, 2045 loff_t pos, size_t count,
2044 int *meta_level) 2046 int *meta_level)
2045 { 2047 {
2046 int ret; 2048 int ret;
2047 struct buffer_head *di_bh = NULL; 2049 struct buffer_head *di_bh = NULL;
2048 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 2050 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2049 u32 clusters = 2051 u32 clusters =
2050 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; 2052 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2051 2053
2052 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2054 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2053 if (ret) { 2055 if (ret) {
2054 mlog_errno(ret); 2056 mlog_errno(ret);
2055 goto out; 2057 goto out;
2056 } 2058 }
2057 2059
2058 *meta_level = 1; 2060 *meta_level = 1;
2059 2061
2060 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); 2062 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2061 if (ret) 2063 if (ret)
2062 mlog_errno(ret); 2064 mlog_errno(ret);
2063 out: 2065 out:
2064 brelse(di_bh); 2066 brelse(di_bh);
2065 return ret; 2067 return ret;
2066 } 2068 }
2067 2069
2068 static int ocfs2_prepare_inode_for_write(struct file *file, 2070 static int ocfs2_prepare_inode_for_write(struct file *file,
2069 loff_t *ppos, 2071 loff_t *ppos,
2070 size_t count, 2072 size_t count,
2071 int appending, 2073 int appending,
2072 int *direct_io, 2074 int *direct_io,
2073 int *has_refcount) 2075 int *has_refcount)
2074 { 2076 {
2075 int ret = 0, meta_level = 0; 2077 int ret = 0, meta_level = 0;
2076 struct dentry *dentry = file->f_path.dentry; 2078 struct dentry *dentry = file->f_path.dentry;
2077 struct inode *inode = dentry->d_inode; 2079 struct inode *inode = dentry->d_inode;
2078 loff_t saved_pos = 0, end; 2080 loff_t saved_pos = 0, end;
2079 2081
2080 /* 2082 /*
2081 * We start with a read level meta lock and only jump to an 2083 * We start with a read level meta lock and only jump to an
2082 * exclusive lock if we need to make modifications here. 2084 * exclusive lock if we need to make modifications here.
2083 */ 2085 */
2084 for(;;) { 2086 for(;;) {
2085 ret = ocfs2_inode_lock(inode, NULL, meta_level); 2087 ret = ocfs2_inode_lock(inode, NULL, meta_level);
2086 if (ret < 0) { 2088 if (ret < 0) {
2087 meta_level = -1; 2089 meta_level = -1;
2088 mlog_errno(ret); 2090 mlog_errno(ret);
2089 goto out; 2091 goto out;
2090 } 2092 }
2091 2093
2092 /* Clear suid / sgid if necessary. We do this here 2094 /* Clear suid / sgid if necessary. We do this here
2093 * instead of later in the write path because 2095 * instead of later in the write path because
2094 * remove_suid() calls ->setattr without any hint that 2096 * remove_suid() calls ->setattr without any hint that
2095 * we may have already done our cluster locking. Since 2097 * we may have already done our cluster locking. Since
2096 * ocfs2_setattr() *must* take cluster locks to 2098 * ocfs2_setattr() *must* take cluster locks to
2097 * proceed, this will lead us to recursively lock the 2099 * proceed, this will lead us to recursively lock the
2098 * inode. There's also the dinode i_size state which 2100 * inode. There's also the dinode i_size state which
2099 * can be lost via setattr during extending writes (we 2101 * can be lost via setattr during extending writes (we
2100 * set inode->i_size at the end of a write). */ 2102 * set inode->i_size at the end of a write). */
2101 if (should_remove_suid(dentry)) { 2103 if (should_remove_suid(dentry)) {
2102 if (meta_level == 0) { 2104 if (meta_level == 0) {
2103 ocfs2_inode_unlock(inode, meta_level); 2105 ocfs2_inode_unlock(inode, meta_level);
2104 meta_level = 1; 2106 meta_level = 1;
2105 continue; 2107 continue;
2106 } 2108 }
2107 2109
2108 ret = ocfs2_write_remove_suid(inode); 2110 ret = ocfs2_write_remove_suid(inode);
2109 if (ret < 0) { 2111 if (ret < 0) {
2110 mlog_errno(ret); 2112 mlog_errno(ret);
2111 goto out_unlock; 2113 goto out_unlock;
2112 } 2114 }
2113 } 2115 }
2114 2116
2115 /* work on a copy of ppos until we're sure that we won't have 2117 /* work on a copy of ppos until we're sure that we won't have
2116 * to recalculate it due to relocking. */ 2118 * to recalculate it due to relocking. */
2117 if (appending) 2119 if (appending)
2118 saved_pos = i_size_read(inode); 2120 saved_pos = i_size_read(inode);
2119 else 2121 else
2120 saved_pos = *ppos; 2122 saved_pos = *ppos;
2121 2123
2122 end = saved_pos + count; 2124 end = saved_pos + count;
2123 2125
2124 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); 2126 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
2125 if (ret == 1) { 2127 if (ret == 1) {
2126 ocfs2_inode_unlock(inode, meta_level); 2128 ocfs2_inode_unlock(inode, meta_level);
2127 meta_level = -1; 2129 meta_level = -1;
2128 2130
2129 ret = ocfs2_prepare_inode_for_refcount(inode, 2131 ret = ocfs2_prepare_inode_for_refcount(inode,
2130 file, 2132 file,
2131 saved_pos, 2133 saved_pos,
2132 count, 2134 count,
2133 &meta_level); 2135 &meta_level);
2134 if (has_refcount) 2136 if (has_refcount)
2135 *has_refcount = 1; 2137 *has_refcount = 1;
2136 if (direct_io) 2138 if (direct_io)
2137 *direct_io = 0; 2139 *direct_io = 0;
2138 } 2140 }
2139 2141
2140 if (ret < 0) { 2142 if (ret < 0) {
2141 mlog_errno(ret); 2143 mlog_errno(ret);
2142 goto out_unlock; 2144 goto out_unlock;
2143 } 2145 }
2144 2146
2145 /* 2147 /*
2146 * Skip the O_DIRECT checks if we don't need 2148 * Skip the O_DIRECT checks if we don't need
2147 * them. 2149 * them.
2148 */ 2150 */
2149 if (!direct_io || !(*direct_io)) 2151 if (!direct_io || !(*direct_io))
2150 break; 2152 break;
2151 2153
2152 /* 2154 /*
2153 * There's no sane way to do direct writes to an inode 2155 * There's no sane way to do direct writes to an inode
2154 * with inline data. 2156 * with inline data.
2155 */ 2157 */
2156 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2158 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2157 *direct_io = 0; 2159 *direct_io = 0;
2158 break; 2160 break;
2159 } 2161 }
2160 2162
2161 /* 2163 /*
2162 * Allowing concurrent direct writes means 2164 * Allowing concurrent direct writes means
2163 * i_size changes wouldn't be synchronized, so 2165 * i_size changes wouldn't be synchronized, so
2164 * one node could wind up truncating another 2166 * one node could wind up truncating another
2165 * node's writes. 2167 * node's writes.
2166 */ 2168 */
2167 if (end > i_size_read(inode)) { 2169 if (end > i_size_read(inode)) {
2168 *direct_io = 0; 2170 *direct_io = 0;
2169 break; 2171 break;
2170 } 2172 }
2171 2173
2172 /* 2174 /*
2173 * We don't fill holes during direct io, so 2175 * We don't fill holes during direct io, so
2174 * check for them here. If any are found, the 2176 * check for them here. If any are found, the
2175 * caller will have to retake some cluster 2177 * caller will have to retake some cluster
2176 * locks and initiate the io as buffered. 2178 * locks and initiate the io as buffered.
2177 */ 2179 */
2178 ret = ocfs2_check_range_for_holes(inode, saved_pos, count); 2180 ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
2179 if (ret == 1) { 2181 if (ret == 1) {
2180 *direct_io = 0; 2182 *direct_io = 0;
2181 ret = 0; 2183 ret = 0;
2182 } else if (ret < 0) 2184 } else if (ret < 0)
2183 mlog_errno(ret); 2185 mlog_errno(ret);
2184 break; 2186 break;
2185 } 2187 }
2186 2188
2187 if (appending) 2189 if (appending)
2188 *ppos = saved_pos; 2190 *ppos = saved_pos;
2189 2191
2190 out_unlock: 2192 out_unlock:
2191 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2193 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2192 saved_pos, appending, count, 2194 saved_pos, appending, count,
2193 direct_io, has_refcount); 2195 direct_io, has_refcount);
2194 2196
2195 if (meta_level >= 0) 2197 if (meta_level >= 0)
2196 ocfs2_inode_unlock(inode, meta_level); 2198 ocfs2_inode_unlock(inode, meta_level);
2197 2199
2198 out: 2200 out:
2199 return ret; 2201 return ret;
2200 } 2202 }
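
ocfs2_prepare_inode_for_write() upgrades its cluster lock by retry: it starts at read level (meta_level 0), and when a modification such as suid clearing turns out to be needed it drops the lock, retakes it exclusive (meta_level 1), and loops to redo every check, since another node may have changed state in the window. A hedged pthread sketch of the same drop-and-retake pattern; needs_modification() is a hypothetical stand-in for should_remove_suid():

/* Hedged sketch of the lock-upgrade-by-retry loop in
 * ocfs2_prepare_inode_for_write(). pthread rwlocks stand in for the
 * cluster lock. Compile with -lpthread. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t meta_lock = PTHREAD_RWLOCK_INITIALIZER;

static int needs_modification(void)
{
	return 1;	/* pretend we must clear suid */
}

static void prepare_for_write(void)
{
	int exclusive = 0;

	for (;;) {
		if (exclusive)
			pthread_rwlock_wrlock(&meta_lock);
		else
			pthread_rwlock_rdlock(&meta_lock);

		if (needs_modification() && !exclusive) {
			/* Can't upgrade in place: drop the shared lock,
			 * retake exclusive, and redo every check, since
			 * someone may have raced in between. */
			pthread_rwlock_unlock(&meta_lock);
			exclusive = 1;
			continue;
		}

		/* ... all checks passed at the right lock level ... */
		break;
	}
	pthread_rwlock_unlock(&meta_lock);
	printf("prepared at %s level\n", exclusive ? "exclusive" : "shared");
}

int main(void)
{
	prepare_for_write();
	return 0;
}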
2201 2203
2202 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 2204 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2203 const struct iovec *iov, 2205 const struct iovec *iov,
2204 unsigned long nr_segs, 2206 unsigned long nr_segs,
2205 loff_t pos) 2207 loff_t pos)
2206 { 2208 {
2207 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 2209 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
2208 int can_do_direct, has_refcount = 0; 2210 int can_do_direct, has_refcount = 0;
2209 ssize_t written = 0; 2211 ssize_t written = 0;
2210 size_t ocount; /* original count */ 2212 size_t ocount; /* original count */
2211 size_t count; /* after file limit checks */ 2213 size_t count; /* after file limit checks */
2212 loff_t old_size, *ppos = &iocb->ki_pos; 2214 loff_t old_size, *ppos = &iocb->ki_pos;
2213 u32 old_clusters; 2215 u32 old_clusters;
2214 struct file *file = iocb->ki_filp; 2216 struct file *file = iocb->ki_filp;
2215 struct inode *inode = file->f_path.dentry->d_inode; 2217 struct inode *inode = file->f_path.dentry->d_inode;
2216 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2218 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2217 int full_coherency = !(osb->s_mount_opt & 2219 int full_coherency = !(osb->s_mount_opt &
2218 OCFS2_MOUNT_COHERENCY_BUFFERED); 2220 OCFS2_MOUNT_COHERENCY_BUFFERED);
2219 2221
2220 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2222 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2221 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2223 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2222 file->f_path.dentry->d_name.len, 2224 file->f_path.dentry->d_name.len,
2223 file->f_path.dentry->d_name.name, 2225 file->f_path.dentry->d_name.name,
2224 (unsigned int)nr_segs); 2226 (unsigned int)nr_segs);
2225 2227
2226 if (iocb->ki_left == 0) 2228 if (iocb->ki_left == 0)
2227 return 0; 2229 return 0;
2228 2230
2229 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2231 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2230 2232
2231 appending = file->f_flags & O_APPEND ? 1 : 0; 2233 appending = file->f_flags & O_APPEND ? 1 : 0;
2232 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 2234 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
2233 2235
2234 mutex_lock(&inode->i_mutex); 2236 mutex_lock(&inode->i_mutex);
2235 2237
2236 ocfs2_iocb_clear_sem_locked(iocb); 2238 ocfs2_iocb_clear_sem_locked(iocb);
2237 2239
2238 relock: 2240 relock:
2239 /* to match setattr's i_mutex -> rw_lock ordering */ 2241 /* to match setattr's i_mutex -> rw_lock ordering */
2240 if (direct_io) { 2242 if (direct_io) {
2241 atomic_inc(&inode->i_dio_count); 2243 atomic_inc(&inode->i_dio_count);
2242 have_alloc_sem = 1; 2244 have_alloc_sem = 1;
2243 /* communicate with ocfs2_dio_end_io */ 2245 /* communicate with ocfs2_dio_end_io */
2244 ocfs2_iocb_set_sem_locked(iocb); 2246 ocfs2_iocb_set_sem_locked(iocb);
2245 } 2247 }
2246 2248
2247 /* 2249 /*
2248 * Concurrent O_DIRECT writes are allowed with 2250 * Concurrent O_DIRECT writes are allowed with
2249 * the mount option "coherency=buffered". 2251 * the mount option "coherency=buffered".
2250 */ 2252 */
2251 rw_level = (!direct_io || full_coherency); 2253 rw_level = (!direct_io || full_coherency);
2252 2254
2253 ret = ocfs2_rw_lock(inode, rw_level); 2255 ret = ocfs2_rw_lock(inode, rw_level);
2254 if (ret < 0) { 2256 if (ret < 0) {
2255 mlog_errno(ret); 2257 mlog_errno(ret);
2256 goto out_sems; 2258 goto out_sems;
2257 } 2259 }
2258 2260
2259 /* 2261 /*
2260 * O_DIRECT writes with "coherency=full" need to take EX cluster 2262 * O_DIRECT writes with "coherency=full" need to take EX cluster
2261 * inode_lock to guarantee coherency. 2263 * inode_lock to guarantee coherency.
2262 */ 2264 */
2263 if (direct_io && full_coherency) { 2265 if (direct_io && full_coherency) {
2264 /* 2266 /*
2265 * We need to take and drop the inode lock to force 2267 * We need to take and drop the inode lock to force
2266 * other nodes to drop their caches. Buffered I/O 2268 * other nodes to drop their caches. Buffered I/O
2267 * already does this in write_begin(). 2269 * already does this in write_begin().
2268 */ 2270 */
2269 ret = ocfs2_inode_lock(inode, NULL, 1); 2271 ret = ocfs2_inode_lock(inode, NULL, 1);
2270 if (ret < 0) { 2272 if (ret < 0) {
2271 mlog_errno(ret); 2273 mlog_errno(ret);
2272 goto out_sems; 2274 goto out_sems;
2273 } 2275 }
2274 2276
2275 ocfs2_inode_unlock(inode, 1); 2277 ocfs2_inode_unlock(inode, 1);
2276 } 2278 }
2277 2279
2278 can_do_direct = direct_io; 2280 can_do_direct = direct_io;
2279 ret = ocfs2_prepare_inode_for_write(file, ppos, 2281 ret = ocfs2_prepare_inode_for_write(file, ppos,
2280 iocb->ki_left, appending, 2282 iocb->ki_left, appending,
2281 &can_do_direct, &has_refcount); 2283 &can_do_direct, &has_refcount);
2282 if (ret < 0) { 2284 if (ret < 0) {
2283 mlog_errno(ret); 2285 mlog_errno(ret);
2284 goto out; 2286 goto out;
2285 } 2287 }
2286 2288
2287 /* 2289 /*
2288 * We can't complete the direct I/O as requested, fall back to 2290 * We can't complete the direct I/O as requested, fall back to
2289 * buffered I/O. 2291 * buffered I/O.
2290 */ 2292 */
2291 if (direct_io && !can_do_direct) { 2293 if (direct_io && !can_do_direct) {
2292 ocfs2_rw_unlock(inode, rw_level); 2294 ocfs2_rw_unlock(inode, rw_level);
2293 inode_dio_done(inode); 2295 inode_dio_done(inode);
2294 2296
2295 have_alloc_sem = 0; 2297 have_alloc_sem = 0;
2296 rw_level = -1; 2298 rw_level = -1;
2297 2299
2298 direct_io = 0; 2300 direct_io = 0;
2299 goto relock; 2301 goto relock;
2300 } 2302 }
2301 2303
2302 /* 2304 /*
2303 * To later detect whether a journal commit for sync writes is 2305 * To later detect whether a journal commit for sync writes is
2304 * necessary, we sample i_size and cluster count here. 2306 * necessary, we sample i_size and cluster count here.
2305 */ 2307 */
2306 old_size = i_size_read(inode); 2308 old_size = i_size_read(inode);
2307 old_clusters = OCFS2_I(inode)->ip_clusters; 2309 old_clusters = OCFS2_I(inode)->ip_clusters;
2308 2310
2309 /* communicate with ocfs2_dio_end_io */ 2311 /* communicate with ocfs2_dio_end_io */
2310 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2312 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2311 2313
2312 ret = generic_segment_checks(iov, &nr_segs, &ocount, 2314 ret = generic_segment_checks(iov, &nr_segs, &ocount,
2313 VERIFY_READ); 2315 VERIFY_READ);
2314 if (ret) 2316 if (ret)
2315 goto out_dio; 2317 goto out_dio;
2316 2318
2317 count = ocount; 2319 count = ocount;
2318 ret = generic_write_checks(file, ppos, &count, 2320 ret = generic_write_checks(file, ppos, &count,
2319 S_ISBLK(inode->i_mode)); 2321 S_ISBLK(inode->i_mode));
2320 if (ret) 2322 if (ret)
2321 goto out_dio; 2323 goto out_dio;
2322 2324
2323 if (direct_io) { 2325 if (direct_io) {
2324 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2326 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2325 ppos, count, ocount); 2327 ppos, count, ocount);
2326 if (written < 0) { 2328 if (written < 0) {
2327 ret = written; 2329 ret = written;
2328 goto out_dio; 2330 goto out_dio;
2329 } 2331 }
2330 } else { 2332 } else {
2331 current->backing_dev_info = file->f_mapping->backing_dev_info; 2333 current->backing_dev_info = file->f_mapping->backing_dev_info;
2332 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, 2334 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2333 ppos, count, 0); 2335 ppos, count, 0);
2334 current->backing_dev_info = NULL; 2336 current->backing_dev_info = NULL;
2335 } 2337 }
2336 2338
2337 out_dio: 2339 out_dio:
2338 /* buffered aio wouldn't have proper lock coverage today */ 2340 /* buffered aio wouldn't have proper lock coverage today */
2339 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2341 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2340 2342
2341 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2343 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2342 ((file->f_flags & O_DIRECT) && !direct_io)) { 2344 ((file->f_flags & O_DIRECT) && !direct_io)) {
2343 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2345 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2344 pos + count - 1); 2346 pos + count - 1);
2345 if (ret < 0) 2347 if (ret < 0)
2346 written = ret; 2348 written = ret;
2347 2349
2348 if (!ret && ((old_size != i_size_read(inode)) || 2350 if (!ret && ((old_size != i_size_read(inode)) ||
2349 (old_clusters != OCFS2_I(inode)->ip_clusters) || 2351 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2350 has_refcount)) { 2352 has_refcount)) {
2351 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2353 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2352 if (ret < 0) 2354 if (ret < 0)
2353 written = ret; 2355 written = ret;
2354 } 2356 }
2355 2357
2356 if (!ret) 2358 if (!ret)
2357 ret = filemap_fdatawait_range(file->f_mapping, pos, 2359 ret = filemap_fdatawait_range(file->f_mapping, pos,
2358 pos + count - 1); 2360 pos + count - 1);
2359 } 2361 }
2360 2362
2361 /* 2363 /*
2362 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io 2364 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2363 * function pointer which is called when o_direct io completes so that 2365 * function pointer which is called when o_direct io completes so that
2364 * it can unlock our rw lock. 2366 * it can unlock our rw lock.
2365 * Unfortunately there are error cases which call end_io and others 2367 * Unfortunately there are error cases which call end_io and others
2366 * that don't. So we don't have to unlock the rw_lock if either an 2368 * that don't. So we don't have to unlock the rw_lock if either an
2367 * async dio is going to do it in the future or an end_io after an 2369 * async dio is going to do it in the future or an end_io after an
2368 * error has already done it. 2370 * error has already done it.
2369 */ 2371 */
2370 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2372 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2371 rw_level = -1; 2373 rw_level = -1;
2372 have_alloc_sem = 0; 2374 have_alloc_sem = 0;
2373 } 2375 }
2374 2376
2375 out: 2377 out:
2376 if (rw_level != -1) 2378 if (rw_level != -1)
2377 ocfs2_rw_unlock(inode, rw_level); 2379 ocfs2_rw_unlock(inode, rw_level);
2378 2380
2379 out_sems: 2381 out_sems:
2380 if (have_alloc_sem) { 2382 if (have_alloc_sem) {
2381 inode_dio_done(inode); 2383 inode_dio_done(inode);
2382 ocfs2_iocb_clear_sem_locked(iocb); 2384 ocfs2_iocb_clear_sem_locked(iocb);
2383 } 2385 }
2384 2386
2385 mutex_unlock(&inode->i_mutex); 2387 mutex_unlock(&inode->i_mutex);
2386 2388
2387 if (written) 2389 if (written)
2388 ret = written; 2390 ret = written;
2389 return ret; 2391 return ret;
2390 } 2392 }
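
The direct-I/O reference counting in this path is the point of the series: the write path bumps inode->i_dio_count itself, under its own cluster locks, and drops it with inode_dio_done(), so a later inode_dio_wait() in ->setattr blocks until all direct I/O drains. A hedged C11 sketch of just the counter protocol; the real kernel sleeps on a wait-queue bit (__I_DIO_WAKEUP) where this sketch busy-waits:

/* Hedged sketch of the i_dio_count reference protocol. Illustrative
 * only; compile with -std=c11. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int i_dio_count;

static void dio_begin(void)      { atomic_fetch_add(&i_dio_count, 1); }
static void inode_dio_done(void) { atomic_fetch_sub(&i_dio_count, 1); }

static void inode_dio_wait(void)
{
	/* A busy-wait keeps the sketch self-contained; the kernel
	 * sleeps until the count reaches zero. */
	while (atomic_load(&i_dio_count) > 0)
		;
}

int main(void)
{
	dio_begin();		/* O_DIRECT write in flight */
	inode_dio_done();	/* completion, e.g. ocfs2_dio_end_io */
	inode_dio_wait();	/* e.g. ->setattr before truncating */
	printf("no direct I/O pending\n");
	return 0;
}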
2391 2393
2392 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 2394 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2393 struct file *out, 2395 struct file *out,
2394 struct splice_desc *sd) 2396 struct splice_desc *sd)
2395 { 2397 {
2396 int ret; 2398 int ret;
2397 2399
2398 ret = ocfs2_prepare_inode_for_write(out, &sd->pos, 2400 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2399 sd->total_len, 0, NULL, NULL); 2401 sd->total_len, 0, NULL, NULL);
2400 if (ret < 0) { 2402 if (ret < 0) {
2401 mlog_errno(ret); 2403 mlog_errno(ret);
2402 return ret; 2404 return ret;
2403 } 2405 }
2404 2406
2405 return splice_from_pipe_feed(pipe, sd, pipe_to_file); 2407 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
2406 } 2408 }
2407 2409
2408 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2410 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2409 struct file *out, 2411 struct file *out,
2410 loff_t *ppos, 2412 loff_t *ppos,
2411 size_t len, 2413 size_t len,
2412 unsigned int flags) 2414 unsigned int flags)
2413 { 2415 {
2414 int ret; 2416 int ret;
2415 struct address_space *mapping = out->f_mapping; 2417 struct address_space *mapping = out->f_mapping;
2416 struct inode *inode = mapping->host; 2418 struct inode *inode = mapping->host;
2417 struct splice_desc sd = { 2419 struct splice_desc sd = {
2418 .total_len = len, 2420 .total_len = len,
2419 .flags = flags, 2421 .flags = flags,
2420 .pos = *ppos, 2422 .pos = *ppos,
2421 .u.file = out, 2423 .u.file = out,
2422 }; 2424 };
2423 2425
2424 2426
2425 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, 2427 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
2426 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2428 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2427 out->f_path.dentry->d_name.len, 2429 out->f_path.dentry->d_name.len,
2428 out->f_path.dentry->d_name.name, len); 2430 out->f_path.dentry->d_name.name, len);
2429 2431
2430 if (pipe->inode) 2432 if (pipe->inode)
2431 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 2433 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
2432 2434
2433 splice_from_pipe_begin(&sd); 2435 splice_from_pipe_begin(&sd);
2434 do { 2436 do {
2435 ret = splice_from_pipe_next(pipe, &sd); 2437 ret = splice_from_pipe_next(pipe, &sd);
2436 if (ret <= 0) 2438 if (ret <= 0)
2437 break; 2439 break;
2438 2440
2439 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2441 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2440 ret = ocfs2_rw_lock(inode, 1); 2442 ret = ocfs2_rw_lock(inode, 1);
2441 if (ret < 0) 2443 if (ret < 0)
2442 mlog_errno(ret); 2444 mlog_errno(ret);
2443 else { 2445 else {
2444 ret = ocfs2_splice_to_file(pipe, out, &sd); 2446 ret = ocfs2_splice_to_file(pipe, out, &sd);
2445 ocfs2_rw_unlock(inode, 1); 2447 ocfs2_rw_unlock(inode, 1);
2446 } 2448 }
2447 mutex_unlock(&inode->i_mutex); 2449 mutex_unlock(&inode->i_mutex);
2448 } while (ret > 0); 2450 } while (ret > 0);
2449 splice_from_pipe_end(pipe, &sd); 2451 splice_from_pipe_end(pipe, &sd);
2450 2452
2451 if (pipe->inode) 2453 if (pipe->inode)
2452 mutex_unlock(&pipe->inode->i_mutex); 2454 mutex_unlock(&pipe->inode->i_mutex);
2453 2455
2454 if (sd.num_spliced) 2456 if (sd.num_spliced)
2455 ret = sd.num_spliced; 2457 ret = sd.num_spliced;
2456 2458
2457 if (ret > 0) { 2459 if (ret > 0) {
2458 unsigned long nr_pages; 2460 unsigned long nr_pages;
2459 int err; 2461 int err;
2460 2462
2461 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2463 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2462 2464
2463 err = generic_write_sync(out, *ppos, ret); 2465 err = generic_write_sync(out, *ppos, ret);
2464 if (err) 2466 if (err)
2465 ret = err; 2467 ret = err;
2466 else 2468 else
2467 *ppos += ret; 2469 *ppos += ret;
2468 2470
2469 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2471 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2470 } 2472 }
2471 2473
2472 return ret; 2474 return ret;
2473 } 2475 }
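
After a successful splice the byte count is converted to a page count with a round-up shift before throttling via balance_dirty_pages_ratelimited_nr(). A small worked sketch of that conversion, assuming 4 KiB pages:

/* Hedged sketch of the bytes -> pages round-up used above. */
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* 4 KiB pages, an assumption */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	long ret = 5000;	/* bytes spliced */
	unsigned long nr_pages =
		(ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	/* 5000 bytes touch two 4096-byte pages: ceil(5000/4096) = 2. */
	printf("%lu page(s) dirtied\n", nr_pages);
	return 0;
}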
2474 2476
2475 static ssize_t ocfs2_file_splice_read(struct file *in, 2477 static ssize_t ocfs2_file_splice_read(struct file *in,
2476 loff_t *ppos, 2478 loff_t *ppos,
2477 struct pipe_inode_info *pipe, 2479 struct pipe_inode_info *pipe,
2478 size_t len, 2480 size_t len,
2479 unsigned int flags) 2481 unsigned int flags)
2480 { 2482 {
2481 int ret = 0, lock_level = 0; 2483 int ret = 0, lock_level = 0;
2482 struct inode *inode = in->f_path.dentry->d_inode; 2484 struct inode *inode = in->f_path.dentry->d_inode;
2483 2485
2484 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, 2486 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2485 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2487 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2486 in->f_path.dentry->d_name.len, 2488 in->f_path.dentry->d_name.len,
2487 in->f_path.dentry->d_name.name, len); 2489 in->f_path.dentry->d_name.name, len);
2488 2490
2489 /* 2491 /*
2490 * See the comment in ocfs2_file_aio_read() 2492 * See the comment in ocfs2_file_aio_read()
2491 */ 2493 */
2492 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); 2494 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2493 if (ret < 0) { 2495 if (ret < 0) {
2494 mlog_errno(ret); 2496 mlog_errno(ret);
2495 goto bail; 2497 goto bail;
2496 } 2498 }
2497 ocfs2_inode_unlock(inode, lock_level); 2499 ocfs2_inode_unlock(inode, lock_level);
2498 2500
2499 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2501 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2500 2502
2501 bail: 2503 bail:
2502 return ret; 2504 return ret;
2503 } 2505 }
2504 2506
2505 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 2507 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2506 const struct iovec *iov, 2508 const struct iovec *iov,
2507 unsigned long nr_segs, 2509 unsigned long nr_segs,
2508 loff_t pos) 2510 loff_t pos)
2509 { 2511 {
2510 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2512 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
2511 struct file *filp = iocb->ki_filp; 2513 struct file *filp = iocb->ki_filp;
2512 struct inode *inode = filp->f_path.dentry->d_inode; 2514 struct inode *inode = filp->f_path.dentry->d_inode;
2513 2515
2514 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, 2516 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
2515 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2517 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2516 filp->f_path.dentry->d_name.len, 2518 filp->f_path.dentry->d_name.len,
2517 filp->f_path.dentry->d_name.name, nr_segs); 2519 filp->f_path.dentry->d_name.name, nr_segs);
2518 2520
2519 2521
2520 if (!inode) { 2522 if (!inode) {
2521 ret = -EINVAL; 2523 ret = -EINVAL;
2522 mlog_errno(ret); 2524 mlog_errno(ret);
2523 goto bail; 2525 goto bail;
2524 } 2526 }
2525 2527
2526 ocfs2_iocb_clear_sem_locked(iocb); 2528 ocfs2_iocb_clear_sem_locked(iocb);
2527 2529
2528 /* 2530 /*
2529 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2531 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2530 * need locks to protect pending reads from racing with truncate. 2532 * need locks to protect pending reads from racing with truncate.
2531 */ 2533 */
2532 if (filp->f_flags & O_DIRECT) { 2534 if (filp->f_flags & O_DIRECT) {
2533 have_alloc_sem = 1; 2535 have_alloc_sem = 1;
2534 atomic_inc(&inode->i_dio_count); 2536 atomic_inc(&inode->i_dio_count);
2535 ocfs2_iocb_set_sem_locked(iocb); 2537 ocfs2_iocb_set_sem_locked(iocb);
2536 2538
2537 ret = ocfs2_rw_lock(inode, 0); 2539 ret = ocfs2_rw_lock(inode, 0);
2538 if (ret < 0) { 2540 if (ret < 0) {
2539 mlog_errno(ret); 2541 mlog_errno(ret);
2540 goto bail; 2542 goto bail;
2541 } 2543 }
2542 rw_level = 0; 2544 rw_level = 0;
2543 /* communicate with ocfs2_dio_end_io */ 2545 /* communicate with ocfs2_dio_end_io */
2544 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2546 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2545 } 2547 }
2546 2548
2547 /* 2549 /*
2548 * We're fine letting folks race truncates and extending 2550 * We're fine letting folks race truncates and extending
2549 * writes with read across the cluster, just like they can 2551 * writes with read across the cluster, just like they can
2550 * locally. Hence no rw_lock during read. 2552 * locally. Hence no rw_lock during read.
2551 * 2553 *
2552 * Take and drop the meta data lock to update inode fields 2554 * Take and drop the meta data lock to update inode fields
2553 * like i_size. This allows the checks down below 2555 * like i_size. This allows the checks down below
2554 * generic_file_aio_read() a chance of actually working. 2556 * generic_file_aio_read() a chance of actually working.
2555 */ 2557 */
2556 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2558 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2557 if (ret < 0) { 2559 if (ret < 0) {
2558 mlog_errno(ret); 2560 mlog_errno(ret);
2559 goto bail; 2561 goto bail;
2560 } 2562 }
2561 ocfs2_inode_unlock(inode, lock_level); 2563 ocfs2_inode_unlock(inode, lock_level);
2562 2564
2563 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2565 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2564 trace_generic_file_aio_read_ret(ret); 2566 trace_generic_file_aio_read_ret(ret);
2565 2567
2566 /* buffered aio wouldn't have proper lock coverage today */ 2568 /* buffered aio wouldn't have proper lock coverage today */
2567 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2569 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
2568 2570
2569 /* see ocfs2_file_aio_write */ 2571 /* see ocfs2_file_aio_write */
2570 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2572 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2571 rw_level = -1; 2573 rw_level = -1;
2572 have_alloc_sem = 0; 2574 have_alloc_sem = 0;
2573 } 2575 }
2574 2576
2575 bail: 2577 bail:
2576 if (have_alloc_sem) { 2578 if (have_alloc_sem) {
2577 inode_dio_done(inode); 2579 inode_dio_done(inode);
2578 ocfs2_iocb_clear_sem_locked(iocb); 2580 ocfs2_iocb_clear_sem_locked(iocb);
2579 } 2581 }
2580 if (rw_level != -1) 2582 if (rw_level != -1)
2581 ocfs2_rw_unlock(inode, rw_level); 2583 ocfs2_rw_unlock(inode, rw_level);
2582 2584
2583 return ret; 2585 return ret;
2584 } 2586 }
2585 2587
2586 const struct inode_operations ocfs2_file_iops = { 2588 const struct inode_operations ocfs2_file_iops = {
2587 .setattr = ocfs2_setattr, 2589 .setattr = ocfs2_setattr,
2588 .getattr = ocfs2_getattr, 2590 .getattr = ocfs2_getattr,
2589 .permission = ocfs2_permission, 2591 .permission = ocfs2_permission,
2590 .setxattr = generic_setxattr, 2592 .setxattr = generic_setxattr,
2591 .getxattr = generic_getxattr, 2593 .getxattr = generic_getxattr,
2592 .listxattr = ocfs2_listxattr, 2594 .listxattr = ocfs2_listxattr,
2593 .removexattr = generic_removexattr, 2595 .removexattr = generic_removexattr,
2594 .fiemap = ocfs2_fiemap, 2596 .fiemap = ocfs2_fiemap,
2595 .check_acl = ocfs2_check_acl, 2597 .check_acl = ocfs2_check_acl,
2596 }; 2598 };
2597 2599
2598 const struct inode_operations ocfs2_special_file_iops = { 2600 const struct inode_operations ocfs2_special_file_iops = {
2599 .setattr = ocfs2_setattr, 2601 .setattr = ocfs2_setattr,
2600 .getattr = ocfs2_getattr, 2602 .getattr = ocfs2_getattr,
2601 .permission = ocfs2_permission, 2603 .permission = ocfs2_permission,
2602 .check_acl = ocfs2_check_acl, 2604 .check_acl = ocfs2_check_acl,
2603 }; 2605 };
2604 2606
2605 /* 2607 /*
2606 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with 2608 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2607 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2609 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2608 */ 2610 */
2609 const struct file_operations ocfs2_fops = { 2611 const struct file_operations ocfs2_fops = {
2610 .llseek = generic_file_llseek, 2612 .llseek = generic_file_llseek,
2611 .read = do_sync_read, 2613 .read = do_sync_read,
2612 .write = do_sync_write, 2614 .write = do_sync_write,
2613 .mmap = ocfs2_mmap, 2615 .mmap = ocfs2_mmap,
2614 .fsync = ocfs2_sync_file, 2616 .fsync = ocfs2_sync_file,
2615 .release = ocfs2_file_release, 2617 .release = ocfs2_file_release,
2616 .open = ocfs2_file_open, 2618 .open = ocfs2_file_open,
2617 .aio_read = ocfs2_file_aio_read, 2619 .aio_read = ocfs2_file_aio_read,
2618 .aio_write = ocfs2_file_aio_write, 2620 .aio_write = ocfs2_file_aio_write,
2619 .unlocked_ioctl = ocfs2_ioctl, 2621 .unlocked_ioctl = ocfs2_ioctl,
2620 #ifdef CONFIG_COMPAT 2622 #ifdef CONFIG_COMPAT
2621 .compat_ioctl = ocfs2_compat_ioctl, 2623 .compat_ioctl = ocfs2_compat_ioctl,
2622 #endif 2624 #endif
2623 .lock = ocfs2_lock, 2625 .lock = ocfs2_lock,
2624 .flock = ocfs2_flock, 2626 .flock = ocfs2_flock,
2625 .splice_read = ocfs2_file_splice_read, 2627 .splice_read = ocfs2_file_splice_read,
2626 .splice_write = ocfs2_file_splice_write, 2628 .splice_write = ocfs2_file_splice_write,
2627 .fallocate = ocfs2_fallocate, 2629 .fallocate = ocfs2_fallocate,
2628 }; 2630 };
2629 2631
2630 const struct file_operations ocfs2_dops = { 2632 const struct file_operations ocfs2_dops = {
2631 .llseek = generic_file_llseek, 2633 .llseek = generic_file_llseek,
2632 .read = generic_read_dir, 2634 .read = generic_read_dir,
2633 .readdir = ocfs2_readdir, 2635 .readdir = ocfs2_readdir,
2634 .fsync = ocfs2_sync_file, 2636 .fsync = ocfs2_sync_file,
2635 .release = ocfs2_dir_release, 2637 .release = ocfs2_dir_release,
2636 .open = ocfs2_dir_open, 2638 .open = ocfs2_dir_open,
2637 .unlocked_ioctl = ocfs2_ioctl, 2639 .unlocked_ioctl = ocfs2_ioctl,
2638 #ifdef CONFIG_COMPAT 2640 #ifdef CONFIG_COMPAT
2639 .compat_ioctl = ocfs2_compat_ioctl, 2641 .compat_ioctl = ocfs2_compat_ioctl,
2640 #endif 2642 #endif
2641 .lock = ocfs2_lock, 2643 .lock = ocfs2_lock,
2642 .flock = ocfs2_flock, 2644 .flock = ocfs2_flock,
2643 }; 2645 };
2644 2646
2645 /* 2647 /*
2646 * POSIX-lockless variants of our file_operations. 2648 * POSIX-lockless variants of our file_operations.
2647 * 2649 *
2648 * These will be used if the underlying cluster stack does not support 2650 * These will be used if the underlying cluster stack does not support
2649 * posix file locking, if the user passes the "localflocks" mount 2651 * posix file locking, if the user passes the "localflocks" mount
2650 * option, or if we have a local-only fs. 2652 * option, or if we have a local-only fs.
2651 * 2653 *
2652 * ocfs2_flock is in here because all stacks handle UNIX file locks, 2654 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2653 * so we still want it in the case of no stack support for 2655 * so we still want it in the case of no stack support for
2654 * plocks. Internally, it will do the right thing when asked to ignore 2656 * plocks. Internally, it will do the right thing when asked to ignore
2655 * the cluster. 2657 * the cluster.
2656 */ 2658 */
2657 const struct file_operations ocfs2_fops_no_plocks = { 2659 const struct file_operations ocfs2_fops_no_plocks = {
2658 .llseek = generic_file_llseek, 2660 .llseek = generic_file_llseek,
2659 .read = do_sync_read, 2661 .read = do_sync_read,
2660 .write = do_sync_write, 2662 .write = do_sync_write,
2661 .mmap = ocfs2_mmap, 2663 .mmap = ocfs2_mmap,
2662 .fsync = ocfs2_sync_file, 2664 .fsync = ocfs2_sync_file,
2663 .release = ocfs2_file_release, 2665 .release = ocfs2_file_release,
2664 .open = ocfs2_file_open, 2666 .open = ocfs2_file_open,
2665 .aio_read = ocfs2_file_aio_read, 2667 .aio_read = ocfs2_file_aio_read,
2666 .aio_write = ocfs2_file_aio_write, 2668 .aio_write = ocfs2_file_aio_write,
2667 .unlocked_ioctl = ocfs2_ioctl, 2669 .unlocked_ioctl = ocfs2_ioctl,
2668 #ifdef CONFIG_COMPAT 2670 #ifdef CONFIG_COMPAT
2669 .compat_ioctl = ocfs2_compat_ioctl, 2671 .compat_ioctl = ocfs2_compat_ioctl,
2670 #endif 2672 #endif
2671 .flock = ocfs2_flock, 2673 .flock = ocfs2_flock,
2672 .splice_read = ocfs2_file_splice_read, 2674 .splice_read = ocfs2_file_splice_read,
2673 .splice_write = ocfs2_file_splice_write, 2675 .splice_write = ocfs2_file_splice_write,
2674 .fallocate = ocfs2_fallocate, 2676 .fallocate = ocfs2_fallocate,
2675 }; 2677 };
2676 2678
2677 const struct file_operations ocfs2_dops_no_plocks = { 2679 const struct file_operations ocfs2_dops_no_plocks = {
2678 .llseek = generic_file_llseek, 2680 .llseek = generic_file_llseek,
2679 .read = generic_read_dir, 2681 .read = generic_read_dir,
2680 .readdir = ocfs2_readdir, 2682 .readdir = ocfs2_readdir,
2681 .fsync = ocfs2_sync_file, 2683 .fsync = ocfs2_sync_file,
2682 .release = ocfs2_dir_release, 2684 .release = ocfs2_dir_release,
2683 .open = ocfs2_dir_open, 2685 .open = ocfs2_dir_open,
2684 .unlocked_ioctl = ocfs2_ioctl, 2686 .unlocked_ioctl = ocfs2_ioctl,
2685 #ifdef CONFIG_COMPAT 2687 #ifdef CONFIG_COMPAT
2686 .compat_ioctl = ocfs2_compat_ioctl, 2688 .compat_ioctl = ocfs2_compat_ioctl,
2687 #endif 2689 #endif
2688 .flock = ocfs2_flock, 2690 .flock = ocfs2_flock,
2689 }; 2691 };
2690 2692
1 /* 1 /*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5 #include <linux/time.h> 5 #include <linux/time.h>
6 #include <linux/fs.h> 6 #include <linux/fs.h>
7 #include <linux/reiserfs_fs.h> 7 #include <linux/reiserfs_fs.h>
8 #include <linux/reiserfs_acl.h> 8 #include <linux/reiserfs_acl.h>
9 #include <linux/reiserfs_xattr.h> 9 #include <linux/reiserfs_xattr.h>
10 #include <linux/exportfs.h> 10 #include <linux/exportfs.h>
11 #include <linux/pagemap.h> 11 #include <linux/pagemap.h>
12 #include <linux/highmem.h> 12 #include <linux/highmem.h>
13 #include <linux/slab.h> 13 #include <linux/slab.h>
14 #include <asm/uaccess.h> 14 #include <asm/uaccess.h>
15 #include <asm/unaligned.h> 15 #include <asm/unaligned.h>
16 #include <linux/buffer_head.h> 16 #include <linux/buffer_head.h>
17 #include <linux/mpage.h> 17 #include <linux/mpage.h>
18 #include <linux/writeback.h> 18 #include <linux/writeback.h>
19 #include <linux/quotaops.h> 19 #include <linux/quotaops.h>
20 #include <linux/swap.h> 20 #include <linux/swap.h>
21 21
22 int reiserfs_commit_write(struct file *f, struct page *page, 22 int reiserfs_commit_write(struct file *f, struct page *page,
23 unsigned from, unsigned to); 23 unsigned from, unsigned to);
24 24
25 void reiserfs_evict_inode(struct inode *inode) 25 void reiserfs_evict_inode(struct inode *inode)
26 { 26 {
27 /* We need blocks for transaction + (user+group) quota update (possibly delete) */ 27 /* We need blocks for transaction + (user+group) quota update (possibly delete) */
28 int jbegin_count = 28 int jbegin_count =
29 JOURNAL_PER_BALANCE_CNT * 2 + 29 JOURNAL_PER_BALANCE_CNT * 2 +
30 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); 30 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
31 struct reiserfs_transaction_handle th; 31 struct reiserfs_transaction_handle th;
32 int depth; 32 int depth;
33 int err; 33 int err;
34 34
35 if (!inode->i_nlink && !is_bad_inode(inode)) 35 if (!inode->i_nlink && !is_bad_inode(inode))
36 dquot_initialize(inode); 36 dquot_initialize(inode);
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages(&inode->i_data, 0);
39 if (inode->i_nlink) 39 if (inode->i_nlink)
40 goto no_delete; 40 goto no_delete;
41 41
42 depth = reiserfs_write_lock_once(inode->i_sb); 42 depth = reiserfs_write_lock_once(inode->i_sb);
43 43
44 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ 44 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
45 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ 45 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
46 reiserfs_delete_xattrs(inode); 46 reiserfs_delete_xattrs(inode);
47 47
48 if (journal_begin(&th, inode->i_sb, jbegin_count)) 48 if (journal_begin(&th, inode->i_sb, jbegin_count))
49 goto out; 49 goto out;
50 reiserfs_update_inode_transaction(inode); 50 reiserfs_update_inode_transaction(inode);
51 51
52 reiserfs_discard_prealloc(&th, inode); 52 reiserfs_discard_prealloc(&th, inode);
53 53
54 err = reiserfs_delete_object(&th, inode); 54 err = reiserfs_delete_object(&th, inode);
55 55
56 /* Do quota update inside a transaction for journaled quotas. We must do that 56 /* Do quota update inside a transaction for journaled quotas. We must do that
57 * after delete_object so that quota updates go into the same transaction as 57 * after delete_object so that quota updates go into the same transaction as
58 * stat data deletion */ 58 * stat data deletion */
59 if (!err) 59 if (!err)
60 dquot_free_inode(inode); 60 dquot_free_inode(inode);
61 61
62 if (journal_end(&th, inode->i_sb, jbegin_count)) 62 if (journal_end(&th, inode->i_sb, jbegin_count))
63 goto out; 63 goto out;
64 64
65 /* check return value from reiserfs_delete_object after 65 /* check return value from reiserfs_delete_object after
66 * ending the transaction 66 * ending the transaction
67 */ 67 */
68 if (err) 68 if (err)
69 goto out; 69 goto out;
70 70
71 /* all items of file are deleted, so we can remove "save" link */ 71 /* all items of file are deleted, so we can remove "save" link */
72 remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything 72 remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
73 * about an error here */ 73 * about an error here */
74 } else { 74 } else {
75 /* no object items are in the tree */ 75 /* no object items are in the tree */
76 ; 76 ;
77 } 77 }
78 out: 78 out:
79 end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */ 79 end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */
80 dquot_drop(inode); 80 dquot_drop(inode);
81 inode->i_blocks = 0; 81 inode->i_blocks = 0;
82 reiserfs_write_unlock_once(inode->i_sb, depth); 82 reiserfs_write_unlock_once(inode->i_sb, depth);
83 return; 83 return;
84 84
85 no_delete: 85 no_delete:
86 end_writeback(inode); 86 end_writeback(inode);
87 dquot_drop(inode); 87 dquot_drop(inode);
88 } 88 }
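
Note the ordering in the delete path above: dquot_free_inode() runs inside the same transaction as reiserfs_delete_object() so journaled quota updates stay atomic with the stat data deletion, and the delete error is acted on only after journal_end() so the transaction is always closed. A hedged sketch of that control-flow shape; every function here is a stand-in:

/* Hedged sketch of the transaction bracketing in
 * reiserfs_evict_inode(): always close the transaction, act on the
 * delete error only afterwards. */
#include <stdio.h>

static int journal_begin(void) { return 0; }
static int journal_end(void)   { return 0; }
static int delete_object(void) { return 0; }
static void free_quota(void)   { puts("quota freed in-transaction"); }

static void evict(void)
{
	int err;

	if (journal_begin())
		return;

	err = delete_object();
	if (!err)
		free_quota();	/* same transaction as the delete */

	if (journal_end())
		return;

	if (err)		/* checked only once the journal is closed */
		return;

	puts("object gone; safe to remove the save link");
}

int main(void)
{
	evict();
	return 0;
}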
89 89
90 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, 90 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
91 __u32 objectid, loff_t offset, int type, int length) 91 __u32 objectid, loff_t offset, int type, int length)
92 { 92 {
93 key->version = version; 93 key->version = version;
94 94
95 key->on_disk_key.k_dir_id = dirid; 95 key->on_disk_key.k_dir_id = dirid;
96 key->on_disk_key.k_objectid = objectid; 96 key->on_disk_key.k_objectid = objectid;
97 set_cpu_key_k_offset(key, offset); 97 set_cpu_key_k_offset(key, offset);
98 set_cpu_key_k_type(key, type); 98 set_cpu_key_k_type(key, type);
99 key->key_length = length; 99 key->key_length = length;
100 } 100 }
101 101
102 /* take the base of inode_key (dirid, objectid; it always comes from the inode) and the version from an inode, then set 102 /* take the base of inode_key (dirid, objectid; it always comes from the inode) and the version from an inode, then set
103 the offset and type of the key */ 103 the offset and type of the key */
104 void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, 104 void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
105 int type, int length) 105 int type, int length)
106 { 106 {
107 _make_cpu_key(key, get_inode_item_key_version(inode), 107 _make_cpu_key(key, get_inode_item_key_version(inode),
108 le32_to_cpu(INODE_PKEY(inode)->k_dir_id), 108 le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
109 le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, 109 le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
110 length); 110 length);
111 } 111 }
112 112
113 // 113 //
114 // when key is 0, do not set version and short key 114 // when key is 0, do not set version and short key
115 // 115 //
116 inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, 116 inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
117 int version, 117 int version,
118 loff_t offset, int type, int length, 118 loff_t offset, int type, int length,
119 int entry_count /*or ih_free_space */ ) 119 int entry_count /*or ih_free_space */ )
120 { 120 {
121 if (key) { 121 if (key) {
122 ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); 122 ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
123 ih->ih_key.k_objectid = 123 ih->ih_key.k_objectid =
124 cpu_to_le32(key->on_disk_key.k_objectid); 124 cpu_to_le32(key->on_disk_key.k_objectid);
125 } 125 }
126 put_ih_version(ih, version); 126 put_ih_version(ih, version);
127 set_le_ih_k_offset(ih, offset); 127 set_le_ih_k_offset(ih, offset);
128 set_le_ih_k_type(ih, type); 128 set_le_ih_k_type(ih, type);
129 put_ih_item_len(ih, length); 129 put_ih_item_len(ih, length);
130 /* set_ih_free_space (ih, 0); */ 130 /* set_ih_free_space (ih, 0); */
131 // for directory items it is entry count, for directs and stat 131 // for directory items it is entry count, for directs and stat
132 // datas - 0xffff, for indirects - 0 132 // datas - 0xffff, for indirects - 0
133 put_ih_entry_count(ih, entry_count); 133 put_ih_entry_count(ih, entry_count);
134 } 134 }
135 135
136 // 136 //
137 // FIXME: we might cache a recently accessed indirect item 137 // FIXME: we might cache a recently accessed indirect item
138 138
139 // Ugh. Not too eager for that.... 139 // Ugh. Not too eager for that....
140 // I cut the code until such time as I see a convincing argument (benchmark). 140 // I cut the code until such time as I see a convincing argument (benchmark).
141 // I don't want a bloated inode struct..., and I don't like code complexity.... 141 // I don't want a bloated inode struct..., and I don't like code complexity....
142 142
143 /* cutting the code is fine, since it really isn't in use yet and is easy 143 /* cutting the code is fine, since it really isn't in use yet and is easy
144 ** to add back in. But, Vladimir has a really good idea here. Think 144 ** to add back in. But, Vladimir has a really good idea here. Think
145 ** about what happens for reading a file. For each page, 145 ** about what happens for reading a file. For each page,
146 ** The VFS layer calls reiserfs_readpage, who searches the tree to find 146 ** The VFS layer calls reiserfs_readpage, who searches the tree to find
147 ** an indirect item. This indirect item has X number of pointers, where 147 ** an indirect item. This indirect item has X number of pointers, where
148 ** X is a big number if we've done the block allocation right. But, 148 ** X is a big number if we've done the block allocation right. But,
149 ** we only use one or two of these pointers during each call to readpage, 149 ** we only use one or two of these pointers during each call to readpage,
150 ** needlessly searching the tree again later on. 150 ** needlessly searching the tree again later on.
151 ** 151 **
152 ** The size of the cache could be dynamic based on the size of the file. 152 ** The size of the cache could be dynamic based on the size of the file.
153 ** 153 **
154 ** I'd also like to see us cache the location of the stat data item, since 154 ** I'd also like to see us cache the location of the stat data item, since
155 ** we are needlessly searching for that frequently. 155 ** we are needlessly searching for that frequently.
156 ** 156 **
157 ** --chris 157 ** --chris
158 */ 158 */
159 159
160 /* If this page has a file tail in it, and 160 /* If this page has a file tail in it, and
161 ** it was read in by get_block_create_0, the page data is valid, 161 ** it was read in by get_block_create_0, the page data is valid,
162 ** but tail is still sitting in a direct item, and we can't write to 162 ** but tail is still sitting in a direct item, and we can't write to
163 ** it. So, look through this page, and check all the mapped buffers 163 ** it. So, look through this page, and check all the mapped buffers
164 ** to make sure they have valid block numbers. Any that don't are 164 ** to make sure they have valid block numbers. Any that don't are
165 ** unmapped, so that __block_write_begin will correctly call 165 ** unmapped, so that __block_write_begin will correctly call
166 ** reiserfs_get_block to convert the tail into an unformatted node 166 ** reiserfs_get_block to convert the tail into an unformatted node
167 */ 167 */
168 static inline void fix_tail_page_for_writing(struct page *page) 168 static inline void fix_tail_page_for_writing(struct page *page)
169 { 169 {
170 struct buffer_head *head, *next, *bh; 170 struct buffer_head *head, *next, *bh;
171 171
172 if (page && page_has_buffers(page)) { 172 if (page && page_has_buffers(page)) {
173 head = page_buffers(page); 173 head = page_buffers(page);
174 bh = head; 174 bh = head;
175 do { 175 do {
176 next = bh->b_this_page; 176 next = bh->b_this_page;
177 if (buffer_mapped(bh) && bh->b_blocknr == 0) { 177 if (buffer_mapped(bh) && bh->b_blocknr == 0) {
178 reiserfs_unmap_buffer(bh); 178 reiserfs_unmap_buffer(bh);
179 } 179 }
180 bh = next; 180 bh = next;
181 } while (bh != head); 181 } while (bh != head);
182 } 182 }
183 } 183 }
184 184
185 /* reiserfs_get_block does not need to allocate a block if one has already 185 /* reiserfs_get_block does not need to allocate a block if one has already
186 been allocated or a non-hole position has been found in the indirect item */ 186 been allocated or a non-hole position has been found in the indirect item */
187 static inline int allocation_needed(int retval, b_blocknr_t allocated, 187 static inline int allocation_needed(int retval, b_blocknr_t allocated,
188 struct item_head *ih, 188 struct item_head *ih,
189 __le32 * item, int pos_in_item) 189 __le32 * item, int pos_in_item)
190 { 190 {
191 if (allocated) 191 if (allocated)
192 return 0; 192 return 0;
193 if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && 193 if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
194 get_block_num(item, pos_in_item)) 194 get_block_num(item, pos_in_item))
195 return 0; 195 return 0;
196 return 1; 196 return 1;
197 } 197 }
198 198
199 static inline int indirect_item_found(int retval, struct item_head *ih) 199 static inline int indirect_item_found(int retval, struct item_head *ih)
200 { 200 {
201 return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); 201 return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
202 } 202 }
203 203
204 static inline void set_block_dev_mapped(struct buffer_head *bh, 204 static inline void set_block_dev_mapped(struct buffer_head *bh,
205 b_blocknr_t block, struct inode *inode) 205 b_blocknr_t block, struct inode *inode)
206 { 206 {
207 map_bh(bh, inode->i_sb, block); 207 map_bh(bh, inode->i_sb, block);
208 } 208 }
209 209
210 // 210 //
211 // files which were created with the earlier (3.5) format cannot be 211 // files which were created with the earlier (3.5) format cannot be
212 // longer than 2 GB 212 // longer than 2 GB
213 // 213 //
214 static int file_capable(struct inode *inode, sector_t block) 214 static int file_capable(struct inode *inode, sector_t block)
215 { 215 {
216 if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is a new file. 216 if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is a new file.
217 block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is within 2 GB 217 block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is within 2 GB
218 return 1; 218 return 1;
219 219
220 return 0; 220 return 0;
221 } 221 }
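
The bound in file_capable() keeps old-format byte offsets within 31 bits: an old (KEY_FORMAT_3_5) file may only address logical blocks below 1 << (31 - s_blocksize_bits). A worked sketch of the arithmetic, assuming 4 KiB blocks:

/* Hedged sketch of the file_capable() limit. With 4 KiB blocks
 * (s_blocksize_bits = 12) the 3.5 key format tops out at
 * 1 << (31 - 12) = 524288 blocks, i.e. 2 GiB of file data. */
#include <stdio.h>

int main(void)
{
	int blocksize_bits = 12;	/* 4 KiB blocks, an assumption */
	long max_blocks = 1L << (31 - blocksize_bits);
	long long max_bytes = (long long)max_blocks << blocksize_bits;

	printf("limit: %ld blocks = %lld bytes (2 GiB)\n",
	       max_blocks, max_bytes);
	return 0;
}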
222 222
223 static int restart_transaction(struct reiserfs_transaction_handle *th, 223 static int restart_transaction(struct reiserfs_transaction_handle *th,
224 struct inode *inode, struct treepath *path) 224 struct inode *inode, struct treepath *path)
225 { 225 {
226 struct super_block *s = th->t_super; 226 struct super_block *s = th->t_super;
227 int len = th->t_blocks_allocated; 227 int len = th->t_blocks_allocated;
228 int err; 228 int err;
229 229
230 BUG_ON(!th->t_trans_id); 230 BUG_ON(!th->t_trans_id);
231 BUG_ON(!th->t_refcount); 231 BUG_ON(!th->t_refcount);
232 232
233 pathrelse(path); 233 pathrelse(path);
234 234
235 /* we cannot restart while nested */ 235 /* we cannot restart while nested */
236 if (th->t_refcount > 1) { 236 if (th->t_refcount > 1) {
237 return 0; 237 return 0;
238 } 238 }
239 reiserfs_update_sd(th, inode); 239 reiserfs_update_sd(th, inode);
240 err = journal_end(th, s, len); 240 err = journal_end(th, s, len);
241 if (!err) { 241 if (!err) {
242 err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); 242 err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
243 if (!err) 243 if (!err)
244 reiserfs_update_inode_transaction(inode); 244 reiserfs_update_inode_transaction(inode);
245 } 245 }
246 return err; 246 return err;
247 } 247 }
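
restart_transaction() may end and reopen the journal handle only when it is the outermost holder; with t_refcount > 1 a nested caller still expects the transaction open, so the restart is silently skipped. A hedged sketch of that guard with a stand-in handle type:

/* Hedged sketch of the nested-handle guard in restart_transaction():
 * only the outermost holder may close and reopen the handle. */
#include <stdio.h>

struct handle { int refcount; };

static int restart(struct handle *th)
{
	if (th->refcount > 1)	/* nested: an outer caller owns it */
		return 0;	/* report success, keep it open */

	puts("journal_end + journal_begin");
	return 0;
}

int main(void)
{
	struct handle th = { .refcount = 2 };
	restart(&th);		/* no-op: handle is nested */
	th.refcount = 1;
	restart(&th);		/* actually restarts the transaction */
	return 0;
}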
248 248
249 // it is called by get_block when create == 0. Returns block number 249 // it is called by get_block when create == 0. Returns block number
250 // for 'block'-th logical block of file. When it hits direct item it 250 // for 'block'-th logical block of file. When it hits direct item it
251 // returns 0 (being called from bmap) or reads the direct item into a piece 251 // returns 0 (being called from bmap) or reads the direct item into a piece
252 // of page (bh_result) 252 // of page (bh_result)
253 253
254 // Please improve the english/clarity in the comment above, as it is 254 // Please improve the english/clarity in the comment above, as it is
255 // hard to understand. 255 // hard to understand.
256 256
257 static int _get_block_create_0(struct inode *inode, sector_t block, 257 static int _get_block_create_0(struct inode *inode, sector_t block,
258 struct buffer_head *bh_result, int args) 258 struct buffer_head *bh_result, int args)
259 { 259 {
260 INITIALIZE_PATH(path); 260 INITIALIZE_PATH(path);
261 struct cpu_key key; 261 struct cpu_key key;
262 struct buffer_head *bh; 262 struct buffer_head *bh;
263 struct item_head *ih, tmp_ih; 263 struct item_head *ih, tmp_ih;
264 b_blocknr_t blocknr; 264 b_blocknr_t blocknr;
265 char *p = NULL; 265 char *p = NULL;
266 int chars; 266 int chars;
267 int ret; 267 int ret;
268 int result; 268 int result;
269 int done = 0; 269 int done = 0;
270 unsigned long offset; 270 unsigned long offset;
271 271
272 // prepare the key to look for the 'block'-th block of file 272 // prepare the key to look for the 'block'-th block of file
273 make_cpu_key(&key, inode, 273 make_cpu_key(&key, inode,
274 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 274 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
275 3); 275 3);
276 276
277 result = search_for_position_by_key(inode->i_sb, &key, &path); 277 result = search_for_position_by_key(inode->i_sb, &key, &path);
278 if (result != POSITION_FOUND) { 278 if (result != POSITION_FOUND) {
279 pathrelse(&path); 279 pathrelse(&path);
280 if (p) 280 if (p)
281 kunmap(bh_result->b_page); 281 kunmap(bh_result->b_page);
282 if (result == IO_ERROR) 282 if (result == IO_ERROR)
283 return -EIO; 283 return -EIO;
284 // We do not return -ENOENT if there is a hole but the page is uptodate, because it means 284 // We do not return -ENOENT if there is a hole but the page is uptodate, because it means
285 // that there is some mmapped data associated with it that is yet to be written to disk. 285 // that there is some mmapped data associated with it that is yet to be written to disk.
286 if ((args & GET_BLOCK_NO_HOLE) 286 if ((args & GET_BLOCK_NO_HOLE)
287 && !PageUptodate(bh_result->b_page)) { 287 && !PageUptodate(bh_result->b_page)) {
288 return -ENOENT; 288 return -ENOENT;
289 } 289 }
290 return 0; 290 return 0;
291 } 291 }
292 // 292 //
293 bh = get_last_bh(&path); 293 bh = get_last_bh(&path);
294 ih = get_ih(&path); 294 ih = get_ih(&path);
295 if (is_indirect_le_ih(ih)) { 295 if (is_indirect_le_ih(ih)) {
296 __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); 296 __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
297 297
298 /* FIXME: here we could cache indirect item or part of it in 298 /* FIXME: here we could cache indirect item or part of it in
299 the inode to avoid search_by_key in case of subsequent 299 the inode to avoid search_by_key in case of subsequent
300 access to file */ 300 access to file */
301 blocknr = get_block_num(ind_item, path.pos_in_item); 301 blocknr = get_block_num(ind_item, path.pos_in_item);
302 ret = 0; 302 ret = 0;
303 if (blocknr) { 303 if (blocknr) {
304 map_bh(bh_result, inode->i_sb, blocknr); 304 map_bh(bh_result, inode->i_sb, blocknr);
305 if (path.pos_in_item == 305 if (path.pos_in_item ==
306 ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { 306 ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
307 set_buffer_boundary(bh_result); 307 set_buffer_boundary(bh_result);
308 } 308 }
309 } else 309 } else
310 // We do not return -ENOENT if there is a hole but the page is uptodate, because 310 // We do not return -ENOENT if there is a hole but the page is uptodate, because
311 // that means there is some mmapped data associated with it that has yet to be written to disk. 311 // that means there is some mmapped data associated with it that has yet to be written to disk.
312 if ((args & GET_BLOCK_NO_HOLE) 312 if ((args & GET_BLOCK_NO_HOLE)
313 && !PageUptodate(bh_result->b_page)) { 313 && !PageUptodate(bh_result->b_page)) {
314 ret = -ENOENT; 314 ret = -ENOENT;
315 } 315 }
316 316
317 pathrelse(&path); 317 pathrelse(&path);
318 if (p) 318 if (p)
319 kunmap(bh_result->b_page); 319 kunmap(bh_result->b_page);
320 return ret; 320 return ret;
321 } 321 }
322 // requested data are in direct item(s) 322 // requested data are in direct item(s)
323 if (!(args & GET_BLOCK_READ_DIRECT)) { 323 if (!(args & GET_BLOCK_READ_DIRECT)) {
324 // we are called by bmap. FIXME: we can not map block of file 324 // we are called by bmap. FIXME: we can not map block of file
325 // when it is stored in direct item(s) 325 // when it is stored in direct item(s)
326 pathrelse(&path); 326 pathrelse(&path);
327 if (p) 327 if (p)
328 kunmap(bh_result->b_page); 328 kunmap(bh_result->b_page);
329 return -ENOENT; 329 return -ENOENT;
330 } 330 }
331 331
332 /* if we've got a direct item, and the buffer or page was uptodate, 332 /* if we've got a direct item, and the buffer or page was uptodate,
333 ** we don't want to pull data off disk again. Skip to the 333 ** we don't want to pull data off disk again. Skip to the
334 ** end, where we map the buffer and return 334 ** end, where we map the buffer and return
335 */ 335 */
336 if (buffer_uptodate(bh_result)) { 336 if (buffer_uptodate(bh_result)) {
337 goto finished; 337 goto finished;
338 } else 338 } else
339 /* 339 /*
340 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date 340 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
341 ** pages without any buffers. If the page is up to date, we don't want 341 ** pages without any buffers. If the page is up to date, we don't want
342 ** to read old data off disk. Set the uptodate bit on the buffer instead 342 ** to read old data off disk. Set the uptodate bit on the buffer instead
343 ** and jump to the end 343 ** and jump to the end
344 */ 344 */
345 if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { 345 if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
346 set_buffer_uptodate(bh_result); 346 set_buffer_uptodate(bh_result);
347 goto finished; 347 goto finished;
348 } 348 }
349 // read file tail into part of page 349 // read file tail into part of page
350 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); 350 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
351 copy_item_head(&tmp_ih, ih); 351 copy_item_head(&tmp_ih, ih);
352 352
353 /* we only want to kmap if we are reading the tail into the page. 353 /* we only want to kmap if we are reading the tail into the page.
354 ** this is not the common case, so we don't kmap until we are 354 ** this is not the common case, so we don't kmap until we are
355 ** sure we need to. But, this means the item might move if 355 ** sure we need to. But, this means the item might move if
356 ** kmap schedules 356 ** kmap schedules
357 */ 357 */
358 if (!p) 358 if (!p)
359 p = (char *)kmap(bh_result->b_page); 359 p = (char *)kmap(bh_result->b_page);
360 360
361 p += offset; 361 p += offset;
362 memset(p, 0, inode->i_sb->s_blocksize); 362 memset(p, 0, inode->i_sb->s_blocksize);
363 do { 363 do {
364 if (!is_direct_le_ih(ih)) { 364 if (!is_direct_le_ih(ih)) {
365 BUG(); 365 BUG();
366 } 366 }
367 /* make sure we don't read more bytes than actually exist in 367 /* make sure we don't read more bytes than actually exist in
368 ** the file. This can happen in odd cases where i_size isn't 368 ** the file. This can happen in odd cases where i_size isn't
369 ** correct, and when direct item padding results in a few 369 ** correct, and when direct item padding results in a few
370 ** extra bytes at the end of the direct item 370 ** extra bytes at the end of the direct item
371 */ 371 */
372 if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) 372 if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
373 break; 373 break;
374 if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { 374 if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
375 chars = 375 chars =
376 inode->i_size - (le_ih_k_offset(ih) - 1) - 376 inode->i_size - (le_ih_k_offset(ih) - 1) -
377 path.pos_in_item; 377 path.pos_in_item;
378 done = 1; 378 done = 1;
379 } else { 379 } else {
380 chars = ih_item_len(ih) - path.pos_in_item; 380 chars = ih_item_len(ih) - path.pos_in_item;
381 } 381 }
382 memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); 382 memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
383 383
384 if (done) 384 if (done)
385 break; 385 break;
386 386
387 p += chars; 387 p += chars;
388 388
389 if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) 389 if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
390 // we are done if the direct item we just read is not the last item in 390 // we are done if the direct item we just read is not the last item in
391 // the node. FIXME: we could check the right delimiting key to see 391 // the node. FIXME: we could check the right delimiting key to see
392 // whether the direct item continues in the right neighbor, or rely 392 // whether the direct item continues in the right neighbor, or rely
393 // on i_size 393 // on i_size
394 break; 394 break;
395 395
396 // update key to look for the next piece 396 // update key to look for the next piece
397 set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); 397 set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
398 result = search_for_position_by_key(inode->i_sb, &key, &path); 398 result = search_for_position_by_key(inode->i_sb, &key, &path);
399 if (result != POSITION_FOUND) 399 if (result != POSITION_FOUND)
400 // i/o error most likely 400 // i/o error most likely
401 break; 401 break;
402 bh = get_last_bh(&path); 402 bh = get_last_bh(&path);
403 ih = get_ih(&path); 403 ih = get_ih(&path);
404 } while (1); 404 } while (1);
405 405
406 flush_dcache_page(bh_result->b_page); 406 flush_dcache_page(bh_result->b_page);
407 kunmap(bh_result->b_page); 407 kunmap(bh_result->b_page);
408 408
409 finished: 409 finished:
410 pathrelse(&path); 410 pathrelse(&path);
411 411
412 if (result == IO_ERROR) 412 if (result == IO_ERROR)
413 return -EIO; 413 return -EIO;
414 414
415 /* this buffer has valid data, but isn't valid for io. mapping it to 415 /* this buffer has valid data, but isn't valid for io. mapping it to
416 * block #0 tells the rest of reiserfs it just has a tail in it 416 * block #0 tells the rest of reiserfs it just has a tail in it
417 */ 417 */
418 map_bh(bh_result, inode->i_sb, 0); 418 map_bh(bh_result, inode->i_sb, 0);
419 set_buffer_uptodate(bh_result); 419 set_buffer_uptodate(bh_result);
420 return 0; 420 return 0;
421 } 421 }
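
One detail worth calling out: reiserfs item keys address file bytes starting at 1, which is why the key above is built from block * blocksize + 1. A small illustrative helper (not in the source) for that mapping:

	/* illustration only: 1-based key offset of the 'block'-th logical block;
	 * block 3 with a 4096-byte block size maps to key offset 12289 */
	static inline loff_t block_to_key_offset(sector_t block, unsigned int blocksize)
	{
		return (loff_t)block * blocksize + 1;
	}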
422 422
423 // this is called to create file map. So, _get_block_create_0 will not 423 // this is called to create file map. So, _get_block_create_0 will not
424 // read direct item 424 // read direct item
425 static int reiserfs_bmap(struct inode *inode, sector_t block, 425 static int reiserfs_bmap(struct inode *inode, sector_t block,
426 struct buffer_head *bh_result, int create) 426 struct buffer_head *bh_result, int create)
427 { 427 {
428 if (!file_capable(inode, block)) 428 if (!file_capable(inode, block))
429 return -EFBIG; 429 return -EFBIG;
430 430
431 reiserfs_write_lock(inode->i_sb); 431 reiserfs_write_lock(inode->i_sb);
432 /* do not read the direct item */ 432 /* do not read the direct item */
433 _get_block_create_0(inode, block, bh_result, 0); 433 _get_block_create_0(inode, block, bh_result, 0);
434 reiserfs_write_unlock(inode->i_sb); 434 reiserfs_write_unlock(inode->i_sb);
435 return 0; 435 return 0;
436 } 436 }
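
reiserfs_bmap() always returns 0 here: for data stored in direct items, _get_block_create_0() leaves bh_result unmapped, and the generic bmap code reports an unmapped buffer to userspace as block 0. A plausible address_space wrapper, assuming the kernel's generic_block_bmap() helper (a sketch, not necessarily this file's exact aop):

	static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
	{
		return generic_block_bmap(as, block, reiserfs_bmap);
	}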
437 437
438 /* special version of get_block that is only used by grab_tail_page right 438 /* special version of get_block that is only used by grab_tail_page right
439 ** now. It is sent to __block_write_begin, and when you try to get a 439 ** now. It is sent to __block_write_begin, and when you try to get a
440 ** block past the end of the file (or a block from a hole) it returns 440 ** block past the end of the file (or a block from a hole) it returns
441 ** -ENOENT instead of a valid buffer. __block_write_begin expects to 441 ** -ENOENT instead of a valid buffer. __block_write_begin expects to
442 ** be able to do i/o on the buffers returned, unless an error value 442 ** be able to do i/o on the buffers returned, unless an error value
443 ** is also returned. 443 ** is also returned.
444 ** 444 **
445 ** So, this allows __block_write_begin to be used for reading a single block 445 ** So, this allows __block_write_begin to be used for reading a single block
446 ** in a page, without producing valid buffers for holes or for blocks past 446 ** in a page, without producing valid buffers for holes or for blocks past
447 ** the end of the file. This turns out to be exactly what we need for reading 447 ** the end of the file. This turns out to be exactly what we need for reading
448 ** tails for conversion. 448 ** tails for conversion.
449 ** 449 **
450 ** The point of the wrapper is forcing a certain value for create, even 450 ** The point of the wrapper is forcing a certain value for create, even
451 ** though the VFS layer is calling this function with create==1. If you 451 ** though the VFS layer is calling this function with create==1. If you
452 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, 452 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
453 ** don't use this function. 453 ** don't use this function.
454 */ 454 */
455 static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, 455 static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
456 struct buffer_head *bh_result, 456 struct buffer_head *bh_result,
457 int create) 457 int create)
458 { 458 {
459 return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); 459 return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
460 } 460 }
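
The create/args word threaded through these functions is a bit mask. For reference, a summary of the flags used in this file (values as defined in reiserfs_fs.h of this kernel generation; treat the exact numbers as an assumption):

	/* GET_BLOCK_CREATE       1   allocate whatever is needed to map the block
	 * GET_BLOCK_NO_HOLE      2   return -ENOENT for holes instead of mapping them
	 * GET_BLOCK_READ_DIRECT  4   read tail data from direct items into the page
	 * GET_BLOCK_NO_IMUX      8   i_mutex is not held, skip preallocation
	 * GET_BLOCK_NO_DANGLE   16   do not leave a transaction running on return
	 */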
461 461
462 /* This is special helper for reiserfs_get_block in case we are executing 462 /* This is special helper for reiserfs_get_block in case we are executing
463 direct_IO request. */ 463 direct_IO request. */
464 static int reiserfs_get_blocks_direct_io(struct inode *inode, 464 static int reiserfs_get_blocks_direct_io(struct inode *inode,
465 sector_t iblock, 465 sector_t iblock,
466 struct buffer_head *bh_result, 466 struct buffer_head *bh_result,
467 int create) 467 int create)
468 { 468 {
469 int ret; 469 int ret;
470 470
471 bh_result->b_page = NULL; 471 bh_result->b_page = NULL;
472 472
473 /* We set the b_size before reiserfs_get_block call since it is 473 /* We set the b_size before reiserfs_get_block call since it is
474 referenced in convert_tail_for_hole() that may be called from 474 referenced in convert_tail_for_hole() that may be called from
475 reiserfs_get_block() */ 475 reiserfs_get_block() */
476 bh_result->b_size = (1 << inode->i_blkbits); 476 bh_result->b_size = (1 << inode->i_blkbits);
477 477
478 ret = reiserfs_get_block(inode, iblock, bh_result, 478 ret = reiserfs_get_block(inode, iblock, bh_result,
479 create | GET_BLOCK_NO_DANGLE); 479 create | GET_BLOCK_NO_DANGLE);
480 if (ret) 480 if (ret)
481 goto out; 481 goto out;
482 482
483 /* don't allow direct io onto tail pages */ 483 /* don't allow direct io onto tail pages */
484 if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { 484 if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
485 /* make sure future calls to the direct io funcs for this offset 485 /* make sure future calls to the direct io funcs for this offset
486 ** in the file fail by unmapping the buffer 486 ** in the file fail by unmapping the buffer
487 */ 487 */
488 clear_buffer_mapped(bh_result); 488 clear_buffer_mapped(bh_result);
489 ret = -EINVAL; 489 ret = -EINVAL;
490 } 490 }
491 /* Possible unpacked tail. Flush the data before pages have 491 /* Possible unpacked tail. Flush the data before pages have
492 disappeared */ 492 disappeared */
493 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { 493 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
494 int err; 494 int err;
495 495
496 reiserfs_write_lock(inode->i_sb); 496 reiserfs_write_lock(inode->i_sb);
497 497
498 err = reiserfs_commit_for_inode(inode); 498 err = reiserfs_commit_for_inode(inode);
499 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 499 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
500 500
501 reiserfs_write_unlock(inode->i_sb); 501 reiserfs_write_unlock(inode->i_sb);
502 502
503 if (err < 0) 503 if (err < 0)
504 ret = err; 504 ret = err;
505 } 505 }
506 out: 506 out:
507 return ret; 507 return ret;
508 } 508 }
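
For orientation, this helper is the get_block callback handed to the generic direct-I/O machinery. A sketch of the consumer, assuming the 3.x-era blockdev_direct_IO() prototype and eliding the cleanup the real caller performs after a short write:

	static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
					  const struct iovec *iov, loff_t offset,
					  unsigned long nr_segs)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;

		/* hand block mapping off to the helper above */
		return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
					  reiserfs_get_blocks_direct_io);
	}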
509 509
510 /* 510 /*
511 ** helper function for when reiserfs_get_block is called for a hole 511 ** helper function for when reiserfs_get_block is called for a hole
512 ** but the file tail is still in a direct item 512 ** but the file tail is still in a direct item
513 ** bh_result is the buffer head for the hole 513 ** bh_result is the buffer head for the hole
514 ** tail_offset is the offset of the start of the tail in the file 514 ** tail_offset is the offset of the start of the tail in the file
515 ** 515 **
516 ** This calls __reiserfs_write_begin, which will start a new transaction; 516 ** This calls __reiserfs_write_begin, which will start a new transaction;
517 ** you should not be in a transaction or have any paths held when you 517 ** you should not be in a transaction or have any paths held when you
518 ** call this. 518 ** call this.
519 */ 519 */
520 static int convert_tail_for_hole(struct inode *inode, 520 static int convert_tail_for_hole(struct inode *inode,
521 struct buffer_head *bh_result, 521 struct buffer_head *bh_result,
522 loff_t tail_offset) 522 loff_t tail_offset)
523 { 523 {
524 unsigned long index; 524 unsigned long index;
525 unsigned long tail_end; 525 unsigned long tail_end;
526 unsigned long tail_start; 526 unsigned long tail_start;
527 struct page *tail_page; 527 struct page *tail_page;
528 struct page *hole_page = bh_result->b_page; 528 struct page *hole_page = bh_result->b_page;
529 int retval = 0; 529 int retval = 0;
530 530
531 if ((tail_offset & (bh_result->b_size - 1)) != 1) 531 if ((tail_offset & (bh_result->b_size - 1)) != 1)
532 return -EIO; 532 return -EIO;
533 533
534 /* always try to read until the end of the block */ 534 /* always try to read until the end of the block */
535 tail_start = tail_offset & (PAGE_CACHE_SIZE - 1); 535 tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
536 tail_end = (tail_start | (bh_result->b_size - 1)) + 1; 536 tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
537 537
538 index = tail_offset >> PAGE_CACHE_SHIFT; 538 index = tail_offset >> PAGE_CACHE_SHIFT;
539 /* hole_page can be NULL in the direct_io case; we know we 539 /* hole_page can be NULL in the direct_io case; we know we
540 cannot get here when writing with O_DIRECT into a 540 cannot get here when writing with O_DIRECT into a
541 tail page */ 541 tail page */
542 if (!hole_page || index != hole_page->index) { 542 if (!hole_page || index != hole_page->index) {
543 tail_page = grab_cache_page(inode->i_mapping, index); 543 tail_page = grab_cache_page(inode->i_mapping, index);
544 retval = -ENOMEM; 544 retval = -ENOMEM;
545 if (!tail_page) { 545 if (!tail_page) {
546 goto out; 546 goto out;
547 } 547 }
548 } else { 548 } else {
549 tail_page = hole_page; 549 tail_page = hole_page;
550 } 550 }
551 551
552 /* we don't have to make sure the conversion did not happen while 552 /* we don't have to make sure the conversion did not happen while
553 ** we were locking the page because anyone that could convert 553 ** we were locking the page because anyone that could convert
554 ** must first take i_mutex. 554 ** must first take i_mutex.
555 ** 555 **
556 ** We must fix the tail page for writing because it might have buffers 556 ** We must fix the tail page for writing because it might have buffers
557 ** that are mapped, but have a block number of 0. This indicates tail 557 ** that are mapped, but have a block number of 0. This indicates tail
558 ** data that has been read directly into the page, and 558 ** data that has been read directly into the page, and
559 ** __block_write_begin won't trigger a get_block in this case. 559 ** __block_write_begin won't trigger a get_block in this case.
560 */ 560 */
561 fix_tail_page_for_writing(tail_page); 561 fix_tail_page_for_writing(tail_page);
562 retval = __reiserfs_write_begin(tail_page, tail_start, 562 retval = __reiserfs_write_begin(tail_page, tail_start,
563 tail_end - tail_start); 563 tail_end - tail_start);
564 if (retval) 564 if (retval)
565 goto unlock; 565 goto unlock;
566 566
567 /* tail conversion might change the data in the page */ 567 /* tail conversion might change the data in the page */
568 flush_dcache_page(tail_page); 568 flush_dcache_page(tail_page);
569 569
570 retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); 570 retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
571 571
572 unlock: 572 unlock:
573 if (tail_page != hole_page) { 573 if (tail_page != hole_page) {
574 unlock_page(tail_page); 574 unlock_page(tail_page);
575 page_cache_release(tail_page); 575 page_cache_release(tail_page);
576 } 576 }
577 out: 577 out:
578 return retval; 578 return retval;
579 } 579 }
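
The tail_start/tail_end arithmetic above is easiest to see with concrete numbers (illustrative values only):

	/* PAGE_CACHE_SIZE = 4096, block size (bh_result->b_size) = 1024,
	 * tail_offset = 5121, a 1-based key offset (i.e. file byte 5120):
	 *
	 *   tail_start = 5121 & 4095       = 1025  offset within the page
	 *   tail_end   = (1025 | 1023) + 1 = 2048  read to the end of the block
	 *   index      = 5121 >> 12        = 1     second page of the file
	 *
	 * and 5121 & 1023 == 1, satisfying the alignment check at the top.
	 */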
580 580
581 static inline int _allocate_block(struct reiserfs_transaction_handle *th, 581 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
582 sector_t block, 582 sector_t block,
583 struct inode *inode, 583 struct inode *inode,
584 b_blocknr_t * allocated_block_nr, 584 b_blocknr_t * allocated_block_nr,
585 struct treepath *path, int flags) 585 struct treepath *path, int flags)
586 { 586 {
587 BUG_ON(!th->t_trans_id); 587 BUG_ON(!th->t_trans_id);
588 588
589 #ifdef REISERFS_PREALLOCATE 589 #ifdef REISERFS_PREALLOCATE
590 if (!(flags & GET_BLOCK_NO_IMUX)) { 590 if (!(flags & GET_BLOCK_NO_IMUX)) {
591 return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, 591 return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
592 path, block); 592 path, block);
593 } 593 }
594 #endif 594 #endif
595 return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, 595 return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
596 block); 596 block);
597 } 597 }
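
In other words, when REISERFS_PREALLOCATE is configured and the caller holds i_mutex (GET_BLOCK_NO_IMUX clear), allocation goes through the preallocating variant; otherwise a single block is allocated. Hypothetical call sites showing the two paths:

	/* illustration (hypothetical callers):
	 *   _allocate_block(th, block, inode, &nr, &path, GET_BLOCK_CREATE)
	 *	-> reiserfs_new_unf_blocknrs2(): may preallocate a cluster
	 *   _allocate_block(th, block, inode, &nr, &path,
	 *		     GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX)
	 *	-> reiserfs_new_unf_blocknrs(): one block, no preallocation
	 */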
598 598
599 int reiserfs_get_block(struct inode *inode, sector_t block, 599 int reiserfs_get_block(struct inode *inode, sector_t block,
600 struct buffer_head *bh_result, int create) 600 struct buffer_head *bh_result, int create)
601 { 601 {
602 int repeat, retval = 0; 602 int repeat, retval = 0;
603 b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int 603 b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int
604 INITIALIZE_PATH(path); 604 INITIALIZE_PATH(path);
605 int pos_in_item; 605 int pos_in_item;
606 struct cpu_key key; 606 struct cpu_key key;
607 struct buffer_head *bh, *unbh = NULL; 607 struct buffer_head *bh, *unbh = NULL;
608 struct item_head *ih, tmp_ih; 608 struct item_head *ih, tmp_ih;
609 __le32 *item; 609 __le32 *item;
610 int done; 610 int done;
611 int fs_gen; 611 int fs_gen;
612 int lock_depth; 612 int lock_depth;
613 struct reiserfs_transaction_handle *th = NULL; 613 struct reiserfs_transaction_handle *th = NULL;
614 /* space reserved in transaction batch: 614 /* space reserved in transaction batch:
615 . 3 balancings in direct->indirect conversion 615 . 3 balancings in direct->indirect conversion
616 . 1 block involved in reiserfs_update_sd() 616 . 1 block involved in reiserfs_update_sd()
617 . quota updates for user and group 617 . quota updates for user and group
618 XXX: in the practically impossible worst case, direct2indirect() 618 XXX: in the practically impossible worst case, direct2indirect()
619 can incur (much) more than 3 balancings. */ 619 can incur (much) more than 3 balancings. */
620 int jbegin_count = 620 int jbegin_count =
621 JOURNAL_PER_BALANCE_CNT * 3 + 1 + 621 JOURNAL_PER_BALANCE_CNT * 3 + 1 +
622 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); 622 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
623 int version; 623 int version;
624 int dangle = 1; 624 int dangle = 1;
625 loff_t new_offset = 625 loff_t new_offset =
626 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; 626 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
627 627
628 lock_depth = reiserfs_write_lock_once(inode->i_sb); 628 lock_depth = reiserfs_write_lock_once(inode->i_sb);
629 version = get_inode_item_key_version(inode); 629 version = get_inode_item_key_version(inode);
630 630
631 if (!file_capable(inode, block)) { 631 if (!file_capable(inode, block)) {
632 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 632 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
633 return -EFBIG; 633 return -EFBIG;
634 } 634 }
635 635
636 /* if !create, we aren't changing the FS, so we don't need to 636 /* if !create, we aren't changing the FS, so we don't need to
637 ** log anything, so we don't need to start a transaction 637 ** log anything, so we don't need to start a transaction
638 */ 638 */
639 if (!(create & GET_BLOCK_CREATE)) { 639 if (!(create & GET_BLOCK_CREATE)) {
640 int ret; 640 int ret;
641 /* find number of block-th logical block of the file */ 641 /* find number of block-th logical block of the file */
642 ret = _get_block_create_0(inode, block, bh_result, 642 ret = _get_block_create_0(inode, block, bh_result,
643 create | GET_BLOCK_READ_DIRECT); 643 create | GET_BLOCK_READ_DIRECT);
644 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 644 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
645 return ret; 645 return ret;
646 } 646 }
647 /* 647 /*
648 * if we're already in a transaction, make sure to close 648 * if we're already in a transaction, make sure to close
649 * any new transactions we start in this func 649 * any new transactions we start in this func
650 */ 650 */
651 if ((create & GET_BLOCK_NO_DANGLE) || 651 if ((create & GET_BLOCK_NO_DANGLE) ||
652 reiserfs_transaction_running(inode->i_sb)) 652 reiserfs_transaction_running(inode->i_sb))
653 dangle = 0; 653 dangle = 0;
654 654
655 /* If the file is small enough that it might have a tail, and tails are 655 /* If the file is small enough that it might have a tail, and tails are
656 ** enabled, we should mark it as possibly needing tail packing on close 656 ** enabled, we should mark it as possibly needing tail packing on close
657 */ 657 */
658 if ((have_large_tails(inode->i_sb) 658 if ((have_large_tails(inode->i_sb)
659 && inode->i_size < i_block_size(inode) * 4) 659 && inode->i_size < i_block_size(inode) * 4)
660 || (have_small_tails(inode->i_sb) 660 || (have_small_tails(inode->i_sb)
661 && inode->i_size < i_block_size(inode))) 661 && inode->i_size < i_block_size(inode)))
662 REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; 662 REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
663 663
664 /* set the key of the first byte in the 'block'-th block of file */ 664 /* set the key of the first byte in the 'block'-th block of file */
665 make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); 665 make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
666 if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { 666 if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
667 start_trans: 667 start_trans:
668 th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); 668 th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
669 if (!th) { 669 if (!th) {
670 retval = -ENOMEM; 670 retval = -ENOMEM;
671 goto failure; 671 goto failure;
672 } 672 }
673 reiserfs_update_inode_transaction(inode); 673 reiserfs_update_inode_transaction(inode);
674 } 674 }
675 research: 675 research:
676 676
677 retval = search_for_position_by_key(inode->i_sb, &key, &path); 677 retval = search_for_position_by_key(inode->i_sb, &key, &path);
678 if (retval == IO_ERROR) { 678 if (retval == IO_ERROR) {
679 retval = -EIO; 679 retval = -EIO;
680 goto failure; 680 goto failure;
681 } 681 }
682 682
683 bh = get_last_bh(&path); 683 bh = get_last_bh(&path);
684 ih = get_ih(&path); 684 ih = get_ih(&path);
685 item = get_item(&path); 685 item = get_item(&path);
686 pos_in_item = path.pos_in_item; 686 pos_in_item = path.pos_in_item;
687 687
688 fs_gen = get_generation(inode->i_sb); 688 fs_gen = get_generation(inode->i_sb);
689 copy_item_head(&tmp_ih, ih); 689 copy_item_head(&tmp_ih, ih);
690 690
691 if (allocation_needed 691 if (allocation_needed
692 (retval, allocated_block_nr, ih, item, pos_in_item)) { 692 (retval, allocated_block_nr, ih, item, pos_in_item)) {
693 /* we have to allocate block for the unformatted node */ 693 /* we have to allocate block for the unformatted node */
694 if (!th) { 694 if (!th) {
695 pathrelse(&path); 695 pathrelse(&path);
696 goto start_trans; 696 goto start_trans;
697 } 697 }
698 698
699 repeat = 699 repeat =
700 _allocate_block(th, block, inode, &allocated_block_nr, 700 _allocate_block(th, block, inode, &allocated_block_nr,
701 &path, create); 701 &path, create);
702 702
703 if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { 703 if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
704 /* restart the transaction to give the journal a chance to free 704 /* restart the transaction to give the journal a chance to free
705 ** some blocks. This releases the path, so we have to go back to 705 ** some blocks. This releases the path, so we have to go back to
706 ** research if we succeed on the second try 706 ** research if we succeed on the second try
707 */ 707 */
708 SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; 708 SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
709 retval = restart_transaction(th, inode, &path); 709 retval = restart_transaction(th, inode, &path);
710 if (retval) 710 if (retval)
711 goto failure; 711 goto failure;
712 repeat = 712 repeat =
713 _allocate_block(th, block, inode, 713 _allocate_block(th, block, inode,
714 &allocated_block_nr, NULL, create); 714 &allocated_block_nr, NULL, create);
715 715
716 if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { 716 if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
717 goto research; 717 goto research;
718 } 718 }
719 if (repeat == QUOTA_EXCEEDED) 719 if (repeat == QUOTA_EXCEEDED)
720 retval = -EDQUOT; 720 retval = -EDQUOT;
721 else 721 else
722 retval = -ENOSPC; 722 retval = -ENOSPC;
723 goto failure; 723 goto failure;
724 } 724 }
725 725
726 if (fs_changed(fs_gen, inode->i_sb) 726 if (fs_changed(fs_gen, inode->i_sb)
727 && item_moved(&tmp_ih, &path)) { 727 && item_moved(&tmp_ih, &path)) {
728 goto research; 728 goto research;
729 } 729 }
730 } 730 }
731 731
732 if (indirect_item_found(retval, ih)) { 732 if (indirect_item_found(retval, ih)) {
733 b_blocknr_t unfm_ptr; 733 b_blocknr_t unfm_ptr;
734 /* 'block'-th block is in the file already (there is 734 /* 'block'-th block is in the file already (there is
735 corresponding cell in some indirect item). But it may be 735 corresponding cell in some indirect item). But it may be
736 zero unformatted node pointer (hole) */ 736 zero unformatted node pointer (hole) */
737 unfm_ptr = get_block_num(item, pos_in_item); 737 unfm_ptr = get_block_num(item, pos_in_item);
738 if (unfm_ptr == 0) { 738 if (unfm_ptr == 0) {
739 /* use allocated block to plug the hole */ 739 /* use allocated block to plug the hole */
740 reiserfs_prepare_for_journal(inode->i_sb, bh, 1); 740 reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
741 if (fs_changed(fs_gen, inode->i_sb) 741 if (fs_changed(fs_gen, inode->i_sb)
742 && item_moved(&tmp_ih, &path)) { 742 && item_moved(&tmp_ih, &path)) {
743 reiserfs_restore_prepared_buffer(inode->i_sb, 743 reiserfs_restore_prepared_buffer(inode->i_sb,
744 bh); 744 bh);
745 goto research; 745 goto research;
746 } 746 }
747 set_buffer_new(bh_result); 747 set_buffer_new(bh_result);
748 if (buffer_dirty(bh_result) 748 if (buffer_dirty(bh_result)
749 && reiserfs_data_ordered(inode->i_sb)) 749 && reiserfs_data_ordered(inode->i_sb))
750 reiserfs_add_ordered_list(inode, bh_result); 750 reiserfs_add_ordered_list(inode, bh_result);
751 put_block_num(item, pos_in_item, allocated_block_nr); 751 put_block_num(item, pos_in_item, allocated_block_nr);
752 unfm_ptr = allocated_block_nr; 752 unfm_ptr = allocated_block_nr;
753 journal_mark_dirty(th, inode->i_sb, bh); 753 journal_mark_dirty(th, inode->i_sb, bh);
754 reiserfs_update_sd(th, inode); 754 reiserfs_update_sd(th, inode);
755 } 755 }
756 set_block_dev_mapped(bh_result, unfm_ptr, inode); 756 set_block_dev_mapped(bh_result, unfm_ptr, inode);
757 pathrelse(&path); 757 pathrelse(&path);
758 retval = 0; 758 retval = 0;
759 if (!dangle && th) 759 if (!dangle && th)
760 retval = reiserfs_end_persistent_transaction(th); 760 retval = reiserfs_end_persistent_transaction(th);
761 761
762 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 762 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
763 763
764 /* the item was found, so new blocks were not added to the file; 764 /* the item was found, so new blocks were not added to the file;
765 ** there is no need to make sure the inode is updated with this 765 ** there is no need to make sure the inode is updated with this
766 ** transaction 766 ** transaction
767 */ 767 */
768 return retval; 768 return retval;
769 } 769 }
770 770
771 if (!th) { 771 if (!th) {
772 pathrelse(&path); 772 pathrelse(&path);
773 goto start_trans; 773 goto start_trans;
774 } 774 }
775 775
776 /* desired position is not found or is in the direct item. We have 776 /* desired position is not found or is in the direct item. We have
777 to append file with holes up to 'block'-th block converting 777 to append file with holes up to 'block'-th block converting
778 direct items to indirect one if necessary */ 778 direct items to indirect one if necessary */
779 done = 0; 779 done = 0;
780 do { 780 do {
781 if (is_statdata_le_ih(ih)) { 781 if (is_statdata_le_ih(ih)) {
782 __le32 unp = 0; 782 __le32 unp = 0;
783 struct cpu_key tmp_key; 783 struct cpu_key tmp_key;
784 784
785 /* indirect item has to be inserted */ 785 /* indirect item has to be inserted */
786 make_le_item_head(&tmp_ih, &key, version, 1, 786 make_le_item_head(&tmp_ih, &key, version, 1,
787 TYPE_INDIRECT, UNFM_P_SIZE, 787 TYPE_INDIRECT, UNFM_P_SIZE,
788 0 /* free_space */ ); 788 0 /* free_space */ );
789 789
790 if (cpu_key_k_offset(&key) == 1) { 790 if (cpu_key_k_offset(&key) == 1) {
791 /* we are going to add 'block'-th block to the file. Use 791 /* we are going to add 'block'-th block to the file. Use
792 allocated block for that */ 792 allocated block for that */
793 unp = cpu_to_le32(allocated_block_nr); 793 unp = cpu_to_le32(allocated_block_nr);
794 set_block_dev_mapped(bh_result, 794 set_block_dev_mapped(bh_result,
795 allocated_block_nr, inode); 795 allocated_block_nr, inode);
796 set_buffer_new(bh_result); 796 set_buffer_new(bh_result);
797 done = 1; 797 done = 1;
798 } 798 }
799 tmp_key = key; // structure copy 799 tmp_key = key; // structure copy
800 set_cpu_key_k_offset(&tmp_key, 1); 800 set_cpu_key_k_offset(&tmp_key, 1);
801 PATH_LAST_POSITION(&path)++; 801 PATH_LAST_POSITION(&path)++;
802 802
803 retval = 803 retval =
804 reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, 804 reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
805 inode, (char *)&unp); 805 inode, (char *)&unp);
806 if (retval) { 806 if (retval) {
807 reiserfs_free_block(th, inode, 807 reiserfs_free_block(th, inode,
808 allocated_block_nr, 1); 808 allocated_block_nr, 1);
809 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST 809 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
810 } 810 }
811 //mark_tail_converted (inode); 811 //mark_tail_converted (inode);
812 } else if (is_direct_le_ih(ih)) { 812 } else if (is_direct_le_ih(ih)) {
813 /* direct item has to be converted */ 813 /* direct item has to be converted */
814 loff_t tail_offset; 814 loff_t tail_offset;
815 815
816 tail_offset = 816 tail_offset =
817 ((le_ih_k_offset(ih) - 817 ((le_ih_k_offset(ih) -
818 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; 818 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
819 if (tail_offset == cpu_key_k_offset(&key)) { 819 if (tail_offset == cpu_key_k_offset(&key)) {
820 /* direct item we just found fits into block we have 820 /* direct item we just found fits into block we have
821 to map. Convert it into unformatted node: use 821 to map. Convert it into unformatted node: use
822 bh_result for the conversion */ 822 bh_result for the conversion */
823 set_block_dev_mapped(bh_result, 823 set_block_dev_mapped(bh_result,
824 allocated_block_nr, inode); 824 allocated_block_nr, inode);
825 unbh = bh_result; 825 unbh = bh_result;
826 done = 1; 826 done = 1;
827 } else { 827 } else {
828 /* we have to pad the file tail stored in direct item(s) 828 /* we have to pad the file tail stored in direct item(s)
829 up to block size and convert it to an unformatted 829 up to block size and convert it to an unformatted
830 node. FIXME: this should also go into the page cache */ 830 node. FIXME: this should also go into the page cache */
831 831
832 pathrelse(&path); 832 pathrelse(&path);
833 /* 833 /*
834 * ugly, but we can only end the transaction if 834 * ugly, but we can only end the transaction if
835 * we aren't nested 835 * we aren't nested
836 */ 836 */
837 BUG_ON(!th->t_refcount); 837 BUG_ON(!th->t_refcount);
838 if (th->t_refcount == 1) { 838 if (th->t_refcount == 1) {
839 retval = 839 retval =
840 reiserfs_end_persistent_transaction 840 reiserfs_end_persistent_transaction
841 (th); 841 (th);
842 th = NULL; 842 th = NULL;
843 if (retval) 843 if (retval)
844 goto failure; 844 goto failure;
845 } 845 }
846 846
847 retval = 847 retval =
848 convert_tail_for_hole(inode, bh_result, 848 convert_tail_for_hole(inode, bh_result,
849 tail_offset); 849 tail_offset);
850 if (retval) { 850 if (retval) {
851 if (retval != -ENOSPC) 851 if (retval != -ENOSPC)
852 reiserfs_error(inode->i_sb, 852 reiserfs_error(inode->i_sb,
853 "clm-6004", 853 "clm-6004",
854 "convert tail failed " 854 "convert tail failed "
855 "inode %lu, error %d", 855 "inode %lu, error %d",
856 inode->i_ino, 856 inode->i_ino,
857 retval); 857 retval);
858 if (allocated_block_nr) { 858 if (allocated_block_nr) {
859 /* the bitmap, the super, and the stat data == 3 */ 859 /* the bitmap, the super, and the stat data == 3 */
860 if (!th) 860 if (!th)
861 th = reiserfs_persistent_transaction(inode->i_sb, 3); 861 th = reiserfs_persistent_transaction(inode->i_sb, 3);
862 if (th) 862 if (th)
863 reiserfs_free_block(th, 863 reiserfs_free_block(th,
864 inode, 864 inode,
865 allocated_block_nr, 865 allocated_block_nr,
866 1); 866 1);
867 } 867 }
868 goto failure; 868 goto failure;
869 } 869 }
870 goto research; 870 goto research;
871 } 871 }
872 retval = 872 retval =
873 direct2indirect(th, inode, &path, unbh, 873 direct2indirect(th, inode, &path, unbh,
874 tail_offset); 874 tail_offset);
875 if (retval) { 875 if (retval) {
876 reiserfs_unmap_buffer(unbh); 876 reiserfs_unmap_buffer(unbh);
877 reiserfs_free_block(th, inode, 877 reiserfs_free_block(th, inode,
878 allocated_block_nr, 1); 878 allocated_block_nr, 1);
879 goto failure; 879 goto failure;
880 } 880 }
881 /* it is important the set_buffer_uptodate is done after 881 /* it is important the set_buffer_uptodate is done after
882 ** the direct2indirect. The buffer might contain valid 882 ** the direct2indirect. The buffer might contain valid
883 ** data newer than the data on disk (read by readpage, changed, 883 ** data newer than the data on disk (read by readpage, changed,
884 ** and then sent here by writepage). direct2indirect needs 884 ** and then sent here by writepage). direct2indirect needs
885 ** to know if unbh was already up to date, so it can decide 885 ** to know if unbh was already up to date, so it can decide
886 ** if the data in unbh needs to be replaced with data from 886 ** if the data in unbh needs to be replaced with data from
887 ** the disk 887 ** the disk
888 */ 888 */
889 set_buffer_uptodate(unbh); 889 set_buffer_uptodate(unbh);
890 890
891 /* unbh->b_page == NULL in case of a DIRECT_IO request; this means 891 /* unbh->b_page == NULL in case of a DIRECT_IO request; this means
892 the buffer will disappear shortly, so it should not be added to 892 the buffer will disappear shortly, so it should not be added to
893 the tail list */ 893 the tail list */
894 if (unbh->b_page) { 894 if (unbh->b_page) {
895 /* we've converted the tail, so we must 895 /* we've converted the tail, so we must
896 ** flush unbh before the transaction commits 896 ** flush unbh before the transaction commits
897 */ 897 */
898 reiserfs_add_tail_list(inode, unbh); 898 reiserfs_add_tail_list(inode, unbh);
899 899
900 /* mark it dirty now to prevent commit_write from adding 900 /* mark it dirty now to prevent commit_write from adding
901 ** this buffer to the inode's dirty buffer list 901 ** this buffer to the inode's dirty buffer list
902 */ 902 */
903 /* 903 /*
904 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 904 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
905 * It's still atomic, but it sets the page dirty too, 905 * It's still atomic, but it sets the page dirty too,
906 * which makes it eligible for writeback at any time by the 906 * which makes it eligible for writeback at any time by the
907 * VM (which was also the case with __mark_buffer_dirty()) 907 * VM (which was also the case with __mark_buffer_dirty())
908 */ 908 */
909 mark_buffer_dirty(unbh); 909 mark_buffer_dirty(unbh);
910 } 910 }
911 } else { 911 } else {
912 /* append the indirect item with holes if needed; when appending 912 /* append the indirect item with holes if needed; when appending
913 the pointer for the 'block'-th block, use the block that is 913 the pointer for the 'block'-th block, use the block that is
914 already allocated */ 914 already allocated */
915 struct cpu_key tmp_key; 915 struct cpu_key tmp_key;
916 unp_t unf_single = 0; // We use this in case we need to allocate only 916 unp_t unf_single = 0; // We use this in case we need to allocate only
917 // one block which is a fastpath 917 // one block which is a fastpath
918 unp_t *un; 918 unp_t *un;
919 __u64 max_to_insert = 919 __u64 max_to_insert =
920 MAX_ITEM_LEN(inode->i_sb->s_blocksize) / 920 MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
921 UNFM_P_SIZE; 921 UNFM_P_SIZE;
922 __u64 blocks_needed; 922 __u64 blocks_needed;
923 923
924 RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, 924 RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
925 "vs-804: invalid position for append"); 925 "vs-804: invalid position for append");
926 /* indirect item has to be appended, set up key of that position */ 926 /* indirect item has to be appended, set up key of that position */
927 make_cpu_key(&tmp_key, inode, 927 make_cpu_key(&tmp_key, inode,
928 le_key_k_offset(version, 928 le_key_k_offset(version,
929 &(ih->ih_key)) + 929 &(ih->ih_key)) +
930 op_bytes_number(ih, 930 op_bytes_number(ih,
931 inode->i_sb->s_blocksize), 931 inode->i_sb->s_blocksize),
932 //pos_in_item * inode->i_sb->s_blocksize, 932 //pos_in_item * inode->i_sb->s_blocksize,
933 TYPE_INDIRECT, 3); // key type is unimportant 933 TYPE_INDIRECT, 3); // key type is unimportant
934 934
935 RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), 935 RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
936 "green-805: invalid offset"); 936 "green-805: invalid offset");
937 blocks_needed = 937 blocks_needed =
938 1 + 938 1 +
939 ((cpu_key_k_offset(&key) - 939 ((cpu_key_k_offset(&key) -
940 cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> 940 cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
941 s_blocksize_bits); 941 s_blocksize_bits);
942 942
943 if (blocks_needed == 1) { 943 if (blocks_needed == 1) {
944 un = &unf_single; 944 un = &unf_single;
945 } else { 945 } else {
946 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS); 946 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
947 if (!un) { 947 if (!un) {
948 un = &unf_single; 948 un = &unf_single;
949 blocks_needed = 1; 949 blocks_needed = 1;
950 max_to_insert = 0; 950 max_to_insert = 0;
951 } 951 }
952 } 952 }
953 if (blocks_needed <= max_to_insert) { 953 if (blocks_needed <= max_to_insert) {
954 /* we are going to add target block to the file. Use allocated 954 /* we are going to add target block to the file. Use allocated
955 block for that */ 955 block for that */
956 un[blocks_needed - 1] = 956 un[blocks_needed - 1] =
957 cpu_to_le32(allocated_block_nr); 957 cpu_to_le32(allocated_block_nr);
958 set_block_dev_mapped(bh_result, 958 set_block_dev_mapped(bh_result,
959 allocated_block_nr, inode); 959 allocated_block_nr, inode);
960 set_buffer_new(bh_result); 960 set_buffer_new(bh_result);
961 done = 1; 961 done = 1;
962 } else { 962 } else {
963 /* paste hole to the indirect item */ 963 /* paste hole to the indirect item */
964 /* If kzalloc failed, max_to_insert becomes zero and it means we 964 /* If kzalloc failed, max_to_insert becomes zero and it means we
965 only have space for one block */ 965 only have space for one block */
966 blocks_needed = 966 blocks_needed =
967 max_to_insert ? max_to_insert : 1; 967 max_to_insert ? max_to_insert : 1;
968 } 968 }
969 retval = 969 retval =
970 reiserfs_paste_into_item(th, &path, &tmp_key, inode, 970 reiserfs_paste_into_item(th, &path, &tmp_key, inode,
971 (char *)un, 971 (char *)un,
972 UNFM_P_SIZE * 972 UNFM_P_SIZE *
973 blocks_needed); 973 blocks_needed);
974 974
975 if (blocks_needed != 1) 975 if (blocks_needed != 1)
976 kfree(un); 976 kfree(un);
977 977
978 if (retval) { 978 if (retval) {
979 reiserfs_free_block(th, inode, 979 reiserfs_free_block(th, inode,
980 allocated_block_nr, 1); 980 allocated_block_nr, 1);
981 goto failure; 981 goto failure;
982 } 982 }
983 if (!done) { 983 if (!done) {
984 /* We need to record the new file size in case this function is 984 /* We need to record the new file size in case this function is
985 interrupted/aborted later on. We may do this only for 985 interrupted/aborted later on. We may do this only for
986 holes. */ 986 holes. */
987 inode->i_size += 987 inode->i_size +=
988 inode->i_sb->s_blocksize * blocks_needed; 988 inode->i_sb->s_blocksize * blocks_needed;
989 } 989 }
990 } 990 }
991 991
992 if (done == 1) 992 if (done == 1)
993 break; 993 break;
994 994
995 /* this loop could log more blocks than we had originally asked 995 /* this loop could log more blocks than we had originally asked
996 ** for. So, we have to allow the transaction to end if it is 996 ** for. So, we have to allow the transaction to end if it is
997 ** too big or too full. Update the inode so things are 997 ** too big or too full. Update the inode so things are
998 ** consistent if we crash before the function returns 998 ** consistent if we crash before the function returns
999 ** 999 **
1000 ** release the path so that anybody waiting on the path before 1000 ** release the path so that anybody waiting on the path before
1001 ** ending their transaction will be able to continue. 1001 ** ending their transaction will be able to continue.
1002 */ 1002 */
1003 if (journal_transaction_should_end(th, th->t_blocks_allocated)) { 1003 if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
1004 retval = restart_transaction(th, inode, &path); 1004 retval = restart_transaction(th, inode, &path);
1005 if (retval) 1005 if (retval)
1006 goto failure; 1006 goto failure;
1007 } 1007 }
1008 /* 1008 /*
1009 * inserting indirect pointers for a hole can take a 1009 * inserting indirect pointers for a hole can take a
1010 * long time. reschedule if needed and also release the write 1010 * long time. reschedule if needed and also release the write
1011 * lock for others. 1011 * lock for others.
1012 */ 1012 */
1013 if (need_resched()) { 1013 if (need_resched()) {
1014 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 1014 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1015 schedule(); 1015 schedule();
1016 lock_depth = reiserfs_write_lock_once(inode->i_sb); 1016 lock_depth = reiserfs_write_lock_once(inode->i_sb);
1017 } 1017 }
1018 1018
1019 retval = search_for_position_by_key(inode->i_sb, &key, &path); 1019 retval = search_for_position_by_key(inode->i_sb, &key, &path);
1020 if (retval == IO_ERROR) { 1020 if (retval == IO_ERROR) {
1021 retval = -EIO; 1021 retval = -EIO;
1022 goto failure; 1022 goto failure;
1023 } 1023 }
1024 if (retval == POSITION_FOUND) { 1024 if (retval == POSITION_FOUND) {
1025 reiserfs_warning(inode->i_sb, "vs-825", 1025 reiserfs_warning(inode->i_sb, "vs-825",
1026 "%K should not be found", &key); 1026 "%K should not be found", &key);
1027 retval = -EEXIST; 1027 retval = -EEXIST;
1028 if (allocated_block_nr) 1028 if (allocated_block_nr)
1029 reiserfs_free_block(th, inode, 1029 reiserfs_free_block(th, inode,
1030 allocated_block_nr, 1); 1030 allocated_block_nr, 1);
1031 pathrelse(&path); 1031 pathrelse(&path);
1032 goto failure; 1032 goto failure;
1033 } 1033 }
1034 bh = get_last_bh(&path); 1034 bh = get_last_bh(&path);
1035 ih = get_ih(&path); 1035 ih = get_ih(&path);
1036 item = get_item(&path); 1036 item = get_item(&path);
1037 pos_in_item = path.pos_in_item; 1037 pos_in_item = path.pos_in_item;
1038 } while (1); 1038 } while (1);
1039 1039
1040 retval = 0; 1040 retval = 0;
1041 1041
1042 failure: 1042 failure:
1043 if (th && (!dangle || (retval && !th->t_trans_id))) { 1043 if (th && (!dangle || (retval && !th->t_trans_id))) {
1044 int err; 1044 int err;
1045 if (th->t_trans_id) 1045 if (th->t_trans_id)
1046 reiserfs_update_sd(th, inode); 1046 reiserfs_update_sd(th, inode);
1047 err = reiserfs_end_persistent_transaction(th); 1047 err = reiserfs_end_persistent_transaction(th);
1048 if (err) 1048 if (err)
1049 retval = err; 1049 retval = err;
1050 } 1050 }
1051 1051
1052 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 1052 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1053 reiserfs_check_path(&path); 1053 reiserfs_check_path(&path);
1054 return retval; 1054 return retval;
1055 } 1055 }
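
Two flag combinations are worth keeping in mind when reading the dangle logic above (hypothetical caller lines; only the flag combinations themselves are taken from this file):

	/* illustration:
	 *   buffered write path - a transaction may be left running
	 *   ("dangling") for the journal to batch with the caller's own:
	 *	reiserfs_get_block(inode, block, bh, GET_BLOCK_CREATE);
	 *
	 *   direct I/O path (see reiserfs_get_blocks_direct_io above) -
	 *   never leave a transaction running when returning to dio code:
	 *	reiserfs_get_block(inode, block, bh,
	 *			   GET_BLOCK_CREATE | GET_BLOCK_NO_DANGLE);
	 */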
1056 1056
1057 static int 1057 static int
1058 reiserfs_readpages(struct file *file, struct address_space *mapping, 1058 reiserfs_readpages(struct file *file, struct address_space *mapping,
1059 struct list_head *pages, unsigned nr_pages) 1059 struct list_head *pages, unsigned nr_pages)
1060 { 1060 {
1061 return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); 1061 return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
1062 } 1062 }
1063 1063
1064 /* Compute the real number of bytes used by a file. 1064 /* Compute the real number of bytes used by a file.
1065 * The following three functions can go away when we have enough space in the stat item 1065 * The following three functions can go away when we have enough space in the stat item
1066 */ 1066 */
1067 static int real_space_diff(struct inode *inode, int sd_size) 1067 static int real_space_diff(struct inode *inode, int sd_size)
1068 { 1068 {
1069 int bytes; 1069 int bytes;
1070 loff_t blocksize = inode->i_sb->s_blocksize; 1070 loff_t blocksize = inode->i_sb->s_blocksize;
1071 1071
1072 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) 1072 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1073 return sd_size; 1073 return sd_size;
1074 1074
1075 /* The end of the file also sits in a full block with an indirect 1075 /* The end of the file also sits in a full block with an indirect
1076 ** reference, so round up to the next block. 1076 ** reference, so round up to the next block.
1077 ** 1077 **
1078 ** There is just no way to know whether the file's tail is actually 1078 ** There is just no way to know whether the file's tail is actually
1079 ** packed, so we have to assume it isn't. When we pack the 1079 ** packed, so we have to assume it isn't. When we pack the
1080 ** tail, we add 4 bytes to pretend there really is an unformatted 1080 ** tail, we add 4 bytes to pretend there really is an unformatted
1081 ** node pointer 1081 ** node pointer
1082 */ 1082 */
1083 bytes = 1083 bytes =
1084 ((inode->i_size + 1084 ((inode->i_size +
1085 (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + 1085 (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
1086 sd_size; 1086 sd_size;
1087 return bytes; 1087 return bytes;
1088 } 1088 }
1089 1089
1090 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, 1090 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1091 int sd_size) 1091 int sd_size)
1092 { 1092 {
1093 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { 1093 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1094 return inode->i_size + 1094 return inode->i_size +
1095 (loff_t) (real_space_diff(inode, sd_size)); 1095 (loff_t) (real_space_diff(inode, sd_size));
1096 } 1096 }
1097 return ((loff_t) real_space_diff(inode, sd_size)) + 1097 return ((loff_t) real_space_diff(inode, sd_size)) +
1098 (((loff_t) blocks) << 9); 1098 (((loff_t) blocks) << 9);
1099 } 1099 }
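
A worked example of this accounting (illustrative numbers; UNFM_P_SIZE is the 4-byte size of one unformatted-node pointer, sd_size the on-disk stat-data size):

	/* regular file, 4096-byte blocks, i_size = 10000, i_blocks = 24 sectors:
	 *
	 *   real_space_diff()    = ceil(10000 / 4096) * 4 + sd_size
	 *                        = 3 * 4 + sd_size = 12 + sd_size
	 *   to_real_used_space() = (12 + sd_size) + (24 << 9)
	 *                        = pointer/SD overhead + 12288 bytes of data
	 */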
1100 1100
1101 /* Compute number of blocks used by file in ReiserFS counting */ 1101 /* Compute number of blocks used by file in ReiserFS counting */
1102 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) 1102 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1103 { 1103 {
1104 loff_t bytes = inode_get_bytes(inode); 1104 loff_t bytes = inode_get_bytes(inode);
1105 loff_t real_space = real_space_diff(inode, sd_size); 1105 loff_t real_space = real_space_diff(inode, sd_size);
1106 1106
1107 /* keeps fsck and non-quota versions of reiserfs happy */ 1107 /* keeps fsck and non-quota versions of reiserfs happy */
1108 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { 1108 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1109 bytes += (loff_t) 511; 1109 bytes += (loff_t) 511;
1110 } 1110 }
1111 1111
1112 /* files from before the quota patch might have i_blocks such that 1112 /* files from before the quota patch might have i_blocks such that
1113 ** bytes < real_space. Deal with that here to prevent it from 1113 ** bytes < real_space. Deal with that here to prevent it from
1114 ** going negative. 1114 ** going negative.
1115 */ 1115 */
1116 if (bytes < real_space) 1116 if (bytes < real_space)
1117 return 0; 1117 return 0;
1118 return (bytes - real_space) >> 9; 1118 return (bytes - real_space) >> 9;
1119 } 1119 }
1120 1120
1121 // 1121 //
1122 // BAD: new directories have stat data of the new type while all other 1122 // BAD: new directories have stat data of the new type while all other
1123 // items are of the old type. The version stored in the inode describes 1123 // items are of the old type. The version stored in the inode describes
1124 // the body items, so in update_stat_data we cannot rely on the inode 1124 // the body items, so in update_stat_data we cannot rely on the inode
1125 // and have to check the item version directly 1125 // and have to check the item version directly
1126 // 1126 //
1127 1127
1128 // called by read_locked_inode 1128 // called by read_locked_inode
1129 static void init_inode(struct inode *inode, struct treepath *path) 1129 static void init_inode(struct inode *inode, struct treepath *path)
1130 { 1130 {
1131 struct buffer_head *bh; 1131 struct buffer_head *bh;
1132 struct item_head *ih; 1132 struct item_head *ih;
1133 __u32 rdev; 1133 __u32 rdev;
1134 //int version = ITEM_VERSION_1; 1134 //int version = ITEM_VERSION_1;
1135 1135
1136 bh = PATH_PLAST_BUFFER(path); 1136 bh = PATH_PLAST_BUFFER(path);
1137 ih = PATH_PITEM_HEAD(path); 1137 ih = PATH_PITEM_HEAD(path);
1138 1138
1139 copy_key(INODE_PKEY(inode), &(ih->ih_key)); 1139 copy_key(INODE_PKEY(inode), &(ih->ih_key));
1140 1140
1141 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); 1141 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1142 REISERFS_I(inode)->i_flags = 0; 1142 REISERFS_I(inode)->i_flags = 0;
1143 REISERFS_I(inode)->i_prealloc_block = 0; 1143 REISERFS_I(inode)->i_prealloc_block = 0;
1144 REISERFS_I(inode)->i_prealloc_count = 0; 1144 REISERFS_I(inode)->i_prealloc_count = 0;
1145 REISERFS_I(inode)->i_trans_id = 0; 1145 REISERFS_I(inode)->i_trans_id = 0;
1146 REISERFS_I(inode)->i_jl = NULL; 1146 REISERFS_I(inode)->i_jl = NULL;
1147 reiserfs_init_xattr_rwsem(inode); 1147 reiserfs_init_xattr_rwsem(inode);
1148 1148
1149 if (stat_data_v1(ih)) { 1149 if (stat_data_v1(ih)) {
1150 struct stat_data_v1 *sd = 1150 struct stat_data_v1 *sd =
1151 (struct stat_data_v1 *)B_I_PITEM(bh, ih); 1151 (struct stat_data_v1 *)B_I_PITEM(bh, ih);
1152 unsigned long blocks; 1152 unsigned long blocks;
1153 1153
1154 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1154 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1155 set_inode_sd_version(inode, STAT_DATA_V1); 1155 set_inode_sd_version(inode, STAT_DATA_V1);
1156 inode->i_mode = sd_v1_mode(sd); 1156 inode->i_mode = sd_v1_mode(sd);
1157 inode->i_nlink = sd_v1_nlink(sd); 1157 inode->i_nlink = sd_v1_nlink(sd);
1158 inode->i_uid = sd_v1_uid(sd); 1158 inode->i_uid = sd_v1_uid(sd);
1159 inode->i_gid = sd_v1_gid(sd); 1159 inode->i_gid = sd_v1_gid(sd);
1160 inode->i_size = sd_v1_size(sd); 1160 inode->i_size = sd_v1_size(sd);
1161 inode->i_atime.tv_sec = sd_v1_atime(sd); 1161 inode->i_atime.tv_sec = sd_v1_atime(sd);
1162 inode->i_mtime.tv_sec = sd_v1_mtime(sd); 1162 inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1163 inode->i_ctime.tv_sec = sd_v1_ctime(sd); 1163 inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1164 inode->i_atime.tv_nsec = 0; 1164 inode->i_atime.tv_nsec = 0;
1165 inode->i_ctime.tv_nsec = 0; 1165 inode->i_ctime.tv_nsec = 0;
1166 inode->i_mtime.tv_nsec = 0; 1166 inode->i_mtime.tv_nsec = 0;
1167 1167
1168 inode->i_blocks = sd_v1_blocks(sd); 1168 inode->i_blocks = sd_v1_blocks(sd);
1169 inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1169 inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1170 blocks = (inode->i_size + 511) >> 9; 1170 blocks = (inode->i_size + 511) >> 9;
1171 blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); 1171 blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
1172 if (inode->i_blocks > blocks) { 1172 if (inode->i_blocks > blocks) {
1173 // there was a bug in <=3.5.23 where i_blocks could take negative 1173 // there was a bug in <=3.5.23 where i_blocks could take negative
1174 // values. Starting from 3.5.17 this value could even be stored in the 1174 // values. Starting from 3.5.17 this value could even be stored in the
1175 // stat data. For such files we set i_blocks based on the file 1175 // stat data. For such files we set i_blocks based on the file
1176 // size. Two notes: this can be wrong for sparse files, and the on-disk 1176 // size. Two notes: this can be wrong for sparse files, and the on-disk
1177 // value is only updated if the file's inode ever changes 1177 // value is only updated if the file's inode ever changes
1178 inode->i_blocks = blocks; 1178 inode->i_blocks = blocks;
1179 } 1179 }
1180 1180
1181 rdev = sd_v1_rdev(sd); 1181 rdev = sd_v1_rdev(sd);
1182 REISERFS_I(inode)->i_first_direct_byte = 1182 REISERFS_I(inode)->i_first_direct_byte =
1183 sd_v1_first_direct_byte(sd); 1183 sd_v1_first_direct_byte(sd);
1184 /* an early bug in the quota code can give us an odd number for the 1184 /* an early bug in the quota code can give us an odd number for the
1185 ** block count. This is incorrect; fix it here. 1185 ** block count. This is incorrect; fix it here.
1186 */ 1186 */
1187 if (inode->i_blocks & 1) { 1187 if (inode->i_blocks & 1) {
1188 inode->i_blocks++; 1188 inode->i_blocks++;
1189 } 1189 }
1190 inode_set_bytes(inode, 1190 inode_set_bytes(inode,
1191 to_real_used_space(inode, inode->i_blocks, 1191 to_real_used_space(inode, inode->i_blocks,
1192 SD_V1_SIZE)); 1192 SD_V1_SIZE));
1193 /* nopack is initially zero for v1 objects. For v2 objects, 1193 /* nopack is initially zero for v1 objects. For v2 objects,
1194 nopack is initialised from sd_attrs */ 1194 nopack is initialised from sd_attrs */
1195 REISERFS_I(inode)->i_flags &= ~i_nopack_mask; 1195 REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1196 } else { 1196 } else {
1197 // new stat data found, but object may have old items 1197 // new stat data found, but object may have old items
1198 // (directories and symlinks) 1198 // (directories and symlinks)
1199 struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); 1199 struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
1200 1200
1201 inode->i_mode = sd_v2_mode(sd); 1201 inode->i_mode = sd_v2_mode(sd);
1202 inode->i_nlink = sd_v2_nlink(sd); 1202 inode->i_nlink = sd_v2_nlink(sd);
1203 inode->i_uid = sd_v2_uid(sd); 1203 inode->i_uid = sd_v2_uid(sd);
1204 inode->i_size = sd_v2_size(sd); 1204 inode->i_size = sd_v2_size(sd);
1205 inode->i_gid = sd_v2_gid(sd); 1205 inode->i_gid = sd_v2_gid(sd);
1206 inode->i_mtime.tv_sec = sd_v2_mtime(sd); 1206 inode->i_mtime.tv_sec = sd_v2_mtime(sd);
1207 inode->i_atime.tv_sec = sd_v2_atime(sd); 1207 inode->i_atime.tv_sec = sd_v2_atime(sd);
1208 inode->i_ctime.tv_sec = sd_v2_ctime(sd); 1208 inode->i_ctime.tv_sec = sd_v2_ctime(sd);
1209 inode->i_ctime.tv_nsec = 0; 1209 inode->i_ctime.tv_nsec = 0;
1210 inode->i_mtime.tv_nsec = 0; 1210 inode->i_mtime.tv_nsec = 0;
1211 inode->i_atime.tv_nsec = 0; 1211 inode->i_atime.tv_nsec = 0;
1212 inode->i_blocks = sd_v2_blocks(sd); 1212 inode->i_blocks = sd_v2_blocks(sd);
1213 rdev = sd_v2_rdev(sd); 1213 rdev = sd_v2_rdev(sd);
1214 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 1214 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1215 inode->i_generation = 1215 inode->i_generation =
1216 le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1216 le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1217 else 1217 else
1218 inode->i_generation = sd_v2_generation(sd); 1218 inode->i_generation = sd_v2_generation(sd);
1219 1219
1220 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 1220 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1221 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1221 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1222 else 1222 else
1223 set_inode_item_key_version(inode, KEY_FORMAT_3_6); 1223 set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1224 REISERFS_I(inode)->i_first_direct_byte = 0; 1224 REISERFS_I(inode)->i_first_direct_byte = 0;
1225 set_inode_sd_version(inode, STAT_DATA_V2); 1225 set_inode_sd_version(inode, STAT_DATA_V2);
1226 inode_set_bytes(inode, 1226 inode_set_bytes(inode,
1227 to_real_used_space(inode, inode->i_blocks, 1227 to_real_used_space(inode, inode->i_blocks,
1228 SD_V2_SIZE)); 1228 SD_V2_SIZE));
1229 /* read persistent inode attributes from sd and initialise 1229 /* read persistent inode attributes from sd and initialise
1230 generic inode flags from them */ 1230 generic inode flags from them */
1231 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); 1231 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
1232 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); 1232 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
1233 } 1233 }
1234 1234
1235 pathrelse(path); 1235 pathrelse(path);
1236 if (S_ISREG(inode->i_mode)) { 1236 if (S_ISREG(inode->i_mode)) {
1237 inode->i_op = &reiserfs_file_inode_operations; 1237 inode->i_op = &reiserfs_file_inode_operations;
1238 inode->i_fop = &reiserfs_file_operations; 1238 inode->i_fop = &reiserfs_file_operations;
1239 inode->i_mapping->a_ops = &reiserfs_address_space_operations; 1239 inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1240 } else if (S_ISDIR(inode->i_mode)) { 1240 } else if (S_ISDIR(inode->i_mode)) {
1241 inode->i_op = &reiserfs_dir_inode_operations; 1241 inode->i_op = &reiserfs_dir_inode_operations;
1242 inode->i_fop = &reiserfs_dir_operations; 1242 inode->i_fop = &reiserfs_dir_operations;
1243 } else if (S_ISLNK(inode->i_mode)) { 1243 } else if (S_ISLNK(inode->i_mode)) {
1244 inode->i_op = &reiserfs_symlink_inode_operations; 1244 inode->i_op = &reiserfs_symlink_inode_operations;
1245 inode->i_mapping->a_ops = &reiserfs_address_space_operations; 1245 inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1246 } else { 1246 } else {
1247 inode->i_blocks = 0; 1247 inode->i_blocks = 0;
1248 inode->i_op = &reiserfs_special_inode_operations; 1248 inode->i_op = &reiserfs_special_inode_operations;
1249 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); 1249 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1250 } 1250 }
1251 } 1251 }
1252 1252
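The sanity check in init_inode above caps i_blocks at the largest value a file of that size could legitimately occupy: the size in 512-byte sectors, rounded up to a whole filesystem block. A minimal standalone sketch of that arithmetic (max_sectors is a hypothetical helper, not a reiserfs function):

    #include <stdio.h>

    /* Largest plausible 512-byte sector count for a file of the given
     * size, rounded up to a whole fs block, mirroring the shift and
     * _ROUND_UP in init_inode above. */
    static unsigned long max_sectors(unsigned long long size,
                                     unsigned long blocksize)
    {
            unsigned long sectors = (size + 511) >> 9;  /* 512-byte units */
            unsigned long per_block = blocksize >> 9;   /* sectors/block  */

            return (sectors + per_block - 1) / per_block * per_block;
    }

    int main(void)
    {
            /* a 5000-byte file on a 4 KiB-block fs: 2 blocks = 16 sectors */
            printf("%lu\n", max_sectors(5000, 4096));
            return 0;
    }

Anything the buggy 3.5-era code stored above this bound is treated as garbage and replaced by the bound itself.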
1253 // update new stat data with inode fields 1253 // update new stat data with inode fields
1254 static void inode2sd(void *sd, struct inode *inode, loff_t size) 1254 static void inode2sd(void *sd, struct inode *inode, loff_t size)
1255 { 1255 {
1256 struct stat_data *sd_v2 = (struct stat_data *)sd; 1256 struct stat_data *sd_v2 = (struct stat_data *)sd;
1257 __u16 flags; 1257 __u16 flags;
1258 1258
1259 set_sd_v2_mode(sd_v2, inode->i_mode); 1259 set_sd_v2_mode(sd_v2, inode->i_mode);
1260 set_sd_v2_nlink(sd_v2, inode->i_nlink); 1260 set_sd_v2_nlink(sd_v2, inode->i_nlink);
1261 set_sd_v2_uid(sd_v2, inode->i_uid); 1261 set_sd_v2_uid(sd_v2, inode->i_uid);
1262 set_sd_v2_size(sd_v2, size); 1262 set_sd_v2_size(sd_v2, size);
1263 set_sd_v2_gid(sd_v2, inode->i_gid); 1263 set_sd_v2_gid(sd_v2, inode->i_gid);
1264 set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); 1264 set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
1265 set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); 1265 set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
1266 set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); 1266 set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
1267 set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); 1267 set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1268 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 1268 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1269 set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); 1269 set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1270 else 1270 else
1271 set_sd_v2_generation(sd_v2, inode->i_generation); 1271 set_sd_v2_generation(sd_v2, inode->i_generation);
1272 flags = REISERFS_I(inode)->i_attrs; 1272 flags = REISERFS_I(inode)->i_attrs;
1273 i_attrs_to_sd_attrs(inode, &flags); 1273 i_attrs_to_sd_attrs(inode, &flags);
1274 set_sd_v2_attrs(sd_v2, flags); 1274 set_sd_v2_attrs(sd_v2, flags);
1275 } 1275 }
1276 1276
1277 // used to copy inode's fields to old stat data 1277 // used to copy inode's fields to old stat data
1278 static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) 1278 static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
1279 { 1279 {
1280 struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; 1280 struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
1281 1281
1282 set_sd_v1_mode(sd_v1, inode->i_mode); 1282 set_sd_v1_mode(sd_v1, inode->i_mode);
1283 set_sd_v1_uid(sd_v1, inode->i_uid); 1283 set_sd_v1_uid(sd_v1, inode->i_uid);
1284 set_sd_v1_gid(sd_v1, inode->i_gid); 1284 set_sd_v1_gid(sd_v1, inode->i_gid);
1285 set_sd_v1_nlink(sd_v1, inode->i_nlink); 1285 set_sd_v1_nlink(sd_v1, inode->i_nlink);
1286 set_sd_v1_size(sd_v1, size); 1286 set_sd_v1_size(sd_v1, size);
1287 set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); 1287 set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
1288 set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); 1288 set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
1289 set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); 1289 set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
1290 1290
1291 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 1291 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1292 set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); 1292 set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1293 else 1293 else
1294 set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); 1294 set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1295 1295
1296 // Sigh. i_first_direct_byte is back 1296 // Sigh. i_first_direct_byte is back
1297 set_sd_v1_first_direct_byte(sd_v1, 1297 set_sd_v1_first_direct_byte(sd_v1,
1298 REISERFS_I(inode)->i_first_direct_byte); 1298 REISERFS_I(inode)->i_first_direct_byte);
1299 } 1299 }
1300 1300
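One detail worth noting in inode2sd_v1 above: the v1 stat data keeps rdev and the block count in the same on-disk slot (a union in the stat_data_v1 layout), which is why the function sets exactly one of the two. A toy illustration of that overload; the struct here is illustrative, not the real reiserfs on-disk layout:

    #include <stdint.h>
    #include <sys/stat.h>           /* S_ISCHR, S_ISBLK, S_IFBLK */

    /* Illustrative only: a device node carries no meaningful block
     * count of its own, so the two fields can share space. */
    struct sd_v1_tail {
            union {
                    uint32_t rdev;      /* character/block devices  */
                    uint32_t blocks;    /* all other object types   */
            } u;
    };

    static void fill_tail(struct sd_v1_tail *t, mode_t mode,
                          uint32_t rdev, uint32_t blocks)
    {
            if (S_ISCHR(mode) || S_ISBLK(mode))
                    t->u.rdev = rdev;       /* like set_sd_v1_rdev()   */
            else
                    t->u.blocks = blocks;   /* like set_sd_v1_blocks() */
    }

    int main(void)
    {
            struct sd_v1_tail t;

            fill_tail(&t, S_IFBLK, 0x0801, 0);  /* a hypothetical dev_t */
            return 0;
    }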
1301 /* NOTE: you must prepare the buffer head before passing it here, 1301 /* NOTE: you must prepare the buffer head before passing it here,
1302 ** and then log it after the call 1302 ** and then log it after the call
1303 */ 1303 */
1304 static void update_stat_data(struct treepath *path, struct inode *inode, 1304 static void update_stat_data(struct treepath *path, struct inode *inode,
1305 loff_t size) 1305 loff_t size)
1306 { 1306 {
1307 struct buffer_head *bh; 1307 struct buffer_head *bh;
1308 struct item_head *ih; 1308 struct item_head *ih;
1309 1309
1310 bh = PATH_PLAST_BUFFER(path); 1310 bh = PATH_PLAST_BUFFER(path);
1311 ih = PATH_PITEM_HEAD(path); 1311 ih = PATH_PITEM_HEAD(path);
1312 1312
1313 if (!is_statdata_le_ih(ih)) 1313 if (!is_statdata_le_ih(ih))
1314 reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", 1314 reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
1315 INODE_PKEY(inode), ih); 1315 INODE_PKEY(inode), ih);
1316 1316
1317 if (stat_data_v1(ih)) { 1317 if (stat_data_v1(ih)) {
1318 // path points to old stat data 1318 // path points to old stat data
1319 inode2sd_v1(B_I_PITEM(bh, ih), inode, size); 1319 inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
1320 } else { 1320 } else {
1321 inode2sd(B_I_PITEM(bh, ih), inode, size); 1321 inode2sd(B_I_PITEM(bh, ih), inode, size);
1322 } 1322 }
1323 1323
1324 return; 1324 return;
1325 } 1325 }
1326 1326
1327 void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, 1327 void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
1328 struct inode *inode, loff_t size) 1328 struct inode *inode, loff_t size)
1329 { 1329 {
1330 struct cpu_key key; 1330 struct cpu_key key;
1331 INITIALIZE_PATH(path); 1331 INITIALIZE_PATH(path);
1332 struct buffer_head *bh; 1332 struct buffer_head *bh;
1333 int fs_gen; 1333 int fs_gen;
1334 struct item_head *ih, tmp_ih; 1334 struct item_head *ih, tmp_ih;
1335 int retval; 1335 int retval;
1336 1336
1337 BUG_ON(!th->t_trans_id); 1337 BUG_ON(!th->t_trans_id);
1338 1338
1339 make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); // key type is unimportant 1339 make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); // key type is unimportant
1340 1340
1341 for (;;) { 1341 for (;;) {
1342 int pos; 1342 int pos;
1343 /* look for the object's stat data */ 1343 /* look for the object's stat data */
1344 retval = search_item(inode->i_sb, &key, &path); 1344 retval = search_item(inode->i_sb, &key, &path);
1345 if (retval == IO_ERROR) { 1345 if (retval == IO_ERROR) {
1346 reiserfs_error(inode->i_sb, "vs-13050", 1346 reiserfs_error(inode->i_sb, "vs-13050",
1347 "i/o failure occurred trying to " 1347 "i/o failure occurred trying to "
1348 "update %K stat data", &key); 1348 "update %K stat data", &key);
1349 return; 1349 return;
1350 } 1350 }
1351 if (retval == ITEM_NOT_FOUND) { 1351 if (retval == ITEM_NOT_FOUND) {
1352 pos = PATH_LAST_POSITION(&path); 1352 pos = PATH_LAST_POSITION(&path);
1353 pathrelse(&path); 1353 pathrelse(&path);
1354 if (inode->i_nlink == 0) { 1354 if (inode->i_nlink == 0) {
1355 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ 1355 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
1356 return; 1356 return;
1357 } 1357 }
1358 reiserfs_warning(inode->i_sb, "vs-13060", 1358 reiserfs_warning(inode->i_sb, "vs-13060",
1359 "stat data of object %k (nlink == %d) " 1359 "stat data of object %k (nlink == %d) "
1360 "not found (pos %d)", 1360 "not found (pos %d)",
1361 INODE_PKEY(inode), inode->i_nlink, 1361 INODE_PKEY(inode), inode->i_nlink,
1362 pos); 1362 pos);
1363 reiserfs_check_path(&path); 1363 reiserfs_check_path(&path);
1364 return; 1364 return;
1365 } 1365 }
1366 1366
1367 /* sigh, prepare_for_journal might schedule. When it schedules, the 1367 /* sigh, prepare_for_journal might schedule. When it schedules, the
1368 ** FS might change. We have to detect that and loop back to the 1368 ** FS might change. We have to detect that and loop back to the
1369 ** search if the stat data item has moved. 1369 ** search if the stat data item has moved.
1370 */ 1370 */
1371 bh = get_last_bh(&path); 1371 bh = get_last_bh(&path);
1372 ih = get_ih(&path); 1372 ih = get_ih(&path);
1373 copy_item_head(&tmp_ih, ih); 1373 copy_item_head(&tmp_ih, ih);
1374 fs_gen = get_generation(inode->i_sb); 1374 fs_gen = get_generation(inode->i_sb);
1375 reiserfs_prepare_for_journal(inode->i_sb, bh, 1); 1375 reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
1376 if (fs_changed(fs_gen, inode->i_sb) 1376 if (fs_changed(fs_gen, inode->i_sb)
1377 && item_moved(&tmp_ih, &path)) { 1377 && item_moved(&tmp_ih, &path)) {
1378 reiserfs_restore_prepared_buffer(inode->i_sb, bh); 1378 reiserfs_restore_prepared_buffer(inode->i_sb, bh);
1379 continue; /* Stat_data item has been moved after scheduling. */ 1379 continue; /* Stat_data item has been moved after scheduling. */
1380 } 1380 }
1381 break; 1381 break;
1382 } 1382 }
1383 update_stat_data(&path, inode, size); 1383 update_stat_data(&path, inode, size);
1384 journal_mark_dirty(th, th->t_super, bh); 1384 journal_mark_dirty(th, th->t_super, bh);
1385 pathrelse(&path); 1385 pathrelse(&path);
1386 return; 1386 return;
1387 } 1387 }
1388 1388
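reiserfs_update_sd_size above is a compact example of a generation-checked retry loop: sample a generation counter, perform the call that may sleep, and redo the search if the tree was reshaped underneath. A self-contained toy of the same idiom; the tree, lookup(), prepare(), and moved() here are stand-ins, not reiserfs APIs:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical in-memory "tree": gen is bumped on every reshape. */
    struct tree { unsigned gen; int item; };

    static int  lookup(struct tree *t)          { return t->item; }
    static bool moved(struct tree *t, int item) { return t->item != item; }

    /* Stand-in for a call that may sleep (like
     * reiserfs_prepare_for_journal); pretend it reshapes the tree once. */
    static void prepare(struct tree *t)
    {
            static bool reshaped;

            if (!reshaped) {
                    reshaped = true;
                    t->gen++;
                    t->item++;
            }
    }

    int main(void)
    {
            struct tree t = { .gen = 0, .item = 42 };
            int item;

            for (;;) {
                    unsigned gen;

                    item = lookup(&t);      /* search for the item        */
                    gen = t.gen;            /* sample the generation      */
                    prepare(&t);            /* blocking call; may reshape */
                    if (t.gen != gen && moved(&t, item))
                            continue;       /* it moved: redo the search  */
                    break;                  /* position is still valid    */
            }
            printf("updating item %d\n", item);
            return 0;
    }

In the real function the roles are played by get_generation(), reiserfs_prepare_for_journal(), fs_changed() and item_moved(), with reiserfs_restore_prepared_buffer() as the undo step before retrying.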
1389 /* reiserfs_read_locked_inode is called to read the inode off disk, and it 1389 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
1390 ** does a make_bad_inode when things go wrong. But we need to make 1390 ** does a make_bad_inode when things go wrong. But we need to make
1391 ** sure to clear the key in the private portion of the inode; otherwise a 1391 ** sure to clear the key in the private portion of the inode; otherwise a
1392 ** corresponding iput might try to delete whatever object the inode last 1392 ** corresponding iput might try to delete whatever object the inode last
1393 ** represented. 1393 ** represented.
1394 */ 1394 */
1395 static void reiserfs_make_bad_inode(struct inode *inode) 1395 static void reiserfs_make_bad_inode(struct inode *inode)
1396 { 1396 {
1397 memset(INODE_PKEY(inode), 0, KEY_SIZE); 1397 memset(INODE_PKEY(inode), 0, KEY_SIZE);
1398 make_bad_inode(inode); 1398 make_bad_inode(inode);
1399 } 1399 }
1400 1400
1401 // 1401 //
1402 // initially this function was derived from minix or ext2's analog and 1402 // initially this function was derived from minix or ext2's analog and
1403 // evolved as the prototype did 1403 // evolved as the prototype did
1404 // 1404 //
1405 1405
1406 int reiserfs_init_locked_inode(struct inode *inode, void *p) 1406 int reiserfs_init_locked_inode(struct inode *inode, void *p)
1407 { 1407 {
1408 struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; 1408 struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
1409 inode->i_ino = args->objectid; 1409 inode->i_ino = args->objectid;
1410 INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); 1410 INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1411 return 0; 1411 return 0;
1412 } 1412 }
1413 1413
1414 /* looks for the stat data item in the tree, and fills in the fields 1414 /* looks for the stat data item in the tree, and fills in the fields
1415 of the in-core inode from it */ 1415 of the in-core inode from it */
1416 void reiserfs_read_locked_inode(struct inode *inode, 1416 void reiserfs_read_locked_inode(struct inode *inode,
1417 struct reiserfs_iget_args *args) 1417 struct reiserfs_iget_args *args)
1418 { 1418 {
1419 INITIALIZE_PATH(path_to_sd); 1419 INITIALIZE_PATH(path_to_sd);
1420 struct cpu_key key; 1420 struct cpu_key key;
1421 unsigned long dirino; 1421 unsigned long dirino;
1422 int retval; 1422 int retval;
1423 1423
1424 dirino = args->dirid; 1424 dirino = args->dirid;
1425 1425
1426 /* set version 1; version 2 could be used too, because the stat data 1426 /* set version 1; version 2 could be used too, because the stat data
1427 key is the same in both versions */ 1427 key is the same in both versions */
1428 key.version = KEY_FORMAT_3_5; 1428 key.version = KEY_FORMAT_3_5;
1429 key.on_disk_key.k_dir_id = dirino; 1429 key.on_disk_key.k_dir_id = dirino;
1430 key.on_disk_key.k_objectid = inode->i_ino; 1430 key.on_disk_key.k_objectid = inode->i_ino;
1431 key.on_disk_key.k_offset = 0; 1431 key.on_disk_key.k_offset = 0;
1432 key.on_disk_key.k_type = 0; 1432 key.on_disk_key.k_type = 0;
1433 1433
1434 /* look for the object's stat data */ 1434 /* look for the object's stat data */
1435 retval = search_item(inode->i_sb, &key, &path_to_sd); 1435 retval = search_item(inode->i_sb, &key, &path_to_sd);
1436 if (retval == IO_ERROR) { 1436 if (retval == IO_ERROR) {
1437 reiserfs_error(inode->i_sb, "vs-13070", 1437 reiserfs_error(inode->i_sb, "vs-13070",
1438 "i/o failure occurred trying to find " 1438 "i/o failure occurred trying to find "
1439 "stat data of %K", &key); 1439 "stat data of %K", &key);
1440 reiserfs_make_bad_inode(inode); 1440 reiserfs_make_bad_inode(inode);
1441 return; 1441 return;
1442 } 1442 }
1443 if (retval != ITEM_FOUND) { 1443 if (retval != ITEM_FOUND) {
1444 /* a stale NFS handle can trigger this without it being an error */ 1444 /* a stale NFS handle can trigger this without it being an error */
1445 pathrelse(&path_to_sd); 1445 pathrelse(&path_to_sd);
1446 reiserfs_make_bad_inode(inode); 1446 reiserfs_make_bad_inode(inode);
1447 inode->i_nlink = 0; 1447 inode->i_nlink = 0;
1448 return; 1448 return;
1449 } 1449 }
1450 1450
1451 init_inode(inode, &path_to_sd); 1451 init_inode(inode, &path_to_sd);
1452 1452
1453 /* It is possible that knfsd is trying to access the inode of a file 1453 /* It is possible that knfsd is trying to access the inode of a file
1454 that is being removed from the disk by some other thread. As we 1454 that is being removed from the disk by some other thread. As we
1455 update the sd on unlink, all that is required is to check for nlink 1455 update the sd on unlink, all that is required is to check for nlink
1456 here. This bug was first found by Sizif when debugging 1456 here. This bug was first found by Sizif when debugging
1457 SquidNG/Butterfly, forgotten, and found again after Philippe 1457 SquidNG/Butterfly, forgotten, and found again after Philippe
1458 Gramoulle <philippe.gramoulle@mmania.com> reproduced it. 1458 Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1459 1459
1460 A more logical fix would require changes in fs/inode.c:iput() to 1460 A more logical fix would require changes in fs/inode.c:iput() to
1461 remove inode from hash-table _after_ fs cleaned disk stuff up and 1461 remove inode from hash-table _after_ fs cleaned disk stuff up and
1462 in iget() to return NULL if I_FREEING inode is found in 1462 in iget() to return NULL if I_FREEING inode is found in
1463 hash-table. */ 1463 hash-table. */
1464 /* Currently there is one place where it's ok to meet inode with 1464 /* Currently there is one place where it's ok to meet inode with
1465 nlink==0: processing of open-unlinked and half-truncated files 1465 nlink==0: processing of open-unlinked and half-truncated files
1466 during mount (fs/reiserfs/super.c:finish_unfinished()). */ 1466 during mount (fs/reiserfs/super.c:finish_unfinished()). */
1467 if ((inode->i_nlink == 0) && 1467 if ((inode->i_nlink == 0) &&
1468 !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { 1468 !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
1469 reiserfs_warning(inode->i_sb, "vs-13075", 1469 reiserfs_warning(inode->i_sb, "vs-13075",
1470 "dead inode read from disk %K. " 1470 "dead inode read from disk %K. "
1471 "This is likely to be race with knfsd. Ignore", 1471 "This is likely to be race with knfsd. Ignore",
1472 &key); 1472 &key);
1473 reiserfs_make_bad_inode(inode); 1473 reiserfs_make_bad_inode(inode);
1474 } 1474 }
1475 1475
1476 reiserfs_check_path(&path_to_sd); /* init_inode should be releasing */ 1476 reiserfs_check_path(&path_to_sd); /* init_inode should be releasing */
1477 1477
1478 } 1478 }
1479 1479
1480 /** 1480 /**
1481 * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). 1481 * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1482 * 1482 *
1483 * @inode: inode from hash table to check 1483 * @inode: inode from hash table to check
1484 * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. 1484 * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1485 * 1485 *
1486 * This function is called by iget5_locked() to distinguish reiserfs inodes 1486 * This function is called by iget5_locked() to distinguish reiserfs inodes
1487 * having the same inode numbers. Such inodes can only exist due to some 1487 * having the same inode numbers. Such inodes can only exist due to some
1488 * error condition. One of them should be bad. Inodes with identical 1488 * error condition. One of them should be bad. Inodes with identical
1489 * inode numbers (objectids) are distinguished by parent directory ids. 1489 * inode numbers (objectids) are distinguished by parent directory ids.
1490 * 1490 *
1491 */ 1491 */
1492 int reiserfs_find_actor(struct inode *inode, void *opaque) 1492 int reiserfs_find_actor(struct inode *inode, void *opaque)
1493 { 1493 {
1494 struct reiserfs_iget_args *args; 1494 struct reiserfs_iget_args *args;
1495 1495
1496 args = opaque; 1496 args = opaque;
1497 /* args is already in CPU order */ 1497 /* args is already in CPU order */
1498 return (inode->i_ino == args->objectid) && 1498 return (inode->i_ino == args->objectid) &&
1499 (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); 1499 (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1500 } 1500 }
1501 1501
1502 struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) 1502 struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1503 { 1503 {
1504 struct inode *inode; 1504 struct inode *inode;
1505 struct reiserfs_iget_args args; 1505 struct reiserfs_iget_args args;
1506 1506
1507 args.objectid = key->on_disk_key.k_objectid; 1507 args.objectid = key->on_disk_key.k_objectid;
1508 args.dirid = key->on_disk_key.k_dir_id; 1508 args.dirid = key->on_disk_key.k_dir_id;
1509 reiserfs_write_unlock(s); 1509 reiserfs_write_unlock(s);
1510 inode = iget5_locked(s, key->on_disk_key.k_objectid, 1510 inode = iget5_locked(s, key->on_disk_key.k_objectid,
1511 reiserfs_find_actor, reiserfs_init_locked_inode, 1511 reiserfs_find_actor, reiserfs_init_locked_inode,
1512 (void *)(&args)); 1512 (void *)(&args));
1513 reiserfs_write_lock(s); 1513 reiserfs_write_lock(s);
1514 if (!inode) 1514 if (!inode)
1515 return ERR_PTR(-ENOMEM); 1515 return ERR_PTR(-ENOMEM);
1516 1516
1517 if (inode->i_state & I_NEW) { 1517 if (inode->i_state & I_NEW) {
1518 reiserfs_read_locked_inode(inode, &args); 1518 reiserfs_read_locked_inode(inode, &args);
1519 unlock_new_inode(inode); 1519 unlock_new_inode(inode);
1520 } 1520 }
1521 1521
1522 if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { 1522 if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
1523 /* either due to i/o error or a stale NFS handle */ 1523 /* either due to i/o error or a stale NFS handle */
1524 iput(inode); 1524 iput(inode);
1525 inode = NULL; 1525 inode = NULL;
1526 } 1526 }
1527 return inode; 1527 return inode;
1528 } 1528 }
1529 1529
1530 static struct dentry *reiserfs_get_dentry(struct super_block *sb, 1530 static struct dentry *reiserfs_get_dentry(struct super_block *sb,
1531 u32 objectid, u32 dir_id, u32 generation) 1531 u32 objectid, u32 dir_id, u32 generation)
1532 1532
1533 { 1533 {
1534 struct cpu_key key; 1534 struct cpu_key key;
1535 struct inode *inode; 1535 struct inode *inode;
1536 1536
1537 key.on_disk_key.k_objectid = objectid; 1537 key.on_disk_key.k_objectid = objectid;
1538 key.on_disk_key.k_dir_id = dir_id; 1538 key.on_disk_key.k_dir_id = dir_id;
1539 reiserfs_write_lock(sb); 1539 reiserfs_write_lock(sb);
1540 inode = reiserfs_iget(sb, &key); 1540 inode = reiserfs_iget(sb, &key);
1541 if (inode && !IS_ERR(inode) && generation != 0 && 1541 if (inode && !IS_ERR(inode) && generation != 0 &&
1542 generation != inode->i_generation) { 1542 generation != inode->i_generation) {
1543 iput(inode); 1543 iput(inode);
1544 inode = NULL; 1544 inode = NULL;
1545 } 1545 }
1546 reiserfs_write_unlock(sb); 1546 reiserfs_write_unlock(sb);
1547 1547
1548 return d_obtain_alias(inode); 1548 return d_obtain_alias(inode);
1549 } 1549 }
1550 1550
1551 struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, 1551 struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1552 int fh_len, int fh_type) 1552 int fh_len, int fh_type)
1553 { 1553 {
1554 /* fhtype happens to reflect the number of u32s encoded. 1554 /* fhtype happens to reflect the number of u32s encoded.
1555 * due to a bug in earlier code, fhtype might indicate there 1555 * due to a bug in earlier code, fhtype might indicate there
1556 * are more u32s than were actually stored. 1556 * are more u32s than were actually stored.
1557 * So if fhtype seems to be more than len, reduce fhtype. 1557 * So if fhtype seems to be more than len, reduce fhtype.
1558 * Valid types are: 1558 * Valid types are:
1559 * 2 - objectid + dir_id - legacy support 1559 * 2 - objectid + dir_id - legacy support
1560 * 3 - objectid + dir_id + generation 1560 * 3 - objectid + dir_id + generation
1561 * 4 - objectid + dir_id + objectid and dirid of parent - legacy 1561 * 4 - objectid + dir_id + objectid and dirid of parent - legacy
1562 * 5 - objectid + dir_id + generation + objectid and dirid of parent 1562 * 5 - objectid + dir_id + generation + objectid and dirid of parent
1563 * 6 - as above plus generation of directory 1563 * 6 - as above plus generation of directory
1564 * 6 does not fit in NFSv2 handles 1564 * 6 does not fit in NFSv2 handles
1565 */ 1565 */
1566 if (fh_type > fh_len) { 1566 if (fh_type > fh_len) {
1567 if (fh_type != 6 || fh_len != 5) 1567 if (fh_type != 6 || fh_len != 5)
1568 reiserfs_warning(sb, "reiserfs-13077", 1568 reiserfs_warning(sb, "reiserfs-13077",
1569 "nfsd/reiserfs, fhtype=%d, len=%d - odd", 1569 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1570 fh_type, fh_len); 1570 fh_type, fh_len);
1571 fh_type = 5; 1571 fh_type = 5;
1572 } 1572 }
1573 1573
1574 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], 1574 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
1575 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); 1575 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
1576 } 1576 }
1577 1577
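The fh_type table in the comment above fixes where the generation and the parent key live in the raw handle words; reiserfs_fh_to_dentry and reiserfs_fh_to_parent each pick their slots from it. A compact sketch of that layout as a decoder (struct decoded and decode() are illustrative, not kernel structures):

    #include <stdint.h>
    #include <stdio.h>

    struct decoded {
            uint32_t objectid, dir_id, generation;        /* the object itself */
            uint32_t p_objectid, p_dir_id, p_generation;  /* parent, if any    */
    };

    /* Map the raw u32 words onto fields according to fh_type (2..6),
     * following the table documented above. */
    static void decode(const uint32_t *raw, int fh_type, struct decoded *d)
    {
            int base = (fh_type >= 5) ? 3 : 2;  /* where the parent starts */

            d->objectid   = raw[0];
            d->dir_id     = raw[1];
            d->generation = (fh_type == 3 || fh_type >= 5) ? raw[2] : 0;
            d->p_objectid = d->p_dir_id = d->p_generation = 0;
            if (fh_type >= 4) {                 /* parent is present */
                    d->p_objectid   = raw[base];
                    d->p_dir_id     = raw[base + 1];
                    d->p_generation = (fh_type == 6) ? raw[5] : 0;
            }
    }

    int main(void)
    {
            uint32_t raw[6] = { 10, 20, 30, 40, 50, 60 };
            struct decoded d;

            decode(raw, 6, &d);                 /* type 6: everything present */
            printf("obj %u gen %u parent %u pgen %u\n",
                   d.objectid, d.generation, d.p_objectid, d.p_generation);
            return 0;
    }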
1578 struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, 1578 struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1579 int fh_len, int fh_type) 1579 int fh_len, int fh_type)
1580 { 1580 {
1581 if (fh_type < 4) 1581 if (fh_type < 4)
1582 return NULL; 1582 return NULL;
1583 1583
1584 return reiserfs_get_dentry(sb, 1584 return reiserfs_get_dentry(sb,
1585 (fh_type >= 5) ? fid->raw[3] : fid->raw[2], 1585 (fh_type >= 5) ? fid->raw[3] : fid->raw[2],
1586 (fh_type >= 5) ? fid->raw[4] : fid->raw[3], 1586 (fh_type >= 5) ? fid->raw[4] : fid->raw[3],
1587 (fh_type == 6) ? fid->raw[5] : 0); 1587 (fh_type == 6) ? fid->raw[5] : 0);
1588 } 1588 }
1589 1589
1590 int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, 1590 int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1591 int need_parent) 1591 int need_parent)
1592 { 1592 {
1593 struct inode *inode = dentry->d_inode; 1593 struct inode *inode = dentry->d_inode;
1594 int maxlen = *lenp; 1594 int maxlen = *lenp;
1595 1595
1596 if (need_parent && (maxlen < 5)) { 1596 if (need_parent && (maxlen < 5)) {
1597 *lenp = 5; 1597 *lenp = 5;
1598 return 255; 1598 return 255;
1599 } else if (maxlen < 3) { 1599 } else if (maxlen < 3) {
1600 *lenp = 3; 1600 *lenp = 3;
1601 return 255; 1601 return 255;
1602 } 1602 }
1603 1603
1604 data[0] = inode->i_ino; 1604 data[0] = inode->i_ino;
1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1606 data[2] = inode->i_generation; 1606 data[2] = inode->i_generation;
1607 *lenp = 3; 1607 *lenp = 3;
1608 /* no room for directory info? return what we've stored so far */ 1608 /* no room for directory info? return what we've stored so far */
1609 if (maxlen < 5 || !need_parent) 1609 if (maxlen < 5 || !need_parent)
1610 return 3; 1610 return 3;
1611 1611
1612 spin_lock(&dentry->d_lock); 1612 spin_lock(&dentry->d_lock);
1613 inode = dentry->d_parent->d_inode; 1613 inode = dentry->d_parent->d_inode;
1614 data[3] = inode->i_ino; 1614 data[3] = inode->i_ino;
1615 data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1615 data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1616 *lenp = 5; 1616 *lenp = 5;
1617 if (maxlen >= 6) { 1617 if (maxlen >= 6) {
1618 data[5] = inode->i_generation; 1618 data[5] = inode->i_generation;
1619 *lenp = 6; 1619 *lenp = 6;
1620 } 1620 }
1621 spin_unlock(&dentry->d_lock); 1621 spin_unlock(&dentry->d_lock);
1622 return *lenp; 1622 return *lenp;
1623 } 1623 }
1624 1624
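On a kernel with open-by-handle support (name_to_handle_at(2), added in roughly the same era as this code), the output of reiserfs_encode_fh can be observed from userspace. A minimal sketch, assuming Linux and glibc with _GNU_SOURCE; on reiserfs the reported handle type should correspond to the fh_type values enumerated above:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
            struct file_handle *fh;
            int mount_id;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <path>\n", argv[0]);
                    return 1;
            }
            fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
            fh->handle_bytes = MAX_HANDLE_SZ;   /* our buffer capacity */
            if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) == -1) {
                    perror("name_to_handle_at");
                    return 1;
            }
            /* handle_type is fs-specific; the handle body holds the
             * __u32 words filled in by the fs's encode_fh */
            printf("type=%d bytes=%u\n", fh->handle_type, fh->handle_bytes);
            free(fh);
            return 0;
    }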
1625 /* looks for stat data, then copies fields to it, marks the buffer 1625 /* looks for stat data, then copies fields to it, marks the buffer
1626 containing stat data as dirty */ 1626 containing stat data as dirty */
1627 /* reiserfs inodes are never really dirty, since the dirty inode call 1627 /* reiserfs inodes are never really dirty, since the dirty inode call
1628 ** always logs them. This call allows the VFS inode marking routines 1628 ** always logs them. This call allows the VFS inode marking routines
1629 ** to properly mark inodes for datasync and such, but only actually 1629 ** to properly mark inodes for datasync and such, but only actually
1630 ** does something when called for a synchronous update. 1630 ** does something when called for a synchronous update.
1631 */ 1631 */
1632 int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1632 int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1633 { 1633 {
1634 struct reiserfs_transaction_handle th; 1634 struct reiserfs_transaction_handle th;
1635 int jbegin_count = 1; 1635 int jbegin_count = 1;
1636 1636
1637 if (inode->i_sb->s_flags & MS_RDONLY) 1637 if (inode->i_sb->s_flags & MS_RDONLY)
1638 return -EROFS; 1638 return -EROFS;
1639 /* memory pressure can sometimes initiate write_inode calls with sync == 1; 1639 /* memory pressure can sometimes initiate write_inode calls with sync == 1;
1640 ** these cases arise when the system needs RAM, not when the 1640 ** these cases arise when the system needs RAM, not when the
1641 ** inode needs to reach disk for safety, and they can safely be 1641 ** inode needs to reach disk for safety, and they can safely be
1642 ** ignored because the altered inode has already been logged. 1642 ** ignored because the altered inode has already been logged.
1643 */ 1643 */
1644 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { 1644 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
1645 reiserfs_write_lock(inode->i_sb); 1645 reiserfs_write_lock(inode->i_sb);
1646 if (!journal_begin(&th, inode->i_sb, jbegin_count)) { 1646 if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1647 reiserfs_update_sd(&th, inode); 1647 reiserfs_update_sd(&th, inode);
1648 journal_end_sync(&th, inode->i_sb, jbegin_count); 1648 journal_end_sync(&th, inode->i_sb, jbegin_count);
1649 } 1649 }
1650 reiserfs_write_unlock(inode->i_sb); 1650 reiserfs_write_unlock(inode->i_sb);
1651 } 1651 }
1652 return 0; 1652 return 0;
1653 } 1653 }
1654 1654
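The guard at the top of reiserfs_write_inode boils down to a small predicate: do journalled work only for genuinely synchronous writeback, and never from memory reclaim, because the logged inode is already safe. A toy sketch of that decision; the enum and the boolean are stand-ins for wbc->sync_mode and current->flags & PF_MEMALLOC:

    #include <stdbool.h>
    #include <stdio.h>

    enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

    /* Push the inode through the journal only for a real sync request
     * that did not originate from memory reclaim. */
    static bool must_sync_inode(enum sync_mode mode, bool in_reclaim)
    {
            return mode == WB_SYNC_ALL && !in_reclaim;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   must_sync_inode(WB_SYNC_ALL, false),   /* 1: do the sync  */
                   must_sync_inode(WB_SYNC_ALL, true),    /* 0: reclaim path */
                   must_sync_inode(WB_SYNC_NONE, false)); /* 0: async flush  */
            return 0;
    }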
1655 /* stat data of new object is inserted already, this inserts the item 1655 /* stat data of new object is inserted already, this inserts the item
1656 containing "." and ".." entries */ 1656 containing "." and ".." entries */
1657 static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, 1657 static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
1658 struct inode *inode, 1658 struct inode *inode,
1659 struct item_head *ih, struct treepath *path, 1659 struct item_head *ih, struct treepath *path,
1660 struct inode *dir) 1660 struct inode *dir)
1661 { 1661 {
1662 struct super_block *sb = th->t_super; 1662 struct super_block *sb = th->t_super;
1663 char empty_dir[EMPTY_DIR_SIZE]; 1663 char empty_dir[EMPTY_DIR_SIZE];
1664 char *body = empty_dir; 1664 char *body = empty_dir;
1665 struct cpu_key key; 1665 struct cpu_key key;
1666 int retval; 1666 int retval;
1667 1667
1668 BUG_ON(!th->t_trans_id); 1668 BUG_ON(!th->t_trans_id);
1669 1669
1670 _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), 1670 _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
1671 le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, 1671 le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
1672 TYPE_DIRENTRY, 3 /*key length */ ); 1672 TYPE_DIRENTRY, 3 /*key length */ );
1673 1673
1674 /* compose item head for new item. Directories consist of items of 1674 /* compose item head for new item. Directories consist of items of
1675 old type (ITEM_VERSION_1). Do not set the key (second arg is 0); 1675 old type (ITEM_VERSION_1). Do not set the key (second arg is 0);
1676 that is done by reiserfs_new_inode */ 1676 that is done by reiserfs_new_inode */
1677 if (old_format_only(sb)) { 1677 if (old_format_only(sb)) {
1678 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, 1678 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1679 TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); 1679 TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1680 1680
1681 make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, 1681 make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
1682 ih->ih_key.k_objectid, 1682 ih->ih_key.k_objectid,
1683 INODE_PKEY(dir)->k_dir_id, 1683 INODE_PKEY(dir)->k_dir_id,
1684 INODE_PKEY(dir)->k_objectid); 1684 INODE_PKEY(dir)->k_objectid);
1685 } else { 1685 } else {
1686 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, 1686 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1687 TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); 1687 TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1688 1688
1689 make_empty_dir_item(body, ih->ih_key.k_dir_id, 1689 make_empty_dir_item(body, ih->ih_key.k_dir_id,
1690 ih->ih_key.k_objectid, 1690 ih->ih_key.k_objectid,
1691 INODE_PKEY(dir)->k_dir_id, 1691 INODE_PKEY(dir)->k_dir_id,
1692 INODE_PKEY(dir)->k_objectid); 1692 INODE_PKEY(dir)->k_objectid);
1693 } 1693 }
1694 1694
1695 /* look for place in the tree for new item */ 1695 /* look for place in the tree for new item */
1696 retval = search_item(sb, &key, path); 1696 retval = search_item(sb, &key, path);
1697 if (retval == IO_ERROR) { 1697 if (retval == IO_ERROR) {
1698 reiserfs_error(sb, "vs-13080", 1698 reiserfs_error(sb, "vs-13080",
1699 "i/o failure occurred creating new directory"); 1699 "i/o failure occurred creating new directory");
1700 return -EIO; 1700 return -EIO;
1701 } 1701 }
1702 if (retval == ITEM_FOUND) { 1702 if (retval == ITEM_FOUND) {
1703 pathrelse(path); 1703 pathrelse(path);
1704 reiserfs_warning(sb, "vs-13070", 1704 reiserfs_warning(sb, "vs-13070",
1705 "object with this key exists (%k)", 1705 "object with this key exists (%k)",
1706 &(ih->ih_key)); 1706 &(ih->ih_key));
1707 return -EEXIST; 1707 return -EEXIST;
1708 } 1708 }
1709 1709
1710 /* insert the item, that is, the empty directory item */ 1710 /* insert the item, that is, the empty directory item */
1711 return reiserfs_insert_item(th, path, &key, ih, inode, body); 1711 return reiserfs_insert_item(th, path, &key, ih, inode, body);
1712 } 1712 }
1713 1713
1714 /* stat data of object has been inserted, this inserts the item 1714 /* stat data of object has been inserted, this inserts the item
1715 containing the body of symlink */ 1715 containing the body of symlink */
1716 static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ 1716 static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */
1717 struct item_head *ih, 1717 struct item_head *ih,
1718 struct treepath *path, const char *symname, 1718 struct treepath *path, const char *symname,
1719 int item_len) 1719 int item_len)
1720 { 1720 {
1721 struct super_block *sb = th->t_super; 1721 struct super_block *sb = th->t_super;
1722 struct cpu_key key; 1722 struct cpu_key key;
1723 int retval; 1723 int retval;
1724 1724
1725 BUG_ON(!th->t_trans_id); 1725 BUG_ON(!th->t_trans_id);
1726 1726
1727 _make_cpu_key(&key, KEY_FORMAT_3_5, 1727 _make_cpu_key(&key, KEY_FORMAT_3_5,
1728 le32_to_cpu(ih->ih_key.k_dir_id), 1728 le32_to_cpu(ih->ih_key.k_dir_id),
1729 le32_to_cpu(ih->ih_key.k_objectid), 1729 le32_to_cpu(ih->ih_key.k_objectid),
1730 1, TYPE_DIRECT, 3 /*key length */ ); 1730 1, TYPE_DIRECT, 3 /*key length */ );
1731 1731
1732 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 1732 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
1733 0 /*free_space */ ); 1733 0 /*free_space */ );
1734 1734
1735 /* look for place in the tree for new item */ 1735 /* look for place in the tree for new item */
1736 retval = search_item(sb, &key, path); 1736 retval = search_item(sb, &key, path);
1737 if (retval == IO_ERROR) { 1737 if (retval == IO_ERROR) {
1738 reiserfs_error(sb, "vs-13080", 1738 reiserfs_error(sb, "vs-13080",
1739 "i/o failure occurred creating new symlink"); 1739 "i/o failure occurred creating new symlink");
1740 return -EIO; 1740 return -EIO;
1741 } 1741 }
1742 if (retval == ITEM_FOUND) { 1742 if (retval == ITEM_FOUND) {
1743 pathrelse(path); 1743 pathrelse(path);
1744 reiserfs_warning(sb, "vs-13080", 1744 reiserfs_warning(sb, "vs-13080",
1745 "object with this key exists (%k)", 1745 "object with this key exists (%k)",
1746 &(ih->ih_key)); 1746 &(ih->ih_key));
1747 return -EEXIST; 1747 return -EEXIST;
1748 } 1748 }
1749 1749
1750 /* insert the item, that is, the body of the symlink */ 1750 /* insert the item, that is, the body of the symlink */
1751 return reiserfs_insert_item(th, path, &key, ih, inode, symname); 1751 return reiserfs_insert_item(th, path, &key, ih, inode, symname);
1752 } 1752 }
1753 1753
1754 /* inserts the stat data into the tree, and then calls 1754 /* inserts the stat data into the tree, and then calls
1755 reiserfs_new_directory (to insert ".", ".." item if new object is 1755 reiserfs_new_directory (to insert ".", ".." item if new object is
1756 directory) or reiserfs_new_symlink (to insert symlink body if new 1756 directory) or reiserfs_new_symlink (to insert symlink body if new
1757 object is symlink) or nothing (if new object is regular file) 1757 object is symlink) or nothing (if new object is regular file)
1758 1758
1759 NOTE! uid and gid must already be set in the inode. If we return 1759 NOTE! uid and gid must already be set in the inode. If we return
1760 non-zero due to an error, we have to drop the quota previously allocated 1760 non-zero due to an error, we have to drop the quota previously allocated
1761 for the fresh inode. This can only be done outside a transaction, so 1761 for the fresh inode. This can only be done outside a transaction, so
1762 if we return non-zero, we also end the transaction. */ 1762 if we return non-zero, we also end the transaction. */
1763 int reiserfs_new_inode(struct reiserfs_transaction_handle *th, 1763 int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1764 struct inode *dir, int mode, const char *symname, 1764 struct inode *dir, int mode, const char *symname,
1765 /* 0 for regular, EMPTY_DIR_SIZE for dirs, 1765 /* 0 for regular, EMPTY_DIR_SIZE for dirs,
1766 strlen(symname) for symlinks */ 1766 strlen(symname) for symlinks */
1767 loff_t i_size, struct dentry *dentry, 1767 loff_t i_size, struct dentry *dentry,
1768 struct inode *inode, 1768 struct inode *inode,
1769 struct reiserfs_security_handle *security) 1769 struct reiserfs_security_handle *security)
1770 { 1770 {
1771 struct super_block *sb; 1771 struct super_block *sb;
1772 struct reiserfs_iget_args args; 1772 struct reiserfs_iget_args args;
1773 INITIALIZE_PATH(path_to_key); 1773 INITIALIZE_PATH(path_to_key);
1774 struct cpu_key key; 1774 struct cpu_key key;
1775 struct item_head ih; 1775 struct item_head ih;
1776 struct stat_data sd; 1776 struct stat_data sd;
1777 int retval; 1777 int retval;
1778 int err; 1778 int err;
1779 1779
1780 BUG_ON(!th->t_trans_id); 1780 BUG_ON(!th->t_trans_id);
1781 1781
1782 dquot_initialize(inode); 1782 dquot_initialize(inode);
1783 err = dquot_alloc_inode(inode); 1783 err = dquot_alloc_inode(inode);
1784 if (err) 1784 if (err)
1785 goto out_end_trans; 1785 goto out_end_trans;
1786 if (!dir->i_nlink) { 1786 if (!dir->i_nlink) {
1787 err = -EPERM; 1787 err = -EPERM;
1788 goto out_bad_inode; 1788 goto out_bad_inode;
1789 } 1789 }
1790 1790
1791 sb = dir->i_sb; 1791 sb = dir->i_sb;
1792 1792
1793 /* item head of new item */ 1793 /* item head of new item */
1794 ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); 1794 ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1795 ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); 1795 ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
1796 if (!ih.ih_key.k_objectid) { 1796 if (!ih.ih_key.k_objectid) {
1797 err = -ENOMEM; 1797 err = -ENOMEM;
1798 goto out_bad_inode; 1798 goto out_bad_inode;
1799 } 1799 }
1800 args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); 1800 args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1801 if (old_format_only(sb)) 1801 if (old_format_only(sb))
1802 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, 1802 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1803 TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); 1803 TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1804 else 1804 else
1805 make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, 1805 make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1806 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); 1806 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1807 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); 1807 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1808 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); 1808 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1809 if (insert_inode_locked4(inode, args.objectid, 1809 if (insert_inode_locked4(inode, args.objectid,
1810 reiserfs_find_actor, &args) < 0) { 1810 reiserfs_find_actor, &args) < 0) {
1811 err = -EINVAL; 1811 err = -EINVAL;
1812 goto out_bad_inode; 1812 goto out_bad_inode;
1813 } 1813 }
1814 if (old_format_only(sb)) 1814 if (old_format_only(sb))
1815 /* not a perfect generation count, as object ids can be reused, but 1815 /* not a perfect generation count, as object ids can be reused, but
1816 ** this is as good as reiserfs can do right now. 1816 ** this is as good as reiserfs can do right now.
1817 ** note that the private part of the inode isn't filled in yet; we have 1817 ** note that the private part of the inode isn't filled in yet; we have
1818 ** to use the directory. 1818 ** to use the directory.
1819 */ 1819 */
1820 inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); 1820 inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
1821 else 1821 else
1822 #if defined( USE_INODE_GENERATION_COUNTER ) 1822 #if defined( USE_INODE_GENERATION_COUNTER )
1823 inode->i_generation = 1823 inode->i_generation =
1824 le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); 1824 le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1825 #else 1825 #else
1826 inode->i_generation = ++event; 1826 inode->i_generation = ++event;
1827 #endif 1827 #endif
1828 1828
1829 /* fill stat data */ 1829 /* fill stat data */
1830 inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); 1830 inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
1831 1831
1832 /* uid and gid must already be set by the caller for quota init */ 1832 /* uid and gid must already be set by the caller for quota init */
1833 1833
1834 /* symlink cannot be immutable or append only, right? */ 1834 /* symlink cannot be immutable or append only, right? */
1835 if (S_ISLNK(inode->i_mode)) 1835 if (S_ISLNK(inode->i_mode))
1836 inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); 1836 inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
1837 1837
1838 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 1838 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
1839 inode->i_size = i_size; 1839 inode->i_size = i_size;
1840 inode->i_blocks = 0; 1840 inode->i_blocks = 0;
1841 inode->i_bytes = 0; 1841 inode->i_bytes = 0;
1842 REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 : 1842 REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1843 U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; 1843 U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
1844 1844
1845 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); 1845 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1846 REISERFS_I(inode)->i_flags = 0; 1846 REISERFS_I(inode)->i_flags = 0;
1847 REISERFS_I(inode)->i_prealloc_block = 0; 1847 REISERFS_I(inode)->i_prealloc_block = 0;
1848 REISERFS_I(inode)->i_prealloc_count = 0; 1848 REISERFS_I(inode)->i_prealloc_count = 0;
1849 REISERFS_I(inode)->i_trans_id = 0; 1849 REISERFS_I(inode)->i_trans_id = 0;
1850 REISERFS_I(inode)->i_jl = NULL; 1850 REISERFS_I(inode)->i_jl = NULL;
1851 REISERFS_I(inode)->i_attrs = 1851 REISERFS_I(inode)->i_attrs =
1852 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; 1852 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1853 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); 1853 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1854 reiserfs_init_xattr_rwsem(inode); 1854 reiserfs_init_xattr_rwsem(inode);
1855 1855
1856 /* key to search for correct place for new stat data */ 1856 /* key to search for correct place for new stat data */
1857 _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), 1857 _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1858 le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, 1858 le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
1859 TYPE_STAT_DATA, 3 /*key length */ ); 1859 TYPE_STAT_DATA, 3 /*key length */ );
1860 1860
1861 /* find proper place for inserting of stat data */ 1861 /* find proper place for inserting of stat data */
1862 retval = search_item(sb, &key, &path_to_key); 1862 retval = search_item(sb, &key, &path_to_key);
1863 if (retval == IO_ERROR) { 1863 if (retval == IO_ERROR) {
1864 err = -EIO; 1864 err = -EIO;
1865 goto out_bad_inode; 1865 goto out_bad_inode;
1866 } 1866 }
1867 if (retval == ITEM_FOUND) { 1867 if (retval == ITEM_FOUND) {
1868 pathrelse(&path_to_key); 1868 pathrelse(&path_to_key);
1869 err = -EEXIST; 1869 err = -EEXIST;
1870 goto out_bad_inode; 1870 goto out_bad_inode;
1871 } 1871 }
1872 if (old_format_only(sb)) { 1872 if (old_format_only(sb)) {
1873 if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { 1873 if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1874 pathrelse(&path_to_key); 1874 pathrelse(&path_to_key);
1875 /* i_uid or i_gid is too big to be stored in stat data v3.5 */ 1875 /* i_uid or i_gid is too big to be stored in stat data v3.5 */
1876 err = -EINVAL; 1876 err = -EINVAL;
1877 goto out_bad_inode; 1877 goto out_bad_inode;
1878 } 1878 }
1879 inode2sd_v1(&sd, inode, inode->i_size); 1879 inode2sd_v1(&sd, inode, inode->i_size);
1880 } else { 1880 } else {
1881 inode2sd(&sd, inode, inode->i_size); 1881 inode2sd(&sd, inode, inode->i_size);
1882 } 1882 }
1883 // store in the in-core inode the stat data key and the version that 1883 // store in the in-core inode the stat data key and the version that
1884 // all object items will have (directory items keep the old offset 1884 // all object items will have (directory items keep the old offset
1885 // format; other new objects will consist of new items) 1885 // format; other new objects will consist of new items)
1886 if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) 1886 if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1887 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1887 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1888 else 1888 else
1889 set_inode_item_key_version(inode, KEY_FORMAT_3_6); 1889 set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1890 if (old_format_only(sb)) 1890 if (old_format_only(sb))
1891 set_inode_sd_version(inode, STAT_DATA_V1); 1891 set_inode_sd_version(inode, STAT_DATA_V1);
1892 else 1892 else
1893 set_inode_sd_version(inode, STAT_DATA_V2); 1893 set_inode_sd_version(inode, STAT_DATA_V2);
1894 1894
1895 /* insert the stat data into the tree */ 1895 /* insert the stat data into the tree */
1896 #ifdef DISPLACE_NEW_PACKING_LOCALITIES 1896 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1897 if (REISERFS_I(dir)->new_packing_locality) 1897 if (REISERFS_I(dir)->new_packing_locality)
1898 th->displace_new_blocks = 1; 1898 th->displace_new_blocks = 1;
1899 #endif 1899 #endif
1900 retval = 1900 retval =
1901 reiserfs_insert_item(th, &path_to_key, &key, &ih, inode, 1901 reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
1902 (char *)(&sd)); 1902 (char *)(&sd));
1903 if (retval) { 1903 if (retval) {
1904 err = retval; 1904 err = retval;
1905 reiserfs_check_path(&path_to_key); 1905 reiserfs_check_path(&path_to_key);
1906 goto out_bad_inode; 1906 goto out_bad_inode;
1907 } 1907 }
1908 #ifdef DISPLACE_NEW_PACKING_LOCALITIES 1908 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1909 if (!th->displace_new_blocks) 1909 if (!th->displace_new_blocks)
1910 REISERFS_I(dir)->new_packing_locality = 0; 1910 REISERFS_I(dir)->new_packing_locality = 0;
1911 #endif 1911 #endif
1912 if (S_ISDIR(mode)) { 1912 if (S_ISDIR(mode)) {
1913 /* insert item with "." and ".." */ 1913 /* insert item with "." and ".." */
1914 retval = 1914 retval =
1915 reiserfs_new_directory(th, inode, &ih, &path_to_key, dir); 1915 reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
1916 } 1916 }
1917 1917
1918 if (S_ISLNK(mode)) { 1918 if (S_ISLNK(mode)) {
1919 /* insert body of symlink */ 1919 /* insert body of symlink */
1920 if (!old_format_only(sb)) 1920 if (!old_format_only(sb))
1921 i_size = ROUND_UP(i_size); 1921 i_size = ROUND_UP(i_size);
1922 retval = 1922 retval =
1923 reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname, 1923 reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
1924 i_size); 1924 i_size);
1925 } 1925 }
1926 if (retval) { 1926 if (retval) {
1927 err = retval; 1927 err = retval;
1928 reiserfs_check_path(&path_to_key); 1928 reiserfs_check_path(&path_to_key);
1929 journal_end(th, th->t_super, th->t_blocks_allocated); 1929 journal_end(th, th->t_super, th->t_blocks_allocated);
1930 goto out_inserted_sd; 1930 goto out_inserted_sd;
1931 } 1931 }
1932 1932
1933 if (reiserfs_posixacl(inode->i_sb)) { 1933 if (reiserfs_posixacl(inode->i_sb)) {
1934 retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); 1934 retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
1935 if (retval) { 1935 if (retval) {
1936 err = retval; 1936 err = retval;
1937 reiserfs_check_path(&path_to_key); 1937 reiserfs_check_path(&path_to_key);
1938 journal_end(th, th->t_super, th->t_blocks_allocated); 1938 journal_end(th, th->t_super, th->t_blocks_allocated);
1939 goto out_inserted_sd; 1939 goto out_inserted_sd;
1940 } 1940 }
1941 } else if (inode->i_sb->s_flags & MS_POSIXACL) { 1941 } else if (inode->i_sb->s_flags & MS_POSIXACL) {
1942 reiserfs_warning(inode->i_sb, "jdm-13090", 1942 reiserfs_warning(inode->i_sb, "jdm-13090",
1943 "ACLs aren't enabled in the fs, " 1943 "ACLs aren't enabled in the fs, "
1944 "but vfs thinks they are!"); 1944 "but vfs thinks they are!");
1945 } else if (IS_PRIVATE(dir)) 1945 } else if (IS_PRIVATE(dir))
1946 inode->i_flags |= S_PRIVATE; 1946 inode->i_flags |= S_PRIVATE;
1947 1947
1948 if (security->name) { 1948 if (security->name) {
1949 retval = reiserfs_security_write(th, inode, security); 1949 retval = reiserfs_security_write(th, inode, security);
1950 if (retval) { 1950 if (retval) {
1951 err = retval; 1951 err = retval;
1952 reiserfs_check_path(&path_to_key); 1952 reiserfs_check_path(&path_to_key);
1953 retval = journal_end(th, th->t_super, 1953 retval = journal_end(th, th->t_super,
1954 th->t_blocks_allocated); 1954 th->t_blocks_allocated);
1955 if (retval) 1955 if (retval)
1956 err = retval; 1956 err = retval;
1957 goto out_inserted_sd; 1957 goto out_inserted_sd;
1958 } 1958 }
1959 } 1959 }
1960 1960
1961 reiserfs_update_sd(th, inode); 1961 reiserfs_update_sd(th, inode);
1962 reiserfs_check_path(&path_to_key); 1962 reiserfs_check_path(&path_to_key);
1963 1963
1964 return 0; 1964 return 0;
1965 1965
1966 /* it looks like you can easily compress these two goto targets into 1966 /* it looks like you can easily compress these two goto targets into
1967 * one. Keeping it like this doesn't actually hurt anything, and they 1967 * one. Keeping it like this doesn't actually hurt anything, and they
1968 * are placeholders for what the quota code actually needs. 1968 * are placeholders for what the quota code actually needs.
1969 */ 1969 */
1970 out_bad_inode: 1970 out_bad_inode:
1971 /* Invalidate the object, nothing was inserted yet */ 1971 /* Invalidate the object, nothing was inserted yet */
1972 INODE_PKEY(inode)->k_objectid = 0; 1972 INODE_PKEY(inode)->k_objectid = 0;
1973 1973
1974 /* Quota change must be inside a transaction for journaling */ 1974 /* Quota change must be inside a transaction for journaling */
1975 dquot_free_inode(inode); 1975 dquot_free_inode(inode);
1976 1976
1977 out_end_trans: 1977 out_end_trans:
1978 journal_end(th, th->t_super, th->t_blocks_allocated); 1978 journal_end(th, th->t_super, th->t_blocks_allocated);
1979 /* The drop can happen outside the transaction, and it needs more credits, so it is better kept outside */ 1979 /* The drop can happen outside the transaction, and it needs more credits, so it is better kept outside */
1980 dquot_drop(inode); 1980 dquot_drop(inode);
1981 inode->i_flags |= S_NOQUOTA; 1981 inode->i_flags |= S_NOQUOTA;
1982 make_bad_inode(inode); 1982 make_bad_inode(inode);
1983 1983
1984 out_inserted_sd: 1984 out_inserted_sd:
1985 inode->i_nlink = 0; 1985 inode->i_nlink = 0;
1986 th->t_trans_id = 0; /* so the caller can't use this handle later */ 1986 th->t_trans_id = 0; /* so the caller can't use this handle later */
1987 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ 1987 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
1988 iput(inode); 1988 iput(inode);
1989 return err; 1989 return err;
1990 } 1990 }
1991 1991
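The error unwinding in reiserfs_new_inode above is a textbook layered goto ladder: each later failure point jumps to a label that undoes strictly less, and the labels fall through into one another in reverse order of setup (free the quota inside the transaction, end the transaction and drop quota references, then iput the half-built inode). A toy, self-contained illustration of the shape; create() and its failure points are hypothetical, not reiserfs code:

    #include <stdio.h>

    static int create(int fail_at)
    {
            int err = 0;

            if (fail_at == 1) { err = -1; goto out_end_trans; }   /* quota alloc failed */
            if (fail_at == 2) { err = -1; goto out_bad_inode; }   /* tree insert failed */
            if (fail_at == 3) { err = -1; goto out_inserted_sd; } /* later item failed  */

            puts("created");
            return 0;

    out_bad_inode:
            puts("free quota (must stay inside the transaction)");
            /* fall through */
    out_end_trans:
            puts("end transaction, drop quota references outside it");
            /* fall through */
    out_inserted_sd:
            puts("clear nlink and iput() the half-built inode");
            return err;
    }

    int main(void)
    {
            create(2);      /* exercise the middle failure point */
            return 0;
    }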
1992 /* 1992 /*
1993 ** finds the tail page in the page cache, 1993 ** finds the tail page in the page cache,
1994 ** reads the last block in. 1994 ** reads the last block in.
1995 ** 1995 **
1996 ** On success, page_result is set to a locked, pinned page, and bh_result 1996 ** On success, page_result is set to a locked, pinned page, and bh_result
1997 ** is set to an up-to-date buffer for the last block in the file. Returns 0. 1997 ** is set to an up-to-date buffer for the last block in the file. Returns 0.
1998 ** 1998 **
1999 ** tail conversion is not done, so bh_result might not be valid for writing; 1999 ** tail conversion is not done, so bh_result might not be valid for writing;
2000 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before 2000 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
2001 ** trying to write the block. 2001 ** trying to write the block.
2002 ** 2002 **
2003 ** on failure, nonzero is returned, page_result and bh_result are untouched. 2003 ** on failure, nonzero is returned, page_result and bh_result are untouched.
2004 */ 2004 */
2005 static int grab_tail_page(struct inode *inode, 2005 static int grab_tail_page(struct inode *inode,
2006 struct page **page_result, 2006 struct page **page_result,
2007 struct buffer_head **bh_result) 2007 struct buffer_head **bh_result)
2008 { 2008 {
2009 2009
2010 /* we want the page with the last byte in the file, 2010 /* we want the page with the last byte in the file,
2011 ** not the page that will hold the next byte for appending 2011 ** not the page that will hold the next byte for appending
2012 */ 2012 */
2013 unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 2013 unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
2014 unsigned long pos = 0; 2014 unsigned long pos = 0;
2015 unsigned long start = 0; 2015 unsigned long start = 0;
2016 unsigned long blocksize = inode->i_sb->s_blocksize; 2016 unsigned long blocksize = inode->i_sb->s_blocksize;
2017 unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1); 2017 unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
2018 struct buffer_head *bh; 2018 struct buffer_head *bh;
2019 struct buffer_head *head; 2019 struct buffer_head *head;
2020 struct page *page; 2020 struct page *page;
2021 int error; 2021 int error;
2022 2022
2023 /* we know that we are only called with inode->i_size > 0. 2023 /* we know that we are only called with inode->i_size > 0.
2024 ** we also know that a file tail can never be as big as a block 2024 ** we also know that a file tail can never be as big as a block
2025 ** If i_size % blocksize == 0, our file is currently block aligned 2025 ** If i_size % blocksize == 0, our file is currently block aligned
2026 ** and it won't need converting or zeroing after a truncate. 2026 ** and it won't need converting or zeroing after a truncate.
2027 */ 2027 */
2028 if ((offset & (blocksize - 1)) == 0) { 2028 if ((offset & (blocksize - 1)) == 0) {
2029 return -ENOENT; 2029 return -ENOENT;
2030 } 2030 }
2031 page = grab_cache_page(inode->i_mapping, index); 2031 page = grab_cache_page(inode->i_mapping, index);
2032 error = -ENOMEM; 2032 error = -ENOMEM;
2033 if (!page) { 2033 if (!page) {
2034 goto out; 2034 goto out;
2035 } 2035 }
2036 /* start within the page of the last block in the file */ 2036 /* start within the page of the last block in the file */
2037 start = (offset / blocksize) * blocksize; 2037 start = (offset / blocksize) * blocksize;
2038 2038
2039 error = __block_write_begin(page, start, offset - start, 2039 error = __block_write_begin(page, start, offset - start,
2040 reiserfs_get_block_create_0); 2040 reiserfs_get_block_create_0);
2041 if (error) 2041 if (error)
2042 goto unlock; 2042 goto unlock;
2043 2043
2044 head = page_buffers(page); 2044 head = page_buffers(page);
2045 bh = head; 2045 bh = head;
2046 do { 2046 do {
2047 if (pos >= start) { 2047 if (pos >= start) {
2048 break; 2048 break;
2049 } 2049 }
2050 bh = bh->b_this_page; 2050 bh = bh->b_this_page;
2051 pos += blocksize; 2051 pos += blocksize;
2052 } while (bh != head); 2052 } while (bh != head);
2053 2053
2054 if (!buffer_uptodate(bh)) { 2054 if (!buffer_uptodate(bh)) {
2055 /* note, this should never happen, prepare_write should 2055 /* note, this should never happen, prepare_write should
2056 ** be taking care of this for us. If the buffer isn't up to date, 2056 ** be taking care of this for us. If the buffer isn't up to date,
2057 ** I've screwed up the code to find the buffer, or the code to 2057 ** I've screwed up the code to find the buffer, or the code to
2058 ** call prepare_write 2058 ** call prepare_write
2059 */ 2059 */
2060 reiserfs_error(inode->i_sb, "clm-6000", 2060 reiserfs_error(inode->i_sb, "clm-6000",
2061 "error reading block %lu", bh->b_blocknr); 2061 "error reading block %lu", bh->b_blocknr);
2062 error = -EIO; 2062 error = -EIO;
2063 goto unlock; 2063 goto unlock;
2064 } 2064 }
2065 *bh_result = bh; 2065 *bh_result = bh;
2066 *page_result = page; 2066 *page_result = page;
2067 2067
2068 out: 2068 out:
2069 return error; 2069 return error;
2070 2070
2071 unlock: 2071 unlock:
2072 unlock_page(page); 2072 unlock_page(page);
2073 page_cache_release(page); 2073 page_cache_release(page);
2074 return error; 2074 return error;
2075 } 2075 }
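
A sketch of how a caller is expected to consume grab_tail_page's results, per the header comment; this mirrors what reiserfs_truncate_file does below and is illustrative only, not part of the patch:

	struct page *page;
	struct buffer_head *bh;
	int err = grab_tail_page(inode, &page, &bh);

	if (!err) {
		/* write through bh only if it maps a real block; a direct
		 * item comes back unmapped or with b_blocknr == 0 */
		if (buffer_mapped(bh) && bh->b_blocknr != 0)
			mark_buffer_dirty(bh);
		unlock_page(page);		/* page was returned locked */
		page_cache_release(page);	/* and pinned */
	}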

/*
** vfs version of truncate file.  Must NOT be called with
** a transaction already started.
**
** some code taken from block_truncate_page
*/
int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
{
	struct reiserfs_transaction_handle th;
	/* we want the offset for the first byte after the end of the file */
	unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
	unsigned blocksize = inode->i_sb->s_blocksize;
	unsigned length;
	struct page *page = NULL;
	int error;
	struct buffer_head *bh = NULL;
	int err2;
	int lock_depth;

	lock_depth = reiserfs_write_lock_once(inode->i_sb);

	if (inode->i_size > 0) {
		error = grab_tail_page(inode, &page, &bh);
		if (error) {
			// -ENOENT means we truncated past the end of the
			// file, and get_block_create_0 could not find a
			// block to read in, which is ok.
			if (error != -ENOENT)
				reiserfs_error(inode->i_sb, "clm-6001",
					       "grab_tail_page failed %d",
					       error);
			page = NULL;
			bh = NULL;
		}
	}

	/* so, if page != NULL, we have a buffer head for the offset at
	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
	 ** then we have an unformatted node. Otherwise, we have a direct item,
	 ** and no zeroing is required on disk. We zero after the truncate,
	 ** because the truncate might pack the item anyway
	 ** (it will unmap bh if it packs).
	 */
	/* it is enough to reserve space in transaction for 2 balancings:
	   one for "save" link adding and another for the first
	   cut_from_item. 1 is for update_sd */
	error = journal_begin(&th, inode->i_sb,
			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
	if (error)
		goto out;
	reiserfs_update_inode_transaction(inode);
	if (update_timestamps)
		/* we are doing real truncate: if the system crashes
		   before the last transaction of truncating gets committed
		   - on reboot the file either appears truncated properly
		   or not truncated at all */
		add_save_link(&th, inode, 1);
	err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
	error =
	    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
	if (error)
		goto out;

	/* check reiserfs_do_truncate after ending the transaction */
	if (err2) {
		error = err2;
		goto out;
	}

	if (update_timestamps) {
		error = remove_save_link(inode, 1 /* truncate */);
		if (error)
			goto out;
	}

	if (page) {
		length = offset & (blocksize - 1);
		/* if we are not on a block boundary */
		if (length) {
			length = blocksize - length;
			zero_user(page, offset, length);
			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				mark_buffer_dirty(bh);
			}
		}
		unlock_page(page);
		page_cache_release(page);
	}

	reiserfs_write_unlock_once(inode->i_sb, lock_depth);

	return 0;
      out:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}

	reiserfs_write_unlock_once(inode->i_sb, lock_depth);

	return error;
}
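
To restate the calling convention: no transaction may be running on entry, and update_timestamps selects the crash-safe save-link path. A hypothetical caller (illustrative only; the exact call sites live elsewhere) would look like:

	/* real truncate, e.g. driven from the setattr/truncate path: the
	 * save link makes a crash mid-truncate recoverable on reboot */
	err = reiserfs_truncate_file(inode, 1);

	/* cleanup of blocks allocated by a failed write: no timestamp
	 * update, no save link (see reiserfs_truncate_failed_write below) */
	err = reiserfs_truncate_file(inode, 0);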

static int map_block_for_writepage(struct inode *inode,
				   struct buffer_head *bh_result,
				   unsigned long block)
{
	struct reiserfs_transaction_handle th;
	int fs_gen;
	struct item_head tmp_ih;
	struct item_head *ih;
	struct buffer_head *bh;
	__le32 *item;
	struct cpu_key key;
	INITIALIZE_PATH(path);
	int pos_in_item;
	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
	int retval;
	int use_get_block = 0;
	int bytes_copied = 0;
	int copy_size;
	int trans_running = 0;

	/* catch places below that try to log something without starting a
	 * trans */
	th.t_trans_id = 0;

	if (!buffer_uptodate(bh_result)) {
		return -EIO;
	}

	kmap(bh_result->b_page);
      start_over:
	reiserfs_write_lock(inode->i_sb);
	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);

      research:
	retval = search_for_position_by_key(inode->i_sb, &key, &path);
	if (retval != POSITION_FOUND) {
		use_get_block = 1;
		goto out;
	}

	bh = get_last_bh(&path);
	ih = get_ih(&path);
	item = get_item(&path);
	pos_in_item = path.pos_in_item;

	/* we've found an unformatted node */
	if (indirect_item_found(retval, ih)) {
		if (bytes_copied > 0) {
			reiserfs_warning(inode->i_sb, "clm-6002",
					 "bytes_copied %d", bytes_copied);
		}
		if (!get_block_num(item, pos_in_item)) {
			/* crap, we are writing to a hole */
			use_get_block = 1;
			goto out;
		}
		set_block_dev_mapped(bh_result,
				     get_block_num(item, pos_in_item), inode);
	} else if (is_direct_le_ih(ih)) {
		char *p;
		p = page_address(bh_result->b_page);
		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
		copy_size = ih_item_len(ih) - pos_in_item;

		fs_gen = get_generation(inode->i_sb);
		copy_item_head(&tmp_ih, ih);

		if (!trans_running) {
			/* vs-3050 is gone, no need to drop the path */
			retval = journal_begin(&th, inode->i_sb, jbegin_count);
			if (retval)
				goto out;
			reiserfs_update_inode_transaction(inode);
			trans_running = 1;
			if (fs_changed(fs_gen, inode->i_sb)
			    && item_moved(&tmp_ih, &path)) {
				reiserfs_restore_prepared_buffer(inode->i_sb,
								 bh);
				goto research;
			}
		}

		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);

		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
			goto research;
		}

		memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
		       copy_size);

		journal_mark_dirty(&th, inode->i_sb, bh);
		bytes_copied += copy_size;
		set_block_dev_mapped(bh_result, 0, inode);

		/* are there still bytes left? */
		if (bytes_copied < bh_result->b_size &&
		    (byte_offset + bytes_copied) < inode->i_size) {
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     copy_size);
			goto research;
		}
	} else {
		reiserfs_warning(inode->i_sb, "clm-6003",
				 "bad item inode %lu", inode->i_ino);
		retval = -EIO;
		goto out;
	}
	retval = 0;

      out:
	pathrelse(&path);
	if (trans_running) {
		int err = journal_end(&th, inode->i_sb, jbegin_count);
		if (err)
			retval = err;
		trans_running = 0;
	}
	reiserfs_write_unlock(inode->i_sb);

	/* this is where we fill in holes in the file. */
	if (use_get_block) {
		retval = reiserfs_get_block(inode, block, bh_result,
					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
					    | GET_BLOCK_NO_DANGLE);
		if (!retval) {
			if (!buffer_mapped(bh_result)
			    || bh_result->b_blocknr == 0) {
				/* get_block failed to find a mapped
				   unformatted node. */
				use_get_block = 0;
				goto start_over;
			}
		}
	}
	kunmap(bh_result->b_page);

	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
		/* we've copied data from the page into the direct item, so the
		 * buffer in the page is now clean, mark it to reflect that.
		 */
		lock_buffer(bh_result);
		clear_buffer_dirty(bh_result);
		unlock_buffer(bh_result);
	}
	return retval;
}
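
The fs_changed()/item_moved() dance above is the file's standard guard around tree searches that may block: snapshot the generation and the item head, and repeat the search if the tree moved underneath. A sketch of the idiom in isolation, using the same calls as above (illustrative):

	fs_gen = get_generation(inode->i_sb);	/* snapshot before blocking */
	copy_item_head(&tmp_ih, ih);

	/* ... anything that can schedule, e.g. journal_begin() ... */

	if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path))
		goto research;	/* tree changed under us: search again */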

/*
 * mason@suse.com: updated in 2.5.54 to follow the same general io
 * start/recovery path as __block_write_full_page, along with special
 * code to handle reiserfs tails.
 */
static int reiserfs_write_full_page(struct page *page,
				    struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int error = 0;
	unsigned long block;
	sector_t last_block;
	struct buffer_head *head, *bh;
	int partial = 0;
	int nr = 0;
	int checked = PageChecked(page);
	struct reiserfs_transaction_handle th;
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	th.t_trans_id = 0;

	/* no logging allowed when nonblocking or from PF_MEMALLOC */
	if (checked && (current->flags & PF_MEMALLOC)) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}

	/* The page dirty bit is cleared before writepage is called, which
	 * means we have to tell create_empty_buffers to make dirty buffers.
	 * The page really should be up to date at this point, so tossing
	 * in the BH_Uptodate is just a sanity check.
	 */
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, s->s_blocksize,
				     (1 << BH_Dirty) | (1 << BH_Uptodate));
	}
	head = page_buffers(page);

	/* last page in the file, zero out any contents past the
	 ** last byte in the file
	 */
	if (page->index >= end_index) {
		unsigned last_offset;

		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
		/* no file contents in this page */
		if (page->index >= end_index + 1 || !last_offset) {
			unlock_page(page);
			return 0;
		}
		zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
	}
	bh = head;
	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
	/* first map all the buffers, logging any direct items we find */
	do {
		if (block > last_block) {
			/*
			 * This can happen when the block size is less than
			 * the page size.  The corresponding bytes in the page
			 * were zero filled above
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((checked || buffer_dirty(bh)) &&
			   (!buffer_mapped(bh) || (buffer_mapped(bh)
						   && bh->b_blocknr == 0))) {
			/* not mapped yet, or it points to a direct item,
			 * search the btree for the mapping info, and log
			 * any direct items found
			 */
			if ((error = map_block_for_writepage(inode, bh, block))) {
				goto fail;
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * we start the transaction after map_block_for_writepage,
	 * because it can create holes in the file (an unbounded operation).
	 * starting it here, we can make a reliable estimate for how many
	 * blocks we're going to log
	 */
	if (checked) {
		ClearPageChecked(page);
		reiserfs_write_lock(s);
		error = journal_begin(&th, s, bh_per_page + 1);
		if (error) {
			reiserfs_write_unlock(s);
			goto fail;
		}
		reiserfs_update_inode_transaction(inode);
	}
	/* now go through and lock any dirty buffers on the page */
	do {
		get_bh(bh);
		if (!buffer_mapped(bh))
			continue;
		if (buffer_mapped(bh) && bh->b_blocknr == 0)
			continue;

		if (checked) {
			reiserfs_prepare_for_journal(s, bh, 1);
			journal_mark_dirty(&th, s, bh);
			continue;
		}
		/* from this point on, we know the buffer is mapped to a
		 * real block and not a direct item
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else {
			if (!trylock_buffer(bh)) {
				redirty_page_for_writepage(wbc, page);
				continue;
			}
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write(bh);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	if (checked) {
		error = journal_end(&th, s, bh_per_page + 1);
		reiserfs_write_unlock(s);
		if (error)
			goto fail;
	}
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);

	/*
	 * since any buffer might be the only dirty buffer on the page,
	 * the first submit_bh can bring the page out of writeback.
	 * be careful with the buffers.
	 */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(WRITE, bh);
			nr++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);

	error = 0;
      done:
	if (nr == 0) {
		/*
		 * if this page only had a direct item, it is very possible
		 * for no io to be required without there being an error.
		 * Or, someone else could have locked them and sent them down
		 * the pipe without locking the page
		 */
		bh = head;
		do {
			if (!buffer_uptodate(bh)) {
				partial = 1;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (!partial)
			SetPageUptodate(page);
		end_page_writeback(page);
	}
	return error;

      fail:
	/* catches various errors, we need to make sure any valid dirty blocks
	 * get to the media.  The page is currently locked and not marked for
	 * writeback
	 */
	ClearPageUptodate(page);
	bh = head;
	do {
		get_bh(bh);
		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
			lock_buffer(bh);
			mark_buffer_async_write(bh);
		} else {
			/*
			 * clear any dirty bits that might have come from
			 * getting attached to a dirty page
			 */
			clear_buffer_dirty(bh);
		}
		bh = bh->b_this_page;
	} while (bh != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(WRITE, bh);
			nr++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);
	goto done;
}

static int reiserfs_readpage(struct file *f, struct page *page)
{
	return block_read_full_page(page, reiserfs_get_block);
}

static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	reiserfs_wait_on_write_block(inode->i_sb);
	return reiserfs_write_full_page(page, wbc);
}

static void reiserfs_truncate_failed_write(struct inode *inode)
{
	truncate_inode_pages(inode->i_mapping, inode->i_size);
	reiserfs_truncate_file(inode, 0);
}

static int reiserfs_write_begin(struct file *file,
				struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	struct inode *inode;
	struct page *page;
	pgoff_t index;
	int ret;
	int old_ref = 0;

	inode = mapping->host;
	*fsdata = 0;
	if (flags & AOP_FLAG_CONT_EXPAND &&
	    (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
		pos++;
		*fsdata = (void *)(unsigned long)flags;
	}

	index = pos >> PAGE_CACHE_SHIFT;
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;

	reiserfs_wait_on_write_block(inode->i_sb);
	fix_tail_page_for_writing(page);
	if (reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th;
		th = (struct reiserfs_transaction_handle *)current->
		    journal_info;
		BUG_ON(!th->t_refcount);
		BUG_ON(!th->t_trans_id);
		old_ref = th->t_refcount;
		th->t_refcount++;
	}
	ret = __block_write_begin(page, pos, len, reiserfs_get_block);
	if (ret && reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th = current->journal_info;
		/* this gets a little ugly.  If reiserfs_get_block returned an
		 * error and left a transaction running, we've got to close
		 * it, and we've got to free the handle if it was a persistent
		 * transaction.
		 *
		 * But, if we had nested into an existing transaction, we need
		 * to just drop the ref count on the handle.
		 *
		 * If old_ref == 0, the transaction is from reiserfs_get_block,
		 * and it was a persistent trans.  Otherwise, it was nested
		 * above.
		 */
		if (th->t_refcount > old_ref) {
			if (old_ref)
				th->t_refcount--;
			else {
				int err;
				reiserfs_write_lock(inode->i_sb);
				err = reiserfs_end_persistent_transaction(th);
				reiserfs_write_unlock(inode->i_sb);
				if (err)
					ret = err;
			}
		}
	}
	if (ret) {
		unlock_page(page);
		page_cache_release(page);
		/* Truncate allocated blocks */
		reiserfs_truncate_failed_write(inode);
	}
	return ret;
}
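
The handle-cleanup rule in the error path above is repeated verbatim in __reiserfs_write_begin() below; reduced to its core, the rule is (illustrative restatement):

	if (th->t_refcount > old_ref) {		/* get_block left a ref behind */
		if (old_ref)
			th->t_refcount--;	/* nested: just drop our ref */
		else
			reiserfs_end_persistent_transaction(th); /* ours: close it */
	}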

int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
{
	struct inode *inode = page->mapping->host;
	int ret;
	int old_ref = 0;

	reiserfs_write_unlock(inode->i_sb);
	reiserfs_wait_on_write_block(inode->i_sb);
	reiserfs_write_lock(inode->i_sb);

	fix_tail_page_for_writing(page);
	if (reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th;
		th = (struct reiserfs_transaction_handle *)current->
		    journal_info;
		BUG_ON(!th->t_refcount);
		BUG_ON(!th->t_trans_id);
		old_ref = th->t_refcount;
		th->t_refcount++;
	}

	ret = __block_write_begin(page, from, len, reiserfs_get_block);
	if (ret && reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th = current->journal_info;
		/* this gets a little ugly.  If reiserfs_get_block returned an
		 * error and left a transaction running, we've got to close
		 * it, and we've got to free the handle if it was a persistent
		 * transaction.
		 *
		 * But, if we had nested into an existing transaction, we need
		 * to just drop the ref count on the handle.
		 *
		 * If old_ref == 0, the transaction is from reiserfs_get_block,
		 * and it was a persistent trans.  Otherwise, it was nested
		 * above.
		 */
		if (th->t_refcount > old_ref) {
			if (old_ref)
				th->t_refcount--;
			else {
				int err;
				reiserfs_write_lock(inode->i_sb);
				err = reiserfs_end_persistent_transaction(th);
				reiserfs_write_unlock(inode->i_sb);
				if (err)
					ret = err;
			}
		}
	}
	return ret;
}

static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
{
	return generic_block_bmap(as, block, reiserfs_bmap);
}

static int reiserfs_write_end(struct file *file, struct address_space *mapping,
			      loff_t pos, unsigned len, unsigned copied,
			      struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;
	int ret = 0;
	int update_sd = 0;
	struct reiserfs_transaction_handle *th;
	unsigned start;
	int lock_depth = 0;
	bool locked = false;

	if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
		pos++;

	reiserfs_wait_on_write_block(inode->i_sb);
	if (reiserfs_transaction_running(inode->i_sb))
		th = current->journal_info;
	else
		th = NULL;

	start = pos & (PAGE_CACHE_SIZE - 1);
	if (unlikely(copied < len)) {
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, start + copied, start + len);
	}
	flush_dcache_page(page);

	reiserfs_commit_page(inode, page, start, start + copied);

	/* generic_commit_write does this for us, but does not update the
	 ** transaction tracking stuff when the size changes.  So, we have
	 ** to do the i_size updates here.
	 */
	if (pos + copied > inode->i_size) {
		struct reiserfs_transaction_handle myth;
		lock_depth = reiserfs_write_lock_once(inode->i_sb);
		locked = true;
		/* If the file has grown beyond the border where it
		   can have a tail, unmark it as needing a tail
		   packing */
		if ((have_large_tails(inode->i_sb)
		     && inode->i_size > i_block_size(inode) * 4)
		    || (have_small_tails(inode->i_sb)
			&& inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;

		ret = journal_begin(&myth, inode->i_sb, 1);
		if (ret)
			goto journal_error;

		reiserfs_update_inode_transaction(inode);
		inode->i_size = pos + copied;
		/*
		 * this will just nest into our transaction.  It's important
		 * to use mark_inode_dirty so the inode gets pushed around on
		 * the dirty lists, and so that O_SYNC works as expected
		 */
		mark_inode_dirty(inode);
		reiserfs_update_sd(&myth, inode);
		update_sd = 1;
		ret = journal_end(&myth, inode->i_sb, 1);
		if (ret)
			goto journal_error;
	}
	if (th) {
		if (!locked) {
			lock_depth = reiserfs_write_lock_once(inode->i_sb);
			locked = true;
		}
		if (!update_sd)
			mark_inode_dirty(inode);
		ret = reiserfs_end_persistent_transaction(th);
		if (ret)
			goto out;
	}

      out:
	if (locked)
		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
	unlock_page(page);
	page_cache_release(page);

	if (pos + len > inode->i_size)
		reiserfs_truncate_failed_write(inode);

	return ret == 0 ? copied : ret;

      journal_error:
	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
	locked = false;
	if (th) {
		if (!update_sd)
			reiserfs_update_sd(th, inode);
		ret = reiserfs_end_persistent_transaction(th);
	}
	goto out;
}
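
For orientation, these entry points are the ones plugged into the file's address_space_operations; a sketch of the wiring (the actual struct is defined elsewhere in this file, so treat the exact field list as an assumption):

	static const struct address_space_operations example_aops = {
		.readpage	= reiserfs_readpage,
		.writepage	= reiserfs_writepage,
		.write_begin	= reiserfs_write_begin,
		.write_end	= reiserfs_write_end,
		.bmap		= reiserfs_aop_bmap,
		.invalidatepage	= reiserfs_invalidatepage,
		.set_page_dirty	= reiserfs_set_page_dirty,
	};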

int reiserfs_commit_write(struct file *f, struct page *page,
			  unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
	int ret = 0;
	int update_sd = 0;
	struct reiserfs_transaction_handle *th = NULL;

	reiserfs_write_unlock(inode->i_sb);
	reiserfs_wait_on_write_block(inode->i_sb);
	reiserfs_write_lock(inode->i_sb);

	if (reiserfs_transaction_running(inode->i_sb)) {
		th = current->journal_info;
	}
	reiserfs_commit_page(inode, page, from, to);

	/* generic_commit_write does this for us, but does not update the
	 ** transaction tracking stuff when the size changes.  So, we have
	 ** to do the i_size updates here.
	 */
	if (pos > inode->i_size) {
		struct reiserfs_transaction_handle myth;
		/* If the file has grown beyond the border where it
		   can have a tail, unmark it as needing a tail
		   packing */
		if ((have_large_tails(inode->i_sb)
		     && inode->i_size > i_block_size(inode) * 4)
		    || (have_small_tails(inode->i_sb)
			&& inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;

		ret = journal_begin(&myth, inode->i_sb, 1);
		if (ret)
			goto journal_error;

		reiserfs_update_inode_transaction(inode);
		inode->i_size = pos;
		/*
		 * this will just nest into our transaction.  It's important
		 * to use mark_inode_dirty so the inode gets pushed around on
		 * the dirty lists, and so that O_SYNC works as expected
		 */
		mark_inode_dirty(inode);
		reiserfs_update_sd(&myth, inode);
		update_sd = 1;
		ret = journal_end(&myth, inode->i_sb, 1);
		if (ret)
			goto journal_error;
	}
	if (th) {
		if (!update_sd)
			mark_inode_dirty(inode);
		ret = reiserfs_end_persistent_transaction(th);
		if (ret)
			goto out;
	}

      out:
	return ret;

      journal_error:
	if (th) {
		if (!update_sd)
			reiserfs_update_sd(th, inode);
		ret = reiserfs_end_persistent_transaction(th);
	}

	return ret;
}

void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
{
	if (reiserfs_attrs(inode->i_sb)) {
		if (sd_attrs & REISERFS_SYNC_FL)
			inode->i_flags |= S_SYNC;
		else
			inode->i_flags &= ~S_SYNC;
		if (sd_attrs & REISERFS_IMMUTABLE_FL)
			inode->i_flags |= S_IMMUTABLE;
		else
			inode->i_flags &= ~S_IMMUTABLE;
		if (sd_attrs & REISERFS_APPEND_FL)
			inode->i_flags |= S_APPEND;
		else
			inode->i_flags &= ~S_APPEND;
		if (sd_attrs & REISERFS_NOATIME_FL)
			inode->i_flags |= S_NOATIME;
		else
			inode->i_flags &= ~S_NOATIME;
		if (sd_attrs & REISERFS_NOTAIL_FL)
			REISERFS_I(inode)->i_flags |= i_nopack_mask;
		else
			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
	}
}

void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
{
	if (reiserfs_attrs(inode->i_sb)) {
		if (inode->i_flags & S_IMMUTABLE)
			*sd_attrs |= REISERFS_IMMUTABLE_FL;
		else
			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
		if (inode->i_flags & S_SYNC)
			*sd_attrs |= REISERFS_SYNC_FL;
		else
			*sd_attrs &= ~REISERFS_SYNC_FL;
		if (inode->i_flags & S_NOATIME)
			*sd_attrs |= REISERFS_NOATIME_FL;
		else
			*sd_attrs &= ~REISERFS_NOATIME_FL;
		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
			*sd_attrs |= REISERFS_NOTAIL_FL;
		else
			*sd_attrs &= ~REISERFS_NOTAIL_FL;
	}
}
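
A short sketch of the intended round trip between the two helpers above (illustrative; sd_attrs would normally come from, or be written back to, the on-disk stat data):

	__u16 sd_attrs = 0;

	i_attrs_to_sd_attrs(inode, &sd_attrs);	/* inode->i_flags -> sd bits */
	sd_attrs_to_i_attrs(sd_attrs, inode);	/* sd bits -> inode->i_flags */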
2910 2910
2911 /* decide if this buffer needs to stay around for data logging or ordered 2911 /* decide if this buffer needs to stay around for data logging or ordered
2912 ** write purposes 2912 ** write purposes
2913 */ 2913 */
2914 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) 2914 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2915 { 2915 {
2916 int ret = 1; 2916 int ret = 1;
2917 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); 2917 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2918 2918
2919 lock_buffer(bh); 2919 lock_buffer(bh);
2920 spin_lock(&j->j_dirty_buffers_lock); 2920 spin_lock(&j->j_dirty_buffers_lock);
2921 if (!buffer_mapped(bh)) { 2921 if (!buffer_mapped(bh)) {
2922 goto free_jh; 2922 goto free_jh;
2923 } 2923 }
2924 /* the page is locked, and the only places that log a data buffer 2924 /* the page is locked, and the only places that log a data buffer
2925 * also lock the page. 2925 * also lock the page.
2926 */ 2926 */
2927 if (reiserfs_file_data_log(inode)) { 2927 if (reiserfs_file_data_log(inode)) {
2928 /* 2928 /*
2929 * very conservative, leave the buffer pinned if 2929 * very conservative, leave the buffer pinned if
2930 * anyone might need it. 2930 * anyone might need it.
2931 */ 2931 */
2932 if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { 2932 if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2933 ret = 0; 2933 ret = 0;
2934 } 2934 }
2935 } else if (buffer_dirty(bh)) { 2935 } else if (buffer_dirty(bh)) {
2936 struct reiserfs_journal_list *jl; 2936 struct reiserfs_journal_list *jl;
2937 struct reiserfs_jh *jh = bh->b_private; 2937 struct reiserfs_jh *jh = bh->b_private;
2938 2938
2939 /* why is this safe? 2939 /* why is this safe?
2940 * reiserfs_setattr updates i_size in the on disk 2940 * reiserfs_setattr updates i_size in the on disk
2941 * stat data before allowing vmtruncate to be called. 2941 * stat data before allowing vmtruncate to be called.
2942 * 2942 *
2943 * If buffer was put onto the ordered list for this 2943 * If buffer was put onto the ordered list for this
2944 * transaction, we know for sure either this transaction 2944 * transaction, we know for sure either this transaction
2945 * or an older one already has updated i_size on disk, 2945 * or an older one already has updated i_size on disk,
2946 * and this ordered data won't be referenced in the file 2946 * and this ordered data won't be referenced in the file
2947 * if we crash. 2947 * if we crash.
2948 * 2948 *
2949 * if the buffer was put onto the ordered list for an older 2949 * if the buffer was put onto the ordered list for an older
2950 * transaction, we need to leave it around 2950 * transaction, we need to leave it around
2951 */ 2951 */
2952 if (jh && (jl = jh->jl) 2952 if (jh && (jl = jh->jl)
2953 && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) 2953 && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2954 ret = 0; 2954 ret = 0;
2955 } 2955 }
2956 free_jh: 2956 free_jh:
2957 if (ret && bh->b_private) { 2957 if (ret && bh->b_private) {
2958 reiserfs_free_jh(bh); 2958 reiserfs_free_jh(bh);
2959 } 2959 }
2960 spin_unlock(&j->j_dirty_buffers_lock); 2960 spin_unlock(&j->j_dirty_buffers_lock);
2961 unlock_buffer(bh); 2961 unlock_buffer(bh);
2962 return ret; 2962 return ret;
2963 } 2963 }
2964 2964
2965 /* clm -- taken from fs/buffer.c:block_invalidate_page */ 2965 /* clm -- taken from fs/buffer.c:block_invalidate_page */
2966 static void reiserfs_invalidatepage(struct page *page, unsigned long offset) 2966 static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
2967 { 2967 {
2968 struct buffer_head *head, *bh, *next; 2968 struct buffer_head *head, *bh, *next;
2969 struct inode *inode = page->mapping->host; 2969 struct inode *inode = page->mapping->host;
2970 unsigned int curr_off = 0; 2970 unsigned int curr_off = 0;
2971 int ret = 1; 2971 int ret = 1;
2972 2972
2973 BUG_ON(!PageLocked(page)); 2973 BUG_ON(!PageLocked(page));
2974 2974
2975 if (offset == 0) 2975 if (offset == 0)
2976 ClearPageChecked(page); 2976 ClearPageChecked(page);
2977 2977
2978 if (!page_has_buffers(page)) 2978 if (!page_has_buffers(page))
2979 goto out; 2979 goto out;
2980 2980
2981 head = page_buffers(page); 2981 head = page_buffers(page);
2982 bh = head; 2982 bh = head;
2983 do { 2983 do {
2984 unsigned int next_off = curr_off + bh->b_size; 2984 unsigned int next_off = curr_off + bh->b_size;
2985 next = bh->b_this_page; 2985 next = bh->b_this_page;
2986 2986
2987 /* 2987 /*
2988 * is this block fully invalidated? 2988 * is this block fully invalidated?
2989 */ 2989 */
2990 if (offset <= curr_off) { 2990 if (offset <= curr_off) {
2991 if (invalidatepage_can_drop(inode, bh)) 2991 if (invalidatepage_can_drop(inode, bh))
2992 reiserfs_unmap_buffer(bh); 2992 reiserfs_unmap_buffer(bh);
2993 else 2993 else
2994 ret = 0; 2994 ret = 0;
2995 } 2995 }
2996 curr_off = next_off; 2996 curr_off = next_off;
2997 bh = next; 2997 bh = next;
2998 } while (bh != head); 2998 } while (bh != head);
2999 2999
3000 /* 3000 /*
3001 * We release buffers only if the entire page is being invalidated. 3001 * We release buffers only if the entire page is being invalidated.
3002 * The get_block cached value has been unconditionally invalidated, 3002 * The get_block cached value has been unconditionally invalidated,
3003 * so real IO is not possible anymore. 3003 * so real IO is not possible anymore.
3004 */ 3004 */
3005 if (!offset && ret) { 3005 if (!offset && ret) {
3006 ret = try_to_release_page(page, 0); 3006 ret = try_to_release_page(page, 0);
3007 /* maybe should BUG_ON(!ret); - neilb */ 3007 /* maybe should BUG_ON(!ret); - neilb */
3008 } 3008 }
3009 out: 3009 out:
3010 return; 3010 return;
3011 } 3011 }
3012 3012
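The "offset <= curr_off" test above decides which buffers start at or past the beginning of the invalidated range and can therefore be dropped whole. A small standalone sketch with hypothetical numbers (a 4096-byte page carrying four 1024-byte buffers, invalidated from offset 2048) shows which buffers the loop would drop:

#include <stdio.h>

/* Hypothetical layout: four 1024-byte buffers on one 4096-byte page,
 * invalidation starting at offset 2048.  A buffer is fully
 * invalidated when the invalidation offset is at or before its
 * starting offset, matching the "offset <= curr_off" check above. */
int main(void)
{
	unsigned int offset = 2048, curr_off = 0, b_size = 1024;

	while (curr_off < 4096) {
		printf("buffer at %u: %s\n", curr_off,
		       offset <= curr_off ? "drop" : "keep");
		curr_off += b_size;
	}
	return 0;
}
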
static int reiserfs_set_page_dirty(struct page *page)
{
	struct inode *inode = page->mapping->host;
	if (reiserfs_file_data_log(inode)) {
		SetPageChecked(page);
		return __set_page_dirty_nobuffers(page);
	}
	return __set_page_dirty_buffers(page);
}

/*
 * Returns 1 if the page's buffers were dropped. The page is locked.
 *
 * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
 * in the buffers at page_buffers(page).
 *
 * even in -o notail mode, we can't be sure an old mount without -o notail
 * didn't create files with tails.
 */
static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
{
	struct inode *inode = page->mapping->host;
	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
	struct buffer_head *head;
	struct buffer_head *bh;
	int ret = 1;

	WARN_ON(PageChecked(page));
	spin_lock(&j->j_dirty_buffers_lock);
	head = page_buffers(page);
	bh = head;
	do {
		if (bh->b_private) {
			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
				reiserfs_free_jh(bh);
			} else {
				ret = 0;
				break;
			}
		}
		bh = bh->b_this_page;
	} while (bh != head);
	if (ret)
		ret = try_to_free_buffers(page);
	spin_unlock(&j->j_dirty_buffers_lock);
	return ret;
}

/* We thank Mingming Cao for helping us understand in great detail what
   to do in this section of the code. */
static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
				  const struct iovec *iov, loff_t offset,
				  unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
				 offset, nr_segs,
				 reiserfs_get_blocks_direct_io, NULL);

	/*
	 * In case of error extending write may have instantiated a few
	 * blocks outside i_size. Trim these off again.
	 */
	if (unlikely((rw & WRITE) && ret < 0)) {
		loff_t isize = i_size_read(inode);
		loff_t end = offset + iov_length(iov, nr_segs);

		if (end > isize)
			vmtruncate(inode, isize);
	}

	return ret;
}

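To make the trim in the error path above concrete, here is a minimal userspace sketch with hypothetical numbers (the sizes and offsets are invented for illustration): a direct write meant to extend the file fails after blocks were already instantiated, so everything past the old i_size has to be cut back, which is what the vmtruncate() call does.

#include <stdio.h>

/* Hypothetical numbers for the error path above: the failed direct
 * write started at the old EOF (4096) and covered 8192 bytes, so
 * blocks may have been instantiated out to 12288 while i_size is
 * still 4096. */
int main(void)
{
	long long isize = 4096;		/* i_size_read(inode)        */
	long long offset = 4096;	/* where the write started   */
	long long len = 8192;		/* iov_length(iov, nr_segs)  */
	long long end = offset + len;	/* 12288                     */

	if (end > isize)		/* same check as above       */
		printf("trim back to %lld, dropping %lld stray bytes\n",
		       isize, end - isize);
	return 0;
}
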
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	unsigned int ia_valid;
	int depth;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	/* must be turned off for recursive notify_change calls */
	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);

	depth = reiserfs_write_lock_once(inode->i_sb);
	if (is_quota_modification(inode, attr))
		dquot_initialize(inode);

	if (attr->ia_valid & ATTR_SIZE) {
		/* version 2 items will be caught by the s_maxbytes check
		** done for us in vmtruncate
		*/
		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
		    attr->ia_size > MAX_NON_LFS) {
			error = -EFBIG;
			goto out;
		}

		inode_dio_wait(inode);

		/* fill in hole pointers in the expanding truncate case. */
		if (attr->ia_size > inode->i_size) {
			error = generic_cont_expand_simple(inode, attr->ia_size);
			if (REISERFS_I(inode)->i_prealloc_count > 0) {
				int err;
				struct reiserfs_transaction_handle th;
				/* we're changing at most 2 bitmaps, inode + super */
				err = journal_begin(&th, inode->i_sb, 4);
				if (!err) {
					reiserfs_discard_prealloc(&th, inode);
					err = journal_end(&th, inode->i_sb, 4);
				}
				if (err)
					error = err;
			}
			if (error)
				goto out;
			/*
			 * file size is changed, ctime and mtime are
			 * to be updated
			 */
			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
		}
	}

	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
		/* stat data of format v3.5 has 16 bit uid and gid */
		error = -EINVAL;
		goto out;
	}

	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		struct reiserfs_transaction_handle th;
		int jbegin_count =
		    2 *
		    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
		     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
		    2;

		error = reiserfs_chown_xattrs(inode, attr);

		if (error)
			return error;

		/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
		error = journal_begin(&th, inode->i_sb, jbegin_count);
		if (error)
			goto out;
		error = dquot_transfer(inode, attr);
		if (error) {
			journal_end(&th, inode->i_sb, jbegin_count);
			goto out;
		}

		/* Update corresponding info in inode so that everything is in
		 * one transaction */
		if (attr->ia_valid & ATTR_UID)
			inode->i_uid = attr->ia_uid;
		if (attr->ia_valid & ATTR_GID)
			inode->i_gid = attr->ia_gid;
		mark_inode_dirty(inode);
		error = journal_end(&th, inode->i_sb, jbegin_count);
		if (error)
			goto out;
	}

	/*
	 * Relax the lock here, as it might truncate the
	 * inode pages and wait for inode pages locks.
	 * To release such page lock, the owner needs the
	 * reiserfs lock
	 */
	reiserfs_write_unlock_once(inode->i_sb, depth);
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode))
		error = vmtruncate(inode, attr->ia_size);

	if (!error) {
		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
	}
	depth = reiserfs_write_lock_once(inode->i_sb);

	if (!error && reiserfs_posixacl(inode->i_sb)) {
		if (attr->ia_valid & ATTR_MODE)
			error = reiserfs_acl_chmod(inode);
	}

out:
	reiserfs_write_unlock_once(inode->i_sb, depth);

	return error;
}

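In the ATTR_SIZE branch above, inode_dio_wait() is the line this commit adds to reiserfs_setattr(): it runs with the reiserfs write lock already taken and before any truncation work. The snippet below is a minimal userspace model of the counter that call waits on, not the kernel implementation (the kernel tracks the count with atomics and a wait-queue bit; the dio_begin/dio_end names are invented for the sketch):

#include <pthread.h>
#include <stdio.h>

/* One inode's direct-I/O bookkeeping, modelled with a mutex and a
 * condition variable for clarity. */
struct dio_inode {
	pthread_mutex_t lock;
	pthread_cond_t drained;
	int i_dio_count;		/* in-flight direct I/O requests */
};

/* Submission takes a reference... */
static void dio_begin(struct dio_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	inode->i_dio_count++;
	pthread_mutex_unlock(&inode->lock);
}

/* ...completion drops it and wakes any waiting truncate. */
static void dio_end(struct dio_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	if (--inode->i_dio_count == 0)
		pthread_cond_broadcast(&inode->drained);
	pthread_mutex_unlock(&inode->lock);
}

/* What inode_dio_wait() does conceptually: block until every
 * in-flight direct I/O request has drained. */
static void dio_wait(struct dio_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	while (inode->i_dio_count > 0)
		pthread_cond_wait(&inode->drained, &inode->lock);
	pthread_mutex_unlock(&inode->lock);
}

int main(void)
{
	struct dio_inode inode = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};

	dio_begin(&inode);	/* a direct write is submitted */
	dio_end(&inode);	/* ...and completes            */
	dio_wait(&inode);	/* a truncate can now proceed  */
	printf("drained, i_dio_count = %d\n", inode.i_dio_count);
	return 0;
}
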
const struct address_space_operations reiserfs_address_space_operations = {
	.writepage = reiserfs_writepage,
	.readpage = reiserfs_readpage,
	.readpages = reiserfs_readpages,
	.releasepage = reiserfs_releasepage,
	.invalidatepage = reiserfs_invalidatepage,
	.write_begin = reiserfs_write_begin,
	.write_end = reiserfs_write_end,
	.bmap = reiserfs_aop_bmap,
	.direct_IO = reiserfs_direct_IO,
	.set_page_dirty = reiserfs_set_page_dirty,
};
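For completeness, a filesystem makes a table like this take effect by pointing each inode's mapping at it. A one-line sketch of that wiring follows; the surrounding inode-initialization function is not shown, so this fragment is illustrative rather than buildable on its own:

	/* sketch: done in reiserfs's inode setup path */
	inode->i_mapping->a_ops = &reiserfs_address_space_operations;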