Commit 92bfc6e7c4eabbbd15e7d6d49123b296d05dcfd1

Authored by Christoph Hellwig
Committed by Niv Sardi
1 parent 94e1b69d1a

[XFS] embed struct xfs_imap into xfs_inode

Most uses of struct xfs_imap are to map an inode to a buffer.  To avoid
copying around the inode location information we should just embed a
struct xfs_imap into the xfs_inode.  To make sure it doesn't bloat an
inode the im_len is changed to a ushort, which is fine as that's what
the users expect anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>

Showing 7 changed files with 33 additions and 71 deletions Inline Diff

1 /* 1 /*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dir2.h" 27 #include "xfs_dir2.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_mount.h" 29 #include "xfs_mount.h"
30 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_btree.h"
31 #include "xfs_alloc_btree.h" 31 #include "xfs_alloc_btree.h"
32 #include "xfs_ialloc_btree.h" 32 #include "xfs_ialloc_btree.h"
33 #include "xfs_dir2_sf.h" 33 #include "xfs_dir2_sf.h"
34 #include "xfs_attr_sf.h" 34 #include "xfs_attr_sf.h"
35 #include "xfs_dinode.h" 35 #include "xfs_dinode.h"
36 #include "xfs_inode.h" 36 #include "xfs_inode.h"
37 #include "xfs_btree.h" 37 #include "xfs_btree.h"
38 #include "xfs_ialloc.h" 38 #include "xfs_ialloc.h"
39 #include "xfs_alloc.h" 39 #include "xfs_alloc.h"
40 #include "xfs_rtalloc.h" 40 #include "xfs_rtalloc.h"
41 #include "xfs_error.h" 41 #include "xfs_error.h"
42 #include "xfs_bmap.h" 42 #include "xfs_bmap.h"
43 #include "xfs_imap.h"
44 43
45 44
46 /* 45 /*
47 * Allocation group level functions. 46 * Allocation group level functions.
48 */ 47 */
49 static inline int 48 static inline int
50 xfs_ialloc_cluster_alignment( 49 xfs_ialloc_cluster_alignment(
51 xfs_alloc_arg_t *args) 50 xfs_alloc_arg_t *args)
52 { 51 {
53 if (xfs_sb_version_hasalign(&args->mp->m_sb) && 52 if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
54 args->mp->m_sb.sb_inoalignmt >= 53 args->mp->m_sb.sb_inoalignmt >=
55 XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) 54 XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
56 return args->mp->m_sb.sb_inoalignmt; 55 return args->mp->m_sb.sb_inoalignmt;
57 return 1; 56 return 1;
58 } 57 }
59 58
60 /* 59 /*
61 * Lookup the record equal to ino in the btree given by cur. 60 * Lookup the record equal to ino in the btree given by cur.
62 */ 61 */
63 STATIC int /* error */ 62 STATIC int /* error */
64 xfs_inobt_lookup_eq( 63 xfs_inobt_lookup_eq(
65 struct xfs_btree_cur *cur, /* btree cursor */ 64 struct xfs_btree_cur *cur, /* btree cursor */
66 xfs_agino_t ino, /* starting inode of chunk */ 65 xfs_agino_t ino, /* starting inode of chunk */
67 __int32_t fcnt, /* free inode count */ 66 __int32_t fcnt, /* free inode count */
68 xfs_inofree_t free, /* free inode mask */ 67 xfs_inofree_t free, /* free inode mask */
69 int *stat) /* success/failure */ 68 int *stat) /* success/failure */
70 { 69 {
71 cur->bc_rec.i.ir_startino = ino; 70 cur->bc_rec.i.ir_startino = ino;
72 cur->bc_rec.i.ir_freecount = fcnt; 71 cur->bc_rec.i.ir_freecount = fcnt;
73 cur->bc_rec.i.ir_free = free; 72 cur->bc_rec.i.ir_free = free;
74 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); 73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
75 } 74 }
76 75
77 /* 76 /*
78 * Lookup the first record greater than or equal to ino 77 * Lookup the first record greater than or equal to ino
79 * in the btree given by cur. 78 * in the btree given by cur.
80 */ 79 */
81 int /* error */ 80 int /* error */
82 xfs_inobt_lookup_ge( 81 xfs_inobt_lookup_ge(
83 struct xfs_btree_cur *cur, /* btree cursor */ 82 struct xfs_btree_cur *cur, /* btree cursor */
84 xfs_agino_t ino, /* starting inode of chunk */ 83 xfs_agino_t ino, /* starting inode of chunk */
85 __int32_t fcnt, /* free inode count */ 84 __int32_t fcnt, /* free inode count */
86 xfs_inofree_t free, /* free inode mask */ 85 xfs_inofree_t free, /* free inode mask */
87 int *stat) /* success/failure */ 86 int *stat) /* success/failure */
88 { 87 {
89 cur->bc_rec.i.ir_startino = ino; 88 cur->bc_rec.i.ir_startino = ino;
90 cur->bc_rec.i.ir_freecount = fcnt; 89 cur->bc_rec.i.ir_freecount = fcnt;
91 cur->bc_rec.i.ir_free = free; 90 cur->bc_rec.i.ir_free = free;
92 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); 91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
93 } 92 }
94 93
95 /* 94 /*
96 * Lookup the first record less than or equal to ino 95 * Lookup the first record less than or equal to ino
97 * in the btree given by cur. 96 * in the btree given by cur.
98 */ 97 */
99 int /* error */ 98 int /* error */
100 xfs_inobt_lookup_le( 99 xfs_inobt_lookup_le(
101 struct xfs_btree_cur *cur, /* btree cursor */ 100 struct xfs_btree_cur *cur, /* btree cursor */
102 xfs_agino_t ino, /* starting inode of chunk */ 101 xfs_agino_t ino, /* starting inode of chunk */
103 __int32_t fcnt, /* free inode count */ 102 __int32_t fcnt, /* free inode count */
104 xfs_inofree_t free, /* free inode mask */ 103 xfs_inofree_t free, /* free inode mask */
105 int *stat) /* success/failure */ 104 int *stat) /* success/failure */
106 { 105 {
107 cur->bc_rec.i.ir_startino = ino; 106 cur->bc_rec.i.ir_startino = ino;
108 cur->bc_rec.i.ir_freecount = fcnt; 107 cur->bc_rec.i.ir_freecount = fcnt;
109 cur->bc_rec.i.ir_free = free; 108 cur->bc_rec.i.ir_free = free;
110 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); 109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
111 } 110 }
112 111
113 /* 112 /*
114 * Update the record referred to by cur to the value given 113 * Update the record referred to by cur to the value given
115 * by [ino, fcnt, free]. 114 * by [ino, fcnt, free].
116 * This either works (return 0) or gets an EFSCORRUPTED error. 115 * This either works (return 0) or gets an EFSCORRUPTED error.
117 */ 116 */
118 STATIC int /* error */ 117 STATIC int /* error */
119 xfs_inobt_update( 118 xfs_inobt_update(
120 struct xfs_btree_cur *cur, /* btree cursor */ 119 struct xfs_btree_cur *cur, /* btree cursor */
121 xfs_agino_t ino, /* starting inode of chunk */ 120 xfs_agino_t ino, /* starting inode of chunk */
122 __int32_t fcnt, /* free inode count */ 121 __int32_t fcnt, /* free inode count */
123 xfs_inofree_t free) /* free inode mask */ 122 xfs_inofree_t free) /* free inode mask */
124 { 123 {
125 union xfs_btree_rec rec; 124 union xfs_btree_rec rec;
126 125
127 rec.inobt.ir_startino = cpu_to_be32(ino); 126 rec.inobt.ir_startino = cpu_to_be32(ino);
128 rec.inobt.ir_freecount = cpu_to_be32(fcnt); 127 rec.inobt.ir_freecount = cpu_to_be32(fcnt);
129 rec.inobt.ir_free = cpu_to_be64(free); 128 rec.inobt.ir_free = cpu_to_be64(free);
130 return xfs_btree_update(cur, &rec); 129 return xfs_btree_update(cur, &rec);
131 } 130 }
132 131
133 /* 132 /*
134 * Get the data from the pointed-to record. 133 * Get the data from the pointed-to record.
135 */ 134 */
136 int /* error */ 135 int /* error */
137 xfs_inobt_get_rec( 136 xfs_inobt_get_rec(
138 struct xfs_btree_cur *cur, /* btree cursor */ 137 struct xfs_btree_cur *cur, /* btree cursor */
139 xfs_agino_t *ino, /* output: starting inode of chunk */ 138 xfs_agino_t *ino, /* output: starting inode of chunk */
140 __int32_t *fcnt, /* output: number of free inodes */ 139 __int32_t *fcnt, /* output: number of free inodes */
141 xfs_inofree_t *free, /* output: free inode mask */ 140 xfs_inofree_t *free, /* output: free inode mask */
142 int *stat) /* output: success/failure */ 141 int *stat) /* output: success/failure */
143 { 142 {
144 union xfs_btree_rec *rec; 143 union xfs_btree_rec *rec;
145 int error; 144 int error;
146 145
147 error = xfs_btree_get_rec(cur, &rec, stat); 146 error = xfs_btree_get_rec(cur, &rec, stat);
148 if (!error && *stat == 1) { 147 if (!error && *stat == 1) {
149 *ino = be32_to_cpu(rec->inobt.ir_startino); 148 *ino = be32_to_cpu(rec->inobt.ir_startino);
150 *fcnt = be32_to_cpu(rec->inobt.ir_freecount); 149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
151 *free = be64_to_cpu(rec->inobt.ir_free); 150 *free = be64_to_cpu(rec->inobt.ir_free);
152 } 151 }
153 return error; 152 return error;
154 } 153 }
155 154
156 /* 155 /*
157 * Allocate new inodes in the allocation group specified by agbp. 156 * Allocate new inodes in the allocation group specified by agbp.
158 * Return 0 for success, else error code. 157 * Return 0 for success, else error code.
159 */ 158 */
STATIC int				/* error code or 0 */
xfs_ialloc_ag_alloc(
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_buf_t	*agbp,		/* alloc group buffer */
	int		*alloc)		/* output: 1 if inodes allocated, 0 if not */
{
	xfs_agi_t	*agi;		/* allocation group header */
	xfs_alloc_arg_t	args;		/* allocation argument structure */
	int		blks_per_cluster;  /* fs blocks per inode cluster */
	xfs_btree_cur_t	*cur;		/* inode btree cursor */
	xfs_daddr_t	d;		/* disk addr of buffer */
	xfs_agnumber_t	agno;
	int		error;
	xfs_buf_t	*fbuf;		/* new free inodes' buffer */
	xfs_dinode_t	*free;		/* new free inode structure */
	int		i;		/* inode counter */
	int		j;		/* block counter */
	int		nbufs;		/* num bufs of new inodes */
	xfs_agino_t	newino;		/* new first inode's number */
	xfs_agino_t	newlen;		/* new number of inodes */
	int		ninodes;	/* num inodes per buf */
	xfs_agino_t	thisino;	/* current inode number, for loop */
	int		version;	/* inode version number to use */
	int		isaligned = 0;	/* inode allocation at stripe unit */
					/* boundary */
	unsigned int	gen;

	args.tp = tp;
	args.mp = tp->t_mountp;

	/*
	 * Locking will ensure that we don't have two callers in here
	 * at one time.
	 */
	newlen = XFS_IALLOC_INODES(args.mp);
	/* Enforce the mount-wide maximum inode count, if one is set. */
	if (args.mp->m_maxicount &&
	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
		return XFS_ERROR(ENOSPC);
	args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
	/*
	 * First try to allocate inodes contiguous with the last-allocated
	 * chunk of inodes.  If the filesystem is striped, this will fill
	 * an entire stripe unit with inodes.
	 */
	agi = XFS_BUF_TO_AGI(agbp);
	newino = be32_to_cpu(agi->agi_newino);
	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
			XFS_IALLOC_BLOCKS(args.mp);
	if (likely(newino != NULLAGINO &&
		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
		args.fsbno = XFS_AGB_TO_FSB(args.mp,
				be32_to_cpu(agi->agi_seqno), args.agbno);
		args.type = XFS_ALLOCTYPE_THIS_BNO;
		args.mod = args.total = args.wasdel = args.isfl =
			args.userdata = args.minalignslop = 0;
		args.prod = 1;

		/*
		 * We need to take into account alignment here to ensure that
		 * we don't modify the free list if we fail to have an exact
		 * block.  If we don't have an exact match, and every other
		 * allocation attempt fails, we'll end up cancelling
		 * a dirty transaction and shutting down.
		 *
		 * For an exact allocation, alignment must be 1,
		 * however we need to take cluster alignment into account when
		 * fixing up the freelist.  Use the minalignslop field to
		 * indicate that extra blocks might be required for alignment,
		 * but not to use them in the actual exact allocation.
		 */
		args.alignment = 1;
		args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;

		/* Allow space for the inode btree to split. */
		args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	} else
		args.fsbno = NULLFSBLOCK;

	if (unlikely(args.fsbno == NULLFSBLOCK)) {
		/*
		 * Set the alignment for the allocation.
		 * If stripe alignment is turned on then align at stripe unit
		 * boundary.
		 * If the cluster size is smaller than a filesystem block
		 * then we're doing I/O for inodes in filesystem block size
		 * pieces, so don't need alignment anyway.
		 */
		isaligned = 0;
		if (args.mp->m_sinoalign) {
			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
			args.alignment = args.mp->m_dalign;
			isaligned = 1;
		} else
			args.alignment = xfs_ialloc_cluster_alignment(&args);
		/*
		 * Need to figure out where to allocate the inode blocks.
		 * Ideally they should be spaced out through the a.g.
		 * For now, just allocate blocks up front.
		 */
		args.agbno = be32_to_cpu(agi->agi_root);
		args.fsbno = XFS_AGB_TO_FSB(args.mp,
				be32_to_cpu(agi->agi_seqno), args.agbno);
		/*
		 * Allocate a fixed-size extent of inodes.
		 */
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
		args.mod = args.total = args.wasdel = args.isfl =
			args.userdata = args.minalignslop = 0;
		args.prod = 1;
		/*
		 * Allow space for the inode btree to split.
		 */
		args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	}

	/*
	 * If stripe alignment is turned on, then try again with cluster
	 * alignment.
	 */
	if (isaligned && args.fsbno == NULLFSBLOCK) {
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
		args.agbno = be32_to_cpu(agi->agi_root);
		args.fsbno = XFS_AGB_TO_FSB(args.mp,
				be32_to_cpu(agi->agi_seqno), args.agbno);
		args.alignment = xfs_ialloc_cluster_alignment(&args);
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	}

	/* All allocation attempts failed: not an error, just no inodes. */
	if (args.fsbno == NULLFSBLOCK) {
		*alloc = 0;
		return 0;
	}
	ASSERT(args.len == args.minlen);
	/*
	 * Convert the results.
	 */
	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
	/*
	 * Loop over the new block(s), filling in the inodes.
	 * For small block sizes, manipulate the inodes in buffers
	 * which are multiples of the blocks size.
	 */
	if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
		blks_per_cluster = 1;
		nbufs = (int)args.len;
		ninodes = args.mp->m_sb.sb_inopblock;
	} else {
		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
				   args.mp->m_sb.sb_blocksize;
		nbufs = (int)args.len / blks_per_cluster;
		ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
	}
	/*
	 * Figure out what version number to use in the inodes we create.
	 * If the superblock version has caught up to the one that supports
	 * the new inode format, then use the new inode version.  Otherwise
	 * use the old version so that old kernels will continue to be
	 * able to use the file system.
	 */
	if (xfs_sb_version_hasnlink(&args.mp->m_sb))
		version = 2;
	else
		version = 1;

	/*
	 * Seed the new inode cluster with a random generation number. This
	 * prevents short-term reuse of generation numbers if a chunk is
	 * freed and then immediately reallocated. We use random numbers
	 * rather than a linear progression to prevent the next generation
	 * number from being easily guessable.
	 */
	gen = random32();
	for (j = 0; j < nbufs; j++) {
		/*
		 * Get the block.
		 */
		d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
				     args.agbno + (j * blks_per_cluster));
		fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
					 args.mp->m_bsize * blks_per_cluster,
					 XFS_BUF_LOCK);
		ASSERT(fbuf);
		ASSERT(!XFS_BUF_GETERROR(fbuf));

		/*
		 * Initialize all inodes in this buffer and then log them.
		 *
		 * XXX: It would be much better if we had just one transaction
		 * to log a whole cluster of inodes instead of all the
		 * individual transactions causing a lot of log traffic.
		 */
		xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
		for (i = 0; i < ninodes; i++) {
			int	ioffset = i << args.mp->m_sb.sb_inodelog;
			uint	isize = sizeof(struct xfs_dinode);

			/* Stamp just the fields a free on-disk inode needs. */
			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
			free->di_version = version;
			free->di_gen = cpu_to_be32(gen);
			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
			xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
		}
		xfs_trans_inode_alloc_buf(tp, fbuf);
	}
	/* Account for the new chunk in the AGI and the per-ag counters. */
	be32_add_cpu(&agi->agi_count, newlen);
	be32_add_cpu(&agi->agi_freecount, newlen);
	agno = be32_to_cpu(agi->agi_seqno);
	down_read(&args.mp->m_peraglock);
	args.mp->m_perag[agno].pagi_freecount += newlen;
	up_read(&args.mp->m_peraglock);
	agi->agi_newino = cpu_to_be32(newino);
	/*
	 * Insert records describing the new inode chunk into the btree.
	 */
	cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
	for (thisino = newino;
	     thisino < newino + newlen;
	     thisino += XFS_INODES_PER_CHUNK) {
		if ((error = xfs_inobt_lookup_eq(cur, thisino,
				XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		/* The chunk must not already be in the btree. */
		ASSERT(i == 0);
		if ((error = xfs_btree_insert(cur, &i))) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 1);
	}
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	/*
	 * Log allocation group header fields
	 */
	xfs_ialloc_log_agi(tp, agbp,
		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
	/*
	 * Modify/log superblock values for inode count and inode free count.
	 */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
	*alloc = 1;
	return 0;
}
410 409
411 STATIC_INLINE xfs_agnumber_t 410 STATIC_INLINE xfs_agnumber_t
412 xfs_ialloc_next_ag( 411 xfs_ialloc_next_ag(
413 xfs_mount_t *mp) 412 xfs_mount_t *mp)
414 { 413 {
415 xfs_agnumber_t agno; 414 xfs_agnumber_t agno;
416 415
417 spin_lock(&mp->m_agirotor_lock); 416 spin_lock(&mp->m_agirotor_lock);
418 agno = mp->m_agirotor; 417 agno = mp->m_agirotor;
419 if (++mp->m_agirotor == mp->m_maxagi) 418 if (++mp->m_agirotor == mp->m_maxagi)
420 mp->m_agirotor = 0; 419 mp->m_agirotor = 0;
421 spin_unlock(&mp->m_agirotor_lock); 420 spin_unlock(&mp->m_agirotor_lock);
422 421
423 return agno; 422 return agno;
424 } 423 }
425 424
426 /* 425 /*
427 * Select an allocation group to look for a free inode in, based on the parent 426 * Select an allocation group to look for a free inode in, based on the parent
428 * inode and then mode. Return the allocation group buffer. 427 * inode and then mode. Return the allocation group buffer.
429 */ 428 */
430 STATIC xfs_buf_t * /* allocation group buffer */ 429 STATIC xfs_buf_t * /* allocation group buffer */
431 xfs_ialloc_ag_select( 430 xfs_ialloc_ag_select(
432 xfs_trans_t *tp, /* transaction pointer */ 431 xfs_trans_t *tp, /* transaction pointer */
433 xfs_ino_t parent, /* parent directory inode number */ 432 xfs_ino_t parent, /* parent directory inode number */
434 mode_t mode, /* bits set to indicate file type */ 433 mode_t mode, /* bits set to indicate file type */
435 int okalloc) /* ok to allocate more space */ 434 int okalloc) /* ok to allocate more space */
436 { 435 {
437 xfs_buf_t *agbp; /* allocation group header buffer */ 436 xfs_buf_t *agbp; /* allocation group header buffer */
438 xfs_agnumber_t agcount; /* number of ag's in the filesystem */ 437 xfs_agnumber_t agcount; /* number of ag's in the filesystem */
439 xfs_agnumber_t agno; /* current ag number */ 438 xfs_agnumber_t agno; /* current ag number */
440 int flags; /* alloc buffer locking flags */ 439 int flags; /* alloc buffer locking flags */
441 xfs_extlen_t ineed; /* blocks needed for inode allocation */ 440 xfs_extlen_t ineed; /* blocks needed for inode allocation */
442 xfs_extlen_t longest = 0; /* longest extent available */ 441 xfs_extlen_t longest = 0; /* longest extent available */
443 xfs_mount_t *mp; /* mount point structure */ 442 xfs_mount_t *mp; /* mount point structure */
444 int needspace; /* file mode implies space allocated */ 443 int needspace; /* file mode implies space allocated */
445 xfs_perag_t *pag; /* per allocation group data */ 444 xfs_perag_t *pag; /* per allocation group data */
446 xfs_agnumber_t pagno; /* parent (starting) ag number */ 445 xfs_agnumber_t pagno; /* parent (starting) ag number */
447 446
448 /* 447 /*
449 * Files of these types need at least one block if length > 0 448 * Files of these types need at least one block if length > 0
450 * (and they won't fit in the inode, but that's hard to figure out). 449 * (and they won't fit in the inode, but that's hard to figure out).
451 */ 450 */
452 needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); 451 needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
453 mp = tp->t_mountp; 452 mp = tp->t_mountp;
454 agcount = mp->m_maxagi; 453 agcount = mp->m_maxagi;
455 if (S_ISDIR(mode)) 454 if (S_ISDIR(mode))
456 pagno = xfs_ialloc_next_ag(mp); 455 pagno = xfs_ialloc_next_ag(mp);
457 else { 456 else {
458 pagno = XFS_INO_TO_AGNO(mp, parent); 457 pagno = XFS_INO_TO_AGNO(mp, parent);
459 if (pagno >= agcount) 458 if (pagno >= agcount)
460 pagno = 0; 459 pagno = 0;
461 } 460 }
462 ASSERT(pagno < agcount); 461 ASSERT(pagno < agcount);
463 /* 462 /*
464 * Loop through allocation groups, looking for one with a little 463 * Loop through allocation groups, looking for one with a little
465 * free space in it. Note we don't look for free inodes, exactly. 464 * free space in it. Note we don't look for free inodes, exactly.
466 * Instead, we include whether there is a need to allocate inodes 465 * Instead, we include whether there is a need to allocate inodes
467 * to mean that blocks must be allocated for them, 466 * to mean that blocks must be allocated for them,
468 * if none are currently free. 467 * if none are currently free.
469 */ 468 */
470 agno = pagno; 469 agno = pagno;
471 flags = XFS_ALLOC_FLAG_TRYLOCK; 470 flags = XFS_ALLOC_FLAG_TRYLOCK;
472 down_read(&mp->m_peraglock); 471 down_read(&mp->m_peraglock);
473 for (;;) { 472 for (;;) {
474 pag = &mp->m_perag[agno]; 473 pag = &mp->m_perag[agno];
475 if (!pag->pagi_init) { 474 if (!pag->pagi_init) {
476 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { 475 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
477 agbp = NULL; 476 agbp = NULL;
478 goto nextag; 477 goto nextag;
479 } 478 }
480 } else 479 } else
481 agbp = NULL; 480 agbp = NULL;
482 481
483 if (!pag->pagi_inodeok) { 482 if (!pag->pagi_inodeok) {
484 xfs_ialloc_next_ag(mp); 483 xfs_ialloc_next_ag(mp);
485 goto unlock_nextag; 484 goto unlock_nextag;
486 } 485 }
487 486
488 /* 487 /*
489 * Is there enough free space for the file plus a block 488 * Is there enough free space for the file plus a block
490 * of inodes (if we need to allocate some)? 489 * of inodes (if we need to allocate some)?
491 */ 490 */
492 ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp); 491 ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
493 if (ineed && !pag->pagf_init) { 492 if (ineed && !pag->pagf_init) {
494 if (agbp == NULL && 493 if (agbp == NULL &&
495 xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { 494 xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
496 agbp = NULL; 495 agbp = NULL;
497 goto nextag; 496 goto nextag;
498 } 497 }
499 (void)xfs_alloc_pagf_init(mp, tp, agno, flags); 498 (void)xfs_alloc_pagf_init(mp, tp, agno, flags);
500 } 499 }
501 if (!ineed || pag->pagf_init) { 500 if (!ineed || pag->pagf_init) {
502 if (ineed && !(longest = pag->pagf_longest)) 501 if (ineed && !(longest = pag->pagf_longest))
503 longest = pag->pagf_flcount > 0; 502 longest = pag->pagf_flcount > 0;
504 if (!ineed || 503 if (!ineed ||
505 (pag->pagf_freeblks >= needspace + ineed && 504 (pag->pagf_freeblks >= needspace + ineed &&
506 longest >= ineed && 505 longest >= ineed &&
507 okalloc)) { 506 okalloc)) {
508 if (agbp == NULL && 507 if (agbp == NULL &&
509 xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { 508 xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
510 agbp = NULL; 509 agbp = NULL;
511 goto nextag; 510 goto nextag;
512 } 511 }
513 up_read(&mp->m_peraglock); 512 up_read(&mp->m_peraglock);
514 return agbp; 513 return agbp;
515 } 514 }
516 } 515 }
517 unlock_nextag: 516 unlock_nextag:
518 if (agbp) 517 if (agbp)
519 xfs_trans_brelse(tp, agbp); 518 xfs_trans_brelse(tp, agbp);
520 nextag: 519 nextag:
521 /* 520 /*
522 * No point in iterating over the rest, if we're shutting 521 * No point in iterating over the rest, if we're shutting
523 * down. 522 * down.
524 */ 523 */
525 if (XFS_FORCED_SHUTDOWN(mp)) { 524 if (XFS_FORCED_SHUTDOWN(mp)) {
526 up_read(&mp->m_peraglock); 525 up_read(&mp->m_peraglock);
527 return NULL; 526 return NULL;
528 } 527 }
529 agno++; 528 agno++;
530 if (agno >= agcount) 529 if (agno >= agcount)
531 agno = 0; 530 agno = 0;
532 if (agno == pagno) { 531 if (agno == pagno) {
533 if (flags == 0) { 532 if (flags == 0) {
534 up_read(&mp->m_peraglock); 533 up_read(&mp->m_peraglock);
535 return NULL; 534 return NULL;
536 } 535 }
537 flags = 0; 536 flags = 0;
538 } 537 }
539 } 538 }
540 } 539 }
541 540
/*
 * Visible inode allocation functions.
 */

/*
 * Allocate an inode on disk.
 * Mode is used to tell whether the new inode will need space, and whether
 * it is a directory.
 *
 * The arguments IO_agbp and alloc_done are defined to work within
 * the constraint of one allocation per transaction.
 * xfs_dialloc() is designed to be called twice if it has to do an
 * allocation to make more free inodes.  On the first call,
 * IO_agbp should be set to NULL.  If an inode is available,
 * i.e., xfs_dialloc() did not need to do an allocation, an inode
 * number is returned.  In this case, IO_agbp would be set to the
 * current ag_buf and alloc_done set to false.
 * If an allocation needed to be done, xfs_dialloc would return
 * the current ag_buf in IO_agbp and set alloc_done to true.
 * The caller should then commit the current transaction, allocate a new
 * transaction, and call xfs_dialloc() again, passing in the previous
 * value of IO_agbp.  IO_agbp should be held across the transactions.
 * Since the agbp is locked across the two calls, the second call is
 * guaranteed to have a free inode available.
 *
 * Once we successfully pick an inode its number is returned and the
 * on-disk data structures are updated.  The inode itself is not read
 * in, since doing so would break ordering constraints with xfs_reclaim.
 *
 * Returns 0 on success (*inop set to the new inode number, or NULLFSINO
 * when no inode could be found/allocated), or a positive errno-style
 * error code on failure.
 */
int
xfs_dialloc(
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_ino_t	parent,		/* parent inode (directory) */
	mode_t		mode,		/* mode bits for new inode */
	int		okalloc,	/* ok to allocate more space */
	xfs_buf_t	**IO_agbp,	/* in/out ag header's buffer */
	boolean_t	*alloc_done,	/* true if we needed to replenish
					   inode freelist */
	xfs_ino_t	*inop)		/* inode number allocated */
{
	xfs_agnumber_t	agcount;	/* number of allocation groups */
	xfs_buf_t	*agbp;		/* allocation group header's buffer */
	xfs_agnumber_t	agno;		/* allocation group number */
	xfs_agi_t	*agi;		/* allocation group header structure */
	xfs_btree_cur_t	*cur;		/* inode allocation btree cursor */
	int		error;		/* error return value */
	int		i;		/* result code */
	int		ialloced;	/* inode allocation status */
	int		noroom = 0;	/* no space for inode blk allocation */
	xfs_ino_t	ino;		/* fs-relative inode to be returned */
	/* REFERENCED */
	int		j;		/* result code */
	xfs_mount_t	*mp;		/* file system mount structure */
	int		offset;		/* index of inode in chunk */
	xfs_agino_t	pagino;		/* parent's a.g. relative inode # */
	xfs_agnumber_t	pagno;		/* parent's allocation group number */
	xfs_inobt_rec_incore_t rec;	/* inode allocation record */
	xfs_agnumber_t	tagno;		/* testing allocation group number */
	xfs_btree_cur_t	*tcur;		/* temp cursor */
	xfs_inobt_rec_incore_t trec;	/* temp inode allocation record */


	if (*IO_agbp == NULL) {
		/*
		 * We do not have an agbp, so select an initial allocation
		 * group for inode allocation.
		 */
		agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
		/*
		 * Couldn't find an allocation group satisfying the
		 * criteria, give up.
		 */
		if (!agbp) {
			*inop = NULLFSINO;
			return 0;
		}
		agi = XFS_BUF_TO_AGI(agbp);
		ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
	} else {
		/*
		 * Continue where we left off before.  In this case, we
		 * know that the allocation group has free inodes.
		 */
		agbp = *IO_agbp;
		agi = XFS_BUF_TO_AGI(agbp);
		ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
		ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
	}
	mp = tp->t_mountp;
	agcount = mp->m_sb.sb_agcount;
	agno = be32_to_cpu(agi->agi_seqno);
	tagno = agno;
	pagno = XFS_INO_TO_AGNO(mp, parent);
	pagino = XFS_INO_TO_AGINO(mp, parent);

	/*
	 * If we have already hit the ceiling of inode blocks then clear
	 * okalloc so we scan all available agi structures for a free
	 * inode.
	 */

	if (mp->m_maxicount &&
	    mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
		noroom = 1;
		okalloc = 0;
	}

	/*
	 * Loop until we find an allocation group that either has free inodes
	 * or in which we can allocate some inodes.  Iterate through the
	 * allocation groups upward, wrapping at the end.
	 * (agi_freecount is on-disk endian, but testing it against zero
	 * without conversion is fine - zero is zero in any byte order.)
	 */
	*alloc_done = B_FALSE;
	while (!agi->agi_freecount) {
		/*
		 * Don't do anything if we're not supposed to allocate
		 * any blocks, just go on to the next ag.
		 */
		if (okalloc) {
			/*
			 * Try to allocate some new inodes in the allocation
			 * group.
			 */
			if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
				xfs_trans_brelse(tp, agbp);
				if (error == ENOSPC) {
					*inop = NULLFSINO;
					return 0;
				} else
					return error;
			}
			if (ialloced) {
				/*
				 * We successfully allocated some inodes, return
				 * the current context to the caller so that it
				 * can commit the current transaction and call
				 * us again where we left off.
				 */
				ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
				*alloc_done = B_TRUE;
				*IO_agbp = agbp;
				*inop = NULLFSINO;
				return 0;
			}
		}
		/*
		 * If it failed, give up on this ag.
		 */
		xfs_trans_brelse(tp, agbp);
		/*
		 * Go on to the next ag: get its ag header.
		 */
nextag:
		if (++tagno == agcount)
			tagno = 0;
		if (tagno == agno) {
			/* Wrapped all the way around without success. */
			*inop = NULLFSINO;
			return noroom ? ENOSPC : 0;
		}
		down_read(&mp->m_peraglock);
		if (mp->m_perag[tagno].pagi_inodeok == 0) {
			up_read(&mp->m_peraglock);
			goto nextag;
		}
		error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
		up_read(&mp->m_peraglock);
		if (error)
			goto nextag;
		agi = XFS_BUF_TO_AGI(agbp);
		ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
	}
	/*
	 * Here with an allocation group that has a free inode.
	 * Reset agno since we may have chosen a new ag in the
	 * loop above.
	 */
	agno = tagno;
	*IO_agbp = NULL;
	cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);
#ifdef DEBUG
	/*
	 * Sanity check: for a single-level btree, walk all records and
	 * verify the summed per-record free counts match the AGI header.
	 */
	if (cur->bc_nlevels == 1) {
		int freecount = 0;

		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
			goto error0;
		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
		do {
			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
					&rec.ir_freecount, &rec.ir_free, &i)))
				goto error0;
			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
			freecount += rec.ir_freecount;
			if ((error = xfs_btree_increment(cur, 0, &i)))
				goto error0;
		} while (i == 1);

		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
		       XFS_FORCED_SHUTDOWN(mp));
	}
#endif
	/*
	 * If in the same a.g. as the parent, try to get near the parent.
	 */
	if (pagno == agno) {
		if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
			goto error0;
		if (i != 0 &&
		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
			    &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
		    j == 1 &&
		    rec.ir_freecount > 0) {
			/*
			 * Found a free inode in the same chunk
			 * as parent, done.
			 */
		}
		/*
		 * In the same a.g. as parent, but parent's chunk is full.
		 */
		else {
			int	doneleft;	/* done, to the left */
			int	doneright;	/* done, to the right */

			if (error)
				goto error0;
			ASSERT(i == 1);
			ASSERT(j == 1);
			/*
			 * Duplicate the cursor, search left & right
			 * simultaneously.
			 */
			if ((error = xfs_btree_dup_cursor(cur, &tcur)))
				goto error0;
			/*
			 * Search left with tcur, back up 1 record.
			 */
			if ((error = xfs_btree_decrement(tcur, 0, &i)))
				goto error1;
			doneleft = !i;
			if (!doneleft) {
				if ((error = xfs_inobt_get_rec(tcur,
						&trec.ir_startino,
						&trec.ir_freecount,
						&trec.ir_free, &i)))
					goto error1;
				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
			}
			/*
			 * Search right with cur, go forward 1 record.
			 */
			if ((error = xfs_btree_increment(cur, 0, &i)))
				goto error1;
			doneright = !i;
			if (!doneright) {
				if ((error = xfs_inobt_get_rec(cur,
						&rec.ir_startino,
						&rec.ir_freecount,
						&rec.ir_free, &i)))
					goto error1;
				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
			}
			/*
			 * Loop until we find the closest inode chunk
			 * with a free one.
			 */
			while (!doneleft || !doneright) {
				int	useleft;  /* using left inode
						     chunk this time */

				/*
				 * Figure out which block is closer,
				 * if both are valid.  Distance is measured
				 * from pagino to the nearest edge of each
				 * candidate chunk.
				 */
				if (!doneleft && !doneright)
					useleft =
						pagino -
						(trec.ir_startino +
						 XFS_INODES_PER_CHUNK - 1) <
						 rec.ir_startino - pagino;
				else
					useleft = !doneleft;
				/*
				 * If checking the left, does it have
				 * free inodes?
				 */
				if (useleft && trec.ir_freecount) {
					/*
					 * Yes, set it up as the chunk to use.
					 */
					rec = trec;
					xfs_btree_del_cursor(cur,
						XFS_BTREE_NOERROR);
					cur = tcur;
					break;
				}
				/*
				 * If checking the right, does it have
				 * free inodes?
				 */
				if (!useleft && rec.ir_freecount) {
					/*
					 * Yes, it's already set up.
					 */
					xfs_btree_del_cursor(tcur,
						XFS_BTREE_NOERROR);
					break;
				}
				/*
				 * If used the left, get another one
				 * further left.
				 */
				if (useleft) {
					if ((error = xfs_btree_decrement(tcur, 0,
							&i)))
						goto error1;
					doneleft = !i;
					if (!doneleft) {
						if ((error = xfs_inobt_get_rec(
							    tcur,
							    &trec.ir_startino,
							    &trec.ir_freecount,
							    &trec.ir_free, &i)))
							goto error1;
						XFS_WANT_CORRUPTED_GOTO(i == 1,
							error1);
					}
				}
				/*
				 * If used the right, get another one
				 * further right.
				 */
				else {
					if ((error = xfs_btree_increment(cur, 0,
							&i)))
						goto error1;
					doneright = !i;
					if (!doneright) {
						if ((error = xfs_inobt_get_rec(
							    cur,
							    &rec.ir_startino,
							    &rec.ir_freecount,
							    &rec.ir_free, &i)))
							goto error1;
						XFS_WANT_CORRUPTED_GOTO(i == 1,
							error1);
					}
				}
			}
			ASSERT(!doneleft || !doneright);
		}
	}
	/*
	 * In a different a.g. from the parent.
	 * See if the most recently allocated block has any free.
	 */
	else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
		if ((error = xfs_inobt_lookup_eq(cur,
				be32_to_cpu(agi->agi_newino), 0, 0, &i)))
			goto error0;
		if (i == 1 &&
		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
			    &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
		    j == 1 &&
		    rec.ir_freecount > 0) {
			/*
			 * The last chunk allocated in the group still has
			 * a free inode.
			 */
		}
		/*
		 * None left in the last group, search the whole a.g.
		 */
		else {
			if (error)
				goto error0;
			if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
				goto error0;
			ASSERT(i == 1);
			for (;;) {
				if ((error = xfs_inobt_get_rec(cur,
						&rec.ir_startino,
						&rec.ir_freecount, &rec.ir_free,
						&i)))
					goto error0;
				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
				if (rec.ir_freecount > 0)
					break;
				if ((error = xfs_btree_increment(cur, 0, &i)))
					goto error0;
				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
			}
		}
	}
	/*
	 * 'rec' now holds a chunk with at least one free inode: claim it
	 * and update the btree record, AGI header, per-ag counters and
	 * superblock free-inode count.
	 */
	offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);
	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
	XFS_INOBT_CLR_FREE(&rec, offset);
	rec.ir_freecount--;
	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
			rec.ir_free)))
		goto error0;
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	down_read(&mp->m_peraglock);
	mp->m_perag[tagno].pagi_freecount--;
	up_read(&mp->m_peraglock);
#ifdef DEBUG
	/* Re-verify freecount consistency after the update (see above). */
	if (cur->bc_nlevels == 1) {
		int freecount = 0;

		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
			goto error0;
		do {
			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
					&rec.ir_freecount, &rec.ir_free, &i)))
				goto error0;
			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
			freecount += rec.ir_freecount;
			if ((error = xfs_btree_increment(cur, 0, &i)))
				goto error0;
		} while (i == 1);
		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
		       XFS_FORCED_SHUTDOWN(mp));
	}
#endif
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
	*inop = ino;
	return 0;
error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
986 985
987 /* 986 /*
988 * Free disk inode. Carefully avoids touching the incore inode, all 987 * Free disk inode. Carefully avoids touching the incore inode, all
989 * manipulations incore are the caller's responsibility. 988 * manipulations incore are the caller's responsibility.
990 * The on-disk inode is not changed by this operation, only the 989 * The on-disk inode is not changed by this operation, only the
991 * btree (free inode mask) is changed. 990 * btree (free inode mask) is changed.
992 */ 991 */
993 int 992 int
994 xfs_difree( 993 xfs_difree(
995 xfs_trans_t *tp, /* transaction pointer */ 994 xfs_trans_t *tp, /* transaction pointer */
996 xfs_ino_t inode, /* inode to be freed */ 995 xfs_ino_t inode, /* inode to be freed */
997 xfs_bmap_free_t *flist, /* extents to free */ 996 xfs_bmap_free_t *flist, /* extents to free */
998 int *delete, /* set if inode cluster was deleted */ 997 int *delete, /* set if inode cluster was deleted */
999 xfs_ino_t *first_ino) /* first inode in deleted cluster */ 998 xfs_ino_t *first_ino) /* first inode in deleted cluster */
1000 { 999 {
1001 /* REFERENCED */ 1000 /* REFERENCED */
1002 xfs_agblock_t agbno; /* block number containing inode */ 1001 xfs_agblock_t agbno; /* block number containing inode */
1003 xfs_buf_t *agbp; /* buffer containing allocation group header */ 1002 xfs_buf_t *agbp; /* buffer containing allocation group header */
1004 xfs_agino_t agino; /* inode number relative to allocation group */ 1003 xfs_agino_t agino; /* inode number relative to allocation group */
1005 xfs_agnumber_t agno; /* allocation group number */ 1004 xfs_agnumber_t agno; /* allocation group number */
1006 xfs_agi_t *agi; /* allocation group header */ 1005 xfs_agi_t *agi; /* allocation group header */
1007 xfs_btree_cur_t *cur; /* inode btree cursor */ 1006 xfs_btree_cur_t *cur; /* inode btree cursor */
1008 int error; /* error return value */ 1007 int error; /* error return value */
1009 int i; /* result code */ 1008 int i; /* result code */
1010 int ilen; /* inodes in an inode cluster */ 1009 int ilen; /* inodes in an inode cluster */
1011 xfs_mount_t *mp; /* mount structure for filesystem */ 1010 xfs_mount_t *mp; /* mount structure for filesystem */
1012 int off; /* offset of inode in inode chunk */ 1011 int off; /* offset of inode in inode chunk */
1013 xfs_inobt_rec_incore_t rec; /* btree record */ 1012 xfs_inobt_rec_incore_t rec; /* btree record */
1014 1013
1015 mp = tp->t_mountp; 1014 mp = tp->t_mountp;
1016 1015
1017 /* 1016 /*
1018 * Break up inode number into its components. 1017 * Break up inode number into its components.
1019 */ 1018 */
1020 agno = XFS_INO_TO_AGNO(mp, inode); 1019 agno = XFS_INO_TO_AGNO(mp, inode);
1021 if (agno >= mp->m_sb.sb_agcount) { 1020 if (agno >= mp->m_sb.sb_agcount) {
1022 cmn_err(CE_WARN, 1021 cmn_err(CE_WARN,
1023 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", 1022 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.",
1024 agno, mp->m_sb.sb_agcount, mp->m_fsname); 1023 agno, mp->m_sb.sb_agcount, mp->m_fsname);
1025 ASSERT(0); 1024 ASSERT(0);
1026 return XFS_ERROR(EINVAL); 1025 return XFS_ERROR(EINVAL);
1027 } 1026 }
1028 agino = XFS_INO_TO_AGINO(mp, inode); 1027 agino = XFS_INO_TO_AGINO(mp, inode);
1029 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1028 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1030 cmn_err(CE_WARN, 1029 cmn_err(CE_WARN,
1031 "xfs_difree: inode != XFS_AGINO_TO_INO() " 1030 "xfs_difree: inode != XFS_AGINO_TO_INO() "
1032 "(%llu != %llu) on %s. Returning EINVAL.", 1031 "(%llu != %llu) on %s. Returning EINVAL.",
1033 (unsigned long long)inode, 1032 (unsigned long long)inode,
1034 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino), 1033 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
1035 mp->m_fsname); 1034 mp->m_fsname);
1036 ASSERT(0); 1035 ASSERT(0);
1037 return XFS_ERROR(EINVAL); 1036 return XFS_ERROR(EINVAL);
1038 } 1037 }
1039 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1038 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1040 if (agbno >= mp->m_sb.sb_agblocks) { 1039 if (agbno >= mp->m_sb.sb_agblocks) {
1041 cmn_err(CE_WARN, 1040 cmn_err(CE_WARN,
1042 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", 1041 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.",
1043 agbno, mp->m_sb.sb_agblocks, mp->m_fsname); 1042 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
1044 ASSERT(0); 1043 ASSERT(0);
1045 return XFS_ERROR(EINVAL); 1044 return XFS_ERROR(EINVAL);
1046 } 1045 }
1047 /* 1046 /*
1048 * Get the allocation group header. 1047 * Get the allocation group header.
1049 */ 1048 */
1050 down_read(&mp->m_peraglock); 1049 down_read(&mp->m_peraglock);
1051 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1050 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1052 up_read(&mp->m_peraglock); 1051 up_read(&mp->m_peraglock);
1053 if (error) { 1052 if (error) {
1054 cmn_err(CE_WARN, 1053 cmn_err(CE_WARN,
1055 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1054 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
1056 error, mp->m_fsname); 1055 error, mp->m_fsname);
1057 return error; 1056 return error;
1058 } 1057 }
1059 agi = XFS_BUF_TO_AGI(agbp); 1058 agi = XFS_BUF_TO_AGI(agbp);
1060 ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); 1059 ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
1061 ASSERT(agbno < be32_to_cpu(agi->agi_length)); 1060 ASSERT(agbno < be32_to_cpu(agi->agi_length));
1062 /* 1061 /*
1063 * Initialize the cursor. 1062 * Initialize the cursor.
1064 */ 1063 */
1065 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1066 #ifdef DEBUG 1065 #ifdef DEBUG
1067 if (cur->bc_nlevels == 1) { 1066 if (cur->bc_nlevels == 1) {
1068 int freecount = 0; 1067 int freecount = 0;
1069 1068
1070 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1069 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1071 goto error0; 1070 goto error0;
1072 do { 1071 do {
1073 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1072 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
1074 &rec.ir_freecount, &rec.ir_free, &i))) 1073 &rec.ir_freecount, &rec.ir_free, &i)))
1075 goto error0; 1074 goto error0;
1076 if (i) { 1075 if (i) {
1077 freecount += rec.ir_freecount; 1076 freecount += rec.ir_freecount;
1078 if ((error = xfs_btree_increment(cur, 0, &i))) 1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1079 goto error0; 1078 goto error0;
1080 } 1079 }
1081 } while (i == 1); 1080 } while (i == 1);
1082 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || 1081 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1083 XFS_FORCED_SHUTDOWN(mp)); 1082 XFS_FORCED_SHUTDOWN(mp));
1084 } 1083 }
1085 #endif 1084 #endif
1086 /* 1085 /*
1087 * Look for the entry describing this inode. 1086 * Look for the entry describing this inode.
1088 */ 1087 */
1089 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1088 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
1090 cmn_err(CE_WARN, 1089 cmn_err(CE_WARN,
1091 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", 1090 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.",
1092 error, mp->m_fsname); 1091 error, mp->m_fsname);
1093 goto error0; 1092 goto error0;
1094 } 1093 }
1095 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1094 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1096 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, 1095 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
1097 &rec.ir_free, &i))) { 1096 &rec.ir_free, &i))) {
1098 cmn_err(CE_WARN, 1097 cmn_err(CE_WARN,
1099 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1098 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
1100 error, mp->m_fsname); 1099 error, mp->m_fsname);
1101 goto error0; 1100 goto error0;
1102 } 1101 }
1103 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1102 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1104 /* 1103 /*
1105 * Get the offset in the inode chunk. 1104 * Get the offset in the inode chunk.
1106 */ 1105 */
1107 off = agino - rec.ir_startino; 1106 off = agino - rec.ir_startino;
1108 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); 1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1109 ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); 1108 ASSERT(!XFS_INOBT_IS_FREE(&rec, off));
1110 /* 1109 /*
1111 * Mark the inode free & increment the count. 1110 * Mark the inode free & increment the count.
1112 */ 1111 */
1113 XFS_INOBT_SET_FREE(&rec, off); 1112 XFS_INOBT_SET_FREE(&rec, off);
1114 rec.ir_freecount++; 1113 rec.ir_freecount++;
1115 1114
1116 /* 1115 /*
1117 * When an inode cluster is free, it becomes eligible for removal 1116 * When an inode cluster is free, it becomes eligible for removal
1118 */ 1117 */
1119 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1118 if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1120 (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { 1119 (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
1121 1120
1122 *delete = 1; 1121 *delete = 1;
1123 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1122 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1124 1123
1125 /* 1124 /*
1126 * Remove the inode cluster from the AGI B+Tree, adjust the 1125 * Remove the inode cluster from the AGI B+Tree, adjust the
1127 * AGI and Superblock inode counts, and mark the disk space 1126 * AGI and Superblock inode counts, and mark the disk space
1128 * to be freed when the transaction is committed. 1127 * to be freed when the transaction is committed.
1129 */ 1128 */
1130 ilen = XFS_IALLOC_INODES(mp); 1129 ilen = XFS_IALLOC_INODES(mp);
1131 be32_add_cpu(&agi->agi_count, -ilen); 1130 be32_add_cpu(&agi->agi_count, -ilen);
1132 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1131 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1133 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1132 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1134 down_read(&mp->m_peraglock); 1133 down_read(&mp->m_peraglock);
1135 mp->m_perag[agno].pagi_freecount -= ilen - 1; 1134 mp->m_perag[agno].pagi_freecount -= ilen - 1;
1136 up_read(&mp->m_peraglock); 1135 up_read(&mp->m_peraglock);
1137 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1136 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1138 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1137 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1139 1138
1140 if ((error = xfs_btree_delete(cur, &i))) { 1139 if ((error = xfs_btree_delete(cur, &i))) {
1141 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", 1140 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
1142 error, mp->m_fsname); 1141 error, mp->m_fsname);
1143 goto error0; 1142 goto error0;
1144 } 1143 }
1145 1144
1146 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, 1145 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
1147 agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), 1146 agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
1148 XFS_IALLOC_BLOCKS(mp), flist, mp); 1147 XFS_IALLOC_BLOCKS(mp), flist, mp);
1149 } else { 1148 } else {
1150 *delete = 0; 1149 *delete = 0;
1151 1150
1152 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { 1151 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
1153 cmn_err(CE_WARN, 1152 cmn_err(CE_WARN,
1154 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", 1153 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
1155 error, mp->m_fsname); 1154 error, mp->m_fsname);
1156 goto error0; 1155 goto error0;
1157 } 1156 }
1158 /* 1157 /*
1159 * Change the inode free counts and log the ag/sb changes. 1158 * Change the inode free counts and log the ag/sb changes.
1160 */ 1159 */
1161 be32_add_cpu(&agi->agi_freecount, 1); 1160 be32_add_cpu(&agi->agi_freecount, 1);
1162 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1161 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1163 down_read(&mp->m_peraglock); 1162 down_read(&mp->m_peraglock);
1164 mp->m_perag[agno].pagi_freecount++; 1163 mp->m_perag[agno].pagi_freecount++;
1165 up_read(&mp->m_peraglock); 1164 up_read(&mp->m_peraglock);
1166 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1165 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1167 } 1166 }
1168 1167
1169 #ifdef DEBUG 1168 #ifdef DEBUG
1170 if (cur->bc_nlevels == 1) { 1169 if (cur->bc_nlevels == 1) {
1171 int freecount = 0; 1170 int freecount = 0;
1172 1171
1173 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1172 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1174 goto error0; 1173 goto error0;
1175 do { 1174 do {
1176 if ((error = xfs_inobt_get_rec(cur, 1175 if ((error = xfs_inobt_get_rec(cur,
1177 &rec.ir_startino, 1176 &rec.ir_startino,
1178 &rec.ir_freecount, 1177 &rec.ir_freecount,
1179 &rec.ir_free, &i))) 1178 &rec.ir_free, &i)))
1180 goto error0; 1179 goto error0;
1181 if (i) { 1180 if (i) {
1182 freecount += rec.ir_freecount; 1181 freecount += rec.ir_freecount;
1183 if ((error = xfs_btree_increment(cur, 0, &i))) 1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1184 goto error0; 1183 goto error0;
1185 } 1184 }
1186 } while (i == 1); 1185 } while (i == 1);
1187 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || 1186 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1188 XFS_FORCED_SHUTDOWN(mp)); 1187 XFS_FORCED_SHUTDOWN(mp));
1189 } 1188 }
1190 #endif 1189 #endif
1191 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1190 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1192 return 0; 1191 return 0;
1193 1192
1194 error0: 1193 error0:
1195 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 1194 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1196 return error; 1195 return error;
1197 } 1196 }
1198 1197
1199 /* 1198 /*
1200 * Return the location of the inode in imap, for mapping it into a buffer. 1199 * Return the location of the inode in imap, for mapping it into a buffer.
1201 */ 1200 */
1202 int 1201 int
1203 xfs_imap( 1202 xfs_imap(
1204 xfs_mount_t *mp, /* file system mount structure */ 1203 xfs_mount_t *mp, /* file system mount structure */
1205 xfs_trans_t *tp, /* transaction pointer */ 1204 xfs_trans_t *tp, /* transaction pointer */
1206 xfs_ino_t ino, /* inode to locate */ 1205 xfs_ino_t ino, /* inode to locate */
1207 struct xfs_imap *imap, /* location map structure */ 1206 struct xfs_imap *imap, /* location map structure */
1208 uint flags) /* flags for inode btree lookup */ 1207 uint flags) /* flags for inode btree lookup */
1209 { 1208 {
1210 xfs_agblock_t agbno; /* block number of inode in the alloc group */ 1209 xfs_agblock_t agbno; /* block number of inode in the alloc group */
1211 xfs_agino_t agino; /* inode number within alloc group */ 1210 xfs_agino_t agino; /* inode number within alloc group */
1212 xfs_agnumber_t agno; /* allocation group number */ 1211 xfs_agnumber_t agno; /* allocation group number */
1213 int blks_per_cluster; /* num blocks per inode cluster */ 1212 int blks_per_cluster; /* num blocks per inode cluster */
1214 xfs_agblock_t chunk_agbno; /* first block in inode chunk */ 1213 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
1215 xfs_agblock_t cluster_agbno; /* first block in inode cluster */ 1214 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1216 int error; /* error code */ 1215 int error; /* error code */
1217 int offset; /* index of inode in its buffer */ 1216 int offset; /* index of inode in its buffer */
1218 int offset_agbno; /* blks from chunk start to inode */ 1217 int offset_agbno; /* blks from chunk start to inode */
1219 1218
1220 ASSERT(ino != NULLFSINO); 1219 ASSERT(ino != NULLFSINO);
1221 1220
1222 /* 1221 /*
1223 * Split up the inode number into its parts. 1222 * Split up the inode number into its parts.
1224 */ 1223 */
1225 agno = XFS_INO_TO_AGNO(mp, ino); 1224 agno = XFS_INO_TO_AGNO(mp, ino);
1226 agino = XFS_INO_TO_AGINO(mp, ino); 1225 agino = XFS_INO_TO_AGINO(mp, ino);
1227 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1226 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1228 if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || 1227 if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
1229 ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1228 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1230 #ifdef DEBUG 1229 #ifdef DEBUG
1231 /* no diagnostics for bulkstat, ino comes from userspace */ 1230 /* no diagnostics for bulkstat, ino comes from userspace */
1232 if (flags & XFS_IMAP_BULKSTAT) 1231 if (flags & XFS_IMAP_BULKSTAT)
1233 return XFS_ERROR(EINVAL); 1232 return XFS_ERROR(EINVAL);
1234 if (agno >= mp->m_sb.sb_agcount) { 1233 if (agno >= mp->m_sb.sb_agcount) {
1235 xfs_fs_cmn_err(CE_ALERT, mp, 1234 xfs_fs_cmn_err(CE_ALERT, mp,
1236 "xfs_imap: agno (%d) >= " 1235 "xfs_imap: agno (%d) >= "
1237 "mp->m_sb.sb_agcount (%d)", 1236 "mp->m_sb.sb_agcount (%d)",
1238 agno, mp->m_sb.sb_agcount); 1237 agno, mp->m_sb.sb_agcount);
1239 } 1238 }
1240 if (agbno >= mp->m_sb.sb_agblocks) { 1239 if (agbno >= mp->m_sb.sb_agblocks) {
1241 xfs_fs_cmn_err(CE_ALERT, mp, 1240 xfs_fs_cmn_err(CE_ALERT, mp,
1242 "xfs_imap: agbno (0x%llx) >= " 1241 "xfs_imap: agbno (0x%llx) >= "
1243 "mp->m_sb.sb_agblocks (0x%lx)", 1242 "mp->m_sb.sb_agblocks (0x%lx)",
1244 (unsigned long long) agbno, 1243 (unsigned long long) agbno,
1245 (unsigned long) mp->m_sb.sb_agblocks); 1244 (unsigned long) mp->m_sb.sb_agblocks);
1246 } 1245 }
1247 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1246 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1248 xfs_fs_cmn_err(CE_ALERT, mp, 1247 xfs_fs_cmn_err(CE_ALERT, mp,
1249 "xfs_imap: ino (0x%llx) != " 1248 "xfs_imap: ino (0x%llx) != "
1250 "XFS_AGINO_TO_INO(mp, agno, agino) " 1249 "XFS_AGINO_TO_INO(mp, agno, agino) "
1251 "(0x%llx)", 1250 "(0x%llx)",
1252 ino, XFS_AGINO_TO_INO(mp, agno, agino)); 1251 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1253 } 1252 }
1254 xfs_stack_trace(); 1253 xfs_stack_trace();
1255 #endif /* DEBUG */ 1254 #endif /* DEBUG */
1256 return XFS_ERROR(EINVAL); 1255 return XFS_ERROR(EINVAL);
1257 } 1256 }
1258 1257
1259 /* 1258 /*
1260 * If the inode cluster size is the same as the blocksize or 1259 * If the inode cluster size is the same as the blocksize or
1261 * smaller we get to the buffer by simple arithmetics. 1260 * smaller we get to the buffer by simple arithmetics.
1262 */ 1261 */
1263 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { 1262 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
1264 offset = XFS_INO_TO_OFFSET(mp, ino); 1263 offset = XFS_INO_TO_OFFSET(mp, ino);
1265 ASSERT(offset < mp->m_sb.sb_inopblock); 1264 ASSERT(offset < mp->m_sb.sb_inopblock);
1266 1265
1267 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); 1266 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
1268 imap->im_len = XFS_FSB_TO_BB(mp, 1); 1267 imap->im_len = XFS_FSB_TO_BB(mp, 1);
1269 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); 1268 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1270 return 0; 1269 return 0;
1271 } 1270 }
1272 1271
1273 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1272 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
1274 1273
1275 /* 1274 /*
1276 * If we get a block number passed from bulkstat we can use it to 1275 * If we get a block number passed from bulkstat we can use it to
1277 * find the buffer easily. 1276 * find the buffer easily.
1278 */ 1277 */
1279 if (imap->im_blkno) { 1278 if (imap->im_blkno) {
1280 offset = XFS_INO_TO_OFFSET(mp, ino); 1279 offset = XFS_INO_TO_OFFSET(mp, ino);
1281 ASSERT(offset < mp->m_sb.sb_inopblock); 1280 ASSERT(offset < mp->m_sb.sb_inopblock);
1282 1281
1283 cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno); 1282 cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
1284 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock; 1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
1285 1284
1286 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); 1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1287 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); 1286 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1288 return 0; 1287 return 0;
1289 } 1288 }
1290 1289
1291 /* 1290 /*
1292 * If the inode chunks are aligned then use simple maths to 1291 * If the inode chunks are aligned then use simple maths to
1293 * find the location. Otherwise we have to do a btree 1292 * find the location. Otherwise we have to do a btree
1294 * lookup to find the location. 1293 * lookup to find the location.
1295 */ 1294 */
1296 if (mp->m_inoalign_mask) { 1295 if (mp->m_inoalign_mask) {
1297 offset_agbno = agbno & mp->m_inoalign_mask; 1296 offset_agbno = agbno & mp->m_inoalign_mask;
1298 chunk_agbno = agbno - offset_agbno; 1297 chunk_agbno = agbno - offset_agbno;
1299 } else { 1298 } else {
1300 xfs_btree_cur_t *cur; /* inode btree cursor */ 1299 xfs_btree_cur_t *cur; /* inode btree cursor */
1301 xfs_agino_t chunk_agino; /* first agino in inode chunk */ 1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1302 __int32_t chunk_cnt; /* count of free inodes in chunk */ 1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1303 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */ 1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1304 xfs_buf_t *agbp; /* agi buffer */ 1303 xfs_buf_t *agbp; /* agi buffer */
1305 int i; /* temp state */ 1304 int i; /* temp state */
1306 1305
1307 down_read(&mp->m_peraglock); 1306 down_read(&mp->m_peraglock);
1308 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1307 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1309 up_read(&mp->m_peraglock); 1308 up_read(&mp->m_peraglock);
1310 if (error) { 1309 if (error) {
1311 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1310 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1312 "xfs_ialloc_read_agi() returned " 1311 "xfs_ialloc_read_agi() returned "
1313 "error %d, agno %d", 1312 "error %d, agno %d",
1314 error, agno); 1313 error, agno);
1315 return error; 1314 return error;
1316 } 1315 }
1317 1316
1318 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1319 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); 1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
1320 if (error) { 1319 if (error) {
1321 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1322 "xfs_inobt_lookup_le() failed"); 1321 "xfs_inobt_lookup_le() failed");
1323 goto error0; 1322 goto error0;
1324 } 1323 }
1325 1324
1326 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
1327 &chunk_free, &i); 1326 &chunk_free, &i);
1328 if (error) { 1327 if (error) {
1329 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1330 "xfs_inobt_get_rec() failed"); 1329 "xfs_inobt_get_rec() failed");
1331 goto error0; 1330 goto error0;
1332 } 1331 }
1333 if (i == 0) { 1332 if (i == 0) {
1334 #ifdef DEBUG 1333 #ifdef DEBUG
1335 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1334 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1336 "xfs_inobt_get_rec() failed"); 1335 "xfs_inobt_get_rec() failed");
1337 #endif /* DEBUG */ 1336 #endif /* DEBUG */
1338 error = XFS_ERROR(EINVAL); 1337 error = XFS_ERROR(EINVAL);
1339 } 1338 }
1340 error0: 1339 error0:
1341 xfs_trans_brelse(tp, agbp); 1340 xfs_trans_brelse(tp, agbp);
1342 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1343 if (error) 1342 if (error)
1344 return error; 1343 return error;
1345 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
1346 offset_agbno = agbno - chunk_agbno; 1345 offset_agbno = agbno - chunk_agbno;
1347 } 1346 }
1348 1347
1349 ASSERT(agbno >= chunk_agbno); 1348 ASSERT(agbno >= chunk_agbno);
1350 cluster_agbno = chunk_agbno + 1349 cluster_agbno = chunk_agbno +
1351 ((offset_agbno / blks_per_cluster) * blks_per_cluster); 1350 ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1352 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1351 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1353 XFS_INO_TO_OFFSET(mp, ino); 1352 XFS_INO_TO_OFFSET(mp, ino);
1354 1353
1355 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); 1354 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
1356 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); 1355 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1357 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); 1356 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1358 1357
1359 /* 1358 /*
1360 * If the inode number maps to a block outside the bounds 1359 * If the inode number maps to a block outside the bounds
1361 * of the file system then return NULL rather than calling 1360 * of the file system then return NULL rather than calling
1362 * read_buf and panicing when we get an error from the 1361 * read_buf and panicing when we get an error from the
1363 * driver. 1362 * driver.
1364 */ 1363 */
1365 if ((imap->im_blkno + imap->im_len) > 1364 if ((imap->im_blkno + imap->im_len) >
1366 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 1365 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1367 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1366 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1368 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " 1367 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
1369 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", 1368 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
1370 (unsigned long long) imap->im_blkno, 1369 (unsigned long long) imap->im_blkno,
1371 (unsigned long long) imap->im_len, 1370 (unsigned long long) imap->im_len,
1372 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1371 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1373 return XFS_ERROR(EINVAL); 1372 return XFS_ERROR(EINVAL);
1374 } 1373 }
1375 1374
1376 return 0; 1375 return 0;
1377 } 1376 }
1378 1377
1379 /* 1378 /*
1380 * Compute and fill in value of m_in_maxlevels. 1379 * Compute and fill in value of m_in_maxlevels.
1381 */ 1380 */
1382 void 1381 void
1383 xfs_ialloc_compute_maxlevels( 1382 xfs_ialloc_compute_maxlevels(
1384 xfs_mount_t *mp) /* file system mount structure */ 1383 xfs_mount_t *mp) /* file system mount structure */
1385 { 1384 {
1386 int level; 1385 int level;
1387 uint maxblocks; 1386 uint maxblocks;
1388 uint maxleafents; 1387 uint maxleafents;
1389 int minleafrecs; 1388 int minleafrecs;
1390 int minnoderecs; 1389 int minnoderecs;
1391 1390
1392 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> 1391 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
1393 XFS_INODES_PER_CHUNK_LOG; 1392 XFS_INODES_PER_CHUNK_LOG;
1394 minleafrecs = mp->m_alloc_mnr[0]; 1393 minleafrecs = mp->m_alloc_mnr[0];
1395 minnoderecs = mp->m_alloc_mnr[1]; 1394 minnoderecs = mp->m_alloc_mnr[1];
1396 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; 1395 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1397 for (level = 1; maxblocks > 1; level++) 1396 for (level = 1; maxblocks > 1; level++)
1398 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; 1397 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1399 mp->m_in_maxlevels = level; 1398 mp->m_in_maxlevels = level;
1400 } 1399 }
1401 1400
1402 /* 1401 /*
1403 * Log specified fields for the ag hdr (inode section) 1402 * Log specified fields for the ag hdr (inode section)
1404 */ 1403 */
1405 void 1404 void
1406 xfs_ialloc_log_agi( 1405 xfs_ialloc_log_agi(
1407 xfs_trans_t *tp, /* transaction pointer */ 1406 xfs_trans_t *tp, /* transaction pointer */
1408 xfs_buf_t *bp, /* allocation group header buffer */ 1407 xfs_buf_t *bp, /* allocation group header buffer */
1409 int fields) /* bitmask of fields to log */ 1408 int fields) /* bitmask of fields to log */
1410 { 1409 {
1411 int first; /* first byte number */ 1410 int first; /* first byte number */
1412 int last; /* last byte number */ 1411 int last; /* last byte number */
1413 static const short offsets[] = { /* field starting offsets */ 1412 static const short offsets[] = { /* field starting offsets */
1414 /* keep in sync with bit definitions */ 1413 /* keep in sync with bit definitions */
1415 offsetof(xfs_agi_t, agi_magicnum), 1414 offsetof(xfs_agi_t, agi_magicnum),
1416 offsetof(xfs_agi_t, agi_versionnum), 1415 offsetof(xfs_agi_t, agi_versionnum),
1417 offsetof(xfs_agi_t, agi_seqno), 1416 offsetof(xfs_agi_t, agi_seqno),
1418 offsetof(xfs_agi_t, agi_length), 1417 offsetof(xfs_agi_t, agi_length),
1419 offsetof(xfs_agi_t, agi_count), 1418 offsetof(xfs_agi_t, agi_count),
1420 offsetof(xfs_agi_t, agi_root), 1419 offsetof(xfs_agi_t, agi_root),
1421 offsetof(xfs_agi_t, agi_level), 1420 offsetof(xfs_agi_t, agi_level),
1422 offsetof(xfs_agi_t, agi_freecount), 1421 offsetof(xfs_agi_t, agi_freecount),
1423 offsetof(xfs_agi_t, agi_newino), 1422 offsetof(xfs_agi_t, agi_newino),
1424 offsetof(xfs_agi_t, agi_dirino), 1423 offsetof(xfs_agi_t, agi_dirino),
1425 offsetof(xfs_agi_t, agi_unlinked), 1424 offsetof(xfs_agi_t, agi_unlinked),
1426 sizeof(xfs_agi_t) 1425 sizeof(xfs_agi_t)
1427 }; 1426 };
1428 #ifdef DEBUG 1427 #ifdef DEBUG
1429 xfs_agi_t *agi; /* allocation group header */ 1428 xfs_agi_t *agi; /* allocation group header */
1430 1429
1431 agi = XFS_BUF_TO_AGI(bp); 1430 agi = XFS_BUF_TO_AGI(bp);
1432 ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); 1431 ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
1433 #endif 1432 #endif
1434 /* 1433 /*
1435 * Compute byte offsets for the first and last fields. 1434 * Compute byte offsets for the first and last fields.
1436 */ 1435 */
1437 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); 1436 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
1438 /* 1437 /*
1439 * Log the allocation group inode header buffer. 1438 * Log the allocation group inode header buffer.
1440 */ 1439 */
1441 xfs_trans_log_buf(tp, bp, first, last); 1440 xfs_trans_log_buf(tp, bp, first, last);
1442 } 1441 }
1443 1442
1444 #ifdef DEBUG 1443 #ifdef DEBUG
1445 STATIC void 1444 STATIC void
1446 xfs_check_agi_unlinked( 1445 xfs_check_agi_unlinked(
1447 struct xfs_agi *agi) 1446 struct xfs_agi *agi)
1448 { 1447 {
1449 int i; 1448 int i;
1450 1449
1451 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) 1450 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1452 ASSERT(agi->agi_unlinked[i]); 1451 ASSERT(agi->agi_unlinked[i]);
1453 } 1452 }
1454 #else 1453 #else
1455 #define xfs_check_agi_unlinked(agi) 1454 #define xfs_check_agi_unlinked(agi)
1456 #endif 1455 #endif
1457 1456
1458 /* 1457 /*
1459 * Read in the allocation group header (inode allocation section) 1458 * Read in the allocation group header (inode allocation section)
1460 */ 1459 */
1461 int 1460 int
1462 xfs_read_agi( 1461 xfs_read_agi(
1463 struct xfs_mount *mp, /* file system mount structure */ 1462 struct xfs_mount *mp, /* file system mount structure */
1464 struct xfs_trans *tp, /* transaction pointer */ 1463 struct xfs_trans *tp, /* transaction pointer */
1465 xfs_agnumber_t agno, /* allocation group number */ 1464 xfs_agnumber_t agno, /* allocation group number */
1466 struct xfs_buf **bpp) /* allocation group hdr buf */ 1465 struct xfs_buf **bpp) /* allocation group hdr buf */
1467 { 1466 {
1468 struct xfs_agi *agi; /* allocation group header */ 1467 struct xfs_agi *agi; /* allocation group header */
1469 int agi_ok; /* agi is consistent */ 1468 int agi_ok; /* agi is consistent */
1470 int error; 1469 int error;
1471 1470
1472 ASSERT(agno != NULLAGNUMBER); 1471 ASSERT(agno != NULLAGNUMBER);
1473 1472
1474 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 1473 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
1475 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 1474 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1476 XFS_FSS_TO_BB(mp, 1), 0, bpp); 1475 XFS_FSS_TO_BB(mp, 1), 0, bpp);
1477 if (error) 1476 if (error)
1478 return error; 1477 return error;
1479 1478
1480 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); 1479 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp));
1481 agi = XFS_BUF_TO_AGI(*bpp); 1480 agi = XFS_BUF_TO_AGI(*bpp);
1482 1481
1483 /* 1482 /*
1484 * Validate the magic number of the agi block. 1483 * Validate the magic number of the agi block.
1485 */ 1484 */
1486 agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1485 agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1487 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && 1486 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
1488 be32_to_cpu(agi->agi_seqno) == agno; 1487 be32_to_cpu(agi->agi_seqno) == agno;
1489 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, 1488 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1490 XFS_RANDOM_IALLOC_READ_AGI))) { 1489 XFS_RANDOM_IALLOC_READ_AGI))) {
1491 XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, 1490 XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
1492 mp, agi); 1491 mp, agi);
1493 xfs_trans_brelse(tp, *bpp); 1492 xfs_trans_brelse(tp, *bpp);
1494 return XFS_ERROR(EFSCORRUPTED); 1493 return XFS_ERROR(EFSCORRUPTED);
1495 } 1494 }
1496 1495
1497 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); 1496 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
1498 1497
1499 xfs_check_agi_unlinked(agi); 1498 xfs_check_agi_unlinked(agi);
1500 return 0; 1499 return 0;
1501 } 1500 }
1502 1501
1503 int 1502 int
1504 xfs_ialloc_read_agi( 1503 xfs_ialloc_read_agi(
1505 struct xfs_mount *mp, /* file system mount structure */ 1504 struct xfs_mount *mp, /* file system mount structure */
1506 struct xfs_trans *tp, /* transaction pointer */ 1505 struct xfs_trans *tp, /* transaction pointer */
1507 xfs_agnumber_t agno, /* allocation group number */ 1506 xfs_agnumber_t agno, /* allocation group number */
1508 struct xfs_buf **bpp) /* allocation group hdr buf */ 1507 struct xfs_buf **bpp) /* allocation group hdr buf */
1509 { 1508 {
1510 struct xfs_agi *agi; /* allocation group header */ 1509 struct xfs_agi *agi; /* allocation group header */
1511 struct xfs_perag *pag; /* per allocation group data */ 1510 struct xfs_perag *pag; /* per allocation group data */
1512 int error; 1511 int error;
1513 1512
1514 error = xfs_read_agi(mp, tp, agno, bpp); 1513 error = xfs_read_agi(mp, tp, agno, bpp);
1515 if (error) 1514 if (error)
1516 return error; 1515 return error;
1517 1516
1518 agi = XFS_BUF_TO_AGI(*bpp); 1517 agi = XFS_BUF_TO_AGI(*bpp);
1519 pag = &mp->m_perag[agno]; 1518 pag = &mp->m_perag[agno];
1520 1519
1521 if (!pag->pagi_init) { 1520 if (!pag->pagi_init) {
1522 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1521 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1523 pag->pagi_count = be32_to_cpu(agi->agi_count); 1522 pag->pagi_count = be32_to_cpu(agi->agi_count);
1524 pag->pagi_init = 1; 1523 pag->pagi_init = 1;
1525 } 1524 }
1526 1525
1527 /* 1526 /*
1528 * It's possible for these to be out of sync if 1527 * It's possible for these to be out of sync if
1529 * we are in the middle of a forced shutdown. 1528 * we are in the middle of a forced shutdown.
1530 */ 1529 */
1531 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || 1530 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1532 XFS_FORCED_SHUTDOWN(mp)); 1531 XFS_FORCED_SHUTDOWN(mp));
1533 return 0; 1532 return 0;
1534 } 1533 }
1535 1534
1536 /* 1535 /*
1537 * Read in the agi to initialise the per-ag data in the mount structure 1536 * Read in the agi to initialise the per-ag data in the mount structure
1538 */ 1537 */
1539 int 1538 int
1540 xfs_ialloc_pagi_init( 1539 xfs_ialloc_pagi_init(
1541 xfs_mount_t *mp, /* file system mount structure */ 1540 xfs_mount_t *mp, /* file system mount structure */
1542 xfs_trans_t *tp, /* transaction pointer */ 1541 xfs_trans_t *tp, /* transaction pointer */
1543 xfs_agnumber_t agno) /* allocation group number */ 1542 xfs_agnumber_t agno) /* allocation group number */
1544 { 1543 {
1545 xfs_buf_t *bp = NULL; 1544 xfs_buf_t *bp = NULL;
1546 int error; 1545 int error;
1547 1546
1548 error = xfs_ialloc_read_agi(mp, tp, agno, &bp); 1547 error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
1549 if (error) 1548 if (error)
1550 return error; 1549 return error;
1551 if (bp) 1550 if (bp)
1552 xfs_trans_brelse(tp, bp); 1551 xfs_trans_brelse(tp, bp);
1553 return 0; 1552 return 0;
1554 } 1553 }
1555 1554
fs/xfs/xfs_imap.h
1 /* File was deleted
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18 #ifndef __XFS_IMAP_H__
19 #define __XFS_IMAP_H__
20
21 /*
22 * This is the structure passed to xfs_imap() to map
23 * an inode number to its on disk location.
24 */
25 typedef struct xfs_imap {
26 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
27 uint im_len; /* length in BBs of inode chunk */
28 ushort im_boffset; /* inode offset in block in bytes */
29 } xfs_imap_t;
30
31 #endif /* __XFS_IMAP_H__ */
32 1 /*
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include <linux/log2.h> 18 #include <linux/log2.h>
19 19
20 #include "xfs.h" 20 #include "xfs.h"
21 #include "xfs_fs.h" 21 #include "xfs_fs.h"
22 #include "xfs_types.h" 22 #include "xfs_types.h"
23 #include "xfs_bit.h" 23 #include "xfs_bit.h"
24 #include "xfs_log.h" 24 #include "xfs_log.h"
25 #include "xfs_inum.h" 25 #include "xfs_inum.h"
26 #include "xfs_imap.h"
27 #include "xfs_trans.h" 26 #include "xfs_trans.h"
28 #include "xfs_trans_priv.h" 27 #include "xfs_trans_priv.h"
29 #include "xfs_sb.h" 28 #include "xfs_sb.h"
30 #include "xfs_ag.h" 29 #include "xfs_ag.h"
31 #include "xfs_dir2.h" 30 #include "xfs_dir2.h"
32 #include "xfs_dmapi.h" 31 #include "xfs_dmapi.h"
33 #include "xfs_mount.h" 32 #include "xfs_mount.h"
34 #include "xfs_bmap_btree.h" 33 #include "xfs_bmap_btree.h"
35 #include "xfs_alloc_btree.h" 34 #include "xfs_alloc_btree.h"
36 #include "xfs_ialloc_btree.h" 35 #include "xfs_ialloc_btree.h"
37 #include "xfs_dir2_sf.h" 36 #include "xfs_dir2_sf.h"
38 #include "xfs_attr_sf.h" 37 #include "xfs_attr_sf.h"
39 #include "xfs_dinode.h" 38 #include "xfs_dinode.h"
40 #include "xfs_inode.h" 39 #include "xfs_inode.h"
41 #include "xfs_buf_item.h" 40 #include "xfs_buf_item.h"
42 #include "xfs_inode_item.h" 41 #include "xfs_inode_item.h"
43 #include "xfs_btree.h" 42 #include "xfs_btree.h"
44 #include "xfs_btree_trace.h" 43 #include "xfs_btree_trace.h"
45 #include "xfs_alloc.h" 44 #include "xfs_alloc.h"
46 #include "xfs_ialloc.h" 45 #include "xfs_ialloc.h"
47 #include "xfs_bmap.h" 46 #include "xfs_bmap.h"
48 #include "xfs_rw.h" 47 #include "xfs_rw.h"
49 #include "xfs_error.h" 48 #include "xfs_error.h"
50 #include "xfs_utils.h" 49 #include "xfs_utils.h"
51 #include "xfs_dir2_trace.h" 50 #include "xfs_dir2_trace.h"
52 #include "xfs_quota.h" 51 #include "xfs_quota.h"
53 #include "xfs_acl.h" 52 #include "xfs_acl.h"
54 #include "xfs_filestream.h" 53 #include "xfs_filestream.h"
55 #include "xfs_vnodeops.h" 54 #include "xfs_vnodeops.h"
56 55
57 kmem_zone_t *xfs_ifork_zone; 56 kmem_zone_t *xfs_ifork_zone;
58 kmem_zone_t *xfs_inode_zone; 57 kmem_zone_t *xfs_inode_zone;
59 58
60 /* 59 /*
61 * Used in xfs_itruncate(). This is the maximum number of extents 60 * Used in xfs_itruncate(). This is the maximum number of extents
62 * freed from a file in a single transaction. 61 * freed from a file in a single transaction.
63 */ 62 */
64 #define XFS_ITRUNC_MAX_EXTENTS 2 63 #define XFS_ITRUNC_MAX_EXTENTS 2
65 64
66 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 65 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
67 STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); 66 STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
68 STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); 67 STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
69 STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); 68 STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
70 69
71 #ifdef DEBUG 70 #ifdef DEBUG
72 /* 71 /*
73 * Make sure that the extents in the given memory buffer 72 * Make sure that the extents in the given memory buffer
74 * are valid. 73 * are valid.
75 */ 74 */
76 STATIC void 75 STATIC void
77 xfs_validate_extents( 76 xfs_validate_extents(
78 xfs_ifork_t *ifp, 77 xfs_ifork_t *ifp,
79 int nrecs, 78 int nrecs,
80 xfs_exntfmt_t fmt) 79 xfs_exntfmt_t fmt)
81 { 80 {
82 xfs_bmbt_irec_t irec; 81 xfs_bmbt_irec_t irec;
83 xfs_bmbt_rec_host_t rec; 82 xfs_bmbt_rec_host_t rec;
84 int i; 83 int i;
85 84
86 for (i = 0; i < nrecs; i++) { 85 for (i = 0; i < nrecs; i++) {
87 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 86 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
88 rec.l0 = get_unaligned(&ep->l0); 87 rec.l0 = get_unaligned(&ep->l0);
89 rec.l1 = get_unaligned(&ep->l1); 88 rec.l1 = get_unaligned(&ep->l1);
90 xfs_bmbt_get_all(&rec, &irec); 89 xfs_bmbt_get_all(&rec, &irec);
91 if (fmt == XFS_EXTFMT_NOSTATE) 90 if (fmt == XFS_EXTFMT_NOSTATE)
92 ASSERT(irec.br_state == XFS_EXT_NORM); 91 ASSERT(irec.br_state == XFS_EXT_NORM);
93 } 92 }
94 } 93 }
95 #else /* DEBUG */ 94 #else /* DEBUG */
96 #define xfs_validate_extents(ifp, nrecs, fmt) 95 #define xfs_validate_extents(ifp, nrecs, fmt)
97 #endif /* DEBUG */ 96 #endif /* DEBUG */
98 97
99 /* 98 /*
100 * Check that none of the inode's in the buffer have a next 99 * Check that none of the inode's in the buffer have a next
101 * unlinked field of 0. 100 * unlinked field of 0.
102 */ 101 */
103 #if defined(DEBUG) 102 #if defined(DEBUG)
104 void 103 void
105 xfs_inobp_check( 104 xfs_inobp_check(
106 xfs_mount_t *mp, 105 xfs_mount_t *mp,
107 xfs_buf_t *bp) 106 xfs_buf_t *bp)
108 { 107 {
109 int i; 108 int i;
110 int j; 109 int j;
111 xfs_dinode_t *dip; 110 xfs_dinode_t *dip;
112 111
113 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; 112 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
114 113
115 for (i = 0; i < j; i++) { 114 for (i = 0; i < j; i++) {
116 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 115 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
117 i * mp->m_sb.sb_inodesize); 116 i * mp->m_sb.sb_inodesize);
118 if (!dip->di_next_unlinked) { 117 if (!dip->di_next_unlinked) {
119 xfs_fs_cmn_err(CE_ALERT, mp, 118 xfs_fs_cmn_err(CE_ALERT, mp,
120 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 119 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.",
121 bp); 120 bp);
122 ASSERT(dip->di_next_unlinked); 121 ASSERT(dip->di_next_unlinked);
123 } 122 }
124 } 123 }
125 } 124 }
126 #endif 125 #endif
127 126
128 /* 127 /*
129 * Find the buffer associated with the given inode map 128 * Find the buffer associated with the given inode map
130 * We do basic validation checks on the buffer once it has been 129 * We do basic validation checks on the buffer once it has been
131 * retrieved from disk. 130 * retrieved from disk.
132 */ 131 */
133 STATIC int 132 STATIC int
134 xfs_imap_to_bp( 133 xfs_imap_to_bp(
135 xfs_mount_t *mp, 134 xfs_mount_t *mp,
136 xfs_trans_t *tp, 135 xfs_trans_t *tp,
137 xfs_imap_t *imap, 136 struct xfs_imap *imap,
138 xfs_buf_t **bpp, 137 xfs_buf_t **bpp,
139 uint buf_flags, 138 uint buf_flags,
140 uint imap_flags) 139 uint imap_flags)
141 { 140 {
142 int error; 141 int error;
143 int i; 142 int i;
144 int ni; 143 int ni;
145 xfs_buf_t *bp; 144 xfs_buf_t *bp;
146 145
147 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 146 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
148 (int)imap->im_len, buf_flags, &bp); 147 (int)imap->im_len, buf_flags, &bp);
149 if (error) { 148 if (error) {
150 if (error != EAGAIN) { 149 if (error != EAGAIN) {
151 cmn_err(CE_WARN, 150 cmn_err(CE_WARN,
152 "xfs_imap_to_bp: xfs_trans_read_buf()returned " 151 "xfs_imap_to_bp: xfs_trans_read_buf()returned "
153 "an error %d on %s. Returning error.", 152 "an error %d on %s. Returning error.",
154 error, mp->m_fsname); 153 error, mp->m_fsname);
155 } else { 154 } else {
156 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 155 ASSERT(buf_flags & XFS_BUF_TRYLOCK);
157 } 156 }
158 return error; 157 return error;
159 } 158 }
160 159
161 /* 160 /*
162 * Validate the magic number and version of every inode in the buffer 161 * Validate the magic number and version of every inode in the buffer
163 * (if DEBUG kernel) or the first inode in the buffer, otherwise. 162 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
164 */ 163 */
165 #ifdef DEBUG 164 #ifdef DEBUG
166 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; 165 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
167 #else /* usual case */ 166 #else /* usual case */
168 ni = 1; 167 ni = 1;
169 #endif 168 #endif
170 169
171 for (i = 0; i < ni; i++) { 170 for (i = 0; i < ni; i++) {
172 int di_ok; 171 int di_ok;
173 xfs_dinode_t *dip; 172 xfs_dinode_t *dip;
174 173
175 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 174 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
176 (i << mp->m_sb.sb_inodelog)); 175 (i << mp->m_sb.sb_inodelog));
177 di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC && 176 di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
178 XFS_DINODE_GOOD_VERSION(dip->di_version); 177 XFS_DINODE_GOOD_VERSION(dip->di_version);
179 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 178 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
180 XFS_ERRTAG_ITOBP_INOTOBP, 179 XFS_ERRTAG_ITOBP_INOTOBP,
181 XFS_RANDOM_ITOBP_INOTOBP))) { 180 XFS_RANDOM_ITOBP_INOTOBP))) {
182 if (imap_flags & XFS_IMAP_BULKSTAT) { 181 if (imap_flags & XFS_IMAP_BULKSTAT) {
183 xfs_trans_brelse(tp, bp); 182 xfs_trans_brelse(tp, bp);
184 return XFS_ERROR(EINVAL); 183 return XFS_ERROR(EINVAL);
185 } 184 }
186 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 185 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
187 XFS_ERRLEVEL_HIGH, mp, dip); 186 XFS_ERRLEVEL_HIGH, mp, dip);
188 #ifdef DEBUG 187 #ifdef DEBUG
189 cmn_err(CE_PANIC, 188 cmn_err(CE_PANIC,
190 "Device %s - bad inode magic/vsn " 189 "Device %s - bad inode magic/vsn "
191 "daddr %lld #%d (magic=%x)", 190 "daddr %lld #%d (magic=%x)",
192 XFS_BUFTARG_NAME(mp->m_ddev_targp), 191 XFS_BUFTARG_NAME(mp->m_ddev_targp),
193 (unsigned long long)imap->im_blkno, i, 192 (unsigned long long)imap->im_blkno, i,
194 be16_to_cpu(dip->di_magic)); 193 be16_to_cpu(dip->di_magic));
195 #endif 194 #endif
196 xfs_trans_brelse(tp, bp); 195 xfs_trans_brelse(tp, bp);
197 return XFS_ERROR(EFSCORRUPTED); 196 return XFS_ERROR(EFSCORRUPTED);
198 } 197 }
199 } 198 }
200 199
201 xfs_inobp_check(mp, bp); 200 xfs_inobp_check(mp, bp);
202 201
203 /* 202 /*
204 * Mark the buffer as an inode buffer now that it looks good 203 * Mark the buffer as an inode buffer now that it looks good
205 */ 204 */
206 XFS_BUF_SET_VTYPE(bp, B_FS_INO); 205 XFS_BUF_SET_VTYPE(bp, B_FS_INO);
207 206
208 *bpp = bp; 207 *bpp = bp;
209 return 0; 208 return 0;
210 } 209 }
211 210
212 /* 211 /*
213 * This routine is called to map an inode number within a file 212 * This routine is called to map an inode number within a file
214 * system to the buffer containing the on-disk version of the 213 * system to the buffer containing the on-disk version of the
215 * inode. It returns a pointer to the buffer containing the 214 * inode. It returns a pointer to the buffer containing the
216 * on-disk inode in the bpp parameter, and in the dip parameter 215 * on-disk inode in the bpp parameter, and in the dip parameter
217 * it returns a pointer to the on-disk inode within that buffer. 216 * it returns a pointer to the on-disk inode within that buffer.
218 * 217 *
219 * If a non-zero error is returned, then the contents of bpp and 218 * If a non-zero error is returned, then the contents of bpp and
220 * dipp are undefined. 219 * dipp are undefined.
221 * 220 *
222 * Use xfs_imap() to determine the size and location of the 221 * Use xfs_imap() to determine the size and location of the
223 * buffer to read from disk. 222 * buffer to read from disk.
224 */ 223 */
225 int 224 int
226 xfs_inotobp( 225 xfs_inotobp(
227 xfs_mount_t *mp, 226 xfs_mount_t *mp,
228 xfs_trans_t *tp, 227 xfs_trans_t *tp,
229 xfs_ino_t ino, 228 xfs_ino_t ino,
230 xfs_dinode_t **dipp, 229 xfs_dinode_t **dipp,
231 xfs_buf_t **bpp, 230 xfs_buf_t **bpp,
232 int *offset, 231 int *offset,
233 uint imap_flags) 232 uint imap_flags)
234 { 233 {
235 xfs_imap_t imap; 234 struct xfs_imap imap;
236 xfs_buf_t *bp; 235 xfs_buf_t *bp;
237 int error; 236 int error;
238 237
239 imap.im_blkno = 0; 238 imap.im_blkno = 0;
240 error = xfs_imap(mp, tp, ino, &imap, imap_flags); 239 error = xfs_imap(mp, tp, ino, &imap, imap_flags);
241 if (error) 240 if (error)
242 return error; 241 return error;
243 242
244 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); 243 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
245 if (error) 244 if (error)
246 return error; 245 return error;
247 246
248 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 247 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
249 *bpp = bp; 248 *bpp = bp;
250 *offset = imap.im_boffset; 249 *offset = imap.im_boffset;
251 return 0; 250 return 0;
252 } 251 }
253 252
254 253
255 /* 254 /*
256 * This routine is called to map an inode to the buffer containing 255 * This routine is called to map an inode to the buffer containing
257 * the on-disk version of the inode. It returns a pointer to the 256 * the on-disk version of the inode. It returns a pointer to the
258 * buffer containing the on-disk inode in the bpp parameter, and in 257 * buffer containing the on-disk inode in the bpp parameter, and in
259 * the dip parameter it returns a pointer to the on-disk inode within 258 * the dip parameter it returns a pointer to the on-disk inode within
260 * that buffer. 259 * that buffer.
261 * 260 *
262 * If a non-zero error is returned, then the contents of bpp and 261 * If a non-zero error is returned, then the contents of bpp and
263 * dipp are undefined. 262 * dipp are undefined.
264 * 263 *
265 * The inode is expected to already been mapped to its buffer and read 264 * The inode is expected to already been mapped to its buffer and read
266 * in once, thus we can use the mapping information stored in the inode 265 * in once, thus we can use the mapping information stored in the inode
267 * rather than calling xfs_imap(). This allows us to avoid the overhead 266 * rather than calling xfs_imap(). This allows us to avoid the overhead
268 * of looking at the inode btree for small block file systems 267 * of looking at the inode btree for small block file systems
269 * (see xfs_imap()). 268 * (see xfs_imap()).
270 */ 269 */
271 int 270 int
272 xfs_itobp( 271 xfs_itobp(
273 xfs_mount_t *mp, 272 xfs_mount_t *mp,
274 xfs_trans_t *tp, 273 xfs_trans_t *tp,
275 xfs_inode_t *ip, 274 xfs_inode_t *ip,
276 xfs_dinode_t **dipp, 275 xfs_dinode_t **dipp,
277 xfs_buf_t **bpp, 276 xfs_buf_t **bpp,
278 uint buf_flags) 277 uint buf_flags)
279 { 278 {
280 xfs_imap_t imap;
281 xfs_buf_t *bp; 279 xfs_buf_t *bp;
282 int error; 280 int error;
283 281
284 ASSERT(ip->i_blkno != 0); 282 ASSERT(ip->i_imap.im_blkno != 0);
285 283
286 imap.im_blkno = ip->i_blkno; 284 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
287 imap.im_len = ip->i_len;
288 imap.im_boffset = ip->i_boffset;
289
290 error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, 0);
291 if (error) 285 if (error)
292 return error; 286 return error;
293 287
294 if (!bp) { 288 if (!bp) {
295 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 289 ASSERT(buf_flags & XFS_BUF_TRYLOCK);
296 ASSERT(tp == NULL); 290 ASSERT(tp == NULL);
297 *bpp = NULL; 291 *bpp = NULL;
298 return EAGAIN; 292 return EAGAIN;
299 } 293 }
300 294
301 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 295 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
302 *bpp = bp; 296 *bpp = bp;
303 return 0; 297 return 0;
304 } 298 }
305 299
306 /* 300 /*
307 * Move inode type and inode format specific information from the 301 * Move inode type and inode format specific information from the
308 * on-disk inode to the in-core inode. For fifos, devs, and sockets 302 * on-disk inode to the in-core inode. For fifos, devs, and sockets
309 * this means set if_rdev to the proper value. For files, directories, 303 * this means set if_rdev to the proper value. For files, directories,
310 * and symlinks this means to bring in the in-line data or extent 304 * and symlinks this means to bring in the in-line data or extent
311 * pointers. For a file in B-tree format, only the root is immediately 305 * pointers. For a file in B-tree format, only the root is immediately
312 * brought in-core. The rest will be in-lined in if_extents when it 306 * brought in-core. The rest will be in-lined in if_extents when it
313 * is first referenced (see xfs_iread_extents()). 307 * is first referenced (see xfs_iread_extents()).
314 */ 308 */
315 STATIC int 309 STATIC int
316 xfs_iformat( 310 xfs_iformat(
317 xfs_inode_t *ip, 311 xfs_inode_t *ip,
318 xfs_dinode_t *dip) 312 xfs_dinode_t *dip)
319 { 313 {
320 xfs_attr_shortform_t *atp; 314 xfs_attr_shortform_t *atp;
321 int size; 315 int size;
322 int error; 316 int error;
323 xfs_fsize_t di_size; 317 xfs_fsize_t di_size;
324 ip->i_df.if_ext_max = 318 ip->i_df.if_ext_max =
325 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 319 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
326 error = 0; 320 error = 0;
327 321
328 if (unlikely(be32_to_cpu(dip->di_nextents) + 322 if (unlikely(be32_to_cpu(dip->di_nextents) +
329 be16_to_cpu(dip->di_anextents) > 323 be16_to_cpu(dip->di_anextents) >
330 be64_to_cpu(dip->di_nblocks))) { 324 be64_to_cpu(dip->di_nblocks))) {
331 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 325 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
332 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 326 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
333 (unsigned long long)ip->i_ino, 327 (unsigned long long)ip->i_ino,
334 (int)(be32_to_cpu(dip->di_nextents) + 328 (int)(be32_to_cpu(dip->di_nextents) +
335 be16_to_cpu(dip->di_anextents)), 329 be16_to_cpu(dip->di_anextents)),
336 (unsigned long long) 330 (unsigned long long)
337 be64_to_cpu(dip->di_nblocks)); 331 be64_to_cpu(dip->di_nblocks));
338 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, 332 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
339 ip->i_mount, dip); 333 ip->i_mount, dip);
340 return XFS_ERROR(EFSCORRUPTED); 334 return XFS_ERROR(EFSCORRUPTED);
341 } 335 }
342 336
343 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 337 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
344 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 338 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
345 "corrupt dinode %Lu, forkoff = 0x%x.", 339 "corrupt dinode %Lu, forkoff = 0x%x.",
346 (unsigned long long)ip->i_ino, 340 (unsigned long long)ip->i_ino,
347 dip->di_forkoff); 341 dip->di_forkoff);
348 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 342 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
349 ip->i_mount, dip); 343 ip->i_mount, dip);
350 return XFS_ERROR(EFSCORRUPTED); 344 return XFS_ERROR(EFSCORRUPTED);
351 } 345 }
352 346
353 switch (ip->i_d.di_mode & S_IFMT) { 347 switch (ip->i_d.di_mode & S_IFMT) {
354 case S_IFIFO: 348 case S_IFIFO:
355 case S_IFCHR: 349 case S_IFCHR:
356 case S_IFBLK: 350 case S_IFBLK:
357 case S_IFSOCK: 351 case S_IFSOCK:
358 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { 352 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
359 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, 353 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
360 ip->i_mount, dip); 354 ip->i_mount, dip);
361 return XFS_ERROR(EFSCORRUPTED); 355 return XFS_ERROR(EFSCORRUPTED);
362 } 356 }
363 ip->i_d.di_size = 0; 357 ip->i_d.di_size = 0;
364 ip->i_size = 0; 358 ip->i_size = 0;
365 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); 359 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
366 break; 360 break;
367 361
368 case S_IFREG: 362 case S_IFREG:
369 case S_IFLNK: 363 case S_IFLNK:
370 case S_IFDIR: 364 case S_IFDIR:
371 switch (dip->di_format) { 365 switch (dip->di_format) {
372 case XFS_DINODE_FMT_LOCAL: 366 case XFS_DINODE_FMT_LOCAL:
373 /* 367 /*
374 * no local regular files yet 368 * no local regular files yet
375 */ 369 */
376 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 370 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
377 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 371 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
378 "corrupt inode %Lu " 372 "corrupt inode %Lu "
379 "(local format for regular file).", 373 "(local format for regular file).",
380 (unsigned long long) ip->i_ino); 374 (unsigned long long) ip->i_ino);
381 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 375 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
382 XFS_ERRLEVEL_LOW, 376 XFS_ERRLEVEL_LOW,
383 ip->i_mount, dip); 377 ip->i_mount, dip);
384 return XFS_ERROR(EFSCORRUPTED); 378 return XFS_ERROR(EFSCORRUPTED);
385 } 379 }
386 380
387 di_size = be64_to_cpu(dip->di_size); 381 di_size = be64_to_cpu(dip->di_size);
388 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 382 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
389 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 383 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
390 "corrupt inode %Lu " 384 "corrupt inode %Lu "
391 "(bad size %Ld for local inode).", 385 "(bad size %Ld for local inode).",
392 (unsigned long long) ip->i_ino, 386 (unsigned long long) ip->i_ino,
393 (long long) di_size); 387 (long long) di_size);
394 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
395 XFS_ERRLEVEL_LOW, 389 XFS_ERRLEVEL_LOW,
396 ip->i_mount, dip); 390 ip->i_mount, dip);
397 return XFS_ERROR(EFSCORRUPTED); 391 return XFS_ERROR(EFSCORRUPTED);
398 } 392 }
399 393
400 size = (int)di_size; 394 size = (int)di_size;
401 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); 395 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
402 break; 396 break;
403 case XFS_DINODE_FMT_EXTENTS: 397 case XFS_DINODE_FMT_EXTENTS:
404 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); 398 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
405 break; 399 break;
406 case XFS_DINODE_FMT_BTREE: 400 case XFS_DINODE_FMT_BTREE:
407 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); 401 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
408 break; 402 break;
409 default: 403 default:
410 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, 404 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
411 ip->i_mount); 405 ip->i_mount);
412 return XFS_ERROR(EFSCORRUPTED); 406 return XFS_ERROR(EFSCORRUPTED);
413 } 407 }
414 break; 408 break;
415 409
416 default: 410 default:
417 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); 411 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
418 return XFS_ERROR(EFSCORRUPTED); 412 return XFS_ERROR(EFSCORRUPTED);
419 } 413 }
420 if (error) { 414 if (error) {
421 return error; 415 return error;
422 } 416 }
423 if (!XFS_DFORK_Q(dip)) 417 if (!XFS_DFORK_Q(dip))
424 return 0; 418 return 0;
425 ASSERT(ip->i_afp == NULL); 419 ASSERT(ip->i_afp == NULL);
426 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 420 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
427 ip->i_afp->if_ext_max = 421 ip->i_afp->if_ext_max =
428 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 422 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
429 switch (dip->di_aformat) { 423 switch (dip->di_aformat) {
430 case XFS_DINODE_FMT_LOCAL: 424 case XFS_DINODE_FMT_LOCAL:
431 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 425 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
432 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
433 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); 427 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
434 break; 428 break;
435 case XFS_DINODE_FMT_EXTENTS: 429 case XFS_DINODE_FMT_EXTENTS:
436 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); 430 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
437 break; 431 break;
438 case XFS_DINODE_FMT_BTREE: 432 case XFS_DINODE_FMT_BTREE:
439 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); 433 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
440 break; 434 break;
441 default: 435 default:
442 error = XFS_ERROR(EFSCORRUPTED); 436 error = XFS_ERROR(EFSCORRUPTED);
443 break; 437 break;
444 } 438 }
445 if (error) { 439 if (error) {
446 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 440 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
447 ip->i_afp = NULL; 441 ip->i_afp = NULL;
448 xfs_idestroy_fork(ip, XFS_DATA_FORK); 442 xfs_idestroy_fork(ip, XFS_DATA_FORK);
449 } 443 }
450 return error; 444 return error;
451 } 445 }
452 446
453 /* 447 /*
454 * The file is in-lined in the on-disk inode. 448 * The file is in-lined in the on-disk inode.
455 * If it fits into if_inline_data, then copy 449 * If it fits into if_inline_data, then copy
456 * it there, otherwise allocate a buffer for it 450 * it there, otherwise allocate a buffer for it
457 * and copy the data there. Either way, set 451 * and copy the data there. Either way, set
458 * if_data to point at the data. 452 * if_data to point at the data.
459 * If we allocate a buffer for the data, make 453 * If we allocate a buffer for the data, make
460 * sure that its size is a multiple of 4 and 454 * sure that its size is a multiple of 4 and
461 * record the real size in i_real_bytes. 455 * record the real size in i_real_bytes.
462 */ 456 */
463 STATIC int 457 STATIC int
464 xfs_iformat_local( 458 xfs_iformat_local(
465 xfs_inode_t *ip, 459 xfs_inode_t *ip,
466 xfs_dinode_t *dip, 460 xfs_dinode_t *dip,
467 int whichfork, 461 int whichfork,
468 int size) 462 int size)
469 { 463 {
470 xfs_ifork_t *ifp; 464 xfs_ifork_t *ifp;
471 int real_size; 465 int real_size;
472 466
473 /* 467 /*
474 * If the size is unreasonable, then something 468 * If the size is unreasonable, then something
475 * is wrong and we just bail out rather than crash in 469 * is wrong and we just bail out rather than crash in
476 * kmem_alloc() or memcpy() below. 470 * kmem_alloc() or memcpy() below.
477 */ 471 */
478 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 472 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
479 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 473 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
480 "corrupt inode %Lu " 474 "corrupt inode %Lu "
481 "(bad size %d for local fork, size = %d).", 475 "(bad size %d for local fork, size = %d).",
482 (unsigned long long) ip->i_ino, size, 476 (unsigned long long) ip->i_ino, size,
483 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 477 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
484 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 478 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
485 ip->i_mount, dip); 479 ip->i_mount, dip);
486 return XFS_ERROR(EFSCORRUPTED); 480 return XFS_ERROR(EFSCORRUPTED);
487 } 481 }
488 ifp = XFS_IFORK_PTR(ip, whichfork); 482 ifp = XFS_IFORK_PTR(ip, whichfork);
489 real_size = 0; 483 real_size = 0;
490 if (size == 0) 484 if (size == 0)
491 ifp->if_u1.if_data = NULL; 485 ifp->if_u1.if_data = NULL;
492 else if (size <= sizeof(ifp->if_u2.if_inline_data)) 486 else if (size <= sizeof(ifp->if_u2.if_inline_data))
493 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 487 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
494 else { 488 else {
495 real_size = roundup(size, 4); 489 real_size = roundup(size, 4);
496 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 490 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
497 } 491 }
498 ifp->if_bytes = size; 492 ifp->if_bytes = size;
499 ifp->if_real_bytes = real_size; 493 ifp->if_real_bytes = real_size;
500 if (size) 494 if (size)
501 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); 495 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
502 ifp->if_flags &= ~XFS_IFEXTENTS; 496 ifp->if_flags &= ~XFS_IFEXTENTS;
503 ifp->if_flags |= XFS_IFINLINE; 497 ifp->if_flags |= XFS_IFINLINE;
504 return 0; 498 return 0;
505 } 499 }
506 500
/*
 * The file consists of a set of extents all of which fit into the on-disk
 * inode.  If there are few enough extents to fit into the if_inline_ext
 * array, then copy them there.  Otherwise allocate a buffer for them and
 * copy them into it.  Either way, set if_extents to point at the extents.
 */
STATIC int
xfs_iformat_extents(
	xfs_inode_t	*ip,		/* in-core inode */
	xfs_dinode_t	*dip,		/* on-disk inode holding the fork data */
	int		whichfork)	/* XFS_DATA_FORK or XFS_ATTR_FORK */
{
	xfs_bmbt_rec_t	*dp;		/* cursor over on-disk extent records */
	xfs_ifork_t	*ifp;		/* fork being filled in */
	int		nex;		/* number of extents in the fork */
	int		size;		/* bytes of extent records */
	int		i;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
	size = nex * (uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * If the number of extents is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.  size < 0 catches a nex
	 * large enough to overflow the multiplication above.
	 */
	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
			"corrupt inode %Lu ((a)extents = %d).",
			(unsigned long long) ip->i_ino, nex);
		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_real_bytes = 0;
	if (nex == 0)
		ifp->if_u1.if_extents = NULL;
	else if (nex <= XFS_INLINE_EXTS)
		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
	else
		xfs_iext_add(ifp, 0, nex);	/* allocates the extent buffer */

	ifp->if_bytes = size;
	if (size) {
		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
		/*
		 * Copy each record in, converting from big-endian disk
		 * format; the on-disk records may be unaligned.
		 */
		for (i = 0; i < nex; i++, dp++) {
			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
			ep->l0 = get_unaligned_be64(&dp->l0);
			ep->l1 = get_unaligned_be64(&dp->l1);
		}
		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
		/*
		 * Attr-fork extents (and data forks in the NOSTATE extent
		 * format) must not carry unwritten-state bits.
		 */
		if (whichfork != XFS_DATA_FORK ||
			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
				if (unlikely(xfs_check_nostate_extents(
				    ifp, 0, nex))) {
					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
							 XFS_ERRLEVEL_LOW,
							 ip->i_mount);
					return XFS_ERROR(EFSCORRUPTED);
				}
	}
	ifp->if_flags |= XFS_IFEXTENTS;
	return 0;
}
577 571
/*
 * The file has too many extents to fit into the inode, so they are in
 * B-tree format.  Allocate a buffer for the root of the B-tree and copy
 * the root into it.  The i_extents field will remain NULL until all of
 * the extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
	xfs_inode_t	*ip,		/* in-core inode */
	xfs_dinode_t	*dip,		/* on-disk inode holding the root */
	int		whichfork)	/* XFS_DATA_FORK or XFS_ATTR_FORK */
{
	xfs_bmdr_block_t *dfp;		/* on-disk btree root block */
	xfs_ifork_t	*ifp;
	/* REFERENCED */
	int		nrecs;		/* records in the on-disk root */
	int		size;		/* in-core broot size in bytes */

	ifp = XFS_IFORK_PTR(ip, whichfork);
	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
	size = XFS_BMAP_BROOT_SPACE(dfp);
	nrecs = be16_to_cpu(dfp->bb_numrecs);

	/*
	 * blow out if -- fork has less extents than can fit in
	 * fork (fork shouldn't be a btree format), root btree
	 * block has more records than can fit into the fork,
	 * or the number of extents is greater than the number of
	 * blocks.
	 */
	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
	    || XFS_BMDR_SPACE_CALC(nrecs) >
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
			"corrupt inode %Lu (btree).",
			(unsigned long long) ip->i_ino);
		XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_broot_bytes = size;
	ifp->if_broot = kmem_alloc(size, KM_SLEEP);
	ASSERT(ifp->if_broot != NULL);
	/*
	 * Copy and convert from the on-disk structure
	 * to the in-memory structure.
	 */
	xfs_bmdr_to_bmbt(ip->i_mount, dfp,
			 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
			 ifp->if_broot, size);
	ifp->if_flags &= ~XFS_IFEXTENTS;	/* extents not yet read in */
	ifp->if_flags |= XFS_IFBROOT;

	return 0;
}
637 631
/*
 * Copy an on-disk inode core (big-endian) into the in-core, host-endian
 * xfs_icdinode.  Single-byte fields and the pad area are copied verbatim;
 * everything else is byte-swapped as needed.
 */
void
xfs_dinode_from_disk(
	xfs_icdinode_t	*to,	/* in-core inode core (host endian) */
	xfs_dinode_t	*from)	/* on-disk inode core (big endian) */
{
	to->di_magic = be16_to_cpu(from->di_magic);
	to->di_mode = be16_to_cpu(from->di_mode);
	to->di_version = from->di_version;	/* single byte, no swap */
	to->di_format = from->di_format;	/* single byte, no swap */
	to->di_onlink = be16_to_cpu(from->di_onlink);
	to->di_uid = be32_to_cpu(from->di_uid);
	to->di_gid = be32_to_cpu(from->di_gid);
	to->di_nlink = be32_to_cpu(from->di_nlink);
	to->di_projid = be16_to_cpu(from->di_projid);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = be16_to_cpu(from->di_flushiter);
	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
	to->di_size = be64_to_cpu(from->di_size);
	to->di_nblocks = be64_to_cpu(from->di_nblocks);
	to->di_extsize = be32_to_cpu(from->di_extsize);
	to->di_nextents = be32_to_cpu(from->di_nextents);
	to->di_anextents = be16_to_cpu(from->di_anextents);
	to->di_forkoff = from->di_forkoff;	/* single byte, no swap */
	to->di_aformat = from->di_aformat;	/* single byte, no swap */
	to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
	to->di_dmstate = be16_to_cpu(from->di_dmstate);
	to->di_flags = be16_to_cpu(from->di_flags);
	to->di_gen = be32_to_cpu(from->di_gen);
}
672 666
/*
 * Copy an in-core, host-endian inode core back into the on-disk
 * (big-endian) format.  Exact inverse of xfs_dinode_from_disk():
 * single-byte fields and the pad area are copied verbatim.
 */
void
xfs_dinode_to_disk(
	xfs_dinode_t	*to,	/* on-disk inode core (big endian) */
	xfs_icdinode_t	*from)	/* in-core inode core (host endian) */
{
	to->di_magic = cpu_to_be16(from->di_magic);
	to->di_mode = cpu_to_be16(from->di_mode);
	to->di_version = from->di_version;	/* single byte, no swap */
	to->di_format = from->di_format;	/* single byte, no swap */
	to->di_onlink = cpu_to_be16(from->di_onlink);
	to->di_uid = cpu_to_be32(from->di_uid);
	to->di_gid = cpu_to_be32(from->di_gid);
	to->di_nlink = cpu_to_be32(from->di_nlink);
	to->di_projid = cpu_to_be16(from->di_projid);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = cpu_to_be16(from->di_flushiter);
	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
	to->di_size = cpu_to_be64(from->di_size);
	to->di_nblocks = cpu_to_be64(from->di_nblocks);
	to->di_extsize = cpu_to_be32(from->di_extsize);
	to->di_nextents = cpu_to_be32(from->di_nextents);
	to->di_anextents = cpu_to_be16(from->di_anextents);
	to->di_forkoff = from->di_forkoff;	/* single byte, no swap */
	to->di_aformat = from->di_aformat;	/* single byte, no swap */
	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
	to->di_dmstate = cpu_to_be16(from->di_dmstate);
	to->di_flags = cpu_to_be16(from->di_flags);
	to->di_gen = cpu_to_be32(from->di_gen);
}
707 701
708 STATIC uint 702 STATIC uint
709 _xfs_dic2xflags( 703 _xfs_dic2xflags(
710 __uint16_t di_flags) 704 __uint16_t di_flags)
711 { 705 {
712 uint flags = 0; 706 uint flags = 0;
713 707
714 if (di_flags & XFS_DIFLAG_ANY) { 708 if (di_flags & XFS_DIFLAG_ANY) {
715 if (di_flags & XFS_DIFLAG_REALTIME) 709 if (di_flags & XFS_DIFLAG_REALTIME)
716 flags |= XFS_XFLAG_REALTIME; 710 flags |= XFS_XFLAG_REALTIME;
717 if (di_flags & XFS_DIFLAG_PREALLOC) 711 if (di_flags & XFS_DIFLAG_PREALLOC)
718 flags |= XFS_XFLAG_PREALLOC; 712 flags |= XFS_XFLAG_PREALLOC;
719 if (di_flags & XFS_DIFLAG_IMMUTABLE) 713 if (di_flags & XFS_DIFLAG_IMMUTABLE)
720 flags |= XFS_XFLAG_IMMUTABLE; 714 flags |= XFS_XFLAG_IMMUTABLE;
721 if (di_flags & XFS_DIFLAG_APPEND) 715 if (di_flags & XFS_DIFLAG_APPEND)
722 flags |= XFS_XFLAG_APPEND; 716 flags |= XFS_XFLAG_APPEND;
723 if (di_flags & XFS_DIFLAG_SYNC) 717 if (di_flags & XFS_DIFLAG_SYNC)
724 flags |= XFS_XFLAG_SYNC; 718 flags |= XFS_XFLAG_SYNC;
725 if (di_flags & XFS_DIFLAG_NOATIME) 719 if (di_flags & XFS_DIFLAG_NOATIME)
726 flags |= XFS_XFLAG_NOATIME; 720 flags |= XFS_XFLAG_NOATIME;
727 if (di_flags & XFS_DIFLAG_NODUMP) 721 if (di_flags & XFS_DIFLAG_NODUMP)
728 flags |= XFS_XFLAG_NODUMP; 722 flags |= XFS_XFLAG_NODUMP;
729 if (di_flags & XFS_DIFLAG_RTINHERIT) 723 if (di_flags & XFS_DIFLAG_RTINHERIT)
730 flags |= XFS_XFLAG_RTINHERIT; 724 flags |= XFS_XFLAG_RTINHERIT;
731 if (di_flags & XFS_DIFLAG_PROJINHERIT) 725 if (di_flags & XFS_DIFLAG_PROJINHERIT)
732 flags |= XFS_XFLAG_PROJINHERIT; 726 flags |= XFS_XFLAG_PROJINHERIT;
733 if (di_flags & XFS_DIFLAG_NOSYMLINKS) 727 if (di_flags & XFS_DIFLAG_NOSYMLINKS)
734 flags |= XFS_XFLAG_NOSYMLINKS; 728 flags |= XFS_XFLAG_NOSYMLINKS;
735 if (di_flags & XFS_DIFLAG_EXTSIZE) 729 if (di_flags & XFS_DIFLAG_EXTSIZE)
736 flags |= XFS_XFLAG_EXTSIZE; 730 flags |= XFS_XFLAG_EXTSIZE;
737 if (di_flags & XFS_DIFLAG_EXTSZINHERIT) 731 if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
738 flags |= XFS_XFLAG_EXTSZINHERIT; 732 flags |= XFS_XFLAG_EXTSZINHERIT;
739 if (di_flags & XFS_DIFLAG_NODEFRAG) 733 if (di_flags & XFS_DIFLAG_NODEFRAG)
740 flags |= XFS_XFLAG_NODEFRAG; 734 flags |= XFS_XFLAG_NODEFRAG;
741 if (di_flags & XFS_DIFLAG_FILESTREAM) 735 if (di_flags & XFS_DIFLAG_FILESTREAM)
742 flags |= XFS_XFLAG_FILESTREAM; 736 flags |= XFS_XFLAG_FILESTREAM;
743 } 737 }
744 738
745 return flags; 739 return flags;
746 } 740 }
747 741
748 uint 742 uint
749 xfs_ip2xflags( 743 xfs_ip2xflags(
750 xfs_inode_t *ip) 744 xfs_inode_t *ip)
751 { 745 {
752 xfs_icdinode_t *dic = &ip->i_d; 746 xfs_icdinode_t *dic = &ip->i_d;
753 747
754 return _xfs_dic2xflags(dic->di_flags) | 748 return _xfs_dic2xflags(dic->di_flags) |
755 (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); 749 (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
756 } 750 }
757 751
758 uint 752 uint
759 xfs_dic2xflags( 753 xfs_dic2xflags(
760 xfs_dinode_t *dip) 754 xfs_dinode_t *dip)
761 { 755 {
762 return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | 756 return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
763 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); 757 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
764 } 758 }
765 759
/*
 * Allocate and initialise an xfs_inode.  Returns NULL only when the VFS
 * inode initialisation fails (see below); the zone allocation itself
 * sleeps until it succeeds.
 */
STATIC struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,	/* mount the inode belongs to */
	xfs_ino_t		ino)	/* inode number */
{
	struct xfs_inode	*ip;

	/*
	 * if this didn't occur in transactions, we could use
	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
	 * code up to do this anyway.
	 */
	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
	if (!ip)
		return NULL;

	/* the zone constructor/destructor must have left these clean */
	ASSERT(atomic_read(&ip->i_iocount) == 0);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!spin_is_locked(&ip->i_flags_lock));
	ASSERT(completion_done(&ip->i_flush));

	/*
	 * initialise the VFS inode here to get failures
	 * out of the way early.
	 */
	if (!inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_zone_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	/* location-on-disk info; filled in later by xfs_imap() */
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
	ip->i_flags = 0;
	ip->i_update_core = 0;
	ip->i_update_size = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
	ip->i_size = 0;
	ip->i_new_size = 0;

	/*
	 * Initialize inode's trace buffers.
	 */
#ifdef XFS_INODE_TRACE
	ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_BMAP_TRACE
	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_BTREE_TRACE
	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_RW_TRACE
	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_ILOCK_TRACE
	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_DIR2_TRACE
	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
#endif

	return ip;
}
839 831
/*
 * Given a mount structure and an inode number, return a pointer
 * to a newly allocated in-core inode corresponding to the given
 * inode number.
 *
 * Initialize the inode's attributes and extent pointers if it
 * already has them (it will not if the inode has no links).
 *
 * Returns 0 on success, a positive errno on failure; on failure the
 * partially-built inode is destroyed before returning.
 */
int
xfs_iread(
	xfs_mount_t	*mp,		/* mount the inode lives on */
	xfs_trans_t	*tp,		/* transaction, may be NULL */
	xfs_ino_t	ino,		/* inode number to read */
	xfs_inode_t	**ipp,		/* OUT: the new in-core inode */
	xfs_daddr_t	bno,		/* expected disk block, or 0 */
	uint		imap_flags)	/* flags passed through to xfs_imap */
{
	xfs_buf_t	*bp;
	xfs_dinode_t	*dip;
	xfs_inode_t	*ip;
	int		error;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return ENOMEM;

	/*
	 * Fill in the location information in the in-core inode.
	 */
	ip->i_imap.im_blkno = bno;
	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, imap_flags);
	if (error)
		goto out_destroy_inode;
	/* a non-zero caller-supplied bno must agree with the mapping */
	ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);

	/*
	 * Get pointers to the on-disk inode and the buffer containing it.
	 */
	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
			       XFS_BUF_LOCK, imap_flags);
	if (error)
		goto out_destroy_inode;
	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);

	/*
	 * If we got something that isn't an inode it means someone
	 * (nfs or dmi) has a stale handle.
	 */
	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
#ifdef DEBUG
		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
				"dip->di_magic (0x%x) != "
				"XFS_DINODE_MAGIC (0x%x)",
				be16_to_cpu(dip->di_magic),
				XFS_DINODE_MAGIC);
#endif /* DEBUG */
		error = XFS_ERROR(EINVAL);
		goto out_brelse;
	}

	/*
	 * If the on-disk inode is already linked to a directory
	 * entry, copy all of the inode into the in-core inode.
	 * xfs_iformat() handles copying in the inode format
	 * specific information.
	 * Otherwise, just get the truly permanent information.
	 */
	if (dip->di_mode) {
		xfs_dinode_from_disk(&ip->i_d, dip);
		error = xfs_iformat(ip, dip);
		if (error)  {
#ifdef DEBUG
			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
					"xfs_iformat() returned error %d",
					error);
#endif /* DEBUG */
			goto out_brelse;
		}
	} else {
		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
		ip->i_d.di_version = dip->di_version;
		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
		/*
		 * Make sure to pull in the mode here as well in
		 * case the inode is released without being used.
		 * This ensures that xfs_inactive() will see that
		 * the inode is already free and not try to mess
		 * with the uninitialized part of it.
		 */
		ip->i_d.di_mode = 0;
		/*
		 * Initialize the per-fork minima and maxima for a new
		 * inode here.  xfs_iformat will do it for old inodes.
		 */
		ip->i_df.if_ext_max =
			XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	}

	/*
	 * The inode format changed when we moved the link count and
	 * made it 32 bits long.  If this is an old format inode,
	 * convert it in memory to look like a new one.  If it gets
	 * flushed to disk we will convert back before flushing or
	 * logging it.  We zero out the new projid field and the old link
	 * count field.  We'll handle clearing the pad field (the remains
	 * of the old uuid field) when we actually convert the inode to
	 * the new format. We don't change the version number so that we
	 * can distinguish this from a real new format inode.
	 */
	if (ip->i_d.di_version == 1) {
		ip->i_d.di_nlink = ip->i_d.di_onlink;
		ip->i_d.di_onlink = 0;
		ip->i_d.di_projid = 0;
	}

	ip->i_delayed_blks = 0;
	ip->i_size = ip->i_d.di_size;

	/*
	 * Mark the buffer containing the inode as something to keep
	 * around for a while.  This helps to keep recently accessed
	 * meta-data in-core longer.
	 */
	XFS_BUF_SET_REF(bp, XFS_INO_REF);

	/*
	 * Use xfs_trans_brelse() to release the buffer containing the
	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
	 * in xfs_itobp() above.  If tp is NULL, this is just a normal
	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
	 * will only release the buffer if it is not dirty within the
	 * transaction.  It will be OK to release the buffer in this case,
	 * because inodes on disk are never destroyed and we will be
	 * locking the new in-core inode before putting it in the hash
	 * table where other processes can find it.  Thus we don't have
	 * to worry about the inode being changed just because we released
	 * the buffer.
	 */
	xfs_trans_brelse(tp, bp);
	*ipp = ip;
	return 0;

 out_brelse:
	xfs_trans_brelse(tp, bp);
 out_destroy_inode:
	xfs_destroy_inode(ip);
	return error;
}
994 981
/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 *
 * Returns 0 on success, EFSCORRUPTED if the fork is not in btree
 * format, or the error from xfs_bmap_read_extents(); on error the
 * fork is restored to its "extents not read in" state.
 */
int
xfs_iread_extents(
	xfs_trans_t	*tp,		/* transaction, may be NULL */
	xfs_inode_t	*ip,		/* inode whose fork to read */
	int		whichfork)	/* XFS_DATA_FORK or XFS_ATTR_FORK */
{
	int		error;
	xfs_ifork_t	*ifp;
	xfs_extnum_t	nextents;
	size_t		size;

	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
	size = nextents * sizeof(xfs_bmbt_rec_t);
	ifp = XFS_IFORK_PTR(ip, whichfork);

	/*
	 * We know that the size is valid (it's checked in iformat_btree)
	 */
	ifp->if_lastex = NULLEXTNUM;
	ifp->if_bytes = ifp->if_real_bytes = 0;
	/* set XFS_IFEXTENTS before the read; cleared again on failure */
	ifp->if_flags |= XFS_IFEXTENTS;
	xfs_iext_add(ifp, 0, nextents);
	error = xfs_bmap_read_extents(tp, ip, whichfork);
	if (error) {
		xfs_iext_destroy(ifp);
		ifp->if_flags &= ~XFS_IFEXTENTS;
		return error;
	}
	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
	return 0;
}
1035 1022
1036 /* 1023 /*
1037 * Allocate an inode on disk and return a copy of its in-core version. 1024 * Allocate an inode on disk and return a copy of its in-core version.
1038 * The in-core inode is locked exclusively. Set mode, nlink, and rdev 1025 * The in-core inode is locked exclusively. Set mode, nlink, and rdev
1039 * appropriately within the inode. The uid and gid for the inode are 1026 * appropriately within the inode. The uid and gid for the inode are
1040 * set according to the contents of the given cred structure. 1027 * set according to the contents of the given cred structure.
1041 * 1028 *
1042 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 1029 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
1043 * has a free inode available, call xfs_iget() 1030 * has a free inode available, call xfs_iget()
1044 * to obtain the in-core version of the allocated inode. Finally, 1031 * to obtain the in-core version of the allocated inode. Finally,
1045 * fill in the inode and log its initial contents. In this case, 1032 * fill in the inode and log its initial contents. In this case,
1046 * ialloc_context would be set to NULL and call_again set to false. 1033 * ialloc_context would be set to NULL and call_again set to false.
1047 * 1034 *
1048 * If xfs_dialloc() does not have an available inode, 1035 * If xfs_dialloc() does not have an available inode,
1049 * it will replenish its supply by doing an allocation. Since we can 1036 * it will replenish its supply by doing an allocation. Since we can
1050 * only do one allocation within a transaction without deadlocks, we 1037 * only do one allocation within a transaction without deadlocks, we
1051 * must commit the current transaction before returning the inode itself. 1038 * must commit the current transaction before returning the inode itself.
1052 * In this case, therefore, we will set call_again to true and return. 1039 * In this case, therefore, we will set call_again to true and return.
1053 * The caller should then commit the current transaction, start a new 1040 * The caller should then commit the current transaction, start a new
1054 * transaction, and call xfs_ialloc() again to actually get the inode. 1041 * transaction, and call xfs_ialloc() again to actually get the inode.
1055 * 1042 *
1056 * To ensure that some other process does not grab the inode that 1043 * To ensure that some other process does not grab the inode that
1057 * was allocated during the first call to xfs_ialloc(), this routine 1044 * was allocated during the first call to xfs_ialloc(), this routine
1058 * also returns the [locked] bp pointing to the head of the freelist 1045 * also returns the [locked] bp pointing to the head of the freelist
1059 * as ialloc_context. The caller should hold this buffer across 1046 * as ialloc_context. The caller should hold this buffer across
1060 * the commit and pass it back into this routine on the second call. 1047 * the commit and pass it back into this routine on the second call.
1061 * 1048 *
1062 * If we are allocating quota inodes, we do not have a parent inode 1049 * If we are allocating quota inodes, we do not have a parent inode
1063 * to attach to or associate with (i.e. pip == NULL) because they 1050 * to attach to or associate with (i.e. pip == NULL) because they
1064 * are not linked into the directory structure - they are attached 1051 * are not linked into the directory structure - they are attached
1065 * directly to the superblock - and so have no parent. 1052 * directly to the superblock - and so have no parent.
1066 */ 1053 */
int
xfs_ialloc(
	xfs_trans_t	*tp,
	xfs_inode_t	*pip,
	mode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	cred_t		*cr,
	xfs_prid_t	prid,
	int		okalloc,
	xfs_buf_t	**ialloc_context,
	boolean_t	*call_again,
	xfs_inode_t	**ipp)
{
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	uint		flags;
	int		error;
	timespec_t	tv;
	int		filestreams = 0;

	/* NOTE(review): "cr" is not referenced anywhere in this body. */

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
	 */
	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
			    ialloc_context, call_again, &ino);
	if (error)
		return error;
	/*
	 * No inode yet: either the caller must commit and retry
	 * (*call_again set, see the comment above this function) or
	 * allocation genuinely failed (NULLFSINO).  Either way return
	 * success with *ipp == NULL.
	 */
	if (*call_again || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_trans_iget(tp->t_mountp, tp, ino,
				XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	ASSERT(ip != NULL);

	ip->i_d.di_mode = (__uint16_t)mode;
	ip->i_d.di_onlink = 0;
	ip->i_d.di_nlink = nlink;
	ASSERT(ip->i_d.di_nlink == nlink);
	ip->i_d.di_uid = current_fsuid();
	ip->i_d.di_gid = current_fsgid();
	ip->i_d.di_projid = prid;
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

	/*
	 * If the superblock version is up to where we support new format
	 * inodes and this is currently an old format inode, then change
	 * the inode version number now.  This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
	    ip->i_d.di_version == 1) {
		ip->i_d.di_version = 2;
		/*
		 * We've already zeroed the old link count, the projid field,
		 * and the pad field.
		 */
	}

	/*
	 * Project ids won't be stored on disk if we are using a version 1 inode.
	 */
	if ((prid != 0) && (ip->i_d.di_version == 1))
		xfs_bump_ino_vers2(tp, ip);

	/* Inherit group id (and setgid on new directories) from the parent. */
	if (pip && XFS_INHERIT_GID(pip)) {
		ip->i_d.di_gid = pip->i_d.di_gid;
		if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
			ip->i_d.di_mode |= S_ISGID;
		}
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if ((irix_sgid_inherit) &&
	    (ip->i_d.di_mode & S_ISGID) &&
	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
		ip->i_d.di_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_size = 0;
	ip->i_d.di_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);

	/* Stamp mtime/atime/ctime with the current time. */
	nanotime(&tv);
	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
	ip->i_d.di_atime = ip->i_d.di_mtime;
	ip->i_d.di_ctime = ip->i_d.di_mtime;

	/*
	 * di_gen will have been taken care of in xfs_iread.
	 */
	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;
	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		/* Special files store the device number in the data fork. */
		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_u2.if_rdev = rdev;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
		/*
		 * we can't set up filestreams until after the VFS inode
		 * is set up properly.
		 */
		if (pip && xfs_inode_is_filestream(pip))
			filestreams = 1;
		/* fall through */
	case S_IFDIR:
		/* Propagate inheritable flags from the parent directory. */
		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			uint	di_flags = 0;

			if ((mode & S_IFMT) == S_IFDIR) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			} else if ((mode & S_IFMT) == S_IFREG) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_REALTIME;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSIZE;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			}
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
			    xfs_inherit_noatime)
				di_flags |= XFS_DIFLAG_NOATIME;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
			    xfs_inherit_nodump)
				di_flags |= XFS_DIFLAG_NODUMP;
			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
			    xfs_inherit_sync)
				di_flags |= XFS_DIFLAG_SYNC;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
			    xfs_inherit_nosymlinks)
				di_flags |= XFS_DIFLAG_NOSYMLINKS;
			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
			    xfs_inherit_nodefrag)
				di_flags |= XFS_DIFLAG_NODEFRAG;
			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				di_flags |= XFS_DIFLAG_FILESTREAM;
			ip->i_d.di_flags |= di_flags;
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
		ip->i_df.if_u1.if_extents = NULL;
		break;
	default:
		/* All valid S_IFMT values are handled above. */
		ASSERT(0);
	}
	/*
	 * Attribute fork settings for new inode.
	 */
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_anextents = 0;

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup inode ops and unlock */
	xfs_setup_inode(ip);

	/* now we have set up the vfs inode we can associate the filestream */
	if (filestreams) {
		error = xfs_filestream_associate(pip, ip);
		if (error < 0)
			return -error;
		if (!error)
			xfs_iflags_set(ip, XFS_IFILESTREAM);
	}

	*ipp = ip;
	return 0;
}
1274 1261
/*
 * Debug-only sanity check: a plain regular file must not have any
 * blocks mapped beyond its size - everything past EOF should map to
 * a hole.  Files with realtime or fixed-extent-size semantics are
 * not checked.
 */
#ifdef DEBUG
void
xfs_isize_check(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip,
	xfs_fsize_t	isize)
{
	xfs_fileoff_t	first_fsb;
	xfs_bmbt_irec_t	maps[2];
	int		nmaps = 2;

	/* Skip anything that is not an ordinary regular file. */
	if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
		return;
	if (XFS_IS_REALTIME_INODE(ip))
		return;
	if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
		return;

	first_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);

	/*
	 * The filesystem could be shutting down, so bmapi may return
	 * an error; in that case just skip the check.
	 */
	if (xfs_bmapi(NULL, ip, first_fsb,
		      (XFS_B_TO_FSB(mp,
				    (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
		       first_fsb),
		      XFS_BMAPI_ENTIRE, NULL, 0, maps, &nmaps,
		      NULL, NULL))
		return;

	/* The whole post-EOF range must come back as a single hole. */
	ASSERT(nmaps == 1);
	ASSERT(maps[0].br_startblock == HOLESTARTBLOCK);
}
#endif	/* DEBUG */
1318 1305
1319 /* 1306 /*
1320 * Calculate the last possible buffered byte in a file. This must 1307 * Calculate the last possible buffered byte in a file. This must
1321 * include data that was buffered beyond the EOF by the write code. 1308 * include data that was buffered beyond the EOF by the write code.
1322 * This also needs to deal with overflowing the xfs_fsize_t type 1309 * This also needs to deal with overflowing the xfs_fsize_t type
1323 * which can happen for sizes near the limit. 1310 * which can happen for sizes near the limit.
1324 * 1311 *
1325 * We also need to take into account any blocks beyond the EOF. It 1312 * We also need to take into account any blocks beyond the EOF. It
1326 * may be the case that they were buffered by a write which failed. 1313 * may be the case that they were buffered by a write which failed.
1327 * In that case the pages will still be in memory, but the inode size 1314 * In that case the pages will still be in memory, but the inode size
1328 * will never have been updated. 1315 * will never have been updated.
1329 */ 1316 */
1330 xfs_fsize_t 1317 xfs_fsize_t
1331 xfs_file_last_byte( 1318 xfs_file_last_byte(
1332 xfs_inode_t *ip) 1319 xfs_inode_t *ip)
1333 { 1320 {
1334 xfs_mount_t *mp; 1321 xfs_mount_t *mp;
1335 xfs_fsize_t last_byte; 1322 xfs_fsize_t last_byte;
1336 xfs_fileoff_t last_block; 1323 xfs_fileoff_t last_block;
1337 xfs_fileoff_t size_last_block; 1324 xfs_fileoff_t size_last_block;
1338 int error; 1325 int error;
1339 1326
1340 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); 1327 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
1341 1328
1342 mp = ip->i_mount; 1329 mp = ip->i_mount;
1343 /* 1330 /*
1344 * Only check for blocks beyond the EOF if the extents have 1331 * Only check for blocks beyond the EOF if the extents have
1345 * been read in. This eliminates the need for the inode lock, 1332 * been read in. This eliminates the need for the inode lock,
1346 * and it also saves us from looking when it really isn't 1333 * and it also saves us from looking when it really isn't
1347 * necessary. 1334 * necessary.
1348 */ 1335 */
1349 if (ip->i_df.if_flags & XFS_IFEXTENTS) { 1336 if (ip->i_df.if_flags & XFS_IFEXTENTS) {
1350 error = xfs_bmap_last_offset(NULL, ip, &last_block, 1337 error = xfs_bmap_last_offset(NULL, ip, &last_block,
1351 XFS_DATA_FORK); 1338 XFS_DATA_FORK);
1352 if (error) { 1339 if (error) {
1353 last_block = 0; 1340 last_block = 0;
1354 } 1341 }
1355 } else { 1342 } else {
1356 last_block = 0; 1343 last_block = 0;
1357 } 1344 }
1358 size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size); 1345 size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size);
1359 last_block = XFS_FILEOFF_MAX(last_block, size_last_block); 1346 last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
1360 1347
1361 last_byte = XFS_FSB_TO_B(mp, last_block); 1348 last_byte = XFS_FSB_TO_B(mp, last_block);
1362 if (last_byte < 0) { 1349 if (last_byte < 0) {
1363 return XFS_MAXIOFFSET(mp); 1350 return XFS_MAXIOFFSET(mp);
1364 } 1351 }
1365 last_byte += (1 << mp->m_writeio_log); 1352 last_byte += (1 << mp->m_writeio_log);
1366 if (last_byte < 0) { 1353 if (last_byte < 0) {
1367 return XFS_MAXIOFFSET(mp); 1354 return XFS_MAXIOFFSET(mp);
1368 } 1355 }
1369 return last_byte; 1356 return last_byte;
1370 } 1357 }
1371 1358
#if defined(XFS_RW_TRACE)
/*
 * Record a truncate event in the inode's per-inode ktrace buffer.
 * The 64-bit size/offset arguments are split into high/low 32-bit
 * halves because each ktrace slot holds a single pointer-sized value.
 * No-op (and compiled out entirely, see #else below) when tracing is
 * disabled or the inode has no trace buffer.
 */
STATIC void
xfs_itrunc_trace(
	int		tag,
	xfs_inode_t	*ip,
	int		flag,
	xfs_fsize_t	new_size,
	xfs_off_t	toss_start,
	xfs_off_t	toss_finish)
{
	/* Nothing to do if no trace buffer was allocated for this inode. */
	if (ip->i_rwtrace == NULL) {
		return;
	}

	ktrace_enter(ip->i_rwtrace,
		     (void*)((long)tag),
		     (void*)ip,
		     (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
		     (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
		     (void*)((long)flag),
		     (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
		     (void*)(unsigned long)(new_size & 0xffffffff),
		     (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
		     (void*)(unsigned long)(toss_start & 0xffffffff),
		     (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
		     (void*)(unsigned long)(toss_finish & 0xffffffff),
		     (void*)(unsigned long)current_cpu(),
		     (void*)(unsigned long)current_pid(),
		     (void*)NULL,
		     (void*)NULL,
		     (void*)NULL);
}
#else
#define	xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
#endif
1407 1394
1408 /* 1395 /*
1409 * Start the truncation of the file to new_size. The new size 1396 * Start the truncation of the file to new_size. The new size
1410 * must be smaller than the current size. This routine will 1397 * must be smaller than the current size. This routine will
1411 * clear the buffer and page caches of file data in the removed 1398 * clear the buffer and page caches of file data in the removed
1412 * range, and xfs_itruncate_finish() will remove the underlying 1399 * range, and xfs_itruncate_finish() will remove the underlying
1413 * disk blocks. 1400 * disk blocks.
1414 * 1401 *
1415 * The inode must have its I/O lock locked EXCLUSIVELY, and it 1402 * The inode must have its I/O lock locked EXCLUSIVELY, and it
1416 * must NOT have the inode lock held at all. This is because we're 1403 * must NOT have the inode lock held at all. This is because we're
1417 * calling into the buffer/page cache code and we can't hold the 1404 * calling into the buffer/page cache code and we can't hold the
1418 * inode lock when we do so. 1405 * inode lock when we do so.
1419 * 1406 *
1420 * We need to wait for any direct I/Os in flight to complete before we 1407 * We need to wait for any direct I/Os in flight to complete before we
1421 * proceed with the truncate. This is needed to prevent the extents 1408 * proceed with the truncate. This is needed to prevent the extents
1422 * being read or written by the direct I/Os from being removed while the 1409 * being read or written by the direct I/Os from being removed while the
1423 * I/O is in flight as there is no other method of synchronising 1410 * I/O is in flight as there is no other method of synchronising
1424 * direct I/O with the truncate operation. Also, because we hold 1411 * direct I/O with the truncate operation. Also, because we hold
1425 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being 1412 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
1426 * started until the truncate completes and drops the lock. Essentially, 1413 * started until the truncate completes and drops the lock. Essentially,
1427 * the vn_iowait() call forms an I/O barrier that provides strict ordering 1414 * the vn_iowait() call forms an I/O barrier that provides strict ordering
1428 * between direct I/Os and the truncate operation. 1415 * between direct I/Os and the truncate operation.
1429 * 1416 *
1430 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE 1417 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1431 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used 1418 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
1432 * in the case that the caller is locking things out of order and 1419 * in the case that the caller is locking things out of order and
1433 * may not be able to call xfs_itruncate_finish() with the inode lock 1420 * may not be able to call xfs_itruncate_finish() with the inode lock
1434 * held without dropping the I/O lock. If the caller must drop the 1421 * held without dropping the I/O lock. If the caller must drop the
1435 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() 1422 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
1436 * must be called again with all the same restrictions as the initial 1423 * must be called again with all the same restrictions as the initial
1437 * call. 1424 * call.
1438 */ 1425 */
int
xfs_itruncate_start(
	xfs_inode_t	*ip,
	uint		flags,
	xfs_fsize_t	new_size)
{
	xfs_fsize_t	last_byte;
	xfs_off_t	toss_start;
	xfs_mount_t	*mp;
	int		error = 0;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT((new_size == 0) || (new_size <= ip->i_size));
	ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
	       (flags == XFS_ITRUNC_MAYBE));

	mp = ip->i_mount;

	/* wait for the completion of any pending DIOs */
	if (new_size == 0 || new_size < ip->i_size)
		vn_iowait(ip);

	/*
	 * Call toss_pages or flushinval_pages to get rid of pages
	 * overlapping the region being removed.  We have to use
	 * the less efficient flushinval_pages in the case that the
	 * caller may not be able to finish the truncate without
	 * dropping the inode's I/O lock.  Make sure
	 * to catch any pages brought in by buffers overlapping
	 * the EOF by searching out beyond the isize by our
	 * block size.  We round new_size up to a block boundary
	 * so that we don't toss things on the same block as
	 * new_size but before it.
	 *
	 * Before calling toss_page or flushinval_pages, make sure to
	 * call remapf() over the same region if the file is mapped.
	 * This frees up mapped file references to the pages in the
	 * given range and for the flushinval_pages case it ensures
	 * that we get the latest mapped changes flushed out.
	 */
	toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	toss_start = XFS_FSB_TO_B(mp, toss_start);
	if (toss_start < 0) {
		/*
		 * The place to start tossing is beyond our maximum
		 * file size, so there is no way that the data extended
		 * out there.
		 */
		return 0;
	}
	last_byte = xfs_file_last_byte(ip);
	xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
			 last_byte);
	if (last_byte > toss_start) {
		if (flags & XFS_ITRUNC_DEFINITE) {
			/* Caller keeps the I/O lock: tossing is safe. */
			xfs_tosspages(ip, toss_start,
					-1, FI_REMAPF_LOCKED);
		} else {
			/* Caller may drop the I/O lock: flush+invalidate. */
			error = xfs_flushinval_pages(ip, toss_start,
					-1, FI_REMAPF_LOCKED);
		}
	}

#ifdef DEBUG
	/* Truncating to zero must leave no cached pages behind. */
	if (new_size == 0) {
		ASSERT(VN_CACHED(VFS_I(ip)) == 0);
	}
#endif
	return error;
}
1509 1496
1510 /* 1497 /*
1511 * Shrink the file to the given new_size. The new size must be smaller than 1498 * Shrink the file to the given new_size. The new size must be smaller than
1512 * the current size. This will free up the underlying blocks in the removed 1499 * the current size. This will free up the underlying blocks in the removed
1513 * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). 1500 * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
1514 * 1501 *
1515 * The transaction passed to this routine must have made a permanent log 1502 * The transaction passed to this routine must have made a permanent log
1516 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the 1503 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
1517 * given transaction and start new ones, so make sure everything involved in 1504 * given transaction and start new ones, so make sure everything involved in
1518 * the transaction is tidy before calling here. Some transaction will be 1505 * the transaction is tidy before calling here. Some transaction will be
1519 * returned to the caller to be committed. The incoming transaction must 1506 * returned to the caller to be committed. The incoming transaction must
1520 * already include the inode, and both inode locks must be held exclusively. 1507 * already include the inode, and both inode locks must be held exclusively.
1521 * The inode must also be "held" within the transaction. On return the inode 1508 * The inode must also be "held" within the transaction. On return the inode
1522 * will be "held" within the returned transaction. This routine does NOT 1509 * will be "held" within the returned transaction. This routine does NOT
1523 * require any disk space to be reserved for it within the transaction. 1510 * require any disk space to be reserved for it within the transaction.
1524 * 1511 *
1525 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it 1512 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
1526 * indicates the fork which is to be truncated. For the attribute fork we only 1513 * indicates the fork which is to be truncated. For the attribute fork we only
1527 * support truncation to size 0. 1514 * support truncation to size 0.
1528 * 1515 *
1529 * We use the sync parameter to indicate whether or not the first transaction 1516 * We use the sync parameter to indicate whether or not the first transaction
1530 * we perform might have to be synchronous. For the attr fork, it needs to be 1517 * we perform might have to be synchronous. For the attr fork, it needs to be
1531 * so if the unlink of the inode is not yet known to be permanent in the log. 1518 * so if the unlink of the inode is not yet known to be permanent in the log.
1532 * This keeps us from freeing and reusing the blocks of the attribute fork 1519 * This keeps us from freeing and reusing the blocks of the attribute fork
1533 * before the unlink of the inode becomes permanent. 1520 * before the unlink of the inode becomes permanent.
1534 * 1521 *
1535 * For the data fork, we normally have to run synchronously if we're being 1522 * For the data fork, we normally have to run synchronously if we're being
1536 * called out of the inactive path or we're being called out of the create path 1523 * called out of the inactive path or we're being called out of the create path
1537 * where we're truncating an existing file. Either way, the truncate needs to 1524 * where we're truncating an existing file. Either way, the truncate needs to
1538 * be sync so blocks don't reappear in the file with altered data in case of a 1525 * be sync so blocks don't reappear in the file with altered data in case of a
1539 * crash. wsync filesystems can run the first case async because anything that 1526 * crash. wsync filesystems can run the first case async because anything that
1540 * shrinks the inode has to run sync so by the time we're called here from 1527 * shrinks the inode has to run sync so by the time we're called here from
1541 * inactive, the inode size is permanently set to 0. 1528 * inactive, the inode size is permanently set to 0.
1542 * 1529 *
1543 * Calls from the truncate path always need to be sync unless we're in a wsync 1530 * Calls from the truncate path always need to be sync unless we're in a wsync
1544 * filesystem and the file has already been unlinked. 1531 * filesystem and the file has already been unlinked.
1545 * 1532 *
1546 * The caller is responsible for correctly setting the sync parameter. It gets 1533 * The caller is responsible for correctly setting the sync parameter. It gets
1547 * too hard for us to guess here which path we're being called out of just 1534 * too hard for us to guess here which path we're being called out of just
1548 * based on inode state. 1535 * based on inode state.
1549 * 1536 *
1550 * If we get an error, we must return with the inode locked and linked into the 1537 * If we get an error, we must return with the inode locked and linked into the
1551 * current transaction. This keeps things simple for the higher level code, 1538 * current transaction. This keeps things simple for the higher level code,
1552 * because it always knows that the inode is locked and held in the transaction 1539 * because it always knows that the inode is locked and held in the transaction
1553 * that returns to it whether errors occur or not. We don't mark the inode 1540 * that returns to it whether errors occur or not. We don't mark the inode
1554 * dirty on error so that transactions can be easily aborted if possible. 1541 * dirty on error so that transactions can be easily aborted if possible.
1555 */ 1542 */
1556 int 1543 int
1557 xfs_itruncate_finish( 1544 xfs_itruncate_finish(
1558 xfs_trans_t **tp, 1545 xfs_trans_t **tp,
1559 xfs_inode_t *ip, 1546 xfs_inode_t *ip,
1560 xfs_fsize_t new_size, 1547 xfs_fsize_t new_size,
1561 int fork, 1548 int fork,
1562 int sync) 1549 int sync)
1563 { 1550 {
1564 xfs_fsblock_t first_block; 1551 xfs_fsblock_t first_block;
1565 xfs_fileoff_t first_unmap_block; 1552 xfs_fileoff_t first_unmap_block;
1566 xfs_fileoff_t last_block; 1553 xfs_fileoff_t last_block;
1567 xfs_filblks_t unmap_len=0; 1554 xfs_filblks_t unmap_len=0;
1568 xfs_mount_t *mp; 1555 xfs_mount_t *mp;
1569 xfs_trans_t *ntp; 1556 xfs_trans_t *ntp;
1570 int done; 1557 int done;
1571 int committed; 1558 int committed;
1572 xfs_bmap_free_t free_list; 1559 xfs_bmap_free_t free_list;
1573 int error; 1560 int error;
1574 1561
1575 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 1562 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1576 ASSERT((new_size == 0) || (new_size <= ip->i_size)); 1563 ASSERT((new_size == 0) || (new_size <= ip->i_size));
1577 ASSERT(*tp != NULL); 1564 ASSERT(*tp != NULL);
1578 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 1565 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1579 ASSERT(ip->i_transp == *tp); 1566 ASSERT(ip->i_transp == *tp);
1580 ASSERT(ip->i_itemp != NULL); 1567 ASSERT(ip->i_itemp != NULL);
1581 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); 1568 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
1582 1569
1583 1570
1584 ntp = *tp; 1571 ntp = *tp;
1585 mp = (ntp)->t_mountp; 1572 mp = (ntp)->t_mountp;
1586 ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); 1573 ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
1587 1574
1588 /* 1575 /*
1589 * We only support truncating the entire attribute fork. 1576 * We only support truncating the entire attribute fork.
1590 */ 1577 */
1591 if (fork == XFS_ATTR_FORK) { 1578 if (fork == XFS_ATTR_FORK) {
1592 new_size = 0LL; 1579 new_size = 0LL;
1593 } 1580 }
1594 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1581 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1595 xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0); 1582 xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
1596 /* 1583 /*
1597 * The first thing we do is set the size to new_size permanently 1584 * The first thing we do is set the size to new_size permanently
1598 * on disk. This way we don't have to worry about anyone ever 1585 * on disk. This way we don't have to worry about anyone ever
1599 * being able to look at the data being freed even in the face 1586 * being able to look at the data being freed even in the face
1600 * of a crash. What we're getting around here is the case where 1587 * of a crash. What we're getting around here is the case where
1601 * we free a block, it is allocated to another file, it is written 1588 * we free a block, it is allocated to another file, it is written
1602 * to, and then we crash. If the new data gets written to the 1589 * to, and then we crash. If the new data gets written to the
1603 * file but the log buffers containing the free and reallocation 1590 * file but the log buffers containing the free and reallocation
1604 * don't, then we'd end up with garbage in the blocks being freed. 1591 * don't, then we'd end up with garbage in the blocks being freed.
1605 * As long as we make the new_size permanent before actually 1592 * As long as we make the new_size permanent before actually
1606 * freeing any blocks it doesn't matter if they get writtten to. 1593 * freeing any blocks it doesn't matter if they get writtten to.
1607 * 1594 *
1608 * The callers must signal into us whether or not the size 1595 * The callers must signal into us whether or not the size
1609 * setting here must be synchronous. There are a few cases 1596 * setting here must be synchronous. There are a few cases
1610 * where it doesn't have to be synchronous. Those cases 1597 * where it doesn't have to be synchronous. Those cases
1611 * occur if the file is unlinked and we know the unlink is 1598 * occur if the file is unlinked and we know the unlink is
1612 * permanent or if the blocks being truncated are guaranteed 1599 * permanent or if the blocks being truncated are guaranteed
1613 * to be beyond the inode eof (regardless of the link count) 1600 * to be beyond the inode eof (regardless of the link count)
1614 * and the eof value is permanent. Both of these cases occur 1601 * and the eof value is permanent. Both of these cases occur
1615 * only on wsync-mounted filesystems. In those cases, we're 1602 * only on wsync-mounted filesystems. In those cases, we're
1616 * guaranteed that no user will ever see the data in the blocks 1603 * guaranteed that no user will ever see the data in the blocks
1617 * that are being truncated so the truncate can run async. 1604 * that are being truncated so the truncate can run async.
1618 * In the free beyond eof case, the file may wind up with 1605 * In the free beyond eof case, the file may wind up with
1619 * more blocks allocated to it than it needs if we crash 1606 * more blocks allocated to it than it needs if we crash
1620 * and that won't get fixed until the next time the file 1607 * and that won't get fixed until the next time the file
1621 * is re-opened and closed but that's ok as that shouldn't 1608 * is re-opened and closed but that's ok as that shouldn't
1622 * be too many blocks. 1609 * be too many blocks.
1623 * 1610 *
1624 * However, we can't just make all wsync xactions run async 1611 * However, we can't just make all wsync xactions run async
1625 * because there's one call out of the create path that needs 1612 * because there's one call out of the create path that needs
1626 * to run sync where it's truncating an existing file to size 1613 * to run sync where it's truncating an existing file to size
1627 * 0 whose size is > 0. 1614 * 0 whose size is > 0.
1628 * 1615 *
1629 * It's probably possible to come up with a test in this 1616 * It's probably possible to come up with a test in this
1630 * routine that would correctly distinguish all the above 1617 * routine that would correctly distinguish all the above
1631 * cases from the values of the function parameters and the 1618 * cases from the values of the function parameters and the
1632 * inode state but for sanity's sake, I've decided to let the 1619 * inode state but for sanity's sake, I've decided to let the
1633 * layers above just tell us. It's simpler to correctly figure 1620 * layers above just tell us. It's simpler to correctly figure
1634 * out in the layer above exactly under what conditions we 1621 * out in the layer above exactly under what conditions we
1635 * can run async and I think it's easier for others read and 1622 * can run async and I think it's easier for others read and
1636 * follow the logic in case something has to be changed. 1623 * follow the logic in case something has to be changed.
1637 * cscope is your friend -- rcc. 1624 * cscope is your friend -- rcc.
1638 * 1625 *
1639 * The attribute fork is much simpler. 1626 * The attribute fork is much simpler.
1640 * 1627 *
1641 * For the attribute fork we allow the caller to tell us whether 1628 * For the attribute fork we allow the caller to tell us whether
1642 * the unlink of the inode that led to this call is yet permanent 1629 * the unlink of the inode that led to this call is yet permanent
1643 * in the on disk log. If it is not and we will be freeing extents 1630 * in the on disk log. If it is not and we will be freeing extents
1644 * in this inode then we make the first transaction synchronous 1631 * in this inode then we make the first transaction synchronous
1645 * to make sure that the unlink is permanent by the time we free 1632 * to make sure that the unlink is permanent by the time we free
1646 * the blocks. 1633 * the blocks.
1647 */ 1634 */
1648 if (fork == XFS_DATA_FORK) { 1635 if (fork == XFS_DATA_FORK) {
1649 if (ip->i_d.di_nextents > 0) { 1636 if (ip->i_d.di_nextents > 0) {
1650 /* 1637 /*
1651 * If we are not changing the file size then do 1638 * If we are not changing the file size then do
1652 * not update the on-disk file size - we may be 1639 * not update the on-disk file size - we may be
1653 * called from xfs_inactive_free_eofblocks(). If we 1640 * called from xfs_inactive_free_eofblocks(). If we
1654 * update the on-disk file size and then the system 1641 * update the on-disk file size and then the system
1655 * crashes before the contents of the file are 1642 * crashes before the contents of the file are
1656 * flushed to disk then the files may be full of 1643 * flushed to disk then the files may be full of
1657 * holes (ie NULL files bug). 1644 * holes (ie NULL files bug).
1658 */ 1645 */
1659 if (ip->i_size != new_size) { 1646 if (ip->i_size != new_size) {
1660 ip->i_d.di_size = new_size; 1647 ip->i_d.di_size = new_size;
1661 ip->i_size = new_size; 1648 ip->i_size = new_size;
1662 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1649 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1663 } 1650 }
1664 } 1651 }
1665 } else if (sync) { 1652 } else if (sync) {
1666 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); 1653 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
1667 if (ip->i_d.di_anextents > 0) 1654 if (ip->i_d.di_anextents > 0)
1668 xfs_trans_set_sync(ntp); 1655 xfs_trans_set_sync(ntp);
1669 } 1656 }
1670 ASSERT(fork == XFS_DATA_FORK || 1657 ASSERT(fork == XFS_DATA_FORK ||
1671 (fork == XFS_ATTR_FORK && 1658 (fork == XFS_ATTR_FORK &&
1672 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) || 1659 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
1673 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC))))); 1660 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
1674 1661
1675 /* 1662 /*
1676 * Since it is possible for space to become allocated beyond 1663 * Since it is possible for space to become allocated beyond
1677 * the end of the file (in a crash where the space is allocated 1664 * the end of the file (in a crash where the space is allocated
1678 * but the inode size is not yet updated), simply remove any 1665 * but the inode size is not yet updated), simply remove any
1679 * blocks which show up between the new EOF and the maximum 1666 * blocks which show up between the new EOF and the maximum
1680 * possible file size. If the first block to be removed is 1667 * possible file size. If the first block to be removed is
1681 * beyond the maximum file size (ie it is the same as last_block), 1668 * beyond the maximum file size (ie it is the same as last_block),
1682 * then there is nothing to do. 1669 * then there is nothing to do.
1683 */ 1670 */
1684 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 1671 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1685 ASSERT(first_unmap_block <= last_block); 1672 ASSERT(first_unmap_block <= last_block);
1686 done = 0; 1673 done = 0;
1687 if (last_block == first_unmap_block) { 1674 if (last_block == first_unmap_block) {
1688 done = 1; 1675 done = 1;
1689 } else { 1676 } else {
1690 unmap_len = last_block - first_unmap_block + 1; 1677 unmap_len = last_block - first_unmap_block + 1;
1691 } 1678 }
1692 while (!done) { 1679 while (!done) {
1693 /* 1680 /*
1694 * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() 1681 * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi()
1695 * will tell us whether it freed the entire range or 1682 * will tell us whether it freed the entire range or
1696 * not. If this is a synchronous mount (wsync), 1683 * not. If this is a synchronous mount (wsync),
1697 * then we can tell bunmapi to keep all the 1684 * then we can tell bunmapi to keep all the
1698 * transactions asynchronous since the unlink 1685 * transactions asynchronous since the unlink
1699 * transaction that made this inode inactive has 1686 * transaction that made this inode inactive has
1700 * already hit the disk. There's no danger of 1687 * already hit the disk. There's no danger of
1701 * the freed blocks being reused, there being a 1688 * the freed blocks being reused, there being a
1702 * crash, and the reused blocks suddenly reappearing 1689 * crash, and the reused blocks suddenly reappearing
1703 * in this file with garbage in them once recovery 1690 * in this file with garbage in them once recovery
1704 * runs. 1691 * runs.
1705 */ 1692 */
1706 XFS_BMAP_INIT(&free_list, &first_block); 1693 XFS_BMAP_INIT(&free_list, &first_block);
1707 error = xfs_bunmapi(ntp, ip, 1694 error = xfs_bunmapi(ntp, ip,
1708 first_unmap_block, unmap_len, 1695 first_unmap_block, unmap_len,
1709 XFS_BMAPI_AFLAG(fork) | 1696 XFS_BMAPI_AFLAG(fork) |
1710 (sync ? 0 : XFS_BMAPI_ASYNC), 1697 (sync ? 0 : XFS_BMAPI_ASYNC),
1711 XFS_ITRUNC_MAX_EXTENTS, 1698 XFS_ITRUNC_MAX_EXTENTS,
1712 &first_block, &free_list, 1699 &first_block, &free_list,
1713 NULL, &done); 1700 NULL, &done);
1714 if (error) { 1701 if (error) {
1715 /* 1702 /*
1716 * If the bunmapi call encounters an error, 1703 * If the bunmapi call encounters an error,
1717 * return to the caller where the transaction 1704 * return to the caller where the transaction
1718 * can be properly aborted. We just need to 1705 * can be properly aborted. We just need to
1719 * make sure we're not holding any resources 1706 * make sure we're not holding any resources
1720 * that we were not when we came in. 1707 * that we were not when we came in.
1721 */ 1708 */
1722 xfs_bmap_cancel(&free_list); 1709 xfs_bmap_cancel(&free_list);
1723 return error; 1710 return error;
1724 } 1711 }
1725 1712
1726 /* 1713 /*
1727 * Duplicate the transaction that has the permanent 1714 * Duplicate the transaction that has the permanent
1728 * reservation and commit the old transaction. 1715 * reservation and commit the old transaction.
1729 */ 1716 */
1730 error = xfs_bmap_finish(tp, &free_list, &committed); 1717 error = xfs_bmap_finish(tp, &free_list, &committed);
1731 ntp = *tp; 1718 ntp = *tp;
1732 if (committed) { 1719 if (committed) {
1733 /* link the inode into the next xact in the chain */ 1720 /* link the inode into the next xact in the chain */
1734 xfs_trans_ijoin(ntp, ip, 1721 xfs_trans_ijoin(ntp, ip,
1735 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1722 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1736 xfs_trans_ihold(ntp, ip); 1723 xfs_trans_ihold(ntp, ip);
1737 } 1724 }
1738 1725
1739 if (error) { 1726 if (error) {
1740 /* 1727 /*
1741 * If the bmap finish call encounters an error, return 1728 * If the bmap finish call encounters an error, return
1742 * to the caller where the transaction can be properly 1729 * to the caller where the transaction can be properly
1743 * aborted. We just need to make sure we're not 1730 * aborted. We just need to make sure we're not
1744 * holding any resources that we were not when we came 1731 * holding any resources that we were not when we came
1745 * in. 1732 * in.
1746 * 1733 *
1747 * Aborting from this point might lose some blocks in 1734 * Aborting from this point might lose some blocks in
1748 * the file system, but oh well. 1735 * the file system, but oh well.
1749 */ 1736 */
1750 xfs_bmap_cancel(&free_list); 1737 xfs_bmap_cancel(&free_list);
1751 return error; 1738 return error;
1752 } 1739 }
1753 1740
1754 if (committed) { 1741 if (committed) {
1755 /* 1742 /*
1756 * Mark the inode dirty so it will be logged and 1743 * Mark the inode dirty so it will be logged and
1757 * moved forward in the log as part of every commit. 1744 * moved forward in the log as part of every commit.
1758 */ 1745 */
1759 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1746 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1760 } 1747 }
1761 1748
1762 ntp = xfs_trans_dup(ntp); 1749 ntp = xfs_trans_dup(ntp);
1763 error = xfs_trans_commit(*tp, 0); 1750 error = xfs_trans_commit(*tp, 0);
1764 *tp = ntp; 1751 *tp = ntp;
1765 1752
1766 /* link the inode into the next transaction in the chain */ 1753 /* link the inode into the next transaction in the chain */
1767 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1754 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1768 xfs_trans_ihold(ntp, ip); 1755 xfs_trans_ihold(ntp, ip);
1769 1756
1770 if (error) 1757 if (error)
1771 return error; 1758 return error;
1772 /* 1759 /*
1773 * transaction commit worked ok so we can drop the extra ticket 1760 * transaction commit worked ok so we can drop the extra ticket
1774 * reference that we gained in xfs_trans_dup() 1761 * reference that we gained in xfs_trans_dup()
1775 */ 1762 */
1776 xfs_log_ticket_put(ntp->t_ticket); 1763 xfs_log_ticket_put(ntp->t_ticket);
1777 error = xfs_trans_reserve(ntp, 0, 1764 error = xfs_trans_reserve(ntp, 0,
1778 XFS_ITRUNCATE_LOG_RES(mp), 0, 1765 XFS_ITRUNCATE_LOG_RES(mp), 0,
1779 XFS_TRANS_PERM_LOG_RES, 1766 XFS_TRANS_PERM_LOG_RES,
1780 XFS_ITRUNCATE_LOG_COUNT); 1767 XFS_ITRUNCATE_LOG_COUNT);
1781 if (error) 1768 if (error)
1782 return error; 1769 return error;
1783 } 1770 }
1784 /* 1771 /*
1785 * Only update the size in the case of the data fork, but 1772 * Only update the size in the case of the data fork, but
1786 * always re-log the inode so that our permanent transaction 1773 * always re-log the inode so that our permanent transaction
1787 * can keep on rolling it forward in the log. 1774 * can keep on rolling it forward in the log.
1788 */ 1775 */
1789 if (fork == XFS_DATA_FORK) { 1776 if (fork == XFS_DATA_FORK) {
1790 xfs_isize_check(mp, ip, new_size); 1777 xfs_isize_check(mp, ip, new_size);
1791 /* 1778 /*
1792 * If we are not changing the file size then do 1779 * If we are not changing the file size then do
1793 * not update the on-disk file size - we may be 1780 * not update the on-disk file size - we may be
1794 * called from xfs_inactive_free_eofblocks(). If we 1781 * called from xfs_inactive_free_eofblocks(). If we
1795 * update the on-disk file size and then the system 1782 * update the on-disk file size and then the system
1796 * crashes before the contents of the file are 1783 * crashes before the contents of the file are
1797 * flushed to disk then the files may be full of 1784 * flushed to disk then the files may be full of
1798 * holes (ie NULL files bug). 1785 * holes (ie NULL files bug).
1799 */ 1786 */
1800 if (ip->i_size != new_size) { 1787 if (ip->i_size != new_size) {
1801 ip->i_d.di_size = new_size; 1788 ip->i_d.di_size = new_size;
1802 ip->i_size = new_size; 1789 ip->i_size = new_size;
1803 } 1790 }
1804 } 1791 }
1805 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1792 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1806 ASSERT((new_size != 0) || 1793 ASSERT((new_size != 0) ||
1807 (fork == XFS_ATTR_FORK) || 1794 (fork == XFS_ATTR_FORK) ||
1808 (ip->i_delayed_blks == 0)); 1795 (ip->i_delayed_blks == 0));
1809 ASSERT((new_size != 0) || 1796 ASSERT((new_size != 0) ||
1810 (fork == XFS_ATTR_FORK) || 1797 (fork == XFS_ATTR_FORK) ||
1811 (ip->i_d.di_nextents == 0)); 1798 (ip->i_d.di_nextents == 0));
1812 xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0); 1799 xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
1813 return 0; 1800 return 0;
1814 } 1801 }
1815 1802
1816 /* 1803 /*
1817 * This is called when the inode's link count goes to 0. 1804 * This is called when the inode's link count goes to 0.
1818 * We place the on-disk inode on a list in the AGI. It 1805 * We place the on-disk inode on a list in the AGI. It
1819 * will be pulled from this list when the inode is freed. 1806 * will be pulled from this list when the inode is freed.
1820 */ 1807 */
1821 int 1808 int
1822 xfs_iunlink( 1809 xfs_iunlink(
1823 xfs_trans_t *tp, 1810 xfs_trans_t *tp,
1824 xfs_inode_t *ip) 1811 xfs_inode_t *ip)
1825 { 1812 {
1826 xfs_mount_t *mp; 1813 xfs_mount_t *mp;
1827 xfs_agi_t *agi; 1814 xfs_agi_t *agi;
1828 xfs_dinode_t *dip; 1815 xfs_dinode_t *dip;
1829 xfs_buf_t *agibp; 1816 xfs_buf_t *agibp;
1830 xfs_buf_t *ibp; 1817 xfs_buf_t *ibp;
1831 xfs_agino_t agino; 1818 xfs_agino_t agino;
1832 short bucket_index; 1819 short bucket_index;
1833 int offset; 1820 int offset;
1834 int error; 1821 int error;
1835 1822
1836 ASSERT(ip->i_d.di_nlink == 0); 1823 ASSERT(ip->i_d.di_nlink == 0);
1837 ASSERT(ip->i_d.di_mode != 0); 1824 ASSERT(ip->i_d.di_mode != 0);
1838 ASSERT(ip->i_transp == tp); 1825 ASSERT(ip->i_transp == tp);
1839 1826
1840 mp = tp->t_mountp; 1827 mp = tp->t_mountp;
1841 1828
1842 /* 1829 /*
1843 * Get the agi buffer first. It ensures lock ordering 1830 * Get the agi buffer first. It ensures lock ordering
1844 * on the list. 1831 * on the list.
1845 */ 1832 */
1846 error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); 1833 error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1847 if (error) 1834 if (error)
1848 return error; 1835 return error;
1849 agi = XFS_BUF_TO_AGI(agibp); 1836 agi = XFS_BUF_TO_AGI(agibp);
1850 1837
1851 /* 1838 /*
1852 * Get the index into the agi hash table for the 1839 * Get the index into the agi hash table for the
1853 * list this inode will go on. 1840 * list this inode will go on.
1854 */ 1841 */
1855 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1842 agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1856 ASSERT(agino != 0); 1843 ASSERT(agino != 0);
1857 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1844 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1858 ASSERT(agi->agi_unlinked[bucket_index]); 1845 ASSERT(agi->agi_unlinked[bucket_index]);
1859 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); 1846 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1860 1847
1861 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { 1848 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) {
1862 /* 1849 /*
1863 * There is already another inode in the bucket we need 1850 * There is already another inode in the bucket we need
1864 * to add ourselves to. Add us at the front of the list. 1851 * to add ourselves to. Add us at the front of the list.
1865 * Here we put the head pointer into our next pointer, 1852 * Here we put the head pointer into our next pointer,
1866 * and then we fall through to point the head at us. 1853 * and then we fall through to point the head at us.
1867 */ 1854 */
1868 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1855 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1869 if (error) 1856 if (error)
1870 return error; 1857 return error;
1871 1858
1872 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1859 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
1873 /* both on-disk, don't endian flip twice */ 1860 /* both on-disk, don't endian flip twice */
1874 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1861 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1875 offset = ip->i_boffset + 1862 offset = ip->i_imap.im_boffset +
1876 offsetof(xfs_dinode_t, di_next_unlinked); 1863 offsetof(xfs_dinode_t, di_next_unlinked);
1877 xfs_trans_inode_buf(tp, ibp); 1864 xfs_trans_inode_buf(tp, ibp);
1878 xfs_trans_log_buf(tp, ibp, offset, 1865 xfs_trans_log_buf(tp, ibp, offset,
1879 (offset + sizeof(xfs_agino_t) - 1)); 1866 (offset + sizeof(xfs_agino_t) - 1));
1880 xfs_inobp_check(mp, ibp); 1867 xfs_inobp_check(mp, ibp);
1881 } 1868 }
1882 1869
1883 /* 1870 /*
1884 * Point the bucket head pointer at the inode being inserted. 1871 * Point the bucket head pointer at the inode being inserted.
1885 */ 1872 */
1886 ASSERT(agino != 0); 1873 ASSERT(agino != 0);
1887 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); 1874 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1888 offset = offsetof(xfs_agi_t, agi_unlinked) + 1875 offset = offsetof(xfs_agi_t, agi_unlinked) +
1889 (sizeof(xfs_agino_t) * bucket_index); 1876 (sizeof(xfs_agino_t) * bucket_index);
1890 xfs_trans_log_buf(tp, agibp, offset, 1877 xfs_trans_log_buf(tp, agibp, offset,
1891 (offset + sizeof(xfs_agino_t) - 1)); 1878 (offset + sizeof(xfs_agino_t) - 1));
1892 return 0; 1879 return 0;
1893 } 1880 }
1894 1881
1895 /* 1882 /*
1896 * Pull the on-disk inode from the AGI unlinked list. 1883 * Pull the on-disk inode from the AGI unlinked list.
1897 */ 1884 */
STATIC int
xfs_iunlink_remove(
	xfs_trans_t	*tp,	/* transaction all modifications are logged to */
	xfs_inode_t	*ip)	/* inode to remove from its AGI unlinked list */
{
	xfs_ino_t	next_ino;
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agnumber_t	agno;
	xfs_agino_t	agino;
	xfs_agino_t	next_agino;
	xfs_buf_t	*last_ibp;
	xfs_dinode_t	*last_dip = NULL;
	short		bucket_index;
	int		offset, last_offset = 0;
	int		error;

	mp = tp->t_mountp;
	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_read_agi(mp, tp, agno, &agibp);
	if (error)
		return error;

	agi = XFS_BUF_TO_AGI(agibp);

	/*
	 * Get the index into the agi hash table for the
	 * list this inode will go on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	/* The inode must currently be on a non-empty unlinked list. */
	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO);
	ASSERT(agi->agi_unlinked[bucket_index]);

	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
		/*
		 * We're at the head of the list.  Get the inode's
		 * on-disk buffer to see if there is anyone after us
		 * on the list.  Only modify our next pointer if it
		 * is not already NULLAGINO.  This saves us the overhead
		 * of dealing with the buffer when there is no need to
		 * change it.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
		if (error) {
			cmn_err(CE_WARN,
				"xfs_iunlink_remove: xfs_itobp() returned an error %d on %s.  Returning error.",
				error, mp->m_fsname);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		if (next_agino != NULLAGINO) {
			/*
			 * Terminate our on-disk forward link and log only
			 * the di_next_unlinked bytes of the inode buffer.
			 * im_boffset locates this inode within its cluster
			 * buffer.
			 */
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			/* Already the list terminator; nothing to log. */
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the bucket head pointer at the next inode.
		 */
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
		offset = offsetof(xfs_agi_t, agi_unlinked) +
			(sizeof(xfs_agino_t) * bucket_index);
		xfs_trans_log_buf(tp, agibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
	} else {
		/*
		 * We need to search the list for the inode being freed.
		 */
		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
		last_ibp = NULL;
		while (next_agino != agino) {
			/*
			 * If the last inode wasn't the one pointing to
			 * us, then release its buffer since we're not
			 * going to do anything with it.
			 */
			if (last_ibp != NULL) {
				xfs_trans_brelse(tp, last_ibp);
			}
			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
			error = xfs_inotobp(mp, tp, next_ino, &last_dip,
					    &last_ibp, &last_offset, 0);
			if (error) {
				cmn_err(CE_WARN,
			"xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s.  Returning error.",
					error, mp->m_fsname);
				return error;
			}
			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
			/*
			 * The target inode is known to be on this list, so
			 * we must reach it before running off the end.
			 */
			ASSERT(next_agino != NULLAGINO);
			ASSERT(next_agino != 0);
		}
		/*
		 * Now last_ibp points to the buffer previous to us on
		 * the unlinked list.  Pull us from the list.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
		if (error) {
			cmn_err(CE_WARN,
				"xfs_iunlink_remove: xfs_itobp() returned an error %d on %s.  Returning error.",
				error, mp->m_fsname);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		if (next_agino != NULLAGINO) {
			/*
			 * Clear our forward pointer and log just that
			 * field, as in the head-of-list case above.
			 */
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the previous inode on the list to the next inode.
		 */
		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
		ASSERT(next_agino != 0);
		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, last_ibp);
		xfs_trans_log_buf(tp, last_ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, last_ibp);
	}
	return 0;
}
2046 2033
/*
 * Mark every in-core inode of the cluster(s) being freed as stale so
 * that their dirty state is never flushed back over the (about to be
 * invalidated) inode buffers.  free_ip is the inode that triggered the
 * free and is already locked by the caller; inum is the first inode
 * number of the chunk being freed.
 */
STATIC void
xfs_ifree_cluster(
	xfs_inode_t	*free_ip,
	xfs_trans_t	*tp,
	xfs_ino_t	inum)
{
	xfs_mount_t		*mp = free_ip->i_mount;
	int			blks_per_cluster;
	int			nbufs;
	int			ninodes;
	int			i, j, found, pre_flushed;
	xfs_daddr_t		blkno;
	xfs_buf_t		*bp;
	xfs_inode_t		*ip, **ip_found;
	xfs_inode_log_item_t	*iip;
	xfs_log_item_t		*lip;
	xfs_perag_t		*pag = xfs_get_perag(mp, inum);

	/*
	 * Work out how the inode chunk maps onto cluster buffers: either
	 * one block holds a whole cluster, or a cluster spans several
	 * blocks.
	 */
	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
		blks_per_cluster = 1;
		ninodes = mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp);
	} else {
		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
					mp->m_sb.sb_blocksize;
		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
	}

	/* Scratch array of in-core inodes we manage to lock, per buffer. */
	ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);

	for (j = 0; j < nbufs; j++, inum += ninodes) {
		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
					 XFS_INO_TO_AGBNO(mp, inum));


		/*
		 * Look for each inode in memory and attempt to lock it,
		 * we can be racing with flush and tail pushing here.
		 * any inode we get the locks on, add to an array of
		 * inode items to process later.
		 *
		 * The get the buffer lock, we could beat a flush
		 * or tail pushing thread to the lock here, in which
		 * case they will go looking for the inode buffer
		 * and fail, we need some other form of interlock
		 * here.
		 */
		found = 0;
		for (i = 0; i < ninodes; i++) {
			read_lock(&pag->pag_ici_lock);
			ip = radix_tree_lookup(&pag->pag_ici_root,
					XFS_INO_TO_AGINO(mp, (inum + i)));

			/* Inode not in memory or we found it already,
			 * nothing to do
			 */
			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
				read_unlock(&pag->pag_ici_lock);
				continue;
			}

			if (xfs_inode_clean(ip)) {
				read_unlock(&pag->pag_ici_lock);
				continue;
			}

			/* If we can get the locks then add it to the
			 * list, otherwise by the time we get the bp lock
			 * below it will already be attached to the
			 * inode buffer.
			 */

			/* This inode will already be locked - by us, lets
			 * keep it that way.
			 */

			if (ip == free_ip) {
				if (xfs_iflock_nowait(ip)) {
					xfs_iflags_set(ip, XFS_ISTALE);
					if (xfs_inode_clean(ip)) {
						xfs_ifunlock(ip);
					} else {
						ip_found[found++] = ip;
					}
				}
				read_unlock(&pag->pag_ici_lock);
				continue;
			}

			if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
				if (xfs_iflock_nowait(ip)) {
					xfs_iflags_set(ip, XFS_ISTALE);

					if (xfs_inode_clean(ip)) {
						xfs_ifunlock(ip);
						xfs_iunlock(ip, XFS_ILOCK_EXCL);
					} else {
						ip_found[found++] = ip;
					}
				} else {
					xfs_iunlock(ip, XFS_ILOCK_EXCL);
				}
			}
			read_unlock(&pag->pag_ici_lock);
		}

		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 
					mp->m_bsize * blks_per_cluster,
					XFS_BUF_LOCK);

		/*
		 * Walk the log items already attached to the buffer:
		 * any inode items found there were flushed before we got
		 * the buffer lock, so mark those inodes stale and retarget
		 * their completion callback too.
		 */
		pre_flushed = 0;
		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
		while (lip) {
			if (lip->li_type == XFS_LI_INODE) {
				iip = (xfs_inode_log_item_t *)lip;
				ASSERT(iip->ili_logged == 1);
				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
				xfs_trans_ail_copy_lsn(mp->m_ail,
							&iip->ili_flush_lsn,
							&iip->ili_item.li_lsn);
				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
				pre_flushed++;
			}
			lip = lip->li_bio_list;
		}

		/*
		 * Now handle the inodes we locked ourselves: mark their
		 * log items flushed-as-stale and attach the stale-done
		 * callback to the buffer.
		 */
		for (i = 0; i < found; i++) {
			ip = ip_found[i];
			iip = ip->i_itemp;

			if (!iip) {
				/* Never logged: just drop the flush lock. */
				ip->i_update_core = 0;
				xfs_ifunlock(ip);
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				continue;
			}

			iip->ili_last_fields = iip->ili_format.ilf_fields;
			iip->ili_format.ilf_fields = 0;
			iip->ili_logged = 1;
			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
						&iip->ili_item.li_lsn);

			xfs_buf_attach_iodone(bp,
				(void(*)(xfs_buf_t*,xfs_log_item_t*))
				xfs_istale_done, (xfs_log_item_t *)iip);
			if (ip != free_ip) {
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
			}
		}

		/* Invalidate the buffer - its contents are being freed. */
		if (found || pre_flushed)
			xfs_trans_stale_inode_buf(tp, bp);
		xfs_trans_binval(tp, bp);
	}

	kmem_free(ip_found);
	xfs_put_perag(mp, pag);
}
2207 2194
2208 /* 2195 /*
2209 * This is called to return an inode to the inode free list. 2196 * This is called to return an inode to the inode free list.
2210 * The inode should already be truncated to 0 length and have 2197 * The inode should already be truncated to 0 length and have
2211 * no pages associated with it. This routine also assumes that 2198 * no pages associated with it. This routine also assumes that
2212 * the inode is already a part of the transaction. 2199 * the inode is already a part of the transaction.
2213 * 2200 *
2214 * The on-disk copy of the inode will have been added to the list 2201 * The on-disk copy of the inode will have been added to the list
2215 * of unlinked inodes in the AGI. We need to remove the inode from 2202 * of unlinked inodes in the AGI. We need to remove the inode from
2216 * that list atomically with respect to freeing it here. 2203 * that list atomically with respect to freeing it here.
2217 */ 2204 */
int
xfs_ifree(
	xfs_trans_t	*tp,	/* transaction the free is performed in */
	xfs_inode_t	*ip,	/* inode to free; ILOCK_EXCL held by caller */
	xfs_bmap_free_t	*flist)	/* list to defer freeing of inode chunk blocks */
{
	int			error;
	int			delete;	/* set if whole inode chunk was freed */
	xfs_ino_t		first_ino;
	xfs_dinode_t    	*dip;
	xfs_buf_t       	*ibp;

	/* Caller guarantees a fully truncated, unlinked, in-transaction inode. */
	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(ip->i_transp == tp);
	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_nextents == 0);
	ASSERT(ip->i_d.di_anextents == 0);
	ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
	       ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
	ASSERT(ip->i_d.di_nblocks == 0);

	/*
	 * Pull the on-disk inode from the AGI unlinked list.
	 */
	error = xfs_iunlink_remove(tp, ip);
	if (error != 0) {
		return error;
	}

	/* Return the inode to the inode btree free space. */
	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
	if (error != 0) {
		return error;
	}
	ip->i_d.di_mode = 0;		/* mark incore inode as free */
	ip->i_d.di_flags = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
	ip->i_df.if_ext_max =
		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	/*
	 * Bump the generation count so no one will be confused
	 * by reincarnations of this inode.
	 */
	ip->i_d.di_gen++;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
	if (error)
		return error;

        /*
	* Clear the on-disk di_mode. This is to prevent xfs_bulkstat
	* from picking up this inode when it is reclaimed (its incore state
	* initialzed but not flushed to disk yet). The in-core di_mode is
	* already cleared  and a corresponding transaction logged.
	* The hack here just synchronizes the in-core to on-disk
	* di_mode value in advance before the actual inode sync to disk.
	* This is OK because the inode is already unlinked and would never
	* change its di_mode again for this inode generation.
	* This is a temporary hack that would require a proper fix
	* in the future.
	*/
	dip->di_mode = 0;

	if (delete) {
		/* Whole chunk freed: stale out all in-core siblings. */
		xfs_ifree_cluster(ip, tp, first_ino);
	}

	return 0;
}
2291 2278
2292 /* 2279 /*
2293 * Reallocate the space for if_broot based on the number of records 2280 * Reallocate the space for if_broot based on the number of records
2294 * being added or deleted as indicated in rec_diff. Move the records 2281 * being added or deleted as indicated in rec_diff. Move the records
2295 * and pointers in if_broot to fit the new size. When shrinking this 2282 * and pointers in if_broot to fit the new size. When shrinking this
2296 * will eliminate holes between the records and pointers created by 2283 * will eliminate holes between the records and pointers created by
2297 * the caller. When growing this will create holes to be filled in 2284 * the caller. When growing this will create holes to be filled in
2298 * by the caller. 2285 * by the caller.
2299 * 2286 *
2300 * The caller must not request to add more records than would fit in 2287 * The caller must not request to add more records than would fit in
2301 * the on-disk inode root. If the if_broot is currently NULL, then 2288 * the on-disk inode root. If the if_broot is currently NULL, then
2302 * if we adding records one will be allocated. The caller must also 2289 * if we adding records one will be allocated. The caller must also
2303 * not request that the number of records go below zero, although 2290 * not request that the number of records go below zero, although
2304 * it can go to zero. 2291 * it can go to zero.
2305 * 2292 *
2306 * ip -- the inode whose if_broot area is changing 2293 * ip -- the inode whose if_broot area is changing
2307 * ext_diff -- the change in the number of records, positive or negative, 2294 * ext_diff -- the change in the number of records, positive or negative,
2308 * requested for the if_broot array. 2295 * requested for the if_broot array.
2309 */ 2296 */
void
xfs_iroot_realloc(
	xfs_inode_t		*ip,
	int			rec_diff,
	int			whichfork)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			cur_max;
	xfs_ifork_t		*ifp;
	struct xfs_btree_block	*new_broot;
	int			new_max;
	size_t			new_size;
	char			*np;	/* new location of pointer array */
	char			*op;	/* old location of pointer array */

	/*
	 * Handle the degenerate case quietly.
	 */
	if (rec_diff == 0) {
		return;
	}

	ifp = XFS_IFORK_PTR(ip, whichfork);
	if (rec_diff > 0) {
		/*
		 * If there wasn't any memory allocated before, just
		 * allocate it now and get out.
		 */
		if (ifp->if_broot_bytes == 0) {
			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
			ifp->if_broot_bytes = (int)new_size;
			return;
		}

		/*
		 * If there is already an existing if_broot, then we need
		 * to realloc() it and shift the pointers to their new
		 * location.  The records don't change location because
		 * they are kept butted up against the btree block header.
		 */
		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
		new_max = cur_max + rec_diff;
		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
				KM_SLEEP);
		/*
		 * Compute the old pointer-array address from the old byte
		 * count (if_broot_bytes not yet updated) and the new one
		 * from new_size, then move the pointers after updating
		 * if_broot_bytes.  memmove is required: the regions can
		 * overlap within the reallocated block.
		 */
		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     ifp->if_broot_bytes);
		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     (int)new_size);
		ifp->if_broot_bytes = (int)new_size;
		ASSERT(ifp->if_broot_bytes <=
			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
		return;
	}

	/*
	 * rec_diff is less than 0.  In this case, we are shrinking the
	 * if_broot buffer.  It must already exist.  If we go to zero
	 * records, just get rid of the root and clear the status bit.
	 */
	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
	new_max = cur_max + rec_diff;
	ASSERT(new_max >= 0);
	if (new_max > 0)
		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
	else
		new_size = 0;
	if (new_size > 0) {
		new_broot = kmem_alloc(new_size, KM_SLEEP);
		/*
		 * First copy over the btree block header.
		 */
		memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
	} else {
		new_broot = NULL;
		ifp->if_flags &= ~XFS_IFBROOT;
	}

	/*
	 * Only copy the records and pointers if there are any.
	 */
	if (new_max > 0) {
		/*
		 * First copy the records.
		 */
		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));

		/*
		 * Then copy the pointers.
		 */
		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     ifp->if_broot_bytes);
		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
						     (int)new_size);
		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
	}
	/* Swap in the (possibly NULL) shrunken root. */
	kmem_free(ifp->if_broot);
	ifp->if_broot = new_broot;
	ifp->if_broot_bytes = (int)new_size;
	ASSERT(ifp->if_broot_bytes <=
		XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
	return;
}
2419 2406
2420 2407
2421 /* 2408 /*
2422 * This is called when the amount of space needed for if_data 2409 * This is called when the amount of space needed for if_data
2423 * is increased or decreased. The change in size is indicated by 2410 * is increased or decreased. The change in size is indicated by
2424 * the number of bytes that need to be added or deleted in the 2411 * the number of bytes that need to be added or deleted in the
2425 * byte_diff parameter. 2412 * byte_diff parameter.
2426 * 2413 *
2427 * If the amount of space needed has decreased below the size of the 2414 * If the amount of space needed has decreased below the size of the
2428 * inline buffer, then switch to using the inline buffer. Otherwise, 2415 * inline buffer, then switch to using the inline buffer. Otherwise,
2429 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2416 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2430 * to what is needed. 2417 * to what is needed.
2431 * 2418 *
2432 * ip -- the inode whose if_data area is changing 2419 * ip -- the inode whose if_data area is changing
2433 * byte_diff -- the change in the number of bytes, positive or negative, 2420 * byte_diff -- the change in the number of bytes, positive or negative,
2434 * requested for the if_data array. 2421 * requested for the if_data array.
2435 */ 2422 */
2436 void 2423 void
2437 xfs_idata_realloc( 2424 xfs_idata_realloc(
2438 xfs_inode_t *ip, 2425 xfs_inode_t *ip,
2439 int byte_diff, 2426 int byte_diff,
2440 int whichfork) 2427 int whichfork)
2441 { 2428 {
2442 xfs_ifork_t *ifp; 2429 xfs_ifork_t *ifp;
2443 int new_size; 2430 int new_size;
2444 int real_size; 2431 int real_size;
2445 2432
2446 if (byte_diff == 0) { 2433 if (byte_diff == 0) {
2447 return; 2434 return;
2448 } 2435 }
2449 2436
2450 ifp = XFS_IFORK_PTR(ip, whichfork); 2437 ifp = XFS_IFORK_PTR(ip, whichfork);
2451 new_size = (int)ifp->if_bytes + byte_diff; 2438 new_size = (int)ifp->if_bytes + byte_diff;
2452 ASSERT(new_size >= 0); 2439 ASSERT(new_size >= 0);
2453 2440
2454 if (new_size == 0) { 2441 if (new_size == 0) {
2455 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2442 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2456 kmem_free(ifp->if_u1.if_data); 2443 kmem_free(ifp->if_u1.if_data);
2457 } 2444 }
2458 ifp->if_u1.if_data = NULL; 2445 ifp->if_u1.if_data = NULL;
2459 real_size = 0; 2446 real_size = 0;
2460 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 2447 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2461 /* 2448 /*
2462 * If the valid extents/data can fit in if_inline_ext/data, 2449 * If the valid extents/data can fit in if_inline_ext/data,
2463 * copy them from the malloc'd vector and free it. 2450 * copy them from the malloc'd vector and free it.
2464 */ 2451 */
2465 if (ifp->if_u1.if_data == NULL) { 2452 if (ifp->if_u1.if_data == NULL) {
2466 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2453 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2467 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2454 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2468 ASSERT(ifp->if_real_bytes != 0); 2455 ASSERT(ifp->if_real_bytes != 0);
2469 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, 2456 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2470 new_size); 2457 new_size);
2471 kmem_free(ifp->if_u1.if_data); 2458 kmem_free(ifp->if_u1.if_data);
2472 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2459 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2473 } 2460 }
2474 real_size = 0; 2461 real_size = 0;
2475 } else { 2462 } else {
2476 /* 2463 /*
2477 * Stuck with malloc/realloc. 2464 * Stuck with malloc/realloc.
2478 * For inline data, the underlying buffer must be 2465 * For inline data, the underlying buffer must be
2479 * a multiple of 4 bytes in size so that it can be 2466 * a multiple of 4 bytes in size so that it can be
2480 * logged and stay on word boundaries. We enforce 2467 * logged and stay on word boundaries. We enforce
2481 * that here. 2468 * that here.
2482 */ 2469 */
2483 real_size = roundup(new_size, 4); 2470 real_size = roundup(new_size, 4);
2484 if (ifp->if_u1.if_data == NULL) { 2471 if (ifp->if_u1.if_data == NULL) {
2485 ASSERT(ifp->if_real_bytes == 0); 2472 ASSERT(ifp->if_real_bytes == 0);
2486 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2473 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2487 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2474 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2488 /* 2475 /*
2489 * Only do the realloc if the underlying size 2476 * Only do the realloc if the underlying size
2490 * is really changing. 2477 * is really changing.
2491 */ 2478 */
2492 if (ifp->if_real_bytes != real_size) { 2479 if (ifp->if_real_bytes != real_size) {
2493 ifp->if_u1.if_data = 2480 ifp->if_u1.if_data =
2494 kmem_realloc(ifp->if_u1.if_data, 2481 kmem_realloc(ifp->if_u1.if_data,
2495 real_size, 2482 real_size,
2496 ifp->if_real_bytes, 2483 ifp->if_real_bytes,
2497 KM_SLEEP); 2484 KM_SLEEP);
2498 } 2485 }
2499 } else { 2486 } else {
2500 ASSERT(ifp->if_real_bytes == 0); 2487 ASSERT(ifp->if_real_bytes == 0);
2501 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2488 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2502 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2489 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2503 ifp->if_bytes); 2490 ifp->if_bytes);
2504 } 2491 }
2505 } 2492 }
2506 ifp->if_real_bytes = real_size; 2493 ifp->if_real_bytes = real_size;
2507 ifp->if_bytes = new_size; 2494 ifp->if_bytes = new_size;
2508 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2495 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2509 } 2496 }
2510 2497
2511 void 2498 void
2512 xfs_idestroy_fork( 2499 xfs_idestroy_fork(
2513 xfs_inode_t *ip, 2500 xfs_inode_t *ip,
2514 int whichfork) 2501 int whichfork)
2515 { 2502 {
2516 xfs_ifork_t *ifp; 2503 xfs_ifork_t *ifp;
2517 2504
2518 ifp = XFS_IFORK_PTR(ip, whichfork); 2505 ifp = XFS_IFORK_PTR(ip, whichfork);
2519 if (ifp->if_broot != NULL) { 2506 if (ifp->if_broot != NULL) {
2520 kmem_free(ifp->if_broot); 2507 kmem_free(ifp->if_broot);
2521 ifp->if_broot = NULL; 2508 ifp->if_broot = NULL;
2522 } 2509 }
2523 2510
2524 /* 2511 /*
2525 * If the format is local, then we can't have an extents 2512 * If the format is local, then we can't have an extents
2526 * array so just look for an inline data array. If we're 2513 * array so just look for an inline data array. If we're
2527 * not local then we may or may not have an extents list, 2514 * not local then we may or may not have an extents list,
2528 * so check and free it up if we do. 2515 * so check and free it up if we do.
2529 */ 2516 */
2530 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 2517 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2531 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && 2518 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2532 (ifp->if_u1.if_data != NULL)) { 2519 (ifp->if_u1.if_data != NULL)) {
2533 ASSERT(ifp->if_real_bytes != 0); 2520 ASSERT(ifp->if_real_bytes != 0);
2534 kmem_free(ifp->if_u1.if_data); 2521 kmem_free(ifp->if_u1.if_data);
2535 ifp->if_u1.if_data = NULL; 2522 ifp->if_u1.if_data = NULL;
2536 ifp->if_real_bytes = 0; 2523 ifp->if_real_bytes = 0;
2537 } 2524 }
2538 } else if ((ifp->if_flags & XFS_IFEXTENTS) && 2525 } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2539 ((ifp->if_flags & XFS_IFEXTIREC) || 2526 ((ifp->if_flags & XFS_IFEXTIREC) ||
2540 ((ifp->if_u1.if_extents != NULL) && 2527 ((ifp->if_u1.if_extents != NULL) &&
2541 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { 2528 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2542 ASSERT(ifp->if_real_bytes != 0); 2529 ASSERT(ifp->if_real_bytes != 0);
2543 xfs_iext_destroy(ifp); 2530 xfs_iext_destroy(ifp);
2544 } 2531 }
2545 ASSERT(ifp->if_u1.if_extents == NULL || 2532 ASSERT(ifp->if_u1.if_extents == NULL ||
2546 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); 2533 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2547 ASSERT(ifp->if_real_bytes == 0); 2534 ASSERT(ifp->if_real_bytes == 0);
2548 if (whichfork == XFS_ATTR_FORK) { 2535 if (whichfork == XFS_ATTR_FORK) {
2549 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 2536 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2550 ip->i_afp = NULL; 2537 ip->i_afp = NULL;
2551 } 2538 }
2552 } 2539 }
2553 2540
2554 /* 2541 /*
2555 * This is called free all the memory associated with an inode. 2542 * This is called free all the memory associated with an inode.
2556 * It must free the inode itself and any buffers allocated for 2543 * It must free the inode itself and any buffers allocated for
2557 * if_extents/if_data and if_broot. It must also free the lock 2544 * if_extents/if_data and if_broot. It must also free the lock
2558 * associated with the inode. 2545 * associated with the inode.
2559 * 2546 *
2560 * Note: because we don't initialise everything on reallocation out 2547 * Note: because we don't initialise everything on reallocation out
2561 * of the zone, we must ensure we nullify everything correctly before 2548 * of the zone, we must ensure we nullify everything correctly before
2562 * freeing the structure. 2549 * freeing the structure.
2563 */ 2550 */
2564 void 2551 void
2565 xfs_idestroy( 2552 xfs_idestroy(
2566 xfs_inode_t *ip) 2553 xfs_inode_t *ip)
2567 { 2554 {
2568 switch (ip->i_d.di_mode & S_IFMT) { 2555 switch (ip->i_d.di_mode & S_IFMT) {
2569 case S_IFREG: 2556 case S_IFREG:
2570 case S_IFDIR: 2557 case S_IFDIR:
2571 case S_IFLNK: 2558 case S_IFLNK:
2572 xfs_idestroy_fork(ip, XFS_DATA_FORK); 2559 xfs_idestroy_fork(ip, XFS_DATA_FORK);
2573 break; 2560 break;
2574 } 2561 }
2575 if (ip->i_afp) 2562 if (ip->i_afp)
2576 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 2563 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2577 2564
2578 #ifdef XFS_INODE_TRACE 2565 #ifdef XFS_INODE_TRACE
2579 ktrace_free(ip->i_trace); 2566 ktrace_free(ip->i_trace);
2580 #endif 2567 #endif
2581 #ifdef XFS_BMAP_TRACE 2568 #ifdef XFS_BMAP_TRACE
2582 ktrace_free(ip->i_xtrace); 2569 ktrace_free(ip->i_xtrace);
2583 #endif 2570 #endif
2584 #ifdef XFS_BTREE_TRACE 2571 #ifdef XFS_BTREE_TRACE
2585 ktrace_free(ip->i_btrace); 2572 ktrace_free(ip->i_btrace);
2586 #endif 2573 #endif
2587 #ifdef XFS_RW_TRACE 2574 #ifdef XFS_RW_TRACE
2588 ktrace_free(ip->i_rwtrace); 2575 ktrace_free(ip->i_rwtrace);
2589 #endif 2576 #endif
2590 #ifdef XFS_ILOCK_TRACE 2577 #ifdef XFS_ILOCK_TRACE
2591 ktrace_free(ip->i_lock_trace); 2578 ktrace_free(ip->i_lock_trace);
2592 #endif 2579 #endif
2593 #ifdef XFS_DIR2_TRACE 2580 #ifdef XFS_DIR2_TRACE
2594 ktrace_free(ip->i_dir_trace); 2581 ktrace_free(ip->i_dir_trace);
2595 #endif 2582 #endif
2596 if (ip->i_itemp) { 2583 if (ip->i_itemp) {
2597 /* 2584 /*
2598 * Only if we are shutting down the fs will we see an 2585 * Only if we are shutting down the fs will we see an
2599 * inode still in the AIL. If it is there, we should remove 2586 * inode still in the AIL. If it is there, we should remove
2600 * it to prevent a use-after-free from occurring. 2587 * it to prevent a use-after-free from occurring.
2601 */ 2588 */
2602 xfs_log_item_t *lip = &ip->i_itemp->ili_item; 2589 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
2603 struct xfs_ail *ailp = lip->li_ailp; 2590 struct xfs_ail *ailp = lip->li_ailp;
2604 2591
2605 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || 2592 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
2606 XFS_FORCED_SHUTDOWN(ip->i_mount)); 2593 XFS_FORCED_SHUTDOWN(ip->i_mount));
2607 if (lip->li_flags & XFS_LI_IN_AIL) { 2594 if (lip->li_flags & XFS_LI_IN_AIL) {
2608 spin_lock(&ailp->xa_lock); 2595 spin_lock(&ailp->xa_lock);
2609 if (lip->li_flags & XFS_LI_IN_AIL) 2596 if (lip->li_flags & XFS_LI_IN_AIL)
2610 xfs_trans_ail_delete(ailp, lip); 2597 xfs_trans_ail_delete(ailp, lip);
2611 else 2598 else
2612 spin_unlock(&ailp->xa_lock); 2599 spin_unlock(&ailp->xa_lock);
2613 } 2600 }
2614 xfs_inode_item_destroy(ip); 2601 xfs_inode_item_destroy(ip);
2615 ip->i_itemp = NULL; 2602 ip->i_itemp = NULL;
2616 } 2603 }
2617 /* asserts to verify all state is correct here */ 2604 /* asserts to verify all state is correct here */
2618 ASSERT(atomic_read(&ip->i_iocount) == 0); 2605 ASSERT(atomic_read(&ip->i_iocount) == 0);
2619 ASSERT(atomic_read(&ip->i_pincount) == 0); 2606 ASSERT(atomic_read(&ip->i_pincount) == 0);
2620 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 2607 ASSERT(!spin_is_locked(&ip->i_flags_lock));
2621 ASSERT(completion_done(&ip->i_flush)); 2608 ASSERT(completion_done(&ip->i_flush));
2622 kmem_zone_free(xfs_inode_zone, ip); 2609 kmem_zone_free(xfs_inode_zone, ip);
2623 } 2610 }
2624 2611
2625 2612
2626 /* 2613 /*
2627 * Increment the pin count of the given buffer. 2614 * Increment the pin count of the given buffer.
2628 * This value is protected by ipinlock spinlock in the mount structure. 2615 * This value is protected by ipinlock spinlock in the mount structure.
2629 */ 2616 */
2630 void 2617 void
2631 xfs_ipin( 2618 xfs_ipin(
2632 xfs_inode_t *ip) 2619 xfs_inode_t *ip)
2633 { 2620 {
2634 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2621 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2635 2622
2636 atomic_inc(&ip->i_pincount); 2623 atomic_inc(&ip->i_pincount);
2637 } 2624 }
2638 2625
2639 /* 2626 /*
2640 * Decrement the pin count of the given inode, and wake up 2627 * Decrement the pin count of the given inode, and wake up
2641 * anyone in xfs_iwait_unpin() if the count goes to 0. The 2628 * anyone in xfs_iwait_unpin() if the count goes to 0. The
2642 * inode must have been previously pinned with a call to xfs_ipin(). 2629 * inode must have been previously pinned with a call to xfs_ipin().
2643 */ 2630 */
2644 void 2631 void
2645 xfs_iunpin( 2632 xfs_iunpin(
2646 xfs_inode_t *ip) 2633 xfs_inode_t *ip)
2647 { 2634 {
2648 ASSERT(atomic_read(&ip->i_pincount) > 0); 2635 ASSERT(atomic_read(&ip->i_pincount) > 0);
2649 2636
2650 if (atomic_dec_and_test(&ip->i_pincount)) 2637 if (atomic_dec_and_test(&ip->i_pincount))
2651 wake_up(&ip->i_ipin_wait); 2638 wake_up(&ip->i_ipin_wait);
2652 } 2639 }
2653 2640
2654 /* 2641 /*
2655 * This is called to unpin an inode. It can be directed to wait or to return 2642 * This is called to unpin an inode. It can be directed to wait or to return
2656 * immediately without waiting for the inode to be unpinned. The caller must 2643 * immediately without waiting for the inode to be unpinned. The caller must
2657 * have the inode locked in at least shared mode so that the buffer cannot be 2644 * have the inode locked in at least shared mode so that the buffer cannot be
2658 * subsequently pinned once someone is waiting for it to be unpinned. 2645 * subsequently pinned once someone is waiting for it to be unpinned.
2659 */ 2646 */
2660 STATIC void 2647 STATIC void
2661 __xfs_iunpin_wait( 2648 __xfs_iunpin_wait(
2662 xfs_inode_t *ip, 2649 xfs_inode_t *ip,
2663 int wait) 2650 int wait)
2664 { 2651 {
2665 xfs_inode_log_item_t *iip = ip->i_itemp; 2652 xfs_inode_log_item_t *iip = ip->i_itemp;
2666 2653
2667 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2654 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2668 if (atomic_read(&ip->i_pincount) == 0) 2655 if (atomic_read(&ip->i_pincount) == 0)
2669 return; 2656 return;
2670 2657
2671 /* Give the log a push to start the unpinning I/O */ 2658 /* Give the log a push to start the unpinning I/O */
2672 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? 2659 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
2673 iip->ili_last_lsn : 0, XFS_LOG_FORCE); 2660 iip->ili_last_lsn : 0, XFS_LOG_FORCE);
2674 if (wait) 2661 if (wait)
2675 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); 2662 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2676 } 2663 }
2677 2664
2678 static inline void 2665 static inline void
2679 xfs_iunpin_wait( 2666 xfs_iunpin_wait(
2680 xfs_inode_t *ip) 2667 xfs_inode_t *ip)
2681 { 2668 {
2682 __xfs_iunpin_wait(ip, 1); 2669 __xfs_iunpin_wait(ip, 1);
2683 } 2670 }
2684 2671
2685 static inline void 2672 static inline void
2686 xfs_iunpin_nowait( 2673 xfs_iunpin_nowait(
2687 xfs_inode_t *ip) 2674 xfs_inode_t *ip)
2688 { 2675 {
2689 __xfs_iunpin_wait(ip, 0); 2676 __xfs_iunpin_wait(ip, 0);
2690 } 2677 }
2691 2678
2692 2679
2693 /* 2680 /*
2694 * xfs_iextents_copy() 2681 * xfs_iextents_copy()
2695 * 2682 *
2696 * This is called to copy the REAL extents (as opposed to the delayed 2683 * This is called to copy the REAL extents (as opposed to the delayed
2697 * allocation extents) from the inode into the given buffer. It 2684 * allocation extents) from the inode into the given buffer. It
2698 * returns the number of bytes copied into the buffer. 2685 * returns the number of bytes copied into the buffer.
2699 * 2686 *
2700 * If there are no delayed allocation extents, then we can just 2687 * If there are no delayed allocation extents, then we can just
2701 * memcpy() the extents into the buffer. Otherwise, we need to 2688 * memcpy() the extents into the buffer. Otherwise, we need to
2702 * examine each extent in turn and skip those which are delayed. 2689 * examine each extent in turn and skip those which are delayed.
2703 */ 2690 */
2704 int 2691 int
2705 xfs_iextents_copy( 2692 xfs_iextents_copy(
2706 xfs_inode_t *ip, 2693 xfs_inode_t *ip,
2707 xfs_bmbt_rec_t *dp, 2694 xfs_bmbt_rec_t *dp,
2708 int whichfork) 2695 int whichfork)
2709 { 2696 {
2710 int copied; 2697 int copied;
2711 int i; 2698 int i;
2712 xfs_ifork_t *ifp; 2699 xfs_ifork_t *ifp;
2713 int nrecs; 2700 int nrecs;
2714 xfs_fsblock_t start_block; 2701 xfs_fsblock_t start_block;
2715 2702
2716 ifp = XFS_IFORK_PTR(ip, whichfork); 2703 ifp = XFS_IFORK_PTR(ip, whichfork);
2717 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2704 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2718 ASSERT(ifp->if_bytes > 0); 2705 ASSERT(ifp->if_bytes > 0);
2719 2706
2720 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2707 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2721 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); 2708 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2722 ASSERT(nrecs > 0); 2709 ASSERT(nrecs > 0);
2723 2710
2724 /* 2711 /*
2725 * There are some delayed allocation extents in the 2712 * There are some delayed allocation extents in the
2726 * inode, so copy the extents one at a time and skip 2713 * inode, so copy the extents one at a time and skip
2727 * the delayed ones. There must be at least one 2714 * the delayed ones. There must be at least one
2728 * non-delayed extent. 2715 * non-delayed extent.
2729 */ 2716 */
2730 copied = 0; 2717 copied = 0;
2731 for (i = 0; i < nrecs; i++) { 2718 for (i = 0; i < nrecs; i++) {
2732 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 2719 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2733 start_block = xfs_bmbt_get_startblock(ep); 2720 start_block = xfs_bmbt_get_startblock(ep);
2734 if (ISNULLSTARTBLOCK(start_block)) { 2721 if (ISNULLSTARTBLOCK(start_block)) {
2735 /* 2722 /*
2736 * It's a delayed allocation extent, so skip it. 2723 * It's a delayed allocation extent, so skip it.
2737 */ 2724 */
2738 continue; 2725 continue;
2739 } 2726 }
2740 2727
2741 /* Translate to on disk format */ 2728 /* Translate to on disk format */
2742 put_unaligned(cpu_to_be64(ep->l0), &dp->l0); 2729 put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
2743 put_unaligned(cpu_to_be64(ep->l1), &dp->l1); 2730 put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
2744 dp++; 2731 dp++;
2745 copied++; 2732 copied++;
2746 } 2733 }
2747 ASSERT(copied != 0); 2734 ASSERT(copied != 0);
2748 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); 2735 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2749 2736
2750 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2737 return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2751 } 2738 }
2752 2739
2753 /* 2740 /*
2754 * Each of the following cases stores data into the same region 2741 * Each of the following cases stores data into the same region
2755 * of the on-disk inode, so only one of them can be valid at 2742 * of the on-disk inode, so only one of them can be valid at
2756 * any given time. While it is possible to have conflicting formats 2743 * any given time. While it is possible to have conflicting formats
2757 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2744 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2758 * in EXTENTS format, this can only happen when the fork has 2745 * in EXTENTS format, this can only happen when the fork has
2759 * changed formats after being modified but before being flushed. 2746 * changed formats after being modified but before being flushed.
2760 * In these cases, the format always takes precedence, because the 2747 * In these cases, the format always takes precedence, because the
2761 * format indicates the current state of the fork. 2748 * format indicates the current state of the fork.
2762 */ 2749 */
2763 /*ARGSUSED*/ 2750 /*ARGSUSED*/
2764 STATIC void 2751 STATIC void
2765 xfs_iflush_fork( 2752 xfs_iflush_fork(
2766 xfs_inode_t *ip, 2753 xfs_inode_t *ip,
2767 xfs_dinode_t *dip, 2754 xfs_dinode_t *dip,
2768 xfs_inode_log_item_t *iip, 2755 xfs_inode_log_item_t *iip,
2769 int whichfork, 2756 int whichfork,
2770 xfs_buf_t *bp) 2757 xfs_buf_t *bp)
2771 { 2758 {
2772 char *cp; 2759 char *cp;
2773 xfs_ifork_t *ifp; 2760 xfs_ifork_t *ifp;
2774 xfs_mount_t *mp; 2761 xfs_mount_t *mp;
2775 #ifdef XFS_TRANS_DEBUG 2762 #ifdef XFS_TRANS_DEBUG
2776 int first; 2763 int first;
2777 #endif 2764 #endif
2778 static const short brootflag[2] = 2765 static const short brootflag[2] =
2779 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2766 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2780 static const short dataflag[2] = 2767 static const short dataflag[2] =
2781 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2768 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2782 static const short extflag[2] = 2769 static const short extflag[2] =
2783 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2770 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2784 2771
2785 if (!iip) 2772 if (!iip)
2786 return; 2773 return;
2787 ifp = XFS_IFORK_PTR(ip, whichfork); 2774 ifp = XFS_IFORK_PTR(ip, whichfork);
2788 /* 2775 /*
2789 * This can happen if we gave up in iformat in an error path, 2776 * This can happen if we gave up in iformat in an error path,
2790 * for the attribute fork. 2777 * for the attribute fork.
2791 */ 2778 */
2792 if (!ifp) { 2779 if (!ifp) {
2793 ASSERT(whichfork == XFS_ATTR_FORK); 2780 ASSERT(whichfork == XFS_ATTR_FORK);
2794 return; 2781 return;
2795 } 2782 }
2796 cp = XFS_DFORK_PTR(dip, whichfork); 2783 cp = XFS_DFORK_PTR(dip, whichfork);
2797 mp = ip->i_mount; 2784 mp = ip->i_mount;
2798 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2785 switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2799 case XFS_DINODE_FMT_LOCAL: 2786 case XFS_DINODE_FMT_LOCAL:
2800 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && 2787 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
2801 (ifp->if_bytes > 0)) { 2788 (ifp->if_bytes > 0)) {
2802 ASSERT(ifp->if_u1.if_data != NULL); 2789 ASSERT(ifp->if_u1.if_data != NULL);
2803 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2790 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2804 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2791 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2805 } 2792 }
2806 break; 2793 break;
2807 2794
2808 case XFS_DINODE_FMT_EXTENTS: 2795 case XFS_DINODE_FMT_EXTENTS:
2809 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2796 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2810 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2797 !(iip->ili_format.ilf_fields & extflag[whichfork]));
2811 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || 2798 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
2812 (ifp->if_bytes == 0)); 2799 (ifp->if_bytes == 0));
2813 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || 2800 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
2814 (ifp->if_bytes > 0)); 2801 (ifp->if_bytes > 0));
2815 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2802 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2816 (ifp->if_bytes > 0)) { 2803 (ifp->if_bytes > 0)) {
2817 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2804 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2818 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2805 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2819 whichfork); 2806 whichfork);
2820 } 2807 }
2821 break; 2808 break;
2822 2809
2823 case XFS_DINODE_FMT_BTREE: 2810 case XFS_DINODE_FMT_BTREE:
2824 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && 2811 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
2825 (ifp->if_broot_bytes > 0)) { 2812 (ifp->if_broot_bytes > 0)) {
2826 ASSERT(ifp->if_broot != NULL); 2813 ASSERT(ifp->if_broot != NULL);
2827 ASSERT(ifp->if_broot_bytes <= 2814 ASSERT(ifp->if_broot_bytes <=
2828 (XFS_IFORK_SIZE(ip, whichfork) + 2815 (XFS_IFORK_SIZE(ip, whichfork) +
2829 XFS_BROOT_SIZE_ADJ)); 2816 XFS_BROOT_SIZE_ADJ));
2830 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2817 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2831 (xfs_bmdr_block_t *)cp, 2818 (xfs_bmdr_block_t *)cp,
2832 XFS_DFORK_SIZE(dip, mp, whichfork)); 2819 XFS_DFORK_SIZE(dip, mp, whichfork));
2833 } 2820 }
2834 break; 2821 break;
2835 2822
2836 case XFS_DINODE_FMT_DEV: 2823 case XFS_DINODE_FMT_DEV:
2837 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2824 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
2838 ASSERT(whichfork == XFS_DATA_FORK); 2825 ASSERT(whichfork == XFS_DATA_FORK);
2839 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); 2826 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2840 } 2827 }
2841 break; 2828 break;
2842 2829
2843 case XFS_DINODE_FMT_UUID: 2830 case XFS_DINODE_FMT_UUID:
2844 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2831 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
2845 ASSERT(whichfork == XFS_DATA_FORK); 2832 ASSERT(whichfork == XFS_DATA_FORK);
2846 memcpy(XFS_DFORK_DPTR(dip), 2833 memcpy(XFS_DFORK_DPTR(dip),
2847 &ip->i_df.if_u2.if_uuid, 2834 &ip->i_df.if_u2.if_uuid,
2848 sizeof(uuid_t)); 2835 sizeof(uuid_t));
2849 } 2836 }
2850 break; 2837 break;
2851 2838
2852 default: 2839 default:
2853 ASSERT(0); 2840 ASSERT(0);
2854 break; 2841 break;
2855 } 2842 }
2856 } 2843 }
2857 2844
2858 STATIC int 2845 STATIC int
2859 xfs_iflush_cluster( 2846 xfs_iflush_cluster(
2860 xfs_inode_t *ip, 2847 xfs_inode_t *ip,
2861 xfs_buf_t *bp) 2848 xfs_buf_t *bp)
2862 { 2849 {
2863 xfs_mount_t *mp = ip->i_mount; 2850 xfs_mount_t *mp = ip->i_mount;
2864 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 2851 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
2865 unsigned long first_index, mask; 2852 unsigned long first_index, mask;
2866 unsigned long inodes_per_cluster; 2853 unsigned long inodes_per_cluster;
2867 int ilist_size; 2854 int ilist_size;
2868 xfs_inode_t **ilist; 2855 xfs_inode_t **ilist;
2869 xfs_inode_t *iq; 2856 xfs_inode_t *iq;
2870 int nr_found; 2857 int nr_found;
2871 int clcount = 0; 2858 int clcount = 0;
2872 int bufwasdelwri; 2859 int bufwasdelwri;
2873 int i; 2860 int i;
2874 2861
2875 ASSERT(pag->pagi_inodeok); 2862 ASSERT(pag->pagi_inodeok);
2876 ASSERT(pag->pag_ici_init); 2863 ASSERT(pag->pag_ici_init);
2877 2864
2878 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2865 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2879 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2866 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2880 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2867 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2881 if (!ilist) 2868 if (!ilist)
2882 return 0; 2869 return 0;
2883 2870
2884 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2871 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2885 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2872 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2886 read_lock(&pag->pag_ici_lock); 2873 read_lock(&pag->pag_ici_lock);
2887 /* really need a gang lookup range call here */ 2874 /* really need a gang lookup range call here */
2888 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2875 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2889 first_index, inodes_per_cluster); 2876 first_index, inodes_per_cluster);
2890 if (nr_found == 0) 2877 if (nr_found == 0)
2891 goto out_free; 2878 goto out_free;
2892 2879
2893 for (i = 0; i < nr_found; i++) { 2880 for (i = 0; i < nr_found; i++) {
2894 iq = ilist[i]; 2881 iq = ilist[i];
2895 if (iq == ip) 2882 if (iq == ip)
2896 continue; 2883 continue;
2897 /* if the inode lies outside this cluster, we're done. */ 2884 /* if the inode lies outside this cluster, we're done. */
2898 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2885 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
2899 break; 2886 break;
2900 /* 2887 /*
2901 * Do an un-protected check to see if the inode is dirty and 2888 * Do an un-protected check to see if the inode is dirty and
2902 * is a candidate for flushing. These checks will be repeated 2889 * is a candidate for flushing. These checks will be repeated
2903 * later after the appropriate locks are acquired. 2890 * later after the appropriate locks are acquired.
2904 */ 2891 */
2905 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) 2892 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2906 continue; 2893 continue;
2907 2894
2908 /* 2895 /*
2909 * Try to get locks. If any are unavailable or it is pinned, 2896 * Try to get locks. If any are unavailable or it is pinned,
2910 * then this inode cannot be flushed and is skipped. 2897 * then this inode cannot be flushed and is skipped.
2911 */ 2898 */
2912 2899
2913 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) 2900 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2914 continue; 2901 continue;
2915 if (!xfs_iflock_nowait(iq)) { 2902 if (!xfs_iflock_nowait(iq)) {
2916 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2903 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2917 continue; 2904 continue;
2918 } 2905 }
2919 if (xfs_ipincount(iq)) { 2906 if (xfs_ipincount(iq)) {
2920 xfs_ifunlock(iq); 2907 xfs_ifunlock(iq);
2921 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2908 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2922 continue; 2909 continue;
2923 } 2910 }
2924 2911
2925 /* 2912 /*
2926 * arriving here means that this inode can be flushed. First 2913 * arriving here means that this inode can be flushed. First
2927 * re-check that it's dirty before flushing. 2914 * re-check that it's dirty before flushing.
2928 */ 2915 */
2929 if (!xfs_inode_clean(iq)) { 2916 if (!xfs_inode_clean(iq)) {
2930 int error; 2917 int error;
2931 error = xfs_iflush_int(iq, bp); 2918 error = xfs_iflush_int(iq, bp);
2932 if (error) { 2919 if (error) {
2933 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2920 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2934 goto cluster_corrupt_out; 2921 goto cluster_corrupt_out;
2935 } 2922 }
2936 clcount++; 2923 clcount++;
2937 } else { 2924 } else {
2938 xfs_ifunlock(iq); 2925 xfs_ifunlock(iq);
2939 } 2926 }
2940 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2927 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2941 } 2928 }
2942 2929
2943 if (clcount) { 2930 if (clcount) {
2944 XFS_STATS_INC(xs_icluster_flushcnt); 2931 XFS_STATS_INC(xs_icluster_flushcnt);
2945 XFS_STATS_ADD(xs_icluster_flushinode, clcount); 2932 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2946 } 2933 }
2947 2934
2948 out_free: 2935 out_free:
2949 read_unlock(&pag->pag_ici_lock); 2936 read_unlock(&pag->pag_ici_lock);
2950 kmem_free(ilist); 2937 kmem_free(ilist);
2951 return 0; 2938 return 0;
2952 2939
2953 2940
2954 cluster_corrupt_out: 2941 cluster_corrupt_out:
2955 /* 2942 /*
2956 * Corruption detected in the clustering loop. Invalidate the 2943 * Corruption detected in the clustering loop. Invalidate the
2957 * inode buffer and shut down the filesystem. 2944 * inode buffer and shut down the filesystem.
2958 */ 2945 */
2959 read_unlock(&pag->pag_ici_lock); 2946 read_unlock(&pag->pag_ici_lock);
2960 /* 2947 /*
2961 * Clean up the buffer. If it was B_DELWRI, just release it -- 2948 * Clean up the buffer. If it was B_DELWRI, just release it --
2962 * brelse can handle it with no problems. If not, shut down the 2949 * brelse can handle it with no problems. If not, shut down the
2963 * filesystem before releasing the buffer. 2950 * filesystem before releasing the buffer.
2964 */ 2951 */
2965 bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); 2952 bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
2966 if (bufwasdelwri) 2953 if (bufwasdelwri)
2967 xfs_buf_relse(bp); 2954 xfs_buf_relse(bp);
2968 2955
2969 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2956 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2970 2957
2971 if (!bufwasdelwri) { 2958 if (!bufwasdelwri) {
2972 /* 2959 /*
2973 * Just like incore_relse: if we have b_iodone functions, 2960 * Just like incore_relse: if we have b_iodone functions,
2974 * mark the buffer as an error and call them. Otherwise 2961 * mark the buffer as an error and call them. Otherwise
2975 * mark it as stale and brelse. 2962 * mark it as stale and brelse.
2976 */ 2963 */
2977 if (XFS_BUF_IODONE_FUNC(bp)) { 2964 if (XFS_BUF_IODONE_FUNC(bp)) {
2978 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 2965 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
2979 XFS_BUF_UNDONE(bp); 2966 XFS_BUF_UNDONE(bp);
2980 XFS_BUF_STALE(bp); 2967 XFS_BUF_STALE(bp);
2981 XFS_BUF_SHUT(bp); 2968 XFS_BUF_SHUT(bp);
2982 XFS_BUF_ERROR(bp,EIO); 2969 XFS_BUF_ERROR(bp,EIO);
2983 xfs_biodone(bp); 2970 xfs_biodone(bp);
2984 } else { 2971 } else {
2985 XFS_BUF_STALE(bp); 2972 XFS_BUF_STALE(bp);
2986 xfs_buf_relse(bp); 2973 xfs_buf_relse(bp);
2987 } 2974 }
2988 } 2975 }
2989 2976
2990 /* 2977 /*
2991 * Unlocks the flush lock 2978 * Unlocks the flush lock
2992 */ 2979 */
2993 xfs_iflush_abort(iq); 2980 xfs_iflush_abort(iq);
2994 kmem_free(ilist); 2981 kmem_free(ilist);
2995 return XFS_ERROR(EFSCORRUPTED); 2982 return XFS_ERROR(EFSCORRUPTED);
2996 } 2983 }
2997 2984
2998 /* 2985 /*
2999 * xfs_iflush() will write a modified inode's changes out to the 2986 * xfs_iflush() will write a modified inode's changes out to the
3000 * inode's on disk home. The caller must have the inode lock held 2987 * inode's on disk home. The caller must have the inode lock held
3001 * in at least shared mode and the inode flush completion must be 2988 * in at least shared mode and the inode flush completion must be
3002 * active as well. The inode lock will still be held upon return from 2989 * active as well. The inode lock will still be held upon return from
3003 * the call and the caller is free to unlock it. 2990 * the call and the caller is free to unlock it.
3004 * The inode flush will be completed when the inode reaches the disk. 2991 * The inode flush will be completed when the inode reaches the disk.
3005 * The flags indicate how the inode's buffer should be written out. 2992 * The flags indicate how the inode's buffer should be written out.
3006 */ 2993 */
3007 int 2994 int
3008 xfs_iflush( 2995 xfs_iflush(
3009 xfs_inode_t *ip, 2996 xfs_inode_t *ip,
3010 uint flags) 2997 uint flags)
3011 { 2998 {
3012 xfs_inode_log_item_t *iip; 2999 xfs_inode_log_item_t *iip;
3013 xfs_buf_t *bp; 3000 xfs_buf_t *bp;
3014 xfs_dinode_t *dip; 3001 xfs_dinode_t *dip;
3015 xfs_mount_t *mp; 3002 xfs_mount_t *mp;
3016 int error; 3003 int error;
3017 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); 3004 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
3018 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; 3005 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3019 3006
3020 XFS_STATS_INC(xs_iflush_count); 3007 XFS_STATS_INC(xs_iflush_count);
3021 3008
3022 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3009 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3023 ASSERT(!completion_done(&ip->i_flush)); 3010 ASSERT(!completion_done(&ip->i_flush));
3024 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3011 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3025 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3012 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3026 3013
3027 iip = ip->i_itemp; 3014 iip = ip->i_itemp;
3028 mp = ip->i_mount; 3015 mp = ip->i_mount;
3029 3016
3030 /* 3017 /*
3031 * If the inode isn't dirty, then just release the inode 3018 * If the inode isn't dirty, then just release the inode
3032 * flush lock and do nothing. 3019 * flush lock and do nothing.
3033 */ 3020 */
3034 if (xfs_inode_clean(ip)) { 3021 if (xfs_inode_clean(ip)) {
3035 xfs_ifunlock(ip); 3022 xfs_ifunlock(ip);
3036 return 0; 3023 return 0;
3037 } 3024 }
3038 3025
3039 /* 3026 /*
3040 * We can't flush the inode until it is unpinned, so wait for it if we 3027 * We can't flush the inode until it is unpinned, so wait for it if we
3041 * are allowed to block. We know noone new can pin it, because we are 3028 * are allowed to block. We know noone new can pin it, because we are
3042 * holding the inode lock shared and you need to hold it exclusively to 3029 * holding the inode lock shared and you need to hold it exclusively to
3043 * pin the inode. 3030 * pin the inode.
3044 * 3031 *
3045 * If we are not allowed to block, force the log out asynchronously so 3032 * If we are not allowed to block, force the log out asynchronously so
3046 * that when we come back the inode will be unpinned. If other inodes 3033 * that when we come back the inode will be unpinned. If other inodes
3047 * in the same cluster are dirty, they will probably write the inode 3034 * in the same cluster are dirty, they will probably write the inode
3048 * out for us if they occur after the log force completes. 3035 * out for us if they occur after the log force completes.
3049 */ 3036 */
3050 if (noblock && xfs_ipincount(ip)) { 3037 if (noblock && xfs_ipincount(ip)) {
3051 xfs_iunpin_nowait(ip); 3038 xfs_iunpin_nowait(ip);
3052 xfs_ifunlock(ip); 3039 xfs_ifunlock(ip);
3053 return EAGAIN; 3040 return EAGAIN;
3054 } 3041 }
3055 xfs_iunpin_wait(ip); 3042 xfs_iunpin_wait(ip);
3056 3043
3057 /* 3044 /*
3058 * This may have been unpinned because the filesystem is shutting 3045 * This may have been unpinned because the filesystem is shutting
3059 * down forcibly. If that's the case we must not write this inode 3046 * down forcibly. If that's the case we must not write this inode
3060 * to disk, because the log record didn't make it to disk! 3047 * to disk, because the log record didn't make it to disk!
3061 */ 3048 */
3062 if (XFS_FORCED_SHUTDOWN(mp)) { 3049 if (XFS_FORCED_SHUTDOWN(mp)) {
3063 ip->i_update_core = 0; 3050 ip->i_update_core = 0;
3064 if (iip) 3051 if (iip)
3065 iip->ili_format.ilf_fields = 0; 3052 iip->ili_format.ilf_fields = 0;
3066 xfs_ifunlock(ip); 3053 xfs_ifunlock(ip);
3067 return XFS_ERROR(EIO); 3054 return XFS_ERROR(EIO);
3068 } 3055 }
3069 3056
3070 /* 3057 /*
3071 * Decide how buffer will be flushed out. This is done before 3058 * Decide how buffer will be flushed out. This is done before
3072 * the call to xfs_iflush_int because this field is zeroed by it. 3059 * the call to xfs_iflush_int because this field is zeroed by it.
3073 */ 3060 */
3074 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3061 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3075 /* 3062 /*
3076 * Flush out the inode buffer according to the directions 3063 * Flush out the inode buffer according to the directions
3077 * of the caller. In the cases where the caller has given 3064 * of the caller. In the cases where the caller has given
3078 * us a choice choose the non-delwri case. This is because 3065 * us a choice choose the non-delwri case. This is because
3079 * the inode is in the AIL and we need to get it out soon. 3066 * the inode is in the AIL and we need to get it out soon.
3080 */ 3067 */
3081 switch (flags) { 3068 switch (flags) {
3082 case XFS_IFLUSH_SYNC: 3069 case XFS_IFLUSH_SYNC:
3083 case XFS_IFLUSH_DELWRI_ELSE_SYNC: 3070 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3084 flags = 0; 3071 flags = 0;
3085 break; 3072 break;
3086 case XFS_IFLUSH_ASYNC_NOBLOCK: 3073 case XFS_IFLUSH_ASYNC_NOBLOCK:
3087 case XFS_IFLUSH_ASYNC: 3074 case XFS_IFLUSH_ASYNC:
3088 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 3075 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3089 flags = INT_ASYNC; 3076 flags = INT_ASYNC;
3090 break; 3077 break;
3091 case XFS_IFLUSH_DELWRI: 3078 case XFS_IFLUSH_DELWRI:
3092 flags = INT_DELWRI; 3079 flags = INT_DELWRI;
3093 break; 3080 break;
3094 default: 3081 default:
3095 ASSERT(0); 3082 ASSERT(0);
3096 flags = 0; 3083 flags = 0;
3097 break; 3084 break;
3098 } 3085 }
3099 } else { 3086 } else {
3100 switch (flags) { 3087 switch (flags) {
3101 case XFS_IFLUSH_DELWRI_ELSE_SYNC: 3088 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3102 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 3089 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3103 case XFS_IFLUSH_DELWRI: 3090 case XFS_IFLUSH_DELWRI:
3104 flags = INT_DELWRI; 3091 flags = INT_DELWRI;
3105 break; 3092 break;
3106 case XFS_IFLUSH_ASYNC_NOBLOCK: 3093 case XFS_IFLUSH_ASYNC_NOBLOCK:
3107 case XFS_IFLUSH_ASYNC: 3094 case XFS_IFLUSH_ASYNC:
3108 flags = INT_ASYNC; 3095 flags = INT_ASYNC;
3109 break; 3096 break;
3110 case XFS_IFLUSH_SYNC: 3097 case XFS_IFLUSH_SYNC:
3111 flags = 0; 3098 flags = 0;
3112 break; 3099 break;
3113 default: 3100 default:
3114 ASSERT(0); 3101 ASSERT(0);
3115 flags = 0; 3102 flags = 0;
3116 break; 3103 break;
3117 } 3104 }
3118 } 3105 }
3119 3106
3120 /* 3107 /*
3121 * Get the buffer containing the on-disk inode. 3108 * Get the buffer containing the on-disk inode.
3122 */ 3109 */
3123 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 3110 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
3124 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 3111 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
3125 if (error || !bp) { 3112 if (error || !bp) {
3126 xfs_ifunlock(ip); 3113 xfs_ifunlock(ip);
3127 return error; 3114 return error;
3128 } 3115 }
3129 3116
3130 /* 3117 /*
3131 * First flush out the inode that xfs_iflush was called with. 3118 * First flush out the inode that xfs_iflush was called with.
3132 */ 3119 */
3133 error = xfs_iflush_int(ip, bp); 3120 error = xfs_iflush_int(ip, bp);
3134 if (error) 3121 if (error)
3135 goto corrupt_out; 3122 goto corrupt_out;
3136 3123
3137 /* 3124 /*
3138 * If the buffer is pinned then push on the log now so we won't 3125 * If the buffer is pinned then push on the log now so we won't
3139 * get stuck waiting in the write for too long. 3126 * get stuck waiting in the write for too long.
3140 */ 3127 */
3141 if (XFS_BUF_ISPINNED(bp)) 3128 if (XFS_BUF_ISPINNED(bp))
3142 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 3129 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3143 3130
3144 /* 3131 /*
3145 * inode clustering: 3132 * inode clustering:
3146 * see if other inodes can be gathered into this write 3133 * see if other inodes can be gathered into this write
3147 */ 3134 */
3148 error = xfs_iflush_cluster(ip, bp); 3135 error = xfs_iflush_cluster(ip, bp);
3149 if (error) 3136 if (error)
3150 goto cluster_corrupt_out; 3137 goto cluster_corrupt_out;
3151 3138
3152 if (flags & INT_DELWRI) { 3139 if (flags & INT_DELWRI) {
3153 xfs_bdwrite(mp, bp); 3140 xfs_bdwrite(mp, bp);
3154 } else if (flags & INT_ASYNC) { 3141 } else if (flags & INT_ASYNC) {
3155 error = xfs_bawrite(mp, bp); 3142 error = xfs_bawrite(mp, bp);
3156 } else { 3143 } else {
3157 error = xfs_bwrite(mp, bp); 3144 error = xfs_bwrite(mp, bp);
3158 } 3145 }
3159 return error; 3146 return error;
3160 3147
3161 corrupt_out: 3148 corrupt_out:
3162 xfs_buf_relse(bp); 3149 xfs_buf_relse(bp);
3163 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3150 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3164 cluster_corrupt_out: 3151 cluster_corrupt_out:
3165 /* 3152 /*
3166 * Unlocks the flush lock 3153 * Unlocks the flush lock
3167 */ 3154 */
3168 xfs_iflush_abort(ip); 3155 xfs_iflush_abort(ip);
3169 return XFS_ERROR(EFSCORRUPTED); 3156 return XFS_ERROR(EFSCORRUPTED);
3170 } 3157 }
3171 3158
3172 3159
3173 STATIC int 3160 STATIC int
3174 xfs_iflush_int( 3161 xfs_iflush_int(
3175 xfs_inode_t *ip, 3162 xfs_inode_t *ip,
3176 xfs_buf_t *bp) 3163 xfs_buf_t *bp)
3177 { 3164 {
3178 xfs_inode_log_item_t *iip; 3165 xfs_inode_log_item_t *iip;
3179 xfs_dinode_t *dip; 3166 xfs_dinode_t *dip;
3180 xfs_mount_t *mp; 3167 xfs_mount_t *mp;
3181 #ifdef XFS_TRANS_DEBUG 3168 #ifdef XFS_TRANS_DEBUG
3182 int first; 3169 int first;
3183 #endif 3170 #endif
3184 3171
3185 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3172 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3186 ASSERT(!completion_done(&ip->i_flush)); 3173 ASSERT(!completion_done(&ip->i_flush));
3187 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3174 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3188 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3175 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3189 3176
3190 iip = ip->i_itemp; 3177 iip = ip->i_itemp;
3191 mp = ip->i_mount; 3178 mp = ip->i_mount;
3192 3179
3193 3180
3194 /* 3181 /*
3195 * If the inode isn't dirty, then just release the inode 3182 * If the inode isn't dirty, then just release the inode
3196 * flush lock and do nothing. 3183 * flush lock and do nothing.
3197 */ 3184 */
3198 if (xfs_inode_clean(ip)) { 3185 if (xfs_inode_clean(ip)) {
3199 xfs_ifunlock(ip); 3186 xfs_ifunlock(ip);
3200 return 0; 3187 return 0;
3201 } 3188 }
3202 3189
3203 /* set *dip = inode's place in the buffer */ 3190 /* set *dip = inode's place in the buffer */
3204 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); 3191 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3205 3192
3206 /* 3193 /*
3207 * Clear i_update_core before copying out the data. 3194 * Clear i_update_core before copying out the data.
3208 * This is for coordination with our timestamp updates 3195 * This is for coordination with our timestamp updates
3209 * that don't hold the inode lock. They will always 3196 * that don't hold the inode lock. They will always
3210 * update the timestamps BEFORE setting i_update_core, 3197 * update the timestamps BEFORE setting i_update_core,
3211 * so if we clear i_update_core after they set it we 3198 * so if we clear i_update_core after they set it we
3212 * are guaranteed to see their updates to the timestamps. 3199 * are guaranteed to see their updates to the timestamps.
3213 * I believe that this depends on strongly ordered memory 3200 * I believe that this depends on strongly ordered memory
3214 * semantics, but we have that. We use the SYNCHRONIZE 3201 * semantics, but we have that. We use the SYNCHRONIZE
3215 * macro to make sure that the compiler does not reorder 3202 * macro to make sure that the compiler does not reorder
3216 * the i_update_core access below the data copy below. 3203 * the i_update_core access below the data copy below.
3217 */ 3204 */
3218 ip->i_update_core = 0; 3205 ip->i_update_core = 0;
3219 SYNCHRONIZE(); 3206 SYNCHRONIZE();
3220 3207
3221 /* 3208 /*
3222 * Make sure to get the latest atime from the Linux inode. 3209 * Make sure to get the latest atime from the Linux inode.
3223 */ 3210 */
3224 xfs_synchronize_atime(ip); 3211 xfs_synchronize_atime(ip);
3225 3212
3226 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 3213 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
3227 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3214 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3228 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3215 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3229 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 3216 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3230 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 3217 ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3231 goto corrupt_out; 3218 goto corrupt_out;
3232 } 3219 }
3233 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 3220 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
3234 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 3221 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
3235 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3222 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3236 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 3223 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
3237 ip->i_ino, ip, ip->i_d.di_magic); 3224 ip->i_ino, ip, ip->i_d.di_magic);
3238 goto corrupt_out; 3225 goto corrupt_out;
3239 } 3226 }
3240 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 3227 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
3241 if (XFS_TEST_ERROR( 3228 if (XFS_TEST_ERROR(
3242 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 3229 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3243 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 3230 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3244 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 3231 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3245 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3232 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3246 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 3233 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
3247 ip->i_ino, ip); 3234 ip->i_ino, ip);
3248 goto corrupt_out; 3235 goto corrupt_out;
3249 } 3236 }
3250 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 3237 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
3251 if (XFS_TEST_ERROR( 3238 if (XFS_TEST_ERROR(
3252 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 3239 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3253 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 3240 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3254 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 3241 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3255 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 3242 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3256 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3243 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3257 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 3244 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
3258 ip->i_ino, ip); 3245 ip->i_ino, ip);
3259 goto corrupt_out; 3246 goto corrupt_out;
3260 } 3247 }
3261 } 3248 }
3262 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 3249 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3263 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 3250 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3264 XFS_RANDOM_IFLUSH_5)) { 3251 XFS_RANDOM_IFLUSH_5)) {
3265 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3252 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3266 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 3253 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
3267 ip->i_ino, 3254 ip->i_ino,
3268 ip->i_d.di_nextents + ip->i_d.di_anextents, 3255 ip->i_d.di_nextents + ip->i_d.di_anextents,
3269 ip->i_d.di_nblocks, 3256 ip->i_d.di_nblocks,
3270 ip); 3257 ip);
3271 goto corrupt_out; 3258 goto corrupt_out;
3272 } 3259 }
3273 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 3260 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3274 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 3261 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3275 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3262 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3276 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 3263 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3277 ip->i_ino, ip->i_d.di_forkoff, ip); 3264 ip->i_ino, ip->i_d.di_forkoff, ip);
3278 goto corrupt_out; 3265 goto corrupt_out;
3279 } 3266 }
3280 /* 3267 /*
3281 * bump the flush iteration count, used to detect flushes which 3268 * bump the flush iteration count, used to detect flushes which
3282 * postdate a log record during recovery. 3269 * postdate a log record during recovery.
3283 */ 3270 */
3284 3271
3285 ip->i_d.di_flushiter++; 3272 ip->i_d.di_flushiter++;
3286 3273
3287 /* 3274 /*
3288 * Copy the dirty parts of the inode into the on-disk 3275 * Copy the dirty parts of the inode into the on-disk
3289 * inode. We always copy out the core of the inode, 3276 * inode. We always copy out the core of the inode,
3290 * because if the inode is dirty at all the core must 3277 * because if the inode is dirty at all the core must
3291 * be. 3278 * be.
3292 */ 3279 */
3293 xfs_dinode_to_disk(dip, &ip->i_d); 3280 xfs_dinode_to_disk(dip, &ip->i_d);
3294 3281
3295 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3282 /* Wrap, we never let the log put out DI_MAX_FLUSH */
3296 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3283 if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3297 ip->i_d.di_flushiter = 0; 3284 ip->i_d.di_flushiter = 0;
3298 3285
3299 /* 3286 /*
3300 * If this is really an old format inode and the superblock version 3287 * If this is really an old format inode and the superblock version
3301 * has not been updated to support only new format inodes, then 3288 * has not been updated to support only new format inodes, then
3302 * convert back to the old inode format. If the superblock version 3289 * convert back to the old inode format. If the superblock version
3303 * has been updated, then make the conversion permanent. 3290 * has been updated, then make the conversion permanent.
3304 */ 3291 */
3305 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 3292 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
3306 if (ip->i_d.di_version == 1) { 3293 if (ip->i_d.di_version == 1) {
3307 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 3294 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
3308 /* 3295 /*
3309 * Convert it back. 3296 * Convert it back.
3310 */ 3297 */
3311 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3298 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3312 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); 3299 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
3313 } else { 3300 } else {
3314 /* 3301 /*
3315 * The superblock version has already been bumped, 3302 * The superblock version has already been bumped,
3316 * so just make the conversion to the new inode 3303 * so just make the conversion to the new inode
3317 * format permanent. 3304 * format permanent.
3318 */ 3305 */
3319 ip->i_d.di_version = 2; 3306 ip->i_d.di_version = 2;
3320 dip->di_version = 2; 3307 dip->di_version = 2;
3321 ip->i_d.di_onlink = 0; 3308 ip->i_d.di_onlink = 0;
3322 dip->di_onlink = 0; 3309 dip->di_onlink = 0;
3323 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3310 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3324 memset(&(dip->di_pad[0]), 0, 3311 memset(&(dip->di_pad[0]), 0,
3325 sizeof(dip->di_pad)); 3312 sizeof(dip->di_pad));
3326 ASSERT(ip->i_d.di_projid == 0); 3313 ASSERT(ip->i_d.di_projid == 0);
3327 } 3314 }
3328 } 3315 }
3329 3316
3330 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); 3317 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
3331 if (XFS_IFORK_Q(ip)) 3318 if (XFS_IFORK_Q(ip))
3332 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 3319 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3333 xfs_inobp_check(mp, bp); 3320 xfs_inobp_check(mp, bp);
3334 3321
3335 /* 3322 /*
3336 * We've recorded everything logged in the inode, so we'd 3323 * We've recorded everything logged in the inode, so we'd
3337 * like to clear the ilf_fields bits so we don't log and 3324 * like to clear the ilf_fields bits so we don't log and
3338 * flush things unnecessarily. However, we can't stop 3325 * flush things unnecessarily. However, we can't stop
3339 * logging all this information until the data we've copied 3326 * logging all this information until the data we've copied
3340 * into the disk buffer is written to disk. If we did we might 3327 * into the disk buffer is written to disk. If we did we might
3341 * overwrite the copy of the inode in the log with all the 3328 * overwrite the copy of the inode in the log with all the
3342 * data after re-logging only part of it, and in the face of 3329 * data after re-logging only part of it, and in the face of
3343 * a crash we wouldn't have all the data we need to recover. 3330 * a crash we wouldn't have all the data we need to recover.
3344 * 3331 *
3345 * What we do is move the bits to the ili_last_fields field. 3332 * What we do is move the bits to the ili_last_fields field.
3346 * When logging the inode, these bits are moved back to the 3333 * When logging the inode, these bits are moved back to the
3347 * ilf_fields field. In the xfs_iflush_done() routine we 3334 * ilf_fields field. In the xfs_iflush_done() routine we
3348 * clear ili_last_fields, since we know that the information 3335 * clear ili_last_fields, since we know that the information
3349 * those bits represent is permanently on disk. As long as 3336 * those bits represent is permanently on disk. As long as
3350 * the flush completes before the inode is logged again, then 3337 * the flush completes before the inode is logged again, then
3351 * both ilf_fields and ili_last_fields will be cleared. 3338 * both ilf_fields and ili_last_fields will be cleared.
3352 * 3339 *
3353 * We can play with the ilf_fields bits here, because the inode 3340 * We can play with the ilf_fields bits here, because the inode
3354 * lock must be held exclusively in order to set bits there 3341 * lock must be held exclusively in order to set bits there
3355 * and the flush lock protects the ili_last_fields bits. 3342 * and the flush lock protects the ili_last_fields bits.
3356 * Set ili_logged so the flush done 3343 * Set ili_logged so the flush done
3357 * routine can tell whether or not to look in the AIL. 3344 * routine can tell whether or not to look in the AIL.
3358 * Also, store the current LSN of the inode so that we can tell 3345 * Also, store the current LSN of the inode so that we can tell
3359 * whether the item has moved in the AIL from xfs_iflush_done(). 3346 * whether the item has moved in the AIL from xfs_iflush_done().
3360 * In order to read the lsn we need the AIL lock, because 3347 * In order to read the lsn we need the AIL lock, because
3361 * it is a 64 bit value that cannot be read atomically. 3348 * it is a 64 bit value that cannot be read atomically.
3362 */ 3349 */
3363 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3350 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3364 iip->ili_last_fields = iip->ili_format.ilf_fields; 3351 iip->ili_last_fields = iip->ili_format.ilf_fields;
3365 iip->ili_format.ilf_fields = 0; 3352 iip->ili_format.ilf_fields = 0;
3366 iip->ili_logged = 1; 3353 iip->ili_logged = 1;
3367 3354
3368 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3355 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3369 &iip->ili_item.li_lsn); 3356 &iip->ili_item.li_lsn);
3370 3357
3371 /* 3358 /*
3372 * Attach the function xfs_iflush_done to the inode's 3359 * Attach the function xfs_iflush_done to the inode's
3373 * buffer. This will remove the inode from the AIL 3360 * buffer. This will remove the inode from the AIL
3374 * and unlock the inode's flush lock when the inode is 3361 * and unlock the inode's flush lock when the inode is
3375 * completely written to disk. 3362 * completely written to disk.
3376 */ 3363 */
3377 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) 3364 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
3378 xfs_iflush_done, (xfs_log_item_t *)iip); 3365 xfs_iflush_done, (xfs_log_item_t *)iip);
3379 3366
3380 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 3367 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
3381 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 3368 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
3382 } else { 3369 } else {
3383 /* 3370 /*
3384 * We're flushing an inode which is not in the AIL and has 3371 * We're flushing an inode which is not in the AIL and has
3385 * not been logged but has i_update_core set. For this 3372 * not been logged but has i_update_core set. For this
3386 * case we can use a B_DELWRI flush and immediately drop 3373 * case we can use a B_DELWRI flush and immediately drop
3387 * the inode flush lock because we can avoid the whole 3374 * the inode flush lock because we can avoid the whole
3388 * AIL state thing. It's OK to drop the flush lock now, 3375 * AIL state thing. It's OK to drop the flush lock now,
3389 * because we've already locked the buffer and to do anything 3376 * because we've already locked the buffer and to do anything
3390 * you really need both. 3377 * you really need both.
3391 */ 3378 */
3392 if (iip != NULL) { 3379 if (iip != NULL) {
3393 ASSERT(iip->ili_logged == 0); 3380 ASSERT(iip->ili_logged == 0);
3394 ASSERT(iip->ili_last_fields == 0); 3381 ASSERT(iip->ili_last_fields == 0);
3395 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 3382 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
3396 } 3383 }
3397 xfs_ifunlock(ip); 3384 xfs_ifunlock(ip);
3398 } 3385 }
3399 3386
3400 return 0; 3387 return 0;
3401 3388
3402 corrupt_out: 3389 corrupt_out:
3403 return XFS_ERROR(EFSCORRUPTED); 3390 return XFS_ERROR(EFSCORRUPTED);
3404 } 3391 }
3405 3392
3406 3393
3407 3394
#ifdef XFS_ILOCK_TRACE
/* Global ktrace buffer for inode lock tracing (debug builds only). */
ktrace_t	*xfs_ilock_trace_buf;

/*
 * Record one inode lock/unlock event in the inode's trace buffer.
 */
void
xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
{
	ktrace_enter(ip->i_lock_trace,
		     (void *)ip,
		     (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */
		     (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
		     (void *)ra,		/* caller of ilock */
		     (void *)(unsigned long)current_cpu(),
		     (void *)(unsigned long)current_pid(),
		     NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
}
#endif
3424 3411
3425 /* 3412 /*
3426 * Return a pointer to the extent record at file index idx. 3413 * Return a pointer to the extent record at file index idx.
3427 */ 3414 */
3428 xfs_bmbt_rec_host_t * 3415 xfs_bmbt_rec_host_t *
3429 xfs_iext_get_ext( 3416 xfs_iext_get_ext(
3430 xfs_ifork_t *ifp, /* inode fork pointer */ 3417 xfs_ifork_t *ifp, /* inode fork pointer */
3431 xfs_extnum_t idx) /* index of target extent */ 3418 xfs_extnum_t idx) /* index of target extent */
3432 { 3419 {
3433 ASSERT(idx >= 0); 3420 ASSERT(idx >= 0);
3434 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3421 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3435 return ifp->if_u1.if_ext_irec->er_extbuf; 3422 return ifp->if_u1.if_ext_irec->er_extbuf;
3436 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3423 } else if (ifp->if_flags & XFS_IFEXTIREC) {
3437 xfs_ext_irec_t *erp; /* irec pointer */ 3424 xfs_ext_irec_t *erp; /* irec pointer */
3438 int erp_idx = 0; /* irec index */ 3425 int erp_idx = 0; /* irec index */
3439 xfs_extnum_t page_idx = idx; /* ext index in target list */ 3426 xfs_extnum_t page_idx = idx; /* ext index in target list */
3440 3427
3441 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3428 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3442 return &erp->er_extbuf[page_idx]; 3429 return &erp->er_extbuf[page_idx];
3443 } else if (ifp->if_bytes) { 3430 } else if (ifp->if_bytes) {
3444 return &ifp->if_u1.if_extents[idx]; 3431 return &ifp->if_u1.if_extents[idx];
3445 } else { 3432 } else {
3446 return NULL; 3433 return NULL;
3447 } 3434 }
3448 } 3435 }
3449 3436
3450 /* 3437 /*
3451 * Insert new item(s) into the extent records for incore inode 3438 * Insert new item(s) into the extent records for incore inode
3452 * fork 'ifp'. 'count' new items are inserted at index 'idx'. 3439 * fork 'ifp'. 'count' new items are inserted at index 'idx'.
3453 */ 3440 */
3454 void 3441 void
3455 xfs_iext_insert( 3442 xfs_iext_insert(
3456 xfs_ifork_t *ifp, /* inode fork pointer */ 3443 xfs_ifork_t *ifp, /* inode fork pointer */
3457 xfs_extnum_t idx, /* starting index of new items */ 3444 xfs_extnum_t idx, /* starting index of new items */
3458 xfs_extnum_t count, /* number of inserted items */ 3445 xfs_extnum_t count, /* number of inserted items */
3459 xfs_bmbt_irec_t *new) /* items to insert */ 3446 xfs_bmbt_irec_t *new) /* items to insert */
3460 { 3447 {
3461 xfs_extnum_t i; /* extent record index */ 3448 xfs_extnum_t i; /* extent record index */
3462 3449
3463 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3450 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3464 xfs_iext_add(ifp, idx, count); 3451 xfs_iext_add(ifp, idx, count);
3465 for (i = idx; i < idx + count; i++, new++) 3452 for (i = idx; i < idx + count; i++, new++)
3466 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); 3453 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
3467 } 3454 }
3468 3455
3469 /* 3456 /*
3470 * This is called when the amount of space required for incore file 3457 * This is called when the amount of space required for incore file
3471 * extents needs to be increased. The ext_diff parameter stores the 3458 * extents needs to be increased. The ext_diff parameter stores the
3472 * number of new extents being added and the idx parameter contains 3459 * number of new extents being added and the idx parameter contains
3473 * the extent index where the new extents will be added. If the new 3460 * the extent index where the new extents will be added. If the new
3474 * extents are being appended, then we just need to (re)allocate and 3461 * extents are being appended, then we just need to (re)allocate and
3475 * initialize the space. Otherwise, if the new extents are being 3462 * initialize the space. Otherwise, if the new extents are being
3476 * inserted into the middle of the existing entries, a bit more work 3463 * inserted into the middle of the existing entries, a bit more work
3477 * is required to make room for the new extents to be inserted. The 3464 * is required to make room for the new extents to be inserted. The
3478 * caller is responsible for filling in the new extent entries upon 3465 * caller is responsible for filling in the new extent entries upon
3479 * return. 3466 * return.
3480 */ 3467 */
3481 void 3468 void
3482 xfs_iext_add( 3469 xfs_iext_add(
3483 xfs_ifork_t *ifp, /* inode fork pointer */ 3470 xfs_ifork_t *ifp, /* inode fork pointer */
3484 xfs_extnum_t idx, /* index to begin adding exts */ 3471 xfs_extnum_t idx, /* index to begin adding exts */
3485 int ext_diff) /* number of extents to add */ 3472 int ext_diff) /* number of extents to add */
3486 { 3473 {
3487 int byte_diff; /* new bytes being added */ 3474 int byte_diff; /* new bytes being added */
3488 int new_size; /* size of extents after adding */ 3475 int new_size; /* size of extents after adding */
3489 xfs_extnum_t nextents; /* number of extents in file */ 3476 xfs_extnum_t nextents; /* number of extents in file */
3490 3477
3491 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3478 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3492 ASSERT((idx >= 0) && (idx <= nextents)); 3479 ASSERT((idx >= 0) && (idx <= nextents));
3493 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); 3480 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
3494 new_size = ifp->if_bytes + byte_diff; 3481 new_size = ifp->if_bytes + byte_diff;
3495 /* 3482 /*
3496 * If the new number of extents (nextents + ext_diff) 3483 * If the new number of extents (nextents + ext_diff)
3497 * fits inside the inode, then continue to use the inline 3484 * fits inside the inode, then continue to use the inline
3498 * extent buffer. 3485 * extent buffer.
3499 */ 3486 */
3500 if (nextents + ext_diff <= XFS_INLINE_EXTS) { 3487 if (nextents + ext_diff <= XFS_INLINE_EXTS) {
3501 if (idx < nextents) { 3488 if (idx < nextents) {
3502 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], 3489 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
3503 &ifp->if_u2.if_inline_ext[idx], 3490 &ifp->if_u2.if_inline_ext[idx],
3504 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 3491 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3505 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); 3492 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
3506 } 3493 }
3507 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3494 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3508 ifp->if_real_bytes = 0; 3495 ifp->if_real_bytes = 0;
3509 ifp->if_lastex = nextents + ext_diff; 3496 ifp->if_lastex = nextents + ext_diff;
3510 } 3497 }
3511 /* 3498 /*
3512 * Otherwise use a linear (direct) extent list. 3499 * Otherwise use a linear (direct) extent list.
3513 * If the extents are currently inside the inode, 3500 * If the extents are currently inside the inode,
3514 * xfs_iext_realloc_direct will switch us from 3501 * xfs_iext_realloc_direct will switch us from
3515 * inline to direct extent allocation mode. 3502 * inline to direct extent allocation mode.
3516 */ 3503 */
3517 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { 3504 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
3518 xfs_iext_realloc_direct(ifp, new_size); 3505 xfs_iext_realloc_direct(ifp, new_size);
3519 if (idx < nextents) { 3506 if (idx < nextents) {
3520 memmove(&ifp->if_u1.if_extents[idx + ext_diff], 3507 memmove(&ifp->if_u1.if_extents[idx + ext_diff],
3521 &ifp->if_u1.if_extents[idx], 3508 &ifp->if_u1.if_extents[idx],
3522 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 3509 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3523 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); 3510 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
3524 } 3511 }
3525 } 3512 }
3526 /* Indirection array */ 3513 /* Indirection array */
3527 else { 3514 else {
3528 xfs_ext_irec_t *erp; 3515 xfs_ext_irec_t *erp;
3529 int erp_idx = 0; 3516 int erp_idx = 0;
3530 int page_idx = idx; 3517 int page_idx = idx;
3531 3518
3532 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); 3519 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
3533 if (ifp->if_flags & XFS_IFEXTIREC) { 3520 if (ifp->if_flags & XFS_IFEXTIREC) {
3534 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); 3521 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
3535 } else { 3522 } else {
3536 xfs_iext_irec_init(ifp); 3523 xfs_iext_irec_init(ifp);
3537 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3524 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3538 erp = ifp->if_u1.if_ext_irec; 3525 erp = ifp->if_u1.if_ext_irec;
3539 } 3526 }
3540 /* Extents fit in target extent page */ 3527 /* Extents fit in target extent page */
3541 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { 3528 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
3542 if (page_idx < erp->er_extcount) { 3529 if (page_idx < erp->er_extcount) {
3543 memmove(&erp->er_extbuf[page_idx + ext_diff], 3530 memmove(&erp->er_extbuf[page_idx + ext_diff],
3544 &erp->er_extbuf[page_idx], 3531 &erp->er_extbuf[page_idx],
3545 (erp->er_extcount - page_idx) * 3532 (erp->er_extcount - page_idx) *
3546 sizeof(xfs_bmbt_rec_t)); 3533 sizeof(xfs_bmbt_rec_t));
3547 memset(&erp->er_extbuf[page_idx], 0, byte_diff); 3534 memset(&erp->er_extbuf[page_idx], 0, byte_diff);
3548 } 3535 }
3549 erp->er_extcount += ext_diff; 3536 erp->er_extcount += ext_diff;
3550 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3537 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3551 } 3538 }
3552 /* Insert a new extent page */ 3539 /* Insert a new extent page */
3553 else if (erp) { 3540 else if (erp) {
3554 xfs_iext_add_indirect_multi(ifp, 3541 xfs_iext_add_indirect_multi(ifp,
3555 erp_idx, page_idx, ext_diff); 3542 erp_idx, page_idx, ext_diff);
3556 } 3543 }
3557 /* 3544 /*
3558 * If extent(s) are being appended to the last page in 3545 * If extent(s) are being appended to the last page in
3559 * the indirection array and the new extent(s) don't fit 3546 * the indirection array and the new extent(s) don't fit
3560 * in the page, then erp is NULL and erp_idx is set to 3547 * in the page, then erp is NULL and erp_idx is set to
3561 * the next index needed in the indirection array. 3548 * the next index needed in the indirection array.
3562 */ 3549 */
3563 else { 3550 else {
3564 int count = ext_diff; 3551 int count = ext_diff;
3565 3552
3566 while (count) { 3553 while (count) {
3567 erp = xfs_iext_irec_new(ifp, erp_idx); 3554 erp = xfs_iext_irec_new(ifp, erp_idx);
3568 erp->er_extcount = count; 3555 erp->er_extcount = count;
3569 count -= MIN(count, (int)XFS_LINEAR_EXTS); 3556 count -= MIN(count, (int)XFS_LINEAR_EXTS);
3570 if (count) { 3557 if (count) {
3571 erp_idx++; 3558 erp_idx++;
3572 } 3559 }
3573 } 3560 }
3574 } 3561 }
3575 } 3562 }
3576 ifp->if_bytes = new_size; 3563 ifp->if_bytes = new_size;
3577 } 3564 }
3578 3565
3579 /* 3566 /*
3580 * This is called when incore extents are being added to the indirection 3567 * This is called when incore extents are being added to the indirection
3581 * array and the new extents do not fit in the target extent list. The 3568 * array and the new extents do not fit in the target extent list. The
3582 * erp_idx parameter contains the irec index for the target extent list 3569 * erp_idx parameter contains the irec index for the target extent list
3583 * in the indirection array, and the idx parameter contains the extent 3570 * in the indirection array, and the idx parameter contains the extent
3584 * index within the list. The number of extents being added is stored 3571 * index within the list. The number of extents being added is stored
3585 * in the count parameter. 3572 * in the count parameter.
3586 * 3573 *
3587 * |-------| |-------| 3574 * |-------| |-------|
3588 * | | | | idx - number of extents before idx 3575 * | | | | idx - number of extents before idx
3589 * | idx | | count | 3576 * | idx | | count |
3590 * | | | | count - number of extents being inserted at idx 3577 * | | | | count - number of extents being inserted at idx
3591 * |-------| |-------| 3578 * |-------| |-------|
3592 * | count | | nex2 | nex2 - number of extents after idx + count 3579 * | count | | nex2 | nex2 - number of extents after idx + count
3593 * |-------| |-------| 3580 * |-------| |-------|
3594 */ 3581 */
3595 void 3582 void
3596 xfs_iext_add_indirect_multi( 3583 xfs_iext_add_indirect_multi(
3597 xfs_ifork_t *ifp, /* inode fork pointer */ 3584 xfs_ifork_t *ifp, /* inode fork pointer */
3598 int erp_idx, /* target extent irec index */ 3585 int erp_idx, /* target extent irec index */
3599 xfs_extnum_t idx, /* index within target list */ 3586 xfs_extnum_t idx, /* index within target list */
3600 int count) /* new extents being added */ 3587 int count) /* new extents being added */
3601 { 3588 {
3602 int byte_diff; /* new bytes being added */ 3589 int byte_diff; /* new bytes being added */
3603 xfs_ext_irec_t *erp; /* pointer to irec entry */ 3590 xfs_ext_irec_t *erp; /* pointer to irec entry */
3604 xfs_extnum_t ext_diff; /* number of extents to add */ 3591 xfs_extnum_t ext_diff; /* number of extents to add */
3605 xfs_extnum_t ext_cnt; /* new extents still needed */ 3592 xfs_extnum_t ext_cnt; /* new extents still needed */
3606 xfs_extnum_t nex2; /* extents after idx + count */ 3593 xfs_extnum_t nex2; /* extents after idx + count */
3607 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ 3594 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
3608 int nlists; /* number of irec's (lists) */ 3595 int nlists; /* number of irec's (lists) */
3609 3596
3610 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3597 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3611 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3598 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3612 nex2 = erp->er_extcount - idx; 3599 nex2 = erp->er_extcount - idx;
3613 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3600 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3614 3601
3615 /* 3602 /*
3616 * Save second part of target extent list 3603 * Save second part of target extent list
3617 * (all extents past */ 3604 * (all extents past */
3618 if (nex2) { 3605 if (nex2) {
3619 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3606 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3620 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); 3607 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3621 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); 3608 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3622 erp->er_extcount -= nex2; 3609 erp->er_extcount -= nex2;
3623 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); 3610 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3624 memset(&erp->er_extbuf[idx], 0, byte_diff); 3611 memset(&erp->er_extbuf[idx], 0, byte_diff);
3625 } 3612 }
3626 3613
3627 /* 3614 /*
3628 * Add the new extents to the end of the target 3615 * Add the new extents to the end of the target
3629 * list, then allocate new irec record(s) and 3616 * list, then allocate new irec record(s) and
3630 * extent buffer(s) as needed to store the rest 3617 * extent buffer(s) as needed to store the rest
3631 * of the new extents. 3618 * of the new extents.
3632 */ 3619 */
3633 ext_cnt = count; 3620 ext_cnt = count;
3634 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); 3621 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3635 if (ext_diff) { 3622 if (ext_diff) {
3636 erp->er_extcount += ext_diff; 3623 erp->er_extcount += ext_diff;
3637 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3624 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3638 ext_cnt -= ext_diff; 3625 ext_cnt -= ext_diff;
3639 } 3626 }
3640 while (ext_cnt) { 3627 while (ext_cnt) {
3641 erp_idx++; 3628 erp_idx++;
3642 erp = xfs_iext_irec_new(ifp, erp_idx); 3629 erp = xfs_iext_irec_new(ifp, erp_idx);
3643 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); 3630 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3644 erp->er_extcount = ext_diff; 3631 erp->er_extcount = ext_diff;
3645 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3632 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3646 ext_cnt -= ext_diff; 3633 ext_cnt -= ext_diff;
3647 } 3634 }
3648 3635
3649 /* Add nex2 extents back to indirection array */ 3636 /* Add nex2 extents back to indirection array */
3650 if (nex2) { 3637 if (nex2) {
3651 xfs_extnum_t ext_avail; 3638 xfs_extnum_t ext_avail;
3652 int i; 3639 int i;
3653 3640
3654 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3641 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3655 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; 3642 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3656 i = 0; 3643 i = 0;
3657 /* 3644 /*
3658 * If nex2 extents fit in the current page, append 3645 * If nex2 extents fit in the current page, append
3659 * nex2_ep after the new extents. 3646 * nex2_ep after the new extents.
3660 */ 3647 */
3661 if (nex2 <= ext_avail) { 3648 if (nex2 <= ext_avail) {
3662 i = erp->er_extcount; 3649 i = erp->er_extcount;
3663 } 3650 }
3664 /* 3651 /*
3665 * Otherwise, check if space is available in the 3652 * Otherwise, check if space is available in the
3666 * next page. 3653 * next page.
3667 */ 3654 */
3668 else if ((erp_idx < nlists - 1) && 3655 else if ((erp_idx < nlists - 1) &&
3669 (nex2 <= (ext_avail = XFS_LINEAR_EXTS - 3656 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3670 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { 3657 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3671 erp_idx++; 3658 erp_idx++;
3672 erp++; 3659 erp++;
3673 /* Create a hole for nex2 extents */ 3660 /* Create a hole for nex2 extents */
3674 memmove(&erp->er_extbuf[nex2], erp->er_extbuf, 3661 memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3675 erp->er_extcount * sizeof(xfs_bmbt_rec_t)); 3662 erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3676 } 3663 }
3677 /* 3664 /*
3678 * Final choice, create a new extent page for 3665 * Final choice, create a new extent page for
3679 * nex2 extents. 3666 * nex2 extents.
3680 */ 3667 */
3681 else { 3668 else {
3682 erp_idx++; 3669 erp_idx++;
3683 erp = xfs_iext_irec_new(ifp, erp_idx); 3670 erp = xfs_iext_irec_new(ifp, erp_idx);
3684 } 3671 }
3685 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); 3672 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3686 kmem_free(nex2_ep); 3673 kmem_free(nex2_ep);
3687 erp->er_extcount += nex2; 3674 erp->er_extcount += nex2;
3688 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); 3675 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3689 } 3676 }
3690 } 3677 }
3691 3678
3692 /* 3679 /*
3693 * This is called when the amount of space required for incore file 3680 * This is called when the amount of space required for incore file
3694 * extents needs to be decreased. The ext_diff parameter stores the 3681 * extents needs to be decreased. The ext_diff parameter stores the
3695 * number of extents to be removed and the idx parameter contains 3682 * number of extents to be removed and the idx parameter contains
3696 * the extent index where the extents will be removed from. 3683 * the extent index where the extents will be removed from.
3697 * 3684 *
3698 * If the amount of space needed has decreased below the linear 3685 * If the amount of space needed has decreased below the linear
3699 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous 3686 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
3700 * extent array. Otherwise, use kmem_realloc() to adjust the 3687 * extent array. Otherwise, use kmem_realloc() to adjust the
3701 * size to what is needed. 3688 * size to what is needed.
3702 */ 3689 */
3703 void 3690 void
3704 xfs_iext_remove( 3691 xfs_iext_remove(
3705 xfs_ifork_t *ifp, /* inode fork pointer */ 3692 xfs_ifork_t *ifp, /* inode fork pointer */
3706 xfs_extnum_t idx, /* index to begin removing exts */ 3693 xfs_extnum_t idx, /* index to begin removing exts */
3707 int ext_diff) /* number of extents to remove */ 3694 int ext_diff) /* number of extents to remove */
3708 { 3695 {
3709 xfs_extnum_t nextents; /* number of extents in file */ 3696 xfs_extnum_t nextents; /* number of extents in file */
3710 int new_size; /* size of extents after removal */ 3697 int new_size; /* size of extents after removal */
3711 3698
3712 ASSERT(ext_diff > 0); 3699 ASSERT(ext_diff > 0);
3713 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3700 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3714 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 3701 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
3715 3702
3716 if (new_size == 0) { 3703 if (new_size == 0) {
3717 xfs_iext_destroy(ifp); 3704 xfs_iext_destroy(ifp);
3718 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3705 } else if (ifp->if_flags & XFS_IFEXTIREC) {
3719 xfs_iext_remove_indirect(ifp, idx, ext_diff); 3706 xfs_iext_remove_indirect(ifp, idx, ext_diff);
3720 } else if (ifp->if_real_bytes) { 3707 } else if (ifp->if_real_bytes) {
3721 xfs_iext_remove_direct(ifp, idx, ext_diff); 3708 xfs_iext_remove_direct(ifp, idx, ext_diff);
3722 } else { 3709 } else {
3723 xfs_iext_remove_inline(ifp, idx, ext_diff); 3710 xfs_iext_remove_inline(ifp, idx, ext_diff);
3724 } 3711 }
3725 ifp->if_bytes = new_size; 3712 ifp->if_bytes = new_size;
3726 } 3713 }
3727 3714
3728 /* 3715 /*
3729 * This removes ext_diff extents from the inline buffer, beginning 3716 * This removes ext_diff extents from the inline buffer, beginning
3730 * at extent index idx. 3717 * at extent index idx.
3731 */ 3718 */
3732 void 3719 void
3733 xfs_iext_remove_inline( 3720 xfs_iext_remove_inline(
3734 xfs_ifork_t *ifp, /* inode fork pointer */ 3721 xfs_ifork_t *ifp, /* inode fork pointer */
3735 xfs_extnum_t idx, /* index to begin removing exts */ 3722 xfs_extnum_t idx, /* index to begin removing exts */
3736 int ext_diff) /* number of extents to remove */ 3723 int ext_diff) /* number of extents to remove */
3737 { 3724 {
3738 int nextents; /* number of extents in file */ 3725 int nextents; /* number of extents in file */
3739 3726
3740 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3727 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3741 ASSERT(idx < XFS_INLINE_EXTS); 3728 ASSERT(idx < XFS_INLINE_EXTS);
3742 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3729 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3743 ASSERT(((nextents - ext_diff) > 0) && 3730 ASSERT(((nextents - ext_diff) > 0) &&
3744 (nextents - ext_diff) < XFS_INLINE_EXTS); 3731 (nextents - ext_diff) < XFS_INLINE_EXTS);
3745 3732
3746 if (idx + ext_diff < nextents) { 3733 if (idx + ext_diff < nextents) {
3747 memmove(&ifp->if_u2.if_inline_ext[idx], 3734 memmove(&ifp->if_u2.if_inline_ext[idx],
3748 &ifp->if_u2.if_inline_ext[idx + ext_diff], 3735 &ifp->if_u2.if_inline_ext[idx + ext_diff],
3749 (nextents - (idx + ext_diff)) * 3736 (nextents - (idx + ext_diff)) *
3750 sizeof(xfs_bmbt_rec_t)); 3737 sizeof(xfs_bmbt_rec_t));
3751 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], 3738 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3752 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3739 0, ext_diff * sizeof(xfs_bmbt_rec_t));
3753 } else { 3740 } else {
3754 memset(&ifp->if_u2.if_inline_ext[idx], 0, 3741 memset(&ifp->if_u2.if_inline_ext[idx], 0,
3755 ext_diff * sizeof(xfs_bmbt_rec_t)); 3742 ext_diff * sizeof(xfs_bmbt_rec_t));
3756 } 3743 }
3757 } 3744 }
3758 3745
3759 /* 3746 /*
3760 * This removes ext_diff extents from a linear (direct) extent list, 3747 * This removes ext_diff extents from a linear (direct) extent list,
3761 * beginning at extent index idx. If the extents are being removed 3748 * beginning at extent index idx. If the extents are being removed
3762 * from the end of the list (ie. truncate) then we just need to re- 3749 * from the end of the list (ie. truncate) then we just need to re-
3763 * allocate the list to remove the extra space. Otherwise, if the 3750 * allocate the list to remove the extra space. Otherwise, if the
3764 * extents are being removed from the middle of the existing extent 3751 * extents are being removed from the middle of the existing extent
3765 * entries, then we first need to move the extent records beginning 3752 * entries, then we first need to move the extent records beginning
3766 * at idx + ext_diff up in the list to overwrite the records being 3753 * at idx + ext_diff up in the list to overwrite the records being
3767 * removed, then remove the extra space via kmem_realloc. 3754 * removed, then remove the extra space via kmem_realloc.
3768 */ 3755 */
3769 void 3756 void
3770 xfs_iext_remove_direct( 3757 xfs_iext_remove_direct(
3771 xfs_ifork_t *ifp, /* inode fork pointer */ 3758 xfs_ifork_t *ifp, /* inode fork pointer */
3772 xfs_extnum_t idx, /* index to begin removing exts */ 3759 xfs_extnum_t idx, /* index to begin removing exts */
3773 int ext_diff) /* number of extents to remove */ 3760 int ext_diff) /* number of extents to remove */
3774 { 3761 {
3775 xfs_extnum_t nextents; /* number of extents in file */ 3762 xfs_extnum_t nextents; /* number of extents in file */
3776 int new_size; /* size of extents after removal */ 3763 int new_size; /* size of extents after removal */
3777 3764
3778 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3765 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3779 new_size = ifp->if_bytes - 3766 new_size = ifp->if_bytes -
3780 (ext_diff * sizeof(xfs_bmbt_rec_t)); 3767 (ext_diff * sizeof(xfs_bmbt_rec_t));
3781 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3768 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3782 3769
3783 if (new_size == 0) { 3770 if (new_size == 0) {
3784 xfs_iext_destroy(ifp); 3771 xfs_iext_destroy(ifp);
3785 return; 3772 return;
3786 } 3773 }
3787 /* Move extents up in the list (if needed) */ 3774 /* Move extents up in the list (if needed) */
3788 if (idx + ext_diff < nextents) { 3775 if (idx + ext_diff < nextents) {
3789 memmove(&ifp->if_u1.if_extents[idx], 3776 memmove(&ifp->if_u1.if_extents[idx],
3790 &ifp->if_u1.if_extents[idx + ext_diff], 3777 &ifp->if_u1.if_extents[idx + ext_diff],
3791 (nextents - (idx + ext_diff)) * 3778 (nextents - (idx + ext_diff)) *
3792 sizeof(xfs_bmbt_rec_t)); 3779 sizeof(xfs_bmbt_rec_t));
3793 } 3780 }
3794 memset(&ifp->if_u1.if_extents[nextents - ext_diff], 3781 memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3795 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3782 0, ext_diff * sizeof(xfs_bmbt_rec_t));
3796 /* 3783 /*
3797 * Reallocate the direct extent list. If the extents 3784 * Reallocate the direct extent list. If the extents
3798 * will fit inside the inode then xfs_iext_realloc_direct 3785 * will fit inside the inode then xfs_iext_realloc_direct
3799 * will switch from direct to inline extent allocation 3786 * will switch from direct to inline extent allocation
3800 * mode for us. 3787 * mode for us.
3801 */ 3788 */
3802 xfs_iext_realloc_direct(ifp, new_size); 3789 xfs_iext_realloc_direct(ifp, new_size);
3803 ifp->if_bytes = new_size; 3790 ifp->if_bytes = new_size;
3804 } 3791 }
3805 3792
3806 /* 3793 /*
3807 * This is called when incore extents are being removed from the 3794 * This is called when incore extents are being removed from the
3808 * indirection array and the extents being removed span multiple extent 3795 * indirection array and the extents being removed span multiple extent
3809 * buffers. The idx parameter contains the file extent index where we 3796 * buffers. The idx parameter contains the file extent index where we
3810 * want to begin removing extents, and the count parameter contains 3797 * want to begin removing extents, and the count parameter contains
3811 * how many extents need to be removed. 3798 * how many extents need to be removed.
3812 * 3799 *
3813 * |-------| |-------| 3800 * |-------| |-------|
3814 * | nex1 | | | nex1 - number of extents before idx 3801 * | nex1 | | | nex1 - number of extents before idx
3815 * |-------| | count | 3802 * |-------| | count |
3816 * | | | | count - number of extents being removed at idx 3803 * | | | | count - number of extents being removed at idx
3817 * | count | |-------| 3804 * | count | |-------|
3818 * | | | nex2 | nex2 - number of extents after idx + count 3805 * | | | nex2 | nex2 - number of extents after idx + count
3819 * |-------| |-------| 3806 * |-------| |-------|
3820 */ 3807 */
3821 void 3808 void
3822 xfs_iext_remove_indirect( 3809 xfs_iext_remove_indirect(
3823 xfs_ifork_t *ifp, /* inode fork pointer */ 3810 xfs_ifork_t *ifp, /* inode fork pointer */
3824 xfs_extnum_t idx, /* index to begin removing extents */ 3811 xfs_extnum_t idx, /* index to begin removing extents */
3825 int count) /* number of extents to remove */ 3812 int count) /* number of extents to remove */
3826 { 3813 {
3827 xfs_ext_irec_t *erp; /* indirection array pointer */ 3814 xfs_ext_irec_t *erp; /* indirection array pointer */
3828 int erp_idx = 0; /* indirection array index */ 3815 int erp_idx = 0; /* indirection array index */
3829 xfs_extnum_t ext_cnt; /* extents left to remove */ 3816 xfs_extnum_t ext_cnt; /* extents left to remove */
3830 xfs_extnum_t ext_diff; /* extents to remove in current list */ 3817 xfs_extnum_t ext_diff; /* extents to remove in current list */
3831 xfs_extnum_t nex1; /* number of extents before idx */ 3818 xfs_extnum_t nex1; /* number of extents before idx */
3832 xfs_extnum_t nex2; /* extents after idx + count */ 3819 xfs_extnum_t nex2; /* extents after idx + count */
3833 int nlists; /* entries in indirection array */ 3820 int nlists; /* entries in indirection array */
3834 int page_idx = idx; /* index in target extent list */ 3821 int page_idx = idx; /* index in target extent list */
3835 3822
3836 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3823 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3837 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3824 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3838 ASSERT(erp != NULL); 3825 ASSERT(erp != NULL);
3839 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3826 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3840 nex1 = page_idx; 3827 nex1 = page_idx;
3841 ext_cnt = count; 3828 ext_cnt = count;
3842 while (ext_cnt) { 3829 while (ext_cnt) {
3843 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); 3830 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
3844 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); 3831 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
3845 /* 3832 /*
3846 * Check for deletion of entire list; 3833 * Check for deletion of entire list;
3847 * xfs_iext_irec_remove() updates extent offsets. 3834 * xfs_iext_irec_remove() updates extent offsets.
3848 */ 3835 */
3849 if (ext_diff == erp->er_extcount) { 3836 if (ext_diff == erp->er_extcount) {
3850 xfs_iext_irec_remove(ifp, erp_idx); 3837 xfs_iext_irec_remove(ifp, erp_idx);
3851 ext_cnt -= ext_diff; 3838 ext_cnt -= ext_diff;
3852 nex1 = 0; 3839 nex1 = 0;
3853 if (ext_cnt) { 3840 if (ext_cnt) {
3854 ASSERT(erp_idx < ifp->if_real_bytes / 3841 ASSERT(erp_idx < ifp->if_real_bytes /
3855 XFS_IEXT_BUFSZ); 3842 XFS_IEXT_BUFSZ);
3856 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3843 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3857 nex1 = 0; 3844 nex1 = 0;
3858 continue; 3845 continue;
3859 } else { 3846 } else {
3860 break; 3847 break;
3861 } 3848 }
3862 } 3849 }
3863 /* Move extents up (if needed) */ 3850 /* Move extents up (if needed) */
3864 if (nex2) { 3851 if (nex2) {
3865 memmove(&erp->er_extbuf[nex1], 3852 memmove(&erp->er_extbuf[nex1],
3866 &erp->er_extbuf[nex1 + ext_diff], 3853 &erp->er_extbuf[nex1 + ext_diff],
3867 nex2 * sizeof(xfs_bmbt_rec_t)); 3854 nex2 * sizeof(xfs_bmbt_rec_t));
3868 } 3855 }
3869 /* Zero out rest of page */ 3856 /* Zero out rest of page */
3870 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - 3857 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
3871 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); 3858 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
3872 /* Update remaining counters */ 3859 /* Update remaining counters */
3873 erp->er_extcount -= ext_diff; 3860 erp->er_extcount -= ext_diff;
3874 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); 3861 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
3875 ext_cnt -= ext_diff; 3862 ext_cnt -= ext_diff;
3876 nex1 = 0; 3863 nex1 = 0;
3877 erp_idx++; 3864 erp_idx++;
3878 erp++; 3865 erp++;
3879 } 3866 }
3880 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); 3867 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
3881 xfs_iext_irec_compact(ifp); 3868 xfs_iext_irec_compact(ifp);
3882 } 3869 }
3883 3870
3884 /* 3871 /*
3885 * Create, destroy, or resize a linear (direct) block of extents. 3872 * Create, destroy, or resize a linear (direct) block of extents.
3886 */ 3873 */
3887 void 3874 void
3888 xfs_iext_realloc_direct( 3875 xfs_iext_realloc_direct(
3889 xfs_ifork_t *ifp, /* inode fork pointer */ 3876 xfs_ifork_t *ifp, /* inode fork pointer */
3890 int new_size) /* new size of extents */ 3877 int new_size) /* new size of extents */
3891 { 3878 {
3892 int rnew_size; /* real new size of extents */ 3879 int rnew_size; /* real new size of extents */
3893 3880
3894 rnew_size = new_size; 3881 rnew_size = new_size;
3895 3882
3896 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || 3883 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
3897 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && 3884 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
3898 (new_size != ifp->if_real_bytes))); 3885 (new_size != ifp->if_real_bytes)));
3899 3886
3900 /* Free extent records */ 3887 /* Free extent records */
3901 if (new_size == 0) { 3888 if (new_size == 0) {
3902 xfs_iext_destroy(ifp); 3889 xfs_iext_destroy(ifp);
3903 } 3890 }
3904 /* Resize direct extent list and zero any new bytes */ 3891 /* Resize direct extent list and zero any new bytes */
3905 else if (ifp->if_real_bytes) { 3892 else if (ifp->if_real_bytes) {
3906 /* Check if extents will fit inside the inode */ 3893 /* Check if extents will fit inside the inode */
3907 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { 3894 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
3908 xfs_iext_direct_to_inline(ifp, new_size / 3895 xfs_iext_direct_to_inline(ifp, new_size /
3909 (uint)sizeof(xfs_bmbt_rec_t)); 3896 (uint)sizeof(xfs_bmbt_rec_t));
3910 ifp->if_bytes = new_size; 3897 ifp->if_bytes = new_size;
3911 return; 3898 return;
3912 } 3899 }
3913 if (!is_power_of_2(new_size)){ 3900 if (!is_power_of_2(new_size)){
3914 rnew_size = roundup_pow_of_two(new_size); 3901 rnew_size = roundup_pow_of_two(new_size);
3915 } 3902 }
3916 if (rnew_size != ifp->if_real_bytes) { 3903 if (rnew_size != ifp->if_real_bytes) {
3917 ifp->if_u1.if_extents = 3904 ifp->if_u1.if_extents =
3918 kmem_realloc(ifp->if_u1.if_extents, 3905 kmem_realloc(ifp->if_u1.if_extents,
3919 rnew_size, 3906 rnew_size,
3920 ifp->if_real_bytes, KM_NOFS); 3907 ifp->if_real_bytes, KM_NOFS);
3921 } 3908 }
3922 if (rnew_size > ifp->if_real_bytes) { 3909 if (rnew_size > ifp->if_real_bytes) {
3923 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 3910 memset(&ifp->if_u1.if_extents[ifp->if_bytes /
3924 (uint)sizeof(xfs_bmbt_rec_t)], 0, 3911 (uint)sizeof(xfs_bmbt_rec_t)], 0,
3925 rnew_size - ifp->if_real_bytes); 3912 rnew_size - ifp->if_real_bytes);
3926 } 3913 }
3927 } 3914 }
3928 /* 3915 /*
3929 * Switch from the inline extent buffer to a direct 3916 * Switch from the inline extent buffer to a direct
3930 * extent list. Be sure to include the inline extent 3917 * extent list. Be sure to include the inline extent
3931 * bytes in new_size. 3918 * bytes in new_size.
3932 */ 3919 */
3933 else { 3920 else {
3934 new_size += ifp->if_bytes; 3921 new_size += ifp->if_bytes;
3935 if (!is_power_of_2(new_size)) { 3922 if (!is_power_of_2(new_size)) {
3936 rnew_size = roundup_pow_of_two(new_size); 3923 rnew_size = roundup_pow_of_two(new_size);
3937 } 3924 }
3938 xfs_iext_inline_to_direct(ifp, rnew_size); 3925 xfs_iext_inline_to_direct(ifp, rnew_size);
3939 } 3926 }
3940 ifp->if_real_bytes = rnew_size; 3927 ifp->if_real_bytes = rnew_size;
3941 ifp->if_bytes = new_size; 3928 ifp->if_bytes = new_size;
3942 } 3929 }
3943 3930
3944 /* 3931 /*
3945 * Switch from linear (direct) extent records to inline buffer. 3932 * Switch from linear (direct) extent records to inline buffer.
3946 */ 3933 */
3947 void 3934 void
3948 xfs_iext_direct_to_inline( 3935 xfs_iext_direct_to_inline(
3949 xfs_ifork_t *ifp, /* inode fork pointer */ 3936 xfs_ifork_t *ifp, /* inode fork pointer */
3950 xfs_extnum_t nextents) /* number of extents in file */ 3937 xfs_extnum_t nextents) /* number of extents in file */
3951 { 3938 {
3952 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3939 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3953 ASSERT(nextents <= XFS_INLINE_EXTS); 3940 ASSERT(nextents <= XFS_INLINE_EXTS);
3954 /* 3941 /*
3955 * The inline buffer was zeroed when we switched 3942 * The inline buffer was zeroed when we switched
3956 * from inline to direct extent allocation mode, 3943 * from inline to direct extent allocation mode,
3957 * so we don't need to clear it here. 3944 * so we don't need to clear it here.
3958 */ 3945 */
3959 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, 3946 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
3960 nextents * sizeof(xfs_bmbt_rec_t)); 3947 nextents * sizeof(xfs_bmbt_rec_t));
3961 kmem_free(ifp->if_u1.if_extents); 3948 kmem_free(ifp->if_u1.if_extents);
3962 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3949 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3963 ifp->if_real_bytes = 0; 3950 ifp->if_real_bytes = 0;
3964 } 3951 }
3965 3952
3966 /* 3953 /*
3967 * Switch from inline buffer to linear (direct) extent records. 3954 * Switch from inline buffer to linear (direct) extent records.
3968 * new_size should already be rounded up to the next power of 2 3955 * new_size should already be rounded up to the next power of 2
3969 * by the caller (when appropriate), so use new_size as it is. 3956 * by the caller (when appropriate), so use new_size as it is.
3970 * However, since new_size may be rounded up, we can't update 3957 * However, since new_size may be rounded up, we can't update
3971 * if_bytes here. It is the caller's responsibility to update 3958 * if_bytes here. It is the caller's responsibility to update
3972 * if_bytes upon return. 3959 * if_bytes upon return.
3973 */ 3960 */
3974 void 3961 void
3975 xfs_iext_inline_to_direct( 3962 xfs_iext_inline_to_direct(
3976 xfs_ifork_t *ifp, /* inode fork pointer */ 3963 xfs_ifork_t *ifp, /* inode fork pointer */
3977 int new_size) /* number of extents in file */ 3964 int new_size) /* number of extents in file */
3978 { 3965 {
3979 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); 3966 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
3980 memset(ifp->if_u1.if_extents, 0, new_size); 3967 memset(ifp->if_u1.if_extents, 0, new_size);
3981 if (ifp->if_bytes) { 3968 if (ifp->if_bytes) {
3982 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 3969 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
3983 ifp->if_bytes); 3970 ifp->if_bytes);
3984 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3971 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3985 sizeof(xfs_bmbt_rec_t)); 3972 sizeof(xfs_bmbt_rec_t));
3986 } 3973 }
3987 ifp->if_real_bytes = new_size; 3974 ifp->if_real_bytes = new_size;
3988 } 3975 }
3989 3976
3990 /* 3977 /*
3991 * Resize an extent indirection array to new_size bytes. 3978 * Resize an extent indirection array to new_size bytes.
3992 */ 3979 */
3993 void 3980 void
3994 xfs_iext_realloc_indirect( 3981 xfs_iext_realloc_indirect(
3995 xfs_ifork_t *ifp, /* inode fork pointer */ 3982 xfs_ifork_t *ifp, /* inode fork pointer */
3996 int new_size) /* new indirection array size */ 3983 int new_size) /* new indirection array size */
3997 { 3984 {
3998 int nlists; /* number of irec's (ex lists) */ 3985 int nlists; /* number of irec's (ex lists) */
3999 int size; /* current indirection array size */ 3986 int size; /* current indirection array size */
4000 3987
4001 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3988 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4002 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3989 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4003 size = nlists * sizeof(xfs_ext_irec_t); 3990 size = nlists * sizeof(xfs_ext_irec_t);
4004 ASSERT(ifp->if_real_bytes); 3991 ASSERT(ifp->if_real_bytes);
4005 ASSERT((new_size >= 0) && (new_size != size)); 3992 ASSERT((new_size >= 0) && (new_size != size));
4006 if (new_size == 0) { 3993 if (new_size == 0) {
4007 xfs_iext_destroy(ifp); 3994 xfs_iext_destroy(ifp);
4008 } else { 3995 } else {
4009 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 3996 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
4010 kmem_realloc(ifp->if_u1.if_ext_irec, 3997 kmem_realloc(ifp->if_u1.if_ext_irec,
4011 new_size, size, KM_NOFS); 3998 new_size, size, KM_NOFS);
4012 } 3999 }
4013 } 4000 }
4014 4001
/*
 * Switch from indirection array to linear (direct) extent allocations.
 */
void
xfs_iext_indirect_to_direct(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
	xfs_extnum_t	nextents;	/* number of extents in file */
	int		size;		/* size of file extents */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	ASSERT(nextents <= XFS_LINEAR_EXTS);
	size = nextents * sizeof(xfs_bmbt_rec_t);

	/*
	 * Merge all records into the first irec page so that its
	 * er_extbuf can simply be reused as the direct extent list.
	 */
	xfs_iext_irec_compact_pages(ifp);
	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);

	ep = ifp->if_u1.if_ext_irec->er_extbuf;
	kmem_free(ifp->if_u1.if_ext_irec);
	ifp->if_flags &= ~XFS_IFEXTIREC;
	ifp->if_u1.if_extents = ep;
	ifp->if_bytes = size;
	/* Trim the page-sized buffer down if fewer extents are in use */
	if (nextents < XFS_LINEAR_EXTS) {
		xfs_iext_realloc_direct(ifp, size);
	}
}
4043 4030
4044 /* 4031 /*
4045 * Free incore file extents. 4032 * Free incore file extents.
4046 */ 4033 */
4047 void 4034 void
4048 xfs_iext_destroy( 4035 xfs_iext_destroy(
4049 xfs_ifork_t *ifp) /* inode fork pointer */ 4036 xfs_ifork_t *ifp) /* inode fork pointer */
4050 { 4037 {
4051 if (ifp->if_flags & XFS_IFEXTIREC) { 4038 if (ifp->if_flags & XFS_IFEXTIREC) {
4052 int erp_idx; 4039 int erp_idx;
4053 int nlists; 4040 int nlists;
4054 4041
4055 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4042 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4056 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { 4043 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
4057 xfs_iext_irec_remove(ifp, erp_idx); 4044 xfs_iext_irec_remove(ifp, erp_idx);
4058 } 4045 }
4059 ifp->if_flags &= ~XFS_IFEXTIREC; 4046 ifp->if_flags &= ~XFS_IFEXTIREC;
4060 } else if (ifp->if_real_bytes) { 4047 } else if (ifp->if_real_bytes) {
4061 kmem_free(ifp->if_u1.if_extents); 4048 kmem_free(ifp->if_u1.if_extents);
4062 } else if (ifp->if_bytes) { 4049 } else if (ifp->if_bytes) {
4063 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 4050 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
4064 sizeof(xfs_bmbt_rec_t)); 4051 sizeof(xfs_bmbt_rec_t));
4065 } 4052 }
4066 ifp->if_u1.if_extents = NULL; 4053 ifp->if_u1.if_extents = NULL;
4067 ifp->if_real_bytes = 0; 4054 ifp->if_real_bytes = 0;
4068 ifp->if_bytes = 0; 4055 ifp->if_bytes = 0;
4069 } 4056 }
4070 4057
/*
 * Return a pointer to the extent record for file system block bno.
 *
 * On return *idxp holds the (file-based) index of the returned record.
 * If bno lies inside an extent, that extent's record is returned.  If
 * bno falls in a hole, the record of the first extent past the hole is
 * returned instead, or NULL if bno is beyond the last extent.
 */
xfs_bmbt_rec_host_t *			/* pointer to found extent record */
xfs_iext_bno_to_ext(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_fileoff_t	bno,		/* block number to search for */
	xfs_extnum_t	*idxp)		/* index of target extent */
{
	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
	xfs_filblks_t	blockcount = 0;	/* number of blocks in extent */
	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
	int		high;		/* upper boundary in search */
	xfs_extnum_t	idx = 0;	/* index of target extent */
	int		low;		/* lower boundary in search */
	xfs_extnum_t	nextents;	/* number of file extents */
	xfs_fileoff_t	startoff = 0;	/* start offset of extent */

	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	if (nextents == 0) {
		*idxp = 0;
		return NULL;
	}
	low = 0;
	if (ifp->if_flags & XFS_IFEXTIREC) {
		/* Find target extent list */
		int	erp_idx = 0;
		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
		base = erp->er_extbuf;
		high = erp->er_extcount - 1;
	} else {
		base = ifp->if_u1.if_extents;
		high = nextents - 1;
	}
	/* Binary search extent records */
	while (low <= high) {
		idx = (low + high) >> 1;
		ep = base + idx;
		startoff = xfs_bmbt_get_startoff(ep);
		blockcount = xfs_bmbt_get_blockcount(ep);
		if (bno < startoff) {
			high = idx - 1;
		} else if (bno >= startoff + blockcount) {
			low = idx + 1;
		} else {
			/* Convert back to file-based extent index */
			if (ifp->if_flags & XFS_IFEXTIREC) {
				idx += erp->er_extoff;
			}
			*idxp = idx;
			return ep;
		}
	}
	/* Convert back to file-based extent index */
	if (ifp->if_flags & XFS_IFEXTIREC) {
		idx += erp->er_extoff;
	}
	/*
	 * Search failed, so bno is in a hole.  If the last probed extent
	 * ends before bno, step forward to the next extent (NULL if that
	 * runs off the end); otherwise ep already points past the hole.
	 */
	if (bno >= startoff + blockcount) {
		if (++idx == nextents) {
			ep = NULL;
		} else {
			ep = xfs_iext_get_ext(ifp, idx);
		}
	}
	*idxp = idx;
	return ep;
}
4139 4126
/*
 * Return a pointer to the indirection array entry containing the
 * extent record for filesystem block bno. Store the index of the
 * target irec in *erp_idxp.
 */
xfs_ext_irec_t *			/* pointer to found extent record */
xfs_iext_bno_to_irec(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_fileoff_t	bno,		/* block number to search for */
	int		*erp_idxp)	/* irec index of target ext list */
{
	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
	xfs_ext_irec_t	*erp_next;	/* next indirection array entry */
	int		erp_idx;	/* indirection array index */
	int		nlists;		/* number of extent irec's (lists) */
	int		high;		/* binary search upper limit */
	int		low;		/* binary search lower limit */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp_idx = 0;
	low = 0;
	high = nlists - 1;
	/*
	 * Binary search the irec pages: a page is the target when bno is
	 * at or past the page's first startoff and (when a following page
	 * exists) before the next page's first startoff.
	 */
	while (low <= high) {
		erp_idx = (low + high) >> 1;
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
			high = erp_idx - 1;
		} else if (erp_next && bno >=
			   xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
			low = erp_idx + 1;
		} else {
			break;
		}
	}
	*erp_idxp = erp_idx;
	return erp;
}
4179 4166
4180 /* 4167 /*
4181 * Return a pointer to the indirection array entry containing the 4168 * Return a pointer to the indirection array entry containing the
4182 * extent record at file extent index *idxp. Store the index of the 4169 * extent record at file extent index *idxp. Store the index of the
4183 * target irec in *erp_idxp and store the page index of the target 4170 * target irec in *erp_idxp and store the page index of the target
4184 * extent record in *idxp. 4171 * extent record in *idxp.
4185 */ 4172 */
4186 xfs_ext_irec_t * 4173 xfs_ext_irec_t *
4187 xfs_iext_idx_to_irec( 4174 xfs_iext_idx_to_irec(
4188 xfs_ifork_t *ifp, /* inode fork pointer */ 4175 xfs_ifork_t *ifp, /* inode fork pointer */
4189 xfs_extnum_t *idxp, /* extent index (file -> page) */ 4176 xfs_extnum_t *idxp, /* extent index (file -> page) */
4190 int *erp_idxp, /* pointer to target irec */ 4177 int *erp_idxp, /* pointer to target irec */
4191 int realloc) /* new bytes were just added */ 4178 int realloc) /* new bytes were just added */
4192 { 4179 {
4193 xfs_ext_irec_t *prev; /* pointer to previous irec */ 4180 xfs_ext_irec_t *prev; /* pointer to previous irec */
4194 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ 4181 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
4195 int erp_idx; /* indirection array index */ 4182 int erp_idx; /* indirection array index */
4196 int nlists; /* number of irec's (ex lists) */ 4183 int nlists; /* number of irec's (ex lists) */
4197 int high; /* binary search upper limit */ 4184 int high; /* binary search upper limit */
4198 int low; /* binary search lower limit */ 4185 int low; /* binary search lower limit */
4199 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 4186 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
4200 4187
4201 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4188 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4202 ASSERT(page_idx >= 0 && page_idx <= 4189 ASSERT(page_idx >= 0 && page_idx <=
4203 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 4190 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
4204 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4191 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4205 erp_idx = 0; 4192 erp_idx = 0;
4206 low = 0; 4193 low = 0;
4207 high = nlists - 1; 4194 high = nlists - 1;
4208 4195
4209 /* Binary search extent irec's */ 4196 /* Binary search extent irec's */
4210 while (low <= high) { 4197 while (low <= high) {
4211 erp_idx = (low + high) >> 1; 4198 erp_idx = (low + high) >> 1;
4212 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4199 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4213 prev = erp_idx > 0 ? erp - 1 : NULL; 4200 prev = erp_idx > 0 ? erp - 1 : NULL;
4214 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && 4201 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
4215 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { 4202 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
4216 high = erp_idx - 1; 4203 high = erp_idx - 1;
4217 } else if (page_idx > erp->er_extoff + erp->er_extcount || 4204 } else if (page_idx > erp->er_extoff + erp->er_extcount ||
4218 (page_idx == erp->er_extoff + erp->er_extcount && 4205 (page_idx == erp->er_extoff + erp->er_extcount &&
4219 !realloc)) { 4206 !realloc)) {
4220 low = erp_idx + 1; 4207 low = erp_idx + 1;
4221 } else if (page_idx == erp->er_extoff + erp->er_extcount && 4208 } else if (page_idx == erp->er_extoff + erp->er_extcount &&
4222 erp->er_extcount == XFS_LINEAR_EXTS) { 4209 erp->er_extcount == XFS_LINEAR_EXTS) {
4223 ASSERT(realloc); 4210 ASSERT(realloc);
4224 page_idx = 0; 4211 page_idx = 0;
4225 erp_idx++; 4212 erp_idx++;
4226 erp = erp_idx < nlists ? erp + 1 : NULL; 4213 erp = erp_idx < nlists ? erp + 1 : NULL;
4227 break; 4214 break;
4228 } else { 4215 } else {
4229 page_idx -= erp->er_extoff; 4216 page_idx -= erp->er_extoff;
4230 break; 4217 break;
4231 } 4218 }
4232 } 4219 }
4233 *idxp = page_idx; 4220 *idxp = page_idx;
4234 *erp_idxp = erp_idx; 4221 *erp_idxp = erp_idx;
4235 return(erp); 4222 return(erp);
4236 } 4223 }
4237 4224
4238 /* 4225 /*
4239 * Allocate and initialize an indirection array once the space needed 4226 * Allocate and initialize an indirection array once the space needed
4240 * for incore extents increases above XFS_IEXT_BUFSZ. 4227 * for incore extents increases above XFS_IEXT_BUFSZ.
4241 */ 4228 */
4242 void 4229 void
4243 xfs_iext_irec_init( 4230 xfs_iext_irec_init(
4244 xfs_ifork_t *ifp) /* inode fork pointer */ 4231 xfs_ifork_t *ifp) /* inode fork pointer */
4245 { 4232 {
4246 xfs_ext_irec_t *erp; /* indirection array pointer */ 4233 xfs_ext_irec_t *erp; /* indirection array pointer */
4247 xfs_extnum_t nextents; /* number of extents in file */ 4234 xfs_extnum_t nextents; /* number of extents in file */
4248 4235
4249 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 4236 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
4250 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4237 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4251 ASSERT(nextents <= XFS_LINEAR_EXTS); 4238 ASSERT(nextents <= XFS_LINEAR_EXTS);
4252 4239
4253 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); 4240 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
4254 4241
4255 if (nextents == 0) { 4242 if (nextents == 0) {
4256 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); 4243 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
4257 } else if (!ifp->if_real_bytes) { 4244 } else if (!ifp->if_real_bytes) {
4258 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); 4245 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
4259 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { 4246 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
4260 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); 4247 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
4261 } 4248 }
4262 erp->er_extbuf = ifp->if_u1.if_extents; 4249 erp->er_extbuf = ifp->if_u1.if_extents;
4263 erp->er_extcount = nextents; 4250 erp->er_extcount = nextents;
4264 erp->er_extoff = 0; 4251 erp->er_extoff = 0;
4265 4252
4266 ifp->if_flags |= XFS_IFEXTIREC; 4253 ifp->if_flags |= XFS_IFEXTIREC;
4267 ifp->if_real_bytes = XFS_IEXT_BUFSZ; 4254 ifp->if_real_bytes = XFS_IEXT_BUFSZ;
4268 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); 4255 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
4269 ifp->if_u1.if_ext_irec = erp; 4256 ifp->if_u1.if_ext_irec = erp;
4270 4257
4271 return; 4258 return;
4272 } 4259 }
4273 4260
4274 /* 4261 /*
4275 * Allocate and initialize a new entry in the indirection array. 4262 * Allocate and initialize a new entry in the indirection array.
4276 */ 4263 */
4277 xfs_ext_irec_t * 4264 xfs_ext_irec_t *
4278 xfs_iext_irec_new( 4265 xfs_iext_irec_new(
4279 xfs_ifork_t *ifp, /* inode fork pointer */ 4266 xfs_ifork_t *ifp, /* inode fork pointer */
4280 int erp_idx) /* index for new irec */ 4267 int erp_idx) /* index for new irec */
4281 { 4268 {
4282 xfs_ext_irec_t *erp; /* indirection array pointer */ 4269 xfs_ext_irec_t *erp; /* indirection array pointer */
4283 int i; /* loop counter */ 4270 int i; /* loop counter */
4284 int nlists; /* number of irec's (ex lists) */ 4271 int nlists; /* number of irec's (ex lists) */
4285 4272
4286 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4273 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4287 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4274 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4288 4275
4289 /* Resize indirection array */ 4276 /* Resize indirection array */
4290 xfs_iext_realloc_indirect(ifp, ++nlists * 4277 xfs_iext_realloc_indirect(ifp, ++nlists *
4291 sizeof(xfs_ext_irec_t)); 4278 sizeof(xfs_ext_irec_t));
4292 /* 4279 /*
4293 * Move records down in the array so the 4280 * Move records down in the array so the
4294 * new page can use erp_idx. 4281 * new page can use erp_idx.
4295 */ 4282 */
4296 erp = ifp->if_u1.if_ext_irec; 4283 erp = ifp->if_u1.if_ext_irec;
4297 for (i = nlists - 1; i > erp_idx; i--) { 4284 for (i = nlists - 1; i > erp_idx; i--) {
4298 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); 4285 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
4299 } 4286 }
4300 ASSERT(i == erp_idx); 4287 ASSERT(i == erp_idx);
4301 4288
4302 /* Initialize new extent record */ 4289 /* Initialize new extent record */
4303 erp = ifp->if_u1.if_ext_irec; 4290 erp = ifp->if_u1.if_ext_irec;
4304 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); 4291 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
4305 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 4292 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
4306 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); 4293 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
4307 erp[erp_idx].er_extcount = 0; 4294 erp[erp_idx].er_extcount = 0;
4308 erp[erp_idx].er_extoff = erp_idx > 0 ? 4295 erp[erp_idx].er_extoff = erp_idx > 0 ?
4309 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; 4296 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
4310 return (&erp[erp_idx]); 4297 return (&erp[erp_idx]);
4311 } 4298 }
4312 4299
4313 /* 4300 /*
4314 * Remove a record from the indirection array. 4301 * Remove a record from the indirection array.
4315 */ 4302 */
4316 void 4303 void
4317 xfs_iext_irec_remove( 4304 xfs_iext_irec_remove(
4318 xfs_ifork_t *ifp, /* inode fork pointer */ 4305 xfs_ifork_t *ifp, /* inode fork pointer */
4319 int erp_idx) /* irec index to remove */ 4306 int erp_idx) /* irec index to remove */
4320 { 4307 {
4321 xfs_ext_irec_t *erp; /* indirection array pointer */ 4308 xfs_ext_irec_t *erp; /* indirection array pointer */
4322 int i; /* loop counter */ 4309 int i; /* loop counter */
4323 int nlists; /* number of irec's (ex lists) */ 4310 int nlists; /* number of irec's (ex lists) */
4324 4311
4325 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4312 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4326 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4313 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4327 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4314 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4328 if (erp->er_extbuf) { 4315 if (erp->er_extbuf) {
4329 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, 4316 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
4330 -erp->er_extcount); 4317 -erp->er_extcount);
4331 kmem_free(erp->er_extbuf); 4318 kmem_free(erp->er_extbuf);
4332 } 4319 }
4333 /* Compact extent records */ 4320 /* Compact extent records */
4334 erp = ifp->if_u1.if_ext_irec; 4321 erp = ifp->if_u1.if_ext_irec;
4335 for (i = erp_idx; i < nlists - 1; i++) { 4322 for (i = erp_idx; i < nlists - 1; i++) {
4336 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); 4323 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
4337 } 4324 }
4338 /* 4325 /*
4339 * Manually free the last extent record from the indirection 4326 * Manually free the last extent record from the indirection
4340 * array. A call to xfs_iext_realloc_indirect() with a size 4327 * array. A call to xfs_iext_realloc_indirect() with a size
4341 * of zero would result in a call to xfs_iext_destroy() which 4328 * of zero would result in a call to xfs_iext_destroy() which
4342 * would in turn call this function again, creating a nasty 4329 * would in turn call this function again, creating a nasty
4343 * infinite loop. 4330 * infinite loop.
4344 */ 4331 */
4345 if (--nlists) { 4332 if (--nlists) {
4346 xfs_iext_realloc_indirect(ifp, 4333 xfs_iext_realloc_indirect(ifp,
4347 nlists * sizeof(xfs_ext_irec_t)); 4334 nlists * sizeof(xfs_ext_irec_t));
4348 } else { 4335 } else {
4349 kmem_free(ifp->if_u1.if_ext_irec); 4336 kmem_free(ifp->if_u1.if_ext_irec);
4350 } 4337 }
4351 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 4338 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
4352 } 4339 }
4353 4340
4354 /* 4341 /*
4355 * This is called to clean up large amounts of unused memory allocated 4342 * This is called to clean up large amounts of unused memory allocated
4356 * by the indirection array. Before compacting anything though, verify 4343 * by the indirection array. Before compacting anything though, verify
4357 * that the indirection array is still needed and switch back to the 4344 * that the indirection array is still needed and switch back to the
4358 * linear extent list (or even the inline buffer) if possible. The 4345 * linear extent list (or even the inline buffer) if possible. The
4359 * compaction policy is as follows: 4346 * compaction policy is as follows:
4360 * 4347 *
4361 * Full Compaction: Extents fit into a single page (or inline buffer) 4348 * Full Compaction: Extents fit into a single page (or inline buffer)
4362 * Partial Compaction: Extents occupy less than 50% of allocated space 4349 * Partial Compaction: Extents occupy less than 50% of allocated space
4363 * No Compaction: Extents occupy at least 50% of allocated space 4350 * No Compaction: Extents occupy at least 50% of allocated space
4364 */ 4351 */
4365 void 4352 void
4366 xfs_iext_irec_compact( 4353 xfs_iext_irec_compact(
4367 xfs_ifork_t *ifp) /* inode fork pointer */ 4354 xfs_ifork_t *ifp) /* inode fork pointer */
4368 { 4355 {
4369 xfs_extnum_t nextents; /* number of extents in file */ 4356 xfs_extnum_t nextents; /* number of extents in file */
4370 int nlists; /* number of irec's (ex lists) */ 4357 int nlists; /* number of irec's (ex lists) */
4371 4358
4372 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4359 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4373 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4360 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4374 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4361 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4375 4362
4376 if (nextents == 0) { 4363 if (nextents == 0) {
4377 xfs_iext_destroy(ifp); 4364 xfs_iext_destroy(ifp);
4378 } else if (nextents <= XFS_INLINE_EXTS) { 4365 } else if (nextents <= XFS_INLINE_EXTS) {
4379 xfs_iext_indirect_to_direct(ifp); 4366 xfs_iext_indirect_to_direct(ifp);
4380 xfs_iext_direct_to_inline(ifp, nextents); 4367 xfs_iext_direct_to_inline(ifp, nextents);
4381 } else if (nextents <= XFS_LINEAR_EXTS) { 4368 } else if (nextents <= XFS_LINEAR_EXTS) {
4382 xfs_iext_indirect_to_direct(ifp); 4369 xfs_iext_indirect_to_direct(ifp);
4383 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { 4370 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
4384 xfs_iext_irec_compact_pages(ifp); 4371 xfs_iext_irec_compact_pages(ifp);
4385 } 4372 }
4386 } 4373 }
4387 4374
4388 /* 4375 /*
4389 * Combine extents from neighboring extent pages. 4376 * Combine extents from neighboring extent pages.
4390 */ 4377 */
4391 void 4378 void
4392 xfs_iext_irec_compact_pages( 4379 xfs_iext_irec_compact_pages(
4393 xfs_ifork_t *ifp) /* inode fork pointer */ 4380 xfs_ifork_t *ifp) /* inode fork pointer */
4394 { 4381 {
4395 xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ 4382 xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
4396 int erp_idx = 0; /* indirection array index */ 4383 int erp_idx = 0; /* indirection array index */
4397 int nlists; /* number of irec's (ex lists) */ 4384 int nlists; /* number of irec's (ex lists) */
4398 4385
4399 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4386 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4400 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4387 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4401 while (erp_idx < nlists - 1) { 4388 while (erp_idx < nlists - 1) {
4402 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4389 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4403 erp_next = erp + 1; 4390 erp_next = erp + 1;
4404 if (erp_next->er_extcount <= 4391 if (erp_next->er_extcount <=
4405 (XFS_LINEAR_EXTS - erp->er_extcount)) { 4392 (XFS_LINEAR_EXTS - erp->er_extcount)) {
4406 memcpy(&erp->er_extbuf[erp->er_extcount], 4393 memcpy(&erp->er_extbuf[erp->er_extcount],
4407 erp_next->er_extbuf, erp_next->er_extcount * 4394 erp_next->er_extbuf, erp_next->er_extcount *
4408 sizeof(xfs_bmbt_rec_t)); 4395 sizeof(xfs_bmbt_rec_t));
4409 erp->er_extcount += erp_next->er_extcount; 4396 erp->er_extcount += erp_next->er_extcount;
4410 /* 4397 /*
4411 * Free page before removing extent record 4398 * Free page before removing extent record
4412 * so er_extoffs don't get modified in 4399 * so er_extoffs don't get modified in
4413 * xfs_iext_irec_remove. 4400 * xfs_iext_irec_remove.
4414 */ 4401 */
4415 kmem_free(erp_next->er_extbuf); 4402 kmem_free(erp_next->er_extbuf);
4416 erp_next->er_extbuf = NULL; 4403 erp_next->er_extbuf = NULL;
4417 xfs_iext_irec_remove(ifp, erp_idx + 1); 4404 xfs_iext_irec_remove(ifp, erp_idx + 1);
4418 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4405 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4419 } else { 4406 } else {
4420 erp_idx++; 4407 erp_idx++;
4421 } 4408 }
4422 } 4409 }
4423 } 4410 }
4424 4411
4425 /* 4412 /*
4426 * This is called to update the er_extoff field in the indirection 4413 * This is called to update the er_extoff field in the indirection
4427 * array when extents have been added or removed from one of the 4414 * array when extents have been added or removed from one of the
4428 * extent lists. erp_idx contains the irec index to begin updating 4415 * extent lists. erp_idx contains the irec index to begin updating
4429 * at and ext_diff contains the number of extents that were added 4416 * at and ext_diff contains the number of extents that were added
4430 * or removed. 4417 * or removed.
4431 */ 4418 */
4432 void 4419 void
4433 xfs_iext_irec_update_extoffs( 4420 xfs_iext_irec_update_extoffs(
4434 xfs_ifork_t *ifp, /* inode fork pointer */ 4421 xfs_ifork_t *ifp, /* inode fork pointer */
4435 int erp_idx, /* irec index to update */ 4422 int erp_idx, /* irec index to update */
4436 int ext_diff) /* number of new extents */ 4423 int ext_diff) /* number of new extents */
4437 { 4424 {
4438 int i; /* loop counter */ 4425 int i; /* loop counter */
4439 int nlists; /* number of irec's (ex lists */ 4426 int nlists; /* number of irec's (ex lists */
4440 4427
4441 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4428 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4442 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4429 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4443 for (i = erp_idx; i < nlists; i++) { 4430 for (i = erp_idx; i < nlists; i++) {
4444 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 4431 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
4445 } 4432 }
4446 } 4433 }
1 /* 1 /*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #ifndef __XFS_INODE_H__ 18 #ifndef __XFS_INODE_H__
19 #define __XFS_INODE_H__ 19 #define __XFS_INODE_H__
20 20
21 struct xfs_dinode; 21 struct xfs_dinode;
22 struct xfs_inode; 22 struct xfs_inode;
23 23
24 /* 24 /*
25 * Fork identifiers. 25 * Fork identifiers.
26 */ 26 */
27 #define XFS_DATA_FORK 0 27 #define XFS_DATA_FORK 0
28 #define XFS_ATTR_FORK 1 28 #define XFS_ATTR_FORK 1
29 29
30 /* 30 /*
31 * The following xfs_ext_irec_t struct introduces a second (top) level 31 * The following xfs_ext_irec_t struct introduces a second (top) level
32 * to the in-core extent allocation scheme. These structs are allocated 32 * to the in-core extent allocation scheme. These structs are allocated
33 * in a contiguous block, creating an indirection array where each entry 33 * in a contiguous block, creating an indirection array where each entry
34 * (irec) contains a pointer to a buffer of in-core extent records which 34 * (irec) contains a pointer to a buffer of in-core extent records which
35 * it manages. Each extent buffer is 4k in size, since 4k is the system 35 * it manages. Each extent buffer is 4k in size, since 4k is the system
36 * page size on Linux i386 and systems with larger page sizes don't seem 36 * page size on Linux i386 and systems with larger page sizes don't seem
37 * to gain much, if anything, by using their native page size as the 37 * to gain much, if anything, by using their native page size as the
38 * extent buffer size. Also, using 4k extent buffers everywhere provides 38 * extent buffer size. Also, using 4k extent buffers everywhere provides
39 * a consistent interface for CXFS across different platforms. 39 * a consistent interface for CXFS across different platforms.
40 * 40 *
41 * There is currently no limit on the number of irec's (extent lists) 41 * There is currently no limit on the number of irec's (extent lists)
42 * allowed, so heavily fragmented files may require an indirection array 42 * allowed, so heavily fragmented files may require an indirection array
43 * which spans multiple system pages of memory. The number of extents 43 * which spans multiple system pages of memory. The number of extents
44 * which would require this amount of contiguous memory is very large 44 * which would require this amount of contiguous memory is very large
45 * and should not cause problems in the foreseeable future. However, 45 * and should not cause problems in the foreseeable future. However,
46 * if the memory needed for the contiguous array ever becomes a problem, 46 * if the memory needed for the contiguous array ever becomes a problem,
47 * it is possible that a third level of indirection may be required. 47 * it is possible that a third level of indirection may be required.
48 */ 48 */
/* One entry of the second-level (indirection) in-core extent array. */
typedef struct xfs_ext_irec {
	xfs_bmbt_rec_host_t *er_extbuf;	/* block of extent records */
	xfs_extnum_t	er_extoff;	/* extent offset in file */
	xfs_extnum_t	er_extcount;	/* number of extents in page/block */
} xfs_ext_irec_t;
54 54
/*
 * File incore extent information, present for each of data & attr forks.
 */
#define	XFS_IEXT_BUFSZ		4096
#define	XFS_LINEAR_EXTS		(XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
#define	XFS_INLINE_EXTS		2
#define	XFS_INLINE_DATA		32
typedef struct xfs_ifork {
	int			if_bytes;	/* bytes in if_u1 */
	int			if_real_bytes;	/* bytes allocated in if_u1 */
	struct xfs_btree_block	*if_broot;	/* file's incore btree root */
	short			if_broot_bytes;	/* bytes allocated for root */
	unsigned char		if_flags;	/* per-fork flags */
	unsigned char		if_ext_max;	/* max # of extent records */
	xfs_extnum_t		if_lastex;	/* last if_extents used */
	/* if_u1 interpretation is selected by if_flags (XFS_IF*). */
	union {
		xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
		xfs_ext_irec_t	*if_ext_irec;	/* irec map file exts */
		char		*if_data;	/* inline file data */
	} if_u1;
	/* Inline storage used before if_u1 heap buffers are needed. */
	union {
		xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
					/* very small file extents */
		char		if_inline_data[XFS_INLINE_DATA];
					/* very small file data */
		xfs_dev_t	if_rdev;	/* dev number if special */
		uuid_t		if_uuid;	/* mount point value */
	} if_u2;
} xfs_ifork_t;
84 84
/*
 * Inode location information.  Stored in the inode and passed to
 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
 *
 * im_len is deliberately a ushort so embedding this struct does not
 * bloat struct xfs_inode.
 */
struct xfs_imap {
	xfs_daddr_t	im_blkno;	/* starting BB of inode chunk */
	ushort		im_len;		/* length in BBs of inode chunk */
	ushort		im_boffset;	/* inode offset in block in bytes */
};
94
95 /*
86 * This is the xfs in-core inode structure. 96 * This is the xfs in-core inode structure.
87 * Most of the on-disk inode is embedded in the i_d field. 97 * Most of the on-disk inode is embedded in the i_d field.
88 * 98 *
89 * The extent pointers/inline file space, however, are managed 99 * The extent pointers/inline file space, however, are managed
90 * separately. The memory for this information is pointed to by 100 * separately. The memory for this information is pointed to by
91 * the if_u1 unions depending on the type of the data. 101 * the if_u1 unions depending on the type of the data.
92 * This is used to linearize the array of extents for fast in-core 102 * This is used to linearize the array of extents for fast in-core
93 * access. This is used until the file's number of extents 103 * access. This is used until the file's number of extents
94 * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers 104 * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
95 * are accessed through the buffer cache. 105 * are accessed through the buffer cache.
96 * 106 *
97 * Other state kept in the in-core inode is used for identification, 107 * Other state kept in the in-core inode is used for identification,
98 * locking, transactional updating, etc of the inode. 108 * locking, transactional updating, etc of the inode.
99 * 109 *
100 * Generally, we do not want to hold the i_rlock while holding the 110 * Generally, we do not want to hold the i_rlock while holding the
101 * i_ilock. Hierarchy is i_iolock followed by i_rlock. 111 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
102 * 112 *
103 * xfs_iptr_t contains all the inode fields upto and including the 113 * xfs_iptr_t contains all the inode fields upto and including the
104 * i_mnext and i_mprev fields, it is used as a marker in the inode 114 * i_mnext and i_mprev fields, it is used as a marker in the inode
105 * chain off the mount structure by xfs_sync calls. 115 * chain off the mount structure by xfs_sync calls.
106 */ 116 */
107 117
/* In-core timestamp: seconds plus nanoseconds. */
typedef struct xfs_ictimestamp {
	__int32_t	t_sec;		/* timestamp seconds */
	__int32_t	t_nsec;		/* timestamp nanoseconds */
} xfs_ictimestamp_t;
112 122
/*
 * NOTE: This structure must be kept identical to struct xfs_dinode
 * in xfs_dinode.h except for the endianess annotations.  Do not
 * reorder or resize fields here without changing the on-disk
 * counterpart to match.
 */
typedef struct xfs_icdinode {
	__uint16_t	di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
	__uint16_t	di_mode;	/* mode and type of file */
	__int8_t	di_version;	/* inode version */
	__int8_t	di_format;	/* format of di_c data */
	__uint16_t	di_onlink;	/* old number of links to file */
	__uint32_t	di_uid;		/* owner's user id */
	__uint32_t	di_gid;		/* owner's group id */
	__uint32_t	di_nlink;	/* number of links to file */
	__uint16_t	di_projid;	/* owner's project id */
	__uint8_t	di_pad[8];	/* unused, zeroed space */
	__uint16_t	di_flushiter;	/* incremented on flush */
	xfs_ictimestamp_t di_atime;	/* time last accessed */
	xfs_ictimestamp_t di_mtime;	/* time last modified */
	xfs_ictimestamp_t di_ctime;	/* time created/inode modified */
	xfs_fsize_t	di_size;	/* number of bytes in file */
	xfs_drfsbno_t	di_nblocks;	/* # of direct & btree blocks used */
	xfs_extlen_t	di_extsize;	/* basic/minimum extent size for file */
	xfs_extnum_t	di_nextents;	/* number of extents in data fork */
	xfs_aextnum_t	di_anextents;	/* number of extents in attribute fork*/
	__uint8_t	di_forkoff;	/* attr fork offs, <<3 for 64b align */
	__int8_t	di_aformat;	/* format of attr fork's data */
	__uint32_t	di_dmevmask;	/* DMIG event mask */
	__uint16_t	di_dmstate;	/* DMIG state info */
	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
	__uint32_t	di_gen;		/* generation number */
} xfs_icdinode_t;
144 154
/*
 * Flags for xfs_ichgtime().
 */
#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */

/*
 * Per-fork incore inode flags (stored in xfs_ifork.if_flags; they
 * select which member of the if_u1 union is currently valid).
 */
#define	XFS_IFINLINE	0x01	/* Inline data is read in */
#define	XFS_IFEXTENTS	0x02	/* All extent pointers are read in */
#define	XFS_IFBROOT	0x04	/* i_broot points to the bmap b-tree root */
#define	XFS_IFEXTIREC	0x08	/* Indirection array of extent blocks */

/*
 * Flags for xfs_inotobp and xfs_imap().
 */
#define XFS_IMAP_BULKSTAT	0x1
163 173
/*
 * Fork handling.
 *
 * Each macro takes the in-core inode (ip) and, where relevant, the
 * fork selector (w): XFS_DATA_FORK or XFS_ATTR_FORK.
 */

/* An inode has an attribute fork iff di_forkoff is non-zero. */
#define XFS_IFORK_Q(ip)			((ip)->i_d.di_forkoff != 0)
/* Byte offset of the attr fork within the literal area (<<3 aligns). */
#define XFS_IFORK_BOFF(ip)		((int)((ip)->i_d.di_forkoff << 3))

#define XFS_IFORK_PTR(ip,w)		\
	((w) == XFS_DATA_FORK ? \
		&(ip)->i_df : \
		(ip)->i_afp)
#define XFS_IFORK_DSIZE(ip) \
	(XFS_IFORK_Q(ip) ? \
		XFS_IFORK_BOFF(ip) : \
		XFS_LITINO((ip)->i_mount))
#define XFS_IFORK_ASIZE(ip) \
	(XFS_IFORK_Q(ip) ? \
		XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
		0)
#define XFS_IFORK_SIZE(ip,w) \
	((w) == XFS_DATA_FORK ? \
		XFS_IFORK_DSIZE(ip) : \
		XFS_IFORK_ASIZE(ip))
#define XFS_IFORK_FORMAT(ip,w) \
	((w) == XFS_DATA_FORK ? \
		(ip)->i_d.di_format : \
		(ip)->i_d.di_aformat)
#define XFS_IFORK_FMT_SET(ip,w,n) \
	((w) == XFS_DATA_FORK ? \
		((ip)->i_d.di_format = (n)) : \
		((ip)->i_d.di_aformat = (n)))
#define XFS_IFORK_NEXTENTS(ip,w) \
	((w) == XFS_DATA_FORK ? \
		(ip)->i_d.di_nextents : \
		(ip)->i_d.di_anextents)
#define XFS_IFORK_NEXT_SET(ip,w,n) \
	((w) == XFS_DATA_FORK ? \
		((ip)->i_d.di_nextents = (n)) : \
		((ip)->i_d.di_anextents = (n)))
#ifdef __KERNEL__

/* Forward declarations; pointer-only uses, so no headers required. */
struct bhv_desc;
struct cred;
struct ktrace;
struct xfs_buf;
struct xfs_bmap_free;
struct xfs_bmbt_irec;
struct xfs_inode_log_item;
struct xfs_mount;
struct xfs_trans;
struct xfs_dquot;

#if defined(XFS_ILOCK_TRACE)
#define XFS_ILOCK_KTRACE_SIZE	32
extern ktrace_t *xfs_ilock_trace_buf;
extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
#else
/* Tracing disabled: expand to nothing so call sites compile away. */
#define	xfs_ilock_trace(i,n,f,ra)
#endif

/* DMAPI (DMIG) attribute bundle passed to DM event handlers. */
typedef struct dm_attrs_s {
	__uint32_t	da_dmevmask;	/* DMIG event mask */
	__uint16_t	da_dmstate;	/* DMIG state info */
	__uint16_t	da_pad;		/* DMIG extra padding */
} dm_attrs_t;
232 242
typedef struct xfs_inode {
	/* Inode linking and identification information. */
	struct xfs_mount	*i_mount;	/* fs mount struct ptr */
	struct xfs_dquot	*i_udquot;	/* user dquot */
	struct xfs_dquot	*i_gdquot;	/* group dquot */

	/* Inode location stuff */
	xfs_ino_t		i_ino;		/* inode number (agno/agino)*/
	struct xfs_imap		i_imap;		/* location for xfs_imap() */

	/* Extent information. */
	xfs_ifork_t		*i_afp;		/* attribute fork pointer */
	xfs_ifork_t		i_df;		/* data fork */

	/* Transaction and locking information. */
	struct xfs_trans	*i_transp;	/* ptr to owning transaction*/
	struct xfs_inode_log_item *i_itemp;	/* logging information */
	mrlock_t		i_lock;		/* inode lock */
	mrlock_t		i_iolock;	/* inode IO lock */
	struct completion	i_flush;	/* inode flush completion q */
	atomic_t		i_pincount;	/* inode pin count */
	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
	spinlock_t		i_flags_lock;	/* inode i_flags lock */
	/* Miscellaneous state. */
	unsigned short		i_flags;	/* see defined flags below */
	unsigned char		i_update_core;	/* timestamps/size is dirty */
	unsigned char		i_update_size;	/* di_size field is dirty */
	unsigned int		i_delayed_blks;	/* count of delay alloc blks */

	xfs_icdinode_t		i_d;		/* most of ondisk inode */

	xfs_fsize_t		i_size;		/* in-memory size */
	xfs_fsize_t		i_new_size;	/* size when write completes */
	atomic_t		i_iocount;	/* outstanding I/O count */

	/* VFS inode */
	struct inode		i_vnode;	/* embedded VFS inode */

	/* Trace buffers per inode; each compiled in only when the
	 * corresponding XFS_*_TRACE option is enabled. */
#ifdef XFS_INODE_TRACE
	struct ktrace		*i_trace;	/* general inode trace */
#endif
#ifdef XFS_BMAP_TRACE
	struct ktrace		*i_xtrace;	/* inode extent list trace */
#endif
#ifdef XFS_BTREE_TRACE
	struct ktrace		*i_btrace;	/* inode bmap btree trace */
#endif
#ifdef XFS_RW_TRACE
	struct ktrace		*i_rwtrace;	/* inode read/write trace */
#endif
#ifdef XFS_ILOCK_TRACE
	struct ktrace		*i_lock_trace;	/* inode lock/unlock trace */
#endif
#ifdef XFS_DIR2_TRACE
	struct ktrace		*i_dir_trace;	/* inode directory trace */
#endif
} xfs_inode_t;
293 301
294 #define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ 302 #define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
295 (ip)->i_size : (ip)->i_d.di_size; 303 (ip)->i_size : (ip)->i_d.di_size;
296 304
/* Convert from vfs inode to xfs inode (the VFS inode is embedded). */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
	return container_of(inode, struct xfs_inode, i_vnode);
}
302 310
/* convert from xfs inode to vfs inode */
static inline struct inode *VFS_I(struct xfs_inode *ip)
{
	return &ip->i_vnode;
}
308 316
/*
 * Get rid of a partially initialized inode.
 *
 * We have to go through destroy_inode to make sure allocations
 * from init_inode_always like the security data are undone.
 *
 * We mark the inode bad so that it takes the short cut in
 * the reclaim path instead of going through the flush path
 * which doesn't make sense for an inode that has never seen the
 * light of day.
 */
static inline void xfs_destroy_inode(struct xfs_inode *ip)
{
	make_bad_inode(VFS_I(ip));
	/*
	 * Plain call, not `return destroy_inode(...)`: returning a void
	 * expression from a void function is a gcc extension, not ISO C.
	 */
	destroy_inode(VFS_I(ip));
}
325 333
/*
 * i_flags helper functions
 */
/* Lockless variant: caller must already hold i_flags_lock. */
static inline void
__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
{
	ip->i_flags |= flags;
}
334 342
335 static inline void 343 static inline void
336 xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) 344 xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
337 { 345 {
338 spin_lock(&ip->i_flags_lock); 346 spin_lock(&ip->i_flags_lock);
339 __xfs_iflags_set(ip, flags); 347 __xfs_iflags_set(ip, flags);
340 spin_unlock(&ip->i_flags_lock); 348 spin_unlock(&ip->i_flags_lock);
341 } 349 }
342 350
343 static inline void 351 static inline void
344 xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) 352 xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags)
345 { 353 {
346 spin_lock(&ip->i_flags_lock); 354 spin_lock(&ip->i_flags_lock);
347 ip->i_flags &= ~flags; 355 ip->i_flags &= ~flags;
348 spin_unlock(&ip->i_flags_lock); 356 spin_unlock(&ip->i_flags_lock);
349 } 357 }
350 358
/* Lockless test: caller must already hold i_flags_lock (or tolerate
 * a racy read).  Returns the masked bits, non-zero if any are set. */
static inline int
__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
{
	return (ip->i_flags & flags);
}
356 364
357 static inline int 365 static inline int
358 xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) 366 xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
359 { 367 {
360 int ret; 368 int ret;
361 spin_lock(&ip->i_flags_lock); 369 spin_lock(&ip->i_flags_lock);
362 ret = __xfs_iflags_test(ip, flags); 370 ret = __xfs_iflags_test(ip, flags);
363 spin_unlock(&ip->i_flags_lock); 371 spin_unlock(&ip->i_flags_lock);
364 return ret; 372 return ret;
365 } 373 }
366 374
367 static inline int 375 static inline int
368 xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) 376 xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
369 { 377 {
370 int ret; 378 int ret;
371 379
372 spin_lock(&ip->i_flags_lock); 380 spin_lock(&ip->i_flags_lock);
373 ret = ip->i_flags & flags; 381 ret = ip->i_flags & flags;
374 if (ret) 382 if (ret)
375 ip->i_flags &= ~flags; 383 ip->i_flags &= ~flags;
376 spin_unlock(&ip->i_flags_lock); 384 spin_unlock(&ip->i_flags_lock);
377 return ret; 385 return ret;
378 } 386 }
379 387
/*
 * Manage the i_flush queue embedded in the inode.  This completion
 * queue synchronizes processes attempting to flush the in-core
 * inode back to disk.
 */
/* Acquire the flush lock, blocking until it is available. */
static inline void xfs_iflock(xfs_inode_t *ip)
{
	wait_for_completion(&ip->i_flush);
}
389 397
/* Try to acquire the flush lock; returns non-zero on success. */
static inline int xfs_iflock_nowait(xfs_inode_t *ip)
{
	return try_wait_for_completion(&ip->i_flush);
}
394 402
/* Release the flush lock taken by xfs_iflock()/xfs_iflock_nowait(). */
static inline void xfs_ifunlock(xfs_inode_t *ip)
{
	complete(&ip->i_flush);
}
399 407
/*
 * In-core inode flags (stored in xfs_inode.i_flags, protected by
 * i_flags_lock).
 */
#define XFS_IGRIO	0x0001  /* inode used for guaranteed rate i/o */
#define XFS_IUIOSZ	0x0002  /* inode i/o sizes have been explicitly set */
#define XFS_IQUIESCE    0x0004  /* we have started quiescing for this inode */
#define XFS_IRECLAIM    0x0008  /* we have started reclaiming this inode    */
#define XFS_ISTALE	0x0010	/* inode has been staled */
#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */
#define XFS_INEW	0x0040
#define XFS_IFILESTREAM	0x0080	/* inode is in a filestream directory */
#define XFS_IMODIFIED	0x0100	/* XFS inode state possibly differs */
				/* to the Linux inode state. */
#define XFS_ITRUNCATED	0x0200	/* truncated down so flush-on-close */

/*
 * Flags for inode locking.
 * Bit ranges:	1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
 *		1<<16 - 1<<32-1 -- lockdep annotation (integers)
 */
#define	XFS_IOLOCK_EXCL		(1<<0)
#define	XFS_IOLOCK_SHARED	(1<<1)
#define	XFS_ILOCK_EXCL		(1<<2)
#define	XFS_ILOCK_SHARED	(1<<3)
#define	XFS_IUNLOCK_NONOTIFY	(1<<4)

#define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)

/*
 * Flags for lockdep annotations.
 *
 * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes
 * (ie directory operations that require locking a directory inode and
 * an entry inode).  The first inode gets locked with this flag so it
 * gets a lockdep subclass of 1 and the second lock will have a lockdep
 * subclass of 0.
 *
 * XFS_LOCK_INUMORDER - for locking several inodes at the some time
 * with xfs_lock_inodes().  This flag is used as the starting subclass
 * and each subsequent lock acquired will increment the subclass by one.
 * So the first lock acquired will have a lockdep subclass of 2, the
 * second lock will have a lockdep subclass of 3, and so on.  It is
 * the responsibility of the class builder to shift this to the correct
 * portion of the lock_mode lockdep mask.
 */
#define XFS_LOCK_PARENT		1
#define XFS_LOCK_INUMORDER	2

#define XFS_IOLOCK_SHIFT	16
#define	XFS_IOLOCK_PARENT	(XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
451 459
452 #define XFS_ILOCK_SHIFT 24 460 #define XFS_ILOCK_SHIFT 24
453 #define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 461 #define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
454 462
455 #define XFS_IOLOCK_DEP_MASK 0x00ff0000 463 #define XFS_IOLOCK_DEP_MASK 0x00ff0000
456 #define XFS_ILOCK_DEP_MASK 0xff000000 464 #define XFS_ILOCK_DEP_MASK 0xff000000
457 #define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) 465 #define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK)
458 466
459 #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 467 #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
460 #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 468 #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
461 469
462 /* 470 /*
463 * Flags for xfs_iflush() 471 * Flags for xfs_iflush()
464 */ 472 */
465 #define XFS_IFLUSH_DELWRI_ELSE_SYNC 1 473 #define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
466 #define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2 474 #define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
467 #define XFS_IFLUSH_SYNC 3 475 #define XFS_IFLUSH_SYNC 3
468 #define XFS_IFLUSH_ASYNC 4 476 #define XFS_IFLUSH_ASYNC 4
469 #define XFS_IFLUSH_DELWRI 5 477 #define XFS_IFLUSH_DELWRI 5
470 #define XFS_IFLUSH_ASYNC_NOBLOCK 6 478 #define XFS_IFLUSH_ASYNC_NOBLOCK 6
471 479
472 /* 480 /*
473 * Flags for xfs_itruncate_start(). 481 * Flags for xfs_itruncate_start().
474 */ 482 */
475 #define XFS_ITRUNC_DEFINITE 0x1 483 #define XFS_ITRUNC_DEFINITE 0x1
476 #define XFS_ITRUNC_MAYBE 0x2 484 #define XFS_ITRUNC_MAYBE 0x2
477 485
478 /* 486 /*
479 * For multiple groups support: if S_ISGID bit is set in the parent 487 * For multiple groups support: if S_ISGID bit is set in the parent
480 * directory, group of new file is set to that of the parent, and 488 * directory, group of new file is set to that of the parent, and
481 * new subdirectory gets S_ISGID bit from parent. 489 * new subdirectory gets S_ISGID bit from parent.
482 */ 490 */
483 #define XFS_INHERIT_GID(pip) \ 491 #define XFS_INHERIT_GID(pip) \
484 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ 492 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
485 ((pip)->i_d.di_mode & S_ISGID)) 493 ((pip)->i_d.di_mode & S_ISGID))
486 494
487 /* 495 /*
488 * Flags for xfs_iget() 496 * Flags for xfs_iget()
489 */ 497 */
490 #define XFS_IGET_CREATE 0x1 498 #define XFS_IGET_CREATE 0x1
491 #define XFS_IGET_BULKSTAT 0x2 499 #define XFS_IGET_BULKSTAT 0x2
492 500
493 /* 501 /*
494 * xfs_iget.c prototypes. 502 * xfs_iget.c prototypes.
495 */ 503 */
496 void xfs_ihash_init(struct xfs_mount *); 504 void xfs_ihash_init(struct xfs_mount *);
497 void xfs_ihash_free(struct xfs_mount *); 505 void xfs_ihash_free(struct xfs_mount *);
498 xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, 506 xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
499 struct xfs_trans *); 507 struct xfs_trans *);
500 int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 508 int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
501 uint, uint, xfs_inode_t **, xfs_daddr_t); 509 uint, uint, xfs_inode_t **, xfs_daddr_t);
502 void xfs_iput(xfs_inode_t *, uint); 510 void xfs_iput(xfs_inode_t *, uint);
503 void xfs_iput_new(xfs_inode_t *, uint); 511 void xfs_iput_new(xfs_inode_t *, uint);
504 void xfs_ilock(xfs_inode_t *, uint); 512 void xfs_ilock(xfs_inode_t *, uint);
505 int xfs_ilock_nowait(xfs_inode_t *, uint); 513 int xfs_ilock_nowait(xfs_inode_t *, uint);
506 void xfs_iunlock(xfs_inode_t *, uint); 514 void xfs_iunlock(xfs_inode_t *, uint);
507 void xfs_ilock_demote(xfs_inode_t *, uint); 515 void xfs_ilock_demote(xfs_inode_t *, uint);
508 int xfs_isilocked(xfs_inode_t *, uint); 516 int xfs_isilocked(xfs_inode_t *, uint);
509 uint xfs_ilock_map_shared(xfs_inode_t *); 517 uint xfs_ilock_map_shared(xfs_inode_t *);
510 void xfs_iunlock_map_shared(xfs_inode_t *, uint); 518 void xfs_iunlock_map_shared(xfs_inode_t *, uint);
511 void xfs_ireclaim(xfs_inode_t *); 519 void xfs_ireclaim(xfs_inode_t *);
512 520
513 /* 521 /*
514 * xfs_inode.c prototypes. 522 * xfs_inode.c prototypes.
515 */ 523 */
516 int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 524 int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
517 xfs_inode_t **, xfs_daddr_t, uint); 525 xfs_inode_t **, xfs_daddr_t, uint);
518 int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 526 int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
519 xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, 527 xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t,
520 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 528 int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
521 529
522 uint xfs_ip2xflags(struct xfs_inode *); 530 uint xfs_ip2xflags(struct xfs_inode *);
523 uint xfs_dic2xflags(struct xfs_dinode *); 531 uint xfs_dic2xflags(struct xfs_dinode *);
524 int xfs_ifree(struct xfs_trans *, xfs_inode_t *, 532 int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
525 struct xfs_bmap_free *); 533 struct xfs_bmap_free *);
526 int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); 534 int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
527 int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, 535 int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
528 xfs_fsize_t, int, int); 536 xfs_fsize_t, int, int);
529 int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 537 int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
530 538
531 void xfs_idestroy(xfs_inode_t *); 539 void xfs_idestroy(xfs_inode_t *);
532 void xfs_iextract(xfs_inode_t *); 540 void xfs_iextract(xfs_inode_t *);
533 void xfs_iext_realloc(xfs_inode_t *, int, int); 541 void xfs_iext_realloc(xfs_inode_t *, int, int);
534 void xfs_ipin(xfs_inode_t *); 542 void xfs_ipin(xfs_inode_t *);
535 void xfs_iunpin(xfs_inode_t *); 543 void xfs_iunpin(xfs_inode_t *);
536 int xfs_iflush(xfs_inode_t *, uint); 544 int xfs_iflush(xfs_inode_t *, uint);
537 void xfs_ichgtime(xfs_inode_t *, int); 545 void xfs_ichgtime(xfs_inode_t *, int);
538 xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); 546 xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
539 void xfs_lock_inodes(xfs_inode_t **, int, uint); 547 void xfs_lock_inodes(xfs_inode_t **, int, uint);
540 void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 548 void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
541 549
542 void xfs_synchronize_atime(xfs_inode_t *); 550 void xfs_synchronize_atime(xfs_inode_t *);
543 void xfs_mark_inode_dirty_sync(xfs_inode_t *); 551 void xfs_mark_inode_dirty_sync(xfs_inode_t *);
544 552
545 #endif /* __KERNEL__ */ 553 #endif /* __KERNEL__ */
546 554
547 int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, 555 int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
548 xfs_ino_t, struct xfs_dinode **, 556 xfs_ino_t, struct xfs_dinode **,
549 struct xfs_buf **, int *, uint); 557 struct xfs_buf **, int *, uint);
550 int xfs_itobp(struct xfs_mount *, struct xfs_trans *, 558 int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
551 struct xfs_inode *, struct xfs_dinode **, 559 struct xfs_inode *, struct xfs_dinode **,
552 struct xfs_buf **, uint); 560 struct xfs_buf **, uint);
553 void xfs_dinode_from_disk(struct xfs_icdinode *, 561 void xfs_dinode_from_disk(struct xfs_icdinode *,
554 struct xfs_dinode *); 562 struct xfs_dinode *);
555 void xfs_dinode_to_disk(struct xfs_dinode *, 563 void xfs_dinode_to_disk(struct xfs_dinode *,
556 struct xfs_icdinode *); 564 struct xfs_icdinode *);
557 void xfs_idestroy_fork(struct xfs_inode *, int); 565 void xfs_idestroy_fork(struct xfs_inode *, int);
558 void xfs_idata_realloc(struct xfs_inode *, int, int); 566 void xfs_idata_realloc(struct xfs_inode *, int, int);
559 void xfs_iroot_realloc(struct xfs_inode *, int, int); 567 void xfs_iroot_realloc(struct xfs_inode *, int, int);
560 int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); 568 int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
561 int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); 569 int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
562 570
563 xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); 571 xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
564 void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, 572 void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
565 xfs_bmbt_irec_t *); 573 xfs_bmbt_irec_t *);
566 void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); 574 void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
567 void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); 575 void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
568 void xfs_iext_remove(xfs_ifork_t *, xfs_extnum_t, int); 576 void xfs_iext_remove(xfs_ifork_t *, xfs_extnum_t, int);
569 void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); 577 void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
570 void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); 578 void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
571 void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); 579 void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
572 void xfs_iext_realloc_direct(xfs_ifork_t *, int); 580 void xfs_iext_realloc_direct(xfs_ifork_t *, int);
573 void xfs_iext_realloc_indirect(xfs_ifork_t *, int); 581 void xfs_iext_realloc_indirect(xfs_ifork_t *, int);
574 void xfs_iext_indirect_to_direct(xfs_ifork_t *); 582 void xfs_iext_indirect_to_direct(xfs_ifork_t *);
575 void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); 583 void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
576 void xfs_iext_inline_to_direct(xfs_ifork_t *, int); 584 void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
577 void xfs_iext_destroy(xfs_ifork_t *); 585 void xfs_iext_destroy(xfs_ifork_t *);
578 xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *); 586 xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *);
579 xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *); 587 xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *);
580 xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int); 588 xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int);
581 void xfs_iext_irec_init(xfs_ifork_t *); 589 void xfs_iext_irec_init(xfs_ifork_t *);
582 xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int); 590 xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int);
583 void xfs_iext_irec_remove(xfs_ifork_t *, int); 591 void xfs_iext_irec_remove(xfs_ifork_t *, int);
584 void xfs_iext_irec_compact(xfs_ifork_t *); 592 void xfs_iext_irec_compact(xfs_ifork_t *);
585 void xfs_iext_irec_compact_pages(xfs_ifork_t *); 593 void xfs_iext_irec_compact_pages(xfs_ifork_t *);
586 void xfs_iext_irec_compact_full(xfs_ifork_t *); 594 void xfs_iext_irec_compact_full(xfs_ifork_t *);
587 void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); 595 void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
588 596
589 #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 597 #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
590 598
591 #ifdef DEBUG 599 #ifdef DEBUG
592 void xfs_isize_check(struct xfs_mount *, struct xfs_inode *, 600 void xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
593 xfs_fsize_t); 601 xfs_fsize_t);
594 #else /* DEBUG */ 602 #else /* DEBUG */
595 #define xfs_isize_check(mp, ip, isize) 603 #define xfs_isize_check(mp, ip, isize)
596 #endif /* DEBUG */ 604 #endif /* DEBUG */
597 605
598 #if defined(DEBUG) 606 #if defined(DEBUG)
599 void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); 607 void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
600 #else 608 #else
601 #define xfs_inobp_check(mp, bp) 609 #define xfs_inobp_check(mp, bp)
602 #endif /* DEBUG */ 610 #endif /* DEBUG */
603 611
604 extern struct kmem_zone *xfs_ifork_zone; 612 extern struct kmem_zone *xfs_ifork_zone;
605 extern struct kmem_zone *xfs_inode_zone; 613 extern struct kmem_zone *xfs_inode_zone;
606 extern struct kmem_zone *xfs_ili_zone; 614 extern struct kmem_zone *xfs_ili_zone;
607 615
fs/xfs/xfs_inode_item.c
1 /* 1 /*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_buf_item.h" 25 #include "xfs_buf_item.h"
26 #include "xfs_sb.h" 26 #include "xfs_sb.h"
27 #include "xfs_ag.h" 27 #include "xfs_ag.h"
28 #include "xfs_dir2.h" 28 #include "xfs_dir2.h"
29 #include "xfs_dmapi.h" 29 #include "xfs_dmapi.h"
30 #include "xfs_mount.h" 30 #include "xfs_mount.h"
31 #include "xfs_trans_priv.h" 31 #include "xfs_trans_priv.h"
32 #include "xfs_bmap_btree.h" 32 #include "xfs_bmap_btree.h"
33 #include "xfs_alloc_btree.h" 33 #include "xfs_alloc_btree.h"
34 #include "xfs_ialloc_btree.h" 34 #include "xfs_ialloc_btree.h"
35 #include "xfs_dir2_sf.h" 35 #include "xfs_dir2_sf.h"
36 #include "xfs_attr_sf.h" 36 #include "xfs_attr_sf.h"
37 #include "xfs_dinode.h" 37 #include "xfs_dinode.h"
38 #include "xfs_inode.h" 38 #include "xfs_inode.h"
39 #include "xfs_inode_item.h" 39 #include "xfs_inode_item.h"
40 #include "xfs_btree.h" 40 #include "xfs_btree.h"
41 #include "xfs_ialloc.h" 41 #include "xfs_ialloc.h"
42 #include "xfs_rw.h" 42 #include "xfs_rw.h"
43 #include "xfs_error.h" 43 #include "xfs_error.h"
44 44
45 45
46 kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 46 kmem_zone_t *xfs_ili_zone; /* inode log item zone */
47 47
48 /* 48 /*
49 * This returns the number of iovecs needed to log the given inode item. 49 * This returns the number of iovecs needed to log the given inode item.
50 * 50 *
51 * We need one iovec for the inode log format structure, one for the 51 * We need one iovec for the inode log format structure, one for the
52 * inode core, and possibly one for the inode data/extents/b-tree root 52 * inode core, and possibly one for the inode data/extents/b-tree root
53 * and one for the inode attribute data/extents/b-tree root. 53 * and one for the inode attribute data/extents/b-tree root.
54 */ 54 */
55 STATIC uint 55 STATIC uint
56 xfs_inode_item_size( 56 xfs_inode_item_size(
57 xfs_inode_log_item_t *iip) 57 xfs_inode_log_item_t *iip)
58 { 58 {
59 uint nvecs; 59 uint nvecs;
60 xfs_inode_t *ip; 60 xfs_inode_t *ip;
61 61
62 ip = iip->ili_inode; 62 ip = iip->ili_inode;
63 nvecs = 2; 63 nvecs = 2;
64 64
65 /* 65 /*
66 * Only log the data/extents/b-tree root if there is something 66 * Only log the data/extents/b-tree root if there is something
67 * left to log. 67 * left to log.
68 */ 68 */
69 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 69 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
70 70
71 switch (ip->i_d.di_format) { 71 switch (ip->i_d.di_format) {
72 case XFS_DINODE_FMT_EXTENTS: 72 case XFS_DINODE_FMT_EXTENTS:
73 iip->ili_format.ilf_fields &= 73 iip->ili_format.ilf_fields &=
74 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 74 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
75 XFS_ILOG_DEV | XFS_ILOG_UUID); 75 XFS_ILOG_DEV | XFS_ILOG_UUID);
76 if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) && 76 if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
77 (ip->i_d.di_nextents > 0) && 77 (ip->i_d.di_nextents > 0) &&
78 (ip->i_df.if_bytes > 0)) { 78 (ip->i_df.if_bytes > 0)) {
79 ASSERT(ip->i_df.if_u1.if_extents != NULL); 79 ASSERT(ip->i_df.if_u1.if_extents != NULL);
80 nvecs++; 80 nvecs++;
81 } else { 81 } else {
82 iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT; 82 iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
83 } 83 }
84 break; 84 break;
85 85
86 case XFS_DINODE_FMT_BTREE: 86 case XFS_DINODE_FMT_BTREE:
87 ASSERT(ip->i_df.if_ext_max == 87 ASSERT(ip->i_df.if_ext_max ==
88 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); 88 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
89 iip->ili_format.ilf_fields &= 89 iip->ili_format.ilf_fields &=
90 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 90 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
91 XFS_ILOG_DEV | XFS_ILOG_UUID); 91 XFS_ILOG_DEV | XFS_ILOG_UUID);
92 if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) && 92 if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
93 (ip->i_df.if_broot_bytes > 0)) { 93 (ip->i_df.if_broot_bytes > 0)) {
94 ASSERT(ip->i_df.if_broot != NULL); 94 ASSERT(ip->i_df.if_broot != NULL);
95 nvecs++; 95 nvecs++;
96 } else { 96 } else {
97 ASSERT(!(iip->ili_format.ilf_fields & 97 ASSERT(!(iip->ili_format.ilf_fields &
98 XFS_ILOG_DBROOT)); 98 XFS_ILOG_DBROOT));
99 #ifdef XFS_TRANS_DEBUG 99 #ifdef XFS_TRANS_DEBUG
100 if (iip->ili_root_size > 0) { 100 if (iip->ili_root_size > 0) {
101 ASSERT(iip->ili_root_size == 101 ASSERT(iip->ili_root_size ==
102 ip->i_df.if_broot_bytes); 102 ip->i_df.if_broot_bytes);
103 ASSERT(memcmp(iip->ili_orig_root, 103 ASSERT(memcmp(iip->ili_orig_root,
104 ip->i_df.if_broot, 104 ip->i_df.if_broot,
105 iip->ili_root_size) == 0); 105 iip->ili_root_size) == 0);
106 } else { 106 } else {
107 ASSERT(ip->i_df.if_broot_bytes == 0); 107 ASSERT(ip->i_df.if_broot_bytes == 0);
108 } 108 }
109 #endif 109 #endif
110 iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT; 110 iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
111 } 111 }
112 break; 112 break;
113 113
114 case XFS_DINODE_FMT_LOCAL: 114 case XFS_DINODE_FMT_LOCAL:
115 iip->ili_format.ilf_fields &= 115 iip->ili_format.ilf_fields &=
116 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | 116 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
117 XFS_ILOG_DEV | XFS_ILOG_UUID); 117 XFS_ILOG_DEV | XFS_ILOG_UUID);
118 if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) && 118 if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
119 (ip->i_df.if_bytes > 0)) { 119 (ip->i_df.if_bytes > 0)) {
120 ASSERT(ip->i_df.if_u1.if_data != NULL); 120 ASSERT(ip->i_df.if_u1.if_data != NULL);
121 ASSERT(ip->i_d.di_size > 0); 121 ASSERT(ip->i_d.di_size > 0);
122 nvecs++; 122 nvecs++;
123 } else { 123 } else {
124 iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA; 124 iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
125 } 125 }
126 break; 126 break;
127 127
128 case XFS_DINODE_FMT_DEV: 128 case XFS_DINODE_FMT_DEV:
129 iip->ili_format.ilf_fields &= 129 iip->ili_format.ilf_fields &=
130 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 130 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
131 XFS_ILOG_DEXT | XFS_ILOG_UUID); 131 XFS_ILOG_DEXT | XFS_ILOG_UUID);
132 break; 132 break;
133 133
134 case XFS_DINODE_FMT_UUID: 134 case XFS_DINODE_FMT_UUID:
135 iip->ili_format.ilf_fields &= 135 iip->ili_format.ilf_fields &=
136 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 136 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
137 XFS_ILOG_DEXT | XFS_ILOG_DEV); 137 XFS_ILOG_DEXT | XFS_ILOG_DEV);
138 break; 138 break;
139 139
140 default: 140 default:
141 ASSERT(0); 141 ASSERT(0);
142 break; 142 break;
143 } 143 }
144 144
145 /* 145 /*
146 * If there are no attributes associated with this file, 146 * If there are no attributes associated with this file,
147 * then there cannot be anything more to log. 147 * then there cannot be anything more to log.
148 * Clear all attribute-related log flags. 148 * Clear all attribute-related log flags.
149 */ 149 */
150 if (!XFS_IFORK_Q(ip)) { 150 if (!XFS_IFORK_Q(ip)) {
151 iip->ili_format.ilf_fields &= 151 iip->ili_format.ilf_fields &=
152 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); 152 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
153 return nvecs; 153 return nvecs;
154 } 154 }
155 155
156 /* 156 /*
157 * Log any necessary attribute data. 157 * Log any necessary attribute data.
158 */ 158 */
159 switch (ip->i_d.di_aformat) { 159 switch (ip->i_d.di_aformat) {
160 case XFS_DINODE_FMT_EXTENTS: 160 case XFS_DINODE_FMT_EXTENTS:
161 iip->ili_format.ilf_fields &= 161 iip->ili_format.ilf_fields &=
162 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); 162 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
163 if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) && 163 if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
164 (ip->i_d.di_anextents > 0) && 164 (ip->i_d.di_anextents > 0) &&
165 (ip->i_afp->if_bytes > 0)) { 165 (ip->i_afp->if_bytes > 0)) {
166 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 166 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
167 nvecs++; 167 nvecs++;
168 } else { 168 } else {
169 iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT; 169 iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
170 } 170 }
171 break; 171 break;
172 172
173 case XFS_DINODE_FMT_BTREE: 173 case XFS_DINODE_FMT_BTREE:
174 iip->ili_format.ilf_fields &= 174 iip->ili_format.ilf_fields &=
175 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); 175 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
176 if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) && 176 if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
177 (ip->i_afp->if_broot_bytes > 0)) { 177 (ip->i_afp->if_broot_bytes > 0)) {
178 ASSERT(ip->i_afp->if_broot != NULL); 178 ASSERT(ip->i_afp->if_broot != NULL);
179 nvecs++; 179 nvecs++;
180 } else { 180 } else {
181 iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT; 181 iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
182 } 182 }
183 break; 183 break;
184 184
185 case XFS_DINODE_FMT_LOCAL: 185 case XFS_DINODE_FMT_LOCAL:
186 iip->ili_format.ilf_fields &= 186 iip->ili_format.ilf_fields &=
187 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); 187 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
188 if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) && 188 if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
189 (ip->i_afp->if_bytes > 0)) { 189 (ip->i_afp->if_bytes > 0)) {
190 ASSERT(ip->i_afp->if_u1.if_data != NULL); 190 ASSERT(ip->i_afp->if_u1.if_data != NULL);
191 nvecs++; 191 nvecs++;
192 } else { 192 } else {
193 iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA; 193 iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
194 } 194 }
195 break; 195 break;
196 196
197 default: 197 default:
198 ASSERT(0); 198 ASSERT(0);
199 break; 199 break;
200 } 200 }
201 201
202 return nvecs; 202 return nvecs;
203 } 203 }
204 204
205 /* 205 /*
206 * This is called to fill in the vector of log iovecs for the 206 * This is called to fill in the vector of log iovecs for the
207 * given inode log item. It fills the first item with an inode 207 * given inode log item. It fills the first item with an inode
208 * log format structure, the second with the on-disk inode structure, 208 * log format structure, the second with the on-disk inode structure,
209 * and a possible third and/or fourth with the inode data/extents/b-tree 209 * and a possible third and/or fourth with the inode data/extents/b-tree
210 * root and inode attributes data/extents/b-tree root. 210 * root and inode attributes data/extents/b-tree root.
211 */ 211 */
212 STATIC void 212 STATIC void
213 xfs_inode_item_format( 213 xfs_inode_item_format(
214 xfs_inode_log_item_t *iip, 214 xfs_inode_log_item_t *iip,
215 xfs_log_iovec_t *log_vector) 215 xfs_log_iovec_t *log_vector)
216 { 216 {
217 uint nvecs; 217 uint nvecs;
218 xfs_log_iovec_t *vecp; 218 xfs_log_iovec_t *vecp;
219 xfs_inode_t *ip; 219 xfs_inode_t *ip;
220 size_t data_bytes; 220 size_t data_bytes;
221 xfs_bmbt_rec_t *ext_buffer; 221 xfs_bmbt_rec_t *ext_buffer;
222 int nrecs; 222 int nrecs;
223 xfs_mount_t *mp; 223 xfs_mount_t *mp;
224 224
225 ip = iip->ili_inode; 225 ip = iip->ili_inode;
226 vecp = log_vector; 226 vecp = log_vector;
227 227
228 vecp->i_addr = (xfs_caddr_t)&iip->ili_format; 228 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
229 vecp->i_len = sizeof(xfs_inode_log_format_t); 229 vecp->i_len = sizeof(xfs_inode_log_format_t);
230 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); 230 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT);
231 vecp++; 231 vecp++;
232 nvecs = 1; 232 nvecs = 1;
233 233
234 /* 234 /*
235 * Clear i_update_core if the timestamps (or any other 235 * Clear i_update_core if the timestamps (or any other
236 * non-transactional modification) need flushing/logging 236 * non-transactional modification) need flushing/logging
237 * and we're about to log them with the rest of the core. 237 * and we're about to log them with the rest of the core.
238 * 238 *
239 * This is the same logic as xfs_iflush() but this code can't 239 * This is the same logic as xfs_iflush() but this code can't
240 * run at the same time as xfs_iflush because we're in commit 240 * run at the same time as xfs_iflush because we're in commit
241 * processing here and so we have the inode lock held in 241 * processing here and so we have the inode lock held in
242 * exclusive mode. Although it doesn't really matter 242 * exclusive mode. Although it doesn't really matter
243 * for the timestamps if both routines were to grab the 243 * for the timestamps if both routines were to grab the
244 * timestamps or not. That would be ok. 244 * timestamps or not. That would be ok.
245 * 245 *
246 * We clear i_update_core before copying out the data. 246 * We clear i_update_core before copying out the data.
247 * This is for coordination with our timestamp updates 247 * This is for coordination with our timestamp updates
248 * that don't hold the inode lock. They will always 248 * that don't hold the inode lock. They will always
249 * update the timestamps BEFORE setting i_update_core, 249 * update the timestamps BEFORE setting i_update_core,
250 * so if we clear i_update_core after they set it we 250 * so if we clear i_update_core after they set it we
251 * are guaranteed to see their updates to the timestamps 251 * are guaranteed to see their updates to the timestamps
252 * either here. Likewise, if they set it after we clear it 252 * either here. Likewise, if they set it after we clear it
253 * here, we'll see it either on the next commit of this 253 * here, we'll see it either on the next commit of this
254 * inode or the next time the inode gets flushed via 254 * inode or the next time the inode gets flushed via
255 * xfs_iflush(). This depends on strongly ordered memory 255 * xfs_iflush(). This depends on strongly ordered memory
256 * semantics, but we have that. We use the SYNCHRONIZE 256 * semantics, but we have that. We use the SYNCHRONIZE
257 * macro to make sure that the compiler does not reorder 257 * macro to make sure that the compiler does not reorder
258 * the i_update_core access below the data copy below. 258 * the i_update_core access below the data copy below.
259 */ 259 */
260 if (ip->i_update_core) { 260 if (ip->i_update_core) {
261 ip->i_update_core = 0; 261 ip->i_update_core = 0;
262 SYNCHRONIZE(); 262 SYNCHRONIZE();
263 } 263 }
264 264
265 /* 265 /*
266 * We don't have to worry about re-ordering here because 266 * We don't have to worry about re-ordering here because
267 * the update_size field is protected by the inode lock 267 * the update_size field is protected by the inode lock
268 * and we have that held in exclusive mode. 268 * and we have that held in exclusive mode.
269 */ 269 */
270 if (ip->i_update_size) 270 if (ip->i_update_size)
271 ip->i_update_size = 0; 271 ip->i_update_size = 0;
272 272
273 /* 273 /*
274 * Make sure to get the latest atime from the Linux inode. 274 * Make sure to get the latest atime from the Linux inode.
275 */ 275 */
276 xfs_synchronize_atime(ip); 276 xfs_synchronize_atime(ip);
277 277
278 /* 278 /*
279 * make sure the linux inode is dirty 279 * make sure the linux inode is dirty
280 */ 280 */
281 xfs_mark_inode_dirty_sync(ip); 281 xfs_mark_inode_dirty_sync(ip);
282 282
283 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 283 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
284 vecp->i_len = sizeof(struct xfs_icdinode); 284 vecp->i_len = sizeof(struct xfs_icdinode);
285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
286 vecp++; 286 vecp++;
287 nvecs++; 287 nvecs++;
288 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 288 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
289 289
290 /* 290 /*
291 * If this is really an old format inode, then we need to 291 * If this is really an old format inode, then we need to
292 * log it as such. This means that we have to copy the link 292 * log it as such. This means that we have to copy the link
293 * count from the new field to the old. We don't have to worry 293 * count from the new field to the old. We don't have to worry
294 * about the new fields, because nothing trusts them as long as 294 * about the new fields, because nothing trusts them as long as
295 * the old inode version number is there. If the superblock already 295 * the old inode version number is there. If the superblock already
296 * has a new version number, then we don't bother converting back. 296 * has a new version number, then we don't bother converting back.
297 */ 297 */
298 mp = ip->i_mount; 298 mp = ip->i_mount;
299 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 299 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
300 if (ip->i_d.di_version == 1) { 300 if (ip->i_d.di_version == 1) {
301 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 301 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
302 /* 302 /*
303 * Convert it back. 303 * Convert it back.
304 */ 304 */
305 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 305 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
306 ip->i_d.di_onlink = ip->i_d.di_nlink; 306 ip->i_d.di_onlink = ip->i_d.di_nlink;
307 } else { 307 } else {
308 /* 308 /*
309 * The superblock version has already been bumped, 309 * The superblock version has already been bumped,
310 * so just make the conversion to the new inode 310 * so just make the conversion to the new inode
311 * format permanent. 311 * format permanent.
312 */ 312 */
313 ip->i_d.di_version = 2; 313 ip->i_d.di_version = 2;
314 ip->i_d.di_onlink = 0; 314 ip->i_d.di_onlink = 0;
315 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 315 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
316 } 316 }
317 } 317 }
318 318
319 switch (ip->i_d.di_format) { 319 switch (ip->i_d.di_format) {
320 case XFS_DINODE_FMT_EXTENTS: 320 case XFS_DINODE_FMT_EXTENTS:
321 ASSERT(!(iip->ili_format.ilf_fields & 321 ASSERT(!(iip->ili_format.ilf_fields &
322 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 322 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
323 XFS_ILOG_DEV | XFS_ILOG_UUID))); 323 XFS_ILOG_DEV | XFS_ILOG_UUID)));
324 if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) { 324 if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
325 ASSERT(ip->i_df.if_bytes > 0); 325 ASSERT(ip->i_df.if_bytes > 0);
326 ASSERT(ip->i_df.if_u1.if_extents != NULL); 326 ASSERT(ip->i_df.if_u1.if_extents != NULL);
327 ASSERT(ip->i_d.di_nextents > 0); 327 ASSERT(ip->i_d.di_nextents > 0);
328 ASSERT(iip->ili_extents_buf == NULL); 328 ASSERT(iip->ili_extents_buf == NULL);
329 nrecs = ip->i_df.if_bytes / 329 nrecs = ip->i_df.if_bytes /
330 (uint)sizeof(xfs_bmbt_rec_t); 330 (uint)sizeof(xfs_bmbt_rec_t);
331 ASSERT(nrecs > 0); 331 ASSERT(nrecs > 0);
332 #ifdef XFS_NATIVE_HOST 332 #ifdef XFS_NATIVE_HOST
333 if (nrecs == ip->i_d.di_nextents) { 333 if (nrecs == ip->i_d.di_nextents) {
334 /* 334 /*
335 * There are no delayed allocation 335 * There are no delayed allocation
336 * extents, so just point to the 336 * extents, so just point to the
337 * real extents array. 337 * real extents array.
338 */ 338 */
339 vecp->i_addr = 339 vecp->i_addr =
340 (char *)(ip->i_df.if_u1.if_extents); 340 (char *)(ip->i_df.if_u1.if_extents);
341 vecp->i_len = ip->i_df.if_bytes; 341 vecp->i_len = ip->i_df.if_bytes;
342 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 342 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
343 } else 343 } else
344 #endif 344 #endif
345 { 345 {
346 /* 346 /*
347 * There are delayed allocation extents 347 * There are delayed allocation extents
348 * in the inode, or we need to convert 348 * in the inode, or we need to convert
349 * the extents to on disk format. 349 * the extents to on disk format.
350 * Use xfs_iextents_copy() 350 * Use xfs_iextents_copy()
351 * to copy only the real extents into 351 * to copy only the real extents into
352 * a separate buffer. We'll free the 352 * a separate buffer. We'll free the
353 * buffer in the unlock routine. 353 * buffer in the unlock routine.
354 */ 354 */
355 ext_buffer = kmem_alloc(ip->i_df.if_bytes, 355 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
356 KM_SLEEP); 356 KM_SLEEP);
357 iip->ili_extents_buf = ext_buffer; 357 iip->ili_extents_buf = ext_buffer;
358 vecp->i_addr = (xfs_caddr_t)ext_buffer; 358 vecp->i_addr = (xfs_caddr_t)ext_buffer;
359 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 359 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
360 XFS_DATA_FORK); 360 XFS_DATA_FORK);
361 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 361 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
362 } 362 }
363 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 363 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
364 iip->ili_format.ilf_dsize = vecp->i_len; 364 iip->ili_format.ilf_dsize = vecp->i_len;
365 vecp++; 365 vecp++;
366 nvecs++; 366 nvecs++;
367 } 367 }
368 break; 368 break;
369 369
370 case XFS_DINODE_FMT_BTREE: 370 case XFS_DINODE_FMT_BTREE:
371 ASSERT(!(iip->ili_format.ilf_fields & 371 ASSERT(!(iip->ili_format.ilf_fields &
372 (XFS_ILOG_DDATA | XFS_ILOG_DEXT | 372 (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
373 XFS_ILOG_DEV | XFS_ILOG_UUID))); 373 XFS_ILOG_DEV | XFS_ILOG_UUID)));
374 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { 374 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
375 ASSERT(ip->i_df.if_broot_bytes > 0); 375 ASSERT(ip->i_df.if_broot_bytes > 0);
376 ASSERT(ip->i_df.if_broot != NULL); 376 ASSERT(ip->i_df.if_broot != NULL);
377 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 377 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
378 vecp->i_len = ip->i_df.if_broot_bytes; 378 vecp->i_len = ip->i_df.if_broot_bytes;
379 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); 379 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT);
380 vecp++; 380 vecp++;
381 nvecs++; 381 nvecs++;
382 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 382 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
383 } 383 }
384 break; 384 break;
385 385
386 case XFS_DINODE_FMT_LOCAL: 386 case XFS_DINODE_FMT_LOCAL:
387 ASSERT(!(iip->ili_format.ilf_fields & 387 ASSERT(!(iip->ili_format.ilf_fields &
388 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 388 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
389 XFS_ILOG_DEV | XFS_ILOG_UUID))); 389 XFS_ILOG_DEV | XFS_ILOG_UUID)));
390 if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { 390 if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
391 ASSERT(ip->i_df.if_bytes > 0); 391 ASSERT(ip->i_df.if_bytes > 0);
392 ASSERT(ip->i_df.if_u1.if_data != NULL); 392 ASSERT(ip->i_df.if_u1.if_data != NULL);
393 ASSERT(ip->i_d.di_size > 0); 393 ASSERT(ip->i_d.di_size > 0);
394 394
395 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data; 395 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
396 /* 396 /*
397 * Round i_bytes up to a word boundary. 397 * Round i_bytes up to a word boundary.
398 * The underlying memory is guaranteed to 398 * The underlying memory is guaranteed to
399 * to be there by xfs_idata_realloc(). 399 * to be there by xfs_idata_realloc().
400 */ 400 */
401 data_bytes = roundup(ip->i_df.if_bytes, 4); 401 data_bytes = roundup(ip->i_df.if_bytes, 4);
402 ASSERT((ip->i_df.if_real_bytes == 0) || 402 ASSERT((ip->i_df.if_real_bytes == 0) ||
403 (ip->i_df.if_real_bytes == data_bytes)); 403 (ip->i_df.if_real_bytes == data_bytes));
404 vecp->i_len = (int)data_bytes; 404 vecp->i_len = (int)data_bytes;
405 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); 405 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL);
406 vecp++; 406 vecp++;
407 nvecs++; 407 nvecs++;
408 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 408 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
409 } 409 }
410 break; 410 break;
411 411
412 case XFS_DINODE_FMT_DEV: 412 case XFS_DINODE_FMT_DEV:
413 ASSERT(!(iip->ili_format.ilf_fields & 413 ASSERT(!(iip->ili_format.ilf_fields &
414 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 414 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
415 XFS_ILOG_DDATA | XFS_ILOG_UUID))); 415 XFS_ILOG_DDATA | XFS_ILOG_UUID)));
416 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 416 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
417 iip->ili_format.ilf_u.ilfu_rdev = 417 iip->ili_format.ilf_u.ilfu_rdev =
418 ip->i_df.if_u2.if_rdev; 418 ip->i_df.if_u2.if_rdev;
419 } 419 }
420 break; 420 break;
421 421
422 case XFS_DINODE_FMT_UUID: 422 case XFS_DINODE_FMT_UUID:
423 ASSERT(!(iip->ili_format.ilf_fields & 423 ASSERT(!(iip->ili_format.ilf_fields &
424 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 424 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
425 XFS_ILOG_DDATA | XFS_ILOG_DEV))); 425 XFS_ILOG_DDATA | XFS_ILOG_DEV)));
426 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 426 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
427 iip->ili_format.ilf_u.ilfu_uuid = 427 iip->ili_format.ilf_u.ilfu_uuid =
428 ip->i_df.if_u2.if_uuid; 428 ip->i_df.if_u2.if_uuid;
429 } 429 }
430 break; 430 break;
431 431
432 default: 432 default:
433 ASSERT(0); 433 ASSERT(0);
434 break; 434 break;
435 } 435 }
436 436
437 /* 437 /*
438 * If there are no attributes associated with the file, 438 * If there are no attributes associated with the file,
439 * then we're done. 439 * then we're done.
440 * Assert that no attribute-related log flags are set. 440 * Assert that no attribute-related log flags are set.
441 */ 441 */
442 if (!XFS_IFORK_Q(ip)) { 442 if (!XFS_IFORK_Q(ip)) {
443 ASSERT(nvecs == iip->ili_item.li_desc->lid_size); 443 ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
444 iip->ili_format.ilf_size = nvecs; 444 iip->ili_format.ilf_size = nvecs;
445 ASSERT(!(iip->ili_format.ilf_fields & 445 ASSERT(!(iip->ili_format.ilf_fields &
446 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 446 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
447 return; 447 return;
448 } 448 }
449 449
450 switch (ip->i_d.di_aformat) { 450 switch (ip->i_d.di_aformat) {
451 case XFS_DINODE_FMT_EXTENTS: 451 case XFS_DINODE_FMT_EXTENTS:
452 ASSERT(!(iip->ili_format.ilf_fields & 452 ASSERT(!(iip->ili_format.ilf_fields &
453 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); 453 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
454 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { 454 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
455 ASSERT(ip->i_afp->if_bytes > 0); 455 ASSERT(ip->i_afp->if_bytes > 0);
456 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 456 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
457 ASSERT(ip->i_d.di_anextents > 0); 457 ASSERT(ip->i_d.di_anextents > 0);
458 #ifdef DEBUG 458 #ifdef DEBUG
459 nrecs = ip->i_afp->if_bytes / 459 nrecs = ip->i_afp->if_bytes /
460 (uint)sizeof(xfs_bmbt_rec_t); 460 (uint)sizeof(xfs_bmbt_rec_t);
461 #endif 461 #endif
462 ASSERT(nrecs > 0); 462 ASSERT(nrecs > 0);
463 ASSERT(nrecs == ip->i_d.di_anextents); 463 ASSERT(nrecs == ip->i_d.di_anextents);
464 #ifdef XFS_NATIVE_HOST 464 #ifdef XFS_NATIVE_HOST
465 /* 465 /*
466 * There are not delayed allocation extents 466 * There are not delayed allocation extents
467 * for attributes, so just point at the array. 467 * for attributes, so just point at the array.
468 */ 468 */
469 vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents); 469 vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
470 vecp->i_len = ip->i_afp->if_bytes; 470 vecp->i_len = ip->i_afp->if_bytes;
471 #else 471 #else
472 ASSERT(iip->ili_aextents_buf == NULL); 472 ASSERT(iip->ili_aextents_buf == NULL);
473 /* 473 /*
474 * Need to endian flip before logging 474 * Need to endian flip before logging
475 */ 475 */
476 ext_buffer = kmem_alloc(ip->i_afp->if_bytes, 476 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
477 KM_SLEEP); 477 KM_SLEEP);
478 iip->ili_aextents_buf = ext_buffer; 478 iip->ili_aextents_buf = ext_buffer;
479 vecp->i_addr = (xfs_caddr_t)ext_buffer; 479 vecp->i_addr = (xfs_caddr_t)ext_buffer;
480 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 480 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
481 XFS_ATTR_FORK); 481 XFS_ATTR_FORK);
482 #endif 482 #endif
483 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); 483 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT);
484 iip->ili_format.ilf_asize = vecp->i_len; 484 iip->ili_format.ilf_asize = vecp->i_len;
485 vecp++; 485 vecp++;
486 nvecs++; 486 nvecs++;
487 } 487 }
488 break; 488 break;
489 489
490 case XFS_DINODE_FMT_BTREE: 490 case XFS_DINODE_FMT_BTREE:
491 ASSERT(!(iip->ili_format.ilf_fields & 491 ASSERT(!(iip->ili_format.ilf_fields &
492 (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); 492 (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
493 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { 493 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
494 ASSERT(ip->i_afp->if_broot_bytes > 0); 494 ASSERT(ip->i_afp->if_broot_bytes > 0);
495 ASSERT(ip->i_afp->if_broot != NULL); 495 ASSERT(ip->i_afp->if_broot != NULL);
496 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 496 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
497 vecp->i_len = ip->i_afp->if_broot_bytes; 497 vecp->i_len = ip->i_afp->if_broot_bytes;
498 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); 498 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT);
499 vecp++; 499 vecp++;
500 nvecs++; 500 nvecs++;
501 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 501 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
502 } 502 }
503 break; 503 break;
504 504
505 case XFS_DINODE_FMT_LOCAL: 505 case XFS_DINODE_FMT_LOCAL:
506 ASSERT(!(iip->ili_format.ilf_fields & 506 ASSERT(!(iip->ili_format.ilf_fields &
507 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 507 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
508 if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { 508 if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
509 ASSERT(ip->i_afp->if_bytes > 0); 509 ASSERT(ip->i_afp->if_bytes > 0);
510 ASSERT(ip->i_afp->if_u1.if_data != NULL); 510 ASSERT(ip->i_afp->if_u1.if_data != NULL);
511 511
512 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data; 512 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
513 /* 513 /*
514 * Round i_bytes up to a word boundary. 514 * Round i_bytes up to a word boundary.
515 * The underlying memory is guaranteed to 515 * The underlying memory is guaranteed to
516 * to be there by xfs_idata_realloc(). 516 * to be there by xfs_idata_realloc().
517 */ 517 */
518 data_bytes = roundup(ip->i_afp->if_bytes, 4); 518 data_bytes = roundup(ip->i_afp->if_bytes, 4);
519 ASSERT((ip->i_afp->if_real_bytes == 0) || 519 ASSERT((ip->i_afp->if_real_bytes == 0) ||
520 (ip->i_afp->if_real_bytes == data_bytes)); 520 (ip->i_afp->if_real_bytes == data_bytes));
521 vecp->i_len = (int)data_bytes; 521 vecp->i_len = (int)data_bytes;
522 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); 522 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL);
523 vecp++; 523 vecp++;
524 nvecs++; 524 nvecs++;
525 iip->ili_format.ilf_asize = (unsigned)data_bytes; 525 iip->ili_format.ilf_asize = (unsigned)data_bytes;
526 } 526 }
527 break; 527 break;
528 528
529 default: 529 default:
530 ASSERT(0); 530 ASSERT(0);
531 break; 531 break;
532 } 532 }
533 533
534 ASSERT(nvecs == iip->ili_item.li_desc->lid_size); 534 ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
535 iip->ili_format.ilf_size = nvecs; 535 iip->ili_format.ilf_size = nvecs;
536 } 536 }
537 537
538 538
539 /* 539 /*
540 * This is called to pin the inode associated with the inode log 540 * This is called to pin the inode associated with the inode log
541 * item in memory so it cannot be written out. Do this by calling 541 * item in memory so it cannot be written out. Do this by calling
542 * xfs_ipin() to bump the pin count in the inode while holding the 542 * xfs_ipin() to bump the pin count in the inode while holding the
543 * inode pin lock. 543 * inode pin lock.
544 */ 544 */
545 STATIC void 545 STATIC void
546 xfs_inode_item_pin( 546 xfs_inode_item_pin(
547 xfs_inode_log_item_t *iip) 547 xfs_inode_log_item_t *iip)
548 { 548 {
549 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 549 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
550 xfs_ipin(iip->ili_inode); 550 xfs_ipin(iip->ili_inode);
551 } 551 }
552 552
553 553
554 /* 554 /*
555 * This is called to unpin the inode associated with the inode log 555 * This is called to unpin the inode associated with the inode log
556 * item which was previously pinned with a call to xfs_inode_item_pin(). 556 * item which was previously pinned with a call to xfs_inode_item_pin().
557 * Just call xfs_iunpin() on the inode to do this. 557 * Just call xfs_iunpin() on the inode to do this.
558 */ 558 */
559 /* ARGSUSED */ 559 /* ARGSUSED */
560 STATIC void 560 STATIC void
561 xfs_inode_item_unpin( 561 xfs_inode_item_unpin(
562 xfs_inode_log_item_t *iip, 562 xfs_inode_log_item_t *iip,
563 int stale) 563 int stale)
564 { 564 {
565 xfs_iunpin(iip->ili_inode); 565 xfs_iunpin(iip->ili_inode);
566 } 566 }
567 567
568 /* ARGSUSED */ 568 /* ARGSUSED */
569 STATIC void 569 STATIC void
570 xfs_inode_item_unpin_remove( 570 xfs_inode_item_unpin_remove(
571 xfs_inode_log_item_t *iip, 571 xfs_inode_log_item_t *iip,
572 xfs_trans_t *tp) 572 xfs_trans_t *tp)
573 { 573 {
574 xfs_iunpin(iip->ili_inode); 574 xfs_iunpin(iip->ili_inode);
575 } 575 }
576 576
/*
 * This is called to attempt to lock the inode associated with this
 * inode log item, in preparation for the push routine which does the actual
 * iflush.  Don't sleep on the inode lock or the flush lock.
 *
 * If the flush lock is already held, indicating that the inode has
 * been or is in the process of being flushed, then (ideally) we'd like to
 * see if the inode's buffer is still incore, and if so give it a nudge.
 * We delay doing so until the pushbuf routine, though, to avoid holding
 * the AIL lock across a call to the blackhole which is the buffer cache.
 * Also we don't want to sleep in any device strategy routines, which can happen
 * if we do the subsequent bawrite in here.
 *
 * Returns one of XFS_ITEM_PINNED, XFS_ITEM_LOCKED, XFS_ITEM_PUSHBUF,
 * XFS_ITEM_FLUSHING or XFS_ITEM_SUCCESS; on XFS_ITEM_SUCCESS and
 * XFS_ITEM_PUSHBUF the inode is left locked shared (and on success also
 * flush-locked) for the caller.
 */
STATIC uint
xfs_inode_item_trylock(
	xfs_inode_log_item_t	*iip)
{
	register xfs_inode_t	*ip;

	ip = iip->ili_inode;

	/* A pinned inode cannot be flushed until the log is forced. */
	if (xfs_ipincount(ip) > 0) {
		return XFS_ITEM_PINNED;
	}

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		return XFS_ITEM_LOCKED;
	}

	if (!xfs_iflock_nowait(ip)) {
		/*
		 * If someone else isn't already trying to push the inode
		 * buffer, we get to do it.
		 */
		if (iip->ili_pushbuf_flag == 0) {
			iip->ili_pushbuf_flag = 1;
#ifdef DEBUG
			iip->ili_push_owner = current_pid();
#endif
			/*
			 * Inode is left locked in shared mode.
			 * Pushbuf routine gets to unlock it.
			 */
			return XFS_ITEM_PUSHBUF;
		} else {
			/*
			 * We hold the AIL lock, so we must specify the
			 * NONOTIFY flag so that we won't double trip.
			 */
			xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
			return XFS_ITEM_FLUSHING;
		}
		/* NOTREACHED */
	}

	/* Stale items should force out the iclog */
	if (ip->i_flags & XFS_ISTALE) {
		/* Drop both the flush lock and the inode lock again. */
		xfs_ifunlock(ip);
		xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
		return XFS_ITEM_PINNED;
	}

#ifdef DEBUG
	/*
	 * Unless the filesystem is shutting down, an inode we could
	 * flush-lock from the AIL must be dirty and not yet written.
	 */
	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ASSERT(iip->ili_format.ilf_fields != 0);
		ASSERT(iip->ili_logged == 0);
		ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
	}
#endif
	return XFS_ITEM_SUCCESS;
}
648 648
649 /* 649 /*
650 * Unlock the inode associated with the inode log item. 650 * Unlock the inode associated with the inode log item.
651 * Clear the fields of the inode and inode log item that 651 * Clear the fields of the inode and inode log item that
652 * are specific to the current transaction. If the 652 * are specific to the current transaction. If the
653 * hold flags is set, do not unlock the inode. 653 * hold flags is set, do not unlock the inode.
654 */ 654 */
655 STATIC void 655 STATIC void
656 xfs_inode_item_unlock( 656 xfs_inode_item_unlock(
657 xfs_inode_log_item_t *iip) 657 xfs_inode_log_item_t *iip)
658 { 658 {
659 uint hold; 659 uint hold;
660 uint iolocked; 660 uint iolocked;
661 uint lock_flags; 661 uint lock_flags;
662 xfs_inode_t *ip; 662 xfs_inode_t *ip;
663 663
664 ASSERT(iip != NULL); 664 ASSERT(iip != NULL);
665 ASSERT(iip->ili_inode->i_itemp != NULL); 665 ASSERT(iip->ili_inode->i_itemp != NULL);
666 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 666 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
667 ASSERT((!(iip->ili_inode->i_itemp->ili_flags & 667 ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
668 XFS_ILI_IOLOCKED_EXCL)) || 668 XFS_ILI_IOLOCKED_EXCL)) ||
669 xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL)); 669 xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL));
670 ASSERT((!(iip->ili_inode->i_itemp->ili_flags & 670 ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
671 XFS_ILI_IOLOCKED_SHARED)) || 671 XFS_ILI_IOLOCKED_SHARED)) ||
672 xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED)); 672 xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED));
673 /* 673 /*
674 * Clear the transaction pointer in the inode. 674 * Clear the transaction pointer in the inode.
675 */ 675 */
676 ip = iip->ili_inode; 676 ip = iip->ili_inode;
677 ip->i_transp = NULL; 677 ip->i_transp = NULL;
678 678
679 /* 679 /*
680 * If the inode needed a separate buffer with which to log 680 * If the inode needed a separate buffer with which to log
681 * its extents, then free it now. 681 * its extents, then free it now.
682 */ 682 */
683 if (iip->ili_extents_buf != NULL) { 683 if (iip->ili_extents_buf != NULL) {
684 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); 684 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
685 ASSERT(ip->i_d.di_nextents > 0); 685 ASSERT(ip->i_d.di_nextents > 0);
686 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); 686 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
687 ASSERT(ip->i_df.if_bytes > 0); 687 ASSERT(ip->i_df.if_bytes > 0);
688 kmem_free(iip->ili_extents_buf); 688 kmem_free(iip->ili_extents_buf);
689 iip->ili_extents_buf = NULL; 689 iip->ili_extents_buf = NULL;
690 } 690 }
691 if (iip->ili_aextents_buf != NULL) { 691 if (iip->ili_aextents_buf != NULL) {
692 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); 692 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
693 ASSERT(ip->i_d.di_anextents > 0); 693 ASSERT(ip->i_d.di_anextents > 0);
694 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); 694 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
695 ASSERT(ip->i_afp->if_bytes > 0); 695 ASSERT(ip->i_afp->if_bytes > 0);
696 kmem_free(iip->ili_aextents_buf); 696 kmem_free(iip->ili_aextents_buf);
697 iip->ili_aextents_buf = NULL; 697 iip->ili_aextents_buf = NULL;
698 } 698 }
699 699
700 /* 700 /*
701 * Figure out if we should unlock the inode or not. 701 * Figure out if we should unlock the inode or not.
702 */ 702 */
703 hold = iip->ili_flags & XFS_ILI_HOLD; 703 hold = iip->ili_flags & XFS_ILI_HOLD;
704 704
705 /* 705 /*
706 * Before clearing out the flags, remember whether we 706 * Before clearing out the flags, remember whether we
707 * are holding the inode's IO lock. 707 * are holding the inode's IO lock.
708 */ 708 */
709 iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY; 709 iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
710 710
711 /* 711 /*
712 * Clear out the fields of the inode log item particular 712 * Clear out the fields of the inode log item particular
713 * to the current transaction. 713 * to the current transaction.
714 */ 714 */
715 iip->ili_ilock_recur = 0; 715 iip->ili_ilock_recur = 0;
716 iip->ili_iolock_recur = 0; 716 iip->ili_iolock_recur = 0;
717 iip->ili_flags = 0; 717 iip->ili_flags = 0;
718 718
719 /* 719 /*
720 * Unlock the inode if XFS_ILI_HOLD was not set. 720 * Unlock the inode if XFS_ILI_HOLD was not set.
721 */ 721 */
722 if (!hold) { 722 if (!hold) {
723 lock_flags = XFS_ILOCK_EXCL; 723 lock_flags = XFS_ILOCK_EXCL;
724 if (iolocked & XFS_ILI_IOLOCKED_EXCL) { 724 if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
725 lock_flags |= XFS_IOLOCK_EXCL; 725 lock_flags |= XFS_IOLOCK_EXCL;
726 } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) { 726 } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
727 lock_flags |= XFS_IOLOCK_SHARED; 727 lock_flags |= XFS_IOLOCK_SHARED;
728 } 728 }
729 xfs_iput(iip->ili_inode, lock_flags); 729 xfs_iput(iip->ili_inode, lock_flags);
730 } 730 }
731 } 731 }
732 732
733 /* 733 /*
734 * This is called to find out where the oldest active copy of the 734 * This is called to find out where the oldest active copy of the
735 * inode log item in the on disk log resides now that the last log 735 * inode log item in the on disk log resides now that the last log
736 * write of it completed at the given lsn. Since we always re-log 736 * write of it completed at the given lsn. Since we always re-log
737 * all dirty data in an inode, the latest copy in the on disk log 737 * all dirty data in an inode, the latest copy in the on disk log
738 * is the only one that matters. Therefore, simply return the 738 * is the only one that matters. Therefore, simply return the
739 * given lsn. 739 * given lsn.
740 */ 740 */
741 /*ARGSUSED*/ 741 /*ARGSUSED*/
742 STATIC xfs_lsn_t 742 STATIC xfs_lsn_t
743 xfs_inode_item_committed( 743 xfs_inode_item_committed(
744 xfs_inode_log_item_t *iip, 744 xfs_inode_log_item_t *iip,
745 xfs_lsn_t lsn) 745 xfs_lsn_t lsn)
746 { 746 {
747 return (lsn); 747 return (lsn);
748 } 748 }
749 749
/*
 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
 * failed to get the inode flush lock but did get the inode locked SHARED.
 * Here we're trying to see if the inode buffer is incore, and if so whether it's
 * marked delayed write. If that's the case, we'll initiate a bawrite on that
 * buffer to expedite the process.
 *
 * We aren't holding the AIL lock (or the flush lock) when this gets called,
 * so it is inherently race-y.
 *
 * On return the shared inode lock has been dropped and ili_pushbuf_flag
 * cleared, on every path.
 */
STATIC void
xfs_inode_item_pushbuf(
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t	*ip;
	xfs_mount_t	*mp;
	xfs_buf_t	*bp;
	uint		dopush;

	ip = iip->ili_inode;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));

	/*
	 * The ili_pushbuf_flag keeps others from
	 * trying to duplicate our effort.
	 */
	ASSERT(iip->ili_pushbuf_flag != 0);
	ASSERT(iip->ili_push_owner == current_pid());

	/*
	 * If a flush is not in progress anymore, chances are that the
	 * inode was taken off the AIL. So, just get out.
	 */
	if (completion_done(&ip->i_flush) ||
	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
		iip->ili_pushbuf_flag = 0;
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return;
	}

	/*
	 * Try to grab the inode's backing buffer without sleeping, using
	 * the location recorded in the log item format structure.
	 */
	mp = ip->i_mount;
	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
		    iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);

	if (bp != NULL) {
		if (XFS_BUF_ISDELAYWRITE(bp)) {
			/*
			 * We were racing with iflush because we don't hold
			 * the AIL lock or the flush lock. However, at this point,
			 * we have the buffer, and we know that it's dirty.
			 * So, it's possible that iflush raced with us, and
			 * this item is already taken off the AIL.
			 * If not, we can flush it async.
			 */
			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
				  !completion_done(&ip->i_flush));
			iip->ili_pushbuf_flag = 0;
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			xfs_buftrace("INODE ITEM PUSH", bp);
			/* A pinned buffer needs the log forced first. */
			if (XFS_BUF_ISPINNED(bp)) {
				xfs_log_force(mp, (xfs_lsn_t)0,
					      XFS_LOG_FORCE);
			}
			if (dopush) {
				int	error;
				error = xfs_bawrite(mp, bp);
				if (error)
					xfs_fs_cmn_err(CE_WARN, mp,
		"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
							error, iip, bp);
			} else {
				/* Item already off the AIL: just drop the buffer. */
				xfs_buf_relse(bp);
			}
		} else {
			/* Buffer is clean; nothing to push. */
			iip->ili_pushbuf_flag = 0;
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			xfs_buf_relse(bp);
		}
		return;
	}
	/*
	 * We have to be careful about resetting pushbuf flag too early (above).
	 * Even though in theory we can do it as soon as we have the buflock,
	 * we don't want others to be doing work needlessly. They'll come to
	 * this function thinking that pushing the buffer is their
	 * responsibility only to find that the buffer is still locked by
	 * another doing the same thing
	 */
	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return;
}
843 843
844 844
845 /* 845 /*
846 * This is called to asynchronously write the inode associated with this 846 * This is called to asynchronously write the inode associated with this
847 * inode log item out to disk. The inode will already have been locked by 847 * inode log item out to disk. The inode will already have been locked by
848 * a successful call to xfs_inode_item_trylock(). 848 * a successful call to xfs_inode_item_trylock().
849 */ 849 */
850 STATIC void 850 STATIC void
851 xfs_inode_item_push( 851 xfs_inode_item_push(
852 xfs_inode_log_item_t *iip) 852 xfs_inode_log_item_t *iip)
853 { 853 {
854 xfs_inode_t *ip; 854 xfs_inode_t *ip;
855 855
856 ip = iip->ili_inode; 856 ip = iip->ili_inode;
857 857
858 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 858 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
859 ASSERT(!completion_done(&ip->i_flush)); 859 ASSERT(!completion_done(&ip->i_flush));
860 /* 860 /*
861 * Since we were able to lock the inode's flush lock and 861 * Since we were able to lock the inode's flush lock and
862 * we found it on the AIL, the inode must be dirty. This 862 * we found it on the AIL, the inode must be dirty. This
863 * is because the inode is removed from the AIL while still 863 * is because the inode is removed from the AIL while still
864 * holding the flush lock in xfs_iflush_done(). Thus, if 864 * holding the flush lock in xfs_iflush_done(). Thus, if
865 * we found it in the AIL and were able to obtain the flush 865 * we found it in the AIL and were able to obtain the flush
866 * lock without sleeping, then there must not have been 866 * lock without sleeping, then there must not have been
867 * anyone in the process of flushing the inode. 867 * anyone in the process of flushing the inode.
868 */ 868 */
869 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || 869 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
870 iip->ili_format.ilf_fields != 0); 870 iip->ili_format.ilf_fields != 0);
871 871
872 /* 872 /*
873 * Write out the inode. The completion routine ('iflush_done') will 873 * Write out the inode. The completion routine ('iflush_done') will
874 * pull it from the AIL, mark it clean, unlock the flush lock. 874 * pull it from the AIL, mark it clean, unlock the flush lock.
875 */ 875 */
876 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); 876 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
877 xfs_iunlock(ip, XFS_ILOCK_SHARED); 877 xfs_iunlock(ip, XFS_ILOCK_SHARED);
878 878
879 return; 879 return;
880 } 880 }
881 881
882 /* 882 /*
883 * XXX rcc - this one really has to do something. Probably needs 883 * XXX rcc - this one really has to do something. Probably needs
884 * to stamp in a new field in the incore inode. 884 * to stamp in a new field in the incore inode.
885 */ 885 */
886 /* ARGSUSED */ 886 /* ARGSUSED */
887 STATIC void 887 STATIC void
888 xfs_inode_item_committing( 888 xfs_inode_item_committing(
889 xfs_inode_log_item_t *iip, 889 xfs_inode_log_item_t *iip,
890 xfs_lsn_t lsn) 890 xfs_lsn_t lsn)
891 { 891 {
892 iip->ili_last_lsn = lsn; 892 iip->ili_last_lsn = lsn;
893 return; 893 return;
894 } 894 }
895 895
/*
 * This is the ops vector shared by all inode log items.
 */
static struct xfs_item_ops xfs_inode_item_ops = {
	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
					xfs_inode_item_format,
	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin,
	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
					xfs_inode_item_unpin_remove,
	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
					xfs_inode_item_committed,
	.iop_push	= (void(*)(xfs_log_item_t*))xfs_inode_item_push,
	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
					xfs_inode_item_committing
};
916 916
917 917
/*
 * Initialize the inode log item for a newly allocated (in-core) inode.
 */
void
xfs_inode_item_init(
	xfs_inode_t	*ip,
	xfs_mount_t	*mp)
{
	xfs_inode_log_item_t *iip;

	ASSERT(ip->i_itemp == NULL);
	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);

	iip->ili_item.li_type = XFS_LI_INODE;
	iip->ili_item.li_ops = &xfs_inode_item_ops;
	iip->ili_item.li_mountp = mp;
	iip->ili_item.li_ailp = mp->m_ail;
	iip->ili_inode = ip;

	/*
	 * kmem_zone_zalloc() returned zeroed memory, so ili_extents_buf
	 * and ili_pushbuf_flag need no explicit initialization:
	 * iip->ili_extents_buf = NULL;
	 * iip->ili_pushbuf_flag = 0;
	 */

	/* Seed the log format from the inode's on-disk location (i_imap). */
	iip->ili_format.ilf_type = XFS_LI_INODE;
	iip->ili_format.ilf_ino = ip->i_ino;
	iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
	iip->ili_format.ilf_len = ip->i_imap.im_len;
	iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
}
949 949
950 /* 950 /*
951 * Free the inode log item and any memory hanging off of it. 951 * Free the inode log item and any memory hanging off of it.
952 */ 952 */
953 void 953 void
954 xfs_inode_item_destroy( 954 xfs_inode_item_destroy(
955 xfs_inode_t *ip) 955 xfs_inode_t *ip)
956 { 956 {
957 #ifdef XFS_TRANS_DEBUG 957 #ifdef XFS_TRANS_DEBUG
958 if (ip->i_itemp->ili_root_size != 0) { 958 if (ip->i_itemp->ili_root_size != 0) {
959 kmem_free(ip->i_itemp->ili_orig_root); 959 kmem_free(ip->i_itemp->ili_orig_root);
960 } 960 }
961 #endif 961 #endif
962 kmem_zone_free(xfs_ili_zone, ip->i_itemp); 962 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
963 } 963 }
964 964
965 965
/*
 * This is the inode flushing I/O completion routine.  It is called
 * from interrupt level when the buffer containing the inode is
 * flushed to disk.  It is responsible for removing the inode item
 * from the AIL if it has not been re-logged, and unlocking the inode's
 * flush lock.
 */
/*ARGSUSED*/
void
xfs_iflush_done(
	xfs_buf_t		*bp,
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t		*ip = iip->ili_inode;
	struct xfs_ail		*ailp = iip->ili_item.li_ailp;

	/*
	 * We only want to pull the item from the AIL if it is
	 * actually there and its location in the log has not
	 * changed since we started the flush.  Thus, we only bother
	 * if the ili_logged flag is set and the inode's lsn has not
	 * changed.  First we check the lsn outside
	 * the lock since it's cheaper, and then we recheck while
	 * holding the lock before removing the inode from the AIL.
	 */
	if (iip->ili_logged &&
	    (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
		spin_lock(&ailp->xa_lock);
		if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
			/* xfs_trans_ail_delete() drops the AIL lock. */
			xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
		} else {
			spin_unlock(&ailp->xa_lock);
		}
	}

	/* The flush we started is complete, whether or not we relogged. */
	iip->ili_logged = 0;

	/*
	 * Clear the ili_last_fields bits now that we know that the
	 * data corresponding to them is safely on disk.
	 */
	iip->ili_last_fields = 0;

	/*
	 * Release the inode's flush lock since we're done with it.
	 */
	xfs_ifunlock(ip);

	return;
}
1017 1017
1018 /* 1018 /*
1019 * This is the inode flushing abort routine. It is called 1019 * This is the inode flushing abort routine. It is called
1020 * from xfs_iflush when the filesystem is shutting down to clean 1020 * from xfs_iflush when the filesystem is shutting down to clean
1021 * up the inode state. 1021 * up the inode state.
1022 * It is responsible for removing the inode item 1022 * It is responsible for removing the inode item
1023 * from the AIL if it has not been re-logged, and unlocking the inode's 1023 * from the AIL if it has not been re-logged, and unlocking the inode's
1024 * flush lock. 1024 * flush lock.
1025 */ 1025 */
1026 void 1026 void
1027 xfs_iflush_abort( 1027 xfs_iflush_abort(
1028 xfs_inode_t *ip) 1028 xfs_inode_t *ip)
1029 { 1029 {
1030 xfs_inode_log_item_t *iip = ip->i_itemp; 1030 xfs_inode_log_item_t *iip = ip->i_itemp;
1031 xfs_mount_t *mp; 1031 xfs_mount_t *mp;
1032 1032
1033 iip = ip->i_itemp; 1033 iip = ip->i_itemp;
1034 mp = ip->i_mount; 1034 mp = ip->i_mount;
1035 if (iip) { 1035 if (iip) {
1036 struct xfs_ail *ailp = iip->ili_item.li_ailp; 1036 struct xfs_ail *ailp = iip->ili_item.li_ailp;
1037 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1037 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1038 spin_lock(&ailp->xa_lock); 1038 spin_lock(&ailp->xa_lock);
1039 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1039 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1040 /* xfs_trans_ail_delete() drops the AIL lock. */ 1040 /* xfs_trans_ail_delete() drops the AIL lock. */
1041 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); 1041 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
1042 } else 1042 } else
1043 spin_unlock(&ailp->xa_lock); 1043 spin_unlock(&ailp->xa_lock);
1044 } 1044 }
1045 iip->ili_logged = 0; 1045 iip->ili_logged = 0;
1046 /* 1046 /*
1047 * Clear the ili_last_fields bits now that we know that the 1047 * Clear the ili_last_fields bits now that we know that the
1048 * data corresponding to them is safely on disk. 1048 * data corresponding to them is safely on disk.
1049 */ 1049 */
1050 iip->ili_last_fields = 0; 1050 iip->ili_last_fields = 0;
1051 /* 1051 /*
1052 * Clear the inode logging fields so no more flushes are 1052 * Clear the inode logging fields so no more flushes are
1053 * attempted. 1053 * attempted.
1054 */ 1054 */
1055 iip->ili_format.ilf_fields = 0; 1055 iip->ili_format.ilf_fields = 0;
1056 } 1056 }
1057 /* 1057 /*
1058 * Release the inode's flush lock since we're done with it. 1058 * Release the inode's flush lock since we're done with it.
1059 */ 1059 */
1060 xfs_ifunlock(ip); 1060 xfs_ifunlock(ip);
1061 } 1061 }
1062 1062
/*
 * I/O completion for a stale inode buffer: reuse the flush-abort path
 * to clear the item's logging state and release the inode's flush lock.
 */
void
xfs_istale_done(
	xfs_buf_t		*bp,
	xfs_inode_log_item_t	*iip)
{
	xfs_iflush_abort(iip->ili_inode);
}
1070 1070
1071 /* 1071 /*
1072 * convert an xfs_inode_log_format struct from either 32 or 64 bit versions 1072 * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
1073 * (which can have different field alignments) to the native version 1073 * (which can have different field alignments) to the native version
1074 */ 1074 */
1075 int 1075 int
1076 xfs_inode_item_format_convert( 1076 xfs_inode_item_format_convert(
1077 xfs_log_iovec_t *buf, 1077 xfs_log_iovec_t *buf,
1078 xfs_inode_log_format_t *in_f) 1078 xfs_inode_log_format_t *in_f)
1079 { 1079 {
1080 if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { 1080 if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
1081 xfs_inode_log_format_32_t *in_f32; 1081 xfs_inode_log_format_32_t *in_f32;
1082 1082
1083 in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr; 1083 in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
1084 in_f->ilf_type = in_f32->ilf_type; 1084 in_f->ilf_type = in_f32->ilf_type;
1085 in_f->ilf_size = in_f32->ilf_size; 1085 in_f->ilf_size = in_f32->ilf_size;
1086 in_f->ilf_fields = in_f32->ilf_fields; 1086 in_f->ilf_fields = in_f32->ilf_fields;
1087 in_f->ilf_asize = in_f32->ilf_asize; 1087 in_f->ilf_asize = in_f32->ilf_asize;
1088 in_f->ilf_dsize = in_f32->ilf_dsize; 1088 in_f->ilf_dsize = in_f32->ilf_dsize;
1089 in_f->ilf_ino = in_f32->ilf_ino; 1089 in_f->ilf_ino = in_f32->ilf_ino;
1090 /* copy biggest field of ilf_u */ 1090 /* copy biggest field of ilf_u */
1091 memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, 1091 memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
1092 in_f32->ilf_u.ilfu_uuid.__u_bits, 1092 in_f32->ilf_u.ilfu_uuid.__u_bits,
1093 sizeof(uuid_t)); 1093 sizeof(uuid_t));
1094 in_f->ilf_blkno = in_f32->ilf_blkno; 1094 in_f->ilf_blkno = in_f32->ilf_blkno;
1095 in_f->ilf_len = in_f32->ilf_len; 1095 in_f->ilf_len = in_f32->ilf_len;
1096 in_f->ilf_boffset = in_f32->ilf_boffset; 1096 in_f->ilf_boffset = in_f32->ilf_boffset;
1097 return 0; 1097 return 0;
1098 } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ 1098 } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
1099 xfs_inode_log_format_64_t *in_f64; 1099 xfs_inode_log_format_64_t *in_f64;
1100 1100
1101 in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr; 1101 in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
1102 in_f->ilf_type = in_f64->ilf_type; 1102 in_f->ilf_type = in_f64->ilf_type;
1103 in_f->ilf_size = in_f64->ilf_size; 1103 in_f->ilf_size = in_f64->ilf_size;
1104 in_f->ilf_fields = in_f64->ilf_fields; 1104 in_f->ilf_fields = in_f64->ilf_fields;
1105 in_f->ilf_asize = in_f64->ilf_asize; 1105 in_f->ilf_asize = in_f64->ilf_asize;
1106 in_f->ilf_dsize = in_f64->ilf_dsize; 1106 in_f->ilf_dsize = in_f64->ilf_dsize;
1107 in_f->ilf_ino = in_f64->ilf_ino; 1107 in_f->ilf_ino = in_f64->ilf_ino;
1108 /* copy biggest field of ilf_u */ 1108 /* copy biggest field of ilf_u */
1109 memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, 1109 memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
1110 in_f64->ilf_u.ilfu_uuid.__u_bits, 1110 in_f64->ilf_u.ilfu_uuid.__u_bits,
1111 sizeof(uuid_t)); 1111 sizeof(uuid_t));
1112 in_f->ilf_blkno = in_f64->ilf_blkno; 1112 in_f->ilf_blkno = in_f64->ilf_blkno;
1113 in_f->ilf_len = in_f64->ilf_len; 1113 in_f->ilf_len = in_f64->ilf_len;
1114 in_f->ilf_boffset = in_f64->ilf_boffset; 1114 in_f->ilf_boffset = in_f64->ilf_boffset;
1115 return 0; 1115 return 0;
1116 } 1116 }
1117 return EFSCORRUPTED; 1117 return EFSCORRUPTED;
1118 } 1118 }
1119 1119
1 /* 1 /*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dir2.h" 27 #include "xfs_dir2.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_mount.h" 29 #include "xfs_mount.h"
30 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_btree.h"
31 #include "xfs_alloc_btree.h" 31 #include "xfs_alloc_btree.h"
32 #include "xfs_ialloc_btree.h" 32 #include "xfs_ialloc_btree.h"
33 #include "xfs_dir2_sf.h" 33 #include "xfs_dir2_sf.h"
34 #include "xfs_attr_sf.h" 34 #include "xfs_attr_sf.h"
35 #include "xfs_dinode.h" 35 #include "xfs_dinode.h"
36 #include "xfs_inode.h" 36 #include "xfs_inode.h"
37 #include "xfs_ialloc.h" 37 #include "xfs_ialloc.h"
38 #include "xfs_itable.h" 38 #include "xfs_itable.h"
39 #include "xfs_error.h" 39 #include "xfs_error.h"
40 #include "xfs_btree.h" 40 #include "xfs_btree.h"
41 41
42 int 42 int
43 xfs_internal_inum( 43 xfs_internal_inum(
44 xfs_mount_t *mp, 44 xfs_mount_t *mp,
45 xfs_ino_t ino) 45 xfs_ino_t ino)
46 { 46 {
47 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || 47 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
48 (xfs_sb_version_hasquota(&mp->m_sb) && 48 (xfs_sb_version_hasquota(&mp->m_sb) &&
49 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); 49 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
50 } 50 }
51 51
/*
 * Stat one inode via xfs_iget (the BULKSTAT_FG_IGET path): lock the
 * in-core inode shared, copy its core fields into *buf, and release it.
 * On failure *stat is set to BULKSTAT_RV_NOTHING and the error returned.
 */
STATIC int
xfs_bulkstat_one_iget(
	xfs_mount_t	*mp,		/* mount point for filesystem */
	xfs_ino_t	ino,		/* inode number to get data for */
	xfs_daddr_t	bno,		/* starting bno of inode cluster */
	xfs_bstat_t	*buf,		/* return buffer */
	int		*stat)		/* BULKSTAT_RV_... */
{
	xfs_icdinode_t	*dic;	/* dinode core info pointer */
	xfs_inode_t	*ip;	/* incore inode pointer */
	int		error;

	error = xfs_iget(mp, NULL, ino,
			 XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
	if (error) {
		*stat = BULKSTAT_RV_NOTHING;
		return error;
	}

	ASSERT(ip != NULL);
	/* xfs_iget must have filled in the inode's buffer mapping. */
	ASSERT(ip->i_imap.im_blkno != 0);

	dic = &ip->i_d;

	/* xfs_iget returns the following without needing
	 * further change.
	 */
	buf->bs_nlink = dic->di_nlink;
	buf->bs_projid = dic->di_projid;
	buf->bs_ino = ino;
	buf->bs_mode = dic->di_mode;
	buf->bs_uid = dic->di_uid;
	buf->bs_gid = dic->di_gid;
	buf->bs_size = dic->di_size;
	vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
	buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
	buf->bs_xflags = xfs_ip2xflags(ip);
	buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
	buf->bs_extents = dic->di_nextents;
	buf->bs_gen = dic->di_gen;
	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
	buf->bs_dmevmask = dic->di_dmevmask;
	buf->bs_dmstate = dic->di_dmstate;
	buf->bs_aextents = dic->di_anextents;

	/* Block-size/count fields depend on the inode's data fork format. */
	switch (dic->di_format) {
	case XFS_DINODE_FMT_DEV:
		buf->bs_rdev = ip->i_df.if_u2.if_rdev;
		buf->bs_blksize = BLKDEV_IOSIZE;
		buf->bs_blocks = 0;
		break;
	case XFS_DINODE_FMT_LOCAL:
	case XFS_DINODE_FMT_UUID:
		buf->bs_rdev = 0;
		buf->bs_blksize = mp->m_sb.sb_blocksize;
		buf->bs_blocks = 0;
		break;
	case XFS_DINODE_FMT_EXTENTS:
	case XFS_DINODE_FMT_BTREE:
		buf->bs_rdev = 0;
		buf->bs_blksize = mp->m_sb.sb_blocksize;
		/* include delalloc blocks not yet converted to real extents */
		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
		break;
	}

	xfs_iput(ip, XFS_ILOCK_SHARED);
	return error;
}
123 123
/*
 * Stat one inode straight from its on-disk dinode image (no iget):
 * convert the big-endian dinode fields into the native-endian *buf.
 */
STATIC void
xfs_bulkstat_one_dinode(
	xfs_mount_t	*mp,		/* mount point for filesystem */
	xfs_ino_t	ino,		/* inode number to get data for */
	xfs_dinode_t	*dic,		/* dinode inode pointer */
	xfs_bstat_t	*buf)		/* return buffer */
{
	/*
	 * The inode format changed when we moved the link count and
	 * made it 32 bits long.  If this is an old format inode,
	 * convert it in memory to look like a new one.  If it gets
	 * flushed to disk we will convert back before flushing or
	 * logging it.  We zero out the new projid field and the old link
	 * count field.  We'll handle clearing the pad field (the remains
	 * of the old uuid field) when we actually convert the inode to
	 * the new format.  We don't change the version number so that we
	 * can distinguish this from a real new format inode.
	 */
	if (dic->di_version == 1) {
		buf->bs_nlink = be16_to_cpu(dic->di_onlink);
		buf->bs_projid = 0;
	} else {
		buf->bs_nlink = be32_to_cpu(dic->di_nlink);
		buf->bs_projid = be16_to_cpu(dic->di_projid);
	}

	buf->bs_ino = ino;
	buf->bs_mode = be16_to_cpu(dic->di_mode);
	buf->bs_uid = be32_to_cpu(dic->di_uid);
	buf->bs_gid = be32_to_cpu(dic->di_gid);
	buf->bs_size = be64_to_cpu(dic->di_size);
	buf->bs_atime.tv_sec = be32_to_cpu(dic->di_atime.t_sec);
	buf->bs_atime.tv_nsec = be32_to_cpu(dic->di_atime.t_nsec);
	buf->bs_mtime.tv_sec = be32_to_cpu(dic->di_mtime.t_sec);
	buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
	buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
	buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
	buf->bs_xflags = xfs_dic2xflags(dic);
	buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
	buf->bs_extents = be32_to_cpu(dic->di_nextents);
	buf->bs_gen = be32_to_cpu(dic->di_gen);
	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
	buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
	buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
	buf->bs_aextents = be16_to_cpu(dic->di_anextents);

	/* Block-size/count fields depend on the inode's data fork format. */
	switch (dic->di_format) {
	case XFS_DINODE_FMT_DEV:
		buf->bs_rdev = xfs_dinode_get_rdev(dic);
		buf->bs_blksize = BLKDEV_IOSIZE;
		buf->bs_blocks = 0;
		break;
	case XFS_DINODE_FMT_LOCAL:
	case XFS_DINODE_FMT_UUID:
		buf->bs_rdev = 0;
		buf->bs_blksize = mp->m_sb.sb_blocksize;
		buf->bs_blocks = 0;
		break;
	case XFS_DINODE_FMT_EXTENTS:
	case XFS_DINODE_FMT_BTREE:
		buf->bs_rdev = 0;
		buf->bs_blksize = mp->m_sb.sb_blocksize;
		buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
		break;
	}
}
190 190
191 STATIC int 191 STATIC int
192 xfs_bulkstat_one_fmt( 192 xfs_bulkstat_one_fmt(
193 void __user *ubuffer, 193 void __user *ubuffer,
194 const xfs_bstat_t *buffer) 194 const xfs_bstat_t *buffer)
195 { 195 {
196 if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) 196 if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
197 return -EFAULT; 197 return -EFAULT;
198 return sizeof(*buffer); 198 return sizeof(*buffer);
199 } 199 }
200 200
/*
 * Return stat information for one inode.
 * Return 0 if ok, else errno.
 *
 * Dispatches to the iget path when no on-disk dinode buffer is supplied
 * (dibuff == NULL, i.e. BULKSTAT_FG_IGET), otherwise decodes the dinode
 * directly, then hands the result to the caller-supplied formatter.
 */
int		        	/* error status */
xfs_bulkstat_one(
	xfs_mount_t	*mp,		/* mount point for filesystem */
	xfs_ino_t	ino,		/* inode number to get data for */
	void		__user *buffer,	/* buffer to place output in */
	int		ubsize,		/* size of buffer */
	void		*private_data,	/* my private data */
	xfs_daddr_t	bno,		/* starting bno of inode cluster */
	int		*ubused,	/* bytes used by me */
	void		*dibuff,	/* on-disk inode buffer */
	int		*stat)		/* BULKSTAT_RV_... */
{
	xfs_bstat_t	*buf;		/* return buffer */
	int		error = 0;	/* error value */
	xfs_dinode_t	*dip;		/* dinode inode pointer */
	/* fall back to the default copy-to-user formatter when none given */
	bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt;

	dip = (xfs_dinode_t *)dibuff;
	*stat = BULKSTAT_RV_NOTHING;

	/* internal inodes (rt bitmap/summary, quota) are never reported */
	if (!buffer || xfs_internal_inum(mp, ino))
		return XFS_ERROR(EINVAL);
	if (ubsize < sizeof(*buf))
		return XFS_ERROR(ENOMEM);

	buf = kmem_alloc(sizeof(*buf), KM_SLEEP);

	if (dip == NULL) {
		/* We're not being passed a pointer to a dinode.  This happens
		 * if BULKSTAT_FG_IGET is selected.  Do the iget.
		 */
		error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat);
		if (error)
			goto out_free;
	} else {
		xfs_bulkstat_one_dinode(mp, ino, dip, buf);
	}

	/* formatter returns bytes written (>= 0) or a negative error */
	error = formatter(buffer, buf);
	if (error < 0) {
		error = EFAULT;
		goto out_free;
	}

	*stat = BULKSTAT_RV_DIDONE;
	if (ubused)
		*ubused = error;

 out_free:
	kmem_free(buf);
	return error;
}
257 257
/*
 * Test to see whether we can use the ondisk inode directly, based
 * on the given bulkstat flags, filling in dipp accordingly.
 * Returns zero if the inode is dodgey.
 *
 * Outcomes (return value / *dipp):
 *   0 / NULL     - on-disk inode fails the sanity checks; skip it
 *   1 / non-NULL - on-disk inode may be used directly by the caller
 *   1 / NULL     - caller must fall back to iget: either no cluster
 *                  buffer was supplied, BULKSTAT_FG_IGET was requested,
 *                  or the attr fork is not usable inline
 */
STATIC int
xfs_bulkstat_use_dinode(
	xfs_mount_t	*mp,
	int		flags,
	xfs_buf_t	*bp,
	int		clustidx,
	xfs_dinode_t	**dipp)
{
	xfs_dinode_t	*dip;
	unsigned int	aformat;

	*dipp = NULL;
	/* no buffer, or explicit iget requested: tell caller to iget */
	if (!bp || (flags & BULKSTAT_FG_IGET))
		return 1;
	/* locate this inode's slot inside the cluster buffer */
	dip = (xfs_dinode_t *)
		xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog);
	/*
	 * Check the buffer containing the on-disk inode for di_mode == 0.
	 * This is to prevent xfs_bulkstat from picking up just reclaimed
	 * inodes that have their in-core state initialized but not flushed
	 * to disk yet. This is a temporary hack that would require a proper
	 * fix in the future.
	 */
	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
	    !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
	    !dip->di_mode)
		return 0;
	/* BULKSTAT_FG_QUICK always trusts the on-disk copy once it's sane */
	if (flags & BULKSTAT_FG_QUICK) {
		*dipp = dip;
		return 1;
	}
	/* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
	aformat = dip->di_aformat;
	if ((XFS_DFORK_Q(dip) == 0) ||
	    (aformat == XFS_DINODE_FMT_LOCAL) ||
	    (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
		*dipp = dip;
		return 1;
	}
	/* attr fork not inline: leave *dipp NULL so the caller igets */
	return 1;
}
304 304
/*
 * True if the user buffer still has room for at least one more stat
 * structure; relies on statstruct_size being in scope at the expansion
 * site (it is a parameter of xfs_bulkstat).
 */
#define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
306 306
/*
 * Return stat information in bulk (by-inode) for the filesystem.
 *
 * Walks the inode allocation btree of each allocation group starting at
 * *lastinop, batching chunk records into a scratch buffer (irbuf), then
 * formats every allocated inode through @formatter into @ubuffer.
 * *lastinop doubles as the resume cookie for the next call; *ubcountp
 * returns the number of stat structures written, and *done is set once
 * the whole filesystem has been covered.
 */
int					/* error status */
xfs_bulkstat(
	xfs_mount_t		*mp,	/* mount point for filesystem */
	xfs_ino_t		*lastinop, /* last inode returned */
	int			*ubcountp, /* size of buffer/count returned */
	bulkstat_one_pf		formatter, /* func that'd fill a single buf */
	void			*private_data,/* private data for formatter */
	size_t			statstruct_size, /* sizeof struct filling */
	char			__user *ubuffer, /* buffer with inode stats */
	int			flags,	/* defined in xfs_itable.h */
	int			*done)	/* 1 if there are more stats to get */
{
	xfs_agblock_t		agbno=0;/* allocation group block number */
	xfs_buf_t		*agbp;	/* agi header buffer */
	xfs_agi_t		*agi;	/* agi header data */
	xfs_agino_t		agino;	/* inode # in allocation group */
	xfs_agnumber_t		agno;	/* allocation group number */
	xfs_daddr_t		bno;	/* inode cluster start daddr */
	int			chunkidx; /* current index into inode chunk */
	int			clustidx; /* current index into inode cluster */
	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
	int			end_of_ag; /* set if we've seen the ag end */
	int			error;	/* error code */
	int			fmterror;/* bulkstat formatter result */
	__int32_t		gcnt;	/* current btree rec's count */
	xfs_inofree_t		gfree;	/* current btree rec's free mask */
	xfs_agino_t		gino;	/* current btree rec's start inode */
	int			i;	/* loop index */
	int			icount;	/* count of inodes good in irbuf */
	size_t			irbsize; /* size of irec buffer in bytes */
	xfs_ino_t		ino;	/* inode number (filesystem) */
	xfs_inobt_rec_incore_t	*irbp;	/* current irec buffer pointer */
	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
	xfs_ino_t		lastino; /* last inode number returned */
	int			nbcluster; /* # of blocks in a cluster */
	int			nicluster; /* # of inodes in a cluster */
	int			nimask;	/* mask for inode clusters */
	int			nirbuf;	/* size of irbuf */
	int			rval;	/* return value error code */
	int			tmp;	/* result value from btree calls */
	int			ubcount; /* size of user's buffer */
	int			ubleft;	/* bytes left in user's buffer */
	char			__user *ubufp;	/* pointer into user's buffer */
	int			ubelem;	/* spaces used in user's buffer */
	int			ubused;	/* bytes used by formatter */
	xfs_buf_t		*bp;	/* ptr to on-disk inode cluster buf */
	xfs_dinode_t		*dip;	/* ptr into bp for specific inode */

	/*
	 * Get the last inode value, see if there's nothing to do.
	 */
	ino = (xfs_ino_t)*lastinop;
	lastino = ino;
	dip = NULL;
	agno = XFS_INO_TO_AGNO(mp, ino);
	agino = XFS_INO_TO_AGINO(mp, ino);
	if (agno >= mp->m_sb.sb_agcount ||
	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
		*done = 1;
		*ubcountp = 0;
		return 0;
	}
	if (!ubcountp || *ubcountp <= 0) {
		return EINVAL;
	}
	ubcount = *ubcountp; /* statstruct's */
	ubleft = ubcount * statstruct_size; /* bytes */
	*ubcountp = ubelem = 0;
	*done = 0;
	fmterror = 0;
	ubufp = ubuffer;
	/* inodes per cluster buffer, and the matching chunk-index mask */
	nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
		mp->m_sb.sb_inopblock :
		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
	nimask = ~(nicluster - 1);
	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
	/*
	 * NOTE(review): irbuf is used without a NULL check below; with
	 * KM_MAYFAIL this allocation presumably cannot return NULL down to
	 * its PAGE_SIZE minimum — confirm against kmem_zalloc_greedy().
	 */
	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4,
				   KM_SLEEP | KM_MAYFAIL | KM_LARGE);
	nirbuf = irbsize / sizeof(*irbuf);

	/*
	 * Loop over the allocation groups, starting from the last
	 * inode returned; 0 means start of the allocation group.
	 */
	rval = 0;
	while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
		cond_resched();
		bp = NULL;
		down_read(&mp->m_peraglock);
		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
		up_read(&mp->m_peraglock);
		if (error) {
			/*
			 * Skip this allocation group and go to the next one.
			 */
			agno++;
			agino = 0;
			continue;
		}
		agi = XFS_BUF_TO_AGI(agbp);
		/*
		 * Allocate and initialize a btree cursor for ialloc btree.
		 */
		cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
		irbp = irbuf;
		irbufend = irbuf + nirbuf;
		end_of_ag = 0;
		/*
		 * If we're returning in the middle of an allocation group,
		 * we need to get the remainder of the chunk we're in.
		 */
		if (agino > 0) {
			/*
			 * Lookup the inode chunk that this inode lives in.
			 */
			error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
			if (!error &&	/* no I/O error */
			    tmp &&	/* lookup succeeded */
			    /* got the record, should always work */
			    !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
				    &gfree, &i)) &&
			    i == 1 &&
			    /* this is the right chunk */
			    agino < gino + XFS_INODES_PER_CHUNK &&
			    /* lastino was not last in chunk */
			    (chunkidx = agino - gino + 1) <
				    XFS_INODES_PER_CHUNK &&
			    /* there are some left allocated */
			    XFS_INOBT_MASKN(chunkidx,
				    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
				/*
				 * Grab the chunk record.  Mark all the
				 * uninteresting inodes (because they're
				 * before our start point) free.
				 */
				for (i = 0; i < chunkidx; i++) {
					if (XFS_INOBT_MASK(i) & ~gfree)
						gcnt++;
				}
				gfree |= XFS_INOBT_MASKN(0, chunkidx);
				irbp->ir_startino = gino;
				irbp->ir_freecount = gcnt;
				irbp->ir_free = gfree;
				irbp++;
				agino = gino + XFS_INODES_PER_CHUNK;
				icount = XFS_INODES_PER_CHUNK - gcnt;
			} else {
				/*
				 * If any of those tests failed, bump the
				 * inode number (just in case).
				 */
				agino++;
				icount = 0;
			}
			/*
			 * In any case, increment to the next record.
			 */
			if (!error)
				error = xfs_btree_increment(cur, 0, &tmp);
		} else {
			/*
			 * Start of ag.  Lookup the first inode chunk.
			 */
			error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
			icount = 0;
		}
		/*
		 * Loop through inode btree records in this ag,
		 * until we run out of inodes or space in the buffer.
		 */
		while (irbp < irbufend && icount < ubcount) {
			/*
			 * Loop as long as we're unable to read the
			 * inode btree.
			 */
			while (error) {
				agino += XFS_INODES_PER_CHUNK;
				if (XFS_AGINO_TO_AGBNO(mp, agino) >=
						be32_to_cpu(agi->agi_length))
					break;
				error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
							    &tmp);
				cond_resched();
			}
			/*
			 * If ran off the end of the ag either with an error,
			 * or the normal way, set end and stop collecting.
			 */
			if (error ||
			    (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
				    &gfree, &i)) ||
			    i == 0) {
				end_of_ag = 1;
				break;
			}
			/*
			 * If this chunk has any allocated inodes, save it.
			 * Also start read-ahead now for this chunk.
			 */
			if (gcnt < XFS_INODES_PER_CHUNK) {
				/*
				 * Loop over all clusters in the next chunk.
				 * Do a readahead if there are any allocated
				 * inodes in that cluster.
				 */
				for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
				     chunkidx = 0;
				     chunkidx < XFS_INODES_PER_CHUNK;
				     chunkidx += nicluster,
				     agbno += nbcluster) {
					if (XFS_INOBT_MASKN(chunkidx,
							    nicluster) & ~gfree)
						xfs_btree_reada_bufs(mp, agno,
							agbno, nbcluster);
				}
				irbp->ir_startino = gino;
				irbp->ir_freecount = gcnt;
				irbp->ir_free = gfree;
				irbp++;
				icount += XFS_INODES_PER_CHUNK - gcnt;
			}
			/*
			 * Set agino to after this chunk and bump the cursor.
			 */
			agino = gino + XFS_INODES_PER_CHUNK;
			error = xfs_btree_increment(cur, 0, &tmp);
			cond_resched();
		}
		/*
		 * Drop the btree buffers and the agi buffer.
		 * We can't hold any of the locks these represent
		 * when calling iget.
		 */
		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
		xfs_buf_relse(agbp);
		/*
		 * Now format all the good inodes into the user's buffer.
		 */
		irbufend = irbp;
		for (irbp = irbuf;
		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
			/*
			 * Now process this chunk of inodes.
			 */
			for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
			     XFS_BULKSTAT_UBLEFT(ubleft) &&
				irbp->ir_freecount < XFS_INODES_PER_CHUNK;
			     chunkidx++, clustidx++, agino++) {
				ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
				/*
				 * Recompute agbno if this is the
				 * first inode of the cluster.
				 *
				 * Careful with clustidx.   There can be
				 * multple clusters per chunk, a single
				 * cluster per chunk or a cluster that has
				 * inodes represented from several different
				 * chunks (if blocksize is large).
				 *
				 * Because of this, the starting clustidx is
				 * initialized to zero in this loop but must
				 * later be reset after reading in the cluster
				 * buffer.
				 */
				if ((chunkidx & (nicluster - 1)) == 0) {
					agbno = XFS_AGINO_TO_AGBNO(mp,
							irbp->ir_startino) +
						((chunkidx & nimask) >>
						 mp->m_sb.sb_inopblog);

					if (flags & (BULKSTAT_FG_QUICK |
						     BULKSTAT_FG_INLINE)) {
						int offset;

						ino = XFS_AGINO_TO_INO(mp, agno,
								       agino);
						bno = XFS_AGB_TO_DADDR(mp, agno,
								       agbno);

						/*
						 * Get the inode cluster buffer
						 */
						if (bp)
							xfs_buf_relse(bp);

						error = xfs_inotobp(mp, NULL, ino, &dip,
								    &bp, &offset,
								    XFS_IMAP_BULKSTAT);

						/* reset clustidx to this
						 * inode's slot in the newly
						 * read cluster buffer */
						if (!error)
							clustidx = offset / mp->m_sb.sb_inodesize;
						if (XFS_TEST_ERROR(error != 0,
								   mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
								   XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
							bp = NULL;
							ubleft = 0;
							rval = error;
							break;
						}
					}
				}
				ino = XFS_AGINO_TO_INO(mp, agno, agino);
				bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
				/*
				 * Skip if this inode is free.
				 */
				if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
					lastino = ino;
					continue;
				}
				/*
				 * Count used inodes as free so we can tell
				 * when the chunk is used up.
				 */
				irbp->ir_freecount++;
				if (!xfs_bulkstat_use_dinode(mp, flags, bp,
							     clustidx, &dip)) {
					lastino = ino;
					continue;
				}
				/*
				 * If we need to do an iget, cannot hold bp.
				 * Drop it, until starting the next cluster.
				 */
				if ((flags & BULKSTAT_FG_INLINE) && !dip) {
					if (bp)
						xfs_buf_relse(bp);
					bp = NULL;
				}

				/*
				 * Get the inode and fill in a single buffer.
				 * BULKSTAT_FG_QUICK uses dip to fill it in.
				 * BULKSTAT_FG_IGET uses igets.
				 * BULKSTAT_FG_INLINE uses dip if we have an
				 * inline attr fork, else igets.
				 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
				 * This is also used to count inodes/blks, etc
				 * in xfs_qm_quotacheck.
				 */
				ubused = statstruct_size;
				error = formatter(mp, ino, ubufp,
						  ubleft, private_data,
						  bno, &ubused, dip, &fmterror);
				if (fmterror == BULKSTAT_RV_NOTHING) {
					/* ENOENT/EINVAL just mean "skip this
					 * inode"; anything else is fatal */
					if (error && error != ENOENT &&
					    error != EINVAL) {
						ubleft = 0;
						rval = error;
						break;
					}
					lastino = ino;
					continue;
				}
				if (fmterror == BULKSTAT_RV_GIVEUP) {
					ubleft = 0;
					ASSERT(error);
					rval = error;
					break;
				}
				/* formatter consumed ubused bytes; advance */
				if (ubufp)
					ubufp += ubused;
				ubleft -= ubused;
				ubelem++;
				lastino = ino;
			}

			cond_resched();
		}

		if (bp)
			xfs_buf_relse(bp);

		/*
		 * Set up for the next loop iteration.
		 */
		if (XFS_BULKSTAT_UBLEFT(ubleft)) {
			if (end_of_ag) {
				agno++;
				agino = 0;
			} else
				agino = XFS_INO_TO_AGINO(mp, lastino);
		} else
			break;
	}
	/*
	 * Done, we're either out of filesystem or space to put the data.
	 */
	kmem_free(irbuf);
	*ubcountp = ubelem;
	/*
	 * Found some inodes, return them now and return the error next time.
	 */
	if (ubelem)
		rval = 0;
	if (agno >= mp->m_sb.sb_agcount) {
		/*
		 * If we ran out of filesystem, mark lastino as off
		 * the end of the filesystem, so the next call
		 * will return immediately.
		 */
		*lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
		*done = 1;
	} else
		*lastinop = (xfs_ino_t)lastino;

	return rval;
}
719 719
/*
 * Return stat information in bulk (by-inode) for the filesystem.
 * Special case for non-sequential one inode bulkstat.
 *
 * Tries a direct xfs_bulkstat_one first; any non-zero return from that
 * fast path triggers a fallback through the full xfs_bulkstat walk
 * (with count 1) before giving up.
 */
int					/* error status */
xfs_bulkstat_single(
	xfs_mount_t	*mp,	/* mount point for filesystem */
	xfs_ino_t	*lastinop, /* inode to return */
	char		__user *buffer, /* buffer with inode stats */
	int		*done)	/* 1 if there are more stats to get */
{
	int		count;	/* count value for bulkstat call */
	int		error;	/* return value */
	xfs_ino_t	ino;	/* filesystem inode number */
	int		res;	/* result from bs1 */

	/*
	 * note that requesting valid inode numbers which are not allocated
	 * to inodes will most likely cause xfs_itobp to generate warning
	 * messages about bad magic numbers. This is ok. The fact that
	 * the inode isn't actually an inode is handled by the
	 * error check below. Done this way to make the usual case faster
	 * at the expense of the error case.
	 */

	ino = (xfs_ino_t)*lastinop;
	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
				 NULL, 0, NULL, NULL, &res);
	if (error) {
		/*
		 * Special case way failed, do it the "long" way
		 * to see if that works.
		 */
		/* back up one so the full walk starts at ino itself */
		(*lastinop)--;
		count = 1;
		if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
				 NULL, sizeof(xfs_bstat_t), buffer,
				 BULKSTAT_FG_IGET, done))
			return error;
		/*
		 * If the walk didn't actually produce ino, report the
		 * original failure (mapping EFSCORRUPTED to EINVAL for
		 * userspace); otherwise the slow path succeeded.
		 */
		if (count == 0 || (xfs_ino_t)*lastinop != ino)
			return error == EFSCORRUPTED ?
				XFS_ERROR(EINVAL) : error;
		else
			return 0;
	}
	*done = 0;
	return 0;
}
768 768
769 int 769 int
770 xfs_inumbers_fmt( 770 xfs_inumbers_fmt(
771 void __user *ubuffer, /* buffer to write to */ 771 void __user *ubuffer, /* buffer to write to */
772 const xfs_inogrp_t *buffer, /* buffer to read from */ 772 const xfs_inogrp_t *buffer, /* buffer to read from */
773 long count, /* # of elements to read */ 773 long count, /* # of elements to read */
774 long *written) /* # of bytes written */ 774 long *written) /* # of bytes written */
775 { 775 {
776 if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) 776 if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer)))
777 return -EFAULT; 777 return -EFAULT;
778 *written = count * sizeof(*buffer); 778 *written = count * sizeof(*buffer);
779 return 0; 779 return 0;
780 } 780 }
781 781
782 /* 782 /*
783 * Return inode number table for the filesystem. 783 * Return inode number table for the filesystem.
784 */ 784 */
785 int /* error status */ 785 int /* error status */
786 xfs_inumbers( 786 xfs_inumbers(
787 xfs_mount_t *mp, /* mount point for filesystem */ 787 xfs_mount_t *mp, /* mount point for filesystem */
788 xfs_ino_t *lastino, /* last inode returned */ 788 xfs_ino_t *lastino, /* last inode returned */
789 int *count, /* size of buffer/count returned */ 789 int *count, /* size of buffer/count returned */
790 void __user *ubuffer,/* buffer with inode descriptions */ 790 void __user *ubuffer,/* buffer with inode descriptions */
791 inumbers_fmt_pf formatter) 791 inumbers_fmt_pf formatter)
792 { 792 {
793 xfs_buf_t *agbp; 793 xfs_buf_t *agbp;
794 xfs_agino_t agino; 794 xfs_agino_t agino;
795 xfs_agnumber_t agno; 795 xfs_agnumber_t agno;
796 int bcount; 796 int bcount;
797 xfs_inogrp_t *buffer; 797 xfs_inogrp_t *buffer;
798 int bufidx; 798 int bufidx;
799 xfs_btree_cur_t *cur; 799 xfs_btree_cur_t *cur;
800 int error; 800 int error;
801 __int32_t gcnt; 801 __int32_t gcnt;
802 xfs_inofree_t gfree; 802 xfs_inofree_t gfree;
803 xfs_agino_t gino; 803 xfs_agino_t gino;
804 int i; 804 int i;
805 xfs_ino_t ino; 805 xfs_ino_t ino;
806 int left; 806 int left;
807 int tmp; 807 int tmp;
808 808
809 ino = (xfs_ino_t)*lastino; 809 ino = (xfs_ino_t)*lastino;
810 agno = XFS_INO_TO_AGNO(mp, ino); 810 agno = XFS_INO_TO_AGNO(mp, ino);
811 agino = XFS_INO_TO_AGINO(mp, ino); 811 agino = XFS_INO_TO_AGINO(mp, ino);
812 left = *count; 812 left = *count;
813 *count = 0; 813 *count = 0;
814 bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); 814 bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
815 buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); 815 buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
816 error = bufidx = 0; 816 error = bufidx = 0;
817 cur = NULL; 817 cur = NULL;
818 agbp = NULL; 818 agbp = NULL;
819 while (left > 0 && agno < mp->m_sb.sb_agcount) { 819 while (left > 0 && agno < mp->m_sb.sb_agcount) {
820 if (agbp == NULL) { 820 if (agbp == NULL) {
821 down_read(&mp->m_peraglock); 821 down_read(&mp->m_peraglock);
822 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 822 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
823 up_read(&mp->m_peraglock); 823 up_read(&mp->m_peraglock);
824 if (error) { 824 if (error) {
825 /* 825 /*
826 * If we can't read the AGI of this ag, 826 * If we can't read the AGI of this ag,
827 * then just skip to the next one. 827 * then just skip to the next one.
828 */ 828 */
829 ASSERT(cur == NULL); 829 ASSERT(cur == NULL);
830 agbp = NULL; 830 agbp = NULL;
831 agno++; 831 agno++;
832 agino = 0; 832 agino = 0;
833 continue; 833 continue;
834 } 834 }
835 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); 835 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
836 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); 836 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
837 if (error) { 837 if (error) {
838 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 838 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
839 cur = NULL; 839 cur = NULL;
840 xfs_buf_relse(agbp); 840 xfs_buf_relse(agbp);
841 agbp = NULL; 841 agbp = NULL;
842 /* 842 /*
843 * Move up the last inode in the current 843 * Move up the last inode in the current
844 * chunk. The lookup_ge will always get 844 * chunk. The lookup_ge will always get
845 * us the first inode in the next chunk. 845 * us the first inode in the next chunk.
846 */ 846 */
847 agino += XFS_INODES_PER_CHUNK - 1; 847 agino += XFS_INODES_PER_CHUNK - 1;
848 continue; 848 continue;
849 } 849 }
850 } 850 }
851 if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree, 851 if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
852 &i)) || 852 &i)) ||
853 i == 0) { 853 i == 0) {
854 xfs_buf_relse(agbp); 854 xfs_buf_relse(agbp);
855 agbp = NULL; 855 agbp = NULL;
856 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 856 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
857 cur = NULL; 857 cur = NULL;
858 agno++; 858 agno++;
859 agino = 0; 859 agino = 0;
860 continue; 860 continue;
861 } 861 }
862 agino = gino + XFS_INODES_PER_CHUNK - 1; 862 agino = gino + XFS_INODES_PER_CHUNK - 1;
863 buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino); 863 buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
864 buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt; 864 buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
865 buffer[bufidx].xi_allocmask = ~gfree; 865 buffer[bufidx].xi_allocmask = ~gfree;
866 bufidx++; 866 bufidx++;
867 left--; 867 left--;
868 if (bufidx == bcount) { 868 if (bufidx == bcount) {
869 long written; 869 long written;
870 if (formatter(ubuffer, buffer, bufidx, &written)) { 870 if (formatter(ubuffer, buffer, bufidx, &written)) {
871 error = XFS_ERROR(EFAULT); 871 error = XFS_ERROR(EFAULT);
872 break; 872 break;
873 } 873 }
874 ubuffer += written; 874 ubuffer += written;
875 *count += bufidx; 875 *count += bufidx;
876 bufidx = 0; 876 bufidx = 0;
877 } 877 }
878 if (left) { 878 if (left) {
879 error = xfs_btree_increment(cur, 0, &tmp); 879 error = xfs_btree_increment(cur, 0, &tmp);
880 if (error) { 880 if (error) {
881 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 881 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
882 cur = NULL; 882 cur = NULL;
883 xfs_buf_relse(agbp); 883 xfs_buf_relse(agbp);
884 agbp = NULL; 884 agbp = NULL;
885 /* 885 /*
886 * The agino value has already been bumped. 886 * The agino value has already been bumped.
887 * Just try to skip up to it. 887 * Just try to skip up to it.
888 */ 888 */
889 agino += XFS_INODES_PER_CHUNK; 889 agino += XFS_INODES_PER_CHUNK;
890 continue; 890 continue;
891 } 891 }
892 } 892 }
893 } 893 }
894 if (!error) { 894 if (!error) {
895 if (bufidx) { 895 if (bufidx) {
896 long written; 896 long written;
897 if (formatter(ubuffer, buffer, bufidx, &written)) 897 if (formatter(ubuffer, buffer, bufidx, &written))
898 error = XFS_ERROR(EFAULT); 898 error = XFS_ERROR(EFAULT);
899 else 899 else
900 *count += bufidx; 900 *count += bufidx;
901 } 901 }
902 *lastino = XFS_AGINO_TO_INO(mp, agno, agino); 902 *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
903 } 903 }
904 kmem_free(buffer); 904 kmem_free(buffer);
905 if (cur) 905 if (cur)
906 xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : 906 xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
907 XFS_BTREE_NOERROR)); 907 XFS_BTREE_NOERROR));
908 if (agbp) 908 if (agbp)
909 xfs_buf_relse(agbp); 909 xfs_buf_relse(agbp);
910 return error; 910 return error;
911 } 911 }
912 912
fs/xfs/xfs_log_recover.c
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dir2.h" 27 #include "xfs_dir2.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_mount.h" 29 #include "xfs_mount.h"
30 #include "xfs_error.h" 30 #include "xfs_error.h"
31 #include "xfs_bmap_btree.h" 31 #include "xfs_bmap_btree.h"
32 #include "xfs_alloc_btree.h" 32 #include "xfs_alloc_btree.h"
33 #include "xfs_ialloc_btree.h" 33 #include "xfs_ialloc_btree.h"
34 #include "xfs_dir2_sf.h" 34 #include "xfs_dir2_sf.h"
35 #include "xfs_attr_sf.h" 35 #include "xfs_attr_sf.h"
36 #include "xfs_dinode.h" 36 #include "xfs_dinode.h"
37 #include "xfs_inode.h" 37 #include "xfs_inode.h"
38 #include "xfs_inode_item.h" 38 #include "xfs_inode_item.h"
39 #include "xfs_imap.h"
40 #include "xfs_alloc.h" 39 #include "xfs_alloc.h"
41 #include "xfs_ialloc.h" 40 #include "xfs_ialloc.h"
42 #include "xfs_log_priv.h" 41 #include "xfs_log_priv.h"
43 #include "xfs_buf_item.h" 42 #include "xfs_buf_item.h"
44 #include "xfs_log_recover.h" 43 #include "xfs_log_recover.h"
45 #include "xfs_extfree_item.h" 44 #include "xfs_extfree_item.h"
46 #include "xfs_trans_priv.h" 45 #include "xfs_trans_priv.h"
47 #include "xfs_quota.h" 46 #include "xfs_quota.h"
48 #include "xfs_rw.h" 47 #include "xfs_rw.h"
49 #include "xfs_utils.h" 48 #include "xfs_utils.h"
50 49
51 STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 50 STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
52 STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 51 STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
53 STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, 52 STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
54 xlog_recover_item_t *item); 53 xlog_recover_item_t *item);
55 #if defined(DEBUG) 54 #if defined(DEBUG)
56 STATIC void xlog_recover_check_summary(xlog_t *); 55 STATIC void xlog_recover_check_summary(xlog_t *);
57 #else 56 #else
58 #define xlog_recover_check_summary(log) 57 #define xlog_recover_check_summary(log)
59 #endif 58 #endif
60 59
61 60
62 /* 61 /*
63 * Sector aligned buffer routines for buffer create/read/write/access 62 * Sector aligned buffer routines for buffer create/read/write/access
64 */ 63 */
65 64
66 #define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 65 #define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \
67 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 66 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
68 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 67 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
69 #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 68 #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
70 69
71 xfs_buf_t * 70 xfs_buf_t *
72 xlog_get_bp( 71 xlog_get_bp(
73 xlog_t *log, 72 xlog_t *log,
74 int num_bblks) 73 int num_bblks)
75 { 74 {
76 ASSERT(num_bblks > 0); 75 ASSERT(num_bblks > 0);
77 76
78 if (log->l_sectbb_log) { 77 if (log->l_sectbb_log) {
79 if (num_bblks > 1) 78 if (num_bblks > 1)
80 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
81 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); 80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
82 } 81 }
83 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); 82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
84 } 83 }
85 84
86 void 85 void
87 xlog_put_bp( 86 xlog_put_bp(
88 xfs_buf_t *bp) 87 xfs_buf_t *bp)
89 { 88 {
90 xfs_buf_free(bp); 89 xfs_buf_free(bp);
91 } 90 }
92 91
93 92
94 /* 93 /*
95 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 94 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
96 */ 95 */
97 int 96 int
98 xlog_bread( 97 xlog_bread(
99 xlog_t *log, 98 xlog_t *log,
100 xfs_daddr_t blk_no, 99 xfs_daddr_t blk_no,
101 int nbblks, 100 int nbblks,
102 xfs_buf_t *bp) 101 xfs_buf_t *bp)
103 { 102 {
104 int error; 103 int error;
105 104
106 if (log->l_sectbb_log) { 105 if (log->l_sectbb_log) {
107 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
108 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
109 } 108 }
110 109
111 ASSERT(nbblks > 0); 110 ASSERT(nbblks > 0);
112 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 111 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
113 ASSERT(bp); 112 ASSERT(bp);
114 113
115 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 114 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
116 XFS_BUF_READ(bp); 115 XFS_BUF_READ(bp);
117 XFS_BUF_BUSY(bp); 116 XFS_BUF_BUSY(bp);
118 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 117 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
119 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 118 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
120 119
121 xfsbdstrat(log->l_mp, bp); 120 xfsbdstrat(log->l_mp, bp);
122 error = xfs_iowait(bp); 121 error = xfs_iowait(bp);
123 if (error) 122 if (error)
124 xfs_ioerror_alert("xlog_bread", log->l_mp, 123 xfs_ioerror_alert("xlog_bread", log->l_mp,
125 bp, XFS_BUF_ADDR(bp)); 124 bp, XFS_BUF_ADDR(bp));
126 return error; 125 return error;
127 } 126 }
128 127
129 /* 128 /*
130 * Write out the buffer at the given block for the given number of blocks. 129 * Write out the buffer at the given block for the given number of blocks.
131 * The buffer is kept locked across the write and is returned locked. 130 * The buffer is kept locked across the write and is returned locked.
132 * This can only be used for synchronous log writes. 131 * This can only be used for synchronous log writes.
133 */ 132 */
134 STATIC int 133 STATIC int
135 xlog_bwrite( 134 xlog_bwrite(
136 xlog_t *log, 135 xlog_t *log,
137 xfs_daddr_t blk_no, 136 xfs_daddr_t blk_no,
138 int nbblks, 137 int nbblks,
139 xfs_buf_t *bp) 138 xfs_buf_t *bp)
140 { 139 {
141 int error; 140 int error;
142 141
143 if (log->l_sectbb_log) { 142 if (log->l_sectbb_log) {
144 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
145 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
146 } 145 }
147 146
148 ASSERT(nbblks > 0); 147 ASSERT(nbblks > 0);
149 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 148 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
150 149
151 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 150 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
152 XFS_BUF_ZEROFLAGS(bp); 151 XFS_BUF_ZEROFLAGS(bp);
153 XFS_BUF_BUSY(bp); 152 XFS_BUF_BUSY(bp);
154 XFS_BUF_HOLD(bp); 153 XFS_BUF_HOLD(bp);
155 XFS_BUF_PSEMA(bp, PRIBIO); 154 XFS_BUF_PSEMA(bp, PRIBIO);
156 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 155 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
157 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 156 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
158 157
159 if ((error = xfs_bwrite(log->l_mp, bp))) 158 if ((error = xfs_bwrite(log->l_mp, bp)))
160 xfs_ioerror_alert("xlog_bwrite", log->l_mp, 159 xfs_ioerror_alert("xlog_bwrite", log->l_mp,
161 bp, XFS_BUF_ADDR(bp)); 160 bp, XFS_BUF_ADDR(bp));
162 return error; 161 return error;
163 } 162 }
164 163
165 STATIC xfs_caddr_t 164 STATIC xfs_caddr_t
166 xlog_align( 165 xlog_align(
167 xlog_t *log, 166 xlog_t *log,
168 xfs_daddr_t blk_no, 167 xfs_daddr_t blk_no,
169 int nbblks, 168 int nbblks,
170 xfs_buf_t *bp) 169 xfs_buf_t *bp)
171 { 170 {
172 xfs_caddr_t ptr; 171 xfs_caddr_t ptr;
173 172
174 if (!log->l_sectbb_log) 173 if (!log->l_sectbb_log)
175 return XFS_BUF_PTR(bp); 174 return XFS_BUF_PTR(bp);
176 175
177 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); 176 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
178 ASSERT(XFS_BUF_SIZE(bp) >= 177 ASSERT(XFS_BUF_SIZE(bp) >=
179 BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); 178 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
180 return ptr; 179 return ptr;
181 } 180 }
182 181
183 #ifdef DEBUG 182 #ifdef DEBUG
184 /* 183 /*
185 * dump debug superblock and log record information 184 * dump debug superblock and log record information
186 */ 185 */
187 STATIC void 186 STATIC void
188 xlog_header_check_dump( 187 xlog_header_check_dump(
189 xfs_mount_t *mp, 188 xfs_mount_t *mp,
190 xlog_rec_header_t *head) 189 xlog_rec_header_t *head)
191 { 190 {
192 int b; 191 int b;
193 192
194 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 193 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
195 for (b = 0; b < 16; b++) 194 for (b = 0; b < 16; b++)
196 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); 195 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
197 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); 196 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
198 cmn_err(CE_DEBUG, " log : uuid = "); 197 cmn_err(CE_DEBUG, " log : uuid = ");
199 for (b = 0; b < 16; b++) 198 for (b = 0; b < 16; b++)
200 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); 199 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
201 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); 200 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
202 } 201 }
203 #else 202 #else
204 #define xlog_header_check_dump(mp, head) 203 #define xlog_header_check_dump(mp, head)
205 #endif 204 #endif
206 205
207 /* 206 /*
208 * check log record header for recovery 207 * check log record header for recovery
209 */ 208 */
210 STATIC int 209 STATIC int
211 xlog_header_check_recover( 210 xlog_header_check_recover(
212 xfs_mount_t *mp, 211 xfs_mount_t *mp,
213 xlog_rec_header_t *head) 212 xlog_rec_header_t *head)
214 { 213 {
215 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); 214 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
216 215
217 /* 216 /*
218 * IRIX doesn't write the h_fmt field and leaves it zeroed 217 * IRIX doesn't write the h_fmt field and leaves it zeroed
219 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover 218 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
220 * a dirty log created in IRIX. 219 * a dirty log created in IRIX.
221 */ 220 */
222 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 221 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
223 xlog_warn( 222 xlog_warn(
224 "XFS: dirty log written in incompatible format - can't recover"); 223 "XFS: dirty log written in incompatible format - can't recover");
225 xlog_header_check_dump(mp, head); 224 xlog_header_check_dump(mp, head);
226 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 225 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
227 XFS_ERRLEVEL_HIGH, mp); 226 XFS_ERRLEVEL_HIGH, mp);
228 return XFS_ERROR(EFSCORRUPTED); 227 return XFS_ERROR(EFSCORRUPTED);
229 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 228 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
230 xlog_warn( 229 xlog_warn(
231 "XFS: dirty log entry has mismatched uuid - can't recover"); 230 "XFS: dirty log entry has mismatched uuid - can't recover");
232 xlog_header_check_dump(mp, head); 231 xlog_header_check_dump(mp, head);
233 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 232 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
234 XFS_ERRLEVEL_HIGH, mp); 233 XFS_ERRLEVEL_HIGH, mp);
235 return XFS_ERROR(EFSCORRUPTED); 234 return XFS_ERROR(EFSCORRUPTED);
236 } 235 }
237 return 0; 236 return 0;
238 } 237 }
239 238
240 /* 239 /*
241 * read the head block of the log and check the header 240 * read the head block of the log and check the header
242 */ 241 */
243 STATIC int 242 STATIC int
244 xlog_header_check_mount( 243 xlog_header_check_mount(
245 xfs_mount_t *mp, 244 xfs_mount_t *mp,
246 xlog_rec_header_t *head) 245 xlog_rec_header_t *head)
247 { 246 {
248 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); 247 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
249 248
250 if (uuid_is_nil(&head->h_fs_uuid)) { 249 if (uuid_is_nil(&head->h_fs_uuid)) {
251 /* 250 /*
252 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If 251 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
253 * h_fs_uuid is nil, we assume this log was last mounted 252 * h_fs_uuid is nil, we assume this log was last mounted
254 * by IRIX and continue. 253 * by IRIX and continue.
255 */ 254 */
256 xlog_warn("XFS: nil uuid in log - IRIX style log"); 255 xlog_warn("XFS: nil uuid in log - IRIX style log");
257 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 256 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
258 xlog_warn("XFS: log has mismatched uuid - can't recover"); 257 xlog_warn("XFS: log has mismatched uuid - can't recover");
259 xlog_header_check_dump(mp, head); 258 xlog_header_check_dump(mp, head);
260 XFS_ERROR_REPORT("xlog_header_check_mount", 259 XFS_ERROR_REPORT("xlog_header_check_mount",
261 XFS_ERRLEVEL_HIGH, mp); 260 XFS_ERRLEVEL_HIGH, mp);
262 return XFS_ERROR(EFSCORRUPTED); 261 return XFS_ERROR(EFSCORRUPTED);
263 } 262 }
264 return 0; 263 return 0;
265 } 264 }
266 265
267 STATIC void 266 STATIC void
268 xlog_recover_iodone( 267 xlog_recover_iodone(
269 struct xfs_buf *bp) 268 struct xfs_buf *bp)
270 { 269 {
271 xfs_mount_t *mp; 270 xfs_mount_t *mp;
272 271
273 ASSERT(XFS_BUF_FSPRIVATE(bp, void *)); 272 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
274 273
275 if (XFS_BUF_GETERROR(bp)) { 274 if (XFS_BUF_GETERROR(bp)) {
276 /* 275 /*
277 * We're not going to bother about retrying 276 * We're not going to bother about retrying
278 * this during recovery. One strike! 277 * this during recovery. One strike!
279 */ 278 */
280 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *); 279 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
281 xfs_ioerror_alert("xlog_recover_iodone", 280 xfs_ioerror_alert("xlog_recover_iodone",
282 mp, bp, XFS_BUF_ADDR(bp)); 281 mp, bp, XFS_BUF_ADDR(bp));
283 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 282 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
284 } 283 }
285 XFS_BUF_SET_FSPRIVATE(bp, NULL); 284 XFS_BUF_SET_FSPRIVATE(bp, NULL);
286 XFS_BUF_CLR_IODONE_FUNC(bp); 285 XFS_BUF_CLR_IODONE_FUNC(bp);
287 xfs_biodone(bp); 286 xfs_biodone(bp);
288 } 287 }
289 288
290 /* 289 /*
291 * This routine finds (to an approximation) the first block in the physical 290 * This routine finds (to an approximation) the first block in the physical
292 * log which contains the given cycle. It uses a binary search algorithm. 291 * log which contains the given cycle. It uses a binary search algorithm.
293 * Note that the algorithm can not be perfect because the disk will not 292 * Note that the algorithm can not be perfect because the disk will not
294 * necessarily be perfect. 293 * necessarily be perfect.
295 */ 294 */
296 STATIC int 295 STATIC int
297 xlog_find_cycle_start( 296 xlog_find_cycle_start(
298 xlog_t *log, 297 xlog_t *log,
299 xfs_buf_t *bp, 298 xfs_buf_t *bp,
300 xfs_daddr_t first_blk, 299 xfs_daddr_t first_blk,
301 xfs_daddr_t *last_blk, 300 xfs_daddr_t *last_blk,
302 uint cycle) 301 uint cycle)
303 { 302 {
304 xfs_caddr_t offset; 303 xfs_caddr_t offset;
305 xfs_daddr_t mid_blk; 304 xfs_daddr_t mid_blk;
306 uint mid_cycle; 305 uint mid_cycle;
307 int error; 306 int error;
308 307
309 mid_blk = BLK_AVG(first_blk, *last_blk); 308 mid_blk = BLK_AVG(first_blk, *last_blk);
310 while (mid_blk != first_blk && mid_blk != *last_blk) { 309 while (mid_blk != first_blk && mid_blk != *last_blk) {
311 if ((error = xlog_bread(log, mid_blk, 1, bp))) 310 if ((error = xlog_bread(log, mid_blk, 1, bp)))
312 return error; 311 return error;
313 offset = xlog_align(log, mid_blk, 1, bp); 312 offset = xlog_align(log, mid_blk, 1, bp);
314 mid_cycle = xlog_get_cycle(offset); 313 mid_cycle = xlog_get_cycle(offset);
315 if (mid_cycle == cycle) { 314 if (mid_cycle == cycle) {
316 *last_blk = mid_blk; 315 *last_blk = mid_blk;
317 /* last_half_cycle == mid_cycle */ 316 /* last_half_cycle == mid_cycle */
318 } else { 317 } else {
319 first_blk = mid_blk; 318 first_blk = mid_blk;
320 /* first_half_cycle == mid_cycle */ 319 /* first_half_cycle == mid_cycle */
321 } 320 }
322 mid_blk = BLK_AVG(first_blk, *last_blk); 321 mid_blk = BLK_AVG(first_blk, *last_blk);
323 } 322 }
324 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 323 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
325 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 324 (mid_blk == *last_blk && mid_blk-1 == first_blk));
326 325
327 return 0; 326 return 0;
328 } 327 }
329 328
330 /* 329 /*
331 * Check that the range of blocks does not contain the cycle number 330 * Check that the range of blocks does not contain the cycle number
332 * given. The scan needs to occur from front to back and the ptr into the 331 * given. The scan needs to occur from front to back and the ptr into the
333 * region must be updated since a later routine will need to perform another 332 * region must be updated since a later routine will need to perform another
334 * test. If the region is completely good, we end up returning the same 333 * test. If the region is completely good, we end up returning the same
335 * last block number. 334 * last block number.
336 * 335 *
337 * Set blkno to -1 if we encounter no errors. This is an invalid block number 336 * Set blkno to -1 if we encounter no errors. This is an invalid block number
338 * since we don't ever expect logs to get this large. 337 * since we don't ever expect logs to get this large.
339 */ 338 */
340 STATIC int 339 STATIC int
341 xlog_find_verify_cycle( 340 xlog_find_verify_cycle(
342 xlog_t *log, 341 xlog_t *log,
343 xfs_daddr_t start_blk, 342 xfs_daddr_t start_blk,
344 int nbblks, 343 int nbblks,
345 uint stop_on_cycle_no, 344 uint stop_on_cycle_no,
346 xfs_daddr_t *new_blk) 345 xfs_daddr_t *new_blk)
347 { 346 {
348 xfs_daddr_t i, j; 347 xfs_daddr_t i, j;
349 uint cycle; 348 uint cycle;
350 xfs_buf_t *bp; 349 xfs_buf_t *bp;
351 xfs_daddr_t bufblks; 350 xfs_daddr_t bufblks;
352 xfs_caddr_t buf = NULL; 351 xfs_caddr_t buf = NULL;
353 int error = 0; 352 int error = 0;
354 353
355 bufblks = 1 << ffs(nbblks); 354 bufblks = 1 << ffs(nbblks);
356 355
357 while (!(bp = xlog_get_bp(log, bufblks))) { 356 while (!(bp = xlog_get_bp(log, bufblks))) {
358 /* can't get enough memory to do everything in one big buffer */ 357 /* can't get enough memory to do everything in one big buffer */
359 bufblks >>= 1; 358 bufblks >>= 1;
360 if (bufblks <= log->l_sectbb_log) 359 if (bufblks <= log->l_sectbb_log)
361 return ENOMEM; 360 return ENOMEM;
362 } 361 }
363 362
364 for (i = start_blk; i < start_blk + nbblks; i += bufblks) { 363 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
365 int bcount; 364 int bcount;
366 365
367 bcount = min(bufblks, (start_blk + nbblks - i)); 366 bcount = min(bufblks, (start_blk + nbblks - i));
368 367
369 if ((error = xlog_bread(log, i, bcount, bp))) 368 if ((error = xlog_bread(log, i, bcount, bp)))
370 goto out; 369 goto out;
371 370
372 buf = xlog_align(log, i, bcount, bp); 371 buf = xlog_align(log, i, bcount, bp);
373 for (j = 0; j < bcount; j++) { 372 for (j = 0; j < bcount; j++) {
374 cycle = xlog_get_cycle(buf); 373 cycle = xlog_get_cycle(buf);
375 if (cycle == stop_on_cycle_no) { 374 if (cycle == stop_on_cycle_no) {
376 *new_blk = i+j; 375 *new_blk = i+j;
377 goto out; 376 goto out;
378 } 377 }
379 378
380 buf += BBSIZE; 379 buf += BBSIZE;
381 } 380 }
382 } 381 }
383 382
384 *new_blk = -1; 383 *new_blk = -1;
385 384
386 out: 385 out:
387 xlog_put_bp(bp); 386 xlog_put_bp(bp);
388 return error; 387 return error;
389 } 388 }
390 389
391 /* 390 /*
392 * Potentially backup over partial log record write. 391 * Potentially backup over partial log record write.
393 * 392 *
394 * In the typical case, last_blk is the number of the block directly after 393 * In the typical case, last_blk is the number of the block directly after
395 * a good log record. Therefore, we subtract one to get the block number 394 * a good log record. Therefore, we subtract one to get the block number
396 * of the last block in the given buffer. extra_bblks contains the number 395 * of the last block in the given buffer. extra_bblks contains the number
397 * of blocks we would have read on a previous read. This happens when the 396 * of blocks we would have read on a previous read. This happens when the
398 * last log record is split over the end of the physical log. 397 * last log record is split over the end of the physical log.
399 * 398 *
400 * extra_bblks is the number of blocks potentially verified on a previous 399 * extra_bblks is the number of blocks potentially verified on a previous
401 * call to this routine. 400 * call to this routine.
402 */ 401 */
403 STATIC int 402 STATIC int
404 xlog_find_verify_log_record( 403 xlog_find_verify_log_record(
405 xlog_t *log, 404 xlog_t *log,
406 xfs_daddr_t start_blk, 405 xfs_daddr_t start_blk,
407 xfs_daddr_t *last_blk, 406 xfs_daddr_t *last_blk,
408 int extra_bblks) 407 int extra_bblks)
409 { 408 {
410 xfs_daddr_t i; 409 xfs_daddr_t i;
411 xfs_buf_t *bp; 410 xfs_buf_t *bp;
412 xfs_caddr_t offset = NULL; 411 xfs_caddr_t offset = NULL;
413 xlog_rec_header_t *head = NULL; 412 xlog_rec_header_t *head = NULL;
414 int error = 0; 413 int error = 0;
415 int smallmem = 0; 414 int smallmem = 0;
416 int num_blks = *last_blk - start_blk; 415 int num_blks = *last_blk - start_blk;
417 int xhdrs; 416 int xhdrs;
418 417
419 ASSERT(start_blk != 0 || *last_blk != start_blk); 418 ASSERT(start_blk != 0 || *last_blk != start_blk);
420 419
421 if (!(bp = xlog_get_bp(log, num_blks))) { 420 if (!(bp = xlog_get_bp(log, num_blks))) {
422 if (!(bp = xlog_get_bp(log, 1))) 421 if (!(bp = xlog_get_bp(log, 1)))
423 return ENOMEM; 422 return ENOMEM;
424 smallmem = 1; 423 smallmem = 1;
425 } else { 424 } else {
426 if ((error = xlog_bread(log, start_blk, num_blks, bp))) 425 if ((error = xlog_bread(log, start_blk, num_blks, bp)))
427 goto out; 426 goto out;
428 offset = xlog_align(log, start_blk, num_blks, bp); 427 offset = xlog_align(log, start_blk, num_blks, bp);
429 offset += ((num_blks - 1) << BBSHIFT); 428 offset += ((num_blks - 1) << BBSHIFT);
430 } 429 }
431 430
432 for (i = (*last_blk) - 1; i >= 0; i--) { 431 for (i = (*last_blk) - 1; i >= 0; i--) {
433 if (i < start_blk) { 432 if (i < start_blk) {
434 /* valid log record not found */ 433 /* valid log record not found */
435 xlog_warn( 434 xlog_warn(
436 "XFS: Log inconsistent (didn't find previous header)"); 435 "XFS: Log inconsistent (didn't find previous header)");
437 ASSERT(0); 436 ASSERT(0);
438 error = XFS_ERROR(EIO); 437 error = XFS_ERROR(EIO);
439 goto out; 438 goto out;
440 } 439 }
441 440
442 if (smallmem) { 441 if (smallmem) {
443 if ((error = xlog_bread(log, i, 1, bp))) 442 if ((error = xlog_bread(log, i, 1, bp)))
444 goto out; 443 goto out;
445 offset = xlog_align(log, i, 1, bp); 444 offset = xlog_align(log, i, 1, bp);
446 } 445 }
447 446
448 head = (xlog_rec_header_t *)offset; 447 head = (xlog_rec_header_t *)offset;
449 448
450 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) 449 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
451 break; 450 break;
452 451
453 if (!smallmem) 452 if (!smallmem)
454 offset -= BBSIZE; 453 offset -= BBSIZE;
455 } 454 }
456 455
457 /* 456 /*
458 * We hit the beginning of the physical log & still no header. Return 457 * We hit the beginning of the physical log & still no header. Return
459 * to caller. If caller can handle a return of -1, then this routine 458 * to caller. If caller can handle a return of -1, then this routine
460 * will be called again for the end of the physical log. 459 * will be called again for the end of the physical log.
461 */ 460 */
462 if (i == -1) { 461 if (i == -1) {
463 error = -1; 462 error = -1;
464 goto out; 463 goto out;
465 } 464 }
466 465
467 /* 466 /*
468 * We have the final block of the good log (the first block 467 * We have the final block of the good log (the first block
469 * of the log record _before_ the head. So we check the uuid. 468 * of the log record _before_ the head. So we check the uuid.
470 */ 469 */
471 if ((error = xlog_header_check_mount(log->l_mp, head))) 470 if ((error = xlog_header_check_mount(log->l_mp, head)))
472 goto out; 471 goto out;
473 472
474 /* 473 /*
475 * We may have found a log record header before we expected one. 474 * We may have found a log record header before we expected one.
476 * last_blk will be the 1st block # with a given cycle #. We may end 475 * last_blk will be the 1st block # with a given cycle #. We may end
477 * up reading an entire log record. In this case, we don't want to 476 * up reading an entire log record. In this case, we don't want to
478 * reset last_blk. Only when last_blk points in the middle of a log 477 * reset last_blk. Only when last_blk points in the middle of a log
479 * record do we update last_blk. 478 * record do we update last_blk.
480 */ 479 */
481 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 480 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
482 uint h_size = be32_to_cpu(head->h_size); 481 uint h_size = be32_to_cpu(head->h_size);
483 482
484 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; 483 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
485 if (h_size % XLOG_HEADER_CYCLE_SIZE) 484 if (h_size % XLOG_HEADER_CYCLE_SIZE)
486 xhdrs++; 485 xhdrs++;
487 } else { 486 } else {
488 xhdrs = 1; 487 xhdrs = 1;
489 } 488 }
490 489
491 if (*last_blk - i + extra_bblks != 490 if (*last_blk - i + extra_bblks !=
492 BTOBB(be32_to_cpu(head->h_len)) + xhdrs) 491 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
493 *last_blk = i; 492 *last_blk = i;
494 493
495 out: 494 out:
496 xlog_put_bp(bp); 495 xlog_put_bp(bp);
497 return error; 496 return error;
498 } 497 }
499 498
500 /* 499 /*
501 * Head is defined to be the point of the log where the next log write 500 * Head is defined to be the point of the log where the next log write
502 * write could go. This means that incomplete LR writes at the end are 501 * write could go. This means that incomplete LR writes at the end are
503 * eliminated when calculating the head. We aren't guaranteed that previous 502 * eliminated when calculating the head. We aren't guaranteed that previous
504 * LR have complete transactions. We only know that a cycle number of 503 * LR have complete transactions. We only know that a cycle number of
505 * current cycle number -1 won't be present in the log if we start writing 504 * current cycle number -1 won't be present in the log if we start writing
506 * from our current block number. 505 * from our current block number.
507 * 506 *
508 * last_blk contains the block number of the first block with a given 507 * last_blk contains the block number of the first block with a given
509 * cycle number. 508 * cycle number.
510 * 509 *
511 * Return: zero if normal, non-zero if error. 510 * Return: zero if normal, non-zero if error.
512 */ 511 */
513 STATIC int 512 STATIC int
514 xlog_find_head( 513 xlog_find_head(
515 xlog_t *log, 514 xlog_t *log,
516 xfs_daddr_t *return_head_blk) 515 xfs_daddr_t *return_head_blk)
517 { 516 {
518 xfs_buf_t *bp; 517 xfs_buf_t *bp;
519 xfs_caddr_t offset; 518 xfs_caddr_t offset;
520 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; 519 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
521 int num_scan_bblks; 520 int num_scan_bblks;
522 uint first_half_cycle, last_half_cycle; 521 uint first_half_cycle, last_half_cycle;
523 uint stop_on_cycle; 522 uint stop_on_cycle;
524 int error, log_bbnum = log->l_logBBsize; 523 int error, log_bbnum = log->l_logBBsize;
525 524
526 /* Is the end of the log device zeroed? */ 525 /* Is the end of the log device zeroed? */
527 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { 526 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
528 *return_head_blk = first_blk; 527 *return_head_blk = first_blk;
529 528
530 /* Is the whole lot zeroed? */ 529 /* Is the whole lot zeroed? */
531 if (!first_blk) { 530 if (!first_blk) {
532 /* Linux XFS shouldn't generate totally zeroed logs - 531 /* Linux XFS shouldn't generate totally zeroed logs -
533 * mkfs etc write a dummy unmount record to a fresh 532 * mkfs etc write a dummy unmount record to a fresh
534 * log so we can store the uuid in there 533 * log so we can store the uuid in there
535 */ 534 */
536 xlog_warn("XFS: totally zeroed log"); 535 xlog_warn("XFS: totally zeroed log");
537 } 536 }
538 537
539 return 0; 538 return 0;
540 } else if (error) { 539 } else if (error) {
541 xlog_warn("XFS: empty log check failed"); 540 xlog_warn("XFS: empty log check failed");
542 return error; 541 return error;
543 } 542 }
544 543
545 first_blk = 0; /* get cycle # of 1st block */ 544 first_blk = 0; /* get cycle # of 1st block */
546 bp = xlog_get_bp(log, 1); 545 bp = xlog_get_bp(log, 1);
547 if (!bp) 546 if (!bp)
548 return ENOMEM; 547 return ENOMEM;
549 if ((error = xlog_bread(log, 0, 1, bp))) 548 if ((error = xlog_bread(log, 0, 1, bp)))
550 goto bp_err; 549 goto bp_err;
551 offset = xlog_align(log, 0, 1, bp); 550 offset = xlog_align(log, 0, 1, bp);
552 first_half_cycle = xlog_get_cycle(offset); 551 first_half_cycle = xlog_get_cycle(offset);
553 552
554 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 553 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
555 if ((error = xlog_bread(log, last_blk, 1, bp))) 554 if ((error = xlog_bread(log, last_blk, 1, bp)))
556 goto bp_err; 555 goto bp_err;
557 offset = xlog_align(log, last_blk, 1, bp); 556 offset = xlog_align(log, last_blk, 1, bp);
558 last_half_cycle = xlog_get_cycle(offset); 557 last_half_cycle = xlog_get_cycle(offset);
559 ASSERT(last_half_cycle != 0); 558 ASSERT(last_half_cycle != 0);
560 559
561 /* 560 /*
562 * If the 1st half cycle number is equal to the last half cycle number, 561 * If the 1st half cycle number is equal to the last half cycle number,
563 * then the entire log is stamped with the same cycle number. In this 562 * then the entire log is stamped with the same cycle number. In this
564 * case, head_blk can't be set to zero (which makes sense). The below 563 * case, head_blk can't be set to zero (which makes sense). The below
565 * math doesn't work out properly with head_blk equal to zero. Instead, 564 * math doesn't work out properly with head_blk equal to zero. Instead,
566 * we set it to log_bbnum which is an invalid block number, but this 565 * we set it to log_bbnum which is an invalid block number, but this
567 * value makes the math correct. If head_blk doesn't changed through 566 * value makes the math correct. If head_blk doesn't changed through
568 * all the tests below, *head_blk is set to zero at the very end rather 567 * all the tests below, *head_blk is set to zero at the very end rather
569 * than log_bbnum. In a sense, log_bbnum and zero are the same block 568 * than log_bbnum. In a sense, log_bbnum and zero are the same block
570 * in a circular file. 569 * in a circular file.
571 */ 570 */
572 if (first_half_cycle == last_half_cycle) { 571 if (first_half_cycle == last_half_cycle) {
573 /* 572 /*
574 * In this case we believe that the entire log should have 573 * In this case we believe that the entire log should have
575 * cycle number last_half_cycle. We need to scan backwards 574 * cycle number last_half_cycle. We need to scan backwards
576 * from the end verifying that there are no holes still 575 * from the end verifying that there are no holes still
577 * containing last_half_cycle - 1. If we find such a hole, 576 * containing last_half_cycle - 1. If we find such a hole,
578 * then the start of that hole will be the new head. The 577 * then the start of that hole will be the new head. The
579 * simple case looks like 578 * simple case looks like
580 * x | x ... | x - 1 | x 579 * x | x ... | x - 1 | x
581 * Another case that fits this picture would be 580 * Another case that fits this picture would be
582 * x | x + 1 | x ... | x 581 * x | x + 1 | x ... | x
583 * In this case the head really is somewhere at the end of the 582 * In this case the head really is somewhere at the end of the
584 * log, as one of the latest writes at the beginning was 583 * log, as one of the latest writes at the beginning was
585 * incomplete. 584 * incomplete.
586 * One more case is 585 * One more case is
587 * x | x + 1 | x ... | x - 1 | x 586 * x | x + 1 | x ... | x - 1 | x
588 * This is really the combination of the above two cases, and 587 * This is really the combination of the above two cases, and
589 * the head has to end up at the start of the x-1 hole at the 588 * the head has to end up at the start of the x-1 hole at the
590 * end of the log. 589 * end of the log.
591 * 590 *
592 * In the 256k log case, we will read from the beginning to the 591 * In the 256k log case, we will read from the beginning to the
593 * end of the log and search for cycle numbers equal to x-1. 592 * end of the log and search for cycle numbers equal to x-1.
594 * We don't worry about the x+1 blocks that we encounter, 593 * We don't worry about the x+1 blocks that we encounter,
595 * because we know that they cannot be the head since the log 594 * because we know that they cannot be the head since the log
596 * started with x. 595 * started with x.
597 */ 596 */
598 head_blk = log_bbnum; 597 head_blk = log_bbnum;
599 stop_on_cycle = last_half_cycle - 1; 598 stop_on_cycle = last_half_cycle - 1;
600 } else { 599 } else {
601 /* 600 /*
602 * In this case we want to find the first block with cycle 601 * In this case we want to find the first block with cycle
603 * number matching last_half_cycle. We expect the log to be 602 * number matching last_half_cycle. We expect the log to be
604 * some variation on 603 * some variation on
605 * x + 1 ... | x ... 604 * x + 1 ... | x ...
606 * The first block with cycle number x (last_half_cycle) will 605 * The first block with cycle number x (last_half_cycle) will
607 * be where the new head belongs. First we do a binary search 606 * be where the new head belongs. First we do a binary search
608 * for the first occurrence of last_half_cycle. The binary 607 * for the first occurrence of last_half_cycle. The binary
609 * search may not be totally accurate, so then we scan back 608 * search may not be totally accurate, so then we scan back
610 * from there looking for occurrences of last_half_cycle before 609 * from there looking for occurrences of last_half_cycle before
611 * us. If that backwards scan wraps around the beginning of 610 * us. If that backwards scan wraps around the beginning of
612 * the log, then we look for occurrences of last_half_cycle - 1 611 * the log, then we look for occurrences of last_half_cycle - 1
613 * at the end of the log. The cases we're looking for look 612 * at the end of the log. The cases we're looking for look
614 * like 613 * like
615 * x + 1 ... | x | x + 1 | x ... 614 * x + 1 ... | x | x + 1 | x ...
616 * ^ binary search stopped here 615 * ^ binary search stopped here
617 * or 616 * or
618 * x + 1 ... | x ... | x - 1 | x 617 * x + 1 ... | x ... | x - 1 | x
619 * <---------> less than scan distance 618 * <---------> less than scan distance
620 */ 619 */
621 stop_on_cycle = last_half_cycle; 620 stop_on_cycle = last_half_cycle;
622 if ((error = xlog_find_cycle_start(log, bp, first_blk, 621 if ((error = xlog_find_cycle_start(log, bp, first_blk,
623 &head_blk, last_half_cycle))) 622 &head_blk, last_half_cycle)))
624 goto bp_err; 623 goto bp_err;
625 } 624 }
626 625
627 /* 626 /*
628 * Now validate the answer. Scan back some number of maximum possible 627 * Now validate the answer. Scan back some number of maximum possible
629 * blocks and make sure each one has the expected cycle number. The 628 * blocks and make sure each one has the expected cycle number. The
630 * maximum is determined by the total possible amount of buffering 629 * maximum is determined by the total possible amount of buffering
631 * in the in-core log. The following number can be made tighter if 630 * in the in-core log. The following number can be made tighter if
632 * we actually look at the block size of the filesystem. 631 * we actually look at the block size of the filesystem.
633 */ 632 */
634 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 633 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
635 if (head_blk >= num_scan_bblks) { 634 if (head_blk >= num_scan_bblks) {
636 /* 635 /*
637 * We are guaranteed that the entire check can be performed 636 * We are guaranteed that the entire check can be performed
638 * in one buffer. 637 * in one buffer.
639 */ 638 */
640 start_blk = head_blk - num_scan_bblks; 639 start_blk = head_blk - num_scan_bblks;
641 if ((error = xlog_find_verify_cycle(log, 640 if ((error = xlog_find_verify_cycle(log,
642 start_blk, num_scan_bblks, 641 start_blk, num_scan_bblks,
643 stop_on_cycle, &new_blk))) 642 stop_on_cycle, &new_blk)))
644 goto bp_err; 643 goto bp_err;
645 if (new_blk != -1) 644 if (new_blk != -1)
646 head_blk = new_blk; 645 head_blk = new_blk;
647 } else { /* need to read 2 parts of log */ 646 } else { /* need to read 2 parts of log */
648 /* 647 /*
649 * We are going to scan backwards in the log in two parts. 648 * We are going to scan backwards in the log in two parts.
650 * First we scan the physical end of the log. In this part 649 * First we scan the physical end of the log. In this part
651 * of the log, we are looking for blocks with cycle number 650 * of the log, we are looking for blocks with cycle number
652 * last_half_cycle - 1. 651 * last_half_cycle - 1.
653 * If we find one, then we know that the log starts there, as 652 * If we find one, then we know that the log starts there, as
654 * we've found a hole that didn't get written in going around 653 * we've found a hole that didn't get written in going around
655 * the end of the physical log. The simple case for this is 654 * the end of the physical log. The simple case for this is
656 * x + 1 ... | x ... | x - 1 | x 655 * x + 1 ... | x ... | x - 1 | x
657 * <---------> less than scan distance 656 * <---------> less than scan distance
658 * If all of the blocks at the end of the log have cycle number 657 * If all of the blocks at the end of the log have cycle number
659 * last_half_cycle, then we check the blocks at the start of 658 * last_half_cycle, then we check the blocks at the start of
660 * the log looking for occurrences of last_half_cycle. If we 659 * the log looking for occurrences of last_half_cycle. If we
661 * find one, then our current estimate for the location of the 660 * find one, then our current estimate for the location of the
662 * first occurrence of last_half_cycle is wrong and we move 661 * first occurrence of last_half_cycle is wrong and we move
663 * back to the hole we've found. This case looks like 662 * back to the hole we've found. This case looks like
664 * x + 1 ... | x | x + 1 | x ... 663 * x + 1 ... | x | x + 1 | x ...
665 * ^ binary search stopped here 664 * ^ binary search stopped here
666 * Another case we need to handle that only occurs in 256k 665 * Another case we need to handle that only occurs in 256k
667 * logs is 666 * logs is
668 * x + 1 ... | x ... | x+1 | x ... 667 * x + 1 ... | x ... | x+1 | x ...
669 * ^ binary search stops here 668 * ^ binary search stops here
670 * In a 256k log, the scan at the end of the log will see the 669 * In a 256k log, the scan at the end of the log will see the
671 * x + 1 blocks. We need to skip past those since that is 670 * x + 1 blocks. We need to skip past those since that is
672 * certainly not the head of the log. By searching for 671 * certainly not the head of the log. By searching for
673 * last_half_cycle-1 we accomplish that. 672 * last_half_cycle-1 we accomplish that.
674 */ 673 */
675 start_blk = log_bbnum - num_scan_bblks + head_blk; 674 start_blk = log_bbnum - num_scan_bblks + head_blk;
676 ASSERT(head_blk <= INT_MAX && 675 ASSERT(head_blk <= INT_MAX &&
677 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 676 (xfs_daddr_t) num_scan_bblks - head_blk >= 0);
678 if ((error = xlog_find_verify_cycle(log, start_blk, 677 if ((error = xlog_find_verify_cycle(log, start_blk,
679 num_scan_bblks - (int)head_blk, 678 num_scan_bblks - (int)head_blk,
680 (stop_on_cycle - 1), &new_blk))) 679 (stop_on_cycle - 1), &new_blk)))
681 goto bp_err; 680 goto bp_err;
682 if (new_blk != -1) { 681 if (new_blk != -1) {
683 head_blk = new_blk; 682 head_blk = new_blk;
684 goto bad_blk; 683 goto bad_blk;
685 } 684 }
686 685
687 /* 686 /*
688 * Scan beginning of log now. The last part of the physical 687 * Scan beginning of log now. The last part of the physical
689 * log is good. This scan needs to verify that it doesn't find 688 * log is good. This scan needs to verify that it doesn't find
690 * the last_half_cycle. 689 * the last_half_cycle.
691 */ 690 */
692 start_blk = 0; 691 start_blk = 0;
693 ASSERT(head_blk <= INT_MAX); 692 ASSERT(head_blk <= INT_MAX);
694 if ((error = xlog_find_verify_cycle(log, 693 if ((error = xlog_find_verify_cycle(log,
695 start_blk, (int)head_blk, 694 start_blk, (int)head_blk,
696 stop_on_cycle, &new_blk))) 695 stop_on_cycle, &new_blk)))
697 goto bp_err; 696 goto bp_err;
698 if (new_blk != -1) 697 if (new_blk != -1)
699 head_blk = new_blk; 698 head_blk = new_blk;
700 } 699 }
701 700
702 bad_blk: 701 bad_blk:
703 /* 702 /*
704 * Now we need to make sure head_blk is not pointing to a block in 703 * Now we need to make sure head_blk is not pointing to a block in
705 * the middle of a log record. 704 * the middle of a log record.
706 */ 705 */
707 num_scan_bblks = XLOG_REC_SHIFT(log); 706 num_scan_bblks = XLOG_REC_SHIFT(log);
708 if (head_blk >= num_scan_bblks) { 707 if (head_blk >= num_scan_bblks) {
709 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ 708 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
710 709
711 /* start ptr at last block ptr before head_blk */ 710 /* start ptr at last block ptr before head_blk */
712 if ((error = xlog_find_verify_log_record(log, start_blk, 711 if ((error = xlog_find_verify_log_record(log, start_blk,
713 &head_blk, 0)) == -1) { 712 &head_blk, 0)) == -1) {
714 error = XFS_ERROR(EIO); 713 error = XFS_ERROR(EIO);
715 goto bp_err; 714 goto bp_err;
716 } else if (error) 715 } else if (error)
717 goto bp_err; 716 goto bp_err;
718 } else { 717 } else {
719 start_blk = 0; 718 start_blk = 0;
720 ASSERT(head_blk <= INT_MAX); 719 ASSERT(head_blk <= INT_MAX);
721 if ((error = xlog_find_verify_log_record(log, start_blk, 720 if ((error = xlog_find_verify_log_record(log, start_blk,
722 &head_blk, 0)) == -1) { 721 &head_blk, 0)) == -1) {
723 /* We hit the beginning of the log during our search */ 722 /* We hit the beginning of the log during our search */
724 start_blk = log_bbnum - num_scan_bblks + head_blk; 723 start_blk = log_bbnum - num_scan_bblks + head_blk;
725 new_blk = log_bbnum; 724 new_blk = log_bbnum;
726 ASSERT(start_blk <= INT_MAX && 725 ASSERT(start_blk <= INT_MAX &&
727 (xfs_daddr_t) log_bbnum-start_blk >= 0); 726 (xfs_daddr_t) log_bbnum-start_blk >= 0);
728 ASSERT(head_blk <= INT_MAX); 727 ASSERT(head_blk <= INT_MAX);
729 if ((error = xlog_find_verify_log_record(log, 728 if ((error = xlog_find_verify_log_record(log,
730 start_blk, &new_blk, 729 start_blk, &new_blk,
731 (int)head_blk)) == -1) { 730 (int)head_blk)) == -1) {
732 error = XFS_ERROR(EIO); 731 error = XFS_ERROR(EIO);
733 goto bp_err; 732 goto bp_err;
734 } else if (error) 733 } else if (error)
735 goto bp_err; 734 goto bp_err;
736 if (new_blk != log_bbnum) 735 if (new_blk != log_bbnum)
737 head_blk = new_blk; 736 head_blk = new_blk;
738 } else if (error) 737 } else if (error)
739 goto bp_err; 738 goto bp_err;
740 } 739 }
741 740
742 xlog_put_bp(bp); 741 xlog_put_bp(bp);
743 if (head_blk == log_bbnum) 742 if (head_blk == log_bbnum)
744 *return_head_blk = 0; 743 *return_head_blk = 0;
745 else 744 else
746 *return_head_blk = head_blk; 745 *return_head_blk = head_blk;
747 /* 746 /*
748 * When returning here, we have a good block number. Bad block 747 * When returning here, we have a good block number. Bad block
749 * means that during a previous crash, we didn't have a clean break 748 * means that during a previous crash, we didn't have a clean break
750 * from cycle number N to cycle number N-1. In this case, we need 749 * from cycle number N to cycle number N-1. In this case, we need
751 * to find the first block with cycle number N-1. 750 * to find the first block with cycle number N-1.
752 */ 751 */
753 return 0; 752 return 0;
754 753
755 bp_err: 754 bp_err:
756 xlog_put_bp(bp); 755 xlog_put_bp(bp);
757 756
758 if (error) 757 if (error)
759 xlog_warn("XFS: failed to find log head"); 758 xlog_warn("XFS: failed to find log head");
760 return error; 759 return error;
761 } 760 }
762 761
/*
 * Find the sync block number or the tail of the log.
 *
 * This will be the block number of the last record to have its
 * associated buffers synced to disk.  Every log record header has
 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 * to get a sync block number.  The only concern is to figure out which
 * log record header to believe.
 *
 * The following algorithm uses the log record header with the largest
 * lsn.  The entire log record does not need to be valid.  We only care
 * that the header is valid.
 *
 * We could speed up search by using current head_blk buffer, but it is not
 * available.
 */
779 int 778 int
780 xlog_find_tail( 779 xlog_find_tail(
781 xlog_t *log, 780 xlog_t *log,
782 xfs_daddr_t *head_blk, 781 xfs_daddr_t *head_blk,
783 xfs_daddr_t *tail_blk) 782 xfs_daddr_t *tail_blk)
784 { 783 {
785 xlog_rec_header_t *rhead; 784 xlog_rec_header_t *rhead;
786 xlog_op_header_t *op_head; 785 xlog_op_header_t *op_head;
787 xfs_caddr_t offset = NULL; 786 xfs_caddr_t offset = NULL;
788 xfs_buf_t *bp; 787 xfs_buf_t *bp;
789 int error, i, found; 788 int error, i, found;
790 xfs_daddr_t umount_data_blk; 789 xfs_daddr_t umount_data_blk;
791 xfs_daddr_t after_umount_blk; 790 xfs_daddr_t after_umount_blk;
792 xfs_lsn_t tail_lsn; 791 xfs_lsn_t tail_lsn;
793 int hblks; 792 int hblks;
794 793
795 found = 0; 794 found = 0;
796 795
797 /* 796 /*
798 * Find previous log record 797 * Find previous log record
799 */ 798 */
800 if ((error = xlog_find_head(log, head_blk))) 799 if ((error = xlog_find_head(log, head_blk)))
801 return error; 800 return error;
802 801
803 bp = xlog_get_bp(log, 1); 802 bp = xlog_get_bp(log, 1);
804 if (!bp) 803 if (!bp)
805 return ENOMEM; 804 return ENOMEM;
806 if (*head_blk == 0) { /* special case */ 805 if (*head_blk == 0) { /* special case */
807 if ((error = xlog_bread(log, 0, 1, bp))) 806 if ((error = xlog_bread(log, 0, 1, bp)))
808 goto bread_err; 807 goto bread_err;
809 offset = xlog_align(log, 0, 1, bp); 808 offset = xlog_align(log, 0, 1, bp);
810 if (xlog_get_cycle(offset) == 0) { 809 if (xlog_get_cycle(offset) == 0) {
811 *tail_blk = 0; 810 *tail_blk = 0;
812 /* leave all other log inited values alone */ 811 /* leave all other log inited values alone */
813 goto exit; 812 goto exit;
814 } 813 }
815 } 814 }
816 815
817 /* 816 /*
818 * Search backwards looking for log record header block 817 * Search backwards looking for log record header block
819 */ 818 */
820 ASSERT(*head_blk < INT_MAX); 819 ASSERT(*head_blk < INT_MAX);
821 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 820 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
822 if ((error = xlog_bread(log, i, 1, bp))) 821 if ((error = xlog_bread(log, i, 1, bp)))
823 goto bread_err; 822 goto bread_err;
824 offset = xlog_align(log, i, 1, bp); 823 offset = xlog_align(log, i, 1, bp);
825 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 824 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
826 found = 1; 825 found = 1;
827 break; 826 break;
828 } 827 }
829 } 828 }
830 /* 829 /*
831 * If we haven't found the log record header block, start looking 830 * If we haven't found the log record header block, start looking
832 * again from the end of the physical log. XXXmiken: There should be 831 * again from the end of the physical log. XXXmiken: There should be
833 * a check here to make sure we didn't search more than N blocks in 832 * a check here to make sure we didn't search more than N blocks in
834 * the previous code. 833 * the previous code.
835 */ 834 */
836 if (!found) { 835 if (!found) {
837 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 836 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
838 if ((error = xlog_bread(log, i, 1, bp))) 837 if ((error = xlog_bread(log, i, 1, bp)))
839 goto bread_err; 838 goto bread_err;
840 offset = xlog_align(log, i, 1, bp); 839 offset = xlog_align(log, i, 1, bp);
841 if (XLOG_HEADER_MAGIC_NUM == 840 if (XLOG_HEADER_MAGIC_NUM ==
842 be32_to_cpu(*(__be32 *)offset)) { 841 be32_to_cpu(*(__be32 *)offset)) {
843 found = 2; 842 found = 2;
844 break; 843 break;
845 } 844 }
846 } 845 }
847 } 846 }
848 if (!found) { 847 if (!found) {
849 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 848 xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
850 ASSERT(0); 849 ASSERT(0);
851 return XFS_ERROR(EIO); 850 return XFS_ERROR(EIO);
852 } 851 }
853 852
854 /* find blk_no of tail of log */ 853 /* find blk_no of tail of log */
855 rhead = (xlog_rec_header_t *)offset; 854 rhead = (xlog_rec_header_t *)offset;
856 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); 855 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
857 856
858 /* 857 /*
859 * Reset log values according to the state of the log when we 858 * Reset log values according to the state of the log when we
860 * crashed. In the case where head_blk == 0, we bump curr_cycle 859 * crashed. In the case where head_blk == 0, we bump curr_cycle
861 * one because the next write starts a new cycle rather than 860 * one because the next write starts a new cycle rather than
862 * continuing the cycle of the last good log record. At this 861 * continuing the cycle of the last good log record. At this
863 * point we have guaranteed that all partial log records have been 862 * point we have guaranteed that all partial log records have been
864 * accounted for. Therefore, we know that the last good log record 863 * accounted for. Therefore, we know that the last good log record
865 * written was complete and ended exactly on the end boundary 864 * written was complete and ended exactly on the end boundary
866 * of the physical log. 865 * of the physical log.
867 */ 866 */
868 log->l_prev_block = i; 867 log->l_prev_block = i;
869 log->l_curr_block = (int)*head_blk; 868 log->l_curr_block = (int)*head_blk;
870 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 869 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
871 if (found == 2) 870 if (found == 2)
872 log->l_curr_cycle++; 871 log->l_curr_cycle++;
873 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 872 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
874 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 873 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
875 log->l_grant_reserve_cycle = log->l_curr_cycle; 874 log->l_grant_reserve_cycle = log->l_curr_cycle;
876 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 875 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
877 log->l_grant_write_cycle = log->l_curr_cycle; 876 log->l_grant_write_cycle = log->l_curr_cycle;
878 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 877 log->l_grant_write_bytes = BBTOB(log->l_curr_block);
879 878
880 /* 879 /*
881 * Look for unmount record. If we find it, then we know there 880 * Look for unmount record. If we find it, then we know there
882 * was a clean unmount. Since 'i' could be the last block in 881 * was a clean unmount. Since 'i' could be the last block in
883 * the physical log, we convert to a log block before comparing 882 * the physical log, we convert to a log block before comparing
884 * to the head_blk. 883 * to the head_blk.
885 * 884 *
886 * Save the current tail lsn to use to pass to 885 * Save the current tail lsn to use to pass to
887 * xlog_clear_stale_blocks() below. We won't want to clear the 886 * xlog_clear_stale_blocks() below. We won't want to clear the
888 * unmount record if there is one, so we pass the lsn of the 887 * unmount record if there is one, so we pass the lsn of the
889 * unmount record rather than the block after it. 888 * unmount record rather than the block after it.
890 */ 889 */
891 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 890 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
892 int h_size = be32_to_cpu(rhead->h_size); 891 int h_size = be32_to_cpu(rhead->h_size);
893 int h_version = be32_to_cpu(rhead->h_version); 892 int h_version = be32_to_cpu(rhead->h_version);
894 893
895 if ((h_version & XLOG_VERSION_2) && 894 if ((h_version & XLOG_VERSION_2) &&
896 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 895 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
897 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 896 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
898 if (h_size % XLOG_HEADER_CYCLE_SIZE) 897 if (h_size % XLOG_HEADER_CYCLE_SIZE)
899 hblks++; 898 hblks++;
900 } else { 899 } else {
901 hblks = 1; 900 hblks = 1;
902 } 901 }
903 } else { 902 } else {
904 hblks = 1; 903 hblks = 1;
905 } 904 }
906 after_umount_blk = (i + hblks + (int) 905 after_umount_blk = (i + hblks + (int)
907 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 906 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
908 tail_lsn = log->l_tail_lsn; 907 tail_lsn = log->l_tail_lsn;
909 if (*head_blk == after_umount_blk && 908 if (*head_blk == after_umount_blk &&
910 be32_to_cpu(rhead->h_num_logops) == 1) { 909 be32_to_cpu(rhead->h_num_logops) == 1) {
911 umount_data_blk = (i + hblks) % log->l_logBBsize; 910 umount_data_blk = (i + hblks) % log->l_logBBsize;
912 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { 911 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
913 goto bread_err; 912 goto bread_err;
914 } 913 }
915 offset = xlog_align(log, umount_data_blk, 1, bp); 914 offset = xlog_align(log, umount_data_blk, 1, bp);
916 op_head = (xlog_op_header_t *)offset; 915 op_head = (xlog_op_header_t *)offset;
917 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 916 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
918 /* 917 /*
919 * Set tail and last sync so that newly written 918 * Set tail and last sync so that newly written
920 * log records will point recovery to after the 919 * log records will point recovery to after the
921 * current unmount record. 920 * current unmount record.
922 */ 921 */
923 log->l_tail_lsn = 922 log->l_tail_lsn =
924 xlog_assign_lsn(log->l_curr_cycle, 923 xlog_assign_lsn(log->l_curr_cycle,
925 after_umount_blk); 924 after_umount_blk);
926 log->l_last_sync_lsn = 925 log->l_last_sync_lsn =
927 xlog_assign_lsn(log->l_curr_cycle, 926 xlog_assign_lsn(log->l_curr_cycle,
928 after_umount_blk); 927 after_umount_blk);
929 *tail_blk = after_umount_blk; 928 *tail_blk = after_umount_blk;
930 929
931 /* 930 /*
932 * Note that the unmount was clean. If the unmount 931 * Note that the unmount was clean. If the unmount
933 * was not clean, we need to know this to rebuild the 932 * was not clean, we need to know this to rebuild the
934 * superblock counters from the perag headers if we 933 * superblock counters from the perag headers if we
935 * have a filesystem using non-persistent counters. 934 * have a filesystem using non-persistent counters.
936 */ 935 */
937 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; 936 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
938 } 937 }
939 } 938 }
940 939
941 /* 940 /*
942 * Make sure that there are no blocks in front of the head 941 * Make sure that there are no blocks in front of the head
943 * with the same cycle number as the head. This can happen 942 * with the same cycle number as the head. This can happen
944 * because we allow multiple outstanding log writes concurrently, 943 * because we allow multiple outstanding log writes concurrently,
945 * and the later writes might make it out before earlier ones. 944 * and the later writes might make it out before earlier ones.
946 * 945 *
947 * We use the lsn from before modifying it so that we'll never 946 * We use the lsn from before modifying it so that we'll never
948 * overwrite the unmount record after a clean unmount. 947 * overwrite the unmount record after a clean unmount.
949 * 948 *
950 * Do this only if we are going to recover the filesystem 949 * Do this only if we are going to recover the filesystem
951 * 950 *
952 * NOTE: This used to say "if (!readonly)" 951 * NOTE: This used to say "if (!readonly)"
953 * However on Linux, we can & do recover a read-only filesystem. 952 * However on Linux, we can & do recover a read-only filesystem.
954 * We only skip recovery if NORECOVERY is specified on mount, 953 * We only skip recovery if NORECOVERY is specified on mount,
955 * in which case we would not be here. 954 * in which case we would not be here.
956 * 955 *
957 * But... if the -device- itself is readonly, just skip this. 956 * But... if the -device- itself is readonly, just skip this.
958 * We can't recover this device anyway, so it won't matter. 957 * We can't recover this device anyway, so it won't matter.
959 */ 958 */
960 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 959 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
961 error = xlog_clear_stale_blocks(log, tail_lsn); 960 error = xlog_clear_stale_blocks(log, tail_lsn);
962 } 961 }
963 962
964 bread_err: 963 bread_err:
965 exit: 964 exit:
966 xlog_put_bp(bp); 965 xlog_put_bp(bp);
967 966
968 if (error) 967 if (error)
969 xlog_warn("XFS: failed to locate log tail"); 968 xlog_warn("XFS: failed to locate log tail");
970 return error; 969 return error;
971 } 970 }
972 971
973 /* 972 /*
974 * Is the log zeroed at all? 973 * Is the log zeroed at all?
975 * 974 *
976 * The last binary search should be changed to perform an X block read 975 * The last binary search should be changed to perform an X block read
977 * once X becomes small enough. You can then search linearly through 976 * once X becomes small enough. You can then search linearly through
978 * the X blocks. This will cut down on the number of reads we need to do. 977 * the X blocks. This will cut down on the number of reads we need to do.
979 * 978 *
980 * If the log is partially zeroed, this routine will pass back the blkno 979 * If the log is partially zeroed, this routine will pass back the blkno
981 * of the first block with cycle number 0. It won't have a complete LR 980 * of the first block with cycle number 0. It won't have a complete LR
982 * preceding it. 981 * preceding it.
983 * 982 *
984 * Return: 983 * Return:
985 * 0 => the log is completely written to 984 * 0 => the log is completely written to
986 * -1 => use *blk_no as the first block of the log 985 * -1 => use *blk_no as the first block of the log
987 * >0 => error has occurred 986 * >0 => error has occurred
988 */ 987 */
989 STATIC int 988 STATIC int
990 xlog_find_zeroed( 989 xlog_find_zeroed(
991 xlog_t *log, 990 xlog_t *log,
992 xfs_daddr_t *blk_no) 991 xfs_daddr_t *blk_no)
993 { 992 {
994 xfs_buf_t *bp; 993 xfs_buf_t *bp;
995 xfs_caddr_t offset; 994 xfs_caddr_t offset;
996 uint first_cycle, last_cycle; 995 uint first_cycle, last_cycle;
997 xfs_daddr_t new_blk, last_blk, start_blk; 996 xfs_daddr_t new_blk, last_blk, start_blk;
998 xfs_daddr_t num_scan_bblks; 997 xfs_daddr_t num_scan_bblks;
999 int error, log_bbnum = log->l_logBBsize; 998 int error, log_bbnum = log->l_logBBsize;
1000 999
1001 *blk_no = 0; 1000 *blk_no = 0;
1002 1001
1003 /* check totally zeroed log */ 1002 /* check totally zeroed log */
1004 bp = xlog_get_bp(log, 1); 1003 bp = xlog_get_bp(log, 1);
1005 if (!bp) 1004 if (!bp)
1006 return ENOMEM; 1005 return ENOMEM;
1007 if ((error = xlog_bread(log, 0, 1, bp))) 1006 if ((error = xlog_bread(log, 0, 1, bp)))
1008 goto bp_err; 1007 goto bp_err;
1009 offset = xlog_align(log, 0, 1, bp); 1008 offset = xlog_align(log, 0, 1, bp);
1010 first_cycle = xlog_get_cycle(offset); 1009 first_cycle = xlog_get_cycle(offset);
1011 if (first_cycle == 0) { /* completely zeroed log */ 1010 if (first_cycle == 0) { /* completely zeroed log */
1012 *blk_no = 0; 1011 *blk_no = 0;
1013 xlog_put_bp(bp); 1012 xlog_put_bp(bp);
1014 return -1; 1013 return -1;
1015 } 1014 }
1016 1015
1017 /* check partially zeroed log */ 1016 /* check partially zeroed log */
1018 if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) 1017 if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
1019 goto bp_err; 1018 goto bp_err;
1020 offset = xlog_align(log, log_bbnum-1, 1, bp); 1019 offset = xlog_align(log, log_bbnum-1, 1, bp);
1021 last_cycle = xlog_get_cycle(offset); 1020 last_cycle = xlog_get_cycle(offset);
1022 if (last_cycle != 0) { /* log completely written to */ 1021 if (last_cycle != 0) { /* log completely written to */
1023 xlog_put_bp(bp); 1022 xlog_put_bp(bp);
1024 return 0; 1023 return 0;
1025 } else if (first_cycle != 1) { 1024 } else if (first_cycle != 1) {
1026 /* 1025 /*
1027 * If the cycle of the last block is zero, the cycle of 1026 * If the cycle of the last block is zero, the cycle of
1028 * the first block must be 1. If it's not, maybe we're 1027 * the first block must be 1. If it's not, maybe we're
1029 * not looking at a log... Bail out. 1028 * not looking at a log... Bail out.
1030 */ 1029 */
1031 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1030 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
1032 return XFS_ERROR(EINVAL); 1031 return XFS_ERROR(EINVAL);
1033 } 1032 }
1034 1033
1035 /* we have a partially zeroed log */ 1034 /* we have a partially zeroed log */
1036 last_blk = log_bbnum-1; 1035 last_blk = log_bbnum-1;
1037 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) 1036 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1038 goto bp_err; 1037 goto bp_err;
1039 1038
1040 /* 1039 /*
1041 * Validate the answer. Because there is no way to guarantee that 1040 * Validate the answer. Because there is no way to guarantee that
1042 * the entire log is made up of log records which are the same size, 1041 * the entire log is made up of log records which are the same size,
1043 * we scan over the defined maximum blocks. At this point, the maximum 1042 * we scan over the defined maximum blocks. At this point, the maximum
1044 * is not chosen to mean anything special. XXXmiken 1043 * is not chosen to mean anything special. XXXmiken
1045 */ 1044 */
1046 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 1045 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1047 ASSERT(num_scan_bblks <= INT_MAX); 1046 ASSERT(num_scan_bblks <= INT_MAX);
1048 1047
1049 if (last_blk < num_scan_bblks) 1048 if (last_blk < num_scan_bblks)
1050 num_scan_bblks = last_blk; 1049 num_scan_bblks = last_blk;
1051 start_blk = last_blk - num_scan_bblks; 1050 start_blk = last_blk - num_scan_bblks;
1052 1051
1053 /* 1052 /*
1054 * We search for any instances of cycle number 0 that occur before 1053 * We search for any instances of cycle number 0 that occur before
1055 * our current estimate of the head. What we're trying to detect is 1054 * our current estimate of the head. What we're trying to detect is
1056 * 1 ... | 0 | 1 | 0... 1055 * 1 ... | 0 | 1 | 0...
1057 * ^ binary search ends here 1056 * ^ binary search ends here
1058 */ 1057 */
1059 if ((error = xlog_find_verify_cycle(log, start_blk, 1058 if ((error = xlog_find_verify_cycle(log, start_blk,
1060 (int)num_scan_bblks, 0, &new_blk))) 1059 (int)num_scan_bblks, 0, &new_blk)))
1061 goto bp_err; 1060 goto bp_err;
1062 if (new_blk != -1) 1061 if (new_blk != -1)
1063 last_blk = new_blk; 1062 last_blk = new_blk;
1064 1063
1065 /* 1064 /*
1066 * Potentially backup over partial log record write. We don't need 1065 * Potentially backup over partial log record write. We don't need
1067 * to search the end of the log because we know it is zero. 1066 * to search the end of the log because we know it is zero.
1068 */ 1067 */
1069 if ((error = xlog_find_verify_log_record(log, start_blk, 1068 if ((error = xlog_find_verify_log_record(log, start_blk,
1070 &last_blk, 0)) == -1) { 1069 &last_blk, 0)) == -1) {
1071 error = XFS_ERROR(EIO); 1070 error = XFS_ERROR(EIO);
1072 goto bp_err; 1071 goto bp_err;
1073 } else if (error) 1072 } else if (error)
1074 goto bp_err; 1073 goto bp_err;
1075 1074
1076 *blk_no = last_blk; 1075 *blk_no = last_blk;
1077 bp_err: 1076 bp_err:
1078 xlog_put_bp(bp); 1077 xlog_put_bp(bp);
1079 if (error) 1078 if (error)
1080 return error; 1079 return error;
1081 return -1; 1080 return -1;
1082 } 1081 }
1083 1082
1084 /* 1083 /*
1085 * These are simple subroutines used by xlog_clear_stale_blocks() below 1084 * These are simple subroutines used by xlog_clear_stale_blocks() below
1086 * to initialize a buffer full of empty log record headers and write 1085 * to initialize a buffer full of empty log record headers and write
1087 * them into the log. 1086 * them into the log.
1088 */ 1087 */
1089 STATIC void 1088 STATIC void
1090 xlog_add_record( 1089 xlog_add_record(
1091 xlog_t *log, 1090 xlog_t *log,
1092 xfs_caddr_t buf, 1091 xfs_caddr_t buf,
1093 int cycle, 1092 int cycle,
1094 int block, 1093 int block,
1095 int tail_cycle, 1094 int tail_cycle,
1096 int tail_block) 1095 int tail_block)
1097 { 1096 {
1098 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; 1097 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1099 1098
1100 memset(buf, 0, BBSIZE); 1099 memset(buf, 0, BBSIZE);
1101 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1100 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1102 recp->h_cycle = cpu_to_be32(cycle); 1101 recp->h_cycle = cpu_to_be32(cycle);
1103 recp->h_version = cpu_to_be32( 1102 recp->h_version = cpu_to_be32(
1104 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); 1103 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1105 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); 1104 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1106 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); 1105 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1107 recp->h_fmt = cpu_to_be32(XLOG_FMT); 1106 recp->h_fmt = cpu_to_be32(XLOG_FMT);
1108 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); 1107 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1109 } 1108 }
1110 1109
1111 STATIC int 1110 STATIC int
1112 xlog_write_log_records( 1111 xlog_write_log_records(
1113 xlog_t *log, 1112 xlog_t *log,
1114 int cycle, 1113 int cycle,
1115 int start_block, 1114 int start_block,
1116 int blocks, 1115 int blocks,
1117 int tail_cycle, 1116 int tail_cycle,
1118 int tail_block) 1117 int tail_block)
1119 { 1118 {
1120 xfs_caddr_t offset; 1119 xfs_caddr_t offset;
1121 xfs_buf_t *bp; 1120 xfs_buf_t *bp;
1122 int balign, ealign; 1121 int balign, ealign;
1123 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1122 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
1124 int end_block = start_block + blocks; 1123 int end_block = start_block + blocks;
1125 int bufblks; 1124 int bufblks;
1126 int error = 0; 1125 int error = 0;
1127 int i, j = 0; 1126 int i, j = 0;
1128 1127
1129 bufblks = 1 << ffs(blocks); 1128 bufblks = 1 << ffs(blocks);
1130 while (!(bp = xlog_get_bp(log, bufblks))) { 1129 while (!(bp = xlog_get_bp(log, bufblks))) {
1131 bufblks >>= 1; 1130 bufblks >>= 1;
1132 if (bufblks <= log->l_sectbb_log) 1131 if (bufblks <= log->l_sectbb_log)
1133 return ENOMEM; 1132 return ENOMEM;
1134 } 1133 }
1135 1134
1136 /* We may need to do a read at the start to fill in part of 1135 /* We may need to do a read at the start to fill in part of
1137 * the buffer in the starting sector not covered by the first 1136 * the buffer in the starting sector not covered by the first
1138 * write below. 1137 * write below.
1139 */ 1138 */
1140 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1139 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1141 if (balign != start_block) { 1140 if (balign != start_block) {
1142 if ((error = xlog_bread(log, start_block, 1, bp))) { 1141 if ((error = xlog_bread(log, start_block, 1, bp))) {
1143 xlog_put_bp(bp); 1142 xlog_put_bp(bp);
1144 return error; 1143 return error;
1145 } 1144 }
1146 j = start_block - balign; 1145 j = start_block - balign;
1147 } 1146 }
1148 1147
1149 for (i = start_block; i < end_block; i += bufblks) { 1148 for (i = start_block; i < end_block; i += bufblks) {
1150 int bcount, endcount; 1149 int bcount, endcount;
1151 1150
1152 bcount = min(bufblks, end_block - start_block); 1151 bcount = min(bufblks, end_block - start_block);
1153 endcount = bcount - j; 1152 endcount = bcount - j;
1154 1153
1155 /* We may need to do a read at the end to fill in part of 1154 /* We may need to do a read at the end to fill in part of
1156 * the buffer in the final sector not covered by the write. 1155 * the buffer in the final sector not covered by the write.
1157 * If this is the same sector as the above read, skip it. 1156 * If this is the same sector as the above read, skip it.
1158 */ 1157 */
1159 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1158 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
1160 if (j == 0 && (start_block + endcount > ealign)) { 1159 if (j == 0 && (start_block + endcount > ealign)) {
1161 offset = XFS_BUF_PTR(bp); 1160 offset = XFS_BUF_PTR(bp);
1162 balign = BBTOB(ealign - start_block); 1161 balign = BBTOB(ealign - start_block);
1163 error = XFS_BUF_SET_PTR(bp, offset + balign, 1162 error = XFS_BUF_SET_PTR(bp, offset + balign,
1164 BBTOB(sectbb)); 1163 BBTOB(sectbb));
1165 if (!error) 1164 if (!error)
1166 error = xlog_bread(log, ealign, sectbb, bp); 1165 error = xlog_bread(log, ealign, sectbb, bp);
1167 if (!error) 1166 if (!error)
1168 error = XFS_BUF_SET_PTR(bp, offset, bufblks); 1167 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1169 if (error) 1168 if (error)
1170 break; 1169 break;
1171 } 1170 }
1172 1171
1173 offset = xlog_align(log, start_block, endcount, bp); 1172 offset = xlog_align(log, start_block, endcount, bp);
1174 for (; j < endcount; j++) { 1173 for (; j < endcount; j++) {
1175 xlog_add_record(log, offset, cycle, i+j, 1174 xlog_add_record(log, offset, cycle, i+j,
1176 tail_cycle, tail_block); 1175 tail_cycle, tail_block);
1177 offset += BBSIZE; 1176 offset += BBSIZE;
1178 } 1177 }
1179 error = xlog_bwrite(log, start_block, endcount, bp); 1178 error = xlog_bwrite(log, start_block, endcount, bp);
1180 if (error) 1179 if (error)
1181 break; 1180 break;
1182 start_block += endcount; 1181 start_block += endcount;
1183 j = 0; 1182 j = 0;
1184 } 1183 }
1185 xlog_put_bp(bp); 1184 xlog_put_bp(bp);
1186 return error; 1185 return error;
1187 } 1186 }
1188 1187
1189 /* 1188 /*
1190 * This routine is called to blow away any incomplete log writes out 1189 * This routine is called to blow away any incomplete log writes out
1191 * in front of the log head. We do this so that we won't become confused 1190 * in front of the log head. We do this so that we won't become confused
1192 * if we come up, write only a little bit more, and then crash again. 1191 * if we come up, write only a little bit more, and then crash again.
1193 * If we leave the partial log records out there, this situation could 1192 * If we leave the partial log records out there, this situation could
1194 * cause us to think those partial writes are valid blocks since they 1193 * cause us to think those partial writes are valid blocks since they
1195 * have the current cycle number. We get rid of them by overwriting them 1194 * have the current cycle number. We get rid of them by overwriting them
1196 * with empty log records with the old cycle number rather than the 1195 * with empty log records with the old cycle number rather than the
1197 * current one. 1196 * current one.
1198 * 1197 *
1199 * The tail lsn is passed in rather than taken from 1198 * The tail lsn is passed in rather than taken from
1200 * the log so that we will not write over the unmount record after a 1199 * the log so that we will not write over the unmount record after a
1201 * clean unmount in a 512 block log. Doing so would leave the log without 1200 * clean unmount in a 512 block log. Doing so would leave the log without
1202 * any valid log records in it until a new one was written. If we crashed 1201 * any valid log records in it until a new one was written. If we crashed
1203 * during that time we would not be able to recover. 1202 * during that time we would not be able to recover.
1204 */ 1203 */
STATIC int
xlog_clear_stale_blocks(
	xlog_t		*log,
	xfs_lsn_t	tail_lsn)
{
	int		tail_cycle, head_cycle;
	int		tail_block, head_block;
	int		tail_distance, max_distance;
	int		distance;
	int		error;

	/* Decompose the caller's tail LSN and pick up the in-core head. */
	tail_cycle = CYCLE_LSN(tail_lsn);
	tail_block = BLOCK_LSN(tail_lsn);
	head_cycle = log->l_curr_cycle;
	head_block = log->l_curr_block;

	/*
	 * Figure out the distance between the new head of the log
	 * and the tail.  We want to write over any blocks beyond the
	 * head that we may have written just before the crash, but
	 * we don't want to overwrite the tail of the log.
	 */
	if (head_cycle == tail_cycle) {
		/*
		 * The tail is behind the head in the physical log,
		 * so the distance from the head to the tail is the
		 * distance from the head to the end of the log plus
		 * the distance from the beginning of the log to the
		 * tail.
		 *
		 * Same cycle implies the head must lie at or beyond the
		 * tail and inside the log; anything else means the log
		 * metadata is corrupt.
		 */
		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
					 XFS_ERRLEVEL_LOW, log->l_mp);
			return XFS_ERROR(EFSCORRUPTED);
		}
		tail_distance = tail_block + (log->l_logBBsize - head_block);
	} else {
		/*
		 * The head is behind the tail in the physical log,
		 * so the distance from the head to the tail is just
		 * the tail block minus the head block.
		 *
		 * Differing cycles can only differ by exactly one, with
		 * the head physically before the tail; anything else is
		 * corruption.
		 */
		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
					 XFS_ERRLEVEL_LOW, log->l_mp);
			return XFS_ERROR(EFSCORRUPTED);
		}
		tail_distance = tail_block - head_block;
	}

	/*
	 * If the head is right up against the tail, we can't clear
	 * anything.
	 */
	if (tail_distance <= 0) {
		ASSERT(tail_distance == 0);
		return 0;
	}

	max_distance = XLOG_TOTAL_REC_SHIFT(log);
	/*
	 * Take the smaller of the maximum amount of outstanding I/O
	 * we could have and the distance to the tail to clear out.
	 * We take the smaller so that we don't overwrite the tail and
	 * we don't waste all day writing from the head to the tail
	 * for no reason.
	 */
	max_distance = MIN(max_distance, tail_distance);

	if ((head_block + max_distance) <= log->l_logBBsize) {
		/*
		 * We can stomp all the blocks we need to without
		 * wrapping around the end of the log.  Just do it
		 * in a single write.  Use the cycle number of the
		 * current cycle minus one so that the log will look like:
		 *     n ... | n - 1 ...
		 */
		error = xlog_write_log_records(log, (head_cycle - 1),
				head_block, max_distance, tail_cycle,
				tail_block);
		if (error)
			return error;
	} else {
		/*
		 * We need to wrap around the end of the physical log in
		 * order to clear all the blocks.  Do it in two separate
		 * I/Os.  The first write should be from the head to the
		 * end of the physical log, and it should use the current
		 * cycle number minus one just like above.
		 */
		distance = log->l_logBBsize - head_block;
		error = xlog_write_log_records(log, (head_cycle - 1),
				head_block, distance, tail_cycle,
				tail_block);

		if (error)
			return error;

		/*
		 * Now write the blocks at the start of the physical log.
		 * This writes the remainder of the blocks we want to clear.
		 * It uses the current cycle number since we're now on the
		 * same cycle as the head so that we get:
		 *    n ... n ... | n - 1 ...
		 *    ^^^^^ blocks we're writing
		 */
		distance = max_distance - (log->l_logBBsize - head_block);
		error = xlog_write_log_records(log, head_cycle, 0, distance,
				tail_cycle, tail_block);
		if (error)
			return error;
	}

	return 0;
}
1320 1319
1321 /****************************************************************************** 1320 /******************************************************************************
1322 * 1321 *
1323 * Log recover routines 1322 * Log recover routines
1324 * 1323 *
1325 ****************************************************************************** 1324 ******************************************************************************
1326 */ 1325 */
1327 1326
1328 STATIC xlog_recover_t * 1327 STATIC xlog_recover_t *
1329 xlog_recover_find_tid( 1328 xlog_recover_find_tid(
1330 xlog_recover_t *q, 1329 xlog_recover_t *q,
1331 xlog_tid_t tid) 1330 xlog_tid_t tid)
1332 { 1331 {
1333 xlog_recover_t *p = q; 1332 xlog_recover_t *p = q;
1334 1333
1335 while (p != NULL) { 1334 while (p != NULL) {
1336 if (p->r_log_tid == tid) 1335 if (p->r_log_tid == tid)
1337 break; 1336 break;
1338 p = p->r_next; 1337 p = p->r_next;
1339 } 1338 }
1340 return p; 1339 return p;
1341 } 1340 }
1342 1341
1343 STATIC void 1342 STATIC void
1344 xlog_recover_put_hashq( 1343 xlog_recover_put_hashq(
1345 xlog_recover_t **q, 1344 xlog_recover_t **q,
1346 xlog_recover_t *trans) 1345 xlog_recover_t *trans)
1347 { 1346 {
1348 trans->r_next = *q; 1347 trans->r_next = *q;
1349 *q = trans; 1348 *q = trans;
1350 } 1349 }
1351 1350
1352 STATIC void 1351 STATIC void
1353 xlog_recover_add_item( 1352 xlog_recover_add_item(
1354 xlog_recover_item_t **itemq) 1353 xlog_recover_item_t **itemq)
1355 { 1354 {
1356 xlog_recover_item_t *item; 1355 xlog_recover_item_t *item;
1357 1356
1358 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1357 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1359 xlog_recover_insert_item_backq(itemq, item); 1358 xlog_recover_insert_item_backq(itemq, item);
1360 } 1359 }
1361 1360
1362 STATIC int 1361 STATIC int
1363 xlog_recover_add_to_cont_trans( 1362 xlog_recover_add_to_cont_trans(
1364 xlog_recover_t *trans, 1363 xlog_recover_t *trans,
1365 xfs_caddr_t dp, 1364 xfs_caddr_t dp,
1366 int len) 1365 int len)
1367 { 1366 {
1368 xlog_recover_item_t *item; 1367 xlog_recover_item_t *item;
1369 xfs_caddr_t ptr, old_ptr; 1368 xfs_caddr_t ptr, old_ptr;
1370 int old_len; 1369 int old_len;
1371 1370
1372 item = trans->r_itemq; 1371 item = trans->r_itemq;
1373 if (item == NULL) { 1372 if (item == NULL) {
1374 /* finish copying rest of trans header */ 1373 /* finish copying rest of trans header */
1375 xlog_recover_add_item(&trans->r_itemq); 1374 xlog_recover_add_item(&trans->r_itemq);
1376 ptr = (xfs_caddr_t) &trans->r_theader + 1375 ptr = (xfs_caddr_t) &trans->r_theader +
1377 sizeof(xfs_trans_header_t) - len; 1376 sizeof(xfs_trans_header_t) - len;
1378 memcpy(ptr, dp, len); /* d, s, l */ 1377 memcpy(ptr, dp, len); /* d, s, l */
1379 return 0; 1378 return 0;
1380 } 1379 }
1381 item = item->ri_prev; 1380 item = item->ri_prev;
1382 1381
1383 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1382 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1384 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1383 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1385 1384
1386 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); 1385 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
1387 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1386 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1388 item->ri_buf[item->ri_cnt-1].i_len += len; 1387 item->ri_buf[item->ri_cnt-1].i_len += len;
1389 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1388 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1390 return 0; 1389 return 0;
1391 } 1390 }
1392 1391
1393 /* 1392 /*
1394 * The next region to add is the start of a new region. It could be 1393 * The next region to add is the start of a new region. It could be
1395 * a whole region or it could be the first part of a new region. Because 1394 * a whole region or it could be the first part of a new region. Because
1396 * of this, the assumption here is that the type and size fields of all 1395 * of this, the assumption here is that the type and size fields of all
1397 * format structures fit into the first 32 bits of the structure. 1396 * format structures fit into the first 32 bits of the structure.
1398 * 1397 *
1399 * This works because all regions must be 32 bit aligned. Therefore, we 1398 * This works because all regions must be 32 bit aligned. Therefore, we
1400 * either have both fields or we have neither field. In the case we have 1399 * either have both fields or we have neither field. In the case we have
1401 * neither field, the data part of the region is zero length. We only have 1400 * neither field, the data part of the region is zero length. We only have
1402 * a log_op_header and can throw away the header since a new one will appear 1401 * a log_op_header and can throw away the header since a new one will appear
1403 * later. If we have at least 4 bytes, then we can determine how many regions 1402 * later. If we have at least 4 bytes, then we can determine how many regions
1404 * will appear in the current log item. 1403 * will appear in the current log item.
1405 */ 1404 */
1406 STATIC int 1405 STATIC int
1407 xlog_recover_add_to_trans( 1406 xlog_recover_add_to_trans(
1408 xlog_recover_t *trans, 1407 xlog_recover_t *trans,
1409 xfs_caddr_t dp, 1408 xfs_caddr_t dp,
1410 int len) 1409 int len)
1411 { 1410 {
1412 xfs_inode_log_format_t *in_f; /* any will do */ 1411 xfs_inode_log_format_t *in_f; /* any will do */
1413 xlog_recover_item_t *item; 1412 xlog_recover_item_t *item;
1414 xfs_caddr_t ptr; 1413 xfs_caddr_t ptr;
1415 1414
1416 if (!len) 1415 if (!len)
1417 return 0; 1416 return 0;
1418 item = trans->r_itemq; 1417 item = trans->r_itemq;
1419 if (item == NULL) { 1418 if (item == NULL) {
1420 /* we need to catch log corruptions here */ 1419 /* we need to catch log corruptions here */
1421 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1420 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1422 xlog_warn("XFS: xlog_recover_add_to_trans: " 1421 xlog_warn("XFS: xlog_recover_add_to_trans: "
1423 "bad header magic number"); 1422 "bad header magic number");
1424 ASSERT(0); 1423 ASSERT(0);
1425 return XFS_ERROR(EIO); 1424 return XFS_ERROR(EIO);
1426 } 1425 }
1427 if (len == sizeof(xfs_trans_header_t)) 1426 if (len == sizeof(xfs_trans_header_t))
1428 xlog_recover_add_item(&trans->r_itemq); 1427 xlog_recover_add_item(&trans->r_itemq);
1429 memcpy(&trans->r_theader, dp, len); /* d, s, l */ 1428 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1430 return 0; 1429 return 0;
1431 } 1430 }
1432 1431
1433 ptr = kmem_alloc(len, KM_SLEEP); 1432 ptr = kmem_alloc(len, KM_SLEEP);
1434 memcpy(ptr, dp, len); 1433 memcpy(ptr, dp, len);
1435 in_f = (xfs_inode_log_format_t *)ptr; 1434 in_f = (xfs_inode_log_format_t *)ptr;
1436 1435
1437 if (item->ri_prev->ri_total != 0 && 1436 if (item->ri_prev->ri_total != 0 &&
1438 item->ri_prev->ri_total == item->ri_prev->ri_cnt) { 1437 item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
1439 xlog_recover_add_item(&trans->r_itemq); 1438 xlog_recover_add_item(&trans->r_itemq);
1440 } 1439 }
1441 item = trans->r_itemq; 1440 item = trans->r_itemq;
1442 item = item->ri_prev; 1441 item = item->ri_prev;
1443 1442
1444 if (item->ri_total == 0) { /* first region to be added */ 1443 if (item->ri_total == 0) { /* first region to be added */
1445 item->ri_total = in_f->ilf_size; 1444 item->ri_total = in_f->ilf_size;
1446 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); 1445 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
1447 item->ri_buf = kmem_zalloc((item->ri_total * 1446 item->ri_buf = kmem_zalloc((item->ri_total *
1448 sizeof(xfs_log_iovec_t)), KM_SLEEP); 1447 sizeof(xfs_log_iovec_t)), KM_SLEEP);
1449 } 1448 }
1450 ASSERT(item->ri_total > item->ri_cnt); 1449 ASSERT(item->ri_total > item->ri_cnt);
1451 /* Description region is ri_buf[0] */ 1450 /* Description region is ri_buf[0] */
1452 item->ri_buf[item->ri_cnt].i_addr = ptr; 1451 item->ri_buf[item->ri_cnt].i_addr = ptr;
1453 item->ri_buf[item->ri_cnt].i_len = len; 1452 item->ri_buf[item->ri_cnt].i_len = len;
1454 item->ri_cnt++; 1453 item->ri_cnt++;
1455 return 0; 1454 return 0;
1456 } 1455 }
1457 1456
1458 STATIC void 1457 STATIC void
1459 xlog_recover_new_tid( 1458 xlog_recover_new_tid(
1460 xlog_recover_t **q, 1459 xlog_recover_t **q,
1461 xlog_tid_t tid, 1460 xlog_tid_t tid,
1462 xfs_lsn_t lsn) 1461 xfs_lsn_t lsn)
1463 { 1462 {
1464 xlog_recover_t *trans; 1463 xlog_recover_t *trans;
1465 1464
1466 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); 1465 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1467 trans->r_log_tid = tid; 1466 trans->r_log_tid = tid;
1468 trans->r_lsn = lsn; 1467 trans->r_lsn = lsn;
1469 xlog_recover_put_hashq(q, trans); 1468 xlog_recover_put_hashq(q, trans);
1470 } 1469 }
1471 1470
1472 STATIC int 1471 STATIC int
1473 xlog_recover_unlink_tid( 1472 xlog_recover_unlink_tid(
1474 xlog_recover_t **q, 1473 xlog_recover_t **q,
1475 xlog_recover_t *trans) 1474 xlog_recover_t *trans)
1476 { 1475 {
1477 xlog_recover_t *tp; 1476 xlog_recover_t *tp;
1478 int found = 0; 1477 int found = 0;
1479 1478
1480 ASSERT(trans != NULL); 1479 ASSERT(trans != NULL);
1481 if (trans == *q) { 1480 if (trans == *q) {
1482 *q = (*q)->r_next; 1481 *q = (*q)->r_next;
1483 } else { 1482 } else {
1484 tp = *q; 1483 tp = *q;
1485 while (tp) { 1484 while (tp) {
1486 if (tp->r_next == trans) { 1485 if (tp->r_next == trans) {
1487 found = 1; 1486 found = 1;
1488 break; 1487 break;
1489 } 1488 }
1490 tp = tp->r_next; 1489 tp = tp->r_next;
1491 } 1490 }
1492 if (!found) { 1491 if (!found) {
1493 xlog_warn( 1492 xlog_warn(
1494 "XFS: xlog_recover_unlink_tid: trans not found"); 1493 "XFS: xlog_recover_unlink_tid: trans not found");
1495 ASSERT(0); 1494 ASSERT(0);
1496 return XFS_ERROR(EIO); 1495 return XFS_ERROR(EIO);
1497 } 1496 }
1498 tp->r_next = tp->r_next->r_next; 1497 tp->r_next = tp->r_next->r_next;
1499 } 1498 }
1500 return 0; 1499 return 0;
1501 } 1500 }
1502 1501
1503 STATIC void 1502 STATIC void
1504 xlog_recover_insert_item_backq( 1503 xlog_recover_insert_item_backq(
1505 xlog_recover_item_t **q, 1504 xlog_recover_item_t **q,
1506 xlog_recover_item_t *item) 1505 xlog_recover_item_t *item)
1507 { 1506 {
1508 if (*q == NULL) { 1507 if (*q == NULL) {
1509 item->ri_prev = item->ri_next = item; 1508 item->ri_prev = item->ri_next = item;
1510 *q = item; 1509 *q = item;
1511 } else { 1510 } else {
1512 item->ri_next = *q; 1511 item->ri_next = *q;
1513 item->ri_prev = (*q)->ri_prev; 1512 item->ri_prev = (*q)->ri_prev;
1514 (*q)->ri_prev = item; 1513 (*q)->ri_prev = item;
1515 item->ri_prev->ri_next = item; 1514 item->ri_prev->ri_next = item;
1516 } 1515 }
1517 } 1516 }
1518 1517
/*
 * Insert an item at the head of the circular recovery item queue: link
 * it in front of the current head (via the backq helper, which places it
 * between tail and head) and then advance the head pointer to it.
 */
STATIC void
xlog_recover_insert_item_frontq(
	xlog_recover_item_t	**q,
	xlog_recover_item_t	*item)
{
	xlog_recover_insert_item_backq(q, item);
	*q = item;
}
1527 1526
/*
 * Reorder the items of a recovery transaction so that non-cancelled
 * buffer items are replayed before everything else: the queue is rebuilt
 * in place, with such buffers inserted at the front and all other item
 * types appended at the back.  Returns 0 on success, EIO for an
 * unrecognised log item type.
 */
STATIC int
xlog_recover_reorder_trans(
	xlog_recover_t		*trans)
{
	xlog_recover_item_t	*first_item, *itemq, *itemq_next;
	xfs_buf_log_format_t	*buf_f;
	ushort			flags = 0;

	/* Detach the whole queue, then re-insert each item one by one. */
	first_item = itemq = trans->r_itemq;
	trans->r_itemq = NULL;
	do {
		itemq_next = itemq->ri_next;
		buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;

		switch (ITEM_TYPE(itemq)) {
		case XFS_LI_BUF:
			flags = buf_f->blf_flags;
			if (!(flags & XFS_BLI_CANCEL)) {
				xlog_recover_insert_item_frontq(&trans->r_itemq,
								itemq);
				break;
			}
			/*
			 * fallthrough: cancelled buffers are queued at the
			 * back with the other item types.
			 */
		case XFS_LI_INODE:
		case XFS_LI_DQUOT:
		case XFS_LI_QUOTAOFF:
		case XFS_LI_EFD:
		case XFS_LI_EFI:
			xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
			break;
		default:
			xlog_warn(
	"XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
			ASSERT(0);
			return XFS_ERROR(EIO);
		}
		itemq = itemq_next;
	} while (first_item != itemq);
	return 0;
}
1567 1566
1568 /* 1567 /*
1569 * Build up the table of buf cancel records so that we don't replay 1568 * Build up the table of buf cancel records so that we don't replay
1570 * cancelled data in the second pass. For buffer records that are 1569 * cancelled data in the second pass. For buffer records that are
1571 * not cancel records, there is nothing to do here so we just return. 1570 * not cancel records, there is nothing to do here so we just return.
1572 * 1571 *
1573 * If we get a cancel record which is already in the table, this indicates 1572 * If we get a cancel record which is already in the table, this indicates
1574 * that the buffer was cancelled multiple times. In order to ensure 1573 * that the buffer was cancelled multiple times. In order to ensure
1575 * that during pass 2 we keep the record in the table until we reach its 1574 * that during pass 2 we keep the record in the table until we reach its
1576 * last occurrence in the log, we keep a reference count in the cancel 1575 * last occurrence in the log, we keep a reference count in the cancel
1577 * record in the table to tell us how many times we expect to see this 1576 * record in the table to tell us how many times we expect to see this
1578 * record during the second pass. 1577 * record during the second pass.
1579 */ 1578 */
1580 STATIC void 1579 STATIC void
1581 xlog_recover_do_buffer_pass1( 1580 xlog_recover_do_buffer_pass1(
1582 xlog_t *log, 1581 xlog_t *log,
1583 xfs_buf_log_format_t *buf_f) 1582 xfs_buf_log_format_t *buf_f)
1584 { 1583 {
1585 xfs_buf_cancel_t *bcp; 1584 xfs_buf_cancel_t *bcp;
1586 xfs_buf_cancel_t *nextp; 1585 xfs_buf_cancel_t *nextp;
1587 xfs_buf_cancel_t *prevp; 1586 xfs_buf_cancel_t *prevp;
1588 xfs_buf_cancel_t **bucket; 1587 xfs_buf_cancel_t **bucket;
1589 xfs_daddr_t blkno = 0; 1588 xfs_daddr_t blkno = 0;
1590 uint len = 0; 1589 uint len = 0;
1591 ushort flags = 0; 1590 ushort flags = 0;
1592 1591
1593 switch (buf_f->blf_type) { 1592 switch (buf_f->blf_type) {
1594 case XFS_LI_BUF: 1593 case XFS_LI_BUF:
1595 blkno = buf_f->blf_blkno; 1594 blkno = buf_f->blf_blkno;
1596 len = buf_f->blf_len; 1595 len = buf_f->blf_len;
1597 flags = buf_f->blf_flags; 1596 flags = buf_f->blf_flags;
1598 break; 1597 break;
1599 } 1598 }
1600 1599
1601 /* 1600 /*
1602 * If this isn't a cancel buffer item, then just return. 1601 * If this isn't a cancel buffer item, then just return.
1603 */ 1602 */
1604 if (!(flags & XFS_BLI_CANCEL)) 1603 if (!(flags & XFS_BLI_CANCEL))
1605 return; 1604 return;
1606 1605
1607 /* 1606 /*
1608 * Insert an xfs_buf_cancel record into the hash table of 1607 * Insert an xfs_buf_cancel record into the hash table of
1609 * them. If there is already an identical record, bump 1608 * them. If there is already an identical record, bump
1610 * its reference count. 1609 * its reference count.
1611 */ 1610 */
1612 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % 1611 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1613 XLOG_BC_TABLE_SIZE]; 1612 XLOG_BC_TABLE_SIZE];
1614 /* 1613 /*
1615 * If the hash bucket is empty then just insert a new record into 1614 * If the hash bucket is empty then just insert a new record into
1616 * the bucket. 1615 * the bucket.
1617 */ 1616 */
1618 if (*bucket == NULL) { 1617 if (*bucket == NULL) {
1619 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1618 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1620 KM_SLEEP); 1619 KM_SLEEP);
1621 bcp->bc_blkno = blkno; 1620 bcp->bc_blkno = blkno;
1622 bcp->bc_len = len; 1621 bcp->bc_len = len;
1623 bcp->bc_refcount = 1; 1622 bcp->bc_refcount = 1;
1624 bcp->bc_next = NULL; 1623 bcp->bc_next = NULL;
1625 *bucket = bcp; 1624 *bucket = bcp;
1626 return; 1625 return;
1627 } 1626 }
1628 1627
1629 /* 1628 /*
1630 * The hash bucket is not empty, so search for duplicates of our 1629 * The hash bucket is not empty, so search for duplicates of our
1631 * record. If we find one them just bump its refcount. If not 1630 * record. If we find one them just bump its refcount. If not
1632 * then add us at the end of the list. 1631 * then add us at the end of the list.
1633 */ 1632 */
1634 prevp = NULL; 1633 prevp = NULL;
1635 nextp = *bucket; 1634 nextp = *bucket;
1636 while (nextp != NULL) { 1635 while (nextp != NULL) {
1637 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1636 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1638 nextp->bc_refcount++; 1637 nextp->bc_refcount++;
1639 return; 1638 return;
1640 } 1639 }
1641 prevp = nextp; 1640 prevp = nextp;
1642 nextp = nextp->bc_next; 1641 nextp = nextp->bc_next;
1643 } 1642 }
1644 ASSERT(prevp != NULL); 1643 ASSERT(prevp != NULL);
1645 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1644 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1646 KM_SLEEP); 1645 KM_SLEEP);
1647 bcp->bc_blkno = blkno; 1646 bcp->bc_blkno = blkno;
1648 bcp->bc_len = len; 1647 bcp->bc_len = len;
1649 bcp->bc_refcount = 1; 1648 bcp->bc_refcount = 1;
1650 bcp->bc_next = NULL; 1649 bcp->bc_next = NULL;
1651 prevp->bc_next = bcp; 1650 prevp->bc_next = bcp;
1652 } 1651 }
1653 1652
/*
 * Check to see whether the buffer being recovered has a corresponding
 * entry in the buffer cancel record table. If it does then return 1
 * so that it will be cancelled, otherwise return 0. If the buffer is
 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
 * the refcount on the entry in the table and remove it from the table
 * if this is the last reference.
 *
 * We remove the cancel record from the table when we encounter its
 * last occurrence in the log so that if the same buffer is re-used
 * again after its last cancellation we actually replay the changes
 * made at that point.
 */
STATIC int
xlog_check_buffer_cancelled(
	xlog_t			*log,
	xfs_daddr_t		blkno,
	uint			len,
	ushort			flags)
{
	xfs_buf_cancel_t	*bcp;
	xfs_buf_cancel_t	*prevp;
	xfs_buf_cancel_t	**bucket;

	if (log->l_buf_cancel_table == NULL) {
		/*
		 * There is nothing in the table built in pass one,
		 * so this buffer must not be cancelled.
		 */
		ASSERT(!(flags & XFS_BLI_CANCEL));
		return 0;
	}

	/* Hash to the same bucket used by xlog_recover_do_buffer_pass1. */
	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
						XLOG_BC_TABLE_SIZE];
	bcp = *bucket;
	if (bcp == NULL) {
		/*
		 * There is no corresponding entry in the table built
		 * in pass one, so this buffer has not been cancelled.
		 */
		ASSERT(!(flags & XFS_BLI_CANCEL));
		return 0;
	}

	/*
	 * Search for an entry in the buffer cancel table that
	 * matches our buffer.
	 */
	prevp = NULL;
	while (bcp != NULL) {
		if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
			/*
			 * We've go a match, so return 1 so that the
			 * recovery of this buffer is cancelled.
			 * If this buffer is actually a buffer cancel
			 * log item, then decrement the refcount on the
			 * one in the table and remove it if this is the
			 * last reference.
			 */
			if (flags & XFS_BLI_CANCEL) {
				bcp->bc_refcount--;
				if (bcp->bc_refcount == 0) {
					/* unlink from bucket head or middle */
					if (prevp == NULL) {
						*bucket = bcp->bc_next;
					} else {
						prevp->bc_next = bcp->bc_next;
					}
					kmem_free(bcp);
				}
			}
			return 1;
		}
		prevp = bcp;
		bcp = bcp->bc_next;
	}
	/*
	 * We didn't find a corresponding entry in the table, so
	 * return 0 so that the buffer is NOT cancelled.
	 */
	ASSERT(!(flags & XFS_BLI_CANCEL));
	return 0;
}
1737 1736
1738 STATIC int 1737 STATIC int
1739 xlog_recover_do_buffer_pass2( 1738 xlog_recover_do_buffer_pass2(
1740 xlog_t *log, 1739 xlog_t *log,
1741 xfs_buf_log_format_t *buf_f) 1740 xfs_buf_log_format_t *buf_f)
1742 { 1741 {
1743 xfs_daddr_t blkno = 0; 1742 xfs_daddr_t blkno = 0;
1744 ushort flags = 0; 1743 ushort flags = 0;
1745 uint len = 0; 1744 uint len = 0;
1746 1745
1747 switch (buf_f->blf_type) { 1746 switch (buf_f->blf_type) {
1748 case XFS_LI_BUF: 1747 case XFS_LI_BUF:
1749 blkno = buf_f->blf_blkno; 1748 blkno = buf_f->blf_blkno;
1750 flags = buf_f->blf_flags; 1749 flags = buf_f->blf_flags;
1751 len = buf_f->blf_len; 1750 len = buf_f->blf_len;
1752 break; 1751 break;
1753 } 1752 }
1754 1753
1755 return xlog_check_buffer_cancelled(log, blkno, len, flags); 1754 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1756 } 1755 }
1757 1756
/*
 * Perform recovery for a buffer full of inodes.  In these buffers,
 * the only data which should be recovered is that which corresponds
 * to the di_next_unlinked pointers in the on disk inode structures.
 * The rest of the data for the inodes is always logged through the
 * inodes themselves rather than the inode buffer and is recovered
 * in xlog_recover_do_inode_trans().
 *
 * The only time when buffers full of inodes are fully recovered is
 * when the buffer is full of newly allocated inodes.  In this case
 * the buffer will not be marked as an inode buffer and so will be
 * sent to xlog_recover_do_reg_buffer() below during recovery.
 */
STATIC int
xlog_recover_do_inode_buffer(
	xfs_mount_t		*mp,
	xlog_recover_item_t	*item,
	xfs_buf_t		*bp,
	xfs_buf_log_format_t	*buf_f)
{
	int			i;
	int			item_index;
	int			bit;
	int			nbits;
	int			reg_buf_offset;
	int			reg_buf_bytes;
	int			next_unlinked_offset;
	int			inodes_per_buf;
	xfs_agino_t		*logged_nextp;
	xfs_agino_t		*buffer_nextp;
	unsigned int		*data_map = NULL;
	unsigned int		map_size = 0;

	switch (buf_f->blf_type) {
	case XFS_LI_BUF:
		data_map = buf_f->blf_data_map;
		map_size = buf_f->blf_map_size;
		break;
	}
	/*
	 * Set the variables corresponding to the current region to
	 * 0 so that we'll initialize them on the first pass through
	 * the loop.
	 */
	reg_buf_offset = 0;
	reg_buf_bytes = 0;
	bit = 0;
	nbits = 0;
	item_index = 0;
	inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
	/* walk each inode slot in the buffer in ascending offset order */
	for (i = 0; i < inodes_per_buf; i++) {
		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
			offsetof(xfs_dinode_t, di_next_unlinked);

		while (next_unlinked_offset >=
		       (reg_buf_offset + reg_buf_bytes)) {
			/*
			 * The next di_next_unlinked field is beyond
			 * the current logged region. Find the next
			 * logged region that contains or is beyond
			 * the current di_next_unlinked field.
			 */
			bit += nbits;
			bit = xfs_next_bit(data_map, map_size, bit);

			/*
			 * If there are no more logged regions in the
			 * buffer, then we're done.
			 */
			if (bit == -1) {
				return 0;
			}

			/* each bitmap bit covers one XFS_BLI_CHUNK of data */
			nbits = xfs_contig_bits(data_map, map_size,
							bit);
			ASSERT(nbits > 0);
			reg_buf_offset = bit << XFS_BLI_SHIFT;
			reg_buf_bytes = nbits << XFS_BLI_SHIFT;
			item_index++;
		}

		/*
		 * If the current logged region starts after the current
		 * di_next_unlinked field, then move on to the next
		 * di_next_unlinked field.
		 */
		if (next_unlinked_offset < reg_buf_offset) {
			continue;
		}

		ASSERT(item->ri_buf[item_index].i_addr != NULL);
		ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));

		/*
		 * The current logged region contains a copy of the
		 * current di_next_unlinked field.  Extract its value
		 * and copy it to the buffer copy.
		 */
		logged_nextp = (xfs_agino_t *)
			       ((char *)(item->ri_buf[item_index].i_addr) +
				(next_unlinked_offset - reg_buf_offset));
		if (unlikely(*logged_nextp == 0)) {
			/* a zero next_unlinked is never valid on disk */
			xfs_fs_cmn_err(CE_ALERT, mp,
				"bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
				item, bp);
			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
					 XFS_ERRLEVEL_LOW, mp);
			return XFS_ERROR(EFSCORRUPTED);
		}

		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
					      next_unlinked_offset);
		*buffer_nextp = *logged_nextp;
	}

	return 0;
}
1876 1875
/*
 * Perform a 'normal' buffer recovery.  Each logged region of the
 * buffer should be copied over the corresponding region in the
 * given buffer.  The bitmap in the buf log format structure indicates
 * where to place the logged data.
 */
/*ARGSUSED*/
STATIC void
xlog_recover_do_reg_buffer(
	xlog_recover_item_t	*item,
	xfs_buf_t		*bp,
	xfs_buf_log_format_t	*buf_f)
{
	int			i;
	int			bit;
	int			nbits;
	unsigned int		*data_map = NULL;
	unsigned int		map_size = 0;
	int                     error;

	switch (buf_f->blf_type) {
	case XFS_LI_BUF:
		data_map = buf_f->blf_data_map;
		map_size = buf_f->blf_map_size;
		break;
	}
	bit = 0;
	i = 1;  /* 0 is the buf format structure */
	while (1) {
		/* each run of set bits in the map is one logged region */
		bit = xfs_next_bit(data_map, map_size, bit);
		if (bit == -1)
			break;
		nbits = xfs_contig_bits(data_map, map_size, bit);
		ASSERT(nbits > 0);
		ASSERT(item->ri_buf[i].i_addr != NULL);
		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
		ASSERT(XFS_BUF_COUNT(bp) >=
		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));

		/*
		 * Do a sanity check if this is a dquot buffer. Just checking
		 * the first dquot in the buffer should do. XXXThis is
		 * probably a good thing to do for other buf types also.
		 */
		error = 0;
		if (buf_f->blf_flags &
		   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
			error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
					       item->ri_buf[i].i_addr,
					       -1, 0, XFS_QMOPT_DOWARN,
					       "dquot_buf_recover");
		}
		/* skip the copy (but not the region) if the dquot is bad */
		if (!error)
			memcpy(xfs_buf_offset(bp,
				(uint)bit << XFS_BLI_SHIFT),	/* dest */
				item->ri_buf[i].i_addr,		/* source */
				nbits<<XFS_BLI_SHIFT);		/* length */
		i++;
		bit += nbits;
	}

	/* Shouldn't be any more regions */
	ASSERT(i == item->ri_total);
}
1941 1940
1942 /* 1941 /*
1943 * Do some primitive error checking on ondisk dquot data structures. 1942 * Do some primitive error checking on ondisk dquot data structures.
1944 */ 1943 */
1945 int 1944 int
1946 xfs_qm_dqcheck( 1945 xfs_qm_dqcheck(
1947 xfs_disk_dquot_t *ddq, 1946 xfs_disk_dquot_t *ddq,
1948 xfs_dqid_t id, 1947 xfs_dqid_t id,
1949 uint type, /* used only when IO_dorepair is true */ 1948 uint type, /* used only when IO_dorepair is true */
1950 uint flags, 1949 uint flags,
1951 char *str) 1950 char *str)
1952 { 1951 {
1953 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; 1952 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
1954 int errs = 0; 1953 int errs = 0;
1955 1954
1956 /* 1955 /*
1957 * We can encounter an uninitialized dquot buffer for 2 reasons: 1956 * We can encounter an uninitialized dquot buffer for 2 reasons:
1958 * 1. If we crash while deleting the quotainode(s), and those blks got 1957 * 1. If we crash while deleting the quotainode(s), and those blks got
1959 * used for user data. This is because we take the path of regular 1958 * used for user data. This is because we take the path of regular
1960 * file deletion; however, the size field of quotainodes is never 1959 * file deletion; however, the size field of quotainodes is never
1961 * updated, so all the tricks that we play in itruncate_finish 1960 * updated, so all the tricks that we play in itruncate_finish
1962 * don't quite matter. 1961 * don't quite matter.
1963 * 1962 *
1964 * 2. We don't play the quota buffers when there's a quotaoff logitem. 1963 * 2. We don't play the quota buffers when there's a quotaoff logitem.
1965 * But the allocation will be replayed so we'll end up with an 1964 * But the allocation will be replayed so we'll end up with an
1966 * uninitialized quota block. 1965 * uninitialized quota block.
1967 * 1966 *
1968 * This is all fine; things are still consistent, and we haven't lost 1967 * This is all fine; things are still consistent, and we haven't lost
1969 * any quota information. Just don't complain about bad dquot blks. 1968 * any quota information. Just don't complain about bad dquot blks.
1970 */ 1969 */
1971 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1970 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1972 if (flags & XFS_QMOPT_DOWARN) 1971 if (flags & XFS_QMOPT_DOWARN)
1973 cmn_err(CE_ALERT, 1972 cmn_err(CE_ALERT,
1974 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1973 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1975 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1974 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1976 errs++; 1975 errs++;
1977 } 1976 }
1978 if (ddq->d_version != XFS_DQUOT_VERSION) { 1977 if (ddq->d_version != XFS_DQUOT_VERSION) {
1979 if (flags & XFS_QMOPT_DOWARN) 1978 if (flags & XFS_QMOPT_DOWARN)
1980 cmn_err(CE_ALERT, 1979 cmn_err(CE_ALERT,
1981 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1980 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1982 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1981 str, id, ddq->d_version, XFS_DQUOT_VERSION);
1983 errs++; 1982 errs++;
1984 } 1983 }
1985 1984
1986 if (ddq->d_flags != XFS_DQ_USER && 1985 if (ddq->d_flags != XFS_DQ_USER &&
1987 ddq->d_flags != XFS_DQ_PROJ && 1986 ddq->d_flags != XFS_DQ_PROJ &&
1988 ddq->d_flags != XFS_DQ_GROUP) { 1987 ddq->d_flags != XFS_DQ_GROUP) {
1989 if (flags & XFS_QMOPT_DOWARN) 1988 if (flags & XFS_QMOPT_DOWARN)
1990 cmn_err(CE_ALERT, 1989 cmn_err(CE_ALERT,
1991 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1990 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1992 str, id, ddq->d_flags); 1991 str, id, ddq->d_flags);
1993 errs++; 1992 errs++;
1994 } 1993 }
1995 1994
1996 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1995 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1997 if (flags & XFS_QMOPT_DOWARN) 1996 if (flags & XFS_QMOPT_DOWARN)
1998 cmn_err(CE_ALERT, 1997 cmn_err(CE_ALERT,
1999 "%s : ondisk-dquot 0x%p, ID mismatch: " 1998 "%s : ondisk-dquot 0x%p, ID mismatch: "
2000 "0x%x expected, found id 0x%x", 1999 "0x%x expected, found id 0x%x",
2001 str, ddq, id, be32_to_cpu(ddq->d_id)); 2000 str, ddq, id, be32_to_cpu(ddq->d_id));
2002 errs++; 2001 errs++;
2003 } 2002 }
2004 2003
2005 if (!errs && ddq->d_id) { 2004 if (!errs && ddq->d_id) {
2006 if (ddq->d_blk_softlimit && 2005 if (ddq->d_blk_softlimit &&
2007 be64_to_cpu(ddq->d_bcount) >= 2006 be64_to_cpu(ddq->d_bcount) >=
2008 be64_to_cpu(ddq->d_blk_softlimit)) { 2007 be64_to_cpu(ddq->d_blk_softlimit)) {
2009 if (!ddq->d_btimer) { 2008 if (!ddq->d_btimer) {
2010 if (flags & XFS_QMOPT_DOWARN) 2009 if (flags & XFS_QMOPT_DOWARN)
2011 cmn_err(CE_ALERT, 2010 cmn_err(CE_ALERT,
2012 "%s : Dquot ID 0x%x (0x%p) " 2011 "%s : Dquot ID 0x%x (0x%p) "
2013 "BLK TIMER NOT STARTED", 2012 "BLK TIMER NOT STARTED",
2014 str, (int)be32_to_cpu(ddq->d_id), ddq); 2013 str, (int)be32_to_cpu(ddq->d_id), ddq);
2015 errs++; 2014 errs++;
2016 } 2015 }
2017 } 2016 }
2018 if (ddq->d_ino_softlimit && 2017 if (ddq->d_ino_softlimit &&
2019 be64_to_cpu(ddq->d_icount) >= 2018 be64_to_cpu(ddq->d_icount) >=
2020 be64_to_cpu(ddq->d_ino_softlimit)) { 2019 be64_to_cpu(ddq->d_ino_softlimit)) {
2021 if (!ddq->d_itimer) { 2020 if (!ddq->d_itimer) {
2022 if (flags & XFS_QMOPT_DOWARN) 2021 if (flags & XFS_QMOPT_DOWARN)
2023 cmn_err(CE_ALERT, 2022 cmn_err(CE_ALERT,
2024 "%s : Dquot ID 0x%x (0x%p) " 2023 "%s : Dquot ID 0x%x (0x%p) "
2025 "INODE TIMER NOT STARTED", 2024 "INODE TIMER NOT STARTED",
2026 str, (int)be32_to_cpu(ddq->d_id), ddq); 2025 str, (int)be32_to_cpu(ddq->d_id), ddq);
2027 errs++; 2026 errs++;
2028 } 2027 }
2029 } 2028 }
2030 if (ddq->d_rtb_softlimit && 2029 if (ddq->d_rtb_softlimit &&
2031 be64_to_cpu(ddq->d_rtbcount) >= 2030 be64_to_cpu(ddq->d_rtbcount) >=
2032 be64_to_cpu(ddq->d_rtb_softlimit)) { 2031 be64_to_cpu(ddq->d_rtb_softlimit)) {
2033 if (!ddq->d_rtbtimer) { 2032 if (!ddq->d_rtbtimer) {
2034 if (flags & XFS_QMOPT_DOWARN) 2033 if (flags & XFS_QMOPT_DOWARN)
2035 cmn_err(CE_ALERT, 2034 cmn_err(CE_ALERT,
2036 "%s : Dquot ID 0x%x (0x%p) " 2035 "%s : Dquot ID 0x%x (0x%p) "
2037 "RTBLK TIMER NOT STARTED", 2036 "RTBLK TIMER NOT STARTED",
2038 str, (int)be32_to_cpu(ddq->d_id), ddq); 2037 str, (int)be32_to_cpu(ddq->d_id), ddq);
2039 errs++; 2038 errs++;
2040 } 2039 }
2041 } 2040 }
2042 } 2041 }
2043 2042
2044 if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) 2043 if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2045 return errs; 2044 return errs;
2046 2045
2047 if (flags & XFS_QMOPT_DOWARN) 2046 if (flags & XFS_QMOPT_DOWARN)
2048 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2047 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2049 2048
2050 /* 2049 /*
2051 * Typically, a repair is only requested by quotacheck. 2050 * Typically, a repair is only requested by quotacheck.
2052 */ 2051 */
2053 ASSERT(id != -1); 2052 ASSERT(id != -1);
2054 ASSERT(flags & XFS_QMOPT_DQREPAIR); 2053 ASSERT(flags & XFS_QMOPT_DQREPAIR);
2055 memset(d, 0, sizeof(xfs_dqblk_t)); 2054 memset(d, 0, sizeof(xfs_dqblk_t));
2056 2055
2057 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 2056 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2058 d->dd_diskdq.d_version = XFS_DQUOT_VERSION; 2057 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2059 d->dd_diskdq.d_flags = type; 2058 d->dd_diskdq.d_flags = type;
2060 d->dd_diskdq.d_id = cpu_to_be32(id); 2059 d->dd_diskdq.d_id = cpu_to_be32(id);
2061 2060
2062 return errs; 2061 return errs;
2063 } 2062 }
2064 2063
2065 /* 2064 /*
2066 * Perform a dquot buffer recovery. 2065 * Perform a dquot buffer recovery.
2067 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type 2066 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2068 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2067 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2069 * Else, treat it as a regular buffer and do recovery. 2068 * Else, treat it as a regular buffer and do recovery.
2070 */ 2069 */
2071 STATIC void 2070 STATIC void
2072 xlog_recover_do_dquot_buffer( 2071 xlog_recover_do_dquot_buffer(
2073 xfs_mount_t *mp, 2072 xfs_mount_t *mp,
2074 xlog_t *log, 2073 xlog_t *log,
2075 xlog_recover_item_t *item, 2074 xlog_recover_item_t *item,
2076 xfs_buf_t *bp, 2075 xfs_buf_t *bp,
2077 xfs_buf_log_format_t *buf_f) 2076 xfs_buf_log_format_t *buf_f)
2078 { 2077 {
2079 uint type; 2078 uint type;
2080 2079
2081 /* 2080 /*
2082 * Filesystems are required to send in quota flags at mount time. 2081 * Filesystems are required to send in quota flags at mount time.
2083 */ 2082 */
2084 if (mp->m_qflags == 0) { 2083 if (mp->m_qflags == 0) {
2085 return; 2084 return;
2086 } 2085 }
2087 2086
2088 type = 0; 2087 type = 0;
2089 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2088 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2090 type |= XFS_DQ_USER; 2089 type |= XFS_DQ_USER;
2091 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2090 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2092 type |= XFS_DQ_PROJ; 2091 type |= XFS_DQ_PROJ;
2093 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2092 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2094 type |= XFS_DQ_GROUP; 2093 type |= XFS_DQ_GROUP;
2095 /* 2094 /*
2096 * This type of quotas was turned off, so ignore this buffer 2095 * This type of quotas was turned off, so ignore this buffer
2097 */ 2096 */
2098 if (log->l_quotaoffs_flag & type) 2097 if (log->l_quotaoffs_flag & type)
2099 return; 2098 return;
2100 2099
2101 xlog_recover_do_reg_buffer(item, bp, buf_f); 2100 xlog_recover_do_reg_buffer(item, bp, buf_f);
2102 } 2101 }
2103 2102
2104 /* 2103 /*
2105 * This routine replays a modification made to a buffer at runtime. 2104 * This routine replays a modification made to a buffer at runtime.
2106 * There are actually two types of buffer, regular and inode, which 2105 * There are actually two types of buffer, regular and inode, which
2107 * are handled differently. Inode buffers are handled differently 2106 * are handled differently. Inode buffers are handled differently
2108 * in that we only recover a specific set of data from them, namely 2107 * in that we only recover a specific set of data from them, namely
2109 * the inode di_next_unlinked fields. This is because all other inode 2108 * the inode di_next_unlinked fields. This is because all other inode
2110 * data is actually logged via inode records and any data we replay 2109 * data is actually logged via inode records and any data we replay
2111 * here which overlaps that may be stale. 2110 * here which overlaps that may be stale.
2112 * 2111 *
2113 * When meta-data buffers are freed at run time we log a buffer item 2112 * When meta-data buffers are freed at run time we log a buffer item
2114 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2113 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2115 * of the buffer in the log should not be replayed at recovery time. 2114 * of the buffer in the log should not be replayed at recovery time.
2116 * This is so that if the blocks covered by the buffer are reused for 2115 * This is so that if the blocks covered by the buffer are reused for
2117 * file data before we crash we don't end up replaying old, freed 2116 * file data before we crash we don't end up replaying old, freed
2118 * meta-data into a user's file. 2117 * meta-data into a user's file.
2119 * 2118 *
2120 * To handle the cancellation of buffer log items, we make two passes 2119 * To handle the cancellation of buffer log items, we make two passes
2121 * over the log during recovery. During the first we build a table of 2120 * over the log during recovery. During the first we build a table of
2122 * those buffers which have been cancelled, and during the second we 2121 * those buffers which have been cancelled, and during the second we
2123 * only replay those buffers which do not have corresponding cancel 2122 * only replay those buffers which do not have corresponding cancel
2124 * records in the table. See xlog_recover_do_buffer_pass[1,2] above 2123 * records in the table. See xlog_recover_do_buffer_pass[1,2] above
2125 * for more details on the implementation of the table of cancel records. 2124 * for more details on the implementation of the table of cancel records.
2126 */ 2125 */
2127 STATIC int 2126 STATIC int
2128 xlog_recover_do_buffer_trans( 2127 xlog_recover_do_buffer_trans(
2129 xlog_t *log, 2128 xlog_t *log,
2130 xlog_recover_item_t *item, 2129 xlog_recover_item_t *item,
2131 int pass) 2130 int pass)
2132 { 2131 {
2133 xfs_buf_log_format_t *buf_f; 2132 xfs_buf_log_format_t *buf_f;
2134 xfs_mount_t *mp; 2133 xfs_mount_t *mp;
2135 xfs_buf_t *bp; 2134 xfs_buf_t *bp;
2136 int error; 2135 int error;
2137 int cancel; 2136 int cancel;
2138 xfs_daddr_t blkno; 2137 xfs_daddr_t blkno;
2139 int len; 2138 int len;
2140 ushort flags; 2139 ushort flags;
2141 2140
2142 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; 2141 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2143 2142
2144 if (pass == XLOG_RECOVER_PASS1) { 2143 if (pass == XLOG_RECOVER_PASS1) {
2145 /* 2144 /*
2146 * In this pass we're only looking for buf items 2145 * In this pass we're only looking for buf items
2147 * with the XFS_BLI_CANCEL bit set. 2146 * with the XFS_BLI_CANCEL bit set.
2148 */ 2147 */
2149 xlog_recover_do_buffer_pass1(log, buf_f); 2148 xlog_recover_do_buffer_pass1(log, buf_f);
2150 return 0; 2149 return 0;
2151 } else { 2150 } else {
2152 /* 2151 /*
2153 * In this pass we want to recover all the buffers 2152 * In this pass we want to recover all the buffers
2154 * which have not been cancelled and are not 2153 * which have not been cancelled and are not
2155 * cancellation buffers themselves. The routine 2154 * cancellation buffers themselves. The routine
2156 * we call here will tell us whether or not to 2155 * we call here will tell us whether or not to
2157 * continue with the replay of this buffer. 2156 * continue with the replay of this buffer.
2158 */ 2157 */
2159 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2158 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2160 if (cancel) { 2159 if (cancel) {
2161 return 0; 2160 return 0;
2162 } 2161 }
2163 } 2162 }
2164 switch (buf_f->blf_type) { 2163 switch (buf_f->blf_type) {
2165 case XFS_LI_BUF: 2164 case XFS_LI_BUF:
2166 blkno = buf_f->blf_blkno; 2165 blkno = buf_f->blf_blkno;
2167 len = buf_f->blf_len; 2166 len = buf_f->blf_len;
2168 flags = buf_f->blf_flags; 2167 flags = buf_f->blf_flags;
2169 break; 2168 break;
2170 default: 2169 default:
2171 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 2170 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2172 "xfs_log_recover: unknown buffer type 0x%x, logdev %s", 2171 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2173 buf_f->blf_type, log->l_mp->m_logname ? 2172 buf_f->blf_type, log->l_mp->m_logname ?
2174 log->l_mp->m_logname : "internal"); 2173 log->l_mp->m_logname : "internal");
2175 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", 2174 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2176 XFS_ERRLEVEL_LOW, log->l_mp); 2175 XFS_ERRLEVEL_LOW, log->l_mp);
2177 return XFS_ERROR(EFSCORRUPTED); 2176 return XFS_ERROR(EFSCORRUPTED);
2178 } 2177 }
2179 2178
2180 mp = log->l_mp; 2179 mp = log->l_mp;
2181 if (flags & XFS_BLI_INODE_BUF) { 2180 if (flags & XFS_BLI_INODE_BUF) {
2182 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, 2181 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
2183 XFS_BUF_LOCK); 2182 XFS_BUF_LOCK);
2184 } else { 2183 } else {
2185 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); 2184 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
2186 } 2185 }
2187 if (XFS_BUF_ISERROR(bp)) { 2186 if (XFS_BUF_ISERROR(bp)) {
2188 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2187 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2189 bp, blkno); 2188 bp, blkno);
2190 error = XFS_BUF_GETERROR(bp); 2189 error = XFS_BUF_GETERROR(bp);
2191 xfs_buf_relse(bp); 2190 xfs_buf_relse(bp);
2192 return error; 2191 return error;
2193 } 2192 }
2194 2193
2195 error = 0; 2194 error = 0;
2196 if (flags & XFS_BLI_INODE_BUF) { 2195 if (flags & XFS_BLI_INODE_BUF) {
2197 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2196 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2198 } else if (flags & 2197 } else if (flags &
2199 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2198 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2200 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2199 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2201 } else { 2200 } else {
2202 xlog_recover_do_reg_buffer(item, bp, buf_f); 2201 xlog_recover_do_reg_buffer(item, bp, buf_f);
2203 } 2202 }
2204 if (error) 2203 if (error)
2205 return XFS_ERROR(error); 2204 return XFS_ERROR(error);
2206 2205
2207 /* 2206 /*
2208 * Perform delayed write on the buffer. Asynchronous writes will be 2207 * Perform delayed write on the buffer. Asynchronous writes will be
2209 * slower when taking into account all the buffers to be flushed. 2208 * slower when taking into account all the buffers to be flushed.
2210 * 2209 *
2211 * Also make sure that only inode buffers with good sizes stay in 2210 * Also make sure that only inode buffers with good sizes stay in
2212 * the buffer cache. The kernel moves inodes in buffers of 1 block 2211 * the buffer cache. The kernel moves inodes in buffers of 1 block
2213 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode 2212 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2214 * buffers in the log can be a different size if the log was generated 2213 * buffers in the log can be a different size if the log was generated
2215 * by an older kernel using unclustered inode buffers or a newer kernel 2214 * by an older kernel using unclustered inode buffers or a newer kernel
2216 * running with a different inode cluster size. Regardless, if the 2215 * running with a different inode cluster size. Regardless, if the
2217 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) 2216 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2218 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep 2217 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2219 * the buffer out of the buffer cache so that the buffer won't 2218 * the buffer out of the buffer cache so that the buffer won't
2220 * overlap with future reads of those inodes. 2219 * overlap with future reads of those inodes.
2221 */ 2220 */
2222 if (XFS_DINODE_MAGIC == 2221 if (XFS_DINODE_MAGIC ==
2223 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2222 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2224 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, 2223 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2225 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { 2224 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2226 XFS_BUF_STALE(bp); 2225 XFS_BUF_STALE(bp);
2227 error = xfs_bwrite(mp, bp); 2226 error = xfs_bwrite(mp, bp);
2228 } else { 2227 } else {
2229 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2228 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2230 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2229 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2231 XFS_BUF_SET_FSPRIVATE(bp, mp); 2230 XFS_BUF_SET_FSPRIVATE(bp, mp);
2232 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2231 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2233 xfs_bdwrite(mp, bp); 2232 xfs_bdwrite(mp, bp);
2234 } 2233 }
2235 2234
2236 return (error); 2235 return (error);
2237 } 2236 }
2238 2237
/*
 * Replay an inode log item: copy the logged in-core inode (and any logged
 * data/attr fork contents) back over the on-disk inode in its backing
 * buffer, after sanity-checking both the on-disk inode and the log copy.
 * Runs only in recovery pass 2.
 */
STATIC int
xlog_recover_do_inode_trans(
	xlog_t			*log,
	xlog_recover_item_t	*item,
	int			pass)
{
	xfs_inode_log_format_t	*in_f;
	xfs_mount_t		*mp;
	xfs_buf_t		*bp;
	xfs_dinode_t		*dip;	/* on-disk inode inside the buffer */
	xfs_ino_t		ino;
	int			len;
	xfs_caddr_t		src;
	xfs_caddr_t		dest;
	int			error;
	int			attr_index;
	uint			fields;	/* ilf_fields: which parts were logged */
	xfs_icdinode_t		*dicp;	/* logged in-core inode copy */
	int			need_free = 0;

	if (pass == XLOG_RECOVER_PASS1) {
		return 0;
	}

	/*
	 * The format item may be in the current layout, or an older layout
	 * that has to be converted into a temporary allocation first.
	 */
	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
		in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
	} else {
		in_f = (xfs_inode_log_format_t *)kmem_alloc(
			sizeof(xfs_inode_log_format_t), KM_SLEEP);
		need_free = 1;
		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
		if (error)
			goto error;
	}
	ino = in_f->ilf_ino;
	mp = log->l_mp;

	/*
	 * Inode buffers can be freed, look out for it,
	 * and do not replay the inode.
	 */
	if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
					in_f->ilf_len, 0)) {
		error = 0;
		goto error;
	}

	bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
				in_f->ilf_len, XFS_BUF_LOCK);
	if (XFS_BUF_ISERROR(bp)) {
		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
				  bp, in_f->ilf_blkno);
		error = XFS_BUF_GETERROR(bp);
		xfs_buf_relse(bp);
		goto error;
	}
	error = 0;
	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
	dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);

	/*
	 * Make sure the place we're flushing out to really looks
	 * like an inode!
	 */
	if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
		xfs_buf_relse(bp);
		xfs_fs_cmn_err(CE_ALERT, mp,
			"xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
			dip, bp, ino);
		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
				 XFS_ERRLEVEL_LOW, mp);
		error = EFSCORRUPTED;
		goto error;
	}
	/* The logged copy must also look like an inode core. */
	dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
		xfs_buf_relse(bp);
		xfs_fs_cmn_err(CE_ALERT, mp,
			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
			item, ino);
		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
				 XFS_ERRLEVEL_LOW, mp);
		error = EFSCORRUPTED;
		goto error;
	}

	/* Skip replay when the on disk inode is newer than the log one */
	if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
		/*
		 * Deal with the wrap case, DI_MAX_FLUSH is less
		 * than smaller numbers
		 */
		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
		    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
			/* do nothing */
		} else {
			/* on-disk copy is newer: skip replay, not an error */
			xfs_buf_relse(bp);
			error = 0;
			goto error;
		}
	}
	/* Take the opportunity to reset the flush iteration count */
	dicp->di_flushiter = 0;

	/*
	 * Sanity-check the logged inode against its file type: regular
	 * files may only use extents or btree format, directories may
	 * additionally be in local (shortform) format.
	 */
	if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
			XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
					 XFS_ERRLEVEL_LOW, mp, dicp);
			xfs_buf_relse(bp);
			xfs_fs_cmn_err(CE_ALERT, mp,
				"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
				item, dip, bp, ino);
			error = EFSCORRUPTED;
			goto error;
		}
	} else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
			XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
					     XFS_ERRLEVEL_LOW, mp, dicp);
			xfs_buf_relse(bp);
			xfs_fs_cmn_err(CE_ALERT, mp,
				"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
				item, dip, bp, ino);
			error = EFSCORRUPTED;
			goto error;
		}
	}
	/* Extent counts can never exceed the block count. */
	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
				     XFS_ERRLEVEL_LOW, mp, dicp);
		xfs_buf_relse(bp);
		xfs_fs_cmn_err(CE_ALERT, mp,
			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
			item, dip, bp, ino,
			dicp->di_nextents + dicp->di_anextents,
			dicp->di_nblocks);
		error = EFSCORRUPTED;
		goto error;
	}
	/* The attr fork offset must lie inside the inode. */
	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
				     XFS_ERRLEVEL_LOW, mp, dicp);
		xfs_buf_relse(bp);
		xfs_fs_cmn_err(CE_ALERT, mp,
			"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
			item, dip, bp, ino, dicp->di_forkoff);
		error = EFSCORRUPTED;
		goto error;
	}
	/* The logged core region can never be larger than an icdinode. */
	if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
				     XFS_ERRLEVEL_LOW, mp, dicp);
		xfs_buf_relse(bp);
		xfs_fs_cmn_err(CE_ALERT, mp,
			"xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
			item->ri_buf[1].i_len, item);
		error = EFSCORRUPTED;
		goto error;
	}

	/* The core is in in-core format */
	xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);

	/* the rest is in on-disk format */
	if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
		memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
			item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
			item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
	}

	/*
	 * DEV and UUID are mutually exclusive members of the same union
	 * in the format item; copy whichever one was logged.
	 */
	fields = in_f->ilf_fields;
	switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
	case XFS_ILOG_DEV:
		xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
		break;
	case XFS_ILOG_UUID:
		memcpy(XFS_DFORK_DPTR(dip),
		       &in_f->ilf_u.ilfu_uuid,
		       sizeof(uuid_t));
		break;
	}

	/*
	 * ilf_size counts the log regions used by this item: 2 means only
	 * format + core were logged, so there is no fork data to copy.
	 */
	if (in_f->ilf_size == 2)
		goto write_inode_buffer;
	len = item->ri_buf[2].i_len;
	src = item->ri_buf[2].i_addr;
	ASSERT(in_f->ilf_size <= 4);
	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
	ASSERT(!(fields & XFS_ILOG_DFORK) ||
	       (len == in_f->ilf_dsize));

	/* Copy the logged data fork, if any, back into the inode. */
	switch (fields & XFS_ILOG_DFORK) {
	case XFS_ILOG_DDATA:
	case XFS_ILOG_DEXT:
		memcpy(XFS_DFORK_DPTR(dip), src, len);
		break;

	case XFS_ILOG_DBROOT:
		/* btree roots are logged in long form; condense to disk form */
		xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
				 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
				 XFS_DFORK_DSIZE(dip, mp));
		break;

	default:
		/*
		 * There are no data fork flags set.
		 */
		ASSERT((fields & XFS_ILOG_DFORK) == 0);
		break;
	}

	/*
	 * If we logged any attribute data, recover it.  There may or
	 * may not have been any other non-core data logged in this
	 * transaction.
	 */
	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
		/* attr fork region follows the data fork region, if present */
		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
			attr_index = 3;
		} else {
			attr_index = 2;
		}
		len = item->ri_buf[attr_index].i_len;
		src = item->ri_buf[attr_index].i_addr;
		ASSERT(len == in_f->ilf_asize);

		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
		case XFS_ILOG_ADATA:
		case XFS_ILOG_AEXT:
			dest = XFS_DFORK_APTR(dip);
			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
			memcpy(dest, src, len);
			break;

		case XFS_ILOG_ABROOT:
			dest = XFS_DFORK_APTR(dip);
			xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
					 len, (xfs_bmdr_block_t*)dest,
					 XFS_DFORK_ASIZE(dip, mp));
			break;

		default:
			xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
			ASSERT(0);
			xfs_buf_relse(bp);
			error = EIO;
			goto error;
		}
	}

write_inode_buffer:
	if (ITEM_TYPE(item) == XFS_LI_INODE) {
		/* normal case: delay-write the recovered buffer */
		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
		XFS_BUF_SET_FSPRIVATE(bp, mp);
		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
		xfs_bdwrite(mp, bp);
	} else {
		/* NOTE(review): non-XFS_LI_INODE items are written out
		 * synchronously and kept out of the cache — presumably the
		 * 6.1-format item path; confirm against ITEM_TYPE users. */
		XFS_BUF_STALE(bp);
		error = xfs_bwrite(mp, bp);
	}

error:
	/* free the converted format item if we allocated one above */
	if (need_free)
		kmem_free(in_f);
	return XFS_ERROR(error);
}
2509 2508
2510 /* 2509 /*
2511 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t 2510 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2512 * structure, so that we know not to do any dquot item or dquot buffer recovery, 2511 * structure, so that we know not to do any dquot item or dquot buffer recovery,
2513 * of that type. 2512 * of that type.
2514 */ 2513 */
2515 STATIC int 2514 STATIC int
2516 xlog_recover_do_quotaoff_trans( 2515 xlog_recover_do_quotaoff_trans(
2517 xlog_t *log, 2516 xlog_t *log,
2518 xlog_recover_item_t *item, 2517 xlog_recover_item_t *item,
2519 int pass) 2518 int pass)
2520 { 2519 {
2521 xfs_qoff_logformat_t *qoff_f; 2520 xfs_qoff_logformat_t *qoff_f;
2522 2521
2523 if (pass == XLOG_RECOVER_PASS2) { 2522 if (pass == XLOG_RECOVER_PASS2) {
2524 return (0); 2523 return (0);
2525 } 2524 }
2526 2525
2527 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; 2526 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2528 ASSERT(qoff_f); 2527 ASSERT(qoff_f);
2529 2528
2530 /* 2529 /*
2531 * The logitem format's flag tells us if this was user quotaoff, 2530 * The logitem format's flag tells us if this was user quotaoff,
2532 * group/project quotaoff or both. 2531 * group/project quotaoff or both.
2533 */ 2532 */
2534 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) 2533 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2535 log->l_quotaoffs_flag |= XFS_DQ_USER; 2534 log->l_quotaoffs_flag |= XFS_DQ_USER;
2536 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) 2535 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2537 log->l_quotaoffs_flag |= XFS_DQ_PROJ; 2536 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2538 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 2537 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2539 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 2538 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2540 2539
2541 return (0); 2540 return (0);
2542 } 2541 }
2543 2542
2544 /* 2543 /*
2545 * Recover a dquot record 2544 * Recover a dquot record
2546 */ 2545 */
2547 STATIC int 2546 STATIC int
2548 xlog_recover_do_dquot_trans( 2547 xlog_recover_do_dquot_trans(
2549 xlog_t *log, 2548 xlog_t *log,
2550 xlog_recover_item_t *item, 2549 xlog_recover_item_t *item,
2551 int pass) 2550 int pass)
2552 { 2551 {
2553 xfs_mount_t *mp; 2552 xfs_mount_t *mp;
2554 xfs_buf_t *bp; 2553 xfs_buf_t *bp;
2555 struct xfs_disk_dquot *ddq, *recddq; 2554 struct xfs_disk_dquot *ddq, *recddq;
2556 int error; 2555 int error;
2557 xfs_dq_logformat_t *dq_f; 2556 xfs_dq_logformat_t *dq_f;
2558 uint type; 2557 uint type;
2559 2558
2560 if (pass == XLOG_RECOVER_PASS1) { 2559 if (pass == XLOG_RECOVER_PASS1) {
2561 return 0; 2560 return 0;
2562 } 2561 }
2563 mp = log->l_mp; 2562 mp = log->l_mp;
2564 2563
2565 /* 2564 /*
2566 * Filesystems are required to send in quota flags at mount time. 2565 * Filesystems are required to send in quota flags at mount time.
2567 */ 2566 */
2568 if (mp->m_qflags == 0) 2567 if (mp->m_qflags == 0)
2569 return (0); 2568 return (0);
2570 2569
2571 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; 2570 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2572 ASSERT(recddq); 2571 ASSERT(recddq);
2573 /* 2572 /*
2574 * This type of quotas was turned off, so ignore this record. 2573 * This type of quotas was turned off, so ignore this record.
2575 */ 2574 */
2576 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 2575 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2577 ASSERT(type); 2576 ASSERT(type);
2578 if (log->l_quotaoffs_flag & type) 2577 if (log->l_quotaoffs_flag & type)
2579 return (0); 2578 return (0);
2580 2579
2581 /* 2580 /*
2582 * At this point we know that quota was _not_ turned off. 2581 * At this point we know that quota was _not_ turned off.
2583 * Since the mount flags are not indicating to us otherwise, this 2582 * Since the mount flags are not indicating to us otherwise, this
2584 * must mean that quota is on, and the dquot needs to be replayed. 2583 * must mean that quota is on, and the dquot needs to be replayed.
2585 * Remember that we may not have fully recovered the superblock yet, 2584 * Remember that we may not have fully recovered the superblock yet,
2586 * so we can't do the usual trick of looking at the SB quota bits. 2585 * so we can't do the usual trick of looking at the SB quota bits.
2587 * 2586 *
2588 * The other possibility, of course, is that the quota subsystem was 2587 * The other possibility, of course, is that the quota subsystem was
2589 * removed since the last mount - ENOSYS. 2588 * removed since the last mount - ENOSYS.
2590 */ 2589 */
2591 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; 2590 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2592 ASSERT(dq_f); 2591 ASSERT(dq_f);
2593 if ((error = xfs_qm_dqcheck(recddq, 2592 if ((error = xfs_qm_dqcheck(recddq,
2594 dq_f->qlf_id, 2593 dq_f->qlf_id,
2595 0, XFS_QMOPT_DOWARN, 2594 0, XFS_QMOPT_DOWARN,
2596 "xlog_recover_do_dquot_trans (log copy)"))) { 2595 "xlog_recover_do_dquot_trans (log copy)"))) {
2597 return XFS_ERROR(EIO); 2596 return XFS_ERROR(EIO);
2598 } 2597 }
2599 ASSERT(dq_f->qlf_len == 1); 2598 ASSERT(dq_f->qlf_len == 1);
2600 2599
2601 error = xfs_read_buf(mp, mp->m_ddev_targp, 2600 error = xfs_read_buf(mp, mp->m_ddev_targp,
2602 dq_f->qlf_blkno, 2601 dq_f->qlf_blkno,
2603 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 2602 XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2604 0, &bp); 2603 0, &bp);
2605 if (error) { 2604 if (error) {
2606 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, 2605 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2607 bp, dq_f->qlf_blkno); 2606 bp, dq_f->qlf_blkno);
2608 return error; 2607 return error;
2609 } 2608 }
2610 ASSERT(bp); 2609 ASSERT(bp);
2611 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); 2610 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2612 2611
2613 /* 2612 /*
2614 * At least the magic num portion should be on disk because this 2613 * At least the magic num portion should be on disk because this
2615 * was among a chunk of dquots created earlier, and we did some 2614 * was among a chunk of dquots created earlier, and we did some
2616 * minimal initialization then. 2615 * minimal initialization then.
2617 */ 2616 */
2618 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2617 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2619 "xlog_recover_do_dquot_trans")) { 2618 "xlog_recover_do_dquot_trans")) {
2620 xfs_buf_relse(bp); 2619 xfs_buf_relse(bp);
2621 return XFS_ERROR(EIO); 2620 return XFS_ERROR(EIO);
2622 } 2621 }
2623 2622
2624 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2623 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2625 2624
2626 ASSERT(dq_f->qlf_size == 2); 2625 ASSERT(dq_f->qlf_size == 2);
2627 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2626 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2628 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2627 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2629 XFS_BUF_SET_FSPRIVATE(bp, mp); 2628 XFS_BUF_SET_FSPRIVATE(bp, mp);
2630 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2629 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2631 xfs_bdwrite(mp, bp); 2630 xfs_bdwrite(mp, bp);
2632 2631
2633 return (0); 2632 return (0);
2634 } 2633 }
2635 2634
2636 /* 2635 /*
2637 * This routine is called to create an in-core extent free intent 2636 * This routine is called to create an in-core extent free intent
2638 * item from the efi format structure which was logged on disk. 2637 * item from the efi format structure which was logged on disk.
2639 * It allocates an in-core efi, copies the extents from the format 2638 * It allocates an in-core efi, copies the extents from the format
2640 * structure into it, and adds the efi to the AIL with the given 2639 * structure into it, and adds the efi to the AIL with the given
2641 * LSN. 2640 * LSN.
2642 */ 2641 */
2643 STATIC int 2642 STATIC int
2644 xlog_recover_do_efi_trans( 2643 xlog_recover_do_efi_trans(
2645 xlog_t *log, 2644 xlog_t *log,
2646 xlog_recover_item_t *item, 2645 xlog_recover_item_t *item,
2647 xfs_lsn_t lsn, 2646 xfs_lsn_t lsn,
2648 int pass) 2647 int pass)
2649 { 2648 {
2650 int error; 2649 int error;
2651 xfs_mount_t *mp; 2650 xfs_mount_t *mp;
2652 xfs_efi_log_item_t *efip; 2651 xfs_efi_log_item_t *efip;
2653 xfs_efi_log_format_t *efi_formatp; 2652 xfs_efi_log_format_t *efi_formatp;
2654 2653
2655 if (pass == XLOG_RECOVER_PASS1) { 2654 if (pass == XLOG_RECOVER_PASS1) {
2656 return 0; 2655 return 0;
2657 } 2656 }
2658 2657
2659 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; 2658 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2660 2659
2661 mp = log->l_mp; 2660 mp = log->l_mp;
2662 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2661 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2663 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2662 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2664 &(efip->efi_format)))) { 2663 &(efip->efi_format)))) {
2665 xfs_efi_item_free(efip); 2664 xfs_efi_item_free(efip);
2666 return error; 2665 return error;
2667 } 2666 }
2668 efip->efi_next_extent = efi_formatp->efi_nextents; 2667 efip->efi_next_extent = efi_formatp->efi_nextents;
2669 efip->efi_flags |= XFS_EFI_COMMITTED; 2668 efip->efi_flags |= XFS_EFI_COMMITTED;
2670 2669
2671 spin_lock(&log->l_ailp->xa_lock); 2670 spin_lock(&log->l_ailp->xa_lock);
2672 /* 2671 /*
2673 * xfs_trans_ail_update() drops the AIL lock. 2672 * xfs_trans_ail_update() drops the AIL lock.
2674 */ 2673 */
2675 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2674 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2676 return 0; 2675 return 0;
2677 } 2676 }
2678 2677
2679 2678
2680 /* 2679 /*
2681 * This routine is called when an efd format structure is found in 2680 * This routine is called when an efd format structure is found in
2682 * a committed transaction in the log. It's purpose is to cancel 2681 * a committed transaction in the log. It's purpose is to cancel
2683 * the corresponding efi if it was still in the log. To do this 2682 * the corresponding efi if it was still in the log. To do this
2684 * it searches the AIL for the efi with an id equal to that in the 2683 * it searches the AIL for the efi with an id equal to that in the
2685 * efd format structure. If we find it, we remove the efi from the 2684 * efd format structure. If we find it, we remove the efi from the
2686 * AIL and free it. 2685 * AIL and free it.
2687 */ 2686 */
2688 STATIC void 2687 STATIC void
2689 xlog_recover_do_efd_trans( 2688 xlog_recover_do_efd_trans(
2690 xlog_t *log, 2689 xlog_t *log,
2691 xlog_recover_item_t *item, 2690 xlog_recover_item_t *item,
2692 int pass) 2691 int pass)
2693 { 2692 {
2694 xfs_efd_log_format_t *efd_formatp; 2693 xfs_efd_log_format_t *efd_formatp;
2695 xfs_efi_log_item_t *efip = NULL; 2694 xfs_efi_log_item_t *efip = NULL;
2696 xfs_log_item_t *lip; 2695 xfs_log_item_t *lip;
2697 __uint64_t efi_id; 2696 __uint64_t efi_id;
2698 struct xfs_ail_cursor cur; 2697 struct xfs_ail_cursor cur;
2699 struct xfs_ail *ailp = log->l_ailp; 2698 struct xfs_ail *ailp = log->l_ailp;
2700 2699
2701 if (pass == XLOG_RECOVER_PASS1) { 2700 if (pass == XLOG_RECOVER_PASS1) {
2702 return; 2701 return;
2703 } 2702 }
2704 2703
2705 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; 2704 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2706 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2705 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2707 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2706 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2708 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 2707 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2709 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); 2708 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2710 efi_id = efd_formatp->efd_efi_id; 2709 efi_id = efd_formatp->efd_efi_id;
2711 2710
2712 /* 2711 /*
2713 * Search for the efi with the id in the efd format structure 2712 * Search for the efi with the id in the efd format structure
2714 * in the AIL. 2713 * in the AIL.
2715 */ 2714 */
2716 spin_lock(&ailp->xa_lock); 2715 spin_lock(&ailp->xa_lock);
2717 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 2716 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2718 while (lip != NULL) { 2717 while (lip != NULL) {
2719 if (lip->li_type == XFS_LI_EFI) { 2718 if (lip->li_type == XFS_LI_EFI) {
2720 efip = (xfs_efi_log_item_t *)lip; 2719 efip = (xfs_efi_log_item_t *)lip;
2721 if (efip->efi_format.efi_id == efi_id) { 2720 if (efip->efi_format.efi_id == efi_id) {
2722 /* 2721 /*
2723 * xfs_trans_ail_delete() drops the 2722 * xfs_trans_ail_delete() drops the
2724 * AIL lock. 2723 * AIL lock.
2725 */ 2724 */
2726 xfs_trans_ail_delete(ailp, lip); 2725 xfs_trans_ail_delete(ailp, lip);
2727 xfs_efi_item_free(efip); 2726 xfs_efi_item_free(efip);
2728 spin_lock(&ailp->xa_lock); 2727 spin_lock(&ailp->xa_lock);
2729 break; 2728 break;
2730 } 2729 }
2731 } 2730 }
2732 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2731 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2733 } 2732 }
2734 xfs_trans_ail_cursor_done(ailp, &cur); 2733 xfs_trans_ail_cursor_done(ailp, &cur);
2735 spin_unlock(&ailp->xa_lock); 2734 spin_unlock(&ailp->xa_lock);
2736 } 2735 }
2737 2736
2738 /* 2737 /*
2739 * Perform the transaction 2738 * Perform the transaction
2740 * 2739 *
2741 * If the transaction modifies a buffer or inode, do it now. Otherwise, 2740 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2742 * EFIs and EFDs get queued up by adding entries into the AIL for them. 2741 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2743 */ 2742 */
2744 STATIC int 2743 STATIC int
2745 xlog_recover_do_trans( 2744 xlog_recover_do_trans(
2746 xlog_t *log, 2745 xlog_t *log,
2747 xlog_recover_t *trans, 2746 xlog_recover_t *trans,
2748 int pass) 2747 int pass)
2749 { 2748 {
2750 int error = 0; 2749 int error = 0;
2751 xlog_recover_item_t *item, *first_item; 2750 xlog_recover_item_t *item, *first_item;
2752 2751
2753 if ((error = xlog_recover_reorder_trans(trans))) 2752 if ((error = xlog_recover_reorder_trans(trans)))
2754 return error; 2753 return error;
2755 first_item = item = trans->r_itemq; 2754 first_item = item = trans->r_itemq;
2756 do { 2755 do {
2757 /* 2756 /*
2758 * we don't need to worry about the block number being 2757 * we don't need to worry about the block number being
2759 * truncated in > 1 TB buffers because in user-land, 2758 * truncated in > 1 TB buffers because in user-land,
2760 * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so 2759 * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so
2761 * the blknos will get through the user-mode buffer 2760 * the blknos will get through the user-mode buffer
2762 * cache properly. The only bad case is o32 kernels 2761 * cache properly. The only bad case is o32 kernels
2763 * where xfs_daddr_t is 32-bits but mount will warn us 2762 * where xfs_daddr_t is 32-bits but mount will warn us
2764 * off a > 1 TB filesystem before we get here. 2763 * off a > 1 TB filesystem before we get here.
2765 */ 2764 */
2766 if ((ITEM_TYPE(item) == XFS_LI_BUF)) { 2765 if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
2767 if ((error = xlog_recover_do_buffer_trans(log, item, 2766 if ((error = xlog_recover_do_buffer_trans(log, item,
2768 pass))) 2767 pass)))
2769 break; 2768 break;
2770 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) { 2769 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2771 if ((error = xlog_recover_do_inode_trans(log, item, 2770 if ((error = xlog_recover_do_inode_trans(log, item,
2772 pass))) 2771 pass)))
2773 break; 2772 break;
2774 } else if (ITEM_TYPE(item) == XFS_LI_EFI) { 2773 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2775 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn, 2774 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2776 pass))) 2775 pass)))
2777 break; 2776 break;
2778 } else if (ITEM_TYPE(item) == XFS_LI_EFD) { 2777 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2779 xlog_recover_do_efd_trans(log, item, pass); 2778 xlog_recover_do_efd_trans(log, item, pass);
2780 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { 2779 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
2781 if ((error = xlog_recover_do_dquot_trans(log, item, 2780 if ((error = xlog_recover_do_dquot_trans(log, item,
2782 pass))) 2781 pass)))
2783 break; 2782 break;
2784 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { 2783 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
2785 if ((error = xlog_recover_do_quotaoff_trans(log, item, 2784 if ((error = xlog_recover_do_quotaoff_trans(log, item,
2786 pass))) 2785 pass)))
2787 break; 2786 break;
2788 } else { 2787 } else {
2789 xlog_warn("XFS: xlog_recover_do_trans"); 2788 xlog_warn("XFS: xlog_recover_do_trans");
2790 ASSERT(0); 2789 ASSERT(0);
2791 error = XFS_ERROR(EIO); 2790 error = XFS_ERROR(EIO);
2792 break; 2791 break;
2793 } 2792 }
2794 item = item->ri_next; 2793 item = item->ri_next;
2795 } while (first_item != item); 2794 } while (first_item != item);
2796 2795
2797 return error; 2796 return error;
2798 } 2797 }
2799 2798
2800 /* 2799 /*
2801 * Free up any resources allocated by the transaction 2800 * Free up any resources allocated by the transaction
2802 * 2801 *
2803 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 2802 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2804 */ 2803 */
2805 STATIC void 2804 STATIC void
2806 xlog_recover_free_trans( 2805 xlog_recover_free_trans(
2807 xlog_recover_t *trans) 2806 xlog_recover_t *trans)
2808 { 2807 {
2809 xlog_recover_item_t *first_item, *item, *free_item; 2808 xlog_recover_item_t *first_item, *item, *free_item;
2810 int i; 2809 int i;
2811 2810
2812 item = first_item = trans->r_itemq; 2811 item = first_item = trans->r_itemq;
2813 do { 2812 do {
2814 free_item = item; 2813 free_item = item;
2815 item = item->ri_next; 2814 item = item->ri_next;
2816 /* Free the regions in the item. */ 2815 /* Free the regions in the item. */
2817 for (i = 0; i < free_item->ri_cnt; i++) { 2816 for (i = 0; i < free_item->ri_cnt; i++) {
2818 kmem_free(free_item->ri_buf[i].i_addr); 2817 kmem_free(free_item->ri_buf[i].i_addr);
2819 } 2818 }
2820 /* Free the item itself */ 2819 /* Free the item itself */
2821 kmem_free(free_item->ri_buf); 2820 kmem_free(free_item->ri_buf);
2822 kmem_free(free_item); 2821 kmem_free(free_item);
2823 } while (first_item != item); 2822 } while (first_item != item);
2824 /* Free the transaction recover structure */ 2823 /* Free the transaction recover structure */
2825 kmem_free(trans); 2824 kmem_free(trans);
2826 } 2825 }
2827 2826
2828 STATIC int 2827 STATIC int
2829 xlog_recover_commit_trans( 2828 xlog_recover_commit_trans(
2830 xlog_t *log, 2829 xlog_t *log,
2831 xlog_recover_t **q, 2830 xlog_recover_t **q,
2832 xlog_recover_t *trans, 2831 xlog_recover_t *trans,
2833 int pass) 2832 int pass)
2834 { 2833 {
2835 int error; 2834 int error;
2836 2835
2837 if ((error = xlog_recover_unlink_tid(q, trans))) 2836 if ((error = xlog_recover_unlink_tid(q, trans)))
2838 return error; 2837 return error;
2839 if ((error = xlog_recover_do_trans(log, trans, pass))) 2838 if ((error = xlog_recover_do_trans(log, trans, pass)))
2840 return error; 2839 return error;
2841 xlog_recover_free_trans(trans); /* no error */ 2840 xlog_recover_free_trans(trans); /* no error */
2842 return 0; 2841 return 0;
2843 } 2842 }
2844 2843
2845 STATIC int 2844 STATIC int
2846 xlog_recover_unmount_trans( 2845 xlog_recover_unmount_trans(
2847 xlog_recover_t *trans) 2846 xlog_recover_t *trans)
2848 { 2847 {
2849 /* Do nothing now */ 2848 /* Do nothing now */
2850 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2849 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2851 return 0; 2850 return 0;
2852 } 2851 }
2853 2852
2854 /* 2853 /*
2855 * There are two valid states of the r_state field. 0 indicates that the 2854 * There are two valid states of the r_state field. 0 indicates that the
2856 * transaction structure is in a normal state. We have either seen the 2855 * transaction structure is in a normal state. We have either seen the
2857 * start of the transaction or the last operation we added was not a partial 2856 * start of the transaction or the last operation we added was not a partial
2858 * operation. If the last operation we added to the transaction was a 2857 * operation. If the last operation we added to the transaction was a
2859 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. 2858 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2860 * 2859 *
2861 * NOTE: skip LRs with 0 data length. 2860 * NOTE: skip LRs with 0 data length.
2862 */ 2861 */
2863 STATIC int 2862 STATIC int
2864 xlog_recover_process_data( 2863 xlog_recover_process_data(
2865 xlog_t *log, 2864 xlog_t *log,
2866 xlog_recover_t *rhash[], 2865 xlog_recover_t *rhash[],
2867 xlog_rec_header_t *rhead, 2866 xlog_rec_header_t *rhead,
2868 xfs_caddr_t dp, 2867 xfs_caddr_t dp,
2869 int pass) 2868 int pass)
2870 { 2869 {
2871 xfs_caddr_t lp; 2870 xfs_caddr_t lp;
2872 int num_logops; 2871 int num_logops;
2873 xlog_op_header_t *ohead; 2872 xlog_op_header_t *ohead;
2874 xlog_recover_t *trans; 2873 xlog_recover_t *trans;
2875 xlog_tid_t tid; 2874 xlog_tid_t tid;
2876 int error; 2875 int error;
2877 unsigned long hash; 2876 unsigned long hash;
2878 uint flags; 2877 uint flags;
2879 2878
2880 lp = dp + be32_to_cpu(rhead->h_len); 2879 lp = dp + be32_to_cpu(rhead->h_len);
2881 num_logops = be32_to_cpu(rhead->h_num_logops); 2880 num_logops = be32_to_cpu(rhead->h_num_logops);
2882 2881
2883 /* check the log format matches our own - else we can't recover */ 2882 /* check the log format matches our own - else we can't recover */
2884 if (xlog_header_check_recover(log->l_mp, rhead)) 2883 if (xlog_header_check_recover(log->l_mp, rhead))
2885 return (XFS_ERROR(EIO)); 2884 return (XFS_ERROR(EIO));
2886 2885
2887 while ((dp < lp) && num_logops) { 2886 while ((dp < lp) && num_logops) {
2888 ASSERT(dp + sizeof(xlog_op_header_t) <= lp); 2887 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2889 ohead = (xlog_op_header_t *)dp; 2888 ohead = (xlog_op_header_t *)dp;
2890 dp += sizeof(xlog_op_header_t); 2889 dp += sizeof(xlog_op_header_t);
2891 if (ohead->oh_clientid != XFS_TRANSACTION && 2890 if (ohead->oh_clientid != XFS_TRANSACTION &&
2892 ohead->oh_clientid != XFS_LOG) { 2891 ohead->oh_clientid != XFS_LOG) {
2893 xlog_warn( 2892 xlog_warn(
2894 "XFS: xlog_recover_process_data: bad clientid"); 2893 "XFS: xlog_recover_process_data: bad clientid");
2895 ASSERT(0); 2894 ASSERT(0);
2896 return (XFS_ERROR(EIO)); 2895 return (XFS_ERROR(EIO));
2897 } 2896 }
2898 tid = be32_to_cpu(ohead->oh_tid); 2897 tid = be32_to_cpu(ohead->oh_tid);
2899 hash = XLOG_RHASH(tid); 2898 hash = XLOG_RHASH(tid);
2900 trans = xlog_recover_find_tid(rhash[hash], tid); 2899 trans = xlog_recover_find_tid(rhash[hash], tid);
2901 if (trans == NULL) { /* not found; add new tid */ 2900 if (trans == NULL) { /* not found; add new tid */
2902 if (ohead->oh_flags & XLOG_START_TRANS) 2901 if (ohead->oh_flags & XLOG_START_TRANS)
2903 xlog_recover_new_tid(&rhash[hash], tid, 2902 xlog_recover_new_tid(&rhash[hash], tid,
2904 be64_to_cpu(rhead->h_lsn)); 2903 be64_to_cpu(rhead->h_lsn));
2905 } else { 2904 } else {
2906 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2905 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2907 xlog_warn( 2906 xlog_warn(
2908 "XFS: xlog_recover_process_data: bad length"); 2907 "XFS: xlog_recover_process_data: bad length");
2909 WARN_ON(1); 2908 WARN_ON(1);
2910 return (XFS_ERROR(EIO)); 2909 return (XFS_ERROR(EIO));
2911 } 2910 }
2912 flags = ohead->oh_flags & ~XLOG_END_TRANS; 2911 flags = ohead->oh_flags & ~XLOG_END_TRANS;
2913 if (flags & XLOG_WAS_CONT_TRANS) 2912 if (flags & XLOG_WAS_CONT_TRANS)
2914 flags &= ~XLOG_CONTINUE_TRANS; 2913 flags &= ~XLOG_CONTINUE_TRANS;
2915 switch (flags) { 2914 switch (flags) {
2916 case XLOG_COMMIT_TRANS: 2915 case XLOG_COMMIT_TRANS:
2917 error = xlog_recover_commit_trans(log, 2916 error = xlog_recover_commit_trans(log,
2918 &rhash[hash], trans, pass); 2917 &rhash[hash], trans, pass);
2919 break; 2918 break;
2920 case XLOG_UNMOUNT_TRANS: 2919 case XLOG_UNMOUNT_TRANS:
2921 error = xlog_recover_unmount_trans(trans); 2920 error = xlog_recover_unmount_trans(trans);
2922 break; 2921 break;
2923 case XLOG_WAS_CONT_TRANS: 2922 case XLOG_WAS_CONT_TRANS:
2924 error = xlog_recover_add_to_cont_trans(trans, 2923 error = xlog_recover_add_to_cont_trans(trans,
2925 dp, be32_to_cpu(ohead->oh_len)); 2924 dp, be32_to_cpu(ohead->oh_len));
2926 break; 2925 break;
2927 case XLOG_START_TRANS: 2926 case XLOG_START_TRANS:
2928 xlog_warn( 2927 xlog_warn(
2929 "XFS: xlog_recover_process_data: bad transaction"); 2928 "XFS: xlog_recover_process_data: bad transaction");
2930 ASSERT(0); 2929 ASSERT(0);
2931 error = XFS_ERROR(EIO); 2930 error = XFS_ERROR(EIO);
2932 break; 2931 break;
2933 case 0: 2932 case 0:
2934 case XLOG_CONTINUE_TRANS: 2933 case XLOG_CONTINUE_TRANS:
2935 error = xlog_recover_add_to_trans(trans, 2934 error = xlog_recover_add_to_trans(trans,
2936 dp, be32_to_cpu(ohead->oh_len)); 2935 dp, be32_to_cpu(ohead->oh_len));
2937 break; 2936 break;
2938 default: 2937 default:
2939 xlog_warn( 2938 xlog_warn(
2940 "XFS: xlog_recover_process_data: bad flag"); 2939 "XFS: xlog_recover_process_data: bad flag");
2941 ASSERT(0); 2940 ASSERT(0);
2942 error = XFS_ERROR(EIO); 2941 error = XFS_ERROR(EIO);
2943 break; 2942 break;
2944 } 2943 }
2945 if (error) 2944 if (error)
2946 return error; 2945 return error;
2947 } 2946 }
2948 dp += be32_to_cpu(ohead->oh_len); 2947 dp += be32_to_cpu(ohead->oh_len);
2949 num_logops--; 2948 num_logops--;
2950 } 2949 }
2951 return 0; 2950 return 0;
2952 } 2951 }
2953 2952
2954 /* 2953 /*
2955 * Process an extent free intent item that was recovered from 2954 * Process an extent free intent item that was recovered from
2956 * the log. We need to free the extents that it describes. 2955 * the log. We need to free the extents that it describes.
2957 */ 2956 */
2958 STATIC int 2957 STATIC int
2959 xlog_recover_process_efi( 2958 xlog_recover_process_efi(
2960 xfs_mount_t *mp, 2959 xfs_mount_t *mp,
2961 xfs_efi_log_item_t *efip) 2960 xfs_efi_log_item_t *efip)
2962 { 2961 {
2963 xfs_efd_log_item_t *efdp; 2962 xfs_efd_log_item_t *efdp;
2964 xfs_trans_t *tp; 2963 xfs_trans_t *tp;
2965 int i; 2964 int i;
2966 int error = 0; 2965 int error = 0;
2967 xfs_extent_t *extp; 2966 xfs_extent_t *extp;
2968 xfs_fsblock_t startblock_fsb; 2967 xfs_fsblock_t startblock_fsb;
2969 2968
2970 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2969 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
2971 2970
2972 /* 2971 /*
2973 * First check the validity of the extents described by the 2972 * First check the validity of the extents described by the
2974 * EFI. If any are bad, then assume that all are bad and 2973 * EFI. If any are bad, then assume that all are bad and
2975 * just toss the EFI. 2974 * just toss the EFI.
2976 */ 2975 */
2977 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 2976 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2978 extp = &(efip->efi_format.efi_extents[i]); 2977 extp = &(efip->efi_format.efi_extents[i]);
2979 startblock_fsb = XFS_BB_TO_FSB(mp, 2978 startblock_fsb = XFS_BB_TO_FSB(mp,
2980 XFS_FSB_TO_DADDR(mp, extp->ext_start)); 2979 XFS_FSB_TO_DADDR(mp, extp->ext_start));
2981 if ((startblock_fsb == 0) || 2980 if ((startblock_fsb == 0) ||
2982 (extp->ext_len == 0) || 2981 (extp->ext_len == 0) ||
2983 (startblock_fsb >= mp->m_sb.sb_dblocks) || 2982 (startblock_fsb >= mp->m_sb.sb_dblocks) ||
2984 (extp->ext_len >= mp->m_sb.sb_agblocks)) { 2983 (extp->ext_len >= mp->m_sb.sb_agblocks)) {
2985 /* 2984 /*
2986 * This will pull the EFI from the AIL and 2985 * This will pull the EFI from the AIL and
2987 * free the memory associated with it. 2986 * free the memory associated with it.
2988 */ 2987 */
2989 xfs_efi_release(efip, efip->efi_format.efi_nextents); 2988 xfs_efi_release(efip, efip->efi_format.efi_nextents);
2990 return XFS_ERROR(EIO); 2989 return XFS_ERROR(EIO);
2991 } 2990 }
2992 } 2991 }
2993 2992
2994 tp = xfs_trans_alloc(mp, 0); 2993 tp = xfs_trans_alloc(mp, 0);
2995 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); 2994 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
2996 if (error) 2995 if (error)
2997 goto abort_error; 2996 goto abort_error;
2998 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 2997 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
2999 2998
3000 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 2999 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3001 extp = &(efip->efi_format.efi_extents[i]); 3000 extp = &(efip->efi_format.efi_extents[i]);
3002 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); 3001 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3003 if (error) 3002 if (error)
3004 goto abort_error; 3003 goto abort_error;
3005 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, 3004 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3006 extp->ext_len); 3005 extp->ext_len);
3007 } 3006 }
3008 3007
3009 efip->efi_flags |= XFS_EFI_RECOVERED; 3008 efip->efi_flags |= XFS_EFI_RECOVERED;
3010 error = xfs_trans_commit(tp, 0); 3009 error = xfs_trans_commit(tp, 0);
3011 return error; 3010 return error;
3012 3011
3013 abort_error: 3012 abort_error:
3014 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3013 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3015 return error; 3014 return error;
3016 } 3015 }
3017 3016
3018 /* 3017 /*
3019 * When this is called, all of the EFIs which did not have 3018 * When this is called, all of the EFIs which did not have
3020 * corresponding EFDs should be in the AIL. What we do now 3019 * corresponding EFDs should be in the AIL. What we do now
3021 * is free the extents associated with each one. 3020 * is free the extents associated with each one.
3022 * 3021 *
3023 * Since we process the EFIs in normal transactions, they 3022 * Since we process the EFIs in normal transactions, they
3024 * will be removed at some point after the commit. This prevents 3023 * will be removed at some point after the commit. This prevents
3025 * us from just walking down the list processing each one. 3024 * us from just walking down the list processing each one.
3026 * We'll use a flag in the EFI to skip those that we've already 3025 * We'll use a flag in the EFI to skip those that we've already
3027 * processed and use the AIL iteration mechanism's generation 3026 * processed and use the AIL iteration mechanism's generation
3028 * count to try to speed this up at least a bit. 3027 * count to try to speed this up at least a bit.
3029 * 3028 *
3030 * When we start, we know that the EFIs are the only things in 3029 * When we start, we know that the EFIs are the only things in
3031 * the AIL. As we process them, however, other items are added 3030 * the AIL. As we process them, however, other items are added
3032 * to the AIL. Since everything added to the AIL must come after 3031 * to the AIL. Since everything added to the AIL must come after
3033 * everything already in the AIL, we stop processing as soon as 3032 * everything already in the AIL, we stop processing as soon as
3034 * we see something other than an EFI in the AIL. 3033 * we see something other than an EFI in the AIL.
3035 */ 3034 */
STATIC int
xlog_recover_process_efis(
	xlog_t			*log)
{
	xfs_log_item_t		*lip;
	xfs_efi_log_item_t	*efip;
	int			error = 0;
	struct xfs_ail_cursor	cur;
	struct xfs_ail		*ailp;

	ailp = log->l_ailp;
	spin_lock(&ailp->xa_lock);
	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
	while (lip != NULL) {
		/*
		 * We're done when we see something other than an EFI.
		 * There should be no EFIs left in the AIL now.
		 */
		if (lip->li_type != XFS_LI_EFI) {
#ifdef DEBUG
			/* verify the "EFIs sort first" assumption above */
			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
				ASSERT(lip->li_type != XFS_LI_EFI);
#endif
			break;
		}

		/*
		 * Skip EFIs that we've already processed.
		 */
		efip = (xfs_efi_log_item_t *)lip;
		if (efip->efi_flags & XFS_EFI_RECOVERED) {
			lip = xfs_trans_ail_cursor_next(ailp, &cur);
			continue;
		}

		/*
		 * Drop the AIL lock across the call: processing an EFI
		 * runs a full transaction (see xfs_trans_commit() in
		 * xlog_recover_process_efi()), which cannot be done
		 * while holding a spinlock.  The cursor keeps our place.
		 */
		spin_unlock(&ailp->xa_lock);
		error = xlog_recover_process_efi(log->l_mp, efip);
		spin_lock(&ailp->xa_lock);
		if (error)
			goto out;
		lip = xfs_trans_ail_cursor_next(ailp, &cur);
	}
out:
	xfs_trans_ail_cursor_done(ailp, &cur);
	spin_unlock(&ailp->xa_lock);
	return error;
}
3083 3082
3084 /* 3083 /*
3085 * This routine performs a transaction to null out a bad inode pointer 3084 * This routine performs a transaction to null out a bad inode pointer
3086 * in an agi unlinked inode hash bucket. 3085 * in an agi unlinked inode hash bucket.
3087 */ 3086 */
STATIC void
xlog_recover_clear_agi_bucket(
	xfs_mount_t	*mp,		/* mount being recovered */
	xfs_agnumber_t	agno,		/* AG whose AGI we are repairing */
	int		bucket)		/* unlinked hash bucket to clear */
{
	xfs_trans_t	*tp;
	xfs_agi_t	*agi;
	xfs_buf_t	*agibp;
	int		offset;
	int		error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
	error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
				  0, 0, 0);
	if (error)
		goto out_abort;

	error = xfs_read_agi(mp, tp, agno, &agibp);
	if (error)
		goto out_abort;

	/* Terminate this bucket's unlinked chain right at the AGI. */
	agi = XFS_BUF_TO_AGI(agibp);
	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
	/* Log only the one agi_unlinked[] slot we modified. */
	offset = offsetof(xfs_agi_t, agi_unlinked) +
		 (sizeof(xfs_agino_t) * bucket);
	xfs_trans_log_buf(tp, agibp, offset,
			  (offset + sizeof(xfs_agino_t) - 1));

	error = xfs_trans_commit(tp, 0);
	if (error)
		goto out_error;
	return;

 out_abort:
	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
 out_error:
	/* Best effort only: warn and keep recovering the other buckets. */
	xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
			"failed to clear agi %d. Continuing.", agno);
	return;
}
3129 3128
/*
 * Process a single inode from an AGI unlinked-list bucket: read it in,
 * record the next inode in the on-disk chain, and drop our reference so
 * the normal inactivation path truncates and frees it.
 *
 * Returns the agino of the next entry in the bucket, or NULLAGINO after
 * ditching an unreadable bucket.
 */
STATIC xfs_agino_t
xlog_recover_process_one_iunlink(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agino_t		agino,
	int			bucket)
{
	struct xfs_buf		*ibp;
	struct xfs_dinode	*dip;
	struct xfs_inode	*ip;
	xfs_ino_t		ino;
	int			error;

	ino = XFS_AGINO_TO_INO(mp, agno, agino);
	error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
	if (error)
		goto fail;

	/*
	 * Get the on disk inode to find the next inode in the bucket.
	 *
	 * NOTE(review): if xfs_itobp() fails here, the reference taken by
	 * xfs_iget() above does not appear to be dropped on the fail path —
	 * verify against the inode lifecycle rules.
	 */
	ASSERT(ip != NULL);
	error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
	if (error)
		goto fail;

	ASSERT(dip != NULL);
	ASSERT(ip->i_d.di_nlink == 0);

	/* setup for the next pass */
	agino = be32_to_cpu(dip->di_next_unlinked);
	xfs_buf_relse(ibp);

	/*
	 * Prevent any DMAPI event from being sent when the reference on
	 * the inode is dropped.
	 */
	ip->i_d.di_dmevmask = 0;

	/*
	 * If this is a new inode, handle it specially.  Otherwise, just
	 * drop our reference to the inode.  If there are no other
	 * references, this will send the inode to xfs_inactive() which
	 * will truncate the file and free the inode.
	 */
	if (ip->i_d.di_mode == 0)
		xfs_iput_new(ip, 0);
	else
		IRELE(ip);
	return agino;

 fail:
	/*
	 * We can't read in the inode this bucket points to, or this inode
	 * is messed up.  Just ditch this bucket of inodes.  We will lose
	 * some inodes and space, but at least we won't hang.
	 *
	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
	 * clear the inode pointer in the bucket.
	 */
	xlog_recover_clear_agi_bucket(mp, agno, bucket);
	return NULLAGINO;
}
3193 3192
3194 /* 3193 /*
3195 * xlog_iunlink_recover 3194 * xlog_iunlink_recover
3196 * 3195 *
3197 * This is called during recovery to process any inodes which 3196 * This is called during recovery to process any inodes which
3198 * we unlinked but not freed when the system crashed. These 3197 * we unlinked but not freed when the system crashed. These
3199 * inodes will be on the lists in the AGI blocks. What we do 3198 * inodes will be on the lists in the AGI blocks. What we do
3200 * here is scan all the AGIs and fully truncate and free any 3199 * here is scan all the AGIs and fully truncate and free any
3201 * inodes found on the lists. Each inode is removed from the 3200 * inodes found on the lists. Each inode is removed from the
3202 * lists when it has been fully truncated and is freed. The 3201 * lists when it has been fully truncated and is freed. The
3203 * freeing of the inode and its removal from the list must be 3202 * freeing of the inode and its removal from the list must be
3204 * atomic. 3203 * atomic.
3205 */ 3204 */
void
xlog_recover_process_iunlinks(
	xlog_t		*log)
{
	xfs_mount_t	*mp;
	xfs_agnumber_t	agno;
	xfs_agi_t	*agi;
	xfs_buf_t	*agibp;
	xfs_agino_t	agino;
	int		bucket;
	int		error;
	uint		mp_dmevmask;

	mp = log->l_mp;

	/*
	 * Prevent any DMAPI event from being sent while in this function.
	 */
	mp_dmevmask = mp->m_dmevmask;
	mp->m_dmevmask = 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		/*
		 * Find the agi for this ag.
		 */
		error = xfs_read_agi(mp, NULL, agno, &agibp);
		if (error) {
			/*
			 * AGI is b0rked. Don't process it.
			 *
			 * We should probably mark the filesystem as corrupt
			 * after we've recovered all the ag's we can....
			 */
			continue;
		}
		agi = XFS_BUF_TO_AGI(agibp);

		/* Walk every unlinked hash bucket in this AGI. */
		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
			while (agino != NULLAGINO) {
				/*
				 * Release the agi buffer so that it can
				 * be acquired in the normal course of the
				 * transaction to truncate and free the inode.
				 */
				xfs_buf_relse(agibp);

				agino = xlog_recover_process_one_iunlink(mp,
							agno, agino, bucket);

				/*
				 * Reacquire the agibuffer and continue around
				 * the loop. This should never fail as we know
				 * the buffer was good earlier on.
				 *
				 * NOTE(review): the error is only checked by
				 * ASSERT, so a non-DEBUG build would carry on
				 * with a stale agi pointer if the re-read ever
				 * failed — confirm this invariant holds.
				 */
				error = xfs_read_agi(mp, NULL, agno, &agibp);
				ASSERT(error == 0);
				agi = XFS_BUF_TO_AGI(agibp);
			}
		}

		/*
		 * Release the buffer for the current agi so we can
		 * go on to the next one.
		 */
		xfs_buf_relse(agibp);
	}

	/* Restore DMAPI event delivery now that recovery is done. */
	mp->m_dmevmask = mp_dmevmask;
}
3276 3275
3277 3276
#ifdef DEBUG
/*
 * Fold the iclog data area into a simple XOR checksum and stash it in
 * the iclog header.  DEBUG builds only; production builds compile this
 * away to nothing via the macro below.
 */
STATIC void
xlog_pack_data_checksum(
	xlog_t		*log,
	xlog_in_core_t	*iclog,
	int		size)
{
	__be32		*word = (__be32 *)iclog->ic_datap;
	__be32		*end = word + (size >> 2);	/* size is bytes; walk words */
	uint		sum = 0;

	while (word < end) {
		sum ^= be32_to_cpu(*word);
		word++;
	}
	iclog->ic_header.h_chksum = cpu_to_be32(sum);
}
#else
#define xlog_pack_data_checksum(log, iclog, size)
#endif
3300 3299
3301 /* 3300 /*
3302 * Stamp cycle number in every block 3301 * Stamp cycle number in every block
3303 */ 3302 */
3304 void 3303 void
3305 xlog_pack_data( 3304 xlog_pack_data(
3306 xlog_t *log, 3305 xlog_t *log,
3307 xlog_in_core_t *iclog, 3306 xlog_in_core_t *iclog,
3308 int roundoff) 3307 int roundoff)
3309 { 3308 {
3310 int i, j, k; 3309 int i, j, k;
3311 int size = iclog->ic_offset + roundoff; 3310 int size = iclog->ic_offset + roundoff;
3312 __be32 cycle_lsn; 3311 __be32 cycle_lsn;
3313 xfs_caddr_t dp; 3312 xfs_caddr_t dp;
3314 3313
3315 xlog_pack_data_checksum(log, iclog, size); 3314 xlog_pack_data_checksum(log, iclog, size);
3316 3315
3317 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 3316 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3318 3317
3319 dp = iclog->ic_datap; 3318 dp = iclog->ic_datap;
3320 for (i = 0; i < BTOBB(size) && 3319 for (i = 0; i < BTOBB(size) &&
3321 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3320 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3322 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; 3321 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3323 *(__be32 *)dp = cycle_lsn; 3322 *(__be32 *)dp = cycle_lsn;
3324 dp += BBSIZE; 3323 dp += BBSIZE;
3325 } 3324 }
3326 3325
3327 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3326 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3328 xlog_in_core_2_t *xhdr = iclog->ic_data; 3327 xlog_in_core_2_t *xhdr = iclog->ic_data;
3329 3328
3330 for ( ; i < BTOBB(size); i++) { 3329 for ( ; i < BTOBB(size); i++) {
3331 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3330 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3332 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3331 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3333 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; 3332 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3334 *(__be32 *)dp = cycle_lsn; 3333 *(__be32 *)dp = cycle_lsn;
3335 dp += BBSIZE; 3334 dp += BBSIZE;
3336 } 3335 }
3337 3336
3338 for (i = 1; i < log->l_iclog_heads; i++) { 3337 for (i = 1; i < log->l_iclog_heads; i++) {
3339 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 3338 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3340 } 3339 }
3341 } 3340 }
3342 } 3341 }
3343 3342
#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
/*
 * Recompute the XOR checksum over a log record body and compare it with
 * the value stored in the record header, warning (once per log, unless
 * the on-disk checksum is nonzero) on mismatch.
 */
STATIC void
xlog_unpack_data_checksum(
	xlog_rec_header_t	*rhead,
	xfs_caddr_t		dp,
	xlog_t			*log)
{
	__be32		*word = (__be32 *)dp;
	uint		sum = 0;
	int		i;

	/* h_len is in bytes; fold it word by word. */
	for (i = 0; i < be32_to_cpu(rhead->h_len) >> 2; i++)
		sum ^= be32_to_cpu(word[i]);

	if (sum == be32_to_cpu(rhead->h_chksum))
		return;

	/* Complain only once unless the stored checksum was nonzero. */
	if (rhead->h_chksum ||
	    !(log->l_flags & XLOG_CHKSUM_MISMATCH)) {
		cmn_err(CE_DEBUG,
			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
			be32_to_cpu(rhead->h_chksum), sum);
		cmn_err(CE_DEBUG,
"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
		if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
			cmn_err(CE_DEBUG,
				"XFS: LogR this is a LogV2 filesystem\n");
		}
		log->l_flags |= XLOG_CHKSUM_MISMATCH;
	}
}
#else
#define xlog_unpack_data_checksum(rhead, dp, log)
#endif
3379 3378
3380 STATIC void 3379 STATIC void
3381 xlog_unpack_data( 3380 xlog_unpack_data(
3382 xlog_rec_header_t *rhead, 3381 xlog_rec_header_t *rhead,
3383 xfs_caddr_t dp, 3382 xfs_caddr_t dp,
3384 xlog_t *log) 3383 xlog_t *log)
3385 { 3384 {
3386 int i, j, k; 3385 int i, j, k;
3387 3386
3388 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3387 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3389 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3388 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3390 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; 3389 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3391 dp += BBSIZE; 3390 dp += BBSIZE;
3392 } 3391 }
3393 3392
3394 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3393 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3395 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; 3394 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3396 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 3395 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3397 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3396 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3398 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3397 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3399 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; 3398 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3400 dp += BBSIZE; 3399 dp += BBSIZE;
3401 } 3400 }
3402 } 3401 }
3403 3402
3404 xlog_unpack_data_checksum(rhead, dp, log); 3403 xlog_unpack_data_checksum(rhead, dp, log);
3405 } 3404 }
3406 3405
3407 STATIC int 3406 STATIC int
3408 xlog_valid_rec_header( 3407 xlog_valid_rec_header(
3409 xlog_t *log, 3408 xlog_t *log,
3410 xlog_rec_header_t *rhead, 3409 xlog_rec_header_t *rhead,
3411 xfs_daddr_t blkno) 3410 xfs_daddr_t blkno)
3412 { 3411 {
3413 int hlen; 3412 int hlen;
3414 3413
3415 if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { 3414 if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
3416 XFS_ERROR_REPORT("xlog_valid_rec_header(1)", 3415 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3417 XFS_ERRLEVEL_LOW, log->l_mp); 3416 XFS_ERRLEVEL_LOW, log->l_mp);
3418 return XFS_ERROR(EFSCORRUPTED); 3417 return XFS_ERROR(EFSCORRUPTED);
3419 } 3418 }
3420 if (unlikely( 3419 if (unlikely(
3421 (!rhead->h_version || 3420 (!rhead->h_version ||
3422 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3421 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3423 xlog_warn("XFS: %s: unrecognised log version (%d).", 3422 xlog_warn("XFS: %s: unrecognised log version (%d).",
3424 __func__, be32_to_cpu(rhead->h_version)); 3423 __func__, be32_to_cpu(rhead->h_version));
3425 return XFS_ERROR(EIO); 3424 return XFS_ERROR(EIO);
3426 } 3425 }
3427 3426
3428 /* LR body must have data or it wouldn't have been written */ 3427 /* LR body must have data or it wouldn't have been written */
3429 hlen = be32_to_cpu(rhead->h_len); 3428 hlen = be32_to_cpu(rhead->h_len);
3430 if (unlikely( hlen <= 0 || hlen > INT_MAX )) { 3429 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3431 XFS_ERROR_REPORT("xlog_valid_rec_header(2)", 3430 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3432 XFS_ERRLEVEL_LOW, log->l_mp); 3431 XFS_ERRLEVEL_LOW, log->l_mp);
3433 return XFS_ERROR(EFSCORRUPTED); 3432 return XFS_ERROR(EFSCORRUPTED);
3434 } 3433 }
3435 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { 3434 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3436 XFS_ERROR_REPORT("xlog_valid_rec_header(3)", 3435 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3437 XFS_ERRLEVEL_LOW, log->l_mp); 3436 XFS_ERRLEVEL_LOW, log->l_mp);
3438 return XFS_ERROR(EFSCORRUPTED); 3437 return XFS_ERROR(EFSCORRUPTED);
3439 } 3438 }
3440 return 0; 3439 return 0;
3441 } 3440 }
3442 3441
3443 /* 3442 /*
3444 * Read the log from tail to head and process the log records found. 3443 * Read the log from tail to head and process the log records found.
3445 * Handle the two cases where the tail and head are in the same cycle 3444 * Handle the two cases where the tail and head are in the same cycle
3446 * and where the active portion of the log wraps around the end of 3445 * and where the active portion of the log wraps around the end of
3447 * the physical log separately. The pass parameter is passed through 3446 * the physical log separately. The pass parameter is passed through
3448 * to the routines called to process the data and is not looked at 3447 * to the routines called to process the data and is not looked at
3449 * here. 3448 * here.
3450 */ 3449 */
3451 STATIC int 3450 STATIC int
3452 xlog_do_recovery_pass( 3451 xlog_do_recovery_pass(
3453 xlog_t *log, 3452 xlog_t *log,
3454 xfs_daddr_t head_blk, 3453 xfs_daddr_t head_blk,
3455 xfs_daddr_t tail_blk, 3454 xfs_daddr_t tail_blk,
3456 int pass) 3455 int pass)
3457 { 3456 {
3458 xlog_rec_header_t *rhead; 3457 xlog_rec_header_t *rhead;
3459 xfs_daddr_t blk_no; 3458 xfs_daddr_t blk_no;
3460 xfs_caddr_t bufaddr, offset; 3459 xfs_caddr_t bufaddr, offset;
3461 xfs_buf_t *hbp, *dbp; 3460 xfs_buf_t *hbp, *dbp;
3462 int error = 0, h_size; 3461 int error = 0, h_size;
3463 int bblks, split_bblks; 3462 int bblks, split_bblks;
3464 int hblks, split_hblks, wrapped_hblks; 3463 int hblks, split_hblks, wrapped_hblks;
3465 xlog_recover_t *rhash[XLOG_RHASH_SIZE]; 3464 xlog_recover_t *rhash[XLOG_RHASH_SIZE];
3466 3465
3467 ASSERT(head_blk != tail_blk); 3466 ASSERT(head_blk != tail_blk);
3468 3467
3469 /* 3468 /*
3470 * Read the header of the tail block and get the iclog buffer size from 3469 * Read the header of the tail block and get the iclog buffer size from
3471 * h_size. Use this to tell how many sectors make up the log header. 3470 * h_size. Use this to tell how many sectors make up the log header.
3472 */ 3471 */
3473 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3472 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3474 /* 3473 /*
3475 * When using variable length iclogs, read first sector of 3474 * When using variable length iclogs, read first sector of
3476 * iclog header and extract the header size from it. Get a 3475 * iclog header and extract the header size from it. Get a
3477 * new hbp that is the correct size. 3476 * new hbp that is the correct size.
3478 */ 3477 */
3479 hbp = xlog_get_bp(log, 1); 3478 hbp = xlog_get_bp(log, 1);
3480 if (!hbp) 3479 if (!hbp)
3481 return ENOMEM; 3480 return ENOMEM;
3482 if ((error = xlog_bread(log, tail_blk, 1, hbp))) 3481 if ((error = xlog_bread(log, tail_blk, 1, hbp)))
3483 goto bread_err1; 3482 goto bread_err1;
3484 offset = xlog_align(log, tail_blk, 1, hbp); 3483 offset = xlog_align(log, tail_blk, 1, hbp);
3485 rhead = (xlog_rec_header_t *)offset; 3484 rhead = (xlog_rec_header_t *)offset;
3486 error = xlog_valid_rec_header(log, rhead, tail_blk); 3485 error = xlog_valid_rec_header(log, rhead, tail_blk);
3487 if (error) 3486 if (error)
3488 goto bread_err1; 3487 goto bread_err1;
3489 h_size = be32_to_cpu(rhead->h_size); 3488 h_size = be32_to_cpu(rhead->h_size);
3490 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && 3489 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3491 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 3490 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3492 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 3491 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3493 if (h_size % XLOG_HEADER_CYCLE_SIZE) 3492 if (h_size % XLOG_HEADER_CYCLE_SIZE)
3494 hblks++; 3493 hblks++;
3495 xlog_put_bp(hbp); 3494 xlog_put_bp(hbp);
3496 hbp = xlog_get_bp(log, hblks); 3495 hbp = xlog_get_bp(log, hblks);
3497 } else { 3496 } else {
3498 hblks = 1; 3497 hblks = 1;
3499 } 3498 }
3500 } else { 3499 } else {
3501 ASSERT(log->l_sectbb_log == 0); 3500 ASSERT(log->l_sectbb_log == 0);
3502 hblks = 1; 3501 hblks = 1;
3503 hbp = xlog_get_bp(log, 1); 3502 hbp = xlog_get_bp(log, 1);
3504 h_size = XLOG_BIG_RECORD_BSIZE; 3503 h_size = XLOG_BIG_RECORD_BSIZE;
3505 } 3504 }
3506 3505
3507 if (!hbp) 3506 if (!hbp)
3508 return ENOMEM; 3507 return ENOMEM;
3509 dbp = xlog_get_bp(log, BTOBB(h_size)); 3508 dbp = xlog_get_bp(log, BTOBB(h_size));
3510 if (!dbp) { 3509 if (!dbp) {
3511 xlog_put_bp(hbp); 3510 xlog_put_bp(hbp);
3512 return ENOMEM; 3511 return ENOMEM;
3513 } 3512 }
3514 3513
3515 memset(rhash, 0, sizeof(rhash)); 3514 memset(rhash, 0, sizeof(rhash));
3516 if (tail_blk <= head_blk) { 3515 if (tail_blk <= head_blk) {
3517 for (blk_no = tail_blk; blk_no < head_blk; ) { 3516 for (blk_no = tail_blk; blk_no < head_blk; ) {
3518 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3517 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3519 goto bread_err2; 3518 goto bread_err2;
3520 offset = xlog_align(log, blk_no, hblks, hbp); 3519 offset = xlog_align(log, blk_no, hblks, hbp);
3521 rhead = (xlog_rec_header_t *)offset; 3520 rhead = (xlog_rec_header_t *)offset;
3522 error = xlog_valid_rec_header(log, rhead, blk_no); 3521 error = xlog_valid_rec_header(log, rhead, blk_no);
3523 if (error) 3522 if (error)
3524 goto bread_err2; 3523 goto bread_err2;
3525 3524
3526 /* blocks in data section */ 3525 /* blocks in data section */
3527 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3526 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3528 error = xlog_bread(log, blk_no + hblks, bblks, dbp); 3527 error = xlog_bread(log, blk_no + hblks, bblks, dbp);
3529 if (error) 3528 if (error)
3530 goto bread_err2; 3529 goto bread_err2;
3531 offset = xlog_align(log, blk_no + hblks, bblks, dbp); 3530 offset = xlog_align(log, blk_no + hblks, bblks, dbp);
3532 xlog_unpack_data(rhead, offset, log); 3531 xlog_unpack_data(rhead, offset, log);
3533 if ((error = xlog_recover_process_data(log, 3532 if ((error = xlog_recover_process_data(log,
3534 rhash, rhead, offset, pass))) 3533 rhash, rhead, offset, pass)))
3535 goto bread_err2; 3534 goto bread_err2;
3536 blk_no += bblks + hblks; 3535 blk_no += bblks + hblks;
3537 } 3536 }
3538 } else { 3537 } else {
3539 /* 3538 /*
3540 * Perform recovery around the end of the physical log. 3539 * Perform recovery around the end of the physical log.
3541 * When the head is not on the same cycle number as the tail, 3540 * When the head is not on the same cycle number as the tail,
3542 * we can't do a sequential recovery as above. 3541 * we can't do a sequential recovery as above.
3543 */ 3542 */
3544 blk_no = tail_blk; 3543 blk_no = tail_blk;
3545 while (blk_no < log->l_logBBsize) { 3544 while (blk_no < log->l_logBBsize) {
3546 /* 3545 /*
3547 * Check for header wrapping around physical end-of-log 3546 * Check for header wrapping around physical end-of-log
3548 */ 3547 */
3549 offset = NULL; 3548 offset = NULL;
3550 split_hblks = 0; 3549 split_hblks = 0;
3551 wrapped_hblks = 0; 3550 wrapped_hblks = 0;
3552 if (blk_no + hblks <= log->l_logBBsize) { 3551 if (blk_no + hblks <= log->l_logBBsize) {
3553 /* Read header in one read */ 3552 /* Read header in one read */
3554 error = xlog_bread(log, blk_no, hblks, hbp); 3553 error = xlog_bread(log, blk_no, hblks, hbp);
3555 if (error) 3554 if (error)
3556 goto bread_err2; 3555 goto bread_err2;
3557 offset = xlog_align(log, blk_no, hblks, hbp); 3556 offset = xlog_align(log, blk_no, hblks, hbp);
3558 } else { 3557 } else {
3559 /* This LR is split across physical log end */ 3558 /* This LR is split across physical log end */
3560 if (blk_no != log->l_logBBsize) { 3559 if (blk_no != log->l_logBBsize) {
3561 /* some data before physical log end */ 3560 /* some data before physical log end */
3562 ASSERT(blk_no <= INT_MAX); 3561 ASSERT(blk_no <= INT_MAX);
3563 split_hblks = log->l_logBBsize - (int)blk_no; 3562 split_hblks = log->l_logBBsize - (int)blk_no;
3564 ASSERT(split_hblks > 0); 3563 ASSERT(split_hblks > 0);
3565 if ((error = xlog_bread(log, blk_no, 3564 if ((error = xlog_bread(log, blk_no,
3566 split_hblks, hbp))) 3565 split_hblks, hbp)))
3567 goto bread_err2; 3566 goto bread_err2;
3568 offset = xlog_align(log, blk_no, 3567 offset = xlog_align(log, blk_no,
3569 split_hblks, hbp); 3568 split_hblks, hbp);
3570 } 3569 }
3571 /* 3570 /*
3572 * Note: this black magic still works with 3571 * Note: this black magic still works with
3573 * large sector sizes (non-512) only because: 3572 * large sector sizes (non-512) only because:
3574 * - we increased the buffer size originally 3573 * - we increased the buffer size originally
3575 * by 1 sector giving us enough extra space 3574 * by 1 sector giving us enough extra space
3576 * for the second read; 3575 * for the second read;
3577 * - the log start is guaranteed to be sector 3576 * - the log start is guaranteed to be sector
3578 * aligned; 3577 * aligned;
3579 * - we read the log end (LR header start) 3578 * - we read the log end (LR header start)
3580 * _first_, then the log start (LR header end) 3579 * _first_, then the log start (LR header end)
3581 * - order is important. 3580 * - order is important.
3582 */ 3581 */
3583 wrapped_hblks = hblks - split_hblks; 3582 wrapped_hblks = hblks - split_hblks;
3584 bufaddr = XFS_BUF_PTR(hbp); 3583 bufaddr = XFS_BUF_PTR(hbp);
3585 error = XFS_BUF_SET_PTR(hbp, 3584 error = XFS_BUF_SET_PTR(hbp,
3586 bufaddr + BBTOB(split_hblks), 3585 bufaddr + BBTOB(split_hblks),
3587 BBTOB(hblks - split_hblks)); 3586 BBTOB(hblks - split_hblks));
3588 if (!error) 3587 if (!error)
3589 error = xlog_bread(log, 0, 3588 error = xlog_bread(log, 0,
3590 wrapped_hblks, hbp); 3589 wrapped_hblks, hbp);
3591 if (!error) 3590 if (!error)
3592 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3591 error = XFS_BUF_SET_PTR(hbp, bufaddr,
3593 BBTOB(hblks)); 3592 BBTOB(hblks));
3594 if (error) 3593 if (error)
3595 goto bread_err2; 3594 goto bread_err2;
3596 if (!offset) 3595 if (!offset)
3597 offset = xlog_align(log, 0, 3596 offset = xlog_align(log, 0,
3598 wrapped_hblks, hbp); 3597 wrapped_hblks, hbp);
3599 } 3598 }
3600 rhead = (xlog_rec_header_t *)offset; 3599 rhead = (xlog_rec_header_t *)offset;
3601 error = xlog_valid_rec_header(log, rhead, 3600 error = xlog_valid_rec_header(log, rhead,
3602 split_hblks ? blk_no : 0); 3601 split_hblks ? blk_no : 0);
3603 if (error) 3602 if (error)
3604 goto bread_err2; 3603 goto bread_err2;
3605 3604
3606 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3605 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3607 blk_no += hblks; 3606 blk_no += hblks;
3608 3607
3609 /* Read in data for log record */ 3608 /* Read in data for log record */
3610 if (blk_no + bblks <= log->l_logBBsize) { 3609 if (blk_no + bblks <= log->l_logBBsize) {
3611 error = xlog_bread(log, blk_no, bblks, dbp); 3610 error = xlog_bread(log, blk_no, bblks, dbp);
3612 if (error) 3611 if (error)
3613 goto bread_err2; 3612 goto bread_err2;
3614 offset = xlog_align(log, blk_no, bblks, dbp); 3613 offset = xlog_align(log, blk_no, bblks, dbp);
3615 } else { 3614 } else {
3616 /* This log record is split across the 3615 /* This log record is split across the
3617 * physical end of log */ 3616 * physical end of log */
3618 offset = NULL; 3617 offset = NULL;
3619 split_bblks = 0; 3618 split_bblks = 0;
3620 if (blk_no != log->l_logBBsize) { 3619 if (blk_no != log->l_logBBsize) {
3621 /* some data is before the physical 3620 /* some data is before the physical
3622 * end of log */ 3621 * end of log */
3623 ASSERT(!wrapped_hblks); 3622 ASSERT(!wrapped_hblks);
3624 ASSERT(blk_no <= INT_MAX); 3623 ASSERT(blk_no <= INT_MAX);
3625 split_bblks = 3624 split_bblks =
3626 log->l_logBBsize - (int)blk_no; 3625 log->l_logBBsize - (int)blk_no;
3627 ASSERT(split_bblks > 0); 3626 ASSERT(split_bblks > 0);
3628 if ((error = xlog_bread(log, blk_no, 3627 if ((error = xlog_bread(log, blk_no,
3629 split_bblks, dbp))) 3628 split_bblks, dbp)))
3630 goto bread_err2; 3629 goto bread_err2;
3631 offset = xlog_align(log, blk_no, 3630 offset = xlog_align(log, blk_no,
3632 split_bblks, dbp); 3631 split_bblks, dbp);
3633 } 3632 }
3634 /* 3633 /*
3635 * Note: this black magic still works with 3634 * Note: this black magic still works with
3636 * large sector sizes (non-512) only because: 3635 * large sector sizes (non-512) only because:
3637 * - we increased the buffer size originally 3636 * - we increased the buffer size originally
3638 * by 1 sector giving us enough extra space 3637 * by 1 sector giving us enough extra space
3639 * for the second read; 3638 * for the second read;
3640 * - the log start is guaranteed to be sector 3639 * - the log start is guaranteed to be sector
3641 * aligned; 3640 * aligned;
3642 * - we read the log end (LR header start) 3641 * - we read the log end (LR header start)
3643 * _first_, then the log start (LR header end) 3642 * _first_, then the log start (LR header end)
3644 * - order is important. 3643 * - order is important.
3645 */ 3644 */
3646 bufaddr = XFS_BUF_PTR(dbp); 3645 bufaddr = XFS_BUF_PTR(dbp);
3647 error = XFS_BUF_SET_PTR(dbp, 3646 error = XFS_BUF_SET_PTR(dbp,
3648 bufaddr + BBTOB(split_bblks), 3647 bufaddr + BBTOB(split_bblks),
3649 BBTOB(bblks - split_bblks)); 3648 BBTOB(bblks - split_bblks));
3650 if (!error) 3649 if (!error)
3651 error = xlog_bread(log, wrapped_hblks, 3650 error = xlog_bread(log, wrapped_hblks,
3652 bblks - split_bblks, 3651 bblks - split_bblks,
3653 dbp); 3652 dbp);
3654 if (!error) 3653 if (!error)
3655 error = XFS_BUF_SET_PTR(dbp, bufaddr, 3654 error = XFS_BUF_SET_PTR(dbp, bufaddr,
3656 h_size); 3655 h_size);
3657 if (error) 3656 if (error)
3658 goto bread_err2; 3657 goto bread_err2;
3659 if (!offset) 3658 if (!offset)
3660 offset = xlog_align(log, wrapped_hblks, 3659 offset = xlog_align(log, wrapped_hblks,
3661 bblks - split_bblks, dbp); 3660 bblks - split_bblks, dbp);
3662 } 3661 }
3663 xlog_unpack_data(rhead, offset, log); 3662 xlog_unpack_data(rhead, offset, log);
3664 if ((error = xlog_recover_process_data(log, rhash, 3663 if ((error = xlog_recover_process_data(log, rhash,
3665 rhead, offset, pass))) 3664 rhead, offset, pass)))
3666 goto bread_err2; 3665 goto bread_err2;
3667 blk_no += bblks; 3666 blk_no += bblks;
3668 } 3667 }
3669 3668
3670 ASSERT(blk_no >= log->l_logBBsize); 3669 ASSERT(blk_no >= log->l_logBBsize);
3671 blk_no -= log->l_logBBsize; 3670 blk_no -= log->l_logBBsize;
3672 3671
3673 /* read first part of physical log */ 3672 /* read first part of physical log */
3674 while (blk_no < head_blk) { 3673 while (blk_no < head_blk) {
3675 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3674 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3676 goto bread_err2; 3675 goto bread_err2;
3677 offset = xlog_align(log, blk_no, hblks, hbp); 3676 offset = xlog_align(log, blk_no, hblks, hbp);
3678 rhead = (xlog_rec_header_t *)offset; 3677 rhead = (xlog_rec_header_t *)offset;
3679 error = xlog_valid_rec_header(log, rhead, blk_no); 3678 error = xlog_valid_rec_header(log, rhead, blk_no);
3680 if (error) 3679 if (error)
3681 goto bread_err2; 3680 goto bread_err2;
3682 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3681 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3683 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) 3682 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
3684 goto bread_err2; 3683 goto bread_err2;
3685 offset = xlog_align(log, blk_no+hblks, bblks, dbp); 3684 offset = xlog_align(log, blk_no+hblks, bblks, dbp);
3686 xlog_unpack_data(rhead, offset, log); 3685 xlog_unpack_data(rhead, offset, log);
3687 if ((error = xlog_recover_process_data(log, rhash, 3686 if ((error = xlog_recover_process_data(log, rhash,
3688 rhead, offset, pass))) 3687 rhead, offset, pass)))
3689 goto bread_err2; 3688 goto bread_err2;
3690 blk_no += bblks + hblks; 3689 blk_no += bblks + hblks;
3691 } 3690 }
3692 } 3691 }
3693 3692
3694 bread_err2: 3693 bread_err2:
3695 xlog_put_bp(dbp); 3694 xlog_put_bp(dbp);
3696 bread_err1: 3695 bread_err1:
3697 xlog_put_bp(hbp); 3696 xlog_put_bp(hbp);
3698 return error; 3697 return error;
3699 } 3698 }
3700 3699
3701 /* 3700 /*
3702 * Do the recovery of the log. We actually do this in two phases. 3701 * Do the recovery of the log. We actually do this in two phases.
3703 * The two passes are necessary in order to implement the function 3702 * The two passes are necessary in order to implement the function
3704 * of cancelling a record written into the log. The first pass 3703 * of cancelling a record written into the log. The first pass
3705 * determines those things which have been cancelled, and the 3704 * determines those things which have been cancelled, and the
3706 * second pass replays log items normally except for those which 3705 * second pass replays log items normally except for those which
3707 * have been cancelled. The handling of the replay and cancellations 3706 * have been cancelled. The handling of the replay and cancellations
3708 * takes place in the log item type specific routines. 3707 * takes place in the log item type specific routines.
3709 * 3708 *
3710 * The table of items which have cancel records in the log is allocated 3709 * The table of items which have cancel records in the log is allocated
3711 * and freed at this level, since only here do we know when all of 3710 * and freed at this level, since only here do we know when all of
3712 * the log recovery has been completed. 3711 * the log recovery has been completed.
3713 */ 3712 */
3714 STATIC int 3713 STATIC int
3715 xlog_do_log_recovery( 3714 xlog_do_log_recovery(
3716 xlog_t *log, 3715 xlog_t *log,
3717 xfs_daddr_t head_blk, 3716 xfs_daddr_t head_blk,
3718 xfs_daddr_t tail_blk) 3717 xfs_daddr_t tail_blk)
3719 { 3718 {
3720 int error; 3719 int error;
3721 3720
3722 ASSERT(head_blk != tail_blk); 3721 ASSERT(head_blk != tail_blk);
3723 3722
3724 /* 3723 /*
3725 * First do a pass to find all of the cancelled buf log items. 3724 * First do a pass to find all of the cancelled buf log items.
3726 * Store them in the buf_cancel_table for use in the second pass. 3725 * Store them in the buf_cancel_table for use in the second pass.
3727 */ 3726 */
3728 log->l_buf_cancel_table = 3727 log->l_buf_cancel_table =
3729 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3728 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3730 sizeof(xfs_buf_cancel_t*), 3729 sizeof(xfs_buf_cancel_t*),
3731 KM_SLEEP); 3730 KM_SLEEP);
3732 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3731 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3733 XLOG_RECOVER_PASS1); 3732 XLOG_RECOVER_PASS1);
3734 if (error != 0) { 3733 if (error != 0) {
3735 kmem_free(log->l_buf_cancel_table); 3734 kmem_free(log->l_buf_cancel_table);
3736 log->l_buf_cancel_table = NULL; 3735 log->l_buf_cancel_table = NULL;
3737 return error; 3736 return error;
3738 } 3737 }
3739 /* 3738 /*
3740 * Then do a second pass to actually recover the items in the log. 3739 * Then do a second pass to actually recover the items in the log.
3741 * When it is complete free the table of buf cancel items. 3740 * When it is complete free the table of buf cancel items.
3742 */ 3741 */
3743 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3742 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3744 XLOG_RECOVER_PASS2); 3743 XLOG_RECOVER_PASS2);
3745 #ifdef DEBUG 3744 #ifdef DEBUG
3746 if (!error) { 3745 if (!error) {
3747 int i; 3746 int i;
3748 3747
3749 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3748 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3750 ASSERT(log->l_buf_cancel_table[i] == NULL); 3749 ASSERT(log->l_buf_cancel_table[i] == NULL);
3751 } 3750 }
3752 #endif /* DEBUG */ 3751 #endif /* DEBUG */
3753 3752
3754 kmem_free(log->l_buf_cancel_table); 3753 kmem_free(log->l_buf_cancel_table);
3755 log->l_buf_cancel_table = NULL; 3754 log->l_buf_cancel_table = NULL;
3756 3755
3757 return error; 3756 return error;
3758 } 3757 }
3759 3758
3760 /* 3759 /*
3761 * Do the actual recovery 3760 * Do the actual recovery
3762 */ 3761 */
3763 STATIC int 3762 STATIC int
3764 xlog_do_recover( 3763 xlog_do_recover(
3765 xlog_t *log, 3764 xlog_t *log,
3766 xfs_daddr_t head_blk, 3765 xfs_daddr_t head_blk,
3767 xfs_daddr_t tail_blk) 3766 xfs_daddr_t tail_blk)
3768 { 3767 {
3769 int error; 3768 int error;
3770 xfs_buf_t *bp; 3769 xfs_buf_t *bp;
3771 xfs_sb_t *sbp; 3770 xfs_sb_t *sbp;
3772 3771
3773 /* 3772 /*
3774 * First replay the images in the log. 3773 * First replay the images in the log.
3775 */ 3774 */
3776 error = xlog_do_log_recovery(log, head_blk, tail_blk); 3775 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3777 if (error) { 3776 if (error) {
3778 return error; 3777 return error;
3779 } 3778 }
3780 3779
3781 XFS_bflush(log->l_mp->m_ddev_targp); 3780 XFS_bflush(log->l_mp->m_ddev_targp);
3782 3781
3783 /* 3782 /*
3784 * If IO errors happened during recovery, bail out. 3783 * If IO errors happened during recovery, bail out.
3785 */ 3784 */
3786 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 3785 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3787 return (EIO); 3786 return (EIO);
3788 } 3787 }
3789 3788
3790 /* 3789 /*
3791 * We now update the tail_lsn since much of the recovery has completed 3790 * We now update the tail_lsn since much of the recovery has completed
3792 * and there may be space available to use. If there were no extent 3791 * and there may be space available to use. If there were no extent
3793 * or iunlinks, we can free up the entire log and set the tail_lsn to 3792 * or iunlinks, we can free up the entire log and set the tail_lsn to
3794 * be the last_sync_lsn. This was set in xlog_find_tail to be the 3793 * be the last_sync_lsn. This was set in xlog_find_tail to be the
3795 * lsn of the last known good LR on disk. If there are extent frees 3794 * lsn of the last known good LR on disk. If there are extent frees
3796 * or iunlinks they will have some entries in the AIL; so we look at 3795 * or iunlinks they will have some entries in the AIL; so we look at
3797 * the AIL to determine how to set the tail_lsn. 3796 * the AIL to determine how to set the tail_lsn.
3798 */ 3797 */
3799 xlog_assign_tail_lsn(log->l_mp); 3798 xlog_assign_tail_lsn(log->l_mp);
3800 3799
3801 /* 3800 /*
3802 * Now that we've finished replaying all buffer and inode 3801 * Now that we've finished replaying all buffer and inode
3803 * updates, re-read in the superblock. 3802 * updates, re-read in the superblock.
3804 */ 3803 */
3805 bp = xfs_getsb(log->l_mp, 0); 3804 bp = xfs_getsb(log->l_mp, 0);
3806 XFS_BUF_UNDONE(bp); 3805 XFS_BUF_UNDONE(bp);
3807 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3806 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3808 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 3807 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3809 XFS_BUF_READ(bp); 3808 XFS_BUF_READ(bp);
3810 XFS_BUF_UNASYNC(bp); 3809 XFS_BUF_UNASYNC(bp);
3811 xfsbdstrat(log->l_mp, bp); 3810 xfsbdstrat(log->l_mp, bp);
3812 error = xfs_iowait(bp); 3811 error = xfs_iowait(bp);
3813 if (error) { 3812 if (error) {
3814 xfs_ioerror_alert("xlog_do_recover", 3813 xfs_ioerror_alert("xlog_do_recover",
3815 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3814 log->l_mp, bp, XFS_BUF_ADDR(bp));
3816 ASSERT(0); 3815 ASSERT(0);
3817 xfs_buf_relse(bp); 3816 xfs_buf_relse(bp);
3818 return error; 3817 return error;
3819 } 3818 }
3820 3819
3821 /* Convert superblock from on-disk format */ 3820 /* Convert superblock from on-disk format */
3822 sbp = &log->l_mp->m_sb; 3821 sbp = &log->l_mp->m_sb;
3823 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 3822 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3824 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3823 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3825 ASSERT(xfs_sb_good_version(sbp)); 3824 ASSERT(xfs_sb_good_version(sbp));
3826 xfs_buf_relse(bp); 3825 xfs_buf_relse(bp);
3827 3826
3828 /* We've re-read the superblock so re-initialize per-cpu counters */ 3827 /* We've re-read the superblock so re-initialize per-cpu counters */
3829 xfs_icsb_reinit_counters(log->l_mp); 3828 xfs_icsb_reinit_counters(log->l_mp);
3830 3829
3831 xlog_recover_check_summary(log); 3830 xlog_recover_check_summary(log);
3832 3831
3833 /* Normal transactions can now occur */ 3832 /* Normal transactions can now occur */
3834 log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 3833 log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3835 return 0; 3834 return 0;
3836 } 3835 }
3837 3836
3838 /* 3837 /*
3839 * Perform recovery and re-initialize some log variables in xlog_find_tail. 3838 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3840 * 3839 *
3841 * Return error or zero. 3840 * Return error or zero.
3842 */ 3841 */
3843 int 3842 int
3844 xlog_recover( 3843 xlog_recover(
3845 xlog_t *log) 3844 xlog_t *log)
3846 { 3845 {
3847 xfs_daddr_t head_blk, tail_blk; 3846 xfs_daddr_t head_blk, tail_blk;
3848 int error; 3847 int error;
3849 3848
3850 /* find the tail of the log */ 3849 /* find the tail of the log */
3851 if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) 3850 if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3852 return error; 3851 return error;
3853 3852
3854 if (tail_blk != head_blk) { 3853 if (tail_blk != head_blk) {
3855 /* There used to be a comment here: 3854 /* There used to be a comment here:
3856 * 3855 *
3857 * disallow recovery on read-only mounts. note -- mount 3856 * disallow recovery on read-only mounts. note -- mount
3858 * checks for ENOSPC and turns it into an intelligent 3857 * checks for ENOSPC and turns it into an intelligent
3859 * error message. 3858 * error message.
3860 * ...but this is no longer true. Now, unless you specify 3859 * ...but this is no longer true. Now, unless you specify
3861 * NORECOVERY (in which case this function would never be 3860 * NORECOVERY (in which case this function would never be
3862 * called), we just go ahead and recover. We do this all 3861 * called), we just go ahead and recover. We do this all
3863 * under the vfs layer, so we can get away with it unless 3862 * under the vfs layer, so we can get away with it unless
3864 * the device itself is read-only, in which case we fail. 3863 * the device itself is read-only, in which case we fail.
3865 */ 3864 */
3866 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { 3865 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3867 return error; 3866 return error;
3868 } 3867 }
3869 3868
3870 cmn_err(CE_NOTE, 3869 cmn_err(CE_NOTE,
3871 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3870 "Starting XFS recovery on filesystem: %s (logdev: %s)",
3872 log->l_mp->m_fsname, log->l_mp->m_logname ? 3871 log->l_mp->m_fsname, log->l_mp->m_logname ?
3873 log->l_mp->m_logname : "internal"); 3872 log->l_mp->m_logname : "internal");
3874 3873
3875 error = xlog_do_recover(log, head_blk, tail_blk); 3874 error = xlog_do_recover(log, head_blk, tail_blk);
3876 log->l_flags |= XLOG_RECOVERY_NEEDED; 3875 log->l_flags |= XLOG_RECOVERY_NEEDED;
3877 } 3876 }
3878 return error; 3877 return error;
3879 } 3878 }
3880 3879
3881 /* 3880 /*
3882 * In the first part of recovery we replay inodes and buffers and build 3881 * In the first part of recovery we replay inodes and buffers and build
3883 * up the list of extent free items which need to be processed. Here 3882 * up the list of extent free items which need to be processed. Here
3884 * we process the extent free items and clean up the on disk unlinked 3883 * we process the extent free items and clean up the on disk unlinked
3885 * inode lists. This is separated from the first part of recovery so 3884 * inode lists. This is separated from the first part of recovery so
3886 * that the root and real-time bitmap inodes can be read in from disk in 3885 * that the root and real-time bitmap inodes can be read in from disk in
3887 * between the two stages. This is necessary so that we can free space 3886 * between the two stages. This is necessary so that we can free space
3888 * in the real-time portion of the file system. 3887 * in the real-time portion of the file system.
3889 */ 3888 */
3890 int 3889 int
3891 xlog_recover_finish( 3890 xlog_recover_finish(
3892 xlog_t *log) 3891 xlog_t *log)
3893 { 3892 {
3894 /* 3893 /*
3895 * Now we're ready to do the transactions needed for the 3894 * Now we're ready to do the transactions needed for the
3896 * rest of recovery. Start with completing all the extent 3895 * rest of recovery. Start with completing all the extent
3897 * free intent records and then process the unlinked inode 3896 * free intent records and then process the unlinked inode
3898 * lists. At this point, we essentially run in normal mode 3897 * lists. At this point, we essentially run in normal mode
3899 * except that we're still performing recovery actions 3898 * except that we're still performing recovery actions
3900 * rather than accepting new requests. 3899 * rather than accepting new requests.
3901 */ 3900 */
3902 if (log->l_flags & XLOG_RECOVERY_NEEDED) { 3901 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3903 int error; 3902 int error;
3904 error = xlog_recover_process_efis(log); 3903 error = xlog_recover_process_efis(log);
3905 if (error) { 3904 if (error) {
3906 cmn_err(CE_ALERT, 3905 cmn_err(CE_ALERT,
3907 "Failed to recover EFIs on filesystem: %s", 3906 "Failed to recover EFIs on filesystem: %s",
3908 log->l_mp->m_fsname); 3907 log->l_mp->m_fsname);
3909 return error; 3908 return error;
3910 } 3909 }
3911 /* 3910 /*
3912 * Sync the log to get all the EFIs out of the AIL. 3911 * Sync the log to get all the EFIs out of the AIL.
3913 * This isn't absolutely necessary, but it helps in 3912 * This isn't absolutely necessary, but it helps in
3914 * case the unlink transactions would have problems 3913 * case the unlink transactions would have problems
3915 * pushing the EFIs out of the way. 3914 * pushing the EFIs out of the way.
3916 */ 3915 */
3917 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3916 xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3918 (XFS_LOG_FORCE | XFS_LOG_SYNC)); 3917 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3919 3918
3920 xlog_recover_process_iunlinks(log); 3919 xlog_recover_process_iunlinks(log);
3921 3920
3922 xlog_recover_check_summary(log); 3921 xlog_recover_check_summary(log);
3923 3922
3924 cmn_err(CE_NOTE, 3923 cmn_err(CE_NOTE,
3925 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3924 "Ending XFS recovery on filesystem: %s (logdev: %s)",
3926 log->l_mp->m_fsname, log->l_mp->m_logname ? 3925 log->l_mp->m_fsname, log->l_mp->m_logname ?
3927 log->l_mp->m_logname : "internal"); 3926 log->l_mp->m_logname : "internal");
3928 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3927 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3929 } else { 3928 } else {
3930 cmn_err(CE_DEBUG, 3929 cmn_err(CE_DEBUG,
3931 "!Ending clean XFS mount for filesystem: %s\n", 3930 "!Ending clean XFS mount for filesystem: %s\n",
3932 log->l_mp->m_fsname); 3931 log->l_mp->m_fsname);
3933 } 3932 }
3934 return 0; 3933 return 0;
3935 } 3934 }
3936 3935
3937 3936
3938 #if defined(DEBUG) 3937 #if defined(DEBUG)
3939 /* 3938 /*
3940 * Read all of the agf and agi counters and check that they 3939 * Read all of the agf and agi counters and check that they
3941 * are consistent with the superblock counters. 3940 * are consistent with the superblock counters.
3942 */ 3941 */
3943 void 3942 void
3944 xlog_recover_check_summary( 3943 xlog_recover_check_summary(
3945 xlog_t *log) 3944 xlog_t *log)
3946 { 3945 {
3947 xfs_mount_t *mp; 3946 xfs_mount_t *mp;
3948 xfs_agf_t *agfp; 3947 xfs_agf_t *agfp;
3949 xfs_buf_t *agfbp; 3948 xfs_buf_t *agfbp;
3950 xfs_buf_t *agibp; 3949 xfs_buf_t *agibp;
3951 xfs_buf_t *sbbp; 3950 xfs_buf_t *sbbp;
3952 #ifdef XFS_LOUD_RECOVERY 3951 #ifdef XFS_LOUD_RECOVERY
3953 xfs_sb_t *sbp; 3952 xfs_sb_t *sbp;
3954 #endif 3953 #endif
3955 xfs_agnumber_t agno; 3954 xfs_agnumber_t agno;
3956 __uint64_t freeblks; 3955 __uint64_t freeblks;
3957 __uint64_t itotal; 3956 __uint64_t itotal;
3958 __uint64_t ifree; 3957 __uint64_t ifree;
3959 int error; 3958 int error;
3960 3959
3961 mp = log->l_mp; 3960 mp = log->l_mp;
3962 3961
3963 freeblks = 0LL; 3962 freeblks = 0LL;
3964 itotal = 0LL; 3963 itotal = 0LL;
3965 ifree = 0LL; 3964 ifree = 0LL;
3966 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3965 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3967 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3966 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3968 if (error) { 3967 if (error) {
3969 xfs_fs_cmn_err(CE_ALERT, mp, 3968 xfs_fs_cmn_err(CE_ALERT, mp,
3970 "xlog_recover_check_summary(agf)" 3969 "xlog_recover_check_summary(agf)"
3971 "agf read failed agno %d error %d", 3970 "agf read failed agno %d error %d",
3972 agno, error); 3971 agno, error);
3973 } else { 3972 } else {
3974 agfp = XFS_BUF_TO_AGF(agfbp); 3973 agfp = XFS_BUF_TO_AGF(agfbp);
3975 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3974 freeblks += be32_to_cpu(agfp->agf_freeblks) +
3976 be32_to_cpu(agfp->agf_flcount); 3975 be32_to_cpu(agfp->agf_flcount);
3977 xfs_buf_relse(agfbp); 3976 xfs_buf_relse(agfbp);
3978 } 3977 }
3979 3978
3980 error = xfs_read_agi(mp, NULL, agno, &agibp); 3979 error = xfs_read_agi(mp, NULL, agno, &agibp);
3981 if (!error) { 3980 if (!error) {
3982 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3981 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3983 3982
3984 itotal += be32_to_cpu(agi->agi_count); 3983 itotal += be32_to_cpu(agi->agi_count);
3985 ifree += be32_to_cpu(agi->agi_freecount); 3984 ifree += be32_to_cpu(agi->agi_freecount);
3986 xfs_buf_relse(agibp); 3985 xfs_buf_relse(agibp);
3987 } 3986 }
3988 } 3987 }
3989 3988
3990 sbbp = xfs_getsb(mp, 0); 3989 sbbp = xfs_getsb(mp, 0);
3991 #ifdef XFS_LOUD_RECOVERY 3990 #ifdef XFS_LOUD_RECOVERY
3992 sbp = &mp->m_sb; 3991 sbp = &mp->m_sb;
3993 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); 3992 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3994 cmn_err(CE_NOTE, 3993 cmn_err(CE_NOTE,
3995 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", 3994 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3996 sbp->sb_icount, itotal); 3995 sbp->sb_icount, itotal);
3997 cmn_err(CE_NOTE, 3996 cmn_err(CE_NOTE,
3998 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", 3997 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3999 sbp->sb_ifree, ifree); 3998 sbp->sb_ifree, ifree);
4000 cmn_err(CE_NOTE, 3999 cmn_err(CE_NOTE,
4001 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", 4000 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4002 sbp->sb_fdblocks, freeblks); 4001 sbp->sb_fdblocks, freeblks);
4003 #if 0 4002 #if 0
4004 /* 4003 /*
4005 * This is turned off until I account for the allocation 4004 * This is turned off until I account for the allocation
4006 * btree blocks which live in free space. 4005 * btree blocks which live in free space.
4007 */ 4006 */
4008 ASSERT(sbp->sb_icount == itotal); 4007 ASSERT(sbp->sb_icount == itotal);
4009 ASSERT(sbp->sb_ifree == ifree); 4008 ASSERT(sbp->sb_ifree == ifree);
4010 ASSERT(sbp->sb_fdblocks == freeblks); 4009 ASSERT(sbp->sb_fdblocks == freeblks);
4011 #endif 4010 #endif
4012 #endif 4011 #endif
4013 xfs_buf_relse(sbbp); 4012 xfs_buf_relse(sbbp);
4014 } 4013 }
4015 #endif /* DEBUG */ 4014 #endif /* DEBUG */
4016 4015