Commit 891456227881da9c565c455010380a40d385a478

Authored by Marcin Slusarz
Committed by Dave Kleikamp
1 parent 96b5a46e2a

jfs: le*_add_cpu conversion

replace all:
little_endian_variable = cpu_to_leX(leX_to_cpu(little_endian_variable) +
                                        expression_in_cpu_byteorder);
with:
        leX_add_cpu(&little_endian_variable, expression_in_cpu_byteorder);
generated with semantic patch

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: jfs-discussion@lists.sourceforge.net

Showing 3 changed files with 19 additions and 33 deletions Inline Diff

1 /* 1 /*
2 * Copyright (C) International Business Machines Corp., 2000-2004 2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version. 7 * (at your option) any later version.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details. 12 * the GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18 18
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include "jfs_incore.h" 20 #include "jfs_incore.h"
21 #include "jfs_superblock.h" 21 #include "jfs_superblock.h"
22 #include "jfs_dmap.h" 22 #include "jfs_dmap.h"
23 #include "jfs_imap.h" 23 #include "jfs_imap.h"
24 #include "jfs_lock.h" 24 #include "jfs_lock.h"
25 #include "jfs_metapage.h" 25 #include "jfs_metapage.h"
26 #include "jfs_debug.h" 26 #include "jfs_debug.h"
27 27
28 /* 28 /*
29 * SERIALIZATION of the Block Allocation Map. 29 * SERIALIZATION of the Block Allocation Map.
30 * 30 *
31 * the working state of the block allocation map is accessed in 31 * the working state of the block allocation map is accessed in
32 * two directions: 32 * two directions:
33 * 33 *
34 * 1) allocation and free requests that start at the dmap 34 * 1) allocation and free requests that start at the dmap
35 * level and move up through the dmap control pages (i.e. 35 * level and move up through the dmap control pages (i.e.
36 * the vast majority of requests). 36 * the vast majority of requests).
37 * 37 *
38 * 2) allocation requests that start at dmap control page 38 * 2) allocation requests that start at dmap control page
39 * level and work down towards the dmaps. 39 * level and work down towards the dmaps.
40 * 40 *
41 * the serialization scheme used here is as follows. 41 * the serialization scheme used here is as follows.
42 * 42 *
43 * requests which start at the bottom are serialized against each 43 * requests which start at the bottom are serialized against each
44 * other through buffers and each requests holds onto its buffers 44 * other through buffers and each requests holds onto its buffers
45 * as it works it way up from a single dmap to the required level 45 * as it works it way up from a single dmap to the required level
46 * of dmap control page. 46 * of dmap control page.
47 * requests that start at the top are serialized against each other 47 * requests that start at the top are serialized against each other
48 * and request that start from the bottom by the multiple read/single 48 * and request that start from the bottom by the multiple read/single
49 * write inode lock of the bmap inode. requests starting at the top 49 * write inode lock of the bmap inode. requests starting at the top
50 * take this lock in write mode while request starting at the bottom 50 * take this lock in write mode while request starting at the bottom
51 * take the lock in read mode. a single top-down request may proceed 51 * take the lock in read mode. a single top-down request may proceed
52 * exclusively while multiple bottoms-up requests may proceed 52 * exclusively while multiple bottoms-up requests may proceed
53 * simultaneously (under the protection of busy buffers). 53 * simultaneously (under the protection of busy buffers).
54 * 54 *
55 * in addition to information found in dmaps and dmap control pages, 55 * in addition to information found in dmaps and dmap control pages,
56 * the working state of the block allocation map also includes read/ 56 * the working state of the block allocation map also includes read/
57 * write information maintained in the bmap descriptor (i.e. total 57 * write information maintained in the bmap descriptor (i.e. total
58 * free block count, allocation group level free block counts). 58 * free block count, allocation group level free block counts).
59 * a single exclusive lock (BMAP_LOCK) is used to guard this information 59 * a single exclusive lock (BMAP_LOCK) is used to guard this information
60 * in the face of multiple-bottoms up requests. 60 * in the face of multiple-bottoms up requests.
61 * (lock ordering: IREAD_LOCK, BMAP_LOCK); 61 * (lock ordering: IREAD_LOCK, BMAP_LOCK);
62 * 62 *
63 * accesses to the persistent state of the block allocation map (limited 63 * accesses to the persistent state of the block allocation map (limited
64 * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. 64 * to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
65 */ 65 */
66 66
67 #define BMAP_LOCK_INIT(bmp) mutex_init(&bmp->db_bmaplock) 67 #define BMAP_LOCK_INIT(bmp) mutex_init(&bmp->db_bmaplock)
68 #define BMAP_LOCK(bmp) mutex_lock(&bmp->db_bmaplock) 68 #define BMAP_LOCK(bmp) mutex_lock(&bmp->db_bmaplock)
69 #define BMAP_UNLOCK(bmp) mutex_unlock(&bmp->db_bmaplock) 69 #define BMAP_UNLOCK(bmp) mutex_unlock(&bmp->db_bmaplock)
70 70
71 /* 71 /*
72 * forward references 72 * forward references
73 */ 73 */
74 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, 74 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
75 int nblocks); 75 int nblocks);
76 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); 76 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
77 static int dbBackSplit(dmtree_t * tp, int leafno); 77 static int dbBackSplit(dmtree_t * tp, int leafno);
78 static int dbJoin(dmtree_t * tp, int leafno, int newval); 78 static int dbJoin(dmtree_t * tp, int leafno, int newval);
79 static void dbAdjTree(dmtree_t * tp, int leafno, int newval); 79 static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
80 static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, 80 static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
81 int level); 81 int level);
82 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); 82 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
83 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, 83 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
84 int nblocks); 84 int nblocks);
85 static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno, 85 static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
86 int nblocks, 86 int nblocks,
87 int l2nb, s64 * results); 87 int l2nb, s64 * results);
88 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, 88 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
89 int nblocks); 89 int nblocks);
90 static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks, 90 static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
91 int l2nb, 91 int l2nb,
92 s64 * results); 92 s64 * results);
93 static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, 93 static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
94 s64 * results); 94 s64 * results);
95 static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, 95 static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
96 s64 * results); 96 s64 * results);
97 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); 97 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
98 static int dbFindBits(u32 word, int l2nb); 98 static int dbFindBits(u32 word, int l2nb);
99 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); 99 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
100 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); 100 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
101 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, 101 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
102 int nblocks); 102 int nblocks);
103 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, 103 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
104 int nblocks); 104 int nblocks);
105 static int dbMaxBud(u8 * cp); 105 static int dbMaxBud(u8 * cp);
106 s64 dbMapFileSizeToMapSize(struct inode *ipbmap); 106 s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
107 static int blkstol2(s64 nb); 107 static int blkstol2(s64 nb);
108 108
109 static int cntlz(u32 value); 109 static int cntlz(u32 value);
110 static int cnttz(u32 word); 110 static int cnttz(u32 word);
111 111
112 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, 112 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
113 int nblocks); 113 int nblocks);
114 static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks); 114 static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
115 static int dbInitDmapTree(struct dmap * dp); 115 static int dbInitDmapTree(struct dmap * dp);
116 static int dbInitTree(struct dmaptree * dtp); 116 static int dbInitTree(struct dmaptree * dtp);
117 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i); 117 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
118 static int dbGetL2AGSize(s64 nblocks); 118 static int dbGetL2AGSize(s64 nblocks);
119 119
120 /* 120 /*
121 * buddy table 121 * buddy table
122 * 122 *
123 * table used for determining buddy sizes within characters of 123 * table used for determining buddy sizes within characters of
124 * dmap bitmap words. the characters themselves serve as indexes 124 * dmap bitmap words. the characters themselves serve as indexes
125 * into the table, with the table elements yielding the maximum 125 * into the table, with the table elements yielding the maximum
126 * binary buddy of free bits within the character. 126 * binary buddy of free bits within the character.
127 */ 127 */
/* maximum binary buddy size (log2) of the free bits within each possible
 * 8-bit bitmap character; index 0x00 (all free) -> 3, index 0xff (all
 * allocated) -> -1.
 */
static const s8 budtab[256] = {
	3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
};
146 146
147 147
148 /* 148 /*
149 * NAME: dbMount() 149 * NAME: dbMount()
150 * 150 *
151 * FUNCTION: initializate the block allocation map. 151 * FUNCTION: initializate the block allocation map.
152 * 152 *
153 * memory is allocated for the in-core bmap descriptor and 153 * memory is allocated for the in-core bmap descriptor and
154 * the in-core descriptor is initialized from disk. 154 * the in-core descriptor is initialized from disk.
155 * 155 *
156 * PARAMETERS: 156 * PARAMETERS:
157 * ipbmap - pointer to in-core inode for the block map. 157 * ipbmap - pointer to in-core inode for the block map.
158 * 158 *
159 * RETURN VALUES: 159 * RETURN VALUES:
160 * 0 - success 160 * 0 - success
161 * -ENOMEM - insufficient memory 161 * -ENOMEM - insufficient memory
162 * -EIO - i/o error 162 * -EIO - i/o error
163 */ 163 */
164 int dbMount(struct inode *ipbmap) 164 int dbMount(struct inode *ipbmap)
165 { 165 {
166 struct bmap *bmp; 166 struct bmap *bmp;
167 struct dbmap_disk *dbmp_le; 167 struct dbmap_disk *dbmp_le;
168 struct metapage *mp; 168 struct metapage *mp;
169 int i; 169 int i;
170 170
171 /* 171 /*
172 * allocate/initialize the in-memory bmap descriptor 172 * allocate/initialize the in-memory bmap descriptor
173 */ 173 */
174 /* allocate memory for the in-memory bmap descriptor */ 174 /* allocate memory for the in-memory bmap descriptor */
175 bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL); 175 bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
176 if (bmp == NULL) 176 if (bmp == NULL)
177 return -ENOMEM; 177 return -ENOMEM;
178 178
179 /* read the on-disk bmap descriptor. */ 179 /* read the on-disk bmap descriptor. */
180 mp = read_metapage(ipbmap, 180 mp = read_metapage(ipbmap,
181 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, 181 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
182 PSIZE, 0); 182 PSIZE, 0);
183 if (mp == NULL) { 183 if (mp == NULL) {
184 kfree(bmp); 184 kfree(bmp);
185 return -EIO; 185 return -EIO;
186 } 186 }
187 187
188 /* copy the on-disk bmap descriptor to its in-memory version. */ 188 /* copy the on-disk bmap descriptor to its in-memory version. */
189 dbmp_le = (struct dbmap_disk *) mp->data; 189 dbmp_le = (struct dbmap_disk *) mp->data;
190 bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); 190 bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
191 bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); 191 bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
192 bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); 192 bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
193 bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); 193 bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
194 bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); 194 bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
195 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); 195 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
196 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); 196 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
197 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); 197 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
198 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); 198 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
199 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); 199 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
200 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); 200 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
201 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); 201 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
202 for (i = 0; i < MAXAG; i++) 202 for (i = 0; i < MAXAG; i++)
203 bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); 203 bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
204 bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); 204 bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
205 bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; 205 bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
206 206
207 /* release the buffer. */ 207 /* release the buffer. */
208 release_metapage(mp); 208 release_metapage(mp);
209 209
210 /* bind the bmap inode and the bmap descriptor to each other. */ 210 /* bind the bmap inode and the bmap descriptor to each other. */
211 bmp->db_ipbmap = ipbmap; 211 bmp->db_ipbmap = ipbmap;
212 JFS_SBI(ipbmap->i_sb)->bmap = bmp; 212 JFS_SBI(ipbmap->i_sb)->bmap = bmp;
213 213
214 memset(bmp->db_active, 0, sizeof(bmp->db_active)); 214 memset(bmp->db_active, 0, sizeof(bmp->db_active));
215 215
216 /* 216 /*
217 * allocate/initialize the bmap lock 217 * allocate/initialize the bmap lock
218 */ 218 */
219 BMAP_LOCK_INIT(bmp); 219 BMAP_LOCK_INIT(bmp);
220 220
221 return (0); 221 return (0);
222 } 222 }
223 223
224 224
225 /* 225 /*
226 * NAME: dbUnmount() 226 * NAME: dbUnmount()
227 * 227 *
228 * FUNCTION: terminate the block allocation map in preparation for 228 * FUNCTION: terminate the block allocation map in preparation for
229 * file system unmount. 229 * file system unmount.
230 * 230 *
231 * the in-core bmap descriptor is written to disk and 231 * the in-core bmap descriptor is written to disk and
232 * the memory for this descriptor is freed. 232 * the memory for this descriptor is freed.
233 * 233 *
234 * PARAMETERS: 234 * PARAMETERS:
235 * ipbmap - pointer to in-core inode for the block map. 235 * ipbmap - pointer to in-core inode for the block map.
236 * 236 *
237 * RETURN VALUES: 237 * RETURN VALUES:
238 * 0 - success 238 * 0 - success
239 * -EIO - i/o error 239 * -EIO - i/o error
240 */ 240 */
241 int dbUnmount(struct inode *ipbmap, int mounterror) 241 int dbUnmount(struct inode *ipbmap, int mounterror)
242 { 242 {
243 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 243 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
244 244
245 if (!(mounterror || isReadOnly(ipbmap))) 245 if (!(mounterror || isReadOnly(ipbmap)))
246 dbSync(ipbmap); 246 dbSync(ipbmap);
247 247
248 /* 248 /*
249 * Invalidate the page cache buffers 249 * Invalidate the page cache buffers
250 */ 250 */
251 truncate_inode_pages(ipbmap->i_mapping, 0); 251 truncate_inode_pages(ipbmap->i_mapping, 0);
252 252
253 /* free the memory for the in-memory bmap. */ 253 /* free the memory for the in-memory bmap. */
254 kfree(bmp); 254 kfree(bmp);
255 255
256 return (0); 256 return (0);
257 } 257 }
258 258
259 /* 259 /*
260 * dbSync() 260 * dbSync()
261 */ 261 */
262 int dbSync(struct inode *ipbmap) 262 int dbSync(struct inode *ipbmap)
263 { 263 {
264 struct dbmap_disk *dbmp_le; 264 struct dbmap_disk *dbmp_le;
265 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 265 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
266 struct metapage *mp; 266 struct metapage *mp;
267 int i; 267 int i;
268 268
269 /* 269 /*
270 * write bmap global control page 270 * write bmap global control page
271 */ 271 */
272 /* get the buffer for the on-disk bmap descriptor. */ 272 /* get the buffer for the on-disk bmap descriptor. */
273 mp = read_metapage(ipbmap, 273 mp = read_metapage(ipbmap,
274 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, 274 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
275 PSIZE, 0); 275 PSIZE, 0);
276 if (mp == NULL) { 276 if (mp == NULL) {
277 jfs_err("dbSync: read_metapage failed!"); 277 jfs_err("dbSync: read_metapage failed!");
278 return -EIO; 278 return -EIO;
279 } 279 }
280 /* copy the in-memory version of the bmap to the on-disk version */ 280 /* copy the in-memory version of the bmap to the on-disk version */
281 dbmp_le = (struct dbmap_disk *) mp->data; 281 dbmp_le = (struct dbmap_disk *) mp->data;
282 dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); 282 dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
283 dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); 283 dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree);
284 dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); 284 dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
285 dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); 285 dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag);
286 dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); 286 dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
287 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); 287 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
288 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); 288 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
289 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); 289 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
290 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); 290 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
291 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); 291 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
292 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); 292 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
293 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); 293 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
294 for (i = 0; i < MAXAG; i++) 294 for (i = 0; i < MAXAG; i++)
295 dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); 295 dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]);
296 dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); 296 dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize);
297 dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; 297 dbmp_le->dn_maxfreebud = bmp->db_maxfreebud;
298 298
299 /* write the buffer */ 299 /* write the buffer */
300 write_metapage(mp); 300 write_metapage(mp);
301 301
302 /* 302 /*
303 * write out dirty pages of bmap 303 * write out dirty pages of bmap
304 */ 304 */
305 filemap_write_and_wait(ipbmap->i_mapping); 305 filemap_write_and_wait(ipbmap->i_mapping);
306 306
307 diWriteSpecial(ipbmap, 0); 307 diWriteSpecial(ipbmap, 0);
308 308
309 return (0); 309 return (0);
310 } 310 }
311 311
312 312
313 /* 313 /*
314 * NAME: dbFree() 314 * NAME: dbFree()
315 * 315 *
316 * FUNCTION: free the specified block range from the working block 316 * FUNCTION: free the specified block range from the working block
317 * allocation map. 317 * allocation map.
318 * 318 *
319 * the blocks will be free from the working map one dmap 319 * the blocks will be free from the working map one dmap
320 * at a time. 320 * at a time.
321 * 321 *
322 * PARAMETERS: 322 * PARAMETERS:
323 * ip - pointer to in-core inode; 323 * ip - pointer to in-core inode;
324 * blkno - starting block number to be freed. 324 * blkno - starting block number to be freed.
325 * nblocks - number of blocks to be freed. 325 * nblocks - number of blocks to be freed.
326 * 326 *
327 * RETURN VALUES: 327 * RETURN VALUES:
328 * 0 - success 328 * 0 - success
329 * -EIO - i/o error 329 * -EIO - i/o error
330 */ 330 */
331 int dbFree(struct inode *ip, s64 blkno, s64 nblocks) 331 int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
332 { 332 {
333 struct metapage *mp; 333 struct metapage *mp;
334 struct dmap *dp; 334 struct dmap *dp;
335 int nb, rc; 335 int nb, rc;
336 s64 lblkno, rem; 336 s64 lblkno, rem;
337 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 337 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
338 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; 338 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
339 339
340 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); 340 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
341 341
342 /* block to be freed better be within the mapsize. */ 342 /* block to be freed better be within the mapsize. */
343 if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { 343 if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
344 IREAD_UNLOCK(ipbmap); 344 IREAD_UNLOCK(ipbmap);
345 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 345 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
346 (unsigned long long) blkno, 346 (unsigned long long) blkno,
347 (unsigned long long) nblocks); 347 (unsigned long long) nblocks);
348 jfs_error(ip->i_sb, 348 jfs_error(ip->i_sb,
349 "dbFree: block to be freed is outside the map"); 349 "dbFree: block to be freed is outside the map");
350 return -EIO; 350 return -EIO;
351 } 351 }
352 352
353 /* 353 /*
354 * free the blocks a dmap at a time. 354 * free the blocks a dmap at a time.
355 */ 355 */
356 mp = NULL; 356 mp = NULL;
357 for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { 357 for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
358 /* release previous dmap if any */ 358 /* release previous dmap if any */
359 if (mp) { 359 if (mp) {
360 write_metapage(mp); 360 write_metapage(mp);
361 } 361 }
362 362
363 /* get the buffer for the current dmap. */ 363 /* get the buffer for the current dmap. */
364 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 364 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
365 mp = read_metapage(ipbmap, lblkno, PSIZE, 0); 365 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
366 if (mp == NULL) { 366 if (mp == NULL) {
367 IREAD_UNLOCK(ipbmap); 367 IREAD_UNLOCK(ipbmap);
368 return -EIO; 368 return -EIO;
369 } 369 }
370 dp = (struct dmap *) mp->data; 370 dp = (struct dmap *) mp->data;
371 371
372 /* determine the number of blocks to be freed from 372 /* determine the number of blocks to be freed from
373 * this dmap. 373 * this dmap.
374 */ 374 */
375 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); 375 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
376 376
377 /* free the blocks. */ 377 /* free the blocks. */
378 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { 378 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
379 jfs_error(ip->i_sb, "dbFree: error in block map\n"); 379 jfs_error(ip->i_sb, "dbFree: error in block map\n");
380 release_metapage(mp); 380 release_metapage(mp);
381 IREAD_UNLOCK(ipbmap); 381 IREAD_UNLOCK(ipbmap);
382 return (rc); 382 return (rc);
383 } 383 }
384 } 384 }
385 385
386 /* write the last buffer. */ 386 /* write the last buffer. */
387 write_metapage(mp); 387 write_metapage(mp);
388 388
389 IREAD_UNLOCK(ipbmap); 389 IREAD_UNLOCK(ipbmap);
390 390
391 return (0); 391 return (0);
392 } 392 }
393 393
394 394
395 /* 395 /*
396 * NAME: dbUpdatePMap() 396 * NAME: dbUpdatePMap()
397 * 397 *
398 * FUNCTION: update the allocation state (free or allocate) of the 398 * FUNCTION: update the allocation state (free or allocate) of the
399 * specified block range in the persistent block allocation map. 399 * specified block range in the persistent block allocation map.
400 * 400 *
401 * the blocks will be updated in the persistent map one 401 * the blocks will be updated in the persistent map one
402 * dmap at a time. 402 * dmap at a time.
403 * 403 *
404 * PARAMETERS: 404 * PARAMETERS:
405 * ipbmap - pointer to in-core inode for the block map. 405 * ipbmap - pointer to in-core inode for the block map.
406 * free - 'true' if block range is to be freed from the persistent 406 * free - 'true' if block range is to be freed from the persistent
407 * map; 'false' if it is to be allocated. 407 * map; 'false' if it is to be allocated.
408 * blkno - starting block number of the range. 408 * blkno - starting block number of the range.
409 * nblocks - number of contiguous blocks in the range. 409 * nblocks - number of contiguous blocks in the range.
410 * tblk - transaction block; 410 * tblk - transaction block;
411 * 411 *
412 * RETURN VALUES: 412 * RETURN VALUES:
413 * 0 - success 413 * 0 - success
414 * -EIO - i/o error 414 * -EIO - i/o error
415 */ 415 */
416 int 416 int
417 dbUpdatePMap(struct inode *ipbmap, 417 dbUpdatePMap(struct inode *ipbmap,
418 int free, s64 blkno, s64 nblocks, struct tblock * tblk) 418 int free, s64 blkno, s64 nblocks, struct tblock * tblk)
419 { 419 {
420 int nblks, dbitno, wbitno, rbits; 420 int nblks, dbitno, wbitno, rbits;
421 int word, nbits, nwords; 421 int word, nbits, nwords;
422 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 422 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
423 s64 lblkno, rem, lastlblkno; 423 s64 lblkno, rem, lastlblkno;
424 u32 mask; 424 u32 mask;
425 struct dmap *dp; 425 struct dmap *dp;
426 struct metapage *mp; 426 struct metapage *mp;
427 struct jfs_log *log; 427 struct jfs_log *log;
428 int lsn, difft, diffp; 428 int lsn, difft, diffp;
429 unsigned long flags; 429 unsigned long flags;
430 430
431 /* the blocks better be within the mapsize. */ 431 /* the blocks better be within the mapsize. */
432 if (blkno + nblocks > bmp->db_mapsize) { 432 if (blkno + nblocks > bmp->db_mapsize) {
433 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 433 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
434 (unsigned long long) blkno, 434 (unsigned long long) blkno,
435 (unsigned long long) nblocks); 435 (unsigned long long) nblocks);
436 jfs_error(ipbmap->i_sb, 436 jfs_error(ipbmap->i_sb,
437 "dbUpdatePMap: blocks are outside the map"); 437 "dbUpdatePMap: blocks are outside the map");
438 return -EIO; 438 return -EIO;
439 } 439 }
440 440
441 /* compute delta of transaction lsn from log syncpt */ 441 /* compute delta of transaction lsn from log syncpt */
442 lsn = tblk->lsn; 442 lsn = tblk->lsn;
443 log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; 443 log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
444 logdiff(difft, lsn, log); 444 logdiff(difft, lsn, log);
445 445
446 /* 446 /*
447 * update the block state a dmap at a time. 447 * update the block state a dmap at a time.
448 */ 448 */
449 mp = NULL; 449 mp = NULL;
450 lastlblkno = 0; 450 lastlblkno = 0;
451 for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { 451 for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
452 /* get the buffer for the current dmap. */ 452 /* get the buffer for the current dmap. */
453 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 453 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
454 if (lblkno != lastlblkno) { 454 if (lblkno != lastlblkno) {
455 if (mp) { 455 if (mp) {
456 write_metapage(mp); 456 write_metapage(mp);
457 } 457 }
458 458
459 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 459 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
460 0); 460 0);
461 if (mp == NULL) 461 if (mp == NULL)
462 return -EIO; 462 return -EIO;
463 metapage_wait_for_io(mp); 463 metapage_wait_for_io(mp);
464 } 464 }
465 dp = (struct dmap *) mp->data; 465 dp = (struct dmap *) mp->data;
466 466
467 /* determine the bit number and word within the dmap of 467 /* determine the bit number and word within the dmap of
468 * the starting block. also determine how many blocks 468 * the starting block. also determine how many blocks
469 * are to be updated within this dmap. 469 * are to be updated within this dmap.
470 */ 470 */
471 dbitno = blkno & (BPERDMAP - 1); 471 dbitno = blkno & (BPERDMAP - 1);
472 word = dbitno >> L2DBWORD; 472 word = dbitno >> L2DBWORD;
473 nblks = min(rem, (s64)BPERDMAP - dbitno); 473 nblks = min(rem, (s64)BPERDMAP - dbitno);
474 474
475 /* update the bits of the dmap words. the first and last 475 /* update the bits of the dmap words. the first and last
476 * words may only have a subset of their bits updated. if 476 * words may only have a subset of their bits updated. if
477 * this is the case, we'll work against that word (i.e. 477 * this is the case, we'll work against that word (i.e.
478 * partial first and/or last) only in a single pass. a 478 * partial first and/or last) only in a single pass. a
479 * single pass will also be used to update all words that 479 * single pass will also be used to update all words that
480 * are to have all their bits updated. 480 * are to have all their bits updated.
481 */ 481 */
482 for (rbits = nblks; rbits > 0; 482 for (rbits = nblks; rbits > 0;
483 rbits -= nbits, dbitno += nbits) { 483 rbits -= nbits, dbitno += nbits) {
484 /* determine the bit number within the word and 484 /* determine the bit number within the word and
485 * the number of bits within the word. 485 * the number of bits within the word.
486 */ 486 */
487 wbitno = dbitno & (DBWORD - 1); 487 wbitno = dbitno & (DBWORD - 1);
488 nbits = min(rbits, DBWORD - wbitno); 488 nbits = min(rbits, DBWORD - wbitno);
489 489
490 /* check if only part of the word is to be updated. */ 490 /* check if only part of the word is to be updated. */
491 if (nbits < DBWORD) { 491 if (nbits < DBWORD) {
492 /* update (free or allocate) the bits 492 /* update (free or allocate) the bits
493 * in this word. 493 * in this word.
494 */ 494 */
495 mask = 495 mask =
496 (ONES << (DBWORD - nbits) >> wbitno); 496 (ONES << (DBWORD - nbits) >> wbitno);
497 if (free) 497 if (free)
498 dp->pmap[word] &= 498 dp->pmap[word] &=
499 cpu_to_le32(~mask); 499 cpu_to_le32(~mask);
500 else 500 else
501 dp->pmap[word] |= 501 dp->pmap[word] |=
502 cpu_to_le32(mask); 502 cpu_to_le32(mask);
503 503
504 word += 1; 504 word += 1;
505 } else { 505 } else {
506 /* one or more words are to have all 506 /* one or more words are to have all
507 * their bits updated. determine how 507 * their bits updated. determine how
508 * many words and how many bits. 508 * many words and how many bits.
509 */ 509 */
510 nwords = rbits >> L2DBWORD; 510 nwords = rbits >> L2DBWORD;
511 nbits = nwords << L2DBWORD; 511 nbits = nwords << L2DBWORD;
512 512
513 /* update (free or allocate) the bits 513 /* update (free or allocate) the bits
514 * in these words. 514 * in these words.
515 */ 515 */
516 if (free) 516 if (free)
517 memset(&dp->pmap[word], 0, 517 memset(&dp->pmap[word], 0,
518 nwords * 4); 518 nwords * 4);
519 else 519 else
520 memset(&dp->pmap[word], (int) ONES, 520 memset(&dp->pmap[word], (int) ONES,
521 nwords * 4); 521 nwords * 4);
522 522
523 word += nwords; 523 word += nwords;
524 } 524 }
525 } 525 }
526 526
527 /* 527 /*
528 * update dmap lsn 528 * update dmap lsn
529 */ 529 */
530 if (lblkno == lastlblkno) 530 if (lblkno == lastlblkno)
531 continue; 531 continue;
532 532
533 lastlblkno = lblkno; 533 lastlblkno = lblkno;
534 534
535 LOGSYNC_LOCK(log, flags); 535 LOGSYNC_LOCK(log, flags);
536 if (mp->lsn != 0) { 536 if (mp->lsn != 0) {
537 /* inherit older/smaller lsn */ 537 /* inherit older/smaller lsn */
538 logdiff(diffp, mp->lsn, log); 538 logdiff(diffp, mp->lsn, log);
539 if (difft < diffp) { 539 if (difft < diffp) {
540 mp->lsn = lsn; 540 mp->lsn = lsn;
541 541
542 /* move bp after tblock in logsync list */ 542 /* move bp after tblock in logsync list */
543 list_move(&mp->synclist, &tblk->synclist); 543 list_move(&mp->synclist, &tblk->synclist);
544 } 544 }
545 545
546 /* inherit younger/larger clsn */ 546 /* inherit younger/larger clsn */
547 logdiff(difft, tblk->clsn, log); 547 logdiff(difft, tblk->clsn, log);
548 logdiff(diffp, mp->clsn, log); 548 logdiff(diffp, mp->clsn, log);
549 if (difft > diffp) 549 if (difft > diffp)
550 mp->clsn = tblk->clsn; 550 mp->clsn = tblk->clsn;
551 } else { 551 } else {
552 mp->log = log; 552 mp->log = log;
553 mp->lsn = lsn; 553 mp->lsn = lsn;
554 554
555 /* insert bp after tblock in logsync list */ 555 /* insert bp after tblock in logsync list */
556 log->count++; 556 log->count++;
557 list_add(&mp->synclist, &tblk->synclist); 557 list_add(&mp->synclist, &tblk->synclist);
558 558
559 mp->clsn = tblk->clsn; 559 mp->clsn = tblk->clsn;
560 } 560 }
561 LOGSYNC_UNLOCK(log, flags); 561 LOGSYNC_UNLOCK(log, flags);
562 } 562 }
563 563
564 /* write the last buffer. */ 564 /* write the last buffer. */
565 if (mp) { 565 if (mp) {
566 write_metapage(mp); 566 write_metapage(mp);
567 } 567 }
568 568
569 return (0); 569 return (0);
570 } 570 }
571 571
572 572
573 /* 573 /*
574 * NAME: dbNextAG() 574 * NAME: dbNextAG()
575 * 575 *
576 * FUNCTION: find the preferred allocation group for new allocations. 576 * FUNCTION: find the preferred allocation group for new allocations.
577 * 577 *
578 * Within the allocation groups, we maintain a preferred 578 * Within the allocation groups, we maintain a preferred
579 * allocation group which consists of a group with at least 579 * allocation group which consists of a group with at least
580 * average free space. It is the preferred group that we target 580 * average free space. It is the preferred group that we target
581 * new inode allocation towards. The tie-in between inode 581 * new inode allocation towards. The tie-in between inode
582 * allocation and block allocation occurs as we allocate the 582 * allocation and block allocation occurs as we allocate the
583 * first (data) block of an inode and specify the inode (block) 583 * first (data) block of an inode and specify the inode (block)
584 * as the allocation hint for this block. 584 * as the allocation hint for this block.
585 * 585 *
586 * We try to avoid having more than one open file growing in 586 * We try to avoid having more than one open file growing in
587 * an allocation group, as this will lead to fragmentation. 587 * an allocation group, as this will lead to fragmentation.
588 * This differs from the old OS/2 method of trying to keep 588 * This differs from the old OS/2 method of trying to keep
589 * empty ags around for large allocations. 589 * empty ags around for large allocations.
590 * 590 *
591 * PARAMETERS: 591 * PARAMETERS:
592 * ipbmap - pointer to in-core inode for the block map. 592 * ipbmap - pointer to in-core inode for the block map.
593 * 593 *
594 * RETURN VALUES: 594 * RETURN VALUES:
595 * the preferred allocation group number. 595 * the preferred allocation group number.
596 */ 596 */
597 int dbNextAG(struct inode *ipbmap) 597 int dbNextAG(struct inode *ipbmap)
598 { 598 {
599 s64 avgfree; 599 s64 avgfree;
600 int agpref; 600 int agpref;
601 s64 hwm = 0; 601 s64 hwm = 0;
602 int i; 602 int i;
603 int next_best = -1; 603 int next_best = -1;
604 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 604 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
605 605
606 BMAP_LOCK(bmp); 606 BMAP_LOCK(bmp);
607 607
608 /* determine the average number of free blocks within the ags. */ 608 /* determine the average number of free blocks within the ags. */
609 avgfree = (u32)bmp->db_nfree / bmp->db_numag; 609 avgfree = (u32)bmp->db_nfree / bmp->db_numag;
610 610
611 /* 611 /*
612 * if the current preferred ag does not have an active allocator 612 * if the current preferred ag does not have an active allocator
613 * and has at least average freespace, return it 613 * and has at least average freespace, return it
614 */ 614 */
615 agpref = bmp->db_agpref; 615 agpref = bmp->db_agpref;
616 if ((atomic_read(&bmp->db_active[agpref]) == 0) && 616 if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
617 (bmp->db_agfree[agpref] >= avgfree)) 617 (bmp->db_agfree[agpref] >= avgfree))
618 goto unlock; 618 goto unlock;
619 619
620 /* From the last preferred ag, find the next one with at least 620 /* From the last preferred ag, find the next one with at least
621 * average free space. 621 * average free space.
622 */ 622 */
623 for (i = 0 ; i < bmp->db_numag; i++, agpref++) { 623 for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
624 if (agpref == bmp->db_numag) 624 if (agpref == bmp->db_numag)
625 agpref = 0; 625 agpref = 0;
626 626
627 if (atomic_read(&bmp->db_active[agpref])) 627 if (atomic_read(&bmp->db_active[agpref]))
628 /* open file is currently growing in this ag */ 628 /* open file is currently growing in this ag */
629 continue; 629 continue;
630 if (bmp->db_agfree[agpref] >= avgfree) { 630 if (bmp->db_agfree[agpref] >= avgfree) {
631 /* Return this one */ 631 /* Return this one */
632 bmp->db_agpref = agpref; 632 bmp->db_agpref = agpref;
633 goto unlock; 633 goto unlock;
634 } else if (bmp->db_agfree[agpref] > hwm) { 634 } else if (bmp->db_agfree[agpref] > hwm) {
635 /* Less than avg. freespace, but best so far */ 635 /* Less than avg. freespace, but best so far */
636 hwm = bmp->db_agfree[agpref]; 636 hwm = bmp->db_agfree[agpref];
637 next_best = agpref; 637 next_best = agpref;
638 } 638 }
639 } 639 }
640 640
641 /* 641 /*
642 * If no inactive ag was found with average freespace, use the 642 * If no inactive ag was found with average freespace, use the
643 * next best 643 * next best
644 */ 644 */
645 if (next_best != -1) 645 if (next_best != -1)
646 bmp->db_agpref = next_best; 646 bmp->db_agpref = next_best;
647 /* else leave db_agpref unchanged */ 647 /* else leave db_agpref unchanged */
648 unlock: 648 unlock:
649 BMAP_UNLOCK(bmp); 649 BMAP_UNLOCK(bmp);
650 650
651 /* return the preferred group. 651 /* return the preferred group.
652 */ 652 */
653 return (bmp->db_agpref); 653 return (bmp->db_agpref);
654 } 654 }
655 655
/*
 * NAME:	dbAlloc()
 *
 * FUNCTION:	attempt to allocate a specified number of contiguous free
 *		blocks from the working allocation block map.
 *
 *		the block allocation policy uses hints and a multi-step
 *		approach.
 *
 *		for allocation requests smaller than the number of blocks
 *		per dmap, we first try to allocate the new blocks
 *		immediately following the hint.  if these blocks are not
 *		available, we try to allocate blocks near the hint.  if
 *		no blocks near the hint are available, we next try to
 *		allocate within the same dmap as contains the hint.
 *
 *		if no blocks are available in the dmap or the allocation
 *		request is larger than the dmap size, we try to allocate
 *		within the same allocation group as contains the hint.  if
 *		this does not succeed, we finally try to allocate anywhere
 *		within the aggregate.
 *
 *		we also try to allocate anywhere within the aggregate for
 *		allocation requests larger than the allocation group
 *		size or requests that specify no hint value.
 *
 * PARAMETERS:
 *	ip	- pointer to in-core inode;
 *	hint	- allocation hint.
 *	nblocks	- number of contiguous blocks in the range.
 *	results	- on successful return, set to the starting block number
 *		  of the newly allocated contiguous range.
 *
 * RETURN VALUES:
 *	0	- success
 *	-ENOSPC	- insufficient disk resources
 *	-EIO	- i/o error
 */
int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
{
	int rc, agno;
	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
	struct bmap *bmp;
	struct metapage *mp;
	s64 lblkno, blkno;
	struct dmap *dp;
	int l2nb;
	s64 mapSize;
	int writers;

	/* assert that nblocks is valid */
	assert(nblocks > 0);

	/* get the log2 number of blocks to be allocated.
	 * if the number of blocks is not a log2 multiple,
	 * it will be rounded up to the next log2 multiple.
	 */
	l2nb = BLKSTOL2(nblocks);

	bmp = JFS_SBI(ip->i_sb)->bmap;

	mapSize = bmp->db_mapsize;

	/* the hint should be within the map */
	if (hint >= mapSize) {
		jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
		return -EIO;
	}

	/* if the number of blocks to be allocated is greater than the
	 * allocation group size, try to allocate anywhere.
	 * (write lock: dbAllocAny may update summary trees across ags)
	 */
	if (l2nb > bmp->db_agl2size) {
		IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);

		rc = dbAllocAny(bmp, nblocks, l2nb, results);

		goto write_unlock;
	}

	/*
	 * If no hint, let dbNextAG recommend an allocation group
	 */
	if (hint == 0)
		goto pref_ag;

	/* we would like to allocate close to the hint.  adjust the
	 * hint to the block following the hint since the allocators
	 * will start looking for free space starting at this point.
	 */
	blkno = hint + 1;

	if (blkno >= bmp->db_mapsize)
		goto pref_ag;

	agno = blkno >> bmp->db_agl2size;

	/* check if blkno crosses over into a new allocation group.
	 * if so, check if we should allow allocations within this
	 * allocation group.
	 */
	if ((blkno & (bmp->db_agsize - 1)) == 0)
		/* check if the AG is currently being written to.
		 * if so, call dbNextAG() to find a non-busy
		 * AG with sufficient free space.
		 * (NOTE: the comment above sits between the two
		 * unbraced ifs; the inner if is the sole body of
		 * the outer one.)
		 */
		if (atomic_read(&bmp->db_active[agno]))
			goto pref_ag;

	/* check if the allocation request size can be satisfied from a
	 * single dmap.  if so, try to allocate from the dmap containing
	 * the hint using a tiered strategy.
	 */
	if (nblocks <= BPERDMAP) {
		IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);

		/* get the buffer for the dmap containing the hint.
		 * rc defaults to -EIO so a failed read unwinds with
		 * the right error at read_unlock.
		 */
		rc = -EIO;
		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
		if (mp == NULL)
			goto read_unlock;

		dp = (struct dmap *) mp->data;

		/* first, try to satisfy the allocation request with the
		 * blocks beginning at the hint.
		 */
		if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
		    != -ENOSPC) {
			if (rc == 0) {
				*results = blkno;
				mark_metapage_dirty(mp);
			}

			release_metapage(mp);
			goto read_unlock;
		}

		writers = atomic_read(&bmp->db_active[agno]);
		if ((writers > 1) ||
		    ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
			/*
			 * Someone else is writing in this allocation
			 * group.  To avoid fragmenting, try another ag
			 */
			release_metapage(mp);
			IREAD_UNLOCK(ipbmap);
			goto pref_ag;
		}

		/* next, try to satisfy the allocation request with blocks
		 * near the hint.
		 */
		if ((rc =
		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
		    != -ENOSPC) {
			if (rc == 0)
				mark_metapage_dirty(mp);

			release_metapage(mp);
			goto read_unlock;
		}

		/* try to satisfy the allocation request with blocks within
		 * the same dmap as the hint.
		 */
		if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
		    != -ENOSPC) {
			if (rc == 0)
				mark_metapage_dirty(mp);

			release_metapage(mp);
			goto read_unlock;
		}

		/* dmap exhausted: drop the page and the read lock before
		 * escalating to the ag-level (write-locked) search below.
		 */
		release_metapage(mp);
		IREAD_UNLOCK(ipbmap);
	}

	/* try to satisfy the allocation request with blocks within
	 * the same allocation group as the hint.
	 */
	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
		goto write_unlock;

	IWRITE_UNLOCK(ipbmap);


      pref_ag:
	/*
	 * Let dbNextAG recommend a preferred allocation group
	 */
	agno = dbNextAG(ipbmap);
	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);

	/* Try to allocate within this allocation group.  if that fails, try to
	 * allocate anywhere in the map.
	 */
	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
		rc = dbAllocAny(bmp, nblocks, l2nb, results);

      write_unlock:
	IWRITE_UNLOCK(ipbmap);

	return (rc);

      read_unlock:
	IREAD_UNLOCK(ipbmap);

	return (rc);
}
870 870
871 #ifdef _NOTYET 871 #ifdef _NOTYET
872 /* 872 /*
873 * NAME: dbAllocExact() 873 * NAME: dbAllocExact()
874 * 874 *
875 * FUNCTION: try to allocate the requested extent; 875 * FUNCTION: try to allocate the requested extent;
876 * 876 *
877 * PARAMETERS: 877 * PARAMETERS:
878 * ip - pointer to in-core inode; 878 * ip - pointer to in-core inode;
879 * blkno - extent address; 879 * blkno - extent address;
880 * nblocks - extent length; 880 * nblocks - extent length;
881 * 881 *
882 * RETURN VALUES: 882 * RETURN VALUES:
883 * 0 - success 883 * 0 - success
884 * -ENOSPC - insufficient disk resources 884 * -ENOSPC - insufficient disk resources
885 * -EIO - i/o error 885 * -EIO - i/o error
886 */ 886 */
887 int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) 887 int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
888 { 888 {
889 int rc; 889 int rc;
890 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 890 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
891 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; 891 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
892 struct dmap *dp; 892 struct dmap *dp;
893 s64 lblkno; 893 s64 lblkno;
894 struct metapage *mp; 894 struct metapage *mp;
895 895
896 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); 896 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
897 897
898 /* 898 /*
899 * validate extent request: 899 * validate extent request:
900 * 900 *
901 * note: defragfs policy: 901 * note: defragfs policy:
902 * max 64 blocks will be moved. 902 * max 64 blocks will be moved.
903 * allocation request size must be satisfied from a single dmap. 903 * allocation request size must be satisfied from a single dmap.
904 */ 904 */
905 if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { 905 if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
906 IREAD_UNLOCK(ipbmap); 906 IREAD_UNLOCK(ipbmap);
907 return -EINVAL; 907 return -EINVAL;
908 } 908 }
909 909
910 if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { 910 if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
911 /* the free space is no longer available */ 911 /* the free space is no longer available */
912 IREAD_UNLOCK(ipbmap); 912 IREAD_UNLOCK(ipbmap);
913 return -ENOSPC; 913 return -ENOSPC;
914 } 914 }
915 915
916 /* read in the dmap covering the extent */ 916 /* read in the dmap covering the extent */
917 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 917 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
918 mp = read_metapage(ipbmap, lblkno, PSIZE, 0); 918 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
919 if (mp == NULL) { 919 if (mp == NULL) {
920 IREAD_UNLOCK(ipbmap); 920 IREAD_UNLOCK(ipbmap);
921 return -EIO; 921 return -EIO;
922 } 922 }
923 dp = (struct dmap *) mp->data; 923 dp = (struct dmap *) mp->data;
924 924
925 /* try to allocate the requested extent */ 925 /* try to allocate the requested extent */
926 rc = dbAllocNext(bmp, dp, blkno, nblocks); 926 rc = dbAllocNext(bmp, dp, blkno, nblocks);
927 927
928 IREAD_UNLOCK(ipbmap); 928 IREAD_UNLOCK(ipbmap);
929 929
930 if (rc == 0) 930 if (rc == 0)
931 mark_metapage_dirty(mp); 931 mark_metapage_dirty(mp);
932 932
933 release_metapage(mp); 933 release_metapage(mp);
934 934
935 return (rc); 935 return (rc);
936 } 936 }
937 #endif /* _NOTYET */ 937 #endif /* _NOTYET */
938 938
939 /* 939 /*
940 * NAME: dbReAlloc() 940 * NAME: dbReAlloc()
941 * 941 *
942 * FUNCTION: attempt to extend a current allocation by a specified 942 * FUNCTION: attempt to extend a current allocation by a specified
943 * number of blocks. 943 * number of blocks.
944 * 944 *
945 * this routine attempts to satisfy the allocation request 945 * this routine attempts to satisfy the allocation request
946 * by first trying to extend the existing allocation in 946 * by first trying to extend the existing allocation in
947 * place by allocating the additional blocks as the blocks 947 * place by allocating the additional blocks as the blocks
948 * immediately following the current allocation. if these 948 * immediately following the current allocation. if these
949 * blocks are not available, this routine will attempt to 949 * blocks are not available, this routine will attempt to
950 * allocate a new set of contiguous blocks large enough 950 * allocate a new set of contiguous blocks large enough
951 * to cover the existing allocation plus the additional 951 * to cover the existing allocation plus the additional
952 * number of blocks required. 952 * number of blocks required.
953 * 953 *
954 * PARAMETERS: 954 * PARAMETERS:
955 * ip - pointer to in-core inode requiring allocation. 955 * ip - pointer to in-core inode requiring allocation.
956 * blkno - starting block of the current allocation. 956 * blkno - starting block of the current allocation.
957 * nblocks - number of contiguous blocks within the current 957 * nblocks - number of contiguous blocks within the current
958 * allocation. 958 * allocation.
959 * addnblocks - number of blocks to add to the allocation. 959 * addnblocks - number of blocks to add to the allocation.
960 * results - on successful return, set to the starting block number 960 * results - on successful return, set to the starting block number
961 * of the existing allocation if the existing allocation 961 * of the existing allocation if the existing allocation
962 * was extended in place or to a newly allocated contiguous 962 * was extended in place or to a newly allocated contiguous
963 * range if the existing allocation could not be extended 963 * range if the existing allocation could not be extended
964 * in place. 964 * in place.
965 * 965 *
966 * RETURN VALUES: 966 * RETURN VALUES:
967 * 0 - success 967 * 0 - success
968 * -ENOSPC - insufficient disk resources 968 * -ENOSPC - insufficient disk resources
969 * -EIO - i/o error 969 * -EIO - i/o error
970 */ 970 */
971 int 971 int
972 dbReAlloc(struct inode *ip, 972 dbReAlloc(struct inode *ip,
973 s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) 973 s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
974 { 974 {
975 int rc; 975 int rc;
976 976
977 /* try to extend the allocation in place. 977 /* try to extend the allocation in place.
978 */ 978 */
979 if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { 979 if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) {
980 *results = blkno; 980 *results = blkno;
981 return (0); 981 return (0);
982 } else { 982 } else {
983 if (rc != -ENOSPC) 983 if (rc != -ENOSPC)
984 return (rc); 984 return (rc);
985 } 985 }
986 986
987 /* could not extend the allocation in place, so allocate a 987 /* could not extend the allocation in place, so allocate a
988 * new set of blocks for the entire request (i.e. try to get 988 * new set of blocks for the entire request (i.e. try to get
989 * a range of contiguous blocks large enough to cover the 989 * a range of contiguous blocks large enough to cover the
990 * existing allocation plus the additional blocks.) 990 * existing allocation plus the additional blocks.)
991 */ 991 */
992 return (dbAlloc 992 return (dbAlloc
993 (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); 993 (ip, blkno + nblocks - 1, addnblocks + nblocks, results));
994 } 994 }
995 995
996 996
997 /* 997 /*
998 * NAME: dbExtend() 998 * NAME: dbExtend()
999 * 999 *
1000 * FUNCTION: attempt to extend a current allocation by a specified 1000 * FUNCTION: attempt to extend a current allocation by a specified
1001 * number of blocks. 1001 * number of blocks.
1002 * 1002 *
1003 * this routine attempts to satisfy the allocation request 1003 * this routine attempts to satisfy the allocation request
1004 * by first trying to extend the existing allocation in 1004 * by first trying to extend the existing allocation in
1005 * place by allocating the additional blocks as the blocks 1005 * place by allocating the additional blocks as the blocks
1006 * immediately following the current allocation. 1006 * immediately following the current allocation.
1007 * 1007 *
1008 * PARAMETERS: 1008 * PARAMETERS:
1009 * ip - pointer to in-core inode requiring allocation. 1009 * ip - pointer to in-core inode requiring allocation.
1010 * blkno - starting block of the current allocation. 1010 * blkno - starting block of the current allocation.
1011 * nblocks - number of contiguous blocks within the current 1011 * nblocks - number of contiguous blocks within the current
1012 * allocation. 1012 * allocation.
1013 * addnblocks - number of blocks to add to the allocation. 1013 * addnblocks - number of blocks to add to the allocation.
1014 * 1014 *
1015 * RETURN VALUES: 1015 * RETURN VALUES:
1016 * 0 - success 1016 * 0 - success
1017 * -ENOSPC - insufficient disk resources 1017 * -ENOSPC - insufficient disk resources
1018 * -EIO - i/o error 1018 * -EIO - i/o error
1019 */ 1019 */
1020 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) 1020 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1021 { 1021 {
1022 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); 1022 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
1023 s64 lblkno, lastblkno, extblkno; 1023 s64 lblkno, lastblkno, extblkno;
1024 uint rel_block; 1024 uint rel_block;
1025 struct metapage *mp; 1025 struct metapage *mp;
1026 struct dmap *dp; 1026 struct dmap *dp;
1027 int rc; 1027 int rc;
1028 struct inode *ipbmap = sbi->ipbmap; 1028 struct inode *ipbmap = sbi->ipbmap;
1029 struct bmap *bmp; 1029 struct bmap *bmp;
1030 1030
1031 /* 1031 /*
1032 * We don't want a non-aligned extent to cross a page boundary 1032 * We don't want a non-aligned extent to cross a page boundary
1033 */ 1033 */
1034 if (((rel_block = blkno & (sbi->nbperpage - 1))) && 1034 if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
1035 (rel_block + nblocks + addnblocks > sbi->nbperpage)) 1035 (rel_block + nblocks + addnblocks > sbi->nbperpage))
1036 return -ENOSPC; 1036 return -ENOSPC;
1037 1037
1038 /* get the last block of the current allocation */ 1038 /* get the last block of the current allocation */
1039 lastblkno = blkno + nblocks - 1; 1039 lastblkno = blkno + nblocks - 1;
1040 1040
1041 /* determine the block number of the block following 1041 /* determine the block number of the block following
1042 * the existing allocation. 1042 * the existing allocation.
1043 */ 1043 */
1044 extblkno = lastblkno + 1; 1044 extblkno = lastblkno + 1;
1045 1045
1046 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); 1046 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
1047 1047
1048 /* better be within the file system */ 1048 /* better be within the file system */
1049 bmp = sbi->bmap; 1049 bmp = sbi->bmap;
1050 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { 1050 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1051 IREAD_UNLOCK(ipbmap); 1051 IREAD_UNLOCK(ipbmap);
1052 jfs_error(ip->i_sb, 1052 jfs_error(ip->i_sb,
1053 "dbExtend: the block is outside the filesystem"); 1053 "dbExtend: the block is outside the filesystem");
1054 return -EIO; 1054 return -EIO;
1055 } 1055 }
1056 1056
1057 /* we'll attempt to extend the current allocation in place by 1057 /* we'll attempt to extend the current allocation in place by
1058 * allocating the additional blocks as the blocks immediately 1058 * allocating the additional blocks as the blocks immediately
1059 * following the current allocation. we only try to extend the 1059 * following the current allocation. we only try to extend the
1060 * current allocation in place if the number of additional blocks 1060 * current allocation in place if the number of additional blocks
1061 * can fit into a dmap, the last block of the current allocation 1061 * can fit into a dmap, the last block of the current allocation
1062 * is not the last block of the file system, and the start of the 1062 * is not the last block of the file system, and the start of the
1063 * inplace extension is not on an allocation group boundary. 1063 * inplace extension is not on an allocation group boundary.
1064 */ 1064 */
1065 if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || 1065 if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
1066 (extblkno & (bmp->db_agsize - 1)) == 0) { 1066 (extblkno & (bmp->db_agsize - 1)) == 0) {
1067 IREAD_UNLOCK(ipbmap); 1067 IREAD_UNLOCK(ipbmap);
1068 return -ENOSPC; 1068 return -ENOSPC;
1069 } 1069 }
1070 1070
1071 /* get the buffer for the dmap containing the first block 1071 /* get the buffer for the dmap containing the first block
1072 * of the extension. 1072 * of the extension.
1073 */ 1073 */
1074 lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage); 1074 lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
1075 mp = read_metapage(ipbmap, lblkno, PSIZE, 0); 1075 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
1076 if (mp == NULL) { 1076 if (mp == NULL) {
1077 IREAD_UNLOCK(ipbmap); 1077 IREAD_UNLOCK(ipbmap);
1078 return -EIO; 1078 return -EIO;
1079 } 1079 }
1080 1080
1081 dp = (struct dmap *) mp->data; 1081 dp = (struct dmap *) mp->data;
1082 1082
1083 /* try to allocate the blocks immediately following the 1083 /* try to allocate the blocks immediately following the
1084 * current allocation. 1084 * current allocation.
1085 */ 1085 */
1086 rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks); 1086 rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks);
1087 1087
1088 IREAD_UNLOCK(ipbmap); 1088 IREAD_UNLOCK(ipbmap);
1089 1089
1090 /* were we successful ? */ 1090 /* were we successful ? */
1091 if (rc == 0) 1091 if (rc == 0)
1092 write_metapage(mp); 1092 write_metapage(mp);
1093 else 1093 else
1094 /* we were not successful */ 1094 /* we were not successful */
1095 release_metapage(mp); 1095 release_metapage(mp);
1096 1096
1097 1097
1098 return (rc); 1098 return (rc);
1099 } 1099 }
1100 1100
1101 1101
/*
 * NAME:	dbAllocNext()
 *
 * FUNCTION:	attempt to allocate the blocks of the specified block
 *		range within a dmap.
 *
 * PARAMETERS:
 *	bmp	-  pointer to bmap descriptor
 *	dp	-  pointer to dmap.
 *	blkno	-  starting block number of the range.
 *	nblocks	-  number of contiguous free blocks of the range.
 *
 * RETURN VALUES:
 *	0	- success
 *	-ENOSPC	- insufficient disk resources
 *	-EIO	- i/o error
 *
 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
 */
static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
		       int nblocks)
{
	int dbitno, word, rembits, nb, nwords, wbitno, nw;
	int l2size;
	s8 *leaf;
	u32 mask;

	/* sanity check: if the on-disk leaf index does not match the
	 * expected constant, the dmap page is corrupt.
	 */
	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
		jfs_error(bmp->db_ipbmap->i_sb,
			  "dbAllocNext: Corrupt dmap page");
		return -EIO;
	}

	/* pick up a pointer to the leaves of the dmap tree.
	 */
	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);

	/* determine the bit number and word within the dmap of the
	 * starting block.
	 */
	dbitno = blkno & (BPERDMAP - 1);
	word = dbitno >> L2DBWORD;

	/* check if the specified block range is contained within
	 * this dmap.
	 */
	if (dbitno + nblocks > BPERDMAP)
		return -ENOSPC;

	/* check if the starting leaf indicates that anything
	 * is free.
	 */
	if (leaf[word] == NOFREE)
		return -ENOSPC;

	/* check the dmaps words corresponding to block range to see
	 * if the block range is free.  not all bits of the first and
	 * last words may be contained within the block range.  if this
	 * is the case, we'll work against those words (i.e. partial first
	 * and/or last) on an individual basis (a single pass) and examine
	 * the actual bits to determine if they are free.  a single pass
	 * will be used for all dmap words fully contained within the
	 * specified range.  within this pass, the leaves of the dmap
	 * tree will be examined to determine if the blocks are free.  a
	 * single leaf may describe the free space of multiple dmap
	 * words, so we may visit only a subset of the actual leaves
	 * corresponding to the dmap words of the block range.
	 */
	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
		/* determine the bit number within the word and
		 * the number of bits within the word.
		 */
		wbitno = dbitno & (DBWORD - 1);
		nb = min(rembits, DBWORD - wbitno);

		/* check if only part of the word is to be examined.
		 */
		if (nb < DBWORD) {
			/* check if the bits are free: mask has a 1 for
			 * each bit of the range within this word; each
			 * of those bits must be clear (free) in wmap.
			 */
			mask = (ONES << (DBWORD - nb) >> wbitno);
			if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
				return -ENOSPC;

			word += 1;
		} else {
			/* one or more dmap words are fully contained
			 * within the block range.  determine how many
			 * words and how many bits.
			 */
			nwords = rembits >> L2DBWORD;
			nb = nwords << L2DBWORD;

			/* now examine the appropriate leaves to determine
			 * if the blocks are free.
			 */
			while (nwords > 0) {
				/* does the leaf describe any free space ?
				 * a leaf below BUDMIN cannot cover a whole
				 * free word, so the range cannot be free.
				 */
				if (leaf[word] < BUDMIN)
					return -ENOSPC;

				/* determine the l2 number of bits provided
				 * by this leaf.
				 */
				l2size =
				    min((int)leaf[word], NLSTOL2BSZ(nwords));

				/* determine how many words were handled.
				 */
				nw = BUDSIZE(l2size, BUDMIN);

				nwords -= nw;
				word += nw;
			}
		}
	}

	/* allocate the blocks.
	 */
	return (dbAllocDmap(bmp, dp, blkno, nblocks));
}
1224 1224
1225 1225
1226 /* 1226 /*
1227 * NAME: dbAllocNear() 1227 * NAME: dbAllocNear()
1228 * 1228 *
1229 * FUNCTION: attempt to allocate a number of contiguous free blocks near 1229 * FUNCTION: attempt to allocate a number of contiguous free blocks near
1230 * a specified block (hint) within a dmap. 1230 * a specified block (hint) within a dmap.
1231 * 1231 *
1232 * starting with the dmap leaf that covers the hint, we'll 1232 * starting with the dmap leaf that covers the hint, we'll
1233 * check the next four contiguous leaves for sufficient free 1233 * check the next four contiguous leaves for sufficient free
1234 * space. if sufficient free space is found, we'll allocate 1234 * space. if sufficient free space is found, we'll allocate
1235 * the desired free space. 1235 * the desired free space.
1236 * 1236 *
1237 * PARAMETERS: 1237 * PARAMETERS:
1238 * bmp - pointer to bmap descriptor 1238 * bmp - pointer to bmap descriptor
1239 * dp - pointer to dmap. 1239 * dp - pointer to dmap.
1240 * blkno - block number to allocate near. 1240 * blkno - block number to allocate near.
1241 * nblocks - actual number of contiguous free blocks desired. 1241 * nblocks - actual number of contiguous free blocks desired.
1242 * l2nb - log2 number of contiguous free blocks desired. 1242 * l2nb - log2 number of contiguous free blocks desired.
1243 * results - on successful return, set to the starting block number 1243 * results - on successful return, set to the starting block number
1244 * of the newly allocated range. 1244 * of the newly allocated range.
1245 * 1245 *
1246 * RETURN VALUES: 1246 * RETURN VALUES:
1247 * 0 - success 1247 * 0 - success
1248 * -ENOSPC - insufficient disk resources 1248 * -ENOSPC - insufficient disk resources
1249 * -EIO - i/o error 1249 * -EIO - i/o error
1250 * 1250 *
1251 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1251 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1252 */ 1252 */
1253 static int 1253 static int
1254 dbAllocNear(struct bmap * bmp, 1254 dbAllocNear(struct bmap * bmp,
1255 struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results) 1255 struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
1256 { 1256 {
1257 int word, lword, rc; 1257 int word, lword, rc;
1258 s8 *leaf; 1258 s8 *leaf;
1259 1259
1260 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1260 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1261 jfs_error(bmp->db_ipbmap->i_sb, 1261 jfs_error(bmp->db_ipbmap->i_sb,
1262 "dbAllocNear: Corrupt dmap page"); 1262 "dbAllocNear: Corrupt dmap page");
1263 return -EIO; 1263 return -EIO;
1264 } 1264 }
1265 1265
1266 leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); 1266 leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1267 1267
1268 /* determine the word within the dmap that holds the hint 1268 /* determine the word within the dmap that holds the hint
1269 * (i.e. blkno). also, determine the last word in the dmap 1269 * (i.e. blkno). also, determine the last word in the dmap
1270 * that we'll include in our examination. 1270 * that we'll include in our examination.
1271 */ 1271 */
1272 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; 1272 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
1273 lword = min(word + 4, LPERDMAP); 1273 lword = min(word + 4, LPERDMAP);
1274 1274
1275 /* examine the leaves for sufficient free space. 1275 /* examine the leaves for sufficient free space.
1276 */ 1276 */
1277 for (; word < lword; word++) { 1277 for (; word < lword; word++) {
1278 /* does the leaf describe sufficient free space ? 1278 /* does the leaf describe sufficient free space ?
1279 */ 1279 */
1280 if (leaf[word] < l2nb) 1280 if (leaf[word] < l2nb)
1281 continue; 1281 continue;
1282 1282
1283 /* determine the block number within the file system 1283 /* determine the block number within the file system
1284 * of the first block described by this dmap word. 1284 * of the first block described by this dmap word.
1285 */ 1285 */
1286 blkno = le64_to_cpu(dp->start) + (word << L2DBWORD); 1286 blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
1287 1287
1288 /* if not all bits of the dmap word are free, get the 1288 /* if not all bits of the dmap word are free, get the
1289 * starting bit number within the dmap word of the required 1289 * starting bit number within the dmap word of the required
1290 * string of free bits and adjust the block number with the 1290 * string of free bits and adjust the block number with the
1291 * value. 1291 * value.
1292 */ 1292 */
1293 if (leaf[word] < BUDMIN) 1293 if (leaf[word] < BUDMIN)
1294 blkno += 1294 blkno +=
1295 dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb); 1295 dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
1296 1296
1297 /* allocate the blocks. 1297 /* allocate the blocks.
1298 */ 1298 */
1299 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) 1299 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1300 *results = blkno; 1300 *results = blkno;
1301 1301
1302 return (rc); 1302 return (rc);
1303 } 1303 }
1304 1304
1305 return -ENOSPC; 1305 return -ENOSPC;
1306 } 1306 }
1307 1307
1308 1308
/*
 * NAME:	dbAllocAG()
 *
 * FUNCTION:	attempt to allocate the specified number of contiguous
 *		free blocks within the specified allocation group.
 *
 *		unless the allocation group size is equal to the number
 *		of blocks per dmap, the dmap control pages will be used to
 *		find the required free space, if available.  we start the
 *		search at the highest dmap control page level which
 *		distinctly describes the allocation group's free space
 *		(i.e. the highest level at which the allocation group's
 *		free space is not mixed in with that of any other group).
 *		in addition, we start the search within this level at a
 *		height of the dmapctl dmtree at which the nodes distinctly
 *		describe the allocation group's free space.  at this height,
 *		the allocation group's free space may be represented by 1
 *		or two sub-trees, depending on the allocation group size.
 *		we search the top nodes of these subtrees left to right for
 *		sufficient free space.  if sufficient free space is found,
 *		the subtree is searched to find the leftmost leaf that
 *		has free space.  once we have made it to the leaf, we
 *		move the search to the next lower level dmap control page
 *		corresponding to this leaf.  we continue down the dmap control
 *		pages until we find the dmap that contains or starts the
 *		sufficient free space and we allocate at this dmap.
 *
 *		if the allocation group size is equal to the dmap size,
 *		we'll start at the dmap corresponding to the allocation
 *		group and attempt the allocation at this level.
 *
 *		the dmap control page search is also not performed if the
 *		allocation group is completely free and we go to the first
 *		dmap of the allocation group to do the allocation.  this is
 *		done because the allocation group may be part (not the first
 *		part) of a larger binary buddy system, causing the dmap
 *		control pages to indicate no free space (NOFREE) within
 *		the allocation group.
 *
 * PARAMETERS:
 *	bmp	-  pointer to bmap descriptor
 *	agno	-  allocation group number.
 *	nblocks	-  actual number of contiguous free blocks desired.
 *	l2nb	-  log2 number of contiguous free blocks desired.
 *	results	-  on successful return, set to the starting block number
 *		   of the newly allocated range.
 *
 * RETURN VALUES:
 *	0	- success
 *	-ENOSPC	- insufficient disk resources
 *	-EIO	- i/o error
 *
 * note: IWRITE_LOCK(ipbmap) held on entry/exit;
 */
static int
dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
{
	struct metapage *mp;
	struct dmapctl *dcp;
	int rc, ti, i, k, m, n, agperlev;
	s64 blkno, lblkno;
	int budmin;

	/* allocation request should not be for more than the
	 * allocation group size.
	 */
	if (l2nb > bmp->db_agl2size) {
		jfs_error(bmp->db_ipbmap->i_sb,
			  "dbAllocAG: allocation request is larger than the "
			  "allocation group size");
		return -EIO;
	}

	/* determine the starting block number of the allocation
	 * group.
	 */
	blkno = (s64) agno << bmp->db_agl2size;

	/* check if the allocation group size is the minimum allocation
	 * group size or if the allocation group is completely free. if
	 * the allocation group size is the minimum size of BPERDMAP (i.e.
	 * 1 dmap), there is no need to search the dmap control page (below)
	 * that fully describes the allocation group since the allocation
	 * group is already fully described by a dmap.  in this case, we
	 * just call dbAllocCtl() to search the dmap tree and allocate the
	 * required space if available.
	 *
	 * if the allocation group is completely free, dbAllocCtl() is
	 * also called to allocate the required space.  this is done for
	 * two reasons.  first, it makes no sense searching the dmap control
	 * pages for free space when we know that free space exists.  second,
	 * the dmap control pages may indicate that the allocation group
	 * has no free space if the allocation group is part (not the first
	 * part) of a larger binary buddy system.
	 */
	if (bmp->db_agsize == BPERDMAP
	    || bmp->db_agfree[agno] == bmp->db_agsize) {
		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
		if ((rc == -ENOSPC) &&
		    (bmp->db_agfree[agno] == bmp->db_agsize)) {
			/* a completely free AG must be able to satisfy the
			 * request; -ENOSPC here means the map is corrupt.
			 */
			printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n",
			       (unsigned long long) blkno,
			       (unsigned long long) nblocks);
			jfs_error(bmp->db_ipbmap->i_sb,
				  "dbAllocAG: dbAllocCtl failed in free AG");
		}
		return (rc);
	}

	/* the buffer for the dmap control page that fully describes the
	 * allocation group.
	 */
	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
	if (mp == NULL)
		return -EIO;
	dcp = (struct dmapctl *) mp->data;
	budmin = dcp->budmin;

	/* sanity check the on-disk leaf index of the dmapctl page. */
	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
		jfs_error(bmp->db_ipbmap->i_sb,
			  "dbAllocAG: Corrupt dmapctl page");
		release_metapage(mp);
		return -EIO;
	}

	/* search the subtree(s) of the dmap control page that describes
	 * the allocation group, looking for sufficient free space.  to begin,
	 * determine how many allocation groups are represented in a dmap
	 * control page at the control page level (i.e. L0, L1, L2) that
	 * fully describes an allocation group. next, determine the starting
	 * tree index of this allocation group within the control page.
	 */
	agperlev =
	    (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
	ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));

	/* dmap control page trees fan-out by 4 and a single allocation
	 * group may be described by 1 or 2 subtrees within the ag level
	 * dmap control page, depending upon the ag size. examine the ag's
	 * subtrees for sufficient free space, starting with the leftmost
	 * subtree.
	 */
	for (i = 0; i < bmp->db_agwidth; i++, ti++) {
		/* is there sufficient free space ?
		 */
		if (l2nb > dcp->stree[ti])
			continue;

		/* sufficient free space found in a subtree. now search down
		 * the subtree to find the leftmost leaf that describes this
		 * free space.
		 */
		for (k = bmp->db_agheigth; k > 0; k--) {
			/* visit the 4 children of node ti; descend into the
			 * leftmost child with enough free space.
			 */
			for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
				if (l2nb <= dcp->stree[m + n]) {
					ti = m + n;
					break;
				}
			}
			if (n == 4) {
				/* the parent claimed free space that no
				 * child provides: tree is inconsistent.
				 */
				jfs_error(bmp->db_ipbmap->i_sb,
					  "dbAllocAG: failed descending stree");
				release_metapage(mp);
				return -EIO;
			}
		}

		/* determine the block number within the file system
		 * that corresponds to this leaf.
		 */
		if (bmp->db_aglevel == 2)
			blkno = 0;
		else if (bmp->db_aglevel == 1)
			blkno &= ~(MAXL1SIZE - 1);
		else		/* bmp->db_aglevel == 0 */
			blkno &= ~(MAXL0SIZE - 1);

		blkno +=
		    ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;

		/* release the buffer in preparation for going down
		 * the next level of dmap control pages.
		 */
		release_metapage(mp);

		/* check if we need to continue to search down the lower
		 * level dmap control pages.  we need to if the number of
		 * blocks required is less than maximum number of blocks
		 * described at the next lower level.
		 */
		if (l2nb < budmin) {

			/* search the lower level dmap control pages to get
			 * the starting block number of the dmap that
			 * contains or starts off the free space.
			 */
			if ((rc =
			     dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
				       &blkno))) {
				if (rc == -ENOSPC) {
					/* the upper level promised space the
					 * lower level cannot deliver.
					 */
					jfs_error(bmp->db_ipbmap->i_sb,
						  "dbAllocAG: control page "
						  "inconsistent");
					return -EIO;
				}
				return (rc);
			}
		}

		/* allocate the blocks.
		 */
		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
		if (rc == -ENOSPC) {
			jfs_error(bmp->db_ipbmap->i_sb,
				  "dbAllocAG: unable to allocate blocks");
			rc = -EIO;
		}
		return (rc);
	}

	/* no space in the allocation group.  release the buffer and
	 * return -ENOSPC.
	 */
	release_metapage(mp);

	return -ENOSPC;
}
1537 1537
1538 1538
1539 /* 1539 /*
1540 * NAME: dbAllocAny() 1540 * NAME: dbAllocAny()
1541 * 1541 *
1542 * FUNCTION: attempt to allocate the specified number of contiguous 1542 * FUNCTION: attempt to allocate the specified number of contiguous
1543 * free blocks anywhere in the file system. 1543 * free blocks anywhere in the file system.
1544 * 1544 *
1545 * dbAllocAny() attempts to find the sufficient free space by 1545 * dbAllocAny() attempts to find the sufficient free space by
1546 * searching down the dmap control pages, starting with the 1546 * searching down the dmap control pages, starting with the
1547 * highest level (i.e. L0, L1, L2) control page. if free space 1547 * highest level (i.e. L0, L1, L2) control page. if free space
1548 * large enough to satisfy the desired free space is found, the 1548 * large enough to satisfy the desired free space is found, the
1549 * desired free space is allocated. 1549 * desired free space is allocated.
1550 * 1550 *
1551 * PARAMETERS: 1551 * PARAMETERS:
1552 * bmp - pointer to bmap descriptor 1552 * bmp - pointer to bmap descriptor
1553 * nblocks - actual number of contiguous free blocks desired. 1553 * nblocks - actual number of contiguous free blocks desired.
1554 * l2nb - log2 number of contiguous free blocks desired. 1554 * l2nb - log2 number of contiguous free blocks desired.
1555 * results - on successful return, set to the starting block number 1555 * results - on successful return, set to the starting block number
1556 * of the newly allocated range. 1556 * of the newly allocated range.
1557 * 1557 *
1558 * RETURN VALUES: 1558 * RETURN VALUES:
1559 * 0 - success 1559 * 0 - success
1560 * -ENOSPC - insufficient disk resources 1560 * -ENOSPC - insufficient disk resources
1561 * -EIO - i/o error 1561 * -EIO - i/o error
1562 * 1562 *
1563 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1563 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1564 */ 1564 */
1565 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) 1565 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1566 { 1566 {
1567 int rc; 1567 int rc;
1568 s64 blkno = 0; 1568 s64 blkno = 0;
1569 1569
1570 /* starting with the top level dmap control page, search 1570 /* starting with the top level dmap control page, search
1571 * down the dmap control levels for sufficient free space. 1571 * down the dmap control levels for sufficient free space.
1572 * if free space is found, dbFindCtl() returns the starting 1572 * if free space is found, dbFindCtl() returns the starting
1573 * block number of the dmap that contains or starts off the 1573 * block number of the dmap that contains or starts off the
1574 * range of free space. 1574 * range of free space.
1575 */ 1575 */
1576 if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno))) 1576 if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
1577 return (rc); 1577 return (rc);
1578 1578
1579 /* allocate the blocks. 1579 /* allocate the blocks.
1580 */ 1580 */
1581 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1581 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1582 if (rc == -ENOSPC) { 1582 if (rc == -ENOSPC) {
1583 jfs_error(bmp->db_ipbmap->i_sb, 1583 jfs_error(bmp->db_ipbmap->i_sb,
1584 "dbAllocAny: unable to allocate blocks"); 1584 "dbAllocAny: unable to allocate blocks");
1585 return -EIO; 1585 return -EIO;
1586 } 1586 }
1587 return (rc); 1587 return (rc);
1588 } 1588 }
1589 1589
1590 1590
1591 /* 1591 /*
1592 * NAME: dbFindCtl() 1592 * NAME: dbFindCtl()
1593 * 1593 *
1594 * FUNCTION: starting at a specified dmap control page level and block 1594 * FUNCTION: starting at a specified dmap control page level and block
1595 * number, search down the dmap control levels for a range of 1595 * number, search down the dmap control levels for a range of
1596 * contiguous free blocks large enough to satisfy an allocation 1596 * contiguous free blocks large enough to satisfy an allocation
1597 * request for the specified number of free blocks. 1597 * request for the specified number of free blocks.
1598 * 1598 *
1599 * if sufficient contiguous free blocks are found, this routine 1599 * if sufficient contiguous free blocks are found, this routine
1600 * returns the starting block number within a dmap page that 1600 * returns the starting block number within a dmap page that
1601 * contains or starts a range of contiqious free blocks that 1601 * contains or starts a range of contiqious free blocks that
1602 * is sufficient in size. 1602 * is sufficient in size.
1603 * 1603 *
1604 * PARAMETERS: 1604 * PARAMETERS:
1605 * bmp - pointer to bmap descriptor 1605 * bmp - pointer to bmap descriptor
1606 * level - starting dmap control page level. 1606 * level - starting dmap control page level.
1607 * l2nb - log2 number of contiguous free blocks desired. 1607 * l2nb - log2 number of contiguous free blocks desired.
1608 * *blkno - on entry, starting block number for conducting the search. 1608 * *blkno - on entry, starting block number for conducting the search.
1609 * on successful return, the first block within a dmap page 1609 * on successful return, the first block within a dmap page
1610 * that contains or starts a range of contiguous free blocks. 1610 * that contains or starts a range of contiguous free blocks.
1611 * 1611 *
1612 * RETURN VALUES: 1612 * RETURN VALUES:
1613 * 0 - success 1613 * 0 - success
1614 * -ENOSPC - insufficient disk resources 1614 * -ENOSPC - insufficient disk resources
1615 * -EIO - i/o error 1615 * -EIO - i/o error
1616 * 1616 *
1617 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1617 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1618 */ 1618 */
static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
{
	int rc, leafidx, lev;
	s64 b, lblkno;
	struct dmapctl *dcp;
	int budmin;
	struct metapage *mp;

	/* starting at the specified dmap control page level and block
	 * number, search down the dmap control levels for the starting
	 * block number of a dmap page that contains or starts off
	 * sufficient free blocks.
	 */
	for (lev = level, b = *blkno; lev >= 0; lev--) {
		/* get the buffer of the dmap control page for the block
		 * number and level (i.e. L0, L1, L2).
		 */
		lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
		if (mp == NULL)
			return -EIO;
		dcp = (struct dmapctl *) mp->data;
		/* budmin is the log2 of the number of blocks covered by
		 * one leaf of this control page's tree; it is used below
		 * to turn a leaf index into a block offset.
		 */
		budmin = dcp->budmin;

		if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
			jfs_error(bmp->db_ipbmap->i_sb,
				  "dbFindCtl: Corrupt dmapctl page");
			release_metapage(mp);
			return -EIO;
		}

		/* search the tree within the dmap control page for
		 * sufficient free space.  if sufficient free space is
		 * found, dbFindLeaf() returns the index of the leaf at
		 * which free space was found.
		 */
		rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);

		/* release the buffer.
		 */
		release_metapage(mp);

		/* space found ?
		 */
		if (rc) {
			/* free space must be found at the starting level
			 * (the caller already checked the summary info);
			 * failing at a lower level means the upper level
			 * pages over-reported free space.
			 */
			if (lev != level) {
				jfs_error(bmp->db_ipbmap->i_sb,
					  "dbFindCtl: dmap inconsistent");
				return -EIO;
			}
			return -ENOSPC;
		}

		/* adjust the block number to reflect the location within
		 * the dmap control page (i.e. the leaf) at which free
		 * space was found.
		 */
		b += (((s64) leafidx) << budmin);

		/* we stop the search at this dmap control page level if
		 * the number of blocks required is greater than or equal
		 * to the maximum number of blocks described at the next
		 * (lower) level.
		 */
		if (l2nb >= budmin)
			break;
	}

	*blkno = b;
	return (0);
}
1690 1690
1691 1691
1692 /* 1692 /*
1693 * NAME: dbAllocCtl() 1693 * NAME: dbAllocCtl()
1694 * 1694 *
1695 * FUNCTION: attempt to allocate a specified number of contiguous 1695 * FUNCTION: attempt to allocate a specified number of contiguous
1696 * blocks starting within a specific dmap. 1696 * blocks starting within a specific dmap.
1697 * 1697 *
1698 * this routine is called by higher level routines that search 1698 * this routine is called by higher level routines that search
1699 * the dmap control pages above the actual dmaps for contiguous 1699 * the dmap control pages above the actual dmaps for contiguous
1700 * free space. the result of successful searches by these 1700 * free space. the result of successful searches by these
1701 * routines are the starting block numbers within dmaps, with 1701 * routines are the starting block numbers within dmaps, with
1702 * the dmaps themselves containing the desired contiguous free 1702 * the dmaps themselves containing the desired contiguous free
1703 * space or starting a contiguous free space of desired size 1703 * space or starting a contiguous free space of desired size
1704 * that is made up of the blocks of one or more dmaps. these 1704 * that is made up of the blocks of one or more dmaps. these
1705 * calls should not fail due to insufficent resources. 1705 * calls should not fail due to insufficent resources.
1706 * 1706 *
1707 * this routine is called in some cases where it is not known 1707 * this routine is called in some cases where it is not known
1708 * whether it will fail due to insufficient resources. more 1708 * whether it will fail due to insufficient resources. more
1709 * specifically, this occurs when allocating from an allocation 1709 * specifically, this occurs when allocating from an allocation
1710 * group whose size is equal to the number of blocks per dmap. 1710 * group whose size is equal to the number of blocks per dmap.
1711 * in this case, the dmap control pages are not examined prior 1711 * in this case, the dmap control pages are not examined prior
1712 * to calling this routine (to save pathlength) and the call 1712 * to calling this routine (to save pathlength) and the call
1713 * might fail. 1713 * might fail.
1714 * 1714 *
1715 * for a request size that fits within a dmap, this routine relies 1715 * for a request size that fits within a dmap, this routine relies
1716 * upon the dmap's dmtree to find the requested contiguous free 1716 * upon the dmap's dmtree to find the requested contiguous free
1717 * space. for request sizes that are larger than a dmap, the 1717 * space. for request sizes that are larger than a dmap, the
1718 * requested free space will start at the first block of the 1718 * requested free space will start at the first block of the
1719 * first dmap (i.e. blkno). 1719 * first dmap (i.e. blkno).
1720 * 1720 *
1721 * PARAMETERS: 1721 * PARAMETERS:
1722 * bmp - pointer to bmap descriptor 1722 * bmp - pointer to bmap descriptor
1723 * nblocks - actual number of contiguous free blocks to allocate. 1723 * nblocks - actual number of contiguous free blocks to allocate.
1724 * l2nb - log2 number of contiguous free blocks to allocate. 1724 * l2nb - log2 number of contiguous free blocks to allocate.
1725 * blkno - starting block number of the dmap to start the allocation 1725 * blkno - starting block number of the dmap to start the allocation
1726 * from. 1726 * from.
1727 * results - on successful return, set to the starting block number 1727 * results - on successful return, set to the starting block number
1728 * of the newly allocated range. 1728 * of the newly allocated range.
1729 * 1729 *
1730 * RETURN VALUES: 1730 * RETURN VALUES:
1731 * 0 - success 1731 * 0 - success
1732 * -ENOSPC - insufficient disk resources 1732 * -ENOSPC - insufficient disk resources
1733 * -EIO - i/o error 1733 * -EIO - i/o error
1734 * 1734 *
1735 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1735 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1736 */ 1736 */
static int
dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
{
	int rc, nb;
	s64 b, lblkno, n;
	struct metapage *mp;
	struct dmap *dp;

	/* check if the allocation request is confined to a single dmap.
	 */
	if (l2nb <= L2BPERDMAP) {
		/* get the buffer for the dmap.
		 */
		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
		if (mp == NULL)
			return -EIO;
		dp = (struct dmap *) mp->data;

		/* try to allocate the blocks.
		 */
		rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
		if (rc == 0)
			mark_metapage_dirty(mp);

		release_metapage(mp);

		return (rc);
	}

	/* allocation request involving multiple dmaps. it must start on
	 * a dmap boundary.
	 */
	assert((blkno & (BPERDMAP - 1)) == 0);

	/* allocate the blocks dmap by dmap.
	 */
	for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
		/* get the buffer for the dmap.
		 */
		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
		if (mp == NULL) {
			rc = -EIO;
			goto backout;
		}
		dp = (struct dmap *) mp->data;

		/* the dmap better be all free.
		 * (a dmap tree root of L2BPERDMAP means every block in
		 * the dmap is free; the control page search guaranteed
		 * this, so anything else is corruption.)
		 */
		if (dp->tree.stree[ROOT] != L2BPERDMAP) {
			release_metapage(mp);
			jfs_error(bmp->db_ipbmap->i_sb,
				  "dbAllocCtl: the dmap is not all free");
			rc = -EIO;
			goto backout;
		}

		/* determine how many blocks to allocate from this dmap.
		 */
		nb = min(n, (s64)BPERDMAP);

		/* allocate the blocks from the dmap.
		 */
		if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
			release_metapage(mp);
			goto backout;
		}

		/* write the buffer.
		 */
		write_metapage(mp);
	}

	/* set the results (starting block number) and return.
	 */
	*results = blkno;
	return (0);

	/* something failed in handling an allocation request involving
	 * multiple dmaps.  we'll try to clean up by backing out any
	 * allocation that has already happened for this request.  if
	 * we fail in backing out the allocation, we'll mark the file
	 * system to indicate that blocks have been leaked.
	 */
      backout:

	/* try to backout the allocations dmap by dmap.
	 * (nblocks - n) is the count of blocks successfully allocated
	 * before the failure; every dmap allocated so far was filled
	 * completely, so each one backs out exactly BPERDMAP blocks.
	 */
	for (n = nblocks - n, b = blkno; n > 0;
	     n -= BPERDMAP, b += BPERDMAP) {
		/* get the buffer for this dmap.
		 */
		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
		if (mp == NULL) {
			/* could not back out.  mark the file system
			 * to indicate that we have leaked blocks.
			 */
			jfs_error(bmp->db_ipbmap->i_sb,
				  "dbAllocCtl: I/O Error: Block Leakage.");
			continue;
		}
		dp = (struct dmap *) mp->data;

		/* free the blocks is this dmap.
		 */
		if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
			/* could not back out.  mark the file system
			 * to indicate that we have leaked blocks.
			 */
			release_metapage(mp);
			jfs_error(bmp->db_ipbmap->i_sb,
				  "dbAllocCtl: Block Leakage.");
			continue;
		}

		/* write the buffer.
		 */
		write_metapage(mp);
	}

	/* return the original failure; the backout is best-effort. */
	return (rc);
}
1861 1861
1862 1862
1863 /* 1863 /*
1864 * NAME: dbAllocDmapLev() 1864 * NAME: dbAllocDmapLev()
1865 * 1865 *
1866 * FUNCTION: attempt to allocate a specified number of contiguous blocks 1866 * FUNCTION: attempt to allocate a specified number of contiguous blocks
1867 * from a specified dmap. 1867 * from a specified dmap.
1868 * 1868 *
1869 * this routine checks if the contiguous blocks are available. 1869 * this routine checks if the contiguous blocks are available.
1870 * if so, nblocks of blocks are allocated; otherwise, ENOSPC is 1870 * if so, nblocks of blocks are allocated; otherwise, ENOSPC is
1871 * returned. 1871 * returned.
1872 * 1872 *
1873 * PARAMETERS: 1873 * PARAMETERS:
1874 * mp - pointer to bmap descriptor 1874 * mp - pointer to bmap descriptor
1875 * dp - pointer to dmap to attempt to allocate blocks from. 1875 * dp - pointer to dmap to attempt to allocate blocks from.
1876 * l2nb - log2 number of contiguous block desired. 1876 * l2nb - log2 number of contiguous block desired.
1877 * nblocks - actual number of contiguous block desired. 1877 * nblocks - actual number of contiguous block desired.
1878 * results - on successful return, set to the starting block number 1878 * results - on successful return, set to the starting block number
1879 * of the newly allocated range. 1879 * of the newly allocated range.
1880 * 1880 *
1881 * RETURN VALUES: 1881 * RETURN VALUES:
1882 * 0 - success 1882 * 0 - success
1883 * -ENOSPC - insufficient disk resources 1883 * -ENOSPC - insufficient disk resources
1884 * -EIO - i/o error 1884 * -EIO - i/o error
1885 * 1885 *
1886 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or 1886 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
1887 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; 1887 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
1888 */ 1888 */
1889 static int 1889 static int
1890 dbAllocDmapLev(struct bmap * bmp, 1890 dbAllocDmapLev(struct bmap * bmp,
1891 struct dmap * dp, int nblocks, int l2nb, s64 * results) 1891 struct dmap * dp, int nblocks, int l2nb, s64 * results)
1892 { 1892 {
1893 s64 blkno; 1893 s64 blkno;
1894 int leafidx, rc; 1894 int leafidx, rc;
1895 1895
1896 /* can't be more than a dmaps worth of blocks */ 1896 /* can't be more than a dmaps worth of blocks */
1897 assert(l2nb <= L2BPERDMAP); 1897 assert(l2nb <= L2BPERDMAP);
1898 1898
1899 /* search the tree within the dmap page for sufficient 1899 /* search the tree within the dmap page for sufficient
1900 * free space. if sufficient free space is found, dbFindLeaf() 1900 * free space. if sufficient free space is found, dbFindLeaf()
1901 * returns the index of the leaf at which free space was found. 1901 * returns the index of the leaf at which free space was found.
1902 */ 1902 */
1903 if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) 1903 if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
1904 return -ENOSPC; 1904 return -ENOSPC;
1905 1905
1906 /* determine the block number within the file system corresponding 1906 /* determine the block number within the file system corresponding
1907 * to the leaf at which free space was found. 1907 * to the leaf at which free space was found.
1908 */ 1908 */
1909 blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD); 1909 blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
1910 1910
1911 /* if not all bits of the dmap word are free, get the starting 1911 /* if not all bits of the dmap word are free, get the starting
1912 * bit number within the dmap word of the required string of free 1912 * bit number within the dmap word of the required string of free
1913 * bits and adjust the block number with this value. 1913 * bits and adjust the block number with this value.
1914 */ 1914 */
1915 if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN) 1915 if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
1916 blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb); 1916 blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
1917 1917
1918 /* allocate the blocks */ 1918 /* allocate the blocks */
1919 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) 1919 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1920 *results = blkno; 1920 *results = blkno;
1921 1921
1922 return (rc); 1922 return (rc);
1923 } 1923 }
1924 1924
1925 1925
1926 /* 1926 /*
1927 * NAME: dbAllocDmap() 1927 * NAME: dbAllocDmap()
1928 * 1928 *
1929 * FUNCTION: adjust the disk allocation map to reflect the allocation 1929 * FUNCTION: adjust the disk allocation map to reflect the allocation
1930 * of a specified block range within a dmap. 1930 * of a specified block range within a dmap.
1931 * 1931 *
1932 * this routine allocates the specified blocks from the dmap 1932 * this routine allocates the specified blocks from the dmap
1933 * through a call to dbAllocBits(). if the allocation of the 1933 * through a call to dbAllocBits(). if the allocation of the
1934 * block range causes the maximum string of free blocks within 1934 * block range causes the maximum string of free blocks within
1935 * the dmap to change (i.e. the value of the root of the dmap's 1935 * the dmap to change (i.e. the value of the root of the dmap's
1936 * dmtree), this routine will cause this change to be reflected 1936 * dmtree), this routine will cause this change to be reflected
1937 * up through the appropriate levels of the dmap control pages 1937 * up through the appropriate levels of the dmap control pages
1938 * by a call to dbAdjCtl() for the L0 dmap control page that 1938 * by a call to dbAdjCtl() for the L0 dmap control page that
1939 * covers this dmap. 1939 * covers this dmap.
1940 * 1940 *
1941 * PARAMETERS: 1941 * PARAMETERS:
1942 * bmp - pointer to bmap descriptor 1942 * bmp - pointer to bmap descriptor
1943 * dp - pointer to dmap to allocate the block range from. 1943 * dp - pointer to dmap to allocate the block range from.
1944 * blkno - starting block number of the block to be allocated. 1944 * blkno - starting block number of the block to be allocated.
1945 * nblocks - number of blocks to be allocated. 1945 * nblocks - number of blocks to be allocated.
1946 * 1946 *
1947 * RETURN VALUES: 1947 * RETURN VALUES:
1948 * 0 - success 1948 * 0 - success
1949 * -EIO - i/o error 1949 * -EIO - i/o error
1950 * 1950 *
1951 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 1951 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
1952 */ 1952 */
1953 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, 1953 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
1954 int nblocks) 1954 int nblocks)
1955 { 1955 {
1956 s8 oldroot; 1956 s8 oldroot;
1957 int rc; 1957 int rc;
1958 1958
1959 /* save the current value of the root (i.e. maximum free string) 1959 /* save the current value of the root (i.e. maximum free string)
1960 * of the dmap tree. 1960 * of the dmap tree.
1961 */ 1961 */
1962 oldroot = dp->tree.stree[ROOT]; 1962 oldroot = dp->tree.stree[ROOT];
1963 1963
1964 /* allocate the specified (blocks) bits */ 1964 /* allocate the specified (blocks) bits */
1965 dbAllocBits(bmp, dp, blkno, nblocks); 1965 dbAllocBits(bmp, dp, blkno, nblocks);
1966 1966
1967 /* if the root has not changed, done. */ 1967 /* if the root has not changed, done. */
1968 if (dp->tree.stree[ROOT] == oldroot) 1968 if (dp->tree.stree[ROOT] == oldroot)
1969 return (0); 1969 return (0);
1970 1970
1971 /* root changed. bubble the change up to the dmap control pages. 1971 /* root changed. bubble the change up to the dmap control pages.
1972 * if the adjustment of the upper level control pages fails, 1972 * if the adjustment of the upper level control pages fails,
1973 * backout the bit allocation (thus making everything consistent). 1973 * backout the bit allocation (thus making everything consistent).
1974 */ 1974 */
1975 if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0))) 1975 if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0)))
1976 dbFreeBits(bmp, dp, blkno, nblocks); 1976 dbFreeBits(bmp, dp, blkno, nblocks);
1977 1977
1978 return (rc); 1978 return (rc);
1979 } 1979 }
1980 1980
1981 1981
1982 /* 1982 /*
1983 * NAME: dbFreeDmap() 1983 * NAME: dbFreeDmap()
1984 * 1984 *
1985 * FUNCTION: adjust the disk allocation map to reflect the allocation 1985 * FUNCTION: adjust the disk allocation map to reflect the allocation
1986 * of a specified block range within a dmap. 1986 * of a specified block range within a dmap.
1987 * 1987 *
1988 * this routine frees the specified blocks from the dmap through 1988 * this routine frees the specified blocks from the dmap through
1989 * a call to dbFreeBits(). if the deallocation of the block range 1989 * a call to dbFreeBits(). if the deallocation of the block range
1990 * causes the maximum string of free blocks within the dmap to 1990 * causes the maximum string of free blocks within the dmap to
1991 * change (i.e. the value of the root of the dmap's dmtree), this 1991 * change (i.e. the value of the root of the dmap's dmtree), this
1992 * routine will cause this change to be reflected up through the 1992 * routine will cause this change to be reflected up through the
1993 * appropriate levels of the dmap control pages by a call to 1993 * appropriate levels of the dmap control pages by a call to
1994 * dbAdjCtl() for the L0 dmap control page that covers this dmap. 1994 * dbAdjCtl() for the L0 dmap control page that covers this dmap.
1995 * 1995 *
1996 * PARAMETERS: 1996 * PARAMETERS:
1997 * bmp - pointer to bmap descriptor 1997 * bmp - pointer to bmap descriptor
1998 * dp - pointer to dmap to free the block range from. 1998 * dp - pointer to dmap to free the block range from.
1999 * blkno - starting block number of the block to be freed. 1999 * blkno - starting block number of the block to be freed.
2000 * nblocks - number of blocks to be freed. 2000 * nblocks - number of blocks to be freed.
2001 * 2001 *
2002 * RETURN VALUES: 2002 * RETURN VALUES:
2003 * 0 - success 2003 * 0 - success
2004 * -EIO - i/o error 2004 * -EIO - i/o error
2005 * 2005 *
2006 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2006 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2007 */ 2007 */
static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
		      int nblocks)
{
	s8 oldroot;
	int rc = 0, word;

	/* save the current value of the root (i.e. maximum free string)
	 * of the dmap tree.
	 */
	oldroot = dp->tree.stree[ROOT];

	/* free the specified (blocks) bits */
	rc = dbFreeBits(bmp, dp, blkno, nblocks);

	/* if error or the root has not changed, done. */
	if (rc || (dp->tree.stree[ROOT] == oldroot))
		return (rc);

	/* root changed. bubble the change up to the dmap control pages.
	 * if the adjustment of the upper level control pages fails,
	 * backout the deallocation.
	 */
	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
		/* leaf (dmap word) index covering the start of the
		 * freed range.
		 */
		word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;

		/* as part of backing out the deallocation, we will have
		 * to back split the dmap tree if the deallocation caused
		 * the freed blocks to become part of a larger binary buddy
		 * system.
		 */
		if (dp->tree.stree[word] == NOFREE)
			dbBackSplit((dmtree_t *) & dp->tree, word);

		/* re-allocate the bits to restore the pre-free state */
		dbAllocBits(bmp, dp, blkno, nblocks);
	}

	return (rc);
}
2046 2046
2047 2047
2048 /* 2048 /*
2049 * NAME: dbAllocBits() 2049 * NAME: dbAllocBits()
2050 * 2050 *
2051 * FUNCTION: allocate a specified block range from a dmap. 2051 * FUNCTION: allocate a specified block range from a dmap.
2052 * 2052 *
2053 * this routine updates the dmap to reflect the working 2053 * this routine updates the dmap to reflect the working
2054 * state allocation of the specified block range. it directly 2054 * state allocation of the specified block range. it directly
2055 * updates the bits of the working map and causes the adjustment 2055 * updates the bits of the working map and causes the adjustment
2056 * of the binary buddy system described by the dmap's dmtree 2056 * of the binary buddy system described by the dmap's dmtree
2057 * leaves to reflect the bits allocated. it also causes the 2057 * leaves to reflect the bits allocated. it also causes the
2058 * dmap's dmtree, as a whole, to reflect the allocated range. 2058 * dmap's dmtree, as a whole, to reflect the allocated range.
2059 * 2059 *
2060 * PARAMETERS: 2060 * PARAMETERS:
2061 * bmp - pointer to bmap descriptor 2061 * bmp - pointer to bmap descriptor
2062 * dp - pointer to dmap to allocate bits from. 2062 * dp - pointer to dmap to allocate bits from.
2063 * blkno - starting block number of the bits to be allocated. 2063 * blkno - starting block number of the bits to be allocated.
2064 * nblocks - number of bits to be allocated. 2064 * nblocks - number of bits to be allocated.
2065 * 2065 *
2066 * RETURN VALUES: none 2066 * RETURN VALUES: none
2067 * 2067 *
2068 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2068 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2069 */ 2069 */
2070 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, 2070 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2071 int nblocks) 2071 int nblocks)
2072 { 2072 {
2073 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; 2073 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2074 dmtree_t *tp = (dmtree_t *) & dp->tree; 2074 dmtree_t *tp = (dmtree_t *) & dp->tree;
2075 int size; 2075 int size;
2076 s8 *leaf; 2076 s8 *leaf;
2077 2077
2078 /* pick up a pointer to the leaves of the dmap tree */ 2078 /* pick up a pointer to the leaves of the dmap tree */
2079 leaf = dp->tree.stree + LEAFIND; 2079 leaf = dp->tree.stree + LEAFIND;
2080 2080
2081 /* determine the bit number and word within the dmap of the 2081 /* determine the bit number and word within the dmap of the
2082 * starting block. 2082 * starting block.
2083 */ 2083 */
2084 dbitno = blkno & (BPERDMAP - 1); 2084 dbitno = blkno & (BPERDMAP - 1);
2085 word = dbitno >> L2DBWORD; 2085 word = dbitno >> L2DBWORD;
2086 2086
2087 /* block range better be within the dmap */ 2087 /* block range better be within the dmap */
2088 assert(dbitno + nblocks <= BPERDMAP); 2088 assert(dbitno + nblocks <= BPERDMAP);
2089 2089
2090 /* allocate the bits of the dmap's words corresponding to the block 2090 /* allocate the bits of the dmap's words corresponding to the block
2091 * range. not all bits of the first and last words may be contained 2091 * range. not all bits of the first and last words may be contained
2092 * within the block range. if this is the case, we'll work against 2092 * within the block range. if this is the case, we'll work against
2093 * those words (i.e. partial first and/or last) on an individual basis 2093 * those words (i.e. partial first and/or last) on an individual basis
2094 * (a single pass), allocating the bits of interest by hand and 2094 * (a single pass), allocating the bits of interest by hand and
2095 * updating the leaf corresponding to the dmap word. a single pass 2095 * updating the leaf corresponding to the dmap word. a single pass
2096 * will be used for all dmap words fully contained within the 2096 * will be used for all dmap words fully contained within the
2097 * specified range. within this pass, the bits of all fully contained 2097 * specified range. within this pass, the bits of all fully contained
2098 * dmap words will be marked as free in a single shot and the leaves 2098 * dmap words will be marked as free in a single shot and the leaves
2099 * will be updated. a single leaf may describe the free space of 2099 * will be updated. a single leaf may describe the free space of
2100 * multiple dmap words, so we may update only a subset of the actual 2100 * multiple dmap words, so we may update only a subset of the actual
2101 * leaves corresponding to the dmap words of the block range. 2101 * leaves corresponding to the dmap words of the block range.
2102 */ 2102 */
2103 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { 2103 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2104 /* determine the bit number within the word and 2104 /* determine the bit number within the word and
2105 * the number of bits within the word. 2105 * the number of bits within the word.
2106 */ 2106 */
2107 wbitno = dbitno & (DBWORD - 1); 2107 wbitno = dbitno & (DBWORD - 1);
2108 nb = min(rembits, DBWORD - wbitno); 2108 nb = min(rembits, DBWORD - wbitno);
2109 2109
2110 /* check if only part of a word is to be allocated. 2110 /* check if only part of a word is to be allocated.
2111 */ 2111 */
2112 if (nb < DBWORD) { 2112 if (nb < DBWORD) {
2113 /* allocate (set to 1) the appropriate bits within 2113 /* allocate (set to 1) the appropriate bits within
2114 * this dmap word. 2114 * this dmap word.
2115 */ 2115 */
2116 dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) 2116 dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
2117 >> wbitno); 2117 >> wbitno);
2118 2118
2119 /* update the leaf for this dmap word. in addition 2119 /* update the leaf for this dmap word. in addition
2120 * to setting the leaf value to the binary buddy max 2120 * to setting the leaf value to the binary buddy max
2121 * of the updated dmap word, dbSplit() will split 2121 * of the updated dmap word, dbSplit() will split
2122 * the binary system of the leaves if need be. 2122 * the binary system of the leaves if need be.
2123 */ 2123 */
2124 dbSplit(tp, word, BUDMIN, 2124 dbSplit(tp, word, BUDMIN,
2125 dbMaxBud((u8 *) & dp->wmap[word])); 2125 dbMaxBud((u8 *) & dp->wmap[word]));
2126 2126
2127 word += 1; 2127 word += 1;
2128 } else { 2128 } else {
2129 /* one or more dmap words are fully contained 2129 /* one or more dmap words are fully contained
2130 * within the block range. determine how many 2130 * within the block range. determine how many
2131 * words and allocate (set to 1) the bits of these 2131 * words and allocate (set to 1) the bits of these
2132 * words. 2132 * words.
2133 */ 2133 */
2134 nwords = rembits >> L2DBWORD; 2134 nwords = rembits >> L2DBWORD;
2135 memset(&dp->wmap[word], (int) ONES, nwords * 4); 2135 memset(&dp->wmap[word], (int) ONES, nwords * 4);
2136 2136
2137 /* determine how many bits. 2137 /* determine how many bits.
2138 */ 2138 */
2139 nb = nwords << L2DBWORD; 2139 nb = nwords << L2DBWORD;
2140 2140
2141 /* now update the appropriate leaves to reflect 2141 /* now update the appropriate leaves to reflect
2142 * the allocated words. 2142 * the allocated words.
2143 */ 2143 */
2144 for (; nwords > 0; nwords -= nw) { 2144 for (; nwords > 0; nwords -= nw) {
2145 if (leaf[word] < BUDMIN) { 2145 if (leaf[word] < BUDMIN) {
2146 jfs_error(bmp->db_ipbmap->i_sb, 2146 jfs_error(bmp->db_ipbmap->i_sb,
2147 "dbAllocBits: leaf page " 2147 "dbAllocBits: leaf page "
2148 "corrupt"); 2148 "corrupt");
2149 break; 2149 break;
2150 } 2150 }
2151 2151
2152 /* determine what the leaf value should be 2152 /* determine what the leaf value should be
2153 * updated to as the minimum of the l2 number 2153 * updated to as the minimum of the l2 number
2154 * of bits being allocated and the l2 number 2154 * of bits being allocated and the l2 number
2155 * of bits currently described by this leaf. 2155 * of bits currently described by this leaf.
2156 */ 2156 */
2157 size = min((int)leaf[word], NLSTOL2BSZ(nwords)); 2157 size = min((int)leaf[word], NLSTOL2BSZ(nwords));
2158 2158
2159 /* update the leaf to reflect the allocation. 2159 /* update the leaf to reflect the allocation.
2160 * in addition to setting the leaf value to 2160 * in addition to setting the leaf value to
2161 * NOFREE, dbSplit() will split the binary 2161 * NOFREE, dbSplit() will split the binary
2162 * system of the leaves to reflect the current 2162 * system of the leaves to reflect the current
2163 * allocation (size). 2163 * allocation (size).
2164 */ 2164 */
2165 dbSplit(tp, word, size, NOFREE); 2165 dbSplit(tp, word, size, NOFREE);
2166 2166
2167 /* get the number of dmap words handled */ 2167 /* get the number of dmap words handled */
2168 nw = BUDSIZE(size, BUDMIN); 2168 nw = BUDSIZE(size, BUDMIN);
2169 word += nw; 2169 word += nw;
2170 } 2170 }
2171 } 2171 }
2172 } 2172 }
2173 2173
2174 /* update the free count for this dmap */ 2174 /* update the free count for this dmap */
2175 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); 2175 le32_add_cpu(&dp->nfree, -nblocks);
2176 2176
2177 BMAP_LOCK(bmp); 2177 BMAP_LOCK(bmp);
2178 2178
2179 /* if this allocation group is completely free, 2179 /* if this allocation group is completely free,
2180 * update the maximum allocation group number if this allocation 2180 * update the maximum allocation group number if this allocation
2181 * group is the new max. 2181 * group is the new max.
2182 */ 2182 */
2183 agno = blkno >> bmp->db_agl2size; 2183 agno = blkno >> bmp->db_agl2size;
2184 if (agno > bmp->db_maxag) 2184 if (agno > bmp->db_maxag)
2185 bmp->db_maxag = agno; 2185 bmp->db_maxag = agno;
2186 2186
2187 /* update the free count for the allocation group and map */ 2187 /* update the free count for the allocation group and map */
2188 bmp->db_agfree[agno] -= nblocks; 2188 bmp->db_agfree[agno] -= nblocks;
2189 bmp->db_nfree -= nblocks; 2189 bmp->db_nfree -= nblocks;
2190 2190
2191 BMAP_UNLOCK(bmp); 2191 BMAP_UNLOCK(bmp);
2192 } 2192 }
2193 2193
2194 2194
2195 /* 2195 /*
2196 * NAME: dbFreeBits() 2196 * NAME: dbFreeBits()
2197 * 2197 *
2198 * FUNCTION: free a specified block range from a dmap. 2198 * FUNCTION: free a specified block range from a dmap.
2199 * 2199 *
2200 * this routine updates the dmap to reflect the working 2200 * this routine updates the dmap to reflect the working
2201 * state allocation of the specified block range. it directly 2201 * state allocation of the specified block range. it directly
2202 * updates the bits of the working map and causes the adjustment 2202 * updates the bits of the working map and causes the adjustment
2203 * of the binary buddy system described by the dmap's dmtree 2203 * of the binary buddy system described by the dmap's dmtree
2204 * leaves to reflect the bits freed. it also causes the dmap's 2204 * leaves to reflect the bits freed. it also causes the dmap's
2205 * dmtree, as a whole, to reflect the deallocated range. 2205 * dmtree, as a whole, to reflect the deallocated range.
2206 * 2206 *
2207 * PARAMETERS: 2207 * PARAMETERS:
2208 * bmp - pointer to bmap descriptor 2208 * bmp - pointer to bmap descriptor
2209 * dp - pointer to dmap to free bits from. 2209 * dp - pointer to dmap to free bits from.
2210 * blkno - starting block number of the bits to be freed. 2210 * blkno - starting block number of the bits to be freed.
2211 * nblocks - number of bits to be freed. 2211 * nblocks - number of bits to be freed.
2212 * 2212 *
2213 * RETURN VALUES: 0 for success 2213 * RETURN VALUES: 0 for success
2214 * 2214 *
2215 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2215 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2216 */ 2216 */
2217 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, 2217 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2218 int nblocks) 2218 int nblocks)
2219 { 2219 {
2220 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; 2220 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2221 dmtree_t *tp = (dmtree_t *) & dp->tree; 2221 dmtree_t *tp = (dmtree_t *) & dp->tree;
2222 int rc = 0; 2222 int rc = 0;
2223 int size; 2223 int size;
2224 2224
2225 /* determine the bit number and word within the dmap of the 2225 /* determine the bit number and word within the dmap of the
2226 * starting block. 2226 * starting block.
2227 */ 2227 */
2228 dbitno = blkno & (BPERDMAP - 1); 2228 dbitno = blkno & (BPERDMAP - 1);
2229 word = dbitno >> L2DBWORD; 2229 word = dbitno >> L2DBWORD;
2230 2230
2231 /* block range better be within the dmap. 2231 /* block range better be within the dmap.
2232 */ 2232 */
2233 assert(dbitno + nblocks <= BPERDMAP); 2233 assert(dbitno + nblocks <= BPERDMAP);
2234 2234
2235 /* free the bits of the dmaps words corresponding to the block range. 2235 /* free the bits of the dmaps words corresponding to the block range.
2236 * not all bits of the first and last words may be contained within 2236 * not all bits of the first and last words may be contained within
2237 * the block range. if this is the case, we'll work against those 2237 * the block range. if this is the case, we'll work against those
2238 * words (i.e. partial first and/or last) on an individual basis 2238 * words (i.e. partial first and/or last) on an individual basis
2239 * (a single pass), freeing the bits of interest by hand and updating 2239 * (a single pass), freeing the bits of interest by hand and updating
2240 * the leaf corresponding to the dmap word. a single pass will be used 2240 * the leaf corresponding to the dmap word. a single pass will be used
2241 * for all dmap words fully contained within the specified range. 2241 * for all dmap words fully contained within the specified range.
2242 * within this pass, the bits of all fully contained dmap words will 2242 * within this pass, the bits of all fully contained dmap words will
2243 * be marked as free in a single shot and the leaves will be updated. a 2243 * be marked as free in a single shot and the leaves will be updated. a
2244 * single leaf may describe the free space of multiple dmap words, 2244 * single leaf may describe the free space of multiple dmap words,
2245 * so we may update only a subset of the actual leaves corresponding 2245 * so we may update only a subset of the actual leaves corresponding
2246 * to the dmap words of the block range. 2246 * to the dmap words of the block range.
2247 * 2247 *
2248 * dbJoin() is used to update leaf values and will join the binary 2248 * dbJoin() is used to update leaf values and will join the binary
2249 * buddy system of the leaves if the new leaf values indicate this 2249 * buddy system of the leaves if the new leaf values indicate this
2250 * should be done. 2250 * should be done.
2251 */ 2251 */
2252 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { 2252 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2253 /* determine the bit number within the word and 2253 /* determine the bit number within the word and
2254 * the number of bits within the word. 2254 * the number of bits within the word.
2255 */ 2255 */
2256 wbitno = dbitno & (DBWORD - 1); 2256 wbitno = dbitno & (DBWORD - 1);
2257 nb = min(rembits, DBWORD - wbitno); 2257 nb = min(rembits, DBWORD - wbitno);
2258 2258
2259 /* check if only part of a word is to be freed. 2259 /* check if only part of a word is to be freed.
2260 */ 2260 */
2261 if (nb < DBWORD) { 2261 if (nb < DBWORD) {
2262 /* free (zero) the appropriate bits within this 2262 /* free (zero) the appropriate bits within this
2263 * dmap word. 2263 * dmap word.
2264 */ 2264 */
2265 dp->wmap[word] &= 2265 dp->wmap[word] &=
2266 cpu_to_le32(~(ONES << (DBWORD - nb) 2266 cpu_to_le32(~(ONES << (DBWORD - nb)
2267 >> wbitno)); 2267 >> wbitno));
2268 2268
2269 /* update the leaf for this dmap word. 2269 /* update the leaf for this dmap word.
2270 */ 2270 */
2271 rc = dbJoin(tp, word, 2271 rc = dbJoin(tp, word,
2272 dbMaxBud((u8 *) & dp->wmap[word])); 2272 dbMaxBud((u8 *) & dp->wmap[word]));
2273 if (rc) 2273 if (rc)
2274 return rc; 2274 return rc;
2275 2275
2276 word += 1; 2276 word += 1;
2277 } else { 2277 } else {
2278 /* one or more dmap words are fully contained 2278 /* one or more dmap words are fully contained
2279 * within the block range. determine how many 2279 * within the block range. determine how many
2280 * words and free (zero) the bits of these words. 2280 * words and free (zero) the bits of these words.
2281 */ 2281 */
2282 nwords = rembits >> L2DBWORD; 2282 nwords = rembits >> L2DBWORD;
2283 memset(&dp->wmap[word], 0, nwords * 4); 2283 memset(&dp->wmap[word], 0, nwords * 4);
2284 2284
2285 /* determine how many bits. 2285 /* determine how many bits.
2286 */ 2286 */
2287 nb = nwords << L2DBWORD; 2287 nb = nwords << L2DBWORD;
2288 2288
2289 /* now update the appropriate leaves to reflect 2289 /* now update the appropriate leaves to reflect
2290 * the freed words. 2290 * the freed words.
2291 */ 2291 */
2292 for (; nwords > 0; nwords -= nw) { 2292 for (; nwords > 0; nwords -= nw) {
2293 /* determine what the leaf value should be 2293 /* determine what the leaf value should be
2294 * updated to as the minimum of the l2 number 2294 * updated to as the minimum of the l2 number
2295 * of bits being freed and the l2 (max) number 2295 * of bits being freed and the l2 (max) number
2296 * of bits that can be described by this leaf. 2296 * of bits that can be described by this leaf.
2297 */ 2297 */
2298 size = 2298 size =
2299 min(LITOL2BSZ 2299 min(LITOL2BSZ
2300 (word, L2LPERDMAP, BUDMIN), 2300 (word, L2LPERDMAP, BUDMIN),
2301 NLSTOL2BSZ(nwords)); 2301 NLSTOL2BSZ(nwords));
2302 2302
2303 /* update the leaf. 2303 /* update the leaf.
2304 */ 2304 */
2305 rc = dbJoin(tp, word, size); 2305 rc = dbJoin(tp, word, size);
2306 if (rc) 2306 if (rc)
2307 return rc; 2307 return rc;
2308 2308
2309 /* get the number of dmap words handled. 2309 /* get the number of dmap words handled.
2310 */ 2310 */
2311 nw = BUDSIZE(size, BUDMIN); 2311 nw = BUDSIZE(size, BUDMIN);
2312 word += nw; 2312 word += nw;
2313 } 2313 }
2314 } 2314 }
2315 } 2315 }
2316 2316
2317 /* update the free count for this dmap. 2317 /* update the free count for this dmap.
2318 */ 2318 */
2319 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); 2319 le32_add_cpu(&dp->nfree, nblocks);
2320 2320
2321 BMAP_LOCK(bmp); 2321 BMAP_LOCK(bmp);
2322 2322
2323 /* update the free count for the allocation group and 2323 /* update the free count for the allocation group and
2324 * map. 2324 * map.
2325 */ 2325 */
2326 agno = blkno >> bmp->db_agl2size; 2326 agno = blkno >> bmp->db_agl2size;
2327 bmp->db_nfree += nblocks; 2327 bmp->db_nfree += nblocks;
2328 bmp->db_agfree[agno] += nblocks; 2328 bmp->db_agfree[agno] += nblocks;
2329 2329
2330 /* check if this allocation group is not completely free and 2330 /* check if this allocation group is not completely free and
2331 * if it is currently the maximum (rightmost) allocation group. 2331 * if it is currently the maximum (rightmost) allocation group.
2332 * if so, establish the new maximum allocation group number by 2332 * if so, establish the new maximum allocation group number by
2333 * searching left for the first allocation group with allocation. 2333 * searching left for the first allocation group with allocation.
2334 */ 2334 */
2335 if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) || 2335 if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
2336 (agno == bmp->db_numag - 1 && 2336 (agno == bmp->db_numag - 1 &&
2337 bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) { 2337 bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
2338 while (bmp->db_maxag > 0) { 2338 while (bmp->db_maxag > 0) {
2339 bmp->db_maxag -= 1; 2339 bmp->db_maxag -= 1;
2340 if (bmp->db_agfree[bmp->db_maxag] != 2340 if (bmp->db_agfree[bmp->db_maxag] !=
2341 bmp->db_agsize) 2341 bmp->db_agsize)
2342 break; 2342 break;
2343 } 2343 }
2344 2344
2345 /* re-establish the allocation group preference if the 2345 /* re-establish the allocation group preference if the
2346 * current preference is right of the maximum allocation 2346 * current preference is right of the maximum allocation
2347 * group. 2347 * group.
2348 */ 2348 */
2349 if (bmp->db_agpref > bmp->db_maxag) 2349 if (bmp->db_agpref > bmp->db_maxag)
2350 bmp->db_agpref = bmp->db_maxag; 2350 bmp->db_agpref = bmp->db_maxag;
2351 } 2351 }
2352 2352
2353 BMAP_UNLOCK(bmp); 2353 BMAP_UNLOCK(bmp);
2354 2354
2355 return 0; 2355 return 0;
2356 } 2356 }
2357 2357
2358 2358
2359 /* 2359 /*
2360 * NAME: dbAdjCtl() 2360 * NAME: dbAdjCtl()
2361 * 2361 *
2362 * FUNCTION: adjust a dmap control page at a specified level to reflect 2362 * FUNCTION: adjust a dmap control page at a specified level to reflect
2363 * the change in a lower level dmap or dmap control page's 2363 * the change in a lower level dmap or dmap control page's
2364 * maximum string of free blocks (i.e. a change in the root 2364 * maximum string of free blocks (i.e. a change in the root
2365 * of the lower level object's dmtree) due to the allocation 2365 * of the lower level object's dmtree) due to the allocation
2366 * or deallocation of a range of blocks with a single dmap. 2366 * or deallocation of a range of blocks with a single dmap.
2367 * 2367 *
2368 * on entry, this routine is provided with the new value of 2368 * on entry, this routine is provided with the new value of
2369 * the lower level dmap or dmap control page root and the 2369 * the lower level dmap or dmap control page root and the
2370 * starting block number of the block range whose allocation 2370 * starting block number of the block range whose allocation
2371 * or deallocation resulted in the root change. this range 2371 * or deallocation resulted in the root change. this range
2372 * is respresented by a single leaf of the current dmapctl 2372 * is respresented by a single leaf of the current dmapctl
2373 * and the leaf will be updated with this value, possibly 2373 * and the leaf will be updated with this value, possibly
2374 * causing a binary buddy system within the leaves to be 2374 * causing a binary buddy system within the leaves to be
2375 * split or joined. the update may also cause the dmapctl's 2375 * split or joined. the update may also cause the dmapctl's
2376 * dmtree to be updated. 2376 * dmtree to be updated.
2377 * 2377 *
2378 * if the adjustment of the dmap control page, itself, causes its 2378 * if the adjustment of the dmap control page, itself, causes its
2379 * root to change, this change will be bubbled up to the next dmap 2379 * root to change, this change will be bubbled up to the next dmap
2380 * control level by a recursive call to this routine, specifying 2380 * control level by a recursive call to this routine, specifying
2381 * the new root value and the next dmap control page level to 2381 * the new root value and the next dmap control page level to
2382 * be adjusted. 2382 * be adjusted.
2383 * PARAMETERS: 2383 * PARAMETERS:
2384 * bmp - pointer to bmap descriptor 2384 * bmp - pointer to bmap descriptor
2385 * blkno - the first block of a block range within a dmap. it is 2385 * blkno - the first block of a block range within a dmap. it is
2386 * the allocation or deallocation of this block range that 2386 * the allocation or deallocation of this block range that
2387 * requires the dmap control page to be adjusted. 2387 * requires the dmap control page to be adjusted.
2388 * newval - the new value of the lower level dmap or dmap control 2388 * newval - the new value of the lower level dmap or dmap control
2389 * page root. 2389 * page root.
2390 * alloc - 'true' if adjustment is due to an allocation. 2390 * alloc - 'true' if adjustment is due to an allocation.
2391 * level - current level of dmap control page (i.e. L0, L1, L2) to 2391 * level - current level of dmap control page (i.e. L0, L1, L2) to
2392 * be adjusted. 2392 * be adjusted.
2393 * 2393 *
2394 * RETURN VALUES: 2394 * RETURN VALUES:
2395 * 0 - success 2395 * 0 - success
2396 * -EIO - i/o error 2396 * -EIO - i/o error
2397 * 2397 *
2398 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2398 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2399 */ 2399 */
2400 static int 2400 static int
2401 dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) 2401 dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2402 { 2402 {
2403 struct metapage *mp; 2403 struct metapage *mp;
2404 s8 oldroot; 2404 s8 oldroot;
2405 int oldval; 2405 int oldval;
2406 s64 lblkno; 2406 s64 lblkno;
2407 struct dmapctl *dcp; 2407 struct dmapctl *dcp;
2408 int rc, leafno, ti; 2408 int rc, leafno, ti;
2409 2409
2410 /* get the buffer for the dmap control page for the specified 2410 /* get the buffer for the dmap control page for the specified
2411 * block number and control page level. 2411 * block number and control page level.
2412 */ 2412 */
2413 lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level); 2413 lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
2414 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); 2414 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
2415 if (mp == NULL) 2415 if (mp == NULL)
2416 return -EIO; 2416 return -EIO;
2417 dcp = (struct dmapctl *) mp->data; 2417 dcp = (struct dmapctl *) mp->data;
2418 2418
2419 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 2419 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2420 jfs_error(bmp->db_ipbmap->i_sb, 2420 jfs_error(bmp->db_ipbmap->i_sb,
2421 "dbAdjCtl: Corrupt dmapctl page"); 2421 "dbAdjCtl: Corrupt dmapctl page");
2422 release_metapage(mp); 2422 release_metapage(mp);
2423 return -EIO; 2423 return -EIO;
2424 } 2424 }
2425 2425
2426 /* determine the leaf number corresponding to the block and 2426 /* determine the leaf number corresponding to the block and
2427 * the index within the dmap control tree. 2427 * the index within the dmap control tree.
2428 */ 2428 */
2429 leafno = BLKTOCTLLEAF(blkno, dcp->budmin); 2429 leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
2430 ti = leafno + le32_to_cpu(dcp->leafidx); 2430 ti = leafno + le32_to_cpu(dcp->leafidx);
2431 2431
2432 /* save the current leaf value and the current root level (i.e. 2432 /* save the current leaf value and the current root level (i.e.
2433 * maximum l2 free string described by this dmapctl). 2433 * maximum l2 free string described by this dmapctl).
2434 */ 2434 */
2435 oldval = dcp->stree[ti]; 2435 oldval = dcp->stree[ti];
2436 oldroot = dcp->stree[ROOT]; 2436 oldroot = dcp->stree[ROOT];
2437 2437
2438 /* check if this is a control page update for an allocation. 2438 /* check if this is a control page update for an allocation.
2439 * if so, update the leaf to reflect the new leaf value using 2439 * if so, update the leaf to reflect the new leaf value using
2440 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate 2440 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate
2441 * the leaf with the new value. in addition to updating the 2441 * the leaf with the new value. in addition to updating the
2442 * leaf, dbSplit() will also split the binary buddy system of 2442 * leaf, dbSplit() will also split the binary buddy system of
2443 * the leaves, if required, and bubble new values within the 2443 * the leaves, if required, and bubble new values within the
2444 * dmapctl tree, if required. similarly, dbJoin() will join 2444 * dmapctl tree, if required. similarly, dbJoin() will join
2445 * the binary buddy system of leaves and bubble new values up 2445 * the binary buddy system of leaves and bubble new values up
2446 * the dmapctl tree as required by the new leaf value. 2446 * the dmapctl tree as required by the new leaf value.
2447 */ 2447 */
2448 if (alloc) { 2448 if (alloc) {
2449 /* check if we are in the middle of a binary buddy 2449 /* check if we are in the middle of a binary buddy
2450 * system. this happens when we are performing the 2450 * system. this happens when we are performing the
2451 * first allocation out of an allocation group that 2451 * first allocation out of an allocation group that
2452 * is part (not the first part) of a larger binary 2452 * is part (not the first part) of a larger binary
2453 * buddy system. if we are in the middle, back split 2453 * buddy system. if we are in the middle, back split
2454 * the system prior to calling dbSplit() which assumes 2454 * the system prior to calling dbSplit() which assumes
2455 * that it is at the front of a binary buddy system. 2455 * that it is at the front of a binary buddy system.
2456 */ 2456 */
2457 if (oldval == NOFREE) { 2457 if (oldval == NOFREE) {
2458 rc = dbBackSplit((dmtree_t *) dcp, leafno); 2458 rc = dbBackSplit((dmtree_t *) dcp, leafno);
2459 if (rc) 2459 if (rc)
2460 return rc; 2460 return rc;
2461 oldval = dcp->stree[ti]; 2461 oldval = dcp->stree[ti];
2462 } 2462 }
2463 dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); 2463 dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
2464 } else { 2464 } else {
2465 rc = dbJoin((dmtree_t *) dcp, leafno, newval); 2465 rc = dbJoin((dmtree_t *) dcp, leafno, newval);
2466 if (rc) 2466 if (rc)
2467 return rc; 2467 return rc;
2468 } 2468 }
2469 2469
2470 /* check if the root of the current dmap control page changed due 2470 /* check if the root of the current dmap control page changed due
2471 * to the update and if the current dmap control page is not at 2471 * to the update and if the current dmap control page is not at
2472 * the current top level (i.e. L0, L1, L2) of the map. if so (i.e. 2472 * the current top level (i.e. L0, L1, L2) of the map. if so (i.e.
2473 * root changed and this is not the top level), call this routine 2473 * root changed and this is not the top level), call this routine
2474 * again (recursion) for the next higher level of the mapping to 2474 * again (recursion) for the next higher level of the mapping to
2475 * reflect the change in root for the current dmap control page. 2475 * reflect the change in root for the current dmap control page.
2476 */ 2476 */
2477 if (dcp->stree[ROOT] != oldroot) { 2477 if (dcp->stree[ROOT] != oldroot) {
2478 /* are we below the top level of the map. if so, 2478 /* are we below the top level of the map. if so,
2479 * bubble the root up to the next higher level. 2479 * bubble the root up to the next higher level.
2480 */ 2480 */
2481 if (level < bmp->db_maxlevel) { 2481 if (level < bmp->db_maxlevel) {
2482 /* bubble up the new root of this dmap control page to 2482 /* bubble up the new root of this dmap control page to
2483 * the next level. 2483 * the next level.
2484 */ 2484 */
2485 if ((rc = 2485 if ((rc =
2486 dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc, 2486 dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
2487 level + 1))) { 2487 level + 1))) {
2488 /* something went wrong in bubbling up the new 2488 /* something went wrong in bubbling up the new
2489 * root value, so backout the changes to the 2489 * root value, so backout the changes to the
2490 * current dmap control page. 2490 * current dmap control page.
2491 */ 2491 */
2492 if (alloc) { 2492 if (alloc) {
2493 dbJoin((dmtree_t *) dcp, leafno, 2493 dbJoin((dmtree_t *) dcp, leafno,
2494 oldval); 2494 oldval);
2495 } else { 2495 } else {
2496 /* the dbJoin() above might have 2496 /* the dbJoin() above might have
2497 * caused a larger binary buddy system 2497 * caused a larger binary buddy system
2498 * to form and we may now be in the 2498 * to form and we may now be in the
2499 * middle of it. if this is the case, 2499 * middle of it. if this is the case,
2500 * back split the buddies. 2500 * back split the buddies.
2501 */ 2501 */
2502 if (dcp->stree[ti] == NOFREE) 2502 if (dcp->stree[ti] == NOFREE)
2503 dbBackSplit((dmtree_t *) 2503 dbBackSplit((dmtree_t *)
2504 dcp, leafno); 2504 dcp, leafno);
2505 dbSplit((dmtree_t *) dcp, leafno, 2505 dbSplit((dmtree_t *) dcp, leafno,
2506 dcp->budmin, oldval); 2506 dcp->budmin, oldval);
2507 } 2507 }
2508 2508
2509 /* release the buffer and return the error. 2509 /* release the buffer and return the error.
2510 */ 2510 */
2511 release_metapage(mp); 2511 release_metapage(mp);
2512 return (rc); 2512 return (rc);
2513 } 2513 }
2514 } else { 2514 } else {
2515 /* we're at the top level of the map. update 2515 /* we're at the top level of the map. update
2516 * the bmap control page to reflect the size 2516 * the bmap control page to reflect the size
2517 * of the maximum free buddy system. 2517 * of the maximum free buddy system.
2518 */ 2518 */
2519 assert(level == bmp->db_maxlevel); 2519 assert(level == bmp->db_maxlevel);
2520 if (bmp->db_maxfreebud != oldroot) { 2520 if (bmp->db_maxfreebud != oldroot) {
2521 jfs_error(bmp->db_ipbmap->i_sb, 2521 jfs_error(bmp->db_ipbmap->i_sb,
2522 "dbAdjCtl: the maximum free buddy is " 2522 "dbAdjCtl: the maximum free buddy is "
2523 "not the old root"); 2523 "not the old root");
2524 } 2524 }
2525 bmp->db_maxfreebud = dcp->stree[ROOT]; 2525 bmp->db_maxfreebud = dcp->stree[ROOT];
2526 } 2526 }
2527 } 2527 }
2528 2528
2529 /* write the buffer. 2529 /* write the buffer.
2530 */ 2530 */
2531 write_metapage(mp); 2531 write_metapage(mp);
2532 2532
2533 return (0); 2533 return (0);
2534 } 2534 }
2535 2535
2536 2536
2537 /* 2537 /*
2538 * NAME: dbSplit() 2538 * NAME: dbSplit()
2539 * 2539 *
2540 * FUNCTION: update the leaf of a dmtree with a new value, splitting 2540 * FUNCTION: update the leaf of a dmtree with a new value, splitting
2541 * the leaf from the binary buddy system of the dmtree's 2541 * the leaf from the binary buddy system of the dmtree's
2542 * leaves, as required. 2542 * leaves, as required.
2543 * 2543 *
2544 * PARAMETERS: 2544 * PARAMETERS:
2545 * tp - pointer to the tree containing the leaf. 2545 * tp - pointer to the tree containing the leaf.
2546 * leafno - the number of the leaf to be updated. 2546 * leafno - the number of the leaf to be updated.
2547 * splitsz - the size the binary buddy system starting at the leaf 2547 * splitsz - the size the binary buddy system starting at the leaf
2548 * must be split to, specified as the log2 number of blocks. 2548 * must be split to, specified as the log2 number of blocks.
2549 * newval - the new value for the leaf. 2549 * newval - the new value for the leaf.
2550 * 2550 *
2551 * RETURN VALUES: none 2551 * RETURN VALUES: none
2552 * 2552 *
2553 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2553 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2554 */ 2554 */
2555 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) 2555 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2556 { 2556 {
2557 int budsz; 2557 int budsz;
2558 int cursz; 2558 int cursz;
2559 s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); 2559 s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2560 2560
2561 /* check if the leaf needs to be split. 2561 /* check if the leaf needs to be split.
2562 */ 2562 */
2563 if (leaf[leafno] > tp->dmt_budmin) { 2563 if (leaf[leafno] > tp->dmt_budmin) {
2564 /* the split occurs by cutting the buddy system in half 2564 /* the split occurs by cutting the buddy system in half
2565 * at the specified leaf until we reach the specified 2565 * at the specified leaf until we reach the specified
2566 * size. pick up the starting split size (current size 2566 * size. pick up the starting split size (current size
2567 * - 1 in l2) and the corresponding buddy size. 2567 * - 1 in l2) and the corresponding buddy size.
2568 */ 2568 */
2569 cursz = leaf[leafno] - 1; 2569 cursz = leaf[leafno] - 1;
2570 budsz = BUDSIZE(cursz, tp->dmt_budmin); 2570 budsz = BUDSIZE(cursz, tp->dmt_budmin);
2571 2571
2572 /* split until we reach the specified size. 2572 /* split until we reach the specified size.
2573 */ 2573 */
2574 while (cursz >= splitsz) { 2574 while (cursz >= splitsz) {
2575 /* update the buddy's leaf with its new value. 2575 /* update the buddy's leaf with its new value.
2576 */ 2576 */
2577 dbAdjTree(tp, leafno ^ budsz, cursz); 2577 dbAdjTree(tp, leafno ^ budsz, cursz);
2578 2578
2579 /* on to the next size and buddy. 2579 /* on to the next size and buddy.
2580 */ 2580 */
2581 cursz -= 1; 2581 cursz -= 1;
2582 budsz >>= 1; 2582 budsz >>= 1;
2583 } 2583 }
2584 } 2584 }
2585 2585
2586 /* adjust the dmap tree to reflect the specified leaf's new 2586 /* adjust the dmap tree to reflect the specified leaf's new
2587 * value. 2587 * value.
2588 */ 2588 */
2589 dbAdjTree(tp, leafno, newval); 2589 dbAdjTree(tp, leafno, newval);
2590 } 2590 }
2591 2591
2592 2592
2593 /* 2593 /*
2594 * NAME: dbBackSplit() 2594 * NAME: dbBackSplit()
2595 * 2595 *
2596 * FUNCTION: back split the binary buddy system of dmtree leaves 2596 * FUNCTION: back split the binary buddy system of dmtree leaves
2597 * that hold a specified leaf until the specified leaf 2597 * that hold a specified leaf until the specified leaf
2598 * starts its own binary buddy system. 2598 * starts its own binary buddy system.
2599 * 2599 *
2600 * the allocators typically perform allocations at the start 2600 * the allocators typically perform allocations at the start
2601 * of binary buddy systems and dbSplit() is used to accomplish 2601 * of binary buddy systems and dbSplit() is used to accomplish
2602 * any required splits. in some cases, however, allocation 2602 * any required splits. in some cases, however, allocation
2603 * may occur in the middle of a binary system and requires a 2603 * may occur in the middle of a binary system and requires a
2604 * back split, with the split proceeding out from the middle of 2604 * back split, with the split proceeding out from the middle of
2605 * the system (less efficient) rather than the start of the 2605 * the system (less efficient) rather than the start of the
2606 * system (more efficient). the cases in which a back split 2606 * system (more efficient). the cases in which a back split
2607 * is required are rare and are limited to the first allocation 2607 * is required are rare and are limited to the first allocation
2608 * within an allocation group which is a part (not first part) 2608 * within an allocation group which is a part (not first part)
2609 * of a larger binary buddy system and a few exception cases 2609 * of a larger binary buddy system and a few exception cases
2610 * in which a previous join operation must be backed out. 2610 * in which a previous join operation must be backed out.
2611 * 2611 *
2612 * PARAMETERS: 2612 * PARAMETERS:
2613 * tp - pointer to the tree containing the leaf. 2613 * tp - pointer to the tree containing the leaf.
2614 * leafno - the number of the leaf to be updated. 2614 * leafno - the number of the leaf to be updated.
2615 * 2615 *
2616 * RETURN VALUES: none 2616 * RETURN VALUES: none
2617 * 2617 *
2618 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2618 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2619 */ 2619 */
/* Back split the buddy system containing 'leafno' until leafno itself
 * heads a buddy system.  Returns 0 on success or -EIO if the on-disk
 * map is inconsistent (buddy search runs off the tree, or the final
 * leaf value disagrees with the computed size).
 */
static int dbBackSplit(dmtree_t * tp, int leafno)
{
	int budsz, bud, w, bsz, size;
	int cursz;
	/* leaves live at a fixed offset within the serialized tree */
	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);

	/* leaf should be part (not first part) of a binary
	 * buddy system.
	 */
	assert(leaf[leafno] == NOFREE);

	/* the back split is accomplished by iteratively finding the leaf
	 * that starts the buddy system that contains the specified leaf and
	 * splitting that system in two.  this iteration continues until
	 * the specified leaf becomes the start of a buddy system.
	 *
	 * determine maximum possible l2 size for the specified leaf.
	 */
	size =
	    LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
		      tp->dmt_budmin);

	/* determine the number of leaves covered by this size.  this
	 * is the buddy size that we will start with as we search for
	 * the buddy system that contains the specified leaf.
	 */
	budsz = BUDSIZE(size, tp->dmt_budmin);

	/* back split.
	 */
	while (leaf[leafno] == NOFREE) {
		/* find the leftmost buddy leaf.
		 *
		 * NOTE: the update clause reads 'bud', which is assigned
		 * inside the body; it only runs after the first pass, so
		 * 'bud' is always set before use.
		 */
		for (w = leafno, bsz = budsz;; bsz <<= 1,
		     w = (w < bud) ? w : bud) {
			/* searching past the leaf count means the map
			 * is corrupt — bail out rather than loop forever.
			 */
			if (bsz >= le32_to_cpu(tp->dmt_nleafs)) {
				jfs_err("JFS: block map error in dbBackSplit");
				return -EIO;
			}

			/* determine the buddy.
			 */
			bud = w ^ bsz;

			/* check if this buddy is the start of the system.
			 */
			if (leaf[bud] != NOFREE) {
				/* split the leaf at the start of the
				 * system in two.
				 */
				cursz = leaf[bud] - 1;
				dbSplit(tp, bud, cursz, cursz);
				break;
			}
		}
	}

	/* after splitting, leafno must head a system of exactly the
	 * size computed above; anything else indicates corruption.
	 */
	if (leaf[leafno] != size) {
		jfs_err("JFS: wrong leaf value in dbBackSplit");
		return -EIO;
	}
	return 0;
}
2683 2683
2684 2684
2685 /* 2685 /*
2686 * NAME: dbJoin() 2686 * NAME: dbJoin()
2687 * 2687 *
2688 * FUNCTION: update the leaf of a dmtree with a new value, joining 2688 * FUNCTION: update the leaf of a dmtree with a new value, joining
2689 * the leaf with other leaves of the dmtree into a multi-leaf 2689 * the leaf with other leaves of the dmtree into a multi-leaf
2690 * binary buddy system, as required. 2690 * binary buddy system, as required.
2691 * 2691 *
2692 * PARAMETERS: 2692 * PARAMETERS:
2693 * tp - pointer to the tree containing the leaf. 2693 * tp - pointer to the tree containing the leaf.
2694 * leafno - the number of the leaf to be updated. 2694 * leafno - the number of the leaf to be updated.
2695 * newval - the new value for the leaf. 2695 * newval - the new value for the leaf.
2696 * 2696 *
2697 * RETURN VALUES: none 2697 * RETURN VALUES: none
2698 */ 2698 */
2699 static int dbJoin(dmtree_t * tp, int leafno, int newval) 2699 static int dbJoin(dmtree_t * tp, int leafno, int newval)
2700 { 2700 {
2701 int budsz, buddy; 2701 int budsz, buddy;
2702 s8 *leaf; 2702 s8 *leaf;
2703 2703
2704 /* can the new leaf value require a join with other leaves ? 2704 /* can the new leaf value require a join with other leaves ?
2705 */ 2705 */
2706 if (newval >= tp->dmt_budmin) { 2706 if (newval >= tp->dmt_budmin) {
2707 /* pickup a pointer to the leaves of the tree. 2707 /* pickup a pointer to the leaves of the tree.
2708 */ 2708 */
2709 leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); 2709 leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2710 2710
2711 /* try to join the specified leaf into a large binary 2711 /* try to join the specified leaf into a large binary
2712 * buddy system. the join proceeds by attempting to join 2712 * buddy system. the join proceeds by attempting to join
2713 * the specified leafno with its buddy (leaf) at new value. 2713 * the specified leafno with its buddy (leaf) at new value.
2714 * if the join occurs, we attempt to join the left leaf 2714 * if the join occurs, we attempt to join the left leaf
2715 * of the joined buddies with its buddy at new value + 1. 2715 * of the joined buddies with its buddy at new value + 1.
2716 * we continue to join until we find a buddy that cannot be 2716 * we continue to join until we find a buddy that cannot be
2717 * joined (does not have a value equal to the size of the 2717 * joined (does not have a value equal to the size of the
2718 * last join) or until all leaves have been joined into a 2718 * last join) or until all leaves have been joined into a
2719 * single system. 2719 * single system.
2720 * 2720 *
2721 * get the buddy size (number of words covered) of 2721 * get the buddy size (number of words covered) of
2722 * the new value. 2722 * the new value.
2723 */ 2723 */
2724 budsz = BUDSIZE(newval, tp->dmt_budmin); 2724 budsz = BUDSIZE(newval, tp->dmt_budmin);
2725 2725
2726 /* try to join. 2726 /* try to join.
2727 */ 2727 */
2728 while (budsz < le32_to_cpu(tp->dmt_nleafs)) { 2728 while (budsz < le32_to_cpu(tp->dmt_nleafs)) {
2729 /* get the buddy leaf. 2729 /* get the buddy leaf.
2730 */ 2730 */
2731 buddy = leafno ^ budsz; 2731 buddy = leafno ^ budsz;
2732 2732
2733 /* if the leaf's new value is greater than its 2733 /* if the leaf's new value is greater than its
2734 * buddy's value, we join no more. 2734 * buddy's value, we join no more.
2735 */ 2735 */
2736 if (newval > leaf[buddy]) 2736 if (newval > leaf[buddy])
2737 break; 2737 break;
2738 2738
2739 /* It shouldn't be less */ 2739 /* It shouldn't be less */
2740 if (newval < leaf[buddy]) 2740 if (newval < leaf[buddy])
2741 return -EIO; 2741 return -EIO;
2742 2742
2743 /* check which (leafno or buddy) is the left buddy. 2743 /* check which (leafno or buddy) is the left buddy.
2744 * the left buddy gets to claim the blocks resulting 2744 * the left buddy gets to claim the blocks resulting
2745 * from the join while the right gets to claim none. 2745 * from the join while the right gets to claim none.
2746 * the left buddy is also eligable to participate in 2746 * the left buddy is also eligable to participate in
2747 * a join at the next higher level while the right 2747 * a join at the next higher level while the right
2748 * is not. 2748 * is not.
2749 * 2749 *
2750 */ 2750 */
2751 if (leafno < buddy) { 2751 if (leafno < buddy) {
2752 /* leafno is the left buddy. 2752 /* leafno is the left buddy.
2753 */ 2753 */
2754 dbAdjTree(tp, buddy, NOFREE); 2754 dbAdjTree(tp, buddy, NOFREE);
2755 } else { 2755 } else {
2756 /* buddy is the left buddy and becomes 2756 /* buddy is the left buddy and becomes
2757 * leafno. 2757 * leafno.
2758 */ 2758 */
2759 dbAdjTree(tp, leafno, NOFREE); 2759 dbAdjTree(tp, leafno, NOFREE);
2760 leafno = buddy; 2760 leafno = buddy;
2761 } 2761 }
2762 2762
2763 /* on to try the next join. 2763 /* on to try the next join.
2764 */ 2764 */
2765 newval += 1; 2765 newval += 1;
2766 budsz <<= 1; 2766 budsz <<= 1;
2767 } 2767 }
2768 } 2768 }
2769 2769
2770 /* update the leaf value. 2770 /* update the leaf value.
2771 */ 2771 */
2772 dbAdjTree(tp, leafno, newval); 2772 dbAdjTree(tp, leafno, newval);
2773 2773
2774 return 0; 2774 return 0;
2775 } 2775 }
2776 2776
2777 2777
2778 /* 2778 /*
2779 * NAME: dbAdjTree() 2779 * NAME: dbAdjTree()
2780 * 2780 *
2781 * FUNCTION: update a leaf of a dmtree with a new value, adjusting 2781 * FUNCTION: update a leaf of a dmtree with a new value, adjusting
2782 * the dmtree, as required, to reflect the new leaf value. 2782 * the dmtree, as required, to reflect the new leaf value.
2783 * the combination of any buddies must already be done before 2783 * the combination of any buddies must already be done before
2784 * this is called. 2784 * this is called.
2785 * 2785 *
2786 * PARAMETERS: 2786 * PARAMETERS:
2787 * tp - pointer to the tree to be adjusted. 2787 * tp - pointer to the tree to be adjusted.
2788 * leafno - the number of the leaf to be updated. 2788 * leafno - the number of the leaf to be updated.
2789 * newval - the new value for the leaf. 2789 * newval - the new value for the leaf.
2790 * 2790 *
2791 * RETURN VALUES: none 2791 * RETURN VALUES: none
2792 */ 2792 */
2793 static void dbAdjTree(dmtree_t * tp, int leafno, int newval) 2793 static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2794 { 2794 {
2795 int lp, pp, k; 2795 int lp, pp, k;
2796 int max; 2796 int max;
2797 2797
2798 /* pick up the index of the leaf for this leafno. 2798 /* pick up the index of the leaf for this leafno.
2799 */ 2799 */
2800 lp = leafno + le32_to_cpu(tp->dmt_leafidx); 2800 lp = leafno + le32_to_cpu(tp->dmt_leafidx);
2801 2801
2802 /* is the current value the same as the old value ? if so, 2802 /* is the current value the same as the old value ? if so,
2803 * there is nothing to do. 2803 * there is nothing to do.
2804 */ 2804 */
2805 if (tp->dmt_stree[lp] == newval) 2805 if (tp->dmt_stree[lp] == newval)
2806 return; 2806 return;
2807 2807
2808 /* set the new value. 2808 /* set the new value.
2809 */ 2809 */
2810 tp->dmt_stree[lp] = newval; 2810 tp->dmt_stree[lp] = newval;
2811 2811
2812 /* bubble the new value up the tree as required. 2812 /* bubble the new value up the tree as required.
2813 */ 2813 */
2814 for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { 2814 for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
2815 /* get the index of the first leaf of the 4 leaf 2815 /* get the index of the first leaf of the 4 leaf
2816 * group containing the specified leaf (leafno). 2816 * group containing the specified leaf (leafno).
2817 */ 2817 */
2818 lp = ((lp - 1) & ~0x03) + 1; 2818 lp = ((lp - 1) & ~0x03) + 1;
2819 2819
2820 /* get the index of the parent of this 4 leaf group. 2820 /* get the index of the parent of this 4 leaf group.
2821 */ 2821 */
2822 pp = (lp - 1) >> 2; 2822 pp = (lp - 1) >> 2;
2823 2823
2824 /* determine the maximum of the 4 leaves. 2824 /* determine the maximum of the 4 leaves.
2825 */ 2825 */
2826 max = TREEMAX(&tp->dmt_stree[lp]); 2826 max = TREEMAX(&tp->dmt_stree[lp]);
2827 2827
2828 /* if the maximum of the 4 is the same as the 2828 /* if the maximum of the 4 is the same as the
2829 * parent's value, we're done. 2829 * parent's value, we're done.
2830 */ 2830 */
2831 if (tp->dmt_stree[pp] == max) 2831 if (tp->dmt_stree[pp] == max)
2832 break; 2832 break;
2833 2833
2834 /* parent gets new value. 2834 /* parent gets new value.
2835 */ 2835 */
2836 tp->dmt_stree[pp] = max; 2836 tp->dmt_stree[pp] = max;
2837 2837
2838 /* parent becomes leaf for next go-round. 2838 /* parent becomes leaf for next go-round.
2839 */ 2839 */
2840 lp = pp; 2840 lp = pp;
2841 } 2841 }
2842 } 2842 }
2843 2843
2844 2844
2845 /* 2845 /*
2846 * NAME: dbFindLeaf() 2846 * NAME: dbFindLeaf()
2847 * 2847 *
2848 * FUNCTION: search a dmtree_t for sufficient free blocks, returning 2848 * FUNCTION: search a dmtree_t for sufficient free blocks, returning
2849 * the index of a leaf describing the free blocks if 2849 * the index of a leaf describing the free blocks if
2850 * sufficient free blocks are found. 2850 * sufficient free blocks are found.
2851 * 2851 *
2852 * the search starts at the top of the dmtree_t tree and 2852 * the search starts at the top of the dmtree_t tree and
2853 * proceeds down the tree to the leftmost leaf with sufficient 2853 * proceeds down the tree to the leftmost leaf with sufficient
2854 * free space. 2854 * free space.
2855 * 2855 *
2856 * PARAMETERS: 2856 * PARAMETERS:
2857 * tp - pointer to the tree to be searched. 2857 * tp - pointer to the tree to be searched.
2858 * l2nb - log2 number of free blocks to search for. 2858 * l2nb - log2 number of free blocks to search for.
2859 * leafidx - return pointer to be set to the index of the leaf 2859 * leafidx - return pointer to be set to the index of the leaf
2860 * describing at least l2nb free blocks if sufficient 2860 * describing at least l2nb free blocks if sufficient
2861 * free blocks are found. 2861 * free blocks are found.
2862 * 2862 *
2863 * RETURN VALUES: 2863 * RETURN VALUES:
2864 * 0 - success 2864 * 0 - success
2865 * -ENOSPC - insufficient free blocks. 2865 * -ENOSPC - insufficient free blocks.
2866 */ 2866 */
2867 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) 2867 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2868 { 2868 {
2869 int ti, n = 0, k, x = 0; 2869 int ti, n = 0, k, x = 0;
2870 2870
2871 /* first check the root of the tree to see if there is 2871 /* first check the root of the tree to see if there is
2872 * sufficient free space. 2872 * sufficient free space.
2873 */ 2873 */
2874 if (l2nb > tp->dmt_stree[ROOT]) 2874 if (l2nb > tp->dmt_stree[ROOT])
2875 return -ENOSPC; 2875 return -ENOSPC;
2876 2876
2877 /* sufficient free space available. now search down the tree 2877 /* sufficient free space available. now search down the tree
2878 * starting at the next level for the leftmost leaf that 2878 * starting at the next level for the leftmost leaf that
2879 * describes sufficient free space. 2879 * describes sufficient free space.
2880 */ 2880 */
2881 for (k = le32_to_cpu(tp->dmt_height), ti = 1; 2881 for (k = le32_to_cpu(tp->dmt_height), ti = 1;
2882 k > 0; k--, ti = ((ti + n) << 2) + 1) { 2882 k > 0; k--, ti = ((ti + n) << 2) + 1) {
2883 /* search the four nodes at this level, starting from 2883 /* search the four nodes at this level, starting from
2884 * the left. 2884 * the left.
2885 */ 2885 */
2886 for (x = ti, n = 0; n < 4; n++) { 2886 for (x = ti, n = 0; n < 4; n++) {
2887 /* sufficient free space found. move to the next 2887 /* sufficient free space found. move to the next
2888 * level (or quit if this is the last level). 2888 * level (or quit if this is the last level).
2889 */ 2889 */
2890 if (l2nb <= tp->dmt_stree[x + n]) 2890 if (l2nb <= tp->dmt_stree[x + n])
2891 break; 2891 break;
2892 } 2892 }
2893 2893
2894 /* better have found something since the higher 2894 /* better have found something since the higher
2895 * levels of the tree said it was here. 2895 * levels of the tree said it was here.
2896 */ 2896 */
2897 assert(n < 4); 2897 assert(n < 4);
2898 } 2898 }
2899 2899
2900 /* set the return to the leftmost leaf describing sufficient 2900 /* set the return to the leftmost leaf describing sufficient
2901 * free space. 2901 * free space.
2902 */ 2902 */
2903 *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx); 2903 *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx);
2904 2904
2905 return (0); 2905 return (0);
2906 } 2906 }
2907 2907
2908 2908
2909 /* 2909 /*
2910 * NAME: dbFindBits() 2910 * NAME: dbFindBits()
2911 * 2911 *
2912 * FUNCTION: find a specified number of binary buddy free bits within a 2912 * FUNCTION: find a specified number of binary buddy free bits within a
2913 * dmap bitmap word value. 2913 * dmap bitmap word value.
2914 * 2914 *
2915 * this routine searches the bitmap value for (1 << l2nb) free 2915 * this routine searches the bitmap value for (1 << l2nb) free
2916 * bits at (1 << l2nb) alignments within the value. 2916 * bits at (1 << l2nb) alignments within the value.
2917 * 2917 *
2918 * PARAMETERS: 2918 * PARAMETERS:
2919 * word - dmap bitmap word value. 2919 * word - dmap bitmap word value.
2920 * l2nb - number of free bits specified as a log2 number. 2920 * l2nb - number of free bits specified as a log2 number.
2921 * 2921 *
2922 * RETURN VALUES: 2922 * RETURN VALUES:
2923 * starting bit number of free bits. 2923 * starting bit number of free bits.
2924 */ 2924 */
2925 static int dbFindBits(u32 word, int l2nb) 2925 static int dbFindBits(u32 word, int l2nb)
2926 { 2926 {
2927 int bitno, nb; 2927 int bitno, nb;
2928 u32 mask; 2928 u32 mask;
2929 2929
2930 /* get the number of bits. 2930 /* get the number of bits.
2931 */ 2931 */
2932 nb = 1 << l2nb; 2932 nb = 1 << l2nb;
2933 assert(nb <= DBWORD); 2933 assert(nb <= DBWORD);
2934 2934
2935 /* complement the word so we can use a mask (i.e. 0s represent 2935 /* complement the word so we can use a mask (i.e. 0s represent
2936 * free bits) and compute the mask. 2936 * free bits) and compute the mask.
2937 */ 2937 */
2938 word = ~word; 2938 word = ~word;
2939 mask = ONES << (DBWORD - nb); 2939 mask = ONES << (DBWORD - nb);
2940 2940
2941 /* scan the word for nb free bits at nb alignments. 2941 /* scan the word for nb free bits at nb alignments.
2942 */ 2942 */
2943 for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) { 2943 for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) {
2944 if ((mask & word) == mask) 2944 if ((mask & word) == mask)
2945 break; 2945 break;
2946 } 2946 }
2947 2947
2948 ASSERT(bitno < 32); 2948 ASSERT(bitno < 32);
2949 2949
2950 /* return the bit number. 2950 /* return the bit number.
2951 */ 2951 */
2952 return (bitno); 2952 return (bitno);
2953 } 2953 }
2954 2954
2955 2955
2956 /* 2956 /*
2957 * NAME: dbMaxBud(u8 *cp) 2957 * NAME: dbMaxBud(u8 *cp)
2958 * 2958 *
2959 * FUNCTION: determine the largest binary buddy string of free 2959 * FUNCTION: determine the largest binary buddy string of free
2960 * bits within 32-bits of the map. 2960 * bits within 32-bits of the map.
2961 * 2961 *
2962 * PARAMETERS: 2962 * PARAMETERS:
2963 * cp - pointer to the 32-bit value. 2963 * cp - pointer to the 32-bit value.
2964 * 2964 *
2965 * RETURN VALUES: 2965 * RETURN VALUES:
2966 * largest binary buddy of free bits within a dmap word. 2966 * largest binary buddy of free bits within a dmap word.
2967 */ 2967 */
2968 static int dbMaxBud(u8 * cp) 2968 static int dbMaxBud(u8 * cp)
2969 { 2969 {
2970 signed char tmp1, tmp2; 2970 signed char tmp1, tmp2;
2971 2971
2972 /* check if the wmap word is all free. if so, the 2972 /* check if the wmap word is all free. if so, the
2973 * free buddy size is BUDMIN. 2973 * free buddy size is BUDMIN.
2974 */ 2974 */
2975 if (*((uint *) cp) == 0) 2975 if (*((uint *) cp) == 0)
2976 return (BUDMIN); 2976 return (BUDMIN);
2977 2977
2978 /* check if the wmap word is half free. if so, the 2978 /* check if the wmap word is half free. if so, the
2979 * free buddy size is BUDMIN-1. 2979 * free buddy size is BUDMIN-1.
2980 */ 2980 */
2981 if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0) 2981 if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0)
2982 return (BUDMIN - 1); 2982 return (BUDMIN - 1);
2983 2983
2984 /* not all free or half free. determine the free buddy 2984 /* not all free or half free. determine the free buddy
2985 * size thru table lookup using quarters of the wmap word. 2985 * size thru table lookup using quarters of the wmap word.
2986 */ 2986 */
2987 tmp1 = max(budtab[cp[2]], budtab[cp[3]]); 2987 tmp1 = max(budtab[cp[2]], budtab[cp[3]]);
2988 tmp2 = max(budtab[cp[0]], budtab[cp[1]]); 2988 tmp2 = max(budtab[cp[0]], budtab[cp[1]]);
2989 return (max(tmp1, tmp2)); 2989 return (max(tmp1, tmp2));
2990 } 2990 }
2991 2991
2992 2992
2993 /* 2993 /*
2994 * NAME: cnttz(uint word) 2994 * NAME: cnttz(uint word)
2995 * 2995 *
2996 * FUNCTION: determine the number of trailing zeros within a 32-bit 2996 * FUNCTION: determine the number of trailing zeros within a 32-bit
2997 * value. 2997 * value.
2998 * 2998 *
2999 * PARAMETERS: 2999 * PARAMETERS:
3000 * value - 32-bit value to be examined. 3000 * value - 32-bit value to be examined.
3001 * 3001 *
3002 * RETURN VALUES: 3002 * RETURN VALUES:
3003 * count of trailing zeros 3003 * count of trailing zeros
3004 */ 3004 */
3005 static int cnttz(u32 word) 3005 static int cnttz(u32 word)
3006 { 3006 {
3007 int n; 3007 int n;
3008 3008
3009 for (n = 0; n < 32; n++, word >>= 1) { 3009 for (n = 0; n < 32; n++, word >>= 1) {
3010 if (word & 0x01) 3010 if (word & 0x01)
3011 break; 3011 break;
3012 } 3012 }
3013 3013
3014 return (n); 3014 return (n);
3015 } 3015 }
3016 3016
3017 3017
3018 /* 3018 /*
3019 * NAME: cntlz(u32 value) 3019 * NAME: cntlz(u32 value)
3020 * 3020 *
3021 * FUNCTION: determine the number of leading zeros within a 32-bit 3021 * FUNCTION: determine the number of leading zeros within a 32-bit
3022 * value. 3022 * value.
3023 * 3023 *
3024 * PARAMETERS: 3024 * PARAMETERS:
3025 * value - 32-bit value to be examined. 3025 * value - 32-bit value to be examined.
3026 * 3026 *
3027 * RETURN VALUES: 3027 * RETURN VALUES:
3028 * count of leading zeros 3028 * count of leading zeros
3029 */ 3029 */
3030 static int cntlz(u32 value) 3030 static int cntlz(u32 value)
3031 { 3031 {
3032 int n; 3032 int n;
3033 3033
3034 for (n = 0; n < 32; n++, value <<= 1) { 3034 for (n = 0; n < 32; n++, value <<= 1) {
3035 if (value & HIGHORDER) 3035 if (value & HIGHORDER)
3036 break; 3036 break;
3037 } 3037 }
3038 return (n); 3038 return (n);
3039 } 3039 }
3040 3040
3041 3041
3042 /* 3042 /*
3043 * NAME: blkstol2(s64 nb) 3043 * NAME: blkstol2(s64 nb)
3044 * 3044 *
3045 * FUNCTION: convert a block count to its log2 value. if the block 3045 * FUNCTION: convert a block count to its log2 value. if the block
3046 * count is not a l2 multiple, it is rounded up to the next 3046 * count is not a l2 multiple, it is rounded up to the next
3047 * larger l2 multiple. 3047 * larger l2 multiple.
3048 * 3048 *
3049 * PARAMETERS: 3049 * PARAMETERS:
3050 * nb - number of blocks 3050 * nb - number of blocks
3051 * 3051 *
3052 * RETURN VALUES: 3052 * RETURN VALUES:
3053 * log2 number of blocks 3053 * log2 number of blocks
3054 */ 3054 */
3055 static int blkstol2(s64 nb) 3055 static int blkstol2(s64 nb)
3056 { 3056 {
3057 int l2nb; 3057 int l2nb;
3058 s64 mask; /* meant to be signed */ 3058 s64 mask; /* meant to be signed */
3059 3059
3060 mask = (s64) 1 << (64 - 1); 3060 mask = (s64) 1 << (64 - 1);
3061 3061
3062 /* count the leading bits. 3062 /* count the leading bits.
3063 */ 3063 */
3064 for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) { 3064 for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
3065 /* leading bit found. 3065 /* leading bit found.
3066 */ 3066 */
3067 if (nb & mask) { 3067 if (nb & mask) {
3068 /* determine the l2 value. 3068 /* determine the l2 value.
3069 */ 3069 */
3070 l2nb = (64 - 1) - l2nb; 3070 l2nb = (64 - 1) - l2nb;
3071 3071
3072 /* check if we need to round up. 3072 /* check if we need to round up.
3073 */ 3073 */
3074 if (~mask & nb) 3074 if (~mask & nb)
3075 l2nb++; 3075 l2nb++;
3076 3076
3077 return (l2nb); 3077 return (l2nb);
3078 } 3078 }
3079 } 3079 }
3080 assert(0); 3080 assert(0);
3081 return 0; /* fix compiler warning */ 3081 return 0; /* fix compiler warning */
3082 } 3082 }
3083 3083
3084 3084
3085 /* 3085 /*
3086 * NAME: dbAllocBottomUp() 3086 * NAME: dbAllocBottomUp()
3087 * 3087 *
3088 * FUNCTION: alloc the specified block range from the working block 3088 * FUNCTION: alloc the specified block range from the working block
3089 * allocation map. 3089 * allocation map.
3090 * 3090 *
3091 * the blocks will be alloc from the working map one dmap 3091 * the blocks will be alloc from the working map one dmap
3092 * at a time. 3092 * at a time.
3093 * 3093 *
3094 * PARAMETERS: 3094 * PARAMETERS:
3095 * ip - pointer to in-core inode; 3095 * ip - pointer to in-core inode;
3096 * blkno - starting block number to be freed. 3096 * blkno - starting block number to be freed.
3097 * nblocks - number of blocks to be freed. 3097 * nblocks - number of blocks to be freed.
3098 * 3098 *
3099 * RETURN VALUES: 3099 * RETURN VALUES:
3100 * 0 - success 3100 * 0 - success
3101 * -EIO - i/o error 3101 * -EIO - i/o error
3102 */ 3102 */
int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
{
	struct metapage *mp;
	struct dmap *dp;
	int nb, rc;
	s64 lblkno, rem;
	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;

	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);

	/* block to be allocated better be within the mapsize. */
	ASSERT(nblocks <= bmp->db_mapsize - blkno);

	/*
	 * allocate the blocks a dmap at a time.
	 */
	mp = NULL;
	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
		/* release previous dmap if any */
		if (mp) {
			write_metapage(mp);
		}

		/* get the buffer for the current dmap. */
		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
		if (mp == NULL) {
			IREAD_UNLOCK(ipbmap);
			return -EIO;
		}
		dp = (struct dmap *) mp->data;

		/* determine the number of blocks to be allocated from
		 * this dmap: the remainder, capped at the distance to the
		 * end of the current dmap.
		 */
		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));

		/* allocate the blocks; on failure release the current
		 * metapage and drop the map lock before returning.
		 */
		if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
			release_metapage(mp);
			IREAD_UNLOCK(ipbmap);
			return (rc);
		}
	}

	/* write the last buffer. */
	write_metapage(mp);

	IREAD_UNLOCK(ipbmap);

	return (0);
}
3156 3156
3157 3157
/*
 * NAME:	dbAllocDmapBU()
 *
 * FUNCTION:	allocate a block range within a single dmap for
 *		dbAllocBottomUp(), updating the dmap's working bitmap,
 *		free count, and summary tree, and bubbling any root
 *		change up through the dmap control pages.
 *
 * PARAMETERS:
 *	bmp	- pointer to bmap descriptor
 *	dp	- pointer to dmap to allocate from
 *	blkno	- starting block number of the range
 *	nblocks	- number of blocks in the range
 *
 * RETURN VALUES:
 *	0	- success
 *	error code propagated from dbAdjCtl() otherwise
 */
static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
			 int nblocks)
{
	int rc;
	int dbitno, word, rembits, nb, nwords, wbitno, agno;
	s8 oldroot, *leaf;
	struct dmaptree *tp = (struct dmaptree *) & dp->tree;

	/* save the current value of the root (i.e. maximum free string)
	 * of the dmap tree.
	 */
	oldroot = tp->stree[ROOT];

	/* pick up a pointer to the leaves of the dmap tree */
	leaf = tp->stree + LEAFIND;

	/* determine the bit number and word within the dmap of the
	 * starting block.
	 */
	dbitno = blkno & (BPERDMAP - 1);
	word = dbitno >> L2DBWORD;

	/* block range better be within the dmap */
	assert(dbitno + nblocks <= BPERDMAP);

	/* allocate the bits of the dmap's words corresponding to the block
	 * range. not all bits of the first and last words may be contained
	 * within the block range.  if this is the case, we'll work against
	 * those words (i.e. partial first and/or last) on an individual basis
	 * (a single pass), allocating the bits of interest by hand and
	 * updating the leaf corresponding to the dmap word. a single pass
	 * will be used for all dmap words fully contained within the
	 * specified range.  within this pass, the bits of all fully contained
	 * dmap words will be marked as free in a single shot and the leaves
	 * will be updated. a single leaf may describe the free space of
	 * multiple dmap words, so we may update only a subset of the actual
	 * leaves corresponding to the dmap words of the block range.
	 */
	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
		/* determine the bit number within the word and
		 * the number of bits within the word.
		 */
		wbitno = dbitno & (DBWORD - 1);
		nb = min(rembits, DBWORD - wbitno);

		/* check if only part of a word is to be allocated.
		 */
		if (nb < DBWORD) {
			/* allocate (set to 1) the appropriate bits within
			 * this dmap word.
			 */
			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
						      >> wbitno);

			word++;
		} else {
			/* one or more dmap words are fully contained
			 * within the block range.  determine how many
			 * words and allocate (set to 1) the bits of these
			 * words.
			 */
			nwords = rembits >> L2DBWORD;
			memset(&dp->wmap[word], (int) ONES, nwords * 4);

			/* determine how many bits */
			nb = nwords << L2DBWORD;
			word += nwords;
		}
	}

	/* update the free count for this dmap */
	le32_add_cpu(&dp->nfree, -nblocks);

	/* reconstruct summary tree */
	dbInitDmapTree(dp);

	BMAP_LOCK(bmp);

	/* if this allocation group is completely free,
	 * update the highest active allocation group number
	 * if this allocation group is the new max.
	 */
	agno = blkno >> bmp->db_agl2size;
	if (agno > bmp->db_maxag)
		bmp->db_maxag = agno;

	/* update the free count for the allocation group and map */
	bmp->db_agfree[agno] -= nblocks;
	bmp->db_nfree -= nblocks;

	BMAP_UNLOCK(bmp);

	/* if the root has not changed, done. */
	if (tp->stree[ROOT] == oldroot)
		return (0);

	/* root changed. bubble the change up to the dmap control pages.
	 * if the adjustment of the upper level control pages fails,
	 * backout the bit allocation (thus making everything consistent).
	 */
	if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
		dbFreeBits(bmp, dp, blkno, nblocks);

	return (rc);
}
3263 3263
3264 3264
3265 /* 3265 /*
3266 * NAME: dbExtendFS() 3266 * NAME: dbExtendFS()
3267 * 3267 *
3268 * FUNCTION: extend bmap from blkno for nblocks; 3268 * FUNCTION: extend bmap from blkno for nblocks;
3269 * dbExtendFS() updates bmap ready for dbAllocBottomUp(); 3269 * dbExtendFS() updates bmap ready for dbAllocBottomUp();
3270 * 3270 *
3271 * L2 3271 * L2
3272 * | 3272 * |
3273 * L1---------------------------------L1 3273 * L1---------------------------------L1
3274 * | | 3274 * | |
3275 * L0---------L0---------L0 L0---------L0---------L0 3275 * L0---------L0---------L0 L0---------L0---------L0
3276 * | | | | | | 3276 * | | | | | |
3277 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; 3277 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm;
3278 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm 3278 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
3279 * 3279 *
3280 * <---old---><----------------------------extend-----------------------> 3280 * <---old---><----------------------------extend----------------------->
3281 */ 3281 */
int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
{
	struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
	int nbperpage = sbi->nbperpage;
	int i, i0 = true, j, j0 = true, k, n;
	s64 newsize;
	s64 p;
	struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
	struct dmapctl *l2dcp, *l1dcp, *l0dcp;
	struct dmap *dp;
	s8 *l0leaf, *l1leaf, *l2leaf;
	struct bmap *bmp = sbi->bmap;
	int agno, l2agsize, oldl2agsize;
	s64 ag_rem;

	newsize = blkno + nblocks;

	jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
		 (long long) blkno, (long long) nblocks, (long long) newsize);

	/*
	 * initialize bmap control page.
	 *
	 * all the data in bmap control page should exclude
	 * the mkfs hidden dmap page.
	 */

	/* update mapsize */
	bmp->db_mapsize = newsize;
	bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);

	/* compute new AG size */
	l2agsize = dbGetL2AGSize(newsize);
	oldl2agsize = bmp->db_agl2size;

	bmp->db_agl2size = l2agsize;
	bmp->db_agsize = 1 << l2agsize;

	/* compute new number of AG; round up if the new size is not a
	 * multiple of the AG size.
	 */
	agno = bmp->db_numag;
	bmp->db_numag = newsize >> l2agsize;
	bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;

	/*
	 * reconfigure db_agfree[]
	 * from old AG configuration to new AG configuration;
	 *
	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
	 * note: new AG size = old AG size * (2**x).
	 */
	if (l2agsize == oldl2agsize)
		goto extend;
	k = 1 << (l2agsize - oldl2agsize);
	ag_rem = bmp->db_agfree[0];	/* save agfree[0] */
	for (i = 0, n = 0; i < agno; n++) {
		bmp->db_agfree[n] = 0;	/* init collection point */

		/* coalesce cotiguous k AGs; */
		for (j = 0; j < k && i < agno; j++, i++) {
			/* merge AGi to AGn */
			bmp->db_agfree[n] += bmp->db_agfree[i];
		}
	}
	bmp->db_agfree[0] += ag_rem;	/* restore agfree[0] */

	for (; n < MAXAG; n++)
		bmp->db_agfree[n] = 0;

	/*
	 * update highest active ag number
	 */

	bmp->db_maxag = bmp->db_maxag / k;

	/*
	 * extend bmap
	 *
	 * update bit maps and corresponding level control pages;
	 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
	 */
      extend:
	/* get L2 page */
	p = BMAPBLKNO + nbperpage;	/* L2 page */
	l2mp = read_metapage(ipbmap, p, PSIZE, 0);
	if (!l2mp) {
		jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read");
		return -EIO;
	}
	l2dcp = (struct dmapctl *) l2mp->data;

	/* compute start L1 */
	k = blkno >> L2MAXL1SIZE;
	l2leaf = l2dcp->stree + CTLLEAFIND + k;
	p = BLKTOL1(blkno, sbi->l2nbperpage);	/* L1 page */

	/*
	 * extend each L1 in L2
	 */
	for (; k < LPERCTL; k++, p += nbperpage) {
		/* get L1 page: the first L1 visited (j0) may already be
		 * partially populated and is read in; subsequent L1 pages
		 * are brand new and only assigned/initialized.
		 */
		if (j0) {
			/* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
			l1mp = read_metapage(ipbmap, p, PSIZE, 0);
			if (l1mp == NULL)
				goto errout;
			l1dcp = (struct dmapctl *) l1mp->data;

			/* compute start L0 */
			j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
			l1leaf = l1dcp->stree + CTLLEAFIND + j;
			p = BLKTOL0(blkno, sbi->l2nbperpage);
			j0 = false;
		} else {
			/* assign/init L1 page */
			l1mp = get_metapage(ipbmap, p, PSIZE, 0);
			if (l1mp == NULL)
				goto errout;

			l1dcp = (struct dmapctl *) l1mp->data;

			/* compute start L0 */
			j = 0;
			l1leaf = l1dcp->stree + CTLLEAFIND;
			p += nbperpage;	/* 1st L0 of L1.k */
		}

		/*
		 * extend each L0 in L1
		 */
		for (; j < LPERCTL; j++) {
			/* get L0 page: same first-visited vs. new-page
			 * distinction as for L1 above (i0 flag).
			 */
			if (i0) {
				/* read in L0 page: (blkno & (MAXL0SIZE - 1)) */

				l0mp = read_metapage(ipbmap, p, PSIZE, 0);
				if (l0mp == NULL)
					goto errout;
				l0dcp = (struct dmapctl *) l0mp->data;

				/* compute start dmap */
				i = (blkno & (MAXL0SIZE - 1)) >>
				    L2BPERDMAP;
				l0leaf = l0dcp->stree + CTLLEAFIND + i;
				p = BLKTODMAP(blkno,
					      sbi->l2nbperpage);
				i0 = false;
			} else {
				/* assign/init L0 page */
				l0mp = get_metapage(ipbmap, p, PSIZE, 0);
				if (l0mp == NULL)
					goto errout;

				l0dcp = (struct dmapctl *) l0mp->data;

				/* compute start dmap */
				i = 0;
				l0leaf = l0dcp->stree + CTLLEAFIND;
				p += nbperpage;	/* 1st dmap of L0.j */
			}

			/*
			 * extend each dmap in L0
			 */
			for (; i < LPERCTL; i++) {
				/*
				 * reconstruct the dmap page, and
				 * initialize corresponding parent L0 leaf
				 */
				if ((n = blkno & (BPERDMAP - 1))) {
					/* read in dmap page: */
					mp = read_metapage(ipbmap, p,
							   PSIZE, 0);
					if (mp == NULL)
						goto errout;
					n = min(nblocks, (s64)BPERDMAP - n);
				} else {
					/* assign/init dmap page */
					mp = read_metapage(ipbmap, p,
							   PSIZE, 0);
					if (mp == NULL)
						goto errout;

					n = min(nblocks, (s64)BPERDMAP);
				}

				dp = (struct dmap *) mp->data;
				*l0leaf = dbInitDmap(dp, blkno, n);

				/* account the new free blocks in the map
				 * and owning allocation group totals.
				 */
				bmp->db_nfree += n;
				agno = le64_to_cpu(dp->start) >> l2agsize;
				bmp->db_agfree[agno] += n;

				write_metapage(mp);

				l0leaf++;
				p += nbperpage;

				blkno += n;
				nblocks -= n;
				if (nblocks == 0)
					break;
			}	/* for each dmap in a L0 */

			/*
			 * build current L0 page from its leaves, and
			 * initialize corresponding parent L1 leaf
			 */
			*l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
			write_metapage(l0mp);
			l0mp = NULL;

			if (nblocks)
				l1leaf++;	/* continue for next L0 */
			else {
				/* more than 1 L0 ? */
				if (j > 0)
					break;	/* build L1 page */
				else {
					/* summarize in global bmap page */
					bmp->db_maxfreebud = *l1leaf;
					release_metapage(l1mp);
					release_metapage(l2mp);
					goto finalize;
				}
			}
		}	/* for each L0 in a L1 */

		/*
		 * build current L1 page from its leaves, and
		 * initialize corresponding parent L2 leaf
		 */
		*l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
		write_metapage(l1mp);
		l1mp = NULL;

		if (nblocks)
			l2leaf++;	/* continue for next L1 */
		else {
			/* more than 1 L1 ? */
			if (k > 0)
				break;	/* build L2 page */
			else {
				/* summarize in global bmap page */
				bmp->db_maxfreebud = *l2leaf;
				release_metapage(l2mp);
				goto finalize;
			}
		}
	}	/* for each L1 in a L2 */

	/* falling out of the L1 loop without consuming all blocks is a
	 * corruption/logic error; report it and clean up below.
	 */
	jfs_error(ipbmap->i_sb,
		  "dbExtendFS: function has not returned as expected");
      errout:
	if (l0mp)
		release_metapage(l0mp);
	if (l1mp)
		release_metapage(l1mp);
	release_metapage(l2mp);
	return -EIO;

	/*
	 * finalize bmap control page
	 */
      finalize:

	return 0;
}
3550 3550
3551 3551
3552 /* 3552 /*
3553 * dbFinalizeBmap() 3553 * dbFinalizeBmap()
3554 */ 3554 */
void dbFinalizeBmap(struct inode *ipbmap)
{
	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
	int actags, inactags, l2nl;
	s64 ag_rem, actfree, inactfree, avgfree;
	int i, n;

	/*
	 * finalize bmap control page
	 */
//finalize:
	/*
	 * compute db_agpref: preferred ag to allocate from
	 * (the leftmost ag with average free space in it);
	 */
//agpref:
	/* get the number of active ags and inacitve ags */
	actags = bmp->db_maxag + 1;
	inactags = bmp->db_numag - actags;
	ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1);	/* ??? */

	/* determine how many blocks are in the inactive allocation
	 * groups. in doing this, we must account for the fact that
	 * the rightmost group might be a partial group (i.e. file
	 * system size is not a multiple of the group size).
	 */
	inactfree = (inactags && ag_rem) ?
	    ((inactags - 1) << bmp->db_agl2size) + ag_rem
	    : inactags << bmp->db_agl2size;

	/* determine how many free blocks are in the active
	 * allocation groups plus the average number of free blocks
	 * within the active ags.
	 */
	actfree = bmp->db_nfree - inactfree;
	avgfree = (u32) actfree / (u32) actags;

	/* if the preferred allocation group has not average free space.
	 * re-establish the preferred group as the leftmost
	 * group with average free space.
	 */
	if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
		for (bmp->db_agpref = 0; bmp->db_agpref < actags;
		     bmp->db_agpref++) {
			if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
				break;
		}
		if (bmp->db_agpref >= bmp->db_numag) {
			jfs_error(ipbmap->i_sb,
				  "cannot find ag with average freespace");
		}
	}

	/*
	 * compute db_aglevel, db_agheigth, db_width, db_agstart:
	 * an ag is covered in aglevel dmapctl summary tree,
	 * at agheight level height (from leaf) with agwidth number of nodes
	 * each, which starts at agstart index node of the smmary tree node
	 * array;
	 */
	bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
	l2nl =
	    bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
	bmp->db_agheigth = l2nl >> 1;
	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
	/* accumulate the index of the first summary-tree node at the
	 * agheight level (nodes per level grow by a factor of 4).
	 */
	for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
	     i--) {
		bmp->db_agstart += n;
		n <<= 2;
	}

}
3627 3627
3628 3628
3629 /* 3629 /*
3630 * NAME: dbInitDmap()/ujfs_idmap_page() 3630 * NAME: dbInitDmap()/ujfs_idmap_page()
3631 * 3631 *
3632 * FUNCTION: initialize working/persistent bitmap of the dmap page 3632 * FUNCTION: initialize working/persistent bitmap of the dmap page
3633 * for the specified number of blocks: 3633 * for the specified number of blocks:
3634 * 3634 *
3635 * at entry, the bitmaps had been initialized as free (ZEROS); 3635 * at entry, the bitmaps had been initialized as free (ZEROS);
3636 * The number of blocks will only account for the actually 3636 * The number of blocks will only account for the actually
3637 * existing blocks. Blocks which don't actually exist in 3637 * existing blocks. Blocks which don't actually exist in
3638 * the aggregate will be marked as allocated (ONES); 3638 * the aggregate will be marked as allocated (ONES);
3639 * 3639 *
3640 * PARAMETERS: 3640 * PARAMETERS:
3641 * dp - pointer to page of map 3641 * dp - pointer to page of map
3642 * nblocks - number of blocks this page 3642 * nblocks - number of blocks this page
3643 * 3643 *
3644 * RETURNS: NONE 3644 * RETURNS: NONE
3645 */ 3645 */
3646 static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) 3646 static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
3647 { 3647 {
3648 int blkno, w, b, r, nw, nb, i; 3648 int blkno, w, b, r, nw, nb, i;
3649 3649
3650 /* starting block number within the dmap */ 3650 /* starting block number within the dmap */
3651 blkno = Blkno & (BPERDMAP - 1); 3651 blkno = Blkno & (BPERDMAP - 1);
3652 3652
3653 if (blkno == 0) { 3653 if (blkno == 0) {
3654 dp->nblocks = dp->nfree = cpu_to_le32(nblocks); 3654 dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
3655 dp->start = cpu_to_le64(Blkno); 3655 dp->start = cpu_to_le64(Blkno);
3656 3656
3657 if (nblocks == BPERDMAP) { 3657 if (nblocks == BPERDMAP) {
3658 memset(&dp->wmap[0], 0, LPERDMAP * 4); 3658 memset(&dp->wmap[0], 0, LPERDMAP * 4);
3659 memset(&dp->pmap[0], 0, LPERDMAP * 4); 3659 memset(&dp->pmap[0], 0, LPERDMAP * 4);
3660 goto initTree; 3660 goto initTree;
3661 } 3661 }
3662 } else { 3662 } else {
3663 dp->nblocks = 3663 le32_add_cpu(&dp->nblocks, nblocks);
3664 cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks); 3664 le32_add_cpu(&dp->nfree, nblocks);
3665 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
3666 } 3665 }
3667 3666
3668 /* word number containing start block number */ 3667 /* word number containing start block number */
3669 w = blkno >> L2DBWORD; 3668 w = blkno >> L2DBWORD;
3670 3669
3671 /* 3670 /*
3672 * free the bits corresponding to the block range (ZEROS): 3671 * free the bits corresponding to the block range (ZEROS):
3673 * note: not all bits of the first and last words may be contained 3672 * note: not all bits of the first and last words may be contained
3674 * within the block range. 3673 * within the block range.
3675 */ 3674 */
3676 for (r = nblocks; r > 0; r -= nb, blkno += nb) { 3675 for (r = nblocks; r > 0; r -= nb, blkno += nb) {
3677 /* number of bits preceding range to be freed in the word */ 3676 /* number of bits preceding range to be freed in the word */
3678 b = blkno & (DBWORD - 1); 3677 b = blkno & (DBWORD - 1);
3679 /* number of bits to free in the word */ 3678 /* number of bits to free in the word */
3680 nb = min(r, DBWORD - b); 3679 nb = min(r, DBWORD - b);
3681 3680
3682 /* is partial word to be freed ? */ 3681 /* is partial word to be freed ? */
3683 if (nb < DBWORD) { 3682 if (nb < DBWORD) {
3684 /* free (set to 0) from the bitmap word */ 3683 /* free (set to 0) from the bitmap word */
3685 dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) 3684 dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3686 >> b)); 3685 >> b));
3687 dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) 3686 dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3688 >> b)); 3687 >> b));
3689 3688
3690 /* skip the word freed */ 3689 /* skip the word freed */
3691 w++; 3690 w++;
3692 } else { 3691 } else {
3693 /* free (set to 0) contiguous bitmap words */ 3692 /* free (set to 0) contiguous bitmap words */
3694 nw = r >> L2DBWORD; 3693 nw = r >> L2DBWORD;
3695 memset(&dp->wmap[w], 0, nw * 4); 3694 memset(&dp->wmap[w], 0, nw * 4);
3696 memset(&dp->pmap[w], 0, nw * 4); 3695 memset(&dp->pmap[w], 0, nw * 4);
3697 3696
3698 /* skip the words freed */ 3697 /* skip the words freed */
3699 nb = nw << L2DBWORD; 3698 nb = nw << L2DBWORD;
3700 w += nw; 3699 w += nw;
3701 } 3700 }
3702 } 3701 }
3703 3702
3704 /* 3703 /*
3705 * mark bits following the range to be freed (non-existing 3704 * mark bits following the range to be freed (non-existing
3706 * blocks) as allocated (ONES) 3705 * blocks) as allocated (ONES)
3707 */ 3706 */
3708 3707
3709 if (blkno == BPERDMAP) 3708 if (blkno == BPERDMAP)
3710 goto initTree; 3709 goto initTree;
3711 3710
3712 /* the first word beyond the end of existing blocks */ 3711 /* the first word beyond the end of existing blocks */
3713 w = blkno >> L2DBWORD; 3712 w = blkno >> L2DBWORD;
3714 3713
3715 /* does nblocks fall on a 32-bit boundary ? */ 3714 /* does nblocks fall on a 32-bit boundary ? */
3716 b = blkno & (DBWORD - 1); 3715 b = blkno & (DBWORD - 1);
3717 if (b) { 3716 if (b) {
3718 /* mark a partial word allocated */ 3717 /* mark a partial word allocated */
3719 dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); 3718 dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
3720 w++; 3719 w++;
3721 } 3720 }
3722 3721
3723 /* set the rest of the words in the page to allocated (ONES) */ 3722 /* set the rest of the words in the page to allocated (ONES) */
3724 for (i = w; i < LPERDMAP; i++) 3723 for (i = w; i < LPERDMAP; i++)
3725 dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES); 3724 dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES);
3726 3725
3727 /* 3726 /*
3728 * init tree 3727 * init tree
3729 */ 3728 */
3730 initTree: 3729 initTree:
3731 return (dbInitDmapTree(dp)); 3730 return (dbInitDmapTree(dp));
3732 } 3731 }
3733 3732
3734 3733
3735 /* 3734 /*
3736 * NAME: dbInitDmapTree()/ujfs_complete_dmap() 3735 * NAME: dbInitDmapTree()/ujfs_complete_dmap()
3737 * 3736 *
3738 * FUNCTION: initialize summary tree of the specified dmap: 3737 * FUNCTION: initialize summary tree of the specified dmap:
3739 * 3738 *
3740 * at entry, bitmap of the dmap has been initialized; 3739 * at entry, bitmap of the dmap has been initialized;
3741 * 3740 *
3742 * PARAMETERS: 3741 * PARAMETERS:
3743 * dp - dmap to complete 3742 * dp - dmap to complete
3744 * blkno - starting block number for this dmap 3743 * blkno - starting block number for this dmap
3745 * treemax - will be filled in with max free for this dmap 3744 * treemax - will be filled in with max free for this dmap
3746 * 3745 *
3747 * RETURNS: max free string at the root of the tree 3746 * RETURNS: max free string at the root of the tree
3748 */ 3747 */
3749 static int dbInitDmapTree(struct dmap * dp) 3748 static int dbInitDmapTree(struct dmap * dp)
3750 { 3749 {
3751 struct dmaptree *tp; 3750 struct dmaptree *tp;
3752 s8 *cp; 3751 s8 *cp;
3753 int i; 3752 int i;
3754 3753
3755 /* init fixed info of tree */ 3754 /* init fixed info of tree */
3756 tp = &dp->tree; 3755 tp = &dp->tree;
3757 tp->nleafs = cpu_to_le32(LPERDMAP); 3756 tp->nleafs = cpu_to_le32(LPERDMAP);
3758 tp->l2nleafs = cpu_to_le32(L2LPERDMAP); 3757 tp->l2nleafs = cpu_to_le32(L2LPERDMAP);
3759 tp->leafidx = cpu_to_le32(LEAFIND); 3758 tp->leafidx = cpu_to_le32(LEAFIND);
3760 tp->height = cpu_to_le32(4); 3759 tp->height = cpu_to_le32(4);
3761 tp->budmin = BUDMIN; 3760 tp->budmin = BUDMIN;
3762 3761
3763 /* init each leaf from corresponding wmap word: 3762 /* init each leaf from corresponding wmap word:
3764 * note: leaf is set to NOFREE(-1) if all blocks of corresponding 3763 * note: leaf is set to NOFREE(-1) if all blocks of corresponding
3765 * bitmap word are allocated. 3764 * bitmap word are allocated.
3766 */ 3765 */
3767 cp = tp->stree + le32_to_cpu(tp->leafidx); 3766 cp = tp->stree + le32_to_cpu(tp->leafidx);
3768 for (i = 0; i < LPERDMAP; i++) 3767 for (i = 0; i < LPERDMAP; i++)
3769 *cp++ = dbMaxBud((u8 *) & dp->wmap[i]); 3768 *cp++ = dbMaxBud((u8 *) & dp->wmap[i]);
3770 3769
3771 /* build the dmap's binary buddy summary tree */ 3770 /* build the dmap's binary buddy summary tree */
3772 return (dbInitTree(tp)); 3771 return (dbInitTree(tp));
3773 } 3772 }
3774 3773
3775 3774
3776 /* 3775 /*
3777 * NAME: dbInitTree()/ujfs_adjtree() 3776 * NAME: dbInitTree()/ujfs_adjtree()
3778 * 3777 *
3779 * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. 3778 * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl.
3780 * 3779 *
3781 * at entry, the leaves of the tree has been initialized 3780 * at entry, the leaves of the tree has been initialized
3782 * from corresponding bitmap word or root of summary tree 3781 * from corresponding bitmap word or root of summary tree
3783 * of the child control page; 3782 * of the child control page;
3784 * configure binary buddy system at the leaf level, then 3783 * configure binary buddy system at the leaf level, then
3785 * bubble up the values of the leaf nodes up the tree. 3784 * bubble up the values of the leaf nodes up the tree.
3786 * 3785 *
3787 * PARAMETERS: 3786 * PARAMETERS:
3788 * cp - Pointer to the root of the tree 3787 * cp - Pointer to the root of the tree
3789 * l2leaves- Number of leaf nodes as a power of 2 3788 * l2leaves- Number of leaf nodes as a power of 2
3790 * l2min - Number of blocks that can be covered by a leaf 3789 * l2min - Number of blocks that can be covered by a leaf
3791 * as a power of 2 3790 * as a power of 2
3792 * 3791 *
3793 * RETURNS: max free string at the root of the tree 3792 * RETURNS: max free string at the root of the tree
3794 */ 3793 */
3795 static int dbInitTree(struct dmaptree * dtp) 3794 static int dbInitTree(struct dmaptree * dtp)
3796 { 3795 {
3797 int l2max, l2free, bsize, nextb, i; 3796 int l2max, l2free, bsize, nextb, i;
3798 int child, parent, nparent; 3797 int child, parent, nparent;
3799 s8 *tp, *cp, *cp1; 3798 s8 *tp, *cp, *cp1;
3800 3799
3801 tp = dtp->stree; 3800 tp = dtp->stree;
3802 3801
3803 /* Determine the maximum free string possible for the leaves */ 3802 /* Determine the maximum free string possible for the leaves */
3804 l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; 3803 l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
3805 3804
3806 /* 3805 /*
3807 * configure the leaf levevl into binary buddy system 3806 * configure the leaf levevl into binary buddy system
3808 * 3807 *
3809 * Try to combine buddies starting with a buddy size of 1 3808 * Try to combine buddies starting with a buddy size of 1
3810 * (i.e. two leaves). At a buddy size of 1 two buddy leaves 3809 * (i.e. two leaves). At a buddy size of 1 two buddy leaves
3811 * can be combined if both buddies have a maximum free of l2min; 3810 * can be combined if both buddies have a maximum free of l2min;
3812 * the combination will result in the left-most buddy leaf having 3811 * the combination will result in the left-most buddy leaf having
3813 * a maximum free of l2min+1. 3812 * a maximum free of l2min+1.
3814 * After processing all buddies for a given size, process buddies 3813 * After processing all buddies for a given size, process buddies
3815 * at the next higher buddy size (i.e. current size * 2) and 3814 * at the next higher buddy size (i.e. current size * 2) and
3816 * the next maximum free (current free + 1). 3815 * the next maximum free (current free + 1).
3817 * This continues until the maximum possible buddy combination 3816 * This continues until the maximum possible buddy combination
3818 * yields maximum free. 3817 * yields maximum free.
3819 */ 3818 */
3820 for (l2free = dtp->budmin, bsize = 1; l2free < l2max; 3819 for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
3821 l2free++, bsize = nextb) { 3820 l2free++, bsize = nextb) {
3822 /* get next buddy size == current buddy pair size */ 3821 /* get next buddy size == current buddy pair size */
3823 nextb = bsize << 1; 3822 nextb = bsize << 1;
3824 3823
3825 /* scan each adjacent buddy pair at current buddy size */ 3824 /* scan each adjacent buddy pair at current buddy size */
3826 for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx); 3825 for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
3827 i < le32_to_cpu(dtp->nleafs); 3826 i < le32_to_cpu(dtp->nleafs);
3828 i += nextb, cp += nextb) { 3827 i += nextb, cp += nextb) {
3829 /* coalesce if both adjacent buddies are max free */ 3828 /* coalesce if both adjacent buddies are max free */
3830 if (*cp == l2free && *(cp + bsize) == l2free) { 3829 if (*cp == l2free && *(cp + bsize) == l2free) {
3831 *cp = l2free + 1; /* left take right */ 3830 *cp = l2free + 1; /* left take right */
3832 *(cp + bsize) = -1; /* right give left */ 3831 *(cp + bsize) = -1; /* right give left */
3833 } 3832 }
3834 } 3833 }
3835 } 3834 }
3836 3835
3837 /* 3836 /*
3838 * bubble summary information of leaves up the tree. 3837 * bubble summary information of leaves up the tree.
3839 * 3838 *
3840 * Starting at the leaf node level, the four nodes described by 3839 * Starting at the leaf node level, the four nodes described by
3841 * the higher level parent node are compared for a maximum free and 3840 * the higher level parent node are compared for a maximum free and
3842 * this maximum becomes the value of the parent node. 3841 * this maximum becomes the value of the parent node.
3843 * when all lower level nodes are processed in this fashion then 3842 * when all lower level nodes are processed in this fashion then
3844 * move up to the next level (parent becomes a lower level node) and 3843 * move up to the next level (parent becomes a lower level node) and
3845 * continue the process for that level. 3844 * continue the process for that level.
3846 */ 3845 */
3847 for (child = le32_to_cpu(dtp->leafidx), 3846 for (child = le32_to_cpu(dtp->leafidx),
3848 nparent = le32_to_cpu(dtp->nleafs) >> 2; 3847 nparent = le32_to_cpu(dtp->nleafs) >> 2;
3849 nparent > 0; nparent >>= 2, child = parent) { 3848 nparent > 0; nparent >>= 2, child = parent) {
3850 /* get index of 1st node of parent level */ 3849 /* get index of 1st node of parent level */
3851 parent = (child - 1) >> 2; 3850 parent = (child - 1) >> 2;
3852 3851
3853 /* set the value of the parent node as the maximum 3852 /* set the value of the parent node as the maximum
3854 * of the four nodes of the current level. 3853 * of the four nodes of the current level.
3855 */ 3854 */
3856 for (i = 0, cp = tp + child, cp1 = tp + parent; 3855 for (i = 0, cp = tp + child, cp1 = tp + parent;
3857 i < nparent; i++, cp += 4, cp1++) 3856 i < nparent; i++, cp += 4, cp1++)
3858 *cp1 = TREEMAX(cp); 3857 *cp1 = TREEMAX(cp);
3859 } 3858 }
3860 3859
3861 return (*tp); 3860 return (*tp);
3862 } 3861 }
3863 3862
3864 3863
3865 /* 3864 /*
3866 * dbInitDmapCtl() 3865 * dbInitDmapCtl()
3867 * 3866 *
3868 * function: initialize dmapctl page 3867 * function: initialize dmapctl page
3869 */ 3868 */
3870 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) 3869 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
3871 { /* start leaf index not covered by range */ 3870 { /* start leaf index not covered by range */
3872 s8 *cp; 3871 s8 *cp;
3873 3872
3874 dcp->nleafs = cpu_to_le32(LPERCTL); 3873 dcp->nleafs = cpu_to_le32(LPERCTL);
3875 dcp->l2nleafs = cpu_to_le32(L2LPERCTL); 3874 dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
3876 dcp->leafidx = cpu_to_le32(CTLLEAFIND); 3875 dcp->leafidx = cpu_to_le32(CTLLEAFIND);
3877 dcp->height = cpu_to_le32(5); 3876 dcp->height = cpu_to_le32(5);
3878 dcp->budmin = L2BPERDMAP + L2LPERCTL * level; 3877 dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
3879 3878
3880 /* 3879 /*
3881 * initialize the leaves of current level that were not covered 3880 * initialize the leaves of current level that were not covered
3882 * by the specified input block range (i.e. the leaves have no 3881 * by the specified input block range (i.e. the leaves have no
3883 * low level dmapctl or dmap). 3882 * low level dmapctl or dmap).
3884 */ 3883 */
3885 cp = &dcp->stree[CTLLEAFIND + i]; 3884 cp = &dcp->stree[CTLLEAFIND + i];
3886 for (; i < LPERCTL; i++) 3885 for (; i < LPERCTL; i++)
3887 *cp++ = NOFREE; 3886 *cp++ = NOFREE;
3888 3887
3889 /* build the dmap's binary buddy summary tree */ 3888 /* build the dmap's binary buddy summary tree */
3890 return (dbInitTree((struct dmaptree *) dcp)); 3889 return (dbInitTree((struct dmaptree *) dcp));
3891 } 3890 }
3892 3891
3893 3892
3894 /* 3893 /*
3895 * NAME: dbGetL2AGSize()/ujfs_getagl2size() 3894 * NAME: dbGetL2AGSize()/ujfs_getagl2size()
3896 * 3895 *
3897 * FUNCTION: Determine log2(allocation group size) from aggregate size 3896 * FUNCTION: Determine log2(allocation group size) from aggregate size
3898 * 3897 *
3899 * PARAMETERS: 3898 * PARAMETERS:
3900 * nblocks - Number of blocks in aggregate 3899 * nblocks - Number of blocks in aggregate
3901 * 3900 *
3902 * RETURNS: log2(allocation group size) in aggregate blocks 3901 * RETURNS: log2(allocation group size) in aggregate blocks
3903 */ 3902 */
3904 static int dbGetL2AGSize(s64 nblocks) 3903 static int dbGetL2AGSize(s64 nblocks)
3905 { 3904 {
3906 s64 sz; 3905 s64 sz;
3907 s64 m; 3906 s64 m;
3908 int l2sz; 3907 int l2sz;
3909 3908
3910 if (nblocks < BPERDMAP * MAXAG) 3909 if (nblocks < BPERDMAP * MAXAG)
3911 return (L2BPERDMAP); 3910 return (L2BPERDMAP);
3912 3911
3913 /* round up aggregate size to power of 2 */ 3912 /* round up aggregate size to power of 2 */
3914 m = ((u64) 1 << (64 - 1)); 3913 m = ((u64) 1 << (64 - 1));
3915 for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) { 3914 for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
3916 if (m & nblocks) 3915 if (m & nblocks)
3917 break; 3916 break;
3918 } 3917 }
3919 3918
3920 sz = (s64) 1 << l2sz; 3919 sz = (s64) 1 << l2sz;
3921 if (sz < nblocks) 3920 if (sz < nblocks)
3922 l2sz += 1; 3921 l2sz += 1;
3923 3922
3924 /* agsize = roundupSize/max_number_of_ag */ 3923 /* agsize = roundupSize/max_number_of_ag */
3925 return (l2sz - L2MAXAG); 3924 return (l2sz - L2MAXAG);
3926 } 3925 }
3927 3926
3928 3927
3929 /* 3928 /*
3930 * NAME: dbMapFileSizeToMapSize() 3929 * NAME: dbMapFileSizeToMapSize()
3931 * 3930 *
3932 * FUNCTION: compute number of blocks the block allocation map file 3931 * FUNCTION: compute number of blocks the block allocation map file
3933 * can cover from the map file size; 3932 * can cover from the map file size;
3934 * 3933 *
3935 * RETURNS: Number of blocks which can be covered by this block map file; 3934 * RETURNS: Number of blocks which can be covered by this block map file;
3936 */ 3935 */
3937 3936
3938 /* 3937 /*
3939 * maximum number of map pages at each level including control pages 3938 * maximum number of map pages at each level including control pages
3940 */ 3939 */
3941 #define MAXL0PAGES (1 + LPERCTL) 3940 #define MAXL0PAGES (1 + LPERCTL)
3942 #define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) 3941 #define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES)
3943 #define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) 3942 #define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES)
3944 3943
3945 /* 3944 /*
3946 * convert number of map pages to the zero origin top dmapctl level 3945 * convert number of map pages to the zero origin top dmapctl level
3947 */ 3946 */
3948 #define BMAPPGTOLEV(npages) \ 3947 #define BMAPPGTOLEV(npages) \
3949 (((npages) <= 3 + MAXL0PAGES) ? 0 : \ 3948 (((npages) <= 3 + MAXL0PAGES) ? 0 : \
3950 ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) 3949 ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
3951 3950
3952 s64 dbMapFileSizeToMapSize(struct inode * ipbmap) 3951 s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
3953 { 3952 {
3954 struct super_block *sb = ipbmap->i_sb; 3953 struct super_block *sb = ipbmap->i_sb;
3955 s64 nblocks; 3954 s64 nblocks;
3956 s64 npages, ndmaps; 3955 s64 npages, ndmaps;
3957 int level, i; 3956 int level, i;
3958 int complete, factor; 3957 int complete, factor;
3959 3958
3960 nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize; 3959 nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
3961 npages = nblocks >> JFS_SBI(sb)->l2nbperpage; 3960 npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
3962 level = BMAPPGTOLEV(npages); 3961 level = BMAPPGTOLEV(npages);
3963 3962
3964 /* At each level, accumulate the number of dmap pages covered by 3963 /* At each level, accumulate the number of dmap pages covered by
3965 * the number of full child levels below it; 3964 * the number of full child levels below it;
3966 * repeat for the last incomplete child level. 3965 * repeat for the last incomplete child level.
3967 */ 3966 */
3968 ndmaps = 0; 3967 ndmaps = 0;
3969 npages--; /* skip the first global control page */ 3968 npages--; /* skip the first global control page */
3970 /* skip higher level control pages above top level covered by map */ 3969 /* skip higher level control pages above top level covered by map */
3971 npages -= (2 - level); 3970 npages -= (2 - level);
3972 npages--; /* skip top level's control page */ 3971 npages--; /* skip top level's control page */
3973 for (i = level; i >= 0; i--) { 3972 for (i = level; i >= 0; i--) {
3974 factor = 3973 factor =
3975 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); 3974 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
3976 complete = (u32) npages / factor; 3975 complete = (u32) npages / factor;
3977 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL : 3976 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL :
3978 ((i == 1) ? LPERCTL : 1)); 3977 ((i == 1) ? LPERCTL : 1));
3979 3978
3980 /* pages in last/incomplete child */ 3979 /* pages in last/incomplete child */
3981 npages = (u32) npages % factor; 3980 npages = (u32) npages % factor;
3982 /* skip incomplete child's level control page */ 3981 /* skip incomplete child's level control page */
3983 npages--; 3982 npages--;
3984 } 3983 }
3985 3984
3986 /* convert the number of dmaps into the number of blocks 3985 /* convert the number of dmaps into the number of blocks
3987 * which can be covered by the dmaps; 3986 * which can be covered by the dmaps;
3988 */ 3987 */
3989 nblocks = ndmaps << L2BPERDMAP; 3988 nblocks = ndmaps << L2BPERDMAP;
3990 3989
3991 return (nblocks); 3990 return (nblocks);
3992 } 3991 }
3993 3992
1 /* 1 /*
2 * Copyright (C) International Business Machines Corp., 2000-2004 2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version. 7 * (at your option) any later version.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details. 12 * the GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18 18
19 /* 19 /*
20 * jfs_imap.c: inode allocation map manager 20 * jfs_imap.c: inode allocation map manager
21 * 21 *
22 * Serialization: 22 * Serialization:
23 * Each AG has a simple lock which is used to control the serialization of 23 * Each AG has a simple lock which is used to control the serialization of
24 * the AG level lists. This lock should be taken first whenever an AG 24 * the AG level lists. This lock should be taken first whenever an AG
25 * level list will be modified or accessed. 25 * level list will be modified or accessed.
26 * 26 *
27 * Each IAG is locked by obtaining the buffer for the IAG page. 27 * Each IAG is locked by obtaining the buffer for the IAG page.
28 * 28 *
29 * There is also a inode lock for the inode map inode. A read lock needs to 29 * There is also a inode lock for the inode map inode. A read lock needs to
30 * be taken whenever an IAG is read from the map or the global level 30 * be taken whenever an IAG is read from the map or the global level
31 * information is read. A write lock needs to be taken whenever the global 31 * information is read. A write lock needs to be taken whenever the global
32 * level information is modified or an atomic operation needs to be used. 32 * level information is modified or an atomic operation needs to be used.
33 * 33 *
34 * If more than one IAG is read at one time, the read lock may not 34 * If more than one IAG is read at one time, the read lock may not
35 * be given up until all of the IAG's are read. Otherwise, a deadlock 35 * be given up until all of the IAG's are read. Otherwise, a deadlock
36 * may occur when trying to obtain the read lock while another thread 36 * may occur when trying to obtain the read lock while another thread
37 * holding the read lock is waiting on the IAG already being held. 37 * holding the read lock is waiting on the IAG already being held.
38 * 38 *
39 * The control page of the inode map is read into memory by diMount(). 39 * The control page of the inode map is read into memory by diMount().
40 * Thereafter it should only be modified in memory and then it will be 40 * Thereafter it should only be modified in memory and then it will be
41 * written out when the filesystem is unmounted by diUnmount(). 41 * written out when the filesystem is unmounted by diUnmount().
42 */ 42 */
43 43
44 #include <linux/fs.h> 44 #include <linux/fs.h>
45 #include <linux/buffer_head.h> 45 #include <linux/buffer_head.h>
46 #include <linux/pagemap.h> 46 #include <linux/pagemap.h>
47 #include <linux/quotaops.h> 47 #include <linux/quotaops.h>
48 48
49 #include "jfs_incore.h" 49 #include "jfs_incore.h"
50 #include "jfs_inode.h" 50 #include "jfs_inode.h"
51 #include "jfs_filsys.h" 51 #include "jfs_filsys.h"
52 #include "jfs_dinode.h" 52 #include "jfs_dinode.h"
53 #include "jfs_dmap.h" 53 #include "jfs_dmap.h"
54 #include "jfs_imap.h" 54 #include "jfs_imap.h"
55 #include "jfs_metapage.h" 55 #include "jfs_metapage.h"
56 #include "jfs_superblock.h" 56 #include "jfs_superblock.h"
57 #include "jfs_debug.h" 57 #include "jfs_debug.h"
58 58
59 /* 59 /*
60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want 60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want
61 * special inodes in the fileset inode space, we hash them to a dummy head 61 * special inodes in the fileset inode space, we hash them to a dummy head
62 */ 62 */
63 static HLIST_HEAD(aggregate_hash); 63 static HLIST_HEAD(aggregate_hash);
64 64
65 /* 65 /*
66 * imap locks 66 * imap locks
67 */ 67 */
68 /* iag free list lock */ 68 /* iag free list lock */
69 #define IAGFREE_LOCK_INIT(imap) mutex_init(&imap->im_freelock) 69 #define IAGFREE_LOCK_INIT(imap) mutex_init(&imap->im_freelock)
70 #define IAGFREE_LOCK(imap) mutex_lock(&imap->im_freelock) 70 #define IAGFREE_LOCK(imap) mutex_lock(&imap->im_freelock)
71 #define IAGFREE_UNLOCK(imap) mutex_unlock(&imap->im_freelock) 71 #define IAGFREE_UNLOCK(imap) mutex_unlock(&imap->im_freelock)
72 72
73 /* per ag iag list locks */ 73 /* per ag iag list locks */
74 #define AG_LOCK_INIT(imap,index) mutex_init(&(imap->im_aglock[index])) 74 #define AG_LOCK_INIT(imap,index) mutex_init(&(imap->im_aglock[index]))
75 #define AG_LOCK(imap,agno) mutex_lock(&imap->im_aglock[agno]) 75 #define AG_LOCK(imap,agno) mutex_lock(&imap->im_aglock[agno])
76 #define AG_UNLOCK(imap,agno) mutex_unlock(&imap->im_aglock[agno]) 76 #define AG_UNLOCK(imap,agno) mutex_unlock(&imap->im_aglock[agno])
77 77
78 /* 78 /*
79 * forward references 79 * forward references
80 */ 80 */
81 static int diAllocAG(struct inomap *, int, bool, struct inode *); 81 static int diAllocAG(struct inomap *, int, bool, struct inode *);
82 static int diAllocAny(struct inomap *, int, bool, struct inode *); 82 static int diAllocAny(struct inomap *, int, bool, struct inode *);
83 static int diAllocBit(struct inomap *, struct iag *, int); 83 static int diAllocBit(struct inomap *, struct iag *, int);
84 static int diAllocExt(struct inomap *, int, struct inode *); 84 static int diAllocExt(struct inomap *, int, struct inode *);
85 static int diAllocIno(struct inomap *, int, struct inode *); 85 static int diAllocIno(struct inomap *, int, struct inode *);
86 static int diFindFree(u32, int); 86 static int diFindFree(u32, int);
87 static int diNewExt(struct inomap *, struct iag *, int); 87 static int diNewExt(struct inomap *, struct iag *, int);
88 static int diNewIAG(struct inomap *, int *, int, struct metapage **); 88 static int diNewIAG(struct inomap *, int *, int, struct metapage **);
89 static void duplicateIXtree(struct super_block *, s64, int, s64 *); 89 static void duplicateIXtree(struct super_block *, s64, int, s64 *);
90 90
91 static int diIAGRead(struct inomap * imap, int, struct metapage **); 91 static int diIAGRead(struct inomap * imap, int, struct metapage **);
92 static int copy_from_dinode(struct dinode *, struct inode *); 92 static int copy_from_dinode(struct dinode *, struct inode *);
93 static void copy_to_dinode(struct dinode *, struct inode *); 93 static void copy_to_dinode(struct dinode *, struct inode *);
94 94
95 /* 95 /*
96 * NAME: diMount() 96 * NAME: diMount()
97 * 97 *
98 * FUNCTION: initialize the incore inode map control structures for 98 * FUNCTION: initialize the incore inode map control structures for
99 * a fileset or aggregate init time. 99 * a fileset or aggregate init time.
100 * 100 *
101 * the inode map's control structure (dinomap) is 101 * the inode map's control structure (dinomap) is
102 * brought in from disk and placed in virtual memory. 102 * brought in from disk and placed in virtual memory.
103 * 103 *
104 * PARAMETERS: 104 * PARAMETERS:
105 * ipimap - pointer to inode map inode for the aggregate or fileset. 105 * ipimap - pointer to inode map inode for the aggregate or fileset.
106 * 106 *
107 * RETURN VALUES: 107 * RETURN VALUES:
108 * 0 - success 108 * 0 - success
109 * -ENOMEM - insufficient free virtual memory. 109 * -ENOMEM - insufficient free virtual memory.
110 * -EIO - i/o error. 110 * -EIO - i/o error.
111 */ 111 */
int diMount(struct inode *ipimap)
{
	struct inomap *imap;
	struct metapage *mp;
	int index;
	struct dinomap_disk *dinom_le;

	/*
	 * allocate/initialize the in-memory inode map control structure
	 */
	/* allocate the in-memory inode map control structure. */
	imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
	if (imap == NULL) {
		jfs_err("diMount: kmalloc returned NULL!");
		return -ENOMEM;
	}

	/* read the on-disk inode map control structure. */

	mp = read_metapage(ipimap,
			   IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
			   PSIZE, 0);
	if (mp == NULL) {
		/* don't leak the control structure on I/O failure */
		kfree(imap);
		return -EIO;
	}

	/* copy the on-disk version to the in-memory version.
	 * all on-disk fields are little-endian and converted to
	 * CPU byte order here, once, at mount time.
	 */
	dinom_le = (struct dinomap_disk *) mp->data;
	imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag);
	imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag);
	atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos));
	atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree));
	imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext);
	imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext);
	for (index = 0; index < MAXAG; index++) {
		imap->im_agctl[index].inofree =
		    le32_to_cpu(dinom_le->in_agctl[index].inofree);
		imap->im_agctl[index].extfree =
		    le32_to_cpu(dinom_le->in_agctl[index].extfree);
		imap->im_agctl[index].numinos =
		    le32_to_cpu(dinom_le->in_agctl[index].numinos);
		imap->im_agctl[index].numfree =
		    le32_to_cpu(dinom_le->in_agctl[index].numfree);
	}

	/* release the buffer. */
	release_metapage(mp);

	/*
	 * allocate/initialize inode allocation map locks
	 */
	/* allocate and init iag free list lock */
	IAGFREE_LOCK_INIT(imap);

	/* allocate and init ag list locks */
	for (index = 0; index < MAXAG; index++) {
		AG_LOCK_INIT(imap, index);
	}

	/* bind the inode map inode and inode map control structure
	 * to each other.
	 */
	imap->im_ipimap = ipimap;
	JFS_IP(ipimap)->i_imap = imap;

	return (0);
}
180 180
181 181
182 /* 182 /*
183 * NAME: diUnmount() 183 * NAME: diUnmount()
184 * 184 *
185 * FUNCTION: write to disk the incore inode map control structures for 185 * FUNCTION: write to disk the incore inode map control structures for
186 * a fileset or aggregate at unmount time. 186 * a fileset or aggregate at unmount time.
187 * 187 *
188 * PARAMETERS: 188 * PARAMETERS:
189 * ipimap - pointer to inode map inode for the aggregate or fileset. 189 * ipimap - pointer to inode map inode for the aggregate or fileset.
190 * 190 *
191 * RETURN VALUES: 191 * RETURN VALUES:
192 * 0 - success 192 * 0 - success
193 * -ENOMEM - insufficient free virtual memory. 193 * -ENOMEM - insufficient free virtual memory.
194 * -EIO - i/o error. 194 * -EIO - i/o error.
195 */ 195 */
196 int diUnmount(struct inode *ipimap, int mounterror) 196 int diUnmount(struct inode *ipimap, int mounterror)
197 { 197 {
198 struct inomap *imap = JFS_IP(ipimap)->i_imap; 198 struct inomap *imap = JFS_IP(ipimap)->i_imap;
199 199
200 /* 200 /*
201 * update the on-disk inode map control structure 201 * update the on-disk inode map control structure
202 */ 202 */
203 203
204 if (!(mounterror || isReadOnly(ipimap))) 204 if (!(mounterror || isReadOnly(ipimap)))
205 diSync(ipimap); 205 diSync(ipimap);
206 206
207 /* 207 /*
208 * Invalidate the page cache buffers 208 * Invalidate the page cache buffers
209 */ 209 */
210 truncate_inode_pages(ipimap->i_mapping, 0); 210 truncate_inode_pages(ipimap->i_mapping, 0);
211 211
212 /* 212 /*
213 * free in-memory control structure 213 * free in-memory control structure
214 */ 214 */
215 kfree(imap); 215 kfree(imap);
216 216
217 return (0); 217 return (0);
218 } 218 }
219 219
220 220
221 /* 221 /*
222 * diSync() 222 * diSync()
223 */ 223 */
int diSync(struct inode *ipimap)
{
	struct dinomap_disk *dinom_le;
	struct inomap *imp = JFS_IP(ipimap)->i_imap;
	struct metapage *mp;
	int index;

	/*
	 * write imap global control page
	 */
	/* read the on-disk inode map control structure */
	mp = get_metapage(ipimap,
			  IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
			  PSIZE, 0);
	if (mp == NULL) {
		jfs_err("diSync: get_metapage failed!");
		return -EIO;
	}

	/* copy the in-memory version to the on-disk version;
	 * mirror image of the le32_to_cpu() conversions done at mount.
	 */
	dinom_le = (struct dinomap_disk *) mp->data;
	dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag);
	dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag);
	dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos));
	dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree));
	dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext);
	dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext);
	for (index = 0; index < MAXAG; index++) {
		dinom_le->in_agctl[index].inofree =
		    cpu_to_le32(imp->im_agctl[index].inofree);
		dinom_le->in_agctl[index].extfree =
		    cpu_to_le32(imp->im_agctl[index].extfree);
		dinom_le->in_agctl[index].numinos =
		    cpu_to_le32(imp->im_agctl[index].numinos);
		dinom_le->in_agctl[index].numfree =
		    cpu_to_le32(imp->im_agctl[index].numfree);
	}

	/* write out the control structure */
	write_metapage(mp);

	/*
	 * write out dirty pages of imap
	 */
	filemap_write_and_wait(ipimap->i_mapping);

	/* finally persist the map inode itself (primary AIT copy) */
	diWriteSpecial(ipimap, 0);

	return (0);
}
274 274
275 275
276 /* 276 /*
277 * NAME: diRead() 277 * NAME: diRead()
278 * 278 *
279 * FUNCTION: initialize an incore inode from disk. 279 * FUNCTION: initialize an incore inode from disk.
280 * 280 *
281 * on entry, the specifed incore inode should itself 281 * on entry, the specifed incore inode should itself
282 * specify the disk inode number corresponding to the 282 * specify the disk inode number corresponding to the
283 * incore inode (i.e. i_number should be initialized). 283 * incore inode (i.e. i_number should be initialized).
284 * 284 *
285 * this routine handles incore inode initialization for 285 * this routine handles incore inode initialization for
286 * both "special" and "regular" inodes. special inodes 286 * both "special" and "regular" inodes. special inodes
287 * are those required early in the mount process and 287 * are those required early in the mount process and
288 * require special handling since much of the file system 288 * require special handling since much of the file system
289 * is not yet initialized. these "special" inodes are 289 * is not yet initialized. these "special" inodes are
290 * identified by a NULL inode map inode pointer and are 290 * identified by a NULL inode map inode pointer and are
291 * actually initialized by a call to diReadSpecial(). 291 * actually initialized by a call to diReadSpecial().
292 * 292 *
293 * for regular inodes, the iag describing the disk inode 293 * for regular inodes, the iag describing the disk inode
294 * is read from disk to determine the inode extent address 294 * is read from disk to determine the inode extent address
295 * for the disk inode. with the inode extent address in 295 * for the disk inode. with the inode extent address in
296 * hand, the page of the extent that contains the disk 296 * hand, the page of the extent that contains the disk
297 * inode is read and the disk inode is copied to the 297 * inode is read and the disk inode is copied to the
298 * incore inode. 298 * incore inode.
299 * 299 *
300 * PARAMETERS: 300 * PARAMETERS:
301 * ip - pointer to incore inode to be initialized from disk. 301 * ip - pointer to incore inode to be initialized from disk.
302 * 302 *
303 * RETURN VALUES: 303 * RETURN VALUES:
304 * 0 - success 304 * 0 - success
305 * -EIO - i/o error. 305 * -EIO - i/o error.
306 * -ENOMEM - insufficient memory 306 * -ENOMEM - insufficient memory
307 * 307 *
308 */ 308 */
int diRead(struct inode *ip)
{
	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
	int iagno, ino, extno, rc;
	struct inode *ipimap;
	struct dinode *dp;
	struct iag *iagp;
	struct metapage *mp;
	s64 blkno, agstart;
	struct inomap *imap;
	int block_offset;
	int inodes_left;
	unsigned long pageno;
	int rel_inode;

	jfs_info("diRead: ino = %ld", ip->i_ino);

	ipimap = sbi->ipimap;
	JFS_IP(ip)->ipimap = ipimap;

	/* determine the iag number for this inode (number) */
	iagno = INOTOIAG(ip->i_ino);

	/* read the iag; the read lock only covers the lookup, the
	 * metapage itself remains valid after the unlock.
	 */
	imap = JFS_IP(ipimap)->i_imap;
	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
	rc = diIAGRead(imap, iagno, &mp);
	IREAD_UNLOCK(ipimap);
	if (rc) {
		jfs_err("diRead: diIAGRead returned %d", rc);
		return (rc);
	}

	iagp = (struct iag *) mp->data;

	/* determine inode extent that holds the disk inode */
	ino = ip->i_ino & (INOSPERIAG - 1);
	extno = ino >> L2INOSPEREXT;

	/* a zero-address or wrong-length extent means the inode number
	 * no longer refers to an allocated inode: report it stale.
	 */
	if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) ||
	    (addressPXD(&iagp->inoext[extno]) == 0)) {
		release_metapage(mp);
		return -ESTALE;
	}

	/* get disk block number of the page within the inode extent
	 * that holds the disk inode.
	 */
	blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage);

	/* get the ag for the iag */
	agstart = le64_to_cpu(iagp->agstart);

	release_metapage(mp);

	rel_inode = (ino & (INOSPERPAGE - 1));
	pageno = blkno >> sbi->l2nbperpage;

	if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
		/*
		 * OS/2 didn't always align inode extents on page boundaries,
		 * so the target inode may actually live on the next page;
		 * adjust pageno/rel_inode accordingly.
		 */
		inodes_left =
		     (sbi->nbperpage - block_offset) << sbi->l2niperblk;

		if (rel_inode < inodes_left)
			rel_inode += block_offset << sbi->l2niperblk;
		else {
			pageno += 1;
			rel_inode -= inodes_left;
		}
	}

	/* read the page of disk inode */
	mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
	if (!mp) {
		jfs_err("diRead: read_metapage failed");
		return -EIO;
	}

	/* locate the disk inode requested */
	dp = (struct dinode *) mp->data;
	dp += rel_inode;

	/* sanity-check the on-disk inode before trusting its contents */
	if (ip->i_ino != le32_to_cpu(dp->di_number)) {
		jfs_error(ip->i_sb, "diRead: i_ino != di_number");
		rc = -EIO;
	} else if (le32_to_cpu(dp->di_nlink) == 0)
		rc = -ESTALE;
	else
		/* copy the disk inode to the in-memory inode */
		rc = copy_from_dinode(dp, ip);

	release_metapage(mp);

	/* set the ag for the inode */
	JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
	JFS_IP(ip)->active_ag = -1;

	return (rc);
}
410 410
411 411
412 /* 412 /*
413 * NAME: diReadSpecial() 413 * NAME: diReadSpecial()
414 * 414 *
415 * FUNCTION: initialize a 'special' inode from disk. 415 * FUNCTION: initialize a 'special' inode from disk.
416 * 416 *
417 * this routines handles aggregate level inodes. The 417 * this routines handles aggregate level inodes. The
418 * inode cache cannot differentiate between the 418 * inode cache cannot differentiate between the
419 * aggregate inodes and the filesystem inodes, so we 419 * aggregate inodes and the filesystem inodes, so we
420 * handle these here. We don't actually use the aggregate 420 * handle these here. We don't actually use the aggregate
421 * inode map, since these inodes are at a fixed location 421 * inode map, since these inodes are at a fixed location
422 * and in some cases the aggregate inode map isn't initialized 422 * and in some cases the aggregate inode map isn't initialized
423 * yet. 423 * yet.
424 * 424 *
425 * PARAMETERS: 425 * PARAMETERS:
426 * sb - filesystem superblock 426 * sb - filesystem superblock
427 * inum - aggregate inode number 427 * inum - aggregate inode number
428 * secondary - 1 if secondary aggregate inode table 428 * secondary - 1 if secondary aggregate inode table
429 * 429 *
430 * RETURN VALUES: 430 * RETURN VALUES:
431 * new inode - success 431 * new inode - success
432 * NULL - i/o error. 432 * NULL - i/o error.
433 */ 433 */
struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
{
	struct jfs_sb_info *sbi = JFS_SBI(sb);
	uint address;
	struct dinode *dp;
	struct inode *ip;
	struct metapage *mp;

	ip = new_inode(sb);
	if (ip == NULL) {
		jfs_err("diReadSpecial: new_inode returned NULL!");
		return ip;
	}

	/* pick the fixed on-disk location of the aggregate inode table:
	 * either the secondary AIT (address from the superblock's ait2
	 * pxd) or the primary one at its well-known offset.
	 */
	if (secondary) {
		address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
		JFS_IP(ip)->ipimap = sbi->ipaimap2;
	} else {
		address = AITBL_OFF >> L2PSIZE;
		JFS_IP(ip)->ipimap = sbi->ipaimap;
	}

	ASSERT(inum < INOSPEREXT);

	ip->i_ino = inum;

	address += inum >> 3;	/* 8 inodes per 4K page */

	/* read the page of fixed disk inode (AIT) in raw mode */
	mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
	if (mp == NULL) {
		ip->i_nlink = 1;	/* Don't want iput() deleting it */
		iput(ip);
		return (NULL);
	}

	/* get the pointer to the disk inode of interest */
	dp = (struct dinode *) (mp->data);
	dp += inum % 8;		/* 8 inodes per 4K page */

	/* copy on-disk inode to in-memory inode */
	if ((copy_from_dinode(dp, ip)) != 0) {
		/* handle bad return by returning NULL for ip */
		ip->i_nlink = 1;	/* Don't want iput() deleting it */
		iput(ip);
		/* release the page */
		release_metapage(mp);
		return (NULL);

	}

	ip->i_mapping->a_ops = &jfs_metapage_aops;
	mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS);

	/* Allocations to metadata inodes should not affect quotas */
	ip->i_flags |= S_NOQUOTA;

	/* the primary fileset inode carries aggregate-wide generation
	 * numbers; cache them in the superblock info.
	 */
	if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) {
		sbi->gengen = le32_to_cpu(dp->di_gengen);
		sbi->inostamp = le32_to_cpu(dp->di_inostamp);
	}

	/* release the page */
	release_metapage(mp);

	/* special inodes live on a private hash, not the inode cache */
	hlist_add_head(&ip->i_hash, &aggregate_hash);

	return (ip);
}
503 503
504 /* 504 /*
505 * NAME: diWriteSpecial() 505 * NAME: diWriteSpecial()
506 * 506 *
507 * FUNCTION: Write the special inode to disk 507 * FUNCTION: Write the special inode to disk
508 * 508 *
509 * PARAMETERS: 509 * PARAMETERS:
510 * ip - special inode 510 * ip - special inode
511 * secondary - 1 if secondary aggregate inode table 511 * secondary - 1 if secondary aggregate inode table
512 * 512 *
513 * RETURN VALUES: none 513 * RETURN VALUES: none
514 */ 514 */
515 515
void diWriteSpecial(struct inode *ip, int secondary)
{
	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
	uint address;
	struct dinode *dp;
	ino_t inum = ip->i_ino;
	struct metapage *mp;

	/* locate the fixed AIT copy being updated (primary/secondary),
	 * mirroring the address computation in diReadSpecial().
	 */
	if (secondary)
		address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
	else
		address = AITBL_OFF >> L2PSIZE;

	ASSERT(inum < INOSPEREXT);

	address += inum >> 3;	/* 8 inodes per 4K page */

	/* read the page of fixed disk inode (AIT) in raw mode */
	mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
	if (mp == NULL) {
		jfs_err("diWriteSpecial: failed to read aggregate inode "
			"extent!");
		return;
	}

	/* get the pointer to the disk inode of interest */
	dp = (struct dinode *) (mp->data);
	dp += inum % 8;		/* 8 inodes per 4K page */

	/* copy in-memory inode to on-disk inode, including the xtree
	 * root (288 presumably matches the on-disk xtroot size --
	 * TODO confirm against struct dinode layout).
	 */
	copy_to_dinode(dp, ip);
	memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288);

	if (inum == FILESYSTEM_I)
		dp->di_gengen = cpu_to_le32(sbi->gengen);

	/* write the page */
	write_metapage(mp);
}
555 555
556 /* 556 /*
557 * NAME: diFreeSpecial() 557 * NAME: diFreeSpecial()
558 * 558 *
559 * FUNCTION: Free allocated space for special inode 559 * FUNCTION: Free allocated space for special inode
560 */ 560 */
561 void diFreeSpecial(struct inode *ip) 561 void diFreeSpecial(struct inode *ip)
562 { 562 {
563 if (ip == NULL) { 563 if (ip == NULL) {
564 jfs_err("diFreeSpecial called with NULL ip!"); 564 jfs_err("diFreeSpecial called with NULL ip!");
565 return; 565 return;
566 } 566 }
567 filemap_write_and_wait(ip->i_mapping); 567 filemap_write_and_wait(ip->i_mapping);
568 truncate_inode_pages(ip->i_mapping, 0); 568 truncate_inode_pages(ip->i_mapping, 0);
569 iput(ip); 569 iput(ip);
570 } 570 }
571 571
572 572
573 573
574 /* 574 /*
575 * NAME: diWrite() 575 * NAME: diWrite()
576 * 576 *
577 * FUNCTION: write the on-disk inode portion of the in-memory inode 577 * FUNCTION: write the on-disk inode portion of the in-memory inode
578 * to its corresponding on-disk inode. 578 * to its corresponding on-disk inode.
579 * 579 *
580 * on entry, the specifed incore inode should itself 580 * on entry, the specifed incore inode should itself
581 * specify the disk inode number corresponding to the 581 * specify the disk inode number corresponding to the
582 * incore inode (i.e. i_number should be initialized). 582 * incore inode (i.e. i_number should be initialized).
583 * 583 *
584 * the inode contains the inode extent address for the disk 584 * the inode contains the inode extent address for the disk
585 * inode. with the inode extent address in hand, the 585 * inode. with the inode extent address in hand, the
586 * page of the extent that contains the disk inode is 586 * page of the extent that contains the disk inode is
587 * read and the disk inode portion of the incore inode 587 * read and the disk inode portion of the incore inode
588 * is copied to the disk inode. 588 * is copied to the disk inode.
589 * 589 *
590 * PARAMETERS: 590 * PARAMETERS:
591 * tid - transacation id 591 * tid - transacation id
592 * ip - pointer to incore inode to be written to the inode extent. 592 * ip - pointer to incore inode to be written to the inode extent.
593 * 593 *
594 * RETURN VALUES: 594 * RETURN VALUES:
595 * 0 - success 595 * 0 - success
596 * -EIO - i/o error. 596 * -EIO - i/o error.
597 */ 597 */
598 int diWrite(tid_t tid, struct inode *ip) 598 int diWrite(tid_t tid, struct inode *ip)
599 { 599 {
600 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); 600 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
601 struct jfs_inode_info *jfs_ip = JFS_IP(ip); 601 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
602 int rc = 0; 602 int rc = 0;
603 s32 ino; 603 s32 ino;
604 struct dinode *dp; 604 struct dinode *dp;
605 s64 blkno; 605 s64 blkno;
606 int block_offset; 606 int block_offset;
607 int inodes_left; 607 int inodes_left;
608 struct metapage *mp; 608 struct metapage *mp;
609 unsigned long pageno; 609 unsigned long pageno;
610 int rel_inode; 610 int rel_inode;
611 int dioffset; 611 int dioffset;
612 struct inode *ipimap; 612 struct inode *ipimap;
613 uint type; 613 uint type;
614 lid_t lid; 614 lid_t lid;
615 struct tlock *ditlck, *tlck; 615 struct tlock *ditlck, *tlck;
616 struct linelock *dilinelock, *ilinelock; 616 struct linelock *dilinelock, *ilinelock;
617 struct lv *lv; 617 struct lv *lv;
618 int n; 618 int n;
619 619
620 ipimap = jfs_ip->ipimap; 620 ipimap = jfs_ip->ipimap;
621 621
622 ino = ip->i_ino & (INOSPERIAG - 1); 622 ino = ip->i_ino & (INOSPERIAG - 1);
623 623
624 if (!addressPXD(&(jfs_ip->ixpxd)) || 624 if (!addressPXD(&(jfs_ip->ixpxd)) ||
625 (lengthPXD(&(jfs_ip->ixpxd)) != 625 (lengthPXD(&(jfs_ip->ixpxd)) !=
626 JFS_IP(ipimap)->i_imap->im_nbperiext)) { 626 JFS_IP(ipimap)->i_imap->im_nbperiext)) {
627 jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); 627 jfs_error(ip->i_sb, "diWrite: ixpxd invalid");
628 return -EIO; 628 return -EIO;
629 } 629 }
630 630
631 /* 631 /*
632 * read the page of disk inode containing the specified inode: 632 * read the page of disk inode containing the specified inode:
633 */ 633 */
634 /* compute the block address of the page */ 634 /* compute the block address of the page */
635 blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage); 635 blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage);
636 636
637 rel_inode = (ino & (INOSPERPAGE - 1)); 637 rel_inode = (ino & (INOSPERPAGE - 1));
638 pageno = blkno >> sbi->l2nbperpage; 638 pageno = blkno >> sbi->l2nbperpage;
639 639
640 if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { 640 if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
641 /* 641 /*
642 * OS/2 didn't always align inode extents on page boundaries 642 * OS/2 didn't always align inode extents on page boundaries
643 */ 643 */
644 inodes_left = 644 inodes_left =
645 (sbi->nbperpage - block_offset) << sbi->l2niperblk; 645 (sbi->nbperpage - block_offset) << sbi->l2niperblk;
646 646
647 if (rel_inode < inodes_left) 647 if (rel_inode < inodes_left)
648 rel_inode += block_offset << sbi->l2niperblk; 648 rel_inode += block_offset << sbi->l2niperblk;
649 else { 649 else {
650 pageno += 1; 650 pageno += 1;
651 rel_inode -= inodes_left; 651 rel_inode -= inodes_left;
652 } 652 }
653 } 653 }
654 /* read the page of disk inode */ 654 /* read the page of disk inode */
655 retry: 655 retry:
656 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); 656 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
657 if (!mp) 657 if (!mp)
658 return -EIO; 658 return -EIO;
659 659
660 /* get the pointer to the disk inode */ 660 /* get the pointer to the disk inode */
661 dp = (struct dinode *) mp->data; 661 dp = (struct dinode *) mp->data;
662 dp += rel_inode; 662 dp += rel_inode;
663 663
664 dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE; 664 dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE;
665 665
666 /* 666 /*
667 * acquire transaction lock on the on-disk inode; 667 * acquire transaction lock on the on-disk inode;
668 * N.B. tlock is acquired on ipimap not ip; 668 * N.B. tlock is acquired on ipimap not ip;
669 */ 669 */
670 if ((ditlck = 670 if ((ditlck =
671 txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL) 671 txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL)
672 goto retry; 672 goto retry;
673 dilinelock = (struct linelock *) & ditlck->lock; 673 dilinelock = (struct linelock *) & ditlck->lock;
674 674
675 /* 675 /*
676 * copy btree root from in-memory inode to on-disk inode 676 * copy btree root from in-memory inode to on-disk inode
677 * 677 *
678 * (tlock is taken from inline B+-tree root in in-memory 678 * (tlock is taken from inline B+-tree root in in-memory
679 * inode when the B+-tree root is updated, which is pointed 679 * inode when the B+-tree root is updated, which is pointed
680 * by jfs_ip->blid as well as being on tx tlock list) 680 * by jfs_ip->blid as well as being on tx tlock list)
681 * 681 *
682 * further processing of btree root is based on the copy 682 * further processing of btree root is based on the copy
683 * in in-memory inode, where txLog() will log from, and, 683 * in in-memory inode, where txLog() will log from, and,
684 * for xtree root, txUpdateMap() will update map and reset 684 * for xtree root, txUpdateMap() will update map and reset
685 * XAD_NEW bit; 685 * XAD_NEW bit;
686 */ 686 */
687 687
688 if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) { 688 if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) {
689 /* 689 /*
690 * This is the special xtree inside the directory for storing 690 * This is the special xtree inside the directory for storing
691 * the directory table 691 * the directory table
692 */ 692 */
693 xtpage_t *p, *xp; 693 xtpage_t *p, *xp;
694 xad_t *xad; 694 xad_t *xad;
695 695
696 jfs_ip->xtlid = 0; 696 jfs_ip->xtlid = 0;
697 tlck = lid_to_tlock(lid); 697 tlck = lid_to_tlock(lid);
698 assert(tlck->type & tlckXTREE); 698 assert(tlck->type & tlckXTREE);
699 tlck->type |= tlckBTROOT; 699 tlck->type |= tlckBTROOT;
700 tlck->mp = mp; 700 tlck->mp = mp;
701 ilinelock = (struct linelock *) & tlck->lock; 701 ilinelock = (struct linelock *) & tlck->lock;
702 702
703 /* 703 /*
704 * copy xtree root from inode to dinode: 704 * copy xtree root from inode to dinode:
705 */ 705 */
706 p = &jfs_ip->i_xtroot; 706 p = &jfs_ip->i_xtroot;
707 xp = (xtpage_t *) &dp->di_dirtable; 707 xp = (xtpage_t *) &dp->di_dirtable;
708 lv = ilinelock->lv; 708 lv = ilinelock->lv;
709 for (n = 0; n < ilinelock->index; n++, lv++) { 709 for (n = 0; n < ilinelock->index; n++, lv++) {
710 memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], 710 memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
711 lv->length << L2XTSLOTSIZE); 711 lv->length << L2XTSLOTSIZE);
712 } 712 }
713 713
714 /* reset on-disk (metadata page) xtree XAD_NEW bit */ 714 /* reset on-disk (metadata page) xtree XAD_NEW bit */
715 xad = &xp->xad[XTENTRYSTART]; 715 xad = &xp->xad[XTENTRYSTART];
716 for (n = XTENTRYSTART; 716 for (n = XTENTRYSTART;
717 n < le16_to_cpu(xp->header.nextindex); n++, xad++) 717 n < le16_to_cpu(xp->header.nextindex); n++, xad++)
718 if (xad->flag & (XAD_NEW | XAD_EXTENDED)) 718 if (xad->flag & (XAD_NEW | XAD_EXTENDED))
719 xad->flag &= ~(XAD_NEW | XAD_EXTENDED); 719 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
720 } 720 }
721 721
722 if ((lid = jfs_ip->blid) == 0) 722 if ((lid = jfs_ip->blid) == 0)
723 goto inlineData; 723 goto inlineData;
724 jfs_ip->blid = 0; 724 jfs_ip->blid = 0;
725 725
726 tlck = lid_to_tlock(lid); 726 tlck = lid_to_tlock(lid);
727 type = tlck->type; 727 type = tlck->type;
728 tlck->type |= tlckBTROOT; 728 tlck->type |= tlckBTROOT;
729 tlck->mp = mp; 729 tlck->mp = mp;
730 ilinelock = (struct linelock *) & tlck->lock; 730 ilinelock = (struct linelock *) & tlck->lock;
731 731
732 /* 732 /*
733 * regular file: 16 byte (XAD slot) granularity 733 * regular file: 16 byte (XAD slot) granularity
734 */ 734 */
735 if (type & tlckXTREE) { 735 if (type & tlckXTREE) {
736 xtpage_t *p, *xp; 736 xtpage_t *p, *xp;
737 xad_t *xad; 737 xad_t *xad;
738 738
739 /* 739 /*
740 * copy xtree root from inode to dinode: 740 * copy xtree root from inode to dinode:
741 */ 741 */
742 p = &jfs_ip->i_xtroot; 742 p = &jfs_ip->i_xtroot;
743 xp = &dp->di_xtroot; 743 xp = &dp->di_xtroot;
744 lv = ilinelock->lv; 744 lv = ilinelock->lv;
745 for (n = 0; n < ilinelock->index; n++, lv++) { 745 for (n = 0; n < ilinelock->index; n++, lv++) {
746 memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], 746 memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
747 lv->length << L2XTSLOTSIZE); 747 lv->length << L2XTSLOTSIZE);
748 } 748 }
749 749
750 /* reset on-disk (metadata page) xtree XAD_NEW bit */ 750 /* reset on-disk (metadata page) xtree XAD_NEW bit */
751 xad = &xp->xad[XTENTRYSTART]; 751 xad = &xp->xad[XTENTRYSTART];
752 for (n = XTENTRYSTART; 752 for (n = XTENTRYSTART;
753 n < le16_to_cpu(xp->header.nextindex); n++, xad++) 753 n < le16_to_cpu(xp->header.nextindex); n++, xad++)
754 if (xad->flag & (XAD_NEW | XAD_EXTENDED)) 754 if (xad->flag & (XAD_NEW | XAD_EXTENDED))
755 xad->flag &= ~(XAD_NEW | XAD_EXTENDED); 755 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
756 } 756 }
757 /* 757 /*
758 * directory: 32 byte (directory entry slot) granularity 758 * directory: 32 byte (directory entry slot) granularity
759 */ 759 */
760 else if (type & tlckDTREE) { 760 else if (type & tlckDTREE) {
761 dtpage_t *p, *xp; 761 dtpage_t *p, *xp;
762 762
763 /* 763 /*
764 * copy dtree root from inode to dinode: 764 * copy dtree root from inode to dinode:
765 */ 765 */
766 p = (dtpage_t *) &jfs_ip->i_dtroot; 766 p = (dtpage_t *) &jfs_ip->i_dtroot;
767 xp = (dtpage_t *) & dp->di_dtroot; 767 xp = (dtpage_t *) & dp->di_dtroot;
768 lv = ilinelock->lv; 768 lv = ilinelock->lv;
769 for (n = 0; n < ilinelock->index; n++, lv++) { 769 for (n = 0; n < ilinelock->index; n++, lv++) {
770 memcpy(&xp->slot[lv->offset], &p->slot[lv->offset], 770 memcpy(&xp->slot[lv->offset], &p->slot[lv->offset],
771 lv->length << L2DTSLOTSIZE); 771 lv->length << L2DTSLOTSIZE);
772 } 772 }
773 } else { 773 } else {
774 jfs_err("diWrite: UFO tlock"); 774 jfs_err("diWrite: UFO tlock");
775 } 775 }
776 776
777 inlineData: 777 inlineData:
778 /* 778 /*
779 * copy inline symlink from in-memory inode to on-disk inode 779 * copy inline symlink from in-memory inode to on-disk inode
780 */ 780 */
781 if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) { 781 if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) {
782 lv = & dilinelock->lv[dilinelock->index]; 782 lv = & dilinelock->lv[dilinelock->index];
783 lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; 783 lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
784 lv->length = 2; 784 lv->length = 2;
785 memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); 785 memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
786 dilinelock->index++; 786 dilinelock->index++;
787 } 787 }
788 /* 788 /*
789 * copy inline data from in-memory inode to on-disk inode: 789 * copy inline data from in-memory inode to on-disk inode:
790 * 128 byte slot granularity 790 * 128 byte slot granularity
791 */ 791 */
792 if (test_cflag(COMMIT_Inlineea, ip)) { 792 if (test_cflag(COMMIT_Inlineea, ip)) {
793 lv = & dilinelock->lv[dilinelock->index]; 793 lv = & dilinelock->lv[dilinelock->index];
794 lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE; 794 lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE;
795 lv->length = 1; 795 lv->length = 1;
796 memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE); 796 memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE);
797 dilinelock->index++; 797 dilinelock->index++;
798 798
799 clear_cflag(COMMIT_Inlineea, ip); 799 clear_cflag(COMMIT_Inlineea, ip);
800 } 800 }
801 801
802 /* 802 /*
803 * lock/copy inode base: 128 byte slot granularity 803 * lock/copy inode base: 128 byte slot granularity
804 */ 804 */
805 lv = & dilinelock->lv[dilinelock->index]; 805 lv = & dilinelock->lv[dilinelock->index];
806 lv->offset = dioffset >> L2INODESLOTSIZE; 806 lv->offset = dioffset >> L2INODESLOTSIZE;
807 copy_to_dinode(dp, ip); 807 copy_to_dinode(dp, ip);
808 if (test_and_clear_cflag(COMMIT_Dirtable, ip)) { 808 if (test_and_clear_cflag(COMMIT_Dirtable, ip)) {
809 lv->length = 2; 809 lv->length = 2;
810 memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96); 810 memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96);
811 } else 811 } else
812 lv->length = 1; 812 lv->length = 1;
813 dilinelock->index++; 813 dilinelock->index++;
814 814
815 /* release the buffer holding the updated on-disk inode. 815 /* release the buffer holding the updated on-disk inode.
816 * the buffer will be later written by commit processing. 816 * the buffer will be later written by commit processing.
817 */ 817 */
818 write_metapage(mp); 818 write_metapage(mp);
819 819
820 return (rc); 820 return (rc);
821 } 821 }
822 822
823 823
/*
 * NAME:	diFree(ip)
 *
 * FUNCTION:	free a specified inode from the inode working map
 *		for a fileset or aggregate.
 *
 *		if the inode to be freed represents the first (only)
 *		free inode within the iag, the iag will be placed on
 *		the ag free inode list.
 *
 *		freeing the inode will cause the inode extent to be
 *		freed if the inode is the only allocated inode within
 *		the extent.  in this case all the disk resource backing
 *		up the inode extent will be freed.  in addition, the iag
 *		will be placed on the ag extent free list if the extent
 *		is the first free extent in the iag.  if freeing the
 *		extent also means that no free inodes will exist for
 *		the iag, the iag will also be removed from the ag free
 *		inode list.
 *
 *		the iag describing the inode will be freed if the extent
 *		is to be freed and it is the only backed extent within
 *		the iag.  in this case, the iag will be removed from the
 *		ag free extent list and ag free inode list and placed on
 *		the inode map's free iag list.
 *
 *		a careful update approach is used to provide consistency
 *		in the face of updates to multiple buffers.  under this
 *		approach, all required buffers are obtained before making
 *		any updates and are held until all updates are complete.
 *
 * PARAMETERS:
 *	ip	- inode to be freed.
 *
 * RETURN VALUES:
 *	0	- success
 *	-EIO	- i/o error.
 */
862 int diFree(struct inode *ip) 862 int diFree(struct inode *ip)
863 { 863 {
864 int rc; 864 int rc;
865 ino_t inum = ip->i_ino; 865 ino_t inum = ip->i_ino;
866 struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp; 866 struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
867 struct metapage *mp, *amp, *bmp, *cmp, *dmp; 867 struct metapage *mp, *amp, *bmp, *cmp, *dmp;
868 int iagno, ino, extno, bitno, sword, agno; 868 int iagno, ino, extno, bitno, sword, agno;
869 int back, fwd; 869 int back, fwd;
870 u32 bitmap, mask; 870 u32 bitmap, mask;
871 struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap; 871 struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
872 struct inomap *imap = JFS_IP(ipimap)->i_imap; 872 struct inomap *imap = JFS_IP(ipimap)->i_imap;
873 pxd_t freepxd; 873 pxd_t freepxd;
874 tid_t tid; 874 tid_t tid;
875 struct inode *iplist[3]; 875 struct inode *iplist[3];
876 struct tlock *tlck; 876 struct tlock *tlck;
877 struct pxd_lock *pxdlock; 877 struct pxd_lock *pxdlock;
878 878
879 /* 879 /*
880 * This is just to suppress compiler warnings. The same logic that 880 * This is just to suppress compiler warnings. The same logic that
881 * references these variables is used to initialize them. 881 * references these variables is used to initialize them.
882 */ 882 */
883 aiagp = biagp = ciagp = diagp = NULL; 883 aiagp = biagp = ciagp = diagp = NULL;
884 884
885 /* get the iag number containing the inode. 885 /* get the iag number containing the inode.
886 */ 886 */
887 iagno = INOTOIAG(inum); 887 iagno = INOTOIAG(inum);
888 888
889 /* make sure that the iag is contained within 889 /* make sure that the iag is contained within
890 * the map. 890 * the map.
891 */ 891 */
892 if (iagno >= imap->im_nextiag) { 892 if (iagno >= imap->im_nextiag) {
893 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, 893 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
894 imap, 32, 0); 894 imap, 32, 0);
895 jfs_error(ip->i_sb, 895 jfs_error(ip->i_sb,
896 "diFree: inum = %d, iagno = %d, nextiag = %d", 896 "diFree: inum = %d, iagno = %d, nextiag = %d",
897 (uint) inum, iagno, imap->im_nextiag); 897 (uint) inum, iagno, imap->im_nextiag);
898 return -EIO; 898 return -EIO;
899 } 899 }
900 900
901 /* get the allocation group for this ino. 901 /* get the allocation group for this ino.
902 */ 902 */
903 agno = JFS_IP(ip)->agno; 903 agno = JFS_IP(ip)->agno;
904 904
905 /* Lock the AG specific inode map information 905 /* Lock the AG specific inode map information
906 */ 906 */
907 AG_LOCK(imap, agno); 907 AG_LOCK(imap, agno);
908 908
909 /* Obtain read lock in imap inode. Don't release it until we have 909 /* Obtain read lock in imap inode. Don't release it until we have
910 * read all of the IAG's that we are going to. 910 * read all of the IAG's that we are going to.
911 */ 911 */
912 IREAD_LOCK(ipimap, RDWRLOCK_IMAP); 912 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
913 913
914 /* read the iag. 914 /* read the iag.
915 */ 915 */
916 if ((rc = diIAGRead(imap, iagno, &mp))) { 916 if ((rc = diIAGRead(imap, iagno, &mp))) {
917 IREAD_UNLOCK(ipimap); 917 IREAD_UNLOCK(ipimap);
918 AG_UNLOCK(imap, agno); 918 AG_UNLOCK(imap, agno);
919 return (rc); 919 return (rc);
920 } 920 }
921 iagp = (struct iag *) mp->data; 921 iagp = (struct iag *) mp->data;
922 922
923 /* get the inode number and extent number of the inode within 923 /* get the inode number and extent number of the inode within
924 * the iag and the inode number within the extent. 924 * the iag and the inode number within the extent.
925 */ 925 */
926 ino = inum & (INOSPERIAG - 1); 926 ino = inum & (INOSPERIAG - 1);
927 extno = ino >> L2INOSPEREXT; 927 extno = ino >> L2INOSPEREXT;
928 bitno = ino & (INOSPEREXT - 1); 928 bitno = ino & (INOSPEREXT - 1);
929 mask = HIGHORDER >> bitno; 929 mask = HIGHORDER >> bitno;
930 930
931 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 931 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
932 jfs_error(ip->i_sb, 932 jfs_error(ip->i_sb,
933 "diFree: wmap shows inode already free"); 933 "diFree: wmap shows inode already free");
934 } 934 }
935 935
936 if (!addressPXD(&iagp->inoext[extno])) { 936 if (!addressPXD(&iagp->inoext[extno])) {
937 release_metapage(mp); 937 release_metapage(mp);
938 IREAD_UNLOCK(ipimap); 938 IREAD_UNLOCK(ipimap);
939 AG_UNLOCK(imap, agno); 939 AG_UNLOCK(imap, agno);
940 jfs_error(ip->i_sb, "diFree: invalid inoext"); 940 jfs_error(ip->i_sb, "diFree: invalid inoext");
941 return -EIO; 941 return -EIO;
942 } 942 }
943 943
944 /* compute the bitmap for the extent reflecting the freed inode. 944 /* compute the bitmap for the extent reflecting the freed inode.
945 */ 945 */
946 bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask; 946 bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
947 947
948 if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) { 948 if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
949 release_metapage(mp); 949 release_metapage(mp);
950 IREAD_UNLOCK(ipimap); 950 IREAD_UNLOCK(ipimap);
951 AG_UNLOCK(imap, agno); 951 AG_UNLOCK(imap, agno);
952 jfs_error(ip->i_sb, "diFree: numfree > numinos"); 952 jfs_error(ip->i_sb, "diFree: numfree > numinos");
953 return -EIO; 953 return -EIO;
954 } 954 }
955 /* 955 /*
956 * inode extent still has some inodes or below low water mark: 956 * inode extent still has some inodes or below low water mark:
957 * keep the inode extent; 957 * keep the inode extent;
958 */ 958 */
959 if (bitmap || 959 if (bitmap ||
960 imap->im_agctl[agno].numfree < 96 || 960 imap->im_agctl[agno].numfree < 96 ||
961 (imap->im_agctl[agno].numfree < 288 && 961 (imap->im_agctl[agno].numfree < 288 &&
962 (((imap->im_agctl[agno].numfree * 100) / 962 (((imap->im_agctl[agno].numfree * 100) /
963 imap->im_agctl[agno].numinos) <= 25))) { 963 imap->im_agctl[agno].numinos) <= 25))) {
964 /* if the iag currently has no free inodes (i.e., 964 /* if the iag currently has no free inodes (i.e.,
965 * the inode being freed is the first free inode of iag), 965 * the inode being freed is the first free inode of iag),
966 * insert the iag at head of the inode free list for the ag. 966 * insert the iag at head of the inode free list for the ag.
967 */ 967 */
968 if (iagp->nfreeinos == 0) { 968 if (iagp->nfreeinos == 0) {
969 /* check if there are any iags on the ag inode 969 /* check if there are any iags on the ag inode
970 * free list. if so, read the first one so that 970 * free list. if so, read the first one so that
971 * we can link the current iag onto the list at 971 * we can link the current iag onto the list at
972 * the head. 972 * the head.
973 */ 973 */
974 if ((fwd = imap->im_agctl[agno].inofree) >= 0) { 974 if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
975 /* read the iag that currently is the head 975 /* read the iag that currently is the head
976 * of the list. 976 * of the list.
977 */ 977 */
978 if ((rc = diIAGRead(imap, fwd, &amp))) { 978 if ((rc = diIAGRead(imap, fwd, &amp))) {
979 IREAD_UNLOCK(ipimap); 979 IREAD_UNLOCK(ipimap);
980 AG_UNLOCK(imap, agno); 980 AG_UNLOCK(imap, agno);
981 release_metapage(mp); 981 release_metapage(mp);
982 return (rc); 982 return (rc);
983 } 983 }
984 aiagp = (struct iag *) amp->data; 984 aiagp = (struct iag *) amp->data;
985 985
986 /* make current head point back to the iag. 986 /* make current head point back to the iag.
987 */ 987 */
988 aiagp->inofreeback = cpu_to_le32(iagno); 988 aiagp->inofreeback = cpu_to_le32(iagno);
989 989
990 write_metapage(amp); 990 write_metapage(amp);
991 } 991 }
992 992
993 /* iag points forward to current head and iag 993 /* iag points forward to current head and iag
994 * becomes the new head of the list. 994 * becomes the new head of the list.
995 */ 995 */
996 iagp->inofreefwd = 996 iagp->inofreefwd =
997 cpu_to_le32(imap->im_agctl[agno].inofree); 997 cpu_to_le32(imap->im_agctl[agno].inofree);
998 iagp->inofreeback = cpu_to_le32(-1); 998 iagp->inofreeback = cpu_to_le32(-1);
999 imap->im_agctl[agno].inofree = iagno; 999 imap->im_agctl[agno].inofree = iagno;
1000 } 1000 }
1001 IREAD_UNLOCK(ipimap); 1001 IREAD_UNLOCK(ipimap);
1002 1002
1003 /* update the free inode summary map for the extent if 1003 /* update the free inode summary map for the extent if
1004 * freeing the inode means the extent will now have free 1004 * freeing the inode means the extent will now have free
1005 * inodes (i.e., the inode being freed is the first free 1005 * inodes (i.e., the inode being freed is the first free
1006 * inode of extent), 1006 * inode of extent),
1007 */ 1007 */
1008 if (iagp->wmap[extno] == cpu_to_le32(ONES)) { 1008 if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
1009 sword = extno >> L2EXTSPERSUM; 1009 sword = extno >> L2EXTSPERSUM;
1010 bitno = extno & (EXTSPERSUM - 1); 1010 bitno = extno & (EXTSPERSUM - 1);
1011 iagp->inosmap[sword] &= 1011 iagp->inosmap[sword] &=
1012 cpu_to_le32(~(HIGHORDER >> bitno)); 1012 cpu_to_le32(~(HIGHORDER >> bitno));
1013 } 1013 }
1014 1014
1015 /* update the bitmap. 1015 /* update the bitmap.
1016 */ 1016 */
1017 iagp->wmap[extno] = cpu_to_le32(bitmap); 1017 iagp->wmap[extno] = cpu_to_le32(bitmap);
1018 1018
1019 /* update the free inode counts at the iag, ag and 1019 /* update the free inode counts at the iag, ag and
1020 * map level. 1020 * map level.
1021 */ 1021 */
1022 iagp->nfreeinos = 1022 le32_add_cpu(&iagp->nfreeinos, 1);
1023 cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
1024 imap->im_agctl[agno].numfree += 1; 1023 imap->im_agctl[agno].numfree += 1;
1025 atomic_inc(&imap->im_numfree); 1024 atomic_inc(&imap->im_numfree);
1026 1025
1027 /* release the AG inode map lock 1026 /* release the AG inode map lock
1028 */ 1027 */
1029 AG_UNLOCK(imap, agno); 1028 AG_UNLOCK(imap, agno);
1030 1029
1031 /* write the iag */ 1030 /* write the iag */
1032 write_metapage(mp); 1031 write_metapage(mp);
1033 1032
1034 return (0); 1033 return (0);
1035 } 1034 }
1036 1035
1037 1036
1038 /* 1037 /*
1039 * inode extent has become free and above low water mark: 1038 * inode extent has become free and above low water mark:
1040 * free the inode extent; 1039 * free the inode extent;
1041 */ 1040 */
1042 1041
1043 /* 1042 /*
1044 * prepare to update iag list(s) (careful update step 1) 1043 * prepare to update iag list(s) (careful update step 1)
1045 */ 1044 */
1046 amp = bmp = cmp = dmp = NULL; 1045 amp = bmp = cmp = dmp = NULL;
1047 fwd = back = -1; 1046 fwd = back = -1;
1048 1047
1049 /* check if the iag currently has no free extents. if so, 1048 /* check if the iag currently has no free extents. if so,
1050 * it will be placed on the head of the ag extent free list. 1049 * it will be placed on the head of the ag extent free list.
1051 */ 1050 */
1052 if (iagp->nfreeexts == 0) { 1051 if (iagp->nfreeexts == 0) {
1053 /* check if the ag extent free list has any iags. 1052 /* check if the ag extent free list has any iags.
1054 * if so, read the iag at the head of the list now. 1053 * if so, read the iag at the head of the list now.
1055 * this (head) iag will be updated later to reflect 1054 * this (head) iag will be updated later to reflect
1056 * the addition of the current iag at the head of 1055 * the addition of the current iag at the head of
1057 * the list. 1056 * the list.
1058 */ 1057 */
1059 if ((fwd = imap->im_agctl[agno].extfree) >= 0) { 1058 if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
1060 if ((rc = diIAGRead(imap, fwd, &amp))) 1059 if ((rc = diIAGRead(imap, fwd, &amp)))
1061 goto error_out; 1060 goto error_out;
1062 aiagp = (struct iag *) amp->data; 1061 aiagp = (struct iag *) amp->data;
1063 } 1062 }
1064 } else { 1063 } else {
1065 /* iag has free extents. check if the addition of a free 1064 /* iag has free extents. check if the addition of a free
1066 * extent will cause all extents to be free within this 1065 * extent will cause all extents to be free within this
1067 * iag. if so, the iag will be removed from the ag extent 1066 * iag. if so, the iag will be removed from the ag extent
1068 * free list and placed on the inode map's free iag list. 1067 * free list and placed on the inode map's free iag list.
1069 */ 1068 */
1070 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { 1069 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1071 /* in preparation for removing the iag from the 1070 /* in preparation for removing the iag from the
1072 * ag extent free list, read the iags preceeding 1071 * ag extent free list, read the iags preceeding
1073 * and following the iag on the ag extent free 1072 * and following the iag on the ag extent free
1074 * list. 1073 * list.
1075 */ 1074 */
1076 if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { 1075 if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
1077 if ((rc = diIAGRead(imap, fwd, &amp))) 1076 if ((rc = diIAGRead(imap, fwd, &amp)))
1078 goto error_out; 1077 goto error_out;
1079 aiagp = (struct iag *) amp->data; 1078 aiagp = (struct iag *) amp->data;
1080 } 1079 }
1081 1080
1082 if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { 1081 if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
1083 if ((rc = diIAGRead(imap, back, &bmp))) 1082 if ((rc = diIAGRead(imap, back, &bmp)))
1084 goto error_out; 1083 goto error_out;
1085 biagp = (struct iag *) bmp->data; 1084 biagp = (struct iag *) bmp->data;
1086 } 1085 }
1087 } 1086 }
1088 } 1087 }
1089 1088
1090 /* remove the iag from the ag inode free list if freeing 1089 /* remove the iag from the ag inode free list if freeing
1091 * this extent cause the iag to have no free inodes. 1090 * this extent cause the iag to have no free inodes.
1092 */ 1091 */
1093 if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { 1092 if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
1094 int inofreeback = le32_to_cpu(iagp->inofreeback); 1093 int inofreeback = le32_to_cpu(iagp->inofreeback);
1095 int inofreefwd = le32_to_cpu(iagp->inofreefwd); 1094 int inofreefwd = le32_to_cpu(iagp->inofreefwd);
1096 1095
1097 /* in preparation for removing the iag from the 1096 /* in preparation for removing the iag from the
1098 * ag inode free list, read the iags preceeding 1097 * ag inode free list, read the iags preceeding
1099 * and following the iag on the ag inode free 1098 * and following the iag on the ag inode free
1100 * list. before reading these iags, we must make 1099 * list. before reading these iags, we must make
1101 * sure that we already don't have them in hand 1100 * sure that we already don't have them in hand
1102 * from up above, since re-reading an iag (buffer) 1101 * from up above, since re-reading an iag (buffer)
1103 * we are currently holding would cause a deadlock. 1102 * we are currently holding would cause a deadlock.
1104 */ 1103 */
1105 if (inofreefwd >= 0) { 1104 if (inofreefwd >= 0) {
1106 1105
1107 if (inofreefwd == fwd) 1106 if (inofreefwd == fwd)
1108 ciagp = (struct iag *) amp->data; 1107 ciagp = (struct iag *) amp->data;
1109 else if (inofreefwd == back) 1108 else if (inofreefwd == back)
1110 ciagp = (struct iag *) bmp->data; 1109 ciagp = (struct iag *) bmp->data;
1111 else { 1110 else {
1112 if ((rc = 1111 if ((rc =
1113 diIAGRead(imap, inofreefwd, &cmp))) 1112 diIAGRead(imap, inofreefwd, &cmp)))
1114 goto error_out; 1113 goto error_out;
1115 ciagp = (struct iag *) cmp->data; 1114 ciagp = (struct iag *) cmp->data;
1116 } 1115 }
1117 assert(ciagp != NULL); 1116 assert(ciagp != NULL);
1118 } 1117 }
1119 1118
1120 if (inofreeback >= 0) { 1119 if (inofreeback >= 0) {
1121 if (inofreeback == fwd) 1120 if (inofreeback == fwd)
1122 diagp = (struct iag *) amp->data; 1121 diagp = (struct iag *) amp->data;
1123 else if (inofreeback == back) 1122 else if (inofreeback == back)
1124 diagp = (struct iag *) bmp->data; 1123 diagp = (struct iag *) bmp->data;
1125 else { 1124 else {
1126 if ((rc = 1125 if ((rc =
1127 diIAGRead(imap, inofreeback, &dmp))) 1126 diIAGRead(imap, inofreeback, &dmp)))
1128 goto error_out; 1127 goto error_out;
1129 diagp = (struct iag *) dmp->data; 1128 diagp = (struct iag *) dmp->data;
1130 } 1129 }
1131 assert(diagp != NULL); 1130 assert(diagp != NULL);
1132 } 1131 }
1133 } 1132 }
1134 1133
1135 IREAD_UNLOCK(ipimap); 1134 IREAD_UNLOCK(ipimap);
1136 1135
1137 /* 1136 /*
1138 * invalidate any page of the inode extent freed from buffer cache; 1137 * invalidate any page of the inode extent freed from buffer cache;
1139 */ 1138 */
1140 freepxd = iagp->inoext[extno]; 1139 freepxd = iagp->inoext[extno];
1141 invalidate_pxd_metapages(ip, freepxd); 1140 invalidate_pxd_metapages(ip, freepxd);
1142 1141
1143 /* 1142 /*
1144 * update iag list(s) (careful update step 2) 1143 * update iag list(s) (careful update step 2)
1145 */ 1144 */
1146 /* add the iag to the ag extent free list if this is the 1145 /* add the iag to the ag extent free list if this is the
1147 * first free extent for the iag. 1146 * first free extent for the iag.
1148 */ 1147 */
1149 if (iagp->nfreeexts == 0) { 1148 if (iagp->nfreeexts == 0) {
1150 if (fwd >= 0) 1149 if (fwd >= 0)
1151 aiagp->extfreeback = cpu_to_le32(iagno); 1150 aiagp->extfreeback = cpu_to_le32(iagno);
1152 1151
1153 iagp->extfreefwd = 1152 iagp->extfreefwd =
1154 cpu_to_le32(imap->im_agctl[agno].extfree); 1153 cpu_to_le32(imap->im_agctl[agno].extfree);
1155 iagp->extfreeback = cpu_to_le32(-1); 1154 iagp->extfreeback = cpu_to_le32(-1);
1156 imap->im_agctl[agno].extfree = iagno; 1155 imap->im_agctl[agno].extfree = iagno;
1157 } else { 1156 } else {
1158 /* remove the iag from the ag extent list if all extents 1157 /* remove the iag from the ag extent list if all extents
1159 * are now free and place it on the inode map iag free list. 1158 * are now free and place it on the inode map iag free list.
1160 */ 1159 */
1161 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { 1160 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1162 if (fwd >= 0) 1161 if (fwd >= 0)
1163 aiagp->extfreeback = iagp->extfreeback; 1162 aiagp->extfreeback = iagp->extfreeback;
1164 1163
1165 if (back >= 0) 1164 if (back >= 0)
1166 biagp->extfreefwd = iagp->extfreefwd; 1165 biagp->extfreefwd = iagp->extfreefwd;
1167 else 1166 else
1168 imap->im_agctl[agno].extfree = 1167 imap->im_agctl[agno].extfree =
1169 le32_to_cpu(iagp->extfreefwd); 1168 le32_to_cpu(iagp->extfreefwd);
1170 1169
1171 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); 1170 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
1172 1171
1173 IAGFREE_LOCK(imap); 1172 IAGFREE_LOCK(imap);
1174 iagp->iagfree = cpu_to_le32(imap->im_freeiag); 1173 iagp->iagfree = cpu_to_le32(imap->im_freeiag);
1175 imap->im_freeiag = iagno; 1174 imap->im_freeiag = iagno;
1176 IAGFREE_UNLOCK(imap); 1175 IAGFREE_UNLOCK(imap);
1177 } 1176 }
1178 } 1177 }
1179 1178
1180 /* remove the iag from the ag inode free list if freeing 1179 /* remove the iag from the ag inode free list if freeing
1181 * this extent causes the iag to have no free inodes. 1180 * this extent causes the iag to have no free inodes.
1182 */ 1181 */
1183 if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { 1182 if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
1184 if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) 1183 if ((int) le32_to_cpu(iagp->inofreefwd) >= 0)
1185 ciagp->inofreeback = iagp->inofreeback; 1184 ciagp->inofreeback = iagp->inofreeback;
1186 1185
1187 if ((int) le32_to_cpu(iagp->inofreeback) >= 0) 1186 if ((int) le32_to_cpu(iagp->inofreeback) >= 0)
1188 diagp->inofreefwd = iagp->inofreefwd; 1187 diagp->inofreefwd = iagp->inofreefwd;
1189 else 1188 else
1190 imap->im_agctl[agno].inofree = 1189 imap->im_agctl[agno].inofree =
1191 le32_to_cpu(iagp->inofreefwd); 1190 le32_to_cpu(iagp->inofreefwd);
1192 1191
1193 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); 1192 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
1194 } 1193 }
1195 1194
1196 /* update the inode extent address and working map 1195 /* update the inode extent address and working map
1197 * to reflect the free extent. 1196 * to reflect the free extent.
1198 * the permanent map should have been updated already 1197 * the permanent map should have been updated already
1199 * for the inode being freed. 1198 * for the inode being freed.
1200 */ 1199 */
1201 if (iagp->pmap[extno] != 0) { 1200 if (iagp->pmap[extno] != 0) {
1202 jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); 1201 jfs_error(ip->i_sb, "diFree: the pmap does not show inode free");
1203 } 1202 }
1204 iagp->wmap[extno] = 0; 1203 iagp->wmap[extno] = 0;
1205 PXDlength(&iagp->inoext[extno], 0); 1204 PXDlength(&iagp->inoext[extno], 0);
1206 PXDaddress(&iagp->inoext[extno], 0); 1205 PXDaddress(&iagp->inoext[extno], 0);
1207 1206
1208 /* update the free extent and free inode summary maps 1207 /* update the free extent and free inode summary maps
1209 * to reflect the freed extent. 1208 * to reflect the freed extent.
1210 * the inode summary map is marked to indicate no inodes 1209 * the inode summary map is marked to indicate no inodes
1211 * available for the freed extent. 1210 * available for the freed extent.
1212 */ 1211 */
1213 sword = extno >> L2EXTSPERSUM; 1212 sword = extno >> L2EXTSPERSUM;
1214 bitno = extno & (EXTSPERSUM - 1); 1213 bitno = extno & (EXTSPERSUM - 1);
1215 mask = HIGHORDER >> bitno; 1214 mask = HIGHORDER >> bitno;
1216 iagp->inosmap[sword] |= cpu_to_le32(mask); 1215 iagp->inosmap[sword] |= cpu_to_le32(mask);
1217 iagp->extsmap[sword] &= cpu_to_le32(~mask); 1216 iagp->extsmap[sword] &= cpu_to_le32(~mask);
1218 1217
1219 /* update the number of free inodes and number of free extents 1218 /* update the number of free inodes and number of free extents
1220 * for the iag. 1219 * for the iag.
1221 */ 1220 */
1222 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1221 le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1));
1223 (INOSPEREXT - 1)); 1222 le32_add_cpu(&iagp->nfreeexts, 1);
1224 iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
1225 1223
1226 /* update the number of free inodes and backed inodes 1224 /* update the number of free inodes and backed inodes
1227 * at the ag and inode map level. 1225 * at the ag and inode map level.
1228 */ 1226 */
1229 imap->im_agctl[agno].numfree -= (INOSPEREXT - 1); 1227 imap->im_agctl[agno].numfree -= (INOSPEREXT - 1);
1230 imap->im_agctl[agno].numinos -= INOSPEREXT; 1228 imap->im_agctl[agno].numinos -= INOSPEREXT;
1231 atomic_sub(INOSPEREXT - 1, &imap->im_numfree); 1229 atomic_sub(INOSPEREXT - 1, &imap->im_numfree);
1232 atomic_sub(INOSPEREXT, &imap->im_numinos); 1230 atomic_sub(INOSPEREXT, &imap->im_numinos);
1233 1231
1234 if (amp) 1232 if (amp)
1235 write_metapage(amp); 1233 write_metapage(amp);
1236 if (bmp) 1234 if (bmp)
1237 write_metapage(bmp); 1235 write_metapage(bmp);
1238 if (cmp) 1236 if (cmp)
1239 write_metapage(cmp); 1237 write_metapage(cmp);
1240 if (dmp) 1238 if (dmp)
1241 write_metapage(dmp); 1239 write_metapage(dmp);
1242 1240
1243 /* 1241 /*
1244 * start transaction to update block allocation map 1242 * start transaction to update block allocation map
1245 * for the inode extent freed; 1243 * for the inode extent freed;
1246 * 1244 *
1247 * N.B. AG_LOCK is released and iag will be released below, and 1245 * N.B. AG_LOCK is released and iag will be released below, and
1248 * other thread may allocate inode from/reusing the ixad freed 1246 * other thread may allocate inode from/reusing the ixad freed
1249 * BUT with new/different backing inode extent from the extent 1247 * BUT with new/different backing inode extent from the extent
1250 * to be freed by the transaction; 1248 * to be freed by the transaction;
1251 */ 1249 */
1252 tid = txBegin(ipimap->i_sb, COMMIT_FORCE); 1250 tid = txBegin(ipimap->i_sb, COMMIT_FORCE);
1253 mutex_lock(&JFS_IP(ipimap)->commit_mutex); 1251 mutex_lock(&JFS_IP(ipimap)->commit_mutex);
1254 1252
1255 /* acquire tlock of the iag page of the freed ixad 1253 /* acquire tlock of the iag page of the freed ixad
1256 * to force the page NOHOMEOK (even though no data is 1254 * to force the page NOHOMEOK (even though no data is
1257 * logged from the iag page) until NOREDOPAGE|FREEXTENT log 1255 * logged from the iag page) until NOREDOPAGE|FREEXTENT log
1258 * for the free of the extent is committed; 1256 * for the free of the extent is committed;
1259 * write FREEXTENT|NOREDOPAGE log record 1257 * write FREEXTENT|NOREDOPAGE log record
1260 * N.B. linelock is overlaid as freed extent descriptor; 1258 * N.B. linelock is overlaid as freed extent descriptor;
1261 */ 1259 */
1262 tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE); 1260 tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE);
1263 pxdlock = (struct pxd_lock *) & tlck->lock; 1261 pxdlock = (struct pxd_lock *) & tlck->lock;
1264 pxdlock->flag = mlckFREEPXD; 1262 pxdlock->flag = mlckFREEPXD;
1265 pxdlock->pxd = freepxd; 1263 pxdlock->pxd = freepxd;
1266 pxdlock->index = 1; 1264 pxdlock->index = 1;
1267 1265
1268 write_metapage(mp); 1266 write_metapage(mp);
1269 1267
1270 iplist[0] = ipimap; 1268 iplist[0] = ipimap;
1271 1269
1272 /* 1270 /*
1273 * logredo needs the IAG number and IAG extent index in order 1271 * logredo needs the IAG number and IAG extent index in order
1274 * to ensure that the IMap is consistent. The least disruptive 1272 * to ensure that the IMap is consistent. The least disruptive
1275 * way to pass these values through to the transaction manager 1273 * way to pass these values through to the transaction manager
1276 * is in the iplist array. 1274 * is in the iplist array.
1277 * 1275 *
1278 * It's not pretty, but it works. 1276 * It's not pretty, but it works.
1279 */ 1277 */
1280 iplist[1] = (struct inode *) (size_t)iagno; 1278 iplist[1] = (struct inode *) (size_t)iagno;
1281 iplist[2] = (struct inode *) (size_t)extno; 1279 iplist[2] = (struct inode *) (size_t)extno;
1282 1280
1283 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); 1281 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
1284 1282
1285 txEnd(tid); 1283 txEnd(tid);
1286 mutex_unlock(&JFS_IP(ipimap)->commit_mutex); 1284 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
1287 1285
1288 /* unlock the AG inode map information */ 1286 /* unlock the AG inode map information */
1289 AG_UNLOCK(imap, agno); 1287 AG_UNLOCK(imap, agno);
1290 1288
1291 return (0); 1289 return (0);
1292 1290
1293 error_out: 1291 error_out:
1294 IREAD_UNLOCK(ipimap); 1292 IREAD_UNLOCK(ipimap);
1295 1293
1296 if (amp) 1294 if (amp)
1297 release_metapage(amp); 1295 release_metapage(amp);
1298 if (bmp) 1296 if (bmp)
1299 release_metapage(bmp); 1297 release_metapage(bmp);
1300 if (cmp) 1298 if (cmp)
1301 release_metapage(cmp); 1299 release_metapage(cmp);
1302 if (dmp) 1300 if (dmp)
1303 release_metapage(dmp); 1301 release_metapage(dmp);
1304 1302
1305 AG_UNLOCK(imap, agno); 1303 AG_UNLOCK(imap, agno);
1306 1304
1307 release_metapage(mp); 1305 release_metapage(mp);
1308 1306
1309 return (rc); 1307 return (rc);
1310 } 1308 }
1311 1309
1312 /* 1310 /*
1313 * There are several places in the diAlloc* routines where we initialize 1311 * There are several places in the diAlloc* routines where we initialize
1314 * the inode. 1312 * the inode.
1315 */ 1313 */
1316 static inline void 1314 static inline void
1317 diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) 1315 diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
1318 { 1316 {
1319 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); 1317 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
1320 struct jfs_inode_info *jfs_ip = JFS_IP(ip); 1318 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
1321 1319
1322 ip->i_ino = (iagno << L2INOSPERIAG) + ino; 1320 ip->i_ino = (iagno << L2INOSPERIAG) + ino;
1323 jfs_ip->ixpxd = iagp->inoext[extno]; 1321 jfs_ip->ixpxd = iagp->inoext[extno];
1324 jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); 1322 jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
1325 jfs_ip->active_ag = -1; 1323 jfs_ip->active_ag = -1;
1326 } 1324 }
1327 1325
1328 1326
1329 /* 1327 /*
1330 * NAME: diAlloc(pip,dir,ip) 1328 * NAME: diAlloc(pip,dir,ip)
1331 * 1329 *
1332 * FUNCTION: allocate a disk inode from the inode working map 1330 * FUNCTION: allocate a disk inode from the inode working map
1333 * for a fileset or aggregate. 1331 * for a fileset or aggregate.
1334 * 1332 *
1335 * PARAMETERS: 1333 * PARAMETERS:
1336 * pip - pointer to incore inode for the parent inode. 1334 * pip - pointer to incore inode for the parent inode.
1337 * dir - 'true' if the new disk inode is for a directory. 1335 * dir - 'true' if the new disk inode is for a directory.
1338 * ip - pointer to a new inode 1336 * ip - pointer to a new inode
1339 * 1337 *
1340 * RETURN VALUES: 1338 * RETURN VALUES:
1341 * 0 - success. 1339 * 0 - success.
1342 * -ENOSPC - insufficient disk resources. 1340 * -ENOSPC - insufficient disk resources.
1343 * -EIO - i/o error. 1341 * -EIO - i/o error.
1344 */ 1342 */
1345 int diAlloc(struct inode *pip, bool dir, struct inode *ip) 1343 int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1346 { 1344 {
1347 int rc, ino, iagno, addext, extno, bitno, sword; 1345 int rc, ino, iagno, addext, extno, bitno, sword;
1348 int nwords, rem, i, agno; 1346 int nwords, rem, i, agno;
1349 u32 mask, inosmap, extsmap; 1347 u32 mask, inosmap, extsmap;
1350 struct inode *ipimap; 1348 struct inode *ipimap;
1351 struct metapage *mp; 1349 struct metapage *mp;
1352 ino_t inum; 1350 ino_t inum;
1353 struct iag *iagp; 1351 struct iag *iagp;
1354 struct inomap *imap; 1352 struct inomap *imap;
1355 1353
1356 /* get the pointers to the inode map inode and the 1354 /* get the pointers to the inode map inode and the
1357 * corresponding imap control structure. 1355 * corresponding imap control structure.
1358 */ 1356 */
1359 ipimap = JFS_SBI(pip->i_sb)->ipimap; 1357 ipimap = JFS_SBI(pip->i_sb)->ipimap;
1360 imap = JFS_IP(ipimap)->i_imap; 1358 imap = JFS_IP(ipimap)->i_imap;
1361 JFS_IP(ip)->ipimap = ipimap; 1359 JFS_IP(ip)->ipimap = ipimap;
1362 JFS_IP(ip)->fileset = FILESYSTEM_I; 1360 JFS_IP(ip)->fileset = FILESYSTEM_I;
1363 1361
1364 /* for a directory, the allocation policy is to start 1362 /* for a directory, the allocation policy is to start
1365 * at the ag level using the preferred ag. 1363 * at the ag level using the preferred ag.
1366 */ 1364 */
1367 if (dir) { 1365 if (dir) {
1368 agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); 1366 agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
1369 AG_LOCK(imap, agno); 1367 AG_LOCK(imap, agno);
1370 goto tryag; 1368 goto tryag;
1371 } 1369 }
1372 1370
1373 /* for files, the policy starts off by trying to allocate from 1371 /* for files, the policy starts off by trying to allocate from
1374 * the same iag containing the parent disk inode: 1372 * the same iag containing the parent disk inode:
1375 * try to allocate the new disk inode close to the parent disk 1373 * try to allocate the new disk inode close to the parent disk
1376 * inode, using parent disk inode number + 1 as the allocation 1374 * inode, using parent disk inode number + 1 as the allocation
1377 * hint. (we use a left-to-right policy to attempt to avoid 1375 * hint. (we use a left-to-right policy to attempt to avoid
1378 * moving backward on the disk.) compute the hint within the 1376 * moving backward on the disk.) compute the hint within the
1379 * file system and the iag. 1377 * file system and the iag.
1380 */ 1378 */
1381 1379
1382 /* get the ag number of this iag */ 1380 /* get the ag number of this iag */
1383 agno = JFS_IP(pip)->agno; 1381 agno = JFS_IP(pip)->agno;
1384 1382
1385 if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { 1383 if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
1386 /* 1384 /*
1387 * There is an open file actively growing. We want to 1385 * There is an open file actively growing. We want to
1388 * allocate new inodes from a different ag to avoid 1386 * allocate new inodes from a different ag to avoid
1389 * fragmentation problems. 1387 * fragmentation problems.
1390 */ 1388 */
1391 agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); 1389 agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
1392 AG_LOCK(imap, agno); 1390 AG_LOCK(imap, agno);
1393 goto tryag; 1391 goto tryag;
1394 } 1392 }
1395 1393
1396 inum = pip->i_ino + 1; 1394 inum = pip->i_ino + 1;
1397 ino = inum & (INOSPERIAG - 1); 1395 ino = inum & (INOSPERIAG - 1);
1398 1396
1399 /* back off the hint if it is outside of the iag */ 1397 /* back off the hint if it is outside of the iag */
1400 if (ino == 0) 1398 if (ino == 0)
1401 inum = pip->i_ino; 1399 inum = pip->i_ino;
1402 1400
1403 /* lock the AG inode map information */ 1401 /* lock the AG inode map information */
1404 AG_LOCK(imap, agno); 1402 AG_LOCK(imap, agno);
1405 1403
1406 /* Get read lock on imap inode */ 1404 /* Get read lock on imap inode */
1407 IREAD_LOCK(ipimap, RDWRLOCK_IMAP); 1405 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
1408 1406
1409 /* get the iag number and read the iag */ 1407 /* get the iag number and read the iag */
1410 iagno = INOTOIAG(inum); 1408 iagno = INOTOIAG(inum);
1411 if ((rc = diIAGRead(imap, iagno, &mp))) { 1409 if ((rc = diIAGRead(imap, iagno, &mp))) {
1412 IREAD_UNLOCK(ipimap); 1410 IREAD_UNLOCK(ipimap);
1413 AG_UNLOCK(imap, agno); 1411 AG_UNLOCK(imap, agno);
1414 return (rc); 1412 return (rc);
1415 } 1413 }
1416 iagp = (struct iag *) mp->data; 1414 iagp = (struct iag *) mp->data;
1417 1415
1418 /* determine if new inode extent is allowed to be added to the iag. 1416 /* determine if new inode extent is allowed to be added to the iag.
1419 * new inode extent can be added to the iag if the ag 1417 * new inode extent can be added to the iag if the ag
1420 * has less than 32 free disk inodes and the iag has free extents. 1418 * has less than 32 free disk inodes and the iag has free extents.
1421 */ 1419 */
1422 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); 1420 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
1423 1421
1424 /* 1422 /*
1425 * try to allocate from the IAG 1423 * try to allocate from the IAG
1426 */ 1424 */
1427 /* check if the inode may be allocated from the iag 1425 /* check if the inode may be allocated from the iag
1428 * (i.e. the inode has free inodes or new extent can be added). 1426 * (i.e. the inode has free inodes or new extent can be added).
1429 */ 1427 */
1430 if (iagp->nfreeinos || addext) { 1428 if (iagp->nfreeinos || addext) {
1431 /* determine the extent number of the hint. 1429 /* determine the extent number of the hint.
1432 */ 1430 */
1433 extno = ino >> L2INOSPEREXT; 1431 extno = ino >> L2INOSPEREXT;
1434 1432
1435 /* check if the extent containing the hint has backed 1433 /* check if the extent containing the hint has backed
1436 * inodes. if so, try to allocate within this extent. 1434 * inodes. if so, try to allocate within this extent.
1437 */ 1435 */
1438 if (addressPXD(&iagp->inoext[extno])) { 1436 if (addressPXD(&iagp->inoext[extno])) {
1439 bitno = ino & (INOSPEREXT - 1); 1437 bitno = ino & (INOSPEREXT - 1);
1440 if ((bitno = 1438 if ((bitno =
1441 diFindFree(le32_to_cpu(iagp->wmap[extno]), 1439 diFindFree(le32_to_cpu(iagp->wmap[extno]),
1442 bitno)) 1440 bitno))
1443 < INOSPEREXT) { 1441 < INOSPEREXT) {
1444 ino = (extno << L2INOSPEREXT) + bitno; 1442 ino = (extno << L2INOSPEREXT) + bitno;
1445 1443
1446 /* a free inode (bit) was found within this 1444 /* a free inode (bit) was found within this
1447 * extent, so allocate it. 1445 * extent, so allocate it.
1448 */ 1446 */
1449 rc = diAllocBit(imap, iagp, ino); 1447 rc = diAllocBit(imap, iagp, ino);
1450 IREAD_UNLOCK(ipimap); 1448 IREAD_UNLOCK(ipimap);
1451 if (rc) { 1449 if (rc) {
1452 assert(rc == -EIO); 1450 assert(rc == -EIO);
1453 } else { 1451 } else {
1454 /* set the results of the allocation 1452 /* set the results of the allocation
1455 * and write the iag. 1453 * and write the iag.
1456 */ 1454 */
1457 diInitInode(ip, iagno, ino, extno, 1455 diInitInode(ip, iagno, ino, extno,
1458 iagp); 1456 iagp);
1459 mark_metapage_dirty(mp); 1457 mark_metapage_dirty(mp);
1460 } 1458 }
1461 release_metapage(mp); 1459 release_metapage(mp);
1462 1460
1463 /* free the AG lock and return. 1461 /* free the AG lock and return.
1464 */ 1462 */
1465 AG_UNLOCK(imap, agno); 1463 AG_UNLOCK(imap, agno);
1466 return (rc); 1464 return (rc);
1467 } 1465 }
1468 1466
1469 if (!addext) 1467 if (!addext)
1470 extno = 1468 extno =
1471 (extno == 1469 (extno ==
1472 EXTSPERIAG - 1) ? 0 : extno + 1; 1470 EXTSPERIAG - 1) ? 0 : extno + 1;
1473 } 1471 }
1474 1472
1475 /* 1473 /*
1476 * no free inodes within the extent containing the hint. 1474 * no free inodes within the extent containing the hint.
1477 * 1475 *
1478 * try to allocate from the backed extents following 1476 * try to allocate from the backed extents following
1479 * hint or, if appropriate (i.e. addext is true), allocate 1477 * hint or, if appropriate (i.e. addext is true), allocate
1480 * an extent of free inodes at or following the extent 1478 * an extent of free inodes at or following the extent
1481 * containing the hint. 1479 * containing the hint.
1482 * 1480 *
1483 * the free inode and free extent summary maps are used 1481 * the free inode and free extent summary maps are used
1484 * here, so determine the starting summary map position 1482 * here, so determine the starting summary map position
1485 * and the number of words we'll have to examine. again, 1483 * and the number of words we'll have to examine. again,
1486 * the approach is to allocate following the hint, so we 1484 * the approach is to allocate following the hint, so we
1487 * might have to initially ignore prior bits of the summary 1485 * might have to initially ignore prior bits of the summary
1488 * map that represent extents prior to the extent containing 1486 * map that represent extents prior to the extent containing
1489 * the hint and later revisit these bits. 1487 * the hint and later revisit these bits.
1490 */ 1488 */
1491 bitno = extno & (EXTSPERSUM - 1); 1489 bitno = extno & (EXTSPERSUM - 1);
1492 nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1; 1490 nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1;
1493 sword = extno >> L2EXTSPERSUM; 1491 sword = extno >> L2EXTSPERSUM;
1494 1492
1495 /* mask any prior bits for the starting words of the 1493 /* mask any prior bits for the starting words of the
1496 * summary map. 1494 * summary map.
1497 */ 1495 */
1498 mask = ONES << (EXTSPERSUM - bitno); 1496 mask = ONES << (EXTSPERSUM - bitno);
1499 inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; 1497 inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
1500 extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; 1498 extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
1501 1499
1502 /* scan the free inode and free extent summary maps for 1500 /* scan the free inode and free extent summary maps for
1503 * free resources. 1501 * free resources.
1504 */ 1502 */
1505 for (i = 0; i < nwords; i++) { 1503 for (i = 0; i < nwords; i++) {
1506 /* check if this word of the free inode summary 1504 /* check if this word of the free inode summary
1507 * map describes an extent with free inodes. 1505 * map describes an extent with free inodes.
1508 */ 1506 */
1509 if (~inosmap) { 1507 if (~inosmap) {
1510 /* an extent with free inodes has been 1508 /* an extent with free inodes has been
1511 * found. determine the extent number 1509 * found. determine the extent number
1512 * and the inode number within the extent. 1510 * and the inode number within the extent.
1513 */ 1511 */
1514 rem = diFindFree(inosmap, 0); 1512 rem = diFindFree(inosmap, 0);
1515 extno = (sword << L2EXTSPERSUM) + rem; 1513 extno = (sword << L2EXTSPERSUM) + rem;
1516 rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 1514 rem = diFindFree(le32_to_cpu(iagp->wmap[extno]),
1517 0); 1515 0);
1518 if (rem >= INOSPEREXT) { 1516 if (rem >= INOSPEREXT) {
1519 IREAD_UNLOCK(ipimap); 1517 IREAD_UNLOCK(ipimap);
1520 release_metapage(mp); 1518 release_metapage(mp);
1521 AG_UNLOCK(imap, agno); 1519 AG_UNLOCK(imap, agno);
1522 jfs_error(ip->i_sb, 1520 jfs_error(ip->i_sb,
1523 "diAlloc: can't find free bit " 1521 "diAlloc: can't find free bit "
1524 "in wmap"); 1522 "in wmap");
1525 return EIO; 1523 return EIO;
1526 } 1524 }
1527 1525
1528 /* determine the inode number within the 1526 /* determine the inode number within the
1529 * iag and allocate the inode from the 1527 * iag and allocate the inode from the
1530 * map. 1528 * map.
1531 */ 1529 */
1532 ino = (extno << L2INOSPEREXT) + rem; 1530 ino = (extno << L2INOSPEREXT) + rem;
1533 rc = diAllocBit(imap, iagp, ino); 1531 rc = diAllocBit(imap, iagp, ino);
1534 IREAD_UNLOCK(ipimap); 1532 IREAD_UNLOCK(ipimap);
1535 if (rc) 1533 if (rc)
1536 assert(rc == -EIO); 1534 assert(rc == -EIO);
1537 else { 1535 else {
1538 /* set the results of the allocation 1536 /* set the results of the allocation
1539 * and write the iag. 1537 * and write the iag.
1540 */ 1538 */
1541 diInitInode(ip, iagno, ino, extno, 1539 diInitInode(ip, iagno, ino, extno,
1542 iagp); 1540 iagp);
1543 mark_metapage_dirty(mp); 1541 mark_metapage_dirty(mp);
1544 } 1542 }
1545 release_metapage(mp); 1543 release_metapage(mp);
1546 1544
1547 /* free the AG lock and return. 1545 /* free the AG lock and return.
1548 */ 1546 */
1549 AG_UNLOCK(imap, agno); 1547 AG_UNLOCK(imap, agno);
1550 return (rc); 1548 return (rc);
1551 1549
1552 } 1550 }
1553 1551
1554 /* check if we may allocate an extent of free 1552 /* check if we may allocate an extent of free
1555 * inodes and whether this word of the free 1553 * inodes and whether this word of the free
1556 * extents summary map describes a free extent. 1554 * extents summary map describes a free extent.
1557 */ 1555 */
1558 if (addext && ~extsmap) { 1556 if (addext && ~extsmap) {
1559 /* a free extent has been found. determine 1557 /* a free extent has been found. determine
1560 * the extent number. 1558 * the extent number.
1561 */ 1559 */
1562 rem = diFindFree(extsmap, 0); 1560 rem = diFindFree(extsmap, 0);
1563 extno = (sword << L2EXTSPERSUM) + rem; 1561 extno = (sword << L2EXTSPERSUM) + rem;
1564 1562
1565 /* allocate an extent of free inodes. 1563 /* allocate an extent of free inodes.
1566 */ 1564 */
1567 if ((rc = diNewExt(imap, iagp, extno))) { 1565 if ((rc = diNewExt(imap, iagp, extno))) {
1568 /* if there is no disk space for a 1566 /* if there is no disk space for a
1569 * new extent, try to allocate the 1567 * new extent, try to allocate the
1570 * disk inode from somewhere else. 1568 * disk inode from somewhere else.
1571 */ 1569 */
1572 if (rc == -ENOSPC) 1570 if (rc == -ENOSPC)
1573 break; 1571 break;
1574 1572
1575 assert(rc == -EIO); 1573 assert(rc == -EIO);
1576 } else { 1574 } else {
1577 /* set the results of the allocation 1575 /* set the results of the allocation
1578 * and write the iag. 1576 * and write the iag.
1579 */ 1577 */
1580 diInitInode(ip, iagno, 1578 diInitInode(ip, iagno,
1581 extno << L2INOSPEREXT, 1579 extno << L2INOSPEREXT,
1582 extno, iagp); 1580 extno, iagp);
1583 mark_metapage_dirty(mp); 1581 mark_metapage_dirty(mp);
1584 } 1582 }
1585 release_metapage(mp); 1583 release_metapage(mp);
1586 /* free the imap inode & the AG lock & return. 1584 /* free the imap inode & the AG lock & return.
1587 */ 1585 */
1588 IREAD_UNLOCK(ipimap); 1586 IREAD_UNLOCK(ipimap);
1589 AG_UNLOCK(imap, agno); 1587 AG_UNLOCK(imap, agno);
1590 return (rc); 1588 return (rc);
1591 } 1589 }
1592 1590
1593 /* move on to the next set of summary map words. 1591 /* move on to the next set of summary map words.
1594 */ 1592 */
1595 sword = (sword == SMAPSZ - 1) ? 0 : sword + 1; 1593 sword = (sword == SMAPSZ - 1) ? 0 : sword + 1;
1596 inosmap = le32_to_cpu(iagp->inosmap[sword]); 1594 inosmap = le32_to_cpu(iagp->inosmap[sword]);
1597 extsmap = le32_to_cpu(iagp->extsmap[sword]); 1595 extsmap = le32_to_cpu(iagp->extsmap[sword]);
1598 } 1596 }
1599 } 1597 }
1600 /* unlock imap inode */ 1598 /* unlock imap inode */
1601 IREAD_UNLOCK(ipimap); 1599 IREAD_UNLOCK(ipimap);
1602 1600
1603 /* nothing doing in this iag, so release it. */ 1601 /* nothing doing in this iag, so release it. */
1604 release_metapage(mp); 1602 release_metapage(mp);
1605 1603
1606 tryag: 1604 tryag:
1607 /* 1605 /*
1608 * try to allocate anywhere within the same AG as the parent inode. 1606 * try to allocate anywhere within the same AG as the parent inode.
1609 */ 1607 */
1610 rc = diAllocAG(imap, agno, dir, ip); 1608 rc = diAllocAG(imap, agno, dir, ip);
1611 1609
1612 AG_UNLOCK(imap, agno); 1610 AG_UNLOCK(imap, agno);
1613 1611
1614 if (rc != -ENOSPC) 1612 if (rc != -ENOSPC)
1615 return (rc); 1613 return (rc);
1616 1614
1617 /* 1615 /*
1618 * try to allocate in any AG. 1616 * try to allocate in any AG.
1619 */ 1617 */
1620 return (diAllocAny(imap, agno, dir, ip)); 1618 return (diAllocAny(imap, agno, dir, ip));
1621 } 1619 }
1622 1620
1623 1621
1624 /* 1622 /*
1625 * NAME: diAllocAG(imap,agno,dir,ip) 1623 * NAME: diAllocAG(imap,agno,dir,ip)
1626 * 1624 *
1627 * FUNCTION: allocate a disk inode from the allocation group. 1625 * FUNCTION: allocate a disk inode from the allocation group.
1628 * 1626 *
1629 * this routine first determines if a new extent of free 1627 * this routine first determines if a new extent of free
1630 * inodes should be added for the allocation group, with 1628 * inodes should be added for the allocation group, with
1631 * the current request satisfied from this extent. if this 1629 * the current request satisfied from this extent. if this
1632 * is the case, an attempt will be made to do just that. if 1630 * is the case, an attempt will be made to do just that. if
1633 * this attempt fails or it has been determined that a new 1631 * this attempt fails or it has been determined that a new
1634 * extent should not be added, an attempt is made to satisfy 1632 * extent should not be added, an attempt is made to satisfy
1635 * the request by allocating an existing (backed) free inode 1633 * the request by allocating an existing (backed) free inode
1636 * from the allocation group. 1634 * from the allocation group.
1637 * 1635 *
1638 * PRE CONDITION: Already have the AG lock for this AG. 1636 * PRE CONDITION: Already have the AG lock for this AG.
1639 * 1637 *
1640 * PARAMETERS: 1638 * PARAMETERS:
1641 * imap - pointer to inode map control structure. 1639 * imap - pointer to inode map control structure.
1642 * agno - allocation group to allocate from. 1640 * agno - allocation group to allocate from.
1643 * dir - 'true' if the new disk inode is for a directory. 1641 * dir - 'true' if the new disk inode is for a directory.
1644 * ip - pointer to the new inode to be filled in on successful return 1642 * ip - pointer to the new inode to be filled in on successful return
1645 * with the disk inode number allocated, its extent address 1643 * with the disk inode number allocated, its extent address
1646 * and the start of the ag. 1644 * and the start of the ag.
1647 * 1645 *
1648 * RETURN VALUES: 1646 * RETURN VALUES:
1649 * 0 - success. 1647 * 0 - success.
1650 * -ENOSPC - insufficient disk resources. 1648 * -ENOSPC - insufficient disk resources.
1651 * -EIO - i/o error. 1649 * -EIO - i/o error.
1652 */ 1650 */
1653 static int 1651 static int
1654 diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) 1652 diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1655 { 1653 {
1656 int rc, addext, numfree, numinos; 1654 int rc, addext, numfree, numinos;
1657 1655
1658 /* get the number of free and the number of backed disk 1656 /* get the number of free and the number of backed disk
1659 * inodes currently within the ag. 1657 * inodes currently within the ag.
1660 */ 1658 */
1661 numfree = imap->im_agctl[agno].numfree; 1659 numfree = imap->im_agctl[agno].numfree;
1662 numinos = imap->im_agctl[agno].numinos; 1660 numinos = imap->im_agctl[agno].numinos;
1663 1661
1664 if (numfree > numinos) { 1662 if (numfree > numinos) {
1665 jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); 1663 jfs_error(ip->i_sb, "diAllocAG: numfree > numinos");
1666 return -EIO; 1664 return -EIO;
1667 } 1665 }
1668 1666
1669 /* determine if we should allocate a new extent of free inodes 1667 /* determine if we should allocate a new extent of free inodes
1670 * within the ag: for directory inodes, add a new extent 1668 * within the ag: for directory inodes, add a new extent
1671 * if there are a small number of free inodes or number of free 1669 * if there are a small number of free inodes or number of free
1672 * inodes is a small percentage of the number of backed inodes. 1670 * inodes is a small percentage of the number of backed inodes.
1673 */ 1671 */
1674 if (dir) 1672 if (dir)
1675 addext = (numfree < 64 || 1673 addext = (numfree < 64 ||
1676 (numfree < 256 1674 (numfree < 256
1677 && ((numfree * 100) / numinos) <= 20)); 1675 && ((numfree * 100) / numinos) <= 20));
1678 else 1676 else
1679 addext = (numfree == 0); 1677 addext = (numfree == 0);
1680 1678
1681 /* 1679 /*
1682 * try to allocate a new extent of free inodes. 1680 * try to allocate a new extent of free inodes.
1683 */ 1681 */
1684 if (addext) { 1682 if (addext) {
1685 /* if free space is not avaliable for this new extent, try 1683 /* if free space is not avaliable for this new extent, try
1686 * below to allocate a free and existing (already backed) 1684 * below to allocate a free and existing (already backed)
1687 * inode from the ag. 1685 * inode from the ag.
1688 */ 1686 */
1689 if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC) 1687 if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC)
1690 return (rc); 1688 return (rc);
1691 } 1689 }
1692 1690
1693 /* 1691 /*
1694 * try to allocate an existing free inode from the ag. 1692 * try to allocate an existing free inode from the ag.
1695 */ 1693 */
1696 return (diAllocIno(imap, agno, ip)); 1694 return (diAllocIno(imap, agno, ip));
1697 } 1695 }
1698 1696
1699 1697
1700 /* 1698 /*
1701 * NAME: diAllocAny(imap,agno,dir,iap) 1699 * NAME: diAllocAny(imap,agno,dir,iap)
1702 * 1700 *
1703 * FUNCTION: allocate a disk inode from any other allocation group. 1701 * FUNCTION: allocate a disk inode from any other allocation group.
1704 * 1702 *
1705 * this routine is called when an allocation attempt within 1703 * this routine is called when an allocation attempt within
1706 * the primary allocation group has failed. if attempts to 1704 * the primary allocation group has failed. if attempts to
1707 * allocate an inode from any allocation group other than the 1705 * allocate an inode from any allocation group other than the
1708 * specified primary group. 1706 * specified primary group.
1709 * 1707 *
1710 * PARAMETERS: 1708 * PARAMETERS:
1711 * imap - pointer to inode map control structure. 1709 * imap - pointer to inode map control structure.
1712 * agno - primary allocation group (to avoid). 1710 * agno - primary allocation group (to avoid).
1713 * dir - 'true' if the new disk inode is for a directory. 1711 * dir - 'true' if the new disk inode is for a directory.
1714 * ip - pointer to a new inode to be filled in on successful return 1712 * ip - pointer to a new inode to be filled in on successful return
1715 * with the disk inode number allocated, its extent address 1713 * with the disk inode number allocated, its extent address
1716 * and the start of the ag. 1714 * and the start of the ag.
1717 * 1715 *
1718 * RETURN VALUES: 1716 * RETURN VALUES:
1719 * 0 - success. 1717 * 0 - success.
1720 * -ENOSPC - insufficient disk resources. 1718 * -ENOSPC - insufficient disk resources.
1721 * -EIO - i/o error. 1719 * -EIO - i/o error.
1722 */ 1720 */
1723 static int 1721 static int
1724 diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) 1722 diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
1725 { 1723 {
1726 int ag, rc; 1724 int ag, rc;
1727 int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; 1725 int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag;
1728 1726
1729 1727
1730 /* try to allocate from the ags following agno up to 1728 /* try to allocate from the ags following agno up to
1731 * the maximum ag number. 1729 * the maximum ag number.
1732 */ 1730 */
1733 for (ag = agno + 1; ag <= maxag; ag++) { 1731 for (ag = agno + 1; ag <= maxag; ag++) {
1734 AG_LOCK(imap, ag); 1732 AG_LOCK(imap, ag);
1735 1733
1736 rc = diAllocAG(imap, ag, dir, ip); 1734 rc = diAllocAG(imap, ag, dir, ip);
1737 1735
1738 AG_UNLOCK(imap, ag); 1736 AG_UNLOCK(imap, ag);
1739 1737
1740 if (rc != -ENOSPC) 1738 if (rc != -ENOSPC)
1741 return (rc); 1739 return (rc);
1742 } 1740 }
1743 1741
1744 /* try to allocate from the ags in front of agno. 1742 /* try to allocate from the ags in front of agno.
1745 */ 1743 */
1746 for (ag = 0; ag < agno; ag++) { 1744 for (ag = 0; ag < agno; ag++) {
1747 AG_LOCK(imap, ag); 1745 AG_LOCK(imap, ag);
1748 1746
1749 rc = diAllocAG(imap, ag, dir, ip); 1747 rc = diAllocAG(imap, ag, dir, ip);
1750 1748
1751 AG_UNLOCK(imap, ag); 1749 AG_UNLOCK(imap, ag);
1752 1750
1753 if (rc != -ENOSPC) 1751 if (rc != -ENOSPC)
1754 return (rc); 1752 return (rc);
1755 } 1753 }
1756 1754
1757 /* no free disk inodes. 1755 /* no free disk inodes.
1758 */ 1756 */
1759 return -ENOSPC; 1757 return -ENOSPC;
1760 } 1758 }
1761 1759
1762 1760
1763 /* 1761 /*
1764 * NAME: diAllocIno(imap,agno,ip) 1762 * NAME: diAllocIno(imap,agno,ip)
1765 * 1763 *
1766 * FUNCTION: allocate a disk inode from the allocation group's free 1764 * FUNCTION: allocate a disk inode from the allocation group's free
1767 * inode list, returning an error if this free list is 1765 * inode list, returning an error if this free list is
1768 * empty (i.e. no iags on the list). 1766 * empty (i.e. no iags on the list).
1769 * 1767 *
1770 * allocation occurs from the first iag on the list using 1768 * allocation occurs from the first iag on the list using
1771 * the iag's free inode summary map to find the leftmost 1769 * the iag's free inode summary map to find the leftmost
1772 * free inode in the iag. 1770 * free inode in the iag.
1773 * 1771 *
1774 * PRE CONDITION: Already have AG lock for this AG. 1772 * PRE CONDITION: Already have AG lock for this AG.
1775 * 1773 *
1776 * PARAMETERS: 1774 * PARAMETERS:
1777 * imap - pointer to inode map control structure. 1775 * imap - pointer to inode map control structure.
1778 * agno - allocation group. 1776 * agno - allocation group.
1779 * ip - pointer to new inode to be filled in on successful return 1777 * ip - pointer to new inode to be filled in on successful return
1780 * with the disk inode number allocated, its extent address 1778 * with the disk inode number allocated, its extent address
1781 * and the start of the ag. 1779 * and the start of the ag.
1782 * 1780 *
1783 * RETURN VALUES: 1781 * RETURN VALUES:
1784 * 0 - success. 1782 * 0 - success.
1785 * -ENOSPC - insufficient disk resources. 1783 * -ENOSPC - insufficient disk resources.
1786 * -EIO - i/o error. 1784 * -EIO - i/o error.
1787 */ 1785 */
1788 static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) 1786 static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1789 { 1787 {
1790 int iagno, ino, rc, rem, extno, sword; 1788 int iagno, ino, rc, rem, extno, sword;
1791 struct metapage *mp; 1789 struct metapage *mp;
1792 struct iag *iagp; 1790 struct iag *iagp;
1793 1791
1794 /* check if there are iags on the ag's free inode list. 1792 /* check if there are iags on the ag's free inode list.
1795 */ 1793 */
1796 if ((iagno = imap->im_agctl[agno].inofree) < 0) 1794 if ((iagno = imap->im_agctl[agno].inofree) < 0)
1797 return -ENOSPC; 1795 return -ENOSPC;
1798 1796
1799 /* obtain read lock on imap inode */ 1797 /* obtain read lock on imap inode */
1800 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); 1798 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1801 1799
1802 /* read the iag at the head of the list. 1800 /* read the iag at the head of the list.
1803 */ 1801 */
1804 if ((rc = diIAGRead(imap, iagno, &mp))) { 1802 if ((rc = diIAGRead(imap, iagno, &mp))) {
1805 IREAD_UNLOCK(imap->im_ipimap); 1803 IREAD_UNLOCK(imap->im_ipimap);
1806 return (rc); 1804 return (rc);
1807 } 1805 }
1808 iagp = (struct iag *) mp->data; 1806 iagp = (struct iag *) mp->data;
1809 1807
1810 /* better be free inodes in this iag if it is on the 1808 /* better be free inodes in this iag if it is on the
1811 * list. 1809 * list.
1812 */ 1810 */
1813 if (!iagp->nfreeinos) { 1811 if (!iagp->nfreeinos) {
1814 IREAD_UNLOCK(imap->im_ipimap); 1812 IREAD_UNLOCK(imap->im_ipimap);
1815 release_metapage(mp); 1813 release_metapage(mp);
1816 jfs_error(ip->i_sb, 1814 jfs_error(ip->i_sb,
1817 "diAllocIno: nfreeinos = 0, but iag on freelist"); 1815 "diAllocIno: nfreeinos = 0, but iag on freelist");
1818 return -EIO; 1816 return -EIO;
1819 } 1817 }
1820 1818
1821 /* scan the free inode summary map to find an extent 1819 /* scan the free inode summary map to find an extent
1822 * with free inodes. 1820 * with free inodes.
1823 */ 1821 */
1824 for (sword = 0;; sword++) { 1822 for (sword = 0;; sword++) {
1825 if (sword >= SMAPSZ) { 1823 if (sword >= SMAPSZ) {
1826 IREAD_UNLOCK(imap->im_ipimap); 1824 IREAD_UNLOCK(imap->im_ipimap);
1827 release_metapage(mp); 1825 release_metapage(mp);
1828 jfs_error(ip->i_sb, 1826 jfs_error(ip->i_sb,
1829 "diAllocIno: free inode not found in summary map"); 1827 "diAllocIno: free inode not found in summary map");
1830 return -EIO; 1828 return -EIO;
1831 } 1829 }
1832 1830
1833 if (~iagp->inosmap[sword]) 1831 if (~iagp->inosmap[sword])
1834 break; 1832 break;
1835 } 1833 }
1836 1834
1837 /* found a extent with free inodes. determine 1835 /* found a extent with free inodes. determine
1838 * the extent number. 1836 * the extent number.
1839 */ 1837 */
1840 rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); 1838 rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0);
1841 if (rem >= EXTSPERSUM) { 1839 if (rem >= EXTSPERSUM) {
1842 IREAD_UNLOCK(imap->im_ipimap); 1840 IREAD_UNLOCK(imap->im_ipimap);
1843 release_metapage(mp); 1841 release_metapage(mp);
1844 jfs_error(ip->i_sb, "diAllocIno: no free extent found"); 1842 jfs_error(ip->i_sb, "diAllocIno: no free extent found");
1845 return -EIO; 1843 return -EIO;
1846 } 1844 }
1847 extno = (sword << L2EXTSPERSUM) + rem; 1845 extno = (sword << L2EXTSPERSUM) + rem;
1848 1846
1849 /* find the first free inode in the extent. 1847 /* find the first free inode in the extent.
1850 */ 1848 */
1851 rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); 1849 rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0);
1852 if (rem >= INOSPEREXT) { 1850 if (rem >= INOSPEREXT) {
1853 IREAD_UNLOCK(imap->im_ipimap); 1851 IREAD_UNLOCK(imap->im_ipimap);
1854 release_metapage(mp); 1852 release_metapage(mp);
1855 jfs_error(ip->i_sb, "diAllocIno: free inode not found"); 1853 jfs_error(ip->i_sb, "diAllocIno: free inode not found");
1856 return -EIO; 1854 return -EIO;
1857 } 1855 }
1858 1856
1859 /* compute the inode number within the iag. 1857 /* compute the inode number within the iag.
1860 */ 1858 */
1861 ino = (extno << L2INOSPEREXT) + rem; 1859 ino = (extno << L2INOSPEREXT) + rem;
1862 1860
1863 /* allocate the inode. 1861 /* allocate the inode.
1864 */ 1862 */
1865 rc = diAllocBit(imap, iagp, ino); 1863 rc = diAllocBit(imap, iagp, ino);
1866 IREAD_UNLOCK(imap->im_ipimap); 1864 IREAD_UNLOCK(imap->im_ipimap);
1867 if (rc) { 1865 if (rc) {
1868 release_metapage(mp); 1866 release_metapage(mp);
1869 return (rc); 1867 return (rc);
1870 } 1868 }
1871 1869
1872 /* set the results of the allocation and write the iag. 1870 /* set the results of the allocation and write the iag.
1873 */ 1871 */
1874 diInitInode(ip, iagno, ino, extno, iagp); 1872 diInitInode(ip, iagno, ino, extno, iagp);
1875 write_metapage(mp); 1873 write_metapage(mp);
1876 1874
1877 return (0); 1875 return (0);
1878 } 1876 }
1879 1877
1880 1878
1881 /* 1879 /*
1882 * NAME: diAllocExt(imap,agno,ip) 1880 * NAME: diAllocExt(imap,agno,ip)
1883 * 1881 *
1884 * FUNCTION: add a new extent of free inodes to an iag, allocating 1882 * FUNCTION: add a new extent of free inodes to an iag, allocating
1885 * an inode from this extent to satisfy the current allocation 1883 * an inode from this extent to satisfy the current allocation
1886 * request. 1884 * request.
1887 * 1885 *
1888 * this routine first tries to find an existing iag with free 1886 * this routine first tries to find an existing iag with free
1889 * extents through the ag free extent list. if list is not 1887 * extents through the ag free extent list. if list is not
1890 * empty, the head of the list will be selected as the home 1888 * empty, the head of the list will be selected as the home
1891 * of the new extent of free inodes. otherwise (the list is 1889 * of the new extent of free inodes. otherwise (the list is
1892 * empty), a new iag will be allocated for the ag to contain 1890 * empty), a new iag will be allocated for the ag to contain
1893 * the extent. 1891 * the extent.
1894 * 1892 *
1895 * once an iag has been selected, the free extent summary map 1893 * once an iag has been selected, the free extent summary map
1896 * is used to locate a free extent within the iag and diNewExt() 1894 * is used to locate a free extent within the iag and diNewExt()
1897 * is called to initialize the extent, with initialization 1895 * is called to initialize the extent, with initialization
1898 * including the allocation of the first inode of the extent 1896 * including the allocation of the first inode of the extent
1899 * for the purpose of satisfying this request. 1897 * for the purpose of satisfying this request.
1900 * 1898 *
1901 * PARAMETERS: 1899 * PARAMETERS:
1902 * imap - pointer to inode map control structure. 1900 * imap - pointer to inode map control structure.
1903 * agno - allocation group number. 1901 * agno - allocation group number.
1904 * ip - pointer to new inode to be filled in on successful return 1902 * ip - pointer to new inode to be filled in on successful return
1905 * with the disk inode number allocated, its extent address 1903 * with the disk inode number allocated, its extent address
1906 * and the start of the ag. 1904 * and the start of the ag.
1907 * 1905 *
1908 * RETURN VALUES: 1906 * RETURN VALUES:
1909 * 0 - success. 1907 * 0 - success.
1910 * -ENOSPC - insufficient disk resources. 1908 * -ENOSPC - insufficient disk resources.
1911 * -EIO - i/o error. 1909 * -EIO - i/o error.
1912 */ 1910 */
1913 static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) 1911 static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1914 { 1912 {
1915 int rem, iagno, sword, extno, rc; 1913 int rem, iagno, sword, extno, rc;
1916 struct metapage *mp; 1914 struct metapage *mp;
1917 struct iag *iagp; 1915 struct iag *iagp;
1918 1916
1919 /* check if the ag has any iags with free extents. if not, 1917 /* check if the ag has any iags with free extents. if not,
1920 * allocate a new iag for the ag. 1918 * allocate a new iag for the ag.
1921 */ 1919 */
1922 if ((iagno = imap->im_agctl[agno].extfree) < 0) { 1920 if ((iagno = imap->im_agctl[agno].extfree) < 0) {
1923 /* If successful, diNewIAG will obtain the read lock on the 1921 /* If successful, diNewIAG will obtain the read lock on the
1924 * imap inode. 1922 * imap inode.
1925 */ 1923 */
1926 if ((rc = diNewIAG(imap, &iagno, agno, &mp))) { 1924 if ((rc = diNewIAG(imap, &iagno, agno, &mp))) {
1927 return (rc); 1925 return (rc);
1928 } 1926 }
1929 iagp = (struct iag *) mp->data; 1927 iagp = (struct iag *) mp->data;
1930 1928
1931 /* set the ag number if this a brand new iag 1929 /* set the ag number if this a brand new iag
1932 */ 1930 */
1933 iagp->agstart = 1931 iagp->agstart =
1934 cpu_to_le64(AGTOBLK(agno, imap->im_ipimap)); 1932 cpu_to_le64(AGTOBLK(agno, imap->im_ipimap));
1935 } else { 1933 } else {
1936 /* read the iag. 1934 /* read the iag.
1937 */ 1935 */
1938 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); 1936 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1939 if ((rc = diIAGRead(imap, iagno, &mp))) { 1937 if ((rc = diIAGRead(imap, iagno, &mp))) {
1940 IREAD_UNLOCK(imap->im_ipimap); 1938 IREAD_UNLOCK(imap->im_ipimap);
1941 jfs_error(ip->i_sb, "diAllocExt: error reading iag"); 1939 jfs_error(ip->i_sb, "diAllocExt: error reading iag");
1942 return rc; 1940 return rc;
1943 } 1941 }
1944 iagp = (struct iag *) mp->data; 1942 iagp = (struct iag *) mp->data;
1945 } 1943 }
1946 1944
1947 /* using the free extent summary map, find a free extent. 1945 /* using the free extent summary map, find a free extent.
1948 */ 1946 */
1949 for (sword = 0;; sword++) { 1947 for (sword = 0;; sword++) {
1950 if (sword >= SMAPSZ) { 1948 if (sword >= SMAPSZ) {
1951 release_metapage(mp); 1949 release_metapage(mp);
1952 IREAD_UNLOCK(imap->im_ipimap); 1950 IREAD_UNLOCK(imap->im_ipimap);
1953 jfs_error(ip->i_sb, 1951 jfs_error(ip->i_sb,
1954 "diAllocExt: free ext summary map not found"); 1952 "diAllocExt: free ext summary map not found");
1955 return -EIO; 1953 return -EIO;
1956 } 1954 }
1957 if (~iagp->extsmap[sword]) 1955 if (~iagp->extsmap[sword])
1958 break; 1956 break;
1959 } 1957 }
1960 1958
1961 /* determine the extent number of the free extent. 1959 /* determine the extent number of the free extent.
1962 */ 1960 */
1963 rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); 1961 rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0);
1964 if (rem >= EXTSPERSUM) { 1962 if (rem >= EXTSPERSUM) {
1965 release_metapage(mp); 1963 release_metapage(mp);
1966 IREAD_UNLOCK(imap->im_ipimap); 1964 IREAD_UNLOCK(imap->im_ipimap);
1967 jfs_error(ip->i_sb, "diAllocExt: free extent not found"); 1965 jfs_error(ip->i_sb, "diAllocExt: free extent not found");
1968 return -EIO; 1966 return -EIO;
1969 } 1967 }
1970 extno = (sword << L2EXTSPERSUM) + rem; 1968 extno = (sword << L2EXTSPERSUM) + rem;
1971 1969
1972 /* initialize the new extent. 1970 /* initialize the new extent.
1973 */ 1971 */
1974 rc = diNewExt(imap, iagp, extno); 1972 rc = diNewExt(imap, iagp, extno);
1975 IREAD_UNLOCK(imap->im_ipimap); 1973 IREAD_UNLOCK(imap->im_ipimap);
1976 if (rc) { 1974 if (rc) {
1977 /* something bad happened. if a new iag was allocated, 1975 /* something bad happened. if a new iag was allocated,
1978 * place it back on the inode map's iag free list, and 1976 * place it back on the inode map's iag free list, and
1979 * clear the ag number information. 1977 * clear the ag number information.
1980 */ 1978 */
1981 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { 1979 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
1982 IAGFREE_LOCK(imap); 1980 IAGFREE_LOCK(imap);
1983 iagp->iagfree = cpu_to_le32(imap->im_freeiag); 1981 iagp->iagfree = cpu_to_le32(imap->im_freeiag);
1984 imap->im_freeiag = iagno; 1982 imap->im_freeiag = iagno;
1985 IAGFREE_UNLOCK(imap); 1983 IAGFREE_UNLOCK(imap);
1986 } 1984 }
1987 write_metapage(mp); 1985 write_metapage(mp);
1988 return (rc); 1986 return (rc);
1989 } 1987 }
1990 1988
1991 /* set the results of the allocation and write the iag. 1989 /* set the results of the allocation and write the iag.
1992 */ 1990 */
1993 diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); 1991 diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp);
1994 1992
1995 write_metapage(mp); 1993 write_metapage(mp);
1996 1994
1997 return (0); 1995 return (0);
1998 } 1996 }
1999 1997
2000 1998
2001 /* 1999 /*
2002 * NAME: diAllocBit(imap,iagp,ino) 2000 * NAME: diAllocBit(imap,iagp,ino)
2003 * 2001 *
2004 * FUNCTION: allocate a backed inode from an iag. 2002 * FUNCTION: allocate a backed inode from an iag.
2005 * 2003 *
2006 * this routine performs the mechanics of allocating a 2004 * this routine performs the mechanics of allocating a
2007 * specified inode from a backed extent. 2005 * specified inode from a backed extent.
2008 * 2006 *
2009 * if the inode to be allocated represents the last free 2007 * if the inode to be allocated represents the last free
2010 * inode within the iag, the iag will be removed from the 2008 * inode within the iag, the iag will be removed from the
2011 * ag free inode list. 2009 * ag free inode list.
2012 * 2010 *
2013 * a careful update approach is used to provide consistency 2011 * a careful update approach is used to provide consistency
2014 * in the face of updates to multiple buffers. under this 2012 * in the face of updates to multiple buffers. under this
2015 * approach, all required buffers are obtained before making 2013 * approach, all required buffers are obtained before making
2016 * any updates and are held all are updates are complete. 2014 * any updates and are held all are updates are complete.
2017 * 2015 *
2018 * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on 2016 * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on
2019 * this AG. Must have read lock on imap inode. 2017 * this AG. Must have read lock on imap inode.
2020 * 2018 *
2021 * PARAMETERS: 2019 * PARAMETERS:
2022 * imap - pointer to inode map control structure. 2020 * imap - pointer to inode map control structure.
2023 * iagp - pointer to iag. 2021 * iagp - pointer to iag.
2024 * ino - inode number to be allocated within the iag. 2022 * ino - inode number to be allocated within the iag.
2025 * 2023 *
2026 * RETURN VALUES: 2024 * RETURN VALUES:
2027 * 0 - success. 2025 * 0 - success.
2028 * -ENOSPC - insufficient disk resources. 2026 * -ENOSPC - insufficient disk resources.
2029 * -EIO - i/o error. 2027 * -EIO - i/o error.
2030 */ 2028 */
2031 static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) 2029 static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2032 { 2030 {
2033 int extno, bitno, agno, sword, rc; 2031 int extno, bitno, agno, sword, rc;
2034 struct metapage *amp = NULL, *bmp = NULL; 2032 struct metapage *amp = NULL, *bmp = NULL;
2035 struct iag *aiagp = NULL, *biagp = NULL; 2033 struct iag *aiagp = NULL, *biagp = NULL;
2036 u32 mask; 2034 u32 mask;
2037 2035
2038 /* check if this is the last free inode within the iag. 2036 /* check if this is the last free inode within the iag.
2039 * if so, it will have to be removed from the ag free 2037 * if so, it will have to be removed from the ag free
2040 * inode list, so get the iags preceeding and following 2038 * inode list, so get the iags preceeding and following
2041 * it on the list. 2039 * it on the list.
2042 */ 2040 */
2043 if (iagp->nfreeinos == cpu_to_le32(1)) { 2041 if (iagp->nfreeinos == cpu_to_le32(1)) {
2044 if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) { 2042 if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) {
2045 if ((rc = 2043 if ((rc =
2046 diIAGRead(imap, le32_to_cpu(iagp->inofreefwd), 2044 diIAGRead(imap, le32_to_cpu(iagp->inofreefwd),
2047 &amp))) 2045 &amp)))
2048 return (rc); 2046 return (rc);
2049 aiagp = (struct iag *) amp->data; 2047 aiagp = (struct iag *) amp->data;
2050 } 2048 }
2051 2049
2052 if ((int) le32_to_cpu(iagp->inofreeback) >= 0) { 2050 if ((int) le32_to_cpu(iagp->inofreeback) >= 0) {
2053 if ((rc = 2051 if ((rc =
2054 diIAGRead(imap, 2052 diIAGRead(imap,
2055 le32_to_cpu(iagp->inofreeback), 2053 le32_to_cpu(iagp->inofreeback),
2056 &bmp))) { 2054 &bmp))) {
2057 if (amp) 2055 if (amp)
2058 release_metapage(amp); 2056 release_metapage(amp);
2059 return (rc); 2057 return (rc);
2060 } 2058 }
2061 biagp = (struct iag *) bmp->data; 2059 biagp = (struct iag *) bmp->data;
2062 } 2060 }
2063 } 2061 }
2064 2062
2065 /* get the ag number, extent number, inode number within 2063 /* get the ag number, extent number, inode number within
2066 * the extent. 2064 * the extent.
2067 */ 2065 */
2068 agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb)); 2066 agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb));
2069 extno = ino >> L2INOSPEREXT; 2067 extno = ino >> L2INOSPEREXT;
2070 bitno = ino & (INOSPEREXT - 1); 2068 bitno = ino & (INOSPEREXT - 1);
2071 2069
2072 /* compute the mask for setting the map. 2070 /* compute the mask for setting the map.
2073 */ 2071 */
2074 mask = HIGHORDER >> bitno; 2072 mask = HIGHORDER >> bitno;
2075 2073
2076 /* the inode should be free and backed. 2074 /* the inode should be free and backed.
2077 */ 2075 */
2078 if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) || 2076 if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) ||
2079 ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) || 2077 ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) ||
2080 (addressPXD(&iagp->inoext[extno]) == 0)) { 2078 (addressPXD(&iagp->inoext[extno]) == 0)) {
2081 if (amp) 2079 if (amp)
2082 release_metapage(amp); 2080 release_metapage(amp);
2083 if (bmp) 2081 if (bmp)
2084 release_metapage(bmp); 2082 release_metapage(bmp);
2085 2083
2086 jfs_error(imap->im_ipimap->i_sb, 2084 jfs_error(imap->im_ipimap->i_sb,
2087 "diAllocBit: iag inconsistent"); 2085 "diAllocBit: iag inconsistent");
2088 return -EIO; 2086 return -EIO;
2089 } 2087 }
2090 2088
2091 /* mark the inode as allocated in the working map. 2089 /* mark the inode as allocated in the working map.
2092 */ 2090 */
2093 iagp->wmap[extno] |= cpu_to_le32(mask); 2091 iagp->wmap[extno] |= cpu_to_le32(mask);
2094 2092
2095 /* check if all inodes within the extent are now 2093 /* check if all inodes within the extent are now
2096 * allocated. if so, update the free inode summary 2094 * allocated. if so, update the free inode summary
2097 * map to reflect this. 2095 * map to reflect this.
2098 */ 2096 */
2099 if (iagp->wmap[extno] == cpu_to_le32(ONES)) { 2097 if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
2100 sword = extno >> L2EXTSPERSUM; 2098 sword = extno >> L2EXTSPERSUM;
2101 bitno = extno & (EXTSPERSUM - 1); 2099 bitno = extno & (EXTSPERSUM - 1);
2102 iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno); 2100 iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno);
2103 } 2101 }
2104 2102
2105 /* if this was the last free inode in the iag, remove the 2103 /* if this was the last free inode in the iag, remove the
2106 * iag from the ag free inode list. 2104 * iag from the ag free inode list.
2107 */ 2105 */
2108 if (iagp->nfreeinos == cpu_to_le32(1)) { 2106 if (iagp->nfreeinos == cpu_to_le32(1)) {
2109 if (amp) { 2107 if (amp) {
2110 aiagp->inofreeback = iagp->inofreeback; 2108 aiagp->inofreeback = iagp->inofreeback;
2111 write_metapage(amp); 2109 write_metapage(amp);
2112 } 2110 }
2113 2111
2114 if (bmp) { 2112 if (bmp) {
2115 biagp->inofreefwd = iagp->inofreefwd; 2113 biagp->inofreefwd = iagp->inofreefwd;
2116 write_metapage(bmp); 2114 write_metapage(bmp);
2117 } else { 2115 } else {
2118 imap->im_agctl[agno].inofree = 2116 imap->im_agctl[agno].inofree =
2119 le32_to_cpu(iagp->inofreefwd); 2117 le32_to_cpu(iagp->inofreefwd);
2120 } 2118 }
2121 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); 2119 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
2122 } 2120 }
2123 2121
2124 /* update the free inode count at the iag, ag, inode 2122 /* update the free inode count at the iag, ag, inode
2125 * map levels. 2123 * map levels.
2126 */ 2124 */
2127 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1); 2125 le32_add_cpu(&iagp->nfreeinos, -1);
2128 imap->im_agctl[agno].numfree -= 1; 2126 imap->im_agctl[agno].numfree -= 1;
2129 atomic_dec(&imap->im_numfree); 2127 atomic_dec(&imap->im_numfree);
2130 2128
2131 return (0); 2129 return (0);
2132 } 2130 }
2133 2131
2134 2132
2135 /* 2133 /*
2136 * NAME: diNewExt(imap,iagp,extno) 2134 * NAME: diNewExt(imap,iagp,extno)
2137 * 2135 *
2138 * FUNCTION: initialize a new extent of inodes for an iag, allocating 2136 * FUNCTION: initialize a new extent of inodes for an iag, allocating
2139 * the first inode of the extent for use for the current 2137 * the first inode of the extent for use for the current
2140 * allocation request. 2138 * allocation request.
2141 * 2139 *
2142 * disk resources are allocated for the new extent of inodes 2140 * disk resources are allocated for the new extent of inodes
2143 * and the inodes themselves are initialized to reflect their 2141 * and the inodes themselves are initialized to reflect their
2144 * existence within the extent (i.e. their inode numbers and 2142 * existence within the extent (i.e. their inode numbers and
2145 * inode extent addresses are set) and their initial state 2143 * inode extent addresses are set) and their initial state
2146 * (mode and link count are set to zero). 2144 * (mode and link count are set to zero).
2147 * 2145 *
2148 * if the iag is new, it is not yet on an ag extent free list 2146 * if the iag is new, it is not yet on an ag extent free list
2149 * but will now be placed on this list. 2147 * but will now be placed on this list.
2150 * 2148 *
2151 * if the allocation of the new extent causes the iag to 2149 * if the allocation of the new extent causes the iag to
2152 * have no free extent, the iag will be removed from the 2150 * have no free extent, the iag will be removed from the
2153 * ag extent free list. 2151 * ag extent free list.
2154 * 2152 *
2155 * if the iag has no free backed inodes, it will be placed 2153 * if the iag has no free backed inodes, it will be placed
2156 * on the ag free inode list, since the addition of the new 2154 * on the ag free inode list, since the addition of the new
2157 * extent will now cause it to have free inodes. 2155 * extent will now cause it to have free inodes.
2158 * 2156 *
2159 * a careful update approach is used to provide consistency 2157 * a careful update approach is used to provide consistency
2160 * (i.e. list consistency) in the face of updates to multiple 2158 * (i.e. list consistency) in the face of updates to multiple
2161 * buffers. under this approach, all required buffers are 2159 * buffers. under this approach, all required buffers are
2162 * obtained before making any updates and are held until all 2160 * obtained before making any updates and are held until all
2163 * updates are complete. 2161 * updates are complete.
2164 * 2162 *
2165 * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on 2163 * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on
2166 * this AG. Must have read lock on imap inode. 2164 * this AG. Must have read lock on imap inode.
2167 * 2165 *
2168 * PARAMETERS: 2166 * PARAMETERS:
2169 * imap - pointer to inode map control structure. 2167 * imap - pointer to inode map control structure.
2170 * iagp - pointer to iag. 2168 * iagp - pointer to iag.
2171 * extno - extent number. 2169 * extno - extent number.
2172 * 2170 *
2173 * RETURN VALUES: 2171 * RETURN VALUES:
2174 * 0 - success. 2172 * 0 - success.
2175 * -ENOSPC - insufficient disk resources. 2173 * -ENOSPC - insufficient disk resources.
2176 * -EIO - i/o error. 2174 * -EIO - i/o error.
2177 */ 2175 */
2178 static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) 2176 static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2179 { 2177 {
2180 int agno, iagno, fwd, back, freei = 0, sword, rc; 2178 int agno, iagno, fwd, back, freei = 0, sword, rc;
2181 struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL; 2179 struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL;
2182 struct metapage *amp, *bmp, *cmp, *dmp; 2180 struct metapage *amp, *bmp, *cmp, *dmp;
2183 struct inode *ipimap; 2181 struct inode *ipimap;
2184 s64 blkno, hint; 2182 s64 blkno, hint;
2185 int i, j; 2183 int i, j;
2186 u32 mask; 2184 u32 mask;
2187 ino_t ino; 2185 ino_t ino;
2188 struct dinode *dp; 2186 struct dinode *dp;
2189 struct jfs_sb_info *sbi; 2187 struct jfs_sb_info *sbi;
2190 2188
2191 /* better have free extents. 2189 /* better have free extents.
2192 */ 2190 */
2193 if (!iagp->nfreeexts) { 2191 if (!iagp->nfreeexts) {
2194 jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); 2192 jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents");
2195 return -EIO; 2193 return -EIO;
2196 } 2194 }
2197 2195
2198 /* get the inode map inode. 2196 /* get the inode map inode.
2199 */ 2197 */
2200 ipimap = imap->im_ipimap; 2198 ipimap = imap->im_ipimap;
2201 sbi = JFS_SBI(ipimap->i_sb); 2199 sbi = JFS_SBI(ipimap->i_sb);
2202 2200
2203 amp = bmp = cmp = NULL; 2201 amp = bmp = cmp = NULL;
2204 2202
2205 /* get the ag and iag numbers for this iag. 2203 /* get the ag and iag numbers for this iag.
2206 */ 2204 */
2207 agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); 2205 agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
2208 iagno = le32_to_cpu(iagp->iagnum); 2206 iagno = le32_to_cpu(iagp->iagnum);
2209 2207
2210 /* check if this is the last free extent within the 2208 /* check if this is the last free extent within the
2211 * iag. if so, the iag must be removed from the ag 2209 * iag. if so, the iag must be removed from the ag
2212 * free extent list, so get the iags preceeding and 2210 * free extent list, so get the iags preceeding and
2213 * following the iag on this list. 2211 * following the iag on this list.
2214 */ 2212 */
2215 if (iagp->nfreeexts == cpu_to_le32(1)) { 2213 if (iagp->nfreeexts == cpu_to_le32(1)) {
2216 if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { 2214 if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
2217 if ((rc = diIAGRead(imap, fwd, &amp))) 2215 if ((rc = diIAGRead(imap, fwd, &amp)))
2218 return (rc); 2216 return (rc);
2219 aiagp = (struct iag *) amp->data; 2217 aiagp = (struct iag *) amp->data;
2220 } 2218 }
2221 2219
2222 if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { 2220 if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
2223 if ((rc = diIAGRead(imap, back, &bmp))) 2221 if ((rc = diIAGRead(imap, back, &bmp)))
2224 goto error_out; 2222 goto error_out;
2225 biagp = (struct iag *) bmp->data; 2223 biagp = (struct iag *) bmp->data;
2226 } 2224 }
2227 } else { 2225 } else {
2228 /* the iag has free extents. if all extents are free 2226 /* the iag has free extents. if all extents are free
2229 * (as is the case for a newly allocated iag), the iag 2227 * (as is the case for a newly allocated iag), the iag
2230 * must be added to the ag free extent list, so get 2228 * must be added to the ag free extent list, so get
2231 * the iag at the head of the list in preparation for 2229 * the iag at the head of the list in preparation for
2232 * adding this iag to this list. 2230 * adding this iag to this list.
2233 */ 2231 */
2234 fwd = back = -1; 2232 fwd = back = -1;
2235 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { 2233 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2236 if ((fwd = imap->im_agctl[agno].extfree) >= 0) { 2234 if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
2237 if ((rc = diIAGRead(imap, fwd, &amp))) 2235 if ((rc = diIAGRead(imap, fwd, &amp)))
2238 goto error_out; 2236 goto error_out;
2239 aiagp = (struct iag *) amp->data; 2237 aiagp = (struct iag *) amp->data;
2240 } 2238 }
2241 } 2239 }
2242 } 2240 }
2243 2241
2244 /* check if the iag has no free inodes. if so, the iag 2242 /* check if the iag has no free inodes. if so, the iag
2245 * will have to be added to the ag free inode list, so get 2243 * will have to be added to the ag free inode list, so get
2246 * the iag at the head of the list in preparation for 2244 * the iag at the head of the list in preparation for
2247 * adding this iag to this list. in doing this, we must 2245 * adding this iag to this list. in doing this, we must
2248 * check if we already have the iag at the head of 2246 * check if we already have the iag at the head of
2249 * the list in hand. 2247 * the list in hand.
2250 */ 2248 */
2251 if (iagp->nfreeinos == 0) { 2249 if (iagp->nfreeinos == 0) {
2252 freei = imap->im_agctl[agno].inofree; 2250 freei = imap->im_agctl[agno].inofree;
2253 2251
2254 if (freei >= 0) { 2252 if (freei >= 0) {
2255 if (freei == fwd) { 2253 if (freei == fwd) {
2256 ciagp = aiagp; 2254 ciagp = aiagp;
2257 } else if (freei == back) { 2255 } else if (freei == back) {
2258 ciagp = biagp; 2256 ciagp = biagp;
2259 } else { 2257 } else {
2260 if ((rc = diIAGRead(imap, freei, &cmp))) 2258 if ((rc = diIAGRead(imap, freei, &cmp)))
2261 goto error_out; 2259 goto error_out;
2262 ciagp = (struct iag *) cmp->data; 2260 ciagp = (struct iag *) cmp->data;
2263 } 2261 }
2264 if (ciagp == NULL) { 2262 if (ciagp == NULL) {
2265 jfs_error(imap->im_ipimap->i_sb, 2263 jfs_error(imap->im_ipimap->i_sb,
2266 "diNewExt: ciagp == NULL"); 2264 "diNewExt: ciagp == NULL");
2267 rc = -EIO; 2265 rc = -EIO;
2268 goto error_out; 2266 goto error_out;
2269 } 2267 }
2270 } 2268 }
2271 } 2269 }
2272 2270
2273 /* allocate disk space for the inode extent. 2271 /* allocate disk space for the inode extent.
2274 */ 2272 */
2275 if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0)) 2273 if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0))
2276 hint = ((s64) agno << sbi->bmap->db_agl2size) - 1; 2274 hint = ((s64) agno << sbi->bmap->db_agl2size) - 1;
2277 else 2275 else
2278 hint = addressPXD(&iagp->inoext[extno - 1]) + 2276 hint = addressPXD(&iagp->inoext[extno - 1]) +
2279 lengthPXD(&iagp->inoext[extno - 1]) - 1; 2277 lengthPXD(&iagp->inoext[extno - 1]) - 1;
2280 2278
2281 if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno))) 2279 if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno)))
2282 goto error_out; 2280 goto error_out;
2283 2281
2284 /* compute the inode number of the first inode within the 2282 /* compute the inode number of the first inode within the
2285 * extent. 2283 * extent.
2286 */ 2284 */
2287 ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT); 2285 ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT);
2288 2286
2289 /* initialize the inodes within the newly allocated extent a 2287 /* initialize the inodes within the newly allocated extent a
2290 * page at a time. 2288 * page at a time.
2291 */ 2289 */
2292 for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) { 2290 for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) {
2293 /* get a buffer for this page of disk inodes. 2291 /* get a buffer for this page of disk inodes.
2294 */ 2292 */
2295 dmp = get_metapage(ipimap, blkno + i, PSIZE, 1); 2293 dmp = get_metapage(ipimap, blkno + i, PSIZE, 1);
2296 if (dmp == NULL) { 2294 if (dmp == NULL) {
2297 rc = -EIO; 2295 rc = -EIO;
2298 goto error_out; 2296 goto error_out;
2299 } 2297 }
2300 dp = (struct dinode *) dmp->data; 2298 dp = (struct dinode *) dmp->data;
2301 2299
2302 /* initialize the inode number, mode, link count and 2300 /* initialize the inode number, mode, link count and
2303 * inode extent address. 2301 * inode extent address.
2304 */ 2302 */
2305 for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) { 2303 for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) {
2306 dp->di_inostamp = cpu_to_le32(sbi->inostamp); 2304 dp->di_inostamp = cpu_to_le32(sbi->inostamp);
2307 dp->di_number = cpu_to_le32(ino); 2305 dp->di_number = cpu_to_le32(ino);
2308 dp->di_fileset = cpu_to_le32(FILESYSTEM_I); 2306 dp->di_fileset = cpu_to_le32(FILESYSTEM_I);
2309 dp->di_mode = 0; 2307 dp->di_mode = 0;
2310 dp->di_nlink = 0; 2308 dp->di_nlink = 0;
2311 PXDaddress(&(dp->di_ixpxd), blkno); 2309 PXDaddress(&(dp->di_ixpxd), blkno);
2312 PXDlength(&(dp->di_ixpxd), imap->im_nbperiext); 2310 PXDlength(&(dp->di_ixpxd), imap->im_nbperiext);
2313 } 2311 }
2314 write_metapage(dmp); 2312 write_metapage(dmp);
2315 } 2313 }
2316 2314
2317 /* if this is the last free extent within the iag, remove the 2315 /* if this is the last free extent within the iag, remove the
2318 * iag from the ag free extent list. 2316 * iag from the ag free extent list.
2319 */ 2317 */
2320 if (iagp->nfreeexts == cpu_to_le32(1)) { 2318 if (iagp->nfreeexts == cpu_to_le32(1)) {
2321 if (fwd >= 0) 2319 if (fwd >= 0)
2322 aiagp->extfreeback = iagp->extfreeback; 2320 aiagp->extfreeback = iagp->extfreeback;
2323 2321
2324 if (back >= 0) 2322 if (back >= 0)
2325 biagp->extfreefwd = iagp->extfreefwd; 2323 biagp->extfreefwd = iagp->extfreefwd;
2326 else 2324 else
2327 imap->im_agctl[agno].extfree = 2325 imap->im_agctl[agno].extfree =
2328 le32_to_cpu(iagp->extfreefwd); 2326 le32_to_cpu(iagp->extfreefwd);
2329 2327
2330 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); 2328 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
2331 } else { 2329 } else {
2332 /* if the iag has all free extents (newly allocated iag), 2330 /* if the iag has all free extents (newly allocated iag),
2333 * add the iag to the ag free extent list. 2331 * add the iag to the ag free extent list.
2334 */ 2332 */
2335 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { 2333 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2336 if (fwd >= 0) 2334 if (fwd >= 0)
2337 aiagp->extfreeback = cpu_to_le32(iagno); 2335 aiagp->extfreeback = cpu_to_le32(iagno);
2338 2336
2339 iagp->extfreefwd = cpu_to_le32(fwd); 2337 iagp->extfreefwd = cpu_to_le32(fwd);
2340 iagp->extfreeback = cpu_to_le32(-1); 2338 iagp->extfreeback = cpu_to_le32(-1);
2341 imap->im_agctl[agno].extfree = iagno; 2339 imap->im_agctl[agno].extfree = iagno;
2342 } 2340 }
2343 } 2341 }
2344 2342
2345 /* if the iag has no free inodes, add the iag to the 2343 /* if the iag has no free inodes, add the iag to the
2346 * ag free inode list. 2344 * ag free inode list.
2347 */ 2345 */
2348 if (iagp->nfreeinos == 0) { 2346 if (iagp->nfreeinos == 0) {
2349 if (freei >= 0) 2347 if (freei >= 0)
2350 ciagp->inofreeback = cpu_to_le32(iagno); 2348 ciagp->inofreeback = cpu_to_le32(iagno);
2351 2349
2352 iagp->inofreefwd = 2350 iagp->inofreefwd =
2353 cpu_to_le32(imap->im_agctl[agno].inofree); 2351 cpu_to_le32(imap->im_agctl[agno].inofree);
2354 iagp->inofreeback = cpu_to_le32(-1); 2352 iagp->inofreeback = cpu_to_le32(-1);
2355 imap->im_agctl[agno].inofree = iagno; 2353 imap->im_agctl[agno].inofree = iagno;
2356 } 2354 }
2357 2355
2358 /* initialize the extent descriptor of the extent. */ 2356 /* initialize the extent descriptor of the extent. */
2359 PXDlength(&iagp->inoext[extno], imap->im_nbperiext); 2357 PXDlength(&iagp->inoext[extno], imap->im_nbperiext);
2360 PXDaddress(&iagp->inoext[extno], blkno); 2358 PXDaddress(&iagp->inoext[extno], blkno);
2361 2359
2362 /* initialize the working and persistent map of the extent. 2360 /* initialize the working and persistent map of the extent.
2363 * the working map will be initialized such that 2361 * the working map will be initialized such that
2364 * it indicates the first inode of the extent is allocated. 2362 * it indicates the first inode of the extent is allocated.
2365 */ 2363 */
2366 iagp->wmap[extno] = cpu_to_le32(HIGHORDER); 2364 iagp->wmap[extno] = cpu_to_le32(HIGHORDER);
2367 iagp->pmap[extno] = 0; 2365 iagp->pmap[extno] = 0;
2368 2366
2369 /* update the free inode and free extent summary maps 2367 /* update the free inode and free extent summary maps
2370 * for the extent to indicate the extent has free inodes 2368 * for the extent to indicate the extent has free inodes
2371 * and no longer represents a free extent. 2369 * and no longer represents a free extent.
2372 */ 2370 */
2373 sword = extno >> L2EXTSPERSUM; 2371 sword = extno >> L2EXTSPERSUM;
2374 mask = HIGHORDER >> (extno & (EXTSPERSUM - 1)); 2372 mask = HIGHORDER >> (extno & (EXTSPERSUM - 1));
2375 iagp->extsmap[sword] |= cpu_to_le32(mask); 2373 iagp->extsmap[sword] |= cpu_to_le32(mask);
2376 iagp->inosmap[sword] &= cpu_to_le32(~mask); 2374 iagp->inosmap[sword] &= cpu_to_le32(~mask);
2377 2375
2378 /* update the free inode and free extent counts for the 2376 /* update the free inode and free extent counts for the
2379 * iag. 2377 * iag.
2380 */ 2378 */
2381 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 2379 le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1));
2382 (INOSPEREXT - 1)); 2380 le32_add_cpu(&iagp->nfreeexts, -1);
2383 iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);
2384 2381
2385 /* update the free and backed inode counts for the ag. 2382 /* update the free and backed inode counts for the ag.
2386 */ 2383 */
2387 imap->im_agctl[agno].numfree += (INOSPEREXT - 1); 2384 imap->im_agctl[agno].numfree += (INOSPEREXT - 1);
2388 imap->im_agctl[agno].numinos += INOSPEREXT; 2385 imap->im_agctl[agno].numinos += INOSPEREXT;
2389 2386
2390 /* update the free and backed inode counts for the inode map. 2387 /* update the free and backed inode counts for the inode map.
2391 */ 2388 */
2392 atomic_add(INOSPEREXT - 1, &imap->im_numfree); 2389 atomic_add(INOSPEREXT - 1, &imap->im_numfree);
2393 atomic_add(INOSPEREXT, &imap->im_numinos); 2390 atomic_add(INOSPEREXT, &imap->im_numinos);
2394 2391
2395 /* write the iags. 2392 /* write the iags.
2396 */ 2393 */
2397 if (amp) 2394 if (amp)
2398 write_metapage(amp); 2395 write_metapage(amp);
2399 if (bmp) 2396 if (bmp)
2400 write_metapage(bmp); 2397 write_metapage(bmp);
2401 if (cmp) 2398 if (cmp)
2402 write_metapage(cmp); 2399 write_metapage(cmp);
2403 2400
2404 return (0); 2401 return (0);
2405 2402
2406 error_out: 2403 error_out:
2407 2404
2408 /* release the iags. 2405 /* release the iags.
2409 */ 2406 */
2410 if (amp) 2407 if (amp)
2411 release_metapage(amp); 2408 release_metapage(amp);
2412 if (bmp) 2409 if (bmp)
2413 release_metapage(bmp); 2410 release_metapage(bmp);
2414 if (cmp) 2411 if (cmp)
2415 release_metapage(cmp); 2412 release_metapage(cmp);
2416 2413
2417 return (rc); 2414 return (rc);
2418 } 2415 }
2419 2416
2420 2417
2421 /* 2418 /*
2422 * NAME: diNewIAG(imap,iagnop,agno) 2419 * NAME: diNewIAG(imap,iagnop,agno)
2423 * 2420 *
2424 * FUNCTION: allocate a new iag for an allocation group. 2421 * FUNCTION: allocate a new iag for an allocation group.
2425 * 2422 *
2426 * first tries to allocate the iag from the inode map 2423 * first tries to allocate the iag from the inode map
2427 * iagfree list: 2424 * iagfree list:
2428 * if the list has free iags, the head of the list is removed 2425 * if the list has free iags, the head of the list is removed
2429 * and returned to satisfy the request. 2426 * and returned to satisfy the request.
2430 * if the inode map's iag free list is empty, the inode map 2427 * if the inode map's iag free list is empty, the inode map
2431 * is extended to hold a new iag. this new iag is initialized 2428 * is extended to hold a new iag. this new iag is initialized
2432 * and returned to satisfy the request. 2429 * and returned to satisfy the request.
2433 * 2430 *
2434 * PARAMETERS: 2431 * PARAMETERS:
2435 * imap - pointer to inode map control structure. 2432 * imap - pointer to inode map control structure.
2436 * iagnop - pointer to an iag number set with the number of the 2433 * iagnop - pointer to an iag number set with the number of the
2437 * newly allocated iag upon successful return. 2434 * newly allocated iag upon successful return.
2438 * agno - allocation group number. 2435 * agno - allocation group number.
2439 * bpp - Buffer pointer to be filled in with new IAG's buffer 2436 * bpp - Buffer pointer to be filled in with new IAG's buffer
2440 * 2437 *
2441 * RETURN VALUES: 2438 * RETURN VALUES:
2442 * 0 - success. 2439 * 0 - success.
2443 * -ENOSPC - insufficient disk resources. 2440 * -ENOSPC - insufficient disk resources.
2444 * -EIO - i/o error. 2441 * -EIO - i/o error.
2445 * 2442 *
2446 * serialization: 2443 * serialization:
2447 * AG lock held on entry/exit; 2444 * AG lock held on entry/exit;
2448 * write lock on the map is held inside; 2445 * write lock on the map is held inside;
2449 * read lock on the map is held on successful completion; 2446 * read lock on the map is held on successful completion;
2450 * 2447 *
2451 * note: new iag transaction: 2448 * note: new iag transaction:
2452 * . synchronously write iag; 2449 * . synchronously write iag;
2453 * . write log of xtree and inode of imap; 2450 * . write log of xtree and inode of imap;
2454 * . commit; 2451 * . commit;
2455 * . synchronous write of xtree (right to left, bottom to top); 2452 * . synchronous write of xtree (right to left, bottom to top);
2456 * . at start of logredo(): init in-memory imap with one additional iag page; 2453 * . at start of logredo(): init in-memory imap with one additional iag page;
2457 * . at end of logredo(): re-read imap inode to determine 2454 * . at end of logredo(): re-read imap inode to determine
2458 * new imap size; 2455 * new imap size;
2459 */ 2456 */
2460 static int 2457 static int
2461 diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) 2458 diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2462 { 2459 {
2463 int rc; 2460 int rc;
2464 int iagno, i, xlen; 2461 int iagno, i, xlen;
2465 struct inode *ipimap; 2462 struct inode *ipimap;
2466 struct super_block *sb; 2463 struct super_block *sb;
2467 struct jfs_sb_info *sbi; 2464 struct jfs_sb_info *sbi;
2468 struct metapage *mp; 2465 struct metapage *mp;
2469 struct iag *iagp; 2466 struct iag *iagp;
2470 s64 xaddr = 0; 2467 s64 xaddr = 0;
2471 s64 blkno; 2468 s64 blkno;
2472 tid_t tid; 2469 tid_t tid;
2473 struct inode *iplist[1]; 2470 struct inode *iplist[1];
2474 2471
2475 /* pick up pointers to the inode map and mount inodes */ 2472 /* pick up pointers to the inode map and mount inodes */
2476 ipimap = imap->im_ipimap; 2473 ipimap = imap->im_ipimap;
2477 sb = ipimap->i_sb; 2474 sb = ipimap->i_sb;
2478 sbi = JFS_SBI(sb); 2475 sbi = JFS_SBI(sb);
2479 2476
2480 /* acquire the free iag lock */ 2477 /* acquire the free iag lock */
2481 IAGFREE_LOCK(imap); 2478 IAGFREE_LOCK(imap);
2482 2479
2483 /* if there are any iags on the inode map free iag list, 2480 /* if there are any iags on the inode map free iag list,
2484 * allocate the iag from the head of the list. 2481 * allocate the iag from the head of the list.
2485 */ 2482 */
2486 if (imap->im_freeiag >= 0) { 2483 if (imap->im_freeiag >= 0) {
2487 /* pick up the iag number at the head of the list */ 2484 /* pick up the iag number at the head of the list */
2488 iagno = imap->im_freeiag; 2485 iagno = imap->im_freeiag;
2489 2486
2490 /* determine the logical block number of the iag */ 2487 /* determine the logical block number of the iag */
2491 blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); 2488 blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
2492 } else { 2489 } else {
2493 /* no free iags. the inode map will have to be extented 2490 /* no free iags. the inode map will have to be extented
2494 * to include a new iag. 2491 * to include a new iag.
2495 */ 2492 */
2496 2493
2497 /* acquire inode map lock */ 2494 /* acquire inode map lock */
2498 IWRITE_LOCK(ipimap, RDWRLOCK_IMAP); 2495 IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
2499 2496
2500 if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { 2497 if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
2501 IWRITE_UNLOCK(ipimap); 2498 IWRITE_UNLOCK(ipimap);
2502 IAGFREE_UNLOCK(imap); 2499 IAGFREE_UNLOCK(imap);
2503 jfs_error(imap->im_ipimap->i_sb, 2500 jfs_error(imap->im_ipimap->i_sb,
2504 "diNewIAG: ipimap->i_size is wrong"); 2501 "diNewIAG: ipimap->i_size is wrong");
2505 return -EIO; 2502 return -EIO;
2506 } 2503 }
2507 2504
2508 2505
2509 /* get the next avaliable iag number */ 2506 /* get the next avaliable iag number */
2510 iagno = imap->im_nextiag; 2507 iagno = imap->im_nextiag;
2511 2508
2512 /* make sure that we have not exceeded the maximum inode 2509 /* make sure that we have not exceeded the maximum inode
2513 * number limit. 2510 * number limit.
2514 */ 2511 */
2515 if (iagno > (MAXIAGS - 1)) { 2512 if (iagno > (MAXIAGS - 1)) {
2516 /* release the inode map lock */ 2513 /* release the inode map lock */
2517 IWRITE_UNLOCK(ipimap); 2514 IWRITE_UNLOCK(ipimap);
2518 2515
2519 rc = -ENOSPC; 2516 rc = -ENOSPC;
2520 goto out; 2517 goto out;
2521 } 2518 }
2522 2519
2523 /* 2520 /*
2524 * synchronously append new iag page. 2521 * synchronously append new iag page.
2525 */ 2522 */
2526 /* determine the logical address of iag page to append */ 2523 /* determine the logical address of iag page to append */
2527 blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); 2524 blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
2528 2525
2529 /* Allocate extent for new iag page */ 2526 /* Allocate extent for new iag page */
2530 xlen = sbi->nbperpage; 2527 xlen = sbi->nbperpage;
2531 if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) { 2528 if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) {
2532 /* release the inode map lock */ 2529 /* release the inode map lock */
2533 IWRITE_UNLOCK(ipimap); 2530 IWRITE_UNLOCK(ipimap);
2534 2531
2535 goto out; 2532 goto out;
2536 } 2533 }
2537 2534
2538 /* 2535 /*
2539 * start transaction of update of the inode map 2536 * start transaction of update of the inode map
2540 * addressing structure pointing to the new iag page; 2537 * addressing structure pointing to the new iag page;
2541 */ 2538 */
2542 tid = txBegin(sb, COMMIT_FORCE); 2539 tid = txBegin(sb, COMMIT_FORCE);
2543 mutex_lock(&JFS_IP(ipimap)->commit_mutex); 2540 mutex_lock(&JFS_IP(ipimap)->commit_mutex);
2544 2541
2545 /* update the inode map addressing structure to point to it */ 2542 /* update the inode map addressing structure to point to it */
2546 if ((rc = 2543 if ((rc =
2547 xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) { 2544 xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) {
2548 txEnd(tid); 2545 txEnd(tid);
2549 mutex_unlock(&JFS_IP(ipimap)->commit_mutex); 2546 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
2550 /* Free the blocks allocated for the iag since it was 2547 /* Free the blocks allocated for the iag since it was
2551 * not successfully added to the inode map 2548 * not successfully added to the inode map
2552 */ 2549 */
2553 dbFree(ipimap, xaddr, (s64) xlen); 2550 dbFree(ipimap, xaddr, (s64) xlen);
2554 2551
2555 /* release the inode map lock */ 2552 /* release the inode map lock */
2556 IWRITE_UNLOCK(ipimap); 2553 IWRITE_UNLOCK(ipimap);
2557 2554
2558 goto out; 2555 goto out;
2559 } 2556 }
2560 2557
2561 /* update the inode map's inode to reflect the extension */ 2558 /* update the inode map's inode to reflect the extension */
2562 ipimap->i_size += PSIZE; 2559 ipimap->i_size += PSIZE;
2563 inode_add_bytes(ipimap, PSIZE); 2560 inode_add_bytes(ipimap, PSIZE);
2564 2561
2565 /* assign a buffer for the page */ 2562 /* assign a buffer for the page */
2566 mp = get_metapage(ipimap, blkno, PSIZE, 0); 2563 mp = get_metapage(ipimap, blkno, PSIZE, 0);
2567 if (!mp) { 2564 if (!mp) {
2568 /* 2565 /*
2569 * This is very unlikely since we just created the 2566 * This is very unlikely since we just created the
2570 * extent, but let's try to handle it correctly 2567 * extent, but let's try to handle it correctly
2571 */ 2568 */
2572 xtTruncate(tid, ipimap, ipimap->i_size - PSIZE, 2569 xtTruncate(tid, ipimap, ipimap->i_size - PSIZE,
2573 COMMIT_PWMAP); 2570 COMMIT_PWMAP);
2574 2571
2575 txAbort(tid, 0); 2572 txAbort(tid, 0);
2576 txEnd(tid); 2573 txEnd(tid);
2577 2574
2578 /* release the inode map lock */ 2575 /* release the inode map lock */
2579 IWRITE_UNLOCK(ipimap); 2576 IWRITE_UNLOCK(ipimap);
2580 2577
2581 rc = -EIO; 2578 rc = -EIO;
2582 goto out; 2579 goto out;
2583 } 2580 }
2584 iagp = (struct iag *) mp->data; 2581 iagp = (struct iag *) mp->data;
2585 2582
2586 /* init the iag */ 2583 /* init the iag */
2587 memset(iagp, 0, sizeof(struct iag)); 2584 memset(iagp, 0, sizeof(struct iag));
2588 iagp->iagnum = cpu_to_le32(iagno); 2585 iagp->iagnum = cpu_to_le32(iagno);
2589 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); 2586 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
2590 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); 2587 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
2591 iagp->iagfree = cpu_to_le32(-1); 2588 iagp->iagfree = cpu_to_le32(-1);
2592 iagp->nfreeinos = 0; 2589 iagp->nfreeinos = 0;
2593 iagp->nfreeexts = cpu_to_le32(EXTSPERIAG); 2590 iagp->nfreeexts = cpu_to_le32(EXTSPERIAG);
2594 2591
2595 /* initialize the free inode summary map (free extent 2592 /* initialize the free inode summary map (free extent
2596 * summary map initialization handled by bzero). 2593 * summary map initialization handled by bzero).
2597 */ 2594 */
2598 for (i = 0; i < SMAPSZ; i++) 2595 for (i = 0; i < SMAPSZ; i++)
2599 iagp->inosmap[i] = cpu_to_le32(ONES); 2596 iagp->inosmap[i] = cpu_to_le32(ONES);
2600 2597
2601 /* 2598 /*
2602 * Write and sync the metapage 2599 * Write and sync the metapage
2603 */ 2600 */
2604 flush_metapage(mp); 2601 flush_metapage(mp);
2605 2602
2606 /* 2603 /*
2607 * txCommit(COMMIT_FORCE) will synchronously write address 2604 * txCommit(COMMIT_FORCE) will synchronously write address
2608 * index pages and inode after commit in careful update order 2605 * index pages and inode after commit in careful update order
2609 * of address index pages (right to left, bottom up); 2606 * of address index pages (right to left, bottom up);
2610 */ 2607 */
2611 iplist[0] = ipimap; 2608 iplist[0] = ipimap;
2612 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); 2609 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
2613 2610
2614 txEnd(tid); 2611 txEnd(tid);
2615 mutex_unlock(&JFS_IP(ipimap)->commit_mutex); 2612 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
2616 2613
2617 duplicateIXtree(sb, blkno, xlen, &xaddr); 2614 duplicateIXtree(sb, blkno, xlen, &xaddr);
2618 2615
2619 /* update the next avaliable iag number */ 2616 /* update the next avaliable iag number */
2620 imap->im_nextiag += 1; 2617 imap->im_nextiag += 1;
2621 2618
2622 /* Add the iag to the iag free list so we don't lose the iag 2619 /* Add the iag to the iag free list so we don't lose the iag
2623 * if a failure happens now. 2620 * if a failure happens now.
2624 */ 2621 */
2625 imap->im_freeiag = iagno; 2622 imap->im_freeiag = iagno;
2626 2623
2627 /* Until we have logredo working, we want the imap inode & 2624 /* Until we have logredo working, we want the imap inode &
2628 * control page to be up to date. 2625 * control page to be up to date.
2629 */ 2626 */
2630 diSync(ipimap); 2627 diSync(ipimap);
2631 2628
2632 /* release the inode map lock */ 2629 /* release the inode map lock */
2633 IWRITE_UNLOCK(ipimap); 2630 IWRITE_UNLOCK(ipimap);
2634 } 2631 }
2635 2632
2636 /* obtain read lock on map */ 2633 /* obtain read lock on map */
2637 IREAD_LOCK(ipimap, RDWRLOCK_IMAP); 2634 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
2638 2635
2639 /* read the iag */ 2636 /* read the iag */
2640 if ((rc = diIAGRead(imap, iagno, &mp))) { 2637 if ((rc = diIAGRead(imap, iagno, &mp))) {
2641 IREAD_UNLOCK(ipimap); 2638 IREAD_UNLOCK(ipimap);
2642 rc = -EIO; 2639 rc = -EIO;
2643 goto out; 2640 goto out;
2644 } 2641 }
2645 iagp = (struct iag *) mp->data; 2642 iagp = (struct iag *) mp->data;
2646 2643
2647 /* remove the iag from the iag free list */ 2644 /* remove the iag from the iag free list */
2648 imap->im_freeiag = le32_to_cpu(iagp->iagfree); 2645 imap->im_freeiag = le32_to_cpu(iagp->iagfree);
2649 iagp->iagfree = cpu_to_le32(-1); 2646 iagp->iagfree = cpu_to_le32(-1);
2650 2647
2651 /* set the return iag number and buffer pointer */ 2648 /* set the return iag number and buffer pointer */
2652 *iagnop = iagno; 2649 *iagnop = iagno;
2653 *mpp = mp; 2650 *mpp = mp;
2654 2651
2655 out: 2652 out:
2656 /* release the iag free lock */ 2653 /* release the iag free lock */
2657 IAGFREE_UNLOCK(imap); 2654 IAGFREE_UNLOCK(imap);
2658 2655
2659 return (rc); 2656 return (rc);
2660 } 2657 }
2661 2658
2662 /* 2659 /*
2663 * NAME: diIAGRead() 2660 * NAME: diIAGRead()
2664 * 2661 *
2665 * FUNCTION: get the buffer for the specified iag within a fileset 2662 * FUNCTION: get the buffer for the specified iag within a fileset
2666 * or aggregate inode map. 2663 * or aggregate inode map.
2667 * 2664 *
2668 * PARAMETERS: 2665 * PARAMETERS:
2669 * imap - pointer to inode map control structure. 2666 * imap - pointer to inode map control structure.
2670 * iagno - iag number. 2667 * iagno - iag number.
2671 * bpp - point to buffer pointer to be filled in on successful 2668 * bpp - point to buffer pointer to be filled in on successful
2672 * exit. 2669 * exit.
2673 * 2670 *
2674 * SERIALIZATION: 2671 * SERIALIZATION:
2675 * must have read lock on imap inode 2672 * must have read lock on imap inode
2676 * (When called by diExtendFS, the filesystem is quiesced, therefore 2673 * (When called by diExtendFS, the filesystem is quiesced, therefore
2677 * the read lock is unnecessary.) 2674 * the read lock is unnecessary.)
2678 * 2675 *
2679 * RETURN VALUES: 2676 * RETURN VALUES:
2680 * 0 - success. 2677 * 0 - success.
2681 * -EIO - i/o error. 2678 * -EIO - i/o error.
2682 */ 2679 */
2683 static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) 2680 static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2684 { 2681 {
2685 struct inode *ipimap = imap->im_ipimap; 2682 struct inode *ipimap = imap->im_ipimap;
2686 s64 blkno; 2683 s64 blkno;
2687 2684
2688 /* compute the logical block number of the iag. */ 2685 /* compute the logical block number of the iag. */
2689 blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage); 2686 blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage);
2690 2687
2691 /* read the iag. */ 2688 /* read the iag. */
2692 *mpp = read_metapage(ipimap, blkno, PSIZE, 0); 2689 *mpp = read_metapage(ipimap, blkno, PSIZE, 0);
2693 if (*mpp == NULL) { 2690 if (*mpp == NULL) {
2694 return -EIO; 2691 return -EIO;
2695 } 2692 }
2696 2693
2697 return (0); 2694 return (0);
2698 } 2695 }
2699 2696
2700 /* 2697 /*
2701 * NAME: diFindFree() 2698 * NAME: diFindFree()
2702 * 2699 *
2703 * FUNCTION: find the first free bit in a word starting at 2700 * FUNCTION: find the first free bit in a word starting at
2704 * the specified bit position. 2701 * the specified bit position.
2705 * 2702 *
2706 * PARAMETERS: 2703 * PARAMETERS:
2707 * word - word to be examined. 2704 * word - word to be examined.
2708 * start - starting bit position. 2705 * start - starting bit position.
2709 * 2706 *
2710 * RETURN VALUES: 2707 * RETURN VALUES:
2711 * bit position of first free bit in the word or 32 if 2708 * bit position of first free bit in the word or 32 if
2712 * no free bits were found. 2709 * no free bits were found.
2713 */ 2710 */
2714 static int diFindFree(u32 word, int start) 2711 static int diFindFree(u32 word, int start)
2715 { 2712 {
2716 int bitno; 2713 int bitno;
2717 assert(start < 32); 2714 assert(start < 32);
2718 /* scan the word for the first free bit. */ 2715 /* scan the word for the first free bit. */
2719 for (word <<= start, bitno = start; bitno < 32; 2716 for (word <<= start, bitno = start; bitno < 32;
2720 bitno++, word <<= 1) { 2717 bitno++, word <<= 1) {
2721 if ((word & HIGHORDER) == 0) 2718 if ((word & HIGHORDER) == 0)
2722 break; 2719 break;
2723 } 2720 }
2724 return (bitno); 2721 return (bitno);
2725 } 2722 }
2726 2723
2727 /* 2724 /*
2728 * NAME: diUpdatePMap() 2725 * NAME: diUpdatePMap()
2729 * 2726 *
2730 * FUNCTION: Update the persistent map in an IAG for the allocation or 2727 * FUNCTION: Update the persistent map in an IAG for the allocation or
2731 * freeing of the specified inode. 2728 * freeing of the specified inode.
2732 * 2729 *
2733 * PRE CONDITIONS: Working map has already been updated for allocate. 2730 * PRE CONDITIONS: Working map has already been updated for allocate.
2734 * 2731 *
2735 * PARAMETERS: 2732 * PARAMETERS:
2736 * ipimap - Incore inode map inode 2733 * ipimap - Incore inode map inode
2737 * inum - Number of inode to mark in permanent map 2734 * inum - Number of inode to mark in permanent map
2738 * is_free - If 'true' indicates inode should be marked freed, otherwise 2735 * is_free - If 'true' indicates inode should be marked freed, otherwise
2739 * indicates inode should be marked allocated. 2736 * indicates inode should be marked allocated.
2740 * 2737 *
2741 * RETURN VALUES: 2738 * RETURN VALUES:
2742 * 0 for success 2739 * 0 for success
2743 */ 2740 */
2744 int 2741 int
2745 diUpdatePMap(struct inode *ipimap, 2742 diUpdatePMap(struct inode *ipimap,
2746 unsigned long inum, bool is_free, struct tblock * tblk) 2743 unsigned long inum, bool is_free, struct tblock * tblk)
2747 { 2744 {
2748 int rc; 2745 int rc;
2749 struct iag *iagp; 2746 struct iag *iagp;
2750 struct metapage *mp; 2747 struct metapage *mp;
2751 int iagno, ino, extno, bitno; 2748 int iagno, ino, extno, bitno;
2752 struct inomap *imap; 2749 struct inomap *imap;
2753 u32 mask; 2750 u32 mask;
2754 struct jfs_log *log; 2751 struct jfs_log *log;
2755 int lsn, difft, diffp; 2752 int lsn, difft, diffp;
2756 unsigned long flags; 2753 unsigned long flags;
2757 2754
2758 imap = JFS_IP(ipimap)->i_imap; 2755 imap = JFS_IP(ipimap)->i_imap;
2759 /* get the iag number containing the inode */ 2756 /* get the iag number containing the inode */
2760 iagno = INOTOIAG(inum); 2757 iagno = INOTOIAG(inum);
2761 /* make sure that the iag is contained within the map */ 2758 /* make sure that the iag is contained within the map */
2762 if (iagno >= imap->im_nextiag) { 2759 if (iagno >= imap->im_nextiag) {
2763 jfs_error(ipimap->i_sb, 2760 jfs_error(ipimap->i_sb,
2764 "diUpdatePMap: the iag is outside the map"); 2761 "diUpdatePMap: the iag is outside the map");
2765 return -EIO; 2762 return -EIO;
2766 } 2763 }
2767 /* read the iag */ 2764 /* read the iag */
2768 IREAD_LOCK(ipimap, RDWRLOCK_IMAP); 2765 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
2769 rc = diIAGRead(imap, iagno, &mp); 2766 rc = diIAGRead(imap, iagno, &mp);
2770 IREAD_UNLOCK(ipimap); 2767 IREAD_UNLOCK(ipimap);
2771 if (rc) 2768 if (rc)
2772 return (rc); 2769 return (rc);
2773 metapage_wait_for_io(mp); 2770 metapage_wait_for_io(mp);
2774 iagp = (struct iag *) mp->data; 2771 iagp = (struct iag *) mp->data;
2775 /* get the inode number and extent number of the inode within 2772 /* get the inode number and extent number of the inode within
2776 * the iag and the inode number within the extent. 2773 * the iag and the inode number within the extent.
2777 */ 2774 */
2778 ino = inum & (INOSPERIAG - 1); 2775 ino = inum & (INOSPERIAG - 1);
2779 extno = ino >> L2INOSPEREXT; 2776 extno = ino >> L2INOSPEREXT;
2780 bitno = ino & (INOSPEREXT - 1); 2777 bitno = ino & (INOSPEREXT - 1);
2781 mask = HIGHORDER >> bitno; 2778 mask = HIGHORDER >> bitno;
2782 /* 2779 /*
2783 * mark the inode free in persistent map: 2780 * mark the inode free in persistent map:
2784 */ 2781 */
2785 if (is_free) { 2782 if (is_free) {
2786 /* The inode should have been allocated both in working 2783 /* The inode should have been allocated both in working
2787 * map and in persistent map; 2784 * map and in persistent map;
2788 * the inode will be freed from working map at the release 2785 * the inode will be freed from working map at the release
2789 * of last reference release; 2786 * of last reference release;
2790 */ 2787 */
2791 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2788 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2792 jfs_error(ipimap->i_sb, 2789 jfs_error(ipimap->i_sb,
2793 "diUpdatePMap: inode %ld not marked as " 2790 "diUpdatePMap: inode %ld not marked as "
2794 "allocated in wmap!", inum); 2791 "allocated in wmap!", inum);
2795 } 2792 }
2796 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { 2793 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
2797 jfs_error(ipimap->i_sb, 2794 jfs_error(ipimap->i_sb,
2798 "diUpdatePMap: inode %ld not marked as " 2795 "diUpdatePMap: inode %ld not marked as "
2799 "allocated in pmap!", inum); 2796 "allocated in pmap!", inum);
2800 } 2797 }
2801 /* update the bitmap for the extent of the freed inode */ 2798 /* update the bitmap for the extent of the freed inode */
2802 iagp->pmap[extno] &= cpu_to_le32(~mask); 2799 iagp->pmap[extno] &= cpu_to_le32(~mask);
2803 } 2800 }
2804 /* 2801 /*
2805 * mark the inode allocated in persistent map: 2802 * mark the inode allocated in persistent map:
2806 */ 2803 */
2807 else { 2804 else {
2808 /* The inode should be already allocated in the working map 2805 /* The inode should be already allocated in the working map
2809 * and should be free in persistent map; 2806 * and should be free in persistent map;
2810 */ 2807 */
2811 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2808 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2812 release_metapage(mp); 2809 release_metapage(mp);
2813 jfs_error(ipimap->i_sb, 2810 jfs_error(ipimap->i_sb,
2814 "diUpdatePMap: the inode is not allocated in " 2811 "diUpdatePMap: the inode is not allocated in "
2815 "the working map"); 2812 "the working map");
2816 return -EIO; 2813 return -EIO;
2817 } 2814 }
2818 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { 2815 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
2819 release_metapage(mp); 2816 release_metapage(mp);
2820 jfs_error(ipimap->i_sb, 2817 jfs_error(ipimap->i_sb,
2821 "diUpdatePMap: the inode is not free in the " 2818 "diUpdatePMap: the inode is not free in the "
2822 "persistent map"); 2819 "persistent map");
2823 return -EIO; 2820 return -EIO;
2824 } 2821 }
2825 /* update the bitmap for the extent of the allocated inode */ 2822 /* update the bitmap for the extent of the allocated inode */
2826 iagp->pmap[extno] |= cpu_to_le32(mask); 2823 iagp->pmap[extno] |= cpu_to_le32(mask);
2827 } 2824 }
2828 /* 2825 /*
2829 * update iag lsn 2826 * update iag lsn
2830 */ 2827 */
2831 lsn = tblk->lsn; 2828 lsn = tblk->lsn;
2832 log = JFS_SBI(tblk->sb)->log; 2829 log = JFS_SBI(tblk->sb)->log;
2833 LOGSYNC_LOCK(log, flags); 2830 LOGSYNC_LOCK(log, flags);
2834 if (mp->lsn != 0) { 2831 if (mp->lsn != 0) {
2835 /* inherit older/smaller lsn */ 2832 /* inherit older/smaller lsn */
2836 logdiff(difft, lsn, log); 2833 logdiff(difft, lsn, log);
2837 logdiff(diffp, mp->lsn, log); 2834 logdiff(diffp, mp->lsn, log);
2838 if (difft < diffp) { 2835 if (difft < diffp) {
2839 mp->lsn = lsn; 2836 mp->lsn = lsn;
2840 /* move mp after tblock in logsync list */ 2837 /* move mp after tblock in logsync list */
2841 list_move(&mp->synclist, &tblk->synclist); 2838 list_move(&mp->synclist, &tblk->synclist);
2842 } 2839 }
2843 /* inherit younger/larger clsn */ 2840 /* inherit younger/larger clsn */
2844 assert(mp->clsn); 2841 assert(mp->clsn);
2845 logdiff(difft, tblk->clsn, log); 2842 logdiff(difft, tblk->clsn, log);
2846 logdiff(diffp, mp->clsn, log); 2843 logdiff(diffp, mp->clsn, log);
2847 if (difft > diffp) 2844 if (difft > diffp)
2848 mp->clsn = tblk->clsn; 2845 mp->clsn = tblk->clsn;
2849 } else { 2846 } else {
2850 mp->log = log; 2847 mp->log = log;
2851 mp->lsn = lsn; 2848 mp->lsn = lsn;
2852 /* insert mp after tblock in logsync list */ 2849 /* insert mp after tblock in logsync list */
2853 log->count++; 2850 log->count++;
2854 list_add(&mp->synclist, &tblk->synclist); 2851 list_add(&mp->synclist, &tblk->synclist);
2855 mp->clsn = tblk->clsn; 2852 mp->clsn = tblk->clsn;
2856 } 2853 }
2857 LOGSYNC_UNLOCK(log, flags); 2854 LOGSYNC_UNLOCK(log, flags);
2858 write_metapage(mp); 2855 write_metapage(mp);
2859 return (0); 2856 return (0);
2860 } 2857 }
2861 2858
2862 /* 2859 /*
2863 * diExtendFS() 2860 * diExtendFS()
2864 * 2861 *
2865 * function: update imap for extendfs(); 2862 * function: update imap for extendfs();
2866 * 2863 *
2867 * note: AG size has been increased s.t. each k old contiguous AGs are 2864 * note: AG size has been increased s.t. each k old contiguous AGs are
2868 * coalesced into a new AG; 2865 * coalesced into a new AG;
2869 */ 2866 */
2870 int diExtendFS(struct inode *ipimap, struct inode *ipbmap) 2867 int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2871 { 2868 {
2872 int rc, rcx = 0; 2869 int rc, rcx = 0;
2873 struct inomap *imap = JFS_IP(ipimap)->i_imap; 2870 struct inomap *imap = JFS_IP(ipimap)->i_imap;
2874 struct iag *iagp = NULL, *hiagp = NULL; 2871 struct iag *iagp = NULL, *hiagp = NULL;
2875 struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap; 2872 struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap;
2876 struct metapage *bp, *hbp; 2873 struct metapage *bp, *hbp;
2877 int i, n, head; 2874 int i, n, head;
2878 int numinos, xnuminos = 0, xnumfree = 0; 2875 int numinos, xnuminos = 0, xnumfree = 0;
2879 s64 agstart; 2876 s64 agstart;
2880 2877
2881 jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d", 2878 jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d",
2882 imap->im_nextiag, atomic_read(&imap->im_numinos), 2879 imap->im_nextiag, atomic_read(&imap->im_numinos),
2883 atomic_read(&imap->im_numfree)); 2880 atomic_read(&imap->im_numfree));
2884 2881
2885 /* 2882 /*
2886 * reconstruct imap 2883 * reconstruct imap
2887 * 2884 *
2888 * coalesce contiguous k (newAGSize/oldAGSize) AGs; 2885 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
2889 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; 2886 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
2890 * note: new AG size = old AG size * (2**x). 2887 * note: new AG size = old AG size * (2**x).
2891 */ 2888 */
2892 2889
2893 /* init per AG control information im_agctl[] */ 2890 /* init per AG control information im_agctl[] */
2894 for (i = 0; i < MAXAG; i++) { 2891 for (i = 0; i < MAXAG; i++) {
2895 imap->im_agctl[i].inofree = -1; 2892 imap->im_agctl[i].inofree = -1;
2896 imap->im_agctl[i].extfree = -1; 2893 imap->im_agctl[i].extfree = -1;
2897 imap->im_agctl[i].numinos = 0; /* number of backed inodes */ 2894 imap->im_agctl[i].numinos = 0; /* number of backed inodes */
2898 imap->im_agctl[i].numfree = 0; /* number of free backed inodes */ 2895 imap->im_agctl[i].numfree = 0; /* number of free backed inodes */
2899 } 2896 }
2900 2897
2901 /* 2898 /*
2902 * process each iag page of the map. 2899 * process each iag page of the map.
2903 * 2900 *
2904 * rebuild AG Free Inode List, AG Free Inode Extent List; 2901 * rebuild AG Free Inode List, AG Free Inode Extent List;
2905 */ 2902 */
2906 for (i = 0; i < imap->im_nextiag; i++) { 2903 for (i = 0; i < imap->im_nextiag; i++) {
2907 if ((rc = diIAGRead(imap, i, &bp))) { 2904 if ((rc = diIAGRead(imap, i, &bp))) {
2908 rcx = rc; 2905 rcx = rc;
2909 continue; 2906 continue;
2910 } 2907 }
2911 iagp = (struct iag *) bp->data; 2908 iagp = (struct iag *) bp->data;
2912 if (le32_to_cpu(iagp->iagnum) != i) { 2909 if (le32_to_cpu(iagp->iagnum) != i) {
2913 release_metapage(bp); 2910 release_metapage(bp);
2914 jfs_error(ipimap->i_sb, 2911 jfs_error(ipimap->i_sb,
2915 "diExtendFs: unexpected value of iagnum"); 2912 "diExtendFs: unexpected value of iagnum");
2916 return -EIO; 2913 return -EIO;
2917 } 2914 }
2918 2915
2919 /* leave free iag in the free iag list */ 2916 /* leave free iag in the free iag list */
2920 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { 2917 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2921 release_metapage(bp); 2918 release_metapage(bp);
2922 continue; 2919 continue;
2923 } 2920 }
2924 2921
2925 /* agstart that computes to the same ag is treated as same; */ 2922 /* agstart that computes to the same ag is treated as same; */
2926 agstart = le64_to_cpu(iagp->agstart); 2923 agstart = le64_to_cpu(iagp->agstart);
2927 /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */ 2924 /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */
2928 n = agstart >> mp->db_agl2size; 2925 n = agstart >> mp->db_agl2size;
2929 2926
2930 /* compute backed inodes */ 2927 /* compute backed inodes */
2931 numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) 2928 numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts))
2932 << L2INOSPEREXT; 2929 << L2INOSPEREXT;
2933 if (numinos > 0) { 2930 if (numinos > 0) {
2934 /* merge AG backed inodes */ 2931 /* merge AG backed inodes */
2935 imap->im_agctl[n].numinos += numinos; 2932 imap->im_agctl[n].numinos += numinos;
2936 xnuminos += numinos; 2933 xnuminos += numinos;
2937 } 2934 }
2938 2935
2939 /* if any backed free inodes, insert at AG free inode list */ 2936 /* if any backed free inodes, insert at AG free inode list */
2940 if ((int) le32_to_cpu(iagp->nfreeinos) > 0) { 2937 if ((int) le32_to_cpu(iagp->nfreeinos) > 0) {
2941 if ((head = imap->im_agctl[n].inofree) == -1) { 2938 if ((head = imap->im_agctl[n].inofree) == -1) {
2942 iagp->inofreefwd = cpu_to_le32(-1); 2939 iagp->inofreefwd = cpu_to_le32(-1);
2943 iagp->inofreeback = cpu_to_le32(-1); 2940 iagp->inofreeback = cpu_to_le32(-1);
2944 } else { 2941 } else {
2945 if ((rc = diIAGRead(imap, head, &hbp))) { 2942 if ((rc = diIAGRead(imap, head, &hbp))) {
2946 rcx = rc; 2943 rcx = rc;
2947 goto nextiag; 2944 goto nextiag;
2948 } 2945 }
2949 hiagp = (struct iag *) hbp->data; 2946 hiagp = (struct iag *) hbp->data;
2950 hiagp->inofreeback = iagp->iagnum; 2947 hiagp->inofreeback = iagp->iagnum;
2951 iagp->inofreefwd = cpu_to_le32(head); 2948 iagp->inofreefwd = cpu_to_le32(head);
2952 iagp->inofreeback = cpu_to_le32(-1); 2949 iagp->inofreeback = cpu_to_le32(-1);
2953 write_metapage(hbp); 2950 write_metapage(hbp);
2954 } 2951 }
2955 2952
2956 imap->im_agctl[n].inofree = 2953 imap->im_agctl[n].inofree =
2957 le32_to_cpu(iagp->iagnum); 2954 le32_to_cpu(iagp->iagnum);
2958 2955
2959 /* merge AG backed free inodes */ 2956 /* merge AG backed free inodes */
2960 imap->im_agctl[n].numfree += 2957 imap->im_agctl[n].numfree +=
2961 le32_to_cpu(iagp->nfreeinos); 2958 le32_to_cpu(iagp->nfreeinos);
2962 xnumfree += le32_to_cpu(iagp->nfreeinos); 2959 xnumfree += le32_to_cpu(iagp->nfreeinos);
2963 } 2960 }
2964 2961
2965 /* if any free extents, insert at AG free extent list */ 2962 /* if any free extents, insert at AG free extent list */
2966 if (le32_to_cpu(iagp->nfreeexts) > 0) { 2963 if (le32_to_cpu(iagp->nfreeexts) > 0) {
2967 if ((head = imap->im_agctl[n].extfree) == -1) { 2964 if ((head = imap->im_agctl[n].extfree) == -1) {
2968 iagp->extfreefwd = cpu_to_le32(-1); 2965 iagp->extfreefwd = cpu_to_le32(-1);
2969 iagp->extfreeback = cpu_to_le32(-1); 2966 iagp->extfreeback = cpu_to_le32(-1);
2970 } else { 2967 } else {
2971 if ((rc = diIAGRead(imap, head, &hbp))) { 2968 if ((rc = diIAGRead(imap, head, &hbp))) {
2972 rcx = rc; 2969 rcx = rc;
2973 goto nextiag; 2970 goto nextiag;
2974 } 2971 }
2975 hiagp = (struct iag *) hbp->data; 2972 hiagp = (struct iag *) hbp->data;
2976 hiagp->extfreeback = iagp->iagnum; 2973 hiagp->extfreeback = iagp->iagnum;
2977 iagp->extfreefwd = cpu_to_le32(head); 2974 iagp->extfreefwd = cpu_to_le32(head);
2978 iagp->extfreeback = cpu_to_le32(-1); 2975 iagp->extfreeback = cpu_to_le32(-1);
2979 write_metapage(hbp); 2976 write_metapage(hbp);
2980 } 2977 }
2981 2978
2982 imap->im_agctl[n].extfree = 2979 imap->im_agctl[n].extfree =
2983 le32_to_cpu(iagp->iagnum); 2980 le32_to_cpu(iagp->iagnum);
2984 } 2981 }
2985 2982
2986 nextiag: 2983 nextiag:
2987 write_metapage(bp); 2984 write_metapage(bp);
2988 } 2985 }
2989 2986
2990 if (xnuminos != atomic_read(&imap->im_numinos) || 2987 if (xnuminos != atomic_read(&imap->im_numinos) ||
2991 xnumfree != atomic_read(&imap->im_numfree)) { 2988 xnumfree != atomic_read(&imap->im_numfree)) {
2992 jfs_error(ipimap->i_sb, 2989 jfs_error(ipimap->i_sb,
2993 "diExtendFs: numinos or numfree incorrect"); 2990 "diExtendFs: numinos or numfree incorrect");
2994 return -EIO; 2991 return -EIO;
2995 } 2992 }
2996 2993
2997 return rcx; 2994 return rcx;
2998 } 2995 }
2999 2996
3000 2997
3001 /* 2998 /*
3002 * duplicateIXtree() 2999 * duplicateIXtree()
3003 * 3000 *
3004 * serialization: IWRITE_LOCK held on entry/exit 3001 * serialization: IWRITE_LOCK held on entry/exit
3005 * 3002 *
3006 * note: shadow page with regular inode (rel.2); 3003 * note: shadow page with regular inode (rel.2);
3007 */ 3004 */
static void duplicateIXtree(struct super_block *sb, s64 blkno,
			    int xlen, s64 *xaddr)
{
	struct jfs_superblock *j_sb;
	struct buffer_head *bh;
	struct inode *ip;
	tid_t tid;

	/* if AIT2 ipmap2 is bad, do not try to update it */
	if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT)	/* s_flag */
		return;
	ip = diReadSpecial(sb, FILESYSTEM_I, 1);
	if (ip == NULL) {
		/* secondary table unreadable: mark JFS_BAD_SAIT in the
		 * in-memory mount flags and persist it in the on-disk
		 * superblock so future mounts skip AIT2 as well
		 */
		JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
		if (readSuper(sb, &bh))
			return;
		j_sb = (struct jfs_superblock *)bh->b_data;
		j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT);

		mark_buffer_dirty(bh);
		sync_dirty_buffer(bh);	/* flush flag to disk synchronously */
		brelse(bh);
		return;
	}

	/* start transaction */
	tid = txBegin(sb, COMMIT_FORCE);
	/* update the inode map addressing structure to point to it */
	if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) {
		/* insert failed: give up on AIT2 and abort the tx */
		JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
		txAbort(tid, 1);
		goto cleanup;

	}
	/* update the inode map's inode to reflect the extension */
	ip->i_size += PSIZE;
	inode_add_bytes(ip, PSIZE);
	txCommit(tid, 1, &ip, COMMIT_FORCE);
      cleanup:
	/* txEnd() is required on both the commit and abort paths */
	txEnd(tid);
	diFreeSpecial(ip);
}
3050 3047
3051 /* 3048 /*
3052 * NAME: copy_from_dinode() 3049 * NAME: copy_from_dinode()
3053 * 3050 *
3054 * FUNCTION: Copies inode info from disk inode to in-memory inode 3051 * FUNCTION: Copies inode info from disk inode to in-memory inode
3055 * 3052 *
3056 * RETURN VALUES: 3053 * RETURN VALUES:
3057 * 0 - success 3054 * 0 - success
3058 * -ENOMEM - insufficient memory 3055 * -ENOMEM - insufficient memory
3059 */ 3056 */
3060 static int copy_from_dinode(struct dinode * dip, struct inode *ip) 3057 static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3061 { 3058 {
3062 struct jfs_inode_info *jfs_ip = JFS_IP(ip); 3059 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
3063 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); 3060 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
3064 3061
3065 jfs_ip->fileset = le32_to_cpu(dip->di_fileset); 3062 jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
3066 jfs_ip->mode2 = le32_to_cpu(dip->di_mode); 3063 jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
3067 jfs_set_inode_flags(ip); 3064 jfs_set_inode_flags(ip);
3068 3065
3069 ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff; 3066 ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff;
3070 if (sbi->umask != -1) { 3067 if (sbi->umask != -1) {
3071 ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask); 3068 ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask);
3072 /* For directories, add x permission if r is allowed by umask */ 3069 /* For directories, add x permission if r is allowed by umask */
3073 if (S_ISDIR(ip->i_mode)) { 3070 if (S_ISDIR(ip->i_mode)) {
3074 if (ip->i_mode & 0400) 3071 if (ip->i_mode & 0400)
3075 ip->i_mode |= 0100; 3072 ip->i_mode |= 0100;
3076 if (ip->i_mode & 0040) 3073 if (ip->i_mode & 0040)
3077 ip->i_mode |= 0010; 3074 ip->i_mode |= 0010;
3078 if (ip->i_mode & 0004) 3075 if (ip->i_mode & 0004)
3079 ip->i_mode |= 0001; 3076 ip->i_mode |= 0001;
3080 } 3077 }
3081 } 3078 }
3082 ip->i_nlink = le32_to_cpu(dip->di_nlink); 3079 ip->i_nlink = le32_to_cpu(dip->di_nlink);
3083 3080
3084 jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); 3081 jfs_ip->saved_uid = le32_to_cpu(dip->di_uid);
3085 if (sbi->uid == -1) 3082 if (sbi->uid == -1)
3086 ip->i_uid = jfs_ip->saved_uid; 3083 ip->i_uid = jfs_ip->saved_uid;
3087 else { 3084 else {
3088 ip->i_uid = sbi->uid; 3085 ip->i_uid = sbi->uid;
3089 } 3086 }
3090 3087
3091 jfs_ip->saved_gid = le32_to_cpu(dip->di_gid); 3088 jfs_ip->saved_gid = le32_to_cpu(dip->di_gid);
3092 if (sbi->gid == -1) 3089 if (sbi->gid == -1)
3093 ip->i_gid = jfs_ip->saved_gid; 3090 ip->i_gid = jfs_ip->saved_gid;
3094 else { 3091 else {
3095 ip->i_gid = sbi->gid; 3092 ip->i_gid = sbi->gid;
3096 } 3093 }
3097 3094
3098 ip->i_size = le64_to_cpu(dip->di_size); 3095 ip->i_size = le64_to_cpu(dip->di_size);
3099 ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); 3096 ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
3100 ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); 3097 ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
3101 ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); 3098 ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
3102 ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); 3099 ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
3103 ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); 3100 ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
3104 ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); 3101 ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
3105 ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); 3102 ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
3106 ip->i_generation = le32_to_cpu(dip->di_gen); 3103 ip->i_generation = le32_to_cpu(dip->di_gen);
3107 3104
3108 jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */ 3105 jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */
3109 jfs_ip->acl = dip->di_acl; /* as are dxd's */ 3106 jfs_ip->acl = dip->di_acl; /* as are dxd's */
3110 jfs_ip->ea = dip->di_ea; 3107 jfs_ip->ea = dip->di_ea;
3111 jfs_ip->next_index = le32_to_cpu(dip->di_next_index); 3108 jfs_ip->next_index = le32_to_cpu(dip->di_next_index);
3112 jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec); 3109 jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec);
3113 jfs_ip->acltype = le32_to_cpu(dip->di_acltype); 3110 jfs_ip->acltype = le32_to_cpu(dip->di_acltype);
3114 3111
3115 if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) { 3112 if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) {
3116 jfs_ip->dev = le32_to_cpu(dip->di_rdev); 3113 jfs_ip->dev = le32_to_cpu(dip->di_rdev);
3117 ip->i_rdev = new_decode_dev(jfs_ip->dev); 3114 ip->i_rdev = new_decode_dev(jfs_ip->dev);
3118 } 3115 }
3119 3116
3120 if (S_ISDIR(ip->i_mode)) { 3117 if (S_ISDIR(ip->i_mode)) {
3121 memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); 3118 memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384);
3122 } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) { 3119 } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) {
3123 memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); 3120 memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288);
3124 } else 3121 } else
3125 memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128); 3122 memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128);
3126 3123
3127 /* Zero the in-memory-only stuff */ 3124 /* Zero the in-memory-only stuff */
3128 jfs_ip->cflag = 0; 3125 jfs_ip->cflag = 0;
3129 jfs_ip->btindex = 0; 3126 jfs_ip->btindex = 0;
3130 jfs_ip->btorder = 0; 3127 jfs_ip->btorder = 0;
3131 jfs_ip->bxflag = 0; 3128 jfs_ip->bxflag = 0;
3132 jfs_ip->blid = 0; 3129 jfs_ip->blid = 0;
3133 jfs_ip->atlhead = 0; 3130 jfs_ip->atlhead = 0;
3134 jfs_ip->atltail = 0; 3131 jfs_ip->atltail = 0;
3135 jfs_ip->xtlid = 0; 3132 jfs_ip->xtlid = 0;
3136 return (0); 3133 return (0);
3137 } 3134 }
3138 3135
3139 /* 3136 /*
3140 * NAME: copy_to_dinode() 3137 * NAME: copy_to_dinode()
3141 * 3138 *
3142 * FUNCTION: Copies inode info from in-memory inode to disk inode 3139 * FUNCTION: Copies inode info from in-memory inode to disk inode
3143 */ 3140 */
3144 static void copy_to_dinode(struct dinode * dip, struct inode *ip) 3141 static void copy_to_dinode(struct dinode * dip, struct inode *ip)
3145 { 3142 {
3146 struct jfs_inode_info *jfs_ip = JFS_IP(ip); 3143 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
3147 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); 3144 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
3148 3145
3149 dip->di_fileset = cpu_to_le32(jfs_ip->fileset); 3146 dip->di_fileset = cpu_to_le32(jfs_ip->fileset);
3150 dip->di_inostamp = cpu_to_le32(sbi->inostamp); 3147 dip->di_inostamp = cpu_to_le32(sbi->inostamp);
3151 dip->di_number = cpu_to_le32(ip->i_ino); 3148 dip->di_number = cpu_to_le32(ip->i_ino);
3152 dip->di_gen = cpu_to_le32(ip->i_generation); 3149 dip->di_gen = cpu_to_le32(ip->i_generation);
3153 dip->di_size = cpu_to_le64(ip->i_size); 3150 dip->di_size = cpu_to_le64(ip->i_size);
3154 dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); 3151 dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks));
3155 dip->di_nlink = cpu_to_le32(ip->i_nlink); 3152 dip->di_nlink = cpu_to_le32(ip->i_nlink);
3156 if (sbi->uid == -1) 3153 if (sbi->uid == -1)
3157 dip->di_uid = cpu_to_le32(ip->i_uid); 3154 dip->di_uid = cpu_to_le32(ip->i_uid);
3158 else 3155 else
3159 dip->di_uid = cpu_to_le32(jfs_ip->saved_uid); 3156 dip->di_uid = cpu_to_le32(jfs_ip->saved_uid);
3160 if (sbi->gid == -1) 3157 if (sbi->gid == -1)
3161 dip->di_gid = cpu_to_le32(ip->i_gid); 3158 dip->di_gid = cpu_to_le32(ip->i_gid);
3162 else 3159 else
3163 dip->di_gid = cpu_to_le32(jfs_ip->saved_gid); 3160 dip->di_gid = cpu_to_le32(jfs_ip->saved_gid);
3164 jfs_get_inode_flags(jfs_ip); 3161 jfs_get_inode_flags(jfs_ip);
3165 /* 3162 /*
3166 * mode2 is only needed for storing the higher order bits. 3163 * mode2 is only needed for storing the higher order bits.
3167 * Trust i_mode for the lower order ones 3164 * Trust i_mode for the lower order ones
3168 */ 3165 */
3169 if (sbi->umask == -1) 3166 if (sbi->umask == -1)
3170 dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | 3167 dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) |
3171 ip->i_mode); 3168 ip->i_mode);
3172 else /* Leave the original permissions alone */ 3169 else /* Leave the original permissions alone */
3173 dip->di_mode = cpu_to_le32(jfs_ip->mode2); 3170 dip->di_mode = cpu_to_le32(jfs_ip->mode2);
3174 3171
3175 dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); 3172 dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
3176 dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); 3173 dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
3177 dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); 3174 dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec);
3178 dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); 3175 dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec);
3179 dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); 3176 dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
3180 dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); 3177 dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
3181 dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ 3178 dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */
3182 dip->di_acl = jfs_ip->acl; /* as are dxd's */ 3179 dip->di_acl = jfs_ip->acl; /* as are dxd's */
3183 dip->di_ea = jfs_ip->ea; 3180 dip->di_ea = jfs_ip->ea;
3184 dip->di_next_index = cpu_to_le32(jfs_ip->next_index); 3181 dip->di_next_index = cpu_to_le32(jfs_ip->next_index);
3185 dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime); 3182 dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime);
3186 dip->di_otime.tv_nsec = 0; 3183 dip->di_otime.tv_nsec = 0;
3187 dip->di_acltype = cpu_to_le32(jfs_ip->acltype); 3184 dip->di_acltype = cpu_to_le32(jfs_ip->acltype);
3188 if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) 3185 if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode))
3189 dip->di_rdev = cpu_to_le32(jfs_ip->dev); 3186 dip->di_rdev = cpu_to_le32(jfs_ip->dev);
3190 } 3187 }
3191 3188
1 /* 1 /*
2 * Copyright (C) International Business Machines Corp., 2000-2005 2 * Copyright (C) International Business Machines Corp., 2000-2005
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version. 7 * (at your option) any later version.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details. 12 * the GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18 /* 18 /*
19 * jfs_xtree.c: extent allocation descriptor B+-tree manager 19 * jfs_xtree.c: extent allocation descriptor B+-tree manager
20 */ 20 */
21 21
22 #include <linux/fs.h> 22 #include <linux/fs.h>
23 #include <linux/quotaops.h> 23 #include <linux/quotaops.h>
24 #include "jfs_incore.h" 24 #include "jfs_incore.h"
25 #include "jfs_filsys.h" 25 #include "jfs_filsys.h"
26 #include "jfs_metapage.h" 26 #include "jfs_metapage.h"
27 #include "jfs_dmap.h" 27 #include "jfs_dmap.h"
28 #include "jfs_dinode.h" 28 #include "jfs_dinode.h"
29 #include "jfs_superblock.h" 29 #include "jfs_superblock.h"
30 #include "jfs_debug.h" 30 #include "jfs_debug.h"
31 31
32 /* 32 /*
33 * xtree local flag 33 * xtree local flag
34 */ 34 */
35 #define XT_INSERT 0x00000001 35 #define XT_INSERT 0x00000001
36 36
37 /* 37 /*
38 * xtree key/entry comparison: extent offset 38 * xtree key/entry comparison: extent offset
39 * 39 *
40 * return: 40 * return:
41 * -1: k < start of extent 41 * -1: k < start of extent
42 * 0: start_of_extent <= k <= end_of_extent 42 * 0: start_of_extent <= k <= end_of_extent
43 * 1: k > end_of_extent 43 * 1: k > end_of_extent
44 */ 44 */
45 #define XT_CMP(CMP, K, X, OFFSET64)\ 45 #define XT_CMP(CMP, K, X, OFFSET64)\
46 {\ 46 {\
47 OFFSET64 = offsetXAD(X);\ 47 OFFSET64 = offsetXAD(X);\
48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ 48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
49 ((K) < OFFSET64) ? -1 : 0;\ 49 ((K) < OFFSET64) ? -1 : 0;\
50 } 50 }
51 51
52 /* write a xad entry */ 52 /* write a xad entry */
53 #define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ 53 #define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
54 {\ 54 {\
55 (XAD)->flag = (FLAG);\ 55 (XAD)->flag = (FLAG);\
56 XADoffset((XAD), (OFF));\ 56 XADoffset((XAD), (OFF));\
57 XADlength((XAD), (LEN));\ 57 XADlength((XAD), (LEN));\
58 XADaddress((XAD), (ADDR));\ 58 XADaddress((XAD), (ADDR));\
59 } 59 }
60 60
61 #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) 61 #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
62 62
63 /* get page buffer for specified block address */ 63 /* get page buffer for specified block address */
64 /* ToDo: Replace this ugly macro with a function */ 64 /* ToDo: Replace this ugly macro with a function */
65 #define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 65 #define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
66 {\ 66 {\
67 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ 67 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\
68 if (!(RC))\ 68 if (!(RC))\
69 {\ 69 {\
70 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ 70 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\
71 (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ 71 (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\
72 (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ 72 (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\
73 {\ 73 {\
74 jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ 74 jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\
75 BT_PUTPAGE(MP);\ 75 BT_PUTPAGE(MP);\
76 MP = NULL;\ 76 MP = NULL;\
77 RC = -EIO;\ 77 RC = -EIO;\
78 }\ 78 }\
79 }\ 79 }\
80 } 80 }
81 81
82 /* for consistency */ 82 /* for consistency */
83 #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) 83 #define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
84 84
85 #define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ 85 #define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) 86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
87 /* xtree entry parameter descriptor */ 87 /* xtree entry parameter descriptor */
88 struct xtsplit { 88 struct xtsplit {
89 struct metapage *mp; 89 struct metapage *mp;
90 s16 index; 90 s16 index;
91 u8 flag; 91 u8 flag;
92 s64 off; 92 s64 off;
93 s64 addr; 93 s64 addr;
94 int len; 94 int len;
95 struct pxdlist *pxdlist; 95 struct pxdlist *pxdlist;
96 }; 96 };
97 97
98 98
99 /* 99 /*
100 * statistics 100 * statistics
101 */ 101 */
102 #ifdef CONFIG_JFS_STATISTICS 102 #ifdef CONFIG_JFS_STATISTICS
103 static struct { 103 static struct {
104 uint search; 104 uint search;
105 uint fastSearch; 105 uint fastSearch;
106 uint split; 106 uint split;
107 } xtStat; 107 } xtStat;
108 #endif 108 #endif
109 109
110 110
111 /* 111 /*
112 * forward references 112 * forward references
113 */ 113 */
114 static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp, 114 static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp,
115 struct btstack * btstack, int flag); 115 struct btstack * btstack, int flag);
116 116
117 static int xtSplitUp(tid_t tid, 117 static int xtSplitUp(tid_t tid,
118 struct inode *ip, 118 struct inode *ip,
119 struct xtsplit * split, struct btstack * btstack); 119 struct xtsplit * split, struct btstack * btstack);
120 120
121 static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, 121 static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
122 struct metapage ** rmpp, s64 * rbnp); 122 struct metapage ** rmpp, s64 * rbnp);
123 123
124 static int xtSplitRoot(tid_t tid, struct inode *ip, 124 static int xtSplitRoot(tid_t tid, struct inode *ip,
125 struct xtsplit * split, struct metapage ** rmpp); 125 struct xtsplit * split, struct metapage ** rmpp);
126 126
127 #ifdef _STILL_TO_PORT 127 #ifdef _STILL_TO_PORT
128 static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, 128 static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
129 xtpage_t * fp, struct btstack * btstack); 129 xtpage_t * fp, struct btstack * btstack);
130 130
131 static int xtSearchNode(struct inode *ip, 131 static int xtSearchNode(struct inode *ip,
132 xad_t * xad, 132 xad_t * xad,
133 int *cmpp, struct btstack * btstack, int flag); 133 int *cmpp, struct btstack * btstack, int flag);
134 134
135 static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); 135 static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
136 #endif /* _STILL_TO_PORT */ 136 #endif /* _STILL_TO_PORT */
137 137
138 /* 138 /*
139 * xtLookup() 139 * xtLookup()
140 * 140 *
141 * function: map a single page into a physical extent; 141 * function: map a single page into a physical extent;
142 */ 142 */
/* map a single page at lstart into a physical extent; *pflag/*paddr/*plen
 * receive the xad flag, physical address and covered length; no_check
 * skips the beyond-eof sanity test
 */
int xtLookup(struct inode *ip, s64 lstart,
	     s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check)
{
	int rc = 0;
	struct btstack btstack;	/* xtSearch traverse stack */
	int cmp;
	s64 bn;
	struct metapage *mp;
	xtpage_t *p;
	int index;
	xad_t *xad;
	s64 next, size, xoff, xend;
	int xlen;
	s64 xaddr;

	/* default: no mapping found, full requested length */
	*paddr = 0;
	*plen = llen;

	if (!no_check) {
		/* is lookup offset beyond eof ? */
		size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
		    JFS_SBI(ip->i_sb)->l2bsize;
		if (lstart >= size) {
			jfs_err("xtLookup: lstart (0x%lx) >= size (0x%lx)",
				(ulong) lstart, (ulong) size);
			return 0;
		}
	}

	/*
	 * search for the xad entry covering the logical extent
	 */
//search:
	if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) {
		jfs_err("xtLookup: xtSearch returned %d", rc);
		return rc;
	}

	/*
	 * compute the physical extent covering logical extent
	 *
	 * N.B. search may have failed (e.g., hole in sparse file),
	 * and returned the index of the next entry.
	 */
	/* retrieve search result */
	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);

	/* is xad found covering start of logical extent ?
	 * lstart is a page start address,
	 * i.e., lstart cannot start in a hole;
	 */
	if (cmp) {
		/* hole: clamp length to start of the next extent, if any */
		if (next)
			*plen = min(next - lstart, llen);
		goto out;
	}

	/*
	 * lxd covered by xad
	 */
	xad = &p->xad[index];
	xoff = offsetXAD(xad);
	xlen = lengthXAD(xad);
	xend = xoff + xlen;
	xaddr = addressXAD(xad);

	/* initialize new pxd */
	*pflag = xad->flag;
	*paddr = xaddr + (lstart - xoff);
	/* a page must be fully covered by an xad */
	*plen = min(xend - lstart, llen);

      out:
	XT_PUTPAGE(mp);

	return rc;
}
220 220
221 221
/*
 *	xtLookupList()
 *
 * function: map a single logical extent into a list of physical extent;
 *
 * parameter:
 *	struct inode	*ip,
 *	struct lxdlist	*lxdlist,	lxd list (in)
 *	struct xadlist	*xadlist,	xad list (in/out)
 *	int		flag)
 *
 * NOTE(review): <flag> is never read in the body below — presumably a
 * reserved/compat parameter; confirm against callers before removing.
 *
 * coverage of lxd by xad under assumption of
 * . lxd's are ordered and disjoint.
 * . xad's are ordered and disjoint.
 *
 * return:
 *	0:	success
 *	or a negative errno propagated from xtSearch()/XT_GETPAGE().
 *
 * note: a page being written (even a single byte) is backed fully,
 *	except the last page which is only backed with blocks
 *	required to cover the last byte;
 *	the extent backing a page is fully contained within an xad;
 *
 * The routine is a goto-driven merge of the two sorted lists: labels
 * search/compare/compare1/compare2/cover advance whichever cursor
 * (lxd or xad) is behind, emitting one output pxd per overlap.
 */
int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
		 struct xadlist * xadlist, int flag)
{
	int rc = 0;
	struct btstack btstack;
	int cmp;
	s64 bn;
	struct metapage *mp;
	xtpage_t *p;
	int index;
	lxd_t *lxd;
	xad_t *xad, *pxd;
	s64 size, lstart, lend, xstart, xend, pstart;
	s64 llen, xlen, plen;
	s64 xaddr, paddr;
	int nlxd, npxd, maxnpxd;

	/* output cursor: start with an empty xad list */
	npxd = xadlist->nxad = 0;
	maxnpxd = xadlist->maxnxad;
	pxd = xadlist->xad;

	/* input cursor: first logical extent descriptor */
	nlxd = lxdlist->nlxd;
	lxd = lxdlist->lxd;

	lstart = offsetLXD(lxd);
	llen = lengthLXD(lxd);
	lend = lstart + llen;

	/* file size in filesystem blocks (rounded up); any lxd starting
	 * at or beyond this is past EOF and terminates the mapping.
	 */
	size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
	    JFS_SBI(ip->i_sb)->l2bsize;

	/*
	 * search for the xad entry covering the logical extent
	 * (re-entered whenever the linear scan runs off a leaf page)
	 */
      search:
	if (lstart >= size)
		return 0;

	if ((rc = xtSearch(ip, lstart, NULL, &cmp, &btstack, 0)))
		return rc;

	/*
	 * compute the physical extent covering logical extent
	 *
	 * N.B. search may have failed (e.g., hole in sparse file),
	 * and returned the index of the next entry.
	 */
//map:
	/* retrieve search result */
	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);

	/* is xad on the next sibling page ? */
	if (index == le16_to_cpu(p->header.nextindex)) {
		if (p->header.flag & BT_ROOT)
			goto mapend;

		if ((bn = le64_to_cpu(p->header.next)) == 0)
			goto mapend;

		XT_PUTPAGE(mp);

		/* get next sibling page */
		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
		if (rc)
			return rc;

		index = XTENTRYSTART;
	}

	xad = &p->xad[index];

	/*
	 * is lxd covered by xad ?
	 */
      compare:
	/* refresh the xad-side window from the current entry */
	xstart = offsetXAD(xad);
	xlen = lengthXAD(xad);
	xend = xstart + xlen;
	xaddr = addressXAD(xad);

      compare1:
	if (xstart < lstart)
		goto compare2;

	/* (lstart <= xstart) */

	/* lxd is NOT covered by xad */
	if (lend <= xstart) {
		/*
		 * get next lxd
		 */
		if (--nlxd == 0)
			goto mapend;
		lxd++;

		lstart = offsetLXD(lxd);
		llen = lengthLXD(lxd);
		lend = lstart + llen;
		if (lstart >= size)
			goto mapend;

		/* compare with the current xad */
		goto compare1;
	}
	/* lxd is covered by xad */
	else {			/* (xstart < lend) */

		/* initialize new pxd: overlap begins at the xad start */
		pstart = xstart;
		plen = min(lend - xstart, xlen);
		paddr = xaddr;

		goto cover;
	}

	/* (xstart < lstart) */
      compare2:
	/* lxd is covered by xad */
	if (lstart < xend) {
		/* initialize new pxd: overlap begins inside the xad,
		 * so offset the physical address by (lstart - xstart)
		 */
		pstart = lstart;
		plen = min(xend - lstart, llen);
		paddr = xaddr + (lstart - xstart);

		goto cover;
	}
	/* lxd is NOT covered by xad */
	else {			/* (xend <= lstart) */

		/*
		 * get next xad
		 *
		 * linear search next xad covering lxd on
		 * the current xad page, and then tree search
		 */
		if (index == le16_to_cpu(p->header.nextindex) - 1) {
			if (p->header.flag & BT_ROOT)
				goto mapend;

			/* end of leaf: unpin and restart via tree search */
			XT_PUTPAGE(mp);
			goto search;
		} else {
			index++;
			xad++;

			/* compare with new xad */
			goto compare;
		}
	}

	/*
	 * lxd is covered by xad and a new pxd has been initialized
	 * (lstart <= xstart < lend) or (xstart < lstart < xend)
	 */
      cover:
	/* finalize pxd corresponding to current xad */
	XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr);

	/* stop once the caller-provided output array is full */
	if (++npxd >= maxnpxd)
		goto mapend;
	pxd++;

	/*
	 * lxd is fully covered by xad
	 */
	if (lend <= xend) {
		/*
		 * get next lxd
		 */
		if (--nlxd == 0)
			goto mapend;
		lxd++;

		lstart = offsetLXD(lxd);
		llen = lengthLXD(lxd);
		lend = lstart + llen;
		if (lstart >= size)
			goto mapend;

		/*
		 * test for old xad covering new lxd
		 * (old xstart < new lstart)
		 */
		goto compare2;
	}
	/*
	 * lxd is partially covered by xad
	 */
	else {			/* (xend < lend) */

		/*
		 * get next xad
		 *
		 * linear search next xad covering lxd on
		 * the current xad page, and then next xad page search
		 */
		if (index == le16_to_cpu(p->header.nextindex) - 1) {
			if (p->header.flag & BT_ROOT)
				goto mapend;

			if ((bn = le64_to_cpu(p->header.next)) == 0)
				goto mapend;

			XT_PUTPAGE(mp);

			/* get next sibling page */
			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
			if (rc)
				return rc;

			index = XTENTRYSTART;
			xad = &p->xad[index];
		} else {
			index++;
			xad++;
		}

		/*
		 * test for new xad covering old lxd
		 * (old lstart < new xstart)
		 */
		goto compare;
	}

      mapend:
	/* publish how many output entries were actually produced */
	xadlist->nxad = npxd;

//out:
	XT_PUTPAGE(mp);

	return rc;
}
477 477
478 478
/*
 *	xtSearch()
 *
 * function:	search for the xad entry covering specified offset.
 *
 * parameters:
 *	ip	- file object;
 *	xoff	- extent offset;
 *	nextp	- address of next extent (if any) for search miss
 *		  (may be NULL if the caller does not need it);
 *	cmpp	- comparison result: 0 = exact hit, non-zero = miss;
 *	btstack	- traverse stack;
 *	flag	- search process flag (XT_INSERT);
 *
 * returns:
 *	btstack contains (bn, index) of search path traversed to the entry.
 *	*cmpp is set to result of comparison with the entry returned.
 *	the page containing the entry is pinned at exit — the caller is
 *	responsible for XT_PUTPAGE() on it.
 *
 * Two strategies per page: a sequential-access fast path (probe the
 * previously-hit leaf index and its successor), falling back to binary
 * search.  When XT_INSERT is set, btstack->nsplit tracks how many
 * consecutive full pages would need splitting along the path.
 */
static int xtSearch(struct inode *ip, s64 xoff,	s64 *nextp,
		    int *cmpp, struct btstack * btstack, int flag)
{
	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
	int rc = 0;
	int cmp = 1;		/* init for empty page */
	s64 bn;			/* block number */
	struct metapage *mp;	/* page buffer */
	xtpage_t *p;		/* page */
	xad_t *xad;
	int base, index, lim, btindex;
	struct btframe *btsp;
	int nsplit = 0;		/* number of pages to split */
	s64 t64;
	s64 next = 0;		/* offset of next entry on a miss; 0 = none */

	INCREMENT(xtStat.search);

	BT_CLR(btstack);

	btstack->nsplit = 0;

	/*
	 * search down tree from root:
	 *
	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
	 *
	 * if entry with search key K is not found
	 * internal page search find the entry with largest key Ki
	 * less than K which point to the child page to search;
	 * leaf page search find the entry with smallest key Kj
	 * greater than K so that the returned index is the position of
	 * the entry to be shifted right for insertion of new entry.
	 * for empty tree, search key is greater than any key of the tree.
	 *
	 * by convention, root bn = 0.
	 */
	for (bn = 0;;) {
		/* get/pin the page to search */
		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
		if (rc)
			return rc;

		/* try sequential access heuristics with the previous
		 * access entry in target leaf page:
		 * once search narrowed down into the target leaf,
		 * key must either match an entry in the leaf or
		 * key entry does not exist in the tree;
		 */
//fastSearch:
		if ((jfs_ip->btorder & BT_SEQUENTIAL) &&
		    (p->header.flag & BT_LEAF) &&
		    (index = jfs_ip->btindex) <
		    le16_to_cpu(p->header.nextindex)) {
			xad = &p->xad[index];
			t64 = offsetXAD(xad);
			if (xoff < t64 + lengthXAD(xad)) {
				if (xoff >= t64) {
					*cmpp = 0;
					goto out;
				}

				/* stop sequential access heuristics */
				goto binarySearch;
			} else {	/* (t64 + lengthXAD(xad)) <= xoff */

				/* try next sequential entry */
				index++;
				if (index <
				    le16_to_cpu(p->header.nextindex)) {
					xad++;
					t64 = offsetXAD(xad);
					if (xoff < t64 + lengthXAD(xad)) {
						if (xoff >= t64) {
							*cmpp = 0;
							goto out;
						}

						/* miss: key falls between
						 * previous and this entry
						 */
						*cmpp = 1;
						next = t64;
						goto out;
					}

					/* (xoff >= t64 + lengthXAD(xad));
					 * matching entry may be further out:
					 * stop heuristic search
					 */
					/* stop sequential access heuristics */
					goto binarySearch;
				}

				/* (index == p->header.nextindex);
				 * miss: key entry does not exist in
				 * the target leaf/tree
				 */
				*cmpp = 1;
				goto out;
			}

			/*
			 * if hit, return index of the entry found, and
			 * if miss, where new entry with search key is
			 * to be inserted;
			 *
			 * N.B. the out: label sits INSIDE the fast-path
			 * block: only the heuristic paths above jump here.
			 */
		      out:
			/* compute number of pages to split */
			if (flag & XT_INSERT) {
				if (p->header.nextindex ==	/* little-endian */
				    p->header.maxentry)
					nsplit++;
				else
					nsplit = 0;
				btstack->nsplit = nsplit;
			}

			/* save search result */
			btsp = btstack->top;
			btsp->bn = bn;
			btsp->index = index;
			btsp->mp = mp;

			/* update sequential access heuristics */
			jfs_ip->btindex = index;

			if (nextp)
				*nextp = next;

			INCREMENT(xtStat.fastSearch);
			return 0;
		}

		/* well, ... full search now */
	      binarySearch:
		lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;

		/*
		 * binary search with search key K on the current page
		 */
		for (base = XTENTRYSTART; lim; lim >>= 1) {
			index = base + (lim >> 1);

			XT_CMP(cmp, xoff, &p->xad[index], t64);
			if (cmp == 0) {
				/*
				 *	search hit
				 */
				/* search hit - leaf page:
				 * return the entry found
				 */
				if (p->header.flag & BT_LEAF) {
					*cmpp = cmp;

					/* compute number of pages to split */
					if (flag & XT_INSERT) {
						if (p->header.nextindex ==
						    p->header.maxentry)
							nsplit++;
						else
							nsplit = 0;
						btstack->nsplit = nsplit;
					}

					/* save search result */
					btsp = btstack->top;
					btsp->bn = bn;
					btsp->index = index;
					btsp->mp = mp;

					/* init sequential access heuristics */
					btindex = jfs_ip->btindex;
					if (index == btindex ||
					    index == btindex + 1)
						jfs_ip->btorder = BT_SEQUENTIAL;
					else
						jfs_ip->btorder = BT_RANDOM;
					jfs_ip->btindex = index;

					return 0;
				}
				/* search hit - internal page:
				 * descend/search its child page
				 */
				if (index < le16_to_cpu(p->header.nextindex)-1)
					next = offsetXAD(&p->xad[index + 1]);
				goto next;
			}

			if (cmp > 0) {
				base = index + 1;
				--lim;
			}
		}

		/*
		 * search miss
		 *
		 * base is the smallest index with key (Kj) greater than
		 * search key (K) and may be zero or maxentry index.
		 */
		if (base < le16_to_cpu(p->header.nextindex))
			next = offsetXAD(&p->xad[base]);
		/*
		 * search miss - leaf page:
		 *
		 * return location of entry (base) where new entry with
		 * search key K is to be inserted.
		 */
		if (p->header.flag & BT_LEAF) {
			*cmpp = cmp;

			/* compute number of pages to split */
			if (flag & XT_INSERT) {
				if (p->header.nextindex ==
				    p->header.maxentry)
					nsplit++;
				else
					nsplit = 0;
				btstack->nsplit = nsplit;
			}

			/* save search result */
			btsp = btstack->top;
			btsp->bn = bn;
			btsp->index = base;
			btsp->mp = mp;

			/* init sequential access heuristics */
			btindex = jfs_ip->btindex;
			if (base == btindex || base == btindex + 1)
				jfs_ip->btorder = BT_SEQUENTIAL;
			else
				jfs_ip->btorder = BT_RANDOM;
			jfs_ip->btindex = base;

			if (nextp)
				*nextp = next;

			return 0;
		}

		/*
		 * search miss - non-leaf page:
		 *
		 * if base is non-zero, decrement base by one to get the parent
		 * entry of the child page to search.
		 */
		index = base ? base - 1 : base;

		/*
		 * go down to child page
		 */
	      next:
		/* update number of pages to split */
		if (p->header.nextindex == p->header.maxentry)
			nsplit++;
		else
			nsplit = 0;

		/* push (bn, index) of the parent page/entry */
		if (BT_STACK_FULL(btstack)) {
			jfs_error(ip->i_sb, "stack overrun in xtSearch!");
			XT_PUTPAGE(mp);
			return -EIO;
		}
		BT_PUSH(btstack, bn, index);

		/* get the child page block number */
		bn = addressXAD(&p->xad[index]);

		/* unpin the parent page */
		XT_PUTPAGE(mp);
	}
}
774 774
/*
 *	xtInsert()
 *
 * function:
 *	insert a new xad entry mapping logical extent (xoff, xlen) to a
 *	physical extent, allocating the physical extent if the caller
 *	did not supply one;
 *
 * parameter:
 *	tid	- transaction id;
 *	ip	- file object;
 *	xflag	- extent flag (XAD_NOTRECORDED):
 *	xoff	- extent offset;
 *	xlen	- extent length;
 *	xaddrp	- extent address pointer (in/out):
 *		if (*xaddrp)
 *			caller allocated data extent at *xaddrp;
 *		else
 *			allocate data extent and return its xaddr;
 *	flag	-
 *		NOTE(review): not referenced in the body below; presumably
 *		reserved — confirm against callers.
 *
 * return:
 *	0 on success; -EEXIST if the range is already mapped or would
 *	overlap the next extent; other negative errnos from search,
 *	quota or block allocation, or page split.
 */
int xtInsert(tid_t tid,		/* transaction id */
	     struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp,
	     int flag)
{
	int rc = 0;
	s64 xaddr, hint;
	struct metapage *mp;	/* meta-page buffer */
	xtpage_t *p;		/* base B+-tree index page */
	s64 bn;
	int index, nextindex;
	struct btstack btstack;	/* traverse stack */
	struct xtsplit split;	/* split information */
	xad_t *xad;
	int cmp;
	s64 next;
	struct tlock *tlck;
	struct xtlock *xtlck;

	jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);

	/*
	 * search for the entry location at which to insert:
	 *
	 * xtFastSearch() and xtSearch() both returns (leaf page
	 * pinned, index at which to insert).
	 * n.b. xtSearch() may return index of maxentry of
	 * the full page.
	 */
	if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT)))
		return rc;

	/* retrieve search result */
	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);

	/* This test must follow XT_GETSEARCH since mp must be valid if
	 * we branch to out: */
	if ((cmp == 0) || (next && (xlen > next - xoff))) {
		rc = -EEXIST;
		goto out;
	}

	/*
	 * allocate data extent requested
	 *
	 * allocation hint: last xad
	 */
	if ((xaddr = *xaddrp) == 0) {
		/* hint with the block just past the previous entry's
		 * extent, for on-disk contiguity
		 */
		if (index > XTENTRYSTART) {
			xad = &p->xad[index - 1];
			hint = addressXAD(xad) + lengthXAD(xad) - 1;
		} else
			hint = 0;
		/* charge quota first; free it again if dbAlloc fails */
		if ((rc = DQUOT_ALLOC_BLOCK(ip, xlen)))
			goto out;
		if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
			DQUOT_FREE_BLOCK(ip, xlen);
			goto out;
		}
	}

	/*
	 * insert entry for new extent
	 */
	xflag |= XAD_NEW;

	/*
	 * if the leaf page is full, split the page and
	 * propagate up the router entry for the new page from split
	 *
	 * The xtSplitUp() will insert the entry and unpin the leaf page.
	 */
	nextindex = le16_to_cpu(p->header.nextindex);
	if (nextindex == le16_to_cpu(p->header.maxentry)) {
		split.mp = mp;
		split.index = index;
		split.flag = xflag;
		split.off = xoff;
		split.len = xlen;
		split.addr = xaddr;
		split.pxdlist = NULL;
		if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
			/* undo data extent allocation (only if we, not
			 * the caller, allocated it above)
			 */
			if (*xaddrp == 0) {
				dbFree(ip, xaddr, (s64) xlen);
				DQUOT_FREE_BLOCK(ip, xlen);
			}
			return rc;
		}

		*xaddrp = xaddr;
		return 0;
	}

	/*
	 * insert the new entry into the leaf page
	 */
	/*
	 * acquire a transaction lock on the leaf page;
	 *
	 * action: xad insertion/extension;
	 */
	BT_MARK_DIRTY(mp, ip);

	/* if insert into middle, shift right remaining entries. */
	if (index < nextindex)
		memmove(&p->xad[index + 1], &p->xad[index],
			(nextindex - index) * sizeof(xad_t));

	/* insert the new entry: mark the entry NEW */
	xad = &p->xad[index];
	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);

	/* advance next available entry index (on-disk little-endian) */
	le16_add_cpu(&p->header.nextindex, 1);

	/* Don't log it if there are no links to the file */
	if (!test_cflag(COMMIT_Nolink, ip)) {
		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
		xtlck = (struct xtlock *) & tlck->lock;
		/* widen the low-water-mark range to cover this insert */
		xtlck->lwm.offset =
		    (xtlck->lwm.offset) ? min(index,
					      (int)xtlck->lwm.offset) : index;
		xtlck->lwm.length =
		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
	}

	*xaddrp = xaddr;

      out:
	/* unpin the leaf page */
	XT_PUTPAGE(mp);

	return rc;
}
930 929
931 930
/*
 *	xtSplitUp()
 *
 * function:
 *	split full pages as propagating insertion up the tree
 *
 * parameter:
 *	tid	- transaction id;
 *	ip	- file object;
 *	split	- entry parameter descriptor;
 *	btstack	- traverse stack from xtSearch()
 *
 * return:
 *	0 on success; negative errno on failure.  All metapages pinned
 *	along the way are released before returning on every path.
 */
static int
xtSplitUp(tid_t tid,
	  struct inode *ip, struct xtsplit * split, struct btstack * btstack)
{
	int rc = 0;
	struct metapage *smp;
	xtpage_t *sp;		/* split page */
	struct metapage *rmp;
	s64 rbn;		/* new right page block number */
	struct metapage *rcmp;
	xtpage_t *rcp;		/* right child page */
	s64 rcbn;		/* right child page block number */
	int skip;		/* index of entry of insertion */
	int nextindex;		/* next available entry index of p */
	struct btframe *parent;	/* parent page entry on traverse stack */
	xad_t *xad;
	s64 xaddr;
	int xlen;
	int nsplit;		/* number of pages split */
	struct pxdlist pxdlist;
	pxd_t *pxd;
	struct tlock *tlck;
	struct xtlock *xtlck;

	smp = split->mp;
	sp = XT_PAGE(ip, smp);

	/* is inode xtree root extension/inline EA area free ?
	 *
	 * If the root still fits after reclaiming the inline EA slots,
	 * grow the root in place instead of splitting - no allocation
	 * and no new page needed.
	 */
	if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) &&
	    (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) &&
	    (JFS_IP(ip)->mode2 & INLINEEA)) {
		sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT);
		JFS_IP(ip)->mode2 &= ~INLINEEA;

		BT_MARK_DIRTY(smp, ip);
		/*
		 * acquire a transaction lock on the leaf page;
		 *
		 * action: xad insertion/extension;
		 */

		/* if insert into middle, shift right remaining entries. */
		skip = split->index;
		nextindex = le16_to_cpu(sp->header.nextindex);
		if (skip < nextindex)
			memmove(&sp->xad[skip + 1], &sp->xad[skip],
				(nextindex - skip) * sizeof(xad_t));

		/* insert the new entry: mark the entry NEW */
		xad = &sp->xad[skip];
		XT_PUTENTRY(xad, split->flag, split->off, split->len,
			    split->addr);

		/* advance next available entry index */
		le16_add_cpu(&sp->header.nextindex, 1);

		/* Don't log it if there are no links to the file */
		if (!test_cflag(COMMIT_Nolink, ip)) {
			tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
			xtlck = (struct xtlock *) & tlck->lock;
			xtlck->lwm.offset = (xtlck->lwm.offset) ?
			    min(skip, (int)xtlck->lwm.offset) : skip;
			xtlck->lwm.length =
			    le16_to_cpu(sp->header.nextindex) -
			    xtlck->lwm.offset;
		}

		return 0;
	}

	/*
	 * allocate new index blocks to cover index page split(s)
	 *
	 * allocation hint: ?
	 */
	if (split->pxdlist == NULL) {
		nsplit = btstack->nsplit;
		split->pxdlist = &pxdlist;
		pxdlist.maxnpxd = pxdlist.npxd = 0;
		pxd = &pxdlist.pxd[0];
		xlen = JFS_SBI(ip->i_sb)->nbperpage;
		for (; nsplit > 0; nsplit--, pxd++) {
			if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr))
			    == 0) {
				PXDaddress(pxd, xaddr);
				PXDlength(pxd, xlen);

				pxdlist.maxnpxd++;

				continue;
			}

			/* undo allocation */

			XT_PUTPAGE(smp);
			return rc;
		}
	}

	/*
	 * Split leaf page <sp> into <sp> and a new right page <rp>.
	 *
	 * The split routines insert the new entry into the leaf page,
	 * and acquire txLock as appropriate.
	 * return <rp> pinned and its block number <rpbn>.
	 */
	rc = (sp->header.flag & BT_ROOT) ?
	    xtSplitRoot(tid, ip, split, &rmp) :
	    xtSplitPage(tid, ip, split, &rmp, &rbn);

	XT_PUTPAGE(smp);

	if (rc)
		return -EIO;
	/*
	 * propagate up the router entry for the leaf page just split
	 *
	 * insert a router entry for the new page into the parent page,
	 * propagate the insert/split up the tree by walking back the stack
	 * of (bn of parent page, index of child page entry in parent page)
	 * that were traversed during the search for the page that split.
	 *
	 * the propagation of insert/split up the tree stops if the root
	 * splits or the page inserted into doesn't have to split to hold
	 * the new entry.
	 *
	 * the parent entry for the split page remains the same, and
	 * a new entry is inserted at its right with the first key and
	 * block number of the new right page.
	 *
	 * There are a maximum of 3 pages pinned at any time:
	 * right child, left parent and right parent (when the parent splits)
	 * to keep the child page pinned while working on the parent.
	 * make sure that all pins are released at exit.
	 */
	while ((parent = BT_POP(btstack)) != NULL) {
		/* parent page specified by stack frame <parent> */

		/* keep current child pages <rcp> pinned */
		rcmp = rmp;
		rcbn = rbn;
		rcp = XT_PAGE(ip, rcmp);

		/*
		 * insert router entry in parent for new right child page <rp>
		 */
		/* get/pin the parent page <sp> */
		XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
		if (rc) {
			XT_PUTPAGE(rcmp);
			return rc;
		}

		/*
		 * The new key entry goes ONE AFTER the index of parent entry,
		 * because the split was to the right.
		 */
		skip = parent->index + 1;

		/*
		 * split or shift right remaining entries of the parent page
		 */
		nextindex = le16_to_cpu(sp->header.nextindex);
		/*
		 * parent page is full - split the parent page
		 */
		if (nextindex == le16_to_cpu(sp->header.maxentry)) {
			/* init for parent page split */
			split->mp = smp;
			split->index = skip;	/* index at insert */
			split->flag = XAD_NEW;
			split->off = offsetXAD(&rcp->xad[XTENTRYSTART]);
			split->len = JFS_SBI(ip->i_sb)->nbperpage;
			split->addr = rcbn;

			/* unpin previous right child page */
			XT_PUTPAGE(rcmp);

			/* The split routines insert the new entry,
			 * and acquire txLock as appropriate.
			 * return <rp> pinned and its block number <rpbn>.
			 */
			rc = (sp->header.flag & BT_ROOT) ?
			    xtSplitRoot(tid, ip, split, &rmp) :
			    xtSplitPage(tid, ip, split, &rmp, &rbn);
			if (rc) {
				XT_PUTPAGE(smp);
				return rc;
			}

			XT_PUTPAGE(smp);
			/* keep new child page <rp> pinned */
		}
		/*
		 * parent page is not full - insert in parent page
		 */
		else {
			/*
			 * insert router entry in parent for the right child
			 * page from the first entry of the right child page:
			 */
			/*
			 * acquire a transaction lock on the parent page;
			 *
			 * action: router xad insertion;
			 */
			BT_MARK_DIRTY(smp, ip);

			/*
			 * if insert into middle, shift right remaining entries
			 */
			if (skip < nextindex)
				memmove(&sp->xad[skip + 1], &sp->xad[skip],
					(nextindex -
					 skip) << L2XTSLOTSIZE);

			/* insert the router entry */
			xad = &sp->xad[skip];
			XT_PUTENTRY(xad, XAD_NEW,
				    offsetXAD(&rcp->xad[XTENTRYSTART]),
				    JFS_SBI(ip->i_sb)->nbperpage, rcbn);

			/* advance next available entry index. */
			le16_add_cpu(&sp->header.nextindex, 1);

			/* Don't log it if there are no links to the file */
			if (!test_cflag(COMMIT_Nolink, ip)) {
				tlck = txLock(tid, ip, smp,
					      tlckXTREE | tlckGROW);
				xtlck = (struct xtlock *) & tlck->lock;
				xtlck->lwm.offset = (xtlck->lwm.offset) ?
				    min(skip, (int)xtlck->lwm.offset) : skip;
				xtlck->lwm.length =
				    le16_to_cpu(sp->header.nextindex) -
				    xtlck->lwm.offset;
			}

			/* unpin parent page */
			XT_PUTPAGE(smp);

			/* exit propagate up */
			break;
		}
	}

	/* unpin current right page */
	XT_PUTPAGE(rmp);

	return 0;
}
1199 1195
1200 1196
/*
 *	xtSplitPage()
 *
 * function:
 *	split a full non-root page into
 *	original/split/left page and new right page
 *	i.e., the original/split page remains as left page.
 *
 * parameter:
 *	int		tid,
 *	struct inode	*ip,
 *	struct xtsplit	*split,
 *	struct metapage	**rmpp,
 *	u64		*rbnp,
 *
 * return:
 *	0 on success with *rmpp/*rbnp set to the pinned new right page
 *	and its block number; negative errno on failure (quota rolled
 *	back via clean_up).
 */
static int
xtSplitPage(tid_t tid, struct inode *ip,
	    struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp)
{
	int rc = 0;
	struct metapage *smp;
	xtpage_t *sp;
	struct metapage *rmp;
	xtpage_t *rp;		/* new right page allocated */
	s64 rbn;		/* new right page block number */
	struct metapage *mp;
	xtpage_t *p;
	s64 nextbn;
	int skip, maxentry, middle, righthalf, n;
	xad_t *xad;
	struct pxdlist *pxdlist;
	pxd_t *pxd;
	struct tlock *tlck;
	struct xtlock *sxtlck = NULL, *rxtlck = NULL;
	int quota_allocation = 0;

	smp = split->mp;
	sp = XT_PAGE(ip, smp);

	INCREMENT(xtStat.split);

	/* consume the next preallocated pxd for the new right page */
	pxdlist = split->pxdlist;
	pxd = &pxdlist->pxd[pxdlist->npxd];
	pxdlist->npxd++;
	rbn = addressPXD(pxd);

	/* Allocate blocks to quota. */
	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
		rc = -EDQUOT;
		goto clean_up;
	}

	quota_allocation += lengthPXD(pxd);

	/*
	 * allocate the new right page for the split
	 */
	rmp = get_metapage(ip, rbn, PSIZE, 1);
	if (rmp == NULL) {
		rc = -EIO;
		goto clean_up;
	}

	jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);

	BT_MARK_DIRTY(rmp, ip);
	/*
	 * action: new page;
	 */

	rp = (xtpage_t *) rmp->data;
	rp->header.self = *pxd;
	rp->header.flag = sp->header.flag & BT_TYPE;
	rp->header.maxentry = sp->header.maxentry;	/* little-endian */
	rp->header.nextindex = cpu_to_le16(XTENTRYSTART);

	BT_MARK_DIRTY(smp, ip);
	/* Don't log it if there are no links to the file */
	if (!test_cflag(COMMIT_Nolink, ip)) {
		/*
		 * acquire a transaction lock on the new right page;
		 */
		tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
		rxtlck = (struct xtlock *) & tlck->lock;
		rxtlck->lwm.offset = XTENTRYSTART;
		/*
		 * acquire a transaction lock on the split page
		 */
		tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
		sxtlck = (struct xtlock *) & tlck->lock;
	}

	/*
	 * initialize/update sibling pointers of <sp> and <rp>
	 */
	nextbn = le64_to_cpu(sp->header.next);
	rp->header.next = cpu_to_le64(nextbn);
	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
	sp->header.next = cpu_to_le64(rbn);

	skip = split->index;

	/*
	 * sequential append at tail (after last entry of last page)
	 *
	 * if splitting the last page on a level because of appending
	 * a entry to it (skip is maxentry), it's likely that the access is
	 * sequential. adding an empty page on the side of the level is less
	 * work and can push the fill factor much higher than normal.
	 * if we're wrong it's no big deal - we will do the split the right
	 * way next time.
	 * (it may look like it's equally easy to do a similar hack for
	 * reverse sorted data, that is, split the tree left, but it's not.
	 * Be my guest.)
	 */
	if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) {
		/*
		 * acquire a transaction lock on the new/right page;
		 *
		 * action: xad insertion;
		 */
		/* insert entry at the first entry of the new right page */
		xad = &rp->xad[XTENTRYSTART];
		XT_PUTENTRY(xad, split->flag, split->off, split->len,
			    split->addr);

		rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);

		if (!test_cflag(COMMIT_Nolink, ip)) {
			/* rxtlck->lwm.offset = XTENTRYSTART; */
			rxtlck->lwm.length = 1;
		}

		*rmpp = rmp;
		*rbnp = rbn;

		jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
		return 0;
	}

	/*
	 * non-sequential insert (at possibly middle page)
	 */

	/*
	 * update previous pointer of old next/right page of <sp>
	 */
	if (nextbn != 0) {
		XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
		if (rc) {
			XT_PUTPAGE(rmp);
			goto clean_up;
		}

		BT_MARK_DIRTY(mp, ip);
		/*
		 * acquire a transaction lock on the next page;
		 *
		 * action:sibling pointer update;
		 */
		if (!test_cflag(COMMIT_Nolink, ip))
			tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);

		p->header.prev = cpu_to_le64(rbn);

		/* sibling page may have been updated previously, or
		 * it may be updated later;
		 */

		XT_PUTPAGE(mp);
	}

	/*
	 * split the data between the split and new/right pages
	 */
	maxentry = le16_to_cpu(sp->header.maxentry);
	middle = maxentry >> 1;
	righthalf = maxentry - middle;

	/*
	 * skip index in old split/left page - insert into left page:
	 */
	if (skip <= middle) {
		/* move right half of split page to the new right page */
		memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
			righthalf << L2XTSLOTSIZE);

		/* shift right tail of left half to make room for new entry */
		if (skip < middle)
			memmove(&sp->xad[skip + 1], &sp->xad[skip],
				(middle - skip) << L2XTSLOTSIZE);

		/* insert new entry */
		xad = &sp->xad[skip];
		XT_PUTENTRY(xad, split->flag, split->off, split->len,
			    split->addr);

		/* update page header */
		sp->header.nextindex = cpu_to_le16(middle + 1);
		if (!test_cflag(COMMIT_Nolink, ip)) {
			sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
			    min(skip, (int)sxtlck->lwm.offset) : skip;
		}

		rp->header.nextindex =
		    cpu_to_le16(XTENTRYSTART + righthalf);
	}
	/*
	 * skip index in new right page - insert into right page:
	 */
	else {
		/* move left head of right half to right page */
		n = skip - middle;
		memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
			n << L2XTSLOTSIZE);

		/* insert new entry */
		n += XTENTRYSTART;
		xad = &rp->xad[n];
		XT_PUTENTRY(xad, split->flag, split->off, split->len,
			    split->addr);

		/* move right tail of right half to right page */
		if (skip < maxentry)
			memmove(&rp->xad[n + 1], &sp->xad[skip],
				(maxentry - skip) << L2XTSLOTSIZE);

		/* update page header */
		sp->header.nextindex = cpu_to_le16(middle);
		if (!test_cflag(COMMIT_Nolink, ip)) {
			sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
			    min(middle, (int)sxtlck->lwm.offset) : middle;
		}

		rp->header.nextindex = cpu_to_le16(XTENTRYSTART +
						   righthalf + 1);
	}

	if (!test_cflag(COMMIT_Nolink, ip)) {
		sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) -
		    sxtlck->lwm.offset;

		/* rxtlck->lwm.offset = XTENTRYSTART; */
		rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
		    XTENTRYSTART;
	}

	*rmpp = rmp;
	*rbnp = rbn;

	jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
	return rc;

      clean_up:

	/* Rollback quota allocation. */
	if (quota_allocation)
		DQUOT_FREE_BLOCK(ip, quota_allocation);

	return (rc);
}
1465 1461
1466 1462
/*
 *	xtSplitRoot()
 *
 * function:
 *	split the full root page into original/root/split page and new
 *	right page
 *	i.e., root remains fixed in tree anchor (inode) and the root is
 *	copied to a single new right child page since root page <<
 *	non-root page, and the split root page contains a single entry
 *	for the new right child page.
 *
 * parameter:
 *	int		tid,
 *	struct inode	*ip,
 *	struct xtsplit	*split,
 *	struct metapage	**rmpp)
 *
 * return:
 *	0 on success; -EIO if the right child page cannot be read,
 *	-EDQUOT if the quota allocation for the new page fails.
 *	On success *rmpp is set to the pinned new right child page.
 */
static int
xtSplitRoot(tid_t tid,
	    struct inode *ip, struct xtsplit * split, struct metapage ** rmpp)
{
	xtpage_t *sp;
	struct metapage *rmp;
	xtpage_t *rp;
	s64 rbn;
	int skip, nextindex;
	xad_t *xad;
	pxd_t *pxd;
	struct pxdlist *pxdlist;
	struct tlock *tlck;
	struct xtlock *xtlck;

	/* the root page is the in-line xtree root inside the inode */
	sp = &JFS_IP(ip)->i_xtroot;

	INCREMENT(xtStat.split);

	/*
	 *	allocate a single (right) child page
	 *
	 * the pxd for the new page was pre-allocated by the caller and
	 * is consumed from the split's pxdlist here.
	 */
	pxdlist = split->pxdlist;
	pxd = &pxdlist->pxd[pxdlist->npxd];
	pxdlist->npxd++;
	rbn = addressPXD(pxd);
	rmp = get_metapage(ip, rbn, PSIZE, 1);
	if (rmp == NULL)
		return -EIO;

	/* Allocate blocks to quota. */
	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
		release_metapage(rmp);
		return -EDQUOT;
	}

	jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);

	/*
	 * acquire a transaction lock on the new right page;
	 *
	 * action: new page;
	 */
	BT_MARK_DIRTY(rmp, ip);

	/* the new child inherits leaf/internal type from the root */
	rp = (xtpage_t *) rmp->data;
	rp->header.flag =
	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
	rp->header.self = *pxd;
	rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
	rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE);

	/* initialize sibling pointers */
	rp->header.next = 0;
	rp->header.prev = 0;

	/*
	 * copy the in-line root page into new right page extent
	 */
	nextindex = le16_to_cpu(sp->header.maxentry);
	memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART],
		(nextindex - XTENTRYSTART) << L2XTSLOTSIZE);

	/*
	 * insert the new entry into the new right/child page
	 * (skip index in the new right page will not change)
	 */
	skip = split->index;
	/* if insert into middle, shift right remaining entries */
	if (skip != nextindex)
		memmove(&rp->xad[skip + 1], &rp->xad[skip],
			(nextindex - skip) * sizeof(xad_t));

	xad = &rp->xad[skip];
	XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr);

	/* update page header */
	rp->header.nextindex = cpu_to_le16(nextindex + 1);

	if (!test_cflag(COMMIT_Nolink, ip)) {
		/* log the whole populated region of the new page */
		tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
		xtlck = (struct xtlock *) & tlck->lock;
		xtlck->lwm.offset = XTENTRYSTART;
		xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
		    XTENTRYSTART;
	}

	/*
	 *	reset the root
	 *
	 * init root with the single entry for the new right page
	 * set the 1st entry offset to 0, which force the left-most key
	 * at any level of the tree to be less than any search key.
	 */
	/*
	 * acquire a transaction lock on the root page (in-memory inode);
	 *
	 * action: root split;
	 */
	BT_MARK_DIRTY(split->mp, ip);

	xad = &sp->xad[XTENTRYSTART];
	XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn);

	/* update page header of root: it is now an internal page */
	sp->header.flag &= ~BT_LEAF;
	sp->header.flag |= BT_INTERNAL;

	sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);

	if (!test_cflag(COMMIT_Nolink, ip)) {
		/* log the single router entry left in the root */
		tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW);
		xtlck = (struct xtlock *) & tlck->lock;
		xtlck->lwm.offset = XTENTRYSTART;
		xtlck->lwm.length = 1;
	}

	/* return the new right page still pinned for the caller */
	*rmpp = rmp;

	jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp);
	return 0;
}
1609 1605
1610 1606
1611 /* 1607 /*
1612 * xtExtend() 1608 * xtExtend()
1613 * 1609 *
1614 * function: extend in-place; 1610 * function: extend in-place;
1615 * 1611 *
1616 * note: existing extent may or may not have been committed. 1612 * note: existing extent may or may not have been committed.
1617 * caller is responsible for pager buffer cache update, and 1613 * caller is responsible for pager buffer cache update, and
1618 * working block allocation map update; 1614 * working block allocation map update;
1619 * update pmap: alloc whole extended extent; 1615 * update pmap: alloc whole extended extent;
1620 */ 1616 */
1621 int xtExtend(tid_t tid, /* transaction id */ 1617 int xtExtend(tid_t tid, /* transaction id */
1622 struct inode *ip, s64 xoff, /* delta extent offset */ 1618 struct inode *ip, s64 xoff, /* delta extent offset */
1623 s32 xlen, /* delta extent length */ 1619 s32 xlen, /* delta extent length */
1624 int flag) 1620 int flag)
1625 { 1621 {
1626 int rc = 0; 1622 int rc = 0;
1627 int cmp; 1623 int cmp;
1628 struct metapage *mp; /* meta-page buffer */ 1624 struct metapage *mp; /* meta-page buffer */
1629 xtpage_t *p; /* base B+-tree index page */ 1625 xtpage_t *p; /* base B+-tree index page */
1630 s64 bn; 1626 s64 bn;
1631 int index, nextindex, len; 1627 int index, nextindex, len;
1632 struct btstack btstack; /* traverse stack */ 1628 struct btstack btstack; /* traverse stack */
1633 struct xtsplit split; /* split information */ 1629 struct xtsplit split; /* split information */
1634 xad_t *xad; 1630 xad_t *xad;
1635 s64 xaddr; 1631 s64 xaddr;
1636 struct tlock *tlck; 1632 struct tlock *tlck;
1637 struct xtlock *xtlck = NULL; 1633 struct xtlock *xtlck = NULL;
1638 1634
1639 jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); 1635 jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
1640 1636
1641 /* there must exist extent to be extended */ 1637 /* there must exist extent to be extended */
1642 if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT))) 1638 if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT)))
1643 return rc; 1639 return rc;
1644 1640
1645 /* retrieve search result */ 1641 /* retrieve search result */
1646 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); 1642 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
1647 1643
1648 if (cmp != 0) { 1644 if (cmp != 0) {
1649 XT_PUTPAGE(mp); 1645 XT_PUTPAGE(mp);
1650 jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); 1646 jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent");
1651 return -EIO; 1647 return -EIO;
1652 } 1648 }
1653 1649
1654 /* extension must be contiguous */ 1650 /* extension must be contiguous */
1655 xad = &p->xad[index]; 1651 xad = &p->xad[index];
1656 if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { 1652 if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
1657 XT_PUTPAGE(mp); 1653 XT_PUTPAGE(mp);
1658 jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); 1654 jfs_error(ip->i_sb, "xtExtend: extension is not contiguous");
1659 return -EIO; 1655 return -EIO;
1660 } 1656 }
1661 1657
1662 /* 1658 /*
1663 * acquire a transaction lock on the leaf page; 1659 * acquire a transaction lock on the leaf page;
1664 * 1660 *
1665 * action: xad insertion/extension; 1661 * action: xad insertion/extension;
1666 */ 1662 */
1667 BT_MARK_DIRTY(mp, ip); 1663 BT_MARK_DIRTY(mp, ip);
1668 if (!test_cflag(COMMIT_Nolink, ip)) { 1664 if (!test_cflag(COMMIT_Nolink, ip)) {
1669 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); 1665 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1670 xtlck = (struct xtlock *) & tlck->lock; 1666 xtlck = (struct xtlock *) & tlck->lock;
1671 } 1667 }
1672 1668
1673 /* extend will overflow extent ? */ 1669 /* extend will overflow extent ? */
1674 xlen = lengthXAD(xad) + xlen; 1670 xlen = lengthXAD(xad) + xlen;
1675 if ((len = xlen - MAXXLEN) <= 0) 1671 if ((len = xlen - MAXXLEN) <= 0)
1676 goto extendOld; 1672 goto extendOld;
1677 1673
1678 /* 1674 /*
1679 * extent overflow: insert entry for new extent 1675 * extent overflow: insert entry for new extent
1680 */ 1676 */
1681 //insertNew: 1677 //insertNew:
1682 xoff = offsetXAD(xad) + MAXXLEN; 1678 xoff = offsetXAD(xad) + MAXXLEN;
1683 xaddr = addressXAD(xad) + MAXXLEN; 1679 xaddr = addressXAD(xad) + MAXXLEN;
1684 nextindex = le16_to_cpu(p->header.nextindex); 1680 nextindex = le16_to_cpu(p->header.nextindex);
1685 1681
1686 /* 1682 /*
1687 * if the leaf page is full, insert the new entry and 1683 * if the leaf page is full, insert the new entry and
1688 * propagate up the router entry for the new page from split 1684 * propagate up the router entry for the new page from split
1689 * 1685 *
1690 * The xtSplitUp() will insert the entry and unpin the leaf page. 1686 * The xtSplitUp() will insert the entry and unpin the leaf page.
1691 */ 1687 */
1692 if (nextindex == le16_to_cpu(p->header.maxentry)) { 1688 if (nextindex == le16_to_cpu(p->header.maxentry)) {
1693 /* xtSpliUp() unpins leaf pages */ 1689 /* xtSpliUp() unpins leaf pages */
1694 split.mp = mp; 1690 split.mp = mp;
1695 split.index = index + 1; 1691 split.index = index + 1;
1696 split.flag = XAD_NEW; 1692 split.flag = XAD_NEW;
1697 split.off = xoff; /* split offset */ 1693 split.off = xoff; /* split offset */
1698 split.len = len; 1694 split.len = len;
1699 split.addr = xaddr; 1695 split.addr = xaddr;
1700 split.pxdlist = NULL; 1696 split.pxdlist = NULL;
1701 if ((rc = xtSplitUp(tid, ip, &split, &btstack))) 1697 if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
1702 return rc; 1698 return rc;
1703 1699
1704 /* get back old page */ 1700 /* get back old page */
1705 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 1701 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1706 if (rc) 1702 if (rc)
1707 return rc; 1703 return rc;
1708 /* 1704 /*
1709 * if leaf root has been split, original root has been 1705 * if leaf root has been split, original root has been
1710 * copied to new child page, i.e., original entry now 1706 * copied to new child page, i.e., original entry now
1711 * resides on the new child page; 1707 * resides on the new child page;
1712 */ 1708 */
1713 if (p->header.flag & BT_INTERNAL) { 1709 if (p->header.flag & BT_INTERNAL) {
1714 ASSERT(p->header.nextindex == 1710 ASSERT(p->header.nextindex ==
1715 cpu_to_le16(XTENTRYSTART + 1)); 1711 cpu_to_le16(XTENTRYSTART + 1));
1716 xad = &p->xad[XTENTRYSTART]; 1712 xad = &p->xad[XTENTRYSTART];
1717 bn = addressXAD(xad); 1713 bn = addressXAD(xad);
1718 XT_PUTPAGE(mp); 1714 XT_PUTPAGE(mp);
1719 1715
1720 /* get new child page */ 1716 /* get new child page */
1721 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 1717 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1722 if (rc) 1718 if (rc)
1723 return rc; 1719 return rc;
1724 1720
1725 BT_MARK_DIRTY(mp, ip); 1721 BT_MARK_DIRTY(mp, ip);
1726 if (!test_cflag(COMMIT_Nolink, ip)) { 1722 if (!test_cflag(COMMIT_Nolink, ip)) {
1727 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); 1723 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
1728 xtlck = (struct xtlock *) & tlck->lock; 1724 xtlck = (struct xtlock *) & tlck->lock;
1729 } 1725 }
1730 } 1726 }
1731 } 1727 }
1732 /* 1728 /*
1733 * insert the new entry into the leaf page 1729 * insert the new entry into the leaf page
1734 */ 1730 */
1735 else { 1731 else {
1736 /* insert the new entry: mark the entry NEW */ 1732 /* insert the new entry: mark the entry NEW */
1737 xad = &p->xad[index + 1]; 1733 xad = &p->xad[index + 1];
1738 XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); 1734 XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
1739 1735
1740 /* advance next available entry index */ 1736 /* advance next available entry index */
1741 p->header.nextindex = 1737 le16_add_cpu(&p->header.nextindex, 1);
1742 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
1743 } 1738 }
1744 1739
1745 /* get back old entry */ 1740 /* get back old entry */
1746 xad = &p->xad[index]; 1741 xad = &p->xad[index];
1747 xlen = MAXXLEN; 1742 xlen = MAXXLEN;
1748 1743
1749 /* 1744 /*
1750 * extend old extent 1745 * extend old extent
1751 */ 1746 */
1752 extendOld: 1747 extendOld:
1753 XADlength(xad, xlen); 1748 XADlength(xad, xlen);
1754 if (!(xad->flag & XAD_NEW)) 1749 if (!(xad->flag & XAD_NEW))
1755 xad->flag |= XAD_EXTENDED; 1750 xad->flag |= XAD_EXTENDED;
1756 1751
1757 if (!test_cflag(COMMIT_Nolink, ip)) { 1752 if (!test_cflag(COMMIT_Nolink, ip)) {
1758 xtlck->lwm.offset = 1753 xtlck->lwm.offset =
1759 (xtlck->lwm.offset) ? min(index, 1754 (xtlck->lwm.offset) ? min(index,
1760 (int)xtlck->lwm.offset) : index; 1755 (int)xtlck->lwm.offset) : index;
1761 xtlck->lwm.length = 1756 xtlck->lwm.length =
1762 le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; 1757 le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
1763 } 1758 }
1764 1759
1765 /* unpin the leaf page */ 1760 /* unpin the leaf page */
1766 XT_PUTPAGE(mp); 1761 XT_PUTPAGE(mp);
1767 1762
1768 return rc; 1763 return rc;
1769 } 1764 }
1770 1765
#ifdef _NOTYET
/*
 *	xtTailgate()
 *
 * function: split existing 'tail' extent
 *	(split offset >= start offset of tail extent), and
 *	relocate and extend the split tail half;
 *
 * note: existing extent may or may not have been committed.
 * caller is responsible for pager buffer cache update, and
 * working block allocation map update;
 * update pmap: free old split tail extent, alloc new extent;
 *
 * return: 0 on success; -EIO if the extent at xoff cannot be found or
 *	is not the last entry of its leaf page; errors from
 *	xtSearch()/xtSplitUp()/XT_GETPAGE() are propagated.
 */
int xtTailgate(tid_t tid,		/* transaction id */
	       struct inode *ip, s64 xoff,	/* split/new extent offset */
	       s32 xlen,	/* new extent length */
	       s64 xaddr,	/* new extent address */
	       int flag)
{
	int rc = 0;
	int cmp;
	struct metapage *mp;	/* meta-page buffer */
	xtpage_t *p;		/* base B+-tree index page */
	s64 bn;
	int index, nextindex, llen, rlen;
	struct btstack btstack;	/* traverse stack */
	struct xtsplit split;	/* split information */
	xad_t *xad;
	struct tlock *tlck;
	struct xtlock *xtlck = NULL;
	struct tlock *mtlck;
	struct maplock *pxdlock;

/*
printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
	(ulong)xoff, xlen, (ulong)xaddr);
*/

	/* there must exist extent to be tailgated */
	if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT)))
		return rc;

	/* retrieve search result */
	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);

	if (cmp != 0) {
		XT_PUTPAGE(mp);
		jfs_error(ip->i_sb, "xtTailgate: couldn't find extent");
		return -EIO;
	}

	/* entry found must be last entry */
	nextindex = le16_to_cpu(p->header.nextindex);
	if (index != nextindex - 1) {
		XT_PUTPAGE(mp);
		jfs_error(ip->i_sb,
			  "xtTailgate: the entry found is not the last entry");
		return -EIO;
	}

	BT_MARK_DIRTY(mp, ip);
	/*
	 * acquire tlock of the leaf page containing original entry
	 */
	if (!test_cflag(COMMIT_Nolink, ip)) {
		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
		xtlck = (struct xtlock *) & tlck->lock;
	}

	/* completely replace extent ? */
	xad = &p->xad[index];
/*
printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
	(ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
*/
	/* llen is the surviving left half; zero means full replacement */
	if ((llen = xoff - offsetXAD(xad)) == 0)
		goto updateOld;

	/*
	 *	partially replace extent: insert entry for new extent
	 */
//insertNew:
	/*
	 *	if the leaf page is full, insert the new entry and
	 *	propagate up the router entry for the new page from split
	 *
	 * The xtSplitUp() will insert the entry and unpin the leaf page.
	 */
	if (nextindex == le16_to_cpu(p->header.maxentry)) {
		/* xtSpliUp() unpins leaf pages */
		split.mp = mp;
		split.index = index + 1;
		split.flag = XAD_NEW;
		split.off = xoff;	/* split offset */
		split.len = xlen;
		split.addr = xaddr;
		split.pxdlist = NULL;
		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
			return rc;

		/* get back old page */
		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
		if (rc)
			return rc;
		/*
		 * if leaf root has been split, original root has been
		 * copied to new child page, i.e., original entry now
		 * resides on the new child page;
		 */
		if (p->header.flag & BT_INTERNAL) {
			ASSERT(p->header.nextindex ==
			       cpu_to_le16(XTENTRYSTART + 1));
			xad = &p->xad[XTENTRYSTART];
			bn = addressXAD(xad);
			XT_PUTPAGE(mp);

			/* get new child page */
			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
			if (rc)
				return rc;

			BT_MARK_DIRTY(mp, ip);
			if (!test_cflag(COMMIT_Nolink, ip)) {
				tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
				xtlck = (struct xtlock *) & tlck->lock;
			}
		}
	}
	/*
	 *	insert the new entry into the leaf page
	 */
	else {
		/* insert the new entry: mark the entry NEW */
		xad = &p->xad[index + 1];
		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);

		/* advance next available entry index */
		le16_add_cpu(&p->header.nextindex, 1);
	}

	/* get back old XAD */
	xad = &p->xad[index];

	/*
	 * truncate/relocate old extent at split offset
	 */
      updateOld:
	/* update dmap for old/committed/truncated extent */
	rlen = lengthXAD(xad) - llen;
	if (!(xad->flag & XAD_NEW)) {
		/* free from PWMAP at commit */
		if (!test_cflag(COMMIT_Nolink, ip)) {
			mtlck = txMaplock(tid, ip, tlckMAP);
			pxdlock = (struct maplock *) & mtlck->lock;
			pxdlock->flag = mlckFREEPXD;
			PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
			PXDlength(&pxdlock->pxd, rlen);
			pxdlock->index = 1;
		}
	} else
		/* free from WMAP */
		dbFree(ip, addressXAD(xad) + llen, (s64) rlen);

	if (llen)
		/* truncate */
		XADlength(xad, llen);
	else
		/* replace */
		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);

	if (!test_cflag(COMMIT_Nolink, ip)) {
		/* widen the low-water-mark log range to cover this entry */
		xtlck->lwm.offset = (xtlck->lwm.offset) ?
		    min(index, (int)xtlck->lwm.offset) : index;
		xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
		    xtlck->lwm.offset;
	}

	/* unpin the leaf page */
	XT_PUTPAGE(mp);

	return rc;
}
#endif /* _NOTYET */
1955 1949
1956 /* 1950 /*
1957 * xtUpdate() 1951 * xtUpdate()
1958 * 1952 *
1959 * function: update XAD; 1953 * function: update XAD;
1960 * 1954 *
1961 * update extent for allocated_but_not_recorded or 1955 * update extent for allocated_but_not_recorded or
1962 * compressed extent; 1956 * compressed extent;
1963 * 1957 *
1964 * parameter: 1958 * parameter:
1965 * nxad - new XAD; 1959 * nxad - new XAD;
1966 * logical extent of the specified XAD must be completely 1960 * logical extent of the specified XAD must be completely
1967 * contained by an existing XAD; 1961 * contained by an existing XAD;
1968 */ 1962 */
1969 int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) 1963 int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1970 { /* new XAD */ 1964 { /* new XAD */
1971 int rc = 0; 1965 int rc = 0;
1972 int cmp; 1966 int cmp;
1973 struct metapage *mp; /* meta-page buffer */ 1967 struct metapage *mp; /* meta-page buffer */
1974 xtpage_t *p; /* base B+-tree index page */ 1968 xtpage_t *p; /* base B+-tree index page */
1975 s64 bn; 1969 s64 bn;
1976 int index0, index, newindex, nextindex; 1970 int index0, index, newindex, nextindex;
1977 struct btstack btstack; /* traverse stack */ 1971 struct btstack btstack; /* traverse stack */
1978 struct xtsplit split; /* split information */ 1972 struct xtsplit split; /* split information */
1979 xad_t *xad, *lxad, *rxad; 1973 xad_t *xad, *lxad, *rxad;
1980 int xflag; 1974 int xflag;
1981 s64 nxoff, xoff; 1975 s64 nxoff, xoff;
1982 int nxlen, xlen, lxlen, rxlen; 1976 int nxlen, xlen, lxlen, rxlen;
1983 s64 nxaddr, xaddr; 1977 s64 nxaddr, xaddr;
1984 struct tlock *tlck; 1978 struct tlock *tlck;
1985 struct xtlock *xtlck = NULL; 1979 struct xtlock *xtlck = NULL;
1986 int newpage = 0; 1980 int newpage = 0;
1987 1981
1988 /* there must exist extent to be tailgated */ 1982 /* there must exist extent to be tailgated */
1989 nxoff = offsetXAD(nxad); 1983 nxoff = offsetXAD(nxad);
1990 nxlen = lengthXAD(nxad); 1984 nxlen = lengthXAD(nxad);
1991 nxaddr = addressXAD(nxad); 1985 nxaddr = addressXAD(nxad);
1992 1986
1993 if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) 1987 if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT)))
1994 return rc; 1988 return rc;
1995 1989
1996 /* retrieve search result */ 1990 /* retrieve search result */
1997 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); 1991 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
1998 1992
1999 if (cmp != 0) { 1993 if (cmp != 0) {
2000 XT_PUTPAGE(mp); 1994 XT_PUTPAGE(mp);
2001 jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); 1995 jfs_error(ip->i_sb, "xtUpdate: Could not find extent");
2002 return -EIO; 1996 return -EIO;
2003 } 1997 }
2004 1998
2005 BT_MARK_DIRTY(mp, ip); 1999 BT_MARK_DIRTY(mp, ip);
2006 /* 2000 /*
2007 * acquire tlock of the leaf page containing original entry 2001 * acquire tlock of the leaf page containing original entry
2008 */ 2002 */
2009 if (!test_cflag(COMMIT_Nolink, ip)) { 2003 if (!test_cflag(COMMIT_Nolink, ip)) {
2010 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); 2004 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
2011 xtlck = (struct xtlock *) & tlck->lock; 2005 xtlck = (struct xtlock *) & tlck->lock;
2012 } 2006 }
2013 2007
2014 xad = &p->xad[index0]; 2008 xad = &p->xad[index0];
2015 xflag = xad->flag; 2009 xflag = xad->flag;
2016 xoff = offsetXAD(xad); 2010 xoff = offsetXAD(xad);
2017 xlen = lengthXAD(xad); 2011 xlen = lengthXAD(xad);
2018 xaddr = addressXAD(xad); 2012 xaddr = addressXAD(xad);
2019 2013
2020 /* nXAD must be completely contained within XAD */ 2014 /* nXAD must be completely contained within XAD */
2021 if ((xoff > nxoff) || 2015 if ((xoff > nxoff) ||
2022 (nxoff + nxlen > xoff + xlen)) { 2016 (nxoff + nxlen > xoff + xlen)) {
2023 XT_PUTPAGE(mp); 2017 XT_PUTPAGE(mp);
2024 jfs_error(ip->i_sb, 2018 jfs_error(ip->i_sb,
2025 "xtUpdate: nXAD in not completely contained within XAD"); 2019 "xtUpdate: nXAD in not completely contained within XAD");
2026 return -EIO; 2020 return -EIO;
2027 } 2021 }
2028 2022
2029 index = index0; 2023 index = index0;
2030 newindex = index + 1; 2024 newindex = index + 1;
2031 nextindex = le16_to_cpu(p->header.nextindex); 2025 nextindex = le16_to_cpu(p->header.nextindex);
2032 2026
2033 #ifdef _JFS_WIP_NOCOALESCE 2027 #ifdef _JFS_WIP_NOCOALESCE
2034 if (xoff < nxoff) 2028 if (xoff < nxoff)
2035 goto updateRight; 2029 goto updateRight;
2036 2030
2037 /* 2031 /*
2038 * replace XAD with nXAD 2032 * replace XAD with nXAD
2039 */ 2033 */
2040 replace: /* (nxoff == xoff) */ 2034 replace: /* (nxoff == xoff) */
2041 if (nxlen == xlen) { 2035 if (nxlen == xlen) {
2042 /* replace XAD with nXAD:recorded */ 2036 /* replace XAD with nXAD:recorded */
2043 *xad = *nxad; 2037 *xad = *nxad;
2044 xad->flag = xflag & ~XAD_NOTRECORDED; 2038 xad->flag = xflag & ~XAD_NOTRECORDED;
2045 2039
2046 goto out; 2040 goto out;
2047 } else /* (nxlen < xlen) */ 2041 } else /* (nxlen < xlen) */
2048 goto updateLeft; 2042 goto updateLeft;
2049 #endif /* _JFS_WIP_NOCOALESCE */ 2043 #endif /* _JFS_WIP_NOCOALESCE */
2050 2044
2051 /* #ifdef _JFS_WIP_COALESCE */ 2045 /* #ifdef _JFS_WIP_COALESCE */
2052 if (xoff < nxoff) 2046 if (xoff < nxoff)
2053 goto coalesceRight; 2047 goto coalesceRight;
2054 2048
2055 /* 2049 /*
2056 * coalesce with left XAD 2050 * coalesce with left XAD
2057 */ 2051 */
2058 //coalesceLeft: /* (xoff == nxoff) */ 2052 //coalesceLeft: /* (xoff == nxoff) */
2059 /* is XAD first entry of page ? */ 2053 /* is XAD first entry of page ? */
2060 if (index == XTENTRYSTART) 2054 if (index == XTENTRYSTART)
2061 goto replace; 2055 goto replace;
2062 2056
2063 /* is nXAD logically and physically contiguous with lXAD ? */ 2057 /* is nXAD logically and physically contiguous with lXAD ? */
2064 lxad = &p->xad[index - 1]; 2058 lxad = &p->xad[index - 1];
2065 lxlen = lengthXAD(lxad); 2059 lxlen = lengthXAD(lxad);
2066 if (!(lxad->flag & XAD_NOTRECORDED) && 2060 if (!(lxad->flag & XAD_NOTRECORDED) &&
2067 (nxoff == offsetXAD(lxad) + lxlen) && 2061 (nxoff == offsetXAD(lxad) + lxlen) &&
2068 (nxaddr == addressXAD(lxad) + lxlen) && 2062 (nxaddr == addressXAD(lxad) + lxlen) &&
2069 (lxlen + nxlen < MAXXLEN)) { 2063 (lxlen + nxlen < MAXXLEN)) {
2070 /* extend right lXAD */ 2064 /* extend right lXAD */
2071 index0 = index - 1; 2065 index0 = index - 1;
2072 XADlength(lxad, lxlen + nxlen); 2066 XADlength(lxad, lxlen + nxlen);
2073 2067
2074 /* If we just merged two extents together, need to make sure the 2068 /* If we just merged two extents together, need to make sure the
2075 * right extent gets logged. If the left one is marked XAD_NEW, 2069 * right extent gets logged. If the left one is marked XAD_NEW,
2076 * then we know it will be logged. Otherwise, mark as 2070 * then we know it will be logged. Otherwise, mark as
2077 * XAD_EXTENDED 2071 * XAD_EXTENDED
2078 */ 2072 */
2079 if (!(lxad->flag & XAD_NEW)) 2073 if (!(lxad->flag & XAD_NEW))
2080 lxad->flag |= XAD_EXTENDED; 2074 lxad->flag |= XAD_EXTENDED;
2081 2075
2082 if (xlen > nxlen) { 2076 if (xlen > nxlen) {
2083 /* truncate XAD */ 2077 /* truncate XAD */
2084 XADoffset(xad, xoff + nxlen); 2078 XADoffset(xad, xoff + nxlen);
2085 XADlength(xad, xlen - nxlen); 2079 XADlength(xad, xlen - nxlen);
2086 XADaddress(xad, xaddr + nxlen); 2080 XADaddress(xad, xaddr + nxlen);
2087 goto out; 2081 goto out;
2088 } else { /* (xlen == nxlen) */ 2082 } else { /* (xlen == nxlen) */
2089 2083
2090 /* remove XAD */ 2084 /* remove XAD */
2091 if (index < nextindex - 1) 2085 if (index < nextindex - 1)
2092 memmove(&p->xad[index], &p->xad[index + 1], 2086 memmove(&p->xad[index], &p->xad[index + 1],
2093 (nextindex - index - 2087 (nextindex - index -
2094 1) << L2XTSLOTSIZE); 2088 1) << L2XTSLOTSIZE);
2095 2089
2096 p->header.nextindex = 2090 p->header.nextindex =
2097 cpu_to_le16(le16_to_cpu(p->header.nextindex) - 2091 cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2098 1); 2092 1);
2099 2093
2100 index = index0; 2094 index = index0;
2101 newindex = index + 1; 2095 newindex = index + 1;
2102 nextindex = le16_to_cpu(p->header.nextindex); 2096 nextindex = le16_to_cpu(p->header.nextindex);
2103 xoff = nxoff = offsetXAD(lxad); 2097 xoff = nxoff = offsetXAD(lxad);
2104 xlen = nxlen = lxlen + nxlen; 2098 xlen = nxlen = lxlen + nxlen;
2105 xaddr = nxaddr = addressXAD(lxad); 2099 xaddr = nxaddr = addressXAD(lxad);
2106 goto coalesceRight; 2100 goto coalesceRight;
2107 } 2101 }
2108 } 2102 }
2109 2103
2110 /* 2104 /*
2111 * replace XAD with nXAD 2105 * replace XAD with nXAD
2112 */ 2106 */
2113 replace: /* (nxoff == xoff) */ 2107 replace: /* (nxoff == xoff) */
2114 if (nxlen == xlen) { 2108 if (nxlen == xlen) {
2115 /* replace XAD with nXAD:recorded */ 2109 /* replace XAD with nXAD:recorded */
2116 *xad = *nxad; 2110 *xad = *nxad;
2117 xad->flag = xflag & ~XAD_NOTRECORDED; 2111 xad->flag = xflag & ~XAD_NOTRECORDED;
2118 2112
2119 goto coalesceRight; 2113 goto coalesceRight;
2120 } else /* (nxlen < xlen) */ 2114 } else /* (nxlen < xlen) */
2121 goto updateLeft; 2115 goto updateLeft;
2122 2116
2123 /* 2117 /*
2124 * coalesce with right XAD 2118 * coalesce with right XAD
2125 */ 2119 */
2126 coalesceRight: /* (xoff <= nxoff) */ 2120 coalesceRight: /* (xoff <= nxoff) */
2127 /* is XAD last entry of page ? */ 2121 /* is XAD last entry of page ? */
2128 if (newindex == nextindex) { 2122 if (newindex == nextindex) {
2129 if (xoff == nxoff) 2123 if (xoff == nxoff)
2130 goto out; 2124 goto out;
2131 goto updateRight; 2125 goto updateRight;
2132 } 2126 }
2133 2127
2134 /* is nXAD logically and physically contiguous with rXAD ? */ 2128 /* is nXAD logically and physically contiguous with rXAD ? */
2135 rxad = &p->xad[index + 1]; 2129 rxad = &p->xad[index + 1];
2136 rxlen = lengthXAD(rxad); 2130 rxlen = lengthXAD(rxad);
2137 if (!(rxad->flag & XAD_NOTRECORDED) && 2131 if (!(rxad->flag & XAD_NOTRECORDED) &&
2138 (nxoff + nxlen == offsetXAD(rxad)) && 2132 (nxoff + nxlen == offsetXAD(rxad)) &&
2139 (nxaddr + nxlen == addressXAD(rxad)) && 2133 (nxaddr + nxlen == addressXAD(rxad)) &&
2140 (rxlen + nxlen < MAXXLEN)) { 2134 (rxlen + nxlen < MAXXLEN)) {
2141 /* extend left rXAD */ 2135 /* extend left rXAD */
2142 XADoffset(rxad, nxoff); 2136 XADoffset(rxad, nxoff);
2143 XADlength(rxad, rxlen + nxlen); 2137 XADlength(rxad, rxlen + nxlen);
2144 XADaddress(rxad, nxaddr); 2138 XADaddress(rxad, nxaddr);
2145 2139
2146 /* If we just merged two extents together, need to make sure 2140 /* If we just merged two extents together, need to make sure
2147 * the left extent gets logged. If the right one is marked 2141 * the left extent gets logged. If the right one is marked
2148 * XAD_NEW, then we know it will be logged. Otherwise, mark as 2142 * XAD_NEW, then we know it will be logged. Otherwise, mark as
2149 * XAD_EXTENDED 2143 * XAD_EXTENDED
2150 */ 2144 */
2151 if (!(rxad->flag & XAD_NEW)) 2145 if (!(rxad->flag & XAD_NEW))
2152 rxad->flag |= XAD_EXTENDED; 2146 rxad->flag |= XAD_EXTENDED;
2153 2147
2154 if (xlen > nxlen) 2148 if (xlen > nxlen)
2155 /* truncate XAD */ 2149 /* truncate XAD */
2156 XADlength(xad, xlen - nxlen); 2150 XADlength(xad, xlen - nxlen);
2157 else { /* (xlen == nxlen) */ 2151 else { /* (xlen == nxlen) */
2158 2152
2159 /* remove XAD */ 2153 /* remove XAD */
2160 memmove(&p->xad[index], &p->xad[index + 1], 2154 memmove(&p->xad[index], &p->xad[index + 1],
2161 (nextindex - index - 1) << L2XTSLOTSIZE); 2155 (nextindex - index - 1) << L2XTSLOTSIZE);
2162 2156
2163 p->header.nextindex = 2157 p->header.nextindex =
2164 cpu_to_le16(le16_to_cpu(p->header.nextindex) - 2158 cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2165 1); 2159 1);
2166 } 2160 }
2167 2161
2168 goto out; 2162 goto out;
2169 } else if (xoff == nxoff) 2163 } else if (xoff == nxoff)
2170 goto out; 2164 goto out;
2171 2165
2172 if (xoff >= nxoff) { 2166 if (xoff >= nxoff) {
2173 XT_PUTPAGE(mp); 2167 XT_PUTPAGE(mp);
2174 jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); 2168 jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff");
2175 return -EIO; 2169 return -EIO;
2176 } 2170 }
2177 /* #endif _JFS_WIP_COALESCE */ 2171 /* #endif _JFS_WIP_COALESCE */
2178 2172
2179 /* 2173 /*
2180 * split XAD into (lXAD, nXAD): 2174 * split XAD into (lXAD, nXAD):
2181 * 2175 *
2182 * |---nXAD---> 2176 * |---nXAD--->
2183 * --|----------XAD----------|-- 2177 * --|----------XAD----------|--
2184 * |-lXAD-| 2178 * |-lXAD-|
2185 */ 2179 */
2186 updateRight: /* (xoff < nxoff) */ 2180 updateRight: /* (xoff < nxoff) */
2187 /* truncate old XAD as lXAD:not_recorded */ 2181 /* truncate old XAD as lXAD:not_recorded */
2188 xad = &p->xad[index]; 2182 xad = &p->xad[index];
2189 XADlength(xad, nxoff - xoff); 2183 XADlength(xad, nxoff - xoff);
2190 2184
2191 /* insert nXAD:recorded */ 2185 /* insert nXAD:recorded */
2192 if (nextindex == le16_to_cpu(p->header.maxentry)) { 2186 if (nextindex == le16_to_cpu(p->header.maxentry)) {
2193 2187
2194 /* xtSpliUp() unpins leaf pages */ 2188 /* xtSpliUp() unpins leaf pages */
2195 split.mp = mp; 2189 split.mp = mp;
2196 split.index = newindex; 2190 split.index = newindex;
2197 split.flag = xflag & ~XAD_NOTRECORDED; 2191 split.flag = xflag & ~XAD_NOTRECORDED;
2198 split.off = nxoff; 2192 split.off = nxoff;
2199 split.len = nxlen; 2193 split.len = nxlen;
2200 split.addr = nxaddr; 2194 split.addr = nxaddr;
2201 split.pxdlist = NULL; 2195 split.pxdlist = NULL;
2202 if ((rc = xtSplitUp(tid, ip, &split, &btstack))) 2196 if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
2203 return rc; 2197 return rc;
2204 2198
2205 /* get back old page */ 2199 /* get back old page */
2206 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 2200 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2207 if (rc) 2201 if (rc)
2208 return rc; 2202 return rc;
2209 /* 2203 /*
2210 * if leaf root has been split, original root has been 2204 * if leaf root has been split, original root has been
2211 * copied to new child page, i.e., original entry now 2205 * copied to new child page, i.e., original entry now
2212 * resides on the new child page; 2206 * resides on the new child page;
2213 */ 2207 */
2214 if (p->header.flag & BT_INTERNAL) { 2208 if (p->header.flag & BT_INTERNAL) {
2215 ASSERT(p->header.nextindex == 2209 ASSERT(p->header.nextindex ==
2216 cpu_to_le16(XTENTRYSTART + 1)); 2210 cpu_to_le16(XTENTRYSTART + 1));
2217 xad = &p->xad[XTENTRYSTART]; 2211 xad = &p->xad[XTENTRYSTART];
2218 bn = addressXAD(xad); 2212 bn = addressXAD(xad);
2219 XT_PUTPAGE(mp); 2213 XT_PUTPAGE(mp);
2220 2214
2221 /* get new child page */ 2215 /* get new child page */
2222 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 2216 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2223 if (rc) 2217 if (rc)
2224 return rc; 2218 return rc;
2225 2219
2226 BT_MARK_DIRTY(mp, ip); 2220 BT_MARK_DIRTY(mp, ip);
2227 if (!test_cflag(COMMIT_Nolink, ip)) { 2221 if (!test_cflag(COMMIT_Nolink, ip)) {
2228 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); 2222 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
2229 xtlck = (struct xtlock *) & tlck->lock; 2223 xtlck = (struct xtlock *) & tlck->lock;
2230 } 2224 }
2231 } else { 2225 } else {
2232 /* is nXAD on new page ? */ 2226 /* is nXAD on new page ? */
2233 if (newindex > 2227 if (newindex >
2234 (le16_to_cpu(p->header.maxentry) >> 1)) { 2228 (le16_to_cpu(p->header.maxentry) >> 1)) {
2235 newindex = 2229 newindex =
2236 newindex - 2230 newindex -
2237 le16_to_cpu(p->header.nextindex) + 2231 le16_to_cpu(p->header.nextindex) +
2238 XTENTRYSTART; 2232 XTENTRYSTART;
2239 newpage = 1; 2233 newpage = 1;
2240 } 2234 }
2241 } 2235 }
2242 } else { 2236 } else {
2243 /* if insert into middle, shift right remaining entries */ 2237 /* if insert into middle, shift right remaining entries */
2244 if (newindex < nextindex) 2238 if (newindex < nextindex)
2245 memmove(&p->xad[newindex + 1], &p->xad[newindex], 2239 memmove(&p->xad[newindex + 1], &p->xad[newindex],
2246 (nextindex - newindex) << L2XTSLOTSIZE); 2240 (nextindex - newindex) << L2XTSLOTSIZE);
2247 2241
2248 /* insert the entry */ 2242 /* insert the entry */
2249 xad = &p->xad[newindex]; 2243 xad = &p->xad[newindex];
2250 *xad = *nxad; 2244 *xad = *nxad;
2251 xad->flag = xflag & ~XAD_NOTRECORDED; 2245 xad->flag = xflag & ~XAD_NOTRECORDED;
2252 2246
2253 /* advance next available entry index. */ 2247 /* advance next available entry index. */
2254 p->header.nextindex = 2248 p->header.nextindex =
2255 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); 2249 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2256 } 2250 }
2257 2251
2258 /* 2252 /*
2259 * does nXAD force 3-way split ? 2253 * does nXAD force 3-way split ?
2260 * 2254 *
2261 * |---nXAD--->| 2255 * |---nXAD--->|
2262 * --|----------XAD-------------|-- 2256 * --|----------XAD-------------|--
2263 * |-lXAD-| |-rXAD -| 2257 * |-lXAD-| |-rXAD -|
2264 */ 2258 */
2265 if (nxoff + nxlen == xoff + xlen) 2259 if (nxoff + nxlen == xoff + xlen)
2266 goto out; 2260 goto out;
2267 2261
2268 /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ 2262 /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */
2269 if (newpage) { 2263 if (newpage) {
2270 /* close out old page */ 2264 /* close out old page */
2271 if (!test_cflag(COMMIT_Nolink, ip)) { 2265 if (!test_cflag(COMMIT_Nolink, ip)) {
2272 xtlck->lwm.offset = (xtlck->lwm.offset) ? 2266 xtlck->lwm.offset = (xtlck->lwm.offset) ?
2273 min(index0, (int)xtlck->lwm.offset) : index0; 2267 min(index0, (int)xtlck->lwm.offset) : index0;
2274 xtlck->lwm.length = 2268 xtlck->lwm.length =
2275 le16_to_cpu(p->header.nextindex) - 2269 le16_to_cpu(p->header.nextindex) -
2276 xtlck->lwm.offset; 2270 xtlck->lwm.offset;
2277 } 2271 }
2278 2272
2279 bn = le64_to_cpu(p->header.next); 2273 bn = le64_to_cpu(p->header.next);
2280 XT_PUTPAGE(mp); 2274 XT_PUTPAGE(mp);
2281 2275
2282 /* get new right page */ 2276 /* get new right page */
2283 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 2277 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2284 if (rc) 2278 if (rc)
2285 return rc; 2279 return rc;
2286 2280
2287 BT_MARK_DIRTY(mp, ip); 2281 BT_MARK_DIRTY(mp, ip);
2288 if (!test_cflag(COMMIT_Nolink, ip)) { 2282 if (!test_cflag(COMMIT_Nolink, ip)) {
2289 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); 2283 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
2290 xtlck = (struct xtlock *) & tlck->lock; 2284 xtlck = (struct xtlock *) & tlck->lock;
2291 } 2285 }
2292 2286
2293 index0 = index = newindex; 2287 index0 = index = newindex;
2294 } else 2288 } else
2295 index++; 2289 index++;
2296 2290
2297 newindex = index + 1; 2291 newindex = index + 1;
2298 nextindex = le16_to_cpu(p->header.nextindex); 2292 nextindex = le16_to_cpu(p->header.nextindex);
2299 xlen = xlen - (nxoff - xoff); 2293 xlen = xlen - (nxoff - xoff);
2300 xoff = nxoff; 2294 xoff = nxoff;
2301 xaddr = nxaddr; 2295 xaddr = nxaddr;
2302 2296
2303 /* recompute split pages */ 2297 /* recompute split pages */
2304 if (nextindex == le16_to_cpu(p->header.maxentry)) { 2298 if (nextindex == le16_to_cpu(p->header.maxentry)) {
2305 XT_PUTPAGE(mp); 2299 XT_PUTPAGE(mp);
2306 2300
2307 if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) 2301 if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT)))
2308 return rc; 2302 return rc;
2309 2303
2310 /* retrieve search result */ 2304 /* retrieve search result */
2311 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); 2305 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
2312 2306
2313 if (cmp != 0) { 2307 if (cmp != 0) {
2314 XT_PUTPAGE(mp); 2308 XT_PUTPAGE(mp);
2315 jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); 2309 jfs_error(ip->i_sb, "xtUpdate: xtSearch failed");
2316 return -EIO; 2310 return -EIO;
2317 } 2311 }
2318 2312
2319 if (index0 != index) { 2313 if (index0 != index) {
2320 XT_PUTPAGE(mp); 2314 XT_PUTPAGE(mp);
2321 jfs_error(ip->i_sb, 2315 jfs_error(ip->i_sb,
2322 "xtUpdate: unexpected value of index"); 2316 "xtUpdate: unexpected value of index");
2323 return -EIO; 2317 return -EIO;
2324 } 2318 }
2325 } 2319 }
2326 2320
2327 /* 2321 /*
2328 * split XAD into (nXAD, rXAD) 2322 * split XAD into (nXAD, rXAD)
2329 * 2323 *
2330 * ---nXAD---| 2324 * ---nXAD---|
2331 * --|----------XAD----------|-- 2325 * --|----------XAD----------|--
2332 * |-rXAD-| 2326 * |-rXAD-|
2333 */ 2327 */
2334 updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ 2328 updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */
2335 /* update old XAD with nXAD:recorded */ 2329 /* update old XAD with nXAD:recorded */
2336 xad = &p->xad[index]; 2330 xad = &p->xad[index];
2337 *xad = *nxad; 2331 *xad = *nxad;
2338 xad->flag = xflag & ~XAD_NOTRECORDED; 2332 xad->flag = xflag & ~XAD_NOTRECORDED;
2339 2333
2340 /* insert rXAD:not_recorded */ 2334 /* insert rXAD:not_recorded */
2341 xoff = xoff + nxlen; 2335 xoff = xoff + nxlen;
2342 xlen = xlen - nxlen; 2336 xlen = xlen - nxlen;
2343 xaddr = xaddr + nxlen; 2337 xaddr = xaddr + nxlen;
2344 if (nextindex == le16_to_cpu(p->header.maxentry)) { 2338 if (nextindex == le16_to_cpu(p->header.maxentry)) {
2345 /* 2339 /*
2346 printf("xtUpdate.updateLeft.split p:0x%p\n", p); 2340 printf("xtUpdate.updateLeft.split p:0x%p\n", p);
2347 */ 2341 */
2348 /* xtSpliUp() unpins leaf pages */ 2342 /* xtSpliUp() unpins leaf pages */
2349 split.mp = mp; 2343 split.mp = mp;
2350 split.index = newindex; 2344 split.index = newindex;
2351 split.flag = xflag; 2345 split.flag = xflag;
2352 split.off = xoff; 2346 split.off = xoff;
2353 split.len = xlen; 2347 split.len = xlen;
2354 split.addr = xaddr; 2348 split.addr = xaddr;
2355 split.pxdlist = NULL; 2349 split.pxdlist = NULL;
2356 if ((rc = xtSplitUp(tid, ip, &split, &btstack))) 2350 if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
2357 return rc; 2351 return rc;
2358 2352
2359 /* get back old page */ 2353 /* get back old page */
2360 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 2354 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2361 if (rc) 2355 if (rc)
2362 return rc; 2356 return rc;
2363 2357
2364 /* 2358 /*
2365 * if leaf root has been split, original root has been 2359 * if leaf root has been split, original root has been
2366 * copied to new child page, i.e., original entry now 2360 * copied to new child page, i.e., original entry now
2367 * resides on the new child page; 2361 * resides on the new child page;
2368 */ 2362 */
2369 if (p->header.flag & BT_INTERNAL) { 2363 if (p->header.flag & BT_INTERNAL) {
2370 ASSERT(p->header.nextindex == 2364 ASSERT(p->header.nextindex ==
2371 cpu_to_le16(XTENTRYSTART + 1)); 2365 cpu_to_le16(XTENTRYSTART + 1));
2372 xad = &p->xad[XTENTRYSTART]; 2366 xad = &p->xad[XTENTRYSTART];
2373 bn = addressXAD(xad); 2367 bn = addressXAD(xad);
2374 XT_PUTPAGE(mp); 2368 XT_PUTPAGE(mp);
2375 2369
2376 /* get new child page */ 2370 /* get new child page */
2377 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 2371 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2378 if (rc) 2372 if (rc)
2379 return rc; 2373 return rc;
2380 2374
2381 BT_MARK_DIRTY(mp, ip); 2375 BT_MARK_DIRTY(mp, ip);
2382 if (!test_cflag(COMMIT_Nolink, ip)) { 2376 if (!test_cflag(COMMIT_Nolink, ip)) {
2383 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); 2377 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
2384 xtlck = (struct xtlock *) & tlck->lock; 2378 xtlck = (struct xtlock *) & tlck->lock;
2385 } 2379 }
2386 } 2380 }
2387 } else { 2381 } else {
2388 /* if insert into middle, shift right remaining entries */ 2382 /* if insert into middle, shift right remaining entries */
2389 if (newindex < nextindex) 2383 if (newindex < nextindex)
2390 memmove(&p->xad[newindex + 1], &p->xad[newindex], 2384 memmove(&p->xad[newindex + 1], &p->xad[newindex],
2391 (nextindex - newindex) << L2XTSLOTSIZE); 2385 (nextindex - newindex) << L2XTSLOTSIZE);
2392 2386
2393 /* insert the entry */ 2387 /* insert the entry */
2394 xad = &p->xad[newindex]; 2388 xad = &p->xad[newindex];
2395 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); 2389 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
2396 2390
2397 /* advance next available entry index. */ 2391 /* advance next available entry index. */
2398 p->header.nextindex = 2392 p->header.nextindex =
2399 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); 2393 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2400 } 2394 }
2401 2395
2402 out: 2396 out:
2403 if (!test_cflag(COMMIT_Nolink, ip)) { 2397 if (!test_cflag(COMMIT_Nolink, ip)) {
2404 xtlck->lwm.offset = (xtlck->lwm.offset) ? 2398 xtlck->lwm.offset = (xtlck->lwm.offset) ?
2405 min(index0, (int)xtlck->lwm.offset) : index0; 2399 min(index0, (int)xtlck->lwm.offset) : index0;
2406 xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - 2400 xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
2407 xtlck->lwm.offset; 2401 xtlck->lwm.offset;
2408 } 2402 }
2409 2403
2410 /* unpin the leaf page */ 2404 /* unpin the leaf page */
2411 XT_PUTPAGE(mp); 2405 XT_PUTPAGE(mp);
2412 2406
2413 return rc; 2407 return rc;
2414 } 2408 }
2415 2409
2416 2410
2417 /* 2411 /*
2418 * xtAppend() 2412 * xtAppend()
2419 * 2413 *
2420 * function: grow in append mode from contiguous region specified ; 2414 * function: grow in append mode from contiguous region specified ;
2421 * 2415 *
2422 * parameter: 2416 * parameter:
2423 * tid - transaction id; 2417 * tid - transaction id;
2424 * ip - file object; 2418 * ip - file object;
2425 * xflag - extent flag: 2419 * xflag - extent flag:
2426 * xoff - extent offset; 2420 * xoff - extent offset;
2427 * maxblocks - max extent length; 2421 * maxblocks - max extent length;
2428 * xlen - extent length (in/out); 2422 * xlen - extent length (in/out);
2429 * xaddrp - extent address pointer (in/out): 2423 * xaddrp - extent address pointer (in/out):
2430 * flag - 2424 * flag -
2431 * 2425 *
2432 * return: 2426 * return:
2433 */ 2427 */
2434 int xtAppend(tid_t tid, /* transaction id */ 2428 int xtAppend(tid_t tid, /* transaction id */
2435 struct inode *ip, int xflag, s64 xoff, s32 maxblocks, 2429 struct inode *ip, int xflag, s64 xoff, s32 maxblocks,
2436 s32 * xlenp, /* (in/out) */ 2430 s32 * xlenp, /* (in/out) */
2437 s64 * xaddrp, /* (in/out) */ 2431 s64 * xaddrp, /* (in/out) */
2438 int flag) 2432 int flag)
2439 { 2433 {
2440 int rc = 0; 2434 int rc = 0;
2441 struct metapage *mp; /* meta-page buffer */ 2435 struct metapage *mp; /* meta-page buffer */
2442 xtpage_t *p; /* base B+-tree index page */ 2436 xtpage_t *p; /* base B+-tree index page */
2443 s64 bn, xaddr; 2437 s64 bn, xaddr;
2444 int index, nextindex; 2438 int index, nextindex;
2445 struct btstack btstack; /* traverse stack */ 2439 struct btstack btstack; /* traverse stack */
2446 struct xtsplit split; /* split information */ 2440 struct xtsplit split; /* split information */
2447 xad_t *xad; 2441 xad_t *xad;
2448 int cmp; 2442 int cmp;
2449 struct tlock *tlck; 2443 struct tlock *tlck;
2450 struct xtlock *xtlck; 2444 struct xtlock *xtlck;
2451 int nsplit, nblocks, xlen; 2445 int nsplit, nblocks, xlen;
2452 struct pxdlist pxdlist; 2446 struct pxdlist pxdlist;
2453 pxd_t *pxd; 2447 pxd_t *pxd;
2454 s64 next; 2448 s64 next;
2455 2449
2456 xaddr = *xaddrp; 2450 xaddr = *xaddrp;
2457 xlen = *xlenp; 2451 xlen = *xlenp;
2458 jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx", 2452 jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx",
2459 (ulong) xoff, maxblocks, xlen, (ulong) xaddr); 2453 (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
2460 2454
2461 /* 2455 /*
2462 * search for the entry location at which to insert: 2456 * search for the entry location at which to insert:
2463 * 2457 *
2464 * xtFastSearch() and xtSearch() both returns (leaf page 2458 * xtFastSearch() and xtSearch() both returns (leaf page
2465 * pinned, index at which to insert). 2459 * pinned, index at which to insert).
2466 * n.b. xtSearch() may return index of maxentry of 2460 * n.b. xtSearch() may return index of maxentry of
2467 * the full page. 2461 * the full page.
2468 */ 2462 */
2469 if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) 2463 if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT)))
2470 return rc; 2464 return rc;
2471 2465
2472 /* retrieve search result */ 2466 /* retrieve search result */
2473 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); 2467 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2474 2468
2475 if (cmp == 0) { 2469 if (cmp == 0) {
2476 rc = -EEXIST; 2470 rc = -EEXIST;
2477 goto out; 2471 goto out;
2478 } 2472 }
2479 2473
2480 if (next) 2474 if (next)
2481 xlen = min(xlen, (int)(next - xoff)); 2475 xlen = min(xlen, (int)(next - xoff));
2482 //insert: 2476 //insert:
2483 /* 2477 /*
2484 * insert entry for new extent 2478 * insert entry for new extent
2485 */ 2479 */
2486 xflag |= XAD_NEW; 2480 xflag |= XAD_NEW;
2487 2481
2488 /* 2482 /*
2489 * if the leaf page is full, split the page and 2483 * if the leaf page is full, split the page and
2490 * propagate up the router entry for the new page from split 2484 * propagate up the router entry for the new page from split
2491 * 2485 *
2492 * The xtSplitUp() will insert the entry and unpin the leaf page. 2486 * The xtSplitUp() will insert the entry and unpin the leaf page.
2493 */ 2487 */
2494 nextindex = le16_to_cpu(p->header.nextindex); 2488 nextindex = le16_to_cpu(p->header.nextindex);
2495 if (nextindex < le16_to_cpu(p->header.maxentry)) 2489 if (nextindex < le16_to_cpu(p->header.maxentry))
2496 goto insertLeaf; 2490 goto insertLeaf;
2497 2491
2498 /* 2492 /*
2499 * allocate new index blocks to cover index page split(s) 2493 * allocate new index blocks to cover index page split(s)
2500 */ 2494 */
2501 nsplit = btstack.nsplit; 2495 nsplit = btstack.nsplit;
2502 split.pxdlist = &pxdlist; 2496 split.pxdlist = &pxdlist;
2503 pxdlist.maxnpxd = pxdlist.npxd = 0; 2497 pxdlist.maxnpxd = pxdlist.npxd = 0;
2504 pxd = &pxdlist.pxd[0]; 2498 pxd = &pxdlist.pxd[0];
2505 nblocks = JFS_SBI(ip->i_sb)->nbperpage; 2499 nblocks = JFS_SBI(ip->i_sb)->nbperpage;
2506 for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { 2500 for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {
2507 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { 2501 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) {
2508 PXDaddress(pxd, xaddr); 2502 PXDaddress(pxd, xaddr);
2509 PXDlength(pxd, nblocks); 2503 PXDlength(pxd, nblocks);
2510 2504
2511 pxdlist.maxnpxd++; 2505 pxdlist.maxnpxd++;
2512 2506
2513 continue; 2507 continue;
2514 } 2508 }
2515 2509
2516 /* undo allocation */ 2510 /* undo allocation */
2517 2511
2518 goto out; 2512 goto out;
2519 } 2513 }
2520 2514
2521 xlen = min(xlen, maxblocks); 2515 xlen = min(xlen, maxblocks);
2522 2516
2523 /* 2517 /*
2524 * allocate data extent requested 2518 * allocate data extent requested
2525 */ 2519 */
2526 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) 2520 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
2527 goto out; 2521 goto out;
2528 2522
2529 split.mp = mp; 2523 split.mp = mp;
2530 split.index = index; 2524 split.index = index;
2531 split.flag = xflag; 2525 split.flag = xflag;
2532 split.off = xoff; 2526 split.off = xoff;
2533 split.len = xlen; 2527 split.len = xlen;
2534 split.addr = xaddr; 2528 split.addr = xaddr;
2535 if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { 2529 if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
2536 /* undo data extent allocation */ 2530 /* undo data extent allocation */
2537 dbFree(ip, *xaddrp, (s64) * xlenp); 2531 dbFree(ip, *xaddrp, (s64) * xlenp);
2538 2532
2539 return rc; 2533 return rc;
2540 } 2534 }
2541 2535
2542 *xaddrp = xaddr; 2536 *xaddrp = xaddr;
2543 *xlenp = xlen; 2537 *xlenp = xlen;
2544 return 0; 2538 return 0;
2545 2539
2546 /* 2540 /*
2547 * insert the new entry into the leaf page 2541 * insert the new entry into the leaf page
2548 */ 2542 */
2549 insertLeaf: 2543 insertLeaf:
2550 /* 2544 /*
2551 * allocate data extent requested 2545 * allocate data extent requested
2552 */ 2546 */
2553 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) 2547 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
2554 goto out; 2548 goto out;
2555 2549
2556 BT_MARK_DIRTY(mp, ip); 2550 BT_MARK_DIRTY(mp, ip);
2557 /* 2551 /*
2558 * acquire a transaction lock on the leaf page; 2552 * acquire a transaction lock on the leaf page;
2559 * 2553 *
2560 * action: xad insertion/extension; 2554 * action: xad insertion/extension;
2561 */ 2555 */
2562 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); 2556 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
2563 xtlck = (struct xtlock *) & tlck->lock; 2557 xtlck = (struct xtlock *) & tlck->lock;
2564 2558
2565 /* insert the new entry: mark the entry NEW */ 2559 /* insert the new entry: mark the entry NEW */
2566 xad = &p->xad[index]; 2560 xad = &p->xad[index];
2567 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); 2561 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
2568 2562
2569 /* advance next available entry index */ 2563 /* advance next available entry index */
2570 p->header.nextindex = 2564 le16_add_cpu(&p->header.nextindex, 1);
2571 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2572 2565
2573 xtlck->lwm.offset = 2566 xtlck->lwm.offset =
2574 (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; 2567 (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
2575 xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - 2568 xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
2576 xtlck->lwm.offset; 2569 xtlck->lwm.offset;
2577 2570
2578 *xaddrp = xaddr; 2571 *xaddrp = xaddr;
2579 *xlenp = xlen; 2572 *xlenp = xlen;
2580 2573
2581 out: 2574 out:
2582 /* unpin the leaf page */ 2575 /* unpin the leaf page */
2583 XT_PUTPAGE(mp); 2576 XT_PUTPAGE(mp);
2584 2577
2585 return rc; 2578 return rc;
2586 } 2579 }
2587 #ifdef _STILL_TO_PORT 2580 #ifdef _STILL_TO_PORT
2588 2581
2589 /* - TBD for defragmentaion/reorganization - 2582 /* - TBD for defragmentaion/reorganization -
2590 * 2583 *
2591 * xtDelete() 2584 * xtDelete()
2592 * 2585 *
2593 * function: 2586 * function:
2594 * delete the entry with the specified key. 2587 * delete the entry with the specified key.
2595 * 2588 *
2596 * N.B.: whole extent of the entry is assumed to be deleted. 2589 * N.B.: whole extent of the entry is assumed to be deleted.
2597 * 2590 *
2598 * parameter: 2591 * parameter:
2599 * 2592 *
2600 * return: 2593 * return:
2601 * ENOENT: if the entry is not found. 2594 * ENOENT: if the entry is not found.
2602 * 2595 *
2603 * exception: 2596 * exception:
2604 */ 2597 */
2605 int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) 2598 int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
2606 { 2599 {
2607 int rc = 0; 2600 int rc = 0;
2608 struct btstack btstack; 2601 struct btstack btstack;
2609 int cmp; 2602 int cmp;
2610 s64 bn; 2603 s64 bn;
2611 struct metapage *mp; 2604 struct metapage *mp;
2612 xtpage_t *p; 2605 xtpage_t *p;
2613 int index, nextindex; 2606 int index, nextindex;
2614 struct tlock *tlck; 2607 struct tlock *tlck;
2615 struct xtlock *xtlck; 2608 struct xtlock *xtlck;
2616 2609
2617 /* 2610 /*
2618 * find the matching entry; xtSearch() pins the page 2611 * find the matching entry; xtSearch() pins the page
2619 */ 2612 */
2620 if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) 2613 if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
2621 return rc; 2614 return rc;
2622 2615
2623 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); 2616 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2624 if (cmp) { 2617 if (cmp) {
2625 /* unpin the leaf page */ 2618 /* unpin the leaf page */
2626 XT_PUTPAGE(mp); 2619 XT_PUTPAGE(mp);
2627 return -ENOENT; 2620 return -ENOENT;
2628 } 2621 }
2629 2622
2630 /* 2623 /*
2631 * delete the entry from the leaf page 2624 * delete the entry from the leaf page
2632 */ 2625 */
2633 nextindex = le16_to_cpu(p->header.nextindex); 2626 nextindex = le16_to_cpu(p->header.nextindex);
2634 p->header.nextindex = 2627 le16_add_cpu(&p->header.nextindex, -1);
2635 cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1);
2636 2628
2637 /* 2629 /*
2638 * if the leaf page bocome empty, free the page 2630 * if the leaf page bocome empty, free the page
2639 */ 2631 */
2640 if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) 2632 if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
2641 return (xtDeleteUp(tid, ip, mp, p, &btstack)); 2633 return (xtDeleteUp(tid, ip, mp, p, &btstack));
2642 2634
2643 BT_MARK_DIRTY(mp, ip); 2635 BT_MARK_DIRTY(mp, ip);
2644 /* 2636 /*
2645 * acquire a transaction lock on the leaf page; 2637 * acquire a transaction lock on the leaf page;
2646 * 2638 *
2647 * action:xad deletion; 2639 * action:xad deletion;
2648 */ 2640 */
2649 tlck = txLock(tid, ip, mp, tlckXTREE); 2641 tlck = txLock(tid, ip, mp, tlckXTREE);
2650 xtlck = (struct xtlock *) & tlck->lock; 2642 xtlck = (struct xtlock *) & tlck->lock;
2651 xtlck->lwm.offset = 2643 xtlck->lwm.offset =
2652 (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; 2644 (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index;
2653 2645
2654 /* if delete from middle, shift left/compact the remaining entries */ 2646 /* if delete from middle, shift left/compact the remaining entries */
2655 if (index < nextindex - 1) 2647 if (index < nextindex - 1)
2656 memmove(&p->xad[index], &p->xad[index + 1], 2648 memmove(&p->xad[index], &p->xad[index + 1],
2657 (nextindex - index - 1) * sizeof(xad_t)); 2649 (nextindex - index - 1) * sizeof(xad_t));
2658 2650
2659 XT_PUTPAGE(mp); 2651 XT_PUTPAGE(mp);
2660 2652
2661 return 0; 2653 return 0;
2662 } 2654 }
2663 2655
2664 2656
2665 /* - TBD for defragmentaion/reorganization - 2657 /* - TBD for defragmentaion/reorganization -
2666 * 2658 *
2667 * xtDeleteUp() 2659 * xtDeleteUp()
2668 * 2660 *
2669 * function: 2661 * function:
2670 * free empty pages as propagating deletion up the tree 2662 * free empty pages as propagating deletion up the tree
2671 * 2663 *
2672 * parameter: 2664 * parameter:
2673 * 2665 *
2674 * return: 2666 * return:
2675 */ 2667 */
2676 static int 2668 static int
2677 xtDeleteUp(tid_t tid, struct inode *ip, 2669 xtDeleteUp(tid_t tid, struct inode *ip,
2678 struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) 2670 struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
2679 { 2671 {
2680 int rc = 0; 2672 int rc = 0;
2681 struct metapage *mp; 2673 struct metapage *mp;
2682 xtpage_t *p; 2674 xtpage_t *p;
2683 int index, nextindex; 2675 int index, nextindex;
2684 s64 xaddr; 2676 s64 xaddr;
2685 int xlen; 2677 int xlen;
2686 struct btframe *parent; 2678 struct btframe *parent;
2687 struct tlock *tlck; 2679 struct tlock *tlck;
2688 struct xtlock *xtlck; 2680 struct xtlock *xtlck;
2689 2681
2690 /* 2682 /*
2691 * keep root leaf page which has become empty 2683 * keep root leaf page which has become empty
2692 */ 2684 */
2693 if (fp->header.flag & BT_ROOT) { 2685 if (fp->header.flag & BT_ROOT) {
2694 /* keep the root page */ 2686 /* keep the root page */
2695 fp->header.flag &= ~BT_INTERNAL; 2687 fp->header.flag &= ~BT_INTERNAL;
2696 fp->header.flag |= BT_LEAF; 2688 fp->header.flag |= BT_LEAF;
2697 fp->header.nextindex = cpu_to_le16(XTENTRYSTART); 2689 fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
2698 2690
2699 /* XT_PUTPAGE(fmp); */ 2691 /* XT_PUTPAGE(fmp); */
2700 2692
2701 return 0; 2693 return 0;
2702 } 2694 }
2703 2695
2704 /* 2696 /*
2705 * free non-root leaf page 2697 * free non-root leaf page
2706 */ 2698 */
2707 if ((rc = xtRelink(tid, ip, fp))) { 2699 if ((rc = xtRelink(tid, ip, fp))) {
2708 XT_PUTPAGE(fmp); 2700 XT_PUTPAGE(fmp);
2709 return rc; 2701 return rc;
2710 } 2702 }
2711 2703
2712 xaddr = addressPXD(&fp->header.self); 2704 xaddr = addressPXD(&fp->header.self);
2713 xlen = lengthPXD(&fp->header.self); 2705 xlen = lengthPXD(&fp->header.self);
2714 /* free the page extent */ 2706 /* free the page extent */
2715 dbFree(ip, xaddr, (s64) xlen); 2707 dbFree(ip, xaddr, (s64) xlen);
2716 2708
2717 /* free the buffer page */ 2709 /* free the buffer page */
2718 discard_metapage(fmp); 2710 discard_metapage(fmp);
2719 2711
2720 /* 2712 /*
2721 * propagate page deletion up the index tree 2713 * propagate page deletion up the index tree
2722 * 2714 *
2723 * If the delete from the parent page makes it empty, 2715 * If the delete from the parent page makes it empty,
2724 * continue all the way up the tree. 2716 * continue all the way up the tree.
2725 * stop if the root page is reached (which is never deleted) or 2717 * stop if the root page is reached (which is never deleted) or
2726 * if the entry deletion does not empty the page. 2718 * if the entry deletion does not empty the page.
2727 */ 2719 */
2728 while ((parent = BT_POP(btstack)) != NULL) { 2720 while ((parent = BT_POP(btstack)) != NULL) {
2729 /* get/pin the parent page <sp> */ 2721 /* get/pin the parent page <sp> */
2730 XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); 2722 XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2731 if (rc) 2723 if (rc)
2732 return rc; 2724 return rc;
2733 2725
2734 index = parent->index; 2726 index = parent->index;
2735 2727
2736 /* delete the entry for the freed child page from parent. 2728 /* delete the entry for the freed child page from parent.
2737 */ 2729 */
2738 nextindex = le16_to_cpu(p->header.nextindex); 2730 nextindex = le16_to_cpu(p->header.nextindex);
2739 2731
2740 /* 2732 /*
2741 * the parent has the single entry being deleted: 2733 * the parent has the single entry being deleted:
2742 * free the parent page which has become empty. 2734 * free the parent page which has become empty.
2743 */ 2735 */
2744 if (nextindex == 1) { 2736 if (nextindex == 1) {
2745 if (p->header.flag & BT_ROOT) { 2737 if (p->header.flag & BT_ROOT) {
2746 /* keep the root page */ 2738 /* keep the root page */
2747 p->header.flag &= ~BT_INTERNAL; 2739 p->header.flag &= ~BT_INTERNAL;
2748 p->header.flag |= BT_LEAF; 2740 p->header.flag |= BT_LEAF;
2749 p->header.nextindex = 2741 p->header.nextindex =
2750 cpu_to_le16(XTENTRYSTART); 2742 cpu_to_le16(XTENTRYSTART);
2751 2743
2752 /* XT_PUTPAGE(mp); */ 2744 /* XT_PUTPAGE(mp); */
2753 2745
2754 break; 2746 break;
2755 } else { 2747 } else {
2756 /* free the parent page */ 2748 /* free the parent page */
2757 if ((rc = xtRelink(tid, ip, p))) 2749 if ((rc = xtRelink(tid, ip, p)))
2758 return rc; 2750 return rc;
2759 2751
2760 xaddr = addressPXD(&p->header.self); 2752 xaddr = addressPXD(&p->header.self);
2761 /* free the page extent */ 2753 /* free the page extent */
2762 dbFree(ip, xaddr, 2754 dbFree(ip, xaddr,
2763 (s64) JFS_SBI(ip->i_sb)->nbperpage); 2755 (s64) JFS_SBI(ip->i_sb)->nbperpage);
2764 2756
2765 /* unpin/free the buffer page */ 2757 /* unpin/free the buffer page */
2766 discard_metapage(mp); 2758 discard_metapage(mp);
2767 2759
2768 /* propagate up */ 2760 /* propagate up */
2769 continue; 2761 continue;
2770 } 2762 }
2771 } 2763 }
2772 /* 2764 /*
2773 * the parent has other entries remaining: 2765 * the parent has other entries remaining:
2774 * delete the router entry from the parent page. 2766 * delete the router entry from the parent page.
2775 */ 2767 */
2776 else { 2768 else {
2777 BT_MARK_DIRTY(mp, ip); 2769 BT_MARK_DIRTY(mp, ip);
2778 /* 2770 /*
2779 * acquire a transaction lock on the leaf page; 2771 * acquire a transaction lock on the leaf page;
2780 * 2772 *
2781 * action:xad deletion; 2773 * action:xad deletion;
2782 */ 2774 */
2783 tlck = txLock(tid, ip, mp, tlckXTREE); 2775 tlck = txLock(tid, ip, mp, tlckXTREE);
2784 xtlck = (struct xtlock *) & tlck->lock; 2776 xtlck = (struct xtlock *) & tlck->lock;
2785 xtlck->lwm.offset = 2777 xtlck->lwm.offset =
2786 (xtlck->lwm.offset) ? min(index, 2778 (xtlck->lwm.offset) ? min(index,
2787 xtlck->lwm. 2779 xtlck->lwm.
2788 offset) : index; 2780 offset) : index;
2789 2781
2790 /* if delete from middle, 2782 /* if delete from middle,
2791 * shift left/compact the remaining entries in the page 2783 * shift left/compact the remaining entries in the page
2792 */ 2784 */
2793 if (index < nextindex - 1) 2785 if (index < nextindex - 1)
2794 memmove(&p->xad[index], &p->xad[index + 1], 2786 memmove(&p->xad[index], &p->xad[index + 1],
2795 (nextindex - index - 2787 (nextindex - index -
2796 1) << L2XTSLOTSIZE); 2788 1) << L2XTSLOTSIZE);
2797 2789
2798 p->header.nextindex = 2790 le16_add_cpu(&p->header.nextindex, -1);
2799 cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2800 1);
2801 jfs_info("xtDeleteUp(entry): 0x%lx[%d]", 2791 jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
2802 (ulong) parent->bn, index); 2792 (ulong) parent->bn, index);
2803 } 2793 }
2804 2794
2805 /* unpin the parent page */ 2795 /* unpin the parent page */
2806 XT_PUTPAGE(mp); 2796 XT_PUTPAGE(mp);
2807 2797
2808 /* exit propagation up */ 2798 /* exit propagation up */
2809 break; 2799 break;
2810 } 2800 }
2811 2801
2812 return 0; 2802 return 0;
2813 } 2803 }
2814 2804
2815 2805
2816 /* 2806 /*
2817 * NAME: xtRelocate() 2807 * NAME: xtRelocate()
2818 * 2808 *
2819 * FUNCTION: relocate xtpage or data extent of regular file; 2809 * FUNCTION: relocate xtpage or data extent of regular file;
2820 * This function is mainly used by defragfs utility. 2810 * This function is mainly used by defragfs utility.
2821 * 2811 *
2822 * NOTE: This routine does not have the logic to handle 2812 * NOTE: This routine does not have the logic to handle
2823 * uncommitted allocated extent. The caller should call 2813 * uncommitted allocated extent. The caller should call
2824 * txCommit() to commit all the allocation before call 2814 * txCommit() to commit all the allocation before call
2825 * this routine. 2815 * this routine.
2826 */ 2816 */
2827 int 2817 int
2828 xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ 2818 xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2829 s64 nxaddr, /* new xaddr */ 2819 s64 nxaddr, /* new xaddr */
2830 int xtype) 2820 int xtype)
2831 { /* extent type: XTPAGE or DATAEXT */ 2821 { /* extent type: XTPAGE or DATAEXT */
2832 int rc = 0; 2822 int rc = 0;
2833 struct tblock *tblk; 2823 struct tblock *tblk;
2834 struct tlock *tlck; 2824 struct tlock *tlck;
2835 struct xtlock *xtlck; 2825 struct xtlock *xtlck;
2836 struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ 2826 struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */
2837 xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ 2827 xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */
2838 xad_t *xad; 2828 xad_t *xad;
2839 pxd_t *pxd; 2829 pxd_t *pxd;
2840 s64 xoff, xsize; 2830 s64 xoff, xsize;
2841 int xlen; 2831 int xlen;
2842 s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; 2832 s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
2843 cbuf_t *cp; 2833 cbuf_t *cp;
2844 s64 offset, nbytes, nbrd, pno; 2834 s64 offset, nbytes, nbrd, pno;
2845 int nb, npages, nblks; 2835 int nb, npages, nblks;
2846 s64 bn; 2836 s64 bn;
2847 int cmp; 2837 int cmp;
2848 int index; 2838 int index;
2849 struct pxd_lock *pxdlock; 2839 struct pxd_lock *pxdlock;
2850 struct btstack btstack; /* traverse stack */ 2840 struct btstack btstack; /* traverse stack */
2851 2841
2852 xtype = xtype & EXTENT_TYPE; 2842 xtype = xtype & EXTENT_TYPE;
2853 2843
2854 xoff = offsetXAD(oxad); 2844 xoff = offsetXAD(oxad);
2855 oxaddr = addressXAD(oxad); 2845 oxaddr = addressXAD(oxad);
2856 xlen = lengthXAD(oxad); 2846 xlen = lengthXAD(oxad);
2857 2847
2858 /* validate extent offset */ 2848 /* validate extent offset */
2859 offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; 2849 offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
2860 if (offset >= ip->i_size) 2850 if (offset >= ip->i_size)
2861 return -ESTALE; /* stale extent */ 2851 return -ESTALE; /* stale extent */
2862 2852
2863 jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", 2853 jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx",
2864 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); 2854 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
2865 2855
2866 /* 2856 /*
2867 * 1. get and validate the parent xtpage/xad entry 2857 * 1. get and validate the parent xtpage/xad entry
2868 * covering the source extent to be relocated; 2858 * covering the source extent to be relocated;
2869 */ 2859 */
2870 if (xtype == DATAEXT) { 2860 if (xtype == DATAEXT) {
2871 /* search in leaf entry */ 2861 /* search in leaf entry */
2872 rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); 2862 rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
2873 if (rc) 2863 if (rc)
2874 return rc; 2864 return rc;
2875 2865
2876 /* retrieve search result */ 2866 /* retrieve search result */
2877 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); 2867 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2878 2868
2879 if (cmp) { 2869 if (cmp) {
2880 XT_PUTPAGE(pmp); 2870 XT_PUTPAGE(pmp);
2881 return -ESTALE; 2871 return -ESTALE;
2882 } 2872 }
2883 2873
2884 /* validate for exact match with a single entry */ 2874 /* validate for exact match with a single entry */
2885 xad = &pp->xad[index]; 2875 xad = &pp->xad[index];
2886 if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { 2876 if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
2887 XT_PUTPAGE(pmp); 2877 XT_PUTPAGE(pmp);
2888 return -ESTALE; 2878 return -ESTALE;
2889 } 2879 }
2890 } else { /* (xtype == XTPAGE) */ 2880 } else { /* (xtype == XTPAGE) */
2891 2881
2892 /* search in internal entry */ 2882 /* search in internal entry */
2893 rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); 2883 rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
2894 if (rc) 2884 if (rc)
2895 return rc; 2885 return rc;
2896 2886
2897 /* retrieve search result */ 2887 /* retrieve search result */
2898 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); 2888 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2899 2889
2900 if (cmp) { 2890 if (cmp) {
2901 XT_PUTPAGE(pmp); 2891 XT_PUTPAGE(pmp);
2902 return -ESTALE; 2892 return -ESTALE;
2903 } 2893 }
2904 2894
2905 /* xtSearchNode() validated for exact match with a single entry 2895 /* xtSearchNode() validated for exact match with a single entry
2906 */ 2896 */
2907 xad = &pp->xad[index]; 2897 xad = &pp->xad[index];
2908 } 2898 }
2909 jfs_info("xtRelocate: parent xad entry validated."); 2899 jfs_info("xtRelocate: parent xad entry validated.");
2910 2900
2911 /* 2901 /*
2912 * 2. relocate the extent 2902 * 2. relocate the extent
2913 */ 2903 */
2914 if (xtype == DATAEXT) { 2904 if (xtype == DATAEXT) {
2915 /* if the extent is allocated-but-not-recorded 2905 /* if the extent is allocated-but-not-recorded
2916 * there is no real data to be moved in this extent, 2906 * there is no real data to be moved in this extent,
2917 */ 2907 */
2918 if (xad->flag & XAD_NOTRECORDED) 2908 if (xad->flag & XAD_NOTRECORDED)
2919 goto out; 2909 goto out;
2920 else 2910 else
2921 /* release xtpage for cmRead()/xtLookup() */ 2911 /* release xtpage for cmRead()/xtLookup() */
2922 XT_PUTPAGE(pmp); 2912 XT_PUTPAGE(pmp);
2923 2913
2924 /* 2914 /*
2925 * cmRelocate() 2915 * cmRelocate()
2926 * 2916 *
2927 * copy target data pages to be relocated; 2917 * copy target data pages to be relocated;
2928 * 2918 *
2929 * data extent must start at page boundary and 2919 * data extent must start at page boundary and
2930 * multiple of page size (except the last data extent); 2920 * multiple of page size (except the last data extent);
2931 * read in each page of the source data extent into cbuf, 2921 * read in each page of the source data extent into cbuf,
2932 * update the cbuf extent descriptor of the page to be 2922 * update the cbuf extent descriptor of the page to be
2933 * homeward bound to new dst data extent 2923 * homeward bound to new dst data extent
2934 * copy the data from the old extent to new extent. 2924 * copy the data from the old extent to new extent.
2935 * copy is essential for compressed files to avoid problems 2925 * copy is essential for compressed files to avoid problems
2936 * that can arise if there was a change in compression 2926 * that can arise if there was a change in compression
2937 * algorithms. 2927 * algorithms.
2938 * it is a good strategy because it may disrupt cache 2928 * it is a good strategy because it may disrupt cache
2939 * policy to keep the pages in memory afterwards. 2929 * policy to keep the pages in memory afterwards.
2940 */ 2930 */
2941 offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; 2931 offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
2942 assert((offset & CM_OFFSET) == 0); 2932 assert((offset & CM_OFFSET) == 0);
2943 nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; 2933 nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2944 pno = offset >> CM_L2BSIZE; 2934 pno = offset >> CM_L2BSIZE;
2945 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; 2935 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
2946 /* 2936 /*
2947 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - 2937 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
2948 (offset >> CM_L2BSIZE) + 1; 2938 (offset >> CM_L2BSIZE) + 1;
2949 */ 2939 */
2950 sxaddr = oxaddr; 2940 sxaddr = oxaddr;
2951 dxaddr = nxaddr; 2941 dxaddr = nxaddr;
2952 2942
2953 /* process the request one cache buffer at a time */ 2943 /* process the request one cache buffer at a time */
2954 for (nbrd = 0; nbrd < nbytes; nbrd += nb, 2944 for (nbrd = 0; nbrd < nbytes; nbrd += nb,
2955 offset += nb, pno++, npages--) { 2945 offset += nb, pno++, npages--) {
2956 /* compute page size */ 2946 /* compute page size */
2957 nb = min(nbytes - nbrd, CM_BSIZE); 2947 nb = min(nbytes - nbrd, CM_BSIZE);
2958 2948
2959 /* get the cache buffer of the page */ 2949 /* get the cache buffer of the page */
2960 if (rc = cmRead(ip, offset, npages, &cp)) 2950 if (rc = cmRead(ip, offset, npages, &cp))
2961 break; 2951 break;
2962 2952
2963 assert(addressPXD(&cp->cm_pxd) == sxaddr); 2953 assert(addressPXD(&cp->cm_pxd) == sxaddr);
2964 assert(!cp->cm_modified); 2954 assert(!cp->cm_modified);
2965 2955
2966 /* bind buffer with the new extent address */ 2956 /* bind buffer with the new extent address */
2967 nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; 2957 nblks = nb >> JFS_IP(ip->i_sb)->l2bsize;
2968 cmSetXD(ip, cp, pno, dxaddr, nblks); 2958 cmSetXD(ip, cp, pno, dxaddr, nblks);
2969 2959
2970 /* release the cbuf, mark it as modified */ 2960 /* release the cbuf, mark it as modified */
2971 cmPut(cp, true); 2961 cmPut(cp, true);
2972 2962
2973 dxaddr += nblks; 2963 dxaddr += nblks;
2974 sxaddr += nblks; 2964 sxaddr += nblks;
2975 } 2965 }
2976 2966
2977 /* get back parent page */ 2967 /* get back parent page */
2978 if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) 2968 if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
2979 return rc; 2969 return rc;
2980 2970
2981 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); 2971 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2982 jfs_info("xtRelocate: target data extent relocated."); 2972 jfs_info("xtRelocate: target data extent relocated.");
2983 } else { /* (xtype == XTPAGE) */ 2973 } else { /* (xtype == XTPAGE) */
2984 2974
2985 /* 2975 /*
2986 * read in the target xtpage from the source extent; 2976 * read in the target xtpage from the source extent;
2987 */ 2977 */
2988 XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); 2978 XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2989 if (rc) { 2979 if (rc) {
2990 XT_PUTPAGE(pmp); 2980 XT_PUTPAGE(pmp);
2991 return rc; 2981 return rc;
2992 } 2982 }
2993 2983
2994 /* 2984 /*
2995 * read in sibling pages if any to update sibling pointers; 2985 * read in sibling pages if any to update sibling pointers;
2996 */ 2986 */
2997 rmp = NULL; 2987 rmp = NULL;
2998 if (p->header.next) { 2988 if (p->header.next) {
2999 nextbn = le64_to_cpu(p->header.next); 2989 nextbn = le64_to_cpu(p->header.next);
3000 XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); 2990 XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
3001 if (rc) { 2991 if (rc) {
3002 XT_PUTPAGE(pmp); 2992 XT_PUTPAGE(pmp);
3003 XT_PUTPAGE(mp); 2993 XT_PUTPAGE(mp);
3004 return (rc); 2994 return (rc);
3005 } 2995 }
3006 } 2996 }
3007 2997
3008 lmp = NULL; 2998 lmp = NULL;
3009 if (p->header.prev) { 2999 if (p->header.prev) {
3010 prevbn = le64_to_cpu(p->header.prev); 3000 prevbn = le64_to_cpu(p->header.prev);
3011 XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); 3001 XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
3012 if (rc) { 3002 if (rc) {
3013 XT_PUTPAGE(pmp); 3003 XT_PUTPAGE(pmp);
3014 XT_PUTPAGE(mp); 3004 XT_PUTPAGE(mp);
3015 if (rmp) 3005 if (rmp)
3016 XT_PUTPAGE(rmp); 3006 XT_PUTPAGE(rmp);
3017 return (rc); 3007 return (rc);
3018 } 3008 }
3019 } 3009 }
3020 3010
3021 /* at this point, all xtpages to be updated are in memory */ 3011 /* at this point, all xtpages to be updated are in memory */
3022 3012
3023 /* 3013 /*
3024 * update sibling pointers of sibling xtpages if any; 3014 * update sibling pointers of sibling xtpages if any;
3025 */ 3015 */
3026 if (lmp) { 3016 if (lmp) {
3027 BT_MARK_DIRTY(lmp, ip); 3017 BT_MARK_DIRTY(lmp, ip);
3028 tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); 3018 tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
3029 lp->header.next = cpu_to_le64(nxaddr); 3019 lp->header.next = cpu_to_le64(nxaddr);
3030 XT_PUTPAGE(lmp); 3020 XT_PUTPAGE(lmp);
3031 } 3021 }
3032 3022
3033 if (rmp) { 3023 if (rmp) {
3034 BT_MARK_DIRTY(rmp, ip); 3024 BT_MARK_DIRTY(rmp, ip);
3035 tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); 3025 tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
3036 rp->header.prev = cpu_to_le64(nxaddr); 3026 rp->header.prev = cpu_to_le64(nxaddr);
3037 XT_PUTPAGE(rmp); 3027 XT_PUTPAGE(rmp);
3038 } 3028 }
3039 3029
3040 /* 3030 /*
3041 * update the target xtpage to be relocated 3031 * update the target xtpage to be relocated
3042 * 3032 *
3043 * update the self address of the target page 3033 * update the self address of the target page
3044 * and write to destination extent; 3034 * and write to destination extent;
3045 * redo image covers the whole xtpage since it is new page 3035 * redo image covers the whole xtpage since it is new page
3046 * to the destination extent; 3036 * to the destination extent;
3047 * update of bmap for the free of source extent 3037 * update of bmap for the free of source extent
3048 * of the target xtpage itself: 3038 * of the target xtpage itself:
3049 * update of bmap for the allocation of destination extent 3039 * update of bmap for the allocation of destination extent
3050 * of the target xtpage itself: 3040 * of the target xtpage itself:
3051 * update of bmap for the extents covered by xad entries in 3041 * update of bmap for the extents covered by xad entries in
3052 * the target xtpage is not necessary since they are not 3042 * the target xtpage is not necessary since they are not
3053 * updated; 3043 * updated;
3054 * if not committed before this relocation, 3044 * if not committed before this relocation,
3055 * target page may contain XAD_NEW entries which must 3045 * target page may contain XAD_NEW entries which must
3056 * be scanned for bmap update (logredo() always 3046 * be scanned for bmap update (logredo() always
3057 * scan xtpage REDOPAGE image for bmap update); 3047 * scan xtpage REDOPAGE image for bmap update);
3058 * if committed before this relocation (tlckRELOCATE), 3048 * if committed before this relocation (tlckRELOCATE),
3059 * scan may be skipped by commit() and logredo(); 3049 * scan may be skipped by commit() and logredo();
3060 */ 3050 */
3061 BT_MARK_DIRTY(mp, ip); 3051 BT_MARK_DIRTY(mp, ip);
3062 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ 3052 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
3063 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); 3053 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
3064 xtlck = (struct xtlock *) & tlck->lock; 3054 xtlck = (struct xtlock *) & tlck->lock;
3065 3055
3066 /* update the self address in the xtpage header */ 3056 /* update the self address in the xtpage header */
3067 pxd = &p->header.self; 3057 pxd = &p->header.self;
3068 PXDaddress(pxd, nxaddr); 3058 PXDaddress(pxd, nxaddr);
3069 3059
3070 /* linelock for the after image of the whole page */ 3060 /* linelock for the after image of the whole page */
3071 xtlck->lwm.length = 3061 xtlck->lwm.length =
3072 le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; 3062 le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
3073 3063
3074 /* update the buffer extent descriptor of target xtpage */ 3064 /* update the buffer extent descriptor of target xtpage */
3075 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; 3065 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
3076 bmSetXD(mp, nxaddr, xsize); 3066 bmSetXD(mp, nxaddr, xsize);
3077 3067
3078 /* unpin the target page to new homeward bound */ 3068 /* unpin the target page to new homeward bound */
3079 XT_PUTPAGE(mp); 3069 XT_PUTPAGE(mp);
3080 jfs_info("xtRelocate: target xtpage relocated."); 3070 jfs_info("xtRelocate: target xtpage relocated.");
3081 } 3071 }
3082 3072
3083 /* 3073 /*
3084 * 3. acquire maplock for the source extent to be freed; 3074 * 3. acquire maplock for the source extent to be freed;
3085 * 3075 *
3086 * acquire a maplock saving the src relocated extent address; 3076 * acquire a maplock saving the src relocated extent address;
3087 * to free of the extent at commit time; 3077 * to free of the extent at commit time;
3088 */ 3078 */
3089 out: 3079 out:
3090 /* if DATAEXT relocation, write a LOG_UPDATEMAP record for 3080 /* if DATAEXT relocation, write a LOG_UPDATEMAP record for
3091 * free PXD of the source data extent (logredo() will update 3081 * free PXD of the source data extent (logredo() will update
3092 * bmap for free of source data extent), and update bmap for 3082 * bmap for free of source data extent), and update bmap for
3093 * free of the source data extent; 3083 * free of the source data extent;
3094 */ 3084 */
3095 if (xtype == DATAEXT) 3085 if (xtype == DATAEXT)
3096 tlck = txMaplock(tid, ip, tlckMAP); 3086 tlck = txMaplock(tid, ip, tlckMAP);
3097 /* if XTPAGE relocation, write a LOG_NOREDOPAGE record 3087 /* if XTPAGE relocation, write a LOG_NOREDOPAGE record
3098 * for the source xtpage (logredo() will init NoRedoPage 3088 * for the source xtpage (logredo() will init NoRedoPage
3099 * filter and will also update bmap for free of the source 3089 * filter and will also update bmap for free of the source
3100 * xtpage), and update bmap for free of the source xtpage; 3090 * xtpage), and update bmap for free of the source xtpage;
3101 * N.B. We use tlckMAP instead of tlkcXTREE because there 3091 * N.B. We use tlckMAP instead of tlkcXTREE because there
3102 * is no buffer associated with this lock since the buffer 3092 * is no buffer associated with this lock since the buffer
3103 * has been redirected to the target location. 3093 * has been redirected to the target location.
3104 */ 3094 */
3105 else /* (xtype == XTPAGE) */ 3095 else /* (xtype == XTPAGE) */
3106 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); 3096 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
3107 3097
3108 pxdlock = (struct pxd_lock *) & tlck->lock; 3098 pxdlock = (struct pxd_lock *) & tlck->lock;
3109 pxdlock->flag = mlckFREEPXD; 3099 pxdlock->flag = mlckFREEPXD;
3110 PXDaddress(&pxdlock->pxd, oxaddr); 3100 PXDaddress(&pxdlock->pxd, oxaddr);
3111 PXDlength(&pxdlock->pxd, xlen); 3101 PXDlength(&pxdlock->pxd, xlen);
3112 pxdlock->index = 1; 3102 pxdlock->index = 1;
3113 3103
3114 /* 3104 /*
3115 * 4. update the parent xad entry for relocation; 3105 * 4. update the parent xad entry for relocation;
3116 * 3106 *
3117 * acquire tlck for the parent entry with XAD_NEW as entry 3107 * acquire tlck for the parent entry with XAD_NEW as entry
3118 * update which will write LOG_REDOPAGE and update bmap for 3108 * update which will write LOG_REDOPAGE and update bmap for
3119 * allocation of XAD_NEW destination extent; 3109 * allocation of XAD_NEW destination extent;
3120 */ 3110 */
3121 jfs_info("xtRelocate: update parent xad entry."); 3111 jfs_info("xtRelocate: update parent xad entry.");
3122 BT_MARK_DIRTY(pmp, ip); 3112 BT_MARK_DIRTY(pmp, ip);
3123 tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); 3113 tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
3124 xtlck = (struct xtlock *) & tlck->lock; 3114 xtlck = (struct xtlock *) & tlck->lock;
3125 3115
3126 /* update the XAD with the new destination extent; */ 3116 /* update the XAD with the new destination extent; */
3127 xad = &pp->xad[index]; 3117 xad = &pp->xad[index];
3128 xad->flag |= XAD_NEW; 3118 xad->flag |= XAD_NEW;
3129 XADaddress(xad, nxaddr); 3119 XADaddress(xad, nxaddr);
3130 3120
3131 xtlck->lwm.offset = min(index, xtlck->lwm.offset); 3121 xtlck->lwm.offset = min(index, xtlck->lwm.offset);
3132 xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - 3122 xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
3133 xtlck->lwm.offset; 3123 xtlck->lwm.offset;
3134 3124
3135 /* unpin the parent xtpage */ 3125 /* unpin the parent xtpage */
3136 XT_PUTPAGE(pmp); 3126 XT_PUTPAGE(pmp);
3137 3127
3138 return rc; 3128 return rc;
3139 } 3129 }
3140 3130
3141 3131
3142 /* 3132 /*
3143 * xtSearchNode() 3133 * xtSearchNode()
3144 * 3134 *
3145 * function: search for the internal xad entry covering specified extent. 3135 * function: search for the internal xad entry covering specified extent.
3146 * This function is mainly used by defragfs utility. 3136 * This function is mainly used by defragfs utility.
3147 * 3137 *
3148 * parameters: 3138 * parameters:
3149 * ip - file object; 3139 * ip - file object;
3150 * xad - extent to find; 3140 * xad - extent to find;
3151 * cmpp - comparison result: 3141 * cmpp - comparison result:
3152 * btstack - traverse stack; 3142 * btstack - traverse stack;
3153 * flag - search process flag; 3143 * flag - search process flag;
3154 * 3144 *
3155 * returns: 3145 * returns:
3156 * btstack contains (bn, index) of search path traversed to the entry. 3146 * btstack contains (bn, index) of search path traversed to the entry.
3157 * *cmpp is set to result of comparison with the entry returned. 3147 * *cmpp is set to result of comparison with the entry returned.
3158 * the page containing the entry is pinned at exit. 3148 * the page containing the entry is pinned at exit.
3159 */ 3149 */
3160 static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ 3150 static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3161 int *cmpp, struct btstack * btstack, int flag) 3151 int *cmpp, struct btstack * btstack, int flag)
3162 { 3152 {
3163 int rc = 0; 3153 int rc = 0;
3164 s64 xoff, xaddr; 3154 s64 xoff, xaddr;
3165 int xlen; 3155 int xlen;
3166 int cmp = 1; /* init for empty page */ 3156 int cmp = 1; /* init for empty page */
3167 s64 bn; /* block number */ 3157 s64 bn; /* block number */
3168 struct metapage *mp; /* meta-page buffer */ 3158 struct metapage *mp; /* meta-page buffer */
3169 xtpage_t *p; /* page */ 3159 xtpage_t *p; /* page */
3170 int base, index, lim; 3160 int base, index, lim;
3171 struct btframe *btsp; 3161 struct btframe *btsp;
3172 s64 t64; 3162 s64 t64;
3173 3163
3174 BT_CLR(btstack); 3164 BT_CLR(btstack);
3175 3165
3176 xoff = offsetXAD(xad); 3166 xoff = offsetXAD(xad);
3177 xlen = lengthXAD(xad); 3167 xlen = lengthXAD(xad);
3178 xaddr = addressXAD(xad); 3168 xaddr = addressXAD(xad);
3179 3169
3180 /* 3170 /*
3181 * search down tree from root: 3171 * search down tree from root:
3182 * 3172 *
3183 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 3173 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
3184 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 3174 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
3185 * 3175 *
3186 * if entry with search key K is not found 3176 * if entry with search key K is not found
3187 * internal page search find the entry with largest key Ki 3177 * internal page search find the entry with largest key Ki
3188 * less than K which point to the child page to search; 3178 * less than K which point to the child page to search;
3189 * leaf page search find the entry with smallest key Kj 3179 * leaf page search find the entry with smallest key Kj
3190 * greater than K so that the returned index is the position of 3180 * greater than K so that the returned index is the position of
3191 * the entry to be shifted right for insertion of new entry. 3181 * the entry to be shifted right for insertion of new entry.
3192 * for empty tree, search key is greater than any key of the tree. 3182 * for empty tree, search key is greater than any key of the tree.
3193 * 3183 *
3194 * by convention, root bn = 0. 3184 * by convention, root bn = 0.
3195 */ 3185 */
3196 for (bn = 0;;) { 3186 for (bn = 0;;) {
3197 /* get/pin the page to search */ 3187 /* get/pin the page to search */
3198 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 3188 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3199 if (rc) 3189 if (rc)
3200 return rc; 3190 return rc;
3201 if (p->header.flag & BT_LEAF) { 3191 if (p->header.flag & BT_LEAF) {
3202 XT_PUTPAGE(mp); 3192 XT_PUTPAGE(mp);
3203 return -ESTALE; 3193 return -ESTALE;
3204 } 3194 }
3205 3195
3206 lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; 3196 lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
3207 3197
3208 /* 3198 /*
3209 * binary search with search key K on the current page 3199 * binary search with search key K on the current page
3210 */ 3200 */
3211 for (base = XTENTRYSTART; lim; lim >>= 1) { 3201 for (base = XTENTRYSTART; lim; lim >>= 1) {
3212 index = base + (lim >> 1); 3202 index = base + (lim >> 1);
3213 3203
3214 XT_CMP(cmp, xoff, &p->xad[index], t64); 3204 XT_CMP(cmp, xoff, &p->xad[index], t64);
3215 if (cmp == 0) { 3205 if (cmp == 0) {
3216 /* 3206 /*
3217 * search hit 3207 * search hit
3218 * 3208 *
3219 * verify for exact match; 3209 * verify for exact match;
3220 */ 3210 */
3221 if (xaddr == addressXAD(&p->xad[index]) && 3211 if (xaddr == addressXAD(&p->xad[index]) &&
3222 xoff == offsetXAD(&p->xad[index])) { 3212 xoff == offsetXAD(&p->xad[index])) {
3223 *cmpp = cmp; 3213 *cmpp = cmp;
3224 3214
3225 /* save search result */ 3215 /* save search result */
3226 btsp = btstack->top; 3216 btsp = btstack->top;
3227 btsp->bn = bn; 3217 btsp->bn = bn;
3228 btsp->index = index; 3218 btsp->index = index;
3229 btsp->mp = mp; 3219 btsp->mp = mp;
3230 3220
3231 return 0; 3221 return 0;
3232 } 3222 }
3233 3223
3234 /* descend/search its child page */ 3224 /* descend/search its child page */
3235 goto next; 3225 goto next;
3236 } 3226 }
3237 3227
3238 if (cmp > 0) { 3228 if (cmp > 0) {
3239 base = index + 1; 3229 base = index + 1;
3240 --lim; 3230 --lim;
3241 } 3231 }
3242 } 3232 }
3243 3233
3244 /* 3234 /*
3245 * search miss - non-leaf page: 3235 * search miss - non-leaf page:
3246 * 3236 *
3247 * base is the smallest index with key (Kj) greater than 3237 * base is the smallest index with key (Kj) greater than
3248 * search key (K) and may be zero or maxentry index. 3238 * search key (K) and may be zero or maxentry index.
3249 * if base is non-zero, decrement base by one to get the parent 3239 * if base is non-zero, decrement base by one to get the parent
3250 * entry of the child page to search. 3240 * entry of the child page to search.
3251 */ 3241 */
3252 index = base ? base - 1 : base; 3242 index = base ? base - 1 : base;
3253 3243
3254 /* 3244 /*
3255 * go down to child page 3245 * go down to child page
3256 */ 3246 */
3257 next: 3247 next:
3258 /* get the child page block number */ 3248 /* get the child page block number */
3259 bn = addressXAD(&p->xad[index]); 3249 bn = addressXAD(&p->xad[index]);
3260 3250
3261 /* unpin the parent page */ 3251 /* unpin the parent page */
3262 XT_PUTPAGE(mp); 3252 XT_PUTPAGE(mp);
3263 } 3253 }
3264 } 3254 }
3265 3255
3266 3256
3267 /* 3257 /*
3268 * xtRelink() 3258 * xtRelink()
3269 * 3259 *
3270 * function: 3260 * function:
3271 * link around a freed page. 3261 * link around a freed page.
3272 * 3262 *
3273 * Parameter: 3263 * Parameter:
3274 * int tid, 3264 * int tid,
3275 * struct inode *ip, 3265 * struct inode *ip,
3276 * xtpage_t *p) 3266 * xtpage_t *p)
3277 * 3267 *
3278 * returns: 3268 * returns:
3279 */ 3269 */
3280 static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) 3270 static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
3281 { 3271 {
3282 int rc = 0; 3272 int rc = 0;
3283 struct metapage *mp; 3273 struct metapage *mp;
3284 s64 nextbn, prevbn; 3274 s64 nextbn, prevbn;
3285 struct tlock *tlck; 3275 struct tlock *tlck;
3286 3276
3287 nextbn = le64_to_cpu(p->header.next); 3277 nextbn = le64_to_cpu(p->header.next);
3288 prevbn = le64_to_cpu(p->header.prev); 3278 prevbn = le64_to_cpu(p->header.prev);
3289 3279
3290 /* update prev pointer of the next page */ 3280 /* update prev pointer of the next page */
3291 if (nextbn != 0) { 3281 if (nextbn != 0) {
3292 XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); 3282 XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
3293 if (rc) 3283 if (rc)
3294 return rc; 3284 return rc;
3295 3285
3296 /* 3286 /*
3297 * acquire a transaction lock on the page; 3287 * acquire a transaction lock on the page;
3298 * 3288 *
3299 * action: update prev pointer; 3289 * action: update prev pointer;
3300 */ 3290 */
3301 BT_MARK_DIRTY(mp, ip); 3291 BT_MARK_DIRTY(mp, ip);
3302 tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); 3292 tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
3303 3293
3304 /* the page may already have been tlock'd */ 3294 /* the page may already have been tlock'd */
3305 3295
3306 p->header.prev = cpu_to_le64(prevbn); 3296 p->header.prev = cpu_to_le64(prevbn);
3307 3297
3308 XT_PUTPAGE(mp); 3298 XT_PUTPAGE(mp);
3309 } 3299 }
3310 3300
3311 /* update next pointer of the previous page */ 3301 /* update next pointer of the previous page */
3312 if (prevbn != 0) { 3302 if (prevbn != 0) {
3313 XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); 3303 XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
3314 if (rc) 3304 if (rc)
3315 return rc; 3305 return rc;
3316 3306
3317 /* 3307 /*
3318 * acquire a transaction lock on the page; 3308 * acquire a transaction lock on the page;
3319 * 3309 *
3320 * action: update next pointer; 3310 * action: update next pointer;
3321 */ 3311 */
3322 BT_MARK_DIRTY(mp, ip); 3312 BT_MARK_DIRTY(mp, ip);
3323 tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); 3313 tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
3324 3314
3325 /* the page may already have been tlock'd */ 3315 /* the page may already have been tlock'd */
3326 3316
3327 p->header.next = le64_to_cpu(nextbn); 3317 p->header.next = le64_to_cpu(nextbn);
3328 3318
3329 XT_PUTPAGE(mp); 3319 XT_PUTPAGE(mp);
3330 } 3320 }
3331 3321
3332 return 0; 3322 return 0;
3333 } 3323 }
3334 #endif /* _STILL_TO_PORT */ 3324 #endif /* _STILL_TO_PORT */
3335 3325
3336 3326
3337 /* 3327 /*
3338 * xtInitRoot() 3328 * xtInitRoot()
3339 * 3329 *
3340 * initialize file root (inline in inode) 3330 * initialize file root (inline in inode)
3341 */ 3331 */
3342 void xtInitRoot(tid_t tid, struct inode *ip) 3332 void xtInitRoot(tid_t tid, struct inode *ip)
3343 { 3333 {
3344 xtpage_t *p; 3334 xtpage_t *p;
3345 3335
3346 /* 3336 /*
3347 * acquire a transaction lock on the root 3337 * acquire a transaction lock on the root
3348 * 3338 *
3349 * action: 3339 * action:
3350 */ 3340 */
3351 txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag, 3341 txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag,
3352 tlckXTREE | tlckNEW); 3342 tlckXTREE | tlckNEW);
3353 p = &JFS_IP(ip)->i_xtroot; 3343 p = &JFS_IP(ip)->i_xtroot;
3354 3344
3355 p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; 3345 p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
3356 p->header.nextindex = cpu_to_le16(XTENTRYSTART); 3346 p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3357 3347
3358 if (S_ISDIR(ip->i_mode)) 3348 if (S_ISDIR(ip->i_mode))
3359 p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); 3349 p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR);
3360 else { 3350 else {
3361 p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); 3351 p->header.maxentry = cpu_to_le16(XTROOTINITSLOT);
3362 ip->i_size = 0; 3352 ip->i_size = 0;
3363 } 3353 }
3364 3354
3365 3355
3366 return; 3356 return;
3367 } 3357 }
3368 3358
3369 3359
3370 /* 3360 /*
3371 * We can run into a deadlock truncating a file with a large number of 3361 * We can run into a deadlock truncating a file with a large number of
3372 * xtree pages (large fragmented file). A robust fix would entail a 3362 * xtree pages (large fragmented file). A robust fix would entail a
3373 * reservation system where we would reserve a number of metadata pages 3363 * reservation system where we would reserve a number of metadata pages
3374 * and tlocks which we would be guaranteed without a deadlock. Without 3364 * and tlocks which we would be guaranteed without a deadlock. Without
3375 * this, a partial fix is to limit number of metadata pages we will lock 3365 * this, a partial fix is to limit number of metadata pages we will lock
3376 * in a single transaction. Currently we will truncate the file so that 3366 * in a single transaction. Currently we will truncate the file so that
3377 * no more than 50 leaf pages will be locked. The caller of xtTruncate 3367 * no more than 50 leaf pages will be locked. The caller of xtTruncate
3378 * will be responsible for ensuring that the current transaction gets 3368 * will be responsible for ensuring that the current transaction gets
3379 * committed, and that subsequent transactions are created to truncate 3369 * committed, and that subsequent transactions are created to truncate
3380 * the file further if needed. 3370 * the file further if needed.
3381 */ 3371 */
3382 #define MAX_TRUNCATE_LEAVES 50 3372 #define MAX_TRUNCATE_LEAVES 50
3383 3373
3384 /* 3374 /*
3385 * xtTruncate() 3375 * xtTruncate()
3386 * 3376 *
3387 * function: 3377 * function:
3388 * traverse for truncation logging backward bottom up; 3378 * traverse for truncation logging backward bottom up;
3389 * terminate at the last extent entry at the current subtree 3379 * terminate at the last extent entry at the current subtree
3390 * root page covering new down size. 3380 * root page covering new down size.
3391 * truncation may occur within the last extent entry. 3381 * truncation may occur within the last extent entry.
3392 * 3382 *
3393 * parameter: 3383 * parameter:
3394 * int tid, 3384 * int tid,
3395 * struct inode *ip, 3385 * struct inode *ip,
3396 * s64 newsize, 3386 * s64 newsize,
3397 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} 3387 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
3398 * 3388 *
3399 * return: 3389 * return:
3400 * 3390 *
3401 * note: 3391 * note:
3402 * PWMAP: 3392 * PWMAP:
3403 * 1. truncate (non-COMMIT_NOLINK file) 3393 * 1. truncate (non-COMMIT_NOLINK file)
3404 * by jfs_truncate() or jfs_open(O_TRUNC): 3394 * by jfs_truncate() or jfs_open(O_TRUNC):
3405 * xtree is updated; 3395 * xtree is updated;
3406 * 2. truncate index table of directory when last entry removed 3396 * 2. truncate index table of directory when last entry removed
3407 * map update via tlock at commit time; 3397 * map update via tlock at commit time;
3408 * PMAP: 3398 * PMAP:
3409 * Call xtTruncate_pmap instead 3399 * Call xtTruncate_pmap instead
3410 * WMAP: 3400 * WMAP:
3411 * 1. remove (free zero link count) on last reference release 3401 * 1. remove (free zero link count) on last reference release
3412 * (pmap has been freed at commit zero link count); 3402 * (pmap has been freed at commit zero link count);
3413 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): 3403 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file):
3414 * xtree is updated; 3404 * xtree is updated;
3415 * map update directly at truncation time; 3405 * map update directly at truncation time;
3416 * 3406 *
3417 * if (DELETE) 3407 * if (DELETE)
3418 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); 3408 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
3419 * else if (TRUNCATE) 3409 * else if (TRUNCATE)
3420 * must write LOG_NOREDOPAGE for deleted index page; 3410 * must write LOG_NOREDOPAGE for deleted index page;
3421 * 3411 *
3422 * pages may already have been tlocked by anonymous transactions 3412 * pages may already have been tlocked by anonymous transactions
3423 * during file growth (i.e., write) before truncation; 3413 * during file growth (i.e., write) before truncation;
3424 * 3414 *
3425 * except last truncated entry, deleted entries remains as is 3415 * except last truncated entry, deleted entries remains as is
3426 * in the page (nextindex is updated) for other use 3416 * in the page (nextindex is updated) for other use
3427 * (e.g., log/update allocation map): this avoid copying the page 3417 * (e.g., log/update allocation map): this avoid copying the page
3428 * info but delay free of pages; 3418 * info but delay free of pages;
3429 * 3419 *
3430 */ 3420 */
3431 s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) 3421 s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3432 { 3422 {
3433 int rc = 0; 3423 int rc = 0;
3434 s64 teof; 3424 s64 teof;
3435 struct metapage *mp; 3425 struct metapage *mp;
3436 xtpage_t *p; 3426 xtpage_t *p;
3437 s64 bn; 3427 s64 bn;
3438 int index, nextindex; 3428 int index, nextindex;
3439 xad_t *xad; 3429 xad_t *xad;
3440 s64 xoff, xaddr; 3430 s64 xoff, xaddr;
3441 int xlen, len, freexlen; 3431 int xlen, len, freexlen;
3442 struct btstack btstack; 3432 struct btstack btstack;
3443 struct btframe *parent; 3433 struct btframe *parent;
3444 struct tblock *tblk = NULL; 3434 struct tblock *tblk = NULL;
3445 struct tlock *tlck = NULL; 3435 struct tlock *tlck = NULL;
3446 struct xtlock *xtlck = NULL; 3436 struct xtlock *xtlck = NULL;
3447 struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */ 3437 struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */
3448 struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ 3438 struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */
3449 s64 nfreed; 3439 s64 nfreed;
3450 int freed, log; 3440 int freed, log;
3451 int locked_leaves = 0; 3441 int locked_leaves = 0;
3452 3442
3453 /* save object truncation type */ 3443 /* save object truncation type */
3454 if (tid) { 3444 if (tid) {
3455 tblk = tid_to_tblock(tid); 3445 tblk = tid_to_tblock(tid);
3456 tblk->xflag |= flag; 3446 tblk->xflag |= flag;
3457 } 3447 }
3458 3448
3459 nfreed = 0; 3449 nfreed = 0;
3460 3450
3461 flag &= COMMIT_MAP; 3451 flag &= COMMIT_MAP;
3462 assert(flag != COMMIT_PMAP); 3452 assert(flag != COMMIT_PMAP);
3463 3453
3464 if (flag == COMMIT_PWMAP) 3454 if (flag == COMMIT_PWMAP)
3465 log = 1; 3455 log = 1;
3466 else { 3456 else {
3467 log = 0; 3457 log = 0;
3468 xadlock.flag = mlckFREEXADLIST; 3458 xadlock.flag = mlckFREEXADLIST;
3469 xadlock.index = 1; 3459 xadlock.index = 1;
3470 } 3460 }
3471 3461
3472 /* 3462 /*
3473 * if the newsize is not an integral number of pages, 3463 * if the newsize is not an integral number of pages,
3474 * the file between newsize and next page boundary will 3464 * the file between newsize and next page boundary will
3475 * be cleared. 3465 * be cleared.
3476 * if truncating into a file hole, it will cause 3466 * if truncating into a file hole, it will cause
3477 * a full block to be allocated for the logical block. 3467 * a full block to be allocated for the logical block.
3478 */ 3468 */
3479 3469
3480 /* 3470 /*
3481 * release page blocks of truncated region <teof, eof> 3471 * release page blocks of truncated region <teof, eof>
3482 * 3472 *
3483 * free the data blocks from the leaf index blocks. 3473 * free the data blocks from the leaf index blocks.
3484 * delete the parent index entries corresponding to 3474 * delete the parent index entries corresponding to
3485 * the freed child data/index blocks. 3475 * the freed child data/index blocks.
3486 * free the index blocks themselves which aren't needed 3476 * free the index blocks themselves which aren't needed
3487 * in new sized file. 3477 * in new sized file.
3488 * 3478 *
3489 * index blocks are updated only if the blocks are to be 3479 * index blocks are updated only if the blocks are to be
3490 * retained in the new sized file. 3480 * retained in the new sized file.
3491 * if type is PMAP, the data and index pages are NOT 3481 * if type is PMAP, the data and index pages are NOT
3492 * freed, and the data and index blocks are NOT freed 3482 * freed, and the data and index blocks are NOT freed
3493 * from working map. 3483 * from working map.
3494 * (this will allow continued access of data/index of 3484 * (this will allow continued access of data/index of
3495 * temporary file (zerolink count file truncated to zero-length)). 3485 * temporary file (zerolink count file truncated to zero-length)).
3496 */ 3486 */
3497 teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> 3487 teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
3498 JFS_SBI(ip->i_sb)->l2bsize; 3488 JFS_SBI(ip->i_sb)->l2bsize;
3499 3489
3500 /* clear stack */ 3490 /* clear stack */
3501 BT_CLR(&btstack); 3491 BT_CLR(&btstack);
3502 3492
3503 /* 3493 /*
3504 * start with root 3494 * start with root
3505 * 3495 *
3506 * root resides in the inode 3496 * root resides in the inode
3507 */ 3497 */
3508 bn = 0; 3498 bn = 0;
3509 3499
3510 /* 3500 /*
3511 * first access of each page: 3501 * first access of each page:
3512 */ 3502 */
3513 getPage: 3503 getPage:
3514 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 3504 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3515 if (rc) 3505 if (rc)
3516 return rc; 3506 return rc;
3517 3507
3518 /* process entries backward from last index */ 3508 /* process entries backward from last index */
3519 index = le16_to_cpu(p->header.nextindex) - 1; 3509 index = le16_to_cpu(p->header.nextindex) - 1;
3520 3510
3521 3511
3522 /* Since this is the rightmost page at this level, and we may have 3512 /* Since this is the rightmost page at this level, and we may have
3523 * already freed a page that was formerly to the right, let's make 3513 * already freed a page that was formerly to the right, let's make
3524 * sure that the next pointer is zero. 3514 * sure that the next pointer is zero.
3525 */ 3515 */
3526 if (p->header.next) { 3516 if (p->header.next) {
3527 if (log) 3517 if (log)
3528 /* 3518 /*
3529 * Make sure this change to the header is logged. 3519 * Make sure this change to the header is logged.
3530 * If we really truncate this leaf, the flag 3520 * If we really truncate this leaf, the flag
3531 * will be changed to tlckTRUNCATE 3521 * will be changed to tlckTRUNCATE
3532 */ 3522 */
3533 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); 3523 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
3534 BT_MARK_DIRTY(mp, ip); 3524 BT_MARK_DIRTY(mp, ip);
3535 p->header.next = 0; 3525 p->header.next = 0;
3536 } 3526 }
3537 3527
3538 if (p->header.flag & BT_INTERNAL) 3528 if (p->header.flag & BT_INTERNAL)
3539 goto getChild; 3529 goto getChild;
3540 3530
3541 /* 3531 /*
3542 * leaf page 3532 * leaf page
3543 */ 3533 */
3544 freed = 0; 3534 freed = 0;
3545 3535
3546 /* does region covered by leaf page precede Teof ? */ 3536 /* does region covered by leaf page precede Teof ? */
3547 xad = &p->xad[index]; 3537 xad = &p->xad[index];
3548 xoff = offsetXAD(xad); 3538 xoff = offsetXAD(xad);
3549 xlen = lengthXAD(xad); 3539 xlen = lengthXAD(xad);
3550 if (teof >= xoff + xlen) { 3540 if (teof >= xoff + xlen) {
3551 XT_PUTPAGE(mp); 3541 XT_PUTPAGE(mp);
3552 goto getParent; 3542 goto getParent;
3553 } 3543 }
3554 3544
3555 /* (re)acquire tlock of the leaf page */ 3545 /* (re)acquire tlock of the leaf page */
3556 if (log) { 3546 if (log) {
3557 if (++locked_leaves > MAX_TRUNCATE_LEAVES) { 3547 if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
3558 /* 3548 /*
3559 * We need to limit the size of the transaction 3549 * We need to limit the size of the transaction
3560 * to avoid exhausting pagecache & tlocks 3550 * to avoid exhausting pagecache & tlocks
3561 */ 3551 */
3562 XT_PUTPAGE(mp); 3552 XT_PUTPAGE(mp);
3563 newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; 3553 newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
3564 goto getParent; 3554 goto getParent;
3565 } 3555 }
3566 tlck = txLock(tid, ip, mp, tlckXTREE); 3556 tlck = txLock(tid, ip, mp, tlckXTREE);
3567 tlck->type = tlckXTREE | tlckTRUNCATE; 3557 tlck->type = tlckXTREE | tlckTRUNCATE;
3568 xtlck = (struct xtlock *) & tlck->lock; 3558 xtlck = (struct xtlock *) & tlck->lock;
3569 xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; 3559 xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
3570 } 3560 }
3571 BT_MARK_DIRTY(mp, ip); 3561 BT_MARK_DIRTY(mp, ip);
3572 3562
3573 /* 3563 /*
3574 * scan backward leaf page entries 3564 * scan backward leaf page entries
3575 */ 3565 */
3576 for (; index >= XTENTRYSTART; index--) { 3566 for (; index >= XTENTRYSTART; index--) {
3577 xad = &p->xad[index]; 3567 xad = &p->xad[index];
3578 xoff = offsetXAD(xad); 3568 xoff = offsetXAD(xad);
3579 xlen = lengthXAD(xad); 3569 xlen = lengthXAD(xad);
3580 xaddr = addressXAD(xad); 3570 xaddr = addressXAD(xad);
3581 3571
3582 /* 3572 /*
3583 * The "data" for a directory is indexed by the block 3573 * The "data" for a directory is indexed by the block
3584 * device's address space. This metadata must be invalidated 3574 * device's address space. This metadata must be invalidated
3585 * here 3575 * here
3586 */ 3576 */
3587 if (S_ISDIR(ip->i_mode) && (teof == 0)) 3577 if (S_ISDIR(ip->i_mode) && (teof == 0))
3588 invalidate_xad_metapages(ip, *xad); 3578 invalidate_xad_metapages(ip, *xad);
3589 /* 3579 /*
3590 * entry beyond eof: continue scan of current page 3580 * entry beyond eof: continue scan of current page
3591 * xad 3581 * xad
3592 * ---|---=======-------> 3582 * ---|---=======------->
3593 * eof 3583 * eof
3594 */ 3584 */
3595 if (teof < xoff) { 3585 if (teof < xoff) {
3596 nfreed += xlen; 3586 nfreed += xlen;
3597 continue; 3587 continue;
3598 } 3588 }
3599 3589
3600 /* 3590 /*
3601 * (xoff <= teof): last entry to be deleted from page; 3591 * (xoff <= teof): last entry to be deleted from page;
3602 * If other entries remain in page: keep and update the page. 3592 * If other entries remain in page: keep and update the page.
3603 */ 3593 */
3604 3594
3605 /* 3595 /*
3606 * eof == entry_start: delete the entry 3596 * eof == entry_start: delete the entry
3607 * xad 3597 * xad
3608 * -------|=======-------> 3598 * -------|=======------->
3609 * eof 3599 * eof
3610 * 3600 *
3611 */ 3601 */
3612 if (teof == xoff) { 3602 if (teof == xoff) {
3613 nfreed += xlen; 3603 nfreed += xlen;
3614 3604
3615 if (index == XTENTRYSTART) 3605 if (index == XTENTRYSTART)
3616 break; 3606 break;
3617 3607
3618 nextindex = index; 3608 nextindex = index;
3619 } 3609 }
3620 /* 3610 /*
3621 * eof within the entry: truncate the entry. 3611 * eof within the entry: truncate the entry.
3622 * xad 3612 * xad
3623 * -------===|===-------> 3613 * -------===|===------->
3624 * eof 3614 * eof
3625 */ 3615 */
3626 else if (teof < xoff + xlen) { 3616 else if (teof < xoff + xlen) {
3627 /* update truncated entry */ 3617 /* update truncated entry */
3628 len = teof - xoff; 3618 len = teof - xoff;
3629 freexlen = xlen - len; 3619 freexlen = xlen - len;
3630 XADlength(xad, len); 3620 XADlength(xad, len);
3631 3621
3632 /* save pxd of truncated extent in tlck */ 3622 /* save pxd of truncated extent in tlck */
3633 xaddr += len; 3623 xaddr += len;
3634 if (log) { /* COMMIT_PWMAP */ 3624 if (log) { /* COMMIT_PWMAP */
3635 xtlck->lwm.offset = (xtlck->lwm.offset) ? 3625 xtlck->lwm.offset = (xtlck->lwm.offset) ?
3636 min(index, (int)xtlck->lwm.offset) : index; 3626 min(index, (int)xtlck->lwm.offset) : index;
3637 xtlck->lwm.length = index + 1 - 3627 xtlck->lwm.length = index + 1 -
3638 xtlck->lwm.offset; 3628 xtlck->lwm.offset;
3639 xtlck->twm.offset = index; 3629 xtlck->twm.offset = index;
3640 pxdlock = (struct pxd_lock *) & xtlck->pxdlock; 3630 pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
3641 pxdlock->flag = mlckFREEPXD; 3631 pxdlock->flag = mlckFREEPXD;
3642 PXDaddress(&pxdlock->pxd, xaddr); 3632 PXDaddress(&pxdlock->pxd, xaddr);
3643 PXDlength(&pxdlock->pxd, freexlen); 3633 PXDlength(&pxdlock->pxd, freexlen);
3644 } 3634 }
3645 /* free truncated extent */ 3635 /* free truncated extent */
3646 else { /* COMMIT_WMAP */ 3636 else { /* COMMIT_WMAP */
3647 3637
3648 pxdlock = (struct pxd_lock *) & xadlock; 3638 pxdlock = (struct pxd_lock *) & xadlock;
3649 pxdlock->flag = mlckFREEPXD; 3639 pxdlock->flag = mlckFREEPXD;
3650 PXDaddress(&pxdlock->pxd, xaddr); 3640 PXDaddress(&pxdlock->pxd, xaddr);
3651 PXDlength(&pxdlock->pxd, freexlen); 3641 PXDlength(&pxdlock->pxd, freexlen);
3652 txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); 3642 txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
3653 3643
3654 /* reset map lock */ 3644 /* reset map lock */
3655 xadlock.flag = mlckFREEXADLIST; 3645 xadlock.flag = mlckFREEXADLIST;
3656 } 3646 }
3657 3647
3658 /* current entry is new last entry; */ 3648 /* current entry is new last entry; */
3659 nextindex = index + 1; 3649 nextindex = index + 1;
3660 3650
3661 nfreed += freexlen; 3651 nfreed += freexlen;
3662 } 3652 }
3663 /* 3653 /*
3664 * eof beyond the entry: 3654 * eof beyond the entry:
3665 * xad 3655 * xad
3666 * -------=======---|---> 3656 * -------=======---|--->
3667 * eof 3657 * eof
3668 */ 3658 */
3669 else { /* (xoff + xlen < teof) */ 3659 else { /* (xoff + xlen < teof) */
3670 3660
3671 nextindex = index + 1; 3661 nextindex = index + 1;
3672 } 3662 }
3673 3663
3674 if (nextindex < le16_to_cpu(p->header.nextindex)) { 3664 if (nextindex < le16_to_cpu(p->header.nextindex)) {
3675 if (!log) { /* COMMIT_WAMP */ 3665 if (!log) { /* COMMIT_WAMP */
3676 xadlock.xdlist = &p->xad[nextindex]; 3666 xadlock.xdlist = &p->xad[nextindex];
3677 xadlock.count = 3667 xadlock.count =
3678 le16_to_cpu(p->header.nextindex) - 3668 le16_to_cpu(p->header.nextindex) -
3679 nextindex; 3669 nextindex;
3680 txFreeMap(ip, (struct maplock *) & xadlock, 3670 txFreeMap(ip, (struct maplock *) & xadlock,
3681 NULL, COMMIT_WMAP); 3671 NULL, COMMIT_WMAP);
3682 } 3672 }
3683 p->header.nextindex = cpu_to_le16(nextindex); 3673 p->header.nextindex = cpu_to_le16(nextindex);
3684 } 3674 }
3685 3675
3686 XT_PUTPAGE(mp); 3676 XT_PUTPAGE(mp);
3687 3677
3688 /* assert(freed == 0); */ 3678 /* assert(freed == 0); */
3689 goto getParent; 3679 goto getParent;
3690 } /* end scan of leaf page entries */ 3680 } /* end scan of leaf page entries */
3691 3681
3692 freed = 1; 3682 freed = 1;
3693 3683
3694 /* 3684 /*
3695 * leaf page become empty: free the page if type != PMAP 3685 * leaf page become empty: free the page if type != PMAP
3696 */ 3686 */
3697 if (log) { /* COMMIT_PWMAP */ 3687 if (log) { /* COMMIT_PWMAP */
3698 /* txCommit() with tlckFREE: 3688 /* txCommit() with tlckFREE:
3699 * free data extents covered by leaf [XTENTRYSTART:hwm); 3689 * free data extents covered by leaf [XTENTRYSTART:hwm);
3700 * invalidate leaf if COMMIT_PWMAP; 3690 * invalidate leaf if COMMIT_PWMAP;
3701 * if (TRUNCATE), will write LOG_NOREDOPAGE; 3691 * if (TRUNCATE), will write LOG_NOREDOPAGE;
3702 */ 3692 */
3703 tlck->type = tlckXTREE | tlckFREE; 3693 tlck->type = tlckXTREE | tlckFREE;
3704 } else { /* COMMIT_WAMP */ 3694 } else { /* COMMIT_WAMP */
3705 3695
3706 /* free data extents covered by leaf */ 3696 /* free data extents covered by leaf */
3707 xadlock.xdlist = &p->xad[XTENTRYSTART]; 3697 xadlock.xdlist = &p->xad[XTENTRYSTART];
3708 xadlock.count = 3698 xadlock.count =
3709 le16_to_cpu(p->header.nextindex) - XTENTRYSTART; 3699 le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
3710 txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); 3700 txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP);
3711 } 3701 }
3712 3702
3713 if (p->header.flag & BT_ROOT) { 3703 if (p->header.flag & BT_ROOT) {
3714 p->header.flag &= ~BT_INTERNAL; 3704 p->header.flag &= ~BT_INTERNAL;
3715 p->header.flag |= BT_LEAF; 3705 p->header.flag |= BT_LEAF;
3716 p->header.nextindex = cpu_to_le16(XTENTRYSTART); 3706 p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3717 3707
3718 XT_PUTPAGE(mp); /* debug */ 3708 XT_PUTPAGE(mp); /* debug */
3719 goto out; 3709 goto out;
3720 } else { 3710 } else {
3721 if (log) { /* COMMIT_PWMAP */ 3711 if (log) { /* COMMIT_PWMAP */
3722 /* page will be invalidated at tx completion 3712 /* page will be invalidated at tx completion
3723 */ 3713 */
3724 XT_PUTPAGE(mp); 3714 XT_PUTPAGE(mp);
3725 } else { /* COMMIT_WMAP */ 3715 } else { /* COMMIT_WMAP */
3726 3716
3727 if (mp->lid) 3717 if (mp->lid)
3728 lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; 3718 lid_to_tlock(mp->lid)->flag |= tlckFREELOCK;
3729 3719
3730 /* invalidate empty leaf page */ 3720 /* invalidate empty leaf page */
3731 discard_metapage(mp); 3721 discard_metapage(mp);
3732 } 3722 }
3733 } 3723 }
3734 3724
3735 /* 3725 /*
3736 * the leaf page become empty: delete the parent entry 3726 * the leaf page become empty: delete the parent entry
3737 * for the leaf page if the parent page is to be kept 3727 * for the leaf page if the parent page is to be kept
3738 * in the new sized file. 3728 * in the new sized file.
3739 */ 3729 */
3740 3730
3741 /* 3731 /*
3742 * go back up to the parent page 3732 * go back up to the parent page
3743 */ 3733 */
3744 getParent: 3734 getParent:
3745 /* pop/restore parent entry for the current child page */ 3735 /* pop/restore parent entry for the current child page */
3746 if ((parent = BT_POP(&btstack)) == NULL) 3736 if ((parent = BT_POP(&btstack)) == NULL)
3747 /* current page must have been root */ 3737 /* current page must have been root */
3748 goto out; 3738 goto out;
3749 3739
3750 /* get back the parent page */ 3740 /* get back the parent page */
3751 bn = parent->bn; 3741 bn = parent->bn;
3752 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 3742 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3753 if (rc) 3743 if (rc)
3754 return rc; 3744 return rc;
3755 3745
3756 index = parent->index; 3746 index = parent->index;
3757 3747
3758 /* 3748 /*
3759 * child page was not empty: 3749 * child page was not empty:
3760 */ 3750 */
3761 if (freed == 0) { 3751 if (freed == 0) {
3762 /* has any entry deleted from parent ? */ 3752 /* has any entry deleted from parent ? */
3763 if (index < le16_to_cpu(p->header.nextindex) - 1) { 3753 if (index < le16_to_cpu(p->header.nextindex) - 1) {
3764 /* (re)acquire tlock on the parent page */ 3754 /* (re)acquire tlock on the parent page */
3765 if (log) { /* COMMIT_PWMAP */ 3755 if (log) { /* COMMIT_PWMAP */
3766 /* txCommit() with tlckTRUNCATE: 3756 /* txCommit() with tlckTRUNCATE:
3767 * free child extents covered by parent [); 3757 * free child extents covered by parent [);
3768 */ 3758 */
3769 tlck = txLock(tid, ip, mp, tlckXTREE); 3759 tlck = txLock(tid, ip, mp, tlckXTREE);
3770 xtlck = (struct xtlock *) & tlck->lock; 3760 xtlck = (struct xtlock *) & tlck->lock;
3771 if (!(tlck->type & tlckTRUNCATE)) { 3761 if (!(tlck->type & tlckTRUNCATE)) {
3772 xtlck->hwm.offset = 3762 xtlck->hwm.offset =
3773 le16_to_cpu(p->header. 3763 le16_to_cpu(p->header.
3774 nextindex) - 1; 3764 nextindex) - 1;
3775 tlck->type = 3765 tlck->type =
3776 tlckXTREE | tlckTRUNCATE; 3766 tlckXTREE | tlckTRUNCATE;
3777 } 3767 }
3778 } else { /* COMMIT_WMAP */ 3768 } else { /* COMMIT_WMAP */
3779 3769
3780 /* free child extents covered by parent */ 3770 /* free child extents covered by parent */
3781 xadlock.xdlist = &p->xad[index + 1]; 3771 xadlock.xdlist = &p->xad[index + 1];
3782 xadlock.count = 3772 xadlock.count =
3783 le16_to_cpu(p->header.nextindex) - 3773 le16_to_cpu(p->header.nextindex) -
3784 index - 1; 3774 index - 1;
3785 txFreeMap(ip, (struct maplock *) & xadlock, 3775 txFreeMap(ip, (struct maplock *) & xadlock,
3786 NULL, COMMIT_WMAP); 3776 NULL, COMMIT_WMAP);
3787 } 3777 }
3788 BT_MARK_DIRTY(mp, ip); 3778 BT_MARK_DIRTY(mp, ip);
3789 3779
3790 p->header.nextindex = cpu_to_le16(index + 1); 3780 p->header.nextindex = cpu_to_le16(index + 1);
3791 } 3781 }
3792 XT_PUTPAGE(mp); 3782 XT_PUTPAGE(mp);
3793 goto getParent; 3783 goto getParent;
3794 } 3784 }
3795 3785
3796 /* 3786 /*
3797 * child page was empty: 3787 * child page was empty:
3798 */ 3788 */
3799 nfreed += lengthXAD(&p->xad[index]); 3789 nfreed += lengthXAD(&p->xad[index]);
3800 3790
3801 /* 3791 /*
3802 * During working map update, child page's tlock must be handled 3792 * During working map update, child page's tlock must be handled
3803 * before parent's. This is because the parent's tlock will cause 3793 * before parent's. This is because the parent's tlock will cause
3804 * the child's disk space to be marked available in the wmap, so 3794 * the child's disk space to be marked available in the wmap, so
3805 * it's important that the child page be released by that time. 3795 * it's important that the child page be released by that time.
3806 * 3796 *
3807 * ToDo: tlocks should be on doubly-linked list, so we can 3797 * ToDo: tlocks should be on doubly-linked list, so we can
3808 * quickly remove it and add it to the end. 3798 * quickly remove it and add it to the end.
3809 */ 3799 */
3810 3800
3811 /* 3801 /*
3812 * Move parent page's tlock to the end of the tid's tlock list 3802 * Move parent page's tlock to the end of the tid's tlock list
3813 */ 3803 */
3814 if (log && mp->lid && (tblk->last != mp->lid) && 3804 if (log && mp->lid && (tblk->last != mp->lid) &&
3815 lid_to_tlock(mp->lid)->tid) { 3805 lid_to_tlock(mp->lid)->tid) {
3816 lid_t lid = mp->lid; 3806 lid_t lid = mp->lid;
3817 struct tlock *prev; 3807 struct tlock *prev;
3818 3808
3819 tlck = lid_to_tlock(lid); 3809 tlck = lid_to_tlock(lid);
3820 3810
3821 if (tblk->next == lid) 3811 if (tblk->next == lid)
3822 tblk->next = tlck->next; 3812 tblk->next = tlck->next;
3823 else { 3813 else {
3824 for (prev = lid_to_tlock(tblk->next); 3814 for (prev = lid_to_tlock(tblk->next);
3825 prev->next != lid; 3815 prev->next != lid;
3826 prev = lid_to_tlock(prev->next)) { 3816 prev = lid_to_tlock(prev->next)) {
3827 assert(prev->next); 3817 assert(prev->next);
3828 } 3818 }
3829 prev->next = tlck->next; 3819 prev->next = tlck->next;
3830 } 3820 }
3831 lid_to_tlock(tblk->last)->next = lid; 3821 lid_to_tlock(tblk->last)->next = lid;
3832 tlck->next = 0; 3822 tlck->next = 0;
3833 tblk->last = lid; 3823 tblk->last = lid;
3834 } 3824 }
3835 3825
3836 /* 3826 /*
3837 * parent page become empty: free the page 3827 * parent page become empty: free the page
3838 */ 3828 */
3839 if (index == XTENTRYSTART) { 3829 if (index == XTENTRYSTART) {
3840 if (log) { /* COMMIT_PWMAP */ 3830 if (log) { /* COMMIT_PWMAP */
3841 /* txCommit() with tlckFREE: 3831 /* txCommit() with tlckFREE:
3842 * free child extents covered by parent; 3832 * free child extents covered by parent;
3843 * invalidate parent if COMMIT_PWMAP; 3833 * invalidate parent if COMMIT_PWMAP;
3844 */ 3834 */
3845 tlck = txLock(tid, ip, mp, tlckXTREE); 3835 tlck = txLock(tid, ip, mp, tlckXTREE);
3846 xtlck = (struct xtlock *) & tlck->lock; 3836 xtlck = (struct xtlock *) & tlck->lock;
3847 xtlck->hwm.offset = 3837 xtlck->hwm.offset =
3848 le16_to_cpu(p->header.nextindex) - 1; 3838 le16_to_cpu(p->header.nextindex) - 1;
3849 tlck->type = tlckXTREE | tlckFREE; 3839 tlck->type = tlckXTREE | tlckFREE;
3850 } else { /* COMMIT_WMAP */ 3840 } else { /* COMMIT_WMAP */
3851 3841
3852 /* free child extents covered by parent */ 3842 /* free child extents covered by parent */
3853 xadlock.xdlist = &p->xad[XTENTRYSTART]; 3843 xadlock.xdlist = &p->xad[XTENTRYSTART];
3854 xadlock.count = 3844 xadlock.count =
3855 le16_to_cpu(p->header.nextindex) - 3845 le16_to_cpu(p->header.nextindex) -
3856 XTENTRYSTART; 3846 XTENTRYSTART;
3857 txFreeMap(ip, (struct maplock *) & xadlock, NULL, 3847 txFreeMap(ip, (struct maplock *) & xadlock, NULL,
3858 COMMIT_WMAP); 3848 COMMIT_WMAP);
3859 } 3849 }
3860 BT_MARK_DIRTY(mp, ip); 3850 BT_MARK_DIRTY(mp, ip);
3861 3851
3862 if (p->header.flag & BT_ROOT) { 3852 if (p->header.flag & BT_ROOT) {
3863 p->header.flag &= ~BT_INTERNAL; 3853 p->header.flag &= ~BT_INTERNAL;
3864 p->header.flag |= BT_LEAF; 3854 p->header.flag |= BT_LEAF;
3865 p->header.nextindex = cpu_to_le16(XTENTRYSTART); 3855 p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3866 if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { 3856 if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) {
3867 /* 3857 /*
3868 * Shrink root down to allow inline 3858 * Shrink root down to allow inline
3869 * EA (otherwise fsck complains) 3859 * EA (otherwise fsck complains)
3870 */ 3860 */
3871 p->header.maxentry = 3861 p->header.maxentry =
3872 cpu_to_le16(XTROOTINITSLOT); 3862 cpu_to_le16(XTROOTINITSLOT);
3873 JFS_IP(ip)->mode2 |= INLINEEA; 3863 JFS_IP(ip)->mode2 |= INLINEEA;
3874 } 3864 }
3875 3865
3876 XT_PUTPAGE(mp); /* debug */ 3866 XT_PUTPAGE(mp); /* debug */
3877 goto out; 3867 goto out;
3878 } else { 3868 } else {
3879 if (log) { /* COMMIT_PWMAP */ 3869 if (log) { /* COMMIT_PWMAP */
3880 /* page will be invalidated at tx completion 3870 /* page will be invalidated at tx completion
3881 */ 3871 */
3882 XT_PUTPAGE(mp); 3872 XT_PUTPAGE(mp);
3883 } else { /* COMMIT_WMAP */ 3873 } else { /* COMMIT_WMAP */
3884 3874
3885 if (mp->lid) 3875 if (mp->lid)
3886 lid_to_tlock(mp->lid)->flag |= 3876 lid_to_tlock(mp->lid)->flag |=
3887 tlckFREELOCK; 3877 tlckFREELOCK;
3888 3878
3889 /* invalidate parent page */ 3879 /* invalidate parent page */
3890 discard_metapage(mp); 3880 discard_metapage(mp);
3891 } 3881 }
3892 3882
3893 /* parent has become empty and freed: 3883 /* parent has become empty and freed:
3894 * go back up to its parent page 3884 * go back up to its parent page
3895 */ 3885 */
3896 /* freed = 1; */ 3886 /* freed = 1; */
3897 goto getParent; 3887 goto getParent;
3898 } 3888 }
3899 } 3889 }
3900 /* 3890 /*
3901 * parent page still has entries for front region; 3891 * parent page still has entries for front region;
3902 */ 3892 */
3903 else { 3893 else {
3904 /* try truncate region covered by preceding entry 3894 /* try truncate region covered by preceding entry
3905 * (process backward) 3895 * (process backward)
3906 */ 3896 */
3907 index--; 3897 index--;
3908 3898
3909 /* go back down to the child page corresponding 3899 /* go back down to the child page corresponding
3910 * to the entry 3900 * to the entry
3911 */ 3901 */
3912 goto getChild; 3902 goto getChild;
3913 } 3903 }
3914 3904
3915 /* 3905 /*
3916 * internal page: go down to child page of current entry 3906 * internal page: go down to child page of current entry
3917 */ 3907 */
3918 getChild: 3908 getChild:
3919 /* save current parent entry for the child page */ 3909 /* save current parent entry for the child page */
3920 if (BT_STACK_FULL(&btstack)) { 3910 if (BT_STACK_FULL(&btstack)) {
3921 jfs_error(ip->i_sb, "stack overrun in xtTruncate!"); 3911 jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
3922 XT_PUTPAGE(mp); 3912 XT_PUTPAGE(mp);
3923 return -EIO; 3913 return -EIO;
3924 } 3914 }
3925 BT_PUSH(&btstack, bn, index); 3915 BT_PUSH(&btstack, bn, index);
3926 3916
3927 /* get child page */ 3917 /* get child page */
3928 xad = &p->xad[index]; 3918 xad = &p->xad[index];
3929 bn = addressXAD(xad); 3919 bn = addressXAD(xad);
3930 3920
3931 /* 3921 /*
3932 * first access of each internal entry: 3922 * first access of each internal entry:
3933 */ 3923 */
3934 /* release parent page */ 3924 /* release parent page */
3935 XT_PUTPAGE(mp); 3925 XT_PUTPAGE(mp);
3936 3926
3937 /* process the child page */ 3927 /* process the child page */
3938 goto getPage; 3928 goto getPage;
3939 3929
3940 out: 3930 out:
3941 /* 3931 /*
3942 * update file resource stat 3932 * update file resource stat
3943 */ 3933 */
3944 /* set size 3934 /* set size
3945 */ 3935 */
3946 if (S_ISDIR(ip->i_mode) && !newsize) 3936 if (S_ISDIR(ip->i_mode) && !newsize)
3947 ip->i_size = 1; /* fsck hates zero-length directories */ 3937 ip->i_size = 1; /* fsck hates zero-length directories */
3948 else 3938 else
3949 ip->i_size = newsize; 3939 ip->i_size = newsize;
3950 3940
3951 /* update quota allocation to reflect freed blocks */ 3941 /* update quota allocation to reflect freed blocks */
3952 DQUOT_FREE_BLOCK(ip, nfreed); 3942 DQUOT_FREE_BLOCK(ip, nfreed);
3953 3943
3954 /* 3944 /*
3955 * free tlock of invalidated pages 3945 * free tlock of invalidated pages
3956 */ 3946 */
3957 if (flag == COMMIT_WMAP) 3947 if (flag == COMMIT_WMAP)
3958 txFreelock(ip); 3948 txFreelock(ip);
3959 3949
3960 return newsize; 3950 return newsize;
3961 } 3951 }
3962 3952
3963 3953
3964 /* 3954 /*
3965 * xtTruncate_pmap() 3955 * xtTruncate_pmap()
3966 * 3956 *
3967 * function: 3957 * function:
3968 * Perform truncate to zero length for deleted file, leaving the 3958 * Perform truncate to zero length for deleted file, leaving the
3969 * the xtree and working map untouched. This allows the file to 3959 * the xtree and working map untouched. This allows the file to
3970 * be accessed via open file handles, while the delete of the file 3960 * be accessed via open file handles, while the delete of the file
3971 * is committed to disk. 3961 * is committed to disk.
3972 * 3962 *
3973 * parameter: 3963 * parameter:
3974 * tid_t tid, 3964 * tid_t tid,
3975 * struct inode *ip, 3965 * struct inode *ip,
3976 * s64 committed_size) 3966 * s64 committed_size)
3977 * 3967 *
3978 * return: new committed size 3968 * return: new committed size
3979 * 3969 *
3980 * note: 3970 * note:
3981 * 3971 *
3982 * To avoid deadlock by holding too many transaction locks, the 3972 * To avoid deadlock by holding too many transaction locks, the
3983 * truncation may be broken up into multiple transactions. 3973 * truncation may be broken up into multiple transactions.
3984 * The committed_size keeps track of part of the file has been 3974 * The committed_size keeps track of part of the file has been
3985 * freed from the pmaps. 3975 * freed from the pmaps.
3986 */ 3976 */
3987 s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) 3977 s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
3988 { 3978 {
3989 s64 bn; 3979 s64 bn;
3990 struct btstack btstack; 3980 struct btstack btstack;
3991 int cmp; 3981 int cmp;
3992 int index; 3982 int index;
3993 int locked_leaves = 0; 3983 int locked_leaves = 0;
3994 struct metapage *mp; 3984 struct metapage *mp;
3995 xtpage_t *p; 3985 xtpage_t *p;
3996 struct btframe *parent; 3986 struct btframe *parent;
3997 int rc; 3987 int rc;
3998 struct tblock *tblk; 3988 struct tblock *tblk;
3999 struct tlock *tlck = NULL; 3989 struct tlock *tlck = NULL;
4000 xad_t *xad; 3990 xad_t *xad;
4001 int xlen; 3991 int xlen;
4002 s64 xoff; 3992 s64 xoff;
4003 struct xtlock *xtlck = NULL; 3993 struct xtlock *xtlck = NULL;
4004 3994
4005 /* save object truncation type */ 3995 /* save object truncation type */
4006 tblk = tid_to_tblock(tid); 3996 tblk = tid_to_tblock(tid);
4007 tblk->xflag |= COMMIT_PMAP; 3997 tblk->xflag |= COMMIT_PMAP;
4008 3998
4009 /* clear stack */ 3999 /* clear stack */
4010 BT_CLR(&btstack); 4000 BT_CLR(&btstack);
4011 4001
4012 if (committed_size) { 4002 if (committed_size) {
4013 xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; 4003 xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1;
4014 rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); 4004 rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
4015 if (rc) 4005 if (rc)
4016 return rc; 4006 return rc;
4017 4007
4018 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); 4008 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
4019 4009
4020 if (cmp != 0) { 4010 if (cmp != 0) {
4021 XT_PUTPAGE(mp); 4011 XT_PUTPAGE(mp);
4022 jfs_error(ip->i_sb, 4012 jfs_error(ip->i_sb,
4023 "xtTruncate_pmap: did not find extent"); 4013 "xtTruncate_pmap: did not find extent");
4024 return -EIO; 4014 return -EIO;
4025 } 4015 }
4026 } else { 4016 } else {
4027 /* 4017 /*
4028 * start with root 4018 * start with root
4029 * 4019 *
4030 * root resides in the inode 4020 * root resides in the inode
4031 */ 4021 */
4032 bn = 0; 4022 bn = 0;
4033 4023
4034 /* 4024 /*
4035 * first access of each page: 4025 * first access of each page:
4036 */ 4026 */
4037 getPage: 4027 getPage:
4038 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 4028 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4039 if (rc) 4029 if (rc)
4040 return rc; 4030 return rc;
4041 4031
4042 /* process entries backward from last index */ 4032 /* process entries backward from last index */
4043 index = le16_to_cpu(p->header.nextindex) - 1; 4033 index = le16_to_cpu(p->header.nextindex) - 1;
4044 4034
4045 if (p->header.flag & BT_INTERNAL) 4035 if (p->header.flag & BT_INTERNAL)
4046 goto getChild; 4036 goto getChild;
4047 } 4037 }
4048 4038
4049 /* 4039 /*
4050 * leaf page 4040 * leaf page
4051 */ 4041 */
4052 4042
4053 if (++locked_leaves > MAX_TRUNCATE_LEAVES) { 4043 if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
4054 /* 4044 /*
4055 * We need to limit the size of the transaction 4045 * We need to limit the size of the transaction
4056 * to avoid exhausting pagecache & tlocks 4046 * to avoid exhausting pagecache & tlocks
4057 */ 4047 */
4058 xad = &p->xad[index]; 4048 xad = &p->xad[index];
4059 xoff = offsetXAD(xad); 4049 xoff = offsetXAD(xad);
4060 xlen = lengthXAD(xad); 4050 xlen = lengthXAD(xad);
4061 XT_PUTPAGE(mp); 4051 XT_PUTPAGE(mp);
4062 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; 4052 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
4063 } 4053 }
4064 tlck = txLock(tid, ip, mp, tlckXTREE); 4054 tlck = txLock(tid, ip, mp, tlckXTREE);
4065 tlck->type = tlckXTREE | tlckFREE; 4055 tlck->type = tlckXTREE | tlckFREE;
4066 xtlck = (struct xtlock *) & tlck->lock; 4056 xtlck = (struct xtlock *) & tlck->lock;
4067 xtlck->hwm.offset = index; 4057 xtlck->hwm.offset = index;
4068 4058
4069 4059
4070 XT_PUTPAGE(mp); 4060 XT_PUTPAGE(mp);
4071 4061
4072 /* 4062 /*
4073 * go back up to the parent page 4063 * go back up to the parent page
4074 */ 4064 */
4075 getParent: 4065 getParent:
4076 /* pop/restore parent entry for the current child page */ 4066 /* pop/restore parent entry for the current child page */
4077 if ((parent = BT_POP(&btstack)) == NULL) 4067 if ((parent = BT_POP(&btstack)) == NULL)
4078 /* current page must have been root */ 4068 /* current page must have been root */
4079 goto out; 4069 goto out;
4080 4070
4081 /* get back the parent page */ 4071 /* get back the parent page */
4082 bn = parent->bn; 4072 bn = parent->bn;
4083 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 4073 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4084 if (rc) 4074 if (rc)
4085 return rc; 4075 return rc;
4086 4076
4087 index = parent->index; 4077 index = parent->index;
4088 4078
4089 /* 4079 /*
4090 * parent page become empty: free the page 4080 * parent page become empty: free the page
4091 */ 4081 */
4092 if (index == XTENTRYSTART) { 4082 if (index == XTENTRYSTART) {
4093 /* txCommit() with tlckFREE: 4083 /* txCommit() with tlckFREE:
4094 * free child extents covered by parent; 4084 * free child extents covered by parent;
4095 * invalidate parent if COMMIT_PWMAP; 4085 * invalidate parent if COMMIT_PWMAP;
4096 */ 4086 */
4097 tlck = txLock(tid, ip, mp, tlckXTREE); 4087 tlck = txLock(tid, ip, mp, tlckXTREE);
4098 xtlck = (struct xtlock *) & tlck->lock; 4088 xtlck = (struct xtlock *) & tlck->lock;
4099 xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; 4089 xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
4100 tlck->type = tlckXTREE | tlckFREE; 4090 tlck->type = tlckXTREE | tlckFREE;
4101 4091
4102 XT_PUTPAGE(mp); 4092 XT_PUTPAGE(mp);
4103 4093
4104 if (p->header.flag & BT_ROOT) { 4094 if (p->header.flag & BT_ROOT) {
4105 4095
4106 goto out; 4096 goto out;
4107 } else { 4097 } else {
4108 goto getParent; 4098 goto getParent;
4109 } 4099 }
4110 } 4100 }
4111 /* 4101 /*
4112 * parent page still has entries for front region; 4102 * parent page still has entries for front region;
4113 */ 4103 */
4114 else 4104 else
4115 index--; 4105 index--;
4116 /* 4106 /*
4117 * internal page: go down to child page of current entry 4107 * internal page: go down to child page of current entry
4118 */ 4108 */
4119 getChild: 4109 getChild:
4120 /* save current parent entry for the child page */ 4110 /* save current parent entry for the child page */
4121 if (BT_STACK_FULL(&btstack)) { 4111 if (BT_STACK_FULL(&btstack)) {
4122 jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!"); 4112 jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
4123 XT_PUTPAGE(mp); 4113 XT_PUTPAGE(mp);
4124 return -EIO; 4114 return -EIO;
4125 } 4115 }
4126 BT_PUSH(&btstack, bn, index); 4116 BT_PUSH(&btstack, bn, index);
4127 4117
4128 /* get child page */ 4118 /* get child page */
4129 xad = &p->xad[index]; 4119 xad = &p->xad[index];
4130 bn = addressXAD(xad); 4120 bn = addressXAD(xad);
4131 4121
4132 /* 4122 /*
4133 * first access of each internal entry: 4123 * first access of each internal entry:
4134 */ 4124 */
4135 /* release parent page */ 4125 /* release parent page */
4136 XT_PUTPAGE(mp); 4126 XT_PUTPAGE(mp);
4137 4127
4138 /* process the child page */ 4128 /* process the child page */
4139 goto getPage; 4129 goto getPage;
4140 4130
4141 out: 4131 out:
4142 4132
4143 return 0; 4133 return 0;
4144 } 4134 }
4145 4135
4146 #ifdef CONFIG_JFS_STATISTICS 4136 #ifdef CONFIG_JFS_STATISTICS
4147 int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, 4137 int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
4148 int *eof, void *data) 4138 int *eof, void *data)
4149 { 4139 {
4150 int len = 0; 4140 int len = 0;
4151 off_t begin; 4141 off_t begin;
4152 4142
4153 len += sprintf(buffer, 4143 len += sprintf(buffer,
4154 "JFS Xtree statistics\n" 4144 "JFS Xtree statistics\n"
4155 "====================\n" 4145 "====================\n"
4156 "searches = %d\n" 4146 "searches = %d\n"
4157 "fast searches = %d\n" 4147 "fast searches = %d\n"
4158 "splits = %d\n", 4148 "splits = %d\n",
4159 xtStat.search, 4149 xtStat.search,
4160 xtStat.fastSearch, 4150 xtStat.fastSearch,
4161 xtStat.split); 4151 xtStat.split);
4162 4152
4163 begin = offset; 4153 begin = offset;
4164 *start = buffer + begin; 4154 *start = buffer + begin;
4165 len -= begin; 4155 len -= begin;
4166 4156
4167 if (len > length) 4157 if (len > length)
4168 len = length; 4158 len = length;
4169 else 4159 else
4170 *eof = 1; 4160 *eof = 1;
4171 4161
4172 if (len < 0) 4162 if (len < 0)
4173 len = 0; 4163 len = 0;
4174 4164
4175 return len; 4165 return len;
4176 } 4166 }
4177 #endif 4167 #endif
4178 4168