Eric Lee / smarc-ti-linux-kernel | Embedian Git Server

Commit 4f902c37727bbedbc0508a1477874c58ddcc9af8

Authored by Mark Fasheh 2007-03-10 08:26:50 +0800

ocfs2: Fix extent lookup to return true size of holes

Initially, extent lookups were wired to report every hole as having a
size of '1'. Cook up a small amount of code to find the next extent and
calculate the number of clusters between the virtual offset and the next
allocated extent.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

Showing 5 changed files with 109 additions and 12 deletions Inline Diff

fs/ocfs2/aops.c
fs/ocfs2/extent_map.c
fs/ocfs2/extent_map.h
fs/ocfs2/journal.c
fs/ocfs2/namei.c

fs/ocfs2/aops.c

Diff comments View file @ 4f902c3

 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this program; if not, write to the
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <asm/byteorder.h>
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
 #include "ocfs2.h"
 #include "alloc.h"
 #include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
 #include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
 #include "buffer_head_io.h"
 /*
  * get_block callback for symlinks that are not "fast" symlinks.
  *
  * Maps logical block 'iblock' of the symlink to the physical block
  * l_recs[0].e_blkno + iblock taken from the on-disk inode.  If bh_result
  * is not uptodate and the inode is new, the symlink data is copied from
  * the buffer cache into bh_result's page first, because symlink data is
  * not created through the page cache.
  *
  * 'create' is only logged.  Returns 0 on success and -EIO on any
  * failure (the status of ocfs2_read_block() is logged, not returned).
  */
 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	int err = -EIO;
 	int status;
 	struct ocfs2_dinode *fe = NULL;
 	struct buffer_head *bh = NULL;
 	struct buffer_head *buffer_cache_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	void *kaddr;
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 		   (unsigned long long)iblock, bh_result, create);
 	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
 	/* Symlink data can never extend past PATH_MAX. */
 	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
 		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
 		     (unsigned long long)iblock);
 		goto bail;
 	}
 	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
 				  OCFS2_I(inode)->ip_blkno,
 				  &bh, OCFS2_BH_CACHED, inode);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
 		     (unsigned long long)fe->i_blkno, 7, fe->i_signature);
 		goto bail;
 	}
 	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 						    le32_to_cpu(fe->i_clusters))) {
 		mlog(ML_ERROR, "block offset is outside the allocated size: "
 		     "%llu\n", (unsigned long long)iblock);
 		goto bail;
 	}
 	/* We don't use the page cache to create symlink data, so if
 	 * need be, copy it over from the buffer cache. */
 	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
 		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
 			    iblock;
 		buffer_cache_bh = sb_getblk(osb->sb, blkno);
 		if (!buffer_cache_bh) {
 			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
 			goto bail;
 		}
 		/* we haven't locked out transactions, so a commit
 		 * could've happened. Since we've got a reference on
 		 * the bh, even if it commits while we're doing the
 		 * copy, the data is still good. */
 		if (buffer_jbd(buffer_cache_bh)
 		    && ocfs2_inode_is_new(inode)) {
 			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
 			if (!kaddr) {
 				mlog(ML_ERROR, "couldn't kmap!\n");
 				/* Drop the reference taken by sb_getblk()
 				 * above; bail only releases 'bh'. */
 				brelse(buffer_cache_bh);
 				goto bail;
 			}
 			memcpy(kaddr + (bh_result->b_size * iblock),
 			       buffer_cache_bh->b_data,
 			       bh_result->b_size);
 			kunmap_atomic(kaddr, KM_USER0);
 			set_buffer_uptodate(bh_result);
 		}
 		brelse(buffer_cache_bh);
 	}
 	map_bh(bh_result, inode->i_sb,
 	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
 	err = 0;
 bail:
 	if (bh)
 		brelse(bh);
 	mlog_exit(err);
 	return err;
 }
 /*
  * get_block callback used by the buffered read/write paths in this file.
  *
  * Symlinks are handed to ocfs2_symlink_get_block().  For everything else
  * the physical block for 'iblock' is looked up through
  * ocfs2_extent_map_get_blocks() and, when it is non-zero and the extent
  * is not flagged OCFS2_EXT_UNWRITTEN, mapped into bh_result.  Nothing is
  * allocated here.
  *
  * On file systems without sparse allocation a zero p_blkno is reported
  * as an error, and BH_New is set on bh_result when 'create' is set and
  * iblock lies at or beyond the block count derived from i_size.
  *
  * Returns 0 on success; every negative error is collapsed to -EIO.
  */
 static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh_result, int create)
 {
 	int err = 0;
 	/* Initialized so the error message below never prints stack
 	 * garbage if the lookup fails before filling them in. */
 	unsigned int ext_flags = 0;
 	u64 p_blkno = 0, past_eof;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 		   (unsigned long long)iblock, bh_result, create);
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
 		     inode, inode->i_ino);
 	if (S_ISLNK(inode->i_mode)) {
 		/* this always does I/O for some reason. */
 		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
 		goto bail;
 	}
 	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
 					  &ext_flags);
 	if (err) {
 		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
 		     (unsigned long long)p_blkno);
 		goto bail;
 	}
 	/*
 	 * ocfs2 never allocates in this function - the only time we
 	 * need to use BH_New is when we're extending i_size on a file
 	 * system which doesn't support holes, in which case BH_New
 	 * allows block_prepare_write() to zero.
 	 */
 	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
 			"ino %lu, iblock %llu\n", inode->i_ino,
 			(unsigned long long)iblock);
 	/* Treat the unwritten extent as a hole for zeroing purposes. */
 	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 		map_bh(bh_result, inode->i_sb, p_blkno);
 	if (!ocfs2_sparse_alloc(osb)) {
 		/* Without sparse support a hole is an error. */
 		if (p_blkno == 0) {
 			err = -EIO;
 			mlog(ML_ERROR,
 			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
 			     (unsigned long long)iblock,
 			     (unsigned long long)p_blkno,
 			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 			mlog(ML_ERROR, "Size %llu, clusters %u\n",
 			     (unsigned long long)i_size_read(inode),
 			     OCFS2_I(inode)->ip_clusters);
 			dump_stack();
 		}
 		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
 		     (unsigned long long)past_eof);
 		if (create && (iblock >= past_eof))
 			set_buffer_new(bh_result);
 	}
 bail:
 	if (err < 0)
 		err = -EIO;
 	mlog_exit(err);
 	return err;
 }
 /*
  * ->readpage: fill 'page' with file data.
  *
  * Lock order is: meta lock (taken together with the page), ip_alloc_sem
  * for reading, then the data lock (also taken with the page).  The page
  * is read through block_read_full_page() with ocfs2_get_block.  A page
  * that starts at or beyond i_size is zero-filled and marked uptodate
  * without doing any I/O.
  *
  * Returns 0, or whatever the lock helpers / block_read_full_page()
  * returned.
  */
 static int ocfs2_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	int ret, unlock = 1;
 	/* NOTE(review): 'page' was already dereferenced above, so the NULL
 	 * test in this log statement cannot protect anything. */
 	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
 	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
 	if (ret != 0) {
 		/* presumably the helper already unlocked the page when it
 		 * returns AOP_TRUNCATED_PAGE - confirm against the helper */
 		if (ret == AOP_TRUNCATED_PAGE)
 			unlock = 0;
 		mlog_errno(ret);
 		goto out;
 	}
 	down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	/*
 	 * i_size might have just been updated as we grabbed the meta lock.  We
 	 * might now be discovering a truncate that hit on another node.
 	 * block_read_full_page->get_block freaks out if it is asked to read
 	 * beyond the end of a file, so we check here.  Callers
 	 * (generic_file_read, fault->nopage) are clever enough to check i_size
 	 * and notice that the page they just read isn't needed.
 	 *
 	 * XXX sys_readahead() seems to get that wrong?
 	 */
 	if (start >= i_size_read(inode)) {
 		char *addr = kmap(page);
 		memset(addr, 0, PAGE_SIZE);
 		flush_dcache_page(page);
 		kunmap(page);
 		SetPageUptodate(page);
 		ret = 0;
 		goto out_alloc;
 	}
 	ret = ocfs2_data_lock_with_page(inode, 0, page);
 	if (ret != 0) {
 		if (ret == AOP_TRUNCATED_PAGE)
 			unlock = 0;
 		mlog_errno(ret);
 		goto out_alloc;
 	}
 	ret = block_read_full_page(page, ocfs2_get_block);
 	/* From here on this function no longer unlocks the page itself;
 	 * presumably block_read_full_page() owns that - confirm. */
 	unlock = 0;
 	ocfs2_data_unlock(inode, 0);
 out_alloc:
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 	ocfs2_meta_unlock(inode, 0);
 out:
 	if (unlock)
 		unlock_page(page);
 	mlog_exit(ret);
 	return ret;
 }
 /*
  * ->writepage: write one page back via block_write_full_page() using
  * ocfs2_get_block for the block mapping.
  *
  * Note: Because we don't support holes, our allocation has already
  * happened (allocation writes zeros to the file data), so we don't
  * have to worry about ordered writes here.
  *
  * ->writepage is called during the process of invalidating the page
  * cache during blocked lock processing.  It can't block on any cluster
  * locks during block mapping.  It relies on the fact that the block
  * mapping can't have disappeared under the dirty pages that it is
  * being asked to write back.
  */
 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int status;
 	mlog_entry("(0x%p)\n", page);
 	status = block_write_full_page(page, ocfs2_get_block, wbc);
 	mlog_exit(status);
 	return status;
 }
 /*
  * Run block_prepare_write() on [from, to) of 'page' while holding
  * ip_alloc_sem for reading.  No cluster locks are taken here.
  *
  * This is called from ocfs2_write_zero_page(), which has handled its
  * own cluster locking and has ensured allocation exists for the
  * blocks to be written.
  *
  * Returns the result of block_prepare_write().
  */
 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 			       unsigned from, unsigned to)
 {
 	struct rw_semaphore *alloc_sem = &OCFS2_I(inode)->ip_alloc_sem;
 	int status;
 	down_read(alloc_sem);
 	status = block_prepare_write(page, from, to, ocfs2_get_block);
 	up_read(alloc_sem);
 	return status;
 }
 /* Taken from ext3. We don't necessarily need the full blown
  * functionality yet, but IMHO it's better to cut and paste the whole
  * thing so we can avoid introducing our own bugs (and easily pick up
  * their fixes when they happen) --Mark
  *
  * Walks the circular b_this_page list starting at 'head' and calls
  * fn(handle, bh) on every buffer that overlaps the byte range
  * [from, to) of the page.  Buffers entirely outside the range are
  * skipped; if 'partial' is non-NULL, *partial is set to 1 when a
  * skipped buffer is not uptodate.
  *
  * The walk stops after the first non-zero return from fn, and that
  * value is returned; otherwise 0 is returned. */
 int walk_page_buffers(	handle_t *handle,
 			struct buffer_head *head,
 			unsigned from,
 			unsigned to,
 			int *partial,
 			int (*fn)(	handle_t *handle,
 					struct buffer_head *bh))
 {
 	struct buffer_head *bh;
 	unsigned block_start, block_end;
 	unsigned blocksize = head->b_size;
 	int err, ret = 0;
 	struct buffer_head *next;
 	/* "bh != head || !block_start" lets the first pass through (bh is
 	 * head, offset 0) and ends the loop once the list wraps around. */
 	for (	bh = head, block_start = 0;
 		ret == 0 && (bh != head || !block_start);
 	    	block_start = block_end, bh = next)
 	{
 		/* Fetched up front, before fn() gets to touch bh. */
 		next = bh->b_this_page;
 		block_end = block_start + blocksize;
 		if (block_end <= from || block_start >= to) {
 			if (partial && !buffer_uptodate(bh))
 				*partial = 1;
 			continue;
 		}
 		err = (*fn)(handle, bh);
 		if (!ret)
 			ret = err;
 	}
 	return ret;
 }
 /*
  * Start a transaction with OCFS2_INODE_UPDATE_CREDITS and, when
  * ocfs2_should_order_data() says so, run ocfs2_journal_dirty_data over
  * the buffers of 'page' that overlap [from, to).
  *
  * Returns the running handle on success.  On failure any handle that
  * was started is committed and an ERR_PTR() carrying the error is
  * returned - never NULL.
  */
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 struct page *page,
 							 unsigned from,
 							 unsigned to)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	handle_t *handle = NULL;
 	int ret = 0;
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	/*
 	 * Accept both failure conventions.  NOTE(review):
 	 * ocfs2_start_trans() is not visible from here; if it reports
 	 * failure with ERR_PTR() rather than NULL, a plain NULL test
 	 * would let the error pointer through as a live handle.
 	 */
 	if (!handle || IS_ERR(handle)) {
 		ret = handle ? PTR_ERR(handle) : -ENOMEM;
 		/* nothing was started, so nothing to commit below */
 		handle = NULL;
 		mlog_errno(ret);
 		goto out;
 	}
 	if (ocfs2_should_order_data(inode)) {
 		ret = walk_page_buffers(handle,
 					page_buffers(page),
 					from, to, NULL,
 					ocfs2_journal_dirty_data);
 		if (ret < 0)
 			mlog_errno(ret);
 	}
 out:
 	if (ret) {
 		if (handle)
 			ocfs2_commit_trans(osb, handle);
 		handle = ERR_PTR(ret);
 	}
 	return handle;
 }
 /*
  * ->bmap: translate logical block 'block' of the mapping's inode into
  * a physical block number via ocfs2_extent_map_get_blocks().
  *
  * Unless the inode is a journal system file, the lookup runs under the
  * meta lock and ip_alloc_sem (read).  Returns the physical block, or 0
  * if locking or the lookup failed.
  */
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
 	struct inode *inode = mapping->host;
 	sector_t result = 0;
 	u64 p_blkno = 0;
 	int err;
 	mlog_entry("(block = %llu)\n", (unsigned long long)block);
 	/* We don't need to lock journal system files, since they aren't
 	 * accessed concurrently from multiple nodes.
 	 */
 	if (!INODE_JOURNAL(inode)) {
 		err = ocfs2_meta_lock(inode, NULL, 0);
 		if (err) {
 			/* -ENOENT is not worth a log entry */
 			if (err != -ENOENT)
 				mlog_errno(err);
 			goto done;
 		}
 		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
 		ocfs2_meta_unlock(inode, 0);
 	}
 	if (err) {
 		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
 		     (unsigned long long)block);
 		mlog_errno(err);
 		goto done;
 	}
 	/* Only a successful lookup yields a non-zero answer. */
 	result = p_blkno;
 done:
 	mlog_exit((int)result);
 	return result;
 }
 /*
  * TODO: Make this into a generic get_blocks function.
  *
  * From do_direct_io in direct-io.c:
  *  "So what we do is to permit the ->get_blocks function to populate
  *   bh.b_size with the size of IO which is permitted at this offset and
  *   this i_blkbits."
  *
  * This function is called directly from get_more_blocks in direct-io.c.
  *
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  * 					fs_count, map_bh, dio->rw == WRITE);
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 				     struct buffer_head *bh_result, int create)
 {
 	int ret;
 	/* NOTE(review): the '-' / '+' prefixed lines below are diff markers
 	 * from the commit view: contig_blocks moves from 'int' to 'u64'. */
-	u64 p_blkno, inode_blocks;
+	u64 p_blkno, inode_blocks, contig_blocks;
-	int contig_blocks;
 	unsigned int ext_flags;
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	/* On entry b_size holds the size of the request; convert it to a
 	 * block count. */
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 	/* This function won't even be called if the request isn't all
 	 * nicely aligned and of the right size, so there's no need
 	 * for us to check any of that. */
 	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 	/*
 	 * Any write past EOF is not allowed because we'd be extending.
 	 */
 	if (create && (iblock + max_blocks) > inode_blocks) {
 		ret = -EIO;
 		goto bail;
 	}
 	/* This figures out the size of the next contiguous block, and
 	 * our logical offset */
 	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
 					  &contig_blocks, &ext_flags);
 	if (ret) {
 		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
 		     (unsigned long long)iblock);
 		ret = -EIO;
 		goto bail;
 	}
 	/* A zero p_blkno on a file system without sparse allocation is
 	 * reported through ocfs2_error() and fails with -EROFS. */
 	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %llu has a hole at block %llu\n",
 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
 			    (unsigned long long)iblock);
 		ret = -EROFS;
 		goto bail;
 	}
 	/*
 	 * get_more_blocks() expects us to describe a hole by clearing
 	 * the mapped bit on bh_result().
 	 *
 	 * Consider an unwritten extent as a hole.
 	 */
 	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 		map_bh(bh_result, inode->i_sb, p_blkno);
 	else {
 		/*
 		 * ocfs2_prepare_inode_for_write() should have caught
 		 * the case where we'd be filling a hole and triggered
 		 * a buffered write instead.
 		 */
 		if (create) {
 			ret = -EIO;
 			mlog_errno(ret);
 			goto bail;
 		}
 		clear_buffer_mapped(bh_result);
 	}
 	/* make sure we don't map more than max_blocks blocks here as
 	   that's all the kernel will handle at this point. */
 	if (max_blocks < contig_blocks)
 		contig_blocks = max_blocks;
 	/* Report the (clamped) run length back to the caller in bytes. */
 	bh_result->b_size = contig_blocks << blocksize_bits;
 bail:
 	return ret;
 }
 /*
  * Completion callback invoked by the dio core when a direct I/O is
  * finished; the aio/dio case is the one we particularly care about.
  * Just as the core uses i_alloc_sem, we use the rw_lock DLM lock to
  * protect I/O on one node from truncation on another.
  *
  * Clears the iocb's rw-locked marker, releases i_alloc_sem (read) and
  * drops the rw lock.  'offset', 'bytes' and 'private' are unused.
  */
 static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     loff_t offset,
 			     ssize_t bytes,
 			     void *private)
 {
 	struct dentry *dentry = iocb->ki_filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	/* whoever submitted this io must not have unlocked it before us */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
 	ocfs2_iocb_clear_rw_locked(iocb);
 	up_read(&inode->i_alloc_sem);
 	ocfs2_rw_unlock(inode, 0);
 }
 /*
  * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
  * from ext3.  PageChecked() bits have been removed as OCFS2 does not
  * do journalled data.
  */
 static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 	journal_invalidatepage(journal, page, offset);
 }
 /*
  * ->releasepage: a page without buffers yields 0; otherwise the result
  * of journal_try_to_free_buffers() on the owning journal is returned.
  */
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
 {
 	struct ocfs2_super *osb = OCFS2_SB(page->mapping->host->i_sb);
 	journal_t *journal = osb->journal->j_journal;
 	return page_has_buffers(page) ?
 		journal_try_to_free_buffers(journal, page, wait) : 0;
 }
 /*
  * ->direct_IO: hand the request to blockdev_direct_IO_no_locking() with
  * ocfs2_direct_IO_get_blocks as the mapper and ocfs2_dio_end_io as the
  * completion callback.
  *
  * Returns the result of blockdev_direct_IO_no_locking(), or the
  * negative error from ocfs2_data_lock() if that fails first.
  */
 static ssize_t ocfs2_direct_IO(int rw,
 			       struct kiocb *iocb,
 			       const struct iovec *iov,
 			       loff_t offset,
 			       unsigned long nr_segs)
 {
 	struct inode *inode =
 		iocb->ki_filp->f_path.dentry->d_inode->i_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int status;
 	mlog_entry_void();
 	if (!ocfs2_sparse_alloc(osb)) {
 		/*
 		 * We get PR data locks even for O_DIRECT.  This
 		 * allows concurrent O_DIRECT I/O but doesn't let
 		 * O_DIRECT with extending and buffered zeroing writes
 		 * race.  If they did race then the buffered zeroing
 		 * could be written back after the O_DIRECT I/O.  It's
 		 * one thing to tell people not to mix buffered and
 		 * O_DIRECT writes, but expecting them to understand
 		 * that file extension is also an implicit buffered
 		 * write is too much.  By getting the PR we force
 		 * writeback of the buffered zeroing before
 		 * proceeding.
 		 */
 		status = ocfs2_data_lock(inode, 0);
 		if (status < 0) {
 			mlog_errno(status);
 			goto done;
 		}
 		/* The lock is dropped again right away. */
 		ocfs2_data_unlock(inode, 0);
 	}
 	status = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					       inode->i_sb->s_bdev, iov,
 					       offset, nr_segs,
 					       ocfs2_direct_IO_get_blocks,
 					       ocfs2_dio_end_io);
 done:
 	mlog_exit(status);
 	return status;
 }
 /*
  * Compute the byte range inside a page that belongs to cluster 'cpos'.
  *
  * When a page is no larger than a cluster the range is the whole page,
  * [0, PAGE_CACHE_SIZE).  When a page holds several clusters the range
  * is the slot of 'cpos' within its page.  Either output pointer may be
  * NULL.
  */
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
 					    u32 cpos,
 					    unsigned int *start,
 					    unsigned int *end)
 {
 	unsigned int first = 0;
 	unsigned int last = PAGE_CACHE_SIZE;
 	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
 		/* number of clusters that fit in one page */
 		unsigned int per_page =
 			1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
 		first = (cpos % per_page) << osb->s_clustersize_bits;
 		last = first + osb->s_clustersize;
 	}
 	BUG_ON(first > PAGE_SIZE);
 	BUG_ON(last > PAGE_SIZE);
 	if (start)
 		*start = first;
 	if (end)
 		*end = last;
 }
 /*
  * Zero the part of 'page' that belongs to cluster 'cpos', sparing the
  * byte range [from, to).
  *
  * 'from' and 'to' are the region in the page to avoid zeroing.
  *
  * If pagesize > clustersize, nothing outside of the cluster boundary
  * is zeroed.
  *
  * from == to == 0 is code for "zero the entire cluster region"
  */
 static void ocfs2_clear_page_regions(struct page *page,
 				     struct ocfs2_super *osb, u32 cpos,
 				     unsigned from, unsigned to)
 {
 	unsigned int cluster_start, cluster_end;
 	void *kaddr;
 	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
 	kaddr = kmap_atomic(page, KM_USER0);
 	if (!from && !to) {
 		/* nothing to spare: wipe the whole cluster region */
 		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
 	} else {
 		/* wipe what lies before and after the spared range */
 		if (cluster_start < from)
 			memset(kaddr + cluster_start, 0, from - cluster_start);
 		if (cluster_end > to)
 			memset(kaddr + to, 0, cluster_end - to);
 	}
 	kunmap_atomic(kaddr, KM_USER0);
 }
 /*
  * Some of this taken from block_prepare_write(). We already have our
  * mapping by now though, and the entire write will be allocating or
  * it won't, so not much need to use BH_New.
  *
  * This will also skip zeroing, which is handled externally.
  *
  * Maps the buffers of 'page' that overlap [from, to) to consecutive
  * physical blocks starting at *p_blkno; *p_blkno is advanced by one
  * for every in-range buffer.  Buffers only partly covered by the range
  * that are not uptodate are read in.
  *
  * Returns 0, or -EIO if one of those reads failed.  In the failure
  * case, when 'new' is set, the in-range buffers are zeroed and marked
  * uptodate and dirty.
  */
 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 			  struct inode *inode, unsigned int from,
 			  unsigned int to, int new)
 {
 	int ret = 0;
 	/* Two slots: only buffers partly covered by [from, to) are read,
 	 * i.e. at most the one at each edge of the range. */
 	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
 	unsigned int block_end, block_start;
 	unsigned int bsize = 1 << inode->i_blkbits;
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, bsize, 0);
 	head = page_buffers(page);
 	for (bh = head, block_start = 0; bh != head || !block_start;
 	     bh = bh->b_this_page, block_start += bsize) {
 		block_end = block_start + bsize;
 		/*
 		 * Ignore blocks outside of our i/o range -
 		 * they may belong to unallocated clusters.
 		 */
 		if (block_start >= to || block_end <= from) {
 			if (PageUptodate(page))
 				set_buffer_uptodate(bh);
 			continue;
 		}
 		/*
 		 * For an allocating write with cluster size >= page
 		 * size, we always write the entire page.
 		 */
 		if (buffer_new(bh))
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {
 			map_bh(bh, inode->i_sb, *p_blkno);
 			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 		}
 		if (PageUptodate(page)) {
 			if (!buffer_uptodate(bh))
 				set_buffer_uptodate(bh);
 		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
 		     (block_start < from || block_end > to)) {
 			/* partly overwritten block: read the rest in first */
 			ll_rw_block(READ, 1, &bh);
 			*wait_bh++=bh;
 		}
 		*p_blkno = *p_blkno + 1;
 	}
 	/*
 	 * If we issued read requests - let them complete.
 	 */
 	while(wait_bh > wait) {
 		wait_on_buffer(*--wait_bh);
 		if (!buffer_uptodate(*wait_bh))
 			ret = -EIO;
 	}
 	if (ret == 0 || !new)
 		return ret;
 	/*
 	 * If we get -EIO above, zero out any newly allocated blocks
 	 * to avoid exposing stale data.
 	 */
 	bh = head;
 	block_start = 0;
 	do {
 		void *kaddr;
 		block_end = block_start + bsize;
 		/* not yet inside the range */
 		if (block_end <= from)
 			goto next_bh;
 		/* past the range: nothing further to zero */
 		if (block_start >= to)
 			break;
 		kaddr = kmap_atomic(page, KM_USER0);
 		memset(kaddr+block_start, 0, bh->b_size);
 		flush_dcache_page(page);
 		kunmap_atomic(kaddr, KM_USER0);
 		set_buffer_uptodate(bh);
 		mark_buffer_dirty(bh);
 next_bh:
 		block_start = block_end;
 		bh = bh->b_this_page;
 	} while (bh != head);
 	return ret;
 }
 /*
  * This will copy user data from the buffer page in the splice
  * context.
  *
  * For now, we ignore SPLICE_F_MOVE as that would require some extra
  * communication out all the way to ocfs2_write().
  *
  * The target page (wc->w_this_page) is mapped with
  * ocfs2_map_page_blocks() - over the whole cluster range for a new
  * page, otherwise over just [from, to) - and then wc->w_count bytes
  * (clamped to the cluster boundary when pages are larger than
  * clusters) are copied from the pipe buffer into it.
  *
  * On success *ret_from / *ret_to receive the copied range within the
  * page, wc->w_finished_copy is set, and the number of bytes copied is
  * returned.  If mapping fails, nothing is copied and the negative
  * error is returned.
  */
 int ocfs2_map_and_write_splice_data(struct inode *inode,
 				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
 				  unsigned int *ret_from, unsigned int *ret_to)
 {
 	int ret;
 	unsigned int to, from, cluster_start, cluster_end;
 	char *src, *dst;
 	struct ocfs2_splice_write_priv *sp = wc->w_private;
 	struct pipe_buffer *buf = sp->s_buf;
 	unsigned long bytes, src_from;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
 					&cluster_end);
 	from = sp->s_offset;
 	src_from = sp->s_buf_offset;
 	bytes = wc->w_count;
 	if (wc->w_large_pages) {
 		/*
 		 * For cluster size < page size, we have to
 		 * calculate pos within the cluster and obey
 		 * the rightmost boundary.
 		 */
 		bytes = min(bytes, (unsigned long)(osb->s_clustersize
 				   - (wc->w_pos & (osb->s_clustersize - 1))));
 	}
 	to = from + bytes;
 	if (wc->w_this_page_new)
 		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
 					    cluster_start, cluster_end, 1);
 	else
 		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
 					    from, to, 0);
 	if (ret) {
 		mlog_errno(ret);
 		/*
 		 * Nothing has been copied yet, so report the error
 		 * rather than a byte count the caller would take for
 		 * a successful copy.
 		 */
 		return ret;
 	}
 	BUG_ON(from > PAGE_CACHE_SIZE);
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	BUG_ON(from > osb->s_clustersize);
 	BUG_ON(to > osb->s_clustersize);
 	src = buf->ops->map(sp->s_pipe, buf, 1);
 	dst = kmap_atomic(wc->w_this_page, KM_USER1);
 	memcpy(dst + from, src + src_from, bytes);
 	/* kunmap_atomic() takes the mapped address, not the page - as
 	 * every other kmap_atomic() user in this file does. */
 	kunmap_atomic(dst, KM_USER1);
 	buf->ops->unmap(sp->s_pipe, buf, src);
 	wc->w_finished_copy = 1;
 	*ret_from = from;
 	*ret_to = to;
 	return (unsigned int)bytes;
 }
 /*
  * This will copy user data from the iovec in the buffered write
  * context.
  *
  * The amount copied is limited to what is left in the source page,
  * in the current iovec and in wc->w_count, and to either the cluster
  * boundary (pages larger than clusters) or the target page boundary.
  * The target page is mapped with ocfs2_map_page_blocks() first - over
  * the whole cluster range for a new page, otherwise over [from, to).
  *
  * On success *ret_from / *ret_to receive the copied range within the
  * page, wc->w_finished_copy is set, and the number of bytes copied is
  * returned.  If mapping fails, nothing is copied and the negative
  * error is returned.
  */
 int ocfs2_map_and_write_user_data(struct inode *inode,
 				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
 				  unsigned int *ret_from, unsigned int *ret_to)
 {
 	int ret;
 	unsigned int to, from, cluster_start, cluster_end;
 	unsigned long bytes, src_from;
 	char *dst;
 	struct ocfs2_buffered_write_priv *bp = wc->w_private;
 	const struct iovec *cur_iov = bp->b_cur_iov;
 	char __user *buf;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
 					&cluster_end);
 	/* The user address is only used to find the offset within its
 	 * page; the data itself is read from bp->b_src_buf below. */
 	buf = cur_iov->iov_base + bp->b_cur_off;
 	src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
 	from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
 	/*
 	 * This is a lot of comparisons, but it reads quite
 	 * easily, which is important here.
 	 */
 	/* Stay within the src page */
 	bytes = PAGE_SIZE - src_from;
 	/* Stay within the vector */
 	bytes = min(bytes,
 		    (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
 	/* Stay within count */
 	bytes = min(bytes, (unsigned long)wc->w_count);
 	/*
 	 * For clustersize > page size, just stay within
 	 * target page, otherwise we have to calculate pos
 	 * within the cluster and obey the rightmost
 	 * boundary.
 	 */
 	if (wc->w_large_pages) {
 		/*
 		 * For cluster size < page size, we have to
 		 * calculate pos within the cluster and obey
 		 * the rightmost boundary.
 		 */
 		bytes = min(bytes, (unsigned long)(osb->s_clustersize
 				   - (wc->w_pos & (osb->s_clustersize - 1))));
 	} else {
 		/*
 		 * cluster size > page size is the most common
 		 * case - we just stay within the target page
 		 * boundary.
 		 */
 		bytes = min(bytes, PAGE_CACHE_SIZE - from);
 	}
 	to = from + bytes;
 	if (wc->w_this_page_new)
 		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
 					    cluster_start, cluster_end, 1);
 	else
 		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
 					    from, to, 0);
 	if (ret) {
 		mlog_errno(ret);
 		/*
 		 * Nothing has been copied yet, so report the error
 		 * rather than a byte count the caller would take for
 		 * a successful copy.
 		 */
 		return ret;
 	}
 	BUG_ON(from > PAGE_CACHE_SIZE);
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	BUG_ON(from > osb->s_clustersize);
 	BUG_ON(to > osb->s_clustersize);
 	dst = kmap(wc->w_this_page);
 	memcpy(dst + from, bp->b_src_buf + src_from, bytes);
 	kunmap(wc->w_this_page);
 	/*
 	 * XXX: This is slow, but simple. The caller of
 	 * ocfs2_buffered_write_cluster() is responsible for
 	 * passing through the iovecs, so it's difficult to
 	 * predict what our next step is in here after our
 	 * initial write. A future version should be pushing
 	 * that iovec manipulation further down.
 	 *
 	 * By setting this, we indicate that a copy from user
 	 * data was done, and subsequent calls for this
 	 * cluster will skip copying more data.
 	 */
 	wc->w_finished_copy = 1;
 	*ret_from = from;
 	*ret_to = to;
 	return (unsigned int)bytes;
 }
 /*
  * Map, fill and write a page to disk.
  *
  * The work of copying data is done via callback.  Newly allocated
  * pages which don't take user data will be zero'd (set 'new' to
  * indicate an allocating write)
  *
  * The callback (wc->w_write_data_page) is only invoked for the page
  * that contains wc->w_pos, and only while wc->w_finished_copy is
  * clear; every other page must belong to an allocating write and is
  * just mapped over its cluster range.
  *
  * Returns a negative error code or the number of bytes copied into
  * the page.
  */
 int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
 			  u64 *p_blkno, struct page *page,
 			  struct ocfs2_write_ctxt *wc, int new)
 {
 	int ret, copied = 0;
 	unsigned int from = 0, to = 0;
 	unsigned int cluster_start, cluster_end;
 	/* zero_from == zero_to == 0 asks ocfs2_clear_page_regions() to
 	 * zero the entire cluster region of the page. */
 	unsigned int zero_from = 0, zero_to = 0;
 	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
 					&cluster_start, &cluster_end);
 	if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
 	    && !wc->w_finished_copy) {
 		wc->w_this_page = page;
 		wc->w_this_page_new = new;
 		ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
 		copied = ret;
 		/* The range that just received user data must not be
 		 * zeroed below. */
 		zero_from = from;
 		zero_to = to;
 		if (new) {
 			from = cluster_start;
 			to = cluster_end;
 		}
 	} else {
 		/*
 		 * If we haven't allocated the new page yet, we
 		 * shouldn't be writing it out without copying user
 		 * data. This is likely a math error from the caller.
 		 */
 		BUG_ON(!new);
 		from = cluster_start;
 		to = cluster_end;
 		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
 					    cluster_start, cluster_end, 1);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 	/*
 	 * Parts of newly allocated pages need to be zero'd.
 	 *
 	 * Above, we have also rewritten 'to' and 'from' - as far as
 	 * the rest of the function is concerned, the entire cluster
 	 * range inside of a page needs to be written.
 	 *
 	 * We can skip this if the page is up to date - it's already
 	 * been zero'd from being read in as a hole.
 	 */
 	if (new && !PageUptodate(page))
 		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
 					 wc->w_cpos, zero_from, zero_to);
 	flush_dcache_page(page);
 	if (ocfs2_should_order_data(inode)) {
 		ret = walk_page_buffers(handle,
 					page_buffers(page),
 					from, to, NULL,
 					ocfs2_journal_dirty_data);
 		/* NOTE(review): this error is only logged; 'ret' is
 		 * overwritten by block_commit_write() below. */
 		if (ret < 0)
 			mlog_errno(ret);
 	}
 	/*
 	 * We don't use generic_commit_write() because we need to
 	 * handle our own i_size update.
 	 */
 	ret = block_commit_write(page, from, to);
 	if (ret)
 		mlog_errno(ret);
 out:
 	/* NOTE(review): once bytes were copied, a later failure in 'ret'
 	 * is not reported to the caller. */
 	return copied ? copied : ret;
 }
 /*
  * Do the actual write of some data into an inode. Optionally allocate
  * in order to fulfill the write.
  *
  * cpos is the logical cluster offset within the file to write at
  *
  * 'phys' is the physical mapping of that offset. a 'phys' value of
  * zero indicates that allocation is required. In this case, data_ac
  * and meta_ac should be valid (meta_ac can be null if metadata
  * allocation isn't required).
  *
  * A non-allocating write touches the single page containing
  * wc->w_pos; an allocating write grabs every page of the cluster.
  * All pages are grabbed (locked) before anything else is done and are
  * unlocked and released again before returning.
  *
  * Returns the number of bytes copied if that is non-zero, otherwise 0
  * or a negative error code.
  */
 static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 			   struct buffer_head *di_bh,
 			   struct ocfs2_alloc_context *data_ac,
 			   struct ocfs2_alloc_context *meta_ac,
 			   struct ocfs2_write_ctxt *wc)
 {
 	int ret, i, numpages = 1, new;
 	unsigned int copied = 0;
 	u32 tmp_pos;
 	u64 v_blkno, p_blkno;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	unsigned long index, start;
 	struct page **cpages;
 	new = phys == 0 ? 1 : 0;
 	/*
 	 * Figure out how many pages we'll be manipulating here. For
 	 * non allocating write, we just change the one
 	 * page. Otherwise, we'll need a whole clusters worth.
 	 */
 	if (new)
 		numpages = ocfs2_pages_per_cluster(inode->i_sb);
 	/* Zeroed, so the cleanup loop can tell which slots were filled. */
 	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
 	if (!cpages) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		return ret;
 	}
 	/*
 	 * Fill our page array first. That way we've grabbed enough so
 	 * that we can zero and flush if we error after adding the
 	 * extent.
 	 */
 	if (new) {
 		start = ocfs2_align_clusters_to_page_index(inode->i_sb,
 							   wc->w_cpos);
 		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
 	} else {
 		start = wc->w_pos >> PAGE_CACHE_SHIFT;
 		v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
 	}
 	for(i = 0; i < numpages; i++) {
 		index = start + i;
 		cpages[i] = grab_cache_page(mapping, index);
 		if (!cpages[i]) {
 			ret = -ENOMEM;
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 	if (new) {
 		/*
 		 * This is safe to call with the page locks - it won't take
 		 * any additional semaphores or cluster locks.
 		 */
 		tmp_pos = wc->w_cpos;
 		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
 						 &tmp_pos, 1, di_bh, handle,
 						 data_ac, meta_ac, NULL);
 		/*
 		 * This shouldn't happen because we must have already
 		 * calculated the correct meta data allocation required. The
 		 * internal tree allocation code should know how to increase
 		 * transaction credits itself.
 		 *
 		 * If need be, we could handle -EAGAIN for a
 		 * RESTART_TRANS here.
 		 */
 		mlog_bug_on_msg(ret == -EAGAIN,
 				"Inode %llu: EAGAIN return during allocation.\n",
 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
 					  NULL);
 	if (ret < 0) {
 		/*
 		 * XXX: Should we go readonly here?
 		 */
 		mlog_errno(ret);
 		goto out;
 	}
 	BUG_ON(p_blkno == 0);
 	for(i = 0; i < numpages; i++) {
 		ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
 					    wc, new);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
 		copied += ret;
 	}
 out:
 	for(i = 0; i < numpages; i++) {
 		/*
 		 * grab_cache_page() may have failed part-way through
 		 * the array; slots that were never filled are still
 		 * NULL and must not be unlocked or released.
 		 */
 		if (!cpages[i])
 			continue;
 		unlock_page(cpages[i]);
 		mark_page_accessed(cpages[i]);
 		page_cache_release(cpages[i]);
 	}
 	kfree(cpages);
 	return copied ? copied : ret;
 }
 /*
  * Fill in a write context for a write of 'count' bytes at byte offset
  * 'pos': records the position, its cluster offset, the page-writer
  * callback and its private data, clears w_finished_copy, and sets
  * w_large_pages when a page is larger than a cluster.
  */
 static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
 				  struct ocfs2_super *osb, loff_t pos,
 				  size_t count, ocfs2_page_writer *cb,
 				  void *cb_priv)
 {
 	/* where and how much */
 	wc->w_count = count;
 	wc->w_pos = pos;
 	wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
 	/* who copies the data */
 	wc->w_write_data_page = cb;
 	wc->w_private = cb_priv;
 	/* no data has been copied for this cluster yet */
 	wc->w_finished_copy = 0;
 	wc->w_large_pages =
 		unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits) ? 1 : 0;
 }
 /*
  * Write a cluster to an inode. The cluster may not be allocated yet,
  * in which case it will be. This only exists for buffered writes -
  * O_DIRECT takes a more "traditional" path through the kernel.
  *
  * The caller is responsible for incrementing pos, written counts, etc
  *
  * For file systems that don't support sparse files, pre-allocation
  * and page zeroing up until cpos should be done prior to this
  * function call.
  *
  * Callers should be holding i_sem, and the rw cluster lock.
  *
  * @file:  file being written; the inode comes from file->f_mapping->host.
  * @pos:   byte offset of the write; the target cluster is derived from it.
  * @count: number of bytes requested.
  * @actor: page-writer callback, stored in the write context.
  * @priv:  opaque pointer stored in the write context alongside @actor.
  *
  * Returns the number of user bytes written, or less than zero for
  * error.
  *
  * Note that the return expression is "written ? written : ret": if
  * ocfs2_write() reported bytes written and a later step
  * (ocfs2_journal_access) fails, the byte count is
  * still returned and the inode size/time update is skipped.
  */
 ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
 				     size_t count, ocfs2_page_writer *actor,
 				     void *priv)
 {
 	/* Default credits cover only the inode update done below. */
 	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
 	ssize_t written = 0;
 	u32 phys;
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle;
 	struct ocfs2_write_ctxt wc;
 	ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
 	/* Take the inode meta lock; di_bh receives the dinode buffer. */
 	ret = ocfs2_meta_lock(inode, &di_bh, 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 	/*
 	 * Take alloc sem here to prevent concurrent lookups. That way
 	 * the mapping, zeroing and tree manipulation within
 	 * ocfs2_write() will be safe against ->readpage(). This
 	 * should also serve to lock out allocation from a shared
 	 * writeable region.
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	/* Look up the physical cluster backing the target virtual cluster. */
 	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_meta;
 	}
 	/* phys == 0 means that allocation is required. */
 	if (phys == 0) {
 		/* Reserve allocators for one cluster and size the transaction for it. */
 		ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_meta;
 		}
 		credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
 	}
 	ret = ocfs2_data_lock(inode, 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_meta;
 	}
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out_data;
 	}
 	/* Do the actual page/cluster write; a negative value is an error code. */
 	written = ocfs2_write(file, phys, handle, di_bh, data_ac,
 			      meta_ac, &wc);
 	if (written < 0) {
 		ret = written;
 		mlog_errno(ret);
 		goto out_commit;
 	}
 	/* Get journal write access to the dinode before updating size and times. */
 	ret = ocfs2_journal_access(handle, inode, di_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
 	}
 	/* Extend i_size if the write ended past the current end of file. */
 	pos += written;
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
 		mark_inode_dirty(inode);
 	}
 	/* Mirror size and timestamps from the in-memory inode into the dinode. */
 	inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
 	di->i_size = cpu_to_le64((u64)i_size_read(inode));
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 	ret = ocfs2_journal_dirty(handle, di_bh);
 	if (ret)
 		mlog_errno(ret);
 	/* Unwind in reverse order of acquisition. */
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out_data:
 	ocfs2_data_unlock(inode, 1);
 out_meta:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	ocfs2_meta_unlock(inode, 1);
 out:
 	brelse(di_bh);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 	return written ? written : ret;
 }
 /*
  * Address space operations table defined by this file. Most hooks point
  * at ocfs2_* functions; .sync_page and .migratepage point at
  * block_sync_page and buffer_migrate_page instead.
  */
 const struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
 	.writepage	= ocfs2_writepage,
 	.bmap		= ocfs2_bmap,
 	.sync_page	= block_sync_page,
 	.direct_IO	= ocfs2_direct_IO,
 	.invalidatepage	= ocfs2_invalidatepage,
 	.releasepage	= ocfs2_releasepage,
 	.migratepage	= buffer_migrate_page,
 };

fs/ocfs2/extent_map.c

Diff comments View file @ 4f902c3

 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * extent_map.c
  *
  * Block/Cluster mapping functions
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License, version 2,  as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this program; if not, write to the
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
 #include "ocfs2.h"
 #include "alloc.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
 #include "buffer_head_io.h"
 /*
+ * Return the 1st index within el which contains an extent start
+ * larger than v_cluster.
+ */
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
+				       u32 v_cluster)
+{
+	int i;
+	struct ocfs2_extent_rec *rec;
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+		if (v_cluster < le32_to_cpu(rec->e_cpos))
+			break;
+	}
+	return i;
+}
+/*
+ * Figure out the size of a hole which starts at v_cluster within the given
+ * extent list.
+ *
+ * If there is no more allocation past v_cluster, we return the maximum
+ * cluster size minus v_cluster.
+ *
+ * If we have in-inode extents, then el points to the dinode list and
+ * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
+ * containing el.
+ */
+static int ocfs2_figure_hole_clusters(struct inode *inode,
+				      struct ocfs2_extent_list *el,
+				      struct buffer_head *eb_bh,
+				      u32 v_cluster,
+				      u32 *num_clusters)
+{
+	int ret, i;
+	struct buffer_head *next_eb_bh = NULL;
+	struct ocfs2_extent_block *eb, *next_eb;
+	i = ocfs2_search_for_hole_index(el, v_cluster);
+	if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		/*
+		 * Check the next leaf for any extents.
+		 */
+		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
+			goto no_more_extents;
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       le64_to_cpu(eb->h_next_leaf_blk),
+				       &next_eb_bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
+			ret = -EROFS;
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
+			goto out;
+		}
+		el = &next_eb->h_list;
+		i = ocfs2_search_for_hole_index(el, v_cluster);
+	}
+no_more_extents:
+	if (i == le16_to_cpu(el->l_next_free_rec)) {
+		/*
+		 * We're at the end of our existing allocation. Just
+		 * return the maximum number of clusters we could
+		 * possibly allocate.
+		 */
+		*num_clusters = UINT_MAX - v_cluster;
+	} else {
+		*num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
+	}
+	ret = 0;
+out:
+	brelse(next_eb_bh);
+	return ret;
+}
+/*
  * Return the index of the extent record which contains cluster #v_cluster.
  * -1 is returned if it was not found.
  *
  * Should work fine on interior and exterior nodes.
  */
 static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
 				    u32 v_cluster)
 {
 	int ret = -1;
 	int i;
 	struct ocfs2_extent_rec *rec;
 	u32 rec_end, rec_start, clusters;
 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
 		rec_start = le32_to_cpu(rec->e_cpos);
 		clusters = ocfs2_rec_clusters(el, rec);
 		rec_end = rec_start + clusters;
 		if (v_cluster >= rec_start && v_cluster < rec_end) {
 			ret = i;
 			break;
 		}
 	}
 	return ret;
 }
 /*
  * Map virtual cluster v_cluster of inode to its physical cluster.
  *
  * The dinode is read, and if its extent list has a non-zero tree depth
  * the leaf covering v_cluster is located with ocfs2_find_leaf().
  *
  * On success (return 0):
  *   - *p_cluster is the physical cluster, or 0 if v_cluster is in a hole.
  *   - *num_clusters (if non-NULL) is the number of clusters from
  *     v_cluster to the end of the extent, or the size of the hole.
  *   - *extent_flags (if non-NULL) is the record's e_flags, or 0 for a
  *     hole.
  *
  * A non-zero return is an error from reading blocks or finding the leaf,
  * or -EROFS when the leaf claims a non-zero tree depth or the matching
  * record has a zero e_blkno.
  */
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		       u32 *p_cluster, u32 *num_clusters,
 		       unsigned int *extent_flags)
 {
 	int ret, i;
 	unsigned int flags = 0;
 	struct buffer_head *di_bh = NULL;
 	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_dinode *di;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
 	u32 coff;

 	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
 			       &di_bh, OCFS2_BH_CACHED, inode);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}

 	di = (struct ocfs2_dinode *) di_bh->b_data;
 	el = &di->id2.i_list;

 	if (el->l_tree_depth) {
 		/* Extents live in a tree; descend to the relevant leaf. */
 		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}

 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;

 		if (el->l_tree_depth) {
 			ocfs2_error(inode->i_sb,
 				    "Inode %lu has non zero tree depth in "
 				    "leaf block %llu\n", inode->i_ino,
 				    (unsigned long long)eb_bh->b_blocknr);
 			ret = -EROFS;
 			goto out;
 		}
 	}

 	i = ocfs2_search_extent_list(el, v_cluster);
 	if (i == -1) {
 		/*
 		 * A hole was found. Return some canned values that
 		 * callers can key on. If asked for, num_clusters will
 		 * be populated with the size of the hole.
 		 */
 		*p_cluster = 0;
 		if (num_clusters) {
 			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
 							 v_cluster,
 							 num_clusters);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 		}
 	} else {
 		rec = &el->l_recs[i];

 		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));

 		if (!rec->e_blkno) {
 			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
 				    "record (%u, %u, 0)", inode->i_ino,
 				    le32_to_cpu(rec->e_cpos),
 				    ocfs2_rec_clusters(el, rec));
 			ret = -EROFS;
 			goto out;
 		}

 		/* Offset of v_cluster inside the matching extent. */
 		coff = v_cluster - le32_to_cpu(rec->e_cpos);

 		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
 						    le64_to_cpu(rec->e_blkno));
 		*p_cluster = *p_cluster + coff;

 		if (num_clusters)
 			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;

 		flags = rec->e_flags;
 	}

 	if (extent_flags)
 		*extent_flags = flags;
 out:
 	/* Either buffer may still be NULL here. */
 	brelse(di_bh);
 	brelse(eb_bh);
 	return ret;
 }
 /*
  * Map virtual block v_blkno of inode to a physical block.
  *
  * This expects alloc_sem to be held. The allocation cannot change at
  * all while the map is in the process of being updated.
  *
  * On success (return 0):
  *   - *p_blkno is the physical block, or 0 if v_blkno falls in a hole.
  *   - *ret_count (if non-NULL) is the number of blocks from v_blkno to
  *     the end of the extent or hole reported by ocfs2_get_clusters().
  *   - *extent_flags is filled in by ocfs2_get_clusters() when non-NULL.
  *
  * A non-zero return is the error from ocfs2_get_clusters().
  */
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 				u64 *ret_count, unsigned int *extent_flags)
 {
 	int ret;
 	/* Blocks per cluster; used as a mask, so presumably a power of two. */
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 	u32 cpos, num_clusters, p_cluster;
 	u64 boff = 0;

 	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);

 	ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
 				 extent_flags);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}

 	/*
 	 * p_cluster == 0 indicates a hole.
 	 */
 	if (p_cluster) {
 		boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 		/* Add the block offset of v_blkno within its cluster. */
 		boff += (v_blkno & (u64)(bpc - 1));
 	}

 	*p_blkno = boff;

 	if (ret_count) {
 		*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
 		/* Discount the blocks of the first cluster before v_blkno. */
 		*ret_count -= v_blkno & (u64)(bpc - 1);
 	}
 out:
 	return ret;
 }

fs/ocfs2/extent_map.h

Diff comments View file @ 4f902c3

fs/ocfs2/journal.c

Diff comments View file @ 4f902c3

1	/* -*- mode: c; c-basic-offset: 8; -*-	1	/* -*- mode: c; c-basic-offset: 8; -*-
2	* vim: noexpandtab sw=8 ts=8 sts=0:	2	* vim: noexpandtab sw=8 ts=8 sts=0:
3	*	3	*
4	* journal.c	4	* journal.c
5	*	5	*
6	* Defines functions of journalling api	6	* Defines functions of journalling api
7	*	7	*
8	* Copyright (C) 2003, 2004 Oracle. All rights reserved.	8	* Copyright (C) 2003, 2004 Oracle. All rights reserved.
9	*	9	*
10	* This program is free software; you can redistribute it and/or	10	* This program is free software; you can redistribute it and/or
11	* modify it under the terms of the GNU General Public	11	* modify it under the terms of the GNU General Public
12	* License as published by the Free Software Foundation; either	12	* License as published by the Free Software Foundation; either
13	* version 2 of the License, or (at your option) any later version.	13	* version 2 of the License, or (at your option) any later version.
14	*	14	*
15	* This program is distributed in the hope that it will be useful,	15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18	* General Public License for more details.	18	* General Public License for more details.
19	*	19	*
20	* You should have received a copy of the GNU General Public	20	* You should have received a copy of the GNU General Public
21	* License along with this program; if not, write to the	21	* License along with this program; if not, write to the
22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,	22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23	* Boston, MA 021110-1307, USA.	23	* Boston, MA 021110-1307, USA.
24	*/	24	*/
25		25
26	#include <linux/fs.h>	26	#include <linux/fs.h>
27	#include <linux/types.h>	27	#include <linux/types.h>
28	#include <linux/slab.h>	28	#include <linux/slab.h>
29	#include <linux/highmem.h>	29	#include <linux/highmem.h>
30	#include <linux/kthread.h>	30	#include <linux/kthread.h>
31		31
32	#define MLOG_MASK_PREFIX ML_JOURNAL	32	#define MLOG_MASK_PREFIX ML_JOURNAL
33	#include <cluster/masklog.h>	33	#include <cluster/masklog.h>
34		34
35	#include "ocfs2.h"	35	#include "ocfs2.h"
36		36
37	#include "alloc.h"	37	#include "alloc.h"
38	#include "dlmglue.h"	38	#include "dlmglue.h"
39	#include "extent_map.h"	39	#include "extent_map.h"
40	#include "heartbeat.h"	40	#include "heartbeat.h"
41	#include "inode.h"	41	#include "inode.h"
42	#include "journal.h"	42	#include "journal.h"
43	#include "localalloc.h"	43	#include "localalloc.h"
44	#include "namei.h"	44	#include "namei.h"
45	#include "slot_map.h"	45	#include "slot_map.h"
46	#include "super.h"	46	#include "super.h"
47	#include "vote.h"	47	#include "vote.h"
48	#include "sysfile.h"	48	#include "sysfile.h"
49		49
50	#include "buffer_head_io.h"	50	#include "buffer_head_io.h"
51		51
52	DEFINE_SPINLOCK(trans_inc_lock);	52	DEFINE_SPINLOCK(trans_inc_lock);
53		53
54	static int ocfs2_force_read_journal(struct inode *inode);	54	static int ocfs2_force_read_journal(struct inode *inode);
55	static int ocfs2_recover_node(struct ocfs2_super *osb,	55	static int ocfs2_recover_node(struct ocfs2_super *osb,
56	int node_num);	56	int node_num);
57	static int __ocfs2_recovery_thread(void *arg);	57	static int __ocfs2_recovery_thread(void *arg);
58	static int ocfs2_commit_cache(struct ocfs2_super *osb);	58	static int ocfs2_commit_cache(struct ocfs2_super *osb);
59	static int ocfs2_wait_on_mount(struct ocfs2_super *osb);	59	static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
60	static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,	60	static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
61	int dirty);	61	int dirty);
62	static int ocfs2_trylock_journal(struct ocfs2_super *osb,	62	static int ocfs2_trylock_journal(struct ocfs2_super *osb,
63	int slot_num);	63	int slot_num);
64	static int ocfs2_recover_orphans(struct ocfs2_super *osb,	64	static int ocfs2_recover_orphans(struct ocfs2_super *osb,
65	int slot);	65	int slot);
66	static int ocfs2_commit_thread(void *arg);	66	static int ocfs2_commit_thread(void *arg);
67		67
68	static int ocfs2_commit_cache(struct ocfs2_super *osb)	68	static int ocfs2_commit_cache(struct ocfs2_super *osb)
69	{	69	{
70	int status = 0;	70	int status = 0;
71	unsigned int flushed;	71	unsigned int flushed;
72	unsigned long old_id;	72	unsigned long old_id;
73	struct ocfs2_journal *journal = NULL;	73	struct ocfs2_journal *journal = NULL;
74		74
75	mlog_entry_void();	75	mlog_entry_void();
76		76
77	journal = osb->journal;	77	journal = osb->journal;
78		78
79	/* Flush all pending commits and checkpoint the journal. */	79	/* Flush all pending commits and checkpoint the journal. */
80	down_write(&journal->j_trans_barrier);	80	down_write(&journal->j_trans_barrier);
81		81
82	if (atomic_read(&journal->j_num_trans) == 0) {	82	if (atomic_read(&journal->j_num_trans) == 0) {
83	up_write(&journal->j_trans_barrier);	83	up_write(&journal->j_trans_barrier);
84	mlog(0, "No transactions for me to flush!\n");	84	mlog(0, "No transactions for me to flush!\n");
85	goto finally;	85	goto finally;
86	}	86	}
87		87
88	journal_lock_updates(journal->j_journal);	88	journal_lock_updates(journal->j_journal);
89	status = journal_flush(journal->j_journal);	89	status = journal_flush(journal->j_journal);
90	journal_unlock_updates(journal->j_journal);	90	journal_unlock_updates(journal->j_journal);
91	if (status < 0) {	91	if (status < 0) {
92	up_write(&journal->j_trans_barrier);	92	up_write(&journal->j_trans_barrier);
93	mlog_errno(status);	93	mlog_errno(status);
94	goto finally;	94	goto finally;
95	}	95	}
96		96
97	old_id = ocfs2_inc_trans_id(journal);	97	old_id = ocfs2_inc_trans_id(journal);
98		98
99	flushed = atomic_read(&journal->j_num_trans);	99	flushed = atomic_read(&journal->j_num_trans);
100	atomic_set(&journal->j_num_trans, 0);	100	atomic_set(&journal->j_num_trans, 0);
101	up_write(&journal->j_trans_barrier);	101	up_write(&journal->j_trans_barrier);
102		102
103	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",	103	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
104	journal->j_trans_id, flushed);	104	journal->j_trans_id, flushed);
105		105
106	ocfs2_kick_vote_thread(osb);	106	ocfs2_kick_vote_thread(osb);
107	wake_up(&journal->j_checkpointed);	107	wake_up(&journal->j_checkpointed);
108	finally:	108	finally:
109	mlog_exit(status);	109	mlog_exit(status);
110	return status;	110	return status;
111	}	111	}
112		112
113	/* pass it NULL and it will allocate a new handle object for you. If	113	/* pass it NULL and it will allocate a new handle object for you. If
114	* you pass it a handle however, it may still return error, in which	114	* you pass it a handle however, it may still return error, in which
115	* case it has free'd the passed handle for you. */	115	* case it has free'd the passed handle for you. */
116	handle_t ocfs2_start_trans(struct ocfs2_super osb, int max_buffs)	116	handle_t ocfs2_start_trans(struct ocfs2_super osb, int max_buffs)
117	{	117	{
118	journal_t *journal = osb->journal->j_journal;	118	journal_t *journal = osb->journal->j_journal;
119	handle_t *handle;	119	handle_t *handle;
120		120
121	BUG_ON(!osb \|\| !osb->journal->j_journal);	121	BUG_ON(!osb \|\| !osb->journal->j_journal);
122		122
123	if (ocfs2_is_hard_readonly(osb))	123	if (ocfs2_is_hard_readonly(osb))
124	return ERR_PTR(-EROFS);	124	return ERR_PTR(-EROFS);
125		125
126	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);	126	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
127	BUG_ON(max_buffs <= 0);	127	BUG_ON(max_buffs <= 0);
128		128
129	/* JBD might support this, but our journalling code doesn't yet. */	129	/* JBD might support this, but our journalling code doesn't yet. */
130	if (journal_current_handle()) {	130	if (journal_current_handle()) {
131	mlog(ML_ERROR, "Recursive transaction attempted!\n");	131	mlog(ML_ERROR, "Recursive transaction attempted!\n");
132	BUG();	132	BUG();
133	}	133	}
134		134
135	down_read(&osb->journal->j_trans_barrier);	135	down_read(&osb->journal->j_trans_barrier);
136		136
137	handle = journal_start(journal, max_buffs);	137	handle = journal_start(journal, max_buffs);
138	if (IS_ERR(handle)) {	138	if (IS_ERR(handle)) {
139	up_read(&osb->journal->j_trans_barrier);	139	up_read(&osb->journal->j_trans_barrier);
140		140
141	mlog_errno(PTR_ERR(handle));	141	mlog_errno(PTR_ERR(handle));
142		142
143	if (is_journal_aborted(journal)) {	143	if (is_journal_aborted(journal)) {
144	ocfs2_abort(osb->sb, "Detected aborted journal");	144	ocfs2_abort(osb->sb, "Detected aborted journal");
145	handle = ERR_PTR(-EROFS);	145	handle = ERR_PTR(-EROFS);
146	}	146	}
147	} else {	147	} else {
148	if (!ocfs2_mount_local(osb))	148	if (!ocfs2_mount_local(osb))
149	atomic_inc(&(osb->journal->j_num_trans));	149	atomic_inc(&(osb->journal->j_num_trans));
150	}	150	}
151		151
152	return handle;	152	return handle;
153	}	153	}
154		154
155	int ocfs2_commit_trans(struct ocfs2_super *osb,	155	int ocfs2_commit_trans(struct ocfs2_super *osb,
156	handle_t *handle)	156	handle_t *handle)
157	{	157	{
158	int ret;	158	int ret;
159	struct ocfs2_journal *journal = osb->journal;	159	struct ocfs2_journal *journal = osb->journal;
160		160
161	BUG_ON(!handle);	161	BUG_ON(!handle);
162		162
163	ret = journal_stop(handle);	163	ret = journal_stop(handle);
164	if (ret < 0)	164	if (ret < 0)
165	mlog_errno(ret);	165	mlog_errno(ret);
166		166
167	up_read(&journal->j_trans_barrier);	167	up_read(&journal->j_trans_barrier);
168		168
169	return ret;	169	return ret;
170	}	170	}
171		171
172	/*	172	/*
173	* 'nblocks' is what you want to add to the current	173	* 'nblocks' is what you want to add to the current
174	* transaction. extend_trans will either extend the current handle by	174	* transaction. extend_trans will either extend the current handle by
175	* nblocks, or commit it and start a new one with nblocks credits.	175	* nblocks, or commit it and start a new one with nblocks credits.
176	*	176	*
177	* WARNING: This will not release any semaphores or disk locks taken	177	* WARNING: This will not release any semaphores or disk locks taken
178	* during the transaction, so make sure they were taken before	178	* during the transaction, so make sure they were taken before
179	* start_trans or we'll have ordering deadlocks.	179	* start_trans or we'll have ordering deadlocks.
180	*	180	*
181	* WARNING2: Note that we do not drop j_trans_barrier here. This is	181	* WARNING2: Note that we do not drop j_trans_barrier here. This is
182	* good because transaction ids haven't yet been recorded on the	182	* good because transaction ids haven't yet been recorded on the
183	* cluster locks associated with this handle.	183	* cluster locks associated with this handle.
184	*/	184	*/
185	int ocfs2_extend_trans(handle_t *handle, int nblocks)	185	int ocfs2_extend_trans(handle_t *handle, int nblocks)
186	{	186	{
187	int status;	187	int status;
188		188
189	BUG_ON(!handle);	189	BUG_ON(!handle);
190	BUG_ON(!nblocks);	190	BUG_ON(!nblocks);
191		191
192	mlog_entry_void();	192	mlog_entry_void();
193		193
194	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);	194	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
195		195
196	status = journal_extend(handle, nblocks);	196	status = journal_extend(handle, nblocks);
197	if (status < 0) {	197	if (status < 0) {
198	mlog_errno(status);	198	mlog_errno(status);
199	goto bail;	199	goto bail;
200	}	200	}
201		201
202	if (status > 0) {	202	if (status > 0) {
203	mlog(0, "journal_extend failed, trying journal_restart\n");	203	mlog(0, "journal_extend failed, trying journal_restart\n");
204	status = journal_restart(handle, nblocks);	204	status = journal_restart(handle, nblocks);
205	if (status < 0) {	205	if (status < 0) {
206	mlog_errno(status);	206	mlog_errno(status);
207	goto bail;	207	goto bail;
208	}	208	}
209	}	209	}
210		210
211	status = 0;	211	status = 0;
212	bail:	212	bail:
213		213
214	mlog_exit(status);	214	mlog_exit(status);
215	return status;	215	return status;
216	}	216	}
217		217
218	int ocfs2_journal_access(handle_t *handle,	218	int ocfs2_journal_access(handle_t *handle,
219	struct inode *inode,	219	struct inode *inode,
220	struct buffer_head *bh,	220	struct buffer_head *bh,
221	int type)	221	int type)
222	{	222	{
223	int status;	223	int status;
224		224
225	BUG_ON(!inode);	225	BUG_ON(!inode);
226	BUG_ON(!handle);	226	BUG_ON(!handle);
227	BUG_ON(!bh);	227	BUG_ON(!bh);
228		228
229	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",	229	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
230	(unsigned long long)bh->b_blocknr, type,	230	(unsigned long long)bh->b_blocknr, type,
231	(type == OCFS2_JOURNAL_ACCESS_CREATE) ?	231	(type == OCFS2_JOURNAL_ACCESS_CREATE) ?
232	"OCFS2_JOURNAL_ACCESS_CREATE" :	232	"OCFS2_JOURNAL_ACCESS_CREATE" :
233	"OCFS2_JOURNAL_ACCESS_WRITE",	233	"OCFS2_JOURNAL_ACCESS_WRITE",
234	bh->b_size);	234	bh->b_size);
235		235
236	/* we can safely remove this assertion after testing. */	236	/* we can safely remove this assertion after testing. */
237	if (!buffer_uptodate(bh)) {	237	if (!buffer_uptodate(bh)) {
238	mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");	238	mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
239	mlog(ML_ERROR, "b_blocknr=%llu\n",	239	mlog(ML_ERROR, "b_blocknr=%llu\n",
240	(unsigned long long)bh->b_blocknr);	240	(unsigned long long)bh->b_blocknr);
241	BUG();	241	BUG();
242	}	242	}
243		243
244	/* Set the current transaction information on the inode so	244	/* Set the current transaction information on the inode so
245	* that the locking code knows whether it can drop it's locks	245	* that the locking code knows whether it can drop it's locks
246	* on this inode or not. We're protected from the commit	246	* on this inode or not. We're protected from the commit
247	* thread updating the current transaction id until	247	* thread updating the current transaction id until
248	* ocfs2_commit_trans() because ocfs2_start_trans() took	248	* ocfs2_commit_trans() because ocfs2_start_trans() took
249	* j_trans_barrier for us. */	249	* j_trans_barrier for us. */
250	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);	250	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
251		251
252	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);	252	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
253	switch (type) {	253	switch (type) {
254	case OCFS2_JOURNAL_ACCESS_CREATE:	254	case OCFS2_JOURNAL_ACCESS_CREATE:
255	case OCFS2_JOURNAL_ACCESS_WRITE:	255	case OCFS2_JOURNAL_ACCESS_WRITE:
256	status = journal_get_write_access(handle, bh);	256	status = journal_get_write_access(handle, bh);
257	break;	257	break;
258		258
259	case OCFS2_JOURNAL_ACCESS_UNDO:	259	case OCFS2_JOURNAL_ACCESS_UNDO:
260	status = journal_get_undo_access(handle, bh);	260	status = journal_get_undo_access(handle, bh);
261	break;	261	break;
262		262
263	default:	263	default:
264	status = -EINVAL;	264	status = -EINVAL;
265	mlog(ML_ERROR, "Uknown access type!\n");	265	mlog(ML_ERROR, "Uknown access type!\n");
266	}	266	}
267	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);	267	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
268		268
269	if (status < 0)	269	if (status < 0)
270	mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",	270	mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
271	status, type);	271	status, type);
272		272
273	mlog_exit(status);	273	mlog_exit(status);
274	return status;	274	return status;
275	}	275	}
276		276
277	int ocfs2_journal_dirty(handle_t *handle,	277	int ocfs2_journal_dirty(handle_t *handle,
278	struct buffer_head *bh)	278	struct buffer_head *bh)
279	{	279	{
280	int status;	280	int status;
281		281
282	mlog_entry("(bh->b_blocknr=%llu)\n",	282	mlog_entry("(bh->b_blocknr=%llu)\n",
283	(unsigned long long)bh->b_blocknr);	283	(unsigned long long)bh->b_blocknr);
284		284
285	status = journal_dirty_metadata(handle, bh);	285	status = journal_dirty_metadata(handle, bh);
286	if (status < 0)	286	if (status < 0)
287	mlog(ML_ERROR, "Could not dirty metadata buffer. "	287	mlog(ML_ERROR, "Could not dirty metadata buffer. "
288	"(bh->b_blocknr=%llu)\n",	288	"(bh->b_blocknr=%llu)\n",
289	(unsigned long long)bh->b_blocknr);	289	(unsigned long long)bh->b_blocknr);
290		290
291	mlog_exit(status);	291	mlog_exit(status);
292	return status;	292	return status;
293	}	293	}
294		294
295	int ocfs2_journal_dirty_data(handle_t *handle,	295	int ocfs2_journal_dirty_data(handle_t *handle,
296	struct buffer_head *bh)	296	struct buffer_head *bh)
297	{	297	{
298	int err = journal_dirty_data(handle, bh);	298	int err = journal_dirty_data(handle, bh);
299	if (err)	299	if (err)
300	mlog_errno(err);	300	mlog_errno(err);
301	/* TODO: When we can handle it, abort the handle and go RO on	301	/* TODO: When we can handle it, abort the handle and go RO on
302	* error here. */	302	* error here. */
303		303
304	return err;	304	return err;
305	}	305	}
306		306
307	#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)	307	#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
308		308
309	void ocfs2_set_journal_params(struct ocfs2_super *osb)	309	void ocfs2_set_journal_params(struct ocfs2_super *osb)
310	{	310	{
311	journal_t *journal = osb->journal->j_journal;	311	journal_t *journal = osb->journal->j_journal;
312		312
313	spin_lock(&journal->j_state_lock);	313	spin_lock(&journal->j_state_lock);
314	journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;	314	journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
315	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)	315	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
316	journal->j_flags \|= JFS_BARRIER;	316	journal->j_flags \|= JFS_BARRIER;
317	else	317	else
318	journal->j_flags &= ~JFS_BARRIER;	318	journal->j_flags &= ~JFS_BARRIER;
319	spin_unlock(&journal->j_state_lock);	319	spin_unlock(&journal->j_state_lock);
320	}	320	}
321		321
322	int ocfs2_journal_init(struct ocfs2_journal journal, int dirty)	322	int ocfs2_journal_init(struct ocfs2_journal journal, int dirty)
323	{	323	{
324	int status = -1;	324	int status = -1;
325	struct inode inode = NULL; / the journal inode */	325	struct inode inode = NULL; / the journal inode */
326	journal_t *j_journal = NULL;	326	journal_t *j_journal = NULL;
327	struct ocfs2_dinode *di = NULL;	327	struct ocfs2_dinode *di = NULL;
328	struct buffer_head *bh = NULL;	328	struct buffer_head *bh = NULL;
329	struct ocfs2_super *osb;	329	struct ocfs2_super *osb;
330	int meta_lock = 0;	330	int meta_lock = 0;
331		331
332	mlog_entry_void();	332	mlog_entry_void();
333		333
334	BUG_ON(!journal);	334	BUG_ON(!journal);
335		335
336	osb = journal->j_osb;	336	osb = journal->j_osb;
337		337
338	/* already have the inode for our journal */	338	/* already have the inode for our journal */
339	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,	339	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
340	osb->slot_num);	340	osb->slot_num);
341	if (inode == NULL) {	341	if (inode == NULL) {
342	status = -EACCES;	342	status = -EACCES;
343	mlog_errno(status);	343	mlog_errno(status);
344	goto done;	344	goto done;
345	}	345	}
346	if (is_bad_inode(inode)) {	346	if (is_bad_inode(inode)) {
347	mlog(ML_ERROR, "access error (bad inode)\n");	347	mlog(ML_ERROR, "access error (bad inode)\n");
348	iput(inode);	348	iput(inode);
349	inode = NULL;	349	inode = NULL;
350	status = -EACCES;	350	status = -EACCES;
351	goto done;	351	goto done;
352	}	352	}
353		353
354	SET_INODE_JOURNAL(inode);	354	SET_INODE_JOURNAL(inode);
355	OCFS2_I(inode)->ip_open_count++;	355	OCFS2_I(inode)->ip_open_count++;
356		356
357	/* Skip recovery waits here - journal inode metadata never	357	/* Skip recovery waits here - journal inode metadata never
358	* changes in a live cluster so it can be considered an	358	* changes in a live cluster so it can be considered an
359	* exception to the rule. */	359	* exception to the rule. */
360	status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);	360	status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
361	if (status < 0) {	361	if (status < 0) {
362	if (status != -ERESTARTSYS)	362	if (status != -ERESTARTSYS)
363	mlog(ML_ERROR, "Could not get lock on journal!\n");	363	mlog(ML_ERROR, "Could not get lock on journal!\n");
364	goto done;	364	goto done;
365	}	365	}
366		366
367	meta_lock = 1;	367	meta_lock = 1;
368	di = (struct ocfs2_dinode *)bh->b_data;	368	di = (struct ocfs2_dinode *)bh->b_data;
369		369
370	if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {	370	if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
371	mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",	371	mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
372	inode->i_size);	372	inode->i_size);
373	status = -EINVAL;	373	status = -EINVAL;
374	goto done;	374	goto done;
375	}	375	}
376		376
377	mlog(0, "inode->i_size = %lld\n", inode->i_size);	377	mlog(0, "inode->i_size = %lld\n", inode->i_size);
378	mlog(0, "inode->i_blocks = %llu\n",	378	mlog(0, "inode->i_blocks = %llu\n",
379	(unsigned long long)inode->i_blocks);	379	(unsigned long long)inode->i_blocks);
380	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);	380	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
381		381
382	/* call the kernels journal init function now */	382	/* call the kernels journal init function now */
383	j_journal = journal_init_inode(inode);	383	j_journal = journal_init_inode(inode);
384	if (j_journal == NULL) {	384	if (j_journal == NULL) {
385	mlog(ML_ERROR, "Linux journal layer error\n");	385	mlog(ML_ERROR, "Linux journal layer error\n");
386	status = -EINVAL;	386	status = -EINVAL;
387	goto done;	387	goto done;
388	}	388	}
389		389
390	mlog(0, "Returned from journal_init_inode\n");	390	mlog(0, "Returned from journal_init_inode\n");
391	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);	391	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
392		392
393	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &	393	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
394	OCFS2_JOURNAL_DIRTY_FL);	394	OCFS2_JOURNAL_DIRTY_FL);
395		395
396	journal->j_journal = j_journal;	396	journal->j_journal = j_journal;
397	journal->j_inode = inode;	397	journal->j_inode = inode;
398	journal->j_bh = bh;	398	journal->j_bh = bh;
399		399
400	ocfs2_set_journal_params(osb);	400	ocfs2_set_journal_params(osb);
401		401
402	journal->j_state = OCFS2_JOURNAL_LOADED;	402	journal->j_state = OCFS2_JOURNAL_LOADED;
403		403
404	status = 0;	404	status = 0;
405	done:	405	done:
406	if (status < 0) {	406	if (status < 0) {
407	if (meta_lock)	407	if (meta_lock)
408	ocfs2_meta_unlock(inode, 1);	408	ocfs2_meta_unlock(inode, 1);
409	if (bh != NULL)	409	if (bh != NULL)
410	brelse(bh);	410	brelse(bh);
411	if (inode) {	411	if (inode) {
412	OCFS2_I(inode)->ip_open_count--;	412	OCFS2_I(inode)->ip_open_count--;
413	iput(inode);	413	iput(inode);
414	}	414	}
415	}	415	}
416		416
417	mlog_exit(status);	417	mlog_exit(status);
418	return status;	418	return status;
419	}	419	}
420		420
421	static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,	421	static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
422	int dirty)	422	int dirty)
423	{	423	{
424	int status;	424	int status;
425	unsigned int flags;	425	unsigned int flags;
426	struct ocfs2_journal *journal = osb->journal;	426	struct ocfs2_journal *journal = osb->journal;
427	struct buffer_head *bh = journal->j_bh;	427	struct buffer_head *bh = journal->j_bh;
428	struct ocfs2_dinode *fe;	428	struct ocfs2_dinode *fe;
429		429
430	mlog_entry_void();	430	mlog_entry_void();
431		431
432	fe = (struct ocfs2_dinode *)bh->b_data;	432	fe = (struct ocfs2_dinode *)bh->b_data;
433	if (!OCFS2_IS_VALID_DINODE(fe)) {	433	if (!OCFS2_IS_VALID_DINODE(fe)) {
434	/* This is called from startup/shutdown which will	434	/* This is called from startup/shutdown which will
435	* handle the errors in a specific manner, so no need	435	* handle the errors in a specific manner, so no need
436	* to call ocfs2_error() here. */	436	* to call ocfs2_error() here. */
437	mlog(ML_ERROR, "Journal dinode %llu has invalid "	437	mlog(ML_ERROR, "Journal dinode %llu has invalid "
438	"signature: %.*s", (unsigned long long)fe->i_blkno, 7,	438	"signature: %.*s", (unsigned long long)fe->i_blkno, 7,
439	fe->i_signature);	439	fe->i_signature);
440	status = -EIO;	440	status = -EIO;
441	goto out;	441	goto out;
442	}	442	}
443		443
444	flags = le32_to_cpu(fe->id1.journal1.ij_flags);	444	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
445	if (dirty)	445	if (dirty)
446	flags \|= OCFS2_JOURNAL_DIRTY_FL;	446	flags \|= OCFS2_JOURNAL_DIRTY_FL;
447	else	447	else
448	flags &= ~OCFS2_JOURNAL_DIRTY_FL;	448	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
449	fe->id1.journal1.ij_flags = cpu_to_le32(flags);	449	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
450		450
451	status = ocfs2_write_block(osb, bh, journal->j_inode);	451	status = ocfs2_write_block(osb, bh, journal->j_inode);
452	if (status < 0)	452	if (status < 0)
453	mlog_errno(status);	453	mlog_errno(status);
454		454
455	out:	455	out:
456	mlog_exit(status);	456	mlog_exit(status);
457	return status;	457	return status;
458	}	458	}
459		459
460	/*	460	/*
461	* If the journal has been kmalloc'd it needs to be freed after this	461	* If the journal has been kmalloc'd it needs to be freed after this
462	* call.	462	* call.
463	*/	463	*/
464	void ocfs2_journal_shutdown(struct ocfs2_super *osb)	464	void ocfs2_journal_shutdown(struct ocfs2_super *osb)
465	{	465	{
466	struct ocfs2_journal *journal = NULL;	466	struct ocfs2_journal *journal = NULL;
467	int status = 0;	467	int status = 0;
468	struct inode *inode = NULL;	468	struct inode *inode = NULL;
469	int num_running_trans = 0;	469	int num_running_trans = 0;
470		470
471	mlog_entry_void();	471	mlog_entry_void();
472		472
473	BUG_ON(!osb);	473	BUG_ON(!osb);
474		474
475	journal = osb->journal;	475	journal = osb->journal;
476	if (!journal)	476	if (!journal)
477	goto done;	477	goto done;
478		478
479	inode = journal->j_inode;	479	inode = journal->j_inode;
480		480
481	if (journal->j_state != OCFS2_JOURNAL_LOADED)	481	if (journal->j_state != OCFS2_JOURNAL_LOADED)
482	goto done;	482	goto done;
483		483
484	/* need to inc inode use count as journal_destroy will iput. */	484	/* need to inc inode use count as journal_destroy will iput. */
485	if (!igrab(inode))	485	if (!igrab(inode))
486	BUG();	486	BUG();
487		487
488	num_running_trans = atomic_read(&(osb->journal->j_num_trans));	488	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
489	if (num_running_trans > 0)	489	if (num_running_trans > 0)
490	mlog(0, "Shutting down journal: must wait on %d "	490	mlog(0, "Shutting down journal: must wait on %d "
491	"running transactions!\n",	491	"running transactions!\n",
492	num_running_trans);	492	num_running_trans);
493		493
494	/* Do a commit_cache here. It will flush our journal, and	494	/* Do a commit_cache here. It will flush our journal, and
495	* release any locks that are still held.	495	* release any locks that are still held.
496	* set the SHUTDOWN flag and release the trans lock.	496	* set the SHUTDOWN flag and release the trans lock.
497	* the commit thread will take the trans lock for us below. */	497	* the commit thread will take the trans lock for us below. */
498	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;	498	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
499		499
500	/* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not	500	/* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
501	* drop the trans_lock (which we want to hold until we	501	* drop the trans_lock (which we want to hold until we
502	* completely destroy the journal. */	502	* completely destroy the journal. */
503	if (osb->commit_task) {	503	if (osb->commit_task) {
504	/* Wait for the commit thread */	504	/* Wait for the commit thread */
505	mlog(0, "Waiting for ocfs2commit to exit....\n");	505	mlog(0, "Waiting for ocfs2commit to exit....\n");
506	kthread_stop(osb->commit_task);	506	kthread_stop(osb->commit_task);
507	osb->commit_task = NULL;	507	osb->commit_task = NULL;
508	}	508	}
509		509
510	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);	510	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
511		511
512	if (ocfs2_mount_local(osb)) {	512	if (ocfs2_mount_local(osb)) {
513	journal_lock_updates(journal->j_journal);	513	journal_lock_updates(journal->j_journal);
514	status = journal_flush(journal->j_journal);	514	status = journal_flush(journal->j_journal);
515	journal_unlock_updates(journal->j_journal);	515	journal_unlock_updates(journal->j_journal);
516	if (status < 0)	516	if (status < 0)
517	mlog_errno(status);	517	mlog_errno(status);
518	}	518	}
519		519
520	if (status == 0) {	520	if (status == 0) {
521	/*	521	/*
522	* Do not toggle if flush was unsuccessful otherwise	522	* Do not toggle if flush was unsuccessful otherwise
523	* will leave dirty metadata in a "clean" journal	523	* will leave dirty metadata in a "clean" journal
524	*/	524	*/
525	status = ocfs2_journal_toggle_dirty(osb, 0);	525	status = ocfs2_journal_toggle_dirty(osb, 0);
526	if (status < 0)	526	if (status < 0)
527	mlog_errno(status);	527	mlog_errno(status);
528	}	528	}
529		529
530	/* Shutdown the kernel journal system */	530	/* Shutdown the kernel journal system */
531	journal_destroy(journal->j_journal);	531	journal_destroy(journal->j_journal);
532		532
533	OCFS2_I(inode)->ip_open_count--;	533	OCFS2_I(inode)->ip_open_count--;
534		534
535	/* unlock our journal */	535	/* unlock our journal */
536	ocfs2_meta_unlock(inode, 1);	536	ocfs2_meta_unlock(inode, 1);
537		537
538	brelse(journal->j_bh);	538	brelse(journal->j_bh);
539	journal->j_bh = NULL;	539	journal->j_bh = NULL;
540		540
541	journal->j_state = OCFS2_JOURNAL_FREE;	541	journal->j_state = OCFS2_JOURNAL_FREE;
542		542
543	// up_write(&journal->j_trans_barrier);	543	// up_write(&journal->j_trans_barrier);
544	done:	544	done:
545	if (inode)	545	if (inode)
546	iput(inode);	546	iput(inode);
547	mlog_exit_void();	547	mlog_exit_void();
548	}	548	}
549		549
550	static void ocfs2_clear_journal_error(struct super_block *sb,	550	static void ocfs2_clear_journal_error(struct super_block *sb,
551	journal_t *journal,	551	journal_t *journal,
552	int slot)	552	int slot)
553	{	553	{
554	int olderr;	554	int olderr;
555		555
556	olderr = journal_errno(journal);	556	olderr = journal_errno(journal);
557	if (olderr) {	557	if (olderr) {
558	mlog(ML_ERROR, "File system error %d recorded in "	558	mlog(ML_ERROR, "File system error %d recorded in "
559	"journal %u.\n", olderr, slot);	559	"journal %u.\n", olderr, slot);
560	mlog(ML_ERROR, "File system on device %s needs checking.\n",	560	mlog(ML_ERROR, "File system on device %s needs checking.\n",
561	sb->s_id);	561	sb->s_id);
562		562
563	journal_ack_err(journal);	563	journal_ack_err(journal);
564	journal_clear_err(journal);	564	journal_clear_err(journal);
565	}	565	}
566	}	566	}
567		567
568	int ocfs2_journal_load(struct ocfs2_journal *journal, int local)	568	int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
569	{	569	{
570	int status = 0;	570	int status = 0;
571	struct ocfs2_super *osb;	571	struct ocfs2_super *osb;
572		572
573	mlog_entry_void();	573	mlog_entry_void();
574		574
575	if (!journal)	575	if (!journal)
576	BUG();	576	BUG();
577		577
578	osb = journal->j_osb;	578	osb = journal->j_osb;
579		579
580	status = journal_load(journal->j_journal);	580	status = journal_load(journal->j_journal);
581	if (status < 0) {	581	if (status < 0) {
582	mlog(ML_ERROR, "Failed to load journal!\n");	582	mlog(ML_ERROR, "Failed to load journal!\n");
583	goto done;	583	goto done;
584	}	584	}
585		585
586	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);	586	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
587		587
588	status = ocfs2_journal_toggle_dirty(osb, 1);	588	status = ocfs2_journal_toggle_dirty(osb, 1);
589	if (status < 0) {	589	if (status < 0) {
590	mlog_errno(status);	590	mlog_errno(status);
591	goto done;	591	goto done;
592	}	592	}
593		593
594	/* Launch the commit thread */	594	/* Launch the commit thread */
595	if (!local) {	595	if (!local) {
596	osb->commit_task = kthread_run(ocfs2_commit_thread, osb,	596	osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
597	"ocfs2cmt");	597	"ocfs2cmt");
598	if (IS_ERR(osb->commit_task)) {	598	if (IS_ERR(osb->commit_task)) {
599	status = PTR_ERR(osb->commit_task);	599	status = PTR_ERR(osb->commit_task);
600	osb->commit_task = NULL;	600	osb->commit_task = NULL;
601	mlog(ML_ERROR, "unable to launch ocfs2commit thread, "	601	mlog(ML_ERROR, "unable to launch ocfs2commit thread, "
602	"error=%d", status);	602	"error=%d", status);
603	goto done;	603	goto done;
604	}	604	}
605	} else	605	} else
606	osb->commit_task = NULL;	606	osb->commit_task = NULL;
607		607
608	done:	608	done:
609	mlog_exit(status);	609	mlog_exit(status);
610	return status;	610	return status;
611	}	611	}
612		612
613		613
614	/* 'full' flag tells us whether we clear out all blocks or if we just	614	/* 'full' flag tells us whether we clear out all blocks or if we just
615	* mark the journal clean */	615	* mark the journal clean */
616	int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)	616	int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
617	{	617	{
618	int status;	618	int status;
619		619
620	mlog_entry_void();	620	mlog_entry_void();
621		621
622	BUG_ON(!journal);	622	BUG_ON(!journal);
623		623
624	status = journal_wipe(journal->j_journal, full);	624	status = journal_wipe(journal->j_journal, full);
625	if (status < 0) {	625	if (status < 0) {
626	mlog_errno(status);	626	mlog_errno(status);
627	goto bail;	627	goto bail;
628	}	628	}
629		629
630	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);	630	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
631	if (status < 0)	631	if (status < 0)
632	mlog_errno(status);	632	mlog_errno(status);
633		633
634	bail:	634	bail:
635	mlog_exit(status);	635	mlog_exit(status);
636	return status;	636	return status;
637	}	637	}
638		638
639	/*	639	/*
640	* JBD Might read a cached version of another nodes journal file. We	640	* JBD Might read a cached version of another nodes journal file. We
641	* don't want this as this file changes often and we get no	641	* don't want this as this file changes often and we get no
642	* notification on those changes. The only way to be sure that we've	642	* notification on those changes. The only way to be sure that we've
643	* got the most up to date version of those blocks then is to force	643	* got the most up to date version of those blocks then is to force
644	* read them off disk. Just searching through the buffer cache won't	644	* read them off disk. Just searching through the buffer cache won't
645	* work as there may be pages backing this file which are still marked	645	* work as there may be pages backing this file which are still marked
646	* up to date. We know things can't change on this file underneath us	646	* up to date. We know things can't change on this file underneath us
647	* as we have the lock by now :)	647	* as we have the lock by now :)
648	*/	648	*/
649	static int ocfs2_force_read_journal(struct inode *inode)	649	static int ocfs2_force_read_journal(struct inode *inode)
650	{	650	{
651	int status = 0;	651	int status = 0;
652	int i, p_blocks;	652	int i;
653	u64 v_blkno, p_blkno;	653	u64 v_blkno, p_blkno, p_blocks;
654	#define CONCURRENT_JOURNAL_FILL 32	654	#define CONCURRENT_JOURNAL_FILL 32ULL
655	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];	655	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
656		656
657	mlog_entry_void();	657	mlog_entry_void();
658		658
659	BUG_ON(inode->i_blocks !=	659	BUG_ON(inode->i_blocks !=
660	ocfs2_align_bytes_to_sectors(i_size_read(inode)));	660	ocfs2_align_bytes_to_sectors(i_size_read(inode)));
661		661
662	memset(bhs, 0, sizeof(struct buffer_head ) CONCURRENT_JOURNAL_FILL);	662	memset(bhs, 0, sizeof(struct buffer_head ) CONCURRENT_JOURNAL_FILL);
663		663
664	mlog(0, "Force reading %llu blocks\n",	664	mlog(0, "Force reading %llu blocks\n",
665	(unsigned long long)(inode->i_blocks >>	665	(unsigned long long)(inode->i_blocks >>
666	(inode->i_sb->s_blocksize_bits - 9)));	666	(inode->i_sb->s_blocksize_bits - 9)));
667		667
668	v_blkno = 0;	668	v_blkno = 0;
669	while (v_blkno <	669	while (v_blkno <
670	(inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {	670	(inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
671		671
672	status = ocfs2_extent_map_get_blocks(inode, v_blkno,	672	status = ocfs2_extent_map_get_blocks(inode, v_blkno,
673	&p_blkno, &p_blocks, NULL);	673	&p_blkno, &p_blocks, NULL);
674	if (status < 0) {	674	if (status < 0) {
675	mlog_errno(status);	675	mlog_errno(status);
676	goto bail;	676	goto bail;
677	}	677	}
678		678
679	if (p_blocks > CONCURRENT_JOURNAL_FILL)	679	if (p_blocks > CONCURRENT_JOURNAL_FILL)
680	p_blocks = CONCURRENT_JOURNAL_FILL;	680	p_blocks = CONCURRENT_JOURNAL_FILL;
681		681
682	/* We are reading journal data which should not	682	/* We are reading journal data which should not
683	* be put in the uptodate cache */	683	* be put in the uptodate cache */
684	status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),	684	status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
685	p_blkno, p_blocks, bhs, 0,	685	p_blkno, p_blocks, bhs, 0,
686	NULL);	686	NULL);
687	if (status < 0) {	687	if (status < 0) {
688	mlog_errno(status);	688	mlog_errno(status);
689	goto bail;	689	goto bail;
690	}	690	}
691		691
692	for(i = 0; i < p_blocks; i++) {	692	for(i = 0; i < p_blocks; i++) {
693	brelse(bhs[i]);	693	brelse(bhs[i]);
694	bhs[i] = NULL;	694	bhs[i] = NULL;
695	}	695	}
696		696
697	v_blkno += p_blocks;	697	v_blkno += p_blocks;
698	}	698	}
699		699
700	bail:	700	bail:
701	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)	701	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
702	if (bhs[i])	702	if (bhs[i])
703	brelse(bhs[i]);	703	brelse(bhs[i]);
704	mlog_exit(status);	704	mlog_exit(status);
705	return status;	705	return status;
706	}	706	}
707		707
708	struct ocfs2_la_recovery_item {	708	struct ocfs2_la_recovery_item {
709	struct list_head lri_list;	709	struct list_head lri_list;
710	int lri_slot;	710	int lri_slot;
711	struct ocfs2_dinode *lri_la_dinode;	711	struct ocfs2_dinode *lri_la_dinode;
712	struct ocfs2_dinode *lri_tl_dinode;	712	struct ocfs2_dinode *lri_tl_dinode;
713	};	713	};
714		714
715	/* Does the second half of the recovery process. By this point, the	715	/* Does the second half of the recovery process. By this point, the
716	* node is marked clean and can actually be considered recovered,	716	* node is marked clean and can actually be considered recovered,
717	* hence it's no longer in the recovery map, but there's still some	717	* hence it's no longer in the recovery map, but there's still some
718	* cleanup we can do which shouldn't happen within the recovery thread	718	* cleanup we can do which shouldn't happen within the recovery thread
719	* as locking in that context becomes very difficult if we are to take	719	* as locking in that context becomes very difficult if we are to take
720	* recovering nodes into account.	720	* recovering nodes into account.
721	*	721	*
722	* NOTE: This function can and will sleep on recovery of other nodes	722	* NOTE: This function can and will sleep on recovery of other nodes
723	* during cluster locking, just like any other ocfs2 process.	723	* during cluster locking, just like any other ocfs2 process.
724	*/	724	*/
725	void ocfs2_complete_recovery(struct work_struct *work)	725	void ocfs2_complete_recovery(struct work_struct *work)
726	{	726	{
727	int ret;	727	int ret;
728	struct ocfs2_journal *journal =	728	struct ocfs2_journal *journal =
729	container_of(work, struct ocfs2_journal, j_recovery_work);	729	container_of(work, struct ocfs2_journal, j_recovery_work);
730	struct ocfs2_super *osb = journal->j_osb;	730	struct ocfs2_super *osb = journal->j_osb;
731	struct ocfs2_dinode la_dinode, tl_dinode;	731	struct ocfs2_dinode la_dinode, tl_dinode;
732	struct ocfs2_la_recovery_item *item;	732	struct ocfs2_la_recovery_item *item;
733	struct list_head p, n;	733	struct list_head p, n;
734	LIST_HEAD(tmp_la_list);	734	LIST_HEAD(tmp_la_list);
735		735
736	mlog_entry_void();	736	mlog_entry_void();
737		737
738	mlog(0, "completing recovery from keventd\n");	738	mlog(0, "completing recovery from keventd\n");
739		739
740	spin_lock(&journal->j_lock);	740	spin_lock(&journal->j_lock);
741	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);	741	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
742	spin_unlock(&journal->j_lock);	742	spin_unlock(&journal->j_lock);
743		743
744	list_for_each_safe(p, n, &tmp_la_list) {	744	list_for_each_safe(p, n, &tmp_la_list) {
745	item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);	745	item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
746	list_del_init(&item->lri_list);	746	list_del_init(&item->lri_list);
747		747
748	mlog(0, "Complete recovery for slot %d\n", item->lri_slot);	748	mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
749		749
750	la_dinode = item->lri_la_dinode;	750	la_dinode = item->lri_la_dinode;
751	if (la_dinode) {	751	if (la_dinode) {
752	mlog(0, "Clean up local alloc %llu\n",	752	mlog(0, "Clean up local alloc %llu\n",
753	(unsigned long long)la_dinode->i_blkno);	753	(unsigned long long)la_dinode->i_blkno);
754		754
755	ret = ocfs2_complete_local_alloc_recovery(osb,	755	ret = ocfs2_complete_local_alloc_recovery(osb,
756	la_dinode);	756	la_dinode);
757	if (ret < 0)	757	if (ret < 0)
758	mlog_errno(ret);	758	mlog_errno(ret);
759		759
760	kfree(la_dinode);	760	kfree(la_dinode);
761	}	761	}
762		762
763	tl_dinode = item->lri_tl_dinode;	763	tl_dinode = item->lri_tl_dinode;
764	if (tl_dinode) {	764	if (tl_dinode) {
765	mlog(0, "Clean up truncate log %llu\n",	765	mlog(0, "Clean up truncate log %llu\n",
766	(unsigned long long)tl_dinode->i_blkno);	766	(unsigned long long)tl_dinode->i_blkno);
767		767
768	ret = ocfs2_complete_truncate_log_recovery(osb,	768	ret = ocfs2_complete_truncate_log_recovery(osb,
769	tl_dinode);	769	tl_dinode);
770	if (ret < 0)	770	if (ret < 0)
771	mlog_errno(ret);	771	mlog_errno(ret);
772		772
773	kfree(tl_dinode);	773	kfree(tl_dinode);
774	}	774	}
775		775
776	ret = ocfs2_recover_orphans(osb, item->lri_slot);	776	ret = ocfs2_recover_orphans(osb, item->lri_slot);
777	if (ret < 0)	777	if (ret < 0)
778	mlog_errno(ret);	778	mlog_errno(ret);
779		779
780	kfree(item);	780	kfree(item);
781	}	781	}
782		782
783	mlog(0, "Recovery completion\n");	783	mlog(0, "Recovery completion\n");
784	mlog_exit_void();	784	mlog_exit_void();
785	}	785	}
786		786
787	/* NOTE: This function always eats your references to la_dinode and	787	/* NOTE: This function always eats your references to la_dinode and
788	* tl_dinode, either manually on error, or by passing them to	788	* tl_dinode, either manually on error, or by passing them to
789	* ocfs2_complete_recovery */	789	* ocfs2_complete_recovery */
790	static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,	790	static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
791	int slot_num,	791	int slot_num,
792	struct ocfs2_dinode *la_dinode,	792	struct ocfs2_dinode *la_dinode,
793	struct ocfs2_dinode *tl_dinode)	793	struct ocfs2_dinode *tl_dinode)
794	{	794	{
795	struct ocfs2_la_recovery_item *item;	795	struct ocfs2_la_recovery_item *item;
796		796
797	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS);	797	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS);
798	if (!item) {	798	if (!item) {
799	/* Though we wish to avoid it, we are in fact safe in	799	/* Though we wish to avoid it, we are in fact safe in
800	* skipping local alloc cleanup as fsck.ocfs2 is more	800	* skipping local alloc cleanup as fsck.ocfs2 is more
801	* than capable of reclaiming unused space. */	801	* than capable of reclaiming unused space. */
802	if (la_dinode)	802	if (la_dinode)
803	kfree(la_dinode);	803	kfree(la_dinode);
804		804
805	if (tl_dinode)	805	if (tl_dinode)
806	kfree(tl_dinode);	806	kfree(tl_dinode);
807		807
808	mlog_errno(-ENOMEM);	808	mlog_errno(-ENOMEM);
809	return;	809	return;
810	}	810	}
811		811
812	INIT_LIST_HEAD(&item->lri_list);	812	INIT_LIST_HEAD(&item->lri_list);
813	item->lri_la_dinode = la_dinode;	813	item->lri_la_dinode = la_dinode;
814	item->lri_slot = slot_num;	814	item->lri_slot = slot_num;
815	item->lri_tl_dinode = tl_dinode;	815	item->lri_tl_dinode = tl_dinode;
816		816
817	spin_lock(&journal->j_lock);	817	spin_lock(&journal->j_lock);
818	list_add_tail(&item->lri_list, &journal->j_la_cleanups);	818	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
819	queue_work(ocfs2_wq, &journal->j_recovery_work);	819	queue_work(ocfs2_wq, &journal->j_recovery_work);
820	spin_unlock(&journal->j_lock);	820	spin_unlock(&journal->j_lock);
821	}	821	}
822		822
823	/* Called by the mount code to queue recovery the last part of	823	/* Called by the mount code to queue recovery the last part of
824	* recovery for it's own slot. */	824	* recovery for it's own slot. */
825	void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)	825	void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
826	{	826	{
827	struct ocfs2_journal *journal = osb->journal;	827	struct ocfs2_journal *journal = osb->journal;
828		828
829	if (osb->dirty) {	829	if (osb->dirty) {
830	/* No need to queue up our truncate_log as regular	830	/* No need to queue up our truncate_log as regular
831	* cleanup will catch that. */	831	* cleanup will catch that. */
832	ocfs2_queue_recovery_completion(journal,	832	ocfs2_queue_recovery_completion(journal,
833	osb->slot_num,	833	osb->slot_num,
834	osb->local_alloc_copy,	834	osb->local_alloc_copy,
835	NULL);	835	NULL);
836	ocfs2_schedule_truncate_log_flush(osb, 0);	836	ocfs2_schedule_truncate_log_flush(osb, 0);
837		837
838	osb->local_alloc_copy = NULL;	838	osb->local_alloc_copy = NULL;
839	osb->dirty = 0;	839	osb->dirty = 0;
840	}	840	}
841	}	841	}
842		842
843	static int __ocfs2_recovery_thread(void *arg)	843	static int __ocfs2_recovery_thread(void *arg)
844	{	844	{
845	int status, node_num;	845	int status, node_num;
846	struct ocfs2_super *osb = arg;	846	struct ocfs2_super *osb = arg;
847		847
848	mlog_entry_void();	848	mlog_entry_void();
849		849
850	status = ocfs2_wait_on_mount(osb);	850	status = ocfs2_wait_on_mount(osb);
851	if (status < 0) {	851	if (status < 0) {
852	goto bail;	852	goto bail;
853	}	853	}
854		854
855	restart:	855	restart:
856	status = ocfs2_super_lock(osb, 1);	856	status = ocfs2_super_lock(osb, 1);
857	if (status < 0) {	857	if (status < 0) {
858	mlog_errno(status);	858	mlog_errno(status);
859	goto bail;	859	goto bail;
860	}	860	}
861		861
862	while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {	862	while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
863	node_num = ocfs2_node_map_first_set_bit(osb,	863	node_num = ocfs2_node_map_first_set_bit(osb,
864	&osb->recovery_map);	864	&osb->recovery_map);
865	if (node_num == O2NM_INVALID_NODE_NUM) {	865	if (node_num == O2NM_INVALID_NODE_NUM) {
866	mlog(0, "Out of nodes to recover.\n");	866	mlog(0, "Out of nodes to recover.\n");
867	break;	867	break;
868	}	868	}
869		869
870	status = ocfs2_recover_node(osb, node_num);	870	status = ocfs2_recover_node(osb, node_num);
871	if (status < 0) {	871	if (status < 0) {
872	mlog(ML_ERROR,	872	mlog(ML_ERROR,
873	"Error %d recovering node %d on device (%u,%u)!\n",	873	"Error %d recovering node %d on device (%u,%u)!\n",
874	status, node_num,	874	status, node_num,
875	MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));	875	MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
876	mlog(ML_ERROR, "Volume requires unmount.\n");	876	mlog(ML_ERROR, "Volume requires unmount.\n");
877	continue;	877	continue;
878	}	878	}
879		879
880	ocfs2_recovery_map_clear(osb, node_num);	880	ocfs2_recovery_map_clear(osb, node_num);
881	}	881	}
882	ocfs2_super_unlock(osb, 1);	882	ocfs2_super_unlock(osb, 1);
883		883
884	/* We always run recovery on our own orphan dir - the dead	884	/* We always run recovery on our own orphan dir - the dead
885	* node(s) may have voted "no" on an inode delete earlier. A	885	* node(s) may have voted "no" on an inode delete earlier. A
886	* revote is therefore required. */	886	* revote is therefore required. */
887	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,	887	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
888	NULL);	888	NULL);
889		889
890	bail:	890	bail:
891	mutex_lock(&osb->recovery_lock);	891	mutex_lock(&osb->recovery_lock);
892	if (!status &&	892	if (!status &&
893	!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {	893	!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
894	mutex_unlock(&osb->recovery_lock);	894	mutex_unlock(&osb->recovery_lock);
895	goto restart;	895	goto restart;
896	}	896	}
897		897
898	osb->recovery_thread_task = NULL;	898	osb->recovery_thread_task = NULL;
899	mb(); /* sync with ocfs2_recovery_thread_running */	899	mb(); /* sync with ocfs2_recovery_thread_running */
900	wake_up(&osb->recovery_event);	900	wake_up(&osb->recovery_event);
901		901
902	mutex_unlock(&osb->recovery_lock);	902	mutex_unlock(&osb->recovery_lock);
903		903
904	mlog_exit(status);	904	mlog_exit(status);
905	/* no one is callint kthread_stop() for us so the kthread() api	905	/* no one is callint kthread_stop() for us so the kthread() api
906	* requires that we call do_exit(). And it isn't exported, but	906	* requires that we call do_exit(). And it isn't exported, but
907	* complete_and_exit() seems to be a minimal wrapper around it. */	907	* complete_and_exit() seems to be a minimal wrapper around it. */
908	complete_and_exit(NULL, status);	908	complete_and_exit(NULL, status);
909	return status;	909	return status;
910	}	910	}
911		911
912	void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)	912	void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
913	{	913	{
914	mlog_entry("(node_num=%d, osb->node_num = %d)\n",	914	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
915	node_num, osb->node_num);	915	node_num, osb->node_num);
916		916
917	mutex_lock(&osb->recovery_lock);	917	mutex_lock(&osb->recovery_lock);
918	if (osb->disable_recovery)	918	if (osb->disable_recovery)
919	goto out;	919	goto out;
920		920
921	/* People waiting on recovery will wait on	921	/* People waiting on recovery will wait on
922	* the recovery map to empty. */	922	* the recovery map to empty. */
923	if (!ocfs2_recovery_map_set(osb, node_num))	923	if (!ocfs2_recovery_map_set(osb, node_num))
924	mlog(0, "node %d already be in recovery.\n", node_num);	924	mlog(0, "node %d already be in recovery.\n", node_num);
925		925
926	mlog(0, "starting recovery thread...\n");	926	mlog(0, "starting recovery thread...\n");
927		927
928	if (osb->recovery_thread_task)	928	if (osb->recovery_thread_task)
929	goto out;	929	goto out;
930		930
931	osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,	931	osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
932	"ocfs2rec");	932	"ocfs2rec");
933	if (IS_ERR(osb->recovery_thread_task)) {	933	if (IS_ERR(osb->recovery_thread_task)) {
934	mlog_errno((int)PTR_ERR(osb->recovery_thread_task));	934	mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
935	osb->recovery_thread_task = NULL;	935	osb->recovery_thread_task = NULL;
936	}	936	}
937		937
938	out:	938	out:
939	mutex_unlock(&osb->recovery_lock);	939	mutex_unlock(&osb->recovery_lock);
940	wake_up(&osb->recovery_event);	940	wake_up(&osb->recovery_event);
941		941
942	mlog_exit_void();	942	mlog_exit_void();
943	}	943	}
944		944
945	/* Does the actual journal replay and marks the journal inode as	945	/* Does the actual journal replay and marks the journal inode as
946	* clean. Will only replay if the journal inode is marked dirty. */	946	* clean. Will only replay if the journal inode is marked dirty. */
947	static int ocfs2_replay_journal(struct ocfs2_super *osb,	947	static int ocfs2_replay_journal(struct ocfs2_super *osb,
948	int node_num,	948	int node_num,
949	int slot_num)	949	int slot_num)
950	{	950	{
951	int status;	951	int status;
952	int got_lock = 0;	952	int got_lock = 0;
953	unsigned int flags;	953	unsigned int flags;
954	struct inode *inode = NULL;	954	struct inode *inode = NULL;
955	struct ocfs2_dinode *fe;	955	struct ocfs2_dinode *fe;
956	journal_t *journal = NULL;	956	journal_t *journal = NULL;
957	struct buffer_head *bh = NULL;	957	struct buffer_head *bh = NULL;
958		958
959	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,	959	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
960	slot_num);	960	slot_num);
961	if (inode == NULL) {	961	if (inode == NULL) {
962	status = -EACCES;	962	status = -EACCES;
963	mlog_errno(status);	963	mlog_errno(status);
964	goto done;	964	goto done;
965	}	965	}
966	if (is_bad_inode(inode)) {	966	if (is_bad_inode(inode)) {
967	status = -EACCES;	967	status = -EACCES;
968	iput(inode);	968	iput(inode);
969	inode = NULL;	969	inode = NULL;
970	mlog_errno(status);	970	mlog_errno(status);
971	goto done;	971	goto done;
972	}	972	}
973	SET_INODE_JOURNAL(inode);	973	SET_INODE_JOURNAL(inode);
974		974
975	status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);	975	status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
976	if (status < 0) {	976	if (status < 0) {
977	mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);	977	mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
978	if (status != -ERESTARTSYS)	978	if (status != -ERESTARTSYS)
979	mlog(ML_ERROR, "Could not lock journal!\n");	979	mlog(ML_ERROR, "Could not lock journal!\n");
980	goto done;	980	goto done;
981	}	981	}
982	got_lock = 1;	982	got_lock = 1;
983		983
984	fe = (struct ocfs2_dinode *) bh->b_data;	984	fe = (struct ocfs2_dinode *) bh->b_data;
985		985
986	flags = le32_to_cpu(fe->id1.journal1.ij_flags);	986	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
987		987
988	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {	988	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
989	mlog(0, "No recovery required for node %d\n", node_num);	989	mlog(0, "No recovery required for node %d\n", node_num);
990	goto done;	990	goto done;
991	}	991	}
992		992
993	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",	993	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
994	node_num, slot_num,	994	node_num, slot_num,
995	MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));	995	MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
996		996
997	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);	997	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
998		998
999	status = ocfs2_force_read_journal(inode);	999	status = ocfs2_force_read_journal(inode);
1000	if (status < 0) {	1000	if (status < 0) {
1001	mlog_errno(status);	1001	mlog_errno(status);
1002	goto done;	1002	goto done;
1003	}	1003	}
1004		1004
1005	mlog(0, "calling journal_init_inode\n");	1005	mlog(0, "calling journal_init_inode\n");
1006	journal = journal_init_inode(inode);	1006	journal = journal_init_inode(inode);
1007	if (journal == NULL) {	1007	if (journal == NULL) {
1008	mlog(ML_ERROR, "Linux journal layer error\n");	1008	mlog(ML_ERROR, "Linux journal layer error\n");
1009	status = -EIO;	1009	status = -EIO;
1010	goto done;	1010	goto done;
1011	}	1011	}
1012		1012
1013	status = journal_load(journal);	1013	status = journal_load(journal);
1014	if (status < 0) {	1014	if (status < 0) {
1015	mlog_errno(status);	1015	mlog_errno(status);
1016	if (!igrab(inode))	1016	if (!igrab(inode))
1017	BUG();	1017	BUG();
1018	journal_destroy(journal);	1018	journal_destroy(journal);
1019	goto done;	1019	goto done;
1020	}	1020	}
1021		1021
1022	ocfs2_clear_journal_error(osb->sb, journal, slot_num);	1022	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
1023		1023
1024	/* wipe the journal */	1024	/* wipe the journal */
1025	mlog(0, "flushing the journal.\n");	1025	mlog(0, "flushing the journal.\n");
1026	journal_lock_updates(journal);	1026	journal_lock_updates(journal);
1027	status = journal_flush(journal);	1027	status = journal_flush(journal);
1028	journal_unlock_updates(journal);	1028	journal_unlock_updates(journal);
1029	if (status < 0)	1029	if (status < 0)
1030	mlog_errno(status);	1030	mlog_errno(status);
1031		1031
1032	/* This will mark the node clean */	1032	/* This will mark the node clean */
1033	flags = le32_to_cpu(fe->id1.journal1.ij_flags);	1033	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1034	flags &= ~OCFS2_JOURNAL_DIRTY_FL;	1034	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
1035	fe->id1.journal1.ij_flags = cpu_to_le32(flags);	1035	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
1036		1036
1037	status = ocfs2_write_block(osb, bh, inode);	1037	status = ocfs2_write_block(osb, bh, inode);
1038	if (status < 0)	1038	if (status < 0)
1039	mlog_errno(status);	1039	mlog_errno(status);
1040		1040
1041	if (!igrab(inode))	1041	if (!igrab(inode))
1042	BUG();	1042	BUG();
1043		1043
1044	journal_destroy(journal);	1044	journal_destroy(journal);
1045		1045
1046	done:	1046	done:
1047	/* drop the lock on this nodes journal */	1047	/* drop the lock on this nodes journal */
1048	if (got_lock)	1048	if (got_lock)
1049	ocfs2_meta_unlock(inode, 1);	1049	ocfs2_meta_unlock(inode, 1);
1050		1050
1051	if (inode)	1051	if (inode)
1052	iput(inode);	1052	iput(inode);
1053		1053
1054	if (bh)	1054	if (bh)
1055	brelse(bh);	1055	brelse(bh);
1056		1056
1057	mlog_exit(status);	1057	mlog_exit(status);
1058	return status;	1058	return status;
1059	}	1059	}
1060		1060
1061	/*	1061	/*
1062	* Do the most important parts of node recovery:	1062	* Do the most important parts of node recovery:
1063	* - Replay it's journal	1063	* - Replay it's journal
1064	* - Stamp a clean local allocator file	1064	* - Stamp a clean local allocator file
1065	* - Stamp a clean truncate log	1065	* - Stamp a clean truncate log
1066	* - Mark the node clean	1066	* - Mark the node clean
1067	*	1067	*
1068	* If this function completes without error, a node in OCFS2 can be	1068	* If this function completes without error, a node in OCFS2 can be
1069	* said to have been safely recovered. As a result, failure during the	1069	* said to have been safely recovered. As a result, failure during the
1070	* second part of a nodes recovery process (local alloc recovery) is	1070	* second part of a nodes recovery process (local alloc recovery) is
1071	* far less concerning.	1071	* far less concerning.
1072	*/	1072	*/
1073	static int ocfs2_recover_node(struct ocfs2_super *osb,	1073	static int ocfs2_recover_node(struct ocfs2_super *osb,
1074	int node_num)	1074	int node_num)
1075	{	1075	{
1076	int status = 0;	1076	int status = 0;
1077	int slot_num;	1077	int slot_num;
1078	struct ocfs2_slot_info *si = osb->slot_info;	1078	struct ocfs2_slot_info *si = osb->slot_info;
1079	struct ocfs2_dinode *la_copy = NULL;	1079	struct ocfs2_dinode *la_copy = NULL;
1080	struct ocfs2_dinode *tl_copy = NULL;	1080	struct ocfs2_dinode *tl_copy = NULL;
1081		1081
1082	mlog_entry("(node_num=%d, osb->node_num = %d)\n",	1082	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1083	node_num, osb->node_num);	1083	node_num, osb->node_num);
1084		1084
1085	mlog(0, "checking node %d\n", node_num);	1085	mlog(0, "checking node %d\n", node_num);
1086		1086
1087	/* Should not ever be called to recover ourselves -- in that	1087	/* Should not ever be called to recover ourselves -- in that
1088	* case we should've called ocfs2_journal_load instead. */	1088	* case we should've called ocfs2_journal_load instead. */
1089	BUG_ON(osb->node_num == node_num);	1089	BUG_ON(osb->node_num == node_num);
1090		1090
1091	slot_num = ocfs2_node_num_to_slot(si, node_num);	1091	slot_num = ocfs2_node_num_to_slot(si, node_num);
1092	if (slot_num == OCFS2_INVALID_SLOT) {	1092	if (slot_num == OCFS2_INVALID_SLOT) {
1093	status = 0;	1093	status = 0;
1094	mlog(0, "no slot for this node, so no recovery required.\n");	1094	mlog(0, "no slot for this node, so no recovery required.\n");
1095	goto done;	1095	goto done;
1096	}	1096	}
1097		1097
1098	mlog(0, "node %d was using slot %d\n", node_num, slot_num);	1098	mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1099		1099
1100	status = ocfs2_replay_journal(osb, node_num, slot_num);	1100	status = ocfs2_replay_journal(osb, node_num, slot_num);
1101	if (status < 0) {	1101	if (status < 0) {
1102	mlog_errno(status);	1102	mlog_errno(status);
1103	goto done;	1103	goto done;
1104	}	1104	}
1105		1105
1106	/* Stamp a clean local alloc file AFTER recovering the journal... */	1106	/* Stamp a clean local alloc file AFTER recovering the journal... */
1107	status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);	1107	status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
1108	if (status < 0) {	1108	if (status < 0) {
1109	mlog_errno(status);	1109	mlog_errno(status);
1110	goto done;	1110	goto done;
1111	}	1111	}
1112		1112
1113	/* An error from begin_truncate_log_recovery is not	1113	/* An error from begin_truncate_log_recovery is not
1114	* serious enough to warrant halting the rest of	1114	* serious enough to warrant halting the rest of
1115	* recovery. */	1115	* recovery. */
1116	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);	1116	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
1117	if (status < 0)	1117	if (status < 0)
1118	mlog_errno(status);	1118	mlog_errno(status);
1119		1119
1120	/* Likewise, this would be a strange but ultimately not so	1120	/* Likewise, this would be a strange but ultimately not so
1121	* harmful place to get an error... */	1121	* harmful place to get an error... */
1122	ocfs2_clear_slot(si, slot_num);	1122	ocfs2_clear_slot(si, slot_num);
1123	status = ocfs2_update_disk_slots(osb, si);	1123	status = ocfs2_update_disk_slots(osb, si);
1124	if (status < 0)	1124	if (status < 0)
1125	mlog_errno(status);	1125	mlog_errno(status);
1126		1126
1127	/* This will kfree the memory pointed to by la_copy and tl_copy */	1127	/* This will kfree the memory pointed to by la_copy and tl_copy */
1128	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,	1128	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1129	tl_copy);	1129	tl_copy);
1130		1130
1131	status = 0;	1131	status = 0;
1132	done:	1132	done:
1133		1133
1134	mlog_exit(status);	1134	mlog_exit(status);
1135	return status;	1135	return status;
1136	}	1136	}
1137		1137
1138	/* Test node liveness by trylocking his journal. If we get the lock,	1138	/* Test node liveness by trylocking his journal. If we get the lock,
1139	* we drop it here. Return 0 if we got the lock, -EAGAIN if node is	1139	* we drop it here. Return 0 if we got the lock, -EAGAIN if node is
1140	* still alive (we couldn't get the lock) and < 0 on error. */	1140	* still alive (we couldn't get the lock) and < 0 on error. */
1141	static int ocfs2_trylock_journal(struct ocfs2_super *osb,	1141	static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1142	int slot_num)	1142	int slot_num)
1143	{	1143	{
1144	int status, flags;	1144	int status, flags;
1145	struct inode *inode = NULL;	1145	struct inode *inode = NULL;
1146		1146
1147	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,	1147	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1148	slot_num);	1148	slot_num);
1149	if (inode == NULL) {	1149	if (inode == NULL) {
1150	mlog(ML_ERROR, "access error\n");	1150	mlog(ML_ERROR, "access error\n");
1151	status = -EACCES;	1151	status = -EACCES;
1152	goto bail;	1152	goto bail;
1153	}	1153	}
1154	if (is_bad_inode(inode)) {	1154	if (is_bad_inode(inode)) {
1155	mlog(ML_ERROR, "access error (bad inode)\n");	1155	mlog(ML_ERROR, "access error (bad inode)\n");
1156	iput(inode);	1156	iput(inode);
1157	inode = NULL;	1157	inode = NULL;
1158	status = -EACCES;	1158	status = -EACCES;
1159	goto bail;	1159	goto bail;
1160	}	1160	}
1161	SET_INODE_JOURNAL(inode);	1161	SET_INODE_JOURNAL(inode);
1162		1162
1163	flags = OCFS2_META_LOCK_RECOVERY \| OCFS2_META_LOCK_NOQUEUE;	1163	flags = OCFS2_META_LOCK_RECOVERY \| OCFS2_META_LOCK_NOQUEUE;
1164	status = ocfs2_meta_lock_full(inode, NULL, 1, flags);	1164	status = ocfs2_meta_lock_full(inode, NULL, 1, flags);
1165	if (status < 0) {	1165	if (status < 0) {
1166	if (status != -EAGAIN)	1166	if (status != -EAGAIN)
1167	mlog_errno(status);	1167	mlog_errno(status);
1168	goto bail;	1168	goto bail;
1169	}	1169	}
1170		1170
1171	ocfs2_meta_unlock(inode, 1);	1171	ocfs2_meta_unlock(inode, 1);
1172	bail:	1172	bail:
1173	if (inode)	1173	if (inode)
1174	iput(inode);	1174	iput(inode);
1175		1175
1176	return status;	1176	return status;
1177	}	1177	}
1178		1178
1179	/* Call this underneath ocfs2_super_lock. It also assumes that the	1179	/* Call this underneath ocfs2_super_lock. It also assumes that the
1180	* slot info struct has been updated from disk. */	1180	* slot info struct has been updated from disk. */
1181	int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)	1181	int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1182	{	1182	{
1183	int status, i, node_num;	1183	int status, i, node_num;
1184	struct ocfs2_slot_info *si = osb->slot_info;	1184	struct ocfs2_slot_info *si = osb->slot_info;
1185		1185
1186	/* This is called with the super block cluster lock, so we	1186	/* This is called with the super block cluster lock, so we
1187	* know that the slot map can't change underneath us. */	1187	* know that the slot map can't change underneath us. */
1188		1188
1189	spin_lock(&si->si_lock);	1189	spin_lock(&si->si_lock);
1190	for(i = 0; i < si->si_num_slots; i++) {	1190	for(i = 0; i < si->si_num_slots; i++) {
1191	if (i == osb->slot_num)	1191	if (i == osb->slot_num)
1192	continue;	1192	continue;
1193	if (ocfs2_is_empty_slot(si, i))	1193	if (ocfs2_is_empty_slot(si, i))
1194	continue;	1194	continue;
1195		1195
1196	node_num = si->si_global_node_nums[i];	1196	node_num = si->si_global_node_nums[i];
1197	if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))	1197	if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1198	continue;	1198	continue;
1199	spin_unlock(&si->si_lock);	1199	spin_unlock(&si->si_lock);
1200		1200
1201	/* Ok, we have a slot occupied by another node which	1201	/* Ok, we have a slot occupied by another node which
1202	* is not in the recovery map. We trylock his journal	1202	* is not in the recovery map. We trylock his journal
1203	* file here to test if he's alive. */	1203	* file here to test if he's alive. */
1204	status = ocfs2_trylock_journal(osb, i);	1204	status = ocfs2_trylock_journal(osb, i);
1205	if (!status) {	1205	if (!status) {
1206	/* Since we're called from mount, we know that	1206	/* Since we're called from mount, we know that
1207	* the recovery thread can't race us on	1207	* the recovery thread can't race us on
1208	* setting / checking the recovery bits. */	1208	* setting / checking the recovery bits. */
1209	ocfs2_recovery_thread(osb, node_num);	1209	ocfs2_recovery_thread(osb, node_num);
1210	} else if ((status < 0) && (status != -EAGAIN)) {	1210	} else if ((status < 0) && (status != -EAGAIN)) {
1211	mlog_errno(status);	1211	mlog_errno(status);
1212	goto bail;	1212	goto bail;
1213	}	1213	}
1214		1214
1215	spin_lock(&si->si_lock);	1215	spin_lock(&si->si_lock);
1216	}	1216	}
1217	spin_unlock(&si->si_lock);	1217	spin_unlock(&si->si_lock);
1218		1218
1219	status = 0;	1219	status = 0;
1220	bail:	1220	bail:
1221	mlog_exit(status);	1221	mlog_exit(status);
1222	return status;	1222	return status;
1223	}	1223	}
1224		1224
1225	static int ocfs2_queue_orphans(struct ocfs2_super *osb,	1225	static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1226	int slot,	1226	int slot,
1227	struct inode **head)	1227	struct inode **head)
1228	{	1228	{
1229	int status;	1229	int status;
1230	struct inode *orphan_dir_inode = NULL;	1230	struct inode *orphan_dir_inode = NULL;
1231	struct inode *iter;	1231	struct inode *iter;
1232	unsigned long offset, blk, local;	1232	unsigned long offset, blk, local;
1233	struct buffer_head *bh = NULL;	1233	struct buffer_head *bh = NULL;
1234	struct ocfs2_dir_entry *de;	1234	struct ocfs2_dir_entry *de;
1235	struct super_block *sb = osb->sb;	1235	struct super_block *sb = osb->sb;
1236		1236
1237	orphan_dir_inode = ocfs2_get_system_file_inode(osb,	1237	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1238	ORPHAN_DIR_SYSTEM_INODE,	1238	ORPHAN_DIR_SYSTEM_INODE,
1239	slot);	1239	slot);
1240	if (!orphan_dir_inode) {	1240	if (!orphan_dir_inode) {
1241	status = -ENOENT;	1241	status = -ENOENT;
1242	mlog_errno(status);	1242	mlog_errno(status);
1243	return status;	1243	return status;
1244	}	1244	}
1245		1245
1246	mutex_lock(&orphan_dir_inode->i_mutex);	1246	mutex_lock(&orphan_dir_inode->i_mutex);
1247	status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0);	1247	status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0);
1248	if (status < 0) {	1248	if (status < 0) {
1249	mlog_errno(status);	1249	mlog_errno(status);
1250	goto out;	1250	goto out;
1251	}	1251	}
1252		1252
1253	offset = 0;	1253	offset = 0;
1254	iter = NULL;	1254	iter = NULL;
1255	while(offset < i_size_read(orphan_dir_inode)) {	1255	while(offset < i_size_read(orphan_dir_inode)) {
1256	blk = offset >> sb->s_blocksize_bits;	1256	blk = offset >> sb->s_blocksize_bits;
1257		1257
1258	bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);	1258	bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
1259	if (!bh)	1259	if (!bh)
1260	status = -EINVAL;	1260	status = -EINVAL;
1261	if (status < 0) {	1261	if (status < 0) {
1262	if (bh)	1262	if (bh)
1263	brelse(bh);	1263	brelse(bh);
1264	mlog_errno(status);	1264	mlog_errno(status);
1265	goto out_unlock;	1265	goto out_unlock;
1266	}	1266	}
1267		1267
1268	local = 0;	1268	local = 0;
1269	while(offset < i_size_read(orphan_dir_inode)	1269	while(offset < i_size_read(orphan_dir_inode)
1270	&& local < sb->s_blocksize) {	1270	&& local < sb->s_blocksize) {
1271	de = (struct ocfs2_dir_entry *) (bh->b_data + local);	1271	de = (struct ocfs2_dir_entry *) (bh->b_data + local);
1272		1272
1273	if (!ocfs2_check_dir_entry(orphan_dir_inode,	1273	if (!ocfs2_check_dir_entry(orphan_dir_inode,
1274	de, bh, local)) {	1274	de, bh, local)) {
1275	status = -EINVAL;	1275	status = -EINVAL;
1276	mlog_errno(status);	1276	mlog_errno(status);
1277	brelse(bh);	1277	brelse(bh);
1278	goto out_unlock;	1278	goto out_unlock;
1279	}	1279	}
1280		1280
1281	local += le16_to_cpu(de->rec_len);	1281	local += le16_to_cpu(de->rec_len);
1282	offset += le16_to_cpu(de->rec_len);	1282	offset += le16_to_cpu(de->rec_len);
1283		1283
1284	/* I guess we silently fail on no inode? */	1284	/* I guess we silently fail on no inode? */
1285	if (!le64_to_cpu(de->inode))	1285	if (!le64_to_cpu(de->inode))
1286	continue;	1286	continue;
1287	if (de->file_type > OCFS2_FT_MAX) {	1287	if (de->file_type > OCFS2_FT_MAX) {
1288	mlog(ML_ERROR,	1288	mlog(ML_ERROR,
1289	"block %llu contains invalid de: "	1289	"block %llu contains invalid de: "
1290	"inode = %llu, rec_len = %u, "	1290	"inode = %llu, rec_len = %u, "
1291	"name_len = %u, file_type = %u, "	1291	"name_len = %u, file_type = %u, "
1292	"name='%.*s'\n",	1292	"name='%.*s'\n",
1293	(unsigned long long)bh->b_blocknr,	1293	(unsigned long long)bh->b_blocknr,
1294	(unsigned long long)le64_to_cpu(de->inode),	1294	(unsigned long long)le64_to_cpu(de->inode),
1295	le16_to_cpu(de->rec_len),	1295	le16_to_cpu(de->rec_len),
1296	de->name_len,	1296	de->name_len,
1297	de->file_type,	1297	de->file_type,
1298	de->name_len,	1298	de->name_len,
1299	de->name);	1299	de->name);
1300	continue;	1300	continue;
1301	}	1301	}
1302	if (de->name_len == 1 && !strncmp(".", de->name, 1))	1302	if (de->name_len == 1 && !strncmp(".", de->name, 1))
1303	continue;	1303	continue;
1304	if (de->name_len == 2 && !strncmp("..", de->name, 2))	1304	if (de->name_len == 2 && !strncmp("..", de->name, 2))
1305	continue;	1305	continue;
1306		1306
1307	iter = ocfs2_iget(osb, le64_to_cpu(de->inode),	1307	iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1308	OCFS2_FI_FLAG_ORPHAN_RECOVERY);	1308	OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1309	if (IS_ERR(iter))	1309	if (IS_ERR(iter))
1310	continue;	1310	continue;
1311		1311
1312	mlog(0, "queue orphan %llu\n",	1312	mlog(0, "queue orphan %llu\n",
1313	(unsigned long long)OCFS2_I(iter)->ip_blkno);	1313	(unsigned long long)OCFS2_I(iter)->ip_blkno);
1314	/* No locking is required for the next_orphan	1314	/* No locking is required for the next_orphan
1315	* queue as there is only ever a single	1315	* queue as there is only ever a single
1316	* process doing orphan recovery. */	1316	* process doing orphan recovery. */
1317	OCFS2_I(iter)->ip_next_orphan = *head;	1317	OCFS2_I(iter)->ip_next_orphan = *head;
1318	*head = iter;	1318	*head = iter;
1319	}	1319	}
1320	brelse(bh);	1320	brelse(bh);
1321	}	1321	}
1322		1322
1323	out_unlock:	1323	out_unlock:
1324	ocfs2_meta_unlock(orphan_dir_inode, 0);	1324	ocfs2_meta_unlock(orphan_dir_inode, 0);
1325	out:	1325	out:
1326	mutex_unlock(&orphan_dir_inode->i_mutex);	1326	mutex_unlock(&orphan_dir_inode->i_mutex);
1327	iput(orphan_dir_inode);	1327	iput(orphan_dir_inode);
1328	return status;	1328	return status;
1329	}	1329	}
1330		1330
1331	static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,	1331	static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
1332	int slot)	1332	int slot)
1333	{	1333	{
1334	int ret;	1334	int ret;
1335		1335
1336	spin_lock(&osb->osb_lock);	1336	spin_lock(&osb->osb_lock);
1337	ret = !osb->osb_orphan_wipes[slot];	1337	ret = !osb->osb_orphan_wipes[slot];
1338	spin_unlock(&osb->osb_lock);	1338	spin_unlock(&osb->osb_lock);
1339	return ret;	1339	return ret;
1340	}	1340	}
1341		1341
1342	static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,	1342	static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
1343	int slot)	1343	int slot)
1344	{	1344	{
1345	spin_lock(&osb->osb_lock);	1345	spin_lock(&osb->osb_lock);
1346	/* Mark ourselves such that new processes in delete_inode()	1346	/* Mark ourselves such that new processes in delete_inode()
1347	* know to quit early. */	1347	* know to quit early. */
1348	ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);	1348	ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
1349	while (osb->osb_orphan_wipes[slot]) {	1349	while (osb->osb_orphan_wipes[slot]) {
1350	/* If any processes are already in the middle of an	1350	/* If any processes are already in the middle of an
1351	* orphan wipe on this dir, then we need to wait for	1351	* orphan wipe on this dir, then we need to wait for
1352	* them. */	1352	* them. */
1353	spin_unlock(&osb->osb_lock);	1353	spin_unlock(&osb->osb_lock);
1354	wait_event_interruptible(osb->osb_wipe_event,	1354	wait_event_interruptible(osb->osb_wipe_event,
1355	ocfs2_orphan_recovery_can_continue(osb, slot));	1355	ocfs2_orphan_recovery_can_continue(osb, slot));
1356	spin_lock(&osb->osb_lock);	1356	spin_lock(&osb->osb_lock);
1357	}	1357	}
1358	spin_unlock(&osb->osb_lock);	1358	spin_unlock(&osb->osb_lock);
1359	}	1359	}
1360		1360
1361	static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,	1361	static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
1362	int slot)	1362	int slot)
1363	{	1363	{
1364	ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);	1364	ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
1365	}	1365	}
1366		1366
1367	/*	1367	/*
1368	* Orphan recovery. Each mounted node has it's own orphan dir which we	1368	* Orphan recovery. Each mounted node has it's own orphan dir which we
1369	* must run during recovery. Our strategy here is to build a list of	1369	* must run during recovery. Our strategy here is to build a list of
1370	* the inodes in the orphan dir and iget/iput them. The VFS does	1370	* the inodes in the orphan dir and iget/iput them. The VFS does
1371	* (most) of the rest of the work.	1371	* (most) of the rest of the work.
1372	*	1372	*
1373	* Orphan recovery can happen at any time, not just mount so we have a	1373	* Orphan recovery can happen at any time, not just mount so we have a
1374	* couple of extra considerations.	1374	* couple of extra considerations.
1375	*	1375	*
1376	* - We grab as many inodes as we can under the orphan dir lock -	1376	* - We grab as many inodes as we can under the orphan dir lock -
1377	* doing iget() outside the orphan dir risks getting a reference on	1377	* doing iget() outside the orphan dir risks getting a reference on
1378	* an invalid inode.	1378	* an invalid inode.
1379	* - We must be sure not to deadlock with other processes on the	1379	* - We must be sure not to deadlock with other processes on the
1380	* system wanting to run delete_inode(). This can happen when they go	1380	* system wanting to run delete_inode(). This can happen when they go
1381	* to lock the orphan dir and the orphan recovery process attempts to	1381	* to lock the orphan dir and the orphan recovery process attempts to
1382	* iget() inside the orphan dir lock. This can be avoided by	1382	* iget() inside the orphan dir lock. This can be avoided by
1383	* advertising our state to ocfs2_delete_inode().	1383	* advertising our state to ocfs2_delete_inode().
1384	*/	1384	*/
1385	static int ocfs2_recover_orphans(struct ocfs2_super *osb,	1385	static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1386	int slot)	1386	int slot)
1387	{	1387	{
1388	int ret = 0;	1388	int ret = 0;
1389	struct inode *inode = NULL;	1389	struct inode *inode = NULL;
1390	struct inode *iter;	1390	struct inode *iter;
1391	struct ocfs2_inode_info *oi;	1391	struct ocfs2_inode_info *oi;
1392		1392
1393	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);	1393	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
1394		1394
1395	ocfs2_mark_recovering_orphan_dir(osb, slot);	1395	ocfs2_mark_recovering_orphan_dir(osb, slot);
1396	ret = ocfs2_queue_orphans(osb, slot, &inode);	1396	ret = ocfs2_queue_orphans(osb, slot, &inode);
1397	ocfs2_clear_recovering_orphan_dir(osb, slot);	1397	ocfs2_clear_recovering_orphan_dir(osb, slot);
1398		1398
1399	/* Error here should be noted, but we want to continue with as	1399	/* Error here should be noted, but we want to continue with as
1400	* many queued inodes as we've got. */	1400	* many queued inodes as we've got. */
1401	if (ret)	1401	if (ret)
1402	mlog_errno(ret);	1402	mlog_errno(ret);
1403		1403
1404	while (inode) {	1404	while (inode) {
1405	oi = OCFS2_I(inode);	1405	oi = OCFS2_I(inode);
1406	mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);	1406	mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);
1407		1407
1408	iter = oi->ip_next_orphan;	1408	iter = oi->ip_next_orphan;
1409		1409
1410	spin_lock(&oi->ip_lock);	1410	spin_lock(&oi->ip_lock);
1411	/* Delete voting may have set these on the assumption	1411	/* Delete voting may have set these on the assumption
1412	* that the other node would wipe them successfully.	1412	* that the other node would wipe them successfully.
1413	* If they are still in the node's orphan dir, we need	1413	* If they are still in the node's orphan dir, we need
1414	* to reset that state. */	1414	* to reset that state. */
1415	oi->ip_flags &= ~(OCFS2_INODE_DELETED\|OCFS2_INODE_SKIP_DELETE);	1415	oi->ip_flags &= ~(OCFS2_INODE_DELETED\|OCFS2_INODE_SKIP_DELETE);
1416		1416
1417	/* Set the proper information to get us going into	1417	/* Set the proper information to get us going into
1418	* ocfs2_delete_inode. */	1418	* ocfs2_delete_inode. */
1419	oi->ip_flags \|= OCFS2_INODE_MAYBE_ORPHANED;	1419	oi->ip_flags \|= OCFS2_INODE_MAYBE_ORPHANED;
1420	spin_unlock(&oi->ip_lock);	1420	spin_unlock(&oi->ip_lock);
1421		1421
1422	iput(inode);	1422	iput(inode);
1423		1423
1424	inode = iter;	1424	inode = iter;
1425	}	1425	}
1426		1426
1427	return ret;	1427	return ret;
1428	}	1428	}
1429		1429
1430	static int ocfs2_wait_on_mount(struct ocfs2_super *osb)	1430	static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
1431	{	1431	{
1432	/* This check is good because ocfs2 will wait on our recovery	1432	/* This check is good because ocfs2 will wait on our recovery
1433	* thread before changing it to something other than MOUNTED	1433	* thread before changing it to something other than MOUNTED
1434	* or DISABLED. */	1434	* or DISABLED. */
1435	wait_event(osb->osb_mount_event,	1435	wait_event(osb->osb_mount_event,
1436	atomic_read(&osb->vol_state) == VOLUME_MOUNTED \|\|	1436	atomic_read(&osb->vol_state) == VOLUME_MOUNTED \|\|
1437	atomic_read(&osb->vol_state) == VOLUME_DISABLED);	1437	atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1438		1438
1439	/* If there's an error on mount, then we may never get to the	1439	/* If there's an error on mount, then we may never get to the
1440	* MOUNTED flag, but this is set right before	1440	* MOUNTED flag, but this is set right before
1441	* dismount_volume() so we can trust it. */	1441	* dismount_volume() so we can trust it. */
1442	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {	1442	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
1443	mlog(0, "mount error, exiting!\n");	1443	mlog(0, "mount error, exiting!\n");
1444	return -EBUSY;	1444	return -EBUSY;
1445	}	1445	}
1446		1446
1447	return 0;	1447	return 0;
1448	}	1448	}
1449		1449
1450	static int ocfs2_commit_thread(void *arg)	1450	static int ocfs2_commit_thread(void *arg)
1451	{	1451	{
1452	int status;	1452	int status;
1453	struct ocfs2_super *osb = arg;	1453	struct ocfs2_super *osb = arg;
1454	struct ocfs2_journal *journal = osb->journal;	1454	struct ocfs2_journal *journal = osb->journal;
1455		1455
1456	/* we can trust j_num_trans here because _should_stop() is only set in	1456	/* we can trust j_num_trans here because _should_stop() is only set in
1457	* shutdown and nobody other than ourselves should be able to start	1457	* shutdown and nobody other than ourselves should be able to start
1458	* transactions. committing on shutdown might take a few iterations	1458	* transactions. committing on shutdown might take a few iterations
1459	* as final transactions put deleted inodes on the list */	1459	* as final transactions put deleted inodes on the list */
1460	while (!(kthread_should_stop() &&	1460	while (!(kthread_should_stop() &&
1461	atomic_read(&journal->j_num_trans) == 0)) {	1461	atomic_read(&journal->j_num_trans) == 0)) {
1462		1462
1463	wait_event_interruptible(osb->checkpoint_event,	1463	wait_event_interruptible(osb->checkpoint_event,
1464	atomic_read(&journal->j_num_trans)	1464	atomic_read(&journal->j_num_trans)
1465	\|\| kthread_should_stop());	1465	\|\| kthread_should_stop());
1466		1466
1467	status = ocfs2_commit_cache(osb);	1467	status = ocfs2_commit_cache(osb);
1468	if (status < 0)	1468	if (status < 0)
1469	mlog_errno(status);	1469	mlog_errno(status);
1470		1470
1471	if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){	1471	if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
1472	mlog(ML_KTHREAD,	1472	mlog(ML_KTHREAD,
1473	"commit_thread: %u transactions pending on "	1473	"commit_thread: %u transactions pending on "
1474	"shutdown\n",	1474	"shutdown\n",
1475	atomic_read(&journal->j_num_trans));	1475	atomic_read(&journal->j_num_trans));
1476	}	1476	}
1477	}	1477	}
1478		1478
1479	return 0;	1479	return 0;
1480	}	1480	}
1481		1481
1482	/* Look for a dirty journal without taking any cluster locks. Used for	1482	/* Look for a dirty journal without taking any cluster locks. Used for
1483	* hard readonly access to determine whether the file system journals	1483	* hard readonly access to determine whether the file system journals
1484	* require recovery. */	1484	* require recovery. */
1485	int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)	1485	int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
1486	{	1486	{
1487	int ret = 0;	1487	int ret = 0;
1488	unsigned int slot;	1488	unsigned int slot;
1489	struct buffer_head *di_bh;	1489	struct buffer_head *di_bh;
1490	struct ocfs2_dinode *di;	1490	struct ocfs2_dinode *di;
1491	struct inode *journal = NULL;	1491	struct inode *journal = NULL;
1492		1492
1493	for(slot = 0; slot < osb->max_slots; slot++) {	1493	for(slot = 0; slot < osb->max_slots; slot++) {
1494	journal = ocfs2_get_system_file_inode(osb,	1494	journal = ocfs2_get_system_file_inode(osb,
1495	JOURNAL_SYSTEM_INODE,	1495	JOURNAL_SYSTEM_INODE,
1496	slot);	1496	slot);
1497	if (!journal \|\| is_bad_inode(journal)) {	1497	if (!journal \|\| is_bad_inode(journal)) {
1498	ret = -EACCES;	1498	ret = -EACCES;
1499	mlog_errno(ret);	1499	mlog_errno(ret);
1500	goto out;	1500	goto out;
1501	}	1501	}
1502		1502
1503	di_bh = NULL;	1503	di_bh = NULL;
1504	ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,	1504	ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
1505	0, journal);	1505	0, journal);
1506	if (ret < 0) {	1506	if (ret < 0) {
1507	mlog_errno(ret);	1507	mlog_errno(ret);
1508	goto out;	1508	goto out;
1509	}	1509	}
1510		1510
1511	di = (struct ocfs2_dinode *) di_bh->b_data;	1511	di = (struct ocfs2_dinode *) di_bh->b_data;
1512		1512
1513	if (le32_to_cpu(di->id1.journal1.ij_flags) &	1513	if (le32_to_cpu(di->id1.journal1.ij_flags) &
1514	OCFS2_JOURNAL_DIRTY_FL)	1514	OCFS2_JOURNAL_DIRTY_FL)
1515	ret = -EROFS;	1515	ret = -EROFS;
1516		1516
1517	brelse(di_bh);	1517	brelse(di_bh);
1518	if (ret)	1518	if (ret)
1519	break;	1519	break;
1520	}	1520	}
1521		1521
1522	out:	1522	out:
1523	if (journal)	1523	if (journal)
1524	iput(journal);	1524	iput(journal);
1525		1525
1526	return ret;	1526	return ret;
1527	}	1527	}
1528		1528

fs/ocfs2/namei.c

Diff comments View file @ 4f902c3

 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * namei.c
  *
  * Create and rename file, directory, symlinks
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  *
  *  Portions of this code from linux/fs/ext3/dir.c
  *
  *  Copyright (C) 1992, 1993, 1994, 1995
  *  Remy Card (card@masi.ibp.fr)
  *  Laboratoire MASI - Institut Blaise Pascal
  *  Universite Pierre et Marie Curie (Paris VI)
  *
  *   from
  *
  *   linux/fs/minix/dir.c
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this program; if not, write to the
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 #include "ocfs2.h"
 #include "alloc.h"
 #include "dcache.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
 #include "namei.h"
 #include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
 #include "vote.h"
 #include "buffer_head_io.h"
 /*
  * Sizing constants for a two-dimensional (chunk, block) table:
  * NAMEI_RA_SIZE is the total number of slots (chunks * blocks per chunk)
  * and NAMEI_RA_INDEX(c, b) flattens chunk c / block b into a slot index.
  * NOTE(review): "RA" presumably stands for directory read-ahead; the users
  * of these macros are not in this part of the file -- confirm.
  */
 #define NAMEI_RA_CHUNKS  2
 #define NAMEI_RA_BLOCKS  4
 #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
 #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 /*
  * Forward declarations of file-local (static) helpers, so that they can
  * be called before their definitions appear further down in this file.
  */
 /* Directory block search and directory entry manipulation. */
 static int inline ocfs2_search_dirblock(struct buffer_head *bh,
 					struct inode *dir,
 					const char *name, int namelen,
 					unsigned long offset,
 					struct ocfs2_dir_entry **res_dir);
 static int ocfs2_delete_entry(handle_t *handle,
 			      struct inode *dir,
 			      struct ocfs2_dir_entry *de_del,
 			      struct buffer_head *bh);
 static int __ocfs2_add_entry(handle_t *handle,
 			     struct inode *dir,
 			     const char *name, int namelen,
 			     struct inode *inode, u64 blkno,
 			     struct buffer_head *parent_fe_bh,
 			     struct buffer_head *insert_bh);
 /* Inode creation helpers used by ocfs2_mknod(). */
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct dentry *dentry, int mode,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      handle_t *handle,
 			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac);
 static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 			      handle_t *handle,
 			      struct inode *parent,
 			      struct inode *inode,
 			      struct buffer_head *fe_bh,
 			      struct ocfs2_alloc_context *data_ac);
 /* Orphan directory helpers used by the unlink and rename paths. */
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
 				    struct inode *inode,
 				    char *name,
 				    struct buffer_head **de_bh);
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
 			    struct ocfs2_dinode *fe,
 			    char *name,
 			    struct buffer_head *de_bh,
 			    struct inode *orphan_dir_inode);
 /* NOTE(review): presumably used by the symlink path, which is not in view. */
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 				     handle_t *handle,
 				     struct inode *inode,
 				     const char *symname);
 /*
  * Thin wrapper around __ocfs2_add_entry(): the target directory is taken
  * from the inode of @dentry's parent and the entry name/length from
  * @dentry's d_name. All other arguments are passed through unchanged and
  * the return value is that of __ocfs2_add_entry().
  */
 static inline int ocfs2_add_entry(handle_t *handle,
 				  struct dentry *dentry,
 				  struct inode *inode, u64 blkno,
 				  struct buffer_head *parent_fe_bh,
 				  struct buffer_head *insert_bh)
 {
 	struct inode *parent_dir = dentry->d_parent->d_inode;
 	return __ocfs2_add_entry(handle, parent_dir,
 				 dentry->d_name.name, dentry->d_name.len,
 				 inode, blkno,
 				 parent_fe_bh, insert_bh);
 }
 /*
  * An orphan dir name is an 8 byte value, printed as a hex string:
  * two hex characters per byte of a u64. Buffers in this file that hold
  * such a name are declared with OCFS2_ORPHAN_NAMELEN + 1 bytes.
  */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
 /*
  * Look up the name carried by @dentry in directory @dir.
  *
  * Steps, in order:
  *  - reject names longer than OCFS2_MAX_FILENAME_LEN (-ENAMETOOLONG);
  *  - take the meta lock on @dir (released at bail_unlock);
  *  - search the directory with ocfs2_find_files_on_disk();
  *  - on a hit, obtain the inode with ocfs2_iget() and clear its
  *    OCFS2_INODE_MAYBE_ORPHANED flag;
  *  - install the result with d_splice_alias() and, when an inode was
  *    found, attach the dentry lock.
  *
  * When the directory search fails, control jumps to bail_add with
  * inode == NULL, so d_splice_alias() is called with a NULL inode.
  *
  * Returns the value of d_splice_alias(), or an ERR_PTR() on failure
  * (-EACCES when ocfs2_iget() fails). @nd is not used.
  */
 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
 {
 	int status;
 	u64 blkno;
 	struct buffer_head *dirent_bh = NULL;
 	struct inode *inode = NULL;
 	struct dentry *ret;
 	struct ocfs2_dir_entry *dirent;
 	struct ocfs2_inode_info *oi;
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
 		   dentry->d_name.len, dentry->d_name.name);
 	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
 		ret = ERR_PTR(-ENAMETOOLONG);
 		goto bail;
 	}
 	mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
 	     dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
 	/* NOTE(review): the final 0 presumably requests a non-exclusive
 	 * lock, in contrast to the 1 used by the modifying paths -- confirm. */
 	status = ocfs2_meta_lock(dir, NULL, 0);
 	if (status < 0) {
 		/* -ENOENT is returned to the caller without being logged. */
 		if (status != -ENOENT)
 			mlog_errno(status);
 		ret = ERR_PTR(status);
 		goto bail;
 	}
 	status = ocfs2_find_files_on_disk(dentry->d_name.name,
 					  dentry->d_name.len, &blkno,
 					  dir, &dirent_bh, &dirent);
 	/* Search failure is not fatal: continue at bail_add with inode
 	 * still NULL. */
 	if (status < 0)
 		goto bail_add;
 	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
 	if (IS_ERR(inode)) {
 		/* The specific ocfs2_iget() error is replaced by -EACCES. */
 		ret = ERR_PTR(-EACCES);
 		goto bail_unlock;
 	}
 	oi = OCFS2_I(inode);
 	/* Clear any orphaned state... If we were able to look up the
 	 * inode from a directory, it certainly can't be orphaned. We
 	 * might have the bad state from a node which intended to
 	 * orphan this inode but crashed before it could commit the
 	 * unlink. */
 	spin_lock(&oi->ip_lock);
 	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
 	spin_unlock(&oi->ip_lock);
 bail_add:
 	dentry->d_op = &ocfs2_dentry_ops;
 	ret = d_splice_alias(inode, dentry);
 	if (inode) {
 		/*
 		 * If d_splice_alias() finds a DCACHE_DISCONNECTED
 		 * dentry, it will d_move() it on top of ours. The
 		 * return value will indicate this however, so in
 		 * those cases, we switch them around for the locking
 		 * code.
 		 *
 		 * NOTE: This dentry already has ->d_op set from
 		 * ocfs2_get_parent() and ocfs2_get_dentry()
 		 */
 		if (ret)
 			dentry = ret;
 		status = ocfs2_dentry_attach_lock(dentry, inode,
 						  OCFS2_I(dir)->ip_blkno);
 		if (status) {
 			mlog_errno(status);
 			ret = ERR_PTR(status);
 			goto bail_unlock;
 		}
 	}
 bail_unlock:
 	/* Don't drop the cluster lock until *after* the d_add --
 	 * unlink on another node will message us to remove that
 	 * dentry under this lock so otherwise we can race this with
 	 * the vote thread and have a stale dentry. */
 	ocfs2_meta_unlock(dir, 0);
 bail:
 	if (dirent_bh)
 		brelse(dirent_bh);
 	mlog_exit_ptr(ret);
 	return ret;
 }
 /*
  * Populate the first block of a freshly created directory @inode.
  *
  * Extends the directory by one block via ocfs2_do_extend_dir(), zeroes
  * that block under journal access and writes two entries into it:
  * "." referring to @inode itself and ".." referring to @parent, the
  * latter's rec_len covering the remainder of the block. Afterwards the
  * in-memory inode gets a size of one block, a link count of 2 and its
  * i_blocks value, and is marked dirty against @fe_bh.
  *
  * Returns 0 on success or the negative status of the first failing step.
  */
 static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 			      handle_t *handle,
 			      struct inode *parent,
 			      struct inode *inode,
 			      struct buffer_head *fe_bh,
 			      struct ocfs2_alloc_context *data_ac)
 {
 	int status;
 	struct buffer_head *dir_bh = NULL;
 	struct ocfs2_dir_entry *dot = NULL;
 	struct ocfs2_dir_entry *dotdot = NULL;
 	mlog_entry_void();
 	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
 				     data_ac, NULL, &dir_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	ocfs2_set_new_buffer_uptodate(inode, dir_bh);
 	status = ocfs2_journal_access(handle, inode, dir_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	memset(dir_bh->b_data, 0, osb->sb->s_blocksize);
 	/* First entry: "." pointing back at the new directory. */
 	dot = (struct ocfs2_dir_entry *) dir_bh->b_data;
 	dot->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
 	dot->name_len = 1;
 	dot->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(dot->name_len));
 	strcpy(dot->name, ".");
 	ocfs2_set_de_type(dot, S_IFDIR);
 	/* Second entry: ".." right behind ".", spanning the rest of the block. */
 	dotdot = (struct ocfs2_dir_entry *)
 		((char *)dot + le16_to_cpu(dot->rec_len));
 	dotdot->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
 	dotdot->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
 				      OCFS2_DIR_REC_LEN(1));
 	dotdot->name_len = 2;
 	strcpy(dotdot->name, "..");
 	ocfs2_set_de_type(dotdot, S_IFDIR);
 	status = ocfs2_journal_dirty(handle, dir_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	i_size_write(inode, inode->i_sb->s_blocksize);
 	inode->i_nlink = 2;
 	inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
 	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 	if (status < 0)
 		mlog_errno(status);
 	else
 		status = 0;
 bail:
 	if (dir_bh)
 		brelse(dir_bh);
 	mlog_exit(status);
 	return status;
 }
 /*
  * Create a new inode named by @dentry inside directory @dir.
  * ocfs2_mkdir() and ocfs2_create() in this file both funnel into this
  * function with S_IFDIR / S_IFREG or'ed into @mode.
  *
  * Sequence: take the meta lock on @dir, validate the parent (link limit
  * for new directories, parent not already deleted, name not present),
  * reserve a directory slot, an inode and -- for directories -- one
  * cluster, then start a transaction in which the inode is created
  * (ocfs2_mknod_locked), a new directory is filled and the parent's link
  * count bumped, the directory entry is added and the dentry lock is
  * attached. Finally the inode is hashed and instantiated on @dentry.
  *
  * Returns 0 on success or a negative errno. Note that a failure of the
  * initial ocfs2_meta_lock() returns directly, without passing through
  * the "leave" cleanup or mlog_exit().
  */
 static int ocfs2_mknod(struct inode *dir,
 		       struct dentry *dentry,
 		       int mode,
 		       dev_t dev)
 {
 	int status = 0;
 	struct buffer_head *parent_fe_bh = NULL;
 	handle_t *handle = NULL;
 	struct ocfs2_super *osb;
 	struct ocfs2_dinode *dirfe;
 	struct buffer_head *new_fe_bh = NULL;
 	struct buffer_head *de_bh = NULL;
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
 		   dentry->d_name.name);
 	/* get our super block */
 	osb = OCFS2_SB(dir->i_sb);
 	status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		/* Nothing has been acquired yet, so skip "leave". */
 		return status;
 	}
 	/* A new subdirectory adds a link ("..") to the parent. */
 	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
 		status = -EMLINK;
 		goto leave;
 	}
 	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
 	if (!dirfe->i_links_count) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
 		goto leave;
 	}
 	/* Any non-zero result (not just negative) aborts the create. */
 	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
 					   dentry->d_name.len);
 	if (status)
 		goto leave;
 	/* get a spot inside the dir. */
 	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					      dentry->d_name.name,
 					      dentry->d_name.len, &de_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	/* reserve an inode spot */
 	status = ocfs2_reserve_new_inode(osb, &inode_ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
 		goto leave;
 	}
 	/* are we making a directory? If so, reserve a cluster for his
 	 * 1st extent. */
 	if (S_ISDIR(mode)) {
 		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
 			goto leave;
 		}
 	}
 	handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		/* Reset so that "leave" does not try to commit it. */
 		handle = NULL;
 		mlog_errno(status);
 		goto leave;
 	}
 	/* do the real work now. */
 	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
 				    &inode, inode_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	if (S_ISDIR(mode)) {
 		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
 					    new_fe_bh, data_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
 		}
 		/* Account for the child's ".." in the parent, first on disk
 		 * (dirfe) and then in the in-memory inode. */
 		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
 					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
 		}
 		le16_add_cpu(&dirfe->i_links_count, 1);
 		status = ocfs2_journal_dirty(handle, parent_fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
 		}
 		inc_nlink(dir);
 	}
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
 				 de_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	status = ocfs2_dentry_attach_lock(dentry, inode,
 					  OCFS2_I(dir)->ip_blkno);
 	if (status) {
 		mlog_errno(status);
 		goto leave;
 	}
 	insert_inode_hash(inode);
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 	status = 0;
 leave:
 	/* Common exit: commit (if started), unlock the parent, then release
 	 * every buffer, reservation and -- on failure -- the new inode. */
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 	ocfs2_meta_unlock(dir, 1);
 	if (status == -ENOSPC)
 		mlog(0, "Disk is full\n");
 	if (new_fe_bh)
 		brelse(new_fe_bh);
 	if (de_bh)
 		brelse(de_bh);
 	if (parent_fe_bh)
 		brelse(parent_fe_bh);
 	if ((status < 0) && inode)
 		iput(inode);
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 	mlog_exit(status);
 	return status;
 }
 /*
  * Allocate and initialise a new on-disk inode plus its in-memory
  * struct inode, inside the transaction @handle.
  *
  * Claims an inode block from @inode_ac, allocates a VFS inode on @dir's
  * super block, fills in the basic in-memory fields, then builds the
  * ocfs2_dinode in a fresh buffer (journalled with ACCESS_CREATE):
  * generation numbers, block number, suballocator bit/slot, owner (gid is
  * inherited from @dir when @dir is setgid, and S_ISGID propagates to new
  * directories), mode, device number for char/block nodes, link count,
  * signature, timestamps and an empty extent list. Finally the inode is
  * populated from the dinode and its cluster locks are created unless
  * the file system is mounted local.
  *
  * On success returns 0 with *new_fe_bh holding the dinode buffer and
  * *ret_inode the new inode. On failure returns a negative errno, with
  * both outputs NULL and any partially built inode/buffer released.
  * A failure of ocfs2_create_new_inode_locks() is logged but not
  * propagated.
  */
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct dentry *dentry, int mode,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      handle_t *handle,
 			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac)
 {
 	int status = 0;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_extent_list *fel;
 	u64 fe_blkno = 0;
 	u16 suballoc_bit;
 	struct inode *inode = NULL;
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
 		   dentry->d_name.name);
 	*new_fe_bh = NULL;
 	*ret_inode = NULL;
 	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
 				       &fe_blkno);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	/*
 	 * new_inode() reports failure by returning NULL, not an ERR_PTR(),
 	 * so it must be tested against NULL; IS_ERR(NULL) is false and
 	 * would let a NULL inode be dereferenced below.
 	 */
 	inode = new_inode(dir->i_sb);
 	if (!inode) {
 		status = -ENOMEM;
 		mlog(ML_ERROR, "new_inode failed!\n");
 		goto leave;
 	}
 	/* populate as many fields early on as possible - many of
 	 * these are used by the support functions here and in
 	 * callers. */
 	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
 	OCFS2_I(inode)->ip_blkno = fe_blkno;
 	if (S_ISDIR(mode))
 		inode->i_nlink = 2;
 	else
 		inode->i_nlink = 1;
 	inode->i_mode = mode;
 	/* The generation counter is shared, hence taken under osb_lock. */
 	spin_lock(&osb->osb_lock);
 	inode->i_generation = osb->s_next_generation++;
 	spin_unlock(&osb->osb_lock);
 	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
 	if (!*new_fe_bh) {
 		status = -EIO;
 		mlog_errno(status);
 		goto leave;
 	}
 	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
 	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
 	memset(fe, 0, osb->sb->s_blocksize);
 	fe->i_generation = cpu_to_le32(inode->i_generation);
 	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
 	fe->i_blkno = cpu_to_le64(fe_blkno);
 	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
 	fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
 	fe->i_uid = cpu_to_le32(current->fsuid);
 	/* A setgid parent hands its group to the child; new directories
 	 * additionally inherit the setgid bit itself. */
 	if (dir->i_mode & S_ISGID) {
 		fe->i_gid = cpu_to_le32(dir->i_gid);
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
 	} else
 		fe->i_gid = cpu_to_le32(current->fsgid);
 	fe->i_mode = cpu_to_le16(mode);
 	if (S_ISCHR(mode) || S_ISBLK(mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 	fe->i_last_eb_blk = 0;
 	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
 	le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
 	fe->i_atime = fe->i_ctime = fe->i_mtime =
 		cpu_to_le64(CURRENT_TIME.tv_sec);
 	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
 		cpu_to_le32(CURRENT_TIME.tv_nsec);
 	fe->i_dtime = 0;
 	/* Start with an empty, depth-0 extent list embedded in the dinode. */
 	fel = &fe->id2.i_list;
 	fel->l_tree_depth = 0;
 	fel->l_next_free_rec = 0;
 	fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
 	status = ocfs2_journal_dirty(handle, *new_fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
 		/* fe->i_blkno is stored little-endian; convert for printing. */
 		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
 		     "i_blkno=%llu, i_ino=%lu\n",
 		     (unsigned long long) (*new_fe_bh)->b_blocknr,
 		     (unsigned long long)le64_to_cpu(fe->i_blkno),
 		     inode->i_ino);
 		BUG();
 	}
 	ocfs2_inode_set_new(osb, inode);
 	if (!ocfs2_mount_local(osb)) {
 		status = ocfs2_create_new_inode_locks(inode);
 		if (status < 0)
 			mlog_errno(status);
 	}
 	status = 0; /* error in ocfs2_create_new_inode_locks is not
 		     * critical */
 	*ret_inode = inode;
 leave:
 	if (status < 0) {
 		if (*new_fe_bh) {
 			brelse(*new_fe_bh);
 			*new_fe_bh = NULL;
 		}
 		if (inode)
 			iput(inode);
 	}
 	mlog_exit(status);
 	return status;
 }
 /*
  * Create a directory: delegates to ocfs2_mknod() with S_IFDIR or'ed into
  * @mode and a device number of 0, and returns its result.
  */
 static int ocfs2_mkdir(struct inode *dir,
 		       struct dentry *dentry,
 		       int mode)
 {
 	int status;
 	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
 		   dentry->d_name.len, dentry->d_name.name);
 	status = ocfs2_mknod(dir, dentry, S_IFDIR | mode, 0);
 	mlog_exit(status);
 	return status;
 }
 /*
  * Create a regular file: delegates to ocfs2_mknod() with S_IFREG or'ed
  * into @mode and a device number of 0, and returns its result.
  * @nd is not used.
  */
 static int ocfs2_create(struct inode *dir,
 			struct dentry *dentry,
 			int mode,
 			struct nameidata *nd)
 {
 	int status;
 	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
 		   dentry->d_name.len, dentry->d_name.name);
 	status = ocfs2_mknod(dir, dentry, S_IFREG | mode, 0);
 	mlog_exit(status);
 	return status;
 }
 /*
  * Create a hard link: add the name in @dentry, inside directory @dir,
  * for the inode behind @old_dentry.
  *
  * Directories cannot be linked (-EPERM). The parent is locked first,
  * checked for being deleted and for an existing entry of that name, and
  * a directory slot is prepared. Then the inode itself is locked, its
  * on-disk link count is checked against OCFS2_LINK_MAX (-EMLINK), and a
  * transaction bumps the link count and ctime, adds the directory entry
  * and attaches the dentry lock before instantiating @dentry.
  *
  * Returns 0 on success or a negative errno. The -EPERM and initial
  * ocfs2_meta_lock() failure paths return directly without mlog_exit().
  */
 static int ocfs2_link(struct dentry *old_dentry,
 		      struct inode *dir,
 		      struct dentry *dentry)
 {
 	/* handle is only assigned by ocfs2_start_trans(); every goto taken
 	 * before that point targets a label below out_commit. */
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
 	int err;
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
 	struct buffer_head *de_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
 		   old_dentry->d_name.len, old_dentry->d_name.name,
 		   dentry->d_name.len, dentry->d_name.name);
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
 	err = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
 	if (err < 0) {
 		if (err != -ENOENT)
 			mlog_errno(err);
 		return err;
 	}
 	/* A parent with no links left has been deleted. */
 	if (!dir->i_nlink) {
 		err = -ENOENT;
 		goto out;
 	}
 	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
 					dentry->d_name.len);
 	if (err)
 		goto out;
 	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					   dentry->d_name.name,
 					   dentry->d_name.len, &de_bh);
 	if (err < 0) {
 		mlog_errno(err);
 		goto out;
 	}
 	err = ocfs2_meta_lock(inode, &fe_bh, 1);
 	if (err < 0) {
 		if (err != -ENOENT)
 			mlog_errno(err);
 		goto out;
 	}
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
 		err = -EMLINK;
 		goto out_unlock_inode;
 	}
 	handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
 		handle = NULL;
 		mlog_errno(err);
 		goto out_unlock_inode;
 	}
 	err = ocfs2_journal_access(handle, inode, fe_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (err < 0) {
 		mlog_errno(err);
 		goto out_commit;
 	}
 	/* Bump the in-memory link count first, then mirror it (and the new
 	 * ctime) into the on-disk dinode. */
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 	err = ocfs2_journal_dirty(handle, fe_bh);
 	if (err < 0) {
 		/* Undo the link count bump in both copies. */
 		le16_add_cpu(&fe->i_links_count, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
 	}
 	err = ocfs2_add_entry(handle, dentry, inode,
 			      OCFS2_I(inode)->ip_blkno,
 			      parent_fe_bh, de_bh);
 	if (err) {
 		/* Undo the link count bump in both copies. */
 		le16_add_cpu(&fe->i_links_count, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
 	}
 	/* NOTE(review): unlike the two failure paths above, a failure here
 	 * leaves the link count incremented and the entry added -- confirm
 	 * that this is intended. */
 	err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
 	if (err) {
 		mlog_errno(err);
 		goto out_commit;
 	}
 	/* Take the inode reference that d_instantiate() hands to @dentry. */
 	atomic_inc(&inode->i_count);
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out_unlock_inode:
 	ocfs2_meta_unlock(inode, 1);
 out:
 	ocfs2_meta_unlock(dir, 1);
 	if (de_bh)
 		brelse(de_bh);
 	if (fe_bh)
 		brelse(fe_bh);
 	if (parent_fe_bh)
 		brelse(parent_fe_bh);
 	mlog_exit(err);
 	return err;
 }
 /*
  * Acquire the dentry lock exclusively and release it again right away,
  * which forces other nodes to drop their hold on the dentry.
  * Returns 0 on success, or the non-zero ocfs2_dentry_lock() status (after
  * logging it), in which case nothing is unlocked.
  */
 static int ocfs2_remote_dentry_delete(struct dentry *dentry)
 {
 	int status = ocfs2_dentry_lock(dentry, 1);
 	if (status) {
 		mlog_errno(status);
 		return status;
 	}
 	ocfs2_dentry_unlock(dentry, 1);
 	return status;
 }
 /*
  * Returns 1 when @inode's link count is at its baseline value -- 2 for
  * a directory, 1 for anything else -- and 0 otherwise. ocfs2_unlink()
  * uses this to decide whether the inode has to go to the orphan
  * directory.
  */
 static inline int inode_is_unlinkable(struct inode *inode)
 {
 	return inode->i_nlink == (S_ISDIR(inode->i_mode) ? 2 : 1);
 }
 /*
  * Remove the name in @dentry from directory @dir.
  *
  * The root inode cannot be removed (-EPERM). With the parent locked,
  * the directory entry is looked up on disk and must still refer to the
  * inode attached to @dentry (-ENOENT otherwise). The child is then
  * locked; directories must be empty and have exactly two links
  * (-ENOTEMPTY). After forcing other nodes to drop the dentry, and
  * preparing an orphan directory slot when inode_is_unlinkable() says
  * so, a transaction adds the inode to the orphan directory (same
  * condition), deletes the directory entry, drops the link count (twice
  * for a directory) and updates the parent's times and link count.
  *
  * Returns 0 on success or a negative errno. The -EPERM and initial
  * ocfs2_meta_lock() failure paths return directly without mlog_exit().
  */
 static int ocfs2_unlink(struct inode *dir,
 			struct dentry *dentry)
 {
 	int status;
 	int child_locked = 0;
 	struct inode *inode = dentry->d_inode;
 	struct inode *orphan_dir = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	u64 blkno;
 	struct ocfs2_dinode *fe = NULL;
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_node_bh = NULL;
 	handle_t *handle = NULL;
 	struct ocfs2_dir_entry *dirent = NULL;
 	struct buffer_head *dirent_bh = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
 	struct buffer_head *orphan_entry_bh = NULL;
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
 		   dentry->d_name.len, dentry->d_name.name);
 	BUG_ON(dentry->d_parent->d_inode != dir);
 	mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	if (inode == osb->root_inode) {
 		mlog(0, "Cannot delete the root directory\n");
 		return -EPERM;
 	}
 	status = ocfs2_meta_lock(dir, &parent_node_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		return status;
 	}
 	status = ocfs2_find_files_on_disk(dentry->d_name.name,
 					  dentry->d_name.len, &blkno,
 					  dir, &dirent_bh, &dirent);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		goto leave;
 	}
 	/* The on-disk entry must still point at the inode the dentry holds. */
 	if (OCFS2_I(inode)->ip_blkno != blkno) {
 		status = -ENOENT;
 		mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		     (unsigned long long)blkno, OCFS2_I(inode)->ip_flags);
 		goto leave;
 	}
 	status = ocfs2_meta_lock(inode, &fe_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		goto leave;
 	}
 	/* Remember to drop the child lock at "leave". */
 	child_locked = 1;
 	if (S_ISDIR(inode->i_mode)) {
 		/* Both conditions report -ENOTEMPTY. */
 	       	if (!ocfs2_empty_dir(inode)) {
 			status = -ENOTEMPTY;
 			goto leave;
 		} else if (inode->i_nlink != 2) {
 			status = -ENOTEMPTY;
 			goto leave;
 		}
 	}
 	status = ocfs2_remote_dentry_delete(dentry);
 	if (status < 0) {
 		/* This vote should succeed under all normal
 		 * circumstances. */
 		mlog_errno(status);
 		goto leave;
 	}
 	/* The orphan dir slot is prepared before the transaction starts;
 	 * the entry itself is added inside it, under the same condition. */
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
 						  orphan_name,
 						  &orphan_entry_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
 		}
 	}
 	handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		/* Reset so that "leave" does not try to commit it. */
 		handle = NULL;
 		mlog_errno(status);
 		goto leave;
 	}
 	status = ocfs2_journal_access(handle, inode, fe_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
 					  orphan_entry_bh, orphan_dir);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
 		}
 	}
 	/* delete the name from the parent dir */
 	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	/* A directory loses two links here (it was required to have exactly
 	 * two above); any other inode loses one. */
 	if (S_ISDIR(inode->i_mode))
 		drop_nlink(inode);
 	drop_nlink(inode);
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	/* Removing a subdirectory removes its ".." link to the parent. */
 	if (S_ISDIR(inode->i_mode))
 		drop_nlink(dir);
 	status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		/* Restore the parent's in-memory link count on failure. */
 		if (S_ISDIR(inode->i_mode))
 			inc_nlink(dir);
 	}
 leave:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 	if (child_locked)
 		ocfs2_meta_unlock(inode, 1);
 	ocfs2_meta_unlock(dir, 1);
 	if (orphan_dir) {
 		/* This was locked for us in ocfs2_prepare_orphan_dir() */
 		ocfs2_meta_unlock(orphan_dir, 1);
 		mutex_unlock(&orphan_dir->i_mutex);
 		iput(orphan_dir);
 	}
 	if (fe_bh)
 		brelse(fe_bh);
 	if (dirent_bh)
 		brelse(dirent_bh);
 	if (parent_node_bh)
 		brelse(parent_node_bh);
 	if (orphan_entry_bh)
 		brelse(orphan_entry_bh);
 	mlog_exit(status);
 	return status;
 }
 /*
  * The only place this should be used is rename!
  * if they have the same id, then the 1st one is the only one locked.
  */
 static int ocfs2_double_lock(struct ocfs2_super *osb,
 			     struct buffer_head **bh1,
 			     struct inode *inode1,
 			     struct buffer_head **bh2,
 			     struct inode *inode2)
 {
 	int status;
 	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
 	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
 	struct buffer_head **tmpbh;
 	struct inode *tmpinode;
 	mlog_entry("(inode1 = %llu, inode2 = %llu)\n",
 		   (unsigned long long)oi1->ip_blkno,
 		   (unsigned long long)oi2->ip_blkno);
 	if (*bh1)
 		*bh1 = NULL;
 	if (*bh2)
 		*bh2 = NULL;
 	/* we always want to lock the one with the lower lockid first. */
 	if (oi1->ip_blkno != oi2->ip_blkno) {
 		if (oi1->ip_blkno < oi2->ip_blkno) {
 			/* switch id1 and id2 around */
 			mlog(0, "switching them around...\n");
 			tmpbh = bh2;
 			bh2 = bh1;
 			bh1 = tmpbh;
 			tmpinode = inode2;
 			inode2 = inode1;
 			inode1 = tmpinode;
 		}
 		/* lock id2 */
 		status = ocfs2_meta_lock(inode2, bh2, 1);
 		if (status < 0) {
 			if (status != -ENOENT)
 				mlog_errno(status);
 			goto bail;
 		}
 	}
 	/* lock id1 */
 	status = ocfs2_meta_lock(inode1, bh1, 1);
 	if (status < 0) {
 		/*
 		 * An error return must mean that no cluster locks
 		 * were held on function exit.
 		 */
 		if (oi1->ip_blkno != oi2->ip_blkno)
 			ocfs2_meta_unlock(inode2, 1);
 		if (status != -ENOENT)
 			mlog_errno(status);
 	}
 bail:
 	mlog_exit(status);
 	return status;
 }
 /*
  * Drop the meta lock on @inode1 and, unless both arguments are the very
  * same inode, on @inode2 as well.
  */
 static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
 {
 	int same_inode = (inode1 == inode2);
 	ocfs2_meta_unlock(inode1, 1);
 	if (same_inode)
 		return;
 	ocfs2_meta_unlock(inode2, 1);
 }
 /*
  * Given the start of a directory block, yield the `inode` field of the
  * second directory entry, located by skipping the first entry's rec_len
  * bytes. In a block laid out by ocfs2_fill_new_dir() that second entry
  * is "..", i.e. the parent. The value is in on-disk (little-endian)
  * byte order; callers convert it with le64_to_cpu().
  *
  * @buffer is evaluated twice, so it must not have side effects. The
  * argument and the whole expansion are parenthesized so that any
  * pointer expression can be passed safely.
  */
 #define PARENT_INO(buffer) \
 	(((struct ocfs2_dir_entry *) \
 	  ((char *)(buffer) + \
 	   le16_to_cpu(((struct ocfs2_dir_entry *)(buffer))->rec_len)))->inode)
 static int ocfs2_rename(struct inode *old_dir,
 			struct dentry *old_dentry,
 			struct inode *new_dir,
 			struct dentry *new_dentry)
 {
 	int status = 0, rename_lock = 0, parents_locked = 0;
 	int old_child_locked = 0, new_child_locked = 0;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *orphan_dir = NULL;
 	struct ocfs2_dinode *newfe = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
 	struct buffer_head *orphan_entry_bh = NULL;
 	struct buffer_head *newfe_bh = NULL;
 	struct buffer_head *old_inode_bh = NULL;
 	struct buffer_head *insert_entry_bh = NULL;
 	struct ocfs2_super *osb = NULL;
 	u64 newfe_blkno;
 	handle_t *handle = NULL;
 	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *new_dir_bh = NULL;
 	struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
 							       // and new_dentry
 	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
 	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
 						    // this is the 1st dirent bh
 	nlink_t old_dir_nlink = old_dir->i_nlink;
 	/* At some point it might be nice to break this function up a
 	 * bit. */
 	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
 		   old_dir, old_dentry, new_dir, new_dentry,
 		   old_dentry->d_name.len, old_dentry->d_name.name,
 		   new_dentry->d_name.len, new_dentry->d_name.name);
 	osb = OCFS2_SB(old_dir->i_sb);
 	if (new_inode) {
 		if (!igrab(new_inode))
 			BUG();
 	}
 	/* Assume a directory hierarchy thusly:
 	 * a/b/c
 	 * a/d
 	 * a,b,c, and d are all directories.
 	 *
 	 * from cwd of 'a' on both nodes:
 	 * node1: mv b/c d
 	 * node2: mv d   b/c
 	 *
 	 * And that's why, just like the VFS, we need a file system
 	 * rename lock. */
 	if (old_dentry != new_dentry) {
 		status = ocfs2_rename_lock(osb);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		rename_lock = 1;
 	}
 	/* if old and new are the same, this'll just do one lock. */
 	status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
 				   &new_dir_bh, new_dir);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	parents_locked = 1;
 	/* make sure both dirs have bhs
 	 * get an extra ref on old_dir_bh if old==new */
 	if (!new_dir_bh) {
 		if (old_dir_bh) {
 			new_dir_bh = old_dir_bh;
 			get_bh(new_dir_bh);
 		} else {
 			mlog(ML_ERROR, "no old_dir_bh!\n");
 			status = -EIO;
 			goto bail;
 		}
 	}
 	/*
 	 * Aside from allowing a meta data update, the locking here
 	 * also ensures that the vote thread on other nodes won't have
 	 * to concurrently downconvert the inode and the dentry locks.
 	 */
 	status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		goto bail;
 	}
 	old_child_locked = 1;
 	status = ocfs2_remote_dentry_delete(old_dentry);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	if (S_ISDIR(old_inode->i_mode)) {
 		status = -EIO;
 		old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
 		if (!old_inode_de_bh)
 			goto bail;
 		status = -EIO;
 		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
 		    OCFS2_I(old_dir)->ip_blkno)
 			goto bail;
 		status = -EMLINK;
 		if (!new_inode && new_dir!=old_dir &&
 		    new_dir->i_nlink >= OCFS2_LINK_MAX)
 			goto bail;
 	}
 	status = -ENOENT;
 	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
 				     old_dentry->d_name.len,
 				     old_dir, &old_de);
 	if (!old_de_bh)
 		goto bail;
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
 	 *  We might rmdir the source, keep it as pwd of some process
 	 *  and merrily kill the link to whatever was created under the
 	 *  same name. Goodbye sticky bit ;-<
 	 */
 	if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
 		goto bail;
 	/* check if the target already exists (in which case we need
 	 * to delete it */
 	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
 					  new_dentry->d_name.len,
 					  &newfe_blkno, new_dir, &new_de_bh,
 					  &new_de);
 	/* The only error we allow here is -ENOENT because the new
 	 * file not existing is perfectly valid. */
 	if ((status < 0) && (status != -ENOENT)) {
 		/* If we cannot find the file specified we should just */
 		/* return the error... */
 		mlog_errno(status);
 		goto bail;
 	}
 	if (!new_de && new_inode)
 		mlog(ML_ERROR, "inode %lu does not exist in it's parent "
 		     "directory!", new_inode->i_ino);
 	/* In case we need to overwrite an existing file, we blow it
 	 * away first */
 	if (new_de) {
 		/* VFS didn't think there existed an inode here, but
 		 * someone else in the cluster must have raced our
 		 * rename to create one. Today we error cleanly, in
 		 * the future we should consider calling iget to build
 		 * a new struct inode for this entry. */
 		if (!new_inode) {
 			status = -EACCES;
 			mlog(0, "We found an inode for name %.*s but VFS "
 			     "didn't give us one.\n", new_dentry->d_name.len,
 			     new_dentry->d_name.name);
 			goto bail;
 		}
 		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
 			status = -EACCES;
 			mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n",
 			     (unsigned long long)OCFS2_I(new_inode)->ip_blkno,
 			     (unsigned long long)newfe_blkno,
 			     OCFS2_I(new_inode)->ip_flags);
 			goto bail;
 		}
 		status = ocfs2_meta_lock(new_inode, &newfe_bh, 1);
 		if (status < 0) {
 			if (status != -ENOENT)
 				mlog_errno(status);
 			goto bail;
 		}
 		new_child_locked = 1;
 		status = ocfs2_remote_dentry_delete(new_dentry);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 		mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
 		     "newfebh=%p bhblocknr=%llu\n", new_de,
 		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
 		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
 		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
 			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
 							  new_inode,
 							  orphan_name,
 							  &orphan_entry_bh);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
 			}
 		}
 	} else {
 		BUG_ON(new_dentry->d_parent->d_inode != new_dir);
 		status = ocfs2_check_dir_for_entry(new_dir,
 						   new_dentry->d_name.name,
 						   new_dentry->d_name.len);
 		if (status)
 			goto bail;
 		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
 						      new_dentry->d_name.name,
 						      new_dentry->d_name.len,
 						      &insert_entry_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 	}
 	handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
 		mlog_errno(status);
 		goto bail;
 	}
 	if (new_de) {
 		if (S_ISDIR(new_inode->i_mode)) {
 			if (!ocfs2_empty_dir(new_inode) ||
 			    new_inode->i_nlink != 2) {
 				status = -ENOTEMPTY;
 				goto bail;
 			}
 		}
 		status = ocfs2_journal_access(handle, new_inode, newfe_bh,
 					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		if (S_ISDIR(new_inode->i_mode) ||
 		    (newfe->i_links_count == cpu_to_le16(1))){
 			status = ocfs2_orphan_add(osb, handle, new_inode,
 						  newfe, orphan_name,
 						  orphan_entry_bh, orphan_dir);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
 			}
 		}
 		/* change the dirent to point to the correct inode */
 		status = ocfs2_journal_access(handle, new_dir, new_de_bh,
 					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
 		new_de->file_type = old_de->file_type;
 		new_dir->i_version++;
 		status = ocfs2_journal_dirty(handle, new_de_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		if (S_ISDIR(new_inode->i_mode))
 			newfe->i_links_count = 0;
 		else
 			le16_add_cpu(&newfe->i_links_count, -1);
 		status = ocfs2_journal_dirty(handle, newfe_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 	} else {
 		/* if the name was not found in new_dir, add it now */
 		status = ocfs2_add_entry(handle, new_dentry, old_inode,
 					 OCFS2_I(old_inode)->ip_blkno,
 					 new_dir_bh, insert_entry_bh);
 	}
 	old_inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(old_inode);
 	ocfs2_mark_inode_dirty(handle, old_inode, old_inode_bh);
 	/* now that the name has been added to new_dir, remove the old name */
 	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	if (new_inode) {
 		new_inode->i_nlink--;
 		new_inode->i_ctime = CURRENT_TIME;
 	}
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
 	if (old_inode_de_bh) {
 		status = ocfs2_journal_access(handle, old_inode,
 					     old_inode_de_bh,
 					     OCFS2_JOURNAL_ACCESS_WRITE);
 		PARENT_INO(old_inode_de_bh->b_data) =
 			cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
 		status = ocfs2_journal_dirty(handle, old_inode_de_bh);
 		old_dir->i_nlink--;
 		if (new_inode) {
 			new_inode->i_nlink--;
 		} else {
 			inc_nlink(new_dir);
 			mark_inode_dirty(new_dir);
 		}
 	}
 	mark_inode_dirty(old_dir);
 	ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh);
 	if (new_inode) {
 		mark_inode_dirty(new_inode);
 		ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh);
 	}
 	if (old_dir != new_dir) {
 		/* Keep the same times on both directories.*/
 		new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime;
 		/*
 		 * This will also pick up the i_nlink change from the
 		 * block above.
 		 */
 		ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh);
 	}
 	if (old_dir_nlink != old_dir->i_nlink) {
 		if (!old_dir_bh) {
 			mlog(ML_ERROR, "need to change nlink for old dir "
 			     "%llu from %d to %d but bh is NULL!\n",
 			     (unsigned long long)OCFS2_I(old_dir)->ip_blkno,
 			     (int)old_dir_nlink, old_dir->i_nlink);
 		} else {
 			struct ocfs2_dinode *fe;
 			status = ocfs2_journal_access(handle, old_dir,
 						      old_dir_bh,
 						      OCFS2_JOURNAL_ACCESS_WRITE);
 			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
 			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
 			status = ocfs2_journal_dirty(handle, old_dir_bh);
 		}
 	}
 	ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
 	status = 0;
 bail:
 	if (rename_lock)
 		ocfs2_rename_unlock(osb);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 	if (parents_locked)
 		ocfs2_double_unlock(old_dir, new_dir);
 	if (old_child_locked)
 		ocfs2_meta_unlock(old_inode, 1);
 	if (new_child_locked)
 		ocfs2_meta_unlock(new_inode, 1);
 	if (orphan_dir) {
 		/* This was locked for us in ocfs2_prepare_orphan_dir() */
 		ocfs2_meta_unlock(orphan_dir, 1);
 		mutex_unlock(&orphan_dir->i_mutex);
 		iput(orphan_dir);
 	}
 	if (new_inode)
 		sync_mapping_buffers(old_inode->i_mapping);
 	if (new_inode)
 		iput(new_inode);
 	if (newfe_bh)
 		brelse(newfe_bh);
 	if (old_inode_bh)
 		brelse(old_inode_bh);
 	if (old_dir_bh)
 		brelse(old_dir_bh);
 	if (new_dir_bh)
 		brelse(new_dir_bh);
 	if (new_de_bh)
 		brelse(new_de_bh);
 	if (old_de_bh)
 		brelse(old_de_bh);
 	if (old_inode_de_bh)
 		brelse(old_inode_de_bh);
 	if (orphan_entry_bh)
 		brelse(orphan_entry_bh);
 	if (insert_entry_bh)
 		brelse(insert_entry_bh);
 	mlog_exit(status);
 	return status;
 }
 /*
  * Write the target string of a (non-fast) symlink into the inode's
  * data blocks.
  *
  * We expect i_size = strlen(symname). symname is copied into the file
  * data, including the null terminator, so i_size + 1 bytes are written.
  * Each block is obtained with sb_getblk(), journaled with
  * OCFS2_JOURNAL_ACCESS_CREATE, zeroed and then filled with the next
  * s_blocksize-sized slice of symname.
  *
  * Returns 0 on success or a negative errno: -EIO when the string does
  * not fit in the inode's clusters or in the extent mapped at block 0,
  * -ENOMEM on allocation failure, or the error returned by the extent
  * map / journal helpers.
  */
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 				     handle_t *handle,
 				     struct inode *inode,
 				     const char *symname)
 {
 	struct buffer_head **bhs = NULL;
 	const char *c;
 	struct super_block *sb = osb->sb;
-	u64 p_blkno;
+	u64 p_blkno, p_blocks;
-	int p_blocks;
 	int virtual, blocks, status, i, bytes_left;
 	/* +1 accounts for the terminating null byte. */
 	bytes_left = i_size_read(inode) + 1;
 	/* we can't trust i_blocks because we're actually going to
 	 * write i_size + 1 bytes. */
 	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 	mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n",
 			(unsigned long long)inode->i_blocks,
 			i_size_read(inode), blocks);
 	/* Sanity check -- make sure we're going to fit. */
 	if (bytes_left >
 	    ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
 		status = -EIO;
 		mlog_errno(status);
 		goto bail;
 	}
 	/* Zero-filled so the cleanup loop can tell which slots were used. */
 	bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
 	if (!bhs) {
 		status = -ENOMEM;
 		mlog_errno(status);
 		goto bail;
 	}
 	/* Look up the physical block backing virtual block 0; p_blocks
 	 * receives the length reported for that mapping. */
 	status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
 					     NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	/* links can never be larger than one cluster so we know this
 	 * is all going to be contiguous, but do a sanity check
 	 * anyway. */
 	if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
 		status = -EIO;
 		mlog_errno(status);
 		goto bail;
 	}
 	virtual = 0;
 	while(bytes_left > 0) {
 		/* Slice of symname that belongs in this block. */
 		c = &symname[virtual * sb->s_blocksize];
 		bhs[virtual] = sb_getblk(sb, p_blkno);
 		if (!bhs[virtual]) {
 			status = -ENOMEM;
 			mlog_errno(status);
 			goto bail;
 		}
 		ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
 		status = ocfs2_journal_access(handle, inode, bhs[virtual],
 					      OCFS2_JOURNAL_ACCESS_CREATE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		/* Zero the whole block, then copy at most one block's worth. */
 		memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
 		memcpy(bhs[virtual]->b_data, c,
 		       (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
 		       bytes_left);
 		status = ocfs2_journal_dirty(handle, bhs[virtual]);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		/* Advance to the next virtual/physical block pair. */
 		virtual++;
 		p_blkno++;
 		bytes_left -= sb->s_blocksize;
 	}
 	status = 0;
 bail:
 	/* Drop every buffer reference taken above, on success and failure. */
 	if (bhs) {
 		for(i = 0; i < blocks; i++)
 			if (bhs[i])
 				brelse(bhs[i]);
 		kfree(bhs);
 	}
 	mlog_exit(status);
 	return status;
 }
 /*
  * Create the symbolic link named by @dentry in directory @dir, pointing
  * at the string @symname.
  *
  * A target whose length including the null terminator fits within
  * ocfs2_fast_symlink_chars(sb) is stored inline in the dinode
  * (fe->id2.i_symlink). Longer targets get one cluster allocated and
  * are written out by ocfs2_create_symlink_data().
  *
  * Returns 0 on success or a negative errno. If the parent's meta lock
  * cannot be taken the function returns immediately; every later
  * failure goes through the "bail" label, which commits the transaction
  * (if one was started), drops the parent lock and releases all buffers
  * and allocation contexts.
  */
 static int ocfs2_symlink(struct inode *dir,
 			 struct dentry *dentry,
 			 const char *symname)
 {
 	int status, l, credits;
 	u64 newsize;
 	struct ocfs2_super *osb = NULL;
 	struct inode *inode = NULL;
 	struct super_block *sb;
 	struct buffer_head *new_fe_bh = NULL;
 	struct buffer_head *de_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_dinode *dirfe;
 	handle_t *handle = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
 	sb = dir->i_sb;
 	osb = OCFS2_SB(sb);
 	/* l counts the terminating null byte; the on-disk size (newsize)
 	 * below does not. */
 	l = strlen(symname) + 1;
 	credits = ocfs2_calc_symlink_credits(sb);
 	/* lock the parent directory */
 	status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		/* Return directly: "bail" unconditionally unlocks dir. */
 		return status;
 	}
 	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
 	if (!dirfe->i_links_count) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
 		goto bail;
 	}
 	/* NOTE(review): presumably fails if the name already exists in
 	 * dir -- confirm against ocfs2_check_dir_for_entry(). */
 	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
 					   dentry->d_name.len);
 	if (status)
 		goto bail;
 	/* de_bh receives the directory block the new entry will go in. */
 	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					      dentry->d_name.name,
 					      dentry->d_name.len, &de_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	status = ocfs2_reserve_new_inode(osb, &inode_ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
 		goto bail;
 	}
 	/* don't reserve bitmap space for fast symlinks. */
 	if (l > ocfs2_fast_symlink_chars(sb)) {
 		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
 			goto bail;
 		}
 	}
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		/* Clear handle so "bail" does not try to commit it. */
 		handle = NULL;
 		mlog_errno(status);
 		goto bail;
 	}
 	/* Create the new inode as a symlink with mode S_IFLNK | S_IRWXUGO. */
 	status = ocfs2_mknod_locked(osb, dir, dentry,
 				    S_IFLNK | S_IRWXUGO, 0,
 				    &new_fe_bh, parent_fe_bh, handle,
 				    &inode, inode_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
 	inode->i_rdev = 0;
 	newsize = l - 1;
 	if (l > ocfs2_fast_symlink_chars(sb)) {
 		/* Regular symlink: allocate one cluster for the target. */
 		u32 offset = 0;
 		inode->i_op = &ocfs2_symlink_inode_operations;
 		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
 						    new_fe_bh,
 						    handle, data_ac, NULL,
 						    NULL);
 		if (status < 0) {
 			/* Anything other than -ENOSPC / -EINTR is logged
 			 * and then reported to the caller as -ENOSPC. */
 			if (status != -ENOSPC && status != -EINTR) {
 				mlog(ML_ERROR,
 				     "Failed to extend file to %llu\n",
 				     (unsigned long long)newsize);
 				mlog_errno(status);
 				status = -ENOSPC;
 			}
 			goto bail;
 		}
 		i_size_write(inode, newsize);
 		inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
 	} else {
 		/* Fast symlink: the target (with its null terminator) is
 		 * stored directly in the dinode. */
 		inode->i_op = &ocfs2_fast_symlink_inode_operations;
 		memcpy((char *) fe->id2.i_symlink, symname, l);
 		i_size_write(inode, newsize);
 		inode->i_blocks = 0;
 	}
 	status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	if (!ocfs2_inode_is_fast_symlink(inode)) {
 		status = ocfs2_create_symlink_data(osb, handle, inode,
 						   symname);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 	}
 	/* Link the new inode into the parent directory. */
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
 				 de_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
 	if (status) {
 		mlog_errno(status);
 		goto bail;
 	}
 	insert_inode_hash(inode);
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 	ocfs2_meta_unlock(dir, 1);
 	if (new_fe_bh)
 		brelse(new_fe_bh);
 	if (parent_fe_bh)
 		brelse(parent_fe_bh);
 	if (de_bh)
 		brelse(de_bh);
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 	/* On failure, drop the reference to the inode created above. */
 	if ((status < 0) && inode)
 		iput(inode);
 	mlog_exit(status);
 	return status;
 }
 /*
  * Validate the record length of directory entry @de, which lives in
  * block @bh of directory @dir at byte @offset (used for logging only).
  *
  * Returns 1 when the entry looks sane, 0 (after logging the reason)
  * when it does not.
  */
 int ocfs2_check_dir_entry(struct inode * dir,
 			  struct ocfs2_dir_entry * de,
 			  struct buffer_head * bh,
 			  unsigned long offset)
 {
 	const int rec_len = le16_to_cpu(de->rec_len);
 	const char *reason;

 	if (rec_len < OCFS2_DIR_REC_LEN(1)) {
 		reason = "rec_len is smaller than minimal";
 	} else if (rec_len % 4 != 0) {
 		reason = "rec_len % 4 != 0";
 	} else if (rec_len < OCFS2_DIR_REC_LEN(de->name_len)) {
 		reason = "rec_len is too small for name_len";
 	} else if (((char *) de - bh->b_data) + rec_len >
 		   dir->i_sb->s_blocksize) {
 		reason = "directory entry across blocks";
 	} else {
 		/* Every check passed. */
 		return 1;
 	}

 	mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
 	     "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
 	     (unsigned long long)OCFS2_I(dir)->ip_blkno, reason,
 	     offset, (unsigned long long)le64_to_cpu(de->inode), rec_len,
 	     de->name_len);
 	return 0;
 }
 /* we don't always have a dentry for what we want to add, so people
  * like orphan dir can call this instead.
  *
  * Walks the directory entries in insert_bh looking for either an
  * unused record that is large enough, or a used record with enough
  * slack after its name to be split, and stores the new name there.
  *
  * handle:       running transaction the changes are journaled under
  * dir:          directory receiving the entry
  * name/namelen: name to add (namelen must be non-zero)
  * inode:        inode the entry refers to; only its i_mode is read,
  *               and only when blkno is non-zero
  * blkno:        block number stored in the entry (0 leaves it unused)
  * parent_fe_bh: dinode buffer of dir, dirtied for the m/ctime update
  * insert_bh:    directory block to insert into (must not be NULL)
  *
  * Returns 0 on success, -EINVAL for an empty name, -ENOENT if a
  * corrupt entry is encountered, -EEXIST if the name is already in the
  * block, or the negative error from the inode-dirty / journal calls.
  */
 static int __ocfs2_add_entry(handle_t *handle,
 			     struct inode *dir,
 			     const char *name, int namelen,
 			     struct inode *inode, u64 blkno,
 			     struct buffer_head *parent_fe_bh,
 			     struct buffer_head *insert_bh)
 {
 	unsigned long offset;
 	unsigned short rec_len;
 	struct ocfs2_dir_entry *de, *de1;
 	struct super_block *sb;
 	int retval;
 	mlog_entry_void();
 	sb = dir->i_sb;
 	if (!namelen) {
 		/* Leave through bail so mlog_entry/mlog_exit stay paired. */
 		retval = -EINVAL;
 		goto bail;
 	}
 	rec_len = OCFS2_DIR_REC_LEN(namelen);
 	offset = 0;
 	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
 	while (1) {
 		/* Walking off the end of the block means nothing fit. */
 		BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
 		/* These checks should've already been passed by the
 		 * prepare function, but I guess we can leave them
 		 * here anyway. */
 		if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
 			retval = -ENOENT;
 			goto bail;
 		}
 		if (ocfs2_match(namelen, name, de)) {
 			retval = -EEXIST;
 			goto bail;
 		}
 		/* Usable if the record is unused and big enough, or used
 		 * but with room after its own name for our record. */
 		if (((le64_to_cpu(de->inode) == 0) &&
 		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
 		    (le16_to_cpu(de->rec_len) >=
 		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
 			if (retval < 0) {
 				mlog_errno(retval);
 				goto bail;
 			}
 			/* The block must not be modified unless the journal
 			 * granted write access to it. */
 			retval = ocfs2_journal_access(handle, dir, insert_bh,
 						      OCFS2_JOURNAL_ACCESS_WRITE);
 			if (retval < 0) {
 				mlog_errno(retval);
 				goto bail;
 			}
 			if (le64_to_cpu(de->inode)) {
 				/* Split: shrink the live record to its
 				 * minimum and use the tail for ours. */
 				de1 = (struct ocfs2_dir_entry *)((char *) de +
 					OCFS2_DIR_REC_LEN(de->name_len));
 				de1->rec_len =
 					cpu_to_le16(le16_to_cpu(de->rec_len) -
 					OCFS2_DIR_REC_LEN(de->name_len));
 				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
 				de = de1;
 			}
 			de->file_type = OCFS2_FT_UNKNOWN;
 			if (blkno) {
 				de->inode = cpu_to_le64(blkno);
 				ocfs2_set_de_type(de, inode->i_mode);
 			} else
 				de->inode = 0;
 			de->name_len = namelen;
 			memcpy(de->name, name, namelen);
 			dir->i_version++;
 			/* Report a failed journal_dirty instead of
 			 * claiming success. */
 			retval = ocfs2_journal_dirty(handle, insert_bh);
 			if (retval < 0) {
 				mlog_errno(retval);
 				goto bail;
 			}
 			retval = 0;
 			goto bail;
 		}
 		offset += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
 	}
 	/* when you think about it, the assert above should prevent us
 	 * from ever getting here. */
 	retval = -ENOSPC;
 bail:
 	mlog_exit(retval);
 	return retval;
 }
 /*
  * ocfs2_delete_entry removes directory entry @de_del from block @bh of
  * @dir. The entry's space is folded into the record that precedes it;
  * when it is the first record in the block its inode field is cleared
  * instead.
  *
  * Returns the result of ocfs2_journal_dirty() when the entry was
  * found, -EIO if a corrupt record is met or journal access fails, and
  * -ENOENT if @de_del is not in the block.
  */
 static int ocfs2_delete_entry(handle_t *handle,
 			      struct inode *dir,
 			      struct ocfs2_dir_entry *de_del,
 			      struct buffer_head *bh)
 {
 	struct ocfs2_dir_entry *cur, *prev = NULL;
 	int pos = 0;
 	int status = -ENOENT;

 	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);

 	cur = (struct ocfs2_dir_entry *) bh->b_data;
 	while (pos < bh->b_size) {
 		if (!ocfs2_check_dir_entry(dir, cur, bh, pos)) {
 			status = -EIO;
 			mlog_errno(status);
 			break;
 		}

 		if (cur != de_del) {
 			/* Not ours: step to the next record. */
 			const int step = le16_to_cpu(cur->rec_len);

 			pos += step;
 			prev = cur;
 			cur = (struct ocfs2_dir_entry *)((char *)cur + step);
 			continue;
 		}

 		status = ocfs2_journal_access(handle, dir, bh,
 					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			status = -EIO;
 			mlog_errno(status);
 			break;
 		}

 		if (prev)
 			prev->rec_len =
 				cpu_to_le16(le16_to_cpu(prev->rec_len) +
 					    le16_to_cpu(cur->rec_len));
 		else
 			cur->inode = 0;

 		dir->i_version++;
 		status = ocfs2_journal_dirty(handle, bh);
 		break;
 	}

 	mlog_exit(status);
 	return status;
 }
 /*
  * Scan one directory block for @name.
  *
  * Returns 1 and stores the entry in *res_dir when the name is found,
  * 0 when the block does not contain it, and -1 when the block is bad
  * (the matching entry fails validation, or a zero rec_len is hit).
  */
 static int inline ocfs2_search_dirblock(struct buffer_head *bh,
 					struct inode *dir,
 					const char *name, int namelen,
 					unsigned long offset,
 					struct ocfs2_dir_entry **res_dir)
 {
 	char *cursor, *block_end;
 	int ret = 0;

 	mlog_entry_void();

 	cursor = bh->b_data;
 	block_end = cursor + dir->i_sb->s_blocksize;

 	while (cursor < block_end) {
 		struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *) cursor;
 		int step;

 		/* This loop is hot, so only a cheap bounds test is done
 		 * before comparing names. */
 		if (cursor + namelen <= block_end &&
 		    ocfs2_match(namelen, name, de)) {
 			/* A candidate: now run the full validation. */
 			if (ocfs2_check_dir_entry(dir, de, bh, offset)) {
 				*res_dir = de;
 				ret = 1;
 			} else {
 				ret = -1;
 			}
 			break;
 		}

 		/* A zero-length record would make us spin forever. */
 		step = le16_to_cpu(de->rec_len);
 		if (step <= 0) {
 			ret = -1;
 			break;
 		}

 		cursor += step;
 		offset += step;
 	}

 	mlog_exit(ret);
 	return ret;
 }
 /*
  * Look up @name (of length @namelen) in directory @dir.
  *
  * The search starts at the block recorded in ip_dir_start_lookup and
  * wraps around to block 0, reading blocks in batches of up to
  * NAMEI_RA_SIZE through ocfs2_bread().
  *
  * On a hit, *res_dir points at the entry and the buffer_head holding
  * it is returned; the caller is expected to brelse() it. On a miss, or
  * when a block fails validation, NULL is returned and *res_dir is left
  * NULL.
  */
 struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
 				     struct inode *dir,
 				     struct ocfs2_dir_entry **res_dir)
 {
 	struct super_block *sb;
 	struct buffer_head *bh_use[NAMEI_RA_SIZE];
 	struct buffer_head *bh, *ret = NULL;
 	unsigned long start, block, b;
 	int ra_max = 0;		/* Number of bh's in the readahead
 				   buffer, bh_use[] */
 	int ra_ptr = 0;		/* Current index into readahead
 				   buffer */
 	int num = 0;		/* Count of blocks requested so far */
 	int nblocks, i, err;
 	mlog_entry_void();
 	*res_dir = NULL;
 	sb = dir->i_sb;
 	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
 	/* Resume where the last successful lookup ended, if still valid. */
 	start = OCFS2_I(dir)->ip_dir_start_lookup;
 	if (start >= nblocks)
 		start = 0;
 	block = start;
 restart:
 	do {
 		/*
 		 * We deal with the read-ahead logic here.
 		 */
 		if (ra_ptr >= ra_max) {
 			/* Refill the readahead buffer */
 			ra_ptr = 0;
 			b = block;
 			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
 				/*
 				 * Terminate if we reach the end of the
 				 * directory and must wrap, or if our
 				 * search has finished at this block.
 				 */
 				if (b >= nblocks || (num && block == start)) {
 					bh_use[ra_max] = NULL;
 					break;
 				}
 				num++;
 				/* err is not examined; a failed read shows
 				 * up below as a NULL or !uptodate bh.
 				 * NOTE(review): the final argument (1)
 				 * presumably requests readahead -- confirm
 				 * against ocfs2_bread(). */
 				bh = ocfs2_bread(dir, b++, &err, 1);
 				bh_use[ra_max] = bh;
 			}
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 			goto next;
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* read error, skip block & hope for the best */
 			ocfs2_error(dir->i_sb, "reading directory %llu, "
 				    "offset %lu\n",
 				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
 				    block);
 			brelse(bh);
 			goto next;
 		}
 		i = ocfs2_search_dirblock(bh, dir, name, namelen,
 					  block << sb->s_blocksize_bits,
 					  res_dir);
 		if (i == 1) {
 			/* Found: remember this block for the next lookup and
 			 * hand the still-referenced bh to the caller. */
 			OCFS2_I(dir)->ip_dir_start_lookup = block;
 			ret = bh;
 			goto cleanup_and_exit;
 		} else {
 			brelse(bh);
 			/* A negative result means a bad block: give up. */
 			if (i < 0)
 				goto cleanup_and_exit;
 		}
 	next:
 		if (++block >= nblocks)
 			block = 0;
 	} while (block != start);
 	/*
 	 * If the directory has grown while we were searching, then
 	 * search the last part of the directory before giving up.
 	 */
 	block = nblocks;
 	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
 	if (block < nblocks) {
 		start = 0;
 		goto restart;
 	}
 cleanup_and_exit:
 	/* Clean up the read-ahead blocks */
 	for (; ra_ptr < ra_max; ra_ptr++)
 		brelse(bh_use[ra_ptr]);
 	mlog_exit_ptr(ret);
 	return ret;
 }
 /*
  * Format @blkno as a zero-padded, 16 digit hexadecimal string in
  * @name, which must have room for OCFS2_ORPHAN_NAMELEN + 1 bytes.
  *
  * Returns 0 on success. A negative snprintf() result is passed
  * through; any other length than OCFS2_ORPHAN_NAMELEN gives -EINVAL.
  */
 static int ocfs2_blkno_stringify(u64 blkno, char *name)
 {
 	int len, ret;

 	mlog_entry_void();

 	len = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
 		       (long long)blkno);

 	if (len < 0)
 		ret = len;
 	else if (len == 0 || len != OCFS2_ORPHAN_NAMELEN)
 		ret = -EINVAL;
 	else
 		ret = 0;

 	if (ret)
 		mlog_errno(ret);
 	else
 		mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
 		     len);

 	mlog_exit(ret);
 	return ret;
 }
 /*
  * Get this node's orphan directory ready to receive an entry for
  * @inode.
  *
  * The orphan name is written into @name. On success *ret_orphan_dir is
  * set and the orphan directory is returned with its i_mutex and meta
  * lock still held, and @de_bh is filled in by
  * ocfs2_prepare_dir_for_insert(). On failure the locks and the inode
  * reference taken here are dropped again.
  *
  * Returns 0 on success or a negative errno.
  */
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
 				    struct inode *inode,
 				    char *name,
 				    struct buffer_head **de_bh)
 {
 	struct buffer_head *dir_bh = NULL;
 	struct inode *orphan_dir;
 	int ret;

 	ret = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}

 	orphan_dir = ocfs2_get_system_file_inode(osb,
 						 ORPHAN_DIR_SYSTEM_INODE,
 						 osb->slot_num);
 	if (!orphan_dir) {
 		ret = -ENOENT;
 		mlog_errno(ret);
 		return ret;
 	}

 	mutex_lock(&orphan_dir->i_mutex);

 	ret = ocfs2_meta_lock(orphan_dir, &dir_bh, 1);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto leave;
 	}

 	ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir, dir_bh, name,
 					   OCFS2_ORPHAN_NAMELEN, de_bh);
 	if (ret < 0) {
 		/* The meta lock was taken above; undo it here. */
 		ocfs2_meta_unlock(orphan_dir, 1);
 		mlog_errno(ret);
 		goto leave;
 	}

 	*ret_orphan_dir = orphan_dir;

 leave:
 	if (ret) {
 		mutex_unlock(&orphan_dir->i_mutex);
 		iput(orphan_dir);
 	}
 	if (dir_bh)
 		brelse(dir_bh);

 	mlog_exit(ret);
 	return ret;
 }
 /*
  * Add @inode to the orphan directory @orphan_dir_inode under @name and
  * mark its dinode @fe as orphaned.
  *
  * handle:           running transaction
  * inode:            inode being orphaned
  * fe:               dinode of @inode; i_flags and i_orphaned_slot are
  *                   updated here (the caller owns its journal access)
  * name:             orphan name, OCFS2_ORPHAN_NAMELEN characters
  * de_bh:            directory block to insert the entry into
  * orphan_dir_inode: orphan directory
  *
  * When @inode is a directory the orphan dir's link count is bumped.
  *
  * Returns 0 on success or a negative errno from the block read,
  * journal or directory-insert helpers.
  */
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
 			    struct ocfs2_dinode *fe,
 			    char *name,
 			    struct buffer_head *de_bh,
 			    struct inode *orphan_dir_inode)
 {
 	struct buffer_head *orphan_dir_bh = NULL;
 	int status = 0;
 	struct ocfs2_dinode *orphan_fe;
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 	/* Read the orphan dir's own dinode block. */
 	status = ocfs2_read_block(osb,
 				  OCFS2_I(orphan_dir_inode)->ip_blkno,
 				  &orphan_dir_bh, OCFS2_BH_CACHED,
 				  orphan_dir_inode);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	/* we're a cluster, and nlink can change on disk from
 	 * underneath us... */
 	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
 	if (S_ISDIR(inode->i_mode))
 		le16_add_cpu(&orphan_fe->i_links_count, 1);
 	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
 	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
 				   OCFS2_ORPHAN_NAMELEN, inode,
 				   OCFS2_I(inode)->ip_blkno,
 				   orphan_dir_bh, de_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 	/* Set the flag with a bitwise OR: an arithmetic add would carry
 	 * into the neighbouring flag bits if it were already set. */
 	fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
 	/* Record which orphan dir our inode now resides
 	 * in. delete_inode will use this to determine which orphan
 	 * dir to lock. */
 	fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
 	mlog(0, "Inode %llu orphaned in slot %d\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
 leave:
 	if (orphan_dir_bh)
 		brelse(orphan_dir_bh);
 	mlog_exit(status);
 	return status;
 }
 /*
  * Remove @inode's entry from the orphan directory @orphan_dir_inode.
  *
  * unlike orphan_add, we expect the orphan dir to already be locked
  * here, and its dinode buffer is passed in as @orphan_dir_bh. When
  * @inode is a directory the orphan dir's link count is dropped by one.
  *
  * Returns 0 on success, -ENOENT if the entry cannot be found, or a
  * negative errno from the journal / directory helpers.
  */
 int ocfs2_orphan_del(struct ocfs2_super *osb,
 		     handle_t *handle,
 		     struct inode *orphan_dir_inode,
 		     struct inode *inode,
 		     struct buffer_head *orphan_dir_bh)
 {
 	char name[OCFS2_ORPHAN_NAMELEN + 1];
 	struct ocfs2_dir_entry *entry = NULL;
 	struct buffer_head *entry_bh = NULL;
 	struct ocfs2_dinode *dir_di;
 	int ret = 0;

 	mlog_entry_void();

 	ret = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}

 	mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n",
 	     name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
 	     OCFS2_ORPHAN_NAMELEN);

 	/* Locate the entry inside the orphan directory. */
 	entry_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
 				    orphan_dir_inode, &entry);
 	if (!entry_bh) {
 		ret = -ENOENT;
 		mlog_errno(ret);
 		goto out;
 	}

 	/* Take it out of the directory block. */
 	ret = ocfs2_delete_entry(handle, orphan_dir_inode, entry, entry_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}

 	ret = ocfs2_journal_access(handle,orphan_dir_inode,  orphan_dir_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}

 	/* Keep the in-memory link count in step with the on-disk one. */
 	dir_di = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
 	if (S_ISDIR(inode->i_mode))
 		le16_add_cpu(&dir_di->i_links_count, -1);
 	orphan_dir_inode->i_nlink = le16_to_cpu(dir_di->i_links_count);

 	ret = ocfs2_journal_dirty(handle, orphan_dir_bh);
 	if (ret < 0)
 		mlog_errno(ret);

 out:
 	if (entry_bh)
 		brelse(entry_bh);

 	mlog_exit(ret);
 	return ret;
 }
 /*
  * Inode operations table for ocfs2 directories. Note that .unlink and
  * .rmdir are both served by ocfs2_unlink().
  */
 const struct inode_operations ocfs2_dir_iops = {
 	.create		= ocfs2_create,
 	.lookup		= ocfs2_lookup,
 	.link		= ocfs2_link,
 	.unlink		= ocfs2_unlink,
 	.rmdir		= ocfs2_unlink,
 	.symlink	= ocfs2_symlink,
 	.mkdir		= ocfs2_mkdir,
 	.mknod		= ocfs2_mknod,
 	.rename		= ocfs2_rename,
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
 };