Commit 7307de80510a70e5e5aa98de1e80ccbb7d90a3a8
1 parent
607d44aa3f
Exists in
master
and in
20 other branches
ocfs2: shared writeable mmap
Implement cluster-consistent shared writeable mappings using the ->page_mkwrite() callback.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Showing 4 changed files with 200 additions and 39 deletions Inline Diff
fs/ocfs2/aops.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 4 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public | 7 | * modify it under the terms of the GNU General Public |
8 | * License as published by the Free Software Foundation; either | 8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. | 9 | * version 2 of the License, or (at your option) any later version. |
10 | * | 10 | * |
11 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * General Public License for more details. | 14 | * General Public License for more details. |
15 | * | 15 | * |
16 | * You should have received a copy of the GNU General Public | 16 | * You should have received a copy of the GNU General Public |
17 | * License along with this program; if not, write to the | 17 | * License along with this program; if not, write to the |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
19 | * Boston, MA 021110-1307, USA. | 19 | * Boston, MA 021110-1307, USA. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <asm/byteorder.h> | 26 | #include <asm/byteorder.h> |
27 | #include <linux/swap.h> | 27 | #include <linux/swap.h> |
28 | #include <linux/pipe_fs_i.h> | 28 | #include <linux/pipe_fs_i.h> |
29 | 29 | ||
30 | #define MLOG_MASK_PREFIX ML_FILE_IO | 30 | #define MLOG_MASK_PREFIX ML_FILE_IO |
31 | #include <cluster/masklog.h> | 31 | #include <cluster/masklog.h> |
32 | 32 | ||
33 | #include "ocfs2.h" | 33 | #include "ocfs2.h" |
34 | 34 | ||
35 | #include "alloc.h" | 35 | #include "alloc.h" |
36 | #include "aops.h" | 36 | #include "aops.h" |
37 | #include "dlmglue.h" | 37 | #include "dlmglue.h" |
38 | #include "extent_map.h" | 38 | #include "extent_map.h" |
39 | #include "file.h" | 39 | #include "file.h" |
40 | #include "inode.h" | 40 | #include "inode.h" |
41 | #include "journal.h" | 41 | #include "journal.h" |
42 | #include "suballoc.h" | 42 | #include "suballoc.h" |
43 | #include "super.h" | 43 | #include "super.h" |
44 | #include "symlink.h" | 44 | #include "symlink.h" |
45 | 45 | ||
46 | #include "buffer_head_io.h" | 46 | #include "buffer_head_io.h" |
47 | 47 | ||
/*
 * get_block callback for (non-fast) symlink inodes.
 *
 * Maps logical block @iblock of the symlink's data to its on-disk
 * location, taken from the inode's first extent record. Symlink data
 * is never created through the page cache, so for a freshly created
 * inode the data may only exist in the (journaled) buffer cache; in
 * that case it is copied into the target page here.
 *
 * Returns 0 on success and -EIO on any failure.
 */
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	int err = -EIO;
	int status;
	struct ocfs2_dinode *fe = NULL;
	struct buffer_head *bh = NULL;
	struct buffer_head *buffer_cache_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	void *kaddr;

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	/* Fast symlinks store their target inline in the dinode and
	 * must never come through here. */
	BUG_ON(ocfs2_inode_is_fast_symlink(inode));

	/* A symlink target can be at most PATH_MAX bytes, so any block
	 * offset beyond that is bogus. */
	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
		     (unsigned long long)iblock);
		goto bail;
	}

	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				  OCFS2_I(inode)->ip_blkno,
				  &bh, OCFS2_BH_CACHED, inode);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

	if (!OCFS2_IS_VALID_DINODE(fe)) {
		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
		     fe->i_signature);
		goto bail;
	}

	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
						    le32_to_cpu(fe->i_clusters))) {
		mlog(ML_ERROR, "block offset is outside the allocated size: "
		     "%llu\n", (unsigned long long)iblock);
		goto bail;
	}

	/* We don't use the page cache to create symlink data, so if
	 * need be, copy it over from the buffer cache. */
	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
			    iblock;
		buffer_cache_bh = sb_getblk(osb->sb, blkno);
		if (!buffer_cache_bh) {
			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
			goto bail;
		}

		/* we haven't locked out transactions, so a commit
		 * could've happened. Since we've got a reference on
		 * the bh, even if it commits while we're doing the
		 * copy, the data is still good. */
		if (buffer_jbd(buffer_cache_bh)
		    && ocfs2_inode_is_new(inode)) {
			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
			if (!kaddr) {
				mlog(ML_ERROR, "couldn't kmap!\n");
				goto bail;
			}
			memcpy(kaddr + (bh_result->b_size * iblock),
			       buffer_cache_bh->b_data,
			       bh_result->b_size);
			kunmap_atomic(kaddr, KM_USER0);
			set_buffer_uptodate(bh_result);
		}
		brelse(buffer_cache_bh);
	}

	map_bh(bh_result, inode->i_sb,
	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);

	err = 0;

bail:
	if (bh)
		brelse(bh);

	mlog_exit(err);
	return err;
}
136 | 136 | ||
/*
 * Standard get_block callback for regular file data.
 *
 * Maps logical block @iblock of @inode to its physical location via
 * the extent map. Never allocates (see the BH_New comment below);
 * unwritten extents are reported as holes so the generic code zeroes
 * them. On a non-sparse filesystem a missing mapping is a hard error.
 *
 * Returns 0 on success, -EIO on failure (all negative errors are
 * collapsed to -EIO at the "bail" label).
 */
static int ocfs2_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create)
{
	int err = 0;
	unsigned int ext_flags;
	u64 p_blkno, past_eof;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
					  &ext_flags);
	if (err) {
		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
		     (unsigned long long)p_blkno);
		goto bail;
	}

	/*
	 * ocfs2 never allocates in this function - the only time we
	 * need to use BH_New is when we're extending i_size on a file
	 * system which doesn't support holes, in which case BH_New
	 * allows block_prepare_write() to zero.
	 */
	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
			"ino %lu, iblock %llu\n", inode->i_ino,
			(unsigned long long)iblock);

	/* Treat the unwritten extent as a hole for zeroing purposes. */
	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
		map_bh(bh_result, inode->i_sb, p_blkno);

	if (!ocfs2_sparse_alloc(osb)) {
		/* Without sparse file support every in-range block must
		 * already be allocated - a hole here means corruption. */
		if (p_blkno == 0) {
			err = -EIO;
			mlog(ML_ERROR,
			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
			     (unsigned long long)iblock,
			     (unsigned long long)p_blkno,
			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
			dump_stack();
		}

		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
		     (unsigned long long)past_eof);

		if (create && (iblock >= past_eof))
			set_buffer_new(bh_result);
	}

bail:
	if (err < 0)
		err = -EIO;

	mlog_exit(err);
	return err;
}
208 | 208 | ||
/*
 * ->readpage callback.
 *
 * Takes the cluster locks needed for a coherent read: the meta lock
 * (shared), then ip_alloc_sem, then the data lock (shared), and fills
 * the page via block_read_full_page(). A helper returning
 * AOP_TRUNCATED_PAGE has already unlocked the page, so "unlock"
 * tracks whether we must unlock it ourselves on the way out.
 */
static int ocfs2_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int ret, unlock = 1;

	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));

	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out;
	}

	/* Can't block here with the page locked; ask the caller to
	 * retry instead. */
	if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) {
		ret = AOP_TRUNCATED_PAGE;
		goto out_meta_unlock;
	}

	/*
	 * i_size might have just been updated as we grabed the meta lock. We
	 * might now be discovering a truncate that hit on another node.
	 * block_read_full_page->get_block freaks out if it is asked to read
	 * beyond the end of a file, so we check here. Callers
	 * (generic_file_read, fault->nopage) are clever enough to check i_size
	 * and notice that the page they just read isn't needed.
	 *
	 * XXX sys_readahead() seems to get that wrong?
	 */
	if (start >= i_size_read(inode)) {
		zero_user_page(page, 0, PAGE_SIZE, KM_USER0);
		SetPageUptodate(page);
		ret = 0;
		goto out_alloc;
	}

	ret = ocfs2_data_lock_with_page(inode, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out_alloc;
	}

	/* block_read_full_page() unlocks the page for us. */
	ret = block_read_full_page(page, ocfs2_get_block);
	unlock = 0;

	ocfs2_data_unlock(inode, 0);
out_alloc:
	up_read(&OCFS2_I(inode)->ip_alloc_sem);
out_meta_unlock:
	ocfs2_meta_unlock(inode, 0);
out:
	if (unlock)
		unlock_page(page);
	mlog_exit(ret);
	return ret;
}
269 | 269 | ||
/* Note: Because we don't support holes, our allocation has
 * already happened (allocation writes zeros to the file data)
 * so we don't have to worry about ordered writes in
 * ocfs2_writepage.
 *
 * ->writepage is called during the process of invalidating the page cache
 * during blocked lock processing.  It can't block on any cluster locks
 * to during block mapping.  It's relying on the fact that the block
 * mapping can't have disappeared under the dirty pages that it is
 * being asked to write back.
 */
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
	int ret;

	mlog_entry("(0x%p)\n", page);

	/* No cluster locking here - see the comment above. */
	ret = block_write_full_page(page, ocfs2_get_block, wbc);

	mlog_exit(ret);

	return ret;
}
293 | 293 | ||
/*
 * This is called from ocfs2_write_zero_page() which has handled it's
 * own cluster locking and has ensured allocation exists for those
 * blocks to be written.
 *
 * Only ip_alloc_sem is taken here, to keep the extent map stable
 * while block_prepare_write() maps buffers via ocfs2_get_block().
 * Returns the result of block_prepare_write().
 */
int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
			       unsigned from, unsigned to)
{
	int ret;

	down_read(&OCFS2_I(inode)->ip_alloc_sem);

	ret = block_prepare_write(page, from, to, ocfs2_get_block);

	up_read(&OCFS2_I(inode)->ip_alloc_sem);

	return ret;
}
312 | 312 | ||
/* Taken from ext3. We don't necessarily need the full blown
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
 * their fixes when they happen) --Mark
 *
 * Applies @fn to every buffer of @head's page that overlaps the byte
 * range [@from, @to).  Buffers entirely outside the range are skipped;
 * if any skipped buffer is not uptodate and @partial is non-NULL,
 * *@partial is set to 1.  Returns the first non-zero value returned
 * by @fn (iteration stops after that), otherwise 0.
 */
int walk_page_buffers(	handle_t *handle,
			struct buffer_head *head,
			unsigned from,
			unsigned to,
			int *partial,
			int (*fn)(	handle_t *handle,
					struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (	bh = head, block_start = 0;
		ret == 0 && (bh != head || !block_start);
		block_start = block_end, bh = next)
	{
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
348 | 348 | ||
/*
 * Start a journal transaction for a page write.  In ordered-data mode
 * the page's buffers in [@from, @to) are additionally dirtied to the
 * journal via walk_page_buffers()/ocfs2_journal_dirty_data.
 *
 * Returns the new handle on success; on failure any started
 * transaction is committed and an ERR_PTR-encoded error is returned.
 */
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
				      struct page *page,
				      unsigned from,
				      unsigned to)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;
	int ret = 0;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (!handle) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	if (ocfs2_should_order_data(inode)) {
		ret = walk_page_buffers(handle,
					page_buffers(page),
					from, to, NULL,
					ocfs2_journal_dirty_data);
		if (ret < 0)
			mlog_errno(ret);
	}
out:
	if (ret) {
		if (handle)
			ocfs2_commit_trans(osb, handle);
		handle = ERR_PTR(ret);
	}
	return handle;
}
381 | 381 | ||
/*
 * ->bmap callback: translate logical @block to a physical block
 * number.  Cluster-locks the inode (except for journal system files,
 * which are node-local) around the extent map lookup.
 *
 * Returns the physical block number, or 0 on any error (bmap has no
 * way to report errors).
 */
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
	sector_t status;
	u64 p_blkno = 0;
	int err = 0;
	struct inode *inode = mapping->host;

	mlog_entry("(block = %llu)\n", (unsigned long long)block);

	/* We don't need to lock journal system files, since they aren't
	 * accessed concurrently from multiple nodes.
	 */
	if (!INODE_JOURNAL(inode)) {
		err = ocfs2_meta_lock(inode, NULL, 0);
		if (err) {
			if (err != -ENOENT)
				mlog_errno(err);
			goto bail;
		}
		down_read(&OCFS2_I(inode)->ip_alloc_sem);
	}

	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);

	if (!INODE_JOURNAL(inode)) {
		up_read(&OCFS2_I(inode)->ip_alloc_sem);
		ocfs2_meta_unlock(inode, 0);
	}

	if (err) {
		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
		     (unsigned long long)block);
		mlog_errno(err);
		goto bail;
	}


bail:
	status = err ? 0 : p_blkno;

	mlog_exit((int)status);

	return status;
}
426 | 426 | ||
427 | /* | 427 | /* |
428 | * TODO: Make this into a generic get_blocks function. | 428 | * TODO: Make this into a generic get_blocks function. |
429 | * | 429 | * |
430 | * From do_direct_io in direct-io.c: | 430 | * From do_direct_io in direct-io.c: |
431 | * "So what we do is to permit the ->get_blocks function to populate | 431 | * "So what we do is to permit the ->get_blocks function to populate |
432 | * bh.b_size with the size of IO which is permitted at this offset and | 432 | * bh.b_size with the size of IO which is permitted at this offset and |
433 | * this i_blkbits." | 433 | * this i_blkbits." |
434 | * | 434 | * |
435 | * This function is called directly from get_more_blocks in direct-io.c. | 435 | * This function is called directly from get_more_blocks in direct-io.c. |
436 | * | 436 | * |
437 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | 437 | * called like this: dio->get_blocks(dio->inode, fs_startblk, |
438 | * fs_count, map_bh, dio->rw == WRITE); | 438 | * fs_count, map_bh, dio->rw == WRITE); |
439 | */ | 439 | */ |
440 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | 440 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, |
441 | struct buffer_head *bh_result, int create) | 441 | struct buffer_head *bh_result, int create) |
442 | { | 442 | { |
443 | int ret; | 443 | int ret; |
444 | u64 p_blkno, inode_blocks, contig_blocks; | 444 | u64 p_blkno, inode_blocks, contig_blocks; |
445 | unsigned int ext_flags; | 445 | unsigned int ext_flags; |
446 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; | 446 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; |
447 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; | 447 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; |
448 | 448 | ||
449 | /* This function won't even be called if the request isn't all | 449 | /* This function won't even be called if the request isn't all |
450 | * nicely aligned and of the right size, so there's no need | 450 | * nicely aligned and of the right size, so there's no need |
451 | * for us to check any of that. */ | 451 | * for us to check any of that. */ |
452 | 452 | ||
453 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 453 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
454 | 454 | ||
455 | /* | 455 | /* |
456 | * Any write past EOF is not allowed because we'd be extending. | 456 | * Any write past EOF is not allowed because we'd be extending. |
457 | */ | 457 | */ |
458 | if (create && (iblock + max_blocks) > inode_blocks) { | 458 | if (create && (iblock + max_blocks) > inode_blocks) { |
459 | ret = -EIO; | 459 | ret = -EIO; |
460 | goto bail; | 460 | goto bail; |
461 | } | 461 | } |
462 | 462 | ||
463 | /* This figures out the size of the next contiguous block, and | 463 | /* This figures out the size of the next contiguous block, and |
464 | * our logical offset */ | 464 | * our logical offset */ |
465 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, | 465 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
466 | &contig_blocks, &ext_flags); | 466 | &contig_blocks, &ext_flags); |
467 | if (ret) { | 467 | if (ret) { |
468 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | 468 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", |
469 | (unsigned long long)iblock); | 469 | (unsigned long long)iblock); |
470 | ret = -EIO; | 470 | ret = -EIO; |
471 | goto bail; | 471 | goto bail; |
472 | } | 472 | } |
473 | 473 | ||
474 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { | 474 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { |
475 | ocfs2_error(inode->i_sb, | 475 | ocfs2_error(inode->i_sb, |
476 | "Inode %llu has a hole at block %llu\n", | 476 | "Inode %llu has a hole at block %llu\n", |
477 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 477 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
478 | (unsigned long long)iblock); | 478 | (unsigned long long)iblock); |
479 | ret = -EROFS; | 479 | ret = -EROFS; |
480 | goto bail; | 480 | goto bail; |
481 | } | 481 | } |
482 | 482 | ||
483 | /* | 483 | /* |
484 | * get_more_blocks() expects us to describe a hole by clearing | 484 | * get_more_blocks() expects us to describe a hole by clearing |
485 | * the mapped bit on bh_result(). | 485 | * the mapped bit on bh_result(). |
486 | * | 486 | * |
487 | * Consider an unwritten extent as a hole. | 487 | * Consider an unwritten extent as a hole. |
488 | */ | 488 | */ |
489 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | 489 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) |
490 | map_bh(bh_result, inode->i_sb, p_blkno); | 490 | map_bh(bh_result, inode->i_sb, p_blkno); |
491 | else { | 491 | else { |
492 | /* | 492 | /* |
493 | * ocfs2_prepare_inode_for_write() should have caught | 493 | * ocfs2_prepare_inode_for_write() should have caught |
494 | * the case where we'd be filling a hole and triggered | 494 | * the case where we'd be filling a hole and triggered |
495 | * a buffered write instead. | 495 | * a buffered write instead. |
496 | */ | 496 | */ |
497 | if (create) { | 497 | if (create) { |
498 | ret = -EIO; | 498 | ret = -EIO; |
499 | mlog_errno(ret); | 499 | mlog_errno(ret); |
500 | goto bail; | 500 | goto bail; |
501 | } | 501 | } |
502 | 502 | ||
503 | clear_buffer_mapped(bh_result); | 503 | clear_buffer_mapped(bh_result); |
504 | } | 504 | } |
505 | 505 | ||
506 | /* make sure we don't map more than max_blocks blocks here as | 506 | /* make sure we don't map more than max_blocks blocks here as |
507 | that's all the kernel will handle at this point. */ | 507 | that's all the kernel will handle at this point. */ |
508 | if (max_blocks < contig_blocks) | 508 | if (max_blocks < contig_blocks) |
509 | contig_blocks = max_blocks; | 509 | contig_blocks = max_blocks; |
510 | bh_result->b_size = contig_blocks << blocksize_bits; | 510 | bh_result->b_size = contig_blocks << blocksize_bits; |
511 | bail: | 511 | bail: |
512 | return ret; | 512 | return ret; |
513 | } | 513 | } |
514 | 514 | ||
515 | /* | 515 | /* |
516 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | 516 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're |
517 | * particularly interested in the aio/dio case. Like the core uses | 517 | * particularly interested in the aio/dio case. Like the core uses |
518 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from | 518 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from |
519 | * truncation on another. | 519 | * truncation on another. |
520 | */ | 520 | */ |
521 | static void ocfs2_dio_end_io(struct kiocb *iocb, | 521 | static void ocfs2_dio_end_io(struct kiocb *iocb, |
522 | loff_t offset, | 522 | loff_t offset, |
523 | ssize_t bytes, | 523 | ssize_t bytes, |
524 | void *private) | 524 | void *private) |
525 | { | 525 | { |
526 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 526 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
527 | int level; | 527 | int level; |
528 | 528 | ||
529 | /* this io's submitter should not have unlocked this before we could */ | 529 | /* this io's submitter should not have unlocked this before we could */ |
530 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 530 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
531 | 531 | ||
532 | ocfs2_iocb_clear_rw_locked(iocb); | 532 | ocfs2_iocb_clear_rw_locked(iocb); |
533 | 533 | ||
534 | level = ocfs2_iocb_rw_locked_level(iocb); | 534 | level = ocfs2_iocb_rw_locked_level(iocb); |
535 | if (!level) | 535 | if (!level) |
536 | up_read(&inode->i_alloc_sem); | 536 | up_read(&inode->i_alloc_sem); |
537 | ocfs2_rw_unlock(inode, level); | 537 | ocfs2_rw_unlock(inode, level); |
538 | } | 538 | } |
539 | 539 | ||
540 | /* | 540 | /* |
541 | * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen | 541 | * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen |
542 | * from ext3. PageChecked() bits have been removed as OCFS2 does not | 542 | * from ext3. PageChecked() bits have been removed as OCFS2 does not |
543 | * do journalled data. | 543 | * do journalled data. |
544 | */ | 544 | */ |
545 | static void ocfs2_invalidatepage(struct page *page, unsigned long offset) | 545 | static void ocfs2_invalidatepage(struct page *page, unsigned long offset) |
546 | { | 546 | { |
547 | journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; | 547 | journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; |
548 | 548 | ||
549 | journal_invalidatepage(journal, page, offset); | 549 | journal_invalidatepage(journal, page, offset); |
550 | } | 550 | } |
551 | 551 | ||
552 | static int ocfs2_releasepage(struct page *page, gfp_t wait) | 552 | static int ocfs2_releasepage(struct page *page, gfp_t wait) |
553 | { | 553 | { |
554 | journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; | 554 | journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; |
555 | 555 | ||
556 | if (!page_has_buffers(page)) | 556 | if (!page_has_buffers(page)) |
557 | return 0; | 557 | return 0; |
558 | return journal_try_to_free_buffers(journal, page, wait); | 558 | return journal_try_to_free_buffers(journal, page, wait); |
559 | } | 559 | } |
560 | 560 | ||
561 | static ssize_t ocfs2_direct_IO(int rw, | 561 | static ssize_t ocfs2_direct_IO(int rw, |
562 | struct kiocb *iocb, | 562 | struct kiocb *iocb, |
563 | const struct iovec *iov, | 563 | const struct iovec *iov, |
564 | loff_t offset, | 564 | loff_t offset, |
565 | unsigned long nr_segs) | 565 | unsigned long nr_segs) |
566 | { | 566 | { |
567 | struct file *file = iocb->ki_filp; | 567 | struct file *file = iocb->ki_filp; |
568 | struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; | 568 | struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; |
569 | int ret; | 569 | int ret; |
570 | 570 | ||
571 | mlog_entry_void(); | 571 | mlog_entry_void(); |
572 | 572 | ||
573 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | 573 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
574 | /* | 574 | /* |
575 | * We get PR data locks even for O_DIRECT. This | 575 | * We get PR data locks even for O_DIRECT. This |
576 | * allows concurrent O_DIRECT I/O but doesn't let | 576 | * allows concurrent O_DIRECT I/O but doesn't let |
577 | * O_DIRECT with extending and buffered zeroing writes | 577 | * O_DIRECT with extending and buffered zeroing writes |
578 | * race. If they did race then the buffered zeroing | 578 | * race. If they did race then the buffered zeroing |
579 | * could be written back after the O_DIRECT I/O. It's | 579 | * could be written back after the O_DIRECT I/O. It's |
580 | * one thing to tell people not to mix buffered and | 580 | * one thing to tell people not to mix buffered and |
581 | * O_DIRECT writes, but expecting them to understand | 581 | * O_DIRECT writes, but expecting them to understand |
582 | * that file extension is also an implicit buffered | 582 | * that file extension is also an implicit buffered |
583 | * write is too much. By getting the PR we force | 583 | * write is too much. By getting the PR we force |
584 | * writeback of the buffered zeroing before | 584 | * writeback of the buffered zeroing before |
585 | * proceeding. | 585 | * proceeding. |
586 | */ | 586 | */ |
587 | ret = ocfs2_data_lock(inode, 0); | 587 | ret = ocfs2_data_lock(inode, 0); |
588 | if (ret < 0) { | 588 | if (ret < 0) { |
589 | mlog_errno(ret); | 589 | mlog_errno(ret); |
590 | goto out; | 590 | goto out; |
591 | } | 591 | } |
592 | ocfs2_data_unlock(inode, 0); | 592 | ocfs2_data_unlock(inode, 0); |
593 | } | 593 | } |
594 | 594 | ||
595 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | 595 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, |
596 | inode->i_sb->s_bdev, iov, offset, | 596 | inode->i_sb->s_bdev, iov, offset, |
597 | nr_segs, | 597 | nr_segs, |
598 | ocfs2_direct_IO_get_blocks, | 598 | ocfs2_direct_IO_get_blocks, |
599 | ocfs2_dio_end_io); | 599 | ocfs2_dio_end_io); |
600 | out: | 600 | out: |
601 | mlog_exit(ret); | 601 | mlog_exit(ret); |
602 | return ret; | 602 | return ret; |
603 | } | 603 | } |
604 | 604 | ||
605 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, | 605 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, |
606 | u32 cpos, | 606 | u32 cpos, |
607 | unsigned int *start, | 607 | unsigned int *start, |
608 | unsigned int *end) | 608 | unsigned int *end) |
609 | { | 609 | { |
610 | unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; | 610 | unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; |
611 | 611 | ||
612 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { | 612 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { |
613 | unsigned int cpp; | 613 | unsigned int cpp; |
614 | 614 | ||
615 | cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); | 615 | cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); |
616 | 616 | ||
617 | cluster_start = cpos % cpp; | 617 | cluster_start = cpos % cpp; |
618 | cluster_start = cluster_start << osb->s_clustersize_bits; | 618 | cluster_start = cluster_start << osb->s_clustersize_bits; |
619 | 619 | ||
620 | cluster_end = cluster_start + osb->s_clustersize; | 620 | cluster_end = cluster_start + osb->s_clustersize; |
621 | } | 621 | } |
622 | 622 | ||
623 | BUG_ON(cluster_start > PAGE_SIZE); | 623 | BUG_ON(cluster_start > PAGE_SIZE); |
624 | BUG_ON(cluster_end > PAGE_SIZE); | 624 | BUG_ON(cluster_end > PAGE_SIZE); |
625 | 625 | ||
626 | if (start) | 626 | if (start) |
627 | *start = cluster_start; | 627 | *start = cluster_start; |
628 | if (end) | 628 | if (end) |
629 | *end = cluster_end; | 629 | *end = cluster_end; |
630 | } | 630 | } |
631 | 631 | ||
632 | /* | 632 | /* |
633 | * 'from' and 'to' are the region in the page to avoid zeroing. | 633 | * 'from' and 'to' are the region in the page to avoid zeroing. |
634 | * | 634 | * |
635 | * If pagesize > clustersize, this function will avoid zeroing outside | 635 | * If pagesize > clustersize, this function will avoid zeroing outside |
636 | * of the cluster boundary. | 636 | * of the cluster boundary. |
637 | * | 637 | * |
638 | * from == to == 0 is code for "zero the entire cluster region" | 638 | * from == to == 0 is code for "zero the entire cluster region" |
639 | */ | 639 | */ |
640 | static void ocfs2_clear_page_regions(struct page *page, | 640 | static void ocfs2_clear_page_regions(struct page *page, |
641 | struct ocfs2_super *osb, u32 cpos, | 641 | struct ocfs2_super *osb, u32 cpos, |
642 | unsigned from, unsigned to) | 642 | unsigned from, unsigned to) |
643 | { | 643 | { |
644 | void *kaddr; | 644 | void *kaddr; |
645 | unsigned int cluster_start, cluster_end; | 645 | unsigned int cluster_start, cluster_end; |
646 | 646 | ||
647 | ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); | 647 | ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); |
648 | 648 | ||
649 | kaddr = kmap_atomic(page, KM_USER0); | 649 | kaddr = kmap_atomic(page, KM_USER0); |
650 | 650 | ||
651 | if (from || to) { | 651 | if (from || to) { |
652 | if (from > cluster_start) | 652 | if (from > cluster_start) |
653 | memset(kaddr + cluster_start, 0, from - cluster_start); | 653 | memset(kaddr + cluster_start, 0, from - cluster_start); |
654 | if (to < cluster_end) | 654 | if (to < cluster_end) |
655 | memset(kaddr + to, 0, cluster_end - to); | 655 | memset(kaddr + to, 0, cluster_end - to); |
656 | } else { | 656 | } else { |
657 | memset(kaddr + cluster_start, 0, cluster_end - cluster_start); | 657 | memset(kaddr + cluster_start, 0, cluster_end - cluster_start); |
658 | } | 658 | } |
659 | 659 | ||
660 | kunmap_atomic(kaddr, KM_USER0); | 660 | kunmap_atomic(kaddr, KM_USER0); |
661 | } | 661 | } |
662 | 662 | ||
/*
 * Attach and map the buffer_heads of @page onto disk blocks starting at
 * *@p_blkno, in preparation for writing the byte range [@from, @to)
 * within the page. Edge blocks that are only partially overwritten are
 * read in; on a read failure, newly allocated blocks are zeroed so no
 * stale disk data can leak to userspace.
 *
 * NOTE(review): *p_blkno is advanced only for blocks inside [from, to);
 * the caller appears to pass the physical block of the first in-range
 * block — confirm against callers.
 *
 * Some of this taken from block_prepare_write(). We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
 *
 * This will also skip zeroing, which is handled externally.
 */
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
			  struct inode *inode, unsigned int from,
			  unsigned int to, int new)
{
	int ret = 0;
	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
	unsigned int block_end, block_start;
	unsigned int bsize = 1 << inode->i_blkbits;

	if (!page_has_buffers(page))
		create_empty_buffers(page, bsize, 0);

	head = page_buffers(page);
	for (bh = head, block_start = 0; bh != head || !block_start;
	     bh = bh->b_this_page, block_start += bsize) {
		block_end = block_start + bsize;

		clear_buffer_new(bh);

		/*
		 * Ignore blocks outside of our i/o range -
		 * they may belong to unallocated clusters.
		 */
		if (block_start >= to || block_end <= from) {
			if (PageUptodate(page))
				set_buffer_uptodate(bh);
			continue;
		}

		/*
		 * For an allocating write with cluster size >= page
		 * size, we always write the entire page.
		 */
		if (new)
			set_buffer_new(bh);

		if (!buffer_mapped(bh)) {
			map_bh(bh, inode->i_sb, *p_blkno);
			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		}

		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		     (block_start < from || block_end > to)) {
			/*
			 * Partially overwritten block: read the old
			 * contents so the untouched bytes survive.
			 */
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}

		/* Next buffer maps to the next physical block. */
		*p_blkno = *p_blkno + 1;
	}

	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			ret = -EIO;
	}

	if (ret == 0 || !new)
		return ret;

	/*
	 * If we get -EIO above, zero out any newly allocated blocks
	 * to avoid exposing stale data.
	 */
	bh = head;
	block_start = 0;
	do {
		void *kaddr;

		block_end = block_start + bsize;
		if (block_end <= from)
			goto next_bh;
		if (block_start >= to)
			break;

		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr+block_start, 0, bh->b_size);
		flush_dcache_page(page);
		kunmap_atomic(kaddr, KM_USER0);
		set_buffer_uptodate(bh);
		mark_buffer_dirty(bh);

next_bh:
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	return ret;
}
764 | 764 | ||
/*
 * Maximum number of pages a single write context may have to hold:
 * one page covers a whole cluster when pages are at least cluster
 * sized, otherwise clustersize/pagesize pages are needed.
 */
#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES 1
#else
#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
#endif

/* Upper bound on the number of clusters a single page can span. */
#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
772 | 772 | ||
/*
 * Describe the state of a single cluster to be written to.
 */
struct ocfs2_write_cluster_desc {
	/* Logical (file-relative) cluster offset of this cluster. */
	u32 c_cpos;
	/* Physical cluster — presumably filled once allocation is done;
	 * verify against the write path that populates descriptors. */
	u32 c_phys;
	/*
	 * Give this a unique field because c_phys eventually gets
	 * filled. Non-zero means this cluster is newly allocated by
	 * this write.
	 */
	unsigned c_new;
};
785 | 785 | ||
/*
 * Per-write bookkeeping for the buffered write path: the logical
 * cluster range being written, the page cache pages involved, and the
 * journal handle the write runs under.
 */
struct ocfs2_write_ctxt {
	/* Logical cluster position / len of write */
	u32 w_cpos;
	u32 w_clen;

	/* One descriptor per cluster touched by this write. */
	struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];

	/*
	 * This is true if page_size > cluster_size.
	 *
	 * It triggers a set of special cases during write which might
	 * have to deal with allocating writes to partial pages.
	 */
	unsigned int w_large_pages;

	/*
	 * Pages involved in this write.
	 *
	 * w_target_page is the page being written to by the user.
	 *
	 * w_pages is an array of pages which always contains
	 * w_target_page, and in the case of an allocating write with
	 * page_size < cluster size, it will contain zero'd and mapped
	 * pages adjacent to w_target_page which need to be written
	 * out in so that future reads from that region will get
	 * zero's.
	 */
	struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
	unsigned int w_num_pages;
	struct page *w_target_page;

	/*
	 * ocfs2_write_end() uses this to know what the real range to
	 * write in the target should be.
	 */
	unsigned int w_target_from;
	unsigned int w_target_to;

	/*
	 * We could use journal_current_handle() but this is cleaner,
	 * IMHO -Mark
	 */
	handle_t *w_handle;

	/* Inode's dinode buffer; pinned with get_bh() in
	 * ocfs2_alloc_write_ctxt(), released by ocfs2_free_write_ctxt(). */
	struct buffer_head *w_di_bh;
};
832 | 832 | ||
833 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | 833 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) |
834 | { | 834 | { |
835 | int i; | 835 | int i; |
836 | 836 | ||
837 | for(i = 0; i < wc->w_num_pages; i++) { | 837 | for(i = 0; i < wc->w_num_pages; i++) { |
838 | if (wc->w_pages[i] == NULL) | 838 | if (wc->w_pages[i] == NULL) |
839 | continue; | 839 | continue; |
840 | 840 | ||
841 | unlock_page(wc->w_pages[i]); | 841 | unlock_page(wc->w_pages[i]); |
842 | mark_page_accessed(wc->w_pages[i]); | 842 | mark_page_accessed(wc->w_pages[i]); |
843 | page_cache_release(wc->w_pages[i]); | 843 | page_cache_release(wc->w_pages[i]); |
844 | } | 844 | } |
845 | 845 | ||
846 | brelse(wc->w_di_bh); | 846 | brelse(wc->w_di_bh); |
847 | kfree(wc); | 847 | kfree(wc); |
848 | } | 848 | } |
849 | 849 | ||
850 | static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, | 850 | static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, |
851 | struct ocfs2_super *osb, loff_t pos, | 851 | struct ocfs2_super *osb, loff_t pos, |
852 | unsigned len, struct buffer_head *di_bh) | 852 | unsigned len, struct buffer_head *di_bh) |
853 | { | 853 | { |
854 | struct ocfs2_write_ctxt *wc; | 854 | struct ocfs2_write_ctxt *wc; |
855 | 855 | ||
856 | wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); | 856 | wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); |
857 | if (!wc) | 857 | if (!wc) |
858 | return -ENOMEM; | 858 | return -ENOMEM; |
859 | 859 | ||
860 | wc->w_cpos = pos >> osb->s_clustersize_bits; | 860 | wc->w_cpos = pos >> osb->s_clustersize_bits; |
861 | wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); | 861 | wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); |
862 | get_bh(di_bh); | 862 | get_bh(di_bh); |
863 | wc->w_di_bh = di_bh; | 863 | wc->w_di_bh = di_bh; |
864 | 864 | ||
865 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | 865 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) |
866 | wc->w_large_pages = 1; | 866 | wc->w_large_pages = 1; |
867 | else | 867 | else |
868 | wc->w_large_pages = 0; | 868 | wc->w_large_pages = 0; |
869 | 869 | ||
870 | *wcp = wc; | 870 | *wcp = wc; |
871 | 871 | ||
872 | return 0; | 872 | return 0; |
873 | } | 873 | } |
874 | 874 | ||
/*
 * If a page has any new buffers, zero them out here, and mark them uptodate
 * and dirty so they'll be written out (in order to prevent uninitialised
 * block data from leaking). And clear the new bit.
 *
 * @from and @to are byte offsets within @page; only new buffers that
 * overlap [from, to) are touched, and within such a buffer only the
 * overlapping bytes are zeroed.
 */
static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
	unsigned int block_start, block_end;
	struct buffer_head *head, *bh;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		return;

	/* Walk every buffer attached to the page. */
	bh = head = page_buffers(page);
	block_start = 0;
	do {
		block_end = block_start + bh->b_size;

		if (buffer_new(bh)) {
			if (block_end > from && block_start < to) {
				/*
				 * An uptodate page already holds valid
				 * (zeroed) data, so only touch memory
				 * when the page isn't uptodate.
				 */
				if (!PageUptodate(page)) {
					unsigned start, end;
					void *kaddr;

					/* Clamp to the overlap of the
					 * buffer and [from, to). */
					start = max(from, block_start);
					end = min(to, block_end);

					kaddr = kmap_atomic(page, KM_USER0);
					memset(kaddr+start, 0, end - start);
					flush_dcache_page(page);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}

				clear_buffer_new(bh);
				mark_buffer_dirty(bh);
			}
		}

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);
}
919 | 919 | ||
920 | /* | 920 | /* |
921 | * Only called when we have a failure during allocating write to write | 921 | * Only called when we have a failure during allocating write to write |
922 | * zero's to the newly allocated region. | 922 | * zero's to the newly allocated region. |
923 | */ | 923 | */ |
924 | static void ocfs2_write_failure(struct inode *inode, | 924 | static void ocfs2_write_failure(struct inode *inode, |
925 | struct ocfs2_write_ctxt *wc, | 925 | struct ocfs2_write_ctxt *wc, |
926 | loff_t user_pos, unsigned user_len) | 926 | loff_t user_pos, unsigned user_len) |
927 | { | 927 | { |
928 | int i; | 928 | int i; |
929 | unsigned from, to; | 929 | unsigned from, to; |
930 | struct page *tmppage; | 930 | struct page *tmppage; |
931 | 931 | ||
932 | ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); | 932 | ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); |
933 | 933 | ||
934 | if (wc->w_large_pages) { | 934 | if (wc->w_large_pages) { |
935 | from = wc->w_target_from; | 935 | from = wc->w_target_from; |
936 | to = wc->w_target_to; | 936 | to = wc->w_target_to; |
937 | } else { | 937 | } else { |
938 | from = 0; | 938 | from = 0; |
939 | to = PAGE_CACHE_SIZE; | 939 | to = PAGE_CACHE_SIZE; |
940 | } | 940 | } |
941 | 941 | ||
942 | for(i = 0; i < wc->w_num_pages; i++) { | 942 | for(i = 0; i < wc->w_num_pages; i++) { |
943 | tmppage = wc->w_pages[i]; | 943 | tmppage = wc->w_pages[i]; |
944 | 944 | ||
945 | if (ocfs2_should_order_data(inode)) | 945 | if (ocfs2_should_order_data(inode)) |
946 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | 946 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), |
947 | from, to, NULL, | 947 | from, to, NULL, |
948 | ocfs2_journal_dirty_data); | 948 | ocfs2_journal_dirty_data); |
949 | 949 | ||
950 | block_commit_write(tmppage, from, to); | 950 | block_commit_write(tmppage, from, to); |
951 | } | 951 | } |
952 | } | 952 | } |
953 | 953 | ||
/*
 * Map (and, for newly allocated clusters, zero) one page taking part
 * in a write of @user_len bytes at file offset @user_pos.
 *
 * For the target page only the user's byte range is mapped unless the
 * cluster is new, in which case the whole cluster region of the page
 * is mapped and the parts outside the user's range are zeroed. Pages
 * other than the target only exist for allocating writes (BUG_ON(!new))
 * and get their full cluster region mapped.
 *
 * On success the target sub-range is recorded in wc->w_target_from/_to
 * for ocfs2_write_end(). Returns 0 or a negative error from
 * ocfs2_map_page_blocks().
 */
static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
					struct ocfs2_write_ctxt *wc,
					struct page *page, u32 cpos,
					loff_t user_pos, unsigned user_len,
					int new)
{
	int ret;
	unsigned int map_from = 0, map_to = 0;
	unsigned int cluster_start, cluster_end;
	unsigned int user_data_from = 0, user_data_to = 0;

	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
					&cluster_start, &cluster_end);

	if (page == wc->w_target_page) {
		/* Page-relative byte range the user is writing. */
		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
		map_to = map_from + user_len;

		/*
		 * A new cluster means the whole cluster region of the
		 * page must be mapped, not just the user's bytes.
		 */
		if (new)
			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
						    cluster_start, cluster_end,
						    new);
		else
			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
						    map_from, map_to, new);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Remember the user's range before widening map_* below. */
		user_data_from = map_from;
		user_data_to = map_to;
		if (new) {
			map_from = cluster_start;
			map_to = cluster_end;
		}

		wc->w_target_from = map_from;
		wc->w_target_to = map_to;
	} else {
		/*
		 * If we haven't allocated the new page yet, we
		 * shouldn't be writing it out without copying user
		 * data. This is likely a math error from the caller.
		 */
		BUG_ON(!new);

		map_from = cluster_start;
		map_to = cluster_end;

		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
					    cluster_start, cluster_end, new);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Parts of newly allocated pages need to be zero'd.
	 *
	 * Above, we have also rewritten 'to' and 'from' - as far as
	 * the rest of the function is concerned, the entire cluster
	 * range inside of a page needs to be written.
	 *
	 * We can skip this if the page is up to date - it's already
	 * been zero'd from being read in as a hole.
	 */
	if (new && !PageUptodate(page))
		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
					 cpos, user_data_from, user_data_to);

	flush_dcache_page(page);

out:
	return ret;
}
1031 | 1031 | ||
1032 | /* | 1032 | /* |
1033 | * This function will only grab one clusters worth of pages. | 1033 | * This function will only grab one clusters worth of pages. |
1034 | */ | 1034 | */ |
1035 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, | 1035 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, |
1036 | struct ocfs2_write_ctxt *wc, | 1036 | struct ocfs2_write_ctxt *wc, |
1037 | u32 cpos, loff_t user_pos, int new) | 1037 | u32 cpos, loff_t user_pos, int new, |
1038 | struct page *mmap_page) | ||
1038 | { | 1039 | { |
1039 | int ret = 0, i; | 1040 | int ret = 0, i; |
1040 | unsigned long start, target_index, index; | 1041 | unsigned long start, target_index, index; |
1041 | struct inode *inode = mapping->host; | 1042 | struct inode *inode = mapping->host; |
1042 | 1043 | ||
1043 | target_index = user_pos >> PAGE_CACHE_SHIFT; | 1044 | target_index = user_pos >> PAGE_CACHE_SHIFT; |
1044 | 1045 | ||
1045 | /* | 1046 | /* |
1046 | * Figure out how many pages we'll be manipulating here. For | 1047 | * Figure out how many pages we'll be manipulating here. For |
1047 | * non allocating write, we just change the one | 1048 | * non allocating write, we just change the one |
1048 | * page. Otherwise, we'll need a whole clusters worth. | 1049 | * page. Otherwise, we'll need a whole clusters worth. |
1049 | */ | 1050 | */ |
1050 | if (new) { | 1051 | if (new) { |
1051 | wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); | 1052 | wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); |
1052 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); | 1053 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); |
1053 | } else { | 1054 | } else { |
1054 | wc->w_num_pages = 1; | 1055 | wc->w_num_pages = 1; |
1055 | start = target_index; | 1056 | start = target_index; |
1056 | } | 1057 | } |
1057 | 1058 | ||
1058 | for(i = 0; i < wc->w_num_pages; i++) { | 1059 | for(i = 0; i < wc->w_num_pages; i++) { |
1059 | index = start + i; | 1060 | index = start + i; |
1060 | 1061 | ||
1061 | wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); | 1062 | if (index == target_index && mmap_page) { |
1062 | if (!wc->w_pages[i]) { | 1063 | /* |
1063 | ret = -ENOMEM; | 1064 | * ocfs2_pagemkwrite() is a little different |
1064 | mlog_errno(ret); | 1065 | * and wants us to directly use the page |
1065 | goto out; | 1066 | * passed in. |
1067 | */ | ||
1068 | lock_page(mmap_page); | ||
1069 | |||
1070 | if (mmap_page->mapping != mapping) { | ||
1071 | unlock_page(mmap_page); | ||
1072 | /* | ||
1073 | * Sanity check - the locking in | ||
1074 | * ocfs2_pagemkwrite() should ensure | ||
1075 | * that this code doesn't trigger. | ||
1076 | */ | ||
1077 | ret = -EINVAL; | ||
1078 | mlog_errno(ret); | ||
1079 | goto out; | ||
1080 | } | ||
1081 | |||
1082 | page_cache_get(mmap_page); | ||
1083 | wc->w_pages[i] = mmap_page; | ||
1084 | } else { | ||
1085 | wc->w_pages[i] = find_or_create_page(mapping, index, | ||
1086 | GFP_NOFS); | ||
1087 | if (!wc->w_pages[i]) { | ||
1088 | ret = -ENOMEM; | ||
1089 | mlog_errno(ret); | ||
1090 | goto out; | ||
1091 | } | ||
1066 | } | 1092 | } |
1067 | 1093 | ||
1068 | if (index == target_index) | 1094 | if (index == target_index) |
1069 | wc->w_target_page = wc->w_pages[i]; | 1095 | wc->w_target_page = wc->w_pages[i]; |
1070 | } | 1096 | } |
1071 | out: | 1097 | out: |
1072 | return ret; | 1098 | return ret; |
1073 | } | 1099 | } |
1074 | 1100 | ||
1075 | /* | 1101 | /* |
1076 | * Prepare a single cluster for write one cluster into the file. | 1102 | * Prepare a single cluster for write one cluster into the file. |
1077 | */ | 1103 | */ |
1078 | static int ocfs2_write_cluster(struct address_space *mapping, | 1104 | static int ocfs2_write_cluster(struct address_space *mapping, |
1079 | u32 phys, struct ocfs2_alloc_context *data_ac, | 1105 | u32 phys, struct ocfs2_alloc_context *data_ac, |
1080 | struct ocfs2_alloc_context *meta_ac, | 1106 | struct ocfs2_alloc_context *meta_ac, |
1081 | struct ocfs2_write_ctxt *wc, u32 cpos, | 1107 | struct ocfs2_write_ctxt *wc, u32 cpos, |
1082 | loff_t user_pos, unsigned user_len) | 1108 | loff_t user_pos, unsigned user_len) |
1083 | { | 1109 | { |
1084 | int ret, i, new; | 1110 | int ret, i, new; |
1085 | u64 v_blkno, p_blkno; | 1111 | u64 v_blkno, p_blkno; |
1086 | struct inode *inode = mapping->host; | 1112 | struct inode *inode = mapping->host; |
1087 | 1113 | ||
1088 | new = phys == 0 ? 1 : 0; | 1114 | new = phys == 0 ? 1 : 0; |
1089 | 1115 | ||
1090 | if (new) { | 1116 | if (new) { |
1091 | u32 tmp_pos; | 1117 | u32 tmp_pos; |
1092 | 1118 | ||
1093 | /* | 1119 | /* |
1094 | * This is safe to call with the page locks - it won't take | 1120 | * This is safe to call with the page locks - it won't take |
1095 | * any additional semaphores or cluster locks. | 1121 | * any additional semaphores or cluster locks. |
1096 | */ | 1122 | */ |
1097 | tmp_pos = cpos; | 1123 | tmp_pos = cpos; |
1098 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | 1124 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, |
1099 | &tmp_pos, 1, wc->w_di_bh, | 1125 | &tmp_pos, 1, wc->w_di_bh, |
1100 | wc->w_handle, data_ac, | 1126 | wc->w_handle, data_ac, |
1101 | meta_ac, NULL); | 1127 | meta_ac, NULL); |
1102 | /* | 1128 | /* |
1103 | * This shouldn't happen because we must have already | 1129 | * This shouldn't happen because we must have already |
1104 | * calculated the correct meta data allocation required. The | 1130 | * calculated the correct meta data allocation required. The |
1105 | * internal tree allocation code should know how to increase | 1131 | * internal tree allocation code should know how to increase |
1106 | * transaction credits itself. | 1132 | * transaction credits itself. |
1107 | * | 1133 | * |
1108 | * If need be, we could handle -EAGAIN for a | 1134 | * If need be, we could handle -EAGAIN for a |
1109 | * RESTART_TRANS here. | 1135 | * RESTART_TRANS here. |
1110 | */ | 1136 | */ |
1111 | mlog_bug_on_msg(ret == -EAGAIN, | 1137 | mlog_bug_on_msg(ret == -EAGAIN, |
1112 | "Inode %llu: EAGAIN return during allocation.\n", | 1138 | "Inode %llu: EAGAIN return during allocation.\n", |
1113 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 1139 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
1114 | if (ret < 0) { | 1140 | if (ret < 0) { |
1115 | mlog_errno(ret); | 1141 | mlog_errno(ret); |
1116 | goto out; | 1142 | goto out; |
1117 | } | 1143 | } |
1118 | 1144 | ||
1119 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); | 1145 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); |
1120 | } else { | 1146 | } else { |
1121 | v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; | 1147 | v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; |
1122 | } | 1148 | } |
1123 | 1149 | ||
1124 | /* | 1150 | /* |
1125 | * The only reason this should fail is due to an inability to | 1151 | * The only reason this should fail is due to an inability to |
1126 | * find the extent added. | 1152 | * find the extent added. |
1127 | */ | 1153 | */ |
1128 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | 1154 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, |
1129 | NULL); | 1155 | NULL); |
1130 | if (ret < 0) { | 1156 | if (ret < 0) { |
1131 | ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " | 1157 | ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " |
1132 | "at logical block %llu", | 1158 | "at logical block %llu", |
1133 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1159 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1134 | (unsigned long long)v_blkno); | 1160 | (unsigned long long)v_blkno); |
1135 | goto out; | 1161 | goto out; |
1136 | } | 1162 | } |
1137 | 1163 | ||
1138 | BUG_ON(p_blkno == 0); | 1164 | BUG_ON(p_blkno == 0); |
1139 | 1165 | ||
1140 | for(i = 0; i < wc->w_num_pages; i++) { | 1166 | for(i = 0; i < wc->w_num_pages; i++) { |
1141 | int tmpret; | 1167 | int tmpret; |
1142 | 1168 | ||
1143 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, | 1169 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, |
1144 | wc->w_pages[i], cpos, | 1170 | wc->w_pages[i], cpos, |
1145 | user_pos, user_len, new); | 1171 | user_pos, user_len, new); |
1146 | if (tmpret) { | 1172 | if (tmpret) { |
1147 | mlog_errno(tmpret); | 1173 | mlog_errno(tmpret); |
1148 | if (ret == 0) | 1174 | if (ret == 0) |
1149 | tmpret = ret; | 1175 | tmpret = ret; |
1150 | } | 1176 | } |
1151 | } | 1177 | } |
1152 | 1178 | ||
1153 | /* | 1179 | /* |
1154 | * We only have cleanup to do in case of allocating write. | 1180 | * We only have cleanup to do in case of allocating write. |
1155 | */ | 1181 | */ |
1156 | if (ret && new) | 1182 | if (ret && new) |
1157 | ocfs2_write_failure(inode, wc, user_pos, user_len); | 1183 | ocfs2_write_failure(inode, wc, user_pos, user_len); |
1158 | 1184 | ||
1159 | out: | 1185 | out: |
1160 | 1186 | ||
1161 | return ret; | 1187 | return ret; |
1162 | } | 1188 | } |
1163 | 1189 | ||
1164 | /* | 1190 | /* |
1165 | * ocfs2_write_end() wants to know which parts of the target page it | 1191 | * ocfs2_write_end() wants to know which parts of the target page it |
1166 | * should complete the write on. It's easiest to compute them ahead of | 1192 | * should complete the write on. It's easiest to compute them ahead of |
1167 | * time when a more complete view of the write is available. | 1193 | * time when a more complete view of the write is available. |
1168 | */ | 1194 | */ |
1169 | static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | 1195 | static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, |
1170 | struct ocfs2_write_ctxt *wc, | 1196 | struct ocfs2_write_ctxt *wc, |
1171 | loff_t pos, unsigned len, int alloc) | 1197 | loff_t pos, unsigned len, int alloc) |
1172 | { | 1198 | { |
1173 | struct ocfs2_write_cluster_desc *desc; | 1199 | struct ocfs2_write_cluster_desc *desc; |
1174 | 1200 | ||
1175 | wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); | 1201 | wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); |
1176 | wc->w_target_to = wc->w_target_from + len; | 1202 | wc->w_target_to = wc->w_target_from + len; |
1177 | 1203 | ||
1178 | if (alloc == 0) | 1204 | if (alloc == 0) |
1179 | return; | 1205 | return; |
1180 | 1206 | ||
1181 | /* | 1207 | /* |
1182 | * Allocating write - we may have different boundaries based | 1208 | * Allocating write - we may have different boundaries based |
1183 | * on page size and cluster size. | 1209 | * on page size and cluster size. |
1184 | * | 1210 | * |
1185 | * NOTE: We can no longer compute one value from the other as | 1211 | * NOTE: We can no longer compute one value from the other as |
1186 | * the actual write length and user provided length may be | 1212 | * the actual write length and user provided length may be |
1187 | * different. | 1213 | * different. |
1188 | */ | 1214 | */ |
1189 | 1215 | ||
1190 | if (wc->w_large_pages) { | 1216 | if (wc->w_large_pages) { |
1191 | /* | 1217 | /* |
1192 | * We only care about the 1st and last cluster within | 1218 | * We only care about the 1st and last cluster within |
1193 | * our range and whether they are holes or not. Either | 1219 | * our range and whether they are holes or not. Either |
1194 | * value may be extended out to the start/end of a | 1220 | * value may be extended out to the start/end of a |
1195 | * newly allocated cluster. | 1221 | * newly allocated cluster. |
1196 | */ | 1222 | */ |
1197 | desc = &wc->w_desc[0]; | 1223 | desc = &wc->w_desc[0]; |
1198 | if (desc->c_new) | 1224 | if (desc->c_new) |
1199 | ocfs2_figure_cluster_boundaries(osb, | 1225 | ocfs2_figure_cluster_boundaries(osb, |
1200 | desc->c_cpos, | 1226 | desc->c_cpos, |
1201 | &wc->w_target_from, | 1227 | &wc->w_target_from, |
1202 | NULL); | 1228 | NULL); |
1203 | 1229 | ||
1204 | desc = &wc->w_desc[wc->w_clen - 1]; | 1230 | desc = &wc->w_desc[wc->w_clen - 1]; |
1205 | if (desc->c_new) | 1231 | if (desc->c_new) |
1206 | ocfs2_figure_cluster_boundaries(osb, | 1232 | ocfs2_figure_cluster_boundaries(osb, |
1207 | desc->c_cpos, | 1233 | desc->c_cpos, |
1208 | NULL, | 1234 | NULL, |
1209 | &wc->w_target_to); | 1235 | &wc->w_target_to); |
1210 | } else { | 1236 | } else { |
1211 | wc->w_target_from = 0; | 1237 | wc->w_target_from = 0; |
1212 | wc->w_target_to = PAGE_CACHE_SIZE; | 1238 | wc->w_target_to = PAGE_CACHE_SIZE; |
1213 | } | 1239 | } |
1214 | } | 1240 | } |
1215 | 1241 | ||
1216 | static int ocfs2_write_begin_nolock(struct address_space *mapping, | 1242 | int ocfs2_write_begin_nolock(struct address_space *mapping, |
1217 | loff_t pos, unsigned len, unsigned flags, | 1243 | loff_t pos, unsigned len, unsigned flags, |
1218 | struct page **pagep, void **fsdata, | 1244 | struct page **pagep, void **fsdata, |
1219 | struct buffer_head *di_bh) | 1245 | struct buffer_head *di_bh, struct page *mmap_page) |
1220 | { | 1246 | { |
1221 | int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; | 1247 | int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; |
1222 | unsigned int num_clusters = 0, clusters_to_alloc = 0; | 1248 | unsigned int num_clusters = 0, clusters_to_alloc = 0; |
1223 | u32 phys = 0; | 1249 | u32 phys = 0; |
1224 | struct ocfs2_write_ctxt *wc; | 1250 | struct ocfs2_write_ctxt *wc; |
1225 | struct inode *inode = mapping->host; | 1251 | struct inode *inode = mapping->host; |
1226 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1252 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1227 | struct ocfs2_dinode *di; | 1253 | struct ocfs2_dinode *di; |
1228 | struct ocfs2_alloc_context *data_ac = NULL; | 1254 | struct ocfs2_alloc_context *data_ac = NULL; |
1229 | struct ocfs2_alloc_context *meta_ac = NULL; | 1255 | struct ocfs2_alloc_context *meta_ac = NULL; |
1230 | handle_t *handle; | 1256 | handle_t *handle; |
1231 | struct ocfs2_write_cluster_desc *desc; | 1257 | struct ocfs2_write_cluster_desc *desc; |
1232 | 1258 | ||
1233 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); | 1259 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); |
1234 | if (ret) { | 1260 | if (ret) { |
1235 | mlog_errno(ret); | 1261 | mlog_errno(ret); |
1236 | return ret; | 1262 | return ret; |
1237 | } | 1263 | } |
1238 | 1264 | ||
1239 | di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | 1265 | di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; |
1240 | 1266 | ||
1241 | for (i = 0; i < wc->w_clen; i++) { | 1267 | for (i = 0; i < wc->w_clen; i++) { |
1242 | desc = &wc->w_desc[i]; | 1268 | desc = &wc->w_desc[i]; |
1243 | desc->c_cpos = wc->w_cpos + i; | 1269 | desc->c_cpos = wc->w_cpos + i; |
1244 | 1270 | ||
1245 | if (num_clusters == 0) { | 1271 | if (num_clusters == 0) { |
1246 | ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, | 1272 | ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, |
1247 | &num_clusters, NULL); | 1273 | &num_clusters, NULL); |
1248 | if (ret) { | 1274 | if (ret) { |
1249 | mlog_errno(ret); | 1275 | mlog_errno(ret); |
1250 | goto out; | 1276 | goto out; |
1251 | } | 1277 | } |
1252 | } else if (phys) { | 1278 | } else if (phys) { |
1253 | /* | 1279 | /* |
1254 | * Only increment phys if it doesn't describe | 1280 | * Only increment phys if it doesn't describe |
1255 | * a hole. | 1281 | * a hole. |
1256 | */ | 1282 | */ |
1257 | phys++; | 1283 | phys++; |
1258 | } | 1284 | } |
1259 | 1285 | ||
1260 | desc->c_phys = phys; | 1286 | desc->c_phys = phys; |
1261 | if (phys == 0) { | 1287 | if (phys == 0) { |
1262 | desc->c_new = 1; | 1288 | desc->c_new = 1; |
1263 | clusters_to_alloc++; | 1289 | clusters_to_alloc++; |
1264 | } | 1290 | } |
1265 | 1291 | ||
1266 | num_clusters--; | 1292 | num_clusters--; |
1267 | } | 1293 | } |
1268 | 1294 | ||
1269 | /* | 1295 | /* |
1270 | * We set w_target_from, w_target_to here so that | 1296 | * We set w_target_from, w_target_to here so that |
1271 | * ocfs2_write_end() knows which range in the target page to | 1297 | * ocfs2_write_end() knows which range in the target page to |
1272 | * write out. An allocation requires that we write the entire | 1298 | * write out. An allocation requires that we write the entire |
1273 | * cluster range. | 1299 | * cluster range. |
1274 | */ | 1300 | */ |
1275 | if (clusters_to_alloc > 0) { | 1301 | if (clusters_to_alloc > 0) { |
1276 | /* | 1302 | /* |
1277 | * XXX: We are stretching the limits of | 1303 | * XXX: We are stretching the limits of |
1278 | * ocfs2_lock_allocators(). It greately over-estimates | 1304 | * ocfs2_lock_allocators(). It greately over-estimates |
1279 | * the work to be done. | 1305 | * the work to be done. |
1280 | */ | 1306 | */ |
1281 | ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, | 1307 | ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, |
1282 | &data_ac, &meta_ac); | 1308 | &data_ac, &meta_ac); |
1283 | if (ret) { | 1309 | if (ret) { |
1284 | mlog_errno(ret); | 1310 | mlog_errno(ret); |
1285 | goto out; | 1311 | goto out; |
1286 | } | 1312 | } |
1287 | 1313 | ||
1288 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, | 1314 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, |
1289 | clusters_to_alloc); | 1315 | clusters_to_alloc); |
1290 | 1316 | ||
1291 | } | 1317 | } |
1292 | 1318 | ||
1293 | ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); | 1319 | ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); |
1294 | 1320 | ||
1295 | handle = ocfs2_start_trans(osb, credits); | 1321 | handle = ocfs2_start_trans(osb, credits); |
1296 | if (IS_ERR(handle)) { | 1322 | if (IS_ERR(handle)) { |
1297 | ret = PTR_ERR(handle); | 1323 | ret = PTR_ERR(handle); |
1298 | mlog_errno(ret); | 1324 | mlog_errno(ret); |
1299 | goto out; | 1325 | goto out; |
1300 | } | 1326 | } |
1301 | 1327 | ||
1302 | wc->w_handle = handle; | 1328 | wc->w_handle = handle; |
1303 | 1329 | ||
1304 | /* | 1330 | /* |
1305 | * We don't want this to fail in ocfs2_write_end(), so do it | 1331 | * We don't want this to fail in ocfs2_write_end(), so do it |
1306 | * here. | 1332 | * here. |
1307 | */ | 1333 | */ |
1308 | ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, | 1334 | ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, |
1309 | OCFS2_JOURNAL_ACCESS_WRITE); | 1335 | OCFS2_JOURNAL_ACCESS_WRITE); |
1310 | if (ret) { | 1336 | if (ret) { |
1311 | mlog_errno(ret); | 1337 | mlog_errno(ret); |
1312 | goto out_commit; | 1338 | goto out_commit; |
1313 | } | 1339 | } |
1314 | 1340 | ||
1315 | /* | 1341 | /* |
1316 | * Fill our page array first. That way we've grabbed enough so | 1342 | * Fill our page array first. That way we've grabbed enough so |
1317 | * that we can zero and flush if we error after adding the | 1343 | * that we can zero and flush if we error after adding the |
1318 | * extent. | 1344 | * extent. |
1319 | */ | 1345 | */ |
1320 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, | 1346 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, |
1321 | clusters_to_alloc); | 1347 | clusters_to_alloc, mmap_page); |
1322 | if (ret) { | 1348 | if (ret) { |
1323 | mlog_errno(ret); | 1349 | mlog_errno(ret); |
1324 | goto out_commit; | 1350 | goto out_commit; |
1325 | } | 1351 | } |
1326 | 1352 | ||
1327 | for (i = 0; i < wc->w_clen; i++) { | 1353 | for (i = 0; i < wc->w_clen; i++) { |
1328 | desc = &wc->w_desc[i]; | 1354 | desc = &wc->w_desc[i]; |
1329 | 1355 | ||
1330 | ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, | 1356 | ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, |
1331 | meta_ac, wc, desc->c_cpos, pos, len); | 1357 | meta_ac, wc, desc->c_cpos, pos, len); |
1332 | if (ret) { | 1358 | if (ret) { |
1333 | mlog_errno(ret); | 1359 | mlog_errno(ret); |
1334 | goto out_commit; | 1360 | goto out_commit; |
1335 | } | 1361 | } |
1336 | } | 1362 | } |
1337 | 1363 | ||
1338 | if (data_ac) | 1364 | if (data_ac) |
1339 | ocfs2_free_alloc_context(data_ac); | 1365 | ocfs2_free_alloc_context(data_ac); |
1340 | if (meta_ac) | 1366 | if (meta_ac) |
1341 | ocfs2_free_alloc_context(meta_ac); | 1367 | ocfs2_free_alloc_context(meta_ac); |
1342 | 1368 | ||
1343 | *pagep = wc->w_target_page; | 1369 | *pagep = wc->w_target_page; |
1344 | *fsdata = wc; | 1370 | *fsdata = wc; |
1345 | return 0; | 1371 | return 0; |
1346 | out_commit: | 1372 | out_commit: |
1347 | ocfs2_commit_trans(osb, handle); | 1373 | ocfs2_commit_trans(osb, handle); |
1348 | 1374 | ||
1349 | out: | 1375 | out: |
1350 | ocfs2_free_write_ctxt(wc); | 1376 | ocfs2_free_write_ctxt(wc); |
1351 | 1377 | ||
1352 | if (data_ac) | 1378 | if (data_ac) |
1353 | ocfs2_free_alloc_context(data_ac); | 1379 | ocfs2_free_alloc_context(data_ac); |
1354 | if (meta_ac) | 1380 | if (meta_ac) |
1355 | ocfs2_free_alloc_context(meta_ac); | 1381 | ocfs2_free_alloc_context(meta_ac); |
1356 | return ret; | 1382 | return ret; |
1357 | } | 1383 | } |
1358 | 1384 | ||
1359 | int ocfs2_write_begin(struct file *file, struct address_space *mapping, | 1385 | int ocfs2_write_begin(struct file *file, struct address_space *mapping, |
1360 | loff_t pos, unsigned len, unsigned flags, | 1386 | loff_t pos, unsigned len, unsigned flags, |
1361 | struct page **pagep, void **fsdata) | 1387 | struct page **pagep, void **fsdata) |
1362 | { | 1388 | { |
1363 | int ret; | 1389 | int ret; |
1364 | struct buffer_head *di_bh = NULL; | 1390 | struct buffer_head *di_bh = NULL; |
1365 | struct inode *inode = mapping->host; | 1391 | struct inode *inode = mapping->host; |
1366 | 1392 | ||
1367 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | 1393 | ret = ocfs2_meta_lock(inode, &di_bh, 1); |
1368 | if (ret) { | 1394 | if (ret) { |
1369 | mlog_errno(ret); | 1395 | mlog_errno(ret); |
1370 | return ret; | 1396 | return ret; |
1371 | } | 1397 | } |
1372 | 1398 | ||
1373 | /* | 1399 | /* |
1374 | * Take alloc sem here to prevent concurrent lookups. That way | 1400 | * Take alloc sem here to prevent concurrent lookups. That way |
1375 | * the mapping, zeroing and tree manipulation within | 1401 | * the mapping, zeroing and tree manipulation within |
1376 | * ocfs2_write() will be safe against ->readpage(). This | 1402 | * ocfs2_write() will be safe against ->readpage(). This |
1377 | * should also serve to lock out allocation from a shared | 1403 | * should also serve to lock out allocation from a shared |
1378 | * writeable region. | 1404 | * writeable region. |
1379 | */ | 1405 | */ |
1380 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 1406 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
1381 | 1407 | ||
1382 | ret = ocfs2_data_lock(inode, 1); | 1408 | ret = ocfs2_data_lock(inode, 1); |
1383 | if (ret) { | 1409 | if (ret) { |
1384 | mlog_errno(ret); | 1410 | mlog_errno(ret); |
1385 | goto out_fail; | 1411 | goto out_fail; |
1386 | } | 1412 | } |
1387 | 1413 | ||
1388 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, | 1414 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, |
1389 | fsdata, di_bh); | 1415 | fsdata, di_bh, NULL); |
1390 | if (ret) { | 1416 | if (ret) { |
1391 | mlog_errno(ret); | 1417 | mlog_errno(ret); |
1392 | goto out_fail_data; | 1418 | goto out_fail_data; |
1393 | } | 1419 | } |
1394 | 1420 | ||
1395 | brelse(di_bh); | 1421 | brelse(di_bh); |
1396 | 1422 | ||
1397 | return 0; | 1423 | return 0; |
1398 | 1424 | ||
1399 | out_fail_data: | 1425 | out_fail_data: |
1400 | ocfs2_data_unlock(inode, 1); | 1426 | ocfs2_data_unlock(inode, 1); |
1401 | out_fail: | 1427 | out_fail: |
1402 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 1428 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
1403 | 1429 | ||
1404 | brelse(di_bh); | 1430 | brelse(di_bh); |
1405 | ocfs2_meta_unlock(inode, 1); | 1431 | ocfs2_meta_unlock(inode, 1); |
1406 | 1432 | ||
1407 | return ret; | 1433 | return ret; |
1408 | } | 1434 | } |
1409 | 1435 | ||
1410 | static int ocfs2_write_end_nolock(struct address_space *mapping, | 1436 | int ocfs2_write_end_nolock(struct address_space *mapping, |
1411 | loff_t pos, unsigned len, unsigned copied, | 1437 | loff_t pos, unsigned len, unsigned copied, |
1412 | struct page *page, void *fsdata) | 1438 | struct page *page, void *fsdata) |
1413 | { | 1439 | { |
1414 | int i; | 1440 | int i; |
1415 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); | 1441 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); |
1416 | struct inode *inode = mapping->host; | 1442 | struct inode *inode = mapping->host; |
1417 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1443 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1418 | struct ocfs2_write_ctxt *wc = fsdata; | 1444 | struct ocfs2_write_ctxt *wc = fsdata; |
1419 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | 1445 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; |
1420 | handle_t *handle = wc->w_handle; | 1446 | handle_t *handle = wc->w_handle; |
1421 | struct page *tmppage; | 1447 | struct page *tmppage; |
1422 | 1448 | ||
1423 | if (unlikely(copied < len)) { | 1449 | if (unlikely(copied < len)) { |
1424 | if (!PageUptodate(wc->w_target_page)) | 1450 | if (!PageUptodate(wc->w_target_page)) |
1425 | copied = 0; | 1451 | copied = 0; |
1426 | 1452 | ||
1427 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, | 1453 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, |
1428 | start+len); | 1454 | start+len); |
1429 | } | 1455 | } |
1430 | flush_dcache_page(wc->w_target_page); | 1456 | flush_dcache_page(wc->w_target_page); |
1431 | 1457 | ||
1432 | for(i = 0; i < wc->w_num_pages; i++) { | 1458 | for(i = 0; i < wc->w_num_pages; i++) { |
1433 | tmppage = wc->w_pages[i]; | 1459 | tmppage = wc->w_pages[i]; |
1434 | 1460 | ||
1435 | if (tmppage == wc->w_target_page) { | 1461 | if (tmppage == wc->w_target_page) { |
1436 | from = wc->w_target_from; | 1462 | from = wc->w_target_from; |
1437 | to = wc->w_target_to; | 1463 | to = wc->w_target_to; |
1438 | 1464 | ||
1439 | BUG_ON(from > PAGE_CACHE_SIZE || | 1465 | BUG_ON(from > PAGE_CACHE_SIZE || |
1440 | to > PAGE_CACHE_SIZE || | 1466 | to > PAGE_CACHE_SIZE || |
1441 | to < from); | 1467 | to < from); |
1442 | } else { | 1468 | } else { |
1443 | /* | 1469 | /* |
1444 | * Pages adjacent to the target (if any) imply | 1470 | * Pages adjacent to the target (if any) imply |
1445 | * a hole-filling write in which case we want | 1471 | * a hole-filling write in which case we want |
1446 | * to flush their entire range. | 1472 | * to flush their entire range. |
1447 | */ | 1473 | */ |
1448 | from = 0; | 1474 | from = 0; |
1449 | to = PAGE_CACHE_SIZE; | 1475 | to = PAGE_CACHE_SIZE; |
1450 | } | 1476 | } |
1451 | 1477 | ||
1452 | if (ocfs2_should_order_data(inode)) | 1478 | if (ocfs2_should_order_data(inode)) |
1453 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | 1479 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), |
1454 | from, to, NULL, | 1480 | from, to, NULL, |
1455 | ocfs2_journal_dirty_data); | 1481 | ocfs2_journal_dirty_data); |
1456 | 1482 | ||
1457 | block_commit_write(tmppage, from, to); | 1483 | block_commit_write(tmppage, from, to); |
1458 | } | 1484 | } |
1459 | 1485 | ||
1460 | pos += copied; | 1486 | pos += copied; |
1461 | if (pos > inode->i_size) { | 1487 | if (pos > inode->i_size) { |
1462 | i_size_write(inode, pos); | 1488 | i_size_write(inode, pos); |
1463 | mark_inode_dirty(inode); | 1489 | mark_inode_dirty(inode); |
1464 | } | 1490 | } |
1465 | inode->i_blocks = ocfs2_inode_sector_count(inode); | 1491 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
1466 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | 1492 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); |
1467 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1493 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
1468 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | 1494 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); |
1469 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | 1495 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); |
1470 | 1496 | ||
1471 | ocfs2_journal_dirty(handle, wc->w_di_bh); | 1497 | ocfs2_journal_dirty(handle, wc->w_di_bh); |
1472 | 1498 | ||
1473 | ocfs2_commit_trans(osb, handle); | 1499 | ocfs2_commit_trans(osb, handle); |
1474 | ocfs2_free_write_ctxt(wc); | 1500 | ocfs2_free_write_ctxt(wc); |
1475 | 1501 | ||
1476 | return copied; | 1502 | return copied; |
1477 | } | 1503 | } |
1478 | 1504 | ||
1479 | int ocfs2_write_end(struct file *file, struct address_space *mapping, | 1505 | int ocfs2_write_end(struct file *file, struct address_space *mapping, |
1480 | loff_t pos, unsigned len, unsigned copied, | 1506 | loff_t pos, unsigned len, unsigned copied, |
1481 | struct page *page, void *fsdata) | 1507 | struct page *page, void *fsdata) |
1482 | { | 1508 | { |
1483 | int ret; | 1509 | int ret; |
1484 | struct inode *inode = mapping->host; | 1510 | struct inode *inode = mapping->host; |
1485 | 1511 | ||
1486 | ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); | 1512 | ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); |
1487 | 1513 | ||
1488 | ocfs2_data_unlock(inode, 1); | 1514 | ocfs2_data_unlock(inode, 1); |
1489 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 1515 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
1490 | ocfs2_meta_unlock(inode, 1); | 1516 | ocfs2_meta_unlock(inode, 1); |
1491 | 1517 | ||
1492 | return ret; | 1518 | return ret; |
1493 | } | 1519 | } |
1494 | 1520 | ||
/* Address-space operations for regular ocfs2 files. */
const struct address_space_operations ocfs2_aops = {
	.readpage = ocfs2_readpage,
	.writepage = ocfs2_writepage,
	.bmap = ocfs2_bmap,
	.sync_page = block_sync_page,
	.direct_IO = ocfs2_direct_IO,
	.invalidatepage = ocfs2_invalidatepage,
	.releasepage = ocfs2_releasepage,
	.migratepage = buffer_migrate_page,
};
1505 | 1531 |
fs/ocfs2/aops.h
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. | 4 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public | 7 | * modify it under the terms of the GNU General Public |
8 | * License as published by the Free Software Foundation; either | 8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. | 9 | * version 2 of the License, or (at your option) any later version. |
10 | * | 10 | * |
11 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * General Public License for more details. | 14 | * General Public License for more details. |
15 | * | 15 | * |
16 | * You should have received a copy of the GNU General Public | 16 | * You should have received a copy of the GNU General Public |
17 | * License along with this program; if not, write to the | 17 | * License along with this program; if not, write to the |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
19 | * Boston, MA 021110-1307, USA. | 19 | * Boston, MA 021110-1307, USA. |
20 | */ | 20 | */ |
21 | 21 | ||
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H

/* Map and prepare buffers of @page for a write of [from, to);
 * caller is expected to hold the relevant inode locks. */
int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
			 unsigned from, unsigned to);

/* Start a journal transaction sized for a page-walk write. */
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
							struct page *page,
							unsigned from,
							unsigned to);

/* Attach/map block buffers for @page; @new indicates freshly
 * allocated blocks. */
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
			  struct inode *inode, unsigned int from,
			  unsigned int to, int new);

/* Apply @fn to each buffer_head of @head within [from, to);
 * *partial is set if some buffers fall outside the range. */
int walk_page_buffers(	handle_t *handle,
			struct buffer_head *head,
			unsigned from,
			unsigned to,
			int *partial,
			int (*fn)(	handle_t *handle,
					struct buffer_head *bh));

int ocfs2_write_begin(struct file *file, struct address_space *mapping,
		      loff_t pos, unsigned len, unsigned flags,
		      struct page **pagep, void **fsdata);

int ocfs2_write_end(struct file *file, struct address_space *mapping,
		    loff_t pos, unsigned len, unsigned copied,
		    struct page *page, void *fsdata);

/* "nolock" variants: callers (e.g. the page_mkwrite path) already hold
 * the cluster/allocation locks that ocfs2_write_begin/end would take. */
int ocfs2_write_end_nolock(struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned copied,
			   struct page *page, void *fsdata);

int ocfs2_write_begin_nolock(struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned flags,
			     struct page **pagep, void **fsdata,
			     struct buffer_head *di_bh, struct page *mmap_page);

/* all ocfs2_dio_end_io()'s fault */
/* Bit 0 of iocb->private: rw lock held; bit 1: lock level (write). */
#define ocfs2_iocb_is_rw_locked(iocb) \
	test_bit(0, (unsigned long *)&iocb->private)
static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
{
	set_bit(0, (unsigned long *)&iocb->private);
	if (level)
		set_bit(1, (unsigned long *)&iocb->private);
	else
		clear_bit(1, (unsigned long *)&iocb->private);
}
#define ocfs2_iocb_clear_rw_locked(iocb) \
	clear_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
	test_bit(1, (unsigned long *)&iocb->private)
#endif /* OCFS2_AOPS_H */
69 | 78 |
fs/ocfs2/file.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * file.c | 4 | * file.c |
5 | * | 5 | * |
6 | * File open, close, extend, truncate | 6 | * File open, close, extend, truncate |
7 | * | 7 | * |
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
12 | * License as published by the Free Software Foundation; either | 12 | * License as published by the Free Software Foundation; either |
13 | * version 2 of the License, or (at your option) any later version. | 13 | * version 2 of the License, or (at your option) any later version. |
14 | * | 14 | * |
15 | * This program is distributed in the hope that it will be useful, | 15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * General Public License for more details. | 18 | * General Public License for more details. |
19 | * | 19 | * |
20 | * You should have received a copy of the GNU General Public | 20 | * You should have received a copy of the GNU General Public |
21 | * License along with this program; if not, write to the | 21 | * License along with this program; if not, write to the |
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
23 | * Boston, MA 021110-1307, USA. | 23 | * Boston, MA 021110-1307, USA. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/capability.h> | 26 | #include <linux/capability.h> |
27 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
31 | #include <linux/pagemap.h> | 31 | #include <linux/pagemap.h> |
32 | #include <linux/uio.h> | 32 | #include <linux/uio.h> |
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/splice.h> | 34 | #include <linux/splice.h> |
35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
36 | #include <linux/writeback.h> | 36 | #include <linux/writeback.h> |
37 | 37 | ||
38 | #define MLOG_MASK_PREFIX ML_INODE | 38 | #define MLOG_MASK_PREFIX ML_INODE |
39 | #include <cluster/masklog.h> | 39 | #include <cluster/masklog.h> |
40 | 40 | ||
41 | #include "ocfs2.h" | 41 | #include "ocfs2.h" |
42 | 42 | ||
43 | #include "alloc.h" | 43 | #include "alloc.h" |
44 | #include "aops.h" | 44 | #include "aops.h" |
45 | #include "dir.h" | 45 | #include "dir.h" |
46 | #include "dlmglue.h" | 46 | #include "dlmglue.h" |
47 | #include "extent_map.h" | 47 | #include "extent_map.h" |
48 | #include "file.h" | 48 | #include "file.h" |
49 | #include "sysfile.h" | 49 | #include "sysfile.h" |
50 | #include "inode.h" | 50 | #include "inode.h" |
51 | #include "ioctl.h" | 51 | #include "ioctl.h" |
52 | #include "journal.h" | 52 | #include "journal.h" |
53 | #include "mmap.h" | 53 | #include "mmap.h" |
54 | #include "suballoc.h" | 54 | #include "suballoc.h" |
55 | #include "super.h" | 55 | #include "super.h" |
56 | 56 | ||
57 | #include "buffer_head_io.h" | 57 | #include "buffer_head_io.h" |
58 | 58 | ||
59 | static int ocfs2_sync_inode(struct inode *inode) | 59 | static int ocfs2_sync_inode(struct inode *inode) |
60 | { | 60 | { |
61 | filemap_fdatawrite(inode->i_mapping); | 61 | filemap_fdatawrite(inode->i_mapping); |
62 | return sync_mapping_buffers(inode->i_mapping); | 62 | return sync_mapping_buffers(inode->i_mapping); |
63 | } | 63 | } |
64 | 64 | ||
65 | static int ocfs2_file_open(struct inode *inode, struct file *file) | 65 | static int ocfs2_file_open(struct inode *inode, struct file *file) |
66 | { | 66 | { |
67 | int status; | 67 | int status; |
68 | int mode = file->f_flags; | 68 | int mode = file->f_flags; |
69 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 69 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
70 | 70 | ||
71 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 71 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
72 | file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); | 72 | file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); |
73 | 73 | ||
74 | spin_lock(&oi->ip_lock); | 74 | spin_lock(&oi->ip_lock); |
75 | 75 | ||
76 | /* Check that the inode hasn't been wiped from disk by another | 76 | /* Check that the inode hasn't been wiped from disk by another |
77 | * node. If it hasn't then we're safe as long as we hold the | 77 | * node. If it hasn't then we're safe as long as we hold the |
78 | * spin lock until our increment of open count. */ | 78 | * spin lock until our increment of open count. */ |
79 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | 79 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { |
80 | spin_unlock(&oi->ip_lock); | 80 | spin_unlock(&oi->ip_lock); |
81 | 81 | ||
82 | status = -ENOENT; | 82 | status = -ENOENT; |
83 | goto leave; | 83 | goto leave; |
84 | } | 84 | } |
85 | 85 | ||
86 | if (mode & O_DIRECT) | 86 | if (mode & O_DIRECT) |
87 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; | 87 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; |
88 | 88 | ||
89 | oi->ip_open_count++; | 89 | oi->ip_open_count++; |
90 | spin_unlock(&oi->ip_lock); | 90 | spin_unlock(&oi->ip_lock); |
91 | status = 0; | 91 | status = 0; |
92 | leave: | 92 | leave: |
93 | mlog_exit(status); | 93 | mlog_exit(status); |
94 | return status; | 94 | return status; |
95 | } | 95 | } |
96 | 96 | ||
97 | static int ocfs2_file_release(struct inode *inode, struct file *file) | 97 | static int ocfs2_file_release(struct inode *inode, struct file *file) |
98 | { | 98 | { |
99 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 99 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
100 | 100 | ||
101 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 101 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
102 | file->f_path.dentry->d_name.len, | 102 | file->f_path.dentry->d_name.len, |
103 | file->f_path.dentry->d_name.name); | 103 | file->f_path.dentry->d_name.name); |
104 | 104 | ||
105 | spin_lock(&oi->ip_lock); | 105 | spin_lock(&oi->ip_lock); |
106 | if (!--oi->ip_open_count) | 106 | if (!--oi->ip_open_count) |
107 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; | 107 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; |
108 | spin_unlock(&oi->ip_lock); | 108 | spin_unlock(&oi->ip_lock); |
109 | 109 | ||
110 | mlog_exit(0); | 110 | mlog_exit(0); |
111 | 111 | ||
112 | return 0; | 112 | return 0; |
113 | } | 113 | } |
114 | 114 | ||
115 | static int ocfs2_sync_file(struct file *file, | 115 | static int ocfs2_sync_file(struct file *file, |
116 | struct dentry *dentry, | 116 | struct dentry *dentry, |
117 | int datasync) | 117 | int datasync) |
118 | { | 118 | { |
119 | int err = 0; | 119 | int err = 0; |
120 | journal_t *journal; | 120 | journal_t *journal; |
121 | struct inode *inode = dentry->d_inode; | 121 | struct inode *inode = dentry->d_inode; |
122 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 122 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
123 | 123 | ||
124 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, | 124 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, |
125 | dentry->d_name.len, dentry->d_name.name); | 125 | dentry->d_name.len, dentry->d_name.name); |
126 | 126 | ||
127 | err = ocfs2_sync_inode(dentry->d_inode); | 127 | err = ocfs2_sync_inode(dentry->d_inode); |
128 | if (err) | 128 | if (err) |
129 | goto bail; | 129 | goto bail; |
130 | 130 | ||
131 | journal = osb->journal->j_journal; | 131 | journal = osb->journal->j_journal; |
132 | err = journal_force_commit(journal); | 132 | err = journal_force_commit(journal); |
133 | 133 | ||
134 | bail: | 134 | bail: |
135 | mlog_exit(err); | 135 | mlog_exit(err); |
136 | 136 | ||
137 | return (err < 0) ? -EIO : 0; | 137 | return (err < 0) ? -EIO : 0; |
138 | } | 138 | } |
139 | 139 | ||
140 | int ocfs2_should_update_atime(struct inode *inode, | 140 | int ocfs2_should_update_atime(struct inode *inode, |
141 | struct vfsmount *vfsmnt) | 141 | struct vfsmount *vfsmnt) |
142 | { | 142 | { |
143 | struct timespec now; | 143 | struct timespec now; |
144 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 144 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
145 | 145 | ||
146 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | 146 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) |
147 | return 0; | 147 | return 0; |
148 | 148 | ||
149 | if ((inode->i_flags & S_NOATIME) || | 149 | if ((inode->i_flags & S_NOATIME) || |
150 | ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) | 150 | ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) |
151 | return 0; | 151 | return 0; |
152 | 152 | ||
153 | /* | 153 | /* |
154 | * We can be called with no vfsmnt structure - NFSD will | 154 | * We can be called with no vfsmnt structure - NFSD will |
155 | * sometimes do this. | 155 | * sometimes do this. |
156 | * | 156 | * |
157 | * Note that our action here is different than touch_atime() - | 157 | * Note that our action here is different than touch_atime() - |
158 | * if we can't tell whether this is a noatime mount, then we | 158 | * if we can't tell whether this is a noatime mount, then we |
159 | * don't know whether to trust the value of s_atime_quantum. | 159 | * don't know whether to trust the value of s_atime_quantum. |
160 | */ | 160 | */ |
161 | if (vfsmnt == NULL) | 161 | if (vfsmnt == NULL) |
162 | return 0; | 162 | return 0; |
163 | 163 | ||
164 | if ((vfsmnt->mnt_flags & MNT_NOATIME) || | 164 | if ((vfsmnt->mnt_flags & MNT_NOATIME) || |
165 | ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) | 165 | ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) |
166 | return 0; | 166 | return 0; |
167 | 167 | ||
168 | if (vfsmnt->mnt_flags & MNT_RELATIME) { | 168 | if (vfsmnt->mnt_flags & MNT_RELATIME) { |
169 | if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || | 169 | if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || |
170 | (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) | 170 | (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) |
171 | return 1; | 171 | return 1; |
172 | 172 | ||
173 | return 0; | 173 | return 0; |
174 | } | 174 | } |
175 | 175 | ||
176 | now = CURRENT_TIME; | 176 | now = CURRENT_TIME; |
177 | if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) | 177 | if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) |
178 | return 0; | 178 | return 0; |
179 | else | 179 | else |
180 | return 1; | 180 | return 1; |
181 | } | 181 | } |
182 | 182 | ||
183 | int ocfs2_update_inode_atime(struct inode *inode, | 183 | int ocfs2_update_inode_atime(struct inode *inode, |
184 | struct buffer_head *bh) | 184 | struct buffer_head *bh) |
185 | { | 185 | { |
186 | int ret; | 186 | int ret; |
187 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 187 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
188 | handle_t *handle; | 188 | handle_t *handle; |
189 | 189 | ||
190 | mlog_entry_void(); | 190 | mlog_entry_void(); |
191 | 191 | ||
192 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 192 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
193 | if (handle == NULL) { | 193 | if (handle == NULL) { |
194 | ret = -ENOMEM; | 194 | ret = -ENOMEM; |
195 | mlog_errno(ret); | 195 | mlog_errno(ret); |
196 | goto out; | 196 | goto out; |
197 | } | 197 | } |
198 | 198 | ||
199 | inode->i_atime = CURRENT_TIME; | 199 | inode->i_atime = CURRENT_TIME; |
200 | ret = ocfs2_mark_inode_dirty(handle, inode, bh); | 200 | ret = ocfs2_mark_inode_dirty(handle, inode, bh); |
201 | if (ret < 0) | 201 | if (ret < 0) |
202 | mlog_errno(ret); | 202 | mlog_errno(ret); |
203 | 203 | ||
204 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | 204 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); |
205 | out: | 205 | out: |
206 | mlog_exit(ret); | 206 | mlog_exit(ret); |
207 | return ret; | 207 | return ret; |
208 | } | 208 | } |
209 | 209 | ||
210 | static int ocfs2_set_inode_size(handle_t *handle, | 210 | static int ocfs2_set_inode_size(handle_t *handle, |
211 | struct inode *inode, | 211 | struct inode *inode, |
212 | struct buffer_head *fe_bh, | 212 | struct buffer_head *fe_bh, |
213 | u64 new_i_size) | 213 | u64 new_i_size) |
214 | { | 214 | { |
215 | int status; | 215 | int status; |
216 | 216 | ||
217 | mlog_entry_void(); | 217 | mlog_entry_void(); |
218 | i_size_write(inode, new_i_size); | 218 | i_size_write(inode, new_i_size); |
219 | inode->i_blocks = ocfs2_inode_sector_count(inode); | 219 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
220 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 220 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
221 | 221 | ||
222 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 222 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
223 | if (status < 0) { | 223 | if (status < 0) { |
224 | mlog_errno(status); | 224 | mlog_errno(status); |
225 | goto bail; | 225 | goto bail; |
226 | } | 226 | } |
227 | 227 | ||
228 | bail: | 228 | bail: |
229 | mlog_exit(status); | 229 | mlog_exit(status); |
230 | return status; | 230 | return status; |
231 | } | 231 | } |
232 | 232 | ||
233 | static int ocfs2_simple_size_update(struct inode *inode, | 233 | static int ocfs2_simple_size_update(struct inode *inode, |
234 | struct buffer_head *di_bh, | 234 | struct buffer_head *di_bh, |
235 | u64 new_i_size) | 235 | u64 new_i_size) |
236 | { | 236 | { |
237 | int ret; | 237 | int ret; |
238 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 238 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
239 | handle_t *handle = NULL; | 239 | handle_t *handle = NULL; |
240 | 240 | ||
241 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 241 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
242 | if (handle == NULL) { | 242 | if (handle == NULL) { |
243 | ret = -ENOMEM; | 243 | ret = -ENOMEM; |
244 | mlog_errno(ret); | 244 | mlog_errno(ret); |
245 | goto out; | 245 | goto out; |
246 | } | 246 | } |
247 | 247 | ||
248 | ret = ocfs2_set_inode_size(handle, inode, di_bh, | 248 | ret = ocfs2_set_inode_size(handle, inode, di_bh, |
249 | new_i_size); | 249 | new_i_size); |
250 | if (ret < 0) | 250 | if (ret < 0) |
251 | mlog_errno(ret); | 251 | mlog_errno(ret); |
252 | 252 | ||
253 | ocfs2_commit_trans(osb, handle); | 253 | ocfs2_commit_trans(osb, handle); |
254 | out: | 254 | out: |
255 | return ret; | 255 | return ret; |
256 | } | 256 | } |
257 | 257 | ||
/*
 * Record the new (smaller) size on disk ahead of the real truncate.
 *
 * Within one transaction: zero the tail of the last remaining page,
 * then update both the in-memory inode and the on-disk dinode in
 * @fe_bh with the new size and c/mtime. Zeroing must precede the
 * i_size update (see comment below).
 *
 * NOTE: despite the name, inserting the inode into the orphan dir is
 * still a TODO here - crash recovery of a half-finished truncate
 * depends on that being added.
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				     struct inode *inode,
				     struct buffer_head *fe_bh,
				     u64 new_i_size)
{
	int status;
	handle_t *handle;
	struct ocfs2_dinode *di;

	mlog_entry_void();

	/* TODO: This needs to actually orphan the inode in this
	 * transaction. */

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out;
	}

	/* Declare our intent to write the dinode buffer to the journal. */
	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	/*
	 * Do this before setting i_size.
	 */
	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	/* Mirror the size change into both the VFS inode and the dinode. */
	i_size_write(inode, new_i_size);
	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	di = (struct ocfs2_dinode *) fe_bh->b_data;
	di->i_size = cpu_to_le64(new_i_size);
	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0)
		mlog_errno(status);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:

	mlog_exit(status);
	return status;
}
315 | 315 | ||
/*
 * Truncate @inode down to @new_i_size.
 *
 * Caller supplies the dinode buffer @di_bh and (presumably - the
 * callers are outside this view, confirm) holds the meta cluster lock.
 * Lock order here: ip_alloc_sem (write), then the data cluster lock.
 * Taking the data lock exclusively forces other nodes to sync and drop
 * their cached pages before we shrink the allocation.
 *
 * Returns 0 on success (including the no-op same-size case), negative
 * error code otherwise.
 */
static int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
{
	int status = 0;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_truncate_context *tc = NULL;

	mlog_entry("(inode = %llu, new_i_size = %llu\n",
		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
		   (unsigned long long)new_i_size);

	fe = (struct ocfs2_dinode *) di_bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(fe)) {
		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
		status = -EIO;
		goto bail;
	}

	/* In-memory and on-disk sizes must already agree at this point. */
	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
			"Inode %llu, inode i_size = %lld != di "
			"i_size = %llu, i_flags = 0x%x\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)le64_to_cpu(fe->i_size),
			le32_to_cpu(fe->i_flags));

	/* This is a shrink-only path; growing is rejected. */
	if (new_i_size > le64_to_cpu(fe->i_size)) {
		mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
		     (unsigned long long)le64_to_cpu(fe->i_size),
		     (unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
	     (unsigned long long)le64_to_cpu(fe->i_blkno),
	     (unsigned long long)le64_to_cpu(fe->i_size),
	     (unsigned long long)new_i_size);

	/* lets handle the simple truncate cases before doing any more
	 * cluster locking. */
	if (new_i_size == le64_to_cpu(fe->i_size))
		goto bail;

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	/* This forces other nodes to sync and drop their pages. Do
	 * this even if we have a truncate without allocation change -
	 * ocfs2 cluster sizes can be much greater than page size, so
	 * we have to truncate them anyway. */
	status = ocfs2_data_lock(inode, 1);
	if (status < 0) {
		up_write(&OCFS2_I(inode)->ip_alloc_sem);

		mlog_errno(status);
		goto bail;
	}

	/* Drop local mappings/pages beyond the new size before freeing
	 * the underlying allocation. */
	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(inode->i_mapping, new_i_size);

	/* alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
	 * truncate if necessary. This does the task of marking
	 * i_size. */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_data;
	}

	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_data;
	}

	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_data;
	}

	/* TODO: orphan dir cleanup here. */
bail_unlock_data:
	ocfs2_data_unlock(inode, 1);

	up_write(&OCFS2_I(inode)->ip_alloc_sem);

bail:

	mlog_exit(status);
	return status;
}
413 | 413 | ||
/*
 * extend allocation only here.
 * we'll update all the disk stuff, and oip->alloc_size
 *
 * expect stuff to be locked, a transaction started and enough data /
 * metadata reservations in the contexts.
 *
 * Will return -EAGAIN, and a reason if a restart is needed.
 * If passed in, *reason will always be set, even in error.
 */
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
			       struct inode *inode,
			       u32 *logical_offset,
			       u32 clusters_to_add,
			       struct buffer_head *fe_bh,
			       handle_t *handle,
			       struct ocfs2_alloc_context *data_ac,
			       struct ocfs2_alloc_context *meta_ac,
			       enum ocfs2_alloc_restarted *reason_ret)
{
	int status = 0;
	int free_extents;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	enum ocfs2_alloc_restarted reason = RESTART_NONE;
	u32 bit_off, num_bits;
	u64 block;

	/* Callers must only ask for a real extension; a zero-cluster
	 * request indicates a logic error upstream. */
	BUG_ON(!clusters_to_add);

	free_extents = ocfs2_num_free_extents(osb, inode, fe);
	if (free_extents < 0) {
		status = free_extents;
		mlog_errno(status);
		goto leave;
	}

	/* there are two cases which could cause us to EAGAIN in the
	 * we-need-more-metadata case:
	 * 1) we haven't reserved *any*
	 * 2) we are so fragmented, we've needed to add metadata too
	 * many times. */
	if (!free_extents && !meta_ac) {
		mlog(0, "we haven't reserved any metadata!\n");
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	} else if ((!free_extents)
		   && (ocfs2_alloc_context_bits_left(meta_ac)
		       < ocfs2_extend_meta_needed(fe))) {
		mlog(0, "filesystem is really fragmented...\n");
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	}

	/* Claim a contiguous run of clusters.  num_bits may come back
	 * smaller than clusters_to_add -- the shortfall is handled by
	 * the RESTART_TRANS logic at the bottom.  (The '1' is presumably
	 * the minimum acceptable claim -- confirm against
	 * ocfs2_claim_clusters()'s prototype.) */
	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
				      &bit_off, &num_bits);
	if (status < 0) {
		/* -ENOSPC is an expected outcome, not worth logging. */
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	BUG_ON(num_bits > clusters_to_add);

	/* reserve our write early -- insert_extent may update the inode */
	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
	/* Record the new extent in the inode's btree at *logical_offset. */
	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
				     *logical_offset, block, num_bits,
				     meta_ac);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* Account for what we actually got; a partial claim leaves
	 * clusters_to_add non-zero and triggers a restart below. */
	clusters_to_add -= num_bits;
	*logical_offset += num_bits;

	if (clusters_to_add) {
		mlog(0, "need to alloc once more, clusters = %u, wanted = "
		     "%u\n", fe->i_clusters, clusters_to_add);
		/* Tell the caller to extend/restart the transaction and
		 * call us again for the remainder. */
		status = -EAGAIN;
		reason = RESTART_TRANS;
	}

leave:
	mlog_exit(status);
	/* Per the contract above: *reason_ret is set even on error. */
	if (reason_ret)
		*reason_ret = reason;
	return status;
}
520 | 520 | ||
521 | /* | 521 | /* |
522 | * For a given allocation, determine which allocators will need to be | 522 | * For a given allocation, determine which allocators will need to be |
523 | * accessed, and lock them, reserving the appropriate number of bits. | 523 | * accessed, and lock them, reserving the appropriate number of bits. |
524 | * | 524 | * |
525 | * Called from ocfs2_extend_allocation() for file systems which don't | 525 | * Called from ocfs2_extend_allocation() for file systems which don't |
526 | * support holes, and from ocfs2_write() for file systems which | 526 | * support holes, and from ocfs2_write() for file systems which |
527 | * understand sparse inodes. | 527 | * understand sparse inodes. |
528 | */ | 528 | */ |
529 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | 529 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, |
530 | u32 clusters_to_add, | 530 | u32 clusters_to_add, |
531 | struct ocfs2_alloc_context **data_ac, | 531 | struct ocfs2_alloc_context **data_ac, |
532 | struct ocfs2_alloc_context **meta_ac) | 532 | struct ocfs2_alloc_context **meta_ac) |
533 | { | 533 | { |
534 | int ret, num_free_extents; | 534 | int ret, num_free_extents; |
535 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 535 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
536 | 536 | ||
537 | *meta_ac = NULL; | 537 | *meta_ac = NULL; |
538 | *data_ac = NULL; | 538 | *data_ac = NULL; |
539 | 539 | ||
540 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " | 540 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " |
541 | "clusters_to_add = %u\n", | 541 | "clusters_to_add = %u\n", |
542 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | 542 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), |
543 | le32_to_cpu(di->i_clusters), clusters_to_add); | 543 | le32_to_cpu(di->i_clusters), clusters_to_add); |
544 | 544 | ||
545 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); | 545 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); |
546 | if (num_free_extents < 0) { | 546 | if (num_free_extents < 0) { |
547 | ret = num_free_extents; | 547 | ret = num_free_extents; |
548 | mlog_errno(ret); | 548 | mlog_errno(ret); |
549 | goto out; | 549 | goto out; |
550 | } | 550 | } |
551 | 551 | ||
552 | /* | 552 | /* |
553 | * Sparse allocation file systems need to be more conservative | 553 | * Sparse allocation file systems need to be more conservative |
554 | * with reserving room for expansion - the actual allocation | 554 | * with reserving room for expansion - the actual allocation |
555 | * happens while we've got a journal handle open so re-taking | 555 | * happens while we've got a journal handle open so re-taking |
556 | * a cluster lock (because we ran out of room for another | 556 | * a cluster lock (because we ran out of room for another |
557 | * extent) will violate ordering rules. | 557 | * extent) will violate ordering rules. |
558 | * | 558 | * |
559 | * Most of the time we'll only be seeing this 1 cluster at a time | 559 | * Most of the time we'll only be seeing this 1 cluster at a time |
560 | * anyway. | 560 | * anyway. |
561 | */ | 561 | */ |
562 | if (!num_free_extents || | 562 | if (!num_free_extents || |
563 | (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { | 563 | (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { |
564 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); | 564 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); |
565 | if (ret < 0) { | 565 | if (ret < 0) { |
566 | if (ret != -ENOSPC) | 566 | if (ret != -ENOSPC) |
567 | mlog_errno(ret); | 567 | mlog_errno(ret); |
568 | goto out; | 568 | goto out; |
569 | } | 569 | } |
570 | } | 570 | } |
571 | 571 | ||
572 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); | 572 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); |
573 | if (ret < 0) { | 573 | if (ret < 0) { |
574 | if (ret != -ENOSPC) | 574 | if (ret != -ENOSPC) |
575 | mlog_errno(ret); | 575 | mlog_errno(ret); |
576 | goto out; | 576 | goto out; |
577 | } | 577 | } |
578 | 578 | ||
579 | out: | 579 | out: |
580 | if (ret) { | 580 | if (ret) { |
581 | if (*meta_ac) { | 581 | if (*meta_ac) { |
582 | ocfs2_free_alloc_context(*meta_ac); | 582 | ocfs2_free_alloc_context(*meta_ac); |
583 | *meta_ac = NULL; | 583 | *meta_ac = NULL; |
584 | } | 584 | } |
585 | 585 | ||
586 | /* | 586 | /* |
587 | * We cannot have an error and a non null *data_ac. | 587 | * We cannot have an error and a non null *data_ac. |
588 | */ | 588 | */ |
589 | } | 589 | } |
590 | 590 | ||
591 | return ret; | 591 | return ret; |
592 | } | 592 | } |
593 | 593 | ||
/*
 * Grow @inode's on-disk allocation by @clusters_to_add clusters.
 *
 * Only for file systems without sparse file support (BUG_ON below) --
 * the file must stay fully allocated out to its size.  Relies on the
 * caller holding i_mutex (see the comment at restart_all) and takes
 * ip_alloc_sem itself.  Handles both restart flavors reported by
 * ocfs2_do_extend_allocation(): RESTART_TRANS extends the running
 * transaction in place, RESTART_META tears everything down and starts
 * the whole function over to reserve more metadata.
 */
static int ocfs2_extend_allocation(struct inode *inode,
				   u32 clusters_to_add)
{
	int status = 0;
	int restart_func = 0;
	int drop_alloc_sem = 0;
	int credits;
	u32 prev_clusters, logical_start;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

	/*
	 * This function only exists for file systems which don't
	 * support holes.
	 */
	BUG_ON(ocfs2_sparse_alloc(osb));

	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
				  OCFS2_BH_CACHED, inode);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	fe = (struct ocfs2_dinode *) bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(fe)) {
		/* Corrupt inode -- force the fs read-only. */
		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
		status = -EIO;
		goto leave;
	}

	/* New extents get appended at the current end of allocation. */
	logical_start = OCFS2_I(inode)->ip_clusters;

restart_all:
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	/* blocks people in read/write from reading our allocation
	 * until we're done changing it. We depend on i_mutex to block
	 * other extend/truncate calls while we're here. Ordering wrt
	 * start_trans is important here -- always do it before! */
	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	drop_alloc_sem = 1;

	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
				       &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	/* reserve a write to the file entry early on - so that if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	status = ocfs2_do_extend_allocation(osb,
					    inode,
					    &logical_start,
					    clusters_to_add,
					    bh,
					    handle,
					    data_ac,
					    meta_ac,
					    &why);
	/* -EAGAIN means a restart is requested via 'why'; anything
	 * else negative is a hard failure. */
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* ip_clusters may have advanced by a partial allocation even
	 * when a restart was requested -- only ask for the remainder. */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			/* Out of metadata reservation: commit, drop all
			 * locks/contexts at 'leave', then rerun from
			 * restart_all (restart_func set, status == 0). */
			mlog(0, "restarting function.\n");
			restart_func = 1;
		} else {
			BUG_ON(why != RESTART_TRANS);

			mlog(0, "restarting transaction.\n");
			/* TODO: This can be more intelligent. */
			credits = ocfs2_calc_extend_credits(osb->sb,
							    fe,
							    clusters_to_add);
			status = ocfs2_extend_trans(handle, credits);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size));
	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));

leave:
	/* Common unwind path; also serves the RESTART_META loop. */
	if (drop_alloc_sem) {
		up_write(&OCFS2_I(inode)->ip_alloc_sem);
		drop_alloc_sem = 0;
	}
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	if (bh) {
		brelse(bh);
		bh = NULL;
	}

	mlog_exit(status);
	return status;
}
757 | 757 | ||
/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->prepare_write() and
 * ->commit_write().
 *
 * Zeroes the block containing byte offset @size by driving an empty
 * (from == to) prepare/commit write cycle against its page.
 * NOTE(review): the zeroing itself presumably happens inside
 * ocfs2_prepare_write_nolock() when it instantiates new buffers --
 * the from == to range writes no user bytes.  Never updates i_size. */
static int ocfs2_write_zero_page(struct inode *inode,
				 u64 size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index;
	unsigned int offset;
	handle_t *handle = NULL;
	int ret;

	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
	/* ugh. in prepare/commit_write, if from==to==start of block, we
	** skip the prepare. make sure we never send an offset for the start
	** of a block
	*/
	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
		/* Nudging by one byte keeps us inside the same block. */
		offset++;
	}
	index = size >> PAGE_CACHE_SHIFT;

	/* Find-or-create the page; returned locked, or NULL on OOM. */
	page = grab_cache_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Empty range: from == to == offset.  See block comment above. */
	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* data=ordered mounts need the page's buffers tied to a
	 * transaction before commit. */
	if (ocfs2_should_order_data(inode)) {
		handle = ocfs2_start_walk_page_trans(inode, page, offset,
						     offset);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			handle = NULL;
			goto out_unlock;
		}
	}

	/* must not update i_size! */
	ret = block_commit_write(page, offset, offset);
	if (ret < 0)
		mlog_errno(ret);
	else
		ret = 0;

	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}
820 | 820 | ||
821 | static int ocfs2_zero_extend(struct inode *inode, | 821 | static int ocfs2_zero_extend(struct inode *inode, |
822 | u64 zero_to_size) | 822 | u64 zero_to_size) |
823 | { | 823 | { |
824 | int ret = 0; | 824 | int ret = 0; |
825 | u64 start_off; | 825 | u64 start_off; |
826 | struct super_block *sb = inode->i_sb; | 826 | struct super_block *sb = inode->i_sb; |
827 | 827 | ||
828 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | 828 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); |
829 | while (start_off < zero_to_size) { | 829 | while (start_off < zero_to_size) { |
830 | ret = ocfs2_write_zero_page(inode, start_off); | 830 | ret = ocfs2_write_zero_page(inode, start_off); |
831 | if (ret < 0) { | 831 | if (ret < 0) { |
832 | mlog_errno(ret); | 832 | mlog_errno(ret); |
833 | goto out; | 833 | goto out; |
834 | } | 834 | } |
835 | 835 | ||
836 | start_off += sb->s_blocksize; | 836 | start_off += sb->s_blocksize; |
837 | 837 | ||
838 | /* | 838 | /* |
839 | * Very large extends have the potential to lock up | 839 | * Very large extends have the potential to lock up |
840 | * the cpu for extended periods of time. | 840 | * the cpu for extended periods of time. |
841 | */ | 841 | */ |
842 | cond_resched(); | 842 | cond_resched(); |
843 | } | 843 | } |
844 | 844 | ||
845 | out: | 845 | out: |
846 | return ret; | 846 | return ret; |
847 | } | 847 | } |
848 | 848 | ||
/*
 * A tail_to_skip value > 0 indicates that we're being called from
 * ocfs2_file_aio_write(). This has the following implications:
 *
 * - we don't want to update i_size
 * - di_bh will be NULL, which is fine because it's only used in the
 *   case where we want to update i_size.
 * - ocfs2_zero_extend() will then only be filling the hole created
 *   between i_size and the start of the write.
 */
static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size,
			     size_t tail_to_skip)
{
	int ret = 0;
	u32 clusters_to_add = 0;

	/* Per the rules above: setattr callers (tail_to_skip == 0)
	 * must supply the dinode buffer. */
	BUG_ON(!tail_to_skip && !di_bh);

	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
		goto out;
	/* Shrinking is the truncate path's job, never ours. */
	BUG_ON(new_i_size < i_size_read(inode));

	/* Sparse file systems grow lazily: just move i_size, no
	 * allocation and no zeroing needed here. */
	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
		BUG_ON(tail_to_skip != 0);
		goto out_update_size;
	}

	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
		OCFS2_I(inode)->ip_clusters;

	/*
	 * protect the pages that ocfs2_zero_extend is going to be
	 * pulling into the page cache.. we do this before the
	 * metadata extend so that we don't get into the situation
	 * where we've extended the metadata but can't get the data
	 * lock to zero.
	 */
	ret = ocfs2_data_lock(inode, 1);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	if (clusters_to_add) {
		ret = ocfs2_extend_allocation(inode, clusters_to_add);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_unlock;
		}
	}

	/*
	 * Call this even if we don't add any clusters to the tree. We
	 * still need to zero the area between the old i_size and the
	 * new i_size.
	 */
	ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

out_update_size:
	if (!tail_to_skip) {
		/* We're being called from ocfs2_setattr() which wants
		 * us to update i_size */
		ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
		if (ret < 0)
			mlog_errno(ret);
	}

out_unlock:
	/* Sparse path jumps straight to out_update_size and never
	 * took the data lock -- don't unlock what we don't hold. */
	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ocfs2_data_unlock(inode, 1);

out:
	return ret;
}
933 | 933 | ||
/*
 * ->setattr() for ocfs2 inodes.
 *
 * Lock ordering here is rw_lock (size changes only) -> meta lock ->
 * journal transaction.  Size changes are performed explicitly via
 * ocfs2_truncate_file()/ocfs2_extend_file() *before* inode_setattr()
 * runs, so the generic vmtruncate() path is never taken (see the
 * comment above the inode_setattr() call).
 */
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	int status = 0, size_change;
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
	handle_t *handle = NULL;

	mlog_entry("(0x%p, '%.*s')\n", dentry,
	           dentry->d_name.len, dentry->d_name.name);

	if (attr->ia_valid & ATTR_MODE)
		mlog(0, "mode change: %d\n", attr->ia_mode);
	if (attr->ia_valid & ATTR_UID)
		mlog(0, "uid change: %d\n", attr->ia_uid);
	if (attr->ia_valid & ATTR_GID)
		mlog(0, "gid change: %d\n", attr->ia_gid);
	if (attr->ia_valid & ATTR_SIZE)
		mlog(0, "size change...\n");
	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
		mlog(0, "time change...\n");

	/* Attributes we know how to handle; anything else is quietly
	 * ignored (returning 0, not an error). */
#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
		return 0;
	}

	status = inode_change_ok(inode, attr);
	if (status)
		return status;

	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
		/* rw lock is only needed when i_size will move. */
		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Exclusive (write level) meta lock; also reads in the dinode bh. */
	status = ocfs2_meta_lock(inode, &bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail_unlock_rw;
	}

	if (size_change && attr->ia_size != i_size_read(inode)) {
		if (i_size_read(inode) > attr->ia_size)
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
		else
			status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			/* NOTE(review): this collapses every truncate/extend
			 * failure into -ENOSPC, masking e.g. -EIO from the
			 * caller -- confirm that is intentional. */
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto bail_unlock;
	}

	/*
	 * This will intentionally not wind up calling vmtruncate(),
	 * since all the work for a size change has been done above.
	 * Otherwise, we could get into problems with truncate as
	 * ip_alloc_sem is used there to protect against i_size
	 * changes.
	 */
	status = inode_setattr(inode, attr);
	if (status < 0) {
		mlog_errno(status);
		goto bail_commit;
	}

	/* Push the updated in-core inode out to the journaled dinode. */
	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	if (status < 0)
		mlog_errno(status);

bail_commit:
	ocfs2_commit_trans(osb, handle);
bail_unlock:
	ocfs2_meta_unlock(inode, 1);
bail_unlock_rw:
	if (size_change)
		ocfs2_rw_unlock(inode, 1);
bail:
	if (bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}
1028 | 1035 | ||
1029 | int ocfs2_getattr(struct vfsmount *mnt, | 1036 | int ocfs2_getattr(struct vfsmount *mnt, |
1030 | struct dentry *dentry, | 1037 | struct dentry *dentry, |
1031 | struct kstat *stat) | 1038 | struct kstat *stat) |
1032 | { | 1039 | { |
1033 | struct inode *inode = dentry->d_inode; | 1040 | struct inode *inode = dentry->d_inode; |
1034 | struct super_block *sb = dentry->d_inode->i_sb; | 1041 | struct super_block *sb = dentry->d_inode->i_sb; |
1035 | struct ocfs2_super *osb = sb->s_fs_info; | 1042 | struct ocfs2_super *osb = sb->s_fs_info; |
1036 | int err; | 1043 | int err; |
1037 | 1044 | ||
1038 | mlog_entry_void(); | 1045 | mlog_entry_void(); |
1039 | 1046 | ||
1040 | err = ocfs2_inode_revalidate(dentry); | 1047 | err = ocfs2_inode_revalidate(dentry); |
1041 | if (err) { | 1048 | if (err) { |
1042 | if (err != -ENOENT) | 1049 | if (err != -ENOENT) |
1043 | mlog_errno(err); | 1050 | mlog_errno(err); |
1044 | goto bail; | 1051 | goto bail; |
1045 | } | 1052 | } |
1046 | 1053 | ||
1047 | generic_fillattr(inode, stat); | 1054 | generic_fillattr(inode, stat); |
1048 | 1055 | ||
1049 | /* We set the blksize from the cluster size for performance */ | 1056 | /* We set the blksize from the cluster size for performance */ |
1050 | stat->blksize = osb->s_clustersize; | 1057 | stat->blksize = osb->s_clustersize; |
1051 | 1058 | ||
1052 | bail: | 1059 | bail: |
1053 | mlog_exit(err); | 1060 | mlog_exit(err); |
1054 | 1061 | ||
1055 | return err; | 1062 | return err; |
1056 | } | 1063 | } |
1057 | 1064 | ||
1058 | int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | 1065 | int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) |
1059 | { | 1066 | { |
1060 | int ret; | 1067 | int ret; |
1061 | 1068 | ||
1062 | mlog_entry_void(); | 1069 | mlog_entry_void(); |
1063 | 1070 | ||
1064 | ret = ocfs2_meta_lock(inode, NULL, 0); | 1071 | ret = ocfs2_meta_lock(inode, NULL, 0); |
1065 | if (ret) { | 1072 | if (ret) { |
1066 | if (ret != -ENOENT) | 1073 | if (ret != -ENOENT) |
1067 | mlog_errno(ret); | 1074 | mlog_errno(ret); |
1068 | goto out; | 1075 | goto out; |
1069 | } | 1076 | } |
1070 | 1077 | ||
1071 | ret = generic_permission(inode, mask, NULL); | 1078 | ret = generic_permission(inode, mask, NULL); |
1072 | 1079 | ||
1073 | ocfs2_meta_unlock(inode, 0); | 1080 | ocfs2_meta_unlock(inode, 0); |
1074 | out: | 1081 | out: |
1075 | mlog_exit(ret); | 1082 | mlog_exit(ret); |
1076 | return ret; | 1083 | return ret; |
1077 | } | 1084 | } |
1078 | 1085 | ||
1079 | static int ocfs2_write_remove_suid(struct inode *inode) | 1086 | static int ocfs2_write_remove_suid(struct inode *inode) |
1080 | { | 1087 | { |
1081 | int ret; | 1088 | int ret; |
1082 | struct buffer_head *bh = NULL; | 1089 | struct buffer_head *bh = NULL; |
1083 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 1090 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
1084 | handle_t *handle; | 1091 | handle_t *handle; |
1085 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1092 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1086 | struct ocfs2_dinode *di; | 1093 | struct ocfs2_dinode *di; |
1087 | 1094 | ||
1088 | mlog_entry("(Inode %llu, mode 0%o)\n", | 1095 | mlog_entry("(Inode %llu, mode 0%o)\n", |
1089 | (unsigned long long)oi->ip_blkno, inode->i_mode); | 1096 | (unsigned long long)oi->ip_blkno, inode->i_mode); |
1090 | 1097 | ||
1091 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 1098 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
1092 | if (handle == NULL) { | 1099 | if (handle == NULL) { |
1093 | ret = -ENOMEM; | 1100 | ret = -ENOMEM; |
1094 | mlog_errno(ret); | 1101 | mlog_errno(ret); |
1095 | goto out; | 1102 | goto out; |
1096 | } | 1103 | } |
1097 | 1104 | ||
1098 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); | 1105 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); |
1099 | if (ret < 0) { | 1106 | if (ret < 0) { |
1100 | mlog_errno(ret); | 1107 | mlog_errno(ret); |
1101 | goto out_trans; | 1108 | goto out_trans; |
1102 | } | 1109 | } |
1103 | 1110 | ||
1104 | ret = ocfs2_journal_access(handle, inode, bh, | 1111 | ret = ocfs2_journal_access(handle, inode, bh, |
1105 | OCFS2_JOURNAL_ACCESS_WRITE); | 1112 | OCFS2_JOURNAL_ACCESS_WRITE); |
1106 | if (ret < 0) { | 1113 | if (ret < 0) { |
1107 | mlog_errno(ret); | 1114 | mlog_errno(ret); |
1108 | goto out_bh; | 1115 | goto out_bh; |
1109 | } | 1116 | } |
1110 | 1117 | ||
1111 | inode->i_mode &= ~S_ISUID; | 1118 | inode->i_mode &= ~S_ISUID; |
1112 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) | 1119 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) |
1113 | inode->i_mode &= ~S_ISGID; | 1120 | inode->i_mode &= ~S_ISGID; |
1114 | 1121 | ||
1115 | di = (struct ocfs2_dinode *) bh->b_data; | 1122 | di = (struct ocfs2_dinode *) bh->b_data; |
1116 | di->i_mode = cpu_to_le16(inode->i_mode); | 1123 | di->i_mode = cpu_to_le16(inode->i_mode); |
1117 | 1124 | ||
1118 | ret = ocfs2_journal_dirty(handle, bh); | 1125 | ret = ocfs2_journal_dirty(handle, bh); |
1119 | if (ret < 0) | 1126 | if (ret < 0) |
1120 | mlog_errno(ret); | 1127 | mlog_errno(ret); |
1121 | out_bh: | 1128 | out_bh: |
1122 | brelse(bh); | 1129 | brelse(bh); |
1123 | out_trans: | 1130 | out_trans: |
1124 | ocfs2_commit_trans(osb, handle); | 1131 | ocfs2_commit_trans(osb, handle); |
1125 | out: | 1132 | out: |
1126 | mlog_exit(ret); | 1133 | mlog_exit(ret); |
1127 | return ret; | 1134 | return ret; |
1128 | } | 1135 | } |
1129 | 1136 | ||
1130 | /* | 1137 | /* |
1131 | * Will look for holes and unwritten extents in the range starting at | 1138 | * Will look for holes and unwritten extents in the range starting at |
1132 | * pos for count bytes (inclusive). | 1139 | * pos for count bytes (inclusive). |
1133 | */ | 1140 | */ |
1134 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | 1141 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, |
1135 | size_t count) | 1142 | size_t count) |
1136 | { | 1143 | { |
1137 | int ret = 0; | 1144 | int ret = 0; |
1138 | unsigned int extent_flags; | 1145 | unsigned int extent_flags; |
1139 | u32 cpos, clusters, extent_len, phys_cpos; | 1146 | u32 cpos, clusters, extent_len, phys_cpos; |
1140 | struct super_block *sb = inode->i_sb; | 1147 | struct super_block *sb = inode->i_sb; |
1141 | 1148 | ||
1142 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | 1149 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; |
1143 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | 1150 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; |
1144 | 1151 | ||
1145 | while (clusters) { | 1152 | while (clusters) { |
1146 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | 1153 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, |
1147 | &extent_flags); | 1154 | &extent_flags); |
1148 | if (ret < 0) { | 1155 | if (ret < 0) { |
1149 | mlog_errno(ret); | 1156 | mlog_errno(ret); |
1150 | goto out; | 1157 | goto out; |
1151 | } | 1158 | } |
1152 | 1159 | ||
1153 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | 1160 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { |
1154 | ret = 1; | 1161 | ret = 1; |
1155 | break; | 1162 | break; |
1156 | } | 1163 | } |
1157 | 1164 | ||
1158 | if (extent_len > clusters) | 1165 | if (extent_len > clusters) |
1159 | extent_len = clusters; | 1166 | extent_len = clusters; |
1160 | 1167 | ||
1161 | clusters -= extent_len; | 1168 | clusters -= extent_len; |
1162 | cpos += extent_len; | 1169 | cpos += extent_len; |
1163 | } | 1170 | } |
1164 | out: | 1171 | out: |
1165 | return ret; | 1172 | return ret; |
1166 | } | 1173 | } |
1167 | 1174 | ||
/*
 * Pre-write setup: take the meta lock at the right level, clear
 * suid/sgid when needed, resolve the real write position for
 * O_APPEND, and on non-sparse file systems pre-allocate for an
 * extending write.  For O_DIRECT callers, *direct_io may be cleared
 * when the write cannot safely proceed unbuffered (extending write,
 * or holes/unwritten extents in the range).
 *
 * The meta lock is dropped again before returning (out_unlock).
 * Returns 0 on success, negative errno on failure.
 */
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
					 loff_t *ppos,
					 size_t count,
					 int appending,
					 int *direct_io)
{
	/* Appending writes need the exclusive lock from the start,
	 * hence meta_level starts at `appending'. */
	int ret = 0, meta_level = appending;
	struct inode *inode = dentry->d_inode;
	u32 clusters;
	loff_t newsize, saved_pos;

	/*
	 * We sample i_size under a read level meta lock to see if our write
	 * is extending the file, if it is we back off and get a write level
	 * meta lock.
	 */
	for(;;) {
		ret = ocfs2_meta_lock(inode, NULL, meta_level);
		if (ret < 0) {
			/* -1 so out_unlock is never reached with no lock held. */
			meta_level = -1;
			mlog_errno(ret);
			goto out;
		}

		/* Clear suid / sgid if necessary. We do this here
		 * instead of later in the write path because
		 * remove_suid() calls ->setattr without any hint that
		 * we may have already done our cluster locking. Since
		 * ocfs2_setattr() *must* take cluster locks to
		 * proceeed, this will lead us to recursively lock the
		 * inode. There's also the dinode i_size state which
		 * can be lost via setattr during extending writes (we
		 * set inode->i_size at the end of a write. */
		if (should_remove_suid(dentry)) {
			if (meta_level == 0) {
				/* Need the exclusive lock -- retake and retry. */
				ocfs2_meta_unlock(inode, meta_level);
				meta_level = 1;
				continue;
			}

			ret = ocfs2_write_remove_suid(inode);
			if (ret < 0) {
				mlog_errno(ret);
				goto out_unlock;
			}
		}

		/* work on a copy of ppos until we're sure that we won't have
		 * to recalculate it due to relocking. */
		if (appending) {
			saved_pos = i_size_read(inode);
			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
		} else {
			saved_pos = *ppos;
		}

		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
			loff_t end = saved_pos + count;

			/*
			 * Skip the O_DIRECT checks if we don't need
			 * them.
			 */
			if (!direct_io || !(*direct_io))
				break;

			/*
			 * Allowing concurrent direct writes means
			 * i_size changes wouldn't be synchronized, so
			 * one node could wind up truncating another
			 * nodes writes.
			 */
			if (end > i_size_read(inode)) {
				*direct_io = 0;
				break;
			}

			/*
			 * We don't fill holes during direct io, so
			 * check for them here. If any are found, the
			 * caller will have to retake some cluster
			 * locks and initiate the io as buffered.
			 */
			ret = ocfs2_check_range_for_holes(inode, saved_pos,
							  count);
			if (ret == 1) {
				*direct_io = 0;
				ret = 0;
			} else if (ret < 0)
				mlog_errno(ret);
			break;
		}

		/*
		 * The rest of this loop is concerned with legacy file
		 * systems which don't support sparse files.
		 */

		newsize = count + saved_pos;

		mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
		     (long long) saved_pos, (long long) newsize,
		     (long long) i_size_read(inode));

		/* No need for a higher level metadata lock if we're
		 * never going past i_size. */
		if (newsize <= i_size_read(inode))
			break;

		if (meta_level == 0) {
			/* Extending write: upgrade to the exclusive lock. */
			ocfs2_meta_unlock(inode, meta_level);
			meta_level = 1;
			continue;
		}

		spin_lock(&OCFS2_I(inode)->ip_lock);
		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
			OCFS2_I(inode)->ip_clusters;
		spin_unlock(&OCFS2_I(inode)->ip_lock);

		mlog(0, "Writing at EOF, may need more allocation: "
		     "i_size = %lld, newsize = %lld, need %u clusters\n",
		     (long long) i_size_read(inode), (long long) newsize,
		     clusters);

		/* We only want to continue the rest of this loop if
		 * our extend will actually require more
		 * allocation. */
		if (!clusters)
			break;

		ret = ocfs2_extend_file(inode, NULL, newsize, count);
		if (ret < 0) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out_unlock;
		}
		break;
	}

	/* Safe to publish the append position now that relocking is done. */
	if (appending)
		*ppos = saved_pos;

out_unlock:
	ocfs2_meta_unlock(inode, meta_level);

out:
	return ret;
}
1317 | 1324 | ||
1318 | static inline void | 1325 | static inline void |
1319 | ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | 1326 | ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) |
1320 | { | 1327 | { |
1321 | const struct iovec *iov = *iovp; | 1328 | const struct iovec *iov = *iovp; |
1322 | size_t base = *basep; | 1329 | size_t base = *basep; |
1323 | 1330 | ||
1324 | do { | 1331 | do { |
1325 | int copy = min(bytes, iov->iov_len - base); | 1332 | int copy = min(bytes, iov->iov_len - base); |
1326 | 1333 | ||
1327 | bytes -= copy; | 1334 | bytes -= copy; |
1328 | base += copy; | 1335 | base += copy; |
1329 | if (iov->iov_len == base) { | 1336 | if (iov->iov_len == base) { |
1330 | iov++; | 1337 | iov++; |
1331 | base = 0; | 1338 | base = 0; |
1332 | } | 1339 | } |
1333 | } while (bytes); | 1340 | } while (bytes); |
1334 | *iovp = iov; | 1341 | *iovp = iov; |
1335 | *basep = base; | 1342 | *basep = base; |
1336 | } | 1343 | } |
1337 | 1344 | ||
1338 | static struct page * ocfs2_get_write_source(char **ret_src_buf, | 1345 | static struct page * ocfs2_get_write_source(char **ret_src_buf, |
1339 | const struct iovec *cur_iov, | 1346 | const struct iovec *cur_iov, |
1340 | size_t iov_offset) | 1347 | size_t iov_offset) |
1341 | { | 1348 | { |
1342 | int ret; | 1349 | int ret; |
1343 | char *buf = cur_iov->iov_base + iov_offset; | 1350 | char *buf = cur_iov->iov_base + iov_offset; |
1344 | struct page *src_page = NULL; | 1351 | struct page *src_page = NULL; |
1345 | unsigned long off; | 1352 | unsigned long off; |
1346 | 1353 | ||
1347 | off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; | 1354 | off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; |
1348 | 1355 | ||
1349 | if (!segment_eq(get_fs(), KERNEL_DS)) { | 1356 | if (!segment_eq(get_fs(), KERNEL_DS)) { |
1350 | /* | 1357 | /* |
1351 | * Pull in the user page. We want to do this outside | 1358 | * Pull in the user page. We want to do this outside |
1352 | * of the meta data locks in order to preserve locking | 1359 | * of the meta data locks in order to preserve locking |
1353 | * order in case of page fault. | 1360 | * order in case of page fault. |
1354 | */ | 1361 | */ |
1355 | ret = get_user_pages(current, current->mm, | 1362 | ret = get_user_pages(current, current->mm, |
1356 | (unsigned long)buf & PAGE_CACHE_MASK, 1, | 1363 | (unsigned long)buf & PAGE_CACHE_MASK, 1, |
1357 | 0, 0, &src_page, NULL); | 1364 | 0, 0, &src_page, NULL); |
1358 | if (ret == 1) | 1365 | if (ret == 1) |
1359 | *ret_src_buf = kmap(src_page) + off; | 1366 | *ret_src_buf = kmap(src_page) + off; |
1360 | else | 1367 | else |
1361 | src_page = ERR_PTR(-EFAULT); | 1368 | src_page = ERR_PTR(-EFAULT); |
1362 | } else { | 1369 | } else { |
1363 | *ret_src_buf = buf; | 1370 | *ret_src_buf = buf; |
1364 | } | 1371 | } |
1365 | 1372 | ||
1366 | return src_page; | 1373 | return src_page; |
1367 | } | 1374 | } |
1368 | 1375 | ||
/*
 * Undo ocfs2_get_write_source(): kunmap and release a pinned user
 * page.  A NULL page (kernel-space source) is a no-op.
 */
static void ocfs2_put_write_source(struct page *page)
{
	if (!page)
		return;

	kunmap(page);
	page_cache_release(page);
}
1376 | 1383 | ||
1377 | static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | 1384 | static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, |
1378 | const struct iovec *iov, | 1385 | const struct iovec *iov, |
1379 | unsigned long nr_segs, | 1386 | unsigned long nr_segs, |
1380 | size_t count, | 1387 | size_t count, |
1381 | ssize_t o_direct_written) | 1388 | ssize_t o_direct_written) |
1382 | { | 1389 | { |
1383 | int ret = 0; | 1390 | int ret = 0; |
1384 | ssize_t copied, total = 0; | 1391 | ssize_t copied, total = 0; |
1385 | size_t iov_offset = 0, bytes; | 1392 | size_t iov_offset = 0, bytes; |
1386 | loff_t pos; | 1393 | loff_t pos; |
1387 | const struct iovec *cur_iov = iov; | 1394 | const struct iovec *cur_iov = iov; |
1388 | struct page *user_page, *page; | 1395 | struct page *user_page, *page; |
1389 | char *buf, *dst; | 1396 | char *buf, *dst; |
1390 | void *fsdata; | 1397 | void *fsdata; |
1391 | 1398 | ||
1392 | /* | 1399 | /* |
1393 | * handle partial DIO write. Adjust cur_iov if needed. | 1400 | * handle partial DIO write. Adjust cur_iov if needed. |
1394 | */ | 1401 | */ |
1395 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); | 1402 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); |
1396 | 1403 | ||
1397 | do { | 1404 | do { |
1398 | pos = *ppos; | 1405 | pos = *ppos; |
1399 | 1406 | ||
1400 | user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); | 1407 | user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); |
1401 | if (IS_ERR(user_page)) { | 1408 | if (IS_ERR(user_page)) { |
1402 | ret = PTR_ERR(user_page); | 1409 | ret = PTR_ERR(user_page); |
1403 | goto out; | 1410 | goto out; |
1404 | } | 1411 | } |
1405 | 1412 | ||
1406 | /* Stay within our page boundaries */ | 1413 | /* Stay within our page boundaries */ |
1407 | bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), | 1414 | bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), |
1408 | (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); | 1415 | (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); |
1409 | /* Stay within the vector boundary */ | 1416 | /* Stay within the vector boundary */ |
1410 | bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); | 1417 | bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); |
1411 | /* Stay within count */ | 1418 | /* Stay within count */ |
1412 | bytes = min(bytes, count); | 1419 | bytes = min(bytes, count); |
1413 | 1420 | ||
1414 | page = NULL; | 1421 | page = NULL; |
1415 | ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, | 1422 | ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, |
1416 | &page, &fsdata); | 1423 | &page, &fsdata); |
1417 | if (ret) { | 1424 | if (ret) { |
1418 | mlog_errno(ret); | 1425 | mlog_errno(ret); |
1419 | goto out; | 1426 | goto out; |
1420 | } | 1427 | } |
1421 | 1428 | ||
1422 | dst = kmap_atomic(page, KM_USER0); | 1429 | dst = kmap_atomic(page, KM_USER0); |
1423 | memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); | 1430 | memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); |
1424 | kunmap_atomic(dst, KM_USER0); | 1431 | kunmap_atomic(dst, KM_USER0); |
1425 | flush_dcache_page(page); | 1432 | flush_dcache_page(page); |
1426 | ocfs2_put_write_source(user_page); | 1433 | ocfs2_put_write_source(user_page); |
1427 | 1434 | ||
1428 | copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, | 1435 | copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, |
1429 | bytes, page, fsdata); | 1436 | bytes, page, fsdata); |
1430 | if (copied < 0) { | 1437 | if (copied < 0) { |
1431 | mlog_errno(copied); | 1438 | mlog_errno(copied); |
1432 | ret = copied; | 1439 | ret = copied; |
1433 | goto out; | 1440 | goto out; |
1434 | } | 1441 | } |
1435 | 1442 | ||
1436 | total += copied; | 1443 | total += copied; |
1437 | *ppos = pos + copied; | 1444 | *ppos = pos + copied; |
1438 | count -= copied; | 1445 | count -= copied; |
1439 | 1446 | ||
1440 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); | 1447 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); |
1441 | } while(count); | 1448 | } while(count); |
1442 | 1449 | ||
1443 | out: | 1450 | out: |
1444 | return total ? total : ret; | 1451 | return total ? total : ret; |
1445 | } | 1452 | } |
1446 | 1453 | ||
1447 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 1454 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
1448 | const struct iovec *iov, | 1455 | const struct iovec *iov, |
1449 | unsigned long nr_segs, | 1456 | unsigned long nr_segs, |
1450 | loff_t pos) | 1457 | loff_t pos) |
1451 | { | 1458 | { |
1452 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; | 1459 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; |
1453 | int can_do_direct, sync = 0; | 1460 | int can_do_direct, sync = 0; |
1454 | ssize_t written = 0; | 1461 | ssize_t written = 0; |
1455 | size_t ocount; /* original count */ | 1462 | size_t ocount; /* original count */ |
1456 | size_t count; /* after file limit checks */ | 1463 | size_t count; /* after file limit checks */ |
1457 | loff_t *ppos = &iocb->ki_pos; | 1464 | loff_t *ppos = &iocb->ki_pos; |
1458 | struct file *file = iocb->ki_filp; | 1465 | struct file *file = iocb->ki_filp; |
1459 | struct inode *inode = file->f_path.dentry->d_inode; | 1466 | struct inode *inode = file->f_path.dentry->d_inode; |
1460 | 1467 | ||
1461 | mlog_entry("(0x%p, %u, '%.*s')\n", file, | 1468 | mlog_entry("(0x%p, %u, '%.*s')\n", file, |
1462 | (unsigned int)nr_segs, | 1469 | (unsigned int)nr_segs, |
1463 | file->f_path.dentry->d_name.len, | 1470 | file->f_path.dentry->d_name.len, |
1464 | file->f_path.dentry->d_name.name); | 1471 | file->f_path.dentry->d_name.name); |
1465 | 1472 | ||
1466 | if (iocb->ki_left == 0) | 1473 | if (iocb->ki_left == 0) |
1467 | return 0; | 1474 | return 0; |
1468 | 1475 | ||
1469 | ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); | 1476 | ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); |
1470 | if (ret) | 1477 | if (ret) |
1471 | return ret; | 1478 | return ret; |
1472 | 1479 | ||
1473 | count = ocount; | 1480 | count = ocount; |
1474 | 1481 | ||
1475 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | 1482 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); |
1476 | 1483 | ||
1477 | appending = file->f_flags & O_APPEND ? 1 : 0; | 1484 | appending = file->f_flags & O_APPEND ? 1 : 0; |
1478 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; | 1485 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; |
1479 | 1486 | ||
1480 | mutex_lock(&inode->i_mutex); | 1487 | mutex_lock(&inode->i_mutex); |
1481 | 1488 | ||
1482 | relock: | 1489 | relock: |
1483 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 1490 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
1484 | if (direct_io) { | 1491 | if (direct_io) { |
1485 | down_read(&inode->i_alloc_sem); | 1492 | down_read(&inode->i_alloc_sem); |
1486 | have_alloc_sem = 1; | 1493 | have_alloc_sem = 1; |
1487 | } | 1494 | } |
1488 | 1495 | ||
1489 | /* concurrent O_DIRECT writes are allowed */ | 1496 | /* concurrent O_DIRECT writes are allowed */ |
1490 | rw_level = !direct_io; | 1497 | rw_level = !direct_io; |
1491 | ret = ocfs2_rw_lock(inode, rw_level); | 1498 | ret = ocfs2_rw_lock(inode, rw_level); |
1492 | if (ret < 0) { | 1499 | if (ret < 0) { |
1493 | mlog_errno(ret); | 1500 | mlog_errno(ret); |
1494 | goto out_sems; | 1501 | goto out_sems; |
1495 | } | 1502 | } |
1496 | 1503 | ||
1497 | can_do_direct = direct_io; | 1504 | can_do_direct = direct_io; |
1498 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, | 1505 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, |
1499 | iocb->ki_left, appending, | 1506 | iocb->ki_left, appending, |
1500 | &can_do_direct); | 1507 | &can_do_direct); |
1501 | if (ret < 0) { | 1508 | if (ret < 0) { |
1502 | mlog_errno(ret); | 1509 | mlog_errno(ret); |
1503 | goto out; | 1510 | goto out; |
1504 | } | 1511 | } |
1505 | 1512 | ||
1506 | /* | 1513 | /* |
1507 | * We can't complete the direct I/O as requested, fall back to | 1514 | * We can't complete the direct I/O as requested, fall back to |
1508 | * buffered I/O. | 1515 | * buffered I/O. |
1509 | */ | 1516 | */ |
1510 | if (direct_io && !can_do_direct) { | 1517 | if (direct_io && !can_do_direct) { |
1511 | ocfs2_rw_unlock(inode, rw_level); | 1518 | ocfs2_rw_unlock(inode, rw_level); |
1512 | up_read(&inode->i_alloc_sem); | 1519 | up_read(&inode->i_alloc_sem); |
1513 | 1520 | ||
1514 | have_alloc_sem = 0; | 1521 | have_alloc_sem = 0; |
1515 | rw_level = -1; | 1522 | rw_level = -1; |
1516 | 1523 | ||
1517 | direct_io = 0; | 1524 | direct_io = 0; |
1518 | sync = 1; | 1525 | sync = 1; |
1519 | goto relock; | 1526 | goto relock; |
1520 | } | 1527 | } |
1521 | 1528 | ||
1522 | if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) | 1529 | if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) |
1523 | sync = 1; | 1530 | sync = 1; |
1524 | 1531 | ||
1525 | /* | 1532 | /* |
1526 | * XXX: Is it ok to execute these checks a second time? | 1533 | * XXX: Is it ok to execute these checks a second time? |
1527 | */ | 1534 | */ |
1528 | ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); | 1535 | ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); |
1529 | if (ret) | 1536 | if (ret) |
1530 | goto out; | 1537 | goto out; |
1531 | 1538 | ||
1532 | /* | 1539 | /* |
1533 | * Set pos so that sync_page_range_nolock() below understands | 1540 | * Set pos so that sync_page_range_nolock() below understands |
1534 | * where to start from. We might've moved it around via the | 1541 | * where to start from. We might've moved it around via the |
1535 | * calls above. The range we want to actually sync starts from | 1542 | * calls above. The range we want to actually sync starts from |
1536 | * *ppos here. | 1543 | * *ppos here. |
1537 | * | 1544 | * |
1538 | */ | 1545 | */ |
1539 | pos = *ppos; | 1546 | pos = *ppos; |
1540 | 1547 | ||
1541 | /* communicate with ocfs2_dio_end_io */ | 1548 | /* communicate with ocfs2_dio_end_io */ |
1542 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | 1549 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
1543 | 1550 | ||
1544 | if (direct_io) { | 1551 | if (direct_io) { |
1545 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | 1552 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, |
1546 | ppos, count, ocount); | 1553 | ppos, count, ocount); |
1547 | if (written < 0) { | 1554 | if (written < 0) { |
1548 | ret = written; | 1555 | ret = written; |
1549 | goto out_dio; | 1556 | goto out_dio; |
1550 | } | 1557 | } |
1551 | } else { | 1558 | } else { |
1552 | written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, | 1559 | written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, |
1553 | count, written); | 1560 | count, written); |
1554 | if (written < 0) { | 1561 | if (written < 0) { |
1555 | ret = written; | 1562 | ret = written; |
1556 | if (ret != -EFAULT || ret != -ENOSPC) | 1563 | if (ret != -EFAULT || ret != -ENOSPC) |
1557 | mlog_errno(ret); | 1564 | mlog_errno(ret); |
1558 | goto out; | 1565 | goto out; |
1559 | } | 1566 | } |
1560 | } | 1567 | } |
1561 | 1568 | ||
1562 | out_dio: | 1569 | out_dio: |
1563 | /* buffered aio wouldn't have proper lock coverage today */ | 1570 | /* buffered aio wouldn't have proper lock coverage today */ |
1564 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); | 1571 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); |
1565 | 1572 | ||
1566 | /* | 1573 | /* |
1567 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 1574 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
1568 | * function pointer which is called when o_direct io completes so that | 1575 | * function pointer which is called when o_direct io completes so that |
1569 | * it can unlock our rw lock. (it's the clustered equivalent of | 1576 | * it can unlock our rw lock. (it's the clustered equivalent of |
1570 | * i_alloc_sem; protects truncate from racing with pending ios). | 1577 | * i_alloc_sem; protects truncate from racing with pending ios). |
1571 | * Unfortunately there are error cases which call end_io and others | 1578 | * Unfortunately there are error cases which call end_io and others |
1572 | * that don't. so we don't have to unlock the rw_lock if either an | 1579 | * that don't. so we don't have to unlock the rw_lock if either an |
1573 | * async dio is going to do it in the future or an end_io after an | 1580 | * async dio is going to do it in the future or an end_io after an |
1574 | * error has already done it. | 1581 | * error has already done it. |
1575 | */ | 1582 | */ |
1576 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 1583 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
1577 | rw_level = -1; | 1584 | rw_level = -1; |
1578 | have_alloc_sem = 0; | 1585 | have_alloc_sem = 0; |
1579 | } | 1586 | } |
1580 | 1587 | ||
1581 | out: | 1588 | out: |
1582 | if (rw_level != -1) | 1589 | if (rw_level != -1) |
1583 | ocfs2_rw_unlock(inode, rw_level); | 1590 | ocfs2_rw_unlock(inode, rw_level); |
1584 | 1591 | ||
1585 | out_sems: | 1592 | out_sems: |
1586 | if (have_alloc_sem) | 1593 | if (have_alloc_sem) |
1587 | up_read(&inode->i_alloc_sem); | 1594 | up_read(&inode->i_alloc_sem); |
1588 | 1595 | ||
1589 | if (written > 0 && sync) { | 1596 | if (written > 0 && sync) { |
1590 | ssize_t err; | 1597 | ssize_t err; |
1591 | 1598 | ||
1592 | err = sync_page_range_nolock(inode, file->f_mapping, pos, count); | 1599 | err = sync_page_range_nolock(inode, file->f_mapping, pos, count); |
1593 | if (err < 0) | 1600 | if (err < 0) |
1594 | written = err; | 1601 | written = err; |
1595 | } | 1602 | } |
1596 | 1603 | ||
1597 | mutex_unlock(&inode->i_mutex); | 1604 | mutex_unlock(&inode->i_mutex); |
1598 | 1605 | ||
1599 | mlog_exit(ret); | 1606 | mlog_exit(ret); |
1600 | return written ? written : ret; | 1607 | return written ? written : ret; |
1601 | } | 1608 | } |
1602 | 1609 | ||
1603 | static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, | 1610 | static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, |
1604 | struct pipe_buffer *buf, | 1611 | struct pipe_buffer *buf, |
1605 | struct splice_desc *sd) | 1612 | struct splice_desc *sd) |
1606 | { | 1613 | { |
1607 | int ret, count; | 1614 | int ret, count; |
1608 | ssize_t copied = 0; | 1615 | ssize_t copied = 0; |
1609 | struct file *file = sd->u.file; | 1616 | struct file *file = sd->u.file; |
1610 | unsigned int offset; | 1617 | unsigned int offset; |
1611 | struct page *page = NULL; | 1618 | struct page *page = NULL; |
1612 | void *fsdata; | 1619 | void *fsdata; |
1613 | char *src, *dst; | 1620 | char *src, *dst; |
1614 | 1621 | ||
1615 | ret = buf->ops->confirm(pipe, buf); | 1622 | ret = buf->ops->confirm(pipe, buf); |
1616 | if (ret) | 1623 | if (ret) |
1617 | goto out; | 1624 | goto out; |
1618 | 1625 | ||
1619 | offset = sd->pos & ~PAGE_CACHE_MASK; | 1626 | offset = sd->pos & ~PAGE_CACHE_MASK; |
1620 | count = sd->len; | 1627 | count = sd->len; |
1621 | if (count + offset > PAGE_CACHE_SIZE) | 1628 | if (count + offset > PAGE_CACHE_SIZE) |
1622 | count = PAGE_CACHE_SIZE - offset; | 1629 | count = PAGE_CACHE_SIZE - offset; |
1623 | 1630 | ||
1624 | ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, | 1631 | ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, |
1625 | &page, &fsdata); | 1632 | &page, &fsdata); |
1626 | if (ret) { | 1633 | if (ret) { |
1627 | mlog_errno(ret); | 1634 | mlog_errno(ret); |
1628 | goto out; | 1635 | goto out; |
1629 | } | 1636 | } |
1630 | 1637 | ||
1631 | src = buf->ops->map(pipe, buf, 1); | 1638 | src = buf->ops->map(pipe, buf, 1); |
1632 | dst = kmap_atomic(page, KM_USER1); | 1639 | dst = kmap_atomic(page, KM_USER1); |
1633 | memcpy(dst + offset, src + buf->offset, count); | 1640 | memcpy(dst + offset, src + buf->offset, count); |
1634 | kunmap_atomic(page, KM_USER1); | 1641 | kunmap_atomic(page, KM_USER1); |
1635 | buf->ops->unmap(pipe, buf, src); | 1642 | buf->ops->unmap(pipe, buf, src); |
1636 | 1643 | ||
1637 | copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, | 1644 | copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, |
1638 | page, fsdata); | 1645 | page, fsdata); |
1639 | if (copied < 0) { | 1646 | if (copied < 0) { |
1640 | mlog_errno(copied); | 1647 | mlog_errno(copied); |
1641 | ret = copied; | 1648 | ret = copied; |
1642 | goto out; | 1649 | goto out; |
1643 | } | 1650 | } |
1644 | out: | 1651 | out: |
1645 | 1652 | ||
1646 | return copied ? copied : ret; | 1653 | return copied ? copied : ret; |
1647 | } | 1654 | } |
1648 | 1655 | ||
1649 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, | 1656 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, |
1650 | struct file *out, | 1657 | struct file *out, |
1651 | loff_t *ppos, | 1658 | loff_t *ppos, |
1652 | size_t len, | 1659 | size_t len, |
1653 | unsigned int flags) | 1660 | unsigned int flags) |
1654 | { | 1661 | { |
1655 | int ret, err; | 1662 | int ret, err; |
1656 | struct address_space *mapping = out->f_mapping; | 1663 | struct address_space *mapping = out->f_mapping; |
1657 | struct inode *inode = mapping->host; | 1664 | struct inode *inode = mapping->host; |
1658 | struct splice_desc sd = { | 1665 | struct splice_desc sd = { |
1659 | .total_len = len, | 1666 | .total_len = len, |
1660 | .flags = flags, | 1667 | .flags = flags, |
1661 | .pos = *ppos, | 1668 | .pos = *ppos, |
1662 | .u.file = out, | 1669 | .u.file = out, |
1663 | }; | 1670 | }; |
1664 | 1671 | ||
1665 | ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); | 1672 | ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); |
1666 | if (ret > 0) { | 1673 | if (ret > 0) { |
1667 | *ppos += ret; | 1674 | *ppos += ret; |
1668 | 1675 | ||
1669 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | 1676 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { |
1670 | err = generic_osync_inode(inode, mapping, | 1677 | err = generic_osync_inode(inode, mapping, |
1671 | OSYNC_METADATA|OSYNC_DATA); | 1678 | OSYNC_METADATA|OSYNC_DATA); |
1672 | if (err) | 1679 | if (err) |
1673 | ret = err; | 1680 | ret = err; |
1674 | } | 1681 | } |
1675 | } | 1682 | } |
1676 | 1683 | ||
1677 | return ret; | 1684 | return ret; |
1678 | } | 1685 | } |
1679 | 1686 | ||
1680 | static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | 1687 | static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, |
1681 | struct file *out, | 1688 | struct file *out, |
1682 | loff_t *ppos, | 1689 | loff_t *ppos, |
1683 | size_t len, | 1690 | size_t len, |
1684 | unsigned int flags) | 1691 | unsigned int flags) |
1685 | { | 1692 | { |
1686 | int ret; | 1693 | int ret; |
1687 | struct inode *inode = out->f_path.dentry->d_inode; | 1694 | struct inode *inode = out->f_path.dentry->d_inode; |
1688 | 1695 | ||
1689 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, | 1696 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, |
1690 | (unsigned int)len, | 1697 | (unsigned int)len, |
1691 | out->f_path.dentry->d_name.len, | 1698 | out->f_path.dentry->d_name.len, |
1692 | out->f_path.dentry->d_name.name); | 1699 | out->f_path.dentry->d_name.name); |
1693 | 1700 | ||
1694 | inode_double_lock(inode, pipe->inode); | 1701 | inode_double_lock(inode, pipe->inode); |
1695 | 1702 | ||
1696 | ret = ocfs2_rw_lock(inode, 1); | 1703 | ret = ocfs2_rw_lock(inode, 1); |
1697 | if (ret < 0) { | 1704 | if (ret < 0) { |
1698 | mlog_errno(ret); | 1705 | mlog_errno(ret); |
1699 | goto out; | 1706 | goto out; |
1700 | } | 1707 | } |
1701 | 1708 | ||
1702 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, | 1709 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, |
1703 | NULL); | 1710 | NULL); |
1704 | if (ret < 0) { | 1711 | if (ret < 0) { |
1705 | mlog_errno(ret); | 1712 | mlog_errno(ret); |
1706 | goto out_unlock; | 1713 | goto out_unlock; |
1707 | } | 1714 | } |
1708 | 1715 | ||
1709 | /* ok, we're done with i_size and alloc work */ | 1716 | /* ok, we're done with i_size and alloc work */ |
1710 | ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); | 1717 | ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); |
1711 | 1718 | ||
1712 | out_unlock: | 1719 | out_unlock: |
1713 | ocfs2_rw_unlock(inode, 1); | 1720 | ocfs2_rw_unlock(inode, 1); |
1714 | out: | 1721 | out: |
1715 | inode_double_unlock(inode, pipe->inode); | 1722 | inode_double_unlock(inode, pipe->inode); |
1716 | 1723 | ||
1717 | mlog_exit(ret); | 1724 | mlog_exit(ret); |
1718 | return ret; | 1725 | return ret; |
1719 | } | 1726 | } |
1720 | 1727 | ||
1721 | static ssize_t ocfs2_file_splice_read(struct file *in, | 1728 | static ssize_t ocfs2_file_splice_read(struct file *in, |
1722 | loff_t *ppos, | 1729 | loff_t *ppos, |
1723 | struct pipe_inode_info *pipe, | 1730 | struct pipe_inode_info *pipe, |
1724 | size_t len, | 1731 | size_t len, |
1725 | unsigned int flags) | 1732 | unsigned int flags) |
1726 | { | 1733 | { |
1727 | int ret = 0; | 1734 | int ret = 0; |
1728 | struct inode *inode = in->f_path.dentry->d_inode; | 1735 | struct inode *inode = in->f_path.dentry->d_inode; |
1729 | 1736 | ||
1730 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, | 1737 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, |
1731 | (unsigned int)len, | 1738 | (unsigned int)len, |
1732 | in->f_path.dentry->d_name.len, | 1739 | in->f_path.dentry->d_name.len, |
1733 | in->f_path.dentry->d_name.name); | 1740 | in->f_path.dentry->d_name.name); |
1734 | 1741 | ||
1735 | /* | 1742 | /* |
1736 | * See the comment in ocfs2_file_aio_read() | 1743 | * See the comment in ocfs2_file_aio_read() |
1737 | */ | 1744 | */ |
1738 | ret = ocfs2_meta_lock(inode, NULL, 0); | 1745 | ret = ocfs2_meta_lock(inode, NULL, 0); |
1739 | if (ret < 0) { | 1746 | if (ret < 0) { |
1740 | mlog_errno(ret); | 1747 | mlog_errno(ret); |
1741 | goto bail; | 1748 | goto bail; |
1742 | } | 1749 | } |
1743 | ocfs2_meta_unlock(inode, 0); | 1750 | ocfs2_meta_unlock(inode, 0); |
1744 | 1751 | ||
1745 | ret = generic_file_splice_read(in, ppos, pipe, len, flags); | 1752 | ret = generic_file_splice_read(in, ppos, pipe, len, flags); |
1746 | 1753 | ||
1747 | bail: | 1754 | bail: |
1748 | mlog_exit(ret); | 1755 | mlog_exit(ret); |
1749 | return ret; | 1756 | return ret; |
1750 | } | 1757 | } |
1751 | 1758 | ||
1752 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | 1759 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, |
1753 | const struct iovec *iov, | 1760 | const struct iovec *iov, |
1754 | unsigned long nr_segs, | 1761 | unsigned long nr_segs, |
1755 | loff_t pos) | 1762 | loff_t pos) |
1756 | { | 1763 | { |
1757 | int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; | 1764 | int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; |
1758 | struct file *filp = iocb->ki_filp; | 1765 | struct file *filp = iocb->ki_filp; |
1759 | struct inode *inode = filp->f_path.dentry->d_inode; | 1766 | struct inode *inode = filp->f_path.dentry->d_inode; |
1760 | 1767 | ||
1761 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, | 1768 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, |
1762 | (unsigned int)nr_segs, | 1769 | (unsigned int)nr_segs, |
1763 | filp->f_path.dentry->d_name.len, | 1770 | filp->f_path.dentry->d_name.len, |
1764 | filp->f_path.dentry->d_name.name); | 1771 | filp->f_path.dentry->d_name.name); |
1765 | 1772 | ||
1766 | if (!inode) { | 1773 | if (!inode) { |
1767 | ret = -EINVAL; | 1774 | ret = -EINVAL; |
1768 | mlog_errno(ret); | 1775 | mlog_errno(ret); |
1769 | goto bail; | 1776 | goto bail; |
1770 | } | 1777 | } |
1771 | 1778 | ||
1772 | /* | 1779 | /* |
1773 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads | 1780 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads |
1774 | * need locks to protect pending reads from racing with truncate. | 1781 | * need locks to protect pending reads from racing with truncate. |
1775 | */ | 1782 | */ |
1776 | if (filp->f_flags & O_DIRECT) { | 1783 | if (filp->f_flags & O_DIRECT) { |
1777 | down_read(&inode->i_alloc_sem); | 1784 | down_read(&inode->i_alloc_sem); |
1778 | have_alloc_sem = 1; | 1785 | have_alloc_sem = 1; |
1779 | 1786 | ||
1780 | ret = ocfs2_rw_lock(inode, 0); | 1787 | ret = ocfs2_rw_lock(inode, 0); |
1781 | if (ret < 0) { | 1788 | if (ret < 0) { |
1782 | mlog_errno(ret); | 1789 | mlog_errno(ret); |
1783 | goto bail; | 1790 | goto bail; |
1784 | } | 1791 | } |
1785 | rw_level = 0; | 1792 | rw_level = 0; |
1786 | /* communicate with ocfs2_dio_end_io */ | 1793 | /* communicate with ocfs2_dio_end_io */ |
1787 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | 1794 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
1788 | } | 1795 | } |
1789 | 1796 | ||
1790 | /* | 1797 | /* |
1791 | * We're fine letting folks race truncates and extending | 1798 | * We're fine letting folks race truncates and extending |
1792 | * writes with read across the cluster, just like they can | 1799 | * writes with read across the cluster, just like they can |
1793 | * locally. Hence no rw_lock during read. | 1800 | * locally. Hence no rw_lock during read. |
1794 | * | 1801 | * |
1795 | * Take and drop the meta data lock to update inode fields | 1802 | * Take and drop the meta data lock to update inode fields |
1796 | * like i_size. This allows the checks down below | 1803 | * like i_size. This allows the checks down below |
1797 | * generic_file_aio_read() a chance of actually working. | 1804 | * generic_file_aio_read() a chance of actually working. |
1798 | */ | 1805 | */ |
1799 | ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); | 1806 | ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); |
1800 | if (ret < 0) { | 1807 | if (ret < 0) { |
1801 | mlog_errno(ret); | 1808 | mlog_errno(ret); |
1802 | goto bail; | 1809 | goto bail; |
1803 | } | 1810 | } |
1804 | ocfs2_meta_unlock(inode, lock_level); | 1811 | ocfs2_meta_unlock(inode, lock_level); |
1805 | 1812 | ||
1806 | ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); | 1813 | ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); |
1807 | if (ret == -EINVAL) | 1814 | if (ret == -EINVAL) |
1808 | mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); | 1815 | mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); |
1809 | 1816 | ||
1810 | /* buffered aio wouldn't have proper lock coverage today */ | 1817 | /* buffered aio wouldn't have proper lock coverage today */ |
1811 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1818 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); |
1812 | 1819 | ||
1813 | /* see ocfs2_file_aio_write */ | 1820 | /* see ocfs2_file_aio_write */ |
1814 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 1821 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
1815 | rw_level = -1; | 1822 | rw_level = -1; |
1816 | have_alloc_sem = 0; | 1823 | have_alloc_sem = 0; |
1817 | } | 1824 | } |
1818 | 1825 | ||
1819 | bail: | 1826 | bail: |
1820 | if (have_alloc_sem) | 1827 | if (have_alloc_sem) |
1821 | up_read(&inode->i_alloc_sem); | 1828 | up_read(&inode->i_alloc_sem); |
1822 | if (rw_level != -1) | 1829 | if (rw_level != -1) |
1823 | ocfs2_rw_unlock(inode, rw_level); | 1830 | ocfs2_rw_unlock(inode, rw_level); |
1824 | mlog_exit(ret); | 1831 | mlog_exit(ret); |
1825 | 1832 | ||
1826 | return ret; | 1833 | return ret; |
1827 | } | 1834 | } |
1828 | 1835 | ||
1829 | const struct inode_operations ocfs2_file_iops = { | 1836 | const struct inode_operations ocfs2_file_iops = { |
1830 | .setattr = ocfs2_setattr, | 1837 | .setattr = ocfs2_setattr, |
1831 | .getattr = ocfs2_getattr, | 1838 | .getattr = ocfs2_getattr, |
1832 | .permission = ocfs2_permission, | 1839 | .permission = ocfs2_permission, |
1833 | }; | 1840 | }; |
1834 | 1841 | ||
1835 | const struct inode_operations ocfs2_special_file_iops = { | 1842 | const struct inode_operations ocfs2_special_file_iops = { |
1836 | .setattr = ocfs2_setattr, | 1843 | .setattr = ocfs2_setattr, |
1837 | .getattr = ocfs2_getattr, | 1844 | .getattr = ocfs2_getattr, |
1838 | .permission = ocfs2_permission, | 1845 | .permission = ocfs2_permission, |
1839 | }; | 1846 | }; |
1840 | 1847 | ||
1841 | const struct file_operations ocfs2_fops = { | 1848 | const struct file_operations ocfs2_fops = { |
1842 | .read = do_sync_read, | 1849 | .read = do_sync_read, |
1843 | .write = do_sync_write, | 1850 | .write = do_sync_write, |
1844 | .mmap = ocfs2_mmap, | 1851 | .mmap = ocfs2_mmap, |
1845 | .fsync = ocfs2_sync_file, | 1852 | .fsync = ocfs2_sync_file, |
1846 | .release = ocfs2_file_release, | 1853 | .release = ocfs2_file_release, |
1847 | .open = ocfs2_file_open, | 1854 | .open = ocfs2_file_open, |
1848 | .aio_read = ocfs2_file_aio_read, | 1855 | .aio_read = ocfs2_file_aio_read, |
1849 | .aio_write = ocfs2_file_aio_write, | 1856 | .aio_write = ocfs2_file_aio_write, |
1850 | .ioctl = ocfs2_ioctl, | 1857 | .ioctl = ocfs2_ioctl, |
1851 | #ifdef CONFIG_COMPAT | 1858 | #ifdef CONFIG_COMPAT |
1852 | .compat_ioctl = ocfs2_compat_ioctl, | 1859 | .compat_ioctl = ocfs2_compat_ioctl, |
1853 | #endif | 1860 | #endif |
1854 | .splice_read = ocfs2_file_splice_read, | 1861 | .splice_read = ocfs2_file_splice_read, |
1855 | .splice_write = ocfs2_file_splice_write, | 1862 | .splice_write = ocfs2_file_splice_write, |
1856 | }; | 1863 | }; |
1857 | 1864 | ||
1858 | const struct file_operations ocfs2_dops = { | 1865 | const struct file_operations ocfs2_dops = { |
1859 | .read = generic_read_dir, | 1866 | .read = generic_read_dir, |
1860 | .readdir = ocfs2_readdir, | 1867 | .readdir = ocfs2_readdir, |
1861 | .fsync = ocfs2_sync_file, | 1868 | .fsync = ocfs2_sync_file, |
1862 | .ioctl = ocfs2_ioctl, | 1869 | .ioctl = ocfs2_ioctl, |
1863 | #ifdef CONFIG_COMPAT | 1870 | #ifdef CONFIG_COMPAT |
1864 | .compat_ioctl = ocfs2_compat_ioctl, | 1871 | .compat_ioctl = ocfs2_compat_ioctl, |
1865 | #endif | 1872 | #endif |
1866 | }; | 1873 | }; |
1867 | 1874 |
fs/ocfs2/mmap.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * mmap.c | 4 | * mmap.c |
5 | * | 5 | * |
6 | * Code to deal with the mess that is clustered mmap. | 6 | * Code to deal with the mess that is clustered mmap. |
7 | * | 7 | * |
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
12 | * License as published by the Free Software Foundation; either | 12 | * License as published by the Free Software Foundation; either |
13 | * version 2 of the License, or (at your option) any later version. | 13 | * version 2 of the License, or (at your option) any later version. |
14 | * | 14 | * |
15 | * This program is distributed in the hope that it will be useful, | 15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * General Public License for more details. | 18 | * General Public License for more details. |
19 | * | 19 | * |
20 | * You should have received a copy of the GNU General Public | 20 | * You should have received a copy of the GNU General Public |
21 | * License along with this program; if not, write to the | 21 | * License along with this program; if not, write to the |
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
23 | * Boston, MA 021110-1307, USA. | 23 | * Boston, MA 021110-1307, USA. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/fs.h> | 26 | #include <linux/fs.h> |
27 | #include <linux/types.h> | 27 | #include <linux/types.h> |
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/highmem.h> | 29 | #include <linux/highmem.h> |
30 | #include <linux/pagemap.h> | 30 | #include <linux/pagemap.h> |
31 | #include <linux/uio.h> | 31 | #include <linux/uio.h> |
32 | #include <linux/signal.h> | 32 | #include <linux/signal.h> |
33 | #include <linux/rbtree.h> | 33 | #include <linux/rbtree.h> |
34 | 34 | ||
35 | #define MLOG_MASK_PREFIX ML_FILE_IO | 35 | #define MLOG_MASK_PREFIX ML_FILE_IO |
36 | #include <cluster/masklog.h> | 36 | #include <cluster/masklog.h> |
37 | 37 | ||
38 | #include "ocfs2.h" | 38 | #include "ocfs2.h" |
39 | 39 | ||
40 | #include "aops.h" | ||
40 | #include "dlmglue.h" | 41 | #include "dlmglue.h" |
41 | #include "file.h" | 42 | #include "file.h" |
42 | #include "inode.h" | 43 | #include "inode.h" |
43 | #include "mmap.h" | 44 | #include "mmap.h" |
44 | 45 | ||
46 | static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) | ||
47 | { | ||
48 | /* The best way to deal with signals in the vm path is | ||
49 | * to block them upfront, rather than allowing the | ||
50 | * locking paths to return -ERESTARTSYS. */ | ||
51 | sigfillset(blocked); | ||
52 | |||
53 | /* We should technically never get a bad return value | ||
54 | * from sigprocmask */ | ||
55 | return sigprocmask(SIG_BLOCK, blocked, oldset); | ||
56 | } | ||
57 | |||
58 | static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) | ||
59 | { | ||
60 | return sigprocmask(SIG_SETMASK, oldset, NULL); | ||
61 | } | ||
62 | |||
45 | static struct page *ocfs2_nopage(struct vm_area_struct * area, | 63 | static struct page *ocfs2_nopage(struct vm_area_struct * area, |
46 | unsigned long address, | 64 | unsigned long address, |
47 | int *type) | 65 | int *type) |
48 | { | 66 | { |
49 | struct page *page = NOPAGE_SIGBUS; | 67 | struct page *page = NOPAGE_SIGBUS; |
50 | sigset_t blocked, oldset; | 68 | sigset_t blocked, oldset; |
51 | int ret; | 69 | int ret; |
52 | 70 | ||
53 | mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, | 71 | mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, |
54 | type); | 72 | type); |
55 | 73 | ||
56 | /* The best way to deal with signals in this path is | 74 | ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); |
57 | * to block them upfront, rather than allowing the | ||
58 | * locking paths to return -ERESTARTSYS. */ | ||
59 | sigfillset(&blocked); | ||
60 | |||
61 | /* We should technically never get a bad ret return | ||
62 | * from sigprocmask */ | ||
63 | ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); | ||
64 | if (ret < 0) { | 75 | if (ret < 0) { |
65 | mlog_errno(ret); | 76 | mlog_errno(ret); |
66 | goto out; | 77 | goto out; |
67 | } | 78 | } |
68 | 79 | ||
69 | page = filemap_nopage(area, address, type); | 80 | page = filemap_nopage(area, address, type); |
70 | 81 | ||
71 | ret = sigprocmask(SIG_SETMASK, &oldset, NULL); | 82 | ret = ocfs2_vm_op_unblock_sigs(&oldset); |
72 | if (ret < 0) | 83 | if (ret < 0) |
73 | mlog_errno(ret); | 84 | mlog_errno(ret); |
74 | out: | 85 | out: |
75 | mlog_exit_ptr(page); | 86 | mlog_exit_ptr(page); |
76 | return page; | 87 | return page; |
77 | } | 88 | } |
78 | 89 | ||
79 | static struct vm_operations_struct ocfs2_file_vm_ops = { | 90 | static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, |
80 | .nopage = ocfs2_nopage, | 91 | struct page *page) |
81 | }; | 92 | { |
93 | int ret; | ||
94 | struct address_space *mapping = inode->i_mapping; | ||
95 | loff_t pos = page->index << PAGE_CACHE_SHIFT; | ||
96 | unsigned int len = PAGE_CACHE_SIZE; | ||
97 | pgoff_t last_index; | ||
98 | struct page *locked_page = NULL; | ||
99 | void *fsdata; | ||
100 | loff_t size = i_size_read(inode); | ||
82 | 101 | ||
83 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | 102 | /* |
103 | * Another node might have truncated while we were waiting on | ||
104 | * cluster locks. | ||
105 | */ | ||
106 | last_index = size >> PAGE_CACHE_SHIFT; | ||
107 | if (page->index > last_index) { | ||
108 | ret = -EINVAL; | ||
109 | goto out; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * The i_size check above doesn't catch the case where nodes | ||
114 | * truncated and then re-extended the file. We'll re-check the | ||
115 | * page mapping after taking the page lock inside of | ||
116 | * ocfs2_write_begin_nolock(). | ||
117 | */ | ||
118 | if (!PageUptodate(page) || page->mapping != inode->i_mapping) { | ||
119 | ret = -EINVAL; | ||
120 | goto out; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Call ocfs2_write_begin() and ocfs2_write_end() to take | ||
125 | * advantage of the allocation code there. We pass a write | ||
126 | * length of the whole page (chopped to i_size) to make sure | ||
127 | * the whole thing is allocated. | ||
128 | * | ||
129 | * Since we know the page is up to date, we don't have to | ||
130 | * worry about ocfs2_write_begin() skipping some buffer reads | ||
131 | * because the "write" would invalidate their data. | ||
132 | */ | ||
133 | if (page->index == last_index) | ||
134 | len = size & ~PAGE_CACHE_MASK; | ||
135 | |||
136 | ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, | ||
137 | &fsdata, di_bh, page); | ||
138 | if (ret) { | ||
139 | if (ret != -ENOSPC) | ||
140 | mlog_errno(ret); | ||
141 | goto out; | ||
142 | } | ||
143 | |||
144 | ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, | ||
145 | fsdata); | ||
146 | if (ret < 0) { | ||
147 | mlog_errno(ret); | ||
148 | goto out; | ||
149 | } | ||
150 | BUG_ON(ret != len); | ||
151 | ret = 0; | ||
152 | out: | ||
153 | return ret; | ||
154 | } | ||
155 | |||
156 | static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | ||
84 | { | 157 | { |
85 | int ret = 0, lock_level = 0; | 158 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
86 | struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); | 159 | struct buffer_head *di_bh = NULL; |
160 | sigset_t blocked, oldset; | ||
161 | int ret, ret2; | ||
87 | 162 | ||
163 | ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); | ||
164 | if (ret < 0) { | ||
165 | mlog_errno(ret); | ||
166 | return ret; | ||
167 | } | ||
168 | |||
88 | /* | 169 | /* |
89 | * Only support shared writeable mmap for local mounts which | 170 | * The cluster locks taken will block a truncate from another |
90 | * don't know about holes. | 171 | * node. Taking the data lock will also ensure that we don't |
172 | * attempt page truncation as part of a downconvert. | ||
91 | */ | 173 | */ |
92 | if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && | 174 | ret = ocfs2_meta_lock(inode, &di_bh, 1); |
93 | ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && | 175 | if (ret < 0) { |
94 | ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { | 176 | mlog_errno(ret); |
95 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); | 177 | goto out; |
96 | /* This is -EINVAL because generic_file_readonly_mmap | ||
97 | * returns it in a similar situation. */ | ||
98 | return -EINVAL; | ||
99 | } | 178 | } |
179 | |||
180 | /* | ||
181 | * The alloc sem should be enough to serialize with | ||
182 | * ocfs2_truncate_file() changing i_size as well as any thread | ||
183 | * modifying the inode btree. | ||
184 | */ | ||
185 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
186 | |||
187 | ret = ocfs2_data_lock(inode, 1); | ||
188 | if (ret < 0) { | ||
189 | mlog_errno(ret); | ||
190 | goto out_meta_unlock; | ||
191 | } | ||
192 | |||
193 | ret = __ocfs2_page_mkwrite(inode, di_bh, page); | ||
194 | |||
195 | ocfs2_data_unlock(inode, 1); | ||
196 | |||
197 | out_meta_unlock: | ||
198 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
199 | |||
200 | brelse(di_bh); | ||
201 | ocfs2_meta_unlock(inode, 1); | ||
202 | |||
203 | out: | ||
204 | ret2 = ocfs2_vm_op_unblock_sigs(&oldset); | ||
205 | if (ret2 < 0) | ||
206 | mlog_errno(ret2); | ||
207 | |||
208 | return ret; | ||
209 | } | ||
210 | |||
211 | static struct vm_operations_struct ocfs2_file_vm_ops = { | ||
212 | .nopage = ocfs2_nopage, | ||
213 | .page_mkwrite = ocfs2_page_mkwrite, | ||
214 | }; | ||
215 | |||
216 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | ||
217 | { | ||
218 | int ret = 0, lock_level = 0; | ||
100 | 219 | ||
101 | ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, | 220 | ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, |
102 | file->f_vfsmnt, &lock_level); | 221 | file->f_vfsmnt, &lock_level); |
103 | if (ret < 0) { | 222 | if (ret < 0) { |