Commit 2ac232f37fa0e8551856a575fe299c47b65b4d66

Authored by Linus Torvalds

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6:
  jbd: change the field "b_cow_tid" of struct journal_head from type unsigned to tid_t
  ext3.txt: update the links in the section "useful links" to the latest ones
  ext3: Fix data corruption in inodes with journalled data
  ext2: check xattr name_len before acquiring xattr_sem in ext2_xattr_get
  ext3: Fix compilation with -DDX_DEBUG
  quota: Remove unused declaration
  jbd: Use WRITE_SYNC in journal checkpoint.
  jbd: Fix oops in journal_remove_journal_head()
  ext3: Return -EINVAL when start is beyond the end of fs in ext3_trim_fs()
  ext3/ioctl.c: silence sparse warnings about different address spaces
  ext3/ext4 Documentation: remove bh/nobh since it has been deprecated
  ext3: Improve truncate error handling
  ext3: use proper little-endian bitops
  ext2: include fs.h into ext2_fs.h
  ext3: Fix oops in ext3_try_to_allocate_with_rsv()
  jbd: fix a bug of leaking jh->b_jcount
  jbd: remove dependency on __GFP_NOFAIL
  ext3: Convert ext3 to new truncate calling convention
  jbd: Add fixed tracepoints
  ext3: Add fixed tracepoints

Resolve conflicts in fs/ext3/fsync.c due to fsync locking push-down and
new fixed tracepoints.

Showing 23 changed files

Documentation/filesystems/ext3.txt
... ... @@ -147,15 +147,6 @@
147 147 package for more details
148 148 (http://sourceforge.net/projects/linuxquota).
149 149  
150   -bh (*) ext3 associates buffer heads to data pages to
151   -nobh (a) cache disk block mapping information
152   - (b) link pages into transaction to provide
153   - ordering guarantees.
154   - "bh" option forces use of buffer heads.
155   - "nobh" option tries to avoid associating buffer
156   - heads (supported only for "writeback" mode).
157   -
158   -
159 150 Specification
160 151 =============
161 152 Ext3 shares all disk implementation with the ext2 filesystem, and adds
... ... @@ -227,6 +218,6 @@
227 218 programs: http://e2fsprogs.sourceforge.net/
228 219 http://ext2resize.sourceforge.net
229 220  
230   -useful links: http://www.ibm.com/developerworks/library/l-fs7.html
231   - http://www.ibm.com/developerworks/library/l-fs8.html
  221 +useful links: http://www.ibm.com/developerworks/library/l-fs7/index.html
  222 + http://www.ibm.com/developerworks/library/l-fs8/index.html
Documentation/filesystems/ext4.txt
... ... @@ -68,12 +68,12 @@
68 68 '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems
69 69 for a fair comparison. When tuning ext3 for best benchmark numbers,
70 70 it is often worthwhile to try changing the data journaling mode; '-o
71   - data=writeback,nobh' can be faster for some workloads. (Note
72   - however that running mounted with data=writeback can potentially
73   - leave stale data exposed in recently written files in case of an
74   - unclean shutdown, which could be a security exposure in some
75   - situations.) Configuring the filesystem with a large journal can
76   - also be helpful for metadata-intensive workloads.
  71 + data=writeback' can be faster for some workloads. (Note however that
  72 + running mounted with data=writeback can potentially leave stale data
  73 + exposed in recently written files in case of an unclean shutdown,
  74 + which could be a security exposure in some situations.) Configuring
  75 + the filesystem with a large journal can also be helpful for
  76 + metadata-intensive workloads.
77 77  
78 78 2. Features
79 79 ===========
... ... @@ -272,14 +272,6 @@
272 272 package for more details
273 273 (http://sourceforge.net/projects/linuxquota).
274 274  
275   -bh (*) ext4 associates buffer heads to data pages to
276   -nobh (a) cache disk block mapping information
277   - (b) link pages into transaction to provide
278   - ordering guarantees.
279   - "bh" option forces use of buffer heads.
280   - "nobh" option tries to avoid associating buffer
281   - heads (supported only for "writeback" mode).
282   -
283 275 stripe=n Number of filesystem blocks that mballoc will try
284 276 to use for allocation size and alignment. For RAID5/6
285 277 systems this should be the number of data
... ... @@ -393,8 +385,7 @@
393 385 write and convert the extent to initialized after IO
394 386 completes. This approach allows ext4 code to avoid
395 387 using inode mutex, which improves scalability on high
396   - speed storages. However this does not work with nobh
397   - option and the mount will fail. Nor does it work with
  388 + speed storages. However this does not work with
398 389 data journaling and dioread_nolock option will be
399 390 ignored with kernel warning. Note that dioread_nolock
400 391 code path is only used for extent-based files.
fs/ext2/xattr.c
... ... @@ -161,6 +161,10 @@
161 161  
162 162 if (name == NULL)
163 163 return -EINVAL;
  164 + name_len = strlen(name);
  165 + if (name_len > 255)
  166 + return -ERANGE;
  167 +
164 168 down_read(&EXT2_I(inode)->xattr_sem);
165 169 error = -ENODATA;
166 170 if (!EXT2_I(inode)->i_file_acl)
167 171  
... ... @@ -181,12 +185,8 @@
181 185 error = -EIO;
182 186 goto cleanup;
183 187 }
184   - /* find named attribute */
185   - name_len = strlen(name);
186 188  
187   - error = -ERANGE;
188   - if (name_len > 255)
189   - goto cleanup;
  189 + /* find named attribute */
190 190 entry = FIRST_ENTRY(bh);
191 191 while (!IS_LAST_ENTRY(entry)) {
192 192 struct ext2_xattr_entry *next =
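The reordering above is a validate-before-lock fix: an attribute name longer than 255 bytes (the on-disk limit of e_name_len) can never match, so it is now rejected with -ERANGE before xattr_sem is taken and before the EA block is read. A generic sketch of the pattern, with hypothetical names (not code from this commit):

	#include <linux/errno.h>
	#include <linux/rwsem.h>
	#include <linux/string.h>

	#define ATTR_NAME_MAX 255	/* mirrors ext2's 255-byte xattr name limit */

	struct attr_store {
		struct rw_semaphore sem;
		/* ... protected entry list ... */
	};

	static int attr_lookup(struct attr_store *store, const char *name)
	{
		/* Cheap, lock-free validation first: an over-long name can
		 * never match, so fail before taking the semaphore or
		 * doing any I/O. */
		if (strlen(name) > ATTR_NAME_MAX)
			return -ERANGE;

		down_read(&store->sem);
		/* ... walk the entries under the lock ... */
		up_read(&store->sem);
		return -ENODATA;	/* not found in this sketch */
	}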
fs/ext3/balloc.c
... ... @@ -21,6 +21,7 @@
21 21 #include <linux/quotaops.h>
22 22 #include <linux/buffer_head.h>
23 23 #include <linux/blkdev.h>
  24 +#include <trace/events/ext3.h>
24 25  
25 26 /*
26 27 * balloc.c contains the blocks allocation and deallocation routines
... ... @@ -161,6 +162,7 @@
161 162 desc = ext3_get_group_desc(sb, block_group, NULL);
162 163 if (!desc)
163 164 return NULL;
  165 + trace_ext3_read_block_bitmap(sb, block_group);
164 166 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
165 167 bh = sb_getblk(sb, bitmap_blk);
166 168 if (unlikely(!bh)) {
... ... @@ -351,6 +353,7 @@
351 353 struct rb_node * parent = NULL;
352 354 struct ext3_reserve_window_node *this;
353 355  
  356 + trace_ext3_rsv_window_add(sb, rsv);
354 357 while (*p)
355 358 {
356 359 parent = *p;
357 360  
... ... @@ -476,8 +479,10 @@
476 479 rsv = &block_i->rsv_window_node;
477 480 if (!rsv_is_empty(&rsv->rsv_window)) {
478 481 spin_lock(rsv_lock);
479   - if (!rsv_is_empty(&rsv->rsv_window))
  482 + if (!rsv_is_empty(&rsv->rsv_window)) {
  483 + trace_ext3_discard_reservation(inode, rsv);
480 484 rsv_window_remove(inode->i_sb, rsv);
  485 + }
481 486 spin_unlock(rsv_lock);
482 487 }
483 488 }
484 489  
... ... @@ -683,14 +688,10 @@
683 688 void ext3_free_blocks(handle_t *handle, struct inode *inode,
684 689 ext3_fsblk_t block, unsigned long count)
685 690 {
686   - struct super_block * sb;
  691 + struct super_block *sb = inode->i_sb;
687 692 unsigned long dquot_freed_blocks;
688 693  
689   - sb = inode->i_sb;
690   - if (!sb) {
691   - printk ("ext3_free_blocks: nonexistent device");
692   - return;
693   - }
  694 + trace_ext3_free_blocks(inode, block, count);
694 695 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
695 696 if (dquot_freed_blocks)
696 697 dquot_free_block(inode, dquot_freed_blocks);
... ... @@ -1136,6 +1137,7 @@
1136 1137 else
1137 1138 start_block = grp_goal + group_first_block;
1138 1139  
  1140 + trace_ext3_alloc_new_reservation(sb, start_block);
1139 1141 size = my_rsv->rsv_goal_size;
1140 1142  
1141 1143 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1142 1144  
... ... @@ -1230,8 +1232,11 @@
1230 1232 * check if the first free block is within the
1231 1233 * free space we just reserved
1232 1234 */
1233   - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
  1235 + if (start_block >= my_rsv->rsv_start &&
  1236 + start_block <= my_rsv->rsv_end) {
  1237 + trace_ext3_reserved(sb, start_block, my_rsv);
1234 1238 return 0; /* success */
  1239 + }
1235 1240 /*
1236 1241 * if the first free bit we found is out of the reservable space
1237 1242 * continue search for next reservable space,
... ... @@ -1514,10 +1519,6 @@
1514 1519  
1515 1520 *errp = -ENOSPC;
1516 1521 sb = inode->i_sb;
1517   - if (!sb) {
1518   - printk("ext3_new_block: nonexistent device");
1519   - return 0;
1520   - }
1521 1522  
1522 1523 /*
1523 1524 * Check quota for allocation of this block.
1524 1525  
... ... @@ -1528,8 +1529,10 @@
1528 1529 return 0;
1529 1530 }
1530 1531  
  1532 + trace_ext3_request_blocks(inode, goal, num);
  1533 +
1531 1534 sbi = EXT3_SB(sb);
1532   - es = EXT3_SB(sb)->s_es;
  1535 + es = sbi->s_es;
1533 1536 ext3_debug("goal=%lu.\n", goal);
1534 1537 /*
1535 1538 * Allocate a block from reservation only when
... ... @@ -1742,6 +1745,10 @@
1742 1745 brelse(bitmap_bh);
1743 1746 dquot_free_block(inode, *count-num);
1744 1747 *count = num;
  1748 +
  1749 + trace_ext3_allocate_blocks(inode, goal, num,
  1750 + (unsigned long long)ret_block);
  1751 +
1745 1752 return ret_block;
1746 1753  
1747 1754 io_error:
... ... @@ -1996,6 +2003,7 @@
1996 2003 if ((next - start) < minblocks)
1997 2004 goto free_extent;
1998 2005  
  2006 + trace_ext3_discard_blocks(sb, discard_block, next - start);
1999 2007 /* Send the TRIM command down to the device */
2000 2008 err = sb_issue_discard(sb, discard_block, next - start,
2001 2009 GFP_NOFS, 0);
... ... @@ -2100,7 +2108,7 @@
2100 2108 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2101 2109 return -EINVAL;
2102 2110 if (start >= max_blks)
2103   - goto out;
  2111 + return -EINVAL;
2104 2112 if (start + len > max_blks)
2105 2113 len = max_blks - start;
2106 2114  
... ... @@ -2148,8 +2156,6 @@
2148 2156  
2149 2157 if (ret >= 0)
2150 2158 ret = 0;
2151   -
2152   -out:
2153 2159 range->len = trimmed * sb->s_blocksize;
2154 2160  
2155 2161 return ret;
fs/ext3/file.c
... ... @@ -71,7 +71,6 @@
71 71 };
72 72  
73 73 const struct inode_operations ext3_file_inode_operations = {
74   - .truncate = ext3_truncate,
75 74 .setattr = ext3_setattr,
76 75 #ifdef CONFIG_EXT3_FS_XATTR
77 76 .setxattr = generic_setxattr,
fs/ext3/fsync.c
... ... @@ -30,6 +30,7 @@
30 30 #include <linux/jbd.h>
31 31 #include <linux/ext3_fs.h>
32 32 #include <linux/ext3_jbd.h>
  33 +#include <trace/events/ext3.h>
33 34  
34 35 /*
35 36 * akpm: A new design for ext3_sync_file().
36 37  
... ... @@ -51,12 +52,14 @@
51 52 int ret, needs_barrier = 0;
52 53 tid_t commit_tid;
53 54  
  55 + trace_ext3_sync_file_enter(file, datasync);
  56 +
54 57 if (inode->i_sb->s_flags & MS_RDONLY)
55 58 return 0;
56 59  
57 60 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
58 61 if (ret)
59   - return ret;
  62 + goto out;
60 63  
61 64 /*
62 65 * Taking the mutex here just to keep consistent with how fsync was
... ... @@ -83,7 +86,8 @@
83 86 */
84 87 if (ext3_should_journal_data(inode)) {
85 88 mutex_unlock(&inode->i_mutex);
86   - return ext3_force_commit(inode->i_sb);
  89 + ret = ext3_force_commit(inode->i_sb);
  90 + goto out;
87 91 }
88 92  
89 93 if (datasync)
90 94  
... ... @@ -104,7 +108,10 @@
104 108 */
105 109 if (needs_barrier)
106 110 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
  111 +
107 112 mutex_unlock(&inode->i_mutex);
  113 +out:
  114 + trace_ext3_sync_file_exit(inode, ret);
108 115 return ret;
109 116 }
fs/ext3/ialloc.c
... ... @@ -23,6 +23,7 @@
23 23 #include <linux/buffer_head.h>
24 24 #include <linux/random.h>
25 25 #include <linux/bitops.h>
  26 +#include <trace/events/ext3.h>
26 27  
27 28 #include <asm/byteorder.h>
28 29  
... ... @@ -118,6 +119,7 @@
118 119  
119 120 ino = inode->i_ino;
120 121 ext3_debug ("freeing inode %lu\n", ino);
  122 + trace_ext3_free_inode(inode);
121 123  
122 124 is_directory = S_ISDIR(inode->i_mode);
123 125  
... ... @@ -426,6 +428,7 @@
426 428 return ERR_PTR(-EPERM);
427 429  
428 430 sb = dir->i_sb;
  431 + trace_ext3_request_inode(dir, mode);
429 432 inode = new_inode(sb);
430 433 if (!inode)
431 434 return ERR_PTR(-ENOMEM);
... ... @@ -601,6 +604,7 @@
601 604 }
602 605  
603 606 ext3_debug("allocating inode %lu\n", inode->i_ino);
  607 + trace_ext3_allocate_inode(inode, dir, mode);
604 608 goto really_out;
605 609 fail:
606 610 ext3_std_error(sb, err);
fs/ext3/inode.c
... ... @@ -38,10 +38,12 @@
38 38 #include <linux/bio.h>
39 39 #include <linux/fiemap.h>
40 40 #include <linux/namei.h>
  41 +#include <trace/events/ext3.h>
41 42 #include "xattr.h"
42 43 #include "acl.h"
43 44  
44 45 static int ext3_writepage_trans_blocks(struct inode *inode);
  46 +static int ext3_block_truncate_page(struct inode *inode, loff_t from);
45 47  
46 48 /*
47 49 * Test whether an inode is a fast symlink.
... ... @@ -70,6 +72,7 @@
70 72  
71 73 might_sleep();
72 74  
  75 + trace_ext3_forget(inode, is_metadata, blocknr);
73 76 BUFFER_TRACE(bh, "enter");
74 77  
75 78 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
76 79  
77 80  
78 81  
... ... @@ -194,20 +197,47 @@
194 197 */
195 198 void ext3_evict_inode (struct inode *inode)
196 199 {
  200 + struct ext3_inode_info *ei = EXT3_I(inode);
197 201 struct ext3_block_alloc_info *rsv;
198 202 handle_t *handle;
199 203 int want_delete = 0;
200 204  
  205 + trace_ext3_evict_inode(inode);
201 206 if (!inode->i_nlink && !is_bad_inode(inode)) {
202 207 dquot_initialize(inode);
203 208 want_delete = 1;
204 209 }
205 210  
  211 + /*
  212 + * When journalling data, dirty buffers are tracked only in the journal.
  213 + * So although mm thinks everything is clean and ready for reaping, the
  214 + * inode might still have some pages to write in the running
  215 + * transaction or waiting to be checkpointed. Thus calling
  216 + * journal_invalidatepage() (via truncate_inode_pages()) to discard
  217 + * these buffers can cause data loss. Also, even if we did not discard
  218 + * these buffers, we would have no way to find them after the inode
  219 + * is reaped, and thus a user could see stale data when reading
  220 + * them before the transaction is checkpointed. So be careful and
  221 + * force everything to disk here... We use ei->i_datasync_tid to
  222 + * store the newest transaction containing inode's data.
  223 + *
  224 + * Note that directories do not have this problem because they don't
  225 + * use page cache.
  226 + */
  227 + if (inode->i_nlink && ext3_should_journal_data(inode) &&
  228 + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
  229 + tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
  230 + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
  231 +
  232 + log_start_commit(journal, commit_tid);
  233 + log_wait_commit(journal, commit_tid);
  234 + filemap_write_and_wait(&inode->i_data);
  235 + }
206 236 truncate_inode_pages(&inode->i_data, 0);
207 237  
208 238 ext3_discard_reservation(inode);
209   - rsv = EXT3_I(inode)->i_block_alloc_info;
210   - EXT3_I(inode)->i_block_alloc_info = NULL;
  239 + rsv = ei->i_block_alloc_info;
  240 + ei->i_block_alloc_info = NULL;
211 241 if (unlikely(rsv))
212 242 kfree(rsv);
213 243  
214 244  
... ... @@ -231,15 +261,13 @@
231 261 if (inode->i_blocks)
232 262 ext3_truncate(inode);
233 263 /*
234   - * Kill off the orphan record which ext3_truncate created.
235   - * AKPM: I think this can be inside the above `if'.
236   - * Note that ext3_orphan_del() has to be able to cope with the
237   - * deletion of a non-existent orphan - this is because we don't
238   - * know if ext3_truncate() actually created an orphan record.
239   - * (Well, we could do this if we need to, but heck - it works)
  264 + * Kill off the orphan record created when the inode lost the last
  265 + * link. Note that ext3_orphan_del() has to be able to cope with the
  266 + * deletion of a non-existent orphan - ext3_truncate() could
  267 + * have removed the record.
240 268 */
241 269 ext3_orphan_del(handle, inode);
242   - EXT3_I(inode)->i_dtime = get_seconds();
  270 + ei->i_dtime = get_seconds();
243 271  
244 272 /*
245 273 * One subtle ordering requirement: if anything has gone wrong
... ... @@ -842,6 +870,7 @@
842 870 ext3_fsblk_t first_block = 0;
843 871  
844 872  
  873 + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
845 874 J_ASSERT(handle != NULL || create == 0);
846 875 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
847 876  
... ... @@ -886,6 +915,9 @@
886 915 if (!create || err == -EIO)
887 916 goto cleanup;
888 917  
  918 + /*
  919 + * Block out ext3_truncate while we alter the tree
  920 + */
889 921 mutex_lock(&ei->truncate_mutex);
890 922  
891 923 /*
... ... @@ -934,9 +966,6 @@
934 966 */
935 967 count = ext3_blks_to_allocate(partial, indirect_blks,
936 968 maxblocks, blocks_to_boundary);
937   - /*
938   - * Block out ext3_truncate while we alter the tree
939   - */
940 969 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
941 970 offsets + (partial - chain), partial);
942 971  
... ... @@ -970,6 +999,9 @@
970 999 }
971 1000 BUFFER_TRACE(bh_result, "returned");
972 1001 out:
  1002 + trace_ext3_get_blocks_exit(inode, iblock,
  1003 + depth ? le32_to_cpu(chain[depth-1].key) : 0,
  1004 + count, err);
973 1005 return err;
974 1006 }
975 1007  
... ... @@ -1202,6 +1234,16 @@
1202 1234 ext3_truncate(inode);
1203 1235 }
1204 1236  
  1237 +/*
  1238 + * Truncate blocks that were not used by direct IO write. We have to zero out
  1239 + * the last file block as well because direct IO might have written to it.
  1240 + */
  1241 +static void ext3_truncate_failed_direct_write(struct inode *inode)
  1242 +{
  1243 + ext3_block_truncate_page(inode, inode->i_size);
  1244 + ext3_truncate(inode);
  1245 +}
  1246 +
1205 1247 static int ext3_write_begin(struct file *file, struct address_space *mapping,
1206 1248 loff_t pos, unsigned len, unsigned flags,
1207 1249 struct page **pagep, void **fsdata)
... ... @@ -1217,6 +1259,8 @@
1217 1259 * we allocate blocks but write fails for some reason */
1218 1260 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
1219 1261  
  1262 + trace_ext3_write_begin(inode, pos, len, flags);
  1263 +
1220 1264 index = pos >> PAGE_CACHE_SHIFT;
1221 1265 from = pos & (PAGE_CACHE_SIZE - 1);
1222 1266 to = from + len;
... ... @@ -1332,6 +1376,7 @@
1332 1376 unsigned from, to;
1333 1377 int ret = 0, ret2;
1334 1378  
  1379 + trace_ext3_ordered_write_end(inode, pos, len, copied);
1335 1380 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1336 1381  
1337 1382 from = pos & (PAGE_CACHE_SIZE - 1);
... ... @@ -1367,6 +1412,7 @@
1367 1412 struct inode *inode = file->f_mapping->host;
1368 1413 int ret;
1369 1414  
  1415 + trace_ext3_writeback_write_end(inode, pos, len, copied);
1370 1416 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1371 1417 update_file_sizes(inode, pos, copied);
1372 1418 /*
1373 1419  
... ... @@ -1391,10 +1437,12 @@
1391 1437 {
1392 1438 handle_t *handle = ext3_journal_current_handle();
1393 1439 struct inode *inode = mapping->host;
  1440 + struct ext3_inode_info *ei = EXT3_I(inode);
1394 1441 int ret = 0, ret2;
1395 1442 int partial = 0;
1396 1443 unsigned from, to;
1397 1444  
  1445 + trace_ext3_journalled_write_end(inode, pos, len, copied);
1398 1446 from = pos & (PAGE_CACHE_SIZE - 1);
1399 1447 to = from + len;
1400 1448  
... ... @@ -1419,8 +1467,9 @@
1419 1467 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1420 1468 ext3_orphan_add(handle, inode);
1421 1469 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1422   - if (inode->i_size > EXT3_I(inode)->i_disksize) {
1423   - EXT3_I(inode)->i_disksize = inode->i_size;
  1470 + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
  1471 + if (inode->i_size > ei->i_disksize) {
  1472 + ei->i_disksize = inode->i_size;
1424 1473 ret2 = ext3_mark_inode_dirty(handle, inode);
1425 1474 if (!ret)
1426 1475 ret = ret2;
... ... @@ -1577,6 +1626,7 @@
1577 1626 if (ext3_journal_current_handle())
1578 1627 goto out_fail;
1579 1628  
  1629 + trace_ext3_ordered_writepage(page);
1580 1630 if (!page_has_buffers(page)) {
1581 1631 create_empty_buffers(page, inode->i_sb->s_blocksize,
1582 1632 (1 << BH_Dirty)|(1 << BH_Uptodate));
... ... @@ -1647,6 +1697,7 @@
1647 1697 if (ext3_journal_current_handle())
1648 1698 goto out_fail;
1649 1699  
  1700 + trace_ext3_writeback_writepage(page);
1650 1701 if (page_has_buffers(page)) {
1651 1702 if (!walk_page_buffers(NULL, page_buffers(page), 0,
1652 1703 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
... ... @@ -1689,6 +1740,7 @@
1689 1740 if (ext3_journal_current_handle())
1690 1741 goto no_write;
1691 1742  
  1743 + trace_ext3_journalled_writepage(page);
1692 1744 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1693 1745 if (IS_ERR(handle)) {
1694 1746 ret = PTR_ERR(handle);
... ... @@ -1715,6 +1767,8 @@
1715 1767 if (ret == 0)
1716 1768 ret = err;
1717 1769 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
  1770 + atomic_set(&EXT3_I(inode)->i_datasync_tid,
  1771 + handle->h_transaction->t_tid);
1718 1772 unlock_page(page);
1719 1773 } else {
1720 1774 /*
... ... @@ -1739,6 +1793,7 @@
1739 1793  
1740 1794 static int ext3_readpage(struct file *file, struct page *page)
1741 1795 {
  1796 + trace_ext3_readpage(page);
1742 1797 return mpage_readpage(page, ext3_get_block);
1743 1798 }
1744 1799  
... ... @@ -1753,6 +1808,8 @@
1753 1808 {
1754 1809 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1755 1810  
  1811 + trace_ext3_invalidatepage(page, offset);
  1812 +
1756 1813 /*
1757 1814 * If it's a full truncate we just forget about the pending dirtying
1758 1815 */
... ... @@ -1766,6 +1823,7 @@
1766 1823 {
1767 1824 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1768 1825  
  1826 + trace_ext3_releasepage(page);
1769 1827 WARN_ON(PageChecked(page));
1770 1828 if (!page_has_buffers(page))
1771 1829 return 0;
... ... @@ -1794,6 +1852,8 @@
1794 1852 size_t count = iov_length(iov, nr_segs);
1795 1853 int retries = 0;
1796 1854  
  1855 + trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
  1856 +
1797 1857 if (rw == WRITE) {
1798 1858 loff_t final_size = offset + count;
1799 1859  
... ... @@ -1827,7 +1887,7 @@
1827 1887 loff_t end = offset + iov_length(iov, nr_segs);
1828 1888  
1829 1889 if (end > isize)
1830   - vmtruncate(inode, isize);
  1890 + ext3_truncate_failed_direct_write(inode);
1831 1891 }
1832 1892 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1833 1893 goto retry;
... ... @@ -1841,7 +1901,7 @@
1841 1901 /* This is really bad luck. We've written the data
1842 1902 * but cannot extend i_size. Truncate allocated blocks
1843 1903 * and pretend the write failed... */
1844   - ext3_truncate(inode);
  1904 + ext3_truncate_failed_direct_write(inode);
1845 1905 ret = PTR_ERR(handle);
1846 1906 goto out;
1847 1907 }
... ... @@ -1867,6 +1927,8 @@
1867 1927 ret = err;
1868 1928 }
1869 1929 out:
  1930 + trace_ext3_direct_IO_exit(inode, offset,
  1931 + iov_length(iov, nr_segs), rw, ret);
1870 1932 return ret;
1871 1933 }
1872 1934  
1873 1935  
1874 1936  
1875 1937  
1876 1938  
... ... @@ -1949,17 +2011,24 @@
1949 2011 * This required during truncate. We need to physically zero the tail end
1950 2012 * of that block so it doesn't yield old data if the file is later grown.
1951 2013 */
1952   -static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1953   - struct address_space *mapping, loff_t from)
  2014 +static int ext3_block_truncate_page(struct inode *inode, loff_t from)
1954 2015 {
1955 2016 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1956   - unsigned offset = from & (PAGE_CACHE_SIZE-1);
  2017 + unsigned offset = from & (PAGE_CACHE_SIZE - 1);
1957 2018 unsigned blocksize, iblock, length, pos;
1958   - struct inode *inode = mapping->host;
  2019 + struct page *page;
  2020 + handle_t *handle = NULL;
1959 2021 struct buffer_head *bh;
1960 2022 int err = 0;
1961 2023  
  2024 + /* Truncated on block boundary - nothing to do */
1962 2025 blocksize = inode->i_sb->s_blocksize;
  2026 + if ((from & (blocksize - 1)) == 0)
  2027 + return 0;
  2028 +
  2029 + page = grab_cache_page(inode->i_mapping, index);
  2030 + if (!page)
  2031 + return -ENOMEM;
1963 2032 length = blocksize - (offset & (blocksize - 1));
1964 2033 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1965 2034  
1966 2035  
... ... @@ -2004,11 +2073,23 @@
2004 2073 goto unlock;
2005 2074 }
2006 2075  
  2076 + /* data=writeback mode doesn't need a transaction to zero out data */
  2077 + if (!ext3_should_writeback_data(inode)) {
  2078 + /* We journal at most one block */
  2079 + handle = ext3_journal_start(inode, 1);
  2080 + if (IS_ERR(handle)) {
  2081 + clear_highpage(page);
  2082 + flush_dcache_page(page);
  2083 + err = PTR_ERR(handle);
  2084 + goto unlock;
  2085 + }
  2086 + }
  2087 +
2007 2088 if (ext3_should_journal_data(inode)) {
2008 2089 BUFFER_TRACE(bh, "get write access");
2009 2090 err = ext3_journal_get_write_access(handle, bh);
2010 2091 if (err)
2011   - goto unlock;
  2092 + goto stop;
2012 2093 }
2013 2094  
2014 2095 zero_user(page, offset, length);
... ... @@ -2022,6 +2103,9 @@
2022 2103 err = ext3_journal_dirty_data(handle, bh);
2023 2104 mark_buffer_dirty(bh);
2024 2105 }
  2106 +stop:
  2107 + if (handle)
  2108 + ext3_journal_stop(handle);
2025 2109  
2026 2110 unlock:
2027 2111 unlock_page(page);
... ... @@ -2390,8 +2474,6 @@
2390 2474  
2391 2475 int ext3_can_truncate(struct inode *inode)
2392 2476 {
2393   - if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2394   - return 0;
2395 2477 if (S_ISREG(inode->i_mode))
2396 2478 return 1;
2397 2479 if (S_ISDIR(inode->i_mode))
... ... @@ -2435,7 +2517,6 @@
2435 2517 struct ext3_inode_info *ei = EXT3_I(inode);
2436 2518 __le32 *i_data = ei->i_data;
2437 2519 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2438   - struct address_space *mapping = inode->i_mapping;
2439 2520 int offsets[4];
2440 2521 Indirect chain[4];
2441 2522 Indirect *partial;
2442 2523  
2443 2524  
2444 2525  
2445 2526  
2446 2527  
... ... @@ -2443,45 +2524,21 @@
2443 2524 int n;
2444 2525 long last_block;
2445 2526 unsigned blocksize = inode->i_sb->s_blocksize;
2446   - struct page *page;
2447 2527  
  2528 + trace_ext3_truncate_enter(inode);
  2529 +
2448 2530 if (!ext3_can_truncate(inode))
2449 2531 goto out_notrans;
2450 2532  
2451 2533 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2452 2534 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2453 2535  
2454   - /*
2455   - * We have to lock the EOF page here, because lock_page() nests
2456   - * outside journal_start().
2457   - */
2458   - if ((inode->i_size & (blocksize - 1)) == 0) {
2459   - /* Block boundary? Nothing to do */
2460   - page = NULL;
2461   - } else {
2462   - page = grab_cache_page(mapping,
2463   - inode->i_size >> PAGE_CACHE_SHIFT);
2464   - if (!page)
2465   - goto out_notrans;
2466   - }
2467   -
2468 2536 handle = start_transaction(inode);
2469   - if (IS_ERR(handle)) {
2470   - if (page) {
2471   - clear_highpage(page);
2472   - flush_dcache_page(page);
2473   - unlock_page(page);
2474   - page_cache_release(page);
2475   - }
  2537 + if (IS_ERR(handle))
2476 2538 goto out_notrans;
2477   - }
2478 2539  
2479 2540 last_block = (inode->i_size + blocksize-1)
2480 2541 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2481   -
2482   - if (page)
2483   - ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2484   -
2485 2542 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2486 2543 if (n == 0)
2487 2544 goto out_stop; /* error */
... ... @@ -2596,6 +2653,7 @@
2596 2653 ext3_orphan_del(handle, inode);
2597 2654  
2598 2655 ext3_journal_stop(handle);
  2656 + trace_ext3_truncate_exit(inode);
2599 2657 return;
2600 2658 out_notrans:
2601 2659 /*
... ... @@ -2604,6 +2662,7 @@
2604 2662 */
2605 2663 if (inode->i_nlink)
2606 2664 ext3_orphan_del(NULL, inode);
  2665 + trace_ext3_truncate_exit(inode);
2607 2666 }
2608 2667  
2609 2668 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
... ... @@ -2745,6 +2804,7 @@
2745 2804 * has in-inode xattrs, or we don't have this inode in memory.
2746 2805 * Read the block from disk.
2747 2806 */
  2807 + trace_ext3_load_inode(inode);
2748 2808 get_bh(bh);
2749 2809 bh->b_end_io = end_buffer_read_sync;
2750 2810 submit_bh(READ_META, bh);
2751 2811  
2752 2812  
2753 2813  
... ... @@ -3229,18 +3289,36 @@
3229 3289 }
3230 3290  
3231 3291 error = ext3_orphan_add(handle, inode);
  3292 + if (error) {
  3293 + ext3_journal_stop(handle);
  3294 + goto err_out;
  3295 + }
3232 3296 EXT3_I(inode)->i_disksize = attr->ia_size;
3233   - rc = ext3_mark_inode_dirty(handle, inode);
3234   - if (!error)
3235   - error = rc;
  3297 + error = ext3_mark_inode_dirty(handle, inode);
3236 3298 ext3_journal_stop(handle);
  3299 + if (error) {
  3300 + /* Some hard fs error must have happened. Bail out. */
  3301 + ext3_orphan_del(NULL, inode);
  3302 + goto err_out;
  3303 + }
  3304 + rc = ext3_block_truncate_page(inode, attr->ia_size);
  3305 + if (rc) {
  3306 + /* Cleanup orphan list and exit */
  3307 + handle = ext3_journal_start(inode, 3);
  3308 + if (IS_ERR(handle)) {
  3309 + ext3_orphan_del(NULL, inode);
  3310 + goto err_out;
  3311 + }
  3312 + ext3_orphan_del(handle, inode);
  3313 + ext3_journal_stop(handle);
  3314 + goto err_out;
  3315 + }
3237 3316 }
3238 3317  
3239 3318 if ((attr->ia_valid & ATTR_SIZE) &&
3240 3319 attr->ia_size != i_size_read(inode)) {
3241   - rc = vmtruncate(inode, attr->ia_size);
3242   - if (rc)
3243   - goto err_out;
  3320 + truncate_setsize(inode, attr->ia_size);
  3321 + ext3_truncate(inode);
3244 3322 }
3245 3323  
3246 3324 setattr_copy(inode, attr);
... ... @@ -3374,6 +3452,7 @@
3374 3452 int err;
3375 3453  
3376 3454 might_sleep();
  3455 + trace_ext3_mark_inode_dirty(inode, _RET_IP_);
3377 3456 err = ext3_reserve_inode_write(handle, inode, &iloc);
3378 3457 if (!err)
3379 3458 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
fs/ext3/ioctl.c
... ... @@ -285,7 +285,7 @@
285 285 if (!capable(CAP_SYS_ADMIN))
286 286 return -EPERM;
287 287  
288   - if (copy_from_user(&range, (struct fstrim_range *)arg,
  288 + if (copy_from_user(&range, (struct fstrim_range __user *)arg,
289 289 sizeof(range)))
290 290 return -EFAULT;
291 291  
... ... @@ -293,7 +293,7 @@
293 293 if (ret < 0)
294 294 return ret;
295 295  
296   - if (copy_to_user((struct fstrim_range *)arg, &range,
  296 + if (copy_to_user((struct fstrim_range __user *)arg, &range,
297 297 sizeof(range)))
298 298 return -EFAULT;
299 299  
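The casts above change nothing at runtime; they add the __user address-space annotation so sparse (make C=1) can verify that the ioctl argument is only ever accessed through copy_from_user()/copy_to_user(). A minimal sketch of the idiom, with a hypothetical handler name:

	#include <linux/fs.h>
	#include <linux/uaccess.h>

	/* Hypothetical ioctl handler showing the __user annotation idiom. */
	static long example_trim_ioctl(struct file *filp, unsigned long arg)
	{
		struct fstrim_range __user *uptr =
			(struct fstrim_range __user *)arg;
		struct fstrim_range range;

		if (copy_from_user(&range, uptr, sizeof(range)))
			return -EFAULT;
		/* ... trim using range.start, range.len, range.minlen ... */
		if (copy_to_user(uptr, &range, sizeof(range)))
			return -EFAULT;
		return 0;
	}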
fs/ext3/namei.c
... ... @@ -36,6 +36,7 @@
36 36 #include <linux/quotaops.h>
37 37 #include <linux/buffer_head.h>
38 38 #include <linux/bio.h>
  39 +#include <trace/events/ext3.h>
39 40  
40 41 #include "namei.h"
41 42 #include "xattr.h"
... ... @@ -287,7 +288,7 @@
287 288 while (len--) printk("%c", *name++);
288 289 ext3fs_dirhash(de->name, de->name_len, &h);
289 290 printk(":%x.%u ", h.hash,
290   - ((char *) de - base));
  291 + (unsigned) ((char *) de - base));
291 292 }
292 293 space += EXT3_DIR_REC_LEN(de->name_len);
293 294 names++;
... ... @@ -1013,7 +1014,7 @@
1013 1014  
1014 1015 *err = -ENOENT;
1015 1016 errout:
1016   - dxtrace(printk("%s not found\n", name));
  1017 + dxtrace(printk("%s not found\n", entry->name));
1017 1018 dx_release (frames);
1018 1019 return NULL;
1019 1020 }
... ... @@ -2140,6 +2141,7 @@
2140 2141 struct ext3_dir_entry_2 * de;
2141 2142 handle_t *handle;
2142 2143  
  2144 + trace_ext3_unlink_enter(dir, dentry);
2143 2145 /* Initialize quotas before so that eventual writes go
2144 2146 * in separate transaction */
2145 2147 dquot_initialize(dir);
... ... @@ -2185,6 +2187,7 @@
2185 2187 end_unlink:
2186 2188 ext3_journal_stop(handle);
2187 2189 brelse (bh);
  2190 + trace_ext3_unlink_exit(dentry, retval);
2188 2191 return retval;
2189 2192 }
2190 2193  
fs/ext3/super.c
... ... @@ -44,6 +44,9 @@
44 44 #include "acl.h"
45 45 #include "namei.h"
46 46  
  47 +#define CREATE_TRACE_POINTS
  48 +#include <trace/events/ext3.h>
  49 +
47 50 #ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
48 51 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
49 52 #else
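CREATE_TRACE_POINTS must be defined in exactly one compilation unit per trace header: there it makes the TRACE_EVENT() macros in <trace/events/ext3.h> expand into the tracepoint definitions themselves, while every other file includes the header plainly and only picks up the trace_ext3_*() stubs, which are no-ops until the tracepoint is enabled. Schematically (a sketch of the convention, not literal tree contents):

	/* fs/ext3/super.c -- the one file that defines the tracepoints: */
	#define CREATE_TRACE_POINTS
	#include <trace/events/ext3.h>

	/* fs/ext3/inode.c, balloc.c, ... -- every other user only declares them: */
	#include <trace/events/ext3.h>

	static void example(struct inode *inode)
	{
		/* Compiles to a patched-out no-op until enabled via ftrace. */
		trace_ext3_free_inode(inode);
	}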
... ... @@ -497,6 +500,14 @@
497 500 return &ei->vfs_inode;
498 501 }
499 502  
  503 +static int ext3_drop_inode(struct inode *inode)
  504 +{
  505 + int drop = generic_drop_inode(inode);
  506 +
  507 + trace_ext3_drop_inode(inode, drop);
  508 + return drop;
  509 +}
  510 +
500 511 static void ext3_i_callback(struct rcu_head *head)
501 512 {
502 513 struct inode *inode = container_of(head, struct inode, i_rcu);
... ... @@ -788,6 +799,7 @@
788 799 .destroy_inode = ext3_destroy_inode,
789 800 .write_inode = ext3_write_inode,
790 801 .dirty_inode = ext3_dirty_inode,
  802 + .drop_inode = ext3_drop_inode,
791 803 .evict_inode = ext3_evict_inode,
792 804 .put_super = ext3_put_super,
793 805 .sync_fs = ext3_sync_fs,
... ... @@ -2509,6 +2521,7 @@
2509 2521 {
2510 2522 tid_t target;
2511 2523  
  2524 + trace_ext3_sync_fs(sb, wait);
2512 2525 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2513 2526 if (wait)
2514 2527 log_wait_commit(EXT3_SB(sb)->s_journal, target);
fs/ext3/xattr.c
... ... @@ -803,8 +803,16 @@
803 803 /* We need to allocate a new block */
804 804 ext3_fsblk_t goal = ext3_group_first_block_no(sb,
805 805 EXT3_I(inode)->i_block_group);
806   - ext3_fsblk_t block = ext3_new_block(handle, inode,
807   - goal, &error);
  806 + ext3_fsblk_t block;
  807 +
  808 + /*
  809 + * Protect us against concurrent allocations to the
  810 + * same inode from ext3_..._writepage(). Reservation
  811 + * code does not expect racing allocations.
  812 + */
  813 + mutex_lock(&EXT3_I(inode)->truncate_mutex);
  814 + block = ext3_new_block(handle, inode, goal, &error);
  815 + mutex_unlock(&EXT3_I(inode)->truncate_mutex);
808 816 if (error)
809 817 goto cleanup;
810 818 ea_idebug(inode, "creating block %d", block);
fs/jbd/checkpoint.c
... ... @@ -22,6 +22,8 @@
22 22 #include <linux/jbd.h>
23 23 #include <linux/errno.h>
24 24 #include <linux/slab.h>
  25 +#include <linux/blkdev.h>
  26 +#include <trace/events/jbd.h>
25 27  
26 28 /*
27 29 * Unlink a buffer from a transaction checkpoint list.
28 30  
... ... @@ -95,10 +97,14 @@
95 97  
96 98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
97 99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
  100 + /*
  101 + * Get our reference so that bh cannot be freed before
  102 + * we unlock it
  103 + */
  104 + get_bh(bh);
98 105 JBUFFER_TRACE(jh, "remove from checkpoint list");
99 106 ret = __journal_remove_checkpoint(jh) + 1;
100 107 jbd_unlock_bh_state(bh);
101   - journal_remove_journal_head(bh);
102 108 BUFFER_TRACE(bh, "release");
103 109 __brelse(bh);
104 110 } else {
105 111  
... ... @@ -220,8 +226,8 @@
220 226 spin_lock(&journal->j_list_lock);
221 227 goto restart;
222 228 }
  229 + get_bh(bh);
223 230 if (buffer_locked(bh)) {
224   - get_bh(bh);
225 231 spin_unlock(&journal->j_list_lock);
226 232 jbd_unlock_bh_state(bh);
227 233 wait_on_buffer(bh);
... ... @@ -240,7 +246,6 @@
240 246 */
241 247 released = __journal_remove_checkpoint(jh);
242 248 jbd_unlock_bh_state(bh);
243   - journal_remove_journal_head(bh);
244 249 __brelse(bh);
245 250 }
246 251  
247 252  
248 253  
... ... @@ -253,9 +258,12 @@
253 258 __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
254 259 {
255 260 int i;
  261 + struct blk_plug plug;
256 262  
  263 + blk_start_plug(&plug);
257 264 for (i = 0; i < *batch_count; i++)
258   - write_dirty_buffer(bhs[i], WRITE);
  265 + write_dirty_buffer(bhs[i], WRITE_SYNC);
  266 + blk_finish_plug(&plug);
259 267  
260 268 for (i = 0; i < *batch_count; i++) {
261 269 struct buffer_head *bh = bhs[i];
262 270  
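Two independent changes meet in __flush_batch() above: WRITE_SYNC marks checkpoint writeback as synchronous so the I/O scheduler stops treating it like background writeback, and the blk_plug keeps submissions queued per-task until blk_finish_plug(), giving the block layer a chance to merge adjacent buffers into larger requests. The plugging idiom in isolation (a sketch using the same kernel APIs):

	#include <linux/blkdev.h>
	#include <linux/buffer_head.h>

	static void flush_buffers(struct buffer_head **bhs, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);		/* hold submissions back... */
		for (i = 0; i < nr; i++)
			write_dirty_buffer(bhs[i], WRITE_SYNC);
		blk_finish_plug(&plug);		/* ...then dispatch as one batch */
	}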
... ... @@ -304,12 +312,12 @@
304 312 ret = 1;
305 313 if (unlikely(buffer_write_io_error(bh)))
306 314 ret = -EIO;
  315 + get_bh(bh);
307 316 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
308 317 BUFFER_TRACE(bh, "remove from checkpoint");
309 318 __journal_remove_checkpoint(jh);
310 319 spin_unlock(&journal->j_list_lock);
311 320 jbd_unlock_bh_state(bh);
312   - journal_remove_journal_head(bh);
313 321 __brelse(bh);
314 322 } else {
315 323 /*
... ... @@ -358,6 +366,7 @@
358 366 * journal straight away.
359 367 */
360 368 result = cleanup_journal_tail(journal);
  369 + trace_jbd_checkpoint(journal, result);
361 370 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
362 371 if (result <= 0)
363 372 return result;
... ... @@ -503,6 +512,7 @@
503 512 if (blocknr < journal->j_tail)
504 513 freed = freed + journal->j_last - journal->j_first;
505 514  
  515 + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
506 516 jbd_debug(1,
507 517 "Cleaning journal tail from %d to %d (offset %u), "
508 518 "freeing %u\n",
509 519  
... ... @@ -523,9 +533,9 @@
523 533 /*
524 534 * journal_clean_one_cp_list
525 535 *
526   - * Find all the written-back checkpoint buffers in the given list and release them.
  536 + * Find all the written-back checkpoint buffers in the given list and release
  537 + * them.
527 538 *
528   - * Called with the journal locked.
529 539 * Called with j_list_lock held.
530 540 * Returns number of buffers reaped (for debug)
531 541 */
532 542  
... ... @@ -632,8 +642,8 @@
632 642 * checkpoint lists.
633 643 *
634 644 * The function returns 1 if it frees the transaction, 0 otherwise.
  645 + * The function can free jh and bh.
635 646 *
636   - * This function is called with the journal locked.
637 647 * This function is called with j_list_lock held.
638 648 * This function is called with jbd_lock_bh_state(jh2bh(jh))
639 649 */
640 650  
641 651  
... ... @@ -652,13 +662,14 @@
652 662 }
653 663 journal = transaction->t_journal;
654 664  
  665 + JBUFFER_TRACE(jh, "removing from transaction");
655 666 __buffer_unlink(jh);
656 667 jh->b_cp_transaction = NULL;
  668 + journal_put_journal_head(jh);
657 669  
658 670 if (transaction->t_checkpoint_list != NULL ||
659 671 transaction->t_checkpoint_io_list != NULL)
660 672 goto out;
661   - JBUFFER_TRACE(jh, "transaction has no more buffers");
662 673  
663 674 /*
664 675 * There is one special case to worry about: if we have just pulled the
665 676  
... ... @@ -669,10 +680,8 @@
669 680 * The locking here around t_state is a bit sleazy.
670 681 * See the comment at the end of journal_commit_transaction().
671 682 */
672   - if (transaction->t_state != T_FINISHED) {
673   - JBUFFER_TRACE(jh, "belongs to running/committing transaction");
  683 + if (transaction->t_state != T_FINISHED)
674 684 goto out;
675   - }
676 685  
677 686 /* OK, that was the last buffer for the transaction: we can now
678 687 safely remove this transaction from the log */
... ... @@ -684,7 +693,6 @@
684 693 wake_up(&journal->j_wait_logspace);
685 694 ret = 1;
686 695 out:
687   - JBUFFER_TRACE(jh, "exit");
688 696 return ret;
689 697 }
690 698  
... ... @@ -703,6 +711,8 @@
703 711 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
704 712 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
705 713  
  714 + /* Get reference for checkpointing transaction */
  715 + journal_grab_journal_head(jh2bh(jh));
706 716 jh->b_cp_transaction = transaction;
707 717  
708 718 if (!transaction->t_checkpoint_list) {
... ... @@ -752,6 +762,7 @@
752 762 J_ASSERT(journal->j_committing_transaction != transaction);
753 763 J_ASSERT(journal->j_running_transaction != transaction);
754 764  
  765 + trace_jbd_drop_transaction(journal, transaction);
755 766 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
756 767 kfree(transaction);
757 768 }
fs/jbd/commit.c
... ... @@ -21,6 +21,7 @@
21 21 #include <linux/pagemap.h>
22 22 #include <linux/bio.h>
23 23 #include <linux/blkdev.h>
  24 +#include <trace/events/jbd.h>
24 25  
25 26 /*
26 27 * Default IO end handler for temporary BJ_IO buffer_heads.
... ... @@ -204,6 +205,8 @@
204 205 if (!trylock_buffer(bh)) {
205 206 BUFFER_TRACE(bh, "needs blocking lock");
206 207 spin_unlock(&journal->j_list_lock);
  208 + trace_jbd_do_submit_data(journal,
  209 + commit_transaction);
207 210 /* Write out all data to prevent deadlocks */
208 211 journal_do_submit_data(wbuf, bufs, write_op);
209 212 bufs = 0;
... ... @@ -236,6 +239,8 @@
236 239 jbd_unlock_bh_state(bh);
237 240 if (bufs == journal->j_wbufsize) {
238 241 spin_unlock(&journal->j_list_lock);
  242 + trace_jbd_do_submit_data(journal,
  243 + commit_transaction);
239 244 journal_do_submit_data(wbuf, bufs, write_op);
240 245 bufs = 0;
241 246 goto write_out_data;
... ... @@ -253,10 +258,6 @@
253 258 jbd_unlock_bh_state(bh);
254 259 if (locked)
255 260 unlock_buffer(bh);
256   - journal_remove_journal_head(bh);
257   - /* One for our safety reference, other for
258   - * journal_remove_journal_head() */
259   - put_bh(bh);
260 261 release_data_buffer(bh);
261 262 }
262 263  
... ... @@ -266,6 +267,7 @@
266 267 }
267 268 }
268 269 spin_unlock(&journal->j_list_lock);
  270 + trace_jbd_do_submit_data(journal, commit_transaction);
269 271 journal_do_submit_data(wbuf, bufs, write_op);
270 272  
271 273 return err;
272 274  
... ... @@ -316,12 +318,14 @@
316 318 commit_transaction = journal->j_running_transaction;
317 319 J_ASSERT(commit_transaction->t_state == T_RUNNING);
318 320  
  321 + trace_jbd_start_commit(journal, commit_transaction);
319 322 jbd_debug(1, "JBD: starting commit of transaction %d\n",
320 323 commit_transaction->t_tid);
321 324  
322 325 spin_lock(&journal->j_state_lock);
323 326 commit_transaction->t_state = T_LOCKED;
324 327  
  328 + trace_jbd_commit_locking(journal, commit_transaction);
325 329 spin_lock(&commit_transaction->t_handle_lock);
326 330 while (commit_transaction->t_updates) {
327 331 DEFINE_WAIT(wait);
... ... @@ -392,6 +396,7 @@
392 396 */
393 397 journal_switch_revoke_table(journal);
394 398  
  399 + trace_jbd_commit_flushing(journal, commit_transaction);
395 400 commit_transaction->t_state = T_FLUSH;
396 401 journal->j_committing_transaction = commit_transaction;
397 402 journal->j_running_transaction = NULL;
398 403  
... ... @@ -446,14 +451,9 @@
446 451 }
447 452 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
448 453 jh->b_transaction == commit_transaction &&
449   - jh->b_jlist == BJ_Locked) {
  454 + jh->b_jlist == BJ_Locked)
450 455 __journal_unfile_buffer(jh);
451   - jbd_unlock_bh_state(bh);
452   - journal_remove_journal_head(bh);
453   - put_bh(bh);
454   - } else {
455   - jbd_unlock_bh_state(bh);
456   - }
  456 + jbd_unlock_bh_state(bh);
457 457 release_data_buffer(bh);
458 458 cond_resched_lock(&journal->j_list_lock);
459 459 }
... ... @@ -493,6 +493,7 @@
493 493 commit_transaction->t_state = T_COMMIT;
494 494 spin_unlock(&journal->j_state_lock);
495 495  
  496 + trace_jbd_commit_logging(journal, commit_transaction);
496 497 J_ASSERT(commit_transaction->t_nr_buffers <=
497 498 commit_transaction->t_outstanding_credits);
498 499  
499 500  
... ... @@ -797,10 +798,16 @@
797 798 while (commit_transaction->t_forget) {
798 799 transaction_t *cp_transaction;
799 800 struct buffer_head *bh;
  801 + int try_to_free = 0;
800 802  
801 803 jh = commit_transaction->t_forget;
802 804 spin_unlock(&journal->j_list_lock);
803 805 bh = jh2bh(jh);
  806 + /*
  807 + * Get a reference so that bh cannot be freed before we are
  808 + * done with it.
  809 + */
  810 + get_bh(bh);
804 811 jbd_lock_bh_state(bh);
805 812 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
806 813 jh->b_transaction == journal->j_running_transaction);
807 814  
808 815  
809 816  
... ... @@ -858,28 +865,27 @@
858 865 __journal_insert_checkpoint(jh, commit_transaction);
859 866 if (is_journal_aborted(journal))
860 867 clear_buffer_jbddirty(bh);
861   - JBUFFER_TRACE(jh, "refile for checkpoint writeback");
862   - __journal_refile_buffer(jh);
863   - jbd_unlock_bh_state(bh);
864 868 } else {
865 869 J_ASSERT_BH(bh, !buffer_dirty(bh));
866   - /* The buffer on BJ_Forget list and not jbddirty means
  870 + /*
  871 + * The buffer on BJ_Forget list and not jbddirty means
867 872 * it has been freed by this transaction and hence it
868 873 * could not have been reallocated until this
869 874 * transaction has committed. *BUT* it could be
870 875 * reallocated once we have written all the data to
871 876 * disk and before we process the buffer on BJ_Forget
872   - * list. */
873   - JBUFFER_TRACE(jh, "refile or unfile freed buffer");
874   - __journal_refile_buffer(jh);
875   - if (!jh->b_transaction) {
876   - jbd_unlock_bh_state(bh);
877   - /* needs a brelse */
878   - journal_remove_journal_head(bh);
879   - release_buffer_page(bh);
880   - } else
881   - jbd_unlock_bh_state(bh);
  877 + * list.
  878 + */
  879 + if (!jh->b_next_transaction)
  880 + try_to_free = 1;
882 881 }
  882 + JBUFFER_TRACE(jh, "refile or unfile freed buffer");
  883 + __journal_refile_buffer(jh);
  884 + jbd_unlock_bh_state(bh);
  885 + if (try_to_free)
  886 + release_buffer_page(bh);
  887 + else
  888 + __brelse(bh);
883 889 cond_resched_lock(&journal->j_list_lock);
884 890 }
885 891 spin_unlock(&journal->j_list_lock);
... ... @@ -946,6 +952,7 @@
946 952 }
947 953 spin_unlock(&journal->j_list_lock);
948 954  
  955 + trace_jbd_end_commit(journal, commit_transaction);
949 956 jbd_debug(1, "JBD: commit %d complete, head %d\n",
950 957 journal->j_commit_sequence, journal->j_tail_sequence);
951 958  
fs/jbd/journal.c
... ... @@ -38,6 +38,9 @@
38 38 #include <linux/debugfs.h>
39 39 #include <linux/ratelimit.h>
40 40  
  41 +#define CREATE_TRACE_POINTS
  42 +#include <trace/events/jbd.h>
  43 +
41 44 #include <asm/uaccess.h>
42 45 #include <asm/page.h>
43 46  
... ... @@ -1065,6 +1068,7 @@
1065 1068 } else
1066 1069 write_dirty_buffer(bh, WRITE);
1067 1070  
  1071 + trace_jbd_update_superblock_end(journal, wait);
1068 1072 out:
1069 1073 /* If we have just flushed the log (by marking s_start==0), then
1070 1074 * any future commit will have to be careful to update the
... ... @@ -1799,10 +1803,9 @@
1799 1803 * When a buffer has its BH_JBD bit set it is immune from being released by
1800 1804 * core kernel code, mainly via ->b_count.
1801 1805 *
1802   - * A journal_head may be detached from its buffer_head when the journal_head's
1803   - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
1804   - * Various places in JBD call journal_remove_journal_head() to indicate that the
1805   - * journal_head can be dropped if needed.
  1806 + * A journal_head is detached from its buffer_head when the journal_head's
  1807 + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
  1808 + * transaction (b_cp_transaction) hold their references to b_jcount.
1806 1809 *
1807 1810 * Various places in the kernel want to attach a journal_head to a buffer_head
1808 1811 * _before_ attaching the journal_head to a transaction. To protect the
1809 1812  
... ... @@ -1815,17 +1818,16 @@
1815 1818 * (Attach a journal_head if needed. Increments b_jcount)
1816 1819 * struct journal_head *jh = journal_add_journal_head(bh);
1817 1820 * ...
1818   - * jh->b_transaction = xxx;
1819   - * journal_put_journal_head(jh);
1820   - *
1821   - * Now, the journal_head's b_jcount is zero, but it is safe from being released
1822   - * because it has a non-zero b_transaction.
  1821 + * (Get another reference for transaction)
  1822 + * journal_grab_journal_head(bh);
  1823 + * jh->b_transaction = xxx;
  1824 + * (Put original reference)
  1825 + * journal_put_journal_head(jh);
1823 1826 */
1824 1827  
1825 1828 /*
1826 1829 * Give a buffer_head a journal_head.
1827 1830 *
1828   - * Doesn't need the journal lock.
1829 1831 * May sleep.
1830 1832 */
1831 1833 struct journal_head *journal_add_journal_head(struct buffer_head *bh)
1832 1834  
1833 1835  
... ... @@ -1889,61 +1891,29 @@
1889 1891 struct journal_head *jh = bh2jh(bh);
1890 1892  
1891 1893 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1892   -
1893   - get_bh(bh);
1894   - if (jh->b_jcount == 0) {
1895   - if (jh->b_transaction == NULL &&
1896   - jh->b_next_transaction == NULL &&
1897   - jh->b_cp_transaction == NULL) {
1898   - J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1899   - J_ASSERT_BH(bh, buffer_jbd(bh));
1900   - J_ASSERT_BH(bh, jh2bh(jh) == bh);
1901   - BUFFER_TRACE(bh, "remove journal_head");
1902   - if (jh->b_frozen_data) {
1903   - printk(KERN_WARNING "%s: freeing "
1904   - "b_frozen_data\n",
1905   - __func__);
1906   - jbd_free(jh->b_frozen_data, bh->b_size);
1907   - }
1908   - if (jh->b_committed_data) {
1909   - printk(KERN_WARNING "%s: freeing "
1910   - "b_committed_data\n",
1911   - __func__);
1912   - jbd_free(jh->b_committed_data, bh->b_size);
1913   - }
1914   - bh->b_private = NULL;
1915   - jh->b_bh = NULL; /* debug, really */
1916   - clear_buffer_jbd(bh);
1917   - __brelse(bh);
1918   - journal_free_journal_head(jh);
1919   - } else {
1920   - BUFFER_TRACE(bh, "journal_head was locked");
1921   - }
  1894 + J_ASSERT_JH(jh, jh->b_transaction == NULL);
  1895 + J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
  1896 + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
  1897 + J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
  1898 + J_ASSERT_BH(bh, buffer_jbd(bh));
  1899 + J_ASSERT_BH(bh, jh2bh(jh) == bh);
  1900 + BUFFER_TRACE(bh, "remove journal_head");
  1901 + if (jh->b_frozen_data) {
  1902 + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
  1903 + jbd_free(jh->b_frozen_data, bh->b_size);
1922 1904 }
  1905 + if (jh->b_committed_data) {
  1906 + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
  1907 + jbd_free(jh->b_committed_data, bh->b_size);
  1908 + }
  1909 + bh->b_private = NULL;
  1910 + jh->b_bh = NULL; /* debug, really */
  1911 + clear_buffer_jbd(bh);
  1912 + journal_free_journal_head(jh);
1923 1913 }
1924 1914  
1925 1915 /*
1926   - * journal_remove_journal_head(): if the buffer isn't attached to a transaction
1927   - * and has a zero b_jcount then remove and release its journal_head. If we did
1928   - * see that the buffer is not used by any transaction we also "logically"
1929   - * decrement ->b_count.
1930   - *
1931   - * We in fact take an additional increment on ->b_count as a convenience,
1932   - * because the caller usually wants to do additional things with the bh
1933   - * after calling here.
1934   - * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
1935   - * time. Once the caller has run __brelse(), the buffer is eligible for
1936   - * reaping by try_to_free_buffers().
1937   - */
1938   -void journal_remove_journal_head(struct buffer_head *bh)
1939   -{
1940   - jbd_lock_bh_journal_head(bh);
1941   - __journal_remove_journal_head(bh);
1942   - jbd_unlock_bh_journal_head(bh);
1943   -}
1944   -
1945   -/*
1946   - * Drop a reference on the passed journal_head. If it fell to zero then try to
  1916 + * Drop a reference on the passed journal_head. If it fell to zero then
1947 1917 * release the journal_head from the buffer_head.
1948 1918 */
1949 1919 void journal_put_journal_head(struct journal_head *jh)
1950 1920  
1951 1921  
... ... @@ -1953,11 +1923,12 @@
1953 1923 jbd_lock_bh_journal_head(bh);
1954 1924 J_ASSERT_JH(jh, jh->b_jcount > 0);
1955 1925 --jh->b_jcount;
1956   - if (!jh->b_jcount && !jh->b_transaction) {
  1926 + if (!jh->b_jcount) {
1957 1927 __journal_remove_journal_head(bh);
  1928 + jbd_unlock_bh_journal_head(bh);
1958 1929 __brelse(bh);
1959   - }
1960   - jbd_unlock_bh_journal_head(bh);
  1930 + } else
  1931 + jbd_unlock_bh_journal_head(bh);
1961 1932 }
1962 1933  
1963 1934 /*
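Taken together, the journal.c changes replace the old "drop the head when no transaction references it" heuristic with plain reference counting: journal_remove_journal_head() is gone, each transaction that files a buffer owns a b_jcount reference taken via journal_grab_journal_head(), and journal_put_journal_head() tears the head down when the count reaches zero. The documented protocol, condensed into a sketch:

	#include <linux/jbd.h>

	/* Sketch of the new lifetime rule (mirrors the comment block above). */
	static void file_on_transaction(struct buffer_head *bh,
					transaction_t *transaction)
	{
		/* Attach a journal_head; this takes one b_jcount reference. */
		struct journal_head *jh = journal_add_journal_head(bh);

		/* The transaction takes its own reference before using jh... */
		journal_grab_journal_head(bh);
		jh->b_transaction = transaction;

		/* ...so the setup reference can be dropped right away. When
		 * the buffer is later unfiled and the last reference is put,
		 * b_jcount hits zero and the journal_head is freed. */
		journal_put_journal_head(jh);
	}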
fs/jbd/transaction.c
... ... @@ -26,6 +26,7 @@
26 26 #include <linux/mm.h>
27 27 #include <linux/highmem.h>
28 28 #include <linux/hrtimer.h>
  29 +#include <linux/backing-dev.h>
29 30  
30 31 static void __journal_temp_unlink_buffer(struct journal_head *jh);
31 32  
32 33  
... ... @@ -99,11 +100,10 @@
99 100  
100 101 alloc_transaction:
101 102 if (!journal->j_running_transaction) {
102   - new_transaction = kzalloc(sizeof(*new_transaction),
103   - GFP_NOFS|__GFP_NOFAIL);
  103 + new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
104 104 if (!new_transaction) {
105   - ret = -ENOMEM;
106   - goto out;
  105 + congestion_wait(BLK_RW_ASYNC, HZ/50);
  106 + goto alloc_transaction;
107 107 }
108 108 }
109 109  
... ... @@ -696,7 +696,6 @@
696 696 if (!jh->b_transaction) {
697 697 JBUFFER_TRACE(jh, "no transaction");
698 698 J_ASSERT_JH(jh, !jh->b_next_transaction);
699   - jh->b_transaction = transaction;
700 699 JBUFFER_TRACE(jh, "file as BJ_Reserved");
701 700 spin_lock(&journal->j_list_lock);
702 701 __journal_file_buffer(jh, transaction, BJ_Reserved);
... ... @@ -818,7 +817,6 @@
818 817 * committed and so it's safe to clear the dirty bit.
819 818 */
820 819 clear_buffer_dirty(jh2bh(jh));
821   - jh->b_transaction = transaction;
822 820  
823 821 /* first access by this transaction */
824 822 jh->b_modified = 0;
825 823  
... ... @@ -844,8 +842,8 @@
844 842 */
845 843 JBUFFER_TRACE(jh, "cancelling revoke");
846 844 journal_cancel_revoke(handle, jh);
847   - journal_put_journal_head(jh);
848 845 out:
  846 + journal_put_journal_head(jh);
849 847 return err;
850 848 }
851 849  
... ... @@ -1069,8 +1067,9 @@
1069 1067 ret = -EIO;
1070 1068 goto no_journal;
1071 1069 }
1072   -
1073   - if (jh->b_transaction != NULL) {
  1070 + /* We might have slept so buffer could be refiled now */
  1071 + if (jh->b_transaction != NULL &&
  1072 + jh->b_transaction != handle->h_transaction) {
1074 1073 JBUFFER_TRACE(jh, "unfile from commit");
1075 1074 __journal_temp_unlink_buffer(jh);
1076 1075 /* It still points to the committing
... ... @@ -1091,8 +1090,6 @@
1091 1090 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1092 1091 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1093 1092 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1094   - __journal_temp_unlink_buffer(jh);
1095   - jh->b_transaction = handle->h_transaction;
1096 1093 JBUFFER_TRACE(jh, "file as data");
1097 1094 __journal_file_buffer(jh, handle->h_transaction,
1098 1095 BJ_SyncData);
... ... @@ -1300,8 +1297,6 @@
1300 1297 __journal_file_buffer(jh, transaction, BJ_Forget);
1301 1298 } else {
1302 1299 __journal_unfile_buffer(jh);
1303   - journal_remove_journal_head(bh);
1304   - __brelse(bh);
1305 1300 if (!buffer_jbd(bh)) {
1306 1301 spin_unlock(&journal->j_list_lock);
1307 1302 jbd_unlock_bh_state(bh);
1308 1303  
1309 1304  
1310 1305  
... ... @@ -1622,19 +1617,32 @@
1622 1617 mark_buffer_dirty(bh); /* Expose it to the VM */
1623 1618 }
1624 1619  
  1620 +/*
  1621 + * Remove buffer from all transactions.
  1622 + *
  1623 + * Called with bh_state lock and j_list_lock
  1624 + *
  1625 + * jh and bh may already be freed when this function returns.
  1626 + */
1625 1627 void __journal_unfile_buffer(struct journal_head *jh)
1626 1628 {
1627 1629 __journal_temp_unlink_buffer(jh);
1628 1630 jh->b_transaction = NULL;
  1631 + journal_put_journal_head(jh);
1629 1632 }
1630 1633  
1631 1634 void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1632 1635 {
1633   - jbd_lock_bh_state(jh2bh(jh));
  1636 + struct buffer_head *bh = jh2bh(jh);
  1637 +
  1638 + /* Get reference so that buffer cannot be freed before we unlock it */
  1639 + get_bh(bh);
  1640 + jbd_lock_bh_state(bh);
1634 1641 spin_lock(&journal->j_list_lock);
1635 1642 __journal_unfile_buffer(jh);
1636 1643 spin_unlock(&journal->j_list_lock);
1637   - jbd_unlock_bh_state(jh2bh(jh));
  1644 + jbd_unlock_bh_state(bh);
  1645 + __brelse(bh);
1638 1646 }
1639 1647  
1640 1648 /*
1641 1649  
... ... @@ -1661,16 +1669,12 @@
1661 1669 /* A written-back ordered data buffer */
1662 1670 JBUFFER_TRACE(jh, "release data");
1663 1671 __journal_unfile_buffer(jh);
1664   - journal_remove_journal_head(bh);
1665   - __brelse(bh);
1666 1672 }
1667 1673 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1668 1674 /* written-back checkpointed metadata buffer */
1669 1675 if (jh->b_jlist == BJ_None) {
1670 1676 JBUFFER_TRACE(jh, "remove from checkpoint list");
1671 1677 __journal_remove_checkpoint(jh);
1672   - journal_remove_journal_head(bh);
1673   - __brelse(bh);
1674 1678 }
1675 1679 }
1676 1680 spin_unlock(&journal->j_list_lock);
... ... @@ -1733,7 +1737,7 @@
1733 1737 /*
1734 1738 * We take our own ref against the journal_head here to avoid
1735 1739 * having to add tons of locking around each instance of
1736   - * journal_remove_journal_head() and journal_put_journal_head().
  1740 + * journal_put_journal_head().
1737 1741 */
1738 1742 jh = journal_grab_journal_head(bh);
1739 1743 if (!jh)
1740 1744  
... ... @@ -1770,10 +1774,9 @@
1770 1774 int may_free = 1;
1771 1775 struct buffer_head *bh = jh2bh(jh);
1772 1776  
1773   - __journal_unfile_buffer(jh);
1774   -
1775 1777 if (jh->b_cp_transaction) {
1776 1778 JBUFFER_TRACE(jh, "on running+cp transaction");
  1779 + __journal_temp_unlink_buffer(jh);
1777 1780 /*
1778 1781 * We don't want to write the buffer anymore, clear the
1779 1782 * bit so that we don't confuse checks in
... ... @@ -1784,8 +1787,7 @@
1784 1787 may_free = 0;
1785 1788 } else {
1786 1789 JBUFFER_TRACE(jh, "on running transaction");
1787   - journal_remove_journal_head(bh);
1788   - __brelse(bh);
  1790 + __journal_unfile_buffer(jh);
1789 1791 }
1790 1792 return may_free;
1791 1793 }
... ... @@ -2070,6 +2072,8 @@
2070 2072  
2071 2073 if (jh->b_transaction)
2072 2074 __journal_temp_unlink_buffer(jh);
  2075 + else
  2076 + journal_grab_journal_head(bh);
2073 2077 jh->b_transaction = transaction;
2074 2078  
2075 2079 switch (jlist) {
2076 2080  
... ... @@ -2127,9 +2131,10 @@
2127 2131 * already started to be used by a subsequent transaction, refile the
2128 2132 * buffer on that transaction's metadata list.
2129 2133 *
2130   - * Called under journal->j_list_lock
2131   - *
  2134 + * Called under j_list_lock
2132 2135 * Called under jbd_lock_bh_state(jh2bh(jh))
  2136 + *
  2137 + * jh and bh may already be freed when this function returns
2133 2138 */
2134 2139 void __journal_refile_buffer(struct journal_head *jh)
2135 2140 {
... ... @@ -2153,6 +2158,11 @@
2153 2158  
2154 2159 was_dirty = test_clear_buffer_jbddirty(bh);
2155 2160 __journal_temp_unlink_buffer(jh);
  2161 + /*
  2162 + * We set b_transaction here because b_next_transaction will inherit
  2163 + * our jh reference and thus __journal_file_buffer() must not take a
  2164 + * new one.
  2165 + */
2156 2166 jh->b_transaction = jh->b_next_transaction;
2157 2167 jh->b_next_transaction = NULL;
2158 2168 if (buffer_freed(bh))
2159 2169  
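The comment above encodes the new reference rule: sitting on any transaction list holds exactly one jh reference, and a refile transfers that reference rather than dropping and retaking it. A paraphrase of the two cooperating sites, assuming the semantics this patch introduces:

	/* __journal_file_buffer(): only an unfiled buffer takes a new ref */
	if (jh->b_transaction)
		__journal_temp_unlink_buffer(jh);	/* existing ref kept */
	else
		journal_grab_journal_head(bh);		/* list takes its ref */
	jh->b_transaction = transaction;

	/* __journal_refile_buffer(): hand that same ref to the next
	 * transaction by setting b_transaction before refiling */
	jh->b_transaction = jh->b_next_transaction;
	jh->b_next_transaction = NULL;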
... ... @@ -2169,30 +2179,21 @@
2169 2179 }
2170 2180  
2171 2181 /*
2172   - * For the unlocked version of this call, also make sure that any
2173   - * hanging journal_head is cleaned up if necessary.
  2182 + * __journal_refile_buffer() with the necessary locking added. We take our
  2183 + * own bh reference so that the buffer stays pinned while we unlock it.
2174 2184 *
2175   - * __journal_refile_buffer is usually called as part of a single locked
2176   - * operation on a buffer_head, in which the caller is probably going to
2177   - * be hooking the journal_head onto other lists. In that case it is up
2178   - * to the caller to remove the journal_head if necessary. For the
2179   - * unlocked journal_refile_buffer call, the caller isn't going to be
2180   - * doing anything else to the buffer so we need to do the cleanup
2181   - * ourselves to avoid a jh leak.
2182   - *
2183   - * *** The journal_head may be freed by this call! ***
  2185 + * The jh and bh may be freed by this call.
2184 2186 */
2185 2187 void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2186 2188 {
2187 2189 struct buffer_head *bh = jh2bh(jh);
2188 2190  
  2191 + /* Get reference so that buffer cannot be freed before we unlock it */
  2192 + get_bh(bh);
2189 2193 jbd_lock_bh_state(bh);
2190 2194 spin_lock(&journal->j_list_lock);
2191   -
2192 2195 __journal_refile_buffer(jh);
2193 2196 jbd_unlock_bh_state(bh);
2194   - journal_remove_journal_head(bh);
2195   -
2196 2197 spin_unlock(&journal->j_list_lock);
2197 2198 __brelse(bh);
2198 2199 }
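With the cleanup moved into __journal_unfile_buffer(), journal_refile_buffer() is now just the self-locking wrapper: callers holding no jbd locks use it, while callers already under jbd_lock_bh_state() and j_list_lock call __journal_refile_buffer() directly. A usage sketch (hypothetical caller):

	/* From an unlocked context: hand the jh over; the wrapper takes
	 * and drops its own bh reference around the refile. */
	static void example_refile(journal_t *journal, struct journal_head *jh)
	{
		journal_refile_buffer(journal, jh);
		/* Neither jh nor bh may be used past this point: both
		 * may already have been freed. */
	}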
include/linux/ext2_fs.h
... ... @@ -18,6 +18,7 @@
18 18  
19 19 #include <linux/types.h>
20 20 #include <linux/magic.h>
  21 +#include <linux/fs.h>
21 22  
22 23 /*
23 24 * The second extended filesystem constants/structures
include/linux/ext3_fs.h
... ... @@ -418,12 +418,11 @@
418 418 #define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS
419 419 #endif
420 420  
421   -#define ext3_set_bit __test_and_set_bit_le
  421 +#define ext3_set_bit __set_bit_le
422 422 #define ext3_set_bit_atomic ext2_set_bit_atomic
423   -#define ext3_clear_bit __test_and_clear_bit_le
  423 +#define ext3_clear_bit __clear_bit_le
424 424 #define ext3_clear_bit_atomic ext2_clear_bit_atomic
425 425 #define ext3_test_bit test_bit_le
426   -#define ext3_find_first_zero_bit find_first_zero_bit_le
427 426 #define ext3_find_next_zero_bit find_next_zero_bit_le
428 427  
429 428 /*
... ... @@ -913,7 +912,7 @@
913 912 extern int ext3_change_inode_journal_flag(struct inode *, int);
914 913 extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
915 914 extern int ext3_can_truncate(struct inode *inode);
916   -extern void ext3_truncate (struct inode *);
  915 +extern void ext3_truncate(struct inode *inode);
917 916 extern void ext3_set_inode_flags(struct inode *);
918 917 extern void ext3_get_inode_flags(struct ext3_inode_info *);
919 918 extern void ext3_set_aops(struct inode *inode);
... ... @@ -940,7 +940,6 @@
940 940 */
941 941 struct journal_head *journal_add_journal_head(struct buffer_head *bh);
942 942 struct journal_head *journal_grab_journal_head(struct buffer_head *bh);
943   -void journal_remove_journal_head(struct buffer_head *bh);
944 943 void journal_put_journal_head(struct journal_head *jh);
945 944  
946 945 /*
include/linux/journal-head.h
... ... @@ -45,7 +45,7 @@
45 45 * has been cowed
46 46 * [jbd_lock_bh_state()]
47 47 */
48   - unsigned b_cow_tid;
  48 + tid_t b_cow_tid;
49 49  
50 50 /*
51 51 * Copy of the buffer data frozen for writing to the log.
include/linux/quota.h
... ... @@ -415,14 +415,6 @@
415 415 {QFMT_VFS_V0, "quota_v2"},\
416 416 {0, NULL}}
417 417  
418   -#else
419   -
420   -# /* nodep */ include <sys/cdefs.h>
421   -
422   -__BEGIN_DECLS
423   -long quotactl __P ((unsigned int, const char *, int, caddr_t));
424   -__END_DECLS
425   -
426 418 #endif /* __KERNEL__ */
427 419 #endif /* _QUOTA_ */
include/trace/events/ext3.h
  1 +#undef TRACE_SYSTEM
  2 +#define TRACE_SYSTEM ext3
  3 +
  4 +#if !defined(_TRACE_EXT3_H) || defined(TRACE_HEADER_MULTI_READ)
  5 +#define _TRACE_EXT3_H
  6 +
  7 +#include <linux/tracepoint.h>
  8 +
  9 +TRACE_EVENT(ext3_free_inode,
  10 + TP_PROTO(struct inode *inode),
  11 +
  12 + TP_ARGS(inode),
  13 +
  14 + TP_STRUCT__entry(
  15 + __field( dev_t, dev )
  16 + __field( ino_t, ino )
  17 + __field( umode_t, mode )
  18 + __field( uid_t, uid )
  19 + __field( gid_t, gid )
  20 + __field( blkcnt_t, blocks )
  21 + ),
  22 +
  23 + TP_fast_assign(
  24 + __entry->dev = inode->i_sb->s_dev;
  25 + __entry->ino = inode->i_ino;
  26 + __entry->mode = inode->i_mode;
  27 + __entry->uid = inode->i_uid;
  28 + __entry->gid = inode->i_gid;
  29 + __entry->blocks = inode->i_blocks;
  30 + ),
  31 +
  32 + TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %lu",
  33 + MAJOR(__entry->dev), MINOR(__entry->dev),
  34 + (unsigned long) __entry->ino,
  35 + __entry->mode, __entry->uid, __entry->gid,
  36 + (unsigned long) __entry->blocks)
  37 +);
  38 +
  39 +TRACE_EVENT(ext3_request_inode,
  40 + TP_PROTO(struct inode *dir, int mode),
  41 +
  42 + TP_ARGS(dir, mode),
  43 +
  44 + TP_STRUCT__entry(
  45 + __field( dev_t, dev )
  46 + __field( ino_t, dir )
  47 + __field( umode_t, mode )
  48 + ),
  49 +
  50 + TP_fast_assign(
  51 + __entry->dev = dir->i_sb->s_dev;
  52 + __entry->dir = dir->i_ino;
  53 + __entry->mode = mode;
  54 + ),
  55 +
  56 + TP_printk("dev %d,%d dir %lu mode 0%o",
  57 + MAJOR(__entry->dev), MINOR(__entry->dev),
  58 + (unsigned long) __entry->dir, __entry->mode)
  59 +);
  60 +
  61 +TRACE_EVENT(ext3_allocate_inode,
  62 + TP_PROTO(struct inode *inode, struct inode *dir, int mode),
  63 +
  64 + TP_ARGS(inode, dir, mode),
  65 +
  66 + TP_STRUCT__entry(
  67 + __field( dev_t, dev )
  68 + __field( ino_t, ino )
  69 + __field( ino_t, dir )
  70 + __field( umode_t, mode )
  71 + ),
  72 +
  73 + TP_fast_assign(
  74 + __entry->dev = inode->i_sb->s_dev;
  75 + __entry->ino = inode->i_ino;
  76 + __entry->dir = dir->i_ino;
  77 + __entry->mode = mode;
  78 + ),
  79 +
  80 + TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
  81 + MAJOR(__entry->dev), MINOR(__entry->dev),
  82 + (unsigned long) __entry->ino,
  83 + (unsigned long) __entry->dir, __entry->mode)
  84 +);
  85 +
  86 +TRACE_EVENT(ext3_evict_inode,
  87 + TP_PROTO(struct inode *inode),
  88 +
  89 + TP_ARGS(inode),
  90 +
  91 + TP_STRUCT__entry(
  92 + __field( dev_t, dev )
  93 + __field( ino_t, ino )
  94 + __field( int, nlink )
  95 + ),
  96 +
  97 + TP_fast_assign(
  98 + __entry->dev = inode->i_sb->s_dev;
  99 + __entry->ino = inode->i_ino;
  100 + __entry->nlink = inode->i_nlink;
  101 + ),
  102 +
  103 + TP_printk("dev %d,%d ino %lu nlink %d",
  104 + MAJOR(__entry->dev), MINOR(__entry->dev),
  105 + (unsigned long) __entry->ino, __entry->nlink)
  106 +);
  107 +
  108 +TRACE_EVENT(ext3_drop_inode,
  109 + TP_PROTO(struct inode *inode, int drop),
  110 +
  111 + TP_ARGS(inode, drop),
  112 +
  113 + TP_STRUCT__entry(
  114 + __field( dev_t, dev )
  115 + __field( ino_t, ino )
  116 + __field( int, drop )
  117 + ),
  118 +
  119 + TP_fast_assign(
  120 + __entry->dev = inode->i_sb->s_dev;
  121 + __entry->ino = inode->i_ino;
  122 + __entry->drop = drop;
  123 + ),
  124 +
  125 + TP_printk("dev %d,%d ino %lu drop %d",
  126 + MAJOR(__entry->dev), MINOR(__entry->dev),
  127 + (unsigned long) __entry->ino, __entry->drop)
  128 +);
  129 +
  130 +TRACE_EVENT(ext3_mark_inode_dirty,
  131 + TP_PROTO(struct inode *inode, unsigned long IP),
  132 +
  133 + TP_ARGS(inode, IP),
  134 +
  135 + TP_STRUCT__entry(
  136 + __field( dev_t, dev )
  137 + __field( ino_t, ino )
  138 + __field(unsigned long, ip )
  139 + ),
  140 +
  141 + TP_fast_assign(
  142 + __entry->dev = inode->i_sb->s_dev;
  143 + __entry->ino = inode->i_ino;
  144 + __entry->ip = IP;
  145 + ),
  146 +
  147 + TP_printk("dev %d,%d ino %lu caller %pF",
  148 + MAJOR(__entry->dev), MINOR(__entry->dev),
  149 + (unsigned long) __entry->ino, (void *)__entry->ip)
  150 +);
  151 +
  152 +TRACE_EVENT(ext3_write_begin,
  153 + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
  154 + unsigned int flags),
  155 +
  156 + TP_ARGS(inode, pos, len, flags),
  157 +
  158 + TP_STRUCT__entry(
  159 + __field( dev_t, dev )
  160 + __field( ino_t, ino )
  161 + __field( loff_t, pos )
  162 + __field( unsigned int, len )
  163 + __field( unsigned int, flags )
  164 + ),
  165 +
  166 + TP_fast_assign(
  167 + __entry->dev = inode->i_sb->s_dev;
  168 + __entry->ino = inode->i_ino;
  169 + __entry->pos = pos;
  170 + __entry->len = len;
  171 + __entry->flags = flags;
  172 + ),
  173 +
  174 + TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
  175 + MAJOR(__entry->dev), MINOR(__entry->dev),
  176 + (unsigned long) __entry->ino,
  177 + (unsigned long long) __entry->pos, __entry->len,
  178 + __entry->flags)
  179 +);
  180 +
  181 +DECLARE_EVENT_CLASS(ext3__write_end,
  182 + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
  183 + unsigned int copied),
  184 +
  185 + TP_ARGS(inode, pos, len, copied),
  186 +
  187 + TP_STRUCT__entry(
  188 + __field( dev_t, dev )
  189 + __field( ino_t, ino )
  190 + __field( loff_t, pos )
  191 + __field( unsigned int, len )
  192 + __field( unsigned int, copied )
  193 + ),
  194 +
  195 + TP_fast_assign(
  196 + __entry->dev = inode->i_sb->s_dev;
  197 + __entry->ino = inode->i_ino;
  198 + __entry->pos = pos;
  199 + __entry->len = len;
  200 + __entry->copied = copied;
  201 + ),
  202 +
  203 + TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
  204 + MAJOR(__entry->dev), MINOR(__entry->dev),
  205 + (unsigned long) __entry->ino,
  206 + (unsigned long long) __entry->pos, __entry->len,
  207 + __entry->copied)
  208 +);
  209 +
  210 +DEFINE_EVENT(ext3__write_end, ext3_ordered_write_end,
  211 +
  212 + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
  213 + unsigned int copied),
  214 +
  215 + TP_ARGS(inode, pos, len, copied)
  216 +);
  217 +
  218 +DEFINE_EVENT(ext3__write_end, ext3_writeback_write_end,
  219 +
  220 + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
  221 + unsigned int copied),
  222 +
  223 + TP_ARGS(inode, pos, len, copied)
  224 +);
  225 +
  226 +DEFINE_EVENT(ext3__write_end, ext3_journalled_write_end,
  227 +
  228 + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
  229 + unsigned int copied),
  230 +
  231 + TP_ARGS(inode, pos, len, copied)
  232 +);
  233 +
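A note on the macros used from here on: DECLARE_EVENT_CLASS() emits the record layout, assignment, and print logic once, and each DEFINE_EVENT() stamps out a named tracepoint sharing that template, which keeps the generated code small. Adding a further event on the same class would look like this (hypothetical event name, not part of this commit):

	DEFINE_EVENT(ext3__write_end, ext3_example_write_end,

		TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
			 unsigned int copied),

		TP_ARGS(inode, pos, len, copied)
	);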
  234 +DECLARE_EVENT_CLASS(ext3__page_op,
  235 + TP_PROTO(struct page *page),
  236 +
  237 + TP_ARGS(page),
  238 +
  239 + TP_STRUCT__entry(
  240 + __field( dev_t, dev )
  241 + __field( ino_t, ino )
  242 + __field( pgoff_t, index )
  243 +
  244 + ),
  245 +
  246 + TP_fast_assign(
  247 + __entry->index = page->index;
  248 + __entry->ino = page->mapping->host->i_ino;
  249 + __entry->dev = page->mapping->host->i_sb->s_dev;
  250 + ),
  251 +
  252 + TP_printk("dev %d,%d ino %lu page_index %lu",
  253 + MAJOR(__entry->dev), MINOR(__entry->dev),
  254 + (unsigned long) __entry->ino, __entry->index)
  255 +);
  256 +
  257 +DEFINE_EVENT(ext3__page_op, ext3_ordered_writepage,
  258 +
  259 + TP_PROTO(struct page *page),
  260 +
  261 + TP_ARGS(page)
  262 +);
  263 +
  264 +DEFINE_EVENT(ext3__page_op, ext3_writeback_writepage,
  265 +
  266 + TP_PROTO(struct page *page),
  267 +
  268 + TP_ARGS(page)
  269 +);
  270 +
  271 +DEFINE_EVENT(ext3__page_op, ext3_journalled_writepage,
  272 +
  273 + TP_PROTO(struct page *page),
  274 +
  275 + TP_ARGS(page)
  276 +);
  277 +
  278 +DEFINE_EVENT(ext3__page_op, ext3_readpage,
  279 +
  280 + TP_PROTO(struct page *page),
  281 +
  282 + TP_ARGS(page)
  283 +);
  284 +
  285 +DEFINE_EVENT(ext3__page_op, ext3_releasepage,
  286 +
  287 + TP_PROTO(struct page *page),
  288 +
  289 + TP_ARGS(page)
  290 +);
  291 +
  292 +TRACE_EVENT(ext3_invalidatepage,
  293 + TP_PROTO(struct page *page, unsigned long offset),
  294 +
  295 + TP_ARGS(page, offset),
  296 +
  297 + TP_STRUCT__entry(
  298 + __field( pgoff_t, index )
  299 + __field( unsigned long, offset )
  300 + __field( ino_t, ino )
  301 + __field( dev_t, dev )
  302 +
  303 + ),
  304 +
  305 + TP_fast_assign(
  306 + __entry->index = page->index;
  307 + __entry->offset = offset;
  308 + __entry->ino = page->mapping->host->i_ino;
  309 + __entry->dev = page->mapping->host->i_sb->s_dev;
  310 + ),
  311 +
  312 + TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
  313 + MAJOR(__entry->dev), MINOR(__entry->dev),
  314 + (unsigned long) __entry->ino,
  315 + __entry->index, __entry->offset)
  316 +);
  317 +
  318 +TRACE_EVENT(ext3_discard_blocks,
  319 + TP_PROTO(struct super_block *sb, unsigned long blk,
  320 + unsigned long count),
  321 +
  322 + TP_ARGS(sb, blk, count),
  323 +
  324 + TP_STRUCT__entry(
  325 + __field( dev_t, dev )
  326 + __field( unsigned long, blk )
  327 + __field( unsigned long, count )
  328 +
  329 + ),
  330 +
  331 + TP_fast_assign(
  332 + __entry->dev = sb->s_dev;
  333 + __entry->blk = blk;
  334 + __entry->count = count;
  335 + ),
  336 +
  337 + TP_printk("dev %d,%d blk %lu count %lu",
  338 + MAJOR(__entry->dev), MINOR(__entry->dev),
  339 + __entry->blk, __entry->count)
  340 +);
  341 +
  342 +TRACE_EVENT(ext3_request_blocks,
  343 + TP_PROTO(struct inode *inode, unsigned long goal,
  344 + unsigned long count),
  345 +
  346 + TP_ARGS(inode, goal, count),
  347 +
  348 + TP_STRUCT__entry(
  349 + __field( dev_t, dev )
  350 + __field( ino_t, ino )
  351 + __field( unsigned long, count )
  352 + __field( unsigned long, goal )
  353 + ),
  354 +
  355 + TP_fast_assign(
  356 + __entry->dev = inode->i_sb->s_dev;
  357 + __entry->ino = inode->i_ino;
  358 + __entry->count = count;
  359 + __entry->goal = goal;
  360 + ),
  361 +
  362 + TP_printk("dev %d,%d ino %lu count %lu goal %lu",
  363 + MAJOR(__entry->dev), MINOR(__entry->dev),
  364 + (unsigned long) __entry->ino,
  365 + __entry->count, __entry->goal)
  366 +);
  367 +
  368 +TRACE_EVENT(ext3_allocate_blocks,
  369 + TP_PROTO(struct inode *inode, unsigned long goal,
  370 + unsigned long count, unsigned long block),
  371 +
  372 + TP_ARGS(inode, goal, count, block),
  373 +
  374 + TP_STRUCT__entry(
  375 + __field( dev_t, dev )
  376 + __field( ino_t, ino )
  377 + __field( unsigned long, block )
  378 + __field( unsigned long, count )
  379 + __field( unsigned long, goal )
  380 + ),
  381 +
  382 + TP_fast_assign(
  383 + __entry->dev = inode->i_sb->s_dev;
  384 + __entry->ino = inode->i_ino;
  385 + __entry->block = block;
  386 + __entry->count = count;
  387 + __entry->goal = goal;
  388 + ),
  389 +
  390 + TP_printk("dev %d,%d ino %lu count %lu block %lu goal %lu",
  391 + MAJOR(__entry->dev), MINOR(__entry->dev),
  392 + (unsigned long) __entry->ino,
  393 + __entry->count, __entry->block,
  394 + __entry->goal)
  395 +);
  396 +
  397 +TRACE_EVENT(ext3_free_blocks,
  398 + TP_PROTO(struct inode *inode, unsigned long block,
  399 + unsigned long count),
  400 +
  401 + TP_ARGS(inode, block, count),
  402 +
  403 + TP_STRUCT__entry(
  404 + __field( dev_t, dev )
  405 + __field( ino_t, ino )
  406 + __field( umode_t, mode )
  407 + __field( unsigned long, block )
  408 + __field( unsigned long, count )
  409 + ),
  410 +
  411 + TP_fast_assign(
  412 + __entry->dev = inode->i_sb->s_dev;
  413 + __entry->ino = inode->i_ino;
  414 + __entry->mode = inode->i_mode;
  415 + __entry->block = block;
  416 + __entry->count = count;
  417 + ),
  418 +
  419 + TP_printk("dev %d,%d ino %lu mode 0%o block %lu count %lu",
  420 + MAJOR(__entry->dev), MINOR(__entry->dev),
  421 + (unsigned long) __entry->ino,
  422 + __entry->mode, __entry->block, __entry->count)
  423 +);
  424 +
  425 +TRACE_EVENT(ext3_sync_file_enter,
  426 + TP_PROTO(struct file *file, int datasync),
  427 +
  428 + TP_ARGS(file, datasync),
  429 +
  430 + TP_STRUCT__entry(
  431 + __field( dev_t, dev )
  432 + __field( ino_t, ino )
  433 + __field( ino_t, parent )
  434 + __field( int, datasync )
  435 + ),
  436 +
  437 + TP_fast_assign(
  438 + struct dentry *dentry = file->f_path.dentry;
  439 +
  440 + __entry->dev = dentry->d_inode->i_sb->s_dev;
  441 + __entry->ino = dentry->d_inode->i_ino;
  442 + __entry->datasync = datasync;
  443 + __entry->parent = dentry->d_parent->d_inode->i_ino;
  444 + ),
  445 +
  446 + TP_printk("dev %d,%d ino %lu parent %lu datasync %d",
  447 + MAJOR(__entry->dev), MINOR(__entry->dev),
  448 + (unsigned long) __entry->ino,
  449 + (unsigned long) __entry->parent, __entry->datasync)
  450 +);
  451 +
  452 +TRACE_EVENT(ext3_sync_file_exit,
  453 + TP_PROTO(struct inode *inode, int ret),
  454 +
  455 + TP_ARGS(inode, ret),
  456 +
  457 + TP_STRUCT__entry(
  458 + __field( int, ret )
  459 + __field( ino_t, ino )
  460 + __field( dev_t, dev )
  461 + ),
  462 +
  463 + TP_fast_assign(
  464 + __entry->ret = ret;
  465 + __entry->ino = inode->i_ino;
  466 + __entry->dev = inode->i_sb->s_dev;
  467 + ),
  468 +
  469 + TP_printk("dev %d,%d ino %lu ret %d",
  470 + MAJOR(__entry->dev), MINOR(__entry->dev),
  471 + (unsigned long) __entry->ino,
  472 + __entry->ret)
  473 +);
  474 +
  475 +TRACE_EVENT(ext3_sync_fs,
  476 + TP_PROTO(struct super_block *sb, int wait),
  477 +
  478 + TP_ARGS(sb, wait),
  479 +
  480 + TP_STRUCT__entry(
  481 + __field( dev_t, dev )
  482 + __field( int, wait )
  483 +
  484 + ),
  485 +
  486 + TP_fast_assign(
  487 + __entry->dev = sb->s_dev;
  488 + __entry->wait = wait;
  489 + ),
  490 +
  491 + TP_printk("dev %d,%d wait %d",
  492 + MAJOR(__entry->dev), MINOR(__entry->dev),
  493 + __entry->wait)
  494 +);
  495 +
  496 +TRACE_EVENT(ext3_rsv_window_add,
  497 + TP_PROTO(struct super_block *sb,
  498 + struct ext3_reserve_window_node *rsv_node),
  499 +
  500 + TP_ARGS(sb, rsv_node),
  501 +
  502 + TP_STRUCT__entry(
  503 + __field( unsigned long, start )
  504 + __field( unsigned long, end )
  505 + __field( dev_t, dev )
  506 + ),
  507 +
  508 + TP_fast_assign(
  509 + __entry->dev = sb->s_dev;
  510 + __entry->start = rsv_node->rsv_window._rsv_start;
  511 + __entry->end = rsv_node->rsv_window._rsv_end;
  512 + ),
  513 +
  514 + TP_printk("dev %d,%d start %lu end %lu",
  515 + MAJOR(__entry->dev), MINOR(__entry->dev),
  516 + __entry->start, __entry->end)
  517 +);
  518 +
  519 +TRACE_EVENT(ext3_discard_reservation,
  520 + TP_PROTO(struct inode *inode,
  521 + struct ext3_reserve_window_node *rsv_node),
  522 +
  523 + TP_ARGS(inode, rsv_node),
  524 +
  525 + TP_STRUCT__entry(
  526 + __field( unsigned long, start )
  527 + __field( unsigned long, end )
  528 + __field( ino_t, ino )
  529 + __field( dev_t, dev )
  530 + ),
  531 +
  532 + TP_fast_assign(
  533 + __entry->start = rsv_node->rsv_window._rsv_start;
  534 + __entry->end = rsv_node->rsv_window._rsv_end;
  535 + __entry->ino = inode->i_ino;
  536 + __entry->dev = inode->i_sb->s_dev;
  537 + ),
  538 +
  539 + TP_printk("dev %d,%d ino %lu start %lu end %lu",
  540 + MAJOR(__entry->dev), MINOR(__entry->dev),
  541 + (unsigned long)__entry->ino, __entry->start,
  542 + __entry->end)
  543 +);
  544 +
  545 +TRACE_EVENT(ext3_alloc_new_reservation,
  546 + TP_PROTO(struct super_block *sb, unsigned long goal),
  547 +
  548 + TP_ARGS(sb, goal),
  549 +
  550 + TP_STRUCT__entry(
  551 + __field( dev_t, dev )
  552 + __field( unsigned long, goal )
  553 + ),
  554 +
  555 + TP_fast_assign(
  556 + __entry->dev = sb->s_dev;
  557 + __entry->goal = goal;
  558 + ),
  559 +
  560 + TP_printk("dev %d,%d goal %lu",
  561 + MAJOR(__entry->dev), MINOR(__entry->dev),
  562 + __entry->goal)
  563 +);
  564 +
  565 +TRACE_EVENT(ext3_reserved,
  566 + TP_PROTO(struct super_block *sb, unsigned long block,
  567 + struct ext3_reserve_window_node *rsv_node),
  568 +
  569 + TP_ARGS(sb, block, rsv_node),
  570 +
  571 + TP_STRUCT__entry(
  572 + __field( unsigned long, block )
  573 + __field( unsigned long, start )
  574 + __field( unsigned long, end )
  575 + __field( dev_t, dev )
  576 + ),
  577 +
  578 + TP_fast_assign(
  579 + __entry->block = block;
  580 + __entry->start = rsv_node->rsv_window._rsv_start;
  581 + __entry->end = rsv_node->rsv_window._rsv_end;
  582 + __entry->dev = sb->s_dev;
  583 + ),
  584 +
  585 + TP_printk("dev %d,%d block %lu start %lu end %lu",
  586 + MAJOR(__entry->dev), MINOR(__entry->dev),
  587 + __entry->block, __entry->start, __entry->end)
  588 +);
  589 +
  590 +TRACE_EVENT(ext3_forget,
  591 + TP_PROTO(struct inode *inode, int is_metadata, unsigned long block),
  592 +
  593 + TP_ARGS(inode, is_metadata, block),
  594 +
  595 + TP_STRUCT__entry(
  596 + __field( dev_t, dev )
  597 + __field( ino_t, ino )
  598 + __field( umode_t, mode )
  599 + __field( int, is_metadata )
  600 + __field( unsigned long, block )
  601 + ),
  602 +
  603 + TP_fast_assign(
  604 + __entry->dev = inode->i_sb->s_dev;
  605 + __entry->ino = inode->i_ino;
  606 + __entry->mode = inode->i_mode;
  607 + __entry->is_metadata = is_metadata;
  608 + __entry->block = block;
  609 + ),
  610 +
  611 + TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %lu",
  612 + MAJOR(__entry->dev), MINOR(__entry->dev),
  613 + (unsigned long) __entry->ino,
  614 + __entry->mode, __entry->is_metadata, __entry->block)
  615 +);
  616 +
  617 +TRACE_EVENT(ext3_read_block_bitmap,
  618 + TP_PROTO(struct super_block *sb, unsigned int group),
  619 +
  620 + TP_ARGS(sb, group),
  621 +
  622 + TP_STRUCT__entry(
  623 + __field( dev_t, dev )
  624 + __field( __u32, group )
  625 +
  626 + ),
  627 +
  628 + TP_fast_assign(
  629 + __entry->dev = sb->s_dev;
  630 + __entry->group = group;
  631 + ),
  632 +
  633 + TP_printk("dev %d,%d group %u",
  634 + MAJOR(__entry->dev), MINOR(__entry->dev),
  635 + __entry->group)
  636 +);
  637 +
  638 +TRACE_EVENT(ext3_direct_IO_enter,
  639 + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw),
  640 +
  641 + TP_ARGS(inode, offset, len, rw),
  642 +
  643 + TP_STRUCT__entry(
  644 + __field( ino_t, ino )
  645 + __field( dev_t, dev )
  646 + __field( loff_t, pos )
  647 + __field( unsigned long, len )
  648 + __field( int, rw )
  649 + ),
  650 +
  651 + TP_fast_assign(
  652 + __entry->ino = inode->i_ino;
  653 + __entry->dev = inode->i_sb->s_dev;
  654 + __entry->pos = offset;
  655 + __entry->len = len;
  656 + __entry->rw = rw;
  657 + ),
  658 +
  659 + TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d",
  660 + MAJOR(__entry->dev), MINOR(__entry->dev),
  661 + (unsigned long) __entry->ino,
  662 + (unsigned long long) __entry->pos, __entry->len,
  663 + __entry->rw)
  664 +);
  665 +
  666 +TRACE_EVENT(ext3_direct_IO_exit,
  667 + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
  668 + int rw, int ret),
  669 +
  670 + TP_ARGS(inode, offset, len, rw, ret),
  671 +
  672 + TP_STRUCT__entry(
  673 + __field( ino_t, ino )
  674 + __field( dev_t, dev )
  675 + __field( loff_t, pos )
  676 + __field( unsigned long, len )
  677 + __field( int, rw )
  678 + __field( int, ret )
  679 + ),
  680 +
  681 + TP_fast_assign(
  682 + __entry->ino = inode->i_ino;
  683 + __entry->dev = inode->i_sb->s_dev;
  684 + __entry->pos = offset;
  685 + __entry->len = len;
  686 + __entry->rw = rw;
  687 + __entry->ret = ret;
  688 + ),
  689 +
  690 + TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d",
  691 + MAJOR(__entry->dev), MINOR(__entry->dev),
  692 + (unsigned long) __entry->ino,
  693 + (unsigned long long) __entry->pos, __entry->len,
  694 + __entry->rw, __entry->ret)
  695 +);
  696 +
  697 +TRACE_EVENT(ext3_unlink_enter,
  698 + TP_PROTO(struct inode *parent, struct dentry *dentry),
  699 +
  700 + TP_ARGS(parent, dentry),
  701 +
  702 + TP_STRUCT__entry(
  703 + __field( ino_t, parent )
  704 + __field( ino_t, ino )
  705 + __field( loff_t, size )
  706 + __field( dev_t, dev )
  707 + ),
  708 +
  709 + TP_fast_assign(
  710 + __entry->parent = parent->i_ino;
  711 + __entry->ino = dentry->d_inode->i_ino;
  712 + __entry->size = dentry->d_inode->i_size;
  713 + __entry->dev = dentry->d_inode->i_sb->s_dev;
  714 + ),
  715 +
  716 + TP_printk("dev %d,%d ino %lu size %lld parent %lu",
  717 + MAJOR(__entry->dev), MINOR(__entry->dev),
  718 + (unsigned long) __entry->ino,
  719 + (unsigned long long)__entry->size,
  720 + (unsigned long) __entry->parent)
  721 +);
  722 +
  723 +TRACE_EVENT(ext3_unlink_exit,
  724 + TP_PROTO(struct dentry *dentry, int ret),
  725 +
  726 + TP_ARGS(dentry, ret),
  727 +
  728 + TP_STRUCT__entry(
  729 + __field( ino_t, ino )
  730 + __field( dev_t, dev )
  731 + __field( int, ret )
  732 + ),
  733 +
  734 + TP_fast_assign(
  735 + __entry->ino = dentry->d_inode->i_ino;
  736 + __entry->dev = dentry->d_inode->i_sb->s_dev;
  737 + __entry->ret = ret;
  738 + ),
  739 +
  740 + TP_printk("dev %d,%d ino %lu ret %d",
  741 + MAJOR(__entry->dev), MINOR(__entry->dev),
  742 + (unsigned long) __entry->ino,
  743 + __entry->ret)
  744 +);
  745 +
  746 +DECLARE_EVENT_CLASS(ext3__truncate,
  747 + TP_PROTO(struct inode *inode),
  748 +
  749 + TP_ARGS(inode),
  750 +
  751 + TP_STRUCT__entry(
  752 + __field( ino_t, ino )
  753 + __field( dev_t, dev )
  754 + __field( blkcnt_t, blocks )
  755 + ),
  756 +
  757 + TP_fast_assign(
  758 + __entry->ino = inode->i_ino;
  759 + __entry->dev = inode->i_sb->s_dev;
  760 + __entry->blocks = inode->i_blocks;
  761 + ),
  762 +
  763 + TP_printk("dev %d,%d ino %lu blocks %lu",
  764 + MAJOR(__entry->dev), MINOR(__entry->dev),
  765 + (unsigned long) __entry->ino, (unsigned long) __entry->blocks)
  766 +);
  767 +
  768 +DEFINE_EVENT(ext3__truncate, ext3_truncate_enter,
  769 +
  770 + TP_PROTO(struct inode *inode),
  771 +
  772 + TP_ARGS(inode)
  773 +);
  774 +
  775 +DEFINE_EVENT(ext3__truncate, ext3_truncate_exit,
  776 +
  777 + TP_PROTO(struct inode *inode),
  778 +
  779 + TP_ARGS(inode)
  780 +);
  781 +
  782 +TRACE_EVENT(ext3_get_blocks_enter,
  783 + TP_PROTO(struct inode *inode, unsigned long lblk,
  784 + unsigned long len, int create),
  785 +
  786 + TP_ARGS(inode, lblk, len, create),
  787 +
  788 + TP_STRUCT__entry(
  789 + __field( ino_t, ino )
  790 + __field( dev_t, dev )
  791 + __field( unsigned long, lblk )
  792 + __field( unsigned long, len )
  793 + __field( int, create )
  794 + ),
  795 +
  796 + TP_fast_assign(
  797 + __entry->ino = inode->i_ino;
  798 + __entry->dev = inode->i_sb->s_dev;
  799 + __entry->lblk = lblk;
  800 + __entry->len = len;
  801 + __entry->create = create;
  802 + ),
  803 +
  804 + TP_printk("dev %d,%d ino %lu lblk %lu len %lu create %u",
  805 + MAJOR(__entry->dev), MINOR(__entry->dev),
  806 + (unsigned long) __entry->ino,
  807 + __entry->lblk, __entry->len, __entry->create)
  808 +);
  809 +
  810 +TRACE_EVENT(ext3_get_blocks_exit,
  811 + TP_PROTO(struct inode *inode, unsigned long lblk,
  812 + unsigned long pblk, unsigned long len, int ret),
  813 +
  814 + TP_ARGS(inode, lblk, pblk, len, ret),
  815 +
  816 + TP_STRUCT__entry(
  817 + __field( ino_t, ino )
  818 + __field( dev_t, dev )
  819 + __field( unsigned long, lblk )
  820 + __field( unsigned long, pblk )
  821 + __field( unsigned long, len )
  822 + __field( int, ret )
  823 + ),
  824 +
  825 + TP_fast_assign(
  826 + __entry->ino = inode->i_ino;
  827 + __entry->dev = inode->i_sb->s_dev;
  828 + __entry->lblk = lblk;
  829 + __entry->pblk = pblk;
  830 + __entry->len = len;
  831 + __entry->ret = ret;
  832 + ),
  833 +
  834 + TP_printk("dev %d,%d ino %lu lblk %lu pblk %lu len %lu ret %d",
  835 + MAJOR(__entry->dev), MINOR(__entry->dev),
  836 + (unsigned long) __entry->ino,
  837 + __entry->lblk, __entry->pblk,
  838 + __entry->len, __entry->ret)
  839 +);
  840 +
  841 +TRACE_EVENT(ext3_load_inode,
  842 + TP_PROTO(struct inode *inode),
  843 +
  844 + TP_ARGS(inode),
  845 +
  846 + TP_STRUCT__entry(
  847 + __field( ino_t, ino )
  848 + __field( dev_t, dev )
  849 + ),
  850 +
  851 + TP_fast_assign(
  852 + __entry->ino = inode->i_ino;
  853 + __entry->dev = inode->i_sb->s_dev;
  854 + ),
  855 +
  856 + TP_printk("dev %d,%d ino %lu",
  857 + MAJOR(__entry->dev), MINOR(__entry->dev),
  858 + (unsigned long) __entry->ino)
  859 +);
  860 +
  861 +#endif /* _TRACE_EXT3_H */
  862 +
  863 +/* This part must be outside protection */
  864 +#include <trace/define_trace.h>
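Each TRACE_EVENT() above expands to a trace_<name>() inline that is a near-free no-op until the event is enabled at runtime. Exactly one compilation unit must define CREATE_TRACE_POINTS before including the header to emit the event bodies; a minimal sketch (the call site is illustrative, typically the filesystem's super.c):

	/* In exactly one fs/ext3 source file: */
	#define CREATE_TRACE_POINTS
	#include <trace/events/ext3.h>

	/* Any other file just includes the header and calls, e.g.: */
	static void example_hook(struct inode *inode)
	{
		trace_ext3_free_inode(inode);	/* fires only when enabled */
	}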
include/trace/events/jbd.h
  1 +#undef TRACE_SYSTEM
  2 +#define TRACE_SYSTEM jbd
  3 +
  4 +#if !defined(_TRACE_JBD_H) || defined(TRACE_HEADER_MULTI_READ)
  5 +#define _TRACE_JBD_H
  6 +
  7 +#include <linux/jbd.h>
  8 +#include <linux/tracepoint.h>
  9 +
  10 +TRACE_EVENT(jbd_checkpoint,
  11 +
  12 + TP_PROTO(journal_t *journal, int result),
  13 +
  14 + TP_ARGS(journal, result),
  15 +
  16 + TP_STRUCT__entry(
  17 + __field( dev_t, dev )
  18 + __field( int, result )
  19 + ),
  20 +
  21 + TP_fast_assign(
  22 + __entry->dev = journal->j_fs_dev->bd_dev;
  23 + __entry->result = result;
  24 + ),
  25 +
  26 + TP_printk("dev %d,%d result %d",
  27 + MAJOR(__entry->dev), MINOR(__entry->dev),
  28 + __entry->result)
  29 +);
  30 +
  31 +DECLARE_EVENT_CLASS(jbd_commit,
  32 +
  33 + TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
  34 +
  35 + TP_ARGS(journal, commit_transaction),
  36 +
  37 + TP_STRUCT__entry(
  38 + __field( dev_t, dev )
  39 + __field( char, sync_commit )
  40 + __field( int, transaction )
  41 + ),
  42 +
  43 + TP_fast_assign(
  44 + __entry->dev = journal->j_fs_dev->bd_dev;
  45 + __entry->sync_commit = commit_transaction->t_synchronous_commit;
  46 + __entry->transaction = commit_transaction->t_tid;
  47 + ),
  48 +
  49 + TP_printk("dev %d,%d transaction %d sync %d",
  50 + MAJOR(__entry->dev), MINOR(__entry->dev),
  51 + __entry->transaction, __entry->sync_commit)
  52 +);
  53 +
  54 +DEFINE_EVENT(jbd_commit, jbd_start_commit,
  55 +
  56 + TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
  57 +
  58 + TP_ARGS(journal, commit_transaction)
  59 +);
  60 +
  61 +DEFINE_EVENT(jbd_commit, jbd_commit_locking,
  62 +
  63 + TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
  64 +
  65 + TP_ARGS(journal, commit_transaction)
  66 +);
  67 +
  68 +DEFINE_EVENT(jbd_commit, jbd_commit_flushing,
  69 +
  70 + TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
  71 +
  72 + TP_ARGS(journal, commit_transaction)
  73 +);
  74 +
  75 +DEFINE_EVENT(jbd_commit, jbd_commit_logging,
  76 +
  77 + TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
  78 +
  79 + TP_ARGS(journal, commit_transaction)
  80 +);
  81 +
  82 +TRACE_EVENT(jbd_drop_transaction,
  83 +
  84 + TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
  85 +
  86 + TP_ARGS(journal, commit_transaction),
  87 +
  88 + TP_STRUCT__entry(
  89 + __field( dev_t, dev )
  90 + __field( char, sync_commit )
  91 + __field( int, transaction )
  92 + ),
  93 +
  94 + TP_fast_assign(
  95 + __entry->dev = journal->j_fs_dev->bd_dev;
  96 + __entry->sync_commit = commit_transaction->t_synchronous_commit;
  97 + __entry->transaction = commit_transaction->t_tid;
  98 + ),
  99 +
  100 + TP_printk("dev %d,%d transaction %d sync %d",
  101 + MAJOR(__entry->dev), MINOR(__entry->dev),
  102 + __entry->transaction, __entry->sync_commit)
  103 +);
  104 +
  105 +TRACE_EVENT(jbd_end_commit,
  106 + TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
  107 +
  108 + TP_ARGS(journal, commit_transaction),
  109 +
  110 + TP_STRUCT__entry(
  111 + __field( dev_t, dev )
  112 + __field( char, sync_commit )
  113 + __field( int, transaction )
  114 + __field( int, head )
  115 + ),
  116 +
  117 + TP_fast_assign(
  118 + __entry->dev = journal->j_fs_dev->bd_dev;
  119 + __entry->sync_commit = commit_transaction->t_synchronous_commit;
  120 + __entry->transaction = commit_transaction->t_tid;
  121 + __entry->head = journal->j_tail_sequence;
  122 + ),
  123 +
  124 + TP_printk("dev %d,%d transaction %d sync %d head %d",
  125 + MAJOR(__entry->dev), MINOR(__entry->dev),
  126 + __entry->transaction, __entry->sync_commit, __entry->head)
  127 +);
  128 +
  129 +TRACE_EVENT(jbd_do_submit_data,
  130 + TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
  131 +
  132 + TP_ARGS(journal, commit_transaction),
  133 +
  134 + TP_STRUCT__entry(
  135 + __field( dev_t, dev )
  136 + __field( char, sync_commit )
  137 + __field( int, transaction )
  138 + ),
  139 +
  140 + TP_fast_assign(
  141 + __entry->dev = journal->j_fs_dev->bd_dev;
  142 + __entry->sync_commit = commit_transaction->t_synchronous_commit;
  143 + __entry->transaction = commit_transaction->t_tid;
  144 + ),
  145 +
  146 + TP_printk("dev %d,%d transaction %d sync %d",
  147 + MAJOR(__entry->dev), MINOR(__entry->dev),
  148 + __entry->transaction, __entry->sync_commit)
  149 +);
  150 +
  151 +TRACE_EVENT(jbd_cleanup_journal_tail,
  152 +
  153 + TP_PROTO(journal_t *journal, tid_t first_tid,
  154 + unsigned long block_nr, unsigned long freed),
  155 +
  156 + TP_ARGS(journal, first_tid, block_nr, freed),
  157 +
  158 + TP_STRUCT__entry(
  159 + __field( dev_t, dev )
  160 + __field( tid_t, tail_sequence )
  161 + __field( tid_t, first_tid )
  162 + __field(unsigned long, block_nr )
  163 + __field(unsigned long, freed )
  164 + ),
  165 +
  166 + TP_fast_assign(
  167 + __entry->dev = journal->j_fs_dev->bd_dev;
  168 + __entry->tail_sequence = journal->j_tail_sequence;
  169 + __entry->first_tid = first_tid;
  170 + __entry->block_nr = block_nr;
  171 + __entry->freed = freed;
  172 + ),
  173 +
  174 + TP_printk("dev %d,%d from %u to %u offset %lu freed %lu",
  175 + MAJOR(__entry->dev), MINOR(__entry->dev),
  176 + __entry->tail_sequence, __entry->first_tid,
  177 + __entry->block_nr, __entry->freed)
  178 +);
  179 +
  180 +TRACE_EVENT(jbd_update_superblock_end,
  181 + TP_PROTO(journal_t *journal, int wait),
  182 +
  183 + TP_ARGS(journal, wait),
  184 +
  185 + TP_STRUCT__entry(
  186 + __field( dev_t, dev )
  187 + __field( int, wait )
  188 + ),
  189 +
  190 + TP_fast_assign(
  191 + __entry->dev = journal->j_fs_dev->bd_dev;
  192 + __entry->wait = wait;
  193 + ),
  194 +
  195 + TP_printk("dev %d,%d wait %d",
  196 + MAJOR(__entry->dev), MINOR(__entry->dev),
  197 + __entry->wait)
  198 +);
  199 +
  200 +#endif /* _TRACE_JBD_H */
  201 +
  202 +/* This part must be outside protection */
  203 +#include <trace/define_trace.h>
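The jbd events are consumed the same way from fs/jbd. A hedged sketch of a call site (the wrapper function is hypothetical; log_do_checkpoint() is the real jbd checkpoint helper):

	#include <trace/events/jbd.h>

	/* Run one checkpoint pass and record its result. */
	static int example_checkpoint(journal_t *journal)
	{
		int result = log_do_checkpoint(journal);

		trace_jbd_checkpoint(journal, result);	/* logs dev + result */
		return result;
	}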