Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  jbd2: fix race between write_metadata_buffer and get_write_access
  ext4: Fix ext4_mb_initialize_context() to initialize all fields
  ext4: fix null handler of ioctls in no journal mode
  ext4: Fix buffer head reference leak in no-journal mode
  ext4: Move __ext4_journalled_writepage() to avoid forward declaration
  ext4: Fix mmap/truncate race when blocksize < pagesize && !nodellaoc
  ext4: Fix mmap/truncate race when blocksize < pagesize && delayed allocation
  ext4: Don't look at buffer_heads outside i_size.
  ext4: Fix goal inum check in the inode allocator
  ext4: fix no journal corruption with locale-gen
  ext4: Calculate required journal credits for inserting an extent properly
  ext4: Fix truncation of symlinks after failed write
  jbd2: Fix a race between checkpointing code and journal_get_write_access()
  ext4: Use rcu_barrier() on module unload.
  ext4: naturally align struct ext4_allocation_request
  ext4: mark several more functions in mballoc.c as noinline
  ext4: Fix potential reclaim deadlock when truncating partial block
  jbd2: Remove GFP_ATOMIC kmalloc from inside spinlock critical region
  ext4: Fix type warning on 64-bit platforms in tracing events header

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  jbd2: fix race between write_metadata_buffer and get_write_access
  ext4: Fix ext4_mb_initialize_context() to initialize all fields
  ext4: fix null handler of ioctls in no journal mode
  ext4: Fix buffer head reference leak in no-journal mode
  ext4: Move __ext4_journalled_writepage() to avoid forward declaration
  ext4: Fix mmap/truncate race when blocksize < pagesize && !nodellaoc
  ext4: Fix mmap/truncate race when blocksize < pagesize && delayed allocation
  ext4: Don't look at buffer_heads outside i_size.
  ext4: Fix goal inum check in the inode allocator
  ext4: fix no journal corruption with locale-gen
  ext4: Calculate required journal credits for inserting an extent properly
  ext4: Fix truncation of symlinks after failed write
  jbd2: Fix a race between checkpointing code and journal_get_write_access()
  ext4: Use rcu_barrier() on module unload.
  ext4: naturally align struct ext4_allocation_request
  ext4: mark several more functions in mballoc.c as noinline
  ext4: Fix potential reclaim deadlock when truncating partial block
  jbd2: Remove GFP_ATOMIC kmalloc from inside spinlock critical region
  ext4: Fix type warning on 64-bit platforms in tracing events header
Linus Torvalds
2 parents 4a390e07fc 96577c4382
Showing 11 changed files Side-by-side Diff
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
include/trace/events/ext4.h
@@ -93,20 +93,20 @@
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
 	struct inode *inode;
+	/* how many blocks we want to allocate */
+	unsigned int len;
 	/* logical block in target inode */
 	ext4_lblk_t logical;
-	/* phys. target (a hint) */
-	ext4_fsblk_t goal;
 	/* the closest logical allocated block to the left */
 	ext4_lblk_t lleft;
-	/* phys. block for ^^^ */
-	ext4_fsblk_t pleft;
 	/* the closest logical allocated block to the right */
 	ext4_lblk_t lright;
-	/* phys. block for ^^^ */
+	/* phys. target (a hint) */
+	ext4_fsblk_t goal;
+	/* phys. block for the closest logical allocated block to the left */
+	ext4_fsblk_t pleft;
+	/* phys. block for the closest logical allocated block to the right */
 	ext4_fsblk_t pright;
-	/* how many blocks we want to allocate */
-	unsigned int len;
 	/* flags. see above EXT4_MB_HINT_* */
 	unsigned int flags;
 };
@@ -43,6 +43,8 @@
 			ext4_journal_abort_handle(where, __func__, bh,
 						  handle, err);
 	}
+	else
+		brelse(bh);
 	return err;
 }
  
@@ -57,6 +59,8 @@
 			ext4_journal_abort_handle(where, __func__, bh,
 						  handle, err);
 	}
+	else
+		brelse(bh);
 	return err;
 }
  
@@ -131,9 +131,11 @@
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
 				struct buffer_head *bh);
  
+/* When called with an invalid handle, this will still do a put on the BH */
 int __ext4_journal_forget(const char *where, handle_t *handle,
 				struct buffer_head *bh);
  
+/* When called with an invalid handle, this will still do a put on the BH */
 int __ext4_journal_revoke(const char *where, handle_t *handle,
 				ext4_fsblk_t blocknr, struct buffer_head *bh);
  
  
@@ -281,10 +283,10 @@
  
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
+	if (EXT4_JOURNAL(inode) == NULL)
+		return 1;
 	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
 		return 0;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
@@ -1977,6 +1977,7 @@
 			 */
 			/* 1 bitmap, 1 block group descriptor */
 			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+			return ret;
 		}
 	}
  
@@ -833,7 +833,7 @@
 	if (!goal)
 		goal = sbi->s_inode_goal;
  
-	if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) {
+	if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
 		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
 		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
 		ret2 = 0;
@@ -78,16 +78,14 @@
  * but there may still be a record of it in the journal, and that record
  * still needs to be revoked.
  *
- * If the handle isn't valid we're not journaling so there's nothing to do.
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
  */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 		struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
 	int err;
  
-	if (!ext4_handle_valid(handle))
-		return 0;
-
 	might_sleep();
  
 	BUFFER_TRACE(bh, "enter");
  
  
@@ -1513,14 +1511,14 @@
 		 * Add inode to orphan list in case we crash before
 		 * truncate finishes
 		 */
-		if (pos + len > inode->i_size)
+		if (pos + len > inode->i_size && ext4_can_truncate(inode))
 			ext4_orphan_add(handle, inode);
  
 		ext4_journal_stop(handle);
 		if (pos + len > inode->i_size) {
-			vmtruncate(inode, inode->i_size);
+			ext4_truncate(inode);
 			/*
-			 * If vmtruncate failed early the inode might
+			 * If truncate failed early the inode might
 			 * still be on the orphan list; we need to
 			 * make sure the inode is removed from the
 			 * orphan list in that case.
@@ -1614,7 +1612,7 @@
 		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
-		if (pos + len > inode->i_size)
+		if (pos + len > inode->i_size && ext4_can_truncate(inode))
 			/* if we have allocated more blocks and copied
 			 * less. We will have blocks allocated outside
 			 * inode->i_size. So truncate them
  
@@ -1628,9 +1626,9 @@
 		ret = ret2;
  
 	if (pos + len > inode->i_size) {
-		vmtruncate(inode, inode->i_size);
+		ext4_truncate(inode);
 		/*
-		 * If vmtruncate failed early the inode might still be
+		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
@@ -1655,7 +1653,7 @@
 	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
  
@@ -1670,9 +1668,9 @@
 		ret = ret2;
  
 	if (pos + len > inode->i_size) {
-		vmtruncate(inode, inode->i_size);
+		ext4_truncate(inode);
 		/*
-		 * If vmtruncate failed early the inode might still be
+		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
@@ -1722,7 +1720,7 @@
  
 	unlock_page(page);
 	page_cache_release(page);
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
  
@@ -1733,9 +1731,9 @@
 	if (!ret)
 		ret = ret2;
 	if (pos + len > inode->i_size) {
-		vmtruncate(inode, inode->i_size);
+		ext4_truncate(inode);
 		/*
-		 * If vmtruncate failed early the inode might still be
+		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
  
@@ -2305,15 +2303,9 @@
 	return;
 }
  
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
-	/*
-	 * unmapped buffer is possible for holes.
-	 * delay buffer is possible with delayed allocation.
-	 * We also need to consider unwritten buffer as unmapped.
-	 */
-	return (!buffer_mapped(bh) || buffer_delay(bh) ||
-				buffer_unwritten(bh)) && buffer_dirty(bh);
+	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
 }
  
 /*
  
@@ -2398,9 +2390,9 @@
 			 * We need to try to allocate
 			 * unmapped blocks in the same page.
 			 * Otherwise we won't make progress
-			 * with the page in ext4_da_writepage
+			 * with the page in ext4_writepage
 			 */
-			if (ext4_bh_unmapped_or_delay(NULL, bh)) {
+			if (ext4_bh_delay_or_unwritten(NULL, bh)) {
 				mpage_add_bh_to_extent(mpd, logical,
 						       bh->b_size,
 						       bh->b_state);
@@ -2517,7 +2509,6 @@
 	 * so call get_block_wrap with create = 0
 	 */
 	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-	BUG_ON(create && ret == 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
  
  
  
@@ -2525,15 +2516,102 @@
 	return ret;
 }
  
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+	get_bh(bh);
+	return 0;
+}
+
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+	put_bh(bh);
+	return 0;
+}
+
+static int __ext4_journalled_writepage(struct page *page,
+				       struct writeback_control *wbc,
+				       unsigned int len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head *page_bufs;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	page_bufs = page_buffers(page);
+	BUG_ON(!page_bufs);
+	walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+	/* As soon as we unlock the page, it can go away, but we have
+	 * references to buffers so we are safe */
+	unlock_page(page);
+
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+				do_journal_get_write_access);
+
+	err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+				write_end_fn);
+	if (ret == 0)
+		ret = err;
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+
+	walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+out:
+	return ret;
+}
+
 /*
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We don't even
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
  * This function can get called via...
  *   - ext4_da_writepages after taking page lock (have journal handle)
  *   - journal_submit_inode_data_buffers (no journal handle)
  *   - shrink_page_list via pdflush (no journal handle)
  *   - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have a page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmap-based writes. So if we do, with a 1K blocksize,
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * the page has its first buffer_head mapped via the page_mkwrite callback,
+ * but the other buffer_heads would be unmapped but dirty (dirtied via
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if it has any buffer_heads that are either delay or
+ * unwritten.
+ *
+ * We can get called recursively as shown below.
+ *
+ *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *		ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * The page also has its dirty flag cleared so we don't get a recursive page_lock.
  */
-static int ext4_da_writepage(struct page *page,
-				struct writeback_control *wbc)
+static int ext4_writepage(struct page *page,
+			  struct writeback_control *wbc)
 {
 	int ret = 0;
 	loff_t size;
@@ -2541,7 +2619,7 @@
 	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
  
-	trace_ext4_da_writepage(inode, page);
+	trace_ext4_writepage(inode, page);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -2551,7 +2629,7 @@
 	if (page_has_buffers(page)) {
 		page_bufs = page_buffers(page);
 		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-					ext4_bh_unmapped_or_delay)) {
+					ext4_bh_delay_or_unwritten)) {
 			/*
 			 * We don't want to do  block allocation
 			 * So redirty the page and return
  
@@ -2578,13 +2656,13 @@
 		 * all are mapped and non delay. We don't want to
 		 * do block allocation here.
 		 */
-		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+		ret = block_prepare_write(page, 0, len,
 					  noalloc_get_block_write);
 		if (!ret) {
 			page_bufs = page_buffers(page);
 			/* check whether all are mapped and non delay */
 			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-						ext4_bh_unmapped_or_delay)) {
+						ext4_bh_delay_or_unwritten)) {
 				redirty_page_for_writepage(wbc, page);
 				unlock_page(page);
 				return 0;
  
@@ -2600,9 +2678,18 @@
 			return 0;
 		}
 		/* now mark the buffer_heads as dirty and uptodate */
-		block_commit_write(page, 0, PAGE_CACHE_SIZE);
+		block_commit_write(page, 0, len);
 	}
  
+	if (PageChecked(page) && ext4_should_journal_data(inode)) {
+		/*
+		 * It's mmapped pagecache.  Add buffers and journal it.  There
+		 * doesn't seem much point in redirtying the page here.
+		 */
+		ClearPageChecked(page);
+		return __ext4_journalled_writepage(page, wbc, len);
+	}
+
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
 	else
@@ -2907,7 +2994,7 @@
 		 * i_size_read because we hold i_mutex.
 		 */
 		if (pos + len > inode->i_size)
-			vmtruncate(inode, inode->i_size);
+			ext4_truncate(inode);
 	}
  
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3130,222 +3217,6 @@
 	return generic_block_bmap(mapping, block, ext4_get_block);
 }
  
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
-	get_bh(bh);
-	return 0;
-}
-
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
-	put_bh(bh);
-	return 0;
-}
-
-/*
- * Note that we don't need to start a transaction unless we're journaling data
- * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * need to file the inode to the transaction's list in ordered mode because if
- * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
- * transaction the data will hit the disk. In case we are journaling data, we
- * cannot start transaction directly because transaction start ranks above page
- * lock so we have to do some magic.
- *
- * In all journaling modes block_write_full_page() will start the I/O.
- *
- * Problem:
- *
- *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- *		ext4_writepage()
- *
- * Similar for:
- *
- *	ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext4_get_block().  We will deadlock on various things like
- * lock_journal and i_data_sem
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- *	    non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- *   In journalled data mode, a data buffer may be metadata against the
- *   current transaction.  But the same file is part of a shared mapping
- *   and someone does a writepage() on it.
- *
- *   We will move the buffer onto the async_data list, but *after* it has
- *   been dirtied. So there's a small window where we have dirty data on
- *   BJ_Metadata.
- *
- *   Note that this only applies to the last partial page in the file.  The
- *   bit which block_write_full_page() uses prepare/commit for.  (That's
- *   broken code anyway: it's wrong for msync()).
- *
- *   It's a rare case: affects the final partial page, for journalled data
- *   where the file is subject to bith write() and writepage() in the same
- *   transction.  To fix it we'll need a custom block_write_full_page().
- *   We'll probably need that anyway for journalling writepage() output.
- *
- * We don't honour synchronous mounts for writepage().  That would be
- * disastrous.  Any write() or metadata operation will sync the fs for
- * us.
- *
- */
-static int __ext4_normal_writepage(struct page *page,
-				   struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-
-	if (test_opt(inode->i_sb, NOBH))
-		return nobh_writepage(page, noalloc_get_block_write, wbc);
-	else
-		return block_write_full_page(page, noalloc_get_block_write,
-					     wbc);
-}
-
-static int ext4_normal_writepage(struct page *page,
-				 struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t size = i_size_read(inode);
-	loff_t len;
-
-	trace_ext4_normal_writepage(inode, page);
-	J_ASSERT(PageLocked(page));
-	if (page->index == size >> PAGE_CACHE_SHIFT)
-		len = size & ~PAGE_CACHE_MASK;
-	else
-		len = PAGE_CACHE_SIZE;
-
-	if (page_has_buffers(page)) {
-		/* if page has buffers it should all be mapped
-		 * and allocated. If there are not buffers attached
-		 * to the page we know the page is dirty but it lost
-		 * buffers. That means that at some moment in time
-		 * after write_begin() / write_end() has been called
-		 * all buffers have been clean and thus they must have been
-		 * written at least once. So they are all mapped and we can
-		 * happily proceed with mapping them and writing the page.
-		 */
-		BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-					ext4_bh_unmapped_or_delay));
-	}
-
-	if (!ext4_journal_current_handle())
-		return __ext4_normal_writepage(page, wbc);
-
-	redirty_page_for_writepage(wbc, page);
-	unlock_page(page);
-	return 0;
-}
-
-static int __ext4_journalled_writepage(struct page *page,
-				       struct writeback_control *wbc)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
-	struct buffer_head *page_bufs;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
-
-	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-				  noalloc_get_block_write);
-	if (ret != 0)
-		goto out_unlock;
-
-	page_bufs = page_buffers(page);
-	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
-								bget_one);
-	/* As soon as we unlock the page, it can go away, but we have
-	 * references to buffers so we are safe */
-	unlock_page(page);
-
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
-	}
-
-	ret = walk_page_buffers(handle, page_bufs, 0,
-			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
-	err = walk_page_buffers(handle, page_bufs, 0,
-				PAGE_CACHE_SIZE, NULL, write_end_fn);
-	if (ret == 0)
-		ret = err;
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-
-	walk_page_buffers(handle, page_bufs, 0,
-				PAGE_CACHE_SIZE, NULL, bput_one);
-	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-	goto out;
-
-out_unlock:
-	unlock_page(page);
-out:
-	return ret;
-}
-
-static int ext4_journalled_writepage(struct page *page,
-				     struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t size = i_size_read(inode);
-	loff_t len;
-
-	trace_ext4_journalled_writepage(inode, page);
-	J_ASSERT(PageLocked(page));
-	if (page->index == size >> PAGE_CACHE_SHIFT)
-		len = size & ~PAGE_CACHE_MASK;
-	else
-		len = PAGE_CACHE_SIZE;
-
-	if (page_has_buffers(page)) {
-		/* if page has buffers it should all be mapped
-		 * and allocated. If there are not buffers attached
-		 * to the page we know the page is dirty but it lost
-		 * buffers. That means that at some moment in time
-		 * after write_begin() / write_end() has been called
-		 * all buffers have been clean and thus they must have been
-		 * written at least once. So they are all mapped and we can
-		 * happily proceed with mapping them and writing the page.
-		 */
-		BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-					ext4_bh_unmapped_or_delay));
-	}
-
-	if (ext4_journal_current_handle())
-		goto no_write;
-
-	if (PageChecked(page)) {
-		/*
-		 * It's mmapped pagecache.  Add buffers and journal it.  There
-		 * doesn't seem much point in redirtying the page here.
-		 */
-		ClearPageChecked(page);
-		return __ext4_journalled_writepage(page, wbc);
-	} else {
-		/*
-		 * It may be a page full of checkpoint-mode buffers.  We don't
-		 * really know unless we go poke around in the buffer_heads.
-		 * But block_write_full_page will do the right thing.
-		 */
-		return block_write_full_page(page, noalloc_get_block_write,
-					     wbc);
-	}
-no_write:
-	redirty_page_for_writepage(wbc, page);
-	unlock_page(page);
-	return 0;
-}
-
 static int ext4_readpage(struct file *file, struct page *page)
 {
 	return mpage_readpage(page, ext4_get_block);
@@ -3492,7 +3363,7 @@
 static const struct address_space_operations ext4_ordered_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
-	.writepage		= ext4_normal_writepage,
+	.writepage		= ext4_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_ordered_write_end,
@@ -3507,7 +3378,7 @@
 static const struct address_space_operations ext4_writeback_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
-	.writepage		= ext4_normal_writepage,
+	.writepage		= ext4_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_writeback_write_end,
@@ -3522,7 +3393,7 @@
 static const struct address_space_operations ext4_journalled_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
-	.writepage		= ext4_journalled_writepage,
+	.writepage		= ext4_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_journalled_write_end,
@@ -3536,7 +3407,7 @@
 static const struct address_space_operations ext4_da_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
-	.writepage		= ext4_da_writepage,
+	.writepage		= ext4_writepage,
 	.writepages		= ext4_da_writepages,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_da_write_begin,
@@ -3583,7 +3454,8 @@
 	struct page *page;
 	int err = 0;
  
-	page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+				   mapping_gfp_mask(mapping) & ~__GFP_FS);
 	if (!page)
 		return -EINVAL;
  
@@ -191,7 +191,7 @@
 	case EXT4_IOC_GROUP_EXTEND: {
 		ext4_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
-		int err, err2;
+		int err, err2=0;
  
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -204,9 +204,11 @@
 			return err;
  
 		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
-		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
-		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
 		if (err == 0)
 			err = err2;
 		mnt_drop_write(filp->f_path.mnt);
@@ -251,7 +253,7 @@
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
 		struct super_block *sb = inode->i_sb;
-		int err, err2;
+		int err, err2=0;
  
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -265,9 +267,11 @@
 			return err;
  
 		err = ext4_group_add(sb, &input);
-		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
-		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
 		if (err == 0)
 			err = err2;
 		mnt_drop_write(filp->f_path.mnt);
@@ -657,7 +657,8 @@
 	}
 }
  
-static void ext4_mb_generate_buddy(struct super_block *sb,
+static noinline_for_stack
+void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
 {
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -1480,7 +1481,8 @@
 	ext4_mb_check_limits(ac, e4b, 0);
 }
  
-static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+static noinline_for_stack
+int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 					struct ext4_buddy *e4b)
 {
 	struct ext4_free_extent ex = ac->ac_b_ex;
@@ -1507,7 +1509,8 @@
 	return 0;
 }
  
-static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+static noinline_for_stack
+int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 				struct ext4_buddy *e4b)
 {
 	ext4_group_t group = ac->ac_g_ex.fe_group;
@@ -1566,7 +1569,8 @@
  * The routine scans buddy structures (not bitmap!) from given order
  * to max order and tries to find big enough chunk to satisfy the req
  */
-static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 					struct ext4_buddy *e4b)
 {
 	struct super_block *sb = ac->ac_sb;
@@ -1609,7 +1613,8 @@
  * In order to optimize scanning, caller must pass number of
  * free blocks in the group, so the routine can know upper limit.
  */
-static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 					struct ext4_buddy *e4b)
 {
 	struct super_block *sb = ac->ac_sb;
@@ -1668,7 +1673,8 @@
  * we try to find stripe-aligned chunks for stripe-size requests
  * XXX should do so at least for multiples of stripe size as well
  */
-static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 				 struct ext4_buddy *e4b)
 {
 	struct super_block *sb = ac->ac_sb;
@@ -1831,7 +1837,8 @@
  
 }
  
-static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
  
 	int ret;
@@ -2902,7 +2909,11 @@
  
 void exit_ext4_mballoc(void)
 {
-	/* XXX: synchronize_rcu(); */
+	/* 
+	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
+	 * before destroying the slab cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
@@ -3457,7 +3468,8 @@
  * used in in-core bitmap. buddy must be generated from this bitmap
  * Need to be called with ext4 group lock held
  */
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+static noinline_for_stack
+void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group)
 {
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
  
  
@@ -4215,14 +4227,9 @@
 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
  
 	/* set up allocation goals */
+	memset(ac, 0, sizeof(struct ext4_allocation_context));
 	ac->ac_b_ex.fe_logical = ar->logical;
-	ac->ac_b_ex.fe_group = 0;
-	ac->ac_b_ex.fe_start = 0;
-	ac->ac_b_ex.fe_len = 0;
 	ac->ac_status = AC_STATUS_CONTINUE;
-	ac->ac_groups_scanned = 0;
-	ac->ac_ex_scanned = 0;
-	ac->ac_found = 0;
 	ac->ac_sb = sb;
 	ac->ac_inode = ar->inode;
 	ac->ac_o_ex.fe_logical = ar->logical;
  
@@ -4233,15 +4240,7 @@
 	ac->ac_g_ex.fe_group = group;
 	ac->ac_g_ex.fe_start = block;
 	ac->ac_g_ex.fe_len = len;
-	ac->ac_f_ex.fe_len = 0;
 	ac->ac_flags = ar->flags;
-	ac->ac_2order = 0;
-	ac->ac_criteria = 0;
-	ac->ac_pa = NULL;
-	ac->ac_bitmap_page = NULL;
-	ac->ac_buddy_page = NULL;
-	ac->alloc_semp = NULL;
-	ac->ac_lg = NULL;
  
 	/* we have to define context: we'll we work with a file or
 	 * locality group. this is a policy, actually */
@@ -4509,10 +4508,7 @@
 	}
  
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = ar->inode;
-	} else {
+	if (!ac) {
 		ar->len = 0;
 		*errp = -ENOMEM;
 		goto out1;
@@ -297,6 +297,7 @@
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
 	struct jbd2_buffer_trigger_type *triggers;
+	journal_t *journal = transaction->t_journal;
  
 	/*
 	 * The buffer really shouldn't be locked: only the current committing
@@ -310,6 +311,11 @@
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
  
 	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+	/* keep subsequent assertions sane */
+	new_bh->b_state = 0;
+	init_buffer(new_bh, NULL, NULL);
+	atomic_set(&new_bh->b_count, 1);
+	new_jh = jbd2_journal_add_journal_head(new_bh);	/* This sleeps */
  
 	/*
 	 * If a new transaction has already done a buffer copy-out, then
@@ -388,14 +394,6 @@
 		kunmap_atomic(mapped_data, KM_USER0);
 	}
  
-	/* keep subsequent assertions sane */
-	new_bh->b_state = 0;
-	init_buffer(new_bh, NULL, NULL);
-	atomic_set(&new_bh->b_count, 1);
-	jbd_unlock_bh_state(bh_in);
-
-	new_jh = jbd2_journal_add_journal_head(new_bh);	/* This sleeps */
-
 	set_bh_page(new_bh, new_page, new_offset);
 	new_jh->b_transaction = NULL;
 	new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -412,7 +410,11 @@
 	 * copying is moved to the transaction's shadow queue.
 	 */
 	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-	jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh_in);
+
 	JBUFFER_TRACE(new_jh, "file as BJ_IO");
 	jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
  
@@ -2410,6 +2412,7 @@
 	int	i = hash_32(device, CACHE_SIZE_BITS);
 	char	*ret;
 	struct block_device *bd;
+	static struct devname_cache *new_dev;
  
 	rcu_read_lock();
 	if (devcache[i] && devcache[i]->device == device) {
  
  
@@ -2419,20 +2422,20 @@
 	}
 	rcu_read_unlock();
  
+	new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
+	if (!new_dev)
+		return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
 	spin_lock(&devname_cache_lock);
 	if (devcache[i]) {
 		if (devcache[i]->device == device) {
+			kfree(new_dev);
 			ret = devcache[i]->devname;
 			spin_unlock(&devname_cache_lock);
 			return ret;
 		}
 		call_rcu(&devcache[i]->rcu, free_devcache);
 	}
-	devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
-	if (!devcache[i]) {
-		spin_unlock(&devname_cache_lock);
-		return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
-	}
+	devcache[i] = new_dev;
 	devcache[i]->device = device;
 	bd = bdget(device);
 	if (bd) {
@@ -499,34 +499,15 @@
 	wake_up(&journal->j_wait_transaction_locked);
 }
  
-/*
- * Report any unexpected dirty buffers which turn up.  Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible.  #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
+static void warn_dirty_buffer(struct buffer_head *bh)
 {
-	int jlist;
+	char b[BDEVNAME_SIZE];
  
-	/* If this buffer is one which might reasonably be dirty
-	 * --- ie. data, or not part of this journal --- then
-	 * we're OK to leave it alone, but otherwise we need to
-	 * move the dirty bit to the journal's own internal
-	 * JBDDirty bit. */
-	jlist = jh->b_jlist;
-
-	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-	    jlist == BJ_Shadow || jlist == BJ_Forget) {
-		struct buffer_head *bh = jh2bh(jh);
-
-		if (test_clear_buffer_dirty(bh))
-			set_buffer_jbddirty(bh);
-	}
+	printk(KERN_WARNING
+	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+	       "There's a risk of filesystem corruption in case of system "
+	       "crash.\n",
+	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
  
 /*
  
@@ -593,14 +574,16 @@
 			if (jh->b_next_transaction)
 				J_ASSERT_JH(jh, jh->b_next_transaction ==
 							transaction);
+			warn_dirty_buffer(bh);
 		}
 		/*
 		 * In any case we need to clean the dirty flag and we must
 		 * do it under the buffer lock to be sure we don't race
 		 * with running write-out.
 		 */
-		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
-		jbd_unexpected_dirty_buffer(jh);
+		JBUFFER_TRACE(jh, "Journalling dirty buffer");
+		clear_buffer_dirty(bh);
+		set_buffer_jbddirty(bh);
 	}
  
 	unlock_buffer(bh);
@@ -843,6 +826,15 @@
 	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
  
 	if (jh->b_transaction == NULL) {
+		/*
+		 * Previous jbd2_journal_forget() could have left the buffer
+		 * with jbddirty bit set because it was being committed. When
+		 * the commit finished, we've filed the buffer for
+		 * checkpointing and marked it dirty. Now we are reallocating
+		 * the buffer so the transaction freeing it must have
+		 * committed and so it's safe to clear the dirty bit.
+		 */
+		clear_buffer_dirty(jh2bh(jh));
 		jh->b_transaction = transaction;
  
 		/* first access by this transaction */
  
@@ -1644,8 +1636,13 @@
  
 	if (jh->b_cp_transaction) {
 		JBUFFER_TRACE(jh, "on running+cp transaction");
+		/*
+		 * We don't want to write the buffer anymore, clear the
+		 * bit so that we don't confuse checks in
+		 * __journal_file_buffer
+		 */
+		clear_buffer_dirty(bh);
 		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
-		clear_buffer_jbddirty(bh);
 		may_free = 0;
 	} else {
 		JBUFFER_TRACE(jh, "on running transaction");
  
@@ -1896,12 +1893,17 @@
 	if (jh->b_transaction && jh->b_jlist == jlist)
 		return;
  
-	/* The following list of buffer states needs to be consistent
-	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-	 * state. */
-
 	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
 	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		/*
+		 * For metadata buffers, we track dirty bit in buffer_jbddirty
+		 * instead of buffer_dirty. We should not see a dirty bit set
+		 * here because we clear it in do_get_write_access but e.g.
+		 * tune2fs can modify the sb and set the dirty bit at any time
+		 * so we try to gracefully handle that.
+		 */
+		if (buffer_dirty(bh))
+			warn_dirty_buffer(bh);
 		if (test_clear_buffer_dirty(bh) ||
 		    test_clear_buffer_jbddirty(bh))
 			was_dirty = 1;
@@ -34,7 +34,8 @@
  
 	TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu",
 		  jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode,
-		  __entry->uid, __entry->gid, __entry->blocks)
+		  __entry->uid, __entry->gid,
+		  (unsigned long long) __entry->blocks)
 );
  
 TRACE_EVENT(ext4_request_inode,
@@ -189,7 +190,7 @@
 		  __entry->copied)
 );
  
-TRACE_EVENT(ext4_da_writepage,
+TRACE_EVENT(ext4_writepage,
 	TP_PROTO(struct inode *inode, struct page *page),
  
 	TP_ARGS(inode, page),
@@ -339,49 +340,6 @@
 	TP_printk("dev %s ino %lu pos %llu len %u copied %u",
 		  jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len,
 		  __entry->copied)
-);
-
-TRACE_EVENT(ext4_normal_writepage,
-	TP_PROTO(struct inode *inode, struct page *page),
-
-	TP_ARGS(inode, page),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	pgoff_t, index			)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->index	= page->index;
-	),
-
-	TP_printk("dev %s ino %lu page_index %lu",
-		  jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index)
-);
-
-TRACE_EVENT(ext4_journalled_writepage,
-	TP_PROTO(struct inode *inode, struct page *page),
-
-	TP_ARGS(inode, page),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	pgoff_t, index			)
-
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->index	= page->index;
-	),
-
-	TP_printk("dev %s ino %lu page_index %lu",
-		  jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index)
 );
  
 TRACE_EVENT(ext4_discard_blocks,
...	...	@@ -93,20 +93,20 @@
93	93	struct ext4_allocation_request {
94	94	/* target inode for block we're allocating */
95	95	struct inode *inode;
	96	+ /* how many blocks we want to allocate */
	97	+ unsigned int len;
96	98	/* logical block in target inode */
97	99	ext4_lblk_t logical;
98		- /* phys. target (a hint) */
99		- ext4_fsblk_t goal;
100	100	/* the closest logical allocated block to the left */
101	101	ext4_lblk_t lleft;
102		- /* phys. block for ^^^ */
103		- ext4_fsblk_t pleft;
104	102	/* the closest logical allocated block to the right */
105	103	ext4_lblk_t lright;
106		- /* phys. block for ^^^ */
	104	+ /* phys. target (a hint) */
	105	+ ext4_fsblk_t goal;
	106	+ /* phys. block for the closest logical allocated block to the left */
	107	+ ext4_fsblk_t pleft;
	108	+ /* phys. block for the closest logical allocated block to the right */
107	109	ext4_fsblk_t pright;
108		- /* how many blocks we want to allocate */
109		- unsigned int len;
110	110	/* flags. see above EXT4_MB_HINT_* */
111	111	unsigned int flags;
112	112	};
...	...	@@ -43,6 +43,8 @@
43	43	ext4_journal_abort_handle(where, __func__, bh,
44	44	handle, err);
45	45	}
	46	+ else
	47	+ brelse(bh);
46	48	return err;
47	49	}
48	50
...	...	@@ -57,6 +59,8 @@
57	59	ext4_journal_abort_handle(where, __func__, bh,
58	60	handle, err);
59	61	}
	62	+ else
	63	+ brelse(bh);
60	64	return err;
61	65	}
62	66
...	...	@@ -131,9 +131,11 @@
131	131	int __ext4_journal_get_write_access(const char *where, handle_t *handle,
132	132	struct buffer_head *bh);
133	133
	134	+/* When called with an invalid handle, this will still do a put on the BH */
134	135	int __ext4_journal_forget(const char *where, handle_t *handle,
135	136	struct buffer_head *bh);
136	137
	138	+/* When called with an invalid handle, this will still do a put on the BH */
137	139	int __ext4_journal_revoke(const char *where, handle_t *handle,
138	140	ext4_fsblk_t blocknr, struct buffer_head *bh);
139	141
140	142
...	...	@@ -281,10 +283,10 @@
281	283
282	284	static inline int ext4_should_writeback_data(struct inode *inode)
283	285	{
284		- if (EXT4_JOURNAL(inode) == NULL)
285		- return 0;
286	286	if (!S_ISREG(inode->i_mode))
287	287	return 0;
	288	+ if (EXT4_JOURNAL(inode) == NULL)
	289	+ return 1;
288	290	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
289	291	return 0;
290	292	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
...	...	@@ -1977,6 +1977,7 @@
1977	1977	*/
1978	1978	/* 1 bitmap, 1 block group descriptor */
1979	1979	ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
	1980	+ return ret;
1980	1981	}
1981	1982	}
1982	1983
...	...	@@ -833,7 +833,7 @@
833	833	if (!goal)
834	834	goal = sbi->s_inode_goal;
835	835
836		- if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) {
	836	+ if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
837	837	group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
838	838	ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
839	839	ret2 = 0;
...	...	@@ -191,7 +191,7 @@
191	191	case EXT4_IOC_GROUP_EXTEND: {
192	192	ext4_fsblk_t n_blocks_count;
193	193	struct super_block *sb = inode->i_sb;
194		- int err, err2;
	194	+ int err, err2=0;
195	195
196	196	if (!capable(CAP_SYS_RESOURCE))
197	197	return -EPERM;
...	...	@@ -204,9 +204,11 @@
204	204	return err;
205	205
206	206	err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
207		- jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
208		- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
209		- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
	207	+ if (EXT4_SB(sb)->s_journal) {
	208	+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
	209	+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
	210	+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
	211	+ }
210	212	if (err == 0)
211	213	err = err2;
212	214	mnt_drop_write(filp->f_path.mnt);
...	...	@@ -251,7 +253,7 @@
251	253	case EXT4_IOC_GROUP_ADD: {
252	254	struct ext4_new_group_data input;
253	255	struct super_block *sb = inode->i_sb;
254		- int err, err2;
	256	+ int err, err2=0;
255	257
256	258	if (!capable(CAP_SYS_RESOURCE))
257	259	return -EPERM;
...	...	@@ -265,9 +267,11 @@
265	267	return err;
266	268
267	269	err = ext4_group_add(sb, &input);
268		- jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
269		- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
270		- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
	270	+ if (EXT4_SB(sb)->s_journal) {
	271	+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
	272	+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
	273	+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
	274	+ }
271	275	if (err == 0)
272	276	err = err2;
273	277	mnt_drop_write(filp->f_path.mnt);
...	...	@@ -657,7 +657,8 @@
657	657	}
658	658	}
659	659
660		-static void ext4_mb_generate_buddy(struct super_block *sb,
	660	+static noinline_for_stack
	661	+void ext4_mb_generate_buddy(struct super_block *sb,
661	662	void *buddy, void *bitmap, ext4_group_t group)
662	663	{
663	664	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
...	...	@@ -1480,7 +1481,8 @@
1480	1481	ext4_mb_check_limits(ac, e4b, 0);
1481	1482	}
1482	1483
1483		-static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
	1484	+static noinline_for_stack
	1485	+int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1484	1486	struct ext4_buddy *e4b)
1485	1487	{
1486	1488	struct ext4_free_extent ex = ac->ac_b_ex;
...	...	@@ -1507,7 +1509,8 @@
1507	1509	return 0;
1508	1510	}
1509	1511
1510		-static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
	1512	+static noinline_for_stack
	1513	+int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1511	1514	struct ext4_buddy *e4b)
1512	1515	{
1513	1516	ext4_group_t group = ac->ac_g_ex.fe_group;
...	...	@@ -1566,7 +1569,8 @@
1566	1569	* The routine scans buddy structures (not bitmap!) from given order
1567	1570	* to max order and tries to find big enough chunk to satisfy the req
1568	1571	*/
1569		-static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
	1572	+static noinline_for_stack
	1573	+void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1570	1574	struct ext4_buddy *e4b)
1571	1575	{
1572	1576	struct super_block *sb = ac->ac_sb;
...	...	@@ -1609,7 +1613,8 @@
1609	1613	* In order to optimize scanning, caller must pass number of
1610	1614	* free blocks in the group, so the routine can know upper limit.
1611	1615	*/
1612		-static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
	1616	+static noinline_for_stack
	1617	+void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1613	1618	struct ext4_buddy *e4b)
1614	1619	{
1615	1620	struct super_block *sb = ac->ac_sb;
...	...	@@ -1668,7 +1673,8 @@
1668	1673	* we try to find stripe-aligned chunks for stripe-size requests
1669	1674	* XXX should do so at least for multiples of stripe size as well
1670	1675	*/
1671		-static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
	1676	+static noinline_for_stack
	1677	+void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1672	1678	struct ext4_buddy *e4b)
1673	1679	{
1674	1680	struct super_block *sb = ac->ac_sb;
...	...	@@ -1831,7 +1837,8 @@
1831	1837
1832	1838	}
1833	1839
1834		-static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
	1840	+static noinline_for_stack
	1841	+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1835	1842	{
1836	1843
1837	1844	int ret;
...	...	@@ -2902,7 +2909,11 @@
2902	2909
2903	2910	void exit_ext4_mballoc(void)
2904	2911	{
2905		- /* XXX: synchronize_rcu(); */
	2912	+ /*
	2913	+ * Wait for completion of call_rcu()'s on ext4_pspace_cachep
	2914	+ * before destroying the slab cache.
	2915	+ */
	2916	+ rcu_barrier();
2906	2917	kmem_cache_destroy(ext4_pspace_cachep);
2907	2918	kmem_cache_destroy(ext4_ac_cachep);
2908	2919	kmem_cache_destroy(ext4_free_ext_cachep);
...	...	@@ -3457,7 +3468,8 @@
3457	3468	* used in in-core bitmap. buddy must be generated from this bitmap
3458	3469	* Need to be called with ext4 group lock held
3459	3470	*/
3460		-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
	3471	+static noinline_for_stack
	3472	+void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3461	3473	ext4_group_t group)
3462	3474	{
3463	3475	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3464	3476
3465	3477
...	...	@@ -4215,14 +4227,9 @@
4215	4227	ext4_get_group_no_and_offset(sb, goal, &group, &block);
4216	4228
4217	4229	/* set up allocation goals */
	4230	+ memset(ac, 0, sizeof(struct ext4_allocation_context));
4218	4231	ac->ac_b_ex.fe_logical = ar->logical;
4219		- ac->ac_b_ex.fe_group = 0;
4220		- ac->ac_b_ex.fe_start = 0;
4221		- ac->ac_b_ex.fe_len = 0;
4222	4232	ac->ac_status = AC_STATUS_CONTINUE;
4223		- ac->ac_groups_scanned = 0;
4224		- ac->ac_ex_scanned = 0;
4225		- ac->ac_found = 0;
4226	4233	ac->ac_sb = sb;
4227	4234	ac->ac_inode = ar->inode;
4228	4235	ac->ac_o_ex.fe_logical = ar->logical;
4229	4236
...	...	@@ -4233,15 +4240,7 @@
4233	4240	ac->ac_g_ex.fe_group = group;
4234	4241	ac->ac_g_ex.fe_start = block;
4235	4242	ac->ac_g_ex.fe_len = len;
4236		- ac->ac_f_ex.fe_len = 0;
4237	4243	ac->ac_flags = ar->flags;
4238		- ac->ac_2order = 0;
4239		- ac->ac_criteria = 0;
4240		- ac->ac_pa = NULL;
4241		- ac->ac_bitmap_page = NULL;
4242		- ac->ac_buddy_page = NULL;
4243		- ac->alloc_semp = NULL;
4244		- ac->ac_lg = NULL;
4245	4244
4246	4245	/* we have to define context: we'll we work with a file or
4247	4246	* locality group. this is a policy, actually */
...	...	@@ -4509,10 +4508,7 @@
4509	4508	}
4510	4509
4511	4510	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4512		- if (ac) {
4513		- ac->ac_sb = sb;
4514		- ac->ac_inode = ar->inode;
4515		- } else {
	4511	+ if (!ac) {
4516	4512	ar->len = 0;
4517	4513	*errp = -ENOMEM;
4518	4514	goto out1;
...	...	@@ -297,6 +297,7 @@
297	297	unsigned int new_offset;
298	298	struct buffer_head *bh_in = jh2bh(jh_in);
299	299	struct jbd2_buffer_trigger_type *triggers;
	300	+ journal_t *journal = transaction->t_journal;
300	301
301	302	/*
302	303	* The buffer really shouldn't be locked: only the current committing
...	...	@@ -310,6 +311,11 @@
310	311	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
311	312
312	313	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
	314	+ /* keep subsequent assertions sane */
	315	+ new_bh->b_state = 0;
	316	+ init_buffer(new_bh, NULL, NULL);
	317	+ atomic_set(&new_bh->b_count, 1);
	318	+ new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
313	319
314	320	/*
315	321	* If a new transaction has already done a buffer copy-out, then
...	...	@@ -388,14 +394,6 @@
388	394	kunmap_atomic(mapped_data, KM_USER0);
389	395	}
390	396
391		- /* keep subsequent assertions sane */
392		- new_bh->b_state = 0;
393		- init_buffer(new_bh, NULL, NULL);
394		- atomic_set(&new_bh->b_count, 1);
395		- jbd_unlock_bh_state(bh_in);
396		-
397		- new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
398		-
399	397	set_bh_page(new_bh, new_page, new_offset);
400	398	new_jh->b_transaction = NULL;
401	399	new_bh->b_size = jh2bh(jh_in)->b_size;
...	...	@@ -412,7 +410,11 @@
412	410	* copying is moved to the transaction's shadow queue.
413	411	*/
414	412	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
415		- jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
	413	+ spin_lock(&journal->j_list_lock);
	414	+ __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
	415	+ spin_unlock(&journal->j_list_lock);
	416	+ jbd_unlock_bh_state(bh_in);
	417	+
416	418	JBUFFER_TRACE(new_jh, "file as BJ_IO");
417	419	jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
418	420
...	...	@@ -2410,6 +2412,7 @@
2410	2412	int i = hash_32(device, CACHE_SIZE_BITS);
2411	2413	char *ret;
2412	2414	struct block_device *bd;
	2415	+ static struct devname_cache *new_dev;
2413	2416
2414	2417	rcu_read_lock();
2415	2418	if (devcache[i] && devcache[i]->device == device) {
2416	2419
2417	2420
...	...	@@ -2419,20 +2422,20 @@
2419	2422	}
2420	2423	rcu_read_unlock();
2421	2424
	2425	+ new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
	2426	+ if (!new_dev)
	2427	+ return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2422	2428	spin_lock(&devname_cache_lock);
2423	2429	if (devcache[i]) {
2424	2430	if (devcache[i]->device == device) {
	2431	+ kfree(new_dev);
2425	2432	ret = devcache[i]->devname;
2426	2433	spin_unlock(&devname_cache_lock);
2427	2434	return ret;
2428	2435	}
2429	2436	call_rcu(&devcache[i]->rcu, free_devcache);
2430	2437	}
2431		- devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2432		- if (!devcache[i]) {
2433		- spin_unlock(&devname_cache_lock);
2434		- return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2435		- }
	2438	+ devcache[i] = new_dev;
2436	2439	devcache[i]->device = device;
2437	2440	bd = bdget(device);
2438	2441	if (bd) {
...	...	@@ -499,34 +499,15 @@
499	499	wake_up(&journal->j_wait_transaction_locked);
500	500	}
501	501
502		-/*
503		- * Report any unexpected dirty buffers which turn up. Normally those
504		- * indicate an error, but they can occur if the user is running (say)
505		- * tune2fs to modify the live filesystem, so we need the option of
506		- * continuing as gracefully as possible. #
507		- *
508		- * The caller should already hold the journal lock and
509		- * j_list_lock spinlock: most callers will need those anyway
510		- * in order to probe the buffer's journaling state safely.
511		- */
512		-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
	502	+static void warn_dirty_buffer(struct buffer_head *bh)
513	503	{
514		- int jlist;
	504	+ char b[BDEVNAME_SIZE];
515	505
516		- /* If this buffer is one which might reasonably be dirty
517		- * --- ie. data, or not part of this journal --- then
518		- * we're OK to leave it alone, but otherwise we need to
519		- * move the dirty bit to the journal's own internal
520		- * JBDDirty bit. */
521		- jlist = jh->b_jlist;
522		-
523		- if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
524		- jlist == BJ_Shadow || jlist == BJ_Forget) {
525		- struct buffer_head *bh = jh2bh(jh);
526		-
527		- if (test_clear_buffer_dirty(bh))
528		- set_buffer_jbddirty(bh);
529		- }
	506	+ printk(KERN_WARNING
	507	+ "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
	508	+ "There's a risk of filesystem corruption in case of system "
	509	+ "crash.\n",
	510	+ bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
530	511	}
531	512
532	513	/*
533	514
...	...	@@ -593,14 +574,16 @@
593	574	if (jh->b_next_transaction)
594	575	J_ASSERT_JH(jh, jh->b_next_transaction ==
595	576	transaction);
	577	+ warn_dirty_buffer(bh);
596	578	}
597	579	/*
598	580	* In any case we need to clean the dirty flag and we must
599	581	* do it under the buffer lock to be sure we don't race
600	582	* with running write-out.
601	583	*/
602		- JBUFFER_TRACE(jh, "Unexpected dirty buffer");
603		- jbd_unexpected_dirty_buffer(jh);
	584	+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
	585	+ clear_buffer_dirty(bh);
	586	+ set_buffer_jbddirty(bh);
604	587	}
605	588
606	589	unlock_buffer(bh);
...	...	@@ -843,6 +826,15 @@
843	826	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
844	827
845	828	if (jh->b_transaction == NULL) {
	829	+ /*
	830	+ * Previous jbd2_journal_forget() could have left the buffer
	831	+ * with jbddirty bit set because it was being committed. When
	832	+ * the commit finished, we've filed the buffer for
	833	+ * checkpointing and marked it dirty. Now we are reallocating
	834	+ * the buffer so the transaction freeing it must have
	835	+ * committed and so it's safe to clear the dirty bit.
	836	+ */
	837	+ clear_buffer_dirty(jh2bh(jh));
846	838	jh->b_transaction = transaction;
847	839
848	840	/* first access by this transaction */
849	841
...	...	@@ -1644,8 +1636,13 @@
1644	1636
1645	1637	if (jh->b_cp_transaction) {
1646	1638	JBUFFER_TRACE(jh, "on running+cp transaction");
	1639	+ /*
	1640	+ * We don't want to write the buffer anymore, clear the
	1641	+ * bit so that we don't confuse checks in
	1642	+ * __journal_file_buffer
	1643	+ */
	1644	+ clear_buffer_dirty(bh);
1647	1645	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1648		- clear_buffer_jbddirty(bh);
1649	1646	may_free = 0;
1650	1647	} else {
1651	1648	JBUFFER_TRACE(jh, "on running transaction");
1652	1649
...	...	@@ -1896,12 +1893,17 @@
1896	1893	if (jh->b_transaction && jh->b_jlist == jlist)
1897	1894	return;
1898	1895
1899		- /* The following list of buffer states needs to be consistent
1900		- * with __jbd_unexpected_dirty_buffer()'s handling of dirty
1901		- * state. */
1902		-
1903	1896	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1904	1897	jlist == BJ_Shadow || jlist == BJ_Forget) {
	1898	+ /*
	1899	+ * For metadata buffers, we track dirty bit in buffer_jbddirty
	1900	+ * instead of buffer_dirty. We should not see a dirty bit set
	1901	+ * here because we clear it in do_get_write_access but e.g.
	1902	+ * tune2fs can modify the sb and set the dirty bit at any time
	1903	+ * so we try to gracefully handle that.
	1904	+ */
	1905	+ if (buffer_dirty(bh))
	1906	+ warn_dirty_buffer(bh);
1905	1907	if (test_clear_buffer_dirty(bh) ||
1906	1908	test_clear_buffer_jbddirty(bh))
1907	1909	was_dirty = 1;
...	...	@@ -34,7 +34,8 @@
34	34
35	35	TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu",
36	36	jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode,
37		- __entry->uid, __entry->gid, __entry->blocks)
	37	+ __entry->uid, __entry->gid,
	38	+ (unsigned long long) __entry->blocks)
38	39	);
39	40
40	41	TRACE_EVENT(ext4_request_inode,
...	...	@@ -189,7 +190,7 @@
189	190	__entry->copied)
190	191	);
191	192
192		-TRACE_EVENT(ext4_da_writepage,
	193	+TRACE_EVENT(ext4_writepage,
193	194	TP_PROTO(struct inode *inode, struct page *page),
194	195
195	196	TP_ARGS(inode, page),
...	...	@@ -339,49 +340,6 @@
339	340	TP_printk("dev %s ino %lu pos %llu len %u copied %u",
340	341	jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len,
341	342	__entry->copied)
342		-);
343		-
344		-TRACE_EVENT(ext4_normal_writepage,
345		- TP_PROTO(struct inode *inode, struct page *page),
346		-
347		- TP_ARGS(inode, page),
348		-
349		- TP_STRUCT__entry(
350		- __field( dev_t, dev )
351		- __field( ino_t, ino )
352		- __field( pgoff_t, index )
353		- ),
354		-
355		- TP_fast_assign(
356		- __entry->dev = inode->i_sb->s_dev;
357		- __entry->ino = inode->i_ino;
358		- __entry->index = page->index;
359		- ),
360		-
361		- TP_printk("dev %s ino %lu page_index %lu",
362		- jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index)
363		-);
364		-
365		-TRACE_EVENT(ext4_journalled_writepage,
366		- TP_PROTO(struct inode *inode, struct page *page),
367		-
368		- TP_ARGS(inode, page),
369		-
370		- TP_STRUCT__entry(
371		- __field( dev_t, dev )
372		- __field( ino_t, ino )
373		- __field( pgoff_t, index )
374		-
375		- ),
376		-
377		- TP_fast_assign(
378		- __entry->dev = inode->i_sb->s_dev;
379		- __entry->ino = inode->i_ino;
380		- __entry->index = page->index;
381		- ),
382		-
383		- TP_printk("dev %s ino %lu page_index %lu",
384		- jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index)
385	343	);
386	344
387	345	TRACE_EVENT(ext4_discard_blocks,