Commit d3c926264a92e5ea448add3e883530e1edad3ce2
Merge tag 'ext4_for_linue' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 fixes from Ted Ts'o:
 "Fix a number of regression and other bugs in ext4, most of which were
  relatively obscure cornercases or races that were found using
  regression tests."

* tag 'ext4_for_linue' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits)
  ext4: fix data=journal fast mount/umount hang
  ext4: fix ext4_evict_inode() racing against workqueue processing code
  ext4: fix memory leakage in mext_check_coverage
  ext4: use s_extent_max_zeroout_kb value as number of kb
  ext4: use atomic64_t for the per-flexbg free_clusters count
  jbd2: fix use after free in jbd2_journal_dirty_metadata()
  ext4: reserve metadata block for every delayed write
  ext4: update reserved space after the 'correction'
  ext4: do not use yield()
  ext4: remove unused variable in ext4_free_blocks()
  ext4: fix WARN_ON from ext4_releasepage()
  ext4: fix the wrong number of the allocated blocks in ext4_split_extent()
  ext4: update extent status tree after an extent is zeroed out
  ext4: fix wrong m_len value after unwritten extent conversion
  ext4: add self-testing infrastructure to do a sanity check
  ext4: avoid a potential overflow in ext4_es_can_be_merged()
  ext4: invalidate extent status tree during extent migration
  ext4: remove unnecessary wait for extent conversion in ext4_fallocate()
  ext4: add warning to ext4_convert_unwritten_extents_endio
  ext4: disable merging of uninitialized extents
  ...
12 changed files
fs/ext4/ext4.h
... | ... | @@ -335,9 +335,9 @@ |
335 | 335 | */ |
336 | 336 | |
337 | 337 | struct flex_groups { |
338 | - atomic_t free_inodes; | |
339 | - atomic_t free_clusters; | |
340 | - atomic_t used_dirs; | |
338 | + atomic64_t free_clusters; | |
339 | + atomic_t free_inodes; | |
340 | + atomic_t used_dirs; | |
341 | 341 | }; |
342 | 342 | |
343 | 343 | #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ |
... | ... | @@ -2617,7 +2617,7 @@ |
2617 | 2617 | extern int __init ext4_init_pageio(void); |
2618 | 2618 | extern void ext4_add_complete_io(ext4_io_end_t *io_end); |
2619 | 2619 | extern void ext4_exit_pageio(void); |
2620 | -extern void ext4_ioend_wait(struct inode *); | |
2620 | +extern void ext4_ioend_shutdown(struct inode *); | |
2621 | 2621 | extern void ext4_free_io_end(ext4_io_end_t *io); |
2622 | 2622 | extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); |
2623 | 2623 | extern void ext4_end_io_work(struct work_struct *work); |
fs/ext4/extents.c
... | ... | @@ -1584,10 +1584,12 @@ |
1584 | 1584 | unsigned short ext1_ee_len, ext2_ee_len, max_len; |
1585 | 1585 | |
1586 | 1586 | /* |
1587 | - * Make sure that either both extents are uninitialized, or | |
1588 | - * both are _not_. | |
1587 | + * Make sure that both extents are initialized. We don't merge | |
1588 | + * uninitialized extents so that we can be sure that end_io code has | |
1589 | + * the extent that was written properly split out and conversion to | |
1590 | + * initialized is trivial. | |
1589 | 1591 | */ |
1590 | - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) | |
1592 | + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) | |
1591 | 1593 | return 0; |
1592 | 1594 | |
1593 | 1595 | if (ext4_ext_is_uninitialized(ex1)) |
... | ... | @@ -2923,7 +2925,7 @@ |
2923 | 2925 | { |
2924 | 2926 | ext4_fsblk_t newblock; |
2925 | 2927 | ext4_lblk_t ee_block; |
2926 | - struct ext4_extent *ex, newex, orig_ex; | |
2928 | + struct ext4_extent *ex, newex, orig_ex, zero_ex; | |
2927 | 2929 | struct ext4_extent *ex2 = NULL; |
2928 | 2930 | unsigned int ee_len, depth; |
2929 | 2931 | int err = 0; |
... | ... | @@ -2943,6 +2945,10 @@ |
2943 | 2945 | newblock = split - ee_block + ext4_ext_pblock(ex); |
2944 | 2946 | |
2945 | 2947 | BUG_ON(split < ee_block || split >= (ee_block + ee_len)); |
2948 | + BUG_ON(!ext4_ext_is_uninitialized(ex) && | |
2949 | + split_flag & (EXT4_EXT_MAY_ZEROOUT | | |
2950 | + EXT4_EXT_MARK_UNINIT1 | | |
2951 | + EXT4_EXT_MARK_UNINIT2)); | |
2946 | 2952 | |
2947 | 2953 | err = ext4_ext_get_access(handle, inode, path + depth); |
2948 | 2954 | if (err) |
2949 | 2955 | |
2950 | 2956 | |
2951 | 2957 | |
... | ... | @@ -2990,12 +2996,26 @@ |
2990 | 2996 | err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); |
2991 | 2997 | if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { |
2992 | 2998 | if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { |
2993 | - if (split_flag & EXT4_EXT_DATA_VALID1) | |
2999 | + if (split_flag & EXT4_EXT_DATA_VALID1) { | |
2994 | 3000 | err = ext4_ext_zeroout(inode, ex2); |
2995 | - else | |
3001 | + zero_ex.ee_block = ex2->ee_block; | |
3002 | + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); | |
3003 | + ext4_ext_store_pblock(&zero_ex, | |
3004 | + ext4_ext_pblock(ex2)); | |
3005 | + } else { | |
2996 | 3006 | err = ext4_ext_zeroout(inode, ex); |
2997 | - } else | |
3007 | + zero_ex.ee_block = ex->ee_block; | |
3008 | + zero_ex.ee_len = ext4_ext_get_actual_len(ex); | |
3009 | + ext4_ext_store_pblock(&zero_ex, | |
3010 | + ext4_ext_pblock(ex)); | |
3011 | + } | |
3012 | + } else { | |
2998 | 3013 | err = ext4_ext_zeroout(inode, &orig_ex); |
3014 | + zero_ex.ee_block = orig_ex.ee_block; | |
3015 | + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); | |
3016 | + ext4_ext_store_pblock(&zero_ex, | |
3017 | + ext4_ext_pblock(&orig_ex)); | |
3018 | + } | |
2999 | 3019 | |
3000 | 3020 | if (err) |
3001 | 3021 | goto fix_extent_len; |
... | ... | @@ -3003,6 +3023,12 @@ |
3003 | 3023 | ex->ee_len = cpu_to_le16(ee_len); |
3004 | 3024 | ext4_ext_try_to_merge(handle, inode, path, ex); |
3005 | 3025 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); |
3026 | + if (err) | |
3027 | + goto fix_extent_len; | |
3028 | + | |
3029 | + /* update extent status tree */ | |
3030 | + err = ext4_es_zeroout(inode, &zero_ex); | |
3031 | + | |
3006 | 3032 | goto out; |
3007 | 3033 | } else if (err) |
3008 | 3034 | goto fix_extent_len; |
... | ... | @@ -3041,6 +3067,7 @@ |
3041 | 3067 | int err = 0; |
3042 | 3068 | int uninitialized; |
3043 | 3069 | int split_flag1, flags1; |
3070 | + int allocated = map->m_len; | |
3044 | 3071 | |
3045 | 3072 | depth = ext_depth(inode); |
3046 | 3073 | ex = path[depth].p_ext; |
3047 | 3074 | |
3048 | 3075 | |
3049 | 3076 | |
3050 | 3077 | |
... | ... | @@ -3060,20 +3087,29 @@ |
3060 | 3087 | map->m_lblk + map->m_len, split_flag1, flags1); |
3061 | 3088 | if (err) |
3062 | 3089 | goto out; |
3090 | + } else { | |
3091 | + allocated = ee_len - (map->m_lblk - ee_block); | |
3063 | 3092 | } |
3064 | - | |
3093 | + /* | |
3094 | + * Update path is required because previous ext4_split_extent_at() may | |
3095 | + * result in split of original leaf or extent zeroout. | |
3096 | + */ | |
3065 | 3097 | ext4_ext_drop_refs(path); |
3066 | 3098 | path = ext4_ext_find_extent(inode, map->m_lblk, path); |
3067 | 3099 | if (IS_ERR(path)) |
3068 | 3100 | return PTR_ERR(path); |
3101 | + depth = ext_depth(inode); | |
3102 | + ex = path[depth].p_ext; | |
3103 | + uninitialized = ext4_ext_is_uninitialized(ex); | |
3104 | + split_flag1 = 0; | |
3069 | 3105 | |
3070 | 3106 | if (map->m_lblk >= ee_block) { |
3071 | - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | | |
3072 | - EXT4_EXT_DATA_VALID2); | |
3073 | - if (uninitialized) | |
3107 | + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; | |
3108 | + if (uninitialized) { | |
3074 | 3109 | split_flag1 |= EXT4_EXT_MARK_UNINIT1; |
3075 | - if (split_flag & EXT4_EXT_MARK_UNINIT2) | |
3076 | - split_flag1 |= EXT4_EXT_MARK_UNINIT2; | |
3110 | + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | | |
3111 | + EXT4_EXT_MARK_UNINIT2); | |
3112 | + } | |
3077 | 3113 | err = ext4_split_extent_at(handle, inode, path, |
3078 | 3114 | map->m_lblk, split_flag1, flags); |
3079 | 3115 | if (err) |
... | ... | @@ -3082,7 +3118,7 @@ |
3082 | 3118 | |
3083 | 3119 | ext4_ext_show_leaf(inode, path); |
3084 | 3120 | out: |
3085 | - return err ? err : map->m_len; | |
3121 | + return err ? err : allocated; | |
3086 | 3122 | } |
3087 | 3123 | |
3088 | 3124 | /* |
... | ... | @@ -3137,6 +3173,7 @@ |
3137 | 3173 | ee_block = le32_to_cpu(ex->ee_block); |
3138 | 3174 | ee_len = ext4_ext_get_actual_len(ex); |
3139 | 3175 | allocated = ee_len - (map->m_lblk - ee_block); |
3176 | + zero_ex.ee_len = 0; | |
3140 | 3177 | |
3141 | 3178 | trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); |
3142 | 3179 | |
3143 | 3180 | |
... | ... | @@ -3227,13 +3264,16 @@ |
3227 | 3264 | |
3228 | 3265 | if (EXT4_EXT_MAY_ZEROOUT & split_flag) |
3229 | 3266 | max_zeroout = sbi->s_extent_max_zeroout_kb >> |
3230 | - inode->i_sb->s_blocksize_bits; | |
3267 | + (inode->i_sb->s_blocksize_bits - 10); | |
3231 | 3268 | |
3232 | 3269 | /* If extent is less than s_max_zeroout_kb, zeroout directly */ |
3233 | 3270 | if (max_zeroout && (ee_len <= max_zeroout)) { |
3234 | 3271 | err = ext4_ext_zeroout(inode, ex); |
3235 | 3272 | if (err) |
3236 | 3273 | goto out; |
3274 | + zero_ex.ee_block = ex->ee_block; | |
3275 | + zero_ex.ee_len = ext4_ext_get_actual_len(ex); | |
3276 | + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); | |
3237 | 3277 | |
3238 | 3278 | err = ext4_ext_get_access(handle, inode, path + depth); |
3239 | 3279 | if (err) |
... | ... | @@ -3292,6 +3332,9 @@ |
3292 | 3332 | err = allocated; |
3293 | 3333 | |
3294 | 3334 | out: |
3335 | + /* If we have gotten a failure, don't zero out status tree */ | |
3336 | + if (!err) | |
3337 | + err = ext4_es_zeroout(inode, &zero_ex); | |
3295 | 3338 | return err ? err : allocated; |
3296 | 3339 | } |
3297 | 3340 | |
3298 | 3341 | |
... | ... | @@ -3374,8 +3417,19 @@ |
3374 | 3417 | "block %llu, max_blocks %u\n", inode->i_ino, |
3375 | 3418 | (unsigned long long)ee_block, ee_len); |
3376 | 3419 | |
3377 | - /* If extent is larger than requested then split is required */ | |
3420 | + /* If extent is larger than requested it is a clear sign that we still | |
3421 | + * have some extent state machine issues left. So extent_split is still | |
3422 | + * required. | |
3423 | + * TODO: Once all related issues will be fixed this situation should be | |
3424 | + * illegal. | |
3425 | + */ | |
3378 | 3426 | if (ee_block != map->m_lblk || ee_len > map->m_len) { |
3427 | +#ifdef EXT4_DEBUG | |
3428 | + ext4_warning("Inode (%ld) finished: extent logical block %llu," | |
3429 | + " len %u; IO logical block %llu, len %u\n", | |
3430 | + inode->i_ino, (unsigned long long)ee_block, ee_len, | |
3431 | + (unsigned long long)map->m_lblk, map->m_len); | |
3432 | +#endif | |
3379 | 3433 | err = ext4_split_unwritten_extents(handle, inode, map, path, |
3380 | 3434 | EXT4_GET_BLOCKS_CONVERT); |
3381 | 3435 | if (err < 0) |
... | ... | @@ -3626,6 +3680,10 @@ |
3626 | 3680 | path, map->m_len); |
3627 | 3681 | } else |
3628 | 3682 | err = ret; |
3683 | + map->m_flags |= EXT4_MAP_MAPPED; | |
3684 | + if (allocated > map->m_len) | |
3685 | + allocated = map->m_len; | |
3686 | + map->m_len = allocated; | |
3629 | 3687 | goto out2; |
3630 | 3688 | } |
3631 | 3689 | /* buffered IO case */ |
... | ... | @@ -3675,6 +3733,7 @@ |
3675 | 3733 | allocated - map->m_len); |
3676 | 3734 | allocated = map->m_len; |
3677 | 3735 | } |
3736 | + map->m_len = allocated; | |
3678 | 3737 | |
3679 | 3738 | /* |
3680 | 3739 | * If we have done fallocate with the offset that is already |
... | ... | @@ -4106,9 +4165,6 @@ |
4106 | 4165 | } |
4107 | 4166 | } else { |
4108 | 4167 | BUG_ON(allocated_clusters < reserved_clusters); |
4109 | - /* We will claim quota for all newly allocated blocks.*/ | |
4110 | - ext4_da_update_reserve_space(inode, allocated_clusters, | |
4111 | - 1); | |
4112 | 4168 | if (reserved_clusters < allocated_clusters) { |
4113 | 4169 | struct ext4_inode_info *ei = EXT4_I(inode); |
4114 | 4170 | int reservation = allocated_clusters - |
... | ... | @@ -4159,6 +4215,15 @@ |
4159 | 4215 | ei->i_reserved_data_blocks += reservation; |
4160 | 4216 | spin_unlock(&ei->i_block_reservation_lock); |
4161 | 4217 | } |
4218 | + /* | |
4219 | + * We will claim quota for all newly allocated blocks. | |
4220 | + * We're updating the reserved space *after* the | |
4221 | + * correction above so we do not accidentally free | |
4222 | + * all the metadata reservation because we might | |
4223 | + * actually need it later on. | |
4224 | + */ | |
4225 | + ext4_da_update_reserve_space(inode, allocated_clusters, | |
4226 | + 1); | |
4162 | 4227 | } |
4163 | 4228 | } |
4164 | 4229 | |
... | ... | @@ -4368,8 +4433,6 @@ |
4368 | 4433 | if (len <= EXT_UNINIT_MAX_LEN << blkbits) |
4369 | 4434 | flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; |
4370 | 4435 | |
4371 | - /* Prevent race condition between unwritten */ | |
4372 | - ext4_flush_unwritten_io(inode); | |
4373 | 4436 | retry: |
4374 | 4437 | while (ret >= 0 && ret < max_blocks) { |
4375 | 4438 | map.m_lblk = map.m_lblk + ret; |
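One of the smaller fixes above ("ext4: use s_extent_max_zeroout_kb value as number of kb") corrects the shift that turns the s_extent_max_zeroout_kb tunable into a block count: a kilobyte value divided by the block size in kilobytes is kb >> (blocksize_bits - 10), not kb >> blocksize_bits. A worked example with illustrative values, not taken from the patch:

    unsigned int max_zeroout_kb = 32;    /* tunable value in KiB, e.g. the default */
    unsigned int blocksize_bits = 12;    /* 4096-byte blocks */

    /* correct: 32 KiB / 4 KiB per block = 32 >> (12 - 10) = 8 blocks */
    unsigned int max_zeroout = max_zeroout_kb >> (blocksize_bits - 10);

    /* the old shift, 32 >> 12, yields 0 and silently disables the
     * zeroout-instead-of-split optimization */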
fs/ext4/extents_status.c
... | ... | @@ -333,17 +333,27 @@ |
333 | 333 | static int ext4_es_can_be_merged(struct extent_status *es1, |
334 | 334 | struct extent_status *es2) |
335 | 335 | { |
336 | - if (es1->es_lblk + es1->es_len != es2->es_lblk) | |
336 | + if (ext4_es_status(es1) != ext4_es_status(es2)) | |
337 | 337 | return 0; |
338 | 338 | |
339 | - if (ext4_es_status(es1) != ext4_es_status(es2)) | |
339 | + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) | |
340 | 340 | return 0; |
341 | 341 | |
342 | - if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && | |
343 | - (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) | |
342 | + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) | |
344 | 343 | return 0; |
345 | 344 | |
346 | - return 1; | |
345 | + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && | |
346 | + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) | |
347 | + return 1; | |
348 | + | |
349 | + if (ext4_es_is_hole(es1)) | |
350 | + return 1; | |
351 | + | |
352 | + /* we need to check delayed extent is without unwritten status */ | |
353 | + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) | |
354 | + return 1; | |
355 | + | |
356 | + return 0; | |
347 | 357 | } |
348 | 358 | |
349 | 359 | static struct extent_status * |
... | ... | @@ -389,6 +399,179 @@ |
389 | 399 | return es; |
390 | 400 | } |
391 | 401 | |
402 | +#ifdef ES_AGGRESSIVE_TEST | |
403 | +static void ext4_es_insert_extent_ext_check(struct inode *inode, | |
404 | + struct extent_status *es) | |
405 | +{ | |
406 | + struct ext4_ext_path *path = NULL; | |
407 | + struct ext4_extent *ex; | |
408 | + ext4_lblk_t ee_block; | |
409 | + ext4_fsblk_t ee_start; | |
410 | + unsigned short ee_len; | |
411 | + int depth, ee_status, es_status; | |
412 | + | |
413 | + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); | |
414 | + if (IS_ERR(path)) | |
415 | + return; | |
416 | + | |
417 | + depth = ext_depth(inode); | |
418 | + ex = path[depth].p_ext; | |
419 | + | |
420 | + if (ex) { | |
421 | + | |
422 | + ee_block = le32_to_cpu(ex->ee_block); | |
423 | + ee_start = ext4_ext_pblock(ex); | |
424 | + ee_len = ext4_ext_get_actual_len(ex); | |
425 | + | |
426 | + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; | |
427 | + es_status = ext4_es_is_unwritten(es) ? 1 : 0; | |
428 | + | |
429 | + /* | |
430 | + * Make sure ex and es are not overlap when we try to insert | |
431 | + * a delayed/hole extent. | |
432 | + */ | |
433 | + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { | |
434 | + if (in_range(es->es_lblk, ee_block, ee_len)) { | |
435 | + pr_warn("ES insert assertation failed for " | |
436 | + "inode: %lu we can find an extent " | |
437 | + "at block [%d/%d/%llu/%c], but we " | |
438 | + "want to add an delayed/hole extent " | |
439 | + "[%d/%d/%llu/%llx]\n", | |
440 | + inode->i_ino, ee_block, ee_len, | |
441 | + ee_start, ee_status ? 'u' : 'w', | |
442 | + es->es_lblk, es->es_len, | |
443 | + ext4_es_pblock(es), ext4_es_status(es)); | |
444 | + } | |
445 | + goto out; | |
446 | + } | |
447 | + | |
448 | + /* | |
449 | + * We don't check ee_block == es->es_lblk, etc. because es | |
450 | + * might be a part of whole extent, vice versa. | |
451 | + */ | |
452 | + if (es->es_lblk < ee_block || | |
453 | + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { | |
454 | + pr_warn("ES insert assertation failed for inode: %lu " | |
455 | + "ex_status [%d/%d/%llu/%c] != " | |
456 | + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, | |
457 | + ee_block, ee_len, ee_start, | |
458 | + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, | |
459 | + ext4_es_pblock(es), es_status ? 'u' : 'w'); | |
460 | + goto out; | |
461 | + } | |
462 | + | |
463 | + if (ee_status ^ es_status) { | |
464 | + pr_warn("ES insert assertation failed for inode: %lu " | |
465 | + "ex_status [%d/%d/%llu/%c] != " | |
466 | + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, | |
467 | + ee_block, ee_len, ee_start, | |
468 | + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, | |
469 | + ext4_es_pblock(es), es_status ? 'u' : 'w'); | |
470 | + } | |
471 | + } else { | |
472 | + /* | |
473 | + * We can't find an extent on disk. So we need to make sure | |
474 | + * that we don't want to add an written/unwritten extent. | |
475 | + */ | |
476 | + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { | |
477 | + pr_warn("ES insert assertation failed for inode: %lu " | |
478 | + "can't find an extent at block %d but we want " | |
479 | + "to add an written/unwritten extent " | |
480 | + "[%d/%d/%llu/%llx]\n", inode->i_ino, | |
481 | + es->es_lblk, es->es_lblk, es->es_len, | |
482 | + ext4_es_pblock(es), ext4_es_status(es)); | |
483 | + } | |
484 | + } | |
485 | +out: | |
486 | + if (path) { | |
487 | + ext4_ext_drop_refs(path); | |
488 | + kfree(path); | |
489 | + } | |
490 | +} | |
491 | + | |
492 | +static void ext4_es_insert_extent_ind_check(struct inode *inode, | |
493 | + struct extent_status *es) | |
494 | +{ | |
495 | + struct ext4_map_blocks map; | |
496 | + int retval; | |
497 | + | |
498 | + /* | |
499 | + * Here we call ext4_ind_map_blocks to lookup a block mapping because | |
500 | + * 'Indirect' structure is defined in indirect.c. So we couldn't | |
501 | + * access direct/indirect tree from outside. It is too dirty to define | |
502 | + * this function in indirect.c file. | |
503 | + */ | |
504 | + | |
505 | + map.m_lblk = es->es_lblk; | |
506 | + map.m_len = es->es_len; | |
507 | + | |
508 | + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); | |
509 | + if (retval > 0) { | |
510 | + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { | |
511 | + /* | |
512 | + * We want to add a delayed/hole extent but this | |
513 | + * block has been allocated. | |
514 | + */ | |
515 | + pr_warn("ES insert assertation failed for inode: %lu " | |
516 | + "We can find blocks but we want to add a " | |
517 | + "delayed/hole extent [%d/%d/%llu/%llx]\n", | |
518 | + inode->i_ino, es->es_lblk, es->es_len, | |
519 | + ext4_es_pblock(es), ext4_es_status(es)); | |
520 | + return; | |
521 | + } else if (ext4_es_is_written(es)) { | |
522 | + if (retval != es->es_len) { | |
523 | + pr_warn("ES insert assertation failed for " | |
524 | + "inode: %lu retval %d != es_len %d\n", | |
525 | + inode->i_ino, retval, es->es_len); | |
526 | + return; | |
527 | + } | |
528 | + if (map.m_pblk != ext4_es_pblock(es)) { | |
529 | + pr_warn("ES insert assertation failed for " | |
530 | + "inode: %lu m_pblk %llu != " | |
531 | + "es_pblk %llu\n", | |
532 | + inode->i_ino, map.m_pblk, | |
533 | + ext4_es_pblock(es)); | |
534 | + return; | |
535 | + } | |
536 | + } else { | |
537 | + /* | |
538 | + * We don't need to check unwritten extent because | |
539 | + * indirect-based file doesn't have it. | |
540 | + */ | |
541 | + BUG_ON(1); | |
542 | + } | |
543 | + } else if (retval == 0) { | |
544 | + if (ext4_es_is_written(es)) { | |
545 | + pr_warn("ES insert assertation failed for inode: %lu " | |
546 | + "We can't find the block but we want to add " | |
547 | + "an written extent [%d/%d/%llu/%llx]\n", | |
548 | + inode->i_ino, es->es_lblk, es->es_len, | |
549 | + ext4_es_pblock(es), ext4_es_status(es)); | |
550 | + return; | |
551 | + } | |
552 | + } | |
553 | +} | |
554 | + | |
555 | +static inline void ext4_es_insert_extent_check(struct inode *inode, | |
556 | + struct extent_status *es) | |
557 | +{ | |
558 | + /* | |
559 | + * We don't need to worry about the race condition because | |
560 | + * caller takes i_data_sem locking. | |
561 | + */ | |
562 | + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); | |
563 | + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | |
564 | + ext4_es_insert_extent_ext_check(inode, es); | |
565 | + else | |
566 | + ext4_es_insert_extent_ind_check(inode, es); | |
567 | +} | |
568 | +#else | |
569 | +static inline void ext4_es_insert_extent_check(struct inode *inode, | |
570 | + struct extent_status *es) | |
571 | +{ | |
572 | +} | |
573 | +#endif | |
574 | + | |
392 | 575 | static int __es_insert_extent(struct inode *inode, struct extent_status *newes) |
393 | 576 | { |
394 | 577 | struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; |
... | ... | @@ -471,6 +654,8 @@ |
471 | 654 | ext4_es_store_status(&newes, status); |
472 | 655 | trace_ext4_es_insert_extent(inode, &newes); |
473 | 656 | |
657 | + ext4_es_insert_extent_check(inode, &newes); | |
658 | + | |
474 | 659 | write_lock(&EXT4_I(inode)->i_es_lock); |
475 | 660 | err = __es_remove_extent(inode, lblk, end); |
476 | 661 | if (err != 0) |
... | ... | @@ -667,6 +852,23 @@ |
667 | 852 | write_unlock(&EXT4_I(inode)->i_es_lock); |
668 | 853 | ext4_es_print_tree(inode); |
669 | 854 | return err; |
855 | +} | |
856 | + | |
857 | +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) | |
858 | +{ | |
859 | + ext4_lblk_t ee_block; | |
860 | + ext4_fsblk_t ee_pblock; | |
861 | + unsigned int ee_len; | |
862 | + | |
863 | + ee_block = le32_to_cpu(ex->ee_block); | |
864 | + ee_len = ext4_ext_get_actual_len(ex); | |
865 | + ee_pblock = ext4_ext_pblock(ex); | |
866 | + | |
867 | + if (ee_len == 0) | |
868 | + return 0; | |
869 | + | |
870 | + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, | |
871 | + EXTENT_STATUS_WRITTEN); | |
670 | 872 | } |
671 | 873 | |
672 | 874 | static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) |
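The reordered checks in ext4_es_can_be_merged() above add an explicit guard against the combined extent length overflowing the 32-bit es_len field ("ext4: avoid a potential overflow in ext4_es_can_be_merged()"). A minimal illustration of the arithmetic being guarded, with made-up lengths:

    /* es_len is an ext4_lblk_t (__u32); summed in 32-bit arithmetic,
     * 0xF0000000 + 0x20000000 wraps to 0x10000000, so a merged extent
     * would record far fewer blocks than it actually covers. */
    if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
            return 0;   /* refuse to merge rather than truncate */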
fs/ext4/extents_status.h
... | ... | @@ -21,6 +21,12 @@ |
21 | 21 | #endif |
22 | 22 | |
23 | 23 | /* |
24 | + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be | |
25 | + * checked with old map_block's result. | |
26 | + */ | |
27 | +#define ES_AGGRESSIVE_TEST__ | |
28 | + | |
29 | +/* | |
24 | 30 | * These flags live in the high bits of extent_status.es_pblk |
25 | 31 | */ |
26 | 32 | #define EXTENT_STATUS_WRITTEN (1ULL << 63) |
... | ... | @@ -33,6 +39,8 @@ |
33 | 39 | EXTENT_STATUS_DELAYED | \ |
34 | 40 | EXTENT_STATUS_HOLE) |
35 | 41 | |
42 | +struct ext4_extent; | |
43 | + | |
36 | 44 | struct extent_status { |
37 | 45 | struct rb_node rb_node; |
38 | 46 | ext4_lblk_t es_lblk; /* first logical block extent covers */ |
... | ... | @@ -58,6 +66,7 @@ |
58 | 66 | struct extent_status *es); |
59 | 67 | extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, |
60 | 68 | struct extent_status *es); |
69 | +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); | |
61 | 70 | |
62 | 71 | static inline int ext4_es_is_written(struct extent_status *es) |
63 | 72 | { |
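Note that the ES_AGGRESSIVE_TEST self-test infrastructure added by this series is compiled out by default: the header defines ES_AGGRESSIVE_TEST__ (with trailing underscores), so the #ifdef ES_AGGRESSIVE_TEST blocks in extents_status.c and inode.c are not built. A sketch of how a developer would switch the checks on, shown only as an illustration of the convention used here:

    /* fs/ext4/extents_status.h: drop the trailing underscores */
    #define ES_AGGRESSIVE_TEST

    /* with the macro defined, every extent-status insert is cross-checked
     * against the extent tree (or the indirect block map) and cached
     * lookups are re-done under i_data_sem; mismatches are reported via
     * pr_warn()/printk rather than failing the operation */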
fs/ext4/ialloc.c
... | ... | @@ -324,8 +324,8 @@ |
324 | 324 | } |
325 | 325 | |
326 | 326 | struct orlov_stats { |
327 | + __u64 free_clusters; | |
327 | 328 | __u32 free_inodes; |
328 | - __u32 free_clusters; | |
329 | 329 | __u32 used_dirs; |
330 | 330 | }; |
331 | 331 | |
... | ... | @@ -342,7 +342,7 @@ |
342 | 342 | |
343 | 343 | if (flex_size > 1) { |
344 | 344 | stats->free_inodes = atomic_read(&flex_group[g].free_inodes); |
345 | - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); | |
345 | + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); | |
346 | 346 | stats->used_dirs = atomic_read(&flex_group[g].used_dirs); |
347 | 347 | return; |
348 | 348 | } |
fs/ext4/inode.c
... | ... | @@ -185,8 +185,6 @@ |
185 | 185 | |
186 | 186 | trace_ext4_evict_inode(inode); |
187 | 187 | |
188 | - ext4_ioend_wait(inode); | |
189 | - | |
190 | 188 | if (inode->i_nlink) { |
191 | 189 | /* |
192 | 190 | * When journalling data dirty buffers are tracked only in the |
... | ... | @@ -207,7 +205,8 @@ |
207 | 205 | * don't use page cache. |
208 | 206 | */ |
209 | 207 | if (ext4_should_journal_data(inode) && |
210 | - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { | |
208 | + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && | |
209 | + inode->i_ino != EXT4_JOURNAL_INO) { | |
211 | 210 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
212 | 211 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; |
213 | 212 | |
... | ... | @@ -216,6 +215,7 @@ |
216 | 215 | filemap_write_and_wait(&inode->i_data); |
217 | 216 | } |
218 | 217 | truncate_inode_pages(&inode->i_data, 0); |
218 | + ext4_ioend_shutdown(inode); | |
219 | 219 | goto no_delete; |
220 | 220 | } |
221 | 221 | |
... | ... | @@ -225,6 +225,7 @@ |
225 | 225 | if (ext4_should_order_data(inode)) |
226 | 226 | ext4_begin_ordered_truncate(inode, 0); |
227 | 227 | truncate_inode_pages(&inode->i_data, 0); |
228 | + ext4_ioend_shutdown(inode); | |
228 | 229 | |
229 | 230 | if (is_bad_inode(inode)) |
230 | 231 | goto no_delete; |
... | ... | @@ -482,6 +483,58 @@ |
482 | 483 | return num; |
483 | 484 | } |
484 | 485 | |
486 | +#ifdef ES_AGGRESSIVE_TEST | |
487 | +static void ext4_map_blocks_es_recheck(handle_t *handle, | |
488 | + struct inode *inode, | |
489 | + struct ext4_map_blocks *es_map, | |
490 | + struct ext4_map_blocks *map, | |
491 | + int flags) | |
492 | +{ | |
493 | + int retval; | |
494 | + | |
495 | + map->m_flags = 0; | |
496 | + /* | |
497 | + * There is a race window that the result is not the same. | |
498 | + * e.g. xfstests #223 when dioread_nolock enables. The reason | |
499 | + * is that we lookup a block mapping in extent status tree with | |
500 | + * out taking i_data_sem. So at the time the unwritten extent | |
501 | + * could be converted. | |
502 | + */ | |
503 | + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) | |
504 | + down_read((&EXT4_I(inode)->i_data_sem)); | |
505 | + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | |
506 | + retval = ext4_ext_map_blocks(handle, inode, map, flags & | |
507 | + EXT4_GET_BLOCKS_KEEP_SIZE); | |
508 | + } else { | |
509 | + retval = ext4_ind_map_blocks(handle, inode, map, flags & | |
510 | + EXT4_GET_BLOCKS_KEEP_SIZE); | |
511 | + } | |
512 | + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) | |
513 | + up_read((&EXT4_I(inode)->i_data_sem)); | |
514 | + /* | |
515 | + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag | |
516 | + * because it shouldn't be marked in es_map->m_flags. | |
517 | + */ | |
518 | + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); | |
519 | + | |
520 | + /* | |
521 | + * We don't check m_len because extent will be collpased in status | |
522 | + * tree. So the m_len might not equal. | |
523 | + */ | |
524 | + if (es_map->m_lblk != map->m_lblk || | |
525 | + es_map->m_flags != map->m_flags || | |
526 | + es_map->m_pblk != map->m_pblk) { | |
527 | + printk("ES cache assertation failed for inode: %lu " | |
528 | + "es_cached ex [%d/%d/%llu/%x] != " | |
529 | + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", | |
530 | + inode->i_ino, es_map->m_lblk, es_map->m_len, | |
531 | + es_map->m_pblk, es_map->m_flags, map->m_lblk, | |
532 | + map->m_len, map->m_pblk, map->m_flags, | |
533 | + retval, flags); | |
534 | + } | |
535 | +} | |
536 | +#endif /* ES_AGGRESSIVE_TEST */ | |
537 | + | |
485 | 538 | /* |
486 | 539 | * The ext4_map_blocks() function tries to look up the requested blocks, |
487 | 540 | * and returns if the blocks are already mapped. |
488 | 541 | |
... | ... | @@ -509,7 +562,12 @@ |
509 | 562 | { |
510 | 563 | struct extent_status es; |
511 | 564 | int retval; |
565 | +#ifdef ES_AGGRESSIVE_TEST | |
566 | + struct ext4_map_blocks orig_map; | |
512 | 567 | |
568 | + memcpy(&orig_map, map, sizeof(*map)); | |
569 | +#endif | |
570 | + | |
513 | 571 | map->m_flags = 0; |
514 | 572 | ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," |
515 | 573 | "logical block %lu\n", inode->i_ino, flags, map->m_len, |
... | ... | @@ -531,6 +589,10 @@ |
531 | 589 | } else { |
532 | 590 | BUG_ON(1); |
533 | 591 | } |
592 | +#ifdef ES_AGGRESSIVE_TEST | |
593 | + ext4_map_blocks_es_recheck(handle, inode, map, | |
594 | + &orig_map, flags); | |
595 | +#endif | |
534 | 596 | goto found; |
535 | 597 | } |
536 | 598 | |
... | ... | @@ -551,6 +613,15 @@ |
551 | 613 | int ret; |
552 | 614 | unsigned long long status; |
553 | 615 | |
616 | +#ifdef ES_AGGRESSIVE_TEST | |
617 | + if (retval != map->m_len) { | |
618 | + printk("ES len assertation failed for inode: %lu " | |
619 | + "retval %d != map->m_len %d " | |
620 | + "in %s (lookup)\n", inode->i_ino, retval, | |
621 | + map->m_len, __func__); | |
622 | + } | |
623 | +#endif | |
624 | + | |
554 | 625 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? |
555 | 626 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
556 | 627 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && |
... | ... | @@ -643,6 +714,24 @@ |
643 | 714 | int ret; |
644 | 715 | unsigned long long status; |
645 | 716 | |
717 | +#ifdef ES_AGGRESSIVE_TEST | |
718 | + if (retval != map->m_len) { | |
719 | + printk("ES len assertation failed for inode: %lu " | |
720 | + "retval %d != map->m_len %d " | |
721 | + "in %s (allocation)\n", inode->i_ino, retval, | |
722 | + map->m_len, __func__); | |
723 | + } | |
724 | +#endif | |
725 | + | |
726 | + /* | |
727 | + * If the extent has been zeroed out, we don't need to update | |
728 | + * extent status tree. | |
729 | + */ | |
730 | + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && | |
731 | + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { | |
732 | + if (ext4_es_is_written(&es)) | |
733 | + goto has_zeroout; | |
734 | + } | |
646 | 735 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? |
647 | 736 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
648 | 737 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && |
... | ... | @@ -655,6 +744,7 @@ |
655 | 744 | retval = ret; |
656 | 745 | } |
657 | 746 | |
747 | +has_zeroout: | |
658 | 748 | up_write((&EXT4_I(inode)->i_data_sem)); |
659 | 749 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { |
660 | 750 | int ret = check_block_validity(inode, map); |
... | ... | @@ -1216,6 +1306,55 @@ |
1216 | 1306 | } |
1217 | 1307 | |
1218 | 1308 | /* |
1309 | + * Reserve a metadata for a single block located at lblock | |
1310 | + */ | |
1311 | +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) | |
1312 | +{ | |
1313 | + int retries = 0; | |
1314 | + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
1315 | + struct ext4_inode_info *ei = EXT4_I(inode); | |
1316 | + unsigned int md_needed; | |
1317 | + ext4_lblk_t save_last_lblock; | |
1318 | + int save_len; | |
1319 | + | |
1320 | + /* | |
1321 | + * recalculate the amount of metadata blocks to reserve | |
1322 | + * in order to allocate nrblocks | |
1323 | + * worse case is one extent per block | |
1324 | + */ | |
1325 | +repeat: | |
1326 | + spin_lock(&ei->i_block_reservation_lock); | |
1327 | + /* | |
1328 | + * ext4_calc_metadata_amount() has side effects, which we have | |
1329 | + * to be prepared undo if we fail to claim space. | |
1330 | + */ | |
1331 | + save_len = ei->i_da_metadata_calc_len; | |
1332 | + save_last_lblock = ei->i_da_metadata_calc_last_lblock; | |
1333 | + md_needed = EXT4_NUM_B2C(sbi, | |
1334 | + ext4_calc_metadata_amount(inode, lblock)); | |
1335 | + trace_ext4_da_reserve_space(inode, md_needed); | |
1336 | + | |
1337 | + /* | |
1338 | + * We do still charge estimated metadata to the sb though; | |
1339 | + * we cannot afford to run out of free blocks. | |
1340 | + */ | |
1341 | + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { | |
1342 | + ei->i_da_metadata_calc_len = save_len; | |
1343 | + ei->i_da_metadata_calc_last_lblock = save_last_lblock; | |
1344 | + spin_unlock(&ei->i_block_reservation_lock); | |
1345 | + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { | |
1346 | + cond_resched(); | |
1347 | + goto repeat; | |
1348 | + } | |
1349 | + return -ENOSPC; | |
1350 | + } | |
1351 | + ei->i_reserved_meta_blocks += md_needed; | |
1352 | + spin_unlock(&ei->i_block_reservation_lock); | |
1353 | + | |
1354 | + return 0; /* success */ | |
1355 | +} | |
1356 | + | |
1357 | +/* | |
1219 | 1358 | * Reserve a single cluster located at lblock |
1220 | 1359 | */ |
1221 | 1360 | static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) |
... | ... | @@ -1263,7 +1402,7 @@ |
1263 | 1402 | ei->i_da_metadata_calc_last_lblock = save_last_lblock; |
1264 | 1403 | spin_unlock(&ei->i_block_reservation_lock); |
1265 | 1404 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { |
1266 | - yield(); | |
1405 | + cond_resched(); | |
1267 | 1406 | goto repeat; |
1268 | 1407 | } |
1269 | 1408 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); |
1270 | 1409 | |
... | ... | @@ -1768,7 +1907,12 @@ |
1768 | 1907 | struct extent_status es; |
1769 | 1908 | int retval; |
1770 | 1909 | sector_t invalid_block = ~((sector_t) 0xffff); |
1910 | +#ifdef ES_AGGRESSIVE_TEST | |
1911 | + struct ext4_map_blocks orig_map; | |
1771 | 1912 | |
1913 | + memcpy(&orig_map, map, sizeof(*map)); | |
1914 | +#endif | |
1915 | + | |
1772 | 1916 | if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) |
1773 | 1917 | invalid_block = ~0; |
1774 | 1918 | |
... | ... | @@ -1809,6 +1953,9 @@ |
1809 | 1953 | else |
1810 | 1954 | BUG_ON(1); |
1811 | 1955 | |
1956 | +#ifdef ES_AGGRESSIVE_TEST | |
1957 | + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); | |
1958 | +#endif | |
1812 | 1959 | return retval; |
1813 | 1960 | } |
1814 | 1961 | |
... | ... | @@ -1843,8 +1990,11 @@ |
1843 | 1990 | * XXX: __block_prepare_write() unmaps passed block, |
1844 | 1991 | * is it OK? |
1845 | 1992 | */ |
1846 | - /* If the block was allocated from previously allocated cluster, | |
1847 | - * then we dont need to reserve it again. */ | |
1993 | + /* | |
1994 | + * If the block was allocated from previously allocated cluster, | |
1995 | + * then we don't need to reserve it again. However we still need | |
1996 | + * to reserve metadata for every block we're going to write. | |
1997 | + */ | |
1848 | 1998 | if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { |
1849 | 1999 | ret = ext4_da_reserve_space(inode, iblock); |
1850 | 2000 | if (ret) { |
... | ... | @@ -1852,6 +2002,13 @@ |
1852 | 2002 | retval = ret; |
1853 | 2003 | goto out_unlock; |
1854 | 2004 | } |
2005 | + } else { | |
2006 | + ret = ext4_da_reserve_metadata(inode, iblock); | |
2007 | + if (ret) { | |
2008 | + /* not enough space to reserve */ | |
2009 | + retval = ret; | |
2010 | + goto out_unlock; | |
2011 | + } | |
1855 | 2012 | } |
1856 | 2013 | |
1857 | 2014 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, |
... | ... | @@ -1873,6 +2030,15 @@ |
1873 | 2030 | int ret; |
1874 | 2031 | unsigned long long status; |
1875 | 2032 | |
2033 | +#ifdef ES_AGGRESSIVE_TEST | |
2034 | + if (retval != map->m_len) { | |
2035 | + printk("ES len assertation failed for inode: %lu " | |
2036 | + "retval %d != map->m_len %d " | |
2037 | + "in %s (lookup)\n", inode->i_ino, retval, | |
2038 | + map->m_len, __func__); | |
2039 | + } | |
2040 | +#endif | |
2041 | + | |
1876 | 2042 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? |
1877 | 2043 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
1878 | 2044 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, |
... | ... | @@ -2908,8 +3074,8 @@ |
2908 | 3074 | |
2909 | 3075 | trace_ext4_releasepage(page); |
2910 | 3076 | |
2911 | - WARN_ON(PageChecked(page)); | |
2912 | - if (!page_has_buffers(page)) | |
3077 | + /* Page has dirty journalled data -> cannot release */ | |
3078 | + if (PageChecked(page)) | |
2913 | 3079 | return 0; |
2914 | 3080 | if (journal) |
2915 | 3081 | return jbd2_journal_try_to_free_buffers(journal, page, wait); |
fs/ext4/mballoc.c
... | ... | @@ -2804,8 +2804,8 @@ |
2804 | 2804 | if (sbi->s_log_groups_per_flex) { |
2805 | 2805 | ext4_group_t flex_group = ext4_flex_group(sbi, |
2806 | 2806 | ac->ac_b_ex.fe_group); |
2807 | - atomic_sub(ac->ac_b_ex.fe_len, | |
2808 | - &sbi->s_flex_groups[flex_group].free_clusters); | |
2807 | + atomic64_sub(ac->ac_b_ex.fe_len, | |
2808 | + &sbi->s_flex_groups[flex_group].free_clusters); | |
2809 | 2809 | } |
2810 | 2810 | |
2811 | 2811 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
... | ... | @@ -3692,11 +3692,7 @@ |
3692 | 3692 | if (free < needed && busy) { |
3693 | 3693 | busy = 0; |
3694 | 3694 | ext4_unlock_group(sb, group); |
3695 | - /* | |
3696 | - * Yield the CPU here so that we don't get soft lockup | |
3697 | - * in non preempt case. | |
3698 | - */ | |
3699 | - yield(); | |
3695 | + cond_resched(); | |
3700 | 3696 | goto repeat; |
3701 | 3697 | } |
3702 | 3698 | |
... | ... | @@ -4246,7 +4242,7 @@ |
4246 | 4242 | ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { |
4247 | 4243 | |
4248 | 4244 | /* let others to free the space */ |
4249 | - yield(); | |
4245 | + cond_resched(); | |
4250 | 4246 | ar->len = ar->len >> 1; |
4251 | 4247 | } |
4252 | 4248 | if (!ar->len) { |
... | ... | @@ -4464,7 +4460,6 @@ |
4464 | 4460 | struct buffer_head *bitmap_bh = NULL; |
4465 | 4461 | struct super_block *sb = inode->i_sb; |
4466 | 4462 | struct ext4_group_desc *gdp; |
4467 | - unsigned long freed = 0; | |
4468 | 4463 | unsigned int overflow; |
4469 | 4464 | ext4_grpblk_t bit; |
4470 | 4465 | struct buffer_head *gd_bh; |
4471 | 4466 | |
... | ... | @@ -4666,14 +4661,12 @@ |
4666 | 4661 | |
4667 | 4662 | if (sbi->s_log_groups_per_flex) { |
4668 | 4663 | ext4_group_t flex_group = ext4_flex_group(sbi, block_group); |
4669 | - atomic_add(count_clusters, | |
4670 | - &sbi->s_flex_groups[flex_group].free_clusters); | |
4664 | + atomic64_add(count_clusters, | |
4665 | + &sbi->s_flex_groups[flex_group].free_clusters); | |
4671 | 4666 | } |
4672 | 4667 | |
4673 | 4668 | ext4_mb_unload_buddy(&e4b); |
4674 | 4669 | |
4675 | - freed += count; | |
4676 | - | |
4677 | 4670 | if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) |
4678 | 4671 | dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); |
4679 | 4672 | |
... | ... | @@ -4811,8 +4804,8 @@ |
4811 | 4804 | |
4812 | 4805 | if (sbi->s_log_groups_per_flex) { |
4813 | 4806 | ext4_group_t flex_group = ext4_flex_group(sbi, block_group); |
4814 | - atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), | |
4815 | - &sbi->s_flex_groups[flex_group].free_clusters); | |
4807 | + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), | |
4808 | + &sbi->s_flex_groups[flex_group].free_clusters); | |
4816 | 4809 | } |
4817 | 4810 | |
4818 | 4811 | ext4_mb_unload_buddy(&e4b); |
fs/ext4/move_extent.c
... | ... | @@ -32,16 +32,18 @@ |
32 | 32 | */ |
33 | 33 | static inline int |
34 | 34 | get_ext_path(struct inode *inode, ext4_lblk_t lblock, |
35 | - struct ext4_ext_path **path) | |
35 | + struct ext4_ext_path **orig_path) | |
36 | 36 | { |
37 | 37 | int ret = 0; |
38 | + struct ext4_ext_path *path; | |
38 | 39 | |
39 | - *path = ext4_ext_find_extent(inode, lblock, *path); | |
40 | - if (IS_ERR(*path)) { | |
41 | - ret = PTR_ERR(*path); | |
42 | - *path = NULL; | |
43 | - } else if ((*path)[ext_depth(inode)].p_ext == NULL) | |
40 | + path = ext4_ext_find_extent(inode, lblock, *orig_path); | |
41 | + if (IS_ERR(path)) | |
42 | + ret = PTR_ERR(path); | |
43 | + else if (path[ext_depth(inode)].p_ext == NULL) | |
44 | 44 | ret = -ENODATA; |
45 | + else | |
46 | + *orig_path = path; | |
45 | 47 | |
46 | 48 | return ret; |
47 | 49 | } |
48 | 50 | |
49 | 51 | |
50 | 52 | |
... | ... | @@ -611,24 +613,25 @@ |
611 | 613 | { |
612 | 614 | struct ext4_ext_path *path = NULL; |
613 | 615 | struct ext4_extent *ext; |
616 | + int ret = 0; | |
614 | 617 | ext4_lblk_t last = from + count; |
615 | 618 | while (from < last) { |
616 | 619 | *err = get_ext_path(inode, from, &path); |
617 | 620 | if (*err) |
618 | - return 0; | |
621 | + goto out; | |
619 | 622 | ext = path[ext_depth(inode)].p_ext; |
620 | - if (!ext) { | |
621 | - ext4_ext_drop_refs(path); | |
622 | - return 0; | |
623 | - } | |
624 | - if (uninit != ext4_ext_is_uninitialized(ext)) { | |
625 | - ext4_ext_drop_refs(path); | |
626 | - return 0; | |
627 | - } | |
623 | + if (uninit != ext4_ext_is_uninitialized(ext)) | |
624 | + goto out; | |
628 | 625 | from += ext4_ext_get_actual_len(ext); |
629 | 626 | ext4_ext_drop_refs(path); |
630 | 627 | } |
631 | - return 1; | |
628 | + ret = 1; | |
629 | +out: | |
630 | + if (path) { | |
631 | + ext4_ext_drop_refs(path); | |
632 | + kfree(path); | |
633 | + } | |
634 | + return ret; | |
632 | 635 | } |
633 | 636 | |
634 | 637 | /** |
... | ... | @@ -665,6 +668,14 @@ |
665 | 668 | int depth; |
666 | 669 | int replaced_count = 0; |
667 | 670 | int dext_alen; |
671 | + | |
672 | + *err = ext4_es_remove_extent(orig_inode, from, count); | |
673 | + if (*err) | |
674 | + goto out; | |
675 | + | |
676 | + *err = ext4_es_remove_extent(donor_inode, from, count); | |
677 | + if (*err) | |
678 | + goto out; | |
668 | 679 | |
669 | 680 | /* Get the original extent for the block "orig_off" */ |
670 | 681 | *err = get_ext_path(orig_inode, orig_off, &orig_path); |
fs/ext4/page-io.c
... | ... | @@ -50,11 +50,21 @@ |
50 | 50 | kmem_cache_destroy(io_page_cachep); |
51 | 51 | } |
52 | 52 | |
53 | -void ext4_ioend_wait(struct inode *inode) | |
53 | +/* | |
54 | + * This function is called by ext4_evict_inode() to make sure there is | |
55 | + * no more pending I/O completion work left to do. | |
56 | + */ | |
57 | +void ext4_ioend_shutdown(struct inode *inode) | |
54 | 58 | { |
55 | 59 | wait_queue_head_t *wq = ext4_ioend_wq(inode); |
56 | 60 | |
57 | 61 | wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); |
62 | + /* | |
63 | + * We need to make sure the work structure is finished being | |
64 | + * used before we let the inode get destroyed. | |
65 | + */ | |
66 | + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) | |
67 | + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); | |
58 | 68 | } |
59 | 69 | |
60 | 70 | static void put_io_page(struct ext4_io_page *io_page) |
fs/ext4/resize.c
... | ... | @@ -1360,8 +1360,8 @@ |
1360 | 1360 | sbi->s_log_groups_per_flex) { |
1361 | 1361 | ext4_group_t flex_group; |
1362 | 1362 | flex_group = ext4_flex_group(sbi, group_data[0].group); |
1363 | - atomic_add(EXT4_NUM_B2C(sbi, free_blocks), | |
1364 | - &sbi->s_flex_groups[flex_group].free_clusters); | |
1363 | + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), | |
1364 | + &sbi->s_flex_groups[flex_group].free_clusters); | |
1365 | 1365 | atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, |
1366 | 1366 | &sbi->s_flex_groups[flex_group].free_inodes); |
1367 | 1367 | } |
fs/ext4/super.c
... | ... | @@ -1927,8 +1927,8 @@ |
1927 | 1927 | flex_group = ext4_flex_group(sbi, i); |
1928 | 1928 | atomic_add(ext4_free_inodes_count(sb, gdp), |
1929 | 1929 | &sbi->s_flex_groups[flex_group].free_inodes); |
1930 | - atomic_add(ext4_free_group_clusters(sb, gdp), | |
1931 | - &sbi->s_flex_groups[flex_group].free_clusters); | |
1930 | + atomic64_add(ext4_free_group_clusters(sb, gdp), | |
1931 | + &sbi->s_flex_groups[flex_group].free_clusters); | |
1932 | 1932 | atomic_add(ext4_used_dirs_count(sb, gdp), |
1933 | 1933 | &sbi->s_flex_groups[flex_group].used_dirs); |
1934 | 1934 | } |
fs/jbd2/transaction.c
... | ... | @@ -1065,9 +1065,12 @@ |
1065 | 1065 | void jbd2_journal_set_triggers(struct buffer_head *bh, |
1066 | 1066 | struct jbd2_buffer_trigger_type *type) |
1067 | 1067 | { |
1068 | - struct journal_head *jh = bh2jh(bh); | |
1068 | + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); | |
1069 | 1069 | |
1070 | + if (WARN_ON(!jh)) | |
1071 | + return; | |
1070 | 1072 | jh->b_triggers = type; |
1073 | + jbd2_journal_put_journal_head(jh); | |
1071 | 1074 | } |
1072 | 1075 | |
1073 | 1076 | void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, |
1074 | 1077 | |
1075 | 1078 | |
1076 | 1079 | |
... | ... | @@ -1119,17 +1122,18 @@ |
1119 | 1122 | { |
1120 | 1123 | transaction_t *transaction = handle->h_transaction; |
1121 | 1124 | journal_t *journal = transaction->t_journal; |
1122 | - struct journal_head *jh = bh2jh(bh); | |
1125 | + struct journal_head *jh; | |
1123 | 1126 | int ret = 0; |
1124 | 1127 | |
1125 | - jbd_debug(5, "journal_head %p\n", jh); | |
1126 | - JBUFFER_TRACE(jh, "entry"); | |
1127 | 1128 | if (is_handle_aborted(handle)) |
1128 | 1129 | goto out; |
1129 | - if (!buffer_jbd(bh)) { | |
1130 | + jh = jbd2_journal_grab_journal_head(bh); | |
1131 | + if (!jh) { | |
1130 | 1132 | ret = -EUCLEAN; |
1131 | 1133 | goto out; |
1132 | 1134 | } |
1135 | + jbd_debug(5, "journal_head %p\n", jh); | |
1136 | + JBUFFER_TRACE(jh, "entry"); | |
1133 | 1137 | |
1134 | 1138 | jbd_lock_bh_state(bh); |
1135 | 1139 | |
... | ... | @@ -1220,6 +1224,7 @@ |
1220 | 1224 | spin_unlock(&journal->j_list_lock); |
1221 | 1225 | out_unlock_bh: |
1222 | 1226 | jbd_unlock_bh_state(bh); |
1227 | + jbd2_journal_put_journal_head(jh); | |
1223 | 1228 | out: |
1224 | 1229 | JBUFFER_TRACE(jh, "exit"); |
1225 | 1230 | WARN_ON(ret); /* All errors are bugs, so dump the stack */ |
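The jbd2 hunks above close the use-after-free named in the merge ("jbd2: fix use after free in jbd2_journal_dirty_metadata()") by taking a counted reference on the journal_head instead of dereferencing the raw bh2jh() pointer. Reduced to a sketch of the pattern, with error paths trimmed:

    struct journal_head *jh;

    jh = jbd2_journal_grab_journal_head(bh);  /* takes a ref; NULL if bh has no jh */
    if (!jh)
            return -EUCLEAN;                  /* buffer is not journaled (anymore) */
    /* ... jh can be dereferenced safely here ... */
    jbd2_journal_put_journal_head(jh);        /* drop the reference when done */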