Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs fixes from Chris Mason:
 "Outside of misc fixes, Filipe has a few fsync corners and we're
  pulling in one more of Josef's fixes from production use here"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
  Btrfs:__add_inode_ref: out of bounds memory read when looking for extended ref.
  Btrfs: fix data loss in the fast fsync path
  Btrfs: remove extra run_delayed_refs in update_cowonly_root
  Btrfs: incremental send, don't rename a directory too soon
  btrfs: fix lost return value due to variable shadowing
  Btrfs: do not ignore errors from btrfs_lookup_xattr in do_setxattr
  Btrfs: fix off-by-one logic error in btrfs_realloc_node
  Btrfs: add missing inode update when punching hole
  Btrfs: abort the transaction if we fail to update the free space cache inode
  Btrfs: fix fsync race leading to ordered extent memory leaks

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs fixes from Chris Mason:
 "Outside of misc fixes, Filipe has a few fsync corners and we're
  pulling in one more of Josef's fixes from production use here"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
  Btrfs:__add_inode_ref: out of bounds memory read when looking for extended ref.
  Btrfs: fix data loss in the fast fsync path
  Btrfs: remove extra run_delayed_refs in update_cowonly_root
  Btrfs: incremental send, don't rename a directory too soon
  btrfs: fix lost return value due to variable shadowing
  Btrfs: do not ignore errors from btrfs_lookup_xattr in do_setxattr
  Btrfs: fix off-by-one logic error in btrfs_realloc_node
  Btrfs: add missing inode update when punching hole
  Btrfs: abort the transaction if we fail to update the free space cache inode
  Btrfs: fix fsync race leading to ordered extent memory leaks
Linus Torvalds
2 parents 0d9b9c1674 dd9ef135e3
Showing 9 changed files Side-by-side Diff
fs/btrfs/ctree.c
fs/btrfs/extent-tree.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/send.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/xattr.c
@@ -1645,14 +1645,14 @@
  
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = root->nodesize;
-	end_slot = parent_nritems;
+	end_slot = parent_nritems - 1;
  
-	if (parent_nritems == 1)
+	if (parent_nritems <= 1)
 		return 0;
  
 	btrfs_set_lock_blocking(parent);
  
-	for (i = start_slot; i < end_slot; i++) {
+	for (i = start_slot; i <= end_slot; i++) {
 		int close = 1;
  
 		btrfs_node_key(parent, &disk_key, i);
@@ -1669,7 +1669,7 @@
 			other = btrfs_node_blockptr(parent, i - 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
-		if (!close && i < end_slot - 2) {
+		if (!close && i < end_slot) {
 			other = btrfs_node_blockptr(parent, i + 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
@@ -3208,6 +3208,8 @@
 		return 0;
 	}
  
+	if (trans->aborted)
+		return 0;
 again:
 	inode = lookup_free_space_inode(root, block_group, path);
 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -3243,6 +3245,20 @@
 	 */
 	BTRFS_I(inode)->generation = 0;
 	ret = btrfs_update_inode(trans, root, inode);
+	if (ret) {
+		/*
+		 * So theoretically we could recover from this, simply set the
+		 * super cache generation to 0 so we know to invalidate the
+		 * cache, but then we'd have to keep track of the block groups
+		 * that fail this way so we know we _have_ to reset this cache
+		 * before the next commit or risk reading stale cache.  So to
+		 * limit our exposure to horrible edge cases lets just abort the
+		 * transaction, this only happens in really bad situations
+		 * anyway.
+		 */
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_put;
+	}
 	WARN_ON(ret);
  
 	if (i_size_read(inode) > 0) {
@@ -1811,22 +1811,10 @@
 	mutex_unlock(&inode->i_mutex);
  
 	/*
-	 * we want to make sure fsync finds this change
-	 * but we haven't joined a transaction running right now.
-	 *
-	 * Later on, someone is sure to update the inode and get the
-	 * real transid recorded.
-	 *
-	 * We set last_trans now to the fs_info generation + 1,
-	 * this will either be one more than the running transaction
-	 * or the generation used for the next transaction if there isn't
-	 * one running right now.
-	 *
 	 * We also have to set last_sub_trans to the current log transid,
 	 * otherwise subsequent syncs to a file that's been synced in this
 	 * transaction will appear to have already occured.
 	 */
-	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	if (num_written > 0) {
 		err = generic_write_sync(file, pos, num_written);
  
  
@@ -1959,25 +1947,37 @@
 	atomic_inc(&root->log_batch);
  
 	/*
-	 * check the transaction that last modified this inode
-	 * and see if its already been committed
+	 * If the last transaction that changed this file was before the current
+	 * transaction and we have the full sync flag set in our inode, we can
+	 * bail out now without any syncing.
+	 *
+	 * Note that we can't bail out if the full sync flag isn't set. This is
+	 * because when the full sync flag is set we start all ordered extents
+	 * and wait for them to fully complete - when they complete they update
+	 * the inode's last_trans field through:
+	 *
+	 *     btrfs_finish_ordered_io() ->
+	 *         btrfs_update_inode_fallback() ->
+	 *             btrfs_update_inode() ->
+	 *                 btrfs_set_inode_last_trans()
+	 *
+	 * So we are sure that last_trans is up to date and can do this check to
+	 * bail out safely. For the fast path, when the full sync flag is not
+	 * set in our inode, we can not do it because we start only our ordered
+	 * extents and don't wait for them to complete (that is when
+	 * btrfs_finish_ordered_io runs), so here at this point their last_trans
+	 * value might be less than or equals to fs_info->last_trans_committed,
+	 * and setting a speculative last_trans for an inode when a buffered
+	 * write is made (such as fs_info->generation + 1 for example) would not
+	 * be reliable since after setting the value and before fsync is called
+	 * any number of transactions can start and commit (transaction kthread
+	 * commits the current transaction periodically), and a transaction
+	 * commit does not start nor waits for ordered extents to complete.
 	 */
-	if (!BTRFS_I(inode)->last_trans) {
-		mutex_unlock(&inode->i_mutex);
-		goto out;
-	}
-
-	/*
-	 * if the last transaction that changed this file was before
-	 * the current transaction, we can bail out now without any
-	 * syncing
-	 */
 	smp_mb();
 	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
-	    BTRFS_I(inode)->last_trans <=
-	    root->fs_info->last_trans_committed) {
-		BTRFS_I(inode)->last_trans = 0;
-
+	    (full_sync && BTRFS_I(inode)->last_trans <=
+	     root->fs_info->last_trans_committed)) {
 		/*
 		 * We'v had everything committed since the last time we were
 		 * modified so clear this flag in case it was set for whatever
@@ -2275,6 +2275,8 @@
 	bool same_page;
 	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 	u64 ino_size;
+	bool truncated_page = false;
+	bool updated_inode = false;
  
 	ret = btrfs_wait_ordered_range(inode, offset, len);
 	if (ret)
  
  
@@ -2306,13 +2308,18 @@
 	 * entire page.
 	 */
 	if (same_page && len < PAGE_CACHE_SIZE) {
-		if (offset < ino_size)
+		if (offset < ino_size) {
+			truncated_page = true;
 			ret = btrfs_truncate_page(inode, offset, len, 0);
+		} else {
+			ret = 0;
+		}
 		goto out_only_mutex;
 	}
  
 	/* zero back part of the first page */
 	if (offset < ino_size) {
+		truncated_page = true;
 		ret = btrfs_truncate_page(inode, offset, 0, 0);
 		if (ret) {
 			mutex_unlock(&inode->i_mutex);
@@ -2348,6 +2355,7 @@
 		if (!ret) {
 			/* zero the front end of the last page */
 			if (tail_start + tail_len < ino_size) {
+				truncated_page = true;
 				ret = btrfs_truncate_page(inode,
 						tail_start + tail_len, 0, 1);
 				if (ret)
@@ -2357,8 +2365,8 @@
 	}
  
 	if (lockend < lockstart) {
-		mutex_unlock(&inode->i_mutex);
-		return 0;
+		ret = 0;
+		goto out_only_mutex;
 	}
  
 	while (1) {
@@ -2506,6 +2514,7 @@
  
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
 	ret = btrfs_update_inode(trans, root, inode);
+	updated_inode = true;
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 out_free:
@@ -2515,6 +2524,22 @@
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			     &cached_state, GFP_NOFS);
 out_only_mutex:
+	if (!updated_inode && truncated_page && !ret && !err) {
+		/*
+		 * If we only end up zeroing part of a page, we still need to
+		 * update the inode item, so that all the time fields are
+		 * updated as well as the necessary btrfs inode in memory fields
+		 * for detecting, at fsync time, if the inode isn't yet in the
+		 * log tree or it's there but not up to date.
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+		} else {
+			err = btrfs_update_inode(trans, root, inode);
+			ret = btrfs_end_transaction(trans, root);
+		}
+	}
 	mutex_unlock(&inode->i_mutex);
 	if (ret && !err)
 		err = ret;
@@ -7285,7 +7285,6 @@
 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
 	     em->block_start != EXTENT_MAP_HOLE)) {
 		int type;
-		int ret;
 		u64 block_start, orig_start, orig_block_len, ram_bytes;
  
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -452,10 +452,8 @@
 			continue;
 		if (entry_end(ordered) <= start)
 			break;
-		if (!list_empty(&ordered->log_list))
+		if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
 			continue;
-		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
-			continue;
 		list_add(&ordered->log_list, logged_list);
 		atomic_inc(&ordered->refs);
 	}
@@ -511,8 +509,7 @@
 		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 						   &ordered->flags));
  
-		if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
-			list_add_tail(&ordered->trans_list, &trans->ordered);
+		list_add_tail(&ordered->trans_list, &trans->ordered);
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
 	spin_unlock_irq(&log->log_extents_lock[index]);
@@ -230,6 +230,7 @@
 	u64 parent_ino;
 	u64 ino;
 	u64 gen;
+	bool is_orphan;
 	struct list_head update_refs;
 };
  
@@ -2984,7 +2985,8 @@
 				u64 ino_gen,
 				u64 parent_ino,
 				struct list_head *new_refs,
-				struct list_head *deleted_refs)
+				struct list_head *deleted_refs,
+				const bool is_orphan)
 {
 	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
 	struct rb_node *parent = NULL;
@@ -2999,6 +3001,7 @@
 	pm->parent_ino = parent_ino;
 	pm->ino = ino;
 	pm->gen = ino_gen;
+	pm->is_orphan = is_orphan;
 	INIT_LIST_HEAD(&pm->list);
 	INIT_LIST_HEAD(&pm->update_refs);
 	RB_CLEAR_NODE(&pm->node);
  
  
@@ -3131,26 +3134,31 @@
 	rmdir_ino = dm->rmdir_ino;
 	free_waiting_dir_move(sctx, dm);
  
-	ret = get_first_ref(sctx->parent_root, pm->ino,
-			    &parent_ino, &parent_gen, name);
+	if (pm->is_orphan) {
+		ret = gen_unique_name(sctx, pm->ino,
+				      pm->gen, from_path);
+	} else {
+		ret = get_first_ref(sctx->parent_root, pm->ino,
+				    &parent_ino, &parent_gen, name);
+		if (ret < 0)
+			goto out;
+		ret = get_cur_path(sctx, parent_ino, parent_gen,
+				   from_path);
+		if (ret < 0)
+			goto out;
+		ret = fs_path_add_path(from_path, name);
+	}
 	if (ret < 0)
 		goto out;
  
-	ret = get_cur_path(sctx, parent_ino, parent_gen,
-			   from_path);
-	if (ret < 0)
-		goto out;
-	ret = fs_path_add_path(from_path, name);
-	if (ret < 0)
-		goto out;
-
 	sctx->send_progress = sctx->cur_ino + 1;
 	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
 	if (ret) {
 		LIST_HEAD(deleted_refs);
 		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
 		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
-					   &pm->update_refs, &deleted_refs);
+					   &pm->update_refs, &deleted_refs,
+					   pm->is_orphan);
 		if (ret < 0)
 			goto out;
 		if (rmdir_ino) {
@@ -3283,6 +3291,127 @@
 	return ret;
 }
  
+/*
+ * We might need to delay a directory rename even when no ancestor directory
+ * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
+ * renamed. This happens when we rename a directory to the old name (the name
+ * in the parent root) of some other unrelated directory that got its rename
+ * delayed due to some ancestor with higher number that got renamed.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ * .                                       (ino 256)
+ * |---- a/                                (ino 257)
+ * |     |---- file                        (ino 260)
+ * |
+ * |---- b/                                (ino 258)
+ * |---- c/                                (ino 259)
+ *
+ * Send snapshot:
+ * .                                       (ino 256)
+ * |---- a/                                (ino 258)
+ * |---- x/                                (ino 259)
+ *       |---- y/                          (ino 257)
+ *             |----- file                 (ino 260)
+ *
+ * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
+ * from 'a' to 'x/y' happening first, which in turn depends on the rename of
+ * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
+ * must issue is:
+ *
+ * 1 - rename 259 from 'c' to 'x'
+ * 2 - rename 257 from 'a' to 'x/y'
+ * 3 - rename 258 from 'b' to 'a'
+ *
+ * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
+ * be done right away and < 0 on error.
+ */
+static int wait_for_dest_dir_move(struct send_ctx *sctx,
+				  struct recorded_ref *parent_ref,
+				  const bool is_orphan)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key di_key;
+	struct btrfs_dir_item *di;
+	u64 left_gen;
+	u64 right_gen;
+	int ret = 0;
+
+	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
+		return 0;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = parent_ref->dir;
+	key.type = BTRFS_DIR_ITEM_KEY;
+	key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
+
+	ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+
+	di = btrfs_match_dir_item_name(sctx->parent_root, path,
+				       parent_ref->name, parent_ref->name_len);
+	if (!di) {
+		ret = 0;
+		goto out;
+	}
+	/*
+	 * di_key.objectid has the number of the inode that has a dentry in the
+	 * parent directory with the same name that sctx->cur_ino is being
+	 * renamed to. We need to check if that inode is in the send root as
+	 * well and if it is currently marked as an inode with a pending rename,
+	 * if it is, we need to delay the rename of sctx->cur_ino as well, so
+	 * that it happens after that other inode is renamed.
+	 */
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
+	if (di_key.type != BTRFS_INODE_ITEM_KEY) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
+			     &left_gen, NULL, NULL, NULL, NULL);
+	if (ret < 0)
+		goto out;
+	ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
+			     &right_gen, NULL, NULL, NULL, NULL);
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = 0;
+		goto out;
+	}
+
+	/* Different inode, no need to delay the rename of sctx->cur_ino */
+	if (right_gen != left_gen) {
+		ret = 0;
+		goto out;
+	}
+
+	if (is_waiting_for_move(sctx, di_key.objectid)) {
+		ret = add_pending_dir_move(sctx,
+					   sctx->cur_ino,
+					   sctx->cur_inode_gen,
+					   di_key.objectid,
+					   &sctx->new_refs,
+					   &sctx->deleted_refs,
+					   is_orphan);
+		if (!ret)
+			ret = 1;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static int wait_for_parent_move(struct send_ctx *sctx,
 				struct recorded_ref *parent_ref)
 {
@@ -3349,7 +3478,8 @@
 					   sctx->cur_inode_gen,
 					   ino,
 					   &sctx->new_refs,
-					   &sctx->deleted_refs);
+					   &sctx->deleted_refs,
+					   false);
 		if (!ret)
 			ret = 1;
 	}
@@ -3372,6 +3502,7 @@
 	int did_overwrite = 0;
 	int is_orphan = 0;
 	u64 last_dir_ino_rm = 0;
+	bool can_rename = true;
  
 verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
  
  
@@ -3490,12 +3621,22 @@
 			}
 		}
  
+		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
+			ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
+			if (ret < 0)
+				goto out;
+			if (ret == 1) {
+				can_rename = false;
+				*pending_move = 1;
+			}
+		}
+
 		/*
 		 * link/move the ref to the new place. If we have an orphan
 		 * inode, move it and update valid_path. If not, link or move
 		 * it depending on the inode mode.
 		 */
-		if (is_orphan) {
+		if (is_orphan && can_rename) {
 			ret = send_rename(sctx, valid_path, cur->full_path);
 			if (ret < 0)
 				goto out;
@@ -3503,7 +3644,7 @@
 			ret = fs_path_copy(valid_path, cur->full_path);
 			if (ret < 0)
 				goto out;
-		} else {
+		} else if (can_rename) {
 			if (S_ISDIR(sctx->cur_inode_mode)) {
 				/*
 				 * Dirs can't be linked, so move it. For moved
@@ -1052,9 +1052,6 @@
 		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 		if (ret)
 			return ret;
-		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-		if (ret)
-			return ret;
 	}
  
 	return 0;
@@ -1012,7 +1012,7 @@
 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
  
 		while (cur_offset < item_size) {
-			extref = (struct btrfs_inode_extref *)base + cur_offset;
+			extref = (struct btrfs_inode_extref *)(base + cur_offset);
  
 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
  
@@ -111,6 +111,8 @@
 					name, name_len, -1);
 		if (!di && (flags & XATTR_REPLACE))
 			ret = -ENODATA;
+		else if (IS_ERR(di))
+			ret = PTR_ERR(di);
 		else if (di)
 			ret = btrfs_delete_one_dir_name(trans, root, path, di);
 		goto out;
  
  
@@ -127,10 +129,12 @@
 		ASSERT(mutex_is_locked(&inode->i_mutex));
 		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
 					name, name_len, 0);
-		if (!di) {
+		if (!di)
 			ret = -ENODATA;
+		else if (IS_ERR(di))
+			ret = PTR_ERR(di);
+		if (ret)
 			goto out;
-		}
 		btrfs_release_path(path);
 		di = NULL;
 	}
...	...	@@ -1645,14 +1645,14 @@
1645	1645
1646	1646	parent_nritems = btrfs_header_nritems(parent);
1647	1647	blocksize = root->nodesize;
1648		- end_slot = parent_nritems;
	1648	+ end_slot = parent_nritems - 1;
1649	1649
1650		- if (parent_nritems == 1)
	1650	+ if (parent_nritems <= 1)
1651	1651	return 0;
1652	1652
1653	1653	btrfs_set_lock_blocking(parent);
1654	1654
1655		- for (i = start_slot; i < end_slot; i++) {
	1655	+ for (i = start_slot; i <= end_slot; i++) {
1656	1656	int close = 1;
1657	1657
1658	1658	btrfs_node_key(parent, &disk_key, i);
...	...	@@ -1669,7 +1669,7 @@
1669	1669	other = btrfs_node_blockptr(parent, i - 1);
1670	1670	close = close_blocks(blocknr, other, blocksize);
1671	1671	}
1672		- if (!close && i < end_slot - 2) {
	1672	+ if (!close && i < end_slot) {
1673	1673	other = btrfs_node_blockptr(parent, i + 1);
1674	1674	close = close_blocks(blocknr, other, blocksize);
1675	1675	}
...	...	@@ -3208,6 +3208,8 @@
3208	3208	return 0;
3209	3209	}
3210	3210
	3211	+ if (trans->aborted)
	3212	+ return 0;
3211	3213	again:
3212	3214	inode = lookup_free_space_inode(root, block_group, path);
3213	3215	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
...	...	@@ -3243,6 +3245,20 @@
3243	3245	*/
3244	3246	BTRFS_I(inode)->generation = 0;
3245	3247	ret = btrfs_update_inode(trans, root, inode);
	3248	+ if (ret) {
	3249	+ /*
	3250	+ * So theoretically we could recover from this, simply set the
	3251	+ * super cache generation to 0 so we know to invalidate the
	3252	+ * cache, but then we'd have to keep track of the block groups
	3253	+ * that fail this way so we know we _have_ to reset this cache
	3254	+ * before the next commit or risk reading stale cache. So to
	3255	+ * limit our exposure to horrible edge cases lets just abort the
	3256	+ * transaction, this only happens in really bad situations
	3257	+ * anyway.
	3258	+ */
	3259	+ btrfs_abort_transaction(trans, root, ret);
	3260	+ goto out_put;
	3261	+ }
3246	3262	WARN_ON(ret);
3247	3263
3248	3264	if (i_size_read(inode) > 0) {
...	...	@@ -1811,22 +1811,10 @@
1811	1811	mutex_unlock(&inode->i_mutex);
1812	1812
1813	1813	/*
1814		- * we want to make sure fsync finds this change
1815		- * but we haven't joined a transaction running right now.
1816		- *
1817		- * Later on, someone is sure to update the inode and get the
1818		- * real transid recorded.
1819		- *
1820		- * We set last_trans now to the fs_info generation + 1,
1821		- * this will either be one more than the running transaction
1822		- * or the generation used for the next transaction if there isn't
1823		- * one running right now.
1824		- *
1825	1814	* We also have to set last_sub_trans to the current log transid,
1826	1815	* otherwise subsequent syncs to a file that's been synced in this
1827	1816	* transaction will appear to have already occured.
1828	1817	*/
1829		- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1830	1818	BTRFS_I(inode)->last_sub_trans = root->log_transid;
1831	1819	if (num_written > 0) {
1832	1820	err = generic_write_sync(file, pos, num_written);
1833	1821
1834	1822
...	...	@@ -1959,25 +1947,37 @@
1959	1947	atomic_inc(&root->log_batch);
1960	1948
1961	1949	/*
1962		- * check the transaction that last modified this inode
1963		- * and see if its already been committed
	1950	+ * If the last transaction that changed this file was before the current
	1951	+ * transaction and we have the full sync flag set in our inode, we can
	1952	+ * bail out now without any syncing.
	1953	+ *
	1954	+ * Note that we can't bail out if the full sync flag isn't set. This is
	1955	+ * because when the full sync flag is set we start all ordered extents
	1956	+ * and wait for them to fully complete - when they complete they update
	1957	+ * the inode's last_trans field through:
	1958	+ *
	1959	+ *     btrfs_finish_ordered_io() ->
	1960	+ *         btrfs_update_inode_fallback() ->
	1961	+ *             btrfs_update_inode() ->
	1962	+ *                 btrfs_set_inode_last_trans()
	1963	+ *
	1964	+ * So we are sure that last_trans is up to date and can do this check to
	1965	+ * bail out safely. For the fast path, when the full sync flag is not
	1966	+ * set in our inode, we can not do it because we start only our ordered
	1967	+ * extents and don't wait for them to complete (that is when
	1968	+ * btrfs_finish_ordered_io runs), so here at this point their last_trans
	1969	+ * value might be less than or equals to fs_info->last_trans_committed,
	1970	+ * and setting a speculative last_trans for an inode when a buffered
	1971	+ * write is made (such as fs_info->generation + 1 for example) would not
	1972	+ * be reliable since after setting the value and before fsync is called
	1973	+ * any number of transactions can start and commit (transaction kthread
	1974	+ * commits the current transaction periodically), and a transaction
	1975	+ * commit does not start nor waits for ordered extents to complete.
1964	1976	*/
1965		- if (!BTRFS_I(inode)->last_trans) {
1966		- mutex_unlock(&inode->i_mutex);
1967		- goto out;
1968		- }
1969		-
1970		- /*
1971		- * if the last transaction that changed this file was before
1972		- * the current transaction, we can bail out now without any
1973		- * syncing
1974		- */
1975	1977	smp_mb();
1976	1978	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1977		- BTRFS_I(inode)->last_trans <=
1978		- root->fs_info->last_trans_committed) {
1979		- BTRFS_I(inode)->last_trans = 0;
1980		-
	1979	+ (full_sync && BTRFS_I(inode)->last_trans <=
	1980	+ root->fs_info->last_trans_committed)) {
1981	1981	/*
1982	1982	* We'v had everything committed since the last time we were
1983	1983	* modified so clear this flag in case it was set for whatever
...	...	@@ -2275,6 +2275,8 @@
2275	2275	bool same_page;
2276	2276	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2277	2277	u64 ino_size;
	2278	+ bool truncated_page = false;
	2279	+ bool updated_inode = false;
2278	2280
2279	2281	ret = btrfs_wait_ordered_range(inode, offset, len);
2280	2282	if (ret)
2281	2283
2282	2284
...	...	@@ -2306,13 +2308,18 @@
2306	2308	* entire page.
2307	2309	*/
2308	2310	if (same_page && len < PAGE_CACHE_SIZE) {
2309		- if (offset < ino_size)
	2311	+ if (offset < ino_size) {
	2312	+ truncated_page = true;
2310	2313	ret = btrfs_truncate_page(inode, offset, len, 0);
	2314	+ } else {
	2315	+ ret = 0;
	2316	+ }
2311	2317	goto out_only_mutex;
2312	2318	}
2313	2319
2314	2320	/* zero back part of the first page */
2315	2321	if (offset < ino_size) {
	2322	+ truncated_page = true;
2316	2323	ret = btrfs_truncate_page(inode, offset, 0, 0);
2317	2324	if (ret) {
2318	2325	mutex_unlock(&inode->i_mutex);
...	...	@@ -2348,6 +2355,7 @@
2348	2355	if (!ret) {
2349	2356	/* zero the front end of the last page */
2350	2357	if (tail_start + tail_len < ino_size) {
	2358	+ truncated_page = true;
2351	2359	ret = btrfs_truncate_page(inode,
2352	2360	tail_start + tail_len, 0, 1);
2353	2361	if (ret)
...	...	@@ -2357,8 +2365,8 @@
2357	2365	}
2358	2366
2359	2367	if (lockend < lockstart) {
2360		- mutex_unlock(&inode->i_mutex);
2361		- return 0;
	2368	+ ret = 0;
	2369	+ goto out_only_mutex;
2362	2370	}
2363	2371
2364	2372	while (1) {
...	...	@@ -2506,6 +2514,7 @@
2506	2514
2507	2515	trans->block_rsv = &root->fs_info->trans_block_rsv;
2508	2516	ret = btrfs_update_inode(trans, root, inode);
	2517	+ updated_inode = true;
2509	2518	btrfs_end_transaction(trans, root);
2510	2519	btrfs_btree_balance_dirty(root);
2511	2520	out_free:
...	...	@@ -2515,6 +2524,22 @@
2515	2524	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2516	2525	&cached_state, GFP_NOFS);
2517	2526	out_only_mutex:
	2527	+ if (!updated_inode && truncated_page && !ret && !err) {
	2528	+ /*
	2529	+ * If we only end up zeroing part of a page, we still need to
	2530	+ * update the inode item, so that all the time fields are
	2531	+ * updated as well as the necessary btrfs inode in memory fields
	2532	+ * for detecting, at fsync time, if the inode isn't yet in the
	2533	+ * log tree or it's there but not up to date.
	2534	+ */
	2535	+ trans = btrfs_start_transaction(root, 1);
	2536	+ if (IS_ERR(trans)) {
	2537	+ err = PTR_ERR(trans);
	2538	+ } else {
	2539	+ err = btrfs_update_inode(trans, root, inode);
	2540	+ ret = btrfs_end_transaction(trans, root);
	2541	+ }
	2542	+ }
2518	2543	mutex_unlock(&inode->i_mutex);
2519	2544	if (ret && !err)
2520	2545	err = ret;
...	...	@@ -7285,7 +7285,6 @@
7285	7285	((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7286	7286	em->block_start != EXTENT_MAP_HOLE)) {
7287	7287	int type;
7288		- int ret;
7289	7288	u64 block_start, orig_start, orig_block_len, ram_bytes;
7290	7289
7291	7290	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
...	...	@@ -452,10 +452,8 @@
452	452	continue;
453	453	if (entry_end(ordered) <= start)
454	454	break;
455		- if (!list_empty(&ordered->log_list))
	455	+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
456	456	continue;
457		- if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
458		- continue;
459	457	list_add(&ordered->log_list, logged_list);
460	458	atomic_inc(&ordered->refs);
461	459	}
...	...	@@ -511,8 +509,7 @@
511	509	wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
512	510	&ordered->flags));
513	511
514		- if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
515		- list_add_tail(&ordered->trans_list, &trans->ordered);
	512	+ list_add_tail(&ordered->trans_list, &trans->ordered);
516	513	spin_lock_irq(&log->log_extents_lock[index]);
517	514	}
518	515	spin_unlock_irq(&log->log_extents_lock[index]);
...	...	@@ -230,6 +230,7 @@
230	230	u64 parent_ino;
231	231	u64 ino;
232	232	u64 gen;
	233	+ bool is_orphan;
233	234	struct list_head update_refs;
234	235	};
235	236
...	...	@@ -2984,7 +2985,8 @@
2984	2985	u64 ino_gen,
2985	2986	u64 parent_ino,
2986	2987	struct list_head *new_refs,
2987		- struct list_head *deleted_refs)
	2988	+ struct list_head *deleted_refs,
	2989	+ const bool is_orphan)
2988	2990	{
2989	2991	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2990	2992	struct rb_node *parent = NULL;
...	...	@@ -2999,6 +3001,7 @@
2999	3001	pm->parent_ino = parent_ino;
3000	3002	pm->ino = ino;
3001	3003	pm->gen = ino_gen;
	3004	+ pm->is_orphan = is_orphan;
3002	3005	INIT_LIST_HEAD(&pm->list);
3003	3006	INIT_LIST_HEAD(&pm->update_refs);
3004	3007	RB_CLEAR_NODE(&pm->node);
3005	3008
3006	3009
...	...	@@ -3131,26 +3134,31 @@
3131	3134	rmdir_ino = dm->rmdir_ino;
3132	3135	free_waiting_dir_move(sctx, dm);
3133	3136
3134		- ret = get_first_ref(sctx->parent_root, pm->ino,
3135		- &parent_ino, &parent_gen, name);
	3137	+ if (pm->is_orphan) {
	3138	+ ret = gen_unique_name(sctx, pm->ino,
	3139	+ pm->gen, from_path);
	3140	+ } else {
	3141	+ ret = get_first_ref(sctx->parent_root, pm->ino,
	3142	+ &parent_ino, &parent_gen, name);
	3143	+ if (ret < 0)
	3144	+ goto out;
	3145	+ ret = get_cur_path(sctx, parent_ino, parent_gen,
	3146	+ from_path);
	3147	+ if (ret < 0)
	3148	+ goto out;
	3149	+ ret = fs_path_add_path(from_path, name);
	3150	+ }
3136	3151	if (ret < 0)
3137	3152	goto out;
3138	3153
3139		- ret = get_cur_path(sctx, parent_ino, parent_gen,
3140		- from_path);
3141		- if (ret < 0)
3142		- goto out;
3143		- ret = fs_path_add_path(from_path, name);
3144		- if (ret < 0)
3145		- goto out;
3146		-
3147	3154	sctx->send_progress = sctx->cur_ino + 1;
3148	3155	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3149	3156	if (ret) {
3150	3157	LIST_HEAD(deleted_refs);
3151	3158	ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3152	3159	ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3153		- &pm->update_refs, &deleted_refs);
	3160	+ &pm->update_refs, &deleted_refs,
	3161	+ pm->is_orphan);
3154	3162	if (ret < 0)
3155	3163	goto out;
3156	3164	if (rmdir_ino) {
...	...	@@ -3283,6 +3291,127 @@
3283	3291	return ret;
3284	3292	}
3285	3293
	3294	+/*
	3295	+ * We might need to delay a directory rename even when no ancestor directory
	3296	+ * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
	3297	+ * renamed. This happens when we rename a directory to the old name (the name
	3298	+ * in the parent root) of some other unrelated directory that got its rename
	3299	+ * delayed due to some ancestor with higher number that got renamed.
	3300	+ *
	3301	+ * Example:
	3302	+ *
	3303	+ * Parent snapshot:
	3304	+ * .                                       (ino 256)
	3305	+ * |---- a/                                (ino 257)
	3306	+ * |     |---- file                        (ino 260)
	3307	+ * |
	3308	+ * |---- b/                                (ino 258)
	3309	+ * |---- c/                                (ino 259)
	3310	+ *
	3311	+ * Send snapshot:
	3312	+ * .                                       (ino 256)
	3313	+ * |---- a/                                (ino 258)
	3314	+ * |---- x/                                (ino 259)
	3315	+ *       |---- y/                          (ino 257)
	3316	+ *             |----- file                 (ino 260)
	3317	+ *
	3318	+ * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
	3319	+ * from 'a' to 'x/y' happening first, which in turn depends on the rename of
	3320	+ * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
	3321	+ * must issue is:
	3322	+ *
	3323	+ * 1 - rename 259 from 'c' to 'x'
	3324	+ * 2 - rename 257 from 'a' to 'x/y'
	3325	+ * 3 - rename 258 from 'b' to 'a'
	3326	+ *
	3327	+ * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
	3328	+ * be done right away and < 0 on error.
	3329	+ */
	3330	+static int wait_for_dest_dir_move(struct send_ctx *sctx,
	3331	+ struct recorded_ref *parent_ref,
	3332	+ const bool is_orphan)
	3333	+{
	3334	+ struct btrfs_path *path;
	3335	+ struct btrfs_key key;
	3336	+ struct btrfs_key di_key;
	3337	+ struct btrfs_dir_item *di;
	3338	+ u64 left_gen;
	3339	+ u64 right_gen;
	3340	+ int ret = 0;
	3341	+
	3342	+ if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
	3343	+ return 0;
	3344	+
	3345	+ path = alloc_path_for_send();
	3346	+ if (!path)
	3347	+ return -ENOMEM;
	3348	+
	3349	+ key.objectid = parent_ref->dir;
	3350	+ key.type = BTRFS_DIR_ITEM_KEY;
	3351	+ key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
	3352	+
	3353	+ ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
	3354	+ if (ret < 0) {
	3355	+ goto out;
	3356	+ } else if (ret > 0) {
	3357	+ ret = 0;
	3358	+ goto out;
	3359	+ }
	3360	+
	3361	+ di = btrfs_match_dir_item_name(sctx->parent_root, path,
	3362	+ parent_ref->name, parent_ref->name_len);
	3363	+ if (!di) {
	3364	+ ret = 0;
	3365	+ goto out;
	3366	+ }
	3367	+ /*
	3368	+ * di_key.objectid has the number of the inode that has a dentry in the
	3369	+ * parent directory with the same name that sctx->cur_ino is being
	3370	+ * renamed to. We need to check if that inode is in the send root as
	3371	+ * well and if it is currently marked as an inode with a pending rename,
	3372	+ * if it is, we need to delay the rename of sctx->cur_ino as well, so
	3373	+ * that it happens after that other inode is renamed.
	3374	+ */
	3375	+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
	3376	+ if (di_key.type != BTRFS_INODE_ITEM_KEY) {
	3377	+ ret = 0;
	3378	+ goto out;
	3379	+ }
	3380	+
	3381	+ ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
	3382	+ &left_gen, NULL, NULL, NULL, NULL);
	3383	+ if (ret < 0)
	3384	+ goto out;
	3385	+ ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
	3386	+ &right_gen, NULL, NULL, NULL, NULL);
	3387	+ if (ret < 0) {
	3388	+ if (ret == -ENOENT)
	3389	+ ret = 0;
	3390	+ goto out;
	3391	+ }
	3392	+
	3393	+ /* Different inode, no need to delay the rename of sctx->cur_ino */
	3394	+ if (right_gen != left_gen) {
	3395	+ ret = 0;
	3396	+ goto out;
	3397	+ }
	3398	+
	3399	+ if (is_waiting_for_move(sctx, di_key.objectid)) {
	3400	+ ret = add_pending_dir_move(sctx,
	3401	+ sctx->cur_ino,
	3402	+ sctx->cur_inode_gen,
	3403	+ di_key.objectid,
	3404	+ &sctx->new_refs,
	3405	+ &sctx->deleted_refs,
	3406	+ is_orphan);
	3407	+ if (!ret)
	3408	+ ret = 1;
	3409	+ }
	3410	+out:
	3411	+ btrfs_free_path(path);
	3412	+ return ret;
	3413	+}
	3414	+
3286	3415	static int wait_for_parent_move(struct send_ctx *sctx,
3287	3416	struct recorded_ref *parent_ref)
3288	3417	{
...	...	@@ -3349,7 +3478,8 @@
3349	3478	sctx->cur_inode_gen,
3350	3479	ino,
3351	3480	&sctx->new_refs,
3352		- &sctx->deleted_refs);
	3481	+ &sctx->deleted_refs,
	3482	+ false);
3353	3483	if (!ret)
3354	3484	ret = 1;
3355	3485	}
...	...	@@ -3372,6 +3502,7 @@
3372	3502	int did_overwrite = 0;
3373	3503	int is_orphan = 0;
3374	3504	u64 last_dir_ino_rm = 0;
	3505	+ bool can_rename = true;
3375	3506
3376	3507	verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3377	3508
3378	3509
...	...	@@ -3490,12 +3621,22 @@
3490	3621	}
3491	3622	}
3492	3623
	3624	+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
	3625	+ ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
	3626	+ if (ret < 0)
	3627	+ goto out;
	3628	+ if (ret == 1) {
	3629	+ can_rename = false;
	3630	+ *pending_move = 1;
	3631	+ }
	3632	+ }
	3633	+
3493	3634	/*
3494	3635	* link/move the ref to the new place. If we have an orphan
3495	3636	* inode, move it and update valid_path. If not, link or move
3496	3637	* it depending on the inode mode.
3497	3638	*/
3498		- if (is_orphan) {
	3639	+ if (is_orphan && can_rename) {
3499	3640	ret = send_rename(sctx, valid_path, cur->full_path);
3500	3641	if (ret < 0)
3501	3642	goto out;
...	...	@@ -3503,7 +3644,7 @@
3503	3644	ret = fs_path_copy(valid_path, cur->full_path);
3504	3645	if (ret < 0)
3505	3646	goto out;
3506		- } else {
	3647	+ } else if (can_rename) {
3507	3648	if (S_ISDIR(sctx->cur_inode_mode)) {
3508	3649	/*
3509	3650	* Dirs can't be linked, so move it. For moved
...	...	@@ -1052,9 +1052,6 @@
1052	1052	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1053	1053	if (ret)
1054	1054	return ret;
1055		- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1056		- if (ret)
1057		- return ret;
1058	1055	}
1059	1056
1060	1057	return 0;
...	...	@@ -1012,7 +1012,7 @@
1012	1012	base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1013	1013
1014	1014	while (cur_offset < item_size) {
1015		- extref = (struct btrfs_inode_extref *)base + cur_offset;
	1015	+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
1016	1016
1017	1017	victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1018	1018
...	...	@@ -111,6 +111,8 @@
111	111	name, name_len, -1);
112	112	if (!di && (flags & XATTR_REPLACE))
113	113	ret = -ENODATA;
	114	+ else if (IS_ERR(di))
	115	+ ret = PTR_ERR(di);
114	116	else if (di)
115	117	ret = btrfs_delete_one_dir_name(trans, root, path, di);
116	118	goto out;
117	119
118	120
...	...	@@ -127,10 +129,12 @@
127	129	ASSERT(mutex_is_locked(&inode->i_mutex));
128	130	di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
129	131	name, name_len, 0);
130		- if (!di) {
	132	+ if (!di)
131	133	ret = -ENODATA;
	134	+ else if (IS_ERR(di))
	135	+ ret = PTR_ERR(di);
	136	+ if (ret)
132	137	goto out;
133		- }
134	138	btrfs_release_path(path);
135	139	di = NULL;
136	140	}