Commit fc36ed7e0b13955ba66fc56dc5067e67ac105150

Authored by Jan Schmidt
Committed by Josef Bacik
1 parent 6d49ba1b47

Btrfs: separate sequence numbers for delayed ref tracking and tree mod log

Sequence numbers for delayed refs were introduced in the first version of
the qgroup patch set. Later, the tree mod log was introduced to solve the
problem of find_all_roots on a busy file system, and it simply shared the
delayed refs' sequence numbers.

However, one statement in qgroup's quota accounting still needs the previous
sequence number and computes it as (seq - 1), just as it had to in the very
first version.

To satisfy that requirement, this patch makes the sequence number counter 64
bits wide and splits it into a major part (used for qgroup sequence number
counting) and a minor part (incremented for each tree modification in the
log). This lets us go back exactly one major step, as qgroups require, while
still incrementing the sequence counter for each tree mod log insertion to
keep track of their order. Keeping both parts in a single variable means none
of the code that compares two sequence numbers needs to change.
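
To illustrate the encoding, here is a minimal userspace model of the split
counter (an editor's sketch only: the kernel code below works on an atomic64_t
under fs_info->tree_mod_seq_lock, and these standalone names are hypothetical):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Model of the 64 bit counter: major in the upper 32 bits, minor in the lower. */
static uint64_t tree_mod_seq;

/* Start a new major number; the minor half restarts at zero. */
static uint64_t inc_seq_major(void)
{
	tree_mod_seq = (tree_mod_seq & 0xffffffff00000000ull) + (1ull << 32);
	return tree_mod_seq;
}

/* Hand out the next minor number within the current major. */
static uint64_t inc_seq_minor(void)
{
	return ++tree_mod_seq;
}

/* Last minor of the previous major, i.e. exactly one major step back. */
static uint64_t seq_prev(uint64_t seq)
{
	return (seq & 0xffffffff00000000ull) - 1ull;
}

int main(void)
{
	uint64_t seq;

	inc_seq_major();		/* 0x1.0 */
	inc_seq_minor();		/* 0x1.1 */
	seq = inc_seq_minor();		/* 0x1.2 */
	assert(seq == (1ull << 32) + 2);

	inc_seq_major();		/* 0x2.0 */
	seq = inc_seq_minor();		/* 0x2.1 */

	/* One major step back lands after every minor of major 1. */
	assert(seq_prev(seq) == (2ull << 32) - 1);	/* 0x1.ffffffff */

	printf("seq %#x.%x, prev %#x.%x\n",
	       (uint32_t)(seq >> 32), (uint32_t)seq,
	       (uint32_t)(seq_prev(seq) >> 32), (uint32_t)seq_prev(seq));
	return 0;
}

The major.minor hex pair printed at the end matches the %#x.%x format this
patch introduces for the debug and error messages.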

The sequence number is reset to 0 on commit (not new in this patch), which
ensures neither 32 bit half can overflow: that would take more than 2^32
major or minor increments within a single transaction.

Without this fix, qgroup tracking can occasionally go wrong and the tree mod
log code can trigger WARN_ONs.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

Showing 7 changed files with 63 additions and 19 deletions

fs/btrfs/ctree.c
... ... @@ -361,6 +361,44 @@
361 361 }
362 362  
363 363 /*
  364 + * Increment the upper half of tree_mod_seq, set lower half zero.
  365 + *
  366 + * Must be called with fs_info->tree_mod_seq_lock held.
  367 + */
  368 +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
  369 +{
  370 + u64 seq = atomic64_read(&fs_info->tree_mod_seq);
  371 + seq &= 0xffffffff00000000ull;
  372 + seq += 1ull << 32;
  373 + atomic64_set(&fs_info->tree_mod_seq, seq);
  374 + return seq;
  375 +}
  376 +
  377 +/*
  378 + * Increment the lower half of tree_mod_seq.
  379 + *
  380 + * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
  381 + * are generated should not technically require a spin lock here. (Rationale:
  382 + * incrementing the minor while an increment of the major is between its
  383 + * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers;
  384 + * it just returns a unique sequence number as usual.) We have decided to keep
  385 + * the requirement here anyway and to rethink it if it ever proves to be a
  386 + * problem for some workload.
  387 + */
  388 +static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
  389 +{
  390 + return atomic64_inc_return(&fs_info->tree_mod_seq);
  391 +}
  392 +
  393 +/*
  394 + * return the last minor in the previous major tree_mod_seq number
  395 + */
  396 +u64 btrfs_tree_mod_seq_prev(u64 seq)
  397 +{
  398 + return (seq & 0xffffffff00000000ull) - 1ull;
  399 +}
  400 +
  401 +/*
364 402 * This adds a new blocker to the tree mod log's blocker list if the @elem
365 403 * passed does not already have a sequence number set. So when a caller expects
366 404 * to record tree modifications, it should ensure to set elem->seq to zero
367 405  
... ... @@ -376,10 +414,10 @@
376 414 tree_mod_log_write_lock(fs_info);
377 415 spin_lock(&fs_info->tree_mod_seq_lock);
378 416 if (!elem->seq) {
379   - elem->seq = btrfs_inc_tree_mod_seq(fs_info);
  417 + elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
380 418 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
381 419 }
382   - seq = btrfs_inc_tree_mod_seq(fs_info);
  420 + seq = btrfs_inc_tree_mod_seq_minor(fs_info);
383 421 spin_unlock(&fs_info->tree_mod_seq_lock);
384 422 tree_mod_log_write_unlock(fs_info);
385 423  
... ... @@ -524,7 +562,10 @@
524 562 if (!tm)
525 563 return -ENOMEM;
526 564  
527   - tm->seq = btrfs_inc_tree_mod_seq(fs_info);
  565 + spin_lock(&fs_info->tree_mod_seq_lock);
  566 + tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
  567 + spin_unlock(&fs_info->tree_mod_seq_lock);
  568 +
528 569 return tm->seq;
529 570 }
530 571  
fs/btrfs/ctree.h
... ... @@ -1422,7 +1422,7 @@
1422 1422  
1423 1423 /* this protects tree_mod_seq_list */
1424 1424 spinlock_t tree_mod_seq_lock;
1425   - atomic_t tree_mod_seq;
  1425 + atomic64_t tree_mod_seq;
1426 1426 struct list_head tree_mod_seq_list;
1427 1427 struct seq_list tree_mod_seq_elem;
1428 1428  
... ... @@ -3332,10 +3332,7 @@
3332 3332 struct seq_list *elem);
3333 3333 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3334 3334 struct seq_list *elem);
3335   -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
3336   -{
3337   - return atomic_inc_return(&fs_info->tree_mod_seq);
3338   -}
  3335 +u64 btrfs_tree_mod_seq_prev(u64 seq);
3339 3336 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
3340 3337  
3341 3338 /* root-item.c */
fs/btrfs/delayed-ref.c
... ... @@ -361,8 +361,10 @@
361 361 elem = list_first_entry(&fs_info->tree_mod_seq_list,
362 362 struct seq_list, list);
363 363 if (seq >= elem->seq) {
364   - pr_debug("holding back delayed_ref %llu, lowest is "
365   - "%llu (%p)\n", seq, elem->seq, delayed_refs);
  364 + pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
  365 + (u32)(seq >> 32), (u32)seq,
  366 + (u32)(elem->seq >> 32), (u32)elem->seq,
  367 + delayed_refs);
366 368 ret = 1;
367 369 }
368 370 }
fs/btrfs/disk-io.c
... ... @@ -2157,7 +2157,7 @@
2157 2157 atomic_set(&fs_info->async_submit_draining, 0);
2158 2158 atomic_set(&fs_info->nr_async_bios, 0);
2159 2159 atomic_set(&fs_info->defrag_running, 0);
2160   - atomic_set(&fs_info->tree_mod_seq, 0);
  2160 + atomic64_set(&fs_info->tree_mod_seq, 0);
2161 2161 fs_info->sb = sb;
2162 2162 fs_info->max_inline = 8192 * 1024;
2163 2163 fs_info->metadata_ratio = 0;
fs/btrfs/extent-tree.c
... ... @@ -2541,9 +2541,10 @@
2541 2541 !trans->delayed_ref_elem.seq) {
2542 2542 /* list without seq or seq without list */
2543 2543 btrfs_err(fs_info,
2544   - "qgroup accounting update error, list is%s empty, seq is %llu",
  2544 + "qgroup accounting update error, list is%s empty, seq is %#x.%x",
2545 2545 list_empty(&trans->qgroup_ref_list) ? "" : " not",
2546   - trans->delayed_ref_elem.seq);
  2546 + (u32)(trans->delayed_ref_elem.seq >> 32),
  2547 + (u32)trans->delayed_ref_elem.seq);
2547 2548 BUG();
2548 2549 }
2549 2550  
fs/btrfs/qgroup.c
... ... @@ -1242,9 +1242,11 @@
1242 1242 case BTRFS_ADD_DELAYED_REF:
1243 1243 case BTRFS_ADD_DELAYED_EXTENT:
1244 1244 sgn = 1;
  1245 + seq = btrfs_tree_mod_seq_prev(node->seq);
1245 1246 break;
1246 1247 case BTRFS_DROP_DELAYED_REF:
1247 1248 sgn = -1;
  1249 + seq = node->seq;
1248 1250 break;
1249 1251 case BTRFS_UPDATE_DELAYED_HEAD:
1250 1252 return 0;
1251 1253  
... ... @@ -1254,14 +1256,14 @@
1254 1256  
1255 1257 /*
1256 1258 * the delayed ref sequence number we pass depends on the direction of
1257   - * the operation. for add operations, we pass (node->seq - 1) to skip
  1259 + * the operation. for add operations, we pass
  1260 + * btrfs_tree_mod_seq_prev(node->seq) to skip
1258 1261 * the delayed ref's current sequence number, because we need the state
1259 1262 * of the tree before the add operation. for delete operations, we pass
1260 1263 * (node->seq) to include the delayed ref's current sequence number,
1261 1264 * because we need the state of the tree after the delete operation.
1262 1265 */
1263   - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
1264   - sgn > 0 ? node->seq - 1 : node->seq, &roots);
  1266 + ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
1265 1267 if (ret < 0)
1266 1268 return ret;
1267 1269  
1268 1270  
... ... @@ -1772,9 +1774,10 @@
1772 1774 {
1773 1775 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
1774 1776 return;
1775   - printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
  1777 + pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
1776 1778 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
1777   - trans->delayed_ref_elem.seq);
  1779 + (u32)(trans->delayed_ref_elem.seq >> 32),
  1780 + (u32)trans->delayed_ref_elem.seq);
1778 1781 BUG();
1779 1782 }
fs/btrfs/transaction.c
... ... @@ -162,7 +162,7 @@
162 162 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
163 163 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
164 164 "creating a fresh transaction\n");
165   - atomic_set(&fs_info->tree_mod_seq, 0);
  165 + atomic64_set(&fs_info->tree_mod_seq, 0);
166 166  
167 167 spin_lock_init(&cur_trans->commit_lock);
168 168 spin_lock_init(&cur_trans->delayed_refs.lock);
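
To make the interplay of the two halves concrete (and to spell out the
comment in the qgroup.c hunk above), a hypothetical allocation trace over a
single transaction could look like this; the values are purely illustrative,
printed major.minor as in the debug output:

  btrfs_get_tree_mod_seq(elem A): major -> 0x1.0 (stored in elem A->seq)
                                  minor -> 0x1.1 (returned)
  tree mod log insertions:        minor -> 0x1.2, 0x1.3, ...
  btrfs_get_tree_mod_seq(elem B): major -> 0x2.0 (stored in elem B->seq)
                                  minor -> 0x2.1 (returned)

  btrfs_tree_mod_seq_prev(0x2.1) == 0x1.ffffffff, which orders after every
  modification logged under major 1 and before everything under major 2,
  i.e. exactly the one major step backwards that the qgroup accounting needs.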