Commit fc36ed7e0b13955ba66fc56dc5067e67ac105150

Authored by Jan Schmidt
Committed by Josef Bacik
1 parent 6d49ba1b47

Btrfs: separate sequence numbers for delayed ref tracking and tree mod log

Sequence numbers for delayed refs were introduced in the first version of
the qgroup patch set. Later, the tree mod log was introduced to solve the
problem of find_all_roots on a busy file system, and it simply shared the
delayed refs' sequence numbers.

However, one statement in qgroup's quota accounting still needs the previous
sequence number and computes it as (seq - 1), just as it had to in the very
first version.

To satisfy that requirement, this patch makes the sequence number counter 64
bits wide and splits it into a major part (used for qgroup sequence number
counting) and a minor part (incremented for each tree modification in the
log). This lets us go back exactly one major step, as qgroups require, while
still incrementing the sequence counter for each tree mod log insertion to
keep track of their order. Keeping both parts in a single variable means none
of the code that compares two sequence numbers needs to change.
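
To illustrate the encoding, here is a minimal userspace model of the split
counter (an editor's sketch only: the kernel code below works on an atomic64_t
under fs_info->tree_mod_seq_lock, and these standalone names are hypothetical):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Model of the 64 bit counter: major in the upper 32 bits, minor in the lower. */
static uint64_t tree_mod_seq;

/* Start a new major number; the minor half restarts at zero. */
static uint64_t inc_seq_major(void)
{
	tree_mod_seq = (tree_mod_seq & 0xffffffff00000000ull) + (1ull << 32);
	return tree_mod_seq;
}

/* Hand out the next minor number within the current major. */
static uint64_t inc_seq_minor(void)
{
	return ++tree_mod_seq;
}

/* Last minor of the previous major, i.e. exactly one major step back. */
static uint64_t seq_prev(uint64_t seq)
{
	return (seq & 0xffffffff00000000ull) - 1ull;
}

int main(void)
{
	uint64_t seq;

	inc_seq_major();		/* 0x1.0 */
	inc_seq_minor();		/* 0x1.1 */
	seq = inc_seq_minor();		/* 0x1.2 */
	assert(seq == (1ull << 32) + 2);

	inc_seq_major();		/* 0x2.0 */
	seq = inc_seq_minor();		/* 0x2.1 */

	/* One major step back lands after every minor of major 1. */
	assert(seq_prev(seq) == (2ull << 32) - 1);	/* 0x1.ffffffff */

	printf("seq %#x.%x, prev %#x.%x\n",
	       (uint32_t)(seq >> 32), (uint32_t)seq,
	       (uint32_t)(seq_prev(seq) >> 32), (uint32_t)seq_prev(seq));
	return 0;
}

The major.minor hex pair printed at the end matches the %#x.%x format this
patch introduces for the debug and error messages.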

The sequence number is reset to 0 on commit (not new in this patch), which
ensures neither 32 bit half can overflow: that would take more than 2^32
major or minor increments within a single transaction.

Without this fix, qgroup tracking can occasionally go wrong and the tree mod
log code can trigger WARN_ONs.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

Showing 7 changed files with 63 additions and 19 deletions

fs/btrfs/ctree.c
... ... @@ -361,6 +361,44 @@
361 361 }
362 362  
363 363 /*
  364 + * Increment the upper half of tree_mod_seq, set lower half zero.
  365 + *
  366 + * Must be called with fs_info->tree_mod_seq_lock held.
  367 + */
  368 +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
  369 +{
  370 + u64 seq = atomic64_read(&fs_info->tree_mod_seq);
  371 + seq &= 0xffffffff00000000ull;
  372 + seq += 1ull << 32;
  373 + atomic64_set(&fs_info->tree_mod_seq, seq);
  374 + return seq;
  375 +}
  376 +
  377 +/*
  378 + * Increment the lower half of tree_mod_seq.
  379 + *
  380 + * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
  381 + * are generated should not technically require a spin lock here. (Rationale:
  382 + * incrementing the minor while an increment of the major is between its
  383 + * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers;
  384 + * it just returns a unique sequence number as usual.) We have decided to keep
  385 + * the requirement here anyway and to rethink it if it ever proves to be a
  386 + * problem for some workload.
  387 + */
  388 +static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
  389 +{
  390 + return atomic64_inc_return(&fs_info->tree_mod_seq);
  391 +}
  392 +
  393 +/*
  394 + * return the last minor in the previous major tree_mod_seq number
  395 + */
  396 +u64 btrfs_tree_mod_seq_prev(u64 seq)
  397 +{
  398 + return (seq & 0xffffffff00000000ull) - 1ull;
  399 +}
  400 +
  401 +/*
364 402 * This adds a new blocker to the tree mod log's blocker list if the @elem
365 403 * passed does not already have a sequence number set. So when a caller expects
366 404 * to record tree modifications, it should ensure to set elem->seq to zero
367 405  
... ... @@ -376,10 +414,10 @@
376 414 tree_mod_log_write_lock(fs_info);
377 415 spin_lock(&fs_info->tree_mod_seq_lock);
378 416 if (!elem->seq) {
379   - elem->seq = btrfs_inc_tree_mod_seq(fs_info);
  417 + elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
380 418 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
381 419 }
382   - seq = btrfs_inc_tree_mod_seq(fs_info);
  420 + seq = btrfs_inc_tree_mod_seq_minor(fs_info);
383 421 spin_unlock(&fs_info->tree_mod_seq_lock);
384 422 tree_mod_log_write_unlock(fs_info);
385 423  
... ... @@ -524,7 +562,10 @@
524 562 if (!tm)
525 563 return -ENOMEM;
526 564  
527   - tm->seq = btrfs_inc_tree_mod_seq(fs_info);
  565 + spin_lock(&fs_info->tree_mod_seq_lock);
  566 + tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
  567 + spin_unlock(&fs_info->tree_mod_seq_lock);
  568 +
528 569 return tm->seq;
529 570 }
530 571  
fs/btrfs/ctree.h
... ... @@ -1422,7 +1422,7 @@
1422 1422  
1423 1423 /* this protects tree_mod_seq_list */
1424 1424 spinlock_t tree_mod_seq_lock;
1425   - atomic_t tree_mod_seq;
  1425 + atomic64_t tree_mod_seq;
1426 1426 struct list_head tree_mod_seq_list;
1427 1427 struct seq_list tree_mod_seq_elem;
1428 1428  
... ... @@ -3332,10 +3332,7 @@
3332 3332 struct seq_list *elem);
3333 3333 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3334 3334 struct seq_list *elem);
3335   -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
3336   -{
3337   - return atomic_inc_return(&fs_info->tree_mod_seq);
3338   -}
  3335 +u64 btrfs_tree_mod_seq_prev(u64 seq);
3339 3336 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
3340 3337  
3341 3338 /* root-item.c */
fs/btrfs/delayed-ref.c
... ... @@ -361,8 +361,10 @@
361 361 elem = list_first_entry(&fs_info->tree_mod_seq_list,
362 362 struct seq_list, list);
363 363 if (seq >= elem->seq) {
364   - pr_debug("holding back delayed_ref %llu, lowest is "
365   - "%llu (%p)\n", seq, elem->seq, delayed_refs);
  364 + pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
  365 + (u32)(seq >> 32), (u32)seq,
  366 + (u32)(elem->seq >> 32), (u32)elem->seq,
  367 + delayed_refs);
366 368 ret = 1;
367 369 }
368 370 }
fs/btrfs/disk-io.c
... ... @@ -2157,7 +2157,7 @@
2157 2157 atomic_set(&fs_info->async_submit_draining, 0);
2158 2158 atomic_set(&fs_info->nr_async_bios, 0);
2159 2159 atomic_set(&fs_info->defrag_running, 0);
2160   - atomic_set(&fs_info->tree_mod_seq, 0);
  2160 + atomic64_set(&fs_info->tree_mod_seq, 0);
2161 2161 fs_info->sb = sb;
2162 2162 fs_info->max_inline = 8192 * 1024;
2163 2163 fs_info->metadata_ratio = 0;
fs/btrfs/extent-tree.c
... ... @@ -2541,9 +2541,10 @@
2541 2541 !trans->delayed_ref_elem.seq) {
2542 2542 /* list without seq or seq without list */
2543 2543 btrfs_err(fs_info,
2544   - "qgroup accounting update error, list is%s empty, seq is %llu",
  2544 + "qgroup accounting update error, list is%s empty, seq is %#x.%x",
2545 2545 list_empty(&trans->qgroup_ref_list) ? "" : " not",
2546   - trans->delayed_ref_elem.seq);
  2546 + (u32)(trans->delayed_ref_elem.seq >> 32),
  2547 + (u32)trans->delayed_ref_elem.seq);
2547 2548 BUG();
2548 2549 }
2549 2550  
fs/btrfs/qgroup.c
... ... @@ -1242,9 +1242,11 @@
1242 1242 case BTRFS_ADD_DELAYED_REF:
1243 1243 case BTRFS_ADD_DELAYED_EXTENT:
1244 1244 sgn = 1;
  1245 + seq = btrfs_tree_mod_seq_prev(node->seq);
1245 1246 break;
1246 1247 case BTRFS_DROP_DELAYED_REF:
1247 1248 sgn = -1;
  1249 + seq = node->seq;
1248 1250 break;
1249 1251 case BTRFS_UPDATE_DELAYED_HEAD:
1250 1252 return 0;
1251 1253  
... ... @@ -1254,14 +1256,14 @@
1254 1256  
1255 1257 /*
1256 1258 * the delayed ref sequence number we pass depends on the direction of
1257   - * the operation. for add operations, we pass (node->seq - 1) to skip
  1259 + * the operation. for add operations, we pass
  1260 + * btrfs_tree_mod_seq_prev(node->seq) to skip
1258 1261 * the delayed ref's current sequence number, because we need the state
1259 1262 * of the tree before the add operation. for delete operations, we pass
1260 1263 * (node->seq) to include the delayed ref's current sequence number,
1261 1264 * because we need the state of the tree after the delete operation.
1262 1265 */
1263   - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
1264   - sgn > 0 ? node->seq - 1 : node->seq, &roots);
  1266 + ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
1265 1267 if (ret < 0)
1266 1268 return ret;
1267 1269  
1268 1270  
... ... @@ -1772,9 +1774,10 @@
1772 1774 {
1773 1775 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
1774 1776 return;
1775   - printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
  1777 + pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
1776 1778 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
1777   - trans->delayed_ref_elem.seq);
  1779 + (u32)(trans->delayed_ref_elem.seq >> 32),
  1780 + (u32)trans->delayed_ref_elem.seq);
1778 1781 BUG();
1779 1782 }
fs/btrfs/transaction.c
... ... @@ -162,7 +162,7 @@
162 162 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
163 163 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
164 164 "creating a fresh transaction\n");
165   - atomic_set(&fs_info->tree_mod_seq, 0);
  165 + atomic64_set(&fs_info->tree_mod_seq, 0);
166 166  
167 167 spin_lock_init(&cur_trans->commit_lock);
168 168 spin_lock_init(&cur_trans->delayed_refs.lock);
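
To make the interplay of the two halves concrete (and to spell out the
comment in the qgroup.c hunk above), a hypothetical allocation trace over a
single transaction could look like this; the values are purely illustrative,
printed major.minor as in the debug output:

  btrfs_get_tree_mod_seq(elem A): major -> 0x1.0 (stored in elem A->seq)
                                  minor -> 0x1.1 (returned)
  tree mod log insertions:        minor -> 0x1.2, 0x1.3, ...
  btrfs_get_tree_mod_seq(elem B): major -> 0x2.0 (stored in elem B->seq)
                                  minor -> 0x2.1 (returned)

  btrfs_tree_mod_seq_prev(0x2.1) == 0x1.ffffffff, which orders after every
  modification logged under major 1 and before everything under major 2,
  i.e. exactly the one major step backwards that the qgroup accounting needs.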