Commit c5966bf78165ff493aefe87f4e81b267858764c5

Authored by Filipe Manana
Committed by Greg Kroah-Hartman
1 parent 46da9e765e

Btrfs: fix fsync data loss after adding hard link to inode

commit 1a4bcf470c886b955adf36486f4c86f2441d85cb upstream.

We have a scenario where after the fsync log replay we can lose file data
that had been previously fsync'ed if we added an hard link for our inode
and after that we sync'ed the fsync log (for example by fsync'ing some
other file or directory).

This is because when adding an hard link we updated the inode item in the
log tree with an i_size value of 0. At that point the new inode item was
in memory only and a subsequent fsync log replay would not make us lose
the file data. However if after adding the hard link we sync the log tree
to disk, by fsync'ing some other file or directory for example, we ended
up losing the file data after log replay, because the inode item in the
persisted log tree had an an i_size of zero.

This is easy to reproduce, and the following excerpt from my test for
xfstests shows this:

  _scratch_mkfs >> $seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create one file with data and fsync it.
  # This made the btrfs fsync log persist the data and the inode metadata with
  # a correct inode->i_size (4096 bytes).
  $XFS_IO_PROG -f -c "pwrite -S 0xaa -b 4K 0 4K" -c "fsync" \
       $SCRATCH_MNT/foo | _filter_xfs_io

  # Now add one hard link to our file. This made the btrfs code update the fsync
  # log, in memory only, with an inode metadata having a size of 0.
  ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link

  # Now force persistence of the fsync log to disk, for example, by fsyncing some
  # other file.
  touch $SCRATCH_MNT/bar
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar

  # Before a power loss or crash, we could read the 4Kb of data from our file as
  # expected.
  echo "File content before:"
  od -t x1 $SCRATCH_MNT/foo

  # Simulate a crash/power loss.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  # After the fsync log replay, because the fsync log had a value of 0 for our
  # inode's i_size, we couldn't read anymore the 4Kb of data that we previously
  # wrote and fsync'ed. The size of the file became 0 after the fsync log replay.
  echo "File content after:"
  od -t x1 $SCRATCH_MNT/foo

Another alternative test, that doesn't need to fsync an inode in the same
transaction it was created, is:

  _scratch_mkfs >> $seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create our test file with some data.
  $XFS_IO_PROG -f -c "pwrite -S 0xaa -b 8K 0 8K" \
       $SCRATCH_MNT/foo | _filter_xfs_io

  # Make sure the file is durably persisted.
  sync

  # Append some data to our file, to increase its size.
  $XFS_IO_PROG -f -c "pwrite -S 0xcc -b 4K 8K 4K" \
       $SCRATCH_MNT/foo | _filter_xfs_io

  # Fsync the file, so from this point on if a crash/power failure happens, our
  # new data is guaranteed to be there next time the fs is mounted.
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

  # Add one hard link to our file. This made btrfs write into the in memory fsync
  # log a special inode with generation 0 and an i_size of 0 too. Note that this
  # didn't update the inode in the fsync log on disk.
  ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link

  # Now make sure the in memory fsync log is durably persisted.
  # Creating and fsync'ing another file will do it.
  touch $SCRATCH_MNT/bar
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar

  # As expected, before the crash/power failure, we should be able to read the
  # 12Kb of file data.
  echo "File content before:"
  od -t x1 $SCRATCH_MNT/foo

  # Simulate a crash/power loss.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  # After mounting the fs again, the fsync log was replayed.
  # The btrfs fsync log replay code didn't update the i_size of the persisted
  # inode because the inode item in the log had a special generation with a
  # value of 0 (and it couldn't know the correct i_size, since that inode item
  # had a 0 i_size too). This made the last 4Kb of file data inaccessible and
  # effectively lost.
  echo "File content after:"
  od -t x1 $SCRATCH_MNT/foo

This isn't a new issue/regression. This problem has been around since the
log tree code was added in 2008:

  Btrfs: Add a write ahead tree log to optimize synchronous operations
  (commit e02119d5a7b4396c5a872582fddc8bd6d305a70a)

Test cases for xfstests follow soon.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 73 additions and 9 deletions Side-by-side Diff

... ... @@ -488,8 +488,20 @@
488 488 src_item = (struct btrfs_inode_item *)src_ptr;
489 489 dst_item = (struct btrfs_inode_item *)dst_ptr;
490 490  
491   - if (btrfs_inode_generation(eb, src_item) == 0)
  491 + if (btrfs_inode_generation(eb, src_item) == 0) {
  492 + struct extent_buffer *dst_eb = path->nodes[0];
  493 +
  494 + if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
  495 + S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
  496 + struct btrfs_map_token token;
  497 + u64 ino_size = btrfs_inode_size(eb, src_item);
  498 +
  499 + btrfs_init_map_token(&token);
  500 + btrfs_set_token_inode_size(dst_eb, dst_item,
  501 + ino_size, &token);
  502 + }
492 503 goto no_copy;
  504 + }
493 505  
494 506 if (overwrite_root &&
495 507 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
... ... @@ -3228,7 +3240,8 @@
3228 3240 static void fill_inode_item(struct btrfs_trans_handle *trans,
3229 3241 struct extent_buffer *leaf,
3230 3242 struct btrfs_inode_item *item,
3231   - struct inode *inode, int log_inode_only)
  3243 + struct inode *inode, int log_inode_only,
  3244 + u64 logged_isize)
3232 3245 {
3233 3246 struct btrfs_map_token token;
3234 3247  
... ... @@ -3241,7 +3254,7 @@
3241 3254 * to say 'update this inode with these values'
3242 3255 */
3243 3256 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3244   - btrfs_set_token_inode_size(leaf, item, 0, &token);
  3257 + btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3245 3258 } else {
3246 3259 btrfs_set_token_inode_generation(leaf, item,
3247 3260 BTRFS_I(inode)->generation,
... ... @@ -3293,7 +3306,7 @@
3293 3306 return ret;
3294 3307 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3295 3308 struct btrfs_inode_item);
3296   - fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
  3309 + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3297 3310 btrfs_release_path(path);
3298 3311 return 0;
3299 3312 }
... ... @@ -3302,7 +3315,8 @@
3302 3315 struct inode *inode,
3303 3316 struct btrfs_path *dst_path,
3304 3317 struct btrfs_path *src_path, u64 *last_extent,
3305   - int start_slot, int nr, int inode_only)
  3318 + int start_slot, int nr, int inode_only,
  3319 + u64 logged_isize)
3306 3320 {
3307 3321 unsigned long src_offset;
3308 3322 unsigned long dst_offset;
... ... @@ -3359,7 +3373,8 @@
3359 3373 dst_path->slots[0],
3360 3374 struct btrfs_inode_item);
3361 3375 fill_inode_item(trans, dst_path->nodes[0], inode_item,
3362   - inode, inode_only == LOG_INODE_EXISTS);
  3376 + inode, inode_only == LOG_INODE_EXISTS,
  3377 + logged_isize);
3363 3378 } else {
3364 3379 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3365 3380 src_offset, ins_sizes[i]);
... ... @@ -3911,6 +3926,33 @@
3911 3926 return ret;
3912 3927 }
3913 3928  
  3929 +static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
  3930 + struct btrfs_path *path, u64 *size_ret)
  3931 +{
  3932 + struct btrfs_key key;
  3933 + int ret;
  3934 +
  3935 + key.objectid = btrfs_ino(inode);
  3936 + key.type = BTRFS_INODE_ITEM_KEY;
  3937 + key.offset = 0;
  3938 +
  3939 + ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
  3940 + if (ret < 0) {
  3941 + return ret;
  3942 + } else if (ret > 0) {
  3943 + *size_ret = i_size_read(inode);
  3944 + } else {
  3945 + struct btrfs_inode_item *item;
  3946 +
  3947 + item = btrfs_item_ptr(path->nodes[0], path->slots[0],
  3948 + struct btrfs_inode_item);
  3949 + *size_ret = btrfs_inode_size(path->nodes[0], item);
  3950 + }
  3951 +
  3952 + btrfs_release_path(path);
  3953 + return 0;
  3954 +}
  3955 +
3914 3956 /* log a single inode in the tree log.
3915 3957 * At least one parent directory for this inode must exist in the tree
3916 3958 * or be logged already.
... ... @@ -3948,6 +3990,7 @@
3948 3990 bool fast_search = false;
3949 3991 u64 ino = btrfs_ino(inode);
3950 3992 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
  3993 + u64 logged_isize = 0;
3951 3994  
3952 3995 path = btrfs_alloc_path();
3953 3996 if (!path)
... ... @@ -4001,6 +4044,25 @@
4001 4044 max_key_type = BTRFS_XATTR_ITEM_KEY;
4002 4045 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4003 4046 } else {
  4047 + if (inode_only == LOG_INODE_EXISTS) {
  4048 + /*
  4049 + * Make sure the new inode item we write to the log has
  4050 + * the same isize as the current one (if it exists).
  4051 + * This is necessary to prevent data loss after log
  4052 + * replay, and also to prevent doing a wrong expanding
  4053 + * truncate - for e.g. create file, write 4K into offset
  4054 + * 0, fsync, write 4K into offset 4096, add hard link,
  4055 + * fsync some other file (to sync log), power fail - if
  4056 + * we use the inode's current i_size, after log replay
  4057 + * we get a 8Kb file, with the last 4Kb extent as a hole
  4058 + * (zeroes), as if an expanding truncate happened,
  4059 + * instead of getting a file of 4Kb only.
  4060 + */
  4061 + err = logged_inode_size(log, inode, path,
  4062 + &logged_isize);
  4063 + if (err)
  4064 + goto out_unlock;
  4065 + }
4004 4066 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4005 4067 &BTRFS_I(inode)->runtime_flags)) {
4006 4068 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
... ... @@ -4056,7 +4118,8 @@
4056 4118 }
4057 4119  
4058 4120 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4059   - ins_start_slot, ins_nr, inode_only);
  4121 + ins_start_slot, ins_nr, inode_only,
  4122 + logged_isize);
4060 4123 if (ret < 0) {
4061 4124 err = ret;
4062 4125 goto out_unlock;
... ... @@ -4080,7 +4143,7 @@
4080 4143 if (ins_nr) {
4081 4144 ret = copy_items(trans, inode, dst_path, path,
4082 4145 &last_extent, ins_start_slot,
4083   - ins_nr, inode_only);
  4146 + ins_nr, inode_only, logged_isize);
4084 4147 if (ret < 0) {
4085 4148 err = ret;
4086 4149 goto out_unlock;
... ... @@ -4101,7 +4164,8 @@
4101 4164 }
4102 4165 if (ins_nr) {
4103 4166 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4104   - ins_start_slot, ins_nr, inode_only);
  4167 + ins_start_slot, ins_nr, inode_only,
  4168 + logged_isize);
4105 4169 if (ret < 0) {
4106 4170 err = ret;
4107 4171 goto out_unlock;