Commit 8d875f95da43c6a8f18f77869f2ef26e9594fecc

Authored by Chris Mason
1 parent 27b9a8122f

btrfs: disable strict file flushes for renames and truncates

Truncates and renames are often used to replace old versions of a file
with new versions.  Applications often expect this to be an atomic
replacement, even if they haven't done anything to make sure the new
version is fully on disk.

Btrfs has strict flushing in place to make sure that renaming over an
old file with a new file will fully flush out the new file before
allowing the transaction commit with the rename to complete.

This ordering means the commit code needs to be able to lock file pages,
and there are a few paths in the filesystem where we will try to end a
transaction with the page lock held.  It's rare, but these things can
deadlock.

This patch removes the ordered flushes and switches to a best-effort
filemap_flush(), as ext4 does. It's not perfect (the flush only starts
the IO, it does not wait for it), but it should fix the deadlocks.

Signed-off-by: Chris Mason <clm@fb.com>

Showing 8 changed files with 6 additions and 267 deletions Side-by-side Diff

fs/btrfs/btrfs_inode.h
... ... @@ -84,12 +84,6 @@
84 84 */
85 85 struct list_head delalloc_inodes;
86 86  
87   - /*
88   - * list for tracking inodes that must be sent to disk before a
89   - * rename or truncate commit
90   - */
91   - struct list_head ordered_operations;
92   -
93 87 /* node for the red-black tree that links inodes in subvolume root */
94 88 struct rb_node rb_node;
95 89  
fs/btrfs/disk-io.c
... ... @@ -60,8 +60,6 @@
60 60 static void free_fs_root(struct btrfs_root *root);
61 61 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
62 62 int read_only);
63   -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
64   - struct btrfs_root *root);
65 63 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
66 64 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
67 65 struct btrfs_root *root);
... ... @@ -3829,34 +3827,6 @@
3829 3827 btrfs_cleanup_transaction(root);
3830 3828 }
3831 3829  
3832   -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3833   - struct btrfs_root *root)
3834   -{
3835   - struct btrfs_inode *btrfs_inode;
3836   - struct list_head splice;
3837   -
3838   - INIT_LIST_HEAD(&splice);
3839   -
3840   - mutex_lock(&root->fs_info->ordered_operations_mutex);
3841   - spin_lock(&root->fs_info->ordered_root_lock);
3842   -
3843   - list_splice_init(&t->ordered_operations, &splice);
3844   - while (!list_empty(&splice)) {
3845   - btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3846   - ordered_operations);
3847   -
3848   - list_del_init(&btrfs_inode->ordered_operations);
3849   - spin_unlock(&root->fs_info->ordered_root_lock);
3850   -
3851   - btrfs_invalidate_inodes(btrfs_inode->root);
3852   -
3853   - spin_lock(&root->fs_info->ordered_root_lock);
3854   - }
3855   -
3856   - spin_unlock(&root->fs_info->ordered_root_lock);
3857   - mutex_unlock(&root->fs_info->ordered_operations_mutex);
3858   -}
3859   -
3860 3830 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3861 3831 {
3862 3832 struct btrfs_ordered_extent *ordered;
... ... @@ -4093,8 +4063,6 @@
4093 4063 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4094 4064 struct btrfs_root *root)
4095 4065 {
4096   - btrfs_destroy_ordered_operations(cur_trans, root);
4097   -
4098 4066 btrfs_destroy_delayed_refs(cur_trans, root);
4099 4067  
4100 4068 cur_trans->state = TRANS_STATE_COMMIT_START;
fs/btrfs/file.c
... ... @@ -1838,33 +1838,9 @@
1838 1838  
1839 1839 int btrfs_release_file(struct inode *inode, struct file *filp)
1840 1840 {
1841   - /*
1842   - * ordered_data_close is set by settattr when we are about to truncate
1843   - * a file from a non-zero size to a zero size. This tries to
1844   - * flush down new bytes that may have been written if the
1845   - * application were using truncate to replace a file in place.
1846   - */
1847   - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1848   - &BTRFS_I(inode)->runtime_flags)) {
1849   - struct btrfs_trans_handle *trans;
1850   - struct btrfs_root *root = BTRFS_I(inode)->root;
1851   -
1852   - /*
1853   - * We need to block on a committing transaction to keep us from
1854   - * throwing a ordered operation on to the list and causing
1855   - * something like sync to deadlock trying to flush out this
1856   - * inode.
1857   - */
1858   - trans = btrfs_start_transaction(root, 0);
1859   - if (IS_ERR(trans))
1860   - return PTR_ERR(trans);
1861   - btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1862   - btrfs_end_transaction(trans, root);
1863   - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1864   - filemap_flush(inode->i_mapping);
1865   - }
1866 1841 if (filp->private_data)
1867 1842 btrfs_ioctl_trans_end(filp);
  1843 + filemap_flush(inode->i_mapping);
1868 1844 return 0;
1869 1845 }
1870 1846  
fs/btrfs/inode.c
... ... @@ -7951,27 +7951,6 @@
7951 7951 BUG_ON(ret);
7952 7952  
7953 7953 /*
7954   - * setattr is responsible for setting the ordered_data_close flag,
7955   - * but that is only tested during the last file release. That
7956   - * could happen well after the next commit, leaving a great big
7957   - * window where new writes may get lost if someone chooses to write
7958   - * to this file after truncating to zero
7959   - *
7960   - * The inode doesn't have any dirty data here, and so if we commit
7961   - * this is a noop. If someone immediately starts writing to the inode
7962   - * it is very likely we'll catch some of their writes in this
7963   - * transaction, and the commit will find this file on the ordered
7964   - * data list with good things to send down.
7965   - *
7966   - * This is a best effort solution, there is still a window where
7967   - * using truncate to replace the contents of the file will
7968   - * end up with a zero length file after a crash.
7969   - */
7970   - if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7971   - &BTRFS_I(inode)->runtime_flags))
7972   - btrfs_add_ordered_operation(trans, root, inode);
7973   -
7974   - /*
7975 7954 * So if we truncate and then write and fsync we normally would just
7976 7955 * write the extents that changed, which is a problem if we need to
7977 7956 * first truncate that entire inode. So set this flag so we write out
... ... @@ -8118,7 +8097,6 @@
8118 8097 mutex_init(&ei->delalloc_mutex);
8119 8098 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8120 8099 INIT_LIST_HEAD(&ei->delalloc_inodes);
8121   - INIT_LIST_HEAD(&ei->ordered_operations);
8122 8100 RB_CLEAR_NODE(&ei->rb_node);
8123 8101  
8124 8102 return inode;
... ... @@ -8158,17 +8136,6 @@
8158 8136 if (!root)
8159 8137 goto free;
8160 8138  
8161   - /*
8162   - * Make sure we're properly removed from the ordered operation
8163   - * lists.
8164   - */
8165   - smp_mb();
8166   - if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8167   - spin_lock(&root->fs_info->ordered_root_lock);
8168   - list_del_init(&BTRFS_I(inode)->ordered_operations);
8169   - spin_unlock(&root->fs_info->ordered_root_lock);
8170   - }
8171   -
8172 8139 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8173 8140 &BTRFS_I(inode)->runtime_flags)) {
8174 8141 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
8175 8142  
... ... @@ -8350,12 +8317,10 @@
8350 8317 ret = 0;
8351 8318  
8352 8319 /*
8353   - * we're using rename to replace one file with another.
8354   - * and the replacement file is large. Start IO on it now so
8355   - * we don't add too much work to the end of the transaction
  8320 + * we're using rename to replace one file with another. Start IO on it
  8321 + * now so we don't add too much work to the end of the transaction
8356 8322 */
8357   - if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
8358   - old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
  8323 + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8359 8324 filemap_flush(old_inode->i_mapping);
8360 8325  
8361 8326 /* close the racy window with snapshot create/destroy ioctl */
... ... @@ -8403,12 +8368,6 @@
8403 8368 */
8404 8369 btrfs_pin_log_trans(root);
8405 8370 }
8406   - /*
8407   - * make sure the inode gets flushed if it is replacing
8408   - * something.
8409   - */
8410   - if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8411   - btrfs_add_ordered_operation(trans, root, old_inode);
8412 8371  
8413 8372 inode_inc_iversion(old_dir);
8414 8373 inode_inc_iversion(new_dir);
fs/btrfs/ordered-data.c
... ... @@ -571,18 +571,6 @@
571 571  
572 572 trace_btrfs_ordered_extent_remove(inode, entry);
573 573  
574   - /*
575   - * we have no more ordered extents for this inode and
576   - * no dirty pages. We can safely remove it from the
577   - * list of ordered extents
578   - */
579   - if (RB_EMPTY_ROOT(&tree->tree) &&
580   - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
581   - spin_lock(&root->fs_info->ordered_root_lock);
582   - list_del_init(&BTRFS_I(inode)->ordered_operations);
583   - spin_unlock(&root->fs_info->ordered_root_lock);
584   - }
585   -
586 574 if (!root->nr_ordered_extents) {
587 575 spin_lock(&root->fs_info->ordered_root_lock);
588 576 BUG_ON(list_empty(&root->ordered_root));
... ... @@ -687,81 +675,6 @@
687 675 }
688 676  
689 677 /*
690   - * this is used during transaction commit to write all the inodes
691   - * added to the ordered operation list. These files must be fully on
692   - * disk before the transaction commits.
693   - *
694   - * we have two modes here, one is to just start the IO via filemap_flush
695   - * and the other is to wait for all the io. When we wait, we have an
696   - * extra check to make sure the ordered operation list really is empty
697   - * before we return
698   - */
699   -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
700   - struct btrfs_root *root, int wait)
701   -{
702   - struct btrfs_inode *btrfs_inode;
703   - struct inode *inode;
704   - struct btrfs_transaction *cur_trans = trans->transaction;
705   - struct list_head splice;
706   - struct list_head works;
707   - struct btrfs_delalloc_work *work, *next;
708   - int ret = 0;
709   -
710   - INIT_LIST_HEAD(&splice);
711   - INIT_LIST_HEAD(&works);
712   -
713   - mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
714   - spin_lock(&root->fs_info->ordered_root_lock);
715   - list_splice_init(&cur_trans->ordered_operations, &splice);
716   - while (!list_empty(&splice)) {
717   - btrfs_inode = list_entry(splice.next, struct btrfs_inode,
718   - ordered_operations);
719   - inode = &btrfs_inode->vfs_inode;
720   -
721   - list_del_init(&btrfs_inode->ordered_operations);
722   -
723   - /*
724   - * the inode may be getting freed (in sys_unlink path).
725   - */
726   - inode = igrab(inode);
727   - if (!inode)
728   - continue;
729   -
730   - if (!wait)
731   - list_add_tail(&BTRFS_I(inode)->ordered_operations,
732   - &cur_trans->ordered_operations);
733   - spin_unlock(&root->fs_info->ordered_root_lock);
734   -
735   - work = btrfs_alloc_delalloc_work(inode, wait, 1);
736   - if (!work) {
737   - spin_lock(&root->fs_info->ordered_root_lock);
738   - if (list_empty(&BTRFS_I(inode)->ordered_operations))
739   - list_add_tail(&btrfs_inode->ordered_operations,
740   - &splice);
741   - list_splice_tail(&splice,
742   - &cur_trans->ordered_operations);
743   - spin_unlock(&root->fs_info->ordered_root_lock);
744   - ret = -ENOMEM;
745   - goto out;
746   - }
747   - list_add_tail(&work->list, &works);
748   - btrfs_queue_work(root->fs_info->flush_workers,
749   - &work->work);
750   -
751   - cond_resched();
752   - spin_lock(&root->fs_info->ordered_root_lock);
753   - }
754   - spin_unlock(&root->fs_info->ordered_root_lock);
755   -out:
756   - list_for_each_entry_safe(work, next, &works, list) {
757   - list_del_init(&work->list);
758   - btrfs_wait_and_free_delalloc_work(work);
759   - }
760   - mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
761   - return ret;
762   -}
763   -
764   -/*
765 678 * Used to start IO or wait for a given ordered extent to finish.
766 679 *
767 680 * If wait is one, this effectively waits on page writeback for all the pages
... ... @@ -1118,42 +1031,6 @@
1118 1031 spin_unlock_irq(&tree->lock);
1119 1032 btrfs_put_ordered_extent(ordered);
1120 1033 return index;
1121   -}
1122   -
1123   -
1124   -/*
1125   - * add a given inode to the list of inodes that must be fully on
1126   - * disk before a transaction commit finishes.
1127   - *
1128   - * This basically gives us the ext3 style data=ordered mode, and it is mostly
1129   - * used to make sure renamed files are fully on disk.
1130   - *
1131   - * It is a noop if the inode is already fully on disk.
1132   - *
1133   - * If trans is not null, we'll do a friendly check for a transaction that
1134   - * is already flushing things and force the IO down ourselves.
1135   - */
1136   -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1137   - struct btrfs_root *root, struct inode *inode)
1138   -{
1139   - struct btrfs_transaction *cur_trans = trans->transaction;
1140   - u64 last_mod;
1141   -
1142   - last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
1143   -
1144   - /*
1145   - * if this file hasn't been changed since the last transaction
1146   - * commit, we can safely return without doing anything
1147   - */
1148   - if (last_mod <= root->fs_info->last_trans_committed)
1149   - return;
1150   -
1151   - spin_lock(&root->fs_info->ordered_root_lock);
1152   - if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1153   - list_add_tail(&BTRFS_I(inode)->ordered_operations,
1154   - &cur_trans->ordered_operations);
1155   - }
1156   - spin_unlock(&root->fs_info->ordered_root_lock);
1157 1034 }
1158 1035  
1159 1036 int __init ordered_data_init(void)
fs/btrfs/ordered-data.h
... ... @@ -190,11 +190,6 @@
190 190 struct btrfs_ordered_extent *ordered);
191 191 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
192 192 u32 *sum, int len);
193   -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
194   - struct btrfs_root *root, int wait);
195   -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
196   - struct btrfs_root *root,
197   - struct inode *inode);
198 193 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199 194 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200 195 void btrfs_get_logged_extents(struct inode *inode,
fs/btrfs/transaction.c
... ... @@ -218,7 +218,6 @@
218 218 spin_lock_init(&cur_trans->delayed_refs.lock);
219 219  
220 220 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221   - INIT_LIST_HEAD(&cur_trans->ordered_operations);
222 221 INIT_LIST_HEAD(&cur_trans->pending_chunks);
223 222 INIT_LIST_HEAD(&cur_trans->switch_commits);
224 223 list_add_tail(&cur_trans->list, &fs_info->trans_list);
... ... @@ -1612,27 +1611,6 @@
1612 1611 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1613 1612 }
1614 1613  
1615   -static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1616   - struct btrfs_root *root)
1617   -{
1618   - int ret;
1619   -
1620   - ret = btrfs_run_delayed_items(trans, root);
1621   - if (ret)
1622   - return ret;
1623   -
1624   - /*
1625   - * rename don't use btrfs_join_transaction, so, once we
1626   - * set the transaction to blocked above, we aren't going
1627   - * to get any new ordered operations. We can safely run
1628   - * it here and no for sure that nothing new will be added
1629   - * to the list
1630   - */
1631   - ret = btrfs_run_ordered_operations(trans, root, 1);
1632   -
1633   - return ret;
1634   -}
1635   -
1636 1614 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1637 1615 {
1638 1616 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
... ... @@ -1653,13 +1631,6 @@
1653 1631 struct btrfs_transaction *prev_trans = NULL;
1654 1632 int ret;
1655 1633  
1656   - ret = btrfs_run_ordered_operations(trans, root, 0);
1657   - if (ret) {
1658   - btrfs_abort_transaction(trans, root, ret);
1659   - btrfs_end_transaction(trans, root);
1660   - return ret;
1661   - }
1662   -
1663 1634 /* Stop the commit early if ->aborted is set */
1664 1635 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1665 1636 ret = cur_trans->aborted;
... ... @@ -1740,7 +1711,7 @@
1740 1711 if (ret)
1741 1712 goto cleanup_transaction;
1742 1713  
1743   - ret = btrfs_flush_all_pending_stuffs(trans, root);
  1714 + ret = btrfs_run_delayed_items(trans, root);
1744 1715 if (ret)
1745 1716 goto cleanup_transaction;
1746 1717  
... ... @@ -1748,7 +1719,7 @@
1748 1719 extwriter_counter_read(cur_trans) == 0);
1749 1720  
1750 1721 /* some pending stuffs might be added after the previous flush. */
1751   - ret = btrfs_flush_all_pending_stuffs(trans, root);
  1722 + ret = btrfs_run_delayed_items(trans, root);
1752 1723 if (ret)
1753 1724 goto cleanup_transaction;
1754 1725  
fs/btrfs/transaction.h
... ... @@ -55,7 +55,6 @@
55 55 wait_queue_head_t writer_wait;
56 56 wait_queue_head_t commit_wait;
57 57 struct list_head pending_snapshots;
58   - struct list_head ordered_operations;
59 58 struct list_head pending_chunks;
60 59 struct list_head switch_commits;
61 60 struct btrfs_delayed_ref_root delayed_refs;