Commit 7ef3ff2fea8bf5e4a21cef47ad87710a3d0fdb52

Authored by Ryusuke Konishi
Committed by Linus Torvalds
1 parent 81cca6fb12

nilfs2: fix deadlock of segment constructor over I_SYNC flag

Nilfs2 eventually hangs in a stress test with fsstress program.  This
issue was caused by the following deadlock over I_SYNC flag between
nilfs_segctor_thread() and writeback_sb_inodes():

  nilfs_segctor_thread()
    nilfs_segctor_thread_construct()
      nilfs_segctor_unlock()
        nilfs_dispose_list()
          iput()
            iput_final()
              evict()
                inode_wait_for_writeback()  * wait for I_SYNC flag

  writeback_sb_inodes()
     * set I_SYNC flag on inode->i_state
    __writeback_single_inode()
      do_writepages()
        nilfs_writepages()
          nilfs_construct_dsync_segment()
            nilfs_segctor_sync()
               * wait for completion of segment constructor
    inode_sync_complete()
       * clear I_SYNC flag after __writeback_single_inode() completed

writeback_sb_inodes() calls do_writepages() for dirty inodes after
setting I_SYNC flag on inode->i_state.  do_writepages() in turn calls
nilfs_writepages(), which can run segment constructor and wait for its
completion.  On the other hand, segment constructor calls iput(), which
can call evict() and wait for the I_SYNC flag on
inode_wait_for_writeback().

Since segment constructor doesn't know when I_SYNC will be set, it
cannot know whether iput() will block or not unless inode->i_nlink has a
non-zero count.  We can prevent evict() from being called in iput() by
implementing sop->drop_inode(), but it's not preferable to leave inodes
with i_nlink == 0 for long periods because it even defers file
truncation and inode deallocation.  So, this instead resolves the
deadlock by calling iput() asynchronously with a workqueue for inodes
with i_nlink == 0.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 44 additions and 7 deletions Side-by-side Diff

... ... @@ -141,7 +141,6 @@
141 141 * @ti_save: Backup of journal_info field of task_struct
142 142 * @ti_flags: Flags
143 143 * @ti_count: Nest level
144   - * @ti_garbage: List of inode to be put when releasing semaphore
145 144 */
146 145 struct nilfs_transaction_info {
147 146 u32 ti_magic;
... ... @@ -150,7 +149,6 @@
150 149 one of other filesystems has a bug. */
151 150 unsigned short ti_flags;
152 151 unsigned short ti_count;
153   - struct list_head ti_garbage;
154 152 };
155 153  
156 154 /* ti_magic */
... ... @@ -305,7 +305,6 @@
305 305 ti->ti_count = 0;
306 306 ti->ti_save = cur_ti;
307 307 ti->ti_magic = NILFS_TI_MAGIC;
308   - INIT_LIST_HEAD(&ti->ti_garbage);
309 308 current->journal_info = ti;
310 309  
311 310 for (;;) {
... ... @@ -332,8 +331,6 @@
332 331  
333 332 up_write(&nilfs->ns_segctor_sem);
334 333 current->journal_info = ti->ti_save;
335   - if (!list_empty(&ti->ti_garbage))
336   - nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
337 334 }
338 335  
339 336 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
... ... @@ -746,6 +743,15 @@
746 743 }
747 744 }
748 745  
  746 +static void nilfs_iput_work_func(struct work_struct *work)
  747 +{
  748 + struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
  749 + sc_iput_work);
  750 + struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
  751 +
  752 + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
  753 +}
  754 +
749 755 static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
750 756 struct nilfs_root *root)
751 757 {
752 758  
... ... @@ -1900,8 +1906,8 @@
1900 1906 static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
1901 1907 struct the_nilfs *nilfs)
1902 1908 {
1903   - struct nilfs_transaction_info *ti = current->journal_info;
1904 1909 struct nilfs_inode_info *ii, *n;
  1910 + int defer_iput = false;
1905 1911  
1906 1912 spin_lock(&nilfs->ns_inode_lock);
1907 1913 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
1908 1914  
... ... @@ -1912,9 +1918,24 @@
1912 1918 clear_bit(NILFS_I_BUSY, &ii->i_state);
1913 1919 brelse(ii->i_bh);
1914 1920 ii->i_bh = NULL;
1915   - list_move_tail(&ii->i_dirty, &ti->ti_garbage);
  1921 + list_del_init(&ii->i_dirty);
  1922 + if (!ii->vfs_inode.i_nlink) {
  1923 + /*
  1924 + * Defer calling iput() to avoid a deadlock
  1925 + * over I_SYNC flag for inodes with i_nlink == 0
  1926 + */
  1927 + list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
  1928 + defer_iput = true;
  1929 + } else {
  1930 + spin_unlock(&nilfs->ns_inode_lock);
  1931 + iput(&ii->vfs_inode);
  1932 + spin_lock(&nilfs->ns_inode_lock);
  1933 + }
1916 1934 }
1917 1935 spin_unlock(&nilfs->ns_inode_lock);
  1936 +
  1937 + if (defer_iput)
  1938 + schedule_work(&sci->sc_iput_work);
1918 1939 }
1919 1940  
1920 1941 /*
... ... @@ -2583,6 +2604,8 @@
2583 2604 INIT_LIST_HEAD(&sci->sc_segbufs);
2584 2605 INIT_LIST_HEAD(&sci->sc_write_logs);
2585 2606 INIT_LIST_HEAD(&sci->sc_gc_inodes);
  2607 + INIT_LIST_HEAD(&sci->sc_iput_queue);
  2608 + INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
2586 2609 init_timer(&sci->sc_timer);
2587 2610  
2588 2611 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
... ... @@ -2609,6 +2632,8 @@
2609 2632 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2610 2633 nilfs_transaction_unlock(sci->sc_super);
2611 2634  
  2635 + flush_work(&sci->sc_iput_work);
  2636 +
2612 2637 } while (ret && retrycount-- > 0);
2613 2638 }
2614 2639  
... ... @@ -2633,6 +2658,9 @@
2633 2658 || sci->sc_seq_request != sci->sc_seq_done);
2634 2659 spin_unlock(&sci->sc_state_lock);
2635 2660  
  2661 + if (flush_work(&sci->sc_iput_work))
  2662 + flag = true;
  2663 +
2636 2664 if (flag || !nilfs_segctor_confirm(sci))
2637 2665 nilfs_segctor_write_out(sci);
2638 2666  
... ... @@ -2640,6 +2668,12 @@
2640 2668 nilfs_warning(sci->sc_super, __func__,
2641 2669 "dirty file(s) after the final construction\n");
2642 2670 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
  2671 + }
  2672 +
  2673 + if (!list_empty(&sci->sc_iput_queue)) {
  2674 + nilfs_warning(sci->sc_super, __func__,
  2675 + "iput queue is not empty\n");
  2676 + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
2643 2677 }
2644 2678  
2645 2679 WARN_ON(!list_empty(&sci->sc_segbufs));
... ... @@ -26,6 +26,7 @@
26 26 #include <linux/types.h>
27 27 #include <linux/fs.h>
28 28 #include <linux/buffer_head.h>
  29 +#include <linux/workqueue.h>
29 30 #include <linux/nilfs2_fs.h>
30 31 #include "nilfs.h"
31 32  
... ... @@ -92,6 +93,8 @@
92 93 * @sc_nblk_inc: Block count of current generation
93 94 * @sc_dirty_files: List of files to be written
94 95 * @sc_gc_inodes: List of GC inodes having blocks to be written
  96 + * @sc_iput_queue: list of inodes for which iput should be done
  97 + * @sc_iput_work: work struct to defer iput call
95 98 * @sc_freesegs: array of segment numbers to be freed
96 99 * @sc_nfreesegs: number of segments on @sc_freesegs
97 100 * @sc_dsync_inode: inode whose data pages are written for a sync operation
... ... @@ -135,6 +138,8 @@
135 138  
136 139 struct list_head sc_dirty_files;
137 140 struct list_head sc_gc_inodes;
  141 + struct list_head sc_iput_queue;
  142 + struct work_struct sc_iput_work;
138 143  
139 144 __u64 *sc_freesegs;
140 145 size_t sc_nfreesegs;