Commit 7bb9e4a06e12583f1418b669dc45bb3ee84496c6
Committed by
Greg Kroah-Hartman
1 parent
32a7d7c223
nilfs2: fix deadlock of segment constructor over I_SYNC flag
commit 7ef3ff2fea8bf5e4a21cef47ad87710a3d0fdb52 upstream. Nilfs2 eventually hangs in a stress test with fsstress program. This issue was caused by the following deadlock over I_SYNC flag between nilfs_segctor_thread() and writeback_sb_inodes(): nilfs_segctor_thread() nilfs_segctor_thread_construct() nilfs_segctor_unlock() nilfs_dispose_list() iput() iput_final() evict() inode_wait_for_writeback() * wait for I_SYNC flag writeback_sb_inodes() * set I_SYNC flag on inode->i_state __writeback_single_inode() do_writepages() nilfs_writepages() nilfs_construct_dsync_segment() nilfs_segctor_sync() * wait for completion of segment constructor inode_sync_complete() * clear I_SYNC flag after __writeback_single_inode() completed writeback_sb_inodes() calls do_writepages() for dirty inodes after setting I_SYNC flag on inode->i_state. do_writepages() in turn calls nilfs_writepages(), which can run segment constructor and wait for its completion. On the other hand, segment constructor calls iput(), which can call evict() and wait for the I_SYNC flag on inode_wait_for_writeback(). Since segment constructor doesn't know when I_SYNC will be set, it cannot know whether iput() will block or not unless inode->i_nlink has a non-zero count. We can prevent evict() from being called in iput() by implementing sop->drop_inode(), but it's not preferable to leave inodes with i_nlink == 0 for long periods because it even defers file truncation and inode deallocation. So, this instead resolves the deadlock by calling iput() asynchronously with a workqueue for inodes with i_nlink == 0. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: Al Viro <viro@zeniv.linux.org.uk> Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Showing 3 changed files with 44 additions and 7 deletions Side-by-side Diff
fs/nilfs2/nilfs.h
... | ... | @@ -141,7 +141,6 @@ |
141 | 141 | * @ti_save: Backup of journal_info field of task_struct |
142 | 142 | * @ti_flags: Flags |
143 | 143 | * @ti_count: Nest level |
144 | - * @ti_garbage: List of inode to be put when releasing semaphore | |
145 | 144 | */ |
146 | 145 | struct nilfs_transaction_info { |
147 | 146 | u32 ti_magic; |
... | ... | @@ -150,7 +149,6 @@ |
150 | 149 | one of other filesystems has a bug. */ |
151 | 150 | unsigned short ti_flags; |
152 | 151 | unsigned short ti_count; |
153 | - struct list_head ti_garbage; | |
154 | 152 | }; |
155 | 153 | |
156 | 154 | /* ti_magic */ |
fs/nilfs2/segment.c
... | ... | @@ -305,7 +305,6 @@ |
305 | 305 | ti->ti_count = 0; |
306 | 306 | ti->ti_save = cur_ti; |
307 | 307 | ti->ti_magic = NILFS_TI_MAGIC; |
308 | - INIT_LIST_HEAD(&ti->ti_garbage); | |
309 | 308 | current->journal_info = ti; |
310 | 309 | |
311 | 310 | for (;;) { |
... | ... | @@ -332,8 +331,6 @@ |
332 | 331 | |
333 | 332 | up_write(&nilfs->ns_segctor_sem); |
334 | 333 | current->journal_info = ti->ti_save; |
335 | - if (!list_empty(&ti->ti_garbage)) | |
336 | - nilfs_dispose_list(nilfs, &ti->ti_garbage, 0); | |
337 | 334 | } |
338 | 335 | |
339 | 336 | static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, |
... | ... | @@ -746,6 +743,15 @@ |
746 | 743 | } |
747 | 744 | } |
748 | 745 | |
746 | +static void nilfs_iput_work_func(struct work_struct *work) | |
747 | +{ | |
748 | + struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info, | |
749 | + sc_iput_work); | |
750 | + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; | |
751 | + | |
752 | + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0); | |
753 | +} | |
754 | + | |
749 | 755 | static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, |
750 | 756 | struct nilfs_root *root) |
751 | 757 | { |
752 | 758 | |
... | ... | @@ -1900,8 +1906,8 @@ |
1900 | 1906 | static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, |
1901 | 1907 | struct the_nilfs *nilfs) |
1902 | 1908 | { |
1903 | - struct nilfs_transaction_info *ti = current->journal_info; | |
1904 | 1909 | struct nilfs_inode_info *ii, *n; |
1910 | + int defer_iput = false; | |
1905 | 1911 | |
1906 | 1912 | spin_lock(&nilfs->ns_inode_lock); |
1907 | 1913 | list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { |
1908 | 1914 | |
... | ... | @@ -1912,9 +1918,24 @@ |
1912 | 1918 | clear_bit(NILFS_I_BUSY, &ii->i_state); |
1913 | 1919 | brelse(ii->i_bh); |
1914 | 1920 | ii->i_bh = NULL; |
1915 | - list_move_tail(&ii->i_dirty, &ti->ti_garbage); | |
1921 | + list_del_init(&ii->i_dirty); | |
1922 | + if (!ii->vfs_inode.i_nlink) { | |
1923 | + /* | |
1924 | + * Defer calling iput() to avoid a deadlock | |
1925 | + * over I_SYNC flag for inodes with i_nlink == 0 | |
1926 | + */ | |
1927 | + list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); | |
1928 | + defer_iput = true; | |
1929 | + } else { | |
1930 | + spin_unlock(&nilfs->ns_inode_lock); | |
1931 | + iput(&ii->vfs_inode); | |
1932 | + spin_lock(&nilfs->ns_inode_lock); | |
1933 | + } | |
1916 | 1934 | } |
1917 | 1935 | spin_unlock(&nilfs->ns_inode_lock); |
1936 | + | |
1937 | + if (defer_iput) | |
1938 | + schedule_work(&sci->sc_iput_work); | |
1918 | 1939 | } |
1919 | 1940 | |
1920 | 1941 | /* |
... | ... | @@ -2583,6 +2604,8 @@ |
2583 | 2604 | INIT_LIST_HEAD(&sci->sc_segbufs); |
2584 | 2605 | INIT_LIST_HEAD(&sci->sc_write_logs); |
2585 | 2606 | INIT_LIST_HEAD(&sci->sc_gc_inodes); |
2607 | + INIT_LIST_HEAD(&sci->sc_iput_queue); | |
2608 | + INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); | |
2586 | 2609 | init_timer(&sci->sc_timer); |
2587 | 2610 | |
2588 | 2611 | sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; |
... | ... | @@ -2609,6 +2632,8 @@ |
2609 | 2632 | ret = nilfs_segctor_construct(sci, SC_LSEG_SR); |
2610 | 2633 | nilfs_transaction_unlock(sci->sc_super); |
2611 | 2634 | |
2635 | + flush_work(&sci->sc_iput_work); | |
2636 | + | |
2612 | 2637 | } while (ret && retrycount-- > 0); |
2613 | 2638 | } |
2614 | 2639 | |
... | ... | @@ -2633,6 +2658,9 @@ |
2633 | 2658 | || sci->sc_seq_request != sci->sc_seq_done); |
2634 | 2659 | spin_unlock(&sci->sc_state_lock); |
2635 | 2660 | |
2661 | + if (flush_work(&sci->sc_iput_work)) | |
2662 | + flag = true; | |
2663 | + | |
2636 | 2664 | if (flag || !nilfs_segctor_confirm(sci)) |
2637 | 2665 | nilfs_segctor_write_out(sci); |
2638 | 2666 | |
... | ... | @@ -2640,6 +2668,12 @@ |
2640 | 2668 | nilfs_warning(sci->sc_super, __func__, |
2641 | 2669 | "dirty file(s) after the final construction\n"); |
2642 | 2670 | nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); |
2671 | + } | |
2672 | + | |
2673 | + if (!list_empty(&sci->sc_iput_queue)) { | |
2674 | + nilfs_warning(sci->sc_super, __func__, | |
2675 | + "iput queue is not empty\n"); | |
2676 | + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); | |
2643 | 2677 | } |
2644 | 2678 | |
2645 | 2679 | WARN_ON(!list_empty(&sci->sc_segbufs)); |
fs/nilfs2/segment.h
... | ... | @@ -26,6 +26,7 @@ |
26 | 26 | #include <linux/types.h> |
27 | 27 | #include <linux/fs.h> |
28 | 28 | #include <linux/buffer_head.h> |
29 | +#include <linux/workqueue.h> | |
29 | 30 | #include <linux/nilfs2_fs.h> |
30 | 31 | #include "nilfs.h" |
31 | 32 | |
... | ... | @@ -92,6 +93,8 @@ |
92 | 93 | * @sc_nblk_inc: Block count of current generation |
93 | 94 | * @sc_dirty_files: List of files to be written |
94 | 95 | * @sc_gc_inodes: List of GC inodes having blocks to be written |
96 | + * @sc_iput_queue: list of inodes for which iput should be done | |
97 | + * @sc_iput_work: work struct to defer iput call | |
95 | 98 | * @sc_freesegs: array of segment numbers to be freed |
96 | 99 | * @sc_nfreesegs: number of segments on @sc_freesegs |
97 | 100 | * @sc_dsync_inode: inode whose data pages are written for a sync operation |
... | ... | @@ -135,6 +138,8 @@ |
135 | 138 | |
136 | 139 | struct list_head sc_dirty_files; |
137 | 140 | struct list_head sc_gc_inodes; |
141 | + struct list_head sc_iput_queue; | |
142 | + struct work_struct sc_iput_work; | |
138 | 143 | |
139 | 144 | __u64 *sc_freesegs; |
140 | 145 | size_t sc_nfreesegs; |