Commit 28a535f9a0df060569dcc786e5bc2e1de43d7dc7

Authored by Dmitry Monakhov
Committed by Theodore Ts'o
1 parent 82e5422911

ext4: completed_io locking cleanup

The current unwritten extent conversion state machine is very fuzzy.
- For some unknown reason it performs the conversion under i_mutex. What for?
  My diagnosis:
  We already protect the extent tree with i_data_sem, and truncate and
  punch_hole should wait for DIO, so the only data we have to protect is the
  end_io->flag modification. Only flush_completed_IO and end_io_work modify
  these flags, and we can serialize them via i_completed_io_lock.

  Currently, all these games with mutex_trylock result in the following deadlock:
   truncate:                          kworker:
    ext4_setattr                       ext4_end_io_work
    mutex_lock(i_mutex)
    inode_dio_wait(inode)  ->BLOCK
                             DEADLOCK<- mutex_trylock()
                                        inode_dio_done()
  #TEST_CASE1_BEGIN
  MNT=/mnt_scrach
  unlink $MNT/file
  fallocate -l $((1024*1024*1024)) $MNT/file
  aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
  sleep 2
  truncate -s 0 $MNT/file
  #TEST_CASE1_END

Or use xfstests test 286: https://github.com/dmonakhov/xfstests/blob/devel/286

This patch makes the state machine simple and clean:

(1) xxx_end_io schedules the final extent conversion simply by calling
    ext4_add_complete_io(), which appends the end_io to
    ei->i_completed_io_list.
    NOTE1: because of (2A), work should be queued only if
    ->i_completed_io_list was empty; otherwise the work is already scheduled.

(2) ext4_flush_completed_IO is responsible for handling all pending
    end_io entries from ei->i_completed_io_list.
    The flushing sequence consists of the following stages:
    A) LOCKED: Atomically drain completed_io_list to local_list
    B) Perform extent conversion
    C) LOCKED: Move converted io's to the to_free list for final deletion.
       This logic depends on the context we were called from.
    D) Final end_io context destruction
    NOTE2: i_mutex is no longer required because end_io->flag modification
    is protected by ei->i_completed_io_lock

Full list of changes:
- Move all end_io completion related routines to page-io.c in order to
  improve logic locality
- Move open-coded logic from various xx_end_xx routines to ext4_add_complete_io()
- Remove EXT4_IO_END_IN_FSYNC
- Improve SMP scalability by removing the useless i_mutex, which does not
  protect io->flag anymore.
- Reduce lock contention on i_completed_io_lock by optimizing the list walk.
- Rename ext4_end_io_nolock to ext4_end_io and make it static
- Check the flush completion status in ext4_ext_punch_hole(), because it is
  not a good idea to punch blocks from a corrupted inode.

Changes since V3 (in response to Jan's comments):
  Fall back to the active flush_completed_IO() approach in order to prevent
  performance issues with nolock DIO reads.
Changes since V2:
  Fix a use-after-free caused by a race between truncate and end_io_work

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Showing 6 changed files with 121 additions and 169 deletions Side-by-side Diff

... ... @@ -186,7 +186,6 @@
186 186 #define EXT4_IO_END_ERROR 0x0002
187 187 #define EXT4_IO_END_QUEUED 0x0004
188 188 #define EXT4_IO_END_DIRECT 0x0008
189   -#define EXT4_IO_END_IN_FSYNC 0x0010
190 189  
191 190 struct ext4_io_page {
192 191 struct page *p_page;
193 192  
... ... @@ -2418,11 +2417,11 @@
2418 2417  
2419 2418 /* page-io.c */
2420 2419 extern int __init ext4_init_pageio(void);
  2420 +extern void ext4_add_complete_io(ext4_io_end_t *io_end);
2421 2421 extern void ext4_exit_pageio(void);
2422 2422 extern void ext4_ioend_wait(struct inode *);
2423 2423 extern void ext4_free_io_end(ext4_io_end_t *io);
2424 2424 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2425   -extern int ext4_end_io_nolock(ext4_io_end_t *io);
2426 2425 extern void ext4_io_submit(struct ext4_io_submit *io);
2427 2426 extern int ext4_bio_write_page(struct ext4_io_submit *io,
2428 2427 struct page *page,
... ... @@ -4833,7 +4833,9 @@
4833 4833 }
4834 4834  
4835 4835 /* finish any pending end_io work */
4836   - ext4_flush_completed_IO(inode);
  4836 + err = ext4_flush_completed_IO(inode);
  4837 + if (err)
  4838 + return err;
4837 4839  
4838 4840 credits = ext4_writepage_trans_blocks(inode);
4839 4841 handle = ext4_journal_start(inode, credits);
... ... @@ -34,87 +34,6 @@
34 34  
35 35 #include <trace/events/ext4.h>
36 36  
37   -static void dump_completed_IO(struct inode * inode)
38   -{
39   -#ifdef EXT4FS_DEBUG
40   - struct list_head *cur, *before, *after;
41   - ext4_io_end_t *io, *io0, *io1;
42   - unsigned long flags;
43   -
44   - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
45   - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
46   - return;
47   - }
48   -
49   - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
50   - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
51   - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
52   - cur = &io->list;
53   - before = cur->prev;
54   - io0 = container_of(before, ext4_io_end_t, list);
55   - after = cur->next;
56   - io1 = container_of(after, ext4_io_end_t, list);
57   -
58   - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
59   - io, inode->i_ino, io0, io1);
60   - }
61   - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
62   -#endif
63   -}
64   -
65   -/*
66   - * This function is called from ext4_sync_file().
67   - *
68   - * When IO is completed, the work to convert unwritten extents to
69   - * written is queued on workqueue but may not get immediately
70   - * scheduled. When fsync is called, we need to ensure the
71   - * conversion is complete before fsync returns.
72   - * The inode keeps track of a list of pending/completed IO that
73   - * might needs to do the conversion. This function walks through
74   - * the list and convert the related unwritten extents for completed IO
75   - * to written.
76   - * The function return the number of pending IOs on success.
77   - */
78   -int ext4_flush_completed_IO(struct inode *inode)
79   -{
80   - ext4_io_end_t *io;
81   - struct ext4_inode_info *ei = EXT4_I(inode);
82   - unsigned long flags;
83   - int ret = 0;
84   - int ret2 = 0;
85   -
86   - dump_completed_IO(inode);
87   - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
88   - while (!list_empty(&ei->i_completed_io_list)){
89   - io = list_entry(ei->i_completed_io_list.next,
90   - ext4_io_end_t, list);
91   - list_del_init(&io->list);
92   - io->flag |= EXT4_IO_END_IN_FSYNC;
93   - /*
94   - * Calling ext4_end_io_nolock() to convert completed
95   - * IO to written.
96   - *
97   - * When ext4_sync_file() is called, run_queue() may already
98   - * about to flush the work corresponding to this io structure.
99   - * It will be upset if it founds the io structure related
100   - * to the work-to-be schedule is freed.
101   - *
102   - * Thus we need to keep the io structure still valid here after
103   - * conversion finished. The io structure has a flag to
104   - * avoid double converting from both fsync and background work
105   - * queue work.
106   - */
107   - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
108   - ret = ext4_end_io_nolock(io);
109   - if (ret < 0)
110   - ret2 = ret;
111   - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
112   - io->flag &= ~EXT4_IO_END_IN_FSYNC;
113   - }
114   - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
115   - return (ret2 < 0) ? ret2 : 0;
116   -}
117   -
118 37 /*
119 38 * If we're not journaling and this is a just-created file, we have to
120 39 * sync our parent directory (if it was freshly created) since
... ... @@ -807,11 +807,9 @@
807 807  
808 808 retry:
809 809 if (rw == READ && ext4_should_dioread_nolock(inode)) {
810   - if (unlikely(!list_empty(&ei->i_completed_io_list))) {
811   - mutex_lock(&inode->i_mutex);
  810 + if (unlikely(!list_empty(&ei->i_completed_io_list)))
812 811 ext4_flush_completed_IO(inode);
813   - mutex_unlock(&inode->i_mutex);
814   - }
  812 +
815 813 ret = __blockdev_direct_IO(rw, iocb, inode,
816 814 inode->i_sb->s_bdev, iov,
817 815 offset, nr_segs,
... ... @@ -2881,9 +2881,6 @@
2881 2881 {
2882 2882 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
2883 2883 ext4_io_end_t *io_end = iocb->private;
2884   - struct workqueue_struct *wq;
2885   - unsigned long flags;
2886   - struct ext4_inode_info *ei;
2887 2884  
2888 2885 /* if not async direct IO or dio with 0 bytes write, just return */
2889 2886 if (!io_end || !size)
2890 2887  
2891 2888  
2892 2889  
... ... @@ -2912,24 +2909,14 @@
2912 2909 io_end->iocb = iocb;
2913 2910 io_end->result = ret;
2914 2911 }
2915   - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
2916 2912  
2917   - /* Add the io_end to per-inode completed aio dio list*/
2918   - ei = EXT4_I(io_end->inode);
2919   - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
2920   - list_add_tail(&io_end->list, &ei->i_completed_io_list);
2921   - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2922   -
2923   - /* queue the work to convert unwritten extents to written */
2924   - queue_work(wq, &io_end->work);
  2913 + ext4_add_complete_io(io_end);
2925 2914 }
2926 2915  
2927 2916 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2928 2917 {
2929 2918 ext4_io_end_t *io_end = bh->b_private;
2930   - struct workqueue_struct *wq;
2931 2919 struct inode *inode;
2932   - unsigned long flags;
2933 2920  
2934 2921 if (!test_clear_buffer_uninit(bh) || !io_end)
2935 2922 goto out;
... ... @@ -2948,15 +2935,7 @@
2948 2935 */
2949 2936 inode = io_end->inode;
2950 2937 ext4_set_io_unwritten_flag(inode, io_end);
2951   -
2952   - /* Add the io_end to per-inode completed io list*/
2953   - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
2954   - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
2955   - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
2956   -
2957   - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
2958   - /* queue the work to convert unwritten extents to written */
2959   - queue_work(wq, &io_end->work);
  2938 + ext4_add_complete_io(io_end);
2960 2939 out:
2961 2940 bh->b_private = NULL;
2962 2941 bh->b_end_io = NULL;
... ... @@ -71,6 +71,7 @@
71 71 int i;
72 72  
73 73 BUG_ON(!io);
  74 + BUG_ON(!list_empty(&io->list));
74 75 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
75 76  
76 77 if (io->page)
77 78  
... ... @@ -83,21 +84,14 @@
83 84 kmem_cache_free(io_end_cachep, io);
84 85 }
85 86  
86   -/*
87   - * check a range of space and convert unwritten extents to written.
88   - *
89   - * Called with inode->i_mutex; we depend on this when we manipulate
90   - * io->flag, since we could otherwise race with ext4_flush_completed_IO()
91   - */
92   -int ext4_end_io_nolock(ext4_io_end_t *io)
  87 +/* check a range of space and convert unwritten extents to written. */
  88 +static int ext4_end_io(ext4_io_end_t *io)
93 89 {
94 90 struct inode *inode = io->inode;
95 91 loff_t offset = io->offset;
96 92 ssize_t size = io->size;
97 93 int ret = 0;
98 94  
99   - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
100   -
101 95 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
102 96 "list->prev 0x%p\n",
103 97 io, inode->i_ino, io->list.next, io->list.prev);
... ... @@ -110,7 +104,6 @@
110 104 "(inode %lu, offset %llu, size %zd, error %d)",
111 105 inode->i_ino, offset, size, ret);
112 106 }
113   - io->flag &= ~EXT4_IO_END_UNWRITTEN;
114 107 if (io->iocb)
115 108 aio_complete(io->iocb, io->result, 0);
116 109  
117 110  
118 111  
119 112  
120 113  
121 114  
122 115  
123 116  
124 117  
... ... @@ -122,53 +115,124 @@
122 115 return ret;
123 116 }
124 117  
125   -/*
126   - * work on completed aio dio IO, to convert unwritten extents to extents
127   - */
128   -static void ext4_end_io_work(struct work_struct *work)
  118 +static void dump_completed_IO(struct inode *inode)
129 119 {
130   - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
131   - struct inode *inode = io->inode;
132   - struct ext4_inode_info *ei = EXT4_I(inode);
133   - unsigned long flags;
  120 +#ifdef EXT4FS_DEBUG
  121 + struct list_head *cur, *before, *after;
  122 + ext4_io_end_t *io, *io0, *io1;
  123 + unsigned long flags;
134 124  
  125 + if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
  126 + ext4_debug("inode %lu completed_io list is empty\n",
  127 + inode->i_ino);
  128 + return;
  129 + }
  130 +
  131 + ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
  132 + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
  133 + cur = &io->list;
  134 + before = cur->prev;
  135 + io0 = container_of(before, ext4_io_end_t, list);
  136 + after = cur->next;
  137 + io1 = container_of(after, ext4_io_end_t, list);
  138 +
  139 + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
  140 + io, inode->i_ino, io0, io1);
  141 + }
  142 +#endif
  143 +}
  144 +
  145 +/* Add the io_end to per-inode completed end_io list. */
  146 +void ext4_add_complete_io(ext4_io_end_t *io_end)
  147 +{
  148 + struct ext4_inode_info *ei = EXT4_I(io_end->inode);
  149 + struct workqueue_struct *wq;
  150 + unsigned long flags;
  151 +
  152 + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
  153 + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
  154 +
135 155 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
136   - if (io->flag & EXT4_IO_END_IN_FSYNC)
137   - goto requeue;
138   - if (list_empty(&io->list)) {
139   - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
140   - goto free;
  156 + if (list_empty(&ei->i_completed_io_list)) {
  157 + io_end->flag |= EXT4_IO_END_QUEUED;
  158 + queue_work(wq, &io_end->work);
141 159 }
  160 + list_add_tail(&io_end->list, &ei->i_completed_io_list);
  161 + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
  162 +}
142 163  
143   - if (!mutex_trylock(&inode->i_mutex)) {
144   - bool was_queued;
145   -requeue:
146   - was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
147   - io->flag |= EXT4_IO_END_QUEUED;
148   - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
149   - /*
150   - * Requeue the work instead of waiting so that the work
151   - * items queued after this can be processed.
152   - */
153   - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
154   - /*
155   - * To prevent the ext4-dio-unwritten thread from keeping
156   - * requeueing end_io requests and occupying cpu for too long,
157   - * yield the cpu if it sees an end_io request that has already
158   - * been requeued.
159   - */
160   - if (was_queued)
161   - yield();
162   - return;
  164 +static int ext4_do_flush_completed_IO(struct inode *inode,
  165 + ext4_io_end_t *work_io)
  166 +{
  167 + ext4_io_end_t *io;
  168 + struct list_head unwritten, complete, to_free;
  169 + unsigned long flags;
  170 + struct ext4_inode_info *ei = EXT4_I(inode);
  171 + int err, ret = 0;
  172 +
  173 + INIT_LIST_HEAD(&complete);
  174 + INIT_LIST_HEAD(&to_free);
  175 +
  176 + spin_lock_irqsave(&ei->i_completed_io_lock, flags);
  177 + dump_completed_IO(inode);
  178 + list_replace_init(&ei->i_completed_io_list, &unwritten);
  179 + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
  180 +
  181 + while (!list_empty(&unwritten)) {
  182 + io = list_entry(unwritten.next, ext4_io_end_t, list);
  183 + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
  184 + list_del_init(&io->list);
  185 +
  186 + err = ext4_end_io(io);
  187 + if (unlikely(!ret && err))
  188 + ret = err;
  189 +
  190 + list_add_tail(&io->list, &complete);
163 191 }
164   - list_del_init(&io->list);
  192 + /* It is important to update all flags for all end_io in one shot w/o
  193 + * dropping the lock.*/
  194 + spin_lock_irqsave(&ei->i_completed_io_lock, flags);
  195 + while (!list_empty(&complete)) {
  196 + io = list_entry(complete.next, ext4_io_end_t, list);
  197 + io->flag &= ~EXT4_IO_END_UNWRITTEN;
  198 + /* end_io context can not be destroyed now because it still
  199 + * used by queued worker. Worker thread will destroy it later */
  200 + if (io->flag & EXT4_IO_END_QUEUED)
  201 + list_del_init(&io->list);
  202 + else
  203 + list_move(&io->list, &to_free);
  204 + }
  205 + /* If we are called from worker context, it is time to clear queued
  206 + * flag, and destroy it's end_io if it was converted already */
  207 + if (work_io) {
  208 + work_io->flag &= ~EXT4_IO_END_QUEUED;
  209 + if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
  210 + list_add_tail(&work_io->list, &to_free);
  211 + }
165 212 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
166   - (void) ext4_end_io_nolock(io);
167   - mutex_unlock(&inode->i_mutex);
168   -free:
169   - ext4_free_io_end(io);
  213 +
  214 + while (!list_empty(&to_free)) {
  215 + io = list_entry(to_free.next, ext4_io_end_t, list);
  216 + list_del_init(&io->list);
  217 + ext4_free_io_end(io);
  218 + }
  219 + return ret;
170 220 }
171 221  
  222 +/*
  223 + * work on completed aio dio IO, to convert unwritten extents to extents
  224 + */
  225 +static void ext4_end_io_work(struct work_struct *work)
  226 +{
  227 + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
  228 + ext4_do_flush_completed_IO(io->inode, io);
  229 +}
  230 +
  231 +int ext4_flush_completed_IO(struct inode *inode)
  232 +{
  233 + return ext4_do_flush_completed_IO(inode, NULL);
  234 +}
  235 +
172 236 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
173 237 {
174 238 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
175 239  
... ... @@ -199,9 +263,7 @@
199 263 static void ext4_end_bio(struct bio *bio, int error)
200 264 {
201 265 ext4_io_end_t *io_end = bio->bi_private;
202   - struct workqueue_struct *wq;
203 266 struct inode *inode;
204   - unsigned long flags;
205 267 int i;
206 268 sector_t bi_sector = bio->bi_sector;
207 269  
... ... @@ -259,14 +321,7 @@
259 321 return;
260 322 }
261 323  
262   - /* Add the io_end to per-inode completed io list*/
263   - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
264   - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
265   - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
266   -
267   - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
268   - /* queue the work to convert unwritten extents to written */
269   - queue_work(wq, &io_end->work);
  324 + ext4_add_complete_io(io_end);
270 325 }
271 326  
272 327 void ext4_io_submit(struct ext4_io_submit *io)