Commit 91dd8c114499e9818f2d5919ef0b9eee61810220

Authored by Lukas Czerner
Committed by Theodore Ts'o
1 parent f3b59291a6

ext4: prevent race while walking extent tree for fiemap

Currently ext4_ext_walk_space() only takes i_data_sem for read when
searching for the extent at given block with ext4_ext_find_extent().
Then it drops the lock and the extent tree can be changed at will.
However later on we're searching for the 'next' extent, but the extent
tree might already have changed, so the information might not be
accurate.

In fact we can hit BUG_ON(end <= start) if the extent got inserted into
the tree after the one we found and before the block we were searching
for. This has been reproduced by running xfstests 225 in loop on s390x
architecture, but theoretically we could hit this on any other
architecture as well, but probably not as often.

Moreover the extent currently in delayed allocation might be allocated
after we search the extent tree and before we search extent status tree
delayed buffers resulting in those delayed buffers being completely
missed, even though completely written and allocated.

We fix all those problems in several steps:

 1. remove unnecessary callback indirection
 2. rename functions
        ext4_ext_walk_space -> ext4_fill_fiemap_extents
        ext4_ext_fiemap_cb -> ext4_find_delayed_extent
 3. move fiemap_fill_next_extent() into ext4_fill_fiemap_extents()
 4. hold the i_data_sem for:
        ext4_ext_find_extent()
        ext4_ext_next_allocated_block()
        ext4_find_delayed_extent()
 5. call fiemap_fill_next_extent after releasing the i_data_sem
 6. move path reinitialization into the critical section.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Showing 2 changed files with 76 additions and 74 deletions Side-by-side Diff

fs/ext4/ext4_extents.h
... ... @@ -144,20 +144,6 @@
144 144 */
145 145  
146 146 /*
147   - * to be called by ext4_ext_walk_space()
148   - * negative retcode - error
149   - * positive retcode - signal for ext4_ext_walk_space(), see below
150   - * callback must return valid extent (passed or newly created)
151   - */
152   -typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
153   - struct ext4_ext_cache *,
154   - struct ext4_extent *, void *);
155   -
156   -#define EXT_CONTINUE 0
157   -#define EXT_BREAK 1
158   -#define EXT_REPEAT 2
159   -
160   -/*
161 147 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
162 148 * __le32.
163 149 */
... ... @@ -109,6 +109,9 @@
109 109 int split_flag,
110 110 int flags);
111 111  
  112 +static int ext4_find_delayed_extent(struct inode *inode,
  113 + struct ext4_ext_cache *newex);
  114 +
112 115 static int ext4_ext_truncate_extend_restart(handle_t *handle,
113 116 struct inode *inode,
114 117 int needed)
115 118  
116 119  
117 120  
118 121  
119 122  
120 123  
... ... @@ -1959,27 +1962,33 @@
1959 1962 return err;
1960 1963 }
1961 1964  
1962   -static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1963   - ext4_lblk_t num, ext_prepare_callback func,
1964   - void *cbdata)
  1965 +static int ext4_fill_fiemap_extents(struct inode *inode,
  1966 + ext4_lblk_t block, ext4_lblk_t num,
  1967 + struct fiemap_extent_info *fieinfo)
1965 1968 {
1966 1969 struct ext4_ext_path *path = NULL;
1967 1970 struct ext4_ext_cache cbex;
1968 1971 struct ext4_extent *ex;
1969   - ext4_lblk_t next, start = 0, end = 0;
  1972 + ext4_lblk_t next, next_del, start = 0, end = 0;
1970 1973 ext4_lblk_t last = block + num;
1971   - int depth, exists, err = 0;
  1974 + int exists, depth = 0, err = 0;
  1975 + unsigned int flags = 0;
  1976 + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
1972 1977  
1973   - BUG_ON(func == NULL);
1974   - BUG_ON(inode == NULL);
1975   -
1976 1978 while (block < last && block != EXT_MAX_BLOCKS) {
1977 1979 num = last - block;
1978 1980 /* find extent for this block */
1979 1981 down_read(&EXT4_I(inode)->i_data_sem);
  1982 +
  1983 + if (path && ext_depth(inode) != depth) {
  1984 + /* depth was changed. we have to realloc path */
  1985 + kfree(path);
  1986 + path = NULL;
  1987 + }
  1988 +
1980 1989 path = ext4_ext_find_extent(inode, block, path);
1981   - up_read(&EXT4_I(inode)->i_data_sem);
1982 1990 if (IS_ERR(path)) {
  1991 + up_read(&EXT4_I(inode)->i_data_sem);
1983 1992 err = PTR_ERR(path);
1984 1993 path = NULL;
1985 1994 break;
1986 1995  
1987 1996  
... ... @@ -1987,13 +1996,16 @@
1987 1996  
1988 1997 depth = ext_depth(inode);
1989 1998 if (unlikely(path[depth].p_hdr == NULL)) {
  1999 + up_read(&EXT4_I(inode)->i_data_sem);
1990 2000 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1991 2001 err = -EIO;
1992 2002 break;
1993 2003 }
1994 2004 ex = path[depth].p_ext;
1995 2005 next = ext4_ext_next_allocated_block(path);
  2006 + ext4_ext_drop_refs(path);
1996 2007  
  2008 + flags = 0;
1997 2009 exists = 0;
1998 2010 if (!ex) {
1999 2011 /* there is no extent yet, so try to allocate
2000 2012  
2001 2013  
2002 2014  
2003 2015  
... ... @@ -2037,30 +2049,54 @@
2037 2049 cbex.ec_block = le32_to_cpu(ex->ee_block);
2038 2050 cbex.ec_len = ext4_ext_get_actual_len(ex);
2039 2051 cbex.ec_start = ext4_ext_pblock(ex);
  2052 + if (ext4_ext_is_uninitialized(ex))
  2053 + flags |= FIEMAP_EXTENT_UNWRITTEN;
2040 2054 }
2041 2055  
  2056 + /*
  2057 + * Find delayed extent and update cbex accordingly. We call
  2058 + * it even in !exists case to find out whether cbex is the
  2059 + * last existing extent or not.
  2060 + */
  2061 + next_del = ext4_find_delayed_extent(inode, &cbex);
  2062 + if (!exists && next_del) {
  2063 + exists = 1;
  2064 + flags |= FIEMAP_EXTENT_DELALLOC;
  2065 + }
  2066 + up_read(&EXT4_I(inode)->i_data_sem);
  2067 +
2042 2068 if (unlikely(cbex.ec_len == 0)) {
2043 2069 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
2044 2070 err = -EIO;
2045 2071 break;
2046 2072 }
2047   - err = func(inode, next, &cbex, ex, cbdata);
2048   - ext4_ext_drop_refs(path);
2049 2073  
2050   - if (err < 0)
2051   - break;
2052   -
2053   - if (err == EXT_REPEAT)
2054   - continue;
2055   - else if (err == EXT_BREAK) {
2056   - err = 0;
2057   - break;
  2074 + /* This is possible iff next == next_del == EXT_MAX_BLOCKS */
  2075 + if (next == next_del) {
  2076 + flags |= FIEMAP_EXTENT_LAST;
  2077 + if (unlikely(next_del != EXT_MAX_BLOCKS ||
  2078 + next != EXT_MAX_BLOCKS)) {
  2079 + EXT4_ERROR_INODE(inode,
  2080 + "next extent == %u, next "
  2081 + "delalloc extent = %u",
  2082 + next, next_del);
  2083 + err = -EIO;
  2084 + break;
  2085 + }
2058 2086 }
2059 2087  
2060   - if (ext_depth(inode) != depth) {
2061   - /* depth was changed. we have to realloc path */
2062   - kfree(path);
2063   - path = NULL;
  2088 + if (exists) {
  2089 + err = fiemap_fill_next_extent(fieinfo,
  2090 + (__u64)cbex.ec_block << blksize_bits,
  2091 + (__u64)cbex.ec_start << blksize_bits,
  2092 + (__u64)cbex.ec_len << blksize_bits,
  2093 + flags);
  2094 + if (err < 0)
  2095 + break;
  2096 + if (err == 1) {
  2097 + err = 0;
  2098 + break;
  2099 + }
2064 2100 }
2065 2101  
2066 2102 block = cbex.ec_block + cbex.ec_len;
2067 2103  
2068 2104  
2069 2105  
2070 2106  
... ... @@ -4493,26 +4529,23 @@
4493 4529 }
4494 4530  
4495 4531 /*
4496   - * Callback function called for each extent to gather FIEMAP information.
  4532 + * If newex is not existing extent (newex->ec_start equals zero) find
  4533 + * delayed extent at start of newex and update newex accordingly and
  4534 + * return start of the next delayed extent.
  4535 + *
  4536 + * If newex is existing extent (newex->ec_start is not equal zero)
  4537 + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
  4538 + * extent found. Leave newex unmodified.
4497 4539 */
4498   -static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
4499   - struct ext4_ext_cache *newex, struct ext4_extent *ex,
4500   - void *data)
  4540 +static int ext4_find_delayed_extent(struct inode *inode,
  4541 + struct ext4_ext_cache *newex)
4501 4542 {
4502 4543 struct extent_status es;
4503   - __u64 logical;
4504   - __u64 physical;
4505   - __u64 length;
4506   - __u32 flags = 0;
4507 4544 ext4_lblk_t next_del;
4508   - int ret = 0;
4509   - struct fiemap_extent_info *fieinfo = data;
4510   - unsigned char blksize_bits;
4511 4545  
4512 4546 es.start = newex->ec_block;
4513 4547 next_del = ext4_es_find_extent(inode, &es);
4514 4548  
4515   - next = min(next_del, next);
4516 4549 if (newex->ec_start == 0) {
4517 4550 /*
4518 4551 * No extent in extent-tree contains block @newex->ec_start,
4519 4552  
4520 4553  
4521 4554  
... ... @@ -4520,37 +4553,19 @@
4520 4553 */
4521 4554 if (es.len == 0)
4522 4555 /* A hole found. */
4523   - return EXT_CONTINUE;
  4556 + return 0;
4524 4557  
4525 4558 if (es.start > newex->ec_block) {
4526 4559 /* A hole found. */
4527 4560 newex->ec_len = min(es.start - newex->ec_block,
4528 4561 newex->ec_len);
4529   - return EXT_CONTINUE;
  4562 + return 0;
4530 4563 }
4531 4564  
4532   - flags |= FIEMAP_EXTENT_DELALLOC;
4533 4565 newex->ec_len = es.start + es.len - newex->ec_block;
4534 4566 }
4535 4567  
4536   - if (ex && ext4_ext_is_uninitialized(ex))
4537   - flags |= FIEMAP_EXTENT_UNWRITTEN;
4538   -
4539   - if (next == EXT_MAX_BLOCKS)
4540   - flags |= FIEMAP_EXTENT_LAST;
4541   -
4542   - blksize_bits = inode->i_sb->s_blocksize_bits;
4543   - logical = (__u64)newex->ec_block << blksize_bits;
4544   - physical = (__u64)newex->ec_start << blksize_bits;
4545   - length = (__u64)newex->ec_len << blksize_bits;
4546   -
4547   - ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4548   - length, flags);
4549   - if (ret < 0)
4550   - return ret;
4551   - if (ret == 1)
4552   - return EXT_BREAK;
4553   - return EXT_CONTINUE;
  4568 + return next_del;
4554 4569 }
4555 4570 /* fiemap flags we can handle specified here */
4556 4571 #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
... ... @@ -4772,6 +4787,7 @@
4772 4787 mutex_unlock(&inode->i_mutex);
4773 4788 return err;
4774 4789 }
  4790 +
4775 4791 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4776 4792 __u64 start, __u64 len)
4777 4793 {
4778 4794  
... ... @@ -4799,11 +4815,11 @@
4799 4815 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4800 4816  
4801 4817 /*
4802   - * Walk the extent tree gathering extent information.
4803   - * ext4_ext_fiemap_cb will push extents back to user.
  4818 + * Walk the extent tree gathering extent information
  4819 + * and pushing extents back to the user.
4804 4820 */
4805   - error = ext4_ext_walk_space(inode, start_blk, len_blks,
4806   - ext4_ext_fiemap_cb, fieinfo);
  4821 + error = ext4_fill_fiemap_extents(inode, start_blk,
  4822 + len_blks, fieinfo);
4807 4823 }
4808 4824  
4809 4825 return error;