Commit 53e872681fed6a43047e71bf927f77d06f467988

Authored by Jan Kara
Committed by Theodore Ts'o
1 parent 4520fb3c36

ext4: fix deadlock in journal_unmap_buffer()

We cannot wait for transaction commit in journal_unmap_buffer()
because we hold page lock which ranks below transaction start.  We
solve the issue by bailing out of journal_unmap_buffer() and
jbd2_journal_invalidatepage() with -EBUSY.  Caller is then responsible
for waiting for transaction commit to finish and try invalidation
again. Since the issue can happen only for page stradding i_size, it
is simple enough to manually call jbd2_journal_invalidatepage() for
such page from ext4_setattr(), check the return value and wait if
necessary.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Showing 3 changed files with 86 additions and 25 deletions Side-by-side Diff

... ... @@ -2894,8 +2894,8 @@
2894 2894 block_invalidatepage(page, offset);
2895 2895 }
2896 2896  
2897   -static void ext4_journalled_invalidatepage(struct page *page,
2898   - unsigned long offset)
  2897 +static int __ext4_journalled_invalidatepage(struct page *page,
  2898 + unsigned long offset)
2899 2899 {
2900 2900 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2901 2901  
2902 2902  
... ... @@ -2907,9 +2907,16 @@
2907 2907 if (offset == 0)
2908 2908 ClearPageChecked(page);
2909 2909  
2910   - jbd2_journal_invalidatepage(journal, page, offset);
  2910 + return jbd2_journal_invalidatepage(journal, page, offset);
2911 2911 }
2912 2912  
  2913 +/* Wrapper for aops... */
  2914 +static void ext4_journalled_invalidatepage(struct page *page,
  2915 + unsigned long offset)
  2916 +{
  2917 + WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
  2918 +}
  2919 +
2913 2920 static int ext4_releasepage(struct page *page, gfp_t wait)
2914 2921 {
2915 2922 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
... ... @@ -4314,6 +4321,47 @@
4314 4321 }
4315 4322  
4316 4323 /*
  4324 + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
  4325 + * buffers that are attached to a page stradding i_size and are undergoing
  4326 + * commit. In that case we have to wait for commit to finish and try again.
  4327 + */
  4328 +static void ext4_wait_for_tail_page_commit(struct inode *inode)
  4329 +{
  4330 + struct page *page;
  4331 + unsigned offset;
  4332 + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
  4333 + tid_t commit_tid = 0;
  4334 + int ret;
  4335 +
  4336 + offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
  4337 + /*
  4338 + * All buffers in the last page remain valid? Then there's nothing to
  4339 + * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
  4340 + * blocksize case
  4341 + */
  4342 + if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
  4343 + return;
  4344 + while (1) {
  4345 + page = find_lock_page(inode->i_mapping,
  4346 + inode->i_size >> PAGE_CACHE_SHIFT);
  4347 + if (!page)
  4348 + return;
  4349 + ret = __ext4_journalled_invalidatepage(page, offset);
  4350 + unlock_page(page);
  4351 + page_cache_release(page);
  4352 + if (ret != -EBUSY)
  4353 + return;
  4354 + commit_tid = 0;
  4355 + read_lock(&journal->j_state_lock);
  4356 + if (journal->j_committing_transaction)
  4357 + commit_tid = journal->j_committing_transaction->t_tid;
  4358 + read_unlock(&journal->j_state_lock);
  4359 + if (commit_tid)
  4360 + jbd2_log_wait_commit(journal, commit_tid);
  4361 + }
  4362 +}
  4363 +
  4364 +/*
4317 4365 * ext4_setattr()
4318 4366 *
4319 4367 * Called from notify_change.
4320 4368  
4321 4369  
... ... @@ -4426,16 +4474,28 @@
4426 4474 }
4427 4475  
4428 4476 if (attr->ia_valid & ATTR_SIZE) {
4429   - if (attr->ia_size != i_size_read(inode)) {
4430   - truncate_setsize(inode, attr->ia_size);
4431   - /* Inode size will be reduced, wait for dio in flight.
4432   - * Temporarily disable dioread_nolock to prevent
4433   - * livelock. */
  4477 + if (attr->ia_size != inode->i_size) {
  4478 + loff_t oldsize = inode->i_size;
  4479 +
  4480 + i_size_write(inode, attr->ia_size);
  4481 + /*
  4482 + * Blocks are going to be removed from the inode. Wait
  4483 + * for dio in flight. Temporarily disable
  4484 + * dioread_nolock to prevent livelock.
  4485 + */
4434 4486 if (orphan) {
4435   - ext4_inode_block_unlocked_dio(inode);
4436   - inode_dio_wait(inode);
4437   - ext4_inode_resume_unlocked_dio(inode);
  4487 + if (!ext4_should_journal_data(inode)) {
  4488 + ext4_inode_block_unlocked_dio(inode);
  4489 + inode_dio_wait(inode);
  4490 + ext4_inode_resume_unlocked_dio(inode);
  4491 + } else
  4492 + ext4_wait_for_tail_page_commit(inode);
4438 4493 }
  4494 + /*
  4495 + * Truncate pagecache after we've waited for commit
  4496 + * in data=journal mode to make pages freeable.
  4497 + */
  4498 + truncate_pagecache(inode, oldsize, inode->i_size);
4439 4499 }
4440 4500 ext4_truncate(inode);
4441 4501 }
fs/jbd2/transaction.c
... ... @@ -1840,7 +1840,6 @@
1840 1840  
1841 1841 BUFFER_TRACE(bh, "entry");
1842 1842  
1843   -retry:
1844 1843 /*
1845 1844 * It is safe to proceed here without the j_list_lock because the
1846 1845 * buffers cannot be stolen by try_to_free_buffers as long as we are
1847 1846  
... ... @@ -1935,14 +1934,11 @@
1935 1934 * for commit and try again.
1936 1935 */
1937 1936 if (partial_page) {
1938   - tid_t tid = journal->j_committing_transaction->t_tid;
1939   -
1940 1937 jbd2_journal_put_journal_head(jh);
1941 1938 spin_unlock(&journal->j_list_lock);
1942 1939 jbd_unlock_bh_state(bh);
1943 1940 write_unlock(&journal->j_state_lock);
1944   - jbd2_log_wait_commit(journal, tid);
1945   - goto retry;
  1941 + return -EBUSY;
1946 1942 }
1947 1943 /*
1948 1944 * OK, buffer won't be reachable after truncate. We just set
1949 1945  
1950 1946  
1951 1947  
... ... @@ -2003,21 +1999,23 @@
2003 1999 * @page: page to flush
2004 2000 * @offset: length of page to invalidate.
2005 2001 *
2006   - * Reap page buffers containing data after offset in page.
2007   - *
  2002 + * Reap page buffers containing data after offset in page. Can return -EBUSY
  2003 + * if buffers are part of the committing transaction and the page is straddling
  2004 + * i_size. Caller then has to wait for current commit and try again.
2008 2005 */
2009   -void jbd2_journal_invalidatepage(journal_t *journal,
2010   - struct page *page,
2011   - unsigned long offset)
  2006 +int jbd2_journal_invalidatepage(journal_t *journal,
  2007 + struct page *page,
  2008 + unsigned long offset)
2012 2009 {
2013 2010 struct buffer_head *head, *bh, *next;
2014 2011 unsigned int curr_off = 0;
2015 2012 int may_free = 1;
  2013 + int ret = 0;
2016 2014  
2017 2015 if (!PageLocked(page))
2018 2016 BUG();
2019 2017 if (!page_has_buffers(page))
2020   - return;
  2018 + return 0;
2021 2019  
2022 2020 /* We will potentially be playing with lists other than just the
2023 2021 * data lists (especially for journaled data mode), so be
2024 2022  
... ... @@ -2031,9 +2029,11 @@
2031 2029 if (offset <= curr_off) {
2032 2030 /* This block is wholly outside the truncation point */
2033 2031 lock_buffer(bh);
2034   - may_free &= journal_unmap_buffer(journal, bh,
2035   - offset > 0);
  2032 + ret = journal_unmap_buffer(journal, bh, offset > 0);
2036 2033 unlock_buffer(bh);
  2034 + if (ret < 0)
  2035 + return ret;
  2036 + may_free &= ret;
2037 2037 }
2038 2038 curr_off = next_off;
2039 2039 bh = next;
... ... @@ -2044,6 +2044,7 @@
2044 2044 if (may_free && try_to_free_buffers(page))
2045 2045 J_ASSERT(!page_has_buffers(page));
2046 2046 }
  2047 + return 0;
2047 2048 }
2048 2049  
2049 2050 /*
include/linux/jbd2.h
... ... @@ -1098,7 +1098,7 @@
1098 1098 extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
1099 1099 extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
1100 1100 extern void journal_sync_buffer (struct buffer_head *);
1101   -extern void jbd2_journal_invalidatepage(journal_t *,
  1101 +extern int jbd2_journal_invalidatepage(journal_t *,
1102 1102 struct page *, unsigned long);
1103 1103 extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
1104 1104 extern int jbd2_journal_stop(handle_t *);