Commit 1cf29683f4414296dc772a87caa207cab16c310c
Exists in
master
and in
4 other branches
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: jbd2: fix race between write_metadata_buffer and get_write_access ext4: Fix ext4_mb_initialize_context() to initialize all fields ext4: fix null handler of ioctls in no journal mode ext4: Fix buffer head reference leak in no-journal mode ext4: Move __ext4_journalled_writepage() to avoid forward declaration ext4: Fix mmap/truncate race when blocksize < pagesize && !nodellaoc ext4: Fix mmap/truncate race when blocksize < pagesize && delayed allocation ext4: Don't look at buffer_heads outside i_size. ext4: Fix goal inum check in the inode allocator ext4: fix no journal corruption with locale-gen ext4: Calculate required journal credits for inserting an extent properly ext4: Fix truncation of symlinks after failed write jbd2: Fix a race between checkpointing code and journal_get_write_access() ext4: Use rcu_barrier() on module unload. ext4: naturally align struct ext4_allocation_request ext4: mark several more functions in mballoc.c as noinline ext4: Fix potential reclaim deadlock when truncating partial block jbd2: Remove GFP_ATOMIC kmalloc from inside spinlock critical region ext4: Fix type warning on 64-bit platforms in tracing events header
Showing 11 changed files Side-by-side Diff
fs/ext4/ext4.h
... | ... | @@ -93,20 +93,20 @@ |
93 | 93 | struct ext4_allocation_request { |
94 | 94 | /* target inode for block we're allocating */ |
95 | 95 | struct inode *inode; |
96 | + /* how many blocks we want to allocate */ | |
97 | + unsigned int len; | |
96 | 98 | /* logical block in target inode */ |
97 | 99 | ext4_lblk_t logical; |
98 | - /* phys. target (a hint) */ | |
99 | - ext4_fsblk_t goal; | |
100 | 100 | /* the closest logical allocated block to the left */ |
101 | 101 | ext4_lblk_t lleft; |
102 | - /* phys. block for ^^^ */ | |
103 | - ext4_fsblk_t pleft; | |
104 | 102 | /* the closest logical allocated block to the right */ |
105 | 103 | ext4_lblk_t lright; |
106 | - /* phys. block for ^^^ */ | |
104 | + /* phys. target (a hint) */ | |
105 | + ext4_fsblk_t goal; | |
106 | + /* phys. block for the closest logical allocated block to the left */ | |
107 | + ext4_fsblk_t pleft; | |
108 | + /* phys. block for the closest logical allocated block to the right */ | |
107 | 109 | ext4_fsblk_t pright; |
108 | - /* how many blocks we want to allocate */ | |
109 | - unsigned int len; | |
110 | 110 | /* flags. see above EXT4_MB_HINT_* */ |
111 | 111 | unsigned int flags; |
112 | 112 | }; |
fs/ext4/ext4_jbd2.c
... | ... | @@ -43,6 +43,8 @@ |
43 | 43 | ext4_journal_abort_handle(where, __func__, bh, |
44 | 44 | handle, err); |
45 | 45 | } |
46 | + else | |
47 | + brelse(bh); | |
46 | 48 | return err; |
47 | 49 | } |
48 | 50 | |
... | ... | @@ -57,6 +59,8 @@ |
57 | 59 | ext4_journal_abort_handle(where, __func__, bh, |
58 | 60 | handle, err); |
59 | 61 | } |
62 | + else | |
63 | + brelse(bh); | |
60 | 64 | return err; |
61 | 65 | } |
62 | 66 |
fs/ext4/ext4_jbd2.h
... | ... | @@ -131,9 +131,11 @@ |
131 | 131 | int __ext4_journal_get_write_access(const char *where, handle_t *handle, |
132 | 132 | struct buffer_head *bh); |
133 | 133 | |
134 | +/* When called with an invalid handle, this will still do a put on the BH */ | |
134 | 135 | int __ext4_journal_forget(const char *where, handle_t *handle, |
135 | 136 | struct buffer_head *bh); |
136 | 137 | |
138 | +/* When called with an invalid handle, this will still do a put on the BH */ | |
137 | 139 | int __ext4_journal_revoke(const char *where, handle_t *handle, |
138 | 140 | ext4_fsblk_t blocknr, struct buffer_head *bh); |
139 | 141 | |
140 | 142 | |
... | ... | @@ -281,10 +283,10 @@ |
281 | 283 | |
282 | 284 | static inline int ext4_should_writeback_data(struct inode *inode) |
283 | 285 | { |
284 | - if (EXT4_JOURNAL(inode) == NULL) | |
285 | - return 0; | |
286 | 286 | if (!S_ISREG(inode->i_mode)) |
287 | 287 | return 0; |
288 | + if (EXT4_JOURNAL(inode) == NULL) | |
289 | + return 1; | |
288 | 290 | if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) |
289 | 291 | return 0; |
290 | 292 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) |
fs/ext4/extents.c
fs/ext4/ialloc.c
... | ... | @@ -833,7 +833,7 @@ |
833 | 833 | if (!goal) |
834 | 834 | goal = sbi->s_inode_goal; |
835 | 835 | |
836 | - if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) { | |
836 | + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { | |
837 | 837 | group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); |
838 | 838 | ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); |
839 | 839 | ret2 = 0; |
fs/ext4/inode.c
... | ... | @@ -78,16 +78,14 @@ |
78 | 78 | * but there may still be a record of it in the journal, and that record |
79 | 79 | * still needs to be revoked. |
80 | 80 | * |
81 | - * If the handle isn't valid we're not journaling so there's nothing to do. | |
81 | + * If the handle isn't valid we're not journaling, but we still need to | |
82 | + * call into ext4_journal_revoke() to put the buffer head. | |
82 | 83 | */ |
83 | 84 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, |
84 | 85 | struct buffer_head *bh, ext4_fsblk_t blocknr) |
85 | 86 | { |
86 | 87 | int err; |
87 | 88 | |
88 | - if (!ext4_handle_valid(handle)) | |
89 | - return 0; | |
90 | - | |
91 | 89 | might_sleep(); |
92 | 90 | |
93 | 91 | BUFFER_TRACE(bh, "enter"); |
94 | 92 | |
95 | 93 | |
... | ... | @@ -1513,14 +1511,14 @@ |
1513 | 1511 | * Add inode to orphan list in case we crash before |
1514 | 1512 | * truncate finishes |
1515 | 1513 | */ |
1516 | - if (pos + len > inode->i_size) | |
1514 | + if (pos + len > inode->i_size && ext4_can_truncate(inode)) | |
1517 | 1515 | ext4_orphan_add(handle, inode); |
1518 | 1516 | |
1519 | 1517 | ext4_journal_stop(handle); |
1520 | 1518 | if (pos + len > inode->i_size) { |
1521 | - vmtruncate(inode, inode->i_size); | |
1519 | + ext4_truncate(inode); | |
1522 | 1520 | /* |
1523 | - * If vmtruncate failed early the inode might | |
1521 | + * If truncate failed early the inode might | |
1524 | 1522 | * still be on the orphan list; we need to |
1525 | 1523 | * make sure the inode is removed from the |
1526 | 1524 | * orphan list in that case. |
... | ... | @@ -1614,7 +1612,7 @@ |
1614 | 1612 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, |
1615 | 1613 | page, fsdata); |
1616 | 1614 | copied = ret2; |
1617 | - if (pos + len > inode->i_size) | |
1615 | + if (pos + len > inode->i_size && ext4_can_truncate(inode)) | |
1618 | 1616 | /* if we have allocated more blocks and copied |
1619 | 1617 | * less. We will have blocks allocated outside |
1620 | 1618 | * inode->i_size. So truncate them |
1621 | 1619 | |
... | ... | @@ -1628,9 +1626,9 @@ |
1628 | 1626 | ret = ret2; |
1629 | 1627 | |
1630 | 1628 | if (pos + len > inode->i_size) { |
1631 | - vmtruncate(inode, inode->i_size); | |
1629 | + ext4_truncate(inode); | |
1632 | 1630 | /* |
1633 | - * If vmtruncate failed early the inode might still be | |
1631 | + * If truncate failed early the inode might still be | |
1634 | 1632 | * on the orphan list; we need to make sure the inode |
1635 | 1633 | * is removed from the orphan list in that case. |
1636 | 1634 | */ |
... | ... | @@ -1655,7 +1653,7 @@ |
1655 | 1653 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, |
1656 | 1654 | page, fsdata); |
1657 | 1655 | copied = ret2; |
1658 | - if (pos + len > inode->i_size) | |
1656 | + if (pos + len > inode->i_size && ext4_can_truncate(inode)) | |
1659 | 1657 | /* if we have allocated more blocks and copied |
1660 | 1658 | * less. We will have blocks allocated outside |
1661 | 1659 | * inode->i_size. So truncate them |
1662 | 1660 | |
... | ... | @@ -1670,9 +1668,9 @@ |
1670 | 1668 | ret = ret2; |
1671 | 1669 | |
1672 | 1670 | if (pos + len > inode->i_size) { |
1673 | - vmtruncate(inode, inode->i_size); | |
1671 | + ext4_truncate(inode); | |
1674 | 1672 | /* |
1675 | - * If vmtruncate failed early the inode might still be | |
1673 | + * If truncate failed early the inode might still be | |
1676 | 1674 | * on the orphan list; we need to make sure the inode |
1677 | 1675 | * is removed from the orphan list in that case. |
1678 | 1676 | */ |
... | ... | @@ -1722,7 +1720,7 @@ |
1722 | 1720 | |
1723 | 1721 | unlock_page(page); |
1724 | 1722 | page_cache_release(page); |
1725 | - if (pos + len > inode->i_size) | |
1723 | + if (pos + len > inode->i_size && ext4_can_truncate(inode)) | |
1726 | 1724 | /* if we have allocated more blocks and copied |
1727 | 1725 | * less. We will have blocks allocated outside |
1728 | 1726 | * inode->i_size. So truncate them |
1729 | 1727 | |
... | ... | @@ -1733,9 +1731,9 @@ |
1733 | 1731 | if (!ret) |
1734 | 1732 | ret = ret2; |
1735 | 1733 | if (pos + len > inode->i_size) { |
1736 | - vmtruncate(inode, inode->i_size); | |
1734 | + ext4_truncate(inode); | |
1737 | 1735 | /* |
1738 | - * If vmtruncate failed early the inode might still be | |
1736 | + * If truncate failed early the inode might still be | |
1739 | 1737 | * on the orphan list; we need to make sure the inode |
1740 | 1738 | * is removed from the orphan list in that case. |
1741 | 1739 | */ |
1742 | 1740 | |
... | ... | @@ -2305,15 +2303,9 @@ |
2305 | 2303 | return; |
2306 | 2304 | } |
2307 | 2305 | |
2308 | -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | |
2306 | +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | |
2309 | 2307 | { |
2310 | - /* | |
2311 | - * unmapped buffer is possible for holes. | |
2312 | - * delay buffer is possible with delayed allocation. | |
2313 | - * We also need to consider unwritten buffer as unmapped. | |
2314 | - */ | |
2315 | - return (!buffer_mapped(bh) || buffer_delay(bh) || | |
2316 | - buffer_unwritten(bh)) && buffer_dirty(bh); | |
2308 | + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); | |
2317 | 2309 | } |
2318 | 2310 | |
2319 | 2311 | /* |
2320 | 2312 | |
... | ... | @@ -2398,9 +2390,9 @@ |
2398 | 2390 | * We need to try to allocate |
2399 | 2391 | * unmapped blocks in the same page. |
2400 | 2392 | * Otherwise we won't make progress |
2401 | - * with the page in ext4_da_writepage | |
2393 | + * with the page in ext4_writepage | |
2402 | 2394 | */ |
2403 | - if (ext4_bh_unmapped_or_delay(NULL, bh)) { | |
2395 | + if (ext4_bh_delay_or_unwritten(NULL, bh)) { | |
2404 | 2396 | mpage_add_bh_to_extent(mpd, logical, |
2405 | 2397 | bh->b_size, |
2406 | 2398 | bh->b_state); |
... | ... | @@ -2517,7 +2509,6 @@ |
2517 | 2509 | * so call get_block_wrap with create = 0 |
2518 | 2510 | */ |
2519 | 2511 | ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); |
2520 | - BUG_ON(create && ret == 0); | |
2521 | 2512 | if (ret > 0) { |
2522 | 2513 | bh_result->b_size = (ret << inode->i_blkbits); |
2523 | 2514 | ret = 0; |
2524 | 2515 | |
2525 | 2516 | |
2526 | 2517 | |
... | ... | @@ -2525,15 +2516,102 @@ |
2525 | 2516 | return ret; |
2526 | 2517 | } |
2527 | 2518 | |
2519 | +static int bget_one(handle_t *handle, struct buffer_head *bh) | |
2520 | +{ | |
2521 | + get_bh(bh); | |
2522 | + return 0; | |
2523 | +} | |
2524 | + | |
2525 | +static int bput_one(handle_t *handle, struct buffer_head *bh) | |
2526 | +{ | |
2527 | + put_bh(bh); | |
2528 | + return 0; | |
2529 | +} | |
2530 | + | |
2531 | +static int __ext4_journalled_writepage(struct page *page, | |
2532 | + struct writeback_control *wbc, | |
2533 | + unsigned int len) | |
2534 | +{ | |
2535 | + struct address_space *mapping = page->mapping; | |
2536 | + struct inode *inode = mapping->host; | |
2537 | + struct buffer_head *page_bufs; | |
2538 | + handle_t *handle = NULL; | |
2539 | + int ret = 0; | |
2540 | + int err; | |
2541 | + | |
2542 | + page_bufs = page_buffers(page); | |
2543 | + BUG_ON(!page_bufs); | |
2544 | + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); | |
2545 | + /* As soon as we unlock the page, it can go away, but we have | |
2546 | + * references to buffers so we are safe */ | |
2547 | + unlock_page(page); | |
2548 | + | |
2549 | + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | |
2550 | + if (IS_ERR(handle)) { | |
2551 | + ret = PTR_ERR(handle); | |
2552 | + goto out; | |
2553 | + } | |
2554 | + | |
2555 | + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, | |
2556 | + do_journal_get_write_access); | |
2557 | + | |
2558 | + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, | |
2559 | + write_end_fn); | |
2560 | + if (ret == 0) | |
2561 | + ret = err; | |
2562 | + err = ext4_journal_stop(handle); | |
2563 | + if (!ret) | |
2564 | + ret = err; | |
2565 | + | |
2566 | + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); | |
2567 | + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | |
2568 | +out: | |
2569 | + return ret; | |
2570 | +} | |
2571 | + | |
2528 | 2572 | /* |
2573 | + * Note that we don't need to start a transaction unless we're journaling data | |
2574 | + * because we should have holes filled from ext4_page_mkwrite(). We even don't | |
2575 | + * need to file the inode to the transaction's list in ordered mode because if | |
2576 | + * we are writing back data added by write(), the inode is already there and if | |
2577 | + * we are writing back data modified via mmap(), no one guarantees in which | |
2578 | + * transaction the data will hit the disk. In case we are journaling data, we | |
2579 | + * cannot start transaction directly because transaction start ranks above page | |
2580 | + * lock so we have to do some magic. | |
2581 | + * | |
2529 | 2582 | * This function can get called via... |
2530 | 2583 | * - ext4_da_writepages after taking page lock (have journal handle) |
2531 | 2584 | * - journal_submit_inode_data_buffers (no journal handle) |
2532 | 2585 | * - shrink_page_list via pdflush (no journal handle) |
2533 | 2586 | * - grab_page_cache when doing write_begin (have journal handle) |
2587 | + * | |
2588 | + * We don't do any block allocation in this function. If we have page with | |
2589 | + * multiple blocks we need to write those buffer_heads that are mapped. This | |
2590 | + * is important for mmaped based write. So if we do with blocksize 1K | |
2591 | + * truncate(f, 1024); | |
2592 | + * a = mmap(f, 0, 4096); | |
2593 | + * a[0] = 'a'; | |
2594 | + * truncate(f, 4096); | |
2595 | + * we have in the page first buffer_head mapped via page_mkwrite call back | |
2596 | + * but other buffer_heads would be unmapped but dirty (dirty done via the | |
2597 | + * do_wp_page). So writepage should write the first block. If we modify | |
2598 | + * the mmap area beyond 1024 we will again get a page_fault and the | |
2599 | + * page_mkwrite callback will do the block allocation and mark the | |
2600 | + * buffer_heads mapped. | |
2601 | + * | |
2602 | + * We redirty the page if any buffer_head in the page is either delayed or | |
2603 | + * unwritten. | |
2604 | + * | |
2605 | + * We can get recursively called as shown below. | |
2606 | + * | |
2607 | + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> | |
2608 | + * ext4_writepage() | |
2609 | + * | |
2610 | + * But since we don't do any block allocation we should not deadlock. | |
2611 | + * The page also has the dirty flag cleared so we don't get a recursive page_lock. | |
2534 | 2612 | */ |
2535 | -static int ext4_da_writepage(struct page *page, | |
2536 | - struct writeback_control *wbc) | |
2613 | +static int ext4_writepage(struct page *page, | |
2614 | + struct writeback_control *wbc) | |
2537 | 2615 | { |
2538 | 2616 | int ret = 0; |
2539 | 2617 | loff_t size; |
... | ... | @@ -2541,7 +2619,7 @@ |
2541 | 2619 | struct buffer_head *page_bufs; |
2542 | 2620 | struct inode *inode = page->mapping->host; |
2543 | 2621 | |
2544 | - trace_ext4_da_writepage(inode, page); | |
2622 | + trace_ext4_writepage(inode, page); | |
2545 | 2623 | size = i_size_read(inode); |
2546 | 2624 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2547 | 2625 | len = size & ~PAGE_CACHE_MASK; |
... | ... | @@ -2551,7 +2629,7 @@ |
2551 | 2629 | if (page_has_buffers(page)) { |
2552 | 2630 | page_bufs = page_buffers(page); |
2553 | 2631 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, |
2554 | - ext4_bh_unmapped_or_delay)) { | |
2632 | + ext4_bh_delay_or_unwritten)) { | |
2555 | 2633 | /* |
2556 | 2634 | * We don't want to do block allocation |
2557 | 2635 | * So redirty the page and return |
2558 | 2636 | |
... | ... | @@ -2578,13 +2656,13 @@ |
2578 | 2656 | * all are mapped and non delay. We don't want to |
2579 | 2657 | * do block allocation here. |
2580 | 2658 | */ |
2581 | - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | |
2659 | + ret = block_prepare_write(page, 0, len, | |
2582 | 2660 | noalloc_get_block_write); |
2583 | 2661 | if (!ret) { |
2584 | 2662 | page_bufs = page_buffers(page); |
2585 | 2663 | /* check whether all are mapped and non delay */ |
2586 | 2664 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, |
2587 | - ext4_bh_unmapped_or_delay)) { | |
2665 | + ext4_bh_delay_or_unwritten)) { | |
2588 | 2666 | redirty_page_for_writepage(wbc, page); |
2589 | 2667 | unlock_page(page); |
2590 | 2668 | return 0; |
2591 | 2669 | |
... | ... | @@ -2600,9 +2678,18 @@ |
2600 | 2678 | return 0; |
2601 | 2679 | } |
2602 | 2680 | /* now mark the buffer_heads as dirty and uptodate */ |
2603 | - block_commit_write(page, 0, PAGE_CACHE_SIZE); | |
2681 | + block_commit_write(page, 0, len); | |
2604 | 2682 | } |
2605 | 2683 | |
2684 | + if (PageChecked(page) && ext4_should_journal_data(inode)) { | |
2685 | + /* | |
2686 | + * It's mmapped pagecache. Add buffers and journal it. There | |
2687 | + * doesn't seem much point in redirtying the page here. | |
2688 | + */ | |
2689 | + ClearPageChecked(page); | |
2690 | + return __ext4_journalled_writepage(page, wbc, len); | |
2691 | + } | |
2692 | + | |
2606 | 2693 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) |
2607 | 2694 | ret = nobh_writepage(page, noalloc_get_block_write, wbc); |
2608 | 2695 | else |
... | ... | @@ -2907,7 +2994,7 @@ |
2907 | 2994 | * i_size_read because we hold i_mutex. |
2908 | 2995 | */ |
2909 | 2996 | if (pos + len > inode->i_size) |
2910 | - vmtruncate(inode, inode->i_size); | |
2997 | + ext4_truncate(inode); | |
2911 | 2998 | } |
2912 | 2999 | |
2913 | 3000 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
... | ... | @@ -3130,222 +3217,6 @@ |
3130 | 3217 | return generic_block_bmap(mapping, block, ext4_get_block); |
3131 | 3218 | } |
3132 | 3219 | |
3133 | -static int bget_one(handle_t *handle, struct buffer_head *bh) | |
3134 | -{ | |
3135 | - get_bh(bh); | |
3136 | - return 0; | |
3137 | -} | |
3138 | - | |
3139 | -static int bput_one(handle_t *handle, struct buffer_head *bh) | |
3140 | -{ | |
3141 | - put_bh(bh); | |
3142 | - return 0; | |
3143 | -} | |
3144 | - | |
3145 | -/* | |
3146 | - * Note that we don't need to start a transaction unless we're journaling data | |
3147 | - * because we should have holes filled from ext4_page_mkwrite(). We even don't | |
3148 | - * need to file the inode to the transaction's list in ordered mode because if | |
3149 | - * we are writing back data added by write(), the inode is already there and if | |
3150 | - * we are writing back data modified via mmap(), noone guarantees in which | |
3151 | - * transaction the data will hit the disk. In case we are journaling data, we | |
3152 | - * cannot start transaction directly because transaction start ranks above page | |
3153 | - * lock so we have to do some magic. | |
3154 | - * | |
3155 | - * In all journaling modes block_write_full_page() will start the I/O. | |
3156 | - * | |
3157 | - * Problem: | |
3158 | - * | |
3159 | - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> | |
3160 | - * ext4_writepage() | |
3161 | - * | |
3162 | - * Similar for: | |
3163 | - * | |
3164 | - * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... | |
3165 | - * | |
3166 | - * Same applies to ext4_get_block(). We will deadlock on various things like | |
3167 | - * lock_journal and i_data_sem | |
3168 | - * | |
3169 | - * Setting PF_MEMALLOC here doesn't work - too many internal memory | |
3170 | - * allocations fail. | |
3171 | - * | |
3172 | - * 16May01: If we're reentered then journal_current_handle() will be | |
3173 | - * non-zero. We simply *return*. | |
3174 | - * | |
3175 | - * 1 July 2001: @@@ FIXME: | |
3176 | - * In journalled data mode, a data buffer may be metadata against the | |
3177 | - * current transaction. But the same file is part of a shared mapping | |
3178 | - * and someone does a writepage() on it. | |
3179 | - * | |
3180 | - * We will move the buffer onto the async_data list, but *after* it has | |
3181 | - * been dirtied. So there's a small window where we have dirty data on | |
3182 | - * BJ_Metadata. | |
3183 | - * | |
3184 | - * Note that this only applies to the last partial page in the file. The | |
3185 | - * bit which block_write_full_page() uses prepare/commit for. (That's | |
3186 | - * broken code anyway: it's wrong for msync()). | |
3187 | - * | |
3188 | - * It's a rare case: affects the final partial page, for journalled data | |
3189 | - * where the file is subject to bith write() and writepage() in the same | |
3190 | - * transction. To fix it we'll need a custom block_write_full_page(). | |
3191 | - * We'll probably need that anyway for journalling writepage() output. | |
3192 | - * | |
3193 | - * We don't honour synchronous mounts for writepage(). That would be | |
3194 | - * disastrous. Any write() or metadata operation will sync the fs for | |
3195 | - * us. | |
3196 | - * | |
3197 | - */ | |
3198 | -static int __ext4_normal_writepage(struct page *page, | |
3199 | - struct writeback_control *wbc) | |
3200 | -{ | |
3201 | - struct inode *inode = page->mapping->host; | |
3202 | - | |
3203 | - if (test_opt(inode->i_sb, NOBH)) | |
3204 | - return nobh_writepage(page, noalloc_get_block_write, wbc); | |
3205 | - else | |
3206 | - return block_write_full_page(page, noalloc_get_block_write, | |
3207 | - wbc); | |
3208 | -} | |
3209 | - | |
3210 | -static int ext4_normal_writepage(struct page *page, | |
3211 | - struct writeback_control *wbc) | |
3212 | -{ | |
3213 | - struct inode *inode = page->mapping->host; | |
3214 | - loff_t size = i_size_read(inode); | |
3215 | - loff_t len; | |
3216 | - | |
3217 | - trace_ext4_normal_writepage(inode, page); | |
3218 | - J_ASSERT(PageLocked(page)); | |
3219 | - if (page->index == size >> PAGE_CACHE_SHIFT) | |
3220 | - len = size & ~PAGE_CACHE_MASK; | |
3221 | - else | |
3222 | - len = PAGE_CACHE_SIZE; | |
3223 | - | |
3224 | - if (page_has_buffers(page)) { | |
3225 | - /* if page has buffers it should all be mapped | |
3226 | - * and allocated. If there are not buffers attached | |
3227 | - * to the page we know the page is dirty but it lost | |
3228 | - * buffers. That means that at some moment in time | |
3229 | - * after write_begin() / write_end() has been called | |
3230 | - * all buffers have been clean and thus they must have been | |
3231 | - * written at least once. So they are all mapped and we can | |
3232 | - * happily proceed with mapping them and writing the page. | |
3233 | - */ | |
3234 | - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | |
3235 | - ext4_bh_unmapped_or_delay)); | |
3236 | - } | |
3237 | - | |
3238 | - if (!ext4_journal_current_handle()) | |
3239 | - return __ext4_normal_writepage(page, wbc); | |
3240 | - | |
3241 | - redirty_page_for_writepage(wbc, page); | |
3242 | - unlock_page(page); | |
3243 | - return 0; | |
3244 | -} | |
3245 | - | |
3246 | -static int __ext4_journalled_writepage(struct page *page, | |
3247 | - struct writeback_control *wbc) | |
3248 | -{ | |
3249 | - struct address_space *mapping = page->mapping; | |
3250 | - struct inode *inode = mapping->host; | |
3251 | - struct buffer_head *page_bufs; | |
3252 | - handle_t *handle = NULL; | |
3253 | - int ret = 0; | |
3254 | - int err; | |
3255 | - | |
3256 | - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | |
3257 | - noalloc_get_block_write); | |
3258 | - if (ret != 0) | |
3259 | - goto out_unlock; | |
3260 | - | |
3261 | - page_bufs = page_buffers(page); | |
3262 | - walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, | |
3263 | - bget_one); | |
3264 | - /* As soon as we unlock the page, it can go away, but we have | |
3265 | - * references to buffers so we are safe */ | |
3266 | - unlock_page(page); | |
3267 | - | |
3268 | - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | |
3269 | - if (IS_ERR(handle)) { | |
3270 | - ret = PTR_ERR(handle); | |
3271 | - goto out; | |
3272 | - } | |
3273 | - | |
3274 | - ret = walk_page_buffers(handle, page_bufs, 0, | |
3275 | - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); | |
3276 | - | |
3277 | - err = walk_page_buffers(handle, page_bufs, 0, | |
3278 | - PAGE_CACHE_SIZE, NULL, write_end_fn); | |
3279 | - if (ret == 0) | |
3280 | - ret = err; | |
3281 | - err = ext4_journal_stop(handle); | |
3282 | - if (!ret) | |
3283 | - ret = err; | |
3284 | - | |
3285 | - walk_page_buffers(handle, page_bufs, 0, | |
3286 | - PAGE_CACHE_SIZE, NULL, bput_one); | |
3287 | - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | |
3288 | - goto out; | |
3289 | - | |
3290 | -out_unlock: | |
3291 | - unlock_page(page); | |
3292 | -out: | |
3293 | - return ret; | |
3294 | -} | |
3295 | - | |
3296 | -static int ext4_journalled_writepage(struct page *page, | |
3297 | - struct writeback_control *wbc) | |
3298 | -{ | |
3299 | - struct inode *inode = page->mapping->host; | |
3300 | - loff_t size = i_size_read(inode); | |
3301 | - loff_t len; | |
3302 | - | |
3303 | - trace_ext4_journalled_writepage(inode, page); | |
3304 | - J_ASSERT(PageLocked(page)); | |
3305 | - if (page->index == size >> PAGE_CACHE_SHIFT) | |
3306 | - len = size & ~PAGE_CACHE_MASK; | |
3307 | - else | |
3308 | - len = PAGE_CACHE_SIZE; | |
3309 | - | |
3310 | - if (page_has_buffers(page)) { | |
3311 | - /* if page has buffers it should all be mapped | |
3312 | - * and allocated. If there are not buffers attached | |
3313 | - * to the page we know the page is dirty but it lost | |
3314 | - * buffers. That means that at some moment in time | |
3315 | - * after write_begin() / write_end() has been called | |
3316 | - * all buffers have been clean and thus they must have been | |
3317 | - * written at least once. So they are all mapped and we can | |
3318 | - * happily proceed with mapping them and writing the page. | |
3319 | - */ | |
3320 | - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | |
3321 | - ext4_bh_unmapped_or_delay)); | |
3322 | - } | |
3323 | - | |
3324 | - if (ext4_journal_current_handle()) | |
3325 | - goto no_write; | |
3326 | - | |
3327 | - if (PageChecked(page)) { | |
3328 | - /* | |
3329 | - * It's mmapped pagecache. Add buffers and journal it. There | |
3330 | - * doesn't seem much point in redirtying the page here. | |
3331 | - */ | |
3332 | - ClearPageChecked(page); | |
3333 | - return __ext4_journalled_writepage(page, wbc); | |
3334 | - } else { | |
3335 | - /* | |
3336 | - * It may be a page full of checkpoint-mode buffers. We don't | |
3337 | - * really know unless we go poke around in the buffer_heads. | |
3338 | - * But block_write_full_page will do the right thing. | |
3339 | - */ | |
3340 | - return block_write_full_page(page, noalloc_get_block_write, | |
3341 | - wbc); | |
3342 | - } | |
3343 | -no_write: | |
3344 | - redirty_page_for_writepage(wbc, page); | |
3345 | - unlock_page(page); | |
3346 | - return 0; | |
3347 | -} | |
3348 | - | |
3349 | 3220 | static int ext4_readpage(struct file *file, struct page *page) |
3350 | 3221 | { |
3351 | 3222 | return mpage_readpage(page, ext4_get_block); |
... | ... | @@ -3492,7 +3363,7 @@ |
3492 | 3363 | static const struct address_space_operations ext4_ordered_aops = { |
3493 | 3364 | .readpage = ext4_readpage, |
3494 | 3365 | .readpages = ext4_readpages, |
3495 | - .writepage = ext4_normal_writepage, | |
3366 | + .writepage = ext4_writepage, | |
3496 | 3367 | .sync_page = block_sync_page, |
3497 | 3368 | .write_begin = ext4_write_begin, |
3498 | 3369 | .write_end = ext4_ordered_write_end, |
... | ... | @@ -3507,7 +3378,7 @@ |
3507 | 3378 | static const struct address_space_operations ext4_writeback_aops = { |
3508 | 3379 | .readpage = ext4_readpage, |
3509 | 3380 | .readpages = ext4_readpages, |
3510 | - .writepage = ext4_normal_writepage, | |
3381 | + .writepage = ext4_writepage, | |
3511 | 3382 | .sync_page = block_sync_page, |
3512 | 3383 | .write_begin = ext4_write_begin, |
3513 | 3384 | .write_end = ext4_writeback_write_end, |
... | ... | @@ -3522,7 +3393,7 @@ |
3522 | 3393 | static const struct address_space_operations ext4_journalled_aops = { |
3523 | 3394 | .readpage = ext4_readpage, |
3524 | 3395 | .readpages = ext4_readpages, |
3525 | - .writepage = ext4_journalled_writepage, | |
3396 | + .writepage = ext4_writepage, | |
3526 | 3397 | .sync_page = block_sync_page, |
3527 | 3398 | .write_begin = ext4_write_begin, |
3528 | 3399 | .write_end = ext4_journalled_write_end, |
... | ... | @@ -3536,7 +3407,7 @@ |
3536 | 3407 | static const struct address_space_operations ext4_da_aops = { |
3537 | 3408 | .readpage = ext4_readpage, |
3538 | 3409 | .readpages = ext4_readpages, |
3539 | - .writepage = ext4_da_writepage, | |
3410 | + .writepage = ext4_writepage, | |
3540 | 3411 | .writepages = ext4_da_writepages, |
3541 | 3412 | .sync_page = block_sync_page, |
3542 | 3413 | .write_begin = ext4_da_write_begin, |
... | ... | @@ -3583,7 +3454,8 @@ |
3583 | 3454 | struct page *page; |
3584 | 3455 | int err = 0; |
3585 | 3456 | |
3586 | - page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); | |
3457 | + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, | |
3458 | + mapping_gfp_mask(mapping) & ~__GFP_FS); | |
3587 | 3459 | if (!page) |
3588 | 3460 | return -EINVAL; |
3589 | 3461 |
fs/ext4/ioctl.c
... | ... | @@ -191,7 +191,7 @@ |
191 | 191 | case EXT4_IOC_GROUP_EXTEND: { |
192 | 192 | ext4_fsblk_t n_blocks_count; |
193 | 193 | struct super_block *sb = inode->i_sb; |
194 | - int err, err2; | |
194 | + int err, err2=0; | |
195 | 195 | |
196 | 196 | if (!capable(CAP_SYS_RESOURCE)) |
197 | 197 | return -EPERM; |
... | ... | @@ -204,9 +204,11 @@ |
204 | 204 | return err; |
205 | 205 | |
206 | 206 | err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); |
207 | - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); | |
208 | - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); | |
209 | - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | |
207 | + if (EXT4_SB(sb)->s_journal) { | |
208 | + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); | |
209 | + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); | |
210 | + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | |
211 | + } | |
210 | 212 | if (err == 0) |
211 | 213 | err = err2; |
212 | 214 | mnt_drop_write(filp->f_path.mnt); |
... | ... | @@ -251,7 +253,7 @@ |
251 | 253 | case EXT4_IOC_GROUP_ADD: { |
252 | 254 | struct ext4_new_group_data input; |
253 | 255 | struct super_block *sb = inode->i_sb; |
254 | - int err, err2; | |
256 | + int err, err2=0; | |
255 | 257 | |
256 | 258 | if (!capable(CAP_SYS_RESOURCE)) |
257 | 259 | return -EPERM; |
... | ... | @@ -265,9 +267,11 @@ |
265 | 267 | return err; |
266 | 268 | |
267 | 269 | err = ext4_group_add(sb, &input); |
268 | - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); | |
269 | - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); | |
270 | - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | |
270 | + if (EXT4_SB(sb)->s_journal) { | |
271 | + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); | |
272 | + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); | |
273 | + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | |
274 | + } | |
271 | 275 | if (err == 0) |
272 | 276 | err = err2; |
273 | 277 | mnt_drop_write(filp->f_path.mnt); |
fs/ext4/mballoc.c
... | ... | @@ -657,7 +657,8 @@ |
657 | 657 | } |
658 | 658 | } |
659 | 659 | |
660 | -static void ext4_mb_generate_buddy(struct super_block *sb, | |
660 | +static noinline_for_stack | |
661 | +void ext4_mb_generate_buddy(struct super_block *sb, | |
661 | 662 | void *buddy, void *bitmap, ext4_group_t group) |
662 | 663 | { |
663 | 664 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); |
... | ... | @@ -1480,7 +1481,8 @@ |
1480 | 1481 | ext4_mb_check_limits(ac, e4b, 0); |
1481 | 1482 | } |
1482 | 1483 | |
1483 | -static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, | |
1484 | +static noinline_for_stack | |
1485 | +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, | |
1484 | 1486 | struct ext4_buddy *e4b) |
1485 | 1487 | { |
1486 | 1488 | struct ext4_free_extent ex = ac->ac_b_ex; |
... | ... | @@ -1507,7 +1509,8 @@ |
1507 | 1509 | return 0; |
1508 | 1510 | } |
1509 | 1511 | |
1510 | -static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, | |
1512 | +static noinline_for_stack | |
1513 | +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, | |
1511 | 1514 | struct ext4_buddy *e4b) |
1512 | 1515 | { |
1513 | 1516 | ext4_group_t group = ac->ac_g_ex.fe_group; |
... | ... | @@ -1566,7 +1569,8 @@ |
1566 | 1569 | * The routine scans buddy structures (not bitmap!) from given order |
1567 | 1570 | * to max order and tries to find big enough chunk to satisfy the req |
1568 | 1571 | */ |
1569 | -static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, | |
1572 | +static noinline_for_stack | |
1573 | +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, | |
1570 | 1574 | struct ext4_buddy *e4b) |
1571 | 1575 | { |
1572 | 1576 | struct super_block *sb = ac->ac_sb; |
... | ... | @@ -1609,7 +1613,8 @@ |
1609 | 1613 | * In order to optimize scanning, caller must pass number of |
1610 | 1614 | * free blocks in the group, so the routine can know upper limit. |
1611 | 1615 | */ |
1612 | -static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, | |
1616 | +static noinline_for_stack | |
1617 | +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, | |
1613 | 1618 | struct ext4_buddy *e4b) |
1614 | 1619 | { |
1615 | 1620 | struct super_block *sb = ac->ac_sb; |
... | ... | @@ -1668,7 +1673,8 @@ |
1668 | 1673 | * we try to find stripe-aligned chunks for stripe-size requests |
1669 | 1674 | * XXX should do so at least for multiples of stripe size as well |
1670 | 1675 | */ |
1671 | -static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, | |
1676 | +static noinline_for_stack | |
1677 | +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, | |
1672 | 1678 | struct ext4_buddy *e4b) |
1673 | 1679 | { |
1674 | 1680 | struct super_block *sb = ac->ac_sb; |
... | ... | @@ -1831,7 +1837,8 @@ |
1831 | 1837 | |
1832 | 1838 | } |
1833 | 1839 | |
1834 | -static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |
1840 | +static noinline_for_stack | |
1841 | +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |
1835 | 1842 | { |
1836 | 1843 | |
1837 | 1844 | int ret; |
... | ... | @@ -2902,7 +2909,11 @@ |
2902 | 2909 | |
2903 | 2910 | void exit_ext4_mballoc(void) |
2904 | 2911 | { |
2905 | - /* XXX: synchronize_rcu(); */ | |
2912 | + /* | |
2913 | + * Wait for completion of call_rcu()'s on ext4_pspace_cachep | |
2914 | + * before destroying the slab cache. | |
2915 | + */ | |
2916 | + rcu_barrier(); | |
2906 | 2917 | kmem_cache_destroy(ext4_pspace_cachep); |
2907 | 2918 | kmem_cache_destroy(ext4_ac_cachep); |
2908 | 2919 | kmem_cache_destroy(ext4_free_ext_cachep); |
... | ... | @@ -3457,7 +3468,8 @@ |
3457 | 3468 | * used in in-core bitmap. buddy must be generated from this bitmap |
3458 | 3469 | * Need to be called with ext4 group lock held |
3459 | 3470 | */ |
3460 | -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |
3471 | +static noinline_for_stack | |
3472 | +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |
3461 | 3473 | ext4_group_t group) |
3462 | 3474 | { |
3463 | 3475 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); |
3464 | 3476 | |
3465 | 3477 | |
... | ... | @@ -4215,14 +4227,9 @@ |
4215 | 4227 | ext4_get_group_no_and_offset(sb, goal, &group, &block); |
4216 | 4228 | |
4217 | 4229 | /* set up allocation goals */ |
4230 | + memset(ac, 0, sizeof(struct ext4_allocation_context)); | |
4218 | 4231 | ac->ac_b_ex.fe_logical = ar->logical; |
4219 | - ac->ac_b_ex.fe_group = 0; | |
4220 | - ac->ac_b_ex.fe_start = 0; | |
4221 | - ac->ac_b_ex.fe_len = 0; | |
4222 | 4232 | ac->ac_status = AC_STATUS_CONTINUE; |
4223 | - ac->ac_groups_scanned = 0; | |
4224 | - ac->ac_ex_scanned = 0; | |
4225 | - ac->ac_found = 0; | |
4226 | 4233 | ac->ac_sb = sb; |
4227 | 4234 | ac->ac_inode = ar->inode; |
4228 | 4235 | ac->ac_o_ex.fe_logical = ar->logical; |
4229 | 4236 | |
... | ... | @@ -4233,15 +4240,7 @@ |
4233 | 4240 | ac->ac_g_ex.fe_group = group; |
4234 | 4241 | ac->ac_g_ex.fe_start = block; |
4235 | 4242 | ac->ac_g_ex.fe_len = len; |
4236 | - ac->ac_f_ex.fe_len = 0; | |
4237 | 4243 | ac->ac_flags = ar->flags; |
4238 | - ac->ac_2order = 0; | |
4239 | - ac->ac_criteria = 0; | |
4240 | - ac->ac_pa = NULL; | |
4241 | - ac->ac_bitmap_page = NULL; | |
4242 | - ac->ac_buddy_page = NULL; | |
4243 | - ac->alloc_semp = NULL; | |
4244 | - ac->ac_lg = NULL; | |
4245 | 4244 | |
4246 | 4245 | /* we have to define context: we'll we work with a file or |
4247 | 4246 | * locality group. this is a policy, actually */ |
... | ... | @@ -4509,10 +4508,7 @@ |
4509 | 4508 | } |
4510 | 4509 | |
4511 | 4510 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); |
4512 | - if (ac) { | |
4513 | - ac->ac_sb = sb; | |
4514 | - ac->ac_inode = ar->inode; | |
4515 | - } else { | |
4511 | + if (!ac) { | |
4516 | 4512 | ar->len = 0; |
4517 | 4513 | *errp = -ENOMEM; |
4518 | 4514 | goto out1; |
fs/jbd2/journal.c
... | ... | @@ -297,6 +297,7 @@ |
297 | 297 | unsigned int new_offset; |
298 | 298 | struct buffer_head *bh_in = jh2bh(jh_in); |
299 | 299 | struct jbd2_buffer_trigger_type *triggers; |
300 | + journal_t *journal = transaction->t_journal; | |
300 | 301 | |
301 | 302 | /* |
302 | 303 | * The buffer really shouldn't be locked: only the current committing |
... | ... | @@ -310,6 +311,11 @@ |
310 | 311 | J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); |
311 | 312 | |
312 | 313 | new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); |
314 | + /* keep subsequent assertions sane */ | |
315 | + new_bh->b_state = 0; | |
316 | + init_buffer(new_bh, NULL, NULL); | |
317 | + atomic_set(&new_bh->b_count, 1); | |
318 | + new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ | |
313 | 319 | |
314 | 320 | /* |
315 | 321 | * If a new transaction has already done a buffer copy-out, then |
... | ... | @@ -388,14 +394,6 @@ |
388 | 394 | kunmap_atomic(mapped_data, KM_USER0); |
389 | 395 | } |
390 | 396 | |
391 | - /* keep subsequent assertions sane */ | |
392 | - new_bh->b_state = 0; | |
393 | - init_buffer(new_bh, NULL, NULL); | |
394 | - atomic_set(&new_bh->b_count, 1); | |
395 | - jbd_unlock_bh_state(bh_in); | |
396 | - | |
397 | - new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ | |
398 | - | |
399 | 397 | set_bh_page(new_bh, new_page, new_offset); |
400 | 398 | new_jh->b_transaction = NULL; |
401 | 399 | new_bh->b_size = jh2bh(jh_in)->b_size; |
... | ... | @@ -412,7 +410,11 @@ |
412 | 410 | * copying is moved to the transaction's shadow queue. |
413 | 411 | */ |
414 | 412 | JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); |
415 | - jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); | |
413 | + spin_lock(&journal->j_list_lock); | |
414 | + __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); | |
415 | + spin_unlock(&journal->j_list_lock); | |
416 | + jbd_unlock_bh_state(bh_in); | |
417 | + | |
416 | 418 | JBUFFER_TRACE(new_jh, "file as BJ_IO"); |
417 | 419 | jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); |
418 | 420 | |
... | ... | @@ -2410,6 +2412,7 @@ |
2410 | 2412 | int i = hash_32(device, CACHE_SIZE_BITS); |
2411 | 2413 | char *ret; |
2412 | 2414 | struct block_device *bd; |
2415 | + static struct devname_cache *new_dev; | |
2413 | 2416 | |
2414 | 2417 | rcu_read_lock(); |
2415 | 2418 | if (devcache[i] && devcache[i]->device == device) { |
2416 | 2419 | |
2417 | 2420 | |
... | ... | @@ -2419,20 +2422,20 @@ |
2419 | 2422 | } |
2420 | 2423 | rcu_read_unlock(); |
2421 | 2424 | |
2425 | + new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); | |
2426 | + if (!new_dev) | |
2427 | + return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ | |
2422 | 2428 | spin_lock(&devname_cache_lock); |
2423 | 2429 | if (devcache[i]) { |
2424 | 2430 | if (devcache[i]->device == device) { |
2431 | + kfree(new_dev); | |
2425 | 2432 | ret = devcache[i]->devname; |
2426 | 2433 | spin_unlock(&devname_cache_lock); |
2427 | 2434 | return ret; |
2428 | 2435 | } |
2429 | 2436 | call_rcu(&devcache[i]->rcu, free_devcache); |
2430 | 2437 | } |
2431 | - devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); | |
2432 | - if (!devcache[i]) { | |
2433 | - spin_unlock(&devname_cache_lock); | |
2434 | - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ | |
2435 | - } | |
2438 | + devcache[i] = new_dev; | |
2436 | 2439 | devcache[i]->device = device; |
2437 | 2440 | bd = bdget(device); |
2438 | 2441 | if (bd) { |
fs/jbd2/transaction.c
... | ... | @@ -499,34 +499,15 @@ |
499 | 499 | wake_up(&journal->j_wait_transaction_locked); |
500 | 500 | } |
501 | 501 | |
502 | -/* | |
503 | - * Report any unexpected dirty buffers which turn up. Normally those | |
504 | - * indicate an error, but they can occur if the user is running (say) | |
505 | - * tune2fs to modify the live filesystem, so we need the option of | |
506 | - * continuing as gracefully as possible. # | |
507 | - * | |
508 | - * The caller should already hold the journal lock and | |
509 | - * j_list_lock spinlock: most callers will need those anyway | |
510 | - * in order to probe the buffer's journaling state safely. | |
511 | - */ | |
512 | -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) | |
502 | +static void warn_dirty_buffer(struct buffer_head *bh) | |
513 | 503 | { |
514 | - int jlist; | |
504 | + char b[BDEVNAME_SIZE]; | |
515 | 505 | |
516 | - /* If this buffer is one which might reasonably be dirty | |
517 | - * --- ie. data, or not part of this journal --- then | |
518 | - * we're OK to leave it alone, but otherwise we need to | |
519 | - * move the dirty bit to the journal's own internal | |
520 | - * JBDDirty bit. */ | |
521 | - jlist = jh->b_jlist; | |
522 | - | |
523 | - if (jlist == BJ_Metadata || jlist == BJ_Reserved || | |
524 | - jlist == BJ_Shadow || jlist == BJ_Forget) { | |
525 | - struct buffer_head *bh = jh2bh(jh); | |
526 | - | |
527 | - if (test_clear_buffer_dirty(bh)) | |
528 | - set_buffer_jbddirty(bh); | |
529 | - } | |
506 | + printk(KERN_WARNING | |
507 | + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " | |
508 | + "There's a risk of filesystem corruption in case of system " | |
509 | + "crash.\n", | |
510 | + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); | |
530 | 511 | } |
531 | 512 | |
532 | 513 | /* |
533 | 514 | |
... | ... | @@ -593,14 +574,16 @@ |
593 | 574 | if (jh->b_next_transaction) |
594 | 575 | J_ASSERT_JH(jh, jh->b_next_transaction == |
595 | 576 | transaction); |
577 | + warn_dirty_buffer(bh); | |
596 | 578 | } |
597 | 579 | /* |
598 | 580 | * In any case we need to clean the dirty flag and we must |
599 | 581 | * do it under the buffer lock to be sure we don't race |
600 | 582 | * with running write-out. |
601 | 583 | */ |
602 | - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); | |
603 | - jbd_unexpected_dirty_buffer(jh); | |
584 | + JBUFFER_TRACE(jh, "Journalling dirty buffer"); | |
585 | + clear_buffer_dirty(bh); | |
586 | + set_buffer_jbddirty(bh); | |
604 | 587 | } |
605 | 588 | |
606 | 589 | unlock_buffer(bh); |
... | ... | @@ -843,6 +826,15 @@ |
843 | 826 | J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); |
844 | 827 | |
845 | 828 | if (jh->b_transaction == NULL) { |
829 | + /* | |
830 | + * Previous jbd2_journal_forget() could have left the buffer | |
831 | + * with jbddirty bit set because it was being committed. When | |
832 | + * the commit finished, we've filed the buffer for | |
833 | + * checkpointing and marked it dirty. Now we are reallocating | |
834 | + * the buffer so the transaction freeing it must have | |
835 | + * committed and so it's safe to clear the dirty bit. | |
836 | + */ | |
837 | + clear_buffer_dirty(jh2bh(jh)); | |
846 | 838 | jh->b_transaction = transaction; |
847 | 839 | |
848 | 840 | /* first access by this transaction */ |
849 | 841 | |
... | ... | @@ -1644,8 +1636,13 @@ |
1644 | 1636 | |
1645 | 1637 | if (jh->b_cp_transaction) { |
1646 | 1638 | JBUFFER_TRACE(jh, "on running+cp transaction"); |
1639 | + /* | |
1640 | + * We don't want to write the buffer anymore, clear the | |
1641 | + * bit so that we don't confuse checks in | |
1642 | + * __journal_file_buffer | |
1643 | + */ | |
1644 | + clear_buffer_dirty(bh); | |
1647 | 1645 | __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); |
1648 | - clear_buffer_jbddirty(bh); | |
1649 | 1646 | may_free = 0; |
1650 | 1647 | } else { |
1651 | 1648 | JBUFFER_TRACE(jh, "on running transaction"); |
1652 | 1649 | |
... | ... | @@ -1896,12 +1893,17 @@ |
1896 | 1893 | if (jh->b_transaction && jh->b_jlist == jlist) |
1897 | 1894 | return; |
1898 | 1895 | |
1899 | - /* The following list of buffer states needs to be consistent | |
1900 | - * with __jbd_unexpected_dirty_buffer()'s handling of dirty | |
1901 | - * state. */ | |
1902 | - | |
1903 | 1896 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || |
1904 | 1897 | jlist == BJ_Shadow || jlist == BJ_Forget) { |
1898 | + /* | |
1899 | + * For metadata buffers, we track dirty bit in buffer_jbddirty | |
1900 | + * instead of buffer_dirty. We should not see a dirty bit set | |
1901 | + * here because we clear it in do_get_write_access but e.g. | |
1902 | + * tune2fs can modify the sb and set the dirty bit at any time | |
1903 | + * so we try to gracefully handle that. | |
1904 | + */ | |
1905 | + if (buffer_dirty(bh)) | |
1906 | + warn_dirty_buffer(bh); | |
1905 | 1907 | if (test_clear_buffer_dirty(bh) || |
1906 | 1908 | test_clear_buffer_jbddirty(bh)) |
1907 | 1909 | was_dirty = 1; |
include/trace/events/ext4.h
... | ... | @@ -34,7 +34,8 @@ |
34 | 34 | |
35 | 35 | TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", |
36 | 36 | jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, |
37 | - __entry->uid, __entry->gid, __entry->blocks) | |
37 | + __entry->uid, __entry->gid, | |
38 | + (unsigned long long) __entry->blocks) | |
38 | 39 | ); |
39 | 40 | |
40 | 41 | TRACE_EVENT(ext4_request_inode, |
... | ... | @@ -189,7 +190,7 @@ |
189 | 190 | __entry->copied) |
190 | 191 | ); |
191 | 192 | |
192 | -TRACE_EVENT(ext4_da_writepage, | |
193 | +TRACE_EVENT(ext4_writepage, | |
193 | 194 | TP_PROTO(struct inode *inode, struct page *page), |
194 | 195 | |
195 | 196 | TP_ARGS(inode, page), |
... | ... | @@ -339,49 +340,6 @@ |
339 | 340 | TP_printk("dev %s ino %lu pos %llu len %u copied %u", |
340 | 341 | jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, |
341 | 342 | __entry->copied) |
342 | -); | |
343 | - | |
344 | -TRACE_EVENT(ext4_normal_writepage, | |
345 | - TP_PROTO(struct inode *inode, struct page *page), | |
346 | - | |
347 | - TP_ARGS(inode, page), | |
348 | - | |
349 | - TP_STRUCT__entry( | |
350 | - __field( dev_t, dev ) | |
351 | - __field( ino_t, ino ) | |
352 | - __field( pgoff_t, index ) | |
353 | - ), | |
354 | - | |
355 | - TP_fast_assign( | |
356 | - __entry->dev = inode->i_sb->s_dev; | |
357 | - __entry->ino = inode->i_ino; | |
358 | - __entry->index = page->index; | |
359 | - ), | |
360 | - | |
361 | - TP_printk("dev %s ino %lu page_index %lu", | |
362 | - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) | |
363 | -); | |
364 | - | |
365 | -TRACE_EVENT(ext4_journalled_writepage, | |
366 | - TP_PROTO(struct inode *inode, struct page *page), | |
367 | - | |
368 | - TP_ARGS(inode, page), | |
369 | - | |
370 | - TP_STRUCT__entry( | |
371 | - __field( dev_t, dev ) | |
372 | - __field( ino_t, ino ) | |
373 | - __field( pgoff_t, index ) | |
374 | - | |
375 | - ), | |
376 | - | |
377 | - TP_fast_assign( | |
378 | - __entry->dev = inode->i_sb->s_dev; | |
379 | - __entry->ino = inode->i_ino; | |
380 | - __entry->index = page->index; | |
381 | - ), | |
382 | - | |
383 | - TP_printk("dev %s ino %lu page_index %lu", | |
384 | - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) | |
385 | 343 | ); |
386 | 344 | |
387 | 345 | TRACE_EVENT(ext4_discard_blocks, |