Commit a78e877e9a689d8195b2e62d68030c36fd8e7f34
Committed by
Greg Kroah-Hartman
1 parent
d73437ade6
mm: protect set_page_dirty() from ongoing truncation
commit 2d6d7f98284648c5ed113fe22a132148950b140f upstream.

Tejun, while reviewing the code, spotted the following race condition
between the dirtying and truncation of a page:

    __set_page_dirty_nobuffers()       __delete_from_page_cache()
      if (TestSetPageDirty(page))
                                         page->mapping = NULL
                                         if (PageDirty())
                                           dec_zone_page_state(page, NR_FILE_DIRTY);
                                           dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
        if (page->mapping)
          account_page_dirtied(page)
            __inc_zone_page_state(page, NR_FILE_DIRTY);
            __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);

which results in an imbalance of NR_FILE_DIRTY and BDI_RECLAIMABLE.

Dirtiers usually lock out truncation, either by holding the page lock
directly, or in case of zap_pte_range(), by pinning the mapcount with
the page table lock held.  The notable exception to this rule, though,
is do_wp_page(), for which this race exists.  However, do_wp_page()
already waits for a locked page to unlock before setting the dirty bit,
in order to prevent a race where clear_page_dirty() misses the page bit
in the presence of dirty ptes.  Upgrade that wait to a fully locked
set_page_dirty() to also cover the situation explained above.

Afterwards, the code in set_page_dirty() dealing with a truncation race
is no longer needed.  Remove it.

Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Showing 3 changed files with 29 additions and 42 deletions Side-by-side Diff
include/linux/writeback.h
... | ... | @@ -177,7 +177,6 @@ |
177 | 177 | struct writeback_control *wbc, writepage_t writepage, |
178 | 178 | void *data); |
179 | 179 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc); |
180 | -void set_page_dirty_balance(struct page *page); | |
181 | 180 | void writeback_set_ratelimit(void); |
182 | 181 | void tag_pages_for_writeback(struct address_space *mapping, |
183 | 182 | pgoff_t start, pgoff_t end); |
mm/memory.c
... | ... | @@ -2150,17 +2150,24 @@ |
2150 | 2150 | if (!dirty_page) |
2151 | 2151 | return ret; |
2152 | 2152 | |
2153 | - /* | |
2154 | - * Yes, Virginia, this is actually required to prevent a race | |
2155 | - * with clear_page_dirty_for_io() from clearing the page dirty | |
2156 | - * bit after it clear all dirty ptes, but before a racing | |
2157 | - * do_wp_page installs a dirty pte. | |
2158 | - * | |
2159 | - * do_shared_fault is protected similarly. | |
2160 | - */ | |
2161 | 2153 | if (!page_mkwrite) { |
2162 | - wait_on_page_locked(dirty_page); | |
2163 | - set_page_dirty_balance(dirty_page); | |
2154 | + struct address_space *mapping; | |
2155 | + int dirtied; | |
2156 | + | |
2157 | + lock_page(dirty_page); | |
2158 | + dirtied = set_page_dirty(dirty_page); | |
2159 | + VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page); | |
2160 | + mapping = dirty_page->mapping; | |
2161 | + unlock_page(dirty_page); | |
2162 | + | |
2163 | + if (dirtied && mapping) { | |
2164 | + /* | |
2165 | + * Some device drivers do not set page.mapping | |
2166 | + * but still dirty their pages | |
2167 | + */ | |
2168 | + balance_dirty_pages_ratelimited(mapping); | |
2169 | + } | |
2170 | + | |
2164 | 2171 | /* file_update_time outside page_lock */ |
2165 | 2172 | if (vma->vm_file) |
2166 | 2173 | file_update_time(vma->vm_file); |
mm/page-writeback.c
... | ... | @@ -1541,16 +1541,6 @@ |
1541 | 1541 | bdi_start_background_writeback(bdi); |
1542 | 1542 | } |
1543 | 1543 | |
1544 | -void set_page_dirty_balance(struct page *page) | |
1545 | -{ | |
1546 | - if (set_page_dirty(page)) { | |
1547 | - struct address_space *mapping = page_mapping(page); | |
1548 | - | |
1549 | - if (mapping) | |
1550 | - balance_dirty_pages_ratelimited(mapping); | |
1551 | - } | |
1552 | -} | |
1553 | - | |
1554 | 1544 | static DEFINE_PER_CPU(int, bdp_ratelimits); |
1555 | 1545 | |
1556 | 1546 | /* |
1557 | 1547 | |
1558 | 1548 | |
... | ... | @@ -2123,32 +2113,25 @@ |
2123 | 2113 | * page dirty in that case, but not all the buffers. This is a "bottom-up" |
2124 | 2114 | * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. |
2125 | 2115 | * |
2126 | - * Most callers have locked the page, which pins the address_space in memory. | |
2127 | - * But zap_pte_range() does not lock the page, however in that case the | |
2128 | - * mapping is pinned by the vma's ->vm_file reference. | |
2129 | - * | |
2130 | - * We take care to handle the case where the page was truncated from the | |
2131 | - * mapping by re-checking page_mapping() inside tree_lock. | |
2116 | + * The caller must ensure this doesn't race with truncation. Most will simply | |
2117 | + * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and | |
2118 | + * the pte lock held, which also locks out truncation. | |
2132 | 2119 | */ |
2133 | 2120 | int __set_page_dirty_nobuffers(struct page *page) |
2134 | 2121 | { |
2135 | 2122 | if (!TestSetPageDirty(page)) { |
2136 | 2123 | struct address_space *mapping = page_mapping(page); |
2137 | - struct address_space *mapping2; | |
2138 | 2124 | unsigned long flags; |
2139 | 2125 | |
2140 | 2126 | if (!mapping) |
2141 | 2127 | return 1; |
2142 | 2128 | |
2143 | 2129 | spin_lock_irqsave(&mapping->tree_lock, flags); |
2144 | - mapping2 = page_mapping(page); | |
2145 | - if (mapping2) { /* Race with truncate? */ | |
2146 | - BUG_ON(mapping2 != mapping); | |
2147 | - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); | |
2148 | - account_page_dirtied(page, mapping); | |
2149 | - radix_tree_tag_set(&mapping->page_tree, | |
2150 | - page_index(page), PAGECACHE_TAG_DIRTY); | |
2151 | - } | |
2130 | + BUG_ON(page_mapping(page) != mapping); | |
2131 | + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); | |
2132 | + account_page_dirtied(page, mapping); | |
2133 | + radix_tree_tag_set(&mapping->page_tree, page_index(page), | |
2134 | + PAGECACHE_TAG_DIRTY); | |
2152 | 2135 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
2153 | 2136 | if (mapping->host) { |
2154 | 2137 | /* !PageAnon && !swapper_space */ |
... | ... | @@ -2305,12 +2288,10 @@ |
2305 | 2288 | /* |
2306 | 2289 | * We carefully synchronise fault handlers against |
2307 | 2290 | * installing a dirty pte and marking the page dirty |
2308 | - * at this point. We do this by having them hold the | |
2309 | - * page lock at some point after installing their | |
2310 | - * pte, but before marking the page dirty. | |
2311 | - * Pages are always locked coming in here, so we get | |
2312 | - * the desired exclusion. See mm/memory.c:do_wp_page() | |
2313 | - * for more comments. | |
2291 | + * at this point. We do this by having them hold the | |
2292 | + * page lock while dirtying the page, and pages are | |
2293 | + * always locked coming in here, so we get the desired | |
2294 | + * exclusion. | |
2314 | 2295 | */ |
2315 | 2296 | if (TestClearPageDirty(page)) { |
2316 | 2297 | dec_zone_page_state(page, NR_FILE_DIRTY); |