Commit c475a8ab625d567eacf5e30ec35d6d8704558062

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent d296e9cd02

[PATCH] can_share_swap_page: use page_mapcount

Remember that ironic get_user_pages race? When the raised page_count on a swapped-out page led do_wp_page to decide it had to copy on write, it substituted a different page into userspace.  2.6.7 onwards have Andrea's solution, where try_to_unmap_one backs out if it finds page_count raised.

That works, but it is unsatisfying (rmap.c has no other page_count heuristics), and a few months ago it was found to hang an intensive page migration test.  A year ago I was hesitant to engage page_mapcount; now it seems the right fix.

So remove the page_count hack from try_to_unmap_one, and use activate_page in unuse_mm when dropping the page lock, to replace the hack's secondary effect of helping swapoff make progress in that case.

Simplify can_share_swap_page (now called only on anonymous pages) to check page_mapcount + page_swapcount == 1: it still needs the page lock to stabilize their (pessimistic) sum, but it no longer needs swapper_space.tree_lock for that.
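
To make the arithmetic concrete, here is a worked example of the new accounting, written as a C comment. The scenarios and values are illustrative (not taken from the patch), assuming the usual bookkeeping in which the swap cache itself holds one swap_map reference, which the swapfile.c hunk below subtracts.

/*
 * Worked example (illustrative values):
 *
 * Page swapped in by this mm only, still in the swap cache,
 * after the reordered do_swap_page has done its swap_free():
 *	page_mapcount(page)       == 1   one pte maps it
 *	swap_map[swp_offset()]    == 1   only the swap cache's reference
 *	page_swapcount(page)      == 0   (1 minus 1 for the cache itself)
 *	sum == 1  ->  exclusive: the faulting task may reuse it writable
 *
 * Same page while a second mm still holds a swap pte for the entry:
 *	page_mapcount(page)       == 1
 *	swap_map[swp_offset()]    == 2
 *	page_swapcount(page)      == 1
 *	sum == 2  ->  shared: do_wp_page must copy on write
 */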

In do_swap_page, move swap_free and unlock_page below page_add_anon_rmap, to keep the sum on the high side, and correct at the point where can_share_swap_page is called.
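
A brief sketch of why the ordering matters, as a C comment; this describes a hypothetical interleaving for illustration, not text from the patch.

/*
 * Hypothetical interleaving (illustrative only):
 *
 * Old order: swap_free() ran, and the page was unlocked, before
 * page_add_anon_rmap().  In that window this task's reference was
 * counted neither in page_mapcount nor in page_swapcount, so another
 * task taking the page lock could see the sum read low and wrongly
 * treat a shared page as exclusive.
 *
 * New order: page_add_anon_rmap() comes first, with swap_free() and
 * unlock_page() after it, so the reference is always counted at least
 * once and the sum can only err on the high (pessimistic) side.
 */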

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 3 changed files with 21 additions and 65 deletions

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1686,10 +1686,6 @@
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
 
 	inc_mm_counter(mm, rss);
 	pte = mk_pte(page, vma->vm_page_prot);
@@ -1697,11 +1693,15 @@
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		write_access = 0;
 	}
-	unlock_page(page);
 
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
 	page_add_anon_rmap(page, vma, address);
+
+	swap_free(entry);
+	if (vm_swap_full())
+		remove_exclusive_swap_page(page);
+	unlock_page(page);
 
 	if (write_access) {
 		if (do_wp_page(mm, vma, address,
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -539,27 +539,6 @@
 		goto out_unmap;
 	}
 
-	/*
-	 * Don't pull an anonymous page out from under get_user_pages.
-	 * GUP carefully breaks COW and raises page count (while holding
-	 * page_table_lock, as we have here) to make sure that the page
-	 * cannot be freed. If we unmap that page here, a user write
-	 * access to the virtual address will bring back the page, but
-	 * its raised count will (ironically) be taken to mean it's not
-	 * an exclusive swap page, do_wp_page will replace it by a copy
-	 * page, and the user never get to see the data GUP was holding
-	 * the original page for.
-	 *
-	 * This test is also useful for when swapoff (unuse_process) has
-	 * to drop page lock: its reference to the page stops existing
-	 * ptes from being unmapped, so swapoff can make progress.
-	 */
-	if (PageSwapCache(page) &&
-	    page_count(page) != page_mapcount(page) + 2) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
-	}
-
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
 	pteval = ptep_clear_flush(vma, address, pte);
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -276,61 +276,37 @@
 }
 
 /*
- * Check if we're the only user of a swap page,
- * when the page is locked.
+ * How many references to page are currently swapped out?
  */
-static int exclusive_swap_page(struct page *page)
+static inline int page_swapcount(struct page *page)
 {
-	int retval = 0;
-	struct swap_info_struct * p;
+	int count = 0;
+	struct swap_info_struct *p;
 	swp_entry_t entry;
 
 	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (p) {
-		/* Is the only swap cache user the cache itself? */
-		if (p->swap_map[swp_offset(entry)] == 1) {
-			/* Recheck the page count with the swapcache lock held.. */
-			write_lock_irq(&swapper_space.tree_lock);
-			if (page_count(page) == 2)
-				retval = 1;
-			write_unlock_irq(&swapper_space.tree_lock);
-		}
+		/* Subtract the 1 for the swap cache itself */
+		count = p->swap_map[swp_offset(entry)] - 1;
 		swap_info_put(p);
 	}
-	return retval;
+	return count;
 }
 
 /*
  * We can use this swap cache entry directly
  * if there are no other references to it.
- *
- * Here "exclusive_swap_page()" does the real
- * work, but we opportunistically check whether
- * we need to get all the locks first..
  */
 int can_share_swap_page(struct page *page)
 {
-	int retval = 0;
+	int count;
 
-	if (!PageLocked(page))
-		BUG();
-	switch (page_count(page)) {
-	case 3:
-		if (!PagePrivate(page))
-			break;
-		/* Fallthrough */
-	case 2:
-		if (!PageSwapCache(page))
-			break;
-		retval = exclusive_swap_page(page);
-		break;
-	case 1:
-		if (PageReserved(page))
-			break;
-		retval = 1;
-	}
-	return retval;
+	BUG_ON(!PageLocked(page));
+	count = page_mapcount(page);
+	if (count <= 1 && PageSwapCache(page))
+		count += page_swapcount(page);
+	return count == 1;
 }
 
 /*
@@ -529,9 +505,10 @@
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		/*
-		 * Our reference to the page stops try_to_unmap_one from
-		 * unmapping its ptes, so swapoff can make progress.
+		 * Activate page so shrink_cache is unlikely to unmap its
+		 * ptes while lock is dropped, so swapoff can make progress.
 		 */
+		activate_page(page);
 		unlock_page(page);
 		down_read(&mm->mmap_sem);
 		lock_page(page);