Commit c475a8ab625d567eacf5e30ec35d6d8704558062

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent d296e9cd02

[PATCH] can_share_swap_page: use page_mapcount

Remember that ironic get_user_pages race? When the raised page_count on a swapped-out page led do_wp_page to decide it had to copy on write, it substituted a different page into userspace.  2.6.7 onwards have Andrea's solution, where try_to_unmap_one backs out if it finds page_count raised.

That works, but it is unsatisfying (rmap.c has no other page_count heuristics), and a few months ago it was found to hang an intensive page migration test.  A year ago I was hesitant to engage page_mapcount; now it seems the right fix.

So remove the page_count hack from try_to_unmap_one, and use activate_page in unuse_mm when dropping the page lock, to replace the hack's secondary effect of helping swapoff make progress in that case.

Simplify can_share_swap_page (now called only on anonymous pages) to check page_mapcount + page_swapcount == 1: it still needs the page lock to stabilize their (pessimistic) sum, but it no longer needs swapper_space.tree_lock for that.
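
To make the arithmetic concrete, here is a worked example of the new accounting, written as a C comment. The scenarios and values are illustrative (not taken from the patch), assuming the usual bookkeeping in which the swap cache itself holds one swap_map reference, which the swapfile.c hunk below subtracts.

/*
 * Worked example (illustrative values):
 *
 * Page swapped in by this mm only, still in the swap cache,
 * after the reordered do_swap_page has done its swap_free():
 *	page_mapcount(page)       == 1   one pte maps it
 *	swap_map[swp_offset()]    == 1   only the swap cache's reference
 *	page_swapcount(page)      == 0   (1 minus 1 for the cache itself)
 *	sum == 1  ->  exclusive: the faulting task may reuse it writable
 *
 * Same page while a second mm still holds a swap pte for the entry:
 *	page_mapcount(page)       == 1
 *	swap_map[swp_offset()]    == 2
 *	page_swapcount(page)      == 1
 *	sum == 2  ->  shared: do_wp_page must copy on write
 */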

In do_swap_page, move swap_free and unlock_page below page_add_anon_rmap, to keep the sum on the high side, and correct at the point where can_share_swap_page is called.
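
A brief sketch of why the ordering matters, as a C comment; this describes a hypothetical interleaving for illustration, not text from the patch.

/*
 * Hypothetical interleaving (illustrative only):
 *
 * Old order: swap_free() ran, and the page was unlocked, before
 * page_add_anon_rmap().  In that window this task's reference was
 * counted neither in page_mapcount nor in page_swapcount, so another
 * task taking the page lock could see the sum read low and wrongly
 * treat a shared page as exclusive.
 *
 * New order: page_add_anon_rmap() comes first, with swap_free() and
 * unlock_page() after it, so the reference is always counted at least
 * once and the sum can only err on the high (pessimistic) side.
 */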

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 3 changed files with 21 additions and 65 deletions

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1686,10 +1686,6 @@
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
 
 	inc_mm_counter(mm, rss);
 	pte = mk_pte(page, vma->vm_page_prot);
@@ -1697,11 +1693,15 @@
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		write_access = 0;
 	}
-	unlock_page(page);
 
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
 	page_add_anon_rmap(page, vma, address);
+
+	swap_free(entry);
+	if (vm_swap_full())
+		remove_exclusive_swap_page(page);
+	unlock_page(page);
 
 	if (write_access) {
 		if (do_wp_page(mm, vma, address,
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -539,27 +539,6 @@
 		goto out_unmap;
 	}
 
-	/*
-	 * Don't pull an anonymous page out from under get_user_pages.
-	 * GUP carefully breaks COW and raises page count (while holding
-	 * page_table_lock, as we have here) to make sure that the page
-	 * cannot be freed. If we unmap that page here, a user write
-	 * access to the virtual address will bring back the page, but
-	 * its raised count will (ironically) be taken to mean it's not
-	 * an exclusive swap page, do_wp_page will replace it by a copy
-	 * page, and the user never get to see the data GUP was holding
-	 * the original page for.
-	 *
-	 * This test is also useful for when swapoff (unuse_process) has
-	 * to drop page lock: its reference to the page stops existing
-	 * ptes from being unmapped, so swapoff can make progress.
-	 */
-	if (PageSwapCache(page) &&
-	    page_count(page) != page_mapcount(page) + 2) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
-	}
-
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
 	pteval = ptep_clear_flush(vma, address, pte);
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -276,61 +276,37 @@
 }
 
 /*
- * Check if we're the only user of a swap page,
- * when the page is locked.
+ * How many references to page are currently swapped out?
  */
-static int exclusive_swap_page(struct page *page)
+static inline int page_swapcount(struct page *page)
 {
-	int retval = 0;
-	struct swap_info_struct * p;
+	int count = 0;
+	struct swap_info_struct *p;
 	swp_entry_t entry;
 
 	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (p) {
-		/* Is the only swap cache user the cache itself? */
-		if (p->swap_map[swp_offset(entry)] == 1) {
-			/* Recheck the page count with the swapcache lock held.. */
-			write_lock_irq(&swapper_space.tree_lock);
-			if (page_count(page) == 2)
-				retval = 1;
-			write_unlock_irq(&swapper_space.tree_lock);
-		}
+		/* Subtract the 1 for the swap cache itself */
+		count = p->swap_map[swp_offset(entry)] - 1;
 		swap_info_put(p);
 	}
-	return retval;
+	return count;
 }
 
 /*
  * We can use this swap cache entry directly
  * if there are no other references to it.
- *
- * Here "exclusive_swap_page()" does the real
- * work, but we opportunistically check whether
- * we need to get all the locks first..
  */
 int can_share_swap_page(struct page *page)
 {
-	int retval = 0;
+	int count;
 
-	if (!PageLocked(page))
-		BUG();
-	switch (page_count(page)) {
-	case 3:
-		if (!PagePrivate(page))
-			break;
-		/* Fallthrough */
-	case 2:
-		if (!PageSwapCache(page))
-			break;
-		retval = exclusive_swap_page(page);
-		break;
-	case 1:
-		if (PageReserved(page))
-			break;
-		retval = 1;
-	}
-	return retval;
+	BUG_ON(!PageLocked(page));
+	count = page_mapcount(page);
+	if (count <= 1 && PageSwapCache(page))
+		count += page_swapcount(page);
+	return count == 1;
 }
 
 /*
@@ -529,9 +505,10 @@
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		/*
-		 * Our reference to the page stops try_to_unmap_one from
-		 * unmapping its ptes, so swapoff can make progress.
+		 * Activate page so shrink_cache is unlikely to unmap its
+		 * ptes while lock is dropped, so swapoff can make progress.
 		 */
+		activate_page(page);
 		unlock_page(page);
 		down_read(&mm->mmap_sem);
 		lock_page(page);