Commit a13ea5b759645a0779edc6dbfec9abfd83220844

Authored by Hugh Dickins
Committed by Linus Torvalds
Parent: 1ac0cb5d0e

mm: reinstate ZERO_PAGE

KAMEZAWA Hiroyuki has observed customers of earlier kernels taking
advantage of the ZERO_PAGE, which we stopped do_anonymous_page() from
using in 2.6.24; and there were a couple of regression reports on LKML.

Following suggestions from Linus, reinstate do_anonymous_page()'s use of
the ZERO_PAGE; but this time avoid dirtying its struct page cacheline
with (map)count updates - let vm_normal_page() regard it as abnormal.
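
For illustration (not part of the patch), here is a minimal userspace
sketch of the behaviour being reinstated: on a kernel with this change,
read faults on untouched anonymous memory all map the shared ZERO_PAGE,
so resident set size stays flat until the first write.  The 256 MB size,
the 4 kB stride, and the rss_kb() helper name are arbitrary choices here.

#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>

static long rss_kb(void)
{
	struct rusage ru;

	getrusage(RUSAGE_SELF, &ru);
	return ru.ru_maxrss;		/* peak RSS, in kilobytes on Linux */
}

int main(void)
{
	size_t len = 256UL << 20;	/* 256 MB of anonymous memory */
	volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char sink = 0;
	size_t i;

	if (p == MAP_FAILED)
		return 1;
	printf("RSS before reads: %ld kB\n", rss_kb());
	for (i = 0; i < len; i += 4096)	/* assuming 4 kB pages */
		sink ^= p[i];	/* read fault: maps the zero page, no new frame */
	printf("RSS after reads:  %ld kB\n", rss_kb());
	p[0] = sink;		/* first write: COW a real page in */
	return 0;
}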

Use it only on arches which define __HAVE_ARCH_PTE_SPECIAL (x86, s390,
sh32, most powerpc): that's not essential, but it minimizes additional
branches (keeping them in the unlikely pte_special case); and it
incidentally excludes mips (some models of which needed eight colours
of ZERO_PAGE to avoid costly exceptions).
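
For reference, the HAVE_PTE_SPECIAL constant tested in the hunks below
is how mm/memory.c folds the arch macro into a compile-time boolean;
the #ifdef block (which predates this patch) is roughly:

#ifndef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 0
#else
# define HAVE_PTE_SPECIAL 1
#endif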

Don't be fanatical about avoiding ZERO_PAGE updates: get_user_pages()
callers won't want to make exceptions for it, so increment its count
there.  Changes to mlock and migration?  Happily, they seem not to be
needed.
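
A hedged userspace sketch of one such caller path: an O_DIRECT write
from a never-written anonymous buffer makes the kernel get_user_pages()
that buffer, taking references on the ZERO_PAGE instead of faulting in
real frames.  The file name scratch.bin is arbitrary, and O_DIRECT
support depends on the filesystem.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;	/* 1 MB: page-aligned via mmap, block-size multiple */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int fd = open("scratch.bin", O_WRONLY | O_CREAT | O_DIRECT, 0600);

	if (buf == MAP_FAILED || fd < 0)
		return 1;
	/* buf was never written, so each of its pages is the zero page;
	 * the direct-IO path pins them with get_user_pages(). */
	if (write(fd, buf, len) != (ssize_t)len)
		perror("write");
	close(fd);
	return 0;
}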

In most places it's quicker to check the pfn than the struct page
address: prepare a __read_mostly zero_pfn for that.  Does get_dump_page()
still need its ZERO_PAGE check?  Probably not, but keep it anyway.
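
That pfn test is open-coded at each site in the diff below; later
kernels wrap it in an is_zero_pfn() helper, roughly as follows (a
kernel-context sketch, not standalone code - zero_pfn is the
__read_mostly cache initialized by init_zero_pfn() in the first hunk):

/* quicker than pte_page(pte) == ZERO_PAGE(0): no memmap lookup needed */
static inline int is_zero_pfn(unsigned long pfn)
{
	return pfn == zero_pfn;
}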

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file (mm/memory.c) with 44 additions and 9 deletions

@@ -108,8 +108,19 @@
 }
 __setup("norandmaps", disable_randmaps);
 
+static unsigned long zero_pfn __read_mostly;
 
 /*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+static int __init init_zero_pfn(void)
+{
+        zero_pfn = page_to_pfn(ZERO_PAGE(0));
+        return 0;
+}
+core_initcall(init_zero_pfn);
+
+/*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none. Usually (but
  * very seldom) called out from the p?d_none_or_clear_bad macros.
@@ -499,7 +510,9 @@
         if (HAVE_PTE_SPECIAL) {
                 if (likely(!pte_special(pte)))
                         goto check_pfn;
-                if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+                        return NULL;
+                if (pfn != zero_pfn)
                         print_bad_pte(vma, addr, pte, NULL);
                 return NULL;
         }
@@ -1144,9 +1157,14 @@
                 goto no_page;
         if ((flags & FOLL_WRITE) && !pte_write(pte))
                 goto unlock;
+
         page = vm_normal_page(vma, address, pte);
-        if (unlikely(!page))
-                goto bad_page;
+        if (unlikely(!page)) {
+                if ((flags & FOLL_DUMP) ||
+                    pte_pfn(pte) != zero_pfn)
+                        goto bad_page;
+                page = pte_page(pte);
+        }
 
         if (flags & FOLL_GET)
                 get_page(page);
@@ -2084,10 +2102,19 @@
 
         if (unlikely(anon_vma_prepare(vma)))
                 goto oom;
-        VM_BUG_ON(old_page == ZERO_PAGE(0));
-        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-        if (!new_page)
-                goto oom;
+
+        if (pte_pfn(orig_pte) == zero_pfn) {
+                new_page = alloc_zeroed_user_highpage_movable(vma, address);
+                if (!new_page)
+                        goto oom;
+        } else {
+                new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+                if (!new_page)
+                        goto oom;
+                cow_user_page(new_page, old_page, address, vma);
+        }
+        __SetPageUptodate(new_page);
+
         /*
          * Don't let another task, with possibly unlocked vma,
          * keep the mlocked page.
@@ -2097,8 +2124,6 @@
                 clear_page_mlock(old_page);
                 unlock_page(old_page);
         }
-        cow_user_page(new_page, old_page, address, vma);
-        __SetPageUptodate(new_page);
 
         if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                 goto oom_free_new;
@@ -2639,6 +2664,15 @@
         spinlock_t *ptl;
         pte_t entry;
 
+        if (HAVE_PTE_SPECIAL && !(flags & FAULT_FLAG_WRITE)) {
+                entry = pte_mkspecial(pfn_pte(zero_pfn, vma->vm_page_prot));
+                ptl = pte_lockptr(mm, pmd);
+                spin_lock(ptl);
+                if (!pte_none(*page_table))
+                        goto unlock;
+                goto setpte;
+        }
+
         /* Allocate our own private page. */
         pte_unmap(page_table);
 
@@ -2662,6 +2696,7 @@
 
         inc_mm_counter(mm, anon_rss);
         page_add_new_anon_rmap(page, vma, address);
+setpte:
         set_pte_at(mm, address, page_table, entry);
 
         /* No need to invalidate - it was non-present before */
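
One way to observe the effect of the last two hunks: after the patch,
every page of an untouched MAP_ANONYMOUS region should resolve to one
and the same physical frame.  A sketch using /proc/self/pagemap, whose
entries carry the pfn in bits 0-54 (recent kernels hide pfns from
unprivileged readers, so the values may print as 0 without privilege):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t pages = 16, psize = (size_t)sysconf(_SC_PAGESIZE);
	volatile char *p = mmap(NULL, pages * psize, PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int fd = open("/proc/self/pagemap", O_RDONLY);
	size_t i;

	if (p == MAP_FAILED || fd < 0)
		return 1;
	for (i = 0; i < pages; i++) {
		uint64_t ent;
		char c = p[i * psize];	/* read fault -> zero page */

		(void)c;
		/* one 8-byte pagemap entry per virtual page */
		pread(fd, &ent, sizeof(ent),
		      ((uintptr_t)p / psize + i) * sizeof(ent));
		printf("page %2zu: pfn %llu\n", i,
		       (unsigned long long)(ent & ((1ULL << 55) - 1)));
	}
	close(fd);
	return 0;
}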