Commit a13ea5b759645a0779edc6dbfec9abfd83220844

Authored by Hugh Dickins
Committed by Linus Torvalds
Parent: 1ac0cb5d0e

mm: reinstate ZERO_PAGE

KAMEZAWA Hiroyuki has observed customers of earlier kernels taking
advantage of the ZERO_PAGE, which we stopped do_anonymous_page() from
using in 2.6.24; and there were a couple of regression reports on LKML.

Following suggestions from Linus, reinstate do_anonymous_page()'s use of
the ZERO_PAGE; but this time avoid dirtying its struct page cacheline
with (map)count updates - let vm_normal_page() regard it as abnormal.
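
For illustration (not part of the patch), here is a minimal userspace
sketch of the behaviour being reinstated: on a kernel with this change,
read faults on untouched anonymous memory all map the shared ZERO_PAGE,
so resident set size stays flat until the first write.  The 256 MB size,
the 4 kB stride, and the rss_kb() helper name are arbitrary choices here.

#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>

static long rss_kb(void)
{
	struct rusage ru;

	getrusage(RUSAGE_SELF, &ru);
	return ru.ru_maxrss;		/* peak RSS, in kilobytes on Linux */
}

int main(void)
{
	size_t len = 256UL << 20;	/* 256 MB of anonymous memory */
	volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char sink = 0;
	size_t i;

	if (p == MAP_FAILED)
		return 1;
	printf("RSS before reads: %ld kB\n", rss_kb());
	for (i = 0; i < len; i += 4096)	/* assuming 4 kB pages */
		sink ^= p[i];	/* read fault: maps the zero page, no new frame */
	printf("RSS after reads:  %ld kB\n", rss_kb());
	p[0] = sink;		/* first write: COW a real page in */
	return 0;
}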

Use it only on arches which define __HAVE_ARCH_PTE_SPECIAL (x86, s390,
sh32, most powerpc): that's not essential, but it minimizes additional
branches (keeping them in the unlikely pte_special case); and it
incidentally excludes mips (some models of which needed eight colours
of ZERO_PAGE to avoid costly exceptions).
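
For reference, the HAVE_PTE_SPECIAL constant tested in the hunks below
is how mm/memory.c folds the arch macro into a compile-time boolean;
the #ifdef block (which predates this patch) is roughly:

#ifndef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 0
#else
# define HAVE_PTE_SPECIAL 1
#endif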

Don't be fanatical about avoiding ZERO_PAGE updates: get_user_pages()
callers won't want to make exceptions for it, so increment its count
there.  Changes to mlock and migration?  Happily, they seem not to be
needed.
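
A hedged userspace sketch of one such caller path: an O_DIRECT write
from a never-written anonymous buffer makes the kernel get_user_pages()
that buffer, taking references on the ZERO_PAGE instead of faulting in
real frames.  The file name scratch.bin is arbitrary, and O_DIRECT
support depends on the filesystem.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;	/* 1 MB: page-aligned via mmap, block-size multiple */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int fd = open("scratch.bin", O_WRONLY | O_CREAT | O_DIRECT, 0600);

	if (buf == MAP_FAILED || fd < 0)
		return 1;
	/* buf was never written, so each of its pages is the zero page;
	 * the direct-IO path pins them with get_user_pages(). */
	if (write(fd, buf, len) != (ssize_t)len)
		perror("write");
	close(fd);
	return 0;
}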

In most places it's quicker to check the pfn than the struct page
address: prepare a __read_mostly zero_pfn for that.  Does get_dump_page()
still need its ZERO_PAGE check?  Probably not, but keep it anyway.
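
That pfn test is open-coded at each site in the diff below; later
kernels wrap it in an is_zero_pfn() helper, roughly as follows (a
kernel-context sketch, not standalone code - zero_pfn is the
__read_mostly cache initialized by init_zero_pfn() in the first hunk):

/* quicker than pte_page(pte) == ZERO_PAGE(0): no memmap lookup needed */
static inline int is_zero_pfn(unsigned long pfn)
{
	return pfn == zero_pfn;
}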

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file (mm/memory.c) with 44 additions and 9 deletions

@@ -108,8 +108,19 @@
 }
 __setup("norandmaps", disable_randmaps);
 
+static unsigned long zero_pfn __read_mostly;
 
 /*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+static int __init init_zero_pfn(void)
+{
+        zero_pfn = page_to_pfn(ZERO_PAGE(0));
+        return 0;
+}
+core_initcall(init_zero_pfn);
+
+/*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none. Usually (but
  * very seldom) called out from the p?d_none_or_clear_bad macros.
@@ -499,7 +510,9 @@
         if (HAVE_PTE_SPECIAL) {
                 if (likely(!pte_special(pte)))
                         goto check_pfn;
-                if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+                        return NULL;
+                if (pfn != zero_pfn)
                         print_bad_pte(vma, addr, pte, NULL);
                 return NULL;
         }
@@ -1144,9 +1157,14 @@
                 goto no_page;
         if ((flags & FOLL_WRITE) && !pte_write(pte))
                 goto unlock;
+
         page = vm_normal_page(vma, address, pte);
-        if (unlikely(!page))
-                goto bad_page;
+        if (unlikely(!page)) {
+                if ((flags & FOLL_DUMP) ||
+                    pte_pfn(pte) != zero_pfn)
+                        goto bad_page;
+                page = pte_page(pte);
+        }
 
         if (flags & FOLL_GET)
                 get_page(page);
@@ -2084,10 +2102,19 @@
 
         if (unlikely(anon_vma_prepare(vma)))
                 goto oom;
-        VM_BUG_ON(old_page == ZERO_PAGE(0));
-        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-        if (!new_page)
-                goto oom;
+
+        if (pte_pfn(orig_pte) == zero_pfn) {
+                new_page = alloc_zeroed_user_highpage_movable(vma, address);
+                if (!new_page)
+                        goto oom;
+        } else {
+                new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+                if (!new_page)
+                        goto oom;
+                cow_user_page(new_page, old_page, address, vma);
+        }
+        __SetPageUptodate(new_page);
+
         /*
          * Don't let another task, with possibly unlocked vma,
          * keep the mlocked page.
@@ -2097,8 +2124,6 @@
                 clear_page_mlock(old_page);
                 unlock_page(old_page);
         }
-        cow_user_page(new_page, old_page, address, vma);
-        __SetPageUptodate(new_page);
 
         if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                 goto oom_free_new;
@@ -2639,6 +2664,15 @@
         spinlock_t *ptl;
         pte_t entry;
 
+        if (HAVE_PTE_SPECIAL && !(flags & FAULT_FLAG_WRITE)) {
+                entry = pte_mkspecial(pfn_pte(zero_pfn, vma->vm_page_prot));
+                ptl = pte_lockptr(mm, pmd);
+                spin_lock(ptl);
+                if (!pte_none(*page_table))
+                        goto unlock;
+                goto setpte;
+        }
+
         /* Allocate our own private page. */
         pte_unmap(page_table);
 
@@ -2662,6 +2696,7 @@
 
         inc_mm_counter(mm, anon_rss);
         page_add_new_anon_rmap(page, vma, address);
+setpte:
         set_pte_at(mm, address, page_table, entry);
 
         /* No need to invalidate - it was non-present before */
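
One way to observe the effect of the last two hunks: after the patch,
every page of an untouched MAP_ANONYMOUS region should resolve to one
and the same physical frame.  A sketch using /proc/self/pagemap, whose
entries carry the pfn in bits 0-54 (recent kernels hide pfns from
unprivileged readers, so the values may print as 0 without privilege):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t pages = 16, psize = (size_t)sysconf(_SC_PAGESIZE);
	volatile char *p = mmap(NULL, pages * psize, PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int fd = open("/proc/self/pagemap", O_RDONLY);
	size_t i;

	if (p == MAP_FAILED || fd < 0)
		return 1;
	for (i = 0; i < pages; i++) {
		uint64_t ent;
		char c = p[i * psize];	/* read fault -> zero page */

		(void)c;
		/* one 8-byte pagemap entry per virtual page */
		pread(fd, &ent, sizeof(ent),
		      ((uintptr_t)p / psize + i) * sizeof(ent));
		printf("page %2zu: pfn %llu\n", i,
		       (unsigned long long)(ent & ((1ULL << 55) - 1)));
	}
	close(fd);
	return 0;
}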