Commit 97ae17497e996ff09bf97b6db3b33f7fd4029092

Authored by Kirill A. Shutemov
Committed by Linus Torvalds
1 parent 78ca0e6792

thp: implement refcounting for huge zero page

H.  Peter Anvin doesn't like a huge zero page that sticks in memory forever
after the first allocation.  Here's an implementation of lockless refcounting
for the huge zero page.

We have two basic primitives: {get,put}_huge_zero_page().  They
manipulate the reference counter.

If the counter is 0, get_huge_zero_page() allocates a new huge page and
takes two references: one for the caller and one for the shrinker.  We free
the page only in the shrinker callback, and only if the counter is 1
(i.e. only the shrinker holds a reference).

put_huge_zero_page() only decrements the counter.  The counter never
reaches zero in put_huge_zero_page(), since the shrinker always holds one
reference.

Freeing the huge zero page in the shrinker callback helps to avoid
frequent allocate/free cycles.
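
For reference, here is a minimal userspace sketch of the scheme described
above, written with C11 atomics in place of the kernel's atomic_t/cmpxchg
primitives.  The names (get_zero_page(), put_zero_page(), shrink_zero_page())
and the 4096-byte calloc() stand-in are illustrative only; the real kernel
implementation (including the preempt_disable() window and the shrinker
registration) is in the diff below.

/* Userspace sketch of the huge-zero-page refcounting scheme; not kernel code. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_uint zero_refcount;
static _Atomic(void *) zero_page;

/* Take a reference; allocate the shared page on first use. */
static void *get_zero_page(void)
{
        void *page, *expected;
        unsigned int cnt;

retry:
        /* Fast path: bump the counter unless it is currently zero. */
        cnt = atomic_load(&zero_refcount);
        while (cnt != 0) {
                if (atomic_compare_exchange_weak(&zero_refcount, &cnt, cnt + 1))
                        return atomic_load(&zero_page);
        }

        /* Slow path: allocate a page and race to install it. */
        page = calloc(1, 4096);
        if (!page)
                return NULL;
        expected = NULL;
        if (!atomic_compare_exchange_strong(&zero_page, &expected, page)) {
                free(page);     /* lost the race; retry against the winner */
                goto retry;
        }
        /* Two references: one for the caller, one kept for the "shrinker". */
        atomic_store(&zero_refcount, 2);
        return page;
}

/* Drop a reference; the count never reaches zero on this path. */
static void put_zero_page(void)
{
        atomic_fetch_sub(&zero_refcount, 1);
}

/* "Shrinker": free the page only if it holds the last reference. */
static void shrink_zero_page(void)
{
        unsigned int last = 1;

        if (atomic_compare_exchange_strong(&zero_refcount, &last, 0))
                free(atomic_exchange(&zero_page, NULL));
}

int main(void)
{
        void *p = get_zero_page();
        printf("got %p, refcount %u\n", p, atomic_load(&zero_refcount));
        put_zero_page();        /* drops to 1; the page stays around */
        shrink_zero_page();     /* only the shrinker reference left: freed */
        printf("after shrink, refcount %u\n", atomic_load(&zero_refcount));
        return 0;
}

As in the patch, a loser of the install race frees its page and retries, and
only the shrinker path can ever bring the count from 1 down to 0.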

Refcounting has a cost.  On a 4-socket machine I observe a ~1% slowdown on
parallel (40 processes) read page faulting compared to lazy huge page
allocation.  I think that's reasonable for a synthetic benchmark.

[lliubbo@gmail.com: fix mismerge]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Bob Liu <lliubbo@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 88 additions and 25 deletions

@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -47,7 +49,6 @@
 /* during fragmentation poll the hugepage allocator once every minute */
 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
 static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +161,74 @@
         return err;
 }
 
-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
 {
-        struct page *hpage;
-        unsigned long pfn;
+        unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+        return zero_pfn && pfn == zero_pfn;
+}
 
-        hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+        return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+        struct page *zero_page;
+retry:
+        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+                return ACCESS_ONCE(huge_zero_pfn);
+
+        zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                         HPAGE_PMD_ORDER);
-        if (!hpage)
-                return -ENOMEM;
-        pfn = page_to_pfn(hpage);
-        if (cmpxchg(&huge_zero_pfn, 0, pfn))
-                __free_page(hpage);
-        return 0;
+        if (!zero_page)
+                return 0;
+        preempt_disable();
+        if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+                preempt_enable();
+                __free_page(zero_page);
+                goto retry;
+        }
+
+        /* We take additional reference here. It will be put back by shrinker */
+        atomic_set(&huge_zero_refcount, 2);
+        preempt_enable();
+        return ACCESS_ONCE(huge_zero_pfn);
 }
 
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
 {
-        return huge_zero_pfn && pfn == huge_zero_pfn;
+        /*
+         * Counter should never go to zero here. Only shrinker can put
+         * last reference.
+         */
+        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+                struct shrink_control *sc)
 {
-        return is_huge_zero_pfn(pmd_pfn(pmd));
+        if (!sc->nr_to_scan)
+                /* we can free zero page only if last reference remains */
+                return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+                unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+                BUG_ON(zero_pfn == 0);
+                __free_page(__pfn_to_page(zero_pfn));
+        }
+
+        return 0;
 }
 
+static struct shrinker huge_zero_page_shrinker = {
+        .shrink = shrink_huge_zero_page,
+        .seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
@@ -576,6 +620,8 @@
                goto out;
        }
 
+       register_shrinker(&huge_zero_page_shrinker);
+
        /*
         * By default disable transparent hugepages on smaller systems,
         * where the extra memory used could hurt more than TLB overhead
@@ -705,10 +751,11 @@
 #endif
 
 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
-                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+                unsigned long zero_pfn)
 {
         pmd_t entry;
-        entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+        entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
         entry = pmd_wrprotect(entry);
         entry = pmd_mkhuge(entry);
         set_pmd_at(mm, haddr, pmd, entry);
@@ -731,15 +778,19 @@
                        return VM_FAULT_OOM;
                if (!(flags & FAULT_FLAG_WRITE)) {
                        pgtable_t pgtable;
-                       if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
-                               count_vm_event(THP_FAULT_FALLBACK);
-                               goto out;
-                       }
+                       unsigned long zero_pfn;
                        pgtable = pte_alloc_one(mm, haddr);
                        if (unlikely(!pgtable))
                                return VM_FAULT_OOM;
+                       zero_pfn = get_huge_zero_page();
+                       if (unlikely(!zero_pfn)) {
+                               pte_free(mm, pgtable);
+                               count_vm_event(THP_FAULT_FALLBACK);
+                               goto out;
+                       }
                        spin_lock(&mm->page_table_lock);
-                       set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+                       set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+                                       zero_pfn);
                        spin_unlock(&mm->page_table_lock);
                        return 0;
                }
@@ -813,7 +864,15 @@
         * a page table.
         */
        if (is_huge_zero_pmd(pmd)) {
-               set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+               unsigned long zero_pfn;
+               /*
+                * get_huge_zero_page() will never allocate a new page here,
+                * since we already have a zero page to copy. It just takes a
+                * reference.
+                */
+               zero_pfn = get_huge_zero_page();
+               set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+                               zero_pfn);
                ret = 0;
                goto out_unlock;
        }
@@ -923,6 +982,7 @@
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
        spin_unlock(&mm->page_table_lock);
+       put_huge_zero_page();
        inc_mm_counter(mm, MM_ANONPAGES);
 
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1123,9 +1183,10 @@
                page_add_new_anon_rmap(new_page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
                update_mmu_cache_pmd(vma, address, pmd);
-               if (is_huge_zero_pmd(orig_pmd))
+               if (is_huge_zero_pmd(orig_pmd)) {
                        add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-               else {
+                       put_huge_zero_page();
+               } else {
                        VM_BUG_ON(!PageHead(page));
                        page_remove_rmap(page);
                        put_page(page);
@@ -1202,6 +1263,7 @@
                if (is_huge_zero_pmd(orig_pmd)) {
                        tlb->mm->nr_ptes--;
                        spin_unlock(&tlb->mm->page_table_lock);
+                       put_huge_zero_page();
                } else {
                        page = pmd_page(orig_pmd);
                        page_remove_rmap(page);
@@ -2511,6 +2573,7 @@
        }
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
+       put_huge_zero_page();
 }
 
 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,