Commit 97ae17497e996ff09bf97b6db3b33f7fd4029092
Committed by
Linus Torvalds
1 parent
78ca0e6792
Exists in
master
and in
20 other branches
thp: implement refcounting for huge zero page
H. Peter Anvin doesn't like the huge zero page, which sticks in memory forever after the first allocation. Here's an implementation of lockless refcounting for the huge zero page. We have two basic primitives: {get,put}_huge_zero_page(). They manipulate a reference counter. If the counter is 0, get_huge_zero_page() allocates a new huge page and takes two references: one for the caller and one for the shrinker. We free the page only in the shrinker callback, and only if the counter is 1 (i.e. only the shrinker holds a reference). put_huge_zero_page() only decrements the counter. The counter is never zero in put_huge_zero_page(), since the shrinker holds a reference. Freeing the huge zero page in the shrinker callback helps avoid frequent allocate-free cycles. Refcounting has a cost. On a 4-socket machine I observe a ~1% slowdown on parallel (40-process) read page faulting compared to lazy huge page allocation. I think that's pretty reasonable for a synthetic benchmark. [lliubbo@gmail.com: fix mismerge] Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: "H. Peter Anvin" <hpa@linux.intel.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Bob Liu <lliubbo@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 88 additions and 25 deletions Side-by-side Diff
mm/huge_memory.c
... | ... | @@ -12,12 +12,14 @@ |
12 | 12 | #include <linux/mmu_notifier.h> |
13 | 13 | #include <linux/rmap.h> |
14 | 14 | #include <linux/swap.h> |
15 | +#include <linux/shrinker.h> | |
15 | 16 | #include <linux/mm_inline.h> |
16 | 17 | #include <linux/kthread.h> |
17 | 18 | #include <linux/khugepaged.h> |
18 | 19 | #include <linux/freezer.h> |
19 | 20 | #include <linux/mman.h> |
20 | 21 | #include <linux/pagemap.h> |
22 | + | |
21 | 23 | #include <asm/tlb.h> |
22 | 24 | #include <asm/pgalloc.h> |
23 | 25 | #include "internal.h" |
... | ... | @@ -47,7 +49,6 @@ |
47 | 49 | /* during fragmentation poll the hugepage allocator once every minute */ |
48 | 50 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; |
49 | 51 | static struct task_struct *khugepaged_thread __read_mostly; |
50 | -static unsigned long huge_zero_pfn __read_mostly; | |
51 | 52 | static DEFINE_MUTEX(khugepaged_mutex); |
52 | 53 | static DEFINE_SPINLOCK(khugepaged_mm_lock); |
53 | 54 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); |
54 | 55 | |
55 | 56 | |
56 | 57 | |
57 | 58 | |
58 | 59 | |
59 | 60 | |
60 | 61 | |
61 | 62 | |
... | ... | @@ -160,31 +161,74 @@ |
160 | 161 | return err; |
161 | 162 | } |
162 | 163 | |
163 | -static int init_huge_zero_pfn(void) | |
164 | +static atomic_t huge_zero_refcount; | |
165 | +static unsigned long huge_zero_pfn __read_mostly; | |
166 | + | |
167 | +static inline bool is_huge_zero_pfn(unsigned long pfn) | |
164 | 168 | { |
165 | - struct page *hpage; | |
166 | - unsigned long pfn; | |
169 | + unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); | |
170 | + return zero_pfn && pfn == zero_pfn; | |
171 | +} | |
167 | 172 | |
168 | - hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | |
173 | +static inline bool is_huge_zero_pmd(pmd_t pmd) | |
174 | +{ | |
175 | + return is_huge_zero_pfn(pmd_pfn(pmd)); | |
176 | +} | |
177 | + | |
178 | +static unsigned long get_huge_zero_page(void) | |
179 | +{ | |
180 | + struct page *zero_page; | |
181 | +retry: | |
182 | + if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | |
183 | + return ACCESS_ONCE(huge_zero_pfn); | |
184 | + | |
185 | + zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | |
169 | 186 | HPAGE_PMD_ORDER); |
170 | - if (!hpage) | |
171 | - return -ENOMEM; | |
172 | - pfn = page_to_pfn(hpage); | |
173 | - if (cmpxchg(&huge_zero_pfn, 0, pfn)) | |
174 | - __free_page(hpage); | |
175 | - return 0; | |
187 | + if (!zero_page) | |
188 | + return 0; | |
189 | + preempt_disable(); | |
190 | + if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { | |
191 | + preempt_enable(); | |
192 | + __free_page(zero_page); | |
193 | + goto retry; | |
194 | + } | |
195 | + | |
196 | + /* We take additional reference here. It will be put back by shrinker */ | |
197 | + atomic_set(&huge_zero_refcount, 2); | |
198 | + preempt_enable(); | |
199 | + return ACCESS_ONCE(huge_zero_pfn); | |
176 | 200 | } |
177 | 201 | |
178 | -static inline bool is_huge_zero_pfn(unsigned long pfn) | |
202 | +static void put_huge_zero_page(void) | |
179 | 203 | { |
180 | - return huge_zero_pfn && pfn == huge_zero_pfn; | |
204 | + /* | |
205 | + * Counter should never go to zero here. Only shrinker can put | |
206 | + * last reference. | |
207 | + */ | |
208 | + BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); | |
181 | 209 | } |
182 | 210 | |
183 | -static inline bool is_huge_zero_pmd(pmd_t pmd) | |
211 | +static int shrink_huge_zero_page(struct shrinker *shrink, | |
212 | + struct shrink_control *sc) | |
184 | 213 | { |
185 | - return is_huge_zero_pfn(pmd_pfn(pmd)); | |
214 | + if (!sc->nr_to_scan) | |
215 | + /* we can free zero page only if last reference remains */ | |
216 | + return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; | |
217 | + | |
218 | + if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { | |
219 | + unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); | |
220 | + BUG_ON(zero_pfn == 0); | |
221 | + __free_page(__pfn_to_page(zero_pfn)); | |
222 | + } | |
223 | + | |
224 | + return 0; | |
186 | 225 | } |
187 | 226 | |
227 | +static struct shrinker huge_zero_page_shrinker = { | |
228 | + .shrink = shrink_huge_zero_page, | |
229 | + .seeks = DEFAULT_SEEKS, | |
230 | +}; | |
231 | + | |
188 | 232 | #ifdef CONFIG_SYSFS |
189 | 233 | |
190 | 234 | static ssize_t double_flag_show(struct kobject *kobj, |
... | ... | @@ -576,6 +620,8 @@ |
576 | 620 | goto out; |
577 | 621 | } |
578 | 622 | |
623 | + register_shrinker(&huge_zero_page_shrinker); | |
624 | + | |
579 | 625 | /* |
580 | 626 | * By default disable transparent hugepages on smaller systems, |
581 | 627 | * where the extra memory used could hurt more than TLB overhead |
582 | 628 | |
... | ... | @@ -705,10 +751,11 @@ |
705 | 751 | #endif |
706 | 752 | |
707 | 753 | static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
708 | - struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) | |
754 | + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | |
755 | + unsigned long zero_pfn) | |
709 | 756 | { |
710 | 757 | pmd_t entry; |
711 | - entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot); | |
758 | + entry = pfn_pmd(zero_pfn, vma->vm_page_prot); | |
712 | 759 | entry = pmd_wrprotect(entry); |
713 | 760 | entry = pmd_mkhuge(entry); |
714 | 761 | set_pmd_at(mm, haddr, pmd, entry); |
715 | 762 | |
716 | 763 | |
... | ... | @@ -731,15 +778,19 @@ |
731 | 778 | return VM_FAULT_OOM; |
732 | 779 | if (!(flags & FAULT_FLAG_WRITE)) { |
733 | 780 | pgtable_t pgtable; |
734 | - if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) { | |
735 | - count_vm_event(THP_FAULT_FALLBACK); | |
736 | - goto out; | |
737 | - } | |
781 | + unsigned long zero_pfn; | |
738 | 782 | pgtable = pte_alloc_one(mm, haddr); |
739 | 783 | if (unlikely(!pgtable)) |
740 | 784 | return VM_FAULT_OOM; |
785 | + zero_pfn = get_huge_zero_page(); | |
786 | + if (unlikely(!zero_pfn)) { | |
787 | + pte_free(mm, pgtable); | |
788 | + count_vm_event(THP_FAULT_FALLBACK); | |
789 | + goto out; | |
790 | + } | |
741 | 791 | spin_lock(&mm->page_table_lock); |
742 | - set_huge_zero_page(pgtable, mm, vma, haddr, pmd); | |
792 | + set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | |
793 | + zero_pfn); | |
743 | 794 | spin_unlock(&mm->page_table_lock); |
744 | 795 | return 0; |
745 | 796 | } |
... | ... | @@ -813,7 +864,15 @@ |
813 | 864 | * a page table. |
814 | 865 | */ |
815 | 866 | if (is_huge_zero_pmd(pmd)) { |
816 | - set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd); | |
867 | + unsigned long zero_pfn; | |
868 | + /* | |
869 | + * get_huge_zero_page() will never allocate a new page here, | |
870 | + * since we already have a zero page to copy. It just takes a | |
871 | + * reference. | |
872 | + */ | |
873 | + zero_pfn = get_huge_zero_page(); | |
874 | + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, | |
875 | + zero_pfn); | |
817 | 876 | ret = 0; |
818 | 877 | goto out_unlock; |
819 | 878 | } |
... | ... | @@ -923,6 +982,7 @@ |
923 | 982 | smp_wmb(); /* make pte visible before pmd */ |
924 | 983 | pmd_populate(mm, pmd, pgtable); |
925 | 984 | spin_unlock(&mm->page_table_lock); |
985 | + put_huge_zero_page(); | |
926 | 986 | inc_mm_counter(mm, MM_ANONPAGES); |
927 | 987 | |
928 | 988 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
929 | 989 | |
... | ... | @@ -1123,9 +1183,10 @@ |
1123 | 1183 | page_add_new_anon_rmap(new_page, vma, haddr); |
1124 | 1184 | set_pmd_at(mm, haddr, pmd, entry); |
1125 | 1185 | update_mmu_cache_pmd(vma, address, pmd); |
1126 | - if (is_huge_zero_pmd(orig_pmd)) | |
1186 | + if (is_huge_zero_pmd(orig_pmd)) { | |
1127 | 1187 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1128 | - else { | |
1188 | + put_huge_zero_page(); | |
1189 | + } else { | |
1129 | 1190 | VM_BUG_ON(!PageHead(page)); |
1130 | 1191 | page_remove_rmap(page); |
1131 | 1192 | put_page(page); |
... | ... | @@ -1202,6 +1263,7 @@ |
1202 | 1263 | if (is_huge_zero_pmd(orig_pmd)) { |
1203 | 1264 | tlb->mm->nr_ptes--; |
1204 | 1265 | spin_unlock(&tlb->mm->page_table_lock); |
1266 | + put_huge_zero_page(); | |
1205 | 1267 | } else { |
1206 | 1268 | page = pmd_page(orig_pmd); |
1207 | 1269 | page_remove_rmap(page); |
... | ... | @@ -2511,6 +2573,7 @@ |
2511 | 2573 | } |
2512 | 2574 | smp_wmb(); /* make pte visible before pmd */ |
2513 | 2575 | pmd_populate(mm, pmd, pgtable); |
2576 | + put_huge_zero_page(); | |
2514 | 2577 | } |
2515 | 2578 | |
2516 | 2579 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, |
-
mentioned in commit 271aa4
-
mentioned in commit 271aa4
-
mentioned in commit 271aa4
-
mentioned in commit 271aa4
-
mentioned in commit 271aa4
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb
-
mentioned in commit 5ddacb