Commit 8ee53820edfd1f3b6554c593f337148dd3d7fc91

Authored by Andrea Arcangeli
Committed by Linus Torvalds
1 parent 4b7167b9ff

thp: mmu_notifier_test_young

For GRU and EPT, we need gup-fast to set referenced bit too (this is why
it's correct to return 0 when shadow_accessed_mask is zero, it requires
gup-fast to set the referenced bit).  qemu-kvm access already sets the
young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow
paging EPT minor fault we rely on gup-fast to signal the page is in
use...

We also need to check the young bits on the secondary pagetables for NPT
and not nested shadow mmu as the data may never get accessed again by the
primary pte.

Without this closer accuracy, we'd have to remove the heuristic that
avoids collapsing hugepages in hugepage virtual regions that have not even
a single subpage in use.

->test_young is fully backwards compatible with GRU and other usages that
don't have young bits in pagetables set by the hardware and that should
nuke the secondary mmu mappings when ->clear_flush_young runs just like
EPT does.

Removing the heuristic that checks the young bit in
khugepaged/collapse_huge_page completely isn't so bad either probably but
I thought it was worth it and this makes it reliable.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 7 changed files with 105 additions and 2 deletions Side-by-side Diff

arch/x86/include/asm/kvm_host.h
... ... @@ -822,6 +822,7 @@
822 822 #define KVM_ARCH_WANT_MMU_NOTIFIER
823 823 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
824 824 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
  825 +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
825 826 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
826 827 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
827 828 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
... ... @@ -945,6 +945,35 @@
945 945 return young;
946 946 }
947 947  
  948 +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
  949 + unsigned long data)
  950 +{
  951 + u64 *spte;
  952 + int young = 0;
  953 +
  954 + /*
  955 + * If there's no access bit in the secondary pte set by the
  956 + * hardware it's up to gup-fast/gup to set the access bit in
  957 + * the primary pte or in the page structure.
  958 + */
  959 + if (!shadow_accessed_mask)
  960 + goto out;
  961 +
  962 + spte = rmap_next(kvm, rmapp, NULL);
  963 + while (spte) {
  964 + u64 _spte = *spte;
  965 + BUG_ON(!(_spte & PT_PRESENT_MASK));
  966 + young = _spte & PT_ACCESSED_MASK;
  967 + if (young) {
  968 + young = 1;
  969 + break;
  970 + }
  971 + spte = rmap_next(kvm, rmapp, spte);
  972 + }
  973 +out:
  974 + return young;
  975 +}
  976 +
948 977 #define RMAP_RECYCLE_THRESHOLD 1000
949 978  
950 979 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
... ... @@ -963,6 +992,11 @@
963 992 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
964 993 {
965 994 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
  995 +}
  996 +
  997 +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
  998 +{
  999 + return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
966 1000 }
967 1001  
968 1002 #ifdef MMU_DEBUG
... ... @@ -8,6 +8,7 @@
8 8 #include <linux/mm.h>
9 9 #include <linux/vmstat.h>
10 10 #include <linux/highmem.h>
  11 +#include <linux/swap.h>
11 12  
12 13 #include <asm/pgtable.h>
13 14  
... ... @@ -89,6 +90,7 @@
89 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 91 page = pte_page(pte);
91 92 get_page(page);
  93 + SetPageReferenced(page);
92 94 pages[*nr] = page;
93 95 (*nr)++;
94 96  
... ... @@ -103,6 +105,7 @@
103 105 VM_BUG_ON(page != compound_head(page));
104 106 VM_BUG_ON(page_count(page) == 0);
105 107 atomic_add(nr, &page->_count);
  108 + SetPageReferenced(page);
106 109 }
107 110  
108 111 static inline void get_huge_page_tail(struct page *page)
include/linux/mmu_notifier.h
... ... @@ -62,6 +62,16 @@
62 62 unsigned long address);
63 63  
64 64 /*
  65 + * test_young is called to check the young/accessed bitflag in
  66 + * the secondary pte. This is used to know if the page is
  67 + * frequently used without actually clearing the flag or tearing
  68 + * down the secondary mapping on the page.
  69 + */
  70 + int (*test_young)(struct mmu_notifier *mn,
  71 + struct mm_struct *mm,
  72 + unsigned long address);
  73 +
  74 + /*
65 75 * change_pte is called in cases that pte mapping to page is changed:
66 76 * for example, when ksm remaps pte to point to a new shared page.
67 77 */
... ... @@ -163,6 +173,8 @@
163 173 extern void __mmu_notifier_release(struct mm_struct *mm);
164 174 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
165 175 unsigned long address);
  176 +extern int __mmu_notifier_test_young(struct mm_struct *mm,
  177 + unsigned long address);
166 178 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
167 179 unsigned long address, pte_t pte);
168 180 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
... ... @@ -186,6 +198,14 @@
186 198 return 0;
187 199 }
188 200  
  201 +static inline int mmu_notifier_test_young(struct mm_struct *mm,
  202 + unsigned long address)
  203 +{
  204 + if (mm_has_notifiers(mm))
  205 + return __mmu_notifier_test_young(mm, address);
  206 + return 0;
  207 +}
  208 +
189 209 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
190 210 unsigned long address, pte_t pte)
191 211 {
... ... @@ -308,6 +328,12 @@
308 328 }
309 329  
310 330 static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
  331 + unsigned long address)
  332 +{
  333 + return 0;
  334 +}
  335 +
  336 +static inline int mmu_notifier_test_young(struct mm_struct *mm,
311 337 unsigned long address)
312 338 {
313 339 return 0;
... ... @@ -1632,7 +1632,8 @@
1632 1632 VM_BUG_ON(PageLRU(page));
1633 1633  
1634 1634 /* If there is no mapped pte young don't collapse the page */
1635   - if (pte_young(pteval))
  1635 + if (pte_young(pteval) || PageReferenced(page) ||
  1636 + mmu_notifier_test_young(vma->vm_mm, address))
1636 1637 referenced = 1;
1637 1638 }
1638 1639 if (unlikely(!referenced))
... ... @@ -1892,7 +1893,8 @@
1892 1893 /* cannot use mapcount: can't collapse if there's a gup pin */
1893 1894 if (page_count(page) != 1)
1894 1895 goto out_unmap;
1895   - if (pte_young(pteval))
  1896 + if (pte_young(pteval) || PageReferenced(page) ||
  1897 + mmu_notifier_test_young(vma->vm_mm, address))
1896 1898 referenced = 1;
1897 1899 }
1898 1900 if (referenced)
... ... @@ -100,6 +100,26 @@
100 100 return young;
101 101 }
102 102  
  103 +int __mmu_notifier_test_young(struct mm_struct *mm,
  104 + unsigned long address)
  105 +{
  106 + struct mmu_notifier *mn;
  107 + struct hlist_node *n;
  108 + int young = 0;
  109 +
  110 + rcu_read_lock();
  111 + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
  112 + if (mn->ops->test_young) {
  113 + young = mn->ops->test_young(mn, mm, address);
  114 + if (young)
  115 + break;
  116 + }
  117 + }
  118 + rcu_read_unlock();
  119 +
  120 + return young;
  121 +}
  122 +
103 123 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104 124 pte_t pte)
105 125 {
... ... @@ -380,6 +380,22 @@
380 380 return young;
381 381 }
382 382  
  383 +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
  384 + struct mm_struct *mm,
  385 + unsigned long address)
  386 +{
  387 + struct kvm *kvm = mmu_notifier_to_kvm(mn);
  388 + int young, idx;
  389 +
  390 + idx = srcu_read_lock(&kvm->srcu);
  391 + spin_lock(&kvm->mmu_lock);
  392 + young = kvm_test_age_hva(kvm, address);
  393 + spin_unlock(&kvm->mmu_lock);
  394 + srcu_read_unlock(&kvm->srcu, idx);
  395 +
  396 + return young;
  397 +}
  398 +
383 399 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
384 400 struct mm_struct *mm)
385 401 {
... ... @@ -396,6 +412,7 @@
396 412 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
397 413 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
398 414 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
  415 + .test_young = kvm_mmu_notifier_test_young,
399 416 .change_pte = kvm_mmu_notifier_change_pte,
400 417 .release = kvm_mmu_notifier_release,
401 418 };