Commit e930bffe95e1e886a1ede80726ea38df5838d067

Authored by Andrea Arcangeli
Committed by Avi Kivity
1 parent 604b38ac03

KVM: Synchronize guest physical memory map to host virtual memory map

Synchronize changes to host virtual addresses which are part of
a KVM memory slot to the KVM shadow mmu.  This allows pte operations
like swapping, page migration, and madvise() to transparently work
with KVM.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

Showing 5 changed files with 277 additions and 0 deletions
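
Before the diff itself, a minimal editor's sketch (not part of the commit) of the retry protocol the patch introduces on the page-fault side. The structure mirrors the page fault changes to arch/x86/kvm/mmu.c and arch/x86/kvm/paging_tmpl.h below; sketch_page_fault() and map_shadow_pte() are hypothetical placeholder names, while gfn_to_pfn(), mmu_notifier_retry(), kvm_release_pfn_clean() and mmu_lock are the real pieces touched by the patch.

/*
 * Editor's sketch, not KVM code: a simplified page-fault path showing
 * how the new mmu_notifier_seq / mmu_notifier_retry() protocol is used.
 */
static int sketch_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
        unsigned long mmu_seq;
        pfn_t pfn;

        /* Snapshot the notifier sequence before looking up the pfn. */
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        /* gfn_to_pfn() may sleep; mmu_lock is not held here */
        pfn = gfn_to_pfn(vcpu->kvm, gfn);

        spin_lock(&vcpu->kvm->mmu_lock);
        /*
         * If an invalidation started or completed since the snapshot,
         * the pfn may already be stale: drop it and let the guest
         * re-fault rather than installing an spte to a freed page.
         */
        if (mmu_notifier_retry(vcpu, mmu_seq)) {
                spin_unlock(&vcpu->kvm->mmu_lock);
                kvm_release_pfn_clean(pfn);
                return 0;
        }
        map_shadow_pte(vcpu, gva, gfn, pfn);    /* hypothetical: install the spte */
        spin_unlock(&vcpu->kvm->mmu_lock);
        return 0;
}

The notifier side in virt/kvm/kvm_main.c further down is what raises mmu_notifier_count and advances mmu_notifier_seq, so the check above fails whenever an invalidation raced with the pfn lookup.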

arch/x86/kvm/mmu.c
... ... @@ -653,6 +653,84 @@
653 653 account_shadowed(kvm, gfn);
654 654 }
655 655  
  656 +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
  657 +{
  658 + u64 *spte;
  659 + int need_tlb_flush = 0;
  660 +
  661 + while ((spte = rmap_next(kvm, rmapp, NULL))) {
  662 + BUG_ON(!(*spte & PT_PRESENT_MASK));
  663 + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
  664 + rmap_remove(kvm, spte);
  665 + set_shadow_pte(spte, shadow_trap_nonpresent_pte);
  666 + need_tlb_flush = 1;
  667 + }
  668 + return need_tlb_flush;
  669 +}
  670 +
  671 +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
  672 + int (*handler)(struct kvm *kvm, unsigned long *rmapp))
  673 +{
  674 + int i;
  675 + int retval = 0;
  676 +
  677 + /*
  678 + * If mmap_sem isn't taken, we can walk the memslots with only
  679 + * the mmu_lock by skipping over the slots with userspace_addr == 0.
  680 + */
  681 + for (i = 0; i < kvm->nmemslots; i++) {
  682 + struct kvm_memory_slot *memslot = &kvm->memslots[i];
  683 + unsigned long start = memslot->userspace_addr;
  684 + unsigned long end;
  685 +
  686 + /* mmu_lock protects userspace_addr */
  687 + if (!start)
  688 + continue;
  689 +
  690 + end = start + (memslot->npages << PAGE_SHIFT);
  691 + if (hva >= start && hva < end) {
  692 + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
  693 + retval |= handler(kvm, &memslot->rmap[gfn_offset]);
  694 + retval |= handler(kvm,
  695 + &memslot->lpage_info[
  696 + gfn_offset /
  697 + KVM_PAGES_PER_HPAGE].rmap_pde);
  698 + }
  699 + }
  700 +
  701 + return retval;
  702 +}
  703 +
  704 +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
  705 +{
  706 + return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
  707 +}
  708 +
  709 +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
  710 +{
  711 + u64 *spte;
  712 + int young = 0;
  713 +
  714 + spte = rmap_next(kvm, rmapp, NULL);
  715 + while (spte) {
  716 + int _young;
  717 + u64 _spte = *spte;
  718 + BUG_ON(!(_spte & PT_PRESENT_MASK));
  719 + _young = _spte & PT_ACCESSED_MASK;
  720 + if (_young) {
  721 + young = 1;
  722 + clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
  723 + }
  724 + spte = rmap_next(kvm, rmapp, spte);
  725 + }
  726 + return young;
  727 +}
  728 +
  729 +int kvm_age_hva(struct kvm *kvm, unsigned long hva)
  730 +{
  731 + return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
  732 +}
  733 +
656 734 #ifdef MMU_DEBUG
657 735 static int is_empty_shadow_page(u64 *spt)
658 736 {
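
For reference, the hva-to-rmap lookup in kvm_handle_hva() above is plain offset arithmetic. An illustrative fragment with made-up values, assuming 4K pages and KVM_PAGES_PER_HPAGE == 512 (x86 with 2M large pages):

/* Editor's illustration with made-up numbers, not part of the commit. */
unsigned long start = 0x7f0000000000UL;            /* memslot->userspace_addr */
unsigned long hva   = 0x7f0000042000UL;            /* address being invalidated */
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;    /* 0x42000 >> 12 = 66 */

/* 4K rmap chain for page 66 of the slot ... */
retval |= handler(kvm, &memslot->rmap[gfn_offset]);
/* ... and the 2M rmap_pde covering it: 66 / 512 == 0, the slot's first large page */
retval |= handler(kvm, &memslot->lpage_info[gfn_offset / KVM_PAGES_PER_HPAGE].rmap_pde);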
... ... @@ -1203,6 +1281,7 @@
1203 1281 int r;
1204 1282 int largepage = 0;
1205 1283 pfn_t pfn;
  1284 + unsigned long mmu_seq;
1206 1285  
1207 1286 down_read(&current->mm->mmap_sem);
1208 1287 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
... ... @@ -1210,6 +1289,8 @@
1210 1289 largepage = 1;
1211 1290 }
1212 1291  
  1292 + mmu_seq = vcpu->kvm->mmu_notifier_seq;
  1293 + /* implicit mb(), we'll read before PT lock is unlocked */
1213 1294 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1214 1295 up_read(&current->mm->mmap_sem);
1215 1296  
... ... @@ -1220,6 +1301,8 @@
1220 1301 }
1221 1302  
1222 1303 spin_lock(&vcpu->kvm->mmu_lock);
  1304 + if (mmu_notifier_retry(vcpu, mmu_seq))
  1305 + goto out_unlock;
1223 1306 kvm_mmu_free_some_pages(vcpu);
1224 1307 r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
1225 1308 PT32E_ROOT_LEVEL);
... ... @@ -1227,6 +1310,11 @@
1227 1310  
1228 1311  
1229 1312 return r;
  1313 +
  1314 +out_unlock:
  1315 + spin_unlock(&vcpu->kvm->mmu_lock);
  1316 + kvm_release_pfn_clean(pfn);
  1317 + return 0;
1230 1318 }
1231 1319  
1232 1320  
... ... @@ -1345,6 +1433,7 @@
1345 1433 int r;
1346 1434 int largepage = 0;
1347 1435 gfn_t gfn = gpa >> PAGE_SHIFT;
  1436 + unsigned long mmu_seq;
1348 1437  
1349 1438 ASSERT(vcpu);
1350 1439 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
... ... @@ -1358,6 +1447,8 @@
1358 1447 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1359 1448 largepage = 1;
1360 1449 }
  1450 + mmu_seq = vcpu->kvm->mmu_notifier_seq;
  1451 + /* implicit mb(), we'll read before PT lock is unlocked */
1361 1452 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1362 1453 up_read(&current->mm->mmap_sem);
1363 1454 if (is_error_pfn(pfn)) {
1364 1455  
... ... @@ -1365,12 +1456,19 @@
1365 1456 return 1;
1366 1457 }
1367 1458 spin_lock(&vcpu->kvm->mmu_lock);
  1459 + if (mmu_notifier_retry(vcpu, mmu_seq))
  1460 + goto out_unlock;
1368 1461 kvm_mmu_free_some_pages(vcpu);
1369 1462 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1370 1463 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
1371 1464 spin_unlock(&vcpu->kvm->mmu_lock);
1372 1465  
1373 1466 return r;
  1467 +
  1468 +out_unlock:
  1469 + spin_unlock(&vcpu->kvm->mmu_lock);
  1470 + kvm_release_pfn_clean(pfn);
  1471 + return 0;
1374 1472 }
1375 1473  
1376 1474 static void nonpaging_free(struct kvm_vcpu *vcpu)
... ... @@ -1670,6 +1768,8 @@
1670 1768 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1671 1769 vcpu->arch.update_pte.largepage = 1;
1672 1770 }
  1771 + vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
  1772 + /* implicit mb(), we'll read before PT lock is unlocked */
1673 1773 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1674 1774 up_read(&current->mm->mmap_sem);
1675 1775  
arch/x86/kvm/paging_tmpl.h
... ... @@ -263,6 +263,8 @@
263 263 pfn = vcpu->arch.update_pte.pfn;
264 264 if (is_error_pfn(pfn))
265 265 return;
  266 + if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
  267 + return;
266 268 kvm_get_pfn(pfn);
267 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
268 270 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
... ... @@ -380,6 +382,7 @@
380 382 int r;
381 383 pfn_t pfn;
382 384 int largepage = 0;
  385 + unsigned long mmu_seq;
383 386  
384 387 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
385 388 kvm_mmu_audit(vcpu, "pre page fault");
... ... @@ -413,6 +416,8 @@
413 416 largepage = 1;
414 417 }
415 418 }
  419 + mmu_seq = vcpu->kvm->mmu_notifier_seq;
  420 + /* implicit mb(), we'll read before PT lock is unlocked */
416 421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
417 422 up_read(&current->mm->mmap_sem);
418 423  
... ... @@ -424,6 +429,8 @@
424 429 }
425 430  
426 431 spin_lock(&vcpu->kvm->mmu_lock);
  432 + if (mmu_notifier_retry(vcpu, mmu_seq))
  433 + goto out_unlock;
427 434 kvm_mmu_free_some_pages(vcpu);
428 435 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
429 436 largepage, &write_pt, pfn);
... ... @@ -439,6 +446,11 @@
439 446 spin_unlock(&vcpu->kvm->mmu_lock);
440 447  
441 448 return write_pt;
  449 +
  450 +out_unlock:
  451 + spin_unlock(&vcpu->kvm->mmu_lock);
  452 + kvm_release_pfn_clean(pfn);
  453 + return 0;
442 454 }
443 455  
444 456 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
include/asm-x86/kvm_host.h
... ... @@ -13,6 +13,7 @@
13 13  
14 14 #include <linux/types.h>
15 15 #include <linux/mm.h>
  16 +#include <linux/mmu_notifier.h>
16 17  
17 18 #include <linux/kvm.h>
18 19 #include <linux/kvm_para.h>
... ... @@ -251,6 +252,7 @@
251 252 gfn_t gfn; /* presumed gfn during guest pte update */
252 253 pfn_t pfn; /* pfn corresponding to that gfn */
253 254 int largepage;
  255 + unsigned long mmu_seq;
254 256 } update_pte;
255 257  
256 258 struct i387_fxsave_struct host_fx_image;
... ... @@ -728,6 +730,10 @@
728 730 ".pushsection __ex_table, \"a\" \n\t" \
729 731 KVM_EX_ENTRY " 666b, 667b \n\t" \
730 732 ".popsection"
  733 +
  734 +#define KVM_ARCH_WANT_MMU_NOTIFIER
  735 +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
  736 +int kvm_age_hva(struct kvm *kvm, unsigned long hva);
731 737  
732 738 #endif
include/linux/kvm_host.h
... ... @@ -121,6 +121,12 @@
121 121 struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
122 122 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
123 123 #endif
  124 +
  125 +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
  126 + struct mmu_notifier mmu_notifier;
  127 + unsigned long mmu_notifier_seq;
  128 + long mmu_notifier_count;
  129 +#endif
124 130 };
125 131  
126 132 /* The guest did something we don't support. */
... ... @@ -330,6 +336,24 @@
330 336 return -EINVAL;
331 337 }
332 338 #define kvm_trace_cleanup() ((void)0)
  339 +#endif
  340 +
  341 +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
  342 +static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
  343 +{
  344 + if (unlikely(vcpu->kvm->mmu_notifier_count))
  345 + return 1;
  346 + /*
  347 + * Both reads happen under the mmu_lock and both values are
  348 + * modified under mmu_lock, so there's no need for an smp_rmb()
  349 + * here in between, otherwise mmu_notifier_count should be
  350 + * read before mmu_notifier_seq, see
  351 + * mmu_notifier_invalidate_range_end write side.
  352 + */
  353 + if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
  354 + return 1;
  355 + return 0;
  356 +}
333 357 #endif
334 358  
335 359 #endif
virt/kvm/kvm_main.c
... ... @@ -192,6 +192,123 @@
192 192 }
193 193 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
194 194  
  195 +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
  196 +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
  197 +{
  198 + return container_of(mn, struct kvm, mmu_notifier);
  199 +}
  200 +
  201 +static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
  202 + struct mm_struct *mm,
  203 + unsigned long address)
  204 +{
  205 + struct kvm *kvm = mmu_notifier_to_kvm(mn);
  206 + int need_tlb_flush;
  207 +
  208 + /*
  209 + * When ->invalidate_page runs, the linux pte has been zapped
  210 + * already but the page is still allocated until
  211 + * ->invalidate_page returns. So if we increase the sequence
  212 + * here the kvm page fault will notice if the spte can't be
  213 + * established because the page is going to be freed. If
  214 + * instead the kvm page fault establishes the spte before
  215 + * ->invalidate_page runs, kvm_unmap_hva will release it
  216 + * before returning.
  217 + *
  218 + * The sequence increase only needs to be seen at spin_unlock
  219 + * time, not at spin_lock time.
  220 + *
  221 + * Increasing the sequence after the spin_unlock would be
  222 + * unsafe because the kvm page fault could then establish the
  223 + * pte after kvm_unmap_hva returned, without noticing the page
  224 + * is going to be freed.
  225 + */
  226 + spin_lock(&kvm->mmu_lock);
  227 + kvm->mmu_notifier_seq++;
  228 + need_tlb_flush = kvm_unmap_hva(kvm, address);
  229 + spin_unlock(&kvm->mmu_lock);
  230 +
  231 + /* we have to flush the tlb before the pages can be freed */
  232 + if (need_tlb_flush)
  233 + kvm_flush_remote_tlbs(kvm);
  234 +
  235 +}
  236 +
  237 +static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
  238 + struct mm_struct *mm,
  239 + unsigned long start,
  240 + unsigned long end)
  241 +{
  242 + struct kvm *kvm = mmu_notifier_to_kvm(mn);
  243 + int need_tlb_flush = 0;
  244 +
  245 + spin_lock(&kvm->mmu_lock);
  246 + /*
  247 + * The count increase must become visible at unlock time as no
  248 + * spte can be established without taking the mmu_lock and
  249 + * count is also read inside the mmu_lock critical section.
  250 + */
  251 + kvm->mmu_notifier_count++;
  252 + for (; start < end; start += PAGE_SIZE)
  253 + need_tlb_flush |= kvm_unmap_hva(kvm, start);
  254 + spin_unlock(&kvm->mmu_lock);
  255 +
  256 + /* we have to flush the tlb before the pages can be freed */
  257 + if (need_tlb_flush)
  258 + kvm_flush_remote_tlbs(kvm);
  259 +}
  260 +
  261 +static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
  262 + struct mm_struct *mm,
  263 + unsigned long start,
  264 + unsigned long end)
  265 +{
  266 + struct kvm *kvm = mmu_notifier_to_kvm(mn);
  267 +
  268 + spin_lock(&kvm->mmu_lock);
  269 + /*
  270 + * This sequence increase will notify the kvm page fault that
  271 + * the page that is going to be mapped in the spte could have
  272 + * been freed.
  273 + */
  274 + kvm->mmu_notifier_seq++;
  275 + /*
  276 + * The above sequence increase must be visible before the
  277 + * below count decrease but both values are read by the kvm
  278 + * page fault under mmu_lock spinlock so we don't need to add
  279 + * an smp_wmb() here in between the two.
  280 + */
  281 + kvm->mmu_notifier_count--;
  282 + spin_unlock(&kvm->mmu_lock);
  283 +
  284 + BUG_ON(kvm->mmu_notifier_count < 0);
  285 +}
  286 +
  287 +static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
  288 + struct mm_struct *mm,
  289 + unsigned long address)
  290 +{
  291 + struct kvm *kvm = mmu_notifier_to_kvm(mn);
  292 + int young;
  293 +
  294 + spin_lock(&kvm->mmu_lock);
  295 + young = kvm_age_hva(kvm, address);
  296 + spin_unlock(&kvm->mmu_lock);
  297 +
  298 + if (young)
  299 + kvm_flush_remote_tlbs(kvm);
  300 +
  301 + return young;
  302 +}
  303 +
  304 +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
  305 + .invalidate_page = kvm_mmu_notifier_invalidate_page,
  306 + .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
  307 + .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
  308 + .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
  309 +};
  310 +#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
  311 +
195 312 static struct kvm *kvm_create_vm(void)
196 313 {
197 314 struct kvm *kvm = kvm_arch_create_vm();
... ... @@ -212,6 +329,21 @@
212 329 (struct kvm_coalesced_mmio_ring *)page_address(page);
213 330 #endif
214 331  
  332 +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
  333 + {
  334 + int err;
  335 + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
  336 + err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
  337 + if (err) {
  338 +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
  339 + put_page(page);
  340 +#endif
  341 + kfree(kvm);
  342 + return ERR_PTR(err);
  343 + }
  344 + }
  345 +#endif
  346 +
215 347 kvm->mm = current->mm;
216 348 atomic_inc(&kvm->mm->mm_count);
217 349 spin_lock_init(&kvm->mmu_lock);
... ... @@ -271,6 +403,9 @@
271 403 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
272 404 if (kvm->coalesced_mmio_ring != NULL)
273 405 free_page((unsigned long)kvm->coalesced_mmio_ring);
  406 +#endif
  407 +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
  408 + mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
274 409 #endif
275 410 kvm_arch_destroy_vm(kvm);
276 411 mmdrop(mm);