Commit e930bffe95e1e886a1ede80726ea38df5838d067
Committed by: Avi Kivity
Parent: 604b38ac03
Exists in: master and 4 other branches
KVM: Synchronize guest physical memory map to host virtual memory map

Synchronize changes to host virtual addresses which are part of a KVM
memory slot to the KVM shadow mmu. This allows pte operations like
swapping, page migration, and madvise() to transparently work with KVM.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
Showing 5 changed files with 277 additions and 0 deletions
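The mechanism in brief: struct kvm gains two fields, mmu_notifier_seq
(incremented whenever an invalidation has run) and mmu_notifier_count
(nonzero while a range invalidation is in flight). Each shadow page fault
samples the sequence before resolving the gfn to a pfn, then re-checks both
fields under mmu_lock before installing the spte, retrying the fault if
either check fails. A minimal sketch of that pattern, condensed from the
three fault paths changed below (fault_path_sketch is an illustrative name,
not a function in this commit):

	static int fault_path_sketch(struct kvm_vcpu *vcpu, gfn_t gfn)
	{
		unsigned long mmu_seq;
		pfn_t pfn;

		/* sample the sequence before gfn_to_pfn(), which can sleep */
		mmu_seq = vcpu->kvm->mmu_notifier_seq;
		pfn = gfn_to_pfn(vcpu->kvm, gfn);

		spin_lock(&vcpu->kvm->mmu_lock);
		if (mmu_notifier_retry(vcpu, mmu_seq)) {
			/* an invalidation ran or is running: drop the pfn
			 * and let the guest re-fault */
			spin_unlock(&vcpu->kvm->mmu_lock);
			kvm_release_pfn_clean(pfn);
			return 0;
		}
		/* ... install the spte under mmu_lock ... */
		spin_unlock(&vcpu->kvm->mmu_lock);
		return 0;
	}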
arch/x86/kvm/mmu.c

@@ -653,6 +653,84 @@
 	account_shadowed(kvm, gfn);
 }
 
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int need_tlb_flush = 0;
+
+	while ((spte = rmap_next(kvm, rmapp, NULL))) {
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("kvm_unmap_rmapp: spte %p %llx\n", spte, *spte);
+		rmap_remove(kvm, spte);
+		set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+		need_tlb_flush = 1;
+	}
+	return need_tlb_flush;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+{
+	int i;
+	int retval = 0;
+
+	/*
+	 * If mmap_sem isn't taken, we can look up the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+			retval |= handler(kvm,
+					  &memslot->lpage_info[
+						  gfn_offset /
+						  KVM_PAGES_PER_HPAGE].rmap_pde);
+		}
+	}
+
+	return retval;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int young = 0;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _young;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_young = _spte & PT_ACCESSED_MASK;
+		if (_young) {
+			young = 1;
+			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {

@@ -1203,6 +1281,7 @@
 	int r;
 	int largepage = 0;
 	pfn_t pfn;
+	unsigned long mmu_seq;
 
 	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {

@@ -1210,6 +1289,8 @@
 		largepage = 1;
 	}
 
+	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/* implicit mb(), we'll read before PT lock is unlocked */
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 

@@ -1220,6 +1301,8 @@
 	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (mmu_notifier_retry(vcpu, mmu_seq))
+		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
 			 PT32E_ROOT_LEVEL);

@@ -1227,6 +1310,11 @@
 
 
 	return r;
+
+out_unlock:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	return 0;
 }
 
 

@@ -1345,6 +1433,7 @@
 	int r;
 	int largepage = 0;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
+	unsigned long mmu_seq;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

@@ -1358,6 +1447,8 @@
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
+	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/* implicit mb(), we'll read before PT lock is unlocked */
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 	if (is_error_pfn(pfn)) {

@@ -1365,12 +1456,19 @@
 		return 1;
 	}
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (mmu_notifier_retry(vcpu, mmu_seq))
+		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
 			 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
+
+out_unlock:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	return 0;
 }
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)

@@ -1670,6 +1768,8 @@
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
+	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/* implicit mb(), we'll read before PT lock is unlocked */
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
arch/x86/kvm/paging_tmpl.h

@@ -263,6 +263,8 @@
 	pfn = vcpu->arch.update_pte.pfn;
 	if (is_error_pfn(pfn))
 		return;
+	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
+		return;
 	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),

@@ -380,6 +382,7 @@
 	int r;
 	pfn_t pfn;
 	int largepage = 0;
+	unsigned long mmu_seq;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");

@@ -413,6 +416,8 @@
 			largepage = 1;
 		}
 	}
+	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/* implicit mb(), we'll read before PT lock is unlocked */
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 

@@ -424,6 +429,8 @@
 	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (mmu_notifier_retry(vcpu, mmu_seq))
+		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 				  largepage, &write_pt, pfn);

@@ -439,6 +446,11 @@
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return write_pt;
+
+out_unlock:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	return 0;
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
include/asm-x86/kvm_host.h

@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>

@@ -251,6 +252,7 @@
 		gfn_t gfn;	/* presumed gfn during guest pte update */
 		pfn_t pfn;	/* pfn corresponding to that gfn */
 		int largepage;
+		unsigned long mmu_seq;
 	} update_pte;
 
 	struct i387_fxsave_struct host_fx_image;

@@ -728,6 +730,10 @@
 	".pushsection __ex_table, \"a\" \n\t" \
 	KVM_EX_ENTRY " 666b, 667b \n\t" \
 	".popsection"
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 
 #endif
include/linux/kvm_host.h

@@ -121,6 +121,12 @@
 	struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
 	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
 #endif
+
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+	struct mmu_notifier mmu_notifier;
+	unsigned long mmu_notifier_seq;
+	long mmu_notifier_count;
+#endif
 };
 
 /* The guest did something we don't support. */

@@ -330,6 +336,24 @@
 	return -EINVAL;
 }
 #define kvm_trace_cleanup() ((void)0)
+#endif
+
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
+{
+	if (unlikely(vcpu->kvm->mmu_notifier_count))
+		return 1;
+	/*
+	 * Both reads happen under the mmu_lock and both values are
+	 * modified under mmu_lock, so there's no need for smp_rmb()
+	 * in between; otherwise mmu_notifier_count would have to be
+	 * read before mmu_notifier_seq, see the
+	 * mmu_notifier_invalidate_range_end write side.
+	 */
+	if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
+		return 1;
+	return 0;
+}
 #endif
 
 #endif
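To see why mmu_notifier_retry() needs both checks, consider an illustrative
interleaving (not part of the patch) between a page fault and a range
invalidation:

	/*
	 *   fault path                        invalidate_range_start/end
	 *   ----------                        --------------------------
	 *   mmu_seq = kvm->mmu_notifier_seq
	 *   pfn = gfn_to_pfn()                mmu_notifier_count++  (start)
	 *                                     kvm_unmap_hva(...)
	 *   spin_lock(mmu_lock)
	 *   mmu_notifier_retry() -> 1         ...teardown still in flight...
	 *     (count != 0, so retry)
	 *                                     mmu_notifier_seq++    (end)
	 *                                     mmu_notifier_count--
	 *
	 * If the fault instead takes mmu_lock only after range_end has
	 * completed, count is zero again, but the sequence check
	 * (mmu_notifier_seq != mmu_seq) still forces a retry.
	 */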
virt/kvm/kvm_main.c

@@ -192,6 +192,123 @@
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+	return container_of(mn, struct kvm, mmu_notifier);
+}
+
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+					     struct mm_struct *mm,
+					     unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int need_tlb_flush;
+
+	/*
+	 * When ->invalidate_page runs, the linux pte has been zapped
+	 * already but the page is still allocated until
+	 * ->invalidate_page returns. So if we increase the sequence
+	 * here the kvm page fault will notice if the spte can't be
+	 * established because the page is going to be freed. If
+	 * instead the kvm page fault establishes the spte before
+	 * ->invalidate_page runs, kvm_unmap_hva will release it
+	 * before returning.
+	 *
+	 * The sequence increase only needs to be seen at spin_unlock
+	 * time, and not at spin_lock time.
+	 *
+	 * Increasing the sequence after the spin_unlock would be
+	 * unsafe because the kvm page fault could then establish the
+	 * pte after kvm_unmap_hva returned, without noticing the page
+	 * is going to be freed.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	kvm->mmu_notifier_seq++;
+	need_tlb_flush = kvm_unmap_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+
+	/* we have to flush the tlb before the pages can be freed */
+	if (need_tlb_flush)
+		kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+						    struct mm_struct *mm,
+						    unsigned long start,
+						    unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int need_tlb_flush = 0;
+
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * The count increase must become visible at unlock time as no
+	 * spte can be established without taking the mmu_lock and
+	 * count is also read inside the mmu_lock critical section.
+	 */
+	kvm->mmu_notifier_count++;
+	for (; start < end; start += PAGE_SIZE)
+		need_tlb_flush |= kvm_unmap_hva(kvm, start);
+	spin_unlock(&kvm->mmu_lock);
+
+	/* we have to flush the tlb before the pages can be freed */
+	if (need_tlb_flush)
+		kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+						  struct mm_struct *mm,
+						  unsigned long start,
+						  unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * This sequence increase will notify the kvm page fault that
+	 * the page that is going to be mapped in the spte could have
+	 * been freed.
+	 */
+	kvm->mmu_notifier_seq++;
+	/*
+	 * The above sequence increase must be visible before the
+	 * below count decrease, but both values are read by the kvm
+	 * page fault under the mmu_lock spinlock, so we don't need to
+	 * add a smp_wmb() here in between the two.
+	 */
+	kvm->mmu_notifier_count--;
+	spin_unlock(&kvm->mmu_lock);
+
+	BUG_ON(kvm->mmu_notifier_count < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+					      struct mm_struct *mm,
+					      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young;
+
+	spin_lock(&kvm->mmu_lock);
+	young = kvm_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+
+	if (young)
+		kvm_flush_remote_tlbs(kvm);
+
+	return young;
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
+	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
+	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
+	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+};
+#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+
 static struct kvm *kvm_create_vm(void)
 {
 	struct kvm *kvm = kvm_arch_create_vm();

@@ -212,6 +329,21 @@
 		(struct kvm_coalesced_mmio_ring *)page_address(page);
 #endif
 
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	{
+		int err;
+		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
+		err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+		if (err) {
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+			put_page(page);
+#endif
+			kfree(kvm);
+			return ERR_PTR(err);
+		}
+	}
+#endif
+
 	kvm->mm = current->mm;
 	atomic_inc(&kvm->mm->mm_count);
 	spin_lock_init(&kvm->mmu_lock);

@@ -271,6 +403,9 @@
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 	if (kvm->coalesced_mmio_ring != NULL)
 		free_page((unsigned long)kvm->coalesced_mmio_ring);
+#endif
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #endif
 	kvm_arch_destroy_vm(kvm);
 	mmdrop(mm);
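For context, these hooks are driven by the core mm during reclaim, swapping,
page migration, and madvise(); a rough sketch of the flow (simplified, and
the exact call sites in mm/ vary by kernel version):

	/*
	 *   page_referenced()              -> ->clear_flush_young()
	 *       kvm_age_hva(): harvest and clear the accessed bits in
	 *       the sptes, flushing the TLB if any were set
	 *
	 *   try_to_unmap(), swap-out,      -> ->invalidate_page() or
	 *   page migration, madvise()         ->invalidate_range_start/end()
	 *       kvm_unmap_hva(): zap the sptes and flush the TLB before
	 *       the underlying pages are freed
	 */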