Commit 6606c3e0da5360799e07ae24b05080cc85c68e72

Authored by Zachary Amsden
Committed by Linus Torvalds
1 parent 9888a1cae3

[PATCH] paravirt: lazy mmu mode hooks.patch

Implement lazy MMU update hooks which are SMP safe for both direct and shadow
page tables.  The idea is that PTE updates and page invalidations while in
lazy mode can be batched into a single hypercall.  We use this in VMI for
shadow page table synchronization, and it is a win.  It also can be used by
PPC and for direct page tables on Xen.
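As a rough sketch of the batching idea (illustration only, not code from this patch; the per-CPU queue layout and the hv_set_pte()/hv_flush_queue() hypercall helpers are assumed names), a shadow-mode backend could implement its side of the hooks like this:

/*
 * Illustrative sketch only.  The queue size and the hypercall helpers
 * (hv_set_pte, hv_flush_queue) are assumptions, not part of this patch.
 */
struct pte_batch {
        int     active;
        int     nr;
        struct { pte_t *ptep; pte_t val; } upd[32];
};
static DEFINE_PER_CPU(struct pte_batch, pte_batch);

static void example_flush(struct pte_batch *b)
{
        if (b->nr)
                hv_flush_queue(b->upd, b->nr);  /* one batched hypercall */
        b->nr = 0;
}

static void example_enter_lazy_mmu(void)       /* arch_enter_lazy_mmu_mode() */
{
        /* Caller holds the page table lock, so preemption is already off. */
        __get_cpu_var(pte_batch).active = 1;
}

static void example_set_pte(pte_t *ptep, pte_t val)
{
        struct pte_batch *b = &__get_cpu_var(pte_batch);

        if (!b->active) {
                hv_set_pte(ptep, val);          /* immediate single-PTE hypercall */
                return;
        }
        if (b->nr == ARRAY_SIZE(b->upd))
                example_flush(b);               /* queue full, flush early */
        b->upd[b->nr].ptep = ptep;
        b->upd[b->nr].val = val;
        b->nr++;
}

static void example_leave_lazy_mmu(void)       /* arch_leave_lazy_mmu_mode() */
{
        struct pte_batch *b = &__get_cpu_var(pte_batch);

        example_flush(b);
        b->active = 0;
}

Each PTE update issued inside the window then costs only a queue append, and the whole run is pushed to the hypervisor once on leave.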

For SMP, the enter / leave must happen under protection of the page table
locks for page tables which are being modified.  This is because otherwise,
you end up with stale state in the batched hypercall, which other CPUs can
race ahead of.  Doing this under the protection of the locks guarantees the
synchronization is correct, and also means that spurious faults which are
generated during this window by remote CPUs are properly handled, as the page
fault handler must re-check the PTE under protection of the same lock.
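That re-check already exists in the generic fault paths; a simplified sketch of the pattern (paraphrasing the pte_same() checks in mm/memory.c, not code added by this patch):

        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (unlikely(!pte_same(*page_table, orig_pte))) {
                /*
                 * The PTE changed while the fault was being handled -- for
                 * example, a remote CPU left lazy mode and its batched
                 * update became visible under this same lock.  The fault
                 * is spurious, so just back out and let the access retry.
                 */
                pte_unmap_unlock(page_table, ptl);
                return VM_FAULT_MINOR;
        }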

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 32 additions and 0 deletions

include/asm-generic/pgtable.h
... ... @@ -171,6 +171,26 @@
171 171 #endif
172 172  
173 173 /*
  174 + * A facility to provide lazy MMU batching. This allows PTE updates and
  175 + * page invalidations to be delayed until a call to leave lazy MMU mode
  176 + * is issued. Some architectures may benefit from doing this, and it is
  177 + * beneficial for both shadow and direct mode hypervisors, which may batch
  178 + * the PTE updates which happen during this window. Note that using this
  179 + * interface requires that read hazards be removed from the code. A read
  180 + * hazard could result in the direct mode hypervisor case, since the actual
  181 + * write to the page tables may not yet have taken place, so reads through
  182 + * a raw PTE pointer after it has been modified are not guaranteed to be
  183 + * up to date. This mode can only be entered and left under the protection of
  184 + * the page table locks for all page tables which may be modified. In the UP
  185 + * case, this is required so that preemption is disabled, and in the SMP case,
  186 + * it must synchronize the delayed page table writes properly on other CPUs.
  187 + */
  188 +#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
  189 +#define arch_enter_lazy_mmu_mode() do {} while (0)
  190 +#define arch_leave_lazy_mmu_mode() do {} while (0)
  191 +#endif
  192 +
  193 +/*
174 194 * When walking page tables, get the address of the next boundary,
175 195 * or the end address of the range if that comes earlier. Although no
176 196 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
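An architecture that wants the batching overrides these no-op defaults from its own asm/pgtable.h before pulling in asm-generic/pgtable.h, roughly as follows (a sketch; the paravirt_enter_lazy_mmu()/paravirt_leave_lazy_mmu() helpers are assumed names, not introduced by this patch):

        #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
        #define arch_enter_lazy_mmu_mode()      paravirt_enter_lazy_mmu()
        #define arch_leave_lazy_mmu_mode()      paravirt_leave_lazy_mmu()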
mm/memory.c
... ... @@ -506,6 +506,7 @@
506 506 src_pte = pte_offset_map_nested(src_pmd, addr);
507 507 src_ptl = pte_lockptr(src_mm, src_pmd);
508 508 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  509 + arch_enter_lazy_mmu_mode();
509 510  
510 511 do {
511 512 /*
... ... @@ -527,6 +528,7 @@
527 528 progress += 8;
528 529 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
529 530  
  531 + arch_leave_lazy_mmu_mode();
530 532 spin_unlock(src_ptl);
531 533 pte_unmap_nested(src_pte - 1);
532 534 add_mm_rss(dst_mm, rss[0], rss[1]);
... ... @@ -628,6 +630,7 @@
628 630 int anon_rss = 0;
629 631  
630 632 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  633 + arch_enter_lazy_mmu_mode();
631 634 do {
632 635 pte_t ptent = *pte;
633 636 if (pte_none(ptent)) {
... ... @@ -694,6 +697,7 @@
694 697 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
695 698  
696 699 add_mm_rss(mm, file_rss, anon_rss);
  700 + arch_leave_lazy_mmu_mode();
697 701 pte_unmap_unlock(pte - 1, ptl);
698 702  
699 703 return addr;
... ... @@ -1109,6 +1113,7 @@
1109 1113 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1110 1114 if (!pte)
1111 1115 return -ENOMEM;
  1116 + arch_enter_lazy_mmu_mode();
1112 1117 do {
1113 1118 struct page *page = ZERO_PAGE(addr);
1114 1119 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
... ... @@ -1118,6 +1123,7 @@
1118 1123 BUG_ON(!pte_none(*pte));
1119 1124 set_pte_at(mm, addr, pte, zero_pte);
1120 1125 } while (pte++, addr += PAGE_SIZE, addr != end);
  1126 + arch_leave_lazy_mmu_mode();
1121 1127 pte_unmap_unlock(pte - 1, ptl);
1122 1128 return 0;
1123 1129 }
1124 1130  
... ... @@ -1275,11 +1281,13 @@
1275 1281 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1276 1282 if (!pte)
1277 1283 return -ENOMEM;
  1284 + arch_enter_lazy_mmu_mode();
1278 1285 do {
1279 1286 BUG_ON(!pte_none(*pte));
1280 1287 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1281 1288 pfn++;
1282 1289 } while (pte++, addr += PAGE_SIZE, addr != end);
  1290 + arch_leave_lazy_mmu_mode();
1283 1291 pte_unmap_unlock(pte - 1, ptl);
1284 1292 return 0;
1285 1293 }
mm/mprotect.c
... ... @@ -34,6 +34,7 @@
34 34 spinlock_t *ptl;
35 35  
36 36 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  37 + arch_enter_lazy_mmu_mode();
37 38 do {
38 39 oldpte = *pte;
39 40 if (pte_present(oldpte)) {
... ... @@ -70,6 +71,7 @@
70 71 }
71 72  
72 73 } while (pte++, addr += PAGE_SIZE, addr != end);
  74 + arch_leave_lazy_mmu_mode();
73 75 pte_unmap_unlock(pte - 1, ptl);
74 76 }
75 77  
mm/mremap.c
... ... @@ -98,6 +98,7 @@
98 98 new_ptl = pte_lockptr(mm, new_pmd);
99 99 if (new_ptl != old_ptl)
100 100 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
  101 + arch_enter_lazy_mmu_mode();
101 102  
102 103 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
103 104 new_pte++, new_addr += PAGE_SIZE) {
... ... @@ -109,6 +110,7 @@
109 110 set_pte_at(mm, new_addr, new_pte, pte);
110 111 }
111 112  
  113 + arch_leave_lazy_mmu_mode();
112 114 if (new_ptl != old_ptl)
113 115 spin_unlock(new_ptl);
114 116 pte_unmap_nested(new_pte - 1);
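Taken together, each converted PTE loop now follows the same shape (a recap of the hunks above, not additional code):

        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
        do {
                /* set_pte_at() / ptep_get_and_clear() updates batched here */
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);

with the nested-lock variants in copy_pte_range() and move_ptes() bracketing their loops the same way inside the existing spin_lock_nested() / spin_unlock() pairs.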