Commit 6606c3e0da5360799e07ae24b05080cc85c68e72

Authored by Zachary Amsden
Committed by Linus Torvalds
1 parent 9888a1cae3

[PATCH] paravirt: lazy mmu mode hooks.patch

Implement lazy MMU update hooks which are SMP safe for both direct and shadow
page tables.  The idea is that PTE updates and page invalidations while in
lazy mode can be batched into a single hypercall.  We use this in VMI for
shadow page table synchronization, and it is a win.  It also can be used by
PPC and for direct page tables on Xen.
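As a rough sketch of the batching idea (illustration only, not code from this patch; the per-CPU queue layout and the hv_set_pte()/hv_flush_queue() hypercall helpers are assumed names), a shadow-mode backend could implement its side of the hooks like this:

/*
 * Illustrative sketch only.  The queue size and the hypercall helpers
 * (hv_set_pte, hv_flush_queue) are assumptions, not part of this patch.
 */
struct pte_batch {
        int     active;
        int     nr;
        struct { pte_t *ptep; pte_t val; } upd[32];
};
static DEFINE_PER_CPU(struct pte_batch, pte_batch);

static void example_flush(struct pte_batch *b)
{
        if (b->nr)
                hv_flush_queue(b->upd, b->nr);  /* one batched hypercall */
        b->nr = 0;
}

static void example_enter_lazy_mmu(void)       /* arch_enter_lazy_mmu_mode() */
{
        /* Caller holds the page table lock, so preemption is already off. */
        __get_cpu_var(pte_batch).active = 1;
}

static void example_set_pte(pte_t *ptep, pte_t val)
{
        struct pte_batch *b = &__get_cpu_var(pte_batch);

        if (!b->active) {
                hv_set_pte(ptep, val);          /* immediate single-PTE hypercall */
                return;
        }
        if (b->nr == ARRAY_SIZE(b->upd))
                example_flush(b);               /* queue full, flush early */
        b->upd[b->nr].ptep = ptep;
        b->upd[b->nr].val = val;
        b->nr++;
}

static void example_leave_lazy_mmu(void)       /* arch_leave_lazy_mmu_mode() */
{
        struct pte_batch *b = &__get_cpu_var(pte_batch);

        example_flush(b);
        b->active = 0;
}

Each PTE update issued inside the window then costs only a queue append, and the whole run is pushed to the hypervisor once on leave.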

For SMP, the enter / leave must happen under protection of the page table
locks for page tables which are being modified.  This is because otherwise,
you end up with stale state in the batched hypercall, which other CPUs can
race ahead of.  Doing this under the protection of the locks guarantees the
synchronization is correct, and also means that spurious faults which are
generated during this window by remote CPUs are properly handled, as the page
fault handler must re-check the PTE under protection of the same lock.
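That re-check already exists in the generic fault paths; a simplified sketch of the pattern (paraphrasing the pte_same() checks in mm/memory.c, not code added by this patch):

        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (unlikely(!pte_same(*page_table, orig_pte))) {
                /*
                 * The PTE changed while the fault was being handled -- for
                 * example, a remote CPU left lazy mode and its batched
                 * update became visible under this same lock.  The fault
                 * is spurious, so just back out and let the access retry.
                 */
                pte_unmap_unlock(page_table, ptl);
                return VM_FAULT_MINOR;
        }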

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 32 additions and 0 deletions

include/asm-generic/pgtable.h
... ... @@ -171,6 +171,26 @@
171 171 #endif
172 172  
173 173 /*
  174 + * A facility to provide lazy MMU batching. This allows PTE updates and
  175 + * page invalidations to be delayed until a call to leave lazy MMU mode
  176 + * is issued. Some architectures may benefit from doing this, and it is
  177 + * beneficial for both shadow and direct mode hypervisors, which may batch
  178 + * the PTE updates which happen during this window. Note that using this
  179 + * interface requires that read hazards be removed from the code. A read
  180 + * hazard could result in the direct mode hypervisor case, since the actual
  181 + * write to the page tables may not yet have taken place, so reads through
  182 + * a raw PTE pointer after it has been modified are not guaranteed to be
  183 + * up to date. This mode can only be entered and left under the protection of
  184 + * the page table locks for all page tables which may be modified. In the UP
  185 + * case, this is required so that preemption is disabled, and in the SMP case,
  186 + * it must synchronize the delayed page table writes properly on other CPUs.
  187 + */
  188 +#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
  189 +#define arch_enter_lazy_mmu_mode() do {} while (0)
  190 +#define arch_leave_lazy_mmu_mode() do {} while (0)
  191 +#endif
  192 +
  193 +/*
174 194 * When walking page tables, get the address of the next boundary,
175 195 * or the end address of the range if that comes earlier. Although no
176 196 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
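An architecture that wants the batching overrides these no-op defaults from its own asm/pgtable.h before pulling in asm-generic/pgtable.h, roughly as follows (a sketch; the paravirt_enter_lazy_mmu()/paravirt_leave_lazy_mmu() helpers are assumed names, not introduced by this patch):

        #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
        #define arch_enter_lazy_mmu_mode()      paravirt_enter_lazy_mmu()
        #define arch_leave_lazy_mmu_mode()      paravirt_leave_lazy_mmu()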
mm/memory.c
... ... @@ -506,6 +506,7 @@
506 506 src_pte = pte_offset_map_nested(src_pmd, addr);
507 507 src_ptl = pte_lockptr(src_mm, src_pmd);
508 508 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  509 + arch_enter_lazy_mmu_mode();
509 510  
510 511 do {
511 512 /*
... ... @@ -527,6 +528,7 @@
527 528 progress += 8;
528 529 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
529 530  
  531 + arch_leave_lazy_mmu_mode();
530 532 spin_unlock(src_ptl);
531 533 pte_unmap_nested(src_pte - 1);
532 534 add_mm_rss(dst_mm, rss[0], rss[1]);
... ... @@ -628,6 +630,7 @@
628 630 int anon_rss = 0;
629 631  
630 632 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  633 + arch_enter_lazy_mmu_mode();
631 634 do {
632 635 pte_t ptent = *pte;
633 636 if (pte_none(ptent)) {
... ... @@ -694,6 +697,7 @@
694 697 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
695 698  
696 699 add_mm_rss(mm, file_rss, anon_rss);
  700 + arch_leave_lazy_mmu_mode();
697 701 pte_unmap_unlock(pte - 1, ptl);
698 702  
699 703 return addr;
... ... @@ -1109,6 +1113,7 @@
1109 1113 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1110 1114 if (!pte)
1111 1115 return -ENOMEM;
  1116 + arch_enter_lazy_mmu_mode();
1112 1117 do {
1113 1118 struct page *page = ZERO_PAGE(addr);
1114 1119 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
... ... @@ -1118,6 +1123,7 @@
1118 1123 BUG_ON(!pte_none(*pte));
1119 1124 set_pte_at(mm, addr, pte, zero_pte);
1120 1125 } while (pte++, addr += PAGE_SIZE, addr != end);
  1126 + arch_leave_lazy_mmu_mode();
1121 1127 pte_unmap_unlock(pte - 1, ptl);
1122 1128 return 0;
1123 1129 }
1124 1130  
... ... @@ -1275,11 +1281,13 @@
1275 1281 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1276 1282 if (!pte)
1277 1283 return -ENOMEM;
  1284 + arch_enter_lazy_mmu_mode();
1278 1285 do {
1279 1286 BUG_ON(!pte_none(*pte));
1280 1287 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1281 1288 pfn++;
1282 1289 } while (pte++, addr += PAGE_SIZE, addr != end);
  1290 + arch_leave_lazy_mmu_mode();
1283 1291 pte_unmap_unlock(pte - 1, ptl);
1284 1292 return 0;
1285 1293 }
mm/mprotect.c
... ... @@ -34,6 +34,7 @@
34 34 spinlock_t *ptl;
35 35  
36 36 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  37 + arch_enter_lazy_mmu_mode();
37 38 do {
38 39 oldpte = *pte;
39 40 if (pte_present(oldpte)) {
... ... @@ -70,6 +71,7 @@
70 71 }
71 72  
72 73 } while (pte++, addr += PAGE_SIZE, addr != end);
  74 + arch_leave_lazy_mmu_mode();
73 75 pte_unmap_unlock(pte - 1, ptl);
74 76 }
75 77  
mm/mremap.c
... ... @@ -98,6 +98,7 @@
98 98 new_ptl = pte_lockptr(mm, new_pmd);
99 99 if (new_ptl != old_ptl)
100 100 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
  101 + arch_enter_lazy_mmu_mode();
101 102  
102 103 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
103 104 new_pte++, new_addr += PAGE_SIZE) {
... ... @@ -109,6 +110,7 @@
109 110 set_pte_at(mm, new_addr, new_pte, pte);
110 111 }
111 112  
  113 + arch_leave_lazy_mmu_mode();
112 114 if (new_ptl != old_ptl)
113 115 spin_unlock(new_ptl);
114 116 pte_unmap_nested(new_pte - 1);
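Taken together, each converted PTE loop now follows the same shape (a recap of the hunks above, not additional code):

        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
        do {
                /* set_pte_at() / ptep_get_and_clear() updates batched here */
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);

with the nested-lock variants in copy_pte_range() and move_ptes() bracketing their loops the same way inside the existing spin_lock_nested() / spin_unlock() pairs.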