Commit 234b239bea395316d7f78018c672f4a88b3cdf0d

Authored by Andres Lagar-Cavilla
Committed by Paolo Bonzini
1 parent b461966063

kvm: Faults which trigger IO release the mmap_sem

When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory
has been swapped out or is backed by a file mapping, this triggers async
readahead and returns immediately. The rationale is that KVM will kick
the fault back into the guest with an "async page fault" and allow
another guest process to run while the IO completes.

If async PFs are enabled, the fault is retried as soon as possible from
an async workqueue. If not, it is retried immediately in the same code
path. In either case the retry does not relinquish the mmap semaphore
and blocks on the IO. That is a problem: every other user of the mmap
semaphore now stalls for as long as the swap or filemap IO takes.

This patch ensures that both the regular and the async PF paths
re-enter the fault, allowing the mmap semaphore to be relinquished
while waiting on IO.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
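
In code terms, the fix relies on the gup retry protocol: a first pass with
FOLL_NOWAIT only starts the IO, a second pass with a non-NULL "locked"
pointer may drop the mmap semaphore while it sleeps on the page lock, and a
final pass with the new FOLL_TRIED flag waits for the IO without scheduling
more of it. A minimal sketch of that protocol, mirroring the
kvm_get_user_page_io() helper added below (illustrative function name,
single writable page, error handling omitted; assumes the mm/gup.c
interfaces as of this commit):

#include <linux/mm.h>
#include <linux/sched.h>

static int gup_page_release_mmap_sem_on_io(struct mm_struct *mm,
					   unsigned long addr,
					   struct page **page)
{
	int locked = 1;
	int flags = FOLL_TOUCH | FOLL_GET | FOLL_WRITE;
	int npages;

	/*
	 * Second pass (the FOLL_NOWAIT pass already ran in the tdp fault
	 * handler): the non-NULL "locked" pointer lets the fault handler
	 * drop mmap_sem while sleeping on the page lock, which it reports
	 * by clearing *locked.
	 */
	down_read(&mm->mmap_sem);
	npages = __get_user_pages(current, mm, addr, 1, flags, page, NULL,
				  &locked);
	if (!locked) {
		/*
		 * mmap_sem was released and the IO has been waited on.
		 * Final pass: retake mmap_sem and retry with FOLL_TRIED so
		 * filemap_fault() does not schedule yet more async IO.
		 */
		down_read(&mm->mmap_sem);
		npages = __get_user_pages(current, mm, addr, 1,
					  flags | FOLL_TRIED, page, NULL,
					  NULL);
	}
	up_read(&mm->mmap_sem);
	return npages;
}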

Showing 5 changed files with 63 additions and 6 deletions

include/linux/kvm_host.h
... ... @@ -198,6 +198,17 @@
198 198 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
199 199 #endif
200 200  
  201 +/*
  202 + * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
  203 + * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
  204 + * controls whether we retry the gup one more time to completion in that case.
  205 + * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
  206 + * handler.
  207 + */
  208 +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
  209 + unsigned long addr, bool write_fault,
  210 + struct page **pagep);
  211 +
201 212 enum {
202 213 OUTSIDE_GUEST_MODE,
203 214 IN_GUEST_MODE,
include/linux/mm.h
... ... @@ -1985,6 +1985,7 @@
1985 1985 #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
1986 1986 #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
1987 1987 #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
  1988 +#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
1988 1989  
1989 1990 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
1990 1991 void *data);
mm/gup.c
... ... @@ -281,6 +281,10 @@
281 281 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
282 282 if (*flags & FOLL_NOWAIT)
283 283 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
  284 + if (*flags & FOLL_TRIED) {
  285 + VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
  286 + fault_flags |= FAULT_FLAG_TRIED;
  287 + }
284 288  
285 289 ret = handle_mm_fault(mm, vma, address, fault_flags);
286 290 if (ret & VM_FAULT_ERROR) {
virt/kvm/async_pf.c
... ... @@ -80,9 +80,7 @@
80 80  
81 81 might_sleep();
82 82  
83   - down_read(&mm->mmap_sem);
84   - get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
85   - up_read(&mm->mmap_sem);
  83 + kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
86 84 kvm_async_page_present_sync(vcpu, apf);
87 85  
88 86 spin_lock(&vcpu->async_pf.lock);
virt/kvm/kvm_main.c
... ... @@ -1122,6 +1122,43 @@
1122 1122 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1123 1123 }
1124 1124  
  1125 +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
  1126 + unsigned long addr, bool write_fault,
  1127 + struct page **pagep)
  1128 +{
  1129 + int npages;
  1130 + int locked = 1;
  1131 + int flags = FOLL_TOUCH | FOLL_HWPOISON |
  1132 + (pagep ? FOLL_GET : 0) |
  1133 + (write_fault ? FOLL_WRITE : 0);
  1134 +
  1135 + /*
  1136 + * If retrying the fault, we get here *not* having allowed the filemap
  1137 + * to wait on the page lock. We should now allow waiting on the IO with
  1138 + * the mmap semaphore released.
  1139 + */
  1140 + down_read(&mm->mmap_sem);
  1141 + npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
  1142 + &locked);
  1143 + if (!locked) {
  1144 + VM_BUG_ON(npages != -EBUSY);
  1145 +
  1146 + if (!pagep)
  1147 + return 0;
  1148 +
  1149 + /*
  1150 + * The previous call has now waited on the IO. Now we can
  1151 + * retry and complete. Pass TRIED to ensure we do not re
  1152 + * schedule async IO (see e.g. filemap_fault).
  1153 + */
  1154 + down_read(&mm->mmap_sem);
  1155 + npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
  1156 + pagep, NULL, NULL);
  1157 + }
  1158 + up_read(&mm->mmap_sem);
  1159 + return npages;
  1160 +}
  1161 +
1125 1162 static inline int check_user_page_hwpoison(unsigned long addr)
1126 1163 {
1127 1164 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
... ... @@ -1184,9 +1221,15 @@
1184 1221 npages = get_user_page_nowait(current, current->mm,
1185 1222 addr, write_fault, page);
1186 1223 up_read(&current->mm->mmap_sem);
1187   - } else
1188   - npages = get_user_pages_fast(addr, 1, write_fault,
1189   - page);
  1224 + } else {
  1225 + /*
  1226 + * By now we have tried gup_fast, and possibly async_pf, and we
  1227 + * are certainly not atomic. Time to retry the gup, allowing
  1228 + * mmap semaphore to be relinquished in the case of IO.
  1229 + */
  1230 + npages = kvm_get_user_page_io(current, current->mm, addr,
  1231 + write_fault, page);
  1232 + }
1190 1233 if (npages != 1)
1191 1234 return npages;
1192 1235
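
For reference, the two call sites added above use the new helper in its two
modes (arguments copied from the hunks; surrounding code omitted):

	/*
	 * async PF worker (async_pf.c): pagep == NULL.  The single gup pass
	 * starts the IO and, if it must sleep on the page lock, does so with
	 * mmap_sem already released; no FOLL_TRIED retry and no page
	 * reference are needed.
	 */
	kvm_get_user_page_io(NULL, mm, addr, 1, NULL);

	/*
	 * kvm_main.c fault path (hva_to_pfn_slow() at the time): pagep !=
	 * NULL.  If the first pass dropped mmap_sem to wait for the IO, the
	 * helper retries once with FOLL_TRIED so the caller gets the page
	 * reference back.
	 */
	npages = kvm_get_user_page_io(current, current->mm, addr,
				      write_fault, page);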