Commit 234b239bea395316d7f78018c672f4a88b3cdf0d

Authored by Andres Lagar-Cavilla
Committed by Paolo Bonzini
1 parent b461966063

kvm: Faults which trigger IO release the mmap_sem

When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory
has been swapped out or is backed by a file mapping, this triggers async
readahead and returns immediately. The rationale is that KVM will kick
the fault back into the guest with an "async page fault" and allow
another guest process to run while the IO completes.

If async PFs are enabled, the fault is retried as soon as possible from
an async workqueue. If not, it is retried immediately in the same code
path. In either case the retry does not relinquish the mmap semaphore
and blocks on the IO. That is a problem: every other user of the mmap
semaphore now stalls for as long as the swap or filemap IO takes.

This patch ensures that both the regular and the async PF paths
re-enter the fault, allowing the mmap semaphore to be relinquished
while waiting on IO.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
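
In code terms, the fix relies on the gup retry protocol: a first pass with
FOLL_NOWAIT only starts the IO, a second pass with a non-NULL "locked"
pointer may drop the mmap semaphore while it sleeps on the page lock, and a
final pass with the new FOLL_TRIED flag waits for the IO without scheduling
more of it. A minimal sketch of that protocol, mirroring the
kvm_get_user_page_io() helper added below (illustrative function name,
single writable page, error handling omitted; assumes the mm/gup.c
interfaces as of this commit):

#include <linux/mm.h>
#include <linux/sched.h>

static int gup_page_release_mmap_sem_on_io(struct mm_struct *mm,
					   unsigned long addr,
					   struct page **page)
{
	int locked = 1;
	int flags = FOLL_TOUCH | FOLL_GET | FOLL_WRITE;
	int npages;

	/*
	 * Second pass (the FOLL_NOWAIT pass already ran in the tdp fault
	 * handler): the non-NULL "locked" pointer lets the fault handler
	 * drop mmap_sem while sleeping on the page lock, which it reports
	 * by clearing *locked.
	 */
	down_read(&mm->mmap_sem);
	npages = __get_user_pages(current, mm, addr, 1, flags, page, NULL,
				  &locked);
	if (!locked) {
		/*
		 * mmap_sem was released and the IO has been waited on.
		 * Final pass: retake mmap_sem and retry with FOLL_TRIED so
		 * filemap_fault() does not schedule yet more async IO.
		 */
		down_read(&mm->mmap_sem);
		npages = __get_user_pages(current, mm, addr, 1,
					  flags | FOLL_TRIED, page, NULL,
					  NULL);
	}
	up_read(&mm->mmap_sem);
	return npages;
}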

Showing 5 changed files with 63 additions and 6 deletions

include/linux/kvm_host.h
... ... @@ -198,6 +198,17 @@
198 198 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
199 199 #endif
200 200  
  201 +/*
  202 + * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
  203 + * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
  204 + * controls whether we retry the gup one more time to completion in that case.
  205 + * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
  206 + * handler.
  207 + */
  208 +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
  209 + unsigned long addr, bool write_fault,
  210 + struct page **pagep);
  211 +
201 212 enum {
202 213 OUTSIDE_GUEST_MODE,
203 214 IN_GUEST_MODE,
include/linux/mm.h
... ... @@ -1985,6 +1985,7 @@
1985 1985 #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
1986 1986 #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
1987 1987 #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
  1988 +#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
1988 1989  
1989 1990 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
1990 1991 void *data);
mm/gup.c
... ... @@ -281,6 +281,10 @@
281 281 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
282 282 if (*flags & FOLL_NOWAIT)
283 283 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
  284 + if (*flags & FOLL_TRIED) {
  285 + VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
  286 + fault_flags |= FAULT_FLAG_TRIED;
  287 + }
284 288  
285 289 ret = handle_mm_fault(mm, vma, address, fault_flags);
286 290 if (ret & VM_FAULT_ERROR) {
virt/kvm/async_pf.c
... ... @@ -80,9 +80,7 @@
80 80  
81 81 might_sleep();
82 82  
83   - down_read(&mm->mmap_sem);
84   - get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
85   - up_read(&mm->mmap_sem);
  83 + kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
86 84 kvm_async_page_present_sync(vcpu, apf);
87 85  
88 86 spin_lock(&vcpu->async_pf.lock);
virt/kvm/kvm_main.c
... ... @@ -1122,6 +1122,43 @@
1122 1122 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1123 1123 }
1124 1124  
  1125 +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
  1126 + unsigned long addr, bool write_fault,
  1127 + struct page **pagep)
  1128 +{
  1129 + int npages;
  1130 + int locked = 1;
  1131 + int flags = FOLL_TOUCH | FOLL_HWPOISON |
  1132 + (pagep ? FOLL_GET : 0) |
  1133 + (write_fault ? FOLL_WRITE : 0);
  1134 +
  1135 + /*
  1136 + * If retrying the fault, we get here *not* having allowed the filemap
  1137 + * to wait on the page lock. We should now allow waiting on the IO with
  1138 + * the mmap semaphore released.
  1139 + */
  1140 + down_read(&mm->mmap_sem);
  1141 + npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
  1142 + &locked);
  1143 + if (!locked) {
  1144 + VM_BUG_ON(npages != -EBUSY);
  1145 +
  1146 + if (!pagep)
  1147 + return 0;
  1148 +
  1149 + /*
  1150 + * The previous call has now waited on the IO. Now we can
  1151 + * retry and complete. Pass TRIED to ensure we do not re
  1152 + * schedule async IO (see e.g. filemap_fault).
  1153 + */
  1154 + down_read(&mm->mmap_sem);
  1155 + npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
  1156 + pagep, NULL, NULL);
  1157 + }
  1158 + up_read(&mm->mmap_sem);
  1159 + return npages;
  1160 +}
  1161 +
1125 1162 static inline int check_user_page_hwpoison(unsigned long addr)
1126 1163 {
1127 1164 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
... ... @@ -1184,9 +1221,15 @@
1184 1221 npages = get_user_page_nowait(current, current->mm,
1185 1222 addr, write_fault, page);
1186 1223 up_read(&current->mm->mmap_sem);
1187   - } else
1188   - npages = get_user_pages_fast(addr, 1, write_fault,
1189   - page);
  1224 + } else {
  1225 + /*
  1226 + * By now we have tried gup_fast, and possibly async_pf, and we
  1227 + * are certainly not atomic. Time to retry the gup, allowing
  1228 + * mmap semaphore to be relinquished in the case of IO.
  1229 + */
  1230 + npages = kvm_get_user_page_io(current, current->mm, addr,
  1231 + write_fault, page);
  1232 + }
1190 1233 if (npages != 1)
1191 1234 return npages;
1192 1235
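
For reference, the two call sites added above use the new helper in its two
modes (arguments copied from the hunks; surrounding code omitted):

	/*
	 * async PF worker (async_pf.c): pagep == NULL.  The single gup pass
	 * starts the IO and, if it must sleep on the page lock, does so with
	 * mmap_sem already released; no FOLL_TRIED retry and no page
	 * reference are needed.
	 */
	kvm_get_user_page_io(NULL, mm, addr, 1, NULL);

	/*
	 * kvm_main.c fault path (hva_to_pfn_slow() at the time): pagep !=
	 * NULL.  If the first pass dropped mmap_sem to wait for the IO, the
	 * helper retries once with FOLL_TRIED so the caller gets the page
	 * reference back.
	 */
	npages = kvm_get_user_page_io(current, current->mm, addr,
				      write_fault, page);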