Commit 948f017b093a9baac23855fcd920d3a970b71bb6
Committed by
Linus Torvalds
1 parent
df0a6daa01
Exists in
master
and in
6 other branches
mremap: enforce rmap src/dst vma ordering in case of vma_merge() succeeding in copy_vma()
migrate was doing an rmap_walk with speculative lock-less access on pagetables. That could lead it to not serialize properly against mremap PT locks. But a second problem remains in the order of vmas in the same_anon_vma list used by the rmap_walk. If vma_merge succeeds in copy_vma, the src vma could be placed after the dst vma in the same_anon_vma list. That could still lead to migrate missing some pte. This patch adds an anon_vma_moveto_tail() function to force the dst vma at the end of the list before mremap starts to solve the problem. If the mremap is very large and there are lots of parents or children sharing the anon_vma root lock, this should still scale better than taking the anon_vma root lock around every pte copy practically for the whole duration of mremap. Update: Hugh noticed special care is needed in the error path where move_page_tables goes in the reverse direction, a second anon_vma_moveto_tail() call is needed in the error path. This program exercises the anon_vma_moveto_tail: === int main() { static struct timeval oldstamp, newstamp; long diffsec; char *p, *p2, *p3, *p4; if (posix_memalign((void **)&p, 2*1024*1024, SIZE)) perror("memalign"), exit(1); if (posix_memalign((void **)&p2, 2*1024*1024, SIZE)) perror("memalign"), exit(1); if (posix_memalign((void **)&p3, 2*1024*1024, SIZE)) perror("memalign"), exit(1); memset(p, 0xff, SIZE); printf("%p\n", p); memset(p2, 0xff, SIZE); memset(p3, 0x77, 4096); if (memcmp(p, p2, SIZE)) printf("error\n"); p4 = mremap(p+SIZE/2, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p3); if (p4 != p3) perror("mremap"), exit(1); p4 = mremap(p4, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p+SIZE/2); if (p4 != p+SIZE/2) perror("mremap"), exit(1); if (memcmp(p, p2, SIZE)) printf("error\n"); printf("ok\n"); return 0; } === $ perf probe -a anon_vma_moveto_tail Add new event: probe:anon_vma_moveto_tail (on anon_vma_moveto_tail) You can now use it on all perf tools, such as: perf record -e probe:anon_vma_moveto_tail -aR 
sleep 1 $ perf record -e probe:anon_vma_moveto_tail -aR ./anon_vma_moveto_tail 0x7f2ca2800000 ok [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.043 MB perf.data (~1860 samples) ] $ perf report --stdio 100.00% anon_vma_moveto [kernel.kallsyms] [k] anon_vma_moveto_tail Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Nai Xia <nai.xia@gmail.com> Acked-by: Mel Gorman <mgorman@suse.de> Cc: Hugh Dickins <hughd@google.com> Cc: Pawel Sikora <pluto@agmk.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 4 changed files with 76 additions and 3 deletions Side-by-side Diff
include/linux/rmap.h
... | ... | @@ -120,6 +120,7 @@ |
120 | 120 | int anon_vma_prepare(struct vm_area_struct *); |
121 | 121 | void unlink_anon_vmas(struct vm_area_struct *); |
122 | 122 | int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); |
123 | +void anon_vma_moveto_tail(struct vm_area_struct *); | |
123 | 124 | int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); |
124 | 125 | void __anon_vma_link(struct vm_area_struct *); |
125 | 126 |
mm/mmap.c
... | ... | @@ -2322,13 +2322,16 @@ |
2322 | 2322 | struct vm_area_struct *new_vma, *prev; |
2323 | 2323 | struct rb_node **rb_link, *rb_parent; |
2324 | 2324 | struct mempolicy *pol; |
2325 | + bool faulted_in_anon_vma = true; | |
2325 | 2326 | |
2326 | 2327 | /* |
2327 | 2328 | * If anonymous vma has not yet been faulted, update new pgoff |
2328 | 2329 | * to match new location, to increase its chance of merging. |
2329 | 2330 | */ |
2330 | - if (!vma->vm_file && !vma->anon_vma) | |
2331 | + if (unlikely(!vma->vm_file && !vma->anon_vma)) { | |
2331 | 2332 | pgoff = addr >> PAGE_SHIFT; |
2333 | + faulted_in_anon_vma = false; | |
2334 | + } | |
2332 | 2335 | |
2333 | 2336 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); |
2334 | 2337 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
2335 | 2338 | |
... | ... | @@ -2337,9 +2340,24 @@ |
2337 | 2340 | /* |
2338 | 2341 | * Source vma may have been merged into new_vma |
2339 | 2342 | */ |
2340 | - if (vma_start >= new_vma->vm_start && | |
2341 | - vma_start < new_vma->vm_end) | |
2343 | + if (unlikely(vma_start >= new_vma->vm_start && | |
2344 | + vma_start < new_vma->vm_end)) { | |
2345 | + /* | |
2346 | + * The only way we can get a vma_merge with | |
2347 | + * self during an mremap is if the vma hasn't | |
2348 | + * been faulted in yet and we were allowed to | |
2349 | + * reset the dst vma->vm_pgoff to the | |
2350 | + * destination address of the mremap to allow | |
2351 | + * the merge to happen. mremap must change the | |
2352 | + * vm_pgoff linearity between src and dst vmas | |
2353 | + * (in turn preventing a vma_merge) to be | |
2354 | + * safe. It is only safe to keep the vm_pgoff | |
2355 | + * linear if there are no pages mapped yet. | |
2356 | + */ | |
2357 | + VM_BUG_ON(faulted_in_anon_vma); | |
2342 | 2358 | *vmap = new_vma; |
2359 | + } else | |
2360 | + anon_vma_moveto_tail(new_vma); | |
2343 | 2361 | } else { |
2344 | 2362 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2345 | 2363 | if (new_vma) { |
mm/mremap.c
... | ... | @@ -221,6 +221,15 @@ |
221 | 221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); |
222 | 222 | if (moved_len < old_len) { |
223 | 223 | /* |
224 | + * Before moving the page tables from the new vma to | |
225 | + * the old vma, we need to be sure the old vma is | |
226 | + * queued after new vma in the same_anon_vma list to | |
227 | + * prevent SMP races with rmap_walk (that could lead | |
228 | + * rmap_walk to miss some page table). | |
229 | + */ | |
230 | + anon_vma_moveto_tail(vma); | |
231 | + | |
232 | + /* | |
224 | 233 | * On error, move entries back from new area to old, |
225 | 234 | * which will succeed since page tables still there, |
226 | 235 | * and then proceed to unmap new area instead of old. |
mm/rmap.c
... | ... | @@ -272,6 +272,51 @@ |
272 | 272 | } |
273 | 273 | |
274 | 274 | /* |
275 | + * Some rmap walk that needs to find all ptes/hugepmds without false | |
276 | + * negatives (like migrate and split_huge_page) running concurrent | |
277 | + * with operations that copy or move pagetables (like mremap() and | |
278 | + * fork()) to be safe. They depend on the anon_vma "same_anon_vma" | |
279 | + * list to be in a certain order: the dst_vma must be placed after the | |
280 | + * src_vma in the list. This is always guaranteed by fork() but | |
281 | + * mremap() needs to call this function to enforce it in case the | |
282 | + * dst_vma isn't newly allocated and chained with the anon_vma_clone() | |
283 | + * function but just an extension of a pre-existing vma through | |
284 | + * vma_merge. | |
285 | + * | |
286 | + * NOTE: the same_anon_vma list can still be changed by other | |
287 | + * processes while mremap runs because mremap doesn't hold the | |
288 | + * anon_vma mutex to prevent modifications to the list while it | |
289 | + * runs. All we need to enforce is that the relative order of this | |
290 | + * process vmas isn't changing (we don't care about other vmas | |
291 | + * order). Each vma corresponds to an anon_vma_chain structure so | |
292 | + * there's no risk that other processes calling anon_vma_moveto_tail() | |
293 | + * and changing the same_anon_vma list under mremap() will screw with | |
294 | + the relative order of this process vmas in the list, because | 
295 | + * they can't alter the order of any vma that belongs to this | |
296 | + * process. And there can't be another anon_vma_moveto_tail() running | |
297 | + * concurrently with mremap() coming from this process because we hold | |
298 | + * the mmap_sem for the whole mremap(). fork() ordering dependency | |
299 | + * also shouldn't be affected because fork() only cares that the | |
300 | + * parent vmas are placed in the list before the child vmas and | |
301 | + * anon_vma_moveto_tail() won't reorder vmas from either the fork() | |
302 | + * parent or child. | |
303 | + */ | |
304 | +void anon_vma_moveto_tail(struct vm_area_struct *dst) | |
305 | +{ | |
306 | + struct anon_vma_chain *pavc; | |
307 | + struct anon_vma *root = NULL; | |
308 | + | |
309 | + list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { | |
310 | + struct anon_vma *anon_vma = pavc->anon_vma; | |
311 | + VM_BUG_ON(pavc->vma != dst); | |
312 | + root = lock_anon_vma_root(root, anon_vma); | |
313 | + list_del(&pavc->same_anon_vma); | |
314 | + list_add_tail(&pavc->same_anon_vma, &anon_vma->head); | |
315 | + } | |
316 | + unlock_anon_vma_root(root); | |
317 | +} | |
318 | + | |
319 | +/* | |
275 | 320 | * Attach vma to its own anon_vma, as well as to the anon_vmas that |
276 | 321 | * the corresponding VMA in the parent process is attached to. |
277 | 322 | * Returns 0 on success, non-zero on failure. |