Commit 38a76013ad809beb0b52f60d365c960d035bd83c

Authored by Michel Lespinasse
Committed by Linus Torvalds
1 parent 523d4e2008

mm: avoid taking rmap locks in move_ptes()

During mremap(), the destination VMA is generally placed after the
original VMA in rmap traversal order: in move_vma(), we always have
new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >=
vma->vm_pgoff unless vma_merge() merged the new VMA with an adjacent one.
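
As a quick illustration of that arithmetic, the following stand-alone
sketch (not kernel code; the addresses and page offsets are made up)
computes new_pgoff the same way move_vma() does and shows why it can
never be smaller than the source's vm_pgoff:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4K pages for this example */

int main(void)
{
	/* Hypothetical source vma: starts at vm_start and maps pages
	 * beginning at page offset vm_pgoff. */
	unsigned long vm_start = 0x600000000000UL;
	unsigned long vm_pgoff = 100;

	/* mremap() of a range beginning 4 pages into that vma. */
	unsigned long old_addr = vm_start + (4UL << PAGE_SHIFT);

	/* Same computation as move_vma(); since old_addr >= vm_start,
	 * new_pgoff >= vm_pgoff always holds. */
	unsigned long new_pgoff = vm_pgoff +
		((old_addr - vm_start) >> PAGE_SHIFT);

	/* prints: old pgoff 100, new pgoff 104 */
	printf("old pgoff %lu, new pgoff %lu\n", vm_pgoff, new_pgoff);
	return 0;
}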

When the destination VMA is placed after the original in rmap traversal
order, we can avoid taking the rmap locks in move_ptes().

Essentially, this reintroduces the optimization that had been disabled in
"mm anon rmap: remove anon_vma_moveto_tail".  The difference is that we
don't try to impose the rmap traversal order; instead we just rely on
things being in the desired order in the common case and fall back to
taking locks in the uncommon case.  Also we skip the i_mmap_mutex in
addition to the anon_vma lock: in both cases, the vmas are traversed in
increasing vm_pgoff order with ties resolved in tree insertion order.
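
In isolation, the rule this patch relies on can be sketched as follows.
This is an illustrative stand-in, not the patch itself: vma_stub and
mremap_need_rmap_locks() are made-up names, and in the actual code
copy_vma() performs this comparison only when vma_merge() reused an
existing vma, while a freshly allocated vma (linked after the original)
gets need_rmap_locks = false unconditionally:

#include <stdbool.h>

/* Made-up minimal stand-in for vm_area_struct, carrying only the field
 * used by the check below; purely for illustration. */
struct vma_stub {
	unsigned long vm_pgoff;
};

/*
 * Skipping the rmap locks is safe only when the destination vma is
 * visited after the source in rmap traversal order (increasing
 * vm_pgoff, ties resolved in tree insertion order).  When the
 * destination was merged into an existing vma, that ordering is not
 * guaranteed, so fall back to taking the locks unless its pgoff is
 * strictly greater than the source's.
 */
static bool mremap_need_rmap_locks(const struct vma_stub *old_vma,
				   const struct vma_stub *new_vma)
{
	return new_vma->vm_pgoff <= old_vma->vm_pgoff;
}

Because ties in vm_pgoff are resolved in tree insertion order, the
comparison is deliberately <= rather than <, erring on the side of
taking the locks.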

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 49 additions and 23 deletions

--- a/fs/exec.c
+++ b/fs/exec.c
@@ -603,7 +603,7 @@
 	 * process cleanup to remove whatever mess we made.
 	 */
 	if (length != move_page_tables(vma, old_start,
-			vma, new_start, length))
+			vma, new_start, length, false))
 		return -ENOMEM;
 
 	lru_add_drain();
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1060,7 +1060,8 @@
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len);
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks);
 extern unsigned long do_mremap(unsigned long addr,
 		unsigned long old_len, unsigned long new_len,
 		unsigned long flags, unsigned long new_addr);
@@ -1410,7 +1411,8 @@
 		struct rb_node **, struct rb_node *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-	unsigned long addr, unsigned long len, pgoff_t pgoff);
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
 
 extern int mm_take_all_locks(struct mm_struct *mm);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2371,7 +2371,8 @@
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-	unsigned long addr, unsigned long len, pgoff_t pgoff)
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks)
 {
 	struct vm_area_struct *vma = *vmap;
 	unsigned long vma_start = vma->vm_start;
 
@@ -2413,8 +2414,9 @@
 			 * linear if there are no pages mapped yet.
 			 */
 			VM_BUG_ON(faulted_in_anon_vma);
-			*vmap = new_vma;
+			*vmap = vma = new_vma;
 		}
+		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
@@ -2434,6 +2436,7 @@
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+			*need_rmap_locks = false;
 		}
 	}
 	return new_vma;
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,26 +71,42 @@
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long old_addr, unsigned long old_end,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
-		unsigned long new_addr)
+		unsigned long new_addr, bool need_rmap_locks)
 {
 	struct address_space *mapping = NULL;
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma *anon_vma = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
 
-	if (vma->vm_file) {
-		/*
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock truncate_pagecache
-		 * out, since it might clean the dst vma before the src vma,
-		 * and we propagate stale pages into the dst afterward.
-		 */
-		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+	/*
+	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * locks to ensure that rmap will always observe either the old or the
+	 * new ptes. This is the easiest way to avoid races with
+	 * truncate_pagecache(), page migration, etc...
+	 *
+	 * When need_rmap_locks is false, we use other ways to avoid
+	 * such races:
+	 *
+	 * - During exec() shift_arg_pages(), we use a specially tagged vma
+	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *
+	 * - During mremap(), new_vma is often known to be placed after vma
+	 *   in rmap traversal order. This ensures rmap will always observe
+	 *   either the old pte, or the new pte, or both (the page table locks
+	 *   serialize access to individual ptes, but only rmap traversal
+	 *   order guarantees that we won't miss both the old and new ptes).
+	 */
+	if (need_rmap_locks) {
+		if (vma->vm_file) {
+			mapping = vma->vm_file->f_mapping;
+			mutex_lock(&mapping->i_mmap_mutex);
+		}
+		if (vma->anon_vma) {
+			anon_vma = vma->anon_vma;
+			anon_vma_lock(anon_vma);
+		}
 	}
-	if (anon_vma)
-		anon_vma_lock(anon_vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len)
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks)
 {
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@
 		if (extent > LATENCY_LIMIT)
 			extent = LATENCY_LIMIT;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-				new_vma, new_pmd, new_addr);
+				new_vma, new_pmd, new_addr, need_rmap_locks);
 		need_flush = true;
 	}
 	if (likely(need_flush))
@@ -198,6 +215,7 @@
 	unsigned long hiwater_vm;
 	int split = 0;
 	int err;
+	bool need_rmap_locks;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@
 		return err;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+			   &need_rmap_locks);
 	if (!new_vma)
 		return -ENOMEM;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+				     need_rmap_locks);
 	if (moved_len < old_len) {
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
 		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+				 true);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;