Commit 38a76013ad809beb0b52f60d365c960d035bd83c
Committed by Linus Torvalds
1 parent 523d4e2008
Exists in master and in 20 other branches
mm: avoid taking rmap locks in move_ptes()
During mremap(), the destination VMA is generally placed after the
original vma in rmap traversal order: in move_vma(), we always have
new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >=
vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one.

When the destination VMA is placed after the original in rmap traversal
order, we can avoid taking the rmap locks in move_ptes().

Essentially, this reintroduces the optimization that had been disabled in
"mm anon rmap: remove anon_vma_moveto_tail". The difference is that we
don't try to impose the rmap traversal order; instead we just rely on
things being in the desired order in the common case and fall back to
taking locks in the uncommon case. Also we skip the i_mmap_mutex in
addition to the anon_vma lock: in both cases, the vmas are traversed in
increasing vm_pgoff order with ties resolved in tree insertion order.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
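The inequality claimed in the message falls out of the offset arithmetic visible in the mm/mremap.c hunk below: move_vma() computes the destination offset as the source offset plus the (non-negative) distance of old_addr into the vma. A worked illustration, not kernel code, with invented addresses and a 4K page size (PAGE_SHIFT == 12) assumed:

#include <assert.h>

/* Illustration only: why move_vma()'s new_pgoff >= vma->vm_pgoff.
 * old_addr always lies inside the source vma, so old_addr >= vm_start
 * and the shifted term is non-negative.
 */
int main(void)
{
	unsigned long vm_start = 0x700000000000UL;	/* invented */
	unsigned long vm_pgoff = 16;
	unsigned long old_addr = 0x700000004000UL;	/* 4 pages past vm_start */

	unsigned long new_pgoff = vm_pgoff + ((old_addr - vm_start) >> 12);

	/* new_pgoff == 20 >= vm_pgoff == 16; equality only when
	 * old_addr == vm_start */
	assert(new_pgoff == 20 && new_pgoff >= vm_pgoff);
	return 0;
}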
Showing 4 changed files with 49 additions and 23 deletions
fs/exec.c
include/linux/mm.h
@@ -1060,7 +1060,8 @@
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len);
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks);
 extern unsigned long do_mremap(unsigned long addr,
 	unsigned long old_len, unsigned long new_len,
 	unsigned long flags, unsigned long new_addr);
@@ -1410,7 +1411,8 @@
 	struct rb_node **, struct rb_node *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-	unsigned long addr, unsigned long len, pgoff_t pgoff);
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
 
 extern int mm_take_all_locks(struct mm_struct *mm);
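Taken together, the two prototypes split the work: copy_vma() decides whether the rmap locks can be skipped and reports the verdict through its new out-parameter, and move_page_tables() consumes that verdict by value. A compile-checkable sketch of the wiring, mirroring the move_vma() hunk in mm/mremap.c further down (userspace stand-ins: the struct is opaque, pgoff_t is replaced by unsigned long, and do_move() is a hypothetical caller):

#include <stdbool.h>

struct vm_area_struct;	/* opaque here */

extern struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		bool *need_rmap_locks);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks);

static unsigned long do_move(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_addr, unsigned long new_len,
		unsigned long new_pgoff)
{
	bool need_rmap_locks;
	struct vm_area_struct *new_vma;

	/* copy_vma() tells us whether the destination is safely ordered */
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma)
		return 0;
	/* ... and move_page_tables() acts on that verdict */
	return move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				need_rmap_locks);
}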
mm/mmap.c
@@ -2371,7 +2371,8 @@
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-	unsigned long addr, unsigned long len, pgoff_t pgoff)
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks)
 {
 	struct vm_area_struct *vma = *vmap;
 	unsigned long vma_start = vma->vm_start;
@@ -2413,8 +2414,9 @@
 			 * linear if there are no pages mapped yet.
 			 */
 			VM_BUG_ON(faulted_in_anon_vma);
-			*vmap = new_vma;
+			*vmap = vma = new_vma;
 		}
+		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
@@ -2434,6 +2436,7 @@
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+			*need_rmap_locks = false;
 		}
 	}
 	return new_vma;
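Two details above are easy to miss. First, when vma_merge() absorbs the copy back into an existing vma, *vmap = vma = new_vma keeps vma pointing at the surviving vma so the comparison on the next line is made against the right object (and evaluates as a tie). Second, the test is <=, not <: a tie gives move_ptes() no ordering guarantee to rely on, so copy_vma() conservatively requests the locks. A standalone userspace illustration of the predicate, with a one-field stand-in for vm_area_struct and invented offsets:

#include <assert.h>
#include <stdbool.h>

struct vma_stub { unsigned long vm_pgoff; };

/* The value copy_vma() stores through its out-parameter. */
static bool need_rmap_locks(const struct vma_stub *vma,
			    const struct vma_stub *new_vma)
{
	return new_vma->vm_pgoff <= vma->vm_pgoff;
}

int main(void)
{
	struct vma_stub src = { .vm_pgoff = 16 };
	struct vma_stub dst = { .vm_pgoff = 64 };	/* common mremap case */

	assert(!need_rmap_locks(&src, &dst));	/* dst follows src: skip locks */
	assert(need_rmap_locks(&src, &src));	/* tie (merge case): take locks */
	return 0;
}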
mm/mremap.c
@@ -71,26 +71,42 @@
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long old_addr, unsigned long old_end,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
-		unsigned long new_addr)
+		unsigned long new_addr, bool need_rmap_locks)
 {
 	struct address_space *mapping = NULL;
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma *anon_vma = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
 
-	if (vma->vm_file) {
-		/*
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock truncate_pagecache
-		 * out, since it might clean the dst vma before the src vma,
-		 * and we propagate stale pages into the dst afterward.
-		 */
-		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+	/*
+	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * locks to ensure that rmap will always observe either the old or the
+	 * new ptes. This is the easiest way to avoid races with
+	 * truncate_pagecache(), page migration, etc...
+	 *
+	 * When need_rmap_locks is false, we use other ways to avoid
+	 * such races:
+	 *
+	 * - During exec() shift_arg_pages(), we use a specially tagged vma
+	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *
+	 * - During mremap(), new_vma is often known to be placed after vma
+	 *   in rmap traversal order. This ensures rmap will always observe
+	 *   either the old pte, or the new pte, or both (the page table locks
+	 *   serialize access to individual ptes, but only rmap traversal
+	 *   order guarantees that we won't miss both the old and new ptes).
+	 */
+	if (need_rmap_locks) {
+		if (vma->vm_file) {
+			mapping = vma->vm_file->f_mapping;
+			mutex_lock(&mapping->i_mmap_mutex);
+		}
+		if (vma->anon_vma) {
+			anon_vma = vma->anon_vma;
+			anon_vma_lock(anon_vma);
+		}
 	}
-	if (anon_vma)
-		anon_vma_lock(anon_vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
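The traversal-order argument in the new comment can be checked with a toy model: treat one pte move as an atomic hop from a src slot to a dst slot (the page table locks provide the atomicity), and let an rmap walk read the two slots one at a time, with the hop possibly landing between the reads. Everything below is a userspace illustration, not kernel code:

#include <assert.h>
#include <stdbool.h>

/* A "pte" sits in src before the move and in dst after it. */
struct snapshot { bool src, dst; };

static const struct snapshot before = { .src = true,  .dst = false };
static const struct snapshot after  = { .src = false, .dst = true  };

/* One rmap walk: read the slot of the first vma visited, then the second. */
static bool sees_pte(const struct snapshot *first_read,
		     const struct snapshot *second_read, bool dst_first)
{
	if (dst_first)
		return first_read->dst || second_read->src;
	return first_read->src || second_read->dst;
}

int main(void)
{
	/* src visited first: whether the move is not yet done, completes
	 * between the reads, or is already done, the pte is always seen. */
	assert(sees_pte(&before, &before, false));
	assert(sees_pte(&before, &after,  false));
	assert(sees_pte(&after,  &after,  false));

	/* dst visited first: if the move lands between the two reads, both
	 * observations come up empty -- the race the rmap locks (or the
	 * traversal-order guarantee) must exclude. */
	assert(!sees_pte(&before, &after, true));
	return 0;
}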
@@ -127,7 +143,8 @@
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len)
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks)
 {
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
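For the exec() case named in the move_ptes() comment, the caller sits in fs/exec.c, whose hunk is not reproduced on this page: shift_arg_pages() moves the temporary stack within a single vma, which rmap call sites already special-case via is_vma_temporary_stack(), so it can skip the locks outright. An excerpt-style sketch of that call site, reconstructed from the comment rather than copied from a diff shown here:

	/* shift_arg_pages(): the same vma is both source and destination;
	 * the temporary-stack tagging makes the rmap locks unnecessary. */
	if (length != move_page_tables(vma, old_start,
				       vma, new_start, length, false))
		return -ENOMEM;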
@@ -174,7 +191,7 @@
 		if (extent > LATENCY_LIMIT)
 			extent = LATENCY_LIMIT;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-			  new_vma, new_pmd, new_addr);
+			  new_vma, new_pmd, new_addr, need_rmap_locks);
 		need_flush = true;
 	}
 	if (likely(need_flush))
@@ -198,6 +215,7 @@
 	unsigned long hiwater_vm;
 	int split = 0;
 	int err;
+	bool need_rmap_locks;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@
 		return err;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+			   &need_rmap_locks);
 	if (!new_vma)
 		return -ENOMEM;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+				     need_rmap_locks);
 	if (moved_len < old_len) {
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
 		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+				 true);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
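One asymmetry worth noting in the hunk above: the error path hard-codes need_rmap_locks to true. Moving the entries back reverses the direction of the copy, so whatever traversal-order guarantee justified skipping the locks on the way out does not hold on the way back, and the rollback simply takes the locks unconditionally rather than re-deriving the ordering.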
- mentioned in commit 2bcd64
- mentioned in commit dd18db