Commit 38a76013ad809beb0b52f60d365c960d035bd83c

Authored by Michel Lespinasse
Committed by Linus Torvalds
1 parent 523d4e2008

mm: avoid taking rmap locks in move_ptes()

During mremap(), the destination VMA is generally placed after the
original VMA in rmap traversal order: in move_vma(), we always have
new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >=
vma->vm_pgoff unless vma_merge() merged the new VMA with an adjacent one.
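
As a quick illustration of that arithmetic, the following stand-alone
sketch (not kernel code; the addresses and page offsets are made up)
computes new_pgoff the same way move_vma() does and shows why it can
never be smaller than the source's vm_pgoff:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4K pages for this example */

int main(void)
{
	/* Hypothetical source vma: starts at vm_start and maps pages
	 * beginning at page offset vm_pgoff. */
	unsigned long vm_start = 0x600000000000UL;
	unsigned long vm_pgoff = 100;

	/* mremap() of a range beginning 4 pages into that vma. */
	unsigned long old_addr = vm_start + (4UL << PAGE_SHIFT);

	/* Same computation as move_vma(); since old_addr >= vm_start,
	 * new_pgoff >= vm_pgoff always holds. */
	unsigned long new_pgoff = vm_pgoff +
		((old_addr - vm_start) >> PAGE_SHIFT);

	/* prints: old pgoff 100, new pgoff 104 */
	printf("old pgoff %lu, new pgoff %lu\n", vm_pgoff, new_pgoff);
	return 0;
}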

When the destination VMA is placed after the original in rmap traversal
order, we can avoid taking the rmap locks in move_ptes().

Essentially, this reintroduces the optimization that had been disabled in
"mm anon rmap: remove anon_vma_moveto_tail".  The difference is that we
don't try to impose the rmap traversal order; instead we just rely on
things being in the desired order in the common case and fall back to
taking locks in the uncommon case.  Also we skip the i_mmap_mutex in
addition to the anon_vma lock: in both cases, the vmas are traversed in
increasing vm_pgoff order with ties resolved in tree insertion order.
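
In isolation, the rule this patch relies on can be sketched as follows.
This is an illustrative stand-in, not the patch itself: vma_stub and
mremap_need_rmap_locks() are made-up names, and in the actual code
copy_vma() performs this comparison only when vma_merge() reused an
existing vma, while a freshly allocated vma (linked after the original)
gets need_rmap_locks = false unconditionally:

#include <stdbool.h>

/* Made-up minimal stand-in for vm_area_struct, carrying only the field
 * used by the check below; purely for illustration. */
struct vma_stub {
	unsigned long vm_pgoff;
};

/*
 * Skipping the rmap locks is safe only when the destination vma is
 * visited after the source in rmap traversal order (increasing
 * vm_pgoff, ties resolved in tree insertion order).  When the
 * destination was merged into an existing vma, that ordering is not
 * guaranteed, so fall back to taking the locks unless its pgoff is
 * strictly greater than the source's.
 */
static bool mremap_need_rmap_locks(const struct vma_stub *old_vma,
				   const struct vma_stub *new_vma)
{
	return new_vma->vm_pgoff <= old_vma->vm_pgoff;
}

Because ties in vm_pgoff are resolved in tree insertion order, the
comparison is deliberately <= rather than <, erring on the side of
taking the locks.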

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 49 additions and 23 deletions

--- a/fs/exec.c
+++ b/fs/exec.c
@@ -603,7 +603,7 @@
 	 * process cleanup to remove whatever mess we made.
 	 */
 	if (length != move_page_tables(vma, old_start,
-			vma, new_start, length))
+			vma, new_start, length, false))
 		return -ENOMEM;
 
 	lru_add_drain();
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1060,7 +1060,8 @@
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len);
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks);
 extern unsigned long do_mremap(unsigned long addr,
 		unsigned long old_len, unsigned long new_len,
 		unsigned long flags, unsigned long new_addr);
@@ -1410,7 +1411,8 @@
 		struct rb_node **, struct rb_node *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-	unsigned long addr, unsigned long len, pgoff_t pgoff);
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
 
 extern int mm_take_all_locks(struct mm_struct *mm);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2371,7 +2371,8 @@
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-	unsigned long addr, unsigned long len, pgoff_t pgoff)
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks)
 {
 	struct vm_area_struct *vma = *vmap;
 	unsigned long vma_start = vma->vm_start;
 
@@ -2413,8 +2414,9 @@
 			 * linear if there are no pages mapped yet.
 			 */
 			VM_BUG_ON(faulted_in_anon_vma);
-			*vmap = new_vma;
+			*vmap = vma = new_vma;
 		}
+		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
@@ -2434,6 +2436,7 @@
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+			*need_rmap_locks = false;
 		}
 	}
 	return new_vma;
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,26 +71,42 @@
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long old_addr, unsigned long old_end,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
-		unsigned long new_addr)
+		unsigned long new_addr, bool need_rmap_locks)
 {
 	struct address_space *mapping = NULL;
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma *anon_vma = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
 
-	if (vma->vm_file) {
-		/*
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock truncate_pagecache
-		 * out, since it might clean the dst vma before the src vma,
-		 * and we propagate stale pages into the dst afterward.
-		 */
-		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+	/*
+	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * locks to ensure that rmap will always observe either the old or the
+	 * new ptes. This is the easiest way to avoid races with
+	 * truncate_pagecache(), page migration, etc...
+	 *
+	 * When need_rmap_locks is false, we use other ways to avoid
+	 * such races:
+	 *
+	 * - During exec() shift_arg_pages(), we use a specially tagged vma
+	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *
+	 * - During mremap(), new_vma is often known to be placed after vma
+	 *   in rmap traversal order. This ensures rmap will always observe
+	 *   either the old pte, or the new pte, or both (the page table locks
+	 *   serialize access to individual ptes, but only rmap traversal
+	 *   order guarantees that we won't miss both the old and new ptes).
+	 */
+	if (need_rmap_locks) {
+		if (vma->vm_file) {
+			mapping = vma->vm_file->f_mapping;
+			mutex_lock(&mapping->i_mmap_mutex);
+		}
+		if (vma->anon_vma) {
+			anon_vma = vma->anon_vma;
+			anon_vma_lock(anon_vma);
+		}
 	}
-	if (anon_vma)
-		anon_vma_lock(anon_vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len)
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks)
 {
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@
 		if (extent > LATENCY_LIMIT)
 			extent = LATENCY_LIMIT;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-				new_vma, new_pmd, new_addr);
+				new_vma, new_pmd, new_addr, need_rmap_locks);
 		need_flush = true;
 	}
 	if (likely(need_flush))
@@ -198,6 +215,7 @@
 	unsigned long hiwater_vm;
 	int split = 0;
 	int err;
+	bool need_rmap_locks;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@
 		return err;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+			   &need_rmap_locks);
 	if (!new_vma)
 		return -ENOMEM;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+				     need_rmap_locks);
 	if (moved_len < old_len) {
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
 		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+				 true);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;