Commit 508034a32b819a2d40aa7ac0dbc8cd2e044c2de6

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 8f4f8c164c

[PATCH] mm: unmap_vmas with inner ptlock

Remove the page_table_lock from around the calls to unmap_vmas, and replace
the pte_offset_map in zap_pte_range by pte_offset_map_lock: all callers are
now safe to descend without page_table_lock.
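
For reference, the locking pattern this leaves in zap_pte_range, condensed
from the mm/memory.c hunk below (the pte-examining body and the zap_details
handling are elided), is roughly:

	static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
	{
		struct mm_struct *mm = tlb->mm;
		spinlock_t *ptl;
		pte_t *pte;
		int file_rss = 0;
		int anon_rss = 0;

		/* map the pte page and take its lock in one go */
		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
		do {
			pte_t ptent = *pte;
			/* ... test ptent, clear it, free the page, adjust rss ... */
		} while (pte++, addr += PAGE_SIZE, addr != end);

		add_mm_rss(mm, file_rss, anon_rss);
		pte_unmap_unlock(pte - 1, ptl);
	}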

Don't attempt fancy locking for hugepages, just take page_table_lock in
unmap_hugepage_range.  Which makes zap_hugepage_range, and the hugetlb test in
zap_page_range, redundant: unmap_vmas calls unmap_hugepage_range anyway.  Nor
does unmap_vmas have much use for its mm arg now.
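
With that, unmap_hugepage_range ends up shaped roughly as follows (a
condensed sketch of the mm/hugetlb.c hunks below, with the huge-pte walk
elided):

	void unmap_hugepage_range(struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
	{
		struct mm_struct *mm = vma->vm_mm;

		BUG_ON(start & ~HPAGE_MASK);
		BUG_ON(end & ~HPAGE_MASK);

		spin_lock(&mm->page_table_lock);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		/* ... clear each huge pte, put_page, lower file_rss ... */

		spin_unlock(&mm->page_table_lock);
		flush_tlb_range(vma, start, end);
	}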

The tlb_start_vma and tlb_end_vma in unmap_page_range are now called without
page_table_lock: if they're implemented at all, they typically come down to
flush_cache_range (usually done outside page_table_lock) and flush_tlb_range
(which we already audited for the mprotect case).
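
For illustration only (not any particular architecture's asm/tlb.h), a
non-trivial pair typically amounts to something like:

	#define tlb_start_vma(tlb, vma) \
		flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end)

	#define tlb_end_vma(tlb, vma) \
		flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end)

neither of which relies on page_table_lock.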

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 6 changed files with 21 additions and 54 deletions

fs/hugetlbfs/inode.c
... ... @@ -92,7 +92,7 @@
92 92 }
93 93  
94 94 /*
95   - * Called under down_write(mmap_sem), page_table_lock is not held
  95 + * Called under down_write(mmap_sem).
96 96 */
97 97  
98 98 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
... ... @@ -308,7 +308,6 @@
308 308  
309 309 vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
310 310 unsigned long h_vm_pgoff;
311   - unsigned long v_length;
312 311 unsigned long v_offset;
313 312  
314 313 h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
... ... @@ -319,11 +318,8 @@
319 318 if (h_vm_pgoff >= h_pgoff)
320 319 v_offset = 0;
321 320  
322   - v_length = vma->vm_end - vma->vm_start;
323   -
324   - zap_hugepage_range(vma,
325   - vma->vm_start + v_offset,
326   - v_length - v_offset);
  321 + unmap_hugepage_range(vma,
  322 + vma->vm_start + v_offset, vma->vm_end);
327 323 }
328 324 }
329 325  
include/linux/hugetlb.h
... ... @@ -16,7 +16,6 @@
16 16 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
17 17 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
18 18 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
19   -void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
20 19 void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
21 20 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
22 21 int hugetlb_report_meminfo(char *);
... ... @@ -87,7 +86,6 @@
87 86 #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
88 87 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
89 88 #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
90   -#define zap_hugepage_range(vma, start, len) BUG()
91 89 #define unmap_hugepage_range(vma, start, end) BUG()
92 90 #define is_hugepage_mem_enough(size) 0
93 91 #define hugetlb_report_meminfo(buf) 0
include/linux/mm.h
... ... @@ -682,7 +682,7 @@
682 682  
683 683 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
684 684 unsigned long size, struct zap_details *);
685   -unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm,
  685 +unsigned long unmap_vmas(struct mmu_gather **tlb,
686 686 struct vm_area_struct *start_vma, unsigned long start_addr,
687 687 unsigned long end_addr, unsigned long *nr_accounted,
688 688 struct zap_details *);
mm/hugetlb.c
... ... @@ -314,6 +314,8 @@
314 314 BUG_ON(start & ~HPAGE_MASK);
315 315 BUG_ON(end & ~HPAGE_MASK);
316 316  
  317 + spin_lock(&mm->page_table_lock);
  318 +
317 319 /* Update high watermark before we lower rss */
318 320 update_hiwater_rss(mm);
319 321  
... ... @@ -333,17 +335,9 @@
333 335 put_page(page);
334 336 add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
335 337 }
336   - flush_tlb_range(vma, start, end);
337   -}
338 338  
339   -void zap_hugepage_range(struct vm_area_struct *vma,
340   - unsigned long start, unsigned long length)
341   -{
342   - struct mm_struct *mm = vma->vm_mm;
343   -
344   - spin_lock(&mm->page_table_lock);
345   - unmap_hugepage_range(vma, start, start + length);
346 339 spin_unlock(&mm->page_table_lock);
  340 + flush_tlb_range(vma, start, end);
347 341 }
348 342  
349 343 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
mm/memory.c
... ... @@ -551,10 +551,11 @@
551 551 {
552 552 struct mm_struct *mm = tlb->mm;
553 553 pte_t *pte;
  554 + spinlock_t *ptl;
554 555 int file_rss = 0;
555 556 int anon_rss = 0;
556 557  
557   - pte = pte_offset_map(pmd, addr);
  558 + pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
558 559 do {
559 560 pte_t ptent = *pte;
560 561 if (pte_none(ptent))
... ... @@ -621,7 +622,7 @@
621 622 } while (pte++, addr += PAGE_SIZE, addr != end);
622 623  
623 624 add_mm_rss(mm, file_rss, anon_rss);
624   - pte_unmap(pte - 1);
  625 + pte_unmap_unlock(pte - 1, ptl);
625 626 }
626 627  
627 628 static inline void zap_pmd_range(struct mmu_gather *tlb,
... ... @@ -690,7 +691,6 @@
690 691 /**
691 692 * unmap_vmas - unmap a range of memory covered by a list of vma's
692 693 * @tlbp: address of the caller's struct mmu_gather
693   - * @mm: the controlling mm_struct
694 694 * @vma: the starting vma
695 695 * @start_addr: virtual address at which to start unmapping
696 696 * @end_addr: virtual address at which to end unmapping
... ... @@ -699,10 +699,10 @@
699 699 *
700 700 * Returns the end address of the unmapping (restart addr if interrupted).
701 701 *
702   - * Unmap all pages in the vma list. Called under page_table_lock.
  702 + * Unmap all pages in the vma list.
703 703 *
704   - * We aim to not hold page_table_lock for too long (for scheduling latency
705   - * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
  704 + * We aim to not hold locks for too long (for scheduling latency reasons).
  705 + * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
706 706 * return the ending mmu_gather to the caller.
707 707 *
708 708 * Only addresses between `start' and `end' will be unmapped.
... ... @@ -714,7 +714,7 @@
714 714 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
715 715 * drops the lock and schedules.
716 716 */
717   -unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
  717 +unsigned long unmap_vmas(struct mmu_gather **tlbp,
718 718 struct vm_area_struct *vma, unsigned long start_addr,
719 719 unsigned long end_addr, unsigned long *nr_accounted,
720 720 struct zap_details *details)
... ... @@ -764,19 +764,15 @@
764 764 tlb_finish_mmu(*tlbp, tlb_start, start);
765 765  
766 766 if (need_resched() ||
767   - need_lockbreak(&mm->page_table_lock) ||
768 767 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
769 768 if (i_mmap_lock) {
770   - /* must reset count of rss freed */
771   - *tlbp = tlb_gather_mmu(mm, fullmm);
  769 + *tlbp = NULL;
772 770 goto out;
773 771 }
774   - spin_unlock(&mm->page_table_lock);
775 772 cond_resched();
776   - spin_lock(&mm->page_table_lock);
777 773 }
778 774  
779   - *tlbp = tlb_gather_mmu(mm, fullmm);
  775 + *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
780 776 tlb_start_valid = 0;
781 777 zap_bytes = ZAP_BLOCK_SIZE;
782 778 }
... ... @@ -800,18 +796,12 @@
800 796 unsigned long end = address + size;
801 797 unsigned long nr_accounted = 0;
802 798  
803   - if (is_vm_hugetlb_page(vma)) {
804   - zap_hugepage_range(vma, address, size);
805   - return end;
806   - }
807   -
808 799 lru_add_drain();
809 800 tlb = tlb_gather_mmu(mm, 0);
810 801 update_hiwater_rss(mm);
811   - spin_lock(&mm->page_table_lock);
812   - end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
813   - spin_unlock(&mm->page_table_lock);
814   - tlb_finish_mmu(tlb, address, end);
  802 + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
  803 + if (tlb)
  804 + tlb_finish_mmu(tlb, address, end);
815 805 return end;
816 806 }
817 807  
... ... @@ -1434,13 +1424,6 @@
1434 1424  
1435 1425 restart_addr = zap_page_range(vma, start_addr,
1436 1426 end_addr - start_addr, details);
1437   -
1438   - /*
1439   - * We cannot rely on the break test in unmap_vmas:
1440   - * on the one hand, we don't want to restart our loop
1441   - * just because that broke out for the page_table_lock;
1442   - * on the other hand, it does no test when vma is small.
1443   - */
1444 1427 need_break = need_resched() ||
1445 1428 need_lockbreak(details->i_mmap_lock);
1446 1429  
mm/mmap.c
... ... @@ -1673,9 +1673,7 @@
1673 1673 lru_add_drain();
1674 1674 tlb = tlb_gather_mmu(mm, 0);
1675 1675 update_hiwater_rss(mm);
1676   - spin_lock(&mm->page_table_lock);
1677   - unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1678   - spin_unlock(&mm->page_table_lock);
  1676 + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1679 1677 vm_unacct_memory(nr_accounted);
1680 1678 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1681 1679 next? next->vm_start: 0);
... ... @@ -1958,9 +1956,7 @@
1958 1956 tlb = tlb_gather_mmu(mm, 1);
1959 1957 /* Don't update_hiwater_rss(mm) here, do_exit already did */
1960 1958 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1961   - spin_lock(&mm->page_table_lock);
1962   - end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1963   - spin_unlock(&mm->page_table_lock);
  1959 + end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
1964 1960 vm_unacct_memory(nr_accounted);
1965 1961 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
1966 1962 tlb_finish_mmu(tlb, 0, end);