Commit 97a894136f29802da19a15541de3c019e1ca147e

Authored by Peter Zijlstra
Committed by Linus Torvalds
1 parent e4c70a6629

mm: Remove i_mmap_lock lockbreak

Hugh says:
 "The only significant loser, I think, would be page reclaim (when
  concurrent with truncation): could spin for a long time waiting for
  the i_mmap_mutex it expects would soon be dropped? "

Counterpoints:
 - cpu contention makes the spin stop (need_resched())
 - zap pages should be freeing pages at a higher rate than reclaim
   ever can
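
Even with the lockbreak gone, the zap path still yields the CPU: the
mm/memory.c hunk below adds an explicit cond_resched() once per pmd in
zap_pmd_range(). A simplified sketch of the resulting loop (the
hugepage handling present in the real function is omitted here):

    static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                    struct vm_area_struct *vma, pud_t *pud,
                    unsigned long addr, unsigned long end,
                    struct zap_details *details)
    {
        pmd_t *pmd = pmd_offset(pud, addr);
        unsigned long next;

        do {
            next = pmd_addr_end(addr, end);
            if (pmd_none_or_clear_bad(pmd))
                continue;
            next = zap_pte_range(tlb, vma, pmd, addr, next, details);
            cond_resched();    /* replaces the old zap_work lockbreak */
        } while (pmd++, addr = next, addr != end);

        return addr;
    }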

I think the simplification of the truncate code is definitely worth it.

Effectively reverts: 2aa15890f3c ("mm: prevent concurrent
unmap_mapping_range() on the same inode") and takes out the code that
caused its problem.
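
With truncate_count, vm_truncate_count, unmap_mutex and the restart
logic gone, unmap_mapping_range() simply holds i_mmap_lock across the
whole prio tree walk and each vma is zapped in a single pass. Roughly,
the locking in unmap_mapping_range() reduces to the following sketch
(the zap_details setup earlier in the function is unchanged apart from
the dropped i_mmap_lock and truncate_count fields):

    spin_lock(&mapping->i_mmap_lock);
    if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
        unmap_mapping_range_tree(&mapping->i_mmap, &details);
    if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
        unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
    spin_unlock(&mapping->i_mmap_lock);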

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 8 changed files with 28 additions and 188 deletions

fs/inode.c
... ... @@ -331,7 +331,6 @@
331 331 spin_lock_init(&mapping->private_lock);
332 332 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
333 333 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
334   - mutex_init(&mapping->unmap_mutex);
335 334 }
336 335 EXPORT_SYMBOL(address_space_init_once);
337 336  
include/linux/fs.h
... ... @@ -635,7 +635,6 @@
635 635 struct prio_tree_root i_mmap; /* tree of private and shared mappings */
636 636 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
637 637 spinlock_t i_mmap_lock; /* protect tree, count, list */
638   - unsigned int truncate_count; /* Cover race condition with truncate */
639 638 unsigned long nrpages; /* number of total pages */
640 639 pgoff_t writeback_index;/* writeback starts here */
641 640 const struct address_space_operations *a_ops; /* methods */
... ... @@ -644,7 +643,6 @@
644 643 spinlock_t private_lock; /* for use by the address_space */
645 644 struct list_head private_list; /* ditto */
646 645 struct address_space *assoc_mapping; /* ditto */
647   - struct mutex unmap_mutex; /* to protect unmapping */
648 646 } __attribute__((aligned(sizeof(long))));
649 647 /*
650 648 * On most architectures that alignment is already the case; but
include/linux/mm.h
... ... @@ -895,8 +895,6 @@
895 895 struct address_space *check_mapping; /* Check page->mapping if set */
896 896 pgoff_t first_index; /* Lowest page->index to unmap */
897 897 pgoff_t last_index; /* Highest page->index to unmap */
898   - spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
899   - unsigned long truncate_count; /* Compare vm_truncate_count */
900 898 };
901 899  
902 900 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
include/linux/mm_types.h
... ... @@ -175,7 +175,6 @@
175 175 units, *not* PAGE_CACHE_SIZE */
176 176 struct file * vm_file; /* File we map to (can be NULL). */
177 177 void * vm_private_data; /* was vm_pte (shared mem) */
178   - unsigned long vm_truncate_count;/* truncate_count or restart_addr */
179 178  
180 179 #ifndef CONFIG_MMU
181 180 struct vm_region *vm_region; /* NOMMU mapping region */
kernel/fork.c
... ... @@ -386,7 +386,6 @@
386 386 spin_lock(&mapping->i_mmap_lock);
387 387 if (tmp->vm_flags & VM_SHARED)
388 388 mapping->i_mmap_writable++;
389   - tmp->vm_truncate_count = mpnt->vm_truncate_count;
390 389 flush_dcache_mmap_lock(mapping);
391 390 /* insert tmp into the share list, just after mpnt */
392 391 vma_prio_tree_add(tmp, mpnt);
mm/memory.c
... ... @@ -986,13 +986,13 @@
986 986 static unsigned long zap_pte_range(struct mmu_gather *tlb,
987 987 struct vm_area_struct *vma, pmd_t *pmd,
988 988 unsigned long addr, unsigned long end,
989   - long *zap_work, struct zap_details *details)
  989 + struct zap_details *details)
990 990 {
991 991 struct mm_struct *mm = tlb->mm;
992 992 int force_flush = 0;
993   - pte_t *pte;
994   - spinlock_t *ptl;
995 993 int rss[NR_MM_COUNTERS];
  994 + spinlock_t *ptl;
  995 + pte_t *pte;
996 996  
997 997 again:
998 998 init_rss_vec(rss);
999 999  
... ... @@ -1001,12 +1001,9 @@
1001 1001 do {
1002 1002 pte_t ptent = *pte;
1003 1003 if (pte_none(ptent)) {
1004   - (*zap_work)--;
1005 1004 continue;
1006 1005 }
1007 1006  
1008   - (*zap_work) -= PAGE_SIZE;
1009   -
1010 1007 if (pte_present(ptent)) {
1011 1008 struct page *page;
1012 1009  
... ... @@ -1075,7 +1072,7 @@
1075 1072 print_bad_pte(vma, addr, ptent, NULL);
1076 1073 }
1077 1074 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1078   - } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
  1075 + } while (pte++, addr += PAGE_SIZE, addr != end);
1079 1076  
1080 1077 add_mm_rss_vec(mm, rss);
1081 1078 arch_leave_lazy_mmu_mode();
... ... @@ -1099,7 +1096,7 @@
1099 1096 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1100 1097 struct vm_area_struct *vma, pud_t *pud,
1101 1098 unsigned long addr, unsigned long end,
1102   - long *zap_work, struct zap_details *details)
  1099 + struct zap_details *details)
1103 1100 {
1104 1101 pmd_t *pmd;
1105 1102 unsigned long next;
1106 1103  
1107 1104  
1108 1105  
... ... @@ -1111,19 +1108,15 @@
1111 1108 if (next-addr != HPAGE_PMD_SIZE) {
1112 1109 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1113 1110 split_huge_page_pmd(vma->vm_mm, pmd);
1114   - } else if (zap_huge_pmd(tlb, vma, pmd)) {
1115   - (*zap_work)--;
  1111 + } else if (zap_huge_pmd(tlb, vma, pmd))
1116 1112 continue;
1117   - }
1118 1113 /* fall through */
1119 1114 }
1120   - if (pmd_none_or_clear_bad(pmd)) {
1121   - (*zap_work)--;
  1115 + if (pmd_none_or_clear_bad(pmd))
1122 1116 continue;
1123   - }
1124   - next = zap_pte_range(tlb, vma, pmd, addr, next,
1125   - zap_work, details);
1126   - } while (pmd++, addr = next, (addr != end && *zap_work > 0));
  1117 + next = zap_pte_range(tlb, vma, pmd, addr, next, details);
  1118 + cond_resched();
  1119 + } while (pmd++, addr = next, addr != end);
1127 1120  
1128 1121 return addr;
1129 1122 }
... ... @@ -1131,7 +1124,7 @@
1131 1124 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1132 1125 struct vm_area_struct *vma, pgd_t *pgd,
1133 1126 unsigned long addr, unsigned long end,
1134   - long *zap_work, struct zap_details *details)
  1127 + struct zap_details *details)
1135 1128 {
1136 1129 pud_t *pud;
1137 1130 unsigned long next;
1138 1131  
... ... @@ -1139,13 +1132,10 @@
1139 1132 pud = pud_offset(pgd, addr);
1140 1133 do {
1141 1134 next = pud_addr_end(addr, end);
1142   - if (pud_none_or_clear_bad(pud)) {
1143   - (*zap_work)--;
  1135 + if (pud_none_or_clear_bad(pud))
1144 1136 continue;
1145   - }
1146   - next = zap_pmd_range(tlb, vma, pud, addr, next,
1147   - zap_work, details);
1148   - } while (pud++, addr = next, (addr != end && *zap_work > 0));
  1137 + next = zap_pmd_range(tlb, vma, pud, addr, next, details);
  1138 + } while (pud++, addr = next, addr != end);
1149 1139  
1150 1140 return addr;
1151 1141 }
... ... @@ -1153,7 +1143,7 @@
1153 1143 static unsigned long unmap_page_range(struct mmu_gather *tlb,
1154 1144 struct vm_area_struct *vma,
1155 1145 unsigned long addr, unsigned long end,
1156   - long *zap_work, struct zap_details *details)
  1146 + struct zap_details *details)
1157 1147 {
1158 1148 pgd_t *pgd;
1159 1149 unsigned long next;
1160 1150  
... ... @@ -1167,13 +1157,10 @@
1167 1157 pgd = pgd_offset(vma->vm_mm, addr);
1168 1158 do {
1169 1159 next = pgd_addr_end(addr, end);
1170   - if (pgd_none_or_clear_bad(pgd)) {
1171   - (*zap_work)--;
  1160 + if (pgd_none_or_clear_bad(pgd))
1172 1161 continue;
1173   - }
1174   - next = zap_pud_range(tlb, vma, pgd, addr, next,
1175   - zap_work, details);
1176   - } while (pgd++, addr = next, (addr != end && *zap_work > 0));
  1162 + next = zap_pud_range(tlb, vma, pgd, addr, next, details);
  1163 + } while (pgd++, addr = next, addr != end);
1177 1164 tlb_end_vma(tlb, vma);
1178 1165 mem_cgroup_uncharge_end();
1179 1166  
1180 1167  
... ... @@ -1218,9 +1205,7 @@
1218 1205 unsigned long end_addr, unsigned long *nr_accounted,
1219 1206 struct zap_details *details)
1220 1207 {
1221   - long zap_work = ZAP_BLOCK_SIZE;
1222 1208 unsigned long start = start_addr;
1223   - spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1224 1209 struct mm_struct *mm = vma->vm_mm;
1225 1210  
1226 1211 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1227 1212  
1228 1213  
1229 1214  
... ... @@ -1253,33 +1238,15 @@
1253 1238 * Since no pte has actually been setup, it is
1254 1239 * safe to do nothing in this case.
1255 1240 */
1256   - if (vma->vm_file) {
  1241 + if (vma->vm_file)
1257 1242 unmap_hugepage_range(vma, start, end, NULL);
1258   - zap_work -= (end - start) /
1259   - pages_per_huge_page(hstate_vma(vma));
1260   - }
1261 1243  
1262 1244 start = end;
1263 1245 } else
1264   - start = unmap_page_range(tlb, vma,
1265   - start, end, &zap_work, details);
1266   -
1267   - if (zap_work > 0) {
1268   - BUG_ON(start != end);
1269   - break;
1270   - }
1271   -
1272   - if (need_resched() ||
1273   - (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1274   - if (i_mmap_lock)
1275   - goto out;
1276   - cond_resched();
1277   - }
1278   -
1279   - zap_work = ZAP_BLOCK_SIZE;
  1246 + start = unmap_page_range(tlb, vma, start, end, details);
1280 1247 }
1281 1248 }
1282   -out:
  1249 +
1283 1250 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1284 1251 return start; /* which is now the end (or restart) address */
1285 1252 }
1286 1253  
... ... @@ -2612,96 +2579,11 @@
2612 2579 return ret;
2613 2580 }
2614 2581  
2615   -/*
2616   - * Helper functions for unmap_mapping_range().
2617   - *
2618   - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2619   - *
2620   - * We have to restart searching the prio_tree whenever we drop the lock,
2621   - * since the iterator is only valid while the lock is held, and anyway
2622   - * a later vma might be split and reinserted earlier while lock dropped.
2623   - *
2624   - * The list of nonlinear vmas could be handled more efficiently, using
2625   - * a placeholder, but handle it in the same way until a need is shown.
2626   - * It is important to search the prio_tree before nonlinear list: a vma
2627   - * may become nonlinear and be shifted from prio_tree to nonlinear list
2628   - * while the lock is dropped; but never shifted from list to prio_tree.
2629   - *
2630   - * In order to make forward progress despite restarting the search,
2631   - * vm_truncate_count is used to mark a vma as now dealt with, so we can
2632   - * quickly skip it next time around. Since the prio_tree search only
2633   - * shows us those vmas affected by unmapping the range in question, we
2634   - * can't efficiently keep all vmas in step with mapping->truncate_count:
2635   - * so instead reset them all whenever it wraps back to 0 (then go to 1).
2636   - * mapping->truncate_count and vma->vm_truncate_count are protected by
2637   - * i_mmap_lock.
2638   - *
2639   - * In order to make forward progress despite repeatedly restarting some
2640   - * large vma, note the restart_addr from unmap_vmas when it breaks out:
2641   - * and restart from that address when we reach that vma again. It might
2642   - * have been split or merged, shrunk or extended, but never shifted: so
2643   - * restart_addr remains valid so long as it remains in the vma's range.
2644   - * unmap_mapping_range forces truncate_count to leap over page-aligned
2645   - * values so we can save vma's restart_addr in its truncate_count field.
2646   - */
2647   -#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2648   -
2649   -static void reset_vma_truncate_counts(struct address_space *mapping)
2650   -{
2651   - struct vm_area_struct *vma;
2652   - struct prio_tree_iter iter;
2653   -
2654   - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2655   - vma->vm_truncate_count = 0;
2656   - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2657   - vma->vm_truncate_count = 0;
2658   -}
2659   -
2660   -static int unmap_mapping_range_vma(struct vm_area_struct *vma,
  2582 +static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2661 2583 unsigned long start_addr, unsigned long end_addr,
2662 2584 struct zap_details *details)
2663 2585 {
2664   - unsigned long restart_addr;
2665   - int need_break;
2666   -
2667   - /*
2668   - * files that support invalidating or truncating portions of the
2669   - * file from under mmaped areas must have their ->fault function
2670   - * return a locked page (and set VM_FAULT_LOCKED in the return).
2671   - * This provides synchronisation against concurrent unmapping here.
2672   - */
2673   -
2674   -again:
2675   - restart_addr = vma->vm_truncate_count;
2676   - if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2677   - start_addr = restart_addr;
2678   - if (start_addr >= end_addr) {
2679   - /* Top of vma has been split off since last time */
2680   - vma->vm_truncate_count = details->truncate_count;
2681   - return 0;
2682   - }
2683   - }
2684   -
2685   - restart_addr = zap_page_range(vma, start_addr,
2686   - end_addr - start_addr, details);
2687   - need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2688   -
2689   - if (restart_addr >= end_addr) {
2690   - /* We have now completed this vma: mark it so */
2691   - vma->vm_truncate_count = details->truncate_count;
2692   - if (!need_break)
2693   - return 0;
2694   - } else {
2695   - /* Note restart_addr in vma's truncate_count field */
2696   - vma->vm_truncate_count = restart_addr;
2697   - if (!need_break)
2698   - goto again;
2699   - }
2700   -
2701   - spin_unlock(details->i_mmap_lock);
2702   - cond_resched();
2703   - spin_lock(details->i_mmap_lock);
2704   - return -EINTR;
  2586 + zap_page_range(vma, start_addr, end_addr - start_addr, details);
2705 2587 }
2706 2588  
2707 2589 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2708 2590  
... ... @@ -2711,12 +2593,8 @@
2711 2593 struct prio_tree_iter iter;
2712 2594 pgoff_t vba, vea, zba, zea;
2713 2595  
2714   -restart:
2715 2596 vma_prio_tree_foreach(vma, &iter, root,
2716 2597 details->first_index, details->last_index) {
2717   - /* Skip quickly over those we have already dealt with */
2718   - if (vma->vm_truncate_count == details->truncate_count)
2719   - continue;
2720 2598  
2721 2599 vba = vma->vm_pgoff;
2722 2600 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2723 2601  
... ... @@ -2728,11 +2606,10 @@
2728 2606 if (zea > vea)
2729 2607 zea = vea;
2730 2608  
2731   - if (unmap_mapping_range_vma(vma,
  2609 + unmap_mapping_range_vma(vma,
2732 2610 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2733 2611 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2734   - details) < 0)
2735   - goto restart;
  2612 + details);
2736 2613 }
2737 2614 }
2738 2615  
2739 2616  
2740 2617  
... ... @@ -2747,15 +2624,9 @@
2747 2624 * across *all* the pages in each nonlinear VMA, not just the pages
2748 2625 * whose virtual address lies outside the file truncation point.
2749 2626 */
2750   -restart:
2751 2627 list_for_each_entry(vma, head, shared.vm_set.list) {
2752   - /* Skip quickly over those we have already dealt with */
2753   - if (vma->vm_truncate_count == details->truncate_count)
2754   - continue;
2755 2628 details->nonlinear_vma = vma;
2756   - if (unmap_mapping_range_vma(vma, vma->vm_start,
2757   - vma->vm_end, details) < 0)
2758   - goto restart;
  2629 + unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2759 2630 }
2760 2631 }
2761 2632  
2762 2633  
2763 2634  
2764 2635  
... ... @@ -2794,26 +2665,14 @@
2794 2665 details.last_index = hba + hlen - 1;
2795 2666 if (details.last_index < details.first_index)
2796 2667 details.last_index = ULONG_MAX;
2797   - details.i_mmap_lock = &mapping->i_mmap_lock;
2798 2668  
2799   - mutex_lock(&mapping->unmap_mutex);
2800   - spin_lock(&mapping->i_mmap_lock);
2801 2669  
2802   - /* Protect against endless unmapping loops */
2803   - mapping->truncate_count++;
2804   - if (unlikely(is_restart_addr(mapping->truncate_count))) {
2805   - if (mapping->truncate_count == 0)
2806   - reset_vma_truncate_counts(mapping);
2807   - mapping->truncate_count++;
2808   - }
2809   - details.truncate_count = mapping->truncate_count;
2810   -
  2670 + spin_lock(&mapping->i_mmap_lock);
2811 2671 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2812 2672 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2813 2673 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2814 2674 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2815 2675 spin_unlock(&mapping->i_mmap_lock);
2816   - mutex_unlock(&mapping->unmap_mutex);
2817 2676 }
2818 2677 EXPORT_SYMBOL(unmap_mapping_range);
2819 2678  
mm/mmap.c
... ... @@ -445,10 +445,8 @@
445 445 if (vma->vm_file)
446 446 mapping = vma->vm_file->f_mapping;
447 447  
448   - if (mapping) {
  448 + if (mapping)
449 449 spin_lock(&mapping->i_mmap_lock);
450   - vma->vm_truncate_count = mapping->truncate_count;
451   - }
452 450  
453 451 __vma_link(mm, vma, prev, rb_link, rb_parent);
454 452 __vma_link_file(vma);
455 453  
... ... @@ -558,16 +556,7 @@
558 556 if (!(vma->vm_flags & VM_NONLINEAR))
559 557 root = &mapping->i_mmap;
560 558 spin_lock(&mapping->i_mmap_lock);
561   - if (importer &&
562   - vma->vm_truncate_count != next->vm_truncate_count) {
563   - /*
564   - * unmap_mapping_range might be in progress:
565   - * ensure that the expanding vma is rescanned.
566   - */
567   - importer->vm_truncate_count = 0;
568   - }
569 559 if (insert) {
570   - insert->vm_truncate_count = vma->vm_truncate_count;
571 560 /*
572 561 * Put into prio_tree now, so instantiated pages
573 562 * are visible to arm/parisc __flush_dcache_page
mm/mremap.c
... ... @@ -94,7 +94,6 @@
94 94 */
95 95 mapping = vma->vm_file->f_mapping;
96 96 spin_lock(&mapping->i_mmap_lock);
97   - new_vma->vm_truncate_count = 0;
98 97 }
99 98  
100 99 /*