Commit e0da382c92626ad1d7f4b7527d19b80104d67a83

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 9f6c6fc505

[PATCH] freepgt: free_pgtables use vma list

Recent woes with some arches needing their own pgd_addr_end macro; the 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing, well-known inefficiency of searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.

Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; have unmap_region use it in the
same way, giving it a floor and ceiling beyond which it may not free tables.
This brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is
enabled, in which case latency fixes spoil unmap_vmas throughput).
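
To illustrate the new calling convention, here is a minimal standalone sketch
(userspace C, not kernel code; the struct, the PMD_SIZE value and the addresses
are made up for the example) of how free_pgtables walks the vma list, gathers
neighbouring vmas, and derives each call's floor and ceiling:

#include <stdio.h>

#define PMD_SIZE  (1UL << 21)	/* illustrative pmd span */

struct vma {
	unsigned long vm_start, vm_end;
	struct vma *vm_next;
};

/* Mirrors the free_pgtables() loop added in mm/memory.c below */
static void walk(struct vma *vma, unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vma *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/* gather nearby vmas into a single call down */
		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
			vma = next;
			next = vma->vm_next;
		}
		printf("free range %#lx..%#lx  floor %#lx  ceiling %#lx\n",
			addr, vma->vm_end,
			floor, next ? next->vm_start : ceiling);
		vma = next;
	}
}

int main(void)
{
	struct vma c = { 0x40000000, 0x40200000, NULL };
	struct vma b = { 0x00600000, 0x00800000, &c };
	struct vma a = { 0x00400000, 0x00500000, &b };

	walk(&a, 0, 0);	/* exit_mmap-style call: whole address space */
	return 0;
}

In the patch itself, unmap_region passes prev->vm_end as floor and the
following vma's vm_start as ceiling, while exit_mmap passes 0 for both,
meaning bottom and top of the address space; so a table still shared with a
neighbouring mapping is never freed.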

Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
it can only be freed while it is touched by some vma.
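
The change itself is in the mm/mmap.c hunk below; restated briefly, the
failure path goes from clearing ptes to undoing the whole region:

	/* old: zap_page_range() cleared the ptes but left behind any
	 * page table just allocated for this vma */
	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);

	/* new: unmap_region() also calls free_pgtables() while the vma
	 * still covers the range, so the table can be freed too */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	charged = 0;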

Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels.  (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt is made to do so
yet: going by vma should itself reduce latency.
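
The new declaration (as in the include/linux/mm.h hunk below) takes the
mmu_gather by reference so that a later patch could drop the page_table_lock
mid-walk and hand a fresh gather back through the same pointer; this patch
itself never reassigns *tlb:

void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling);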

But what if is_hugepage_only_range?  Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.

What of the 32-bit vdso page which x86_64's __map_syscall32 maps outside any vma?

And what of the range passed to sparc64's flush_tlb_pgtables?  It's less clear
to me now that we need to do more than is done here: every PMD_SIZE ever
occupied will be flushed, but do we really have to flush every PGDIR_SIZE ever
partially occupied?  A shame to complicate it unnecessarily.

Special thanks to David Miller for time spent repairing my ceilings.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

5 changed files with 141 additions and 155 deletions

arch/i386/mm/pgtable.c
... ... @@ -255,7 +255,7 @@
255 255 if (PTRS_PER_PMD > 1)
256 256 for (i = 0; i < USER_PTRS_PER_PGD; ++i)
257 257 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
258   - /* in the non-PAE case, clear_page_range() clears user pgd entries */
  258 + /* in the non-PAE case, free_pgtables() clears user pgd entries */
259 259 kmem_cache_free(pgd_cache, pgd);
260 260 }
arch/ia64/mm/hugetlbpage.c
... ... @@ -187,45 +187,12 @@
187 187 }
188 188  
189 189 /*
190   - * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
191   - * are hugetlb region specific.
  190 + * Do nothing, until we've worked out what to do! To allow build, we
  191 + * must remove reference to clear_page_range since it no longer exists.
192 192 */
193 193 void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
194 194 unsigned long start, unsigned long end)
195 195 {
196   - unsigned long first = start & HUGETLB_PGDIR_MASK;
197   - unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
198   - struct mm_struct *mm = tlb->mm;
199   -
200   - if (!prev) {
201   - prev = mm->mmap;
202   - if (!prev)
203   - goto no_mmaps;
204   - if (prev->vm_end > start) {
205   - if (last > prev->vm_start)
206   - last = prev->vm_start;
207   - goto no_mmaps;
208   - }
209   - }
210   - for (;;) {
211   - struct vm_area_struct *next = prev->vm_next;
212   -
213   - if (next) {
214   - if (next->vm_start < start) {
215   - prev = next;
216   - continue;
217   - }
218   - if (last > next->vm_start)
219   - last = next->vm_start;
220   - }
221   - if (prev->vm_end > first)
222   - first = prev->vm_end;
223   - break;
224   - }
225   -no_mmaps:
226   - if (last < first) /* for arches with discontiguous pgd indices */
227   - return;
228   - clear_page_range(tlb, first, last);
229 196 }
230 197  
231 198 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
include/linux/mm.h
... ... @@ -592,7 +592,8 @@
592 592 struct vm_area_struct *start_vma, unsigned long start_addr,
593 593 unsigned long end_addr, unsigned long *nr_accounted,
594 594 struct zap_details *);
595   -void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end);
  595 +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
  596 + unsigned long floor, unsigned long ceiling);
596 597 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
597 598 struct vm_area_struct *vma);
598 599 int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
mm/memory.c
... ... @@ -110,87 +110,165 @@
110 110 * Note: this doesn't free the actual pages themselves. That
111 111 * has been handled earlier when unmapping all the memory regions.
112 112 */
113   -static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
114   - unsigned long addr, unsigned long end)
  113 +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
115 114 {
116   - if (!((addr | end) & ~PMD_MASK)) {
117   - /* Only free fully aligned ranges */
118   - struct page *page = pmd_page(*pmd);
119   - pmd_clear(pmd);
120   - dec_page_state(nr_page_table_pages);
121   - tlb->mm->nr_ptes--;
122   - pte_free_tlb(tlb, page);
123   - }
  115 + struct page *page = pmd_page(*pmd);
  116 + pmd_clear(pmd);
  117 + pte_free_tlb(tlb, page);
  118 + dec_page_state(nr_page_table_pages);
  119 + tlb->mm->nr_ptes--;
124 120 }
125 121  
126   -static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
127   - unsigned long addr, unsigned long end)
  122 +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
  123 + unsigned long addr, unsigned long end,
  124 + unsigned long floor, unsigned long ceiling)
128 125 {
129 126 pmd_t *pmd;
130 127 unsigned long next;
131   - pmd_t *empty_pmd = NULL;
  128 + unsigned long start;
132 129  
  130 + start = addr;
133 131 pmd = pmd_offset(pud, addr);
134   -
135   - /* Only free fully aligned ranges */
136   - if (!((addr | end) & ~PUD_MASK))
137   - empty_pmd = pmd;
138 132 do {
139 133 next = pmd_addr_end(addr, end);
140 134 if (pmd_none_or_clear_bad(pmd))
141 135 continue;
142   - clear_pte_range(tlb, pmd, addr, next);
  136 + free_pte_range(tlb, pmd);
143 137 } while (pmd++, addr = next, addr != end);
144 138  
145   - if (empty_pmd) {
146   - pud_clear(pud);
147   - pmd_free_tlb(tlb, empty_pmd);
  139 + start &= PUD_MASK;
  140 + if (start < floor)
  141 + return;
  142 + if (ceiling) {
  143 + ceiling &= PUD_MASK;
  144 + if (!ceiling)
  145 + return;
148 146 }
  147 + if (end - 1 > ceiling - 1)
  148 + return;
  149 +
  150 + pmd = pmd_offset(pud, start);
  151 + pud_clear(pud);
  152 + pmd_free_tlb(tlb, pmd);
149 153 }
150 154  
151   -static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
152   - unsigned long addr, unsigned long end)
  155 +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  156 + unsigned long addr, unsigned long end,
  157 + unsigned long floor, unsigned long ceiling)
153 158 {
154 159 pud_t *pud;
155 160 unsigned long next;
156   - pud_t *empty_pud = NULL;
  161 + unsigned long start;
157 162  
  163 + start = addr;
158 164 pud = pud_offset(pgd, addr);
159   -
160   - /* Only free fully aligned ranges */
161   - if (!((addr | end) & ~PGDIR_MASK))
162   - empty_pud = pud;
163 165 do {
164 166 next = pud_addr_end(addr, end);
165 167 if (pud_none_or_clear_bad(pud))
166 168 continue;
167   - clear_pmd_range(tlb, pud, addr, next);
  169 + free_pmd_range(tlb, pud, addr, next, floor, ceiling);
168 170 } while (pud++, addr = next, addr != end);
169 171  
170   - if (empty_pud) {
171   - pgd_clear(pgd);
172   - pud_free_tlb(tlb, empty_pud);
  172 + start &= PGDIR_MASK;
  173 + if (start < floor)
  174 + return;
  175 + if (ceiling) {
  176 + ceiling &= PGDIR_MASK;
  177 + if (!ceiling)
  178 + return;
173 179 }
  180 + if (end - 1 > ceiling - 1)
  181 + return;
  182 +
  183 + pud = pud_offset(pgd, start);
  184 + pgd_clear(pgd);
  185 + pud_free_tlb(tlb, pud);
174 186 }
175 187  
176 188 /*
177   - * This function clears user-level page tables of a process.
178   - * Unlike other pagetable walks, some memory layouts might give end 0.
  189 + * This function frees user-level page tables of a process.
  190 + *
179 191 * Must be called with pagetable lock held.
180 192 */
181   -void clear_page_range(struct mmu_gather *tlb,
182   - unsigned long addr, unsigned long end)
  193 +static inline void free_pgd_range(struct mmu_gather *tlb,
  194 + unsigned long addr, unsigned long end,
  195 + unsigned long floor, unsigned long ceiling)
183 196 {
184 197 pgd_t *pgd;
185 198 unsigned long next;
  199 + unsigned long start;
186 200  
  201 + /*
  202 + * The next few lines have given us lots of grief...
  203 + *
  204 + * Why are we testing PMD* at this top level? Because often
  205 + * there will be no work to do at all, and we'd prefer not to
  206 + * go all the way down to the bottom just to discover that.
  207 + *
  208 + * Why all these "- 1"s? Because 0 represents both the bottom
  209 + * of the address space and the top of it (using -1 for the
  210 + * top wouldn't help much: the masks would do the wrong thing).
  211 + * The rule is that addr 0 and floor 0 refer to the bottom of
  212 + * the address space, but end 0 and ceiling 0 refer to the top
  213 + * Comparisons need to use "end - 1" and "ceiling - 1" (though
  214 + * that end 0 case should be mythical).
  215 + *
  216 + * Wherever addr is brought up or ceiling brought down, we must
  217 + * be careful to reject "the opposite 0" before it confuses the
  218 + * subsequent tests. But what about where end is brought down
  219 + * by PMD_SIZE below? no, end can't go down to 0 there.
  220 + *
  221 + * Whereas we round start (addr) and ceiling down, by different
  222 + * masks at different levels, in order to test whether a table
  223 + * now has no other vmas using it, so can be freed, we don't
  224 + * bother to round floor or end up - the tests don't need that.
  225 + */
  226 +
  227 + addr &= PMD_MASK;
  228 + if (addr < floor) {
  229 + addr += PMD_SIZE;
  230 + if (!addr)
  231 + return;
  232 + }
  233 + if (ceiling) {
  234 + ceiling &= PMD_MASK;
  235 + if (!ceiling)
  236 + return;
  237 + }
  238 + if (end - 1 > ceiling - 1)
  239 + end -= PMD_SIZE;
  240 + if (addr > end - 1)
  241 + return;
  242 +
  243 + start = addr;
187 244 pgd = pgd_offset(tlb->mm, addr);
188 245 do {
189 246 next = pgd_addr_end(addr, end);
190 247 if (pgd_none_or_clear_bad(pgd))
191 248 continue;
192   - clear_pud_range(tlb, pgd, addr, next);
  249 + free_pud_range(tlb, pgd, addr, next, floor, ceiling);
193 250 } while (pgd++, addr = next, addr != end);
  251 +
  252 + if (!tlb_is_full_mm(tlb))
  253 + flush_tlb_pgtables(tlb->mm, start, end);
  254 +}
  255 +
  256 +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
  257 + unsigned long floor, unsigned long ceiling)
  258 +{
  259 + while (vma) {
  260 + struct vm_area_struct *next = vma->vm_next;
  261 + unsigned long addr = vma->vm_start;
  262 +
  263 + /* Optimization: gather nearby vmas into a single call down */
  264 + while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
  265 + vma = next;
  266 + next = vma->vm_next;
  267 + }
  268 + free_pgd_range(*tlb, addr, vma->vm_end,
  269 + floor, next? next->vm_start: ceiling);
  270 + vma = next;
  271 + }
194 272 }
195 273  
196 274 pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
mm/mmap.c
... ... @@ -29,6 +29,10 @@
29 29 #include <asm/cacheflush.h>
30 30 #include <asm/tlb.h>
31 31  
  32 +static void unmap_region(struct mm_struct *mm,
  33 + struct vm_area_struct *vma, struct vm_area_struct *prev,
  34 + unsigned long start, unsigned long end);
  35 +
32 36 /*
33 37 * WARNING: the debugging will use recursive algorithms so never enable this
34 38 * unless you know what you are doing.
... ... @@ -1129,7 +1133,8 @@
1129 1133 fput(file);
1130 1134  
1131 1135 /* Undo any partial mapping done by a device driver. */
1132   - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
  1136 + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
  1137 + charged = 0;
1133 1138 free_vma:
1134 1139 kmem_cache_free(vm_area_cachep, vma);
1135 1140 unacct_error:
... ... @@ -1572,66 +1577,6 @@
1572 1577 }
1573 1578 #endif
1574 1579  
1575   -/*
1576   - * Try to free as many page directory entries as we can,
1577   - * without having to work very hard at actually scanning
1578   - * the page tables themselves.
1579   - *
1580   - * Right now we try to free page tables if we have a nice
1581   - * PGDIR-aligned area that got free'd up. We could be more
1582   - * granular if we want to, but this is fast and simple,
1583   - * and covers the bad cases.
1584   - *
1585   - * "prev", if it exists, points to a vma before the one
1586   - * we just free'd - but there's no telling how much before.
1587   - */
1588   -static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
1589   - unsigned long start, unsigned long end)
1590   -{
1591   - unsigned long first = start & PGDIR_MASK;
1592   - unsigned long last = end + PGDIR_SIZE - 1;
1593   - struct mm_struct *mm = tlb->mm;
1594   -
1595   - if (last > MM_VM_SIZE(mm) || last < end)
1596   - last = MM_VM_SIZE(mm);
1597   -
1598   - if (!prev) {
1599   - prev = mm->mmap;
1600   - if (!prev)
1601   - goto no_mmaps;
1602   - if (prev->vm_end > start) {
1603   - if (last > prev->vm_start)
1604   - last = prev->vm_start;
1605   - goto no_mmaps;
1606   - }
1607   - }
1608   - for (;;) {
1609   - struct vm_area_struct *next = prev->vm_next;
1610   -
1611   - if (next) {
1612   - if (next->vm_start < start) {
1613   - prev = next;
1614   - continue;
1615   - }
1616   - if (last > next->vm_start)
1617   - last = next->vm_start;
1618   - }
1619   - if (prev->vm_end > first)
1620   - first = prev->vm_end;
1621   - break;
1622   - }
1623   -no_mmaps:
1624   - if (last < first) /* for arches with discontiguous pgd indices */
1625   - return;
1626   - if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
1627   - first = FIRST_USER_PGD_NR * PGDIR_SIZE;
1628   - /* No point trying to free anything if we're in the same pte page */
1629   - if ((first & PMD_MASK) < (last & PMD_MASK)) {
1630   - clear_page_range(tlb, first, last);
1631   - flush_tlb_pgtables(mm, first, last);
1632   - }
1633   -}
1634   -
1635 1580 /* Normal function to fix up a mapping
1636 1581 * This function is the default for when an area has no specific
1637 1582 * function. This may be used as part of a more specific routine.
1638 1583  
1639 1584  
1640 1585  
1641 1586  
... ... @@ -1674,24 +1619,22 @@
1674 1619 * Called with the page table lock held.
1675 1620 */
1676 1621 static void unmap_region(struct mm_struct *mm,
1677   - struct vm_area_struct *vma,
1678   - struct vm_area_struct *prev,
1679   - unsigned long start,
1680   - unsigned long end)
  1622 + struct vm_area_struct *vma, struct vm_area_struct *prev,
  1623 + unsigned long start, unsigned long end)
1681 1624 {
  1625 + struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1682 1626 struct mmu_gather *tlb;
1683 1627 unsigned long nr_accounted = 0;
1684 1628  
1685 1629 lru_add_drain();
  1630 + spin_lock(&mm->page_table_lock);
1686 1631 tlb = tlb_gather_mmu(mm, 0);
1687 1632 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1688 1633 vm_unacct_memory(nr_accounted);
1689   -
1690   - if (is_hugepage_only_range(mm, start, end - start))
1691   - hugetlb_free_pgtables(tlb, prev, start, end);
1692   - else
1693   - free_pgtables(tlb, prev, start, end);
  1634 + free_pgtables(&tlb, vma, prev? prev->vm_end: 0,
  1635 + next? next->vm_start: 0);
1694 1636 tlb_finish_mmu(tlb, start, end);
  1637 + spin_unlock(&mm->page_table_lock);
1695 1638 }
1696 1639  
1697 1640 /*
1698 1641  
... ... @@ -1823,9 +1766,7 @@
1823 1766 * Remove the vma's, and unmap the actual pages
1824 1767 */
1825 1768 detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
1826   - spin_lock(&mm->page_table_lock);
1827 1769 unmap_region(mm, mpnt, prev, start, end);
1828   - spin_unlock(&mm->page_table_lock);
1829 1770  
1830 1771 /* Fix up all other VM information */
1831 1772 unmap_vma_list(mm, mpnt);
1832 1773  
1833 1774  
1834 1775  
1835 1776  
... ... @@ -1957,25 +1898,21 @@
1957 1898 void exit_mmap(struct mm_struct *mm)
1958 1899 {
1959 1900 struct mmu_gather *tlb;
1960   - struct vm_area_struct *vma;
  1901 + struct vm_area_struct *vma = mm->mmap;
1961 1902 unsigned long nr_accounted = 0;
1962 1903  
1963 1904 lru_add_drain();
1964 1905  
1965 1906 spin_lock(&mm->page_table_lock);
1966 1907  
1967   - tlb = tlb_gather_mmu(mm, 1);
1968 1908 flush_cache_mm(mm);
1969   - /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
1970   - mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
1971   - ~0UL, &nr_accounted, NULL);
  1909 + tlb = tlb_gather_mmu(mm, 1);
  1910 + /* Use -1 here to ensure all VMAs in the mm are unmapped */
  1911 + mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1972 1912 vm_unacct_memory(nr_accounted);
1973   - BUG_ON(mm->map_count); /* This is just debugging */
1974   - clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
1975   -
  1913 + free_pgtables(&tlb, vma, 0, 0);
1976 1914 tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
1977 1915  
1978   - vma = mm->mmap;
1979 1916 mm->mmap = mm->mmap_cache = NULL;
1980 1917 mm->mm_rb = RB_ROOT;
1981 1918 set_mm_counter(mm, rss, 0);
... ... @@ -1993,6 +1930,9 @@
1993 1930 remove_vm_struct(vma);
1994 1931 vma = next;
1995 1932 }
  1933 +
  1934 + BUG_ON(mm->map_count); /* This is just debugging */
  1935 + BUG_ON(mm->nr_ptes); /* This is just debugging */
1996 1936 }
1997 1937  
1998 1938 /* Insert vm structure into process list sorted by address