Commit 3bf5ee95648c694bac4d13529563c230cd4fe5f2

Authored by Hugh Dickins
Committed by Linus Torvalds
Parent: ee39b37b23

[PATCH] freepgt: hugetlb_free_pgd_range

ia64 and ppc64 had hugetlb_free_pgtables functions which were no longer being
called, and it wasn't obvious what to do about them.

The ppc64 case turns out to be easy: the associated tables are noted elsewhere
and freed later, so it is safe either to skip its hugetlb areas or to go
through the motions of freeing nothing.  Since ia64 does need a special case,
restore to ppc64 the special case of skipping them.

The ia64 hugetlb case has been broken since pgd_addr_end went in, though it
probably appeared to work okay if you just had one such area; in fact it's
been broken much longer if you consider a long munmap spanning from another
region into the hugetlb region.

In the ia64 hugetlb region, more virtual address bits are available than in
the other regions, yet the page tables are structured the same way: the page
at the bottom is simply larger.  Here we need to scale down each addr before
passing it to the standard free_pgd_range.  I was about to write a
hugely_scaled_down macro, but found that htlbpage_to_page already exists for
just this purpose.  Also fixed an off-by-one in ia64's is_hugepage_only_range:
a range ending exactly at the top of the hugetlb region used to fail the
check, because (addr)+(len) is the first address of the next region; testing
(addr)+(len)-1 looks at the last byte actually inside the range.
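
For illustration, a minimal user-space sketch of that scaling under made-up
constants (the REGION_* values, the shifts and the scale_down() helper below
are simplified stand-ins, not the kernel's actual macros or htlbpage_to_page
itself; assumes 64-bit unsigned long).  It only shows the arithmetic: keep the
region bits, divide the offset within the region by HPAGE_SIZE/PAGE_SIZE, so
the nth huge page lands on the nth base page.

#include <stdio.h>

/* Toy stand-ins for the ia64 values; illustration only. */
#define PAGE_SHIFT	14UL		/* 16K base pages */
#define HPAGE_SHIFT	28UL		/* 256M huge pages */
#define REGION_SHIFT	61UL
#define REGION_HPAGE	4UL

#define REGION_NUMBER(a)	((unsigned long)(a) >> REGION_SHIFT)
#define REGION_OFFSET(a)	((unsigned long)(a) & ((1UL << REGION_SHIFT) - 1))

/*
 * Keep the region bits, shrink the offset by HPAGE_SIZE/PAGE_SIZE:
 * the same idea as htlbpage_to_page(), not its exact definition.
 */
static unsigned long scale_down(unsigned long addr)
{
	return (REGION_NUMBER(addr) << REGION_SHIFT) |
	       (REGION_OFFSET(addr) >> (HPAGE_SHIFT - PAGE_SHIFT));
}

int main(void)
{
	unsigned long base = REGION_HPAGE << REGION_SHIFT;
	unsigned long addr = base + (3UL << HPAGE_SHIFT);	/* 3rd huge page */

	/* The 3rd huge page scales down to the 3rd base page's offset. */
	printf("offset %#lx -> %#lx\n", addr - base, scale_down(addr) - base);
	return 0;
}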

Uninline free_pgd_range to make it available to ia64.  Make sure the
vma-gathering loop in free_pgtables cannot join a hugepage_only_range to any
other (joining adjacent huge ranges would probably be safe, but don't bother).
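
In outline, the reworked loop behaves like this toy user-space model (the
toy_vma type, TOY_PMD_SIZE and the printed names are made up; the real change
is in the free_pgtables() hunk at the end of the diff): runs of ordinary vmas
are still batched into one free_pgd_range call, while a hugepage-only vma
always goes to hugetlb_free_pgd_range on its own and is never joined to a
neighbour.

#include <stdio.h>
#include <stdbool.h>

/* Toy model of the free_pgtables() gathering logic; types are made up. */
struct toy_vma {
	unsigned long vm_start, vm_end;
	bool huge;			/* stands in for is_hugepage_only_range() */
	struct toy_vma *vm_next;
};

#define TOY_PMD_SIZE	0x200000UL

static void toy_free_pgtables(struct toy_vma *vma)
{
	while (vma) {
		struct toy_vma *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		if (vma->huge) {
			printf("hugetlb_free_pgd_range %#lx-%#lx\n",
			       addr, vma->vm_end);
		} else {
			/* Gather nearby ordinary vmas, but never a huge one. */
			while (next && next->vm_start <= vma->vm_end + TOY_PMD_SIZE
			       && !next->huge) {
				vma = next;
				next = vma->vm_next;
			}
			printf("free_pgd_range %#lx-%#lx\n", addr, vma->vm_end);
		}
		vma = next;
	}
}

int main(void)
{
	struct toy_vma c = { 0x300000, 0x10300000, true,  NULL };
	struct toy_vma b = { 0x200000, 0x300000,   false, &c };
	struct toy_vma a = { 0x100000, 0x200000,   false, &b };

	/* a and b are batched into one call; c is huge, so it stays separate. */
	toy_free_pgtables(&a);
	return 0;
}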

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 8 changed files with 65 additions and 38 deletions

arch/ia64/mm/hugetlbpage.c
@@ -186,13 +186,30 @@
 	return NULL;
 }
 
-/*
- * Do nothing, until we've worked out what to do! To allow build, we
- * must remove reference to clear_page_range since it no longer exists.
- */
-void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
-			unsigned long start, unsigned long end)
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
 {
+	/*
+	 * This is called only when is_hugepage_only_range(addr,),
+	 * and it follows that is_hugepage_only_range(end,) also.
+	 *
+	 * The offset of these addresses from the base of the hugetlb
+	 * region must be scaled down by HPAGE_SIZE/PAGE_SIZE so that
+	 * the standard free_pgd_range will free the right page tables.
+	 *
+	 * If floor and ceiling are also in the hugetlb region, they
+	 * must likewise be scaled down; but if outside, left unchanged.
+	 */
+
+	addr = htlbpage_to_page(addr);
+	end = htlbpage_to_page(end);
+	if (is_hugepage_only_range(tlb->mm, floor, HPAGE_SIZE))
+		floor = htlbpage_to_page(floor);
+	if (is_hugepage_only_range(tlb->mm, ceiling, HPAGE_SIZE))
+		ceiling = htlbpage_to_page(ceiling);
+
+	free_pgd_range(tlb, addr, end, floor, ceiling);
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
arch/ppc64/mm/hugetlbpage.c
@@ -430,16 +430,6 @@
 	flush_tlb_pending();
 }
 
-void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
-			unsigned long start, unsigned long end)
-{
-	/* Because the huge pgtables are only 2 level, they can take
-	 * at most around 4M, much less than one hugepage which the
-	 * process is presumably entitled to use. So we don't bother
-	 * freeing up the pagetables on unmap, and wait until
-	 * destroy_context() to clean up the lot. */
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
include/asm-ia64/page.h
@@ -139,7 +139,7 @@
 # define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 # define is_hugepage_only_range(mm, addr, len)		\
	 (REGION_NUMBER(addr) == REGION_HPAGE &&	\
-	  REGION_NUMBER((addr)+(len)) == REGION_HPAGE)
+	  REGION_NUMBER((addr)+(len)-1) == REGION_HPAGE)
 extern unsigned int hpage_shift;
 #endif
 
include/asm-ia64/pgtable.h
@@ -472,8 +472,8 @@
 #define HUGETLB_PGDIR_SIZE	(__IA64_UL(1) << HUGETLB_PGDIR_SHIFT)
 #define HUGETLB_PGDIR_MASK	(~(HUGETLB_PGDIR_SIZE-1))
 struct mmu_gather;
-extern void hugetlb_free_pgtables(struct mmu_gather *tlb,
-	struct vm_area_struct * prev, unsigned long start, unsigned long end);
+void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+		unsigned long end, unsigned long floor, unsigned long ceiling);
 #endif
 
 /*
include/asm-ppc64/pgtable.h
@@ -500,9 +500,15 @@
 
 extern void paging_init(void);
 
-struct mmu_gather;
-void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
-			unsigned long start, unsigned long end);
+/*
+ * Because the huge pgtables are only 2 level, they can take
+ * at most around 4M, much less than one hugepage which the
+ * process is presumably entitled to use. So we don't bother
+ * freeing up the pagetables on unmap, and wait until
+ * destroy_context() to clean up the lot.
+ */
+#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
+	do { } while (0)
 
 /*
  * This gets called at the end of handling a page fault, when
include/linux/hugetlb.h
@@ -37,7 +37,8 @@
 
 #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define is_hugepage_only_range(mm, addr, len)	0
-#define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
+#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
+						do { } while (0)
 #endif
 
 #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
@@ -72,7 +73,8 @@
 #define prepare_hugepage_range(addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
-#define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
+#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
+					do { } while (0)
 #define alloc_huge_page()	({ NULL; })
 #define free_huge_page(p)	({ (void)(p); BUG(); })
 
include/linux/mm.h
@@ -587,7 +587,9 @@
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+		unsigned long end, unsigned long floor, unsigned long ceiling);
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
mm/memory.c
@@ -190,7 +190,7 @@
  *
  * Must be called with pagetable lock held.
  */
-static inline void free_pgd_range(struct mmu_gather *tlb,
+void free_pgd_range(struct mmu_gather **tlb,
 			unsigned long addr, unsigned long end,
 			unsigned long floor, unsigned long ceiling)
 {
@@ -241,37 +241,47 @@
 		return;
 
 	start = addr;
-	pgd = pgd_offset(tlb->mm, addr);
+	pgd = pgd_offset((*tlb)->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 
-	if (!tlb_is_full_mm(tlb))
-		flush_tlb_pgtables(tlb->mm, start, end);
+	if (!tlb_is_full_mm(*tlb))
+		flush_tlb_pgtables((*tlb)->mm, start, end);
 }
 
 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
-		unsigned long floor, unsigned long ceiling)
+	unsigned long floor, unsigned long ceiling)
 {
 	while (vma) {
 		struct vm_area_struct *next = vma->vm_next;
 		unsigned long addr = vma->vm_start;
 
-		/* Optimization: gather nearby vmas into a single call down */
-		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
-			vma = next;
-			next = vma->vm_next;
-		}
-		free_pgd_range(*tlb, addr, vma->vm_end,
+		if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
+			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
+		} else {
+			/*
+			 * Optimization: gather nearby vmas into one call down
+			 */
+			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+			  && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
+							HPAGE_SIZE)) {
+				vma = next;
+				next = vma->vm_next;
+			}
+			free_pgd_range(tlb, addr, vma->vm_end,
+				floor, next? next->vm_start: ceiling);
+		}
 		vma = next;
 	}
 }
 
-pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
+			unsigned long address)
 {
 	if (!pmd_present(*pmd)) {
 		struct page *new;