Commit 51c6f666fceb3184eeff045dad4432b602cd648e

Authored by Robin Holt
Committed by Linus Torvalds
1 parent 885036d32f

[PATCH] mm: ZAP_BLOCK causes redundant work

The address-based work estimate for unmapping (for lockbreak) is, and always
has been, horribly inefficient for sparse mappings.  The problem is most
simply explained with an example:

If we find a pgd is clear, we still have to call into unmap_page_range
PGDIR_SIZE / ZAP_BLOCK_SIZE times, each time re-checking the same clear pgd,
just to advance the working address to the next pgd.

The fundamental way to solve the problem is to keep track of the end
address we've processed and pass it back to the higher layers.
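
For a sense of scale, a minimal sketch of what one empty pgd costs under the
old scheme (the constants are assumptions for illustration: 4K pages, the
CONFIG_PREEMPT=y ZAP_BLOCK_SIZE of 8 * PAGE_SIZE, and a 1GB PGDIR_SIZE;
PGDIR_SIZE is architecture dependent):

	/* Illustration only, not kernel code: count the unmap_page_range()
	 * calls that a single clear pgd costs with the address-based
	 * work estimate. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long page_size = 4096UL;		/* assumed 4K pages */
		unsigned long zap_block_size = 8 * page_size;	/* assumed CONFIG_PREEMPT=y value */
		unsigned long pgdir_size = 1UL << 30;		/* assumed 1GB per pgd entry */

		/* Old scheme: one call per ZAP_BLOCK_SIZE of address space,
		 * each call re-reading the same clear pgd. */
		printf("calls per empty pgd: %lu\n", pgdir_size / zap_block_size);

		/* With the processed end address passed back, the caller can
		 * skip the whole empty pgd in a single call. */
		return 0;
	}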

From: Nick Piggin <npiggin@suse.de>

  A modification to get completely away from the address-based work
  estimate and instead use an abstract count, with a very small cost for
  empty entries as opposed to present pages.

  On 2.6.14-git2, ppc64, and CONFIG_PREEMPT=y, mapping and unmapping 1TB
  of virtual address space takes 1.69s; with the following patch applied,
  this operation can be done 1000 times in less than 0.01s.
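
For scale: the abstract count charges a present page PAGE_SIZE units of
zap_work and an empty entry a single unit (see zap_pte_range() in the diff
below), so one batch covers only a handful of present pages but tens of
thousands of holes.  A quick back-of-the-envelope, assuming 4K pages and the
CONFIG_PREEMPT=y value of ZAP_BLOCK_SIZE (8 * PAGE_SIZE):

	/* Illustration only: how far one ZAP_BLOCK_SIZE budget stretches. */
	#include <stdio.h>

	int main(void)
	{
		long page_size = 4096;			/* assumed 4K pages */
		long zap_block = 8 * page_size;		/* assumed CONFIG_PREEMPT=y budget */

		printf("present pages per batch: %ld\n", zap_block / page_size);	/* 8 */
		printf("empty entries per batch: %ld\n", zap_block / 1);		/* 32768 */
		return 0;
	}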

From: Andrew Morton <akpm@osdl.org>

With CONFIG_HUGETLB_PAGE=n:

mm/memory.c: In function `unmap_vmas':
mm/memory.c:779: warning: division by zero

Due to

			zap_work -= (end - start) /
					(HPAGE_SIZE / PAGE_SIZE);

So make the dummy HPAGE_SIZE non-zero.
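
Spelled out, with the dummy values from the hugetlb.h hunk below (the hugetlb
branch is unreachable when CONFIG_HUGETLB_PAGE=n, so only the compiler ever
sees it):

	/* Old dummies:  HPAGE_SIZE == 0
	 *     HPAGE_SIZE / PAGE_SIZE  evaluates to 0
	 *     zap_work -= (end - start) / 0;      -> "division by zero" warning
	 *
	 * New dummies:  HPAGE_SIZE == PAGE_SIZE, HPAGE_MASK == PAGE_MASK
	 *     HPAGE_SIZE / PAGE_SIZE  evaluates to 1
	 *     zap_work -= (end - start) / 1;      -> compiles cleanly
	 */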

Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 57 additions and 36 deletions

include/linux/hugetlb.h
... ... @@ -102,8 +102,8 @@
102 102 #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
103 103  
104 104 #ifndef HPAGE_MASK
105   -#define HPAGE_MASK 0 /* Keep the compiler happy */
106   -#define HPAGE_SIZE 0
  105 +#define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */
  106 +#define HPAGE_SIZE PAGE_SIZE
107 107 #endif
108 108  
109 109 #endif /* !CONFIG_HUGETLB_PAGE */

mm/memory.c
... ... @@ -549,10 +549,10 @@
549 549 return 0;
550 550 }
551 551  
552   -static void zap_pte_range(struct mmu_gather *tlb,
  552 +static unsigned long zap_pte_range(struct mmu_gather *tlb,
553 553 struct vm_area_struct *vma, pmd_t *pmd,
554 554 unsigned long addr, unsigned long end,
555   - struct zap_details *details)
  555 + long *zap_work, struct zap_details *details)
556 556 {
557 557 struct mm_struct *mm = tlb->mm;
558 558 pte_t *pte;
... ... @@ -563,10 +563,15 @@
563 563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
564 564 do {
565 565 pte_t ptent = *pte;
566   - if (pte_none(ptent))
  566 + if (pte_none(ptent)) {
  567 + (*zap_work)--;
567 568 continue;
  569 + }
568 570 if (pte_present(ptent)) {
569 571 struct page *page = NULL;
  572 +
  573 + (*zap_work) -= PAGE_SIZE;
  574 +
570 575 if (!(vma->vm_flags & VM_RESERVED)) {
571 576 unsigned long pfn = pte_pfn(ptent);
572 577 if (unlikely(!pfn_valid(pfn)))
... ... @@ -624,16 +629,18 @@
624 629 if (!pte_file(ptent))
625 630 free_swap_and_cache(pte_to_swp_entry(ptent));
626 631 pte_clear_full(mm, addr, pte, tlb->fullmm);
627   - } while (pte++, addr += PAGE_SIZE, addr != end);
  632 + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
628 633  
629 634 add_mm_rss(mm, file_rss, anon_rss);
630 635 pte_unmap_unlock(pte - 1, ptl);
  636 +
  637 + return addr;
631 638 }
632 639  
633   -static inline void zap_pmd_range(struct mmu_gather *tlb,
  640 +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
634 641 struct vm_area_struct *vma, pud_t *pud,
635 642 unsigned long addr, unsigned long end,
636   - struct zap_details *details)
  643 + long *zap_work, struct zap_details *details)
637 644 {
638 645 pmd_t *pmd;
639 646 unsigned long next;
... ... @@ -641,16 +648,21 @@
641 648 pmd = pmd_offset(pud, addr);
642 649 do {
643 650 next = pmd_addr_end(addr, end);
644   - if (pmd_none_or_clear_bad(pmd))
  651 + if (pmd_none_or_clear_bad(pmd)) {
  652 + (*zap_work)--;
645 653 continue;
646   - zap_pte_range(tlb, vma, pmd, addr, next, details);
647   - } while (pmd++, addr = next, addr != end);
  654 + }
  655 + next = zap_pte_range(tlb, vma, pmd, addr, next,
  656 + zap_work, details);
  657 + } while (pmd++, addr = next, (addr != end && *zap_work > 0));
  658 +
  659 + return addr;
648 660 }
649 661  
650   -static inline void zap_pud_range(struct mmu_gather *tlb,
  662 +static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
651 663 struct vm_area_struct *vma, pgd_t *pgd,
652 664 unsigned long addr, unsigned long end,
653   - struct zap_details *details)
  665 + long *zap_work, struct zap_details *details)
654 666 {
655 667 pud_t *pud;
656 668 unsigned long next;
... ... @@ -658,15 +670,21 @@
658 670 pud = pud_offset(pgd, addr);
659 671 do {
660 672 next = pud_addr_end(addr, end);
661   - if (pud_none_or_clear_bad(pud))
  673 + if (pud_none_or_clear_bad(pud)) {
  674 + (*zap_work)--;
662 675 continue;
663   - zap_pmd_range(tlb, vma, pud, addr, next, details);
664   - } while (pud++, addr = next, addr != end);
  676 + }
  677 + next = zap_pmd_range(tlb, vma, pud, addr, next,
  678 + zap_work, details);
  679 + } while (pud++, addr = next, (addr != end && *zap_work > 0));
  680 +
  681 + return addr;
665 682 }
666 683  
667   -static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  684 +static unsigned long unmap_page_range(struct mmu_gather *tlb,
  685 + struct vm_area_struct *vma,
668 686 unsigned long addr, unsigned long end,
669   - struct zap_details *details)
  687 + long *zap_work, struct zap_details *details)
670 688 {
671 689 pgd_t *pgd;
672 690 unsigned long next;
... ... @@ -679,11 +697,16 @@
679 697 pgd = pgd_offset(vma->vm_mm, addr);
680 698 do {
681 699 next = pgd_addr_end(addr, end);
682   - if (pgd_none_or_clear_bad(pgd))
  700 + if (pgd_none_or_clear_bad(pgd)) {
  701 + (*zap_work)--;
683 702 continue;
684   - zap_pud_range(tlb, vma, pgd, addr, next, details);
685   - } while (pgd++, addr = next, addr != end);
  703 + }
  704 + next = zap_pud_range(tlb, vma, pgd, addr, next,
  705 + zap_work, details);
  706 + } while (pgd++, addr = next, (addr != end && *zap_work > 0));
686 707 tlb_end_vma(tlb, vma);
  708 +
  709 + return addr;
687 710 }
688 711  
689 712 #ifdef CONFIG_PREEMPT
... ... @@ -724,7 +747,7 @@
724 747 unsigned long end_addr, unsigned long *nr_accounted,
725 748 struct zap_details *details)
726 749 {
727   - unsigned long zap_bytes = ZAP_BLOCK_SIZE;
  750 + long zap_work = ZAP_BLOCK_SIZE;
728 751 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
729 752 int tlb_start_valid = 0;
730 753 unsigned long start = start_addr;
... ... @@ -745,27 +768,25 @@
745 768 *nr_accounted += (end - start) >> PAGE_SHIFT;
746 769  
747 770 while (start != end) {
748   - unsigned long block;
749   -
750 771 if (!tlb_start_valid) {
751 772 tlb_start = start;
752 773 tlb_start_valid = 1;
753 774 }
754 775  
755   - if (is_vm_hugetlb_page(vma)) {
756   - block = end - start;
  776 + if (unlikely(is_vm_hugetlb_page(vma))) {
757 777 unmap_hugepage_range(vma, start, end);
758   - } else {
759   - block = min(zap_bytes, end - start);
760   - unmap_page_range(*tlbp, vma, start,
761   - start + block, details);
  778 + zap_work -= (end - start) /
  779 + (HPAGE_SIZE / PAGE_SIZE);
  780 + start = end;
  781 + } else
  782 + start = unmap_page_range(*tlbp, vma,
  783 + start, end, &zap_work, details);
  784 +
  785 + if (zap_work > 0) {
  786 + BUG_ON(start != end);
  787 + break;
762 788 }
763 789  
764   - start += block;
765   - zap_bytes -= block;
766   - if ((long)zap_bytes > 0)
767   - continue;
768   -
769 790 tlb_finish_mmu(*tlbp, tlb_start, start);
770 791  
771 792 if (need_resched() ||
... ... @@ -779,7 +800,7 @@
779 800  
780 801 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
781 802 tlb_start_valid = 0;
782   - zap_bytes = ZAP_BLOCK_SIZE;
  803 + zap_work = ZAP_BLOCK_SIZE;
783 804 }
784 805 }
785 806 out: