Commit 51c6f666fceb3184eeff045dad4432b602cd648e

Authored by Robin Holt
Committed by Linus Torvalds
1 parent 885036d32f

[PATCH] mm: ZAP_BLOCK causes redundant work

The address-based work estimate for unmapping (for lockbreak) is, and always
has been, horribly inefficient for sparse mappings.  The problem is most
simply explained with an example:

If we find a pgd is clear, we still have to call into unmap_page_range
PGDIR_SIZE / ZAP_BLOCK_SIZE times, each time re-checking the same clear pgd,
just to advance the working address to the next pgd.

The fundamental way to solve the problem is to keep track of the end
address we've processed and pass it back to the higher layers.
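
For a sense of scale, a minimal sketch of what one empty pgd costs under the
old scheme (the constants are assumptions for illustration: 4K pages, the
CONFIG_PREEMPT=y ZAP_BLOCK_SIZE of 8 * PAGE_SIZE, and a 1GB PGDIR_SIZE;
PGDIR_SIZE is architecture dependent):

	/* Illustration only, not kernel code: count the unmap_page_range()
	 * calls that a single clear pgd costs with the address-based
	 * work estimate. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long page_size = 4096UL;		/* assumed 4K pages */
		unsigned long zap_block_size = 8 * page_size;	/* assumed CONFIG_PREEMPT=y value */
		unsigned long pgdir_size = 1UL << 30;		/* assumed 1GB per pgd entry */

		/* Old scheme: one call per ZAP_BLOCK_SIZE of address space,
		 * each call re-reading the same clear pgd. */
		printf("calls per empty pgd: %lu\n", pgdir_size / zap_block_size);

		/* With the processed end address passed back, the caller can
		 * skip the whole empty pgd in a single call. */
		return 0;
	}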

From: Nick Piggin <npiggin@suse.de>

  A modification to get completely away from the address-based work
  estimate and instead use an abstract count, with a very small cost for
  empty entries as opposed to present pages.

  On 2.6.14-git2, ppc64, and CONFIG_PREEMPT=y, mapping and unmapping 1TB
  of virtual address space takes 1.69s; with the following patch applied,
  this operation can be done 1000 times in less than 0.01s.
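
For scale: the abstract count charges a present page PAGE_SIZE units of
zap_work and an empty entry a single unit (see zap_pte_range() in the diff
below), so one batch covers only a handful of present pages but tens of
thousands of holes.  A quick back-of-the-envelope, assuming 4K pages and the
CONFIG_PREEMPT=y value of ZAP_BLOCK_SIZE (8 * PAGE_SIZE):

	/* Illustration only: how far one ZAP_BLOCK_SIZE budget stretches. */
	#include <stdio.h>

	int main(void)
	{
		long page_size = 4096;			/* assumed 4K pages */
		long zap_block = 8 * page_size;		/* assumed CONFIG_PREEMPT=y budget */

		printf("present pages per batch: %ld\n", zap_block / page_size);	/* 8 */
		printf("empty entries per batch: %ld\n", zap_block / 1);		/* 32768 */
		return 0;
	}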

From: Andrew Morton <akpm@osdl.org>

With CONFIG_HUGETLB_PAGE=n:

mm/memory.c: In function `unmap_vmas':
mm/memory.c:779: warning: division by zero

Due to

			zap_work -= (end - start) /
					(HPAGE_SIZE / PAGE_SIZE);

So make the dummy HPAGE_SIZE non-zero.
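
Spelled out, with the dummy values from the hugetlb.h hunk below (the hugetlb
branch is unreachable when CONFIG_HUGETLB_PAGE=n, so only the compiler ever
sees it):

	/* Old dummies:  HPAGE_SIZE == 0
	 *     HPAGE_SIZE / PAGE_SIZE  evaluates to 0
	 *     zap_work -= (end - start) / 0;      -> "division by zero" warning
	 *
	 * New dummies:  HPAGE_SIZE == PAGE_SIZE, HPAGE_MASK == PAGE_MASK
	 *     HPAGE_SIZE / PAGE_SIZE  evaluates to 1
	 *     zap_work -= (end - start) / 1;      -> compiles cleanly
	 */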

Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 57 additions and 36 deletions

include/linux/hugetlb.h
... ... @@ -102,8 +102,8 @@
102 102 #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
103 103  
104 104 #ifndef HPAGE_MASK
105   -#define HPAGE_MASK 0 /* Keep the compiler happy */
106   -#define HPAGE_SIZE 0
  105 +#define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */
  106 +#define HPAGE_SIZE PAGE_SIZE
107 107 #endif
108 108  
109 109 #endif /* !CONFIG_HUGETLB_PAGE */

mm/memory.c
... ... @@ -549,10 +549,10 @@
549 549 return 0;
550 550 }
551 551  
552   -static void zap_pte_range(struct mmu_gather *tlb,
  552 +static unsigned long zap_pte_range(struct mmu_gather *tlb,
553 553 struct vm_area_struct *vma, pmd_t *pmd,
554 554 unsigned long addr, unsigned long end,
555   - struct zap_details *details)
  555 + long *zap_work, struct zap_details *details)
556 556 {
557 557 struct mm_struct *mm = tlb->mm;
558 558 pte_t *pte;
... ... @@ -563,10 +563,15 @@
563 563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
564 564 do {
565 565 pte_t ptent = *pte;
566   - if (pte_none(ptent))
  566 + if (pte_none(ptent)) {
  567 + (*zap_work)--;
567 568 continue;
  569 + }
568 570 if (pte_present(ptent)) {
569 571 struct page *page = NULL;
  572 +
  573 + (*zap_work) -= PAGE_SIZE;
  574 +
570 575 if (!(vma->vm_flags & VM_RESERVED)) {
571 576 unsigned long pfn = pte_pfn(ptent);
572 577 if (unlikely(!pfn_valid(pfn)))
... ... @@ -624,16 +629,18 @@
624 629 if (!pte_file(ptent))
625 630 free_swap_and_cache(pte_to_swp_entry(ptent));
626 631 pte_clear_full(mm, addr, pte, tlb->fullmm);
627   - } while (pte++, addr += PAGE_SIZE, addr != end);
  632 + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
628 633  
629 634 add_mm_rss(mm, file_rss, anon_rss);
630 635 pte_unmap_unlock(pte - 1, ptl);
  636 +
  637 + return addr;
631 638 }
632 639  
633   -static inline void zap_pmd_range(struct mmu_gather *tlb,
  640 +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
634 641 struct vm_area_struct *vma, pud_t *pud,
635 642 unsigned long addr, unsigned long end,
636   - struct zap_details *details)
  643 + long *zap_work, struct zap_details *details)
637 644 {
638 645 pmd_t *pmd;
639 646 unsigned long next;
... ... @@ -641,16 +648,21 @@
641 648 pmd = pmd_offset(pud, addr);
642 649 do {
643 650 next = pmd_addr_end(addr, end);
644   - if (pmd_none_or_clear_bad(pmd))
  651 + if (pmd_none_or_clear_bad(pmd)) {
  652 + (*zap_work)--;
645 653 continue;
646   - zap_pte_range(tlb, vma, pmd, addr, next, details);
647   - } while (pmd++, addr = next, addr != end);
  654 + }
  655 + next = zap_pte_range(tlb, vma, pmd, addr, next,
  656 + zap_work, details);
  657 + } while (pmd++, addr = next, (addr != end && *zap_work > 0));
  658 +
  659 + return addr;
648 660 }
649 661  
650   -static inline void zap_pud_range(struct mmu_gather *tlb,
  662 +static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
651 663 struct vm_area_struct *vma, pgd_t *pgd,
652 664 unsigned long addr, unsigned long end,
653   - struct zap_details *details)
  665 + long *zap_work, struct zap_details *details)
654 666 {
655 667 pud_t *pud;
656 668 unsigned long next;
... ... @@ -658,15 +670,21 @@
658 670 pud = pud_offset(pgd, addr);
659 671 do {
660 672 next = pud_addr_end(addr, end);
661   - if (pud_none_or_clear_bad(pud))
  673 + if (pud_none_or_clear_bad(pud)) {
  674 + (*zap_work)--;
662 675 continue;
663   - zap_pmd_range(tlb, vma, pud, addr, next, details);
664   - } while (pud++, addr = next, addr != end);
  676 + }
  677 + next = zap_pmd_range(tlb, vma, pud, addr, next,
  678 + zap_work, details);
  679 + } while (pud++, addr = next, (addr != end && *zap_work > 0));
  680 +
  681 + return addr;
665 682 }
666 683  
667   -static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  684 +static unsigned long unmap_page_range(struct mmu_gather *tlb,
  685 + struct vm_area_struct *vma,
668 686 unsigned long addr, unsigned long end,
669   - struct zap_details *details)
  687 + long *zap_work, struct zap_details *details)
670 688 {
671 689 pgd_t *pgd;
672 690 unsigned long next;
... ... @@ -679,11 +697,16 @@
679 697 pgd = pgd_offset(vma->vm_mm, addr);
680 698 do {
681 699 next = pgd_addr_end(addr, end);
682   - if (pgd_none_or_clear_bad(pgd))
  700 + if (pgd_none_or_clear_bad(pgd)) {
  701 + (*zap_work)--;
683 702 continue;
684   - zap_pud_range(tlb, vma, pgd, addr, next, details);
685   - } while (pgd++, addr = next, addr != end);
  703 + }
  704 + next = zap_pud_range(tlb, vma, pgd, addr, next,
  705 + zap_work, details);
  706 + } while (pgd++, addr = next, (addr != end && *zap_work > 0));
686 707 tlb_end_vma(tlb, vma);
  708 +
  709 + return addr;
687 710 }
688 711  
689 712 #ifdef CONFIG_PREEMPT
... ... @@ -724,7 +747,7 @@
724 747 unsigned long end_addr, unsigned long *nr_accounted,
725 748 struct zap_details *details)
726 749 {
727   - unsigned long zap_bytes = ZAP_BLOCK_SIZE;
  750 + long zap_work = ZAP_BLOCK_SIZE;
728 751 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
729 752 int tlb_start_valid = 0;
730 753 unsigned long start = start_addr;
... ... @@ -745,27 +768,25 @@
745 768 *nr_accounted += (end - start) >> PAGE_SHIFT;
746 769  
747 770 while (start != end) {
748   - unsigned long block;
749   -
750 771 if (!tlb_start_valid) {
751 772 tlb_start = start;
752 773 tlb_start_valid = 1;
753 774 }
754 775  
755   - if (is_vm_hugetlb_page(vma)) {
756   - block = end - start;
  776 + if (unlikely(is_vm_hugetlb_page(vma))) {
757 777 unmap_hugepage_range(vma, start, end);
758   - } else {
759   - block = min(zap_bytes, end - start);
760   - unmap_page_range(*tlbp, vma, start,
761   - start + block, details);
  778 + zap_work -= (end - start) /
  779 + (HPAGE_SIZE / PAGE_SIZE);
  780 + start = end;
  781 + } else
  782 + start = unmap_page_range(*tlbp, vma,
  783 + start, end, &zap_work, details);
  784 +
  785 + if (zap_work > 0) {
  786 + BUG_ON(start != end);
  787 + break;
762 788 }
763 789  
764   - start += block;
765   - zap_bytes -= block;
766   - if ((long)zap_bytes > 0)
767   - continue;
768   -
769 790 tlb_finish_mmu(*tlbp, tlb_start, start);
770 791  
771 792 if (need_resched() ||
... ... @@ -779,7 +800,7 @@
779 800  
780 801 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
781 802 tlb_start_valid = 0;
782   - zap_bytes = ZAP_BLOCK_SIZE;
  803 + zap_work = ZAP_BLOCK_SIZE;
783 804 }
784 805 }
785 806 out: