Commit 508034a32b819a2d40aa7ac0dbc8cd2e044c2de6

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 8f4f8c164c

[PATCH] mm: unmap_vmas with inner ptlock

Remove the page_table_lock from around the calls to unmap_vmas, and replace
the pte_offset_map in zap_pte_range by pte_offset_map_lock: all callers are
now safe to descend without page_table_lock.
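
For reference, the locking pattern this leaves in zap_pte_range, condensed
from the mm/memory.c hunk below (the pte-examining body and the zap_details
handling are elided), is roughly:

	static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
	{
		struct mm_struct *mm = tlb->mm;
		spinlock_t *ptl;
		pte_t *pte;
		int file_rss = 0;
		int anon_rss = 0;

		/* map the pte page and take its lock in one go */
		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
		do {
			pte_t ptent = *pte;
			/* ... test ptent, clear it, free the page, adjust rss ... */
		} while (pte++, addr += PAGE_SIZE, addr != end);

		add_mm_rss(mm, file_rss, anon_rss);
		pte_unmap_unlock(pte - 1, ptl);
	}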

Don't attempt fancy locking for hugepages, just take page_table_lock in
unmap_hugepage_range.  Which makes zap_hugepage_range, and the hugetlb test in
zap_page_range, redundant: unmap_vmas calls unmap_hugepage_range anyway.  Nor
does unmap_vmas have much use for its mm arg now.
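
With that, unmap_hugepage_range ends up shaped roughly as follows (a
condensed sketch of the mm/hugetlb.c hunks below, with the huge-pte walk
elided):

	void unmap_hugepage_range(struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
	{
		struct mm_struct *mm = vma->vm_mm;

		BUG_ON(start & ~HPAGE_MASK);
		BUG_ON(end & ~HPAGE_MASK);

		spin_lock(&mm->page_table_lock);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		/* ... clear each huge pte, put_page, lower file_rss ... */

		spin_unlock(&mm->page_table_lock);
		flush_tlb_range(vma, start, end);
	}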

The tlb_start_vma and tlb_end_vma in unmap_page_range are now called without
page_table_lock: if they're implemented at all, they typically come down to
flush_cache_range (usually done outside page_table_lock) and flush_tlb_range
(which we already audited for the mprotect case).
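
For illustration only (not any particular architecture's asm/tlb.h), a
non-trivial pair typically amounts to something like:

	#define tlb_start_vma(tlb, vma) \
		flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end)

	#define tlb_end_vma(tlb, vma) \
		flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end)

neither of which relies on page_table_lock.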

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 6 changed files with 21 additions and 54 deletions

fs/hugetlbfs/inode.c
... ... @@ -92,7 +92,7 @@
92 92 }
93 93  
94 94 /*
95   - * Called under down_write(mmap_sem), page_table_lock is not held
  95 + * Called under down_write(mmap_sem).
96 96 */
97 97  
98 98 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
... ... @@ -308,7 +308,6 @@
308 308  
309 309 vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
310 310 unsigned long h_vm_pgoff;
311   - unsigned long v_length;
312 311 unsigned long v_offset;
313 312  
314 313 h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
... ... @@ -319,11 +318,8 @@
319 318 if (h_vm_pgoff >= h_pgoff)
320 319 v_offset = 0;
321 320  
322   - v_length = vma->vm_end - vma->vm_start;
323   -
324   - zap_hugepage_range(vma,
325   - vma->vm_start + v_offset,
326   - v_length - v_offset);
  321 + unmap_hugepage_range(vma,
  322 + vma->vm_start + v_offset, vma->vm_end);
327 323 }
328 324 }
329 325  
include/linux/hugetlb.h
... ... @@ -16,7 +16,6 @@
16 16 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
17 17 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
18 18 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
19   -void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
20 19 void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
21 20 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
22 21 int hugetlb_report_meminfo(char *);
... ... @@ -87,7 +86,6 @@
87 86 #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
88 87 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
89 88 #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
90   -#define zap_hugepage_range(vma, start, len) BUG()
91 89 #define unmap_hugepage_range(vma, start, end) BUG()
92 90 #define is_hugepage_mem_enough(size) 0
93 91 #define hugetlb_report_meminfo(buf) 0
include/linux/mm.h
... ... @@ -682,7 +682,7 @@
682 682  
683 683 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
684 684 unsigned long size, struct zap_details *);
685   -unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm,
  685 +unsigned long unmap_vmas(struct mmu_gather **tlb,
686 686 struct vm_area_struct *start_vma, unsigned long start_addr,
687 687 unsigned long end_addr, unsigned long *nr_accounted,
688 688 struct zap_details *);
mm/hugetlb.c
... ... @@ -314,6 +314,8 @@
314 314 BUG_ON(start & ~HPAGE_MASK);
315 315 BUG_ON(end & ~HPAGE_MASK);
316 316  
  317 + spin_lock(&mm->page_table_lock);
  318 +
317 319 /* Update high watermark before we lower rss */
318 320 update_hiwater_rss(mm);
319 321  
... ... @@ -333,17 +335,9 @@
333 335 put_page(page);
334 336 add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
335 337 }
336   - flush_tlb_range(vma, start, end);
337   -}
338 338  
339   -void zap_hugepage_range(struct vm_area_struct *vma,
340   - unsigned long start, unsigned long length)
341   -{
342   - struct mm_struct *mm = vma->vm_mm;
343   -
344   - spin_lock(&mm->page_table_lock);
345   - unmap_hugepage_range(vma, start, start + length);
346 339 spin_unlock(&mm->page_table_lock);
  340 + flush_tlb_range(vma, start, end);
347 341 }
348 342  
349 343 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
mm/memory.c
... ... @@ -551,10 +551,11 @@
551 551 {
552 552 struct mm_struct *mm = tlb->mm;
553 553 pte_t *pte;
  554 + spinlock_t *ptl;
554 555 int file_rss = 0;
555 556 int anon_rss = 0;
556 557  
557   - pte = pte_offset_map(pmd, addr);
  558 + pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
558 559 do {
559 560 pte_t ptent = *pte;
560 561 if (pte_none(ptent))
... ... @@ -621,7 +622,7 @@
621 622 } while (pte++, addr += PAGE_SIZE, addr != end);
622 623  
623 624 add_mm_rss(mm, file_rss, anon_rss);
624   - pte_unmap(pte - 1);
  625 + pte_unmap_unlock(pte - 1, ptl);
625 626 }
626 627  
627 628 static inline void zap_pmd_range(struct mmu_gather *tlb,
... ... @@ -690,7 +691,6 @@
690 691 /**
691 692 * unmap_vmas - unmap a range of memory covered by a list of vma's
692 693 * @tlbp: address of the caller's struct mmu_gather
693   - * @mm: the controlling mm_struct
694 694 * @vma: the starting vma
695 695 * @start_addr: virtual address at which to start unmapping
696 696 * @end_addr: virtual address at which to end unmapping
... ... @@ -699,10 +699,10 @@
699 699 *
700 700 * Returns the end address of the unmapping (restart addr if interrupted).
701 701 *
702   - * Unmap all pages in the vma list. Called under page_table_lock.
  702 + * Unmap all pages in the vma list.
703 703 *
704   - * We aim to not hold page_table_lock for too long (for scheduling latency
705   - * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
  704 + * We aim to not hold locks for too long (for scheduling latency reasons).
  705 + * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
706 706 * return the ending mmu_gather to the caller.
707 707 *
708 708 * Only addresses between `start' and `end' will be unmapped.
... ... @@ -714,7 +714,7 @@
714 714 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
715 715 * drops the lock and schedules.
716 716 */
717   -unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
  717 +unsigned long unmap_vmas(struct mmu_gather **tlbp,
718 718 struct vm_area_struct *vma, unsigned long start_addr,
719 719 unsigned long end_addr, unsigned long *nr_accounted,
720 720 struct zap_details *details)
... ... @@ -764,19 +764,15 @@
764 764 tlb_finish_mmu(*tlbp, tlb_start, start);
765 765  
766 766 if (need_resched() ||
767   - need_lockbreak(&mm->page_table_lock) ||
768 767 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
769 768 if (i_mmap_lock) {
770   - /* must reset count of rss freed */
771   - *tlbp = tlb_gather_mmu(mm, fullmm);
  769 + *tlbp = NULL;
772 770 goto out;
773 771 }
774   - spin_unlock(&mm->page_table_lock);
775 772 cond_resched();
776   - spin_lock(&mm->page_table_lock);
777 773 }
778 774  
779   - *tlbp = tlb_gather_mmu(mm, fullmm);
  775 + *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
780 776 tlb_start_valid = 0;
781 777 zap_bytes = ZAP_BLOCK_SIZE;
782 778 }
... ... @@ -800,18 +796,12 @@
800 796 unsigned long end = address + size;
801 797 unsigned long nr_accounted = 0;
802 798  
803   - if (is_vm_hugetlb_page(vma)) {
804   - zap_hugepage_range(vma, address, size);
805   - return end;
806   - }
807   -
808 799 lru_add_drain();
809 800 tlb = tlb_gather_mmu(mm, 0);
810 801 update_hiwater_rss(mm);
811   - spin_lock(&mm->page_table_lock);
812   - end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
813   - spin_unlock(&mm->page_table_lock);
814   - tlb_finish_mmu(tlb, address, end);
  802 + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
  803 + if (tlb)
  804 + tlb_finish_mmu(tlb, address, end);
815 805 return end;
816 806 }
817 807  
... ... @@ -1434,13 +1424,6 @@
1434 1424  
1435 1425 restart_addr = zap_page_range(vma, start_addr,
1436 1426 end_addr - start_addr, details);
1437   -
1438   - /*
1439   - * We cannot rely on the break test in unmap_vmas:
1440   - * on the one hand, we don't want to restart our loop
1441   - * just because that broke out for the page_table_lock;
1442   - * on the other hand, it does no test when vma is small.
1443   - */
1444 1427 need_break = need_resched() ||
1445 1428 need_lockbreak(details->i_mmap_lock);
1446 1429  
mm/mmap.c
... ... @@ -1673,9 +1673,7 @@
1673 1673 lru_add_drain();
1674 1674 tlb = tlb_gather_mmu(mm, 0);
1675 1675 update_hiwater_rss(mm);
1676   - spin_lock(&mm->page_table_lock);
1677   - unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1678   - spin_unlock(&mm->page_table_lock);
  1676 + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1679 1677 vm_unacct_memory(nr_accounted);
1680 1678 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1681 1679 next? next->vm_start: 0);
... ... @@ -1958,9 +1956,7 @@
1958 1956 tlb = tlb_gather_mmu(mm, 1);
1959 1957 /* Don't update_hiwater_rss(mm) here, do_exit already did */
1960 1958 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1961   - spin_lock(&mm->page_table_lock);
1962   - end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1963   - spin_unlock(&mm->page_table_lock);
  1959 + end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
1964 1960 vm_unacct_memory(nr_accounted);
1965 1961 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
1966 1962 tlb_finish_mmu(tlb, 0, end);