Commit 508034a32b819a2d40aa7ac0dbc8cd2e044c2de6
Committed by: Linus Torvalds
1 parent: 8f4f8c164c
Exists in: master and 4 other branches
[PATCH] mm: unmap_vmas with inner ptlock
Remove the page_table_lock from around the calls to unmap_vmas, and replace the pte_offset_map in zap_pte_range by pte_offset_map_lock: all callers are now safe to descend without page_table_lock.

Don't attempt fancy locking for hugepages, just take page_table_lock in unmap_hugepage_range. Which makes zap_hugepage_range, and the hugetlb test in zap_page_range, redundant: unmap_vmas calls unmap_hugepage_range anyway. Nor does unmap_vmas have much use for its mm arg now.

The tlb_start_vma and tlb_end_vma in unmap_page_range are now called without page_table_lock: if they're implemented at all, they typically come down to flush_cache_range (usually done outside page_table_lock) and flush_tlb_range (which we already audited for the mprotect case).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
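The heart of the change is visible in the zap_pte_range hunk below: instead of the caller holding mm->page_table_lock across the whole walk, the walker takes the page-table lock itself via pte_offset_map_lock and drops it with pte_unmap_unlock. A minimal sketch of that pattern, condensed from this patch (walk_ptes is a hypothetical name for illustration; the entry-processing body is elided):

    static void walk_ptes(struct mm_struct *mm, pmd_t *pmd,
                          unsigned long addr, unsigned long end)
    {
            spinlock_t *ptl;
            pte_t *pte;

            /* Map the page table and take the applicable lock in one step. */
            pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
            do {
                    pte_t ptent = *pte;
                    if (pte_none(ptent))
                            continue;
                    /* ... tear down the mapping here ... */
            } while (pte++, addr += PAGE_SIZE, addr != end);
            /* Drop the lock and the kmap together. */
            pte_unmap_unlock(pte - 1, ptl);
    }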
Showing 6 changed files with 21 additions and 54 deletions
fs/hugetlbfs/inode.c
@@ -92,7 +92,7 @@
 }
 
 /*
- * Called under down_write(mmap_sem), page_table_lock is not held
+ * Called under down_write(mmap_sem).
  */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -308,7 +308,6 @@
 
         vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
                 unsigned long h_vm_pgoff;
-                unsigned long v_length;
                 unsigned long v_offset;
 
                 h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
@@ -319,11 +318,8 @@
                 if (h_vm_pgoff >= h_pgoff)
                         v_offset = 0;
 
-                v_length = vma->vm_end - vma->vm_start;
-
-                zap_hugepage_range(vma,
-                                vma->vm_start + v_offset,
-                                v_length - v_offset);
+                unmap_hugepage_range(vma,
+                                vma->vm_start + v_offset, vma->vm_end);
         }
 }
 
include/linux/hugetlb.h
@@ -16,7 +16,6 @@
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
-void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
@@ -87,7 +86,6 @@
 #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
-#define zap_hugepage_range(vma, start, len) BUG()
 #define unmap_hugepage_range(vma, start, end) BUG()
 #define is_hugepage_mem_enough(size) 0
 #define hugetlb_report_meminfo(buf) 0
include/linux/mm.h
@@ -682,7 +682,7 @@
 
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                 unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlb,
                 struct vm_area_struct *start_vma, unsigned long start_addr,
                 unsigned long end_addr, unsigned long *nr_accounted,
                 struct zap_details *);
mm/hugetlb.c
@@ -314,6 +314,8 @@
         BUG_ON(start & ~HPAGE_MASK);
         BUG_ON(end & ~HPAGE_MASK);
 
+        spin_lock(&mm->page_table_lock);
+
         /* Update high watermark before we lower rss */
         update_hiwater_rss(mm);
 
@@ -333,17 +335,9 @@
                 put_page(page);
                 add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
         }
-        flush_tlb_range(vma, start, end);
-}
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-                unsigned long start, unsigned long length)
-{
-        struct mm_struct *mm = vma->vm_mm;
-
-        spin_lock(&mm->page_table_lock);
-        unmap_hugepage_range(vma, start, start + length);
         spin_unlock(&mm->page_table_lock);
+        flush_tlb_range(vma, start, end);
 }
 
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
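Net effect of the two mm/hugetlb.c hunks: unmap_hugepage_range now takes page_table_lock itself, the TLB flush moves after the unlock, and the zap_hugepage_range wrapper disappears. A condensed sketch of the resulting function (the huge-pte teardown loop is elided):

    void unmap_hugepage_range(struct vm_area_struct *vma,
                              unsigned long start, unsigned long end)
    {
            struct mm_struct *mm = vma->vm_mm;

            BUG_ON(start & ~HPAGE_MASK);
            BUG_ON(end & ~HPAGE_MASK);

            spin_lock(&mm->page_table_lock);
            /* ... clear huge ptes, put_page() each page, lower rss ... */
            spin_unlock(&mm->page_table_lock);
            flush_tlb_range(vma, start, end);
    }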
mm/memory.c
@@ -551,10 +551,11 @@
 {
         struct mm_struct *mm = tlb->mm;
         pte_t *pte;
+        spinlock_t *ptl;
         int file_rss = 0;
         int anon_rss = 0;
 
-        pte = pte_offset_map(pmd, addr);
+        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
         do {
                 pte_t ptent = *pte;
                 if (pte_none(ptent))
@@ -621,7 +622,7 @@
         } while (pte++, addr += PAGE_SIZE, addr != end);
 
         add_mm_rss(mm, file_rss, anon_rss);
-        pte_unmap(pte - 1);
+        pte_unmap_unlock(pte - 1, ptl);
 }
 
 static inline void zap_pmd_range(struct mmu_gather *tlb,
@@ -690,7 +691,6 @@
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlbp: address of the caller's struct mmu_gather
- * @mm: the controlling mm_struct
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
@@ -699,10 +699,10 @@
  *
  * Returns the end address of the unmapping (restart addr if interrupted).
  *
- * Unmap all pages in the vma list. Called under page_table_lock.
+ * Unmap all pages in the vma list.
  *
- * We aim to not hold page_table_lock for too long (for scheduling latency
- * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
+ * We aim to not hold locks for too long (for scheduling latency reasons).
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
  * return the ending mmu_gather to the caller.
  *
  * Only addresses between `start' and `end' will be unmapped.
@@ -714,7 +714,7 @@
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlbp,
                 struct vm_area_struct *vma, unsigned long start_addr,
                 unsigned long end_addr, unsigned long *nr_accounted,
                 struct zap_details *details)
@@ -764,19 +764,15 @@
                         tlb_finish_mmu(*tlbp, tlb_start, start);
 
                         if (need_resched() ||
-                                need_lockbreak(&mm->page_table_lock) ||
                                 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
                                 if (i_mmap_lock) {
-                                        /* must reset count of rss freed */
-                                        *tlbp = tlb_gather_mmu(mm, fullmm);
+                                        *tlbp = NULL;
                                         goto out;
                                 }
-                                spin_unlock(&mm->page_table_lock);
                                 cond_resched();
-                                spin_lock(&mm->page_table_lock);
                         }
 
-                        *tlbp = tlb_gather_mmu(mm, fullmm);
+                        *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
                         tlb_start_valid = 0;
                         zap_bytes = ZAP_BLOCK_SIZE;
                 }
 
@@ -800,18 +796,12 @@
         unsigned long end = address + size;
         unsigned long nr_accounted = 0;
 
-        if (is_vm_hugetlb_page(vma)) {
-                zap_hugepage_range(vma, address, size);
-                return end;
-        }
-
         lru_add_drain();
         tlb = tlb_gather_mmu(mm, 0);
         update_hiwater_rss(mm);
-        spin_lock(&mm->page_table_lock);
-        end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
-        spin_unlock(&mm->page_table_lock);
-        tlb_finish_mmu(tlb, address, end);
+        end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+        if (tlb)
+                tlb_finish_mmu(tlb, address, end);
         return end;
 }
 
@@ -1434,13 +1424,6 @@
 
                 restart_addr = zap_page_range(vma, start_addr,
                                         end_addr - start_addr, details);
-
-                /*
-                 * We cannot rely on the break test in unmap_vmas:
-                 * on the one hand, we don't want to restart our loop
-                 * just because that broke out for the page_table_lock;
-                 * on the other hand, it does no test when vma is small.
-                 */
                 need_break = need_resched() ||
                                 need_lockbreak(details->i_mmap_lock);
 
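The unmap_vmas contract changes here as well: on an early bail-out for a contended i_mmap_lock it now hands back a NULL mmu_gather instead of regathering, so callers must check before finishing. For reference, zap_page_range as it reads after this patch, condensed from the hunk above:

    unsigned long zap_page_range(struct vm_area_struct *vma,
                    unsigned long address, unsigned long size,
                    struct zap_details *details)
    {
            struct mm_struct *mm = vma->vm_mm;
            struct mmu_gather *tlb;
            unsigned long end = address + size;
            unsigned long nr_accounted = 0;

            lru_add_drain();
            tlb = tlb_gather_mmu(mm, 0);
            update_hiwater_rss(mm);
            end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
            if (tlb)        /* NULL when unmap_vmas bailed out for i_mmap_lock */
                    tlb_finish_mmu(tlb, address, end);
            return end;
    }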
mm/mmap.c
@@ -1673,9 +1673,7 @@
         lru_add_drain();
         tlb = tlb_gather_mmu(mm, 0);
         update_hiwater_rss(mm);
-        spin_lock(&mm->page_table_lock);
-        unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
-        spin_unlock(&mm->page_table_lock);
+        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
         free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
                                  next? next->vm_start: 0);
@@ -1958,9 +1956,7 @@
         tlb = tlb_gather_mmu(mm, 1);
         /* Don't update_hiwater_rss(mm) here, do_exit already did */
         /* Use -1 here to ensure all VMAs in the mm are unmapped */
-        spin_lock(&mm->page_table_lock);
-        end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
-        spin_unlock(&mm->page_table_lock);
+        end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
         free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
         tlb_finish_mmu(tlb, 0, end);