Commit 97a894136f29802da19a15541de3c019e1ca147e

Authored by Peter Zijlstra
Committed by Linus Torvalds
1 parent e4c70a6629

mm: Remove i_mmap_lock lockbreak

Hugh says:
 "The only significant loser, I think, would be page reclaim (when
  concurrent with truncation): could spin for a long time waiting for
  the i_mmap_mutex it expects would soon be dropped? "

Counterpoints:
 - cpu contention makes the spin stop (need_resched())
 - zap pages should be freeing pages at a higher rate than reclaim
   ever can
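
Even with the lockbreak gone, the zap path still yields the CPU: the
mm/memory.c hunk below adds an explicit cond_resched() once per pmd in
zap_pmd_range(). A simplified sketch of the resulting loop (the
hugepage handling present in the real function is omitted here):

    static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                    struct vm_area_struct *vma, pud_t *pud,
                    unsigned long addr, unsigned long end,
                    struct zap_details *details)
    {
        pmd_t *pmd = pmd_offset(pud, addr);
        unsigned long next;

        do {
            next = pmd_addr_end(addr, end);
            if (pmd_none_or_clear_bad(pmd))
                continue;
            next = zap_pte_range(tlb, vma, pmd, addr, next, details);
            cond_resched();    /* replaces the old zap_work lockbreak */
        } while (pmd++, addr = next, addr != end);

        return addr;
    }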

I think the simplification of the truncate code is definitely worth it.

Effectively reverts: 2aa15890f3c ("mm: prevent concurrent
unmap_mapping_range() on the same inode") and takes out the code that
caused its problem.
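
With truncate_count, vm_truncate_count, unmap_mutex and the restart
logic gone, unmap_mapping_range() simply holds i_mmap_lock across the
whole prio tree walk and each vma is zapped in a single pass. Roughly,
the locking in unmap_mapping_range() reduces to the following sketch
(the zap_details setup earlier in the function is unchanged apart from
the dropped i_mmap_lock and truncate_count fields):

    spin_lock(&mapping->i_mmap_lock);
    if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
        unmap_mapping_range_tree(&mapping->i_mmap, &details);
    if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
        unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
    spin_unlock(&mapping->i_mmap_lock);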

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 8 changed files with 28 additions and 188 deletions

fs/inode.c
... ... @@ -331,7 +331,6 @@
331 331 spin_lock_init(&mapping->private_lock);
332 332 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
333 333 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
334   - mutex_init(&mapping->unmap_mutex);
335 334 }
336 335 EXPORT_SYMBOL(address_space_init_once);
337 336  
include/linux/fs.h
... ... @@ -635,7 +635,6 @@
635 635 struct prio_tree_root i_mmap; /* tree of private and shared mappings */
636 636 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
637 637 spinlock_t i_mmap_lock; /* protect tree, count, list */
638   - unsigned int truncate_count; /* Cover race condition with truncate */
639 638 unsigned long nrpages; /* number of total pages */
640 639 pgoff_t writeback_index;/* writeback starts here */
641 640 const struct address_space_operations *a_ops; /* methods */
... ... @@ -644,7 +643,6 @@
644 643 spinlock_t private_lock; /* for use by the address_space */
645 644 struct list_head private_list; /* ditto */
646 645 struct address_space *assoc_mapping; /* ditto */
647   - struct mutex unmap_mutex; /* to protect unmapping */
648 646 } __attribute__((aligned(sizeof(long))));
649 647 /*
650 648 * On most architectures that alignment is already the case; but
include/linux/mm.h
... ... @@ -895,8 +895,6 @@
895 895 struct address_space *check_mapping; /* Check page->mapping if set */
896 896 pgoff_t first_index; /* Lowest page->index to unmap */
897 897 pgoff_t last_index; /* Highest page->index to unmap */
898   - spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
899   - unsigned long truncate_count; /* Compare vm_truncate_count */
900 898 };
901 899  
902 900 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
include/linux/mm_types.h
... ... @@ -175,7 +175,6 @@
175 175 units, *not* PAGE_CACHE_SIZE */
176 176 struct file * vm_file; /* File we map to (can be NULL). */
177 177 void * vm_private_data; /* was vm_pte (shared mem) */
178   - unsigned long vm_truncate_count;/* truncate_count or restart_addr */
179 178  
180 179 #ifndef CONFIG_MMU
181 180 struct vm_region *vm_region; /* NOMMU mapping region */
kernel/fork.c
... ... @@ -386,7 +386,6 @@
386 386 spin_lock(&mapping->i_mmap_lock);
387 387 if (tmp->vm_flags & VM_SHARED)
388 388 mapping->i_mmap_writable++;
389   - tmp->vm_truncate_count = mpnt->vm_truncate_count;
390 389 flush_dcache_mmap_lock(mapping);
391 390 /* insert tmp into the share list, just after mpnt */
392 391 vma_prio_tree_add(tmp, mpnt);
mm/memory.c
... ... @@ -986,13 +986,13 @@
986 986 static unsigned long zap_pte_range(struct mmu_gather *tlb,
987 987 struct vm_area_struct *vma, pmd_t *pmd,
988 988 unsigned long addr, unsigned long end,
989   - long *zap_work, struct zap_details *details)
  989 + struct zap_details *details)
990 990 {
991 991 struct mm_struct *mm = tlb->mm;
992 992 int force_flush = 0;
993   - pte_t *pte;
994   - spinlock_t *ptl;
995 993 int rss[NR_MM_COUNTERS];
  994 + spinlock_t *ptl;
  995 + pte_t *pte;
996 996  
997 997 again:
998 998 init_rss_vec(rss);
999 999  
... ... @@ -1001,12 +1001,9 @@
1001 1001 do {
1002 1002 pte_t ptent = *pte;
1003 1003 if (pte_none(ptent)) {
1004   - (*zap_work)--;
1005 1004 continue;
1006 1005 }
1007 1006  
1008   - (*zap_work) -= PAGE_SIZE;
1009   -
1010 1007 if (pte_present(ptent)) {
1011 1008 struct page *page;
1012 1009  
... ... @@ -1075,7 +1072,7 @@
1075 1072 print_bad_pte(vma, addr, ptent, NULL);
1076 1073 }
1077 1074 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1078   - } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
  1075 + } while (pte++, addr += PAGE_SIZE, addr != end);
1079 1076  
1080 1077 add_mm_rss_vec(mm, rss);
1081 1078 arch_leave_lazy_mmu_mode();
... ... @@ -1099,7 +1096,7 @@
1099 1096 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1100 1097 struct vm_area_struct *vma, pud_t *pud,
1101 1098 unsigned long addr, unsigned long end,
1102   - long *zap_work, struct zap_details *details)
  1099 + struct zap_details *details)
1103 1100 {
1104 1101 pmd_t *pmd;
1105 1102 unsigned long next;
1106 1103  
1107 1104  
1108 1105  
... ... @@ -1111,19 +1108,15 @@
1111 1108 if (next-addr != HPAGE_PMD_SIZE) {
1112 1109 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1113 1110 split_huge_page_pmd(vma->vm_mm, pmd);
1114   - } else if (zap_huge_pmd(tlb, vma, pmd)) {
1115   - (*zap_work)--;
  1111 + } else if (zap_huge_pmd(tlb, vma, pmd))
1116 1112 continue;
1117   - }
1118 1113 /* fall through */
1119 1114 }
1120   - if (pmd_none_or_clear_bad(pmd)) {
1121   - (*zap_work)--;
  1115 + if (pmd_none_or_clear_bad(pmd))
1122 1116 continue;
1123   - }
1124   - next = zap_pte_range(tlb, vma, pmd, addr, next,
1125   - zap_work, details);
1126   - } while (pmd++, addr = next, (addr != end && *zap_work > 0));
  1117 + next = zap_pte_range(tlb, vma, pmd, addr, next, details);
  1118 + cond_resched();
  1119 + } while (pmd++, addr = next, addr != end);
1127 1120  
1128 1121 return addr;
1129 1122 }
... ... @@ -1131,7 +1124,7 @@
1131 1124 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1132 1125 struct vm_area_struct *vma, pgd_t *pgd,
1133 1126 unsigned long addr, unsigned long end,
1134   - long *zap_work, struct zap_details *details)
  1127 + struct zap_details *details)
1135 1128 {
1136 1129 pud_t *pud;
1137 1130 unsigned long next;
1138 1131  
... ... @@ -1139,13 +1132,10 @@
1139 1132 pud = pud_offset(pgd, addr);
1140 1133 do {
1141 1134 next = pud_addr_end(addr, end);
1142   - if (pud_none_or_clear_bad(pud)) {
1143   - (*zap_work)--;
  1135 + if (pud_none_or_clear_bad(pud))
1144 1136 continue;
1145   - }
1146   - next = zap_pmd_range(tlb, vma, pud, addr, next,
1147   - zap_work, details);
1148   - } while (pud++, addr = next, (addr != end && *zap_work > 0));
  1137 + next = zap_pmd_range(tlb, vma, pud, addr, next, details);
  1138 + } while (pud++, addr = next, addr != end);
1149 1139  
1150 1140 return addr;
1151 1141 }
... ... @@ -1153,7 +1143,7 @@
1153 1143 static unsigned long unmap_page_range(struct mmu_gather *tlb,
1154 1144 struct vm_area_struct *vma,
1155 1145 unsigned long addr, unsigned long end,
1156   - long *zap_work, struct zap_details *details)
  1146 + struct zap_details *details)
1157 1147 {
1158 1148 pgd_t *pgd;
1159 1149 unsigned long next;
1160 1150  
... ... @@ -1167,13 +1157,10 @@
1167 1157 pgd = pgd_offset(vma->vm_mm, addr);
1168 1158 do {
1169 1159 next = pgd_addr_end(addr, end);
1170   - if (pgd_none_or_clear_bad(pgd)) {
1171   - (*zap_work)--;
  1160 + if (pgd_none_or_clear_bad(pgd))
1172 1161 continue;
1173   - }
1174   - next = zap_pud_range(tlb, vma, pgd, addr, next,
1175   - zap_work, details);
1176   - } while (pgd++, addr = next, (addr != end && *zap_work > 0));
  1162 + next = zap_pud_range(tlb, vma, pgd, addr, next, details);
  1163 + } while (pgd++, addr = next, addr != end);
1177 1164 tlb_end_vma(tlb, vma);
1178 1165 mem_cgroup_uncharge_end();
1179 1166  
1180 1167  
... ... @@ -1218,9 +1205,7 @@
1218 1205 unsigned long end_addr, unsigned long *nr_accounted,
1219 1206 struct zap_details *details)
1220 1207 {
1221   - long zap_work = ZAP_BLOCK_SIZE;
1222 1208 unsigned long start = start_addr;
1223   - spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1224 1209 struct mm_struct *mm = vma->vm_mm;
1225 1210  
1226 1211 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1227 1212  
1228 1213  
1229 1214  
... ... @@ -1253,33 +1238,15 @@
1253 1238 * Since no pte has actually been setup, it is
1254 1239 * safe to do nothing in this case.
1255 1240 */
1256   - if (vma->vm_file) {
  1241 + if (vma->vm_file)
1257 1242 unmap_hugepage_range(vma, start, end, NULL);
1258   - zap_work -= (end - start) /
1259   - pages_per_huge_page(hstate_vma(vma));
1260   - }
1261 1243  
1262 1244 start = end;
1263 1245 } else
1264   - start = unmap_page_range(tlb, vma,
1265   - start, end, &zap_work, details);
1266   -
1267   - if (zap_work > 0) {
1268   - BUG_ON(start != end);
1269   - break;
1270   - }
1271   -
1272   - if (need_resched() ||
1273   - (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1274   - if (i_mmap_lock)
1275   - goto out;
1276   - cond_resched();
1277   - }
1278   -
1279   - zap_work = ZAP_BLOCK_SIZE;
  1246 + start = unmap_page_range(tlb, vma, start, end, details);
1280 1247 }
1281 1248 }
1282   -out:
  1249 +
1283 1250 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1284 1251 return start; /* which is now the end (or restart) address */
1285 1252 }
1286 1253  
... ... @@ -2612,96 +2579,11 @@
2612 2579 return ret;
2613 2580 }
2614 2581  
2615   -/*
2616   - * Helper functions for unmap_mapping_range().
2617   - *
2618   - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2619   - *
2620   - * We have to restart searching the prio_tree whenever we drop the lock,
2621   - * since the iterator is only valid while the lock is held, and anyway
2622   - * a later vma might be split and reinserted earlier while lock dropped.
2623   - *
2624   - * The list of nonlinear vmas could be handled more efficiently, using
2625   - * a placeholder, but handle it in the same way until a need is shown.
2626   - * It is important to search the prio_tree before nonlinear list: a vma
2627   - * may become nonlinear and be shifted from prio_tree to nonlinear list
2628   - * while the lock is dropped; but never shifted from list to prio_tree.
2629   - *
2630   - * In order to make forward progress despite restarting the search,
2631   - * vm_truncate_count is used to mark a vma as now dealt with, so we can
2632   - * quickly skip it next time around. Since the prio_tree search only
2633   - * shows us those vmas affected by unmapping the range in question, we
2634   - * can't efficiently keep all vmas in step with mapping->truncate_count:
2635   - * so instead reset them all whenever it wraps back to 0 (then go to 1).
2636   - * mapping->truncate_count and vma->vm_truncate_count are protected by
2637   - * i_mmap_lock.
2638   - *
2639   - * In order to make forward progress despite repeatedly restarting some
2640   - * large vma, note the restart_addr from unmap_vmas when it breaks out:
2641   - * and restart from that address when we reach that vma again. It might
2642   - * have been split or merged, shrunk or extended, but never shifted: so
2643   - * restart_addr remains valid so long as it remains in the vma's range.
2644   - * unmap_mapping_range forces truncate_count to leap over page-aligned
2645   - * values so we can save vma's restart_addr in its truncate_count field.
2646   - */
2647   -#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2648   -
2649   -static void reset_vma_truncate_counts(struct address_space *mapping)
2650   -{
2651   - struct vm_area_struct *vma;
2652   - struct prio_tree_iter iter;
2653   -
2654   - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2655   - vma->vm_truncate_count = 0;
2656   - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2657   - vma->vm_truncate_count = 0;
2658   -}
2659   -
2660   -static int unmap_mapping_range_vma(struct vm_area_struct *vma,
  2582 +static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2661 2583 unsigned long start_addr, unsigned long end_addr,
2662 2584 struct zap_details *details)
2663 2585 {
2664   - unsigned long restart_addr;
2665   - int need_break;
2666   -
2667   - /*
2668   - * files that support invalidating or truncating portions of the
2669   - * file from under mmaped areas must have their ->fault function
2670   - * return a locked page (and set VM_FAULT_LOCKED in the return).
2671   - * This provides synchronisation against concurrent unmapping here.
2672   - */
2673   -
2674   -again:
2675   - restart_addr = vma->vm_truncate_count;
2676   - if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2677   - start_addr = restart_addr;
2678   - if (start_addr >= end_addr) {
2679   - /* Top of vma has been split off since last time */
2680   - vma->vm_truncate_count = details->truncate_count;
2681   - return 0;
2682   - }
2683   - }
2684   -
2685   - restart_addr = zap_page_range(vma, start_addr,
2686   - end_addr - start_addr, details);
2687   - need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2688   -
2689   - if (restart_addr >= end_addr) {
2690   - /* We have now completed this vma: mark it so */
2691   - vma->vm_truncate_count = details->truncate_count;
2692   - if (!need_break)
2693   - return 0;
2694   - } else {
2695   - /* Note restart_addr in vma's truncate_count field */
2696   - vma->vm_truncate_count = restart_addr;
2697   - if (!need_break)
2698   - goto again;
2699   - }
2700   -
2701   - spin_unlock(details->i_mmap_lock);
2702   - cond_resched();
2703   - spin_lock(details->i_mmap_lock);
2704   - return -EINTR;
  2586 + zap_page_range(vma, start_addr, end_addr - start_addr, details);
2705 2587 }
2706 2588  
2707 2589 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2708 2590  
... ... @@ -2711,12 +2593,8 @@
2711 2593 struct prio_tree_iter iter;
2712 2594 pgoff_t vba, vea, zba, zea;
2713 2595  
2714   -restart:
2715 2596 vma_prio_tree_foreach(vma, &iter, root,
2716 2597 details->first_index, details->last_index) {
2717   - /* Skip quickly over those we have already dealt with */
2718   - if (vma->vm_truncate_count == details->truncate_count)
2719   - continue;
2720 2598  
2721 2599 vba = vma->vm_pgoff;
2722 2600 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2723 2601  
... ... @@ -2728,11 +2606,10 @@
2728 2606 if (zea > vea)
2729 2607 zea = vea;
2730 2608  
2731   - if (unmap_mapping_range_vma(vma,
  2609 + unmap_mapping_range_vma(vma,
2732 2610 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2733 2611 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2734   - details) < 0)
2735   - goto restart;
  2612 + details);
2736 2613 }
2737 2614 }
2738 2615  
2739 2616  
2740 2617  
... ... @@ -2747,15 +2624,9 @@
2747 2624 * across *all* the pages in each nonlinear VMA, not just the pages
2748 2625 * whose virtual address lies outside the file truncation point.
2749 2626 */
2750   -restart:
2751 2627 list_for_each_entry(vma, head, shared.vm_set.list) {
2752   - /* Skip quickly over those we have already dealt with */
2753   - if (vma->vm_truncate_count == details->truncate_count)
2754   - continue;
2755 2628 details->nonlinear_vma = vma;
2756   - if (unmap_mapping_range_vma(vma, vma->vm_start,
2757   - vma->vm_end, details) < 0)
2758   - goto restart;
  2629 + unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2759 2630 }
2760 2631 }
2761 2632  
2762 2633  
2763 2634  
2764 2635  
... ... @@ -2794,26 +2665,14 @@
2794 2665 details.last_index = hba + hlen - 1;
2795 2666 if (details.last_index < details.first_index)
2796 2667 details.last_index = ULONG_MAX;
2797   - details.i_mmap_lock = &mapping->i_mmap_lock;
2798 2668  
2799   - mutex_lock(&mapping->unmap_mutex);
2800   - spin_lock(&mapping->i_mmap_lock);
2801 2669  
2802   - /* Protect against endless unmapping loops */
2803   - mapping->truncate_count++;
2804   - if (unlikely(is_restart_addr(mapping->truncate_count))) {
2805   - if (mapping->truncate_count == 0)
2806   - reset_vma_truncate_counts(mapping);
2807   - mapping->truncate_count++;
2808   - }
2809   - details.truncate_count = mapping->truncate_count;
2810   -
  2670 + spin_lock(&mapping->i_mmap_lock);
2811 2671 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2812 2672 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2813 2673 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2814 2674 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2815 2675 spin_unlock(&mapping->i_mmap_lock);
2816   - mutex_unlock(&mapping->unmap_mutex);
2817 2676 }
2818 2677 EXPORT_SYMBOL(unmap_mapping_range);
2819 2678  
mm/mmap.c
... ... @@ -445,10 +445,8 @@
445 445 if (vma->vm_file)
446 446 mapping = vma->vm_file->f_mapping;
447 447  
448   - if (mapping) {
  448 + if (mapping)
449 449 spin_lock(&mapping->i_mmap_lock);
450   - vma->vm_truncate_count = mapping->truncate_count;
451   - }
452 450  
453 451 __vma_link(mm, vma, prev, rb_link, rb_parent);
454 452 __vma_link_file(vma);
455 453  
... ... @@ -558,16 +556,7 @@
558 556 if (!(vma->vm_flags & VM_NONLINEAR))
559 557 root = &mapping->i_mmap;
560 558 spin_lock(&mapping->i_mmap_lock);
561   - if (importer &&
562   - vma->vm_truncate_count != next->vm_truncate_count) {
563   - /*
564   - * unmap_mapping_range might be in progress:
565   - * ensure that the expanding vma is rescanned.
566   - */
567   - importer->vm_truncate_count = 0;
568   - }
569 559 if (insert) {
570   - insert->vm_truncate_count = vma->vm_truncate_count;
571 560 /*
572 561 * Put into prio_tree now, so instantiated pages
573 562 * are visible to arm/parisc __flush_dcache_page
mm/mremap.c
... ... @@ -94,7 +94,6 @@
94 94 */
95 95 mapping = vma->vm_file->f_mapping;
96 96 spin_lock(&mapping->i_mmap_lock);
97   - new_vma->vm_truncate_count = 0;
98 97 }
99 98  
100 99 /*