Commit 025c5b2451e42c9e8dfdecd6dc84956ce8f321b5
Committed by
Linus Torvalds
1 parent
5aaabe831e
Exists in
master
and in
20 other branches
thp: optimize away unnecessary page table locking
Currently, when we check whether we can handle a thp as it is or whether we need to split it into regular sized pages, we hold the page table lock prior to checking whether a given pmd is mapping a thp or not. Because of this, when it is not a "huge pmd" we suffer unnecessary lock/unlock overhead. To remove it, this patch introduces an optimized check function and replaces several similar logic sequences with it. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: David Rientjes <rientjes@google.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 3 changed files with 101 additions and 114 deletions Side-by-side Diff
fs/proc/task_mmu.c
... | ... | @@ -394,20 +394,11 @@ |
394 | 394 | pte_t *pte; |
395 | 395 | spinlock_t *ptl; |
396 | 396 | |
397 | - spin_lock(&walk->mm->page_table_lock); | |
398 | - if (pmd_trans_huge(*pmd)) { | |
399 | - if (pmd_trans_splitting(*pmd)) { | |
400 | - spin_unlock(&walk->mm->page_table_lock); | |
401 | - wait_split_huge_page(vma->anon_vma, pmd); | |
402 | - } else { | |
403 | - smaps_pte_entry(*(pte_t *)pmd, addr, | |
404 | - HPAGE_PMD_SIZE, walk); | |
405 | - spin_unlock(&walk->mm->page_table_lock); | |
406 | - mss->anonymous_thp += HPAGE_PMD_SIZE; | |
407 | - return 0; | |
408 | - } | |
409 | - } else { | |
397 | + if (pmd_trans_huge_lock(pmd, vma) == 1) { | |
398 | + smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); | |
410 | 399 | spin_unlock(&walk->mm->page_table_lock); |
400 | + mss->anonymous_thp += HPAGE_PMD_SIZE; | |
401 | + return 0; | |
411 | 402 | } |
412 | 403 | |
413 | 404 | if (pmd_trans_unstable(pmd)) |
414 | 405 | |
415 | 406 | |
416 | 407 | |
... | ... | @@ -705,26 +696,19 @@ |
705 | 696 | /* find the first VMA at or above 'addr' */ |
706 | 697 | vma = find_vma(walk->mm, addr); |
707 | 698 | spin_lock(&walk->mm->page_table_lock); |
708 | - if (pmd_trans_huge(*pmd)) { | |
709 | - if (pmd_trans_splitting(*pmd)) { | |
710 | - spin_unlock(&walk->mm->page_table_lock); | |
711 | - wait_split_huge_page(vma->anon_vma, pmd); | |
712 | - } else { | |
713 | - for (; addr != end; addr += PAGE_SIZE) { | |
714 | - unsigned long offset; | |
699 | + if (pmd_trans_huge_lock(pmd, vma) == 1) { | |
700 | + for (; addr != end; addr += PAGE_SIZE) { | |
701 | + unsigned long offset; | |
715 | 702 | |
716 | - offset = (addr & ~PAGEMAP_WALK_MASK) >> | |
717 | - PAGE_SHIFT; | |
718 | - pfn = thp_pmd_to_pagemap_entry(*pmd, offset); | |
719 | - err = add_to_pagemap(addr, pfn, pm); | |
720 | - if (err) | |
721 | - break; | |
722 | - } | |
723 | - spin_unlock(&walk->mm->page_table_lock); | |
724 | - return err; | |
703 | + offset = (addr & ~PAGEMAP_WALK_MASK) >> | |
704 | + PAGE_SHIFT; | |
705 | + pfn = thp_pmd_to_pagemap_entry(*pmd, offset); | |
706 | + err = add_to_pagemap(addr, pfn, pm); | |
707 | + if (err) | |
708 | + break; | |
725 | 709 | } |
726 | - } else { | |
727 | 710 | spin_unlock(&walk->mm->page_table_lock); |
711 | + return err; | |
728 | 712 | } |
729 | 713 | |
730 | 714 | for (; addr != end; addr += PAGE_SIZE) { |
731 | 715 | |
732 | 716 | |
... | ... | @@ -992,24 +976,17 @@ |
992 | 976 | pte_t *pte; |
993 | 977 | |
994 | 978 | md = walk->private; |
995 | - spin_lock(&walk->mm->page_table_lock); | |
996 | - if (pmd_trans_huge(*pmd)) { | |
997 | - if (pmd_trans_splitting(*pmd)) { | |
998 | - spin_unlock(&walk->mm->page_table_lock); | |
999 | - wait_split_huge_page(md->vma->anon_vma, pmd); | |
1000 | - } else { | |
1001 | - pte_t huge_pte = *(pte_t *)pmd; | |
1002 | - struct page *page; | |
1003 | 979 | |
1004 | - page = can_gather_numa_stats(huge_pte, md->vma, addr); | |
1005 | - if (page) | |
1006 | - gather_stats(page, md, pte_dirty(huge_pte), | |
1007 | - HPAGE_PMD_SIZE/PAGE_SIZE); | |
1008 | - spin_unlock(&walk->mm->page_table_lock); | |
1009 | - return 0; | |
1010 | - } | |
1011 | - } else { | |
980 | + if (pmd_trans_huge_lock(pmd, md->vma) == 1) { | |
981 | + pte_t huge_pte = *(pte_t *)pmd; | |
982 | + struct page *page; | |
983 | + | |
984 | + page = can_gather_numa_stats(huge_pte, md->vma, addr); | |
985 | + if (page) | |
986 | + gather_stats(page, md, pte_dirty(huge_pte), | |
987 | + HPAGE_PMD_SIZE/PAGE_SIZE); | |
1012 | 988 | spin_unlock(&walk->mm->page_table_lock); |
989 | + return 0; | |
1013 | 990 | } |
1014 | 991 | |
1015 | 992 | if (pmd_trans_unstable(pmd)) |
include/linux/huge_mm.h
... | ... | @@ -113,6 +113,18 @@ |
113 | 113 | unsigned long start, |
114 | 114 | unsigned long end, |
115 | 115 | long adjust_next); |
116 | +extern int __pmd_trans_huge_lock(pmd_t *pmd, | |
117 | + struct vm_area_struct *vma); | |
118 | +/* mmap_sem must be held on entry */ | |
119 | +static inline int pmd_trans_huge_lock(pmd_t *pmd, | |
120 | + struct vm_area_struct *vma) | |
121 | +{ | |
122 | + VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); | |
123 | + if (pmd_trans_huge(*pmd)) | |
124 | + return __pmd_trans_huge_lock(pmd, vma); | |
125 | + else | |
126 | + return 0; | |
127 | +} | |
116 | 128 | static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, |
117 | 129 | unsigned long start, |
118 | 130 | unsigned long end, |
... | ... | @@ -175,6 +187,11 @@ |
175 | 187 | unsigned long end, |
176 | 188 | long adjust_next) |
177 | 189 | { |
190 | +} | |
191 | +static inline int pmd_trans_huge_lock(pmd_t *pmd, | |
192 | + struct vm_area_struct *vma) | |
193 | +{ | |
194 | + return 0; | |
178 | 195 | } |
179 | 196 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
180 | 197 |
mm/huge_memory.c
... | ... | @@ -1031,32 +1031,23 @@ |
1031 | 1031 | { |
1032 | 1032 | int ret = 0; |
1033 | 1033 | |
1034 | - spin_lock(&tlb->mm->page_table_lock); | |
1035 | - if (likely(pmd_trans_huge(*pmd))) { | |
1036 | - if (unlikely(pmd_trans_splitting(*pmd))) { | |
1037 | - spin_unlock(&tlb->mm->page_table_lock); | |
1038 | - wait_split_huge_page(vma->anon_vma, | |
1039 | - pmd); | |
1040 | - } else { | |
1041 | - struct page *page; | |
1042 | - pgtable_t pgtable; | |
1043 | - pgtable = get_pmd_huge_pte(tlb->mm); | |
1044 | - page = pmd_page(*pmd); | |
1045 | - pmd_clear(pmd); | |
1046 | - tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | |
1047 | - page_remove_rmap(page); | |
1048 | - VM_BUG_ON(page_mapcount(page) < 0); | |
1049 | - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | |
1050 | - VM_BUG_ON(!PageHead(page)); | |
1051 | - tlb->mm->nr_ptes--; | |
1052 | - spin_unlock(&tlb->mm->page_table_lock); | |
1053 | - tlb_remove_page(tlb, page); | |
1054 | - pte_free(tlb->mm, pgtable); | |
1055 | - ret = 1; | |
1056 | - } | |
1057 | - } else | |
1034 | + if (__pmd_trans_huge_lock(pmd, vma) == 1) { | |
1035 | + struct page *page; | |
1036 | + pgtable_t pgtable; | |
1037 | + pgtable = get_pmd_huge_pte(tlb->mm); | |
1038 | + page = pmd_page(*pmd); | |
1039 | + pmd_clear(pmd); | |
1040 | + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | |
1041 | + page_remove_rmap(page); | |
1042 | + VM_BUG_ON(page_mapcount(page) < 0); | |
1043 | + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | |
1044 | + VM_BUG_ON(!PageHead(page)); | |
1045 | + tlb->mm->nr_ptes--; | |
1058 | 1046 | spin_unlock(&tlb->mm->page_table_lock); |
1059 | - | |
1047 | + tlb_remove_page(tlb, page); | |
1048 | + pte_free(tlb->mm, pgtable); | |
1049 | + ret = 1; | |
1050 | + } | |
1060 | 1051 | return ret; |
1061 | 1052 | } |
1062 | 1053 | |
1063 | 1054 | |
... | ... | @@ -1066,21 +1057,15 @@ |
1066 | 1057 | { |
1067 | 1058 | int ret = 0; |
1068 | 1059 | |
1069 | - spin_lock(&vma->vm_mm->page_table_lock); | |
1070 | - if (likely(pmd_trans_huge(*pmd))) { | |
1071 | - ret = !pmd_trans_splitting(*pmd); | |
1060 | + if (__pmd_trans_huge_lock(pmd, vma) == 1) { | |
1061 | + /* | |
1062 | + * All logical pages in the range are present | |
1063 | + * if backed by a huge page. | |
1064 | + */ | |
1072 | 1065 | spin_unlock(&vma->vm_mm->page_table_lock); |
1073 | - if (unlikely(!ret)) | |
1074 | - wait_split_huge_page(vma->anon_vma, pmd); | |
1075 | - else { | |
1076 | - /* | |
1077 | - * All logical pages in the range are present | |
1078 | - * if backed by a huge page. | |
1079 | - */ | |
1080 | - memset(vec, 1, (end - addr) >> PAGE_SHIFT); | |
1081 | - } | |
1082 | - } else | |
1083 | - spin_unlock(&vma->vm_mm->page_table_lock); | |
1066 | + memset(vec, 1, (end - addr) >> PAGE_SHIFT); | |
1067 | + ret = 1; | |
1068 | + } | |
1084 | 1069 | |
1085 | 1070 | return ret; |
1086 | 1071 | } |
... | ... | @@ -1110,20 +1095,11 @@ |
1110 | 1095 | goto out; |
1111 | 1096 | } |
1112 | 1097 | |
1113 | - spin_lock(&mm->page_table_lock); | |
1114 | - if (likely(pmd_trans_huge(*old_pmd))) { | |
1115 | - if (pmd_trans_splitting(*old_pmd)) { | |
1116 | - spin_unlock(&mm->page_table_lock); | |
1117 | - wait_split_huge_page(vma->anon_vma, old_pmd); | |
1118 | - ret = -1; | |
1119 | - } else { | |
1120 | - pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | |
1121 | - VM_BUG_ON(!pmd_none(*new_pmd)); | |
1122 | - set_pmd_at(mm, new_addr, new_pmd, pmd); | |
1123 | - spin_unlock(&mm->page_table_lock); | |
1124 | - ret = 1; | |
1125 | - } | |
1126 | - } else { | |
1098 | + ret = __pmd_trans_huge_lock(old_pmd, vma); | |
1099 | + if (ret == 1) { | |
1100 | + pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | |
1101 | + VM_BUG_ON(!pmd_none(*new_pmd)); | |
1102 | + set_pmd_at(mm, new_addr, new_pmd, pmd); | |
1127 | 1103 | spin_unlock(&mm->page_table_lock); |
1128 | 1104 | } |
1129 | 1105 | out: |
1130 | 1106 | |
1131 | 1107 | |
1132 | 1108 | |
1133 | 1109 | |
... | ... | @@ -1136,24 +1112,41 @@ |
1136 | 1112 | struct mm_struct *mm = vma->vm_mm; |
1137 | 1113 | int ret = 0; |
1138 | 1114 | |
1139 | - spin_lock(&mm->page_table_lock); | |
1115 | + if (__pmd_trans_huge_lock(pmd, vma) == 1) { | |
1116 | + pmd_t entry; | |
1117 | + entry = pmdp_get_and_clear(mm, addr, pmd); | |
1118 | + entry = pmd_modify(entry, newprot); | |
1119 | + set_pmd_at(mm, addr, pmd, entry); | |
1120 | + spin_unlock(&vma->vm_mm->page_table_lock); | |
1121 | + ret = 1; | |
1122 | + } | |
1123 | + | |
1124 | + return ret; | |
1125 | +} | |
1126 | + | |
1127 | +/* | |
1128 | + * Returns 1 if a given pmd maps a stable (not under splitting) thp. | |
1129 | + * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. | |
1130 | + * | |
1131 | + * Note that if it returns 1, this routine returns without unlocking page | |
1132 | + * table locks. So callers must unlock them. | |
1133 | + */ | |
1134 | +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | |
1135 | +{ | |
1136 | + spin_lock(&vma->vm_mm->page_table_lock); | |
1140 | 1137 | if (likely(pmd_trans_huge(*pmd))) { |
1141 | 1138 | if (unlikely(pmd_trans_splitting(*pmd))) { |
1142 | - spin_unlock(&mm->page_table_lock); | |
1139 | + spin_unlock(&vma->vm_mm->page_table_lock); | |
1143 | 1140 | wait_split_huge_page(vma->anon_vma, pmd); |
1141 | + return -1; | |
1144 | 1142 | } else { |
1145 | - pmd_t entry; | |
1146 | - | |
1147 | - entry = pmdp_get_and_clear(mm, addr, pmd); | |
1148 | - entry = pmd_modify(entry, newprot); | |
1149 | - set_pmd_at(mm, addr, pmd, entry); | |
1150 | - spin_unlock(&vma->vm_mm->page_table_lock); | |
1151 | - ret = 1; | |
1143 | + /* Thp mapped by 'pmd' is stable, so we can | |
1144 | + * handle it as it is. */ | |
1145 | + return 1; | |
1152 | 1146 | } |
1153 | - } else | |
1154 | - spin_unlock(&vma->vm_mm->page_table_lock); | |
1155 | - | |
1156 | - return ret; | |
1147 | + } | |
1148 | + spin_unlock(&vma->vm_mm->page_table_lock); | |
1149 | + return 0; | |
1157 | 1150 | } |
1158 | 1151 | |
1159 | 1152 | pmd_t *page_check_address_pmd(struct page *page, |