Commit 025c5b2451e42c9e8dfdecd6dc84956ce8f321b5

Authored by Naoya Horiguchi
Committed by Linus Torvalds
1 parent 5aaabe831e

thp: optimize away unnecessary page table locking

Currently, when we check whether we can handle a thp as it is or need to
split it into regular-sized pages, we take the page table lock before
checking whether the given pmd maps a thp at all.  Because of this, when
the pmd is not a huge pmd we pay unnecessary lock/unlock overhead.  To
remove it, this patch introduces an optimized check function,
pmd_trans_huge_lock(), and replaces several pieces of similar logic with
it (a short caller sketch follows below).

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

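For illustration only (not part of the patch): a minimal sketch of the caller pattern the new helper enables, assuming the usual kernel headers; pmd_walk_example() is an invented name, not a function in this commit.

#include <linux/mm.h>
#include <linux/huge_mm.h>

/*
 * pmd_trans_huge_lock() takes mm->page_table_lock only when *pmd may map
 * a thp.  If it returns 1 the lock is still held and the whole huge pmd
 * can be handled at once (the caller must unlock); otherwise the lock has
 * already been dropped and we fall through to the ordinary per-pte walk,
 * with no lock/unlock round trip for the common non-huge case.
 */
static int pmd_walk_example(pmd_t *pmd, struct vm_area_struct *vma)
{
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		/* handle the stable thp mapped by *pmd here */
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	/* not a stable huge pmd: continue with the regular pte path */
	return 0;
}
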
Showing 3 changed files with 101 additions and 114 deletions

fs/proc/task_mmu.c
... ... @@ -394,20 +394,11 @@
394 394 pte_t *pte;
395 395 spinlock_t *ptl;
396 396  
397   - spin_lock(&walk->mm->page_table_lock);
398   - if (pmd_trans_huge(*pmd)) {
399   - if (pmd_trans_splitting(*pmd)) {
400   - spin_unlock(&walk->mm->page_table_lock);
401   - wait_split_huge_page(vma->anon_vma, pmd);
402   - } else {
403   - smaps_pte_entry(*(pte_t *)pmd, addr,
404   - HPAGE_PMD_SIZE, walk);
405   - spin_unlock(&walk->mm->page_table_lock);
406   - mss->anonymous_thp += HPAGE_PMD_SIZE;
407   - return 0;
408   - }
409   - } else {
  397 + if (pmd_trans_huge_lock(pmd, vma) == 1) {
  398 + smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
410 399 spin_unlock(&walk->mm->page_table_lock);
  400 + mss->anonymous_thp += HPAGE_PMD_SIZE;
  401 + return 0;
411 402 }
412 403  
413 404 if (pmd_trans_unstable(pmd))
... ... @@ -705,26 +696,19 @@
705 696 /* find the first VMA at or above 'addr' */
706 697 vma = find_vma(walk->mm, addr);
707 698 spin_lock(&walk->mm->page_table_lock);
708   - if (pmd_trans_huge(*pmd)) {
709   - if (pmd_trans_splitting(*pmd)) {
710   - spin_unlock(&walk->mm->page_table_lock);
711   - wait_split_huge_page(vma->anon_vma, pmd);
712   - } else {
713   - for (; addr != end; addr += PAGE_SIZE) {
714   - unsigned long offset;
  699 + if (pmd_trans_huge_lock(pmd, vma) == 1) {
  700 + for (; addr != end; addr += PAGE_SIZE) {
  701 + unsigned long offset;
715 702  
716   - offset = (addr & ~PAGEMAP_WALK_MASK) >>
717   - PAGE_SHIFT;
718   - pfn = thp_pmd_to_pagemap_entry(*pmd, offset);
719   - err = add_to_pagemap(addr, pfn, pm);
720   - if (err)
721   - break;
722   - }
723   - spin_unlock(&walk->mm->page_table_lock);
724   - return err;
  703 + offset = (addr & ~PAGEMAP_WALK_MASK) >>
  704 + PAGE_SHIFT;
  705 + pfn = thp_pmd_to_pagemap_entry(*pmd, offset);
  706 + err = add_to_pagemap(addr, pfn, pm);
  707 + if (err)
  708 + break;
725 709 }
726   - } else {
727 710 spin_unlock(&walk->mm->page_table_lock);
  711 + return err;
728 712 }
729 713  
730 714 for (; addr != end; addr += PAGE_SIZE) {
... ... @@ -992,24 +976,17 @@
992 976 pte_t *pte;
993 977  
994 978 md = walk->private;
995   - spin_lock(&walk->mm->page_table_lock);
996   - if (pmd_trans_huge(*pmd)) {
997   - if (pmd_trans_splitting(*pmd)) {
998   - spin_unlock(&walk->mm->page_table_lock);
999   - wait_split_huge_page(md->vma->anon_vma, pmd);
1000   - } else {
1001   - pte_t huge_pte = *(pte_t *)pmd;
1002   - struct page *page;
1003 979  
1004   - page = can_gather_numa_stats(huge_pte, md->vma, addr);
1005   - if (page)
1006   - gather_stats(page, md, pte_dirty(huge_pte),
1007   - HPAGE_PMD_SIZE/PAGE_SIZE);
1008   - spin_unlock(&walk->mm->page_table_lock);
1009   - return 0;
1010   - }
1011   - } else {
  980 + if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
  981 + pte_t huge_pte = *(pte_t *)pmd;
  982 + struct page *page;
  983 +
  984 + page = can_gather_numa_stats(huge_pte, md->vma, addr);
  985 + if (page)
  986 + gather_stats(page, md, pte_dirty(huge_pte),
  987 + HPAGE_PMD_SIZE/PAGE_SIZE);
1012 988 spin_unlock(&walk->mm->page_table_lock);
  989 + return 0;
1013 990 }
1014 991  
1015 992 if (pmd_trans_unstable(pmd))
include/linux/huge_mm.h
... ... @@ -113,6 +113,18 @@
113 113 unsigned long start,
114 114 unsigned long end,
115 115 long adjust_next);
  116 +extern int __pmd_trans_huge_lock(pmd_t *pmd,
  117 + struct vm_area_struct *vma);
  118 +/* mmap_sem must be held on entry */
  119 +static inline int pmd_trans_huge_lock(pmd_t *pmd,
  120 + struct vm_area_struct *vma)
  121 +{
  122 + VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
  123 + if (pmd_trans_huge(*pmd))
  124 + return __pmd_trans_huge_lock(pmd, vma);
  125 + else
  126 + return 0;
  127 +}
116 128 static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
117 129 unsigned long start,
118 130 unsigned long end,
... ... @@ -175,6 +187,11 @@
175 187 unsigned long end,
176 188 long adjust_next)
177 189 {
  190 +}
  191 +static inline int pmd_trans_huge_lock(pmd_t *pmd,
  192 + struct vm_area_struct *vma)
  193 +{
  194 + return 0;
178 195 }
179 196 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
180 197
mm/huge_memory.c
... ... @@ -1031,32 +1031,23 @@
1031 1031 {
1032 1032 int ret = 0;
1033 1033  
1034   - spin_lock(&tlb->mm->page_table_lock);
1035   - if (likely(pmd_trans_huge(*pmd))) {
1036   - if (unlikely(pmd_trans_splitting(*pmd))) {
1037   - spin_unlock(&tlb->mm->page_table_lock);
1038   - wait_split_huge_page(vma->anon_vma,
1039   - pmd);
1040   - } else {
1041   - struct page *page;
1042   - pgtable_t pgtable;
1043   - pgtable = get_pmd_huge_pte(tlb->mm);
1044   - page = pmd_page(*pmd);
1045   - pmd_clear(pmd);
1046   - tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1047   - page_remove_rmap(page);
1048   - VM_BUG_ON(page_mapcount(page) < 0);
1049   - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1050   - VM_BUG_ON(!PageHead(page));
1051   - tlb->mm->nr_ptes--;
1052   - spin_unlock(&tlb->mm->page_table_lock);
1053   - tlb_remove_page(tlb, page);
1054   - pte_free(tlb->mm, pgtable);
1055   - ret = 1;
1056   - }
1057   - } else
  1034 + if (__pmd_trans_huge_lock(pmd, vma) == 1) {
  1035 + struct page *page;
  1036 + pgtable_t pgtable;
  1037 + pgtable = get_pmd_huge_pte(tlb->mm);
  1038 + page = pmd_page(*pmd);
  1039 + pmd_clear(pmd);
  1040 + tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  1041 + page_remove_rmap(page);
  1042 + VM_BUG_ON(page_mapcount(page) < 0);
  1043 + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
  1044 + VM_BUG_ON(!PageHead(page));
  1045 + tlb->mm->nr_ptes--;
1058 1046 spin_unlock(&tlb->mm->page_table_lock);
1059   -
  1047 + tlb_remove_page(tlb, page);
  1048 + pte_free(tlb->mm, pgtable);
  1049 + ret = 1;
  1050 + }
1060 1051 return ret;
1061 1052 }
1062 1053  
... ... @@ -1066,21 +1057,15 @@
1066 1057 {
1067 1058 int ret = 0;
1068 1059  
1069   - spin_lock(&vma->vm_mm->page_table_lock);
1070   - if (likely(pmd_trans_huge(*pmd))) {
1071   - ret = !pmd_trans_splitting(*pmd);
  1060 + if (__pmd_trans_huge_lock(pmd, vma) == 1) {
  1061 + /*
  1062 + * All logical pages in the range are present
  1063 + * if backed by a huge page.
  1064 + */
1072 1065 spin_unlock(&vma->vm_mm->page_table_lock);
1073   - if (unlikely(!ret))
1074   - wait_split_huge_page(vma->anon_vma, pmd);
1075   - else {
1076   - /*
1077   - * All logical pages in the range are present
1078   - * if backed by a huge page.
1079   - */
1080   - memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1081   - }
1082   - } else
1083   - spin_unlock(&vma->vm_mm->page_table_lock);
  1066 + memset(vec, 1, (end - addr) >> PAGE_SHIFT);
  1067 + ret = 1;
  1068 + }
1084 1069  
1085 1070 return ret;
1086 1071 }
... ... @@ -1110,20 +1095,11 @@
1110 1095 goto out;
1111 1096 }
1112 1097  
1113   - spin_lock(&mm->page_table_lock);
1114   - if (likely(pmd_trans_huge(*old_pmd))) {
1115   - if (pmd_trans_splitting(*old_pmd)) {
1116   - spin_unlock(&mm->page_table_lock);
1117   - wait_split_huge_page(vma->anon_vma, old_pmd);
1118   - ret = -1;
1119   - } else {
1120   - pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1121   - VM_BUG_ON(!pmd_none(*new_pmd));
1122   - set_pmd_at(mm, new_addr, new_pmd, pmd);
1123   - spin_unlock(&mm->page_table_lock);
1124   - ret = 1;
1125   - }
1126   - } else {
  1098 + ret = __pmd_trans_huge_lock(old_pmd, vma);
  1099 + if (ret == 1) {
  1100 + pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
  1101 + VM_BUG_ON(!pmd_none(*new_pmd));
  1102 + set_pmd_at(mm, new_addr, new_pmd, pmd);
1127 1103 spin_unlock(&mm->page_table_lock);
1128 1104 }
1129 1105 out:
... ... @@ -1136,24 +1112,41 @@
1136 1112 struct mm_struct *mm = vma->vm_mm;
1137 1113 int ret = 0;
1138 1114  
1139   - spin_lock(&mm->page_table_lock);
  1115 + if (__pmd_trans_huge_lock(pmd, vma) == 1) {
  1116 + pmd_t entry;
  1117 + entry = pmdp_get_and_clear(mm, addr, pmd);
  1118 + entry = pmd_modify(entry, newprot);
  1119 + set_pmd_at(mm, addr, pmd, entry);
  1120 + spin_unlock(&vma->vm_mm->page_table_lock);
  1121 + ret = 1;
  1122 + }
  1123 +
  1124 + return ret;
  1125 +}
  1126 +
  1127 +/*
  1128 + * Returns 1 if a given pmd maps a stable (not under splitting) thp.
  1129 + * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
  1130 + *
  1131 + * Note that if it returns 1, this routine returns without unlocking page
  1132 + * table locks. So callers must unlock them.
  1133 + */
  1134 +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
  1135 +{
  1136 + spin_lock(&vma->vm_mm->page_table_lock);
1140 1137 if (likely(pmd_trans_huge(*pmd))) {
1141 1138 if (unlikely(pmd_trans_splitting(*pmd))) {
1142   - spin_unlock(&mm->page_table_lock);
  1139 + spin_unlock(&vma->vm_mm->page_table_lock);
1143 1140 wait_split_huge_page(vma->anon_vma, pmd);
  1141 + return -1;
1144 1142 } else {
1145   - pmd_t entry;
1146   -
1147   - entry = pmdp_get_and_clear(mm, addr, pmd);
1148   - entry = pmd_modify(entry, newprot);
1149   - set_pmd_at(mm, addr, pmd, entry);
1150   - spin_unlock(&vma->vm_mm->page_table_lock);
1151   - ret = 1;
  1143 + /* Thp mapped by 'pmd' is stable, so we can
  1144 + * handle it as it is. */
  1145 + return 1;
1152 1146 }
1153   - } else
1154   - spin_unlock(&vma->vm_mm->page_table_lock);
1155   -
1156   - return ret;
  1147 + }
  1148 + spin_unlock(&vma->vm_mm->page_table_lock);
  1149 + return 0;
1157 1150 }
1158 1151  
1159 1152 pmd_t *page_check_address_pmd(struct page *page,
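
For illustration only (not part of the patch): a sketch of how a caller can react to all three return values of the new __pmd_trans_huge_lock() helper; pmd_lock_example() is an invented name.

/*
 * Return contract of __pmd_trans_huge_lock():
 *   1  -> *pmd maps a stable thp; page_table_lock is held, caller unlocks.
 *  -1  -> the thp was under splitting; the helper dropped the lock and
 *         waited for the split to finish.
 *   0  -> not a huge pmd; the lock has already been dropped.
 */
static int pmd_lock_example(pmd_t *pmd, struct vm_area_struct *vma)
{
	switch (__pmd_trans_huge_lock(pmd, vma)) {
	case 1:
		/* handle the whole huge pmd, then release the lock */
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 1;
	case -1:
		/* split was in progress; report it, as move_huge_pmd() does */
		return -1;
	default:
		/* regular pmd; fall back to the pte-level path */
		return 0;
	}
}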