Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99
Committed by
Linus Torvalds
1 parent
b522c94da5
Exists in
master
and in
4 other branches
mm: retry page fault when blocking on disk transfer
This change reduces mmap_sem hold times that are caused by waiting for disk transfers when accessing file-mapped VMAs.

It introduces the FAULT_FLAG_ALLOW_RETRY flag, which indicates that the call site wants mmap_sem to be released if blocking on a pending disk transfer. In that case, filemap_fault() returns the VM_FAULT_RETRY status bit and do_page_fault() will then re-acquire mmap_sem and retry the page fault.

It is expected that the retry will hit the same page, which will now be cached, and thus it will complete with a low mmap_sem hold time.

Tests:

- microbenchmark: thread A mmaps a large file and does random read accesses to the mmapped area - achieves about 55 iterations/s. Thread B does mmap/munmap in a loop at a separate location - achieves 55 iterations/s before, 15000 iterations/s after.

- We are seeing related effects in some applications in-house, which show significant performance regressions when running without this change.

[akpm@linux-foundation.org: fix warning & crash]
Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Ying Han <yinghan@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 5 changed files with 64 additions and 15 deletions Side-by-side Diff
arch/x86/mm/fault.c
... | ... | @@ -956,8 +956,10 @@ |
956 | 956 | struct task_struct *tsk; |
957 | 957 | unsigned long address; |
958 | 958 | struct mm_struct *mm; |
959 | - int write; | |
960 | 959 | int fault; |
960 | + int write = error_code & PF_WRITE; | |
961 | + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | | |
962 | + (write ? FAULT_FLAG_WRITE : 0); | |
961 | 963 | |
962 | 964 | tsk = current; |
963 | 965 | mm = tsk->mm; |
... | ... | @@ -1068,6 +1070,7 @@ |
1068 | 1070 | bad_area_nosemaphore(regs, error_code, address); |
1069 | 1071 | return; |
1070 | 1072 | } |
1073 | +retry: | |
1071 | 1074 | down_read(&mm->mmap_sem); |
1072 | 1075 | } else { |
1073 | 1076 | /* |
... | ... | @@ -1111,8 +1114,6 @@ |
1111 | 1114 | * we can handle it.. |
1112 | 1115 | */ |
1113 | 1116 | good_area: |
1114 | - write = error_code & PF_WRITE; | |
1115 | - | |
1116 | 1117 | if (unlikely(access_error(error_code, write, vma))) { |
1117 | 1118 | bad_area_access_error(regs, error_code, address); |
1118 | 1119 | return; |
1119 | 1120 | |
... | ... | @@ -1123,21 +1124,34 @@ |
1123 | 1124 | * make sure we exit gracefully rather than endlessly redo |
1124 | 1125 | * the fault: |
1125 | 1126 | */ |
1126 | - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); | |
1127 | + fault = handle_mm_fault(mm, vma, address, flags); | |
1127 | 1128 | |
1128 | 1129 | if (unlikely(fault & VM_FAULT_ERROR)) { |
1129 | 1130 | mm_fault_error(regs, error_code, address, fault); |
1130 | 1131 | return; |
1131 | 1132 | } |
1132 | 1133 | |
1133 | - if (fault & VM_FAULT_MAJOR) { | |
1134 | - tsk->maj_flt++; | |
1135 | - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | |
1136 | - regs, address); | |
1137 | - } else { | |
1138 | - tsk->min_flt++; | |
1139 | - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | |
1140 | - regs, address); | |
1134 | + /* | |
1135 | + * Major/minor page fault accounting is only done on the | |
1136 | + * initial attempt. If we go through a retry, it is extremely | |
1137 | + * likely that the page will be found in page cache at that point. | |
1138 | + */ | |
1139 | + if (flags & FAULT_FLAG_ALLOW_RETRY) { | |
1140 | + if (fault & VM_FAULT_MAJOR) { | |
1141 | + tsk->maj_flt++; | |
1142 | + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | |
1143 | + regs, address); | |
1144 | + } else { | |
1145 | + tsk->min_flt++; | |
1146 | + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | |
1147 | + regs, address); | |
1148 | + } | |
1149 | + if (fault & VM_FAULT_RETRY) { | |
1150 | + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | |
1151 | + * of starvation. */ | |
1152 | + flags &= ~FAULT_FLAG_ALLOW_RETRY; | |
1153 | + goto retry; | |
1154 | + } | |
1141 | 1155 | } |
1142 | 1156 | |
1143 | 1157 | check_v8086_mode(regs, address, tsk); |
include/linux/mm.h
... | ... | @@ -144,6 +144,7 @@ |
144 | 144 | #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ |
145 | 145 | #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ |
146 | 146 | #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ |
147 | +#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ | |
147 | 148 | |
148 | 149 | /* |
149 | 150 | * This interface is used by x86 PAT code to identify a pfn mapping that is |
... | ... | @@ -723,6 +724,7 @@ |
723 | 724 | |
724 | 725 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ |
725 | 726 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
727 | +#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ | |
726 | 728 | |
727 | 729 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ |
728 | 730 |
include/linux/pagemap.h
... | ... | @@ -299,6 +299,8 @@ |
299 | 299 | extern void __lock_page(struct page *page); |
300 | 300 | extern int __lock_page_killable(struct page *page); |
301 | 301 | extern void __lock_page_nosync(struct page *page); |
302 | +extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | |
303 | + unsigned int flags); | |
302 | 304 | extern void unlock_page(struct page *page); |
303 | 305 | |
304 | 306 | static inline void __set_page_locked(struct page *page) |
... | ... | @@ -350,6 +352,17 @@ |
350 | 352 | __lock_page_nosync(page); |
351 | 353 | } |
352 | 354 | |
355 | +/* | |
356 | + * lock_page_or_retry - Lock the page, unless this would block and the | |
357 | + * caller indicated that it can handle a retry. | |
358 | + */ | |
359 | +static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, | |
360 | + unsigned int flags) | |
361 | +{ | |
362 | + might_sleep(); | |
363 | + return trylock_page(page) || __lock_page_or_retry(page, mm, flags); | |
364 | +} | |
365 | + | |
353 | 366 | /* |
354 | 367 | * This is exported only for wait_on_page_locked/wait_on_page_writeback. |
355 | 368 | * Never use this directly! |
mm/filemap.c
... | ... | @@ -612,6 +612,19 @@ |
612 | 612 | TASK_UNINTERRUPTIBLE); |
613 | 613 | } |
614 | 614 | |
615 | +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | |
616 | + unsigned int flags) | |
617 | +{ | |
618 | + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { | |
619 | + __lock_page(page); | |
620 | + return 1; | |
621 | + } else { | |
622 | + up_read(&mm->mmap_sem); | |
623 | + wait_on_page_locked(page); | |
624 | + return 0; | |
625 | + } | |
626 | +} | |
627 | + | |
615 | 628 | /** |
616 | 629 | * find_get_page - find and get a page reference |
617 | 630 | * @mapping: the address_space to search |
... | ... | @@ -1550,7 +1563,8 @@ |
1550 | 1563 | goto no_cached_page; |
1551 | 1564 | } |
1552 | 1565 | |
1553 | - lock_page(page); | |
1566 | + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) | |
1567 | + return ret | VM_FAULT_RETRY; | |
1554 | 1568 | |
1555 | 1569 | /* Did it get truncated? */ |
1556 | 1570 | if (unlikely(page->mapping != mapping)) { |
mm/memory.c
... | ... | @@ -2627,6 +2627,7 @@ |
2627 | 2627 | struct page *page, *swapcache = NULL; |
2628 | 2628 | swp_entry_t entry; |
2629 | 2629 | pte_t pte; |
2630 | + int locked; | |
2630 | 2631 | struct mem_cgroup *ptr = NULL; |
2631 | 2632 | int exclusive = 0; |
2632 | 2633 | int ret = 0; |
2633 | 2634 | |
... | ... | @@ -2677,8 +2678,12 @@ |
2677 | 2678 | goto out_release; |
2678 | 2679 | } |
2679 | 2680 | |
2680 | - lock_page(page); | |
2681 | + locked = lock_page_or_retry(page, mm, flags); | |
2681 | 2682 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2683 | + if (!locked) { | |
2684 | + ret |= VM_FAULT_RETRY; | |
2685 | + goto out_release; | |
2686 | + } | |
2682 | 2687 | |
2683 | 2688 | /* |
2684 | 2689 | * Make sure try_to_free_swap or reuse_swap_page or swapoff did not |
... | ... | @@ -2927,7 +2932,8 @@ |
2927 | 2932 | vmf.page = NULL; |
2928 | 2933 | |
2929 | 2934 | ret = vma->vm_ops->fault(vma, &vmf); |
2930 | - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | |
2935 | + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | | |
2936 | + VM_FAULT_RETRY))) | |
2931 | 2937 | return ret; |
2932 | 2938 | |
2933 | 2939 | if (unlikely(PageHWPoison(vmf.page))) { |