Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99

Authored by Michel Lespinasse
Committed by Linus Torvalds
1 parent b522c94da5

mm: retry page fault when blocking on disk transfer

This change reduces mmap_sem hold times that are caused by waiting for
disk transfers when accessing file-mapped VMAs.

It introduces the FAULT_FLAG_ALLOW_RETRY flag, which indicates that the call
site wants mmap_sem to be released if blocking on a pending disk transfer.
In that case, filemap_fault() returns the VM_FAULT_RETRY status bit and
do_page_fault() will then re-acquire mmap_sem and retry the page fault.

It is expected that the retry will hit the same page, which will by then be
in the page cache, and thus the retry will complete with a short mmap_sem
hold time.
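
Condensed, the contract between the arch fault handler and handle_mm_fault()
looks like the fragment below. This is a sketch distilled from the x86 change
in the diff further down, not complete code: vma lookup, access checks, error
handling and fault accounting are elided, and the variables are those of
do_page_fault().

	unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
			     (write ? FAULT_FLAG_WRITE : 0);
	int fault;

retry:
	down_read(&mm->mmap_sem);
	/* ... look up and validate the vma ... */
	fault = handle_mm_fault(mm, vma, address, flags);

	if (fault & VM_FAULT_RETRY) {
		/*
		 * mmap_sem has already been dropped inside handle_mm_fault()
		 * while waiting for the disk transfer, so we only need to
		 * take it again. Clearing FAULT_FLAG_ALLOW_RETRY bounds the
		 * number of retries and avoids any risk of starvation.
		 */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		goto retry;
	}
	/* ... major/minor accounting, then up_read(&mm->mmap_sem) ... */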

Tests:

- microbenchmark: thread A mmaps a large file and does random read accesses
  to the mmapped area, achieving about 55 iterations/s (essentially bound by
  the disk reads its faults trigger). Thread B does mmap/munmap in a loop at
  a separate location; it achieves 55 iterations/s before this change and
  15000 iterations/s after (a minimal sketch of such a benchmark follows
  this list).

- We are seeing related effects in some in-house applications, which show
  significant performance regressions when running without this change.
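
For reference, a minimal userspace sketch of a microbenchmark along the lines
of the one above. The file name, mapping size and reporting details are
illustrative assumptions, not the exact program that produced the numbers
quoted:

	/* build with: gcc -O2 -pthread microbench.c */
	#include <fcntl.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define MAP_SIZE (1UL << 30)	/* 1GB file, larger than free page cache */
	#define PAGE	 4096UL

	static volatile unsigned long reads, maps;

	static void *reader(void *arg)	/* thread A: random faulting reads */
	{
		volatile char *map = arg;

		for (;;) {
			(void)map[(random() % (MAP_SIZE / PAGE)) * PAGE];
			reads++;
		}
		return NULL;
	}

	static void *mapper(void *arg)	/* thread B: mmap/munmap elsewhere */
	{
		(void)arg;
		for (;;) {
			void *p = mmap(NULL, PAGE, PROT_READ | PROT_WRITE,
				       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
			munmap(p, PAGE);
			maps++;
		}
		return NULL;
	}

	int main(void)
	{
		int fd = open("datafile", O_RDONLY);	/* pre-created, >= MAP_SIZE */
		void *map = mmap(NULL, MAP_SIZE, PROT_READ, MAP_PRIVATE, fd, 0);
		pthread_t a, b;

		pthread_create(&a, NULL, reader, map);
		pthread_create(&b, NULL, mapper, NULL);
		for (;;) {		/* report per-second rates for both threads */
			unsigned long r0 = reads, m0 = maps;
			sleep(1);
			printf("reads/s %lu  mmap+munmap/s %lu\n",
			       reads - r0, maps - m0);
		}
		return 0;
	}

Before this change, thread B stalls behind mmap_sem whenever thread A is
blocked on a disk read inside a page fault; with the retry path those waits
no longer happen with mmap_sem held, which is where the 55 vs 15000
iterations/s difference for thread B comes from.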

[akpm@linux-foundation.org: fix warning & crash]
Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Ying Han <yinghan@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 64 additions and 15 deletions

arch/x86/mm/fault.c
... ... @@ -956,8 +956,10 @@
956 956 struct task_struct *tsk;
957 957 unsigned long address;
958 958 struct mm_struct *mm;
959   - int write;
960 959 int fault;
  960 + int write = error_code & PF_WRITE;
  961 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
  962 + (write ? FAULT_FLAG_WRITE : 0);
961 963  
962 964 tsk = current;
963 965 mm = tsk->mm;
... ... @@ -1068,6 +1070,7 @@
1068 1070 bad_area_nosemaphore(regs, error_code, address);
1069 1071 return;
1070 1072 }
  1073 +retry:
1071 1074 down_read(&mm->mmap_sem);
1072 1075 } else {
1073 1076 /*
... ... @@ -1111,8 +1114,6 @@
1111 1114 * we can handle it..
1112 1115 */
1113 1116 good_area:
1114   - write = error_code & PF_WRITE;
1115   -
1116 1117 if (unlikely(access_error(error_code, write, vma))) {
1117 1118 bad_area_access_error(regs, error_code, address);
1118 1119 return;
1119 1120 }
... ... @@ -1123,21 +1124,34 @@
1123 1124 * make sure we exit gracefully rather than endlessly redo
1124 1125 * the fault:
1125 1126 */
1126   - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
  1127 + fault = handle_mm_fault(mm, vma, address, flags);
1127 1128  
1128 1129 if (unlikely(fault & VM_FAULT_ERROR)) {
1129 1130 mm_fault_error(regs, error_code, address, fault);
1130 1131 return;
1131 1132 }
1132 1133  
1133   - if (fault & VM_FAULT_MAJOR) {
1134   - tsk->maj_flt++;
1135   - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1136   - regs, address);
1137   - } else {
1138   - tsk->min_flt++;
1139   - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1140   - regs, address);
  1134 + /*
  1135 + * Major/minor page fault accounting is only done on the
  1136 + * initial attempt. If we go through a retry, it is extremely
  1137 + * likely that the page will be found in page cache at that point.
  1138 + */
  1139 + if (flags & FAULT_FLAG_ALLOW_RETRY) {
  1140 + if (fault & VM_FAULT_MAJOR) {
  1141 + tsk->maj_flt++;
  1142 + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
  1143 + regs, address);
  1144 + } else {
  1145 + tsk->min_flt++;
  1146 + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
  1147 + regs, address);
  1148 + }
  1149 + if (fault & VM_FAULT_RETRY) {
  1150 + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
  1151 + * of starvation. */
  1152 + flags &= ~FAULT_FLAG_ALLOW_RETRY;
  1153 + goto retry;
  1154 + }
1141 1155 }
1142 1156  
1143 1157 check_v8086_mode(regs, address, tsk);
include/linux/mm.h
... ... @@ -144,6 +144,7 @@
144 144 #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
145 145 #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
146 146 #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */
  147 +#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */
147 148  
148 149 /*
149 150 * This interface is used by x86 PAT code to identify a pfn mapping that is
... ... @@ -723,6 +724,7 @@
723 724  
724 725 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
725 726 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
  727 +#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
726 728  
727 729 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
728 730  
include/linux/pagemap.h
... ... @@ -299,6 +299,8 @@
299 299 extern void __lock_page(struct page *page);
300 300 extern int __lock_page_killable(struct page *page);
301 301 extern void __lock_page_nosync(struct page *page);
  302 +extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
  303 + unsigned int flags);
302 304 extern void unlock_page(struct page *page);
303 305  
304 306 static inline void __set_page_locked(struct page *page)
... ... @@ -350,6 +352,17 @@
350 352 __lock_page_nosync(page);
351 353 }
352 354  
  355 +/*
  356 + * lock_page_or_retry - Lock the page, unless this would block and the
  357 + * caller indicated that it can handle a retry.
  358 + */
  359 +static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
  360 + unsigned int flags)
  361 +{
  362 + might_sleep();
  363 + return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
  364 +}
  365 +
353 366 /*
354 367 * This is exported only for wait_on_page_locked/wait_on_page_writeback.
355 368 * Never use this directly!
mm/filemap.c
... ... @@ -612,6 +612,19 @@
612 612 TASK_UNINTERRUPTIBLE);
613 613 }
614 614  
  615 +int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
  616 + unsigned int flags)
  617 +{
  618 + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
  619 + __lock_page(page);
  620 + return 1;
  621 + } else {
  622 + up_read(&mm->mmap_sem);
  623 + wait_on_page_locked(page);
  624 + return 0;
  625 + }
  626 +}
  627 +
615 628 /**
616 629 * find_get_page - find and get a page reference
617 630 * @mapping: the address_space to search
... ... @@ -1550,7 +1563,8 @@
1550 1563 goto no_cached_page;
1551 1564 }
1552 1565  
1553   - lock_page(page);
  1566 + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
  1567 + return ret | VM_FAULT_RETRY;
1554 1568  
1555 1569 /* Did it get truncated? */
1556 1570 if (unlikely(page->mapping != mapping)) {
mm/memory.c
... ... @@ -2627,6 +2627,7 @@
2627 2627 struct page *page, *swapcache = NULL;
2628 2628 swp_entry_t entry;
2629 2629 pte_t pte;
  2630 + int locked;
2630 2631 struct mem_cgroup *ptr = NULL;
2631 2632 int exclusive = 0;
2632 2633 int ret = 0;
2633 2634  
... ... @@ -2677,8 +2678,12 @@
2677 2678 goto out_release;
2678 2679 }
2679 2680  
2680   - lock_page(page);
  2681 + locked = lock_page_or_retry(page, mm, flags);
2681 2682 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  2683 + if (!locked) {
  2684 + ret |= VM_FAULT_RETRY;
  2685 + goto out_release;
  2686 + }
2682 2687  
2683 2688 /*
2684 2689 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
... ... @@ -2927,7 +2932,8 @@
2927 2932 vmf.page = NULL;
2928 2933  
2929 2934 ret = vma->vm_ops->fault(vma, &vmf);
2930   - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
  2935 + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
  2936 + VM_FAULT_RETRY)))
2931 2937 return ret;
2932 2938  
2933 2939 if (unlikely(PageHWPoison(vmf.page))) {
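
For completeness, a sketch of how some other ->fault implementation could opt
into the same protocol via lock_page_or_retry(). Only lock_page_or_retry(),
VM_FAULT_RETRY, FAULT_FLAG_ALLOW_RETRY (arriving in vmf->flags) and the usual
fault interfaces come from this patch; example_fault() and
example_lookup_page() are hypothetical placeholders, not code from the tree:

	static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct page *page;

		/* placeholder: look up the backing page, taking a reference */
		page = example_lookup_page(vma->vm_file->f_mapping, vmf->pgoff);
		if (!page)
			return VM_FAULT_SIGBUS;

		/*
		 * If the page is locked (e.g. read I/O still in flight) and the
		 * caller passed FAULT_FLAG_ALLOW_RETRY, lock_page_or_retry()
		 * drops mmap_sem, waits for the page to be unlocked, and
		 * returns 0 without the lock held. We then drop our reference
		 * and ask the arch fault handler to retry, instead of sleeping
		 * on the disk transfer with mmap_sem held.
		 */
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}

		vmf->page = page;	/* returned locked and referenced, as usual */
		return VM_FAULT_LOCKED;
	}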