Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99

Authored by Michel Lespinasse
Committed by Linus Torvalds
1 parent b522c94da5

mm: retry page fault when blocking on disk transfer

This change reduces mmap_sem hold times that are caused by waiting for
disk transfers when accessing file-mapped VMAs.

It introduces the FAULT_FLAG_ALLOW_RETRY flag, which indicates that the call
site wants mmap_sem to be released if blocking on a pending disk transfer.
In that case, filemap_fault() returns the VM_FAULT_RETRY status bit and
do_page_fault() will then re-acquire mmap_sem and retry the page fault.

It is expected that the retry will hit the same page, which will by then be
in the page cache, and thus the retry will complete with a short mmap_sem
hold time.
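
Condensed, the contract between the arch fault handler and handle_mm_fault()
looks like the fragment below. This is a sketch distilled from the x86 change
in the diff further down, not complete code: vma lookup, access checks, error
handling and fault accounting are elided, and the variables are those of
do_page_fault().

	unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
			     (write ? FAULT_FLAG_WRITE : 0);
	int fault;

retry:
	down_read(&mm->mmap_sem);
	/* ... look up and validate the vma ... */
	fault = handle_mm_fault(mm, vma, address, flags);

	if (fault & VM_FAULT_RETRY) {
		/*
		 * mmap_sem has already been dropped inside handle_mm_fault()
		 * while waiting for the disk transfer, so we only need to
		 * take it again. Clearing FAULT_FLAG_ALLOW_RETRY bounds the
		 * number of retries and avoids any risk of starvation.
		 */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		goto retry;
	}
	/* ... major/minor accounting, then up_read(&mm->mmap_sem) ... */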

Tests:

- microbenchmark: thread A mmaps a large file and does random read accesses
  to the mmapped area, achieving about 55 iterations/s (essentially bound by
  the disk reads its faults trigger). Thread B does mmap/munmap in a loop at
  a separate location; it achieves 55 iterations/s before this change and
  15000 iterations/s after (a minimal sketch of such a benchmark follows
  this list).

- We are seeing related effects in some in-house applications, which show
  significant performance regressions when running without this change.
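
For reference, a minimal userspace sketch of a microbenchmark along the lines
of the one above. The file name, mapping size and reporting details are
illustrative assumptions, not the exact program that produced the numbers
quoted:

	/* build with: gcc -O2 -pthread microbench.c */
	#include <fcntl.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define MAP_SIZE (1UL << 30)	/* 1GB file, larger than free page cache */
	#define PAGE	 4096UL

	static volatile unsigned long reads, maps;

	static void *reader(void *arg)	/* thread A: random faulting reads */
	{
		volatile char *map = arg;

		for (;;) {
			(void)map[(random() % (MAP_SIZE / PAGE)) * PAGE];
			reads++;
		}
		return NULL;
	}

	static void *mapper(void *arg)	/* thread B: mmap/munmap elsewhere */
	{
		(void)arg;
		for (;;) {
			void *p = mmap(NULL, PAGE, PROT_READ | PROT_WRITE,
				       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
			munmap(p, PAGE);
			maps++;
		}
		return NULL;
	}

	int main(void)
	{
		int fd = open("datafile", O_RDONLY);	/* pre-created, >= MAP_SIZE */
		void *map = mmap(NULL, MAP_SIZE, PROT_READ, MAP_PRIVATE, fd, 0);
		pthread_t a, b;

		pthread_create(&a, NULL, reader, map);
		pthread_create(&b, NULL, mapper, NULL);
		for (;;) {		/* report per-second rates for both threads */
			unsigned long r0 = reads, m0 = maps;
			sleep(1);
			printf("reads/s %lu  mmap+munmap/s %lu\n",
			       reads - r0, maps - m0);
		}
		return 0;
	}

Before this change, thread B stalls behind mmap_sem whenever thread A is
blocked on a disk read inside a page fault; with the retry path those waits
no longer happen with mmap_sem held, which is where the 55 vs 15000
iterations/s difference for thread B comes from.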

[akpm@linux-foundation.org: fix warning & crash]
Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Ying Han <yinghan@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 64 additions and 15 deletions

arch/x86/mm/fault.c
... ... @@ -956,8 +956,10 @@
956 956 struct task_struct *tsk;
957 957 unsigned long address;
958 958 struct mm_struct *mm;
959   - int write;
960 959 int fault;
  960 + int write = error_code & PF_WRITE;
  961 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
  962 + (write ? FAULT_FLAG_WRITE : 0);
961 963  
962 964 tsk = current;
963 965 mm = tsk->mm;
... ... @@ -1068,6 +1070,7 @@
1068 1070 bad_area_nosemaphore(regs, error_code, address);
1069 1071 return;
1070 1072 }
  1073 +retry:
1071 1074 down_read(&mm->mmap_sem);
1072 1075 } else {
1073 1076 /*
... ... @@ -1111,8 +1114,6 @@
1111 1114 * we can handle it..
1112 1115 */
1113 1116 good_area:
1114   - write = error_code & PF_WRITE;
1115   -
1116 1117 if (unlikely(access_error(error_code, write, vma))) {
1117 1118 bad_area_access_error(regs, error_code, address);
1118 1119 return;
1119 1120 }
... ... @@ -1123,21 +1124,34 @@
1123 1124 * make sure we exit gracefully rather than endlessly redo
1124 1125 * the fault:
1125 1126 */
1126   - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
  1127 + fault = handle_mm_fault(mm, vma, address, flags);
1127 1128  
1128 1129 if (unlikely(fault & VM_FAULT_ERROR)) {
1129 1130 mm_fault_error(regs, error_code, address, fault);
1130 1131 return;
1131 1132 }
1132 1133  
1133   - if (fault & VM_FAULT_MAJOR) {
1134   - tsk->maj_flt++;
1135   - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1136   - regs, address);
1137   - } else {
1138   - tsk->min_flt++;
1139   - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1140   - regs, address);
  1134 + /*
  1135 + * Major/minor page fault accounting is only done on the
  1136 + * initial attempt. If we go through a retry, it is extremely
  1137 + * likely that the page will be found in page cache at that point.
  1138 + */
  1139 + if (flags & FAULT_FLAG_ALLOW_RETRY) {
  1140 + if (fault & VM_FAULT_MAJOR) {
  1141 + tsk->maj_flt++;
  1142 + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
  1143 + regs, address);
  1144 + } else {
  1145 + tsk->min_flt++;
  1146 + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
  1147 + regs, address);
  1148 + }
  1149 + if (fault & VM_FAULT_RETRY) {
  1150 + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
  1151 + * of starvation. */
  1152 + flags &= ~FAULT_FLAG_ALLOW_RETRY;
  1153 + goto retry;
  1154 + }
1141 1155 }
1142 1156  
1143 1157 check_v8086_mode(regs, address, tsk);
include/linux/mm.h
... ... @@ -144,6 +144,7 @@
144 144 #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
145 145 #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
146 146 #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */
  147 +#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */
147 148  
148 149 /*
149 150 * This interface is used by x86 PAT code to identify a pfn mapping that is
... ... @@ -723,6 +724,7 @@
723 724  
724 725 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
725 726 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
  727 +#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
726 728  
727 729 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
728 730  
include/linux/pagemap.h
... ... @@ -299,6 +299,8 @@
299 299 extern void __lock_page(struct page *page);
300 300 extern int __lock_page_killable(struct page *page);
301 301 extern void __lock_page_nosync(struct page *page);
  302 +extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
  303 + unsigned int flags);
302 304 extern void unlock_page(struct page *page);
303 305  
304 306 static inline void __set_page_locked(struct page *page)
... ... @@ -350,6 +352,17 @@
350 352 __lock_page_nosync(page);
351 353 }
352 354  
  355 +/*
  356 + * lock_page_or_retry - Lock the page, unless this would block and the
  357 + * caller indicated that it can handle a retry.
  358 + */
  359 +static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
  360 + unsigned int flags)
  361 +{
  362 + might_sleep();
  363 + return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
  364 +}
  365 +
353 366 /*
354 367 * This is exported only for wait_on_page_locked/wait_on_page_writeback.
355 368 * Never use this directly!
mm/filemap.c
... ... @@ -612,6 +612,19 @@
612 612 TASK_UNINTERRUPTIBLE);
613 613 }
614 614  
  615 +int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
  616 + unsigned int flags)
  617 +{
  618 + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
  619 + __lock_page(page);
  620 + return 1;
  621 + } else {
  622 + up_read(&mm->mmap_sem);
  623 + wait_on_page_locked(page);
  624 + return 0;
  625 + }
  626 +}
  627 +
615 628 /**
616 629 * find_get_page - find and get a page reference
617 630 * @mapping: the address_space to search
... ... @@ -1550,7 +1563,8 @@
1550 1563 goto no_cached_page;
1551 1564 }
1552 1565  
1553   - lock_page(page);
  1566 + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
  1567 + return ret | VM_FAULT_RETRY;
1554 1568  
1555 1569 /* Did it get truncated? */
1556 1570 if (unlikely(page->mapping != mapping)) {
mm/memory.c
... ... @@ -2627,6 +2627,7 @@
2627 2627 struct page *page, *swapcache = NULL;
2628 2628 swp_entry_t entry;
2629 2629 pte_t pte;
  2630 + int locked;
2630 2631 struct mem_cgroup *ptr = NULL;
2631 2632 int exclusive = 0;
2632 2633 int ret = 0;
2633 2634  
... ... @@ -2677,8 +2678,12 @@
2677 2678 goto out_release;
2678 2679 }
2679 2680  
2680   - lock_page(page);
  2681 + locked = lock_page_or_retry(page, mm, flags);
2681 2682 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  2683 + if (!locked) {
  2684 + ret |= VM_FAULT_RETRY;
  2685 + goto out_release;
  2686 + }
2682 2687  
2683 2688 /*
2684 2689 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
... ... @@ -2927,7 +2932,8 @@
2927 2932 vmf.page = NULL;
2928 2933  
2929 2934 ret = vma->vm_ops->fault(vma, &vmf);
2930   - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
  2935 + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
  2936 + VM_FAULT_RETRY)))
2931 2937 return ret;
2932 2938  
2933 2939 if (unlikely(PageHWPoison(vmf.page))) {
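
For completeness, a sketch of how some other ->fault implementation could opt
into the same protocol via lock_page_or_retry(). Only lock_page_or_retry(),
VM_FAULT_RETRY, FAULT_FLAG_ALLOW_RETRY (arriving in vmf->flags) and the usual
fault interfaces come from this patch; example_fault() and
example_lookup_page() are hypothetical placeholders, not code from the tree:

	static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct page *page;

		/* placeholder: look up the backing page, taking a reference */
		page = example_lookup_page(vma->vm_file->f_mapping, vmf->pgoff);
		if (!page)
			return VM_FAULT_SIGBUS;

		/*
		 * If the page is locked (e.g. read I/O still in flight) and the
		 * caller passed FAULT_FLAG_ALLOW_RETRY, lock_page_or_retry()
		 * drops mmap_sem, waits for the page to be unlocked, and
		 * returns 0 without the lock held. We then drop our reference
		 * and ask the arch fault handler to retry, instead of sleeping
		 * on the disk transfer with mmap_sem held.
		 */
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}

		vmf->page = page;	/* returned locked and referenced, as usual */
		return VM_FAULT_LOCKED;
	}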