Commit 408e82b78bcc9f1b47c76e833c3df97f675947de

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent: 5d3bc27091

mm: munlock use follow_page

Hiroaki Wakabayashi points out that when mlock() has been interrupted
by SIGKILL, the subsequent munlock() takes unnecessarily long because
its use of __get_user_pages() insists on faulting in all the pages
which mlock() never reached.

It's worse than slowness if mlock() is terminated by Out Of Memory kill:
the munlock_vma_pages_all() in exit_mmap() insists on faulting in all the
pages which mlock() could not find memory for; so innocent bystanders are
killed too, and perhaps the system hangs.

__get_user_pages() does a lot that's silly for munlock(): so remove the
munlock option from __mlock_vma_pages_range(), and use a simple loop of
follow_page()s in munlock_vma_pages_range() instead; ignoring absent
pages, and not marking present pages as accessed or dirty.

(Change munlock() to only go so far as mlock() reached?  That does not
work out, given the convention that mlock() claims complete success even
when it has to give up early - in part so that an underlying file can be
extended later, and those pages locked which earlier would give SIGBUS.)

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: <stable@kernel.org>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Hiroaki Wakabayashi <primulaelatior@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
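
For orientation before reading the diff: the replacement for the old __get_user_pages()-driven munlock is just a page-table walk with follow_page(), which takes a reference on each page that is actually present, munlocks it under the page lock, and silently skips anything mlock() never faulted in. A condensed sketch of that loop follows (the function name munlock_walk_sketch is illustrative only; the committed code is munlock_vma_pages_range() in the diff below):

	/*
	 * Minimal sketch, not the committed code verbatim; see
	 * munlock_vma_pages_range() in the diff below for the real thing.
	 */
	#include <linux/mm.h>
	#include <linux/swap.h>		/* lru_add_drain() */
	#include <linux/pagemap.h>	/* lock_page(), unlock_page() */
	#include <linux/sched.h>	/* cond_resched() */
	#include "internal.h"		/* munlock_vma_page(), as in mm/mlock.c */

	static void munlock_walk_sketch(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
	{
		unsigned long addr;

		lru_add_drain();		/* push cached pages to the LRU */
		vma->vm_flags &= ~VM_LOCKED;	/* so new faults don't re-mlock */

		for (addr = start; addr < end; addr += PAGE_SIZE) {
			/*
			 * follow_page() only walks the page tables: absent
			 * pages come back NULL instead of being faulted in,
			 * which is exactly what munlock() of a range that
			 * mlock() never completed needs.  FOLL_GET pins the
			 * page across the lock/munlock below.
			 */
			struct page *page = follow_page(vma, addr, FOLL_GET);

			if (page) {
				lock_page(page);
				if (page->mapping)	/* skip truncated file pages */
					munlock_vma_page(page);
				unlock_page(page);
				put_page(page);
			}
			cond_resched();
		}
	}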

Showing 1 changed file with 40 additions and 59 deletions

... ... @@ -139,49 +139,36 @@
139 139 }
140 140  
141 141 /**
142   - * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
  142 + * __mlock_vma_pages_range() - mlock a range of pages in the vma.
143 143 * @vma: target vma
144 144 * @start: start address
145 145 * @end: end address
146   - * @mlock: 0 indicate munlock, otherwise mlock.
147 146 *
148   - * If @mlock == 0, unlock an mlocked range;
149   - * else mlock the range of pages. This takes care of making the pages present ,
150   - * too.
  147 + * This takes care of making the pages present too.
151 148 *
152 149 * return 0 on success, negative error code on error.
153 150 *
154 151 * vma->vm_mm->mmap_sem must be held for at least read.
155 152 */
156 153 static long __mlock_vma_pages_range(struct vm_area_struct *vma,
157   - unsigned long start, unsigned long end,
158   - int mlock)
  154 + unsigned long start, unsigned long end)
159 155 {
160 156 struct mm_struct *mm = vma->vm_mm;
161 157 unsigned long addr = start;
162 158 struct page *pages[16]; /* 16 gives a reasonable batch */
163 159 int nr_pages = (end - start) / PAGE_SIZE;
164 160 int ret = 0;
165   - int gup_flags = 0;
  161 + int gup_flags;
166 162  
167 163 VM_BUG_ON(start & ~PAGE_MASK);
168 164 VM_BUG_ON(end & ~PAGE_MASK);
169 165 VM_BUG_ON(start < vma->vm_start);
170 166 VM_BUG_ON(end > vma->vm_end);
171   - VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
172   - (atomic_read(&mm->mm_users) != 0));
  167 + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
173 168  
174   - /*
175   - * mlock: don't page populate if vma has PROT_NONE permission.
176   - * munlock: always do munlock although the vma has PROT_NONE
177   - * permission, or SIGKILL is pending.
178   - */
179   - if (!mlock)
180   - gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
181   - GUP_FLAGS_IGNORE_SIGKILL;
182   -
  169 + gup_flags = 0;
183 170 if (vma->vm_flags & VM_WRITE)
184   - gup_flags |= GUP_FLAGS_WRITE;
  171 + gup_flags = GUP_FLAGS_WRITE;
185 172  
186 173 while (nr_pages > 0) {
187 174 int i;
188 175  
... ... @@ -201,19 +188,10 @@
201 188 * This can happen for, e.g., VM_NONLINEAR regions before
202 189 * a page has been allocated and mapped at a given offset,
203 190 * or for addresses that map beyond end of a file.
204   - * We'll mlock the the pages if/when they get faulted in.
  191 + * We'll mlock the pages if/when they get faulted in.
205 192 */
206 193 if (ret < 0)
207 194 break;
208   - if (ret == 0) {
209   - /*
210   - * We know the vma is there, so the only time
211   - * we cannot get a single page should be an
212   - * error (ret < 0) case.
213   - */
214   - WARN_ON(1);
215   - break;
216   - }
217 195  
218 196 lru_add_drain(); /* push cached pages to LRU */
219 197  
... ... @@ -224,28 +202,22 @@
224 202 /*
225 203 * Because we lock page here and migration is blocked
226 204 * by the elevated reference, we need only check for
227   - * page truncation (file-cache only).
  205 + * file-cache page truncation. This page->mapping
  206 + * check also neatly skips over the ZERO_PAGE(),
  207 + * though if that's common we'd prefer not to lock it.
228 208 */
229   - if (page->mapping) {
230   - if (mlock)
231   - mlock_vma_page(page);
232   - else
233   - munlock_vma_page(page);
234   - }
  209 + if (page->mapping)
  210 + mlock_vma_page(page);
235 211 unlock_page(page);
236   - put_page(page); /* ref from get_user_pages() */
237   -
238   - /*
239   - * here we assume that get_user_pages() has given us
240   - * a list of virtually contiguous pages.
241   - */
242   - addr += PAGE_SIZE; /* for next get_user_pages() */
243   - nr_pages--;
  212 + put_page(page); /* ref from get_user_pages() */
244 213 }
  214 +
  215 + addr += ret * PAGE_SIZE;
  216 + nr_pages -= ret;
245 217 ret = 0;
246 218 }
247 219  
248   - return ret; /* count entire vma as locked_vm */
  220 + return ret; /* 0 or negative error code */
249 221 }
250 222  
251 223 /*
... ... @@ -289,7 +261,7 @@
289 261 is_vm_hugetlb_page(vma) ||
290 262 vma == get_gate_vma(current))) {
291 263  
292   - __mlock_vma_pages_range(vma, start, end, 1);
  264 + __mlock_vma_pages_range(vma, start, end);
293 265  
294 266 /* Hide errors from mmap() and other callers */
295 267 return 0;
... ... @@ -310,7 +282,6 @@
310 282 return nr_pages; /* error or pages NOT mlocked */
311 283 }
312 284  
313   -
314 285 /*
315 286 * munlock_vma_pages_range() - munlock all pages in the vma range.'
316 287 * @vma - vma containing range to be munlock()ed.
... ... @@ -330,10 +301,24 @@
330 301 * free them. This will result in freeing mlocked pages.
331 302 */
332 303 void munlock_vma_pages_range(struct vm_area_struct *vma,
333   - unsigned long start, unsigned long end)
  304 + unsigned long start, unsigned long end)
334 305 {
  306 + unsigned long addr;
  307 +
  308 + lru_add_drain();
335 309 vma->vm_flags &= ~VM_LOCKED;
336   - __mlock_vma_pages_range(vma, start, end, 0);
  310 +
  311 + for (addr = start; addr < end; addr += PAGE_SIZE) {
  312 + struct page *page = follow_page(vma, addr, FOLL_GET);
  313 + if (page) {
  314 + lock_page(page);
  315 + if (page->mapping)
  316 + munlock_vma_page(page);
  317 + unlock_page(page);
  318 + put_page(page);
  319 + }
  320 + cond_resched();
  321 + }
337 322 }
338 323  
339 324 /*
... ... @@ -400,18 +385,14 @@
400 385 * It's okay if try_to_unmap_one unmaps a page just after we
401 386 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
402 387 */
403   - vma->vm_flags = newflags;
404 388  
405 389 if (lock) {
406   - ret = __mlock_vma_pages_range(vma, start, end, 1);
407   -
408   - if (ret > 0) {
409   - mm->locked_vm -= ret;
410   - ret = 0;
411   - } else
412   - ret = __mlock_posix_error_return(ret); /* translate if needed */
  390 + vma->vm_flags = newflags;
  391 + ret = __mlock_vma_pages_range(vma, start, end);
  392 + if (ret < 0)
  393 + ret = __mlock_posix_error_return(ret);
413 394 } else {
414   - __mlock_vma_pages_range(vma, start, end, 0);
  395 + munlock_vma_pages_range(vma, start, end);
415 396 }
416 397  
417 398 out: