Commit 408e82b78bcc9f1b47c76e833c3df97f675947de

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent: 5d3bc27091

mm: munlock use follow_page

Hiroaki Wakabayashi points out that when mlock() has been interrupted
by SIGKILL, the subsequent munlock() takes unnecessarily long because
its use of __get_user_pages() insists on faulting in all the pages
which mlock() never reached.

It's worse than slowness if mlock() is terminated by Out Of Memory kill:
the munlock_vma_pages_all() in exit_mmap() insists on faulting in all the
pages which mlock() could not find memory for; so innocent bystanders are
killed too, and perhaps the system hangs.

__get_user_pages() does a lot that's silly for munlock(): so remove the
munlock option from __mlock_vma_pages_range(), and use a simple loop of
follow_page()s in munlock_vma_pages_range() instead; ignoring absent
pages, and not marking present pages as accessed or dirty.

(Change munlock() to only go so far as mlock() reached?  That does not
work out, given the convention that mlock() claims complete success even
when it has to give up early - in part so that an underlying file can be
extended later, and those pages locked which earlier would give SIGBUS.)

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: <stable@kernel.org>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Hiroaki Wakabayashi <primulaelatior@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
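
For orientation before reading the diff: the replacement for the old __get_user_pages()-driven munlock is just a page-table walk with follow_page(), which takes a reference on each page that is actually present, munlocks it under the page lock, and silently skips anything mlock() never faulted in. A condensed sketch of that loop follows (the function name munlock_walk_sketch is illustrative only; the committed code is munlock_vma_pages_range() in the diff below):

	/*
	 * Minimal sketch, not the committed code verbatim; see
	 * munlock_vma_pages_range() in the diff below for the real thing.
	 */
	#include <linux/mm.h>
	#include <linux/swap.h>		/* lru_add_drain() */
	#include <linux/pagemap.h>	/* lock_page(), unlock_page() */
	#include <linux/sched.h>	/* cond_resched() */
	#include "internal.h"		/* munlock_vma_page(), as in mm/mlock.c */

	static void munlock_walk_sketch(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
	{
		unsigned long addr;

		lru_add_drain();		/* push cached pages to the LRU */
		vma->vm_flags &= ~VM_LOCKED;	/* so new faults don't re-mlock */

		for (addr = start; addr < end; addr += PAGE_SIZE) {
			/*
			 * follow_page() only walks the page tables: absent
			 * pages come back NULL instead of being faulted in,
			 * which is exactly what munlock() of a range that
			 * mlock() never completed needs.  FOLL_GET pins the
			 * page across the lock/munlock below.
			 */
			struct page *page = follow_page(vma, addr, FOLL_GET);

			if (page) {
				lock_page(page);
				if (page->mapping)	/* skip truncated file pages */
					munlock_vma_page(page);
				unlock_page(page);
				put_page(page);
			}
			cond_resched();
		}
	}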

Showing 1 changed file with 40 additions and 59 deletions

... ... @@ -139,49 +139,36 @@
139 139 }
140 140  
141 141 /**
142   - * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
  142 + * __mlock_vma_pages_range() - mlock a range of pages in the vma.
143 143 * @vma: target vma
144 144 * @start: start address
145 145 * @end: end address
146   - * @mlock: 0 indicate munlock, otherwise mlock.
147 146 *
148   - * If @mlock == 0, unlock an mlocked range;
149   - * else mlock the range of pages. This takes care of making the pages present ,
150   - * too.
  147 + * This takes care of making the pages present too.
151 148 *
152 149 * return 0 on success, negative error code on error.
153 150 *
154 151 * vma->vm_mm->mmap_sem must be held for at least read.
155 152 */
156 153 static long __mlock_vma_pages_range(struct vm_area_struct *vma,
157   - unsigned long start, unsigned long end,
158   - int mlock)
  154 + unsigned long start, unsigned long end)
159 155 {
160 156 struct mm_struct *mm = vma->vm_mm;
161 157 unsigned long addr = start;
162 158 struct page *pages[16]; /* 16 gives a reasonable batch */
163 159 int nr_pages = (end - start) / PAGE_SIZE;
164 160 int ret = 0;
165   - int gup_flags = 0;
  161 + int gup_flags;
166 162  
167 163 VM_BUG_ON(start & ~PAGE_MASK);
168 164 VM_BUG_ON(end & ~PAGE_MASK);
169 165 VM_BUG_ON(start < vma->vm_start);
170 166 VM_BUG_ON(end > vma->vm_end);
171   - VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
172   - (atomic_read(&mm->mm_users) != 0));
  167 + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
173 168  
174   - /*
175   - * mlock: don't page populate if vma has PROT_NONE permission.
176   - * munlock: always do munlock although the vma has PROT_NONE
177   - * permission, or SIGKILL is pending.
178   - */
179   - if (!mlock)
180   - gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
181   - GUP_FLAGS_IGNORE_SIGKILL;
182   -
  169 + gup_flags = 0;
183 170 if (vma->vm_flags & VM_WRITE)
184   - gup_flags |= GUP_FLAGS_WRITE;
  171 + gup_flags = GUP_FLAGS_WRITE;
185 172  
186 173 while (nr_pages > 0) {
187 174 int i;
188 175  
... ... @@ -201,19 +188,10 @@
201 188 * This can happen for, e.g., VM_NONLINEAR regions before
202 189 * a page has been allocated and mapped at a given offset,
203 190 * or for addresses that map beyond end of a file.
204   - * We'll mlock the the pages if/when they get faulted in.
  191 + * We'll mlock the pages if/when they get faulted in.
205 192 */
206 193 if (ret < 0)
207 194 break;
208   - if (ret == 0) {
209   - /*
210   - * We know the vma is there, so the only time
211   - * we cannot get a single page should be an
212   - * error (ret < 0) case.
213   - */
214   - WARN_ON(1);
215   - break;
216   - }
217 195  
218 196 lru_add_drain(); /* push cached pages to LRU */
219 197  
... ... @@ -224,28 +202,22 @@
224 202 /*
225 203 * Because we lock page here and migration is blocked
226 204 * by the elevated reference, we need only check for
227   - * page truncation (file-cache only).
  205 + * file-cache page truncation. This page->mapping
  206 + * check also neatly skips over the ZERO_PAGE(),
  207 + * though if that's common we'd prefer not to lock it.
228 208 */
229   - if (page->mapping) {
230   - if (mlock)
231   - mlock_vma_page(page);
232   - else
233   - munlock_vma_page(page);
234   - }
  209 + if (page->mapping)
  210 + mlock_vma_page(page);
235 211 unlock_page(page);
236   - put_page(page); /* ref from get_user_pages() */
237   -
238   - /*
239   - * here we assume that get_user_pages() has given us
240   - * a list of virtually contiguous pages.
241   - */
242   - addr += PAGE_SIZE; /* for next get_user_pages() */
243   - nr_pages--;
  212 + put_page(page); /* ref from get_user_pages() */
244 213 }
  214 +
  215 + addr += ret * PAGE_SIZE;
  216 + nr_pages -= ret;
245 217 ret = 0;
246 218 }
247 219  
248   - return ret; /* count entire vma as locked_vm */
  220 + return ret; /* 0 or negative error code */
249 221 }
250 222  
251 223 /*
... ... @@ -289,7 +261,7 @@
289 261 is_vm_hugetlb_page(vma) ||
290 262 vma == get_gate_vma(current))) {
291 263  
292   - __mlock_vma_pages_range(vma, start, end, 1);
  264 + __mlock_vma_pages_range(vma, start, end);
293 265  
294 266 /* Hide errors from mmap() and other callers */
295 267 return 0;
... ... @@ -310,7 +282,6 @@
310 282 return nr_pages; /* error or pages NOT mlocked */
311 283 }
312 284  
313   -
314 285 /*
315 286 * munlock_vma_pages_range() - munlock all pages in the vma range.'
316 287 * @vma - vma containing range to be munlock()ed.
... ... @@ -330,10 +301,24 @@
330 301 * free them. This will result in freeing mlocked pages.
331 302 */
332 303 void munlock_vma_pages_range(struct vm_area_struct *vma,
333   - unsigned long start, unsigned long end)
  304 + unsigned long start, unsigned long end)
334 305 {
  306 + unsigned long addr;
  307 +
  308 + lru_add_drain();
335 309 vma->vm_flags &= ~VM_LOCKED;
336   - __mlock_vma_pages_range(vma, start, end, 0);
  310 +
  311 + for (addr = start; addr < end; addr += PAGE_SIZE) {
  312 + struct page *page = follow_page(vma, addr, FOLL_GET);
  313 + if (page) {
  314 + lock_page(page);
  315 + if (page->mapping)
  316 + munlock_vma_page(page);
  317 + unlock_page(page);
  318 + put_page(page);
  319 + }
  320 + cond_resched();
  321 + }
337 322 }
338 323  
339 324 /*
... ... @@ -400,18 +385,14 @@
400 385 * It's okay if try_to_unmap_one unmaps a page just after we
401 386 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
402 387 */
403   - vma->vm_flags = newflags;
404 388  
405 389 if (lock) {
406   - ret = __mlock_vma_pages_range(vma, start, end, 1);
407   -
408   - if (ret > 0) {
409   - mm->locked_vm -= ret;
410   - ret = 0;
411   - } else
412   - ret = __mlock_posix_error_return(ret); /* translate if needed */
  390 + vma->vm_flags = newflags;
  391 + ret = __mlock_vma_pages_range(vma, start, end);
  392 + if (ret < 0)
  393 + ret = __mlock_posix_error_return(ret);
413 394 } else {
414   - __mlock_vma_pages_range(vma, start, end, 0);
  395 + munlock_vma_pages_range(vma, start, end);
415 396 }
416 397  
417 398 out: