mm/mlock.c
  /*
   *	linux/mm/mlock.c
   *
   *  (C) Copyright 1995 Linus Torvalds
   *  (C) Copyright 2002 Christoph Hellwig
   */
  #include <linux/capability.h>
  #include <linux/mman.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/pagemap.h>
  #include <linux/pagevec.h>
  #include <linux/mempolicy.h>
  #include <linux/syscalls.h>
  #include <linux/sched.h>
  #include <linux/export.h>
  #include <linux/rmap.h>
  #include <linux/mmzone.h>
  #include <linux/hugetlb.h>
  #include <linux/memcontrol.h>
  #include <linux/mm_inline.h>
  
  #include "internal.h"

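  /*
   * can_do_mlock - is the calling task allowed to lock memory?
   *
   * Locking memory is permitted when the RLIMIT_MEMLOCK limit is non-zero
   * or the task has CAP_IPC_LOCK.
   */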
  bool can_do_mlock(void)
  {
  	if (rlimit(RLIMIT_MEMLOCK) != 0)
  		return true;
  	if (capable(CAP_IPC_LOCK))
  		return true;
  	return false;
  }
  EXPORT_SYMBOL(can_do_mlock);

  /*
   * Mlocked pages are marked with PageMlocked() flag for efficient testing
   * in vmscan and, possibly, the fault path; and to support semi-accurate
   * statistics.
   *
   * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
   * be placed on the LRU "unevictable" list, rather than the [in]active lists.
   * The unevictable list is an LRU sibling list to the [in]active lists.
   * PageUnevictable is set to indicate the unevictable state.
   *
   * When lazy mlocking via vmscan, it is important to ensure that the
   * vma's VM_LOCKED status is not concurrently being modified, otherwise we
   * may have mlocked a page that is being munlocked. So lazy mlock must take
   * the mmap_sem for read, and verify that the vma really is locked
   * (see mm/rmap.c).
   */
  
  /*
   *  LRU accounting for clear_page_mlock()
   */
  void clear_page_mlock(struct page *page)
  {
  	if (!TestClearPageMlocked(page))
  		return;

  	mod_zone_page_state(page_zone(page), NR_MLOCK,
  			    -hpage_nr_pages(page));
  	count_vm_event(UNEVICTABLE_PGCLEARED);
  	if (!isolate_lru_page(page)) {
  		putback_lru_page(page);
  	} else {
  		/*
  		 * We lost the race. The page already moved to the evictable list.
  		 */
  		if (PageUnevictable(page))
  			count_vm_event(UNEVICTABLE_PGSTRANDED);
  	}
  }
  
  /*
   * Mark page as mlocked if not already.
   * If page on LRU, isolate and putback to move to unevictable list.
   */
  void mlock_vma_page(struct page *page)
  {
  	/* Serialize with page migration */
  	BUG_ON(!PageLocked(page));
  	VM_BUG_ON_PAGE(PageTail(page), page);
  	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
  	if (!TestSetPageMlocked(page)) {
  		mod_zone_page_state(page_zone(page), NR_MLOCK,
  				    hpage_nr_pages(page));
  		count_vm_event(UNEVICTABLE_PGMLOCKED);
  		if (!isolate_lru_page(page))
  			putback_lru_page(page);
  	}
  }
  /*
   * Isolate a page from LRU with optional get_page() pin.
   * Assumes lru_lock already held and page already pinned.
   */
  static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
  {
  	if (PageLRU(page)) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
  		if (getpage)
  			get_page(page);
  		ClearPageLRU(page);
  		del_page_from_lru_list(page, lruvec, page_lru(page));
  		return true;
  	}
  
  	return false;
  }
  
  /*
   * Finish munlock after successful page isolation
   *
   * Page must be locked. This is a wrapper for try_to_munlock()
   * and putback_lru_page() with munlock accounting.
   */
  static void __munlock_isolated_page(struct page *page)
  {
  	int ret = SWAP_AGAIN;
  
  	/*
  	 * Optimization: if the page was mapped just once, that's our mapping
  	 * and we don't need to check all the other vmas.
  	 */
  	if (page_mapcount(page) > 1)
  		ret = try_to_munlock(page);
  
  	/* Did try_to_munlock() succeed or punt? */
  	if (ret != SWAP_MLOCK)
  		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
  
  	putback_lru_page(page);
  }
  
  /*
   * Accounting for page isolation fail during munlock
   *
   * Performs accounting when page isolation fails in munlock. There is nothing
   * else to do because it means some other task has already removed the page
   * from the LRU. putback_lru_page() will take care of removing the page from
   * the unevictable list, if necessary. vmscan [page_referenced()] will move
   * the page back to the unevictable list if some other vma has it mlocked.
   */
  static void __munlock_isolation_failed(struct page *page)
  {
  	if (PageUnevictable(page))
  		__count_vm_event(UNEVICTABLE_PGSTRANDED);
  	else
  		__count_vm_event(UNEVICTABLE_PGMUNLOCKED);
  }
  /**
   * munlock_vma_page - munlock a vma page
   * @page - page to be unlocked, either a normal page or THP page head
   *
   * returns the size of the page as a page mask (0 for normal page,
   *         HPAGE_PMD_NR - 1 for THP head page)
   *
   * called from munlock()/munmap() path with page supposedly on the LRU.
   * When we munlock a page, because the vma where we found the page is being
   * munlock()ed or munmap()ed, we want to check whether other vmas hold the
   * page locked so that we can leave it on the unevictable lru list and not
   * bother vmscan with it.  However, to walk the page's rmap list in
   * try_to_munlock() we must isolate the page from the LRU.  If some other
   * task has removed the page from the LRU, we won't be able to do that.
   * So we clear the PageMlocked as we might not get another chance.  If we
   * can't isolate the page, we leave it for putback_lru_page() and vmscan
   * [page_referenced()/try_to_unmap()] to deal with.
   */
  unsigned int munlock_vma_page(struct page *page)
  {
  	int nr_pages;
  	struct zone *zone = page_zone(page);

  	/* For try_to_munlock() and to serialize with page migration */
  	BUG_ON(!PageLocked(page));
  	VM_BUG_ON_PAGE(PageTail(page), page);
  	/*
  	 * Serialize with any parallel __split_huge_page_refcount() which
  	 * might otherwise copy PageMlocked to part of the tail pages before
  	 * we clear it in the head page. It also stabilizes hpage_nr_pages().
  	 */
  	spin_lock_irq(&zone->lru_lock);
  
  	nr_pages = hpage_nr_pages(page);
  	if (!TestClearPageMlocked(page))
  		goto unlock_out;
  
  	__mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
  
  	if (__munlock_isolate_lru_page(page, true)) {
  		spin_unlock_irq(&zone->lru_lock);
  		__munlock_isolated_page(page);
  		goto out;
  	}
  	__munlock_isolation_failed(page);
  
  unlock_out:
  	spin_unlock_irq(&zone->lru_lock);
  
  out:
  	return nr_pages - 1;
  }
  /*
   * convert get_user_pages() return value to posix mlock() error
   */
  static int __mlock_posix_error_return(long retval)
  {
  	if (retval == -EFAULT)
  		retval = -ENOMEM;
  	else if (retval == -ENOMEM)
  		retval = -EAGAIN;
  	return retval;
  }
  /*
   * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
   *
   * The fast path is available only for evictable pages with single mapping.
   * Then we can bypass the per-cpu pvec and get better performance.
   * when mapcount > 1 we need try_to_munlock() which can fail.
   * when !page_evictable(), we need the full redo logic of putback_lru_page to
   * avoid leaving evictable page in unevictable list.
   *
   * In case of success, @page is added to @pvec and @pgrescued is incremented
   * in case that the page was previously unevictable. @page is also unlocked.
   */
  static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
  		int *pgrescued)
  {
  	VM_BUG_ON_PAGE(PageLRU(page), page);
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  
  	if (page_mapcount(page) <= 1 && page_evictable(page)) {
  		pagevec_add(pvec, page);
  		if (TestClearPageUnevictable(page))
  			(*pgrescued)++;
  		unlock_page(page);
  		return true;
  	}
  
  	return false;
  }
  
  /*
   * Putback multiple evictable pages to the LRU
   *
   * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
   * the pages might have meanwhile become unevictable but that is OK.
   */
  static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
  {
  	count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
  	/*
  	 * __pagevec_lru_add() calls release_pages() so we don't call
  	 * put_page() explicitly
  	 */
  	__pagevec_lru_add(pvec);
  	count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
  }
  
  /*
   * Munlock a batch of pages from the same zone
   *
   * The work is split to two main phases. First phase clears the Mlocked flag
   * and attempts to isolate the pages, all under a single zone lru lock.
   * The second phase finishes the munlock only for pages where isolation
   * succeeded.
   *
   * Note that the pagevec may be modified during the process.
   */
  static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
  {
  	int i;
  	int nr = pagevec_count(pvec);
  	int delta_munlocked;
  	struct pagevec pvec_putback;
  	int pgrescued = 0;

  	pagevec_init(&pvec_putback, 0);
  	/* Phase 1: page isolation */
  	spin_lock_irq(&zone->lru_lock);
  	for (i = 0; i < nr; i++) {
  		struct page *page = pvec->pages[i];
  
  		if (TestClearPageMlocked(page)) {
  			/*
  			 * We already have pin from follow_page_mask()
  			 * so we can spare the get_page() here.
  			 */
  			if (__munlock_isolate_lru_page(page, false))
  				continue;
  			else
  				__munlock_isolation_failed(page);
  		}
  
  		/*
  		 * We won't be munlocking this page in the next phase
  		 * but we still need to release the follow_page_mask()
  		 * pin. We cannot do it under lru_lock however. If it's
  		 * the last pin, __page_cache_release() would deadlock.
  		 */
  		pagevec_add(&pvec_putback, pvec->pages[i]);
  		pvec->pages[i] = NULL;
  	}
  	delta_munlocked = -nr + pagevec_count(&pvec_putback);
  	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
  	spin_unlock_irq(&zone->lru_lock);
  	/* Now we can release pins of pages that we are not munlocking */
  	pagevec_release(&pvec_putback);
  	/* Phase 2: page munlock */
  	for (i = 0; i < nr; i++) {
  		struct page *page = pvec->pages[i];
  
  		if (page) {
  			lock_page(page);
  			if (!__putback_lru_fast_prepare(page, &pvec_putback,
  					&pgrescued)) {
  				/*
  				 * Slow path. We don't want to lose the last
  				 * pin before unlock_page()
  				 */
  				get_page(page); /* for putback_lru_page() */
  				__munlock_isolated_page(page);
  				unlock_page(page);
  				put_page(page); /* from follow_page_mask() */
  			}
  		}
  	}

  	/*
  	 * Phase 3: page putback for pages that qualified for the fast path
  	 * This will also call put_page() to return pin from follow_page_mask()
  	 */
  	if (pagevec_count(&pvec_putback))
  		__putback_lru_fast(&pvec_putback, pgrescued);
  }
  
  /*
   * Fill up pagevec for __munlock_pagevec using pte walk
   *
   * The function expects that the struct page corresponding to @start address is
   * a non-THP page already pinned and in the @pvec, and that it belongs to @zone.
   *
   * The rest of @pvec is filled by subsequent pages within the same pmd and same
   * zone, as long as the pte's are present and vm_normal_page() succeeds. These
   * pages also get pinned.
   *
   * Returns the address of the next page that should be scanned. This equals
   * @start + PAGE_SIZE when no page could be added by the pte walk.
   */
  static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
  		struct vm_area_struct *vma, int zoneid,	unsigned long start,
  		unsigned long end)
  {
  	pte_t *pte;
  	spinlock_t *ptl;
  
  	/*
  	 * Initialize pte walk starting at the already pinned page where we
  	 * are sure that there is a pte, as it was pinned under the same
  	 * mmap_sem write op.
  	 */
  	pte = get_locked_pte(vma->vm_mm, start,	&ptl);
  	/* Make sure we do not cross the page table boundary */
  	end = pgd_addr_end(start, end);
  	end = pud_addr_end(start, end);
  	end = pmd_addr_end(start, end);
  
  	/* The page next to the pinned page is the first we will try to get */
  	start += PAGE_SIZE;
  	while (start < end) {
  		struct page *page = NULL;
  		pte++;
  		if (pte_present(*pte))
  			page = vm_normal_page(vma, start, *pte);
  		/*
  		 * Break if page could not be obtained or the page's node+zone does not
  		 * match
  		 */
  		if (!page || page_zone_id(page) != zoneid)
  			break;

  		/*
  		 * Do not use pagevec for PTE-mapped THP,
  		 * munlock_vma_pages_range() will handle them.
  		 */
  		if (PageTransCompound(page))
  			break;
  		get_page(page);
  		/*
  		 * Increase the address that will be returned *before* the
  		 * eventual break due to pvec becoming full by adding the page
  		 */
  		start += PAGE_SIZE;
  		if (pagevec_add(pvec, page) == 0)
  			break;
  	}
  	pte_unmap_unlock(pte, ptl);
  	return start;
  }
  
  /*
   * munlock_vma_pages_range() - munlock all pages in the vma range.
   * @vma - vma containing range to be munlock()ed.
   * @start - start address in @vma of the range
   * @end - end of range in @vma.
   *
   *  For mremap(), munmap() and exit().
   *
   * Called with @vma VM_LOCKED.
   *
   * Returns with VM_LOCKED cleared.  Callers must be prepared to
   * deal with this.
   *
   * We don't save and restore VM_LOCKED here because pages are
   * still on lru.  In unmap path, pages might be scanned by reclaim
   * and re-mlocked by try_to_{munlock|unmap} before we unmap and
   * free them.  This will result in freeing mlocked pages.
   */
  void munlock_vma_pages_range(struct vm_area_struct *vma,
  			     unsigned long start, unsigned long end)
  {
  	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

  	while (start < end) {
  		struct page *page;
  		unsigned int page_mask;
  		unsigned long page_increm;
  		struct pagevec pvec;
  		struct zone *zone;
  		int zoneid;

  		pagevec_init(&pvec, 0);
  		/*
  		 * Although FOLL_DUMP is intended for get_dump_page(),
  		 * it just so happens that its special treatment of the
  		 * ZERO_PAGE (returning an error instead of doing get_page)
  		 * suits munlock very well (and if somehow an abnormal page
  		 * has sneaked into the range, we won't oops here: great).
  		 */
  		page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
  				&page_mask);
  		if (page && !IS_ERR(page)) {
  			if (PageTransTail(page)) {
  				VM_BUG_ON_PAGE(PageMlocked(page), page);
  				put_page(page); /* follow_page_mask() */
  			} else if (PageTransHuge(page)) {
  				lock_page(page);
  				/*
  				 * Any THP page found by follow_page_mask() may
  				 * have gotten split before reaching
  				 * munlock_vma_page(), so we need to recompute
  				 * the page_mask here.
  				 */
  				page_mask = munlock_vma_page(page);
  				unlock_page(page);
  				put_page(page); /* follow_page_mask() */
  			} else {
  				/*
  				 * Non-huge pages are handled in batches via
  				 * pagevec. The pin from follow_page_mask()
  				 * prevents them from collapsing by THP.
  				 */
  				pagevec_add(&pvec, page);
  				zone = page_zone(page);
  				zoneid = page_zone_id(page);

  				/*
  				 * Try to fill the rest of pagevec using fast
  				 * pte walk. This will also update start to
  				 * the next page to process. Then munlock the
  				 * pagevec.
  				 */
  				start = __munlock_pagevec_fill(&pvec, vma,
  						zoneid, start, end);
  				__munlock_pagevec(&pvec, zone);
  				goto next;
  			}
  		}
  		page_increm = 1 + page_mask;
  		start += page_increm * PAGE_SIZE;
  next:
  		cond_resched();
  	}
  }
  
  /*
   * mlock_fixup  - handle mlock[all]/munlock[all] requests.
   *
   * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
   * munlock is a no-op.  However, for some special vmas, we go ahead and
   * populate the ptes.
   *
   * For vmas that pass the filters, merge/split as appropriate.
   */
  static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
  	unsigned long start, unsigned long end, vm_flags_t newflags)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pgoff_t pgoff;
  	int nr_pages;
  	int ret = 0;
  	int lock = !!(newflags & VM_LOCKED);

  	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
  	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
  		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
  		goto out;

  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
  			  vma->vm_file, pgoff, vma_policy(vma),
  			  vma->vm_userfaultfd_ctx);
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  	if (start != vma->vm_start) {
  		ret = split_vma(mm, vma, start, 1);
  		if (ret)
  			goto out;
  	}
  
  	if (end != vma->vm_end) {
  		ret = split_vma(mm, vma, end, 0);
  		if (ret)
  			goto out;
  	}
  
  success:
  	/*
  	 * Keep track of amount of locked VM.
  	 */
  	nr_pages = (end - start) >> PAGE_SHIFT;
  	if (!lock)
  		nr_pages = -nr_pages;
  	mm->locked_vm += nr_pages;
  
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 * It's okay if try_to_unmap_one unmaps a page just after we
  	 * set VM_LOCKED, populate_vma_page_range will bring it back.
  	 */

  	if (lock)
  		vma->vm_flags = newflags;
  	else
  		munlock_vma_pages_range(vma, start, end);

  out:
  	*prev = vma;
  	return ret;
  }
  static int apply_vma_lock_flags(unsigned long start, size_t len,
  				vm_flags_t flags)
  {
  	unsigned long nstart, end, tmp;
  	struct vm_area_struct * vma, * prev;
  	int error;
  	VM_BUG_ON(offset_in_page(start));
  	VM_BUG_ON(len != PAGE_ALIGN(len));
  	end = start + len;
  	if (end < start)
  		return -EINVAL;
  	if (end == start)
  		return 0;
  	vma = find_vma(current->mm, start);
  	if (!vma || vma->vm_start > start)
  		return -ENOMEM;
  	prev = vma->vm_prev;
  	if (start > vma->vm_start)
  		prev = vma;
  
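  	/*
  	 * Walk each vma that overlaps [start, end) and apply the new lock
  	 * flags.  mlock_fixup() splits or merges vmas as required and
  	 * updates mm->locked_vm accordingly.
  	 */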
  	for (nstart = start ; ; ) {
  		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

  		newflags |= flags;

  		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
  		tmp = vma->vm_end;
  		if (tmp > end)
  			tmp = end;
  		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
  		if (error)
  			break;
  		nstart = tmp;
  		if (nstart < prev->vm_end)
  			nstart = prev->vm_end;
  		if (nstart >= end)
  			break;
  
  		vma = prev->vm_next;
  		if (!vma || vma->vm_start != nstart) {
  			error = -ENOMEM;
  			break;
  		}
  	}
  	return error;
  }
  static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
  {
  	unsigned long locked;
  	unsigned long lock_limit;
  	int error = -ENOMEM;
  
  	if (!can_do_mlock())
  		return -EPERM;
  	lru_add_drain_all();	/* flush pagevec */
  	len = PAGE_ALIGN(len + (offset_in_page(start)));
  	start &= PAGE_MASK;
  	lock_limit = rlimit(RLIMIT_MEMLOCK);
  	lock_limit >>= PAGE_SHIFT;
  	locked = len >> PAGE_SHIFT;
  
  	down_write(&current->mm->mmap_sem);
  
  	locked += current->mm->locked_vm;
  
  	/* check against resource limits */
  	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
  		error = apply_vma_lock_flags(start, len, flags);

  	up_write(&current->mm->mmap_sem);
  	if (error)
  		return error;
  
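  	/*
  	 * Fault in the pages now that mmap_sem has been dropped;
  	 * __mm_populate() takes it for read as needed.
  	 */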
  	error = __mm_populate(start, len, 0);
  	if (error)
  		return __mlock_posix_error_return(error);
  	return 0;
  }
  SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
  {
  	return do_mlock(start, len, VM_LOCKED);
  }
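  
  /*
   * mlock2() is mlock() with a flags argument.  MLOCK_ONFAULT maps to
   * VM_LOCKONFAULT: pages in the range are locked as they are faulted in,
   * instead of all being faulted in and locked up front.
   */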
  SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
  {
  	vm_flags_t vm_flags = VM_LOCKED;
  
  	if (flags & ~MLOCK_ONFAULT)
  		return -EINVAL;
  	if (flags & MLOCK_ONFAULT)
  		vm_flags |= VM_LOCKONFAULT;
  
  	return do_mlock(start, len, vm_flags);
  }
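  
  /*
   * munlock() undoes mlock()/mlock2(): apply_vma_lock_flags() with
   * flags == 0 clears both VM_LOCKED and VM_LOCKONFAULT from the range.
   */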
  SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
  {
  	int ret;
  	len = PAGE_ALIGN(len + (offset_in_page(start)));
  	start &= PAGE_MASK;
  
  	down_write(&current->mm->mmap_sem);
  	ret = apply_vma_lock_flags(start, len, 0);
  	up_write(&current->mm->mmap_sem);

  	return ret;
  }
  /*
   * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
   * and translate into the appropriate modifications to mm->def_flags and/or the
   * flags for all current VMAs.
   *
   * There are a couple of subtleties with this.  If mlockall() is called multiple
   * times with different flags, the values do not necessarily stack.  If mlockall
   * is called once including the MCL_FUTURE flag and then a second time without
   * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
   */
  static int apply_mlockall_flags(int flags)
  {
  	struct vm_area_struct * vma, * prev = NULL;
  	vm_flags_t to_add = 0;

  	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
  	if (flags & MCL_FUTURE) {
  		current->mm->def_flags |= VM_LOCKED;

  		if (flags & MCL_ONFAULT)
  			current->mm->def_flags |= VM_LOCKONFAULT;
  
  		if (!(flags & MCL_CURRENT))
  			goto out;
  	}
  
  	if (flags & MCL_CURRENT) {
  		to_add |= VM_LOCKED;
  		if (flags & MCL_ONFAULT)
  			to_add |= VM_LOCKONFAULT;
  	}
  
  	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
  		vm_flags_t newflags;

  		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
  		newflags |= to_add;
  
  		/* Ignore errors */
  		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
  		cond_resched_rcu_qs();
  	}
  out:
  	return 0;
  }
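  
  /*
   * mlockall() applies the MCL_* flags to the whole address space:
   * MCL_CURRENT affects existing mappings (which are populated below),
   * MCL_FUTURE is recorded in mm->def_flags for future mappings, and
   * MCL_ONFAULT defers the locking of a page until it is first faulted in.
   */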
  SYSCALL_DEFINE1(mlockall, int, flags)
  {
  	unsigned long lock_limit;
  	int ret;

  	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
  		return -EINVAL;

  	if (!can_do_mlock())
  		return -EPERM;

  	if (flags & MCL_CURRENT)
  		lru_add_drain_all();	/* flush pagevec */

  	lock_limit = rlimit(RLIMIT_MEMLOCK);
  	lock_limit >>= PAGE_SHIFT;
  
  	ret = -ENOMEM;
  	down_write(&current->mm->mmap_sem);
  	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
  	    capable(CAP_IPC_LOCK))
  		ret = apply_mlockall_flags(flags);
  	up_write(&current->mm->mmap_sem);
  	if (!ret && (flags & MCL_CURRENT))
  		mm_populate(0, TASK_SIZE);

  	return ret;
  }
  SYSCALL_DEFINE0(munlockall)
  {
  	int ret;
  
  	down_write(&current->mm->mmap_sem);
  	ret = apply_mlockall_flags(0);
  	up_write(&current->mm->mmap_sem);
  	return ret;
  }
  
  /*
   * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
   * shm segments) get accounted against the user_struct instead.
   */
  static DEFINE_SPINLOCK(shmlock_user_lock);
  
  int user_shm_lock(size_t size, struct user_struct *user)
  {
  	unsigned long lock_limit, locked;
  	int allowed = 0;
  
  	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
  	lock_limit = rlimit(RLIMIT_MEMLOCK);
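  	/* RLIM_INFINITY means no memlock limit, so the lock is always allowed */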
  	if (lock_limit == RLIM_INFINITY)
  		allowed = 1;
  	lock_limit >>= PAGE_SHIFT;
  	spin_lock(&shmlock_user_lock);
  	if (!allowed &&
  	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
  		goto out;
  	get_uid(user);
  	user->locked_shm += locked;
  	allowed = 1;
  out:
  	spin_unlock(&shmlock_user_lock);
  	return allowed;
  }
  
  void user_shm_unlock(size_t size, struct user_struct *user)
  {
  	spin_lock(&shmlock_user_lock);
  	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
  	spin_unlock(&shmlock_user_lock);
  	free_uid(user);
  }